From: Apple Date: Wed, 5 Feb 2020 22:25:23 +0000 (+0000) Subject: xnu-6153.11.26.tar.gz X-Git-Tag: macos-1015^0 X-Git-Url: https://git.saurik.com/apple/xnu.git/commitdiff_plain/cb3231590a3c94ab4375e2228bd5e86b0cf1ad7e xnu-6153.11.26.tar.gz --- diff --git a/.gitignore b/.gitignore index f5ad2c6fd..70d6a4014 100644 --- a/.gitignore +++ b/.gitignore @@ -25,6 +25,9 @@ compile_commands.json # /libkern/kmod/libkmod.xcodeproj/ /libkern/kmod/libkmod.xcodeproj/xcuserdata +# /libkdd/kdd.xcodeproj/ +/libkdd/kdd.xcodeproj/xcuserdata + # /libsyscall/Libsyscall.xcodeproj/ /libsyscall/Libsyscall.xcodeproj/xcuserdata /libsyscall/Libsyscall.xcodeproj/project.xcworkspace @@ -44,14 +47,25 @@ compile_commands.json # /tools/tests/testkext/testkext.xcodeproj/ /tools/tests/testkext/testkext.xcodeproj/xcuserdata +#/tools/tests/unit_tests/cpu_monitor_tests_11646922_src/CatchRN/CatchRN.xcodeproj/ +/tools/tests/unit_tests/cpu_monitor_tests_11646922_src/CatchRN/CatchRN.xcodeproj/xcuserdata + # /tools/tests/unit_tests/cpu_monitor_tests_11646922_src/cpu_hog/cpu_hog.xcodeproj/ /tools/tests/unit_tests/cpu_monitor_tests_11646922_src/cpu_hog/cpu_hog.xcodeproj/xcuserdata +# /tools/tests/unit_tests/mach_test_15789220_src/mach_test.xcodeproj/ +/tools/tests/unit_tests/mach_test_15789220_src/mach_test.xcodeproj/xcuserdata + # /tools/tests/unit_tests/monitor_stress_12901965_src/monitor_stress.xcodeproj/ /tools/tests/unit_tests/monitor_stress_12901965_src/monitor_stress.xcodeproj/xcuserdata # /tools/tests/unit_tests/monitor_stress_12901965_src/monitor_stress.xcodeproj/project.xcworkspace/ /tools/tests/unit_tests/monitor_stress_12901965_src/monitor_stress.xcodeproj/project.xcworkspace/xcuserdata +#/tools/tests/unit_tests/test_14395574/test_14395574.xcodeproj/ +/tools/tests/unit_tests/test_14395574/test_14395574.xcodeproj/xcuserdata + # /tools/tests/zero-to-n /tools/tests/zero-to-n/zn* + +# do not add *.orig, *.rej, use `git clean` instead diff --git a/EXTERNAL_HEADERS/Makefile b/EXTERNAL_HEADERS/Makefile index a8db883a3..770e156ec 100644 --- a/EXTERNAL_HEADERS/Makefile +++ b/EXTERNAL_HEADERS/Makefile @@ -8,7 +8,8 @@ include $(MakeInc_def) INSTINC_SUBDIRS = \ architecture \ - mach-o + mach-o \ + sys INSTINC_SUBDIRS_X86_64 = \ architecture @@ -32,9 +33,7 @@ KERNEL_FILES = \ stdatomic.h \ stdbool.h \ stddef.h \ - stdint.h - -KERNEL_FILES += \ + stdint.h \ ptrauth.h INSTALL_MI_LIST = diff --git a/EXTERNAL_HEADERS/corecrypto/cc.h b/EXTERNAL_HEADERS/corecrypto/cc.h index 5493e41c9..4b2a6dec2 100644 --- a/EXTERNAL_HEADERS/corecrypto/cc.h +++ b/EXTERNAL_HEADERS/corecrypto/cc.h @@ -16,12 +16,41 @@ #include #include +#if __has_feature(attribute_availability_with_replacement) +#if __has_feature(attribute_availability_bridgeos) + #ifndef __CC_BRIDGE_OS_DEPRECATED + #define __CC_BRIDGEOS_DEPRECATED(_dep, _msg) __attribute__((availability(bridgeos,deprecated=_dep, replacement=_msg))) + #endif +#endif + +#ifndef __CC_BRIDGEOS_DEPRECATED + #define __CC_BRIDGEOS_DEPRECATED(_dep, _msg) +#endif + +#define cc_deprecate_with_replacement(replacement_message, ios_version, macos_version, tvos_version, watchos_version, bridgeos_version) \ +__attribute__((availability(macos,deprecated=macos_version, replacement=replacement_message)))\ +__attribute__((availability(ios,deprecated=ios_version, replacement=replacement_message)))\ +__attribute__((availability(watchos,deprecated=watchos_version, replacement=replacement_message)))\ +__attribute__((availability(tvos,deprecated=tvos_version, replacement=replacement_message)))\ +__CC_BRIDGEOS_DEPRECATED(bridgeos_version, replacement_message) + +#else /* !__has_feature(attribute_availability_with_replacement) */ + +#define cc_deprecate_with_replacement(replacement_message, ios_version, macos_version, tvos_version, watchos_version, bridgeos_version) + +#endif /* __has_feature(attribute_availability_with_replacement) */ + /* Provide a general purpose macro concat method. */ #define cc_concat_(a, b) a##b #define cc_concat(a, b) cc_concat_(a, b) /* Manage asserts here because a few functions in header public files do use asserts */ +#if CORECRYPTO_DEBUG #define cc_assert(x) assert(x) +#else +#define cc_assert(x) +#endif + #if CC_KERNEL #include #elif CC_USE_S3 @@ -32,7 +61,7 @@ /* Provide a static assert that can be used to create compile-type failures. */ #define cc_static_assert(e,m) \ - ;enum { cc_concat(static_assert_, __COUNTER__) = 1/(int)(!!(e)) } + enum { cc_concat(static_assert_, __COUNTER__) = 1/(int)(!!(e)) } /* Declare a struct element with a guarenteed alignment of _alignment_. The resulting struct can be used to create arrays that are aligned by @@ -42,6 +71,15 @@ typedef struct { \ uint8_t b[_alignment_]; \ } CC_ALIGNED(_alignment_) +#if defined(__BIGGEST_ALIGNMENT__) +#define CC_MAX_ALIGNMENT __BIGGEST_ALIGNMENT__ +#else +#define CC_MAX_ALIGNMENT 16 +#endif + +/* pads a given size to be a multiple of the biggest alignment for any type */ +#define cc_pad_align(_size_) ((_size_ + CC_MAX_ALIGNMENT - 1) & (~(CC_MAX_ALIGNMENT - 1))) + /* number of array elements used in a cc_ctx_decl */ #define cc_ctx_n(_type_, _size_) ((_size_ + sizeof(_type_) - 1) / sizeof(_type_)) @@ -55,14 +93,14 @@ uint8_t b[_alignment_]; \ 3. Never use sizeof() operator for the variables declared with cc_ctx_decl(), because it is not be compatible with the _MSC_VER version of cc_ctx_decl(). */ #if defined(_MSC_VER) +#include #define cc_ctx_decl(_type_, _size_, _name_) _type_ * _name_ = (_type_ *) _alloca(sizeof(_type_) * cc_ctx_n(_type_, _size_) ) #else #define cc_ctx_decl(_type_, _size_, _name_) _type_ _name_ [cc_ctx_n(_type_, _size_)] #endif -/* bzero is deprecated. memset is the way to go */ -/* FWIW, L4, HEXAGON and ARMCC even with gnu compatibility mode don't have bzero */ -#define cc_zero(_size_,_data_) memset((_data_),0 ,(_size_)) +// cc_zero is deprecated, please use cc_clear instead. +#define cc_zero(_size_,_data_) _Pragma ("corecrypto deprecation warning \"'cc_zero' macro is deprecated. Use 'cc_clear' instead.\"") cc_clear(_size_,_data_) /*! @brief cc_clear(len, dst) zeroizes array dst and it will not be optimized out. @@ -99,12 +137,16 @@ int cc_cmp_safe (size_t num, const void * ptr1, const void * ptr2); /* Exchange S and T of any type. NOTE: Both and S and T are evaluated mutliple times and MUST NOT be expressions. */ #define CC_SWAP(S,T) do { \ - __typeof__(S) _cc_swap_tmp = S; S = T; T = _cc_swap_tmp; \ + volatile __typeof__(S) _cc_swap_tmp = S; S = T; T = _cc_swap_tmp; \ + _cc_swap_tmp = 0;\ } while(0) /* Return the maximum value between S and T. */ #define CC_MAX(S, T) ({__typeof__(S) _cc_max_s = S; __typeof__(T) _cc_max_t = T; _cc_max_s > _cc_max_t ? _cc_max_s : _cc_max_t;}) +/* Clone of CC_MAX() that evalutes S and T multiple times to allow nesting. */ +#define CC_MAX_EVAL(S, T) ((S) > (T) ? (S) : (T)) + /* Return the minimum value between S and T. */ #define CC_MIN(S, T) ({__typeof__(S) _cc_min_s = S; __typeof__(T) _cc_min_t = T; _cc_min_s <= _cc_min_t ? _cc_min_s : _cc_min_t;}) diff --git a/EXTERNAL_HEADERS/corecrypto/cc_config.h b/EXTERNAL_HEADERS/corecrypto/cc_config.h index fbdb2c61c..5fb183288 100644 --- a/EXTERNAL_HEADERS/corecrypto/cc_config.h +++ b/EXTERNAL_HEADERS/corecrypto/cc_config.h @@ -258,8 +258,7 @@ #define CCN_OSX 1 #endif -#if CC_USE_L4 || CC_USE_S3 -/* No dynamic linking allowed in L4, e.g. avoid nonlazy symbols */ +#if CC_USE_S3 /* For corecrypto kext, CC_STATIC should be undefined */ #define CC_STATIC 1 #endif @@ -296,9 +295,9 @@ // see rdar://problem/26636018 #if (CCN_UNIT_SIZE == 8) && !( defined(_MSC_VER) && defined(__clang__)) -#define CCEC25519_CURVE25519DONNA_64BIT 1 +#define CCEC25519_CURVE25519_64BIT 1 #else -#define CCEC25519_CURVE25519DONNA_64BIT 0 +#define CCEC25519_CURVE25519_64BIT 0 #endif //- functions implemented in assembly ------------------------------------------ @@ -307,10 +306,15 @@ #warning "You are using the default corecrypto configuration, assembly optimizations may not be available for your platform" #endif +// Enable assembler in Linux if CC_LINUX_ASM is defined +#if CC_LINUX && defined(CC_LINUX_ASM) && CC_LINUX_ASM +#define CC_USE_ASM 1 +#endif + // Use this macro to strictly disable assembly regardless of cpu/os/compiler/etc. // Our assembly code is not gcc compatible. Clang defines the __GNUC__ macro as well. #if !defined(CC_USE_ASM) - #if defined(_WIN32) || CC_EFI || CC_BASEBAND || CC_XNU_KERNEL_PRIVATE || (defined(__GNUC__) && !defined(__clang__)) || defined(__ANDROID_API__) || CC_RTKIT || CC_RTKITROM + #if defined(_WIN32) || CC_EFI || CC_BASEBAND || CC_XNU_KERNEL_PRIVATE || (defined(__GNUC__) && !defined(__clang__)) || defined(__ANDROID_API__) || CC_LINUX #define CC_USE_ASM 0 #else #define CC_USE_ASM 1 @@ -327,11 +331,18 @@ #define CCN_ADDMUL1_ASM 1 #define CCN_MUL1_ASM 1 #define CCN_CMP_ASM 1 - #define CCN_ADD1_ASM 0 - #define CCN_SUB1_ASM 0 + #define CCN_ADD1_ASM 1 + #define CCN_SUB1_ASM 1 #define CCN_N_ASM 1 #define CCN_SET_ASM 1 #define CCN_SHIFT_RIGHT_ASM 1 + #if defined(__ARM_NEON__) + #define CCN_SHIFT_LEFT_ASM 1 + #else + #define CCN_SHIFT_LEFT_ASM 0 + #endif + #define CCN_MOD_224_ASM 1 + #define CCN_MULMOD_256_ASM 1 #define CCAES_ARM_ASM 1 #define CCAES_INTEL_ASM 0 #if CC_KERNEL || CC_USE_L4 || CC_IBOOT || CC_RTKIT || CC_RTKITROM || CC_USE_SEPROM || CC_USE_S3 @@ -344,14 +355,16 @@ #define CCSHA2_VNG_INTEL 0 #if defined(__ARM_NEON__) || CC_KERNEL - #define CCSHA1_VNG_ARMV7NEON 1 - #define CCSHA2_VNG_ARMV7NEON 1 + #define CCSHA1_VNG_ARM 1 + #define CCSHA2_VNG_ARM 1 #else /* !defined(__ARM_NEON__) */ - #define CCSHA1_VNG_ARMV7NEON 0 - #define CCSHA2_VNG_ARMV7NEON 0 + #define CCSHA1_VNG_ARM 0 + #define CCSHA2_VNG_ARM 0 #endif /* !defined(__ARM_NEON__) */ #define CCSHA256_ARMV6M_ASM 0 + #define CC_ACCELERATECRYPTO 1 + //-(2) ARM 64 #elif defined(__arm64__) && __clang__ && CC_USE_ASM #define CCN_DEDICATED_SQR CC_SMALL_CODE @@ -367,16 +380,21 @@ #define CCN_N_ASM 1 #define CCN_SET_ASM 0 #define CCN_SHIFT_RIGHT_ASM 1 + #define CCN_SHIFT_LEFT_ASM 1 + #define CCN_MOD_224_ASM 0 + #define CCN_MULMOD_256_ASM 1 #define CCAES_ARM_ASM 1 #define CCAES_INTEL_ASM 0 #define CCAES_MUX 0 // On 64bit SoC, asm is much faster than HW #define CCN_USE_BUILTIN_CLZ 1 #define CCSHA1_VNG_INTEL 0 #define CCSHA2_VNG_INTEL 0 - #define CCSHA1_VNG_ARMV7NEON 1 // reused this to avoid making change to xcode project, put arm64 assembly code with armv7 code - #define CCSHA2_VNG_ARMV7NEON 1 + #define CCSHA1_VNG_ARM 1 + #define CCSHA2_VNG_ARM 1 #define CCSHA256_ARMV6M_ASM 0 + #define CC_ACCELERATECRYPTO 1 + //-(3) Intel 32/64 #elif (defined(__x86_64__) || defined(__i386__)) && __clang__ && CC_USE_ASM #define CCN_DEDICATED_SQR 1 @@ -396,12 +414,16 @@ #define CCN_CMP_ASM 1 #define CCN_N_ASM 1 #define CCN_SHIFT_RIGHT_ASM 1 + #define CCN_SHIFT_LEFT_ASM 1 #else #define CCN_CMP_ASM 0 #define CCN_N_ASM 0 #define CCN_SHIFT_RIGHT_ASM 0 + #define CCN_SHIFT_LEFT_ASM 0 #endif + #define CCN_MOD_224_ASM 0 + #define CCN_MULMOD_256_ASM 0 #define CCN_ADDMUL1_ASM 0 #define CCN_MUL1_ASM 0 #define CCN_ADD1_ASM 0 @@ -413,10 +435,12 @@ #define CCN_USE_BUILTIN_CLZ 0 #define CCSHA1_VNG_INTEL 1 #define CCSHA2_VNG_INTEL 1 - #define CCSHA1_VNG_ARMV7NEON 0 - #define CCSHA2_VNG_ARMV7NEON 0 + #define CCSHA1_VNG_ARM 0 + #define CCSHA2_VNG_ARM 0 #define CCSHA256_ARMV6M_ASM 0 + #define CC_ACCELERATECRYPTO 1 + //-(4) disable assembly #else #if CCN_UINT128_SUPPORT_FOR_64BIT_ARCH @@ -436,16 +460,21 @@ #define CCN_N_ASM 0 #define CCN_SET_ASM 0 #define CCN_SHIFT_RIGHT_ASM 0 + #define CCN_SHIFT_LEFT_ASM 0 + #define CCN_MOD_224_ASM 0 + #define CCN_MULMOD_256_ASM 0 #define CCAES_ARM_ASM 0 #define CCAES_INTEL_ASM 0 #define CCAES_MUX 0 #define CCN_USE_BUILTIN_CLZ 0 #define CCSHA1_VNG_INTEL 0 #define CCSHA2_VNG_INTEL 0 - #define CCSHA1_VNG_ARMV7NEON 0 - #define CCSHA2_VNG_ARMV7NEON 0 + #define CCSHA1_VNG_ARM 0 + #define CCSHA2_VNG_ARM 0 #define CCSHA256_ARMV6M_ASM 0 + #define CC_ACCELERATECRYPTO 0 + #endif #define CC_INLINE static inline @@ -457,10 +486,12 @@ #define CC_NONNULL4 CC_NONNULL((4)) #define CC_NONNULL_ALL __attribute__((__nonnull__)) #define CC_SENTINEL __attribute__((__sentinel__)) + // Only apply the `CC_CONST` attribute to functions with no side-effects where the output is a strict function of pass by value input vars with no exterior side-effects. + // Specifically, do not apply CC_CONST if the function has any arguments that are pointers (directly, or indirectly) #define CC_CONST __attribute__((__const__)) #define CC_PURE __attribute__((__pure__)) #define CC_WARN_RESULT __attribute__((__warn_unused_result__)) - #define CC_MALLOC __attribute__((__malloc__)) + #define CC_MALLOC_CLEAR __attribute__((__malloc__)) #define CC_UNUSED __attribute__((unused)) #else /* !__GNUC__ */ /*! @parseOnly */ @@ -484,9 +515,24 @@ /*! @parseOnly */ #define CC_WARN_RESULT /*! @parseOnly */ - #define CC_MALLOC + #define CC_MALLOC_CLEAR #endif /* !__GNUC__ */ + +// Bridge differences between MachO and ELF compiler/assemblers. */ +#if CC_USE_ASM +#if CC_LINUX +#define CC_ASM_SECTION_CONST .rodata +#define CC_ASM_PRIVATE_EXTERN .hidden +#define CC_C_LABEL(_sym) _sym +#else /* !CC_LINUX */ +#define CC_ASM_SECTION_CONST .const +#define CC_ASM_PRIVATE_EXTERN .private_extern +#define CC_C_LABEL(_sym) _##_sym +#endif /* !CC_LINUX */ +#endif /* CC_USE_ASM */ + + // Enable FIPSPOST function tracing only when supported. */ #ifdef CORECRYPTO_POST_TRACE #define CC_FIPSPOST_TRACE 1 diff --git a/EXTERNAL_HEADERS/corecrypto/cc_error.h b/EXTERNAL_HEADERS/corecrypto/cc_error.h index 57b8ec70c..b382cc5c1 100644 --- a/EXTERNAL_HEADERS/corecrypto/cc_error.h +++ b/EXTERNAL_HEADERS/corecrypto/cc_error.h @@ -116,6 +116,30 @@ enum { CCPOST_INTEGRITY_ERROR = -74, // Output of the algo is not as expected CCPOST_KAT_FAILURE = -75, + + CCKPRNG_SEEDFILE_OPEN = -76, + CCKPRNG_SEEDFILE_READ = -78, + CCKPRNG_SEEDFILE_WRITE = -79, + CCKPRNG_SEEDFILE_CHMOD = -80, + CCKPRNG_SEEDFILE_CHOWN = -81, + CCKPRNG_RANDOMDEV_OPEN = -82, + CCKPRNG_RANDOMDEV_WRITE = -83, + CCKPRNG_GETENTROPY = -84, + + CCSAE_HUNTPECK_EXCEEDED_MAX_TRIALS = -85, + + CCERR_CALL_SEQUENCE = -86, + + CCVRF_POINT_DECODE_FAILURE = -87, + CCVRF_POINT_INVALID_PUBLIC_KEY = -88, + CCVRF_VERIFY_FAILURE = -89, + + // Error codes for Authenticated Encryption Modes + CCMODE_TAG_LENGTH_REQUEST_TOO_LONG = -100, + CCMODE_TAG_LENGTH_TOO_SHORT = -101, + CCMODE_NONCE_EMPTY = -102, + CCMODE_AD_EMPTY = -103, + CCMODE_DECRYPTION_OR_VERIFICATION_ERR=-104, }; #define CCDRBG_STATUS_OK CCERR_OK diff --git a/EXTERNAL_HEADERS/corecrypto/cc_priv.h b/EXTERNAL_HEADERS/corecrypto/cc_priv.h index 0a51e66ee..6a201eade 100644 --- a/EXTERNAL_HEADERS/corecrypto/cc_priv.h +++ b/EXTERNAL_HEADERS/corecrypto/cc_priv.h @@ -14,11 +14,22 @@ #include #include +// Fork handlers for the stateful components of corecrypto. +void cc_atfork_prepare(void); +void cc_atfork_parent(void); +void cc_atfork_child(void); + +#ifndef __has_builtin +#define __has_builtin(x) 0 +#endif + +#ifndef __DECONST +#define __DECONST(type, var) ((type)(uintptr_t)(const void *)(var)) +#endif + /* defines the following macros : - CC_MEMCPY : optimized memcpy. - CC_MEMMOVE : optimized memmove. - CC_MEMSET : optimized memset. + CC_ARRAY_LEN: returns the number of elements in an array CC_STORE32_BE : store 32 bit value in big endian in unaligned buffer. CC_STORE32_LE : store 32 bit value in little endian in unaligned buffer. @@ -45,12 +56,7 @@ CC_H2BE32 : convert a 32 bits value between host and big endian order. CC_H2LE32 : convert a 32 bits value between host and little endian order. -The following are not defined yet... define them if needed. - - CC_BSWAPc : byte swap a 32 bits constant - CC_BSWAP64 : byte swap a 64 bits variable - CC_BSWAP64c : byte swap a 64 bits constant CC_READ_LE32 : read a 32 bits little endian value @@ -62,10 +68,32 @@ The following are not defined yet... define them if needed. */ -/* TODO: optimized versions */ -#define CC_MEMCPY(D,S,L) memcpy((D),(S),(L)) -#define CC_MEMMOVE(D,S,L) memmove((D),(S),(L)) -#define CC_MEMSET(D,V,L) memset((D),(V),(L)) +// RTKitOSPlatform should replace CC_MEMCPY with memcpy +#define CC_MEMCPY(D,S,L) cc_memcpy((D),(S),(L)) +#define CC_MEMMOVE(D,S,L) cc_memmove((D),(S),(L)) +#define CC_MEMSET(D,V,L) cc_memset((D),(V),(L)) + +#if __has_builtin(__builtin___memcpy_chk) && !CC_RTKIT +#define cc_memcpy(dst, src, len) __builtin___memcpy_chk((dst), (src), (len), __builtin_object_size((dst), 1)) +#define cc_memcpy_nochk(dst, src, len) __builtin___memcpy_chk((dst), (src), (len), __builtin_object_size((dst), 0)) +#else +#define cc_memcpy(dst, src, len) memcpy((dst), (src), (len)) +#define cc_memcpy_nochk(dst, src, len) memcpy((dst), (src), (len)) +#endif + +#if __has_builtin(__builtin___memmove_chk) && !CC_RTKIT +#define cc_memmove(dst, src, len) __builtin___memmove_chk((dst), (src), (len), __builtin_object_size((dst), 1)) +#else +#define cc_memmove(dst, src, len) memmove((dst), (src), (len)) +#endif + +#if __has_builtin(__builtin___memset_chk) && !CC_RTKIT +#define cc_memset(dst, val, len) __builtin___memset_chk((dst), (val), (len), __builtin_object_size((dst), 1)) +#else +#define cc_memset(dst, val, len) memset((dst), (val), (len)) +#endif + +#define CC_ARRAY_LEN(x) (sizeof((x))/sizeof((x)[0])) // MARK: - Loads and Store @@ -327,32 +355,46 @@ CC_INLINE uint64_t CC_ROR64(uint64_t word, int i) // MARK: - Byte Swaps -CC_INLINE uint32_t CC_BSWAP(uint32_t x) +#if __has_builtin(__builtin_bswap32) +#define CC_BSWAP32(x) __builtin_bswap32(x) +#else +CC_INLINE uint32_t CC_BSWAP32(uint32_t x) { - return ( - ((x>>24)&0x000000FF) | - ((x<<24)&0xFF000000) | - ((x>>8) &0x0000FF00) | - ((x<<8) &0x00FF0000) - ); + return + ((x & 0xff000000) >> 24) | + ((x & 0x00ff0000) >> 8) | + ((x & 0x0000ff00) << 8) | + ((x & 0x000000ff) << 24); } +#endif -#define CC_BSWAP64(x) \ -((uint64_t)((((uint64_t)(x) & 0xff00000000000000ULL) >> 56) | \ -(((uint64_t)(x) & 0x00ff000000000000ULL) >> 40) | \ -(((uint64_t)(x) & 0x0000ff0000000000ULL) >> 24) | \ -(((uint64_t)(x) & 0x000000ff00000000ULL) >> 8) | \ -(((uint64_t)(x) & 0x00000000ff000000ULL) << 8) | \ -(((uint64_t)(x) & 0x0000000000ff0000ULL) << 24) | \ -(((uint64_t)(x) & 0x000000000000ff00ULL) << 40) | \ -(((uint64_t)(x) & 0x00000000000000ffULL) << 56))) +#if __has_builtin(__builtin_bswap64) +#define CC_BSWAP64(x) __builtin_bswap64(x) +#else +CC_INLINE uint64_t CC_BSWAP64(uint64_t x) +{ + return + ((x & 0xff00000000000000ULL) >> 56) | + ((x & 0x00ff000000000000ULL) >> 40) | + ((x & 0x0000ff0000000000ULL) >> 24) | + ((x & 0x000000ff00000000ULL) >> 8) | + ((x & 0x00000000ff000000ULL) << 8) | + ((x & 0x0000000000ff0000ULL) << 24) | + ((x & 0x000000000000ff00ULL) << 40) | + ((x & 0x00000000000000ffULL) << 56); +} +#endif #ifdef __LITTLE_ENDIAN__ -#define CC_H2BE32(x) CC_BSWAP(x) +#define CC_H2BE32(x) CC_BSWAP32(x) #define CC_H2LE32(x) (x) +#define CC_H2BE64(x) CC_BSWAP64(x) +#define CC_H2LE64(x) (x) #else #define CC_H2BE32(x) (x) -#define CC_H2LE32(x) CC_BSWAP(x) +#define CC_H2LE32(x) CC_BSWAP32(x) +#define CC_H2BE64(x) (x) +#define CC_H2LE64(x) CC_BSWAP64(x) #endif #define CC_READ_LE32(ptr) \ @@ -389,54 +431,156 @@ do { \ #define cc_byte(x, n) (((x) >> (8 * (n))) & 255) #endif +/* Count leading zeros (for nonzero inputs) */ + +/* + * On i386 and x86_64, we know clang and GCC will generate BSR for + * __builtin_clzl. This instruction IS NOT constant time on all micro- + * architectures, but it *is* constant time on all micro-architectures that + * have been used by Apple, and we expect that to continue to be the case. + * + * When building for x86_64h with clang, this produces LZCNT, which is exactly + * what we want. + * + * On arm and arm64, we know that clang and GCC generate the constant-time CLZ + * instruction from __builtin_clzl( ). + */ + +#if defined(_WIN32) +/* We use the Windows implementations below. */ +#elif defined(__x86_64__) || defined(__i386__) || defined(__arm64__) || defined(__arm__) +/* We use a thought-to-be-good version of __builtin_clz. */ +#elif defined __GNUC__ +#warning Using __builtin_clz() on an unknown architecture; it may not be constant-time. +/* If you find yourself seeing this warning, file a radar for someone to + * check whether or not __builtin_clz() generates a constant-time + * implementation on the architecture you are targeting. If it does, append + * the name of that architecture to the list of "safe" architectures above. */ */ +#endif + + +#if defined(_WIN32) + +#include +#include + +CC_INLINE CC_CONST unsigned clz64_win(uint64_t value) +{ + DWORD leading_zero; + _BitScanReverse64(&leading_zero, value); + return 63 - leading_zero; +} + + +CC_INLINE CC_CONST unsigned clz32_win(uint32_t value) +{ + DWORD leading_zero; + _BitScanReverse(&leading_zero, value); + return 31 - leading_zero; +} + +#endif + +CC_INLINE CC_CONST unsigned cc_clz32_fallback(uint32_t data) +{ + unsigned int b = 0; + unsigned int bit = 0; + // Work from LSB to MSB + for (int i = 0; i < 32; i++) { + bit = (data >> i) & 1; + // If the bit is 0, update the "leading bits are zero" counter "b". + b += (1 - bit); + /* If the bit is 0, (bit - 1) is 0xffff... therefore b is retained. + * If the bit is 1, (bit - 1) is 0 therefore b is set to 0. + */ + b &= (bit - 1); + } + return b; +} + +CC_INLINE CC_CONST unsigned cc_clz64_fallback(uint64_t data) +{ + unsigned int b = 0; + unsigned int bit = 0; + // Work from LSB to MSB + for (int i = 0; i < 64; i++) { + bit = (data >> i) & 1; + // If the bit is 0, update the "leading bits are zero" counter. + b += (1 - bit); + /* If the bit is 0, (bit - 1) is 0xffff... therefore b is retained. + * If the bit is 1, (bit - 1) is 0 therefore b is set to 0. + */ + b &= (bit - 1); + } + return b; +} + +/*! + @function cc_clz32 + @abstract Count leading zeros of a nonzero 32-bit value + + @param data A nonzero 32-bit value + + @result Count of leading zeros of @p data + + @discussion @p data is assumed to be nonzero. +*/ +CC_INLINE CC_CONST unsigned cc_clz32(uint32_t data) { +#if defined(_WIN32) + return clz32_win(data); +#elif defined(__x86_64__) || defined(__i386__) || defined(__arm64__) || defined(__arm__) || defined(__GNUC__) + cc_static_assert(sizeof(unsigned) == 4, "clz relies on an unsigned int being 4 bytes"); + return (unsigned)__builtin_clz(data); +#else + return cc_clz32_fallback(data); +#endif +} + +/*! + @function cc_clz64 + @abstract Count leading zeros of a nonzero 64-bit value + + @param data A nonzero 64-bit value + + @result Count of leading zeros of @p data + + @discussion @p data is assumed to be nonzero. +*/ +CC_INLINE CC_CONST unsigned cc_clz64(uint64_t data) { +#if defined(_WIN32) + return clz64_win(data); +#elif defined(__x86_64__) || defined(__i386__) || defined(__arm64__) || defined(__arm__) || defined(__GNUC__) + return (unsigned)__builtin_clzll(data); +#else + return cc_clz64_fallback(data); +#endif +} + /* HEAVISIDE_STEP (shifted by one) - function f(x): x->0, when x=0 + function f(x): x->0, when x=0 x->1, when x>0 - Can also be seen as a bitwise operation: + Can also be seen as a bitwise operation: f(x): x -> y y[0]=(OR x[i]) for all i (all bits) y[i]=0 for all i>0 - Run in constant time (log2()) + Run in constant time (log2()) Useful to run constant time checks */ -#define HEAVISIDE_STEP_UINT64(r,s) {uint64_t _t=s; \ - _t=(((_t)>>32) | (_t)); \ - _t=(0xFFFFFFFF + (_t & 0xFFFFFFFF)); \ - r=_t >> 32;} - -#define HEAVISIDE_STEP_UINT32(r,s) {uint32_t _t=s; \ - _t=(((_t)>>16) | (_t)); \ - _t=(0xFFFF + (_t & 0xFFFF)); \ - r=_t >> 16;} - -#define HEAVISIDE_STEP_UINT16(r,s) {uint32_t _t=s; \ - _t=(0xFFFF + ((_t) & 0xFFFF)); \ - r=_t >> 16;} - -#define HEAVISIDE_STEP_UINT8(r,s) {uint16_t _t=s; \ - _t=(0xFF + ((_t) & 0xFF)); \ - r=_t >> 8;} - -#define CC_HEAVISIDE_STEP(r,s) { \ - if (sizeof(s) == 1) {HEAVISIDE_STEP_UINT8(r,s);} \ - else if (sizeof(s) == 2) {HEAVISIDE_STEP_UINT16(r,s);} \ - else if (sizeof(s) == 4) {HEAVISIDE_STEP_UINT32(r,s);} \ - else if (sizeof(s) == 8) {HEAVISIDE_STEP_UINT64(r,s);} \ - else {r=(((s)==0)?0:1);} \ - } +#define CC_HEAVISIDE_STEP(r, s) { \ + const uint64_t _s = (uint64_t)s; \ + const uint64_t _t = (_s & 0xffffffff) | (_s >> 32); \ + r = (__typeof__(r))((0xffffffff + _t) >> 32); \ +} /* Return 1 if x mod 4 =1,2,3, 0 otherwise */ #define CC_CARRY_2BITS(x) (((x>>1) | x) & 0x1) #define CC_CARRY_3BITS(x) (((x>>2) | (x>>1) | x) & 0x1) -/* Set a variable to the biggest power of 2 which can be represented */ +/* Set a variable to the biggest power of 2 which can be represented */ #define MAX_POWER_OF_2(x) ((__typeof__(x))1<<(8*sizeof(x)-1)) #define cc_ceiling(a,b) (((a)+((b)-1))/(b)) #define CC_BITLEN_TO_BYTELEN(x) cc_ceiling((x), 8) -//cc_try_abort() is implemented to comply with FIPS 140-2. See radar 19129408 -void cc_try_abort(const char * msg , ...); - /*! @brief cc_muxp(s, a, b) is equivalent to z = s ? a : b, but it executes in constant time @param a input pointer @@ -447,30 +591,56 @@ void cc_try_abort(const char * msg , ...); void *cc_muxp(int s, const void *a, const void *b); /*! - @brief cc_mux2p - @param a input pointer - @param b input pointer - @param r_true output pointer: if s is integer 1 r_true=a is returned, otherwise r_true=b - @param r_false output pointer: if s is integer 1 r_false=b is returned, otherwise r_false=a - @param s The selection parameter s must be 0 or 1. - @discussion Executes in constant time + @brief CC_MUXU(r, s, a, b) is equivalent to r = s ? a : b, but executes in constant time + @param a Input a + @param b Input b + @param s Selection parameter s. Must be 0 or 1. + @param r Output, set to a if s=1, or b if s=0. */ -void cc_mux2p(int s, void **r_true, void **r_false, const void *a, const void *b); +#define CC_MUXU(r, s, a, b) \ +{ \ + __typeof__(r) _cond = ((__typeof__(r))(s)-(__typeof__(r))1); \ + r = (~_cond&(a))|(_cond&(b)); \ +} + +#define CC_PROVIDES_ABORT (!(CC_USE_SEPROM || CC_USE_S3 || CC_BASEBAND || CC_EFI || CC_IBOOT || CC_RTKITROM)) /*! - @brief CC_MUXU(s, a, b) is equivalent to z = s ? a : b, but it executes in constant time - @param a input unsigned type - @param b input unsigned type - @param s The selection parameter s must be 0 or 1. if s is integer 1 a is returned. If s is integer 0, b is returned. Otherwise, the output is undefined. - @param r output - @return r = a, if s is 1 and b if s is 0 + @function cc_abort + @abstract Abort execution unconditionally */ -#define CC_MUXU(r, s, a, b) \ -{ \ - __typeof__(r) _cond = ((__typeof__(r))(s)-(__typeof__(r))1); \ - r = (~_cond&(a))|(_cond&(b)); \ +CC_NORETURN +void cc_abort(const char *msg); + +/*! + @function cc_try_abort + @abstract Abort execution iff the platform provides a function like @p abort() or @p panic() + + @discussion If the platform does not provide a means to abort execution, this function does nothing; therefore, callers should return an error code after calling this function. +*/ +#if CC_PROVIDES_ABORT + +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wmissing-noreturn" + +CC_INLINE +void cc_try_abort(const char *msg) +{ + cc_abort(msg); } +#pragma clang diagnostic pop + +#else + +CC_INLINE +void cc_try_abort(CC_UNUSED const char *msg) +{ + +} + +#endif + /* Unfortunately, since we export this symbol, this declaration needs to be in a public header to satisfy TAPI. diff --git a/EXTERNAL_HEADERS/corecrypto/cc_runtime_config.h b/EXTERNAL_HEADERS/corecrypto/cc_runtime_config.h index 0d7ac5289..996accee1 100644 --- a/EXTERNAL_HEADERS/corecrypto/cc_runtime_config.h +++ b/EXTERNAL_HEADERS/corecrypto/cc_runtime_config.h @@ -14,8 +14,22 @@ #include /* Only intel systems have these runtime switches today. */ -#if (CCSHA1_VNG_INTEL || CCSHA2_VNG_INTEL || CCAES_INTEL_ASM) \ - && (defined(__x86_64__) || defined(__i386__)) + +#if defined(__x86_64__) || defined(__i386__) + +#if CC_KERNEL + #include + #define CC_HAS_RDRAND() ((cpuid_features() & CPUID_FEATURE_RDRAND) != 0) +#elif CC_XNU_KERNEL_AVAILABLE + #include + + extern int _cpu_capabilities; + #define CC_HAS_RDRAND() (_cpu_capabilities & kHasRDRAND) +#else + #define CC_HAS_RDRAND() 0 +#endif + +#if (CCSHA1_VNG_INTEL || CCSHA2_VNG_INTEL || CCAES_INTEL_ASM) #if CC_KERNEL #include @@ -26,11 +40,7 @@ #define CC_HAS_AVX512_AND_IN_KERNEL() ((cpuid_info()->cpuid_leaf7_features & CPUID_LEAF7_FEATURE_AVX512F) !=0) #elif CC_XNU_KERNEL_AVAILABLE - # include - - #ifndef kHasAVX2_0 /* 10.8 doesn't have kHasAVX2_0 defined */ - #define kHasAVX2_0 0 - #endif + #include extern int _cpu_capabilities; #define CC_HAS_AESNI() (_cpu_capabilities & kHasAES) @@ -46,6 +56,8 @@ #define CC_HAS_AVX512_AND_IN_KERNEL() 0 #endif -#endif /* !(defined(__x86_64__) || defined(__i386__)) */ +#endif // (CCSHA1_VNG_INTEL || CCSHA2_VNG_INTEL || CCAES_INTEL_ASM) + +#endif // defined(__x86_64__) || defined(__i386__) #endif /* CORECRYPTO_CC_RUNTIME_CONFIG_H_ */ diff --git a/EXTERNAL_HEADERS/corecrypto/ccaes.h b/EXTERNAL_HEADERS/corecrypto/ccaes.h index 281c99d22..9c664b842 100644 --- a/EXTERNAL_HEADERS/corecrypto/ccaes.h +++ b/EXTERNAL_HEADERS/corecrypto/ccaes.h @@ -122,4 +122,7 @@ const struct ccmode_ofb *ccaes_ofb_crypt_mode(void); const struct ccmode_siv *ccaes_siv_encrypt_mode(void); const struct ccmode_siv *ccaes_siv_decrypt_mode(void); +const struct ccmode_siv_hmac *ccaes_siv_hmac_sha256_encrypt_mode(void); +const struct ccmode_siv_hmac *ccaes_siv_hmac_sha256_decrypt_mode(void); + #endif /* _CORECRYPTO_CCAES_H_ */ diff --git a/EXTERNAL_HEADERS/corecrypto/cccmac.h b/EXTERNAL_HEADERS/corecrypto/cccmac.h index d2e018143..e29e543dd 100644 --- a/EXTERNAL_HEADERS/corecrypto/cccmac.h +++ b/EXTERNAL_HEADERS/corecrypto/cccmac.h @@ -24,7 +24,7 @@ struct cccmac_ctx { size_t block_nbytes; // Number of byte occupied in block size_t cumulated_nbytes; // Total size processed const struct ccmode_cbc *cbc; - uint8_t ctx[8]; + uint8_t ctx[1]; } CC_ALIGNED(8);// cccmac_ctx_hdr; typedef struct cccmac_ctx* cccmac_ctx_t; diff --git a/EXTERNAL_HEADERS/corecrypto/ccdigest.h b/EXTERNAL_HEADERS/corecrypto/ccdigest.h index 52ee15123..fa2b765f9 100644 --- a/EXTERNAL_HEADERS/corecrypto/ccdigest.h +++ b/EXTERNAL_HEADERS/corecrypto/ccdigest.h @@ -85,15 +85,6 @@ struct ccdigest_info { #define ccdigest_u64(_state_) (&((ccdigest_state_t)(_state_))->state.u64) #define ccdigest_ccn(_state_) (&((ccdigest_state_t)(_state_))->state.ccn) -/* We could just use memcpy instead of this special macro, but this allows us - to use the optimized ccn_set() assembly routine if we have one, which for - 32 bit arm is about 200% quicker than generic memcpy(). */ -#if CCN_SET_ASM && CCN_UNIT_SIZE <= 4 -#define ccdigest_copy_state(_di_, _dst_, _src_) ccn_set((_di_)->state_size / CCN_UNIT_SIZE, _dst_, _src_) -#else -#define ccdigest_copy_state(_di_, _dst_, _src_) CC_MEMCPY(_dst_, _src_, (_di_)->state_size) -#endif - void ccdigest_init(const struct ccdigest_info *di, ccdigest_ctx_t ctx); void ccdigest_update(const struct ccdigest_info *di, ccdigest_ctx_t ctx, size_t len, const void *data); @@ -117,9 +108,6 @@ void ccdigest(const struct ccdigest_info *di, size_t len, #define CC_DIGEST_OID_SHA256 OID_DEF("\x06\x09\x60\x86\x48\x01\x65\x03\x04\x02\x01") #define CC_DIGEST_OID_SHA384 OID_DEF("\x06\x09\x60\x86\x48\x01\x65\x03\x04\x02\x02") #define CC_DIGEST_OID_SHA512 OID_DEF("\x06\x09\x60\x86\x48\x01\x65\x03\x04\x02\x03") -#define CC_DIGEST_OID_RMD128 OID_DEF("\x06\x06\x28\xCF\x06\x03\x00\x32") #define CC_DIGEST_OID_RMD160 OID_DEF("\x06\x05\x2B\x24\x03\x02\x01") -#define CC_DIGEST_OID_RMD256 OID_DEF("\x06\x05\x2B\x24\x03\x02\x03") -#define CC_DIGEST_OID_RMD320 OID_DEF(NULL) #endif /* _CORECRYPTO_CCDIGEST_H_ */ diff --git a/EXTERNAL_HEADERS/corecrypto/ccdigest_priv.h b/EXTERNAL_HEADERS/corecrypto/ccdigest_priv.h index 9d42de519..8061c5faf 100644 --- a/EXTERNAL_HEADERS/corecrypto/ccdigest_priv.h +++ b/EXTERNAL_HEADERS/corecrypto/ccdigest_priv.h @@ -11,6 +11,7 @@ #ifndef _CORECRYPTO_CCDIGEST_PRIV_H_ #define _CORECRYPTO_CCDIGEST_PRIV_H_ +#include #include #include @@ -26,4 +27,6 @@ typedef const struct ccdigest_info *(ccdigest_lookup)(ccoid_t oid); #include const struct ccdigest_info *ccdigest_oid_lookup(ccoid_t oid, ...); +#define ccdigest_copy_state(_di_, _dst_, _src_) cc_memcpy_nochk(_dst_, _src_, (_di_)->state_size) + #endif /* _CORECRYPTO_CCDIGEST_PRIV_H_ */ diff --git a/EXTERNAL_HEADERS/corecrypto/ccdrbg.h b/EXTERNAL_HEADERS/corecrypto/ccdrbg.h index 7717d0c03..14db0a16b 100644 --- a/EXTERNAL_HEADERS/corecrypto/ccdrbg.h +++ b/EXTERNAL_HEADERS/corecrypto/ccdrbg.h @@ -32,7 +32,7 @@ #define CCDRBG_MAX_ADDITIONALINPUT_SIZE ((uint32_t)1<<16) #define CCDRBG_MAX_PSINPUT_SIZE ((uint32_t)1<<16) #define CCDRBG_MAX_REQUEST_SIZE ((uint32_t)1<<16) //this is the absolute maximum in NIST 800-90A -#define CCDRBG_RESEED_INTERVAL ((uint64_t)1<<30) // must be able to fit the NIST maximum of 2^48 +#define CCDRBG_RESEED_INTERVAL ((uint64_t)1<<48) // must be able to fit the NIST maximum of 2^48 /* diff --git a/EXTERNAL_HEADERS/corecrypto/cchmac.h b/EXTERNAL_HEADERS/corecrypto/cchmac.h index 048c0de14..3b6ac339b 100644 --- a/EXTERNAL_HEADERS/corecrypto/cchmac.h +++ b/EXTERNAL_HEADERS/corecrypto/cchmac.h @@ -16,12 +16,12 @@ /* An hmac_ctx_t is normally allocated as an array of these. */ struct cchmac_ctx { - uint8_t b[8]; + uint8_t b[1]; } CC_ALIGNED(8); typedef struct cchmac_ctx* cchmac_ctx_t; -#define cchmac_ctx_size(STATE_SIZE, BLOCK_SIZE) (ccdigest_ctx_size(STATE_SIZE, BLOCK_SIZE) + (STATE_SIZE)) +#define cchmac_ctx_size(STATE_SIZE, BLOCK_SIZE) (cc_pad_align(ccdigest_ctx_size(STATE_SIZE, BLOCK_SIZE)) + (STATE_SIZE)) #define cchmac_di_size(_di_) (cchmac_ctx_size((_di_)->state_size, (_di_)->block_size)) #define cchmac_ctx_n(STATE_SIZE, BLOCK_SIZE) ccn_nof_size(cchmac_ctx_size((STATE_SIZE), (BLOCK_SIZE))) @@ -35,7 +35,7 @@ typedef struct cchmac_ctx* cchmac_ctx_t; #define cchmac_digest_ctx(_di_, HC) ((ccdigest_ctx_t)(HC)) /* Accesors for ostate fields, this is all cchmac_ctx_t adds to the ccdigest_ctx_t. */ -#define cchmac_ostate(_di_, HC) ((struct ccdigest_state *)(((cchmac_ctx_t)(HC))->b + ccdigest_di_size(_di_))) +#define cchmac_ostate(_di_, HC) ((struct ccdigest_state *)(((cchmac_ctx_t)(HC))->b + cc_pad_align(ccdigest_di_size(_di_)))) #define cchmac_ostate8(_di_, HC) (ccdigest_u8(cchmac_ostate(_di_, HC))) #define cchmac_ostate32(_di_, HC) (ccdigest_u32(cchmac_ostate(_di_, HC))) #define cchmac_ostate64(_di_, HC) (ccdigest_u64(cchmac_ostate(_di_, HC))) diff --git a/EXTERNAL_HEADERS/corecrypto/cckprng.h b/EXTERNAL_HEADERS/corecrypto/cckprng.h index 5e5bfcacd..edcff9a61 100644 --- a/EXTERNAL_HEADERS/corecrypto/cckprng.h +++ b/EXTERNAL_HEADERS/corecrypto/cckprng.h @@ -11,10 +11,15 @@ #ifndef _CORECRYPTO_CCKPRNG_H_ #define _CORECRYPTO_CCKPRNG_H_ +#include + #include +#define CCKPRNG_YARROW 0 + +#if CCKPRNG_YARROW + typedef struct PRNG *PrngRef; -typedef struct cckprng_ctx *cckprng_ctx_t; struct cckprng_ctx { PrngRef prng; @@ -25,59 +30,326 @@ struct cckprng_ctx { #define CCKPRNG_ENTROPY_INTERVAL (1 << 14) #define CCKPRNG_RESEED_NTICKS 50 +typedef struct cckprng_ctx *cckprng_ctx_t; + +#else + +// This is a Fortuna-inspired PRNG. While it differs from Fortuna in +// many minor details, the biggest difference is its support for +// multiple independent output generators. This is to make it suitable +// for use in concurrent environments. +// +// This PRNG targets a 256-bit security level. +// +// First, the user should call cckprng_init. The user must specify the +// maximum number of output generators that might be +// needed. (Typically, users should align this argument with the +// number of available CPUs.) +// +// The user must also provide a read-only handle to an entropy +// source. This is a fixed-size buffer that will receive entropy +// updates out of band from the PRNG (e.g. in an interrupt +// handler). The PRNG will consume entropy from this buffer according +// to an internal schedule driven by calls to cckprng_refresh (see +// below). +// +// The user should call cckprng_initgen for as many output generators +// as are needed. The numeric argument is an identifier to be reused +// during calls to cckprng_generate (see below) and must be less than +// the maximum number of generators specified to cckprng_init. +// +// After initialization, the user is free to call cckprng_generate to +// generate random bytes. The user must specify the generator in this +// call using a numeric identifier passed in the call to +// cckprng_initgen. +// +// Output generation is limited to 256 bytes per request. Users should +// make multiple requests if more output is needed. +// +// The user is expected to call cckprng_refresh regularly. This +// function consumes entropy and mixes it into the output generators +// according to an internal schedule. +// +// This implementation is thread-safe. Internally, a set of mutexes +// guard access to internal state. Most functions rely on a single +// mutex to protect shared state. The main exception is the +// cckprng_generate function, which uses a per-generator mutex to +// allow concurrent output generation on different threads. +// +// Another important exception is cckprng_refresh. While this function +// relies on the shared mutex, it returns immediately if it cannot +// acquire it. +// +// The PRNG also supports user-initiated reseeds. This is to support a +// user-writable random device. +// +// This PRNG supports reseeds concurrent with output generation, +// i.e. it is safe to call cckprng_reseed or cckprng_refresh while +// another thread is calling cckprng_generate. + +#define CCKPRNG_NPOOLS 32 +#define CCKPRNG_SEED_NBYTES 32 +#define CCKPRNG_POOL_NBYTES 32 +#define CCKPRNG_KEY_NBYTES 32 + +struct cckprng_gen_diag { + // The number of times this generator has been rekeyed from the master seed + uint64_t nrekeys; + + // The number of requests this generator has fulfilled + uint64_t out_nreqs; + + // The total number of bytes this generator has generated over all requests + uint64_t out_nbytes; + + // The maximum number of bytes this generator has generated in any one request + uint64_t out_nbytes_req_max; + + // The total number of bytes this generator has generated since the last rekey + uint64_t out_nbytes_key; + + // The maximum total number of bytes this generator has generated between two rekeys + uint64_t out_nbytes_key_max; +}; + +struct cckprng_pool_diag { + // The number of samples currently resident in the pool + uint64_t nsamples; + + // The number of times this pool has been drained in a reseed + uint64_t ndrains; + + // The maximum number of samples this pool has held at any one time + uint64_t nsamples_max; +}; + +struct cckprng_diag { + // The number of reseeds via user input (e.g. by writing to /dev/random) + uint64_t userreseed_nreseeds; + + // The number of reseeds via the scheduler + uint64_t schedreseed_nreseeds; + + // The maximum number of samples included in any one scheduler reseed + uint64_t schedreseed_nsamples_max; + + // The maximum number of samples included in any one entropy input + uint64_t addentropy_nsamples_max; + + // Diagnostics corresponding to individual output generators + unsigned ngens; + struct cckprng_gen_diag *gens; + + // Diagnostics corresponding to internal entropy pools + struct cckprng_pool_diag pools[CCKPRNG_NPOOLS]; +}; + +#if CC_KERNEL + +#include + +typedef lck_grp_t *cckprng_lock_group; +typedef lck_mtx_t *cckprng_lock_mutex; + +struct cckprng_lock_ctx { + cckprng_lock_group group; + cckprng_lock_mutex mutex; +}; + +#else + +#include + +typedef os_unfair_lock cckprng_lock_mutex; + +struct cckprng_lock_ctx { + cckprng_lock_mutex mutex; +}; + +#endif + +struct cckprng_key_ctx { + uint8_t data[CCKPRNG_KEY_NBYTES]; +}; + +struct cckprng_gen_ctx { + // We maintain two keys (one live and one idle) to allow + // concurrent generation and reseeding + struct cckprng_key_ctx keys[2]; + _Atomic unsigned swap; + unsigned key_live_idx; + unsigned key_idle_idx; + + // A counter used in CTR mode + uint8_t ctr[16]; + + // Whether the generator has been initialized + bool init; + + // A mutex governing this generator's state (but note the idle key + // context is under control of the PRNG's shared mutex) + struct { + cckprng_lock_mutex mutex; + } lock; +}; + +struct cckprng_pool_ctx { + uint8_t data[CCKPRNG_POOL_NBYTES]; +}; + +// This is a handle to an "entropy buffer" to be managed externally +// (i.e. in xnu). This is a non-cryptographic +// accumulator. Practically, the buffer is filled with timestamps +// collected during interrupts. The existing state of the buffer is +// rotated and new timestamps are added in. A counter of raw timing +// samples is also managed externally. The buffer and the counter are +// both subject to data races, which we tolerate. + +struct cckprng_entropybuf { + + // A read-only handle to an "entropy buffer" (a non-cryptographic accumulator) to be managed externally + const void *buf; + + // The size of the entropy buffer + size_t nbytes; + + // A read-only handle to a count of raw samples in the buffer + const uint32_t *nsamples; + + // The count of raw samples in the buffer at time of last read + uint32_t nsamples_last; +}; + +struct cckprng_sched_ctx { + // A counter governing the set of entropy pools to drain + uint64_t reseed_sched; + + // A timestamp from the last reseed + uint64_t reseed_last; + + // An index used to add entropy to pools in a round-robin style + unsigned pool_idx; +}; + +struct cckprng_ctx { + + // The master secret of the PRNG + uint8_t seed[CCKPRNG_SEED_NBYTES]; + + // State used to schedule entropy consumption and reseeds + struct cckprng_sched_ctx sched; + + // A mutex governing access to shared state + struct cckprng_lock_ctx lock; + + // The maximum number of generators that may be allocated + unsigned max_ngens; + + // An array of output generators (allocated dynamically) of length max_ngens + struct cckprng_gen_ctx *gens; + + // A set of entropy pools + struct cckprng_pool_ctx pools[CCKPRNG_NPOOLS]; + + // A handle to an entropy source managed externally + struct cckprng_entropybuf entropybuf; + + // Diagnostics for the PRNG + struct cckprng_diag diag; +}; + +// This collection of function pointers is just a convenience for +// registering the PRNG with xnu +struct cckprng_funcs { + void (*init)(struct cckprng_ctx *ctx, + unsigned max_ngens, + size_t entropybuf_nbytes, + const void *entropybuf, + const uint32_t *entropybuf_nsamples, + size_t seed_nbytes, + const void *seed, + size_t nonce_nbytes, + const void *nonce); + void (*initgen)(struct cckprng_ctx *ctx, unsigned gen_idx); + void (*reseed)(struct cckprng_ctx *ctx, size_t nbytes, const void *seed); + void (*refresh)(struct cckprng_ctx *ctx); + void (*generate)(struct cckprng_ctx *ctx, unsigned gen_idx, size_t nbytes, void *out); +}; + +#endif + /* @function cckprng_init @abstract Initialize a kernel PRNG context. @param ctx Context for this instance - @param nbytes Length of the seed in bytes + @param max_ngens Maximum count of generators that may be allocated + @param entropybuf_nbytes Length of the entropy buffer in bytes + @param entropybuf Read-only pointer to a long-lived entropy buffer + @param entropybuf_nsamples Read-only pointer to a counter of samples in the entropy buffer + @param seed_nbytes Length of the seed in bytes @param seed Pointer to a high-entropy seed + @param nonce_nbytes Length of the nonce in bytes + @param seed Pointer to a single-use nonce + + @discussion @p max_ngens should be set based on an upper bound of CPUs available on the device. The entropy buffer should be managed outside the PRNG and updated continuously (e.g. by an interrupt handler). The count of samples in the entropy buffer needn't be better than a rough estimate. +*/ +void cckprng_init(struct cckprng_ctx *ctx, + unsigned max_ngens, + size_t entropybuf_nbytes, + const void *entropybuf, + const uint32_t *entropybuf_nsamples, + size_t seed_nbytes, + const void *seed, + size_t nonce_nbytes, + const void *nonce); + +/* + @function cckprng_initgen + @abstract Initialize an output generator. + + @param ctx Context for this instance + @param gen_idx Index of the generator - @result @p CCKPRNG_OK iff successful. Panic on @p CCKPRNG_ABORT. + @discussion @p gen_idx must be less than @p max_ngens provided to @cckprng_init and must be unique within the lifetime of a PRNG context. This function will abort if these contracts are violated. */ -int cckprng_init(cckprng_ctx_t ctx, size_t nbytes, const void *seed); +void cckprng_initgen(struct cckprng_ctx *ctx, unsigned gen_idx); /* @function cckprng_reseed - @abstract Reseed a kernel PRNG context immediately. + @abstract Reseed a kernel PRNG context with a user-supplied seed. @param ctx Context for this instance @param nbytes Length of the seed in bytes @param seed Pointer to a high-entropy seed - @result @p CCKPRNG_OK iff successful. Panic on @p CCKPRNG_ABORT. + @discussion It is safe to expose this function to attacker-controlled requests (e.g. writes to /dev/random). */ -int cckprng_reseed(cckprng_ctx_t ctx, size_t nbytes, const void *seed); +void cckprng_reseed(struct cckprng_ctx *ctx, size_t nbytes, const void *seed); /* - @function cckprng_addentropy - @abstract Add entropy to a kernel PRNG context. + @function cckprng_refresh + @abstract Consume entropy and reseed according to an internal schedule. @param ctx Context for this instance - @param nbytes Length of the input entropy in bytes - @param seed Pointer to input entropy - @result @p CCKPRNG_OK iff successful. Panic on @p CCKPRNG_ABORT. - - @discussion Input entropy is stored internally and consumed at the - opportune moment. This will not necessarily be before the next call - to @p cckprng_generate. To force an immediate reseed, call @p - cckprng_reseed. + @discussion This function should be called on a regular basis. (For example, it is reasonable to call this inline before a call to @p cckprng_generate.) This function will not necessarily consume entropy or reseed the internal state on any given invocation. To force an immediate reseed, call @p cckprng_reseed. */ -int cckprng_addentropy(cckprng_ctx_t ctx, size_t nbytes, const void *entropy); +void cckprng_refresh(struct cckprng_ctx *ctx); + +#define CCKPRNG_GENERATE_MAX_NBYTES 256 /* @function cckprng_generate @abstract Generate random values for use in applications. @param ctx Context for this instance + @param gen_idx Index of the output generator @param nbytes Length of the desired output in bytes - @param seed Pointer to the output buffer + @param out Pointer to the output buffer - @result @p CCKPRNG_OK iff successful. Panic on @p - CCKPRNG_ABORT. Provide input to @p cckprng_addentropy on @p - CCKPRNG_NEED_ENTROPY. + @discussion @p gen_idx must be a previous argument to @p cckprng_initgen. @p nbytes must be less than or equal to @p CCKPRNG_GENERATE_MAX_NBYTES. (Callers may invoke this function in a loop to generate larger outputs.) This function will abort if these contracts are violated. */ -int cckprng_generate(cckprng_ctx_t ctx, size_t nbytes, void *out); +void cckprng_generate(struct cckprng_ctx *ctx, unsigned gen_idx, size_t nbytes, void *out); #endif /* _CORECRYPTO_CCKPRNG_H_ */ diff --git a/EXTERNAL_HEADERS/corecrypto/ccmode.h b/EXTERNAL_HEADERS/corecrypto/ccmode.h index 191460b9b..f4aa20a99 100644 --- a/EXTERNAL_HEADERS/corecrypto/ccmode.h +++ b/EXTERNAL_HEADERS/corecrypto/ccmode.h @@ -14,6 +14,7 @@ #include #include #include +#include /* ECB mode. */ @@ -29,36 +30,35 @@ CC_INLINE size_t ccecb_context_size(const struct ccmode_ecb *mode) CC_INLINE size_t ccecb_block_size(const struct ccmode_ecb *mode) { - return mode->block_size; + return mode->block_size; } -CC_INLINE int ccecb_init(const struct ccmode_ecb *mode, ccecb_ctx *ctx, - size_t key_len, const void *key) +CC_INLINE int ccecb_init(const struct ccmode_ecb *mode, ccecb_ctx *ctx, size_t key_len, const void *key) { return mode->init(mode, ctx, key_len, key); } -CC_INLINE int ccecb_update(const struct ccmode_ecb *mode, const ccecb_ctx *ctx, - size_t nblocks, const void *in, void *out) +CC_INLINE int ccecb_update(const struct ccmode_ecb *mode, const ccecb_ctx *ctx, size_t nblocks, const void *in, void *out) { - return mode->ecb(ctx, nblocks, in, out); + return mode->ecb(ctx, nblocks, in, out); } -CC_INLINE int ccecb_one_shot(const struct ccmode_ecb *mode, - size_t key_len, const void *key, - size_t nblocks, const void *in, void *out) +CC_INLINE int +ccecb_one_shot(const struct ccmode_ecb *mode, size_t key_len, const void *key, size_t nblocks, const void *in, void *out) { int rc; - ccecb_ctx_decl(mode->size, ctx); - rc = mode->init(mode, ctx, key_len, key); - mode->ecb(ctx, nblocks, in, out); - ccecb_ctx_clear(mode->size, ctx); + ccecb_ctx_decl(mode->size, ctx); + rc = mode->init(mode, ctx, key_len, key); + if (rc == 0) { + rc = mode->ecb(ctx, nblocks, in, out); + } + ccecb_ctx_clear(mode->size, ctx); return rc; } /* CBC mode. */ -/* The CBC interface changed due to rdar://11468135. This macros is to indicate +/* The CBC interface changed due to rdar://11468135. This macros is to indicate to client which CBC API is implemented. Clients can support old versions of corecrypto at build time using this. */ @@ -89,36 +89,36 @@ CC_INLINE size_t cccbc_context_size(const struct ccmode_cbc *mode) CC_INLINE size_t cccbc_block_size(const struct ccmode_cbc *mode) { - return mode->block_size; + return mode->block_size; } -CC_INLINE int cccbc_init(const struct ccmode_cbc *mode, cccbc_ctx *ctx, - size_t key_len, const void *key) +CC_INLINE int cccbc_init(const struct ccmode_cbc *mode, cccbc_ctx *ctx, size_t key_len, const void *key) { return mode->init(mode, ctx, key_len, key); } -CC_INLINE int cccbc_set_iv(const struct ccmode_cbc *mode, cccbc_iv *iv_ctx, - const void *iv) +CC_INLINE int cccbc_set_iv(const struct ccmode_cbc *mode, cccbc_iv *iv_ctx, const void *iv) { - if (iv) + if (iv) { cc_copy(mode->block_size, iv_ctx, iv); - else - cc_zero(mode->block_size, iv_ctx); + } else { + cc_clear(mode->block_size, iv_ctx); + } return 0; } -CC_INLINE int cccbc_update(const struct ccmode_cbc *mode, cccbc_ctx *ctx, - cccbc_iv *iv, size_t nblocks, - const void *in, void *out) +CC_INLINE int cccbc_update(const struct ccmode_cbc *mode, cccbc_ctx *ctx, cccbc_iv *iv, size_t nblocks, const void *in, void *out) { - return mode->cbc(ctx, iv, nblocks, in, out); + return mode->cbc(ctx, iv, nblocks, in, out); } int cccbc_one_shot(const struct ccmode_cbc *mode, - size_t key_len, const void *key, - const void *iv, size_t nblocks, - const void *in, void *out); + size_t key_len, + const void *key, + const void *iv, + size_t nblocks, + const void *in, + void *out); /* CFB mode. */ @@ -134,31 +134,34 @@ CC_INLINE size_t cccfb_context_size(const struct ccmode_cfb *mode) CC_INLINE size_t cccfb_block_size(const struct ccmode_cfb *mode) { - return mode->block_size; + return mode->block_size; } -CC_INLINE int cccfb_init(const struct ccmode_cfb *mode, cccfb_ctx *ctx, - size_t key_len, const void *key, - const void *iv) +CC_INLINE int cccfb_init(const struct ccmode_cfb *mode, cccfb_ctx *ctx, size_t key_len, const void *key, const void *iv) { return mode->init(mode, ctx, key_len, key, iv); } -CC_INLINE int cccfb_update(const struct ccmode_cfb *mode, cccfb_ctx *ctx, - size_t nbytes, const void *in, void *out) +CC_INLINE int cccfb_update(const struct ccmode_cfb *mode, cccfb_ctx *ctx, size_t nbytes, const void *in, void *out) { - return mode->cfb(ctx, nbytes, in, out); + return mode->cfb(ctx, nbytes, in, out); } CC_INLINE int cccfb_one_shot(const struct ccmode_cfb *mode, - size_t key_len, const void *key, const void *iv, - size_t nbytes, const void *in, void *out) + size_t key_len, + const void *key, + const void *iv, + size_t nbytes, + const void *in, + void *out) { int rc; - cccfb_ctx_decl(mode->size, ctx); - rc = mode->init(mode, ctx, key_len, key, iv); - mode->cfb(ctx, nbytes, in, out); - cccfb_ctx_clear(mode->size, ctx); + cccfb_ctx_decl(mode->size, ctx); + rc = mode->init(mode, ctx, key_len, key, iv); + if (rc == 0) { + rc = mode->cfb(ctx, nbytes, in, out); + } + cccfb_ctx_clear(mode->size, ctx); return rc; } @@ -176,30 +179,34 @@ CC_INLINE size_t cccfb8_context_size(const struct ccmode_cfb8 *mode) CC_INLINE size_t cccfb8_block_size(const struct ccmode_cfb8 *mode) { - return mode->block_size; + return mode->block_size; } -CC_INLINE int cccfb8_init(const struct ccmode_cfb8 *mode, cccfb8_ctx *ctx, - size_t key_len, const void *key, const void *iv) +CC_INLINE int cccfb8_init(const struct ccmode_cfb8 *mode, cccfb8_ctx *ctx, size_t key_len, const void *key, const void *iv) { return mode->init(mode, ctx, key_len, key, iv); } -CC_INLINE int cccfb8_update(const struct ccmode_cfb8 *mode, cccfb8_ctx *ctx, - size_t nbytes, const void *in, void *out) +CC_INLINE int cccfb8_update(const struct ccmode_cfb8 *mode, cccfb8_ctx *ctx, size_t nbytes, const void *in, void *out) { - return mode->cfb8(ctx, nbytes, in, out); + return mode->cfb8(ctx, nbytes, in, out); } CC_INLINE int cccfb8_one_shot(const struct ccmode_cfb8 *mode, - size_t key_len, const void *key, const void *iv, - size_t nbytes, const void *in, void *out) + size_t key_len, + const void *key, + const void *iv, + size_t nbytes, + const void *in, + void *out) { int rc; - cccfb8_ctx_decl(mode->size, ctx); - rc = mode->init(mode, ctx, key_len, key, iv); - mode->cfb8(ctx, nbytes, in, out); - cccfb8_ctx_clear(mode->size, ctx); + cccfb8_ctx_decl(mode->size, ctx); + rc = mode->init(mode, ctx, key_len, key, iv); + if (rc == 0) { + rc = mode->cfb8(ctx, nbytes, in, out); + } + cccfb8_ctx_clear(mode->size, ctx); return rc; } @@ -221,35 +228,37 @@ CC_INLINE size_t ccctr_context_size(const struct ccmode_ctr *mode) CC_INLINE size_t ccctr_block_size(const struct ccmode_ctr *mode) { - return mode->block_size; + return mode->block_size; } -CC_INLINE int ccctr_init(const struct ccmode_ctr *mode, ccctr_ctx *ctx, - size_t key_len, const void *key, const void *iv) +CC_INLINE int ccctr_init(const struct ccmode_ctr *mode, ccctr_ctx *ctx, size_t key_len, const void *key, const void *iv) { return mode->init(mode, ctx, key_len, key, iv); } -CC_INLINE int ccctr_update(const struct ccmode_ctr *mode, ccctr_ctx *ctx, - size_t nbytes, const void *in, void *out) +CC_INLINE int ccctr_update(const struct ccmode_ctr *mode, ccctr_ctx *ctx, size_t nbytes, const void *in, void *out) { - return mode->ctr(ctx, nbytes, in, out); + return mode->ctr(ctx, nbytes, in, out); } CC_INLINE int ccctr_one_shot(const struct ccmode_ctr *mode, - size_t key_len, const void *key, const void *iv, - size_t nbytes, const void *in, void *out) + size_t key_len, + const void *key, + const void *iv, + size_t nbytes, + const void *in, + void *out) { int rc; - ccctr_ctx_decl(mode->size, ctx); - rc = mode->init(mode, ctx, key_len, key, iv); - if (rc) return rc; - rc = mode->ctr(ctx, nbytes, in, out); - ccctr_ctx_clear(mode->size, ctx); + ccctr_ctx_decl(mode->size, ctx); + rc = mode->init(mode, ctx, key_len, key, iv); + if (rc == 0) { + rc = mode->ctr(ctx, nbytes, in, out); + } + ccctr_ctx_clear(mode->size, ctx); return rc; } - /* OFB mode. */ /* Declare a ofb key named _name_. Pass the size field of a struct ccmode_ofb @@ -264,30 +273,34 @@ CC_INLINE size_t ccofb_context_size(const struct ccmode_ofb *mode) CC_INLINE size_t ccofb_block_size(const struct ccmode_ofb *mode) { - return mode->block_size; + return mode->block_size; } -CC_INLINE int ccofb_init(const struct ccmode_ofb *mode, ccofb_ctx *ctx, - size_t key_len, const void *key, const void *iv) +CC_INLINE int ccofb_init(const struct ccmode_ofb *mode, ccofb_ctx *ctx, size_t key_len, const void *key, const void *iv) { return mode->init(mode, ctx, key_len, key, iv); } -CC_INLINE int ccofb_update(const struct ccmode_ofb *mode, ccofb_ctx *ctx, - size_t nbytes, const void *in, void *out) +CC_INLINE int ccofb_update(const struct ccmode_ofb *mode, ccofb_ctx *ctx, size_t nbytes, const void *in, void *out) { - return mode->ofb(ctx, nbytes, in, out); + return mode->ofb(ctx, nbytes, in, out); } CC_INLINE int ccofb_one_shot(const struct ccmode_ofb *mode, - size_t key_len, const void *key, const void *iv, - size_t nbytes, const void *in, void *out) + size_t key_len, + const void *key, + const void *iv, + size_t nbytes, + const void *in, + void *out) { int rc; - ccofb_ctx_decl(mode->size, ctx); + ccofb_ctx_decl(mode->size, ctx); rc = mode->init(mode, ctx, key_len, key, iv); - mode->ofb(ctx, nbytes, in, out); - ccofb_ctx_clear(mode->size, ctx); + if (rc == 0) { + rc = mode->ofb(ctx, nbytes, in, out); + } + ccofb_ctx_clear(mode->size, ctx); return rc; } @@ -323,26 +336,25 @@ CC_INLINE size_t ccxts_context_size(const struct ccmode_xts *mode) CC_INLINE size_t ccxts_block_size(const struct ccmode_xts *mode) { - return mode->block_size; + return mode->block_size; } /*! @function ccxts_init @abstract Initialize an XTS context. - + @param mode Descriptor for the mode @param ctx Context for this instance @param key_nbytes Length of the key arguments in bytes @param data_key Key for data encryption @param tweak_key Key for tweak generation - + @result 0 iff successful. - + @discussion For security reasons, the two keys must be different. */ -CC_INLINE int ccxts_init(const struct ccmode_xts *mode, ccxts_ctx *ctx, - size_t key_nbytes, const void *data_key, - const void *tweak_key) +CC_INLINE int +ccxts_init(const struct ccmode_xts *mode, ccxts_ctx *ctx, size_t key_nbytes, const void *data_key, const void *tweak_key) { return mode->init(mode, ctx, key_nbytes, data_key, tweak_key); } @@ -350,43 +362,42 @@ CC_INLINE int ccxts_init(const struct ccmode_xts *mode, ccxts_ctx *ctx, /*! @function ccxts_set_tweak @abstract Initialize the tweak for a sector. - + @param mode Descriptor for the mode @param ctx Context for this instance @param tweak Context for the tweak for this sector @param iv Data used to generate the tweak - + @discussion The IV must be exactly one block in length. */ -CC_INLINE int ccxts_set_tweak(const struct ccmode_xts *mode, ccxts_ctx *ctx, - ccxts_tweak *tweak, const void *iv) +CC_INLINE int ccxts_set_tweak(const struct ccmode_xts *mode, ccxts_ctx *ctx, ccxts_tweak *tweak, const void *iv) { - return mode->set_tweak(ctx, tweak, iv); + return mode->set_tweak(ctx, tweak, iv); } /*! @function ccxts_update @abstract Encrypt or decrypt data. - + @param mode Descriptor for the mode @param ctx Context for an instance @param tweak Context for the tweak for this sector @param nblocks Length of the data in blocks @param in Input data @param out Output buffer - + @result The updated internal buffer of the tweak context. May be ignored. */ -CC_INLINE void *ccxts_update(const struct ccmode_xts *mode, ccxts_ctx *ctx, - ccxts_tweak *tweak, size_t nblocks, const void *in, void *out) +CC_INLINE void * +ccxts_update(const struct ccmode_xts *mode, ccxts_ctx *ctx, ccxts_tweak *tweak, size_t nblocks, const void *in, void *out) { - return mode->xts(ctx, tweak, nblocks, in, out); + return mode->xts(ctx, tweak, nblocks, in, out); } /*! @function ccxts_one_shot @abstract Encrypt or decrypt data in XTS mode. - + @param mode Descriptor for the mode @param key_nbytes Length of the key arguments in bytes @param data_key Key for data encryption @@ -395,15 +406,19 @@ CC_INLINE void *ccxts_update(const struct ccmode_xts *mode, ccxts_ctx *ctx, @param nblocks Length of the data in blocks @param in Input data @param out Output buffer - + @result 0 iff successful. - + @discussion For security reasons, the two keys must be different. */ int ccxts_one_shot(const struct ccmode_xts *mode, - size_t key_nbytes, const void *data_key, - const void *tweak_key, const void *iv, - size_t nblocks, const void *in, void *out); + size_t key_nbytes, + const void *data_key, + const void *tweak_key, + const void *iv, + size_t nblocks, + const void *in, + void *out); /* Authenticated cipher modes. */ @@ -430,44 +445,44 @@ CC_INLINE size_t ccgcm_context_size(const struct ccmode_gcm *mode) CC_INLINE size_t ccgcm_block_size(const struct ccmode_gcm *mode) { - return mode->block_size; + return mode->block_size; } /*! @function ccgcm_init @abstract Initialize a GCM context. - + @param mode Descriptor for the mode @param ctx Context for this instance @param key_nbytes Length of the key in bytes @param key Key for the underlying blockcipher (AES) - + @result 0 iff successful. - + @discussion The correct sequence of calls is: - + @code ccgcm_init(...) ccgcm_set_iv(...) ccgcm_aad(...) (may be called zero or more times) ccgcm_update(...) (may be called zero or more times) ccgcm_finalize(...) - + To reuse the context for additional encryptions, follow this sequence: - + @code ccgcm_reset(...) ccgcm_set_iv(...) ccgcm_aad(...) (may be called zero or more times) ccgcm_update(...) (may be called zero or more times) ccgcm_finalize(...) - + @warning The key-IV pair must be unique per encryption. The IV must be nonzero in length. - - @warning It is not permitted to call @p ccgcm_inc_iv after initializing the cipher via the @p ccgcm_init interface. Nonzero is returned in the event of an improper call sequence. + + @warning It is not permitted to call @p ccgcm_inc_iv after initializing the cipher via the @p ccgcm_init interface. Nonzero is + returned in the event of an improper call sequence. @warning This function is not FIPS-compliant. Use @p ccgcm_init_with_iv instead. */ -CC_INLINE int ccgcm_init(const struct ccmode_gcm *mode, ccgcm_ctx *ctx, - size_t key_nbytes, const void *key) +CC_INLINE int ccgcm_init(const struct ccmode_gcm *mode, ccgcm_ctx *ctx, size_t key_nbytes, const void *key) { return mode->init(mode, ctx, key_nbytes, key); } @@ -475,200 +490,204 @@ CC_INLINE int ccgcm_init(const struct ccmode_gcm *mode, ccgcm_ctx *ctx, /*! @function ccgcm_init_with_iv @abstract Initialize a GCM context to manage IVs internally. - + @param mode Descriptor for the mode @param ctx Context for this instance @param key_nbytes Length of the key in bytes @param key Key for the underlying blockcipher (AES) @param iv IV for the first encryption - + @result 0 iff successful. - + @discussion The correct sequence of calls is: - + @code ccgcm_init_with_iv(...) ccgcm_aad(...) (may be called zero or more times) ccgcm_update(...) (may be called zero or more times) ccgcm_finalize(...) - + To reuse the context for additional encryptions, follow this sequence: - + @code ccgcm_reset(...) ccgcm_inc_iv(...) ccgcm_aad(...) (may be called zero or more times) ccgcm_update(...) (may be called zero or more times) ccgcm_finalize(...) - + The IV must be exactly 12 bytes in length. - - Internally, the IV is treated as a four-byte salt followed by an eight-byte counter. This is to match the behavior of certain protocols (e.g. TLS). In the call to @p ccgcm_inc_iv, the counter component will be interpreted as a big-endian, unsigned value and incremented in place. - - @warning It is not permitted to call @p ccgcm_set_iv after initializing the cipher via the @p ccgcm_init_with_iv interface. Nonzero is returned in the event of an improper call sequence. - - @warning The security of GCM depends on the uniqueness of key-IV pairs. To avoid key-IV repetition, callers should not initialize multiple contexts with the same key material via the @p ccgcm_init_with_iv interface. + + Internally, the IV is treated as a four-byte salt followed by an eight-byte counter. This is to match the behavior of certain + protocols (e.g. TLS). In the call to @p ccgcm_inc_iv, the counter component will be interpreted as a big-endian, unsigned value + and incremented in place. + + @warning It is not permitted to call @p ccgcm_set_iv after initializing the cipher via the @p ccgcm_init_with_iv interface. + Nonzero is returned in the event of an improper call sequence. + + @warning The security of GCM depends on the uniqueness of key-IV pairs. To avoid key-IV repetition, callers should not initialize + multiple contexts with the same key material via the @p ccgcm_init_with_iv interface. */ -int ccgcm_init_with_iv(const struct ccmode_gcm *mode, ccgcm_ctx *ctx, - size_t key_nbytes, const void *key, - const void *iv); +int ccgcm_init_with_iv(const struct ccmode_gcm *mode, ccgcm_ctx *ctx, size_t key_nbytes, const void *key, const void *iv); /*! @function ccgcm_set_iv @abstract Set the IV for encryption. - + @param mode Descriptor for the mode @param ctx Context for this instance @param iv_nbytes Length of the IV in bytes @param iv Initialization vector - + @result 0 iff successful. - + @discussion Set the initialization vector for encryption. - + @warning The key-IV pair must be unique per encryption. The IV must be nonzero in length. - - In stateful protocols, if each packet exposes a guaranteed-unique value, it is recommended to format this as a 12-byte value for use as the IV. - - In stateless protocols, it is recommended to choose a 16-byte value using a cryptographically-secure pseudorandom number generator (e.g. @p ccrng). - - @warning This function may not be used after initializing the cipher via @p ccgcm_init_with_iv. Nonzero is returned in the event of an improper call sequence. - + + In stateful protocols, if each packet exposes a guaranteed-unique value, it is recommended to format this as a 12-byte value for + use as the IV. + + In stateless protocols, it is recommended to choose a 16-byte value using a cryptographically-secure pseudorandom number + generator (e.g. @p ccrng). + + @warning This function may not be used after initializing the cipher via @p ccgcm_init_with_iv. Nonzero is returned in the event + of an improper call sequence. + @warning This function is not FIPS-compliant. Use @p ccgcm_init_with_iv instead. */ -CC_INLINE int ccgcm_set_iv(const struct ccmode_gcm *mode, ccgcm_ctx *ctx, - size_t iv_nbytes, const void *iv) +CC_INLINE int ccgcm_set_iv(const struct ccmode_gcm *mode, ccgcm_ctx *ctx, size_t iv_nbytes, const void *iv) { - return mode->set_iv(ctx, iv_nbytes, iv); + return mode->set_iv(ctx, iv_nbytes, iv); } /*! @function ccgcm_set_iv_legacy @abstract Set the IV for encryption. - + @param mode Descriptor for the mode @param ctx Context for this instance @param iv_nbytes Length of the IV in bytes @param iv Initialization vector - + @result 0 iff successful. - + @discussion Identical to @p ccgcm_set_iv except that it allows zero-length IVs. - + @warning Zero-length IVs nullify the authenticity guarantees of GCM. - + @warning Do not use this function in new applications. */ -int ccgcm_set_iv_legacy(const struct ccmode_gcm *mode, ccgcm_ctx *ctx, - size_t iv_nbytes, const void *iv); +int ccgcm_set_iv_legacy(const struct ccmode_gcm *mode, ccgcm_ctx *ctx, size_t iv_nbytes, const void *iv); /*! @function ccgcm_inc_iv @abstract Increment the IV for another encryption. - + @param mode Descriptor for the mode @param ctx Context for this instance @param iv Updated initialization vector - + @result 0 iff successful. - + @discussion Updates the IV internally for another encryption. - - Internally, the IV is treated as a four-byte salt followed by an eight-byte counter. This is to match the behavior of certain protocols (e.g. TLS). The counter component is interpreted as a big-endian, unsigned value and incremented in place. - - The updated IV is copied to @p iv. This is to support protocols that require part of the IV to be specified explicitly in each packet (e.g. TLS). - + + Internally, the IV is treated as a four-byte salt followed by an eight-byte counter. This is to match the behavior of certain + protocols (e.g. TLS). The counter component is interpreted as a big-endian, unsigned value and incremented in place. + + The updated IV is copied to @p iv. This is to support protocols that require part of the IV to be specified explicitly in each + packet (e.g. TLS). + @warning This function may be used only after initializing the cipher via @p ccgcm_init_with_iv. */ int ccgcm_inc_iv(const struct ccmode_gcm *mode, ccgcm_ctx *ctx, void *iv); - /*! @function ccgcm_aad @abstract Authenticate additional data. - + @param mode Descriptor for the mode @param ctx Context for this instance @param nbytes Length of the additional data in bytes @param additional_data Additional data to authenticate - + @result 0 iff successful. - + @discussion This is typically used to authenticate data that cannot be encrypted (e.g. packet headers). - + This function may be called zero or more times. */ -CC_INLINE int ccgcm_aad(const struct ccmode_gcm *mode, ccgcm_ctx *ctx, - size_t nbytes, const void *additional_data) +CC_INLINE int ccgcm_aad(const struct ccmode_gcm *mode, ccgcm_ctx *ctx, size_t nbytes, const void *additional_data) { return mode->gmac(ctx, nbytes, additional_data); } /*! @function ccgcm_gmac - - @discussion See @p ccgcm_aad. + + @discussion ccgcm_gmac is deprecated. Use the drop-in replacement 'ccgcm_aad' instead. */ -CC_INLINE int ccgcm_gmac(const struct ccmode_gcm *mode, ccgcm_ctx *ctx, - size_t nbytes, const void *in) +CC_INLINE int ccgcm_gmac (const struct ccmode_gcm *mode, ccgcm_ctx *ctx, size_t nbytes, const void *in) +cc_deprecate_with_replacement("ccgcm_aad", 13.0, 10.15, 13.0, 6.0, 4.0) { - return mode->gmac(ctx, nbytes, in); + return mode->gmac(ctx, nbytes, in); } /*! @function ccgcm_update @abstract Encrypt or decrypt data. - + @param mode Descriptor for the mode @param ctx Context for this instance @param nbytes Length of the data in bytes @param in Input plaintext or ciphertext @param out Output ciphertext or plaintext - + @result 0 iff successful. - + @discussion In-place processing is supported. - + This function may be called zero or more times. */ -CC_INLINE int ccgcm_update(const struct ccmode_gcm *mode, ccgcm_ctx *ctx, - size_t nbytes, const void *in, void *out) +CC_INLINE int ccgcm_update(const struct ccmode_gcm *mode, ccgcm_ctx *ctx, size_t nbytes, const void *in, void *out) { - return mode->gcm(ctx, nbytes, in, out); + return mode->gcm(ctx, nbytes, in, out); } /*! @function ccgcm_finalize @abstract Finish processing and authenticate. - + @param mode Descriptor for the mode @param ctx Context for this instance @param tag_nbytes Length of the tag in bytes @param tag Authentication tag - + @result 0 iff successful. - + @discussion Finish processing a packet and generate the authentication tag. - + On encryption, @p tag is purely an output parameter. The generated tag is written to @p tag. - - On decryption, @p tag is both an input and an output parameter. Well-behaved callers should provide the authentication tag generated during encryption. The function will return nonzero if the input tag does not match the generated tag. The generated tag will be written into the @p tag buffer whether authentication succeeds or fails. - - @warning The generated tag is written to @p tag to support legacy applications that perform authentication manually. Do not follow this usage pattern in new applications. Rely on the function's error code to verify authenticity. + + On decryption, @p tag is both an input and an output parameter. Well-behaved callers should provide the authentication tag + generated during encryption. The function will return nonzero if the input tag does not match the generated tag. The generated + tag will be written into the @p tag buffer whether authentication succeeds or fails. + + @warning The generated tag is written to @p tag to support legacy applications that perform authentication manually. Do not + follow this usage pattern in new applications. Rely on the function's error code to verify authenticity. */ -CC_INLINE int ccgcm_finalize(const struct ccmode_gcm *mode, ccgcm_ctx *ctx, - size_t tag_nbytes, void *tag) +CC_INLINE int ccgcm_finalize(const struct ccmode_gcm *mode, ccgcm_ctx *ctx, size_t tag_nbytes, void *tag) { - return mode->finalize(ctx, tag_nbytes, tag); + return mode->finalize(ctx, tag_nbytes, tag); } /*! @function ccgcm_reset @abstract Reset the context for another encryption. - + @param mode Descriptor for the mode @param ctx Context for this instance - + @result 0 iff successful. - + @discussion Refer to @p ccgcm_init for correct usage. */ CC_INLINE int ccgcm_reset(const struct ccmode_gcm *mode, ccgcm_ctx *ctx) @@ -676,11 +695,10 @@ CC_INLINE int ccgcm_reset(const struct ccmode_gcm *mode, ccgcm_ctx *ctx) return mode->reset(ctx); } - /*! @function ccgcm_one_shot @abstract Encrypt or decrypt with GCM. - + @param mode Descriptor for the mode @param key_nbytes Length of the key in bytes @param key Key for the underlying blockcipher (AES) @@ -693,37 +711,47 @@ CC_INLINE int ccgcm_reset(const struct ccmode_gcm *mode, ccgcm_ctx *ctx) @param out Output ciphertext or plaintext @param tag_nbytes Length of the tag in bytes @param tag Authentication tag - + @result 0 iff successful. - + @discussion Perform GCM encryption or decryption. - + @warning The key-IV pair must be unique per encryption. The IV must be nonzero in length. - - In stateful protocols, if each packet exposes a guaranteed-unique value, it is recommended to format this as a 12-byte value for use as the IV. - - In stateless protocols, it is recommended to choose a 16-byte value using a cryptographically-secure pseudorandom number generator (e.g. @p ccrng). - + + In stateful protocols, if each packet exposes a guaranteed-unique value, it is recommended to format this as a 12-byte value for + use as the IV. + + In stateless protocols, it is recommended to choose a 16-byte value using a cryptographically-secure pseudorandom number + generator (e.g. @p ccrng). + In-place processing is supported. - + On encryption, @p tag is purely an output parameter. The generated tag is written to @p tag. - - On decryption, @p tag is primarily an input parameter. The caller should provide the authentication tag generated during encryption. The function will return nonzero if the input tag does not match the generated tag. - - @warning To support legacy applications, @p tag is also an output parameter during decryption. The generated tag is written to @p tag. Legacy callers may choose to compare this to the tag generated during encryption. Do not follow this usage pattern in new applications. + + On decryption, @p tag is primarily an input parameter. The caller should provide the authentication tag generated during + encryption. The function will return nonzero if the input tag does not match the generated tag. + + @warning To support legacy applications, @p tag is also an output parameter during decryption. The generated tag is written to @p + tag. Legacy callers may choose to compare this to the tag generated during encryption. Do not follow this usage pattern in new + applications. */ int ccgcm_one_shot(const struct ccmode_gcm *mode, - size_t key_nbytes, const void *key, - size_t iv_nbytes, const void *iv, - size_t adata_nbytes, const void *adata, - size_t nbytes, const void *in, void *out, - size_t tag_nbytes, void *tag); - + size_t key_nbytes, + const void *key, + size_t iv_nbytes, + const void *iv, + size_t adata_nbytes, + const void *adata, + size_t nbytes, + const void *in, + void *out, + size_t tag_nbytes, + void *tag); /*! @function ccgcm_one_shot_legacy @abstract Encrypt or decrypt with GCM. - + @param mode Descriptor for the mode @param key_nbytes Length of the key in bytes @param key Key for the underlying blockcipher (AES) @@ -736,22 +764,27 @@ int ccgcm_one_shot(const struct ccmode_gcm *mode, @param out Output ciphertext or plaintext @param tag_nbytes Length of the tag in bytes @param tag Authentication tag - + @result 0 iff successful. - + @discussion Identical to @p ccgcm_one_shot except that it allows zero-length IVs. - + @warning Zero-length IVs nullify the authenticity guarantees of GCM. - + @warning Do not use this function in new applications. */ int ccgcm_one_shot_legacy(const struct ccmode_gcm *mode, - size_t key_nbytes, const void *key, - size_t iv_nbytes, const void *iv, - size_t adata_nbytes, const void *adata, - size_t nbytes, const void *in, void *out, - size_t tag_nbytes, void *tag); - + size_t key_nbytes, + const void *key, + size_t iv_nbytes, + const void *iv, + size_t adata_nbytes, + const void *adata, + size_t nbytes, + const void *in, + void *out, + size_t tag_nbytes, + void *tag); /* CCM */ @@ -762,7 +795,6 @@ int ccgcm_one_shot_legacy(const struct ccmode_gcm *mode, #define ccccm_nonce_decl(_size_, _name_) cc_ctx_decl(ccccm_nonce, _size_, _name_) #define ccccm_nonce_clear(_size_, _name_) cc_clear(_size_, _name_) - CC_INLINE size_t ccccm_context_size(const struct ccmode_ccm *mode) { return mode->size; @@ -770,38 +802,40 @@ CC_INLINE size_t ccccm_context_size(const struct ccmode_ccm *mode) CC_INLINE size_t ccccm_block_size(const struct ccmode_ccm *mode) { - return mode->block_size; + return mode->block_size; } -CC_INLINE int ccccm_init(const struct ccmode_ccm *mode, ccccm_ctx *ctx, - size_t key_len, const void *key) +CC_INLINE int ccccm_init(const struct ccmode_ccm *mode, ccccm_ctx *ctx, size_t key_len, const void *key) { return mode->init(mode, ctx, key_len, key); } -CC_INLINE int ccccm_set_iv(const struct ccmode_ccm *mode, ccccm_ctx *ctx, ccccm_nonce *nonce_ctx, - size_t nonce_len, const void *nonce, - size_t mac_size, size_t auth_len, size_t data_len) +CC_INLINE int ccccm_set_iv(const struct ccmode_ccm *mode, + ccccm_ctx *ctx, + ccccm_nonce *nonce_ctx, + size_t nonce_len, + const void *nonce, + size_t mac_size, + size_t auth_len, + size_t data_len) { - return mode->set_iv(ctx, nonce_ctx, nonce_len, nonce, mac_size, auth_len, data_len); + return mode->set_iv(ctx, nonce_ctx, nonce_len, nonce, mac_size, auth_len, data_len); } -CC_INLINE int ccccm_cbcmac(const struct ccmode_ccm *mode, ccccm_ctx *ctx, ccccm_nonce *nonce_ctx, - size_t nbytes, const void *in) +CC_INLINE int ccccm_cbcmac(const struct ccmode_ccm *mode, ccccm_ctx *ctx, ccccm_nonce *nonce_ctx, size_t nbytes, const void *in) { - return mode->cbcmac(ctx, nonce_ctx, nbytes, in); + return mode->cbcmac(ctx, nonce_ctx, nbytes, in); } -CC_INLINE int ccccm_update(const struct ccmode_ccm *mode, ccccm_ctx *ctx, ccccm_nonce *nonce_ctx, - size_t nbytes, const void *in, void *out) +CC_INLINE int +ccccm_update(const struct ccmode_ccm *mode, ccccm_ctx *ctx, ccccm_nonce *nonce_ctx, size_t nbytes, const void *in, void *out) { - return mode->ccm(ctx, nonce_ctx, nbytes, in, out); + return mode->ccm(ctx, nonce_ctx, nbytes, in, out); } -CC_INLINE int ccccm_finalize(const struct ccmode_ccm *mode, ccccm_ctx *ctx, ccccm_nonce *nonce_ctx, - void *mac) +CC_INLINE int ccccm_finalize(const struct ccmode_ccm *mode, ccccm_ctx *ctx, ccccm_nonce *nonce_ctx, void *mac) { - return mode->finalize(ctx, nonce_ctx, mac); + return mode->finalize(ctx, nonce_ctx, mac); } CC_INLINE int ccccm_reset(const struct ccmode_ccm *mode, ccccm_ctx *ctx, ccccm_nonce *nonce_ctx) @@ -809,32 +843,43 @@ CC_INLINE int ccccm_reset(const struct ccmode_ccm *mode, ccccm_ctx *ctx, ccccm_n return mode->reset(ctx, nonce_ctx); } - CC_INLINE int ccccm_one_shot(const struct ccmode_ccm *mode, - size_t key_len, const void *key, - size_t nonce_len, const void *nonce, - size_t nbytes, const void *in, void *out, - size_t adata_len, const void* adata, - size_t mac_size, void *mac) -{ - int rc=0; - ccccm_ctx_decl(mode->size, ctx); - ccccm_nonce_decl(mode->nonce_size, nonce_ctx); - rc = mode->init(mode, ctx, key_len, key); - if(rc==0) rc=mode->set_iv(ctx, nonce_ctx, nonce_len, nonce, mac_size, adata_len, nbytes); - if(rc==0) rc=mode->cbcmac(ctx, nonce_ctx, adata_len, adata); - if(rc==0) rc=mode->ccm(ctx, nonce_ctx, nbytes, in, out); - if(rc==0) rc=mode->finalize(ctx, nonce_ctx, mac); - ccccm_ctx_clear(mode->size, ctx); + size_t key_len, + const void *key, + size_t nonce_len, + const void *nonce, + size_t nbytes, + const void *in, + void *out, + size_t adata_len, + const void *adata, + size_t mac_size, + void *mac) +{ + int rc; + ccccm_ctx_decl(mode->size, ctx); + ccccm_nonce_decl(mode->nonce_size, nonce_ctx); + rc = mode->init(mode, ctx, key_len, key); + if (rc == 0) { + rc = mode->set_iv(ctx, nonce_ctx, nonce_len, nonce, mac_size, adata_len, nbytes); + } + if (rc == 0) { + rc = mode->cbcmac(ctx, nonce_ctx, adata_len, adata); + } + if (rc == 0) { + rc = mode->ccm(ctx, nonce_ctx, nbytes, in, out); + } + if (rc == 0) { + rc = mode->finalize(ctx, nonce_ctx, mac); + } + ccccm_ctx_clear(mode->size, ctx); ccccm_nonce_clear(mode->nonce_size, nonce_ctx); return rc; } - /* OMAC mode. */ - /* Declare a omac key named _name_. Pass the size field of a struct ccmode_omac for _size_. */ #define ccomac_ctx_decl(_size_, _name_) cc_ctx_decl(ccomac_ctx, _size_, _name_) @@ -847,32 +892,37 @@ CC_INLINE size_t ccomac_context_size(const struct ccmode_omac *mode) CC_INLINE size_t ccomac_block_size(const struct ccmode_omac *mode) { - return mode->block_size; + return mode->block_size; } -CC_INLINE int ccomac_init(const struct ccmode_omac *mode, ccomac_ctx *ctx, - size_t tweak_len, size_t key_len, const void *key) +CC_INLINE int ccomac_init(const struct ccmode_omac *mode, ccomac_ctx *ctx, size_t tweak_len, size_t key_len, const void *key) { return mode->init(mode, ctx, tweak_len, key_len, key); } -CC_INLINE int ccomac_update(const struct ccmode_omac *mode, ccomac_ctx *ctx, - size_t nblocks, const void *tweak, const void *in, void *out) +CC_INLINE int +ccomac_update(const struct ccmode_omac *mode, ccomac_ctx *ctx, size_t nblocks, const void *tweak, const void *in, void *out) { - return mode->omac(ctx, nblocks, tweak, in, out); + return mode->omac(ctx, nblocks, tweak, in, out); } CC_INLINE int ccomac_one_shot(const struct ccmode_omac *mode, - size_t tweak_len, size_t key_len, const void *key, - const void *tweak, size_t nblocks, const void *in, void *out) + size_t tweak_len, + size_t key_len, + const void *key, + const void *tweak, + size_t nblocks, + const void *in, + void *out) { int rc; - ccomac_ctx_decl(mode->size, ctx); - rc = mode->init(mode, ctx, tweak_len, key_len, key); - if (rc == 0) rc = mode->omac(ctx, nblocks, tweak, in, out); - ccomac_ctx_clear(mode->size, ctx); + ccomac_ctx_decl(mode->size, ctx); + rc = mode->init(mode, ctx, tweak_len, key_len, key); + if (rc == 0) { + rc = mode->omac(ctx, nblocks, tweak, in, out); + } + ccomac_ctx_clear(mode->size, ctx); return rc; } - #endif /* _CORECRYPTO_CCMODE_H_ */ diff --git a/EXTERNAL_HEADERS/corecrypto/ccmode_factory.h b/EXTERNAL_HEADERS/corecrypto/ccmode_factory.h index a9498d1f7..aa8cb0527 100644 --- a/EXTERNAL_HEADERS/corecrypto/ccmode_factory.h +++ b/EXTERNAL_HEADERS/corecrypto/ccmode_factory.h @@ -14,75 +14,10 @@ #include /* TODO: Remove dependency on this header. */ #include -/* Function and macros defined in this file are only to be used +/* Functions defined in this file are only to be used within corecrypto files. */ -/* For CBC, direction of underlying ecb is the same as the cbc direction */ -#define CCMODE_CBC_FACTORY(_cipher_, _dir_) \ -static struct ccmode_cbc cbc_##_cipher_##_##_dir_; \ - \ -const struct ccmode_cbc *cc##_cipher_##_cbc_##_dir_##_mode(void) \ -{ \ - const struct ccmode_ecb *ecb=cc##_cipher_##_ecb_##_dir_##_mode(); \ - ccmode_factory_cbc_##_dir_(&cbc_##_cipher_##_##_dir_, ecb); \ - return &cbc_##_cipher_##_##_dir_; \ -} - -/* For CTR, only one direction, underlying ecb is always encrypt */ -#define CCMODE_CTR_FACTORY(_cipher_) \ -static struct ccmode_ctr ctr_##_cipher_; \ - \ -const struct ccmode_ctr *cc##_cipher_##_ctr_crypt_mode(void) \ -{ \ - const struct ccmode_ecb *ecb=cc##_cipher_##_ecb_encrypt_mode(); \ - ccmode_factory_ctr_crypt(&ctr_##_cipher_, ecb); \ - return &ctr_##_cipher_; \ -} - -/* OFB, same as CTR */ -#define CCMODE_OFB_FACTORY(_cipher_) \ -static struct ccmode_ofb ofb_##_cipher_; \ - \ -const struct ccmode_ofb *cc##_cipher_##_ofb_crypt_mode(void) \ -{ \ - const struct ccmode_ecb *ecb=cc##_cipher_##_ecb_encrypt_mode(); \ - ccmode_factory_ofb_crypt(&ofb_##_cipher_, ecb); \ - return &ofb_##_cipher_; \ -} - - -/* For CFB, the underlying ecb operation is encrypt for both directions */ -#define CCMODE_CFB_FACTORY(_cipher_, _mode_, _dir_) \ -static struct ccmode_##_mode_ _mode_##_##_cipher_##_##_dir_; \ - \ -const struct ccmode_##_mode_ *cc##_cipher_##_##_mode_##_##_dir_##_mode(void) \ -{ \ - const struct ccmode_ecb *ecb=cc##_cipher_##_ecb_encrypt_mode(); \ - ccmode_factory_##_mode_##_##_dir_(&_mode_##_##_cipher_##_##_dir_, ecb); \ - return &_mode_##_##_cipher_##_##_dir_; \ -} - -/* For GCM, same as CFB */ -#define CCMODE_GCM_FACTORY(_cipher_, _dir_) CCMODE_CFB_FACTORY(_cipher_, gcm, _dir_) - -/* For CCM, same as CFB */ -#define CCMODE_CCM_FACTORY(_cipher_, _dir_) CCMODE_CFB_FACTORY(_cipher_, ccm, _dir_) - - -/* Fot XTS, you always need an ecb encrypt */ -#define CCMODE_XTS_FACTORY(_cipher_ , _dir_) \ -static struct ccmode_xts xts##_cipher_##_##_dir_; \ - \ -const struct ccmode_xts *cc##_cipher_##_xts_##_dir_##_mode(void) \ -{ \ - const struct ccmode_ecb *ecb=cc##_cipher_##_ecb_##_dir_##_mode(); \ - const struct ccmode_ecb *ecb_enc=cc##_cipher_##_ecb_encrypt_mode(); \ - \ - ccmode_factory_xts_##_dir_(&xts##_cipher_##_##_dir_, ecb, ecb_enc); \ - return &xts##_cipher_##_##_dir_; \ -} - /* Use these function to runtime initialize a ccmode_cbc decrypt object (for example if it's part of a larger structure). Normally you would pass a ecb decrypt mode implementation of some underlying algorithm as the ecb diff --git a/EXTERNAL_HEADERS/corecrypto/ccmode_siv.h b/EXTERNAL_HEADERS/corecrypto/ccmode_siv.h index 99322ad2d..1b05c638e 100644 --- a/EXTERNAL_HEADERS/corecrypto/ccmode_siv.h +++ b/EXTERNAL_HEADERS/corecrypto/ccmode_siv.h @@ -56,7 +56,7 @@ CC_INLINE size_t ccsiv_block_size(const struct ccmode_siv *mode) CC_INLINE size_t ccsiv_ciphertext_size(const struct ccmode_siv *mode, size_t plaintext_size) { - return plaintext_size+mode->cbc->block_size; + return plaintext_size + mode->cbc->block_size; } CC_INLINE size_t ccsiv_plaintext_size(const struct ccmode_siv *mode, @@ -65,7 +65,7 @@ CC_INLINE size_t ccsiv_plaintext_size(const struct ccmode_siv *mode, if (ciphertext_sizecbc->block_size) { return 0; // error } - return ciphertext_size-mode->cbc->block_size; + return ciphertext_size - mode->cbc->block_size; } // Supported key sizes are 32, 48, 64 bytes @@ -99,7 +99,6 @@ CC_INLINE int ccsiv_crypt(const struct ccmode_siv *mode, ccsiv_ctx *ctx, } // Clear all context for reuse. -// Key is clear to avoid leaking it CC_INLINE int ccsiv_reset(const struct ccmode_siv *mode, ccsiv_ctx *ctx) { return mode->reset(ctx); diff --git a/EXTERNAL_HEADERS/corecrypto/ccmode_siv_hmac.h b/EXTERNAL_HEADERS/corecrypto/ccmode_siv_hmac.h new file mode 100644 index 000000000..2cbc9a131 --- /dev/null +++ b/EXTERNAL_HEADERS/corecrypto/ccmode_siv_hmac.h @@ -0,0 +1,205 @@ +// +// ccmode_siv_hmac.h +// corecrypto +// +// Created by Apple on 12/10/18. +// + +#ifndef ccmode_siv_hmac_h +#define ccmode_siv_hmac_h + +#include +#include +#include +#include +#include +#include + +/* This provides an implementation of SIV using AES CTR mode with HMAC as the MAC, + allowing for a tagging mechanism with collision resistant tags. This is a modification of the + standard specified in https://tools.ietf.org/html/rfc5297 + also in http://csrc.nist.gov/groups/ST/toolkit/BCM/documents/proposedmodes/siv/siv.pdf + Counter Mode where IV is based on HMAC. + */ + +cc_aligned_struct(16) ccsiv_hmac_ctx; + +struct ccmode_siv_hmac { + size_t size; /* first argument to ccsiv_hmac_ctx_decl(). */ + size_t block_size; + + int (*init)(const struct ccmode_siv_hmac *sivhmac, + ccsiv_hmac_ctx *ctx, + size_t key_len, + const uint8_t *key, + const size_t tag_size); + int (*set_nonce)(ccsiv_hmac_ctx *ctx, size_t nbytes, const uint8_t *in); + int (*auth)(ccsiv_hmac_ctx *ctx, size_t nbytes, const uint8_t *in); + int (*crypt)(ccsiv_hmac_ctx *ctx, size_t nbytes, const uint8_t *in, uint8_t *out); + int (*reset)(ccsiv_hmac_ctx *ctx); + const struct ccdigest_info *hmac_digest; // Digest to be used in HMAC; + const struct ccmode_ctr *ctr; +}; + +#define ccsiv_hmac_ctx_decl(_size_, _name_) cc_ctx_decl(ccsiv_hmac_ctx, _size_, _name_) +#define ccsiv_hmac_ctx_clear(_size_, _name_) cc_clear(_size_, _name_) + +/*! + @function ccsiv_hmac_context_size + @abstract Return size of context + + @param mode Descriptor for the mode + */ +CC_INLINE size_t ccsiv_hmac_context_size(const struct ccmode_siv_hmac *mode) +{ + return mode->size; +} + +/*! + @function ccsiv_hmac_block_size + @abstract Return size of context + + @param mode Descriptor for the mode + */ +CC_INLINE size_t ccsiv_hmac_block_size(const struct ccmode_siv_hmac *mode) +{ + return mode->block_size; +} + +/*! + @function ccsiv_hmac_ciphertext_size + @abstract Return size of Ciphertext (which is the ciphertext and corresponding tag) given the mode and plaintext length + + @param ctx Current siv_hmac context that has been previously initialized + @param plaintext_size Size of the plaintext + + @discussion returns the length of the aead ciphertext that the context will generate which includes both the encrypted plaintext + and tag. + */ +size_t ccsiv_hmac_ciphertext_size(ccsiv_hmac_ctx *ctx, size_t plaintext_size); + +/*! + @function ccsiv_hmac_plaintext_size + @abstract Return size of plaintext given a ciphertext length and mode. + + @param ctx Current siv_hmac context that has been previously initialized + @param ciphertext_size Size of the ciphertext + + @discussion returns the length of the aead ciphertext which is both the encrypted plaintext and tag length together. + */ +size_t ccsiv_hmac_plaintext_size(ccsiv_hmac_ctx *ctx, size_t ciphertext_size); + +/*! + @function ccsiv_hmac_init + @abstract Initialize a context for siv_hmac with an associated mode, given key and specifying output tag size. + + @param mode Descriptor for the mode + @param ctx Alocated context to be intialized + @param key_byte_len Length of the key: Supported key sizes are 32, 48, 64 bytes + @param key key for siv_hmac + @param tag_size The length of the output tag requested. Must be at least 20 bytes, and can be as larged as the + associated digest's output + + @discussion In order to compute HMAC_SIV_Enc_k(a1,...,am, n, x) where ai is the ith piece of associated data, n is a nonce and x + is a plaintext, we first initialize the context with this call, and then use it to call ccsiv_hmac_aad for each ai, followed by + ccsiv_hmac_set_nonce for nonce n, and finally a call to ccsiv_hmac_crypt for the plaintext x. Note the order of the calls to aad, + nonce and then crypt is critical. If a second encryption is needed then a call to ccsiv_hmac_reset can be used to reset state, + and begin again. + */ +int ccsiv_hmac_init(const struct ccmode_siv_hmac *mode, ccsiv_hmac_ctx *ctx, size_t key_byte_len, const uint8_t *key, size_t tag_size); + +/*! + @function ccsiv_hmac_aad + @abstract Add the next piece of associated data to the hmac_siv's computation of the tag. Note this call is optional and no + associated data needs to be provided. Multiple pieces of associated data can be provided by multiple calls to this + function. Each input is regarded as a seperate piece of associated data, and the mac is NOT simply computed on the + concatenation of all of the associated data inputs. Therefore on decryption the same inputs must be prodivded and in + the same order. + + @param mode Descriptor for the mode + @param ctx Intialized ctx + @param nbytes Length of the current associated data being added + @param in Associated data to be authenticated. + + @discussion Adds the associated data given by in to the computation of the tag in the associated data. + */ +int ccsiv_hmac_aad(const struct ccmode_siv_hmac *mode, ccsiv_hmac_ctx *ctx, size_t nbytes, const uint8_t *in); + +/*! + @function ccsiv_hmac_nonce + @abstract Add the nonce to the hmac_siv's computation of the the tag. Changes the internal state of the context + so that after the call only a crypt or reset call is permitted. + @param mode Descriptor for the mode + @param ctx Intialized ctx + @param nbytes Length of the current nonce data being added + @param in Nonce data to be authenticated. + + @discussion The nonce is a special form of authenticated data. If provided ( a call to hmac_nonce is optional) it allows + randomization of the of ciphertext (preventing deterministic encryption). While the length of the nonce is not limimited, the + amount of entropy that can be provided is limited by the number of bits in the block of the associated block-cipher in mode. + */ +int ccsiv_hmac_set_nonce(const struct ccmode_siv_hmac *mode, ccsiv_hmac_ctx *ctx, size_t nbytes, const uint8_t *in); + +/*! + @function ccsiv_hmac_crypt + @abstract Depending on whether mode has been setup to encrypt or decrypt, this function + 1) Encrypts the plaintext given as input in, and provides the ciphertext (which is a concatenation of the tag + followed by the encrypted plaintext) as output out. 2) Decrypts plaintext using the input ciphertext at in (which again is the + tag, followed by encrypted plaintext), and then verifies that the computer tag and provided tags match. + @param mode Descriptor for the mode + @param ctx Intialized ctx + @param nbytes Case 1) Length of the current plaintext + Case 2) Length of the current ciphertext (tag length + plaintext length) + @param in Case 1) Plaintext + Case 2) Ciphertext + @discussion This function is only called once. If one wishes to compute another (en)/(de)cryption, one resets the state with + ccsiv_hmac_reset, and then begins the process again. There is no way to stream large plaintext/ciphertext inputs into the + function. + + In the case of a decryption, if there is a failure in verifying the computed tag against the provided tag (embedded int he ciphertext), then a decryption/verification + failure is returned, and any internally computed plaintexts and tags are zeroed out. + Lastly the contexts internal state is reset, so that a new decryption/encryption can be commenced. + */ +int ccsiv_hmac_crypt(const struct ccmode_siv_hmac *mode, ccsiv_hmac_ctx *ctx, size_t nbytes, const uint8_t *in, uint8_t *out); + +/*! + @function ccsiv_hmac_reset + @abstract Resets the state of the siv_hamc ctx, maintaing the key, but preparing the + ctx to preform a new Associated Data Authenticated (En)/(De)cryption. + @param mode Descriptor for the mode + @param ctx Intialized ctx + */ +int ccsiv_hmac_reset(const struct ccmode_siv_hmac *mode, ccsiv_hmac_ctx *ctx); + +/*! + @function ccsiv_hmac_one_shot + @abstract A simplified but more constrained way of performing an AEAD SIV HMAC (en)/(de)cryption. It is limited because only + one piece of associated data may be provided. + @param mode Descriptor for the mode + @param key_len Length of the key: Supported key sizes are 32, 48, 64 bytes + @param key key for siv_hmac + @param tag_length The length of the tag to produce or accept as input. Must be at least 20 + bytes, and can be as large as the hmac's digest's output + @param nonce_nbytes Length of the current nonce data being added + @param nonce Nonce data to be authenticated. + @param adata_nbytes Length of the associated data. + @param adata Associated data to be authenticated. + @param in_nbytes Length of either the plaintext (for encryption) or ciphertext (for decryption) + @param in plaintext or ciphertext. Note that the ciphertext includes a tag of length tag_length prepended to + it. + */ + +// One shot AEAD with only one input for adata, and a nonce. +int ccsiv_hmac_one_shot(const struct ccmode_siv_hmac *mode, + size_t key_len, + const uint8_t *key, + size_t tag_length, + unsigned nonce_nbytes, + const uint8_t *nonce, + unsigned adata_nbytes, + const uint8_t *adata, + size_t in_nbytes, + const uint8_t *in, + uint8_t *out); + +#endif /* ccmode_siv_hmac_h */ diff --git a/EXTERNAL_HEADERS/corecrypto/ccn.h b/EXTERNAL_HEADERS/corecrypto/ccn.h index 2d3e847c9..778f3e5cf 100644 --- a/EXTERNAL_HEADERS/corecrypto/ccn.h +++ b/EXTERNAL_HEADERS/corecrypto/ccn.h @@ -62,18 +62,10 @@ typedef uint16_t cc_dunit; // 16 bit double width unit #error invalid CCN_UNIT_SIZE #endif -// All mp types have units in little endian unit order. -typedef cc_unit *ccn_t; // n unit long mp -typedef cc_unit *ccnp1_t; // n + 1 unit long mp -typedef cc_unit *cc2n_t; // 2 * n unit long mp -typedef cc_unit *cc2np2_t; // 2 * n + 2 unit long mp -typedef const cc_unit *ccn_in_t; // n unit long mp -typedef const cc_unit *ccnp1_in_t; // n + 1 unit long mp -typedef const cc_unit *cc2n_in_t; // 2 * n unit long mp -typedef const cc_unit *cc2np2_in_t; // 2 * n + 2 unit long mp - #define CCN_UNIT_BITS (sizeof(cc_unit) * 8) #define CCN_UNIT_MASK ((cc_unit)~0) +#define CCN_UNIT_LOWER_HALF_MASK ((CCN_UNIT_MASK) >> (CCN_UNIT_BITS/2)) +#define CCN_UNIT_UPPER_HALF_MASK (~CCN_UNIT_LOWER_HALF_MASK) typedef struct { cc_unit *start; // First cc_unit of the workspace @@ -233,6 +225,7 @@ typedef struct { /* Macros to construct fixed size ccn arrays from 64 or 32 bit quantities. */ #define ccn192_64(a2,a1,a0) ccn64_64(a0),ccn64_64(a1),ccn64_64(a2) +#define ccn192_32(a5,a4,a3,a2,a1,a0) ccn64_32(a1,a0),ccn64_32(a3,a2),ccn64_32(a5,a4) #define ccn224_32(a6,a5,a4,a3,a2,a1,a0) ccn64_32(a1,a0),ccn64_32(a3,a2),ccn64_32(a5,a4),ccn32_32(a6) #define ccn256_32(a7,a6,a5,a4,a3,a2,a1,a0) ccn64_32(a1,a0),ccn64_32(a3,a2),ccn64_32(a5,a4),ccn64_32(a7,a6) #define ccn384_32(a11,a10,a9,a8,a7,a6,a5,a4,a3,a2,a1,a0) ccn64_32(a1,a0),ccn64_32(a3,a2),ccn64_32(a5,a4),ccn64_32(a7,a6),ccn64_32(a9,a8),ccn64_32(a11,a10) @@ -286,18 +279,23 @@ typedef struct { /* Return the number of used units after stripping leading 0 units. */ CC_PURE CC_NONNULL((2)) -cc_size ccn_n(cc_size n, const cc_unit *s); +cc_size ccn_n(cc_size n, const cc_unit *s) __asm__("_ccn_n"); -/* s >> k -> r return bits shifted out of least significant word in bits [0, n> +/* s >> k -> r return bits shifted out of least significant word in the higest order bits of + the retuned value. For example if CCN_UNIT_SIZE == 1, then (0b1101 1110)>>4 returns (0b1110 0000) + and sets r==(0b0000 1101). { N bit, scalar -> N bit } N = n * sizeof(cc_unit) * 8 the _multi version doesn't return the shifted bits, but does support multiple word shifts. */ CC_NONNULL((2, 3)) -cc_unit ccn_shift_right(cc_size n, cc_unit *r, const cc_unit *s, size_t k); +cc_unit ccn_shift_right(cc_size n, cc_unit *r, const cc_unit *s, size_t k) __asm__("_ccn_shift_right"); /* s == 0 -> return 0 | s > 0 -> return index (starting at 1) of most - significant bit that is 1. - { N bit } N = n * sizeof(cc_unit) * 8 */ + * significant bit that is 1. + * { N bit } N = n * sizeof(cc_unit) * 8 + * + * Runs in constant time, independent of the value of `s`. + */ CC_NONNULL((2)) size_t ccn_bitlen(cc_size n, const cc_unit *s); @@ -314,7 +312,7 @@ size_t ccn_bitlen(cc_size n, const cc_unit *s); /* s < t -> return - 1 | s == t -> return 0 | s > t -> return 1 { N bit, N bit -> int } N = n * sizeof(cc_unit) * 8 */ CC_PURE CC_NONNULL((2, 3)) -int ccn_cmp(cc_size n, const cc_unit *s, const cc_unit *t); +int ccn_cmp(cc_size n, const cc_unit *s, const cc_unit *t) __asm__("_ccn_cmp"); /* s < t -> return - 1 | s == t -> return 0 | s > t -> return 1 { N bit, M bit -> int } N = ns * sizeof(cc_unit) * 8 M = nt * sizeof(cc_unit) * 8 */ @@ -332,7 +330,7 @@ int ccn_cmpn(cc_size ns, const cc_unit *s, /* s - t -> r return 1 iff t > s { N bit, N bit -> N bit } N = n * sizeof(cc_unit) * 8 */ CC_NONNULL((2, 3, 4)) -cc_unit ccn_sub(cc_size n, cc_unit *r, const cc_unit *s, const cc_unit *t); +cc_unit ccn_sub(cc_size n, cc_unit *r, const cc_unit *s, const cc_unit *t) __asm__("_ccn_sub"); /* s - v -> r return 1 iff v > s return 0 otherwise. { N bit, sizeof(cc_unit) * 8 bit -> N bit } N = n * sizeof(cc_unit) * 8 */ @@ -353,7 +351,7 @@ cc_unit ccn_subn(cc_size n, cc_unit *r, const cc_unit *s, /* s + t -> r return carry if result doesn't fit in n bits. { N bit, N bit -> N bit } N = n * sizeof(cc_unit) * 8 */ CC_NONNULL((2, 3, 4)) -cc_unit ccn_add(cc_size n, cc_unit *r, const cc_unit *s, const cc_unit *t); +cc_unit ccn_add(cc_size n, cc_unit *r, const cc_unit *s, const cc_unit *t) __asm__("_ccn_add"); /* s + v -> r return carry if result doesn't fit in n bits. { N bit, sizeof(cc_unit) * 8 bit -> N bit } N = n * sizeof(cc_unit) * 8 */ @@ -375,7 +373,7 @@ cc_unit ccn_addn(cc_size n, cc_unit *r, const cc_unit *s, { n bit, n bit -> 2 * n bit } n = count * sizeof(cc_unit) * 8 { N bit, N bit -> 2N bit } N = ccn_bitsof(n) */ CC_NONNULL((2, 3, 4)) -void ccn_mul(cc_size n, cc_unit *r_2n, const cc_unit *s, const cc_unit *t); +void ccn_mul(cc_size n, cc_unit *r_2n, const cc_unit *s, const cc_unit *t) __asm__("_ccn_mul"); /* s[0..n) * v -> r[0..n)+return value { N bit, sizeof(cc_unit) * 8 bit -> N + sizeof(cc_unit) * 8 bit } N = n * sizeof(cc_unit) * 8 */ @@ -387,50 +385,120 @@ cc_unit ccn_mul1(cc_size n, cc_unit *r, const cc_unit *s, const cc_unit v); CC_NONNULL((2, 3)) cc_unit ccn_addmul1(cc_size n, cc_unit *r, const cc_unit *s, const cc_unit v); -#if 0 -/* a % d -> n - {2 * n bit, n bit -> n bit } n = count * sizeof(cc_unit) * 8 */ -CC_NONNULL((2, 3, 4)) -void ccn_mod(cc_size n, cc_unit *r, const cc_unit *a_2n, const cc_unit *d); -#endif -/* r = (data, len) treated as a big endian byte array, return -1 if data - doesn't fit in r, return 0 otherwise. */ +/*! + @function ccn_read_uint + @abstract Copy big endian integer and represent it in cc_units + + @param n Input allocated size of the cc_unit output array r + @param r Ouput cc_unit array for unsigned integer + @param data_nbytes Input byte size of data + @param data Input unsigned integer represented in big endian + + @result r is initialized with the big unsigned number + + @return 0 if no error, !=0 if the big number cannot be represented in the allocated cc_unit array. + + @discussion The execution pattern of this function depends on both n and data_nbytes but not on data values except the handling + of the error case. + */ + CC_NONNULL((2, 4)) -int ccn_read_uint(cc_size n, cc_unit *r, size_t data_size, const uint8_t *data); +int ccn_read_uint(cc_size n, cc_unit *r, size_t data_nbytes, const uint8_t *data); /* r = (data, len) treated as a big endian byte array, return -1 if data doesn't fit in r, return 0 otherwise. ccn_read_uint strips leading zeroes and doesn't care about sign. */ #define ccn_read_int(n, r, data_size, data) ccn_read_uint(n, r, data_size, data) -/* Return actual size in bytes needed to serialize s. */ -CC_PURE CC_NONNULL((2)) -size_t ccn_write_uint_size(cc_size n, const cc_unit *s); +/*! + @function ccn_write_uint_size + @abstract Compute the minimum size required to store an big integer + + @param n Input size of the cc_unit array representing the input + @param s Input cc_unit array + + @result Return value is the exact byte size of the big integer + + @discussion + The execution flow is independent on the value of the big integer. + However, the use of the returned value may leak the position of the most significant byte + */ +CC_PURE CC_NONNULL((2)) size_t ccn_write_uint_size(cc_size n, const cc_unit *s); -/* Serialize s, to out. - First byte of byte stream is the m.s. byte of s, - regardless of the size of cc_unit. +/*! + @function ccn_write_uint + @abstract Serialize the big integer into a big endian byte buffer - No assumption is made about the alignment of out. + @param n Input size of the cc_unit array representing the input + @param s Input cc_unit array + @param out_size Size of the output buffer + @param out Output byte array of size at least out_size + + @discussion This function writes exactly + MIN(out_size,ccn_write_uint_size(n,s)) bytes truncating to keep the + most significant bytes when out_size bytesInKey) ? out_size - bytesInKey : 0; + @param n Input size of the cc_unit array representing the input + @param s Input cc_unit array + @param out_size Size of the output buffer + @param out Output byte array of size at least out_size + + @return number of leading zero bytes in case of success, a negative error value in case of failure + + @result This function writes exactly out_size byte, padding with zeroes when necessary. + This function DOES NOT support truncation and returns an error if out_size < ccn_write_uint_size + + @discussion The execution flow of function is independent on the value of the big integer + However, the processing of the return value by the caller may expose the position of + the most significant byte + */ +CC_NONNULL((2, 4)) +int ccn_write_uint_padded_ct(cc_size n, const cc_unit *s, size_t out_size, uint8_t *out); + +/*! + @function ccn_write_uint_padded + @abstract Serialize the big integer into a big endian byte buffer + Not recommended, for most cases ccn_write_uint_padded_ct is more appropriate + Sensitive big integers are exposed since the processing expose the position of the MS byte + + @param n Input size of the cc_unit array representing the input + @param s Input cc_unit array + @param out_size Size of the output buffer + @param out Output byte array of size at least out_size - cc_zero(offset, to); - ccn_write_uint(n, s, out_size - offset, to + offset); + @return number of leading zero bytes + @result This function writes exactly out_size byte, padding with zeroes when necessary. + This function DOES support truncation when out_size= 0) { + // It worked + offset = (size_t)offset_int; + } else { + // Truncation case, execution depends on the position of the MSByte + ccn_write_uint(n, s, out_size, out); + } return offset; } @@ -456,11 +524,11 @@ void ccn_write_int(cc_size n, const cc_unit *s, size_t out_size, void *out); /* s -> r { n bit -> n bit } */ CC_NONNULL((2, 3)) -void ccn_set(cc_size n, cc_unit *r, const cc_unit *s); +void ccn_set(cc_size n, cc_unit *r, const cc_unit *s) __asm__("_ccn_set"); CC_INLINE CC_NONNULL((2)) void ccn_zero(cc_size n, cc_unit *r) { - cc_zero(ccn_sizeof_n(n),r); + cc_clear(ccn_sizeof_n(n),r); } CC_INLINE CC_NONNULL((2)) diff --git a/EXTERNAL_HEADERS/corecrypto/ccrng.h b/EXTERNAL_HEADERS/corecrypto/ccrng.h index c6bc18a90..731f3e7bc 100644 --- a/EXTERNAL_HEADERS/corecrypto/ccrng.h +++ b/EXTERNAL_HEADERS/corecrypto/ccrng.h @@ -13,24 +13,27 @@ #include -#define CCRNG_STATE_COMMON \ +#define CCRNG_STATE_COMMON \ int (*generate)(struct ccrng_state *rng, size_t outlen, void *out); -/* default state structure. Do not instantiate, ccrng() returns a reference to this structure */ +/*! + @type struct ccrng_state + @abstract Default state structure. Do not instantiate. ccrng() returns a reference to this structure + */ struct ccrng_state { CCRNG_STATE_COMMON }; /*! @function ccrng - @abstract initializes a AES-CTR mode cryptographic random number generator and returns the statically alocated rng object. - Getting a pointer to a ccrng has never been simpler! + @abstract Initializes an AES-CTR mode cryptographic random number generator and returns the statically-allocated rng object. + Getting a pointer to a ccrng has never been simpler! Call this function, get an rng object and then pass the object to ccrng_generate() to generate randoms. ccrng() may be called more than once. It returns pointer to the same object on all calls. @result a cryptographically secure random number generator or NULL if fails - - @discussion + + @discussion - It is significantly faster than using the system /dev/random - FIPS Compliant: NIST SP800-80A + FIPS 140-2 - Seeded from the system entropy. @@ -42,7 +45,29 @@ struct ccrng_state { struct ccrng_state *ccrng(int *error); -//call this macro with the rng argument set to output of the call to the ccrng() function -#define ccrng_generate(rng, outlen, out) ((rng)->generate((struct ccrng_state *)(rng), (outlen), (out))) +/*! + @function ccrng_generate + @abstract Generate `outlen` bytes of output, stored in `out`, using ccrng_state `rng`. + + @param rng `struct ccrng_state` representing the state of the RNG. + @param outlen Amount of random bytes to generate. + @param out Pointer to memory where random bytes are stored, of size at least `outlen`. + + @result 0 on success and nonzero on failure. + */ +#define ccrng_generate(rng, outlen, out) \ + ((rng)->generate((struct ccrng_state *)(rng), (outlen), (out))) + +/*! + @function ccrng_uniform + @abstract Generate a random value in @p [0, bound). + + @param rng The state of the RNG. + @param bound The exclusive upper bound on the output. + @param rand A pointer to a single @p uint64_t to store the result. + + @result Returns zero iff the operation is successful. + */ +int ccrng_uniform(struct ccrng_state *rng, uint64_t bound, uint64_t *rand); #endif /* _CORECRYPTO_CCRNG_H_ */ diff --git a/EXTERNAL_HEADERS/corecrypto/ccrng_system.h b/EXTERNAL_HEADERS/corecrypto/ccrng_system.h deleted file mode 100644 index a5aab7ed2..000000000 --- a/EXTERNAL_HEADERS/corecrypto/ccrng_system.h +++ /dev/null @@ -1,42 +0,0 @@ -/* - * ccrng_system.h - * corecrypto - * - * Created on 12/13/2010 - * - * Copyright (c) 2010,2013,2014,2015 Apple Inc. All rights reserved. - * - */ - -#ifndef _CORECRYPTO_CCRNG_SYSTEM_H_ -#define _CORECRYPTO_CCRNG_SYSTEM_H_ - -#include - -struct ccrng_system_state { - CCRNG_STATE_COMMON - int fd; -}; - -/*! - @function ccrng_system_init - DEPRECATED - @abstract Default ccrng. - Please transition to ccrng() which is easier to use and with provide the fastest, most secure option - - @param rng Structure containing the state of the RNG, must remain allocated as - long as the rng is used. - @result 0 iff successful - - @discussion - This RNG require call to "init" AND "done", otherwise it may leak a file descriptor. - */ - -// Initialize ccrng -// Deprecated, if you need a rng, just call the function ccrng() -int ccrng_system_init(struct ccrng_system_state *rng); - -// Close the system RNG -// Mandatory step to avoid leaking file descriptor -void ccrng_system_done(struct ccrng_system_state *rng); - -#endif /* _CORECRYPTO_CCRNG_SYSTEM_H_ */ diff --git a/EXTERNAL_HEADERS/corecrypto/ccrsa.h b/EXTERNAL_HEADERS/corecrypto/ccrsa.h index 0f70c3740..a2baa932b 100644 --- a/EXTERNAL_HEADERS/corecrypto/ccrsa.h +++ b/EXTERNAL_HEADERS/corecrypto/ccrsa.h @@ -56,7 +56,7 @@ typedef struct ccrsa_priv_ctx* ccrsa_priv_ctx_t; /* Declare a fully scheduled rsa key. Size is the size in bytes each ccn in the key. For example to declare (on the stack or in a struct) a 1021 bit - rsa public key named foo use ccrsa_pub_ctx_decl(ccn_sizeof(1021), foo). + rsa public key named foo use ccrsa_pub_ctx_decl(ccn_sizeof(1021), foo). */ #define ccrsa_full_ctx_decl(_size_, _name_) cc_ctx_decl(struct ccrsa_full_ctx, ccrsa_full_ctx_size(_size_), _name_) #define ccrsa_full_ctx_clear(_size_, _name_) cc_clear(ccrsa_full_ctx_size(_size_), _name_) @@ -84,7 +84,7 @@ typedef struct ccrsa_priv_ctx* ccrsa_priv_ctx_t; #define ccrsa_ctx_private_qinv(FK) ((ccrsa_get_private_ctx_ptr(FK))->pv_ccn + 6 * ccrsa_ctx_private_zp(FK)->n + 2 + ccn_nof_size(sizeof(struct cczp))) /* rvalue accessors to ccec_key fields. */ -CC_CONST CC_INLINE +CC_INLINE ccrsa_priv_ctx_t ccrsa_get_private_ctx_ptr(ccrsa_full_ctx_t fk) { ccrsa_priv_ctx_t priv = (ccrsa_priv_ctx_t)(ccrsa_ctx_d(fk)+ccrsa_ctx_n(fk)); return priv; @@ -96,16 +96,14 @@ ccrsa_priv_ctx_t ccrsa_get_private_ctx_ptr(ccrsa_full_ctx_t fk) { @param fk RSA full key @result Returns RSA public ker */ -CC_CONST CC_INLINE +CC_INLINE ccrsa_pub_ctx_t ccrsa_ctx_public(ccrsa_full_ctx_t fk) { return (ccrsa_pub_ctx_t) fk; } /* Return exact key bit size */ -static inline size_t -ccrsa_pubkeylength(ccrsa_pub_ctx_t pubk) { - return cczp_bitlen(ccrsa_ctx_zm(pubk)); -} +CC_NONNULL_ALL +size_t ccrsa_pubkeylength(ccrsa_pub_ctx_t pubk); /* PKCS1 pad_markers */ #define CCRSA_PKCS1_PAD_SIGN 1 @@ -116,6 +114,33 @@ CC_NONNULL((1, 2, 3)) int ccrsa_init_pub(ccrsa_pub_ctx_t key, const cc_unit *modulus, const cc_unit *e); +/*! + @function ccrsa_make_priv + @abstract Initialize public and private key based on modulus and e, p and q as big endian byte arrays; + + @param full_ctx Initialized context with full_ctx->zp.n already set to 2*ccn_nof_size(p_mbytes) + @param exp_mbytes Number of bytes in big endian e. + @param exp_in pointer to big endian exponent e (may have leading 0's). + @param p_mbytes Number of bytes in big endian p. + @param p_in Pointer to the rsa p. + @param q_mbytes Number of bytes in big endian q. + @param q_in Pointer to the rsa q. + @result 0 iff successful. + + @discussion full_ctx->zp.n must already be set to 2*ccn_nof_size(p_mbytes), witt the expectation that p_mbytes>q_mbytes. + e is the public exponent, and exp_mbytes<= 2*p_mbytes. + The output is a fully formed rsa context with N=pq, d=e^{-1} mod phi(N), and appropriate inverses of different associated values precomputed + to speed computation. + */ + +int ccrsa_make_priv(ccrsa_full_ctx_t full_ctx, + size_t exp_mbytes, + const uint8_t *exp_in, + size_t p_mbytes, + const uint8_t *p_in, + size_t q_mbytes, + const uint8_t *q_in); + /* Initialize key based on modulus and e as big endian byte array key->zp.n must already be set. */ CC_NONNULL((1, 3, 5)) @@ -139,12 +164,15 @@ CC_NONNULL((2, 4, 5)) int ccrsa_generate_key(size_t nbits, ccrsa_full_ctx_t rsa_ctx, size_t e_size, const void *e, struct ccrng_state *rng) CC_WARN_RESULT; -/* Generate RSA key in conformance with FIPS186-4 standard */ +/* Generate RSA key in conformance with FIPS186-4 standard. + The first RNG `rng` will be used to generate p and q. + The second RNG `rng_mr` will be used only for primality testing. + This is relevant only for testing, just pass the same RNG twice. */ CC_NONNULL((2, 4, 5, 6)) int ccrsa_generate_fips186_key(size_t nbits, ccrsa_full_ctx_t fk, size_t e_size, const void *eBytes, - struct ccrng_state *rng1, struct ccrng_state *rng2) CC_WARN_RESULT; + struct ccrng_state *rng, struct ccrng_state *rng_mr) CC_WARN_RESULT; /* Construct RSA key from fix input in conformance with FIPS186-4 standard */ CC_NONNULL((3, 5, 7, 9, 11, 13, 15, 16)) @@ -221,7 +249,7 @@ int ccrsa_verify_pss(ccrsa_pub_ctx_t key, for the output signature @result 0 iff successful. - + @discussion Null OID is a special case, required to support RFC 4346 where the padding is based on SHA1+MD5. In general it is not recommended to use a NULL OID, except when strictly required for interoperability @@ -261,9 +289,9 @@ int ccrsa_verify_pkcs1v15(ccrsa_pub_ctx_t key, const uint8_t *oid, /*! @function ccder_encode_rsa_pub_size @abstract Calculate size of public key export format data package. - + @param key Public key - + @result Returns size required for encoding. */ @@ -273,7 +301,7 @@ size_t ccder_encode_rsa_pub_size(const ccrsa_pub_ctx_t key); /*! @function ccrsa_export_priv_pkcs1 @abstract Export a public key. - + @param key Public key @param der Beginning of output DER buffer @param der_end End of output DER buffer @@ -286,9 +314,9 @@ uint8_t *ccder_encode_rsa_pub(const ccrsa_pub_ctx_t key, uint8_t *der, uint8_t * /*! @function ccder_encode_rsa_priv_size @abstract Calculate size of full key exported in PKCS#1 format. - + @param key Full key - + @result Returns size required for encoding. */ @@ -298,7 +326,7 @@ size_t ccder_encode_rsa_priv_size(const ccrsa_full_ctx_t key); /*! @function ccder_encode_rsa_priv @abstract Export a full key in PKCS#1 format. - + @param key Full key @param der Beginning of output DER buffer @param der_end End of output DER buffer @@ -311,10 +339,10 @@ uint8_t *ccder_encode_rsa_priv(const ccrsa_full_ctx_t key, const uint8_t *der, u @function ccder_decode_rsa_pub_n @abstract Calculate "n" for a public key imported from a data package. PKCS #1 format - + @param der Beginning of input DER buffer @param der_end End of input DER buffer - + @result the "n" of the RSA key that would result from the import. This can be used to declare the key itself. */ @@ -326,11 +354,11 @@ cc_size ccder_decode_rsa_pub_n(const uint8_t *der, const uint8_t *der_end); @function ccder_decode_rsa_pub @abstract Import a public RSA key from a package in public key format. PKCS #1 format - + @param key Public key (n must be set) @param der Beginning of input DER buffer @param der_end End of input DER buffer - + @result Key is initialized using the data in the public key message. */ @@ -369,10 +397,10 @@ const uint8_t *ccder_decode_rsa_pub_x509(const ccrsa_pub_ctx_t key, const uint8_ /*! @function ccder_decode_rsa_priv_n @abstract Calculate "n" for a private key imported from a data package. - + @param der Beginning of input DER buffer @param der_end End of input DER buffer - + @result the "n" of the RSA key that would result from the import. This can be used to declare the key itself. */ @@ -383,11 +411,11 @@ cc_size ccder_decode_rsa_priv_n(const uint8_t *der, const uint8_t *der_end); /*! @function ccder_decode_rsa_priv @abstract Import a private RSA key from a package in PKCS#1 format. - + @param key Full key (n must be set) @param der Beginning of input DER buffer @param der_end End of input DER buffer - + @result Key is initialized using the data in the public key message. */ @@ -397,13 +425,13 @@ const uint8_t *ccder_decode_rsa_priv(const ccrsa_full_ctx_t key, const uint8_t * /*! @function ccrsa_export_pub_size @abstract Calculate size of public key exported data package. - + @param key Public key - + @result Returns size required for encoding. */ -CC_CONST CC_INLINE CC_NONNULL((1)) +CC_INLINE CC_NONNULL((1)) size_t ccrsa_export_pub_size(const ccrsa_pub_ctx_t key) { return ccder_encode_rsa_pub_size(key); } @@ -411,7 +439,7 @@ size_t ccrsa_export_pub_size(const ccrsa_pub_ctx_t key) { /*! @function ccrsa_export_pub @abstract Export a public key in public key format. - + @param key Public key @param out_len Allocated size @param out Output buffer @@ -422,15 +450,15 @@ int ccrsa_export_pub(const ccrsa_pub_ctx_t key, size_t out_len, uint8_t *out); /*! @function ccrsa_import_pub_n @abstract Calculate "n" for a public key imported from a data package. - + @param inlen Length of public key package data @param der pointer to public key package data - + @result the "n" of the RSA key that would result from the import. This can be used to declare the key itself. */ -CC_CONST CC_INLINE CC_NONNULL((2)) +CC_INLINE CC_NONNULL((2)) cc_size ccrsa_import_pub_n(size_t inlen, const uint8_t *der) { cc_size size = ccder_decode_rsa_pub_x509_n(der, der + inlen); if(size == 0) { @@ -442,11 +470,11 @@ cc_size ccrsa_import_pub_n(size_t inlen, const uint8_t *der) { /*! @function ccrsa_import_pub @abstract Import a public RSA key from a package in public key format. - + @param key Public key (n must be set) @param inlen Length of public key package data @param der pointer to public key package data - + @result Key is initialized using the data in the public key message. */ @@ -456,13 +484,13 @@ int ccrsa_import_pub(ccrsa_pub_ctx_t key, size_t inlen, const uint8_t *der); /*! @function ccrsa_export_priv_size @abstract Calculate size of full key exported in PKCS#1 format. - + @param key Full key - + @result Returns size required for encoding. */ -CC_CONST CC_INLINE CC_NONNULL((1)) +CC_INLINE CC_NONNULL((1)) size_t ccrsa_export_priv_size(const ccrsa_full_ctx_t key) { return ccder_encode_rsa_priv_size(key); } @@ -470,13 +498,13 @@ size_t ccrsa_export_priv_size(const ccrsa_full_ctx_t key) { /*! @function ccrsa_export_priv @abstract Export a full key in PKCS#1 format. - + @param key Full key @param out_len Allocated size @param out Output buffer */ -CC_CONST CC_INLINE CC_NONNULL((1, 3)) +CC_INLINE CC_NONNULL((1, 3)) int ccrsa_export_priv(const ccrsa_full_ctx_t key, size_t out_len, uint8_t *out) { return (ccder_encode_rsa_priv(key, out, out+out_len) != out); } @@ -484,15 +512,15 @@ int ccrsa_export_priv(const ccrsa_full_ctx_t key, size_t out_len, uint8_t *out) /*! @function ccrsa_import_priv_n @abstract Calculate size of full key exported in PKCS#1 format. - + @param inlen Length of PKCS#1 package data @param der pointer to PKCS#1 package data - + @result the "n" of the RSA key that would result from the import. This can be used to declare the key itself. */ -CC_CONST CC_INLINE CC_NONNULL((2)) +CC_INLINE CC_NONNULL((2)) cc_size ccrsa_import_priv_n(size_t inlen, const uint8_t *der) { return ccder_decode_rsa_priv_n(der, der + inlen); } @@ -500,15 +528,15 @@ cc_size ccrsa_import_priv_n(size_t inlen, const uint8_t *der) { /*! @function ccrsa_import_priv @abstract Import a full RSA key from a package in PKCS#1 format. - + @param key Full key (n must be set) @param inlen Length of PKCS#1 package data @param der pointer to PKCS#1 package data - + @result Key is initialized using the data in the PKCS#1 message. */ -CC_CONST CC_INLINE CC_NONNULL((1, 3)) +CC_INLINE CC_NONNULL((1, 3)) int ccrsa_import_priv(ccrsa_full_ctx_t key, size_t inlen, const uint8_t *der) { return (ccder_decode_rsa_priv(key, der, der+inlen) == NULL); } diff --git a/EXTERNAL_HEADERS/corecrypto/ccsha1.h b/EXTERNAL_HEADERS/corecrypto/ccsha1.h index 3f343401e..4dc3c5194 100644 --- a/EXTERNAL_HEADERS/corecrypto/ccsha1.h +++ b/EXTERNAL_HEADERS/corecrypto/ccsha1.h @@ -29,8 +29,8 @@ extern const struct ccdigest_info ccsha1_eay_di; extern const struct ccdigest_info ccsha1_vng_intel_SupplementalSSE3_di; #endif -#if CCSHA1_VNG_ARMV7NEON -extern const struct ccdigest_info ccsha1_vng_armv7neon_di; +#if CCSHA1_VNG_ARM +extern const struct ccdigest_info ccsha1_vng_arm_di; #endif /* TODO: Placeholders */ diff --git a/EXTERNAL_HEADERS/corecrypto/ccsha2.h b/EXTERNAL_HEADERS/corecrypto/ccsha2.h index 995ef7e26..e80c70e9e 100644 --- a/EXTERNAL_HEADERS/corecrypto/ccsha2.h +++ b/EXTERNAL_HEADERS/corecrypto/ccsha2.h @@ -42,9 +42,14 @@ extern const struct ccdigest_info ccsha256_ltc_di; extern const struct ccdigest_info ccsha224_vng_intel_SupplementalSSE3_di; extern const struct ccdigest_info ccsha256_vng_intel_SupplementalSSE3_di; #endif -#if CCSHA2_VNG_ARMV7NEON -extern const struct ccdigest_info ccsha224_vng_armv7neon_di; -extern const struct ccdigest_info ccsha256_vng_armv7neon_di; +#if CCSHA2_VNG_ARM +extern const struct ccdigest_info ccsha224_vng_arm_di; +extern const struct ccdigest_info ccsha256_vng_arm_di; +#if CC_ACCELERATECRYPTO && defined(__arm64__) && CCSHA2_VNG_ARM +extern const struct ccdigest_info ccsha256_vng_arm64neon_di; +#endif // CC_ACCELERATECRYPTO +extern const struct ccdigest_info ccsha384_vng_arm_di; +extern const struct ccdigest_info ccsha512_vng_arm_di; #endif /* SHA224 */ diff --git a/EXTERNAL_HEADERS/corecrypto/cczp.h b/EXTERNAL_HEADERS/corecrypto/cczp.h index d392432dc..e77f6b863 100644 --- a/EXTERNAL_HEADERS/corecrypto/cczp.h +++ b/EXTERNAL_HEADERS/corecrypto/cczp.h @@ -19,7 +19,7 @@ definitions. Declare cczp objects using cczp_decl_n(). It allocates cc_unit arrays of the length returned by - either cczp_nof_n() or cczp_short_nof_n(). + cczp_nof_n(). */ struct cczp; @@ -27,7 +27,7 @@ struct cczp; typedef struct cczp *cczp_t; typedef const struct cczp *cczp_const_t; -typedef void (*ccmod_func_t)(cc_ws_t ws, cczp_const_t zp, cc_unit *r, const cc_unit *s); +typedef void (*ccmod_func_t)(cc_ws_t ws, cczp_const_t zp, cc_unit *t, const cc_unit *x, const cc_unit *y); // keep cczp_hd and cczp structures consistent // cczp_hd is typecasted to cczp to read EC curve params @@ -36,7 +36,7 @@ typedef void (*ccmod_func_t)(cc_ws_t ws, cczp_const_t zp, cc_unit *r, const cc_u #define __CCZP_HEADER_ELEMENTS_DEFINITIONS(pre) \ cc_size pre##n; \ cc_unit pre##options; \ - ccmod_func_t pre##mod_prime; + ccmod_func_t pre##mulmod_prime; #define __CCZP_ELEMENTS_DEFINITIONS(pre) \ __CCZP_HEADER_ELEMENTS_DEFINITIONS(pre) \ @@ -60,85 +60,44 @@ struct cczp { #define cczp_nof_n(_n_) (ccn_nof_size(sizeof(struct cczp)) + 1 + 2 * (_n_)) /* Return number of units that a struct cczp needs to be in units for a prime - size of _n_ units. The _short variant does not have room for CCZP_RECIP, - so it can not be used with cczp_mod, cczp_mul, cczp_sqr. It can be used - with cczp_add, cczp_sub, cczp_div2, cczp_mod_inv. */ -#define cczp_short_nof_n(_n_) (ccn_nof_size(sizeof(struct cczp)) + (_n_)) - + size of _n_ units. */ #define cczp_decl_n(_n_, _name_) cc_ctx_decl(struct cczp, ccn_sizeof_n(cczp_nof_n(_n_)), _name_) -#define cczp_short_decl_n(_n_, _name_) \ - cc_ctx_decl(struct cczp_short, ccn_sizeof_n(cczp_short_nof_n(_n_)), _name_) - #define cczp_clear_n(_n_, _name_) cc_clear(ccn_sizeof_n(cczp_nof_n(_n_)), _name_) -#define cczp_short_clear_n(_n_, _name_) cc_clear(ccn_sizeof_n(cczp_short_nof_n(_n_)), _name_) #define CCZP_N(ZP) ((ZP)->n) -#define CCZP_MOD(ZP) ((ZP)->mod_prime) -#define CCZP_MOD_PRIME(ZP) CCZP_MOD(ZP) #define CCZP_PRIME(ZP) ((ZP)->ccn) #define CCZP_RECIP(ZP) ((ZP)->ccn + CCZP_N(ZP)) -#define CCZP_OPS(ZP) ((ZP)->options) -CC_CONST CC_NONNULL((1)) static inline cc_size cczp_n(cczp_const_t zp) +CC_NONNULL((1)) CC_INLINE cc_size cczp_n(cczp_const_t zp) { return zp->n; } -CC_CONST CC_NONNULL((1)) static inline cc_unit cczp_options(cczp_const_t zp) -{ - return zp->options; -} - -CC_CONST CC_NONNULL((1)) static inline ccmod_func_t cczp_mod_prime(cczp_const_t zp) -{ - return zp->mod_prime; -} - -CC_CONST CC_NONNULL((1)) static inline const cc_unit *cczp_prime(cczp_const_t zp) +CC_NONNULL((1)) CC_INLINE const cc_unit *cczp_prime(cczp_const_t zp) { return zp->ccn; } /* Return a pointer to the Reciprocal or Montgomery constant of zp, which is allocated cczp_n(zp) + 1 units long. */ -CC_CONST CC_NONNULL((1)) - - static inline const cc_unit *cczp_recip(cczp_const_t zp) +CC_NONNULL((1)) CC_INLINE const cc_unit *cczp_recip(cczp_const_t zp) { return zp->ccn + zp->n; } -CC_CONST CC_NONNULL((1)) CC_INLINE size_t cczp_bitlen(cczp_const_t zp) -{ - return ccn_bitlen(cczp_n(zp), cczp_prime(zp)); -} - /* Ensure both cczp_mod_prime(zp) and cczp_recip(zp) are valid. cczp_n and - cczp_prime must have been previously initialized. */ + cczp_prime must have been previously initialized. The reciprocal will + be computed and set. */ CC_NONNULL((1)) int cczp_init(cczp_t zp); -/* Compute r = s2n mod cczp_prime(zp). Will write cczp_n(zp) - units to r and reads 2 * cczp_n(zp) units units from s2n. If r and s2n are not - identical they must not overlap. Before calling this function either - cczp_init(zp) must have been called or both CCZP_MOD_PRIME((cc_unit *)zp) - and CCZP_RECIP((cc_unit *)zp) must be initialized some other way. */ -CC_NONNULL((1, 2, 3)) void cczp_mod(cc_ws_t ws, cczp_const_t zp, cc_unit *r, const cc_unit *s2n); - -/* Compute r = sn mod cczp_prime(zp), Will write cczp_n(zp) - units to r and reads sn units units from s. If r and s are not - identical they must not overlap. Before calling this function either - cczp_init(zp) must have been called or both CCZP_MOD_PRIME((cc_unit *)zp) - and CCZP_RECIP((cc_unit *)zp) must be initialized some other way. */ -CC_NONNULL((1, 2, 4)) int cczp_modn(cczp_const_t zp, cc_unit *r, cc_size ns, const cc_unit *s); - -/* Compute r = x * y mod cczp_prime(zp). Will write cczp_n(zp) units to r - and reads cczp_n(zp) units units from both x and y. If r and x are not - identical they must not overlap, The same holds for r and y. Before - calling this function either cczp_init(zp) must have been called or both - CCZP_MOD_PRIME((cc_unit *)zp) and CCZP_RECIP((cc_unit *)zp) must be - initialized some other way. */ -CC_NONNULL((1, 2, 3, 4)) -void cczp_mul(cczp_const_t zp, cc_unit *t, const cc_unit *x, const cc_unit *y); +/*! @function cczp_init_with_recip + @abstract Initializes a cczp struct with a given reciprocal. + + @param zp Pointer to a cczp struct. + @param recip Reciprocal for zp's prime. + */ +CC_NONNULL((1, 2)) +void cczp_init_with_recip(cczp_t zp, const cc_unit *recip); /* Compute r = m ^ e mod cczp_prime(zp), using Montgomery ladder. - writes cczp_n(zp) units to r @@ -152,21 +111,6 @@ void cczp_mul(cczp_const_t zp, cc_unit *t, const cc_unit *x, const cc_unit *y); CC_NONNULL((1, 2, 3, 4)) int cczp_power(cczp_const_t zp, cc_unit *r, const cc_unit *m, const cc_unit *e); -/* Compute r = m ^ e mod cczp_prime(zp), using Square Square Multiply Always. - - writes cczp_n(zp) units to r - - reads cczp_n(zp) units units from m and e - - if r and m are not identical they must not overlap. - - r and e must not overlap nor be identical. - - before calling this function either cczp_init(zp) must have been called - or both CCZP_MOD_PRIME((cc_unit *)zp) and CCZP_RECIP((cc_unit *)zp) must - be initialized some other way. - - Important: This function is intented to be constant time but is more likely - to leak information due to memory cache. Only used with randomized input - */ -CC_NONNULL((1, 2, 3, 4)) -int cczp_power_ssma(cczp_const_t zp, cc_unit *r, const cc_unit *m, const cc_unit *e); - /*! @brief cczp_inv(zp, r, x) computes r = x^-1 (mod p) , where p=cczp_prime(zp). @discussion It is a general function and works for any p. It validates the inputs. r and x can @@ -182,32 +126,4 @@ int cczp_power_ssma(cczp_const_t zp, cc_unit *r, const cc_unit *m, const cc_unit CC_NONNULL((1, 2, 3)) int cczp_inv(cczp_const_t zp, cc_unit *r, const cc_unit *x); -/*! - @brief cczp_inv_odd(zp, r, x) computes r = x^-1 (mod p) , where p=cczp_prime(zp) is an odd number. - @discussion r and x can overlap. - @param zp The input zp. cczp_n(zp) and cczp_prime(zp) need to be valid. cczp_init(zp) need not to - be called before invoking. - @param x input big integer - @param r output big integer - @return 0 if successful - */ -CC_NONNULL((1, 2, 3)) int cczp_inv_odd(cczp_const_t zp, cc_unit *r, const cc_unit *x); - -/*! - @brief cczp_inv_field(zp, r, x) computes r = x^-1 (mod p) , where p=cczp_prime(zp) is a prime - number number. - @discussion r and x must NOT overlap. The excution time of the function is independent to the value - of the input x. It works only if p is a field. That is, when p is a prime. It supports Montgomery - and non-Montgomery form of zp. It leaks the value of the prime and should only be used be used for - public (not secret) primes (ex. Elliptic Curves) - - @param zp The input zp. cczp_n(zp) and cczp_prime(zp) need to be valid. cczp_init(zp) need not to - be called before invoking cczp_inv_field(). - @param x input big unteger - @param r output big integer - @return 0 if inverse exists and correctly computed. - */ -CC_NONNULL((1, 2, 3)) -int cczp_inv_field(cczp_const_t zp, cc_unit *r, const cc_unit *x); - #endif /* _CORECRYPTO_CCZP_H_ */ diff --git a/EXTERNAL_HEADERS/img4/api.h b/EXTERNAL_HEADERS/img4/api.h index ecaf2efed..9cd1d4e4b 100644 --- a/EXTERNAL_HEADERS/img4/api.h +++ b/EXTERNAL_HEADERS/img4/api.h @@ -40,7 +40,7 @@ * individual preprocessor macros in this header that declare new behavior as * required. */ -#define IMG4_API_VERSION (20181106u) +#define IMG4_API_VERSION (20190125u) #if !defined(KERNEL) && !IMG4_PROJECT_BUILD #define IMG4_API_AVAILABLE_20180112 \ @@ -52,11 +52,23 @@ #define IMG4_API_AVAILABLE_20181106 \ __API_UNAVAILABLE(macos) \ API_AVAILABLE(ios(12.2), tvos(12.2), watchos(5.2)) -#define IMG4_API_AVAILABLE_20181106 +#define IMG4_API_AVAILABLE_20190125 \ + API_AVAILABLE(macos(10.15), ios(13.0), tvos(13.0), watchos(6.0)) #else #define IMG4_API_AVAILABLE_20180112 #define IMG4_API_AVAILABLE_20181004 #define IMG4_API_AVAILABLE_20181106 +#define IMG4_API_AVAILABLE_20190125 +#endif // !defined(KERNEL) && !IMG4_PROJECT_BUILD + +#if !defined(OS_CLOSED_ENUM) +#define OS_CLOSED_ENUM(_name, _type, ...) \ + OS_ENUM(_name, _type, ## __VA_ARGS__) +#endif + +#if !defined(OS_CLOSED_OPTIONS) +#define OS_CLOSED_OPTIONS(_name, _type, ...) \ + OS_ENUM(_name, _type, ## __VA_ARGS__) #endif /*! diff --git a/EXTERNAL_HEADERS/img4/environment.h b/EXTERNAL_HEADERS/img4/environment.h index 5f5ba1d02..6942de840 100644 --- a/EXTERNAL_HEADERS/img4/environment.h +++ b/EXTERNAL_HEADERS/img4/environment.h @@ -9,6 +9,10 @@ #error "Please #include instead of this file directly" #endif // __IMG4_INDIRECT +#if IMG4_TAPI +#include "tapi.h" +#endif + /*! * @typedef img4_environment_t * An opaque type describing an Image4 environment. @@ -21,7 +25,7 @@ typedef struct _img4_environment img4_environment_t; * resolve the environment. This is the environment against which manifests are * personalized. */ -#if !MACH_KERNEL_PRIVATE +#if !XNU_KERNEL_PRIVATE IMG4_API_AVAILABLE_20180112 OS_EXPORT const struct _img4_environment _img4_environment_platform; @@ -37,7 +41,7 @@ const struct _img4_environment _img4_environment_platform; * environment should be used as a fallback when validation against the platform * fails, and the caller is handling a loadable trust cache. */ -#if !MACH_KERNEL_PRIVATE +#if !XNU_KERNEL_PRIVATE IMG4_API_AVAILABLE_20181004 OS_EXPORT const struct _img4_environment _img4_environment_trust_cache; diff --git a/EXTERNAL_HEADERS/img4/img4.h b/EXTERNAL_HEADERS/img4/img4.h index c3faf5a28..cb68c4645 100644 --- a/EXTERNAL_HEADERS/img4/img4.h +++ b/EXTERNAL_HEADERS/img4/img4.h @@ -137,18 +137,40 @@ #include #include +#if KERNEL +#if !defined(OS_CLOSED_ENUM) +#define OS_CLOSED_ENUM(...) OS_ENUM(__VA_ARGS__) +#endif + +#if !defined(OS_OPTIONS) +#define OS_OPTIONS(...) OS_ENUM(__VA_ARGS__) +#endif + +#if !defined(OS_CLOSED_OPTIONS) +#define OS_CLOSED_OPTIONS(...) OS_ENUM(__VA_ARGS__) +#endif +#endif + #define __IMG4_INDIRECT 1 /* - * This header is used in the pmap layer in xnu, which is in osfmk, which does - * not have access to most of the BSD headers. (But for some reason it does have - * access to sys/cdefs.h.) The only thing we need from that header is the - * errno_t typedef though, so if we can't get to it, then just typedef it - * ourselves. + * When used from the pmap layer, this header pulls in the types from libsa, + * which conflict with the BSD sys/types.h header that we need to pull in. But + * we only need it for the errno_t typedef and the vnode_t typedef. So when + * building MACH_KERNEL_PRIVATE, we do two things: + * + * 1. Explicitly pull in , so we get errno_t and + * nothing else (no transitive #include's) + * 2. #define _SYS_TYPES_H_ before #includ'ing so that + * we don't get the transitive #include of but we still get + * the definitions we need */ #if MACH_KERNEL_PRIVATE -typedef int errno_t; +#define _SYS_TYPES_H_ 1 +#include +#include #else +#include #include #endif @@ -238,7 +260,7 @@ typedef void (*img4_destructor_t)( * It is illegal to use a manifest which possesses a CHMH tag as a first-stage * manifest. */ -OS_ENUM(img4_flags, uint64_t, +OS_CLOSED_OPTIONS(img4_flags, uint64_t, I4F_INIT = 0, I4F_TRUST_MANIFEST = (1 << 0), I4F_FORCE_MIXNMATCH = (1 << 1), @@ -264,12 +286,13 @@ typedef struct _img4 { #endif } img4_t; -typedef char _img4_payload_opaque_data_64[496]; +typedef char _img4_payload_opaque_data_64[504]; -#if __ARM_ARCH_7S__ || __i386__ -typedef char _img4_payload_opaque_data_32[324]; -#else +#if __ARM_ARCH_7A__ || __ARM_ARCH_7S__ || __ARM_ARCH_7K__ || \ + __ARM64_ARCH_8_32__ || __i386__ typedef char _img4_payload_opaque_data_32[328]; +#else +typedef char _img4_payload_opaque_data_32[332]; #endif /*! @@ -330,7 +353,7 @@ typedef struct _img4_payload { * The bytes given to this routine must represent an Image4 manifest. They may * optionally also represent an Image4 payload. */ -#if !MACH_KERNEL_PRIVATE +#if !XNU_KERNEL_PRIVATE IMG4_API_AVAILABLE_20180112 OS_EXPORT OS_WARN_RESULT OS_NONNULL1 OS_NONNULL3 errno_t @@ -361,7 +384,7 @@ img4_init(img4_t *i4, img4_flags_t flags, const uint8_t *bytes, size_t len, * though there is no nonce in the environment. Therefore, any manifests which * have a BNCH property constraint will fail to validate. */ -#if !MACH_KERNEL_PRIVATE +#if !XNU_KERNEL_PRIVATE IMG4_API_AVAILABLE_20180112 OS_EXPORT OS_NONNULL1 OS_NONNULL2 void @@ -384,7 +407,7 @@ img4_set_nonce(img4_t *i4, const void *bytes, size_t len); * @discussion * See discussion for {@link img4_set_nonce}. */ -#if !MACH_KERNEL_PRIVATE +#if !XNU_KERNEL_PRIVATE IMG4_API_AVAILABLE_20181106 OS_EXPORT OS_NONNULL1 OS_NONNULL2 void @@ -446,7 +469,7 @@ img4_set_nonce_domain(img4_t *i4, const img4_nonce_domain_t *nd); * If any one of these validation checks fails, the payload is considered * untrustworthy and is not returned. */ -#if !MACH_KERNEL_PRIVATE +#if !XNU_KERNEL_PRIVATE IMG4_API_AVAILABLE_20180112 OS_EXPORT OS_WARN_RESULT OS_NONNULL1 OS_NONNULL3 OS_NONNULL4 OS_NONNULL5 errno_t @@ -475,10 +498,18 @@ img4_get_trusted_payload(img4_t *i4, img4_tag_t tag, * A pointer to the storage where the pointer to the payload buffer will be * written on success. * + * If the payload objects was initialized with + * {@link img4_payload_init_with_vnode_4xnu}, this parameter should be NULL, as + * there will be no in-memory buffer to return. + * * @param len * A pointer to the storage where the length of the payload buffer will be * written on success. * + * If the payload objects was initialized with + * {@link img4_payload_init_with_vnode_4xnu}, this parameter should be NULL, as + * there will be no in-memory buffer to return. + * * @result * Upon success, zero is returned. The implementation may also return one of the * following error codes directly: @@ -494,6 +525,18 @@ img4_get_trusted_payload(img4_t *i4, img4_tag_t tag, * [EILSEQ] The payload for the given tag does not match its description * in the manifest * [EIO] The payload could not be fetched + * [EIO] The payload was initialized with + * {@link img4_payload_init_with_vnode_4xnu}, and reading from + * the vnode stalled repeatedly beyond the implementation's + * tolerance + * + * If the payload was initialized with + * {@link img4_payload_init_with_vnode_4xnu}, any error returned by + * {@link vnode_getattr} or {@link vn_rdwr} may be returned. + * + * If the payload was initialized with + * {@link img4_payload_init_with_fd_4MSM}, any error returned by stat(2), + * read(2), or malloc(3) may be returned. * * Otherwise, an error from the underlying Image4 implementation will be * returned. @@ -502,10 +545,9 @@ img4_get_trusted_payload(img4_t *i4, img4_tag_t tag, * This routine performs the same validation steps as * {@link img4_get_trusted_payload}. */ -#if !MACH_KERNEL_PRIVATE +#if !XNU_KERNEL_PRIVATE IMG4_API_AVAILABLE_20180112 -OS_EXPORT OS_WARN_RESULT OS_NONNULL1 OS_NONNULL2 OS_NONNULL3 OS_NONNULL4 -OS_NONNULL5 +OS_EXPORT OS_WARN_RESULT OS_NONNULL1 OS_NONNULL2 OS_NONNULL3 errno_t img4_get_trusted_external_payload(img4_t *i4, img4_payload_t *payload, const img4_environment_t *env, const uint8_t **bytes, size_t *len); @@ -525,7 +567,7 @@ img4_get_trusted_external_payload(img4_t *i4, img4_payload_t *payload, * The destructor passed to {@link img4_init} is called as a result of this * routine, if any was set. */ -#if !MACH_KERNEL_PRIVATE +#if !XNU_KERNEL_PRIVATE IMG4_API_AVAILABLE_20180112 OS_EXPORT OS_NONNULL1 void diff --git a/EXTERNAL_HEADERS/img4/nonce.h b/EXTERNAL_HEADERS/img4/nonce.h index c9571f704..93c10c0c9 100644 --- a/EXTERNAL_HEADERS/img4/nonce.h +++ b/EXTERNAL_HEADERS/img4/nonce.h @@ -1,8 +1,8 @@ /*! * @header * Provides an interface for managing nonces to govern the lifetime of a - * personalization performed with Tatsu. A nonce managed by this interface may - * be used in a Tatsu signing request as the value for the BNCH tag. + * personalization performed with TSS. A nonce managed by this interface may + * be used in a TSS signing request as the value for the BNCH tag. * * These interfaces require the caller to possess the * @@ -49,6 +49,10 @@ #error "Please #include instead of this file directly" #endif // __IMG4_INDIRECT +#if IMG4_TAPI +#include "tapi.h" +#endif + /*! * @typedef img4_nonce_domain_t * An opaque type describing a nonce domain. @@ -116,7 +120,7 @@ typedef struct _img4_nonce { * * entitlement. */ -#if !MACH_KERNEL_PRIVATE +#if !XNU_KERNEL_PRIVATE IMG4_API_AVAILABLE_20181106 OS_EXPORT const struct _img4_nonce_domain _img4_nonce_domain_trust_cache; @@ -125,6 +129,42 @@ const struct _img4_nonce_domain _img4_nonce_domain_trust_cache; #define IMG4_NONCE_DOMAIN_TRUST_CACHE (img4if->i4if_v1.nonce_domain_trust_cache) #endif +/*! + * @const IMG4_NONCE_DOMAIN_PDI + * The nonce domain governing disk image personalizations. Use of this domain + * requires the + * + * com.apple.private.img4.nonce.pdi + * + * entitlement. The nonce for this domain is regenerated once every boot. + */ +#if !XNU_KERNEL_PRIVATE +IMG4_API_AVAILABLE_20181106 +OS_EXPORT +const struct _img4_nonce_domain _img4_nonce_domain_pdi; +#define IMG4_NONCE_DOMAIN_PDI (&_img4_nonce_domain_pdi) +#else +#define IMG4_NONCE_DOMAIN_PDI (img4if->i4if_v3.nonce_domain_pdi) +#endif + +/*! + * @const IMG4_NONCE_DOMAIN_CRYPTEX + * The nonce domain governing cryptex personalizations. Use of this domain + * requires the + * + * com.apple.private.img4.nonce.cryptex + * + * entitlement. + */ +#if !XNU_KERNEL_PRIVATE +IMG4_API_AVAILABLE_20181106 +OS_EXPORT +const struct _img4_nonce_domain _img4_nonce_domain_cryptex; +#define IMG4_NONCE_DOMAIN_CRYPTEX (&_img4_nonce_domain_cryptex) +#else +#define IMG4_NONCE_DOMAIN_CRYPTEX (img4if->i4if_v1.nonce_domain_cryptex) +#endif + /*! * @function img4_nonce_domain_copy_nonce * Copies the current value of the nonce in the given domain. @@ -146,7 +186,7 @@ const struct _img4_nonce_domain _img4_nonce_domain_trust_cache; * [EPERM] The caller lacked the entitlement necessary to read the * given nonce */ -#if !MACH_KERNEL_PRIVATE +#if !XNU_KERNEL_PRIVATE IMG4_API_AVAILABLE_20181106 OS_EXPORT OS_WARN_RESULT OS_NONNULL1 OS_NONNULL2 errno_t @@ -172,7 +212,7 @@ img4_nonce_domain_copy_nonce(const img4_nonce_domain_t *nd, img4_nonce_t *n); * [EPERM] The caller lacked the entitlement necessary to roll the * given nonce */ -#if !MACH_KERNEL_PRIVATE +#if !XNU_KERNEL_PRIVATE IMG4_API_AVAILABLE_20181106 OS_EXPORT OS_NONNULL1 errno_t diff --git a/EXTERNAL_HEADERS/img4/payload.h b/EXTERNAL_HEADERS/img4/payload.h index 4a1d119d3..8196742f0 100644 --- a/EXTERNAL_HEADERS/img4/payload.h +++ b/EXTERNAL_HEADERS/img4/payload.h @@ -17,6 +17,10 @@ #error "Please #include instead of this file directly" #endif // __IMG4_INDIRECT +#if IMG4_TAPI +#include "tapi.h" +#endif + /*! * @typedef img4_payload_flags_t * Flags modifying the behavior of an Image4 payload object. @@ -32,7 +36,7 @@ * describe portable executable files which must be fed directly to the firmware * and cannot tolerate being wrapped in an intermediary format. */ -OS_ENUM(img4_payload_flags, uint64_t, +OS_CLOSED_OPTIONS(img4_payload_flags, uint64_t, I4PLF_INIT = 0, I4PLF_UNWRAPPED = (1 << 0), ); @@ -67,7 +71,7 @@ OS_ENUM(img4_payload_flags, uint64_t, * [EFTYPE] The data does not contain an Image4 payload * [ENOENT] The bytes do not contain a payload for the specified tag */ -#if !MACH_KERNEL_PRIVATE +#if !XNU_KERNEL_PRIVATE IMG4_API_AVAILABLE_20180112 OS_EXPORT OS_WARN_RESULT OS_NONNULL1 OS_NONNULL4 errno_t @@ -78,6 +82,92 @@ img4_payload_init(img4_payload_t *i4p, img4_tag_t tag, #define img4_payload_init(...) img4if->i4if_payload_init(__VA_ARGS__) #endif +/*! + * @function img4_payload_init_with_vnode_4xnu + * Initializes an Image4 payload object from a vnode. + * + * @param i4p + * A pointer to the payload object to initialize. + * + * @param tag + * The expected tag for the payload. + * + * @param vn + * The vnode from which to initialize the payload. + * + * @param flags + * Flags modifying the behavior of the payload object. + * + * @result + * Upon success, zero is returned. Otherwise, one of the following error codes: + * + * [ENOENT] The vnode is either dead or in the process of being + * recycled + * [EIO] Reading from the vnode stalled repeatedly beyond the + * implementation's tolerance + * + * Additionally, the implementation may return any error that vnode_ref() may + * return. + * + * @discussion + * Verification of a vnode is performed by reading in chunks of data, updating + * an ongoing hash operation with that data, and then discarding it. Therefore, + * payload objects created in this manner can only guarantee their validity at + * the time the check was performed since the vnode's contents are not kept in + * memory and may be tampered with after validation has been performed. + * + * Additionally, this operation requires the payload to be unwrapped, as it does + * not parse or recognize any Image4 payload wrapper. Payloads created with this + * interface are therefore implicitly created with the {@link I4PLF_UNWRAPPED} + * flag. + */ + +#if KERNEL +#if !XNU_KERNEL_PRIVATE +IMG4_API_AVAILABLE_20180112 +OS_EXPORT OS_WARN_RESULT OS_NONNULL1 OS_NONNULL3 +errno_t +img4_payload_init_with_vnode_4xnu(img4_payload_t *i4p, img4_tag_t tag, + vnode_t vn, img4_payload_flags_t flags); +#else +#define img4_payload_init_with_vnode_4xnu(...) \ + (img4if->i4if_v2.payload_init_with_vnode_4xnu(__VA_ARGS__)) +#endif // !XNU_KERNEL_PRIVATE +#endif // KERNEL + +/*! + * @function img4_payload_init_with_fd_4MSM + * Initializes an Image4 payload object from a file descriptor. + * + * @param i4p + * A pointer to the payload object to initialize. + * + * @param tag + * The expected tag for the payload. + * + * @param fd + * The file descriptor from which to initialize the payload. + * + * @param flags + * Flags modifying the behavior of the payload object. + * + * @result + * Upon success, zero is returned. Otherwise, the implementation may return any + * errno that is set by the dup(2) system call. + * + * @discussion + * This interface is a userspace equivalent to + * {@link img4_payload_init_with_vnode_4xnu}, and all the same caveats apply. + */ + +#if !KERNEL +IMG4_API_AVAILABLE_20180112 +OS_EXPORT OS_WARN_RESULT OS_NONNULL1 +errno_t +img4_payload_init_with_fd_4MSM(img4_payload_t *i4p, img4_tag_t tag, + int fd, img4_payload_flags_t flags); +#endif // KERNEL + /*! * @function img4_payload_destroy * Disposes of the resources associated with the payload object. @@ -90,7 +180,7 @@ img4_payload_init(img4_payload_t *i4p, img4_tag_t tag, * only the associated resources. This routine will cause the destructor given * in {@link img4_payload_init} to be called, if any. */ -#if !MACH_KERNEL_PRIVATE +#if !XNU_KERNEL_PRIVATE IMG4_API_AVAILABLE_20180112 OS_EXPORT OS_NONNULL1 void diff --git a/EXTERNAL_HEADERS/mach-o/loader.h b/EXTERNAL_HEADERS/mach-o/loader.h index d6bc7e0cd..64f6bac73 100644 --- a/EXTERNAL_HEADERS/mach-o/loader.h +++ b/EXTERNAL_HEADERS/mach-o/loader.h @@ -210,6 +210,22 @@ struct mach_header_64 { #define MH_APP_EXTENSION_SAFE 0x02000000 /* The code was linked for use in an application extension. */ +#define MH_NLIST_OUTOFSYNC_WITH_DYLDINFO 0x04000000 /* The external symbols + listed in the nlist symbol table do + not include all the symbols listed in + the dyld info. */ + +#define MH_SIM_SUPPORT 0x08000000 /* Allow LC_MIN_VERSION_MACOS and + LC_BUILD_VERSION load commands with + the platforms macOS, iOSMac, + iOSSimulator, tvOSSimulator and + watchOSSimulator. */ + +#define MH_DYLIB_IN_CACHE 0x80000000 /* Only for use on dylibs. When this bit + is set, the dylib is part of the dyld + shared cache, rather than loose in + the filesystem. */ + /* * The load commands directly follow the mach_header. The total size of all * of the commands is given by the sizeofcmds field in the mach_header. All @@ -304,6 +320,8 @@ struct load_command { #define LC_VERSION_MIN_WATCHOS 0x30 /* build for Watch min OS version */ #define LC_NOTE 0x31 /* arbitrary data included within a Mach-O file */ #define LC_BUILD_VERSION 0x32 /* build for platform min OS version */ +#define LC_DYLD_EXPORTS_TRIE (0x33 | LC_REQ_DYLD) /* used with linkedit_data_command, payload is trie */ +#define LC_DYLD_CHAINED_FIXUPS (0x34 | LC_REQ_DYLD) /* used with linkedit_data_command */ /* * A variable length string in a load command is represented by an lc_str @@ -381,6 +399,9 @@ struct segment_command_64 { /* for 64-bit architectures */ first page of the segment is not protected. All other pages of the segment are protected. */ +#define SG_READ_ONLY 0x10 /* This segment is made read-only after fixups */ + + /* * A segment is made up of zero or more sections. Non-MH_OBJECT files have @@ -506,6 +527,8 @@ struct section_64 { /* for 64-bit architectures */ #define S_THREAD_LOCAL_INIT_FUNCTION_POINTERS 0x15 /* functions to call to initialize TLV values */ +#define S_INIT_FUNC_OFFSETS 0x16 /* 32-bit offsets to + initializers */ /* * Constants for the section attributes part of the flags field of a section @@ -767,14 +790,14 @@ struct dylinker_command { * Thread commands contain machine-specific data structures suitable for * use in the thread state primitives. The machine specific data structures * follow the struct thread_command as follows. - * Each flavor of machine specific data structure is preceded by an unsigned - * long constant for the flavor of that data structure, an uint32_t - * that is the count of longs of the size of the state data structure and then + * Each flavor of machine specific data structure is preceded by an uint32_t + * constant for the flavor of that data structure, an uint32_t that is the + * count of uint32_t's of the size of the state data structure and then * the state data structure follows. This triple may be repeated for many * flavors. The constants for the flavors, counts and state data structure * definitions are expected to be in the header file . * These machine specific data structures sizes must be multiples of - * 4 bytes The cmdsize reflects the total size of the thread_command + * 4 bytes. The cmdsize reflects the total size of the thread_command * and all of the sizes of the constants for the flavors, counts and state * data structures. * @@ -788,7 +811,7 @@ struct thread_command { uint32_t cmd; /* LC_THREAD or LC_UNIXTHREAD */ uint32_t cmdsize; /* total size of this command */ /* uint32_t flavor flavor of thread state */ - /* uint32_t count count of longs in thread state */ + /* uint32_t count count of uint32_t's in thread state */ /* struct XXX_thread_state state thread state for this flavor */ /* ... */ }; @@ -1164,8 +1187,10 @@ struct rpath_command { struct linkedit_data_command { uint32_t cmd; /* LC_CODE_SIGNATURE, LC_SEGMENT_SPLIT_INFO, LC_FUNCTION_STARTS, LC_DATA_IN_CODE, - LC_DYLIB_CODE_SIGN_DRS or - LC_LINKER_OPTIMIZATION_HINT. */ + LC_DYLIB_CODE_SIGN_DRS, + LC_LINKER_OPTIMIZATION_HINT, + LC_DYLD_EXPORTS_TRIE, or + LC_DYLD_CHAINED_FIXUPS. */ uint32_t cmdsize; /* sizeof(struct linkedit_data_command) */ uint32_t dataoff; /* file offset of data in __LINKEDIT segment */ uint32_t datasize; /* file size of data in __LINKEDIT segment */ @@ -1238,6 +1263,12 @@ struct build_tool_version { #define PLATFORM_IOS 2 #define PLATFORM_TVOS 3 #define PLATFORM_WATCHOS 4 +#define PLATFORM_BRIDGEOS 5 +#define PLATFORM_IOSMAC 6 +#define PLATFORM_IOSSIMULATOR 7 +#define PLATFORM_TVOSSIMULATOR 8 +#define PLATFORM_WATCHOSSIMULATOR 9 +#define PLATFORM_DRIVERKIT 10 /* Known values for the tool field above. */ #define TOOL_CLANG 1 @@ -1385,6 +1416,7 @@ struct dyld_info_command { #define BIND_SPECIAL_DYLIB_SELF 0 #define BIND_SPECIAL_DYLIB_MAIN_EXECUTABLE -1 #define BIND_SPECIAL_DYLIB_FLAT_LOOKUP -2 +#define BIND_SPECIAL_DYLIB_WEAK_LOOKUP -3 #define BIND_SYMBOL_FLAGS_WEAK_IMPORT 0x1 #define BIND_SYMBOL_FLAGS_NON_WEAK_DEFINITION 0x8 @@ -1404,6 +1436,9 @@ struct dyld_info_command { #define BIND_OPCODE_DO_BIND_ADD_ADDR_ULEB 0xA0 #define BIND_OPCODE_DO_BIND_ADD_ADDR_IMM_SCALED 0xB0 #define BIND_OPCODE_DO_BIND_ULEB_TIMES_SKIPPING_ULEB 0xC0 +#define BIND_OPCODE_THREADED 0xD0 +#define BIND_SUBOPCODE_THREADED_SET_BIND_ORDINAL_TABLE_SIZE_ULEB 0x00 +#define BIND_SUBOPCODE_THREADED_APPLY 0x01 /* @@ -1413,6 +1448,7 @@ struct dyld_info_command { #define EXPORT_SYMBOL_FLAGS_KIND_MASK 0x03 #define EXPORT_SYMBOL_FLAGS_KIND_REGULAR 0x00 #define EXPORT_SYMBOL_FLAGS_KIND_THREAD_LOCAL 0x01 +#define EXPORT_SYMBOL_FLAGS_KIND_ABSOLUTE 0x02 #define EXPORT_SYMBOL_FLAGS_WEAK_DEFINITION 0x04 #define EXPORT_SYMBOL_FLAGS_REEXPORT 0x08 #define EXPORT_SYMBOL_FLAGS_STUB_AND_RESOLVER 0x10 diff --git a/EXTERNAL_HEADERS/ptrauth.h b/EXTERNAL_HEADERS/ptrauth.h index b6db0fb14..e7d7dfd9b 100644 --- a/EXTERNAL_HEADERS/ptrauth.h +++ b/EXTERNAL_HEADERS/ptrauth.h @@ -152,6 +152,17 @@ typedef uintptr_t ptrauth_generic_signature_t; #define ptrauth_blend_discriminator(__pointer, __integer) \ __builtin_ptrauth_blend_discriminator(__pointer, __integer) +/* Compute the 16-bit integer discriminator of the given type. + + The argument must be a type. +*/ +#if __has_feature(ptrauth_type_discriminator) +#define ptrauth_type_discriminator(__type) \ + __builtin_ptrauth_type_discriminator(__type) +#else +#define ptrauth_type_discriminator(__type) ((uintptr_t)0) +#endif + /* Add a signature to the given pointer value using a specific key, using the given extra data as a salt to the signing process. @@ -308,6 +319,7 @@ typedef uintptr_t ptrauth_generic_signature_t; #define ptrauth_strip(__value, __key) __value #define ptrauth_blend_discriminator(__pointer, __integer) ((uintptr_t)0) +#define ptrauth_type_discriminator(__type) ((uintptr_t)0) #define ptrauth_sign_constant(__value, __key, __data) __value #define ptrauth_sign_unauthenticated(__value, __key, __data) __value #define ptrauth_auth_and_resign(__value, __old_key, __old_data, __new_key, __new_data) __value diff --git a/EXTERNAL_HEADERS/stdatomic.h b/EXTERNAL_HEADERS/stdatomic.h index 261c653af..2ce9fa540 100644 --- a/EXTERNAL_HEADERS/stdatomic.h +++ b/EXTERNAL_HEADERS/stdatomic.h @@ -44,16 +44,16 @@ extern "C" { /* 7.17.1 Introduction */ -#define ATOMIC_BOOL_LOCK_FREE __GCC_ATOMIC_BOOL_LOCK_FREE -#define ATOMIC_CHAR_LOCK_FREE __GCC_ATOMIC_CHAR_LOCK_FREE -#define ATOMIC_CHAR16_T_LOCK_FREE __GCC_ATOMIC_CHAR16_T_LOCK_FREE -#define ATOMIC_CHAR32_T_LOCK_FREE __GCC_ATOMIC_CHAR32_T_LOCK_FREE -#define ATOMIC_WCHAR_T_LOCK_FREE __GCC_ATOMIC_WCHAR_T_LOCK_FREE -#define ATOMIC_SHORT_T_LOCK_FREE __GCC_ATOMIC_SHORT_T_LOCK_FREE -#define ATOMIC_INT_T_LOCK_FREE __GCC_ATOMIC_INT_T_LOCK_FREE -#define ATOMIC_LONG_T_LOCK_FREE __GCC_ATOMIC_LONG_T_LOCK_FREE -#define ATOMIC_LLONG_T_LOCK_FREE __GCC_ATOMIC_LLONG_T_LOCK_FREE -#define ATOMIC_POINTER_T_LOCK_FREE __GCC_ATOMIC_POINTER_T_LOCK_FREE +#define ATOMIC_BOOL_LOCK_FREE __CLANG_ATOMIC_BOOL_LOCK_FREE +#define ATOMIC_CHAR_LOCK_FREE __CLANG_ATOMIC_CHAR_LOCK_FREE +#define ATOMIC_CHAR16_T_LOCK_FREE __CLANG_ATOMIC_CHAR16_T_LOCK_FREE +#define ATOMIC_CHAR32_T_LOCK_FREE __CLANG_ATOMIC_CHAR32_T_LOCK_FREE +#define ATOMIC_WCHAR_T_LOCK_FREE __CLANG_ATOMIC_WCHAR_T_LOCK_FREE +#define ATOMIC_SHORT_LOCK_FREE __CLANG_ATOMIC_SHORT_LOCK_FREE +#define ATOMIC_INT_LOCK_FREE __CLANG_ATOMIC_INT_LOCK_FREE +#define ATOMIC_LONG_LOCK_FREE __CLANG_ATOMIC_LONG_LOCK_FREE +#define ATOMIC_LLONG_LOCK_FREE __CLANG_ATOMIC_LLONG_LOCK_FREE +#define ATOMIC_POINTER_LOCK_FREE __CLANG_ATOMIC_POINTER_LOCK_FREE /* 7.17.2 Initialization */ diff --git a/EXTERNAL_HEADERS/stddef.h b/EXTERNAL_HEADERS/stddef.h index d9bb51e83..9678b998f 100644 --- a/EXTERNAL_HEADERS/stddef.h +++ b/EXTERNAL_HEADERS/stddef.h @@ -26,9 +26,21 @@ #ifndef __STDDEF_H #define __STDDEF_H +#undef NULL +#ifdef __cplusplus +#if __cplusplus >= 201103L +#define NULL nullptr +#else +#undef __null // VC++ hack. +#define NULL __null +#endif +#else +#define NULL ((void*)0) +#endif + #ifndef _PTRDIFF_T #define _PTRDIFF_T -typedef __typeof__(((int*)0)-((int*)0)) ptrdiff_t; +typedef __typeof__(((int*)NULL)-((int*)NULL)) ptrdiff_t; #endif #ifndef _SIZE_T #define _SIZE_T @@ -41,14 +53,6 @@ typedef __WCHAR_TYPE__ wchar_t; #endif #endif -#undef NULL -#ifdef __cplusplus -#undef __null // VC++ hack. -#define NULL __null -#else -#define NULL ((void*)0) -#endif - #ifndef offsetof #define offsetof(t, d) __builtin_offsetof(t, d) #endif diff --git a/EXTERNAL_HEADERS/sys/Makefile b/EXTERNAL_HEADERS/sys/Makefile new file mode 100644 index 000000000..978ac4176 --- /dev/null +++ b/EXTERNAL_HEADERS/sys/Makefile @@ -0,0 +1,13 @@ +export MakeInc_cmd=${SRCROOT}/makedefs/MakeInc.cmd +export MakeInc_def=${SRCROOT}/makedefs/MakeInc.def +export MakeInc_rule=${SRCROOT}/makedefs/MakeInc.rule +export MakeInc_dir=${SRCROOT}/makedefs/MakeInc.dir + +include $(MakeInc_cmd) +include $(MakeInc_def) + +INSTINC_SUBDIRS = \ + _pthread + +include $(MakeInc_rule) +include $(MakeInc_dir) diff --git a/EXTERNAL_HEADERS/sys/_pthread/Makefile b/EXTERNAL_HEADERS/sys/_pthread/Makefile new file mode 100644 index 000000000..0f815445b --- /dev/null +++ b/EXTERNAL_HEADERS/sys/_pthread/Makefile @@ -0,0 +1,21 @@ +export MakeInc_cmd=${SRCROOT}/makedefs/MakeInc.cmd +export MakeInc_def=${SRCROOT}/makedefs/MakeInc.def +export MakeInc_rule=${SRCROOT}/makedefs/MakeInc.rule +export MakeInc_dir=${SRCROOT}/makedefs/MakeInc.dir + +include $(MakeInc_cmd) +include $(MakeInc_def) + +EXPORT_FILES = \ + _pthread_types.h + +EXPORT_MI_LIST = ${EXPORT_FILES} + +EXPORT_MI_DIR = sys/_pthread + +INSTALL_KF_MI_LCL_LIST = $(empty) + +INSTALL_KF_MI_LIST = $(empty) + +include $(MakeInc_rule) +include $(MakeInc_dir) diff --git a/EXTERNAL_HEADERS/sys/_pthread/_pthread_types.h b/EXTERNAL_HEADERS/sys/_pthread/_pthread_types.h new file mode 100644 index 000000000..d9d51b89e --- /dev/null +++ b/EXTERNAL_HEADERS/sys/_pthread/_pthread_types.h @@ -0,0 +1,120 @@ +/* + * Copyright (c) 2003-2013 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#ifndef _SYS__PTHREAD_TYPES_H_ +#define _SYS__PTHREAD_TYPES_H_ + +#include + +// pthread opaque structures +#if defined(__LP64__) +#define __PTHREAD_SIZE__ 8176 +#define __PTHREAD_ATTR_SIZE__ 56 +#define __PTHREAD_MUTEXATTR_SIZE__ 8 +#define __PTHREAD_MUTEX_SIZE__ 56 +#define __PTHREAD_CONDATTR_SIZE__ 8 +#define __PTHREAD_COND_SIZE__ 40 +#define __PTHREAD_ONCE_SIZE__ 8 +#define __PTHREAD_RWLOCK_SIZE__ 192 +#define __PTHREAD_RWLOCKATTR_SIZE__ 16 +#else // !__LP64__ +#define __PTHREAD_SIZE__ 4088 +#define __PTHREAD_ATTR_SIZE__ 36 +#define __PTHREAD_MUTEXATTR_SIZE__ 8 +#define __PTHREAD_MUTEX_SIZE__ 40 +#define __PTHREAD_CONDATTR_SIZE__ 4 +#define __PTHREAD_COND_SIZE__ 24 +#define __PTHREAD_ONCE_SIZE__ 4 +#define __PTHREAD_RWLOCK_SIZE__ 124 +#define __PTHREAD_RWLOCKATTR_SIZE__ 12 +#endif // !__LP64__ + +struct __darwin_pthread_handler_rec { + void (*__routine)(void *); // Routine to call + void *__arg; // Argument to pass + struct __darwin_pthread_handler_rec *__next; +}; + +struct _opaque_pthread_attr_t { + long __sig; + char __opaque[__PTHREAD_ATTR_SIZE__]; +}; + +struct _opaque_pthread_cond_t { + long __sig; + char __opaque[__PTHREAD_COND_SIZE__]; +}; + +struct _opaque_pthread_condattr_t { + long __sig; + char __opaque[__PTHREAD_CONDATTR_SIZE__]; +}; + +struct _opaque_pthread_mutex_t { + long __sig; + char __opaque[__PTHREAD_MUTEX_SIZE__]; +}; + +struct _opaque_pthread_mutexattr_t { + long __sig; + char __opaque[__PTHREAD_MUTEXATTR_SIZE__]; +}; + +struct _opaque_pthread_once_t { + long __sig; + char __opaque[__PTHREAD_ONCE_SIZE__]; +}; + +struct _opaque_pthread_rwlock_t { + long __sig; + char __opaque[__PTHREAD_RWLOCK_SIZE__]; +}; + +struct _opaque_pthread_rwlockattr_t { + long __sig; + char __opaque[__PTHREAD_RWLOCKATTR_SIZE__]; +}; + +struct _opaque_pthread_t { + long __sig; + struct __darwin_pthread_handler_rec *__cleanup_stack; + char __opaque[__PTHREAD_SIZE__]; +}; + +typedef struct _opaque_pthread_attr_t __darwin_pthread_attr_t; +typedef struct _opaque_pthread_cond_t __darwin_pthread_cond_t; +typedef struct _opaque_pthread_condattr_t __darwin_pthread_condattr_t; +typedef unsigned long __darwin_pthread_key_t; +typedef struct _opaque_pthread_mutex_t __darwin_pthread_mutex_t; +typedef struct _opaque_pthread_mutexattr_t __darwin_pthread_mutexattr_t; +typedef struct _opaque_pthread_once_t __darwin_pthread_once_t; +typedef struct _opaque_pthread_rwlock_t __darwin_pthread_rwlock_t; +typedef struct _opaque_pthread_rwlockattr_t __darwin_pthread_rwlockattr_t; +typedef struct _opaque_pthread_t *__darwin_pthread_t; + +#endif // _SYS__PTHREAD_TYPES_H_ diff --git a/Makefile b/Makefile index 38cb74935..64822cdf0 100644 --- a/Makefile +++ b/Makefile @@ -17,6 +17,12 @@ endif ifndef SYMROOT export SYMROOT = $(SRCROOT)/BUILD/sym endif +ifndef MallocNanoZone +export MallocNanoZone := 1 +endif + +# Avoid make default rules, make becomes faster +MAKEFLAGS+=r export MakeInc_top=${VERSDIR}/makedefs/MakeInc.top export MakeInc_kernel=${VERSDIR}/makedefs/MakeInc.kernel @@ -32,8 +38,10 @@ export MakeInc_dir=${VERSDIR}/makedefs/MakeInc.dir ifeq ($(findstring Libsyscall,$(RC_ProjectName)),Libsyscall) -ifeq ($(RC_ProjectName),Libsyscall_headers_Sim) -TARGET=-target Libsyscall_headers_Sim +include $(MakeInc_cmd) + +ifneq ($(findstring Libsyscall_,$(RC_ProjectName)),) +TARGET=-target $(RC_ProjectName) endif default: install @@ -44,12 +52,17 @@ SDKROOT ?= macosx.internal installhdrs install: cd libsyscall ; \ xcodebuild $@ $(TARGET) \ + $(MAKEOVERRIDES) \ "SRCROOT=$(SRCROOT)/libsyscall" \ "OBJROOT=$(OBJROOT)" \ "SYMROOT=$(SYMROOT)" \ "DSTROOT=$(DSTROOT)" \ "SDKROOT=$(SDKROOT)" +Libsyscall_driverkit: install + +.PHONY: Libsyscall_driverkit + clean: installsrc: @@ -90,6 +103,7 @@ default: install installhdrs install: cd libkern/kmod ; \ xcodebuild $@ \ + $(MAKEOVERRIDES) \ "SRCROOT=$(SRCROOT)/libkern/kmod" \ "OBJROOT=$(OBJROOT)" \ "SYMROOT=$(SYMROOT)" \ @@ -105,11 +119,7 @@ else ifeq ($(RC_ProjectName),xnu_tests) export SYSCTL_HW_PHYSICALCPU := $(shell /usr/sbin/sysctl -n hw.physicalcpu) export SYSCTL_HW_LOGICALCPU := $(shell /usr/sbin/sysctl -n hw.logicalcpu) -ifeq ($(SYSCTL_HW_PHYSICALCPU),$(SYSCTL_HW_LOGICALCPU)) -MAKEJOBS := --jobs=$(shell expr $(SYSCTL_HW_PHYSICALCPU) + 1) -else -MAKEJOBS := --jobs=$(SYSCTL_HW_LOGICALCPU) -endif +MAKEJOBS := --jobs=$(shell expr $(SYSCTL_HW_LOGICALCPU) + 1) default: install @@ -142,11 +152,7 @@ endif # export SYSCTL_HW_PHYSICALCPU := $(shell /usr/sbin/sysctl -n hw.physicalcpu) export SYSCTL_HW_LOGICALCPU := $(shell /usr/sbin/sysctl -n hw.logicalcpu) -ifeq ($(SYSCTL_HW_PHYSICALCPU),$(SYSCTL_HW_LOGICALCPU)) -MAKEJOBS := --jobs=$(shell expr $(SYSCTL_HW_PHYSICALCPU) + 1) -else -MAKEJOBS := --jobs=$(SYSCTL_HW_LOGICALCPU) -endif +MAKEJOBS := --jobs=$(shell expr $(SYSCTL_HW_LOGICALCPU) + 1) TOP_TARGETS = \ clean \ @@ -220,7 +226,7 @@ EXPINC_SUBDIRS_X86_64H = $(EXPINC_SUBDIRS) EXPINC_SUBDIRS_ARM = $(EXPINC_SUBDIRS) EXPINC_SUBDIRS_ARM64 = $(EXPINC_SUBDIRS) -SETUP_SUBDIRS = SETUP osfmk san +SETUP_SUBDIRS = SETUP san bsd COMP_SUBDIRS_X86_64 = $(ALL_SUBDIRS) COMP_SUBDIRS_X86_64H = $(ALL_SUBDIRS) @@ -241,6 +247,7 @@ endif # all other RC_ProjectName installapi_libkdd installhdrs_libkdd install_libkdd: cd libkdd; \ xcodebuild -target Default $(subst _libkdd,,$@) \ + $(MAKEOVERRIDES) \ "SRCROOT=$(SRCROOT)/libkdd" \ "OBJROOT=$(OBJROOT)" \ "SYMROOT=$(SYMROOT)" \ @@ -251,6 +258,7 @@ installapi_libkdd installhdrs_libkdd install_libkdd: installapi_libkdd_tests installhdrs_libkdd_tests install_libkdd_tests: cd libkdd; \ xcodebuild -target tests $(subst _libkdd_tests,,$@) \ + $(MAKEOVERRIDES) \ "SRCROOT=$(SRCROOT)/libkdd" \ "OBJROOT=$(OBJROOT)" \ "SYMROOT=$(SYMROOT)" \ @@ -261,6 +269,7 @@ installapi_libkdd_tests installhdrs_libkdd_tests install_libkdd_tests: installapi_libkdd_host installhdrs_libkdd_host install_libkdd_host: cd libkdd; \ xcodebuild -configuration ReleaseHost -target kdd.framework $(subst _libkdd_host,,$@) \ + $(MAKEOVERRIDES) \ "SRCROOT=$(SRCROOT)/libkdd" \ "OBJROOT=$(OBJROOT)" \ "SYMROOT=$(SYMROOT)" \ diff --git a/README.md b/README.md index a65afae99..bb146bea3 100644 --- a/README.md +++ b/README.md @@ -103,7 +103,7 @@ kernel together into a single bootable image. To build a kernelcache you can use the following mechanisms: * Using automatic kernelcache generation with `kextd`. - The kextd daemon keeps watching for changing in `/System/Library/Extensions` directory. + The kextd daemon keeps watching for changing in `/System/Library/Extensions` directory. So you can setup new kernel as $ cp BUILD/obj/DEVELOPMENT/X86_64/kernel.development /System/Library/Kernels/ @@ -178,10 +178,12 @@ XNU installs header files at the following locations - a. $(DSTROOT)/System/Library/Frameworks/Kernel.framework/Headers b. $(DSTROOT)/System/Library/Frameworks/Kernel.framework/PrivateHeaders c. $(DSTROOT)/usr/include/ - d. $(DSTROOT)/System/Library/Frameworks/System.framework/PrivateHeaders + d. $(DSTROOT)/System/DriverKit/usr/include/ + e. $(DSTROOT)/System/Library/Frameworks/System.framework/PrivateHeaders `Kernel.framework` is used by kernel extensions.\ The `System.framework` and `/usr/include` are used by user level applications. \ +`/System/DriverKit/usr/include` is used by userspace drivers. \ The header files in framework's `PrivateHeaders` are only available for ** Apple Internal Development **. The directory containing the header file should have a Makefile that @@ -196,15 +198,18 @@ from each file list are - a. `DATAFILES` : To make header file available in user level - `$(DSTROOT)/usr/include` - b. `PRIVATE_DATAFILES` : To make header file available to Apple internal in + b. `DRIVERKIT_DATAFILES` : To make header file available to DriverKit userspace drivers - + `$(DSTROOT)/System/DriverKit/usr/include` + + c. `PRIVATE_DATAFILES` : To make header file available to Apple internal in user level - `$(DSTROOT)/System/Library/Frameworks/System.framework/PrivateHeaders` - c. `KERNELFILES` : To make header file available in kernel level - + d. `KERNELFILES` : To make header file available in kernel level - `$(DSTROOT)/System/Library/Frameworks/Kernel.framework/Headers` `$(DSTROOT)/System/Library/Frameworks/Kernel.framework/PrivateHeaders` - d. `PRIVATE_KERNELFILES` : To make header file available to Apple internal + e. `PRIVATE_KERNELFILES` : To make header file available to Apple internal for kernel extensions - `$(DSTROOT)/System/Library/Frameworks/Kernel.framework/PrivateHeaders` @@ -227,28 +232,35 @@ member file lists and their default location are described below - Definition - INSTALL_MI_LIST = ${DATAFILES} - b. `INSTALL_MI_LCL_LIST` : Installs header file to a location that is available + b. `INSTALL_DRIVERKIT_MI_LIST` : Installs header file to a location that is + available to DriverKit userspace drivers. + Locations - + $(DSTROOT)/System/DriverKit/usr/include + Definition - + INSTALL_DRIVERKIT_MI_LIST = ${DRIVERKIT_DATAFILES} + + c. `INSTALL_MI_LCL_LIST` : Installs header file to a location that is available for Apple internal in user level. Locations - $(DSTROOT)/System/Library/Frameworks/System.framework/PrivateHeaders Definition - INSTALL_MI_LCL_LIST = ${PRIVATE_DATAFILES} - c. `INSTALL_KF_MI_LIST` : Installs header file to location that is available + d. `INSTALL_KF_MI_LIST` : Installs header file to location that is available to everyone for kernel extensions. Locations - $(DSTROOT)/System/Library/Frameworks/Kernel.framework/Headers Definition - INSTALL_KF_MI_LIST = ${KERNELFILES} - d. `INSTALL_KF_MI_LCL_LIST` : Installs header file to location that is + e. `INSTALL_KF_MI_LCL_LIST` : Installs header file to location that is available for Apple internal for kernel extensions. Locations - $(DSTROOT)/System/Library/Frameworks/Kernel.framework/PrivateHeaders Definition - INSTALL_KF_MI_LCL_LIST = ${KERNELFILES} ${PRIVATE_KERNELFILES} - e. `EXPORT_MI_LIST` : Exports header file to all of xnu (bsd/, osfmk/, etc.) + f. `EXPORT_MI_LIST` : Exports header file to all of xnu (bsd/, osfmk/, etc.) for compilation only. Does not install anything into the SDK. Definition - EXPORT_MI_LIST = ${KERNELFILES} ${PRIVATE_KERNELFILES} @@ -291,6 +303,8 @@ want to export a function only to kernel level but not user level. $(DSTROOT)/System/Library/Frameworks/Kernel.framework/Headers $(DSTROOT)/System/Library/Frameworks/Kernel.framework/PrivateHeaders + g. `DRIVERKIT`: If defined, enclosed code is visible exclusively in the + DriverKit SDK headers used by userspace drivers. Conditional compilation ======================= @@ -317,8 +331,9 @@ does not define the platform macros from `TargetConditionals.h` (`TARGET_OS_OSX`, `TARGET_OS_IOS`, etc.). -There is a `TARGET_OS_EMBEDDED` macro, but this should be avoided as it is in -general too broad a definition for most functionality. +There is a deprecated `TARGET_OS_EMBEDDED` macro, but this should be avoided +as it is in general too broad a definition for most functionality. +Please refer to TargetConditionals.h for a full picture. How to add a new syscall ======================== @@ -375,7 +390,7 @@ common options. To debug a panic'ed kernel, use llvm debugger (lldb) along with unstripped symbol rich kernel binary. sh$ lldb kernel.development.unstripped - + And then you can connect to panic'ed machine with `kdp_remote [ip addr]` or `gdb_remote [hostip : port]` commands. Each kernel is packaged with kernel specific debug scripts as part of the build process. For security reasons these special commands diff --git a/SETUP/config/Makefile b/SETUP/config/Makefile index 56032b45d..fb79f3fcd 100644 --- a/SETUP/config/Makefile +++ b/SETUP/config/Makefile @@ -17,21 +17,21 @@ WARNFLAGS = -Wall LDFLAGS = -isysroot $(HOST_SDKROOT) -mmacosx-version-min=$(HOST_OS_VERSION) config: $(OBJS) - @echo "$(ColorH)HOST_LD$(Color0) $(ColorF)$@$(Color0)" + $(call makelog,$(ColorH)HOST_LD$(Color0) $(ColorF)$@$(Color0)) $(_v)$(HOST_CC) $(LDFLAGS) -o $@ $^ - @echo "$(ColorH)HOST_CODESIGN$(Color0) $(ColorF)$@$(Color0)" + $(call makelog,$(ColorH)HOST_CODESIGN$(Color0) $(ColorF)$@$(Color0)) $(_v)env CODESIGN_ALLOCATE=$(HOST_CODESIGN_ALLOCATE) $(HOST_CODESIGN) -s - $@ %.o: %.c - @echo "$(ColorH)HOST_CC$(Color0) $(ColorF)$@$(Color0)" + $(call makelog,$(ColorH)HOST_CC$(Color0) $(ColorF)$@$(Color0)) $(_v)$(HOST_CC) $(WARNFLAGS) $(CFLAGS) -c -o $@ $< parser.c: parser.y - @echo "$(ColorH)HOST_BISON$(Color0) $(ColorF)$@$(Color0)" + $(call makelog,$(ColorH)HOST_BISON$(Color0) $(ColorF)$@$(Color0)) $(_v)$(HOST_BISON) -y -d -d -o $@ $< lexer.yy.c: lexer.l - @echo "$(ColorH)HOST_FLEX$(Color0) $(ColorF)$@$(Color0)" + $(call makelog,$(ColorH)HOST_FLEX$(Color0) $(ColorF)$@$(Color0)) $(_v)env M4=$(HOST_GM4) $(HOST_FLEX) --header-file=lexer.yy.h -o $@ $< main.o mkheaders.o mkioconf.o mkmakefile.o lexer.yy.c: parser.c diff --git a/SETUP/config/mkmakefile.c b/SETUP/config/mkmakefile.c index a32236fd1..9614ba195 100644 --- a/SETUP/config/mkmakefile.c +++ b/SETUP/config/mkmakefile.c @@ -738,25 +738,14 @@ common: fprintf(f, "%s%.*s${%c_RULE_1B%s}%s\n", source_dir, (int)(tp - np), np, och_upper, extras, nl); - /* While we are still using CTF, any build that normally does not support CTF will - * a "standard" compile done as well that we can harvest CTF information from; do - * that here. - */ - fprintf(f, "\t${%c_CTFRULE_1A%s}", och_upper, extras); - if (ftp->f_extra) { - fprintf(f, "%s", ftp->f_extra); - } - fprintf(f, "%s%.*s${%c_CTFRULE_1B%s}%s\n", - source_dir, (int)(tp - np), np, och_upper, extras, nl); - fprintf(f, "\t${%c_RULE_2%s}%s\n", och_upper, extras, nl); - fprintf(f, "\t${%c_CTFRULE_2%s}%s\n", och_upper, extras, nl); fprintf(f, "\t${%c_RULE_3%s}%s\n", och_upper, extras, nl); - fprintf(f, "\t${%c_RULE_4A%s}", och_upper, extras); + fprintf(f, "\t$(if ${%c_RULE_4A%s},${%c_RULE_4A%s}", + och_upper, extras, och_upper, extras); if (ftp->f_extra) { fprintf(f, "%s", ftp->f_extra); } - fprintf(f, "%s%.*s${%c_RULE_4B%s}%s\n", + fprintf(f, "%s%.*s${%c_RULE_4B%s}%s)\n", source_dir, (int)(tp - np), np, och_upper, extras, nl); break; diff --git a/SETUP/decomment/Makefile b/SETUP/decomment/Makefile index 7018eb19e..a22212f6e 100644 --- a/SETUP/decomment/Makefile +++ b/SETUP/decomment/Makefile @@ -15,13 +15,13 @@ WARNFLAGS = -Wall LDFLAGS = -isysroot $(HOST_SDKROOT) -mmacosx-version-min=$(HOST_OS_VERSION) decomment: $(OBJS) - @echo "$(ColorH)HOST_LD$(Color0) $(ColorF)$@$(Color0)" + $(call makelog,$(ColorH)HOST_LD$(Color0) $(ColorF)$@$(Color0)) $(_v)$(HOST_CC) $(LDFLAGS) -o $@ $^ - @echo "$(ColorH)HOST_CODESIGN$(Color0) $(ColorF)$@$(Color0)" + $(call makelog,$(ColorH)HOST_CODESIGN$(Color0) $(ColorF)$@$(Color0)) $(_v)env CODESIGN_ALLOCATE=$(HOST_CODESIGN_ALLOCATE) $(HOST_CODESIGN) -s - $@ %.o: %.c - @echo "$(ColorH)HOST_CC$(Color0) $(ColorF)$@$(Color0)" + $(call makelog,$(ColorH)HOST_CC$(Color0) $(ColorF)$@$(Color0)) $(_v)$(HOST_CC) $(WARNFLAGS) $(CFLAGS) -c -o $@ $< do_build_setup:: decomment diff --git a/SETUP/installfile/Makefile b/SETUP/installfile/Makefile index eb1f3afbb..4ad7a7498 100644 --- a/SETUP/installfile/Makefile +++ b/SETUP/installfile/Makefile @@ -15,13 +15,13 @@ WARNFLAGS = -Wall LDFLAGS = -isysroot $(HOST_SDKROOT) -mmacosx-version-min=$(HOST_OS_VERSION) installfile: $(OBJS) - @echo "$(ColorH)HOST_LD$(Color0) $(ColorF)$@$(Color0)" + $(call makelog,$(ColorH)HOST_LD$(Color0) $(ColorF)$@$(Color0)) $(_v)$(HOST_CC) $(LDFLAGS) -o $@ $^ - @echo "$(ColorH)HOST_CODESIGN$(Color0) $(ColorF)$@$(Color0)" + $(call makelog,$(ColorH)HOST_CODESIGN$(Color0) $(ColorF)$@$(Color0)) $(_v)env CODESIGN_ALLOCATE=$(HOST_CODESIGN_ALLOCATE) $(HOST_CODESIGN) -s - $@ %.o: %.c - @echo "$(ColorH)HOST_CC$(Color0) $(ColorF)$@$(Color0)" + $(call makelog,$(ColorH)HOST_CC$(Color0) $(ColorF)$@$(Color0)) $(_v)$(HOST_CC) $(WARNFLAGS) $(CFLAGS) -c -o $@ $< do_build_setup:: installfile diff --git a/SETUP/json_compilation_db/Makefile b/SETUP/json_compilation_db/Makefile index 518644cb5..18af26bdd 100644 --- a/SETUP/json_compilation_db/Makefile +++ b/SETUP/json_compilation_db/Makefile @@ -15,13 +15,13 @@ WARNFLAGS = -Wall LDFLAGS = -isysroot $(HOST_SDKROOT) -mmacosx-version-min=$(HOST_OS_VERSION) json_compilation_db: $(OBJS) - @echo "$(ColorH)HOST_LD$(Color0) $(ColorF)$@$(Color0)" + $(call makelog,$(ColorH)HOST_LD$(Color0) $(ColorF)$@$(Color0)) $(_v)$(HOST_CC) $(LDFLAGS) -o $@ $^ - @echo "$(ColorH)HOST_CODESIGN$(Color0) $(ColorF)$@$(Color0)" + $(call makelog,$(ColorH)HOST_CODESIGN$(Color0) $(ColorF)$@$(Color0)) $(_v)env CODESIGN_ALLOCATE=$(HOST_CODESIGN_ALLOCATE) $(HOST_CODESIGN) -s - $@ %.o: %.c - @echo "$(ColorH)HOST_CC$(Color0) $(ColorF)$@$(Color0)" + $(call makelog,$(ColorH)HOST_CC$(Color0) $(ColorF)$@$(Color0)) $(_v)$(HOST_CC) $(WARNFLAGS) $(CFLAGS) -c -o $@ $< do_build_setup:: json_compilation_db diff --git a/SETUP/kextsymboltool/Makefile b/SETUP/kextsymboltool/Makefile index af6cdcafd..dde295bae 100644 --- a/SETUP/kextsymboltool/Makefile +++ b/SETUP/kextsymboltool/Makefile @@ -15,13 +15,13 @@ WARNFLAGS = -Wall LDFLAGS = -isysroot $(HOST_SDKROOT) -mmacosx-version-min=$(HOST_OS_VERSION) -lstdc++ kextsymboltool: $(OBJS) - @echo "$(ColorH)HOST_LD$(Color0) $(ColorF)$@$(Color0)" + $(call makelog,$(ColorH)HOST_LD$(Color0) $(ColorF)$@$(Color0)) $(_v)$(HOST_CC) $(LDFLAGS) -o $@ $^ - @echo "$(ColorH)HOST_CODESIGN$(Color0) $(ColorF)$@$(Color0)" + $(call makelog,$(ColorH)HOST_CODESIGN$(Color0) $(ColorF)$@$(Color0)) $(_v)env CODESIGN_ALLOCATE=$(HOST_CODESIGN_ALLOCATE) $(HOST_CODESIGN) -s - $@ %.o: %.c - @echo "$(ColorH)HOST_CC$(Color0) $(ColorF)$@$(Color0)" + $(call makelog,$(ColorH)HOST_CC$(Color0) $(ColorF)$@$(Color0)) $(_v)$(HOST_CC) $(WARNFLAGS) $(CFLAGS) -c -o $@ $< do_build_setup:: kextsymboltool diff --git a/SETUP/replacecontents/Makefile b/SETUP/replacecontents/Makefile index e1e84844e..45459e48b 100644 --- a/SETUP/replacecontents/Makefile +++ b/SETUP/replacecontents/Makefile @@ -15,13 +15,13 @@ WARNFLAGS = -Wall LDFLAGS = -isysroot $(HOST_SDKROOT) -mmacosx-version-min=$(HOST_OS_VERSION) replacecontents: $(OBJS) - @echo "$(ColorH)HOST_LD$(Color0) $(ColorF)$@$(Color0)" + $(call makelog,$(ColorH)HOST_LD$(Color0) $(ColorF)$@$(Color0)) $(_v)$(HOST_CC) $(LDFLAGS) -o $@ $^ - @echo "$(ColorH)HOST_CODESIGN$(Color0) $(ColorF)$@$(Color0)" + $(call makelog,$(ColorH)HOST_CODESIGN$(Color0) $(ColorF)$@$(Color0)) $(_v)env CODESIGN_ALLOCATE=$(HOST_CODESIGN_ALLOCATE) $(HOST_CODESIGN) -s - $@ %.o: %.c - @echo "$(ColorH)HOST_CC$(Color0) $(ColorF)$@$(Color0)" + $(call makelog,$(ColorH)HOST_CC$(Color0) $(ColorF)$@$(Color0)) $(_v)$(HOST_CC) $(WARNFLAGS) $(CFLAGS) -c -o $@ $< do_build_setup:: replacecontents diff --git a/SETUP/setsegname/Makefile b/SETUP/setsegname/Makefile index 7e9224ef0..70a55a7b5 100644 --- a/SETUP/setsegname/Makefile +++ b/SETUP/setsegname/Makefile @@ -15,13 +15,13 @@ WARNFLAGS = -Wall LDFLAGS = -isysroot $(HOST_SDKROOT) -mmacosx-version-min=$(HOST_OS_VERSION) setsegname: $(OBJS) - @echo "$(ColorH)HOST_LD$(Color0) $(ColorF)$@$(Color0)" + $(call makelog,$(ColorH)HOST_LD$(Color0) $(ColorF)$@$(Color0)) $(_v)$(HOST_CC) $(LDFLAGS) -o $@ $^ - @echo "$(ColorH)HOST_CODESIGN$(Color0) $(ColorF)$@$(Color0)" + $(call makelog,$(ColorH)HOST_CODESIGN$(Color0) $(ColorF)$@$(Color0)) $(_v)env CODESIGN_ALLOCATE=$(HOST_CODESIGN_ALLOCATE) $(HOST_CODESIGN) -s - $@ %.o: %.c - @echo "$(ColorH)HOST_CC$(Color0) $(ColorF)$@$(Color0)" + $(call makelog,$(ColorH)HOST_CC$(Color0) $(ColorF)$@$(Color0)) $(_v)$(HOST_CC) $(WARNFLAGS) $(CFLAGS) -c -o $@ $< do_build_setup:: setsegname diff --git a/bsd/Makefile b/bsd/Makefile index f79dc7046..22db7cb4d 100644 --- a/bsd/Makefile +++ b/bsd/Makefile @@ -6,6 +6,8 @@ export MakeInc_dir=${SRCROOT}/makedefs/MakeInc.dir include $(MakeInc_cmd) include $(MakeInc_def) +SETUP_SUBDIRS = sys + INSTINC_SUBDIRS = \ bsm \ crypto \ @@ -21,6 +23,7 @@ INSTINC_SUBDIRS = \ security \ pthread \ sys \ + sys_private \ uuid \ vfs @@ -52,6 +55,7 @@ EXPINC_SUBDIRS = \ security \ pthread \ sys \ + sys_private \ uuid \ vfs \ vm diff --git a/bsd/arm/exec.h b/bsd/arm/exec.h deleted file mode 100644 index ed29b140f..000000000 --- a/bsd/arm/exec.h +++ /dev/null @@ -1,96 +0,0 @@ -/* - * Copyright (c) 2000-2007 Apple Inc. All rights reserved. - */ -/*- - * Copyright (c) 1992, 1993 - * The Regents of the University of California. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * This product includes software developed by the University of - * California, Berkeley and its contributors. - * 4. Neither the name of the University nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * @(#)exec.h 8.1 (Berkeley) 6/11/93 - */ - -#ifndef _BSD_ARM_EXEC_H_ -#define _BSD_ARM_EXEC_H_ - - -#ifdef BSD_KERNEL_PRIVATE -/* Size of a page in an object file. */ -#define __LDPGSZ 4096 - -/* Valid magic number check. */ -#define N_BADMAG(ex) \ - ((ex).a_magic != NMAGIC && (ex).a_magic != OMAGIC && \ - (ex).a_magic != ZMAGIC) - -/* Address of the bottom of the text segment. */ -#define N_TXTADDR(X) 0 - -/* Address of the bottom of the data segment. */ -#define N_DATADDR(ex) \ - (N_TXTADDR(ex) + ((ex).a_magic == OMAGIC ? (ex).a_text \ - : __LDPGSZ + ((ex).a_text - 1 & ~(__LDPGSZ - 1)))) - -/* Text segment offset. */ -#define N_TXTOFF(ex) \ - ((ex).a_magic == ZMAGIC ? __LDPGSZ : sizeof(struct exec)) - -/* Data segment offset. */ -#define N_DATOFF(ex) \ - (N_TXTOFF(ex) + ((ex).a_magic != ZMAGIC ? (ex).a_text : \ - __LDPGSZ + ((ex).a_text - 1 & ~(__LDPGSZ - 1)))) - -/* Symbol table offset. */ -#define N_SYMOFF(ex) \ - (N_TXTOFF(ex) + (ex).a_text + (ex).a_data + (ex).a_trsize + \ - (ex).a_drsize) - -/* String table offset. */ -#define N_STROFF(ex) (N_SYMOFF(ex) + (ex).a_syms) - -/* Description of the object file header (a.out format). */ -struct exec { -#define OMAGIC 0407 /* old impure format */ -#define NMAGIC 0410 /* read-only text */ -#define ZMAGIC 0413 /* demand load format */ -#define QMAGIC 0314 /* demand load format. Header in text. */ - unsigned int a_magic; /* magic number */ - - unsigned int a_text; /* text segment size */ - unsigned int a_data; /* initialized data size */ - unsigned int a_bss; /* uninitialized data size */ - unsigned int a_syms; /* symbol table size */ - unsigned int a_entry; /* entry point */ - unsigned int a_trsize; /* text relocation size */ - unsigned int a_drsize; /* data relocation size */ -}; - -#endif /* BSD_KERNEL_PRIVATE */ - -#endif /* _BSD_ARM_EXEC_H_ */ diff --git a/bsd/arm/fasttrap_isa.h b/bsd/arm/fasttrap_isa.h index 823ecc583..69a777f41 100644 --- a/bsd/arm/fasttrap_isa.h +++ b/bsd/arm/fasttrap_isa.h @@ -30,8 +30,6 @@ #ifndef _FASTTRAP_ISA_H #define _FASTTRAP_ISA_H -/* #pragma ident "@(#)fasttrap_isa.h 1.4 05/06/08 SMI" */ - #include #include diff --git a/bsd/arm/reboot.h b/bsd/arm/reboot.h deleted file mode 100644 index 0bb3b5aec..000000000 --- a/bsd/arm/reboot.h +++ /dev/null @@ -1,32 +0,0 @@ -/* - * Copyright (c) 2000-2007 Apple Inc. All rights reserved. - */ - -#ifndef _BSD_ARM_REBOOT_H_ -#define _BSD_ARM_REBOOT_H_ - -/* - * Empty file (publicly) - */ - -#include - -#ifdef BSD_KERNEL_PRIVATE - -/* - * Use most significant 16 bits to avoid collisions with - * machine independent flags. - */ -#define RB_POWERDOWN 0x00010000 /* power down on halt */ -#define RB_NOBOOTRC 0x00020000 /* don't run '/etc/rc.boot' */ -#define RB_DEBUG 0x00040000 /* drop into mini monitor on panic */ -#define RB_EJECT 0x00080000 /* eject disks on halt */ -#define RB_COMMAND 0x00100000 /* new boot command specified */ -#define RB_NOFP 0x00200000 /* don't use floating point */ -#define RB_BOOTNEXT 0x00400000 /* reboot into NeXT */ -#define RB_BOOTDOS 0x00800000 /* reboot into DOS */ -#define RB_PRETTY 0x01000000 /* shutdown with pretty graphics */ - -#endif /* BSD_KERNEL_PRIVATE */ - -#endif /* _BSD_ARM_REBOOT_H_ */ diff --git a/bsd/bsm/audit_fcntl.h b/bsd/bsm/audit_fcntl.h index 20b73988a..b23d91bc9 100644 --- a/bsd/bsm/audit_fcntl.h +++ b/bsd/bsm/audit_fcntl.h @@ -115,7 +115,8 @@ #define BSM_F_MARKDEPENDENCY 360 /* Darwin-specific. */ #define BSM_F_BARRIERFSYNC 361 /* Darwin-specific. */ #define BSM_F_PUNCHHOLE 362 /* Darwin-specific. */ -#define BSM_F_TRIM_ACTIVE_FILE 363 /* Darwin-specific. */ +#define BSM_F_TRIM_ACTIVE_FILE 363 /* Darwin-specific. */ +#define BSM_F_SPECULATIVE_READ 364 /* Darwin-specific. */ /* * Darwin file system specific (400-499). diff --git a/bsd/bsm/audit_kevents.h b/bsd/bsm/audit_kevents.h index 3f4ddea63..37dc16b53 100644 --- a/bsd/bsm/audit_kevents.h +++ b/bsd/bsm/audit_kevents.h @@ -447,7 +447,7 @@ #define AUE_PIDFORTASK 43049 /* Darwin-specific. */ #define AUE_SYSCTL_NONADMIN 43050 #define AUE_COPYFILE 43051 /* Darwin-specific. */ - +#define AUE_DBGPORTFORPID 43052 /* Darwin-specific. */ /* * Events added to OpenBSM for FreeBSD and Linux; may also be used by Darwin * in the future. @@ -614,6 +614,7 @@ #define AUE_FCLONEFILEAT 43211 /* Darwin. */ #define AUE_SETATTRLISTAT 43212 /* Darwin. */ #define AUE_FMOUNT 43213 /* Darwin. */ +#define AUE_FSGETPATH_EXTENDED 43214 /* Darwin. */ #define AUE_SESSION_START 44901 /* Darwin. */ #define AUE_SESSION_UPDATE 44902 /* Darwin. */ diff --git a/bsd/conf/Makefile b/bsd/conf/Makefile index 7bd79d9ae..05c4b79cf 100644 --- a/bsd/conf/Makefile +++ b/bsd/conf/Makefile @@ -23,7 +23,7 @@ endif $(TARGET)/$(CURRENT_KERNEL_CONFIG)/Makefile: $(SRCROOT)/SETUP/config/doconf $(OBJROOT)/SETUP/config $(DOCONFDEPS) $(_v)$(MKDIR) $(TARGET)/$(CURRENT_KERNEL_CONFIG) - $(_v)$(SRCROOT)/SETUP/config/doconf -c -cpu $(DOCONF_ARCH_CONFIG_LC) -soc $(CURRENT_MACHINE_CONFIG_LC) -d $(TARGET)/$(CURRENT_KERNEL_CONFIG) -s $(SOURCE) -m $(MASTERCONFDIR) $(CURRENT_KERNEL_CONFIG); + $(_v)$(SRCROOT)/SETUP/config/doconf -c -cpu $(DOCONF_ARCH_CONFIG_LC) -soc $(CURRENT_MACHINE_CONFIG_LC) -d $(TARGET)/$(CURRENT_KERNEL_CONFIG) -s $(SOURCE) -m $(MASTERCONFDIR) $(CURRENT_KERNEL_CONFIG) do_all: $(TARGET)/$(CURRENT_KERNEL_CONFIG)/Makefile $(_v)${MAKE} \ @@ -35,7 +35,7 @@ do_all: $(TARGET)/$(CURRENT_KERNEL_CONFIG)/Makefile SOURCE=$(subst conf/,,$(SOURCE)) \ TARGET=${TARGET} \ OBJPATH=${OBJPATH} \ - build_all; + build_all do_build_all:: do_all diff --git a/bsd/conf/Makefile.template b/bsd/conf/Makefile.template index c38c2ffb6..fa831c803 100644 --- a/bsd/conf/Makefile.template +++ b/bsd/conf/Makefile.template @@ -127,8 +127,6 @@ OBJS_NO_SIGN_COMPARE = \ in6_ifattach.o \ ip6_input.o \ ip6_output.o \ - ipcomp_input.o \ - ipcomp_output.o \ in6_proto.o \ mld6.o \ nd6.o \ @@ -250,6 +248,7 @@ OBJS_NO_PACKED_ADDRESS = \ nd6_prproxy.o \ nd6_rtr.o \ necp.o \ + packet_mangler.o \ pf.o \ pf_norm.o \ pktap.o \ @@ -258,7 +257,8 @@ OBJS_NO_PACKED_ADDRESS = \ tcp_subr.o \ udp6_output.o \ udp6_usrreq.o \ - udp_usrreq.o + udp_usrreq.o \ + sixxlowpan.o $(foreach file,$(OBJS_NO_PACKED_ADDRESS),$(eval $(call add_perfile_cflags,$(file),-Wno-address-of-packed-member))) @@ -284,27 +284,27 @@ $(SOBJS): .SFLAGS $(_v)$(REPLACECONTENTS) $@ $(S_KCC) $(SFLAGS) $(INCFLAGS) $(COMPONENT).filelist: $(OBJS) - @echo "$(ColorL)LDFILELIST$(Color0) $(ColorLF)$(COMPONENT)$(Color0)" + $(call makelog,$(ColorL)LDFILELIST$(Color0) $(ColorLF)$(COMPONENT)$(Color0)) $(_v)for obj in ${OBJS}; do \ - echo $(TARGET)/$(CURRENT_KERNEL_CONFIG)/$${obj}; \ + $(ECHO) $(TARGET)/$(CURRENT_KERNEL_CONFIG)/$${obj}; \ done > $(COMPONENT).filelist MAKESYSCALLS = $(SRCROOT)/bsd/kern/makesyscalls.sh init_sysent.c: $(SRCROOT)/bsd/kern/syscalls.master $(MAKESYSCALLS) - @echo "[$(CMD_MC)] $(ColorH)GENERATING$(Color0) $(ColorLF)$@$(Color0) from $(ColorF)$<$(Color0)"; + $(call makelog,[$(CMD_MC)] $(ColorH)GENERATING$(Color0) $(ColorLF)$@$(Color0) from $(ColorF)$<$(Color0)) $(_v)$(MAKESYSCALLS) $< table > /dev/null syscalls.c: $(SRCROOT)/bsd/kern/syscalls.master $(MAKESYSCALLS) - @echo "[$(CMD_MC)] $(ColorH)GENERATING$(Color0) $(ColorLF)$@$(Color0) from $(ColorF)$<$(Color0)"; + $(call makelog,[$(CMD_MC)] $(ColorH)GENERATING$(Color0) $(ColorLF)$@$(Color0) from $(ColorF)$<$(Color0)) $(_v)$(MAKESYSCALLS) $< names > /dev/null audit_kevents.c: $(SRCROOT)/bsd/kern/syscalls.master $(MAKESYSCALLS) - @echo "[$(CMD_MC)] $(ColorH)GENERATING$(Color0) $(ColorLF)$@$(Color0) from $(ColorF)$<$(Color0)"; + $(call makelog,[$(CMD_MC)] $(ColorH)GENERATING$(Color0) $(ColorLF)$@$(Color0) from $(ColorF)$<$(Color0)) $(_v)$(MAKESYSCALLS) $< audit > /dev/null systrace_args.c: $(SRCROOT)/bsd/kern/syscalls.master $(MAKESYSCALLS) - @echo "[$(CMD_MC)] $(ColorH)GENERATING$(Color0) $(ColorLF)$@$(Color0) from $(ColorF)$<$(Color0)"; + $(call makelog,[$(CMD_MC)] $(ColorH)GENERATING$(Color0) $(ColorLF)$@$(Color0) from $(ColorF)$<$(Color0)) $(_v)$(MAKESYSCALLS) $< systrace > /dev/null do_all: $(COMPONENT).filelist diff --git a/bsd/conf/files b/bsd/conf/files index e5a34f794..99c4f51bc 100644 --- a/bsd/conf/files +++ b/bsd/conf/files @@ -28,10 +28,11 @@ OPTIONS/mach_vm_debug optional mach_vm_debug OPTIONS/mach_xp optional mach_xp OPTIONS/mach_xp_fpd optional mach_xp_fpd OPTIONS/quota optional quota -OPTIONS/xpr_debug optional xpr_debug OPTIONS/kdebug optional kdebug OPTIONS/nfsclient optional nfsclient OPTIONS/nfsserver optional nfsserver +OPTIONS/config_nfs4 optional config_nfs4 +OPTIONS/config_triggers optional config_triggers OPTIONS/kernremote optional kernremote OPTIONS/compat_43 optional compat_43 OPTIONS/diagnostic optional diagnostic @@ -58,8 +59,10 @@ OPTIONS/inet6 optional inet6 OPTIONS/ipv6send optional ipv6send OPTIONS/ether optional ether OPTIONS/vlan optional vlan +OPTIONS/sixlowpan optional sixlowpan OPTIONS/bond optional bond OPTIONS/if_fake optional if_fake +OPTIONS/if_headless optional if_headless OPTIONS/bpfilter optional bpfilter OPTIONS/multipath optional multipath OPTIONS/mptcp optional mptcp @@ -72,6 +75,7 @@ OPTIONS/sendfile optional sendfile OPTIONS/pf optional pf OPTIONS/pflog optional pflog pf OPTIONS/zlib optional zlib +OPTIONS/sixlowpan optional sixlowpan # @@ -87,39 +91,49 @@ OPTIONS/fs_compression optional fs_compression OPTIONS/config_imageboot optional config_imageboot bsd/nfs/nfs4_subs.c optional nfsclient +bsd/nfs/nfs4_subs.c optional config_nfs4 bsd/nfs/nfs4_vnops.c optional nfsclient +bsd/nfs/nfs4_vnops.c optional config_nfs4 bsd/nfs/krpc_subr.c optional nfsclient bsd/nfs/nfs_bio.c optional nfsclient -bsd/nfs/nfs_boot.c optional nfsclient -bsd/nfs/nfs_gss.c optional nfsclient -bsd/nfs/nfs_gss.c optional nfsserver +bsd/nfs/nfs_bio.c optional config_nfs4 +bsd/nfs/nfs_boot.c optional config_netboot +bsd/nfs/nfs_gss.c optional config_nfs_gss bsd/nfs/nfs_lock.c optional nfsclient bsd/nfs/nfs_node.c optional nfsclient +bsd/nfs/nfs_node.c optional config_nfs4 bsd/nfs/nfs_serv.c optional nfsserver bsd/nfs/nfs_socket.c optional nfsclient bsd/nfs/nfs_socket.c optional nfsserver +bsd/nfs/nfs_socket.c optional config_nfs4 bsd/nfs/nfs_srvcache.c optional nfsserver bsd/nfs/nfs_subs.c optional nfsclient bsd/nfs/nfs_subs.c optional nfsserver +bsd/nfs/nfs_subs.c optional config_nfs4 bsd/nfs/nfs_syscalls.c optional nfsclient bsd/nfs/nfs_syscalls.c optional nfsserver +bsd/nfs/nfs_syscalls.c optional config_nfs4 bsd/nfs/nfs_vfsops.c optional nfsclient +bsd/nfs/nfs_vfsops.c optional config_nfs4 bsd/nfs/nfs_vnops.c optional nfsclient +bsd/nfs/nfs_vnops.c optional config_nfs4 bsd/nfs/nfs_upcall.c optional nfsserver -bsd/nfs/gss/gss_krb5_mech.c optional nfsclient -bsd/nfs/gss/gss_krb5_mech.c optional nfsserver -bsd/nfs/gss/ccrypto.c optional nfsclient -bsd/nfs/gss/ccrypto.c optional nfsserver -bsd/kern/netboot.c optional nfsclient +bsd/nfs/gss/gss_krb5_mech.c optional config_nfs_gss +bsd/nfs/gss/ccrypto.c optional config_nfs_gss +bsd/kern/netboot.c optional config_netboot + +# NFS v4 is on for macOS builds bsd/dev/dtrace/dtrace.c optional config_dtrace bsd/dev/dtrace/lockprof.c optional config_dtrace bsd/dev/dtrace/lockstat.c optional config_dtrace bsd/dev/dtrace/dtrace_ptss.c optional config_dtrace bsd/dev/dtrace/dtrace_subr.c optional config_dtrace -bsd/dev/dtrace/dtrace_glue.c standard +bsd/dev/dtrace/dtrace_glue.c optional config_dtrace +bsd/dev/dtrace/dtrace_xoroshiro128_plus.c optional config_dtrace bsd/dev/dtrace/blist.c optional config_dtrace bsd/dev/dtrace/fbt.c optional config_dtrace +bsd/dev/dtrace/fbt_blacklist.c optional config_dtrace bsd/dev/dtrace/sdt.c optional config_dtrace bsd/dev/dtrace/sdt_subr.c optional config_dtrace bsd/dev/dtrace/systrace.c optional config_dtrace @@ -154,6 +168,7 @@ bsd/vfs/vfs_bio.c standard bsd/vfs/vfs_cache.c standard bsd/vfs/vfs_cluster.c standard bsd/vfs/vfs_conf.c standard +bsd/vfs/vfs_conf.c optional config_nfs4 bsd/vfs/vfs_fslog.c standard bsd/vfs/vfs_init.c standard bsd/vfs/vfs_lookup.c standard @@ -203,6 +218,8 @@ bsd/net/if_loop.c optional loop bsd/net/if_mib.c optional networking bsd/net/if_vlan.c optional vlan bsd/net/if_fake.c optional if_fake +bsd/net/if_headless.c optional if_headless +bsd/net/if_6lowpan.c optional sixlowpan bsd/net/multicast_list.c optional networking bsd/net/if_bond.c optional bond bsd/net/devtimer.c optional bond @@ -244,10 +261,12 @@ bsd/net/if_llreach.c optional networking bsd/net/flowhash.c optional networking bsd/net/flowadv.c optional networking bsd/net/content_filter.c optional content_filter +bsd/net/content_filter_crypto.c optional content_filter bsd/net/packet_mangler.c optional packet_mangler bsd/net/if_llatbl.c optional networking bsd/net/nwk_wq.c optional networking bsd/net/skmem_sysctl.c optional skywalk +bsd/net/restricted_in_port.c optional networking bsd/net/classq/classq.c optional networking bsd/net/classq/classq_sfb.c optional networking @@ -259,6 +278,7 @@ bsd/net/pktsched/pktsched.c optional networking bsd/net/pktsched/pktsched_qfq.c optional networking bsd/net/pktsched/pktsched_tcq.c optional networking bsd/net/pktsched/pktsched_fq_codel.c optional networking +bsd/net/pktsched/pktsched_netem.c optional networking bsd/netinet/cpu_in_cksum_gen.c standard bsd/netinet/in_cksum.c optional inet @@ -293,6 +313,7 @@ bsd/netinet/tcp_cubic.c optional inet bsd/netinet/cbrtf.c optional inet bsd/netinet/tcp_lro.c optional inet bsd/netinet/tcp_ledbat.c optional inet +bsd/netinet/tcp_log.c optional inet bsd/netinet/udp_usrreq.c optional inet bsd/netinet/in_gif.c optional gif inet bsd/netinet/ip_ecn.c optional inet @@ -327,9 +348,6 @@ bsd/netinet6/in6_ifattach.c optional inet6 bsd/netinet6/ip6_input.c optional inet6 bsd/netinet6/ip6_output.c optional inet6 bsd/netinet6/in6_src.c optional inet6 -bsd/netinet6/ipcomp_core.c optional ipsec -bsd/netinet6/ipcomp_input.c optional ipsec -bsd/netinet6/ipcomp_output.c optional ipsec bsd/netinet6/in6_mcast.c optional inet6 bsd/netinet6/in6_pcb.c optional inet6 bsd/netinet6/in6_proto.c optional inet6 @@ -347,11 +365,17 @@ bsd/netinet6/udp6_output.c optional inet6 bsd/netinet6/udp6_usrreq.c optional inet6 bsd/netinet6/ip6_id.c optional inet6 +bsd/net/sixxlowpan.c optional sixlowpan +bsd/net/frame802154.c optional sixlowpan +bsd/net/linkaddr.c optional sixlowpan + bsd/netkey/key.c optional ipsec bsd/netkey/key_debug.c optional ipsec bsd/netkey/keysock.c optional ipsec bsd/netkey/keydb.c optional ipsec +bsd/net/multi_layer_pkt_log.c optional inet inet6 ipsec ipsec_esp + bsd/crypto/rc4/rc4.c optional crypto #bsd/netpm/pm_aTT.c optional pm @@ -424,6 +448,8 @@ bsd/kern/kern_synch.c standard bsd/kern/kern_sysctl.c standard bsd/kern/kern_newsysctl.c standard bsd/kern/kern_memorystatus.c optional config_memorystatus +bsd/kern/kern_memorystatus_freeze.c optional config_memorystatus +bsd/kern/kern_memorystatus_notify.c optional config_memorystatus bsd/kern/kern_mib.c standard bsd/kern/kpi_mbuf.c optional sockets bsd/kern/kern_sfi.c standard @@ -434,7 +460,6 @@ bsd/kern/mcache.c optional sockets bsd/kern/stackshot.c standard bsd/kern/subr_log.c standard bsd/kern/subr_prf.c standard -bsd/kern/subr_prof.c standard bsd/kern/subr_sbuf.c standard bsd/kern/subr_xxx.c standard bsd/kern/sys_generic.c standard @@ -495,6 +520,7 @@ bsd/conf/param.c standard ./ioconf.c standard bsd/kern/imageboot.c optional config_imageboot +bsd/kern/chunklist.c optional config_imageboot_chunklist osfmk/kperf/kperfbsd.c optional kperf bsd/kern/kern_kpc.c optional kpc @@ -509,6 +535,7 @@ bsd/miscfs/nullfs/null_vfsops.c optional nullfs bsd/miscfs/nullfs/null_vnops.c optional nullfs bsd/tests/bsd_tests.c optional config_xnupost +bsd/tests/copyio_tests.c optional config_xnupost bsd/tests/pmap_test_sysctl.c optional config_xnupost bsd/net/skywalk_stubs.c standard diff --git a/bsd/conf/param.c b/bsd/conf/param.c index 401b05a3d..687822177 100644 --- a/bsd/conf/param.c +++ b/bsd/conf/param.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2008 Apple Inc. All rights reserved. + * Copyright (c) 2000-2019 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -80,13 +80,13 @@ #include #include -struct timezone tz = { 0, 0 }; +struct timezone tz = { .tz_minuteswest = 0, .tz_dsttime = 0 }; #if CONFIG_EMBEDDED #define NPROC 1000 /* Account for TOTAL_CORPSES_ALLOWED by making this slightly lower than we can. */ #define NPROC_PER_UID 950 #else -#define NPROC (20 + 16 * 32) +#define NPROC (20 + 32 * 32) #define NPROC_PER_UID (NPROC/2) #endif diff --git a/bsd/dev/arm/conf.c b/bsd/dev/arm/conf.c index 0e0e8a435..8925f9070 100644 --- a/bsd/dev/arm/conf.c +++ b/bsd/dev/arm/conf.c @@ -215,12 +215,7 @@ struct cdevsw cdevsw[] = { kmioctl, nullstop, nullreset, km_tty, ttselect, eno_mmap, eno_strat, eno_getc, eno_putc, 0 }, - [13 ... 41] = NO_CDEVICE, - [42] = { - volopen, volclose, eno_rdwrt, eno_rdwrt, - volioctl, eno_stop, eno_reset, 0, (select_fcn_t *) seltrue, - eno_mmap, eno_strat, eno_getc, eno_putc, 0 - } + [13 ... 42] = NO_CDEVICE, }; const int nchrdev = sizeof(cdevsw) / sizeof(cdevsw[0]); @@ -237,7 +232,7 @@ isdisk(dev_t dev, int type) switch (type) { case VCHR: - maj = chrtoblk(maj); + maj = chrtoblk(dev); if (maj == NODEV) { break; } @@ -251,32 +246,7 @@ isdisk(dev_t dev, int type) return 0; } -static int chrtoblktab[] = { - /* CHR *//* BLK *//* CHR *//* BLK */ - /* 0 */ NODEV, /* 1 */ NODEV, - /* 2 */ NODEV, /* 3 */ NODEV, - /* 4 */ NODEV, /* 5 */ NODEV, - /* 6 */ NODEV, /* 7 */ NODEV, - /* 8 */ NODEV, /* 9 */ NODEV, - /* 10 */ NODEV, /* 11 */ NODEV, - /* 12 */ NODEV, /* 13 */ NODEV, - /* 14 */ NODEV, /* 15 */ NODEV, - /* 16 */ NODEV, /* 17 */ NODEV, - /* 18 */ NODEV, /* 19 */ NODEV, - /* 20 */ NODEV, /* 21 */ NODEV, - /* 22 */ NODEV, /* 23 */ NODEV, - /* 24 */ NODEV, /* 25 */ NODEV, - /* 26 */ NODEV, /* 27 */ NODEV, - /* 28 */ NODEV, /* 29 */ NODEV, - /* 30 */ NODEV, /* 31 */ NODEV, - /* 32 */ NODEV, /* 33 */ NODEV, - /* 34 */ NODEV, /* 35 */ NODEV, - /* 36 */ NODEV, /* 37 */ NODEV, - /* 38 */ NODEV, /* 39 */ NODEV, - /* 40 */ NODEV, /* 41 */ NODEV, - /* 42 */ NODEV, /* 43 */ NODEV, - /* 44 */ NODEV, -}; +static int chrtoblktab[] = {[0 ... nchrdev] = NODEV }; /* * convert chr dev to blk dev diff --git a/bsd/dev/arm/dtrace_isa.c b/bsd/dev/arm/dtrace_isa.c index 23d09f6a0..1f8dbd2ef 100644 --- a/bsd/dev/arm/dtrace_isa.c +++ b/bsd/dev/arm/dtrace_isa.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2005-2008 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2005-2018 Apple Computer, Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -26,8 +26,6 @@ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ -#define MACH__POSIX_C_SOURCE_PRIVATE 1 /* pulls in suitable savearea from - * mach/ppc/thread_status.h */ #include #include @@ -44,7 +42,7 @@ #include #include #include -#include +#include #include #include /* for thread_wakeup() */ #include @@ -123,7 +121,7 @@ xcRemote(void *foo) (pArg->f)(pArg->arg); } - if (hw_atomic_sub(&dt_xc_sync, 1) == 0) { + if (os_atomic_dec(&dt_xc_sync, relaxed) == 0) { thread_wakeup((event_t) &dt_xc_sync); } } diff --git a/bsd/dev/arm/dtrace_subr_arm.c b/bsd/dev/arm/dtrace_subr_arm.c index f29583129..f227223a5 100644 --- a/bsd/dev/arm/dtrace_subr_arm.c +++ b/bsd/dev/arm/dtrace_subr_arm.c @@ -27,10 +27,6 @@ * Use is subject to license terms. */ -/* - * #pragma ident "@(#)dtrace_subr.c 1.12 05/06/08 SMI" - */ - #include #include #include @@ -172,16 +168,3 @@ dtrace_user_probe(arm_saved_state_t *regs, unsigned int instr) return KERN_FAILURE; } - -void -dtrace_safe_synchronous_signal(void) -{ - /* Not implemented */ -} - -int -dtrace_safe_defer_signal(void) -{ - /* Not implemented */ - return 0; -} diff --git a/bsd/dev/arm/fasttrap_isa.c b/bsd/dev/arm/fasttrap_isa.c index c45a95288..08f831a01 100644 --- a/bsd/dev/arm/fasttrap_isa.c +++ b/bsd/dev/arm/fasttrap_isa.c @@ -27,16 +27,6 @@ * Use is subject to license terms. */ -/* - * #pragma ident "@(#)fasttrap_isa.c 1.19 05/09/14 SMI" - */ - -#ifdef KERNEL -#ifndef _KERNEL -#define _KERNEL /* Solaris vs. Darwin */ -#endif -#endif - #include #include #include @@ -293,8 +283,8 @@ fasttrap_return_common(proc_t *p, arm_saved_state_t *regs, user_addr_t pc, user_ } if (probe->ftp_prov->ftp_provider_type == DTFTP_PROVIDER_ONESHOT) { - uint8_t already_triggered = atomic_or_8(&probe->ftp_triggered, 1); - if (already_triggered) { + if (os_atomic_xchg(&probe->ftp_triggered, 1, relaxed)) { + /* already triggered */ continue; } } @@ -326,6 +316,9 @@ fasttrap_return_common(proc_t *p, arm_saved_state_t *regs, user_addr_t pc, user_ lck_mtx_unlock(pid_mtx); } +#if DEBUG +__dead2 +#endif static void fasttrap_sigsegv(proc_t *p, uthread_t t, user_addr_t addr, arm_saved_state_t *regs) { @@ -522,8 +515,8 @@ fasttrap_pid_probe(arm_saved_state_t *regs) #endif } else { if (probe->ftp_prov->ftp_provider_type == DTFTP_PROVIDER_ONESHOT) { - uint8_t already_triggered = atomic_or_8(&probe->ftp_triggered, 1); - if (already_triggered) { + if (os_atomic_xchg(&probe->ftp_triggered, 1, relaxed)) { + /* already triggered */ continue; } } diff --git a/bsd/dev/arm/fbt_arm.c b/bsd/dev/arm/fbt_arm.c index 95ee1dfe2..9986da8e4 100644 --- a/bsd/dev/arm/fbt_arm.c +++ b/bsd/dev/arm/fbt_arm.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2007 Apple Inc. All rights reserved. + * Copyright (c) 2007-2018 Apple Inc. All rights reserved. */ /* * CDDL HEADER START @@ -27,16 +27,6 @@ * Use is subject to license terms. */ -/* #pragma ident "@(#)fbt.c 1.15 05/09/19 SMI" */ - -#ifdef KERNEL -#ifndef _KERNEL -#define _KERNEL /* Solaris vs. Darwin */ -#endif -#endif - -#define MACH__POSIX_C_SOURCE_PRIVATE 1 /* pulls in suitable savearea from - * mach/ppc/thread_status.h */ #include #include #include diff --git a/bsd/dev/arm/kern_machdep.c b/bsd/dev/arm/kern_machdep.c index 695c74ff0..076f3abd8 100644 --- a/bsd/dev/arm/kern_machdep.c +++ b/bsd/dev/arm/kern_machdep.c @@ -17,7 +17,6 @@ #include #if __arm64__ -extern int bootarg_no64exec; /* bsd_init.c */ static cpu_subtype_t cpu_subtype32(void); #endif /* __arm64__ */ @@ -47,7 +46,7 @@ cpu_subtype32() * not acceptable. **********************************************************************/ int -grade_binary(cpu_type_t exectype, cpu_subtype_t execsubtype) +grade_binary(cpu_type_t exectype, cpu_subtype_t execsubtype, bool allow_simulator_binary __unused) { #if __arm64__ cpu_subtype_t hostsubtype = @@ -59,10 +58,6 @@ grade_binary(cpu_type_t exectype, cpu_subtype_t execsubtype) switch (exectype) { #if __arm64__ case CPU_TYPE_ARM64: - if (bootarg_no64exec) { - return 0; - } - switch (hostsubtype) { case CPU_SUBTYPE_ARM64_V8: switch (execsubtype) { diff --git a/bsd/dev/arm/munge.c b/bsd/dev/arm/munge.c index 65eb5a2ae..094970e28 100644 --- a/bsd/dev/arm/munge.c +++ b/bsd/dev/arm/munge.c @@ -166,7 +166,7 @@ int munge_wwl(const void *regs, void *args) { if (REGS_TO_STYLE(regs) == kDirect) { - return marshal_no_pad(regs, args, 3); + return marshal_no_pad(regs, args, 4); } else { DECLARE_AND_CAST(regs, args, ss, uu_args); diff --git a/bsd/dev/arm/pci_device.h b/bsd/dev/arm/pci_device.h deleted file mode 100644 index f624a4215..000000000 --- a/bsd/dev/arm/pci_device.h +++ /dev/null @@ -1,106 +0,0 @@ -/* - * Copyright (c) 2000-2007 Apple Inc. All rights reserved. - */ -/* - * @OSF_FREE_COPYRIGHT@ - * - */ -/* - * HISTORY - * - * Revision 1.2 1998/09/30 21:20:44 wsanchez - * Merged in IntelMerge1 (mburg: Intel support) - * - * Revision 1.1.2.1 1998/09/30 18:18:50 mburg - * Changes for Intel port - * - * Revision 1.1.1.1 1998/03/07 02:25:45 wsanchez - * Import of OSF Mach kernel (~mburg) - * - * Revision 1.1.6.2 1995/12/15 10:52:14 bernadat - * Split dev and vendor ids. - * [95/11/15 bernadat] - * - * Revision 1.1.6.1 1995/02/23 17:22:27 alanl - * Taken from DIPC2_SHARED - * [1995/01/03 19:09:31 alanl] - * - * Revision 1.1.2.1 1994/10/11 18:24:42 rwd - * Created. - * [1994/10/11 18:15:31 rwd] - * - * $EndLog$ - */ -/* - * Taken from - * - * Copyright (c) 1994 Wolfgang Stanglmeier, Koeln, Germany - * - */ - -#ifndef __PCI_DEVICE_H__ -#define __PCI_DEVICE_H__ - -/*------------------------------------------------------------ - * - * Per driver structure. - * - *------------------------------------------------------------ - */ - -typedef unsigned short pci_vendor_id_t; -typedef unsigned short pci_dev_id_t; - -typedef union { - unsigned long cfg1; - struct { - unsigned char enable; - unsigned char forward; - unsigned short port; - } cfg2; -} pcici_t; - -struct pci_driver { - int (*probe )(pcici_t pci_ident);/* test whether device - * is present */ - int (*attach)(pcici_t pci_ident);/* setup driver for a - * device */ - pci_vendor_id_t vendor_id; /* vendor pci id */ - pci_dev_id_t device_id; /* device pci id */ - char *name; /* device name */ - char *vendor; /* device long name */ - void (*intr)(int); /* interupt handler */ -}; - -/*----------------------------------------------------------- - * - * Per device structure. - * - * It is initialized by the config utility and should live in - * "ioconf.c". At the moment there is only one field. - * - * This is a first attempt to include the pci bus to 386bsd. - * So this structure may grow .. - * - *----------------------------------------------------------- - */ - -struct pci_device { - struct pci_driver * pd_driver; -}; - -/*----------------------------------------------------------- - * - * This functions may be used by drivers to map devices - * to virtual and physical addresses. The va and pa - * addresses are "in/out" parameters. If they are 0 - * on entry, the mapping function assigns an address. - * - *----------------------------------------------------------- - */ - -int pci_map_mem(pcici_t tag, - unsigned long entry, - vm_offset_t *va, - vm_offset_t *pa); -#endif /*__PCI_DEVICE_H__*/ diff --git a/bsd/dev/arm/pio.h b/bsd/dev/arm/pio.h deleted file mode 100644 index 9cbdc6517..000000000 --- a/bsd/dev/arm/pio.h +++ /dev/null @@ -1,230 +0,0 @@ -/* - * Copyright (c) 2000-2007 AppleInc. All rights reserved. - */ -/* - * @OSF_COPYRIGHT@ - */ -/* - * HISTORY - * - * Revision 1.2 1998/09/30 21:20:45 wsanchez - * Merged in IntelMerge1 (mburg: Intel support) - * - * Revision 1.1.2.1 1998/09/30 18:18:50 mburg - * Changes for Intel port - * - * Revision 1.1.1.1 1998/03/07 02:25:38 wsanchez - * Import of OSF Mach kernel (~mburg) - * - * Revision 1.1.8.2 1996/07/31 09:46:36 paire - * Merged with nmk20b7_shared (1.1.11.2 -> 1.1.11.1) - * [96/06/10 paire] - * - * Revision 1.1.11.2 1996/06/13 12:38:25 bernadat - * Do not use inline macros when MACH_ASSERT is configured. - * [96/05/24 bernadat] - * - * Revision 1.1.11.1 1996/05/14 13:50:23 paire - * Added new linl and loutl __inline__. - * Added conditional compilation for [l]{in|oub}[bwl]() __inline__. - * [95/11/24 paire] - * - * Revision 1.1.8.1 1994/09/23 02:00:28 ezf - * change marker to not FREE - * [1994/09/22 21:25:52 ezf] - * - * Revision 1.1.4.5 1993/08/09 19:40:41 dswartz - * Add ANSI prototypes - CR#9523 - * [1993/08/06 17:45:57 dswartz] - * - * Revision 1.1.4.4 1993/06/11 15:17:37 jeffc - * CR9176 - ANSI C violations: inb/outb macros must be changed from - * ({ ... }) to inline functions, with proper type definitions. Callers - * must pass proper types to these functions: 386 I/O port addresses - * are unsigned shorts (not pointers). - * [1993/06/10 14:26:10 jeffc] - * - * Revision 1.1.4.3 1993/06/07 22:09:28 jeffc - * CR9176 - ANSI C violations: trailing tokens on CPP - * directives, extra semicolons after decl_ ..., asm keywords - * [1993/06/07 19:00:26 jeffc] - * - * Revision 1.1.4.2 1993/06/04 15:28:45 jeffc - * CR9176 - ANSI problems - - * Added casts to get macros to take caddr_t as an I/O space address. - * [1993/06/04 13:45:55 jeffc] - * - * Revision 1.1 1992/09/30 02:25:51 robert - * Initial revision - * - * $EndLog$ - */ -/* CMU_HIST */ -/* - * Revision 2.5 91/05/14 16:14:20 mrt - * Correcting copyright - * - * Revision 2.4 91/02/05 17:13:56 mrt - * Changed to new Mach copyright - * [91/02/01 17:37:08 mrt] - * - * Revision 2.3 90/12/20 16:36:37 jeffreyh - * changes for __STDC__ - * [90/12/07 jeffreyh] - * - * Revision 2.2 90/11/26 14:48:41 rvb - * Pulled from 2.5 - * [90/11/22 10:09:38 rvb] - * - * [90/08/14 mg32] - * - * Now we know how types are factor in. - * Cleaned up a bunch: eliminated ({ for output and flushed unused - * output variables. - * [90/08/14 rvb] - * - * This is how its done in gcc: - * Created. - * [90/03/26 rvb] - * - */ -/* CMU_ENDHIST */ -/* - * Mach Operating System - * Copyright (c) 1991,1990 Carnegie Mellon University - * All Rights Reserved. - * - * Permission to use, copy, modify and distribute this software and its - * documentation is hereby granted, provided that both the copyright - * notice and this permission notice appear in all copies of the - * software, derivative works or modified versions, and any portions - * thereof, and that both notices appear in supporting documentation. - * - * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" - * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR - * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. - * - * Carnegie Mellon requests users of this software to return to - * - * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU - * School of Computer Science - * Carnegie Mellon University - * Pittsburgh PA 15213-3890 - * - * any improvements or extensions that they make and grant Carnegie Mellon - * the rights to redistribute these changes. - */ -/* - */ -#ifndef ARM_PIO_H -#define ARM_PIO_H - -typedef unsigned short i386_ioport_t; - -/* read a longword */ -extern unsigned long inl( - i386_ioport_t port); -/* read a shortword */ -extern unsigned short inw( - i386_ioport_t port); -/* read a byte */ -extern unsigned char inb( - i386_ioport_t port); -/* write a longword */ -extern void outl( - i386_ioport_t port, - unsigned long datum); -/* write a word */ -extern void outw( - i386_ioport_t port, - unsigned short datum); -/* write a longword */ -extern void outb( - i386_ioport_t port, - unsigned char datum); - -/* input an array of longwords */ -extern void linl( - i386_ioport_t port, - int * data, - int count); -/* output an array of longwords */ -extern void loutl( - i386_ioport_t port, - int * data, - int count); - -/* input an array of words */ -extern void linw( - i386_ioport_t port, - int * data, - int count); -/* output an array of words */ -extern void loutw( - i386_ioport_t port, - int * data, - int count); - -/* input an array of bytes */ -extern void linb( - i386_ioport_t port, - char * data, - int count); -/* output an array of bytes */ -extern void loutb( - i386_ioport_t port, - char * data, - int count); - -extern __inline__ unsigned long -inl( - i386_ioport_t port) -{ - unsigned long datum; - __asm__ volatile ("inl %1, %0" : "=a" (datum) : "d" (port)); - return datum; -} - -extern __inline__ unsigned short -inw( - i386_ioport_t port) -{ - unsigned short datum; - __asm__ volatile (".byte 0x66; inl %1, %0" : "=a" (datum) : "d" (port)); - return datum; -} - -extern __inline__ unsigned char -inb( - i386_ioport_t port) -{ - unsigned char datum; - __asm__ volatile ("inb %1, %0" : "=a" (datum) : "d" (port)); - return datum; -} - -extern __inline__ void -outl( - i386_ioport_t port, - unsigned long datum) -{ - __asm__ volatile ("outl %0, %1" : : "a" (datum), "d" (port)); -} - -extern __inline__ void -outw( - i386_ioport_t port, - unsigned short datum) -{ - __asm__ volatile (".byte 0x66; outl %0, %1" : : "a" (datum), "d" (port)); -} - -extern __inline__ void -outb( - i386_ioport_t port, - unsigned char datum) -{ - __asm__ volatile ("outb %0, %1" : : "a" (datum), "d" (port)); -} - -#endif /* ARM_PIO_H */ diff --git a/bsd/dev/arm/sdt_arm.c b/bsd/dev/arm/sdt_arm.c index 938aa048b..2fa0b7d87 100644 --- a/bsd/dev/arm/sdt_arm.c +++ b/bsd/dev/arm/sdt_arm.c @@ -23,15 +23,6 @@ * Use is subject to license terms. */ -/* #pragma ident "@(#)sdt.c 1.6 06/03/24 SMI" */ - -#ifdef KERNEL -#ifndef _KERNEL -#define _KERNEL /* Solaris vs. Darwin */ -#endif -#endif - -#define MACH__POSIX_C_SOURCE_PRIVATE 1 /* pulls in suitable savearea from mach/ppc/thread_status.h */ #include #include #include diff --git a/bsd/dev/arm/stubs.c b/bsd/dev/arm/stubs.c index 298450d4e..a76f54b60 100644 --- a/bsd/dev/arm/stubs.c +++ b/bsd/dev/arm/stubs.c @@ -7,6 +7,7 @@ * */ +#include #include #include #include @@ -33,7 +34,11 @@ copyoutstr(const void *from, user_addr_t to, size_t maxlen, size_t * lencopied) { size_t slen; size_t len; - int error = 0; + int error = copyoutstr_prevalidate(from, to, maxlen); + + if (__improbable(error)) { + return error; + } slen = strlen(from) + 1; if (slen > maxlen) { diff --git a/bsd/dev/arm/systemcalls.c b/bsd/dev/arm/systemcalls.c index 5ac5fcde2..36deb9bff 100644 --- a/bsd/dev/arm/systemcalls.c +++ b/bsd/dev/arm/systemcalls.c @@ -26,9 +26,14 @@ #include #include #include +#include #include +#if CONFIG_MACF +#include +#endif + #if CONFIG_DTRACE extern int32_t dtrace_systrace_syscall(struct proc *, void *, int *); extern void dtrace_systrace_syscall_return(unsigned short, int, int *); @@ -88,8 +93,8 @@ unix_syscall( { struct sysent *callp; int error; - unsigned short code; - pid_t pid; + unsigned short code, syscode; + pid_t pid; #if defined(__arm__) assert(is_saved_state32(state)); @@ -101,16 +106,15 @@ unix_syscall( #define unix_syscall_kprintf(x...) /* kprintf("unix_syscall: " x) */ -#if (KDEBUG_LEVEL >= KDEBUG_LEVEL_IST) if (kdebug_enable && !code_is_kdebug_trace(code)) { arm_trace_unix_syscall(code, state); } -#endif if ((uthread->uu_flag & UT_VFORK)) proc = current_proc(); - callp = (code >= nsysent) ? &sysent[SYS_invalid] : &sysent[code]; + syscode = (code < nsysent) ? code : SYS_invalid; + callp = &sysent[syscode]; /* * sy_narg is inaccurate on ARM if a 64 bit parameter is specified. Since user_addr_t @@ -157,10 +161,22 @@ unix_syscall( unix_syscall_kprintf("code %d (pid %d - %s, tid %lld)\n", code, pid, proc->p_comm, thread_tid(current_thread())); +#if CONFIG_MACF + if (__improbable(proc->syscall_filter_mask != NULL && !bitstr_test(proc->syscall_filter_mask, syscode))) { + error = mac_proc_check_syscall_unix(proc, syscode); + if (error) + goto skip_syscall; + } +#endif /* CONFIG_MACF */ + AUDIT_SYSCALL_ENTER(code, proc, uthread); error = (*(callp->sy_call)) (proc, &uthread->uu_arg[0], &(uthread->uu_rval[0])); AUDIT_SYSCALL_EXIT(code, proc, uthread, error); +#if CONFIG_MACF +skip_syscall: +#endif /* CONFIG_MACF */ + unix_syscall_kprintf("code %d, error %d, results %x, %x (pid %d - %s, tid %lld)\n", code, error, uthread->uu_rval[0], uthread->uu_rval[1], pid, get_bsdtask_info(current_task()) ? proc->p_comm : "unknown" , thread_tid(current_thread())); @@ -194,13 +210,10 @@ unix_syscall( */ throttle_lowpri_io(1); } -#if (KDEBUG_LEVEL >= KDEBUG_LEVEL_IST) if (kdebug_enable && !code_is_kdebug_trace(code)) { - KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, - BSDDBG_CODE(DBG_BSD_EXCP_SC, code) | DBG_FUNC_END, - error, uthread->uu_rval[0], uthread->uu_rval[1], pid, 0); + KDBG_RELEASE(BSDDBG_CODE(DBG_BSD_EXCP_SC, code) | DBG_FUNC_END, + error, uthread->uu_rval[0], uthread->uu_rval[1], pid); } -#endif #if PROC_REF_DEBUG if (__improbable(uthread_get_proc_refcount(uthread) != 0)) { @@ -264,13 +277,10 @@ unix_syscall_return(int error) */ throttle_lowpri_io(1); } -#if (KDEBUG_LEVEL >= KDEBUG_LEVEL_IST) if (kdebug_enable && !code_is_kdebug_trace(code)) { - KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, - BSDDBG_CODE(DBG_BSD_EXCP_SC, code) | DBG_FUNC_END, - error, uthread->uu_rval[0], uthread->uu_rval[1], proc->p_pid, 0); + KDBG_RELEASE(BSDDBG_CODE(DBG_BSD_EXCP_SC, code) | DBG_FUNC_END, + error, uthread->uu_rval[0], uthread->uu_rval[1], proc->p_pid); } -#endif thread_exception_return(); /* NOTREACHED */ @@ -321,15 +331,14 @@ arm_prepare_u32_syscall_return(struct sysent *callp, arm_saved_state_t *regs, ut static void arm_trace_u32_unix_syscall(int code, arm_saved_state32_t *regs) { - boolean_t indirect = (regs->save_r12 == 0); - if (indirect) - KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, - BSDDBG_CODE(DBG_BSD_EXCP_SC, code) | DBG_FUNC_START, - regs->save_r1, regs->save_r2, regs->save_r3, regs->save_r4, 0); - else - KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, - BSDDBG_CODE(DBG_BSD_EXCP_SC, code) | DBG_FUNC_START, - regs->save_r0, regs->save_r1, regs->save_r2, regs->save_r3, 0); + bool indirect = (regs->save_r12 == 0); + if (indirect) { + KDBG_RELEASE(BSDDBG_CODE(DBG_BSD_EXCP_SC, code) | DBG_FUNC_START, + regs->save_r1, regs->save_r2, regs->save_r3, regs->save_r4); + } else { + KDBG_RELEASE(BSDDBG_CODE(DBG_BSD_EXCP_SC, code) | DBG_FUNC_START, + regs->save_r0, regs->save_r1, regs->save_r2, regs->save_r3); + } } static void @@ -597,7 +606,7 @@ arm_prepare_u64_syscall_return(struct sysent *callp, arm_saved_state_t *regs, ut arm_saved_state64_t *ss64 = saved_state64(regs); if (error == ERESTART) { - ss64->pc -= 4; + add_saved_state_pc(regs, -4); } else if (error != EJUSTRETURN) { if (error) { ss64->x[0] = error; @@ -642,15 +651,14 @@ arm_prepare_u64_syscall_return(struct sysent *callp, arm_saved_state_t *regs, ut static void arm_trace_u64_unix_syscall(int code, arm_saved_state64_t *regs) { - boolean_t indirect = (regs->x[ARM64_SYSCALL_CODE_REG_NUM] == 0); - if (indirect) - KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, - BSDDBG_CODE(DBG_BSD_EXCP_SC, code) | DBG_FUNC_START, - regs->x[1], regs->x[2], regs->x[3], regs->x[4], 0); - else - KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, - BSDDBG_CODE(DBG_BSD_EXCP_SC, code) | DBG_FUNC_START, - regs->x[0], regs->x[1], regs->x[2], regs->x[3], 0); + bool indirect = (regs->x[ARM64_SYSCALL_CODE_REG_NUM] == 0); + if (indirect) { + KDBG_RELEASE(BSDDBG_CODE(DBG_BSD_EXCP_SC, code) | DBG_FUNC_START, + regs->x[1], regs->x[2], regs->x[3], regs->x[4]); + } else { + KDBG_RELEASE(BSDDBG_CODE(DBG_BSD_EXCP_SC, code) | DBG_FUNC_START, + regs->x[0], regs->x[1], regs->x[2], regs->x[3]); + } } static void diff --git a/bsd/dev/arm/table_inline.h b/bsd/dev/arm/table_inline.h deleted file mode 100644 index f5996137d..000000000 --- a/bsd/dev/arm/table_inline.h +++ /dev/null @@ -1,36 +0,0 @@ -/* - * Copyright (c) 2000-2007 Apple Inc. All rights reserved. - */ -/* - * Copyright (c) 1992 NeXT Computer, Inc. - * - * Intel386 Family: Selector based access to descriptor tables. - * - * HISTORY - * - * 2 April 1992 ? at NeXT - * Created. - */ - -#include - -#include -#include - -static inline gdt_entry_t * -sel_to_gdt_entry(sel_t sel) -{ - return &gdt[sel.index]; -} - -static inline idt_entry_t * -sel_to_idt_entry(sel_t sel) -{ - return &idt[sel.index]; -} - -static inline ldt_entry_t * -sel_to_ldt_entry(ldt_t *tbl, sel_t sel) -{ - return &tbl[sel.index]; -} diff --git a/bsd/dev/arm/unix_signal.c b/bsd/dev/arm/unix_signal.c index 12d7b69f7..1e3bb03f9 100644 --- a/bsd/dev/arm/unix_signal.c +++ b/bsd/dev/arm/unix_signal.c @@ -24,6 +24,7 @@ #include #include +#include #include extern struct arm_saved_state *get_user_regs(thread_t); @@ -264,7 +265,8 @@ sendsig( user_addr_t catcher, int sig, int mask, - __unused uint32_t code + __unused uint32_t code, + sigset_t siginfo ) { union { @@ -300,7 +302,7 @@ sendsig( bzero(&ts, sizeof(ts)); bzero(&user_frame, sizeof(user_frame)); - if (p->p_sigacts->ps_siginfo & sigmask(sig)) { + if (siginfo & sigmask(sig)) { infostyle = UC_FLAVOR; } else { infostyle = UC_TRAD; @@ -409,6 +411,30 @@ sendsig( break; case SIGFPE: + switch (ut->uu_code) { + case EXC_ARM_FP_UF: + sinfo.si_code = FPE_FLTUND; + break; + case EXC_ARM_FP_OF: + sinfo.si_code = FPE_FLTOVF; + break; + case EXC_ARM_FP_IO: + sinfo.si_code = FPE_FLTINV; + break; + case EXC_ARM_FP_DZ: + sinfo.si_code = FPE_FLTDIV; + break; + case EXC_ARM_FP_ID: + sinfo.si_code = FPE_FLTINV; + break; + case EXC_ARM_FP_IX: + sinfo.si_code = FPE_FLTRES; + break; + default: + sinfo.si_code = FPE_NOOP; + break; + } + break; case SIGBUS: @@ -730,6 +756,9 @@ sigreturn( th_act = current_thread(); ut = (struct uthread *) get_bsdthread_info(th_act); + /* see osfmk/kern/restartable.c */ + act_set_ast_reset_pcs(th_act); + if (proc_is64bit_data(p)) { #if defined(__arm64__) error = sigreturn_copyin_ctx64(&uctx.uc64, &mctx.mc64, uap->uctx); diff --git a/bsd/dev/arm64/conf.c b/bsd/dev/arm64/conf.c index e40f4340a..06062ce28 100644 --- a/bsd/dev/arm64/conf.c +++ b/bsd/dev/arm64/conf.c @@ -215,12 +215,7 @@ struct cdevsw cdevsw[] = { kmioctl, nullstop, nullreset, km_tty, ttselect, eno_mmap, eno_strat, eno_getc, eno_putc, 0 }, - [13 ... 41] = NO_CDEVICE, - [42] = { - volopen, volclose, eno_rdwrt, eno_rdwrt, - volioctl, eno_stop, eno_reset, 0, (select_fcn_t *) seltrue, - eno_mmap, eno_strat, eno_getc, eno_putc, 0 - } + [13 ... 42] = NO_CDEVICE, }; const int nchrdev = sizeof(cdevsw) / sizeof(cdevsw[0]); @@ -237,7 +232,7 @@ isdisk(dev_t dev, int type) switch (type) { case VCHR: - maj = chrtoblk(maj); + maj = chrtoblk(dev); if (maj == NODEV) { break; } @@ -251,32 +246,7 @@ isdisk(dev_t dev, int type) return 0; } -static int chrtoblktab[] = { - /* CHR *//* BLK *//* CHR *//* BLK */ - /* 0 */ NODEV, /* 1 */ NODEV, - /* 2 */ NODEV, /* 3 */ NODEV, - /* 4 */ NODEV, /* 5 */ NODEV, - /* 6 */ NODEV, /* 7 */ NODEV, - /* 8 */ NODEV, /* 9 */ NODEV, - /* 10 */ NODEV, /* 11 */ NODEV, - /* 12 */ NODEV, /* 13 */ NODEV, - /* 14 */ NODEV, /* 15 */ NODEV, - /* 16 */ NODEV, /* 17 */ NODEV, - /* 18 */ NODEV, /* 19 */ NODEV, - /* 20 */ NODEV, /* 21 */ NODEV, - /* 22 */ NODEV, /* 23 */ NODEV, - /* 24 */ NODEV, /* 25 */ NODEV, - /* 26 */ NODEV, /* 27 */ NODEV, - /* 28 */ NODEV, /* 29 */ NODEV, - /* 30 */ NODEV, /* 31 */ NODEV, - /* 32 */ NODEV, /* 33 */ NODEV, - /* 34 */ NODEV, /* 35 */ NODEV, - /* 36 */ NODEV, /* 37 */ NODEV, - /* 38 */ NODEV, /* 39 */ NODEV, - /* 40 */ NODEV, /* 41 */ NODEV, - /* 42 */ NODEV, /* 43 */ NODEV, - /* 44 */ NODEV, -}; +static int chrtoblktab[] = {[0 ... nchrdev] = NODEV }; /* * convert chr dev to blk dev diff --git a/bsd/dev/arm64/cpu_in_cksum.s b/bsd/dev/arm64/cpu_in_cksum.s index 86d892aa3..35f317a44 100644 --- a/bsd/dev/arm64/cpu_in_cksum.s +++ b/bsd/dev/arm64/cpu_in_cksum.s @@ -221,6 +221,26 @@ L_post_initial_offset: add w7, w7, w9 1: +/* + * if ((uintptr_t)data & 4) { + * if (mlen < 4) + * goto L2_bytes; + * partial += *(uint32_t *)(void *)data; + * data += 4; + * mlen -= 4; + * } + */ + // align on 8-bytes boundary if applicable + tst data, #4 + b.eq 1f + cmp mlen, #4 + b.lt L2_bytes + ldr w9, [data], #4 + sub mlen, mlen, #4 + adds w7, w7, w9 + adc x7, x7, x10 // assumes x10 still is #0 as set above +1: + /* * while (mlen >= 64) { * __builtin_prefetch(data + 32); diff --git a/bsd/dev/arm64/disassembler.c b/bsd/dev/arm64/disassembler.c index 48bf43cb8..c9cb73582 100644 --- a/bsd/dev/arm64/disassembler.c +++ b/bsd/dev/arm64/disassembler.c @@ -1,1280 +1,12 @@ /* - * Copyright (c) 2017 Apple Inc. All rights reserved. + * Copyright (c) 2017-2018 Apple Inc. All rights reserved. * - * Disassemblers for ARM (arm), Thumb (thumb16), and Thumb2 (thumb32). - * - * Each disassembly begins with a call to dtrace_decode_arm or dtrace_decode_thumb. The thumb - * decoder will then call dtrace_decode_thumb16 or dtrace_decode_thumb32 as appropriate. - * - * The respective disassembly functions are all of the form {arm,thumb16,thumb32}_type. They - * follow the ordering and breakdown in the ARMv7 Architecture Reference Manual. - */ - -#include - -#define BITS(x, n, mask) (((x) >> (n)) & (mask)) - -static uint32_t -thumb32_instword_to_arm(uint16_t hw1, uint16_t hw2) -{ - return (hw1 << 16) | hw2; -} - -int dtrace_decode_arm(uint32_t instr); -int dtrace_decode_arm64(uint32_t instr); -int dtrace_decode_thumb(uint32_t instr); - -/* - * VFP decoder - shared between ARM and THUMB32 mode - */ - -static -int -vfp_struct_loadstore(uint32_t instr) -{ - if (ARM_RM(instr) != REG_PC && ARM_RN(instr) != REG_PC) { - return FASTTRAP_T_COMMON; - } - - return FASTTRAP_T_INV; -} - -static -int -vfp_64transfer(uint32_t instr) -{ - /* These instructions all use RD and RN */ - if (ARM_RD(instr) != REG_PC && ARM_RN(instr) != REG_PC) { - return FASTTRAP_T_COMMON; - } - - return FASTTRAP_T_INV; -} - -static -int -vfp_transfer(uint32_t instr) -{ - /* These instructions all use RD only */ - if (ARM_RD(instr) != REG_PC) { - return FASTTRAP_T_COMMON; - } - - return FASTTRAP_T_INV; -} - -static -int -vfp_loadstore(uint32_t instr) -{ - int opcode = BITS(instr, 20, 0x1F); - - /* Instrument VLDR */ - if ((opcode & 0x13) == 0x11 && ARM_RN(instr) == REG_PC) { - return FASTTRAP_T_VLDR_PC_IMMED; - } - - /* These instructions all use RN only */ - if (ARM_RN(instr) != REG_PC) { - return FASTTRAP_T_COMMON; - } - - return FASTTRAP_T_INV; -} - -/* - * ARM decoder - */ - -static -int -arm_unconditional_misc(uint32_t instr) -{ - int op = BITS(instr, 20, 0x7F); - - if ((op & 0x60) == 0x20) { - /* VFP data processing uses its own registers */ - return FASTTRAP_T_COMMON; - } - - if ((op & 0x71) == 0x40) { - return vfp_struct_loadstore(instr); - } - - return FASTTRAP_T_INV; -} - -static -int -arm_unconditional(uint32_t instr) -{ - if (BITS(instr, 27, 0x1) == 0) { - return arm_unconditional_misc(instr); - } - - /* The rest are privileged or BL/BLX, do not instrument */ - - /* Do not need to instrument BL/BLX either, see comment in arm_misc(uint32_t) */ - - return FASTTRAP_T_INV; -} - -static -int -arm_syscall_coproc(uint32_t instr) -{ - /* Instrument any VFP data processing instructions, ignore the rest */ - - int op1 = BITS(instr, 20, 0x3F), coproc = BITS(instr, 8, 0xF), op = BITS(instr, 4, 0x1); - - if ((op1 & 0x3E) == 0 || (op1 & 0x30) == 0x30) { - /* Undefined or swi */ - return FASTTRAP_T_INV; - } - - if ((coproc & 0xE) == 0xA) { - /* VFP instruction */ - - if ((op1 & 0x20) == 0 && (op1 & 0x3A) != 0) { - return vfp_loadstore(instr); - } - - if ((op1 & 0x3E) == 0x04) { - return vfp_64transfer(instr); - } - - if ((op1 & 0x30) == 0x20) { - /* VFP data processing or 8, 16, or 32 bit move between ARM reg and VFP reg */ - if (op == 0) { - /* VFP data processing uses its own registers */ - return FASTTRAP_T_COMMON; - } else { - return vfp_transfer(instr); - } - } - } - - return FASTTRAP_T_INV; -} - -static -int -arm_branch_link_blockdata(uint32_t instr) -{ - int branch = BITS(instr, 25, 0x1), link = BITS(instr, 24, 0x1), op = BITS(instr, 20, 0x1F), uses_pc = BITS(instr, 15, 0x1), uses_lr = BITS(instr, 14, 0x1); - - if (branch == 1) { - if (link == 0) { - return FASTTRAP_T_B_COND; - } - return FASTTRAP_T_INV; - } else { - /* Only emulate a use of the pc if it's a return from function: ldmia sp!, { ... pc } */ - if (op == 0x0B && ARM_RN(instr) == REG_SP && uses_pc == 1) { - return FASTTRAP_T_LDM_PC; - } - - /* stmia sp!, { ... lr } doesn't touch the pc, but it is very common, so special case it */ - if (op == 0x12 && ARM_RN(instr) == REG_SP && uses_lr == 1) { - return FASTTRAP_T_STM_LR; - } - - if (ARM_RN(instr) != REG_PC && uses_pc == 0) { - return FASTTRAP_T_COMMON; - } - } - - return FASTTRAP_T_INV; -} - -static -int -arm_signed_multiplies(uint32_t instr) -{ - int op1 = BITS(instr, 20, 0x7), op2 = BITS(instr, 5, 0x7); - - /* smlald, smlsld, smmls use RD in addition to RM, RS, and RN */ - if ((op1 == 0x4 && (op2 & 0x4) == 0) || (op1 == 0x5 && (op2 & 0x6) == 0x6)) { - if (ARM_RD(instr) == REG_PC) { - return FASTTRAP_T_INV; - } - } - - if (ARM_RM(instr) != REG_PC && ARM_RS(instr) != REG_PC && ARM_RN(instr) != REG_PC) { - return FASTTRAP_T_COMMON; - } - - return FASTTRAP_T_INV; -} - -static -int -arm_pack_unpack_sat_reversal(uint32_t instr) -{ - int op1 = BITS(instr, 20, 0x7), op2 = BITS(instr, 5, 0x7); - - /* pkh, sel use RN in addition to RD and RM */ - if ((op1 == 0 && (op2 & 0x1) == 0) || (op1 == 0 && op2 == 0x5)) { - if (ARM_RN(instr) == REG_PC) { - return FASTTRAP_T_INV; - } - } - - if (ARM_RM(instr) != REG_PC && ARM_RD(instr) != REG_PC) { - return FASTTRAP_T_COMMON; - } - - return FASTTRAP_T_INV; -} - -static -int -arm_parallel_addsub_unsigned(uint32_t instr) -{ - if (ARM_RM(instr) != REG_PC && ARM_RD(instr) != REG_PC && ARM_RN(instr) != REG_PC) { - return FASTTRAP_T_COMMON; - } - - return FASTTRAP_T_INV; -} - -static -int -arm_parallel_addsub_signed(uint32_t instr) -{ - if (ARM_RM(instr) != REG_PC && ARM_RD(instr) != REG_PC && ARM_RN(instr) != REG_PC) { - return FASTTRAP_T_COMMON; - } - - return FASTTRAP_T_INV; -} - -static -int -arm_media(uint32_t instr) -{ - int op1 = BITS(instr, 20, 0x1F), op2 = BITS(instr, 5, 0x7); - - if ((op1 & 0x1C) == 0) { - return arm_parallel_addsub_signed(instr); - } - - if ((op1 & 0x1C) == 0x04) { - return arm_parallel_addsub_unsigned(instr); - } - - if ((op1 & 0x18) == 0x08) { - return arm_pack_unpack_sat_reversal(instr); - } - - if ((op1 & 0x18) == 0x10) { - return arm_signed_multiplies(instr); - } - - if (op1 == 0x1F && op2 == 0x7) { - /* Undefined instruction */ - return FASTTRAP_T_INV; - } - - if (op1 == 0x18 && op2 == 0) { - /* usad8 usada8 */ - /* The registers are named differently in the reference manual for this instruction - * but the following positions are correct */ - - if (ARM_RM(instr) != REG_PC && ARM_RS(instr) != REG_PC && ARM_RN(instr) != REG_PC) { - return FASTTRAP_T_COMMON; - } - - return FASTTRAP_T_INV; - } - - if ((op1 & 0x1E) == 0x1C && (op2 & 0x3) == 0) { - /* bfc bfi */ - if (ARM_RD(instr) != REG_PC) { - return FASTTRAP_T_COMMON; - } - - return FASTTRAP_T_INV; - } - - if (((op1 & 0x1E) == 0x1A || (op1 & 0x1E) == 0x1E) && ((op2 & 0x3) == 0x2)) { - /* sbfx ubfx */ - if (ARM_RM(instr) != REG_PC && ARM_RD(instr) != REG_PC) { - return FASTTRAP_T_COMMON; - } - - return FASTTRAP_T_INV; - } - - return FASTTRAP_T_INV; -} - -static -int -arm_loadstore_wordbyte(uint32_t instr) -{ - /* Instrument PC relative load with immediate, ignore any other uses of the PC */ - int R = BITS(instr, 25, 0x1), L = BITS(instr, 20, 0x1); - - if (R == 1) { - /* Three register load/store */ - if (ARM_RM(instr) != REG_PC && ARM_RD(instr) != REG_PC && ARM_RN(instr) != REG_PC) { - return FASTTRAP_T_COMMON; - } - } else { - /* Immediate load/store, but still do not support ldr pc, [pc...] */ - if (L == 1 && ARM_RN(instr) == REG_PC && ARM_RD(instr) != REG_PC) { - return FASTTRAP_T_LDR_PC_IMMED; - } - - if (ARM_RD(instr) != REG_PC && ARM_RN(instr) != REG_PC) { - return FASTTRAP_T_COMMON; - } - } - - return FASTTRAP_T_INV; -} - -static -int -arm_saturating(uint32_t instr) -{ - if (ARM_RM(instr) != REG_PC && ARM_RD(instr) != REG_PC && ARM_RN(instr) != REG_PC) { - return FASTTRAP_T_COMMON; - } - - return FASTTRAP_T_INV; -} - -static -int -arm_misc(uint32_t instr) -{ - int op = BITS(instr, 21, 0x3), __unused op1 = BITS(instr, 16, 0xF), op2 = BITS(instr, 4, 0x7); - - if (op2 == 1 && op == 1) { - return FASTTRAP_T_BX_REG; - } - - /* We do not need to emulate BLX for entry/return probes; if we eventually support full offset - * tracing, then we will. This is because BLX overwrites the link register, so a function that - * can execute this as its first instruction is a special function indeed. - */ - - if (op2 == 0x5) { - return arm_saturating(instr); - } - - return FASTTRAP_T_INV; -} - -static -int -arm_msr_hints(__unused uint32_t instr) -{ - /* These deal with the psr, not instrumented */ - - return FASTTRAP_T_INV; -} - -static -int -arm_sync_primitive(__unused uint32_t instr) -{ - /* TODO will instrumenting these interfere with any kernel usage of these instructions? */ - /* Don't instrument for now */ - - return FASTTRAP_T_INV; -} - -static -int -arm_extra_loadstore_unpriv(uint32_t instr) -{ - int op = BITS(instr, 20, 0x1), __unused op2 = BITS(instr, 5, 0x3), immed = BITS(instr, 22, 0x1); - - if (op == 0 && (op2 & 0x2) == 0x2) { - /* Unpredictable or undefined */ - return FASTTRAP_T_INV; - } - - if (immed == 1) { - if (ARM_RD(instr) != REG_PC && ARM_RN(instr) != REG_PC) { - return FASTTRAP_T_COMMON; - } - } else { - if (ARM_RM(instr) != REG_PC && ARM_RD(instr) != REG_PC && ARM_RN(instr) != REG_PC) { - return FASTTRAP_T_COMMON; - } - } - - return FASTTRAP_T_INV; -} - -static -int -arm_extra_loadstore(uint32_t instr) -{ - int op1 = BITS(instr, 20, 0x1F); - - /* There are two variants, and we do not instrument either of them that use the PC */ - - if ((op1 & 0x4) == 0) { - /* Variant 1, register */ - if (ARM_RM(instr) != REG_PC && ARM_RD(instr) != REG_PC && ARM_RN(instr) != REG_PC) { - return FASTTRAP_T_COMMON; - } - } else { - /* Variant 2, immediate */ - if (ARM_RD(instr) != REG_PC && ARM_RN(instr) != REG_PC) { - return FASTTRAP_T_COMMON; - } - } - - return FASTTRAP_T_INV; -} - -static -int -arm_halfword_multiply(uint32_t instr) -{ - /* Not all multiply instructions use all four registers. The ones that don't should have those - * register locations set to 0, so we can test them anyway. - */ - - if (ARM_RN(instr) != REG_PC && ARM_RD(instr) != REG_PC && ARM_RS(instr) != REG_PC && ARM_RM(instr) != REG_PC) { - return FASTTRAP_T_COMMON; - } - - return FASTTRAP_T_INV; -} - -static -int -arm_multiply(uint32_t instr) -{ - /* Not all multiply instructions use all four registers. The ones that don't should have those - * register locations set to 0, so we can test them anyway. - */ - - if (ARM_RN(instr) != REG_PC && ARM_RD(instr) != REG_PC && ARM_RS(instr) != REG_PC && ARM_RM(instr) != REG_PC) { - return FASTTRAP_T_COMMON; - } - - return FASTTRAP_T_INV; -} - -static -int -arm_dataproc_immed(uint32_t instr) -{ - /* All these instructions are either two registers, or one register and have 0 where the other reg would be used */ - if (ARM_RN(instr) != REG_PC && ARM_RD(instr) != REG_PC) { - return FASTTRAP_T_COMMON; - } - - return FASTTRAP_T_INV; -} - -static -int -arm_dataproc_regshift(uint32_t instr) -{ - /* All these instructions are either four registers, or three registers and have 0 where there last reg would be used */ - if (ARM_RN(instr) != REG_PC && ARM_RD(instr) != REG_PC && ARM_RS(instr) != REG_PC && ARM_RM(instr) != REG_PC) { - return FASTTRAP_T_COMMON; - } - - return FASTTRAP_T_INV; -} - -static -int -arm_dataproc_reg(uint32_t instr) -{ - int op1 = BITS(instr, 20, 0x1F), op2 = BITS(instr, 7, 0x1F), op3 = BITS(instr, 5, 0x3); - - if (op1 == 0x11 || op1 == 0x13 || op1 == 0x15 || op1 == 0x17) { - /* These are comparison flag setting instructions and do not have RD */ - if (ARM_RN(instr) != REG_PC && ARM_RM(instr) != REG_PC) { - return FASTTRAP_T_COMMON; - } - - return FASTTRAP_T_INV; - } - - /* The rest can, in theory, write or use the PC. The only one we instrument is mov pc, reg. - * movs pc, reg is a privileged instruction so we don't instrument that variant. The s bit - * is bit 0 of op1 and should be zero. - */ - if (op1 == 0x1A && op2 == 0 && op3 == 0 && ARM_RD(instr) == REG_PC) { - return FASTTRAP_T_MOV_PC_REG; - } - - /* Any instruction at this point is a three register instruction or two register instruction with RN = 0 */ - if (ARM_RN(instr) != REG_PC && ARM_RD(instr) != REG_PC && ARM_RM(instr) != REG_PC) { - return FASTTRAP_T_COMMON; - } - - return FASTTRAP_T_INV; -} - -static -int -arm_dataproc_misc(uint32_t instr) -{ - int op = BITS(instr, 25, 0x1), op1 = BITS(instr, 20, 0x1F), op2 = BITS(instr, 4, 0xF); - - if (op == 0) { - if ((op1 & 0x19) != 0x10 && (op2 & 0x1) == 0) { - return arm_dataproc_reg(instr); - } - - if ((op1 & 0x19) != 0x10 && (op2 & 0x9) == 0x1) { - return arm_dataproc_regshift(instr); - } - - if ((op1 & 0x19) == 0x10 && (op2 & 0x8) == 0) { - return arm_misc(instr); - } - - if ((op1 & 0x19) == 0x19 && (op2 & 0x9) == 0x8) { - return arm_halfword_multiply(instr); - } - - if ((op1 & 0x10) == 0 && op2 == 0x9) { - return arm_multiply(instr); - } - - if ((op1 & 0x10) == 0x10 && op2 == 0x9) { - return arm_sync_primitive(instr); - } - - if ((op1 & 0x12) != 0x02 && (op2 == 0xB || (op2 & 0xD) == 0xD)) { - return arm_extra_loadstore(instr); - } - - if ((op1 & 0x12) == 0x02 && (op2 == 0xB || (op2 & 0xD) == 0xD)) { - return arm_extra_loadstore_unpriv(instr); - } - } else { - if ((op1 & 0x19) != 0x10) { - return arm_dataproc_immed(instr); - } - - if (op1 == 0x10) { - /* 16 bit immediate load (mov (immed)) [encoding A2] */ - if (ARM_RD(instr) != REG_PC) { - return FASTTRAP_T_COMMON; - } - - return FASTTRAP_T_INV; - } - - if (op1 == 0x14) { - /* high halfword 16 bit immediate load (movt) [encoding A1] */ - if (ARM_RD(instr) != REG_PC) { - return FASTTRAP_T_COMMON; - } - - return FASTTRAP_T_INV; - } - - if ((op1 & 0x1B) == 0x12) { - return arm_msr_hints(instr); - } - } - - return FASTTRAP_T_INV; -} - -int -dtrace_decode_arm(uint32_t instr) -{ - int cond = BITS(instr, 28, 0xF), op1 = BITS(instr, 25, 0x7), op = BITS(instr, 4, 0x1); - - if (cond == 0xF) { - return arm_unconditional(instr); - } - - if ((op1 & 0x6) == 0) { - return arm_dataproc_misc(instr); - } - - if (op1 == 0x2) { - return arm_loadstore_wordbyte(instr); - } - - if (op1 == 0x3 && op == 0) { - return arm_loadstore_wordbyte(instr); - } - - if (op1 == 0x3 && op == 1) { - return arm_media(instr); - } - - if ((op1 & 0x6) == 0x4) { - return arm_branch_link_blockdata(instr); - } - - if ((op1 & 0x6) == 0x6) { - return arm_syscall_coproc(instr); - } - - return FASTTRAP_T_INV; -} - -/* - * Thumb 16-bit decoder - */ - -static -int -thumb16_cond_supervisor(uint16_t instr) -{ - int opcode = BITS(instr, 8, 0xF); - - if ((opcode & 0xE) != 0xE) { - return FASTTRAP_T_B_COND; - } - - return FASTTRAP_T_INV; -} - -static -int -thumb16_misc(uint16_t instr) -{ - int opcode = BITS(instr, 5, 0x7F); - - if ((opcode & 0x70) == 0x30 || (opcode & 0x70) == 0x70) { - /* setend, cps, breakpoint, or if-then, not instrumentable */ - return FASTTRAP_T_INV; - } else if ((opcode & 0x78) == 0x28) { - /* Doesn't modify pc, but this happens a lot so make this a special case for emulation */ - return FASTTRAP_T_PUSH_LR; - } else if ((opcode & 0x78) == 0x68) { - return FASTTRAP_T_POP_PC; - } else if ((opcode & 0x28) == 0x08) { - return FASTTRAP_T_CB_N_Z; - } - - /* All other instructions work on low regs only and are instrumentable */ - return FASTTRAP_T_COMMON; -} - -static -int -thumb16_loadstore_single(__unused uint16_t instr) -{ - /* These all access the low registers or SP only */ - return FASTTRAP_T_COMMON; -} - -static -int -thumb16_data_special_and_branch(uint16_t instr) -{ - int opcode = BITS(instr, 6, 0xF); - - if (opcode == 0x4) { - /* Unpredictable */ - return FASTTRAP_T_INV; - } else if ((opcode & 0xC) == 0xC) { - /* bx or blx */ - /* Only instrument the bx */ - if ((opcode & 0x2) == 0) { - return FASTTRAP_T_BX_REG; - } - return FASTTRAP_T_INV; - } else { - /* Data processing on high registers, only instrument mov pc, reg */ - if ((opcode & 0xC) == 0x8 && THUMB16_HRD(instr) == REG_PC) { - return FASTTRAP_T_CPY_PC; - } - - if (THUMB16_HRM(instr) != REG_PC && THUMB16_HRD(instr) != REG_PC) { - return FASTTRAP_T_COMMON; - } - } - - return FASTTRAP_T_INV; -} - -static -int -thumb16_data_proc(__unused uint16_t instr) -{ - /* These all access the low registers only */ - return FASTTRAP_T_COMMON; -} - -static -int -thumb16_shift_addsub_move_compare(__unused uint16_t instr) -{ - /* These all access the low registers only */ - return FASTTRAP_T_COMMON; -} - -static -int -dtrace_decode_thumb16(uint16_t instr) -{ - int opcode = BITS(instr, 10, 0x3F); - - if ((opcode & 0x30) == 0) { - return thumb16_shift_addsub_move_compare(instr); - } - - if (opcode == 0x10) { - return thumb16_data_proc(instr); - } - - if (opcode == 0x11) { - return thumb16_data_special_and_branch(instr); - } - - if ((opcode & 0x3E) == 0x12) { - /* ldr (literal) */ - return FASTTRAP_T_LDR_PC_IMMED; - } - - if ((opcode & 0x3C) == 0x14 || (opcode & 0x38) == 0x18 || (opcode & 0x38) == 0x20) { - return thumb16_loadstore_single(instr); - } - - if ((opcode & 0x3E) == 0x28) { - /* adr, uses the pc */ - return FASTTRAP_T_INV; - } - - if ((opcode & 0x3E) == 0x2A) { - /* add (sp plus immediate) */ - return FASTTRAP_T_COMMON; - } - - if ((opcode & 0x3C) == 0x2C) { - return thumb16_misc(instr); - } - - if ((opcode & 0x3E) == 0x30) { - /* stm - can't access high registers */ - return FASTTRAP_T_COMMON; - } - - if ((opcode & 0x3E) == 0x32) { - /* ldm - can't access high registers */ - return FASTTRAP_T_COMMON; - } - - if ((opcode & 0x3C) == 0x34) { - return thumb16_cond_supervisor(instr); - } - - if ((opcode & 0x3E) == 0x38) { - /* b unconditional */ - return FASTTRAP_T_B_UNCOND; - } - - return FASTTRAP_T_INV; -} - -/* - * Thumb 32-bit decoder + * Disassemblers for ARM64 (AArch64) */ -static -int -thumb32_coproc(uint16_t instr1, uint16_t instr2) -{ - /* Instrument any VFP data processing instructions, ignore the rest */ - - int op1 = BITS(instr1, 4, 0x3F), coproc = BITS(instr2, 8, 0xF), op = BITS(instr2, 4, 0x1); - - if ((op1 & 0x3E) == 0) { - /* Undefined */ - return FASTTRAP_T_INV; - } - - if ((coproc & 0xE) == 0xA || (op1 & 0x30) == 0x30) { - /* VFP instruction */ - uint32_t instr = thumb32_instword_to_arm(instr1, instr2); - - if ((op1 & 0x30) == 0x30) { - /* VFP data processing uses its own registers */ - return FASTTRAP_T_COMMON; - } - - if ((op1 & 0x3A) == 0x02 || (op1 & 0x38) == 0x08 || (op1 & 0x30) == 0x10) { - return vfp_loadstore(instr); - } - - if ((op1 & 0x3E) == 0x04) { - return vfp_64transfer(instr); - } - - if ((op1 & 0x30) == 0x20) { - /* VFP data processing or 8, 16, or 32 bit move between ARM reg and VFP reg */ - if (op == 0) { - /* VFP data processing uses its own registers */ - return FASTTRAP_T_COMMON; - } else { - return vfp_transfer(instr); - } - } - } - - return FASTTRAP_T_INV; -} - -static -int -thumb32_longmultiply(uint16_t instr1, uint16_t instr2) -{ - int op1 = BITS(instr1, 4, 0x7), op2 = BITS(instr2, 4, 0xF); - - if ((op1 == 1 && op2 == 0xF) || (op1 == 0x3 && op2 == 0xF)) { - /* Three register instruction */ - if (THUMB32_RM(instr1, instr2) != REG_PC && THUMB32_RD(instr1, instr2) != REG_PC && THUMB32_RN(instr1, instr2) != REG_PC) { - return FASTTRAP_T_COMMON; - } - } else { - /* Four register instruction */ - if (THUMB32_RM(instr1, instr2) != REG_PC && THUMB32_RD(instr1, instr2) != REG_PC && - THUMB32_RT(instr1, instr2) != REG_PC && THUMB32_RN(instr1, instr2) != REG_PC) { - return FASTTRAP_T_COMMON; - } - } - - return FASTTRAP_T_INV; -} - -static -int -thumb32_multiply(uint16_t instr1, uint16_t instr2) -{ - int op1 = BITS(instr1, 4, 0x7), op2 = BITS(instr2, 4, 0x3); - - if ((op1 == 0 && op2 == 1) || (op1 == 0x6 && (op2 & 0x2) == 0)) { - if (THUMB32_RT(instr1, instr2) == REG_PC) { - return FASTTRAP_T_INV; - } - } - - if (THUMB32_RM(instr1, instr2) != REG_PC && THUMB32_RD(instr1, instr2) != REG_PC && THUMB32_RN(instr1, instr2) != REG_PC) { - return FASTTRAP_T_COMMON; - } - - return FASTTRAP_T_INV; -} - -static -int -thumb32_misc(uint16_t instr1, uint16_t instr2) -{ - if (THUMB32_RM(instr1, instr2) != REG_PC && THUMB32_RD(instr1, instr2) != REG_PC && THUMB32_RN(instr1, instr2) != REG_PC) { - return FASTTRAP_T_COMMON; - } - - return FASTTRAP_T_INV; -} - -static -int -thumb32_parallel_addsub_unsigned(uint16_t instr1, uint16_t instr2) -{ - if (THUMB32_RM(instr1, instr2) != REG_PC && THUMB32_RD(instr1, instr2) != REG_PC && THUMB32_RN(instr1, instr2) != REG_PC) { - return FASTTRAP_T_COMMON; - } - - return FASTTRAP_T_INV; -} - -static -int -thumb32_parallel_addsub_signed(uint16_t instr1, uint16_t instr2) -{ - if (THUMB32_RM(instr1, instr2) != REG_PC && THUMB32_RD(instr1, instr2) != REG_PC && THUMB32_RN(instr1, instr2) != REG_PC) { - return FASTTRAP_T_COMMON; - } - - return FASTTRAP_T_INV; -} - -static -int -thumb32_dataproc_reg(uint16_t instr1, uint16_t instr2) -{ - int op1 = BITS(instr1, 4, 0xF), op2 = BITS(instr2, 4, 0xF); - - if (((0 <= op1) && (op1 <= 5)) && (op2 & 0x8) == 0x8) { - if (THUMB32_RM(instr1, instr2) != REG_PC && THUMB32_RD(instr1, instr2) != REG_PC) { - return FASTTRAP_T_COMMON; - } - } - - if ((op1 & 0x8) == 0 && op2 == 0) { - if (THUMB32_RM(instr1, instr2) != REG_PC && THUMB32_RD(instr1, instr2) != REG_PC && THUMB32_RN(instr1, instr2) != REG_PC) { - return FASTTRAP_T_COMMON; - } - } - - if ((op1 & 0x8) == 0x8 && (op2 & 0xC) == 0) { - return thumb32_parallel_addsub_signed(instr1, instr2); - } - - if ((op1 & 0x8) == 0x8 && (op2 & 0xC) == 0x4) { - return thumb32_parallel_addsub_unsigned(instr1, instr2); - } - - if ((op1 & 0xC) == 0x8 && (op2 & 0xC) == 0x8) { - return thumb32_misc(instr1, instr2); - } - - return FASTTRAP_T_INV; -} - -static -int -thumb32_dataproc_regshift(uint16_t instr1, uint16_t instr2) -{ - int op = BITS(instr1, 5, 0xF), S = BITS(instr1, 4, 0x1); - - if (op == 0 || op == 0x4 || op == 0x8 || op == 0xD) { - /* These become test instructions if S is 1 and Rd is PC, otherwise they are data instructions. */ - if (S == 1) { - if (THUMB32_RM(instr1, instr2) != REG_PC && THUMB32_RN(instr1, instr2) != REG_PC) { - return FASTTRAP_T_COMMON; - } - } else { - if (THUMB32_RM(instr1, instr2) != REG_PC && THUMB32_RD(instr1, instr2) != REG_PC && - THUMB32_RN(instr1, instr2) != REG_PC) { - return FASTTRAP_T_COMMON; - } - } - } else if (op == 0x2 || op == 0x3) { - /* These become moves if RN is PC, otherwise they are data insts. We don't instrument mov pc, reg here */ - if (THUMB32_RM(instr1, instr2) != REG_PC && THUMB32_RD(instr1, instr2) != REG_PC) { - return FASTTRAP_T_COMMON; - } - } else { - /* Normal three register instruction */ - if (THUMB32_RM(instr1, instr2) != REG_PC && THUMB32_RD(instr1, instr2) != REG_PC && THUMB32_RN(instr1, instr2) != REG_PC) { - return FASTTRAP_T_COMMON; - } - } - - return FASTTRAP_T_INV; -} - -static -int -thumb32_store_single(uint16_t instr1, uint16_t instr2) -{ - int op1 = BITS(instr1, 5, 0x7), op2 = BITS(instr2, 6, 0x3F); - - /* Do not support any use of the pc yet */ - if ((op1 == 0 || op1 == 1 || op1 == 2) && (op2 & 0x20) == 0) { - /* str (register) uses RM */ - if (THUMB32_RM(instr1, instr2) == REG_PC) { - return FASTTRAP_T_INV; - } - } - - if (THUMB32_RT(instr1, instr2) != REG_PC && THUMB32_RN(instr1, instr2) != REG_PC) { - return FASTTRAP_T_COMMON; - } - - return FASTTRAP_T_INV; -} - -static -int -thumb32_loadbyte_memhint(uint16_t instr1, uint16_t instr2) -{ - int op1 = BITS(instr1, 7, 0x3), __unused op2 = BITS(instr2, 6, 0x3F); - - /* Do not support any use of the pc yet */ - if ((op1 == 0 || op1 == 0x2) && THUMB32_RM(instr1, instr2) == REG_PC) { - return FASTTRAP_T_INV; - } - - if (THUMB32_RT(instr1, instr2) != REG_PC && THUMB32_RN(instr1, instr2) != REG_PC) { - return FASTTRAP_T_COMMON; - } - - return FASTTRAP_T_INV; -} - -static -int -thumb32_loadhalfword_memhint(uint16_t instr1, uint16_t instr2) -{ - int op1 = BITS(instr1, 7, 0x3), op2 = BITS(instr2, 6, 0x3F); - - /* Do not support any use of the PC yet */ - if (op1 == 0 && op2 == 0 && THUMB32_RM(inst1, instr2) == REG_PC) { - return FASTTRAP_T_INV; - } - - if (THUMB32_RT(instr1, instr2) != REG_PC && THUMB32_RN(instr1, instr2) != REG_PC) { - return FASTTRAP_T_COMMON; - } - - return FASTTRAP_T_INV; -} - -static -int -thumb32_loadword(uint16_t instr1, uint16_t instr2) -{ - int op1 = BITS(instr1, 7, 0x3), op2 = BITS(instr2, 6, 0x3F); - - if ((op1 & 0x2) == 0 && THUMB32_RN(instr1, instr2) == REG_PC && THUMB32_RT(instr1, instr2) != REG_PC) { - return FASTTRAP_T_LDR_PC_IMMED; - } - - if (op1 == 0 && op2 == 0) { - /* ldr (register) uses an additional reg */ - if (THUMB32_RM(instr1, instr2) == REG_PC) { - return FASTTRAP_T_INV; - } - } - - if (THUMB32_RT(instr1, instr2) != REG_PC && THUMB32_RN(instr1, instr2) != REG_PC) { - return FASTTRAP_T_COMMON; - } - - return FASTTRAP_T_INV; -} - -static -int -thumb32_loadstore_double_exclusive_table(__unused uint16_t instr1, __unused uint16_t instr2) -{ - /* Don't instrument any of these */ - - return FASTTRAP_T_INV; -} - -static -int -thumb32_loadstore_multiple(uint16_t instr1, uint16_t instr2) -{ - int op = BITS(instr1, 7, 0x3), L = BITS(instr1, 4, 0x1), uses_pc = BITS(instr2, 15, 0x1), uses_lr = BITS(instr2, 14, 0x1); - - if (op == 0 || op == 0x3) { - /* Privileged instructions: srs, rfe */ - return FASTTRAP_T_INV; - } - - /* Only emulate a use of the pc if it's a return from function: ldmia sp!, { ... pc }, aka pop { ... pc } */ - if (op == 0x1 && L == 1 && THUMB32_RN(instr1, instr2) == REG_SP && uses_pc == 1) { - return FASTTRAP_T_LDM_PC; - } - - /* stmia sp!, { ... lr }, aka push { ... lr } doesn't touch the pc, but it is very common, so special case it */ - if (op == 0x2 && L == 0 && THUMB32_RN(instr1, instr2) == REG_SP && uses_lr == 1) { - return FASTTRAP_T_STM_LR; - } - - if (THUMB32_RN(instr1, instr2) != REG_PC && uses_pc == 0) { - return FASTTRAP_T_COMMON; - } - - return FASTTRAP_T_INV; -} - -static -int -thumb32_misc_control(__unused uint16_t instr1, __unused uint16_t instr2) -{ - /* Privileged, and instructions dealing with ThumbEE */ - return FASTTRAP_T_INV; -} - -static -int -thumb32_cps_hints(__unused uint16_t instr1, __unused uint16_t instr2) -{ - /* Privileged */ - return FASTTRAP_T_INV; -} - -static -int -thumb32_b_misc_control(uint16_t instr1, uint16_t instr2) -{ - int op = BITS(instr1, 4, 0x7F), op1 = BITS(instr2, 12, 0x7), __unused op2 = BITS(instr2, 8, 0xF); - - if ((op1 & 0x5) == 0) { - if ((op & 0x38) != 0x38) { - return FASTTRAP_T_B_COND; - } - - if (op == 0x3A) { - return thumb32_cps_hints(instr1, instr2); - } - - if (op == 0x3B) { - return thumb32_misc_control(instr1, instr2); - } - } - - if ((op1 & 0x5) == 1) { - return FASTTRAP_T_B_UNCOND; - } - - return FASTTRAP_T_INV; -} - -static -int -thumb32_dataproc_plain_immed(uint16_t instr1, uint16_t instr2) -{ - int op = BITS(instr1, 4, 0x1F); - - if (op == 0x04 || op == 0x0C || op == 0x16) { - /* mov, movt, bfi, bfc */ - /* These use only RD */ - if (THUMB32_RD(instr1, instr2) != REG_PC) { - return FASTTRAP_T_COMMON; - } - } else { - if (THUMB32_RD(instr1, instr2) != REG_PC && THUMB32_RN(instr1, instr2) != REG_PC) { - return FASTTRAP_T_COMMON; - } - } - - return FASTTRAP_T_INV; -} - -static -int -thumb32_dataproc_mod_immed(uint16_t instr1, uint16_t instr2) -{ - int op = BITS(instr1, 5, 0xF), S = BITS(instr1, 4, 0x1); - - if (op == 0x2 || op == 0x3) { - /* These allow REG_PC in RN, but it doesn't mean use the PC! */ - if (THUMB32_RD(instr1, instr2) != REG_PC) { - return FASTTRAP_T_COMMON; - } - } - - if (op == 0 || op == 0x4 || op == 0x8 || op == 0xD) { - /* These are test instructions, if the sign bit is set and RD is the PC. */ - if (S && THUMB32_RD(instr1, instr2) == REG_PC) { - return FASTTRAP_T_COMMON; - } - } - - if (THUMB32_RD(instr1, instr2) != REG_PC && THUMB32_RN(instr1, instr2) != REG_PC) { - return FASTTRAP_T_COMMON; - } - - return FASTTRAP_T_INV; -} - -static -int -dtrace_decode_thumb32(uint16_t instr1, uint16_t instr2) -{ - int op1 = BITS(instr1, 11, 0x3), op2 = BITS(instr1, 4, 0x7F), op = BITS(instr2, 15, 0x1); - - if (op1 == 0x1) { - if ((op2 & 0x64) == 0) { - return thumb32_loadstore_multiple(instr1, instr2); - } - - if ((op2 & 0x64) == 0x04) { - return thumb32_loadstore_double_exclusive_table(instr1, instr2); - } - - if ((op2 & 0x60) == 0x20) { - return thumb32_dataproc_regshift(instr1, instr2); - } - - if ((op2 & 0x40) == 0x40) { - return thumb32_coproc(instr1, instr2); - } - } - - if (op1 == 0x2) { - if ((op2 & 0x20) == 0 && op == 0) { - return thumb32_dataproc_mod_immed(instr1, instr2); - } - - if ((op2 & 0x20) == 0x20 && op == 0) { - return thumb32_dataproc_plain_immed(instr1, instr2); - } - - if (op == 1) { - return thumb32_b_misc_control(instr1, instr2); - } - } - - if (op1 == 0x3) { - if ((op2 & 0x71) == 0) { - return thumb32_store_single(instr1, instr2); - } - - if ((op2 & 0x71) == 0x10) { - return vfp_struct_loadstore(thumb32_instword_to_arm(instr1, instr2)); - } - - if ((op2 & 0x67) == 0x01) { - return thumb32_loadbyte_memhint(instr1, instr2); - } - - if ((op2 & 0x67) == 0x03) { - return thumb32_loadhalfword_memhint(instr1, instr2); - } - - if ((op2 & 0x67) == 0x05) { - return thumb32_loadword(instr1, instr2); - } - - if ((op2 & 0x67) == 0x07) { - /* Undefined instruction */ - return FASTTRAP_T_INV; - } - - if ((op2 & 0x70) == 0x20) { - return thumb32_dataproc_reg(instr1, instr2); - } - - if ((op2 & 0x78) == 0x30) { - return thumb32_multiply(instr1, instr2); - } - - if ((op2 & 0x78) == 0x38) { - return thumb32_longmultiply(instr1, instr2); - } - - if ((op2 & 0x40) == 0x40) { - return thumb32_coproc(instr1, instr2); - } - } - - return FASTTRAP_T_INV; -} - -int -dtrace_decode_thumb(uint32_t instr) -{ - uint16_t* pInstr = (uint16_t*) &instr; - uint16_t hw1 = pInstr[0], hw2 = pInstr[1]; - - int size = BITS(hw1, 11, 0x1F); +#include - if (size == 0x1D || size == 0x1E || size == 0x1F) { - return dtrace_decode_thumb32(hw1, hw2); - } else { - return dtrace_decode_thumb16(hw1); - } -} +int dtrace_decode_arm64(uint32_t instr); struct arm64_decode_entry { uint32_t mask; @@ -1311,8 +43,6 @@ struct arm64_decode_entry arm64_decode_table[] = { #define NUM_DECODE_ENTRIES (sizeof(arm64_decode_table) / sizeof(struct arm64_decode_entry)) - - int dtrace_decode_arm64(uint32_t instr) { diff --git a/bsd/dev/arm64/dtrace_isa.c b/bsd/dev/arm64/dtrace_isa.c index 6a9296fb0..5714f7971 100644 --- a/bsd/dev/arm64/dtrace_isa.c +++ b/bsd/dev/arm64/dtrace_isa.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2005-2008 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2005-2018 Apple Computer, Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -26,20 +26,13 @@ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ -#define MACH__POSIX_C_SOURCE_PRIVATE 1 /* pulls in suitable savearea from - * mach/ppc/thread_status.h */ #include -#include - #include -#include #if __has_include() #include #endif #include -#include -#include #include #include #include @@ -47,13 +40,11 @@ #include #include #include -#include +#include #include #include /* for thread_wakeup() */ #include #include -#include -#include extern struct arm_saved_state *find_kern_regs(thread_t); @@ -130,7 +121,7 @@ xcRemote(void *foo) (pArg->f)(pArg->arg); } - if (hw_atomic_sub(&dt_xc_sync, 1) == 0) { + if (os_atomic_dec(&dt_xc_sync, relaxed) == 0) { thread_wakeup((event_t) &dt_xc_sync); } } @@ -180,12 +171,6 @@ dtrace_isa_init(void) /** * Register definitions */ -#define ARM_FP 7 -#define ARM_SP 13 -#define ARM_LR 14 -#define ARM_PC 15 -#define ARM_CPSR 16 - #define ARM64_FP 29 #define ARM64_LR 30 #define ARM64_SP 31 @@ -205,27 +190,6 @@ dtrace_getreg(struct regs * savearea, uint_t reg) return 0; } - if (is_saved_state32(regs)) { - // Fix special registers if user is 32 bits - switch (reg) { - case ARM64_FP: - reg = ARM_FP; - break; - case ARM64_SP: - reg = ARM_SP; - break; - case ARM64_LR: - reg = ARM_LR; - break; - case ARM64_PC: - reg = ARM_PC; - break; - case ARM64_CPSR: - reg = ARM_CPSR; - break; - } - } - if (!check_saved_state_reglimit(regs, reg)) { DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP); return 0; @@ -234,7 +198,6 @@ dtrace_getreg(struct regs * savearea, uint_t reg) return (uint64_t)get_saved_state_reg(regs, reg); } -#define RETURN_OFFSET 4 #define RETURN_OFFSET64 8 static int @@ -242,7 +205,6 @@ dtrace_getustack_common(uint64_t * pcstack, int pcstack_limit, user_addr_t pc, user_addr_t sp) { int ret = 0; - boolean_t is64bit = proc_is64bit_data(current_proc()); ASSERT(pcstack == NULL || pcstack_limit > 0); @@ -260,13 +222,8 @@ dtrace_getustack_common(uint64_t * pcstack, int pcstack_limit, user_addr_t pc, break; } - if (is64bit) { - pc = dtrace_fuword64((sp + RETURN_OFFSET64)); - sp = dtrace_fuword64(sp); - } else { - pc = dtrace_fuword32((sp + RETURN_OFFSET)); - sp = dtrace_fuword32(sp); - } + pc = dtrace_fuword64((sp + RETURN_OFFSET64)); + sp = dtrace_fuword64(sp); } return ret; @@ -387,10 +344,6 @@ dtrace_getufpstack(uint64_t * pcstack, uint64_t * fpstack, int pcstack_limit) user_addr_t pc, sp; volatile uint16_t *flags = (volatile uint16_t *) &cpu_core[CPU->cpu_id].cpuc_dtrace_flags; -#if 0 - uintptr_t oldcontext; - size_t s1, s2; -#endif if (*flags & CPU_DTRACE_FAULT) { return; @@ -478,13 +431,8 @@ dtrace_getufpstack(uint64_t * pcstack, uint64_t * fpstack, int pcstack_limit) } else #endif { - if (is64bit) { - pc = dtrace_fuword64((sp + RETURN_OFFSET64)); - sp = dtrace_fuword64(sp); - } else { - pc = dtrace_fuword32((sp + RETURN_OFFSET)); - sp = dtrace_fuword32(sp); - } + pc = dtrace_fuword64((sp + RETURN_OFFSET64)); + sp = dtrace_fuword64(sp); } #if 0 @@ -606,28 +554,6 @@ dtrace_getpcstack(pc_t * pcstack, int pcstack_limit, int aframes, } } -/* - * On arm64, we support both 32bit and 64bit user processes. - * This routine is only called when handling 32bit processes - * where thumb_mode is pertinent. - * If this routine is called when handling 64bit processes - * thumb_mode should always be zero. - */ -int -dtrace_instr_size(uint32_t instr, int thumb_mode) -{ - if (thumb_mode) { - uint16_t instr16 = *(uint16_t*) &instr; - if (((instr16 >> 11) & 0x1F) > 0x1C) { - return 4; - } else { - return 2; - } - } else { - return 4; - } -} - uint64_t dtrace_getarg(int arg, int aframes, dtrace_mstate_t *mstate, dtrace_vstate_t *vstate) { diff --git a/bsd/dev/arm64/dtrace_subr_arm.c b/bsd/dev/arm64/dtrace_subr_arm.c index 584317298..c5c427d62 100644 --- a/bsd/dev/arm64/dtrace_subr_arm.c +++ b/bsd/dev/arm64/dtrace_subr_arm.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2007 Apple Inc. All rights reserved. + * Copyright (c) 2007-2018 Apple Inc. All rights reserved. */ /* * CDDL HEADER START @@ -27,10 +27,6 @@ * Use is subject to license terms. */ -/* - * #pragma ident "@(#)dtrace_subr.c 1.12 05/06/08 SMI" - */ - #include #include #include @@ -65,27 +61,11 @@ dtrace_user_probe(arm_saved_state_t *regs) kauth_cred_uthread_update(uthread, p); - if (is_saved_state32(regs)) { - if (saved_state32(regs)->cpsr & PSR_TF) { - uint16_t pc; - if (copyin((user_addr_t)saved_state32(regs)->pc, &pc, sizeof(uint16_t))) { - return KERN_FAILURE; - } - is_fasttrap = (pc == FASTTRAP_THUMB32_RET_INSTR); - } else { - uint32_t pc; - if (copyin((user_addr_t)saved_state32(regs)->pc, &pc, sizeof(uint32_t))) { - return KERN_FAILURE; - } - is_fasttrap = (pc == FASTTRAP_ARM32_RET_INSTR); - } - } else { - uint32_t pc; - if (copyin((user_addr_t)saved_state64(regs)->pc, &pc, sizeof(uint32_t))) { - return KERN_FAILURE; - } - is_fasttrap = (pc == FASTTRAP_ARM64_RET_INSTR); + uint32_t pc; + if (copyin((user_addr_t)saved_state64(regs)->pc, &pc, sizeof(uint32_t))) { + return KERN_FAILURE; } + is_fasttrap = (pc == FASTTRAP_ARM64_RET_INSTR); if (is_fasttrap) { uint8_t step = uthread->t_dtrace_step; @@ -183,38 +163,11 @@ dtrace_user_probe(arm_saved_state_t *regs) * * Note that the PC points to the instruction that caused the fault. */ - if (is_saved_state32(regs)) { - if (saved_state32(regs)->cpsr & PSR_TF) { - uint16_t instr; - if (fuword16(saved_state32(regs)->pc, &instr) == 0 && instr != FASTTRAP_THUMB32_INSTR) { - return KERN_SUCCESS; - } - } else { - uint32_t instr; - if (fuword32(saved_state32(regs)->pc, &instr) == 0 && instr != FASTTRAP_ARM32_INSTR) { - return KERN_SUCCESS; - } - } - } else { - uint32_t instr; - if (fuword32(saved_state64(regs)->pc, &instr) == 0 && instr != FASTTRAP_ARM64_INSTR) { - return KERN_SUCCESS; - } + uint32_t instr; + if (fuword32(saved_state64(regs)->pc, &instr) == 0 && instr != FASTTRAP_ARM64_INSTR) { + return KERN_SUCCESS; } } return KERN_FAILURE; } - -void -dtrace_safe_synchronous_signal(void) -{ - /* Not implemented */ -} - -int -dtrace_safe_defer_signal(void) -{ - /* Not implemented */ - return 0; -} diff --git a/bsd/dev/arm64/fasttrap_isa.c b/bsd/dev/arm64/fasttrap_isa.c index 50f980f2c..b547aa992 100644 --- a/bsd/dev/arm64/fasttrap_isa.c +++ b/bsd/dev/arm64/fasttrap_isa.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2007 Apple Inc. All rights reserved. + * Copyright (c) 2007-2018 Apple Inc. All rights reserved. */ /* * CDDL HEADER START @@ -27,198 +27,33 @@ * Use is subject to license terms. */ -/* - * #pragma ident "@(#)fasttrap_isa.c 1.19 05/09/14 SMI" - */ - -#ifdef KERNEL -#ifndef _KERNEL -#define _KERNEL /* Solaris vs. Darwin */ -#endif -#endif #include #include #include #include #include -#include -#include -#include -#include #include -#include #include -#include - -#include #if __has_include() #include #endif - extern dtrace_id_t dtrace_probeid_error; /* Solaris proc_t is the struct. Darwin's proc_t is a pointer to it. */ #define proc_t struct proc /* Steer clear of the Darwin typedef for proc_t */ extern int dtrace_decode_arm64(uint32_t instr); -extern int dtrace_decode_arm(uint32_t instr); -extern int dtrace_decode_thumb(uint32_t instr); - -/* - * Lossless User-Land Tracing on ARM - * --------------------------------- - * - * The details here will be fleshed out as more of this is implemented. The - * basic design will be the same way as tracing works in x86. - * - * Some ARM specific issues: - * - * We need to patch differently for ARM instructions and Thumb instructions. - * When we hit a probe, we check to see if the mode we're currently in is the - * same as the mode we're patching for. If not, we remove the tracepoint and - * abort. This ARM/Thumb information is pulled in from the arch specific - * information in the fasttrap probe. - * - * On ARM, any instruction that uses registers can also use the pc as a - * register. This presents problems during emulation because we have copied - * the instruction and thus the pc can be different. Currently we've emulated - * any instructions that use the pc if they can be used in a return probe. - * Eventually we will want to support all instructions that use the pc, but - * to do so requires disassembling the instruction and reconstituting it by - * substituting a different register. - * - */ - -#define THUMB_INSTR(x) (*(uint16_t*) &(x)) - -#define SIGNEXTEND(x, v) ((((int) (x)) << (32-(v))) >> (32-(v))) -#define ALIGNADDR(x, v) (((x) >> (v)) << (v)) -#define GETITSTATE(x) ((((x) >> 8) & 0xFC) | (((x) >> 25) & 0x3)) -#define ISLASTINIT(x) (((x) & 0xF) == 8) - -#define SET16(x, w) *((uint16_t*) (x)) = (w) -#define SET32(x, w) *((uint32_t*) (x)) = (w) - -#define IS_ARM32_NOP(x) ((x) == 0xE1A00000) -/* Marker for is-enabled probes */ -#define IS_ARM32_IS_ENABLED(x) ((x) == 0xE0200000) #define IS_ARM64_NOP(x) ((x) == 0xD503201F) /* Marker for is-enabled probes */ #define IS_ARM64_IS_ENABLED(x) ((x) == 0xD2800000) -#define IS_THUMB32_NOP(x) ((x) == 0x46C0) -/* Marker for is-enabled probes */ -#define IS_THUMB32_IS_ENABLED(x) ((x) == 0x4040) - -#define ARM_LDM_UF (1 << 23) -#define ARM_LDM_PF (1 << 24) -#define ARM_LDM_WF (1 << 21) - -#define ARM_LDR_UF (1 << 23) -#define ARM_LDR_BF (1 << 22) - -static int fasttrap_tracepoint_init32(proc_t *, fasttrap_tracepoint_t *, user_addr_t, fasttrap_probe_type_t); -static int fasttrap_tracepoint_init64(proc_t *, fasttrap_tracepoint_t *, user_addr_t, fasttrap_probe_type_t); - int fasttrap_tracepoint_init(proc_t *p, fasttrap_tracepoint_t *tp, user_addr_t pc, fasttrap_probe_type_t type) -{ - if (proc_is64bit_data(p)) { - return fasttrap_tracepoint_init64(p, tp, pc, type); - } else { - return fasttrap_tracepoint_init32(p, tp, pc, type); - } -} - -static int -fasttrap_tracepoint_init32(proc_t *p, fasttrap_tracepoint_t *tp, - user_addr_t pc, fasttrap_probe_type_t type) -{ -#pragma unused(type) - uint32_t instr; - - /* - * Read the instruction at the given address out of the process's - * address space. We don't have to worry about a debugger - * changing this instruction before we overwrite it with our trap - * instruction since P_PR_LOCK is set. Since instructions can span - * pages, we potentially read the instruction in two parts. If the - * second part fails, we just zero out that part of the instruction. - */ - /* - * APPLE NOTE: Of course, we do not have a P_PR_LOCK, so this is racey... - */ - - if (uread(p, &instr, 4, pc) != 0) { - return -1; - } - - /* We want &instr to always point to the saved instruction, so just copy the - * whole thing When cast to a pointer to a uint16_t, that will give us a - * pointer to the first two bytes, which is the thumb instruction. - */ - tp->ftt_instr = instr; - - if (tp->ftt_fntype != FASTTRAP_FN_DONE_INIT) { - switch (tp->ftt_fntype) { - case FASTTRAP_FN_UNKNOWN: - /* Can't instrument without any information. We can add some heuristics later if necessary. */ - return -1; - - case FASTTRAP_FN_USDT: - if (IS_ARM32_NOP(instr) || IS_ARM32_IS_ENABLED(instr)) { - tp->ftt_thumb = 0; - } else if (IS_THUMB32_NOP(THUMB_INSTR(instr)) || IS_THUMB32_IS_ENABLED(THUMB_INSTR(instr))) { - tp->ftt_thumb = 1; - } else { - /* Shouldn't reach here - this means we don't recognize - * the instruction at one of the USDT probe locations - */ - return -1; - } - tp->ftt_fntype = FASTTRAP_FN_DONE_INIT; - break; - - case FASTTRAP_FN_ARM: - tp->ftt_thumb = 0; - tp->ftt_fntype = FASTTRAP_FN_DONE_INIT; - break; - - case FASTTRAP_FN_THUMB: - tp->ftt_thumb = 1; - tp->ftt_fntype = FASTTRAP_FN_DONE_INIT; - break; - - default: - return -1; - } - } - - if (tp->ftt_thumb) { - tp->ftt_type = dtrace_decode_thumb(instr); - } else { - tp->ftt_type = dtrace_decode_arm(instr); - } - - if (tp->ftt_type == FASTTRAP_T_INV) { - /* This is an instruction we either don't recognize or can't instrument */ - printf("dtrace: fasttrap init32: Unrecognized instruction: %08x at %08llx\n", - (tp->ftt_thumb && dtrace_instr_size(tp->ftt_instr, tp->ftt_thumb) == 2) ? tp->ftt_instr1 : instr, pc); - return -1; - } - - return 0; -} - - -static int -fasttrap_tracepoint_init64(proc_t *p, fasttrap_tracepoint_t *tp, - user_addr_t pc, fasttrap_probe_type_t type) { #pragma unused(type) uint32_t instr = 0; @@ -240,7 +75,6 @@ fasttrap_tracepoint_init64(proc_t *p, fasttrap_tracepoint_t *tp, } tp->ftt_instr = instr; - tp->ftt_thumb = 0; /* Always zero on 64bit */ if (tp->ftt_fntype != FASTTRAP_FN_DONE_INIT) { switch (tp->ftt_fntype) { @@ -299,7 +133,6 @@ fasttrap_tracepoint_init64(proc_t *p, fasttrap_tracepoint_t *tp, int fasttrap_tracepoint_install(proc_t *p, fasttrap_tracepoint_t *tp) { - /* The thumb patch is a 2 byte instruction regardless of the size of the original instruction */ uint32_t instr; int size; @@ -307,12 +140,7 @@ fasttrap_tracepoint_install(proc_t *p, fasttrap_tracepoint_t *tp) size = 4; instr = FASTTRAP_ARM64_INSTR; } else { - size = tp->ftt_thumb ? 2 : 4; - if (tp->ftt_thumb) { - *((uint16_t*) &instr) = FASTTRAP_THUMB32_INSTR; - } else { - instr = FASTTRAP_ARM32_INSTR; - } + return -1; } if (uwrite(p, &instr, size, tp->ftt_pc) != 0) { @@ -327,16 +155,14 @@ fasttrap_tracepoint_install(proc_t *p, fasttrap_tracepoint_t *tp) int fasttrap_tracepoint_remove(proc_t *p, fasttrap_tracepoint_t *tp) { - /* The thumb patch is a 2 byte instruction regardless of the size of the original instruction */ uint32_t instr; - int size; + int size = 4; if (proc_is64bit_data(p)) { /* * Distinguish between read or write failures and a changed * instruction. */ - size = 4; if (uread(p, &instr, size, tp->ftt_pc) != 0) { goto end; } @@ -345,24 +171,7 @@ fasttrap_tracepoint_remove(proc_t *p, fasttrap_tracepoint_t *tp) goto end; } } else { - /* - * Distinguish between read or write failures and a changed - * instruction. - */ - size = tp->ftt_thumb ? 2 : 4; - if (uread(p, &instr, size, tp->ftt_pc) != 0) { - goto end; - } - - if (tp->ftt_thumb) { - if (*((uint16_t*) &instr) != FASTTRAP_THUMB32_INSTR) { - goto end; - } - } else { - if (instr != FASTTRAP_ARM32_INSTR) { - goto end; - } - } + return -1; } if (uwrite(p, &tp->ftt_instr, size, tp->ftt_pc) != 0) { @@ -407,27 +216,14 @@ fasttrap_return_common(proc_t *p, arm_saved_state_t *regs, user_addr_t pc, user_ for (id = tp->ftt_retids; id != NULL; id = id->fti_next) { fasttrap_probe_t *probe = id->fti_probe; - /* - * If there's a branch that could act as a return site, we - * need to trace it, and check here if the program counter is - * external to the function. - */ - if (is_saved_state32(regs)) { - if (tp->ftt_type != FASTTRAP_T_LDM_PC && - tp->ftt_type != FASTTRAP_T_POP_PC && - new_pc - probe->ftp_faddr < probe->ftp_fsize) { - continue; - } - } else { - /* ARM64_TODO - check for FASTTRAP_T_RET */ - if ((tp->ftt_type != FASTTRAP_T_ARM64_RET || tp->ftt_type != FASTTRAP_T_ARM64_RETAB) && - new_pc - probe->ftp_faddr < probe->ftp_fsize) { - continue; - } + /* ARM64_TODO - check for FASTTRAP_T_RET */ + if ((tp->ftt_type != FASTTRAP_T_ARM64_RET || tp->ftt_type != FASTTRAP_T_ARM64_RETAB) && + new_pc - probe->ftp_faddr < probe->ftp_fsize) { + continue; } if (probe->ftp_prov->ftp_provider_type == DTFTP_PROVIDER_ONESHOT) { - uint8_t already_triggered = atomic_or_8(&probe->ftp_triggered, 1); - if (already_triggered) { + if (os_atomic_xchg(&probe->ftp_triggered, 1, relaxed)) { + /* already triggered */ continue; } } @@ -448,15 +244,9 @@ fasttrap_return_common(proc_t *p, arm_saved_state_t *regs, user_addr_t pc, user_ if (FALSE) { #endif } else { - if (is_saved_state32(regs)) { - dtrace_probe(probe->ftp_id, - pc - id->fti_probe->ftp_faddr, - saved_state32(regs)->r[0], 0, 0, 0); - } else { - dtrace_probe(probe->ftp_id, - pc - id->fti_probe->ftp_faddr, - saved_state64(regs)->x[0], 0, 0, 0); - } + dtrace_probe(probe->ftp_id, + pc - id->fti_probe->ftp_faddr, + saved_state64(regs)->x[0], 0, 0, 0); } } if (retire_tp) { @@ -466,6 +256,9 @@ fasttrap_return_common(proc_t *p, arm_saved_state_t *regs, user_addr_t pc, user_ lck_mtx_unlock(pid_mtx); } +#if DEBUG +__dead2 +#endif static void fasttrap_sigsegv(proc_t *p, uthread_t t, user_addr_t addr, arm_saved_state_t *regs) { @@ -503,31 +296,6 @@ fasttrap_sigsegv(proc_t *p, uthread_t t, user_addr_t addr, arm_saved_state_t *re #endif } -static void -fasttrap_usdt_args32(fasttrap_probe_t *probe, arm_saved_state32_t *regs32, int argc, - uint64_t *argv) -{ - int i, x, cap = MIN(argc, probe->ftp_nargs); - - for (i = 0; i < cap; i++) { - x = probe->ftp_argmap[i]; - - /* Up to 4 args are passed in registers on arm */ - if (x < 4) { - argv[i] = regs32->r[x]; - } else { - uint32_t arg; - fasttrap_fuword32_noerr(regs32->sp + (x - 4) * sizeof(uint32_t), &arg); - - argv[i] = arg; - } - } - - for (; i < argc; i++) { - argv[i] = 0; - } -} - static void fasttrap_usdt_args64(fasttrap_probe_t *probe, arm_saved_state64_t *regs64, int argc, uint64_t *argv) @@ -581,591 +349,6 @@ condition_true(int cond, int cpsr) return taken; } -static void -set_thumb_flag(arm_saved_state32_t *regs32, user_addr_t pc) -{ - if (pc & 1) { - regs32->cpsr |= PSR_TF; - } else { - regs32->cpsr &= ~PSR_TF; - } -} - -static int -fasttrap_pid_probe_thumb_state_valid(arm_saved_state32_t *state32, fasttrap_tracepoint_t *tp) -{ - uint32_t cpsr = state32->cpsr; - uint32_t itstate = GETITSTATE(cpsr); - - /* If in IT block, make sure it's the last statement in the block */ - if ((itstate != 0) && !ISLASTINIT(itstate)) { - printf("dtrace: fasttrap: Tried to trace instruction %08x at %08x but not at end of IT block\n", - (tp->ftt_thumb && dtrace_instr_size(tp->ftt_instr, tp->ftt_thumb) == 2) ? tp->ftt_instr1 : tp->ftt_instr, state32->pc); - return 0; - } - - if (!(cpsr & PSR_TF)) { - return 0; - } - - return 1; -} - -static int -fasttrap_get_condition_code(arm_saved_state32_t *regs32, fasttrap_tracepoint_t *tp) -{ - /* Default to always execute */ - int condition_code = 0xE; - if (tp->ftt_thumb) { - uint32_t itstate = GETITSTATE(regs32->cpsr); - if (itstate != 0) { - /* In IT block, make sure it's the last statement in the block */ - assert(ISLASTINIT(itstate)); - condition_code = itstate >> 4; - } - } else { - condition_code = ARM_CONDCODE(tp->ftt_instr); - } - - return condition_code; -} - -static void -fasttrap_pid_probe_handle_patched_instr32(arm_saved_state_t *state, fasttrap_tracepoint_t *tp, uthread_t uthread, - proc_t *p, uint_t is_enabled, int *was_simulated) -{ - arm_saved_state32_t *regs32 = saved_state32(state); - uint32_t new_pc = 0; - uint32_t pc = regs32->pc; - int instr_size; - int condition_code; - - *was_simulated = 1; - - /* - * If there's an is-enabled probe connected to this tracepoint it - * means that there was a 'eor r0,r0,r0' - * instruction that was placed there by DTrace when the binary was - * linked. As this probe is, in fact, enabled, we need to stuff 1 - * into R0. Accordingly, we can bypass all the instruction - * emulation logic since we know the inevitable result. It's possible - * that a user could construct a scenario where the 'is-enabled' - * probe was on some other instruction, but that would be a rather - * exotic way to shoot oneself in the foot. - */ - - if (is_enabled) { - regs32->r[0] = 1; - new_pc = regs32->pc + (tp->ftt_thumb ? 2 : 4); - goto done; - } - - /* For USDT probes, bypass all the emulation logic for the nop instruction */ - if ((tp->ftt_thumb && IS_THUMB32_NOP(THUMB_INSTR(tp->ftt_instr))) || - (!tp->ftt_thumb && IS_ARM32_NOP(tp->ftt_instr))) { - new_pc = regs32->pc + (tp->ftt_thumb ? 2 : 4); - goto done; - } - - condition_code = fasttrap_get_condition_code(regs32, tp); - instr_size = dtrace_instr_size(tp->ftt_instr, tp->ftt_thumb); - - switch (tp->ftt_type) { - case FASTTRAP_T_MOV_PC_REG: - case FASTTRAP_T_CPY_PC: - { - if (!condition_true(condition_code, regs32->cpsr)) { - new_pc = pc + instr_size; - break; - } - - int rm; - if (tp->ftt_thumb) { - rm = THUMB16_HRM(tp->ftt_instr1); - } else { - rm = tp->ftt_instr & 0xF; - } - new_pc = regs32->r[rm]; - - /* This instruction does not change the Thumb state */ - - break; - } - - case FASTTRAP_T_STM_LR: - case FASTTRAP_T_PUSH_LR: - { - /* - * This is a very common case, so we want to emulate this instruction if - * possible. However, on a push, it is possible that we might reach the end - * of a page and have to allocate a new page. Most of the time this will not - * happen, and we know that the push instruction can store at most 16 words, - * so check to see if we are far from the boundary, and if so, emulate. This - * can be made more aggressive by checking the actual number of words being - * pushed, but we won't do that for now. - * - * Some of the same issues that apply to POP_PC probably apply here also. - */ - - int reglist; - int ret; - uint32_t base; - - if (!condition_true(condition_code, regs32->cpsr)) { - new_pc = pc + instr_size; - break; - } - - base = regs32->sp; - if (((base - 16 * 4) >> PAGE_SHIFT) != (base >> PAGE_SHIFT)) { - /* Crosses the page boundary, go to emulation */ - goto instr_emulate; - } - - if (tp->ftt_thumb) { - if (instr_size == 4) { - /* We know we have to push lr, never push sp or pc */ - reglist = tp->ftt_instr2 & 0x1FFF; - } else { - reglist = tp->ftt_instr1 & 0xFF; - } - } else { - /* We know we have to push lr, never push sp or pc */ - reglist = tp->ftt_instr & 0x1FFF; - } - - /* Push the link register */ - base -= 4; - ret = fasttrap_suword32(base, regs32->lr); - if (ret == -1) { - fasttrap_sigsegv(p, uthread, (user_addr_t) base, state); - new_pc = regs32->pc; - break; - } - - /* Start pushing from $r12 */ - int regmask = 1 << 12; - int regnum = 12; - - while (regmask) { - if (reglist & regmask) { - base -= 4; - ret = fasttrap_suword32(base, regs32->r[regnum]); - if (ret == -1) { - fasttrap_sigsegv(p, uthread, (user_addr_t) base, state); - new_pc = regs32->pc; - break; - } - } - regmask >>= 1; - regnum--; - } - - regs32->sp = base; - - new_pc = pc + instr_size; - - break; - } - - - case FASTTRAP_T_LDM_PC: - case FASTTRAP_T_POP_PC: - { - /* TODO Two issues that will eventually need to be resolved: - * - * 1. Understand what the hardware does if we have to segfault (data abort) in - * the middle of a load multiple. We currently don't have a working segfault - * handler anyway, and with no swapfile we should never segfault on this load. - * If we do, we'll just kill the process by setting the pc to 0. - * - * 2. The emulation is no longer atomic. We currently only emulate pop for - * function epilogues, and so we should never have a race here because one - * thread should never be trying to manipulate another thread's stack frames. - * That is almost certainly a bug in the program. - * - * This will need to be fixed if we ever: - * a. Ship dtrace externally, as this could be a potential attack vector - * b. Support instruction level tracing, as we might then pop/ldm non epilogues. - * - */ - - /* Assume ldmia! sp/pop ... pc */ - - int regnum = 0, reglist; - int ret; - uint32_t base; - - if (!condition_true(condition_code, regs32->cpsr)) { - new_pc = pc + instr_size; - break; - } - - if (tp->ftt_thumb) { - if (instr_size == 4) { - /* We know we have to load the pc, don't do it twice */ - reglist = tp->ftt_instr2 & 0x7FFF; - } else { - reglist = tp->ftt_instr1 & 0xFF; - } - } else { - /* We know we have to load the pc, don't do it twice */ - reglist = tp->ftt_instr & 0x7FFF; - } - - base = regs32->sp; - while (reglist) { - if (reglist & 1) { - ret = fasttrap_fuword32((user_addr_t)base, ®s32->r[regnum]); - if (ret == -1) { - fasttrap_sigsegv(p, uthread, (user_addr_t) base, state); - new_pc = regs32->pc; - break; - } - base += 4; - } - reglist >>= 1; - regnum++; - } - - ret = fasttrap_fuword32((user_addr_t)base, &new_pc); - if (ret == -1) { - fasttrap_sigsegv(p, uthread, (user_addr_t) base, state); - new_pc = regs32->pc; - break; - } - base += 4; - - regs32->sp = base; - - set_thumb_flag(regs32, new_pc); - - break; - } - - case FASTTRAP_T_CB_N_Z: - { - /* Thumb mode instruction, and not permitted in IT block, so skip the condition code check */ - int rn = tp->ftt_instr1 & 0x7; - int offset = (((tp->ftt_instr1 & 0x00F8) >> 2) | ((tp->ftt_instr1 & 0x0200) >> 3)) + 4; - int nonzero = tp->ftt_instr1 & 0x0800; - if (!nonzero != !(regs32->r[rn] == 0)) { - new_pc = pc + offset; - } else { - new_pc = pc + instr_size; - } - break; - } - - case FASTTRAP_T_B_COND: - { - /* Use the condition code in the instruction and ignore the ITSTATE */ - - int code, offset; - if (tp->ftt_thumb) { - if (instr_size == 4) { - code = (tp->ftt_instr1 >> 6) & 0xF; - if (code == 14 || code == 15) { - panic("fasttrap: Emulation of invalid branch"); - } - int S = (tp->ftt_instr1 >> 10) & 1, - J1 = (tp->ftt_instr2 >> 13) & 1, - J2 = (tp->ftt_instr2 >> 11) & 1; - offset = 4 + SIGNEXTEND( - (S << 20) | (J2 << 19) | (J1 << 18) | - ((tp->ftt_instr1 & 0x003F) << 12) | - ((tp->ftt_instr2 & 0x07FF) << 1), - 21); - } else { - code = (tp->ftt_instr1 >> 8) & 0xF; - if (code == 14 || code == 15) { - panic("fasttrap: Emulation of invalid branch"); - } - offset = 4 + (SIGNEXTEND(tp->ftt_instr1 & 0xFF, 8) << 1); - } - } else { - code = ARM_CONDCODE(tp->ftt_instr); - if (code == 15) { - panic("fasttrap: Emulation of invalid branch"); - } - offset = 8 + (SIGNEXTEND(tp->ftt_instr & 0x00FFFFFF, 24) << 2); - } - - if (condition_true(code, regs32->cpsr)) { - new_pc = pc + offset; - } else { - new_pc = pc + instr_size; - } - - break; - } - - case FASTTRAP_T_B_UNCOND: - { - int offset; - - /* Unconditional branches can only be taken from Thumb mode */ - /* (This is different from an ARM branch with condition code "always") */ - ASSERT(tp->ftt_thumb == 1); - - if (!condition_true(condition_code, regs32->cpsr)) { - new_pc = pc + instr_size; - break; - } - - if (instr_size == 4) { - int S = (tp->ftt_instr1 >> 10) & 1, - J1 = (tp->ftt_instr2 >> 13) & 1, - J2 = (tp->ftt_instr2 >> 11) & 1; - int I1 = (J1 != S) ? 0 : 1, I2 = (J2 != S) ? 0 : 1; - offset = 4 + SIGNEXTEND( - (S << 24) | (I1 << 23) | (I2 << 22) | - ((tp->ftt_instr1 & 0x03FF) << 12) | - ((tp->ftt_instr2 & 0x07FF) << 1), - 25); - } else { - uint32_t instr1 = tp->ftt_instr1; - offset = 4 + (SIGNEXTEND(instr1 & 0x7FF, 11) << 1); - } - - new_pc = pc + offset; - - break; - } - - case FASTTRAP_T_BX_REG: - { - int reg; - - if (!condition_true(condition_code, regs32->cpsr)) { - new_pc = pc + instr_size; - break; - } - - if (tp->ftt_thumb) { - reg = THUMB16_HRM(tp->ftt_instr1); - } else { - reg = ARM_RM(tp->ftt_instr); - } - new_pc = regs32->r[reg]; - set_thumb_flag(regs32, new_pc); - - break; - } - - case FASTTRAP_T_LDR_PC_IMMED: - case FASTTRAP_T_VLDR_PC_IMMED: - /* Handle these instructions by replacing the PC in the instruction with another - * register. They are common, so we'd like to support them, and this way we do so - * without any risk of having to simulate a segfault. - */ - - /* Fall through */ - -instr_emulate: - case FASTTRAP_T_COMMON: - { - user_addr_t addr; - uint8_t scratch[32]; - uint_t i = 0; - fasttrap_instr_t emul_instr; - emul_instr.instr32 = tp->ftt_instr; - int emul_instr_size; - - /* - * Unfortunately sometimes when we emulate the instruction and have to replace the - * PC, there is no longer a thumb mode equivalent. We end up having to run the - * modified instruction in ARM mode. We use this variable to keep track of which - * mode we should emulate in. We still use the original variable to determine - * what mode to return to. - */ - uint8_t emul_thumb = tp->ftt_thumb; - int save_reg = -1; - uint32_t save_val = 0; - - /* - * Dealing with condition codes and emulation: - * We can't just uniformly do a condition code check here because not all instructions - * have condition codes. We currently do not support an instruction by instruction trace, - * so we can assume that either: 1. We are executing a Thumb instruction, in which case - * we either are not in an IT block and should execute always, or we are last in an IT - * block. Either way, the traced instruction will run correctly, and we won't have any - * problems when we return to the original code, because we will no longer be in the IT - * block. 2. We are executing an ARM instruction, in which case we are ok as long as - * we don't attempt to change the condition code. - */ - if (tp->ftt_type == FASTTRAP_T_LDR_PC_IMMED) { - /* We know we always have a free register (the one we plan to write the - * result value to!). So we'll replace the pc with that one. - */ - int new_reg; - if (tp->ftt_thumb) { - /* Check to see if thumb or thumb2 */ - if (instr_size == 2) { - /* - * Sadness. We need to emulate this instruction in ARM mode - * because it has an 8 bit immediate offset. Instead of having - * to deal with condition codes in the ARM instruction, we'll - * just check the condition and abort if the condition is false. - */ - if (!condition_true(condition_code, regs32->cpsr)) { - new_pc = pc + instr_size; - break; - } - - new_reg = (tp->ftt_instr1 >> 8) & 0x7; - regs32->r[new_reg] = ALIGNADDR(regs32->pc + 4, 2); - emul_thumb = 0; - emul_instr.instr32 = 0xE5900000 | (new_reg << 16) | (new_reg << 12) | ((tp->ftt_instr1 & 0xFF) << 2); - } else { - /* Thumb2. Just replace the register. */ - new_reg = (tp->ftt_instr2 >> 12) & 0xF; - regs32->r[new_reg] = ALIGNADDR(regs32->pc + 4, 2); - emul_instr.instr16.instr1 &= ~0x000F; - emul_instr.instr16.instr1 |= new_reg; - } - } else { - /* ARM. Just replace the register. */ - new_reg = (tp->ftt_instr >> 12) & 0xF; - regs32->r[new_reg] = ALIGNADDR(regs32->pc + 8, 2); - emul_instr.instr32 &= ~0x000F0000; - emul_instr.instr32 |= new_reg << 16; - } - } else if (tp->ftt_type == FASTTRAP_T_VLDR_PC_IMMED) { - /* This instruction only uses one register, and if we're here, we know - * it must be the pc. So we'll just replace it with R0. - */ - save_reg = 0; - save_val = regs32->r[0]; - regs32->r[save_reg] = ALIGNADDR(regs32->pc + (tp->ftt_thumb ? 4 : 8), 2); - if (tp->ftt_thumb) { - emul_instr.instr16.instr1 &= ~0x000F; - } else { - emul_instr.instr32 &= ~0x000F0000; - } - } - - emul_instr_size = dtrace_instr_size(emul_instr.instr32, emul_thumb); - - /* - * At this point: - * tp->ftt_thumb = thumb mode of original instruction - * emul_thumb = thumb mode for emulation - * emul_instr = instruction we are using to emulate original instruction - * emul_instr_size = size of emulating instruction - */ - - addr = uthread->t_dtrace_scratch->addr; - - if (addr == 0LL) { - fasttrap_sigtrap(p, uthread, pc); // Should be killing target proc - new_pc = pc; - break; - } - - uthread->t_dtrace_scrpc = addr; - if (emul_thumb) { - /* - * No way to do an unconditional branch in Thumb mode, shove the address - * onto the user stack and go to the next location with a pop. This can - * segfault if this push happens to cross a stack page, but that's ok, since - * we are running in userland, and the kernel knows how to handle userland - * stack expansions correctly. - * - * Layout of scratch space for Thumb mode: - * Emulated instruction - * ldr save_reg, [pc, #16] (if necessary, restore any register we clobbered) - * push { r0, r1 } - * ldr r0, [pc, #4] - * str r0, [sp, #4] - * pop { r0, pc } - * Location we should return to in original program - * Saved value of clobbered register (if necessary) - */ - - bcopy(&emul_instr, &scratch[i], emul_instr_size); i += emul_instr_size; - - if (save_reg != -1) { - uint16_t restore_inst = 0x4803; - restore_inst |= (save_reg & 0x7) << 8; - SET16(scratch + i, restore_inst); i += 2; // ldr reg, [pc , #16] - } - - SET16(scratch + i, 0xB403); i += 2; // push { r0, r1 } - SET16(scratch + i, 0x4801); i += 2; // ldr r0, [pc, #4] - SET16(scratch + i, 0x9001); i += 2; // str r0, [sp, #4] - SET16(scratch + i, 0xBD01); i += 2; // pop { r0, pc } - - if (i % 4) { - SET16(scratch + i, 0); i += 2; // padding - saved 32 bit words must be aligned - } - SET32(scratch + i, pc + instr_size + (tp->ftt_thumb ? 1 : 0)); i += 4; // Return address - if (save_reg != -1) { - SET32(scratch + i, save_val); i += 4; // saved value of clobbered register - } - - uthread->t_dtrace_astpc = addr + i; - bcopy(&emul_instr, &scratch[i], emul_instr_size); i += emul_instr_size; - SET16(scratch + i, FASTTRAP_THUMB32_RET_INSTR); i += 2; - } else { - /* - * Layout of scratch space for ARM mode: - * Emulated instruction - * ldr save_reg, [pc, #12] (if necessary, restore any register we clobbered) - * ldr pc, [pc, #4] - * Location we should return to in original program - * Saved value of clobbered register (if necessary) - */ - - bcopy(&emul_instr, &scratch[i], emul_instr_size); i += emul_instr_size; - - if (save_reg != -1) { - uint32_t restore_inst = 0xE59F0004; - restore_inst |= save_reg << 12; - SET32(scratch + i, restore_inst); i += 4; // ldr reg, [pc, #12] - } - SET32(scratch + i, 0xE51FF004); i += 4; // ldr pc, [pc, #4] - - SET32(scratch + i, pc + instr_size + (tp->ftt_thumb ? 1 : 0)); i += 4; // Return address - if (save_reg != -1) { - SET32(scratch + i, save_val); i += 4; // Saved value of clobbered register - } - - uthread->t_dtrace_astpc = addr + i; - bcopy(&emul_instr, &scratch[i], emul_instr_size); i += emul_instr_size; - SET32(scratch + i, FASTTRAP_ARM32_RET_INSTR); i += 4; - } - - if (uwrite(p, scratch, i, uthread->t_dtrace_scratch->write_addr) != KERN_SUCCESS) { - fasttrap_sigtrap(p, uthread, pc); - new_pc = pc; - break; - } - - if (tp->ftt_retids != NULL) { - uthread->t_dtrace_step = 1; - uthread->t_dtrace_ret = 1; - new_pc = uthread->t_dtrace_astpc + (emul_thumb ? 1 : 0); - } else { - new_pc = uthread->t_dtrace_scrpc + (emul_thumb ? 1 : 0); - } - - uthread->t_dtrace_pc = pc; - uthread->t_dtrace_npc = pc + instr_size; - uthread->t_dtrace_on = 1; - *was_simulated = 0; - set_thumb_flag(regs32, new_pc); - break; - } - - default: - panic("fasttrap: mishandled an instruction"); - } -done: - set_saved_state_pc(state, new_pc); - return; -} - /* * Copy out an instruction for execution in userland. * Trap back to kernel to handle return to original flow of execution, because @@ -1408,8 +591,10 @@ fasttrap_pid_probe_handle_patched_instr64(arm_saved_state_t *state, fasttrap_tra res2 = fasttrap_suword64(regs64->sp - 8, regs64->lr); if (res1 != 0 || res2 != 0) { fasttrap_sigsegv(p, uthread, regs64->sp - (res1 ? 16 : 8), state); +#ifndef DEBUG new_pc = regs64->pc; /* Bit of a hack */ break; +#endif } /* Move stack pointer */ @@ -1479,8 +664,10 @@ fasttrap_pid_probe_handle_patched_instr64(arm_saved_state_t *state, fasttrap_tra if (copyin(address, &value, valsize) != 0) { fasttrap_sigsegv(p, uthread, address, state); +#ifndef DEBUG new_pc = regs64->pc; /* Bit of a hack, we know about update in fasttrap_sigsegv() */ break; +#endif } /* Stash in correct register slot */ @@ -1750,7 +937,7 @@ fasttrap_pid_probe(arm_saved_state_t *state) uint64_t pc = get_saved_state_pc(state); - assert(is_64_bit || (pc <= UINT32_MAX)); + assert(is_64_bit); uthread_t uthread = (uthread_t) get_bsdthread_info(current_thread()); @@ -1814,15 +1001,6 @@ fasttrap_pid_probe(arm_saved_state_t *state) return -1; } - /* Validation of THUMB-related state */ - if (tp->ftt_thumb) { - if (!fasttrap_pid_probe_thumb_state_valid(saved_state32(state), tp)) { - fasttrap_tracepoint_remove(p, tp); - lck_mtx_unlock(pid_mtx); - return -1; - } - } - /* Execute the actual probe */ if (tp->ftt_ids != NULL) { fasttrap_id_t *id; @@ -1831,11 +1009,7 @@ fasttrap_pid_probe(arm_saved_state_t *state) if (is_saved_state64(state)) { arg4 = get_saved_state_reg(state, 4); } else { - uint32_t arg; - user_addr_t stack = (user_addr_t)get_saved_state_sp(state); - - fasttrap_fuword32_noerr(stack, &arg); - arg4 = arg; + return -1; } @@ -1853,8 +1027,8 @@ fasttrap_pid_probe(arm_saved_state_t *state) #endif } else { if (probe->ftp_prov->ftp_provider_type == DTFTP_PROVIDER_ONESHOT) { - uint8_t already_triggered = atomic_or_8(&probe->ftp_triggered, 1); - if (already_triggered) { + if (os_atomic_xchg(&probe->ftp_triggered, 1, relaxed)) { + /* already triggered */ continue; } } @@ -1901,11 +1075,7 @@ fasttrap_pid_probe(arm_saved_state_t *state) } else { uint64_t t[5]; - if (is_64_bit) { - fasttrap_usdt_args64(probe, saved_state64(state), 5, t); - } else { - fasttrap_usdt_args32(probe, saved_state32(state), 5, t); - } + fasttrap_usdt_args64(probe, saved_state64(state), 5, t); dtrace_probe(probe->ftp_id, t[0], t[1], t[2], t[3], t[4]); } } @@ -1932,11 +1102,7 @@ fasttrap_pid_probe(arm_saved_state_t *state) * reported at: d, b, a. The new way gives c, b, a, which is closer * to correct, as the return instruction has already exectued. */ - if (is_64_bit) { - fasttrap_pid_probe_handle_patched_instr64(state, tp, uthread, p, is_enabled, &was_simulated); - } else { - fasttrap_pid_probe_handle_patched_instr32(state, tp, uthread, p, is_enabled, &was_simulated); - } + fasttrap_pid_probe_handle_patched_instr64(state, tp, uthread, p, is_enabled, &was_simulated); /* * If there were no return probes when we first found the tracepoint, @@ -2018,44 +1184,24 @@ fasttrap_pid_getarg(void *arg, dtrace_id_t id, void *parg, int argno, #pragma unused(arg, id, parg, aframes) arm_saved_state_t* regs = find_user_regs(current_thread()); - if (is_saved_state32(regs)) { - /* First four arguments are in registers */ - if (argno < 4) { - return saved_state32(regs)->r[argno]; - } - - /* Look on the stack for the rest */ - uint32_t value; - uint32_t* sp = (uint32_t*)(uintptr_t) saved_state32(regs)->sp; - DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT); - value = dtrace_fuword32((user_addr_t) (sp + argno - 4)); - DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT | CPU_DTRACE_BADADDR); - - return value; - } else { - /* First eight arguments are in registers */ - if (argno < 8) { - return saved_state64(regs)->x[argno]; - } + /* First eight arguments are in registers */ + if (argno < 8) { + return saved_state64(regs)->x[argno]; + } - /* Look on the stack for the rest */ - uint64_t value; - uint64_t* sp = (uint64_t*) saved_state64(regs)->sp; - DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT); - value = dtrace_fuword64((user_addr_t) (sp + argno - 8)); - DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT | CPU_DTRACE_BADADDR); + /* Look on the stack for the rest */ + uint64_t value; + uint64_t* sp = (uint64_t*) saved_state64(regs)->sp; + DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT); + value = dtrace_fuword64((user_addr_t) (sp + argno - 8)); + DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT | CPU_DTRACE_BADADDR); - return value; - } + return value; } uint64_t fasttrap_usdt_getarg(void *arg, dtrace_id_t id, void *parg, int argno, int aframes) { #pragma unused(arg, id, parg, argno, aframes) -#if 0 - return fasttrap_anarg(ttolwp(curthread)->lwp_regs, 0, argno); -#endif - return 0; } diff --git a/bsd/dev/arm64/fbt_arm.c b/bsd/dev/arm64/fbt_arm.c index 083f98665..4cff0d3f6 100644 --- a/bsd/dev/arm64/fbt_arm.c +++ b/bsd/dev/arm64/fbt_arm.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2007 Apple Inc. All rights reserved. + * Copyright (c) 2007-2018 Apple Inc. All rights reserved. */ /* * CDDL HEADER START @@ -27,16 +27,6 @@ * Use is subject to license terms. */ -/* #pragma ident "@(#)fbt.c 1.15 05/09/19 SMI" */ - -#ifdef KERNEL -#ifndef _KERNEL -#define _KERNEL /* Solaris vs. Darwin */ -#endif -#endif - -#define MACH__POSIX_C_SOURCE_PRIVATE 1 /* pulls in suitable savearea from - * mach/ppc/thread_status.h */ #include #include #include @@ -231,7 +221,7 @@ fbt_perfCallback( if (FBT_EXCEPTION_CODE == trapno && !IS_USER_TRAP(regs)) { boolean_t oldlevel = 0; machine_inst_t emul = 0; - uint64_t sp, pc, lr, imm; + uint64_t sp, lr, imm; oldlevel = ml_set_interrupts_enabled(FALSE); @@ -259,8 +249,7 @@ fbt_perfCallback( /* * Skip over the patched NOP planted by sdt */ - pc = get_saved_state_pc(regs); - set_saved_state_pc(regs, pc + DTRACE_INVOP_NOP_SKIP); + add_saved_state_pc(regs, DTRACE_INVOP_NOP_SKIP); retval = KERN_SUCCESS; } else if (FBT_IS_ARM64_ADD_FP_SP(emul)) { /* retrieve the value to add */ @@ -278,8 +267,7 @@ fbt_perfCallback( set_saved_state_fp(regs, sp + val); /* skip over the bytes of the patched instruction */ - pc = get_saved_state_pc(regs); - set_saved_state_pc(regs, pc + DTRACE_INVOP_ADD_FP_SP_SKIP); + add_saved_state_pc(regs, DTRACE_INVOP_ADD_FP_SP_SKIP); retval = KERN_SUCCESS; } else if (FBT_IS_ARM64_RET(emul)) { @@ -290,9 +278,8 @@ fbt_perfCallback( set_saved_state_pc(regs, lr); retval = KERN_SUCCESS; } else if (FBT_IS_ARM64_B_INSTR(emul)) { - pc = get_saved_state_pc(regs); imm = FBT_GET_ARM64_B_IMM(emul); - set_saved_state_pc(regs, pc + imm); + add_saved_state_pc(regs, imm); retval = KERN_SUCCESS; } else if (emul == FBT_PATCHVAL) { /* Means we encountered an error but handled it, try same inst again */ diff --git a/bsd/dev/arm64/sdt_arm.c b/bsd/dev/arm64/sdt_arm.c index 598bd05b7..a4b65f887 100644 --- a/bsd/dev/arm64/sdt_arm.c +++ b/bsd/dev/arm64/sdt_arm.c @@ -23,14 +23,6 @@ * Use is subject to license terms. */ -/* #pragma ident "@(#)sdt.c 1.6 06/03/24 SMI" */ - -#ifdef KERNEL -#ifndef _KERNEL -#define _KERNEL /* Solaris vs. Darwin */ -#endif -#endif - #include #include #include diff --git a/bsd/dev/arm64/sysctl.c b/bsd/dev/arm64/sysctl.c index e2715281d..d67aa4a0b 100644 --- a/bsd/dev/arm64/sysctl.c +++ b/bsd/dev/arm64/sysctl.c @@ -24,14 +24,8 @@ static int sysctl_time_since_reset SYSCTL_HANDLER_ARGS { #pragma unused(arg1, arg2, oidp) - int error = 0; - uint64_t return_value = 0; - - return_value = ml_get_time_since_reset(); - - SYSCTL_OUT(req, &return_value, sizeof(return_value)); - - return error; + uint64_t return_value = ml_get_time_since_reset(); + return SYSCTL_OUT(req, &return_value, sizeof(return_value)); } SYSCTL_PROC(_machdep, OID_AUTO, time_since_reset, @@ -43,14 +37,8 @@ static int sysctl_wake_conttime SYSCTL_HANDLER_ARGS { #pragma unused(arg1, arg2, oidp) - int error = 0; - uint64_t return_value = 0; - - return_value = ml_get_conttime_wake_time(); - - SYSCTL_OUT(req, &return_value, sizeof(return_value)); - - return error; + uint64_t return_value = ml_get_conttime_wake_time(); + return SYSCTL_OUT(req, &return_value, sizeof(return_value)); } SYSCTL_PROC(_machdep, OID_AUTO, wake_conttime, @@ -185,9 +173,125 @@ SYSCTL_INT(_machdep, OID_AUTO, lck_mtx_adaptive_spin_mode, CTLFLAG_RW, &lck_mtx_adaptive_spin_mode, 0, "Enable adaptive spin behavior for kernel mutexes"); + #if DEVELOPMENT || DEBUG extern uint64_t TLockTimeOut; SYSCTL_QUAD(_machdep, OID_AUTO, tlto, CTLFLAG_RW | CTLFLAG_LOCKED, &TLockTimeOut, "Ticket spinlock timeout (MATUs): use with care"); + +static int +sysctl_sysreg_vbar_el1 SYSCTL_HANDLER_ARGS +{ +#pragma unused(arg1, arg2, oidp) + uint64_t return_value = __builtin_arm_rsr64("VBAR_EL1"); + return SYSCTL_OUT(req, &return_value, sizeof(return_value)); +} + +/* + * machdep.cpu.sysreg_vbar_el1 + * + * ARM64: Vector Base Address Register. + * Read from the current CPU's system registers. + */ +SYSCTL_PROC(_machdep_cpu, OID_AUTO, sysreg_vbar_el1, + CTLFLAG_RD | CTLTYPE_QUAD | CTLFLAG_LOCKED, + 0, 0, sysctl_sysreg_vbar_el1, "Q", + "VBAR_EL1 register on the current CPU"); + +static int +sysctl_sysreg_mair_el1 SYSCTL_HANDLER_ARGS +{ +#pragma unused(arg1, arg2, oidp) + uint64_t return_value = __builtin_arm_rsr64("MAIR_EL1"); + return SYSCTL_OUT(req, &return_value, sizeof(return_value)); +} + +/* + * machdep.cpu.sysreg_mair_el1 + * + * ARM64: Memory Attribute Indirection Register. + * Read from the current CPU's system registers. + */ +SYSCTL_PROC(_machdep_cpu, OID_AUTO, sysreg_mair_el1, + CTLFLAG_RD | CTLTYPE_QUAD | CTLFLAG_LOCKED, + 0, 0, sysctl_sysreg_mair_el1, "Q", + "MAIR_EL1 register on the current CPU"); + +static int +sysctl_sysreg_ttbr1_el1 SYSCTL_HANDLER_ARGS +{ +#pragma unused(arg1, arg2, oidp) + uint64_t return_value = __builtin_arm_rsr64("TTBR1_EL1"); + return SYSCTL_OUT(req, &return_value, sizeof(return_value)); +} + +/* + * machdep.cpu.sysreg_ttbr1_el1 + * + * ARM64: Translation table base register 1. + * Read from the current CPU's system registers. + */ +SYSCTL_PROC(_machdep_cpu, OID_AUTO, sysreg_ttbr1_el1, + CTLFLAG_RD | CTLTYPE_QUAD | CTLFLAG_LOCKED, + 0, 0, sysctl_sysreg_ttbr1_el1, "Q", + "TTBR1_EL1 register on the current CPU"); + +static int +sysctl_sysreg_sctlr_el1 SYSCTL_HANDLER_ARGS +{ +#pragma unused(arg1, arg2, oidp) + uint64_t return_value = __builtin_arm_rsr64("SCTLR_EL1"); + return SYSCTL_OUT(req, &return_value, sizeof(return_value)); +} + +/* + * machdep.cpu.sysreg_sctlr_el1 + * + * ARM64: System Control Register. + * Read from the current CPU's system registers. + */ +SYSCTL_PROC(_machdep_cpu, OID_AUTO, sysreg_sctlr_el1, + CTLFLAG_RD | CTLTYPE_QUAD | CTLFLAG_LOCKED, + 0, 0, sysctl_sysreg_sctlr_el1, "Q", + "SCTLR_EL1 register on the current CPU"); + +static int +sysctl_sysreg_tcr_el1 SYSCTL_HANDLER_ARGS +{ +#pragma unused(arg1, arg2, oidp) + uint64_t return_value = __builtin_arm_rsr64("TCR_EL1"); + return SYSCTL_OUT(req, &return_value, sizeof(return_value)); +} + +/* + * machdep.cpu.sysreg_tcr_el1 + * + * ARM64: Translation Control Register. + * Read from the current CPU's system registers. + */ +SYSCTL_PROC(_machdep_cpu, OID_AUTO, sysreg_tcr_el1, + CTLFLAG_RD | CTLTYPE_QUAD | CTLFLAG_LOCKED, + 0, 0, sysctl_sysreg_tcr_el1, "Q", + "TCR_EL1 register on the current CPU"); + +static int +sysctl_sysreg_id_aa64mmfr0_el1 SYSCTL_HANDLER_ARGS +{ +#pragma unused(arg1, arg2, oidp) + uint64_t return_value = __builtin_arm_rsr64("ID_AA64MMFR0_EL1"); + return SYSCTL_OUT(req, &return_value, sizeof(return_value)); +} + +/* + * machdep.cpu.sysreg_id_aa64mmfr0_el1 + * + * ARM64: AArch64 Memory Model Feature Register 0. + * Read from the current CPU's system registers. + */ +SYSCTL_PROC(_machdep_cpu, OID_AUTO, sysreg_id_aa64mmfr0_el1, + CTLFLAG_RD | CTLTYPE_QUAD | CTLFLAG_LOCKED, + 0, 0, sysctl_sysreg_id_aa64mmfr0_el1, "Q", + "ID_AA64MMFR0_EL1 register on the current CPU"); + #endif diff --git a/bsd/dev/dtrace/blist.c b/bsd/dev/dtrace/blist.c index 180d30ffb..6d219f95d 100644 --- a/bsd/dev/dtrace/blist.c +++ b/bsd/dev/dtrace/blist.c @@ -62,59 +62,12 @@ * $FreeBSD: src/sys/kern/subr_blist.c,v 1.5.2.1 2000/03/17 10:47:29 ps Exp $ */ -#if !defined(__APPLE__) -#ifdef _KERNEL - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#else - -#ifndef BLIST_NO_DEBUG -#define BLIST_DEBUG -#endif - -#define SWAPBLK_NONE ((daddr_t)-1) - -#include -#include -#include -#include -#include - -#define malloc(a, b, c) malloc(a) -#define free(a, b) free(a) - -typedef unsigned int u_daddr_t; - -#include - -void panic(const char *ctl, ...); - -#endif -#else /* is MacOS X */ -#ifdef KERNEL -#ifndef _KERNEL -#define _KERNEL /* Solaris vs. Darwin */ -#endif -#endif - typedef unsigned int u_daddr_t; #include #include #include #include -/* #include */ #include "blist.h" #include @@ -123,8 +76,6 @@ typedef unsigned int u_daddr_t; #define free _FREE #define M_SWAP M_TEMP -#endif /* __APPLE__ */ - /* * static support functions */ @@ -139,16 +90,6 @@ static void blst_copy(blmeta_t *scan, daddr_t blk, daddr_t radix, daddr_t skip, blist_t dest, daddr_t count); static daddr_t blst_radix_init(blmeta_t *scan, daddr_t radix, int skip, daddr_t count); -#ifndef _KERNEL -static void blst_radix_print(blmeta_t *scan, daddr_t blk, - daddr_t radix, int skip, int tab); -#endif - -#if !defined(__APPLE__) -#ifdef _KERNEL -static MALLOC_DEFINE(M_SWAP, "SWAP", "Swap space"); -#endif -#endif /* __APPLE__ */ /* * blist_create() - create a blist capable of handling up to the specified diff --git a/bsd/dev/dtrace/dtrace.c b/bsd/dev/dtrace/dtrace.c index 6de3c98c4..9cc3b6094 100644 --- a/bsd/dev/dtrace/dtrace.c +++ b/bsd/dev/dtrace/dtrace.c @@ -29,8 +29,6 @@ * Use is subject to license terms. */ -/* #pragma ident "@(#)dtrace.c 1.65 08/07/02 SMI" */ - /* * DTrace - Dynamic Tracing for Solaris * @@ -75,6 +73,7 @@ #include #include #include +#include #include #include #include @@ -105,6 +104,8 @@ #include #endif /* MONOTONIC */ +#include "dtrace_xoroshiro128_plus.h" + #include #include @@ -112,6 +113,7 @@ extern uint32_t pmap_find_phys(void *, uint64_t); extern boolean_t pmap_valid_page(uint32_t); extern void OSKextRegisterKextsWithDTrace(void); extern kmod_info_t g_kernel_kmod_info; +extern void commpage_update_dof(boolean_t enabled); /* Solaris proc_t is the struct. Darwin's proc_t is a pointer to it. */ #define proc_t struct proc /* Steer clear of the Darwin typedef for proc_t */ @@ -182,12 +184,12 @@ dtrace_optval_t dtrace_jstackstrsize_default = 512; dtrace_optval_t dtrace_buflimit_default = 75; dtrace_optval_t dtrace_buflimit_min = 1; dtrace_optval_t dtrace_buflimit_max = 99; +size_t dtrace_nprobes_default = 4; int dtrace_msgdsize_max = 128; hrtime_t dtrace_chill_max = 500 * (NANOSEC / MILLISEC); /* 500 ms */ hrtime_t dtrace_chill_interval = NANOSEC; /* 1000 ms */ int dtrace_devdepth_max = 32; int dtrace_err_verbose; -int dtrace_provide_private_probes = 0; hrtime_t dtrace_deadman_interval = NANOSEC; hrtime_t dtrace_deadman_timeout = (hrtime_t)10 * NANOSEC; hrtime_t dtrace_deadman_user = (hrtime_t)30 * NANOSEC; @@ -855,46 +857,16 @@ SYSCTL_PROC(_kern_dtrace, OID_AUTO, global_maxsize, &dtrace_statvar_maxsize, 0, sysctl_dtrace_statvar_maxsize, "Q", "dtrace statvar maxsize"); -static int -sysctl_dtrace_provide_private_probes SYSCTL_HANDLER_ARGS -{ -#pragma unused(oidp, arg2) - int error; - int value = *(int *) arg1; - - error = sysctl_io_number(req, value, sizeof(value), &value, NULL); - if (error) - return (error); - - if (req->newptr) { - if (value != 0 && value != 1) - return (ERANGE); - - /* - * We do not allow changing this back to zero, as private probes - * would still be left registered - */ - if (value != 1) - return (EPERM); - - lck_mtx_lock(&dtrace_lock); - dtrace_provide_private_probes = value; - lck_mtx_unlock(&dtrace_lock); - } - return (0); -} /* * kern.dtrace.provide_private_probes * * Set whether the providers must provide the private probes. This is - * mainly used by the FBT provider to request probes for the private/static - * symbols. + * kept as compatibility as they are always provided. */ -SYSCTL_PROC(_kern_dtrace, OID_AUTO, provide_private_probes, - CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, - &dtrace_provide_private_probes, 0, - sysctl_dtrace_provide_private_probes, "I", "provider must provide the private probes"); +SYSCTL_INT(_kern_dtrace, OID_AUTO, provide_private_probes, + CTLFLAG_RD | CTLFLAG_LOCKED, + (int *)NULL, 1, "provider must provide the private probes"); /* * kern.dtrace.dof_mode @@ -1293,11 +1265,72 @@ dtrace_vcanload(void *src, dtrace_diftype_t *type, size_t *remain, vstate)); } +#define isdigit(ch) ((ch) >= '0' && (ch) <= '9') +#define islower(ch) ((ch) >= 'a' && (ch) <= 'z') +#define isspace(ch) (((ch) == ' ') || ((ch) == '\r') || ((ch) == '\n') || \ + ((ch) == '\t') || ((ch) == '\f')) +#define isxdigit(ch) (isdigit(ch) || ((ch) >= 'a' && (ch) <= 'f') || \ + ((ch) >= 'A' && (ch) <= 'F')) +#define lisalnum(x) \ + (isdigit(x) || ((x) >= 'a' && (x) <= 'z') || ((x) >= 'A' && (x) <= 'Z')) + +#define DIGIT(x) \ + (isdigit(x) ? (x) - '0' : islower(x) ? (x) + 10 - 'a' : (x) + 10 - 'A') + +/* + * Convert a string to a signed integer using safe loads. + */ +static int64_t +dtrace_strtoll(char *input, int base, size_t limit) +{ + uintptr_t pos = (uintptr_t)input; + int64_t val = 0; + int x; + boolean_t neg = B_FALSE; + char c, cc, ccc; + uintptr_t end = pos + limit; + + /* + * Consume any whitespace preceding digits. + */ + while ((c = dtrace_load8(pos)) == ' ' || c == '\t') + pos++; + + /* + * Handle an explicit sign if one is present. + */ + if (c == '-' || c == '+') { + if (c == '-') + neg = B_TRUE; + c = dtrace_load8(++pos); + } + + /* + * Check for an explicit hexadecimal prefix ("0x" or "0X") and skip it + * if present. + */ + if (base == 16 && c == '0' && ((cc = dtrace_load8(pos + 1)) == 'x' || + cc == 'X') && isxdigit(ccc = dtrace_load8(pos + 2))) { + pos += 2; + c = ccc; + } + + /* + * Read in contiguous digits until the first non-digit character. + */ + for (; pos < end && c != '\0' && lisalnum(c) && (x = DIGIT(c)) < base; + c = dtrace_load8(++pos)) + val = val * base + x; + + return (neg ? -val : val); +} + + /* * Compare two strings using safe loads. */ static int -dtrace_strncmp(char *s1, char *s2, size_t limit) +dtrace_strncmp(const char *s1, const char *s2, size_t limit) { uint8_t c1, c2; volatile uint16_t *flags; @@ -3273,10 +3306,7 @@ dtrace_dif_variable(dtrace_mstate_t *mstate, dtrace_state_t *state, uint64_t v, ASSERT(mstate->dtms_present & DTRACE_MSTATE_ARGS); if (ndx >= sizeof (mstate->dtms_arg) / sizeof (mstate->dtms_arg[0])) { - /* - * APPLE NOTE: Account for introduction of __dtrace_probe() - */ - int aframes = mstate->dtms_probe->dtpr_aframes + 3; + int aframes = mstate->dtms_probe->dtpr_aframes + 2; dtrace_vstate_t *vstate = &state->dts_vstate; dtrace_provider_t *pv; uint64_t val; @@ -3382,10 +3412,7 @@ dtrace_dif_variable(dtrace_mstate_t *mstate, dtrace_state_t *state, uint64_t v, if (!dtrace_priv_kernel(state)) return (0); if (!(mstate->dtms_present & DTRACE_MSTATE_STACKDEPTH)) { - /* - * APPLE NOTE: Account for introduction of __dtrace_probe() - */ - int aframes = mstate->dtms_probe->dtpr_aframes + 3; + int aframes = mstate->dtms_probe->dtpr_aframes + 2; mstate->dtms_stackdepth = dtrace_getstackdepth(aframes); mstate->dtms_present |= DTRACE_MSTATE_STACKDEPTH; @@ -3416,10 +3443,7 @@ dtrace_dif_variable(dtrace_mstate_t *mstate, dtrace_state_t *state, uint64_t v, if (!dtrace_priv_kernel(state)) return (0); if (!(mstate->dtms_present & DTRACE_MSTATE_CALLER)) { - /* - * APPLE NOTE: Account for introduction of __dtrace_probe() - */ - int aframes = mstate->dtms_probe->dtpr_aframes + 3; + int aframes = mstate->dtms_probe->dtpr_aframes + 2; if (!DTRACE_ANCHORED(mstate->dtms_probe)) { /* @@ -3663,6 +3687,458 @@ dtrace_dif_variable(dtrace_mstate_t *mstate, dtrace_state_t *state, uint64_t v, } } +typedef enum dtrace_json_state { + DTRACE_JSON_REST = 1, + DTRACE_JSON_OBJECT, + DTRACE_JSON_STRING, + DTRACE_JSON_STRING_ESCAPE, + DTRACE_JSON_STRING_ESCAPE_UNICODE, + DTRACE_JSON_COLON, + DTRACE_JSON_COMMA, + DTRACE_JSON_VALUE, + DTRACE_JSON_IDENTIFIER, + DTRACE_JSON_NUMBER, + DTRACE_JSON_NUMBER_FRAC, + DTRACE_JSON_NUMBER_EXP, + DTRACE_JSON_COLLECT_OBJECT +} dtrace_json_state_t; + +/* + * This function possesses just enough knowledge about JSON to extract a single + * value from a JSON string and store it in the scratch buffer. It is able + * to extract nested object values, and members of arrays by index. + * + * elemlist is a list of JSON keys, stored as packed NUL-terminated strings, to + * be looked up as we descend into the object tree. e.g. + * + * foo[0].bar.baz[32] --> "foo" NUL "0" NUL "bar" NUL "baz" NUL "32" NUL + * with nelems = 5. + * + * The run time of this function must be bounded above by strsize to limit the + * amount of work done in probe context. As such, it is implemented as a + * simple state machine, reading one character at a time using safe loads + * until we find the requested element, hit a parsing error or run off the + * end of the object or string. + * + * As there is no way for a subroutine to return an error without interrupting + * clause execution, we simply return NULL in the event of a missing key or any + * other error condition. Each NULL return in this function is commented with + * the error condition it represents -- parsing or otherwise. + * + * The set of states for the state machine closely matches the JSON + * specification (http://json.org/). Briefly: + * + * DTRACE_JSON_REST: + * Skip whitespace until we find either a top-level Object, moving + * to DTRACE_JSON_OBJECT; or an Array, moving to DTRACE_JSON_VALUE. + * + * DTRACE_JSON_OBJECT: + * Locate the next key String in an Object. Sets a flag to denote + * the next String as a key string and moves to DTRACE_JSON_STRING. + * + * DTRACE_JSON_COLON: + * Skip whitespace until we find the colon that separates key Strings + * from their values. Once found, move to DTRACE_JSON_VALUE. + * + * DTRACE_JSON_VALUE: + * Detects the type of the next value (String, Number, Identifier, Object + * or Array) and routes to the states that process that type. Here we also + * deal with the element selector list if we are requested to traverse down + * into the object tree. + * + * DTRACE_JSON_COMMA: + * Skip whitespace until we find the comma that separates key-value pairs + * in Objects (returning to DTRACE_JSON_OBJECT) or values in Arrays + * (similarly DTRACE_JSON_VALUE). All following literal value processing + * states return to this state at the end of their value, unless otherwise + * noted. + * + * DTRACE_JSON_NUMBER, DTRACE_JSON_NUMBER_FRAC, DTRACE_JSON_NUMBER_EXP: + * Processes a Number literal from the JSON, including any exponent + * component that may be present. Numbers are returned as strings, which + * may be passed to strtoll() if an integer is required. + * + * DTRACE_JSON_IDENTIFIER: + * Processes a "true", "false" or "null" literal in the JSON. + * + * DTRACE_JSON_STRING, DTRACE_JSON_STRING_ESCAPE, + * DTRACE_JSON_STRING_ESCAPE_UNICODE: + * Processes a String literal from the JSON, whether the String denotes + * a key, a value or part of a larger Object. Handles all escape sequences + * present in the specification, including four-digit unicode characters, + * but merely includes the escape sequence without converting it to the + * actual escaped character. If the String is flagged as a key, we + * move to DTRACE_JSON_COLON rather than DTRACE_JSON_COMMA. + * + * DTRACE_JSON_COLLECT_OBJECT: + * This state collects an entire Object (or Array), correctly handling + * embedded strings. If the full element selector list matches this nested + * object, we return the Object in full as a string. If not, we use this + * state to skip to the next value at this level and continue processing. + */ +static char * +dtrace_json(uint64_t size, uintptr_t json, char *elemlist, int nelems, + char *dest) +{ + dtrace_json_state_t state = DTRACE_JSON_REST; + int64_t array_elem = INT64_MIN; + int64_t array_pos = 0; + uint8_t escape_unicount = 0; + boolean_t string_is_key = B_FALSE; + boolean_t collect_object = B_FALSE; + boolean_t found_key = B_FALSE; + boolean_t in_array = B_FALSE; + uint32_t braces = 0, brackets = 0; + char *elem = elemlist; + char *dd = dest; + uintptr_t cur; + + for (cur = json; cur < json + size; cur++) { + char cc = dtrace_load8(cur); + if (cc == '\0') + return (NULL); + + switch (state) { + case DTRACE_JSON_REST: + if (isspace(cc)) + break; + + if (cc == '{') { + state = DTRACE_JSON_OBJECT; + break; + } + + if (cc == '[') { + in_array = B_TRUE; + array_pos = 0; + array_elem = dtrace_strtoll(elem, 10, size); + found_key = array_elem == 0 ? B_TRUE : B_FALSE; + state = DTRACE_JSON_VALUE; + break; + } + + /* + * ERROR: expected to find a top-level object or array. + */ + return (NULL); + case DTRACE_JSON_OBJECT: + if (isspace(cc)) + break; + + if (cc == '"') { + state = DTRACE_JSON_STRING; + string_is_key = B_TRUE; + break; + } + + /* + * ERROR: either the object did not start with a key + * string, or we've run off the end of the object + * without finding the requested key. + */ + return (NULL); + case DTRACE_JSON_STRING: + if (cc == '\\') { + *dd++ = '\\'; + state = DTRACE_JSON_STRING_ESCAPE; + break; + } + + if (cc == '"') { + if (collect_object) { + /* + * We don't reset the dest here, as + * the string is part of a larger + * object being collected. + */ + *dd++ = cc; + collect_object = B_FALSE; + state = DTRACE_JSON_COLLECT_OBJECT; + break; + } + *dd = '\0'; + dd = dest; /* reset string buffer */ + if (string_is_key) { + if (dtrace_strncmp(dest, elem, + size) == 0) + found_key = B_TRUE; + } else if (found_key) { + if (nelems > 1) { + /* + * We expected an object, not + * this string. + */ + return (NULL); + } + return (dest); + } + state = string_is_key ? DTRACE_JSON_COLON : + DTRACE_JSON_COMMA; + string_is_key = B_FALSE; + break; + } + + *dd++ = cc; + break; + case DTRACE_JSON_STRING_ESCAPE: + *dd++ = cc; + if (cc == 'u') { + escape_unicount = 0; + state = DTRACE_JSON_STRING_ESCAPE_UNICODE; + } else { + state = DTRACE_JSON_STRING; + } + break; + case DTRACE_JSON_STRING_ESCAPE_UNICODE: + if (!isxdigit(cc)) { + /* + * ERROR: invalid unicode escape, expected + * four valid hexidecimal digits. + */ + return (NULL); + } + + *dd++ = cc; + if (++escape_unicount == 4) + state = DTRACE_JSON_STRING; + break; + case DTRACE_JSON_COLON: + if (isspace(cc)) + break; + + if (cc == ':') { + state = DTRACE_JSON_VALUE; + break; + } + + /* + * ERROR: expected a colon. + */ + return (NULL); + case DTRACE_JSON_COMMA: + if (isspace(cc)) + break; + + if (cc == ',') { + if (in_array) { + state = DTRACE_JSON_VALUE; + if (++array_pos == array_elem) + found_key = B_TRUE; + } else { + state = DTRACE_JSON_OBJECT; + } + break; + } + + /* + * ERROR: either we hit an unexpected character, or + * we reached the end of the object or array without + * finding the requested key. + */ + return (NULL); + case DTRACE_JSON_IDENTIFIER: + if (islower(cc)) { + *dd++ = cc; + break; + } + + *dd = '\0'; + dd = dest; /* reset string buffer */ + + if (dtrace_strncmp(dest, "true", 5) == 0 || + dtrace_strncmp(dest, "false", 6) == 0 || + dtrace_strncmp(dest, "null", 5) == 0) { + if (found_key) { + if (nelems > 1) { + /* + * ERROR: We expected an object, + * not this identifier. + */ + return (NULL); + } + return (dest); + } else { + cur--; + state = DTRACE_JSON_COMMA; + break; + } + } + + /* + * ERROR: we did not recognise the identifier as one + * of those in the JSON specification. + */ + return (NULL); + case DTRACE_JSON_NUMBER: + if (cc == '.') { + *dd++ = cc; + state = DTRACE_JSON_NUMBER_FRAC; + break; + } + + if (cc == 'x' || cc == 'X') { + /* + * ERROR: specification explicitly excludes + * hexidecimal or octal numbers. + */ + return (NULL); + } + + /* FALLTHRU */ + case DTRACE_JSON_NUMBER_FRAC: + if (cc == 'e' || cc == 'E') { + *dd++ = cc; + state = DTRACE_JSON_NUMBER_EXP; + break; + } + + if (cc == '+' || cc == '-') { + /* + * ERROR: expect sign as part of exponent only. + */ + return (NULL); + } + /* FALLTHRU */ + case DTRACE_JSON_NUMBER_EXP: + if (isdigit(cc) || cc == '+' || cc == '-') { + *dd++ = cc; + break; + } + + *dd = '\0'; + dd = dest; /* reset string buffer */ + if (found_key) { + if (nelems > 1) { + /* + * ERROR: We expected an object, not + * this number. + */ + return (NULL); + } + return (dest); + } + + cur--; + state = DTRACE_JSON_COMMA; + break; + case DTRACE_JSON_VALUE: + if (isspace(cc)) + break; + + if (cc == '{' || cc == '[') { + if (nelems > 1 && found_key) { + in_array = cc == '[' ? B_TRUE : B_FALSE; + /* + * If our element selector directs us + * to descend into this nested object, + * then move to the next selector + * element in the list and restart the + * state machine. + */ + while (*elem != '\0') + elem++; + elem++; /* skip the inter-element NUL */ + nelems--; + dd = dest; + if (in_array) { + state = DTRACE_JSON_VALUE; + array_pos = 0; + array_elem = dtrace_strtoll( + elem, 10, size); + found_key = array_elem == 0 ? + B_TRUE : B_FALSE; + } else { + found_key = B_FALSE; + state = DTRACE_JSON_OBJECT; + } + break; + } + + /* + * Otherwise, we wish to either skip this + * nested object or return it in full. + */ + if (cc == '[') + brackets = 1; + else + braces = 1; + *dd++ = cc; + state = DTRACE_JSON_COLLECT_OBJECT; + break; + } + + if (cc == '"') { + state = DTRACE_JSON_STRING; + break; + } + + if (islower(cc)) { + /* + * Here we deal with true, false and null. + */ + *dd++ = cc; + state = DTRACE_JSON_IDENTIFIER; + break; + } + + if (cc == '-' || isdigit(cc)) { + *dd++ = cc; + state = DTRACE_JSON_NUMBER; + break; + } + + /* + * ERROR: unexpected character at start of value. + */ + return (NULL); + case DTRACE_JSON_COLLECT_OBJECT: + if (cc == '\0') + /* + * ERROR: unexpected end of input. + */ + return (NULL); + + *dd++ = cc; + if (cc == '"') { + collect_object = B_TRUE; + state = DTRACE_JSON_STRING; + break; + } + + if (cc == ']') { + if (brackets-- == 0) { + /* + * ERROR: unbalanced brackets. + */ + return (NULL); + } + } else if (cc == '}') { + if (braces-- == 0) { + /* + * ERROR: unbalanced braces. + */ + return (NULL); + } + } else if (cc == '{') { + braces++; + } else if (cc == '[') { + brackets++; + } + + if (brackets == 0 && braces == 0) { + if (found_key) { + *dd = '\0'; + return (dest); + } + dd = dest; /* reset string buffer */ + state = DTRACE_JSON_COMMA; + } + break; + } + } + return (NULL); +} + /* * Emulate the execution of DTrace ID subroutines invoked by the call opcode. * Notice that we don't bother validating the proper number of arguments or @@ -3695,7 +4171,8 @@ dtrace_dif_subr(uint_t subr, uint_t rd, uint64_t *regs, switch (subr) { case DIF_SUBR_RAND: - regs[rd] = (dtrace_gethrtime() * 2416 + 374441) % 1771875; + regs[rd] = dtrace_xoroshiro128_plus_next( + state->dts_rstate[CPU->cpu_id]); break; #if !defined(__APPLE__) @@ -4421,6 +4898,29 @@ dtrace_dif_subr(uint_t subr, uint_t rd, uint64_t *regs, break; } + case DIF_SUBR_STRTOLL: { + uintptr_t s = tupregs[0].dttk_value; + uint64_t size = state->dts_options[DTRACEOPT_STRSIZE]; + size_t lim; + int base = 10; + + if (nargs > 1) { + if ((base = tupregs[1].dttk_value) <= 1 || + base > ('z' - 'a' + 1) + ('9' - '0' + 1)) { + *flags |= CPU_DTRACE_ILLOP; + break; + } + } + + if (!dtrace_strcanload(s, size, &lim, mstate, vstate)) { + regs[rd] = INT64_MIN; + break; + } + + regs[rd] = dtrace_strtoll((char *)s, base, lim); + break; + } + case DIF_SUBR_LLTOSTR: { int64_t i = (int64_t)tupregs[0].dttk_value; uint64_t val, digit; @@ -4976,6 +5476,65 @@ inetout: regs[rd] = (uintptr_t)end + 1; break; } + case DIF_SUBR_JSON: { + uint64_t size = state->dts_options[DTRACEOPT_STRSIZE]; + uintptr_t json = tupregs[0].dttk_value; + size_t jsonlen = dtrace_strlen((char *)json, size); + uintptr_t elem = tupregs[1].dttk_value; + size_t elemlen = dtrace_strlen((char *)elem, size); + + char *dest = (char *)mstate->dtms_scratch_ptr; + char *elemlist = (char *)mstate->dtms_scratch_ptr + jsonlen + 1; + char *ee = elemlist; + int nelems = 1; + uintptr_t cur; + + if (!dtrace_canload(json, jsonlen + 1, mstate, vstate) || + !dtrace_canload(elem, elemlen + 1, mstate, vstate)) { + regs[rd] = 0; + break; + } + + if (!DTRACE_INSCRATCH(mstate, jsonlen + 1 + elemlen + 1)) { + DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH); + regs[rd] = 0; + break; + } + + /* + * Read the element selector and split it up into a packed list + * of strings. + */ + for (cur = elem; cur < elem + elemlen; cur++) { + char cc = dtrace_load8(cur); + + if (cur == elem && cc == '[') { + /* + * If the first element selector key is + * actually an array index then ignore the + * bracket. + */ + continue; + } + + if (cc == ']') + continue; + + if (cc == '.' || cc == '[') { + nelems++; + cc = '\0'; + } + + *ee++ = cc; + } + *ee++ = '\0'; + + if ((regs[rd] = (uintptr_t)dtrace_json(size, json, elemlist, + nelems, dest)) != 0) + mstate->dtms_scratch_ptr += jsonlen + 1; + break; + } + case DIF_SUBR_TOUPPER: case DIF_SUBR_TOLOWER: { uintptr_t src = tupregs[0].dttk_value; @@ -5016,6 +5575,14 @@ inetout: regs[rd] = (uintptr_t)end + 1; break; } + case DIF_SUBR_STRIP: + if (!dtrace_is_valid_ptrauth_key(tupregs[1].dttk_value)) { + DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP); + break; + } + regs[rd] = (uint64_t)dtrace_ptrauth_strip( + (void*)tupregs[0].dttk_value, tupregs[1].dttk_value); + break; #if defined(__APPLE__) case DIF_SUBR_VM_KERNEL_ADDRPERM: { @@ -5890,6 +6457,10 @@ dtrace_dif_emulate(dtrace_difo_t *difo, dtrace_mstate_t *mstate, } *((uint64_t *)(uintptr_t)regs[rd]) = regs[r1]; break; + case DIF_OP_STRIP: + regs[rd] = (uint64_t)dtrace_ptrauth_strip( + (void*)regs[r1], r2); + break; } } @@ -6287,13 +6858,64 @@ dtrace_store_by_ref(dtrace_difo_t *dp, caddr_t tomax, size_t size, *valoffsp = valoffs; } +/* + * Disables interrupts and sets the per-thread inprobe flag. When DEBUG is + * defined, we also assert that we are not recursing unless the probe ID is an + * error probe. + */ +static dtrace_icookie_t +dtrace_probe_enter(dtrace_id_t id) +{ + thread_t thread = current_thread(); + uint16_t inprobe; + + dtrace_icookie_t cookie; + + cookie = dtrace_interrupt_disable(); + + /* + * Unless this is an ERROR probe, we are not allowed to recurse in + * dtrace_probe(). Recursing into DTrace probe usually means that a + * function is instrumented that should not have been instrumented or + * that the ordering guarantee of the records will be violated, + * resulting in unexpected output. If there is an exception to this + * assertion, a new case should be added. + */ + inprobe = dtrace_get_thread_inprobe(thread); + VERIFY(inprobe == 0 || + id == dtrace_probeid_error); + ASSERT(inprobe < UINT16_MAX); + dtrace_set_thread_inprobe(thread, inprobe + 1); + + return (cookie); +} + +/* + * Clears the per-thread inprobe flag and enables interrupts. + */ +static void +dtrace_probe_exit(dtrace_icookie_t cookie) +{ + thread_t thread = current_thread(); + uint16_t inprobe = dtrace_get_thread_inprobe(thread); + + ASSERT(inprobe > 0); + dtrace_set_thread_inprobe(thread, inprobe - 1); + +#if INTERRUPT_MASKED_DEBUG + ml_spin_debug_reset(thread); +#endif /* INTERRUPT_MASKED_DEBUG */ + + dtrace_interrupt_enable(cookie); +} + /* * If you're looking for the epicenter of DTrace, you just found it. This * is the function called by the provider to fire a probe -- from which all * subsequent probe-context DTrace activity emanates. */ -static void -__dtrace_probe(dtrace_id_t id, uint64_t arg0, uint64_t arg1, +void +dtrace_probe(dtrace_id_t id, uint64_t arg0, uint64_t arg1, uint64_t arg2, uint64_t arg3, uint64_t arg4) { processorid_t cpuid; @@ -6308,7 +6930,7 @@ __dtrace_probe(dtrace_id_t id, uint64_t arg0, uint64_t arg1, volatile uint16_t *flags; hrtime_t now; - cookie = dtrace_interrupt_disable(); + cookie = dtrace_probe_enter(id); probe = dtrace_probes[id - 1]; cpuid = CPU->cpu_id; onintr = CPU_ON_INTR(CPU); @@ -6319,7 +6941,7 @@ __dtrace_probe(dtrace_id_t id, uint64_t arg0, uint64_t arg1, * We have hit in the predicate cache; we know that * this predicate would evaluate to be false. */ - dtrace_interrupt_enable(cookie); + dtrace_probe_exit(cookie); return; } @@ -6327,7 +6949,7 @@ __dtrace_probe(dtrace_id_t id, uint64_t arg0, uint64_t arg1, /* * We don't trace anything if we're panicking. */ - dtrace_interrupt_enable(cookie); + dtrace_probe_exit(cookie); return; } @@ -6999,45 +7621,16 @@ __dtrace_probe(dtrace_id_t id, uint64_t arg0, uint64_t arg1, thread_t thread = current_thread(); int64_t t = dtrace_get_thread_tracing(thread); - if (t >= 0) { + if (t >= 0) { /* Usual case, accumulate time spent here into t_dtrace_tracing */ dtrace_set_thread_tracing(thread, t + (dtrace_gethrtime() - now)); - } else { + } else { /* Return from error recursion. No accumulation, just clear the sign bit on t_dtrace_tracing. */ - dtrace_set_thread_tracing(thread, (~(1ULL<<63)) & t); + dtrace_set_thread_tracing(thread, (~(1ULL<<63)) & t); } } - dtrace_interrupt_enable(cookie); -} - -/* - * APPLE NOTE: Don't allow a thread to re-enter dtrace_probe(). - * This could occur if a probe is encountered on some function in the - * transitive closure of the call to dtrace_probe(). - * Solaris has some strong guarantees that this won't happen. - * The Darwin implementation is not so mature as to make those guarantees. - * Hence, the introduction of __dtrace_probe() on xnu. - */ - -void -dtrace_probe(dtrace_id_t id, uint64_t arg0, uint64_t arg1, - uint64_t arg2, uint64_t arg3, uint64_t arg4) -{ - thread_t thread = current_thread(); - disable_preemption(); - if (id == dtrace_probeid_error) { - __dtrace_probe(id, arg0, arg1, arg2, arg3, arg4); - dtrace_getipl(); /* Defeat tail-call optimization of __dtrace_probe() */ - } else if (!dtrace_get_thread_reentering(thread)) { - dtrace_set_thread_reentering(thread, TRUE); - __dtrace_probe(id, arg0, arg1, arg2, arg3, arg4); - dtrace_set_thread_reentering(thread, FALSE); - } -#if DEBUG - else __dtrace_probe(dtrace_probeid_error, 0, id, 1, -1, DTRACEFLT_UNKNOWN); -#endif - enable_preemption(); + dtrace_probe_exit(cookie); } /* @@ -8355,36 +8948,24 @@ dtrace_probe_create(dtrace_provider_id_t prov, const char *mod, if (id - 1 >= (dtrace_id_t)dtrace_nprobes) { size_t osize = dtrace_nprobes * sizeof (dtrace_probe_t *); - size_t nsize = osize << 1; - - if (nsize == 0) { - ASSERT(osize == 0); - ASSERT(dtrace_probes == NULL); - nsize = sizeof (dtrace_probe_t *); - } + size_t nsize = osize * 2; probes = kmem_zalloc(nsize, KM_SLEEP); - if (dtrace_probes == NULL) { - ASSERT(osize == 0); - dtrace_probes = probes; - dtrace_nprobes = 1; - } else { - dtrace_probe_t **oprobes = dtrace_probes; + dtrace_probe_t **oprobes = dtrace_probes; - bcopy(oprobes, probes, osize); - dtrace_membar_producer(); - dtrace_probes = probes; + bcopy(oprobes, probes, osize); + dtrace_membar_producer(); + dtrace_probes = probes; - dtrace_sync(); + dtrace_sync(); - /* - * All CPUs are now seeing the new probes array; we can - * safely free the old array. - */ - kmem_free(oprobes, osize); - dtrace_nprobes <<= 1; - } + /* + * All CPUs are now seeing the new probes array; we can + * safely free the old array. + */ + kmem_free(oprobes, osize); + dtrace_nprobes *= 2; ASSERT(id - 1 < (dtrace_id_t)dtrace_nprobes); } @@ -9020,7 +9601,7 @@ dtrace_difo_validate(dtrace_difo_t *dp, dtrace_vstate_t *vstate, uint_t nregs, if (rd >= nregs) err += efunc(pc, "invalid register %u\n", rd); if (rd == 0) - err += efunc(pc, "cannot write to %r0\n"); + err += efunc(pc, "cannot write to %%r0\n"); break; case DIF_OP_NOT: case DIF_OP_MOV: @@ -9032,7 +9613,7 @@ dtrace_difo_validate(dtrace_difo_t *dp, dtrace_vstate_t *vstate, uint_t nregs, if (rd >= nregs) err += efunc(pc, "invalid register %u\n", rd); if (rd == 0) - err += efunc(pc, "cannot write to %r0\n"); + err += efunc(pc, "cannot write to %%r0\n"); break; case DIF_OP_LDSB: case DIF_OP_LDSH: @@ -9048,7 +9629,7 @@ dtrace_difo_validate(dtrace_difo_t *dp, dtrace_vstate_t *vstate, uint_t nregs, if (rd >= nregs) err += efunc(pc, "invalid register %u\n", rd); if (rd == 0) - err += efunc(pc, "cannot write to %r0\n"); + err += efunc(pc, "cannot write to %%r0\n"); if (kcheckload) dp->dtdo_buf[pc] = DIF_INSTR_LOAD(op + DIF_OP_RLDSB - DIF_OP_LDSB, r1, rd); @@ -9067,7 +9648,7 @@ dtrace_difo_validate(dtrace_difo_t *dp, dtrace_vstate_t *vstate, uint_t nregs, if (rd >= nregs) err += efunc(pc, "invalid register %u\n", rd); if (rd == 0) - err += efunc(pc, "cannot write to %r0\n"); + err += efunc(pc, "cannot write to %%r0\n"); break; case DIF_OP_ULDSB: case DIF_OP_ULDSH: @@ -9083,7 +9664,7 @@ dtrace_difo_validate(dtrace_difo_t *dp, dtrace_vstate_t *vstate, uint_t nregs, if (rd >= nregs) err += efunc(pc, "invalid register %u\n", rd); if (rd == 0) - err += efunc(pc, "cannot write to %r0\n"); + err += efunc(pc, "cannot write to %%r0\n"); break; case DIF_OP_STB: case DIF_OP_STH: @@ -9153,7 +9734,7 @@ dtrace_difo_validate(dtrace_difo_t *dp, dtrace_vstate_t *vstate, uint_t nregs, if (rd >= nregs) err += efunc(pc, "invalid register %u\n", rd); if (rd == 0) - err += efunc(pc, "cannot write to %r0\n"); + err += efunc(pc, "cannot write to %%r0\n"); break; case DIF_OP_SETS: if (DIF_INSTR_STRING(instr) >= dp->dtdo_strlen) { @@ -9163,7 +9744,7 @@ dtrace_difo_validate(dtrace_difo_t *dp, dtrace_vstate_t *vstate, uint_t nregs, if (rd >= nregs) err += efunc(pc, "invalid register %u\n", rd); if (rd == 0) - err += efunc(pc, "cannot write to %r0\n"); + err += efunc(pc, "cannot write to %%r0\n"); break; case DIF_OP_LDGA: case DIF_OP_LDTA: @@ -9174,7 +9755,7 @@ dtrace_difo_validate(dtrace_difo_t *dp, dtrace_vstate_t *vstate, uint_t nregs, if (rd >= nregs) err += efunc(pc, "invalid register %u\n", rd); if (rd == 0) - err += efunc(pc, "cannot write to %r0\n"); + err += efunc(pc, "cannot write to %%r0\n"); break; case DIF_OP_LDGS: case DIF_OP_LDTS: @@ -9186,7 +9767,7 @@ dtrace_difo_validate(dtrace_difo_t *dp, dtrace_vstate_t *vstate, uint_t nregs, if (rd >= nregs) err += efunc(pc, "invalid register %u\n", rd); if (rd == 0) - err += efunc(pc, "cannot write to %r0\n"); + err += efunc(pc, "cannot write to %%r0\n"); break; case DIF_OP_STGS: case DIF_OP_STTS: @@ -9205,7 +9786,7 @@ dtrace_difo_validate(dtrace_difo_t *dp, dtrace_vstate_t *vstate, uint_t nregs, if (rd >= nregs) err += efunc(pc, "invalid register %u\n", rd); if (rd == 0) - err += efunc(pc, "cannot write to %r0\n"); + err += efunc(pc, "cannot write to %%r0\n"); if (subr == DIF_SUBR_COPYOUT || subr == DIF_SUBR_COPYOUTSTR || @@ -9230,6 +9811,16 @@ dtrace_difo_validate(dtrace_difo_t *dp, dtrace_vstate_t *vstate, uint_t nregs, if (rs >= nregs) err += efunc(pc, "invalid register %u\n", rs); break; + case DIF_OP_STRIP: + if (r1 >= nregs) + err += efunc(pc, "invalid register %u\n", r1); + if (!dtrace_is_valid_ptrauth_key(r2)) + err += efunc(pc, "invalid key\n"); + if (rd >= nregs) + err += efunc(pc, "invalid register %u\n", rd); + if (rd == 0) + err += efunc(pc, "cannot write to %%r0\n"); + break; default: err += efunc(pc, "invalid opcode %u\n", DIF_INSTR_OP(instr)); @@ -9532,7 +10123,9 @@ dtrace_difo_validate_helper(dtrace_difo_t *dp) subr == DIF_SUBR_INET_NTOA || subr == DIF_SUBR_INET_NTOA6 || subr == DIF_SUBR_INET_NTOP || + subr == DIF_SUBR_JSON || subr == DIF_SUBR_LLTOSTR || + subr == DIF_SUBR_STRTOLL || subr == DIF_SUBR_RINDEX || subr == DIF_SUBR_STRCHR || subr == DIF_SUBR_STRJOIN || @@ -11419,7 +12012,7 @@ dtrace_buffer_reserve(dtrace_buffer_t *buf, size_t needed, size_t align, if (buf->dtb_cur_limit == buf->dtb_limit) { buf->dtb_cur_limit = buf->dtb_size; - atomic_add_32(&state->dts_buf_over_limit, 1); + os_atomic_inc(&state->dts_buf_over_limit, relaxed); /** * Set an AST on the current processor * so that we can wake up the process @@ -11429,7 +12022,7 @@ dtrace_buffer_reserve(dtrace_buffer_t *buf, size_t needed, size_t align, minor_t minor = getminor(state->dts_dev); ASSERT(minor < 32); - atomic_or_32(&dtrace_wake_clients, 1 << minor); + os_atomic_or(&dtrace_wake_clients, 1 << minor, relaxed); ast_dtrace_on(); } if ((uint64_t)soffs > buf->dtb_size) { @@ -13359,6 +13952,7 @@ dtrace_state_create(dev_t *devp, cred_t *cr, dtrace_state_t **new_state) dtrace_state_t *state; dtrace_optval_t *opt; int bufsize = (int)NCPU * sizeof (dtrace_buffer_t), i; + unsigned int cpu_it; LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED); LCK_MTX_ASSERT(&cpu_lock, LCK_MTX_ASSERT_OWNED); @@ -13405,6 +13999,25 @@ dtrace_state_create(dev_t *devp, cred_t *cr, dtrace_state_t **new_state) state->dts_buffer = kmem_zalloc(bufsize, KM_SLEEP); state->dts_aggbuffer = kmem_zalloc(bufsize, KM_SLEEP); state->dts_buf_over_limit = 0; + + /* + * Allocate and initialise the per-process per-CPU random state. + * SI_SUB_RANDOM < SI_SUB_DTRACE_ANON therefore entropy device is + * assumed to be seeded at this point (if from Fortuna seed file). + */ + state->dts_rstate = kmem_zalloc(NCPU * sizeof(uint64_t*), KM_SLEEP); + state->dts_rstate[0] = kmem_zalloc(2 * sizeof(uint64_t), KM_SLEEP); + (void) read_random(state->dts_rstate[0], 2 * sizeof(uint64_t)); + for (cpu_it = 1; cpu_it < NCPU; cpu_it++) { + state->dts_rstate[cpu_it] = kmem_zalloc(2 * sizeof(uint64_t), KM_SLEEP); + /* + * Each CPU is assigned a 2^64 period, non-overlapping + * subsequence. + */ + dtrace_xoroshiro128_plus_jump(state->dts_rstate[cpu_it-1], + state->dts_rstate[cpu_it]); + } + state->dts_cleaner = CYCLIC_NONE; state->dts_deadman = CYCLIC_NONE; state->dts_vstate.dtvs_state = state; @@ -14178,6 +14791,11 @@ dtrace_state_destroy(dtrace_state_t *state) dtrace_buffer_free(state->dts_buffer); dtrace_buffer_free(state->dts_aggbuffer); + for (i = 0; i < (int)NCPU; i++) { + kmem_free(state->dts_rstate[i], 2 * sizeof(uint64_t)); + } + kmem_free(state->dts_rstate, NCPU * sizeof(uint64_t*)); + for (i = 0; i < nspec; i++) dtrace_buffer_free(spec[i].dtsp_buffer); @@ -16518,6 +17136,10 @@ dtrace_attach(dev_info_t *devi) LCK_MTX_ASSERT(&cpu_lock, LCK_MTX_ASSERT_OWNED); + dtrace_nprobes = dtrace_nprobes_default; + dtrace_probes = kmem_zalloc(sizeof(dtrace_probe_t*) * dtrace_nprobes, + KM_SLEEP); + dtrace_byprov = dtrace_hash_create(dtrace_strkey_probe_provider, 0, /* unused */ offsetof(dtrace_probe_t, dtpr_nextprov), @@ -17664,7 +18286,7 @@ dtrace_ioctl(dev_t dev, u_long cmd, user_addr_t arg, int md, cred_t *cr, int *rv * checking the buffer over limit count at this point. */ if (over_limit) { - uint32_t old = atomic_add_32(&state->dts_buf_over_limit, -1); + uint32_t old = os_atomic_dec_orig(&state->dts_buf_over_limit, relaxed); #pragma unused(old) /* @@ -17888,10 +18510,6 @@ dtrace_ioctl(dev_t dev, u_long cmd, user_addr_t arg, int md, cred_t *cr, int *rv lck_mtx_lock(&mod_lock); struct modctl* ctl = dtrace_modctl_list; while (ctl) { - /* Update the private probes bit */ - if (dtrace_provide_private_probes) - ctl->mod_flags |= MODCTL_FBT_PROVIDE_PRIVATE_PROBES; - ASSERT(!MOD_HAS_USERSPACE_SYMBOLS(ctl)); if (!MOD_SYMBOLS_DONE(ctl) && !MOD_IS_STATIC_KEXT(ctl)) { dtmul_count++; @@ -17939,10 +18557,6 @@ dtrace_ioctl(dev_t dev, u_long cmd, user_addr_t arg, int md, cred_t *cr, int *rv struct modctl* ctl = dtrace_modctl_list; while (ctl) { - /* Update the private probes bit */ - if (dtrace_provide_private_probes) - ctl->mod_flags |= MODCTL_FBT_PROVIDE_PRIVATE_PROBES; - /* * We assume that userspace symbols will be "better" than kernel level symbols, * as userspace can search for dSYM(s) and symbol'd binaries. Even if kernel syms @@ -18060,10 +18674,6 @@ dtrace_ioctl(dev_t dev, u_long cmd, user_addr_t arg, int md, cred_t *cr, int *rv struct modctl* ctl = dtrace_modctl_list; while (ctl) { - /* Update the private probes bit */ - if (dtrace_provide_private_probes) - ctl->mod_flags |= MODCTL_FBT_PROVIDE_PRIVATE_PROBES; - ASSERT(!MOD_HAS_USERSPACE_SYMBOLS(ctl)); if (MOD_HAS_UUID(ctl) && !MOD_SYMBOLS_DONE(ctl) && memcmp(module_symbols->dtmodsyms_uuid, ctl->mod_uuid, sizeof(UUID)) == 0) { dtrace_provider_t *prv; @@ -18427,7 +19037,7 @@ void dtrace_ast(void) { int i; - uint32_t clients = atomic_and_32(&dtrace_wake_clients, 0); + uint32_t clients = os_atomic_xchg(&dtrace_wake_clients, 0, relaxed); if (clients == 0) return; /** @@ -18649,6 +19259,11 @@ dtrace_init( void ) break; } +#if CONFIG_DTRACE + if (dtrace_dof_mode != DTRACE_DOF_MODE_NEVER) + commpage_update_dof(true); +#endif + gDTraceInited = 1; } else @@ -18679,10 +19294,6 @@ dtrace_postinit(void) if (dtrace_module_loaded(&fake_kernel_kmod, 0) != 0) { printf("dtrace_postinit: Could not register mach_kernel modctl\n"); } - - if (!PE_parse_boot_argn("dtrace_provide_private_probes", &dtrace_provide_private_probes, sizeof (dtrace_provide_private_probes))) { - dtrace_provide_private_probes = 0; - } (void)OSKextRegisterKextsWithDTrace(); } diff --git a/bsd/dev/dtrace/dtrace_glue.c b/bsd/dev/dtrace/dtrace_glue.c index d33a8f030..cd047e8d9 100644 --- a/bsd/dev/dtrace/dtrace_glue.c +++ b/bsd/dev/dtrace/dtrace_glue.c @@ -26,45 +26,28 @@ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ - -/* - * APPLE NOTE: This file is compiled even if dtrace is unconfig'd. A symbol - * from this file (_dtrace_register_anon_DOF) always needs to be exported for - * an external kext to link against. - */ - -#if CONFIG_DTRACE - -#define MACH__POSIX_C_SOURCE_PRIVATE 1 /* pulls in suitable savearea from mach/ppc/thread_status.h */ #include -#include -#include -#include -#include #include #include -#include #include #include #include #include #include -#include +#include #include #include #include #include #include #include -#include #include #include #include #include #include -#include #include /* All the bits we care about are guarded by MACH_KERNEL_PRIVATE :-( */ /* @@ -76,7 +59,6 @@ void dtrace_sprlock(proc_t *p) { - lck_mtx_assert(&p->p_mlock, LCK_MTX_ASSERT_NOTOWNED); lck_mtx_lock(&p->p_dtrace_sprlock); } @@ -100,8 +82,6 @@ sprlock(pid_t pid) dtrace_sprlock(p); - proc_lock(p); - return p; } @@ -110,8 +90,6 @@ void sprunlock(proc_t *p) { if (p != PROC_NULL) { - proc_unlock(p); - dtrace_sprunlock(p); task_resume_internal(p->task); @@ -273,11 +251,6 @@ dtrace_CRED(void) } } -#define HAS_ALLPRIVS(cr) priv_isfullset(&CR_OEPRIV(cr)) -#define HAS_PRIVILEGE(cr, pr) ((pr) == PRIV_ALL ? \ - HAS_ALLPRIVS(cr) : \ - PRIV_ISASSERT(&CR_OEPRIV(cr), pr)) - int PRIV_POLICY_CHOICE(void* cred, int priv, int all) { @@ -605,15 +578,6 @@ cyclic_remove(cyclic_id_t cyclic) } } -kern_return_t _dtrace_register_anon_DOF(char *, uchar_t *, uint_t); - -kern_return_t -_dtrace_register_anon_DOF(char *name, uchar_t *data, uint_t nelements) -{ -#pragma unused(name, data, nelements) - return KERN_FAILURE; -} - int ddi_driver_major(dev_info_t *devi) { @@ -1503,25 +1467,3 @@ void dtrace_vtime_disable(void) { } - -#else /* else ! CONFIG_DTRACE */ - -#include -#include -#include - -/* - * This exists to prevent build errors when dtrace is unconfigured. - */ - -kern_return_t _dtrace_register_anon_DOF(char *, unsigned char *, uint32_t); - -kern_return_t -_dtrace_register_anon_DOF(char *arg1, unsigned char *arg2, uint32_t arg3) -{ -#pragma unused(arg1, arg2, arg3) - - return KERN_FAILURE; -} - -#endif /* CONFIG_DTRACE */ diff --git a/bsd/dev/dtrace/dtrace_subr.c b/bsd/dev/dtrace/dtrace_subr.c index 3bc601af8..5f28ca810 100644 --- a/bsd/dev/dtrace/dtrace_subr.c +++ b/bsd/dev/dtrace/dtrace_subr.c @@ -24,10 +24,6 @@ * Use is subject to license terms. */ -/* - * #pragma ident "@(#)dtrace_subr.c 1.8 07/06/05 SMI" - */ - #include #include #include @@ -295,6 +291,44 @@ dtrace_invop_remove(int (*func)(uintptr_t, uintptr_t *, uintptr_t)) kmem_free(hdlr, sizeof (dtrace_invop_hdlr_t)); } +void* +dtrace_ptrauth_strip(void *ptr, uint64_t key) +{ +#pragma unused(key) +#if __has_feature(ptrauth_calls) + /* + * The key argument to ptrauth_strip needs to be a compile-time + * constant + */ + switch (key) { + case ptrauth_key_asia: + return ptrauth_strip(ptr, ptrauth_key_asia); + case ptrauth_key_asib: + return ptrauth_strip(ptr, ptrauth_key_asib); + case ptrauth_key_asda: + return ptrauth_strip(ptr, ptrauth_key_asda); + case ptrauth_key_asdb: + return ptrauth_strip(ptr, ptrauth_key_asdb); + default: + return ptr; + } +#else + return ptr; +#endif // __has_feature(ptrauth_calls) +} + +int +dtrace_is_valid_ptrauth_key(uint64_t key) +{ +#pragma unused(key) +#if __has_feature(ptrauth_calls) + return (key == ptrauth_key_asia) || (key == ptrauth_key_asib) || + (key == ptrauth_key_asda) || (key == ptrauth_key_asdb); +#else + return (0); +#endif /* __has_feature(ptrauth_calls) */ +} + static minor_t next_minor = 0; static dtrace_state_t* dtrace_clients[DTRACE_NCLIENTS] = {NULL}; @@ -303,7 +337,7 @@ minor_t dtrace_state_reserve(void) { for (int i = 0; i < DTRACE_NCLIENTS; i++) { - minor_t minor = atomic_add_32(&next_minor, 1) % DTRACE_NCLIENTS; + minor_t minor = os_atomic_inc_orig(&next_minor, relaxed) % DTRACE_NCLIENTS; if (dtrace_clients[minor] == NULL) return minor; } diff --git a/bsd/dev/dtrace/dtrace_xoroshiro128_plus.c b/bsd/dev/dtrace/dtrace_xoroshiro128_plus.c new file mode 100644 index 000000000..d29d58ed8 --- /dev/null +++ b/bsd/dev/dtrace/dtrace_xoroshiro128_plus.c @@ -0,0 +1,89 @@ +/*- + * Copyright (c) 2016 (Graeme Jenkinson) + * All rights reserved. + * + * This software was developed by BAE Systems, the University of Cambridge + * Computer Laboratory, and Memorial University under DARPA/AFRL contract + * FA8650-15-C-7558 ("CADETS"), as part of the DARPA Transparent Computing + * (TC) research program. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + */ + +#include + +#include "dtrace_xoroshiro128_plus.h" + +static __inline uint64_t +rotl(const uint64_t x, int k) +{ + return (x << k) | (x >> (64 - k)); +} + +/* + * This is the jump function for the generator. It is equivalent to 2^64 calls + * to next(); it can be used to generate 2^64 non-overlapping subsequences for + * parallel computations. + */ +void +dtrace_xoroshiro128_plus_jump(uint64_t * const state, + uint64_t * const jump_state) +{ + static const uint64_t JUMP[] = { 0xbeac0467eba5facb, + 0xd86b048b86aa9922 }; + + uint64_t s0 = 0; + uint64_t s1 = 0; + size_t i = 0; + int b = 0; + for (i = 0; i < sizeof JUMP / sizeof *JUMP; i++) { + for (b = 0; b < 64; b++) { + if (JUMP[i] & 1ULL << b) { + s0 ^= state[0]; + s1 ^= state[1]; + } + dtrace_xoroshiro128_plus_next(state); + } + } + jump_state[0] = s0; + jump_state[1] = s1; +} + +/* + * xoroshiro128+ - XOR/rotate/shift/rotate + * xorshift.di.unimi.it + */ +uint64_t +dtrace_xoroshiro128_plus_next(uint64_t * const state) +{ + const uint64_t s0 = state[0]; + uint64_t s1 = state[1]; + uint64_t result; + result = s0 + s1; + + s1 ^= s0; + state[0] = rotl(s0, 55) ^ s1 ^ (s1 << 14); + state[1] = rotl(s1, 36); + + return result; +} diff --git a/bsd/dev/dtrace/dtrace_xoroshiro128_plus.h b/bsd/dev/dtrace/dtrace_xoroshiro128_plus.h new file mode 100644 index 000000000..c1dafcd7b --- /dev/null +++ b/bsd/dev/dtrace/dtrace_xoroshiro128_plus.h @@ -0,0 +1,42 @@ +/*- + * Copyright (c) 2016 (Graeme Jenkinson) + * All rights reserved. + * + * This software was developed by BAE Systems, the University of Cambridge + * Computer Laboratory, and Memorial University under DARPA/AFRL contract + * FA8650-15-C-7558 ("CADETS"), as part of the DARPA Transparent Computing + * (TC) research program. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + */ + +#ifndef _DTRACE_XOROSHIRO128_PLUS_H +#define _DTRACE_XOROSHIRO128_PLUS_H + +#include +#include + +void dtrace_xoroshiro128_plus_jump(uint64_t * const, uint64_t * const); +uint64_t dtrace_xoroshiro128_plus_next(uint64_t * const); + +#endif diff --git a/bsd/dev/dtrace/fasttrap.c b/bsd/dev/dtrace/fasttrap.c index 9ce7ccc9c..e90e109f0 100644 --- a/bsd/dev/dtrace/fasttrap.c +++ b/bsd/dev/dtrace/fasttrap.c @@ -24,10 +24,6 @@ * Use is subject to license terms. */ -/* - * #pragma ident "@(#)fasttrap.c 1.26 08/04/21 SMI" - */ - #include #include @@ -519,7 +515,7 @@ fasttrap_pid_cleanup_cb(void) while (1) { unsigned int later = 0; - work = atomic_and_32(&fasttrap_cleanup_work, 0); + work = os_atomic_xchg(&fasttrap_cleanup_work, 0, relaxed); lck_mtx_unlock(&fasttrap_cleanup_mtx); if (work & FASTTRAP_CLEANUP_PROVIDER) { later = fasttrap_pid_cleanup_providers(); @@ -542,7 +538,7 @@ fasttrap_pid_cleanup_cb(void) * (if detach fails). */ if (later > 0) { - struct timespec t = {1, 0}; + struct timespec t = {.tv_sec = 1, .tv_nsec = 0}; msleep(&fasttrap_pid_cleanup_cb, &fasttrap_cleanup_mtx, PRIBIO, "fasttrap_pid_cleanup_cb", &t); } else @@ -559,7 +555,7 @@ static void fasttrap_pid_cleanup(uint32_t work) { lck_mtx_lock(&fasttrap_cleanup_mtx); - atomic_or_32(&fasttrap_cleanup_work, work); + os_atomic_or(&fasttrap_cleanup_work, work, relaxed); fasttrap_pid_cleanup_compute_priority(); wakeup(&fasttrap_pid_cleanup_cb); lck_mtx_unlock(&fasttrap_cleanup_mtx); @@ -601,7 +597,6 @@ fasttrap_fork(proc_t *p, proc_t *cp) printf("fasttrap_fork: sprlock(%d) returned a different proc\n", cp->p_pid); return; } - proc_unlock(cp); /* * Iterate over every tracepoint looking for ones that belong to the @@ -635,7 +630,6 @@ fasttrap_fork(proc_t *p, proc_t *cp) */ dtrace_ptss_fork(p, cp); - proc_lock(cp); sprunlock(cp); } @@ -656,9 +650,9 @@ fasttrap_exec_exit(proc_t *p) * explaining. This method is always called with the proc_lock held. * We must drop the proc_lock before calling fasttrap_provider_retire * to avoid a deadlock when it takes the bucket lock. - * + * * Next, the dtrace_ptss_exec_exit function requires the sprlock - * be held, but not the proc_lock. + * be held, but not the proc_lock. * * Finally, we must re-acquire the proc_lock */ @@ -922,13 +916,13 @@ fasttrap_tracepoint_disable(proc_t *p, fasttrap_probe_t *probe, uint_t index) ASSERT(tp->ftt_ids != NULL); idp = &tp->ftt_ids; break; - + case DTFTP_RETURN: case DTFTP_POST_OFFSETS: ASSERT(tp->ftt_retids != NULL); idp = &tp->ftt_retids; break; - + default: /* Fix compiler warning... */ idp = NULL; @@ -1151,6 +1145,8 @@ fasttrap_pid_enable(void *arg, dtrace_id_t id, void *parg) return(0); } + proc_lock(p); + if ((p->p_csflags & (CS_KILL|CS_HARD))) { proc_unlock(p); for (i = 0; i < DTRACE_NCLIENTS; i++) { @@ -1162,12 +1158,12 @@ fasttrap_pid_enable(void *arg, dtrace_id_t id, void *parg) mac_proc_check_get_task(state->dts_cred.dcr_cred, p); } rc = cs_allow_invalid(p); - proc_lock(p); if (rc == 0) { sprunlock(p); cmn_err(CE_WARN, "process doesn't allow invalid code pages, failing to install fasttrap probe\n"); return (0); } + proc_lock(p); } /* @@ -1217,7 +1213,6 @@ fasttrap_pid_enable(void *arg, dtrace_id_t id, void *parg) i--; } - proc_lock(p); sprunlock(p); /* @@ -1229,7 +1224,6 @@ fasttrap_pid_enable(void *arg, dtrace_id_t id, void *parg) } } - proc_lock(p); sprunlock(p); probe->ftp_enabled = 1; @@ -1256,7 +1250,6 @@ fasttrap_pid_disable(void *arg, dtrace_id_t id, void *parg) */ if ((p = sprlock(probe->ftp_pid)) != PROC_NULL) { // ASSERT(!(p->p_flag & SVFORK)); - proc_unlock(p); } lck_mtx_lock(&provider->ftp_mtx); @@ -1283,7 +1276,6 @@ fasttrap_pid_disable(void *arg, dtrace_id_t id, void *parg) whack = provider->ftp_marked = 1; lck_mtx_unlock(&provider->ftp_mtx); - proc_lock(p); sprunlock(p); } else { /* @@ -1360,8 +1352,8 @@ fasttrap_pid_destroy(void *arg, dtrace_id_t id, void *parg) ASSERT(!probe->ftp_enabled); ASSERT(fasttrap_total >= probe->ftp_ntps); - atomic_add_32(&fasttrap_total, -probe->ftp_ntps); - atomic_add_32(&fasttrap_retired, -probe->ftp_ntps); + os_atomic_sub(&fasttrap_total, probe->ftp_ntps, relaxed); + os_atomic_sub(&fasttrap_retired, probe->ftp_ntps, relaxed); if (probe->ftp_gen + 1 >= fasttrap_mod_gen) fasttrap_mod_barrier(probe->ftp_gen); @@ -1427,7 +1419,7 @@ fasttrap_proc_lookup(pid_t pid) lck_mtx_lock(&fprc->ftpc_mtx); lck_mtx_unlock(&bucket->ftb_mtx); fprc->ftpc_rcount++; - atomic_add_64(&fprc->ftpc_acount, 1); + os_atomic_inc(&fprc->ftpc_acount, relaxed); ASSERT(fprc->ftpc_acount <= fprc->ftpc_rcount); lck_mtx_unlock(&fprc->ftpc_mtx); @@ -1458,7 +1450,7 @@ fasttrap_proc_lookup(pid_t pid) lck_mtx_lock(&fprc->ftpc_mtx); lck_mtx_unlock(&bucket->ftb_mtx); fprc->ftpc_rcount++; - atomic_add_64(&fprc->ftpc_acount, 1); + os_atomic_inc(&fprc->ftpc_acount, relaxed); ASSERT(fprc->ftpc_acount <= fprc->ftpc_rcount); lck_mtx_unlock(&fprc->ftpc_mtx); @@ -1686,7 +1678,7 @@ fasttrap_provider_free(fasttrap_provider_t *provider) * count of active providers on the associated process structure. */ if (!provider->ftp_retired) { - atomic_add_64(&provider->ftp_proc->ftpc_acount, -1); + os_atomic_dec(&provider->ftp_proc->ftpc_acount, relaxed); ASSERT(provider->ftp_proc->ftpc_acount < provider->ftp_proc->ftpc_rcount); } @@ -1716,7 +1708,7 @@ fasttrap_provider_free(fasttrap_provider_t *provider) proc_lock(p); p->p_dtrace_probes--; proc_unlock(p); - + proc_rele(p); } @@ -1765,14 +1757,14 @@ fasttrap_provider_retire(proc_t *p, const char *name, int mprov) * bucket lock therefore protects the integrity of the provider hash * table. */ - atomic_add_64(&fp->ftp_proc->ftpc_acount, -1); + os_atomic_dec(&fp->ftp_proc->ftpc_acount, relaxed); ASSERT(fp->ftp_proc->ftpc_acount < fp->ftp_proc->ftpc_rcount); /* * Add this provider probes to the retired count and * make sure we don't add them twice */ - atomic_add_32(&fasttrap_retired, fp->ftp_pcount); + os_atomic_add(&fasttrap_retired, fp->ftp_pcount, relaxed); fp->ftp_pcount = 0; fp->ftp_retired = 1; @@ -1892,9 +1884,9 @@ fasttrap_add_probe(fasttrap_probe_spec_t *pdata) pdata->ftps_mod, pdata->ftps_func, name_str) != 0) continue; - atomic_add_32(&fasttrap_total, 1); + os_atomic_inc(&fasttrap_total, relaxed); if (fasttrap_total > fasttrap_max) { - atomic_add_32(&fasttrap_total, -1); + os_atomic_dec(&fasttrap_total, relaxed); goto no_mem; } provider->ftp_pcount++; @@ -1908,7 +1900,7 @@ fasttrap_add_probe(fasttrap_probe_spec_t *pdata) pp->ftp_pid = pdata->ftps_pid; pp->ftp_ntps = 1; - tp = zalloc(fasttrap_tracepoint_t_zone); + tp = zalloc(fasttrap_tracepoint_t_zone); bzero(tp, sizeof (fasttrap_tracepoint_t)); tp->ftt_proc = provider->ftp_proc; @@ -1935,10 +1927,10 @@ fasttrap_add_probe(fasttrap_probe_spec_t *pdata) } else if (dtrace_probe_lookup(provider->ftp_provid, pdata->ftps_mod, pdata->ftps_func, name) == 0) { - atomic_add_32(&fasttrap_total, pdata->ftps_noffs); + os_atomic_add(&fasttrap_total, pdata->ftps_noffs, relaxed); if (fasttrap_total > fasttrap_max) { - atomic_add_32(&fasttrap_total, -pdata->ftps_noffs); + os_atomic_sub(&fasttrap_total, pdata->ftps_noffs, relaxed); goto no_mem; } @@ -1953,7 +1945,7 @@ fasttrap_add_probe(fasttrap_probe_spec_t *pdata) if (pdata->ftps_offs[i] > pdata->ftps_offs[i - 1]) continue; - atomic_add_32(&fasttrap_total, -pdata->ftps_noffs); + os_atomic_sub(&fasttrap_total, pdata->ftps_noffs, relaxed); goto no_mem; } provider->ftp_pcount += pdata->ftps_noffs; @@ -1985,7 +1977,7 @@ fasttrap_add_probe(fasttrap_probe_spec_t *pdata) * this field is simply initialized to 0 on its way * into the kernel. */ - + tp->ftt_fntype = pdata->ftps_arch_subinfo; #endif pp->ftp_tps[i].fit_tp = tp; @@ -2177,7 +2169,7 @@ fasttrap_meta_create_probe(void *arg, void *parg, #if 0 /* - * APPLE NOTE: This is hideously expensive. See note in + * APPLE NOTE: This is hideously expensive. See note in * fasttrap_meta_provide() for why we can get away without * checking here. */ @@ -2191,10 +2183,10 @@ fasttrap_meta_create_probe(void *arg, void *parg, ntps = dhpb->dthpb_noffs + dhpb->dthpb_nenoffs; ASSERT(ntps > 0); - atomic_add_32(&fasttrap_total, ntps); + os_atomic_add(&fasttrap_total, ntps, relaxed); if (fasttrap_total > fasttrap_max) { - atomic_add_32(&fasttrap_total, -ntps); + os_atomic_sub(&fasttrap_total, ntps, relaxed); lck_mtx_unlock(&provider->ftp_cmtx); return; } @@ -2239,7 +2231,7 @@ fasttrap_meta_create_probe(void *arg, void *parg, * All ARM and ARM64 probes are zero offset. We need to zero out the * thumb bit because we still support 32bit user processes. * On 64bit user processes, bit zero won't be set anyway. - */ + */ tp->ftt_pc = (dhpb->dthpb_base + (int64_t)dhpb->dthpb_offs[i]) & ~0x1UL; tp->ftt_fntype = FASTTRAP_FN_USDT; #else @@ -2277,7 +2269,7 @@ fasttrap_meta_create_probe(void *arg, void *parg, * All ARM and ARM64 probes are zero offset. We need to zero out the * thumb bit because we still support 32bit user processes. * On 64bit user processes, bit zero won't be set anyway. - */ + */ tp->ftt_pc = (dhpb->dthpb_base + (int64_t)dhpb->dthpb_enoffs[j]) & ~0x1UL; tp->ftt_fntype = FASTTRAP_FN_USDT; #else @@ -2613,7 +2605,7 @@ fasttrap_attach(void) &fasttrap_meta_id); } -static int +static int _fasttrap_open(dev_t dev, int flags, int devtype, struct proc *p) { #pragma unused(dev, flags, devtype, p) @@ -2640,7 +2632,7 @@ _fasttrap_ioctl(dev_t dev, u_long cmd, caddr_t data, int fflag, struct proc *p) } else if (rv != 0) { ASSERT( (rv & 0xfff00000) == 0 ); return (((rv & 0xfffff) << 12)); /* ioctl returns -1 and errno set to a return value >= 4096 */ - } else + } else return 0; } @@ -2717,12 +2709,12 @@ fasttrap_init( void ) fasttrap_probe_t_zone_names[i]); } - + /* * Create the fasttrap lock group. Must be done before fasttrap_attach()! */ fasttrap_lck_attr = lck_attr_alloc_init(); - fasttrap_lck_grp_attr= lck_grp_attr_alloc_init(); + fasttrap_lck_grp_attr= lck_grp_attr_alloc_init(); fasttrap_lck_grp = lck_grp_alloc_init("fasttrap", fasttrap_lck_grp_attr); /* diff --git a/bsd/dev/dtrace/fbt.c b/bsd/dev/dtrace/fbt.c index 036d85bcb..fe2918435 100644 --- a/bsd/dev/dtrace/fbt.c +++ b/bsd/dev/dtrace/fbt.c @@ -23,14 +23,6 @@ * Use is subject to license terms. */ -/* #pragma ident "@(#)fbt.c 1.18 07/01/10 SMI" */ - -#ifdef KERNEL -#ifndef _KERNEL -#define _KERNEL /* Solaris vs. Darwin */ -#endif -#endif - #include #include @@ -80,471 +72,13 @@ fbt_probe_t **fbt_probetab; int fbt_probetab_mask; static int fbt_verbose = 0; -int ignore_fbt_blacklist = 0; +extern int ignore_fbt_blacklist; extern int dtrace_kernel_symbol_mode; void fbt_init( void ); -/* - * Critical routines that must not be probed. PR_5221096, PR_5379018. - * The blacklist must be kept in alphabetic order for purposes of bsearch(). - */ -static const char * critical_blacklist[] = -{ - "Call_DebuggerC", - "DebuggerCall", - "DebuggerTrapWithState", - "DebuggerXCallEnter", - "IOCPURunPlatformPanicActions", - "PEARMDebugPanicHook", - "PEHaltRestart", - "SavePanicInfo", - "SysChoked", - "_ZN9IOService14newTemperatureElPS_", /* IOService::newTemperature */ - "_ZN9IOService26temperatureCriticalForZoneEPS_", /* IOService::temperatureCriticalForZone */ - "_ZNK6OSData14getBytesNoCopyEv", /* Data::getBytesNoCopy, IOHibernateSystemWake path */ - "__ZN16IOPlatformExpert11haltRestartEj", - "__ZN18IODTPlatformExpert11haltRestartEj", - "__ZN9IODTNVRAM13savePanicInfoEPhy", - "_disable_preemption", - "_enable_preemption", - "alternate_debugger_enter", - "bcopy_phys", - "console_cpu_alloc", - "console_cpu_free", - "cpu_IA32e_disable", - "cpu_IA32e_enable", - "cpu_NMI_interrupt", - "cpu_control", - "cpu_data_alloc", - "cpu_desc_init", - "cpu_desc_init64", - "cpu_desc_load", - "cpu_desc_load64", - "cpu_exit_wait", - "cpu_info", - "cpu_info_count", - "cpu_init", - "cpu_interrupt", - "cpu_machine_init", - "cpu_mode_init", - "cpu_processor_alloc", - "cpu_processor_free", - "cpu_signal_handler", - "cpu_sleep", - "cpu_start", - "cpu_subtype", - "cpu_thread_alloc", - "cpu_thread_halt", - "cpu_thread_init", - "cpu_threadtype", - "cpu_to_processor", - "cpu_topology_sort", - "cpu_topology_start_cpu", - "cpu_type", - "cpuid_cpu_display", - "cpuid_extfeatures", - "dtrace_invop", - "enter_lohandler", - "fbt_invop", - "fbt_perfCallback", - "get_preemption_level" - "get_threadtask", - "handle_pending_TLB_flushes", - "hw_compare_and_store", - "interrupt", - "is_saved_state32", - "kernel_preempt_check", - "kernel_trap", - "kprintf", - "ks_dispatch_kernel", - "ks_dispatch_user", - "ks_kernel_trap", - "lo_alltraps", - "lock_debugger", - "machine_idle_cstate", - "machine_thread_get_kern_state", - "mca_cpu_alloc", - "mca_cpu_init", - "ml_nofault_copy", - "nanoseconds_to_absolutetime", - "nanotime_to_absolutetime", - "packA", - "panic", - "phystokv", - "phystokv_range", - "pltrace", - "pmKextRegister", - "pmMarkAllCPUsOff", - "pmSafeMode", - "pmTimerRestore", - "pmTimerSave", - "pmUnRegister", - "pmap_cpu_alloc", - "pmap_cpu_free", - "pmap_cpu_high_map_vaddr", - "pmap_cpu_high_shared_remap", - "pmap_cpu_init", - "power_management_init", - "preemption_underflow_panic", - "register_cpu_setup_func", - "ret64_iret" - "ret_to_user" - "return_to_kernel", - "return_to_user", - "saved_state64", - "sdt_invop", - "sprlock", - "sprunlock", - "strlen", - "strncmp", - "t_invop", - "tmrCvt", - "trap_from_kernel", - "uart_putc", - "unlock_debugger", - "unpackA", - "unregister_cpu_setup_func", - "uread", - "uwrite", - "vstart" -}; - -#define CRITICAL_BLACKLIST_COUNT (sizeof(critical_blacklist)/sizeof(critical_blacklist[0])) - -/* - * The transitive closure of entry points that can be reached from probe context. - * (Apart from routines whose names begin with dtrace_). - */ -static const char * probe_ctx_closure[] = -{ - "ClearIdlePop", - "Debugger", - "IS_64BIT_PROCESS", - "OSCompareAndSwap", - "SetIdlePop", - "__dtrace_probe", - "absolutetime_to_microtime", - "act_set_astbsd", - "arm_init_idle_cpu", - "ast_dtrace_on", - "ast_pending", - "clean_dcache", - "clean_mmu_dcache", - "clock_get_calendar_nanotime_nowait", - "copyin", - "copyin_kern", - "copyin_user", - "copyinstr", - "copyout", - "copyoutstr", - "cpu_number", - "current_proc", - "current_processor", - "current_task", - "current_thread", - "debug_enter", - "drain_write_buffer", - "find_user_regs", - "flush_dcache", - "flush_tlb64", - "get_bsdtask_info", - "get_bsdthread_info", - "hertz_tick", - "hw_atomic_and", - "invalidate_mmu_icache", - "kauth_cred_get", - "kauth_getgid", - "kauth_getuid", - "kernel_preempt_check", - "kvtophys", - "mach_absolute_time", - "max_valid_stack_address", - "memcpy", - "memmove", - "ml_at_interrupt_context", - "ml_phys_write_byte_64", - "ml_phys_write_half_64", - "ml_phys_write_word_64", - "ml_set_interrupts_enabled", - "mt_core_snap", - "mt_cur_cpu_cycles", - "mt_cur_cpu_instrs", - "mt_cur_thread_cycles", - "mt_cur_thread_instrs", - "mt_fixed_counts", - "mt_fixed_counts_internal", - "mt_mtc_update_count", - "mt_update_thread", - "ovbcopy", - "panic", - "pmap64_pdpt", - "pmap_find_phys", - "pmap_get_mapwindow", - "pmap_pde", - "pmap_pde_internal0", - "pmap_pde_internal1", - "pmap_pte", - "pmap_pte_internal", - "pmap_put_mapwindow", - "pmap_valid_page", - "prf", - "proc_is64bit", - "proc_selfname", - "psignal_lock", - "rtc_nanotime_load", - "rtc_nanotime_read", - "sdt_getargdesc", - "setPop", - "strlcpy", - "sync_iss_to_iks_unconditionally", - "systrace_stub", - "timer_grab" -}; -#define PROBE_CTX_CLOSURE_COUNT (sizeof(probe_ctx_closure)/sizeof(probe_ctx_closure[0])) - -#pragma clang diagnostic push -#pragma clang diagnostic ignored "-Wcast-qual" -static int -_cmp(const void *a, const void *b) -{ - return strncmp((const char *)a, *(const char **)b, strlen((const char *)a) + 1); -} -#pragma clang diagnostic pop -/* - * Module validation - */ -int -fbt_module_excluded(struct modctl* ctl) -{ - ASSERT(!MOD_FBT_DONE(ctl)); - - if (ctl->mod_address == 0 || ctl->mod_size == 0) { - return TRUE; - } - - if (ctl->mod_loaded == 0) { - return TRUE; - } - - /* - * If the user sets this, trust they know what they are doing. - */ - if (ignore_fbt_blacklist) { - return FALSE; - } - - /* - * These drivers control low level functions that when traced - * cause problems often in the sleep/wake paths as well as - * critical debug and panic paths. - * If somebody really wants to drill in on one of these kexts, then - * they can override blacklisting using the boot-arg above. - */ - -#ifdef __x86_64__ - if (strstr(ctl->mod_modname, "AppleACPIEC") != NULL) { - return TRUE; - } - - if (strstr(ctl->mod_modname, "AppleACPIPlatform") != NULL) { - return TRUE; - } - - if (strstr(ctl->mod_modname, "AppleRTC") != NULL) { - return TRUE; - } - - if (strstr(ctl->mod_modname, "IOACPIFamily") != NULL) { - return TRUE; - } - - if (strstr(ctl->mod_modname, "AppleIntelCPUPowerManagement") != NULL) { - return TRUE; - } - - if (strstr(ctl->mod_modname, "AppleProfile") != NULL) { - return TRUE; - } - - if (strstr(ctl->mod_modname, "AppleIntelProfile") != NULL) { - return TRUE; - } - - if (strstr(ctl->mod_modname, "AppleEFI") != NULL) { - return TRUE; - } - -#elif __arm__ || __arm64__ - if (LIT_STRNEQL(ctl->mod_modname, "com.apple.driver.AppleARMPlatform") || - LIT_STRNEQL(ctl->mod_modname, "com.apple.driver.AppleARMPL192VIC") || - LIT_STRNEQL(ctl->mod_modname, "com.apple.driver.AppleInterruptController")) { - return TRUE; - } -#endif - - return FALSE; -} - -/* - * FBT probe name validation - */ -int -fbt_excluded(const char* name) -{ - /* - * If the user set this, trust they know what they are doing. - */ - if (ignore_fbt_blacklist) { - return FALSE; - } - - if (LIT_STRNSTART(name, "dtrace_") && !LIT_STRNSTART(name, "dtrace_safe_")) { - /* - * Anything beginning with "dtrace_" may be called - * from probe context unless it explitly indicates - * that it won't be called from probe context by - * using the prefix "dtrace_safe_". - */ - return TRUE; - } - - /* - * Place no probes on critical routines (5221096) - */ - if (bsearch( name, critical_blacklist, CRITICAL_BLACKLIST_COUNT, sizeof(name), _cmp ) != NULL) { - return TRUE; - } - - /* - * Place no probes that could be hit in probe context. - */ - if (bsearch( name, probe_ctx_closure, PROBE_CTX_CLOSURE_COUNT, sizeof(name), _cmp ) != NULL) { - return TRUE; - } - - /* - * Place no probes that could be hit in probe context. - * In the interests of safety, some of these may be overly cautious. - * Also exclude very low-level "firmware" class calls. - */ - if (LIT_STRNSTART(name, "cpu_") || /* Coarse */ - LIT_STRNSTART(name, "platform_") || /* Coarse */ - LIT_STRNSTART(name, "machine_") || /* Coarse */ - LIT_STRNSTART(name, "ml_") || /* Coarse */ - LIT_STRNSTART(name, "PE_") || /* Coarse */ - LIT_STRNSTART(name, "rtc_") || /* Coarse */ - LIT_STRNSTART(name, "_rtc_") || - LIT_STRNSTART(name, "rtclock_") || - LIT_STRNSTART(name, "clock_") || - LIT_STRNSTART(name, "bcopy") || - LIT_STRNSTART(name, "pmap_") || - LIT_STRNSTART(name, "hw_") || /* Coarse */ - LIT_STRNSTART(name, "lapic_") || /* Coarse */ - LIT_STRNSTART(name, "OSAdd") || - LIT_STRNSTART(name, "OSBit") || - LIT_STRNSTART(name, "OSDecrement") || - LIT_STRNSTART(name, "OSIncrement") || - LIT_STRNSTART(name, "OSCompareAndSwap") || - LIT_STRNSTART(name, "etimer_") || - LIT_STRNSTART(name, "dtxnu_kern_") || - LIT_STRNSTART(name, "flush_mmu_tlb_")) { - return TRUE; - } - /* - * Fasttrap inner-workings we can't instrument - * on Intel (6230149) - */ - if (LIT_STRNSTART(name, "fasttrap_") || - LIT_STRNSTART(name, "fuword") || - LIT_STRNSTART(name, "suword")) { - return TRUE; - } - - if (LIT_STRNSTART(name, "_dtrace")) { - return TRUE; /* Shims in dtrace.c */ - } - if (LIT_STRNSTART(name, "hibernate_")) { - return TRUE; - } - - /* - * Place no probes in the exception handling path - */ -#if __arm__ || __arm64__ - if (LIT_STRNSTART(name, "fleh_") || - LIT_STRNSTART(name, "sleh_") || - LIT_STRNSTART(name, "timer_state_event") || - LIT_STRNEQL(name, "get_vfp_enabled")) { - return TRUE; - } - - if (LIT_STRNSTART(name, "_ZNK15OSMetaClassBase8metaCastEPK11OSMetaClass") || - LIT_STRNSTART(name, "_ZN15OSMetaClassBase12safeMetaCastEPKS_PK11OSMetaClass") || - LIT_STRNSTART(name, "_ZNK11OSMetaClass13checkMetaCastEPK15OSMetaClassBase")) { - return TRUE; - } -#endif - -#ifdef __x86_64__ - if (LIT_STRNSTART(name, "machine_") || - LIT_STRNSTART(name, "idt64") || - LIT_STRNSTART(name, "ks_") || - LIT_STRNSTART(name, "hndl_") || - LIT_STRNSTART(name, "_intr_") || - LIT_STRNSTART(name, "mapping_") || - LIT_STRNSTART(name, "tsc_") || - LIT_STRNSTART(name, "pmCPU") || - LIT_STRNSTART(name, "pms") || - LIT_STRNSTART(name, "usimple_") || - LIT_STRNSTART(name, "lck_spin_lock") || - LIT_STRNSTART(name, "lck_spin_unlock") || - LIT_STRNSTART(name, "absolutetime_to_") || - LIT_STRNSTART(name, "commpage_") || - LIT_STRNSTART(name, "ml_") || - LIT_STRNSTART(name, "PE_") || - LIT_STRNSTART(name, "act_machine") || - LIT_STRNSTART(name, "acpi_") || - LIT_STRNSTART(name, "pal_")) { - return TRUE; - } - // Don't Steal Mac OS X - if (LIT_STRNSTART(name, "dsmos_")) { - return TRUE; - } - -#endif - - /* - * Place no probes that could be hit on the way to the debugger. - */ - if (LIT_STRNSTART(name, "kdp_") || - LIT_STRNSTART(name, "kdb_") || - LIT_STRNSTART(name, "debug_")) { - return TRUE; - } - -#if KASAN - if (LIT_STRNSTART(name, "kasan") || - LIT_STRNSTART(name, "__kasan") || - LIT_STRNSTART(name, "__asan")) { - return TRUE; - } -#endif - - /* - * Place no probes that could be hit on the way to a panic. - */ - if (NULL != strstr(name, "panic_")) { - return TRUE; - } - - return FALSE; -} - - /*ARGSUSED*/ static void fbt_destroy(void *arg, dtrace_id_t id, void *parg) @@ -785,7 +319,7 @@ fbt_provide_module_user_syms(struct modctl *ctl) name += 1; } - if (MOD_IS_MACH_KERNEL(ctl) && fbt_excluded(name)) { + if (fbt_excluded(name)) { continue; } @@ -848,7 +382,7 @@ fbt_provide_kernel_section(struct modctl *ctl, kernel_section_t *sect, kernel_nl } #endif /* defined(__arm__) */ - if (MOD_IS_MACH_KERNEL(ctl) && fbt_excluded(name)) { + if (fbt_excluded(name)) { continue; } @@ -970,9 +504,6 @@ fbt_provide_module(void *arg, struct modctl *ctl) if (MOD_HAS_USERSPACE_SYMBOLS(ctl)) { fbt_provide_module_user_syms(ctl); ctl->mod_flags |= MODCTL_FBT_PROBES_PROVIDED; - if (MOD_FBT_PROVIDE_PRIVATE_PROBES(ctl)) { - ctl->mod_flags |= MODCTL_FBT_PRIVATE_PROBES_PROVIDED; - } if (MOD_FBT_PROVIDE_BLACKLISTED_PROBES(ctl)) { ctl->mod_flags |= MODCTL_FBT_BLACKLISTED_PROBES_PROVIDED; } @@ -1046,49 +577,6 @@ _fbt_open(dev_t dev, int flags, int devtype, struct proc *p) #define FBT_MAJOR -24 /* let the kernel pick the device number */ -SYSCTL_DECL(_kern_dtrace); - -static int -sysctl_dtrace_ignore_fbt_blacklist SYSCTL_HANDLER_ARGS -{ -#pragma unused(oidp, arg2) - int err; - int value = *(int*)arg1; - - err = sysctl_io_number(req, value, sizeof(value), &value, NULL); - if (err) { - return err; - } - if (req->newptr) { - if (!(value == 0 || value == 1)) { - return ERANGE; - } - - /* - * We do not allow setting the blacklist back to on, as we have no way - * of knowing if those unsafe probes are still used. - * - * If we are using kernel symbols, we also do not allow any change, - * since the symbols are jettison'd after the first pass. - * - * We do not need to take any locks here because those symbol modes - * are permanent and do not change after boot. - */ - if (value != 1 || dtrace_kernel_symbol_mode == DTRACE_KERNEL_SYMBOLS_NEVER || - dtrace_kernel_symbol_mode == DTRACE_KERNEL_SYMBOLS_ALWAYS_FROM_KERNEL) { - return EPERM; - } - - ignore_fbt_blacklist = 1; - } - - return 0; -} - -SYSCTL_PROC(_kern_dtrace, OID_AUTO, ignore_fbt_blacklist, - CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, - &ignore_fbt_blacklist, 0, - sysctl_dtrace_ignore_fbt_blacklist, "I", "fbt provider ignore blacklist"); /* * A struct describing which functions will get invoked for certain @@ -1116,6 +604,7 @@ static struct cdevsw fbt_cdevsw = #undef kmem_free /* from its binding to dt_kmem_free glue */ #include + void fbt_init( void ) { @@ -1126,8 +615,7 @@ fbt_init( void ) return; } - PE_parse_boot_argn("IgnoreFBTBlacklist", &ignore_fbt_blacklist, sizeof(ignore_fbt_blacklist)); - + fbt_blacklist_init(); fbt_attach((dev_info_t*)(uintptr_t)majdevno); } #undef FBT_MAJOR diff --git a/bsd/dev/dtrace/fbt_blacklist.c b/bsd/dev/dtrace/fbt_blacklist.c new file mode 100644 index 000000000..f8f34ae33 --- /dev/null +++ b/bsd/dev/dtrace/fbt_blacklist.c @@ -0,0 +1,367 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +#include +#include +#include + +#define CLOSURE(s) #s, +#define CRITICAL(s) #s, + +#if KASAN +#define KASAN_ONLY(s) #s, +#else +#define KASAN_ONLY(s) +#endif /* KASAN */ + +#if defined(__arm__) || defined(__arm64__) +#define ARM_ONLY(s) #s, +#else +#define ARM_ONLY(s) +#endif /* defined(__arm__) || defined(__arm64__) */ +#if defined(__x86_64__) +#define X86_ONLY(s) #s, +#else +#define X86_ONLY(s) +#endif /* defined(__x86_64__) */ + +/* + * Routine prefixes that must not be probed, either because they are used in + * the exception path, by dtrace code in probe context, or are general + * critical routines that must never be probed. + * + * All routines whose name start with one of these will be ignored. + * + * This must be kept in asciibetical order for purposes of bsearch(). + */ +const char * fbt_blacklist[] = +{ + CRITICAL(Call_DebuggerC) + CLOSURE(ClearIdlePop) + CLOSURE(Debugger) + CRITICAL(IOCPURunPlatformPanicActions) + CLOSURE(IS_64BIT_PROCESS) + CRITICAL(OSAdd) + CRITICAL(OSBit) + CLOSURE(OSCompareAndSwap) + CRITICAL(OSDecrement) + CRITICAL(OSIncrement) + CRITICAL(PEARMDebugPanicHook) + CRITICAL(PEHaltRestart) + CRITICAL(PE_) + CRITICAL(SavePanicInfo) + CLOSURE(SetIdlePop) + CRITICAL(SysChoked) + CRITICAL(_ZN15OSMetaClassBase12safeMetaCastEPKS_PK11OSMetaClass) /* OSMetaClassBase::safeMetaCast */ + CRITICAL(_ZN16IOPlatformExpert11haltRestartEj) /* IOPlatformExpert::haltRestart */ + CRITICAL(_ZN18IODTPlatformExpert11haltRestartEj) /* IODTPlatformExpert::haltRestart */ + ARM_ONLY(_ZN8ASPNVRAM4syncEv) /* ASPNVRAM::sync */ + CRITICAL(_ZN9IODTNVRAM13savePanicInfoEPhy) /* IODTNVRAM::savePanicInfo */ + CRITICAL(_ZN9IOService14newTemperatureElPS_) /* IOService::newTemperature */ + CRITICAL(_ZN9IOService26temperatureCriticalForZoneEPS_) /* IOService::temperatureCriticalForZone */ + CRITICAL(_ZNK11OSMetaClass13checkMetaCastEPK15OSMetaClassBase) /* OSMetaClass::checkMetaCast */ + CRITICAL(_ZNK15OSMetaClassBase8metaCastEPK11OSMetaClass) /* OSMetaClassBase::metaCast */ + CRITICAL(_ZNK6OSData14getBytesNoCopyEv) /* Data::getBytesNoCopy, IOHibernateSystemWake path */ + KASAN_ONLY(__asan) + ARM_ONLY(__div) + CLOSURE(__dtrace_probe) + KASAN_ONLY(__kasan) + ARM_ONLY(__mod) + CRITICAL(__strlcpy_chk) + ARM_ONLY(__udiv) + ARM_ONLY(__umod) + CRITICAL(_disable_preemption) + CRITICAL(_enable_preemption) + CLOSURE(absolutetime_to_microtime) + X86_ONLY(acpi_) + X86_ONLY(act_machine) + CLOSURE(act_set_astbsd) + ARM_ONLY(alternate_debugger_enter) + ARM_ONLY(arm_init_idle_cpu) + CLOSURE(ast_dtrace_on) + CLOSURE(ast_pending) + CRITICAL(bcopy) + CLOSURE(clean_dcache) + CLOSURE(clean_mmu_dcache) + CRITICAL(clock_) + X86_ONLY(commpage_) + CRITICAL(console_cpu_alloc) + CRITICAL(console_cpu_free) + CLOSURE(copyin) + CLOSURE(copyout) + CRITICAL(cpu_) + CLOSURE(current_proc) + CLOSURE(current_processor) + CLOSURE(current_task) + CLOSURE(current_thread) + CLOSURE(debug_) + X86_ONLY(dsmos_) + CLOSURE(dtrace_) + CRITICAL(enter_lohandler) + CRITICAL(fasttrap_) + CRITICAL(fbt_invop) + CRITICAL(fbt_perfCallback) + CLOSURE(find_user_regs) + ARM_ONLY(fleh_) + CLOSURE(flush_dcache) + ARM_ONLY(flush_mmu_tlb_) + CLOSURE(flush_tlb64) + CRITICAL(fuword) + CLOSURE(get_bsdtask_info) + CLOSURE(get_bsdthread_info) + CRITICAL(get_preemption_level) + CRITICAL(get_threadtask) + ARM_ONLY(get_vfp_enabled) + CRITICAL(getminor) + CRITICAL(handle_pending_TLB_flushes) + CRITICAL(hibernate_) + X86_ONLY(hndl_) + CRITICAL(hw_) + X86_ONLY(idt64) + CRITICAL(interrupt) + CRITICAL(invalidate_mmu_icache) + CRITICAL(is_saved_state32) + KASAN_ONLY(kasan) + CLOSURE(kauth_cred_get) + CLOSURE(kauth_getgid) + CLOSURE(kauth_getuid) + CRITICAL(kdb_) + CRITICAL(kdp_) + CRITICAL(kernel_preempt_check) + CRITICAL(kernel_trap) + CRITICAL(kprintf) + CRITICAL(ks_) + CLOSURE(kvtophys) + X86_ONLY(lapic_) + CRITICAL(lo_alltraps) + CRITICAL(lock_debugger) + CLOSURE(mach_absolute_time) + CRITICAL(machine_) + X86_ONLY(mapping_) + CRITICAL(mca_cpu_alloc) + CRITICAL(mca_cpu_init) + CLOSURE(memcpy) + CLOSURE(memmove) + CRITICAL(ml_) + CLOSURE(mt_core_snap) + CLOSURE(mt_cur_cpu_cycles) + CLOSURE(mt_cur_cpu_instrs) + CLOSURE(mt_cur_thread_cycles) + CLOSURE(mt_cur_thread_instrs) + CLOSURE(mt_fixed_counts) + CLOSURE(mt_fixed_counts_internal) + CLOSURE(mt_mtc_update_count) + CLOSURE(mt_update_thread) + CRITICAL(nanoseconds_to_absolutetime) + CRITICAL(nanotime_to_absolutetime) + CRITICAL(ovbcopy) + CRITICAL(packA) + X86_ONLY(pal_) + CLOSURE(panic) + CRITICAL(phystokv) + CRITICAL(platform_) + X86_ONLY(pltrace) + X86_ONLY(pmCPU) + X86_ONLY(pmKextRegister) + X86_ONLY(pmMarkAllCPUsOff) + X86_ONLY(pmSafeMode) + X86_ONLY(pmTimerRestore) + X86_ONLY(pmTimerSave) + X86_ONLY(pmUnRegister) + X86_ONLY(pmap64_pdpt) + CLOSURE(pmap_find_phys) + CLOSURE(pmap_get_mapwindow) + CLOSURE(pmap_pde) + CLOSURE(pmap_pde_internal0) + CLOSURE(pmap_pde_internal1) + CLOSURE(pmap_pte) + CLOSURE(pmap_pte_internal) + CLOSURE(pmap_put_mapwindow) + CLOSURE(pmap_valid_page) + X86_ONLY(pms) + CRITICAL(power_management_init) + CRITICAL(preemption_underflow_panic) + CLOSURE(prf) + CLOSURE(proc_is64bit) + CLOSURE(proc_selfname) + CRITICAL(register_cpu_setup_func) + CRITICAL(ret64_iret) + CRITICAL(ret_to_user) + CRITICAL(return_to_kernel) + CRITICAL(return_to_user) + CRITICAL(rtc_) + CRITICAL(rtclock_) + CRITICAL(saved_state64) + CLOSURE(sdt_getargdesc) + CRITICAL(sdt_invop) + CLOSURE(setPop) + ARM_ONLY(sleh_) + CRITICAL(sprlock) + CRITICAL(sprunlock) + CLOSURE(strlcpy) + CRITICAL(strlen) + CRITICAL(strncmp) + CRITICAL(suword) + X86_ONLY(sync_iss_to_iks_unconditionally) + CLOSURE(systrace_stub) + CRITICAL(t_invop) + CLOSURE(timer_grab) + ARM_ONLY(timer_state_event) + CRITICAL(tmrCvt) + CRITICAL(trap_from_kernel) + CRITICAL(tsc_) + CRITICAL(uart_putc) + CRITICAL(unlock_debugger) + CRITICAL(unpackA) + CRITICAL(unregister_cpu_setup_func) + CRITICAL(uread) + CRITICAL(uwrite) + CRITICAL(vstart) +}; +#define BLACKLIST_COUNT (sizeof(fbt_blacklist)/sizeof(fbt_blacklist[0])) + +/* + * Modules that should not be probed. + * + * This must be kept in asciibetical order for purposes of bsearch(). + */ +static const char* fbt_module_blacklist[] = { + X86_ONLY(com.apple.driver.AppleACPIEC) + X86_ONLY(com.apple.driver.AppleACPIPlatform) + ARM_ONLY(com.apple.driver.AppleARMPlatform) + X86_ONLY(com.apple.driver.AppleEFI) + X86_ONLY(com.apple.driver.AppleIntelCPUPowerManagement) + ARM_ONLY(com.apple.driver.AppleInterruptController) + X86_ONLY(com.apple.driver.AppleRTC) + X86_ONLY(com.apple.iokit.IOACPIFamily) +}; +#define MODULE_BLACKLIST_COUNT (sizeof(fbt_module_blacklist)/sizeof(fbt_module_blacklist[0])) + +int ignore_fbt_blacklist = 0; +extern int dtrace_kernel_symbol_mode; + +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wcast-qual" +static int +_cmp(const void *a, const void *b) +{ + const char *v = *(const char **)b; + return strncmp((const char *)a, v, strlen(v)); +} + + +#pragma clang diagnostic pop +/* + * Module validation + */ +bool +fbt_module_excluded(struct modctl* ctl) +{ + const char *excluded; + + ASSERT(!MOD_FBT_DONE(ctl)); + + if (ctl->mod_address == 0 || ctl->mod_size == 0 || !ctl->mod_loaded) { + return true; + } + + if (ignore_fbt_blacklist) { + return false; + } + + excluded = bsearch(ctl->mod_modname, fbt_module_blacklist, + MODULE_BLACKLIST_COUNT, sizeof(fbt_module_blacklist[0]), _cmp); + return excluded; +} + +/* + * FBT probe name validation + */ +bool +fbt_excluded(const char* name) +{ + const char *excluded; + + if (ignore_fbt_blacklist) { + return false; + } + + excluded = bsearch(name, fbt_blacklist, BLACKLIST_COUNT, sizeof(name), + _cmp ); + return excluded; +} + +SYSCTL_DECL(_kern_dtrace); + +static int +sysctl_dtrace_ignore_fbt_blacklist SYSCTL_HANDLER_ARGS +{ +#pragma unused(oidp, arg2) + int err; + int value = *(int*)arg1; + + err = sysctl_io_number(req, value, sizeof(value), &value, NULL); + if (err) { + return err; + } + if (req->newptr) { + if (!(value == 0 || value == 1)) { + return ERANGE; + } + + /* + * We do not allow setting the blacklist back to on, as we have no way + * of knowing if those unsafe probes are still used. + * + * If we are using kernel symbols, we also do not allow any change, + * since the symbols are jettison'd after the first pass. + * + * We do not need to take any locks here because those symbol modes + * are permanent and do not change after boot. + */ + if (value != 1 || dtrace_kernel_symbol_mode == DTRACE_KERNEL_SYMBOLS_NEVER || + dtrace_kernel_symbol_mode == DTRACE_KERNEL_SYMBOLS_ALWAYS_FROM_KERNEL) { + return EPERM; + } + + ignore_fbt_blacklist = 1; + } + + return 0; +} + +SYSCTL_PROC(_kern_dtrace, OID_AUTO, ignore_fbt_blacklist, + CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, + &ignore_fbt_blacklist, 0, + sysctl_dtrace_ignore_fbt_blacklist, "I", "fbt provider ignore blacklist"); + +void +fbt_blacklist_init(void) +{ + PE_parse_boot_argn("IgnoreFBTBlacklist", &ignore_fbt_blacklist, sizeof(ignore_fbt_blacklist)); +#if DEBUG || DEVELOPMENT + for (size_t i = 1; i < BLACKLIST_COUNT; i++) { + if (strcmp(fbt_blacklist[i - 1], fbt_blacklist[i]) > 0) { + panic("unordered fbt blacklist %s > %s", fbt_blacklist[i - 1], fbt_blacklist[i]); + } + } +#endif /* DEBUG || DEVELOPMENT */ +} diff --git a/bsd/dev/dtrace/lockprof.c b/bsd/dev/dtrace/lockprof.c index f7ea6085e..12f777ae2 100644 --- a/bsd/dev/dtrace/lockprof.c +++ b/bsd/dev/dtrace/lockprof.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2019 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -31,6 +31,7 @@ #include #if LOCK_STATS + #define SPIN_HELD 0 #define SPIN_MISS 1 #define SPIN_SPIN 2 @@ -44,7 +45,7 @@ static dtrace_provider_id_t lockprof_id; -decl_lck_mtx_data(extern, lck_grp_lock) +decl_lck_mtx_data(extern, lck_grp_lock); extern queue_head_t lck_grp_queue; extern unsigned int lck_grp_cnt; diff --git a/bsd/dev/dtrace/lockstat.c b/bsd/dev/dtrace/lockstat.c index f28db3a39..8c44121a6 100644 --- a/bsd/dev/dtrace/lockstat.c +++ b/bsd/dev/dtrace/lockstat.c @@ -23,15 +23,6 @@ * Use is subject to license terms. */ -/* #pragma ident "@(#)lockstat.c 1.12 08/01/16 SMI" */ - - -#ifdef KERNEL -#ifndef _KERNEL -#define _KERNEL /* Solaris vs. Darwin */ -#endif -#endif - #include #include #include diff --git a/bsd/dev/dtrace/profile_prvd.c b/bsd/dev/dtrace/profile_prvd.c index 6d36e4cde..2294eedfd 100644 --- a/bsd/dev/dtrace/profile_prvd.c +++ b/bsd/dev/dtrace/profile_prvd.c @@ -23,14 +23,6 @@ * Use is subject to license terms. */ -/* #pragma ident "@(#)profile.c 1.7 07/01/10 SMI" */ - -#ifdef KERNEL -#ifndef _KERNEL -#define _KERNEL /* Solaris vs. Darwin */ -#endif -#endif - #include #include #include @@ -324,9 +316,9 @@ profile_create(hrtime_t interval, const char *name, int kind) return; } - atomic_add_32(&profile_total, 1); + os_atomic_inc(&profile_total, relaxed); if (profile_total > profile_max) { - atomic_add_32(&profile_total, -1); + os_atomic_dec(&profile_total, relaxed); return; } @@ -503,7 +495,7 @@ profile_destroy(void *arg, dtrace_id_t id, void *parg) } ASSERT(profile_total >= 1); - atomic_add_32(&profile_total, -1); + os_atomic_dec(&profile_total, relaxed); } /*ARGSUSED*/ diff --git a/bsd/dev/dtrace/scripts/Makefile b/bsd/dev/dtrace/scripts/Makefile index 1957fb2b0..58fc8b304 100644 --- a/bsd/dev/dtrace/scripts/Makefile +++ b/bsd/dev/dtrace/scripts/Makefile @@ -27,7 +27,7 @@ endif ifeq ($(CURRENT_ARCH_CONFIG),ARM64) -INSTALL_DTRACE_SCRIPTS_LIST += regs_arm64.d +INSTALL_DTRACE_SCRIPTS_LIST += regs_arm64.d ptrauth_arm64.d else ifeq ($(CURRENT_ARCH_CONFIG),ARM) INSTALL_DTRACE_SCRIPTS_LIST += regs_arm.d else @@ -39,7 +39,7 @@ INSTALL_DTRACE_SCRIPTS_FILES = \ $(INSTALL_DTRACE_SCRIPTS_FILES): $(DSTROOT)/$(INSTALL_DTRACE_SCRIPTS_DIR)/% : % $(_v)$(MKDIR) $(DSTROOT)/$(INSTALL_DTRACE_SCRIPTS_DIR) - @echo INSTALL $(@F) + $(call makelog,INSTALL $(@F)) $(_v)$(INSTALL) $(DATA_INSTALL_FLAGS) $< $@ INSTALL_DTRACE_LIBEXEC_FILES = \ @@ -47,7 +47,7 @@ INSTALL_DTRACE_LIBEXEC_FILES = \ $(INSTALL_DTRACE_LIBEXEC_FILES): $(DSTROOT)/$(INSTALL_DTRACE_LIBEXEC_DIR)/% : % $(_v)$(MKDIR) $(DSTROOT)/$(INSTALL_DTRACE_LIBEXEC_DIR) - @echo INSTALL $(@F) + $(call makelog,INSTALL $(@F)) $(_v)$(INSTALL) $(EXEC_INSTALL_FLAGS) $< $@ do_textfiles_install:: $(INSTALL_DTRACE_SCRIPTS_FILES) $(INSTALL_DTRACE_LIBEXEC_FILES) diff --git a/bsd/dev/dtrace/scripts/ptrauth_arm64.d b/bsd/dev/dtrace/scripts/ptrauth_arm64.d new file mode 100644 index 000000000..184c1bf88 --- /dev/null +++ b/bsd/dev/dtrace/scripts/ptrauth_arm64.d @@ -0,0 +1,83 @@ +/* + * Copyright (c) 2019 Apple Computer, Inc. All rights reserved. + * + * @APPLE_LICENSE_HEADER_START@ + * + * The contents of this file constitute Original Code as defined in and + * are subject to the Apple Public Source License Version 1.1 (the + * "License"). You may not use this file except in compliance with the + * License. Please obtain a copy of the License at + * http://www.apple.com/publicsource and read it before using this file. + * + * This Original Code and all software distributed under the License are + * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the + * License for the specific language governing rights and limitations + * under the License. + * + * @APPLE_LICENSE_HEADER_END@ + */ + +enum ptrauth_key { + ptrauth_key_asia = 0, + ptrauth_key_asib = 1, + ptrauth_key_asda = 2, + ptrauth_key_asdb = 3, + + /* A process-independent key which can be used to sign code pointers. + Signing and authenticating with this key is a no-op in processes + which disable ABI pointer authentication. */ + ptrauth_key_process_independent_code = ptrauth_key_asia, + + /* A process-specific key which can be used to sign code pointers. + Signing and authenticating with this key is enforced even in processes + which disable ABI pointer authentication. */ + ptrauth_key_process_dependent_code = ptrauth_key_asib, + + /* A process-independent key which can be used to sign data pointers. + Signing and authenticating with this key is a no-op in processes + which disable ABI pointer authentication. */ + ptrauth_key_process_independent_data = ptrauth_key_asda, + + /* A process-specific key which can be used to sign data pointers. + Signing and authenticating with this key is a no-op in processes + which disable ABI pointer authentication. */ + ptrauth_key_process_dependent_data = ptrauth_key_asdb, + + /* The key used to sign C function pointers. + The extra data is always 0. */ + ptrauth_key_function_pointer = ptrauth_key_process_independent_code, + + /* The key used to sign return addresses on the stack. + The extra data is based on the storage address of the return address. + On ARM64, that is always the storage address of the return address plus 8 + (or, in other words, the value of the stack pointer on function entry) */ + ptrauth_key_return_address = ptrauth_key_process_dependent_code, + + /* The key used to sign frame pointers on the stack. + The extra data is based on the storage address of the frame pointer. + On ARM64, that is always the storage address of the frame pointer plus 16 + (or, in other words, the value of the stack pointer on function entry) */ + ptrauth_key_frame_pointer = ptrauth_key_process_dependent_data, + + /* The key used to sign block function pointers, including: + invocation functions, + block object copy functions, + block object destroy functions, + __block variable copy functions, and + __block variable destroy functions. + The extra data is always the address at which the function pointer + is stored. + + Note that block object pointers themselves (i.e. the direct + representations of values of block-pointer type) are not signed. */ + ptrauth_key_block_function = ptrauth_key_asia, + + /* The key used to sign C++ v-table pointers. + The extra data is always 0. */ + ptrauth_key_cxx_vtable_pointer = ptrauth_key_asda + +}; + diff --git a/bsd/dev/dtrace/scripts/regs_arm.d b/bsd/dev/dtrace/scripts/regs_arm.d index 23d3b5387..885f9ecfd 100644 --- a/bsd/dev/dtrace/scripts/regs_arm.d +++ b/bsd/dev/dtrace/scripts/regs_arm.d @@ -3,8 +3,6 @@ * Use is subject to license terms. */ -#pragma ident "@(#)regs.d.in 1.0 04/09/28 SMI" - inline int R_R0 = 0; #pragma D binding "1.0" R_R0 inline int R_R1 = 1; diff --git a/bsd/dev/dtrace/scripts/regs_arm64.d b/bsd/dev/dtrace/scripts/regs_arm64.d index 8979dea77..528b96ce6 100644 --- a/bsd/dev/dtrace/scripts/regs_arm64.d +++ b/bsd/dev/dtrace/scripts/regs_arm64.d @@ -3,8 +3,6 @@ * Use is subject to license terms. */ -#pragma ident "@(#)regs.d.in 1.0 04/09/28 SMI" - inline int R_R0 = 0; #pragma D binding "1.0" R_R0 inline int R_R1 = 1; diff --git a/bsd/dev/dtrace/scripts/regs_x86_64.d b/bsd/dev/dtrace/scripts/regs_x86_64.d index 8a5acc699..b18333392 100644 --- a/bsd/dev/dtrace/scripts/regs_x86_64.d +++ b/bsd/dev/dtrace/scripts/regs_x86_64.d @@ -3,8 +3,6 @@ * Use is subject to license terms. */ -#pragma ident "@(#)regs.d.in 1.1 04/09/28 SMI" - inline int R_GS = 0; #pragma D binding "1.0" R_GS inline int R_FS = 1; diff --git a/bsd/dev/dtrace/scripts/unistd.d b/bsd/dev/dtrace/scripts/unistd.d index 7279b3118..ead9d23ce 100644 --- a/bsd/dev/dtrace/scripts/unistd.d +++ b/bsd/dev/dtrace/scripts/unistd.d @@ -23,8 +23,6 @@ * Use is subject to license terms. */ -#pragma ident "@(#)unistd.d 1.4 07/02/20 SMI" - inline int DTRACEFLT_UNKNOWN = 0; /* Unknown fault */ #pragma D binding "1.0" DTRACEFLT_UNKNOWN diff --git a/bsd/dev/dtrace/sdt.c b/bsd/dev/dtrace/sdt.c index d851fb659..1a38e614b 100644 --- a/bsd/dev/dtrace/sdt.c +++ b/bsd/dev/dtrace/sdt.c @@ -23,14 +23,6 @@ * Use is subject to license terms. */ -/* #pragma ident "@(#)sdt.c 1.9 08/07/01 SMI" */ - -#ifdef KERNEL -#ifndef _KERNEL -#define _KERNEL /* Solaris vs. Darwin */ -#endif -#endif - #include #include #include diff --git a/bsd/dev/dtrace/sdt_subr.c b/bsd/dev/dtrace/sdt_subr.c index 65ae963fd..c9c52fb6a 100644 --- a/bsd/dev/dtrace/sdt_subr.c +++ b/bsd/dev/dtrace/sdt_subr.c @@ -23,8 +23,6 @@ * Use is subject to license terms. */ -/* #pragma ident "@(#)sdt_subr.c 1.13 08/06/13 SMI" */ - #include static dtrace_pattr_t vtrace_attr = { @@ -93,6 +91,10 @@ sdt_provider_t sdt_providers[] = { { "sysevent", "__sysevent____", &stab_attr, 0 }, { "sdt", "__sdt____", &sdt_attr, 0 }, { "boost", "__boost____", &stab_attr, 0}, + { "route", "__route____", &stab_attr, 0 }, +#if KASAN + { "kasan", "__kasan____", &stab_attr, 0 }, +#endif { NULL, NULL, NULL, 0 } }; diff --git a/bsd/dev/dtrace/systrace.c b/bsd/dev/dtrace/systrace.c index 27d199eeb..ef85a1fca 100644 --- a/bsd/dev/dtrace/systrace.c +++ b/bsd/dev/dtrace/systrace.c @@ -23,14 +23,6 @@ * Use is subject to license terms. */ -/* #pragma ident "@(#)systrace.c 1.6 06/09/19 SMI" */ - -#ifdef KERNEL -#ifndef _KERNEL -#define _KERNEL /* Solaris vs. Darwin */ -#endif -#endif - #include #include diff --git a/bsd/dev/dtrace/systrace.h b/bsd/dev/dtrace/systrace.h index f7b92bc9f..b8976d2d0 100644 --- a/bsd/dev/dtrace/systrace.h +++ b/bsd/dev/dtrace/systrace.h @@ -27,22 +27,12 @@ #ifndef _SYS_SYSTRACE_H #define _SYS_SYSTRACE_H -/* #pragma ident "@(#)systrace.h 1.3 06/09/19 SMI" */ - -#ifdef KERNEL -#ifndef _KERNEL -#define _KERNEL /* Solaris vs. Darwin */ -#endif -#endif - #include #ifdef __cplusplus extern "C" { #endif -#ifdef _KERNEL - typedef struct systrace_sysent { dtrace_id_t stsy_entry; dtrace_id_t stsy_return; @@ -62,8 +52,6 @@ extern int32_t dtrace_systrace_syscall(struct proc *, void *, int *); extern void dtrace_systrace_syscall_return(unsigned short, int, int *); -#endif /* _KERNEL */ - #ifdef __cplusplus } #endif diff --git a/bsd/dev/i386/conf.c b/bsd/dev/i386/conf.c index 094506014..e81719bf6 100644 --- a/bsd/dev/i386/conf.c +++ b/bsd/dev/i386/conf.c @@ -238,12 +238,7 @@ struct cdevsw cdevsw[] = { kmioctl, nullstop, nullreset, km_tty, ttselect, eno_mmap, eno_strat, eno_getc, eno_putc, 0 }, - [13 ... 41] = NO_CDEVICE, - [42] = { - volopen, volclose, eno_rdwrt, eno_rdwrt, - volioctl, eno_stop, eno_reset, 0, (select_fcn_t *) seltrue, - eno_mmap, eno_strat, eno_getc, eno_putc, 0 - } + [13 ... 63] = NO_CDEVICE, }; const int nchrdev = sizeof(cdevsw) / sizeof(cdevsw[0]); @@ -260,7 +255,7 @@ isdisk(dev_t dev, int type) switch (type) { case VCHR: - maj = chrtoblk(maj); + maj = chrtoblk(dev); if (maj == NODEV) { break; } @@ -274,32 +269,7 @@ isdisk(dev_t dev, int type) return 0; } -static int chrtoblktab[] = { - /* CHR*/ /* BLK*/ /* CHR*/ /* BLK*/ - /* 0 */ NODEV, /* 1 */ NODEV, - /* 2 */ NODEV, /* 3 */ NODEV, - /* 4 */ NODEV, /* 5 */ NODEV, - /* 6 */ NODEV, /* 7 */ NODEV, - /* 8 */ NODEV, /* 9 */ NODEV, - /* 10 */ NODEV, /* 11 */ NODEV, - /* 12 */ NODEV, /* 13 */ NODEV, - /* 14 */ NODEV, /* 15 */ NODEV, - /* 16 */ NODEV, /* 17 */ NODEV, - /* 18 */ NODEV, /* 19 */ NODEV, - /* 20 */ NODEV, /* 21 */ NODEV, - /* 22 */ NODEV, /* 23 */ NODEV, - /* 24 */ NODEV, /* 25 */ NODEV, - /* 26 */ NODEV, /* 27 */ NODEV, - /* 28 */ NODEV, /* 29 */ NODEV, - /* 30 */ NODEV, /* 31 */ NODEV, - /* 32 */ NODEV, /* 33 */ NODEV, - /* 34 */ NODEV, /* 35 */ NODEV, - /* 36 */ NODEV, /* 37 */ NODEV, - /* 38 */ NODEV, /* 39 */ NODEV, - /* 40 */ NODEV, /* 41 */ NODEV, - /* 42 */ NODEV, /* 43 */ NODEV, - /* 44 */ NODEV, -}; +static int chrtoblktab[] = {[0 ... nchrdev] = NODEV }; /* * convert chr dev to blk dev diff --git a/bsd/dev/i386/dis_tables.c b/bsd/dev/i386/dis_tables.c index c67273b79..f167167ca 100644 --- a/bsd/dev/i386/dis_tables.c +++ b/bsd/dev/i386/dis_tables.c @@ -40,9 +40,6 @@ * It needs to be in sync with this file. */ -/* - * #pragma ident "@(#)dis_tables.c 1.18 08/05/24 SMI" - */ #include #include #include diff --git a/bsd/dev/i386/dtrace_isa.c b/bsd/dev/i386/dtrace_isa.c index 458fc15b3..6785dc536 100644 --- a/bsd/dev/i386/dtrace_isa.c +++ b/bsd/dev/i386/dtrace_isa.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2005-2012 Apple Inc. All rights reserved. + * Copyright (c) 2005-2018 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -26,7 +26,6 @@ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ -#define MACH__POSIX_C_SOURCE_PRIVATE 1 /* pulls in suitable savearea from mach/ppc/thread_status.h */ #include #include diff --git a/bsd/dev/i386/dtrace_subr_x86.c b/bsd/dev/i386/dtrace_subr_x86.c index a5064d688..e78af6efc 100644 --- a/bsd/dev/i386/dtrace_subr_x86.c +++ b/bsd/dev/i386/dtrace_subr_x86.c @@ -24,10 +24,6 @@ * Use is subject to license terms. */ -/* - * #pragma ident "@(#)dtrace_subr.c 1.16 07/09/18 SMI" - */ - #include #include #include @@ -204,110 +200,6 @@ dtrace_user_probe(x86_saved_state_t *regs) return KERN_FAILURE; } -void -dtrace_safe_synchronous_signal(void) -{ -#if 0 - kthread_t *t = curthread; - struct regs *rp = lwptoregs(ttolwp(t)); - size_t isz = t->t_dtrace_npc - t->t_dtrace_pc; - - ASSERT(t->t_dtrace_on); - - /* - * If we're not in the range of scratch addresses, we're not actually - * tracing user instructions so turn off the flags. If the instruction - * we copied out caused a synchonous trap, reset the pc back to its - * original value and turn off the flags. - */ - if (rp->r_pc < t->t_dtrace_scrpc || - rp->r_pc > t->t_dtrace_astpc + isz) { - t->t_dtrace_ft = 0; - } else if (rp->r_pc == t->t_dtrace_scrpc || - rp->r_pc == t->t_dtrace_astpc) { - rp->r_pc = t->t_dtrace_pc; - t->t_dtrace_ft = 0; - } -#endif /* 0 */ -} - -int -dtrace_safe_defer_signal(void) -{ -#if 0 - kthread_t *t = curthread; - struct regs *rp = lwptoregs(ttolwp(t)); - size_t isz = t->t_dtrace_npc - t->t_dtrace_pc; - - ASSERT(t->t_dtrace_on); - - /* - * If we're not in the range of scratch addresses, we're not actually - * tracing user instructions so turn off the flags. - */ - if (rp->r_pc < t->t_dtrace_scrpc || - rp->r_pc > t->t_dtrace_astpc + isz) { - t->t_dtrace_ft = 0; - return (0); - } - - /* - * If we've executed the original instruction, but haven't performed - * the jmp back to t->t_dtrace_npc or the clean up of any registers - * used to emulate %rip-relative instructions in 64-bit mode, do that - * here and take the signal right away. We detect this condition by - * seeing if the program counter is the range [scrpc + isz, astpc). - */ - if (t->t_dtrace_astpc - rp->r_pc < - t->t_dtrace_astpc - t->t_dtrace_scrpc - isz) { -#ifdef __sol64 - /* - * If there is a scratch register and we're on the - * instruction immediately after the modified instruction, - * restore the value of that scratch register. - */ - if (t->t_dtrace_reg != 0 && - rp->r_pc == t->t_dtrace_scrpc + isz) { - switch (t->t_dtrace_reg) { - case REG_RAX: - rp->r_rax = t->t_dtrace_regv; - break; - case REG_RCX: - rp->r_rcx = t->t_dtrace_regv; - break; - case REG_R8: - rp->r_r8 = t->t_dtrace_regv; - break; - case REG_R9: - rp->r_r9 = t->t_dtrace_regv; - break; - } - } -#endif - rp->r_pc = t->t_dtrace_npc; - t->t_dtrace_ft = 0; - return (0); - } - - /* - * Otherwise, make sure we'll return to the kernel after executing - * the copied out instruction and defer the signal. - */ - if (!t->t_dtrace_step) { - ASSERT(rp->r_pc < t->t_dtrace_astpc); - rp->r_pc += t->t_dtrace_astpc - t->t_dtrace_scrpc; - t->t_dtrace_step = 1; - } - - t->t_dtrace_ast = 1; - - return (1); - -#endif /* 0 */ - - return 0; -} - void dtrace_flush_caches(void) { diff --git a/bsd/dev/i386/fasttrap_isa.c b/bsd/dev/i386/fasttrap_isa.c index 0e9e97849..6801862e0 100644 --- a/bsd/dev/i386/fasttrap_isa.c +++ b/bsd/dev/i386/fasttrap_isa.c @@ -24,16 +24,6 @@ * Use is subject to license terms. */ -/* - * #pragma ident "@(#)fasttrap_isa.c 1.27 08/04/09 SMI" - */ - -#ifdef KERNEL -#ifndef _KERNEL -#define _KERNEL /* Solaris vs. Darwin */ -#endif -#endif - #include #include #include @@ -235,7 +225,7 @@ fasttrap_anarg(x86_saved_state_t *regs, int function_entry, int argno) if (p_model == DATAMODEL_LP64) { user_addr_t stack; - + /* * In 64-bit mode, the first six arguments are stored in * registers. @@ -725,8 +715,8 @@ fasttrap_return_common(x86_saved_state_t *regs, user_addr_t pc, pid_t pid, continue; if (probe->ftp_prov->ftp_provider_type == DTFTP_PROVIDER_ONESHOT) { - uint8_t already_triggered = atomic_or_8(&probe->ftp_triggered, 1); - if (already_triggered) { + if (os_atomic_xchg(&probe->ftp_triggered, 1, relaxed)) { + /* already triggered */ continue; } } @@ -767,14 +757,14 @@ fasttrap_return_common(x86_saved_state_t *regs, user_addr_t pc, pid_t pid, static void fasttrap_sigsegv(proc_t *p, uthread_t t, user_addr_t addr) -{ +{ proc_lock(p); /* Set fault address and mark signal */ t->uu_code = addr; t->uu_siglist |= sigmask(SIGSEGV); - /* + /* * XXX These two line may be redundant; if not, then we need * XXX to potentially set the data address in the machine * XXX specific thread state structure to indicate the address. @@ -1041,10 +1031,10 @@ fasttrap_pid_probe32(x86_saved_state_t *regs) if (tp->ftt_ids != NULL) { fasttrap_id_t *id; - + uint32_t s0, s1, s2, s3, s4, s5; uint32_t *stack = (uint32_t *)(uintptr_t)(regs32->uesp); - + /* * In 32-bit mode, all arguments are passed on the * stack. If this is a function entry probe, we need @@ -1058,17 +1048,17 @@ fasttrap_pid_probe32(x86_saved_state_t *regs) fasttrap_fuword32_noerr((user_addr_t)(unsigned long)&stack[3], &s3); fasttrap_fuword32_noerr((user_addr_t)(unsigned long)&stack[4], &s4); fasttrap_fuword32_noerr((user_addr_t)(unsigned long)&stack[5], &s5); - + for (id = tp->ftt_ids; id != NULL; id = id->fti_next) { fasttrap_probe_t *probe = id->fti_probe; - + if (ISSET(current_proc()->p_lflag, P_LNOATTACH)) { - dtrace_probe(dtrace_probeid_error, 0 /* state */, probe->ftp_id, + dtrace_probe(dtrace_probeid_error, 0 /* state */, probe->ftp_id, 1 /* ndx */, -1 /* offset */, DTRACEFLT_UPRIV); } else { if (probe->ftp_prov->ftp_provider_type == DTFTP_PROVIDER_ONESHOT) { - uint8_t already_triggered = atomic_or_8(&probe->ftp_triggered, 1); - if (already_triggered) { + if (os_atomic_xchg(&probe->ftp_triggered, 1, relaxed)) { + /* already triggered */ continue; } } @@ -1182,10 +1172,10 @@ fasttrap_pid_probe32(x86_saved_state_t *regs) new_pc = pc; break; } - + if (tp->ftt_type == FASTTRAP_T_RET16) addr += tp->ftt_dest; - + regs32->uesp = addr; new_pc = dst; break; @@ -1194,7 +1184,7 @@ fasttrap_pid_probe32(x86_saved_state_t *regs) case FASTTRAP_T_JCC: { uint_t taken; - + switch (tp->ftt_code) { case FASTTRAP_JO: taken = (regs32->efl & FASTTRAP_EFLAGS_OF) != 0; @@ -1255,7 +1245,7 @@ fasttrap_pid_probe32(x86_saved_state_t *regs) default: taken = FALSE; } - + if (taken) new_pc = tp->ftt_dest; else @@ -1283,7 +1273,7 @@ fasttrap_pid_probe32(x86_saved_state_t *regs) default: taken = FALSE; } - + if (taken) new_pc = tp->ftt_dest; else @@ -1294,7 +1284,7 @@ fasttrap_pid_probe32(x86_saved_state_t *regs) case FASTTRAP_T_JCXZ: { greg_t cx = regs32->ecx; - + if (cx == 0) new_pc = tp->ftt_dest; else @@ -1306,18 +1296,18 @@ fasttrap_pid_probe32(x86_saved_state_t *regs) { user_addr_t addr = regs32->uesp - sizeof (uint32_t); int ret = fasttrap_suword32(addr, (uint32_t)regs32->ebp); - + if (ret == -1) { fasttrap_sigsegv(p, uthread, addr); new_pc = pc; break; } - + regs32->uesp = addr; new_pc = pc + tp->ftt_size; break; } - + case FASTTRAP_T_NOP: new_pc = pc + tp->ftt_size; break; @@ -1334,7 +1324,7 @@ fasttrap_pid_probe32(x86_saved_state_t *regs) if (tp->ftt_index != FASTTRAP_NOREG) addr += fasttrap_getreg(regs, tp->ftt_index) << tp->ftt_scale; - + if (tp->ftt_code == 1) { /* * If there's a segment prefix for this @@ -1348,7 +1338,7 @@ fasttrap_pid_probe32(x86_saved_state_t *regs) new_pc = pc; break; } - + uint32_t value32; addr = (user_addr_t)(uint32_t)addr; if (fasttrap_fuword32(addr, &value32) == -1) { @@ -1371,13 +1361,13 @@ fasttrap_pid_probe32(x86_saved_state_t *regs) if (tp->ftt_type == FASTTRAP_T_CALL) { user_addr_t addr = regs32->uesp - sizeof (uint32_t); int ret = fasttrap_suword32(addr, (uint32_t)(pc + tp->ftt_size)); - + if (ret == -1) { fasttrap_sigsegv(p, uthread, addr); new_pc = pc; break; } - + regs32->uesp = addr; } break; @@ -1456,7 +1446,7 @@ fasttrap_pid_probe32(x86_saved_state_t *regs) i += tp->ftt_size; scratch[i++] = FASTTRAP_INT; scratch[i++] = T_DTRACE_RET; - + ASSERT(i <= sizeof (scratch)); if (fasttrap_copyout(scratch, write_addr, i)) { @@ -1464,7 +1454,7 @@ fasttrap_pid_probe32(x86_saved_state_t *regs) new_pc = pc; break; } - + if (tp->ftt_retids != NULL) { uthread->t_dtrace_step = 1; uthread->t_dtrace_ret = 1; @@ -1472,17 +1462,17 @@ fasttrap_pid_probe32(x86_saved_state_t *regs) } else { new_pc = uthread->t_dtrace_scrpc; } - + uthread->t_dtrace_pc = pc; uthread->t_dtrace_npc = pc + tp->ftt_size; uthread->t_dtrace_on = 1; break; } - + default: panic("fasttrap: mishandled an instruction"); } - + done: /* * APPLE NOTE: @@ -1619,10 +1609,10 @@ fasttrap_pid_probe64(x86_saved_state_t *regs) for (id = tp->ftt_ids; id != NULL; id = id->fti_next) { fasttrap_probe_t *probe = id->fti_probe; - + if (probe->ftp_prov->ftp_provider_type == DTFTP_PROVIDER_ONESHOT) { - uint8_t already_triggered = atomic_or_8(&probe->ftp_triggered, 1); - if (already_triggered) { + if (os_atomic_xchg(&probe->ftp_triggered, 1, relaxed)) { + /* already triggered */ continue; } } @@ -1635,7 +1625,7 @@ fasttrap_pid_probe64(x86_saved_state_t *regs) retire_tp = 0; } if (ISSET(current_proc()->p_lflag, P_LNOATTACH)) { - dtrace_probe(dtrace_probeid_error, 0 /* state */, probe->ftp_id, + dtrace_probe(dtrace_probeid_error, 0 /* state */, probe->ftp_id, 1 /* ndx */, -1 /* offset */, DTRACEFLT_UPRIV); } else if (id->fti_ptype == DTFTP_ENTRY) { /* @@ -1665,10 +1655,10 @@ fasttrap_pid_probe64(x86_saved_state_t *regs) regs64->r8); } else { uint64_t t[5]; - + fasttrap_usdt_args64(probe, regs64, sizeof (t) / sizeof (t[0]), t); - + dtrace_probe(probe->ftp_id, t[0], t[1], t[2], t[3], t[4]); } @@ -1725,7 +1715,7 @@ fasttrap_pid_probe64(x86_saved_state_t *regs) user_addr_t dst; user_addr_t addr; int ret; - + /* * We have to emulate _every_ facet of the behavior of a ret * instruction including what happens if the load from %esp @@ -1733,25 +1723,25 @@ fasttrap_pid_probe64(x86_saved_state_t *regs) */ ret = fasttrap_fuword64((user_addr_t)regs64->isf.rsp, &dst); addr = regs64->isf.rsp + sizeof (uint64_t); - + if (ret == -1) { fasttrap_sigsegv(p, uthread, (user_addr_t)regs64->isf.rsp); new_pc = pc; break; } - + if (tp->ftt_type == FASTTRAP_T_RET16) addr += tp->ftt_dest; - + regs64->isf.rsp = addr; new_pc = dst; break; } - + case FASTTRAP_T_JCC: { uint_t taken; - + switch (tp->ftt_code) { case FASTTRAP_JO: taken = (regs64->isf.rflags & FASTTRAP_EFLAGS_OF) != 0; @@ -1812,7 +1802,7 @@ fasttrap_pid_probe64(x86_saved_state_t *regs) default: taken = FALSE; } - + if (taken) new_pc = tp->ftt_dest; else @@ -1824,7 +1814,7 @@ fasttrap_pid_probe64(x86_saved_state_t *regs) { uint_t taken; uint64_t cx = regs64->rcx--; - + switch (tp->ftt_code) { case FASTTRAP_LOOPNZ: taken = (regs64->isf.rflags & FASTTRAP_EFLAGS_ZF) == 0 && @@ -1840,14 +1830,14 @@ fasttrap_pid_probe64(x86_saved_state_t *regs) default: taken = FALSE; } - + if (taken) new_pc = tp->ftt_dest; else new_pc = pc + tp->ftt_size; break; } - + case FASTTRAP_T_JCXZ: { uint64_t cx = regs64->rcx; @@ -1863,18 +1853,18 @@ fasttrap_pid_probe64(x86_saved_state_t *regs) { user_addr_t addr = regs64->isf.rsp - sizeof (uint64_t); int ret = fasttrap_suword64(addr, (uint64_t)regs64->rbp); - + if (ret == -1) { fasttrap_sigsegv(p, uthread, addr); new_pc = pc; break; } - + regs64->isf.rsp = addr; new_pc = pc + tp->ftt_size; break; } - + case FASTTRAP_T_NOP: new_pc = pc + tp->ftt_size; break; @@ -1885,13 +1875,13 @@ fasttrap_pid_probe64(x86_saved_state_t *regs) new_pc = tp->ftt_dest; } else { user_addr_t value, addr = tp->ftt_dest; - + if (tp->ftt_base != FASTTRAP_NOREG) addr += fasttrap_getreg(regs, tp->ftt_base); if (tp->ftt_index != FASTTRAP_NOREG) addr += fasttrap_getreg(regs, tp->ftt_index) << tp->ftt_scale; - + if (tp->ftt_code == 1) { /* * If there's a segment prefix for this @@ -1905,7 +1895,7 @@ fasttrap_pid_probe64(x86_saved_state_t *regs) new_pc = pc; break; } - + if (fasttrap_fuword64(addr, &value) == -1) { fasttrap_sigsegv(p, uthread, addr); new_pc = pc; @@ -1926,13 +1916,13 @@ fasttrap_pid_probe64(x86_saved_state_t *regs) if (tp->ftt_type == FASTTRAP_T_CALL) { user_addr_t addr = regs64->isf.rsp - sizeof (uint64_t); int ret = fasttrap_suword64(addr, pc + tp->ftt_size); - + if (ret == -1) { fasttrap_sigsegv(p, uthread, addr); new_pc = pc; break; } - + regs64->isf.rsp = addr; } break; @@ -1942,7 +1932,7 @@ fasttrap_pid_probe64(x86_saved_state_t *regs) user_addr_t addr, write_addr; uint8_t scratch[2 * FASTTRAP_MAX_INSTR_SIZE + 22]; uint_t i = 0; - + /* * Generic Instruction Tracing * --------------------------- @@ -2043,10 +2033,10 @@ fasttrap_pid_probe64(x86_saved_state_t *regs) if (tp->ftt_ripmode != 0) { uint64_t* reg; - + ASSERT(tp->ftt_ripmode & (FASTTRAP_RIP_1 | FASTTRAP_RIP_2)); - + /* * If this was a %rip-relative instruction, we change * it to be either a %rax- or %rcx-relative @@ -2060,12 +2050,12 @@ fasttrap_pid_probe64(x86_saved_state_t *regs) scratch[i++] = FASTTRAP_REX(1, 0, 0, 1); else scratch[i++] = FASTTRAP_REX(1, 0, 0, 0); - + if (tp->ftt_ripmode & FASTTRAP_RIP_1) scratch[i++] = FASTTRAP_MOV_EAX; else scratch[i++] = FASTTRAP_MOV_ECX; - + switch (tp->ftt_ripmode) { case FASTTRAP_RIP_1: reg = ®s64->rax; @@ -2087,7 +2077,7 @@ fasttrap_pid_probe64(x86_saved_state_t *regs) reg = NULL; panic("unhandled ripmode in fasttrap_pid_probe64"); } - + /* LINTED - alignment */ *(uint64_t *)&scratch[i] = *reg; uthread->t_dtrace_regv = *reg; @@ -2132,17 +2122,17 @@ fasttrap_pid_probe64(x86_saved_state_t *regs) } else { new_pc = uthread->t_dtrace_scrpc; } - + uthread->t_dtrace_pc = pc; uthread->t_dtrace_npc = pc + tp->ftt_size; uthread->t_dtrace_on = 1; break; } - + default: panic("fasttrap: mishandled an instruction"); } - + done: /* * APPLE NOTE: diff --git a/bsd/dev/i386/fasttrap_regset.h b/bsd/dev/i386/fasttrap_regset.h index 348e04a30..e286708cb 100644 --- a/bsd/dev/i386/fasttrap_regset.h +++ b/bsd/dev/i386/fasttrap_regset.h @@ -36,10 +36,6 @@ * APPLE NOTE: This file was orginally uts/intel/sys/regset.h */ -/* - * #pragma ident "@(#)regset.h 1.11 05/06/08 SMI" - */ - #ifdef __cplusplus extern "C" { #endif diff --git a/bsd/dev/i386/fbt_x86.c b/bsd/dev/i386/fbt_x86.c index 63d1a8430..0b7d9516e 100644 --- a/bsd/dev/i386/fbt_x86.c +++ b/bsd/dev/i386/fbt_x86.c @@ -24,15 +24,6 @@ * Use is subject to license terms. */ -/* #pragma ident "@(#)fbt.c 1.15 05/09/19 SMI" */ - -#ifdef KERNEL -#ifndef _KERNEL -#define _KERNEL /* Solaris vs. Darwin */ -#endif -#endif - -#define MACH__POSIX_C_SOURCE_PRIVATE 1 /* pulls in suitable savearea from mach/ppc/thread_status.h */ #include #include #include diff --git a/bsd/dev/i386/instr_size.c b/bsd/dev/i386/instr_size.c index 14f7ea974..6d7c48b7a 100644 --- a/bsd/dev/i386/instr_size.c +++ b/bsd/dev/i386/instr_size.c @@ -27,11 +27,6 @@ /* Copyright (c) 1988 AT&T */ /* All Rights Reserved */ - -/* - * #pragma ident "@(#)instr_size.c 1.14 05/07/08 SMI" - */ - #include #include diff --git a/bsd/dev/i386/kern_machdep.c b/bsd/dev/i386/kern_machdep.c index 1b3d774f3..1512e6b0c 100644 --- a/bsd/dev/i386/kern_machdep.c +++ b/bsd/dev/i386/kern_machdep.c @@ -40,6 +40,10 @@ #include #include +#if __x86_64__ +extern int bootarg_no32exec; /* bsd_init.c */ +#endif + /********************************************************************** * Routine: grade_binary() * @@ -48,7 +52,7 @@ * by 32-bit binaries. 0 means unsupported. **********************************************************************/ int -grade_binary(cpu_type_t exectype, cpu_subtype_t execsubtype) +grade_binary(cpu_type_t exectype, cpu_subtype_t execsubtype, bool allow_simulator_binary __unused) { cpu_subtype_t hostsubtype = cpu_subtype(); @@ -72,6 +76,11 @@ grade_binary(cpu_type_t exectype, cpu_subtype_t execsubtype) } break; case CPU_TYPE_X86: /* native */ +#if __x86_64__ + if (bootarg_no32exec && !allow_simulator_binary) { + return 0; + } +#endif return 1; } diff --git a/bsd/dev/i386/sdt_x86.c b/bsd/dev/i386/sdt_x86.c index b5c244cd8..4b78fe791 100644 --- a/bsd/dev/i386/sdt_x86.c +++ b/bsd/dev/i386/sdt_x86.c @@ -23,15 +23,6 @@ * Use is subject to license terms. */ -/* #pragma ident "@(#)sdt.c 1.9 08/07/01 SMI" */ - -#ifdef KERNEL -#ifndef _KERNEL -#define _KERNEL /* Solaris vs. Darwin */ -#endif -#endif - -#define MACH__POSIX_C_SOURCE_PRIVATE 1 /* pulls in suitable savearea from mach/ppc/thread_status.h */ #include #include #include diff --git a/bsd/dev/i386/sysctl.c b/bsd/dev/i386/sysctl.c index 39dd11110..2300e0b7f 100644 --- a/bsd/dev/i386/sysctl.c +++ b/bsd/dev/i386/sysctl.c @@ -1027,9 +1027,28 @@ SYSCTL_INT(_machdep, OID_AUTO, fpsimd_fault_popc, CTLFLAG_KERN | CTLFLAG_RW | CTLFLAG_LOCKED, &fpsimd_fault_popc, 0, ""); -extern int allow_64bit_proc_LDT_ops; -SYSCTL_INT(_machdep, OID_AUTO, ldt64, - CTLFLAG_KERN | CTLFLAG_RW | CTLFLAG_LOCKED, - &allow_64bit_proc_LDT_ops, 0, ""); +volatile int stop_spinning; +static int +spin_in_the_kernel(__unused struct sysctl_oid *oidp, __unused void *arg1, __unused int arg2, struct sysctl_req *req) +{ + int new = 0, old = 0, changed = 0, error; + + error = sysctl_io_number(req, old, sizeof(int), &new, &changed); + if (error == 0 && changed) { + stop_spinning = FALSE; + while (stop_spinning == FALSE) { + __builtin_ia32_pause(); + } + } else if (error == 0) { + stop_spinning = TRUE; + } + + return error; +} + +SYSCTL_PROC(_machdep_misc, OID_AUTO, spin_forever, + CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED | CTLFLAG_MASKED, + 0, 0, + spin_in_the_kernel, "I", "Spin forever"); #endif /* DEVELOPMENT || DEBUG */ diff --git a/bsd/dev/i386/systemcalls.c b/bsd/dev/i386/systemcalls.c index ebbca2d75..a5a7255bd 100644 --- a/bsd/dev/i386/systemcalls.c +++ b/bsd/dev/i386/systemcalls.c @@ -47,6 +47,7 @@ #include #include #include +#include #include @@ -56,6 +57,10 @@ #include +#if CONFIG_MACF +#include +#endif + #if CONFIG_DTRACE extern int32_t dtrace_systrace_syscall(struct proc *, void *, int *); extern void dtrace_systrace_syscall_return(unsigned short, int, int *); @@ -85,7 +90,7 @@ unix_syscall(x86_saved_state_t *state) { thread_t thread; void *vt; - unsigned int code; + unsigned int code, syscode; struct sysent *callp; int error; @@ -116,19 +121,21 @@ unix_syscall(x86_saved_state_t *state) p = (struct proc *)get_bsdtask_info(current_task()); } - code = regs->eax & I386_SYSCALL_NUMBER_MASK; + code = regs->eax & I386_SYSCALL_NUMBER_MASK; + syscode = (code < nsysent) ? code : SYS_invalid; DEBUG_KPRINT_SYSCALL_UNIX("unix_syscall: code=%d(%s) eip=%u\n", - code, syscallnames[code >= nsysent ? SYS_invalid : code], (uint32_t)regs->eip); + code, syscallnames[syscode], (uint32_t)regs->eip); params = (vm_offset_t) (regs->uesp + sizeof(int)); regs->efl &= ~(EFL_CF); - callp = (code >= nsysent) ? &sysent[SYS_invalid] : &sysent[code]; + callp = &sysent[syscode]; if (__improbable(callp == sysent)) { code = fuword(params); params += sizeof(int); - callp = (code >= nsysent) ? &sysent[SYS_invalid] : &sysent[code]; + syscode = (code < nsysent) ? code : SYS_invalid; + callp = &sysent[syscode]; } vt = (void *)uthread->uu_arg; @@ -152,11 +159,9 @@ unix_syscall(x86_saved_state_t *state) } if (__probable(!code_is_kdebug_trace(code))) { - int *ip = (int *)vt; - - KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, - BSDDBG_CODE(DBG_BSD_EXCP_SC, code) | DBG_FUNC_START, - *ip, *(ip + 1), *(ip + 2), *(ip + 3), 0); + uint32_t *uip = vt; + KDBG_RELEASE(BSDDBG_CODE(DBG_BSD_EXCP_SC, code) | DBG_FUNC_START, + uip[0], uip[1], uip[2], uip[3]); } #if CONFIG_REQUIRES_U32_MUNGING @@ -167,9 +172,7 @@ unix_syscall(x86_saved_state_t *state) } #endif } else { - KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, - BSDDBG_CODE(DBG_BSD_EXCP_SC, code) | DBG_FUNC_START, - 0, 0, 0, 0, 0); + KDBG_RELEASE(BSDDBG_CODE(DBG_BSD_EXCP_SC, code) | DBG_FUNC_START); } /* @@ -189,10 +192,23 @@ unix_syscall(x86_saved_state_t *state) uthread->uu_vpindex = 0; #endif +#if CONFIG_MACF + if (__improbable(p->syscall_filter_mask != NULL && !bitstr_test(p->syscall_filter_mask, syscode))) { + error = mac_proc_check_syscall_unix(p, syscode); + if (error) { + goto skip_syscall; + } + } +#endif /* CONFIG_MACF */ + AUDIT_SYSCALL_ENTER(code, p, uthread); error = (*(callp->sy_call))((void *) p, (void *) vt, &(uthread->uu_rval[0])); AUDIT_SYSCALL_EXIT(code, p, uthread, error); +#if CONFIG_MACF +skip_syscall: +#endif /* CONFIG_MACF */ + #ifdef JOE_DEBUG if (uthread->uu_iocount) { printf("system call returned with uu_iocount != 0\n"); @@ -250,9 +266,8 @@ unix_syscall(x86_saved_state_t *state) throttle_lowpri_io(1); } if (__probable(!code_is_kdebug_trace(code))) { - KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, - BSDDBG_CODE(DBG_BSD_EXCP_SC, code) | DBG_FUNC_END, - error, uthread->uu_rval[0], uthread->uu_rval[1], pid, 0); + KDBG_RELEASE(BSDDBG_CODE(DBG_BSD_EXCP_SC, code) | DBG_FUNC_END, + error, uthread->uu_rval[0], uthread->uu_rval[1], pid); } if (__improbable(!is_vfork && callp->sy_call == (sy_call_t *)execve && !error)) { @@ -275,7 +290,7 @@ unix_syscall64(x86_saved_state_t *state) { thread_t thread; void *vt; - unsigned int code; + unsigned int code, syscode; struct sysent *callp; int args_in_regs; boolean_t args_start_at_rdi; @@ -313,11 +328,12 @@ unix_syscall64(x86_saved_state_t *state) /* NOTREACHED */ } - code = regs->rax & SYSCALL_NUMBER_MASK; + code = regs->rax & SYSCALL_NUMBER_MASK; + syscode = (code < nsysent) ? code : SYS_invalid; DEBUG_KPRINT_SYSCALL_UNIX( "unix_syscall64: code=%d(%s) rip=%llx\n", - code, syscallnames[code >= nsysent ? SYS_invalid : code], regs->isf.rip); - callp = (code >= nsysent) ? &sysent[SYS_invalid] : &sysent[code]; + code, syscallnames[syscode], regs->isf.rip); + callp = &sysent[syscode]; vt = (void *)uthread->uu_arg; @@ -326,8 +342,9 @@ unix_syscall64(x86_saved_state_t *state) * indirect system call... system call number * passed as 'arg0' */ - code = regs->rdi; - callp = (code >= nsysent) ? &sysent[SYS_invalid] : &sysent[code]; + code = regs->rdi; + syscode = (code < nsysent) ? code : SYS_invalid; + callp = &sysent[syscode]; args_start_at_rdi = FALSE; args_in_regs = 5; } else { @@ -341,13 +358,11 @@ unix_syscall64(x86_saved_state_t *state) args_in_regs = MIN(args_in_regs, callp->sy_narg); memcpy(vt, args_start_at_rdi ? ®s->rdi : ®s->rsi, args_in_regs * sizeof(syscall_arg_t)); - if (!code_is_kdebug_trace(code)) { - uint64_t *ip = (uint64_t *)vt; + uint64_t *uip = vt; - KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, - BSDDBG_CODE(DBG_BSD_EXCP_SC, code) | DBG_FUNC_START, - (int)(*ip), (int)(*(ip + 1)), (int)(*(ip + 2)), (int)(*(ip + 3)), 0); + KDBG_RELEASE(BSDDBG_CODE(DBG_BSD_EXCP_SC, code) | DBG_FUNC_START, + uip[0], uip[1], uip[2], uip[3]); } if (__improbable(callp->sy_narg > args_in_regs)) { @@ -364,9 +379,7 @@ unix_syscall64(x86_saved_state_t *state) } } } else { - KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, - BSDDBG_CODE(DBG_BSD_EXCP_SC, code) | DBG_FUNC_START, - 0, 0, 0, 0, 0); + KDBG_RELEASE(BSDDBG_CODE(DBG_BSD_EXCP_SC, code) | DBG_FUNC_START); } /* @@ -386,10 +399,23 @@ unix_syscall64(x86_saved_state_t *state) uthread->uu_vpindex = 0; #endif +#if CONFIG_MACF + if (__improbable(p->syscall_filter_mask != NULL && !bitstr_test(p->syscall_filter_mask, syscode))) { + error = mac_proc_check_syscall_unix(p, syscode); + if (error) { + goto skip_syscall; + } + } +#endif /* CONFIG_MACF */ + AUDIT_SYSCALL_ENTER(code, p, uthread); error = (*(callp->sy_call))((void *) p, vt, &(uthread->uu_rval[0])); AUDIT_SYSCALL_EXIT(code, p, uthread, error); +#if CONFIG_MACF +skip_syscall: +#endif /* CONFIG_MACF */ + #ifdef JOE_DEBUG if (uthread->uu_iocount) { printf("system call returned with uu_iocount != 0\n"); @@ -463,9 +489,8 @@ unix_syscall64(x86_saved_state_t *state) throttle_lowpri_io(1); } if (__probable(!code_is_kdebug_trace(code))) { - KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, - BSDDBG_CODE(DBG_BSD_EXCP_SC, code) | DBG_FUNC_END, - error, uthread->uu_rval[0], uthread->uu_rval[1], pid, 0); + KDBG_RELEASE(BSDDBG_CODE(DBG_BSD_EXCP_SC, code) | DBG_FUNC_END, + error, uthread->uu_rval[0], uthread->uu_rval[1], pid); } #if PROC_REF_DEBUG @@ -602,9 +627,8 @@ unix_syscall_return(int error) throttle_lowpri_io(1); } if (!code_is_kdebug_trace(code)) { - KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, - BSDDBG_CODE(DBG_BSD_EXCP_SC, code) | DBG_FUNC_END, - error, uthread->uu_rval[0], uthread->uu_rval[1], p->p_pid, 0); + KDBG_RELEASE(BSDDBG_CODE(DBG_BSD_EXCP_SC, code) | DBG_FUNC_END, + error, uthread->uu_rval[0], uthread->uu_rval[1], p->p_pid); } thread_exception_return(); diff --git a/bsd/dev/i386/unix_signal.c b/bsd/dev/i386/unix_signal.c index 603b21614..724a1d210 100644 --- a/bsd/dev/i386/unix_signal.c +++ b/bsd/dev/i386/unix_signal.c @@ -37,6 +37,7 @@ #include #include +#include #include #include @@ -160,7 +161,7 @@ siginfo_user_to_user64_x86(user_siginfo_t *in, user64_siginfo_t *out) } void -sendsig(struct proc *p, user_addr_t ua_catcher, int sig, int mask, __unused uint32_t code) +sendsig(struct proc *p, user_addr_t ua_catcher, int sig, int mask, __unused uint32_t code, sigset_t siginfo) { union { struct mcontext_avx32 mctx_avx32; @@ -198,7 +199,7 @@ sendsig(struct proc *p, user_addr_t ua_catcher, int sig, int mask, __unused uint thread = current_thread(); ut = get_bsdthread_info(thread); - if (p->p_sigacts->ps_siginfo & sigmask(sig)) { + if (siginfo & sigmask(sig)) { infostyle = UC_FLAVOR; } @@ -802,6 +803,9 @@ sigreturn(struct proc *p, struct sigreturn_args *uap, __unused int *retval) return 0; } + /* see osfmk/kern/restartable.c */ + act_set_ast_reset_pcs(thread); + bzero(mctxp, sizeof(*mctxp)); sig_xstate = current_xstate(); diff --git a/bsd/dev/memdev.c b/bsd/dev/memdev.c index ffac54d04..184862aa2 100644 --- a/bsd/dev/memdev.c +++ b/bsd/dev/memdev.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2004-2006 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2004-2019 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -131,30 +131,30 @@ extern ppnum_t pmap_find_phys(pmap_t pmap, addr64_t va); */ static struct bdevsw mdevbdevsw = { - /* open */ mdevopen, - /* close */ mdevclose, - /* strategy */ mdevstrategy, - /* ioctl */ mdevbioctl, - /* dump */ eno_dump, - /* psize */ mdevsize, - /* flags */ D_DISK, + .d_open = mdevopen, + .d_close = mdevclose, + .d_strategy = mdevstrategy, + .d_ioctl = mdevbioctl, + .d_dump = eno_dump, + .d_psize = mdevsize, + .d_type = D_DISK, }; static struct cdevsw mdevcdevsw = { - /* open */ mdevopen, - /* close */ mdevclose, - /* read */ mdevrw, - /* write */ mdevrw, - /* ioctl */ mdevcioctl, - /* stop */ eno_stop, - /* reset */ eno_reset, - /* ttys */ NULL, - /* select */ eno_select, - /* mmap */ eno_mmap, - /* strategy */ eno_strat, - /* getc */ eno_getc, - /* putc */ eno_putc, - /* flags */ D_DISK, + .d_open = mdevopen, + .d_close = mdevclose, + .d_read = mdevrw, + .d_write = mdevrw, + .d_ioctl = mdevcioctl, + .d_stop = eno_stop, + .d_reset = eno_reset, + .d_ttys = NULL, + .d_select = eno_select, + .d_mmap = eno_mmap, + .d_strategy = eno_strat, + .d_reserved_1 = eno_getc, + .d_reserved_2 = eno_putc, + .d_type = D_DISK, }; struct mdev { diff --git a/bsd/dev/monotonic.c b/bsd/dev/monotonic.c index 4a320cbbb..6c445d7a3 100644 --- a/bsd/dev/monotonic.c +++ b/bsd/dev/monotonic.c @@ -346,9 +346,11 @@ mt_sysctl SYSCTL_HANDLER_ARGS case MT_SUPPORTED: return sysctl_io_number(req, (int)mt_core_supported, sizeof(int), NULL, NULL); case MT_PMIS: - return sysctl_io_number(req, mt_pmis, sizeof(mt_pmis), NULL, NULL); - case MT_RETROGRADE: - return sysctl_io_number(req, mt_retrograde, sizeof(mt_retrograde), NULL, NULL); + return sysctl_io_number(req, mt_count_pmis(), sizeof(uint64_t), NULL, NULL); + case MT_RETROGRADE: { + uint64_t value = os_atomic_load_wide(&mt_retrograde, relaxed); + return sysctl_io_number(req, value, sizeof(mt_retrograde), NULL, NULL); + } case MT_TASK_THREAD: return sysctl_io_number(req, (int)mt_core_supported, sizeof(int), NULL, NULL); case MT_DEBUG: { diff --git a/bsd/dev/vn/vn.c b/bsd/dev/vn/vn.c index c0819facb..a3926c5db 100644 --- a/bsd/dev/vn/vn.c +++ b/bsd/dev/vn/vn.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2010 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2019 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -142,30 +142,30 @@ static int vndevice_cdev_major; */ static struct bdevsw vn_bdevsw = { - /* open */ vnopen, - /* close */ vnclose, - /* strategy */ vnstrategy, - /* ioctl */ vnioctl_blk, - /* dump */ eno_dump, - /* psize */ vnsize, - /* flags */ D_DISK, + .d_open = vnopen, + .d_close = vnclose, + .d_strategy = vnstrategy, + .d_ioctl = vnioctl_blk, + .d_dump = eno_dump, + .d_psize = vnsize, + .d_type = D_DISK, }; static struct cdevsw vn_cdevsw = { - /* open */ vnopen, - /* close */ vnclose, - /* read */ vnread, - /* write */ vnwrite, - /* ioctl */ vnioctl_chr, - /* stop */ eno_stop, - /* reset */ eno_reset, - /* ttys */ NULL, - /* select */ eno_select, - /* mmap */ eno_mmap, - /* strategy */ eno_strat, - /* getc */ eno_getc, - /* putc */ eno_putc, - /* flags */ D_DISK, + .d_open = vnopen, + .d_close = vnclose, + .d_read = vnread, + .d_write = vnwrite, + .d_ioctl = vnioctl_chr, + .d_stop = eno_stop, + .d_reset = eno_reset, + .d_ttys = NULL, + .d_select = eno_select, + .d_mmap = eno_mmap, + .d_strategy = eno_strat, + .d_reserved_1 = eno_getc, + .d_reserved_2 = eno_putc, + .d_type = D_DISK, }; struct vn_softc { diff --git a/bsd/i386/Makefile b/bsd/i386/Makefile index 5763410f8..ab4a4ac86 100644 --- a/bsd/i386/Makefile +++ b/bsd/i386/Makefile @@ -12,6 +12,9 @@ DATAFILES = \ types.h vmparam.h _types.h _param.h \ _mcontext.h +DRIVERKIT_DATAFILES = \ + limits.h types.h _types.h + PRIVATE_DATAFILES = \ disklabel.h @@ -22,6 +25,7 @@ KERNELFILES = \ _mcontext.h INSTALL_MD_LIST = ${DATAFILES} +INSTALL_DRIVERKIT_MD_LIST = ${DRIVERKIT_DATAFILES} INSTALL_MD_LCL_LIST = ${PRIVATE_DATAFILES} INSTALL_MD_DIR = i386 diff --git a/bsd/i386/dis_tables.h b/bsd/i386/dis_tables.h index b627e5201..1f6f2c781 100644 --- a/bsd/i386/dis_tables.h +++ b/bsd/i386/dis_tables.h @@ -30,8 +30,6 @@ #ifndef _DIS_TABLES_H #define _DIS_TABLES_H -/* #pragma ident "@(#)dis_tables.h 1.10 07/07/10 SMI" */ - /* * Constants and prototypes for the IA32 disassembler backend. See dis_tables.c * for usage information and documentation. diff --git a/bsd/i386/exec.h b/bsd/i386/exec.h deleted file mode 100644 index 24de8642d..000000000 --- a/bsd/i386/exec.h +++ /dev/null @@ -1,120 +0,0 @@ -/* - * Copyright (c) 2000-2002 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/*- - * Copyright (c) 1992, 1993 - * The Regents of the University of California. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * This product includes software developed by the University of - * California, Berkeley and its contributors. - * 4. Neither the name of the University nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * @(#)exec.h 8.1 (Berkeley) 6/11/93 - */ - -#ifndef _BSD_I386_EXEC_H_ -#define _BSD_I386_EXEC_H_ - - -#ifdef BSD_KERNEL_PRIVATE -/* Size of a page in an object file. */ -#define __LDPGSZ 4096 - -/* Valid magic number check. */ -#define N_BADMAG(ex) \ - ((ex).a_magic != NMAGIC && (ex).a_magic != OMAGIC && \ - (ex).a_magic != ZMAGIC) - -/* Address of the bottom of the text segment. */ -#define N_TXTADDR(X) 0 - -/* Address of the bottom of the data segment. */ -#define N_DATADDR(ex) \ - (N_TXTADDR(ex) + ((ex).a_magic == OMAGIC ? (ex).a_text \ - : __LDPGSZ + ((ex).a_text - 1 & ~(__LDPGSZ - 1)))) - -/* Text segment offset. */ -#define N_TXTOFF(ex) \ - ((ex).a_magic == ZMAGIC ? __LDPGSZ : sizeof(struct exec)) - -/* Data segment offset. */ -#define N_DATOFF(ex) \ - (N_TXTOFF(ex) + ((ex).a_magic != ZMAGIC ? (ex).a_text : \ - __LDPGSZ + ((ex).a_text - 1 & ~(__LDPGSZ - 1)))) - -/* Symbol table offset. */ -#define N_SYMOFF(ex) \ - (N_TXTOFF(ex) + (ex).a_text + (ex).a_data + (ex).a_trsize + \ - (ex).a_drsize) - -/* String table offset. */ -#define N_STROFF(ex) (N_SYMOFF(ex) + (ex).a_syms) - -/* Description of the object file header (a.out format). */ -struct exec { -#define OMAGIC 0407 /* old impure format */ -#define NMAGIC 0410 /* read-only text */ -#define ZMAGIC 0413 /* demand load format */ -#define QMAGIC 0314 /* demand load format. Header in text. */ - unsigned int a_magic; /* magic number */ - - unsigned int a_text; /* text segment size */ - unsigned int a_data; /* initialized data size */ - unsigned int a_bss; /* uninitialized data size */ - unsigned int a_syms; /* symbol table size */ - unsigned int a_entry; /* entry point */ - unsigned int a_trsize; /* text relocation size */ - unsigned int a_drsize; /* data relocation size */ -}; - -#endif /* BSD_KERNEL_PRIVATE */ - -#endif /* _BSD_I386_EXEC_H_ */ diff --git a/bsd/i386/fasttrap_isa.h b/bsd/i386/fasttrap_isa.h index 974b59c5b..512d55512 100644 --- a/bsd/i386/fasttrap_isa.h +++ b/bsd/i386/fasttrap_isa.h @@ -26,10 +26,6 @@ #ifndef _FASTTRAP_ISA_H #define _FASTTRAP_ISA_H -/* - * #pragma ident "@(#)fasttrap_isa.h 1.6 06/09/19 SMI" - */ - #include #include diff --git a/bsd/i386/limits.h b/bsd/i386/limits.h index 9bc2e5718..f6cafd9c9 100644 --- a/bsd/i386/limits.h +++ b/bsd/i386/limits.h @@ -37,14 +37,18 @@ #define _I386_LIMITS_H_ #include +#ifndef DRIVERKIT #include +#endif #define CHAR_BIT 8 /* number of bits in a char */ #define MB_LEN_MAX 6 /* Allow 31 bit UTF2 */ +#ifndef DRIVERKIT #if !defined(_ANSI_SOURCE) && (!defined(_POSIX_C_SOURCE) || defined(_DARWIN_C_SOURCE)) #define CLK_TCK __DARWIN_CLK_TCK /* ticks per second */ #endif /* !_ANSI_SOURCE && (!_POSIX_C_SOURCE || _DARWIN_C_SOURCE) */ +#endif /* * According to ANSI (section 2.2.4.2), the values below must be usable by diff --git a/bsd/i386/reboot.h b/bsd/i386/reboot.h deleted file mode 100644 index 0fbfa53e5..000000000 --- a/bsd/i386/reboot.h +++ /dev/null @@ -1,56 +0,0 @@ -/* - * Copyright (c) 2000-2002 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ - -#ifndef _BSD_I386_REBOOT_H_ -#define _BSD_I386_REBOOT_H_ - -/* - * Empty file (publicly) - */ - -#include - -#ifdef BSD_KERNEL_PRIVATE - -/* - * Use most significant 16 bits to avoid collisions with - * machine independent flags. - */ -#define RB_POWERDOWN 0x00010000 /* power down on halt */ -#define RB_NOBOOTRC 0x00020000 /* don't run '/etc/rc.boot' */ -#define RB_DEBUG 0x00040000 /* drop into mini monitor on panic */ -#define RB_EJECT 0x00080000 /* eject disks on halt */ -#define RB_COMMAND 0x00100000 /* new boot command specified */ -#define RB_NOFP 0x00200000 /* don't use floating point */ -#define RB_BOOTNEXT 0x00400000 /* reboot into NeXT */ -#define RB_BOOTDOS 0x00800000 /* reboot into DOS */ -#define RB_PRETTY 0x01000000 /* shutdown with pretty graphics */ - -#endif /* BSD_KERNEL_PRIVATE */ - -#endif /* _BSD_I386_REBOOT_H_ */ diff --git a/bsd/kern/ast.h b/bsd/kern/ast.h index 7fc56d217..19183cbbf 100644 --- a/bsd/kern/ast.h +++ b/bsd/kern/ast.h @@ -44,6 +44,8 @@ extern void bsd_ast(thread_t); extern void kevent_ast(thread_t thread, uint16_t bits); extern void act_set_astkevent(thread_t thread, uint16_t bits); +extern uint16_t act_clear_astkevent(thread_t thread, uint16_t bits); +extern void act_set_ast_reset_pcs(thread_t thread); #if CONFIG_DTRACE extern void ast_dtrace_on(void); diff --git a/bsd/kern/bsd_init.c b/bsd/kern/bsd_init.c index 73be8cd43..3e2052fb0 100644 --- a/bsd/kern/bsd_init.c +++ b/bsd/kern/bsd_init.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2015 Apple Inc. All rights reserved. + * Copyright (c) 2000-2019 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -133,6 +133,7 @@ #include /* for knote_init() */ #include /* for eventhandler_init() */ #include /* for memorystatus_init() */ +#include /* for memorystatus_freeze_init() */ #include /* for aio_init() */ #include /* for psem_cache_init() */ #include /* for dlil_init() */ @@ -164,6 +165,7 @@ #include /* for tcp_cc_init() */ #include /* for mptcp_control_register() */ #include /* for nwk_wq_init */ +#include /* for restricted_in_port_init() */ #include /* for assert() */ #include /* for init_system_override() */ @@ -177,7 +179,7 @@ #include -#if NFSCLIENT +#if CONFIG_NETBOOT #include #endif @@ -236,9 +238,10 @@ dev_t dumpdev; /* device to take dumps on */ long dumplo; /* offset into dumpdev */ long hostid; char hostname[MAXHOSTNAMELEN]; -int hostnamelen; +lck_mtx_t hostname_lock; +lck_grp_t *hostname_lck_grp; char domainname[MAXDOMNAMELEN]; -int domainnamelen; +lck_mtx_t domainname_lock; char rootdevice[DEVMAXNAMESIZE]; @@ -247,12 +250,16 @@ struct kmemstats kmemstats[M_LAST]; #endif struct vnode *rootvp; -int boothowto = RB_DEBUG; +int boothowto; int minimalboot = 0; #if CONFIG_EMBEDDED int darkboot = 0; #endif +#if __arm64__ +int legacy_footprint_entitlement_mode = LEGACY_FOOTPRINT_ENTITLEMENT_IGNORE; +#endif /* __arm64__ */ + #if PROC_REF_DEBUG __private_extern__ int proc_ref_tracking_disabled = 0; /* disable panics on leaked proc refs across syscall boundary */ #endif @@ -272,8 +279,19 @@ extern void oslog_setsize(int size); extern void throttle_init(void); extern void acct_init(void); +#if CONFIG_LOCKERBOOT +#define LOCKER_PROTOBOOT_MOUNT "/protoboot" + +const char kernel_protoboot_mount[] = LOCKER_PROTOBOOT_MOUNT; +extern int mount_locker_protoboot(const char *fsname, const char *mntpoint, + const char *pbdevpath); +#endif + extern int serverperfmode; extern int ncl; +#if DEVELOPMENT || DEBUG +extern int syscallfilter_disable; +#endif // DEVELOPMENT || DEBUG vm_map_t bsd_pageable_map; vm_map_t mb_map; @@ -286,11 +304,10 @@ __private_extern__ vm_offset_t * execargs_cache = NULL; void bsd_exec_setup(int); -#if __arm64__ -__private_extern__ int bootarg_no64exec = 0; -#endif +__private_extern__ int bootarg_execfailurereports = 0; + #if __x86_64__ -__private_extern__ int bootarg_no32exec = 0; +__private_extern__ int bootarg_no32exec = 1; #endif __private_extern__ int bootarg_vnode_cache_defeat = 0; @@ -312,6 +329,7 @@ __private_extern__ int bootarg_disable_aslr = 0; #if DEVELOPMENT || DEBUG char dyld_alt_path[MAXPATHLEN]; int use_alt_dyld = 0; +extern uint64_t dyld_flags; #endif int cmask = CMASK; @@ -380,9 +398,9 @@ process_name(const char *s, proc_t p) /* To allow these values to be patched, they're globals here */ #include -struct rlimit vm_initial_limit_stack = { DFLSSIZ, MAXSSIZ - PAGE_MAX_SIZE }; -struct rlimit vm_initial_limit_data = { DFLDSIZ, MAXDSIZ }; -struct rlimit vm_initial_limit_core = { DFLCSIZ, MAXCSIZ }; +struct rlimit vm_initial_limit_stack = { .rlim_cur = DFLSSIZ, .rlim_max = MAXSSIZ - PAGE_MAX_SIZE }; +struct rlimit vm_initial_limit_data = { .rlim_cur = DFLDSIZ, .rlim_max = MAXDSIZ }; +struct rlimit vm_initial_limit_core = { .rlim_cur = DFLCSIZ, .rlim_max = MAXCSIZ }; extern thread_t cloneproc(task_t, coalition_t, proc_t, int, int); extern int (*mountroot)(void); @@ -445,11 +463,25 @@ bsd_init(void) kern_return_t ret; struct ucred temp_cred; struct posix_cred temp_pcred; -#if NFSCLIENT || CONFIG_IMAGEBOOT +#if CONFIG_NETBOOT || CONFIG_IMAGEBOOT boolean_t netboot = FALSE; #endif +#if CONFIG_LOCKERBOOT + vnode_t pbvn = NULLVP; + mount_t pbmnt = NULL; + char *pbdevp = NULL; + char pbdevpath[64]; + char pbfsname[MFSNAMELEN]; + char *slash_dev = NULL; +#endif -#define bsd_init_kprintf(x...) /* kprintf("bsd_init: " x) */ +#define DEBUG_BSDINIT 0 + +#if DEBUG_BSDINIT +#define bsd_init_kprintf(x, ...) kprintf("bsd_init: " x, ## __VA_ARGS__) +#else +#define bsd_init_kprintf(x, ...) +#endif throttle_init(); @@ -546,6 +578,10 @@ bsd_init(void) ulock_initialize(); + hostname_lck_grp = lck_grp_alloc_init("hostname", LCK_GRP_ATTR_NULL); + lck_mtx_init(&hostname_lock, hostname_lck_grp, LCK_ATTR_NULL); + lck_mtx_init(&domainname_lock, hostname_lck_grp, LCK_ATTR_NULL); + /* * Create process 0. */ @@ -646,7 +682,7 @@ bsd_init(void) /* Create the file descriptor table. */ kernproc->p_fd = &filedesc0; filedesc0.fd_cmask = cmask; - filedesc0.fd_knlistsize = -1; + filedesc0.fd_knlistsize = 0; filedesc0.fd_knlist = NULL; filedesc0.fd_knhash = NULL; filedesc0.fd_knhashmask = 0; @@ -738,6 +774,7 @@ bsd_init(void) bsd_init_kprintf("calling mbinit\n"); mbinit(); net_str_id_init(); /* for mbuf tags */ + restricted_in_port_init(); #endif /* SOCKETS */ /* @@ -839,13 +876,8 @@ bsd_init(void) bsd_init_kprintf("calling acct_init\n"); acct_init(); -#ifdef GPROF - /* Initialize kernel profiling. */ - kmstartup(); -#endif - bsd_init_kprintf("calling sysctl_mib_init\n"); - sysctl_mib_init() + sysctl_mib_init(); bsd_init_kprintf("calling bsd_autoconf\n"); bsd_autoconf(); @@ -928,7 +960,7 @@ bsd_init(void) bsd_init_kprintf("calling setconf\n"); setconf(); -#if NFSCLIENT +#if CONFIG_NETBOOT netboot = (mountroot == netboot_mountroot); #endif @@ -937,7 +969,7 @@ bsd_init(void) break; } rootdevice[0] = '\0'; -#if NFSCLIENT +#if CONFIG_NETBOOT if (netboot) { PE_display_icon( 0, "noroot"); /* XXX a netboot-specific icon would be nicer */ vc_progress_set(FALSE, 0); @@ -970,7 +1002,7 @@ bsd_init(void) (void)vnode_put(rootvnode); filedesc0.fd_cdir = rootvnode; -#if NFSCLIENT +#if CONFIG_NETBOOT if (netboot) { int err; @@ -992,17 +1024,60 @@ bsd_init(void) #if CONFIG_IMAGEBOOT +#if CONFIG_LOCKERBOOT + /* + * Stash the protoboot vnode, mount, filesystem name, and device name for + * later use. Note that the mount-from name may not have the "/dev/" + * component, so we must sniff out this condition and add it as needed. + */ + pbvn = rootvnode; + pbmnt = pbvn->v_mount; + pbdevp = vfs_statfs(pbmnt)->f_mntfromname; + slash_dev = strnstr(pbdevp, "/dev/", strlen(pbdevp)); + if (slash_dev) { + /* + * If the old root is a snapshot mount, it will have the form: + * + * com.apple.os.update-@ + * + * So we just search the mntfromname for any occurrence of "/dev/" and + * grab that as the device path. The image boot code needs a dev node to + * do the re-mount, so we cannot directly mount the snapshot as the + * protoboot volume currently. + */ + strlcpy(pbdevpath, slash_dev, sizeof(pbdevpath)); + } else { + snprintf(pbdevpath, sizeof(pbdevpath), "/dev/%s", pbdevp); + } + + bsd_init_kprintf("protoboot mount-from: %s\n", pbdevp); + bsd_init_kprintf("protoboot dev path: %s\n", pbdevpath); + + strlcpy(pbfsname, pbmnt->mnt_vtable->vfc_name, sizeof(pbfsname)); +#endif /* * See if a system disk image is present. If so, mount it and * switch the root vnode to point to it */ - if (netboot == FALSE && imageboot_needed()) { + imageboot_type_t imageboot_type = imageboot_needed(); + if (netboot == FALSE && imageboot_type) { /* * An image was found. No turning back: we're booted * with a kernel from the disk image. */ - imageboot_setup(); + bsd_init_kprintf("doing image boot: type = %d\n", imageboot_type); + imageboot_setup(imageboot_type); } + +#if CONFIG_LOCKERBOOT + if (imageboot_type == IMAGEBOOT_LOCKER) { + bsd_init_kprintf("booting from locker\n"); + if (vnode_tag(rootvnode) != VT_LOCKERFS) { + panic("root filesystem not a locker: fsname = %s", + rootvnode->v_mount->mnt_vtable->vfc_name); + } + } +#endif /* CONFIG_LOCKERBOOT */ #endif /* CONFIG_IMAGEBOOT */ /* set initial time; all other resource data is already zero'ed */ @@ -1017,6 +1092,30 @@ bsd_init(void) } #endif /* DEVFS */ + if (vfs_mount_rosv_data()) { + panic("failed to mount data volume!"); + } + + if (vfs_mount_vm()) { + printf("failed to mount vm volume!"); + } + +#if CONFIG_LOCKERBOOT + /* + * We need to wait until devfs is up before remounting the protoboot volume + * within the locker so that it can have a real devfs vnode backing it. + */ + if (imageboot_type == IMAGEBOOT_LOCKER) { + bsd_init_kprintf("re-mounting protoboot volume\n"); + int error = mount_locker_protoboot(pbfsname, LOCKER_PROTOBOOT_MOUNT, + pbdevpath); + if (error) { + panic("failed to mount protoboot volume: dev path = %s, error = %d", + pbdevpath, error); + } + } +#endif /* CONFIG_LOCKERBOOT */ + /* Initialize signal state for process 0. */ bsd_init_kprintf("calling siginit\n"); siginit(kernproc); @@ -1111,7 +1210,7 @@ setconf(void) flags = 0; } -#if NFSCLIENT +#if CONFIG_NETBOOT if (flags & 1) { /* network device */ mountroot = netboot_mountroot; @@ -1119,7 +1218,7 @@ setconf(void) #endif /* otherwise have vfs determine root filesystem */ mountroot = NULL; -#if NFSCLIENT +#if CONFIG_NETBOOT } #endif } @@ -1153,23 +1252,19 @@ bsd_utaskbootstrap(void) ut = (struct uthread *)get_bsdthread_info(thread); ut->uu_sigmask = 0; act_set_astbsd(thread); - task_clear_return_wait(get_threadtask(thread)); + task_clear_return_wait(get_threadtask(thread), TCRW_CLEAR_ALL_WAIT); } static void parse_bsd_args(void) { - char namep[16]; + char namep[48]; int msgbuf; if (PE_parse_boot_argn("-s", namep, sizeof(namep))) { boothowto |= RB_SINGLE; } - if (PE_parse_boot_argn("-b", namep, sizeof(namep))) { - boothowto |= RB_NOBOOTRC; - } - if (PE_parse_boot_argn("-x", namep, sizeof(namep))) { /* safe boot */ boothowto |= RB_SAFEBOOT; } @@ -1183,19 +1278,21 @@ parse_bsd_args(void) minimalboot = 1; } -#if __arm64__ - /* disable 64 bit grading */ - if (PE_parse_boot_argn("-no64exec", namep, sizeof(namep))) { - bootarg_no64exec = 1; - } -#endif #if __x86_64__ + int no32exec; + /* disable 32 bit grading */ - if (PE_parse_boot_argn("-no32exec", namep, sizeof(namep))) { - bootarg_no32exec = 1; + if (PE_parse_boot_argn("no32exec", &no32exec, sizeof(no32exec))) { + bootarg_no32exec = !!no32exec; } #endif + int execfailure_crashreports; + /* enable crash reports on various exec failures */ + if (PE_parse_boot_argn("execfailurecrashes", &execfailure_crashreports, sizeof(execfailure_crashreports))) { + bootarg_execfailurereports = !!execfailure_crashreports; + } + /* disable vnode_cache_is_authorized() by setting vnode_cache_defeat */ if (PE_parse_boot_argn("-vnode_cache_defeat", namep, sizeof(namep))) { bootarg_vnode_cache_defeat = 1; @@ -1266,15 +1363,48 @@ parse_bsd_args(void) if (PE_parse_boot_argn("-no_sigsys", namep, sizeof(namep))) { send_sigsys = false; } -#endif -#if (DEVELOPMENT || DEBUG) if (PE_parse_boot_argn("alt-dyld", dyld_alt_path, sizeof(dyld_alt_path))) { if (strlen(dyld_alt_path) > 0) { use_alt_dyld = 1; } } -#endif + PE_parse_boot_argn("dyld_flags", &dyld_flags, sizeof(dyld_flags)); + + if (PE_parse_boot_argn("-disable_syscallfilter", &namep, sizeof(namep))) { + syscallfilter_disable = 1; + } + +#if __arm64__ + if (PE_parse_boot_argn("legacy_footprint_entitlement_mode", &legacy_footprint_entitlement_mode, sizeof(legacy_footprint_entitlement_mode))) { + /* + * legacy_footprint_entitlement_mode specifies the behavior we want associated + * with the entitlement. The supported modes are: + * + * LEGACY_FOOTPRINT_ENTITLEMENT_IGNORE: + * Indicates that we want every process to have the memory accounting + * that is available in iOS 12.0 and beyond. + * + * LEGACY_FOOTPRINT_ENTITLEMENT_IOS11_ACCT: + * Indicates that for every process that has the 'legacy footprint entitlement', + * we want to give it the old iOS 11.0 accounting behavior which accounted some + * of the process's memory to the kernel. + * + * LEGACY_FOOTPRINT_ENTITLEMENT_LIMIT_INCREASE: + * Indicates that for every process that has the 'legacy footprint entitlement', + * we want it to have a higher memory limit which will help them acclimate to the + * iOS 12.0 (& beyond) accounting behavior that does the right accounting. + * The bonus added to the system-wide task limit to calculate this higher memory limit + * is available in legacy_footprint_bonus_mb. + */ + + if (legacy_footprint_entitlement_mode < LEGACY_FOOTPRINT_ENTITLEMENT_IGNORE || + legacy_footprint_entitlement_mode > LEGACY_FOOTPRINT_ENTITLEMENT_LIMIT_INCREASE) { + legacy_footprint_entitlement_mode = LEGACY_FOOTPRINT_ENTITLEMENT_LIMIT_INCREASE; + } + } +#endif /* __arm64__ */ +#endif /* DEVELOPMENT || DEBUG */ } void @@ -1304,7 +1434,7 @@ bsd_exec_setup(int scale) bsd_pageable_map_size = (bsd_simul_execs * BSD_PAGEABLE_SIZE_PER_EXEC); } -#if !NFSCLIENT +#if !CONFIG_NETBOOT int netboot_root(void); diff --git a/bsd/kern/bsd_stubs.c b/bsd/kern/bsd_stubs.c index fb33955de..f73834598 100644 --- a/bsd/kern/bsd_stubs.c +++ b/bsd/kern/bsd_stubs.c @@ -348,16 +348,21 @@ cdevsw_setkqueueok(int maj, struct cdevsw * csw, int extra_flags) int bsd_hostname(char * buf, int bufsize, int * len) { + int ret, hnlen; /* - * "hostname" is null-terminated, and "hostnamelen" is equivalent to strlen(hostname). + * "hostname" is null-terminated */ - if (hostnamelen < bufsize) { + lck_mtx_lock(&hostname_lock); + hnlen = strlen(hostname); + if (hnlen < bufsize) { strlcpy(buf, hostname, bufsize); - *len = hostnamelen; - return 0; + *len = hnlen; + ret = 0; } else { - return ENAMETOOLONG; + ret = ENAMETOOLONG; } + lck_mtx_unlock(&hostname_lock); + return ret; } void diff --git a/bsd/kern/chunklist.c b/bsd/kern/chunklist.c new file mode 100644 index 000000000..ed93a2fd9 --- /dev/null +++ b/bsd/kern/chunklist.c @@ -0,0 +1,676 @@ +/* + * Copyright (c) 2019 Apple Computer, Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include + +extern int read_file(const char *path, void **bufp, size_t *bufszp); /* implemented in imageboot.c */ +extern vnode_t imgboot_get_image_file(const char *path, off_t *fsize, int *errp); /* implemented in imageboot.c */ + +#define AUTHDBG(fmt, args...) do { printf("%s: " fmt "\n", __func__, ##args); } while (0) +#define AUTHPRNT(fmt, args...) do { printf("%s: " fmt "\n", __func__, ##args); } while (0) +#define kfree_safe(x) do { if ((x)) { kfree_addr((x)); (x) = NULL; } } while (0) + +static const char *libkern_path = "/System/Library/Extensions/System.kext/PlugIns/Libkern.kext/Libkern"; +static const char *libkern_bundle = "com.apple.kpi.libkern"; + +/* + * Rev1 chunklist handling + */ +const struct chunklist_pubkey rev1_chunklist_pubkeys[] = { +}; +const size_t rev1_chunklist_num_pubkeys = sizeof(rev1_chunklist_pubkeys) / sizeof(rev1_chunklist_pubkeys[0]); + +static void +key_byteswap(void *_dst, const void *_src, size_t len) +{ + uint32_t *dst __attribute__((align_value(1))) = _dst; + const uint32_t *src __attribute__((align_value(1))) = _src; + + assert(len % sizeof(uint32_t) == 0); + + len = len / sizeof(uint32_t); + for (size_t i = 0; i < len; i++) { + dst[len - i - 1] = OSSwapInt32(src[i]); + } +} + +static int +construct_chunklist_path(const char *root_path, char **bufp) +{ + int err = 0; + char *path = NULL; + size_t len = 0; + + path = kalloc(MAXPATHLEN); + if (path == NULL) { + AUTHPRNT("failed to allocate space for chunklist path"); + err = ENOMEM; + goto out; + } + + len = strnlen(root_path, MAXPATHLEN); + if (len < MAXPATHLEN && len > strlen(".dmg")) { + /* correctly terminated string with space for extension */ + } else { + AUTHPRNT("malformed root path"); + err = EOVERFLOW; + goto out; + } + + len = strlcpy(path, root_path, MAXPATHLEN); + if (len >= MAXPATHLEN) { + AUTHPRNT("root path is too long"); + err = EOVERFLOW; + goto out; + } + + path[len - strlen(".dmg")] = '\0'; + len = strlcat(path, ".chunklist", MAXPATHLEN); + if (len >= MAXPATHLEN) { + AUTHPRNT("chunklist path is too long"); + err = EOVERFLOW; + goto out; + } + +out: + if (err) { + kfree_safe(path); + } else { + *bufp = path; + } + return err; +} + +static int +validate_signature(const uint8_t *key_msb, size_t keylen, uint8_t *sig_msb, size_t siglen, uint8_t *digest) +{ + int err = 0; + bool sig_valid = false; + uint8_t *sig = NULL; + + const uint8_t exponent[] = { 0x01, 0x00, 0x01 }; + uint8_t *modulus = kalloc(keylen); + rsa_pub_ctx *rsa_ctx = kalloc(sizeof(rsa_pub_ctx)); + sig = kalloc(siglen); + + if (modulus == NULL || rsa_ctx == NULL || sig == NULL) { + err = ENOMEM; + goto out; + } + + bzero(rsa_ctx, sizeof(rsa_pub_ctx)); + key_byteswap(modulus, key_msb, keylen); + key_byteswap(sig, sig_msb, siglen); + + err = rsa_make_pub(rsa_ctx, + sizeof(exponent), exponent, + CHUNKLIST_PUBKEY_LEN, modulus); + if (err) { + AUTHPRNT("rsa_make_pub() failed"); + goto out; + } + + err = rsa_verify_pkcs1v15(rsa_ctx, CC_DIGEST_OID_SHA256, + SHA256_DIGEST_LENGTH, digest, + siglen, sig, + &sig_valid); + if (err) { + sig_valid = false; + AUTHPRNT("rsa_verify() failed"); + goto out; + } + +out: + kfree_safe(sig); + kfree_safe(rsa_ctx); + kfree_safe(modulus); + + if (err) { + return err; + } else if (sig_valid == true) { + return 0; /* success */ + } else { + return EAUTH; + } +} + +static int +validate_root_image(const char *root_path, void *chunklist) +{ + int err = 0; + struct chunklist_hdr *hdr = chunklist; + struct chunklist_chunk *chk = NULL; + size_t ch = 0; + struct vnode *vp = NULL; + off_t fsize = 0; + off_t offset = 0; + bool doclose = false; + size_t bufsz = 0; + void *buf = NULL; + + vfs_context_t ctx = vfs_context_kernel(); + kauth_cred_t kerncred = vfs_context_ucred(ctx); + proc_t p = vfs_context_proc(ctx); + + AUTHDBG("validating root dmg %s", root_path); + + vp = imgboot_get_image_file(root_path, &fsize, &err); + if (vp == NULL) { + goto out; + } + + if ((err = VNOP_OPEN(vp, FREAD, ctx)) != 0) { + AUTHPRNT("failed to open vnode"); + goto out; + } + doclose = true; + + /* + * Iterate the chunk list and check each chunk + */ + chk = chunklist + hdr->cl_chunk_offset; + for (ch = 0; ch < hdr->cl_chunk_count; ch++) { + int resid = 0; + + if (!buf) { + /* allocate buffer based on first chunk size */ + buf = kalloc(chk->chunk_size); + if (buf == NULL) { + err = ENOMEM; + goto out; + } + bufsz = chk->chunk_size; + } + + if (chk->chunk_size > bufsz) { + AUTHPRNT("chunk size too big"); + err = EINVAL; + goto out; + } + + err = vn_rdwr(UIO_READ, vp, (caddr_t)buf, chk->chunk_size, offset, UIO_SYSSPACE, IO_NODELOCKED, kerncred, &resid, p); + if (err) { + AUTHPRNT("vn_rdrw fail (err = %d, resid = %d)", err, resid); + goto out; + } + if (resid) { + err = EINVAL; + AUTHPRNT("chunk covered non-existant part of image"); + goto out; + } + + /* calculate the SHA256 of this chunk */ + uint8_t sha_digest[SHA256_DIGEST_LENGTH]; + SHA256_CTX sha_ctx; + SHA256_Init(&sha_ctx); + SHA256_Update(&sha_ctx, buf, chk->chunk_size); + SHA256_Final(sha_digest, &sha_ctx); + + /* Check the calculated SHA matches the chunk list */ + if (bcmp(sha_digest, chk->chunk_sha256, SHA256_DIGEST_LENGTH) != 0) { + AUTHPRNT("SHA mismatch on chunk %lu (offset %lld, size %u)", ch, offset, chk->chunk_size); + err = EINVAL; + goto out; + } + + if (os_add_overflow(offset, chk->chunk_size, &offset)) { + err = EINVAL; + goto out; + } + chk++; + } + + if (offset != fsize) { + AUTHPRNT("chunklist did not cover entire file (offset = %lld, fsize = %lld)", offset, fsize); + err = EINVAL; + goto out; + } + +out: + kfree_safe(buf); + if (doclose) { + VNOP_CLOSE(vp, FREAD, ctx); + } + if (vp) { + vnode_put(vp); + vp = NULL; + } + + return err; +} + +static const uuid_t * +getuuidfromheader_safe(const void *buf, size_t bufsz, size_t *uuidsz) +{ + const struct uuid_command *cmd = NULL; + const kernel_mach_header_t *mh = buf; + + /* space for the header and at least one load command? */ + if (bufsz < sizeof(kernel_mach_header_t) + sizeof(struct uuid_command)) { + AUTHPRNT("libkern image too small"); + return NULL; + } + + /* validate the mach header */ + if (mh->magic != MH_MAGIC_64 || (mh->sizeofcmds > bufsz - sizeof(kernel_mach_header_t))) { + AUTHPRNT("invalid MachO header"); + return NULL; + } + + /* iterate the load commands */ + size_t offset = sizeof(kernel_mach_header_t); + for (size_t i = 0; i < mh->ncmds; i++) { + cmd = buf + offset; + + if (cmd->cmd == LC_UUID) { + *uuidsz = sizeof(cmd->uuid); + return &cmd->uuid; + } + + if (os_add_overflow(cmd->cmdsize, offset, &offset) || + offset > bufsz - sizeof(struct uuid_command)) { + return NULL; + } + } + + return NULL; +} + +/* + * Rev2 chunklist handling + */ +const struct chunklist_pubkey rev2_chunklist_pubkeys[] = { +}; +const size_t rev2_chunklist_num_pubkeys = sizeof(rev2_chunklist_pubkeys) / sizeof(rev2_chunklist_pubkeys[0]); + +static const struct efi_guid_t gEfiSignAppleCertTypeGuid = CHUNKLIST_REV2_SIG_HASH_GUID; +static const struct efi_guid_t gEfiSignCertTypeRsa2048Sha256Guid = EFI_CERT_TYPE_RSA2048_SHA256; + +static boolean_t +validate_rev2_certificate(struct rev2_chunklist_certificate *certificate) +{ + /* Default value of current security epoch MUST be CHUNKLIST_MIN_SECURITY_EPOCH */ + uint8_t current_security_epoch = CHUNKLIST_MIN_SECURITY_EPOCH; + + /* Certificate.Length must be equal to sizeof(CERTIFICATE) */ + if (certificate->length != sizeof(struct rev2_chunklist_certificate)) { + AUTHDBG("invalid certificate length"); + return FALSE; + } + + /* Certificate.Revision MUST be equal to 2 */ + if (certificate->revision != 2) { + AUTHDBG("invalid certificate revision"); + return FALSE; + } + + /* Certificate.SecurityEpoch MUST be current or higher */ + if (PE_parse_boot_argn(CHUNKLIST_SECURITY_EPOCH, ¤t_security_epoch, sizeof(current_security_epoch)) && + certificate->security_epoch < current_security_epoch) { + AUTHDBG("invalid certificate security epoch"); + return FALSE; + } + + /* Certificate.CertificateType MUST be equal to WIN_CERT_TYPE_EFI_GUID (0x0EF1) */ + if (certificate->certificate_type != WIN_CERT_TYPE_EFI_GUID) { + AUTHDBG("invalid certificate type"); + return FALSE; + } + + /* Certificate.CertificateGuid MUST be equal to 45E7BC51-913C-42AC-96A2-10712FFBEBA7 */ + if (0 != memcmp(&certificate->certificate_guid, &gEfiSignAppleCertTypeGuid, sizeof(struct efi_guid_t))) { + AUTHDBG("invalid certificate GUID"); + return FALSE; + } + + /* Certificate.HashTypeGuid MUST be equal to A7717414-C616-4977-9420-844712A735BF */ + if (0 != memcmp(&certificate->hash_type_guid, &gEfiSignCertTypeRsa2048Sha256Guid, sizeof(struct efi_guid_t))) { + AUTHDBG("invalid hash type GUID"); + return FALSE; + } + + return TRUE; +} + +static int +validate_rev2_chunklist(uint8_t *buffer, size_t buffer_size) +{ + struct rev2_chunklist_certificate *certificate; + size_t security_data_offset; + + /* Check input parameters to be sane */ + if (buffer == NULL || buffer_size == 0) { + AUTHDBG("invalid parameter"); + return EINVAL; + } + + /* Check for existing signature */ + if (buffer_size < sizeof(struct rev2_chunklist_certificate)) { + AUTHDBG("no space for certificate"); + return EINVAL; + } + + security_data_offset = buffer_size - sizeof(struct rev2_chunklist_certificate); + certificate = (struct rev2_chunklist_certificate*)(buffer + security_data_offset); + + /* Check signature candidate to be a valid rev2 chunklist certificate */ + if (TRUE != validate_rev2_certificate(certificate)) { + return EINVAL; + } + + /* Check public key to be trusted */ + for (size_t i = 0; i < rev2_chunklist_num_pubkeys; i++) { + const struct chunklist_pubkey *key = &rev2_chunklist_pubkeys[i]; + /* Production keys are always trusted */ + if (key->is_production != TRUE) { + uint8_t no_rev2_dev = 0; + /* Do not trust rev2 development keys if CHUNKLIST_NO_REV2_DEV is present */ + if (PE_parse_boot_argn(CHUNKLIST_NO_REV2_DEV, &no_rev2_dev, sizeof(no_rev2_dev))) { + AUTHDBG("rev2 development key is not trusted"); + continue; + } + } + + /* Check certificate public key to be the trusted one */ + if (0 == memcmp(key->key, certificate->rsa_public_key, sizeof(certificate->rsa_public_key))) { + AUTHDBG("certificate public key is trusted"); + + /* Hash everything but signature */ + SHA256_CTX hash_ctx; + SHA256_Init(&hash_ctx); + SHA256_Update(&hash_ctx, buffer, security_data_offset); + + /* Include Certificate.SecurityEpoch value */ + SHA256_Update(&hash_ctx, &certificate->security_epoch, sizeof(certificate->security_epoch)); + + /* Finalize hashing into the output buffer */ + uint8_t sha_digest[SHA256_DIGEST_LENGTH]; + SHA256_Final(sha_digest, &hash_ctx); + + /* Validate signature */ + return validate_signature(certificate->rsa_public_key, + sizeof(certificate->rsa_public_key), + certificate->rsa_signature, + sizeof(certificate->rsa_signature), + sha_digest); + } + } + + AUTHDBG("certificate public key is not trusted"); + return EINVAL; +} + +/* + * Main chunklist validation routine + */ +static int +validate_chunklist(void *buf, size_t len) +{ + int err = 0; + size_t sigsz = 0; + size_t sig_end = 0; + size_t chunks_end = 0; + size_t sig_len = 0; + boolean_t valid_sig = FALSE; + struct chunklist_hdr *hdr = buf; + + if (len < sizeof(struct chunklist_hdr)) { + AUTHPRNT("no space for header"); + return EINVAL; + } + + /* recognized file format? */ + if (hdr->cl_magic != CHUNKLIST_MAGIC || + hdr->cl_file_ver != CHUNKLIST_FILE_VERSION_10 || + hdr->cl_chunk_method != CHUNKLIST_CHUNK_METHOD_10) { + AUTHPRNT("unrecognized chunklist format"); + return EINVAL; + } + + /* determine signature length based on signature method */ + if (hdr->cl_sig_method == CHUNKLIST_SIGNATURE_METHOD_REV1) { + AUTHPRNT("rev1 chunklist"); + sig_len = CHUNKLIST_REV1_SIG_LEN; + } else if (hdr->cl_sig_method == CHUNKLIST_SIGNATURE_METHOD_REV2) { + AUTHPRNT("rev2 chunklist"); + sig_len = CHUNKLIST_REV2_SIG_LEN; + } else { + AUTHPRNT("unrecognized chunklist signature method"); + return EINVAL; + } + + /* does the chunk list fall within the bounds of the buffer? */ + if (os_mul_and_add_overflow(hdr->cl_chunk_count, sizeof(struct chunklist_chunk), hdr->cl_chunk_offset, &chunks_end) || + hdr->cl_chunk_offset < sizeof(struct chunklist_hdr) || chunks_end > len) { + AUTHPRNT("invalid chunk_count (%llu) or chunk_offset (%llu)", + hdr->cl_chunk_count, hdr->cl_chunk_offset); + return EINVAL; + } + + /* does the signature fall within the bounds of the buffer? */ + if (os_add_overflow(hdr->cl_sig_offset, sig_len, &sig_end) || + hdr->cl_sig_offset < sizeof(struct chunklist_hdr) || + hdr->cl_sig_offset < chunks_end || + hdr->cl_sig_offset > len) { + AUTHPRNT("invalid signature offset (%llu)", hdr->cl_sig_offset); + return EINVAL; + } + + if (sig_end > len || + os_sub_overflow(len, hdr->cl_sig_offset, &sigsz) || + sigsz != sig_len) { + /* missing or incorrect signature size */ + return EINVAL; + } + + /* validate rev1 chunklist */ + if (hdr->cl_sig_method == CHUNKLIST_SIGNATURE_METHOD_REV1) { + /* Do not trust rev1 chunklists if CHUNKLIST_NO_REV1 is present */ + uint8_t no_rev1; + if (PE_parse_boot_argn(CHUNKLIST_NO_REV1, &no_rev1, sizeof(no_rev1))) { + AUTHDBG("rev1 chunklists are not trusted"); + return EINVAL; + } + + /* hash the chunklist (excluding the signature) */ + AUTHDBG("hashing rev1 chunklist"); + uint8_t sha_digest[SHA256_DIGEST_LENGTH]; + SHA256_CTX sha_ctx; + SHA256_Init(&sha_ctx); + SHA256_Update(&sha_ctx, buf, hdr->cl_sig_offset); + SHA256_Final(sha_digest, &sha_ctx); + + AUTHDBG("validating rev1 chunklist signature against rev1 pub keys"); + for (size_t i = 0; i < rev1_chunklist_num_pubkeys; i++) { + const struct chunklist_pubkey *key = &rev1_chunklist_pubkeys[i]; + err = validate_signature(key->key, CHUNKLIST_PUBKEY_LEN, buf + hdr->cl_sig_offset, CHUNKLIST_SIGNATURE_LEN, sha_digest); + if (err == 0) { + AUTHDBG("validated rev1 chunklist signature with rev1 key %lu (prod=%d)", i, key->is_production); + valid_sig = key->is_production; +#if IMAGEBOOT_ALLOW_DEVKEYS + if (!key->is_production) { + /* allow dev keys in dev builds only */ + AUTHDBG("*** allowing DEV rev1 key: this will fail in customer builds ***"); + valid_sig = TRUE; + } +#endif + goto out; + } + } + + /* At this point we tried all the keys: nothing went wrong but none of them + * signed our chunklist. */ + AUTHPRNT("rev1 signature did not verify against any known rev1 public key"); + } else if (hdr->cl_sig_method == CHUNKLIST_SIGNATURE_METHOD_REV2) { + AUTHDBG("validating rev2 chunklist signature against rev2 pub keys"); + err = validate_rev2_chunklist(buf, len); + if (err) { + goto out; + } + valid_sig = TRUE; + } + +out: + if (err) { + return err; + } else if (valid_sig == TRUE) { + return 0; /* signed, and everything checked out */ + } else { + return EINVAL; + } +} + +/* + * Authenticate a given DMG file using chunklist + */ +int +authenticate_root_with_chunklist(const char *root_path) +{ + char *chunklist_path = NULL; + void *chunklist_buf = NULL; + size_t chunklist_len = 32 * 1024 * 1024UL; + int err = 0; + + err = construct_chunklist_path(root_path, &chunklist_path); + if (err) { + AUTHPRNT("failed creating chunklist path"); + goto out; + } + + AUTHDBG("validating root against chunklist %s", chunklist_path); + + /* + * Read and authenticate the chunklist, then validate the root image against + * the chunklist. + */ + AUTHDBG("reading chunklist"); + err = read_file(chunklist_path, &chunklist_buf, &chunklist_len); + if (err) { + AUTHPRNT("failed to read chunklist"); + goto out; + } + + AUTHDBG("validating chunklist"); + err = validate_chunklist(chunklist_buf, chunklist_len); + if (err) { + AUTHPRNT("failed to validate chunklist"); + goto out; + } + AUTHDBG("successfully validated chunklist"); + + AUTHDBG("validating root image against chunklist"); + err = validate_root_image(root_path, chunklist_buf); + if (err) { + AUTHPRNT("failed to validate root image against chunklist (%d)", err); + goto out; + } + + /* everything checked out - go ahead and mount this */ + AUTHDBG("root image authenticated"); + +out: + kfree_safe(chunklist_buf); + kfree_safe(chunklist_path); + return err; +} + +/* + * Check that the UUID of the libkern currently loaded matches the one on disk. + */ +int +authenticate_root_version_check(void) +{ + int err = 0; + void *buf = NULL; + size_t bufsz = 4 * 1024 * 1024UL; + + /* get the UUID of the libkern in /S/L/E */ + err = read_file(libkern_path, &buf, &bufsz); + if (err) { + goto out; + } + + unsigned long uuidsz = 0; + const uuid_t *img_uuid = getuuidfromheader_safe(buf, bufsz, &uuidsz); + if (img_uuid == NULL || uuidsz != sizeof(uuid_t)) { + AUTHPRNT("invalid UUID (sz = %lu)", uuidsz); + err = EINVAL; + goto out; + } + + /* Get the UUID of the loaded libkern */ + uuid_t live_uuid; + err = OSKextGetUUIDForName(libkern_bundle, live_uuid); + if (err) { + AUTHPRNT("could not find loaded libkern"); + goto out; + } + + /* ... and compare them */ + if (bcmp(live_uuid, img_uuid, uuidsz) != 0) { + AUTHPRNT("UUID of running libkern does not match %s", libkern_path); + + uuid_string_t img_uuid_str, live_uuid_str; + uuid_unparse(*img_uuid, img_uuid_str); + uuid_unparse(live_uuid, live_uuid_str); + AUTHPRNT("loaded libkern UUID = %s", live_uuid_str); + AUTHPRNT("on-disk libkern UUID = %s", img_uuid_str); + + err = EINVAL; + goto out; + } + + /* UUID matches! */ +out: + kfree_safe(buf); + return err; +} diff --git a/bsd/kern/chunklist.h b/bsd/kern/chunklist.h index b4fe59d01..7a1042e75 100644 --- a/bsd/kern/chunklist.h +++ b/bsd/kern/chunklist.h @@ -1,19 +1,60 @@ #ifndef _CHUNKLIST_H #define _CHUNKLIST_H - #include +/* + * Boot argument for disabling trust in rev2 development key(s) + * Set by boot.efi + */ +#define CHUNKLIST_NO_REV2_DEV "-chunklist-no-rev2-dev" + +/* + * Boot argument for disabling trust in rev1 chunklists + * Set by boot.efi + */ +#define CHUNKLIST_NO_REV1 "-chunklist-no-rev1" + +/* + * Boot argument for obtaining current security epoch + * Set by boot.efi + */ +#define CHUNKLIST_SECURITY_EPOCH "chunklist-security-epoch" +#define CHUNKLIST_MIN_SECURITY_EPOCH 0 + /* * Chunklist file format */ +#define CHUNKLIST_MAGIC 0x4C4B4E43 +#define CHUNKLIST_FILE_VERSION_10 1 +#define CHUNKLIST_CHUNK_METHOD_10 1 +#define CHUNKLIST_SIGNATURE_METHOD_REV1 1 +#define CHUNKLIST_SIGNATURE_METHOD_REV2 3 +#define CHUNKLIST_REV1_SIG_LEN 256 +#define CHUNKLIST_REV2_SIG_LEN 808 +#define CHUNKLIST_PUBKEY_LEN (2048/8) +#define CHUNKLIST_SIGNATURE_LEN (2048/8) + +struct efi_guid_t { + uint32_t data1; + uint16_t data2; + uint16_t data3; + uint8_t data4[8]; +} __attribute__((packed)); -#define CHUNKLIST_MAGIC 0x4C4B4E43 -#define CHUNKLIST_FILE_VERSION_10 1 -#define CHUNKLIST_CHUNK_METHOD_10 1 -#define CHUNKLIST_SIGNATURE_METHOD_10 1 -#define CHUNKLIST_SIG_LEN 256 -#define CHUNKLIST_PUBKEY_LEN (2048/8) +// 45E7BC51-913C-42AC-96A2-10712FFBEBA7 +#define CHUNKLIST_REV2_SIG_HASH_GUID \ +{ \ + 0x45E7BC51, 0x913C, 0x42AC, { 0x96, 0xA2, 0x10, 0x71, 0x2F, 0xFB, 0xEB, 0xA7 } \ +}; + +// A7717414-C616-4977-9420-844712A735BF +#define EFI_CERT_TYPE_RSA2048_SHA256 \ +{ \ + 0xa7717414, 0xc616, 0x4977, { 0x94, 0x20, 0x84, 0x47, 0x12, 0xa7, 0x35, 0xbf } \ +} + +#define WIN_CERT_TYPE_EFI_GUID 0x0EF1 struct chunklist_hdr { uint32_t cl_magic; @@ -32,23 +73,22 @@ struct chunklist_chunk { uint8_t chunk_sha256[SHA256_DIGEST_LENGTH]; } __attribute__((packed)); -struct chunklist_sig { - uint8_t cl_sig[CHUNKLIST_SIG_LEN]; -}; - - -/* - * Chunklist signing public keys - */ +struct rev2_chunklist_certificate { + uint32_t length; + uint8_t revision; + uint8_t security_epoch; + uint16_t certificate_type; + guid_t certificate_guid; + guid_t hash_type_guid; + uint8_t rsa_public_key[CHUNKLIST_PUBKEY_LEN]; + uint8_t rsa_signature[CHUNKLIST_SIGNATURE_LEN]; +} __attribute__((packed)); struct chunklist_pubkey { - const bool isprod; + const boolean_t is_production; const uint8_t key[CHUNKLIST_PUBKEY_LEN]; }; -const struct chunklist_pubkey chunklist_pubkeys[] = { -}; - -#define CHUNKLIST_NPUBKEYS (sizeof(chunklist_pubkeys)/sizeof(chunklist_pubkeys[0])) - -#endif +int authenticate_root_with_chunklist(const char *root_path); +int authenticate_root_version_check(void); +#endif /* _CHUNKLIST_H */ diff --git a/bsd/kern/decmpfs.c b/bsd/kern/decmpfs.c index 84866b9e2..fd532f100 100644 --- a/bsd/kern/decmpfs.c +++ b/bsd/kern/decmpfs.c @@ -38,6 +38,8 @@ UNUSED_SYMBOL(decmpfs_read_compressed) UNUSED_SYMBOL(decmpfs_cnode_cmp_type) UNUSED_SYMBOL(decmpfs_cnode_get_vnode_state) UNUSED_SYMBOL(decmpfs_cnode_get_vnode_cached_size) +UNUSED_SYMBOL(decmpfs_cnode_get_vnode_cached_nchildren) +UNUSED_SYMBOL(decmpfs_cnode_get_vnode_cached_total_size) UNUSED_SYMBOL(decmpfs_lock_compressed_data) UNUSED_SYMBOL(decmpfs_cnode_free) UNUSED_SYMBOL(decmpfs_cnode_alloc) @@ -457,7 +459,19 @@ decmpfs_cnode_get_vnode_cached_size(decmpfs_cnode *cp) return cp->uncompressed_size; } -static void +uint64_t +decmpfs_cnode_get_vnode_cached_nchildren(decmpfs_cnode *cp) +{ + return cp->nchildren; +} + +uint64_t +decmpfs_cnode_get_vnode_cached_total_size(decmpfs_cnode *cp) +{ + return cp->total_size; +} + +void decmpfs_cnode_set_vnode_cached_size(decmpfs_cnode *cp, uint64_t size) { while (1) { @@ -470,6 +484,32 @@ decmpfs_cnode_set_vnode_cached_size(decmpfs_cnode *cp, uint64_t size) } } +void +decmpfs_cnode_set_vnode_cached_nchildren(decmpfs_cnode *cp, uint64_t nchildren) +{ + while (1) { + uint64_t old = cp->nchildren; + if (OSCompareAndSwap64(old, nchildren, (UInt64*)&cp->nchildren)) { + return; + } else { + /* failed to write our value, so loop */ + } + } +} + +void +decmpfs_cnode_set_vnode_cached_total_size(decmpfs_cnode *cp, uint64_t total_sz) +{ + while (1) { + uint64_t old = cp->total_size; + if (OSCompareAndSwap64(old, total_sz, (UInt64*)&cp->total_size)) { + return; + } else { + /* failed to write our value, so loop */ + } + } +} + static uint64_t decmpfs_cnode_get_decompression_flags(decmpfs_cnode *cp) { @@ -539,7 +579,19 @@ decmpfs_fetch_compressed_header(vnode_t vp, decmpfs_cnode *cp, decmpfs_header ** hdr->attr_size = sizeof(decmpfs_disk_header); hdr->compression_magic = DECMPFS_MAGIC; hdr->compression_type = cp->cmp_type; - hdr->uncompressed_size = decmpfs_cnode_get_vnode_cached_size(cp); + if (hdr->compression_type == DATALESS_PKG_CMPFS_TYPE) { + if (!vnode_isdir(vp)) { + err = EINVAL; + goto out; + } + hdr->_size.value = DECMPFS_PKG_VALUE_FROM_SIZE_COUNT( + decmpfs_cnode_get_vnode_cached_size(cp), + decmpfs_cnode_get_vnode_cached_nchildren(cp)); + } else if (vnode_isdir(vp)) { + hdr->_size.value = decmpfs_cnode_get_vnode_cached_nchildren(cp); + } else { + hdr->_size.value = decmpfs_cnode_get_vnode_cached_size(cp); + } } else { /* figure out how big the xattr is on disk */ err = vn_getxattr(vp, DECMPFS_XATTR_NAME, NULL, &attr_size, XATTR_NOSECURITY, decmpfs_ctx); @@ -585,7 +637,14 @@ decmpfs_fetch_compressed_header(vnode_t vp, decmpfs_cnode *cp, decmpfs_header ** goto out; } - if (hdr->compression_type >= CMP_MAX) { + /* + * Special-case the DATALESS compressor here; that is a valid type, + * even through there will never be an entry in the decompressor + * handler table for it. If we don't do this, then the cmp_state + * for this cnode will end up being marked NOT_COMPRESSED, and + * we'll be stuck in limbo. + */ + if (hdr->compression_type >= CMP_MAX && !decmpfs_type_is_dataless(hdr->compression_type)) { if (returnInvalid) { /* return the header even though the type is out of range */ err = ERANGE; @@ -686,19 +745,21 @@ decmpfs_validate_compressed_file(vnode_t vp, decmpfs_cnode *cp) goto out; } - lck_rw_lock_shared(decompressorsLock); - decmpfs_validate_compressed_file_func validate = decmp_get_func(vp, hdr->compression_type, validate); - if (validate) { /* make sure this validation function is valid */ - /* is the data okay? */ - err = validate(vp, decmpfs_ctx, hdr); - } else if (decmp_get_func(vp, hdr->compression_type, fetch) == NULL) { - /* the type isn't registered */ - err = EIO; - } else { - /* no validate registered, so nothing to do */ - err = 0; + if (!decmpfs_type_is_dataless(hdr->compression_type)) { + lck_rw_lock_shared(decompressorsLock); + decmpfs_validate_compressed_file_func validate = decmp_get_func(vp, hdr->compression_type, validate); + if (validate) { /* make sure this validation function is valid */ + /* is the data okay? */ + err = validate(vp, decmpfs_ctx, hdr); + } else if (decmp_get_func(vp, hdr->compression_type, fetch) == NULL) { + /* the type isn't registered */ + err = EIO; + } else { + /* no validate registered, so nothing to do */ + err = 0; + } + lck_rw_unlock_shared(decompressorsLock); } - lck_rw_unlock_shared(decompressorsLock); out: if (hdr) { FREE(hdr, M_TEMP); @@ -761,12 +822,6 @@ decmpfs_file_is_compressed(vnode_t vp, decmpfs_cnode *cp) return 0; } - if (!vnode_isreg(vp)) { - /* only regular files can be compressed */ - ret = FILE_IS_NOT_COMPRESSED; - goto done; - } - is_mounted = false; is_local_fs = false; mp = vnode_mount(vp); @@ -825,7 +880,16 @@ decmpfs_file_is_compressed(vnode_t vp, decmpfs_cnode *cp) ret = FILE_IS_NOT_COMPRESSED; goto done; } - /* we got the xattr, so the file is compressed */ + /* + * We got the xattr, so the file is at least tagged compressed. + * For DATALESS, regular files and directories can be "compressed". + * For all other types, only files are allowed. + */ + if (!vnode_isreg(vp) && + !(decmpfs_type_is_dataless(hdr->compression_type) && vnode_isdir(vp))) { + ret = FILE_IS_NOT_COMPRESSED; + goto done; + } ret = FILE_IS_COMPRESSED; goto done; } @@ -847,7 +911,15 @@ done: cnode_locked = 1; } - decmpfs_cnode_set_vnode_cached_size(cp, hdr->uncompressed_size); + if (vnode_isdir(vp)) { + decmpfs_cnode_set_vnode_cached_size(cp, 64); + decmpfs_cnode_set_vnode_cached_nchildren(cp, decmpfs_get_directory_entries(hdr)); + if (hdr->compression_type == DATALESS_PKG_CMPFS_TYPE) { + decmpfs_cnode_set_vnode_cached_total_size(cp, DECMPFS_PKG_SIZE(hdr->_size)); + } + } else { + decmpfs_cnode_set_vnode_cached_size(cp, hdr->uncompressed_size); + } decmpfs_cnode_set_vnode_state(cp, ret, 1); decmpfs_cnode_set_vnode_cmp_type(cp, hdr->compression_type, 1); /* remember if the xattr's size was equal to the minimal xattr */ @@ -941,11 +1013,19 @@ decmpfs_update_attributes(vnode_t vp, struct vnode_attr *vap) error = decmpfs_fetch_compressed_header(vp, NULL, &hdr, 1); if (error == 0) { /* - * allow the flag to be set since the decmpfs attribute is present - * in that case, we also want to truncate the data fork of the file + * Allow the flag to be set since the decmpfs attribute + * is present. + * + * If we're creating a dataless file we do not want to + * truncate it to zero which allows the file resolver to + * have more control over when truncation should happen. + * All other types of compressed files are truncated to + * zero. */ - VATTR_SET_ACTIVE(vap, va_data_size); - vap->va_data_size = 0; + if (!decmpfs_type_is_dataless(hdr->compression_type)) { + VATTR_SET_ACTIVE(vap, va_data_size); + vap->va_data_size = 0; + } } else if (error == ERANGE) { /* the file had a decmpfs attribute but the type was out of range, so don't muck with the file's data size */ } else { diff --git a/bsd/kern/imageboot.c b/bsd/kern/imageboot.c index 96c0a1e73..207d1fe0e 100644 --- a/bsd/kern/imageboot.c +++ b/bsd/kern/imageboot.c @@ -47,6 +47,11 @@ #include #include +#if CONFIG_IMAGEBOOT_IMG4 +#include +#include +#endif + #include #include @@ -57,18 +62,32 @@ extern struct filedesc filedesc0; extern int (*mountroot)(void); extern char rootdevice[DEVMAXNAMESIZE]; +#if CONFIG_LOCKERBOOT +typedef struct _locker_mount_args { + char lmnt_path[PATH_MAX]; + uint16_t lmnt_preferred_hash; +} locker_mount_args_t; +#endif + #define DEBUG_IMAGEBOOT 0 #if DEBUG_IMAGEBOOT -#define DBG_TRACE(...) printf(__VA_ARGS__) +#define DBG_TRACE(...) printf("imageboot: " __VA_ARGS__) #else #define DBG_TRACE(...) do {} while(0) #endif +#define AUTHDBG(fmt, args...) do { printf("%s: " fmt "\n", __func__, ##args); } while (0) +#define AUTHPRNT(fmt, args...) do { printf("%s: " fmt "\n", __func__, ##args); } while (0) +#define kfree_safe(x) do { if ((x)) { kfree_addr((x)); (x) = NULL; } } while (0) + extern int di_root_image(const char *path, char *devname, size_t devsz, dev_t *dev_p); extern int di_root_ramfile_buf(void *buf, size_t bufsz, char *devname, size_t devsz, dev_t *dev_p); -static boolean_t imageboot_setup_new(void); +static boolean_t imageboot_setup_new(imageboot_type_t type); + +vnode_t imgboot_get_image_file(const char *path, off_t *fsize, int *errp); /* may be required by chunklist.c */ +int read_file(const char *path, void **bufp, size_t *bufszp); /* may be required by chunklist.c */ #define kIBFilePrefix "file://" @@ -87,10 +106,10 @@ vnode_get_and_drop_always(vnode_t vp) vnode_put(vp); } -__private_extern__ int +__private_extern__ imageboot_type_t imageboot_needed(void) { - int result = 0; + imageboot_type_t result = IMAGEBOOT_NONE; char *root_path = NULL; DBG_TRACE("%s: checking for presence of root path\n", __FUNCTION__); @@ -100,8 +119,18 @@ imageboot_needed(void) panic("%s: M_NAMEI zone exhausted", __FUNCTION__); } +#if CONFIG_LOCKERBOOT + if (PE_parse_boot_argn(IMAGEBOOT_LOCKER_ARG, root_path, MAXPATHLEN)) { + result = IMAGEBOOT_LOCKER; + goto out; + } +#endif + /* Check for first layer */ if (!(PE_parse_boot_argn("rp0", root_path, MAXPATHLEN) || +#if CONFIG_IMAGEBOOT_IMG4 + PE_parse_boot_argn("arp0", root_path, MAXPATHLEN) || +#endif PE_parse_boot_argn("rp", root_path, MAXPATHLEN) || PE_parse_boot_argn(IMAGEBOOT_ROOT_ARG, root_path, MAXPATHLEN) || PE_parse_boot_argn(IMAGEBOOT_AUTHROOT_ARG, root_path, MAXPATHLEN))) { @@ -115,7 +144,7 @@ imageboot_needed(void) goto out; } - result = 1; + result = IMAGEBOOT_DMG; /* Check for second layer */ if (!(PE_parse_boot_argn("rp1", root_path, MAXPATHLEN) || @@ -146,25 +175,61 @@ out: * is returned with usecount (no iocount). */ __private_extern__ int -imageboot_mount_image(const char *root_path, int height) +imageboot_mount_image(const char *root_path, int height, imageboot_type_t type) { dev_t dev; int error; - vnode_t old_rootvnode = NULL; + /* + * Need to stash this here since we may do a kernel_mount() on /, which will + * automatically update the rootvnode global. Note that vfs_mountroot() does + * not update that global, which is a bit weird. + */ + vnode_t old_rootvnode = rootvnode; vnode_t newdp; mount_t new_rootfs; + boolean_t update_rootvnode = FALSE; - error = di_root_image(root_path, rootdevice, DEVMAXNAMESIZE, &dev); - if (error) { - panic("%s: di_root_image failed: %d\n", __FUNCTION__, error); + if (type == IMAGEBOOT_DMG) { + error = di_root_image(root_path, rootdevice, DEVMAXNAMESIZE, &dev); + if (error) { + panic("%s: di_root_image failed: %d\n", __FUNCTION__, error); + } + + rootdev = dev; + mountroot = NULL; + printf("%s: root device 0x%x\n", __FUNCTION__, rootdev); + error = vfs_mountroot(); + if (error != 0) { + panic("vfs_mountroot() failed.\n"); + } + + update_rootvnode = TRUE; } +#if CONFIG_LOCKERBOOT + else if (type == IMAGEBOOT_LOCKER) { + locker_mount_args_t *mntargs = kalloc(sizeof(*mntargs)); + if (!mntargs) { + panic("could not alloc mount args"); + } - rootdev = dev; - mountroot = NULL; - printf("%s: root device 0x%x\n", __FUNCTION__, rootdev); - error = vfs_mountroot(); - if (error != 0) { - panic("vfs_mountroot() failed.\n"); + strlcpy(mntargs->lmnt_path, root_path, sizeof(mntargs->lmnt_path)); + mntargs->lmnt_preferred_hash = 0; + + DBG_TRACE("%s: mounting locker: %s\n", __FUNCTION__, root_path); + error = kernel_mount(LOCKERFS_NAME, NULLVP, NULLVP, "/", + mntargs, sizeof(*mntargs), 0, 0, vfs_context_kernel()); + if (error) { + panic("failed to mount locker: %d", error); + } + kfree(mntargs, sizeof(*mntargs)); + + /* Clear the old mount association. */ + old_rootvnode->v_mountedhere = NULL; + rootvnode->v_mount->mnt_vnodecovered = NULL; + } +#endif + else { + panic("invalid imageboot type: %d", type); } /* @@ -174,16 +239,13 @@ imageboot_mount_image(const char *root_path, int height) if (VFS_ROOT(TAILQ_LAST(&mountlist, mntlist), &newdp, vfs_context_kernel())) { panic("%s: cannot find root vnode", __FUNCTION__); } + DBG_TRACE("%s: old root fsname: %s\n", __FUNCTION__, old_rootvnode->v_mount->mnt_vtable->vfc_name); - if (rootvnode != NULL) { + if (old_rootvnode != NULL) { /* remember the old rootvnode, but remove it from mountlist */ - mount_t old_rootfs; - - old_rootvnode = rootvnode; - old_rootfs = rootvnode->v_mount; + mount_t old_rootfs = old_rootvnode->v_mount; mount_list_remove(old_rootfs); - mount_lock(old_rootfs); #ifdef CONFIG_IMGSRC_ACCESS old_rootfs->mnt_kern_flag |= MNTK_BACKS_ROOT; @@ -193,7 +255,9 @@ imageboot_mount_image(const char *root_path, int height) } /* switch to the new rootvnode */ - rootvnode = newdp; + if (update_rootvnode) { + rootvnode = newdp; + } new_rootfs = rootvnode->v_mount; mount_lock(new_rootfs); @@ -213,43 +277,14 @@ imageboot_mount_image(const char *root_path, int height) vnode_get_and_drop_always(old_rootvnode); } #else - height = 0; /* keep the compiler from complaining */ +#pragma unused(height) vnode_get_and_drop_always(old_rootvnode); #endif /* CONFIG_IMGSRC_ACCESS */ } return 0; } - -/* - * Authenticated root-dmg support - */ - -#define AUTHDBG(fmt, args...) do { printf("%s: " fmt "\n", __func__, ##args); } while (0) -#define AUTHPRNT(fmt, args...) do { printf("%s: " fmt "\n", __func__, ##args); } while (0) - -#define kfree_safe(x) do { if ((x)) { kfree_addr((x)); (x) = NULL; } } while (0) - -enum { - MISSING_SIG = -1, - INVALID_SIG = -2 -}; - -static void -key_byteswap(void *_dst, const void *_src, size_t len) -{ - uint32_t *dst __attribute__((align_value(1))) = _dst; - const uint32_t *src __attribute__((align_value(1))) = _src; - - assert(len % sizeof(uint32_t) == 0); - - len = len / sizeof(uint32_t); - for (size_t i = 0; i < len; i++) { - dst[len - i - 1] = OSSwapInt32(src[i]); - } -} - -static int +int read_file(const char *path, void **bufp, size_t *bufszp) { int err = 0; @@ -266,14 +301,14 @@ read_file(const char *path, void **bufp, size_t *bufszp) NDINIT(&ndp, LOOKUP, OP_OPEN, LOCKLEAF, UIO_SYSSPACE, CAST_USER_ADDR_T(path), ctx); if ((err = namei(&ndp)) != 0) { - AUTHPRNT("namei failed (%s)", path); + AUTHPRNT("namei failed (%s) - %d", path, err); goto out; } nameidone(&ndp); vp = ndp.ni_vp; if ((err = vnode_size(vp, &fsize, ctx)) != 0) { - AUTHPRNT("failed to get vnode size"); + AUTHPRNT("failed to get vnode size of %s - %d", path, err); goto out; } if (fsize < 0) { @@ -281,7 +316,7 @@ read_file(const char *path, void **bufp, size_t *bufszp) } if ((err = VNOP_OPEN(vp, FREAD, ctx)) != 0) { - AUTHPRNT("failed to open vnode"); + AUTHPRNT("failed to open %s - %d", path, err); goto out; } doclose = true; @@ -298,13 +333,13 @@ read_file(const char *path, void **bufp, size_t *bufszp) } if ((err = vn_rdwr(UIO_READ, vp, (caddr_t)buf, fsize, 0, UIO_SYSSPACE, IO_NODELOCKED, kerncred, &resid, p)) != 0) { - AUTHPRNT("vn_rdwr() failed"); + AUTHPRNT("Cannot read %d bytes from %s - %d", (int)fsize, path, err); goto out; } if (resid) { /* didnt get everything we wanted */ - AUTHPRNT("vn_rdwr resid = %d", resid); + AUTHPRNT("Short read of %d bytes from %s - %d", (int)fsize, path, resid); err = EINVAL; goto out; } @@ -328,513 +363,131 @@ out: return err; } -static int -validate_signature(const uint8_t *key_msb, size_t keylen, uint8_t *sig_msb, size_t siglen, uint8_t *digest) -{ - int err = 0; - bool sig_valid = false; - uint8_t *sig = NULL; - - const uint8_t exponent[] = { 0x01, 0x00, 0x01 }; - uint8_t *modulus = kalloc(keylen); - rsa_pub_ctx *rsa_ctx = kalloc(sizeof(rsa_pub_ctx)); - sig = kalloc(siglen); - - if (modulus == NULL || rsa_ctx == NULL || sig == NULL) { - err = ENOMEM; - goto out; - } - - bzero(rsa_ctx, sizeof(rsa_pub_ctx)); - key_byteswap(modulus, key_msb, keylen); - key_byteswap(sig, sig_msb, siglen); - - err = rsa_make_pub(rsa_ctx, - sizeof(exponent), exponent, - CHUNKLIST_PUBKEY_LEN, modulus); - if (err) { - AUTHPRNT("rsa_make_pub() failed"); - goto out; - } - - err = rsa_verify_pkcs1v15(rsa_ctx, CC_DIGEST_OID_SHA256, - SHA256_DIGEST_LENGTH, digest, - siglen, sig, - &sig_valid); - if (err) { - sig_valid = false; - AUTHPRNT("rsa_verify() failed"); - err = EINVAL; - goto out; - } - -out: - kfree_safe(sig); - kfree_safe(rsa_ctx); - kfree_safe(modulus); - - if (err) { - return err; - } else if (sig_valid == true) { - return 0; /* success */ - } else { - return INVALID_SIG; - } -} - -static int -validate_chunklist(void *buf, size_t len) +#if CONFIG_IMAGEBOOT_IMG4 || CONFIG_IMAGEBOOT_CHUNKLIST +vnode_t +imgboot_get_image_file(const char *path, off_t *fsize, int *errp) { - int err = 0; - size_t sigsz = 0; - size_t sig_end = 0; - size_t chunks_end = 0; - bool valid_sig = false; - struct chunklist_hdr *hdr = buf; - - if (len < sizeof(struct chunklist_hdr)) { - AUTHPRNT("no space for header"); - return EINVAL; - } - - /* recognized file format? */ - if (hdr->cl_magic != CHUNKLIST_MAGIC || - hdr->cl_file_ver != CHUNKLIST_FILE_VERSION_10 || - hdr->cl_chunk_method != CHUNKLIST_SIGNATURE_METHOD_10 || - hdr->cl_sig_method != CHUNKLIST_SIGNATURE_METHOD_10) { - AUTHPRNT("unrecognized chunklist format"); - return EINVAL; - } - - /* does the chunk list fall within the bounds of the buffer? */ - if (os_mul_and_add_overflow(hdr->cl_chunk_count, sizeof(struct chunklist_chunk), hdr->cl_chunk_offset, &chunks_end) || - hdr->cl_chunk_offset < sizeof(struct chunklist_hdr) || chunks_end > len) { - AUTHPRNT("invalid chunk_count (%llu) or chunk_offset (%llu)", - hdr->cl_chunk_count, hdr->cl_chunk_offset); - return EINVAL; - } - - /* does the signature fall within the bounds of the buffer? */ - if (os_add_overflow(hdr->cl_sig_offset, sizeof(struct chunklist_sig), &sig_end) || - hdr->cl_sig_offset < sizeof(struct chunklist_hdr) || - hdr->cl_sig_offset < chunks_end || - hdr->cl_sig_offset > len) { - AUTHPRNT("invalid signature offset (%llu)", hdr->cl_sig_offset); - return EINVAL; - } - - if (sig_end > len || os_sub_overflow(len, hdr->cl_sig_offset, &sigsz) || sigsz != CHUNKLIST_SIG_LEN) { - /* missing or incorrect signature size */ - return MISSING_SIG; - } - - AUTHDBG("hashing chunklist"); - - /* hash the chunklist (excluding the signature) */ - uint8_t sha_digest[SHA256_DIGEST_LENGTH]; - SHA256_CTX sha_ctx; - SHA256_Init(&sha_ctx); - SHA256_Update(&sha_ctx, buf, hdr->cl_sig_offset); - SHA256_Final(sha_digest, &sha_ctx); - - AUTHDBG("validating chunklist signature against pub keys"); - for (size_t i = 0; i < CHUNKLIST_NPUBKEYS; i++) { - const struct chunklist_pubkey *key = &chunklist_pubkeys[i]; - err = validate_signature(key->key, CHUNKLIST_PUBKEY_LEN, - buf + hdr->cl_sig_offset, sigsz, sha_digest); - if (err == 0) { - AUTHDBG("validated chunklist signature with key %lu (prod=%d)", i, key->isprod); - valid_sig = key->isprod; -#if IMAGEBOOT_ALLOW_DEVKEYS - if (!key->isprod) { - /* allow dev keys in dev builds only */ - AUTHDBG("*** allowing DEV key: this will fail in customer builds ***"); - valid_sig = true; - } -#endif - goto out; - } else if (err == INVALID_SIG) { - /* try the next key */ - } else { - goto out; /* something bad happened */ - } - } - - /* At this point we tried all the keys: nothing went wrong but none of them - * signed our chunklist. */ - AUTHPRNT("signature did not verify against any known public key"); - -out: - if (err) { - return err; - } else if (valid_sig == true) { - return 0; /* signed, and everything checked out */ - } else { - return EINVAL; - } -} - -static int -validate_root_image(const char *root_path, void *chunklist) -{ - int err = 0; - struct chunklist_hdr *hdr = chunklist; - struct chunklist_chunk *chk = NULL; - size_t ch = 0; struct nameidata ndp = {}; - struct vnode *vp = NULL; - off_t fsize = 0; - off_t offset = 0; - bool doclose = false; - size_t bufsz = 0; - void *buf = NULL; - + vnode_t vp = NULL; vfs_context_t ctx = vfs_context_kernel(); - kauth_cred_t kerncred = vfs_context_ucred(ctx); - proc_t p = vfs_context_proc(ctx); - - AUTHDBG("validating root dmg %s", root_path); + int err; - /* - * Open the DMG - */ - NDINIT(&ndp, LOOKUP, OP_OPEN, LOCKLEAF, UIO_SYSSPACE, CAST_USER_ADDR_T(root_path), ctx); + NDINIT(&ndp, LOOKUP, OP_OPEN, LOCKLEAF, UIO_SYSSPACE, CAST_USER_ADDR_T(path), ctx); if ((err = namei(&ndp)) != 0) { - AUTHPRNT("namei failed (%s)", root_path); - goto out; - } - nameidone(&ndp); - vp = ndp.ni_vp; - - if (vp->v_type != VREG) { - err = EINVAL; - goto out; - } - - if ((err = vnode_size(vp, &fsize, ctx)) != 0) { - AUTHPRNT("failed to get vnode size"); - goto out; - } - - if ((err = VNOP_OPEN(vp, FREAD, ctx)) != 0) { - AUTHPRNT("failed to open vnode"); - goto out; - } - doclose = true; - - /* - * Iterate the chunk list and check each chunk - */ - chk = chunklist + hdr->cl_chunk_offset; - for (ch = 0; ch < hdr->cl_chunk_count; ch++) { - int resid = 0; - - if (!buf) { - /* allocate buffer based on first chunk size */ - buf = kalloc(chk->chunk_size); - if (buf == NULL) { - err = ENOMEM; - goto out; - } - bufsz = chk->chunk_size; - } - - if (chk->chunk_size > bufsz) { - AUTHPRNT("chunk size too big"); - err = EINVAL; - goto out; - } - - err = vn_rdwr(UIO_READ, vp, (caddr_t)buf, chk->chunk_size, offset, UIO_SYSSPACE, IO_NODELOCKED, kerncred, &resid, p); - if (err) { - AUTHPRNT("vn_rdrw fail (err = %d, resid = %d)", err, resid); - goto out; - } - if (resid) { - err = EINVAL; - AUTHPRNT("chunk covered non-existant part of image"); - goto out; - } - - /* calculate the SHA256 of this chunk */ - uint8_t sha_digest[SHA256_DIGEST_LENGTH]; - SHA256_CTX sha_ctx; - SHA256_Init(&sha_ctx); - SHA256_Update(&sha_ctx, buf, chk->chunk_size); - SHA256_Final(sha_digest, &sha_ctx); - - /* Check the calculated SHA matches the chunk list */ - if (bcmp(sha_digest, chk->chunk_sha256, SHA256_DIGEST_LENGTH) != 0) { - AUTHPRNT("SHA mismatch on chunk %lu (offset %lld, size %u)", ch, offset, chk->chunk_size); - err = EINVAL; - goto out; - } + AUTHPRNT("Cannot find %s - error %d", path, err); + } else { + nameidone(&ndp); + vp = ndp.ni_vp; - if (os_add_overflow(offset, chk->chunk_size, &offset)) { + if (vp->v_type != VREG) { err = EINVAL; - goto out; + AUTHPRNT("%s it not a regular file", path); + } else if (fsize) { + if ((err = vnode_size(vp, fsize, ctx)) != 0) { + AUTHPRNT("Cannot get file size of %s - error %d", path, err); + } } - chk++; } - if (offset != fsize) { - AUTHPRNT("chunklist did not cover entire file (offset = %lld, fsize = %lld)", offset, fsize); - err = EINVAL; - goto out; - } - -out: - kfree_safe(buf); - if (doclose) { - VNOP_CLOSE(vp, FREAD, ctx); - } - if (vp) { - vnode_put(vp); + if (err) { + *errp = err; vp = NULL; } - - return err; + return vp; } +#endif /* CONFIG_IMAGEBOOT_CHUNKLIST || CONFIG_IMAGEBOOT_CHUNKLIST */ -static int -construct_chunklist_path(const char *root_path, char **bufp) -{ - int err = 0; - char *path = NULL; - size_t len = 0; - - path = kalloc(MAXPATHLEN); - if (path == NULL) { - AUTHPRNT("failed to allocate space for chunklist path"); - err = ENOMEM; - goto out; - } +#if CONFIG_IMAGEBOOT_IMG4 - len = strnlen(root_path, MAXPATHLEN); - if (len < MAXPATHLEN && len > strlen(".dmg")) { - /* correctly terminated string with space for extension */ - } else { - AUTHPRNT("malformed root path"); - err = EINVAL; - goto out; - } +#define APTICKET_NAME "apticket.der" - len = strlcpy(path, root_path, MAXPATHLEN); - if (len >= MAXPATHLEN) { - AUTHPRNT("root path is too long"); - err = EINVAL; - goto out; - } +static char * +imgboot_get_apticket_path(const char *rootpath) +{ + size_t plen = strlen(rootpath) + sizeof(APTICKET_NAME); + char *path = kalloc(plen); - path[len - strlen(".dmg")] = '\0'; - len = strlcat(path, ".chunklist", MAXPATHLEN); - if (len >= MAXPATHLEN) { - AUTHPRNT("chunklist path is too long"); - err = EINVAL; - goto out; - } + if (path) { + char *slash; -out: - if (err) { - kfree_safe(path); - } else { - *bufp = path; + strlcpy(path, rootpath, plen); + slash = strrchr(path, '/'); + if (slash == NULL) { + slash = path; + } else { + slash++; + } + strlcpy(slash, APTICKET_NAME, sizeof(APTICKET_NAME) + 1); } - return err; + return path; } static int -authenticate_root(const char *root_path) +authenticate_root_with_img4(const char *rootpath) { - char *chunklist_path = NULL; - void *chunklist_buf = NULL; - size_t chunklist_len = 32 * 1024 * 1024UL; - int err = 0; - - err = construct_chunklist_path(root_path, &chunklist_path); - if (err) { - AUTHPRNT("failed creating chunklist path"); - goto out; - } - - AUTHDBG("validating root against chunklist %s", chunklist_path); + errno_t rv; + img4_t i4; + img4_payload_t i4pl; + vnode_t vp; + char *ticket_path; + size_t tcksz = 0; + void *tckbuf = NULL; - /* - * Read and authenticate the chunklist, then validate the root image against - * the chunklist. - */ - - AUTHDBG("reading chunklist"); - err = read_file(chunklist_path, &chunklist_buf, &chunklist_len); - if (err) { - AUTHPRNT("failed to read chunklist"); - goto out; - } - - AUTHDBG("validating chunklist"); - err = validate_chunklist(chunklist_buf, chunklist_len); - if (err < 0) { - AUTHDBG("missing or incorrect signature on chunklist"); - goto out; - } else if (err) { - AUTHPRNT("failed to validate chunklist"); - goto out; - } else { - AUTHDBG("successfully validated chunklist"); - } + DBG_TRACE("Check %s\n", rootpath); - AUTHDBG("validating root image against chunklist"); - err = validate_root_image(root_path, chunklist_buf); - if (err) { - AUTHPRNT("failed to validate root image against chunklist (%d)", err); - goto out; + if (img4if == NULL) { + AUTHPRNT("AppleImage4 is not ready"); + return EAGAIN; } - /* everything checked out - go ahead and mount this */ - AUTHDBG("root image authenticated"); - -out: - kfree_safe(chunklist_buf); - kfree_safe(chunklist_path); - return err; -} - -static const uuid_t * -getuuidfromheader_safe(const void *buf, size_t bufsz, size_t *uuidsz) -{ - const struct uuid_command *cmd = NULL; - const kernel_mach_header_t *mh = buf; - - /* space for the header and at least one load command? */ - if (bufsz < sizeof(kernel_mach_header_t) + sizeof(struct uuid_command)) { - AUTHPRNT("libkern image too small"); - return NULL; + ticket_path = imgboot_get_apticket_path(rootpath); + if (ticket_path == NULL) { + AUTHPRNT("Cannot construct ticket path - out of memory"); + return ENOMEM; } - /* validate the mach header */ - if (mh->magic != MH_MAGIC_64 || (mh->sizeofcmds > bufsz - sizeof(kernel_mach_header_t))) { - AUTHPRNT("invalid MachO header"); - return NULL; + rv = read_file(ticket_path, &tckbuf, &tcksz); + if (rv) { + AUTHPRNT("Cannot get a ticket from %s - %d\n", ticket_path, rv); + goto out_with_ticket_path; } - /* iterate the load commands */ - size_t offset = sizeof(kernel_mach_header_t); - for (size_t i = 0; i < mh->ncmds; i++) { - cmd = buf + offset; - - if (cmd->cmd == LC_UUID) { - *uuidsz = sizeof(cmd->uuid); - return &cmd->uuid; - } + DBG_TRACE("Got %d bytes of manifest from %s\n", (int)tcksz, ticket_path); - if (os_add_overflow(cmd->cmdsize, offset, &offset) || - offset > bufsz - sizeof(struct uuid_command)) { - return NULL; - } + rv = img4_init(&i4, 0, tckbuf, tcksz, NULL); + if (rv) { + AUTHPRNT("Cannot initialise verification handle - error %d", rv); + goto out_with_ticket_bytes; } - return NULL; -} - -static const char *libkern_path = "/System/Library/Extensions/System.kext/PlugIns/Libkern.kext/Libkern"; -static const char *libkern_bundle = "com.apple.kpi.libkern"; - -/* - * Check that the UUID of the libkern currently loaded matches the one on disk. - */ -static int -auth_version_check(void) -{ - int err = 0; - void *buf = NULL; - size_t bufsz = 4 * 1024 * 1024UL; - - /* get the UUID of the libkern in /S/L/E */ - - err = read_file(libkern_path, &buf, &bufsz); - if (err) { + vp = imgboot_get_image_file(rootpath, NULL, &rv); + if (vp == NULL) { + /* Error message had been printed already */ goto out; } - unsigned long uuidsz = 0; - const uuid_t *img_uuid = getuuidfromheader_safe(buf, bufsz, &uuidsz); - if (img_uuid == NULL || uuidsz != sizeof(uuid_t)) { - AUTHPRNT("invalid UUID (sz = %lu)", uuidsz); - err = EINVAL; + rv = img4_payload_init_with_vnode_4xnu(&i4pl, 'rosi', vp, I4PLF_UNWRAPPED); + if (rv) { + AUTHPRNT("failed to init payload: %d", rv); goto out; } - /* Get the UUID of the loaded libkern */ - uuid_t live_uuid; - err = OSKextGetUUIDForName(libkern_bundle, live_uuid); - if (err) { - AUTHPRNT("could not find loaded libkern"); - goto out; + rv = img4_get_trusted_external_payload(&i4, &i4pl, IMG4_ENVIRONMENT_PPL, NULL, NULL); + if (rv) { + AUTHPRNT("failed to validate root image %s: %d", rootpath, rv); } - /* ... and compare them */ - if (bcmp(live_uuid, img_uuid, uuidsz) != 0) { - AUTHPRNT("UUID of running libkern does not match %s", libkern_path); - - uuid_string_t img_uuid_str, live_uuid_str; - uuid_unparse(*img_uuid, img_uuid_str); - uuid_unparse(live_uuid, live_uuid_str); - AUTHPRNT("loaded libkern UUID = %s", live_uuid_str); - AUTHPRNT("on-disk libkern UUID = %s", img_uuid_str); - - err = EINVAL; - goto out; - } - - /* UUID matches! */ - + img4_payload_destroy(&i4pl); out: - kfree_safe(buf); - return err; + img4_destroy(&i4); +out_with_ticket_bytes: + kfree_safe(tckbuf); +out_with_ticket_path: + kfree_safe(ticket_path); + return rv; } +#endif /* CONFIG_IMAGEBOOT_IMG4 */ -#if 0 -int -auth_imgboot_test(proc_t __unused ap, struct auth_imgboot_test_args *uap, int32_t *retval) -{ - int ret = 0; - int err; - char path[MAXPATHLEN]; - vm_size_t len; - *retval = 0; - - err = copyinstr(uap->path, path, MAXPATHLEN, &len); - if (err) { - return err; - } - if (len >= MAXPATHLEN) { - return ENAMETOOLONG; - } - - AUTHDBG("authenticating root image at %s", path); - err = authenticate_root(path); - if (err) { - AUTHPRNT("root authentication FAIL (%d)", err); - ret = err; - } else { - AUTHDBG("successfully authenticated %s", path); - } - - AUTHDBG("checking root image version"); - err = auth_version_check(); - if (err) { - AUTHPRNT("root image version check FAIL (%d)", err); - err = err ?: ret; - } else { - AUTHPRNT("root version check success (%d)", err); - } - - if (ret < 0) { - return EINVAL; /* negative return values have special meaning */ - } - return ret; -} -#endif /* * Attach the image at 'path' as a ramdisk and mount it as our new rootfs. @@ -926,19 +579,77 @@ out: return err; } +/* + * If the path is in URL format then we allocate memory and decode it, + * otherwise return the same pointer. + * + * Caller is expected to check if the pointers are different. + */ +static char * +url_to_path(char *url_path) +{ + char *path = url_path; + size_t len = strlen(kIBFilePrefix); + + if (strncmp(kIBFilePrefix, url_path, len) == 0) { + /* its a URL - remove the file:// prefix and percent-decode */ + url_path += len; + + len = strlen(url_path); + if (len) { + /* Make a copy of the path to URL-decode */ + path = kalloc(len + 1); + if (path == NULL) { + panic("imageboot path allocation failed - cannot allocate %d bytes\n", (int)len); + } + + strlcpy(path, url_path, len + 1); + url_decode(path); + } else { + panic("Bogus imageboot path URL - missing path\n"); + } + + DBG_TRACE("%s: root image URL <%s> becomes %s\n", __func__, url_path, path); + } + + return path; +} + static boolean_t -imageboot_setup_new() +imageboot_setup_new(imageboot_type_t type) { int error; char *root_path = NULL; int height = 0; boolean_t done = FALSE; - boolean_t auth_root = FALSE; + boolean_t auth_root = TRUE; boolean_t ramdisk_root = FALSE; MALLOC_ZONE(root_path, caddr_t, MAXPATHLEN, M_NAMEI, M_WAITOK); assert(root_path != NULL); +#if CONFIG_LOCKERBOOT + if (type == IMAGEBOOT_LOCKER) { + if (!PE_parse_boot_argn(IMAGEBOOT_LOCKER_ARG, root_path, MAXPATHLEN)) { + panic("locker boot with no locker given"); + } + + DBG_TRACE("%s: root fsname: %s\n", __FUNCTION__, rootvnode->v_mount->mnt_vtable->vfc_name); + + /* + * The locker path is a path, not a URL, so just pass it directly to + * imageboot_mount_image(). + */ + error = imageboot_mount_image(root_path, 0, type); + if (error) { + panic("failed to mount system locker: %d", error); + } + + done = TRUE; + goto out; + } +#endif /* CONFIG_LOCKERBOOT */ + unsigned imgboot_arg; if (PE_parse_boot_argn("-rootdmg-ramdisk", &imgboot_arg, sizeof(imgboot_arg))) { ramdisk_root = TRUE; @@ -946,7 +657,7 @@ imageboot_setup_new() if (PE_parse_boot_argn(IMAGEBOOT_CONTAINER_ARG, root_path, MAXPATHLEN) == TRUE) { printf("%s: container image url is %s\n", __FUNCTION__, root_path); - error = imageboot_mount_image(root_path, height); + error = imageboot_mount_image(root_path, height, type); if (error != 0) { panic("Failed to mount container image."); } @@ -954,71 +665,65 @@ imageboot_setup_new() height++; } - if (PE_parse_boot_argn(IMAGEBOOT_AUTHROOT_ARG, root_path, MAXPATHLEN) == TRUE) { - auth_root = TRUE; - } else if (PE_parse_boot_argn(IMAGEBOOT_ROOT_ARG, root_path, MAXPATHLEN) == FALSE) { + if (PE_parse_boot_argn(IMAGEBOOT_AUTHROOT_ARG, root_path, MAXPATHLEN) == FALSE && + PE_parse_boot_argn(IMAGEBOOT_ROOT_ARG, root_path, MAXPATHLEN) == FALSE) { if (height > 0) { - panic("%s specified without %s?\n", IMAGEBOOT_CONTAINER_ARG, IMAGEBOOT_ROOT_ARG); + panic("%s specified without %s or %s?\n", IMAGEBOOT_CONTAINER_ARG, IMAGEBOOT_AUTHROOT_ARG, IMAGEBOOT_ROOT_ARG); } goto out; } - printf("%s: root image url is %s\n", __func__, root_path); + printf("%s: root image URL is '%s'\n", __func__, root_path); #if CONFIG_CSR if (auth_root && (csr_check(CSR_ALLOW_ANY_RECOVERY_OS) == 0)) { AUTHPRNT("CSR_ALLOW_ANY_RECOVERY_OS set, skipping root image authentication"); - auth_root = false; + auth_root = FALSE; } #endif /* Make a copy of the path to URL-decode */ - char *path_alloc = kalloc(MAXPATHLEN); - if (path_alloc == NULL) { - panic("imageboot path allocation failed\n"); - } - char *path = path_alloc; - - size_t len = strlen(kIBFilePrefix); - strlcpy(path, root_path, MAXPATHLEN); - if (strncmp(kIBFilePrefix, path, len) == 0) { - /* its a URL - remove the file:// prefix and percent-decode */ - path += len; - url_decode(path); - } + char *path = url_to_path(root_path); + assert(path); +#if CONFIG_IMAGEBOOT_CHUNKLIST if (auth_root) { AUTHDBG("authenticating root image at %s", path); - error = authenticate_root(path); + error = authenticate_root_with_chunklist(path); if (error) { panic("root image authentication failed (err = %d)\n", error); } AUTHDBG("successfully authenticated %s", path); } +#endif if (ramdisk_root) { error = imageboot_mount_ramdisk(path); } else { - error = imageboot_mount_image(root_path, height); + error = imageboot_mount_image(root_path, height, type); } - kfree_safe(path_alloc); + if (path != root_path) { + kfree_safe(path); + } if (error) { panic("Failed to mount root image (err=%d, auth=%d, ramdisk=%d)\n", error, auth_root, ramdisk_root); } +#if CONFIG_IMAGEBOOT_CHUNKLIST if (auth_root) { /* check that the image version matches the running kernel */ AUTHDBG("checking root image version"); - error = auth_version_check(); + error = authenticate_root_version_check(); if (error) { panic("root image version check failed"); } else { AUTHDBG("root image version matches kernel"); } } +#endif done = TRUE; @@ -1028,7 +733,7 @@ out: } __private_extern__ void -imageboot_setup() +imageboot_setup(imageboot_type_t type) { int error = 0; char *root_path = NULL; @@ -1041,11 +746,13 @@ imageboot_setup() /* * New boot-arg scheme: - * root-dmg : the dmg that will be the root filesystem. - * auth-root-dmg : same as root-dmg but with image authentication. + * root-dmg : the dmg that will be the root filesystem, authenticated by default. + * auth-root-dmg : same as root-dmg. * container-dmg : an optional dmg that contains the root-dmg. + * locker : the locker that will be the root filesystem -- mutually + * exclusive with any other boot-arg. */ - if (imageboot_setup_new()) { + if (imageboot_setup_new(type)) { return; } @@ -1059,14 +766,28 @@ imageboot_setup() * device vnode created for it, and should not show up in getfsstat() until exposed * with MNT_IMGSRC. We just make it the temporary root. */ +#if CONFIG_IMAGEBOOT_IMG4 + if (PE_parse_boot_argn("arp0", root_path, MAXPATHLEN)) { + char *path = url_to_path(root_path); + + assert(path); + + if (authenticate_root_with_img4(path)) { + panic("Root image %s does not match the manifest\n", root_path); + } + if (path != root_path) { + kfree_safe(path); + } + } else +#endif /* CONFIG_IMAGEBOOT_IMG4 */ if ((PE_parse_boot_argn("rp", root_path, MAXPATHLEN) == FALSE) && (PE_parse_boot_argn("rp0", root_path, MAXPATHLEN) == FALSE)) { panic("%s: no valid path to image.\n", __FUNCTION__); } - printf("%s: root image url is %s\n", __FUNCTION__, root_path); + DBG_TRACE("%s: root image url is %s\n", __FUNCTION__, root_path); - error = imageboot_mount_image(root_path, 0); + error = imageboot_mount_image(root_path, 0, type); if (error) { panic("Failed on first stage of imageboot."); } @@ -1084,7 +805,7 @@ imageboot_setup() * If we fail to set up second image, it's not a given that we * can safely root off the first. */ - error = imageboot_mount_image(root_path, 1); + error = imageboot_mount_image(root_path, 1, type); if (error) { panic("Failed on second stage of imageboot."); } diff --git a/bsd/kern/kdebug.c b/bsd/kern/kdebug.c index f901211e0..0110e7114 100644 --- a/bsd/kern/kdebug.c +++ b/bsd/kern/kdebug.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2016 Apple Inc. All rights reserved. + * Copyright (c) 2000-2019 Apple Inc. All rights reserved. * * @Apple_LICENSE_HEADER_START@ * @@ -37,6 +37,7 @@ #include #include #include +#include #include #include @@ -282,16 +283,16 @@ static int kdbg_setreg(kd_regtype *); static int kdbg_setpidex(kd_regtype *); static int kdbg_setpid(kd_regtype *); static void kdbg_thrmap_init(void); -static int kdbg_reinit(boolean_t); -static int kdbg_bootstrap(boolean_t); +static int kdbg_reinit(bool); +static int kdbg_bootstrap(bool); static int kdbg_test(size_t flavor); -static int kdbg_write_v1_header(boolean_t write_thread_map, vnode_t vp, vfs_context_t ctx); +static int kdbg_write_v1_header(bool write_thread_map, vnode_t vp, vfs_context_t ctx); static int kdbg_write_thread_map(vnode_t vp, vfs_context_t ctx); static int kdbg_copyout_thread_map(user_addr_t buffer, size_t *buffer_size); static void kdbg_clear_thread_map(void); -static boolean_t kdbg_wait(uint64_t timeout_ms, boolean_t locked_wait); +static bool kdbg_wait(uint64_t timeout_ms, bool locked_wait); static void kdbg_wakeup(void); int kdbg_cpumap_init_internal(kd_iop_t* iops, uint32_t cpu_count, @@ -301,7 +302,7 @@ static kd_threadmap *kdbg_thrmap_init_internal(unsigned int count, unsigned int *mapsize, unsigned int *mapcount); -static boolean_t kdebug_current_proc_enabled(uint32_t debugid); +static bool kdebug_current_proc_enabled(uint32_t debugid); static errno_t kdebug_check_trace_string(uint32_t debugid, uint64_t str_id); int kdbg_write_v3_header(user_addr_t, size_t *, int); @@ -315,7 +316,7 @@ user_addr_t kdbg_write_v3_event_chunk_header(user_addr_t buffer, uint32_t tag, // Helper functions -static int create_buffers(boolean_t); +static int create_buffers(bool); static void delete_buffers(void); extern int tasks_count; @@ -365,7 +366,7 @@ struct kd_storage { uint32_t kds_bufindx; uint32_t kds_bufcnt; uint32_t kds_readlast; - boolean_t kds_lostevents; + bool kds_lostevents; uint64_t kds_timestamp; kd_buf kds_records[EVENTS_PER_STORAGE_UNIT]; @@ -392,7 +393,7 @@ int kds_waiter = 0; struct kd_bufinfo { union kds_ptr kd_list_head; union kds_ptr kd_list_tail; - boolean_t kd_lostevents; + bool kd_lostevents; uint32_t _pad; uint64_t kd_prev_timebase; uint32_t num_bufs; @@ -496,10 +497,129 @@ struct krt { struct tts *atts; }; +/* + * TRACE file formats... + * + * RAW_VERSION0 + * + * uint32_t #threadmaps + * kd_threadmap[] + * kd_buf[] + * + * RAW_VERSION1 + * + * RAW_header, with version_no set to RAW_VERSION1 + * kd_threadmap[] + * Empty space to pad alignment to the nearest page boundary. + * kd_buf[] + * + * RAW_VERSION1+ + * + * RAW_header, with version_no set to RAW_VERSION1 + * kd_threadmap[] + * kd_cpumap_header, with version_no set to RAW_VERSION1 + * kd_cpumap[] + * Empty space to pad alignment to the nearest page boundary. + * kd_buf[] + * + * V1+ implementation details... + * + * It would have been nice to add the cpumap data "correctly", but there were + * several obstacles. Existing code attempts to parse both V1 and V0 files. + * Due to the fact that V0 has no versioning or header, the test looks like + * this: + * + * // Read header + * if (header.version_no != RAW_VERSION1) { // Assume V0 } + * + * If we add a VERSION2 file format, all existing code is going to treat that + * as a VERSION0 file when reading it, and crash terribly when trying to read + * RAW_VERSION2 threadmap entries. + * + * To differentiate between a V1 and V1+ file, read as V1 until you reach + * the padding bytes. Then: + * + * boolean_t is_v1plus = FALSE; + * if (padding_bytes >= sizeof(kd_cpumap_header)) { + * kd_cpumap_header header = // read header; + * if (header.version_no == RAW_VERSION1) { + * is_v1plus = TRUE; + * } + * } + * + */ + +#define RAW_VERSION3 0x00001000 + +// Version 3 header +// The header chunk has the tag 0x00001000 which also serves as a magic word +// that identifies the file as a version 3 trace file. The header payload is +// a set of fixed fields followed by a variable number of sub-chunks: +/* + * ____________________________________________________________________________ + | Offset | Size | Field | + | ---------------------------------------------------------------------------- + | 0 | 4 | Tag (0x00001000) | + | 4 | 4 | Sub-tag. Represents the version of the header. | + | 8 | 8 | Length of header payload (40+8x) | + | 16 | 8 | Time base info. Two 32-bit numbers, numer/denom, | + | | | for converting timestamps to nanoseconds. | + | 24 | 8 | Timestamp of trace start. | + | 32 | 8 | Wall time seconds since Unix epoch. | + | | | As returned by gettimeofday(). | + | 40 | 4 | Wall time microseconds. As returned by gettimeofday(). | + | 44 | 4 | Local time zone offset in minutes. ( " ) | + | 48 | 4 | Type of daylight savings time correction to apply. ( " ) | + | 52 | 4 | Flags. 1 = 64-bit. Remaining bits should be written | + | | | as 0 and ignored when reading. | + | 56 | 8x | Variable number of sub-chunks. None are required. | + | | | Ignore unknown chunks. | + | ---------------------------------------------------------------------------- + */ +// NOTE: The header sub-chunks are considered part of the header chunk, +// so they must be included in the header chunk’s length field. +// The CPU map is an optional sub-chunk of the header chunk. It provides +// information about the CPUs that are referenced from the trace events. +typedef struct { + uint32_t tag; + uint32_t sub_tag; + uint64_t length; + uint32_t timebase_numer; + uint32_t timebase_denom; + uint64_t timestamp; + uint64_t walltime_secs; + uint32_t walltime_usecs; + uint32_t timezone_minuteswest; + uint32_t timezone_dst; + uint32_t flags; +} __attribute__((packed)) kd_header_v3; + +typedef struct { + uint32_t tag; + uint32_t sub_tag; + uint64_t length; +} __attribute__((packed)) kd_chunk_header_v3; + +#define V3_CONFIG 0x00001b00 +#define V3_CPU_MAP 0x00001c00 +#define V3_THREAD_MAP 0x00001d00 +#define V3_RAW_EVENTS 0x00001e00 +#define V3_NULL_CHUNK 0x00002000 + +// The current version of all kernel managed chunks is 1. The +// V3_CURRENT_CHUNK_VERSION is added to ease the simple case +// when most/all the kernel managed chunks have the same version. + +#define V3_CURRENT_CHUNK_VERSION 1 +#define V3_HEADER_VERSION V3_CURRENT_CHUNK_VERSION +#define V3_CPUMAP_VERSION V3_CURRENT_CHUNK_VERSION +#define V3_THRMAP_VERSION V3_CURRENT_CHUNK_VERSION +#define V3_EVENT_DATA_VERSION V3_CURRENT_CHUNK_VERSION + typedef struct krt krt_t; static uint32_t -kdbg_cpu_count(boolean_t early_trace) +kdbg_cpu_count(bool early_trace) { if (early_trace) { #if CONFIG_EMBEDDED @@ -518,7 +638,7 @@ kdbg_cpu_count(boolean_t early_trace) #if MACH_ASSERT #if CONFIG_EMBEDDED -static boolean_t +static bool kdbg_iop_list_is_valid(kd_iop_t* iop) { if (iop) { @@ -526,7 +646,7 @@ kdbg_iop_list_is_valid(kd_iop_t* iop) kd_iop_t* temp = iop; do { assert(!temp->next || temp->next->cpu_id == temp->cpu_id - 1); - assert(temp->next || (temp->cpu_id == kdbg_cpu_count(FALSE) || temp->cpu_id == kdbg_cpu_count(TRUE))); + assert(temp->next || (temp->cpu_id == kdbg_cpu_count(false) || temp->cpu_id == kdbg_cpu_count(true))); } while ((temp = temp->next)); /* Does each entry have a function and a name? */ @@ -537,20 +657,20 @@ kdbg_iop_list_is_valid(kd_iop_t* iop) } while ((temp = temp->next)); } - return TRUE; + return true; } -static boolean_t +static bool kdbg_iop_list_contains_cpu_id(kd_iop_t* list, uint32_t cpu_id) { while (list) { if (list->cpu_id == cpu_id) { - return TRUE; + return true; } list = list->next; } - return FALSE; + return false; } #endif /* CONFIG_EMBEDDED */ #endif /* MACH_ASSERT */ @@ -564,16 +684,25 @@ kdbg_iop_list_callback(kd_iop_t* iop, kd_callback_type type, void* arg) } } -static lck_grp_t *kdebug_lck_grp = NULL; +static lck_grp_t *kdebug_lck_grp = NULL; static void -kdbg_set_tracing_enabled(boolean_t enabled, uint32_t trace_type) +kdbg_set_tracing_enabled(bool enabled, uint32_t trace_type) { - int s = ml_set_interrupts_enabled(FALSE); + /* + * Drain any events from IOPs before making the state change. On + * enabling, this removes any stale events from before tracing. On + * disabling, this saves any events up to the point tracing is disabled. + */ + kdbg_iop_list_callback(kd_ctrl_page.kdebug_iops, KD_CALLBACK_SYNC_FLUSH, + NULL); + + int s = ml_set_interrupts_enabled(false); lck_spin_lock_grp(kds_spin_lock, kdebug_lck_grp); + if (enabled) { /* - * The oldest valid time is now; reject old events from IOPs. + * The oldest valid time is now; reject past events from IOPs. */ kd_ctrl_page.oldest_time = kdbg_timestamp(); kdebug_enable |= trace_type; @@ -590,22 +719,18 @@ kdbg_set_tracing_enabled(boolean_t enabled, uint32_t trace_type) ml_set_interrupts_enabled(s); if (enabled) { - kdbg_iop_list_callback(kd_ctrl_page.kdebug_iops, KD_CALLBACK_KDEBUG_ENABLED, NULL); + kdbg_iop_list_callback(kd_ctrl_page.kdebug_iops, + KD_CALLBACK_KDEBUG_ENABLED, NULL); } else { - /* - * If you do not flush the IOP trace buffers, they can linger - * for a considerable period; consider code which disables and - * deallocates without a final sync flush. - */ - kdbg_iop_list_callback(kd_ctrl_page.kdebug_iops, KD_CALLBACK_KDEBUG_DISABLED, NULL); - kdbg_iop_list_callback(kd_ctrl_page.kdebug_iops, KD_CALLBACK_SYNC_FLUSH, NULL); + kdbg_iop_list_callback(kd_ctrl_page.kdebug_iops, + KD_CALLBACK_KDEBUG_DISABLED, NULL); } } static void -kdbg_set_flags(int slowflag, int enableflag, boolean_t enabled) +kdbg_set_flags(int slowflag, int enableflag, bool enabled) { - int s = ml_set_interrupts_enabled(FALSE); + int s = ml_set_interrupts_enabled(false); lck_spin_lock_grp(kds_spin_lock, kdebug_lck_grp); if (enabled) { @@ -623,11 +748,11 @@ kdbg_set_flags(int slowflag, int enableflag, boolean_t enabled) /* * Disable wrapping and return true if trace wrapped, false otherwise. */ -static boolean_t +static bool disable_wrap(uint32_t *old_slowcheck, uint32_t *old_flags) { - boolean_t wrapped; - int s = ml_set_interrupts_enabled(FALSE); + bool wrapped; + int s = ml_set_interrupts_enabled(false); lck_spin_lock_grp(kds_spin_lock, kdebug_lck_grp); *old_slowcheck = kd_ctrl_page.kdebug_slowcheck; @@ -646,7 +771,7 @@ disable_wrap(uint32_t *old_slowcheck, uint32_t *old_flags) static void enable_wrap(uint32_t old_slowcheck) { - int s = ml_set_interrupts_enabled(FALSE); + int s = ml_set_interrupts_enabled(false); lck_spin_lock_grp(kds_spin_lock, kdebug_lck_grp); kd_ctrl_page.kdebug_flags &= ~KDBG_NOWRAP; @@ -660,7 +785,7 @@ enable_wrap(uint32_t old_slowcheck) } static int -create_buffers(boolean_t early_trace) +create_buffers(bool early_trace) { unsigned int i; unsigned int p_buffer_size; @@ -769,7 +894,7 @@ create_buffers(boolean_t early_trace) for (i = 0; i < kd_ctrl_page.kdebug_cpus; i++) { kdbip[i].kd_list_head.raw = KDS_PTR_NULL; kdbip[i].kd_list_tail.raw = KDS_PTR_NULL; - kdbip[i].kd_lostevents = FALSE; + kdbip[i].kd_lostevents = false; kdbip[i].num_bufs = 0; } @@ -828,7 +953,7 @@ release_storage_unit(int cpu, uint32_t kdsp_raw) kdsp.raw = kdsp_raw; - s = ml_set_interrupts_enabled(FALSE); + s = ml_set_interrupts_enabled(false); lck_spin_lock_grp(kds_spin_lock, kdebug_lck_grp); kdbp = &kdbip[cpu]; @@ -856,18 +981,17 @@ release_storage_unit(int cpu, uint32_t kdsp_raw) ml_set_interrupts_enabled(s); } - -boolean_t +bool allocate_storage_unit(int cpu) { union kds_ptr kdsp; struct kd_storage *kdsp_actual, *kdsp_next_actual; struct kd_bufinfo *kdbp, *kdbp_vict, *kdbp_try; uint64_t oldest_ts, ts; - boolean_t retval = TRUE; + bool retval = true; int s = 0; - s = ml_set_interrupts_enabled(FALSE); + s = ml_set_interrupts_enabled(false); lck_spin_lock_grp(kds_spin_lock, kdebug_lck_grp); kdbp = &kdbip[cpu]; @@ -896,8 +1020,8 @@ allocate_storage_unit(int cpu) */ if (kd_ctrl_page.kdebug_flags & KDBG_NOWRAP) { kd_ctrl_page.kdebug_slowcheck |= SLOW_NOLOG; - kdbp->kd_lostevents = TRUE; - retval = FALSE; + kdbp->kd_lostevents = true; + retval = false; goto out; } kdbp_vict = NULL; @@ -941,7 +1065,7 @@ allocate_storage_unit(int cpu) kdebug_enable = 0; kd_ctrl_page.enabled = 0; commpage_update_kdebug_state(); - retval = FALSE; + retval = false; goto out; } kdsp = kdbp_vict->kd_list_head; @@ -950,9 +1074,9 @@ allocate_storage_unit(int cpu) if (kdbp_vict->kd_list_head.raw != KDS_PTR_NULL) { kdsp_next_actual = POINTER_FROM_KDS_PTR(kdbp_vict->kd_list_head); - kdsp_next_actual->kds_lostevents = TRUE; + kdsp_next_actual->kds_lostevents = true; } else { - kdbp_vict->kd_lostevents = TRUE; + kdbp_vict->kd_lostevents = true; } if (kd_ctrl_page.oldest_time < oldest_ts) { @@ -966,7 +1090,7 @@ allocate_storage_unit(int cpu) kdsp_actual->kds_readlast = 0; kdsp_actual->kds_lostevents = kdbp->kd_lostevents; - kdbp->kd_lostevents = FALSE; + kdbp->kd_lostevents = false; kdsp_actual->kds_bufindx = 0; if (kdbp->kd_list_head.raw == KDS_PTR_NULL) { @@ -995,7 +1119,7 @@ kernel_debug_register_callback(kd_callback_t callback) * Remove when fixed. */ { - boolean_t is_valid_name = FALSE; + bool is_valid_name = false; for (uint32_t length = 0; length < sizeof(callback.iop_name); ++length) { /* This is roughly isprintable(c) */ if (callback.iop_name[length] > 0x20 && callback.iop_name[length] < 0x7F) { @@ -1003,7 +1127,7 @@ kernel_debug_register_callback(kd_callback_t callback) } if (callback.iop_name[length] == 0) { if (length) { - is_valid_name = TRUE; + is_valid_name = true; } break; } @@ -1026,7 +1150,7 @@ kernel_debug_register_callback(kd_callback_t callback) * TLDR; Must not read kd_iops more than once per loop. */ iop->next = kd_iops; - iop->cpu_id = iop->next ? (iop->next->cpu_id + 1) : kdbg_cpu_count(FALSE); + iop->cpu_id = iop->next ? (iop->next->cpu_id + 1) : kdbg_cpu_count(false); /* * Header says OSCompareAndSwapPtr has a memory barrier @@ -1128,7 +1252,7 @@ retry_q: } if (kdsp_actual == NULL || bindx >= EVENTS_PER_STORAGE_UNIT) { - if (allocate_storage_unit(coreid) == FALSE) { + if (allocate_storage_unit(coreid) == false) { /* * this can only happen if wrapping * has been disabled @@ -1308,7 +1432,7 @@ retry_q: } if (kdsp_actual == NULL || bindx >= EVENTS_PER_STORAGE_UNIT) { - if (allocate_storage_unit(cpu) == FALSE) { + if (allocate_storage_unit(cpu) == false) { /* * this can only happen if wrapping * has been disabled @@ -1357,6 +1481,7 @@ out1: } } +__attribute__((noinline)) void kernel_debug( uint32_t debugid, @@ -1370,6 +1495,7 @@ kernel_debug( (uintptr_t)thread_tid(current_thread()), 0); } +__attribute__((noinline)) void kernel_debug1( uint32_t debugid, @@ -1382,6 +1508,7 @@ kernel_debug1( kernel_debug_internal(debugid, arg1, arg2, arg3, arg4, arg5, 0); } +__attribute__((noinline)) void kernel_debug_flags( uint32_t debugid, @@ -1395,6 +1522,7 @@ kernel_debug_flags( (uintptr_t)thread_tid(current_thread()), flags); } +__attribute__((noinline)) void kernel_debug_filtered( uint32_t debugid, @@ -1560,7 +1688,7 @@ void kernel_debug_disable(void) { if (kdebug_enable) { - kdbg_set_tracing_enabled(FALSE, 0); + kdbg_set_tracing_enabled(false, 0); } } @@ -1624,7 +1752,7 @@ kdebug_typefilter(__unused struct proc* p, * the first atomic load test of Global Typefilter Ptr, this function * can then safely use the remaining global state without atomic checks. */ - if (!__c11_atomic_load((_Atomic typefilter_t *)&kdbg_typefilter, memory_order_acquire)) { + if (!os_atomic_load(&kdbg_typefilter, acquire)) { return EINVAL; } @@ -1643,7 +1771,7 @@ kdebug_typefilter(__unused struct proc* p, VM_KERN_MEMORY_NONE, kdbg_typefilter_memory_entry, // port (memory entry!) 0, // offset (in memory entry) - FALSE, // should copy + false, // should copy VM_PROT_READ, // cur_prot VM_PROT_READ, // max_prot VM_INHERIT_SHARE)); // inherit behavior on fork @@ -1787,18 +1915,18 @@ kernel_debug_string_internal(uint32_t debugid, uint64_t str_id, void *vstr, * Trace system and scheduling events circumvent this check, as do events * emitted in interrupt context. */ -static boolean_t +static bool kdebug_current_proc_enabled(uint32_t debugid) { /* can't determine current process in interrupt context */ if (ml_at_interrupt_context()) { - return TRUE; + return true; } /* always emit trace system and scheduling events */ if ((KDBG_EXTRACT_CLASS(debugid) == DBG_TRACE || (debugid & KDBG_CSC_MASK) == MACHDBG_CODE(DBG_MACH_SCHED, 0))) { - return TRUE; + return true; } if (kd_ctrl_page.kdebug_flags & KDBG_PIDCHECK) { @@ -1806,52 +1934,58 @@ kdebug_current_proc_enabled(uint32_t debugid) /* only the process with the kdebug bit set is allowed */ if (cur_proc && !(cur_proc->p_kdebug)) { - return FALSE; + return false; } } else if (kd_ctrl_page.kdebug_flags & KDBG_PIDEXCLUDE) { proc_t cur_proc = current_proc(); /* every process except the one with the kdebug bit set is allowed */ if (cur_proc && cur_proc->p_kdebug) { - return FALSE; + return false; } } - return TRUE; + return true; } -boolean_t +bool kdebug_debugid_enabled(uint32_t debugid) { /* if no filtering is enabled */ if (!kd_ctrl_page.kdebug_slowcheck) { - return TRUE; + return true; } return kdebug_debugid_explicitly_enabled(debugid); } -boolean_t +bool kdebug_debugid_explicitly_enabled(uint32_t debugid) { if (kd_ctrl_page.kdebug_flags & KDBG_TYPEFILTER_CHECK) { return typefilter_is_debugid_allowed(kdbg_typefilter, debugid); } else if (KDBG_EXTRACT_CLASS(debugid) == DBG_TRACE) { - return TRUE; + return true; } else if (kd_ctrl_page.kdebug_flags & KDBG_RANGECHECK) { if (debugid < kdlog_beg || debugid > kdlog_end) { - return FALSE; + return false; } } else if (kd_ctrl_page.kdebug_flags & KDBG_VALCHECK) { if ((debugid & KDBG_EVENTID_MASK) != kdlog_value1 && (debugid & KDBG_EVENTID_MASK) != kdlog_value2 && (debugid & KDBG_EVENTID_MASK) != kdlog_value3 && (debugid & KDBG_EVENTID_MASK) != kdlog_value4) { - return FALSE; + return false; } } - return TRUE; + return true; +} + +bool +kdebug_using_continuous_time(void) +{ + return kdebug_enable & KDEBUG_ENABLE_CONT_TIME; } /* @@ -2006,7 +2140,7 @@ kdbg_lock_init(void) } int -kdbg_bootstrap(boolean_t early_trace) +kdbg_bootstrap(bool early_trace) { kd_ctrl_page.kdebug_flags &= ~KDBG_WRAPPED; @@ -2014,7 +2148,7 @@ kdbg_bootstrap(boolean_t early_trace) } int -kdbg_reinit(boolean_t early_trace) +kdbg_reinit(bool early_trace) { int ret = 0; @@ -2060,12 +2194,9 @@ kdbg_trace_data(struct proc *proc, long *arg_pid, long *arg_uniqueid) void -kdbg_trace_string(struct proc *proc, long *arg1, long *arg2, long *arg3, long *arg4) +kdbg_trace_string(struct proc *proc, long *arg1, long *arg2, long *arg3, + long *arg4) { - char *dbg_nameptr; - int dbg_namelen; - long dbg_parms[4]; - if (!proc) { *arg1 = 0; *arg2 = 0; @@ -2073,26 +2204,22 @@ kdbg_trace_string(struct proc *proc, long *arg1, long *arg2, long *arg3, long *a *arg4 = 0; return; } - /* - * Collect the pathname for tracing - */ - dbg_nameptr = proc->p_comm; - dbg_namelen = (int)strlen(proc->p_comm); - dbg_parms[0] = 0L; - dbg_parms[1] = 0L; - dbg_parms[2] = 0L; - dbg_parms[3] = 0L; - if (dbg_namelen > (int)sizeof(dbg_parms)) { - dbg_namelen = (int)sizeof(dbg_parms); + const char *procname = proc_best_name(proc); + size_t namelen = strlen(procname); + + long args[4] = { 0 }; + + if (namelen > sizeof(args)) { + namelen = sizeof(args); } - strncpy((char *)dbg_parms, dbg_nameptr, dbg_namelen); + strncpy((char *)args, procname, namelen); - *arg1 = dbg_parms[0]; - *arg2 = dbg_parms[1]; - *arg3 = dbg_parms[2]; - *arg4 = dbg_parms[3]; + *arg1 = args[0]; + *arg2 = args[1]; + *arg3 = args[2]; + *arg4 = args[3]; } static void @@ -2394,7 +2521,7 @@ kdbg_setpid(kd_regtype *kdr) */ kd_ctrl_page.kdebug_flags |= KDBG_PIDCHECK; kd_ctrl_page.kdebug_flags &= ~KDBG_PIDEXCLUDE; - kdbg_set_flags(SLOW_CHECKS, 0, TRUE); + kdbg_set_flags(SLOW_CHECKS, 0, true); p->p_kdebug = 1; } else { @@ -2436,7 +2563,7 @@ kdbg_setpidex(kd_regtype *kdr) */ kd_ctrl_page.kdebug_flags |= KDBG_PIDEXCLUDE; kd_ctrl_page.kdebug_flags &= ~KDBG_PIDCHECK; - kdbg_set_flags(SLOW_CHECKS, 0, TRUE); + kdbg_set_flags(SLOW_CHECKS, 0, true); p->p_kdebug = 1; } else { @@ -2490,7 +2617,7 @@ kdbg_initialize_typefilter(typefilter_t tf) * that any non-null kdbg_typefilter means a * valid memory_entry is available. */ - __c11_atomic_store(((_Atomic typefilter_t*)&kdbg_typefilter), tf, memory_order_release); + os_atomic_store(&kdbg_typefilter, tf, release); return KERN_SUCCESS; } @@ -2552,7 +2679,7 @@ kdbg_enable_typefilter(void) assert(kdbg_typefilter); kd_ctrl_page.kdebug_flags &= ~(KDBG_RANGECHECK | KDBG_VALCHECK); kd_ctrl_page.kdebug_flags |= KDBG_TYPEFILTER_CHECK; - kdbg_set_flags(SLOW_CHECKS, 0, TRUE); + kdbg_set_flags(SLOW_CHECKS, 0, true); commpage_update_kdebug_state(); } @@ -2567,9 +2694,9 @@ kdbg_disable_typefilter(void) kd_ctrl_page.kdebug_flags &= ~KDBG_TYPEFILTER_CHECK; if ((kd_ctrl_page.kdebug_flags & (KDBG_PIDCHECK | KDBG_PIDEXCLUDE))) { - kdbg_set_flags(SLOW_CHECKS, 0, TRUE); + kdbg_set_flags(SLOW_CHECKS, 0, true); } else { - kdbg_set_flags(SLOW_CHECKS, 0, FALSE); + kdbg_set_flags(SLOW_CHECKS, 0, false); } commpage_update_kdebug_state(); @@ -2613,7 +2740,7 @@ kdbg_setreg(kd_regtype * kdr) kd_ctrl_page.kdebug_flags &= (unsigned int)~KDBG_CKTYPES; kd_ctrl_page.kdebug_flags &= ~KDBG_VALCHECK; /* Turn off specific value check */ kd_ctrl_page.kdebug_flags |= (KDBG_RANGECHECK | KDBG_CLASSTYPE); - kdbg_set_flags(SLOW_CHECKS, 0, TRUE); + kdbg_set_flags(SLOW_CHECKS, 0, true); break; case KDBG_SUBCLSTYPE: val_1 = (kdr->value1 & 0xff); @@ -2624,7 +2751,7 @@ kdbg_setreg(kd_regtype * kdr) kd_ctrl_page.kdebug_flags &= (unsigned int)~KDBG_CKTYPES; kd_ctrl_page.kdebug_flags &= ~KDBG_VALCHECK; /* Turn off specific value check */ kd_ctrl_page.kdebug_flags |= (KDBG_RANGECHECK | KDBG_SUBCLSTYPE); - kdbg_set_flags(SLOW_CHECKS, 0, TRUE); + kdbg_set_flags(SLOW_CHECKS, 0, true); break; case KDBG_RANGETYPE: kdlog_beg = (kdr->value1); @@ -2632,7 +2759,7 @@ kdbg_setreg(kd_regtype * kdr) kd_ctrl_page.kdebug_flags &= (unsigned int)~KDBG_CKTYPES; kd_ctrl_page.kdebug_flags &= ~KDBG_VALCHECK; /* Turn off specific value check */ kd_ctrl_page.kdebug_flags |= (KDBG_RANGECHECK | KDBG_RANGETYPE); - kdbg_set_flags(SLOW_CHECKS, 0, TRUE); + kdbg_set_flags(SLOW_CHECKS, 0, true); break; case KDBG_VALCHECK: kdlog_value1 = (kdr->value1); @@ -2642,7 +2769,7 @@ kdbg_setreg(kd_regtype * kdr) kd_ctrl_page.kdebug_flags &= (unsigned int)~KDBG_CKTYPES; kd_ctrl_page.kdebug_flags &= ~KDBG_RANGECHECK; /* Turn off range check */ kd_ctrl_page.kdebug_flags |= KDBG_VALCHECK; /* Turn on specific value check */ - kdbg_set_flags(SLOW_CHECKS, 0, TRUE); + kdbg_set_flags(SLOW_CHECKS, 0, true); break; case KDBG_TYPENONE: kd_ctrl_page.kdebug_flags &= (unsigned int)~KDBG_CKTYPES; @@ -2650,9 +2777,9 @@ kdbg_setreg(kd_regtype * kdr) if ((kd_ctrl_page.kdebug_flags & (KDBG_RANGECHECK | KDBG_VALCHECK | KDBG_PIDCHECK | KDBG_PIDEXCLUDE | KDBG_TYPEFILTER_CHECK))) { - kdbg_set_flags(SLOW_CHECKS, 0, TRUE); + kdbg_set_flags(SLOW_CHECKS, 0, true); } else { - kdbg_set_flags(SLOW_CHECKS, 0, FALSE); + kdbg_set_flags(SLOW_CHECKS, 0, false); } kdlog_beg = 0; @@ -2705,25 +2832,7 @@ write_error: return ret; } -int -kdbg_write_v3_chunk_header_to_buffer(void * buffer, uint32_t tag, uint32_t sub_tag, uint64_t length) -{ - kd_chunk_header_v3 header = { - .tag = tag, - .sub_tag = sub_tag, - .length = length, - }; - - if (!buffer) { - return 0; - } - - memcpy(buffer, &header, sizeof(kd_chunk_header_v3)); - - return sizeof(kd_chunk_header_v3); -} - -int +static int kdbg_write_v3_chunk_to_fd(uint32_t tag, uint32_t sub_tag, uint64_t length, void *payload, uint64_t payload_size, int fd) { proc_t p; @@ -3006,7 +3115,7 @@ kdbg_readcurthrmap(user_addr_t buffer, size_t *bufsize) } static int -kdbg_write_v1_header(boolean_t write_thread_map, vnode_t vp, vfs_context_t ctx) +kdbg_write_v1_header(bool write_thread_map, vnode_t vp, vfs_context_t ctx) { int ret = 0; RAW_header header; @@ -3175,7 +3284,7 @@ static int kdbg_write_thread_map(vnode_t vp, vfs_context_t ctx) { int ret = 0; - boolean_t map_initialized; + bool map_initialized; ktrace_assert_lock_held(); assert(ctx != NULL); @@ -3204,7 +3313,7 @@ kdbg_write_thread_map(vnode_t vp, vfs_context_t ctx) static int kdbg_copyout_thread_map(user_addr_t buffer, size_t *buffer_size) { - boolean_t map_initialized; + bool map_initialized; size_t map_size; int ret = 0; @@ -3233,7 +3342,7 @@ int kdbg_readthrmap_v3(user_addr_t buffer, size_t buffer_size, int fd) { int ret = 0; - boolean_t map_initialized; + bool map_initialized; size_t map_size; ktrace_assert_lock_held(); @@ -3278,8 +3387,8 @@ kdbg_set_nkdbufs(unsigned int req_nkdbufs) * * Called with `ktrace_lock` locked and interrupts enabled. */ -static boolean_t -kdbg_wait(uint64_t timeout_ms, boolean_t locked_wait) +static bool +kdbg_wait(uint64_t timeout_ms, bool locked_wait) { int wait_result = THREAD_AWAKENED; uint64_t abstime = 0; @@ -3292,7 +3401,7 @@ kdbg_wait(uint64_t timeout_ms, boolean_t locked_wait) clock_absolutetime_interval_to_deadline(abstime, &abstime); } - boolean_t s = ml_set_interrupts_enabled(FALSE); + bool s = ml_set_interrupts_enabled(false); if (!s) { panic("kdbg_wait() called with interrupts disabled"); } @@ -3317,7 +3426,7 @@ kdbg_wait(uint64_t timeout_ms, boolean_t locked_wait) } /* check the count under the spinlock */ - boolean_t threshold_exceeded = (kd_ctrl_page.kds_inuse_count >= n_storage_threshold); + bool threshold_exceeded = (kd_ctrl_page.kds_inuse_count >= n_storage_threshold); lck_spin_unlock(kdw_spin_lock); ml_set_interrupts_enabled(s); @@ -3338,7 +3447,7 @@ kdbg_wait(uint64_t timeout_ms, boolean_t locked_wait) static void kdbg_wakeup(void) { - boolean_t need_kds_wakeup = FALSE; + bool need_kds_wakeup = false; /* * Try to take the lock here to synchronize with the waiter entering @@ -3348,20 +3457,20 @@ kdbg_wakeup(void) * conditions. No problem if we fail, there will be lots of additional * events coming in that will eventually succeed in grabbing this lock. */ - boolean_t s = ml_set_interrupts_enabled(FALSE); + bool s = ml_set_interrupts_enabled(false); if (lck_spin_try_lock(kdw_spin_lock)) { if (kds_waiter && (kd_ctrl_page.kds_inuse_count >= n_storage_threshold)) { kds_waiter = 0; - need_kds_wakeup = TRUE; + need_kds_wakeup = true; } lck_spin_unlock(kdw_spin_lock); } ml_set_interrupts_enabled(s); - if (need_kds_wakeup == TRUE) { + if (need_kds_wakeup == true) { wakeup(&kds_waiter); } } @@ -3493,7 +3602,7 @@ kdbg_control(int *name, u_int namelen, user_addr_t where, size_t *sizep) } kdbg_thrmap_init(); - kdbg_set_tracing_enabled(TRUE, value); + kdbg_set_tracing_enabled(true, value); } else { if (!kdebug_enable) { break; @@ -3508,7 +3617,7 @@ kdbg_control(int *name, u_int namelen, user_addr_t where, size_t *sizep) break; case KERN_KDSETUP: - ret = kdbg_reinit(FALSE); + ret = kdbg_reinit(false); break; case KERN_KDREMOVE: @@ -3548,7 +3657,7 @@ kdbg_control(int *name, u_int namelen, user_addr_t where, size_t *sizep) int fd; if (name[0] == KERN_KDWRITETR || name[0] == KERN_KDWRITETR_V3) { - (void)kdbg_wait(size, TRUE); + (void)kdbg_wait(size, true); } p = current_proc(); fd = value; @@ -3601,7 +3710,7 @@ kdbg_control(int *name, u_int namelen, user_addr_t where, size_t *sizep) break; } case KERN_KDBUFWAIT: - *sizep = kdbg_wait(size, FALSE); + *sizep = kdbg_wait(size, false); break; case KERN_KDPIDTR: @@ -3683,8 +3792,8 @@ kdbg_read(user_addr_t buffer, size_t *number, vnode_t vp, vfs_context_t ctx, uin uint32_t tempbuf_number; uint32_t old_kdebug_flags; uint32_t old_kdebug_slowcheck; - boolean_t out_of_events = FALSE; - boolean_t wrapped = FALSE; + bool out_of_events = false; + bool wrapped = false; assert(number); count = *number / sizeof(kd_buf); @@ -3701,22 +3810,19 @@ kdbg_read(user_addr_t buffer, size_t *number, vnode_t vp, vfs_context_t ctx, uin memset(&lostevent, 0, sizeof(lostevent)); lostevent.debugid = TRACE_LOST_EVENTS; - /* - * Capture the current time. Only sort events that have occured - * before now. Since the IOPs are being flushed here, it is possible - * that events occur on the AP while running live tracing. If we are - * disabled, no new events should occur on the AP. - */ - if (kd_ctrl_page.enabled) { - barrier_max = kdbg_timestamp() & KDBG_TIMESTAMP_MASK; - } - /* * Request each IOP to provide us with up to date entries before merging * buffers together. */ kdbg_iop_list_callback(kd_ctrl_page.kdebug_iops, KD_CALLBACK_SYNC_FLUSH, NULL); + /* + * Capture the current time. Only sort events that have occured + * before now. Since the IOPs are being flushed here, it is possible + * that events occur on the AP while running live tracing. + */ + barrier_max = kdbg_timestamp() & KDBG_TIMESTAMP_MASK; + /* * Disable wrap so storage units cannot be stolen out from underneath us * while merging events. @@ -3749,7 +3855,7 @@ kdbg_read(user_addr_t buffer, size_t *number, vnode_t vp, vfs_context_t ctx, uin continue; } kdsp_actual = POINTER_FROM_KDS_PTR(kdsp); - kdsp_actual->kds_lostevents = FALSE; + kdsp_actual->kds_lostevents = false; } } /* @@ -3771,7 +3877,7 @@ kdbg_read(user_addr_t buffer, size_t *number, vnode_t vp, vfs_context_t ctx, uin */ kdbg_set_timestamp_and_cpu(&lostevent, barrier_min, 0); *tempbuf = lostevent; - wrapped = FALSE; + wrapped = false; goto nextevent; } @@ -3809,7 +3915,7 @@ next_event: */ if (kdsp_actual->kds_lostevents) { lostevents = true; - kdsp_actual->kds_lostevents = FALSE; + kdsp_actual->kds_lostevents = false; /* * The earliest event we can trust is the first one in this @@ -3831,7 +3937,7 @@ next_event: t = kdbg_get_timestamp(&kdsp_actual->kds_records[rcursor]); - if ((t > barrier_max) && (barrier_max > 0)) { + if (t > barrier_max) { if (kdbg_debug) { printf("kdebug: FUTURE EVENT: debugid %#8x: " "time %lld from CPU %u " @@ -3839,12 +3945,7 @@ next_event: kdsp_actual->kds_records[rcursor].debugid, t, cpu, barrier_max, *number + tempbuf_number); } - /* - * Need to flush IOPs again before we can sort any more - * data from the buffers. - */ - out_of_events = TRUE; - break; + goto next_cpu; } if (t < kdsp_actual->kds_timestamp) { /* @@ -3857,7 +3958,7 @@ next_event: * Bail out so we don't get out-of-order events by * continuing to read events from other CPUs' events. */ - out_of_events = TRUE; + out_of_events = true; break; } @@ -3867,6 +3968,13 @@ next_event: */ if (t < barrier_min) { kdsp_actual->kds_readlast++; + if (kdbg_debug) { + printf("kdebug: PAST EVENT: debugid %#8x: " + "time %lld from CPU %u " + "(barrier at time %lld)\n", + kdsp_actual->kds_records[rcursor].debugid, + t, cpu, barrier_min); + } if (kdsp_actual->kds_readlast >= EVENTS_PER_STORAGE_UNIT) { release_storage_unit(cpu, kdsp.raw); @@ -3906,7 +4014,7 @@ next_event: } if (min_kdbp == NULL) { /* All buffers ran empty. */ - out_of_events = TRUE; + out_of_events = true; } if (out_of_events) { break; @@ -4000,7 +4108,7 @@ check_error: count -= tempbuf_number; *number += tempbuf_number; } - if (out_of_events == TRUE) { + if (out_of_events == true) { /* * all trace buffers are empty */ @@ -4018,13 +4126,37 @@ check_error: return error; } +#define KDEBUG_TEST_CODE(code) BSDDBG_CODE(DBG_BSD_KDEBUG_TEST, (code)) + +/* + * A test IOP for the SYNC_FLUSH callback. + */ + +static int sync_flush_iop = 0; + +static void +sync_flush_callback(void * __unused context, kd_callback_type reason, + void * __unused arg) +{ + assert(sync_flush_iop > 0); + + if (reason == KD_CALLBACK_SYNC_FLUSH) { + kernel_debug_enter(sync_flush_iop, KDEBUG_TEST_CODE(0xff), + kdbg_timestamp(), 0, 0, 0, 0, 0); + } +} + +static struct kd_callback sync_flush_kdcb = { + .func = sync_flush_callback, + .iop_name = "test_sf", +}; + static int kdbg_test(size_t flavor) { int code = 0; int dummy_iop = 0; -#define KDEBUG_TEST_CODE(code) BSDDBG_CODE(DBG_BSD_KDEBUG_TEST, (code)) switch (flavor) { case 1: /* try each macro */ @@ -4067,25 +4199,40 @@ kdbg_test(size_t flavor) /* ensure old timestamps are not emitted from kernel_debug_enter */ kernel_debug_enter(dummy_iop, KDEBUG_TEST_CODE(code), - 100 /* very old timestamp */, 0, 0, 0, - 0, (uintptr_t)thread_tid(current_thread())); + 100 /* very old timestamp */, 0, 0, 0, 0, 0); code++; kernel_debug_enter(dummy_iop, KDEBUG_TEST_CODE(code), - kdbg_timestamp(), 0, 0, 0, 0, - (uintptr_t)thread_tid(current_thread())); + kdbg_timestamp(), 0, 0, 0, 0, 0); code++; break; + case 3: + if (kd_ctrl_page.kdebug_iops) { + dummy_iop = kd_ctrl_page.kdebug_iops[0].cpu_id; + } + kernel_debug_enter(dummy_iop, KDEBUG_TEST_CODE(code), + kdbg_timestamp() * 2 /* !!! */, 0, 0, 0, 0, 0); + break; + + case 4: + if (!sync_flush_iop) { + sync_flush_iop = kernel_debug_register_callback( + sync_flush_kdcb); + assert(sync_flush_iop > 0); + } + break; + default: return ENOTSUP; } -#undef KDEBUG_TEST_CODE return 0; } +#undef KDEBUG_TEST_CODE + void -kdebug_init(unsigned int n_events, char *filter_desc, boolean_t wrapping) +kdebug_init(unsigned int n_events, char *filter_desc, bool wrapping) { assert(filter_desc != NULL); @@ -4105,7 +4252,7 @@ kdebug_init(unsigned int n_events, char *filter_desc, boolean_t wrapping) n_events = 200000; } - kdebug_trace_start(n_events, filter_desc, wrapping, FALSE); + kdebug_trace_start(n_events, filter_desc, wrapping, false); } static void @@ -4179,7 +4326,7 @@ kdbg_set_typefilter_string(const char *filter_desc) */ void kdebug_trace_start(unsigned int n_events, const char *filter_desc, - boolean_t wrapping, boolean_t at_wake) + bool wrapping, bool at_wake) { if (!n_events) { kd_early_done = true; @@ -4196,7 +4343,7 @@ kdebug_trace_start(unsigned int n_events, const char *filter_desc, kernel_debug_string_early("start_kern_tracing"); - if (kdbg_reinit(TRUE)) { + if (kdbg_reinit(true)) { printf("error from kdbg_reinit, kernel tracing not started\n"); goto out; } @@ -4221,13 +4368,13 @@ kdebug_trace_start(unsigned int n_events, const char *filter_desc, * Hold off interrupts between getting a thread map and enabling trace * and until the early traces are recorded. */ - boolean_t s = ml_set_interrupts_enabled(FALSE); + bool s = ml_set_interrupts_enabled(false); if (at_wake) { kdbg_thrmap_init(); } - kdbg_set_tracing_enabled(TRUE, KDEBUG_ENABLE_TRACE | (kdebug_serial ? + kdbg_set_tracing_enabled(true, KDEBUG_ENABLE_TRACE | (kdebug_serial ? KDEBUG_ENABLE_SERIAL : 0)); if (!at_wake) { @@ -4369,7 +4516,7 @@ binary_search(uint32_t id) low = 0; high = (int)(sizeof(kd_events) / sizeof(kd_event_t)) - 1; - while (TRUE) { + while (true) { mid = (low + high) / 2; if (low > high) { diff --git a/bsd/kern/kern_aio.c b/bsd/kern/kern_aio.c index ee523dff5..3ad8a516d 100644 --- a/bsd/kern/kern_aio.c +++ b/bsd/kern/kern_aio.c @@ -1339,6 +1339,44 @@ out: return result; } +/* + * validate user_sigevent. at this point we only support + * sigev_notify equal to SIGEV_SIGNAL or SIGEV_NONE. this means + * sigev_value, sigev_notify_function, and sigev_notify_attributes + * are ignored, since SIGEV_THREAD is unsupported. This is consistent + * with no [RTS] (RalTime Signal) option group support. + */ +static int +aio_sigev_validate( const struct user_sigevent *sigev ) +{ + switch (sigev->sigev_notify) { + case SIGEV_SIGNAL: + { + int signum; + + /* make sure we have a valid signal number */ + signum = sigev->sigev_signo; + if (signum <= 0 || signum >= NSIG || + signum == SIGKILL || signum == SIGSTOP) { + return EINVAL; + } + } + break; + + case SIGEV_NONE: + break; + + case SIGEV_THREAD: + /* Unsupported [RTS] */ + + default: + return EINVAL; + } + + return 0; +} + + /* * aio_enqueue_work * @@ -1517,6 +1555,10 @@ lio_listio(proc_t p, struct lio_listio_args *uap, int *retval ) if (call_result) { goto ExitRoutine; } + call_result = aio_sigev_validate(&aiosigev); + if (call_result) { + goto ExitRoutine; + } } /* process list of aio requests */ @@ -1603,9 +1645,9 @@ lio_listio(proc_t p, struct lio_listio_args *uap, int *retval ) 0 ); } + aio_proc_lock_spin(p); switch (uap->mode) { case LIO_WAIT: - aio_proc_lock_spin(p); while (lio_context->io_completed < lio_context->io_issued) { result = msleep(lio_context, aio_proc_mutex(p), PCATCH | PRIBIO | PSPIN, "lio_listio", 0); @@ -1622,12 +1664,16 @@ lio_listio(proc_t p, struct lio_listio_args *uap, int *retval ) free_context = TRUE; } - aio_proc_unlock(p); break; case LIO_NOWAIT: + /* If no IOs were issued must free it (rdar://problem/45717887) */ + if (lio_context->io_issued == 0) { + free_context = TRUE; + } break; } + aio_proc_unlock(p); /* call_result == -1 means we had no trouble queueing up requests */ if (call_result == -1) { @@ -2128,35 +2174,9 @@ aio_validate( aio_workq_entry *entryp ) } } - /* - * validate aiocb.aio_sigevent. at this point we only support - * sigev_notify equal to SIGEV_SIGNAL or SIGEV_NONE. this means - * sigev_value, sigev_notify_function, and sigev_notify_attributes - * are ignored, since SIGEV_THREAD is unsupported. This is consistent - * with no [RTS] (RalTime Signal) option group support. - */ - switch (entryp->aiocb.aio_sigevent.sigev_notify) { - case SIGEV_SIGNAL: - { - int signum; - - /* make sure we have a valid signal number */ - signum = entryp->aiocb.aio_sigevent.sigev_signo; - if (signum <= 0 || signum >= NSIG || - signum == SIGKILL || signum == SIGSTOP) { - return EINVAL; - } - } - break; - - case SIGEV_NONE: - break; - - case SIGEV_THREAD: - /* Unsupported [RTS] */ - - default: - return EINVAL; + result = aio_sigev_validate(&entryp->aiocb.aio_sigevent); + if (result) { + return result; } /* validate the file descriptor and that the file was opened diff --git a/bsd/kern/kern_asl.c b/bsd/kern/kern_asl.c index 09e1cd059..a005c1055 100644 --- a/bsd/kern/kern_asl.c +++ b/bsd/kern/kern_asl.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2013 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2019 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -62,35 +62,29 @@ /* Function to print input values as key-value pairs in format * identifiable by Apple system log (ASL) facility. All key-value pairs - * are assumed to be pointer to strings and are provided using two ways - - * (a) va_list argument which is a list of varying number of arguments - * created by the caller of this function. - * (b) variable number of arguments passed to this function. + * are assumed to be pointer to strings and are provided using va_list + * argument which is a list of varying number of arguments created by the + * caller of this function. * * Parameters - * level - Priority level for this ASL message * facility - Facility for this ASL message. * num_pairs - Number of key-value pairs provided by vargs argument. * vargs - List of key-value pairs. - * ... - Additional key-value pairs (apart from vargs) as variable - * argument list. A NULL value indicates the end of the - * variable argument list. * * Returns - * zero - On success, when it prints all key-values pairs provided. * E2BIG - When it cannot print all key-value pairs provided and had * to truncate the output. */ -int -kern_asl_msg_va(int level, const char *facility, int num_pairs, va_list vargs, ...) +static int +kern_asl_msg_va(int level, const char *facility, int num_pairs, va_list vargs) { int err = 0; char fmt[MAX_FMT_LEN]; /* Format string to use with vaddlog */ int calc_pairs = 0; size_t len; int i; - va_list ap; - char *ptr; /* Mask extra bits, if any, from priority level */ level = LOG_PRI(level); @@ -130,60 +124,6 @@ kern_asl_msg_va(int level, const char *facility, int num_pairs, va_list vargs, . (void) strlcat(fmt, KASL_KEYVAL_FMT, len); } - /* Count number of variable arguments provided to this function - * and determine total number of key-value pairs. - */ - calc_pairs = 0; - va_start(ap, vargs); - ptr = va_arg(ap, char *); - while (ptr) { - calc_pairs++; - ptr = va_arg(ap, char *); - } - calc_pairs /= 2; - va_end(ap); - - /* If user provided variable number of arguments, append them as - * as real key-value "[k v]" into the format string. If the format - * string is too small, ignore the key-value pair completely. - */ - if (calc_pairs) { - char *key, *val; - size_t pairlen; - int offset; - - /* Calculate bytes available for key-value pairs after reserving - * bytes for newline character and NULL terminator - */ - len = MAX_FMT_LEN - strlen(fmt) - KASL_NEWLINE_CHAR_LEN - 1; - offset = strlen(fmt); - - va_start(ap, vargs); - for (i = 0; i < calc_pairs; i++) { - key = va_arg(ap, char *); - val = va_arg(ap, char *); - - /* Calculate bytes required to store next key-value pair - * as "[key val] " including space for '[', ']', and - * two spaces. - */ - pairlen = strlen(key) + strlen(val) + 4; - if (pairlen > len) { - err = E2BIG; - break; - } - - /* len + 1 because one byte has been set aside for NULL - * terminator in calculation of 'len' above - */ - snprintf((fmt + offset), len + 1, KASL_KEYVAL_FMT, - key, val); - offset += pairlen; - len -= pairlen; - } - va_end(ap); - } - /* Append newline */ (void) strlcat(fmt, KASL_NEWLINE_CHAR, MAX_FMT_LEN); @@ -208,7 +148,7 @@ kern_asl_msg(int level, const char *facility, int num_pairs, ...) va_start(ap, num_pairs); err = kern_asl_msg_va(level, facility, - num_pairs, ap, NULL); + num_pairs, ap); va_end(ap); return err; diff --git a/bsd/kern/kern_backtrace.c b/bsd/kern/kern_backtrace.c index 29329bf77..d5b5ca727 100644 --- a/bsd/kern/kern_backtrace.c +++ b/bsd/kern/kern_backtrace.c @@ -57,7 +57,6 @@ backtrace_sysctl SYSCTL_HANDLER_ARGS uint32_t bt_len = 0, bt_filled = 0; size_t bt_size = 0; int error = 0; - bool user_64 = false; if (type != BACKTRACE_USER) { return EINVAL; @@ -74,7 +73,7 @@ backtrace_sysctl SYSCTL_HANDLER_ARGS return ENOBUFS; } memset(bt, 0, bt_size); - error = backtrace_user(bt, bt_len, &bt_filled, &user_64); + error = backtrace_user(bt, bt_len, &bt_filled, NULL, NULL); if (error) { goto out; } diff --git a/bsd/kern/kern_clock.c b/bsd/kern/kern_clock.c index fd776ee2f..a9c778a64 100644 --- a/bsd/kern/kern_clock.c +++ b/bsd/kern/kern_clock.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2007 Apple Inc. All rights reserved. + * Copyright (c) 2000-2018 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -79,10 +79,6 @@ #include #include -#ifdef GPROF -#include -#endif - #include #include #include @@ -332,61 +328,6 @@ tvtohz(struct timeval *tv) return (int)ticks; } - -/* - * Start profiling on a process. - * - * Kernel profiling passes kernel_proc which never exits and hence - * keeps the profile clock running constantly. - */ -void -startprofclock(struct proc *p) -{ - if ((p->p_flag & P_PROFIL) == 0) { - OSBitOrAtomic(P_PROFIL, &p->p_flag); - } -} - -/* - * Stop profiling on a process. - */ -void -stopprofclock(struct proc *p) -{ - if (p->p_flag & P_PROFIL) { - OSBitAndAtomic(~((uint32_t)P_PROFIL), &p->p_flag); - } -} - -/* TBD locking user profiling is not resolved yet */ -void -bsd_uprofil(struct time_value *syst, user_addr_t pc) -{ - struct proc *p = current_proc(); - int ticks; - struct timeval *tv; - struct timeval st; - - if (p == NULL) { - return; - } - if (!(p->p_flag & P_PROFIL)) { - return; - } - - st.tv_sec = syst->seconds; - st.tv_usec = syst->microseconds; - - tv = &(p->p_stats->p_ru.ru_stime); - - ticks = ((tv->tv_sec - st.tv_sec) * 1000 + - (tv->tv_usec - st.tv_usec) / 1000) / - (tick / 1000); - if (ticks) { - addupc_task(p, pc, ticks); - } -} - /* TBD locking user profiling is not resolved yet */ void get_procrustime(time_value_t *tv) diff --git a/bsd/kern/kern_credential.c b/bsd/kern/kern_credential.c index 34b65a24f..643b1cebb 100644 --- a/bsd/kern/kern_credential.c +++ b/bsd/kern/kern_credential.c @@ -95,7 +95,7 @@ void mach_kauth_cred_uthread_update( void ); #endif # define K_UUID_FMT "%08x:%08x:%08x:%08x" -# define K_UUID_ARG(_u) *(int *)&_u.g_guid[0],*(int *)&_u.g_guid[4],*(int *)&_u.g_guid[8],*(int *)&_u.g_guid[12] +# define K_UUID_ARG(_u) &_u.g_guid_asint[0],&_u.g_guid_asint[1],&_u.g_guid_asint[2],&_u.g_guid_asint[3] # define KAUTH_DEBUG(fmt, args...) do { printf("%s:%d: " fmt "\n", __PRETTY_FUNCTION__, __LINE__ , ##args); } while (0) #endif @@ -1089,7 +1089,7 @@ kauth_resolver_complete(user_addr_t message) } else if (extl.el_flags & (KAUTH_EXTLOOKUP_VALID_PWNAM | KAUTH_EXTLOOKUP_VALID_GRNAM)) { error = EFAULT; KAUTH_DEBUG("RESOLVER - resolver returned mismatching extension flags (%d), request contained (%d)", - extl.el_flags, request_flags); + extl.el_flags, want_extend_data); } /* @@ -4744,7 +4744,7 @@ kauth_proc_setlabel(__unused struct proc *p, __unused void *label) #define KAUTH_CRED_REF_MAX 0x0ffffffful -__attribute__((noinline, cold, not_tail_called, noreturn)) +__attribute__((noinline, cold, noreturn)) static void kauth_cred_panic_resurrection(kauth_cred_t cred) { @@ -4752,7 +4752,7 @@ kauth_cred_panic_resurrection(kauth_cred_t cred) __builtin_unreachable(); } -__attribute__((noinline, cold, not_tail_called, noreturn)) +__attribute__((noinline, cold, noreturn)) static void kauth_cred_panic_over_released(kauth_cred_t cred) { @@ -4760,7 +4760,7 @@ kauth_cred_panic_over_released(kauth_cred_t cred) __builtin_unreachable(); } -__attribute__((noinline, cold, not_tail_called, noreturn)) +__attribute__((noinline, cold, noreturn)) static void kauth_cred_panic_over_retain(kauth_cred_t cred) { diff --git a/bsd/kern/kern_cs.c b/bsd/kern/kern_cs.c index c6ab1e5bf..4a9fbc3ff 100644 --- a/bsd/kern/kern_cs.c +++ b/bsd/kern/kern_cs.c @@ -224,6 +224,9 @@ cs_allow_invalid(struct proc *p) } proc_unlock(p); + /* allow a debugged process to hide some (debug-only!) memory */ + task_set_memory_ownership_transfer(p->task, TRUE); + vm_map_switch_protect(get_task_map(p->task), FALSE); #endif return (p->p_csflags & (CS_KILL | CS_HARD)) == 0; @@ -1137,6 +1140,39 @@ cs_entitlements_blob_get(proc_t p, void **out_start, size_t *out_length) return csblob_get_entitlements(csblob, out_start, out_length); } + +/* Retrieve the cached entitlements for a process + * Returns: + * EINVAL no text vnode associated with the process + * EBADEXEC invalid code signing data + * 0 no error occurred + * + * Note: the entitlements may be NULL if there is nothing cached. + */ + +int +cs_entitlements_dictionary_copy(proc_t p, void **entitlements) +{ + struct cs_blob *csblob; + + *entitlements = NULL; + + if ((p->p_csflags & CS_SIGNED) == 0) { + return 0; + } + + if (NULL == p->p_textvp) { + return EINVAL; + } + + if ((csblob = ubc_cs_blob_get(p->p_textvp, -1, p->p_textoff)) == NULL) { + return 0; + } + + *entitlements = csblob_entitlements_dictionary_copy(csblob); + return 0; +} + /* Retrieve the codesign identity for a process. * Returns: * NULL an error occured diff --git a/bsd/kern/kern_descrip.c b/bsd/kern/kern_descrip.c index 9d68de20e..320c27b2c 100644 --- a/bsd/kern/kern_descrip.c +++ b/bsd/kern/kern_descrip.c @@ -125,8 +125,9 @@ #include #endif +#define IPC_KMSG_FLAGS_ALLOW_IMMOVABLE_SEND 0x1 kern_return_t ipc_object_copyin(ipc_space_t, mach_port_name_t, - mach_msg_type_name_t, ipc_port_t *); + mach_msg_type_name_t, ipc_port_t *, mach_port_context_t, mach_msg_guard_flags_t *, uint32_t); void ipc_port_release_send(ipc_port_t); struct psemnode; @@ -908,7 +909,7 @@ fcntl_nocancel(proc_t p, struct fcntl_nocancel_args *uap, int32_t *retval) error = 0; goto out; } - error = fo_ioctl(fp, (int)TIOCGPGRP, (caddr_t)retval, &context); + error = fo_ioctl(fp, TIOCGPGRP, (caddr_t)retval, &context); *retval = -*retval; goto out; @@ -936,7 +937,7 @@ fcntl_nocancel(proc_t p, struct fcntl_nocancel_args *uap, int32_t *retval) tmp = (int)p1->p_pgrpid; proc_rele(p1); } - error = fo_ioctl(fp, (int)TIOCSPGRP, (caddr_t)&tmp, &context); + error = fo_ioctl(fp, TIOCSPGRP, (caddr_t)&tmp, &context); goto out; case F_SETNOSIGPIPE: @@ -1398,6 +1399,50 @@ fcntl_nocancel(proc_t p, struct fcntl_nocancel_args *uap, int32_t *retval) goto outdrop; } + case F_SPECULATIVE_READ: { + fspecread_t args; + + if (fp->f_type != DTYPE_VNODE) { + error = EBADF; + goto out; + } + + vp = (struct vnode *)fp->f_data; + proc_fdunlock(p); + + if ((error = copyin(argp, (caddr_t)&args, sizeof(args)))) { + goto outdrop; + } + + /* Discard invalid offsets or lengths */ + if ((args.fsr_offset < 0) || (args.fsr_length < 0)) { + error = EINVAL; + goto outdrop; + } + + /* + * Round the file offset down to a page-size boundary (or to 0). + * The filesystem will need to round the length up to the end of the page boundary + * or to the EOF of the file. + */ + uint64_t foff = (((uint64_t)args.fsr_offset) & ~((uint64_t)PAGE_MASK)); + uint64_t foff_delta = args.fsr_offset - foff; + args.fsr_offset = (off_t) foff; + + /* + * Now add in the delta to the supplied length. Since we may have adjusted the + * offset, increase it by the amount that we adjusted. + */ + args.fsr_length += foff_delta; + + if ((error = vnode_getwithref(vp))) { + goto outdrop; + } + error = VNOP_IOCTL(vp, F_SPECULATIVE_READ, (caddr_t)&args, 0, &context); + (void)vnode_put(vp); + + goto outdrop; + } case F_SETSIZE: if (fp->f_type != DTYPE_VNODE) { error = EBADF; @@ -1657,7 +1702,8 @@ fcntl_nocancel(proc_t p, struct fcntl_nocancel_args *uap, int32_t *retval) } goto outdrop; } - case F_GETPATH: { + case F_GETPATH: + case F_GETPATH_NOFIRMLINK: { char *pathbufp; int pathlen; @@ -1675,7 +1721,11 @@ fcntl_nocancel(proc_t p, struct fcntl_nocancel_args *uap, int32_t *retval) goto outdrop; } if ((error = vnode_getwithref(vp)) == 0) { - error = vn_getpath(vp, pathbufp, &pathlen); + if (uap->cmd == F_GETPATH_NOFIRMLINK) { + error = vn_getpath_ext(vp, NULL, pathbufp, &pathlen, VN_GETPATH_NO_FIRMLINK); + } else { + error = vn_getpath(vp, pathbufp, &pathlen); + } (void)vnode_put(vp); if (error == 0) { @@ -2202,9 +2252,12 @@ fcntl_nocancel(proc_t p, struct fcntl_nocancel_args *uap, int32_t *retval) goto out; } - /* For now, special case HFS+ only, since this is SPI. */ + /* + * For now, special case HFS+ and APFS only, since this + * is SPI. + */ src_vp = (struct vnode *)fp->f_data; - if (src_vp->v_tag != VT_HFS) { + if (src_vp->v_tag != VT_HFS && src_vp->v_tag != VT_APFS) { error = ENOTSUP; goto out; } @@ -2223,7 +2276,7 @@ fcntl_nocancel(proc_t p, struct fcntl_nocancel_args *uap, int32_t *retval) goto out; } dst_vp = (struct vnode *)fp2->f_data; - if (dst_vp->v_tag != VT_HFS) { + if (dst_vp->v_tag != VT_HFS && dst_vp->v_tag != VT_APFS) { fp_drop(p, fd2, fp2, 1); error = ENOTSUP; goto out; @@ -2592,10 +2645,12 @@ fcntl_nocancel(proc_t p, struct fcntl_nocancel_args *uap, int32_t *retval) case (int)APFSIOC_REVERT_TO_SNAPSHOT: case (int)FSIOC_FIOSEEKHOLE: case (int)FSIOC_FIOSEEKDATA: + case (int)FSIOC_CAS_BSDFLAGS: case HFS_GET_BOOT_INFO: case HFS_SET_BOOT_INFO: case FIOPINSWAP: case F_MARKDEPENDENCY: + case TIOCREVOKE: error = EINVAL; goto out; default: @@ -2933,6 +2988,8 @@ close_internal_locked(proc_t p, int fd, struct fileproc *fp, int flags) knote_fdclose(p, fd); } + /* release the ref returned from fp_lookup before calling drain */ + (void) os_ref_release_locked(&fp->f_iocount); fileproc_drain(p, fp); if (fp->f_flags & FP_WAITEVENT) { @@ -3051,10 +3108,10 @@ fstat1(proc_t p, int fd, user_addr_t ub, user_addr_t xsecurity, user_addr_t xsec * going to let them get the basic stat information. */ if (xsecurity == USER_ADDR_NULL) { - error = vn_stat_noauth((vnode_t)data, sbptr, NULL, isstat64, ctx, + error = vn_stat_noauth((vnode_t)data, sbptr, NULL, isstat64, 0, ctx, fp->f_fglob->fg_cred); } else { - error = vn_stat((vnode_t)data, sbptr, &fsec, isstat64, ctx); + error = vn_stat((vnode_t)data, sbptr, &fsec, isstat64, 0, ctx); } AUDIT_ARG(vnpath, (struct vnode *)data, ARG_VNODE1); @@ -3573,7 +3630,7 @@ fp_getfvp(proc_t p, int fd, struct fileproc **resultfp, struct vnode **resultvp) proc_fdunlock(p); return ENOTSUP; } - fp->f_iocount++; + os_ref_retain_locked(&fp->f_iocount); if (resultfp) { *resultfp = fp; @@ -3634,7 +3691,7 @@ fp_getfvpandvid(proc_t p, int fd, struct fileproc **resultfp, proc_fdunlock(p); return ENOTSUP; } - fp->f_iocount++; + os_ref_retain_locked(&fp->f_iocount); if (resultfp) { *resultfp = fp; @@ -3694,7 +3751,7 @@ fp_getfsock(proc_t p, int fd, struct fileproc **resultfp, proc_fdunlock(p); return EOPNOTSUPP; } - fp->f_iocount++; + os_ref_retain_locked(&fp->f_iocount); if (resultfp) { *resultfp = fp; @@ -3751,7 +3808,7 @@ fp_getfkq(proc_t p, int fd, struct fileproc **resultfp, proc_fdunlock(p); return EBADF; } - fp->f_iocount++; + os_ref_retain_locked(&fp->f_iocount); if (resultfp) { *resultfp = fp; @@ -3810,7 +3867,7 @@ fp_getfpshm(proc_t p, int fd, struct fileproc **resultfp, proc_fdunlock(p); return EBADF; } - fp->f_iocount++; + os_ref_retain_locked(&fp->f_iocount); if (resultfp) { *resultfp = fp; @@ -3878,7 +3935,7 @@ fp_getfpsem(proc_t p, int fd, struct fileproc **resultfp, proc_fdunlock(p); return EBADF; } - fp->f_iocount++; + os_ref_retain_locked(&fp->f_iocount); if (resultfp) { *resultfp = fp; @@ -3935,7 +3992,7 @@ fp_getfpipe(proc_t p, int fd, struct fileproc **resultfp, proc_fdunlock(p); return EBADF; } - fp->f_iocount++; + os_ref_retain_locked(&fp->f_iocount); if (resultfp) { *resultfp = fp; @@ -3990,7 +4047,7 @@ fp_lookup(proc_t p, int fd, struct fileproc **resultfp, int locked) } return EBADF; } - fp->f_iocount++; + os_ref_retain_locked(&fp->f_iocount); if (resultfp) { *resultfp = fp; @@ -4009,6 +4066,7 @@ fp_lookup(proc_t p, int fd, struct fileproc **resultfp, int locked) * Description: Swap the fileproc pointer for a given fd with a new * fileproc pointer in the per-process open file table of * the specified process. The fdlock must be held at entry. + * Iff the swap is successful, the old fileproc pointer is freed. * * Parameters: p Process containing the fd * fd The fd of interest @@ -4017,7 +4075,7 @@ fp_lookup(proc_t p, int fd, struct fileproc **resultfp, int locked) * Returns: 0 Success * EBADF Bad file descriptor * EINTR Interrupted - * EKEEPLOOKING f_iocount changed while lock was dropped. + * EKEEPLOOKING Other references were active, try again. */ int fp_tryswap(proc_t p, int fd, struct fileproc *nfp) @@ -4034,20 +4092,28 @@ fp_tryswap(proc_t p, int fd, struct fileproc *nfp) * At this point, our caller (change_guardedfd_np) has * one f_iocount reference, and we just took another * one to begin the replacement. + * fp and nfp have a +1 reference from allocation. + * Thus if no-one else is looking, f_iocount should be 3. */ - if (fp->f_iocount < 2) { - panic("f_iocount too small %d", fp->f_iocount); - } else if (2 == fp->f_iocount) { + if (os_ref_get_count(&fp->f_iocount) < 3 || + 1 != os_ref_get_count(&nfp->f_iocount)) { + panic("%s: f_iocount", __func__); + } else if (3 == os_ref_get_count(&fp->f_iocount)) { /* Copy the contents of *fp, preserving the "type" of *nfp */ nfp->f_flags = (nfp->f_flags & FP_TYPEMASK) | (fp->f_flags & ~FP_TYPEMASK); - nfp->f_iocount = fp->f_iocount; + os_ref_retain_locked(&nfp->f_iocount); + os_ref_retain_locked(&nfp->f_iocount); nfp->f_fglob = fp->f_fglob; nfp->f_wset = fp->f_wset; p->p_fd->fd_ofiles[fd] = nfp; - (void) fp_drop(p, fd, nfp, 1); + fp_drop(p, fd, nfp, 1); + + os_ref_release_live(&fp->f_iocount); + os_ref_release_live(&fp->f_iocount); + fileproc_free(fp); } else { /* * Wait for all other active references to evaporate. @@ -4061,7 +4127,6 @@ fp_tryswap(proc_t p, int fd, struct fileproc *nfp) * reevaluation of the change-guard attempt. */ error = EKEEPLOOKING; - printf("%s: lookup collision fd %d\n", __func__, fd); } (void) fp_drop(p, fd, fp, 1); } @@ -4182,9 +4247,8 @@ fp_drop(proc_t p, int fd, struct fileproc *fp, int locked) } return EBADF; } - fp->f_iocount--; - if (fp->f_iocount == 0) { + if (1 == os_ref_release_locked(&fp->f_iocount)) { if (fp->f_flags & FP_SELCONFLICT) { fp->f_flags &= ~FP_SELCONFLICT; } @@ -4487,9 +4551,8 @@ file_drop(int fd) proc_fdunlock(p); return EBADF; } - fp->f_iocount--; - if (fp->f_iocount == 0) { + if (1 == os_ref_release_locked(&fp->f_iocount)) { if (fp->f_flags & FP_SELCONFLICT) { fp->f_flags &= ~FP_SELCONFLICT; } @@ -4630,22 +4693,22 @@ falloc_withalloc_locked(proc_t p, struct fileproc **resultfp, int *resultfd, struct fileglob *fg; int error, nfd; + if (nfiles >= maxfiles) { + tablefull("file"); + return ENFILE; + } + if (!locked) { proc_fdlock(p); } + if ((error = fdalloc(p, 0, &nfd))) { if (!locked) { proc_fdunlock(p); } return error; } - if (nfiles >= maxfiles) { - if (!locked) { - proc_fdunlock(p); - } - tablefull("file"); - return ENFILE; - } + #if CONFIG_MACF error = mac_file_check_create(proc_ucred(p)); if (error) { @@ -4682,7 +4745,7 @@ falloc_withalloc_locked(proc_t p, struct fileproc **resultfp, int *resultfd, bzero(fg, sizeof(struct fileglob)); lck_mtx_init(&fg->fg_lock, file_lck_grp, file_lck_attr); - fp->f_iocount = 1; + os_ref_retain_locked(&fp->f_iocount); fg->fg_count = 1; fg->fg_ops = &uninitops; fp->f_fglob = fg; @@ -4753,6 +4816,27 @@ fg_free(struct fileglob *fg) } +/* + * fg_get_vnode + * + * Description: Return vnode associated with the file structure, if + * any. The lifetime of the returned vnode is bound to + * the lifetime of the file structure. + * + * Parameters: fg Pointer to fileglob to + * inspect + * + * Returns: vnode_t + */ +vnode_t +fg_get_vnode(struct fileglob *fg) +{ + if (FILEGLOB_DTYPE(fg) == DTYPE_VNODE) { + return (vnode_t)fg->fg_data; + } else { + return NULL; + } +} /* * fdexec @@ -4782,7 +4866,7 @@ fdexec(proc_t p, short flags, int self_exec) boolean_t cloexec_default = (flags & POSIX_SPAWN_CLOEXEC_DEFAULT) != 0; thread_t self = current_thread(); struct uthread *ut = get_bsdthread_info(self); - struct kqueue *dealloc_kq = NULL; + struct kqworkq *dealloc_kqwq = NULL; /* * If the current thread is bound as a workq/workloop @@ -4800,7 +4884,7 @@ fdexec(proc_t p, short flags, int self_exec) * subsequent kqueue closes go faster. */ knotes_dealloc(p); - assert(fdp->fd_knlistsize == -1); + assert(fdp->fd_knlistsize == 0); assert(fdp->fd_knhashmask == 0); for (i = fdp->fd_lastfile; i >= 0; i--) { @@ -4838,7 +4922,7 @@ fdexec(proc_t p, short flags, int self_exec) * Wait for any third party viewers (e.g., lsof) * to release their references to this fileproc. */ - while (fp->f_iocount > 0) { + while (os_ref_get_count(&fp->f_iocount) > 1) { p->p_fpdrainwait = 1; msleep(&p->p_fpdrainwait, &p->p_fdmlock, PRIBIO, "fpdrain", NULL); @@ -4854,15 +4938,15 @@ fdexec(proc_t p, short flags, int self_exec) /* release the per-process workq kq */ if (fdp->fd_wqkqueue) { - dealloc_kq = fdp->fd_wqkqueue; + dealloc_kqwq = fdp->fd_wqkqueue; fdp->fd_wqkqueue = NULL; } proc_fdunlock(p); /* Anything to free? */ - if (dealloc_kq) { - kqueue_dealloc(dealloc_kq); + if (dealloc_kqwq) { + kqworkq_dealloc(dealloc_kqwq); } } @@ -5087,7 +5171,7 @@ fdcopy(proc_t p, vnode_t uth_cdir) * Initialize knote and kqueue tracking structs */ newfdp->fd_knlist = NULL; - newfdp->fd_knlistsize = -1; + newfdp->fd_knlistsize = 0; newfdp->fd_knhash = NULL; newfdp->fd_knhashmask = 0; newfdp->fd_kqhash = NULL; @@ -5119,7 +5203,7 @@ fdfree(proc_t p) { struct filedesc *fdp; struct fileproc *fp; - struct kqueue *dealloc_kq = NULL; + struct kqworkq *dealloc_kqwq = NULL; int i; proc_fdlock(p); @@ -5140,7 +5224,7 @@ fdfree(proc_t p) * tables to make any subsequent kqueue closes faster. */ knotes_dealloc(p); - assert(fdp->fd_knlistsize == -1); + assert(fdp->fd_knlistsize == 0); assert(fdp->fd_knhashmask == 0); /* @@ -5157,6 +5241,7 @@ fdfree(proc_t p) panic("fdfree: found fp with UF_RESERVED"); } + fileproc_drain(p, fp); procfdtbl_reservefd(p, i); if (fp->f_flags & FP_WAITEVENT) { @@ -5172,16 +5257,15 @@ fdfree(proc_t p) } if (fdp->fd_wqkqueue) { - dealloc_kq = fdp->fd_wqkqueue; + dealloc_kqwq = fdp->fd_wqkqueue; fdp->fd_wqkqueue = NULL; } proc_fdunlock(p); - if (dealloc_kq) { - kqueue_dealloc(dealloc_kq); + if (dealloc_kqwq) { + kqworkq_dealloc(dealloc_kqwq); } - if (fdp->fd_cdir) { vnode_rele(fdp->fd_cdir); } @@ -5195,7 +5279,7 @@ fdfree(proc_t p) if (fdp->fd_kqhash) { for (uint32_t j = 0; j <= fdp->fd_kqhashmask; j++) { - assert(SLIST_EMPTY(&fdp->fd_kqhash[j])); + assert(LIST_EMPTY(&fdp->fd_kqhash[j])); } FREE(fdp->fd_kqhash, M_KQUEUE); } @@ -5337,14 +5421,13 @@ fileproc_drain(proc_t p, struct fileproc * fp) context.vc_thread = proc_thread(p); /* XXX */ context.vc_ucred = fp->f_fglob->fg_cred; - fp->f_iocount--; /* (the one the close holds) */ + /* Set the vflag for drain */ + fileproc_modify_vflags(fp, FPV_DRAIN, FALSE); - while (fp->f_iocount) { + while (os_ref_get_count(&fp->f_iocount) > 1) { lck_mtx_convert_spin(&p->p_fdmlock); - if (fp->f_fglob->fg_ops->fo_drain) { - (*fp->f_fglob->fg_ops->fo_drain)(fp, &context); - } + fo_drain(fp, &context); if ((fp->f_flags & FP_INSELECT) == FP_INSELECT) { if (waitq_wakeup64_all((struct waitq *)fp->f_wset, NO_EVENT64, THREAD_INTERRUPTED, WAITQ_ALL_PRIORITIES) == KERN_INVALID_ARGUMENT) { @@ -5382,13 +5465,8 @@ fileproc_drain(proc_t p, struct fileproc * fp) * Parameters: p Process containing fd * fd fd to be released * fp fileproc to be freed - * - * Returns: 0 Success - * - * Notes: XXX function should be void - no one interprets the returns - * XXX code */ -int +void fp_free(proc_t p, int fd, struct fileproc * fp) { proc_fdlock_spin(p); @@ -5396,8 +5474,8 @@ fp_free(proc_t p, int fd, struct fileproc * fp) proc_fdunlock(p); fg_free(fp->f_fglob); + os_ref_release_live(&fp->f_iocount); fileproc_free(fp); - return 0; } @@ -5584,15 +5662,11 @@ fileport_releasefg(struct fileglob *fg) return; } - /* - * fileport_makefd + * fileport_makefd_internal * * Description: Obtain the file descriptor for a given Mach send right. * - * Parameters: p Process calling fileport - * uap->port Name of send right to file port. - * * Returns: 0 Success * EINVAL Invalid Mach port name, or port is not for a file. * fdalloc:EMFILE @@ -5602,24 +5676,13 @@ fileport_releasefg(struct fileglob *fg) * *retval (modified) The new descriptor */ int -fileport_makefd(proc_t p, struct fileport_makefd_args *uap, int32_t *retval) +fileport_makefd_internal(proc_t p, ipc_port_t port, int uf_flags, int *retval) { struct fileglob *fg; struct fileproc *fp = FILEPROC_NULL; - ipc_port_t port = IPC_PORT_NULL; - mach_port_name_t send = uap->port; - kern_return_t res; int fd; int err; - res = ipc_object_copyin(get_task_ipcspace(p->task), - send, MACH_MSG_TYPE_COPY_SEND, &port); - - if (res != KERN_SUCCESS) { - err = EINVAL; - goto out; - } - fg = fileport_port_to_fileglob(port); if (fg == NULL) { err = EINVAL; @@ -5642,7 +5705,9 @@ fileport_makefd(proc_t p, struct fileport_makefd_args *uap, int32_t *retval) fg_drop(fp); goto out; } - *fdflags(p, fd) |= UF_EXCLOSE; + if (uf_flags) { + *fdflags(p, fd) |= uf_flags; + } procfdtbl_releasefd(p, fd, fp); proc_fdunlock(p); @@ -5654,6 +5719,42 @@ out: fileproc_free(fp); } + return err; +} + +/* + * fileport_makefd + * + * Description: Obtain the file descriptor for a given Mach send right. + * + * Parameters: p Process calling fileport + * uap->port Name of send right to file port. + * + * Returns: 0 Success + * EINVAL Invalid Mach port name, or port is not for a file. + * fdalloc:EMFILE + * fdalloc:ENOMEM Unable to allocate fileproc or extend file table. + * + * Implicit returns: + * *retval (modified) The new descriptor + */ +int +fileport_makefd(proc_t p, struct fileport_makefd_args *uap, int32_t *retval) +{ + ipc_port_t port = IPC_PORT_NULL; + mach_port_name_t send = uap->port; + kern_return_t res; + int err; + + res = ipc_object_copyin(get_task_ipcspace(p->task), + send, MACH_MSG_TYPE_COPY_SEND, &port, 0, NULL, IPC_KMSG_FLAGS_ALLOW_IMMOVABLE_SEND); + + if (res == KERN_SUCCESS) { + err = fileport_makefd_internal(p, port, UF_EXCLOSE, retval); + } else { + err = EINVAL; + } + if (IPC_PORT_NULL != port) { ipc_port_release_send(port); } @@ -5979,6 +6080,13 @@ fo_read(struct fileproc *fp, struct uio *uio, int flags, vfs_context_t ctx) return (*fp->f_ops->fo_read)(fp, uio, flags, ctx); } +int +fo_no_read(struct fileproc *fp, struct uio *uio, int flags, vfs_context_t ctx) +{ +#pragma unused(fp, uio, flags, ctx) + return ENXIO; +} + /* * fo_write @@ -6000,6 +6108,13 @@ fo_write(struct fileproc *fp, struct uio *uio, int flags, vfs_context_t ctx) return (*fp->f_ops->fo_write)(fp, uio, flags, ctx); } +int +fo_no_write(struct fileproc *fp, struct uio *uio, int flags, vfs_context_t ctx) +{ +#pragma unused(fp, uio, flags, ctx) + return ENXIO; +} + /* * fo_ioctl @@ -6034,6 +6149,13 @@ fo_ioctl(struct fileproc *fp, u_long com, caddr_t data, vfs_context_t ctx) return error; } +int +fo_no_ioctl(struct fileproc *fp, u_long com, caddr_t data, vfs_context_t ctx) +{ +#pragma unused(fp, com, data, ctx) + return ENOTTY; +} + /* * fo_select @@ -6055,6 +6177,13 @@ fo_select(struct fileproc *fp, int which, void *wql, vfs_context_t ctx) return (*fp->f_ops->fo_select)(fp, which, wql, ctx); } +int +fo_no_select(struct fileproc *fp, int which, void *wql, vfs_context_t ctx) +{ +#pragma unused(fp, which, wql, ctx) + return ENOTSUP; +} + /* * fo_close @@ -6076,6 +6205,32 @@ fo_close(struct fileglob *fg, vfs_context_t ctx) } +/* + * fo_drain + * + * Description: Generic fileops kqueue filter indirected through the fileops + * pointer in the fileproc structure + * + * Parameters: fp fileproc structure pointer + * ctx VFS context for operation + * + * Returns: 0 Success + * !0 errno from drain + */ +int +fo_drain(struct fileproc *fp, vfs_context_t ctx) +{ + return (*fp->f_ops->fo_drain)(fp, ctx); +} + +int +fo_no_drain(struct fileproc *fp, vfs_context_t ctx) +{ +#pragma unused(fp, ctx) + return ENOTSUP; +} + + /* * fo_kqfilter * @@ -6084,19 +6239,26 @@ fo_close(struct fileglob *fg, vfs_context_t ctx) * * Parameters: fp fileproc structure pointer * kn pointer to knote to filter on - * ctx VFS context for operation * * Returns: (kn->kn_flags & EV_ERROR) error in kn->kn_data * 0 Filter is not active * !0 Filter is active */ int -fo_kqfilter(struct fileproc *fp, struct knote *kn, - struct kevent_internal_s *kev, vfs_context_t ctx) +fo_kqfilter(struct fileproc *fp, struct knote *kn, struct kevent_qos_s *kev) { - return (*fp->f_ops->fo_kqfilter)(fp, kn, kev, ctx); + return (*fp->f_ops->fo_kqfilter)(fp, kn, kev); } +int +fo_no_kqfilter(struct fileproc *fp, struct knote *kn, struct kevent_qos_s *kev) +{ +#pragma unused(fp, kev) + knote_set_error(kn, ENOTSUP); + return 0; +} + + /* * The ability to send a file descriptor to another * process is opt-in by file type. @@ -6119,6 +6281,7 @@ file_issendable(proc_t p, struct fileproc *fp) } } +os_refgrp_decl(, f_iocount_refgrp, "f_iocount", NULL); struct fileproc * fileproc_alloc_init(__unused void *arg) @@ -6128,14 +6291,23 @@ fileproc_alloc_init(__unused void *arg) MALLOC_ZONE(fp, struct fileproc *, sizeof(*fp), M_FILEPROC, M_WAITOK); if (fp) { bzero(fp, sizeof(*fp)); + os_ref_init(&fp->f_iocount, &f_iocount_refgrp); } return fp; } + void fileproc_free(struct fileproc *fp) { + os_ref_count_t __unused refc = os_ref_release(&fp->f_iocount); +#if DEVELOPMENT || DEBUG + if (0 != refc) { + panic("%s: pid %d refc: %u != 0", + __func__, proc_pid(current_proc()), refc); + } +#endif switch (FILEPROC_TYPE(fp)) { case FTYPE_SIMPLE: FREE_ZONE(fp, sizeof(*fp), M_FILEPROC); @@ -6147,3 +6319,19 @@ fileproc_free(struct fileproc *fp) panic("%s: corrupt fp %p flags %x", __func__, fp, fp->f_flags); } } + +void +fileproc_modify_vflags(struct fileproc *fp, fileproc_vflags_t vflags, boolean_t clearflags) +{ + if (clearflags) { + os_atomic_andnot(&fp->f_vflags, vflags, relaxed); + } else { + os_atomic_or(&fp->f_vflags, vflags, relaxed); + } +} + +fileproc_vflags_t +fileproc_get_vflags(struct fileproc *fp) +{ + return os_atomic_load(&fp->f_vflags, relaxed); +} diff --git a/bsd/kern/kern_event.c b/bsd/kern/kern_event.c index c45fbcffa..725f96d1e 100644 --- a/bsd/kern/kern_event.c +++ b/bsd/kern/kern_event.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2017 Apple Inc. All rights reserved. + * Copyright (c) 2000-2019 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -86,8 +86,7 @@ #include #include #include -#include -#include +#include #include #include @@ -108,7 +107,6 @@ #include #include #include -#include #include "net/net_str_id.h" @@ -119,7 +117,6 @@ #include #endif -extern thread_t port_name_to_thread(mach_port_name_t port_name); /* osfmk/kern/ipc_tt.h */ extern mach_port_name_t ipc_entry_name_mask(mach_port_name_t name); /* osfmk/ipc/ipc_entry.h */ #define KEV_EVTID(code) BSDDBG_CODE(DBG_BSD_KEVENT, (code)) @@ -128,73 +125,49 @@ MALLOC_DEFINE(M_KQUEUE, "kqueue", "memory for kqueue system"); #define KQ_EVENT NO_EVENT64 -static int kqueue_read(struct fileproc *fp, struct uio *uio, - int flags, vfs_context_t ctx); -static int kqueue_write(struct fileproc *fp, struct uio *uio, - int flags, vfs_context_t ctx); -static int kqueue_ioctl(struct fileproc *fp, u_long com, caddr_t data, - vfs_context_t ctx); static int kqueue_select(struct fileproc *fp, int which, void *wq_link_id, vfs_context_t ctx); static int kqueue_close(struct fileglob *fg, vfs_context_t ctx); static int kqueue_kqfilter(struct fileproc *fp, struct knote *kn, - struct kevent_internal_s *kev, vfs_context_t ctx); + struct kevent_qos_s *kev); static int kqueue_drain(struct fileproc *fp, vfs_context_t ctx); static const struct fileops kqueueops = { - .fo_type = DTYPE_KQUEUE, - .fo_read = kqueue_read, - .fo_write = kqueue_write, - .fo_ioctl = kqueue_ioctl, - .fo_select = kqueue_select, - .fo_close = kqueue_close, + .fo_type = DTYPE_KQUEUE, + .fo_read = fo_no_read, + .fo_write = fo_no_write, + .fo_ioctl = fo_no_ioctl, + .fo_select = kqueue_select, + .fo_close = kqueue_close, + .fo_drain = kqueue_drain, .fo_kqfilter = kqueue_kqfilter, - .fo_drain = kqueue_drain, }; -static void kevent_put_kq(struct proc *p, kqueue_id_t id, struct fileproc *fp, struct kqueue *kq); -static int kevent_internal(struct proc *p, - kqueue_id_t id, kqueue_id_t *id_out, - user_addr_t changelist, int nchanges, - user_addr_t eventlist, int nevents, - user_addr_t data_out, uint64_t data_available, - unsigned int flags, user_addr_t utimeout, - kqueue_continue_t continuation, - int32_t *retval); -static int kevent_copyin(user_addr_t *addrp, struct kevent_internal_s *kevp, - struct proc *p, unsigned int flags); -static int kevent_copyout(struct kevent_internal_s *kevp, user_addr_t *addrp, - struct proc *p, unsigned int flags); -char * kevent_description(struct kevent_internal_s *kevp, char *s, size_t n); - -static int kevent_register_wait_prepare(struct knote *kn, struct kevent_internal_s *kev); +static inline int kevent_modern_copyout(struct kevent_qos_s *, user_addr_t *); +static int kevent_register_wait_prepare(struct knote *kn, struct kevent_qos_s *kev, int result); static void kevent_register_wait_block(struct turnstile *ts, thread_t handoff_thread, - struct knote_lock_ctx *knlc, thread_continue_t cont, - struct _kevent_register *cont_args) __dead2; + thread_continue_t cont, struct _kevent_register *cont_args) __dead2; static void kevent_register_wait_return(struct _kevent_register *cont_args) __dead2; static void kevent_register_wait_cleanup(struct knote *kn); -static inline void kqueue_release_last(struct proc *p, kqueue_t kqu); -static void kqueue_interrupt(struct kqueue *kq); -static int kevent_callback(struct kqueue *kq, struct kevent_internal_s *kevp, - void *data); -static void kevent_continue(struct kqueue *kq, void *data, int error); -static void kqueue_scan_continue(void *contp, wait_result_t wait_result); -static int kqueue_process(struct kqueue *kq, kevent_callback_t callback, void *callback_data, - struct filt_process_s *process_data, int *countp); -static int kqueue_queue_empty(struct kqueue *kq, kq_index_t qos_index); static struct kqtailq *kqueue_get_suppressed_queue(kqueue_t kq, struct knote *kn); -static void kqueue_threadreq_initiate(struct kqueue *kq, struct kqrequest *kqr, kq_index_t qos, int flags); +static void kqueue_threadreq_initiate(struct kqueue *kq, workq_threadreq_t, kq_index_t qos, int flags); + +static void kqworkq_unbind(proc_t p, workq_threadreq_t); +static thread_qos_t kqworkq_unbind_locked(struct kqworkq *kqwq, workq_threadreq_t, thread_t thread); +static workq_threadreq_t kqworkq_get_request(struct kqworkq *kqwq, kq_index_t qos_index); -static void kqworkq_update_override(struct kqworkq *kqwq, struct knote *kn, kq_index_t qos); -static void kqworkq_unbind(proc_t p, struct kqrequest *kqr); -static thread_qos_t kqworkq_unbind_locked(struct kqworkq *kqwq, struct kqrequest *kqr, thread_t thread); -static struct kqrequest *kqworkq_get_request(struct kqworkq *kqwq, kq_index_t qos_index); +static void kqworkloop_unbind(struct kqworkloop *kwql); -static void kqworkloop_update_override(struct kqworkloop *kqwl, kq_index_t override_index); -static void kqworkloop_unbind(proc_t p, struct kqworkloop *kwql); -static thread_qos_t kqworkloop_unbind_locked(struct kqworkloop *kwql, thread_t thread); -static kq_index_t kqworkloop_owner_override(struct kqworkloop *kqwl); +enum kqwl_unbind_locked_mode { + KQWL_OVERRIDE_DROP_IMMEDIATELY, + KQWL_OVERRIDE_DROP_DELAYED, +}; +static void kqworkloop_unbind_locked(struct kqworkloop *kwql, thread_t thread, + enum kqwl_unbind_locked_mode how); +static void kqworkloop_unbind_delayed_override_drop(thread_t thread); +static kq_index_t kqworkloop_override(struct kqworkloop *kqwl); +static void kqworkloop_set_overcommit(struct kqworkloop *kqwl); enum { KQWL_UTQ_NONE, /* @@ -202,7 +175,7 @@ enum { * * This QoS is accounted for with the events override in the * kqr_override_index field. It is raised each time a new knote is queued at - * a given QoS. The kqr_wakeup_indexes field is a superset of the non empty + * a given QoS. The kqwl_wakeup_indexes field is a superset of the non empty * knote buckets and is recomputed after each event delivery. */ KQWL_UTQ_UPDATE_WAKEUP_QOS, @@ -227,40 +200,28 @@ enum { KQWL_UTQ_REDRIVE_EVENTS, }; static void kqworkloop_update_threads_qos(struct kqworkloop *kqwl, int op, kq_index_t qos); -static void kqworkloop_request_help(struct kqworkloop *kqwl, kq_index_t qos_index); static int kqworkloop_end_processing(struct kqworkloop *kqwl, int flags, int kevent_flags); -static int knote_process(struct knote *kn, kevent_callback_t callback, void *callback_data, - struct filt_process_s *process_data); - -static int kq_add_knote(struct kqueue *kq, struct knote *kn, - struct knote_lock_ctx *knlc, struct proc *p); -static struct knote *kq_find_knote_and_kq_lock(struct kqueue *kq, struct kevent_internal_s *kev, bool is_fd, struct proc *p); - -static void knote_drop(struct kqueue *kq, struct knote *kn, struct knote_lock_ctx *knlc); static struct knote *knote_alloc(void); static void knote_free(struct knote *kn); +static int kq_add_knote(struct kqueue *kq, struct knote *kn, + struct knote_lock_ctx *knlc, struct proc *p); +static struct knote *kq_find_knote_and_kq_lock(struct kqueue *kq, + struct kevent_qos_s *kev, bool is_fd, struct proc *p); -static void knote_activate(struct knote *kn); -static void knote_deactivate(struct knote *kn); - -static void knote_enable(struct knote *kn); -static void knote_disable(struct knote *kn); - -static int knote_enqueue(struct knote *kn); -static void knote_dequeue(struct knote *kn); +static void knote_activate(kqueue_t kqu, struct knote *kn, int result); +static void knote_dequeue(kqueue_t kqu, struct knote *kn); -static void knote_suppress(struct knote *kn); -static void knote_unsuppress(struct knote *kn); -static void knote_wakeup(struct knote *kn); +static void knote_apply_touch(kqueue_t kqu, struct knote *kn, + struct kevent_qos_s *kev, int result); +static void knote_suppress(kqueue_t kqu, struct knote *kn); +static void knote_unsuppress(kqueue_t kqu, struct knote *kn); +static void knote_drop(kqueue_t kqu, struct knote *kn, struct knote_lock_ctx *knlc); -static bool knote_should_apply_qos_override(struct kqueue *kq, struct knote *kn, - int result, thread_qos_t *qos_out); -static void knote_apply_qos_override(struct knote *kn, kq_index_t qos_index); +// both these functions may dequeue the knote and it is up to the caller +// to enqueue the knote back static void knote_adjust_qos(struct kqueue *kq, struct knote *kn, int result); -static void knote_reset_priority(struct knote *kn, pthread_priority_t pp); -static kq_index_t knote_get_qos_override_index(struct knote *kn); -static void knote_set_qos_overcommit(struct knote *kn); +static void knote_reset_priority(kqueue_t kqu, struct knote *kn, pthread_priority_t pp); static zone_t knote_zone; static zone_t kqfile_zone; @@ -291,11 +252,18 @@ kevent_debug_flags(void) #define KN_HASH(val, mask) (((val) ^ (val >> 8)) & (mask)) -/* placeholder for not-yet-implemented filters */ -static int filt_badattach(struct knote *kn, struct kevent_internal_s *kev); -static int filt_badevent(struct knote *kn, long hint); +static int filt_no_attach(struct knote *kn, struct kevent_qos_s *kev); +static void filt_no_detach(struct knote *kn); +static int filt_bad_event(struct knote *kn, long hint); +static int filt_bad_touch(struct knote *kn, struct kevent_qos_s *kev); +static int filt_bad_process(struct knote *kn, struct kevent_qos_s *kev); + SECURITY_READ_ONLY_EARLY(static struct filterops) bad_filtops = { - .f_attach = filt_badattach, + .f_attach = filt_no_attach, + .f_detach = filt_no_detach, + .f_event = filt_bad_event, + .f_touch = filt_bad_touch, + .f_process = filt_bad_process, }; #if CONFIG_MEMORYSTATUS @@ -304,6 +272,7 @@ extern const struct filterops memorystatus_filtops; extern const struct filterops fs_filtops; extern const struct filterops sig_filtops; extern const struct filterops machport_filtops; +extern const struct filterops pipe_nfiltops; extern const struct filterops pipe_rfiltops; extern const struct filterops pipe_wfiltops; extern const struct filterops ptsd_kqops; @@ -342,7 +311,8 @@ const static struct filterops workloop_filtops; * - Add a filterops to the sysfilt_ops. Private filters should be added at the end of * the Private filters section of the array. */ -SECURITY_READ_ONLY_EARLY(static struct filterops *) sysfilt_ops[EVFILTID_MAX] = { +static_assert(EVFILTID_MAX < UINT8_MAX, "kn_filtid expects this to be true"); +static const struct filterops * const sysfilt_ops[EVFILTID_MAX] = { /* Public Filters */ [~EVFILT_READ] = &file_filtops, [~EVFILT_WRITE] = &file_filtops, @@ -354,7 +324,7 @@ SECURITY_READ_ONLY_EARLY(static struct filterops *) sysfilt_ops[EVFILTID_MAX] = [~EVFILT_MACHPORT] = &machport_filtops, [~EVFILT_FS] = &fs_filtops, [~EVFILT_USER] = &user_filtops, - &bad_filtops, + [~EVFILT_UNUSED_11] = &bad_filtops, [~EVFILT_VM] = &bad_filtops, [~EVFILT_SOCK] = &file_filtops, #if CONFIG_MEMORYSTATUS @@ -367,6 +337,7 @@ SECURITY_READ_ONLY_EARLY(static struct filterops *) sysfilt_ops[EVFILTID_MAX] = /* Private filters */ [EVFILTID_KQREAD] = &kqread_filtops, + [EVFILTID_PIPE_N] = &pipe_nfiltops, [EVFILTID_PIPE_R] = &pipe_rfiltops, [EVFILTID_PIPE_W] = &pipe_wfiltops, [EVFILTID_PTSD] = &ptsd_kqops, @@ -381,40 +352,69 @@ SECURITY_READ_ONLY_EARLY(static struct filterops *) sysfilt_ops[EVFILTID_MAX] = [EVFILTID_VN] = &vnode_filtops, [EVFILTID_TTY] = &tty_filtops, [EVFILTID_PTMX] = &ptmx_kqops, + + /* fake filter for detached knotes, keep last */ + [EVFILTID_DETACHED] = &bad_filtops, }; /* waitq prepost callback */ -void waitq_set__CALLING_PREPOST_HOOK__(void *kq_hook, void *knote_hook, int qos); +void waitq_set__CALLING_PREPOST_HOOK__(waitq_set_prepost_hook_t *kq_hook); + +static inline bool +kqr_thread_bound(workq_threadreq_t kqr) +{ + return kqr->tr_state == WORKQ_TR_STATE_BOUND; +} + +static inline bool +kqr_thread_requested_pending(workq_threadreq_t kqr) +{ + workq_tr_state_t tr_state = kqr->tr_state; + return tr_state > WORKQ_TR_STATE_IDLE && tr_state < WORKQ_TR_STATE_BOUND; +} + +static inline bool +kqr_thread_requested(workq_threadreq_t kqr) +{ + return kqr->tr_state != WORKQ_TR_STATE_IDLE; +} + +static inline thread_t +kqr_thread_fast(workq_threadreq_t kqr) +{ + assert(kqr_thread_bound(kqr)); + return kqr->tr_thread; +} + +static inline thread_t +kqr_thread(workq_threadreq_t kqr) +{ + return kqr_thread_bound(kqr) ? kqr->tr_thread : THREAD_NULL; +} static inline struct kqworkloop * -kqr_kqworkloop(struct kqrequest *kqr) +kqr_kqworkloop(workq_threadreq_t kqr) { - if (kqr->kqr_state & KQR_WORKLOOP) { + if (kqr->tr_flags & WORKQ_TR_FLAG_WORKLOOP) { return __container_of(kqr, struct kqworkloop, kqwl_request); } return NULL; } static inline kqueue_t -kqr_kqueue(proc_t p, struct kqrequest *kqr) +kqr_kqueue(proc_t p, workq_threadreq_t kqr) { kqueue_t kqu; - if (kqr->kqr_state & KQR_WORKLOOP) { + if (kqr->tr_flags & WORKQ_TR_FLAG_WORKLOOP) { kqu.kqwl = kqr_kqworkloop(kqr); } else { - kqu.kqwq = (struct kqworkq *)p->p_fd->fd_wqkqueue; + kqu.kqwq = p->p_fd->fd_wqkqueue; assert(kqr >= kqu.kqwq->kqwq_request && kqr < kqu.kqwq->kqwq_request + KQWQ_NBUCKETS); } return kqu; } -static inline boolean_t -is_workqueue_thread(thread_t thread) -{ - return thread_get_tag(thread) & THREAD_TAG_WORKQUEUE; -} - /* * kqueue/note lock implementations * @@ -456,36 +456,56 @@ kqunlock(kqueue_t kqu) } static inline void -kq_req_lock(kqueue_t kqu) +knhash_lock(struct filedesc *fdp) { - assert(kqu.kq->kq_state & (KQ_WORKLOOP | KQ_WORKQ)); - lck_spin_lock(&kqu.kq->kq_reqlock); + lck_mtx_lock(&fdp->fd_knhashlock); } static inline void -kq_req_unlock(kqueue_t kqu) +knhash_unlock(struct filedesc *fdp) { - assert(kqu.kq->kq_state & (KQ_WORKLOOP | KQ_WORKQ)); - lck_spin_unlock(&kqu.kq->kq_reqlock); + lck_mtx_unlock(&fdp->fd_knhashlock); } -static inline void -kq_req_held(__assert_only kqueue_t kqu) +/* wait event for knote locks */ +static inline event_t +knote_lock_wev(struct knote *kn) { - assert(kqu.kq->kq_state & (KQ_WORKLOOP | KQ_WORKQ)); - LCK_SPIN_ASSERT(&kqu.kq->kq_reqlock, LCK_ASSERT_OWNED); + return (event_t)(&kn->kn_hook); } -static inline void -knhash_lock(proc_t p) +/* wait event for kevent_register_wait_* */ +static inline event64_t +knote_filt_wev64(struct knote *kn) { - lck_mtx_lock(&p->p_fd->fd_knhashlock); + /* kdp_workloop_sync_wait_find_owner knows about this */ + return CAST_EVENT64_T(kn); } -static inline void -knhash_unlock(proc_t p) +/* wait event for knote_post/knote_drop */ +static inline event64_t +knote_post_wev64(struct knote *kn) +{ + return CAST_EVENT64_T(&kn->kn_kevent); +} + +/*! + * @function knote_has_qos + * + * @brief + * Whether the knote has a regular QoS. + * + * @discussion + * kn_qos_override is: + * - 0 on kqfiles + * - THREAD_QOS_LAST for special buckets (stayactive, manager) + * + * Other values mean the knote participates to QoS propagation. + */ +static inline bool +knote_has_qos(struct knote *kn) { - lck_mtx_unlock(&p->p_fd->fd_knhashlock); + return kn->kn_qos_override > 0 && kn->kn_qos_override < THREAD_QOS_LAST; } #pragma mark knote locks @@ -496,37 +516,29 @@ knhash_unlock(proc_t p) * KNOTE_KQ_LOCK_ALWAYS * The function will always return with the kq lock held. * - * KNOTE_KQ_UNLOCK_ON_SUCCESS + * KNOTE_KQ_LOCK_ON_SUCCESS * The function will return with the kq lock held if it was successful * (knote_lock() is the only function that can fail). * - * KNOTE_KQ_UNLOCK_ON_FAILURE + * KNOTE_KQ_LOCK_ON_FAILURE * The function will return with the kq lock held if it was unsuccessful * (knote_lock() is the only function that can fail). * * KNOTE_KQ_UNLOCK: * The function returns with the kq unlocked. */ -#define KNOTE_KQ_LOCK_ALWAYS 0x0 -#define KNOTE_KQ_LOCK_ON_SUCCESS 0x1 -#define KNOTE_KQ_LOCK_ON_FAILURE 0x2 -#define KNOTE_KQ_UNLOCK 0x3 - -#if DEBUG || DEVELOPMENT -__attribute__((noinline, not_tail_called, disable_tail_calls)) -void -knote_lock_ctx_chk(struct knote_lock_ctx *knlc) -{ - /* evil hackery to make sure no one forgets to unlock */ - assert(knlc->knlc_state == KNOTE_LOCK_CTX_UNLOCKED); -} -#endif +enum kqlocking { + KNOTE_KQ_LOCK_ALWAYS, + KNOTE_KQ_LOCK_ON_SUCCESS, + KNOTE_KQ_LOCK_ON_FAILURE, + KNOTE_KQ_UNLOCK, +}; static struct knote_lock_ctx * -knote_lock_ctx_find(struct kqueue *kq, struct knote *kn) +knote_lock_ctx_find(kqueue_t kqu, struct knote *kn) { struct knote_lock_ctx *ctx; - LIST_FOREACH(ctx, &kq->kq_knlocks, knlc_le) { + LIST_FOREACH(ctx, &kqu.kq->kq_knlocks, knlc_link) { if (ctx->knlc_knote == kn) { return ctx; } @@ -538,42 +550,60 @@ knote_lock_ctx_find(struct kqueue *kq, struct knote *kn) /* slowpath of knote_lock() */ __attribute__((noinline)) static bool __result_use_check -knote_lock_slow(struct kqueue *kq, struct knote *kn, +knote_lock_slow(kqueue_t kqu, struct knote *kn, struct knote_lock_ctx *knlc, int kqlocking) { - kqlock_held(kq); + struct knote_lock_ctx *owner_lc; + struct uthread *uth = current_uthread(); + wait_result_t wr; - struct knote_lock_ctx *owner_lc = knote_lock_ctx_find(kq, kn); - thread_t owner_thread = owner_lc->knlc_thread; + kqlock_held(kqu); + owner_lc = knote_lock_ctx_find(kqu, kn); #if DEBUG || DEVELOPMENT knlc->knlc_state = KNOTE_LOCK_CTX_WAITING; #endif + owner_lc->knlc_waiters++; - thread_reference(owner_thread); - TAILQ_INSERT_TAIL(&owner_lc->knlc_head, knlc, knlc_tqe); - assert_wait(&kn->kn_status, THREAD_UNINT | THREAD_WAIT_NOREPORT); - kqunlock(kq); + /* + * Make our lock context visible to knote_unlock() + */ + uth->uu_knlock = knlc; - if (thread_handoff_deallocate(owner_thread) == THREAD_RESTART) { - if (kqlocking == KNOTE_KQ_LOCK_ALWAYS || - kqlocking == KNOTE_KQ_LOCK_ON_FAILURE) { - kqlock(kq); - } + wr = lck_spin_sleep_with_inheritor(&kqu.kq->kq_lock, LCK_SLEEP_UNLOCK, + knote_lock_wev(kn), owner_lc->knlc_thread, + THREAD_UNINT | THREAD_WAIT_NOREPORT, TIMEOUT_WAIT_FOREVER); + + if (wr == THREAD_RESTART) { + /* + * We haven't been woken up by knote_unlock() but knote_unlock_cancel. + * We need to cleanup the state since no one did. + */ + uth->uu_knlock = NULL; #if DEBUG || DEVELOPMENT assert(knlc->knlc_state == KNOTE_LOCK_CTX_WAITING); knlc->knlc_state = KNOTE_LOCK_CTX_UNLOCKED; #endif + + if (kqlocking == KNOTE_KQ_LOCK_ALWAYS || + kqlocking == KNOTE_KQ_LOCK_ON_FAILURE) { + kqlock(kqu); + } return false; - } + } else { + if (kqlocking == KNOTE_KQ_LOCK_ALWAYS || + kqlocking == KNOTE_KQ_LOCK_ON_SUCCESS) { + kqlock(kqu); #if DEBUG || DEVELOPMENT - assert(knlc->knlc_state == KNOTE_LOCK_CTX_LOCKED); + /* + * This state is set under the lock so we can't + * really assert this unless we hold the lock. + */ + assert(knlc->knlc_state == KNOTE_LOCK_CTX_LOCKED); #endif - if (kqlocking == KNOTE_KQ_LOCK_ALWAYS || - kqlocking == KNOTE_KQ_LOCK_ON_SUCCESS) { - kqlock(kq); + } + return true; } - return true; } /* @@ -584,20 +614,20 @@ knote_lock_slow(struct kqueue *kq, struct knote *kn, * Returns true if the knote lock is acquired, false if it has been dropped */ static bool __result_use_check -knote_lock(struct kqueue *kq, struct knote *kn, struct knote_lock_ctx *knlc, - int kqlocking) +knote_lock(kqueue_t kqu, struct knote *kn, struct knote_lock_ctx *knlc, + enum kqlocking kqlocking) { - kqlock_held(kq); + kqlock_held(kqu); #if DEBUG || DEVELOPMENT assert(knlc->knlc_state == KNOTE_LOCK_CTX_UNLOCKED); #endif knlc->knlc_knote = kn; knlc->knlc_thread = current_thread(); - TAILQ_INIT(&knlc->knlc_head); + knlc->knlc_waiters = 0; if (__improbable(kn->kn_status & KN_LOCKED)) { - return knote_lock_slow(kq, kn, knlc, kqlocking); + return knote_lock_slow(kqu, kn, knlc, kqlocking); } /* @@ -606,7 +636,7 @@ knote_lock(struct kqueue *kq, struct knote *kn, struct knote_lock_ctx *knlc, * hash table that references it before the lock is canceled. */ assert((kn->kn_status & KN_DROPPING) == 0); - LIST_INSERT_HEAD(&kq->kq_knlocks, knlc, knlc_le); + LIST_INSERT_HEAD(&kqu.kq->kq_knlocks, knlc, knlc_link); kn->kn_status |= KN_LOCKED; #if DEBUG || DEVELOPMENT knlc->knlc_state = KNOTE_LOCK_CTX_LOCKED; @@ -614,7 +644,7 @@ knote_lock(struct kqueue *kq, struct knote *kn, struct knote_lock_ctx *knlc, if (kqlocking == KNOTE_KQ_UNLOCK || kqlocking == KNOTE_KQ_LOCK_ON_FAILURE) { - kqunlock(kq); + kqunlock(kqu); } return true; } @@ -624,13 +654,13 @@ knote_lock(struct kqueue *kq, struct knote *kn, struct knote_lock_ctx *knlc, * * Called with the kqueue lock held. * - * Returns with the kqueue lock held according to KNOTE_KQ_* flags + * Returns with the kqueue lock held according to KNOTE_KQ_* mode. */ static void -knote_unlock(struct kqueue *kq, struct knote *kn, - struct knote_lock_ctx *knlc, int flags) +knote_unlock(kqueue_t kqu, struct knote *kn, + struct knote_lock_ctx *knlc, enum kqlocking kqlocking) { - kqlock_held(kq); + kqlock_held(kqu); assert(knlc->knlc_knote == kn); assert(kn->kn_status & KN_LOCKED); @@ -638,36 +668,45 @@ knote_unlock(struct kqueue *kq, struct knote *kn, assert(knlc->knlc_state == KNOTE_LOCK_CTX_LOCKED); #endif - struct knote_lock_ctx *next_owner_lc = TAILQ_FIRST(&knlc->knlc_head); + LIST_REMOVE(knlc, knlc_link); - LIST_REMOVE(knlc, knlc_le); + if (knlc->knlc_waiters) { + thread_t thread = THREAD_NULL; - if (next_owner_lc) { - assert(next_owner_lc->knlc_knote == kn); - TAILQ_REMOVE(&knlc->knlc_head, next_owner_lc, knlc_tqe); + wakeup_one_with_inheritor(knote_lock_wev(kn), THREAD_AWAKENED, + LCK_WAKE_DEFAULT, &thread); + + /* + * knote_lock_slow() publishes the lock context of waiters + * in uthread::uu_knlock. + * + * Reach out and make this context the new owner. + */ + struct uthread *ut = get_bsdthread_info(thread); + struct knote_lock_ctx *next_owner_lc = ut->uu_knlock; - assert(TAILQ_EMPTY(&next_owner_lc->knlc_head)); - TAILQ_CONCAT(&next_owner_lc->knlc_head, &knlc->knlc_head, knlc_tqe); - LIST_INSERT_HEAD(&kq->kq_knlocks, next_owner_lc, knlc_le); + assert(next_owner_lc->knlc_knote == kn); + next_owner_lc->knlc_waiters = knlc->knlc_waiters - 1; + LIST_INSERT_HEAD(&kqu.kq->kq_knlocks, next_owner_lc, knlc_link); #if DEBUG || DEVELOPMENT next_owner_lc->knlc_state = KNOTE_LOCK_CTX_LOCKED; #endif + ut->uu_knlock = NULL; + thread_deallocate_safe(thread); } else { kn->kn_status &= ~KN_LOCKED; } - if (kn->kn_inuse == 0) { + + if ((kn->kn_status & KN_MERGE_QOS) && !(kn->kn_status & KN_POSTING)) { /* * No f_event() in flight anymore, we can leave QoS "Merge" mode * - * See knote_should_apply_qos_override() + * See knote_adjust_qos() */ kn->kn_status &= ~KN_MERGE_QOS; } - if (flags & KNOTE_KQ_UNLOCK) { - kqunlock(kq); - } - if (next_owner_lc) { - thread_wakeup_thread(&kn->kn_status, next_owner_lc->knlc_thread); + if (kqlocking == KNOTE_KQ_UNLOCK) { + kqunlock(kqu); } #if DEBUG || DEVELOPMENT knlc->knlc_state = KNOTE_LOCK_CTX_UNLOCKED; @@ -679,11 +718,11 @@ knote_unlock(struct kqueue *kq, struct knote *kn, * * Called with the kqueue lock held. * - * Returns with the kqueue lock held according to KNOTE_KQ_* flags + * Returns with the kqueue unlocked. */ static void knote_unlock_cancel(struct kqueue *kq, struct knote *kn, - struct knote_lock_ctx *knlc, int kqlocking) + struct knote_lock_ctx *knlc) { kqlock_held(kq); @@ -691,15 +730,12 @@ knote_unlock_cancel(struct kqueue *kq, struct knote *kn, assert(kn->kn_status & KN_LOCKED); assert(kn->kn_status & KN_DROPPING); - LIST_REMOVE(knlc, knlc_le); + LIST_REMOVE(knlc, knlc_link); kn->kn_status &= ~KN_LOCKED; + kqunlock(kq); - if (kqlocking == KNOTE_KQ_UNLOCK || - kqlocking == KNOTE_KQ_LOCK_ON_FAILURE) { - kqunlock(kq); - } - if (!TAILQ_EMPTY(&knlc->knlc_head)) { - thread_wakeup_with_result(&kn->kn_status, THREAD_RESTART); + if (knlc->knlc_waiters) { + wakeup_all_with_inheritor(knote_lock_wev(kn), THREAD_RESTART); } #if DEBUG || DEVELOPMENT knlc->knlc_state = KNOTE_LOCK_CTX_UNLOCKED; @@ -712,17 +748,23 @@ knote_unlock_cancel(struct kqueue *kq, struct knote *kn, * Takes a use count to protect against concurrent drops. */ static void -knote_call_filter_event(struct kqueue *kq, struct knote *kn, long hint) +knote_post(struct knote *kn, long hint) { - int result, dropping = 0; + struct kqueue *kq = knote_get_kq(kn); + int dropping, result; - kqlock_held(kq); + kqlock(kq); - if (kn->kn_status & (KN_DROPPING | KN_VANISHED)) { - return; + if (__improbable(kn->kn_status & (KN_DROPPING | KN_VANISHED))) { + return kqunlock(kq); + } + + if (__improbable(kn->kn_status & KN_POSTING)) { + panic("KNOTE() called concurrently on knote %p", kn); } - kn->kn_inuse++; + kn->kn_status |= KN_POSTING; + kqunlock(kq); result = filter_call(knote_fops(kn), f_event(kn, hint)); kqlock(kq); @@ -730,28 +772,26 @@ knote_call_filter_event(struct kqueue *kq, struct knote *kn, long hint) dropping = (kn->kn_status & KN_DROPPING); if (!dropping && (result & FILTER_ACTIVE)) { - if (result & FILTER_ADJUST_EVENT_QOS_BIT) { - knote_adjust_qos(kq, kn, result); - } - knote_activate(kn); + knote_activate(kq, kn, result); } - if (--kn->kn_inuse == 0) { - if ((kn->kn_status & KN_LOCKED) == 0) { - /* - * We're the last f_event() call and there's no other f_* call in - * flight, we can leave QoS "Merge" mode. - * - * See knote_should_apply_qos_override() - */ - kn->kn_status &= ~KN_MERGE_QOS; - } - if (dropping) { - waitq_wakeup64_all((struct waitq *)&kq->kq_wqs, - CAST_EVENT64_T(&kn->kn_inuse), - THREAD_AWAKENED, WAITQ_ALL_PRIORITIES); - } + if ((kn->kn_status & KN_LOCKED) == 0) { + /* + * There's no other f_* call in flight, we can leave QoS "Merge" mode. + * + * See knote_adjust_qos() + */ + kn->kn_status &= ~(KN_POSTING | KN_MERGE_QOS); + } else { + kn->kn_status &= ~KN_POSTING; } + + if (__improbable(dropping)) { + waitq_wakeup64_all((struct waitq *)&kq->kq_wqs, knote_post_wev64(kn), + THREAD_AWAKENED, WAITQ_ALL_PRIORITIES); + } + + kqunlock(kq); } /* @@ -761,7 +801,7 @@ knote_call_filter_event(struct kqueue *kq, struct knote *kn, long hint) * - kq unlocked at exit */ static void -knote_wait_for_filter_events(struct kqueue *kq, struct knote *kn) +knote_wait_for_post(struct kqueue *kq, struct knote *kn) { wait_result_t wr = THREAD_NOT_WAITING; @@ -769,10 +809,10 @@ knote_wait_for_filter_events(struct kqueue *kq, struct knote *kn) assert(kn->kn_status & KN_DROPPING); - if (kn->kn_inuse) { + if (kn->kn_status & KN_POSTING) { wr = waitq_assert_wait64((struct waitq *)&kq->kq_wqs, - CAST_EVENT64_T(&kn->kn_inuse), - THREAD_UNINT | THREAD_WAIT_NOREPORT, TIMEOUT_WAIT_FOREVER); + knote_post_wev64(kn), THREAD_UNINT | THREAD_WAIT_NOREPORT, + TIMEOUT_WAIT_FOREVER); } kqunlock(kq); if (wr == THREAD_WAITING) { @@ -780,12 +820,107 @@ knote_wait_for_filter_events(struct kqueue *kq, struct knote *kn) } } +#pragma mark knote helpers for filters + +OS_ALWAYS_INLINE +void +knote_set_error(struct knote *kn, int error) +{ + kn->kn_flags |= EV_ERROR; + kn->kn_sdata = error; +} + +OS_ALWAYS_INLINE +int64_t +knote_low_watermark(const struct knote *kn) +{ + return (kn->kn_sfflags & NOTE_LOWAT) ? kn->kn_sdata : 1; +} + +/*! + * @function knote_fill_kevent_with_sdata + * + * @brief + * Fills in a kevent from the current content of a knote. + * + * @discussion + * This is meant to be called from filter's f_event hooks. + * The kevent data is filled with kn->kn_sdata. + * + * kn->kn_fflags is cleared if kn->kn_flags has EV_CLEAR set. + * + * Using knote_fill_kevent is typically preferred. + */ +OS_ALWAYS_INLINE +void +knote_fill_kevent_with_sdata(struct knote *kn, struct kevent_qos_s *kev) +{ +#define knote_assert_aliases(name1, offs1, name2) \ + static_assert(offsetof(struct kevent_qos_s, name1) + offs1 == \ + offsetof(struct kevent_internal_s, name2), \ + "kevent_qos_s::" #name1 " and kevent_internal_s::" #name2 "need to alias") + /* + * All the code makes assumptions on these aliasing, + * so make sure we fail the build if we ever ever ever break them. + */ + knote_assert_aliases(ident, 0, kei_ident); +#ifdef __LITTLE_ENDIAN__ + knote_assert_aliases(filter, 0, kei_filter); // non trivial overlap + knote_assert_aliases(filter, 1, kei_filtid); // non trivial overlap +#else + knote_assert_aliases(filter, 0, kei_filtid); // non trivial overlap + knote_assert_aliases(filter, 1, kei_filter); // non trivial overlap +#endif + knote_assert_aliases(flags, 0, kei_flags); + knote_assert_aliases(qos, 0, kei_qos); + knote_assert_aliases(udata, 0, kei_udata); + knote_assert_aliases(fflags, 0, kei_fflags); + knote_assert_aliases(xflags, 0, kei_sfflags); // non trivial overlap + knote_assert_aliases(data, 0, kei_sdata); // non trivial overlap + knote_assert_aliases(ext, 0, kei_ext); +#undef knote_assert_aliases + + /* + * Fix the differences between kevent_qos_s and kevent_internal_s: + * - xflags is where kn_sfflags lives, we need to zero it + * - fixup the high bits of `filter` where kn_filtid lives + */ + *kev = *(struct kevent_qos_s *)&kn->kn_kevent; + kev->xflags = 0; + kev->filter |= 0xff00; + if (kn->kn_flags & EV_CLEAR) { + kn->kn_fflags = 0; + } +} + +/*! + * @function knote_fill_kevent + * + * @brief + * Fills in a kevent from the current content of a knote. + * + * @discussion + * This is meant to be called from filter's f_event hooks. + * The kevent data is filled with the passed in data. + * + * kn->kn_fflags is cleared if kn->kn_flags has EV_CLEAR set. + */ +OS_ALWAYS_INLINE +void +knote_fill_kevent(struct knote *kn, struct kevent_qos_s *kev, int64_t data) +{ + knote_fill_kevent_with_sdata(kn, kev); + kev->filter = kn->kn_filter; + kev->data = data; +} + + #pragma mark file_filtops static int -filt_fileattach(struct knote *kn, struct kevent_internal_s *kev) +filt_fileattach(struct knote *kn, struct kevent_qos_s *kev) { - return fo_kqfilter(kn->kn_fp, kn, kev, vfs_context_current()); + return fo_kqfilter(kn->kn_fp, kn, kev); } SECURITY_READ_ONLY_EARLY(static struct filterops) file_filtops = { @@ -820,36 +955,29 @@ filt_kqueue(struct knote *kn, __unused long hint) } static int -filt_kqtouch(struct knote *kn, struct kevent_internal_s *kev) +filt_kqtouch(struct knote *kn, struct kevent_qos_s *kev) { #pragma unused(kev) struct kqueue *kq = (struct kqueue *)kn->kn_fp->f_data; int res; kqlock(kq); - kn->kn_data = kq->kq_count; - res = (kn->kn_data > 0); - + res = (kq->kq_count > 0); kqunlock(kq); return res; } static int -filt_kqprocess(struct knote *kn, struct filt_process_s *data, struct kevent_internal_s *kev) +filt_kqprocess(struct knote *kn, struct kevent_qos_s *kev) { -#pragma unused(data) struct kqueue *kq = (struct kqueue *)kn->kn_fp->f_data; - int res; + int res = 0; kqlock(kq); - kn->kn_data = kq->kq_count; - res = (kn->kn_data > 0); - if (res) { - *kev = kn->kn_kevent; - if (kn->kn_flags & EV_CLEAR) { - kn->kn_data = 0; - } + if (kq->kq_count) { + knote_fill_kevent(kn, kev, kq->kq_count); + res = 1; } kqunlock(kq); @@ -867,7 +995,7 @@ SECURITY_READ_ONLY_EARLY(static struct filterops) kqread_filtops = { #pragma mark proc_filtops static int -filt_procattach(struct knote *kn, __unused struct kevent_internal_s *kev) +filt_procattach(struct knote *kn, __unused struct kevent_qos_s *kev) { struct proc *p; @@ -884,7 +1012,7 @@ filt_procattach(struct knote *kn, __unused struct kevent_internal_s *kev) return 0; } - const int NoteExitStatusBits = NOTE_EXIT | NOTE_EXITSTATUS; + const uint32_t NoteExitStatusBits = NOTE_EXIT | NOTE_EXITSTATUS; if ((kn->kn_sfflags & NoteExitStatusBits) == NoteExitStatusBits) { do { @@ -903,9 +1031,11 @@ filt_procattach(struct knote *kn, __unused struct kevent_internal_s *kev) } while (0); } - proc_klist_lock(); + kn->kn_proc = p; + kn->kn_flags |= EV_CLEAR; /* automatically set */ + kn->kn_sdata = 0; /* incoming data is ignored */ - kn->kn_ptr.p_proc = p; /* store the proc handle */ + proc_klist_lock(); KNOTE_ATTACH(&p->p_klist, kn); @@ -933,9 +1063,9 @@ filt_procdetach(struct knote *kn) proc_klist_lock(); - p = kn->kn_ptr.p_proc; + p = kn->kn_proc; if (p != PROC_NULL) { - kn->kn_ptr.p_proc = PROC_NULL; + kn->kn_proc = PROC_NULL; KNOTE_DETACH(&p->p_klist, kn); } @@ -943,7 +1073,7 @@ filt_procdetach(struct knote *kn) } static int -filt_proc(struct knote *kn, long hint) +filt_procevent(struct knote *kn, long hint) { u_int event; @@ -952,7 +1082,7 @@ filt_proc(struct knote *kn, long hint) /* * Note: a lot of bits in hint may be obtained from the knote * To free some of those bits, see Freeing up - * bits in hint for filt_proc + * bits in hint for filt_procevent * * mask off extra data */ @@ -967,8 +1097,8 @@ filt_proc(struct knote *kn, long hint) * parent and these knotes re-fired. */ if (event & NOTE_EXIT) { - if ((kn->kn_ptr.p_proc->p_oppid != 0) - && (knote_get_kq(kn)->kq_p->p_pid != kn->kn_ptr.p_proc->p_ppid)) { + if ((kn->kn_proc->p_oppid != 0) + && (knote_get_kq(kn)->kq_p->p_pid != kn->kn_proc->p_ppid)) { /* * This knote is not for the current ptrace(2) parent, ignore. */ @@ -993,52 +1123,52 @@ filt_proc(struct knote *kn, long hint) /* * The kernel has a wrapper in place that returns the same data - * as is collected here, in kn_data. Any changes to how + * as is collected here, in kn_hook64. Any changes to how * NOTE_EXITSTATUS and NOTE_EXIT_DETAIL are collected * should also be reflected in the proc_pidnoteexit() wrapper. */ if (event == NOTE_EXIT) { - kn->kn_data = 0; + kn->kn_hook64 = 0; if ((kn->kn_sfflags & NOTE_EXITSTATUS) != 0) { kn->kn_fflags |= NOTE_EXITSTATUS; - kn->kn_data |= (hint & NOTE_PDATAMASK); + kn->kn_hook64 |= (hint & NOTE_PDATAMASK); } if ((kn->kn_sfflags & NOTE_EXIT_DETAIL) != 0) { kn->kn_fflags |= NOTE_EXIT_DETAIL; - if ((kn->kn_ptr.p_proc->p_lflag & + if ((kn->kn_proc->p_lflag & P_LTERM_DECRYPTFAIL) != 0) { - kn->kn_data |= NOTE_EXIT_DECRYPTFAIL; + kn->kn_hook64 |= NOTE_EXIT_DECRYPTFAIL; } - if ((kn->kn_ptr.p_proc->p_lflag & + if ((kn->kn_proc->p_lflag & P_LTERM_JETSAM) != 0) { - kn->kn_data |= NOTE_EXIT_MEMORY; - switch (kn->kn_ptr.p_proc->p_lflag & P_JETSAM_MASK) { + kn->kn_hook64 |= NOTE_EXIT_MEMORY; + switch (kn->kn_proc->p_lflag & P_JETSAM_MASK) { case P_JETSAM_VMPAGESHORTAGE: - kn->kn_data |= NOTE_EXIT_MEMORY_VMPAGESHORTAGE; + kn->kn_hook64 |= NOTE_EXIT_MEMORY_VMPAGESHORTAGE; break; case P_JETSAM_VMTHRASHING: - kn->kn_data |= NOTE_EXIT_MEMORY_VMTHRASHING; + kn->kn_hook64 |= NOTE_EXIT_MEMORY_VMTHRASHING; break; case P_JETSAM_FCTHRASHING: - kn->kn_data |= NOTE_EXIT_MEMORY_FCTHRASHING; + kn->kn_hook64 |= NOTE_EXIT_MEMORY_FCTHRASHING; break; case P_JETSAM_VNODE: - kn->kn_data |= NOTE_EXIT_MEMORY_VNODE; + kn->kn_hook64 |= NOTE_EXIT_MEMORY_VNODE; break; case P_JETSAM_HIWAT: - kn->kn_data |= NOTE_EXIT_MEMORY_HIWAT; + kn->kn_hook64 |= NOTE_EXIT_MEMORY_HIWAT; break; case P_JETSAM_PID: - kn->kn_data |= NOTE_EXIT_MEMORY_PID; + kn->kn_hook64 |= NOTE_EXIT_MEMORY_PID; break; case P_JETSAM_IDLEEXIT: - kn->kn_data |= NOTE_EXIT_MEMORY_IDLE; + kn->kn_hook64 |= NOTE_EXIT_MEMORY_IDLE; break; } } - if ((kn->kn_ptr.p_proc->p_csflags & + if ((kn->kn_proc->p_csflags & CS_KILLED) != 0) { - kn->kn_data |= NOTE_EXIT_CSERROR; + kn->kn_hook64 |= NOTE_EXIT_CSERROR; } } } @@ -1048,7 +1178,7 @@ filt_proc(struct knote *kn, long hint) } static int -filt_proctouch(struct knote *kn, struct kevent_internal_s *kev) +filt_proctouch(struct knote *kn, struct kevent_qos_s *kev) { int res; @@ -1072,28 +1202,25 @@ filt_proctouch(struct knote *kn, struct kevent_internal_s *kev) } static int -filt_procprocess(struct knote *kn, struct filt_process_s *data, struct kevent_internal_s *kev) +filt_procprocess(struct knote *kn, struct kevent_qos_s *kev) { -#pragma unused(data) - int res; + int res = 0; proc_klist_lock(); - res = (kn->kn_fflags != 0); - if (res) { - *kev = kn->kn_kevent; - kn->kn_flags |= EV_CLEAR; /* automatically set */ - kn->kn_fflags = 0; - kn->kn_data = 0; + if (kn->kn_fflags) { + knote_fill_kevent(kn, kev, kn->kn_hook64); + kn->kn_hook64 = 0; + res = 1; } proc_klist_unlock(); return res; } SECURITY_READ_ONLY_EARLY(static struct filterops) proc_filtops = { - .f_attach = filt_procattach, - .f_detach = filt_procdetach, - .f_event = filt_proc, - .f_touch = filt_proctouch, + .f_attach = filt_procattach, + .f_detach = filt_procdetach, + .f_event = filt_procevent, + .f_touch = filt_proctouch, .f_process = filt_procprocess, }; @@ -1109,12 +1236,12 @@ struct filt_timer_params { /* * Values stored in the knote at rest (using Mach absolute time units) * - * kn->kn_hook where the thread_call object is stored + * kn->kn_thcall where the thread_call object is stored * kn->kn_ext[0] next deadline or 0 if immediate expiration * kn->kn_ext[1] leeway value * kn->kn_sdata interval timer: the interval * absolute/deadline timer: 0 - * kn->kn_hookid timer state + * kn->kn_hook32 timer state * * TIMER_IDLE: * The timer has either never been scheduled or been cancelled. @@ -1164,7 +1291,7 @@ filt_timer_set_params(struct knote *kn, struct filt_timer_params *params) * Called with timer filter lock held. */ static int -filt_timervalidate(const struct kevent_internal_s *kev, +filt_timervalidate(const struct kevent_qos_s *kev, struct filt_timer_params *params) { /* @@ -1354,13 +1481,13 @@ filt_timerexpire(void *knx, __unused void *spare) struct knote *kn = knx; int v; - if (os_atomic_cmpxchgv(&kn->kn_hookid, TIMER_ARMED, TIMER_FIRED, + if (os_atomic_cmpxchgv(&kn->kn_hook32, TIMER_ARMED, TIMER_FIRED, &v, relaxed)) { // our f_event always would say FILTER_ACTIVE, // so be leaner and just do it. struct kqueue *kq = knote_get_kq(kn); kqlock(kq); - knote_activate(kn); + knote_activate(kq, kn, FILTER_ACTIVE); kqunlock(kq); } else { /* @@ -1377,9 +1504,9 @@ filt_timerexpire(void *knx, __unused void *spare) static void filt_timercancel(struct knote *kn) { - if (os_atomic_xchg(&kn->kn_hookid, TIMER_IDLE, relaxed) == TIMER_ARMED) { + if (os_atomic_xchg(&kn->kn_hook32, TIMER_IDLE, relaxed) == TIMER_ARMED) { /* cancel the thread call and wait for any filt_timerexpire in flight */ - thread_call_cancel_wait((thread_call_t)kn->kn_hook); + thread_call_cancel_wait(kn->kn_thcall); } } @@ -1418,7 +1545,7 @@ filt_timerarm(struct knote *kn) int filter_flags = kn->kn_sfflags; unsigned int timer_flags = 0; - assert(os_atomic_load(&kn->kn_hookid, relaxed) == TIMER_IDLE); + assert(os_atomic_load(&kn->kn_hook32, relaxed) == TIMER_IDLE); if (filter_flags & NOTE_CRITICAL) { timer_flags |= THREAD_CALL_DELAY_USER_CRITICAL; @@ -1436,8 +1563,8 @@ filt_timerarm(struct knote *kn) timer_flags |= THREAD_CALL_CONTINUOUS; } - os_atomic_store(&kn->kn_hookid, TIMER_ARMED, relaxed); - thread_call_enter_delayed_with_leeway((thread_call_t)kn->kn_hook, NULL, + os_atomic_store(&kn->kn_hook32, TIMER_ARMED, relaxed); + thread_call_enter_delayed_with_leeway(kn->kn_thcall, NULL, deadline, leeway, timer_flags); } @@ -1445,7 +1572,7 @@ filt_timerarm(struct knote *kn) * Allocate a thread call for the knote's lifetime, and kick off the timer. */ static int -filt_timerattach(struct knote *kn, struct kevent_internal_s *kev) +filt_timerattach(struct knote *kn, struct kevent_qos_s *kev) { thread_call_t callout; struct filt_timer_params params; @@ -1466,9 +1593,9 @@ filt_timerattach(struct knote *kn, struct kevent_internal_s *kev) } filt_timer_set_params(kn, ¶ms); - kn->kn_hook = callout; + kn->kn_thcall = callout; kn->kn_flags |= EV_CLEAR; - os_atomic_store(&kn->kn_hookid, TIMER_IDLE, relaxed); + os_atomic_store(&kn->kn_hook32, TIMER_IDLE, relaxed); /* NOTE_ABSOLUTE implies EV_ONESHOT */ if (kn->kn_sfflags & NOTE_ABSOLUTE) { @@ -1476,7 +1603,7 @@ filt_timerattach(struct knote *kn, struct kevent_internal_s *kev) } if (filt_timer_is_ready(kn)) { - os_atomic_store(&kn->kn_hookid, TIMER_IMMEDIATE, relaxed); + os_atomic_store(&kn->kn_hook32, TIMER_IMMEDIATE, relaxed); return FILTER_ACTIVE; } else { filt_timerarm(kn); @@ -1496,8 +1623,8 @@ filt_timerdetach(struct knote *kn) * Unconditionally cancel to make sure there can't be any filt_timerexpire() * running anymore. */ - thread_call_cancel_wait((thread_call_t)kn->kn_hook); - freed = thread_call_free((thread_call_t)kn->kn_hook); + thread_call_cancel_wait(kn->kn_thcall); + freed = thread_call_free(kn->kn_thcall); assert(freed); } @@ -1509,7 +1636,7 @@ filt_timerdetach(struct knote *kn) * pops have gone off (in kn_data). */ static int -filt_timertouch(struct knote *kn, struct kevent_internal_s *kev) +filt_timertouch(struct knote *kn, struct kevent_qos_s *kev) { struct filt_timer_params params; uint32_t changed_flags = (kn->kn_sfflags ^ kev->fflags); @@ -1533,7 +1660,7 @@ filt_timertouch(struct knote *kn, struct kevent_internal_s *kev) kn->kn_sfflags = kev->fflags; if (filt_timer_is_ready(kn)) { - os_atomic_store(&kn->kn_hookid, TIMER_IMMEDIATE, relaxed); + os_atomic_store(&kn->kn_hook32, TIMER_IMMEDIATE, relaxed); return FILTER_ACTIVE | FILTER_UPDATE_REQ_QOS; } else { filt_timerarm(kn); @@ -1549,10 +1676,7 @@ filt_timertouch(struct knote *kn, struct kevent_internal_s *kev) * counters for the next time. */ static int -filt_timerprocess( - struct knote *kn, - __unused struct filt_process_s *data, - struct kevent_internal_s *kev) +filt_timerprocess(struct knote *kn, struct kevent_qos_s *kev) { /* * filt_timerprocess is serialized with any filter routine except for @@ -1563,7 +1687,7 @@ filt_timerprocess( * whether we see any of the "FIRED" state, and if we do, it is safe to * do simple state machine transitions. */ - switch (os_atomic_load(&kn->kn_hookid, relaxed)) { + switch (os_atomic_load(&kn->kn_hook32, relaxed)) { case TIMER_IDLE: case TIMER_ARMED: /* @@ -1573,7 +1697,7 @@ filt_timerprocess( return 0; } - os_atomic_store(&kn->kn_hookid, TIMER_IDLE, relaxed); + os_atomic_store(&kn->kn_hook32, TIMER_IDLE, relaxed); /* * Copy out the interesting kevent state, @@ -1584,13 +1708,11 @@ filt_timerprocess( * - return kn_sfflags in the fflags field so the client can know * under what flags the timer fired */ - *kev = kn->kn_kevent; + knote_fill_kevent(kn, kev, 1); kev->ext[0] = 0; /* kev->ext[1] = 0; JMM - shouldn't we hide this too? */ - if (kn->kn_sdata == 0) { - kev->data = 1; - } else { + if (kn->kn_sdata != 0) { /* * This is a 'repeating' timer, so we have to emit * how many intervals expired between the arm @@ -1654,7 +1776,7 @@ SECURITY_READ_ONLY_EARLY(static struct filterops) timer_filtops = { .f_extended_codes = true, .f_attach = filt_timerattach, .f_detach = filt_timerdetach, - .f_event = filt_badevent, + .f_event = filt_bad_event, .f_touch = filt_timertouch, .f_process = filt_timerprocess, }; @@ -1662,24 +1784,18 @@ SECURITY_READ_ONLY_EARLY(static struct filterops) timer_filtops = { #pragma mark user_filtops static int -filt_userattach(struct knote *kn, __unused struct kevent_internal_s *kev) +filt_userattach(struct knote *kn, __unused struct kevent_qos_s *kev) { if (kn->kn_sfflags & NOTE_TRIGGER) { - kn->kn_hookid = FILTER_ACTIVE; + kn->kn_hook32 = FILTER_ACTIVE; } else { - kn->kn_hookid = 0; + kn->kn_hook32 = 0; } - return kn->kn_hookid; -} - -static void -filt_userdetach(__unused struct knote *kn) -{ - /* EVFILT_USER knotes are not attached to anything in the kernel */ + return kn->kn_hook32; } static int -filt_usertouch(struct knote *kn, struct kevent_internal_s *kev) +filt_usertouch(struct knote *kn, struct kevent_qos_s *kev) { uint32_t ffctrl; int fflags; @@ -1702,27 +1818,23 @@ filt_usertouch(struct knote *kn, struct kevent_internal_s *kev) kn->kn_sdata = kev->data; if (kev->fflags & NOTE_TRIGGER) { - kn->kn_hookid = FILTER_ACTIVE; + kn->kn_hook32 = FILTER_ACTIVE; } - return (int)kn->kn_hookid; + return (int)kn->kn_hook32; } static int -filt_userprocess( - struct knote *kn, - __unused struct filt_process_s *data, - struct kevent_internal_s *kev) +filt_userprocess(struct knote *kn, struct kevent_qos_s *kev) { - int result = (int)kn->kn_hookid; + int result = (int)kn->kn_hook32; if (result) { - *kev = kn->kn_kevent; + /* EVFILT_USER returns the data that was passed in */ + knote_fill_kevent_with_sdata(kn, kev); kev->fflags = kn->kn_sfflags; - kev->data = kn->kn_sdata; if (kn->kn_flags & EV_CLEAR) { - kn->kn_hookid = 0; - kn->kn_data = 0; - kn->kn_fflags = 0; + /* knote_fill_kevent cleared kn_fflags */ + kn->kn_hook32 = 0; } } @@ -1732,24 +1844,26 @@ filt_userprocess( SECURITY_READ_ONLY_EARLY(static struct filterops) user_filtops = { .f_extended_codes = true, .f_attach = filt_userattach, - .f_detach = filt_userdetach, - .f_event = filt_badevent, + .f_detach = filt_no_detach, + .f_event = filt_bad_event, .f_touch = filt_usertouch, .f_process = filt_userprocess, }; #pragma mark workloop_filtops +#define EPREEMPTDISABLED (-1) + static inline void filt_wllock(struct kqworkloop *kqwl) { - lck_mtx_lock(&kqwl->kqwl_statelock); + lck_spin_lock(&kqwl->kqwl_statelock); } static inline void filt_wlunlock(struct kqworkloop *kqwl) { - lck_mtx_unlock(&kqwl->kqwl_statelock); + lck_spin_unlock(&kqwl->kqwl_statelock); } /* @@ -1766,9 +1880,7 @@ filt_wlunlock(struct kqworkloop *kqwl) static inline bool filt_wlturnstile_interlock_is_workq(struct kqworkloop *kqwl) { - struct kqrequest *kqr = &kqwl->kqwl_request; - return (kqr->kqr_state & KQR_THREQUESTED) && - (kqr->kqr_thread == THREAD_NULL); + return kqr_thread_requested_pending(&kqwl->kqwl_request); } static void @@ -1776,7 +1888,7 @@ filt_wlupdate_inheritor(struct kqworkloop *kqwl, struct turnstile *ts, turnstile_update_flags_t flags) { turnstile_inheritor_t inheritor = TURNSTILE_INHERITOR_NULL; - struct kqrequest *kqr = &kqwl->kqwl_request; + workq_threadreq_t kqr = &kqwl->kqwl_request; /* * binding to the workq should always happen through @@ -1786,13 +1898,14 @@ filt_wlupdate_inheritor(struct kqworkloop *kqwl, struct turnstile *ts, if ((inheritor = kqwl->kqwl_owner)) { flags |= TURNSTILE_INHERITOR_THREAD; - } else if ((inheritor = kqr->kqr_thread)) { + } else if ((inheritor = kqr_thread(kqr))) { flags |= TURNSTILE_INHERITOR_THREAD; } turnstile_update_inheritor(ts, inheritor, flags); } +#define EVFILT_WORKLOOP_EFAULT_RETRY_COUNT 100 #define FILT_WLATTACH 0 #define FILT_WLTOUCH 1 #define FILT_WLDROP 2 @@ -1800,43 +1913,24 @@ filt_wlupdate_inheritor(struct kqworkloop *kqwl, struct turnstile *ts, __result_use_check static int filt_wlupdate(struct kqworkloop *kqwl, struct knote *kn, - struct kevent_internal_s *kev, kq_index_t qos_index, int op) + struct kevent_qos_s *kev, kq_index_t qos_index, int op) { user_addr_t uaddr = CAST_USER_ADDR_T(kev->ext[EV_EXTIDX_WL_ADDR]); - struct kqrequest *kqr = &kqwl->kqwl_request; + workq_threadreq_t kqr = &kqwl->kqwl_request; thread_t cur_owner, new_owner, extra_thread_ref = THREAD_NULL; - kq_index_t cur_owner_override = THREAD_QOS_UNSPECIFIED; + kq_index_t cur_override = THREAD_QOS_UNSPECIFIED; + int efault_retry = EVFILT_WORKLOOP_EFAULT_RETRY_COUNT; int action = KQWL_UTQ_NONE, error = 0; - bool needs_wake = false, needs_wllock = false; + bool wl_inheritor_updated = false, needs_wake = false; uint64_t kdata = kev->ext[EV_EXTIDX_WL_VALUE]; uint64_t mask = kev->ext[EV_EXTIDX_WL_MASK]; uint64_t udata = 0; + struct turnstile *ts = TURNSTILE_NULL; - if (kev->fflags & (NOTE_WL_END_OWNERSHIP | NOTE_WL_DISCOVER_OWNER)) { - /* - * If we're maybe going to change the kqwl_owner, - * then we need to hold the filt_wllock(). - */ - needs_wllock = true; - } else if (kqr->kqr_thread == current_thread()) { - /* - * Servicer updates need to be serialized with - * any ownership change too, as the kqr_thread value influences the - * outcome of handling NOTE_WL_DISCOVER_OWNER. - */ - needs_wllock = true; - } + filt_wllock(kqwl); - if (needs_wllock) { - filt_wllock(kqwl); - /* - * The kqwl owner is set under both the req and filter lock, - * meaning it's fine to look at it under any. - */ - new_owner = cur_owner = kqwl->kqwl_owner; - } else { - new_owner = cur_owner = THREAD_NULL; - } +again: + new_owner = cur_owner = kqwl->kqwl_owner; /* * Phase 1: @@ -1853,8 +1947,33 @@ filt_wlupdate(struct kqworkloop *kqwl, struct knote *kn, * Lastly decide whether we need to perform a QoS update. */ if (uaddr) { - error = copyin_word(uaddr, &udata, sizeof(udata)); - if (error) { + /* + * Until exists, + * disabling preemption copyin forces any + * vm_fault we encounter to fail. + */ + error = copyin_atomic64(uaddr, &udata); + + /* + * If we get EFAULT, drop locks, and retry. + * If we still get an error report it, + * else assume the memory has been faulted + * and attempt to copyin under lock again. + */ + switch (error) { + case 0: + break; + case EFAULT: + if (efault_retry-- > 0) { + filt_wlunlock(kqwl); + error = copyin_atomic64(uaddr, &udata); + filt_wllock(kqwl); + if (error == 0) { + goto again; + } + } + /* FALLTHROUGH */ + default: goto out; } @@ -1873,7 +1992,8 @@ filt_wlupdate(struct kqworkloop *kqwl, struct knote *kn, mach_port_name_t name = (mach_port_name_t)udata & ~0x3; if (name != MACH_PORT_NULL) { name = ipc_entry_name_mask(name); - extra_thread_ref = port_name_to_thread(name); + extra_thread_ref = port_name_to_thread(name, + PORT_TO_THREAD_IN_CURRENT_TASK); if (extra_thread_ref == THREAD_NULL) { error = EOWNERDEAD; goto out; @@ -1890,7 +2010,7 @@ filt_wlupdate(struct kqworkloop *kqwl, struct knote *kn, if (error == 0) { if ((kev->fflags & NOTE_WL_THREAD_REQUEST) && (kev->flags & EV_DELETE)) { action = KQWL_UTQ_SET_QOS_INDEX; - } else if (qos_index && kqr->kqr_qos_index != qos_index) { + } else if (qos_index && kqr->tr_kq_qos_index != qos_index) { action = KQWL_UTQ_SET_QOS_INDEX; } @@ -1902,9 +2022,8 @@ filt_wlupdate(struct kqworkloop *kqwl, struct knote *kn, */ kn->kn_sfflags &= ~NOTE_WL_UPDATES_MASK; kn->kn_sfflags |= kev->fflags; - kn->kn_sdata = kev->data; if (kev->fflags & NOTE_WL_SYNC_WAKE) { - needs_wake = (kn->kn_hook != THREAD_NULL); + needs_wake = (kn->kn_thread != THREAD_NULL); } } else if (op == FILT_WLDROP) { if ((kn->kn_sfflags & (NOTE_WL_SYNC_WAIT | NOTE_WL_SYNC_WAKE)) == @@ -1914,7 +2033,7 @@ filt_wlupdate(struct kqworkloop *kqwl, struct knote *kn, * explicitly, issue a wake up. */ kn->kn_sfflags |= NOTE_WL_SYNC_WAKE; - needs_wake = (kn->kn_hook != THREAD_NULL); + needs_wake = (kn->kn_thread != THREAD_NULL); } } } @@ -1929,10 +2048,10 @@ filt_wlupdate(struct kqworkloop *kqwl, struct knote *kn, goto out; } - kq_req_lock(kqwl); + kqlock(kqwl); /* If already tracked as servicer, don't track as owner */ - if (new_owner == kqr->kqr_thread) { + if (new_owner == kqr_thread(kqr)) { new_owner = THREAD_NULL; } @@ -1942,25 +2061,20 @@ filt_wlupdate(struct kqworkloop *kqwl, struct knote *kn, /* we just transfered this ref to kqwl_owner */ extra_thread_ref = THREAD_NULL; } - cur_owner_override = kqworkloop_owner_override(kqwl); - - if (cur_owner) { - thread_ends_owning_workloop(cur_owner); - } + cur_override = kqworkloop_override(kqwl); if (new_owner) { /* override it before we drop the old */ - if (cur_owner_override != THREAD_QOS_UNSPECIFIED) { - thread_add_ipc_override(new_owner, cur_owner_override); + if (cur_override != THREAD_QOS_UNSPECIFIED) { + thread_add_kevent_override(new_owner, cur_override); } - thread_starts_owning_workloop(new_owner); - if ((kqr->kqr_state & KQR_THREQUESTED) && !kqr->kqr_thread) { + if (kqr_thread_requested_pending(kqr)) { if (action == KQWL_UTQ_NONE) { action = KQWL_UTQ_REDRIVE_EVENTS; } } } else { - if ((kqr->kqr_state & (KQR_THREQUESTED | KQR_WAKEUP)) == KQR_WAKEUP) { + if (!kqr_thread_requested(kqr) && kqr->tr_kq_wakeup) { if (action == KQWL_UTQ_NONE) { action = KQWL_UTQ_REDRIVE_EVENTS; } @@ -1968,13 +2082,11 @@ filt_wlupdate(struct kqworkloop *kqwl, struct knote *kn, } } - struct turnstile *ts = kqwl->kqwl_turnstile; - bool wl_inheritor_updated = false; - if (action != KQWL_UTQ_NONE) { kqworkloop_update_threads_qos(kqwl, action, qos_index); } + ts = kqwl->kqwl_turnstile; if (cur_owner != new_owner && ts) { if (action == KQWL_UTQ_REDRIVE_EVENTS) { /* @@ -2012,16 +2124,15 @@ filt_wlupdate(struct kqworkloop *kqwl, struct knote *kn, } if (needs_wake && ts) { - waitq_wakeup64_thread(&ts->ts_waitq, CAST_EVENT64_T((event_t)kn), - (thread_t)kn->kn_hook, THREAD_AWAKENED); + waitq_wakeup64_thread(&ts->ts_waitq, knote_filt_wev64(kn), + kn->kn_thread, THREAD_AWAKENED); + if (op == FILT_WLATTACH || op == FILT_WLTOUCH) { + disable_preemption(); + error = EPREEMPTDISABLED; + } } - kq_req_unlock(kqwl); - - if (wl_inheritor_updated) { - turnstile_update_inheritor_complete(ts, TURNSTILE_INTERLOCK_NOT_HELD); - turnstile_deallocate(ts); - } + kqunlock(kqwl); out: /* @@ -2029,14 +2140,12 @@ out: * * Unlock and cleanup various lingering references and things. */ - if (needs_wllock) { - filt_wlunlock(kqwl); - } + filt_wlunlock(kqwl); #if CONFIG_WORKLOOP_DEBUG KQWL_HISTORY_WRITE_ENTRY(kqwl, { .updater = current_thread(), - .servicer = kqr->kqr_thread, /* Note: racy */ + .servicer = kqr_thread(kqr), /* Note: racy */ .old_owner = cur_owner, .new_owner = new_owner, @@ -2051,15 +2160,19 @@ out: }); #endif // CONFIG_WORKLOOP_DEBUG + if (wl_inheritor_updated) { + turnstile_update_inheritor_complete(ts, TURNSTILE_INTERLOCK_NOT_HELD); + turnstile_deallocate_safe(ts); + } + if (cur_owner && new_owner != cur_owner) { - if (cur_owner_override != THREAD_QOS_UNSPECIFIED) { - thread_drop_ipc_override(cur_owner); + if (cur_override != THREAD_QOS_UNSPECIFIED) { + thread_drop_kevent_override(cur_owner); } - thread_deallocate(cur_owner); + thread_deallocate_safe(cur_owner); } - if (extra_thread_ref) { - thread_deallocate(extra_thread_ref); + thread_deallocate_safe(extra_thread_ref); } return error; } @@ -2072,67 +2185,122 @@ out: * - data is set to the error if any */ static inline void -filt_wlremember_last_update(struct knote *kn, struct kevent_internal_s *kev, +filt_wlremember_last_update(struct knote *kn, struct kevent_qos_s *kev, int error) { kn->kn_fflags = kev->fflags; - kn->kn_data = error; + kn->kn_sdata = error; memcpy(kn->kn_ext, kev->ext, sizeof(kev->ext)); } static int -filt_wlattach(struct knote *kn, struct kevent_internal_s *kev) +filt_wlupdate_sync_ipc(struct kqworkloop *kqwl, struct knote *kn, + struct kevent_qos_s *kev, int op) { - struct kqueue *kq = knote_get_kq(kn); - struct kqworkloop *kqwl = (struct kqworkloop *)kq; + uint64_t uaddr = kev->ext[EV_EXTIDX_WL_ADDR]; + uint64_t kdata = kev->ext[EV_EXTIDX_WL_VALUE]; + uint64_t mask = kev->ext[EV_EXTIDX_WL_MASK]; + uint64_t udata = 0; + int efault_retry = EVFILT_WORKLOOP_EFAULT_RETRY_COUNT; int error = 0; - kq_index_t qos_index = 0; - if ((kq->kq_state & KQ_WORKLOOP) == 0) { - error = ENOTSUP; - goto out; + if (op == FILT_WLATTACH) { + (void)kqueue_alloc_turnstile(&kqwl->kqwl_kqueue); + } else if (uaddr == 0) { + return 0; } -#if DEVELOPMENT || DEBUG - if (kev->ident == 0 && kev->udata == 0 && kev->fflags == 0) { - struct kqrequest *kqr = &kqwl->kqwl_request; - - kq_req_lock(kqwl); - kev->fflags = 0; - if (kqr->kqr_dsync_waiters) { - kev->fflags |= NOTE_WL_SYNC_WAIT; - } - if (kqr->kqr_qos_index) { - kev->fflags |= NOTE_WL_THREAD_REQUEST; - } - kev->ext[0] = thread_tid(kqwl->kqwl_owner); - kev->ext[1] = thread_tid(kqwl->kqwl_request.kqr_thread); - kev->ext[2] = thread_owned_workloops_count(current_thread()); - kev->ext[3] = kn->kn_kevent.ext[3]; - kq_req_unlock(kqwl); - error = EBUSY; - goto out; - } -#endif + filt_wllock(kqwl); - int command = (kn->kn_sfflags & NOTE_WL_COMMANDS_MASK); - switch (command) { - case NOTE_WL_THREAD_REQUEST: - if (kn->kn_id != kqwl->kqwl_dynamicid) { - error = EINVAL; - goto out; - } - qos_index = _pthread_priority_thread_qos(kn->kn_qos); - if (qos_index == THREAD_QOS_UNSPECIFIED) { - error = ERANGE; +again: + + /* + * Do the debounce thing, the lock serializing the state is the knote lock. + */ + if (uaddr) { + /* + * Until exists, + * disabling preemption copyin forces any + * vm_fault we encounter to fail. + */ + error = copyin_atomic64(uaddr, &udata); + + /* + * If we get EFAULT, drop locks, and retry. + * If we still get an error report it, + * else assume the memory has been faulted + * and attempt to copyin under lock again. + */ + switch (error) { + case 0: + break; + case EFAULT: + if (efault_retry-- > 0) { + filt_wlunlock(kqwl); + error = copyin_atomic64(uaddr, &udata); + filt_wllock(kqwl); + if (error == 0) { + goto again; + } + } + /* FALLTHROUGH */ + default: goto out; } - if (kqwl->kqwl_request.kqr_qos_index) { - /* - * There already is a thread request, and well, you're only allowed - * one per workloop, so fail the attach. - */ - error = EALREADY; + + kev->ext[EV_EXTIDX_WL_VALUE] = udata; + kn->kn_ext[EV_EXTIDX_WL_VALUE] = udata; + + if ((udata & mask) != (kdata & mask)) { + error = ESTALE; + goto out; + } + } + + if (op == FILT_WLATTACH) { + error = filt_wlattach_sync_ipc(kn); + if (error == 0) { + disable_preemption(); + error = EPREEMPTDISABLED; + } + } + +out: + filt_wlunlock(kqwl); + return error; +} + +static int +filt_wlattach(struct knote *kn, struct kevent_qos_s *kev) +{ + struct kqueue *kq = knote_get_kq(kn); + struct kqworkloop *kqwl = (struct kqworkloop *)kq; + int error = 0, result = 0; + kq_index_t qos_index = 0; + + if (__improbable((kq->kq_state & KQ_WORKLOOP) == 0)) { + error = ENOTSUP; + goto out; + } + + uint32_t command = (kn->kn_sfflags & NOTE_WL_COMMANDS_MASK); + switch (command) { + case NOTE_WL_THREAD_REQUEST: + if (kn->kn_id != kqwl->kqwl_dynamicid) { + error = EINVAL; + goto out; + } + qos_index = _pthread_priority_thread_qos(kn->kn_qos); + if (qos_index == THREAD_QOS_UNSPECIFIED) { + error = ERANGE; + goto out; + } + if (kqwl->kqwl_request.tr_kq_qos_index) { + /* + * There already is a thread request, and well, you're only allowed + * one per workloop, so fail the attach. + */ + error = EALREADY; goto out; } break; @@ -2151,13 +2319,32 @@ filt_wlattach(struct knote *kn, struct kevent_internal_s *kev) goto out; } break; + + case NOTE_WL_SYNC_IPC: + if ((kn->kn_flags & EV_DISABLE) == 0) { + error = EINVAL; + goto out; + } + if (kn->kn_sfflags & (NOTE_WL_UPDATE_QOS | NOTE_WL_DISCOVER_OWNER)) { + error = EINVAL; + goto out; + } + break; default: error = EINVAL; goto out; } - error = filt_wlupdate(kqwl, kn, kev, qos_index, FILT_WLATTACH); + if (command == NOTE_WL_SYNC_IPC) { + error = filt_wlupdate_sync_ipc(kqwl, kn, kev, FILT_WLATTACH); + } else { + error = filt_wlupdate(kqwl, kn, kev, qos_index, FILT_WLATTACH); + } + if (error == EPREEMPTDISABLED) { + error = 0; + result = FILTER_THREADREQ_NODEFEER; + } out: if (error) { /* If userland wants ESTALE to be hidden, fail the attach anyway */ @@ -2165,10 +2352,10 @@ out: error = 0; } knote_set_error(kn, error); - return 0; + return result; } if (command == NOTE_WL_SYNC_WAIT) { - return kevent_register_wait_prepare(kn, kev); + return kevent_register_wait_prepare(kn, kev, result); } /* Just attaching the thread request successfully will fire it */ if (command == NOTE_WL_THREAD_REQUEST) { @@ -2177,28 +2364,26 @@ out: * so delivering an event needs to also consume it. */ kn->kn_flags |= EV_CLEAR; - return FILTER_ACTIVE; + return result | FILTER_ACTIVE; } - return 0; + return result; } static void __dead2 filt_wlwait_continue(void *parameter, wait_result_t wr) { struct _kevent_register *cont_args = parameter; - struct kqworkloop *kqwl = (struct kqworkloop *)cont_args->kq; - struct kqrequest *kqr = &kqwl->kqwl_request; + struct kqworkloop *kqwl = cont_args->kqwl; - kq_req_lock(kqwl); - kqr->kqr_dsync_waiters--; + kqlock(kqwl); if (filt_wlturnstile_interlock_is_workq(kqwl)) { workq_kern_threadreq_lock(kqwl->kqwl_p); - turnstile_complete((uintptr_t)kqwl, &kqwl->kqwl_turnstile, NULL); + turnstile_complete((uintptr_t)kqwl, &kqwl->kqwl_turnstile, NULL, TURNSTILE_WORKLOOPS); workq_kern_threadreq_unlock(kqwl->kqwl_p); } else { - turnstile_complete((uintptr_t)kqwl, &kqwl->kqwl_turnstile, NULL); + turnstile_complete((uintptr_t)kqwl, &kqwl->kqwl_turnstile, NULL, TURNSTILE_WORKLOOPS); } - kq_req_unlock(kqwl); + kqunlock(kqwl); turnstile_cleanup(); @@ -2217,17 +2402,15 @@ filt_wlwait_continue(void *parameter, wait_result_t wr) * calls filt_wlwait_continue through a continuation. */ static void __dead2 -filt_wlpost_register_wait(struct uthread *uth, struct knote_lock_ctx *knlc, +filt_wlpost_register_wait(struct uthread *uth, struct knote *kn, struct _kevent_register *cont_args) { - struct kqworkloop *kqwl = (struct kqworkloop *)cont_args->kq; - struct kqrequest *kqr = &kqwl->kqwl_request; + struct kqworkloop *kqwl = cont_args->kqwl; + workq_threadreq_t kqr = &kqwl->kqwl_request; struct turnstile *ts; bool workq_locked = false; - kq_req_lock(kqwl); - - kqr->kqr_dsync_waiters++; + kqlock_held(kqwl); if (filt_wlturnstile_interlock_is_workq(kqwl)) { workq_kern_threadreq_lock(kqwl->kqwl_p); @@ -2259,20 +2442,19 @@ filt_wlpost_register_wait(struct uthread *uth, struct knote_lock_ctx *knlc, } thread_set_pending_block_hint(uth->uu_thread, kThreadWaitWorkloopSyncWait); - waitq_assert_wait64(&ts->ts_waitq, CAST_EVENT64_T(cont_args->knote), + waitq_assert_wait64(&ts->ts_waitq, knote_filt_wev64(kn), THREAD_ABORTSAFE, TIMEOUT_WAIT_FOREVER); if (workq_locked) { workq_kern_threadreq_unlock(kqwl->kqwl_p); } - thread_t thread = kqwl->kqwl_owner ?: kqr->kqr_thread; + thread_t thread = kqwl->kqwl_owner ?: kqr_thread(kqr); if (thread) { thread_reference(thread); } - kq_req_unlock(kqwl); - kevent_register_wait_block(ts, thread, knlc, filt_wlwait_continue, cont_args); + kevent_register_wait_block(ts, thread, filt_wlwait_continue, cont_args); } /* called in stackshot context to report the thread responsible for blocking this thread */ @@ -2283,28 +2465,26 @@ kdp_workloop_sync_wait_find_owner(__assert_only thread_t thread, struct knote *kn = (struct knote *)event; assert(kdp_is_in_zone(kn, "knote zone")); - assert(kn->kn_hook == thread); + assert(kn->kn_thread == thread); struct kqueue *kq = knote_get_kq(kn); assert(kdp_is_in_zone(kq, "kqueue workloop zone")); assert(kq->kq_state & KQ_WORKLOOP); struct kqworkloop *kqwl = (struct kqworkloop *)kq; - struct kqrequest *kqr = &kqwl->kqwl_request; + workq_threadreq_t kqr = &kqwl->kqwl_request; thread_t kqwl_owner = kqwl->kqwl_owner; - thread_t servicer = kqr->kqr_thread; if (kqwl_owner != THREAD_NULL) { assert(kdp_is_in_zone(kqwl_owner, "threads")); waitinfo->owner = thread_tid(kqwl->kqwl_owner); - } else if (servicer != THREAD_NULL) { - assert(kdp_is_in_zone(servicer, "threads")); - - waitinfo->owner = thread_tid(servicer); - } else if (kqr->kqr_state & KQR_THREQUESTED) { + } else if (kqr_thread_requested_pending(kqr)) { waitinfo->owner = STACKSHOT_WAITOWNER_THREQUESTED; + } else if (kqr->tr_state >= WORKQ_TR_STATE_BINDING) { + assert(kdp_is_in_zone(kqr->tr_thread, "threads")); + waitinfo->owner = thread_tid(kqr->tr_thread); } else { waitinfo->owner = 0; } @@ -2313,20 +2493,21 @@ kdp_workloop_sync_wait_find_owner(__assert_only thread_t thread, } static void -filt_wldetach(__assert_only struct knote *kn) +filt_wldetach(struct knote *kn) { - assert(knote_get_kq(kn)->kq_state & KQ_WORKLOOP); - if (kn->kn_hook) { + if (kn->kn_sfflags & NOTE_WL_SYNC_IPC) { + filt_wldetach_sync_ipc(kn); + } else if (kn->kn_thread) { kevent_register_wait_cleanup(kn); } } static int -filt_wlvalidate_kev_flags(struct knote *kn, struct kevent_internal_s *kev, +filt_wlvalidate_kev_flags(struct knote *kn, struct kevent_qos_s *kev, thread_qos_t *qos_index) { - int new_commands = kev->fflags & NOTE_WL_COMMANDS_MASK; - int sav_commands = kn->kn_sfflags & NOTE_WL_COMMANDS_MASK; + uint32_t new_commands = kev->fflags & NOTE_WL_COMMANDS_MASK; + uint32_t sav_commands = kn->kn_sfflags & NOTE_WL_COMMANDS_MASK; if ((kev->fflags & NOTE_WL_DISCOVER_OWNER) && (kev->flags & EV_DELETE)) { return EINVAL; @@ -2367,6 +2548,15 @@ sync_checks: } break; + case NOTE_WL_SYNC_IPC: + if (sav_commands != NOTE_WL_SYNC_IPC) { + return EINVAL; + } + if ((kev->flags & (EV_ENABLE | EV_DELETE)) == EV_ENABLE) { + return EINVAL; + } + break; + default: return EINVAL; } @@ -2374,48 +2564,54 @@ sync_checks: } static int -filt_wltouch(struct knote *kn, struct kevent_internal_s *kev) +filt_wltouch(struct knote *kn, struct kevent_qos_s *kev) { struct kqworkloop *kqwl = (struct kqworkloop *)knote_get_kq(kn); thread_qos_t qos_index = THREAD_QOS_UNSPECIFIED; + int result = 0; int error = filt_wlvalidate_kev_flags(kn, kev, &qos_index); if (error) { goto out; } - error = filt_wlupdate(kqwl, kn, kev, qos_index, FILT_WLTOUCH); - filt_wlremember_last_update(kn, kev, error); - if (error) { - goto out; + uint32_t command = kev->fflags & NOTE_WL_COMMANDS_MASK; + if (command == NOTE_WL_SYNC_IPC) { + error = filt_wlupdate_sync_ipc(kqwl, kn, kev, FILT_WLTOUCH); + } else { + error = filt_wlupdate(kqwl, kn, kev, qos_index, FILT_WLTOUCH); + filt_wlremember_last_update(kn, kev, error); + } + if (error == EPREEMPTDISABLED) { + error = 0; + result = FILTER_THREADREQ_NODEFEER; } out: if (error) { if (error == ESTALE && (kev->fflags & NOTE_WL_IGNORE_ESTALE)) { /* If userland wants ESTALE to be hidden, do not activate */ - return 0; + return result; } kev->flags |= EV_ERROR; kev->data = error; - return 0; + return result; } - int command = kev->fflags & NOTE_WL_COMMANDS_MASK; if (command == NOTE_WL_SYNC_WAIT && !(kn->kn_sfflags & NOTE_WL_SYNC_WAKE)) { - return kevent_register_wait_prepare(kn, kev); + return kevent_register_wait_prepare(kn, kev, result); } /* Just touching the thread request successfully will fire it */ if (command == NOTE_WL_THREAD_REQUEST) { if (kev->fflags & NOTE_WL_UPDATE_QOS) { - return FILTER_ACTIVE | FILTER_UPDATE_REQ_QOS; + result |= FILTER_UPDATE_REQ_QOS; } - return FILTER_ACTIVE; + result |= FILTER_ACTIVE; } - return 0; + return result; } static bool -filt_wlallow_drop(struct knote *kn, struct kevent_internal_s *kev) +filt_wlallow_drop(struct knote *kn, struct kevent_qos_s *kev) { struct kqworkloop *kqwl = (struct kqworkloop *)knote_get_kq(kn); @@ -2424,11 +2620,14 @@ filt_wlallow_drop(struct knote *kn, struct kevent_internal_s *kev) goto out; } - error = filt_wlupdate(kqwl, kn, kev, 0, FILT_WLDROP); - filt_wlremember_last_update(kn, kev, error); - if (error) { - goto out; + uint32_t command = (kev->fflags & NOTE_WL_COMMANDS_MASK); + if (command == NOTE_WL_SYNC_IPC) { + error = filt_wlupdate_sync_ipc(kqwl, kn, kev, FILT_WLDROP); + } else { + error = filt_wlupdate(kqwl, kn, kev, 0, FILT_WLDROP); + filt_wlremember_last_update(kn, kev, error); } + assert(error != EPREEMPTDISABLED); out: if (error) { @@ -2443,17 +2642,14 @@ out: } static int -filt_wlprocess( - struct knote *kn, - __unused struct filt_process_s *data, - struct kevent_internal_s *kev) +filt_wlprocess(struct knote *kn, struct kevent_qos_s *kev) { struct kqworkloop *kqwl = (struct kqworkloop *)knote_get_kq(kn); int rc = 0; assert(kn->kn_sfflags & NOTE_WL_THREAD_REQUEST); - filt_wllock(kqwl); + kqlock(kqwl); if (kqwl->kqwl_owner) { /* @@ -2464,9 +2660,7 @@ filt_wlprocess( * When that happens, the automatic deactivation due to process * would swallow the event, so we have to activate the knote again. */ - kqlock(kqwl); - knote_activate(kn); - kqunlock(kqwl); + knote_activate(kqwl, kn, FILTER_ACTIVE); } else { #if DEBUG || DEVELOPMENT if (kevent_debug_flags() & KEVENT_PANIC_ON_NON_ENQUEUED_PROCESS) { @@ -2478,7 +2672,7 @@ filt_wlprocess( task_t t = current_task(); uint64_t val; if (addr && task_is_active(t) && !task_is_halting(t) && - copyin_word(addr, &val, sizeof(val)) == 0 && + copyin_atomic64(addr, &val) == 0 && val && (val & DISPATCH_QUEUE_ENQUEUED) == 0 && (val >> 48) != 0xdead && (val >> 48) != 0 && (val >> 48) != 0xffff) { panic("kevent: workloop %#016llx is not enqueued " @@ -2487,14 +2681,12 @@ filt_wlprocess( } } #endif - *kev = kn->kn_kevent; + knote_fill_kevent(kn, kev, 0); kev->fflags = kn->kn_sfflags; - kev->data = kn->kn_sdata; - kev->qos = kn->kn_qos; rc |= FILTER_ACTIVE; } - filt_wlunlock(kqwl); + kqunlock(kqwl); if (rc & FILTER_ACTIVE) { workq_thread_set_max_qos(kqwl->kqwl_p, &kqwl->kqwl_request); @@ -2506,410 +2698,292 @@ SECURITY_READ_ONLY_EARLY(static struct filterops) workloop_filtops = { .f_extended_codes = true, .f_attach = filt_wlattach, .f_detach = filt_wldetach, - .f_event = filt_badevent, + .f_event = filt_bad_event, .f_touch = filt_wltouch, .f_process = filt_wlprocess, .f_allow_drop = filt_wlallow_drop, .f_post_register_wait = filt_wlpost_register_wait, }; -#pragma mark kevent / knotes +#pragma mark - kqueues allocation and deallocation -/* - * JMM - placeholder for not-yet-implemented filters +/*! + * @enum kqworkloop_dealloc_flags_t + * + * @brief + * Flags that alter kqworkloop_dealloc() behavior. + * + * @const KQWL_DEALLOC_NONE + * Convenient name for "no flags". + * + * @const KQWL_DEALLOC_SKIP_HASH_REMOVE + * Do not remove the workloop fromt he hash table. + * This is used for process tear-down codepaths as the workloops have been + * removed by the caller already. */ -static int -filt_badevent(struct knote *kn, long hint) +OS_OPTIONS(kqworkloop_dealloc_flags, unsigned, + KQWL_DEALLOC_NONE = 0x0000, + KQWL_DEALLOC_SKIP_HASH_REMOVE = 0x0001, + ); + +static void +kqworkloop_dealloc(struct kqworkloop *, kqworkloop_dealloc_flags_t, uint32_t); + +OS_NOINLINE OS_COLD OS_NORETURN +static void +kqworkloop_retain_panic(struct kqworkloop *kqwl, uint32_t previous) { - panic("%s[%d](%p, %ld)", __func__, kn->kn_filter, kn, hint); - return 0; + if (previous == 0) { + panic("kq(%p) resurrection", kqwl); + } else { + panic("kq(%p) retain overflow", kqwl); + } } -static int -filt_badattach(__unused struct knote *kn, __unused struct kevent_internal_s *kev) +OS_NOINLINE OS_COLD OS_NORETURN +static void +kqworkloop_release_panic(struct kqworkloop *kqwl) { - knote_set_error(kn, ENOTSUP); - return 0; + panic("kq(%p) over-release", kqwl); } -struct kqueue * -kqueue_alloc(struct proc *p, unsigned int flags) +OS_ALWAYS_INLINE +static inline bool +kqworkloop_try_retain(struct kqworkloop *kqwl) { - struct filedesc *fdp = p->p_fd; - struct kqueue *kq = NULL; - int policy; - void *hook = NULL; - - if (flags & KEVENT_FLAG_WORKQ) { - struct kqworkq *kqwq; - int i; - - kqwq = (struct kqworkq *)zalloc(kqworkq_zone); - if (kqwq == NULL) { - return NULL; - } - - kq = &kqwq->kqwq_kqueue; - bzero(kqwq, sizeof(struct kqworkq)); - - kqwq->kqwq_state = KQ_WORKQ; - - for (i = 0; i < KQWQ_NBUCKETS; i++) { - TAILQ_INIT(&kqwq->kqwq_queue[i]); - } - for (i = 0; i < KQWQ_NBUCKETS; i++) { - if (i != KQWQ_QOS_MANAGER) { - /* - * Because of how the bucketized system works, we mix overcommit - * sources with not overcommit: each time we move a knote from - * one bucket to the next due to overrides, we'd had to track - * overcommitness, and it's really not worth it in the workloop - * enabled world that track this faithfully. - * - * Incidentally, this behaves like the original manager-based - * kqwq where event delivery always happened (hence is - * "overcommit") - */ - kqwq->kqwq_request[i].kqr_state |= KQR_THOVERCOMMIT; - } - kqwq->kqwq_request[i].kqr_qos_index = i; - TAILQ_INIT(&kqwq->kqwq_request[i].kqr_suppressed); + uint32_t old_ref, new_ref; + os_atomic_rmw_loop(&kqwl->kqwl_retains, old_ref, new_ref, relaxed, { + if (__improbable(old_ref == 0)) { + os_atomic_rmw_loop_give_up(return false); } - - policy = SYNC_POLICY_FIFO; - hook = (void *)kqwq; - } else if (flags & KEVENT_FLAG_WORKLOOP) { - struct kqworkloop *kqwl; - int i; - - kqwl = (struct kqworkloop *)zalloc(kqworkloop_zone); - if (kqwl == NULL) { - return NULL; + if (__improbable(old_ref >= KQ_WORKLOOP_RETAINS_MAX)) { + kqworkloop_retain_panic(kqwl, old_ref); } + new_ref = old_ref + 1; + }); + return true; +} - bzero(kqwl, sizeof(struct kqworkloop)); - - kqwl->kqwl_state = KQ_WORKLOOP | KQ_DYNAMIC; - kqwl->kqwl_retains = 1; /* donate a retain to creator */ - kqwl->kqwl_request.kqr_state = KQR_WORKLOOP; +OS_ALWAYS_INLINE +static inline void +kqworkloop_retain(struct kqworkloop *kqwl) +{ + uint32_t previous = os_atomic_inc_orig(&kqwl->kqwl_retains, relaxed); + if (__improbable(previous == 0 || previous >= KQ_WORKLOOP_RETAINS_MAX)) { + kqworkloop_retain_panic(kqwl, previous); + } +} - kq = &kqwl->kqwl_kqueue; - for (i = 0; i < KQWL_NBUCKETS; i++) { - TAILQ_INIT(&kqwl->kqwl_queue[i]); - } - TAILQ_INIT(&kqwl->kqwl_request.kqr_suppressed); +OS_ALWAYS_INLINE +static inline void +kqueue_retain(kqueue_t kqu) +{ + if (kqu.kq->kq_state & KQ_DYNAMIC) { + kqworkloop_retain(kqu.kqwl); + } +} - lck_mtx_init(&kqwl->kqwl_statelock, kq_lck_grp, kq_lck_attr); +OS_ALWAYS_INLINE +static inline void +kqworkloop_release_live(struct kqworkloop *kqwl) +{ + uint32_t refs = os_atomic_dec_orig(&kqwl->kqwl_retains, relaxed); + if (__improbable(refs <= 1)) { + kqworkloop_release_panic(kqwl); + } +} - policy = SYNC_POLICY_FIFO; - hook = (void *)kqwl; - } else { - struct kqfile *kqf; +OS_ALWAYS_INLINE +static inline void +kqueue_release_live(kqueue_t kqu) +{ + if (kqu.kq->kq_state & KQ_DYNAMIC) { + kqworkloop_release_live(kqu.kqwl); + } +} - kqf = (struct kqfile *)zalloc(kqfile_zone); - if (kqf == NULL) { - return NULL; - } +OS_ALWAYS_INLINE +static inline void +kqworkloop_release(struct kqworkloop *kqwl) +{ + uint32_t refs = os_atomic_dec_orig(&kqwl->kqwl_retains, relaxed); - kq = &kqf->kqf_kqueue; - bzero(kqf, sizeof(struct kqfile)); - TAILQ_INIT(&kqf->kqf_queue); - TAILQ_INIT(&kqf->kqf_suppressed); + if (__improbable(refs <= 1)) { + kqworkloop_dealloc(kqwl, KQWL_DEALLOC_NONE, refs - 1); + } +} - policy = SYNC_POLICY_FIFO | SYNC_POLICY_PREPOST; +OS_ALWAYS_INLINE +static inline void +kqueue_release(kqueue_t kqu) +{ + if (kqu.kq->kq_state & KQ_DYNAMIC) { + kqworkloop_release(kqu.kqwl); } +} - waitq_set_init(&kq->kq_wqs, policy, NULL, hook); - lck_spin_init(&kq->kq_lock, kq_lck_grp, kq_lck_attr); - lck_spin_init(&kq->kq_reqlock, kq_lck_grp, kq_lck_attr); - kq->kq_p = p; +/*! + * @function kqueue_destroy + * + * @brief + * Common part to all kqueue dealloc functions. + */ +OS_NOINLINE +static void +kqueue_destroy(kqueue_t kqu, zone_t zone) +{ + /* + * waitq_set_deinit() remove the KQ's waitq set from + * any select sets to which it may belong. + * + * The order of these deinits matter: before waitq_set_deinit() returns, + * waitq_set__CALLING_PREPOST_HOOK__ may be called and it will take the + * kq_lock. + */ + waitq_set_deinit(&kqu.kq->kq_wqs); + lck_spin_destroy(&kqu.kq->kq_lock, kq_lck_grp); - if (fdp->fd_knlistsize < 0) { - proc_fdlock(p); - if (fdp->fd_knlistsize < 0) { - fdp->fd_knlistsize = 0; /* this process has had a kq */ - } - proc_fdunlock(p); - } + zfree(zone, kqu.kq); +} - return kq; +/*! + * @function kqueue_init + * + * @brief + * Common part to all kqueue alloc functions. + */ +static kqueue_t +kqueue_init(kqueue_t kqu, waitq_set_prepost_hook_t *hook, int policy) +{ + waitq_set_init(&kqu.kq->kq_wqs, policy, NULL, hook); + lck_spin_init(&kqu.kq->kq_lock, kq_lck_grp, kq_lck_attr); + return kqu; } -/* - * knotes_dealloc - detach all knotes for the process and drop them +#pragma mark kqfile allocation and deallocation + +/*! + * @function kqueue_dealloc * - * Called with proc_fdlock held. - * Returns with it locked. - * May drop it temporarily. - * Process is in such a state that it will not try to allocate - * any more knotes during this process (stopped for exit or exec). + * @brief + * Detach all knotes from a kqfile and free it. + * + * @discussion + * We walk each list looking for knotes referencing this + * this kqueue. If we find one, we try to drop it. But + * if we fail to get a drop reference, that will wait + * until it is dropped. So, we can just restart again + * safe in the assumption that the list will eventually + * not contain any more references to this kqueue (either + * we dropped them all, or someone else did). + * + * Assumes no new events are being added to the kqueue. + * Nothing locked on entry or exit. */ void -knotes_dealloc(proc_t p) +kqueue_dealloc(struct kqueue *kq) { + KNOTE_LOCK_CTX(knlc); + struct proc *p = kq->kq_p; struct filedesc *fdp = p->p_fd; - struct kqueue *kq; struct knote *kn; - struct klist *kn_hash = NULL; - int i; - /* Close all the fd-indexed knotes up front */ - if (fdp->fd_knlistsize > 0) { - for (i = 0; i < fdp->fd_knlistsize; i++) { - while ((kn = SLIST_FIRST(&fdp->fd_knlist[i])) != NULL) { - kq = knote_get_kq(kn); + assert(kq && (kq->kq_state & (KQ_WORKLOOP | KQ_WORKQ)) == 0); + + proc_fdlock(p); + for (int i = 0; i < fdp->fd_knlistsize; i++) { + kn = SLIST_FIRST(&fdp->fd_knlist[i]); + while (kn != NULL) { + if (kq == knote_get_kq(kn)) { kqlock(kq); proc_fdunlock(p); - knote_drop(kq, kn, NULL); + if (knote_lock(kq, kn, &knlc, KNOTE_KQ_LOCK_ON_SUCCESS)) { + knote_drop(kq, kn, &knlc); + } proc_fdlock(p); + /* start over at beginning of list */ + kn = SLIST_FIRST(&fdp->fd_knlist[i]); + continue; } + kn = SLIST_NEXT(kn, kn_link); } - /* free the table */ - FREE(fdp->fd_knlist, M_KQUEUE); - fdp->fd_knlist = NULL; } - fdp->fd_knlistsize = -1; - knhash_lock(p); + knhash_lock(fdp); proc_fdunlock(p); - /* Clean out all the hashed knotes as well */ if (fdp->fd_knhashmask != 0) { - for (i = 0; i <= (int)fdp->fd_knhashmask; i++) { - while ((kn = SLIST_FIRST(&fdp->fd_knhash[i])) != NULL) { - kq = knote_get_kq(kn); - kqlock(kq); - knhash_unlock(p); - knote_drop(kq, kn, NULL); - knhash_lock(p); + for (int i = 0; i < (int)fdp->fd_knhashmask + 1; i++) { + kn = SLIST_FIRST(&fdp->fd_knhash[i]); + while (kn != NULL) { + if (kq == knote_get_kq(kn)) { + kqlock(kq); + knhash_unlock(fdp); + if (knote_lock(kq, kn, &knlc, KNOTE_KQ_LOCK_ON_SUCCESS)) { + knote_drop(kq, kn, &knlc); + } + knhash_lock(fdp); + /* start over at beginning of list */ + kn = SLIST_FIRST(&fdp->fd_knhash[i]); + continue; + } + kn = SLIST_NEXT(kn, kn_link); } } - kn_hash = fdp->fd_knhash; - fdp->fd_knhashmask = 0; - fdp->fd_knhash = NULL; - } - - knhash_unlock(p); - - /* free the kn_hash table */ - if (kn_hash) { - FREE(kn_hash, M_KQUEUE); } + knhash_unlock(fdp); - proc_fdlock(p); + kqueue_destroy(kq, kqfile_zone); } -/* - * kqworkloop_invalidate - * - * Invalidate ownership of a workloop. - * - * This is meant to be used so that any remnant of overrides and ownership - * information is dropped before a kqworkloop can no longer be found in the - * global hash table and have ghost workloop ownership left over. +/*! + * @function kqueue_alloc * - * Possibly returns a thread to deallocate in a safe context. + * @brief + * Allocate a kqfile. */ -static thread_t -kqworkloop_invalidate(struct kqworkloop *kqwl) +struct kqueue * +kqueue_alloc(struct proc *p) { - thread_t cur_owner = kqwl->kqwl_owner; + struct kqfile *kqf; - assert(TAILQ_EMPTY(&kqwl->kqwl_request.kqr_suppressed)); - if (cur_owner) { - /* - * If the kqueue had an owner that prevented the thread request to - * go through, then no unbind happened, and we may have lingering - * overrides to drop. - */ - if (kqworkloop_owner_override(kqwl) != THREAD_QOS_UNSPECIFIED) { - thread_drop_ipc_override(cur_owner); - } - thread_ends_owning_workloop(cur_owner); - kqwl->kqwl_owner = THREAD_NULL; + kqf = (struct kqfile *)zalloc(kqfile_zone); + if (__improbable(kqf == NULL)) { + return NULL; } + bzero(kqf, sizeof(struct kqfile)); + + /* + * kqfiles are created with kqueue() so we need to wait for + * the first kevent syscall to know which bit among + * KQ_KEV_{32,64,QOS} will be set in kqf_state + */ + kqf->kqf_p = p; + TAILQ_INIT_AFTER_BZERO(&kqf->kqf_queue); + TAILQ_INIT_AFTER_BZERO(&kqf->kqf_suppressed); - return cur_owner; + return kqueue_init(kqf, NULL, SYNC_POLICY_FIFO | SYNC_POLICY_PREPOST).kq; } -/* - * kqueue_dealloc - detach all knotes from a kqueue and free it - * - * We walk each list looking for knotes referencing this - * this kqueue. If we find one, we try to drop it. But - * if we fail to get a drop reference, that will wait - * until it is dropped. So, we can just restart again - * safe in the assumption that the list will eventually - * not contain any more references to this kqueue (either - * we dropped them all, or someone else did). - * - * Assumes no new events are being added to the kqueue. - * Nothing locked on entry or exit. - * - * Workloop kqueues cant get here unless all the knotes - * are already gone and all requested threads have come - * and gone (cancelled or arrived). +/*! + * @function kqueue_internal + * + * @brief + * Core implementation for kqueue and guarded_kqueue_np() */ -void -kqueue_dealloc(struct kqueue *kq) +int +kqueue_internal(struct proc *p, fp_allocfn_t fp_zalloc, void *cra, int32_t *retval) { - struct proc *p; - struct filedesc *fdp; - struct knote *kn; - int i; + struct kqueue *kq; + struct fileproc *fp; + int fd, error; - if (kq == NULL) { - return; - } - - p = kq->kq_p; - fdp = p->p_fd; - - /* - * Workloops are refcounted by their knotes, so there's no point - * spending a lot of time under these locks just to deallocate one. - */ - if ((kq->kq_state & KQ_WORKLOOP) == 0) { - KNOTE_LOCK_CTX(knlc); - - proc_fdlock(p); - for (i = 0; i < fdp->fd_knlistsize; i++) { - kn = SLIST_FIRST(&fdp->fd_knlist[i]); - while (kn != NULL) { - if (kq == knote_get_kq(kn)) { - kqlock(kq); - proc_fdunlock(p); - if (knote_lock(kq, kn, &knlc, KNOTE_KQ_LOCK_ON_SUCCESS)) { - knote_drop(kq, kn, &knlc); - } - proc_fdlock(p); - /* start over at beginning of list */ - kn = SLIST_FIRST(&fdp->fd_knlist[i]); - continue; - } - kn = SLIST_NEXT(kn, kn_link); - } - } - - knhash_lock(p); - proc_fdunlock(p); - - if (fdp->fd_knhashmask != 0) { - for (i = 0; i < (int)fdp->fd_knhashmask + 1; i++) { - kn = SLIST_FIRST(&fdp->fd_knhash[i]); - while (kn != NULL) { - if (kq == knote_get_kq(kn)) { - kqlock(kq); - knhash_unlock(p); - if (knote_lock(kq, kn, &knlc, KNOTE_KQ_LOCK_ON_SUCCESS)) { - knote_drop(kq, kn, &knlc); - } - knhash_lock(p); - /* start over at beginning of list */ - kn = SLIST_FIRST(&fdp->fd_knhash[i]); - continue; - } - kn = SLIST_NEXT(kn, kn_link); - } - } - } - knhash_unlock(p); - } - - if (kq->kq_state & KQ_WORKLOOP) { - struct kqworkloop *kqwl = (struct kqworkloop *)kq; - thread_t cur_owner = kqworkloop_invalidate(kqwl); - - if (cur_owner) { - thread_deallocate(cur_owner); - } - - if (kqwl->kqwl_request.kqr_state & KQR_ALLOCATED_TURNSTILE) { - struct turnstile *ts; - turnstile_complete((uintptr_t)kqwl, &kqwl->kqwl_turnstile, &ts); - turnstile_cleanup(); - turnstile_deallocate(ts); - } else { - assert(kqwl->kqwl_turnstile == NULL); - } - } - - /* - * waitq_set_deinit() remove the KQ's waitq set from - * any select sets to which it may belong. - */ - waitq_set_deinit(&kq->kq_wqs); - lck_spin_destroy(&kq->kq_lock, kq_lck_grp); - lck_spin_destroy(&kq->kq_reqlock, kq_lck_grp); - - if (kq->kq_state & KQ_WORKQ) { - zfree(kqworkq_zone, kq); - } else if (kq->kq_state & KQ_WORKLOOP) { - struct kqworkloop *kqwl = (struct kqworkloop *)kq; - - assert(kqwl->kqwl_retains == 0); - lck_mtx_destroy(&kqwl->kqwl_statelock, kq_lck_grp); - zfree(kqworkloop_zone, kqwl); - } else { - zfree(kqfile_zone, kq); - } -} - -static inline void -kqueue_retain(struct kqueue *kq) -{ - struct kqworkloop *kqwl = (struct kqworkloop *)kq; - uint32_t previous; - - if ((kq->kq_state & KQ_DYNAMIC) == 0) { - return; - } - - previous = OSIncrementAtomic(&kqwl->kqwl_retains); - if (previous == KQ_WORKLOOP_RETAINS_MAX) { - panic("kq(%p) retain overflow", kq); - } - - if (previous == 0) { - panic("kq(%p) resurrection", kq); - } -} - -#define KQUEUE_CANT_BE_LAST_REF 0 -#define KQUEUE_MIGHT_BE_LAST_REF 1 - -static inline int -kqueue_release(kqueue_t kqu, __assert_only int possibly_last) -{ - if ((kqu.kq->kq_state & KQ_DYNAMIC) == 0) { - return 0; - } - - assert(kqu.kq->kq_state & KQ_WORKLOOP); /* for now */ - uint32_t refs = OSDecrementAtomic(&kqu.kqwl->kqwl_retains); - if (__improbable(refs == 0)) { - panic("kq(%p) over-release", kqu.kq); - } - if (refs == 1) { - assert(possibly_last); - } - return refs == 1; -} - -int -kqueue_body(struct proc *p, fp_allocfn_t fp_zalloc, void *cra, int32_t *retval) -{ - struct kqueue *kq; - struct fileproc *fp; - int fd, error; - - error = falloc_withalloc(p, - &fp, &fd, vfs_context_current(), fp_zalloc, cra); + error = falloc_withalloc(p, &fp, &fd, vfs_context_current(), fp_zalloc, cra); if (error) { return error; } - kq = kqueue_alloc(p, 0); + kq = kqueue_alloc(p); if (kq == NULL) { fp_free(p, fd, fp); return ENOMEM; @@ -2930,631 +3004,489 @@ kqueue_body(struct proc *p, fp_allocfn_t fp_zalloc, void *cra, int32_t *retval) return error; } +/*! + * @function kqueue + * + * @brief + * The kqueue syscall. + */ int kqueue(struct proc *p, __unused struct kqueue_args *uap, int32_t *retval) { - return kqueue_body(p, fileproc_alloc_init, NULL, retval); + return kqueue_internal(p, fileproc_alloc_init, NULL, retval); } -static int -kevent_copyin(user_addr_t *addrp, struct kevent_internal_s *kevp, struct proc *p, - unsigned int flags) -{ - int advance; - int error; - - if (flags & KEVENT_FLAG_LEGACY32) { - bzero(kevp, sizeof(*kevp)); +#pragma mark kqworkq allocation and deallocation - if (IS_64BIT_PROCESS(p)) { - struct user64_kevent kev64; +/*! + * @function kqworkq_dealloc + * + * @brief + * Deallocates a workqueue kqueue. + * + * @discussion + * This only happens at process death, or for races with concurrent + * kevent_get_kqwq calls, hence we don't have to care about knotes referencing + * this kqueue, either there are none, or someone else took care of them. + */ +void +kqworkq_dealloc(struct kqworkq *kqwq) +{ + kqueue_destroy(kqwq, kqworkq_zone); +} - advance = sizeof(kev64); - error = copyin(*addrp, (caddr_t)&kev64, advance); - if (error) { - return error; - } - kevp->ident = kev64.ident; - kevp->filter = kev64.filter; - kevp->flags = kev64.flags; - kevp->udata = kev64.udata; - kevp->fflags = kev64.fflags; - kevp->data = kev64.data; - } else { - struct user32_kevent kev32; +/*! + * @function kqworkq_alloc + * + * @brief + * Allocates a workqueue kqueue. + * + * @discussion + * This is the slow path of kevent_get_kqwq. + * This takes care of making sure procs have a single workq kqueue. + */ +OS_NOINLINE +static struct kqworkq * +kqworkq_alloc(struct proc *p, unsigned int flags) +{ + struct kqworkq *kqwq, *tmp; - advance = sizeof(kev32); - error = copyin(*addrp, (caddr_t)&kev32, advance); - if (error) { - return error; - } - kevp->ident = (uintptr_t)kev32.ident; - kevp->filter = kev32.filter; - kevp->flags = kev32.flags; - kevp->udata = CAST_USER_ADDR_T(kev32.udata); - kevp->fflags = kev32.fflags; - kevp->data = (intptr_t)kev32.data; - } - } else if (flags & KEVENT_FLAG_LEGACY64) { - struct kevent64_s kev64; + kqwq = (struct kqworkq *)zalloc(kqworkq_zone); + if (__improbable(kqwq == NULL)) { + return NULL; + } + bzero(kqwq, sizeof(struct kqworkq)); - bzero(kevp, sizeof(*kevp)); + assert((flags & KEVENT_FLAG_LEGACY32) == 0); + if (flags & KEVENT_FLAG_LEGACY64) { + kqwq->kqwq_state = KQ_WORKQ | KQ_KEV64; + } else { + kqwq->kqwq_state = KQ_WORKQ | KQ_KEV_QOS; + } + kqwq->kqwq_p = p; - advance = sizeof(struct kevent64_s); - error = copyin(*addrp, (caddr_t)&kev64, advance); - if (error) { - return error; + for (int i = 0; i < KQWQ_NBUCKETS; i++) { + TAILQ_INIT_AFTER_BZERO(&kqwq->kqwq_queue[i]); + TAILQ_INIT_AFTER_BZERO(&kqwq->kqwq_suppressed[i]); + } + for (int i = 0; i < KQWQ_NBUCKETS; i++) { + /* + * Because of how the bucketized system works, we mix overcommit + * sources with not overcommit: each time we move a knote from + * one bucket to the next due to overrides, we'd had to track + * overcommitness, and it's really not worth it in the workloop + * enabled world that track this faithfully. + * + * Incidentally, this behaves like the original manager-based + * kqwq where event delivery always happened (hence is + * "overcommit") + */ + kqwq->kqwq_request[i].tr_state = WORKQ_TR_STATE_IDLE; + kqwq->kqwq_request[i].tr_flags = WORKQ_TR_FLAG_KEVENT; + if (i != KQWQ_QOS_MANAGER) { + kqwq->kqwq_request[i].tr_flags |= WORKQ_TR_FLAG_OVERCOMMIT; } - kevp->ident = kev64.ident; - kevp->filter = kev64.filter; - kevp->flags = kev64.flags; - kevp->udata = kev64.udata; - kevp->fflags = kev64.fflags; - kevp->data = kev64.data; - kevp->ext[0] = kev64.ext[0]; - kevp->ext[1] = kev64.ext[1]; - } else { - struct kevent_qos_s kevqos; + kqwq->kqwq_request[i].tr_kq_qos_index = i; + } - bzero(kevp, sizeof(*kevp)); + kqueue_init(kqwq, &kqwq->kqwq_waitq_hook, SYNC_POLICY_FIFO); - advance = sizeof(struct kevent_qos_s); - error = copyin(*addrp, (caddr_t)&kevqos, advance); - if (error) { - return error; - } - kevp->ident = kevqos.ident; - kevp->filter = kevqos.filter; - kevp->flags = kevqos.flags; - kevp->qos = kevqos.qos; -// kevp->xflags = kevqos.xflags; - kevp->udata = kevqos.udata; - kevp->fflags = kevqos.fflags; - kevp->data = kevqos.data; - kevp->ext[0] = kevqos.ext[0]; - kevp->ext[1] = kevqos.ext[1]; - kevp->ext[2] = kevqos.ext[2]; - kevp->ext[3] = kevqos.ext[3]; - } - if (!error) { - *addrp += advance; + if (!os_atomic_cmpxchgv(&p->p_fd->fd_wqkqueue, NULL, kqwq, &tmp, release)) { + kqworkq_dealloc(kqwq); + return tmp; } - return error; -} -static int -kevent_copyout(struct kevent_internal_s *kevp, user_addr_t *addrp, struct proc *p, - unsigned int flags) -{ - user_addr_t addr = *addrp; - int advance; - int error; + return kqwq; +} - /* - * fully initialize the differnt output event structure - * types from the internal kevent (and some universal - * defaults for fields not represented in the internal - * form). - */ - if (flags & KEVENT_FLAG_LEGACY32) { - assert((flags & KEVENT_FLAG_STACK_EVENTS) == 0); +#pragma mark kqworkloop allocation and deallocation - if (IS_64BIT_PROCESS(p)) { - struct user64_kevent kev64; +#define KQ_HASH(val, mask) (((val) ^ (val >> 8)) & (mask)) +#define CONFIG_KQ_HASHSIZE CONFIG_KN_HASHSIZE - advance = sizeof(kev64); - bzero(&kev64, advance); +OS_ALWAYS_INLINE +static inline void +kqhash_lock(struct filedesc *fdp) +{ + lck_mtx_lock_spin_always(&fdp->fd_kqhashlock); +} - /* - * deal with the special case of a user-supplied - * value of (uintptr_t)-1. - */ - kev64.ident = (kevp->ident == (uintptr_t)-1) ? - (uint64_t)-1LL : (uint64_t)kevp->ident; - - kev64.filter = kevp->filter; - kev64.flags = kevp->flags; - kev64.fflags = kevp->fflags; - kev64.data = (int64_t) kevp->data; - kev64.udata = kevp->udata; - error = copyout((caddr_t)&kev64, addr, advance); - } else { - struct user32_kevent kev32; - - advance = sizeof(kev32); - bzero(&kev32, advance); - kev32.ident = (uint32_t)kevp->ident; - kev32.filter = kevp->filter; - kev32.flags = kevp->flags; - kev32.fflags = kevp->fflags; - kev32.data = (int32_t)kevp->data; - kev32.udata = kevp->udata; - error = copyout((caddr_t)&kev32, addr, advance); - } - } else if (flags & KEVENT_FLAG_LEGACY64) { - struct kevent64_s kev64; +OS_ALWAYS_INLINE +static inline void +kqhash_unlock(struct filedesc *fdp) +{ + lck_mtx_unlock(&fdp->fd_kqhashlock); +} - advance = sizeof(struct kevent64_s); - if (flags & KEVENT_FLAG_STACK_EVENTS) { - addr -= advance; - } - bzero(&kev64, advance); - kev64.ident = kevp->ident; - kev64.filter = kevp->filter; - kev64.flags = kevp->flags; - kev64.fflags = kevp->fflags; - kev64.data = (int64_t) kevp->data; - kev64.udata = kevp->udata; - kev64.ext[0] = kevp->ext[0]; - kev64.ext[1] = kevp->ext[1]; - error = copyout((caddr_t)&kev64, addr, advance); - } else { - struct kevent_qos_s kevqos; - - advance = sizeof(struct kevent_qos_s); - if (flags & KEVENT_FLAG_STACK_EVENTS) { - addr -= advance; - } - bzero(&kevqos, advance); - kevqos.ident = kevp->ident; - kevqos.filter = kevp->filter; - kevqos.flags = kevp->flags; - kevqos.qos = kevp->qos; - kevqos.udata = kevp->udata; - kevqos.fflags = kevp->fflags; - kevqos.xflags = 0; - kevqos.data = (int64_t) kevp->data; - kevqos.ext[0] = kevp->ext[0]; - kevqos.ext[1] = kevp->ext[1]; - kevqos.ext[2] = kevp->ext[2]; - kevqos.ext[3] = kevp->ext[3]; - error = copyout((caddr_t)&kevqos, addr, advance); - } - if (!error) { - if (flags & KEVENT_FLAG_STACK_EVENTS) { - *addrp = addr; - } else { - *addrp = addr + advance; - } - } - return error; +OS_ALWAYS_INLINE +static inline void +kqworkloop_hash_insert_locked(struct filedesc *fdp, kqueue_id_t id, + struct kqworkloop *kqwl) +{ + struct kqwllist *list = &fdp->fd_kqhash[KQ_HASH(id, fdp->fd_kqhashmask)]; + LIST_INSERT_HEAD(list, kqwl, kqwl_hashlink); } -static int -kevent_get_data_size( - struct proc *p, - uint64_t data_available, - unsigned int flags, - user_size_t *residp) +OS_ALWAYS_INLINE +static inline struct kqworkloop * +kqworkloop_hash_lookup_locked(struct filedesc *fdp, kqueue_id_t id) { - user_size_t resid; - int error = 0; + struct kqwllist *list = &fdp->fd_kqhash[KQ_HASH(id, fdp->fd_kqhashmask)]; + struct kqworkloop *kqwl; - if (data_available != USER_ADDR_NULL) { - if (flags & KEVENT_FLAG_KERNEL) { - resid = *(user_size_t *)(uintptr_t)data_available; - } else if (IS_64BIT_PROCESS(p)) { - user64_size_t usize; - error = copyin((user_addr_t)data_available, &usize, sizeof(usize)); - resid = (user_size_t)usize; - } else { - user32_size_t usize; - error = copyin((user_addr_t)data_available, &usize, sizeof(usize)); - resid = (user_size_t)usize; - } - if (error) { - return error; + LIST_FOREACH(kqwl, list, kqwl_hashlink) { + if (kqwl->kqwl_dynamicid == id) { + return kqwl; } - } else { - resid = 0; } - *residp = resid; - return 0; + return NULL; } -static int -kevent_put_data_size( - struct proc *p, - uint64_t data_available, - unsigned int flags, - user_size_t resid) +static struct kqworkloop * +kqworkloop_hash_lookup_and_retain(struct filedesc *fdp, kqueue_id_t kq_id) { - int error = 0; + struct kqworkloop *kqwl = NULL; - if (data_available) { - if (flags & KEVENT_FLAG_KERNEL) { - *(user_size_t *)(uintptr_t)data_available = resid; - } else if (IS_64BIT_PROCESS(p)) { - user64_size_t usize = (user64_size_t)resid; - error = copyout(&usize, (user_addr_t)data_available, sizeof(usize)); - } else { - user32_size_t usize = (user32_size_t)resid; - error = copyout(&usize, (user_addr_t)data_available, sizeof(usize)); + kqhash_lock(fdp); + if (__probable(fdp->fd_kqhash)) { + kqwl = kqworkloop_hash_lookup_locked(fdp, kq_id); + if (kqwl && !kqworkloop_try_retain(kqwl)) { + kqwl = NULL; } } - return error; + kqhash_unlock(fdp); + return kqwl; } -/* - * kevent_continue - continue a kevent syscall after blocking - * - * assume we inherit a use count on the kq fileglob. - */ -__attribute__((noreturn)) +OS_NOINLINE static void -kevent_continue(__unused struct kqueue *kq, void *data, int error) +kqworkloop_hash_init(struct filedesc *fdp) { - struct _kevent *cont_args; - struct fileproc *fp; - uint64_t data_available; - user_size_t data_size; - user_size_t data_resid; - unsigned int flags; - int32_t *retval; - int noutputs; - int fd; - struct proc *p = current_proc(); - - cont_args = (struct _kevent *)data; - data_available = cont_args->data_available; - flags = cont_args->process_data.fp_flags; - data_size = cont_args->process_data.fp_data_size; - data_resid = cont_args->process_data.fp_data_resid; - noutputs = cont_args->eventout; - retval = cont_args->retval; - fd = cont_args->fd; - fp = cont_args->fp; - - kevent_put_kq(p, fd, fp, kq); - - /* don't abandon other output just because of residual copyout failures */ - if (error == 0 && data_available && data_resid != data_size) { - (void)kevent_put_data_size(p, data_available, flags, data_resid); - } + struct kqwllist *alloc_hash; + u_long alloc_mask; - /* don't restart after signals... */ - if (error == ERESTART) { - error = EINTR; - } else if (error == EWOULDBLOCK) { - error = 0; - } - if (error == 0) { - *retval = noutputs; + kqhash_unlock(fdp); + alloc_hash = hashinit(CONFIG_KQ_HASHSIZE, M_KQUEUE, &alloc_mask); + kqhash_lock(fdp); + + /* See if we won the race */ + if (__probable(fdp->fd_kqhashmask == 0)) { + fdp->fd_kqhash = alloc_hash; + fdp->fd_kqhashmask = alloc_mask; + } else { + kqhash_unlock(fdp); + FREE(alloc_hash, M_KQUEUE); + kqhash_lock(fdp); } - unix_syscall_return(error); } -/* - * kevent - [syscall] register and wait for kernel events +/*! + * @function kqworkloop_dealloc + * + * @brief + * Deallocates a workloop kqueue. + * + * @discussion + * Knotes hold references on the workloop, so we can't really reach this + * function unless all of these are already gone. + * + * Nothing locked on entry or exit. * + * @param flags + * Unless KQWL_DEALLOC_SKIP_HASH_REMOVE is set, the workloop is removed + * from its hash table. + * + * @param current_ref + * This function is also called to undo a kqworkloop_alloc in case of + * allocation races, expected_ref is the current refcount that is expected + * on the workloop object, usually 0, and 1 when a dealloc race is resolved. */ -int -kevent(struct proc *p, struct kevent_args *uap, int32_t *retval) +static void +kqworkloop_dealloc(struct kqworkloop *kqwl, kqworkloop_dealloc_flags_t flags, + uint32_t current_ref) { - unsigned int flags = KEVENT_FLAG_LEGACY32; + thread_t cur_owner; - return kevent_internal(p, - (kqueue_id_t)uap->fd, NULL, - uap->changelist, uap->nchanges, - uap->eventlist, uap->nevents, - 0ULL, 0ULL, - flags, - uap->timeout, - kevent_continue, - retval); -} + if (__improbable(current_ref > 1)) { + kqworkloop_release_panic(kqwl); + } + assert(kqwl->kqwl_retains == current_ref); -int -kevent64(struct proc *p, struct kevent64_args *uap, int32_t *retval) -{ - unsigned int flags; + /* pair with kqunlock() and other kq locks */ + os_atomic_thread_fence(acquire); - /* restrict to user flags and set legacy64 */ - flags = uap->flags & KEVENT_FLAG_USER; - flags |= KEVENT_FLAG_LEGACY64; + cur_owner = kqwl->kqwl_owner; + if (cur_owner) { + if (kqworkloop_override(kqwl) != THREAD_QOS_UNSPECIFIED) { + thread_drop_kevent_override(cur_owner); + } + thread_deallocate(cur_owner); + kqwl->kqwl_owner = THREAD_NULL; + } - return kevent_internal(p, - (kqueue_id_t)uap->fd, NULL, - uap->changelist, uap->nchanges, - uap->eventlist, uap->nevents, - 0ULL, 0ULL, - flags, - uap->timeout, - kevent_continue, - retval); -} + if (kqwl->kqwl_state & KQ_HAS_TURNSTILE) { + struct turnstile *ts; + turnstile_complete((uintptr_t)kqwl, &kqwl->kqwl_turnstile, + &ts, TURNSTILE_WORKLOOPS); + turnstile_cleanup(); + turnstile_deallocate(ts); + } -int -kevent_qos(struct proc *p, struct kevent_qos_args *uap, int32_t *retval) -{ - /* restrict to user flags */ - uap->flags &= KEVENT_FLAG_USER; - - return kevent_internal(p, - (kqueue_id_t)uap->fd, NULL, - uap->changelist, uap->nchanges, - uap->eventlist, uap->nevents, - uap->data_out, (uint64_t)uap->data_available, - uap->flags, - 0ULL, - kevent_continue, - retval); -} + if ((flags & KQWL_DEALLOC_SKIP_HASH_REMOVE) == 0) { + struct filedesc *fdp = kqwl->kqwl_p->p_fd; -int -kevent_qos_internal(struct proc *p, int fd, - user_addr_t changelist, int nchanges, - user_addr_t eventlist, int nevents, - user_addr_t data_out, user_size_t *data_available, - unsigned int flags, - int32_t *retval) -{ - return kevent_internal(p, - (kqueue_id_t)fd, NULL, - changelist, nchanges, - eventlist, nevents, - data_out, (uint64_t)data_available, - (flags | KEVENT_FLAG_KERNEL), - 0ULL, - NULL, - retval); -} + kqhash_lock(fdp); + LIST_REMOVE(kqwl, kqwl_hashlink); + kqhash_unlock(fdp); + } -int -kevent_id(struct proc *p, struct kevent_id_args *uap, int32_t *retval) -{ - /* restrict to user flags */ - uap->flags &= KEVENT_FLAG_USER; - - return kevent_internal(p, - (kqueue_id_t)uap->id, NULL, - uap->changelist, uap->nchanges, - uap->eventlist, uap->nevents, - uap->data_out, (uint64_t)uap->data_available, - (uap->flags | KEVENT_FLAG_DYNAMIC_KQUEUE), - 0ULL, - kevent_continue, - retval); -} + assert(TAILQ_EMPTY(&kqwl->kqwl_suppressed)); + assert(kqwl->kqwl_owner == THREAD_NULL); + assert(kqwl->kqwl_turnstile == TURNSTILE_NULL); -int -kevent_id_internal(struct proc *p, kqueue_id_t *id, - user_addr_t changelist, int nchanges, - user_addr_t eventlist, int nevents, - user_addr_t data_out, user_size_t *data_available, - unsigned int flags, - int32_t *retval) -{ - return kevent_internal(p, - *id, id, - changelist, nchanges, - eventlist, nevents, - data_out, (uint64_t)data_available, - (flags | KEVENT_FLAG_KERNEL | KEVENT_FLAG_DYNAMIC_KQUEUE), - 0ULL, - NULL, - retval); + lck_spin_destroy(&kqwl->kqwl_statelock, kq_lck_grp); + kqueue_destroy(kqwl, kqworkloop_zone); } -static int -kevent_get_timeout(struct proc *p, - user_addr_t utimeout, - unsigned int flags, - struct timeval *atvp) +/*! + * @function kqworkloop_alloc + * + * @brief + * Allocates a workloop kqueue. + */ +static void +kqworkloop_init(struct kqworkloop *kqwl, proc_t p, + kqueue_id_t id, workq_threadreq_param_t *trp) { - struct timeval atv; - int error = 0; + bzero(kqwl, sizeof(struct kqworkloop)); - if (flags & KEVENT_FLAG_IMMEDIATE) { - getmicrouptime(&atv); - } else if (utimeout != USER_ADDR_NULL) { - struct timeval rtv; - if (flags & KEVENT_FLAG_KERNEL) { - struct timespec *tsp = (struct timespec *)utimeout; - TIMESPEC_TO_TIMEVAL(&rtv, tsp); - } else if (IS_64BIT_PROCESS(p)) { - struct user64_timespec ts; - error = copyin(utimeout, &ts, sizeof(ts)); - if ((ts.tv_sec & 0xFFFFFFFF00000000ull) != 0) { - error = EINVAL; - } else { - TIMESPEC_TO_TIMEVAL(&rtv, &ts); - } - } else { - struct user32_timespec ts; - error = copyin(utimeout, &ts, sizeof(ts)); - TIMESPEC_TO_TIMEVAL(&rtv, &ts); - } - if (error) { - return error; + kqwl->kqwl_state = KQ_WORKLOOP | KQ_DYNAMIC | KQ_KEV_QOS; + kqwl->kqwl_retains = 1; /* donate a retain to creator */ + kqwl->kqwl_dynamicid = id; + kqwl->kqwl_p = p; + if (trp) { + kqwl->kqwl_params = trp->trp_value; + } + + workq_tr_flags_t tr_flags = WORKQ_TR_FLAG_WORKLOOP; + if (trp) { + if (trp->trp_flags & TRP_PRIORITY) { + tr_flags |= WORKQ_TR_FLAG_WL_OUTSIDE_QOS; } - if (itimerfix(&rtv)) { - return EINVAL; + if (trp->trp_flags) { + tr_flags |= WORKQ_TR_FLAG_WL_PARAMS; } - getmicrouptime(&atv); - timevaladd(&atv, &rtv); - } else { - /* wait forever value */ - atv.tv_sec = 0; - atv.tv_usec = 0; } - *atvp = atv; - return 0; + kqwl->kqwl_request.tr_state = WORKQ_TR_STATE_IDLE; + kqwl->kqwl_request.tr_flags = tr_flags; + + for (int i = 0; i < KQWL_NBUCKETS; i++) { + TAILQ_INIT_AFTER_BZERO(&kqwl->kqwl_queue[i]); + } + TAILQ_INIT_AFTER_BZERO(&kqwl->kqwl_suppressed); + + lck_spin_init(&kqwl->kqwl_statelock, kq_lck_grp, kq_lck_attr); + + kqueue_init(kqwl, &kqwl->kqwl_waitq_hook, SYNC_POLICY_FIFO); } +/*! + * @function kqworkloop_get_or_create + * + * @brief + * Wrapper around kqworkloop_alloc that handles the uniquing of workloops. + * + * @returns + * 0: success + * EINVAL: invalid parameters + * EEXIST: KEVENT_FLAG_DYNAMIC_KQ_MUST_NOT_EXIST is set and a collision exists. + * ENOENT: KEVENT_FLAG_DYNAMIC_KQ_MUST_EXIST is set and the entry wasn't found. + * ENOMEM: allocation failed + */ static int -kevent_set_kq_mode(struct kqueue *kq, unsigned int flags) +kqworkloop_get_or_create(struct proc *p, kqueue_id_t id, + workq_threadreq_param_t *trp, unsigned int flags, struct kqworkloop **kqwlp) { - /* each kq should only be used for events of one type */ - kqlock(kq); - if (kq->kq_state & (KQ_KEV32 | KQ_KEV64 | KQ_KEV_QOS)) { - if (flags & KEVENT_FLAG_LEGACY32) { - if ((kq->kq_state & KQ_KEV32) == 0) { - kqunlock(kq); - return EINVAL; - } - } else if (kq->kq_state & KQ_KEV32) { - kqunlock(kq); - return EINVAL; - } - } else if (flags & KEVENT_FLAG_LEGACY32) { - kq->kq_state |= KQ_KEV32; - } else if (flags & KEVENT_FLAG_LEGACY64) { - kq->kq_state |= KQ_KEV64; - } else { - kq->kq_state |= KQ_KEV_QOS; - } - kqunlock(kq); - return 0; -} + struct filedesc *fdp = p->p_fd; + struct kqworkloop *alloc_kqwl = NULL; + struct kqworkloop *kqwl = NULL; + int error = 0; -#define KQ_HASH(val, mask) (((val) ^ (val >> 8)) & (mask)) -#define CONFIG_KQ_HASHSIZE CONFIG_KN_HASHSIZE + assert(!trp || (flags & KEVENT_FLAG_DYNAMIC_KQ_MUST_NOT_EXIST)); -static inline void -kqhash_lock(proc_t p) -{ - lck_mtx_lock_spin_always(&p->p_fd->fd_kqhashlock); -} + if (id == 0 || id == (kqueue_id_t)-1) { + return EINVAL; + } -static inline void -kqhash_lock_held(__assert_only proc_t p) -{ - LCK_MTX_ASSERT(&p->p_fd->fd_kqhashlock, LCK_MTX_ASSERT_OWNED); -} + for (;;) { + kqhash_lock(fdp); + if (__improbable(fdp->fd_kqhash == NULL)) { + kqworkloop_hash_init(fdp); + } -static inline void -kqhash_unlock(proc_t p) -{ - lck_mtx_unlock(&p->p_fd->fd_kqhashlock); -} + kqwl = kqworkloop_hash_lookup_locked(fdp, id); + if (kqwl) { + if (__improbable(flags & KEVENT_FLAG_DYNAMIC_KQ_MUST_NOT_EXIST)) { + /* + * If MUST_NOT_EXIST was passed, even if we would have failed + * the try_retain, it could have gone the other way, and + * userspace can't tell. Let'em fix their race. + */ + error = EEXIST; + break; + } -static void -kqueue_hash_init_if_needed(proc_t p) -{ - struct filedesc *fdp = p->p_fd; + if (__probable(kqworkloop_try_retain(kqwl))) { + /* + * This is a valid live workloop ! + */ + *kqwlp = kqwl; + error = 0; + break; + } + } - kqhash_lock_held(p); + if (__improbable(flags & KEVENT_FLAG_DYNAMIC_KQ_MUST_EXIST)) { + error = ENOENT; + break; + } - if (__improbable(fdp->fd_kqhash == NULL)) { - struct kqlist *alloc_hash; - u_long alloc_mask; + /* + * We didn't find what we were looking for. + * + * If this is the second time we reach this point (alloc_kqwl != NULL), + * then we're done. + * + * If this is the first time we reach this point (alloc_kqwl == NULL), + * then try to allocate one without blocking. + */ + if (__probable(alloc_kqwl == NULL)) { + alloc_kqwl = (struct kqworkloop *)zalloc_noblock(kqworkloop_zone); + } + if (__probable(alloc_kqwl)) { + kqworkloop_init(alloc_kqwl, p, id, trp); + kqworkloop_hash_insert_locked(fdp, id, alloc_kqwl); + kqhash_unlock(fdp); + *kqwlp = alloc_kqwl; + return 0; + } - kqhash_unlock(p); - alloc_hash = hashinit(CONFIG_KQ_HASHSIZE, M_KQUEUE, &alloc_mask); - kqhash_lock(p); + /* + * We have to block to allocate a workloop, drop the lock, + * allocate one, but then we need to retry lookups as someone + * else could race with us. + */ + kqhash_unlock(fdp); - /* See if we won the race */ - if (fdp->fd_kqhashmask == 0) { - fdp->fd_kqhash = alloc_hash; - fdp->fd_kqhashmask = alloc_mask; - } else { - kqhash_unlock(p); - FREE(alloc_hash, M_KQUEUE); - kqhash_lock(p); + alloc_kqwl = (struct kqworkloop *)zalloc(kqworkloop_zone); + if (__improbable(!alloc_kqwl)) { + return ENOMEM; } } -} - -/* - * Called with the kqhash_lock() held - */ -static void -kqueue_hash_insert( - struct proc *p, - kqueue_id_t id, - struct kqueue *kq) -{ - struct kqworkloop *kqwl = (struct kqworkloop *)kq; - struct filedesc *fdp = p->p_fd; - struct kqlist *list; - /* should hold the kq hash lock */ - kqhash_lock_held(p); + kqhash_unlock(fdp); - if ((kq->kq_state & KQ_DYNAMIC) == 0) { - assert(kq->kq_state & KQ_DYNAMIC); - return; + if (__improbable(alloc_kqwl)) { + zfree(kqworkloop_zone, alloc_kqwl); } - /* only dynamically allocate workloop kqs for now */ - assert(kq->kq_state & KQ_WORKLOOP); - assert(fdp->fd_kqhash); + return error; +} - kqwl->kqwl_dynamicid = id; +#pragma mark - knotes - list = &fdp->fd_kqhash[KQ_HASH(id, fdp->fd_kqhashmask)]; - SLIST_INSERT_HEAD(list, kqwl, kqwl_hashlink); +static int +filt_no_attach(struct knote *kn, __unused struct kevent_qos_s *kev) +{ + knote_set_error(kn, ENOTSUP); + return 0; } -/* Called with kqhash_lock held */ static void -kqueue_hash_remove( - struct proc *p, - struct kqueue *kq) +filt_no_detach(__unused struct knote *kn) { - struct kqworkloop *kqwl = (struct kqworkloop *)kq; - struct filedesc *fdp = p->p_fd; - struct kqlist *list; +} - /* should hold the kq hash lock */ - kqhash_lock_held(p); +static int __dead2 +filt_bad_event(struct knote *kn, long hint) +{ + panic("%s[%d](%p, %ld)", __func__, kn->kn_filter, kn, hint); +} - if ((kq->kq_state & KQ_DYNAMIC) == 0) { - assert(kq->kq_state & KQ_DYNAMIC); - return; - } - assert(kq->kq_state & KQ_WORKLOOP); /* for now */ - list = &fdp->fd_kqhash[KQ_HASH(kqwl->kqwl_dynamicid, fdp->fd_kqhashmask)]; - SLIST_REMOVE(list, kqwl, kqworkloop, kqwl_hashlink); +static int __dead2 +filt_bad_touch(struct knote *kn, struct kevent_qos_s *kev) +{ + panic("%s[%d](%p, %p)", __func__, kn->kn_filter, kn, kev); } -/* Called with kqhash_lock held */ -static struct kqueue * -kqueue_hash_lookup(struct proc *p, kqueue_id_t id) +static int __dead2 +filt_bad_process(struct knote *kn, struct kevent_qos_s *kev) { - struct filedesc *fdp = p->p_fd; - struct kqlist *list; - struct kqworkloop *kqwl; + panic("%s[%d](%p, %p)", __func__, kn->kn_filter, kn, kev); +} - /* should hold the kq hash lock */ - kqhash_lock_held(p); +/* + * knotes_dealloc - detach all knotes for the process and drop them + * + * Called with proc_fdlock held. + * Returns with it locked. + * May drop it temporarily. + * Process is in such a state that it will not try to allocate + * any more knotes during this process (stopped for exit or exec). + */ +void +knotes_dealloc(proc_t p) +{ + struct filedesc *fdp = p->p_fd; + struct kqueue *kq; + struct knote *kn; + struct klist *kn_hash = NULL; + int i; - if (fdp->fd_kqhashmask == 0) { - return NULL; + /* Close all the fd-indexed knotes up front */ + if (fdp->fd_knlistsize > 0) { + for (i = 0; i < fdp->fd_knlistsize; i++) { + while ((kn = SLIST_FIRST(&fdp->fd_knlist[i])) != NULL) { + kq = knote_get_kq(kn); + kqlock(kq); + proc_fdunlock(p); + knote_drop(kq, kn, NULL); + proc_fdlock(p); + } + } + /* free the table */ + FREE(fdp->fd_knlist, M_KQUEUE); + fdp->fd_knlist = NULL; } + fdp->fd_knlistsize = 0; - list = &fdp->fd_kqhash[KQ_HASH(id, fdp->fd_kqhashmask)]; - SLIST_FOREACH(kqwl, list, kqwl_hashlink) { - if (kqwl->kqwl_dynamicid == id) { - struct kqueue *kq = (struct kqueue *)kqwl; + knhash_lock(fdp); + proc_fdunlock(p); - assert(kq->kq_state & KQ_DYNAMIC); - assert(kq->kq_state & KQ_WORKLOOP); /* for now */ - return kq; + /* Clean out all the hashed knotes as well */ + if (fdp->fd_knhashmask != 0) { + for (i = 0; i <= (int)fdp->fd_knhashmask; i++) { + while ((kn = SLIST_FIRST(&fdp->fd_knhash[i])) != NULL) { + kq = knote_get_kq(kn); + kqlock(kq); + knhash_unlock(fdp); + knote_drop(kq, kn, NULL); + knhash_lock(fdp); + } } + kn_hash = fdp->fd_knhash; + fdp->fd_knhashmask = 0; + fdp->fd_knhash = NULL; } - return NULL; -} -static inline void -kqueue_release_last(struct proc *p, kqueue_t kqu) -{ - struct kqueue *kq = kqu.kq; - if (kq->kq_state & KQ_DYNAMIC) { - kqhash_lock(p); - if (kqueue_release(kq, KQUEUE_MIGHT_BE_LAST_REF)) { - thread_t cur_owner = kqworkloop_invalidate(kqu.kqwl); - kqueue_hash_remove(p, kq); - kqhash_unlock(p); - if (cur_owner) { - thread_deallocate(cur_owner); - } - kqueue_dealloc(kq); - } else { - kqhash_unlock(p); - } + knhash_unlock(fdp); + + /* free the kn_hash table */ + if (kn_hash) { + FREE(kn_hash, M_KQUEUE); } + + proc_fdlock(p); } /* @@ -3570,4358 +3502,4554 @@ void kqworkloops_dealloc(proc_t p) { struct filedesc *fdp = p->p_fd; - struct kqlist *list; struct kqworkloop *kqwl, *kqwln; - struct kqlist tofree; - int i; + struct kqwllist tofree; if (!(fdp->fd_flags & FD_WORKLOOP)) { return; } - SLIST_INIT(&tofree); + kqhash_lock(fdp); + + if (fdp->fd_kqhashmask == 0) { + kqhash_unlock(fdp); + return; + } - kqhash_lock(p); - assert(fdp->fd_kqhashmask != 0); + LIST_INIT(&tofree); - for (i = 0; i <= (int)fdp->fd_kqhashmask; i++) { - list = &fdp->fd_kqhash[i]; - SLIST_FOREACH_SAFE(kqwl, list, kqwl_hashlink, kqwln) { + for (size_t i = 0; i <= fdp->fd_kqhashmask; i++) { + LIST_FOREACH_SAFE(kqwl, &fdp->fd_kqhash[i], kqwl_hashlink, kqwln) { /* * kqworkloops that have scheduling parameters have an * implicit retain from kqueue_workloop_ctl that needs * to be balanced on process exit. */ assert(kqwl->kqwl_params); - SLIST_REMOVE(list, kqwl, kqworkloop, kqwl_hashlink); - SLIST_INSERT_HEAD(&tofree, kqwl, kqwl_hashlink); + LIST_REMOVE(kqwl, kqwl_hashlink); + LIST_INSERT_HEAD(&tofree, kqwl, kqwl_hashlink); } } - kqhash_unlock(p); + kqhash_unlock(fdp); - SLIST_FOREACH_SAFE(kqwl, &tofree, kqwl_hashlink, kqwln) { - struct kqueue *kq = (struct kqueue *)kqwl; - __assert_only bool released; - released = kqueue_release(kq, KQUEUE_MIGHT_BE_LAST_REF); - assert(released); - kqueue_dealloc(kq); + LIST_FOREACH_SAFE(kqwl, &tofree, kqwl_hashlink, kqwln) { + kqworkloop_dealloc(kqwl, KQWL_DEALLOC_SKIP_HASH_REMOVE, 1); } } -static struct kqueue * -kevent_get_bound_kqworkloop(thread_t thread) -{ - struct uthread *ut = get_bsdthread_info(thread); - struct kqrequest *kqr = ut->uu_kqr_bound; - - return kqr ? (struct kqueue *)kqr_kqworkloop(kqr) : NULL; -} - static int -kevent_get_kq(struct proc *p, kqueue_id_t id, workq_threadreq_param_t *trp, - unsigned int flags, struct fileproc **fpp, int *fdp, - struct kqueue **kqp) -{ - struct filedesc *descp = p->p_fd; - struct fileproc *fp = NULL; - struct kqueue *kq = NULL; - int fd = 0; - int error = 0; - thread_t th = current_thread(); - - assert(!trp || (flags & KEVENT_FLAG_WORKLOOP)); - - /* Was the workloop flag passed? Then it is for sure only a workloop */ - if (flags & KEVENT_FLAG_DYNAMIC_KQUEUE) { - assert(flags & KEVENT_FLAG_WORKLOOP); - assert(!trp || (flags & KEVENT_FLAG_DYNAMIC_KQ_MUST_NOT_EXIST)); - kq = kevent_get_bound_kqworkloop(th); +kevent_register_validate_priority(struct kqueue *kq, struct knote *kn, + struct kevent_qos_s *kev) +{ + /* We don't care about the priority of a disabled or deleted knote */ + if (kev->flags & (EV_DISABLE | EV_DELETE)) { + return 0; + } + if (kq->kq_state & KQ_WORKLOOP) { /* - * when kevent_id_internal is called from within the - * kernel, and the passed 'id' value is '-1' then we - * look for the currently bound workloop kq. + * Workloops need valid priorities with a QOS (excluding manager) for + * any enabled knote. + * + * When it is pre-existing, just make sure it has a valid QoS as + * kevent_register() will not use the incoming priority (filters who do + * have the responsibility to validate it again, see filt_wltouch). + * + * If the knote is being made, validate the incoming priority. */ - if (id == (kqueue_id_t)-1 && - (flags & KEVENT_FLAG_KERNEL) && - (flags & KEVENT_FLAG_WORKLOOP)) { - if (!is_workqueue_thread(th) || !kq) { - return EINVAL; - } - - kqueue_retain(kq); - goto out; - } - - if (id == 0 || id == (kqueue_id_t)-1) { - return EINVAL; - } - - /* try shortcut on kq lookup for bound threads */ - if (kq != NULL && ((struct kqworkloop *)kq)->kqwl_dynamicid == id) { - if (flags & KEVENT_FLAG_DYNAMIC_KQ_MUST_NOT_EXIST) { - return EEXIST; - } - - /* retain a reference while working with this kq. */ - assert(kq->kq_state & KQ_DYNAMIC); - kqueue_retain(kq); - goto out; + if (!_pthread_priority_thread_qos(kn ? kn->kn_qos : kev->qos)) { + return ERANGE; } + } - /* look for the kq on the hash table */ - kqhash_lock(p); - kq = kqueue_hash_lookup(p, id); - if (kq == NULL) { - kqhash_unlock(p); - - if (flags & KEVENT_FLAG_DYNAMIC_KQ_MUST_EXIST) { - return ENOENT; - } - - struct kqueue *alloc_kq; - alloc_kq = kqueue_alloc(p, flags); - if (!alloc_kq) { - return ENOMEM; - } + return 0; +} - kqhash_lock(p); - kqueue_hash_init_if_needed(p); - kq = kqueue_hash_lookup(p, id); - if (kq == NULL) { - /* insert our new one */ - kq = alloc_kq; - if (trp) { - struct kqworkloop *kqwl = (struct kqworkloop *)kq; - kqwl->kqwl_params = trp->trp_value; - } - kqueue_hash_insert(p, id, kq); - kqhash_unlock(p); - } else if (flags & KEVENT_FLAG_DYNAMIC_KQ_MUST_NOT_EXIST) { - /* lost race and caller wants an error */ - kqhash_unlock(p); - kqueue_release(alloc_kq, KQUEUE_MIGHT_BE_LAST_REF); - kqueue_dealloc(alloc_kq); - return EEXIST; - } else { - /* lost race, retain existing workloop */ - kqueue_retain(kq); - kqhash_unlock(p); - kqueue_release(alloc_kq, KQUEUE_MIGHT_BE_LAST_REF); - kqueue_dealloc(alloc_kq); - } - } else { - if (flags & KEVENT_FLAG_DYNAMIC_KQ_MUST_NOT_EXIST) { - kqhash_unlock(p); - return EEXIST; - } +/* + * Prepare a filter for waiting after register. + * + * The f_post_register_wait hook will be called later by kevent_register() + * and should call kevent_register_wait_block() + */ +static int +kevent_register_wait_prepare(struct knote *kn, struct kevent_qos_s *kev, int rc) +{ + thread_t thread = current_thread(); - /* retain a reference while working with this kq. */ - assert(kq->kq_state & KQ_DYNAMIC); - kqueue_retain(kq); - kqhash_unlock(p); - } - } else if (flags & KEVENT_FLAG_WORKQ) { - /* must already exist for bound threads. */ - if (flags & KEVENT_FLAG_KERNEL) { - assert(descp->fd_wqkqueue != NULL); - } + assert(knote_fops(kn)->f_extended_codes); + if (kn->kn_thread == NULL) { + thread_reference(thread); + kn->kn_thread = thread; + } else if (kn->kn_thread != thread) { /* - * use the private kq associated with the proc workq. - * Just being a thread within the process (and not - * being the exit/exec thread) is enough to hold a - * reference on this special kq. + * kn_thread may be set from a previous aborted wait + * However, it has to be from the same thread. */ - kq = descp->fd_wqkqueue; - if (kq == NULL) { - struct kqueue *alloc_kq = kqueue_alloc(p, KEVENT_FLAG_WORKQ); - if (alloc_kq == NULL) { - return ENOMEM; - } - - knhash_lock(p); - if (descp->fd_wqkqueue == NULL) { - kq = descp->fd_wqkqueue = alloc_kq; - knhash_unlock(p); - } else { - knhash_unlock(p); - kq = descp->fd_wqkqueue; - kqueue_dealloc(alloc_kq); - } - } - } else { - /* get a usecount for the kq itself */ - fd = (int)id; - if ((error = fp_getfkq(p, fd, &fp, &kq)) != 0) { - return error; - } - } - if ((error = kevent_set_kq_mode(kq, flags)) != 0) { - /* drop the usecount */ - if (fp != NULL) { - fp_drop(p, fd, fp, 0); - } - return error; + kev->flags |= EV_ERROR; + kev->data = EXDEV; + return 0; } -out: - *fpp = fp; - *fdp = fd; - *kqp = kq; - - return error; + return FILTER_REGISTER_WAIT | rc; } +/* + * Cleanup a kevent_register_wait_prepare() effect for threads that have been + * aborted instead of properly woken up with thread_wakeup_thread(). + */ static void -kevent_put_kq( - struct proc *p, - kqueue_id_t id, - struct fileproc *fp, - struct kqueue *kq) +kevent_register_wait_cleanup(struct knote *kn) { - kqueue_release_last(p, kq); - if (fp != NULL) { - assert((kq->kq_state & KQ_WORKQ) == 0); - fp_drop(p, (int)id, fp, 0); - } + thread_t thread = kn->kn_thread; + kn->kn_thread = NULL; + thread_deallocate(thread); } -static uint64_t -kevent_workloop_serial_no_copyin(proc_t p, uint64_t workloop_id) +/* + * Must be called at the end of a f_post_register_wait call from a filter. + */ +static void +kevent_register_wait_block(struct turnstile *ts, thread_t thread, + thread_continue_t cont, struct _kevent_register *cont_args) { - uint64_t serial_no = 0; - user_addr_t addr; - int rc; - - if (workloop_id == 0 || p->p_dispatchqueue_serialno_offset == 0) { - return 0; - } - addr = (user_addr_t)(workloop_id + p->p_dispatchqueue_serialno_offset); - - if (proc_is64bit(p)) { - rc = copyin(addr, (caddr_t)&serial_no, sizeof(serial_no)); - } else { - uint32_t serial_no32 = 0; - rc = copyin(addr, (caddr_t)&serial_no32, sizeof(serial_no32)); - serial_no = serial_no32; - } - return rc == 0 ? serial_no : 0; + turnstile_update_inheritor_complete(ts, TURNSTILE_INTERLOCK_HELD); + kqunlock(cont_args->kqwl); + cont_args->handoff_thread = thread; + thread_handoff_parameter(thread, cont, cont_args); } -int -kevent_exit_on_workloop_ownership_leak(thread_t thread) +/* + * Called by Filters using a f_post_register_wait to return from their wait. + */ +static void +kevent_register_wait_return(struct _kevent_register *cont_args) { - proc_t p = current_proc(); - struct filedesc *fdp = p->p_fd; - kqueue_id_t workloop_id = 0; - os_reason_t reason = OS_REASON_NULL; - mach_vm_address_t addr; - uint32_t reason_size; + struct kqworkloop *kqwl = cont_args->kqwl; + struct kevent_qos_s *kev = &cont_args->kev; + int error = 0; - kqhash_lock(p); - if (fdp->fd_kqhashmask > 0) { - for (uint32_t i = 0; i < fdp->fd_kqhashmask + 1; i++) { - struct kqworkloop *kqwl; - - SLIST_FOREACH(kqwl, &fdp->fd_kqhash[i], kqwl_hashlink) { - struct kqueue *kq = &kqwl->kqwl_kqueue; - if ((kq->kq_state & KQ_DYNAMIC) && kqwl->kqwl_owner == thread) { - workloop_id = kqwl->kqwl_dynamicid; - break; - } - } - } - } - kqhash_unlock(p); - - reason = os_reason_create(OS_REASON_LIBSYSTEM, - OS_REASON_LIBSYSTEM_CODE_WORKLOOP_OWNERSHIP_LEAK); - if (reason == OS_REASON_NULL) { - goto out; - } - - reason->osr_flags |= OS_REASON_FLAG_GENERATE_CRASH_REPORT; - reason_size = 2 * sizeof(uint64_t); - reason_size = kcdata_estimate_required_buffer_size(2, reason_size); - if (os_reason_alloc_buffer(reason, reason_size) != 0) { - goto out; + if (cont_args->handoff_thread) { + thread_deallocate(cont_args->handoff_thread); } - if (workloop_id) { - struct kcdata_descriptor *kcd = &reason->osr_kcd_descriptor; - - if (kcdata_get_memory_addr(kcd, EXIT_REASON_WORKLOOP_ID, - sizeof(workloop_id), &addr) == KERN_SUCCESS) { - kcdata_memcpy(kcd, addr, &workloop_id, sizeof(workloop_id)); + if (kev->flags & (EV_ERROR | EV_RECEIPT)) { + if ((kev->flags & EV_ERROR) == 0) { + kev->flags |= EV_ERROR; + kev->data = 0; } - - uint64_t serial_no = kevent_workloop_serial_no_copyin(p, workloop_id); - if (serial_no && kcdata_get_memory_addr(kcd, EXIT_REASON_DISPATCH_QUEUE_NO, - sizeof(serial_no), &addr) == KERN_SUCCESS) { - kcdata_memcpy(kcd, addr, &serial_no, sizeof(serial_no)); + error = kevent_modern_copyout(kev, &cont_args->ueventlist); + if (error == 0) { + cont_args->eventout++; } } -out: -#if DEVELOPMENT || DEBUG - if (kevent_debug_flags() & KEVENT_PANIC_ON_WORKLOOP_OWNERSHIP_LEAK) { - panic("thread %p in task %p is leaked workloop 0x%016llx ownership", - thread, p->task, workloop_id); + + kqworkloop_release(kqwl); + if (error == 0) { + *(int32_t *)¤t_uthread()->uu_rval = cont_args->eventout; } - psignal_try_thread_with_reason(p, thread, SIGABRT, reason); - return 0; -#else - return exit_with_reason(p, W_EXITCODE(0, SIGKILL), (int *)NULL, - FALSE, FALSE, 0, reason); -#endif + unix_syscall_return(error); } -static inline boolean_t -kevent_args_requesting_events(unsigned int flags, int nevents) -{ - return !(flags & KEVENT_FLAG_ERROR_EVENTS) && nevents > 0; -} +/* + * kevent_register - add a new event to a kqueue + * + * Creates a mapping between the event source and + * the kqueue via a knote data structure. + * + * Because many/most the event sources are file + * descriptor related, the knote is linked off + * the filedescriptor table for quick access. + * + * called with nothing locked + * caller holds a reference on the kqueue + */ -static int -kevent_internal(struct proc *p, - kqueue_id_t id, kqueue_id_t *id_out, - user_addr_t changelist, int nchanges, - user_addr_t ueventlist, int nevents, - user_addr_t data_out, uint64_t data_available, - unsigned int flags, - user_addr_t utimeout, - kqueue_continue_t continuation, - int32_t *retval) +int +kevent_register(struct kqueue *kq, struct kevent_qos_s *kev, + struct knote **kn_out) { - uthread_t ut; - struct kqueue *kq; - struct fileproc *fp = NULL; - int fd = 0; - struct kevent_internal_s kev; - int error, noutputs, register_rc; - bool needs_end_processing = false; - struct timeval atv; - user_size_t data_size; - user_size_t data_resid; - thread_t thread = current_thread(); + struct proc *p = kq->kq_p; + const struct filterops *fops; + struct knote *kn = NULL; + int result = 0, error = 0; + unsigned short kev_flags = kev->flags; KNOTE_LOCK_CTX(knlc); - /* Don't allow user-space threads to process output events from the workq kqs */ - if (((flags & (KEVENT_FLAG_WORKQ | KEVENT_FLAG_KERNEL)) == KEVENT_FLAG_WORKQ) && - kevent_args_requesting_events(flags, nevents)) { - return EINVAL; - } - - if (flags & KEVENT_FLAG_PARKING) { - if (!kevent_args_requesting_events(flags, nevents) || id != (kqueue_id_t)-1) { - return EINVAL; - } - } - - /* restrict dynamic kqueue allocation to workloops (for now) */ - if ((flags & (KEVENT_FLAG_DYNAMIC_KQUEUE | KEVENT_FLAG_WORKLOOP)) == KEVENT_FLAG_DYNAMIC_KQUEUE) { - return EINVAL; + if (__probable(kev->filter < 0 && kev->filter + EVFILT_SYSCOUNT >= 0)) { + fops = sysfilt_ops[~kev->filter]; /* to 0-base index */ + } else { + error = EINVAL; + goto out; } - if ((flags & (KEVENT_FLAG_WORKLOOP)) && (flags & (KEVENT_FLAG_WORKQ))) { - return EINVAL; + /* restrict EV_VANISHED to adding udata-specific dispatch kevents */ + if (__improbable((kev->flags & EV_VANISHED) && + (kev->flags & (EV_ADD | EV_DISPATCH2)) != (EV_ADD | EV_DISPATCH2))) { + error = EINVAL; + goto out; } - if (flags & (KEVENT_FLAG_DYNAMIC_KQ_MUST_EXIST | KEVENT_FLAG_DYNAMIC_KQ_MUST_NOT_EXIST)) { - /* allowed only on workloops when calling kevent_id from user-space */ - if (!(flags & KEVENT_FLAG_WORKLOOP) || (flags & KEVENT_FLAG_KERNEL) || !(flags & KEVENT_FLAG_DYNAMIC_KQUEUE)) { - return EINVAL; - } + /* Simplify the flags - delete and disable overrule */ + if (kev->flags & EV_DELETE) { + kev->flags &= ~EV_ADD; } - - /* prepare to deal with stack-wise allocation of out events */ - if (flags & KEVENT_FLAG_STACK_EVENTS) { - int scale = ((flags & KEVENT_FLAG_LEGACY32) ? - (IS_64BIT_PROCESS(p) ? sizeof(struct user64_kevent) : - sizeof(struct user32_kevent)) : - ((flags & KEVENT_FLAG_LEGACY64) ? sizeof(struct kevent64_s) : - sizeof(struct kevent_qos_s))); - ueventlist += nevents * scale; + if (kev->flags & EV_DISABLE) { + kev->flags &= ~EV_ENABLE; } - /* convert timeout to absolute - if we have one (and not immediate) */ - error = kevent_get_timeout(p, utimeout, flags, &atv); - if (error) { - return error; + if (kq->kq_state & KQ_WORKLOOP) { + KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQWL_REGISTER), + ((struct kqworkloop *)kq)->kqwl_dynamicid, + kev->udata, kev->flags, kev->filter); + } else if (kq->kq_state & KQ_WORKQ) { + KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQWQ_REGISTER), + 0, kev->udata, kev->flags, kev->filter); + } else { + KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQ_REGISTER), + VM_KERNEL_UNSLIDE_OR_PERM(kq), + kev->udata, kev->flags, kev->filter); } - /* copyin initial value of data residual from data_available */ - error = kevent_get_data_size(p, data_available, flags, &data_size); +restart: + /* find the matching knote from the fd tables/hashes */ + kn = kq_find_knote_and_kq_lock(kq, kev, fops->f_isfd, p); + error = kevent_register_validate_priority(kq, kn, kev); + result = 0; if (error) { - return error; + goto out; } - /* get the kq we are going to be working on */ - error = kevent_get_kq(p, id, NULL, flags, &fp, &fd, &kq); -#if CONFIG_WORKLOOP_DEBUG - ut = (uthread_t)get_bsdthread_info(thread); - UU_KEVENT_HISTORY_WRITE_ENTRY(ut, { - .uu_kqid = id, - .uu_kq = error ? NULL : kq, - .uu_error = error, - .uu_nchanges = nchanges, - .uu_nevents = nevents, - .uu_flags = flags, - }); -#endif // CONFIG_WORKLOOP_DEBUG - if (error) { - return error; - } + if (kn == NULL && (kev->flags & EV_ADD) == 0) { + /* + * No knote found, EV_ADD wasn't specified + */ - /* only bound threads can receive events on workloops */ - if (flags & KEVENT_FLAG_WORKLOOP) { - struct kqworkloop *kqwl = (struct kqworkloop *)kq; - struct kqrequest *kqr = &kqwl->kqwl_request; + if ((kev_flags & EV_ADD) && (kev_flags & EV_DELETE) && + (kq->kq_state & KQ_WORKLOOP)) { + /* + * For workloops, understand EV_ADD|EV_DELETE as a "soft" delete + * that doesn't care about ENOENT, so just pretend the deletion + * happened. + */ + } else { + error = ENOENT; + } + goto out; + } else if (kn == NULL) { + /* + * No knote found, need to attach a new one (attach) + */ - assert(kq->kq_state & KQ_WORKLOOP); + struct fileproc *knote_fp = NULL; - if (kevent_args_requesting_events(flags, nevents)) { - if (kq != kevent_get_bound_kqworkloop(thread)) { - error = EXDEV; + /* grab a file reference for the new knote */ + if (fops->f_isfd) { + if ((error = fp_lookup(p, kev->ident, &knote_fp, 0)) != 0) { goto out; } - - kq_req_lock(kqwl); - /* - * Disable the R2K notification while doing a register, if the - * caller wants events too, we don't want the AST to be set if we - * will process these events soon. - */ - kqr->kqr_state &= ~KQR_R2K_NOTIF_ARMED; - needs_end_processing = true; - kq_req_unlock(kq); } - if (id_out) { - *id_out = kqwl->kqwl_dynamicid; + kn = knote_alloc(); + if (kn == NULL) { + error = ENOMEM; + if (knote_fp != NULL) { + fp_drop(p, kev->ident, knote_fp, 0); + } + goto out; } - } - /* register all the change requests the user provided... */ - noutputs = 0; - while (nchanges > 0 && error == 0) { - error = kevent_copyin(&changelist, &kev, p, flags); - if (error) { - break; + kn->kn_fp = knote_fp; + kn->kn_is_fd = fops->f_isfd; + kn->kn_kq_packed = (intptr_t)(struct kqueue *)kq; + kn->kn_status = 0; + + /* was vanish support requested */ + if (kev->flags & EV_VANISHED) { + kev->flags &= ~EV_VANISHED; + kn->kn_status |= KN_REQVANISH; } - /* Make sure user doesn't pass in any system flags */ - kev.flags &= ~EV_SYSFLAGS; + /* snapshot matching/dispatching protcol flags into knote */ + if (kev->flags & EV_DISABLE) { + kn->kn_status |= KN_DISABLED; + } - register_rc = kevent_register(kq, &kev, &knlc); - if (register_rc & FILTER_REGISTER_WAIT) { - kqlock_held(kq); + /* + * copy the kevent state into knote + * protocol is that fflags and data + * are saved off, and cleared before + * calling the attach routine. + * + * - kn->kn_sfflags aliases with kev->xflags + * - kn->kn_sdata aliases with kev->data + * - kn->kn_filter is the top 8 bits of kev->filter + */ + kn->kn_kevent = *(struct kevent_internal_s *)kev; + kn->kn_sfflags = kev->fflags; + kn->kn_filtid = (uint8_t)~kev->filter; + kn->kn_fflags = 0; + knote_reset_priority(kq, kn, kev->qos); - // f_post_register_wait is meant to call a continuation and not to - // return, which is why we don't support FILTER_REGISTER_WAIT if - // KEVENT_FLAG_ERROR_EVENTS is not passed, or if the event that - // waits isn't the last. - // - // It is implementable, but not used by any userspace code at the - // moment, so for now return ENOTSUP if someone tries to do it. - if (nchanges == 1 && nevents >= 1 && (flags & KEVENT_FLAG_ERROR_EVENTS)) { - struct _kevent_register *cont_args; - /* store the continuation/completion data in the uthread */ - ut = (uthread_t)get_bsdthread_info(thread); - cont_args = &ut->uu_save.uus_kevent_register; - cont_args->kev = kev; - cont_args->kq = kq; - cont_args->fp = fp; - cont_args->fd = fd; - cont_args->ueventlist = ueventlist; - cont_args->flags = flags; - cont_args->retval = retval; - cont_args->eventcount = nevents; - cont_args->eventout = noutputs; - knote_fops(cont_args->knote)->f_post_register_wait(ut, &knlc, cont_args); - panic("f_post_register_wait returned (kev: %p)", &kev); + /* Add the knote for lookup thru the fd table */ + error = kq_add_knote(kq, kn, &knlc, p); + if (error) { + knote_free(kn); + if (knote_fp != NULL) { + fp_drop(p, kev->ident, knote_fp, 0); } - kev.flags |= EV_ERROR; - kev.data = ENOTSUP; - knote_unlock(kq, knlc.knlc_knote, &knlc, KNOTE_KQ_UNLOCK); - } - - // keep in sync with kevent_register_wait_return() - if (nevents > 0 && (kev.flags & (EV_ERROR | EV_RECEIPT))) { - if ((kev.flags & EV_ERROR) == 0) { - kev.flags |= EV_ERROR; - kev.data = 0; - } - error = kevent_copyout(&kev, &ueventlist, p, flags); - if (error == 0) { - nevents--; - noutputs++; + if (error == ERESTART) { + goto restart; } - } else if (kev.flags & EV_ERROR) { - error = kev.data; + goto out; } - nchanges--; - } - /* short-circuit the scan if we only want error events */ - if (flags & KEVENT_FLAG_ERROR_EVENTS) { - nevents = 0; - } - - /* process pending events */ - if (nevents > 0 && noutputs == 0 && error == 0) { - struct _kevent *cont_args; - /* store the continuation/completion data in the uthread */ - ut = (uthread_t)get_bsdthread_info(thread); - cont_args = &ut->uu_save.uus_kevent; - cont_args->fp = fp; - cont_args->fd = fd; - cont_args->retval = retval; - cont_args->eventlist = ueventlist; - cont_args->eventcount = nevents; - cont_args->eventout = noutputs; - cont_args->data_available = data_available; - cont_args->process_data.fp_fd = (int)id; - cont_args->process_data.fp_flags = flags; - cont_args->process_data.fp_data_out = data_out; - cont_args->process_data.fp_data_size = data_size; - cont_args->process_data.fp_data_resid = data_size; + /* fp reference count now applies to knote */ /* - * kqworkloop_end_processing() will happen at the end of kqueue_scan() + * we can't use filter_call() because f_attach can change the filter ops + * for a filter that supports f_extended_codes, so we need to reload + * knote_fops() and not use `fops`. */ - needs_end_processing = false; + result = fops->f_attach(kn, kev); + if (result && !knote_fops(kn)->f_extended_codes) { + result = FILTER_ACTIVE; + } - error = kqueue_scan(kq, kevent_callback, - continuation, cont_args, - &cont_args->process_data, - &atv, p); + kqlock(kq); - /* process remaining outputs */ - noutputs = cont_args->eventout; - data_resid = cont_args->process_data.fp_data_resid; + if (result & FILTER_THREADREQ_NODEFEER) { + enable_preemption(); + } - /* copyout residual data size value (if it needs to be copied out) */ - /* don't abandon other output just because of residual copyout failures */ - if (error == 0 && data_available && data_resid != data_size) { - (void)kevent_put_data_size(p, data_available, flags, data_resid); + if (kn->kn_flags & EV_ERROR) { + /* + * Failed to attach correctly, so drop. + */ + kn->kn_filtid = EVFILTID_DETACHED; + error = kn->kn_sdata; + knote_drop(kq, kn, &knlc); + result = 0; + goto out; } - } -out: - if (__improbable(needs_end_processing)) { /* - * If we didn't through kqworkloop_end_processing(), - * we need to do it here. + * end "attaching" phase - now just attached + * + * Mark the thread request overcommit, if appropos + * + * If the attach routine indicated that an + * event is already fired, activate the knote. + */ + if ((kn->kn_qos & _PTHREAD_PRIORITY_OVERCOMMIT_FLAG) && + (kq->kq_state & KQ_WORKLOOP)) { + kqworkloop_set_overcommit((struct kqworkloop *)kq); + } + } else if (!knote_lock(kq, kn, &knlc, KNOTE_KQ_LOCK_ON_SUCCESS)) { + /* + * The knote was dropped while we were waiting for the lock, + * we need to re-evaluate entirely */ - kqlock(kq); - kqworkloop_end_processing((struct kqworkloop *)kq, 0, 0); - kqunlock(kq); - } - kevent_put_kq(p, id, fp, kq); - - /* don't restart after signals... */ - if (error == ERESTART) { - error = EINTR; - } else if (error == EWOULDBLOCK) { - error = 0; - } - if (error == 0) { - *retval = noutputs; - } - return error; -} - -/* - * kevent_callback - callback for each individual event - * - * called with nothing locked - * caller holds a reference on the kqueue - */ -static int -kevent_callback(__unused struct kqueue *kq, struct kevent_internal_s *kevp, - void *data) -{ - struct _kevent *cont_args; - int error; - - cont_args = (struct _kevent *)data; - assert(cont_args->eventout < cont_args->eventcount); - - /* - * Copy out the appropriate amount of event data for this user. - */ - error = kevent_copyout(kevp, &cont_args->eventlist, current_proc(), - cont_args->process_data.fp_flags); - - /* - * If there isn't space for additional events, return - * a harmless error to stop the processing here - */ - if (error == 0 && ++cont_args->eventout == cont_args->eventcount) { - error = EWOULDBLOCK; - } - return error; -} + goto restart; + } else if (kev->flags & EV_DELETE) { + /* + * Deletion of a knote (drop) + * + * If the filter wants to filter drop events, let it do so. + * + * defer-delete: when trying to delete a disabled EV_DISPATCH2 knote, + * we must wait for the knote to be re-enabled (unless it is being + * re-enabled atomically here). + */ -/* - * kevent_description - format a description of a kevent for diagnostic output - * - * called with a 256-byte string buffer - */ + if (knote_fops(kn)->f_allow_drop) { + bool drop; -char * -kevent_description(struct kevent_internal_s *kevp, char *s, size_t n) -{ - snprintf(s, n, - "kevent=" - "{.ident=%#llx, .filter=%d, .flags=%#x, .udata=%#llx, .fflags=%#x, .data=%#llx, .ext[0]=%#llx, .ext[1]=%#llx}", - kevp->ident, - kevp->filter, - kevp->flags, - kevp->udata, - kevp->fflags, - kevp->data, - kevp->ext[0], - kevp->ext[1] ); + kqunlock(kq); + drop = knote_fops(kn)->f_allow_drop(kn, kev); + kqlock(kq); - return s; -} + if (!drop) { + goto out_unlock; + } + } -static int -kevent_register_validate_priority(struct kqueue *kq, struct knote *kn, - struct kevent_internal_s *kev) -{ - /* We don't care about the priority of a disabled or deleted knote */ - if (kev->flags & (EV_DISABLE | EV_DELETE)) { - return 0; - } + if ((kev->flags & EV_ENABLE) == 0 && + (kn->kn_flags & EV_DISPATCH2) == EV_DISPATCH2 && + (kn->kn_status & KN_DISABLED) != 0) { + kn->kn_status |= KN_DEFERDELETE; + error = EINPROGRESS; + goto out_unlock; + } - if (kq->kq_state & KQ_WORKLOOP) { + knote_drop(kq, kn, &knlc); + goto out; + } else { /* - * Workloops need valid priorities with a QOS (excluding manager) for - * any enabled knote. + * Regular update of a knote (touch) * - * When it is pre-existing, just make sure it has a valid QoS as - * kevent_register() will not use the incoming priority (filters who do - * have the responsibility to validate it again, see filt_wltouch). + * Call touch routine to notify filter of changes in filter values + * (and to re-determine if any events are fired). * - * If the knote is being made, validate the incoming priority. + * If the knote is in defer-delete, avoid calling the filter touch + * routine (it has delivered its last event already). + * + * If the touch routine had no failure, + * apply the requested side effects to the knote. */ - if (!_pthread_priority_thread_qos(kn ? kn->kn_qos : kev->qos)) { - return ERANGE; + + if (kn->kn_status & (KN_DEFERDELETE | KN_VANISHED)) { + if (kev->flags & EV_ENABLE) { + result = FILTER_ACTIVE; + } + } else { + kqunlock(kq); + result = filter_call(knote_fops(kn), f_touch(kn, kev)); + kqlock(kq); + if (result & FILTER_THREADREQ_NODEFEER) { + enable_preemption(); + } + } + + if (kev->flags & EV_ERROR) { + result = 0; + goto out_unlock; + } + + if ((kn->kn_flags & EV_UDATA_SPECIFIC) == 0 && + kn->kn_udata != kev->udata) { + // this allows klist_copy_udata() not to take locks + os_atomic_store_wide(&kn->kn_udata, kev->udata, relaxed); + } + if ((kev->flags & EV_DISABLE) && !(kn->kn_status & KN_DISABLED)) { + kn->kn_status |= KN_DISABLED; + knote_dequeue(kq, kn); } } - return 0; + /* accept new kevent state */ + knote_apply_touch(kq, kn, kev, result); + +out_unlock: + /* + * When the filter asked for a post-register wait, + * we leave the kqueue locked for kevent_register() + * to call the filter's f_post_register_wait hook. + */ + if (result & FILTER_REGISTER_WAIT) { + knote_unlock(kq, kn, &knlc, KNOTE_KQ_LOCK_ALWAYS); + *kn_out = kn; + } else { + knote_unlock(kq, kn, &knlc, KNOTE_KQ_UNLOCK); + } + +out: + /* output local errors through the kevent */ + if (error) { + kev->flags |= EV_ERROR; + kev->data = error; + } + return result; } /* - * Prepare a filter for waiting after register. + * knote_process - process a triggered event * - * The f_post_register_wait hook will be called later by kevent_register() - * and should call kevent_register_wait_block() + * Validate that it is really still a triggered event + * by calling the filter routines (if necessary). Hold + * a use reference on the knote to avoid it being detached. + * + * If it is still considered triggered, we will have taken + * a copy of the state under the filter lock. We use that + * snapshot to dispatch the knote for future processing (or + * not, if this was a lost event). + * + * Our caller assures us that nobody else can be processing + * events from this knote during the whole operation. But + * others can be touching or posting events to the knote + * interspersed with our processing it. + * + * caller holds a reference on the kqueue. + * kqueue locked on entry and exit - but may be dropped */ static int -kevent_register_wait_prepare(struct knote *kn, struct kevent_internal_s *kev) +knote_process(struct knote *kn, kevent_ctx_t kectx, + kevent_callback_t callback) { - thread_t thread = current_thread(); - struct uthread *uth = get_bsdthread_info(thread); + struct kevent_qos_s kev; + struct kqueue *kq = knote_get_kq(kn); + KNOTE_LOCK_CTX(knlc); + int result = FILTER_ACTIVE; + int error = 0; + bool drop = false; - assert(knote_fops(kn)->f_extended_codes); + /* + * Must be active or stayactive + * Must be queued and not disabled/suppressed or dropping + */ + assert(kn->kn_status & KN_QUEUED); + assert(kn->kn_status & (KN_ACTIVE | KN_STAYACTIVE)); + assert(!(kn->kn_status & (KN_DISABLED | KN_SUPPRESSED | KN_DROPPING))); - if (kn->kn_hook == NULL) { - thread_reference(thread); - kn->kn_hook = thread; - } else if (kn->kn_hook != thread) { + if (kq->kq_state & KQ_WORKLOOP) { + KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQWL_PROCESS), + ((struct kqworkloop *)kq)->kqwl_dynamicid, + kn->kn_udata, kn->kn_status | (kn->kn_id << 32), + kn->kn_filtid); + } else if (kq->kq_state & KQ_WORKQ) { + KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQWQ_PROCESS), + 0, kn->kn_udata, kn->kn_status | (kn->kn_id << 32), + kn->kn_filtid); + } else { + KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQ_PROCESS), + VM_KERNEL_UNSLIDE_OR_PERM(kq), kn->kn_udata, + kn->kn_status | (kn->kn_id << 32), kn->kn_filtid); + } + + if (!knote_lock(kq, kn, &knlc, KNOTE_KQ_LOCK_ALWAYS)) { /* - * kn_hook may be set from a previous aborted wait - * However, it has to be from the same thread. + * When the knote is dropping or has dropped, + * then there's nothing we want to process. */ - kev->flags |= EV_ERROR; - kev->data = EXDEV; - return 0; + return EJUSTRETURN; } - uth->uu_save.uus_kevent_register.knote = kn; - return FILTER_REGISTER_WAIT; -} - -/* - * Cleanup a kevent_register_wait_prepare() effect for threads that have been - * aborted instead of properly woken up with thread_wakeup_thread(). - */ -static void -kevent_register_wait_cleanup(struct knote *kn) -{ - thread_t thread = kn->kn_hook; - kn->kn_hook = NULL; - thread_deallocate(thread); -} + /* + * While waiting for the knote lock, we may have dropped the kq lock. + * and a touch may have disabled and dequeued the knote. + */ + if (!(kn->kn_status & KN_QUEUED)) { + knote_unlock(kq, kn, &knlc, KNOTE_KQ_LOCK_ALWAYS); + return EJUSTRETURN; + } -/* - * Must be called at the end of a f_post_register_wait call from a filter. - */ -static void -kevent_register_wait_block(struct turnstile *ts, thread_t thread, - struct knote_lock_ctx *knlc, thread_continue_t cont, - struct _kevent_register *cont_args) -{ - knote_unlock(cont_args->kq, cont_args->knote, knlc, KNOTE_KQ_UNLOCK); - turnstile_update_inheritor_complete(ts, TURNSTILE_INTERLOCK_NOT_HELD); - cont_args->handoff_thread = thread; - thread_handoff_parameter(thread, cont, cont_args); -} + /* + * For deferred-drop or vanished events, we just create a fake + * event to acknowledge end-of-life. Otherwise, we call the + * filter's process routine to snapshot the kevent state under + * the filter's locking protocol. + * + * suppress knotes to avoid returning the same event multiple times in + * a single call. + */ + knote_suppress(kq, kn); -/* - * Called by Filters using a f_post_register_wait to return from their wait. - */ -static void -kevent_register_wait_return(struct _kevent_register *cont_args) -{ - struct kqueue *kq = cont_args->kq; - proc_t p = kq->kq_p; - struct kevent_internal_s *kev = &cont_args->kev; - int error = 0; + if (kn->kn_status & (KN_DEFERDELETE | KN_VANISHED)) { + int kev_flags = EV_DISPATCH2 | EV_ONESHOT; + if (kn->kn_status & KN_DEFERDELETE) { + kev_flags |= EV_DELETE; + } else { + kev_flags |= EV_VANISHED; + } - if (cont_args->handoff_thread) { - thread_deallocate(cont_args->handoff_thread); + /* create fake event */ + kev = (struct kevent_qos_s){ + .filter = kn->kn_filter, + .ident = kn->kn_id, + .flags = kev_flags, + .udata = kn->kn_udata, + }; + } else { + kqunlock(kq); + kev = (struct kevent_qos_s) { }; + result = filter_call(knote_fops(kn), f_process(kn, &kev)); + kqlock(kq); } - if (kev->flags & (EV_ERROR | EV_RECEIPT)) { - if ((kev->flags & EV_ERROR) == 0) { - kev->flags |= EV_ERROR; - kev->data = 0; + /* + * Determine how to dispatch the knote for future event handling. + * not-fired: just return (do not callout, leave deactivated). + * One-shot: If dispatch2, enter deferred-delete mode (unless this is + * is the deferred delete event delivery itself). Otherwise, + * drop it. + * Dispatch: don't clear state, just mark it disabled. + * Cleared: just leave it deactivated. + * Others: re-activate as there may be more events to handle. + * This will not wake up more handlers right now, but + * at the completion of handling events it may trigger + * more handler threads (TODO: optimize based on more than + * just this one event being detected by the filter). + */ + if ((result & FILTER_ACTIVE) == 0) { + if ((kn->kn_status & (KN_ACTIVE | KN_STAYACTIVE)) == 0) { + /* + * Stay active knotes should not be unsuppressed or we'd create an + * infinite loop. + * + * Some knotes (like EVFILT_WORKLOOP) can be reactivated from + * within f_process() but that doesn't necessarily make them + * ready to process, so we should leave them be. + * + * For other knotes, since we will not return an event, + * there's no point keeping the knote suppressed. + */ + knote_unsuppress(kq, kn); } - error = kevent_copyout(kev, &cont_args->ueventlist, p, cont_args->flags); - if (error == 0) { - cont_args->eventout++; + knote_unlock(kq, kn, &knlc, KNOTE_KQ_LOCK_ALWAYS); + return EJUSTRETURN; + } + + if (result & FILTER_ADJUST_EVENT_QOS_BIT) { + knote_adjust_qos(kq, kn, result); + } + kev.qos = _pthread_priority_combine(kn->kn_qos, kn->kn_qos_override); + + if (kev.flags & EV_ONESHOT) { + if ((kn->kn_flags & EV_DISPATCH2) == EV_DISPATCH2 && + (kn->kn_status & KN_DEFERDELETE) == 0) { + /* defer dropping non-delete oneshot dispatch2 events */ + kn->kn_status |= KN_DEFERDELETE | KN_DISABLED; + } else { + drop = true; } + } else if (kn->kn_flags & EV_DISPATCH) { + /* disable all dispatch knotes */ + kn->kn_status |= KN_DISABLED; + } else if ((kn->kn_flags & EV_CLEAR) == 0) { + /* re-activate in case there are more events */ + knote_activate(kq, kn, FILTER_ACTIVE); } - kevent_put_kq(p, cont_args->fd, cont_args->fp, kq); - if (error == 0) { - *cont_args->retval = cont_args->eventout; + /* + * callback to handle each event as we find it. + * If we have to detach and drop the knote, do + * it while we have the kq unlocked. + */ + if (drop) { + knote_drop(kq, kn, &knlc); + } else { + knote_unlock(kq, kn, &knlc, KNOTE_KQ_UNLOCK); } - unix_syscall_return(error); -} -/* - * kevent_register - add a new event to a kqueue - * - * Creates a mapping between the event source and - * the kqueue via a knote data structure. - * - * Because many/most the event sources are file - * descriptor related, the knote is linked off - * the filedescriptor table for quick access. - * - * called with nothing locked - * caller holds a reference on the kqueue - */ + if (kev.flags & EV_VANISHED) { + KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KNOTE_VANISHED), + kev.ident, kn->kn_udata, kn->kn_status | (kn->kn_id << 32), + kn->kn_filtid); + } -int -kevent_register(struct kqueue *kq, struct kevent_internal_s *kev, - struct knote_lock_ctx *knlc) + error = (callback)(&kev, kectx); + kqlock(kq); + return error; +} + +/* + * Returns -1 if the kqueue was unbound and processing should not happen + */ +#define KQWQAE_BEGIN_PROCESSING 1 +#define KQWQAE_END_PROCESSING 2 +#define KQWQAE_UNBIND 3 +static int +kqworkq_acknowledge_events(struct kqworkq *kqwq, workq_threadreq_t kqr, + int kevent_flags, int kqwqae_op) { - struct proc *p = kq->kq_p; - const struct filterops *fops; - struct knote *kn = NULL; - int result = 0, error = 0; - unsigned short kev_flags = kev->flags; + thread_qos_t old_override = THREAD_QOS_UNSPECIFIED; + thread_t thread = kqr_thread_fast(kqr); + struct knote *kn; + int rc = 0; + bool unbind; + struct kqtailq *suppressq = &kqwq->kqwq_suppressed[kqr->tr_kq_qos_index]; - if (kev->filter < 0) { - if (kev->filter + EVFILT_SYSCOUNT < 0) { - error = EINVAL; - goto out; + kqlock_held(&kqwq->kqwq_kqueue); + + if (!TAILQ_EMPTY(suppressq)) { + /* + * Return suppressed knotes to their original state. + * For workq kqueues, suppressed ones that are still + * truly active (not just forced into the queue) will + * set flags we check below to see if anything got + * woken up. + */ + while ((kn = TAILQ_FIRST(suppressq)) != NULL) { + assert(kn->kn_status & KN_SUPPRESSED); + knote_unsuppress(kqwq, kn); } - fops = sysfilt_ops[~kev->filter]; /* to 0-base index */ - } else { - error = EINVAL; - goto out; } - /* restrict EV_VANISHED to adding udata-specific dispatch kevents */ - if ((kev->flags & EV_VANISHED) && - (kev->flags & (EV_ADD | EV_DISPATCH2)) != (EV_ADD | EV_DISPATCH2)) { - error = EINVAL; - goto out; - } +#if DEBUG || DEVELOPMENT + thread_t self = current_thread(); + struct uthread *ut = get_bsdthread_info(self); - /* Simplify the flags - delete and disable overrule */ - if (kev->flags & EV_DELETE) { - kev->flags &= ~EV_ADD; - } - if (kev->flags & EV_DISABLE) { - kev->flags &= ~EV_ENABLE; - } + assert(thread == self); + assert(ut->uu_kqr_bound == kqr); +#endif // DEBUG || DEVELOPMENT - if (kq->kq_state & KQ_WORKLOOP) { - KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQWL_REGISTER), - ((struct kqworkloop *)kq)->kqwl_dynamicid, - kev->udata, kev->flags, kev->filter); - } else if (kq->kq_state & KQ_WORKQ) { - KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQWQ_REGISTER), - 0, kev->udata, kev->flags, kev->filter); + if (kqwqae_op == KQWQAE_UNBIND) { + unbind = true; + } else if ((kevent_flags & KEVENT_FLAG_PARKING) == 0) { + unbind = false; } else { - KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQ_REGISTER), - VM_KERNEL_UNSLIDE_OR_PERM(kq), - kev->udata, kev->flags, kev->filter); - } - -restart: - /* find the matching knote from the fd tables/hashes */ - kn = kq_find_knote_and_kq_lock(kq, kev, fops->f_isfd, p); - error = kevent_register_validate_priority(kq, kn, kev); - result = 0; - if (error) { - goto out; + unbind = !kqr->tr_kq_wakeup; } - - if (kn == NULL && (kev->flags & EV_ADD) == 0) { + if (unbind) { + old_override = kqworkq_unbind_locked(kqwq, kqr, thread); + rc = -1; /* - * No knote found, EV_ADD wasn't specified + * request a new thread if we didn't process the whole queue or real events + * have happened (not just putting stay-active events back). */ - - if ((kev_flags & EV_ADD) && (kev_flags & EV_DELETE) && - (kq->kq_state & KQ_WORKLOOP)) { - /* - * For workloops, understand EV_ADD|EV_DELETE as a "soft" delete - * that doesn't care about ENOENT, so just pretend the deletion - * happened. - */ - } else { - error = ENOENT; + if (kqr->tr_kq_wakeup) { + kqueue_threadreq_initiate(&kqwq->kqwq_kqueue, kqr, + kqr->tr_kq_qos_index, 0); } - goto out; - } else if (kn == NULL) { + } + + if (rc == 0) { /* - * No knote found, need to attach a new one (attach) + * Reset wakeup bit to notice events firing while we are processing, + * as we cannot rely on the bucket queue emptiness because of stay + * active knotes. */ + kqr->tr_kq_wakeup = false; + } - struct fileproc *knote_fp = NULL; - - /* grab a file reference for the new knote */ - if (fops->f_isfd) { - if ((error = fp_lookup(p, kev->ident, &knote_fp, 0)) != 0) { - goto out; - } - } + if (old_override) { + thread_drop_kevent_override(thread); + } - kn = knote_alloc(); - if (kn == NULL) { - error = ENOMEM; - if (knote_fp != NULL) { - fp_drop(p, kev->ident, knote_fp, 0); - } - goto out; - } + return rc; +} - kn->kn_fp = knote_fp; - kn->kn_kq_packed = (intptr_t)(struct kqueue *)kq; - kqueue_retain(kq); /* retain a kq ref */ - kn->kn_filtid = ~kev->filter; - kn->kn_status = KN_ATTACHING | KN_ATTACHED; +/* + * Return 0 to indicate that processing should proceed, + * -1 if there is nothing to process. + * + * Called with kqueue locked and returns the same way, + * but may drop lock temporarily. + */ +static int +kqworkq_begin_processing(struct kqworkq *kqwq, workq_threadreq_t kqr, + int kevent_flags) +{ + int rc = 0; - /* was vanish support requested */ - if (kev->flags & EV_VANISHED) { - kev->flags &= ~EV_VANISHED; - kn->kn_status |= KN_REQVANISH; - } + KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQWQ_PROCESS_BEGIN) | DBG_FUNC_START, + 0, kqr->tr_kq_qos_index); - /* snapshot matching/dispatching protcol flags into knote */ - if (kev->flags & EV_DISPATCH) { - kn->kn_status |= KN_DISPATCH; - } - if (kev->flags & EV_UDATA_SPECIFIC) { - kn->kn_status |= KN_UDATA_SPECIFIC; - } - if (kev->flags & EV_DISABLE) { - kn->kn_status |= KN_DISABLED; - } + rc = kqworkq_acknowledge_events(kqwq, kqr, kevent_flags, + KQWQAE_BEGIN_PROCESSING); - /* - * copy the kevent state into knote - * protocol is that fflags and data - * are saved off, and cleared before - * calling the attach routine. - */ - kn->kn_kevent = *kev; - kn->kn_sfflags = kev->fflags; - kn->kn_sdata = kev->data; - kn->kn_fflags = 0; - kn->kn_data = 0; - knote_reset_priority(kn, kev->qos); + KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQWQ_PROCESS_BEGIN) | DBG_FUNC_END, + thread_tid(kqr_thread(kqr)), kqr->tr_kq_wakeup); - /* Add the knote for lookup thru the fd table */ - error = kq_add_knote(kq, kn, knlc, p); - if (error) { - (void)kqueue_release(kq, KQUEUE_CANT_BE_LAST_REF); - knote_free(kn); - if (knote_fp != NULL) { - fp_drop(p, kev->ident, knote_fp, 0); - } + return rc; +} - if (error == ERESTART) { - goto restart; - } - goto out; - } +static thread_qos_t +kqworkloop_acknowledge_events(struct kqworkloop *kqwl) +{ + kq_index_t qos = THREAD_QOS_UNSPECIFIED; + struct knote *kn, *tmp; - /* fp reference count now applies to knote */ + kqlock_held(kqwl); + TAILQ_FOREACH_SAFE(kn, &kqwl->kqwl_suppressed, kn_tqe, tmp) { /* - * we can't use filter_call() because f_attach can change the filter ops - * for a filter that supports f_extended_codes, so we need to reload - * knote_fops() and not use `fops`. + * If a knote that can adjust QoS is disabled because of the automatic + * behavior of EV_DISPATCH, the knotes should stay suppressed so that + * further overrides keep pushing. */ - result = fops->f_attach(kn, kev); - if (result && !knote_fops(kn)->f_extended_codes) { - result = FILTER_ACTIVE; - } - - kqlock(kq); - - if (kn->kn_flags & EV_ERROR) { - /* - * Failed to attach correctly, so drop. - */ - kn->kn_status &= ~(KN_ATTACHED | KN_ATTACHING); - error = kn->kn_data; - knote_drop(kq, kn, knlc); - result = 0; - goto out; + if (knote_fops(kn)->f_adjusts_qos && (kn->kn_status & KN_DISABLED) && + (kn->kn_status & (KN_STAYACTIVE | KN_DROPPING)) == 0 && + (kn->kn_flags & (EV_DISPATCH | EV_DISABLE)) == EV_DISPATCH) { + qos = MAX(qos, kn->kn_qos_override); + continue; } + knote_unsuppress(kqwl, kn); + } - /* - * end "attaching" phase - now just attached - * - * Mark the thread request overcommit, if appropos - * - * If the attach routine indicated that an - * event is already fired, activate the knote. - */ - kn->kn_status &= ~KN_ATTACHING; - knote_set_qos_overcommit(kn); + return qos; +} - if (result & FILTER_ACTIVE) { - if (result & FILTER_ADJUST_EVENT_QOS_BIT) { - knote_adjust_qos(kq, kn, result); - } - knote_activate(kn); - } - } else if (!knote_lock(kq, kn, knlc, KNOTE_KQ_LOCK_ON_SUCCESS)) { - /* - * The knote was dropped while we were waiting for the lock, - * we need to re-evaluate entirely - */ +static int +kqworkloop_begin_processing(struct kqworkloop *kqwl, unsigned int kevent_flags) +{ + workq_threadreq_t kqr = &kqwl->kqwl_request; + struct kqueue *kq = &kqwl->kqwl_kqueue; + thread_qos_t qos_override; + thread_t thread = kqr_thread_fast(kqr); + int rc = 0, op = KQWL_UTQ_NONE; - goto restart; - } else if (kev->flags & EV_DELETE) { - /* - * Deletion of a knote (drop) - * - * If the filter wants to filter drop events, let it do so. - * - * defer-delete: when trying to delete a disabled EV_DISPATCH2 knote, - * we must wait for the knote to be re-enabled (unless it is being - * re-enabled atomically here). - */ + kqlock_held(kq); - if (knote_fops(kn)->f_allow_drop) { - bool drop; + KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQWL_PROCESS_BEGIN) | DBG_FUNC_START, + kqwl->kqwl_dynamicid, 0, 0); - kqunlock(kq); - drop = knote_fops(kn)->f_allow_drop(kn, kev); - kqlock(kq); + /* nobody else should still be processing */ + assert((kq->kq_state & KQ_PROCESSING) == 0); - if (!drop) { - goto out_unlock; - } - } + kq->kq_state |= KQ_PROCESSING; - if ((kev->flags & EV_ENABLE) == 0 && - (kn->kn_status & (KN_DISPATCH2 | KN_DISABLED)) == - (KN_DISPATCH2 | KN_DISABLED)) { - kn->kn_status |= KN_DEFERDELETE; - error = EINPROGRESS; - goto out_unlock; - } + if (!TAILQ_EMPTY(&kqwl->kqwl_suppressed)) { + op = KQWL_UTQ_RESET_WAKEUP_OVERRIDE; + } - knote_drop(kq, kn, knlc); - goto out; - } else { + if (kevent_flags & KEVENT_FLAG_PARKING) { /* - * Regular update of a knote (touch) - * - * Call touch routine to notify filter of changes in filter values - * (and to re-determine if any events are fired). - * - * If the knote is in defer-delete, avoid calling the filter touch - * routine (it has delivered its last event already). + * When "parking" we want to process events and if no events are found + * unbind. * - * If the touch routine had no failure, - * apply the requested side effects to the knote. + * However, non overcommit threads sometimes park even when they have + * more work so that the pool can narrow. For these, we need to unbind + * early, so that calling kqworkloop_update_threads_qos() can ask the + * workqueue subsystem whether the thread should park despite having + * pending events. */ - - if (kn->kn_status & (KN_DEFERDELETE | KN_VANISHED)) { - if (kev->flags & EV_ENABLE) { - result = FILTER_ACTIVE; - } + if (kqr->tr_flags & WORKQ_TR_FLAG_OVERCOMMIT) { + op = KQWL_UTQ_PARKING; } else { - kqunlock(kq); - result = filter_call(knote_fops(kn), f_touch(kn, kev)); - kqlock(kq); + op = KQWL_UTQ_UNBINDING; } + } + if (op == KQWL_UTQ_NONE) { + goto done; + } - if (kev->flags & EV_ERROR) { - result = 0; + qos_override = kqworkloop_acknowledge_events(kqwl); + + if (op == KQWL_UTQ_UNBINDING) { + kqworkloop_unbind_locked(kqwl, thread, KQWL_OVERRIDE_DROP_IMMEDIATELY); + kqworkloop_release_live(kqwl); + } + kqworkloop_update_threads_qos(kqwl, op, qos_override); + if (op == KQWL_UTQ_PARKING) { + if (!TAILQ_EMPTY(&kqwl->kqwl_queue[KQWL_BUCKET_STAYACTIVE])) { + /* + * We cannot trust tr_kq_wakeup when looking at stay active knotes. + * We need to process once, and kqworkloop_end_processing will + * handle the unbind. + */ + } else if (!kqr->tr_kq_wakeup || kqwl->kqwl_owner) { + kqworkloop_unbind_locked(kqwl, thread, KQWL_OVERRIDE_DROP_DELAYED); + kqworkloop_release_live(kqwl); + rc = -1; + } + } else if (op == KQWL_UTQ_UNBINDING) { + if (kqr_thread(kqr) == thread) { + /* + * The thread request fired again, passed the admission check and + * got bound to the current thread again. + */ } else { - /* accept new kevent state */ - if ((kn->kn_status & KN_UDATA_SPECIFIC) == 0) { - kn->kn_udata = kev->udata; - } - if (kev->flags & EV_DISABLE) { - knote_disable(kn); - } - if (result & (FILTER_UPDATE_REQ_QOS | FILTER_ADJUST_EVENT_QOS_BIT)) { - knote_dequeue(kn); - } - if ((result & FILTER_UPDATE_REQ_QOS) && - kev->qos && kev->qos != kn->kn_qos) { - knote_reset_priority(kn, kev->qos); - } - if (result & FILTER_ACTIVE) { - thread_qos_t qos; - if (result & FILTER_ADJUST_EVENT_QOS_BIT) { - if (knote_should_apply_qos_override(kq, kn, result, &qos)) { - knote_apply_qos_override(kn, qos); - } - } - knote_activate(kn); - } - if (result & (FILTER_UPDATE_REQ_QOS | FILTER_ADJUST_EVENT_QOS_BIT)) { - if (knote_enqueue(kn) && (kn->kn_status & KN_ACTIVE)) { - knote_wakeup(kn); - } - } - if (kev->flags & EV_ENABLE) { - knote_enable(kn); - } + rc = -1; } } -out_unlock: - if ((result & FILTER_REGISTER_WAIT) == 0) { + if (rc == 0) { /* - * When the filter asked for a post-register wait, - * we leave the knote and kqueue locked for kevent_register() - * to call the filter's f_post_register_wait hook. + * Reset wakeup bit to notice stay active events firing while we are + * processing, as we cannot rely on the stayactive bucket emptiness. */ - knote_unlock(kq, kn, knlc, KNOTE_KQ_UNLOCK); + kqwl->kqwl_wakeup_indexes &= ~KQWL_STAYACTIVE_FIRED_BIT; + } else { + kq->kq_state &= ~KQ_PROCESSING; } -out: - /* output local errors through the kevent */ - if (error) { - kev->flags |= EV_ERROR; - kev->data = error; + if (rc == -1) { + kqworkloop_unbind_delayed_override_drop(thread); } - return result; + +done: + KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQWL_PROCESS_BEGIN) | DBG_FUNC_END, + kqwl->kqwl_dynamicid, 0, 0); + + return rc; } /* - * knote_process - process a triggered event - * - * Validate that it is really still a triggered event - * by calling the filter routines (if necessary). Hold - * a use reference on the knote to avoid it being detached. - * - * If it is still considered triggered, we will have taken - * a copy of the state under the filter lock. We use that - * snapshot to dispatch the knote for future processing (or - * not, if this was a lost event). - * - * Our caller assures us that nobody else can be processing - * events from this knote during the whole operation. But - * others can be touching or posting events to the knote - * interspersed with our processing it. + * Return 0 to indicate that processing should proceed, + * -1 if there is nothing to process. + * EBADF if the kqueue is draining * - * caller holds a reference on the kqueue. - * kqueue locked on entry and exit - but may be dropped + * Called with kqueue locked and returns the same way, + * but may drop lock temporarily. + * May block. */ static int -knote_process(struct knote *kn, - kevent_callback_t callback, - void *callback_data, - struct filt_process_s *process_data) +kqfile_begin_processing(struct kqfile *kq) { - struct kevent_internal_s kev; - struct kqueue *kq = knote_get_kq(kn); - KNOTE_LOCK_CTX(knlc); - int result = FILTER_ACTIVE; - int error = 0; - bool drop = false; - - bzero(&kev, sizeof(kev)); + struct kqtailq *suppressq; - /* - * Must be active or stayactive - * Must be queued and not disabled/suppressed - */ - assert(kn->kn_status & KN_QUEUED); - assert(kn->kn_status & (KN_ACTIVE | KN_STAYACTIVE)); - assert(!(kn->kn_status & (KN_DISABLED | KN_SUPPRESSED | KN_DROPPING))); + kqlock_held(kq); - if (kq->kq_state & KQ_WORKLOOP) { - KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQWL_PROCESS), - ((struct kqworkloop *)kq)->kqwl_dynamicid, - kn->kn_udata, kn->kn_status | (kn->kn_id << 32), - kn->kn_filtid); - } else if (kq->kq_state & KQ_WORKQ) { - KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQWQ_PROCESS), - 0, kn->kn_udata, kn->kn_status | (kn->kn_id << 32), - kn->kn_filtid); - } else { - KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQ_PROCESS), - VM_KERNEL_UNSLIDE_OR_PERM(kq), kn->kn_udata, - kn->kn_status | (kn->kn_id << 32), kn->kn_filtid); - } + assert((kq->kqf_state & (KQ_WORKQ | KQ_WORKLOOP)) == 0); + KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQ_PROCESS_BEGIN) | DBG_FUNC_START, + VM_KERNEL_UNSLIDE_OR_PERM(kq), 0); - if ((kn->kn_status & KN_DROPPING) || - !knote_lock(kq, kn, &knlc, KNOTE_KQ_LOCK_ALWAYS)) { - /* - * When the knote is dropping or has dropped, - * then there's nothing we want to process. - */ - return EJUSTRETURN; - } + /* wait to become the exclusive processing thread */ + for (;;) { + if (kq->kqf_state & KQ_DRAIN) { + KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQ_PROCESS_BEGIN) | DBG_FUNC_END, + VM_KERNEL_UNSLIDE_OR_PERM(kq), 2); + return EBADF; + } - /* - * For deferred-drop or vanished events, we just create a fake - * event to acknowledge end-of-life. Otherwise, we call the - * filter's process routine to snapshot the kevent state under - * the filter's locking protocol. - * - * suppress knotes to avoid returning the same event multiple times in - * a single call. - */ - knote_suppress(kn); + if ((kq->kqf_state & KQ_PROCESSING) == 0) { + break; + } - if (kn->kn_status & (KN_DEFERDELETE | KN_VANISHED)) { - /* create fake event */ - kev.filter = kn->kn_filter; - kev.ident = kn->kn_id; - kev.flags = (kn->kn_status & KN_DEFERDELETE) ? EV_DELETE : EV_VANISHED; - kev.flags |= (EV_DISPATCH2 | EV_ONESHOT); - kev.udata = kn->kn_udata; - } else { - /* deactivate - so new activations indicate a wakeup */ - knote_deactivate(kn); + /* if someone else is processing the queue, wait */ + kq->kqf_state |= KQ_PROCWAIT; + suppressq = &kq->kqf_suppressed; + waitq_assert_wait64((struct waitq *)&kq->kqf_wqs, + CAST_EVENT64_T(suppressq), THREAD_UNINT | THREAD_WAIT_NOREPORT, + TIMEOUT_WAIT_FOREVER); kqunlock(kq); - result = filter_call(knote_fops(kn), f_process(kn, process_data, &kev)); + thread_block(THREAD_CONTINUE_NULL); kqlock(kq); } - /* - * Determine how to dispatch the knote for future event handling. - * not-fired: just return (do not callout, leave deactivated). - * One-shot: If dispatch2, enter deferred-delete mode (unless this is - * is the deferred delete event delivery itself). Otherwise, - * drop it. - * Dispatch: don't clear state, just mark it disabled. - * Cleared: just leave it deactivated. - * Others: re-activate as there may be more events to handle. - * This will not wake up more handlers right now, but - * at the completion of handling events it may trigger - * more handler threads (TODO: optimize based on more than - * just this one event being detected by the filter). - */ - if ((result & FILTER_ACTIVE) == 0) { - if ((kn->kn_status & (KN_ACTIVE | KN_STAYACTIVE)) == 0) { - /* - * Stay active knotes should not be unsuppressed or we'd create an - * infinite loop. - * - * Some knotes (like EVFILT_WORKLOOP) can be reactivated from - * within f_process() but that doesn't necessarily make them - * ready to process, so we should leave them be. - * - * For other knotes, since we will not return an event, - * there's no point keeping the knote suppressed. - */ - knote_unsuppress(kn); - } - knote_unlock(kq, kn, &knlc, KNOTE_KQ_LOCK_ALWAYS); - return EJUSTRETURN; - } + /* Nobody else processing */ - if (result & FILTER_ADJUST_EVENT_QOS_BIT) { - knote_adjust_qos(kq, kn, result); - } - kev.qos = _pthread_priority_combine(kn->kn_qos, kn->kn_qos_override); + /* clear pre-posts and KQ_WAKEUP now, in case we bail early */ + waitq_set_clear_preposts(&kq->kqf_wqs); + kq->kqf_state &= ~KQ_WAKEUP; - if (kev.flags & EV_ONESHOT) { - if ((kn->kn_status & (KN_DISPATCH2 | KN_DEFERDELETE)) == KN_DISPATCH2) { - /* defer dropping non-delete oneshot dispatch2 events */ - kn->kn_status |= KN_DEFERDELETE; - knote_disable(kn); - } else { - drop = true; - } - } else if (kn->kn_status & KN_DISPATCH) { - /* disable all dispatch knotes */ - knote_disable(kn); - } else if ((kev.flags & EV_CLEAR) == 0) { - /* re-activate in case there are more events */ - knote_activate(kn); + /* anything left to process? */ + if (TAILQ_EMPTY(&kq->kqf_queue)) { + KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQ_PROCESS_BEGIN) | DBG_FUNC_END, + VM_KERNEL_UNSLIDE_OR_PERM(kq), 1); + return -1; } - /* - * callback to handle each event as we find it. - * If we have to detach and drop the knote, do - * it while we have the kq unlocked. - */ - if (drop) { - knote_drop(kq, kn, &knlc); - } else { - knote_unlock(kq, kn, &knlc, KNOTE_KQ_UNLOCK); - } + /* convert to processing mode */ + kq->kqf_state |= KQ_PROCESSING; - if (kev.flags & EV_VANISHED) { - KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KNOTE_VANISHED), - kev.ident, kn->kn_udata, kn->kn_status | (kn->kn_id << 32), - kn->kn_filtid); + KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQ_PROCESS_BEGIN) | DBG_FUNC_END, + VM_KERNEL_UNSLIDE_OR_PERM(kq)); + + return 0; +} + +/* + * Try to end the processing, only called when a workq thread is attempting to + * park (KEVENT_FLAG_PARKING is set). + * + * When returning -1, the kqworkq is setup again so that it is ready to be + * processed. + */ +static int +kqworkq_end_processing(struct kqworkq *kqwq, workq_threadreq_t kqr, + int kevent_flags) +{ + if (!TAILQ_EMPTY(&kqwq->kqwq_queue[kqr->tr_kq_qos_index])) { + /* remember we didn't process everything */ + kqr->tr_kq_wakeup = true; + } + + if (kevent_flags & KEVENT_FLAG_PARKING) { + /* + * if acknowledge events "succeeds" it means there are events, + * which is a failure condition for end_processing. + */ + int rc = kqworkq_acknowledge_events(kqwq, kqr, kevent_flags, + KQWQAE_END_PROCESSING); + if (rc == 0) { + return -1; + } } - error = (callback)(kq, &kev, callback_data); - kqlock(kq); - return error; + return 0; } /* - * Returns -1 if the kqueue was unbound and processing should not happen + * Try to end the processing, only called when a workq thread is attempting to + * park (KEVENT_FLAG_PARKING is set). + * + * When returning -1, the kqworkq is setup again so that it is ready to be + * processed (as if kqworkloop_begin_processing had just been called). + * + * If successful and KEVENT_FLAG_PARKING was set in the kevent_flags, + * the kqworkloop is unbound from its servicer as a side effect. */ -#define KQWQAE_BEGIN_PROCESSING 1 -#define KQWQAE_END_PROCESSING 2 -#define KQWQAE_UNBIND 3 static int -kqworkq_acknowledge_events(struct kqworkq *kqwq, struct kqrequest *kqr, - int kevent_flags, int kqwqae_op) +kqworkloop_end_processing(struct kqworkloop *kqwl, int flags, int kevent_flags) { - thread_qos_t old_override = THREAD_QOS_UNSPECIFIED; - thread_t thread = kqr->kqr_thread; - struct knote *kn; + struct kqueue *kq = &kqwl->kqwl_kqueue; + workq_threadreq_t kqr = &kqwl->kqwl_request; + thread_qos_t qos_override; + thread_t thread = kqr_thread_fast(kqr); int rc = 0; - bool seen_stayactive = false, unbind; - - kqlock_held(&kqwq->kqwq_kqueue); - - if (!TAILQ_EMPTY(&kqr->kqr_suppressed)) { - /* - * Return suppressed knotes to their original state. - * For workq kqueues, suppressed ones that are still - * truly active (not just forced into the queue) will - * set flags we check below to see if anything got - * woken up. - */ - while ((kn = TAILQ_FIRST(&kqr->kqr_suppressed)) != NULL) { - assert(kn->kn_status & KN_SUPPRESSED); - knote_unsuppress(kn); - if (kn->kn_status & KN_STAYACTIVE) { - seen_stayactive = true; - } - } - } - kq_req_lock(kqwq); + kqlock_held(kq); -#if DEBUG || DEVELOPMENT - thread_t self = current_thread(); - struct uthread *ut = get_bsdthread_info(self); + KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQWL_PROCESS_END) | DBG_FUNC_START, + kqwl->kqwl_dynamicid, 0, 0); - assert(kqr->kqr_state & KQR_THREQUESTED); - assert(kqr->kqr_thread == self); - assert(ut->uu_kqr_bound == kqr); -#endif // DEBUG || DEVELOPMENT + if (flags & KQ_PROCESSING) { + assert(kq->kq_state & KQ_PROCESSING); - if (kqwqae_op == KQWQAE_UNBIND) { - unbind = true; - } else if ((kevent_flags & KEVENT_FLAG_PARKING) == 0) { - unbind = false; - } else if (kqwqae_op == KQWQAE_BEGIN_PROCESSING && seen_stayactive) { - /* - * When we unsuppress stayactive knotes, for the kind that are hooked - * through select, we need to process once before we can assert there's - * no event pending. Hence we can't unbind during BEGIN PROCESSING. - */ - unbind = false; - } else { - unbind = ((kqr->kqr_state & KQR_WAKEUP) == 0); - } - if (unbind) { - old_override = kqworkq_unbind_locked(kqwq, kqr, thread); - rc = -1; /* - * request a new thread if we didn't process the whole queue or real events - * have happened (not just putting stay-active events back). + * If we still have queued stayactive knotes, remember we didn't finish + * processing all of them. This should be extremely rare and would + * require to have a lot of them registered and fired. */ - if (kqr->kqr_state & KQR_WAKEUP) { - kqueue_threadreq_initiate(&kqwq->kqwq_kqueue, kqr, - kqr->kqr_qos_index, 0); + if (!TAILQ_EMPTY(&kqwl->kqwl_queue[KQWL_BUCKET_STAYACTIVE])) { + kqworkloop_update_threads_qos(kqwl, KQWL_UTQ_UPDATE_WAKEUP_QOS, + KQWL_BUCKET_STAYACTIVE); } - } - if (rc == 0) { /* - * Reset wakeup bit to notice events firing while we are processing, - * as we cannot rely on the bucket queue emptiness because of stay - * active knotes. + * When KEVENT_FLAG_PARKING is set, we need to attempt an unbind while + * still under the lock. + * + * So we do everything kqworkloop_unbind() would do, but because we're + * inside kqueue_process(), if the workloop actually received events + * while our locks were dropped, we have the opportunity to fail the end + * processing and loop again. + * + * This avoids going through the process-wide workqueue lock hence + * scales better. */ - kqr->kqr_state &= ~KQR_WAKEUP; + if (kevent_flags & KEVENT_FLAG_PARKING) { + qos_override = kqworkloop_acknowledge_events(kqwl); + } } - kq_req_unlock(kqwq); + if (kevent_flags & KEVENT_FLAG_PARKING) { + kqworkloop_update_threads_qos(kqwl, KQWL_UTQ_PARKING, qos_override); + if (kqr->tr_kq_wakeup && !kqwl->kqwl_owner) { + /* + * Reset wakeup bit to notice stay active events firing while we are + * processing, as we cannot rely on the stayactive bucket emptiness. + */ + kqwl->kqwl_wakeup_indexes &= ~KQWL_STAYACTIVE_FIRED_BIT; + rc = -1; + } else { + kqworkloop_unbind_locked(kqwl, thread, KQWL_OVERRIDE_DROP_DELAYED); + kqworkloop_release_live(kqwl); + kq->kq_state &= ~flags; + } + } else { + kq->kq_state &= ~flags; + kq->kq_state |= KQ_R2K_ARMED; + kqworkloop_update_threads_qos(kqwl, KQWL_UTQ_RECOMPUTE_WAKEUP_QOS, 0); + } - if (old_override) { - thread_drop_ipc_override(thread); + if ((kevent_flags & KEVENT_FLAG_PARKING) && rc == 0) { + kqworkloop_unbind_delayed_override_drop(thread); } + KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQWL_PROCESS_END) | DBG_FUNC_END, + kqwl->kqwl_dynamicid, 0, 0); + return rc; } /* - * Return 0 to indicate that processing should proceed, - * -1 if there is nothing to process. + * Called with kqueue lock held. * - * Called with kqueue locked and returns the same way, - * but may drop lock temporarily. + * 0: no more events + * -1: has more events + * EBADF: kqueue is in draining mode */ static int -kqworkq_begin_processing(struct kqworkq *kqwq, struct kqrequest *kqr, - int kevent_flags) +kqfile_end_processing(struct kqfile *kq) { - int rc = 0; + struct kqtailq *suppressq = &kq->kqf_suppressed; + struct knote *kn; + int procwait; - KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQWQ_PROCESS_BEGIN) | DBG_FUNC_START, - 0, kqr->kqr_qos_index); + kqlock_held(kq); - rc = kqworkq_acknowledge_events(kqwq, kqr, kevent_flags, - KQWQAE_BEGIN_PROCESSING); + assert((kq->kqf_state & (KQ_WORKQ | KQ_WORKLOOP)) == 0); - KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQWQ_PROCESS_BEGIN) | DBG_FUNC_END, - thread_tid(kqr->kqr_thread), kqr->kqr_state); + KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQ_PROCESS_END), + VM_KERNEL_UNSLIDE_OR_PERM(kq), 0); - return rc; -} + /* + * Return suppressed knotes to their original state. + */ + while ((kn = TAILQ_FIRST(suppressq)) != NULL) { + assert(kn->kn_status & KN_SUPPRESSED); + knote_unsuppress(kq, kn); + } -static inline bool -kqworkloop_is_processing_on_current_thread(struct kqworkloop *kqwl) -{ - struct kqueue *kq = &kqwl->kqwl_kqueue; + procwait = (kq->kqf_state & KQ_PROCWAIT); + kq->kqf_state &= ~(KQ_PROCESSING | KQ_PROCWAIT); - kqlock_held(kq); + if (procwait) { + /* first wake up any thread already waiting to process */ + waitq_wakeup64_all((struct waitq *)&kq->kqf_wqs, + CAST_EVENT64_T(suppressq), THREAD_AWAKENED, WAITQ_ALL_PRIORITIES); + } - if (kq->kq_state & KQ_PROCESSING) { - /* - * KQ_PROCESSING is unset with the kqlock held, and the kqr thread is - * never modified while KQ_PROCESSING is set, meaning that peeking at - * its value is safe from this context. - */ - return kqwl->kqwl_request.kqr_thread == current_thread(); + if (kq->kqf_state & KQ_DRAIN) { + return EBADF; } - return false; + return (kq->kqf_state & KQ_WAKEUP) ? -1 : 0; } -static thread_qos_t -kqworkloop_acknowledge_events(struct kqworkloop *kqwl) +static int +kqueue_workloop_ctl_internal(proc_t p, uintptr_t cmd, uint64_t __unused options, + struct kqueue_workloop_params *params, int *retval) { - struct kqrequest *kqr = &kqwl->kqwl_request; - kq_index_t qos = THREAD_QOS_UNSPECIFIED; - struct knote *kn, *tmp; + int error = 0; + struct kqworkloop *kqwl; + struct filedesc *fdp = p->p_fd; + workq_threadreq_param_t trp = { }; + + switch (cmd) { + case KQ_WORKLOOP_CREATE: + if (!params->kqwlp_flags) { + error = EINVAL; + break; + } - kqlock_held(&kqwl->kqwl_kqueue); + if ((params->kqwlp_flags & KQ_WORKLOOP_CREATE_SCHED_PRI) && + (params->kqwlp_sched_pri < 1 || + params->kqwlp_sched_pri > 63 /* MAXPRI_USER */)) { + error = EINVAL; + break; + } - TAILQ_FOREACH_SAFE(kn, &kqr->kqr_suppressed, kn_tqe, tmp) { - /* - * If a knote that can adjust QoS is disabled because of the automatic - * behavior of EV_DISPATCH, the knotes should stay suppressed so that - * further overrides keep pushing. - */ - if (knote_fops(kn)->f_adjusts_qos && (kn->kn_status & KN_DISABLED) && - (kn->kn_status & (KN_STAYACTIVE | KN_DROPPING)) == 0 && - (kn->kn_flags & (EV_DISPATCH | EV_DISABLE)) == EV_DISPATCH) { - qos = MAX(qos, knote_get_qos_override_index(kn)); - continue; + if ((params->kqwlp_flags & KQ_WORKLOOP_CREATE_SCHED_POL) && + invalid_policy(params->kqwlp_sched_pol)) { + error = EINVAL; + break; + } + + if ((params->kqwlp_flags & KQ_WORKLOOP_CREATE_CPU_PERCENT) && + (params->kqwlp_cpu_percent <= 0 || + params->kqwlp_cpu_percent > 100 || + params->kqwlp_cpu_refillms <= 0 || + params->kqwlp_cpu_refillms > 0x00ffffff)) { + error = EINVAL; + break; } - knote_unsuppress(kn); + + if (params->kqwlp_flags & KQ_WORKLOOP_CREATE_SCHED_PRI) { + trp.trp_flags |= TRP_PRIORITY; + trp.trp_pri = params->kqwlp_sched_pri; + } + if (params->kqwlp_flags & KQ_WORKLOOP_CREATE_SCHED_POL) { + trp.trp_flags |= TRP_POLICY; + trp.trp_pol = params->kqwlp_sched_pol; + } + if (params->kqwlp_flags & KQ_WORKLOOP_CREATE_CPU_PERCENT) { + trp.trp_flags |= TRP_CPUPERCENT; + trp.trp_cpupercent = (uint8_t)params->kqwlp_cpu_percent; + trp.trp_refillms = params->kqwlp_cpu_refillms; + } + + error = kqworkloop_get_or_create(p, params->kqwlp_id, &trp, + KEVENT_FLAG_DYNAMIC_KQUEUE | KEVENT_FLAG_WORKLOOP | + KEVENT_FLAG_DYNAMIC_KQ_MUST_NOT_EXIST, &kqwl); + if (error) { + break; + } + + if (!(fdp->fd_flags & FD_WORKLOOP)) { + /* FD_WORKLOOP indicates we've ever created a workloop + * via this syscall but its only ever added to a process, never + * removed. + */ + proc_fdlock(p); + fdp->fd_flags |= FD_WORKLOOP; + proc_fdunlock(p); + } + break; + case KQ_WORKLOOP_DESTROY: + error = kqworkloop_get_or_create(p, params->kqwlp_id, NULL, + KEVENT_FLAG_DYNAMIC_KQUEUE | KEVENT_FLAG_WORKLOOP | + KEVENT_FLAG_DYNAMIC_KQ_MUST_EXIST, &kqwl); + if (error) { + break; + } + kqlock(kqwl); + trp.trp_value = kqwl->kqwl_params; + if (trp.trp_flags && !(trp.trp_flags & TRP_RELEASED)) { + trp.trp_flags |= TRP_RELEASED; + kqworkloop_release_live(kqwl); + } else { + error = EINVAL; + } + kqunlock(kqwl); + kqworkloop_release(kqwl); + break; + } + *retval = 0; + return error; +} + +int +kqueue_workloop_ctl(proc_t p, struct kqueue_workloop_ctl_args *uap, int *retval) +{ + struct kqueue_workloop_params params = { + .kqwlp_id = 0, + }; + if (uap->sz < sizeof(params.kqwlp_version)) { + return EINVAL; + } + + size_t copyin_sz = MIN(sizeof(params), uap->sz); + int rv = copyin(uap->addr, ¶ms, copyin_sz); + if (rv) { + return rv; + } + + if (params.kqwlp_version != (int)uap->sz) { + return EINVAL; } - return qos; + return kqueue_workloop_ctl_internal(p, uap->cmd, uap->options, ¶ms, + retval); } +/*ARGSUSED*/ static int -kqworkloop_begin_processing(struct kqworkloop *kqwl, unsigned int kevent_flags) +kqueue_select(struct fileproc *fp, int which, void *wq_link_id, + __unused vfs_context_t ctx) { - struct kqrequest *kqr = &kqwl->kqwl_request; - struct kqueue *kq = &kqwl->kqwl_kqueue; - thread_qos_t old_override = THREAD_QOS_UNSPECIFIED, qos_override; - thread_t thread = kqr->kqr_thread; - int rc = 0, op = KQWL_UTQ_NONE; + struct kqfile *kq = (struct kqfile *)fp->f_data; + struct kqtailq *suppressq = &kq->kqf_suppressed; + struct kqtailq *queue = &kq->kqf_queue; + struct knote *kn; + int retnum = 0; - kqlock_held(kq); + if (which != FREAD) { + return 0; + } - KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQWL_PROCESS_BEGIN) | DBG_FUNC_START, - kqwl->kqwl_dynamicid, 0, 0); + kqlock(kq); - /* nobody else should still be processing */ - assert((kq->kq_state & KQ_PROCESSING) == 0); + assert((kq->kqf_state & KQ_WORKQ) == 0); - kq->kq_state |= KQ_PROCESSING; + /* + * If this is the first pass, link the wait queue associated with the + * the kqueue onto the wait queue set for the select(). Normally we + * use selrecord() for this, but it uses the wait queue within the + * selinfo structure and we need to use the main one for the kqueue to + * catch events from KN_STAYQUEUED sources. So we do the linkage manually. + * (The select() call will unlink them when it ends). + */ + if (wq_link_id != NULL) { + thread_t cur_act = current_thread(); + struct uthread * ut = get_bsdthread_info(cur_act); - if (!TAILQ_EMPTY(&kqr->kqr_suppressed)) { - op = KQWL_UTQ_RESET_WAKEUP_OVERRIDE; - } + kq->kqf_state |= KQ_SEL; + waitq_link((struct waitq *)&kq->kqf_wqs, ut->uu_wqset, + WAITQ_SHOULD_LOCK, (uint64_t *)wq_link_id); + + /* always consume the reserved link object */ + waitq_link_release(*(uint64_t *)wq_link_id); + *(uint64_t *)wq_link_id = 0; - if (kevent_flags & KEVENT_FLAG_PARKING) { /* - * When "parking" we want to process events and if no events are found - * unbind. - * - * However, non overcommit threads sometimes park even when they have - * more work so that the pool can narrow. For these, we need to unbind - * early, so that calling kqworkloop_update_threads_qos() can ask the - * workqueue subsystem whether the thread should park despite having - * pending events. + * selprocess() is expecting that we send it back the waitq + * that was just added to the thread's waitq set. In order + * to not change the selrecord() API (which is exported to + * kexts), we pass this value back through the + * void *wq_link_id pointer we were passed. We need to use + * memcpy here because the pointer may not be properly aligned + * on 32-bit systems. */ - if (kqr->kqr_state & KQR_THOVERCOMMIT) { - op = KQWL_UTQ_PARKING; - } else { - op = KQWL_UTQ_UNBINDING; - } - } - if (op == KQWL_UTQ_NONE) { - goto done; + void *wqptr = &kq->kqf_wqs; + memcpy(wq_link_id, (void *)&wqptr, sizeof(void *)); } - qos_override = kqworkloop_acknowledge_events(kqwl); - - kq_req_lock(kqwl); - - if (op == KQWL_UTQ_UNBINDING) { - old_override = kqworkloop_unbind_locked(kqwl, thread); - (void)kqueue_release(kqwl, KQUEUE_CANT_BE_LAST_REF); + if (kqfile_begin_processing(kq) == -1) { + kqunlock(kq); + return 0; } - kqworkloop_update_threads_qos(kqwl, op, qos_override); - if (op == KQWL_UTQ_PARKING) { - if (!TAILQ_EMPTY(&kqwl->kqwl_queue[KQWL_BUCKET_STAYACTIVE])) { - /* - * We cannot trust KQR_WAKEUP when looking at stay active knotes. - * We need to process once, and kqworkloop_end_processing will - * handle the unbind. - */ - } else if ((kqr->kqr_state & KQR_WAKEUP) == 0 || kqwl->kqwl_owner) { - old_override = kqworkloop_unbind_locked(kqwl, thread); - (void)kqueue_release(kqwl, KQUEUE_CANT_BE_LAST_REF); - rc = -1; - } - } else if (op == KQWL_UTQ_UNBINDING) { - if (kqr->kqr_thread == thread) { - /* - * The thread request fired again, passed the admission check and - * got bound to the current thread again. - */ - } else { - rc = -1; + + if (!TAILQ_EMPTY(queue)) { + /* + * there is something queued - but it might be a + * KN_STAYACTIVE knote, which may or may not have + * any events pending. Otherwise, we have to walk + * the list of knotes to see, and peek at the + * (non-vanished) stay-active ones to be really sure. + */ + while ((kn = (struct knote *)TAILQ_FIRST(queue)) != NULL) { + if (kn->kn_status & KN_ACTIVE) { + retnum = 1; + goto out; + } + assert(kn->kn_status & KN_STAYACTIVE); + knote_suppress(kq, kn); } - } - if (rc == 0) { /* - * Reset wakeup bit to notice stay active events firing while we are - * processing, as we cannot rely on the stayactive bucket emptiness. + * There were no regular events on the queue, so take + * a deeper look at the stay-queued ones we suppressed. */ - kqr->kqr_wakeup_indexes &= ~KQWL_STAYACTIVE_FIRED_BIT; - } else { - kq->kq_state &= ~KQ_PROCESSING; - } + while ((kn = (struct knote *)TAILQ_FIRST(suppressq)) != NULL) { + KNOTE_LOCK_CTX(knlc); + int result = 0; - kq_req_unlock(kqwl); + /* If didn't vanish while suppressed - peek at it */ + if ((kn->kn_status & KN_DROPPING) || !knote_lock(kq, kn, &knlc, + KNOTE_KQ_LOCK_ON_FAILURE)) { + continue; + } - if (old_override) { - thread_drop_ipc_override(thread); - } + result = filter_call(knote_fops(kn), f_peek(kn)); -done: - KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQWL_PROCESS_BEGIN) | DBG_FUNC_END, - kqwl->kqwl_dynamicid, 0, 0); + kqlock(kq); + knote_unlock(kq, kn, &knlc, KNOTE_KQ_LOCK_ALWAYS); - return rc; + /* unsuppress it */ + knote_unsuppress(kq, kn); + + /* has data or it has to report a vanish */ + if (result & FILTER_ACTIVE) { + retnum = 1; + goto out; + } + } + } + +out: + kqfile_end_processing(kq); + kqunlock(kq); + return retnum; } /* - * Return 0 to indicate that processing should proceed, - * -1 if there is nothing to process. - * - * Called with kqueue locked and returns the same way, - * but may drop lock temporarily. - * May block. + * kqueue_close - */ +/*ARGSUSED*/ static int -kqfile_begin_processing(struct kqueue *kq) +kqueue_close(struct fileglob *fg, __unused vfs_context_t ctx) { - struct kqtailq *suppressq; - - kqlock_held(kq); - - assert((kq->kq_state & (KQ_WORKQ | KQ_WORKLOOP)) == 0); - KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQ_PROCESS_BEGIN) | DBG_FUNC_START, - VM_KERNEL_UNSLIDE_OR_PERM(kq), 0); - - /* wait to become the exclusive processing thread */ - for (;;) { - if (kq->kq_state & KQ_DRAIN) { - KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQ_PROCESS_BEGIN) | DBG_FUNC_END, - VM_KERNEL_UNSLIDE_OR_PERM(kq), 2); - return -1; - } - - if ((kq->kq_state & KQ_PROCESSING) == 0) { - break; - } - - /* if someone else is processing the queue, wait */ - kq->kq_state |= KQ_PROCWAIT; - suppressq = kqueue_get_suppressed_queue(kq, NULL); - waitq_assert_wait64((struct waitq *)&kq->kq_wqs, - CAST_EVENT64_T(suppressq), THREAD_UNINT | THREAD_WAIT_NOREPORT, - TIMEOUT_WAIT_FOREVER); - - kqunlock(kq); - thread_block(THREAD_CONTINUE_NULL); - kqlock(kq); - } - - /* Nobody else processing */ - - /* clear pre-posts and KQ_WAKEUP now, in case we bail early */ - waitq_set_clear_preposts(&kq->kq_wqs); - kq->kq_state &= ~KQ_WAKEUP; - - /* anything left to process? */ - if (kqueue_queue_empty(kq, QOS_INDEX_KQFILE)) { - KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQ_PROCESS_BEGIN) | DBG_FUNC_END, - VM_KERNEL_UNSLIDE_OR_PERM(kq), 1); - return -1; - } - - /* convert to processing mode */ - kq->kq_state |= KQ_PROCESSING; - - KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQ_PROCESS_BEGIN) | DBG_FUNC_END, - VM_KERNEL_UNSLIDE_OR_PERM(kq)); + struct kqfile *kqf = (struct kqfile *)fg->fg_data; + assert((kqf->kqf_state & KQ_WORKQ) == 0); + kqueue_dealloc(&kqf->kqf_kqueue); + fg->fg_data = NULL; return 0; } /* - * Try to end the processing, only called when a workq thread is attempting to - * park (KEVENT_FLAG_PARKING is set). - * - * When returning -1, the kqworkq is setup again so that it is ready to be - * processed. + * Max depth of the nested kq path that can be created. + * Note that this has to be less than the size of kq_level + * to avoid wrapping around and mislabeling the level. + */ +#define MAX_NESTED_KQ 1000 + +/*ARGSUSED*/ +/* + * The callers has taken a use-count reference on this kqueue and will donate it + * to the kqueue we are being added to. This keeps the kqueue from closing until + * that relationship is torn down. */ static int -kqworkq_end_processing(struct kqworkq *kqwq, struct kqrequest *kqr, - int kevent_flags) +kqueue_kqfilter(struct fileproc *fp, struct knote *kn, + __unused struct kevent_qos_s *kev) { - if (!kqueue_queue_empty(&kqwq->kqwq_kqueue, kqr->kqr_qos_index)) { - /* remember we didn't process everything */ - kq_req_lock(kqwq); - kqr->kqr_state |= KQR_WAKEUP; - kq_req_unlock(kqwq); + struct kqfile *kqf = (struct kqfile *)fp->f_data; + struct kqueue *kq = &kqf->kqf_kqueue; + struct kqueue *parentkq = knote_get_kq(kn); + + assert((kqf->kqf_state & KQ_WORKQ) == 0); + + if (parentkq == kq || kn->kn_filter != EVFILT_READ) { + knote_set_error(kn, EINVAL); + return 0; } - if (kevent_flags & KEVENT_FLAG_PARKING) { - /* - * if acknowledge events "succeeds" it means there are events, - * which is a failure condition for end_processing. - */ - int rc = kqworkq_acknowledge_events(kqwq, kqr, kevent_flags, - KQWQAE_END_PROCESSING); - if (rc == 0) { - return -1; + /* + * We have to avoid creating a cycle when nesting kqueues + * inside another. Rather than trying to walk the whole + * potential DAG of nested kqueues, we just use a simple + * ceiling protocol. When a kqueue is inserted into another, + * we check that the (future) parent is not already nested + * into another kqueue at a lower level than the potenial + * child (because it could indicate a cycle). If that test + * passes, we just mark the nesting levels accordingly. + * + * Only up to MAX_NESTED_KQ can be nested. + * + * Note: kqworkq and kqworkloop cannot be nested and have reused their + * kq_level field, so ignore these as parent. + */ + + kqlock(parentkq); + + if ((parentkq->kq_state & (KQ_WORKQ | KQ_WORKLOOP)) == 0) { + if (parentkq->kq_level > 0 && + parentkq->kq_level < kq->kq_level) { + kqunlock(parentkq); + knote_set_error(kn, EINVAL); + return 0; + } + + /* set parent level appropriately */ + uint16_t plevel = (parentkq->kq_level == 0)? 2: parentkq->kq_level; + if (plevel < kq->kq_level + 1) { + if (kq->kq_level + 1 > MAX_NESTED_KQ) { + kqunlock(parentkq); + knote_set_error(kn, EINVAL); + return 0; + } + plevel = kq->kq_level + 1; } + + parentkq->kq_level = plevel; } - return 0; + kqunlock(parentkq); + + kn->kn_filtid = EVFILTID_KQREAD; + kqlock(kq); + KNOTE_ATTACH(&kqf->kqf_sel.si_note, kn); + /* indicate nesting in child, if needed */ + if (kq->kq_level == 0) { + kq->kq_level = 1; + } + + int count = kq->kq_count; + kqunlock(kq); + return count > 0; } /* - * Try to end the processing, only called when a workq thread is attempting to - * park (KEVENT_FLAG_PARKING is set). - * - * When returning -1, the kqworkq is setup again so that it is ready to be - * processed (as if kqworkloop_begin_processing had just been called). - * - * If successful and KEVENT_FLAG_PARKING was set in the kevent_flags, - * the kqworkloop is unbound from its servicer as a side effect. + * kqueue_drain - called when kq is closed */ +/*ARGSUSED*/ static int -kqworkloop_end_processing(struct kqworkloop *kqwl, int flags, int kevent_flags) +kqueue_drain(struct fileproc *fp, __unused vfs_context_t ctx) { - struct kqueue *kq = &kqwl->kqwl_kqueue; - struct kqrequest *kqr = &kqwl->kqwl_request; - thread_qos_t old_override = THREAD_QOS_UNSPECIFIED, qos_override; - thread_t thread = kqr->kqr_thread; - int rc = 0; + struct kqfile *kqf = (struct kqfile *)fp->f_fglob->fg_data; - kqlock_held(kq); + assert((kqf->kqf_state & KQ_WORKQ) == 0); - KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQWL_PROCESS_END) | DBG_FUNC_START, - kqwl->kqwl_dynamicid, 0, 0); + kqlock(kqf); + kqf->kqf_state |= KQ_DRAIN; - if (flags & KQ_PROCESSING) { - assert(kq->kq_state & KQ_PROCESSING); + /* wakeup sleeping threads */ + if ((kqf->kqf_state & (KQ_SLEEP | KQ_SEL)) != 0) { + kqf->kqf_state &= ~(KQ_SLEEP | KQ_SEL); + (void)waitq_wakeup64_all((struct waitq *)&kqf->kqf_wqs, + KQ_EVENT, + THREAD_RESTART, + WAITQ_ALL_PRIORITIES); + } - /* - * If we still have queued stayactive knotes, remember we didn't finish - * processing all of them. This should be extremely rare and would - * require to have a lot of them registered and fired. - */ - if (!TAILQ_EMPTY(&kqwl->kqwl_queue[KQWL_BUCKET_STAYACTIVE])) { - kq_req_lock(kqwl); - kqworkloop_update_threads_qos(kqwl, KQWL_UTQ_UPDATE_WAKEUP_QOS, - KQWL_BUCKET_STAYACTIVE); - kq_req_unlock(kqwl); - } + /* wakeup threads waiting their turn to process */ + if (kqf->kqf_state & KQ_PROCWAIT) { + assert(kqf->kqf_state & KQ_PROCESSING); - /* - * When KEVENT_FLAG_PARKING is set, we need to attempt an unbind while - * still under the lock. - * - * So we do everything kqworkloop_unbind() would do, but because we're - * inside kqueue_process(), if the workloop actually received events - * while our locks were dropped, we have the opportunity to fail the end - * processing and loop again. - * - * This avoids going through the process-wide workqueue lock hence - * scales better. - */ - if (kevent_flags & KEVENT_FLAG_PARKING) { - qos_override = kqworkloop_acknowledge_events(kqwl); - } + kqf->kqf_state &= ~KQ_PROCWAIT; + (void)waitq_wakeup64_all((struct waitq *)&kqf->kqf_wqs, + CAST_EVENT64_T(&kqf->kqf_suppressed), + THREAD_RESTART, WAITQ_ALL_PRIORITIES); } - kq_req_lock(kqwl); + kqunlock(kqf); + return 0; +} + +/*ARGSUSED*/ +int +kqueue_stat(struct kqueue *kq, void *ub, int isstat64, proc_t p) +{ + assert((kq->kq_state & KQ_WORKQ) == 0); - if (kevent_flags & KEVENT_FLAG_PARKING) { - kqworkloop_update_threads_qos(kqwl, KQWL_UTQ_PARKING, qos_override); - if ((kqr->kqr_state & KQR_WAKEUP) && !kqwl->kqwl_owner) { - /* - * Reset wakeup bit to notice stay active events firing while we are - * processing, as we cannot rely on the stayactive bucket emptiness. - */ - kqr->kqr_wakeup_indexes &= ~KQWL_STAYACTIVE_FIRED_BIT; - rc = -1; + kqlock(kq); + if (isstat64 != 0) { + struct stat64 *sb64 = (struct stat64 *)ub; + + bzero((void *)sb64, sizeof(*sb64)); + sb64->st_size = kq->kq_count; + if (kq->kq_state & KQ_KEV_QOS) { + sb64->st_blksize = sizeof(struct kevent_qos_s); + } else if (kq->kq_state & KQ_KEV64) { + sb64->st_blksize = sizeof(struct kevent64_s); + } else if (IS_64BIT_PROCESS(p)) { + sb64->st_blksize = sizeof(struct user64_kevent); } else { - old_override = kqworkloop_unbind_locked(kqwl, thread); - (void)kqueue_release(kqwl, KQUEUE_CANT_BE_LAST_REF); - kq->kq_state &= ~flags; + sb64->st_blksize = sizeof(struct user32_kevent); } + sb64->st_mode = S_IFIFO; } else { - kq->kq_state &= ~flags; - kqr->kqr_state |= KQR_R2K_NOTIF_ARMED; - kqworkloop_update_threads_qos(kqwl, KQWL_UTQ_RECOMPUTE_WAKEUP_QOS, 0); - } - - kq_req_unlock(kqwl); + struct stat *sb = (struct stat *)ub; - if (old_override) { - thread_drop_ipc_override(thread); + bzero((void *)sb, sizeof(*sb)); + sb->st_size = kq->kq_count; + if (kq->kq_state & KQ_KEV_QOS) { + sb->st_blksize = sizeof(struct kevent_qos_s); + } else if (kq->kq_state & KQ_KEV64) { + sb->st_blksize = sizeof(struct kevent64_s); + } else if (IS_64BIT_PROCESS(p)) { + sb->st_blksize = sizeof(struct user64_kevent); + } else { + sb->st_blksize = sizeof(struct user32_kevent); + } + sb->st_mode = S_IFIFO; } + kqunlock(kq); + return 0; +} - KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQWL_PROCESS_END) | DBG_FUNC_END, - kqwl->kqwl_dynamicid, 0, 0); - - return rc; +static inline bool +kqueue_threadreq_can_use_ast(struct kqueue *kq) +{ + if (current_proc() == kq->kq_p) { + /* + * Setting an AST from a non BSD syscall is unsafe: mach_msg_trap() can + * do combined send/receive and in the case of self-IPC, the AST may bet + * set on a thread that will not return to userspace and needs the + * thread the AST would create to unblock itself. + * + * At this time, we really want to target: + * + * - kevent variants that can cause thread creations, and dispatch + * really only uses kevent_qos and kevent_id, + * + * - workq_kernreturn (directly about thread creations) + * + * - bsdthread_ctl which is used for qos changes and has direct impact + * on the creator thread scheduling decisions. + */ + switch (current_uthread()->syscall_code) { + case SYS_kevent_qos: + case SYS_kevent_id: + case SYS_workq_kernreturn: + case SYS_bsdthread_ctl: + return true; + } + } + return false; } /* - * Called with kqueue lock held. + * Interact with the pthread kext to request a servicing there at a specific QoS + * level. + * + * - Caller holds the workq request lock + * + * - May be called with the kqueue's wait queue set locked, + * so cannot do anything that could recurse on that. */ static void -kqfile_end_processing(struct kqueue *kq) +kqueue_threadreq_initiate(struct kqueue *kq, workq_threadreq_t kqr, + kq_index_t qos, int flags) { - struct knote *kn; - struct kqtailq *suppressq; - int procwait; + assert(kqr->tr_kq_wakeup); + assert(kqr_thread(kqr) == THREAD_NULL); + assert(!kqr_thread_requested(kqr)); + struct turnstile *ts = TURNSTILE_NULL; + + if (workq_is_exiting(kq->kq_p)) { + return; + } kqlock_held(kq); - assert((kq->kq_state & (KQ_WORKQ | KQ_WORKLOOP)) == 0); + if (kq->kq_state & KQ_WORKLOOP) { + __assert_only struct kqworkloop *kqwl = (struct kqworkloop *)kq; - KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQ_PROCESS_END), - VM_KERNEL_UNSLIDE_OR_PERM(kq), 0); + assert(kqwl->kqwl_owner == THREAD_NULL); + KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQWL_THREQUEST), + kqwl->kqwl_dynamicid, 0, qos, kqr->tr_kq_wakeup); + ts = kqwl->kqwl_turnstile; + /* Add a thread request reference on the kqueue. */ + kqworkloop_retain(kqwl); + } else { + assert(kq->kq_state & KQ_WORKQ); + KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQWQ_THREQUEST), + -1, 0, qos, kqr->tr_kq_wakeup); + } /* - * Return suppressed knotes to their original state. + * New-style thread request supported. + * Provide the pthread kext a pointer to a workq_threadreq_s structure for + * its use until a corresponding kqueue_threadreq_bind callback. */ - suppressq = kqueue_get_suppressed_queue(kq, NULL); - while ((kn = TAILQ_FIRST(suppressq)) != NULL) { - assert(kn->kn_status & KN_SUPPRESSED); - knote_unsuppress(kn); + if (kqueue_threadreq_can_use_ast(kq)) { + flags |= WORKQ_THREADREQ_SET_AST_ON_FAILURE; + } + if (qos == KQWQ_QOS_MANAGER) { + qos = WORKQ_THREAD_QOS_MANAGER; + } + if (!workq_kern_threadreq_initiate(kq->kq_p, kqr, ts, qos, flags)) { + /* + * Process is shutting down or exec'ing. + * All the kqueues are going to be cleaned up + * soon. Forget we even asked for a thread - + * and make sure we don't ask for more. + */ + kq->kq_state &= ~KQ_R2K_ARMED; + kqueue_release_live(kq); } +} - procwait = (kq->kq_state & KQ_PROCWAIT); - kq->kq_state &= ~(KQ_PROCESSING | KQ_PROCWAIT); +/* + * kqueue_threadreq_bind_prepost - prepost the bind to kevent + * + * This is used when kqueue_threadreq_bind may cause a lock inversion. + */ +__attribute__((always_inline)) +void +kqueue_threadreq_bind_prepost(struct proc *p __unused, workq_threadreq_t kqr, + struct uthread *ut) +{ + ut->uu_kqr_bound = kqr; + kqr->tr_thread = ut->uu_thread; + kqr->tr_state = WORKQ_TR_STATE_BINDING; +} - if (procwait) { - /* first wake up any thread already waiting to process */ - waitq_wakeup64_all((struct waitq *)&kq->kq_wqs, - CAST_EVENT64_T(suppressq), - THREAD_AWAKENED, - WAITQ_ALL_PRIORITIES); +/* + * kqueue_threadreq_bind_commit - commit a bind prepost + * + * The workq code has to commit any binding prepost before the thread has + * a chance to come back to userspace (and do kevent syscalls) or be aborted. + */ +void +kqueue_threadreq_bind_commit(struct proc *p, thread_t thread) +{ + struct uthread *ut = get_bsdthread_info(thread); + workq_threadreq_t kqr = ut->uu_kqr_bound; + kqueue_t kqu = kqr_kqueue(p, kqr); + + kqlock(kqu); + if (kqr->tr_state == WORKQ_TR_STATE_BINDING) { + kqueue_threadreq_bind(p, kqr, thread, 0); } + kqunlock(kqu); } -static int -kqueue_workloop_ctl_internal(proc_t p, uintptr_t cmd, uint64_t __unused options, - struct kqueue_workloop_params *params, int *retval) +static void +kqueue_threadreq_modify(kqueue_t kqu, workq_threadreq_t kqr, kq_index_t qos, + workq_kern_threadreq_flags_t flags) { - int error = 0; - int fd; - struct fileproc *fp; - struct kqueue *kq; - struct kqworkloop *kqwl; - struct filedesc *fdp = p->p_fd; - workq_threadreq_param_t trp = { }; + assert(kqr_thread_requested_pending(kqr)); - switch (cmd) { - case KQ_WORKLOOP_CREATE: - if (!params->kqwlp_flags) { - error = EINVAL; - break; - } + kqlock_held(kqu); - if ((params->kqwlp_flags & KQ_WORKLOOP_CREATE_SCHED_PRI) && - (params->kqwlp_sched_pri < 1 || - params->kqwlp_sched_pri > 63 /* MAXPRI_USER */)) { - error = EINVAL; - break; - } + if (kqueue_threadreq_can_use_ast(kqu.kq)) { + flags |= WORKQ_THREADREQ_SET_AST_ON_FAILURE; + } + workq_kern_threadreq_modify(kqu.kq->kq_p, kqr, qos, flags); +} + +/* + * kqueue_threadreq_bind - bind thread to processing kqrequest + * + * The provided thread will be responsible for delivering events + * associated with the given kqrequest. Bind it and get ready for + * the thread to eventually arrive. + */ +void +kqueue_threadreq_bind(struct proc *p, workq_threadreq_t kqr, thread_t thread, + unsigned int flags) +{ + kqueue_t kqu = kqr_kqueue(p, kqr); + struct uthread *ut = get_bsdthread_info(thread); - if ((params->kqwlp_flags & KQ_WORKLOOP_CREATE_SCHED_POL) && - invalid_policy(params->kqwlp_sched_pol)) { - error = EINVAL; - break; - } + kqlock_held(kqu); - if ((params->kqwlp_flags & KQ_WORKLOOP_CREATE_CPU_PERCENT) && - (params->kqwlp_cpu_percent <= 0 || - params->kqwlp_cpu_percent > 100 || - params->kqwlp_cpu_refillms <= 0 || - params->kqwlp_cpu_refillms > 0x00ffffff)) { - error = EINVAL; - break; - } + assert(ut->uu_kqueue_override == 0); - if (params->kqwlp_flags & KQ_WORKLOOP_CREATE_SCHED_PRI) { - trp.trp_flags |= TRP_PRIORITY; - trp.trp_pri = params->kqwlp_sched_pri; - } - if (params->kqwlp_flags & KQ_WORKLOOP_CREATE_SCHED_POL) { - trp.trp_flags |= TRP_POLICY; - trp.trp_pol = params->kqwlp_sched_pol; - } - if (params->kqwlp_flags & KQ_WORKLOOP_CREATE_CPU_PERCENT) { - trp.trp_flags |= TRP_CPUPERCENT; - trp.trp_cpupercent = (uint8_t)params->kqwlp_cpu_percent; - trp.trp_refillms = params->kqwlp_cpu_refillms; - } + if (kqr->tr_state == WORKQ_TR_STATE_BINDING) { + assert(ut->uu_kqr_bound == kqr); + assert(kqr->tr_thread == thread); + } else { + assert(kqr_thread_requested_pending(kqr)); + assert(kqr->tr_thread == THREAD_NULL); + assert(ut->uu_kqr_bound == NULL); + ut->uu_kqr_bound = kqr; + kqr->tr_thread = thread; + } - error = kevent_get_kq(p, params->kqwlp_id, &trp, - KEVENT_FLAG_DYNAMIC_KQUEUE | KEVENT_FLAG_WORKLOOP | - KEVENT_FLAG_DYNAMIC_KQ_MUST_NOT_EXIST, &fp, &fd, &kq); - if (error) { - break; - } + kqr->tr_state = WORKQ_TR_STATE_BOUND; - if (!(fdp->fd_flags & FD_WORKLOOP)) { - /* FD_WORKLOOP indicates we've ever created a workloop - * via this syscall but its only ever added to a process, never - * removed. + if (kqu.kq->kq_state & KQ_WORKLOOP) { + struct turnstile *ts = kqu.kqwl->kqwl_turnstile; + + if (__improbable(thread == kqu.kqwl->kqwl_owner)) { + /* + * shows that asserting here is not ok. + * + * This is not supposed to happen for correct use of the interface, + * but it is sadly possible for userspace (with the help of memory + * corruption, such as over-release of a dispatch queue) to make + * the creator thread the "owner" of a workloop. + * + * Once that happens, and that creator thread picks up the same + * workloop as a servicer, we trip this codepath. We need to fixup + * the state to forget about this thread being the owner, as the + * entire workloop state machine expects servicers to never be + * owners and everything would basically go downhill from here. */ - proc_fdlock(p); - fdp->fd_flags |= FD_WORKLOOP; - proc_fdunlock(p); + kqu.kqwl->kqwl_owner = THREAD_NULL; + if (kqworkloop_override(kqu.kqwl)) { + thread_drop_kevent_override(thread); + } } - break; - case KQ_WORKLOOP_DESTROY: - error = kevent_get_kq(p, params->kqwlp_id, NULL, - KEVENT_FLAG_DYNAMIC_KQUEUE | KEVENT_FLAG_WORKLOOP | - KEVENT_FLAG_DYNAMIC_KQ_MUST_EXIST, &fp, &fd, &kq); - if (error) { - break; + + if (ts && (flags & KQUEUE_THREADERQ_BIND_NO_INHERITOR_UPDATE) == 0) { + /* + * Past this point, the interlock is the kq req lock again, + * so we can fix the inheritor for good. + */ + filt_wlupdate_inheritor(kqu.kqwl, ts, TURNSTILE_IMMEDIATE_UPDATE); + turnstile_update_inheritor_complete(ts, TURNSTILE_INTERLOCK_HELD); } - kqlock(kq); - kqwl = (struct kqworkloop *)kq; - trp.trp_value = kqwl->kqwl_params; - if (trp.trp_flags && !(trp.trp_flags & TRP_RELEASED)) { - trp.trp_flags |= TRP_RELEASED; - kqueue_release(kq, KQUEUE_CANT_BE_LAST_REF); - } else { - error = EINVAL; + + KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQWL_BIND), kqu.kqwl->kqwl_dynamicid, + thread_tid(thread), kqr->tr_kq_qos_index, + (kqr->tr_kq_override_index << 16) | kqr->tr_kq_wakeup); + + ut->uu_kqueue_override = kqr->tr_kq_override_index; + if (kqr->tr_kq_override_index) { + thread_add_servicer_override(thread, kqr->tr_kq_override_index); } - kqunlock(kq); - kqueue_release_last(p, kq); - break; + } else { + assert(kqr->tr_kq_override_index == 0); + + KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQWQ_BIND), -1, + thread_tid(thread), kqr->tr_kq_qos_index, + (kqr->tr_kq_override_index << 16) | kqr->tr_kq_wakeup); } - *retval = 0; - return error; } -int -kqueue_workloop_ctl(proc_t p, struct kqueue_workloop_ctl_args *uap, int *retval) +/* + * kqueue_threadreq_cancel - abort a pending thread request + * + * Called when exiting/exec'ing. Forget our pending request. + */ +void +kqueue_threadreq_cancel(struct proc *p, workq_threadreq_t kqr) { - struct kqueue_workloop_params params = { - .kqwlp_id = 0, - }; - if (uap->sz < sizeof(params.kqwlp_version)) { - return EINVAL; - } - - size_t copyin_sz = MIN(sizeof(params), uap->sz); - int rv = copyin(uap->addr, ¶ms, copyin_sz); - if (rv) { - return rv; - } + kqueue_release(kqr_kqueue(p, kqr)); +} - if (params.kqwlp_version != (int)uap->sz) { - return EINVAL; - } +workq_threadreq_param_t +kqueue_threadreq_workloop_param(workq_threadreq_t kqr) +{ + struct kqworkloop *kqwl; + workq_threadreq_param_t trp; - return kqueue_workloop_ctl_internal(p, uap->cmd, uap->options, ¶ms, - retval); + assert(kqr->tr_flags & WORKQ_TR_FLAG_WORKLOOP); + kqwl = __container_of(kqr, struct kqworkloop, kqwl_request); + trp.trp_value = kqwl->kqwl_params; + return trp; } /* - * kqueue_process - process the triggered events in a kqueue - * - * Walk the queued knotes and validate that they are really still triggered - * events by calling the filter routines (if necessary). + * kqueue_threadreq_unbind - unbind thread from processing kqueue * - * For each event that is still considered triggered, invoke the callback - * routine provided. + * End processing the per-QoS bucket of events and allow other threads + * to be requested for future servicing. * * caller holds a reference on the kqueue. - * kqueue locked on entry and exit - but may be dropped - * kqueue list locked (held for duration of call) */ -static int -kqueue_process(struct kqueue *kq, - kevent_callback_t callback, - void *callback_data, - struct filt_process_s *process_data, - int *countp) +void +kqueue_threadreq_unbind(struct proc *p, workq_threadreq_t kqr) { - struct uthread *ut = get_bsdthread_info(current_thread()); - struct kqrequest *kqr = ut->uu_kqr_bound; - struct knote *kn; - unsigned int flags = process_data ? process_data->fp_flags : 0; - int nevents = 0, error = 0, rc = 0; - struct kqtailq *base_queue, *queue; - kqueue_t kqu = { .kq = kq }; -#if DEBUG || DEVELOPMENT - int retries = 64; -#endif - - if (kq->kq_state & KQ_WORKQ) { - if (kqr == NULL || (kqr->kqr_state & KQR_WORKLOOP)) { - return EJUSTRETURN; - } - rc = kqworkq_begin_processing(kqu.kqwq, kqr, flags); - } else if (kq->kq_state & KQ_WORKLOOP) { - if (ut->uu_kqr_bound != &kqu.kqwl->kqwl_request) { - return EJUSTRETURN; - } - rc = kqworkloop_begin_processing(kqu.kqwl, flags); + if (kqr->tr_flags & WORKQ_TR_FLAG_WORKLOOP) { + kqworkloop_unbind(kqr_kqworkloop(kqr)); } else { - rc = kqfile_begin_processing(kq); + kqworkq_unbind(p, kqr); } +} - if (rc == -1) { - /* Nothing to process */ - *countp = 0; - return 0; +/* + * If we aren't already busy processing events [for this QoS], + * request workq thread support as appropriate. + * + * TBD - for now, we don't segregate out processing by QoS. + * + * - May be called with the kqueue's wait queue set locked, + * so cannot do anything that could recurse on that. + */ +static void +kqworkq_wakeup(struct kqworkq *kqwq, kq_index_t qos_index) +{ + workq_threadreq_t kqr = kqworkq_get_request(kqwq, qos_index); + + /* convert to thread qos value */ + assert(qos_index < KQWQ_NBUCKETS); + + if (!kqr->tr_kq_wakeup) { + kqr->tr_kq_wakeup = true; + if (!kqr_thread_requested(kqr)) { + kqueue_threadreq_initiate(&kqwq->kqwq_kqueue, kqr, qos_index, 0); + } } +} - /* - * loop through the enqueued knotes associated with this request, - * processing each one. Each request may have several queues - * of knotes to process (depending on the type of kqueue) so we - * have to loop through all the queues as long as we have additional - * space. - */ +/* + * This represent the asynchronous QoS a given workloop contributes, + * hence is the max of the current active knotes (override index) + * and the workloop max qos (userspace async qos). + */ +static kq_index_t +kqworkloop_override(struct kqworkloop *kqwl) +{ + workq_threadreq_t kqr = &kqwl->kqwl_request; + return MAX(kqr->tr_kq_qos_index, kqr->tr_kq_override_index); +} -process_again: - if (kq->kq_state & KQ_WORKQ) { - base_queue = queue = &kqu.kqwq->kqwq_queue[kqr->kqr_qos_index]; - } else if (kq->kq_state & KQ_WORKLOOP) { - base_queue = &kqu.kqwl->kqwl_queue[0]; - queue = &kqu.kqwl->kqwl_queue[KQWL_NBUCKETS - 1]; - } else { - base_queue = queue = &kq->kq_queue[QOS_INDEX_KQFILE]; +static inline void +kqworkloop_request_fire_r2k_notification(struct kqworkloop *kqwl) +{ + workq_threadreq_t kqr = &kqwl->kqwl_request; + + kqlock_held(kqwl); + + if (kqwl->kqwl_state & KQ_R2K_ARMED) { + kqwl->kqwl_state &= ~KQ_R2K_ARMED; + act_set_astkevent(kqr_thread_fast(kqr), AST_KEVENT_RETURN_TO_KERNEL); } +} - do { - while (error == 0 && (kn = TAILQ_FIRST(queue)) != NULL) { - error = knote_process(kn, callback, callback_data, process_data); - if (error == EJUSTRETURN) { - error = 0; - } else { - nevents++; - } - /* error is EWOULDBLOCK when the out event array is full */ - } +static void +kqworkloop_update_threads_qos(struct kqworkloop *kqwl, int op, kq_index_t qos) +{ + workq_threadreq_t kqr = &kqwl->kqwl_request; + struct kqueue *kq = &kqwl->kqwl_kqueue; + kq_index_t old_override = kqworkloop_override(kqwl); + kq_index_t i; - if (error == EWOULDBLOCK) { - /* break out if no more space for additional events */ - error = 0; + kqlock_held(kqwl); + + switch (op) { + case KQWL_UTQ_UPDATE_WAKEUP_QOS: + if (qos == KQWL_BUCKET_STAYACTIVE) { + /* + * the KQWL_BUCKET_STAYACTIVE is not a QoS bucket, we only remember + * a high watermark (kqwl_stayactive_qos) of any stay active knote + * that was ever registered with this workloop. + * + * When waitq_set__CALLING_PREPOST_HOOK__() wakes up any stay active + * knote, we use this high-watermark as a wakeup-index, and also set + * the magic KQWL_BUCKET_STAYACTIVE bit to make sure we remember + * there is at least one stay active knote fired until the next full + * processing of this bucket. + */ + kqwl->kqwl_wakeup_indexes |= KQWL_STAYACTIVE_FIRED_BIT; + qos = kqwl->kqwl_stayactive_qos; + assert(qos); + } + if (kqwl->kqwl_wakeup_indexes & (1 << qos)) { + assert(kqr->tr_kq_wakeup); break; } - } while (queue-- > base_queue); - *countp = nevents; + kqwl->kqwl_wakeup_indexes |= (1 << qos); + kqr->tr_kq_wakeup = true; + kqworkloop_request_fire_r2k_notification(kqwl); + goto recompute; - /* - * If KEVENT_FLAG_PARKING is set, and no kevents have been returned, - * we want to unbind the kqrequest from the thread. - * - * However, because the kq locks are dropped several times during process, - * new knotes may have fired again, in which case, we want to fail the end - * processing and process again, until it converges. - * - * If we returned events however, end processing never fails. - */ - if (error || nevents) { - flags &= ~KEVENT_FLAG_PARKING; - } - if (kq->kq_state & KQ_WORKQ) { - rc = kqworkq_end_processing(kqu.kqwq, kqr, flags); - } else if (kq->kq_state & KQ_WORKLOOP) { - rc = kqworkloop_end_processing(kqu.kqwl, KQ_PROCESSING, flags); - } else { - kqfile_end_processing(kq); - rc = 0; - } - if (rc == -1) { - assert(flags & KEVENT_FLAG_PARKING); -#if DEBUG || DEVELOPMENT - if (retries-- == 0) { - panic("kevent: way too many knote_process retries, kq: %p (0x%02x)", - kq, kq->kq_state); + case KQWL_UTQ_UPDATE_STAYACTIVE_QOS: + assert(qos); + if (kqwl->kqwl_stayactive_qos < qos) { + kqwl->kqwl_stayactive_qos = qos; + if (kqwl->kqwl_wakeup_indexes & KQWL_STAYACTIVE_FIRED_BIT) { + assert(kqr->tr_kq_wakeup); + kqwl->kqwl_wakeup_indexes |= (1 << qos); + goto recompute; + } + } + break; + + case KQWL_UTQ_PARKING: + case KQWL_UTQ_UNBINDING: + kqr->tr_kq_override_index = qos; + /* FALLTHROUGH */ + case KQWL_UTQ_RECOMPUTE_WAKEUP_QOS: + if (op == KQWL_UTQ_RECOMPUTE_WAKEUP_QOS) { + assert(qos == THREAD_QOS_UNSPECIFIED); + } + i = KQWL_BUCKET_STAYACTIVE; + if (TAILQ_EMPTY(&kqwl->kqwl_suppressed)) { + kqr->tr_kq_override_index = THREAD_QOS_UNSPECIFIED; + } + if (!TAILQ_EMPTY(&kqwl->kqwl_queue[i]) && + (kqwl->kqwl_wakeup_indexes & KQWL_STAYACTIVE_FIRED_BIT)) { + /* + * If the KQWL_STAYACTIVE_FIRED_BIT is set, it means a stay active + * knote may have fired, so we need to merge in kqwl_stayactive_qos. + * + * Unlike other buckets, this one is never empty but could be idle. + */ + kqwl->kqwl_wakeup_indexes &= KQWL_STAYACTIVE_FIRED_BIT; + kqwl->kqwl_wakeup_indexes |= (1 << kqwl->kqwl_stayactive_qos); + } else { + kqwl->kqwl_wakeup_indexes = 0; } -#endif - goto process_again; - } - return error; -} - -static void -kqueue_scan_continue(void *data, wait_result_t wait_result) -{ - thread_t self = current_thread(); - uthread_t ut = (uthread_t)get_bsdthread_info(self); - struct _kqueue_scan * cont_args = &ut->uu_save.uus_kqueue_scan; - struct kqueue *kq = (struct kqueue *)data; - struct filt_process_s *process_data = cont_args->process_data; - int error; - int count; - - /* convert the (previous) wait_result to a proper error */ - switch (wait_result) { - case THREAD_AWAKENED: { - kqlock(kq); -retry: - error = kqueue_process(kq, cont_args->call, cont_args->data, - process_data, &count); - if (error == 0 && count == 0) { - if (kq->kq_state & KQ_DRAIN) { - kqunlock(kq); - goto drain; + for (i = THREAD_QOS_UNSPECIFIED + 1; i < KQWL_BUCKET_STAYACTIVE; i++) { + if (!TAILQ_EMPTY(&kqwl->kqwl_queue[i])) { + kqwl->kqwl_wakeup_indexes |= (1 << i); } + } + if (kqwl->kqwl_wakeup_indexes) { + kqr->tr_kq_wakeup = true; + kqworkloop_request_fire_r2k_notification(kqwl); + } else { + kqr->tr_kq_wakeup = false; + } + goto recompute; - if (kq->kq_state & KQ_WAKEUP) { - goto retry; - } + case KQWL_UTQ_RESET_WAKEUP_OVERRIDE: + kqr->tr_kq_override_index = qos; + goto recompute; - waitq_assert_wait64((struct waitq *)&kq->kq_wqs, - KQ_EVENT, THREAD_ABORTSAFE, - cont_args->deadline); - kq->kq_state |= KQ_SLEEP; - kqunlock(kq); - thread_block_parameter(kqueue_scan_continue, kq); - /* NOTREACHED */ + case KQWL_UTQ_UPDATE_WAKEUP_OVERRIDE: +recompute: + /* + * When modifying the wakeup QoS or the override QoS, we always need to + * maintain our invariant that kqr_override_index is at least as large + * as the highest QoS for which an event is fired. + * + * However this override index can be larger when there is an overriden + * suppressed knote pushing on the kqueue. + */ + if (kqwl->kqwl_wakeup_indexes > (1 << qos)) { + qos = fls(kqwl->kqwl_wakeup_indexes) - 1; /* fls is 1-based */ + } + if (kqr->tr_kq_override_index < qos) { + kqr->tr_kq_override_index = qos; } - kqunlock(kq); - } break; - case THREAD_TIMED_OUT: - error = EWOULDBLOCK; break; - case THREAD_INTERRUPTED: - error = EINTR; + + case KQWL_UTQ_REDRIVE_EVENTS: break; - case THREAD_RESTART: -drain: - error = EBADF; + + case KQWL_UTQ_SET_QOS_INDEX: + kqr->tr_kq_qos_index = qos; break; + default: - panic("%s: - invalid wait_result (%d)", __func__, - wait_result); - error = 0; + panic("unknown kqwl thread qos update operation: %d", op); } - /* call the continuation with the results */ - assert(cont_args->cont != NULL); - (cont_args->cont)(kq, cont_args->data, error); -} - - -/* - * kqueue_scan - scan and wait for events in a kqueue - * - * Process the triggered events in a kqueue. - * - * If there are no events triggered arrange to - * wait for them. If the caller provided a - * continuation routine, then kevent_scan will - * also. - * - * The callback routine must be valid. - * The caller must hold a use-count reference on the kq. - */ -int -kqueue_scan(struct kqueue *kq, - kevent_callback_t callback, - kqueue_continue_t continuation, - void *callback_data, - struct filt_process_s *process_data, - struct timeval *atvp, - __unused struct proc *p) -{ - thread_continue_t cont = THREAD_CONTINUE_NULL; - unsigned int flags; - uint64_t deadline; - int error; - int first; - int fd; - - assert(callback != NULL); + thread_t kqwl_owner = kqwl->kqwl_owner; + thread_t servicer = kqr_thread(kqr); + boolean_t qos_changed = FALSE; + kq_index_t new_override = kqworkloop_override(kqwl); /* - * Determine which QoS index we are servicing + * Apply the diffs to the owner if applicable */ - flags = (process_data) ? process_data->fp_flags : 0; - fd = (process_data) ? process_data->fp_fd : -1; - - first = 1; - for (;;) { - wait_result_t wait_result; - int count; + if (kqwl_owner) { +#if 0 + /* JMM - need new trace hooks for owner overrides */ + KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQWL_THADJUST), + kqwl->kqwl_dynamicid, thread_tid(kqwl_owner), kqr->tr_kq_qos_index, + (kqr->tr_kq_override_index << 16) | kqr->tr_kq_wakeup); +#endif + if (new_override == old_override) { + // nothing to do + } else if (old_override == THREAD_QOS_UNSPECIFIED) { + thread_add_kevent_override(kqwl_owner, new_override); + } else if (new_override == THREAD_QOS_UNSPECIFIED) { + thread_drop_kevent_override(kqwl_owner); + } else { /* old_override != new_override */ + thread_update_kevent_override(kqwl_owner, new_override); + } + } + /* + * apply the diffs to the servicer + */ + if (!kqr_thread_requested(kqr)) { /* - * Make a pass through the kq to find events already - * triggered. + * No servicer, nor thread-request + * + * Make a new thread request, unless there is an owner (or the workloop + * is suspended in userland) or if there is no asynchronous work in the + * first place. */ - kqlock(kq); - error = kqueue_process(kq, callback, callback_data, - process_data, &count); - if (error || count) { - break; /* lock still held */ - } - /* looks like we have to consider blocking */ - if (first) { - first = 0; - /* convert the timeout to a deadline once */ - if (atvp->tv_sec || atvp->tv_usec) { - uint64_t now; - - clock_get_uptime(&now); - nanoseconds_to_absolutetime((uint64_t)atvp->tv_sec * NSEC_PER_SEC + - atvp->tv_usec * (long)NSEC_PER_USEC, - &deadline); - if (now >= deadline) { - /* non-blocking call */ - error = EWOULDBLOCK; - break; /* lock still held */ - } - deadline -= now; - clock_absolutetime_interval_to_deadline(deadline, &deadline); - } else { - deadline = 0; /* block forever */ - } - if (continuation) { - uthread_t ut = (uthread_t)get_bsdthread_info(current_thread()); - struct _kqueue_scan *cont_args = &ut->uu_save.uus_kqueue_scan; - - cont_args->call = callback; - cont_args->cont = continuation; - cont_args->deadline = deadline; - cont_args->data = callback_data; - cont_args->process_data = process_data; - cont = kqueue_scan_continue; + if (kqwl_owner == NULL && kqr->tr_kq_wakeup) { + int initiate_flags = 0; + if (op == KQWL_UTQ_UNBINDING) { + initiate_flags = WORKQ_THREADREQ_ATTEMPT_REBIND; } + kqueue_threadreq_initiate(kq, kqr, new_override, initiate_flags); } - - if (kq->kq_state & KQ_DRAIN) { - kqunlock(kq); - return EBADF; - } - - /* If awakened during processing, try again */ - if (kq->kq_state & KQ_WAKEUP) { - kqunlock(kq); - continue; + } else if (servicer) { + /* + * Servicer in flight + * + * Just apply the diff to the servicer + */ + struct uthread *ut = get_bsdthread_info(servicer); + if (ut->uu_kqueue_override != new_override) { + if (ut->uu_kqueue_override == THREAD_QOS_UNSPECIFIED) { + thread_add_servicer_override(servicer, new_override); + } else if (new_override == THREAD_QOS_UNSPECIFIED) { + thread_drop_servicer_override(servicer); + } else { /* ut->uu_kqueue_override != new_override */ + thread_update_servicer_override(servicer, new_override); + } + ut->uu_kqueue_override = new_override; + qos_changed = TRUE; } + } else if (new_override == THREAD_QOS_UNSPECIFIED) { + /* + * No events to deliver anymore. + * + * However canceling with turnstiles is challenging, so the fact that + * the request isn't useful will be discovered by the servicer himself + * later on. + */ + } else if (old_override != new_override) { + /* + * Request is in flight + * + * Apply the diff to the thread request + */ + kqueue_threadreq_modify(kq, kqr, new_override, WORKQ_THREADREQ_NONE); + qos_changed = TRUE; + } - /* go ahead and wait */ - waitq_assert_wait64_leeway((struct waitq *)&kq->kq_wqs, - KQ_EVENT, THREAD_ABORTSAFE, - TIMEOUT_URGENCY_USER_NORMAL, - deadline, TIMEOUT_NO_LEEWAY); - kq->kq_state |= KQ_SLEEP; - kqunlock(kq); - wait_result = thread_block_parameter(cont, kq); - /* NOTREACHED if (continuation != NULL) */ - - switch (wait_result) { - case THREAD_AWAKENED: - continue; - case THREAD_TIMED_OUT: - return EWOULDBLOCK; - case THREAD_INTERRUPTED: - return EINTR; - case THREAD_RESTART: - return EBADF; - default: - panic("%s: - bad wait_result (%d)", __func__, - wait_result); - error = 0; - } + if (qos_changed) { + KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQWL_THADJUST), kqwl->kqwl_dynamicid, + thread_tid(servicer), kqr->tr_kq_qos_index, + (kqr->tr_kq_override_index << 16) | kqr->tr_kq_wakeup); } - kqunlock(kq); - return error; } - -/* - * XXX - * This could be expanded to call kqueue_scan, if desired. - */ -/*ARGSUSED*/ -static int -kqueue_read(__unused struct fileproc *fp, - __unused struct uio *uio, - __unused int flags, - __unused vfs_context_t ctx) +static void +kqworkloop_wakeup(struct kqworkloop *kqwl, kq_index_t qos) { - return ENXIO; -} + if ((kqwl->kqwl_state & KQ_PROCESSING) && + kqr_thread(&kqwl->kqwl_request) == current_thread()) { + /* + * kqworkloop_end_processing() will perform the required QoS + * computations when it unsets the processing mode. + */ + return; + } -/*ARGSUSED*/ -static int -kqueue_write(__unused struct fileproc *fp, - __unused struct uio *uio, - __unused int flags, - __unused vfs_context_t ctx) -{ - return ENXIO; + kqworkloop_update_threads_qos(kqwl, KQWL_UTQ_UPDATE_WAKEUP_QOS, qos); } -/*ARGSUSED*/ -static int -kqueue_ioctl(__unused struct fileproc *fp, - __unused u_long com, - __unused caddr_t data, - __unused vfs_context_t ctx) +static struct kqtailq * +kqueue_get_suppressed_queue(kqueue_t kq, struct knote *kn) { - return ENOTTY; + if (kq.kq->kq_state & KQ_WORKLOOP) { + return &kq.kqwl->kqwl_suppressed; + } else if (kq.kq->kq_state & KQ_WORKQ) { + return &kq.kqwq->kqwq_suppressed[kn->kn_qos_index]; + } else { + return &kq.kqf->kqf_suppressed; + } } -/*ARGSUSED*/ -static int -kqueue_select(struct fileproc *fp, int which, void *wq_link_id, - __unused vfs_context_t ctx) +struct turnstile * +kqueue_alloc_turnstile(kqueue_t kqu) { - struct kqueue *kq = (struct kqueue *)fp->f_data; - struct kqtailq *queue; - struct kqtailq *suppressq; - struct knote *kn; - int retnum = 0; + struct kqworkloop *kqwl = kqu.kqwl; + kq_state_t kq_state; - if (which != FREAD) { - return 0; + kq_state = os_atomic_load(&kqu.kq->kq_state, dependency); + if (kq_state & KQ_HAS_TURNSTILE) { + /* force a dependency to pair with the atomic or with release below */ + return os_atomic_load_with_dependency_on(&kqwl->kqwl_turnstile, + (uintptr_t)kq_state); } - kqlock(kq); + if (!(kq_state & KQ_WORKLOOP)) { + return TURNSTILE_NULL; + } - assert((kq->kq_state & KQ_WORKQ) == 0); + struct turnstile *ts = turnstile_alloc(), *free_ts = TURNSTILE_NULL; + bool workq_locked = false; - /* - * If this is the first pass, link the wait queue associated with the - * the kqueue onto the wait queue set for the select(). Normally we - * use selrecord() for this, but it uses the wait queue within the - * selinfo structure and we need to use the main one for the kqueue to - * catch events from KN_STAYQUEUED sources. So we do the linkage manually. - * (The select() call will unlink them when it ends). - */ - if (wq_link_id != NULL) { - thread_t cur_act = current_thread(); - struct uthread * ut = get_bsdthread_info(cur_act); + kqlock(kqu); - kq->kq_state |= KQ_SEL; - waitq_link((struct waitq *)&kq->kq_wqs, ut->uu_wqset, - WAITQ_SHOULD_LOCK, (uint64_t *)wq_link_id); + if (filt_wlturnstile_interlock_is_workq(kqwl)) { + workq_locked = true; + workq_kern_threadreq_lock(kqwl->kqwl_p); + } - /* always consume the reserved link object */ - waitq_link_release(*(uint64_t *)wq_link_id); - *(uint64_t *)wq_link_id = 0; + if (kqwl->kqwl_state & KQ_HAS_TURNSTILE) { + free_ts = ts; + ts = kqwl->kqwl_turnstile; + } else { + ts = turnstile_prepare((uintptr_t)kqwl, &kqwl->kqwl_turnstile, + ts, TURNSTILE_WORKLOOPS); - /* - * selprocess() is expecting that we send it back the waitq - * that was just added to the thread's waitq set. In order - * to not change the selrecord() API (which is exported to - * kexts), we pass this value back through the - * void *wq_link_id pointer we were passed. We need to use - * memcpy here because the pointer may not be properly aligned - * on 32-bit systems. - */ - void *wqptr = &kq->kq_wqs; - memcpy(wq_link_id, (void *)&wqptr, sizeof(void *)); + /* release-barrier to pair with the unlocked load of kqwl_turnstile above */ + os_atomic_or(&kqwl->kqwl_state, KQ_HAS_TURNSTILE, release); + + if (filt_wlturnstile_interlock_is_workq(kqwl)) { + workq_kern_threadreq_update_inheritor(kqwl->kqwl_p, + &kqwl->kqwl_request, kqwl->kqwl_owner, + ts, TURNSTILE_IMMEDIATE_UPDATE); + /* + * The workq may no longer be the interlock after this. + * In which case the inheritor wasn't updated. + */ + } + if (!filt_wlturnstile_interlock_is_workq(kqwl)) { + filt_wlupdate_inheritor(kqwl, ts, TURNSTILE_IMMEDIATE_UPDATE); + } + } + + if (workq_locked) { + workq_kern_threadreq_unlock(kqwl->kqwl_p); + } + + kqunlock(kqu); + + if (free_ts) { + turnstile_deallocate(free_ts); + } else { + turnstile_update_inheritor_complete(ts, TURNSTILE_INTERLOCK_NOT_HELD); + } + return ts; +} + +__attribute__((always_inline)) +struct turnstile * +kqueue_turnstile(kqueue_t kqu) +{ + kq_state_t kq_state = os_atomic_load(&kqu.kq->kq_state, relaxed); + if (kq_state & KQ_WORKLOOP) { + return os_atomic_load(&kqu.kqwl->kqwl_turnstile, relaxed); + } + return TURNSTILE_NULL; +} + +__attribute__((always_inline)) +struct turnstile * +kqueue_threadreq_get_turnstile(workq_threadreq_t kqr) +{ + struct kqworkloop *kqwl = kqr_kqworkloop(kqr); + if (kqwl) { + return os_atomic_load(&kqwl->kqwl_turnstile, relaxed); } + return TURNSTILE_NULL; +} + +static void +kqworkloop_set_overcommit(struct kqworkloop *kqwl) +{ + workq_threadreq_t kqr = &kqwl->kqwl_request; - if (kqfile_begin_processing(kq) == -1) { - kqunlock(kq); - return 0; + /* + * This test is racy, but since we never remove this bit, + * it allows us to avoid taking a lock. + */ + if (kqr->tr_flags & WORKQ_TR_FLAG_OVERCOMMIT) { + return; } - queue = &kq->kq_queue[QOS_INDEX_KQFILE]; - if (!TAILQ_EMPTY(queue)) { - /* - * there is something queued - but it might be a - * KN_STAYACTIVE knote, which may or may not have - * any events pending. Otherwise, we have to walk - * the list of knotes to see, and peek at the - * (non-vanished) stay-active ones to be really sure. - */ - while ((kn = (struct knote *)TAILQ_FIRST(queue)) != NULL) { - if (kn->kn_status & KN_ACTIVE) { - retnum = 1; - goto out; - } - assert(kn->kn_status & KN_STAYACTIVE); - knote_suppress(kn); - } + kqlock_held(kqwl); - /* - * There were no regular events on the queue, so take - * a deeper look at the stay-queued ones we suppressed. - */ - suppressq = kqueue_get_suppressed_queue(kq, NULL); - while ((kn = (struct knote *)TAILQ_FIRST(suppressq)) != NULL) { - KNOTE_LOCK_CTX(knlc); - int result = 0; + if (kqr_thread_requested_pending(kqr)) { + kqueue_threadreq_modify(kqwl, kqr, kqr->tr_qos, + WORKQ_THREADREQ_MAKE_OVERCOMMIT); + } else { + kqr->tr_flags |= WORKQ_TR_FLAG_OVERCOMMIT; + } +} - /* If didn't vanish while suppressed - peek at it */ - if ((kn->kn_status & KN_DROPPING) || !knote_lock(kq, kn, &knlc, - KNOTE_KQ_LOCK_ON_FAILURE)) { - continue; - } +static void +kqworkq_update_override(struct kqworkq *kqwq, struct knote *kn, + kq_index_t override_index) +{ + workq_threadreq_t kqr; + kq_index_t old_override_index; + kq_index_t queue_index = kn->kn_qos_index; - result = filter_call(knote_fops(kn), f_peek(kn)); + if (override_index <= queue_index) { + return; + } - kqlock(kq); - knote_unlock(kq, kn, &knlc, KNOTE_KQ_LOCK_ALWAYS); + kqr = kqworkq_get_request(kqwq, queue_index); - /* unsuppress it */ - knote_unsuppress(kn); + kqlock_held(kqwq); - /* has data or it has to report a vanish */ - if (result & FILTER_ACTIVE) { - retnum = 1; - goto out; + old_override_index = kqr->tr_kq_override_index; + if (override_index > MAX(kqr->tr_kq_qos_index, old_override_index)) { + thread_t servicer = kqr_thread(kqr); + kqr->tr_kq_override_index = override_index; + + /* apply the override to [incoming?] servicing thread */ + if (servicer) { + if (old_override_index) { + thread_update_kevent_override(servicer, override_index); + } else { + thread_add_kevent_override(servicer, override_index); } } } - -out: - kqfile_end_processing(kq); - kqunlock(kq); - return retnum; } -/* - * kqueue_close - - */ -/*ARGSUSED*/ -static int -kqueue_close(struct fileglob *fg, __unused vfs_context_t ctx) +static void +kqueue_update_override(kqueue_t kqu, struct knote *kn, thread_qos_t qos) { - struct kqfile *kqf = (struct kqfile *)fg->fg_data; - - assert((kqf->kqf_state & KQ_WORKQ) == 0); - kqueue_dealloc(&kqf->kqf_kqueue); - fg->fg_data = NULL; - return 0; + if (kqu.kq->kq_state & KQ_WORKLOOP) { + kqworkloop_update_threads_qos(kqu.kqwl, KQWL_UTQ_UPDATE_WAKEUP_OVERRIDE, + qos); + } else { + kqworkq_update_override(kqu.kqwq, kn, qos); + } } -/* - * Max depth of the nested kq path that can be created. - * Note that this has to be less than the size of kq_level - * to avoid wrapping around and mislabeling the level. - */ -#define MAX_NESTED_KQ 1000 - -/*ARGSUSED*/ -/* - * The callers has taken a use-count reference on this kqueue and will donate it - * to the kqueue we are being added to. This keeps the kqueue from closing until - * that relationship is torn down. - */ -static int -kqueue_kqfilter(__unused struct fileproc *fp, struct knote *kn, - __unused struct kevent_internal_s *kev, __unused vfs_context_t ctx) +static void +kqworkloop_unbind_locked(struct kqworkloop *kqwl, thread_t thread, + enum kqwl_unbind_locked_mode how) { - struct kqfile *kqf = (struct kqfile *)kn->kn_fp->f_data; - struct kqueue *kq = &kqf->kqf_kqueue; - struct kqueue *parentkq = knote_get_kq(kn); - uint16_t plevel = 0; - - assert((kqf->kqf_state & KQ_WORKQ) == 0); + struct uthread *ut = get_bsdthread_info(thread); + workq_threadreq_t kqr = &kqwl->kqwl_request; - if (parentkq == kq || kn->kn_filter != EVFILT_READ) { - knote_set_error(kn, EINVAL); - return 0; - } + KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQWL_UNBIND), kqwl->kqwl_dynamicid, + thread_tid(thread), 0, 0); - /* - * We have to avoid creating a cycle when nesting kqueues - * inside another. Rather than trying to walk the whole - * potential DAG of nested kqueues, we just use a simple - * ceiling protocol. When a kqueue is inserted into another, - * we check that the (future) parent is not already nested - * into another kqueue at a lower level than the potenial - * child (because it could indicate a cycle). If that test - * passes, we just mark the nesting levels accordingly. - * - * Only up to MAX_NESTED_KQ can be nested. - */ + kqlock_held(kqwl); - kqlock(parentkq); - if (parentkq->kq_level > 0 && - parentkq->kq_level < kq->kq_level) { - kqunlock(parentkq); - knote_set_error(kn, EINVAL); - return 0; - } else { - /* set parent level appropriately */ - plevel = (parentkq->kq_level == 0)? 2: parentkq->kq_level; - if (plevel < kq->kq_level + 1) { - if (kq->kq_level + 1 > MAX_NESTED_KQ) { - kqunlock(parentkq); - knote_set_error(kn, EINVAL); - return 0; - } - plevel = kq->kq_level + 1; - } + assert(ut->uu_kqr_bound == kqr); + ut->uu_kqr_bound = NULL; + if (how == KQWL_OVERRIDE_DROP_IMMEDIATELY && + ut->uu_kqueue_override != THREAD_QOS_UNSPECIFIED) { + thread_drop_servicer_override(thread); + ut->uu_kqueue_override = THREAD_QOS_UNSPECIFIED; + } - parentkq->kq_level = plevel; - kqunlock(parentkq); + if (kqwl->kqwl_owner == NULL && kqwl->kqwl_turnstile) { + turnstile_update_inheritor(kqwl->kqwl_turnstile, + TURNSTILE_INHERITOR_NULL, TURNSTILE_IMMEDIATE_UPDATE); + turnstile_update_inheritor_complete(kqwl->kqwl_turnstile, + TURNSTILE_INTERLOCK_HELD); + } - kn->kn_filtid = EVFILTID_KQREAD; - kqlock(kq); - KNOTE_ATTACH(&kqf->kqf_sel.si_note, kn); - /* indicate nesting in child, if needed */ - if (kq->kq_level == 0) { - kq->kq_level = 1; - } + kqr->tr_thread = THREAD_NULL; + kqr->tr_state = WORKQ_TR_STATE_IDLE; + kqwl->kqwl_state &= ~KQ_R2K_ARMED; +} - int count = kq->kq_count; - kqunlock(kq); - return count > 0; +static void +kqworkloop_unbind_delayed_override_drop(thread_t thread) +{ + struct uthread *ut = get_bsdthread_info(thread); + assert(ut->uu_kqr_bound == NULL); + if (ut->uu_kqueue_override != THREAD_QOS_UNSPECIFIED) { + thread_drop_servicer_override(thread); + ut->uu_kqueue_override = THREAD_QOS_UNSPECIFIED; } } /* - * kqueue_drain - called when kq is closed + * kqworkloop_unbind - Unbind the servicer thread of a workloop kqueue + * + * It will acknowledge events, and possibly request a new thread if: + * - there were active events left + * - we pended waitq hook callouts during processing + * - we pended wakeups while processing (or unsuppressing) + * + * Called with kqueue lock held. */ -/*ARGSUSED*/ -static int -kqueue_drain(struct fileproc *fp, __unused vfs_context_t ctx) +static void +kqworkloop_unbind(struct kqworkloop *kqwl) { - struct kqueue *kq = (struct kqueue *)fp->f_fglob->fg_data; + struct kqueue *kq = &kqwl->kqwl_kqueue; + workq_threadreq_t kqr = &kqwl->kqwl_request; + thread_t thread = kqr_thread_fast(kqr); + int op = KQWL_UTQ_PARKING; + kq_index_t qos_override = THREAD_QOS_UNSPECIFIED; - assert((kq->kq_state & KQ_WORKQ) == 0); + assert(thread == current_thread()); - kqlock(kq); - kq->kq_state |= KQ_DRAIN; - kqueue_interrupt(kq); - kqunlock(kq); - return 0; -} + kqlock(kqwl); -/*ARGSUSED*/ -int -kqueue_stat(struct kqueue *kq, void *ub, int isstat64, proc_t p) -{ - assert((kq->kq_state & KQ_WORKQ) == 0); + /* + * Forcing the KQ_PROCESSING flag allows for QoS updates because of + * unsuppressing knotes not to be applied until the eventual call to + * kqworkloop_update_threads_qos() below. + */ + assert((kq->kq_state & KQ_PROCESSING) == 0); + if (!TAILQ_EMPTY(&kqwl->kqwl_suppressed)) { + kq->kq_state |= KQ_PROCESSING; + qos_override = kqworkloop_acknowledge_events(kqwl); + kq->kq_state &= ~KQ_PROCESSING; + } - kqlock(kq); - if (isstat64 != 0) { - struct stat64 *sb64 = (struct stat64 *)ub; + kqworkloop_unbind_locked(kqwl, thread, KQWL_OVERRIDE_DROP_DELAYED); + kqworkloop_update_threads_qos(kqwl, op, qos_override); - bzero((void *)sb64, sizeof(*sb64)); - sb64->st_size = kq->kq_count; - if (kq->kq_state & KQ_KEV_QOS) { - sb64->st_blksize = sizeof(struct kevent_qos_s); - } else if (kq->kq_state & KQ_KEV64) { - sb64->st_blksize = sizeof(struct kevent64_s); - } else if (IS_64BIT_PROCESS(p)) { - sb64->st_blksize = sizeof(struct user64_kevent); - } else { - sb64->st_blksize = sizeof(struct user32_kevent); - } - sb64->st_mode = S_IFIFO; - } else { - struct stat *sb = (struct stat *)ub; + kqunlock(kqwl); - bzero((void *)sb, sizeof(*sb)); - sb->st_size = kq->kq_count; - if (kq->kq_state & KQ_KEV_QOS) { - sb->st_blksize = sizeof(struct kevent_qos_s); - } else if (kq->kq_state & KQ_KEV64) { - sb->st_blksize = sizeof(struct kevent64_s); - } else if (IS_64BIT_PROCESS(p)) { - sb->st_blksize = sizeof(struct user64_kevent); - } else { - sb->st_blksize = sizeof(struct user32_kevent); - } - sb->st_mode = S_IFIFO; - } - kqunlock(kq); - return 0; + /* + * Drop the override on the current thread last, after the call to + * kqworkloop_update_threads_qos above. + */ + kqworkloop_unbind_delayed_override_drop(thread); + + /* If last reference, dealloc the workloop kq */ + kqworkloop_release(kqwl); } -static inline bool -kqueue_threadreq_can_use_ast(struct kqueue *kq) -{ - if (current_proc() == kq->kq_p) { - /* - * Setting an AST from a non BSD syscall is unsafe: mach_msg_trap() can - * do combined send/receive and in the case of self-IPC, the AST may bet - * set on a thread that will not return to userspace and needs the - * thread the AST would create to unblock itself. - * - * At this time, we really want to target: - * - * - kevent variants that can cause thread creations, and dispatch - * really only uses kevent_qos and kevent_id, - * - * - workq_kernreturn (directly about thread creations) - * - * - bsdthread_ctl which is used for qos changes and has direct impact - * on the creator thread scheduling decisions. - */ - switch (current_uthread()->syscall_code) { - case SYS_kevent_qos: - case SYS_kevent_id: - case SYS_workq_kernreturn: - case SYS_bsdthread_ctl: - return true; - } - } - return false; +static thread_qos_t +kqworkq_unbind_locked(struct kqworkq *kqwq, + workq_threadreq_t kqr, thread_t thread) +{ + struct uthread *ut = get_bsdthread_info(thread); + kq_index_t old_override = kqr->tr_kq_override_index; + + KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQWQ_UNBIND), -1, + thread_tid(kqr_thread(kqr)), kqr->tr_kq_qos_index, 0); + + kqlock_held(kqwq); + + assert(ut->uu_kqr_bound == kqr); + ut->uu_kqr_bound = NULL; + kqr->tr_thread = THREAD_NULL; + kqr->tr_state = WORKQ_TR_STATE_IDLE; + kqr->tr_kq_override_index = THREAD_QOS_UNSPECIFIED; + kqwq->kqwq_state &= ~KQ_R2K_ARMED; + + return old_override; } /* - * Interact with the pthread kext to request a servicing there at a specific QoS - * level. - * - * - Caller holds the workq request lock + * kqworkq_unbind - unbind of a workq kqueue from a thread * - * - May be called with the kqueue's wait queue set locked, - * so cannot do anything that could recurse on that. + * We may have to request new threads. + * This can happen there are no waiting processing threads and: + * - there were active events we never got to (count > 0) + * - we pended waitq hook callouts during processing + * - we pended wakeups while processing (or unsuppressing) */ static void -kqueue_threadreq_initiate(struct kqueue *kq, struct kqrequest *kqr, - kq_index_t qos, int flags) +kqworkq_unbind(proc_t p, workq_threadreq_t kqr) { - assert(kqr->kqr_state & KQR_WAKEUP); - assert(kqr->kqr_thread == THREAD_NULL); - assert((kqr->kqr_state & KQR_THREQUESTED) == 0); - struct turnstile *ts = TURNSTILE_NULL; - - if (workq_is_exiting(kq->kq_p)) { - return; - } + struct kqworkq *kqwq = (struct kqworkq *)p->p_fd->fd_wqkqueue; + __assert_only int rc; - /* Add a thread request reference on the kqueue. */ - kqueue_retain(kq); + kqlock(kqwq); + rc = kqworkq_acknowledge_events(kqwq, kqr, 0, KQWQAE_UNBIND); + assert(rc == -1); + kqunlock(kqwq); +} - kq_req_held(kq); +workq_threadreq_t +kqworkq_get_request(struct kqworkq *kqwq, kq_index_t qos_index) +{ + assert(qos_index < KQWQ_NBUCKETS); + return &kqwq->kqwq_request[qos_index]; +} - if (kq->kq_state & KQ_WORKLOOP) { - __assert_only struct kqworkloop *kqwl = (struct kqworkloop *)kq; +static void +knote_reset_priority(kqueue_t kqu, struct knote *kn, pthread_priority_t pp) +{ + kq_index_t qos = _pthread_priority_thread_qos(pp); - assert(kqwl->kqwl_owner == THREAD_NULL); - KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQWL_THREQUEST), - kqwl->kqwl_dynamicid, 0, qos, kqr->kqr_state); - ts = kqwl->kqwl_turnstile; + if (kqu.kq->kq_state & KQ_WORKLOOP) { + assert((pp & _PTHREAD_PRIORITY_EVENT_MANAGER_FLAG) == 0); + pp = _pthread_priority_normalize(pp); + } else if (kqu.kq->kq_state & KQ_WORKQ) { + if (qos == THREAD_QOS_UNSPECIFIED) { + /* On workqueues, outside of QoS means MANAGER */ + qos = KQWQ_QOS_MANAGER; + pp = _PTHREAD_PRIORITY_EVENT_MANAGER_FLAG; + } else { + pp = _pthread_priority_normalize(pp); + } } else { - assert(kq->kq_state & KQ_WORKQ); - KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQWQ_THREQUEST), - -1, 0, qos, kqr->kqr_state); + pp = _pthread_unspecified_priority(); + qos = THREAD_QOS_UNSPECIFIED; + } + + kn->kn_qos = pp; + + if ((kn->kn_status & KN_MERGE_QOS) == 0 || qos > kn->kn_qos_override) { + /* Never lower QoS when in "Merge" mode */ + kn->kn_qos_override = qos; + } + + /* only adjust in-use qos index when not suppressed */ + if (kn->kn_status & KN_SUPPRESSED) { + kqueue_update_override(kqu, kn, qos); + } else if (kn->kn_qos_index != qos) { + knote_dequeue(kqu, kn); + kn->kn_qos_index = qos; } +} + +static void +knote_adjust_qos(struct kqueue *kq, struct knote *kn, int result) +{ + thread_qos_t qos_index = (result >> FILTER_ADJUST_EVENT_QOS_SHIFT) & 7; - kqr->kqr_state |= KQR_THREQUESTED; + kqlock_held(kq); + + assert(result & FILTER_ADJUST_EVENT_QOS_BIT); + assert(qos_index < THREAD_QOS_LAST); /* - * New-style thread request supported. - * Provide the pthread kext a pointer to a workq_threadreq_s structure for - * its use until a corresponding kqueue_threadreq_bind callback. + * Early exit for knotes that should not change QoS */ - if (kqueue_threadreq_can_use_ast(kq)) { - flags |= WORKQ_THREADREQ_SET_AST_ON_FAILURE; + if (__improbable(!knote_fops(kn)->f_adjusts_qos)) { + panic("filter %d cannot change QoS", kn->kn_filtid); + } else if (__improbable(!knote_has_qos(kn))) { + return; } - if (qos == KQWQ_QOS_MANAGER) { - qos = WORKQ_THREAD_QOS_MANAGER; + + /* + * knotes with the FALLBACK flag will only use their registration QoS if the + * incoming event has no QoS, else, the registration QoS acts as a floor. + */ + thread_qos_t req_qos = _pthread_priority_thread_qos_fast(kn->kn_qos); + if (kn->kn_qos & _PTHREAD_PRIORITY_FALLBACK_FLAG) { + if (qos_index == THREAD_QOS_UNSPECIFIED) { + qos_index = req_qos; + } + } else { + if (qos_index < req_qos) { + qos_index = req_qos; + } } - if (!workq_kern_threadreq_initiate(kq->kq_p, kqr, ts, qos, flags)) { + if ((kn->kn_status & KN_MERGE_QOS) && (qos_index < kn->kn_qos_override)) { + /* Never lower QoS when in "Merge" mode */ + return; + } + + if ((kn->kn_status & KN_LOCKED) && (kn->kn_status & KN_POSTING)) { /* - * Process is shutting down or exec'ing. - * All the kqueues are going to be cleaned up - * soon. Forget we even asked for a thread - - * and make sure we don't ask for more. + * When we're trying to update the QoS override and that both an + * f_event() and other f_* calls are running concurrently, any of these + * in flight calls may want to perform overrides that aren't properly + * serialized with each other. + * + * The first update that observes this racy situation enters a "Merge" + * mode which causes subsequent override requests to saturate the + * override instead of replacing its value. + * + * This mode is left when knote_unlock() or knote_post() + * observe that no other f_* routine is in flight. */ - kqr->kqr_state &= ~(KQR_THREQUESTED | KQR_R2K_NOTIF_ARMED); - kqueue_release(kq, KQUEUE_CANT_BE_LAST_REF); + kn->kn_status |= KN_MERGE_QOS; } -} -/* - * kqueue_threadreq_bind_prepost - prepost the bind to kevent - * - * This is used when kqueue_threadreq_bind may cause a lock inversion. - */ -void -kqueue_threadreq_bind_prepost(struct proc *p __unused, workq_threadreq_t req, - thread_t thread) -{ - struct kqrequest *kqr = __container_of(req, struct kqrequest, kqr_req); - struct uthread *ut = get_bsdthread_info(thread); + /* + * Now apply the override if it changed. + */ - req->tr_binding_thread = thread; - ut->uu_kqr_bound = kqr; - req->tr_state = TR_STATE_BINDING; + if (kn->kn_qos_override == qos_index) { + return; + } - struct kqworkloop *kqwl = kqr_kqworkloop(kqr); - if (kqwl && kqwl->kqwl_turnstile) { - struct turnstile *ts = kqwl->kqwl_turnstile; + kn->kn_qos_override = qos_index; + + if (kn->kn_status & KN_SUPPRESSED) { /* - * While a thread request is in flight, the workqueue - * is the interlock for the turnstile and can update the inheritor. + * For suppressed events, the kn_qos_index field cannot be touched as it + * allows us to know on which supress queue the knote is for a kqworkq. + * + * Also, there's no natural push applied on the kqueues when this field + * changes anyway. We hence need to apply manual overrides in this case, + * which will be cleared when the events are later acknowledged. */ - turnstile_update_inheritor(ts, thread, TURNSTILE_IMMEDIATE_UPDATE | - TURNSTILE_INHERITOR_THREAD); - turnstile_update_inheritor_complete(ts, TURNSTILE_INTERLOCK_HELD); + kqueue_update_override(kq, kn, qos_index); + } else if (kn->kn_qos_index != qos_index) { + knote_dequeue(kq, kn); + kn->kn_qos_index = qos_index; } } /* - * kqueue_threadreq_bind_commit - commit a bind prepost + * Called back from waitq code when no threads waiting and the hook was set. * - * The workq code has to commit any binding prepost before the thread has - * a chance to come back to userspace (and do kevent syscalls) or be aborted. + * Preemption is disabled - minimal work can be done in this context!!! */ void -kqueue_threadreq_bind_commit(struct proc *p, thread_t thread) +waitq_set__CALLING_PREPOST_HOOK__(waitq_set_prepost_hook_t *kq_hook) { - struct uthread *ut = get_bsdthread_info(thread); - struct kqrequest *kqr = ut->uu_kqr_bound; - kqueue_t kqu = kqr_kqueue(p, kqr); + kqueue_t kqu; - kq_req_lock(kqu); - if (kqr->kqr_req.tr_state == TR_STATE_BINDING) { - kqueue_threadreq_bind(p, &kqr->kqr_req, thread, 0); + kqu.kq = __container_of(kq_hook, struct kqueue, kq_waitq_hook); + assert(kqu.kq->kq_state & (KQ_WORKQ | KQ_WORKLOOP)); + + kqlock(kqu); + + if (kqu.kq->kq_count > 0) { + if (kqu.kq->kq_state & KQ_WORKLOOP) { + kqworkloop_wakeup(kqu.kqwl, KQWL_BUCKET_STAYACTIVE); + } else { + kqworkq_wakeup(kqu.kqwq, KQWQ_QOS_MANAGER); + } } - kq_req_unlock(kqu); + + kqunlock(kqu); } -static void -kqueue_threadreq_modify(struct kqueue *kq, struct kqrequest *kqr, kq_index_t qos) +void +klist_init(struct klist *list) { - assert(kqr->kqr_state & KQR_THREQUESTED); - assert(kqr->kqr_thread == THREAD_NULL); - - kq_req_held(kq); - - int flags = 0; - if (kqueue_threadreq_can_use_ast(kq)) { - flags |= WORKQ_THREADREQ_SET_AST_ON_FAILURE; - } - workq_kern_threadreq_modify(kq->kq_p, kqr, qos, flags); + SLIST_INIT(list); } + /* - * kqueue_threadreq_bind - bind thread to processing kqrequest + * Query/Post each knote in the object's list * - * The provided thread will be responsible for delivering events - * associated with the given kqrequest. Bind it and get ready for - * the thread to eventually arrive. + * The object lock protects the list. It is assumed + * that the filter/event routine for the object can + * determine that the object is already locked (via + * the hint) and not deadlock itself. + * + * The object lock should also hold off pending + * detach/drop operations. */ void -kqueue_threadreq_bind(struct proc *p, workq_threadreq_t req, thread_t thread, - unsigned int flags) +knote(struct klist *list, long hint) { - struct kqrequest *kqr = __container_of(req, struct kqrequest, kqr_req); - kqueue_t kqu = kqr_kqueue(p, kqr); - struct uthread *ut = get_bsdthread_info(thread); - - kq_req_held(kqu); - - assert(kqr->kqr_state & KQR_THREQUESTED); - assert(kqr->kqr_thread == THREAD_NULL); - assert(ut->uu_kqueue_override == 0); + struct knote *kn; - if (kqr->kqr_req.tr_state == TR_STATE_BINDING) { - assert(ut->uu_kqr_bound == kqr); - assert(kqr->kqr_req.tr_binding_thread == thread); - kqr->kqr_req.tr_state = TR_STATE_IDLE; - kqr->kqr_req.tr_binding_thread = NULL; - } else { - assert(ut->uu_kqr_bound == NULL); + SLIST_FOREACH(kn, list, kn_selnext) { + knote_post(kn, hint); } +} - ut->uu_kqr_bound = kqr; - kqr->kqr_thread = thread; +/* + * attach a knote to the specified list. Return true if this is the first entry. + * The list is protected by whatever lock the object it is associated with uses. + */ +int +knote_attach(struct klist *list, struct knote *kn) +{ + int ret = SLIST_EMPTY(list); + SLIST_INSERT_HEAD(list, kn, kn_selnext); + return ret; +} - if (kqu.kq->kq_state & KQ_WORKLOOP) { - struct turnstile *ts = kqu.kqwl->kqwl_turnstile; +/* + * detach a knote from the specified list. Return true if that was the last entry. + * The list is protected by whatever lock the object it is associated with uses. + */ +int +knote_detach(struct klist *list, struct knote *kn) +{ + SLIST_REMOVE(list, kn, knote, kn_selnext); + return SLIST_EMPTY(list); +} - if (__improbable(thread == kqu.kqwl->kqwl_owner)) { - /* - * shows that asserting here is not ok. - * - * This is not supposed to happen for correct use of the interface, - * but it is sadly possible for userspace (with the help of memory - * corruption, such as over-release of a dispatch queue) to make - * the creator thread the "owner" of a workloop. - * - * Once that happens, and that creator thread picks up the same - * workloop as a servicer, we trip this codepath. We need to fixup - * the state to forget about this thread being the owner, as the - * entire workloop state machine expects servicers to never be - * owners and everything would basically go downhill from here. - */ - kqu.kqwl->kqwl_owner = THREAD_NULL; - if (kqworkloop_owner_override(kqu.kqwl)) { - thread_drop_ipc_override(thread); - } - thread_ends_owning_workloop(thread); - } +/* + * knote_vanish - Indicate that the source has vanished + * + * If the knote has requested EV_VANISHED delivery, + * arrange for that. Otherwise, deliver a NOTE_REVOKE + * event for backward compatibility. + * + * The knote is marked as having vanished, but is not + * actually detached from the source in this instance. + * The actual detach is deferred until the knote drop. + * + * Our caller already has the object lock held. Calling + * the detach routine would try to take that lock + * recursively - which likely is not supported. + */ +void +knote_vanish(struct klist *list, bool make_active) +{ + struct knote *kn; + struct knote *kn_next; - if (ts && (flags & KQUEUE_THREADERQ_BIND_NO_INHERITOR_UPDATE) == 0) { + SLIST_FOREACH_SAFE(kn, list, kn_selnext, kn_next) { + struct kqueue *kq = knote_get_kq(kn); + + kqlock(kq); + if (__probable(kn->kn_status & KN_REQVANISH)) { /* - * Past this point, the interlock is the kq req lock again, - * so we can fix the inheritor for good. + * If EV_VANISH supported - prepare to deliver one */ - filt_wlupdate_inheritor(kqu.kqwl, ts, TURNSTILE_IMMEDIATE_UPDATE); - turnstile_update_inheritor_complete(ts, TURNSTILE_INTERLOCK_HELD); + kn->kn_status |= KN_VANISHED; + } else { + /* + * Handle the legacy way to indicate that the port/portset was + * deallocated or left the current Mach portspace (modern technique + * is with an EV_VANISHED protocol). + * + * Deliver an EV_EOF event for these changes (hopefully it will get + * delivered before the port name recycles to the same generation + * count and someone tries to re-register a kevent for it or the + * events are udata-specific - avoiding a conflict). + */ + kn->kn_flags |= EV_EOF | EV_ONESHOT; } - - KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQWL_BIND), kqu.kqwl->kqwl_dynamicid, - thread_tid(thread), kqr->kqr_qos_index, - (kqr->kqr_override_index << 16) | kqr->kqr_state); - - ut->uu_kqueue_override = kqr->kqr_override_index; - if (kqr->kqr_override_index) { - thread_add_ipc_override(thread, kqr->kqr_override_index); + if (make_active) { + knote_activate(kq, kn, FILTER_ACTIVE); } - } else { - assert(kqr->kqr_override_index == 0); - - KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQWQ_BIND), -1, - thread_tid(thread), kqr->kqr_qos_index, - (kqr->kqr_override_index << 16) | kqr->kqr_state); + kqunlock(kq); } } /* - * kqueue_threadreq_cancel - abort a pending thread request + * Force a lazy allocation of the waitqset link + * of the kq_wqs associated with the kn + * if it wasn't already allocated. * - * Called when exiting/exec'ing. Forget our pending request. + * This allows knote_link_waitq to never block + * if reserved_link is not NULL. */ void -kqueue_threadreq_cancel(struct proc *p, workq_threadreq_t req) +knote_link_waitqset_lazy_alloc(struct knote *kn) { - struct kqrequest *kqr = __container_of(req, struct kqrequest, kqr_req); - kqueue_t kqu = kqr_kqueue(p, kqr); - - kq_req_lock(kqu); - - assert(kqr->kqr_thread == THREAD_NULL); - assert(kqr->kqr_state & KQR_THREQUESTED); - kqr->kqr_state &= ~(KQR_THREQUESTED | KQR_R2K_NOTIF_ARMED); - - kq_req_unlock(kqu); - - kqueue_release_last(p, kqu); /* may dealloc kqu */ + struct kqueue *kq = knote_get_kq(kn); + waitq_set_lazy_init_link(&kq->kq_wqs); } -workq_threadreq_param_t -kqueue_threadreq_workloop_param(workq_threadreq_t req) +/* + * Check if a lazy allocation for the waitqset link + * of the kq_wqs is needed. + */ +boolean_t +knote_link_waitqset_should_lazy_alloc(struct knote *kn) { - struct kqrequest *kqr = __container_of(req, struct kqrequest, kqr_req); - struct kqworkloop *kqwl; - workq_threadreq_param_t trp; - - assert(kqr->kqr_state & KQR_WORKLOOP); - kqwl = __container_of(kqr, struct kqworkloop, kqwl_request); - trp.trp_value = kqwl->kqwl_params; - return trp; + struct kqueue *kq = knote_get_kq(kn); + return waitq_set_should_lazy_init_link(&kq->kq_wqs); } /* - * kqueue_threadreq_unbind - unbind thread from processing kqueue + * For a given knote, link a provided wait queue directly with the kqueue. + * Wakeups will happen via recursive wait queue support. But nothing will move + * the knote to the active list at wakeup (nothing calls knote()). Instead, + * we permanently enqueue them here. * - * End processing the per-QoS bucket of events and allow other threads - * to be requested for future servicing. + * kqueue and knote references are held by caller. + * waitq locked by caller. * - * caller holds a reference on the kqueue. + * caller provides the wait queue link structure and insures that the kq->kq_wqs + * is linked by previously calling knote_link_waitqset_lazy_alloc. */ -void -kqueue_threadreq_unbind(struct proc *p, struct kqrequest *kqr) +int +knote_link_waitq(struct knote *kn, struct waitq *wq, uint64_t *reserved_link) { - if (kqr->kqr_state & KQR_WORKLOOP) { - kqworkloop_unbind(p, kqr_kqworkloop(kqr)); + struct kqueue *kq = knote_get_kq(kn); + kern_return_t kr; + + kr = waitq_link(wq, &kq->kq_wqs, WAITQ_ALREADY_LOCKED, reserved_link); + if (kr == KERN_SUCCESS) { + knote_markstayactive(kn); + return 0; } else { - kqworkq_unbind(p, kqr); + return EINVAL; } } /* - * If we aren't already busy processing events [for this QoS], - * request workq thread support as appropriate. + * Unlink the provided wait queue from the kqueue associated with a knote. + * Also remove it from the magic list of directly attached knotes. * - * TBD - for now, we don't segregate out processing by QoS. + * Note that the unlink may have already happened from the other side, so + * ignore any failures to unlink and just remove it from the kqueue list. * - * - May be called with the kqueue's wait queue set locked, - * so cannot do anything that could recurse on that. + * On success, caller is responsible for the link structure */ -static void -kqworkq_request_help(struct kqworkq *kqwq, kq_index_t qos_index) +int +knote_unlink_waitq(struct knote *kn, struct waitq *wq) { - struct kqrequest *kqr; - - /* convert to thread qos value */ - assert(qos_index < KQWQ_NBUCKETS); - - kq_req_lock(kqwq); - kqr = kqworkq_get_request(kqwq, qos_index); - - if ((kqr->kqr_state & KQR_WAKEUP) == 0) { - kqr->kqr_state |= KQR_WAKEUP; - if ((kqr->kqr_state & KQR_THREQUESTED) == 0) { - kqueue_threadreq_initiate(&kqwq->kqwq_kqueue, kqr, qos_index, 0); - } - } - kq_req_unlock(kqwq); -} + struct kqueue *kq = knote_get_kq(kn); + kern_return_t kr; -static kq_index_t -kqworkloop_owner_override(struct kqworkloop *kqwl) -{ - struct kqrequest *kqr = &kqwl->kqwl_request; - return MAX(kqr->kqr_qos_index, kqr->kqr_override_index); + kr = waitq_unlink(wq, &kq->kq_wqs); + knote_clearstayactive(kn); + return (kr != KERN_SUCCESS) ? EINVAL : 0; } -static inline void -kqworkloop_request_fire_r2k_notification(struct kqworkloop *kqwl) +/* + * remove all knotes referencing a specified fd + * + * Entered with the proc_fd lock already held. + * It returns the same way, but may drop it temporarily. + */ +void +knote_fdclose(struct proc *p, int fd) { - struct kqrequest *kqr = &kqwl->kqwl_request; - - kq_req_held(kqwl); - - if (kqr->kqr_state & KQR_R2K_NOTIF_ARMED) { - assert(kqr->kqr_thread); - kqr->kqr_state &= ~KQR_R2K_NOTIF_ARMED; - act_set_astkevent(kqr->kqr_thread, AST_KEVENT_RETURN_TO_KERNEL); - } -} + struct klist *list; + struct knote *kn; + KNOTE_LOCK_CTX(knlc); -static void -kqworkloop_update_threads_qos(struct kqworkloop *kqwl, int op, kq_index_t qos) -{ - struct kqrequest *kqr = &kqwl->kqwl_request; - struct kqueue *kq = &kqwl->kqwl_kqueue; - kq_index_t old_owner_override = kqworkloop_owner_override(kqwl); - kq_index_t i; +restart: + list = &p->p_fd->fd_knlist[fd]; + SLIST_FOREACH(kn, list, kn_link) { + struct kqueue *kq = knote_get_kq(kn); - /* must hold the kqr lock */ - kq_req_held(kqwl); + kqlock(kq); - switch (op) { - case KQWL_UTQ_UPDATE_WAKEUP_QOS: - if (qos == KQWL_BUCKET_STAYACTIVE) { - /* - * the KQWL_BUCKET_STAYACTIVE is not a QoS bucket, we only remember - * a high watermark (kqr_stayactive_qos) of any stay active knote - * that was ever registered with this workloop. - * - * When waitq_set__CALLING_PREPOST_HOOK__() wakes up any stay active - * knote, we use this high-watermark as a wakeup-index, and also set - * the magic KQWL_BUCKET_STAYACTIVE bit to make sure we remember - * there is at least one stay active knote fired until the next full - * processing of this bucket. - */ - kqr->kqr_wakeup_indexes |= KQWL_STAYACTIVE_FIRED_BIT; - qos = kqr->kqr_stayactive_qos; - assert(qos); + if (kq->kq_p != p) { + panic("%s: proc mismatch (kq->kq_p=%p != p=%p)", + __func__, kq->kq_p, p); } - if (kqr->kqr_wakeup_indexes & (1 << qos)) { - assert(kqr->kqr_state & KQR_WAKEUP); - break; + + /* + * If the knote supports EV_VANISHED delivery, + * transition it to vanished mode (or skip over + * it if already vanished). + */ + if (kn->kn_status & KN_VANISHED) { + kqunlock(kq); + continue; } - kqr->kqr_wakeup_indexes |= (1 << qos); - kqr->kqr_state |= KQR_WAKEUP; - kqworkloop_request_fire_r2k_notification(kqwl); - goto recompute; + proc_fdunlock(p); + if (!knote_lock(kq, kn, &knlc, KNOTE_KQ_LOCK_ON_SUCCESS)) { + /* the knote was dropped by someone, nothing to do */ + } else if (kn->kn_status & KN_REQVANISH) { + kn->kn_status |= KN_VANISHED; - case KQWL_UTQ_UPDATE_STAYACTIVE_QOS: - assert(qos); - if (kqr->kqr_stayactive_qos < qos) { - kqr->kqr_stayactive_qos = qos; - if (kqr->kqr_wakeup_indexes & KQWL_STAYACTIVE_FIRED_BIT) { - assert(kqr->kqr_state & KQR_WAKEUP); - kqr->kqr_wakeup_indexes |= (1 << qos); - goto recompute; + kqunlock(kq); + knote_fops(kn)->f_detach(kn); + if (kn->kn_is_fd) { + fp_drop(p, kn->kn_id, kn->kn_fp, 0); } - } - break; + kn->kn_filtid = EVFILTID_DETACHED; + kqlock(kq); - case KQWL_UTQ_PARKING: - case KQWL_UTQ_UNBINDING: - kqr->kqr_override_index = qos; - /* FALLTHROUGH */ - case KQWL_UTQ_RECOMPUTE_WAKEUP_QOS: - if (op == KQWL_UTQ_RECOMPUTE_WAKEUP_QOS) { - assert(qos == THREAD_QOS_UNSPECIFIED); - } - kqlock_held(kqwl); // to look at kq_queues - i = KQWL_BUCKET_STAYACTIVE; - if (TAILQ_EMPTY(&kqr->kqr_suppressed)) { - kqr->kqr_override_index = THREAD_QOS_UNSPECIFIED; - } - if (!TAILQ_EMPTY(&kqwl->kqwl_queue[i]) && - (kqr->kqr_wakeup_indexes & KQWL_STAYACTIVE_FIRED_BIT)) { - /* - * If the KQWL_STAYACTIVE_FIRED_BIT is set, it means a stay active - * knote may have fired, so we need to merge in kqr_stayactive_qos. - * - * Unlike other buckets, this one is never empty but could be idle. - */ - kqr->kqr_wakeup_indexes &= KQWL_STAYACTIVE_FIRED_BIT; - kqr->kqr_wakeup_indexes |= (1 << kqr->kqr_stayactive_qos); - } else { - kqr->kqr_wakeup_indexes = 0; - } - for (i = THREAD_QOS_UNSPECIFIED + 1; i < KQWL_BUCKET_STAYACTIVE; i++) { - if (!TAILQ_EMPTY(&kqwl->kqwl_queue[i])) { - kqr->kqr_wakeup_indexes |= (1 << i); - } - } - if (kqr->kqr_wakeup_indexes) { - kqr->kqr_state |= KQR_WAKEUP; - kqworkloop_request_fire_r2k_notification(kqwl); + knote_activate(kq, kn, FILTER_ACTIVE); + knote_unlock(kq, kn, &knlc, KNOTE_KQ_UNLOCK); } else { - kqr->kqr_state &= ~KQR_WAKEUP; - } - goto recompute; - - case KQWL_UTQ_RESET_WAKEUP_OVERRIDE: - kqr->kqr_override_index = qos; - goto recompute; - - case KQWL_UTQ_UPDATE_WAKEUP_OVERRIDE: -recompute: - /* - * When modifying the wakeup QoS or the override QoS, we always need to - * maintain our invariant that kqr_override_index is at least as large - * as the highest QoS for which an event is fired. - * - * However this override index can be larger when there is an overriden - * suppressed knote pushing on the kqueue. - */ - if (kqr->kqr_wakeup_indexes > (1 << qos)) { - qos = fls(kqr->kqr_wakeup_indexes) - 1; /* fls is 1-based */ - } - if (kqr->kqr_override_index < qos) { - kqr->kqr_override_index = qos; + knote_drop(kq, kn, &knlc); } - break; - - case KQWL_UTQ_REDRIVE_EVENTS: - break; - case KQWL_UTQ_SET_QOS_INDEX: - kqr->kqr_qos_index = qos; - break; - - default: - panic("unknown kqwl thread qos update operation: %d", op); + proc_fdlock(p); + goto restart; } +} - thread_t kqwl_owner = kqwl->kqwl_owner; - thread_t servicer = kqr->kqr_thread; - boolean_t qos_changed = FALSE; - kq_index_t new_owner_override = kqworkloop_owner_override(kqwl); +/* + * knote_fdfind - lookup a knote in the fd table for process + * + * If the filter is file-based, lookup based on fd index. + * Otherwise use a hash based on the ident. + * + * Matching is based on kq, filter, and ident. Optionally, + * it may also be based on the udata field in the kevent - + * allowing multiple event registration for the file object + * per kqueue. + * + * fd_knhashlock or fdlock held on entry (and exit) + */ +static struct knote * +knote_fdfind(struct kqueue *kq, + const struct kevent_internal_s *kev, + bool is_fd, + struct proc *p) +{ + struct filedesc *fdp = p->p_fd; + struct klist *list = NULL; + struct knote *kn = NULL; /* - * Apply the diffs to the owner if applicable + * determine where to look for the knote */ - if (kqwl_owner) { -#if 0 - /* JMM - need new trace hooks for owner overrides */ - KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQWL_THADJUST), - kqwl->kqwl_dynamicid, thread_tid(kqwl_owner), kqr->kqr_qos_index, - (kqr->kqr_override_index << 16) | kqr->kqr_state); -#endif - if (new_owner_override == old_owner_override) { - // nothing to do - } else if (old_owner_override == THREAD_QOS_UNSPECIFIED) { - thread_add_ipc_override(kqwl_owner, new_owner_override); - } else if (new_owner_override == THREAD_QOS_UNSPECIFIED) { - thread_drop_ipc_override(kqwl_owner); - } else { /* old_owner_override != new_owner_override */ - thread_update_ipc_override(kqwl_owner, new_owner_override); + if (is_fd) { + /* fd-based knotes are linked off the fd table */ + if (kev->kei_ident < (u_int)fdp->fd_knlistsize) { + list = &fdp->fd_knlist[kev->kei_ident]; } + } else if (fdp->fd_knhashmask != 0) { + /* hash non-fd knotes here too */ + list = &fdp->fd_knhash[KN_HASH((u_long)kev->kei_ident, fdp->fd_knhashmask)]; } /* - * apply the diffs to the servicer + * scan the selected list looking for a match */ - if ((kqr->kqr_state & KQR_THREQUESTED) == 0) { - /* - * No servicer, nor thread-request - * - * Make a new thread request, unless there is an owner (or the workloop - * is suspended in userland) or if there is no asynchronous work in the - * first place. - */ - - if (kqwl_owner == NULL && (kqr->kqr_state & KQR_WAKEUP)) { - int initiate_flags = 0; - if (op == KQWL_UTQ_UNBINDING) { - initiate_flags = WORKQ_THREADREQ_ATTEMPT_REBIND; - } - kqueue_threadreq_initiate(kq, kqr, new_owner_override, - initiate_flags); - } - } else if (servicer) { - /* - * Servicer in flight - * - * Just apply the diff to the servicer - */ - struct uthread *ut = get_bsdthread_info(servicer); - if (ut->uu_kqueue_override != kqr->kqr_override_index) { - if (ut->uu_kqueue_override == THREAD_QOS_UNSPECIFIED) { - thread_add_ipc_override(servicer, kqr->kqr_override_index); - } else if (kqr->kqr_override_index == THREAD_QOS_UNSPECIFIED) { - thread_drop_ipc_override(servicer); - } else { /* ut->uu_kqueue_override != kqr->kqr_override_index */ - thread_update_ipc_override(servicer, kqr->kqr_override_index); + if (list != NULL) { + SLIST_FOREACH(kn, list, kn_link) { + if (kq == knote_get_kq(kn) && + kev->kei_ident == kn->kn_id && + kev->kei_filter == kn->kn_filter) { + if (kev->kei_flags & EV_UDATA_SPECIFIC) { + if ((kn->kn_flags & EV_UDATA_SPECIFIC) && + kev->kei_udata == kn->kn_udata) { + break; /* matching udata-specific knote */ + } + } else if ((kn->kn_flags & EV_UDATA_SPECIFIC) == 0) { + break; /* matching non-udata-specific knote */ + } } - ut->uu_kqueue_override = kqr->kqr_override_index; - qos_changed = TRUE; } - } else if (new_owner_override == THREAD_QOS_UNSPECIFIED) { - /* - * No events to deliver anymore. - * - * However canceling with turnstiles is challenging, so the fact that - * the request isn't useful will be discovered by the servicer himself - * later on. - */ - } else if (old_owner_override != new_owner_override) { - /* - * Request is in flight - * - * Apply the diff to the thread request - */ - kqueue_threadreq_modify(kq, kqr, new_owner_override); - qos_changed = TRUE; - } - - if (qos_changed) { - KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQWL_THADJUST), kqwl->kqwl_dynamicid, - thread_tid(kqr->kqr_thread), kqr->kqr_qos_index, - (kqr->kqr_override_index << 16) | kqr->kqr_state); } + return kn; } -static void -kqworkloop_request_help(struct kqworkloop *kqwl, kq_index_t qos_index) +/* + * kq_add_knote- Add knote to the fd table for process + * while checking for duplicates. + * + * All file-based filters associate a list of knotes by file + * descriptor index. All other filters hash the knote by ident. + * + * May have to grow the table of knote lists to cover the + * file descriptor index presented. + * + * fd_knhashlock and fdlock unheld on entry (and exit). + * + * Takes a rwlock boost if inserting the knote is successful. + */ +static int +kq_add_knote(struct kqueue *kq, struct knote *kn, struct knote_lock_ctx *knlc, + struct proc *p) { - /* convert to thread qos value */ - assert(qos_index < KQWL_NBUCKETS); - - kq_req_lock(kqwl); - kqworkloop_update_threads_qos(kqwl, KQWL_UTQ_UPDATE_WAKEUP_QOS, qos_index); - kq_req_unlock(kqwl); -} + struct filedesc *fdp = p->p_fd; + struct klist *list = NULL; + int ret = 0; + bool is_fd = kn->kn_is_fd; -static struct kqtailq * -kqueue_get_queue(struct kqueue *kq, kq_index_t qos_index) -{ - if (kq->kq_state & KQ_WORKQ) { - assert(qos_index < KQWQ_NBUCKETS); - } else if (kq->kq_state & KQ_WORKLOOP) { - assert(qos_index < KQWL_NBUCKETS); + if (is_fd) { + proc_fdlock(p); } else { - assert(qos_index == QOS_INDEX_KQFILE); + knhash_lock(fdp); } - static_assert(offsetof(struct kqueue, kq_queue) == sizeof(struct kqueue), - "struct kqueue::kq_queue must be exactly at the end"); - return &kq->kq_queue[qos_index]; -} - -static int -kqueue_queue_empty(struct kqueue *kq, kq_index_t qos_index) -{ - return TAILQ_EMPTY(kqueue_get_queue(kq, qos_index)); -} -static struct kqtailq * -kqueue_get_suppressed_queue(kqueue_t kq, struct knote *kn) -{ - if (kq.kq->kq_state & KQ_WORKQ) { - return &kqworkq_get_request(kq.kqwq, kn->kn_qos_index)->kqr_suppressed; - } else if (kq.kq->kq_state & KQ_WORKLOOP) { - return &kq.kqwl->kqwl_request.kqr_suppressed; - } else { - return &kq.kqf->kqf_suppressed; + if (knote_fdfind(kq, &kn->kn_kevent, is_fd, p) != NULL) { + /* found an existing knote: we can't add this one */ + ret = ERESTART; + goto out_locked; } -} -static struct turnstile * -kqueue_get_turnstile(kqueue_t kqu, bool can_alloc) -{ - uint8_t kqr_state; + /* knote was not found: add it now */ + if (!is_fd) { + if (fdp->fd_knhashmask == 0) { + u_long size = 0; - if ((kqu.kq->kq_state & KQ_WORKLOOP) == 0) { - return TURNSTILE_NULL; - } + list = hashinit(CONFIG_KN_HASHSIZE, M_KQUEUE, &size); + if (list == NULL) { + ret = ENOMEM; + goto out_locked; + } - kqr_state = os_atomic_load(&kqu.kqwl->kqwl_request.kqr_state, relaxed); - if (kqr_state & KQR_ALLOCATED_TURNSTILE) { - /* force a dependency to pair with the atomic or with release below */ - return os_atomic_load_with_dependency_on(&kqu.kqwl->kqwl_turnstile, - kqr_state); - } + fdp->fd_knhash = list; + fdp->fd_knhashmask = size; + } - if (!can_alloc) { - return TURNSTILE_NULL; - } + list = &fdp->fd_knhash[KN_HASH(kn->kn_id, fdp->fd_knhashmask)]; + SLIST_INSERT_HEAD(list, kn, kn_link); + ret = 0; + goto out_locked; + } else { + /* knote is fd based */ - struct turnstile *ts = turnstile_alloc(), *free_ts = TURNSTILE_NULL; + if ((u_int)fdp->fd_knlistsize <= kn->kn_id) { + u_int size = 0; - kq_req_lock(kqu); - if (filt_wlturnstile_interlock_is_workq(kqu.kqwl)) { - workq_kern_threadreq_lock(kqu.kqwl->kqwl_p); - } + if (kn->kn_id >= (uint64_t)p->p_rlimit[RLIMIT_NOFILE].rlim_cur + || kn->kn_id >= (uint64_t)maxfiles) { + ret = EINVAL; + goto out_locked; + } + /* have to grow the fd_knlist */ + size = fdp->fd_knlistsize; + while (size <= kn->kn_id) { + size += KQEXTENT; + } - if (kqu.kqwl->kqwl_request.kqr_state & KQR_ALLOCATED_TURNSTILE) { - free_ts = ts; - ts = kqu.kqwl->kqwl_turnstile; - } else { - ts = turnstile_prepare((uintptr_t)kqu.kqwl, &kqu.kqwl->kqwl_turnstile, - ts, TURNSTILE_WORKLOOPS); + if (size >= (UINT_MAX / sizeof(struct klist *))) { + ret = EINVAL; + goto out_locked; + } - /* release-barrier to pair with the unlocked load of kqwl_turnstile above */ - os_atomic_or(&kqu.kqwl->kqwl_request.kqr_state, - KQR_ALLOCATED_TURNSTILE, release); - } + MALLOC(list, struct klist *, + size * sizeof(struct klist *), M_KQUEUE, M_WAITOK); + if (list == NULL) { + ret = ENOMEM; + goto out_locked; + } - if (filt_wlturnstile_interlock_is_workq(kqu.kqwl)) { - workq_kern_threadreq_unlock(kqu.kqwl->kqwl_p); - } - kq_req_unlock(kqu.kqwl); + bcopy((caddr_t)fdp->fd_knlist, (caddr_t)list, + fdp->fd_knlistsize * sizeof(struct klist *)); + bzero((caddr_t)list + + fdp->fd_knlistsize * sizeof(struct klist *), + (size - fdp->fd_knlistsize) * sizeof(struct klist *)); + FREE(fdp->fd_knlist, M_KQUEUE); + fdp->fd_knlist = list; + fdp->fd_knlistsize = size; + } - if (free_ts) { - turnstile_deallocate(free_ts); + list = &fdp->fd_knlist[kn->kn_id]; + SLIST_INSERT_HEAD(list, kn, kn_link); + ret = 0; + goto out_locked; } - return ts; -} -struct turnstile * -kqueue_turnstile(struct kqueue *kq) -{ - return kqueue_get_turnstile(kq, false); -} - -struct turnstile * -kqueue_alloc_turnstile(struct kqueue *kq) -{ - return kqueue_get_turnstile(kq, true); -} +out_locked: + if (ret == 0) { + kqlock(kq); + assert((kn->kn_status & KN_LOCKED) == 0); + (void)knote_lock(kq, kn, knlc, KNOTE_KQ_UNLOCK); + kqueue_retain(kq); /* retain a kq ref */ + } + if (is_fd) { + proc_fdunlock(p); + } else { + knhash_unlock(fdp); + } -static struct kqtailq * -knote_get_queue(struct knote *kn) -{ - return kqueue_get_queue(knote_get_kq(kn), kn->kn_qos_index); + return ret; } +/* + * kq_remove_knote - remove a knote from the fd table for process + * + * If the filter is file-based, remove based on fd index. + * Otherwise remove from the hash based on the ident. + * + * fd_knhashlock and fdlock unheld on entry (and exit). + */ static void -knote_reset_priority(struct knote *kn, pthread_priority_t pp) +kq_remove_knote(struct kqueue *kq, struct knote *kn, struct proc *p, + struct knote_lock_ctx *knlc) { - struct kqueue *kq = knote_get_kq(kn); - kq_index_t qos = _pthread_priority_thread_qos(pp); - - assert((kn->kn_status & KN_QUEUED) == 0); + struct filedesc *fdp = p->p_fd; + struct klist *list = NULL; + uint16_t kq_state; + bool is_fd = kn->kn_is_fd; - if (kq->kq_state & KQ_WORKQ) { - if (qos == THREAD_QOS_UNSPECIFIED) { - /* On workqueues, outside of QoS means MANAGER */ - qos = KQWQ_QOS_MANAGER; - pp = _PTHREAD_PRIORITY_EVENT_MANAGER_FLAG; - } else { - pp = _pthread_priority_normalize(pp); - } - } else if (kq->kq_state & KQ_WORKLOOP) { - assert((pp & _PTHREAD_PRIORITY_EVENT_MANAGER_FLAG) == 0); - pp = _pthread_priority_normalize(pp); + if (is_fd) { + proc_fdlock(p); } else { - pp = _pthread_unspecified_priority(); - qos = THREAD_QOS_UNSPECIFIED; + knhash_lock(fdp); } - kn->kn_qos = pp; - kn->kn_req_index = qos; + if (is_fd) { + assert((u_int)fdp->fd_knlistsize > kn->kn_id); + list = &fdp->fd_knlist[kn->kn_id]; + } else { + list = &fdp->fd_knhash[KN_HASH(kn->kn_id, fdp->fd_knhashmask)]; + } + SLIST_REMOVE(list, kn, knote, kn_link); - if ((kn->kn_status & KN_MERGE_QOS) == 0 || qos > kn->kn_qos_override) { - /* Never lower QoS when in "Merge" mode */ - kn->kn_qos_override = qos; + kqlock(kq); + kq_state = kq->kq_state; + if (knlc) { + knote_unlock_cancel(kq, kn, knlc); + } else { + kqunlock(kq); + } + if (is_fd) { + proc_fdunlock(p); + } else { + knhash_unlock(fdp); } - /* only adjust in-use qos index when not suppressed */ - if ((kn->kn_status & KN_SUPPRESSED) == 0) { - kn->kn_qos_index = qos; - } else if (kq->kq_state & KQ_WORKQ) { - kqworkq_update_override((struct kqworkq *)kq, kn, qos); - } else if (kq->kq_state & KQ_WORKLOOP) { - kqworkloop_update_override((struct kqworkloop *)kq, qos); + if (kq_state & KQ_DYNAMIC) { + kqworkloop_release((struct kqworkloop *)kq); } } -static void -knote_set_qos_overcommit(struct knote *kn) +/* + * kq_find_knote_and_kq_lock - lookup a knote in the fd table for process + * and, if the knote is found, acquires the kqlock while holding the fd table lock/spinlock. + * + * fd_knhashlock or fdlock unheld on entry (and exit) + */ + +static struct knote * +kq_find_knote_and_kq_lock(struct kqueue *kq, struct kevent_qos_s *kev, + bool is_fd, struct proc *p) { - struct kqueue *kq = knote_get_kq(kn); + struct filedesc *fdp = p->p_fd; + struct knote *kn; - /* turn overcommit on for the appropriate thread request? */ - if ((kn->kn_qos & _PTHREAD_PRIORITY_OVERCOMMIT_FLAG) && - (kq->kq_state & KQ_WORKLOOP)) { - struct kqworkloop *kqwl = (struct kqworkloop *)kq; - struct kqrequest *kqr = &kqwl->kqwl_request; + if (is_fd) { + proc_fdlock(p); + } else { + knhash_lock(fdp); + } - /* - * This test is racy, but since we never remove this bit, - * it allows us to avoid taking a lock. - */ - if (kqr->kqr_state & KQR_THOVERCOMMIT) { - return; - } + /* + * Temporary horrible hack: + * this cast is gross and will go away in a future change. + * It is OK to do because we don't look at xflags/s_fflags, + * and that when we cast down the kev this way, + * the truncated filter field works. + */ + kn = knote_fdfind(kq, (struct kevent_internal_s *)kev, is_fd, p); - kq_req_lock(kqwl); - kqr->kqr_state |= KQR_THOVERCOMMIT; - if (!kqr->kqr_thread && (kqr->kqr_state & KQR_THREQUESTED)) { - kqueue_threadreq_modify(kq, kqr, kqr->kqr_req.tr_qos); - } - kq_req_unlock(kqwl); + if (kn) { + kqlock(kq); + assert(knote_get_kq(kn) == kq); } -} -static kq_index_t -knote_get_qos_override_index(struct knote *kn) -{ - return kn->kn_qos_override; + if (is_fd) { + proc_fdunlock(p); + } else { + knhash_unlock(fdp); + } + + return kn; } +__attribute__((noinline)) static void -kqworkq_update_override(struct kqworkq *kqwq, struct knote *kn, - kq_index_t override_index) +kqfile_wakeup(struct kqfile *kqf, __unused kq_index_t qos) { - struct kqrequest *kqr; - kq_index_t old_override_index; - kq_index_t queue_index = kn->kn_qos_index; - - if (override_index <= queue_index) { - return; + /* flag wakeups during processing */ + if (kqf->kqf_state & KQ_PROCESSING) { + kqf->kqf_state |= KQ_WAKEUP; } - kqr = kqworkq_get_request(kqwq, queue_index); - - kq_req_lock(kqwq); - old_override_index = kqr->kqr_override_index; - if (override_index > MAX(kqr->kqr_qos_index, old_override_index)) { - kqr->kqr_override_index = override_index; - - /* apply the override to [incoming?] servicing thread */ - if (kqr->kqr_thread) { - if (old_override_index) { - thread_update_ipc_override(kqr->kqr_thread, override_index); - } else { - thread_add_ipc_override(kqr->kqr_thread, override_index); - } - } + /* wakeup a thread waiting on this queue */ + if (kqf->kqf_state & (KQ_SLEEP | KQ_SEL)) { + kqf->kqf_state &= ~(KQ_SLEEP | KQ_SEL); + waitq_wakeup64_all((struct waitq *)&kqf->kqf_wqs, KQ_EVENT, + THREAD_AWAKENED, WAITQ_ALL_PRIORITIES); } - kq_req_unlock(kqwq); -} -static void -kqworkloop_update_override(struct kqworkloop *kqwl, kq_index_t override_index) -{ - kq_req_lock(kqwl); - kqworkloop_update_threads_qos(kqwl, KQWL_UTQ_UPDATE_WAKEUP_OVERRIDE, - override_index); - kq_req_unlock(kqwl); + /* wakeup other kqueues/select sets we're inside */ + KNOTE(&kqf->kqf_sel.si_note, 0); } -static thread_qos_t -kqworkloop_unbind_locked(struct kqworkloop *kqwl, thread_t thread) +static struct kqtailq * +knote_get_tailq(kqueue_t kqu, struct knote *kn) { - struct uthread *ut = get_bsdthread_info(thread); - struct kqrequest *kqr = &kqwl->kqwl_request; - kq_index_t ipc_override = ut->uu_kqueue_override; - - KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQWL_UNBIND), kqwl->kqwl_dynamicid, - thread_tid(thread), 0, 0); - - kq_req_held(kqwl); - assert(ut->uu_kqr_bound == kqr); - ut->uu_kqr_bound = NULL; - ut->uu_kqueue_override = THREAD_QOS_UNSPECIFIED; + kq_index_t qos_index = kn->kn_qos_index; - if (kqwl->kqwl_owner == NULL && kqwl->kqwl_turnstile) { - turnstile_update_inheritor(kqwl->kqwl_turnstile, - TURNSTILE_INHERITOR_NULL, TURNSTILE_IMMEDIATE_UPDATE); - turnstile_update_inheritor_complete(kqwl->kqwl_turnstile, - TURNSTILE_INTERLOCK_HELD); + if (kqu.kq->kq_state & KQ_WORKLOOP) { + assert(qos_index < KQWL_NBUCKETS); + } else if (kqu.kq->kq_state & KQ_WORKQ) { + assert(qos_index < KQWQ_NBUCKETS); + } else { + assert(qos_index == QOS_INDEX_KQFILE); } - - kqr->kqr_thread = NULL; - kqr->kqr_state &= ~(KQR_THREQUESTED | KQR_R2K_NOTIF_ARMED); - return ipc_override; + static_assert(offsetof(struct kqueue, kq_queue) == sizeof(struct kqueue), + "struct kqueue::kq_queue must be exactly at the end"); + return &kqu.kq->kq_queue[qos_index]; } -/* - * kqworkloop_unbind - Unbind the servicer thread of a workloop kqueue - * - * It will acknowledge events, and possibly request a new thread if: - * - there were active events left - * - we pended waitq hook callouts during processing - * - we pended wakeups while processing (or unsuppressing) - * - * Called with kqueue lock held. - */ static void -kqworkloop_unbind(proc_t p, struct kqworkloop *kqwl) +knote_enqueue(kqueue_t kqu, struct knote *kn, kn_status_t wakeup_mask) { - struct kqueue *kq = &kqwl->kqwl_kqueue; - struct kqrequest *kqr = &kqwl->kqwl_request; - thread_t thread = kqr->kqr_thread; - int op = KQWL_UTQ_PARKING; - kq_index_t ipc_override, qos_override = THREAD_QOS_UNSPECIFIED; - - assert(thread == current_thread()); - - kqlock(kqwl); + kqlock_held(kqu); - /* - * Forcing the KQ_PROCESSING flag allows for QoS updates because of - * unsuppressing knotes not to be applied until the eventual call to - * kqworkloop_update_threads_qos() below. - */ - assert((kq->kq_state & KQ_PROCESSING) == 0); - if (!TAILQ_EMPTY(&kqr->kqr_suppressed)) { - kq->kq_state |= KQ_PROCESSING; - qos_override = kqworkloop_acknowledge_events(kqwl); - kq->kq_state &= ~KQ_PROCESSING; + if ((kn->kn_status & (KN_ACTIVE | KN_STAYACTIVE)) == 0) { + return; } - kq_req_lock(kqwl); - - ipc_override = kqworkloop_unbind_locked(kqwl, thread); - kqworkloop_update_threads_qos(kqwl, op, qos_override); - - kq_req_unlock(kqwl); + if (kn->kn_status & (KN_DISABLED | KN_SUPPRESSED | KN_DROPPING)) { + return; + } - kqunlock(kqwl); + if ((kn->kn_status & KN_QUEUED) == 0) { + struct kqtailq *queue = knote_get_tailq(kqu, kn); - /* - * Drop the override on the current thread last, after the call to - * kqworkloop_update_threads_qos above. - */ - if (ipc_override) { - thread_drop_ipc_override(thread); + TAILQ_INSERT_TAIL(queue, kn, kn_tqe); + kn->kn_status |= KN_QUEUED; + kqu.kq->kq_count++; + } else if ((kn->kn_status & KN_STAYACTIVE) == 0) { + return; } - /* If last reference, dealloc the workloop kq */ - kqueue_release_last(p, kqwl); + if (kn->kn_status & wakeup_mask) { + if (kqu.kq->kq_state & KQ_WORKLOOP) { + kqworkloop_wakeup(kqu.kqwl, kn->kn_qos_index); + } else if (kqu.kq->kq_state & KQ_WORKQ) { + kqworkq_wakeup(kqu.kqwq, kn->kn_qos_index); + } else { + kqfile_wakeup(kqu.kqf, kn->kn_qos_index); + } + } } -static thread_qos_t -kqworkq_unbind_locked(__assert_only struct kqworkq *kqwq, - struct kqrequest *kqr, thread_t thread) +__attribute__((always_inline)) +static inline void +knote_dequeue(kqueue_t kqu, struct knote *kn) { - struct uthread *ut = get_bsdthread_info(thread); - kq_index_t old_override = kqr->kqr_override_index; + if (kn->kn_status & KN_QUEUED) { + struct kqtailq *queue = knote_get_tailq(kqu, kn); - KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQWQ_UNBIND), -1, - thread_tid(kqr->kqr_thread), kqr->kqr_qos_index, 0); - - kq_req_held(kqwq); - assert(ut->uu_kqr_bound == kqr); - ut->uu_kqr_bound = NULL; - kqr->kqr_thread = NULL; - kqr->kqr_state &= ~(KQR_THREQUESTED | KQR_R2K_NOTIF_ARMED); - kqr->kqr_override_index = THREAD_QOS_UNSPECIFIED; + // attaching the knote calls knote_reset_priority() without + // the kqlock which is fine, so we can't call kqlock_held() + // if we're not queued. + kqlock_held(kqu); - return old_override; + TAILQ_REMOVE(queue, kn, kn_tqe); + kn->kn_status &= ~KN_QUEUED; + kqu.kq->kq_count--; + } } -/* - * kqworkq_unbind - unbind of a workq kqueue from a thread - * - * We may have to request new threads. - * This can happen there are no waiting processing threads and: - * - there were active events we never got to (count > 0) - * - we pended waitq hook callouts during processing - * - we pended wakeups while processing (or unsuppressing) - */ +/* called with kqueue lock held */ static void -kqworkq_unbind(proc_t p, struct kqrequest *kqr) -{ - struct kqworkq *kqwq = (struct kqworkq *)p->p_fd->fd_wqkqueue; - __assert_only int rc; - - kqlock(kqwq); - rc = kqworkq_acknowledge_events(kqwq, kqr, 0, KQWQAE_UNBIND); - assert(rc == -1); - kqunlock(kqwq); -} - -struct kqrequest * -kqworkq_get_request(struct kqworkq *kqwq, kq_index_t qos_index) +knote_suppress(kqueue_t kqu, struct knote *kn) { - assert(qos_index < KQWQ_NBUCKETS); - return &kqwq->kqwq_request[qos_index]; -} + struct kqtailq *suppressq; -static void -knote_apply_qos_override(struct knote *kn, kq_index_t qos_index) -{ - assert((kn->kn_status & KN_QUEUED) == 0); + kqlock_held(kqu); - kn->kn_qos_override = qos_index; + assert((kn->kn_status & KN_SUPPRESSED) == 0); + assert(kn->kn_status & KN_QUEUED); - if (kn->kn_status & KN_SUPPRESSED) { - struct kqueue *kq = knote_get_kq(kn); - /* - * For suppressed events, the kn_qos_index field cannot be touched as it - * allows us to know on which supress queue the knote is for a kqworkq. - * - * Also, there's no natural push applied on the kqueues when this field - * changes anyway. We hence need to apply manual overrides in this case, - * which will be cleared when the events are later acknowledged. - */ - if (kq->kq_state & KQ_WORKQ) { - kqworkq_update_override((struct kqworkq *)kq, kn, qos_index); - } else { - kqworkloop_update_override((struct kqworkloop *)kq, qos_index); - } - } else { - kn->kn_qos_index = qos_index; - } + knote_dequeue(kqu, kn); + /* deactivate - so new activations indicate a wakeup */ + kn->kn_status &= ~KN_ACTIVE; + kn->kn_status |= KN_SUPPRESSED; + suppressq = kqueue_get_suppressed_queue(kqu, kn); + TAILQ_INSERT_TAIL(suppressq, kn, kn_tqe); } -static bool -knote_should_apply_qos_override(struct kqueue *kq, struct knote *kn, int result, - thread_qos_t *qos_out) +__attribute__((always_inline)) +static inline void +knote_unsuppress_noqueue(kqueue_t kqu, struct knote *kn) { - thread_qos_t qos_index = (result >> FILTER_ADJUST_EVENT_QOS_SHIFT) & 7; + struct kqtailq *suppressq; - kqlock_held(kq); + kqlock_held(kqu); - assert(result & FILTER_ADJUST_EVENT_QOS_BIT); - assert(qos_index < THREAD_QOS_LAST); + assert(kn->kn_status & KN_SUPPRESSED); - /* - * Early exit for knotes that should not change QoS - * - * It is safe to test kn_req_index against MANAGER / STAYACTIVE because - * knotes with such kn_req_index values never change for their entire - * lifetime. - */ - if (__improbable(!knote_fops(kn)->f_adjusts_qos)) { - panic("filter %d cannot change QoS", kn->kn_filtid); - } else if (kq->kq_state & KQ_WORKLOOP) { - if (kn->kn_req_index == KQWL_BUCKET_STAYACTIVE) { - return false; - } - } else if (kq->kq_state & KQ_WORKQ) { - if (kn->kn_req_index == KQWQ_QOS_MANAGER) { - return false; - } - } else { - return false; - } + kn->kn_status &= ~KN_SUPPRESSED; + suppressq = kqueue_get_suppressed_queue(kqu, kn); + TAILQ_REMOVE(suppressq, kn, kn_tqe); /* - * knotes with the FALLBACK flag will only use their registration QoS if the - * incoming event has no QoS, else, the registration QoS acts as a floor. + * If the knote is no longer active, reset its push, + * and resynchronize kn_qos_index with kn_qos_override + * for knotes with a real qos. */ - if (kn->kn_qos & _PTHREAD_PRIORITY_FALLBACK_FLAG) { - if (qos_index == THREAD_QOS_UNSPECIFIED) { - qos_index = kn->kn_req_index; - } - } else { - if (qos_index < kn->kn_req_index) { - qos_index = kn->kn_req_index; - } - } - if ((kn->kn_status & KN_MERGE_QOS) && (qos_index < kn->kn_qos_override)) { - /* Never lower QoS when in "Merge" mode */ - return false; + if ((kn->kn_status & KN_ACTIVE) == 0 && knote_has_qos(kn)) { + kn->kn_qos_override = _pthread_priority_thread_qos_fast(kn->kn_qos); } + kn->kn_qos_index = kn->kn_qos_override; +} - if ((kn->kn_status & KN_LOCKED) && kn->kn_inuse) { - /* - * When we're trying to update the QoS override and that both an - * f_event() and other f_* calls are running concurrently, any of these - * in flight calls may want to perform overrides that aren't properly - * serialized with each other. - * - * The first update that observes this racy situation enters a "Merge" - * mode which causes subsequent override requests to saturate the - * override instead of replacing its value. - * - * This mode is left when knote_unlock() or knote_call_filter_event() - * observe that no other f_* routine is in flight. - */ - kn->kn_status |= KN_MERGE_QOS; +/* called with kqueue lock held */ +static void +knote_unsuppress(kqueue_t kqu, struct knote *kn) +{ + if (kn->kn_status & KN_SUPPRESSED) { + knote_unsuppress_noqueue(kqu, kn); + + /* don't wakeup if unsuppressing just a stay-active knote */ + knote_enqueue(kqu, kn, KN_ACTIVE); } +} - if (kn->kn_qos_override == qos_index) { - return false; +__attribute__((always_inline)) +static inline void +knote_mark_active(struct knote *kn) +{ + if ((kn->kn_status & KN_ACTIVE) == 0) { + KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KNOTE_ACTIVATE), + kn->kn_udata, kn->kn_status | (kn->kn_id << 32), + kn->kn_filtid); } - *qos_out = qos_index; - return true; + kn->kn_status |= KN_ACTIVE; } +/* called with kqueue lock held */ static void -knote_adjust_qos(struct kqueue *kq, struct knote *kn, int result) +knote_activate(kqueue_t kqu, struct knote *kn, int result) { - thread_qos_t qos; - if (knote_should_apply_qos_override(kq, kn, result, &qos)) { - knote_dequeue(kn); - knote_apply_qos_override(kn, qos); - if (knote_enqueue(kn) && (kn->kn_status & KN_ACTIVE)) { - knote_wakeup(kn); - } + assert(result & FILTER_ACTIVE); + if (result & FILTER_ADJUST_EVENT_QOS_BIT) { + // may dequeue the knote + knote_adjust_qos(kqu.kq, kn, result); } + knote_mark_active(kn); + knote_enqueue(kqu, kn, KN_ACTIVE | KN_STAYACTIVE); } +/* + * This function applies changes requested by f_attach or f_touch for + * a given filter. It proceeds in a carefully chosen order to help + * every single transition do the minimal amount of work possible. + */ static void -knote_wakeup(struct knote *kn) +knote_apply_touch(kqueue_t kqu, struct knote *kn, struct kevent_qos_s *kev, + int result) { - struct kqueue *kq = knote_get_kq(kn); - - kqlock_held(kq); + kn_status_t wakeup_mask = KN_ACTIVE; - if (kq->kq_state & KQ_WORKQ) { - struct kqworkq *kqwq = (struct kqworkq *)kq; - - kqworkq_request_help(kqwq, kn->kn_qos_index); - } else if (kq->kq_state & KQ_WORKLOOP) { - struct kqworkloop *kqwl = (struct kqworkloop *)kq; + if ((kev->flags & EV_ENABLE) && (kn->kn_status & KN_DISABLED)) { + /* + * When a stayactive knote is reenabled, we may have missed wakeups + * while it was disabled, so we need to poll it. To do so, ask + * knote_enqueue() below to reenqueue it. + */ + wakeup_mask |= KN_STAYACTIVE; + kn->kn_status &= ~KN_DISABLED; /* - * kqworkloop_end_processing() will perform the required QoS - * computations when it unsets the processing mode. + * it is possible for userland to have knotes registered for a given + * workloop `wl_orig` but really handled on another workloop `wl_new`. + * + * In that case, rearming will happen from the servicer thread of + * `wl_new` which if `wl_orig` is no longer being serviced, would cause + * this knote to stay suppressed forever if we only relied on + * kqworkloop_acknowledge_events to be called by `wl_orig`. + * + * However if we see the KQ_PROCESSING bit on `wl_orig` set, we can't + * unsuppress because that would mess with the processing phase of + * `wl_orig`, however it also means kqworkloop_acknowledge_events() + * will be called. */ - if (!kqworkloop_is_processing_on_current_thread(kqwl)) { - kqworkloop_request_help(kqwl, kn->kn_qos_index); + if (__improbable(kn->kn_status & KN_SUPPRESSED)) { + if ((kqu.kq->kq_state & KQ_PROCESSING) == 0) { + knote_unsuppress_noqueue(kqu, kn); + } } - } else { - struct kqfile *kqf = (struct kqfile *)kq; + } - /* flag wakeups during processing */ - if (kq->kq_state & KQ_PROCESSING) { - kq->kq_state |= KQ_WAKEUP; - } + if ((result & FILTER_UPDATE_REQ_QOS) && kev->qos && kev->qos != kn->kn_qos) { + // may dequeue the knote + knote_reset_priority(kqu, kn, kev->qos); + } - /* wakeup a thread waiting on this queue */ - if (kq->kq_state & (KQ_SLEEP | KQ_SEL)) { - kq->kq_state &= ~(KQ_SLEEP | KQ_SEL); - waitq_wakeup64_all((struct waitq *)&kq->kq_wqs, KQ_EVENT, - THREAD_AWAKENED, WAITQ_ALL_PRIORITIES); - } + /* + * When we unsuppress above, or because of knote_reset_priority(), + * the knote may have been dequeued, we need to restore the invariant + * that if the knote is active it needs to be queued now that + * we're done applying changes. + */ + if (result & FILTER_ACTIVE) { + knote_activate(kqu, kn, result); + } else { + knote_enqueue(kqu, kn, wakeup_mask); + } - /* wakeup other kqueues/select sets we're inside */ - KNOTE(&kqf->kqf_sel.si_note, 0); + if ((result & FILTER_THREADREQ_NODEFEER) && + act_clear_astkevent(current_thread(), AST_KEVENT_REDRIVE_THREADREQ)) { + workq_kern_threadreq_redrive(kqu.kq->kq_p, WORKQ_THREADREQ_NONE); } } /* - * Called with the kqueue locked + * knote_drop - disconnect and drop the knote + * + * Called with the kqueue locked, returns with the kqueue unlocked. + * + * If a knote locking context is passed, it is canceled. + * + * The knote may have already been detached from + * (or not yet attached to) its source object. */ static void -kqueue_interrupt(struct kqueue *kq) +knote_drop(struct kqueue *kq, struct knote *kn, struct knote_lock_ctx *knlc) { - assert((kq->kq_state & KQ_WORKQ) == 0); + struct proc *p = kq->kq_p; - /* wakeup sleeping threads */ - if ((kq->kq_state & (KQ_SLEEP | KQ_SEL)) != 0) { - kq->kq_state &= ~(KQ_SLEEP | KQ_SEL); - (void)waitq_wakeup64_all((struct waitq *)&kq->kq_wqs, - KQ_EVENT, - THREAD_RESTART, - WAITQ_ALL_PRIORITIES); + kqlock_held(kq); + + assert((kn->kn_status & KN_DROPPING) == 0); + if (knlc == NULL) { + assert((kn->kn_status & KN_LOCKED) == 0); } + kn->kn_status |= KN_DROPPING; - /* wakeup threads waiting their turn to process */ - if (kq->kq_state & KQ_PROCWAIT) { - struct kqtailq *suppressq; + if (kn->kn_status & KN_SUPPRESSED) { + knote_unsuppress_noqueue(kq, kn); + } else { + knote_dequeue(kq, kn); + } + knote_wait_for_post(kq, kn); - assert(kq->kq_state & KQ_PROCESSING); + knote_fops(kn)->f_detach(kn); - kq->kq_state &= ~KQ_PROCWAIT; - suppressq = kqueue_get_suppressed_queue(kq, NULL); - (void)waitq_wakeup64_all((struct waitq *)&kq->kq_wqs, - CAST_EVENT64_T(suppressq), - THREAD_RESTART, - WAITQ_ALL_PRIORITIES); + /* kq may be freed when kq_remove_knote() returns */ + kq_remove_knote(kq, kn, p, knlc); + if (kn->kn_is_fd && ((kn->kn_status & KN_VANISHED) == 0)) { + fp_drop(p, kn->kn_id, kn->kn_fp, 0); } + + knote_free(kn); } -/* - * Called back from waitq code when no threads waiting and the hook was set. - * - * Interrupts are likely disabled and spin locks are held - minimal work - * can be done in this context!!! - * - * JMM - in the future, this will try to determine which knotes match the - * wait queue wakeup and apply these wakeups against those knotes themselves. - * For now, all the events dispatched this way are dispatch-manager handled, - * so hard-code that for now. - */ void -waitq_set__CALLING_PREPOST_HOOK__(void *kq_hook, void *knote_hook, int qos) +knote_init(void) { -#pragma unused(knote_hook, qos) + knote_zone = zinit(sizeof(struct knote), 8192 * sizeof(struct knote), + 8192, "knote zone"); + zone_change(knote_zone, Z_CACHING_ENABLED, TRUE); - struct kqueue *kq = (struct kqueue *)kq_hook; + kqfile_zone = zinit(sizeof(struct kqfile), 8192 * sizeof(struct kqfile), + 8192, "kqueue file zone"); - if (kq->kq_state & KQ_WORKQ) { - struct kqworkq *kqwq = (struct kqworkq *)kq; + kqworkq_zone = zinit(sizeof(struct kqworkq), 8192 * sizeof(struct kqworkq), + 8192, "kqueue workq zone"); - kqworkq_request_help(kqwq, KQWQ_QOS_MANAGER); - } else if (kq->kq_state & KQ_WORKLOOP) { - struct kqworkloop *kqwl = (struct kqworkloop *)kq; + kqworkloop_zone = zinit(sizeof(struct kqworkloop), 8192 * sizeof(struct kqworkloop), + 8192, "kqueue workloop zone"); + zone_change(kqworkloop_zone, Z_CACHING_ENABLED, TRUE); - kqworkloop_request_help(kqwl, KQWL_BUCKET_STAYACTIVE); - } + /* allocate kq lock group attribute and group */ + kq_lck_grp_attr = lck_grp_attr_alloc_init(); + + kq_lck_grp = lck_grp_alloc_init("kqueue", kq_lck_grp_attr); + + /* Allocate kq lock attribute */ + kq_lck_attr = lck_attr_alloc_init(); + +#if CONFIG_MEMORYSTATUS + /* Initialize the memorystatus list lock */ + memorystatus_kevent_init(kq_lck_grp, kq_lck_attr); +#endif } +SYSINIT(knote, SI_SUB_PSEUDO, SI_ORDER_ANY, knote_init, NULL); -void -klist_init(struct klist *list) +const struct filterops * +knote_fops(struct knote *kn) { - SLIST_INIT(list); + return sysfilt_ops[kn->kn_filtid]; } +static struct knote * +knote_alloc(void) +{ + struct knote *kn = ((struct knote *)zalloc(knote_zone)); + bzero(kn, sizeof(struct knote)); + return kn; +} -/* - * Query/Post each knote in the object's list - * - * The object lock protects the list. It is assumed - * that the filter/event routine for the object can - * determine that the object is already locked (via - * the hint) and not deadlock itself. - * - * The object lock should also hold off pending - * detach/drop operations. - */ -void -knote(struct klist *list, long hint) +static void +knote_free(struct knote *kn) { - struct knote *kn; + assert((kn->kn_status & (KN_LOCKED | KN_POSTING)) == 0); + zfree(knote_zone, kn); +} - SLIST_FOREACH(kn, list, kn_selnext) { - struct kqueue *kq = knote_get_kq(kn); - kqlock(kq); - knote_call_filter_event(kq, kn, hint); - kqunlock(kq); - } +#pragma mark - syscalls: kevent, kevent64, kevent_qos, kevent_id + +kevent_ctx_t +kevent_get_context(thread_t thread) +{ + uthread_t ut = get_bsdthread_info(thread); + return &ut->uu_save.uus_kevent; } -/* - * attach a knote to the specified list. Return true if this is the first entry. - * The list is protected by whatever lock the object it is associated with uses. - */ -int -knote_attach(struct klist *list, struct knote *kn) +static inline bool +kevent_args_requesting_events(unsigned int flags, int nevents) { - int ret = SLIST_EMPTY(list); - SLIST_INSERT_HEAD(list, kn, kn_selnext); - return ret; + return !(flags & KEVENT_FLAG_ERROR_EVENTS) && nevents > 0; } -/* - * detach a knote from the specified list. Return true if that was the last entry. - * The list is protected by whatever lock the object it is associated with uses. - */ -int -knote_detach(struct klist *list, struct knote *kn) +static inline int +kevent_adjust_flags_for_proc(proc_t p, int flags) { - SLIST_REMOVE(list, kn, knote, kn_selnext); - return SLIST_EMPTY(list); + __builtin_assume(p); + return flags | (IS_64BIT_PROCESS(p) ? KEVENT_FLAG_PROC64 : 0); } -/* - * knote_vanish - Indicate that the source has vanished +/*! + * @function kevent_get_kqfile * - * If the knote has requested EV_VANISHED delivery, - * arrange for that. Otherwise, deliver a NOTE_REVOKE - * event for backward compatibility. + * @brief + * Lookup a kqfile by fd. * - * The knote is marked as having vanished, but is not - * actually detached from the source in this instance. - * The actual detach is deferred until the knote drop. + * @discussion + * Callers: kevent, kevent64, kevent_qos * - * Our caller already has the object lock held. Calling - * the detach routine would try to take that lock - * recursively - which likely is not supported. + * This is not assumed to be a fastpath (kqfile interfaces are legacy) */ -void -knote_vanish(struct klist *list, bool make_active) +OS_NOINLINE +static int +kevent_get_kqfile(struct proc *p, int fd, int flags, + struct fileproc **fp, struct kqueue **kqp) { - struct knote *kn; - struct knote *kn_next; + int error = 0; + struct kqueue *kq; - SLIST_FOREACH_SAFE(kn, list, kn_selnext, kn_next) { - struct kqueue *kq = knote_get_kq(kn); + error = fp_getfkq(p, fd, fp, &kq); + if (__improbable(error)) { + return error; + } + uint16_t kq_state = os_atomic_load(&kq->kq_state, relaxed); + if (__improbable((kq_state & (KQ_KEV32 | KQ_KEV64 | KQ_KEV_QOS)) == 0)) { kqlock(kq); - if (__probable(kn->kn_status & KN_REQVANISH)) { - /* - * If EV_VANISH supported - prepare to deliver one - */ - kn->kn_status |= KN_VANISHED; - } else { - /* - * Handle the legacy way to indicate that the port/portset was - * deallocated or left the current Mach portspace (modern technique - * is with an EV_VANISHED protocol). - * - * Deliver an EV_EOF event for these changes (hopefully it will get - * delivered before the port name recycles to the same generation - * count and someone tries to re-register a kevent for it or the - * events are udata-specific - avoiding a conflict). - */ - kn->kn_flags |= EV_EOF | EV_ONESHOT; - } - if (make_active) { - knote_activate(kn); + kq_state = kq->kq_state; + if (!(kq_state & (KQ_KEV32 | KQ_KEV64 | KQ_KEV_QOS))) { + if (flags & KEVENT_FLAG_LEGACY32) { + kq_state |= KQ_KEV32; + } else if (flags & KEVENT_FLAG_LEGACY64) { + kq_state |= KQ_KEV64; + } else { + kq_state |= KQ_KEV_QOS; + } + kq->kq_state = kq_state; } kqunlock(kq); } + + /* + * kqfiles can't be used through the legacy kevent() + * and other interfaces at the same time. + */ + if (__improbable((bool)(flags & KEVENT_FLAG_LEGACY32) != + (bool)(kq_state & KQ_KEV32))) { + fp_drop(p, fd, *fp, 0); + return EINVAL; + } + + *kqp = kq; + return 0; } -/* - * Force a lazy allocation of the waitqset link - * of the kq_wqs associated with the kn - * if it wasn't already allocated. +/*! + * @function kevent_get_kqwq * - * This allows knote_link_waitq to never block - * if reserved_link is not NULL. + * @brief + * Lookup or create the process kqwq (faspath). + * + * @discussion + * Callers: kevent64, kevent_qos */ -void -knote_link_waitqset_lazy_alloc(struct knote *kn) +OS_ALWAYS_INLINE +static int +kevent_get_kqwq(proc_t p, int flags, int nevents, struct kqueue **kqp) { - struct kqueue *kq = knote_get_kq(kn); - waitq_set_lazy_init_link(&kq->kq_wqs); -} + struct kqworkq *kqwq = p->p_fd->fd_wqkqueue; -/* - * Check if a lazy allocation for the waitqset link - * of the kq_wqs is needed. - */ -boolean_t -knote_link_waitqset_should_lazy_alloc(struct knote *kn) -{ - struct kqueue *kq = knote_get_kq(kn); - return waitq_set_should_lazy_init_link(&kq->kq_wqs); + if (__improbable(kevent_args_requesting_events(flags, nevents))) { + return EINVAL; + } + if (__improbable(kqwq == NULL)) { + kqwq = kqworkq_alloc(p, flags); + if (__improbable(kqwq == NULL)) { + return ENOMEM; + } + } + + *kqp = &kqwq->kqwq_kqueue; + return 0; } -/* - * For a given knote, link a provided wait queue directly with the kqueue. - * Wakeups will happen via recursive wait queue support. But nothing will move - * the knote to the active list at wakeup (nothing calls knote()). Instead, - * we permanently enqueue them here. - * - * kqueue and knote references are held by caller. - * waitq locked by caller. +#pragma mark kevent copyio + +/*! + * @function kevent_get_data_size * - * caller provides the wait queue link structure and insures that the kq->kq_wqs - * is linked by previously calling knote_link_waitqset_lazy_alloc. + * @brief + * Copies in the extra data size from user-space. */ -int -knote_link_waitq(struct knote *kn, struct waitq *wq, uint64_t *reserved_link) -{ - struct kqueue *kq = knote_get_kq(kn); - kern_return_t kr; - - kr = waitq_link(wq, &kq->kq_wqs, WAITQ_ALREADY_LOCKED, reserved_link); - if (kr == KERN_SUCCESS) { - knote_markstayactive(kn); - return 0; +static int +kevent_get_data_size(int flags, user_addr_t data_avail, user_addr_t data_out, + kevent_ctx_t kectx) +{ + if (!data_avail || !data_out) { + kectx->kec_data_size = 0; + kectx->kec_data_resid = 0; + } else if (flags & KEVENT_FLAG_PROC64) { + user64_size_t usize = 0; + int error = copyin((user_addr_t)data_avail, &usize, sizeof(usize)); + if (__improbable(error)) { + return error; + } + kectx->kec_data_resid = kectx->kec_data_size = (user_size_t)usize; } else { - return EINVAL; + user32_size_t usize = 0; + int error = copyin((user_addr_t)data_avail, &usize, sizeof(usize)); + if (__improbable(error)) { + return error; + } + kectx->kec_data_avail = data_avail; + kectx->kec_data_resid = kectx->kec_data_size = (user_size_t)usize; } + kectx->kec_data_out = data_out; + kectx->kec_data_avail = data_avail; + return 0; } -/* - * Unlink the provided wait queue from the kqueue associated with a knote. - * Also remove it from the magic list of directly attached knotes. - * - * Note that the unlink may have already happened from the other side, so - * ignore any failures to unlink and just remove it from the kqueue list. +/*! + * @function kevent_put_data_size * - * On success, caller is responsible for the link structure + * @brief + * Copies out the residual data size to user-space if any has been used. */ -int -knote_unlink_waitq(struct knote *kn, struct waitq *wq) +static int +kevent_put_data_size(unsigned int flags, kevent_ctx_t kectx) { - struct kqueue *kq = knote_get_kq(kn); - kern_return_t kr; - - kr = waitq_unlink(wq, &kq->kq_wqs); - knote_clearstayactive(kn); - return (kr != KERN_SUCCESS) ? EINVAL : 0; + if (kectx->kec_data_resid == kectx->kec_data_size) { + return 0; + } + if (flags & KEVENT_FLAG_KERNEL) { + *(user_size_t *)(uintptr_t)kectx->kec_data_avail = kectx->kec_data_resid; + return 0; + } + if (flags & KEVENT_FLAG_PROC64) { + user64_size_t usize = (user64_size_t)kectx->kec_data_resid; + return copyout(&usize, (user_addr_t)kectx->kec_data_avail, sizeof(usize)); + } else { + user32_size_t usize = (user32_size_t)kectx->kec_data_resid; + return copyout(&usize, (user_addr_t)kectx->kec_data_avail, sizeof(usize)); + } } -/* - * remove all knotes referencing a specified fd +/*! + * @function kevent_legacy_copyin * - * Entered with the proc_fd lock already held. - * It returns the same way, but may drop it temporarily. + * @brief + * Handles the copyin of a kevent/kevent64 event. */ -void -knote_fdclose(struct proc *p, int fd) +static int +kevent_legacy_copyin(user_addr_t *addrp, struct kevent_qos_s *kevp, unsigned int flags) { - struct klist *list; - struct knote *kn; - KNOTE_LOCK_CTX(knlc); + int error; -restart: - list = &p->p_fd->fd_knlist[fd]; - SLIST_FOREACH(kn, list, kn_link) { - struct kqueue *kq = knote_get_kq(kn); + assert((flags & (KEVENT_FLAG_LEGACY32 | KEVENT_FLAG_LEGACY64)) != 0); - kqlock(kq); + if (flags & KEVENT_FLAG_LEGACY64) { + struct kevent64_s kev64; - if (kq->kq_p != p) { - panic("%s: proc mismatch (kq->kq_p=%p != p=%p)", - __func__, kq->kq_p, p); + error = copyin(*addrp, (caddr_t)&kev64, sizeof(kev64)); + if (__improbable(error)) { + return error; + } + *addrp += sizeof(kev64); + *kevp = (struct kevent_qos_s){ + .ident = kev64.ident, + .filter = kev64.filter, + /* Make sure user doesn't pass in any system flags */ + .flags = kev64.flags & ~EV_SYSFLAGS, + .udata = kev64.udata, + .fflags = kev64.fflags, + .data = kev64.data, + .ext[0] = kev64.ext[0], + .ext[1] = kev64.ext[1], + }; + } else if (flags & KEVENT_FLAG_PROC64) { + struct user64_kevent kev64; + + error = copyin(*addrp, (caddr_t)&kev64, sizeof(kev64)); + if (__improbable(error)) { + return error; } + *addrp += sizeof(kev64); + *kevp = (struct kevent_qos_s){ + .ident = kev64.ident, + .filter = kev64.filter, + /* Make sure user doesn't pass in any system flags */ + .flags = kev64.flags & ~EV_SYSFLAGS, + .udata = kev64.udata, + .fflags = kev64.fflags, + .data = kev64.data, + }; + } else { + struct user32_kevent kev32; - /* - * If the knote supports EV_VANISHED delivery, - * transition it to vanished mode (or skip over - * it if already vanished). - */ - if (kn->kn_status & KN_VANISHED) { - kqunlock(kq); - continue; + error = copyin(*addrp, (caddr_t)&kev32, sizeof(kev32)); + if (__improbable(error)) { + return error; } + *addrp += sizeof(kev32); + *kevp = (struct kevent_qos_s){ + .ident = (uintptr_t)kev32.ident, + .filter = kev32.filter, + /* Make sure user doesn't pass in any system flags */ + .flags = kev32.flags & ~EV_SYSFLAGS, + .udata = CAST_USER_ADDR_T(kev32.udata), + .fflags = kev32.fflags, + .data = (intptr_t)kev32.data, + }; + } - proc_fdunlock(p); - if (!knote_lock(kq, kn, &knlc, KNOTE_KQ_LOCK_ON_SUCCESS)) { - /* the knote was dropped by someone, nothing to do */ - } else if (kn->kn_status & KN_REQVANISH) { - kn->kn_status |= KN_VANISHED; - kn->kn_status &= ~KN_ATTACHED; + return 0; +} - kqunlock(kq); - knote_fops(kn)->f_detach(kn); - if (knote_fops(kn)->f_isfd) { - fp_drop(p, kn->kn_id, kn->kn_fp, 0); - } - kqlock(kq); +/*! + * @function kevent_modern_copyin + * + * @brief + * Handles the copyin of a kevent_qos/kevent_id event. + */ +static int +kevent_modern_copyin(user_addr_t *addrp, struct kevent_qos_s *kevp) +{ + int error = copyin(*addrp, (caddr_t)kevp, sizeof(struct kevent_qos_s)); + if (__probable(!error)) { + /* Make sure user doesn't pass in any system flags */ + *addrp += sizeof(struct kevent_qos_s); + kevp->flags &= ~EV_SYSFLAGS; + } + return error; +} - knote_activate(kn); - knote_unlock(kq, kn, &knlc, KNOTE_KQ_UNLOCK); - } else { - knote_drop(kq, kn, &knlc); - } +/*! + * @function kevent_legacy_copyout + * + * @brief + * Handles the copyout of a kevent/kevent64 event. + */ +static int +kevent_legacy_copyout(struct kevent_qos_s *kevp, user_addr_t *addrp, unsigned int flags) +{ + int advance; + int error; - proc_fdlock(p); - goto restart; + assert((flags & (KEVENT_FLAG_LEGACY32 | KEVENT_FLAG_LEGACY64)) != 0); + + /* + * fully initialize the differnt output event structure + * types from the internal kevent (and some universal + * defaults for fields not represented in the internal + * form). + * + * Note: these structures have no padding hence the C99 + * initializers below do not leak kernel info. + */ + if (flags & KEVENT_FLAG_LEGACY64) { + struct kevent64_s kev64 = { + .ident = kevp->ident, + .filter = kevp->filter, + .flags = kevp->flags, + .fflags = kevp->fflags, + .data = (int64_t)kevp->data, + .udata = kevp->udata, + .ext[0] = kevp->ext[0], + .ext[1] = kevp->ext[1], + }; + advance = sizeof(struct kevent64_s); + error = copyout((caddr_t)&kev64, *addrp, advance); + } else if (flags & KEVENT_FLAG_PROC64) { + /* + * deal with the special case of a user-supplied + * value of (uintptr_t)-1. + */ + uint64_t ident = (kevp->ident == (uintptr_t)-1) ? + (uint64_t)-1LL : (uint64_t)kevp->ident; + struct user64_kevent kev64 = { + .ident = ident, + .filter = kevp->filter, + .flags = kevp->flags, + .fflags = kevp->fflags, + .data = (int64_t) kevp->data, + .udata = kevp->udata, + }; + advance = sizeof(kev64); + error = copyout((caddr_t)&kev64, *addrp, advance); + } else { + struct user32_kevent kev32 = { + .ident = (uint32_t)kevp->ident, + .filter = kevp->filter, + .flags = kevp->flags, + .fflags = kevp->fflags, + .data = (int32_t)kevp->data, + .udata = kevp->udata, + }; + advance = sizeof(kev32); + error = copyout((caddr_t)&kev32, *addrp, advance); + } + if (__probable(!error)) { + *addrp += advance; } + return error; } -/* - * knote_fdfind - lookup a knote in the fd table for process +/*! + * @function kevent_modern_copyout * - * If the filter is file-based, lookup based on fd index. - * Otherwise use a hash based on the ident. + * @brief + * Handles the copyout of a kevent_qos/kevent_id event. + */ +OS_ALWAYS_INLINE +static inline int +kevent_modern_copyout(struct kevent_qos_s *kevp, user_addr_t *addrp) +{ + int error = copyout((caddr_t)kevp, *addrp, sizeof(struct kevent_qos_s)); + if (__probable(!error)) { + *addrp += sizeof(struct kevent_qos_s); + } + return error; +} + +#pragma mark kevent core implementation + +/*! + * @function kevent_callback_inline * - * Matching is based on kq, filter, and ident. Optionally, - * it may also be based on the udata field in the kevent - - * allowing multiple event registration for the file object - * per kqueue. + * @brief + * Callback for each individual event * - * fd_knhashlock or fdlock held on entry (and exit) + * @discussion + * This is meant to be inlined in kevent_modern_callback and + * kevent_legacy_callback. */ -static struct knote * -knote_fdfind(struct kqueue *kq, - struct kevent_internal_s *kev, - bool is_fd, - struct proc *p) +OS_ALWAYS_INLINE +static inline int +kevent_callback_inline(struct kevent_qos_s *kevp, kevent_ctx_t kectx, bool legacy) { - struct filedesc *fdp = p->p_fd; - struct klist *list = NULL; - struct knote *kn = NULL; + int error; + + assert(kectx->kec_process_noutputs < kectx->kec_process_nevents); /* - * determine where to look for the knote + * Copy out the appropriate amount of event data for this user. */ - if (is_fd) { - /* fd-based knotes are linked off the fd table */ - if (kev->ident < (u_int)fdp->fd_knlistsize) { - list = &fdp->fd_knlist[kev->ident]; - } - } else if (fdp->fd_knhashmask != 0) { - /* hash non-fd knotes here too */ - list = &fdp->fd_knhash[KN_HASH((u_long)kev->ident, fdp->fd_knhashmask)]; + if (legacy) { + error = kevent_legacy_copyout(kevp, &kectx->kec_process_eventlist, + kectx->kec_process_flags); + } else { + error = kevent_modern_copyout(kevp, &kectx->kec_process_eventlist); } /* - * scan the selected list looking for a match + * If there isn't space for additional events, return + * a harmless error to stop the processing here */ - if (list != NULL) { - SLIST_FOREACH(kn, list, kn_link) { - if (kq == knote_get_kq(kn) && - kev->ident == kn->kn_id && - kev->filter == kn->kn_filter) { - if (kev->flags & EV_UDATA_SPECIFIC) { - if ((kn->kn_status & KN_UDATA_SPECIFIC) && - kev->udata == kn->kn_udata) { - break; /* matching udata-specific knote */ - } - } else if ((kn->kn_status & KN_UDATA_SPECIFIC) == 0) { - break; /* matching non-udata-specific knote */ - } - } - } + if (error == 0 && ++kectx->kec_process_noutputs == kectx->kec_process_nevents) { + error = EWOULDBLOCK; } - return kn; + return error; } -/* - * kq_add_knote- Add knote to the fd table for process - * while checking for duplicates. +/*! + * @function kevent_modern_callback * - * All file-based filters associate a list of knotes by file - * descriptor index. All other filters hash the knote by ident. + * @brief + * Callback for each individual modern event. * - * May have to grow the table of knote lists to cover the - * file descriptor index presented. + * @discussion + * This callback handles kevent_qos/kevent_id events. + */ +static int +kevent_modern_callback(struct kevent_qos_s *kevp, kevent_ctx_t kectx) +{ + return kevent_callback_inline(kevp, kectx, /*legacy*/ false); +} + +/*! + * @function kevent_legacy_callback * - * fd_knhashlock and fdlock unheld on entry (and exit). + * @brief + * Callback for each individual legacy event. * - * Takes a rwlock boost if inserting the knote is successful. + * @discussion + * This callback handles kevent/kevent64 events. */ static int -kq_add_knote(struct kqueue *kq, struct knote *kn, struct knote_lock_ctx *knlc, - struct proc *p) +kevent_legacy_callback(struct kevent_qos_s *kevp, kevent_ctx_t kectx) { - struct filedesc *fdp = p->p_fd; - struct klist *list = NULL; - int ret = 0; - bool is_fd = knote_fops(kn)->f_isfd; + return kevent_callback_inline(kevp, kectx, /*legacy*/ true); +} - if (is_fd) { - proc_fdlock(p); +/*! + * @function kevent_cleanup + * + * @brief + * Handles the cleanup returning from a kevent call. + * + * @discussion + * kevent entry points will take a reference on workloops, + * and a usecount on the fileglob of kqfiles. + * + * This function undoes this on the exit paths of kevents. + * + * @returns + * The error to return to userspace. + */ +static int +kevent_cleanup(kqueue_t kqu, int flags, int error, kevent_ctx_t kectx) +{ + // poll should not call any codepath leading to this + assert((flags & KEVENT_FLAG_POLL) == 0); + + if (flags & KEVENT_FLAG_WORKLOOP) { + kqworkloop_release(kqu.kqwl); + } else if (flags & KEVENT_FLAG_WORKQ) { + /* nothing held */ } else { - knhash_lock(p); + fp_drop(kqu.kqf->kqf_p, kectx->kec_fd, kectx->kec_fp, 0); } - if (knote_fdfind(kq, &kn->kn_kevent, is_fd, p) != NULL) { - /* found an existing knote: we can't add this one */ - ret = ERESTART; - goto out_locked; + /* don't restart after signals... */ + if (error == ERESTART) { + error = EINTR; + } else if (error == 0) { + /* don't abandon other output just because of residual copyout failures */ + (void)kevent_put_data_size(flags, kectx); } - /* knote was not found: add it now */ - if (!is_fd) { - if (fdp->fd_knhashmask == 0) { - u_long size = 0; - - list = hashinit(CONFIG_KN_HASHSIZE, M_KQUEUE, &size); - if (list == NULL) { - ret = ENOMEM; - goto out_locked; - } - - fdp->fd_knhash = list; - fdp->fd_knhashmask = size; + if (flags & KEVENT_FLAG_PARKING) { + thread_t th = current_thread(); + struct uthread *uth = get_bsdthread_info(th); + if (uth->uu_kqr_bound) { + thread_unfreeze_base_pri(th); } + } + return error; +} - list = &fdp->fd_knhash[KN_HASH(kn->kn_id, fdp->fd_knhashmask)]; - SLIST_INSERT_HEAD(list, kn, kn_link); - ret = 0; - goto out_locked; +/*! + * @function kqueue_process + * + * @brief + * Process the triggered events in a kqueue. + * + * @discussion + * Walk the queued knotes and validate that they are really still triggered + * events by calling the filter routines (if necessary). + * + * For each event that is still considered triggered, invoke the callback + * routine provided. + * + * caller holds a reference on the kqueue. + * kqueue locked on entry and exit - but may be dropped + * kqueue list locked (held for duration of call) + * + * This is only called by kqueue_scan() so that the compiler can inline it. + * + * @returns + * - 0: no event was returned, no other error occured + * - EBADF: the kqueue is being destroyed (KQ_DRAIN is set) + * - EWOULDBLOCK: (not an error) events have been found and we should return + * - EFAULT: copyout failed + * - filter specific errors + */ +static int +kqueue_process(kqueue_t kqu, int flags, kevent_ctx_t kectx, + kevent_callback_t callback) +{ + workq_threadreq_t kqr = current_uthread()->uu_kqr_bound; + struct knote *kn; + int error = 0, rc = 0; + struct kqtailq *base_queue, *queue; +#if DEBUG || DEVELOPMENT + int retries = 64; +#endif + uint16_t kq_type = (kqu.kq->kq_state & (KQ_WORKQ | KQ_WORKLOOP)); + + if (kq_type & KQ_WORKQ) { + rc = kqworkq_begin_processing(kqu.kqwq, kqr, flags); + } else if (kq_type & KQ_WORKLOOP) { + rc = kqworkloop_begin_processing(kqu.kqwl, flags); } else { - /* knote is fd based */ +kqfile_retry: + rc = kqfile_begin_processing(kqu.kqf); + if (rc == EBADF) { + return EBADF; + } + } - if ((u_int)fdp->fd_knlistsize <= kn->kn_id) { - u_int size = 0; + if (rc == -1) { + /* Nothing to process */ + return 0; + } - if (kn->kn_id >= (uint64_t)p->p_rlimit[RLIMIT_NOFILE].rlim_cur - || kn->kn_id >= (uint64_t)maxfiles) { - ret = EINVAL; - goto out_locked; - } - /* have to grow the fd_knlist */ - size = fdp->fd_knlistsize; - while (size <= kn->kn_id) { - size += KQEXTENT; - } + /* + * loop through the enqueued knotes associated with this request, + * processing each one. Each request may have several queues + * of knotes to process (depending on the type of kqueue) so we + * have to loop through all the queues as long as we have additional + * space. + */ - if (size >= (UINT_MAX / sizeof(struct klist *))) { - ret = EINVAL; - goto out_locked; - } +process_again: + if (kq_type & KQ_WORKQ) { + base_queue = queue = &kqu.kqwq->kqwq_queue[kqr->tr_kq_qos_index]; + } else if (kq_type & KQ_WORKLOOP) { + base_queue = &kqu.kqwl->kqwl_queue[0]; + queue = &kqu.kqwl->kqwl_queue[KQWL_NBUCKETS - 1]; + } else { + base_queue = queue = &kqu.kqf->kqf_queue; + } - MALLOC(list, struct klist *, - size * sizeof(struct klist *), M_KQUEUE, M_WAITOK); - if (list == NULL) { - ret = ENOMEM; - goto out_locked; + do { + while ((kn = TAILQ_FIRST(queue)) != NULL) { + error = knote_process(kn, kectx, callback); + if (error == EJUSTRETURN) { + error = 0; + } else if (__improbable(error)) { + /* error is EWOULDBLOCK when the out event array is full */ + goto stop_processing; } - - bcopy((caddr_t)fdp->fd_knlist, (caddr_t)list, - fdp->fd_knlistsize * sizeof(struct klist *)); - bzero((caddr_t)list + - fdp->fd_knlistsize * sizeof(struct klist *), - (size - fdp->fd_knlistsize) * sizeof(struct klist *)); - FREE(fdp->fd_knlist, M_KQUEUE); - fdp->fd_knlist = list; - fdp->fd_knlistsize = size; } + } while (queue-- > base_queue); - list = &fdp->fd_knlist[kn->kn_id]; - SLIST_INSERT_HEAD(list, kn, kn_link); - ret = 0; - goto out_locked; + if (kectx->kec_process_noutputs) { + /* callers will transform this into no error */ + error = EWOULDBLOCK; } -out_locked: - if (ret == 0) { - kqlock(kq); - assert((kn->kn_status & KN_LOCKED) == 0); - (void)knote_lock(kq, kn, knlc, KNOTE_KQ_UNLOCK); +stop_processing: + /* + * If KEVENT_FLAG_PARKING is set, and no kevents have been returned, + * we want to unbind the kqrequest from the thread. + * + * However, because the kq locks are dropped several times during process, + * new knotes may have fired again, in which case, we want to fail the end + * processing and process again, until it converges. + * + * If we have an error or returned events, end processing never fails. + */ + if (error) { + flags &= ~KEVENT_FLAG_PARKING; } - if (is_fd) { - proc_fdunlock(p); + if (kq_type & KQ_WORKQ) { + rc = kqworkq_end_processing(kqu.kqwq, kqr, flags); + } else if (kq_type & KQ_WORKLOOP) { + rc = kqworkloop_end_processing(kqu.kqwl, KQ_PROCESSING, flags); } else { - knhash_unlock(p); + rc = kqfile_end_processing(kqu.kqf); } - return ret; + if (__probable(error)) { + return error; + } + + if (__probable(rc >= 0)) { + assert(rc == 0 || rc == EBADF); + return rc; + } + +#if DEBUG || DEVELOPMENT + if (retries-- == 0) { + panic("kevent: way too many knote_process retries, kq: %p (0x%04x)", + kqu.kq, kqu.kq->kq_state); + } +#endif + if (kq_type & (KQ_WORKQ | KQ_WORKLOOP)) { + assert(flags & KEVENT_FLAG_PARKING); + goto process_again; + } else { + goto kqfile_retry; + } } -/* - * kq_remove_knote - remove a knote from the fd table for process +/*! + * @function kqueue_scan_continue * - * If the filter is file-based, remove based on fd index. - * Otherwise remove from the hash based on the ident. + * @brief + * The continuation used by kqueue_scan for kevent entry points. * - * fd_knhashlock and fdlock unheld on entry (and exit). + * @discussion + * Assumes we inherit a use/ref count on the kq or its fileglob. + * + * This is called by kqueue_scan if neither KEVENT_FLAG_POLL nor + * KEVENT_FLAG_KERNEL was set, and the caller had to wait. */ +OS_NORETURN OS_NOINLINE static void -kq_remove_knote(struct kqueue *kq, struct knote *kn, struct proc *p, - struct knote_lock_ctx *knlc) +kqueue_scan_continue(void *data, wait_result_t wait_result) { - struct filedesc *fdp = p->p_fd; - struct klist *list = NULL; - uint16_t kq_state; - bool is_fd; + uthread_t ut = current_uthread(); + kevent_ctx_t kectx = &ut->uu_save.uus_kevent; + int error = 0, flags = kectx->kec_process_flags; + struct kqueue *kq = data; - is_fd = knote_fops(kn)->f_isfd; - - if (is_fd) { - proc_fdlock(p); - } else { - knhash_lock(p); - } + /* + * only kevent variants call in here, so we know the callback is + * kevent_legacy_callback or kevent_modern_callback. + */ + assert((flags & (KEVENT_FLAG_POLL | KEVENT_FLAG_KERNEL)) == 0); - if (is_fd) { - assert((u_int)fdp->fd_knlistsize > kn->kn_id); - list = &fdp->fd_knlist[kn->kn_id]; - } else { - list = &fdp->fd_knhash[KN_HASH(kn->kn_id, fdp->fd_knhashmask)]; + switch (wait_result) { + case THREAD_AWAKENED: + if (__improbable(flags & (KEVENT_FLAG_LEGACY32 | KEVENT_FLAG_LEGACY64))) { + error = kqueue_scan(kq, flags, kectx, kevent_legacy_callback); + } else { + error = kqueue_scan(kq, flags, kectx, kevent_modern_callback); + } + break; + case THREAD_TIMED_OUT: + error = 0; + break; + case THREAD_INTERRUPTED: + error = EINTR; + break; + case THREAD_RESTART: + error = EBADF; + break; + default: + panic("%s: - invalid wait_result (%d)", __func__, wait_result); } - SLIST_REMOVE(list, kn, knote, kn_link); - kqlock(kq); - kq_state = kq->kq_state; - if (knlc) { - knote_unlock_cancel(kq, kn, knlc, KNOTE_KQ_UNLOCK); - } else { - kqunlock(kq); - } - if (is_fd) { - proc_fdunlock(p); - } else { - knhash_unlock(p); - } - if (kq_state & KQ_DYNAMIC) { - kqueue_release_last(p, kq); - } + error = kevent_cleanup(kq, flags, error, kectx); + *(int32_t *)&ut->uu_rval = kectx->kec_process_noutputs; + unix_syscall_return(error); } -/* - * kq_find_knote_and_kq_lock - lookup a knote in the fd table for process - * and, if the knote is found, acquires the kqlock while holding the fd table lock/spinlock. +/*! + * @function kqueue_scan * - * fd_knhashlock or fdlock unheld on entry (and exit) + * @brief + * Scan and wait for events in a kqueue (used by poll & kevent). + * + * @discussion + * Process the triggered events in a kqueue. + * + * If there are no events triggered arrange to wait for them: + * - unless KEVENT_FLAG_IMMEDIATE is set in kectx->kec_process_flags + * - possibly until kectx->kec_deadline expires + * + * When it waits, and that neither KEVENT_FLAG_POLL nor KEVENT_FLAG_KERNEL + * are set, then it will wait in the kqueue_scan_continue continuation. + * + * poll() will block in place, and KEVENT_FLAG_KERNEL calls + * all pass KEVENT_FLAG_IMMEDIATE and will not wait. + * + * @param kq + * The kqueue being scanned. + * + * @param flags + * The KEVENT_FLAG_* flags for this call. + * + * @param kectx + * The context used for this scan. + * The uthread_t::uu_save.uus_kevent storage is used for this purpose. + * + * @param callback + * The callback to be called on events sucessfully processed. + * (Either kevent_legacy_callback, kevent_modern_callback or poll_callback) */ - -static struct knote * -kq_find_knote_and_kq_lock(struct kqueue *kq, struct kevent_internal_s *kev, - bool is_fd, struct proc *p) +int +kqueue_scan(struct kqueue *kq, int flags, kevent_ctx_t kectx, + kevent_callback_t callback) { - struct knote * ret; + int error; - if (is_fd) { - proc_fdlock(p); - } else { - knhash_lock(p); - } + for (;;) { + kqlock(kq); + error = kqueue_process(kq, flags, kectx, callback); - ret = knote_fdfind(kq, kev, is_fd, p); + /* + * If we got an error, events returned (EWOULDBLOCK) + * or blocking was disallowed (KEVENT_FLAG_IMMEDIATE), + * just return. + */ + if (__probable(error || (flags & KEVENT_FLAG_IMMEDIATE))) { + kqunlock(kq); + return error == EWOULDBLOCK ? 0 : error; + } - if (ret) { - kqlock(kq); - } + waitq_assert_wait64_leeway((struct waitq *)&kq->kq_wqs, + KQ_EVENT, THREAD_ABORTSAFE, TIMEOUT_URGENCY_USER_NORMAL, + kectx->kec_deadline, TIMEOUT_NO_LEEWAY); + kq->kq_state |= KQ_SLEEP; - if (is_fd) { - proc_fdunlock(p); - } else { - knhash_unlock(p); - } + kqunlock(kq); - return ret; + if (__probable((flags & (KEVENT_FLAG_POLL | KEVENT_FLAG_KERNEL)) == 0)) { + thread_block_parameter(kqueue_scan_continue, kq); + __builtin_unreachable(); + } + + wait_result_t wr = thread_block(THREAD_CONTINUE_NULL); + switch (wr) { + case THREAD_AWAKENED: + break; + case THREAD_TIMED_OUT: + return 0; + case THREAD_INTERRUPTED: + return EINTR; + case THREAD_RESTART: + return EBADF; + default: + panic("%s: - bad wait_result (%d)", __func__, wr); + } + } } -/* - * knote_drop - disconnect and drop the knote + +/*! + * @function kevent_internal * - * Called with the kqueue locked, returns with the kqueue unlocked. + * @brief + * Common kevent code. * - * If a knote locking context is passed, it is canceled. + * @discussion + * Needs to be inlined to specialize for legacy or modern and + * eliminate dead code. * - * The knote may have already been detached from - * (or not yet attached to) its source object. + * This is the core logic of kevent entry points, that will: + * - register kevents + * - optionally scan the kqueue for events + * + * The caller is giving kevent_internal a reference on the kqueue + * or its fileproc that needs to be cleaned up by kevent_cleanup(). */ -static void -knote_drop(struct kqueue *kq, struct knote *kn, struct knote_lock_ctx *knlc) +OS_ALWAYS_INLINE +static inline int +kevent_internal(kqueue_t kqu, + user_addr_t changelist, int nchanges, + user_addr_t ueventlist, int nevents, + int flags, kevent_ctx_t kectx, int32_t *retval, + bool legacy) { - struct proc *p = kq->kq_p; + int error = 0, noutputs = 0, register_rc; - kqlock_held(kq); + /* only bound threads can receive events on workloops */ + if (!legacy && (flags & KEVENT_FLAG_WORKLOOP)) { +#if CONFIG_WORKLOOP_DEBUG + UU_KEVENT_HISTORY_WRITE_ENTRY(current_uthread(), { + .uu_kqid = kqu.kqwl->kqwl_dynamicid, + .uu_kq = error ? NULL : kqu.kq, + .uu_error = error, + .uu_nchanges = nchanges, + .uu_nevents = nevents, + .uu_flags = flags, + }); +#endif // CONFIG_WORKLOOP_DEBUG - assert((kn->kn_status & KN_DROPPING) == 0); - if (knlc == NULL) { - assert((kn->kn_status & KN_LOCKED) == 0); + if (flags & KEVENT_FLAG_KERNEL) { + /* see kevent_workq_internal */ + error = copyout(&kqu.kqwl->kqwl_dynamicid, + ueventlist - sizeof(kqueue_id_t), sizeof(kqueue_id_t)); + kectx->kec_data_resid -= sizeof(kqueue_id_t); + if (__improbable(error)) { + goto out; + } + } + + if (kevent_args_requesting_events(flags, nevents)) { + /* + * Disable the R2K notification while doing a register, if the + * caller wants events too, we don't want the AST to be set if we + * will process these events soon. + */ + kqlock(kqu); + kqu.kq->kq_state &= ~KQ_R2K_ARMED; + kqunlock(kqu); + flags |= KEVENT_FLAG_NEEDS_END_PROCESSING; + } } - kn->kn_status |= KN_DROPPING; - knote_unsuppress(kn); - knote_dequeue(kn); - knote_wait_for_filter_events(kq, kn); + /* register all the change requests the user provided... */ + while (nchanges > 0 && error == 0) { + struct kevent_qos_s kev; + struct knote *kn = NULL; + + if (legacy) { + error = kevent_legacy_copyin(&changelist, &kev, flags); + } else { + error = kevent_modern_copyin(&changelist, &kev); + } + if (error) { + break; + } + + register_rc = kevent_register(kqu.kq, &kev, &kn); + if (__improbable(!legacy && (register_rc & FILTER_REGISTER_WAIT))) { + thread_t thread = current_thread(); + + kqlock_held(kqu); + + if (act_clear_astkevent(thread, AST_KEVENT_REDRIVE_THREADREQ)) { + workq_kern_threadreq_redrive(kqu.kq->kq_p, WORKQ_THREADREQ_NONE); + } + + // f_post_register_wait is meant to call a continuation and not to + // return, which is why we don't support FILTER_REGISTER_WAIT if + // KEVENT_FLAG_ERROR_EVENTS is not passed, or if the event that + // waits isn't the last. + // + // It is implementable, but not used by any userspace code at the + // moment, so for now return ENOTSUP if someone tries to do it. + if (nchanges == 1 && noutputs < nevents && + (flags & KEVENT_FLAG_KERNEL) == 0 && + (flags & KEVENT_FLAG_PARKING) == 0 && + (flags & KEVENT_FLAG_ERROR_EVENTS) && + (flags & KEVENT_FLAG_WORKLOOP)) { + uthread_t ut = get_bsdthread_info(thread); + + /* + * store the continuation/completion data in the uthread + * + * Note: the kectx aliases with this, + * and is destroyed in the process. + */ + ut->uu_save.uus_kevent_register = (struct _kevent_register){ + .kev = kev, + .kqwl = kqu.kqwl, + .eventout = noutputs, + .ueventlist = ueventlist, + }; + knote_fops(kn)->f_post_register_wait(ut, kn, + &ut->uu_save.uus_kevent_register); + __builtin_unreachable(); + } + kqunlock(kqu); + + kev.flags |= EV_ERROR; + kev.data = ENOTSUP; + } else { + assert((register_rc & FILTER_REGISTER_WAIT) == 0); + } - /* If we are attached, disconnect from the source first */ - if (kn->kn_status & KN_ATTACHED) { - knote_fops(kn)->f_detach(kn); + // keep in sync with kevent_register_wait_return() + if (noutputs < nevents && (kev.flags & (EV_ERROR | EV_RECEIPT))) { + if ((kev.flags & EV_ERROR) == 0) { + kev.flags |= EV_ERROR; + kev.data = 0; + } + if (legacy) { + error = kevent_legacy_copyout(&kev, &ueventlist, flags); + } else { + error = kevent_modern_copyout(&kev, &ueventlist); + } + if (error == 0) { + noutputs++; + } + } else if (kev.flags & EV_ERROR) { + error = kev.data; + } + nchanges--; } - /* kq may be freed when kq_remove_knote() returns */ - kq_remove_knote(kq, kn, p, knlc); - if (knote_fops(kn)->f_isfd && ((kn->kn_status & KN_VANISHED) == 0)) { - fp_drop(p, kn->kn_id, kn->kn_fp, 0); + if ((flags & KEVENT_FLAG_ERROR_EVENTS) == 0 && + nevents > 0 && noutputs == 0 && error == 0) { + kectx->kec_process_flags = flags; + kectx->kec_process_nevents = nevents; + kectx->kec_process_noutputs = 0; + kectx->kec_process_eventlist = ueventlist; + + if (legacy) { + error = kqueue_scan(kqu.kq, flags, kectx, kevent_legacy_callback); + } else { + error = kqueue_scan(kqu.kq, flags, kectx, kevent_modern_callback); + } + + noutputs = kectx->kec_process_noutputs; + } else if (!legacy && (flags & KEVENT_FLAG_NEEDS_END_PROCESSING)) { + /* + * If we didn't through kqworkloop_end_processing(), + * we need to do it here. + * + * kqueue_scan will call kqworkloop_end_processing(), + * so we only need to do it if we didn't scan. + */ + kqlock(kqu); + kqworkloop_end_processing(kqu.kqwl, 0, 0); + kqunlock(kqu); } - knote_free(kn); + *retval = noutputs; +out: + return kevent_cleanup(kqu.kq, flags, error, kectx); } -/* called with kqueue lock held */ -static void -knote_activate(struct knote *kn) -{ - if (kn->kn_status & KN_ACTIVE) { - return; - } - - KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KNOTE_ACTIVATE), - kn->kn_udata, kn->kn_status | (kn->kn_id << 32), - kn->kn_filtid); +#pragma mark modern syscalls: kevent_qos, kevent_id, kevent_workq_internal - kn->kn_status |= KN_ACTIVE; - if (knote_enqueue(kn)) { - knote_wakeup(kn); - } +/*! + * @function kevent_modern_internal + * + * @brief + * The backend of the kevent_id and kevent_workq_internal entry points. + * + * @discussion + * Needs to be inline due to the number of arguments. + */ +OS_NOINLINE +static int +kevent_modern_internal(kqueue_t kqu, + user_addr_t changelist, int nchanges, + user_addr_t ueventlist, int nevents, + int flags, kevent_ctx_t kectx, int32_t *retval) +{ + return kevent_internal(kqu.kq, changelist, nchanges, + ueventlist, nevents, flags, kectx, retval, /*legacy*/ false); } -/* called with kqueue lock held */ -static void -knote_deactivate(struct knote *kn) +/*! + * @function kevent_id + * + * @brief + * The kevent_id() syscall. + */ +int +kevent_id(struct proc *p, struct kevent_id_args *uap, int32_t *retval) { - kn->kn_status &= ~KN_ACTIVE; - if ((kn->kn_status & KN_STAYACTIVE) == 0) { - knote_dequeue(kn); + int error, flags = uap->flags & KEVENT_FLAG_USER; + uthread_t uth = current_uthread(); + workq_threadreq_t kqr = uth->uu_kqr_bound; + kevent_ctx_t kectx = &uth->uu_save.uus_kevent; + kqueue_t kqu; + + flags = kevent_adjust_flags_for_proc(p, flags); + flags |= KEVENT_FLAG_DYNAMIC_KQUEUE; + + if (__improbable((flags & (KEVENT_FLAG_WORKQ | KEVENT_FLAG_WORKLOOP)) != + KEVENT_FLAG_WORKLOOP)) { + return EINVAL; } -} -/* called with kqueue lock held */ -static void -knote_enable(struct knote *kn) -{ - if ((kn->kn_status & KN_DISABLED) == 0) { - return; + error = kevent_get_data_size(flags, uap->data_available, uap->data_out, kectx); + if (__improbable(error)) { + return error; } - kn->kn_status &= ~KN_DISABLED; + kectx->kec_deadline = 0; + kectx->kec_fp = NULL; + kectx->kec_fd = -1; + /* the kec_process_* fields are filled if kqueue_scann is called only */ - if (kn->kn_status & KN_SUPPRESSED) { - /* - * it is possible for userland to have knotes registered for a given - * workloop `wl_orig` but really handled on another workloop `wl_new`. - * - * In that case, rearming will happen from the servicer thread of - * `wl_new` which if `wl_orig` is no longer being serviced, would cause - * this knote to stay suppressed forever if we only relied on - * kqworkloop_acknowledge_events to be called by `wl_orig`. - * - * However if we see the KQ_PROCESSING bit on `wl_orig` set, we can't - * unsuppress because that would mess with the processing phase of - * `wl_orig`, however it also means kqworkloop_acknowledge_events() - * will be called. - */ - struct kqueue *kq = knote_get_kq(kn); - if ((kq->kq_state & KQ_PROCESSING) == 0) { - knote_unsuppress(kn); + /* + * Get the kq we are going to be working on + * As a fastpath, look at the currently bound workloop. + */ + kqu.kqwl = kqr ? kqr_kqworkloop(kqr) : NULL; + if (kqu.kqwl && kqu.kqwl->kqwl_dynamicid == uap->id) { + if (__improbable(flags & KEVENT_FLAG_DYNAMIC_KQ_MUST_NOT_EXIST)) { + return EEXIST; + } + kqworkloop_retain(kqu.kqwl); + } else if (__improbable(kevent_args_requesting_events(flags, uap->nevents))) { + return EXDEV; + } else { + error = kqworkloop_get_or_create(p, uap->id, NULL, flags, &kqu.kqwl); + if (__improbable(error)) { + return error; } - } else if (knote_enqueue(kn)) { - knote_wakeup(kn); } + + return kevent_modern_internal(kqu, uap->changelist, uap->nchanges, + uap->eventlist, uap->nevents, flags, kectx, retval); } -/* called with kqueue lock held */ -static void -knote_disable(struct knote *kn) +/**! + * @function kevent_workq_internal + * + * @discussion + * This function is exported for the sake of the workqueue subsystem. + * + * It is called in two ways: + * - when a thread is about to go to userspace to ask for pending event + * - when a thread is returning from userspace with events back + * + * the workqueue subsystem will only use the following flags: + * - KEVENT_FLAG_STACK_DATA (always) + * - KEVENT_FLAG_IMMEDIATE (always) + * - KEVENT_FLAG_PARKING (depending on whether it is going to or returning from + * userspace). + * + * It implicitly acts on the bound kqueue, and for the case of workloops + * will copyout the kqueue ID before anything else. + * + * + * Pthread will have setup the various arguments to fit this stack layout: + * + * +-------....----+--------------+-----------+--------------------+ + * | user stack | data avail | nevents | pthread_self() | + * +-------....----+--------------+-----------+--------------------+ + * ^ ^ + * data_out eventlist + * + * When a workloop is used, the workloop ID is copied out right before + * the eventlist and is taken from the data buffer. + * + * @warning + * This function is carefuly tailored to not make any call except the final tail + * call into kevent_modern_internal. (LTO inlines current_uthread()). + * + * This function is performance sensitive due to the workq subsystem. + */ +int +kevent_workq_internal(struct proc *p, + user_addr_t changelist, int nchanges, + user_addr_t eventlist, int nevents, + user_addr_t data_out, user_size_t *data_available, + unsigned int flags, int32_t *retval) { - if (kn->kn_status & KN_DISABLED) { - return; - } + uthread_t uth = current_uthread(); + workq_threadreq_t kqr = uth->uu_kqr_bound; + kevent_ctx_t kectx = &uth->uu_save.uus_kevent; + kqueue_t kqu; - kn->kn_status |= KN_DISABLED; - knote_dequeue(kn); -} + assert(flags == (KEVENT_FLAG_STACK_DATA | KEVENT_FLAG_IMMEDIATE) || + flags == (KEVENT_FLAG_STACK_DATA | KEVENT_FLAG_IMMEDIATE | KEVENT_FLAG_PARKING)); -/* called with kqueue lock held */ -static void -knote_suppress(struct knote *kn) -{ - struct kqtailq *suppressq; - struct kqueue *kq = knote_get_kq(kn); + kectx->kec_data_out = data_out; + kectx->kec_data_avail = (uint64_t)data_available; + kectx->kec_data_size = *data_available; + kectx->kec_data_resid = *data_available; + kectx->kec_deadline = 0; + kectx->kec_fp = NULL; + kectx->kec_fd = -1; + /* the kec_process_* fields are filled if kqueue_scann is called only */ - kqlock_held(kq); + flags = kevent_adjust_flags_for_proc(p, flags); - if (kn->kn_status & KN_SUPPRESSED) { - return; + if (kqr->tr_flags & WORKQ_TR_FLAG_WORKLOOP) { + kqu.kqwl = __container_of(kqr, struct kqworkloop, kqwl_request); + kqworkloop_retain(kqu.kqwl); + + flags |= KEVENT_FLAG_WORKLOOP | KEVENT_FLAG_DYNAMIC_KQUEUE | + KEVENT_FLAG_KERNEL; + } else { + kqu.kqwq = p->p_fd->fd_wqkqueue; + + flags |= KEVENT_FLAG_WORKQ | KEVENT_FLAG_KERNEL; } - knote_dequeue(kn); - kn->kn_status |= KN_SUPPRESSED; - suppressq = kqueue_get_suppressed_queue(kq, kn); - TAILQ_INSERT_TAIL(suppressq, kn, kn_tqe); + return kevent_modern_internal(kqu, changelist, nchanges, + eventlist, nevents, flags, kectx, retval); } -/* called with kqueue lock held */ -static void -knote_unsuppress(struct knote *kn) +/*! + * @function kevent_qos + * + * @brief + * The kevent_qos() syscall. + */ +int +kevent_qos(struct proc *p, struct kevent_qos_args *uap, int32_t *retval) { - struct kqtailq *suppressq; - struct kqueue *kq = knote_get_kq(kn); - - kqlock_held(kq); + uthread_t uth = current_uthread(); + kevent_ctx_t kectx = &uth->uu_save.uus_kevent; + int error, flags = uap->flags & KEVENT_FLAG_USER; + struct kqueue *kq; - if ((kn->kn_status & KN_SUPPRESSED) == 0) { - return; + if (__improbable(flags & KEVENT_ID_FLAG_USER)) { + return EINVAL; } - kn->kn_status &= ~KN_SUPPRESSED; - suppressq = kqueue_get_suppressed_queue(kq, kn); - TAILQ_REMOVE(suppressq, kn, kn_tqe); - - /* - * If the knote is no longer active, reset its push, - * and resynchronize kn_qos_index with kn_qos_override - */ - if ((kn->kn_status & KN_ACTIVE) == 0) { - kn->kn_qos_override = kn->kn_req_index; - } - kn->kn_qos_index = kn->kn_qos_override; + flags = kevent_adjust_flags_for_proc(p, flags); - /* don't wakeup if unsuppressing just a stay-active knote */ - if (knote_enqueue(kn) && (kn->kn_status & KN_ACTIVE)) { - knote_wakeup(kn); + error = kevent_get_data_size(flags, uap->data_available, uap->data_out, kectx); + if (__improbable(error)) { + return error; } - if ((kq->kq_state & KQ_WORKLOOP) && TAILQ_EMPTY(suppressq)) { - struct kqworkloop *kqwl = (struct kqworkloop *)kq; + kectx->kec_deadline = 0; + kectx->kec_fp = NULL; + kectx->kec_fd = uap->fd; + /* the kec_process_* fields are filled if kqueue_scann is called only */ - if (kqworkloop_is_processing_on_current_thread(kqwl)) { - /* - * kqworkloop_end_processing() or kqworkloop_begin_processing() - * will perform the required QoS computations when it unsets the - * processing mode. - */ - } else { - kq_req_lock(kqwl); - kqworkloop_update_threads_qos(kqwl, KQWL_UTQ_RESET_WAKEUP_OVERRIDE, 0); - kq_req_unlock(kqwl); - } + /* get the kq we are going to be working on */ + if (__probable(flags & KEVENT_FLAG_WORKQ)) { + error = kevent_get_kqwq(p, flags, uap->nevents, &kq); + } else { + error = kevent_get_kqfile(p, uap->fd, flags, &kectx->kec_fp, &kq); } -} - -/* called with kqueue lock held */ -static int -knote_enqueue(struct knote *kn) -{ - if ((kn->kn_status & (KN_ACTIVE | KN_STAYACTIVE)) == 0 || - (kn->kn_status & (KN_DISABLED | KN_SUPPRESSED | KN_DROPPING))) { - return 0; + if (__improbable(error)) { + return error; } - if ((kn->kn_status & KN_QUEUED) == 0) { - struct kqtailq *queue = knote_get_queue(kn); - struct kqueue *kq = knote_get_kq(kn); - - kqlock_held(kq); - TAILQ_INSERT_TAIL(queue, kn, kn_tqe); - kn->kn_status |= KN_QUEUED; - kq->kq_count++; - return 1; - } - return (kn->kn_status & KN_STAYACTIVE) != 0; + return kevent_modern_internal(kq, uap->changelist, uap->nchanges, + uap->eventlist, uap->nevents, flags, kectx, retval); } +#pragma mark legacy syscalls: kevent, kevent64 -/* called with kqueue lock held */ -static void -knote_dequeue(struct knote *kn) +/*! + * @function kevent_legacy_get_deadline + * + * @brief + * Compute the deadline for the legacy kevent syscalls. + * + * @discussion + * This is not necessary if KEVENT_FLAG_IMMEDIATE is specified, + * as this takes precedence over the deadline. + * + * This function will fail if utimeout is USER_ADDR_NULL + * (the caller should check). + */ +static int +kevent_legacy_get_deadline(int flags, user_addr_t utimeout, uint64_t *deadline) { - struct kqueue *kq = knote_get_kq(kn); - struct kqtailq *queue; - - kqlock_held(kq); + struct timespec ts; - if ((kn->kn_status & KN_QUEUED) == 0) { - return; + if (flags & KEVENT_FLAG_PROC64) { + struct user64_timespec ts64; + int error = copyin(utimeout, &ts64, sizeof(ts64)); + if (__improbable(error)) { + return error; + } + ts.tv_sec = ts64.tv_sec; + ts.tv_nsec = ts64.tv_nsec; + } else { + struct user32_timespec ts32; + int error = copyin(utimeout, &ts32, sizeof(ts32)); + if (__improbable(error)) { + return error; + } + ts.tv_sec = ts32.tv_sec; + ts.tv_nsec = ts32.tv_nsec; + } + if (!timespec_is_valid(&ts)) { + return EINVAL; } - queue = knote_get_queue(kn); - TAILQ_REMOVE(queue, kn, kn_tqe); - kn->kn_status &= ~KN_QUEUED; - kq->kq_count--; + clock_absolutetime_interval_to_deadline(tstoabstime(&ts), deadline); + return 0; } -void -knote_init(void) +/*! + * @function kevent_legacy_internal + * + * @brief + * The core implementation for kevent and kevent64 + */ +OS_NOINLINE +static int +kevent_legacy_internal(struct proc *p, struct kevent64_args *uap, + int32_t *retval, int flags) { - knote_zone = zinit(sizeof(struct knote), 8192 * sizeof(struct knote), - 8192, "knote zone"); - - kqfile_zone = zinit(sizeof(struct kqfile), 8192 * sizeof(struct kqfile), - 8192, "kqueue file zone"); + uthread_t uth = current_uthread(); + kevent_ctx_t kectx = &uth->uu_save.uus_kevent; + struct kqueue *kq; + int error; - kqworkq_zone = zinit(sizeof(struct kqworkq), 8192 * sizeof(struct kqworkq), - 8192, "kqueue workq zone"); + if (__improbable(uap->flags & KEVENT_ID_FLAG_USER)) { + return EINVAL; + } - kqworkloop_zone = zinit(sizeof(struct kqworkloop), 8192 * sizeof(struct kqworkloop), - 8192, "kqueue workloop zone"); + flags = kevent_adjust_flags_for_proc(p, flags); - /* allocate kq lock group attribute and group */ - kq_lck_grp_attr = lck_grp_attr_alloc_init(); + kectx->kec_data_out = 0; + kectx->kec_data_avail = 0; + kectx->kec_data_size = 0; + kectx->kec_data_resid = 0; + kectx->kec_deadline = 0; + kectx->kec_fp = NULL; + kectx->kec_fd = uap->fd; + /* the kec_process_* fields are filled if kqueue_scann is called only */ - kq_lck_grp = lck_grp_alloc_init("kqueue", kq_lck_grp_attr); + /* convert timeout to absolute - if we have one (and not immediate) */ + if (__improbable(uap->timeout && !(flags & KEVENT_FLAG_IMMEDIATE))) { + error = kevent_legacy_get_deadline(flags, uap->timeout, + &kectx->kec_deadline); + if (__improbable(error)) { + return error; + } + } - /* Allocate kq lock attribute */ - kq_lck_attr = lck_attr_alloc_init(); + /* get the kq we are going to be working on */ + if (flags & KEVENT_FLAG_WORKQ) { + error = kevent_get_kqwq(p, flags, uap->nevents, &kq); + } else { + error = kevent_get_kqfile(p, uap->fd, flags, &kectx->kec_fp, &kq); + } + if (__improbable(error)) { + return error; + } -#if CONFIG_MEMORYSTATUS - /* Initialize the memorystatus list lock */ - memorystatus_kevent_init(kq_lck_grp, kq_lck_attr); -#endif + return kevent_internal(kq, uap->changelist, uap->nchanges, + uap->eventlist, uap->nevents, flags, kectx, retval, + /*legacy*/ true); } -SYSINIT(knote, SI_SUB_PSEUDO, SI_ORDER_ANY, knote_init, NULL) -const struct filterops * -knote_fops(struct knote *kn) +/*! + * @function kevent + * + * @brief + * The legacy kevent() syscall. + */ +int +kevent(struct proc *p, struct kevent_args *uap, int32_t *retval) { - return sysfilt_ops[kn->kn_filtid]; -} + struct kevent64_args args = { + .fd = uap->fd, + .changelist = uap->changelist, + .nchanges = uap->nchanges, + .eventlist = uap->eventlist, + .nevents = uap->nevents, + .timeout = uap->timeout, + }; -static struct knote * -knote_alloc(void) -{ - struct knote *kn = ((struct knote *)zalloc(knote_zone)); - bzero(kn, sizeof(struct knote)); - return kn; + return kevent_legacy_internal(p, &args, retval, KEVENT_FLAG_LEGACY32); } -static void -knote_free(struct knote *kn) +/*! + * @function kevent64 + * + * @brief + * The legacy kevent64() syscall. + */ +int +kevent64(struct proc *p, struct kevent64_args *uap, int32_t *retval) { - assert(kn->kn_inuse == 0); - assert((kn->kn_status & KN_LOCKED) == 0); - zfree(knote_zone, kn); + int flags = (uap->flags & KEVENT_FLAG_USER) | KEVENT_FLAG_LEGACY64; + return kevent_legacy_internal(p, uap, retval, flags); } +#pragma mark - socket interface + #if SOCKETS #include #include @@ -8266,7 +8394,7 @@ kev_msg_post(struct kev_msg *event_msg) */ if (event_msg->vendor_code < min_vendor || event_msg->vendor_code > max_vendor) { - OSIncrementAtomic64((SInt64 *)&kevtstat.kes_badvendor); + os_atomic_inc(&kevtstat.kes_badvendor, relaxed); return EINVAL; } return kev_post_msg(event_msg); @@ -8293,13 +8421,13 @@ kev_post_msg(struct kev_msg *event_msg) } if (total_size > MLEN) { - OSIncrementAtomic64((SInt64 *)&kevtstat.kes_toobig); + os_atomic_inc(&kevtstat.kes_toobig, relaxed); return EMSGSIZE; } m = m_get(M_WAIT, MT_DATA); if (m == 0) { - OSIncrementAtomic64((SInt64 *)&kevtstat.kes_nomem); + os_atomic_inc(&kevtstat.kes_nomem, relaxed); return ENOMEM; } ev = mtod(m, struct kern_event_msg *); @@ -8358,7 +8486,7 @@ kev_post_msg(struct kev_msg *event_msg) m2 = m_copym(m, 0, m->m_len, M_WAIT); if (m2 == 0) { - OSIncrementAtomic64((SInt64 *)&kevtstat.kes_nomem); + os_atomic_inc(&kevtstat.kes_nomem, relaxed); m_free(m); lck_mtx_unlock(&ev_pcb->evp_mtx); lck_rw_done(kev_rwlock); @@ -8373,9 +8501,9 @@ kev_post_msg(struct kev_msg *event_msg) 1, m->m_len, MBUF_TC_BE); sorwakeup(ev_pcb->evp_socket); - OSIncrementAtomic64((SInt64 *)&kevtstat.kes_posted); + os_atomic_inc(&kevtstat.kes_posted, relaxed); } else { - OSIncrementAtomic64((SInt64 *)&kevtstat.kes_fullsock); + os_atomic_inc(&kevtstat.kes_fullsock, relaxed); } lck_mtx_unlock(&ev_pcb->evp_mtx); } @@ -8590,29 +8718,28 @@ fill_kqueueinfo(struct kqueue *kq, struct kqueue_info * kinfo) } static int -fill_kqueue_dyninfo(struct kqueue *kq, struct kqueue_dyninfo *kqdi) +fill_kqueue_dyninfo(struct kqworkloop *kqwl, struct kqueue_dyninfo *kqdi) { - struct kqworkloop *kqwl = (struct kqworkloop *)kq; - struct kqrequest *kqr = &kqwl->kqwl_request; + workq_threadreq_t kqr = &kqwl->kqwl_request; workq_threadreq_param_t trp = {}; int err; - if ((kq->kq_state & KQ_WORKLOOP) == 0) { + if ((kqwl->kqwl_state & KQ_WORKLOOP) == 0) { return EINVAL; } - if ((err = fill_kqueueinfo(kq, &kqdi->kqdi_info))) { + if ((err = fill_kqueueinfo(&kqwl->kqwl_kqueue, &kqdi->kqdi_info))) { return err; } - kq_req_lock(kqwl); + kqlock(kqwl); - kqdi->kqdi_servicer = thread_tid(kqr->kqr_thread); + kqdi->kqdi_servicer = thread_tid(kqr_thread(kqr)); kqdi->kqdi_owner = thread_tid(kqwl->kqwl_owner); - kqdi->kqdi_request_state = kqr->kqr_state; - kqdi->kqdi_async_qos = kqr->kqr_qos_index; - kqdi->kqdi_events_qos = kqr->kqr_override_index; - kqdi->kqdi_sync_waiters = kqr->kqr_dsync_waiters; + kqdi->kqdi_request_state = kqr->tr_state; + kqdi->kqdi_async_qos = kqr->tr_kq_qos_index; + kqdi->kqdi_events_qos = kqr->tr_kq_override_index; + kqdi->kqdi_sync_waiters = 0; kqdi->kqdi_sync_waiter_qos = 0; trp.trp_value = kqwl->kqwl_params; @@ -8634,7 +8761,7 @@ fill_kqueue_dyninfo(struct kqueue *kq, struct kqueue_dyninfo *kqdi) kqdi->kqdi_cpupercent = 0; } - kq_req_unlock(kqwl); + kqunlock(kqwl); return 0; } @@ -8653,40 +8780,37 @@ knote_markstayactive(struct knote *kn) * Making a knote stay active is a property of the knote that must be * established before it is fully attached. */ - assert(kn->kn_status & KN_ATTACHING); assert((kn->kn_status & (KN_QUEUED | KN_SUPPRESSED)) == 0); /* handle all stayactive knotes on the (appropriate) manager */ - if (kq->kq_state & KQ_WORKQ) { - qos = KQWQ_QOS_MANAGER; - } else if (kq->kq_state & KQ_WORKLOOP) { + if (kq->kq_state & KQ_WORKLOOP) { struct kqworkloop *kqwl = (struct kqworkloop *)kq; qos = _pthread_priority_thread_qos(kn->kn_qos); assert(qos && qos < THREAD_QOS_LAST); - kq_req_lock(kq); kqworkloop_update_threads_qos(kqwl, KQWL_UTQ_UPDATE_STAYACTIVE_QOS, qos); - kq_req_unlock(kq); qos = KQWL_BUCKET_STAYACTIVE; + } else if (kq->kq_state & KQ_WORKQ) { + qos = KQWQ_QOS_MANAGER; } else { qos = THREAD_QOS_UNSPECIFIED; } - kn->kn_req_index = qos; kn->kn_qos_override = qos; kn->kn_qos_index = qos; - knote_activate(kn); + knote_activate(kq, kn, FILTER_ACTIVE); kqunlock(kq); } void knote_clearstayactive(struct knote *kn) { - kqlock(knote_get_kq(kn)); - kn->kn_status &= ~KN_STAYACTIVE; - knote_deactivate(kn); - kqunlock(knote_get_kq(kn)); + struct kqueue *kq = knote_get_kq(kn); + kqlock(kq); + kn->kn_status &= ~(KN_STAYACTIVE | KN_ACTIVE); + knote_dequeue(kq, kn); + kqunlock(kq); } static unsigned long @@ -8697,26 +8821,22 @@ kevent_extinfo_emit(struct kqueue *kq, struct knote *kn, struct kevent_extinfo * if (kq == knote_get_kq(kn)) { if (nknotes < buflen) { struct kevent_extinfo *info = &buf[nknotes]; - struct kevent_internal_s *kevp = &kn->kn_kevent; kqlock(kq); - info->kqext_kev = (struct kevent_qos_s){ - .ident = kevp->ident, - .filter = kevp->filter, - .flags = kevp->flags, - .fflags = kevp->fflags, - .data = (int64_t)kevp->data, - .udata = kevp->udata, - .ext[0] = kevp->ext[0], - .ext[1] = kevp->ext[1], - .ext[2] = kevp->ext[2], - .ext[3] = kevp->ext[3], - .qos = kn->kn_req_index, - }; - info->kqext_sdata = kn->kn_sdata; - info->kqext_status = kn->kn_status; - info->kqext_sfflags = kn->kn_sfflags; + info->kqext_kev = *(struct kevent_qos_s *)&kn->kn_kevent; + if (knote_has_qos(kn)) { + info->kqext_kev.qos = + _pthread_priority_thread_qos_fast(kn->kn_qos); + } else { + info->kqext_kev.qos = kn->kn_qos_override; + } + info->kqext_kev.filter |= 0xff00; /* sign extend filter */ + info->kqext_kev.xflags = 0; /* this is where sfflags lives */ + info->kqext_kev.data = 0; /* this is where sdata lives */ + info->kqext_sdata = kn->kn_sdata; + info->kqext_status = kn->kn_status; + info->kqext_sfflags = kn->kn_sfflags; kqunlock(kq); } @@ -8763,13 +8883,13 @@ kevent_copyout_proc_dynkqids(void *proc, user_addr_t ubuf, uint32_t ubufsize, bzero(kq_ids, bufsize); } - kqhash_lock(p); + kqhash_lock(fdp); if (fdp->fd_kqhashmask > 0) { for (uint32_t i = 0; i < fdp->fd_kqhashmask + 1; i++) { struct kqworkloop *kqwl; - SLIST_FOREACH(kqwl, &fdp->fd_kqhash[i], kqwl_hashlink) { + LIST_FOREACH(kqwl, &fdp->fd_kqhash[i], kqwl_hashlink) { /* report the number of kqueues, even if they don't all fit */ if (nkqueues < buflen) { kq_ids[nkqueues] = kqwl->kqwl_dynamicid; @@ -8779,7 +8899,7 @@ kevent_copyout_proc_dynkqids(void *proc, user_addr_t ubuf, uint32_t ubufsize, } } - kqhash_unlock(p); + kqhash_unlock(fdp); if (kq_ids) { size_t copysize; @@ -8808,7 +8928,7 @@ kevent_copyout_dynkqinfo(void *proc, kqueue_id_t kq_id, user_addr_t ubuf, uint32_t ubufsize, int32_t *size_out) { proc_t p = (proc_t)proc; - struct kqueue *kq; + struct kqworkloop *kqwl; int err = 0; struct kqueue_dyninfo kqdi = { }; @@ -8818,14 +8938,10 @@ kevent_copyout_dynkqinfo(void *proc, kqueue_id_t kq_id, user_addr_t ubuf, return ENOBUFS; } - kqhash_lock(p); - kq = kqueue_hash_lookup(p, kq_id); - if (!kq) { - kqhash_unlock(p); + kqwl = kqworkloop_hash_lookup_and_retain(p->p_fd, kq_id); + if (!kqwl) { return ESRCH; } - kqueue_retain(kq); - kqhash_unlock(p); /* * backward compatibility: allow the argument to this call to only be @@ -8833,15 +8949,15 @@ kevent_copyout_dynkqinfo(void *proc, kqueue_id_t kq_id, user_addr_t ubuf, */ if (ubufsize >= sizeof(struct kqueue_dyninfo)) { ubufsize = sizeof(struct kqueue_dyninfo); - err = fill_kqueue_dyninfo(kq, &kqdi); + err = fill_kqueue_dyninfo(kqwl, &kqdi); } else { ubufsize = sizeof(struct kqueue_info); - err = fill_kqueueinfo(kq, &kqdi.kqdi_info); + err = fill_kqueueinfo(&kqwl->kqwl_kqueue, &kqdi.kqdi_info); } if (err == 0 && (err = copyout(&kqdi, ubuf, ubufsize)) == 0) { *size_out = ubufsize; } - kqueue_release_last(p, kq); + kqworkloop_release(kqwl); return err; } @@ -8850,22 +8966,16 @@ kevent_copyout_dynkqextinfo(void *proc, kqueue_id_t kq_id, user_addr_t ubuf, uint32_t ubufsize, int32_t *nknotes_out) { proc_t p = (proc_t)proc; - struct kqueue *kq; + struct kqworkloop *kqwl; int err; - assert(p != NULL); - - kqhash_lock(p); - kq = kqueue_hash_lookup(p, kq_id); - if (!kq) { - kqhash_unlock(p); + kqwl = kqworkloop_hash_lookup_and_retain(p->p_fd, kq_id); + if (!kqwl) { return ESRCH; } - kqueue_retain(kq); - kqhash_unlock(p); - err = pid_kqueue_extinfo(p, kq, ubuf, ubufsize, nknotes_out); - kqueue_release_last(p, kq); + err = pid_kqueue_extinfo(p, &kqwl->kqwl_kqueue, ubuf, ubufsize, nknotes_out); + kqworkloop_release(kqwl); return err; } @@ -8900,10 +9010,10 @@ pid_kqueue_extinfo(proc_t p, struct kqueue *kq, user_addr_t ubuf, if (fdp->fd_knhashmask != 0) { for (i = 0; i < (int)fdp->fd_knhashmask + 1; i++) { - kqhash_lock(p); + knhash_lock(fdp); kn = SLIST_FIRST(&fdp->fd_knhash[i]); nknotes = kevent_extinfo_emit(kq, kn, kqext, buflen, nknotes); - kqhash_unlock(p); + knhash_unlock(fdp); } } @@ -8926,15 +9036,14 @@ static unsigned int klist_copy_udata(struct klist *list, uint64_t *buf, unsigned int buflen, unsigned int nknotes) { - struct kevent_internal_s *kev; struct knote *kn; SLIST_FOREACH(kn, list, kn_link) { if (nknotes < buflen) { - struct kqueue *kq = knote_get_kq(kn); - kqlock(kq); - kev = &(kn->kn_kevent); - buf[nknotes] = kev->udata; - kqunlock(kq); + /* + * kevent_register will always set kn_udata atomically + * so that we don't have to take any kqlock here. + */ + buf[nknotes] = os_atomic_load_wide(&kn->kn_udata, relaxed); } /* we return total number of knotes, which may be more than requested */ nknotes++; @@ -8943,21 +9052,6 @@ klist_copy_udata(struct klist *list, uint64_t *buf, return nknotes; } -static unsigned int -kqlist_copy_dynamicids(__assert_only proc_t p, struct kqlist *list, - uint64_t *buf, unsigned int buflen, unsigned int nids) -{ - kqhash_lock_held(p); - struct kqworkloop *kqwl; - SLIST_FOREACH(kqwl, list, kqwl_hashlink) { - if (nids < buflen) { - buf[nids] = kqwl->kqwl_dynamicid; - } - nids++; - } - return nids; -} - int kevent_proc_copy_uptrs(void *proc, uint64_t *buf, int bufsize) { @@ -8965,6 +9059,7 @@ kevent_proc_copy_uptrs(void *proc, uint64_t *buf, int bufsize) struct filedesc *fdp = p->p_fd; unsigned int nuptrs = 0; unsigned long buflen = bufsize / sizeof(uint64_t); + struct kqworkloop *kqwl; if (buflen > 0) { assert(buf != NULL); @@ -8974,23 +9069,28 @@ kevent_proc_copy_uptrs(void *proc, uint64_t *buf, int bufsize) for (int i = 0; i < fdp->fd_knlistsize; i++) { nuptrs = klist_copy_udata(&fdp->fd_knlist[i], buf, buflen, nuptrs); } - knhash_lock(p); proc_fdunlock(p); + + knhash_lock(fdp); if (fdp->fd_knhashmask != 0) { - for (int i = 0; i < (int)fdp->fd_knhashmask + 1; i++) { + for (size_t i = 0; i < fdp->fd_knhashmask + 1; i++) { nuptrs = klist_copy_udata(&fdp->fd_knhash[i], buf, buflen, nuptrs); } } - knhash_unlock(p); + knhash_unlock(fdp); - kqhash_lock(p); + kqhash_lock(fdp); if (fdp->fd_kqhashmask != 0) { - for (int i = 0; i < (int)fdp->fd_kqhashmask + 1; i++) { - nuptrs = kqlist_copy_dynamicids(p, &fdp->fd_kqhash[i], buf, buflen, - nuptrs); + for (size_t i = 0; i < fdp->fd_kqhashmask + 1; i++) { + LIST_FOREACH(kqwl, &fdp->fd_kqhash[i], kqwl_hashlink) { + if (nuptrs < buflen) { + buf[nuptrs] = kqwl->kqwl_dynamicid; + } + nuptrs++; + } } } - kqhash_unlock(p); + kqhash_unlock(fdp); return (int)nuptrs; } @@ -9068,9 +9168,9 @@ kevent_sysctl SYSCTL_HANDLER_ARGS return EFAULT; } - struct kqrequest *kqr = ut->uu_kqr_bound; + workq_threadreq_t kqr = ut->uu_kqr_bound; if (kqr) { - if (kqr->kqr_state & KQR_WORKLOOP) { + if (kqr->tr_flags & WORKQ_TR_FLAG_WORKLOOP) { bound_id = kqr_kqworkloop(kqr)->kqwl_dynamicid; } else { bound_id = -1; diff --git a/bsd/kern/kern_exec.c b/bsd/kern/kern_exec.c index fea51a172..03bcf7896 100644 --- a/bsd/kern/kern_exec.c +++ b/bsd/kern/kern_exec.c @@ -121,6 +121,7 @@ #include +#include #include #include #include @@ -145,6 +146,10 @@ #include #endif +#if CONFIG_ARCADE +#include +#endif + #include #include #include @@ -154,6 +159,7 @@ #include +#include #include #include @@ -166,6 +172,8 @@ extern boolean_t vm_darkwake_mode; +extern int bootarg_execfailurereports; /* bsd_init.c */ + #if CONFIG_DTRACE /* Do not include dtrace.h, it redefines kmem_[alloc/free] */ extern void dtrace_proc_exec(proc_t); @@ -198,10 +206,12 @@ boolean_t thread_is_active(thread_t thread); void thread_copy_resource_info(thread_t dst_thread, thread_t src_thread); void *ipc_importance_exec_switch_task(task_t old_task, task_t new_task); extern void ipc_importance_release(void *elem); +extern boolean_t task_has_watchports(task_t task); /* * Mach things for which prototypes are unavailable from Mach headers */ +#define IPC_KMSG_FLAGS_ALLOW_IMMOVABLE_SEND 0x1 void ipc_task_reset( task_t task); void ipc_thread_reset( @@ -210,7 +220,10 @@ kern_return_t ipc_object_copyin( ipc_space_t space, mach_port_name_t name, mach_msg_type_name_t msgt_name, - ipc_object_t *objectp); + ipc_object_t *objectp, + mach_port_context_t context, + mach_msg_guard_flags_t *guard_flags, + uint32_t kmsg_flags); void ipc_port_release_send(ipc_port_t); #if DEVELOPMENT || DEBUG @@ -265,6 +278,13 @@ SYSCTL_INT(_security_mac, OID_AUTO, platform_exec_logging, CTLFLAG_RW, &platform static os_log_t peLog = OS_LOG_DEFAULT; +struct exec_port_actions { + uint32_t portwatch_count; + uint32_t registered_count; + ipc_port_t *portwatch_array; + ipc_port_t *registered_array; +}; + struct image_params; /* Forward */ static int exec_activate_image(struct image_params *imgp); static int exec_copyout_strings(struct image_params *imgp, user_addr_t *stackp); @@ -282,9 +302,11 @@ static int copyoutptr(user_addr_t ua, user_addr_t ptr, int ptr_size); static void exec_resettextvp(proc_t, struct image_params *); static int check_for_signature(proc_t, struct image_params *); static void exec_prefault_data(proc_t, struct image_params *, load_result_t *); -static errno_t exec_handle_port_actions(struct image_params *imgp, boolean_t * portwatch_present, ipc_port_t * portwatch_ports); -static errno_t exec_handle_spawnattr_policy(proc_t p, int psa_apptype, uint64_t psa_qos_clamp, uint64_t psa_darwin_role, - ipc_port_t * portwatch_ports, int portwatch_count); +static errno_t exec_handle_port_actions(struct image_params *imgp, + struct exec_port_actions *port_actions); +static errno_t exec_handle_spawnattr_policy(proc_t p, thread_t thread, int psa_apptype, uint64_t psa_qos_clamp, + uint64_t psa_darwin_role, struct exec_port_actions *port_actions); +static void exec_port_actions_destroy(struct exec_port_actions *port_actions); /* * exec_add_user_string @@ -689,6 +711,7 @@ exec_fat_imgact(struct image_params *imgp) lret = fatfile_getbestarch_for_cputype(pref, (vm_offset_t)fat_header, PAGE_SIZE, + imgp, &fat_arch); if (lret == LOAD_SUCCESS) { goto use_arch; @@ -704,6 +727,7 @@ regular_grading: /* Look up our preferred architecture in the fat file. */ lret = fatfile_getbestarch((vm_offset_t)fat_header, PAGE_SIZE, + imgp, &fat_arch); if (lret != LOAD_SUCCESS) { error = load_return_to_errno(lret); @@ -748,6 +772,7 @@ activate_exec_state(task_t task, proc_t p, thread_t thread, load_result_t *resul } else { OSBitAndAtomic(~((uint32_t)P_LP64), &p->p_flag); } + task_set_mach_header_address(task, result->mach_header); ret = thread_state_initialize(thread); if (ret != KERN_SUCCESS) { @@ -914,11 +939,34 @@ exec_mach_imgact(struct image_params *imgp) goto bad; } grade: - if (!grade_binary(imgp->ip_origcputype, imgp->ip_origcpusubtype & ~CPU_SUBTYPE_MASK)) { + if (!grade_binary(imgp->ip_origcputype, imgp->ip_origcpusubtype & ~CPU_SUBTYPE_MASK, TRUE)) { + error = EBADARCH; + goto bad; + } + + if (validate_potential_simulator_binary(imgp->ip_origcputype, imgp, + imgp->ip_arch_offset, imgp->ip_arch_size) != LOAD_SUCCESS) { +#if __x86_64__ + const char *excpath; + error = exec_save_path(imgp, imgp->ip_user_fname, imgp->ip_seg, &excpath); + os_log_error(OS_LOG_DEFAULT, "Unsupported 32-bit executable: \"%s\"", (error) ? imgp->ip_vp->v_name : excpath); +#endif error = EBADARCH; goto bad; } +#if defined(HAS_APPLE_PAC) + assert(mach_header->cputype == CPU_TYPE_ARM64 + ); + + if (((mach_header->cputype == CPU_TYPE_ARM64 && + (mach_header->cpusubtype & ~CPU_SUBTYPE_MASK) == CPU_SUBTYPE_ARM64E) + ) && (CPU_SUBTYPE_ARM64_PTR_AUTH_VERSION(mach_header->cpusubtype) == 0)) { + imgp->ip_flags &= ~IMGPF_NOJOP; + } else { + imgp->ip_flags |= IMGPF_NOJOP; + } +#endif /* Copy in arguments/environment from the old process */ error = exec_extract_strings(imgp); @@ -981,29 +1029,28 @@ grade: KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_PROC, BSD_PROC_EXITREASON_CREATE) | DBG_FUNC_NONE, p->p_pid, OS_REASON_EXEC, EXEC_EXIT_REASON_BAD_MACHO, 0, 0); if (lret == LOAD_BADMACHO_UPX) { - /* set anything that might be useful in the crash report */ set_proc_name(imgp, p); - exec_failure_reason = os_reason_create(OS_REASON_EXEC, EXEC_EXIT_REASON_UPX); exec_failure_reason->osr_flags |= OS_REASON_FLAG_GENERATE_CRASH_REPORT; - exec_failure_reason->osr_flags |= OS_REASON_FLAG_CONSISTENT_FAILURE; - } else if (lret == LOAD_BADARCH_X86) { - /* set anything that might be useful in the crash report */ - set_proc_name(imgp, p); - - exec_failure_reason = os_reason_create(OS_REASON_EXEC, EXEC_EXIT_REASON_NO32EXEC); - exec_failure_reason->osr_flags |= OS_REASON_FLAG_GENERATE_CRASH_REPORT; - exec_failure_reason->osr_flags |= OS_REASON_FLAG_CONSISTENT_FAILURE; } else { exec_failure_reason = os_reason_create(OS_REASON_EXEC, EXEC_EXIT_REASON_BAD_MACHO); + + if (bootarg_execfailurereports) { + set_proc_name(imgp, p); + exec_failure_reason->osr_flags |= OS_REASON_FLAG_GENERATE_CRASH_REPORT; + } } + exec_failure_reason->osr_flags |= OS_REASON_FLAG_CONSISTENT_FAILURE; + goto badtoolate; } proc_lock(p); p->p_cputype = imgp->ip_origcputype; p->p_cpusubtype = imgp->ip_origcpusubtype; + p->p_platform = load_result.ip_platform; + p->p_sdk = load_result.lr_sdk; proc_unlock(p); vm_map_set_user_wire_limit(map, p->p_rlimit[RLIMIT_MEMLOCK].rlim_cur); @@ -1049,6 +1096,19 @@ grade: */ int cpu_subtype; cpu_subtype = 0; /* all cpu_subtypes use the same shared region */ +#if defined(HAS_APPLE_PAC) + if (cpu_type() == CPU_TYPE_ARM64 && + (p->p_cpusubtype & ~CPU_SUBTYPE_MASK) == CPU_SUBTYPE_ARM64E) { + assertf(p->p_cputype == CPU_TYPE_ARM64, + "p %p cpu_type() 0x%x p->p_cputype 0x%x p->p_cpusubtype 0x%x", + p, cpu_type(), p->p_cputype, p->p_cpusubtype); + /* + * arm64e uses pointer authentication, so request a separate + * shared region for this CPU subtype. + */ + cpu_subtype = p->p_cpusubtype & ~CPU_SUBTYPE_MASK; + } +#endif /* HAS_APPLE_PAC */ vm_map_exec(map, task, load_result.is_64bit_addr, (void *)p->p_fd->fd_rdir, cpu_type(), cpu_subtype); /* @@ -1065,7 +1125,13 @@ grade: KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_PROC, BSD_PROC_EXITREASON_CREATE) | DBG_FUNC_NONE, p->p_pid, OS_REASON_EXEC, EXEC_EXIT_REASON_SUGID_FAILURE, 0, 0); + exec_failure_reason = os_reason_create(OS_REASON_EXEC, EXEC_EXIT_REASON_SUGID_FAILURE); + if (bootarg_execfailurereports) { + set_proc_name(imgp, p); + exec_failure_reason->osr_flags |= OS_REASON_FLAG_GENERATE_CRASH_REPORT; + } + goto badtoolate; } @@ -1089,7 +1155,13 @@ grade: if (lret != KERN_SUCCESS) { KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_PROC, BSD_PROC_EXITREASON_CREATE) | DBG_FUNC_NONE, p->p_pid, OS_REASON_EXEC, EXEC_EXIT_REASON_ACTV_THREADSTATE, 0, 0); + exec_failure_reason = os_reason_create(OS_REASON_EXEC, EXEC_EXIT_REASON_ACTV_THREADSTATE); + if (bootarg_execfailurereports) { + set_proc_name(imgp, p); + exec_failure_reason->osr_flags |= OS_REASON_FLAG_GENERATE_CRASH_REPORT; + } + goto badtoolate; } @@ -1113,7 +1185,13 @@ grade: KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_PROC, BSD_PROC_EXITREASON_CREATE) | DBG_FUNC_NONE, p->p_pid, OS_REASON_EXEC, EXEC_EXIT_REASON_STACK_ALLOC, 0, 0); + exec_failure_reason = os_reason_create(OS_REASON_EXEC, EXEC_EXIT_REASON_STACK_ALLOC); + if (bootarg_execfailurereports) { + set_proc_name(imgp, p); + exec_failure_reason->osr_flags |= OS_REASON_FLAG_GENERATE_CRASH_REPORT; + } + goto badtoolate; } @@ -1121,7 +1199,12 @@ grade: if (error) { KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_PROC, BSD_PROC_EXITREASON_CREATE) | DBG_FUNC_NONE, p->p_pid, OS_REASON_EXEC, EXEC_EXIT_REASON_APPLE_STRING_INIT, 0, 0); + exec_failure_reason = os_reason_create(OS_REASON_EXEC, EXEC_EXIT_REASON_APPLE_STRING_INIT); + if (bootarg_execfailurereports) { + set_proc_name(imgp, p); + exec_failure_reason->osr_flags |= OS_REASON_FLAG_GENERATE_CRASH_REPORT; + } goto badtoolate; } @@ -1142,7 +1225,12 @@ grade: KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_PROC, BSD_PROC_EXITREASON_CREATE) | DBG_FUNC_NONE, p->p_pid, OS_REASON_EXEC, EXEC_EXIT_REASON_COPYOUT_STRINGS, 0, 0); + exec_failure_reason = os_reason_create(OS_REASON_EXEC, EXEC_EXIT_REASON_COPYOUT_STRINGS); + if (bootarg_execfailurereports) { + set_proc_name(imgp, p); + exec_failure_reason->osr_flags |= OS_REASON_FLAG_GENERATE_CRASH_REPORT; + } goto badtoolate; } /* Set the stack */ @@ -1162,7 +1250,12 @@ grade: KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_PROC, BSD_PROC_EXITREASON_CREATE) | DBG_FUNC_NONE, p->p_pid, OS_REASON_EXEC, EXEC_EXIT_REASON_COPYOUT_DYNLINKER, 0, 0); + exec_failure_reason = os_reason_create(OS_REASON_EXEC, EXEC_EXIT_REASON_COPYOUT_DYNLINKER); + if (bootarg_execfailurereports) { + set_proc_name(imgp, p); + exec_failure_reason->osr_flags |= OS_REASON_FLAG_GENERATE_CRASH_REPORT; + } goto badtoolate; } task_set_dyld_info(task, load_result.all_image_info_addr, @@ -1174,9 +1267,6 @@ grade: vm_map_switch(old_map); - /* Stop profiling */ - stopprofclock(p); - /* * Reset signal state. */ @@ -1226,11 +1316,7 @@ grade: #if __arm64__ if (load_result.legacy_footprint) { -#if DEVELOPMENT || DEBUG - printf("%s: %d[%s] legacy footprint (mach-o)\n", - __FUNCTION__, p->p_pid, p->p_name); -#endif /* DEVELOPMENT || DEBUG */ - task_set_legacy_footprint(task, TRUE); + task_set_legacy_footprint(task); } #endif /* __arm64__ */ @@ -1382,9 +1468,9 @@ bad: * XXX hardcoded, for now; should use linker sets */ struct execsw { - int (*ex_imgact)(struct image_params *); + int(*const ex_imgact)(struct image_params *); const char *ex_name; -} execsw[] = { +}const execsw[] = { { exec_mach_imgact, "Mach-o Binary" }, { exec_fat_imgact, "Fat Binary" }, { exec_shell_imgact, "Interpreter Script" }, @@ -1597,6 +1683,30 @@ bad_notrans: return error; } +/* + * exec_validate_spawnattr_policy + * + * Description: Validates the entitlements required to set the apptype. + * + * Parameters: int psa_apptype posix spawn attribute apptype + * + * Returns: 0 Success + * EPERM Failure + */ +static errno_t +exec_validate_spawnattr_policy(int psa_apptype) +{ + if ((psa_apptype & POSIX_SPAWN_PROC_TYPE_MASK) != 0) { + int proctype = psa_apptype & POSIX_SPAWN_PROC_TYPE_MASK; + if (proctype == POSIX_SPAWN_PROC_TYPE_DRIVER) { + if (!IOTaskHasEntitlement(current_task(), POSIX_SPAWN_ENTITLEMENT_DRIVER)) { + return EPERM; + } + } + } + + return 0; +} /* * exec_handle_spawnattr_policy @@ -1609,8 +1719,8 @@ bad_notrans: * Returns: 0 Success */ static errno_t -exec_handle_spawnattr_policy(proc_t p, int psa_apptype, uint64_t psa_qos_clamp, uint64_t psa_darwin_role, - ipc_port_t * portwatch_ports, int portwatch_count) +exec_handle_spawnattr_policy(proc_t p, thread_t thread, int psa_apptype, uint64_t psa_qos_clamp, + uint64_t psa_darwin_role, struct exec_port_actions *port_actions) { int apptype = TASK_APPTYPE_NONE; int qos_clamp = THREAD_QOS_UNSPECIFIED; @@ -1640,6 +1750,9 @@ exec_handle_spawnattr_policy(proc_t p, int psa_apptype, uint64_t psa_qos_clamp, apptype = TASK_APPTYPE_APP_TAL; break; #endif /* !CONFIG_EMBEDDED */ + case POSIX_SPAWN_PROC_TYPE_DRIVER: + apptype = TASK_APPTYPE_DRIVER; + break; default: apptype = TASK_APPTYPE_NONE; /* TODO: Should an invalid value here fail the spawn? */ @@ -1671,14 +1784,50 @@ exec_handle_spawnattr_policy(proc_t p, int psa_apptype, uint64_t psa_qos_clamp, if (apptype != TASK_APPTYPE_NONE || qos_clamp != THREAD_QOS_UNSPECIFIED || - role != TASK_UNSPECIFIED) { - proc_set_task_spawnpolicy(p->task, apptype, qos_clamp, role, - portwatch_ports, portwatch_count); + role != TASK_UNSPECIFIED || + port_actions->portwatch_count) { + proc_set_task_spawnpolicy(p->task, thread, apptype, qos_clamp, role, + port_actions->portwatch_array, port_actions->portwatch_count); + } + + if (port_actions->registered_count) { + if (mach_ports_register(p->task, port_actions->registered_array, + port_actions->registered_count)) { + return EINVAL; + } + /* mach_ports_register() consumed the array */ + port_actions->registered_array = NULL; + port_actions->registered_count = 0; } return 0; } +static void +exec_port_actions_destroy(struct exec_port_actions *port_actions) +{ + if (port_actions->portwatch_array) { + for (uint32_t i = 0; i < port_actions->portwatch_count; i++) { + ipc_port_t port = NULL; + if ((port = port_actions->portwatch_array[i]) != NULL) { + ipc_port_release_send(port); + } + } + kfree(port_actions->portwatch_array, + port_actions->portwatch_count * sizeof(ipc_port_t *)); + } + + if (port_actions->registered_array) { + for (uint32_t i = 0; i < port_actions->registered_count; i++) { + ipc_port_t port = NULL; + if ((port = port_actions->registered_array[i]) != NULL) { + ipc_port_release_send(port); + } + } + kfree(port_actions->registered_array, + port_actions->registered_count * sizeof(ipc_port_t *)); + } +} /* * exec_handle_port_actions @@ -1694,8 +1843,8 @@ exec_handle_spawnattr_policy(proc_t p, int psa_apptype, uint64_t psa_qos_clamp, * ENOTSUP Illegal posix_spawn attr flag was set */ static errno_t -exec_handle_port_actions(struct image_params *imgp, boolean_t * portwatch_present, - ipc_port_t * portwatch_ports) +exec_handle_port_actions(struct image_params *imgp, + struct exec_port_actions *actions) { _posix_spawn_port_actions_t pacts = imgp->ip_px_spa; #if CONFIG_AUDIT @@ -1705,10 +1854,64 @@ exec_handle_port_actions(struct image_params *imgp, boolean_t * portwatch_presen task_t task = get_threadtask(imgp->ip_new_thread); ipc_port_t port = NULL; errno_t ret = 0; - int i; + int i, portwatch_i = 0, registered_i = 0; kern_return_t kr; + boolean_t task_has_watchport_boost = task_has_watchports(current_task()); + boolean_t in_exec = (imgp->ip_flags & IMGPF_EXEC); + + for (i = 0; i < pacts->pspa_count; i++) { + act = &pacts->pspa_actions[i]; + + switch (act->port_type) { + case PSPA_SPECIAL: + case PSPA_EXCEPTION: +#if CONFIG_AUDIT + case PSPA_AU_SESSION: +#endif + break; + case PSPA_IMP_WATCHPORTS: + if (++actions->portwatch_count > TASK_MAX_WATCHPORT_COUNT) { + ret = EINVAL; + goto done; + } + break; + case PSPA_REGISTERED_PORTS: + if (++actions->registered_count > TASK_PORT_REGISTER_MAX) { + ret = EINVAL; + goto done; + } + break; + default: + ret = EINVAL; + goto done; + } + } + + if (actions->portwatch_count) { + if (in_exec && task_has_watchport_boost) { + ret = EINVAL; + goto done; + } + actions->portwatch_array = + kalloc(sizeof(ipc_port_t *) * actions->portwatch_count); + if (actions->portwatch_array == NULL) { + ret = ENOMEM; + goto done; + } + bzero(actions->portwatch_array, + sizeof(ipc_port_t *) * actions->portwatch_count); + } - *portwatch_present = FALSE; + if (actions->registered_count) { + actions->registered_array = + kalloc(sizeof(ipc_port_t *) * actions->registered_count); + if (actions->registered_array == NULL) { + ret = ENOMEM; + goto done; + } + bzero(actions->registered_array, + sizeof(ipc_port_t *) * actions->registered_count); + } for (i = 0; i < pacts->pspa_count; i++) { act = &pacts->pspa_actions[i]; @@ -1716,7 +1919,7 @@ exec_handle_port_actions(struct image_params *imgp, boolean_t * portwatch_presen if (MACH_PORT_VALID(act->new_port)) { kr = ipc_object_copyin(get_task_ipcspace(current_task()), act->new_port, MACH_MSG_TYPE_COPY_SEND, - (ipc_object_t *) &port); + (ipc_object_t *) &port, 0, NULL, IPC_KMSG_FLAGS_ALLOW_IMMOVABLE_SEND); if (kr != KERN_SUCCESS) { ret = EINVAL; @@ -1754,14 +1957,16 @@ exec_handle_port_actions(struct image_params *imgp, boolean_t * portwatch_presen break; #endif case PSPA_IMP_WATCHPORTS: - if (portwatch_ports != NULL && IPC_PORT_VALID(port)) { - *portwatch_present = TRUE; + if (actions->portwatch_array) { /* hold on to this till end of spawn */ - portwatch_ports[i] = port; + actions->portwatch_array[portwatch_i++] = port; } else { ipc_port_release_send(port); } - + break; + case PSPA_REGISTERED_PORTS: + /* hold on to this till end of spawn */ + actions->registered_array[registered_i++] = port; break; default: ret = EINVAL; @@ -1900,7 +2105,7 @@ exec_handle_file_actions(struct image_params *imgp, short psa_flags) struct dup2_args dup2a; dup2a.from = psfa->psfaa_filedes; - dup2a.to = psfa->psfaa_openargs.psfao_oflag; + dup2a.to = psfa->psfaa_dup2args.psfad_newfiledes; /* * The dup2() system call implementation sets @@ -1912,6 +2117,47 @@ exec_handle_file_actions(struct image_params *imgp, short psa_flags) } break; + case PSFA_FILEPORT_DUP2: { + ipc_port_t port; + kern_return_t kr; + struct dup2_args dup2a; + struct close_nocancel_args ca; + + if (!MACH_PORT_VALID(psfa->psfaa_fileport)) { + error = EINVAL; + break; + } + + kr = ipc_object_copyin(get_task_ipcspace(current_task()), + psfa->psfaa_fileport, MACH_MSG_TYPE_COPY_SEND, + (ipc_object_t *) &port, 0, NULL, IPC_KMSG_FLAGS_ALLOW_IMMOVABLE_SEND); + + if (kr != KERN_SUCCESS) { + error = EINVAL; + break; + } + + error = fileport_makefd_internal(p, port, 0, ival); + + if (IPC_PORT_NULL != port) { + ipc_port_release_send(port); + } + + if (error || ival[0] == psfa->psfaa_dup2args.psfad_newfiledes) { + break; + } + + dup2a.from = ca.fd = ival[0]; + dup2a.to = psfa->psfaa_dup2args.psfad_newfiledes; + error = dup2(p, &dup2a, ival); + if (error) { + break; + } + + error = close_nocancel(p, &ca, ival); + } + break; + case PSFA_CLOSE: { struct close_nocancel_args ca; @@ -1946,6 +2192,34 @@ exec_handle_file_actions(struct image_params *imgp, short psa_flags) } break; + case PSFA_CHDIR: { + /* + * Chdir is different, in that it requires the use of + * a path argument, which is normally copied in from + * user space; because of this, we have to support a + * chdir from kernel space that passes an address space + * context of UIO_SYSSPACE, and casts the address + * argument to a user_addr_t. + */ + struct nameidata nd; + + NDINIT(&nd, LOOKUP, OP_CHDIR, FOLLOW | AUDITVNPATH1, UIO_SYSSPACE, + CAST_USER_ADDR_T(psfa->psfaa_chdirargs.psfac_path), + imgp->ip_vfs_context); + + error = chdir_internal(p, imgp->ip_vfs_context, &nd, 0); + } + break; + + case PSFA_FCHDIR: { + struct fchdir_args fchdira; + + fchdira.fd = psfa->psfaa_filedes; + + error = fchdir(p, &fchdira, ival); + } + break; + default: error = EINVAL; break; @@ -1984,7 +2258,8 @@ exec_handle_file_actions(struct image_params *imgp, short psa_flags) switch (psfa->psfaa_type) { case PSFA_DUP2: - fd = psfa->psfaa_openargs.psfao_oflag; + case PSFA_FILEPORT_DUP2: + fd = psfa->psfaa_dup2args.psfad_newfiledes; /*FALLTHROUGH*/ case PSFA_OPEN: case PSFA_INHERIT: @@ -1992,6 +2267,15 @@ exec_handle_file_actions(struct image_params *imgp, short psa_flags) break; case PSFA_CLOSE: + case PSFA_CHDIR: + case PSFA_FCHDIR: + /* + * Although PSFA_FCHDIR does have a file descriptor, it is not + * *creating* one, thus we do not automatically mark it for + * inheritance under POSIX_SPAWN_CLOEXEC_DEFAULT. A client that + * wishes it to be inherited should use the PSFA_INHERIT action + * explicitly. + */ break; } } @@ -2126,14 +2410,16 @@ spawn_validate_persona(struct _posix_spawn_persona_info *px_persona) struct persona *persona = NULL; int verify = px_persona->pspi_flags & POSIX_SPAWN_PERSONA_FLAGS_VERIFY; - /* - * TODO: rdar://problem/19981151 - * Add entitlement check! - */ - if (!kauth_cred_issuser(kauth_cred_get())) { + if (!IOTaskHasEntitlement(current_task(), PERSONA_MGMT_ENTITLEMENT)) { return EPERM; } + if (px_persona->pspi_flags & POSIX_SPAWN_PERSONA_GROUPS) { + if (px_persona->pspi_ngroups > NGROUPS_MAX) { + return EINVAL; + } + } + persona = persona_lookup(px_persona->pspi_id); if (!persona) { error = ESRCH; @@ -2245,21 +2531,119 @@ out: #endif #if __arm64__ +extern int legacy_footprint_entitlement_mode; static inline void -proc_legacy_footprint(proc_t p, task_t task, const char *caller) +proc_legacy_footprint_entitled(proc_t p, task_t task, const char *caller) { +#pragma unused(p, caller) boolean_t legacy_footprint_entitled; - legacy_footprint_entitled = IOTaskHasEntitlement(task, - "com.apple.private.memory.legacy_footprint"); - if (legacy_footprint_entitled) { - printf("%s: %d[%s] legacy footprint (entitled)\n", - caller, p->p_pid, p->p_name); - task_set_legacy_footprint(task, TRUE); + switch (legacy_footprint_entitlement_mode) { + case LEGACY_FOOTPRINT_ENTITLEMENT_IGNORE: + /* the entitlement is ignored */ + break; + case LEGACY_FOOTPRINT_ENTITLEMENT_IOS11_ACCT: + /* the entitlement grants iOS11 legacy accounting */ + legacy_footprint_entitled = IOTaskHasEntitlement(task, + "com.apple.private.memory.legacy_footprint"); + if (legacy_footprint_entitled) { + task_set_legacy_footprint(task); + } + break; + case LEGACY_FOOTPRINT_ENTITLEMENT_LIMIT_INCREASE: + /* the entitlement grants a footprint limit increase */ + legacy_footprint_entitled = IOTaskHasEntitlement(task, + "com.apple.private.memory.legacy_footprint"); + if (legacy_footprint_entitled) { + task_set_extra_footprint_limit(task); + } + break; + default: + break; } } #endif /* __arm64__ */ +/* + * Apply a modification on the proc's kauth cred until it converges. + * + * `update` consumes its argument to return a new kauth cred. + */ +static void +apply_kauth_cred_update(proc_t p, + kauth_cred_t (^update)(kauth_cred_t orig_cred)) +{ + kauth_cred_t my_cred, my_new_cred; + + my_cred = kauth_cred_proc_ref(p); + for (;;) { + my_new_cred = update(my_cred); + if (my_cred == my_new_cred) { + kauth_cred_unref(&my_new_cred); + break; + } + + /* try update cred on proc */ + proc_ucred_lock(p); + + if (p->p_ucred == my_cred) { + /* base pointer didn't change, donate our ref */ + p->p_ucred = my_new_cred; + PROC_UPDATE_CREDS_ONPROC(p); + proc_ucred_unlock(p); + + /* drop p->p_ucred reference */ + kauth_cred_unref(&my_cred); + break; + } + + /* base pointer changed, retry */ + my_cred = p->p_ucred; + kauth_cred_ref(my_cred); + proc_ucred_unlock(p); + + kauth_cred_unref(&my_new_cred); + } +} + +static int +spawn_posix_cred_adopt(proc_t p, + struct _posix_spawn_posix_cred_info *px_pcred_info) +{ + int error = 0; + + if (px_pcred_info->pspci_flags & POSIX_SPAWN_POSIX_CRED_GID) { + struct setgid_args args = { + .gid = px_pcred_info->pspci_gid, + }; + error = setgid(p, &args, NULL); + if (error) { + return error; + } + } + + if (px_pcred_info->pspci_flags & POSIX_SPAWN_POSIX_CRED_GROUPS) { + error = setgroups_internal(p, + px_pcred_info->pspci_ngroups, + px_pcred_info->pspci_groups, + px_pcred_info->pspci_gmuid); + if (error) { + return error; + } + } + + if (px_pcred_info->pspci_flags & POSIX_SPAWN_POSIX_CRED_UID) { + struct setuid_args args = { + .uid = px_pcred_info->pspci_uid, + }; + error = setuid(p, &args, NULL); + if (error) { + return error; + } + } + return 0; +} + /* * posix_spawn * @@ -2280,6 +2664,7 @@ proc_legacy_footprint(proc_t p, task_t task, const char *caller) * exec_activate_image:ENAMETOOLONG Filename too long * exec_activate_image:ENOEXEC Executable file format error * exec_activate_image:ETXTBSY Text file busy [misuse of error code] + * exec_activate_image:EAUTH Image decryption failed * exec_activate_image:EBADEXEC The executable is corrupt/unknown * exec_activate_image:??? * mac_execve_enter:??? @@ -2310,8 +2695,7 @@ posix_spawn(proc_t ap, struct posix_spawn_args *uap, int32_t *retval) boolean_t spawn_no_exec = FALSE; boolean_t proc_transit_set = TRUE; boolean_t exec_done = FALSE; - int portwatch_count = 0; - ipc_port_t * portwatch_ports = NULL; + struct exec_port_actions port_actions = { }; vm_size_t px_sa_offset = offsetof(struct _posix_spawnattr, psa_ports); task_t old_task = current_task(); task_t new_task = NULL; @@ -2320,6 +2704,7 @@ posix_spawn(proc_t ap, struct posix_spawn_args *uap, int32_t *retval) #if CONFIG_PERSONAS struct _posix_spawn_persona_info *px_persona = NULL; #endif + struct _posix_spawn_posix_cred_info *px_pcred_info = NULL; /* * Allocate a big chunk for locals instead of using stack since these @@ -2345,7 +2730,9 @@ posix_spawn(proc_t ap, struct posix_spawn_args *uap, int32_t *retval) imgp->ip_seg = (is_64 ? UIO_USERSPACE64 : UIO_USERSPACE32); imgp->ip_mac_return = 0; imgp->ip_px_persona = NULL; + imgp->ip_px_pcred_info = NULL; imgp->ip_cs_error = OS_REASON_NULL; + imgp->ip_simulator_binary = IMGPF_SB_DEFAULT; if (uap->adesc != USER_ADDR_NULL) { if (is_64) { @@ -2371,6 +2758,8 @@ posix_spawn(proc_t ap, struct posix_spawn_args *uap, int32_t *retval) px_args.coal_info = CAST_USER_ADDR_T(px_args32.coal_info); px_args.persona_info_size = px_args32.persona_info_size; px_args.persona_info = CAST_USER_ADDR_T(px_args32.persona_info); + px_args.posix_cred_info_size = px_args32.posix_cred_info_size; + px_args.posix_cred_info = CAST_USER_ADDR_T(px_args32.posix_cred_info); } if (error) { goto bad; @@ -2472,6 +2861,39 @@ posix_spawn(proc_t ap, struct posix_spawn_args *uap, int32_t *retval) } } #endif + /* copy in the posix cred info */ + if (px_args.posix_cred_info_size != 0 && px_args.posix_cred_info != 0) { + /* for now, we need the exact same struct in user space */ + if (px_args.posix_cred_info_size != sizeof(*px_pcred_info)) { + error = ERANGE; + goto bad; + } + + if (!kauth_cred_issuser(kauth_cred_get())) { + error = EPERM; + goto bad; + } + + MALLOC(px_pcred_info, struct _posix_spawn_posix_cred_info *, + px_args.posix_cred_info_size, M_TEMP, M_WAITOK | M_ZERO); + if (px_pcred_info == NULL) { + error = ENOMEM; + goto bad; + } + imgp->ip_px_pcred_info = px_pcred_info; + + if ((error = copyin(px_args.posix_cred_info, px_pcred_info, + px_args.posix_cred_info_size)) != 0) { + goto bad; + } + + if (px_pcred_info->pspci_flags & POSIX_SPAWN_POSIX_CRED_GROUPS) { + if (px_pcred_info->pspci_ngroups > NGROUPS_MAX) { + error = EINVAL; + goto bad; + } + } + } #if CONFIG_MACF if (px_args.mac_extensions_size != 0) { if ((error = spawn_copyin_macpolicyinfo(&px_args, (_posix_spawn_mac_policy_extensions_t *)&imgp->ip_px_smpx)) != 0) { @@ -2494,6 +2916,13 @@ posix_spawn(proc_t ap, struct posix_spawn_args *uap, int32_t *retval) goto bad; } + if (imgp->ip_px_sa != NULL) { + struct _posix_spawnattr *psa = (struct _posix_spawnattr *) imgp->ip_px_sa; + if ((error = exec_validate_spawnattr_policy(psa->psa_apptype)) != 0) { + goto bad; + } + } + /* * If we don't have the extension flag that turns "posix_spawn()" * into "execve() with options", then we will be creating a new @@ -2633,31 +3062,6 @@ do_fork1: } imgp->ip_flags |= IMGPF_SPAWN; /* spawn w/o exec */ spawn_no_exec = TRUE; /* used in later tests */ - -#if CONFIG_PERSONAS - /* - * If the parent isn't in a persona (launchd), and - * hasn't specified a new persona for the process, - * then we'll put the process into the system persona - * - * TODO: this will have to be re-worked because as of - * now, without any launchd adoption, the resulting - * xpcproxy process will not have sufficient - * privileges to setuid/gid. - */ -#if 0 - if (!proc_has_persona(p) && imgp->ip_px_persona == NULL) { - MALLOC(px_persona, struct _posix_spawn_persona_info *, - sizeof(*px_persona), M_TEMP, M_WAITOK | M_ZERO); - if (px_persona == NULL) { - error = ENOMEM; - goto bad; - } - px_persona->pspi_id = persona_get_id(g_system_persona); - imgp->ip_px_persona = px_persona; - } -#endif /* 0 */ -#endif /* CONFIG_PERSONAS */ } else { /* * For execve case, create a new task and thread @@ -2737,56 +3141,13 @@ do_fork1: /* Has spawn port actions? */ if (imgp->ip_px_spa != NULL) { - boolean_t is_adaptive = FALSE; - boolean_t portwatch_present = FALSE; - - /* Will this process become adaptive? The apptype isn't ready yet, so we can't look there. */ - if (imgp->ip_px_sa != NULL && px_sa.psa_apptype == POSIX_SPAWN_PROC_TYPE_DAEMON_ADAPTIVE) { - is_adaptive = TRUE; - } - - /* - * portwatch only: - * Allocate a place to store the ports we want to bind to the new task - * We can't bind them until after the apptype is set. - */ - if (px_spap->pspa_count != 0 && is_adaptive) { - portwatch_count = px_spap->pspa_count; - MALLOC(portwatch_ports, ipc_port_t *, (sizeof(ipc_port_t) * portwatch_count), M_TEMP, M_WAITOK | M_ZERO); - } else { - portwatch_ports = NULL; - } - - if ((error = exec_handle_port_actions(imgp, &portwatch_present, portwatch_ports)) != 0) { + if ((error = exec_handle_port_actions(imgp, &port_actions)) != 0) { goto bad; } - - if (portwatch_present == FALSE && portwatch_ports != NULL) { - FREE(portwatch_ports, M_TEMP); - portwatch_ports = NULL; - portwatch_count = 0; - } } /* Has spawn attr? */ if (imgp->ip_px_sa != NULL) { - /* - * Set the process group ID of the child process; this has - * to happen before the image activation. - */ - if (px_sa.psa_flags & POSIX_SPAWN_SETPGROUP) { - struct setpgid_args spga; - spga.pid = p->p_pid; - spga.pgid = px_sa.psa_pgroup; - /* - * Effectively, call setpgid() system call; works - * because there are no pointer arguments. - */ - if ((error = setpgid(p, &spga, ival)) != 0) { - goto bad; - } - } - /* * Reset UID/GID to parent's RUID/RGID; This works only * because the operation occurs *after* the vfork() and @@ -2800,35 +3161,33 @@ do_fork1: * proc's ucred lock. This prevents others from accessing * a garbage credential. */ - while (px_sa.psa_flags & POSIX_SPAWN_RESETIDS) { - kauth_cred_t my_cred = kauth_cred_proc_ref(p); - kauth_cred_t my_new_cred = kauth_cred_setuidgid(my_cred, kauth_cred_getruid(my_cred), kauth_cred_getrgid(my_cred)); + if (px_sa.psa_flags & POSIX_SPAWN_RESETIDS) { + apply_kauth_cred_update(p, ^kauth_cred_t (kauth_cred_t my_cred){ + return kauth_cred_setuidgid(my_cred, + kauth_cred_getruid(my_cred), + kauth_cred_getrgid(my_cred)); + }); + } - if (my_cred == my_new_cred) { - kauth_cred_unref(&my_cred); - break; + if (imgp->ip_px_pcred_info) { + if (!spawn_no_exec) { + error = ENOTSUP; + goto bad; } - /* update cred on proc */ - proc_ucred_lock(p); - - if (p->p_ucred != my_cred) { - proc_ucred_unlock(p); - kauth_cred_unref(&my_new_cred); - continue; + error = spawn_posix_cred_adopt(p, imgp->ip_px_pcred_info); + if (error != 0) { + goto bad; } - - /* donate cred reference on my_new_cred to p->p_ucred */ - p->p_ucred = my_new_cred; - PROC_UPDATE_CREDS_ONPROC(p); - proc_ucred_unlock(p); - - /* drop additional reference that was taken on the previous cred */ - kauth_cred_unref(&my_cred); } #if CONFIG_PERSONAS - if (spawn_no_exec && imgp->ip_px_persona != NULL) { + if (imgp->ip_px_persona != NULL) { + if (!spawn_no_exec) { + error = ENOTSUP; + goto bad; + } + /* * If we were asked to spawn a process into a new persona, * do the credential switch now (which may override the UID/GID @@ -2864,6 +3223,7 @@ do_fork1: imgp->ip_flags |= IMGPF_HIGH_BITS_ASLR; } +#if !SECURE_KERNEL /* * Forcibly disallow execution from data pages for the spawned process * even if it would otherwise be permitted by the architecture default. @@ -2871,6 +3231,12 @@ do_fork1: if (px_sa.psa_flags & _POSIX_SPAWN_ALLOW_DATA_EXEC) { imgp->ip_flags |= IMGPF_ALLOW_DATA_EXEC; } +#endif /* !SECURE_KERNEL */ + + if ((px_sa.psa_apptype & POSIX_SPAWN_PROC_TYPE_MASK) == + POSIX_SPAWN_PROC_TYPE_DRIVER) { + imgp->ip_flags |= IMGPF_DRIVER; + } } /* @@ -2906,6 +3272,10 @@ do_fork1: * Activate the image */ error = exec_activate_image(imgp); +#if defined(HAS_APPLE_PAC) + ml_task_set_disable_user_jop(new_task, imgp->ip_flags & IMGPF_NOJOP ? TRUE : FALSE); + ml_thread_set_disable_user_jop(imgp->ip_new_thread, imgp->ip_flags & IMGPF_NOJOP ? TRUE : FALSE); +#endif if (error == 0 && !spawn_no_exec) { p = proc_exec_switch_task(p, old_task, new_task, imgp->ip_new_thread); @@ -2930,19 +3300,45 @@ do_fork1: error = ENOEXEC; } - /* - * If we have a spawn attr, and it contains signal related flags, - * the we need to process them in the "context" of the new child - * process, so we have to process it following image activation, - * prior to making the thread runnable in user space. This is - * necessitated by some signal information being per-thread rather - * than per-process, and we don't have the new allocation in hand - * until after the image is activated. - */ if (!error && imgp->ip_px_sa != NULL) { thread_t child_thread = imgp->ip_new_thread; uthread_t child_uthread = get_bsdthread_info(child_thread); + /* + * Because of POSIX_SPAWN_SETEXEC, we need to handle this after image + * activation, else when image activation fails (before the point of no + * return) would leave the parent process in a modified state. + */ + if (px_sa.psa_flags & POSIX_SPAWN_SETPGROUP) { + struct setpgid_args spga; + spga.pid = p->p_pid; + spga.pgid = px_sa.psa_pgroup; + /* + * Effectively, call setpgid() system call; works + * because there are no pointer arguments. + */ + if ((error = setpgid(p, &spga, ival)) != 0) { + goto bad; + } + } + + if (px_sa.psa_flags & POSIX_SPAWN_SETSID) { + error = setsid_internal(p); + if (error != 0) { + goto bad; + } + } + + /* + * If we have a spawn attr, and it contains signal related flags, + * the we need to process them in the "context" of the new child + * process, so we have to process it following image activation, + * prior to making the thread runnable in user space. This is + * necessitated by some signal information being per-thread rather + * than per-process, and we don't have the new allocation in hand + * until after the image is activated. + */ + /* * Mask a list of signals, instead of them being unmasked, if * they were unmasked in the parent; note that some signals @@ -2989,6 +3385,15 @@ do_fork1: px_sa.psa_cpumonitor_interval * NSEC_PER_SEC, 0, TRUE); } + + + if (px_pcred_info && + (px_pcred_info->pspci_flags & POSIX_SPAWN_POSIX_CRED_LOGIN)) { + /* + * setlogin() must happen after setsid() + */ + setlogin_internal(p, px_pcred_info->pspci_login); + } } bad: @@ -3022,6 +3427,11 @@ bad: exec_resettextvp(p, imgp); #if CONFIG_MEMORYSTATUS + /* Set jetsam priority for DriverKit processes */ + if (px_sa.psa_apptype == POSIX_SPAWN_PROC_TYPE_DRIVER) { + px_sa.psa_priority = JETSAM_PRIORITY_DRIVER_APPLE; + } + /* Has jetsam attributes? */ if (imgp->ip_px_sa != NULL && (px_sa.psa_jetsam_flags & POSIX_SPAWN_JETSAM_SET)) { /* @@ -3032,14 +3442,15 @@ bad: * we attempt to mimic previous behavior by forcing the BG limit data into the * inactive/non-fatal mode and force the active slots to hold system_wide/fatal mode. */ + if (px_sa.psa_jetsam_flags & POSIX_SPAWN_JETSAM_HIWATER_BACKGROUND) { - memorystatus_update(p, px_sa.psa_priority, 0, + memorystatus_update(p, px_sa.psa_priority, 0, FALSE, /* assertion priority */ (px_sa.psa_jetsam_flags & POSIX_SPAWN_JETSAM_USE_EFFECTIVE_PRIORITY), TRUE, -1, TRUE, px_sa.psa_memlimit_inactive, FALSE); } else { - memorystatus_update(p, px_sa.psa_priority, 0, + memorystatus_update(p, px_sa.psa_priority, 0, FALSE, /* assertion priority */ (px_sa.psa_jetsam_flags & POSIX_SPAWN_JETSAM_USE_EFFECTIVE_PRIORITY), TRUE, px_sa.psa_memlimit_active, @@ -3048,6 +3459,31 @@ bad: (px_sa.psa_jetsam_flags & POSIX_SPAWN_JETSAM_MEMLIMIT_INACTIVE_FATAL)); } } + + /* Has jetsam relaunch behavior? */ + if (imgp->ip_px_sa != NULL && (px_sa.psa_jetsam_flags & POSIX_SPAWN_JETSAM_RELAUNCH_BEHAVIOR_MASK)) { + /* + * Launchd has passed in data indicating the behavior of this process in response to jetsam. + * This data would be used by the jetsam subsystem to determine the position and protection + * offered to this process on dirty -> clean transitions. + */ + int relaunch_flags = P_MEMSTAT_RELAUNCH_UNKNOWN; + switch (px_sa.psa_jetsam_flags & POSIX_SPAWN_JETSAM_RELAUNCH_BEHAVIOR_MASK) { + case POSIX_SPAWN_JETSAM_RELAUNCH_BEHAVIOR_LOW: + relaunch_flags = P_MEMSTAT_RELAUNCH_LOW; + break; + case POSIX_SPAWN_JETSAM_RELAUNCH_BEHAVIOR_MED: + relaunch_flags = P_MEMSTAT_RELAUNCH_MED; + break; + case POSIX_SPAWN_JETSAM_RELAUNCH_BEHAVIOR_HIGH: + relaunch_flags = P_MEMSTAT_RELAUNCH_HIGH; + break; + default: + break; + } + memorystatus_relaunch_flags_update(p, relaunch_flags); + } + #endif /* CONFIG_MEMORYSTATUS */ if (imgp->ip_px_sa != NULL && px_sa.psa_thread_limit > 0) { task_set_thread_limit(new_task, (uint16_t)px_sa.psa_thread_limit); @@ -3099,7 +3535,7 @@ bad: } #if __arm64__ - proc_legacy_footprint(p, new_task, __FUNCTION__); + proc_legacy_footprint_entitled(p, new_task, __FUNCTION__); #endif /* __arm64__ */ } @@ -3108,6 +3544,21 @@ bad: proc_inherit_task_role(new_task, old_task); } +#if CONFIG_ARCADE + if (error == 0) { + /* + * Check to see if we need to trigger an arcade upcall AST now + * that the vnode has been reset on the task. + */ + arcade_prepare(new_task, imgp->ip_new_thread); + } +#endif /* CONFIG_ARCADE */ + + /* Clear the initial wait on the thread before handling spawn policy */ + if (imgp && imgp->ip_new_thread) { + task_clear_return_wait(get_threadtask(imgp->ip_new_thread), TCRW_CLEAR_INITIAL_WAIT); + } + /* * Apply the spawnattr policy, apptype (which primes the task for importance donation), * and bind any portwatch ports to the new task. @@ -3120,8 +3571,13 @@ bad: if (error == 0 && imgp->ip_px_sa != NULL) { struct _posix_spawnattr *psa = (struct _posix_spawnattr *) imgp->ip_px_sa; - exec_handle_spawnattr_policy(p, psa->psa_apptype, psa->psa_qos_clamp, psa->psa_darwin_role, - portwatch_ports, portwatch_count); + error = exec_handle_spawnattr_policy(p, imgp->ip_new_thread, psa->psa_apptype, psa->psa_qos_clamp, + psa->psa_darwin_role, &port_actions); + } + + /* Transfer the turnstile watchport boost to new task if in exec */ + if (error == 0 && !spawn_no_exec) { + task_transfer_turnstile_watchports(old_task, new_task, imgp->ip_new_thread); } /* @@ -3147,6 +3603,7 @@ bad: */ if (mac_proc_check_map_anon(p, 0, 0, 0, MAP_JIT, NULL) == 0) { vm_map_set_jumbo(get_task_map(new_task)); + vm_map_set_jit_entitled(get_task_map(new_task)); } #endif /* CONFIG_MACF */ } @@ -3155,16 +3612,8 @@ bad: * Release any ports we kept around for binding to the new task * We need to release the rights even if the posix_spawn has failed. */ - if (portwatch_ports != NULL) { - for (int i = 0; i < portwatch_count; i++) { - ipc_port_t port = NULL; - if ((port = portwatch_ports[i]) != NULL) { - ipc_port_release_send(port); - } - } - FREE(portwatch_ports, M_TEMP); - portwatch_ports = NULL; - portwatch_count = 0; + if (imgp->ip_px_spa != NULL) { + exec_port_actions_destroy(&port_actions); } /* @@ -3212,6 +3661,9 @@ bad: FREE(imgp->ip_px_persona, M_TEMP); } #endif + if (imgp->ip_px_pcred_info != NULL) { + FREE(imgp->ip_px_pcred_info, M_TEMP); + } #if CONFIG_MACF if (imgp->ip_px_smpx != NULL) { spawn_free_macpolicyinfo(imgp->ip_px_smpx); @@ -3301,7 +3753,13 @@ bad: * If the parent wants the pid, copy it out */ if (pid != USER_ADDR_NULL) { - (void)suword(pid, p->p_pid); + _Static_assert(sizeof(p->p_pid) == 4, "posix_spawn() assumes a 32-bit pid_t"); + bool aligned = (pid & 3) == 0; + if (aligned) { + (void)copyout_atomic32(p->p_pid, pid); + } else { + (void)suword(pid, p->p_pid); + } } retval[0] = error; @@ -3339,7 +3797,7 @@ bad: /* Release the thread ref returned by fork_create_child/fork1 */ if (imgp != NULL && imgp->ip_new_thread) { /* wake up the new thread */ - task_clear_return_wait(get_threadtask(imgp->ip_new_thread)); + task_clear_return_wait(get_threadtask(imgp->ip_new_thread), TCRW_CLEAR_FINAL_WAIT); thread_deallocate(imgp->ip_new_thread); imgp->ip_new_thread = NULL; } @@ -3438,6 +3896,7 @@ proc_exec_switch_task(proc_t p, task_t old_task, task_t new_task, thread_t new_t /* Clear dispatchqueue and workloop ast offset */ p->p_dispatchqueue_offset = 0; p->p_dispatchqueue_serialno_offset = 0; + p->p_dispatchqueue_label_offset = 0; p->p_return_to_kernel_offset = 0; /* Copy the signal state, dtrace state and set bsd ast on new thread */ @@ -3600,6 +4059,7 @@ __mac_execve(proc_t p, struct __mac_execve_args *uap, int32_t *retval) imgp->ip_seg = (is_64 ? UIO_USERSPACE64 : UIO_USERSPACE32); imgp->ip_mac_return = 0; imgp->ip_cs_error = OS_REASON_NULL; + imgp->ip_simulator_binary = IMGPF_SB_DEFAULT; #if CONFIG_MACF if (uap->mac_p != USER_ADDR_NULL) { @@ -3668,6 +4128,10 @@ __mac_execve(proc_t p, struct __mac_execve_args *uap, int32_t *retval) * for vfexec. */ new_task = get_threadtask(imgp->ip_new_thread); +#if defined(HAS_APPLE_PAC) + ml_task_set_disable_user_jop(new_task, imgp->ip_flags & IMGPF_NOJOP ? TRUE : FALSE); + ml_thread_set_disable_user_jop(imgp->ip_new_thread, imgp->ip_flags & IMGPF_NOJOP ? TRUE : FALSE); +#endif } if (!error && !in_vfexec) { @@ -3742,7 +4206,7 @@ __mac_execve(proc_t p, struct __mac_execve_args *uap, int32_t *retval) proc_transend(p, 0); #if __arm64__ - proc_legacy_footprint(p, new_task, __FUNCTION__); + proc_legacy_footprint_entitled(p, new_task, __FUNCTION__); #endif /* __arm64__ */ /* Sever any extant thread affinity */ @@ -3757,6 +4221,14 @@ __mac_execve(proc_t p, struct __mac_execve_args *uap, int32_t *retval) task_set_main_thread_qos(new_task, main_thread); +#if CONFIG_ARCADE + /* + * Check to see if we need to trigger an arcade upcall AST now + * that the vnode has been reset on the task. + */ + arcade_prepare(new_task, imgp->ip_new_thread); +#endif /* CONFIG_ARCADE */ + #if CONFIG_MACF /* * Processes with the MAP_JIT entitlement are permitted to have @@ -3764,6 +4236,7 @@ __mac_execve(proc_t p, struct __mac_execve_args *uap, int32_t *retval) */ if (mac_proc_check_map_anon(p, 0, 0, 0, MAP_JIT, NULL) == 0) { vm_map_set_jumbo(get_task_map(new_task)); + vm_map_set_jit_entitled(get_task_map(new_task)); } #endif /* CONFIG_MACF */ @@ -3817,6 +4290,16 @@ exit_with_error: } if (imgp != NULL) { + /* Clear the initial wait on the thread transferring watchports */ + if (imgp->ip_new_thread) { + task_clear_return_wait(get_threadtask(imgp->ip_new_thread), TCRW_CLEAR_INITIAL_WAIT); + } + + /* Transfer the watchport boost to new task */ + if (!error && !in_vfexec) { + task_transfer_turnstile_watchports(old_task, + new_task, imgp->ip_new_thread); + } /* * Do not terminate the current task, if proc_exec_switch_task did not * switch the tasks, terminating the current task without the switch would @@ -3830,7 +4313,7 @@ exit_with_error: /* Release the thread ref returned by fork_create_child */ if (imgp->ip_new_thread) { /* wake up the new exec thread */ - task_clear_return_wait(get_threadtask(imgp->ip_new_thread)); + task_clear_return_wait(get_threadtask(imgp->ip_new_thread), TCRW_CLEAR_FINAL_WAIT); thread_deallocate(imgp->ip_new_thread); imgp->ip_new_thread = NULL; } @@ -3881,7 +4364,7 @@ copyinptr(user_addr_t froma, user_addr_t *toptr, int ptr_size) if (ptr_size == 4) { /* 64 bit value containing 32 bit address */ - unsigned int i; + unsigned int i = 0; error = copyin(froma, &i, 4); *toptr = CAST_USER_ADDR_T(i); /* SAFE */ @@ -4438,6 +4921,7 @@ extern user64_addr_t commpage_text64_location; #define FSID_KEY "executable_file=" #define DYLD_FSID_KEY "dyld_file=" #define CDHASH_KEY "executable_cdhash=" +#define DYLD_FLAGS_KEY "dyld_flags=" #define FSID_MAX_STRING "0x1234567890abcdef,0x1234567890abcdef" @@ -4476,6 +4960,10 @@ exec_add_entropy_key(struct image_params *imgp, /* * Build up the contents of the apple[] string vector */ +#if (DEVELOPMENT || DEBUG) +uint64_t dyld_flags = 0; +#endif + static int exec_add_apple_strings(struct image_params *imgp, const load_result_t *load_result) @@ -4611,6 +5099,17 @@ exec_add_apple_strings(struct image_params *imgp, } imgp->ip_applec++; } +#if (DEVELOPMENT || DEBUG) + if (dyld_flags) { + char dyld_flags_string[strlen(DYLD_FLAGS_KEY) + HEX_STR_LEN + 1]; + snprintf(dyld_flags_string, sizeof(dyld_flags_string), DYLD_FLAGS_KEY "0x%llx", dyld_flags); + error = exec_add_user_string(imgp, CAST_USER_ADDR_T(dyld_flags_string), UIO_SYSSPACE, FALSE); + if (error) { + goto bad; + } + imgp->ip_applec++; + } +#endif /* Align the tail of the combined applev area */ while (imgp->ip_strspace % img_ptr_size != 0) { @@ -4763,7 +5262,6 @@ exec_handle_sugid(struct image_params *imgp) { proc_t p = vfs_context_proc(imgp->ip_vfs_context); kauth_cred_t cred = vfs_context_ucred(imgp->ip_vfs_context); - kauth_cred_t my_cred, my_new_cred; int i; int leave_sugid_clear = 0; int mac_reset_ipc = 0; @@ -4840,62 +5338,23 @@ handle_mac_transition: * proc's ucred lock. This prevents others from accessing * a garbage credential. */ - while (imgp->ip_origvattr->va_mode & VSUID) { - my_cred = kauth_cred_proc_ref(p); - my_new_cred = kauth_cred_setresuid(my_cred, KAUTH_UID_NONE, imgp->ip_origvattr->va_uid, imgp->ip_origvattr->va_uid, KAUTH_UID_NONE); - - if (my_new_cred == my_cred) { - kauth_cred_unref(&my_cred); - break; - } - - /* update cred on proc */ - proc_ucred_lock(p); - - if (p->p_ucred != my_cred) { - proc_ucred_unlock(p); - kauth_cred_unref(&my_new_cred); - continue; - } - - /* donate cred reference on my_new_cred to p->p_ucred */ - p->p_ucred = my_new_cred; - PROC_UPDATE_CREDS_ONPROC(p); - proc_ucred_unlock(p); - - /* drop additional reference that was taken on the previous cred */ - kauth_cred_unref(&my_cred); - - break; - } - - while (imgp->ip_origvattr->va_mode & VSGID) { - my_cred = kauth_cred_proc_ref(p); - my_new_cred = kauth_cred_setresgid(my_cred, KAUTH_GID_NONE, imgp->ip_origvattr->va_gid, imgp->ip_origvattr->va_gid); - - if (my_new_cred == my_cred) { - kauth_cred_unref(&my_cred); - break; - } - - /* update cred on proc */ - proc_ucred_lock(p); - - if (p->p_ucred != my_cred) { - proc_ucred_unlock(p); - kauth_cred_unref(&my_new_cred); - continue; - } - - /* donate cred reference on my_new_cred to p->p_ucred */ - p->p_ucred = my_new_cred; - PROC_UPDATE_CREDS_ONPROC(p); - proc_ucred_unlock(p); - - /* drop additional reference that was taken on the previous cred */ - kauth_cred_unref(&my_cred); - - break; + if (imgp->ip_origvattr->va_mode & VSUID) { + apply_kauth_cred_update(p, ^kauth_cred_t (kauth_cred_t my_cred) { + return kauth_cred_setresuid(my_cred, + KAUTH_UID_NONE, + imgp->ip_origvattr->va_uid, + imgp->ip_origvattr->va_uid, + KAUTH_UID_NONE); + }); + } + + if (imgp->ip_origvattr->va_mode & VSGID) { + apply_kauth_cred_update(p, ^kauth_cred_t (kauth_cred_t my_cred) { + return kauth_cred_setresgid(my_cred, + KAUTH_GID_NONE, + imgp->ip_origvattr->va_gid, + imgp->ip_origvattr->va_gid); + }); } #endif /* !SECURE_KERNEL */ @@ -5072,35 +5531,11 @@ handle_mac_transition: * proc's ucred lock. This prevents others from accessing * a garbage credential. */ - for (;;) { - my_cred = kauth_cred_proc_ref(p); - my_new_cred = kauth_cred_setsvuidgid(my_cred, kauth_cred_getuid(my_cred), kauth_cred_getgid(my_cred)); - - if (my_new_cred == my_cred) { - kauth_cred_unref(&my_cred); - break; - } - - /* update cred on proc */ - proc_ucred_lock(p); - - if (p->p_ucred != my_cred) { - proc_ucred_unlock(p); - kauth_cred_unref(&my_new_cred); - continue; - } - - /* donate cred reference on my_new_cred to p->p_ucred */ - p->p_ucred = my_new_cred; - PROC_UPDATE_CREDS_ONPROC(p); - proc_ucred_unlock(p); - - /* drop additional reference that was taken on the previous cred */ - kauth_cred_unref(&my_cred); - - break; - } - + apply_kauth_cred_update(p, ^kauth_cred_t (kauth_cred_t my_cred) { + return kauth_cred_setsvuidgid(my_cred, + kauth_cred_getuid(my_cred), + kauth_cred_getgid(my_cred)); + }); /* Update the process' identity version and set the security token */ p->p_idversion = OSIncrementAtomic(&nextpidversion); @@ -5442,7 +5877,6 @@ load_return_to_errno(load_return_t lrtn) case LOAD_SUCCESS: return 0; case LOAD_BADARCH: - case LOAD_BADARCH_X86: return EBADARCH; case LOAD_BADMACHO: case LOAD_BADMACHO_UPX: @@ -5458,8 +5892,9 @@ load_return_to_errno(load_return_t lrtn) return ENOENT; case LOAD_IOERROR: return EIO; - case LOAD_FAILURE: case LOAD_DECRYPTFAIL: + return EAUTH; + case LOAD_FAILURE: default: return EBADEXEC; } @@ -5737,7 +6172,7 @@ __EXEC_WAITING_ON_TASKGATED_CODE_SIGNATURE_UPCALL__(mach_port_t task_access_port static int check_for_signature(proc_t p, struct image_params *imgp) { - mach_port_t port = NULL; + mach_port_t port = IPC_PORT_NULL; kern_return_t kr = KERN_FAILURE; int error = EACCES; boolean_t unexpected_failure = FALSE; @@ -5905,6 +6340,10 @@ done: } } + if (port != IPC_PORT_NULL) { + ipc_port_release_send(port); + } + /* If we hit this, we likely would have leaked an exit reason */ assert(signature_failure_reason == OS_REASON_NULL); return error; diff --git a/bsd/kern/kern_exit.c b/bsd/kern/kern_exit.c index e958587b4..825508bf3 100644 --- a/bsd/kern/kern_exit.c +++ b/bsd/kern/kern_exit.c @@ -129,6 +129,7 @@ #include #include +#include #include @@ -152,6 +153,9 @@ void dtrace_proc_exit(proc_t p); #include #endif /* CONFIG_MACF */ +#if CONFIG_MEMORYSTATUS +static void proc_memorystatus_remove(proc_t p); +#endif /* CONFIG_MEMORYSTATUS */ void proc_prepareexit(proc_t p, int rv, boolean_t perf_notify); void gather_populate_corpse_crashinfo(proc_t p, task_t corpse_task, mach_exception_data_type_t code, mach_exception_data_type_t subcode, @@ -509,6 +513,11 @@ populate_corpse_crashinfo(proc_t p, task_t corpse_task, struct rusage_superset * kcdata_memcpy(crash_info_ptr, uaddr, &p->p_responsible_pid, sizeof(p->p_responsible_pid)); } + if (KERN_SUCCESS == kcdata_get_memory_addr(crash_info_ptr, TASK_CRASHINFO_PROC_PERSONA_ID, sizeof(uid_t), &uaddr)) { + uid_t persona_id = proc_persona_id(p); + kcdata_memcpy(crash_info_ptr, uaddr, &persona_id, sizeof(persona_id)); + } + #if CONFIG_COALITIONS if (KERN_SUCCESS == kcdata_get_memory_addr_for_array(crash_info_ptr, TASK_CRASHINFO_COALITION_ID, sizeof(uint64_t), COALITION_NUM_TYPES, &uaddr)) { uint64_t coalition_ids[COALITION_NUM_TYPES]; @@ -518,12 +527,16 @@ populate_corpse_crashinfo(proc_t p, task_t corpse_task, struct rusage_superset * #endif /* CONFIG_COALITIONS */ #if CONFIG_MEMORYSTATUS - memstat_dirty_flags = memorystatus_dirty_get(p); + memstat_dirty_flags = memorystatus_dirty_get(p, FALSE); if (KERN_SUCCESS == kcdata_get_memory_addr(crash_info_ptr, TASK_CRASHINFO_DIRTY_FLAGS, sizeof(memstat_dirty_flags), &uaddr)) { kcdata_memcpy(crash_info_ptr, uaddr, &memstat_dirty_flags, sizeof(memstat_dirty_flags)); } #endif + if (KERN_SUCCESS == kcdata_get_memory_addr(crash_info_ptr, TASK_CRASHINFO_MEMORY_LIMIT_INCREASE, sizeof(p->p_memlimit_increase), &uaddr)) { + kcdata_memcpy(crash_info_ptr, uaddr, &p->p_memlimit_increase, sizeof(p->p_memlimit_increase)); + } + if (p->p_exit_reason != OS_REASON_NULL && reason == OS_REASON_NULL) { reason = p->p_exit_reason; } @@ -596,7 +609,8 @@ launchd_exit_reason_get_string_desc(os_reason_t exit_reason) return (char *)kcdata_iter_payload(iter); } -static __attribute__((noinline)) void +__abortlike +static void launchd_crashed_panic(proc_t p, int rv) { char *launchd_exit_reason_desc = launchd_exit_reason_get_string_desc(p->p_exit_reason); @@ -921,6 +935,25 @@ exit_with_reason(proc_t p, int rv, int *retval, boolean_t thread_can_terminate, return 0; } +#if CONFIG_MEMORYSTATUS +/* + * Remove this process from jetsam bands for freezing or exiting. Note this will block, if the process + * is currently being frozen. + * The proc_list_lock is held by the caller. + * NB: If the process should be ineligible for future freezing or jetsaming the caller should first set + * the p_listflag P_LIST_EXITED bit. + */ +static void +proc_memorystatus_remove(proc_t p) +{ + LCK_MTX_ASSERT(proc_list_mlock, LCK_MTX_ASSERT_OWNED); + while (memorystatus_remove(p) == EAGAIN) { + os_log(OS_LOG_DEFAULT, "memorystatus_remove: Process[%d] tried to exit while being frozen. Blocking exit until freeze completes.", p->p_pid); + msleep(&p->p_memstat_state, proc_list_mlock, PWAIT, "proc_memorystatus_remove", NULL); + } +} +#endif + void proc_prepareexit(proc_t p, int rv, boolean_t perf_notify) { @@ -1056,7 +1089,7 @@ skipcheck: proc_list_lock(); #if CONFIG_MEMORYSTATUS - memorystatus_remove(p, TRUE); + proc_memorystatus_remove(p); #endif LIST_REMOVE(p, p_list); @@ -1066,7 +1099,6 @@ skipcheck: proc_list_unlock(); - #ifdef PGINPROF vmsizmon(); #endif @@ -1140,8 +1172,6 @@ proc_exit(proc_t p) dtrace_proc_exit(p); #endif - nspace_proc_exit(p); - /* * need to cancel async IO requests that can be cancelled and wait for those * already active. MAY BLOCK! @@ -1179,6 +1209,14 @@ proc_exit(proc_t p) throttle_lowpri_io(0); } + if (p->p_lflag & P_LNSPACE_RESOLVER) { + /* + * The namespace resolver is exiting; there may be + * outstanding materialization requests to clean up. + */ + nspace_resolver_exited(p); + } + #if SYSV_SHM /* Close ref SYSV Shared memory*/ if (p->vm_shm) { @@ -2327,7 +2365,7 @@ proc_reparentlocked(proc_t child, proc_t parent, int signallable, int locked) } #endif oldparent->p_childrencnt--; -#if __PROC_INTERNAL_DEBUG1 +#if __PROC_INTERNAL_DEBUG if (oldparent->p_childrencnt < 0) { panic("process children count -ve\n"); } @@ -2411,7 +2449,7 @@ vfork_exit_internal(proc_t p, int rv, int forceexit) proc_list_lock(); #if CONFIG_MEMORYSTATUS - memorystatus_remove(p, TRUE); + proc_memorystatus_remove(p); #endif LIST_REMOVE(p, p_list); diff --git a/bsd/kern/kern_fork.c b/bsd/kern/kern_fork.c index 772c16355..4b0f0e9a4 100644 --- a/bsd/kern/kern_fork.c +++ b/bsd/kern/kern_fork.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2007, 2015 Apple Inc. All rights reserved. + * Copyright (c) 2000-2019 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -720,7 +720,7 @@ vfork_return(proc_t child_proc, int32_t *retval, int rval) /* restore thread-set-id state */ if (uth->uu_flag & UT_WASSETUID) { uth->uu_flag |= UT_SETUID; - uth->uu_flag &= UT_WASSETUID; + uth->uu_flag &= ~UT_WASSETUID; } uth->uu_proc = 0; uth->uu_sigmask = uth->uu_vforkmask; @@ -792,8 +792,9 @@ fork_create_child(task_t parent_task, inherit_memory, is_64bit_addr, is_64bit_data, - TF_LRETURNWAIT | TF_LRETURNWAITER, /* All created threads will wait in task_wait_to_return */ - in_exec ? TPF_EXEC_COPY : TPF_NONE, /* Mark the task exec copy if in execve */ + TF_NONE, + in_exec ? TPF_EXEC_COPY : TPF_NONE, /* Mark the task exec copy if in execve */ + (TRW_LRETURNWAIT | TRW_LRETURNWAITER), /* All created threads will wait in task_wait_to_return */ &child_task); if (result != KERN_SUCCESS) { printf("%s: task_create_internal failed. Code: %d\n", @@ -922,7 +923,7 @@ fork(proc_t parent_proc, __unused struct fork_args *uap, int32_t *retval) #endif /* "Return" to the child */ - task_clear_return_wait(get_threadtask(child_thread)); + task_clear_return_wait(get_threadtask(child_thread), TCRW_CLEAR_ALL_WAIT); /* drop the extra references we got during the creation */ if ((child_task = (task_t)get_threadtask(child_thread)) != NULL) { @@ -1107,9 +1108,6 @@ forkproc_free(proc_t p) p->p_textvp = NULL; } - /* Stop the profiling clock */ - stopprofclock(p); - /* Update the audit session proc count */ AUDIT_SESSION_PROCEXIT(p); @@ -1246,7 +1244,6 @@ retry: } nprocs++; child_proc->p_pid = nextpid; - child_proc->p_responsible_pid = nextpid; /* initially responsible for self */ child_proc->p_idversion = OSIncrementAtomic(&nextpidversion); /* kernel process is handcrafted and not from fork, so start from 1 */ child_proc->p_uniqueid = ++nextuniqueid; @@ -1282,7 +1279,7 @@ retry: * for insertion to hash. Copy the section that is to be copied * directly from the parent. */ - bcopy(&parent_proc->p_startcopy, &child_proc->p_startcopy, + __nochk_bcopy(&parent_proc->p_startcopy, &child_proc->p_startcopy, (unsigned) ((caddr_t)&child_proc->p_endcopy - (caddr_t)&child_proc->p_startcopy)); /* @@ -1296,12 +1293,11 @@ retry: #else /* !CONFIG_EMBEDDED */ child_proc->p_flag = (parent_proc->p_flag & (P_LP64 | P_DISABLE_ASLR | P_SUGID)); #endif /* !CONFIG_EMBEDDED */ - if (parent_proc->p_flag & P_PROFIL) { - startprofclock(child_proc); - } child_proc->p_vfs_iopolicy = (parent_proc->p_vfs_iopolicy & (P_VFS_IOPOLICY_VALID_MASK)); + child_proc->p_responsible_pid = parent_proc->p_responsible_pid; + /* * Note that if the current thread has an assumed identity, this * credential will be granted to the new process. @@ -1414,6 +1410,7 @@ retry: } child_proc->p_dispatchqueue_offset = parent_proc->p_dispatchqueue_offset; child_proc->p_dispatchqueue_serialno_offset = parent_proc->p_dispatchqueue_serialno_offset; + child_proc->p_dispatchqueue_label_offset = parent_proc->p_dispatchqueue_label_offset; child_proc->p_return_to_kernel_offset = parent_proc->p_return_to_kernel_offset; child_proc->p_mach_thread_self_offset = parent_proc->p_mach_thread_self_offset; child_proc->p_pth_tsd_offset = parent_proc->p_pth_tsd_offset; @@ -1437,12 +1434,14 @@ retry: child_proc->p_memstat_state = 0; child_proc->p_memstat_effectivepriority = JETSAM_PRIORITY_DEFAULT; child_proc->p_memstat_requestedpriority = JETSAM_PRIORITY_DEFAULT; + child_proc->p_memstat_assertionpriority = 0; child_proc->p_memstat_userdata = 0; child_proc->p_memstat_idle_start = 0; child_proc->p_memstat_idle_delta = 0; child_proc->p_memstat_memlimit = 0; child_proc->p_memstat_memlimit_active = 0; child_proc->p_memstat_memlimit_inactive = 0; + child_proc->p_memstat_relaunch_flags = P_MEMSTAT_RELAUNCH_UNKNOWN; #if CONFIG_FREEZE child_proc->p_memstat_freeze_sharedanon_pages = 0; #endif diff --git a/bsd/kern/kern_guarded.c b/bsd/kern/kern_guarded.c index dc29cb531..c78c64673 100644 --- a/bsd/kern/kern_guarded.c +++ b/bsd/kern/kern_guarded.c @@ -104,6 +104,10 @@ struct gfp_crarg { u_int gca_attrs; }; +#ifdef OS_REFCNT_DEBUG +extern struct os_refgrp f_iocount_refgrp; +#endif + static struct fileproc * guarded_fileproc_alloc_init(void *crarg) { @@ -115,7 +119,11 @@ guarded_fileproc_alloc_init(void *crarg) } bzero(gfp, sizeof(*gfp)); - gfp->gf_fileproc.f_flags = FTYPE_GUARDED; + + struct fileproc *fp = &gfp->gf_fileproc; + os_ref_init(&fp->f_iocount, &f_iocount_refgrp); + fp->f_flags = FTYPE_GUARDED; + gfp->gf_magic = GUARDED_FILEPROC_MAGIC; gfp->gf_guard = aarg->gca_guard; gfp->gf_attrs = aarg->gca_attrs; @@ -172,7 +180,7 @@ fp_lookup_guarded(proc_t p, int fd, guardid_t guard, * if (FP_ISGUARDED(fp, GUARD_CLOSE)) { * error = fp_guard_exception(p, fd, fp, kGUARD_EXC_CLOSE); * proc_fdunlock(p); - * return (error); + * return error; * } */ @@ -211,7 +219,7 @@ fp_guard_exception(proc_t p, int fd, struct fileproc *fp, u_int flavor) mach_exception_subcode_t subcode = gfp->gf_guard; thread_t t = current_thread(); - thread_guard_violation(t, code, subcode); + thread_guard_violation(t, code, subcode, TRUE); return EPERM; } @@ -413,7 +421,7 @@ guarded_kqueue_np(proc_t p, struct guarded_kqueue_np_args *uap, int32_t *retval) return EINVAL; } - return kqueue_body(p, guarded_fileproc_alloc_init, &crarg, retval); + return kqueue_internal(p, guarded_fileproc_alloc_init, &crarg, retval); } /* @@ -636,14 +644,14 @@ restart: proc_fdlock(p); switch (error = fp_tryswap(p, fd, nfp)) { - case 0: /* guarded-ness comes with side-effects */ + case 0: /* success; guarded-ness comes with side-effects */ + fp = NULL; gfp = FP_TO_GFP(nfp); if (gfp->gf_attrs & GUARD_CLOSE) { FDFLAGS_SET(p, fd, UF_FORKCLOSE); } FDFLAGS_SET(p, fd, UF_EXCLOSE); (void) fp_drop(p, fd, nfp, 1); - fileproc_free(fp); break; case EKEEPLOOKING: /* f_iocount indicates a collision */ (void) fp_drop(p, fd, fp, 1); @@ -688,7 +696,8 @@ restart: proc_fdlock(p); switch (error = fp_tryswap(p, fd, nfp)) { - case 0: /* undo side-effects of guarded-ness */ + case 0: /* success; undo side-effects of guarded-ness */ + fp = NULL; FDFLAGS_CLR(p, fd, UF_FORKCLOSE | UF_EXCLOSE); FDFLAGS_SET(p, fd, (nfdflags & FD_CLOFORK) ? UF_FORKCLOSE : 0); @@ -696,7 +705,6 @@ restart: FDFLAGS_SET(p, fd, (nfdflags & FD_CLOEXEC) ? UF_EXCLOSE : 0); (void) fp_drop(p, fd, nfp, 1); - fileproc_free(fp); break; case EKEEPLOOKING: /* f_iocount indicates collision */ (void) fp_drop(p, fd, fp, 1); @@ -1077,6 +1085,59 @@ vng_lbl_set(struct label *label, void *data) mac_label_set(label, label_slot, (intptr_t)data); } +static int +vnguard_sysc_getguardattr(proc_t p, struct vnguard_getattr *vga) +{ + const int fd = vga->vga_fd; + + if (0 == vga->vga_guard) { + return EINVAL; + } + + int error; + struct fileproc *fp; + if (0 != (error = fp_lookup(p, fd, &fp, 0))) { + return error; + } + do { + struct fileglob *fg = fp->f_fglob; + if (FILEGLOB_DTYPE(fg) != DTYPE_VNODE) { + error = EBADF; + break; + } + struct vnode *vp = fg->fg_data; + if (!vnode_isreg(vp) || NULL == vp->v_mount) { + error = EBADF; + break; + } + error = vnode_getwithref(vp); + if (0 != error) { + break; + } + + vga->vga_attrs = 0; + + lck_rw_lock_shared(&llock); + + if (NULL != vp->v_label) { + const struct vng_info *vgi = vng_lbl_get(vp->v_label); + if (NULL != vgi) { + if (vgi->vgi_guard != vga->vga_guard) { + error = EPERM; + } else { + vga->vga_attrs = vgi->vgi_attrs; + } + } + } + + lck_rw_unlock_shared(&llock); + vnode_put(vp); + } while (0); + + fp_drop(p, fd, fp, 0); + return error; +} + static int vnguard_sysc_setguard(proc_t p, const struct vnguard_set *vns) { @@ -1122,9 +1183,9 @@ vnguard_sysc_setguard(proc_t p, const struct vnguard_set *vns) } error = vnode_getwithref(vp); if (0 != error) { - fp_drop(p, fd, fp, 0); break; } + /* Ensure the target vnode -has- a label */ struct vfs_context *ctx = vfs_context_current(); mac_vnode_label_update(ctx, vp, NULL); @@ -1165,7 +1226,16 @@ vnguard_sysc_setguard(proc_t p, const struct vnguard_set *vns) if (vgi->vgi_guard != vns->vns_guard) { error = EPERM; /* guard mismatch */ } else if (vgi->vgi_attrs != vns->vns_attrs) { - error = EACCES; /* attr mismatch */ + /* + * Temporary workaround for older versions of SQLite: + * allow newer guard attributes to be silently cleared. + */ + const unsigned mask = ~(VNG_WRITE_OTHER | VNG_TRUNC_OTHER); + if ((vgi->vgi_attrs & mask) == (vns->vns_attrs & mask)) { + vgi->vgi_attrs &= vns->vns_attrs; + } else { + error = EACCES; /* attr mismatch */ + } } if (0 != error || NULL != vgo) { free_vgo(nvgo); @@ -1205,6 +1275,19 @@ vng_policy_syscall(proc_t p, int cmd, user_addr_t arg) error = vnguard_sysc_setguard(p, &vns); break; } + case VNG_SYSC_GET_ATTR: { + struct vnguard_getattr vga; + error = copyin(arg, (void *)&vga, sizeof(vga)); + if (error) { + break; + } + error = vnguard_sysc_getguardattr(p, &vga); + if (error) { + break; + } + error = copyout((void *)&vga, arg, sizeof(vga)); + break; + } default: break; } @@ -1281,6 +1364,11 @@ vng_reason_from_pathname(const char *path, uint32_t pathlen) static int vng_policy_flags; +/* + * Note: if an EXC_GUARD is generated, llock will be dropped and + * subsequently reacquired by this routine. Data derived from + * any label in the caller should be regenerated. + */ static int vng_guard_violation(const struct vng_info *vgi, unsigned opval, vnode_t vp) @@ -1364,6 +1452,8 @@ vng_guard_violation(const struct vng_info *vgi, EXC_GUARD_ENCODE_TARGET(code, pid); subcode = vgi->vgi_guard; + lck_rw_unlock_shared(&llock); + if (vng_policy_flags & kVNG_POLICY_EXC_CORPSE) { char *path; int len = MAXPATHLEN; @@ -1384,8 +1474,10 @@ vng_guard_violation(const struct vng_info *vgi, } } else { thread_t t = current_thread(); - thread_guard_violation(t, code, subcode); + thread_guard_violation(t, code, subcode, TRUE); } + + lck_rw_lock_shared(&llock); } else if (vng_policy_flags & kVNG_POLICY_SIGKILL) { proc_t p = current_proc(); psignal(p, SIGKILL); @@ -1614,7 +1706,7 @@ SECURITY_READ_ONLY_LATE(static struct mac_policy_conf) vng_policy_conf = { .mpc_runtime_flags = 0 }; -static mac_policy_handle_t vng_policy_handle; +SECURITY_READ_ONLY_LATE(static mac_policy_handle_t) vng_policy_handle; void vnguard_policy_init(void) diff --git a/bsd/kern/kern_lockf.c b/bsd/kern/kern_lockf.c index 92ecc8164..21edbc5d9 100644 --- a/bsd/kern/kern_lockf.c +++ b/bsd/kern/kern_lockf.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2019 Apple Computer, Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -80,12 +80,6 @@ #include -/* - * This variable controls the maximum number of processes that will - * be checked in doing deadlock detection. - */ -static int maxlockdepth = MAXDEPTH; - #if (DEVELOPMENT || DEBUG) #define LOCKF_DEBUGGING 1 #endif @@ -99,6 +93,7 @@ void lf_printlist(const char *tag, struct lockf *lock); #define LF_DBG_LIST (1 << 1) /* split, coalesce */ #define LF_DBG_IMPINH (1 << 2) /* importance inheritance */ #define LF_DBG_TRACE (1 << 3) /* errors, exit */ +#define LF_DBG_DEADLOCK (1 << 4) /* deadlock detection */ static int lockf_debug = 0; /* was 2, could be 3 ;-) */ SYSCTL_INT(_debug, OID_AUTO, lockf_debug, CTLFLAG_RW | CTLFLAG_LOCKED, &lockf_debug, 0, ""); @@ -109,10 +104,16 @@ SYSCTL_INT(_debug, OID_AUTO, lockf_debug, CTLFLAG_RW | CTLFLAG_LOCKED, &lockf_de */ #define LOCKF_DEBUG(mask, ...) \ do { \ - if( !(mask) || ((mask) & lockf_debug)) { \ + if (!(mask) || ((mask) & lockf_debug)) { \ + printf("%s>", __FUNCTION__); \ printf(__VA_ARGS__); \ } \ } while(0) + +#define LOCKF_DEBUGP(mask) \ + ({ \ + ((mask) & lockf_debug); \ + }) #else /* !LOCKF_DEBUGGING */ #define LOCKF_DEBUG(mask, ...) /* mask */ #endif /* !LOCKF_DEBUGGING */ @@ -503,11 +504,12 @@ lf_setlock(struct lockf *lock, struct timespec *timeout) overlap_t ovcase; #ifdef LOCKF_DEBUGGING - if (lockf_debug & LF_DBG_LOCKOP) { + if (LOCKF_DEBUGP(LF_DBG_LOCKOP)) { lf_print("lf_setlock", lock); lf_printlist("lf_setlock(in)", lock); } #endif /* LOCKF_DEBUGGING */ + LOCKF_DEBUG(LF_DBG_DEADLOCK, "lock %p Looking for deadlock, vnode %p\n", lock, lock->lf_vnode); /* * Set the priority @@ -517,6 +519,7 @@ lf_setlock(struct lockf *lock, struct timespec *timeout) priority += 4; } priority |= PCATCH; +scan: /* * Scan lock list for this file looking for locks that would block us. */ @@ -530,6 +533,8 @@ lf_setlock(struct lockf *lock, struct timespec *timeout) return EAGAIN; } + LOCKF_DEBUG(LF_DBG_DEADLOCK, "lock %p found blocking lock %p\n", lock, block); + /* * We are blocked. Since flock style locks cover * the whole file, there is no chance for deadlock. @@ -541,36 +546,59 @@ lf_setlock(struct lockf *lock, struct timespec *timeout) * * Deadlock detection is done by looking through the * wait channels to see if there are any cycles that - * involve us. MAXDEPTH is set just to make sure we - * do not go off into neverland. + * involve us. */ if ((lock->lf_flags & F_POSIX) && (block->lf_flags & F_POSIX)) { - struct proc *wproc, *bproc; + struct proc *wproc; struct uthread *ut; - struct lockf *waitblock; - int i = 0; /* The block is waiting on something */ wproc = block->lf_owner; proc_lock(wproc); + LOCKF_DEBUG(LF_DBG_DEADLOCK, "lock %p owned by pid %d\n", lock, proc_pid(wproc)); TAILQ_FOREACH(ut, &wproc->p_uthlist, uu_list) { /* - * While the thread is asleep (uu_wchan != 0) + * If the thread is asleep (uu_wchan != 0) * in this code (uu_wmesg == lockstr) - * and we have not exceeded the maximum cycle - * depth (i < maxlockdepth), then check for a - * cycle to see if the lock is blocked behind + * check to see if the lock is blocked behind * someone blocked behind us. */ - while (((waitblock = (struct lockf *)ut->uu_wchan) != NULL) && - ut->uu_wmesg == lockstr && - (i++ < maxlockdepth)) { - waitblock = (struct lockf *)ut->uu_wchan; + if ((ut->uu_wchan != NULL) && (ut->uu_wmesg == lockstr)) { + struct lockf *waitblock = (struct lockf *)ut->uu_wchan; + LOCKF_DEBUG(LF_DBG_DEADLOCK, "lock %p which is also blocked on lock %p vnode %p\n", lock, waitblock, waitblock->lf_vnode); + + vnode_t othervp = NULL; + if (waitblock->lf_vnode != vp) { + /* + * This thread in wproc is waiting for a lock + * on a different vnode; grab the lock on it + * that protects lf_next while we examine it. + */ + othervp = waitblock->lf_vnode; + if (!lck_mtx_try_lock(&othervp->v_lock)) { + /* + * avoid kernel deadlock: drop all + * locks, pause for a bit to let the + * other thread do what it needs to do, + * then (because we drop and retake + * v_lock) retry the scan. + */ + proc_unlock(wproc); + static struct timespec ts = { + .tv_sec = 0, + .tv_nsec = 10 * NSEC_PER_MSEC, + }; + (void) msleep(lock, &vp->v_lock, priority, lockstr, &ts); + LOCKF_DEBUG(LF_DBG_DEADLOCK, "lock %p contention for vp %p => restart\n", lock, othervp); + goto scan; + } + } + /* * Get the lock blocking the lock * which would block us, and make - * certain it hasn't come unblocked + * certain it hasn't become unblocked * (been granted, e.g. between the time * we called lf_getblock, and the time * we successfully acquired the @@ -578,8 +606,13 @@ lf_setlock(struct lockf *lock, struct timespec *timeout) */ waitblock = waitblock->lf_next; if (waitblock == NULL) { - break; + if (othervp) { + lck_mtx_unlock(&othervp->v_lock); + } + LOCKF_DEBUG(LF_DBG_DEADLOCK, "lock %p with no lf_next\n", lock); + continue; } + LOCKF_DEBUG(LF_DBG_DEADLOCK, "lock %p which is also blocked on lock %p vnode %p\n", lock, waitblock, waitblock->lf_vnode); /* * Make sure it's an advisory range @@ -588,7 +621,10 @@ lf_setlock(struct lockf *lock, struct timespec *timeout) * fault. */ if ((waitblock->lf_flags & F_POSIX) == 0) { - break; + if (othervp) { + lck_mtx_unlock(&othervp->v_lock); + } + continue; } /* @@ -597,13 +633,21 @@ lf_setlock(struct lockf *lock, struct timespec *timeout) * getting the requested lock, then we * would deadlock, so error out. */ - bproc = waitblock->lf_owner; - if (bproc == lock->lf_owner) { + struct proc *bproc = waitblock->lf_owner; + const boolean_t deadlocked = bproc == lock->lf_owner; + + if (othervp) { + lck_mtx_unlock(&othervp->v_lock); + } + LOCKF_DEBUG(LF_DBG_DEADLOCK, "lock %p owned by pid %d\n", lock, proc_pid(bproc)); + if (deadlocked) { + LOCKF_DEBUG(LF_DBG_DEADLOCK, "lock %p which is me, so EDEADLK\n", lock); proc_unlock(wproc); FREE(lock, M_LOCKF); return EDEADLK; } } + LOCKF_DEBUG(LF_DBG_DEADLOCK, "lock %p bottom of thread loop\n", lock); } proc_unlock(wproc); } @@ -658,7 +702,7 @@ lf_setlock(struct lockf *lock, struct timespec *timeout) #endif /* IMPORTANCE_INHERITANCE */ #ifdef LOCKF_DEBUGGING - if (lockf_debug & LF_DBG_LOCKOP) { + if (LOCKF_DEBUGP(LF_DBG_LOCKOP)) { lf_print("lf_setlock: blocking on", block); lf_printlist("lf_setlock(block)", block); } @@ -853,7 +897,7 @@ lf_setlock(struct lockf *lock, struct timespec *timeout) /* Coalesce adjacent locks with identical attributes */ lf_coalesce_adjacent(lock); #ifdef LOCKF_DEBUGGING - if (lockf_debug & LF_DBG_LOCKOP) { + if (LOCKF_DEBUGP(LF_DBG_LOCKOP)) { lf_print("lf_setlock: got the lock", lock); lf_printlist("lf_setlock(out)", lock); } @@ -893,7 +937,7 @@ lf_clearlock(struct lockf *unlock) if (unlock->lf_type != F_UNLCK) { panic("lf_clearlock: bad type"); } - if (lockf_debug & LF_DBG_LOCKOP) { + if (LOCKF_DEBUGP(LF_DBG_LOCKOP)) { lf_print("lf_clearlock", unlock); } #endif /* LOCKF_DEBUGGING */ @@ -952,7 +996,7 @@ lf_clearlock(struct lockf *unlock) break; } #ifdef LOCKF_DEBUGGING - if (lockf_debug & LF_DBG_LOCKOP) { + if (LOCKF_DEBUGP(LF_DBG_LOCKOP)) { lf_printlist("lf_clearlock", unlock); } #endif /* LOCKF_DEBUGGING */ @@ -988,7 +1032,7 @@ lf_getlock(struct lockf *lock, struct flock *fl, pid_t matchpid) struct lockf *block; #ifdef LOCKF_DEBUGGING - if (lockf_debug & LF_DBG_LOCKOP) { + if (LOCKF_DEBUGP(LF_DBG_LOCKOP)) { lf_print("lf_getlock", lock); } #endif /* LOCKF_DEBUGGING */ @@ -1121,7 +1165,7 @@ lf_findoverlap(struct lockf *lf, struct lockf *lock, int type, return 0; } #ifdef LOCKF_DEBUGGING - if (lockf_debug & LF_DBG_LIST) { + if (LOCKF_DEBUGP(LF_DBG_LIST)) { lf_print("lf_findoverlap: looking for overlap in", lock); } #endif /* LOCKF_DEBUGGING */ @@ -1153,7 +1197,7 @@ lf_findoverlap(struct lockf *lf, struct lockf *lock, int type, } #ifdef LOCKF_DEBUGGING - if (lockf_debug & LF_DBG_LIST) { + if (LOCKF_DEBUGP(LF_DBG_LIST)) { lf_print("\tchecking", lf); } #endif /* LOCKF_DEBUGGING */ @@ -1238,7 +1282,7 @@ lf_split(struct lockf *lock1, struct lockf *lock2) struct lockf *splitlock; #ifdef LOCKF_DEBUGGING - if (lockf_debug & LF_DBG_LIST) { + if (LOCKF_DEBUGP(LF_DBG_LIST)) { lf_print("lf_split", lock1); lf_print("splitting from", lock2); } @@ -1314,7 +1358,7 @@ lf_wakelock(struct lockf *listhead, boolean_t force_all) wakelock->lf_next = NOLOCKF; #ifdef LOCKF_DEBUGGING - if (lockf_debug & LF_DBG_LOCKOP) { + if (LOCKF_DEBUGP(LF_DBG_LOCKOP)) { lf_print("lf_wakelock: awakening", wakelock); } #endif /* LOCKF_DEBUGGING */ diff --git a/bsd/kern/kern_memorystatus.c b/bsd/kern/kern_memorystatus.c index 5c3410624..15512dd41 100644 --- a/bsd/kern/kern_memorystatus.c +++ b/bsd/kern/kern_memorystatus.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2006-2018 Apple Inc. All rights reserved. + * Copyright (c) 2006-2019 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -40,6 +40,7 @@ #include +#include #include #include #include @@ -62,16 +63,17 @@ #include #include #include +#include +#include +#include #if CONFIG_FREEZE #include #endif /* CONFIG_FREEZE */ #include - -#include -#include -#include +#include +#include /* For logging clarity */ static const char *memorystatus_kill_cause_name[] = { @@ -100,6 +102,8 @@ memorystatus_priority_band_name(int32_t priority) return "AUDIO_AND_ACCESSORY"; case JETSAM_PRIORITY_CONDUCTOR: return "CONDUCTOR"; + case JETSAM_PRIORITY_DRIVER_APPLE: + return "DRIVER_APPLE"; case JETSAM_PRIORITY_HOME: return "HOME"; case JETSAM_PRIORITY_EXECUTIVE: @@ -149,18 +153,6 @@ extern void get_zone_map_size(uint64_t *current_size, uint64_t *capacity); */ extern void get_largest_zone_info(char *zone_name, size_t zone_name_len, uint64_t *zone_size); -/* These are very verbose printfs(), enable with - * MEMORYSTATUS_DEBUG_LOG - */ -#if MEMORYSTATUS_DEBUG_LOG -#define MEMORYSTATUS_DEBUG(cond, format, ...) \ -do { \ - if (cond) { printf(format, ##__VA_ARGS__); } \ -} while(0) -#else -#define MEMORYSTATUS_DEBUG(cond, format, ...) -#endif - /* * Active / Inactive limit support * proc list must be locked @@ -221,120 +213,147 @@ MACRO_END unsigned long delta_percentage = 5; unsigned long critical_threshold_percentage = 5; +// On embedded devices with more than 3GB of memory we lower the critical percentage. +uint64_t config_jetsam_large_memory_cutoff = 3UL * (1UL << 30); +unsigned long critical_threshold_percentage_larger_devices = 4; +unsigned long delta_percentage_larger_devices = 4; unsigned long idle_offset_percentage = 5; unsigned long pressure_threshold_percentage = 15; -unsigned long freeze_threshold_percentage = 50; unsigned long policy_more_free_offset_percentage = 5; - -/* General memorystatus stuff */ - -struct klist memorystatus_klist; -static lck_mtx_t memorystatus_klist_mutex; - -static void memorystatus_klist_lock(void); -static void memorystatus_klist_unlock(void); - -static uint64_t memorystatus_sysprocs_idle_delay_time = 0; -static uint64_t memorystatus_apps_idle_delay_time = 0; +unsigned long sysproc_aging_aggr_threshold_percentage = 7; /* - * Memorystatus kevents + * default jetsam snapshot support */ +memorystatus_jetsam_snapshot_t *memorystatus_jetsam_snapshot; +memorystatus_jetsam_snapshot_t *memorystatus_jetsam_snapshot_copy; +unsigned int memorystatus_jetsam_snapshot_count = 0; +unsigned int memorystatus_jetsam_snapshot_copy_count = 0; +unsigned int memorystatus_jetsam_snapshot_max = 0; +unsigned int memorystatus_jetsam_snapshot_size = 0; +uint64_t memorystatus_jetsam_snapshot_last_timestamp = 0; +uint64_t memorystatus_jetsam_snapshot_timeout = 0; -static int filt_memorystatusattach(struct knote *kn, struct kevent_internal_s *kev); -static void filt_memorystatusdetach(struct knote *kn); -static int filt_memorystatus(struct knote *kn, long hint); -static int filt_memorystatustouch(struct knote *kn, struct kevent_internal_s *kev); -static int filt_memorystatusprocess(struct knote *kn, struct filt_process_s *data, struct kevent_internal_s *kev); - -SECURITY_READ_ONLY_EARLY(struct filterops) memorystatus_filtops = { - .f_attach = filt_memorystatusattach, - .f_detach = filt_memorystatusdetach, - .f_event = filt_memorystatus, - .f_touch = filt_memorystatustouch, - .f_process = filt_memorystatusprocess, -}; +/* General memorystatus stuff */ -enum { - kMemorystatusNoPressure = 0x1, - kMemorystatusPressure = 0x2, - kMemorystatusLowSwap = 0x4, - kMemorystatusProcLimitWarn = 0x8, - kMemorystatusProcLimitCritical = 0x10 -}; +uint64_t memorystatus_sysprocs_idle_delay_time = 0; +uint64_t memorystatus_apps_idle_delay_time = 0; + +static lck_grp_attr_t *memorystatus_jetsam_fg_band_lock_grp_attr; +static lck_grp_t *memorystatus_jetsam_fg_band_lock_grp; +lck_mtx_t memorystatus_jetsam_fg_band_lock; /* Idle guard handling */ static int32_t memorystatus_scheduled_idle_demotions_sysprocs = 0; static int32_t memorystatus_scheduled_idle_demotions_apps = 0; -static thread_call_t memorystatus_idle_demotion_call; - static void memorystatus_perform_idle_demotion(__unused void *spare1, __unused void *spare2); static void memorystatus_schedule_idle_demotion_locked(proc_t p, boolean_t set_state); -static void memorystatus_invalidate_idle_demotion_locked(proc_t p, boolean_t clean_state); static void memorystatus_reschedule_idle_demotion_locked(void); - -static void memorystatus_update_priority_locked(proc_t p, int priority, boolean_t head_insert, boolean_t skip_demotion_check); - int memorystatus_update_priority_for_appnap(proc_t p, boolean_t is_appnap); - vm_pressure_level_t convert_internal_pressure_level_to_dispatch_level(vm_pressure_level_t); - boolean_t is_knote_registered_modify_task_pressure_bits(struct knote*, int, task_t, vm_pressure_level_t, vm_pressure_level_t); void memorystatus_klist_reset_all_for_level(vm_pressure_level_t pressure_level_to_clear); void memorystatus_send_low_swap_note(void); +int memorystatus_get_proccnt_upto_priority(int32_t max_bucket_index); +boolean_t memorystatus_kill_elevated_process(uint32_t cause, os_reason_t jetsam_reason, unsigned int band, int aggr_count, + uint32_t *errors, uint64_t *memory_reclaimed); +uint64_t memorystatus_available_memory_internal(proc_t p); unsigned int memorystatus_level = 0; - static int memorystatus_list_count = 0; - - -#define MEMSTAT_BUCKET_COUNT (JETSAM_PRIORITY_MAX + 1) - -typedef struct memstat_bucket { - TAILQ_HEAD(, proc) list; - int count; -} memstat_bucket_t; - memstat_bucket_t memstat_bucket[MEMSTAT_BUCKET_COUNT]; - -int memorystatus_get_proccnt_upto_priority(int32_t max_bucket_index); - +static thread_call_t memorystatus_idle_demotion_call; uint64_t memstat_idle_demotion_deadline = 0; - int system_procs_aging_band = JETSAM_PRIORITY_AGING_BAND1; int applications_aging_band = JETSAM_PRIORITY_IDLE; #define isProcessInAgingBands(p) ((isSysProc(p) && system_procs_aging_band && (p->p_memstat_effectivepriority == system_procs_aging_band)) || (isApp(p) && applications_aging_band && (p->p_memstat_effectivepriority == applications_aging_band))) -/* - * Checking the p_memstat_state almost always requires the proc_list_lock - * because the jetsam thread could be on the other core changing the state. - * - * App -- almost always managed by a system process. Always have dirty tracking OFF. Can include extensions too. - * System Processes -- not managed by anybody. Always have dirty tracking ON. Can include extensions (here) too. - */ -#define isApp(p) ((p->p_memstat_state & P_MEMSTAT_MANAGED) || ! (p->p_memstat_dirty & P_DIRTY_TRACK)) -#define isSysProc(p) ( ! (p->p_memstat_state & P_MEMSTAT_MANAGED) || (p->p_memstat_dirty & P_DIRTY_TRACK)) - #define kJetsamAgingPolicyNone (0) #define kJetsamAgingPolicyLegacy (1) #define kJetsamAgingPolicySysProcsReclaimedFirst (2) #define kJetsamAgingPolicyAppsReclaimedFirst (3) #define kJetsamAgingPolicyMax kJetsamAgingPolicyAppsReclaimedFirst -unsigned int jetsam_aging_policy = kJetsamAgingPolicyLegacy; +unsigned int jetsam_aging_policy = kJetsamAgingPolicySysProcsReclaimedFirst; extern int corpse_for_fatal_memkill; -extern unsigned long total_corpses_count(void) __attribute__((pure)); -extern void task_purge_all_corpses(void); extern uint64_t vm_purgeable_purge_task_owned(task_t task); boolean_t memorystatus_allowed_vm_map_fork(task_t); #if DEVELOPMENT || DEBUG void memorystatus_abort_vm_map_fork(task_t); #endif +/* + * Idle delay timeout factors for daemons based on relaunch behavior. Only used in + * kJetsamAgingPolicySysProcsReclaimedFirst aging policy. + */ +#define kJetsamSysProcsIdleDelayTimeLowRatio (5) +#define kJetsamSysProcsIdleDelayTimeMedRatio (2) +#define kJetsamSysProcsIdleDelayTimeHighRatio (1) +static_assert(kJetsamSysProcsIdleDelayTimeLowRatio <= DEFERRED_IDLE_EXIT_TIME_SECS, "sysproc idle delay time for low relaunch daemons would be 0"); + +/* + * For the kJetsamAgingPolicySysProcsReclaimedFirst aging policy, treat apps as well + * behaved daemons for aging purposes. + */ +#define kJetsamAppsIdleDelayTimeRatio (kJetsamSysProcsIdleDelayTimeLowRatio) + +static uint64_t +memorystatus_sysprocs_idle_time(proc_t p) +{ + /* + * The kJetsamAgingPolicySysProcsReclaimedFirst aging policy uses the relaunch behavior to + * determine the exact idle deferred time provided to the daemons. For all other aging + * policies, simply return the default aging idle time. + */ + if (jetsam_aging_policy != kJetsamAgingPolicySysProcsReclaimedFirst) { + return memorystatus_sysprocs_idle_delay_time; + } + + uint64_t idle_delay_time = 0; + /* + * For system processes, base the idle delay time on the + * jetsam relaunch behavior specified by launchd. The idea + * is to provide extra protection to the daemons which would + * relaunch immediately after jetsam. + */ + switch (p->p_memstat_relaunch_flags) { + case P_MEMSTAT_RELAUNCH_UNKNOWN: + case P_MEMSTAT_RELAUNCH_LOW: + idle_delay_time = memorystatus_sysprocs_idle_delay_time / kJetsamSysProcsIdleDelayTimeLowRatio; + break; + case P_MEMSTAT_RELAUNCH_MED: + idle_delay_time = memorystatus_sysprocs_idle_delay_time / kJetsamSysProcsIdleDelayTimeMedRatio; + break; + case P_MEMSTAT_RELAUNCH_HIGH: + idle_delay_time = memorystatus_sysprocs_idle_delay_time / kJetsamSysProcsIdleDelayTimeHighRatio; + break; + default: + panic("Unknown relaunch flags on process!"); + break; + } + return idle_delay_time; +} + +static uint64_t +memorystatus_apps_idle_time(__unused proc_t p) +{ + /* + * For kJetsamAgingPolicySysProcsReclaimedFirst, the Apps are considered as low + * relaunch candidates. So only provide limited protection to them. In the other + * aging policies, return the default aging idle time. + */ + if (jetsam_aging_policy != kJetsamAgingPolicySysProcsReclaimedFirst) { + return memorystatus_apps_idle_delay_time; + } + + return memorystatus_apps_idle_delay_time / kJetsamAppsIdleDelayTimeRatio; +} + + #if 0 /* Keeping around for future use if we need a utility that can do this OR an app that needs a dynamic adjustment. */ @@ -518,6 +537,103 @@ static unsigned int memorystatus_dirty_count = 0; SYSCTL_INT(_kern, OID_AUTO, max_task_pmem, CTLFLAG_RD | CTLFLAG_LOCKED | CTLFLAG_MASKED, &max_task_footprint_mb, 0, ""); +static int memorystatus_highwater_enabled = 1; /* Update the cached memlimit data. */ +static boolean_t proc_jetsam_state_is_active_locked(proc_t); + +#if __arm64__ +#if CONFIG_MEMORYSTATUS +int legacy_footprint_bonus_mb = 50; /* This value was chosen after looking at the top 30 apps + * that needed the additional room in their footprint when + * the 'correct' accounting methods were applied to them. + */ + +#if DEVELOPMENT || DEBUG +SYSCTL_INT(_kern, OID_AUTO, legacy_footprint_bonus_mb, CTLFLAG_RW | CTLFLAG_LOCKED, &legacy_footprint_bonus_mb, 0, ""); +#endif /* DEVELOPMENT || DEBUG */ + +void +memorystatus_act_on_legacy_footprint_entitlement(proc_t p, boolean_t footprint_increase) +{ + int memlimit_mb_active = 0, memlimit_mb_inactive = 0; + boolean_t memlimit_active_is_fatal = FALSE, memlimit_inactive_is_fatal = 0, use_active_limit = FALSE; + + if (p == NULL) { + return; + } + + proc_list_lock(); + + if (p->p_memstat_memlimit_active > 0) { + memlimit_mb_active = p->p_memstat_memlimit_active; + } else if (p->p_memstat_memlimit_active == -1) { + memlimit_mb_active = max_task_footprint_mb; + } else { + /* + * Nothing to do for '0' which is + * a special value only used internally + * to test 'no limits'. + */ + proc_list_unlock(); + return; + } + + if (p->p_memstat_memlimit_inactive > 0) { + memlimit_mb_inactive = p->p_memstat_memlimit_inactive; + } else if (p->p_memstat_memlimit_inactive == -1) { + memlimit_mb_inactive = max_task_footprint_mb; + } else { + /* + * Nothing to do for '0' which is + * a special value only used internally + * to test 'no limits'. + */ + proc_list_unlock(); + return; + } + + if (footprint_increase) { + memlimit_mb_active += legacy_footprint_bonus_mb; + memlimit_mb_inactive += legacy_footprint_bonus_mb; + } else { + memlimit_mb_active -= legacy_footprint_bonus_mb; + if (memlimit_mb_active == max_task_footprint_mb) { + memlimit_mb_active = -1; /* reverting back to default system limit */ + } + + memlimit_mb_inactive -= legacy_footprint_bonus_mb; + if (memlimit_mb_inactive == max_task_footprint_mb) { + memlimit_mb_inactive = -1; /* reverting back to default system limit */ + } + } + + memlimit_active_is_fatal = (p->p_memstat_state & P_MEMSTAT_MEMLIMIT_ACTIVE_FATAL); + memlimit_inactive_is_fatal = (p->p_memstat_state & P_MEMSTAT_MEMLIMIT_INACTIVE_FATAL); + + SET_ACTIVE_LIMITS_LOCKED(p, memlimit_mb_active, memlimit_active_is_fatal); + SET_INACTIVE_LIMITS_LOCKED(p, memlimit_mb_inactive, memlimit_inactive_is_fatal); + + if (proc_jetsam_state_is_active_locked(p) == TRUE) { + use_active_limit = TRUE; + CACHE_ACTIVE_LIMITS_LOCKED(p, memlimit_active_is_fatal); + } else { + CACHE_INACTIVE_LIMITS_LOCKED(p, memlimit_inactive_is_fatal); + } + + + if (memorystatus_highwater_enabled) { + task_set_phys_footprint_limit_internal(p->task, + (p->p_memstat_memlimit > 0) ? p->p_memstat_memlimit : -1, + NULL, /*return old value */ + use_active_limit, /*active limit?*/ + (use_active_limit ? memlimit_active_is_fatal : memlimit_inactive_is_fatal)); + } + + proc_list_unlock(); +} + +#endif /* CONFIG_MEMORYSTATUS */ +#endif /* __arm64__ */ + #if CONFIG_EMBEDDED SYSCTL_INT(_kern, OID_AUTO, memorystatus_level, CTLFLAG_RD | CTLFLAG_LOCKED, &memorystatus_level, 0, ""); @@ -538,16 +654,10 @@ memorystatus_get_level(__unused struct proc *p, struct memorystatus_get_level_ar return 0; } -static proc_t memorystatus_get_first_proc_locked(unsigned int *bucket_index, boolean_t search); -static proc_t memorystatus_get_next_proc_locked(unsigned int *bucket_index, proc_t p, boolean_t search); - static void memorystatus_thread(void *param __unused, wait_result_t wr __unused); /* Memory Limits */ -static int memorystatus_highwater_enabled = 1; /* Update the cached memlimit data. */ - -static boolean_t proc_jetsam_state_is_active_locked(proc_t); static boolean_t memorystatus_kill_specific_process(pid_t victim_pid, uint32_t cause, os_reason_t jetsam_reason); static boolean_t memorystatus_kill_process_sync(pid_t victim_pid, uint32_t cause, os_reason_t jetsam_reason); @@ -560,6 +670,9 @@ static int memorystatus_cmd_get_memlimit_properties(pid_t pid, user_addr_t buffe static int memorystatus_cmd_get_memlimit_excess_np(pid_t pid, uint32_t flags, user_addr_t buffer, size_t buffer_size, __unused int32_t *retval); +static void memorystatus_get_memlimit_properties_internal(proc_t p, memorystatus_memlimit_properties_t *p_entry); +static int memorystatus_set_memlimit_properties_internal(proc_t p, memorystatus_memlimit_properties_t *p_entry); + int proc_get_memstat_priority(proc_t, boolean_t); static boolean_t memorystatus_idle_snapshot = 0; @@ -601,20 +714,6 @@ SYSCTL_UINT(_kern, OID_AUTO, memorystatus_jld_eval_aggressive_priority_band_max, static uint32_t kill_under_pressure_cause = 0; -/* - * default jetsam snapshot support - */ -static memorystatus_jetsam_snapshot_t *memorystatus_jetsam_snapshot; -static memorystatus_jetsam_snapshot_t *memorystatus_jetsam_snapshot_copy; -#define memorystatus_jetsam_snapshot_list memorystatus_jetsam_snapshot->entries -static unsigned int memorystatus_jetsam_snapshot_count = 0; -static unsigned int memorystatus_jetsam_snapshot_copy_count = 0; -static unsigned int memorystatus_jetsam_snapshot_max = 0; -static unsigned int memorystatus_jetsam_snapshot_size = 0; -static uint64_t memorystatus_jetsam_snapshot_last_timestamp = 0; -static uint64_t memorystatus_jetsam_snapshot_timeout = 0; -#define JETSAM_SNAPSHOT_TIMEOUT_SECS 30 - /* * snapshot support for memstats collected at boot. */ @@ -625,7 +724,6 @@ static boolean_t memorystatus_init_jetsam_snapshot_entry_locked(proc_t p, memory static void memorystatus_update_jetsam_snapshot_entry_locked(proc_t p, uint32_t kill_cause, uint64_t killtime); static void memorystatus_clear_errors(void); -static void memorystatus_get_task_page_counts(task_t task, uint32_t *footprint, uint32_t *max_footprint_lifetime, uint32_t *purgeable_pages); static void memorystatus_get_task_phys_footprint_page_counts(task_t task, uint64_t *internal_pages, uint64_t *internal_compressed_pages, uint64_t *purgeable_nonvolatile_pages, uint64_t *purgeable_nonvolatile_compressed_pages, @@ -637,10 +735,10 @@ static void memorystatus_get_task_memory_region_count(task_t task, uint64_t *cou static uint32_t memorystatus_build_state(proc_t p); //static boolean_t memorystatus_issue_pressure_kevent(boolean_t pressured); -static boolean_t memorystatus_kill_top_process(boolean_t any, boolean_t sort_flag, uint32_t cause, os_reason_t jetsam_reason, int32_t *priority, uint32_t *errors); -static boolean_t memorystatus_kill_top_process_aggressive(uint32_t cause, int aggr_count, int32_t priority_max, uint32_t *errors); -static boolean_t memorystatus_kill_elevated_process(uint32_t cause, os_reason_t jetsam_reason, unsigned int band, int aggr_count, uint32_t *errors); -static boolean_t memorystatus_kill_hiwat_proc(uint32_t *errors, boolean_t *purged); +static boolean_t memorystatus_kill_top_process(boolean_t any, boolean_t sort_flag, uint32_t cause, os_reason_t jetsam_reason, int32_t *priority, + uint32_t *errors, uint64_t *memory_reclaimed); +static boolean_t memorystatus_kill_processes_aggressive(uint32_t cause, int aggr_count, int32_t priority_max, uint32_t *errors, uint64_t *memory_reclaimed); +static boolean_t memorystatus_kill_hiwat_proc(uint32_t *errors, boolean_t *purged, uint64_t *memory_reclaimed); static boolean_t memorystatus_kill_process_async(pid_t victim_pid, uint32_t cause); @@ -665,14 +763,18 @@ extern unsigned int vm_page_purgeable_count; extern unsigned int vm_page_wire_count; #if CONFIG_SECLUDED_MEMORY extern unsigned int vm_page_secluded_count; +extern unsigned int vm_page_secluded_count_over_target; #endif /* CONFIG_SECLUDED_MEMORY */ +/* Aggressive jetsam pages threshold for sysproc aging policy */ +unsigned int memorystatus_sysproc_aging_aggr_pages = 0; + #if CONFIG_JETSAM unsigned int memorystatus_available_pages = (unsigned int)-1; unsigned int memorystatus_available_pages_pressure = 0; unsigned int memorystatus_available_pages_critical = 0; -static unsigned int memorystatus_available_pages_critical_base = 0; -static unsigned int memorystatus_available_pages_critical_idle_offset = 0; +unsigned int memorystatus_available_pages_critical_base = 0; +unsigned int memorystatus_available_pages_critical_idle_offset = 0; #if DEVELOPMENT || DEBUG SYSCTL_UINT(_kern, OID_AUTO, memorystatus_available_pages, CTLFLAG_RD | CTLFLAG_LOCKED, &memorystatus_available_pages, 0, ""); @@ -688,6 +790,15 @@ static unsigned int memorystatus_thread_wasted_wakeup = 0; /* Callback into vm_compressor.c to signal that thrashing has been mitigated. */ extern void vm_thrashing_jetsam_done(void); static int memorystatus_cmd_set_jetsam_memory_limit(pid_t pid, int32_t high_water_mark, __unused int32_t *retval, boolean_t is_fatal_limit); +#if DEVELOPMENT || DEBUG +static inline uint32_t +roundToNearestMB(uint32_t in) +{ + return (in + ((1 << 20) - 1)) >> 20; +} + +static int memorystatus_cmd_increase_jetsam_task_limit(pid_t pid, uint32_t byte_increase); +#endif int32_t max_kill_priority = JETSAM_PRIORITY_MAX; @@ -700,56 +811,6 @@ uint64_t memorystatus_available_pages_critical = (uint64_t)-1; int32_t max_kill_priority = JETSAM_PRIORITY_IDLE; #endif /* CONFIG_JETSAM */ -unsigned int memorystatus_frozen_count = 0; -unsigned int memorystatus_frozen_processes_max = 0; -unsigned int memorystatus_frozen_shared_mb = 0; -unsigned int memorystatus_frozen_shared_mb_max = 0; -unsigned int memorystatus_freeze_shared_mb_per_process_max = 0; /* Max. MB allowed per process to be freezer-eligible. */ -unsigned int memorystatus_freeze_private_shared_pages_ratio = 2; /* Ratio of private:shared pages for a process to be freezer-eligible. */ -unsigned int memorystatus_suspended_count = 0; -unsigned int memorystatus_thaw_count = 0; -unsigned int memorystatus_refreeze_eligible_count = 0; /* # of processes currently thawed i.e. have state on disk & in-memory */ - -#if VM_PRESSURE_EVENTS - -boolean_t memorystatus_warn_process(pid_t pid, __unused boolean_t is_active, __unused boolean_t is_fatal, boolean_t exceeded); - -vm_pressure_level_t memorystatus_vm_pressure_level = kVMPressureNormal; - -/* - * We use this flag to signal if we have any HWM offenders - * on the system. This way we can reduce the number of wakeups - * of the memorystatus_thread when the system is between the - * "pressure" and "critical" threshold. - * - * The (re-)setting of this variable is done without any locks - * or synchronization simply because it is not possible (currently) - * to keep track of HWM offenders that drop down below their memory - * limit and/or exit. So, we choose to burn a couple of wasted wakeups - * by allowing the unguarded modification of this variable. - */ -boolean_t memorystatus_hwm_candidates = 0; - -static int memorystatus_send_note(int event_code, void *data, size_t data_length); - -/* - * This value is the threshold that a process must meet to be considered for scavenging. - */ -#if CONFIG_EMBEDDED -#define VM_PRESSURE_MINIMUM_RSIZE 6 /* MB */ -#else /* CONFIG_EMBEDDED */ -#define VM_PRESSURE_MINIMUM_RSIZE 10 /* MB */ -#endif /* CONFIG_EMBEDDED */ - -uint32_t vm_pressure_task_footprint_min = VM_PRESSURE_MINIMUM_RSIZE; - -#if DEVELOPMENT || DEBUG -SYSCTL_UINT(_kern, OID_AUTO, memorystatus_vm_pressure_task_footprint_min, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_pressure_task_footprint_min, 0, ""); -#endif /* DEVELOPMENT || DEBUG */ - -#endif /* VM_PRESSURE_EVENTS */ - - #if DEVELOPMENT || DEBUG lck_grp_attr_t *disconnect_page_mappings_lck_grp_attr; @@ -760,80 +821,6 @@ extern boolean_t kill_on_no_paging_space; #endif /* DEVELOPMENT || DEBUG */ -/* - * Table that expresses the probability of a process - * being used in the next hour. - */ -typedef struct memorystatus_internal_probabilities { - char proc_name[MAXCOMLEN + 1]; - int use_probability; -} memorystatus_internal_probabilities_t; - -static memorystatus_internal_probabilities_t *memorystatus_global_probabilities_table = NULL; -static size_t memorystatus_global_probabilities_size = 0; - -/* Freeze */ - -#if CONFIG_FREEZE -boolean_t memorystatus_freeze_enabled = FALSE; -int memorystatus_freeze_wakeup = 0; -int memorystatus_freeze_jetsam_band = 0; /* the jetsam band which will contain P_MEMSTAT_FROZEN processes */ - -lck_grp_attr_t *freezer_lck_grp_attr; -lck_grp_t *freezer_lck_grp; -static lck_mtx_t freezer_mutex; - -static inline boolean_t memorystatus_can_freeze_processes(void); -static boolean_t memorystatus_can_freeze(boolean_t *memorystatus_freeze_swap_low); -static boolean_t memorystatus_is_process_eligible_for_freeze(proc_t p); -static void memorystatus_freeze_thread(void *param __unused, wait_result_t wr __unused); -static boolean_t memorystatus_freeze_thread_should_run(void); - -void memorystatus_disable_freeze(void); - -/* Thresholds */ -static unsigned int memorystatus_freeze_threshold = 0; - -static unsigned int memorystatus_freeze_pages_min = 0; -static unsigned int memorystatus_freeze_pages_max = 0; - -static unsigned int memorystatus_freeze_suspended_threshold = FREEZE_SUSPENDED_THRESHOLD_DEFAULT; - -static unsigned int memorystatus_freeze_daily_mb_max = FREEZE_DAILY_MB_MAX_DEFAULT; -static uint64_t memorystatus_freeze_budget_pages_remaining = 0; //remaining # of pages that can be frozen to disk -static boolean_t memorystatus_freeze_degradation = FALSE; //protected by the freezer mutex. Signals we are in a degraded freeze mode. - -static unsigned int memorystatus_max_frozen_demotions_daily = 0; -static unsigned int memorystatus_thaw_count_demotion_threshold = 0; - -/* Stats */ -static uint64_t memorystatus_freeze_pageouts = 0; - -/* Throttling */ -#define DEGRADED_WINDOW_MINS (30) -#define NORMAL_WINDOW_MINS (24 * 60) - -static throttle_interval_t throttle_intervals[] = { - { DEGRADED_WINDOW_MINS, 1, 0, 0, { 0, 0 }}, - { NORMAL_WINDOW_MINS, 1, 0, 0, { 0, 0 }}, -}; -throttle_interval_t *degraded_throttle_window = &throttle_intervals[0]; -throttle_interval_t *normal_throttle_window = &throttle_intervals[1]; - -extern uint64_t vm_swap_get_free_space(void); -extern boolean_t vm_swap_max_budget(uint64_t *); - -static void memorystatus_freeze_update_throttle(uint64_t *budget_pages_allowed); - -static uint64_t memorystatus_freezer_thread_next_run_ts = 0; - -SYSCTL_UINT(_kern, OID_AUTO, memorystatus_freeze_count, CTLFLAG_RD | CTLFLAG_LOCKED, &memorystatus_frozen_count, 0, ""); -SYSCTL_UINT(_kern, OID_AUTO, memorystatus_thaw_count, CTLFLAG_RD | CTLFLAG_LOCKED, &memorystatus_thaw_count, 0, ""); -SYSCTL_QUAD(_kern, OID_AUTO, memorystatus_freeze_pageouts, CTLFLAG_RD | CTLFLAG_LOCKED, &memorystatus_freeze_pageouts, ""); -SYSCTL_QUAD(_kern, OID_AUTO, memorystatus_freeze_budget_pages_remaining, CTLFLAG_RD | CTLFLAG_LOCKED, &memorystatus_freeze_budget_pages_remaining, ""); - -#endif /* CONFIG_FREEZE */ - /* Debug */ extern struct knote *vm_find_knote_from_pid(pid_t, struct klist *); @@ -870,16 +857,17 @@ memorystatus_debug_dump_bucket_locked(unsigned int bucket_index) */ printf("memorystatus_debug_dump ***START*(PAGE_SIZE_64=%llu)**\n", PAGE_SIZE_64); - printf("bucket [pid] [pages / MB] [state] [EP / RP] dirty deadline [L-limit / C-limit / A-limit / IA-limit] name\n"); + printf("bucket [pid] [pages / MB] [state] [EP / RP / AP] dirty deadline [L-limit / C-limit / A-limit / IA-limit] name\n"); p = memorystatus_get_first_proc_locked(&b, traverse_all_buckets); while (p) { bytes = get_task_phys_footprint(p->task); task_get_phys_footprint_limit(p->task, &ledger_limit); - printf("%2d [%5d] [%5lld /%3lldMB] 0x%-8x [%2d / %2d] 0x%-3x %10lld [%3d / %3d%s / %3d%s / %3d%s] %s\n", + printf("%2d [%5d] [%5lld /%3lldMB] 0x%-8x [%2d / %2d / %2d] 0x%-3x %10lld [%3d / %3d%s / %3d%s / %3d%s] %s\n", b, p->p_pid, (bytes / PAGE_SIZE_64), /* task's footprint converted from bytes to pages */ (bytes / (1024ULL * 1024ULL)), /* task's footprint converted from bytes to MB */ - p->p_memstat_state, p->p_memstat_effectivepriority, p->p_memstat_requestedpriority, p->p_memstat_dirty, p->p_memstat_idledeadline, + p->p_memstat_state, p->p_memstat_effectivepriority, p->p_memstat_requestedpriority, p->p_memstat_assertionpriority, + p->p_memstat_dirty, p->p_memstat_idledeadline, ledger_limit, p->p_memstat_memlimit, (p->p_memstat_state & P_MEMSTAT_FATAL_MEMLIMIT ? "F " : "NF"), @@ -999,7525 +987,5078 @@ sysctl_memorystatus_highwater_enable SYSCTL_HANDLER_ARGS SYSCTL_PROC(_kern, OID_AUTO, memorystatus_highwater_enabled, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &memorystatus_highwater_enabled, 0, sysctl_memorystatus_highwater_enable, "I", ""); +SYSCTL_INT(_kern, OID_AUTO, memorystatus_idle_snapshot, CTLFLAG_RW | CTLFLAG_LOCKED, &memorystatus_idle_snapshot, 0, ""); + +#if CONFIG_JETSAM +SYSCTL_UINT(_kern, OID_AUTO, memorystatus_available_pages_critical, CTLFLAG_RD | CTLFLAG_LOCKED, &memorystatus_available_pages_critical, 0, ""); +SYSCTL_UINT(_kern, OID_AUTO, memorystatus_available_pages_critical_base, CTLFLAG_RW | CTLFLAG_LOCKED, &memorystatus_available_pages_critical_base, 0, ""); +SYSCTL_UINT(_kern, OID_AUTO, memorystatus_available_pages_critical_idle_offset, CTLFLAG_RW | CTLFLAG_LOCKED, &memorystatus_available_pages_critical_idle_offset, 0, ""); +SYSCTL_UINT(_kern, OID_AUTO, memorystatus_policy_more_free_offset_pages, CTLFLAG_RW, &memorystatus_policy_more_free_offset_pages, 0, ""); + +static unsigned int memorystatus_jetsam_panic_debug = 0; + #if VM_PRESSURE_EVENTS -/* - * This routine is used for targeted notifications regardless of system memory pressure - * and regardless of whether or not the process has already been notified. - * It bypasses and has no effect on the only-one-notification per soft-limit policy. - * - * "memnote" is the current user. - */ +SYSCTL_UINT(_kern, OID_AUTO, memorystatus_available_pages_pressure, CTLFLAG_RW | CTLFLAG_LOCKED, &memorystatus_available_pages_pressure, 0, ""); -static int -sysctl_memorystatus_vm_pressure_send SYSCTL_HANDLER_ARGS -{ -#pragma unused(arg1, arg2) +#endif /* VM_PRESSURE_EVENTS */ - int error = 0, pid = 0; - struct knote *kn = NULL; - boolean_t found_knote = FALSE; - int fflags = 0; /* filter flags for EVFILT_MEMORYSTATUS */ - uint64_t value = 0; +#endif /* CONFIG_JETSAM */ - error = sysctl_handle_quad(oidp, &value, 0, req); - if (error || !req->newptr) { - return error; - } +#endif /* DEVELOPMENT || DEBUG */ - /* - * Find the pid in the low 32 bits of value passed in. - */ - pid = (int)(value & 0xFFFFFFFF); +extern kern_return_t kernel_thread_start_priority(thread_continue_t continuation, + void *parameter, + integer_t priority, + thread_t *new_thread); - /* - * Find notification in the high 32 bits of the value passed in. - */ - fflags = (int)((value >> 32) & 0xFFFFFFFF); +#if DEVELOPMENT || DEBUG - /* - * For backwards compatibility, when no notification is - * passed in, default to the NOTE_MEMORYSTATUS_PRESSURE_WARN - */ - if (fflags == 0) { - fflags = NOTE_MEMORYSTATUS_PRESSURE_WARN; - // printf("memorystatus_vm_pressure_send: using default notification [0x%x]\n", fflags); - } +static int +sysctl_memorystatus_disconnect_page_mappings SYSCTL_HANDLER_ARGS +{ +#pragma unused(arg1, arg2) + int error = 0, pid = 0; + proc_t p; - /* - * See event.h ... fflags for EVFILT_MEMORYSTATUS - */ - if (!((fflags == NOTE_MEMORYSTATUS_PRESSURE_NORMAL) || - (fflags == NOTE_MEMORYSTATUS_PRESSURE_WARN) || - (fflags == NOTE_MEMORYSTATUS_PRESSURE_CRITICAL) || - (fflags == NOTE_MEMORYSTATUS_LOW_SWAP) || - (fflags == NOTE_MEMORYSTATUS_PROC_LIMIT_WARN) || - (fflags == NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL) || - (((fflags & NOTE_MEMORYSTATUS_MSL_STATUS) != 0 && - ((fflags & ~NOTE_MEMORYSTATUS_MSL_STATUS) == 0))))) { - printf("memorystatus_vm_pressure_send: notification [0x%x] not supported \n", fflags); - error = 1; + error = sysctl_handle_int(oidp, &pid, 0, req); + if (error || !req->newptr) { return error; } - /* - * Forcibly send pid a memorystatus notification. - */ + lck_mtx_lock(&disconnect_page_mappings_mutex); + + if (pid == -1) { + vm_pageout_disconnect_all_pages(); + } else { + p = proc_find(pid); - memorystatus_klist_lock(); + if (p != NULL) { + error = task_disconnect_page_mappings(p->task); - SLIST_FOREACH(kn, &memorystatus_klist, kn_selnext) { - proc_t knote_proc = knote_get_kq(kn)->kq_p; - pid_t knote_pid = knote_proc->p_pid; + proc_rele(p); - if (knote_pid == pid) { - /* - * Forcibly send this pid a memorystatus notification. - */ - kn->kn_fflags = fflags; - found_knote = TRUE; + if (error) { + error = EIO; + } + } else { + error = EINVAL; } } - - if (found_knote) { - KNOTE(&memorystatus_klist, 0); - printf("memorystatus_vm_pressure_send: (value 0x%llx) notification [0x%x] sent to process [%d] \n", value, fflags, pid); - error = 0; - } else { - printf("memorystatus_vm_pressure_send: (value 0x%llx) notification [0x%x] not sent to process [%d] (none registered?)\n", value, fflags, pid); - error = 1; - } - - memorystatus_klist_unlock(); + lck_mtx_unlock(&disconnect_page_mappings_mutex); return error; } -SYSCTL_PROC(_kern, OID_AUTO, memorystatus_vm_pressure_send, CTLTYPE_QUAD | CTLFLAG_WR | CTLFLAG_LOCKED | CTLFLAG_MASKED, - 0, 0, &sysctl_memorystatus_vm_pressure_send, "Q", ""); - -#endif /* VM_PRESSURE_EVENTS */ - -SYSCTL_INT(_kern, OID_AUTO, memorystatus_idle_snapshot, CTLFLAG_RW | CTLFLAG_LOCKED, &memorystatus_idle_snapshot, 0, ""); - -#if CONFIG_JETSAM -SYSCTL_UINT(_kern, OID_AUTO, memorystatus_available_pages_critical, CTLFLAG_RD | CTLFLAG_LOCKED, &memorystatus_available_pages_critical, 0, ""); -SYSCTL_UINT(_kern, OID_AUTO, memorystatus_available_pages_critical_base, CTLFLAG_RW | CTLFLAG_LOCKED, &memorystatus_available_pages_critical_base, 0, ""); -SYSCTL_UINT(_kern, OID_AUTO, memorystatus_available_pages_critical_idle_offset, CTLFLAG_RW | CTLFLAG_LOCKED, &memorystatus_available_pages_critical_idle_offset, 0, ""); -SYSCTL_UINT(_kern, OID_AUTO, memorystatus_policy_more_free_offset_pages, CTLFLAG_RW, &memorystatus_policy_more_free_offset_pages, 0, ""); - -static unsigned int memorystatus_jetsam_panic_debug = 0; -static unsigned int memorystatus_jetsam_policy_offset_pages_diagnostic = 0; - -/* Diagnostic code */ +SYSCTL_PROC(_kern, OID_AUTO, memorystatus_disconnect_page_mappings, CTLTYPE_INT | CTLFLAG_WR | CTLFLAG_LOCKED | CTLFLAG_MASKED, + 0, 0, &sysctl_memorystatus_disconnect_page_mappings, "I", ""); -enum { - kJetsamDiagnosticModeNone = 0, - kJetsamDiagnosticModeAll = 1, - kJetsamDiagnosticModeStopAtFirstActive = 2, - kJetsamDiagnosticModeCount -} jetsam_diagnostic_mode = kJetsamDiagnosticModeNone; +#endif /* DEVELOPMENT || DEBUG */ -static int jetsam_diagnostic_suspended_one_active_proc = 0; +/* + * Picks the sorting routine for a given jetsam priority band. + * + * Input: + * bucket_index - jetsam priority band to be sorted. + * sort_order - JETSAM_SORT_xxx from kern_memorystatus.h + * Currently sort_order is only meaningful when handling + * coalitions. + * + * Return: + * 0 on success + * non-0 on failure + */ static int -sysctl_jetsam_diagnostic_mode SYSCTL_HANDLER_ARGS +memorystatus_sort_bucket(unsigned int bucket_index, int sort_order) { -#pragma unused(arg1, arg2) - - const char *diagnosticStrings[] = { - "jetsam: diagnostic mode: resetting critical level.", - "jetsam: diagnostic mode: will examine all processes", - "jetsam: diagnostic mode: will stop at first active process" - }; + int coal_sort_order; - int error, val = jetsam_diagnostic_mode; - boolean_t changed = FALSE; + /* + * Verify the jetsam priority + */ + if (bucket_index >= MEMSTAT_BUCKET_COUNT) { + return EINVAL; + } - error = sysctl_handle_int(oidp, &val, 0, req); - if (error || !req->newptr) { - return error; +#if DEVELOPMENT || DEBUG + if (sort_order == JETSAM_SORT_DEFAULT) { + coal_sort_order = COALITION_SORT_DEFAULT; + } else { + coal_sort_order = sort_order; /* only used for testing scenarios */ } - if ((val < 0) || (val >= kJetsamDiagnosticModeCount)) { - printf("jetsam: diagnostic mode: invalid value - %d\n", val); +#else + /* Verify default */ + if (sort_order == JETSAM_SORT_DEFAULT) { + coal_sort_order = COALITION_SORT_DEFAULT; + } else { return EINVAL; } +#endif proc_list_lock(); - if ((unsigned int) val != jetsam_diagnostic_mode) { - jetsam_diagnostic_mode = val; - - memorystatus_jetsam_policy &= ~kPolicyDiagnoseActive; + if (memstat_bucket[bucket_index].count == 0) { + proc_list_unlock(); + return 0; + } - switch (jetsam_diagnostic_mode) { - case kJetsamDiagnosticModeNone: - /* Already cleared */ - break; - case kJetsamDiagnosticModeAll: - memorystatus_jetsam_policy |= kPolicyDiagnoseAll; - break; - case kJetsamDiagnosticModeStopAtFirstActive: - memorystatus_jetsam_policy |= kPolicyDiagnoseFirst; - break; - default: - /* Already validated */ - break; + switch (bucket_index) { + case JETSAM_PRIORITY_FOREGROUND: + if (memorystatus_sort_by_largest_coalition_locked(bucket_index, coal_sort_order) == 0) { + /* + * Fall back to per process sorting when zero coalitions are found. + */ + memorystatus_sort_by_largest_process_locked(bucket_index); } - - memorystatus_update_levels_locked(FALSE); - changed = TRUE; + break; + default: + memorystatus_sort_by_largest_process_locked(bucket_index); + break; } - proc_list_unlock(); - if (changed) { - printf("%s\n", diagnosticStrings[val]); - } - return 0; } -SYSCTL_PROC(_debug, OID_AUTO, jetsam_diagnostic_mode, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED | CTLFLAG_ANYBODY, - &jetsam_diagnostic_mode, 0, sysctl_jetsam_diagnostic_mode, "I", "Jetsam Diagnostic Mode"); - -SYSCTL_UINT(_kern, OID_AUTO, memorystatus_jetsam_policy_offset_pages_diagnostic, CTLFLAG_RW | CTLFLAG_LOCKED, &memorystatus_jetsam_policy_offset_pages_diagnostic, 0, ""); - -#if VM_PRESSURE_EVENTS - -SYSCTL_UINT(_kern, OID_AUTO, memorystatus_available_pages_pressure, CTLFLAG_RW | CTLFLAG_LOCKED, &memorystatus_available_pages_pressure, 0, ""); - -#endif /* VM_PRESSURE_EVENTS */ - -#endif /* CONFIG_JETSAM */ - -#if CONFIG_FREEZE - -SYSCTL_UINT(_kern, OID_AUTO, memorystatus_freeze_jetsam_band, CTLFLAG_RW | CTLFLAG_LOCKED, &memorystatus_freeze_jetsam_band, 0, ""); -SYSCTL_UINT(_kern, OID_AUTO, memorystatus_freeze_daily_mb_max, CTLFLAG_RW | CTLFLAG_LOCKED, &memorystatus_freeze_daily_mb_max, 0, ""); -SYSCTL_UINT(_kern, OID_AUTO, memorystatus_freeze_degraded_mode, CTLFLAG_RD | CTLFLAG_LOCKED, &memorystatus_freeze_degradation, 0, ""); - -SYSCTL_UINT(_kern, OID_AUTO, memorystatus_freeze_threshold, CTLFLAG_RW | CTLFLAG_LOCKED, &memorystatus_freeze_threshold, 0, ""); - -SYSCTL_UINT(_kern, OID_AUTO, memorystatus_freeze_pages_min, CTLFLAG_RW | CTLFLAG_LOCKED, &memorystatus_freeze_pages_min, 0, ""); -SYSCTL_UINT(_kern, OID_AUTO, memorystatus_freeze_pages_max, CTLFLAG_RW | CTLFLAG_LOCKED, &memorystatus_freeze_pages_max, 0, ""); - -SYSCTL_UINT(_kern, OID_AUTO, memorystatus_refreeze_eligible_count, CTLFLAG_RD | CTLFLAG_LOCKED, &memorystatus_refreeze_eligible_count, 0, ""); -SYSCTL_UINT(_kern, OID_AUTO, memorystatus_freeze_processes_max, CTLFLAG_RW | CTLFLAG_LOCKED, &memorystatus_frozen_processes_max, 0, ""); - -/* - * Max. shared-anonymous memory in MB that can be held by frozen processes in the high jetsam band. - * "0" means no limit. - * Default is 10% of system-wide task limit. - */ - -SYSCTL_UINT(_kern, OID_AUTO, memorystatus_freeze_shared_mb_max, CTLFLAG_RW | CTLFLAG_LOCKED, &memorystatus_frozen_shared_mb_max, 0, ""); -SYSCTL_UINT(_kern, OID_AUTO, memorystatus_freeze_shared_mb, CTLFLAG_RD | CTLFLAG_LOCKED, &memorystatus_frozen_shared_mb, 0, ""); - -SYSCTL_UINT(_kern, OID_AUTO, memorystatus_freeze_shared_mb_per_process_max, CTLFLAG_RW | CTLFLAG_LOCKED, &memorystatus_freeze_shared_mb_per_process_max, 0, ""); -SYSCTL_UINT(_kern, OID_AUTO, memorystatus_freeze_private_shared_pages_ratio, CTLFLAG_RW | CTLFLAG_LOCKED, &memorystatus_freeze_private_shared_pages_ratio, 0, ""); - -SYSCTL_UINT(_kern, OID_AUTO, memorystatus_freeze_min_processes, CTLFLAG_RW | CTLFLAG_LOCKED, &memorystatus_freeze_suspended_threshold, 0, ""); - -/* - * max. # of frozen process demotions we will allow in our daily cycle. - */ -SYSCTL_UINT(_kern, OID_AUTO, memorystatus_max_freeze_demotions_daily, CTLFLAG_RW | CTLFLAG_LOCKED, &memorystatus_max_frozen_demotions_daily, 0, ""); -/* - * min # of thaws needed by a process to protect it from getting demoted into the IDLE band. - */ -SYSCTL_UINT(_kern, OID_AUTO, memorystatus_thaw_count_demotion_threshold, CTLFLAG_RW | CTLFLAG_LOCKED, &memorystatus_thaw_count_demotion_threshold, 0, ""); - -boolean_t memorystatus_freeze_throttle_enabled = TRUE; -SYSCTL_UINT(_kern, OID_AUTO, memorystatus_freeze_throttle_enabled, CTLFLAG_RW | CTLFLAG_LOCKED, &memorystatus_freeze_throttle_enabled, 0, ""); - /* - * When set to true, this keeps frozen processes in the compressor pool in memory, instead of swapping them out to disk. - * Exposed via the sysctl kern.memorystatus_freeze_to_memory. + * Sort processes by size for a single jetsam bucket. */ -boolean_t memorystatus_freeze_to_memory = FALSE; -SYSCTL_UINT(_kern, OID_AUTO, memorystatus_freeze_to_memory, CTLFLAG_RW | CTLFLAG_LOCKED, &memorystatus_freeze_to_memory, 0, ""); -#define VM_PAGES_FOR_ALL_PROCS (2) -/* - * Manual trigger of freeze and thaw for dev / debug kernels only. - */ -static int -sysctl_memorystatus_freeze SYSCTL_HANDLER_ARGS +static void +memorystatus_sort_by_largest_process_locked(unsigned int bucket_index) { -#pragma unused(arg1, arg2) - int error, pid = 0; - proc_t p; - int freezer_error_code = 0; - - if (memorystatus_freeze_enabled == FALSE) { - printf("sysctl_freeze: Freeze is DISABLED\n"); - return ENOTSUP; - } - - error = sysctl_handle_int(oidp, &pid, 0, req); - if (error || !req->newptr) { - return error; - } - - if (pid == VM_PAGES_FOR_ALL_PROCS) { - vm_pageout_anonymous_pages(); + proc_t p = NULL, insert_after_proc = NULL, max_proc = NULL; + proc_t next_p = NULL, prev_max_proc = NULL; + uint32_t pages = 0, max_pages = 0; + memstat_bucket_t *current_bucket; - return 0; + if (bucket_index >= MEMSTAT_BUCKET_COUNT) { + return; } - lck_mtx_lock(&freezer_mutex); - - p = proc_find(pid); - if (p != NULL) { - uint32_t purgeable, wired, clean, dirty, shared; - uint32_t max_pages = 0, state = 0; - - if (VM_CONFIG_FREEZER_SWAP_IS_ACTIVE) { - /* - * Freezer backed by the compressor and swap file(s) - * will hold compressed data. - * - * Set the sysctl kern.memorystatus_freeze_to_memory to true to keep compressed data from - * being swapped out to disk. Note that this disables freezer swap support globally, - * not just for the process being frozen. - * - * - * We don't care about the global freezer budget or the process's (min/max) budget here. - * The freeze sysctl is meant to force-freeze a process. - * - * We also don't update any global or process stats on this path, so that the jetsam/ freeze - * logic remains unaffected. The tasks we're performing here are: freeze the process, set the - * P_MEMSTAT_FROZEN bit, and elevate the process to a higher band (if the freezer is active). - */ - max_pages = memorystatus_freeze_pages_max; - } else { - /* - * We only have the compressor without any swap. - */ - max_pages = UINT32_MAX - 1; - } - - proc_list_lock(); - state = p->p_memstat_state; - proc_list_unlock(); - - /* - * The jetsam path also verifies that the process is a suspended App. We don't care about that here. - * We simply ensure that jetsam is not already working on the process and that the process has not - * explicitly disabled freezing. - */ - if (state & (P_MEMSTAT_TERMINATED | P_MEMSTAT_LOCKED | P_MEMSTAT_FREEZE_DISABLED)) { - printf("sysctl_freeze: p_memstat_state check failed, process is%s%s%s\n", - (state & P_MEMSTAT_TERMINATED) ? " terminated" : "", - (state & P_MEMSTAT_LOCKED) ? " locked" : "", - (state & P_MEMSTAT_FREEZE_DISABLED) ? " unfreezable" : ""); - - proc_rele(p); - lck_mtx_unlock(&freezer_mutex); - return EPERM; - } - - error = task_freeze(p->task, &purgeable, &wired, &clean, &dirty, max_pages, &shared, &freezer_error_code, FALSE /* eval only */); - - if (error) { - char reason[128]; - if (freezer_error_code == FREEZER_ERROR_EXCESS_SHARED_MEMORY) { - strlcpy(reason, "too much shared memory", 128); - } + current_bucket = &memstat_bucket[bucket_index]; - if (freezer_error_code == FREEZER_ERROR_LOW_PRIVATE_SHARED_RATIO) { - strlcpy(reason, "low private-shared pages ratio", 128); - } + p = TAILQ_FIRST(¤t_bucket->list); - if (freezer_error_code == FREEZER_ERROR_NO_COMPRESSOR_SPACE) { - strlcpy(reason, "no compressor space", 128); - } + while (p) { + memorystatus_get_task_page_counts(p->task, &pages, NULL, NULL); + max_pages = pages; + max_proc = p; + prev_max_proc = p; - if (freezer_error_code == FREEZER_ERROR_NO_SWAP_SPACE) { - strlcpy(reason, "no swap space", 128); + while ((next_p = TAILQ_NEXT(p, p_memstat_list)) != NULL) { + /* traversing list until we find next largest process */ + p = next_p; + memorystatus_get_task_page_counts(p->task, &pages, NULL, NULL); + if (pages > max_pages) { + max_pages = pages; + max_proc = p; } + } - printf("sysctl_freeze: task_freeze failed: %s\n", reason); - - if (error == KERN_NO_SPACE) { - /* Make it easy to distinguish between failures due to low compressor/ swap space and other failures. */ - error = ENOSPC; + if (prev_max_proc != max_proc) { + /* found a larger process, place it in the list */ + TAILQ_REMOVE(¤t_bucket->list, max_proc, p_memstat_list); + if (insert_after_proc == NULL) { + TAILQ_INSERT_HEAD(¤t_bucket->list, max_proc, p_memstat_list); } else { - error = EIO; - } - } else { - proc_list_lock(); - if ((p->p_memstat_state & P_MEMSTAT_FROZEN) == 0) { - p->p_memstat_state |= P_MEMSTAT_FROZEN; - memorystatus_frozen_count++; - } - p->p_memstat_frozen_count++; - - - proc_list_unlock(); - - if (VM_CONFIG_FREEZER_SWAP_IS_ACTIVE) { - /* - * We elevate only if we are going to swap out the data. - */ - error = memorystatus_update_inactive_jetsam_priority_band(pid, MEMORYSTATUS_CMD_ELEVATED_INACTIVEJETSAMPRIORITY_ENABLE, - memorystatus_freeze_jetsam_band, TRUE); - - if (error) { - printf("sysctl_freeze: Elevating frozen process to higher jetsam band failed with %d\n", error); - } + TAILQ_INSERT_AFTER(¤t_bucket->list, insert_after_proc, max_proc, p_memstat_list); } + prev_max_proc = max_proc; } - proc_rele(p); + insert_after_proc = max_proc; - lck_mtx_unlock(&freezer_mutex); - return error; - } else { - printf("sysctl_freeze: Invalid process\n"); + p = TAILQ_NEXT(max_proc, p_memstat_list); } - - - lck_mtx_unlock(&freezer_mutex); - return EINVAL; } -SYSCTL_PROC(_kern, OID_AUTO, memorystatus_freeze, CTLTYPE_INT | CTLFLAG_WR | CTLFLAG_LOCKED | CTLFLAG_MASKED, - 0, 0, &sysctl_memorystatus_freeze, "I", ""); - -static int -sysctl_memorystatus_available_pages_thaw SYSCTL_HANDLER_ARGS +proc_t +memorystatus_get_first_proc_locked(unsigned int *bucket_index, boolean_t search) { -#pragma unused(arg1, arg2) - - int error, pid = 0; - proc_t p; - - if (memorystatus_freeze_enabled == FALSE) { - return ENOTSUP; - } + memstat_bucket_t *current_bucket; + proc_t next_p; - error = sysctl_handle_int(oidp, &pid, 0, req); - if (error || !req->newptr) { - return error; + if ((*bucket_index) >= MEMSTAT_BUCKET_COUNT) { + return NULL; } - if (pid == VM_PAGES_FOR_ALL_PROCS) { - do_fastwake_warmup_all(); - return 0; - } else { - p = proc_find(pid); - if (p != NULL) { - error = task_thaw(p->task); - - if (error) { - error = EIO; - } else { - /* - * task_thaw() succeeded. - * - * We increment memorystatus_frozen_count on the sysctl freeze path. - * And so we need the P_MEMSTAT_FROZEN to decrement the frozen count - * when this process exits. - * - * proc_list_lock(); - * p->p_memstat_state &= ~P_MEMSTAT_FROZEN; - * proc_list_unlock(); - */ - } - proc_rele(p); - return error; + current_bucket = &memstat_bucket[*bucket_index]; + next_p = TAILQ_FIRST(¤t_bucket->list); + if (!next_p && search) { + while (!next_p && (++(*bucket_index) < MEMSTAT_BUCKET_COUNT)) { + current_bucket = &memstat_bucket[*bucket_index]; + next_p = TAILQ_FIRST(¤t_bucket->list); } } - return EINVAL; + return next_p; } -SYSCTL_PROC(_kern, OID_AUTO, memorystatus_thaw, CTLTYPE_INT | CTLFLAG_WR | CTLFLAG_LOCKED | CTLFLAG_MASKED, - 0, 0, &sysctl_memorystatus_available_pages_thaw, "I", ""); - -typedef struct _global_freezable_status { - boolean_t freeze_pages_threshold_crossed; - boolean_t freeze_eligible_procs_available; - boolean_t freeze_scheduled_in_future; -}global_freezable_status_t; - -typedef struct _proc_freezable_status { - boolean_t freeze_has_memstat_state; - boolean_t freeze_has_pages_min; - int freeze_has_probability; - boolean_t freeze_attempted; - uint32_t p_memstat_state; - uint32_t p_pages; - int p_freeze_error_code; - int p_pid; - char p_name[MAXCOMLEN + 1]; -}proc_freezable_status_t; - -#define MAX_FREEZABLE_PROCESSES 100 - -static int -memorystatus_freezer_get_status(user_addr_t buffer, size_t buffer_size, int32_t *retval) +proc_t +memorystatus_get_next_proc_locked(unsigned int *bucket_index, proc_t p, boolean_t search) { - uint32_t proc_count = 0, i = 0; - global_freezable_status_t *list_head; - proc_freezable_status_t *list_entry; - size_t list_size = 0; - proc_t p; - memstat_bucket_t *bucket; - uint32_t state = 0, pages = 0, entry_count = 0; - boolean_t try_freeze = TRUE; - int error = 0, probability_of_use = 0; - - - if (VM_CONFIG_FREEZER_SWAP_IS_ACTIVE == FALSE) { - return ENOTSUP; - } - - list_size = sizeof(global_freezable_status_t) + (sizeof(proc_freezable_status_t) * MAX_FREEZABLE_PROCESSES); + memstat_bucket_t *current_bucket; + proc_t next_p; - if (buffer_size < list_size) { - return EINVAL; + if (!p || ((*bucket_index) >= MEMSTAT_BUCKET_COUNT)) { + return NULL; } - list_head = (global_freezable_status_t*)kalloc(list_size); - if (list_head == NULL) { - return ENOMEM; + next_p = TAILQ_NEXT(p, p_memstat_list); + while (!next_p && search && (++(*bucket_index) < MEMSTAT_BUCKET_COUNT)) { + current_bucket = &memstat_bucket[*bucket_index]; + next_p = TAILQ_FIRST(¤t_bucket->list); } - memset(list_head, 0, list_size); + return next_p; +} - list_size = sizeof(global_freezable_status_t); +/* + * Structure to hold state for a jetsam thread. + * Typically there should be a single jetsam thread + * unless parallel jetsam is enabled. + */ +struct jetsam_thread_state { + uint8_t inited; /* boolean - if the thread is initialized */ + uint8_t limit_to_low_bands; /* boolean */ + int memorystatus_wakeup; /* wake channel */ + int index; /* jetsam thread index */ + thread_t thread; /* jetsam thread pointer */ +} *jetsam_threads; - proc_list_lock(); +/* Maximum number of jetsam threads allowed */ +#define JETSAM_THREADS_LIMIT 3 - uint64_t curr_time = mach_absolute_time(); +/* Number of active jetsam threads */ +_Atomic int active_jetsam_threads = 1; - list_head->freeze_pages_threshold_crossed = (memorystatus_available_pages < memorystatus_freeze_threshold); - list_head->freeze_eligible_procs_available = ((memorystatus_suspended_count - memorystatus_frozen_count) > memorystatus_freeze_suspended_threshold); - list_head->freeze_scheduled_in_future = (curr_time < memorystatus_freezer_thread_next_run_ts); +/* Number of maximum jetsam threads configured */ +int max_jetsam_threads = JETSAM_THREADS_LIMIT; - list_entry = (proc_freezable_status_t*) ((uintptr_t)list_head + sizeof(global_freezable_status_t)); +/* + * Global switch for enabling fast jetsam. Fast jetsam is + * hooked up via the system_override() system call. It has the + * following effects: + * - Raise the jetsam threshold ("clear-the-deck") + * - Enabled parallel jetsam on eligible devices + */ +int fast_jetsam_enabled = 0; - bucket = &memstat_bucket[JETSAM_PRIORITY_IDLE]; +/* Routine to find the jetsam state structure for the current jetsam thread */ +static inline struct jetsam_thread_state * +jetsam_current_thread(void) +{ + for (int thr_id = 0; thr_id < max_jetsam_threads; thr_id++) { + if (jetsam_threads[thr_id].thread == current_thread()) { + return &(jetsam_threads[thr_id]); + } + } + return NULL; +} - entry_count = (memorystatus_global_probabilities_size / sizeof(memorystatus_internal_probabilities_t)); - p = memorystatus_get_first_proc_locked(&i, FALSE); - proc_count++; - - while ((proc_count <= MAX_FREEZABLE_PROCESSES) && - (p) && - (list_size < buffer_size)) { - if (isApp(p) == FALSE) { - p = memorystatus_get_next_proc_locked(&i, p, FALSE); - proc_count++; - continue; - } +__private_extern__ void +memorystatus_init(void) +{ + kern_return_t result; + int i; - strlcpy(list_entry->p_name, p->p_name, MAXCOMLEN + 1); +#if CONFIG_FREEZE + memorystatus_freeze_jetsam_band = JETSAM_PRIORITY_UI_SUPPORT; + memorystatus_frozen_processes_max = FREEZE_PROCESSES_MAX; + memorystatus_frozen_shared_mb_max = ((MAX_FROZEN_SHARED_MB_PERCENT * max_task_footprint_mb) / 100); /* 10% of the system wide task limit */ + memorystatus_freeze_shared_mb_per_process_max = (memorystatus_frozen_shared_mb_max / 4); + memorystatus_freeze_pages_min = FREEZE_PAGES_MIN; + memorystatus_freeze_pages_max = FREEZE_PAGES_MAX; + memorystatus_max_frozen_demotions_daily = MAX_FROZEN_PROCESS_DEMOTIONS; + memorystatus_thaw_count_demotion_threshold = MIN_THAW_DEMOTION_THRESHOLD; +#endif - list_entry->p_pid = p->p_pid; +#if DEVELOPMENT || DEBUG + disconnect_page_mappings_lck_grp_attr = lck_grp_attr_alloc_init(); + disconnect_page_mappings_lck_grp = lck_grp_alloc_init("disconnect_page_mappings", disconnect_page_mappings_lck_grp_attr); - state = p->p_memstat_state; + lck_mtx_init(&disconnect_page_mappings_mutex, disconnect_page_mappings_lck_grp, NULL); - if ((state & (P_MEMSTAT_TERMINATED | P_MEMSTAT_LOCKED | P_MEMSTAT_FREEZE_DISABLED | P_MEMSTAT_FREEZE_IGNORE)) || - !(state & P_MEMSTAT_SUSPENDED)) { - try_freeze = list_entry->freeze_has_memstat_state = FALSE; - } else { - try_freeze = list_entry->freeze_has_memstat_state = TRUE; - } + if (kill_on_no_paging_space == TRUE) { + max_kill_priority = JETSAM_PRIORITY_MAX; + } +#endif - list_entry->p_memstat_state = state; + memorystatus_jetsam_fg_band_lock_grp_attr = lck_grp_attr_alloc_init(); + memorystatus_jetsam_fg_band_lock_grp = + lck_grp_alloc_init("memorystatus_jetsam_fg_band", memorystatus_jetsam_fg_band_lock_grp_attr); + lck_mtx_init(&memorystatus_jetsam_fg_band_lock, memorystatus_jetsam_fg_band_lock_grp, NULL); - memorystatus_get_task_page_counts(p->task, &pages, NULL, NULL); - if (pages < memorystatus_freeze_pages_min) { - try_freeze = list_entry->freeze_has_pages_min = FALSE; - } else { - list_entry->freeze_has_pages_min = TRUE; - if (try_freeze != FALSE) { - try_freeze = TRUE; - } - } + /* Init buckets */ + for (i = 0; i < MEMSTAT_BUCKET_COUNT; i++) { + TAILQ_INIT(&memstat_bucket[i].list); + memstat_bucket[i].count = 0; + memstat_bucket[i].relaunch_high_count = 0; + } + memorystatus_idle_demotion_call = thread_call_allocate((thread_call_func_t)memorystatus_perform_idle_demotion, NULL); - list_entry->p_pages = pages; + nanoseconds_to_absolutetime((uint64_t)DEFERRED_IDLE_EXIT_TIME_SECS * NSEC_PER_SEC, &memorystatus_sysprocs_idle_delay_time); + nanoseconds_to_absolutetime((uint64_t)DEFERRED_IDLE_EXIT_TIME_SECS * NSEC_PER_SEC, &memorystatus_apps_idle_delay_time); - if (entry_count) { - uint32_t j = 0; - for (j = 0; j < entry_count; j++) { - if (strncmp(memorystatus_global_probabilities_table[j].proc_name, - p->p_name, - MAXCOMLEN + 1) == 0) { - probability_of_use = memorystatus_global_probabilities_table[j].use_probability; - break; - } - } +#if CONFIG_JETSAM + /* Apply overrides */ + if (!PE_parse_boot_argn("kern.jetsam_delta", &delta_percentage, sizeof(delta_percentage))) { + PE_get_default("kern.jetsam_delta", &delta_percentage, sizeof(delta_percentage)); + } + if (delta_percentage == 0) { + delta_percentage = 5; + } + if (max_mem > config_jetsam_large_memory_cutoff) { + critical_threshold_percentage = critical_threshold_percentage_larger_devices; + delta_percentage = delta_percentage_larger_devices; + } + assert(delta_percentage < 100); + if (!PE_parse_boot_argn("kern.jetsam_critical_threshold", &critical_threshold_percentage, sizeof(critical_threshold_percentage))) { + PE_get_default("kern.jetsam_critical_threshold", &critical_threshold_percentage, sizeof(critical_threshold_percentage)); + } + assert(critical_threshold_percentage < 100); + PE_get_default("kern.jetsam_idle_offset", &idle_offset_percentage, sizeof(idle_offset_percentage)); + assert(idle_offset_percentage < 100); + PE_get_default("kern.jetsam_pressure_threshold", &pressure_threshold_percentage, sizeof(pressure_threshold_percentage)); + assert(pressure_threshold_percentage < 100); + PE_get_default("kern.jetsam_freeze_threshold", &freeze_threshold_percentage, sizeof(freeze_threshold_percentage)); + assert(freeze_threshold_percentage < 100); - list_entry->freeze_has_probability = probability_of_use; - if (probability_of_use && try_freeze != FALSE) { - try_freeze = TRUE; - } else { - try_freeze = FALSE; - } - } else { - if (try_freeze != FALSE) { - try_freeze = TRUE; - } - list_entry->freeze_has_probability = -1; + if (!PE_parse_boot_argn("jetsam_aging_policy", &jetsam_aging_policy, + sizeof(jetsam_aging_policy))) { + if (!PE_get_default("kern.jetsam_aging_policy", &jetsam_aging_policy, + sizeof(jetsam_aging_policy))) { + jetsam_aging_policy = kJetsamAgingPolicySysProcsReclaimedFirst; } + } - if (try_freeze) { - uint32_t purgeable, wired, clean, dirty, shared; - uint32_t max_pages = 0; - int freezer_error_code = 0; - - error = task_freeze(p->task, &purgeable, &wired, &clean, &dirty, max_pages, &shared, &freezer_error_code, TRUE /* eval only */); + if (jetsam_aging_policy > kJetsamAgingPolicyMax) { + jetsam_aging_policy = kJetsamAgingPolicySysProcsReclaimedFirst; + } - if (error) { - list_entry->p_freeze_error_code = freezer_error_code; - } + switch (jetsam_aging_policy) { + case kJetsamAgingPolicyNone: + system_procs_aging_band = JETSAM_PRIORITY_IDLE; + applications_aging_band = JETSAM_PRIORITY_IDLE; + break; - list_entry->freeze_attempted = TRUE; - } + case kJetsamAgingPolicyLegacy: + /* + * Legacy behavior where some daemons get a 10s protection once + * AND only before the first clean->dirty->clean transition before + * going into IDLE band. + */ + system_procs_aging_band = JETSAM_PRIORITY_AGING_BAND1; + applications_aging_band = JETSAM_PRIORITY_IDLE; + break; - list_entry++; + case kJetsamAgingPolicySysProcsReclaimedFirst: + system_procs_aging_band = JETSAM_PRIORITY_AGING_BAND1; + applications_aging_band = JETSAM_PRIORITY_AGING_BAND2; + break; - list_size += sizeof(proc_freezable_status_t); + case kJetsamAgingPolicyAppsReclaimedFirst: + system_procs_aging_band = JETSAM_PRIORITY_AGING_BAND2; + applications_aging_band = JETSAM_PRIORITY_AGING_BAND1; + break; - p = memorystatus_get_next_proc_locked(&i, p, FALSE); - proc_count++; + default: + break; } - proc_list_unlock(); + /* + * The aging bands cannot overlap with the JETSAM_PRIORITY_ELEVATED_INACTIVE + * band and must be below it in priority. This is so that we don't have to make + * our 'aging' code worry about a mix of processes, some of which need to age + * and some others that need to stay elevated in the jetsam bands. + */ + assert(JETSAM_PRIORITY_ELEVATED_INACTIVE > system_procs_aging_band); + assert(JETSAM_PRIORITY_ELEVATED_INACTIVE > applications_aging_band); + + /* Take snapshots for idle-exit kills by default? First check the boot-arg... */ + if (!PE_parse_boot_argn("jetsam_idle_snapshot", &memorystatus_idle_snapshot, sizeof(memorystatus_idle_snapshot))) { + /* ...no boot-arg, so check the device tree */ + PE_get_default("kern.jetsam_idle_snapshot", &memorystatus_idle_snapshot, sizeof(memorystatus_idle_snapshot)); + } - buffer_size = list_size; + memorystatus_delta = delta_percentage * atop_64(max_mem) / 100; + memorystatus_available_pages_critical_idle_offset = idle_offset_percentage * atop_64(max_mem) / 100; + memorystatus_available_pages_critical_base = (critical_threshold_percentage / delta_percentage) * memorystatus_delta; + memorystatus_policy_more_free_offset_pages = (policy_more_free_offset_percentage / delta_percentage) * memorystatus_delta; + memorystatus_sysproc_aging_aggr_pages = sysproc_aging_aggr_threshold_percentage * atop_64(max_mem) / 100; - error = copyout(list_head, buffer, buffer_size); - if (error == 0) { - *retval = buffer_size; + /* Jetsam Loop Detection */ + if (max_mem <= (512 * 1024 * 1024)) { + /* 512 MB devices */ + memorystatus_jld_eval_period_msecs = 8000; /* 8000 msecs == 8 second window */ } else { - *retval = 0; + /* 1GB and larger devices */ + memorystatus_jld_eval_period_msecs = 6000; /* 6000 msecs == 6 second window */ } - list_size = sizeof(global_freezable_status_t) + (sizeof(proc_freezable_status_t) * MAX_FREEZABLE_PROCESSES); - kfree(list_head, list_size); - - MEMORYSTATUS_DEBUG(1, "memorystatus_freezer_get_status: returning %d (%lu - size)\n", error, (unsigned long)*list_size); + memorystatus_jld_enabled = TRUE; - return error; -} + /* No contention at this point */ + memorystatus_update_levels_locked(FALSE); -static int -memorystatus_freezer_control(int32_t flags, user_addr_t buffer, size_t buffer_size, int32_t *retval) -{ - int err = ENOTSUP; +#endif /* CONFIG_JETSAM */ - if (flags == FREEZER_CONTROL_GET_STATUS) { - err = memorystatus_freezer_get_status(buffer, buffer_size, retval); - } + memorystatus_jetsam_snapshot_max = maxproc; - return err; -} + memorystatus_jetsam_snapshot_size = sizeof(memorystatus_jetsam_snapshot_t) + + (sizeof(memorystatus_jetsam_snapshot_entry_t) * memorystatus_jetsam_snapshot_max); -#endif /* CONFIG_FREEZE */ + memorystatus_jetsam_snapshot = + (memorystatus_jetsam_snapshot_t*)kalloc(memorystatus_jetsam_snapshot_size); + if (!memorystatus_jetsam_snapshot) { + panic("Could not allocate memorystatus_jetsam_snapshot"); + } -#endif /* DEVELOPMENT || DEBUG */ + memorystatus_jetsam_snapshot_copy = + (memorystatus_jetsam_snapshot_t*)kalloc(memorystatus_jetsam_snapshot_size); + if (!memorystatus_jetsam_snapshot_copy) { + panic("Could not allocate memorystatus_jetsam_snapshot_copy"); + } -extern kern_return_t kernel_thread_start_priority(thread_continue_t continuation, - void *parameter, - integer_t priority, - thread_t *new_thread); + nanoseconds_to_absolutetime((uint64_t)JETSAM_SNAPSHOT_TIMEOUT_SECS * NSEC_PER_SEC, &memorystatus_jetsam_snapshot_timeout); -#if DEVELOPMENT || DEBUG + memset(&memorystatus_at_boot_snapshot, 0, sizeof(memorystatus_jetsam_snapshot_t)); -static int -sysctl_memorystatus_disconnect_page_mappings SYSCTL_HANDLER_ARGS -{ -#pragma unused(arg1, arg2) - int error = 0, pid = 0; - proc_t p; +#if CONFIG_FREEZE + memorystatus_freeze_threshold = (freeze_threshold_percentage / delta_percentage) * memorystatus_delta; +#endif - error = sysctl_handle_int(oidp, &pid, 0, req); - if (error || !req->newptr) { - return error; + /* Check the boot-arg to see if fast jetsam is allowed */ + if (!PE_parse_boot_argn("fast_jetsam_enabled", &fast_jetsam_enabled, sizeof(fast_jetsam_enabled))) { + fast_jetsam_enabled = 0; } - lck_mtx_lock(&disconnect_page_mappings_mutex); + /* Check the boot-arg to configure the maximum number of jetsam threads */ + if (!PE_parse_boot_argn("max_jetsam_threads", &max_jetsam_threads, sizeof(max_jetsam_threads))) { + max_jetsam_threads = JETSAM_THREADS_LIMIT; + } - if (pid == -1) { - vm_pageout_disconnect_all_pages(); - } else { - p = proc_find(pid); + /* Restrict the maximum number of jetsam threads to JETSAM_THREADS_LIMIT */ + if (max_jetsam_threads > JETSAM_THREADS_LIMIT) { + max_jetsam_threads = JETSAM_THREADS_LIMIT; + } - if (p != NULL) { - error = task_disconnect_page_mappings(p->task); + /* For low CPU systems disable fast jetsam mechanism */ + if (vm_pageout_state.vm_restricted_to_single_processor == TRUE) { + max_jetsam_threads = 1; + fast_jetsam_enabled = 0; + } - proc_rele(p); + /* Initialize the jetsam_threads state array */ + jetsam_threads = kalloc(sizeof(struct jetsam_thread_state) * max_jetsam_threads); - if (error) { - error = EIO; - } - } else { - error = EINVAL; + /* Initialize all the jetsam threads */ + for (i = 0; i < max_jetsam_threads; i++) { + jetsam_threads[i].inited = FALSE; + jetsam_threads[i].index = i; + result = kernel_thread_start_priority(memorystatus_thread, NULL, 95 /* MAXPRI_KERNEL */, &jetsam_threads[i].thread); + if (result != KERN_SUCCESS) { + panic("Could not create memorystatus_thread %d", i); } + thread_deallocate(jetsam_threads[i].thread); } - lck_mtx_unlock(&disconnect_page_mappings_mutex); - - return error; } -SYSCTL_PROC(_kern, OID_AUTO, memorystatus_disconnect_page_mappings, CTLTYPE_INT | CTLFLAG_WR | CTLFLAG_LOCKED | CTLFLAG_MASKED, - 0, 0, &sysctl_memorystatus_disconnect_page_mappings, "I", ""); - -#endif /* DEVELOPMENT || DEBUG */ - +/* Centralised for the purposes of allowing panic-on-jetsam */ +extern void +vm_run_compactor(void); /* - * Picks the sorting routine for a given jetsam priority band. - * - * Input: - * bucket_index - jetsam priority band to be sorted. - * sort_order - JETSAM_SORT_xxx from kern_memorystatus.h - * Currently sort_order is only meaningful when handling - * coalitions. - * - * Return: - * 0 on success - * non-0 on failure + * The jetsam no frills kill call + * Return: 0 on success + * error code on failure (EINVAL...) */ static int -memorystatus_sort_bucket(unsigned int bucket_index, int sort_order) +jetsam_do_kill(proc_t p, int jetsam_flags, os_reason_t jetsam_reason) { - int coal_sort_order; + int error = 0; + error = exit_with_reason(p, W_EXITCODE(0, SIGKILL), (int *)NULL, FALSE, FALSE, jetsam_flags, jetsam_reason); + return error; +} - /* - * Verify the jetsam priority - */ - if (bucket_index >= MEMSTAT_BUCKET_COUNT) { - return EINVAL; - } +/* + * Wrapper for processes exiting with memorystatus details + */ +static boolean_t +memorystatus_do_kill(proc_t p, uint32_t cause, os_reason_t jetsam_reason, uint64_t *footprint_of_killed_proc) +{ + int error = 0; + __unused pid_t victim_pid = p->p_pid; + uint64_t footprint = get_task_phys_footprint(p->task); +#if (KDEBUG_LEVEL >= KDEBUG_LEVEL_STANDARD) + int32_t memstat_effectivepriority = p->p_memstat_effectivepriority; +#endif /* (KDEBUG_LEVEL >= KDEBUG_LEVEL_STANDARD) */ -#if DEVELOPMENT || DEBUG - if (sort_order == JETSAM_SORT_DEFAULT) { - coal_sort_order = COALITION_SORT_DEFAULT; - } else { - coal_sort_order = sort_order; /* only used for testing scenarios */ + KERNEL_DEBUG_CONSTANT((BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_DO_KILL)) | DBG_FUNC_START, + victim_pid, cause, vm_page_free_count, footprint, 0); + DTRACE_MEMORYSTATUS4(memorystatus_do_kill, proc_t, p, os_reason_t, jetsam_reason, uint32_t, cause, uint64_t, footprint); +#if CONFIG_JETSAM && (DEVELOPMENT || DEBUG) + if (memorystatus_jetsam_panic_debug & (1 << cause)) { + panic("memorystatus_do_kill(): jetsam debug panic (cause: %d)", cause); } #else - /* Verify default */ - if (sort_order == JETSAM_SORT_DEFAULT) { - coal_sort_order = COALITION_SORT_DEFAULT; - } else { - return EINVAL; - } +#pragma unused(cause) #endif - proc_list_lock(); - - if (memstat_bucket[bucket_index].count == 0) { - proc_list_unlock(); - return 0; + if (p->p_memstat_effectivepriority >= JETSAM_PRIORITY_FOREGROUND) { + printf("memorystatus: killing process %d [%s] in high band %s (%d) - memorystatus_available_pages: %llu\n", p->p_pid, + (*p->p_name ? p->p_name : "unknown"), + memorystatus_priority_band_name(p->p_memstat_effectivepriority), p->p_memstat_effectivepriority, + (uint64_t)memorystatus_available_pages); } - switch (bucket_index) { - case JETSAM_PRIORITY_FOREGROUND: - if (memorystatus_sort_by_largest_coalition_locked(bucket_index, coal_sort_order) == 0) { - /* - * Fall back to per process sorting when zero coalitions are found. - */ - memorystatus_sort_by_largest_process_locked(bucket_index); - } - break; - default: - memorystatus_sort_by_largest_process_locked(bucket_index); - break; + /* + * The jetsam_reason (os_reason_t) has enough information about the kill cause. + * We don't really need jetsam_flags anymore, so it's okay that not all possible kill causes have been mapped. + */ + int jetsam_flags = P_LTERM_JETSAM; + switch (cause) { + case kMemorystatusKilledHiwat: jetsam_flags |= P_JETSAM_HIWAT; break; + case kMemorystatusKilledVnodes: jetsam_flags |= P_JETSAM_VNODE; break; + case kMemorystatusKilledVMPageShortage: jetsam_flags |= P_JETSAM_VMPAGESHORTAGE; break; + case kMemorystatusKilledVMCompressorThrashing: + case kMemorystatusKilledVMCompressorSpaceShortage: jetsam_flags |= P_JETSAM_VMTHRASHING; break; + case kMemorystatusKilledFCThrashing: jetsam_flags |= P_JETSAM_FCTHRASHING; break; + case kMemorystatusKilledPerProcessLimit: jetsam_flags |= P_JETSAM_PID; break; + case kMemorystatusKilledIdleExit: jetsam_flags |= P_JETSAM_IDLEEXIT; break; } - proc_list_unlock(); + error = jetsam_do_kill(p, jetsam_flags, jetsam_reason); + *footprint_of_killed_proc = ((error == 0) ? footprint : 0); - return 0; + KERNEL_DEBUG_CONSTANT((BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_DO_KILL)) | DBG_FUNC_END, + victim_pid, memstat_effectivepriority, vm_page_free_count, error, 0); + + KERNEL_DEBUG_CONSTANT((BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_COMPACTOR_RUN)) | DBG_FUNC_START, + victim_pid, cause, vm_page_free_count, *footprint_of_killed_proc, 0); + + vm_run_compactor(); + + KERNEL_DEBUG_CONSTANT((BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_COMPACTOR_RUN)) | DBG_FUNC_END, + victim_pid, cause, vm_page_free_count, 0, 0); + + return error == 0; } /* - * Sort processes by size for a single jetsam bucket. + * Node manipulation */ static void -memorystatus_sort_by_largest_process_locked(unsigned int bucket_index) +memorystatus_check_levels_locked(void) { - proc_t p = NULL, insert_after_proc = NULL, max_proc = NULL; - proc_t next_p = NULL, prev_max_proc = NULL; - uint32_t pages = 0, max_pages = 0; - memstat_bucket_t *current_bucket; +#if CONFIG_JETSAM + /* Update levels */ + memorystatus_update_levels_locked(TRUE); +#else /* CONFIG_JETSAM */ + /* + * Nothing to do here currently since we update + * memorystatus_available_pages in vm_pressure_response. + */ +#endif /* CONFIG_JETSAM */ +} - if (bucket_index >= MEMSTAT_BUCKET_COUNT) { - return; - } +/* + * Pin a process to a particular jetsam band when it is in the background i.e. not doing active work. + * For an application: that means no longer in the FG band + * For a daemon: that means no longer in its 'requested' jetsam priority band + */ - current_bucket = &memstat_bucket[bucket_index]; +int +memorystatus_update_inactive_jetsam_priority_band(pid_t pid, uint32_t op_flags, int jetsam_prio, boolean_t effective_now) +{ + int error = 0; + boolean_t enable = FALSE; + proc_t p = NULL; - p = TAILQ_FIRST(¤t_bucket->list); + if (op_flags == MEMORYSTATUS_CMD_ELEVATED_INACTIVEJETSAMPRIORITY_ENABLE) { + enable = TRUE; + } else if (op_flags == MEMORYSTATUS_CMD_ELEVATED_INACTIVEJETSAMPRIORITY_DISABLE) { + enable = FALSE; + } else { + return EINVAL; + } - while (p) { - memorystatus_get_task_page_counts(p->task, &pages, NULL, NULL); - max_pages = pages; - max_proc = p; - prev_max_proc = p; + p = proc_find(pid); + if (p != NULL) { + if ((enable && ((p->p_memstat_state & P_MEMSTAT_USE_ELEVATED_INACTIVE_BAND) == P_MEMSTAT_USE_ELEVATED_INACTIVE_BAND)) || + (!enable && ((p->p_memstat_state & P_MEMSTAT_USE_ELEVATED_INACTIVE_BAND) == 0))) { + /* + * No change in state. + */ + } else { + proc_list_lock(); - while ((next_p = TAILQ_NEXT(p, p_memstat_list)) != NULL) { - /* traversing list until we find next largest process */ - p = next_p; - memorystatus_get_task_page_counts(p->task, &pages, NULL, NULL); - if (pages > max_pages) { - max_pages = pages; - max_proc = p; - } - } + if (enable) { + p->p_memstat_state |= P_MEMSTAT_USE_ELEVATED_INACTIVE_BAND; + memorystatus_invalidate_idle_demotion_locked(p, TRUE); - if (prev_max_proc != max_proc) { - /* found a larger process, place it in the list */ - TAILQ_REMOVE(¤t_bucket->list, max_proc, p_memstat_list); - if (insert_after_proc == NULL) { - TAILQ_INSERT_HEAD(¤t_bucket->list, max_proc, p_memstat_list); + if (effective_now) { + if (p->p_memstat_effectivepriority < jetsam_prio) { + if (memorystatus_highwater_enabled) { + /* + * Process is about to transition from + * inactive --> active + * assign active state + */ + boolean_t is_fatal; + boolean_t use_active = TRUE; + CACHE_ACTIVE_LIMITS_LOCKED(p, is_fatal); + task_set_phys_footprint_limit_internal(p->task, (p->p_memstat_memlimit > 0) ? p->p_memstat_memlimit : -1, NULL, use_active, is_fatal); + } + memorystatus_update_priority_locked(p, jetsam_prio, FALSE, FALSE); + } + } else { + if (isProcessInAgingBands(p)) { + memorystatus_update_priority_locked(p, JETSAM_PRIORITY_IDLE, FALSE, TRUE); + } + } } else { - TAILQ_INSERT_AFTER(¤t_bucket->list, insert_after_proc, max_proc, p_memstat_list); - } - prev_max_proc = max_proc; - } + p->p_memstat_state &= ~P_MEMSTAT_USE_ELEVATED_INACTIVE_BAND; + memorystatus_invalidate_idle_demotion_locked(p, TRUE); - insert_after_proc = max_proc; + if (effective_now) { + if (p->p_memstat_effectivepriority == jetsam_prio) { + memorystatus_update_priority_locked(p, JETSAM_PRIORITY_IDLE, FALSE, TRUE); + } + } else { + if (isProcessInAgingBands(p)) { + memorystatus_update_priority_locked(p, JETSAM_PRIORITY_IDLE, FALSE, TRUE); + } + } + } - p = TAILQ_NEXT(max_proc, p_memstat_list); + proc_list_unlock(); + } + proc_rele(p); + error = 0; + } else { + error = ESRCH; } + + return error; } -static proc_t -memorystatus_get_first_proc_locked(unsigned int *bucket_index, boolean_t search) +static void +memorystatus_perform_idle_demotion(__unused void *spare1, __unused void *spare2) { - memstat_bucket_t *current_bucket; - proc_t next_p; + proc_t p; + uint64_t current_time = 0, idle_delay_time = 0; + int demote_prio_band = 0; + memstat_bucket_t *demotion_bucket; - if ((*bucket_index) >= MEMSTAT_BUCKET_COUNT) { - return NULL; - } + MEMORYSTATUS_DEBUG(1, "memorystatus_perform_idle_demotion()\n"); - current_bucket = &memstat_bucket[*bucket_index]; - next_p = TAILQ_FIRST(¤t_bucket->list); - if (!next_p && search) { - while (!next_p && (++(*bucket_index) < MEMSTAT_BUCKET_COUNT)) { - current_bucket = &memstat_bucket[*bucket_index]; - next_p = TAILQ_FIRST(¤t_bucket->list); - } - } + KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_IDLE_DEMOTE) | DBG_FUNC_START, 0, 0, 0, 0, 0); - return next_p; -} + current_time = mach_absolute_time(); -static proc_t -memorystatus_get_next_proc_locked(unsigned int *bucket_index, proc_t p, boolean_t search) -{ - memstat_bucket_t *current_bucket; - proc_t next_p; + proc_list_lock(); - if (!p || ((*bucket_index) >= MEMSTAT_BUCKET_COUNT)) { - return NULL; - } + demote_prio_band = JETSAM_PRIORITY_IDLE + 1; - next_p = TAILQ_NEXT(p, p_memstat_list); - while (!next_p && search && (++(*bucket_index) < MEMSTAT_BUCKET_COUNT)) { - current_bucket = &memstat_bucket[*bucket_index]; - next_p = TAILQ_FIRST(¤t_bucket->list); - } + for (; demote_prio_band < JETSAM_PRIORITY_MAX; demote_prio_band++) { + if (demote_prio_band != system_procs_aging_band && demote_prio_band != applications_aging_band) { + continue; + } - return next_p; -} - -/* - * Structure to hold state for a jetsam thread. - * Typically there should be a single jetsam thread - * unless parallel jetsam is enabled. - */ -struct jetsam_thread_state { - boolean_t inited; /* if the thread is initialized */ - int memorystatus_wakeup; /* wake channel */ - int index; /* jetsam thread index */ - thread_t thread; /* jetsam thread pointer */ -} *jetsam_threads; + demotion_bucket = &memstat_bucket[demote_prio_band]; + p = TAILQ_FIRST(&demotion_bucket->list); -/* Maximum number of jetsam threads allowed */ -#define JETSAM_THREADS_LIMIT 3 + while (p) { + MEMORYSTATUS_DEBUG(1, "memorystatus_perform_idle_demotion() found %d\n", p->p_pid); -/* Number of active jetsam threads */ -_Atomic int active_jetsam_threads = 1; + assert(p->p_memstat_idledeadline); -/* Number of maximum jetsam threads configured */ -int max_jetsam_threads = JETSAM_THREADS_LIMIT; + assert(p->p_memstat_dirty & P_DIRTY_AGING_IN_PROGRESS); -/* - * Global switch for enabling fast jetsam. Fast jetsam is - * hooked up via the system_override() system call. It has the - * following effects: - * - Raise the jetsam threshold ("clear-the-deck") - * - Enabled parallel jetsam on eligible devices - */ -int fast_jetsam_enabled = 0; + if (current_time >= p->p_memstat_idledeadline) { + if ((isSysProc(p) && + ((p->p_memstat_dirty & (P_DIRTY_IDLE_EXIT_ENABLED | P_DIRTY_IS_DIRTY)) != P_DIRTY_IDLE_EXIT_ENABLED)) || /* system proc marked dirty*/ + task_has_assertions((struct task *)(p->task))) { /* has outstanding assertions which might indicate outstanding work too */ + idle_delay_time = (isSysProc(p)) ? memorystatus_sysprocs_idle_time(p) : memorystatus_apps_idle_time(p); -/* Routine to find the jetsam state structure for the current jetsam thread */ -static inline struct jetsam_thread_state * -jetsam_current_thread(void) -{ - for (int thr_id = 0; thr_id < max_jetsam_threads; thr_id++) { - if (jetsam_threads[thr_id].thread == current_thread()) { - return &(jetsam_threads[thr_id]); - } - } - panic("jetsam_current_thread() is being called from a non-jetsam thread\n"); - /* Contol should not reach here */ - return NULL; -} + p->p_memstat_idledeadline += idle_delay_time; + p = TAILQ_NEXT(p, p_memstat_list); + } else { + proc_t next_proc = NULL; + next_proc = TAILQ_NEXT(p, p_memstat_list); + memorystatus_invalidate_idle_demotion_locked(p, TRUE); -__private_extern__ void -memorystatus_init(void) -{ - kern_return_t result; - int i; + memorystatus_update_priority_locked(p, JETSAM_PRIORITY_IDLE, false, true); -#if CONFIG_FREEZE - memorystatus_freeze_jetsam_band = JETSAM_PRIORITY_UI_SUPPORT; - memorystatus_frozen_processes_max = FREEZE_PROCESSES_MAX; - memorystatus_frozen_shared_mb_max = ((MAX_FROZEN_SHARED_MB_PERCENT * max_task_footprint_mb) / 100); /* 10% of the system wide task limit */ - memorystatus_freeze_shared_mb_per_process_max = (memorystatus_frozen_shared_mb_max / 4); - memorystatus_freeze_pages_min = FREEZE_PAGES_MIN; - memorystatus_freeze_pages_max = FREEZE_PAGES_MAX; - memorystatus_max_frozen_demotions_daily = MAX_FROZEN_PROCESS_DEMOTIONS; - memorystatus_thaw_count_demotion_threshold = MIN_THAW_DEMOTION_THRESHOLD; -#endif + p = next_proc; + continue; + } + } else { + // No further candidates + break; + } + } + } -#if DEVELOPMENT || DEBUG - disconnect_page_mappings_lck_grp_attr = lck_grp_attr_alloc_init(); - disconnect_page_mappings_lck_grp = lck_grp_alloc_init("disconnect_page_mappings", disconnect_page_mappings_lck_grp_attr); + memorystatus_reschedule_idle_demotion_locked(); - lck_mtx_init(&disconnect_page_mappings_mutex, disconnect_page_mappings_lck_grp, NULL); + proc_list_unlock(); - if (kill_on_no_paging_space == TRUE) { - max_kill_priority = JETSAM_PRIORITY_MAX; - } -#endif + KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_IDLE_DEMOTE) | DBG_FUNC_END, 0, 0, 0, 0, 0); +} +static void +memorystatus_schedule_idle_demotion_locked(proc_t p, boolean_t set_state) +{ + boolean_t present_in_sysprocs_aging_bucket = FALSE; + boolean_t present_in_apps_aging_bucket = FALSE; + uint64_t idle_delay_time = 0; - /* Init buckets */ - for (i = 0; i < MEMSTAT_BUCKET_COUNT; i++) { - TAILQ_INIT(&memstat_bucket[i].list); - memstat_bucket[i].count = 0; + if (jetsam_aging_policy == kJetsamAgingPolicyNone) { + return; } - memorystatus_idle_demotion_call = thread_call_allocate((thread_call_func_t)memorystatus_perform_idle_demotion, NULL); - -#if CONFIG_JETSAM - nanoseconds_to_absolutetime((uint64_t)DEFERRED_IDLE_EXIT_TIME_SECS * NSEC_PER_SEC, &memorystatus_sysprocs_idle_delay_time); - nanoseconds_to_absolutetime((uint64_t)DEFERRED_IDLE_EXIT_TIME_SECS * NSEC_PER_SEC, &memorystatus_apps_idle_delay_time); - /* Apply overrides */ - PE_get_default("kern.jetsam_delta", &delta_percentage, sizeof(delta_percentage)); - if (delta_percentage == 0) { - delta_percentage = 5; + if ((p->p_memstat_state & P_MEMSTAT_USE_ELEVATED_INACTIVE_BAND) || + (p->p_memstat_state & P_MEMSTAT_PRIORITY_ASSERTION)) { + /* + * This process isn't going to be making the trip to the lower bands. + */ + return; } - assert(delta_percentage < 100); - PE_get_default("kern.jetsam_critical_threshold", &critical_threshold_percentage, sizeof(critical_threshold_percentage)); - assert(critical_threshold_percentage < 100); - PE_get_default("kern.jetsam_idle_offset", &idle_offset_percentage, sizeof(idle_offset_percentage)); - assert(idle_offset_percentage < 100); - PE_get_default("kern.jetsam_pressure_threshold", &pressure_threshold_percentage, sizeof(pressure_threshold_percentage)); - assert(pressure_threshold_percentage < 100); - PE_get_default("kern.jetsam_freeze_threshold", &freeze_threshold_percentage, sizeof(freeze_threshold_percentage)); - assert(freeze_threshold_percentage < 100); - if (!PE_parse_boot_argn("jetsam_aging_policy", &jetsam_aging_policy, - sizeof(jetsam_aging_policy))) { - if (!PE_get_default("kern.jetsam_aging_policy", &jetsam_aging_policy, - sizeof(jetsam_aging_policy))) { - jetsam_aging_policy = kJetsamAgingPolicyLegacy; + if (isProcessInAgingBands(p)) { + if (jetsam_aging_policy != kJetsamAgingPolicyLegacy) { + assert((p->p_memstat_dirty & P_DIRTY_AGING_IN_PROGRESS) != P_DIRTY_AGING_IN_PROGRESS); } - } - if (jetsam_aging_policy > kJetsamAgingPolicyMax) { - jetsam_aging_policy = kJetsamAgingPolicyLegacy; + if (isSysProc(p) && system_procs_aging_band) { + present_in_sysprocs_aging_bucket = TRUE; + } else if (isApp(p) && applications_aging_band) { + present_in_apps_aging_bucket = TRUE; + } } - switch (jetsam_aging_policy) { - case kJetsamAgingPolicyNone: - system_procs_aging_band = JETSAM_PRIORITY_IDLE; - applications_aging_band = JETSAM_PRIORITY_IDLE; - break; - - case kJetsamAgingPolicyLegacy: - /* - * Legacy behavior where some daemons get a 10s protection once - * AND only before the first clean->dirty->clean transition before - * going into IDLE band. - */ - system_procs_aging_band = JETSAM_PRIORITY_AGING_BAND1; - applications_aging_band = JETSAM_PRIORITY_IDLE; - break; + assert(!present_in_sysprocs_aging_bucket); + assert(!present_in_apps_aging_bucket); - case kJetsamAgingPolicySysProcsReclaimedFirst: - system_procs_aging_band = JETSAM_PRIORITY_AGING_BAND1; - applications_aging_band = JETSAM_PRIORITY_AGING_BAND2; - break; + MEMORYSTATUS_DEBUG(1, "memorystatus_schedule_idle_demotion_locked: scheduling demotion to idle band for pid %d (dirty:0x%x, set_state %d, demotions %d).\n", + p->p_pid, p->p_memstat_dirty, set_state, (memorystatus_scheduled_idle_demotions_sysprocs + memorystatus_scheduled_idle_demotions_apps)); - case kJetsamAgingPolicyAppsReclaimedFirst: - system_procs_aging_band = JETSAM_PRIORITY_AGING_BAND2; - applications_aging_band = JETSAM_PRIORITY_AGING_BAND1; - break; + if (isSysProc(p)) { + assert((p->p_memstat_dirty & P_DIRTY_IDLE_EXIT_ENABLED) == P_DIRTY_IDLE_EXIT_ENABLED); + } - default: - break; + idle_delay_time = (isSysProc(p)) ? memorystatus_sysprocs_idle_time(p) : memorystatus_apps_idle_time(p); + if (set_state) { + p->p_memstat_dirty |= P_DIRTY_AGING_IN_PROGRESS; + p->p_memstat_idledeadline = mach_absolute_time() + idle_delay_time; } - /* - * The aging bands cannot overlap with the JETSAM_PRIORITY_ELEVATED_INACTIVE - * band and must be below it in priority. This is so that we don't have to make - * our 'aging' code worry about a mix of processes, some of which need to age - * and some others that need to stay elevated in the jetsam bands. - */ - assert(JETSAM_PRIORITY_ELEVATED_INACTIVE > system_procs_aging_band); - assert(JETSAM_PRIORITY_ELEVATED_INACTIVE > applications_aging_band); + assert(p->p_memstat_idledeadline); - /* Take snapshots for idle-exit kills by default? First check the boot-arg... */ - if (!PE_parse_boot_argn("jetsam_idle_snapshot", &memorystatus_idle_snapshot, sizeof(memorystatus_idle_snapshot))) { - /* ...no boot-arg, so check the device tree */ - PE_get_default("kern.jetsam_idle_snapshot", &memorystatus_idle_snapshot, sizeof(memorystatus_idle_snapshot)); + if (isSysProc(p) && present_in_sysprocs_aging_bucket == FALSE) { + memorystatus_scheduled_idle_demotions_sysprocs++; + } else if (isApp(p) && present_in_apps_aging_bucket == FALSE) { + memorystatus_scheduled_idle_demotions_apps++; } +} - memorystatus_delta = delta_percentage * atop_64(max_mem) / 100; - memorystatus_available_pages_critical_idle_offset = idle_offset_percentage * atop_64(max_mem) / 100; - memorystatus_available_pages_critical_base = (critical_threshold_percentage / delta_percentage) * memorystatus_delta; - memorystatus_policy_more_free_offset_pages = (policy_more_free_offset_percentage / delta_percentage) * memorystatus_delta; +void +memorystatus_invalidate_idle_demotion_locked(proc_t p, boolean_t clear_state) +{ + boolean_t present_in_sysprocs_aging_bucket = FALSE; + boolean_t present_in_apps_aging_bucket = FALSE; - /* Jetsam Loop Detection */ - if (max_mem <= (512 * 1024 * 1024)) { - /* 512 MB devices */ - memorystatus_jld_eval_period_msecs = 8000; /* 8000 msecs == 8 second window */ - } else { - /* 1GB and larger devices */ - memorystatus_jld_eval_period_msecs = 6000; /* 6000 msecs == 6 second window */ + if (!system_procs_aging_band && !applications_aging_band) { + return; } - memorystatus_jld_enabled = TRUE; + if ((p->p_memstat_dirty & P_DIRTY_AGING_IN_PROGRESS) == 0) { + return; + } - /* No contention at this point */ - memorystatus_update_levels_locked(FALSE); + if (isProcessInAgingBands(p)) { + if (jetsam_aging_policy != kJetsamAgingPolicyLegacy) { + assert((p->p_memstat_dirty & P_DIRTY_AGING_IN_PROGRESS) == P_DIRTY_AGING_IN_PROGRESS); + } -#endif /* CONFIG_JETSAM */ + if (isSysProc(p) && system_procs_aging_band) { + assert(p->p_memstat_effectivepriority == system_procs_aging_band); + assert(p->p_memstat_idledeadline); + present_in_sysprocs_aging_bucket = TRUE; + } else if (isApp(p) && applications_aging_band) { + assert(p->p_memstat_effectivepriority == applications_aging_band); + assert(p->p_memstat_idledeadline); + present_in_apps_aging_bucket = TRUE; + } + } - memorystatus_jetsam_snapshot_max = maxproc; + MEMORYSTATUS_DEBUG(1, "memorystatus_invalidate_idle_demotion(): invalidating demotion to idle band for pid %d (clear_state %d, demotions %d).\n", + p->p_pid, clear_state, (memorystatus_scheduled_idle_demotions_sysprocs + memorystatus_scheduled_idle_demotions_apps)); - memorystatus_jetsam_snapshot_size = sizeof(memorystatus_jetsam_snapshot_t) + - (sizeof(memorystatus_jetsam_snapshot_entry_t) * memorystatus_jetsam_snapshot_max); - memorystatus_jetsam_snapshot = - (memorystatus_jetsam_snapshot_t*)kalloc(memorystatus_jetsam_snapshot_size); - if (!memorystatus_jetsam_snapshot) { - panic("Could not allocate memorystatus_jetsam_snapshot"); + if (clear_state) { + p->p_memstat_idledeadline = 0; + p->p_memstat_dirty &= ~P_DIRTY_AGING_IN_PROGRESS; } - memorystatus_jetsam_snapshot_copy = - (memorystatus_jetsam_snapshot_t*)kalloc(memorystatus_jetsam_snapshot_size); - if (!memorystatus_jetsam_snapshot_copy) { - panic("Could not allocate memorystatus_jetsam_snapshot_copy"); + if (isSysProc(p) && present_in_sysprocs_aging_bucket == TRUE) { + memorystatus_scheduled_idle_demotions_sysprocs--; + assert(memorystatus_scheduled_idle_demotions_sysprocs >= 0); + } else if (isApp(p) && present_in_apps_aging_bucket == TRUE) { + memorystatus_scheduled_idle_demotions_apps--; + assert(memorystatus_scheduled_idle_demotions_apps >= 0); } - nanoseconds_to_absolutetime((uint64_t)JETSAM_SNAPSHOT_TIMEOUT_SECS * NSEC_PER_SEC, &memorystatus_jetsam_snapshot_timeout); - - memset(&memorystatus_at_boot_snapshot, 0, sizeof(memorystatus_jetsam_snapshot_t)); - -#if CONFIG_FREEZE - memorystatus_freeze_threshold = (freeze_threshold_percentage / delta_percentage) * memorystatus_delta; -#endif + assert((memorystatus_scheduled_idle_demotions_sysprocs + memorystatus_scheduled_idle_demotions_apps) >= 0); +} - /* Check the boot-arg to see if fast jetsam is allowed */ - if (!PE_parse_boot_argn("fast_jetsam_enabled", &fast_jetsam_enabled, sizeof(fast_jetsam_enabled))) { - fast_jetsam_enabled = 0; - } +static void +memorystatus_reschedule_idle_demotion_locked(void) +{ + if (0 == (memorystatus_scheduled_idle_demotions_sysprocs + memorystatus_scheduled_idle_demotions_apps)) { + if (memstat_idle_demotion_deadline) { + /* Transitioned 1->0, so cancel next call */ + thread_call_cancel(memorystatus_idle_demotion_call); + memstat_idle_demotion_deadline = 0; + } + } else { + memstat_bucket_t *demotion_bucket; + proc_t p = NULL, p1 = NULL, p2 = NULL; - /* Check the boot-arg to configure the maximum number of jetsam threads */ - if (!PE_parse_boot_argn("max_jetsam_threads", &max_jetsam_threads, sizeof(max_jetsam_threads))) { - max_jetsam_threads = JETSAM_THREADS_LIMIT; - } + if (system_procs_aging_band) { + demotion_bucket = &memstat_bucket[system_procs_aging_band]; + p1 = TAILQ_FIRST(&demotion_bucket->list); - /* Restrict the maximum number of jetsam threads to JETSAM_THREADS_LIMIT */ - if (max_jetsam_threads > JETSAM_THREADS_LIMIT) { - max_jetsam_threads = JETSAM_THREADS_LIMIT; - } + p = p1; + } - /* For low CPU systems disable fast jetsam mechanism */ - if (vm_pageout_state.vm_restricted_to_single_processor == TRUE) { - max_jetsam_threads = 1; - fast_jetsam_enabled = 0; - } + if (applications_aging_band) { + demotion_bucket = &memstat_bucket[applications_aging_band]; + p2 = TAILQ_FIRST(&demotion_bucket->list); - /* Initialize the jetsam_threads state array */ - jetsam_threads = kalloc(sizeof(struct jetsam_thread_state) * max_jetsam_threads); + if (p1 && p2) { + p = (p1->p_memstat_idledeadline > p2->p_memstat_idledeadline) ? p2 : p1; + } else { + p = (p1 == NULL) ? p2 : p1; + } + } - /* Initialize all the jetsam threads */ - for (i = 0; i < max_jetsam_threads; i++) { - result = kernel_thread_start_priority(memorystatus_thread, NULL, 95 /* MAXPRI_KERNEL */, &jetsam_threads[i].thread); - if (result == KERN_SUCCESS) { - jetsam_threads[i].inited = FALSE; - jetsam_threads[i].index = i; - thread_deallocate(jetsam_threads[i].thread); - } else { - panic("Could not create memorystatus_thread %d", i); + assert(p); + + if (p != NULL) { + assert(p && p->p_memstat_idledeadline); + if (memstat_idle_demotion_deadline != p->p_memstat_idledeadline) { + thread_call_enter_delayed(memorystatus_idle_demotion_call, p->p_memstat_idledeadline); + memstat_idle_demotion_deadline = p->p_memstat_idledeadline; + } } } } -/* Centralised for the purposes of allowing panic-on-jetsam */ -extern void -vm_run_compactor(void); - /* - * The jetsam no frills kill call - * Return: 0 on success - * error code on failure (EINVAL...) + * List manipulation */ -static int -jetsam_do_kill(proc_t p, int jetsam_flags, os_reason_t jetsam_reason) -{ - int error = 0; - error = exit_with_reason(p, W_EXITCODE(0, SIGKILL), (int *)NULL, FALSE, FALSE, jetsam_flags, jetsam_reason); - return error; -} -/* - * Wrapper for processes exiting with memorystatus details - */ -static boolean_t -memorystatus_do_kill(proc_t p, uint32_t cause, os_reason_t jetsam_reason) +int +memorystatus_add(proc_t p, boolean_t locked) { - int error = 0; - __unused pid_t victim_pid = p->p_pid; + memstat_bucket_t *bucket; - KERNEL_DEBUG_CONSTANT((BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_DO_KILL)) | DBG_FUNC_START, - victim_pid, cause, vm_page_free_count, 0, 0); + MEMORYSTATUS_DEBUG(1, "memorystatus_list_add(): adding pid %d with priority %d.\n", p->p_pid, p->p_memstat_effectivepriority); - DTRACE_MEMORYSTATUS3(memorystatus_do_kill, proc_t, p, os_reason_t, jetsam_reason, uint32_t, cause); -#if CONFIG_JETSAM && (DEVELOPMENT || DEBUG) - if (memorystatus_jetsam_panic_debug & (1 << cause)) { - panic("memorystatus_do_kill(): jetsam debug panic (cause: %d)", cause); + if (!locked) { + proc_list_lock(); } -#else -#pragma unused(cause) -#endif - if (p->p_memstat_effectivepriority >= JETSAM_PRIORITY_FOREGROUND) { - printf("memorystatus: killing process %d [%s] in high band %s (%d) - memorystatus_available_pages: %llu\n", p->p_pid, - (*p->p_name ? p->p_name : "unknown"), - memorystatus_priority_band_name(p->p_memstat_effectivepriority), p->p_memstat_effectivepriority, - (uint64_t)memorystatus_available_pages); + DTRACE_MEMORYSTATUS2(memorystatus_add, proc_t, p, int32_t, p->p_memstat_effectivepriority); + + /* Processes marked internal do not have priority tracked */ + if (p->p_memstat_state & P_MEMSTAT_INTERNAL) { + goto exit; } /* - * The jetsam_reason (os_reason_t) has enough information about the kill cause. - * We don't really need jetsam_flags anymore, so it's okay that not all possible kill causes have been mapped. + * Opt out system processes from being frozen by default. + * For coalition-based freezing, we only want to freeze sysprocs that have specifically opted in. */ - int jetsam_flags = P_LTERM_JETSAM; - switch (cause) { - case kMemorystatusKilledHiwat: jetsam_flags |= P_JETSAM_HIWAT; break; - case kMemorystatusKilledVnodes: jetsam_flags |= P_JETSAM_VNODE; break; - case kMemorystatusKilledVMPageShortage: jetsam_flags |= P_JETSAM_VMPAGESHORTAGE; break; - case kMemorystatusKilledVMCompressorThrashing: - case kMemorystatusKilledVMCompressorSpaceShortage: jetsam_flags |= P_JETSAM_VMTHRASHING; break; - case kMemorystatusKilledFCThrashing: jetsam_flags |= P_JETSAM_FCTHRASHING; break; - case kMemorystatusKilledPerProcessLimit: jetsam_flags |= P_JETSAM_PID; break; - case kMemorystatusKilledIdleExit: jetsam_flags |= P_JETSAM_IDLEEXIT; break; + if (isSysProc(p)) { + p->p_memstat_state |= P_MEMSTAT_FREEZE_DISABLED; } - error = jetsam_do_kill(p, jetsam_flags, jetsam_reason); - KERNEL_DEBUG_CONSTANT((BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_DO_KILL)) | DBG_FUNC_END, - victim_pid, cause, vm_page_free_count, error, 0); + bucket = &memstat_bucket[p->p_memstat_effectivepriority]; - vm_run_compactor(); + if (isSysProc(p) && system_procs_aging_band && (p->p_memstat_effectivepriority == system_procs_aging_band)) { + assert(bucket->count == memorystatus_scheduled_idle_demotions_sysprocs - 1); + } else if (isApp(p) && applications_aging_band && (p->p_memstat_effectivepriority == applications_aging_band)) { + assert(bucket->count == memorystatus_scheduled_idle_demotions_apps - 1); + } else if (p->p_memstat_effectivepriority == JETSAM_PRIORITY_IDLE) { + /* + * Entering the idle band. + * Record idle start time. + */ + p->p_memstat_idle_start = mach_absolute_time(); + } - return error == 0; -} + TAILQ_INSERT_TAIL(&bucket->list, p, p_memstat_list); + bucket->count++; + if (p->p_memstat_relaunch_flags & (P_MEMSTAT_RELAUNCH_HIGH)) { + bucket->relaunch_high_count++; + } -/* - * Node manipulation - */ + memorystatus_list_count++; -static void -memorystatus_check_levels_locked(void) -{ -#if CONFIG_JETSAM - /* Update levels */ - memorystatus_update_levels_locked(TRUE); -#else /* CONFIG_JETSAM */ - /* - * Nothing to do here currently since we update - * memorystatus_available_pages in vm_pressure_response. - */ -#endif /* CONFIG_JETSAM */ + memorystatus_check_levels_locked(); + +exit: + if (!locked) { + proc_list_unlock(); + } + + return 0; } /* - * Pin a process to a particular jetsam band when it is in the background i.e. not doing active work. - * For an application: that means no longer in the FG band - * For a daemon: that means no longer in its 'requested' jetsam priority band + * Description: + * Moves a process from one jetsam bucket to another. + * which changes the LRU position of the process. + * + * Monitors transition between buckets and if necessary + * will update cached memory limits accordingly. + * + * skip_demotion_check: + * - if the 'jetsam aging policy' is NOT 'legacy': + * When this flag is TRUE, it means we are going + * to age the ripe processes out of the aging bands and into the + * IDLE band and apply their inactive memory limits. + * + * - if the 'jetsam aging policy' is 'legacy': + * When this flag is TRUE, it might mean the above aging mechanism + * OR + * It might be that we have a process that has used up its 'idle deferral' + * stay that is given to it once per lifetime. And in this case, the process + * won't be going through any aging codepaths. But we still need to apply + * the right inactive limits and so we explicitly set this to TRUE if the + * new priority for the process is the IDLE band. */ - -int -memorystatus_update_inactive_jetsam_priority_band(pid_t pid, uint32_t op_flags, int jetsam_prio, boolean_t effective_now) +void +memorystatus_update_priority_locked(proc_t p, int priority, boolean_t head_insert, boolean_t skip_demotion_check) { - int error = 0; - boolean_t enable = FALSE; - proc_t p = NULL; + memstat_bucket_t *old_bucket, *new_bucket; - if (op_flags == MEMORYSTATUS_CMD_ELEVATED_INACTIVEJETSAMPRIORITY_ENABLE) { - enable = TRUE; - } else if (op_flags == MEMORYSTATUS_CMD_ELEVATED_INACTIVEJETSAMPRIORITY_DISABLE) { - enable = FALSE; - } else { - return EINVAL; + assert(priority < MEMSTAT_BUCKET_COUNT); + + /* Ensure that exit isn't underway, leaving the proc retained but removed from its bucket */ + if ((p->p_listflag & P_LIST_EXITED) != 0) { + return; } - p = proc_find(pid); - if (p != NULL) { - if ((enable && ((p->p_memstat_state & P_MEMSTAT_USE_ELEVATED_INACTIVE_BAND) == P_MEMSTAT_USE_ELEVATED_INACTIVE_BAND)) || - (!enable && ((p->p_memstat_state & P_MEMSTAT_USE_ELEVATED_INACTIVE_BAND) == 0))) { + MEMORYSTATUS_DEBUG(1, "memorystatus_update_priority_locked(): setting %s(%d) to priority %d, inserting at %s\n", + (*p->p_name ? p->p_name : "unknown"), p->p_pid, priority, head_insert ? "head" : "tail"); + + DTRACE_MEMORYSTATUS3(memorystatus_update_priority, proc_t, p, int32_t, p->p_memstat_effectivepriority, int, priority); + + old_bucket = &memstat_bucket[p->p_memstat_effectivepriority]; + + if (skip_demotion_check == FALSE) { + if (isSysProc(p)) { /* - * No change in state. + * For system processes, the memorystatus_dirty_* routines take care of adding/removing + * the processes from the aging bands and balancing the demotion counts. + * We can, however, override that if the process has an 'elevated inactive jetsam band' attribute. */ - } else { - proc_list_lock(); - - if (enable) { - p->p_memstat_state |= P_MEMSTAT_USE_ELEVATED_INACTIVE_BAND; - memorystatus_invalidate_idle_demotion_locked(p, TRUE); - if (effective_now) { - if (p->p_memstat_effectivepriority < jetsam_prio) { - if (memorystatus_highwater_enabled) { - /* - * Process is about to transition from - * inactive --> active - * assign active state - */ - boolean_t is_fatal; - boolean_t use_active = TRUE; - CACHE_ACTIVE_LIMITS_LOCKED(p, is_fatal); - task_set_phys_footprint_limit_internal(p->task, (p->p_memstat_memlimit > 0) ? p->p_memstat_memlimit : -1, NULL, use_active, is_fatal); - } - memorystatus_update_priority_locked(p, jetsam_prio, FALSE, FALSE); - } - } else { - if (isProcessInAgingBands(p)) { - memorystatus_update_priority_locked(p, JETSAM_PRIORITY_IDLE, FALSE, TRUE); + if (p->p_memstat_state & P_MEMSTAT_USE_ELEVATED_INACTIVE_BAND) { + /* + * 2 types of processes can use the non-standard elevated inactive band: + * - Frozen processes that always land in memorystatus_freeze_jetsam_band + * OR + * - processes that specifically opt-in to the elevated inactive support e.g. docked processes. + */ +#if CONFIG_FREEZE + if (p->p_memstat_state & P_MEMSTAT_FROZEN) { + if (priority <= memorystatus_freeze_jetsam_band) { + priority = memorystatus_freeze_jetsam_band; + } + } else +#endif /* CONFIG_FREEZE */ + { + if (priority <= JETSAM_PRIORITY_ELEVATED_INACTIVE) { + priority = JETSAM_PRIORITY_ELEVATED_INACTIVE; } } - } else { - p->p_memstat_state &= ~P_MEMSTAT_USE_ELEVATED_INACTIVE_BAND; - memorystatus_invalidate_idle_demotion_locked(p, TRUE); + assert(!(p->p_memstat_dirty & P_DIRTY_AGING_IN_PROGRESS)); + } + } else if (isApp(p)) { + /* + * Check to see if the application is being lowered in jetsam priority. If so, and: + * - it has an 'elevated inactive jetsam band' attribute, then put it in the appropriate band. + * - it is a normal application, then let it age in the aging band if that policy is in effect. + */ - if (effective_now) { - if (p->p_memstat_effectivepriority == jetsam_prio) { - memorystatus_update_priority_locked(p, JETSAM_PRIORITY_IDLE, FALSE, TRUE); + if (p->p_memstat_state & P_MEMSTAT_USE_ELEVATED_INACTIVE_BAND) { +#if CONFIG_FREEZE + if (p->p_memstat_state & P_MEMSTAT_FROZEN) { + if (priority <= memorystatus_freeze_jetsam_band) { + priority = memorystatus_freeze_jetsam_band; } - } else { - if (isProcessInAgingBands(p)) { - memorystatus_update_priority_locked(p, JETSAM_PRIORITY_IDLE, FALSE, TRUE); + } else +#endif /* CONFIG_FREEZE */ + { + if (priority <= JETSAM_PRIORITY_ELEVATED_INACTIVE) { + priority = JETSAM_PRIORITY_ELEVATED_INACTIVE; } } - } + } else { + if (applications_aging_band) { + if (p->p_memstat_effectivepriority == applications_aging_band) { + assert(old_bucket->count == (memorystatus_scheduled_idle_demotions_apps + 1)); + } - proc_list_unlock(); + if ((jetsam_aging_policy != kJetsamAgingPolicyLegacy) && (priority <= applications_aging_band)) { + assert(!(p->p_memstat_dirty & P_DIRTY_AGING_IN_PROGRESS)); + priority = applications_aging_band; + memorystatus_schedule_idle_demotion_locked(p, TRUE); + } + } + } } - proc_rele(p); - error = 0; - } else { - error = ESRCH; } - return error; -} - -static void -memorystatus_perform_idle_demotion(__unused void *spare1, __unused void *spare2) -{ - proc_t p; - uint64_t current_time = 0, idle_delay_time = 0; - int demote_prio_band = 0; - memstat_bucket_t *demotion_bucket; - - MEMORYSTATUS_DEBUG(1, "memorystatus_perform_idle_demotion()\n"); - - KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_IDLE_DEMOTE) | DBG_FUNC_START, 0, 0, 0, 0, 0); - - current_time = mach_absolute_time(); - - proc_list_lock(); - - demote_prio_band = JETSAM_PRIORITY_IDLE + 1; - - for (; demote_prio_band < JETSAM_PRIORITY_MAX; demote_prio_band++) { - if (demote_prio_band != system_procs_aging_band && demote_prio_band != applications_aging_band) { - continue; - } - - demotion_bucket = &memstat_bucket[demote_prio_band]; - p = TAILQ_FIRST(&demotion_bucket->list); - - while (p) { - MEMORYSTATUS_DEBUG(1, "memorystatus_perform_idle_demotion() found %d\n", p->p_pid); + if ((system_procs_aging_band && (priority == system_procs_aging_band)) || (applications_aging_band && (priority == applications_aging_band))) { + assert(p->p_memstat_dirty & P_DIRTY_AGING_IN_PROGRESS); + } - assert(p->p_memstat_idledeadline); +#if DEVELOPMENT || DEBUG + if (priority == JETSAM_PRIORITY_IDLE && /* if the process is on its way into the IDLE band */ + skip_demotion_check == FALSE && /* and it isn't via the path that will set the INACTIVE memlimits */ + (p->p_memstat_dirty & P_DIRTY_TRACK) && /* and it has 'DIRTY' tracking enabled */ + ((p->p_memstat_memlimit != p->p_memstat_memlimit_inactive) || /* and we notice that the current limit isn't the right value (inactive) */ + ((p->p_memstat_state & P_MEMSTAT_MEMLIMIT_INACTIVE_FATAL) ? (!(p->p_memstat_state & P_MEMSTAT_FATAL_MEMLIMIT)) : (p->p_memstat_state & P_MEMSTAT_FATAL_MEMLIMIT)))) { /* OR type (fatal vs non-fatal) */ + printf("memorystatus_update_priority_locked: on %s with 0x%x, prio: %d and %d\n", p->p_name, p->p_memstat_state, priority, p->p_memstat_memlimit); /* then we must catch this */ + } +#endif /* DEVELOPMENT || DEBUG */ - assert(p->p_memstat_dirty & P_DIRTY_AGING_IN_PROGRESS); + TAILQ_REMOVE(&old_bucket->list, p, p_memstat_list); + old_bucket->count--; + if (p->p_memstat_relaunch_flags & (P_MEMSTAT_RELAUNCH_HIGH)) { + old_bucket->relaunch_high_count--; + } - if (current_time >= p->p_memstat_idledeadline) { - if ((isSysProc(p) && - ((p->p_memstat_dirty & (P_DIRTY_IDLE_EXIT_ENABLED | P_DIRTY_IS_DIRTY)) != P_DIRTY_IDLE_EXIT_ENABLED)) || /* system proc marked dirty*/ - task_has_assertions((struct task *)(p->task))) { /* has outstanding assertions which might indicate outstanding work too */ - idle_delay_time = (isSysProc(p)) ? memorystatus_sysprocs_idle_delay_time : memorystatus_apps_idle_delay_time; + new_bucket = &memstat_bucket[priority]; + if (head_insert) { + TAILQ_INSERT_HEAD(&new_bucket->list, p, p_memstat_list); + } else { + TAILQ_INSERT_TAIL(&new_bucket->list, p, p_memstat_list); + } + new_bucket->count++; + if (p->p_memstat_relaunch_flags & (P_MEMSTAT_RELAUNCH_HIGH)) { + new_bucket->relaunch_high_count++; + } - p->p_memstat_idledeadline += idle_delay_time; - p = TAILQ_NEXT(p, p_memstat_list); - } else { - proc_t next_proc = NULL; + if (memorystatus_highwater_enabled) { + boolean_t is_fatal; + boolean_t use_active; - next_proc = TAILQ_NEXT(p, p_memstat_list); - memorystatus_invalidate_idle_demotion_locked(p, TRUE); + /* + * If cached limit data is updated, then the limits + * will be enforced by writing to the ledgers. + */ + boolean_t ledger_update_needed = TRUE; - memorystatus_update_priority_locked(p, JETSAM_PRIORITY_IDLE, false, true); + /* + * Here, we must update the cached memory limit if the task + * is transitioning between: + * active <--> inactive + * FG <--> BG + * but: + * dirty <--> clean is ignored + * + * We bypass non-idle processes that have opted into dirty tracking because + * a move between buckets does not imply a transition between the + * dirty <--> clean state. + */ - p = next_proc; - continue; - } + if (p->p_memstat_dirty & P_DIRTY_TRACK) { + if (skip_demotion_check == TRUE && priority == JETSAM_PRIORITY_IDLE) { + CACHE_INACTIVE_LIMITS_LOCKED(p, is_fatal); + use_active = FALSE; } else { - // No further candidates - break; + ledger_update_needed = FALSE; } + } else if ((priority >= JETSAM_PRIORITY_FOREGROUND) && (p->p_memstat_effectivepriority < JETSAM_PRIORITY_FOREGROUND)) { + /* + * inactive --> active + * BG --> FG + * assign active state + */ + CACHE_ACTIVE_LIMITS_LOCKED(p, is_fatal); + use_active = TRUE; + } else if ((priority < JETSAM_PRIORITY_FOREGROUND) && (p->p_memstat_effectivepriority >= JETSAM_PRIORITY_FOREGROUND)) { + /* + * active --> inactive + * FG --> BG + * assign inactive state + */ + CACHE_INACTIVE_LIMITS_LOCKED(p, is_fatal); + use_active = FALSE; + } else { + /* + * The transition between jetsam priority buckets apparently did + * not affect active/inactive state. + * This is not unusual... especially during startup when + * processes are getting established in their respective bands. + */ + ledger_update_needed = FALSE; } - } - - memorystatus_reschedule_idle_demotion_locked(); - - proc_list_unlock(); - KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_IDLE_DEMOTE) | DBG_FUNC_END, 0, 0, 0, 0, 0); -} - -static void -memorystatus_schedule_idle_demotion_locked(proc_t p, boolean_t set_state) -{ - boolean_t present_in_sysprocs_aging_bucket = FALSE; - boolean_t present_in_apps_aging_bucket = FALSE; - uint64_t idle_delay_time = 0; + /* + * Enforce the new limits by writing to the ledger + */ + if (ledger_update_needed) { + task_set_phys_footprint_limit_internal(p->task, (p->p_memstat_memlimit > 0) ? p->p_memstat_memlimit : -1, NULL, use_active, is_fatal); - if (jetsam_aging_policy == kJetsamAgingPolicyNone) { - return; + MEMORYSTATUS_DEBUG(3, "memorystatus_update_priority_locked: new limit on pid %d (%dMB %s) priority old --> new (%d --> %d) dirty?=0x%x %s\n", + p->p_pid, (p->p_memstat_memlimit > 0 ? p->p_memstat_memlimit : -1), + (p->p_memstat_state & P_MEMSTAT_FATAL_MEMLIMIT ? "F " : "NF"), p->p_memstat_effectivepriority, priority, p->p_memstat_dirty, + (p->p_memstat_dirty ? ((p->p_memstat_dirty & P_DIRTY) ? "isdirty" : "isclean") : "")); + } } - if (p->p_memstat_state & P_MEMSTAT_USE_ELEVATED_INACTIVE_BAND) { + /* + * Record idle start or idle delta. + */ + if (p->p_memstat_effectivepriority == priority) { /* - * This process isn't going to be making the trip to the lower bands. + * This process is not transitioning between + * jetsam priority buckets. Do nothing. */ - return; - } - - if (isProcessInAgingBands(p)) { - if (jetsam_aging_policy != kJetsamAgingPolicyLegacy) { - assert((p->p_memstat_dirty & P_DIRTY_AGING_IN_PROGRESS) != P_DIRTY_AGING_IN_PROGRESS); + } else if (p->p_memstat_effectivepriority == JETSAM_PRIORITY_IDLE) { + uint64_t now; + /* + * Transitioning out of the idle priority bucket. + * Record idle delta. + */ + assert(p->p_memstat_idle_start != 0); + now = mach_absolute_time(); + if (now > p->p_memstat_idle_start) { + p->p_memstat_idle_delta = now - p->p_memstat_idle_start; } - if (isSysProc(p) && system_procs_aging_band) { - present_in_sysprocs_aging_bucket = TRUE; - } else if (isApp(p) && applications_aging_band) { - present_in_apps_aging_bucket = TRUE; + /* + * About to become active and so memory footprint could change. + * So mark it eligible for freeze-considerations next time around. + */ + if (p->p_memstat_state & P_MEMSTAT_FREEZE_IGNORE) { + p->p_memstat_state &= ~P_MEMSTAT_FREEZE_IGNORE; } + } else if (priority == JETSAM_PRIORITY_IDLE) { + /* + * Transitioning into the idle priority bucket. + * Record idle start. + */ + p->p_memstat_idle_start = mach_absolute_time(); } - assert(!present_in_sysprocs_aging_bucket); - assert(!present_in_apps_aging_bucket); + KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_CHANGE_PRIORITY), p->p_pid, priority, p->p_memstat_effectivepriority, 0, 0); - MEMORYSTATUS_DEBUG(1, "memorystatus_schedule_idle_demotion_locked: scheduling demotion to idle band for pid %d (dirty:0x%x, set_state %d, demotions %d).\n", - p->p_pid, p->p_memstat_dirty, set_state, (memorystatus_scheduled_idle_demotions_sysprocs + memorystatus_scheduled_idle_demotions_apps)); + p->p_memstat_effectivepriority = priority; - if (isSysProc(p)) { - assert((p->p_memstat_dirty & P_DIRTY_IDLE_EXIT_ENABLED) == P_DIRTY_IDLE_EXIT_ENABLED); +#if CONFIG_SECLUDED_MEMORY + if (secluded_for_apps && + task_could_use_secluded_mem(p->task)) { + task_set_can_use_secluded_mem( + p->task, + (priority >= JETSAM_PRIORITY_FOREGROUND)); } +#endif /* CONFIG_SECLUDED_MEMORY */ - idle_delay_time = (isSysProc(p)) ? memorystatus_sysprocs_idle_delay_time : memorystatus_apps_idle_delay_time; + memorystatus_check_levels_locked(); +} - if (set_state) { - p->p_memstat_dirty |= P_DIRTY_AGING_IN_PROGRESS; - p->p_memstat_idledeadline = mach_absolute_time() + idle_delay_time; - } - - assert(p->p_memstat_idledeadline); - - if (isSysProc(p) && present_in_sysprocs_aging_bucket == FALSE) { - memorystatus_scheduled_idle_demotions_sysprocs++; - } else if (isApp(p) && present_in_apps_aging_bucket == FALSE) { - memorystatus_scheduled_idle_demotions_apps++; - } +int +memorystatus_relaunch_flags_update(proc_t p, int relaunch_flags) +{ + p->p_memstat_relaunch_flags = relaunch_flags; + KDBG(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_RELAUNCH_FLAGS), p->p_pid, relaunch_flags, 0, 0, 0); + return 0; } -static void -memorystatus_invalidate_idle_demotion_locked(proc_t p, boolean_t clear_state) -{ - boolean_t present_in_sysprocs_aging_bucket = FALSE; - boolean_t present_in_apps_aging_bucket = FALSE; +/* + * + * Description: Update the jetsam priority and memory limit attributes for a given process. + * + * Parameters: + * p init this process's jetsam information. + * priority The jetsam priority band + * user_data user specific data, unused by the kernel + * is_assertion When true, a priority update is driven by an assertion. + * effective guards against race if process's update already occurred + * update_memlimit When true we know this is the init step via the posix_spawn path. + * + * memlimit_active Value in megabytes; The monitored footprint level while the + * process is active. Exceeding it may result in termination + * based on it's associated fatal flag. + * + * memlimit_active_is_fatal When a process is active and exceeds its memory footprint, + * this describes whether or not it should be immediately fatal. + * + * memlimit_inactive Value in megabytes; The monitored footprint level while the + * process is inactive. Exceeding it may result in termination + * based on it's associated fatal flag. + * + * memlimit_inactive_is_fatal When a process is inactive and exceeds its memory footprint, + * this describes whether or not it should be immediatly fatal. + * + * Returns: 0 Success + * non-0 Failure + */ - if (!system_procs_aging_band && !applications_aging_band) { - return; - } +int +memorystatus_update(proc_t p, int priority, uint64_t user_data, boolean_t is_assertion, boolean_t effective, boolean_t update_memlimit, + int32_t memlimit_active, boolean_t memlimit_active_is_fatal, + int32_t memlimit_inactive, boolean_t memlimit_inactive_is_fatal) +{ + int ret; + boolean_t head_insert = false; - if ((p->p_memstat_dirty & P_DIRTY_AGING_IN_PROGRESS) == 0) { - return; - } + MEMORYSTATUS_DEBUG(1, "memorystatus_update: changing (%s) pid %d: priority %d, user_data 0x%llx\n", (*p->p_name ? p->p_name : "unknown"), p->p_pid, priority, user_data); - if (isProcessInAgingBands(p)) { - if (jetsam_aging_policy != kJetsamAgingPolicyLegacy) { - assert((p->p_memstat_dirty & P_DIRTY_AGING_IN_PROGRESS) == P_DIRTY_AGING_IN_PROGRESS); - } + KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_UPDATE) | DBG_FUNC_START, p->p_pid, priority, user_data, effective, 0); - if (isSysProc(p) && system_procs_aging_band) { - assert(p->p_memstat_effectivepriority == system_procs_aging_band); - assert(p->p_memstat_idledeadline); - present_in_sysprocs_aging_bucket = TRUE; - } else if (isApp(p) && applications_aging_band) { - assert(p->p_memstat_effectivepriority == applications_aging_band); - assert(p->p_memstat_idledeadline); - present_in_apps_aging_bucket = TRUE; - } + if (priority == -1) { + /* Use as shorthand for default priority */ + priority = JETSAM_PRIORITY_DEFAULT; + } else if ((priority == system_procs_aging_band) || (priority == applications_aging_band)) { + /* Both the aging bands are reserved for internal use; if requested, adjust to JETSAM_PRIORITY_IDLE. */ + priority = JETSAM_PRIORITY_IDLE; + } else if (priority == JETSAM_PRIORITY_IDLE_HEAD) { + /* JETSAM_PRIORITY_IDLE_HEAD inserts at the head of the idle queue */ + priority = JETSAM_PRIORITY_IDLE; + head_insert = TRUE; + } else if ((priority < 0) || (priority >= MEMSTAT_BUCKET_COUNT)) { + /* Sanity check */ + ret = EINVAL; + goto out; } - MEMORYSTATUS_DEBUG(1, "memorystatus_invalidate_idle_demotion(): invalidating demotion to idle band for pid %d (clear_state %d, demotions %d).\n", - p->p_pid, clear_state, (memorystatus_scheduled_idle_demotions_sysprocs + memorystatus_scheduled_idle_demotions_apps)); + proc_list_lock(); + assert(!(p->p_memstat_state & P_MEMSTAT_INTERNAL)); - if (clear_state) { - p->p_memstat_idledeadline = 0; - p->p_memstat_dirty &= ~P_DIRTY_AGING_IN_PROGRESS; + if (effective && (p->p_memstat_state & P_MEMSTAT_PRIORITYUPDATED)) { + ret = EALREADY; + proc_list_unlock(); + MEMORYSTATUS_DEBUG(1, "memorystatus_update: effective change specified for pid %d, but change already occurred.\n", p->p_pid); + goto out; } - if (isSysProc(p) && present_in_sysprocs_aging_bucket == TRUE) { - memorystatus_scheduled_idle_demotions_sysprocs--; - assert(memorystatus_scheduled_idle_demotions_sysprocs >= 0); - } else if (isApp(p) && present_in_apps_aging_bucket == TRUE) { - memorystatus_scheduled_idle_demotions_apps--; - assert(memorystatus_scheduled_idle_demotions_apps >= 0); + if ((p->p_memstat_state & P_MEMSTAT_TERMINATED) || ((p->p_listflag & P_LIST_EXITED) != 0)) { + /* + * This could happen when a process calling posix_spawn() is exiting on the jetsam thread. + */ + ret = EBUSY; + proc_list_unlock(); + goto out; } - assert((memorystatus_scheduled_idle_demotions_sysprocs + memorystatus_scheduled_idle_demotions_apps) >= 0); -} + p->p_memstat_state |= P_MEMSTAT_PRIORITYUPDATED; + p->p_memstat_userdata = user_data; -static void -memorystatus_reschedule_idle_demotion_locked(void) -{ - if (0 == (memorystatus_scheduled_idle_demotions_sysprocs + memorystatus_scheduled_idle_demotions_apps)) { - if (memstat_idle_demotion_deadline) { - /* Transitioned 1->0, so cancel next call */ - thread_call_cancel(memorystatus_idle_demotion_call); - memstat_idle_demotion_deadline = 0; + if (is_assertion) { + if (priority == JETSAM_PRIORITY_IDLE) { + /* + * Assertions relinquish control when the process is heading to IDLE. + */ + if (p->p_memstat_state & P_MEMSTAT_PRIORITY_ASSERTION) { + /* + * Mark the process as no longer being managed by assertions. + */ + p->p_memstat_state &= ~P_MEMSTAT_PRIORITY_ASSERTION; + } else { + /* + * Ignore an idle priority transition if the process is not + * already managed by assertions. We won't treat this as + * an error, but we will log the unexpected behavior and bail. + */ + os_log(OS_LOG_DEFAULT, "memorystatus: Ignore assertion driven idle priority. Process not previously controlled %s:%d\n", + (*p->p_name ? p->p_name : "unknown"), p->p_pid); + + ret = 0; + proc_list_unlock(); + goto out; + } + } else { + /* + * Process is now being managed by assertions, + */ + p->p_memstat_state |= P_MEMSTAT_PRIORITY_ASSERTION; } - } else { - memstat_bucket_t *demotion_bucket; - proc_t p = NULL, p1 = NULL, p2 = NULL; - if (system_procs_aging_band) { - demotion_bucket = &memstat_bucket[system_procs_aging_band]; - p1 = TAILQ_FIRST(&demotion_bucket->list); + /* Always update the assertion priority in this path */ - p = p1; - } + p->p_memstat_assertionpriority = priority; - if (applications_aging_band) { - demotion_bucket = &memstat_bucket[applications_aging_band]; - p2 = TAILQ_FIRST(&demotion_bucket->list); + int memstat_dirty_flags = memorystatus_dirty_get(p, TRUE); /* proc_list_lock is held */ - if (p1 && p2) { - p = (p1->p_memstat_idledeadline > p2->p_memstat_idledeadline) ? p2 : p1; + if (memstat_dirty_flags != 0) { + /* + * Calculate maximum priority only when dirty tracking processes are involved. + */ + int maxpriority; + if (memstat_dirty_flags & PROC_DIRTY_IS_DIRTY) { + maxpriority = MAX(p->p_memstat_assertionpriority, p->p_memstat_requestedpriority); } else { - p = (p1 == NULL) ? p2 : p1; - } - } + /* clean */ - assert(p); + if (memstat_dirty_flags & PROC_DIRTY_ALLOWS_IDLE_EXIT) { + /* + * The aging policy must be evaluated and applied here because runnningboardd + * has relinquished its hold on the jetsam priority by attempting to move a + * clean process to the idle band. + */ - if (p != NULL) { - assert(p && p->p_memstat_idledeadline); - if (memstat_idle_demotion_deadline != p->p_memstat_idledeadline) { - thread_call_enter_delayed(memorystatus_idle_demotion_call, p->p_memstat_idledeadline); - memstat_idle_demotion_deadline = p->p_memstat_idledeadline; + int newpriority = JETSAM_PRIORITY_IDLE; + if ((p->p_memstat_dirty & (P_DIRTY_IDLE_EXIT_ENABLED | P_DIRTY_IS_DIRTY)) == P_DIRTY_IDLE_EXIT_ENABLED) { + newpriority = (p->p_memstat_dirty & P_DIRTY_AGING_IN_PROGRESS) ? system_procs_aging_band : JETSAM_PRIORITY_IDLE; + } + + maxpriority = MAX(p->p_memstat_assertionpriority, newpriority ); + + if (newpriority == system_procs_aging_band) { + memorystatus_schedule_idle_demotion_locked(p, FALSE); + } + } else { + /* + * Preserves requestedpriority when the process does not support pressured exit. + */ + maxpriority = MAX(p->p_memstat_assertionpriority, p->p_memstat_requestedpriority); + } } + priority = maxpriority; } + } else { + p->p_memstat_requestedpriority = priority; } -} -/* - * List manipulation - */ + if (update_memlimit) { + boolean_t is_fatal; + boolean_t use_active; -int -memorystatus_add(proc_t p, boolean_t locked) -{ - memstat_bucket_t *bucket; + /* + * Posix_spawn'd processes come through this path to instantiate ledger limits. + * Forked processes do not come through this path, so no ledger limits exist. + * (That's why forked processes can consume unlimited memory.) + */ - MEMORYSTATUS_DEBUG(1, "memorystatus_list_add(): adding pid %d with priority %d.\n", p->p_pid, p->p_memstat_effectivepriority); + MEMORYSTATUS_DEBUG(3, "memorystatus_update(enter): pid %d, priority %d, dirty=0x%x, Active(%dMB %s), Inactive(%dMB, %s)\n", + p->p_pid, priority, p->p_memstat_dirty, + memlimit_active, (memlimit_active_is_fatal ? "F " : "NF"), + memlimit_inactive, (memlimit_inactive_is_fatal ? "F " : "NF")); - if (!locked) { - proc_list_lock(); - } + if (memlimit_active <= 0) { + /* + * This process will have a system_wide task limit when active. + * System_wide task limit is always fatal. + * It's quite common to see non-fatal flag passed in here. + * It's not an error, we just ignore it. + */ - DTRACE_MEMORYSTATUS2(memorystatus_add, proc_t, p, int32_t, p->p_memstat_effectivepriority); + /* + * For backward compatibility with some unexplained launchd behavior, + * we allow a zero sized limit. But we still enforce system_wide limit + * when written to the ledgers. + */ - /* Processes marked internal do not have priority tracked */ - if (p->p_memstat_state & P_MEMSTAT_INTERNAL) { - goto exit; - } + if (memlimit_active < 0) { + memlimit_active = -1; /* enforces system_wide task limit */ + } + memlimit_active_is_fatal = TRUE; + } - bucket = &memstat_bucket[p->p_memstat_effectivepriority]; + if (memlimit_inactive <= 0) { + /* + * This process will have a system_wide task limit when inactive. + * System_wide task limit is always fatal. + */ + + memlimit_inactive = -1; + memlimit_inactive_is_fatal = TRUE; + } - if (isSysProc(p) && system_procs_aging_band && (p->p_memstat_effectivepriority == system_procs_aging_band)) { - assert(bucket->count == memorystatus_scheduled_idle_demotions_sysprocs - 1); - } else if (isApp(p) && applications_aging_band && (p->p_memstat_effectivepriority == applications_aging_band)) { - assert(bucket->count == memorystatus_scheduled_idle_demotions_apps - 1); - } else if (p->p_memstat_effectivepriority == JETSAM_PRIORITY_IDLE) { /* - * Entering the idle band. - * Record idle start time. + * Initialize the active limit variants for this process. */ - p->p_memstat_idle_start = mach_absolute_time(); - } - - TAILQ_INSERT_TAIL(&bucket->list, p, p_memstat_list); - bucket->count++; + SET_ACTIVE_LIMITS_LOCKED(p, memlimit_active, memlimit_active_is_fatal); - memorystatus_list_count++; + /* + * Initialize the inactive limit variants for this process. + */ + SET_INACTIVE_LIMITS_LOCKED(p, memlimit_inactive, memlimit_inactive_is_fatal); - memorystatus_check_levels_locked(); + /* + * Initialize the cached limits for target process. + * When the target process is dirty tracked, it's typically + * in a clean state. Non dirty tracked processes are + * typically active (Foreground or above). + * But just in case, we don't make assumptions... + */ -exit: - if (!locked) { - proc_list_unlock(); + if (proc_jetsam_state_is_active_locked(p) == TRUE) { + CACHE_ACTIVE_LIMITS_LOCKED(p, is_fatal); + use_active = TRUE; + } else { + CACHE_INACTIVE_LIMITS_LOCKED(p, is_fatal); + use_active = FALSE; + } + + /* + * Enforce the cached limit by writing to the ledger. + */ + if (memorystatus_highwater_enabled) { + /* apply now */ + task_set_phys_footprint_limit_internal(p->task, ((p->p_memstat_memlimit > 0) ? p->p_memstat_memlimit : -1), NULL, use_active, is_fatal); + + MEMORYSTATUS_DEBUG(3, "memorystatus_update: init: limit on pid %d (%dMB %s) targeting priority(%d) dirty?=0x%x %s\n", + p->p_pid, (p->p_memstat_memlimit > 0 ? p->p_memstat_memlimit : -1), + (p->p_memstat_state & P_MEMSTAT_FATAL_MEMLIMIT ? "F " : "NF"), priority, p->p_memstat_dirty, + (p->p_memstat_dirty ? ((p->p_memstat_dirty & P_DIRTY) ? "isdirty" : "isclean") : "")); + } } - return 0; + /* + * We can't add to the aging bands buckets here. + * But, we could be removing it from those buckets. + * Check and take appropriate steps if so. + */ + + if (isProcessInAgingBands(p)) { + if ((jetsam_aging_policy != kJetsamAgingPolicyLegacy) && isApp(p) && (priority > applications_aging_band)) { + /* + * Runningboardd is pulling up an application that is in the aging band. + * We reset the app's state here so that it'll get a fresh stay in the + * aging band on the way back. + * + * We always handled the app 'aging' in the memorystatus_update_priority_locked() + * function. Daemons used to be handled via the dirty 'set/clear/track' path. + * But with extensions (daemon-app hybrid), runningboardd is now going through + * this routine for daemons too and things have gotten a bit tangled. This should + * be simplified/untangled at some point and might require some assistance from + * runningboardd. + */ + memorystatus_invalidate_idle_demotion_locked(p, TRUE); + } else { + memorystatus_invalidate_idle_demotion_locked(p, FALSE); + } + memorystatus_update_priority_locked(p, JETSAM_PRIORITY_IDLE, FALSE, TRUE); + } else { + if (jetsam_aging_policy == kJetsamAgingPolicyLegacy && priority == JETSAM_PRIORITY_IDLE) { + /* + * Daemons with 'inactive' limits will go through the dirty tracking codepath. + * This path deals with apps that may have 'inactive' limits e.g. WebContent processes. + * If this is the legacy aging policy we explicitly need to apply those limits. If it + * is any other aging policy, then we don't need to worry because all processes + * will go through the aging bands and then the demotion thread will take care to + * move them into the IDLE band and apply the required limits. + */ + memorystatus_update_priority_locked(p, priority, head_insert, TRUE); + } + } + + memorystatus_update_priority_locked(p, priority, head_insert, FALSE); + + proc_list_unlock(); + ret = 0; + +out: + KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_UPDATE) | DBG_FUNC_END, ret, 0, 0, 0, 0); + + return ret; } -/* - * Description: - * Moves a process from one jetsam bucket to another. - * which changes the LRU position of the process. - * - * Monitors transition between buckets and if necessary - * will update cached memory limits accordingly. - * - * skip_demotion_check: - * - if the 'jetsam aging policy' is NOT 'legacy': - * When this flag is TRUE, it means we are going - * to age the ripe processes out of the aging bands and into the - * IDLE band and apply their inactive memory limits. - * - * - if the 'jetsam aging policy' is 'legacy': - * When this flag is TRUE, it might mean the above aging mechanism - * OR - * It might be that we have a process that has used up its 'idle deferral' - * stay that is given to it once per lifetime. And in this case, the process - * won't be going through any aging codepaths. But we still need to apply - * the right inactive limits and so we explicitly set this to TRUE if the - * new priority for the process is the IDLE band. - */ -void -memorystatus_update_priority_locked(proc_t p, int priority, boolean_t head_insert, boolean_t skip_demotion_check) +int +memorystatus_remove(proc_t p) { - memstat_bucket_t *old_bucket, *new_bucket; + int ret; + memstat_bucket_t *bucket; + boolean_t reschedule = FALSE; - assert(priority < MEMSTAT_BUCKET_COUNT); + MEMORYSTATUS_DEBUG(1, "memorystatus_list_remove: removing pid %d\n", p->p_pid); - /* Ensure that exit isn't underway, leaving the proc retained but removed from its bucket */ - if ((p->p_listflag & P_LIST_EXITED) != 0) { - return; + /* + * Check if this proc is locked (because we're performing a freeze). + * If so, we fail and instruct the caller to try again later. + */ + if (p->p_memstat_state & P_MEMSTAT_LOCKED) { + return EAGAIN; } - MEMORYSTATUS_DEBUG(1, "memorystatus_update_priority_locked(): setting %s(%d) to priority %d, inserting at %s\n", - (*p->p_name ? p->p_name : "unknown"), p->p_pid, priority, head_insert ? "head" : "tail"); + assert(!(p->p_memstat_state & P_MEMSTAT_INTERNAL)); - DTRACE_MEMORYSTATUS3(memorystatus_update_priority, proc_t, p, int32_t, p->p_memstat_effectivepriority, int, priority); + bucket = &memstat_bucket[p->p_memstat_effectivepriority]; -#if DEVELOPMENT || DEBUG - if (priority == JETSAM_PRIORITY_IDLE && /* if the process is on its way into the IDLE band */ - skip_demotion_check == FALSE && /* and it isn't via the path that will set the INACTIVE memlimits */ - (p->p_memstat_dirty & P_DIRTY_TRACK) && /* and it has 'DIRTY' tracking enabled */ - ((p->p_memstat_memlimit != p->p_memstat_memlimit_inactive) || /* and we notice that the current limit isn't the right value (inactive) */ - ((p->p_memstat_state & P_MEMSTAT_MEMLIMIT_INACTIVE_FATAL) ? (!(p->p_memstat_state & P_MEMSTAT_FATAL_MEMLIMIT)) : (p->p_memstat_state & P_MEMSTAT_FATAL_MEMLIMIT)))) { /* OR type (fatal vs non-fatal) */ - panic("memorystatus_update_priority_locked: on %s with 0x%x, prio: %d and %d\n", p->p_name, p->p_memstat_state, priority, p->p_memstat_memlimit); /* then we must catch this */ + if (isSysProc(p) && system_procs_aging_band && (p->p_memstat_effectivepriority == system_procs_aging_band)) { + assert(bucket->count == memorystatus_scheduled_idle_demotions_sysprocs); + reschedule = TRUE; + } else if (isApp(p) && applications_aging_band && (p->p_memstat_effectivepriority == applications_aging_band)) { + assert(bucket->count == memorystatus_scheduled_idle_demotions_apps); + reschedule = TRUE; } -#endif /* DEVELOPMENT || DEBUG */ - old_bucket = &memstat_bucket[p->p_memstat_effectivepriority]; + /* + * Record idle delta + */ - if (skip_demotion_check == FALSE) { - if (isSysProc(p)) { - /* - * For system processes, the memorystatus_dirty_* routines take care of adding/removing - * the processes from the aging bands and balancing the demotion counts. - * We can, however, override that if the process has an 'elevated inactive jetsam band' attribute. - */ + if (p->p_memstat_effectivepriority == JETSAM_PRIORITY_IDLE) { + uint64_t now = mach_absolute_time(); + if (now > p->p_memstat_idle_start) { + p->p_memstat_idle_delta = now - p->p_memstat_idle_start; + } + } - if (p->p_memstat_state & P_MEMSTAT_USE_ELEVATED_INACTIVE_BAND) { - /* - * 2 types of processes can use the non-standard elevated inactive band: - * - Frozen processes that always land in memorystatus_freeze_jetsam_band - * OR - * - processes that specifically opt-in to the elevated inactive support e.g. docked processes. - */ -#if CONFIG_FREEZE - if (p->p_memstat_state & P_MEMSTAT_FROZEN) { - if (priority <= memorystatus_freeze_jetsam_band) { - priority = memorystatus_freeze_jetsam_band; - } - } else -#endif /* CONFIG_FREEZE */ - { - if (priority <= JETSAM_PRIORITY_ELEVATED_INACTIVE) { - priority = JETSAM_PRIORITY_ELEVATED_INACTIVE; - } - } - assert(!(p->p_memstat_dirty & P_DIRTY_AGING_IN_PROGRESS)); - } - } else if (isApp(p)) { - /* - * Check to see if the application is being lowered in jetsam priority. If so, and: - * - it has an 'elevated inactive jetsam band' attribute, then put it in the appropriate band. - * - it is a normal application, then let it age in the aging band if that policy is in effect. - */ + TAILQ_REMOVE(&bucket->list, p, p_memstat_list); + bucket->count--; + if (p->p_memstat_relaunch_flags & (P_MEMSTAT_RELAUNCH_HIGH)) { + bucket->relaunch_high_count--; + } - if (p->p_memstat_state & P_MEMSTAT_USE_ELEVATED_INACTIVE_BAND) { -#if CONFIG_FREEZE - if (p->p_memstat_state & P_MEMSTAT_FROZEN) { - if (priority <= memorystatus_freeze_jetsam_band) { - priority = memorystatus_freeze_jetsam_band; - } - } else -#endif /* CONFIG_FREEZE */ - { - if (priority <= JETSAM_PRIORITY_ELEVATED_INACTIVE) { - priority = JETSAM_PRIORITY_ELEVATED_INACTIVE; - } - } - } else { - if (applications_aging_band) { - if (p->p_memstat_effectivepriority == applications_aging_band) { - assert(old_bucket->count == (memorystatus_scheduled_idle_demotions_apps + 1)); - } + memorystatus_list_count--; - if ((jetsam_aging_policy != kJetsamAgingPolicyLegacy) && (priority <= applications_aging_band)) { - assert(!(p->p_memstat_dirty & P_DIRTY_AGING_IN_PROGRESS)); - priority = applications_aging_band; - memorystatus_schedule_idle_demotion_locked(p, TRUE); - } - } - } - } + /* If awaiting demotion to the idle band, clean up */ + if (reschedule) { + memorystatus_invalidate_idle_demotion_locked(p, TRUE); + memorystatus_reschedule_idle_demotion_locked(); } - if ((system_procs_aging_band && (priority == system_procs_aging_band)) || (applications_aging_band && (priority == applications_aging_band))) { - assert(p->p_memstat_dirty & P_DIRTY_AGING_IN_PROGRESS); + memorystatus_check_levels_locked(); + +#if CONFIG_FREEZE + if (p->p_memstat_state & (P_MEMSTAT_FROZEN)) { + if (p->p_memstat_state & P_MEMSTAT_REFREEZE_ELIGIBLE) { + p->p_memstat_state &= ~P_MEMSTAT_REFREEZE_ELIGIBLE; + memorystatus_refreeze_eligible_count--; + } + + memorystatus_frozen_count--; + memorystatus_frozen_shared_mb -= p->p_memstat_freeze_sharedanon_pages; + p->p_memstat_freeze_sharedanon_pages = 0; } - TAILQ_REMOVE(&old_bucket->list, p, p_memstat_list); - old_bucket->count--; + if (p->p_memstat_state & P_MEMSTAT_SUSPENDED) { + memorystatus_suspended_count--; + } +#endif - new_bucket = &memstat_bucket[priority]; - if (head_insert) { - TAILQ_INSERT_HEAD(&new_bucket->list, p, p_memstat_list); + if (p) { + ret = 0; } else { - TAILQ_INSERT_TAIL(&new_bucket->list, p, p_memstat_list); + ret = ESRCH; } - new_bucket->count++; - - if (memorystatus_highwater_enabled) { - boolean_t is_fatal; - boolean_t use_active; - /* - * If cached limit data is updated, then the limits - * will be enforced by writing to the ledgers. - */ - boolean_t ledger_update_needed = TRUE; + return ret; +} - /* - * Here, we must update the cached memory limit if the task - * is transitioning between: - * active <--> inactive - * FG <--> BG - * but: - * dirty <--> clean is ignored - * - * We bypass non-idle processes that have opted into dirty tracking because - * a move between buckets does not imply a transition between the - * dirty <--> clean state. - */ +/* + * Validate dirty tracking flags with process state. + * + * Return: + * 0 on success + * non-0 on failure + * + * The proc_list_lock is held by the caller. + */ - if (p->p_memstat_dirty & P_DIRTY_TRACK) { - if (skip_demotion_check == TRUE && priority == JETSAM_PRIORITY_IDLE) { - CACHE_INACTIVE_LIMITS_LOCKED(p, is_fatal); - use_active = FALSE; - } else { - ledger_update_needed = FALSE; - } - } else if ((priority >= JETSAM_PRIORITY_FOREGROUND) && (p->p_memstat_effectivepriority < JETSAM_PRIORITY_FOREGROUND)) { - /* - * inactive --> active - * BG --> FG - * assign active state - */ - CACHE_ACTIVE_LIMITS_LOCKED(p, is_fatal); - use_active = TRUE; - } else if ((priority < JETSAM_PRIORITY_FOREGROUND) && (p->p_memstat_effectivepriority >= JETSAM_PRIORITY_FOREGROUND)) { - /* - * active --> inactive - * FG --> BG - * assign inactive state - */ - CACHE_INACTIVE_LIMITS_LOCKED(p, is_fatal); - use_active = FALSE; - } else { - /* - * The transition between jetsam priority buckets apparently did - * not affect active/inactive state. - * This is not unusual... especially during startup when - * processes are getting established in their respective bands. - */ - ledger_update_needed = FALSE; - } +static int +memorystatus_validate_track_flags(struct proc *target_p, uint32_t pcontrol) +{ + /* See that the process isn't marked for termination */ + if (target_p->p_memstat_dirty & P_DIRTY_TERMINATED) { + return EBUSY; + } - /* - * Enforce the new limits by writing to the ledger - */ - if (ledger_update_needed) { - task_set_phys_footprint_limit_internal(p->task, (p->p_memstat_memlimit > 0) ? p->p_memstat_memlimit : -1, NULL, use_active, is_fatal); + /* Idle exit requires that process be tracked */ + if ((pcontrol & PROC_DIRTY_ALLOW_IDLE_EXIT) && + !(pcontrol & PROC_DIRTY_TRACK)) { + return EINVAL; + } - MEMORYSTATUS_DEBUG(3, "memorystatus_update_priority_locked: new limit on pid %d (%dMB %s) priority old --> new (%d --> %d) dirty?=0x%x %s\n", - p->p_pid, (p->p_memstat_memlimit > 0 ? p->p_memstat_memlimit : -1), - (p->p_memstat_state & P_MEMSTAT_FATAL_MEMLIMIT ? "F " : "NF"), p->p_memstat_effectivepriority, priority, p->p_memstat_dirty, - (p->p_memstat_dirty ? ((p->p_memstat_dirty & P_DIRTY) ? "isdirty" : "isclean") : "")); - } + /* 'Launch in progress' tracking requires that process have enabled dirty tracking too. */ + if ((pcontrol & PROC_DIRTY_LAUNCH_IN_PROGRESS) && + !(pcontrol & PROC_DIRTY_TRACK)) { + return EINVAL; } - /* - * Record idle start or idle delta. - */ - if (p->p_memstat_effectivepriority == priority) { - /* - * This process is not transitioning between - * jetsam priority buckets. Do nothing. - */ - } else if (p->p_memstat_effectivepriority == JETSAM_PRIORITY_IDLE) { - uint64_t now; - /* - * Transitioning out of the idle priority bucket. - * Record idle delta. - */ - assert(p->p_memstat_idle_start != 0); - now = mach_absolute_time(); - if (now > p->p_memstat_idle_start) { - p->p_memstat_idle_delta = now - p->p_memstat_idle_start; - } + /* Only one type of DEFER behavior is allowed.*/ + if ((pcontrol & PROC_DIRTY_DEFER) && + (pcontrol & PROC_DIRTY_DEFER_ALWAYS)) { + return EINVAL; + } + + /* Deferral is only relevant if idle exit is specified */ + if (((pcontrol & PROC_DIRTY_DEFER) || + (pcontrol & PROC_DIRTY_DEFER_ALWAYS)) && + !(pcontrol & PROC_DIRTY_ALLOWS_IDLE_EXIT)) { + return EINVAL; + } + + return 0; +} + +static void +memorystatus_update_idle_priority_locked(proc_t p) +{ + int32_t priority; + + MEMORYSTATUS_DEBUG(1, "memorystatus_update_idle_priority_locked(): pid %d dirty 0x%X\n", p->p_pid, p->p_memstat_dirty); + + assert(isSysProc(p)); + + if ((p->p_memstat_dirty & (P_DIRTY_IDLE_EXIT_ENABLED | P_DIRTY_IS_DIRTY)) == P_DIRTY_IDLE_EXIT_ENABLED) { + priority = (p->p_memstat_dirty & P_DIRTY_AGING_IN_PROGRESS) ? system_procs_aging_band : JETSAM_PRIORITY_IDLE; + } else { + priority = p->p_memstat_requestedpriority; + } + if (p->p_memstat_state & P_MEMSTAT_PRIORITY_ASSERTION) { /* - * About to become active and so memory footprint could change. - * So mark it eligible for freeze-considerations next time around. + * This process has a jetsam priority managed by an assertion. + * Policy is to choose the max priority. */ - if (p->p_memstat_state & P_MEMSTAT_FREEZE_IGNORE) { - p->p_memstat_state &= ~P_MEMSTAT_FREEZE_IGNORE; + if (p->p_memstat_assertionpriority > priority) { + os_log(OS_LOG_DEFAULT, "memorystatus: assertion priority %d overrides priority %d for %s:%d\n", + p->p_memstat_assertionpriority, priority, + (*p->p_name ? p->p_name : "unknown"), p->p_pid); + priority = p->p_memstat_assertionpriority; } - } else if (priority == JETSAM_PRIORITY_IDLE) { - /* - * Transitioning into the idle priority bucket. - * Record idle start. - */ - p->p_memstat_idle_start = mach_absolute_time(); } - KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_CHANGE_PRIORITY), p->p_pid, priority, p->p_memstat_effectivepriority, 0, 0); + if (priority != p->p_memstat_effectivepriority) { + if ((jetsam_aging_policy == kJetsamAgingPolicyLegacy) && + (priority == JETSAM_PRIORITY_IDLE)) { + /* + * This process is on its way into the IDLE band. The system is + * using 'legacy' jetsam aging policy. That means, this process + * has already used up its idle-deferral aging time that is given + * once per its lifetime. So we need to set the INACTIVE limits + * explicitly because it won't be going through the demotion paths + * that take care to apply the limits appropriately. + */ - p->p_memstat_effectivepriority = priority; + if (p->p_memstat_state & P_MEMSTAT_USE_ELEVATED_INACTIVE_BAND) { + /* + * This process has the 'elevated inactive jetsam band' attribute. + * So, there will be no trip to IDLE after all. + * Instead, we pin the process in the elevated band, + * where its ACTIVE limits will apply. + */ -#if CONFIG_SECLUDED_MEMORY - if (secluded_for_apps && - task_could_use_secluded_mem(p->task)) { - task_set_can_use_secluded_mem( - p->task, - (priority >= JETSAM_PRIORITY_FOREGROUND)); - } -#endif /* CONFIG_SECLUDED_MEMORY */ + priority = JETSAM_PRIORITY_ELEVATED_INACTIVE; + } - memorystatus_check_levels_locked(); + memorystatus_update_priority_locked(p, priority, false, true); + } else { + memorystatus_update_priority_locked(p, priority, false, false); + } + } } /* + * Processes can opt to have their state tracked by the kernel, indicating when they are busy (dirty) or idle + * (clean). They may also indicate that they support termination when idle, with the result that they are promoted + * to their desired, higher, jetsam priority when dirty (and are therefore killed later), and demoted to the low + * priority idle band when clean (and killed earlier, protecting higher priority procesess). * - * Description: Update the jetsam priority and memory limit attributes for a given process. - * - * Parameters: - * p init this process's jetsam information. - * priority The jetsam priority band - * user_data user specific data, unused by the kernel - * effective guards against race if process's update already occurred - * update_memlimit When true we know this is the init step via the posix_spawn path. - * - * memlimit_active Value in megabytes; The monitored footprint level while the - * process is active. Exceeding it may result in termination - * based on it's associated fatal flag. - * - * memlimit_active_is_fatal When a process is active and exceeds its memory footprint, - * this describes whether or not it should be immediately fatal. - * - * memlimit_inactive Value in megabytes; The monitored footprint level while the - * process is inactive. Exceeding it may result in termination - * based on it's associated fatal flag. - * - * memlimit_inactive_is_fatal When a process is inactive and exceeds its memory footprint, - * this describes whether or not it should be immediatly fatal. + * If the deferral flag is set, then newly tracked processes will be protected for an initial period (as determined by + * memorystatus_sysprocs_idle_delay_time); if they go clean during this time, then they will be moved to a deferred-idle band + * with a slightly higher priority, guarding against immediate termination under memory pressure and being unable to + * make forward progress. Finally, when the guard expires, they will be moved to the standard, lowest-priority, idle + * band. The deferral can be cleared early by clearing the appropriate flag. * - * Returns: 0 Success - * non-0 Failure + * The deferral timer is active only for the duration that the process is marked as guarded and clean; if the process + * is marked dirty, the timer will be cancelled. Upon being subsequently marked clean, the deferment will either be + * re-enabled or the guard state cleared, depending on whether the guard deadline has passed. */ int -memorystatus_update(proc_t p, int priority, uint64_t user_data, boolean_t effective, boolean_t update_memlimit, - int32_t memlimit_active, boolean_t memlimit_active_is_fatal, - int32_t memlimit_inactive, boolean_t memlimit_inactive_is_fatal) +memorystatus_dirty_track(proc_t p, uint32_t pcontrol) { - int ret; - boolean_t head_insert = false; - - MEMORYSTATUS_DEBUG(1, "memorystatus_update: changing (%s) pid %d: priority %d, user_data 0x%llx\n", (*p->p_name ? p->p_name : "unknown"), p->p_pid, priority, user_data); - - KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_UPDATE) | DBG_FUNC_START, p->p_pid, priority, user_data, effective, 0); + unsigned int old_dirty; + boolean_t reschedule = FALSE; + boolean_t already_deferred = FALSE; + boolean_t defer_now = FALSE; + int ret = 0; - if (priority == -1) { - /* Use as shorthand for default priority */ - priority = JETSAM_PRIORITY_DEFAULT; - } else if ((priority == system_procs_aging_band) || (priority == applications_aging_band)) { - /* Both the aging bands are reserved for internal use; if requested, adjust to JETSAM_PRIORITY_IDLE. */ - priority = JETSAM_PRIORITY_IDLE; - } else if (priority == JETSAM_PRIORITY_IDLE_HEAD) { - /* JETSAM_PRIORITY_IDLE_HEAD inserts at the head of the idle queue */ - priority = JETSAM_PRIORITY_IDLE; - head_insert = TRUE; - } else if ((priority < 0) || (priority >= MEMSTAT_BUCKET_COUNT)) { - /* Sanity check */ - ret = EINVAL; - goto out; - } + KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_DIRTY_TRACK), + p->p_pid, p->p_memstat_dirty, pcontrol, 0, 0); proc_list_lock(); - assert(!(p->p_memstat_state & P_MEMSTAT_INTERNAL)); - - if (effective && (p->p_memstat_state & P_MEMSTAT_PRIORITYUPDATED)) { - ret = EALREADY; - proc_list_unlock(); - MEMORYSTATUS_DEBUG(1, "memorystatus_update: effective change specified for pid %d, but change already occurred.\n", p->p_pid); - goto out; - } - - if ((p->p_memstat_state & P_MEMSTAT_TERMINATED) || ((p->p_listflag & P_LIST_EXITED) != 0)) { + if ((p->p_listflag & P_LIST_EXITED) != 0) { /* - * This could happen when a process calling posix_spawn() is exiting on the jetsam thread. + * Process is on its way out. */ ret = EBUSY; - proc_list_unlock(); - goto out; + goto exit; } - p->p_memstat_state |= P_MEMSTAT_PRIORITYUPDATED; - p->p_memstat_userdata = user_data; - p->p_memstat_requestedpriority = priority; - - if (update_memlimit) { - boolean_t is_fatal; - boolean_t use_active; + if (p->p_memstat_state & P_MEMSTAT_INTERNAL) { + ret = EPERM; + goto exit; + } - /* - * Posix_spawn'd processes come through this path to instantiate ledger limits. - * Forked processes do not come through this path, so no ledger limits exist. - * (That's why forked processes can consume unlimited memory.) - */ + if ((ret = memorystatus_validate_track_flags(p, pcontrol)) != 0) { + /* error */ + goto exit; + } - MEMORYSTATUS_DEBUG(3, "memorystatus_update(enter): pid %d, priority %d, dirty=0x%x, Active(%dMB %s), Inactive(%dMB, %s)\n", - p->p_pid, priority, p->p_memstat_dirty, - memlimit_active, (memlimit_active_is_fatal ? "F " : "NF"), - memlimit_inactive, (memlimit_inactive_is_fatal ? "F " : "NF")); + old_dirty = p->p_memstat_dirty; - if (memlimit_active <= 0) { - /* - * This process will have a system_wide task limit when active. - * System_wide task limit is always fatal. - * It's quite common to see non-fatal flag passed in here. - * It's not an error, we just ignore it. - */ + /* These bits are cumulative, as per */ + if (pcontrol & PROC_DIRTY_TRACK) { + p->p_memstat_dirty |= P_DIRTY_TRACK; + } - /* - * For backward compatibility with some unexplained launchd behavior, - * we allow a zero sized limit. But we still enforce system_wide limit - * when written to the ledgers. - */ + if (pcontrol & PROC_DIRTY_ALLOW_IDLE_EXIT) { + p->p_memstat_dirty |= P_DIRTY_ALLOW_IDLE_EXIT; + } - if (memlimit_active < 0) { - memlimit_active = -1; /* enforces system_wide task limit */ - } - memlimit_active_is_fatal = TRUE; - } + if (pcontrol & PROC_DIRTY_LAUNCH_IN_PROGRESS) { + p->p_memstat_dirty |= P_DIRTY_LAUNCH_IN_PROGRESS; + } - if (memlimit_inactive <= 0) { - /* - * This process will have a system_wide task limit when inactive. - * System_wide task limit is always fatal. - */ + if (old_dirty & P_DIRTY_AGING_IN_PROGRESS) { + already_deferred = TRUE; + } - memlimit_inactive = -1; - memlimit_inactive_is_fatal = TRUE; + + /* This can be set and cleared exactly once. */ + if (pcontrol & (PROC_DIRTY_DEFER | PROC_DIRTY_DEFER_ALWAYS)) { + if ((pcontrol & (PROC_DIRTY_DEFER)) && + !(old_dirty & P_DIRTY_DEFER)) { + p->p_memstat_dirty |= P_DIRTY_DEFER; } - /* - * Initialize the active limit variants for this process. - */ - SET_ACTIVE_LIMITS_LOCKED(p, memlimit_active, memlimit_active_is_fatal); + if ((pcontrol & (PROC_DIRTY_DEFER_ALWAYS)) && + !(old_dirty & P_DIRTY_DEFER_ALWAYS)) { + p->p_memstat_dirty |= P_DIRTY_DEFER_ALWAYS; + } - /* - * Initialize the inactive limit variants for this process. - */ - SET_INACTIVE_LIMITS_LOCKED(p, memlimit_inactive, memlimit_inactive_is_fatal); + defer_now = TRUE; + } - /* - * Initialize the cached limits for target process. - * When the target process is dirty tracked, it's typically - * in a clean state. Non dirty tracked processes are - * typically active (Foreground or above). - * But just in case, we don't make assumptions... - */ + MEMORYSTATUS_DEBUG(1, "memorystatus_on_track_dirty(): set idle-exit %s / defer %s / dirty %s for pid %d\n", + ((p->p_memstat_dirty & P_DIRTY_IDLE_EXIT_ENABLED) == P_DIRTY_IDLE_EXIT_ENABLED) ? "Y" : "N", + defer_now ? "Y" : "N", + p->p_memstat_dirty & P_DIRTY ? "Y" : "N", + p->p_pid); - if (proc_jetsam_state_is_active_locked(p) == TRUE) { - CACHE_ACTIVE_LIMITS_LOCKED(p, is_fatal); - use_active = TRUE; - } else { - CACHE_INACTIVE_LIMITS_LOCKED(p, is_fatal); - use_active = FALSE; - } + /* Kick off or invalidate the idle exit deferment if there's a state transition. */ + if (!(p->p_memstat_dirty & P_DIRTY_IS_DIRTY)) { + if ((p->p_memstat_dirty & P_DIRTY_IDLE_EXIT_ENABLED) == P_DIRTY_IDLE_EXIT_ENABLED) { + if (defer_now && !already_deferred) { + /* + * Request to defer a clean process that's idle-exit enabled + * and not already in the jetsam deferred band. Most likely a + * new launch. + */ + memorystatus_schedule_idle_demotion_locked(p, TRUE); + reschedule = TRUE; + } else if (!defer_now) { + /* + * The process isn't asking for the 'aging' facility. + * Could be that it is: + */ + + if (already_deferred) { + /* + * already in the aging bands. Traditionally, + * some processes have tried to use this to + * opt out of the 'aging' facility. + */ + + memorystatus_invalidate_idle_demotion_locked(p, TRUE); + } else { + /* + * agnostic to the 'aging' facility. In that case, + * we'll go ahead and opt it in because this is likely + * a new launch (clean process, dirty tracking enabled) + */ + + memorystatus_schedule_idle_demotion_locked(p, TRUE); + } + reschedule = TRUE; + } + } + } else { /* - * Enforce the cached limit by writing to the ledger. + * We are trying to operate on a dirty process. Dirty processes have to + * be removed from the deferred band. The question is do we reset the + * deferred state or not? + * + * This could be a legal request like: + * - this process had opted into the 'aging' band + * - but it's now dirty and requests to opt out. + * In this case, we remove the process from the band and reset its + * state too. It'll opt back in properly when needed. + * + * OR, this request could be a user-space bug. E.g.: + * - this process had opted into the 'aging' band when clean + * - and, then issues another request to again put it into the band except + * this time the process is dirty. + * The process going dirty, as a transition in memorystatus_dirty_set(), will pull the process out of + * the deferred band with its state intact. So our request below is no-op. + * But we do it here anyways for coverage. + * + * memorystatus_update_idle_priority_locked() + * single-mindedly treats a dirty process as "cannot be in the aging band". */ - if (memorystatus_highwater_enabled) { - /* apply now */ - task_set_phys_footprint_limit_internal(p->task, ((p->p_memstat_memlimit > 0) ? p->p_memstat_memlimit : -1), NULL, use_active, is_fatal); - MEMORYSTATUS_DEBUG(3, "memorystatus_update: init: limit on pid %d (%dMB %s) targeting priority(%d) dirty?=0x%x %s\n", - p->p_pid, (p->p_memstat_memlimit > 0 ? p->p_memstat_memlimit : -1), - (p->p_memstat_state & P_MEMSTAT_FATAL_MEMLIMIT ? "F " : "NF"), priority, p->p_memstat_dirty, - (p->p_memstat_dirty ? ((p->p_memstat_dirty & P_DIRTY) ? "isdirty" : "isclean") : "")); + if (!defer_now && already_deferred) { + memorystatus_invalidate_idle_demotion_locked(p, TRUE); + reschedule = TRUE; + } else { + boolean_t reset_state = (jetsam_aging_policy != kJetsamAgingPolicyLegacy) ? TRUE : FALSE; + + memorystatus_invalidate_idle_demotion_locked(p, reset_state); + reschedule = TRUE; } } - /* - * We can't add to the aging bands buckets here. - * But, we could be removing it from those buckets. - * Check and take appropriate steps if so. - */ + memorystatus_update_idle_priority_locked(p); - if (isProcessInAgingBands(p)) { - memorystatus_invalidate_idle_demotion_locked(p, TRUE); - memorystatus_update_priority_locked(p, JETSAM_PRIORITY_IDLE, FALSE, TRUE); - } else { - if (jetsam_aging_policy == kJetsamAgingPolicyLegacy && priority == JETSAM_PRIORITY_IDLE) { - /* - * Daemons with 'inactive' limits will go through the dirty tracking codepath. - * This path deals with apps that may have 'inactive' limits e.g. WebContent processes. - * If this is the legacy aging policy we explicitly need to apply those limits. If it - * is any other aging policy, then we don't need to worry because all processes - * will go through the aging bands and then the demotion thread will take care to - * move them into the IDLE band and apply the required limits. - */ - memorystatus_update_priority_locked(p, priority, head_insert, TRUE); - } + if (reschedule) { + memorystatus_reschedule_idle_demotion_locked(); } - memorystatus_update_priority_locked(p, priority, head_insert, FALSE); - - proc_list_unlock(); ret = 0; -out: - KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_UPDATE) | DBG_FUNC_END, ret, 0, 0, 0, 0); +exit: + proc_list_unlock(); return ret; } int -memorystatus_remove(proc_t p, boolean_t locked) +memorystatus_dirty_set(proc_t p, boolean_t self, uint32_t pcontrol) { int ret; - memstat_bucket_t *bucket; - boolean_t reschedule = FALSE; - - MEMORYSTATUS_DEBUG(1, "memorystatus_list_remove: removing pid %d\n", p->p_pid); + boolean_t kill = false; + boolean_t reschedule = FALSE; + boolean_t was_dirty = FALSE; + boolean_t now_dirty = FALSE; - if (!locked) { - proc_list_lock(); - } + MEMORYSTATUS_DEBUG(1, "memorystatus_dirty_set(): %d %d 0x%x 0x%x\n", self, p->p_pid, pcontrol, p->p_memstat_dirty); + KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_DIRTY_SET), p->p_pid, self, pcontrol, 0, 0); - assert(!(p->p_memstat_state & P_MEMSTAT_INTERNAL)); + proc_list_lock(); - bucket = &memstat_bucket[p->p_memstat_effectivepriority]; + if ((p->p_listflag & P_LIST_EXITED) != 0) { + /* + * Process is on its way out. + */ + ret = EBUSY; + goto exit; + } - if (isSysProc(p) && system_procs_aging_band && (p->p_memstat_effectivepriority == system_procs_aging_band)) { - assert(bucket->count == memorystatus_scheduled_idle_demotions_sysprocs); - reschedule = TRUE; - } else if (isApp(p) && applications_aging_band && (p->p_memstat_effectivepriority == applications_aging_band)) { - assert(bucket->count == memorystatus_scheduled_idle_demotions_apps); - reschedule = TRUE; + if (p->p_memstat_state & P_MEMSTAT_INTERNAL) { + ret = EPERM; + goto exit; } - /* - * Record idle delta - */ + if (p->p_memstat_dirty & P_DIRTY_IS_DIRTY) { + was_dirty = TRUE; + } - if (p->p_memstat_effectivepriority == JETSAM_PRIORITY_IDLE) { - uint64_t now = mach_absolute_time(); - if (now > p->p_memstat_idle_start) { - p->p_memstat_idle_delta = now - p->p_memstat_idle_start; + if (!(p->p_memstat_dirty & P_DIRTY_TRACK)) { + /* Dirty tracking not enabled */ + ret = EINVAL; + } else if (pcontrol && (p->p_memstat_dirty & P_DIRTY_TERMINATED)) { + /* + * Process is set to be terminated and we're attempting to mark it dirty. + * Set for termination and marking as clean is OK - see . + */ + ret = EBUSY; + } else { + int flag = (self == TRUE) ? P_DIRTY : P_DIRTY_SHUTDOWN; + if (pcontrol && !(p->p_memstat_dirty & flag)) { + /* Mark the process as having been dirtied at some point */ + p->p_memstat_dirty |= (flag | P_DIRTY_MARKED); + memorystatus_dirty_count++; + ret = 0; + } else if ((pcontrol == 0) && (p->p_memstat_dirty & flag)) { + if ((flag == P_DIRTY_SHUTDOWN) && (!(p->p_memstat_dirty & P_DIRTY))) { + /* Clearing the dirty shutdown flag, and the process is otherwise clean - kill */ + p->p_memstat_dirty |= P_DIRTY_TERMINATED; + kill = true; + } else if ((flag == P_DIRTY) && (p->p_memstat_dirty & P_DIRTY_TERMINATED)) { + /* Kill previously terminated processes if set clean */ + kill = true; + } + p->p_memstat_dirty &= ~flag; + memorystatus_dirty_count--; + ret = 0; + } else { + /* Already set */ + ret = EALREADY; } } - TAILQ_REMOVE(&bucket->list, p, p_memstat_list); - bucket->count--; - - memorystatus_list_count--; - - /* If awaiting demotion to the idle band, clean up */ - if (reschedule) { - memorystatus_invalidate_idle_demotion_locked(p, TRUE); - memorystatus_reschedule_idle_demotion_locked(); - } - - memorystatus_check_levels_locked(); - -#if CONFIG_FREEZE - if (p->p_memstat_state & (P_MEMSTAT_FROZEN)) { - if (p->p_memstat_state & P_MEMSTAT_REFREEZE_ELIGIBLE) { - p->p_memstat_state &= ~P_MEMSTAT_REFREEZE_ELIGIBLE; - memorystatus_refreeze_eligible_count--; - } - - memorystatus_frozen_count--; - memorystatus_frozen_shared_mb -= p->p_memstat_freeze_sharedanon_pages; - p->p_memstat_freeze_sharedanon_pages = 0; - } - - if (p->p_memstat_state & P_MEMSTAT_SUSPENDED) { - memorystatus_suspended_count--; - } -#endif - - if (!locked) { - proc_list_unlock(); - } - - if (p) { - ret = 0; - } else { - ret = ESRCH; - } - - return ret; -} - -/* - * Validate dirty tracking flags with process state. - * - * Return: - * 0 on success - * non-0 on failure - * - * The proc_list_lock is held by the caller. - */ - -static int -memorystatus_validate_track_flags(struct proc *target_p, uint32_t pcontrol) -{ - /* See that the process isn't marked for termination */ - if (target_p->p_memstat_dirty & P_DIRTY_TERMINATED) { - return EBUSY; - } - - /* Idle exit requires that process be tracked */ - if ((pcontrol & PROC_DIRTY_ALLOW_IDLE_EXIT) && - !(pcontrol & PROC_DIRTY_TRACK)) { - return EINVAL; - } - - /* 'Launch in progress' tracking requires that process have enabled dirty tracking too. */ - if ((pcontrol & PROC_DIRTY_LAUNCH_IN_PROGRESS) && - !(pcontrol & PROC_DIRTY_TRACK)) { - return EINVAL; - } - - /* Only one type of DEFER behavior is allowed.*/ - if ((pcontrol & PROC_DIRTY_DEFER) && - (pcontrol & PROC_DIRTY_DEFER_ALWAYS)) { - return EINVAL; - } - - /* Deferral is only relevant if idle exit is specified */ - if (((pcontrol & PROC_DIRTY_DEFER) || - (pcontrol & PROC_DIRTY_DEFER_ALWAYS)) && - !(pcontrol & PROC_DIRTY_ALLOWS_IDLE_EXIT)) { - return EINVAL; - } - - return 0; -} - -static void -memorystatus_update_idle_priority_locked(proc_t p) -{ - int32_t priority; - - MEMORYSTATUS_DEBUG(1, "memorystatus_update_idle_priority_locked(): pid %d dirty 0x%X\n", p->p_pid, p->p_memstat_dirty); - - assert(isSysProc(p)); - - if ((p->p_memstat_dirty & (P_DIRTY_IDLE_EXIT_ENABLED | P_DIRTY_IS_DIRTY)) == P_DIRTY_IDLE_EXIT_ENABLED) { - priority = (p->p_memstat_dirty & P_DIRTY_AGING_IN_PROGRESS) ? system_procs_aging_band : JETSAM_PRIORITY_IDLE; - } else { - priority = p->p_memstat_requestedpriority; - } - - if (priority != p->p_memstat_effectivepriority) { - if ((jetsam_aging_policy == kJetsamAgingPolicyLegacy) && - (priority == JETSAM_PRIORITY_IDLE)) { - /* - * This process is on its way into the IDLE band. The system is - * using 'legacy' jetsam aging policy. That means, this process - * has already used up its idle-deferral aging time that is given - * once per its lifetime. So we need to set the INACTIVE limits - * explicitly because it won't be going through the demotion paths - * that take care to apply the limits appropriately. - */ - - if (p->p_memstat_state & P_MEMSTAT_USE_ELEVATED_INACTIVE_BAND) { - /* - * This process has the 'elevated inactive jetsam band' attribute. - * So, there will be no trip to IDLE after all. - * Instead, we pin the process in the elevated band, - * where its ACTIVE limits will apply. - */ - - priority = JETSAM_PRIORITY_ELEVATED_INACTIVE; - } - - memorystatus_update_priority_locked(p, priority, false, true); - } else { - memorystatus_update_priority_locked(p, priority, false, false); - } - } -} - -/* - * Processes can opt to have their state tracked by the kernel, indicating when they are busy (dirty) or idle - * (clean). They may also indicate that they support termination when idle, with the result that they are promoted - * to their desired, higher, jetsam priority when dirty (and are therefore killed later), and demoted to the low - * priority idle band when clean (and killed earlier, protecting higher priority procesess). - * - * If the deferral flag is set, then newly tracked processes will be protected for an initial period (as determined by - * memorystatus_sysprocs_idle_delay_time); if they go clean during this time, then they will be moved to a deferred-idle band - * with a slightly higher priority, guarding against immediate termination under memory pressure and being unable to - * make forward progress. Finally, when the guard expires, they will be moved to the standard, lowest-priority, idle - * band. The deferral can be cleared early by clearing the appropriate flag. - * - * The deferral timer is active only for the duration that the process is marked as guarded and clean; if the process - * is marked dirty, the timer will be cancelled. Upon being subsequently marked clean, the deferment will either be - * re-enabled or the guard state cleared, depending on whether the guard deadline has passed. - */ - -int -memorystatus_dirty_track(proc_t p, uint32_t pcontrol) -{ - unsigned int old_dirty; - boolean_t reschedule = FALSE; - boolean_t already_deferred = FALSE; - boolean_t defer_now = FALSE; - int ret = 0; - - KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_DIRTY_TRACK), - p->p_pid, p->p_memstat_dirty, pcontrol, 0, 0); - - proc_list_lock(); - - if ((p->p_listflag & P_LIST_EXITED) != 0) { - /* - * Process is on its way out. - */ - ret = EBUSY; - goto exit; - } - - if (p->p_memstat_state & P_MEMSTAT_INTERNAL) { - ret = EPERM; - goto exit; - } - - if ((ret = memorystatus_validate_track_flags(p, pcontrol)) != 0) { - /* error */ - goto exit; - } - - old_dirty = p->p_memstat_dirty; - - /* These bits are cumulative, as per */ - if (pcontrol & PROC_DIRTY_TRACK) { - p->p_memstat_dirty |= P_DIRTY_TRACK; - } - - if (pcontrol & PROC_DIRTY_ALLOW_IDLE_EXIT) { - p->p_memstat_dirty |= P_DIRTY_ALLOW_IDLE_EXIT; - } - - if (pcontrol & PROC_DIRTY_LAUNCH_IN_PROGRESS) { - p->p_memstat_dirty |= P_DIRTY_LAUNCH_IN_PROGRESS; - } - - if (old_dirty & P_DIRTY_AGING_IN_PROGRESS) { - already_deferred = TRUE; - } - - - /* This can be set and cleared exactly once. */ - if (pcontrol & (PROC_DIRTY_DEFER | PROC_DIRTY_DEFER_ALWAYS)) { - if ((pcontrol & (PROC_DIRTY_DEFER)) && - !(old_dirty & P_DIRTY_DEFER)) { - p->p_memstat_dirty |= P_DIRTY_DEFER; - } - - if ((pcontrol & (PROC_DIRTY_DEFER_ALWAYS)) && - !(old_dirty & P_DIRTY_DEFER_ALWAYS)) { - p->p_memstat_dirty |= P_DIRTY_DEFER_ALWAYS; - } - - defer_now = TRUE; - } - - MEMORYSTATUS_DEBUG(1, "memorystatus_on_track_dirty(): set idle-exit %s / defer %s / dirty %s for pid %d\n", - ((p->p_memstat_dirty & P_DIRTY_IDLE_EXIT_ENABLED) == P_DIRTY_IDLE_EXIT_ENABLED) ? "Y" : "N", - defer_now ? "Y" : "N", - p->p_memstat_dirty & P_DIRTY ? "Y" : "N", - p->p_pid); - - /* Kick off or invalidate the idle exit deferment if there's a state transition. */ - if (!(p->p_memstat_dirty & P_DIRTY_IS_DIRTY)) { - if ((p->p_memstat_dirty & P_DIRTY_IDLE_EXIT_ENABLED) == P_DIRTY_IDLE_EXIT_ENABLED) { - if (defer_now && !already_deferred) { - /* - * Request to defer a clean process that's idle-exit enabled - * and not already in the jetsam deferred band. Most likely a - * new launch. - */ - memorystatus_schedule_idle_demotion_locked(p, TRUE); - reschedule = TRUE; - } else if (!defer_now) { - /* - * The process isn't asking for the 'aging' facility. - * Could be that it is: - */ - - if (already_deferred) { - /* - * already in the aging bands. Traditionally, - * some processes have tried to use this to - * opt out of the 'aging' facility. - */ - - memorystatus_invalidate_idle_demotion_locked(p, TRUE); - } else { - /* - * agnostic to the 'aging' facility. In that case, - * we'll go ahead and opt it in because this is likely - * a new launch (clean process, dirty tracking enabled) - */ - - memorystatus_schedule_idle_demotion_locked(p, TRUE); - } - - reschedule = TRUE; - } - } - } else { - /* - * We are trying to operate on a dirty process. Dirty processes have to - * be removed from the deferred band. The question is do we reset the - * deferred state or not? - * - * This could be a legal request like: - * - this process had opted into the 'aging' band - * - but it's now dirty and requests to opt out. - * In this case, we remove the process from the band and reset its - * state too. It'll opt back in properly when needed. - * - * OR, this request could be a user-space bug. E.g.: - * - this process had opted into the 'aging' band when clean - * - and, then issues another request to again put it into the band except - * this time the process is dirty. - * The process going dirty, as a transition in memorystatus_dirty_set(), will pull the process out of - * the deferred band with its state intact. So our request below is no-op. - * But we do it here anyways for coverage. - * - * memorystatus_update_idle_priority_locked() - * single-mindedly treats a dirty process as "cannot be in the aging band". - */ - - if (!defer_now && already_deferred) { - memorystatus_invalidate_idle_demotion_locked(p, TRUE); - reschedule = TRUE; - } else { - boolean_t reset_state = (jetsam_aging_policy != kJetsamAgingPolicyLegacy) ? TRUE : FALSE; - - memorystatus_invalidate_idle_demotion_locked(p, reset_state); - reschedule = TRUE; - } - } - - memorystatus_update_idle_priority_locked(p); - - if (reschedule) { - memorystatus_reschedule_idle_demotion_locked(); - } - - ret = 0; - -exit: - proc_list_unlock(); - - return ret; -} - -int -memorystatus_dirty_set(proc_t p, boolean_t self, uint32_t pcontrol) -{ - int ret; - boolean_t kill = false; - boolean_t reschedule = FALSE; - boolean_t was_dirty = FALSE; - boolean_t now_dirty = FALSE; - - MEMORYSTATUS_DEBUG(1, "memorystatus_dirty_set(): %d %d 0x%x 0x%x\n", self, p->p_pid, pcontrol, p->p_memstat_dirty); - KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_DIRTY_SET), p->p_pid, self, pcontrol, 0, 0); - - proc_list_lock(); - - if ((p->p_listflag & P_LIST_EXITED) != 0) { - /* - * Process is on its way out. - */ - ret = EBUSY; - goto exit; - } - - if (p->p_memstat_state & P_MEMSTAT_INTERNAL) { - ret = EPERM; - goto exit; - } - - if (p->p_memstat_dirty & P_DIRTY_IS_DIRTY) { - was_dirty = TRUE; - } - - if (!(p->p_memstat_dirty & P_DIRTY_TRACK)) { - /* Dirty tracking not enabled */ - ret = EINVAL; - } else if (pcontrol && (p->p_memstat_dirty & P_DIRTY_TERMINATED)) { - /* - * Process is set to be terminated and we're attempting to mark it dirty. - * Set for termination and marking as clean is OK - see . - */ - ret = EBUSY; - } else { - int flag = (self == TRUE) ? P_DIRTY : P_DIRTY_SHUTDOWN; - if (pcontrol && !(p->p_memstat_dirty & flag)) { - /* Mark the process as having been dirtied at some point */ - p->p_memstat_dirty |= (flag | P_DIRTY_MARKED); - memorystatus_dirty_count++; - ret = 0; - } else if ((pcontrol == 0) && (p->p_memstat_dirty & flag)) { - if ((flag == P_DIRTY_SHUTDOWN) && (!(p->p_memstat_dirty & P_DIRTY))) { - /* Clearing the dirty shutdown flag, and the process is otherwise clean - kill */ - p->p_memstat_dirty |= P_DIRTY_TERMINATED; - kill = true; - } else if ((flag == P_DIRTY) && (p->p_memstat_dirty & P_DIRTY_TERMINATED)) { - /* Kill previously terminated processes if set clean */ - kill = true; - } - p->p_memstat_dirty &= ~flag; - memorystatus_dirty_count--; - ret = 0; - } else { - /* Already set */ - ret = EALREADY; - } - } - - if (ret != 0) { - goto exit; - } - - if (p->p_memstat_dirty & P_DIRTY_IS_DIRTY) { - now_dirty = TRUE; - } - - if ((was_dirty == TRUE && now_dirty == FALSE) || - (was_dirty == FALSE && now_dirty == TRUE)) { - /* Manage idle exit deferral, if applied */ - if ((p->p_memstat_dirty & P_DIRTY_IDLE_EXIT_ENABLED) == P_DIRTY_IDLE_EXIT_ENABLED) { - /* - * Legacy mode: P_DIRTY_AGING_IN_PROGRESS means the process is in the aging band OR it might be heading back - * there once it's clean again. For the legacy case, this only applies if it has some protection window left. - * P_DIRTY_DEFER: one-time protection window given at launch - * P_DIRTY_DEFER_ALWAYS: protection window given for every dirty->clean transition. Like non-legacy mode. - * - * Non-Legacy mode: P_DIRTY_AGING_IN_PROGRESS means the process is in the aging band. It will always stop over - * in that band on it's way to IDLE. - */ - - if (p->p_memstat_dirty & P_DIRTY_IS_DIRTY) { - /* - * New dirty process i.e. "was_dirty == FALSE && now_dirty == TRUE" - * - * The process will move from its aging band to its higher requested - * jetsam band. - */ - boolean_t reset_state = (jetsam_aging_policy != kJetsamAgingPolicyLegacy) ? TRUE : FALSE; - - memorystatus_invalidate_idle_demotion_locked(p, reset_state); - reschedule = TRUE; - } else { - /* - * Process is back from "dirty" to "clean". - */ - - if (jetsam_aging_policy == kJetsamAgingPolicyLegacy) { - if (((p->p_memstat_dirty & P_DIRTY_DEFER_ALWAYS) == FALSE) && - (mach_absolute_time() >= p->p_memstat_idledeadline)) { - /* - * The process' hasn't enrolled in the "always defer after dirty" - * mode and its deadline has expired. It currently - * does not reside in any of the aging buckets. - * - * It's on its way to the JETSAM_PRIORITY_IDLE - * bucket via memorystatus_update_idle_priority_locked() - * below. - * - * So all we need to do is reset all the state on the - * process that's related to the aging bucket i.e. - * the AGING_IN_PROGRESS flag and the timer deadline. - */ - - memorystatus_invalidate_idle_demotion_locked(p, TRUE); - reschedule = TRUE; - } else { - /* - * Process enrolled in "always stop in deferral band after dirty" OR - * it still has some protection window left and so - * we just re-arm the timer without modifying any - * state on the process iff it still wants into that band. - */ - - if (p->p_memstat_dirty & P_DIRTY_DEFER_ALWAYS) { - memorystatus_schedule_idle_demotion_locked(p, TRUE); - reschedule = TRUE; - } else if (p->p_memstat_dirty & P_DIRTY_AGING_IN_PROGRESS) { - memorystatus_schedule_idle_demotion_locked(p, FALSE); - reschedule = TRUE; - } - } - } else { - memorystatus_schedule_idle_demotion_locked(p, TRUE); - reschedule = TRUE; - } - } - } - - memorystatus_update_idle_priority_locked(p); - - if (memorystatus_highwater_enabled) { - boolean_t ledger_update_needed = TRUE; - boolean_t use_active; - boolean_t is_fatal; - /* - * We are in this path because this process transitioned between - * dirty <--> clean state. Update the cached memory limits. - */ - - if (proc_jetsam_state_is_active_locked(p) == TRUE) { - /* - * process is pinned in elevated band - * or - * process is dirty - */ - CACHE_ACTIVE_LIMITS_LOCKED(p, is_fatal); - use_active = TRUE; - ledger_update_needed = TRUE; - } else { - /* - * process is clean...but if it has opted into pressured-exit - * we don't apply the INACTIVE limit till the process has aged - * out and is entering the IDLE band. - * See memorystatus_update_priority_locked() for that. - */ - - if (p->p_memstat_dirty & P_DIRTY_ALLOW_IDLE_EXIT) { - ledger_update_needed = FALSE; - } else { - CACHE_INACTIVE_LIMITS_LOCKED(p, is_fatal); - use_active = FALSE; - ledger_update_needed = TRUE; - } - } - - /* - * Enforce the new limits by writing to the ledger. - * - * This is a hot path and holding the proc_list_lock while writing to the ledgers, - * (where the task lock is taken) is bad. So, we temporarily drop the proc_list_lock. - * We aren't traversing the jetsam bucket list here, so we should be safe. - * See rdar://21394491. - */ - - if (ledger_update_needed && proc_ref_locked(p) == p) { - int ledger_limit; - if (p->p_memstat_memlimit > 0) { - ledger_limit = p->p_memstat_memlimit; - } else { - ledger_limit = -1; - } - proc_list_unlock(); - task_set_phys_footprint_limit_internal(p->task, ledger_limit, NULL, use_active, is_fatal); - proc_list_lock(); - proc_rele_locked(p); - - MEMORYSTATUS_DEBUG(3, "memorystatus_dirty_set: new limit on pid %d (%dMB %s) priority(%d) dirty?=0x%x %s\n", - p->p_pid, (p->p_memstat_memlimit > 0 ? p->p_memstat_memlimit : -1), - (p->p_memstat_state & P_MEMSTAT_FATAL_MEMLIMIT ? "F " : "NF"), p->p_memstat_effectivepriority, p->p_memstat_dirty, - (p->p_memstat_dirty ? ((p->p_memstat_dirty & P_DIRTY) ? "isdirty" : "isclean") : "")); - } - } - - /* If the deferral state changed, reschedule the demotion timer */ - if (reschedule) { - memorystatus_reschedule_idle_demotion_locked(); - } - } - - if (kill) { - if (proc_ref_locked(p) == p) { - proc_list_unlock(); - psignal(p, SIGKILL); - proc_list_lock(); - proc_rele_locked(p); - } - } - -exit: - proc_list_unlock(); - - return ret; -} - -int -memorystatus_dirty_clear(proc_t p, uint32_t pcontrol) -{ - int ret = 0; - - MEMORYSTATUS_DEBUG(1, "memorystatus_dirty_clear(): %d 0x%x 0x%x\n", p->p_pid, pcontrol, p->p_memstat_dirty); - - KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_DIRTY_CLEAR), p->p_pid, pcontrol, 0, 0, 0); - - proc_list_lock(); - - if ((p->p_listflag & P_LIST_EXITED) != 0) { - /* - * Process is on its way out. - */ - ret = EBUSY; - goto exit; - } - - if (p->p_memstat_state & P_MEMSTAT_INTERNAL) { - ret = EPERM; - goto exit; - } - - if (!(p->p_memstat_dirty & P_DIRTY_TRACK)) { - /* Dirty tracking not enabled */ - ret = EINVAL; - goto exit; - } - - if (!pcontrol || (pcontrol & (PROC_DIRTY_LAUNCH_IN_PROGRESS | PROC_DIRTY_DEFER | PROC_DIRTY_DEFER_ALWAYS)) == 0) { - ret = EINVAL; - goto exit; - } - - if (pcontrol & PROC_DIRTY_LAUNCH_IN_PROGRESS) { - p->p_memstat_dirty &= ~P_DIRTY_LAUNCH_IN_PROGRESS; - } - - /* This can be set and cleared exactly once. */ - if (pcontrol & (PROC_DIRTY_DEFER | PROC_DIRTY_DEFER_ALWAYS)) { - if (p->p_memstat_dirty & P_DIRTY_DEFER) { - p->p_memstat_dirty &= ~(P_DIRTY_DEFER); - } - - if (p->p_memstat_dirty & P_DIRTY_DEFER_ALWAYS) { - p->p_memstat_dirty &= ~(P_DIRTY_DEFER_ALWAYS); - } - - memorystatus_invalidate_idle_demotion_locked(p, TRUE); - memorystatus_update_idle_priority_locked(p); - memorystatus_reschedule_idle_demotion_locked(); - } - - ret = 0; -exit: - proc_list_unlock(); - - return ret; -} - -int -memorystatus_dirty_get(proc_t p) -{ - int ret = 0; - - proc_list_lock(); - - if (p->p_memstat_dirty & P_DIRTY_TRACK) { - ret |= PROC_DIRTY_TRACKED; - if (p->p_memstat_dirty & P_DIRTY_ALLOW_IDLE_EXIT) { - ret |= PROC_DIRTY_ALLOWS_IDLE_EXIT; - } - if (p->p_memstat_dirty & P_DIRTY) { - ret |= PROC_DIRTY_IS_DIRTY; - } - if (p->p_memstat_dirty & P_DIRTY_LAUNCH_IN_PROGRESS) { - ret |= PROC_DIRTY_LAUNCH_IS_IN_PROGRESS; - } - } - - proc_list_unlock(); - - return ret; -} - -int -memorystatus_on_terminate(proc_t p) -{ - int sig; - - proc_list_lock(); - - p->p_memstat_dirty |= P_DIRTY_TERMINATED; - - if ((p->p_memstat_dirty & (P_DIRTY_TRACK | P_DIRTY_IS_DIRTY)) == P_DIRTY_TRACK) { - /* Clean; mark as terminated and issue SIGKILL */ - sig = SIGKILL; - } else { - /* Dirty, terminated, or state tracking is unsupported; issue SIGTERM to allow cleanup */ - sig = SIGTERM; - } - - proc_list_unlock(); - - return sig; -} - -void -memorystatus_on_suspend(proc_t p) -{ -#if CONFIG_FREEZE - uint32_t pages; - memorystatus_get_task_page_counts(p->task, &pages, NULL, NULL); -#endif - proc_list_lock(); -#if CONFIG_FREEZE - memorystatus_suspended_count++; -#endif - p->p_memstat_state |= P_MEMSTAT_SUSPENDED; - proc_list_unlock(); -} - -void -memorystatus_on_resume(proc_t p) -{ -#if CONFIG_FREEZE - boolean_t frozen; - pid_t pid; -#endif - - proc_list_lock(); - -#if CONFIG_FREEZE - frozen = (p->p_memstat_state & P_MEMSTAT_FROZEN); - if (frozen) { - /* - * Now that we don't _thaw_ a process completely, - * resuming it (and having some on-demand swapins) - * shouldn't preclude it from being counted as frozen. - * - * memorystatus_frozen_count--; - * - * We preserve the P_MEMSTAT_FROZEN state since the process - * could have state on disk AND so will deserve some protection - * in the jetsam bands. - */ - if ((p->p_memstat_state & P_MEMSTAT_REFREEZE_ELIGIBLE) == 0) { - p->p_memstat_state |= P_MEMSTAT_REFREEZE_ELIGIBLE; - memorystatus_refreeze_eligible_count++; - } - p->p_memstat_thaw_count++; - - memorystatus_thaw_count++; - } - - memorystatus_suspended_count--; - - pid = p->p_pid; -#endif - - /* - * P_MEMSTAT_FROZEN will remain unchanged. This used to be: - * p->p_memstat_state &= ~(P_MEMSTAT_SUSPENDED | P_MEMSTAT_FROZEN); - */ - p->p_memstat_state &= ~P_MEMSTAT_SUSPENDED; - - proc_list_unlock(); - -#if CONFIG_FREEZE - if (frozen) { - memorystatus_freeze_entry_t data = { pid, FALSE, 0 }; - memorystatus_send_note(kMemorystatusFreezeNote, &data, sizeof(data)); - } -#endif -} - -void -memorystatus_on_inactivity(proc_t p) -{ -#pragma unused(p) -#if CONFIG_FREEZE - /* Wake the freeze thread */ - thread_wakeup((event_t)&memorystatus_freeze_wakeup); -#endif -} - -/* - * The proc_list_lock is held by the caller. - */ -static uint32_t -memorystatus_build_state(proc_t p) -{ - uint32_t snapshot_state = 0; - - /* General */ - if (p->p_memstat_state & P_MEMSTAT_SUSPENDED) { - snapshot_state |= kMemorystatusSuspended; - } - if (p->p_memstat_state & P_MEMSTAT_FROZEN) { - snapshot_state |= kMemorystatusFrozen; - } - if (p->p_memstat_state & P_MEMSTAT_REFREEZE_ELIGIBLE) { - snapshot_state |= kMemorystatusWasThawed; - } - - /* Tracking */ - if (p->p_memstat_dirty & P_DIRTY_TRACK) { - snapshot_state |= kMemorystatusTracked; - } - if ((p->p_memstat_dirty & P_DIRTY_IDLE_EXIT_ENABLED) == P_DIRTY_IDLE_EXIT_ENABLED) { - snapshot_state |= kMemorystatusSupportsIdleExit; - } - if (p->p_memstat_dirty & P_DIRTY_IS_DIRTY) { - snapshot_state |= kMemorystatusDirty; - } - - return snapshot_state; -} - -static boolean_t -kill_idle_exit_proc(void) -{ - proc_t p, victim_p = PROC_NULL; - uint64_t current_time; - boolean_t killed = FALSE; - unsigned int i = 0; - os_reason_t jetsam_reason = OS_REASON_NULL; - - /* Pick next idle exit victim. */ - current_time = mach_absolute_time(); - - jetsam_reason = os_reason_create(OS_REASON_JETSAM, JETSAM_REASON_MEMORY_IDLE_EXIT); - if (jetsam_reason == OS_REASON_NULL) { - printf("kill_idle_exit_proc: failed to allocate jetsam reason\n"); - } - - proc_list_lock(); - - p = memorystatus_get_first_proc_locked(&i, FALSE); - while (p) { - /* No need to look beyond the idle band */ - if (p->p_memstat_effectivepriority != JETSAM_PRIORITY_IDLE) { - break; - } - - if ((p->p_memstat_dirty & (P_DIRTY_ALLOW_IDLE_EXIT | P_DIRTY_IS_DIRTY | P_DIRTY_TERMINATED)) == (P_DIRTY_ALLOW_IDLE_EXIT)) { - if (current_time >= p->p_memstat_idledeadline) { - p->p_memstat_dirty |= P_DIRTY_TERMINATED; - victim_p = proc_ref_locked(p); - break; - } - } - - p = memorystatus_get_next_proc_locked(&i, p, FALSE); - } - - proc_list_unlock(); - - if (victim_p) { - printf("memorystatus: killing_idle_process pid %d [%s]\n", victim_p->p_pid, (*victim_p->p_name ? victim_p->p_name : "unknown")); - killed = memorystatus_do_kill(victim_p, kMemorystatusKilledIdleExit, jetsam_reason); - proc_rele(victim_p); - } else { - os_reason_free(jetsam_reason); - } - - return killed; -} - -static void -memorystatus_thread_wake(void) -{ - int thr_id = 0; - int active_thr = atomic_load(&active_jetsam_threads); - - /* Wakeup all the jetsam threads */ - for (thr_id = 0; thr_id < active_thr; thr_id++) { - thread_wakeup((event_t)&jetsam_threads[thr_id].memorystatus_wakeup); - } -} - -#if CONFIG_JETSAM - -static void -memorystatus_thread_pool_max() -{ - /* Increase the jetsam thread pool to max_jetsam_threads */ - int max_threads = max_jetsam_threads; - printf("Expanding memorystatus pool to %d!\n", max_threads); - atomic_store(&active_jetsam_threads, max_threads); -} - -static void -memorystatus_thread_pool_default() -{ - /* Restore the jetsam thread pool to a single thread */ - printf("Reverting memorystatus pool back to 1\n"); - atomic_store(&active_jetsam_threads, 1); -} - -#endif /* CONFIG_JETSAM */ - -extern void vm_pressure_response(void); - -static int -memorystatus_thread_block(uint32_t interval_ms, thread_continue_t continuation) -{ - struct jetsam_thread_state *jetsam_thread = jetsam_current_thread(); - - if (interval_ms) { - assert_wait_timeout(&jetsam_thread->memorystatus_wakeup, THREAD_UNINT, interval_ms, NSEC_PER_MSEC); - } else { - assert_wait(&jetsam_thread->memorystatus_wakeup, THREAD_UNINT); - } - - return thread_block(continuation); -} - -static boolean_t -memorystatus_avail_pages_below_pressure(void) -{ -#if CONFIG_EMBEDDED -/* - * Instead of CONFIG_EMBEDDED for these *avail_pages* routines, we should - * key off of the system having dynamic swap support. With full swap support, - * the system shouldn't really need to worry about various page thresholds. - */ - return memorystatus_available_pages <= memorystatus_available_pages_pressure; -#else /* CONFIG_EMBEDDED */ - return FALSE; -#endif /* CONFIG_EMBEDDED */ -} - -static boolean_t -memorystatus_avail_pages_below_critical(void) -{ -#if CONFIG_EMBEDDED - return memorystatus_available_pages <= memorystatus_available_pages_critical; -#else /* CONFIG_EMBEDDED */ - return FALSE; -#endif /* CONFIG_EMBEDDED */ -} - -static boolean_t -memorystatus_post_snapshot(int32_t priority, uint32_t cause) -{ -#if CONFIG_EMBEDDED -#pragma unused(cause) - /* - * Don't generate logs for steady-state idle-exit kills, - * unless it is overridden for debug or by the device - * tree. - */ - - return (priority != JETSAM_PRIORITY_IDLE) || memorystatus_idle_snapshot; - -#else /* CONFIG_EMBEDDED */ - /* - * Don't generate logs for steady-state idle-exit kills, - * unless - * - it is overridden for debug or by the device - * tree. - * OR - * - the kill causes are important i.e. not kMemorystatusKilledIdleExit - */ - - boolean_t snapshot_eligible_kill_cause = (is_reason_thrashing(cause) || is_reason_zone_map_exhaustion(cause)); - return (priority != JETSAM_PRIORITY_IDLE) || memorystatus_idle_snapshot || snapshot_eligible_kill_cause; -#endif /* CONFIG_EMBEDDED */ -} - -static boolean_t -memorystatus_action_needed(void) -{ -#if CONFIG_EMBEDDED - return is_reason_thrashing(kill_under_pressure_cause) || - is_reason_zone_map_exhaustion(kill_under_pressure_cause) || - memorystatus_available_pages <= memorystatus_available_pages_pressure; -#else /* CONFIG_EMBEDDED */ - return is_reason_thrashing(kill_under_pressure_cause) || - is_reason_zone_map_exhaustion(kill_under_pressure_cause); -#endif /* CONFIG_EMBEDDED */ -} - -#if CONFIG_FREEZE -extern void vm_swap_consider_defragmenting(int); - -/* - * This routine will _jetsam_ all frozen processes - * and reclaim the swap space immediately. - * - * So freeze has to be DISABLED when we call this routine. - */ - -void -memorystatus_disable_freeze(void) -{ - memstat_bucket_t *bucket; - int bucket_count = 0, retries = 0; - boolean_t retval = FALSE, killed = FALSE; - uint32_t errors = 0, errors_over_prev_iteration = 0; - os_reason_t jetsam_reason = 0; - unsigned int band = 0; - proc_t p = PROC_NULL, next_p = PROC_NULL; - - assert(memorystatus_freeze_enabled == FALSE); - - jetsam_reason = os_reason_create(OS_REASON_JETSAM, JETSAM_REASON_MEMORY_DISK_SPACE_SHORTAGE); - if (jetsam_reason == OS_REASON_NULL) { - printf("memorystatus_disable_freeze: failed to allocate jetsam reason\n"); - } - - /* - * Let's relocate all frozen processes into band 8. Demoted frozen processes - * are sitting in band 0 currently and it's possible to have a frozen process - * in the FG band being actively used. We don't reset its frozen state when - * it is resumed because it has state on disk. - * - * We choose to do this relocation rather than implement a new 'kill frozen' - * process function for these reasons: - * - duplication of code: too many kill functions exist and we need to rework them better. - * - disk-space-shortage kills are rare - * - not having the 'real' jetsam band at time of the this frozen kill won't preclude us - * from answering any imp. questions re. jetsam policy/effectiveness. - * - * This is essentially what memorystatus_update_inactive_jetsam_priority_band() does while - * avoiding the application of memory limits. - */ - -again: - proc_list_lock(); - - band = JETSAM_PRIORITY_IDLE; - p = PROC_NULL; - next_p = PROC_NULL; - - next_p = memorystatus_get_first_proc_locked(&band, TRUE); - while (next_p) { - p = next_p; - next_p = memorystatus_get_next_proc_locked(&band, p, TRUE); - - if (p->p_memstat_effectivepriority > JETSAM_PRIORITY_FOREGROUND) { - break; - } - - if ((p->p_memstat_state & P_MEMSTAT_FROZEN) == FALSE) { - continue; - } - - if (p->p_memstat_state & P_MEMSTAT_ERROR) { - p->p_memstat_state &= ~P_MEMSTAT_ERROR; - } - - if (p->p_memstat_effectivepriority == memorystatus_freeze_jetsam_band) { - continue; - } - - /* - * We explicitly add this flag here so the process looks like a normal - * frozen process i.e. P_MEMSTAT_FROZEN and P_MEMSTAT_USE_ELEVATED_INACTIVE_BAND. - * We don't bother with assigning the 'active' memory - * limits at this point because we are going to be killing it soon below. - */ - p->p_memstat_state |= P_MEMSTAT_USE_ELEVATED_INACTIVE_BAND; - memorystatus_invalidate_idle_demotion_locked(p, TRUE); - - memorystatus_update_priority_locked(p, memorystatus_freeze_jetsam_band, FALSE, TRUE); - } - - bucket = &memstat_bucket[memorystatus_freeze_jetsam_band]; - bucket_count = bucket->count; - proc_list_unlock(); - - /* - * Bucket count is already stale at this point. But, we don't expect - * freezing to continue since we have already disabled the freeze functionality. - * However, an existing freeze might be in progress. So we might miss that process - * in the first go-around. We hope to catch it in the next. - */ - - errors_over_prev_iteration = 0; - while (bucket_count) { - bucket_count--; - - /* - * memorystatus_kill_elevated_process() drops a reference, - * so take another one so we can continue to use this exit reason - * even after it returns. - */ - - os_reason_ref(jetsam_reason); - retval = memorystatus_kill_elevated_process( - kMemorystatusKilledDiskSpaceShortage, - jetsam_reason, - memorystatus_freeze_jetsam_band, - 0, /* the iteration of aggressive jetsam..ignored here */ - &errors); - - if (errors > 0) { - printf("memorystatus_disable_freeze: memorystatus_kill_elevated_process returned %d error(s)\n", errors); - errors_over_prev_iteration += errors; - errors = 0; - } - - if (retval == 0) { - /* - * No frozen processes left to kill. - */ - break; - } - - killed = TRUE; - } - - proc_list_lock(); - - if (memorystatus_frozen_count) { - /* - * A frozen process snuck in and so - * go back around to kill it. That - * process may have been resumed and - * put into the FG band too. So we - * have to do the relocation again. - */ - assert(memorystatus_freeze_enabled == FALSE); - - retries++; - if (retries < 3) { - proc_list_unlock(); - goto again; - } -#if DEVELOPMENT || DEBUG - panic("memorystatus_disable_freeze: Failed to kill all frozen processes, memorystatus_frozen_count = %d, errors = %d", - memorystatus_frozen_count, errors_over_prev_iteration); -#endif /* DEVELOPMENT || DEBUG */ - } - proc_list_unlock(); - - os_reason_free(jetsam_reason); - - if (killed) { - vm_swap_consider_defragmenting(VM_SWAP_FLAGS_FORCE_DEFRAG | VM_SWAP_FLAGS_FORCE_RECLAIM); - - proc_list_lock(); - size_t snapshot_size = sizeof(memorystatus_jetsam_snapshot_t) + - sizeof(memorystatus_jetsam_snapshot_entry_t) * (memorystatus_jetsam_snapshot_count); - uint64_t timestamp_now = mach_absolute_time(); - memorystatus_jetsam_snapshot->notification_time = timestamp_now; - memorystatus_jetsam_snapshot->js_gencount++; - if (memorystatus_jetsam_snapshot_count > 0 && (memorystatus_jetsam_snapshot_last_timestamp == 0 || - timestamp_now > memorystatus_jetsam_snapshot_last_timestamp + memorystatus_jetsam_snapshot_timeout)) { - proc_list_unlock(); - int ret = memorystatus_send_note(kMemorystatusSnapshotNote, &snapshot_size, sizeof(snapshot_size)); - if (!ret) { - proc_list_lock(); - memorystatus_jetsam_snapshot_last_timestamp = timestamp_now; - proc_list_unlock(); - } - } else { - proc_list_unlock(); - } - } - - return; -} -#endif /* CONFIG_FREEZE */ - -static boolean_t -memorystatus_act_on_hiwat_processes(uint32_t *errors, uint32_t *hwm_kill, boolean_t *post_snapshot, __unused boolean_t *is_critical) -{ - boolean_t purged = FALSE; - boolean_t killed = memorystatus_kill_hiwat_proc(errors, &purged); - - if (killed) { - *hwm_kill = *hwm_kill + 1; - *post_snapshot = TRUE; - return TRUE; - } else { - if (purged == FALSE) { - /* couldn't purge and couldn't kill */ - memorystatus_hwm_candidates = FALSE; - } - } - -#if CONFIG_JETSAM - /* No highwater processes to kill. Continue or stop for now? */ - if (!is_reason_thrashing(kill_under_pressure_cause) && - !is_reason_zone_map_exhaustion(kill_under_pressure_cause) && - (memorystatus_available_pages > memorystatus_available_pages_critical)) { - /* - * We are _not_ out of pressure but we are above the critical threshold and there's: - * - no compressor thrashing - * - enough zone memory - * - no more HWM processes left. - * For now, don't kill any other processes. - */ - - if (*hwm_kill == 0) { - memorystatus_thread_wasted_wakeup++; - } - - *is_critical = FALSE; - - return TRUE; - } -#endif /* CONFIG_JETSAM */ - - return FALSE; -} - -static boolean_t -memorystatus_act_aggressive(uint32_t cause, os_reason_t jetsam_reason, int *jld_idle_kills, boolean_t *corpse_list_purged, boolean_t *post_snapshot) -{ - if (memorystatus_jld_enabled == TRUE) { - boolean_t killed; - uint32_t errors = 0; - - /* Jetsam Loop Detection - locals */ - memstat_bucket_t *bucket; - int jld_bucket_count = 0; - struct timeval jld_now_tstamp = {0, 0}; - uint64_t jld_now_msecs = 0; - int elevated_bucket_count = 0; - - /* Jetsam Loop Detection - statics */ - static uint64_t jld_timestamp_msecs = 0; - static int jld_idle_kill_candidates = 0; /* Number of available processes in band 0,1 at start */ - static int jld_eval_aggressive_count = 0; /* Bumps the max priority in aggressive loop */ - static int32_t jld_priority_band_max = JETSAM_PRIORITY_UI_SUPPORT; - /* - * Jetsam Loop Detection: attempt to detect - * rapid daemon relaunches in the lower bands. - */ - - microuptime(&jld_now_tstamp); - - /* - * Ignore usecs in this calculation. - * msecs granularity is close enough. - */ - jld_now_msecs = (jld_now_tstamp.tv_sec * 1000); - - proc_list_lock(); - switch (jetsam_aging_policy) { - case kJetsamAgingPolicyLegacy: - bucket = &memstat_bucket[JETSAM_PRIORITY_IDLE]; - jld_bucket_count = bucket->count; - bucket = &memstat_bucket[JETSAM_PRIORITY_AGING_BAND1]; - jld_bucket_count += bucket->count; - break; - case kJetsamAgingPolicySysProcsReclaimedFirst: - case kJetsamAgingPolicyAppsReclaimedFirst: - bucket = &memstat_bucket[JETSAM_PRIORITY_IDLE]; - jld_bucket_count = bucket->count; - bucket = &memstat_bucket[system_procs_aging_band]; - jld_bucket_count += bucket->count; - bucket = &memstat_bucket[applications_aging_band]; - jld_bucket_count += bucket->count; - break; - case kJetsamAgingPolicyNone: - default: - bucket = &memstat_bucket[JETSAM_PRIORITY_IDLE]; - jld_bucket_count = bucket->count; - break; - } - - bucket = &memstat_bucket[JETSAM_PRIORITY_ELEVATED_INACTIVE]; - elevated_bucket_count = bucket->count; - - proc_list_unlock(); - - /* - * memorystatus_jld_eval_period_msecs is a tunable - * memorystatus_jld_eval_aggressive_count is a tunable - * memorystatus_jld_eval_aggressive_priority_band_max is a tunable - */ - if ((jld_bucket_count == 0) || - (jld_now_msecs > (jld_timestamp_msecs + memorystatus_jld_eval_period_msecs))) { - /* - * Refresh evaluation parameters - */ - jld_timestamp_msecs = jld_now_msecs; - jld_idle_kill_candidates = jld_bucket_count; - *jld_idle_kills = 0; - jld_eval_aggressive_count = 0; - jld_priority_band_max = JETSAM_PRIORITY_UI_SUPPORT; - } - - if (*jld_idle_kills > jld_idle_kill_candidates) { - jld_eval_aggressive_count++; - -#if DEVELOPMENT || DEBUG - printf("memorystatus: aggressive%d: beginning of window: %lld ms, : timestamp now: %lld ms\n", - jld_eval_aggressive_count, - jld_timestamp_msecs, - jld_now_msecs); - printf("memorystatus: aggressive%d: idle candidates: %d, idle kills: %d\n", - jld_eval_aggressive_count, - jld_idle_kill_candidates, - *jld_idle_kills); -#endif /* DEVELOPMENT || DEBUG */ - - if ((jld_eval_aggressive_count == memorystatus_jld_eval_aggressive_count) && - (total_corpses_count() > 0) && (*corpse_list_purged == FALSE)) { - /* - * If we reach this aggressive cycle, corpses might be causing memory pressure. - * So, in an effort to avoid jetsams in the FG band, we will attempt to purge - * corpse memory prior to this final march through JETSAM_PRIORITY_UI_SUPPORT. - */ - task_purge_all_corpses(); - *corpse_list_purged = TRUE; - } else if (jld_eval_aggressive_count > memorystatus_jld_eval_aggressive_count) { - /* - * Bump up the jetsam priority limit (eg: the bucket index) - * Enforce bucket index sanity. - */ - if ((memorystatus_jld_eval_aggressive_priority_band_max < 0) || - (memorystatus_jld_eval_aggressive_priority_band_max >= MEMSTAT_BUCKET_COUNT)) { - /* - * Do nothing. Stick with the default level. - */ - } else { - jld_priority_band_max = memorystatus_jld_eval_aggressive_priority_band_max; - } - } - - /* Visit elevated processes first */ - while (elevated_bucket_count) { - elevated_bucket_count--; - - /* - * memorystatus_kill_elevated_process() drops a reference, - * so take another one so we can continue to use this exit reason - * even after it returns. - */ - - os_reason_ref(jetsam_reason); - killed = memorystatus_kill_elevated_process( - cause, - jetsam_reason, - JETSAM_PRIORITY_ELEVATED_INACTIVE, - jld_eval_aggressive_count, - &errors); - - if (killed) { - *post_snapshot = TRUE; - if (memorystatus_avail_pages_below_pressure()) { - /* - * Still under pressure. - * Find another pinned processes. - */ - continue; - } else { - return TRUE; - } - } else { - /* - * No pinned processes left to kill. - * Abandon elevated band. - */ - break; - } - } - - /* - * memorystatus_kill_top_process_aggressive() allocates its own - * jetsam_reason so the kMemorystatusKilledProcThrashing cause - * is consistent throughout the aggressive march. - */ - killed = memorystatus_kill_top_process_aggressive( - kMemorystatusKilledProcThrashing, - jld_eval_aggressive_count, - jld_priority_band_max, - &errors); - - if (killed) { - /* Always generate logs after aggressive kill */ - *post_snapshot = TRUE; - *jld_idle_kills = 0; - return TRUE; - } - } - - return FALSE; - } - - return FALSE; -} - - -static void -memorystatus_thread(void *param __unused, wait_result_t wr __unused) -{ - boolean_t post_snapshot = FALSE; - uint32_t errors = 0; - uint32_t hwm_kill = 0; - boolean_t sort_flag = TRUE; - boolean_t corpse_list_purged = FALSE; - int jld_idle_kills = 0; - struct jetsam_thread_state *jetsam_thread = jetsam_current_thread(); - - if (jetsam_thread->inited == FALSE) { - /* - * It's the first time the thread has run, so just mark the thread as privileged and block. - * This avoids a spurious pass with unset variables, as set out in . - */ - - char name[32]; - thread_wire(host_priv_self(), current_thread(), TRUE); - snprintf(name, 32, "VM_memorystatus_%d", jetsam_thread->index + 1); - - if (jetsam_thread->index == 0) { - if (vm_pageout_state.vm_restricted_to_single_processor == TRUE) { - thread_vm_bind_group_add(); - } - } - thread_set_thread_name(current_thread(), name); - jetsam_thread->inited = TRUE; - memorystatus_thread_block(0, memorystatus_thread); - } - - KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_SCAN) | DBG_FUNC_START, - memorystatus_available_pages, memorystatus_jld_enabled, memorystatus_jld_eval_period_msecs, memorystatus_jld_eval_aggressive_count, 0); - - /* - * Jetsam aware version. - * - * The VM pressure notification thread is working it's way through clients in parallel. - * - * So, while the pressure notification thread is targeting processes in order of - * increasing jetsam priority, we can hopefully reduce / stop it's work by killing - * any processes that have exceeded their highwater mark. - * - * If we run out of HWM processes and our available pages drops below the critical threshold, then, - * we target the least recently used process in order of increasing jetsam priority (exception: the FG band). - */ - while (memorystatus_action_needed()) { - boolean_t killed; - int32_t priority; - uint32_t cause; - uint64_t jetsam_reason_code = JETSAM_REASON_INVALID; - os_reason_t jetsam_reason = OS_REASON_NULL; - - cause = kill_under_pressure_cause; - switch (cause) { - case kMemorystatusKilledFCThrashing: - jetsam_reason_code = JETSAM_REASON_MEMORY_FCTHRASHING; - break; - case kMemorystatusKilledVMCompressorThrashing: - jetsam_reason_code = JETSAM_REASON_MEMORY_VMCOMPRESSOR_THRASHING; - break; - case kMemorystatusKilledVMCompressorSpaceShortage: - jetsam_reason_code = JETSAM_REASON_MEMORY_VMCOMPRESSOR_SPACE_SHORTAGE; - break; - case kMemorystatusKilledZoneMapExhaustion: - jetsam_reason_code = JETSAM_REASON_ZONE_MAP_EXHAUSTION; - break; - case kMemorystatusKilledVMPageShortage: - /* falls through */ - default: - jetsam_reason_code = JETSAM_REASON_MEMORY_VMPAGESHORTAGE; - cause = kMemorystatusKilledVMPageShortage; - break; - } - - /* Highwater */ - boolean_t is_critical = TRUE; - if (memorystatus_act_on_hiwat_processes(&errors, &hwm_kill, &post_snapshot, &is_critical)) { - if (is_critical == FALSE) { - /* - * For now, don't kill any other processes. - */ - break; - } else { - goto done; - } - } - - jetsam_reason = os_reason_create(OS_REASON_JETSAM, jetsam_reason_code); - if (jetsam_reason == OS_REASON_NULL) { - printf("memorystatus_thread: failed to allocate jetsam reason\n"); - } - - if (memorystatus_act_aggressive(cause, jetsam_reason, &jld_idle_kills, &corpse_list_purged, &post_snapshot)) { - goto done; - } - - /* - * memorystatus_kill_top_process() drops a reference, - * so take another one so we can continue to use this exit reason - * even after it returns - */ - os_reason_ref(jetsam_reason); - - /* LRU */ - killed = memorystatus_kill_top_process(TRUE, sort_flag, cause, jetsam_reason, &priority, &errors); - sort_flag = FALSE; - - if (killed) { - if (memorystatus_post_snapshot(priority, cause) == TRUE) { - post_snapshot = TRUE; - } - - /* Jetsam Loop Detection */ - if (memorystatus_jld_enabled == TRUE) { - if ((priority == JETSAM_PRIORITY_IDLE) || (priority == system_procs_aging_band) || (priority == applications_aging_band)) { - jld_idle_kills++; - } else { - /* - * We've reached into bands beyond idle deferred. - * We make no attempt to monitor them - */ - } - } - - if ((priority >= JETSAM_PRIORITY_UI_SUPPORT) && (total_corpses_count() > 0) && (corpse_list_purged == FALSE)) { - /* - * If we have jetsammed a process in or above JETSAM_PRIORITY_UI_SUPPORT - * then we attempt to relieve pressure by purging corpse memory. - */ - task_purge_all_corpses(); - corpse_list_purged = TRUE; - } - goto done; - } - - if (memorystatus_avail_pages_below_critical()) { - /* - * Still under pressure and unable to kill a process - purge corpse memory - */ - if (total_corpses_count() > 0) { - task_purge_all_corpses(); - corpse_list_purged = TRUE; - } - - if (memorystatus_avail_pages_below_critical()) { - /* - * Still under pressure and unable to kill a process - panic - */ - panic("memorystatus_jetsam_thread: no victim! available pages:%llu\n", (uint64_t)memorystatus_available_pages); - } - } - -done: - - /* - * We do not want to over-kill when thrashing has been detected. - * To avoid that, we reset the flag here and notify the - * compressor. - */ - if (is_reason_thrashing(kill_under_pressure_cause)) { - kill_under_pressure_cause = 0; -#if CONFIG_JETSAM - vm_thrashing_jetsam_done(); -#endif /* CONFIG_JETSAM */ - } else if (is_reason_zone_map_exhaustion(kill_under_pressure_cause)) { - kill_under_pressure_cause = 0; - } - - os_reason_free(jetsam_reason); - } - - kill_under_pressure_cause = 0; - - if (errors) { - memorystatus_clear_errors(); - } - - if (post_snapshot) { - proc_list_lock(); - size_t snapshot_size = sizeof(memorystatus_jetsam_snapshot_t) + - sizeof(memorystatus_jetsam_snapshot_entry_t) * (memorystatus_jetsam_snapshot_count); - uint64_t timestamp_now = mach_absolute_time(); - memorystatus_jetsam_snapshot->notification_time = timestamp_now; - memorystatus_jetsam_snapshot->js_gencount++; - if (memorystatus_jetsam_snapshot_count > 0 && (memorystatus_jetsam_snapshot_last_timestamp == 0 || - timestamp_now > memorystatus_jetsam_snapshot_last_timestamp + memorystatus_jetsam_snapshot_timeout)) { - proc_list_unlock(); - int ret = memorystatus_send_note(kMemorystatusSnapshotNote, &snapshot_size, sizeof(snapshot_size)); - if (!ret) { - proc_list_lock(); - memorystatus_jetsam_snapshot_last_timestamp = timestamp_now; - proc_list_unlock(); - } - } else { - proc_list_unlock(); - } - } - - KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_SCAN) | DBG_FUNC_END, - memorystatus_available_pages, 0, 0, 0, 0); - - memorystatus_thread_block(0, memorystatus_thread); -} - -/* - * Returns TRUE: - * when an idle-exitable proc was killed - * Returns FALSE: - * when there are no more idle-exitable procs found - * when the attempt to kill an idle-exitable proc failed - */ -boolean_t -memorystatus_idle_exit_from_VM(void) -{ - /* - * This routine should no longer be needed since we are - * now using jetsam bands on all platforms and so will deal - * with IDLE processes within the memorystatus thread itself. - * - * But we still use it because we observed that macos systems - * started heavy compression/swapping with a bunch of - * idle-exitable processes alive and doing nothing. We decided - * to rather kill those processes than start swapping earlier. - */ - - return kill_idle_exit_proc(); -} - -/* - * Callback invoked when allowable physical memory footprint exceeded - * (dirty pages + IOKit mappings) - * - * This is invoked for both advisory, non-fatal per-task high watermarks, - * as well as the fatal task memory limits. - */ -void -memorystatus_on_ledger_footprint_exceeded(boolean_t warning, boolean_t memlimit_is_active, boolean_t memlimit_is_fatal) -{ - os_reason_t jetsam_reason = OS_REASON_NULL; - - proc_t p = current_proc(); - -#if VM_PRESSURE_EVENTS - if (warning == TRUE) { - /* - * This is a warning path which implies that the current process is close, but has - * not yet exceeded its per-process memory limit. - */ - if (memorystatus_warn_process(p->p_pid, memlimit_is_active, memlimit_is_fatal, FALSE /* not exceeded */) != TRUE) { - /* Print warning, since it's possible that task has not registered for pressure notifications */ - os_log(OS_LOG_DEFAULT, "memorystatus_on_ledger_footprint_exceeded: failed to warn the current task (%d exiting, or no handler registered?).\n", p->p_pid); - } - return; - } -#endif /* VM_PRESSURE_EVENTS */ - - if (memlimit_is_fatal) { - /* - * If this process has no high watermark or has a fatal task limit, then we have been invoked because the task - * has violated either the system-wide per-task memory limit OR its own task limit. - */ - jetsam_reason = os_reason_create(OS_REASON_JETSAM, JETSAM_REASON_MEMORY_PERPROCESSLIMIT); - if (jetsam_reason == NULL) { - printf("task_exceeded footprint: failed to allocate jetsam reason\n"); - } else if (corpse_for_fatal_memkill != 0 && proc_send_synchronous_EXC_RESOURCE(p) == FALSE) { - /* Set OS_REASON_FLAG_GENERATE_CRASH_REPORT to generate corpse */ - jetsam_reason->osr_flags |= OS_REASON_FLAG_GENERATE_CRASH_REPORT; - } - - if (memorystatus_kill_process_sync(p->p_pid, kMemorystatusKilledPerProcessLimit, jetsam_reason) != TRUE) { - printf("task_exceeded_footprint: failed to kill the current task (exiting?).\n"); - } - } else { - /* - * HWM offender exists. Done without locks or synchronization. - * See comment near its declaration for more details. - */ - memorystatus_hwm_candidates = TRUE; - -#if VM_PRESSURE_EVENTS - /* - * The current process is not in the warning path. - * This path implies the current process has exceeded a non-fatal (soft) memory limit. - * Failure to send note is ignored here. - */ - (void)memorystatus_warn_process(p->p_pid, memlimit_is_active, memlimit_is_fatal, TRUE /* exceeded */); - -#endif /* VM_PRESSURE_EVENTS */ - } -} - -void -memorystatus_log_exception(const int max_footprint_mb, boolean_t memlimit_is_active, boolean_t memlimit_is_fatal) -{ - proc_t p = current_proc(); - - /* - * The limit violation is logged here, but only once per process per limit. - * Soft memory limit is a non-fatal high-water-mark - * Hard memory limit is a fatal custom-task-limit or system-wide per-task memory limit. - */ - - os_log_with_startup_serial(OS_LOG_DEFAULT, "EXC_RESOURCE -> %s[%d] exceeded mem limit: %s%s %d MB (%s)\n", - (*p->p_name ? p->p_name : "unknown"), p->p_pid, (memlimit_is_active ? "Active" : "Inactive"), - (memlimit_is_fatal ? "Hard" : "Soft"), max_footprint_mb, - (memlimit_is_fatal ? "fatal" : "non-fatal")); - - return; -} - - -/* - * Description: - * Evaluates process state to determine which limit - * should be applied (active vs. inactive limit). - * - * Processes that have the 'elevated inactive jetsam band' attribute - * are first evaluated based on their current priority band. - * presently elevated ==> active - * - * Processes that opt into dirty tracking are evaluated - * based on clean vs dirty state. - * dirty ==> active - * clean ==> inactive - * - * Process that do not opt into dirty tracking are - * evalulated based on priority level. - * Foreground or above ==> active - * Below Foreground ==> inactive - * - * Return: TRUE if active - * False if inactive - */ - -static boolean_t -proc_jetsam_state_is_active_locked(proc_t p) -{ - if ((p->p_memstat_state & P_MEMSTAT_USE_ELEVATED_INACTIVE_BAND) && - (p->p_memstat_effectivepriority == JETSAM_PRIORITY_ELEVATED_INACTIVE)) { - /* - * process has the 'elevated inactive jetsam band' attribute - * and process is present in the elevated band - * implies active state - */ - return TRUE; - } else if (p->p_memstat_dirty & P_DIRTY_TRACK) { - /* - * process has opted into dirty tracking - * active state is based on dirty vs. clean - */ - if (p->p_memstat_dirty & P_DIRTY_IS_DIRTY) { - /* - * process is dirty - * implies active state - */ - return TRUE; - } else { - /* - * process is clean - * implies inactive state - */ - return FALSE; - } - } else if (p->p_memstat_effectivepriority >= JETSAM_PRIORITY_FOREGROUND) { - /* - * process is Foreground or higher - * implies active state - */ - return TRUE; - } else { - /* - * process found below Foreground - * implies inactive state - */ - return FALSE; - } -} - -static boolean_t -memorystatus_kill_process_sync(pid_t victim_pid, uint32_t cause, os_reason_t jetsam_reason) -{ - boolean_t res; - - uint32_t errors = 0; - - if (victim_pid == -1) { - /* No pid, so kill first process */ - res = memorystatus_kill_top_process(TRUE, TRUE, cause, jetsam_reason, NULL, &errors); - } else { - res = memorystatus_kill_specific_process(victim_pid, cause, jetsam_reason); - } - - if (errors) { - memorystatus_clear_errors(); - } - - if (res == TRUE) { - /* Fire off snapshot notification */ - proc_list_lock(); - size_t snapshot_size = sizeof(memorystatus_jetsam_snapshot_t) + - sizeof(memorystatus_jetsam_snapshot_entry_t) * memorystatus_jetsam_snapshot_count; - uint64_t timestamp_now = mach_absolute_time(); - memorystatus_jetsam_snapshot->notification_time = timestamp_now; - if (memorystatus_jetsam_snapshot_count > 0 && (memorystatus_jetsam_snapshot_last_timestamp == 0 || - timestamp_now > memorystatus_jetsam_snapshot_last_timestamp + memorystatus_jetsam_snapshot_timeout)) { - proc_list_unlock(); - int ret = memorystatus_send_note(kMemorystatusSnapshotNote, &snapshot_size, sizeof(snapshot_size)); - if (!ret) { - proc_list_lock(); - memorystatus_jetsam_snapshot_last_timestamp = timestamp_now; - proc_list_unlock(); - } - } else { - proc_list_unlock(); - } - } - - return res; -} - -/* - * Jetsam a specific process. - */ -static boolean_t -memorystatus_kill_specific_process(pid_t victim_pid, uint32_t cause, os_reason_t jetsam_reason) -{ - boolean_t killed; - proc_t p; - uint64_t killtime = 0; - clock_sec_t tv_sec; - clock_usec_t tv_usec; - uint32_t tv_msec; - - /* TODO - add a victim queue and push this into the main jetsam thread */ - - p = proc_find(victim_pid); - if (!p) { - os_reason_free(jetsam_reason); - return FALSE; - } - - proc_list_lock(); - - if (memorystatus_jetsam_snapshot_count == 0) { - memorystatus_init_jetsam_snapshot_locked(NULL, 0); - } - - killtime = mach_absolute_time(); - absolutetime_to_microtime(killtime, &tv_sec, &tv_usec); - tv_msec = tv_usec / 1000; - - memorystatus_update_jetsam_snapshot_entry_locked(p, cause, killtime); - - proc_list_unlock(); - - os_log_with_startup_serial(OS_LOG_DEFAULT, "%lu.%03d memorystatus: killing_specific_process pid %d [%s] (%s %d) - memorystatus_available_pages: %llu\n", - (unsigned long)tv_sec, tv_msec, victim_pid, (*p->p_name ? p->p_name : "unknown"), - memorystatus_kill_cause_name[cause], p->p_memstat_effectivepriority, (uint64_t)memorystatus_available_pages); - - killed = memorystatus_do_kill(p, cause, jetsam_reason); - proc_rele(p); - - return killed; -} - - -/* - * Toggle the P_MEMSTAT_TERMINATED state. - * Takes the proc_list_lock. - */ -void -proc_memstat_terminated(proc_t p, boolean_t set) -{ -#if DEVELOPMENT || DEBUG - if (p) { - proc_list_lock(); - if (set == TRUE) { - p->p_memstat_state |= P_MEMSTAT_TERMINATED; - } else { - p->p_memstat_state &= ~P_MEMSTAT_TERMINATED; - } - proc_list_unlock(); - } -#else -#pragma unused(p, set) - /* - * do nothing - */ -#endif /* DEVELOPMENT || DEBUG */ - return; -} - - -#if CONFIG_JETSAM -/* - * This is invoked when cpulimits have been exceeded while in fatal mode. - * The jetsam_flags do not apply as those are for memory related kills. - * We call this routine so that the offending process is killed with - * a non-zero exit status. - */ -void -jetsam_on_ledger_cpulimit_exceeded(void) -{ - int retval = 0; - int jetsam_flags = 0; /* make it obvious */ - proc_t p = current_proc(); - os_reason_t jetsam_reason = OS_REASON_NULL; - - printf("task_exceeded_cpulimit: killing pid %d [%s]\n", - p->p_pid, (*p->p_name ? p->p_name : "(unknown)")); - - jetsam_reason = os_reason_create(OS_REASON_JETSAM, JETSAM_REASON_CPULIMIT); - if (jetsam_reason == OS_REASON_NULL) { - printf("task_exceeded_cpulimit: unable to allocate memory for jetsam reason\n"); - } - - retval = jetsam_do_kill(p, jetsam_flags, jetsam_reason); - - if (retval) { - printf("task_exceeded_cpulimit: failed to kill current task (exiting?).\n"); - } -} - -#endif /* CONFIG_JETSAM */ - -static void -memorystatus_get_task_memory_region_count(task_t task, uint64_t *count) -{ - assert(task); - assert(count); - - *count = get_task_memory_region_count(task); -} - - -#define MEMORYSTATUS_VM_MAP_FORK_ALLOWED 0x100000000 -#define MEMORYSTATUS_VM_MAP_FORK_NOT_ALLOWED 0x200000000 - -#if DEVELOPMENT || DEBUG - -/* - * Sysctl only used to test memorystatus_allowed_vm_map_fork() path. - * set a new pidwatch value - * or - * get the current pidwatch value - * - * The pidwatch_val starts out with a PID to watch for in the map_fork path. - * Its value is: - * - OR'd with MEMORYSTATUS_VM_MAP_FORK_ALLOWED if we allow the map_fork. - * - OR'd with MEMORYSTATUS_VM_MAP_FORK_NOT_ALLOWED if we disallow the map_fork. - * - set to -1ull if the map_fork() is aborted for other reasons. - */ - -uint64_t memorystatus_vm_map_fork_pidwatch_val = 0; - -static int sysctl_memorystatus_vm_map_fork_pidwatch SYSCTL_HANDLER_ARGS { -#pragma unused(oidp, arg1, arg2) - - uint64_t new_value = 0; - uint64_t old_value = 0; - int error = 0; - - /* - * The pid is held in the low 32 bits. - * The 'allowed' flags are in the upper 32 bits. - */ - old_value = memorystatus_vm_map_fork_pidwatch_val; - - error = sysctl_io_number(req, old_value, sizeof(old_value), &new_value, NULL); - - if (error || !req->newptr) { - /* - * No new value passed in. - */ - return error; - } - - /* - * A new pid was passed in via req->newptr. - * Ignore any attempt to set the higher order bits. - */ - memorystatus_vm_map_fork_pidwatch_val = new_value & 0xFFFFFFFF; - printf("memorystatus: pidwatch old_value = 0x%llx, new_value = 0x%llx \n", old_value, new_value); - - return error; -} - -SYSCTL_PROC(_kern, OID_AUTO, memorystatus_vm_map_fork_pidwatch, CTLTYPE_QUAD | CTLFLAG_RW | CTLFLAG_LOCKED | CTLFLAG_MASKED, - 0, 0, sysctl_memorystatus_vm_map_fork_pidwatch, "Q", "get/set pid watched for in vm_map_fork"); - - -/* - * Record if a watched process fails to qualify for a vm_map_fork(). - */ -void -memorystatus_abort_vm_map_fork(task_t task) -{ - if (memorystatus_vm_map_fork_pidwatch_val != 0) { - proc_t p = get_bsdtask_info(task); - if (p != NULL && memorystatus_vm_map_fork_pidwatch_val == (uint64_t)p->p_pid) { - memorystatus_vm_map_fork_pidwatch_val = -1ull; - } - } -} - -static void -set_vm_map_fork_pidwatch(task_t task, uint64_t x) -{ - if (memorystatus_vm_map_fork_pidwatch_val != 0) { - proc_t p = get_bsdtask_info(task); - if (p && (memorystatus_vm_map_fork_pidwatch_val == (uint64_t)p->p_pid)) { - memorystatus_vm_map_fork_pidwatch_val |= x; - } - } -} - -#else /* DEVELOPMENT || DEBUG */ - - -static void -set_vm_map_fork_pidwatch(task_t task, uint64_t x) -{ -#pragma unused(task) -#pragma unused(x) -} - -#endif /* DEVELOPMENT || DEBUG */ - -/* - * Called during EXC_RESOURCE handling when a process exceeds a soft - * memory limit. This is the corpse fork path and here we decide if - * vm_map_fork will be allowed when creating the corpse. - * The task being considered is suspended. - * - * By default, a vm_map_fork is allowed to proceed. - * - * A few simple policy assumptions: - * Desktop platform is not considered in this path. - * The vm_map_fork is always allowed. - * - * If the device has a zero system-wide task limit, - * then the vm_map_fork is allowed. - * - * And if a process's memory footprint calculates less - * than or equal to half of the system-wide task limit, - * then the vm_map_fork is allowed. This calculation - * is based on the assumption that a process can - * munch memory up to the system-wide task limit. - */ -boolean_t -memorystatus_allowed_vm_map_fork(task_t task) -{ - boolean_t is_allowed = TRUE; /* default */ - -#if CONFIG_EMBEDDED - - uint64_t footprint_in_bytes; - uint64_t max_allowed_bytes; - - if (max_task_footprint_mb == 0) { - set_vm_map_fork_pidwatch(task, MEMORYSTATUS_VM_MAP_FORK_ALLOWED); - return is_allowed; - } - - footprint_in_bytes = get_task_phys_footprint(task); - - /* - * Maximum is 1/4 of the system-wide task limit. - */ - max_allowed_bytes = ((uint64_t)max_task_footprint_mb * 1024 * 1024) >> 2; - - if (footprint_in_bytes > max_allowed_bytes) { - printf("memorystatus disallowed vm_map_fork %lld %lld\n", footprint_in_bytes, max_allowed_bytes); - set_vm_map_fork_pidwatch(task, MEMORYSTATUS_VM_MAP_FORK_NOT_ALLOWED); - return !is_allowed; - } -#endif /* CONFIG_EMBEDDED */ - - set_vm_map_fork_pidwatch(task, MEMORYSTATUS_VM_MAP_FORK_ALLOWED); - return is_allowed; -} - -static void -memorystatus_get_task_page_counts(task_t task, uint32_t *footprint, uint32_t *max_footprint_lifetime, uint32_t *purgeable_pages) -{ - assert(task); - assert(footprint); - - uint64_t pages; - - pages = (get_task_phys_footprint(task) / PAGE_SIZE_64); - assert(((uint32_t)pages) == pages); - *footprint = (uint32_t)pages; - - if (max_footprint_lifetime) { - pages = (get_task_phys_footprint_lifetime_max(task) / PAGE_SIZE_64); - assert(((uint32_t)pages) == pages); - *max_footprint_lifetime = (uint32_t)pages; - } - if (purgeable_pages) { - pages = (get_task_purgeable_size(task) / PAGE_SIZE_64); - assert(((uint32_t)pages) == pages); - *purgeable_pages = (uint32_t)pages; - } -} - -static void -memorystatus_get_task_phys_footprint_page_counts(task_t task, - uint64_t *internal_pages, uint64_t *internal_compressed_pages, - uint64_t *purgeable_nonvolatile_pages, uint64_t *purgeable_nonvolatile_compressed_pages, - uint64_t *alternate_accounting_pages, uint64_t *alternate_accounting_compressed_pages, - uint64_t *iokit_mapped_pages, uint64_t *page_table_pages) -{ - assert(task); - - if (internal_pages) { - *internal_pages = (get_task_internal(task) / PAGE_SIZE_64); - } - - if (internal_compressed_pages) { - *internal_compressed_pages = (get_task_internal_compressed(task) / PAGE_SIZE_64); - } - - if (purgeable_nonvolatile_pages) { - *purgeable_nonvolatile_pages = (get_task_purgeable_nonvolatile(task) / PAGE_SIZE_64); - } - - if (purgeable_nonvolatile_compressed_pages) { - *purgeable_nonvolatile_compressed_pages = (get_task_purgeable_nonvolatile_compressed(task) / PAGE_SIZE_64); - } - - if (alternate_accounting_pages) { - *alternate_accounting_pages = (get_task_alternate_accounting(task) / PAGE_SIZE_64); - } - - if (alternate_accounting_compressed_pages) { - *alternate_accounting_compressed_pages = (get_task_alternate_accounting_compressed(task) / PAGE_SIZE_64); - } - - if (iokit_mapped_pages) { - *iokit_mapped_pages = (get_task_iokit_mapped(task) / PAGE_SIZE_64); - } - - if (page_table_pages) { - *page_table_pages = (get_task_page_table(task) / PAGE_SIZE_64); - } -} - -/* - * This routine only acts on the global jetsam event snapshot. - * Updating the process's entry can race when the memorystatus_thread - * has chosen to kill a process that is racing to exit on another core. - */ -static void -memorystatus_update_jetsam_snapshot_entry_locked(proc_t p, uint32_t kill_cause, uint64_t killtime) -{ - memorystatus_jetsam_snapshot_entry_t *entry = NULL; - memorystatus_jetsam_snapshot_t *snapshot = NULL; - memorystatus_jetsam_snapshot_entry_t *snapshot_list = NULL; - - unsigned int i; - - LCK_MTX_ASSERT(proc_list_mlock, LCK_MTX_ASSERT_OWNED); - - if (memorystatus_jetsam_snapshot_count == 0) { - /* - * No active snapshot. - * Nothing to do. - */ - return; - } - - /* - * Sanity check as this routine should only be called - * from a jetsam kill path. - */ - assert(kill_cause != 0 && killtime != 0); - - snapshot = memorystatus_jetsam_snapshot; - snapshot_list = memorystatus_jetsam_snapshot->entries; - - for (i = 0; i < memorystatus_jetsam_snapshot_count; i++) { - if (snapshot_list[i].pid == p->p_pid) { - entry = &snapshot_list[i]; - - if (entry->killed || entry->jse_killtime) { - /* - * We apparently raced on the exit path - * for this process, as it's snapshot entry - * has already recorded a kill. - */ - assert(entry->killed && entry->jse_killtime); - break; - } - - /* - * Update the entry we just found in the snapshot. - */ - - entry->killed = kill_cause; - entry->jse_killtime = killtime; - entry->jse_gencount = snapshot->js_gencount; - entry->jse_idle_delta = p->p_memstat_idle_delta; -#if CONFIG_FREEZE - entry->jse_thaw_count = p->p_memstat_thaw_count; -#else /* CONFIG_FREEZE */ - entry->jse_thaw_count = 0; -#endif /* CONFIG_FREEZE */ - - /* - * If a process has moved between bands since snapshot was - * initialized, then likely these fields changed too. - */ - if (entry->priority != p->p_memstat_effectivepriority) { - strlcpy(entry->name, p->p_name, sizeof(entry->name)); - entry->priority = p->p_memstat_effectivepriority; - entry->state = memorystatus_build_state(p); - entry->user_data = p->p_memstat_userdata; - entry->fds = p->p_fd->fd_nfiles; - } - - /* - * Always update the page counts on a kill. - */ - - uint32_t pages = 0; - uint32_t max_pages_lifetime = 0; - uint32_t purgeable_pages = 0; - - memorystatus_get_task_page_counts(p->task, &pages, &max_pages_lifetime, &purgeable_pages); - entry->pages = (uint64_t)pages; - entry->max_pages_lifetime = (uint64_t)max_pages_lifetime; - entry->purgeable_pages = (uint64_t)purgeable_pages; - - uint64_t internal_pages = 0; - uint64_t internal_compressed_pages = 0; - uint64_t purgeable_nonvolatile_pages = 0; - uint64_t purgeable_nonvolatile_compressed_pages = 0; - uint64_t alternate_accounting_pages = 0; - uint64_t alternate_accounting_compressed_pages = 0; - uint64_t iokit_mapped_pages = 0; - uint64_t page_table_pages = 0; - - memorystatus_get_task_phys_footprint_page_counts(p->task, &internal_pages, &internal_compressed_pages, - &purgeable_nonvolatile_pages, &purgeable_nonvolatile_compressed_pages, - &alternate_accounting_pages, &alternate_accounting_compressed_pages, - &iokit_mapped_pages, &page_table_pages); - - entry->jse_internal_pages = internal_pages; - entry->jse_internal_compressed_pages = internal_compressed_pages; - entry->jse_purgeable_nonvolatile_pages = purgeable_nonvolatile_pages; - entry->jse_purgeable_nonvolatile_compressed_pages = purgeable_nonvolatile_compressed_pages; - entry->jse_alternate_accounting_pages = alternate_accounting_pages; - entry->jse_alternate_accounting_compressed_pages = alternate_accounting_compressed_pages; - entry->jse_iokit_mapped_pages = iokit_mapped_pages; - entry->jse_page_table_pages = page_table_pages; - - uint64_t region_count = 0; - memorystatus_get_task_memory_region_count(p->task, ®ion_count); - entry->jse_memory_region_count = region_count; - - goto exit; - } - } - - if (entry == NULL) { - /* - * The entry was not found in the snapshot, so the process must have - * launched after the snapshot was initialized. - * Let's try to append the new entry. - */ - if (memorystatus_jetsam_snapshot_count < memorystatus_jetsam_snapshot_max) { - /* - * A populated snapshot buffer exists - * and there is room to init a new entry. - */ - assert(memorystatus_jetsam_snapshot_count == snapshot->entry_count); - - unsigned int next = memorystatus_jetsam_snapshot_count; - - if (memorystatus_init_jetsam_snapshot_entry_locked(p, &snapshot_list[next], (snapshot->js_gencount)) == TRUE) { - entry = &snapshot_list[next]; - entry->killed = kill_cause; - entry->jse_killtime = killtime; - - snapshot->entry_count = ++next; - memorystatus_jetsam_snapshot_count = next; - - if (memorystatus_jetsam_snapshot_count >= memorystatus_jetsam_snapshot_max) { - /* - * We just used the last slot in the snapshot buffer. - * We only want to log it once... so we do it here - * when we notice we've hit the max. - */ - printf("memorystatus: WARNING snapshot buffer is full, count %d\n", - memorystatus_jetsam_snapshot_count); - } - } - } - } - -exit: - if (entry == NULL) { - /* - * If we reach here, the snapshot buffer could not be updated. - * Most likely, the buffer is full, in which case we would have - * logged a warning in the previous call. - * - * For now, we will stop appending snapshot entries. - * When the buffer is consumed, the snapshot state will reset. - */ - - MEMORYSTATUS_DEBUG(4, "memorystatus_update_jetsam_snapshot_entry_locked: failed to update pid %d, priority %d, count %d\n", - p->p_pid, p->p_memstat_effectivepriority, memorystatus_jetsam_snapshot_count); - } - - return; -} - -#if CONFIG_JETSAM -void -memorystatus_pages_update(unsigned int pages_avail) -{ - memorystatus_available_pages = pages_avail; - -#if VM_PRESSURE_EVENTS - /* - * Since memorystatus_available_pages changes, we should - * re-evaluate the pressure levels on the system and - * check if we need to wake the pressure thread. - * We also update memorystatus_level in that routine. - */ - vm_pressure_response(); - - if (memorystatus_available_pages <= memorystatus_available_pages_pressure) { - if (memorystatus_hwm_candidates || (memorystatus_available_pages <= memorystatus_available_pages_critical)) { - memorystatus_thread_wake(); - } - } -#if CONFIG_FREEZE - /* - * We can't grab the freezer_mutex here even though that synchronization would be correct to inspect - * the # of frozen processes and wakeup the freezer thread. Reason being that we come here into this - * code with (possibly) the page-queue locks held and preemption disabled. So trying to grab a mutex here - * will result in the "mutex with preemption disabled" panic. - */ - - if (memorystatus_freeze_thread_should_run() == TRUE) { - /* - * The freezer thread is usually woken up by some user-space call i.e. pid_hibernate(any process). - * That trigger isn't invoked often enough and so we are enabling this explicit wakeup here. - */ - if (VM_CONFIG_FREEZER_SWAP_IS_ACTIVE) { - thread_wakeup((event_t)&memorystatus_freeze_wakeup); - } - } -#endif /* CONFIG_FREEZE */ - -#else /* VM_PRESSURE_EVENTS */ - - boolean_t critical, delta; - - if (!memorystatus_delta) { - return; - } - - critical = (pages_avail < memorystatus_available_pages_critical) ? TRUE : FALSE; - delta = ((pages_avail >= (memorystatus_available_pages + memorystatus_delta)) - || (memorystatus_available_pages >= (pages_avail + memorystatus_delta))) ? TRUE : FALSE; - - if (critical || delta) { - unsigned int total_pages; - - total_pages = (unsigned int) atop_64(max_mem); -#if CONFIG_SECLUDED_MEMORY - total_pages -= vm_page_secluded_count; -#endif /* CONFIG_SECLUDED_MEMORY */ - memorystatus_level = memorystatus_available_pages * 100 / total_pages; - memorystatus_thread_wake(); - } -#endif /* VM_PRESSURE_EVENTS */ -} -#endif /* CONFIG_JETSAM */ - -static boolean_t -memorystatus_init_jetsam_snapshot_entry_locked(proc_t p, memorystatus_jetsam_snapshot_entry_t *entry, uint64_t gencount) -{ - clock_sec_t tv_sec; - clock_usec_t tv_usec; - uint32_t pages = 0; - uint32_t max_pages_lifetime = 0; - uint32_t purgeable_pages = 0; - uint64_t internal_pages = 0; - uint64_t internal_compressed_pages = 0; - uint64_t purgeable_nonvolatile_pages = 0; - uint64_t purgeable_nonvolatile_compressed_pages = 0; - uint64_t alternate_accounting_pages = 0; - uint64_t alternate_accounting_compressed_pages = 0; - uint64_t iokit_mapped_pages = 0; - uint64_t page_table_pages = 0; - uint64_t region_count = 0; - uint64_t cids[COALITION_NUM_TYPES]; + if (ret != 0) { + goto exit; + } - memset(entry, 0, sizeof(memorystatus_jetsam_snapshot_entry_t)); + if (p->p_memstat_dirty & P_DIRTY_IS_DIRTY) { + now_dirty = TRUE; + } - entry->pid = p->p_pid; - strlcpy(&entry->name[0], p->p_name, sizeof(entry->name)); - entry->priority = p->p_memstat_effectivepriority; + if ((was_dirty == TRUE && now_dirty == FALSE) || + (was_dirty == FALSE && now_dirty == TRUE)) { + /* Manage idle exit deferral, if applied */ + if ((p->p_memstat_dirty & P_DIRTY_IDLE_EXIT_ENABLED) == P_DIRTY_IDLE_EXIT_ENABLED) { + /* + * Legacy mode: P_DIRTY_AGING_IN_PROGRESS means the process is in the aging band OR it might be heading back + * there once it's clean again. For the legacy case, this only applies if it has some protection window left. + * P_DIRTY_DEFER: one-time protection window given at launch + * P_DIRTY_DEFER_ALWAYS: protection window given for every dirty->clean transition. Like non-legacy mode. + * + * Non-Legacy mode: P_DIRTY_AGING_IN_PROGRESS means the process is in the aging band. It will always stop over + * in that band on it's way to IDLE. + */ - memorystatus_get_task_page_counts(p->task, &pages, &max_pages_lifetime, &purgeable_pages); - entry->pages = (uint64_t)pages; - entry->max_pages_lifetime = (uint64_t)max_pages_lifetime; - entry->purgeable_pages = (uint64_t)purgeable_pages; + if (p->p_memstat_dirty & P_DIRTY_IS_DIRTY) { + /* + * New dirty process i.e. "was_dirty == FALSE && now_dirty == TRUE" + * + * The process will move from its aging band to its higher requested + * jetsam band. + */ + boolean_t reset_state = (jetsam_aging_policy != kJetsamAgingPolicyLegacy) ? TRUE : FALSE; - memorystatus_get_task_phys_footprint_page_counts(p->task, &internal_pages, &internal_compressed_pages, - &purgeable_nonvolatile_pages, &purgeable_nonvolatile_compressed_pages, - &alternate_accounting_pages, &alternate_accounting_compressed_pages, - &iokit_mapped_pages, &page_table_pages); + memorystatus_invalidate_idle_demotion_locked(p, reset_state); + reschedule = TRUE; + } else { + /* + * Process is back from "dirty" to "clean". + */ - entry->jse_internal_pages = internal_pages; - entry->jse_internal_compressed_pages = internal_compressed_pages; - entry->jse_purgeable_nonvolatile_pages = purgeable_nonvolatile_pages; - entry->jse_purgeable_nonvolatile_compressed_pages = purgeable_nonvolatile_compressed_pages; - entry->jse_alternate_accounting_pages = alternate_accounting_pages; - entry->jse_alternate_accounting_compressed_pages = alternate_accounting_compressed_pages; - entry->jse_iokit_mapped_pages = iokit_mapped_pages; - entry->jse_page_table_pages = page_table_pages; + if (jetsam_aging_policy == kJetsamAgingPolicyLegacy) { + if (((p->p_memstat_dirty & P_DIRTY_DEFER_ALWAYS) == FALSE) && + (mach_absolute_time() >= p->p_memstat_idledeadline)) { + /* + * The process' hasn't enrolled in the "always defer after dirty" + * mode and its deadline has expired. It currently + * does not reside in any of the aging buckets. + * + * It's on its way to the JETSAM_PRIORITY_IDLE + * bucket via memorystatus_update_idle_priority_locked() + * below. + * + * So all we need to do is reset all the state on the + * process that's related to the aging bucket i.e. + * the AGING_IN_PROGRESS flag and the timer deadline. + */ - memorystatus_get_task_memory_region_count(p->task, ®ion_count); - entry->jse_memory_region_count = region_count; + memorystatus_invalidate_idle_demotion_locked(p, TRUE); + reschedule = TRUE; + } else { + /* + * Process enrolled in "always stop in deferral band after dirty" OR + * it still has some protection window left and so + * we just re-arm the timer without modifying any + * state on the process iff it still wants into that band. + */ - entry->state = memorystatus_build_state(p); - entry->user_data = p->p_memstat_userdata; - memcpy(&entry->uuid[0], &p->p_uuid[0], sizeof(p->p_uuid)); - entry->fds = p->p_fd->fd_nfiles; + if (p->p_memstat_dirty & P_DIRTY_DEFER_ALWAYS) { + memorystatus_schedule_idle_demotion_locked(p, TRUE); + reschedule = TRUE; + } else if (p->p_memstat_dirty & P_DIRTY_AGING_IN_PROGRESS) { + memorystatus_schedule_idle_demotion_locked(p, FALSE); + reschedule = TRUE; + } + } + } else { + memorystatus_schedule_idle_demotion_locked(p, TRUE); + reschedule = TRUE; + } + } + } - absolutetime_to_microtime(get_task_cpu_time(p->task), &tv_sec, &tv_usec); - entry->cpu_time.tv_sec = (int64_t)tv_sec; - entry->cpu_time.tv_usec = (int64_t)tv_usec; + memorystatus_update_idle_priority_locked(p); - assert(p->p_stats != NULL); - entry->jse_starttime = p->p_stats->ps_start; /* abstime process started */ - entry->jse_killtime = 0; /* abstime jetsam chose to kill process */ - entry->killed = 0; /* the jetsam kill cause */ - entry->jse_gencount = gencount; /* indicates a pass through jetsam thread, when process was targeted to be killed */ + if (memorystatus_highwater_enabled) { + boolean_t ledger_update_needed = TRUE; + boolean_t use_active; + boolean_t is_fatal; + /* + * We are in this path because this process transitioned between + * dirty <--> clean state. Update the cached memory limits. + */ - entry->jse_idle_delta = p->p_memstat_idle_delta; /* Most recent timespan spent in idle-band */ + if (proc_jetsam_state_is_active_locked(p) == TRUE) { + /* + * process is pinned in elevated band + * or + * process is dirty + */ + CACHE_ACTIVE_LIMITS_LOCKED(p, is_fatal); + use_active = TRUE; + ledger_update_needed = TRUE; + } else { + /* + * process is clean...but if it has opted into pressured-exit + * we don't apply the INACTIVE limit till the process has aged + * out and is entering the IDLE band. + * See memorystatus_update_priority_locked() for that. + */ -#if CONFIG_FREEZE - entry->jse_thaw_count = p->p_memstat_thaw_count; -#else /* CONFIG_FREEZE */ - entry->jse_thaw_count = 0; -#endif /* CONFIG_FREEZE */ + if (p->p_memstat_dirty & P_DIRTY_ALLOW_IDLE_EXIT) { + ledger_update_needed = FALSE; + } else { + CACHE_INACTIVE_LIMITS_LOCKED(p, is_fatal); + use_active = FALSE; + ledger_update_needed = TRUE; + } + } - proc_coalitionids(p, cids); - entry->jse_coalition_jetsam_id = cids[COALITION_TYPE_JETSAM]; + /* + * Enforce the new limits by writing to the ledger. + * + * This is a hot path and holding the proc_list_lock while writing to the ledgers, + * (where the task lock is taken) is bad. So, we temporarily drop the proc_list_lock. + * We aren't traversing the jetsam bucket list here, so we should be safe. + * See rdar://21394491. + */ - return TRUE; -} + if (ledger_update_needed && proc_ref_locked(p) == p) { + int ledger_limit; + if (p->p_memstat_memlimit > 0) { + ledger_limit = p->p_memstat_memlimit; + } else { + ledger_limit = -1; + } + proc_list_unlock(); + task_set_phys_footprint_limit_internal(p->task, ledger_limit, NULL, use_active, is_fatal); + proc_list_lock(); + proc_rele_locked(p); -static void -memorystatus_init_snapshot_vmstats(memorystatus_jetsam_snapshot_t *snapshot) -{ - kern_return_t kr = KERN_SUCCESS; - mach_msg_type_number_t count = HOST_VM_INFO64_COUNT; - vm_statistics64_data_t vm_stat; + MEMORYSTATUS_DEBUG(3, "memorystatus_dirty_set: new limit on pid %d (%dMB %s) priority(%d) dirty?=0x%x %s\n", + p->p_pid, (p->p_memstat_memlimit > 0 ? p->p_memstat_memlimit : -1), + (p->p_memstat_state & P_MEMSTAT_FATAL_MEMLIMIT ? "F " : "NF"), p->p_memstat_effectivepriority, p->p_memstat_dirty, + (p->p_memstat_dirty ? ((p->p_memstat_dirty & P_DIRTY) ? "isdirty" : "isclean") : "")); + } + } - if ((kr = host_statistics64(host_self(), HOST_VM_INFO64, (host_info64_t)&vm_stat, &count)) != KERN_SUCCESS) { - printf("memorystatus_init_jetsam_snapshot_stats: host_statistics64 failed with %d\n", kr); - memset(&snapshot->stats, 0, sizeof(snapshot->stats)); - } else { - snapshot->stats.free_pages = vm_stat.free_count; - snapshot->stats.active_pages = vm_stat.active_count; - snapshot->stats.inactive_pages = vm_stat.inactive_count; - snapshot->stats.throttled_pages = vm_stat.throttled_count; - snapshot->stats.purgeable_pages = vm_stat.purgeable_count; - snapshot->stats.wired_pages = vm_stat.wire_count; + /* If the deferral state changed, reschedule the demotion timer */ + if (reschedule) { + memorystatus_reschedule_idle_demotion_locked(); + } + } - snapshot->stats.speculative_pages = vm_stat.speculative_count; - snapshot->stats.filebacked_pages = vm_stat.external_page_count; - snapshot->stats.anonymous_pages = vm_stat.internal_page_count; - snapshot->stats.compressions = vm_stat.compressions; - snapshot->stats.decompressions = vm_stat.decompressions; - snapshot->stats.compressor_pages = vm_stat.compressor_page_count; - snapshot->stats.total_uncompressed_pages_in_compressor = vm_stat.total_uncompressed_pages_in_compressor; + if (kill) { + if (proc_ref_locked(p) == p) { + proc_list_unlock(); + psignal(p, SIGKILL); + proc_list_lock(); + proc_rele_locked(p); + } } - get_zone_map_size(&snapshot->stats.zone_map_size, &snapshot->stats.zone_map_capacity); - get_largest_zone_info(snapshot->stats.largest_zone_name, sizeof(snapshot->stats.largest_zone_name), - &snapshot->stats.largest_zone_size); -} +exit: + proc_list_unlock(); -/* - * Collect vm statistics at boot. - * Called only once (see kern_exec.c) - * Data can be consumed at any time. - */ -void -memorystatus_init_at_boot_snapshot() -{ - memorystatus_init_snapshot_vmstats(&memorystatus_at_boot_snapshot); - memorystatus_at_boot_snapshot.entry_count = 0; - memorystatus_at_boot_snapshot.notification_time = 0; /* updated when consumed */ - memorystatus_at_boot_snapshot.snapshot_time = mach_absolute_time(); + return ret; } -static void -memorystatus_init_jetsam_snapshot_locked(memorystatus_jetsam_snapshot_t *od_snapshot, uint32_t ods_list_count ) +int +memorystatus_dirty_clear(proc_t p, uint32_t pcontrol) { - proc_t p, next_p; - unsigned int b = 0, i = 0; + int ret = 0; - memorystatus_jetsam_snapshot_t *snapshot = NULL; - memorystatus_jetsam_snapshot_entry_t *snapshot_list = NULL; - unsigned int snapshot_max = 0; + MEMORYSTATUS_DEBUG(1, "memorystatus_dirty_clear(): %d 0x%x 0x%x\n", p->p_pid, pcontrol, p->p_memstat_dirty); - LCK_MTX_ASSERT(proc_list_mlock, LCK_MTX_ASSERT_OWNED); + KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_DIRTY_CLEAR), p->p_pid, pcontrol, 0, 0, 0); - if (od_snapshot) { - /* - * This is an on_demand snapshot - */ - snapshot = od_snapshot; - snapshot_list = od_snapshot->entries; - snapshot_max = ods_list_count; - } else { + proc_list_lock(); + + if ((p->p_listflag & P_LIST_EXITED) != 0) { /* - * This is a jetsam event snapshot - */ - snapshot = memorystatus_jetsam_snapshot; - snapshot_list = memorystatus_jetsam_snapshot->entries; - snapshot_max = memorystatus_jetsam_snapshot_max; + * Process is on its way out. + */ + ret = EBUSY; + goto exit; } - /* - * Init the snapshot header information - */ - memorystatus_init_snapshot_vmstats(snapshot); - snapshot->snapshot_time = mach_absolute_time(); - snapshot->notification_time = 0; - snapshot->js_gencount = 0; + if (p->p_memstat_state & P_MEMSTAT_INTERNAL) { + ret = EPERM; + goto exit; + } - next_p = memorystatus_get_first_proc_locked(&b, TRUE); - while (next_p) { - p = next_p; - next_p = memorystatus_get_next_proc_locked(&b, p, TRUE); + if (!(p->p_memstat_dirty & P_DIRTY_TRACK)) { + /* Dirty tracking not enabled */ + ret = EINVAL; + goto exit; + } - if (FALSE == memorystatus_init_jetsam_snapshot_entry_locked(p, &snapshot_list[i], snapshot->js_gencount)) { - continue; - } + if (!pcontrol || (pcontrol & (PROC_DIRTY_LAUNCH_IN_PROGRESS | PROC_DIRTY_DEFER | PROC_DIRTY_DEFER_ALWAYS)) == 0) { + ret = EINVAL; + goto exit; + } - MEMORYSTATUS_DEBUG(0, "jetsam snapshot pid %d, uuid = %02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x\n", - p->p_pid, - p->p_uuid[0], p->p_uuid[1], p->p_uuid[2], p->p_uuid[3], p->p_uuid[4], p->p_uuid[5], p->p_uuid[6], p->p_uuid[7], - p->p_uuid[8], p->p_uuid[9], p->p_uuid[10], p->p_uuid[11], p->p_uuid[12], p->p_uuid[13], p->p_uuid[14], p->p_uuid[15]); + if (pcontrol & PROC_DIRTY_LAUNCH_IN_PROGRESS) { + p->p_memstat_dirty &= ~P_DIRTY_LAUNCH_IN_PROGRESS; + } - if (++i == snapshot_max) { - break; + /* This can be set and cleared exactly once. */ + if (pcontrol & (PROC_DIRTY_DEFER | PROC_DIRTY_DEFER_ALWAYS)) { + if (p->p_memstat_dirty & P_DIRTY_DEFER) { + p->p_memstat_dirty &= ~(P_DIRTY_DEFER); } - } - snapshot->entry_count = i; + if (p->p_memstat_dirty & P_DIRTY_DEFER_ALWAYS) { + p->p_memstat_dirty &= ~(P_DIRTY_DEFER_ALWAYS); + } - if (!od_snapshot) { - /* update the system buffer count */ - memorystatus_jetsam_snapshot_count = i; + memorystatus_invalidate_idle_demotion_locked(p, TRUE); + memorystatus_update_idle_priority_locked(p); + memorystatus_reschedule_idle_demotion_locked(); } -} -#if DEVELOPMENT || DEBUG + ret = 0; +exit: + proc_list_unlock(); -#if CONFIG_JETSAM -static int -memorystatus_cmd_set_panic_bits(user_addr_t buffer, uint32_t buffer_size) + return ret; +} + +int +memorystatus_dirty_get(proc_t p, boolean_t locked) { - int ret; - memorystatus_jetsam_panic_options_t debug; + int ret = 0; - if (buffer_size != sizeof(memorystatus_jetsam_panic_options_t)) { - return EINVAL; + if (!locked) { + proc_list_lock(); } - ret = copyin(buffer, &debug, buffer_size); - if (ret) { - return ret; + if (p->p_memstat_dirty & P_DIRTY_TRACK) { + ret |= PROC_DIRTY_TRACKED; + if (p->p_memstat_dirty & P_DIRTY_ALLOW_IDLE_EXIT) { + ret |= PROC_DIRTY_ALLOWS_IDLE_EXIT; + } + if (p->p_memstat_dirty & P_DIRTY) { + ret |= PROC_DIRTY_IS_DIRTY; + } + if (p->p_memstat_dirty & P_DIRTY_LAUNCH_IN_PROGRESS) { + ret |= PROC_DIRTY_LAUNCH_IS_IN_PROGRESS; + } } - /* Panic bits match kMemorystatusKilled* enum */ - memorystatus_jetsam_panic_debug = (memorystatus_jetsam_panic_debug & ~debug.mask) | (debug.data & debug.mask); - - /* Copyout new value */ - debug.data = memorystatus_jetsam_panic_debug; - ret = copyout(&debug, buffer, sizeof(memorystatus_jetsam_panic_options_t)); + if (!locked) { + proc_list_unlock(); + } return ret; } -#endif /* CONFIG_JETSAM */ -/* - * Triggers a sort_order on a specified jetsam priority band. - * This is for testing only, used to force a path through the sort - * function. - */ -static int -memorystatus_cmd_test_jetsam_sort(int priority, int sort_order) +int +memorystatus_on_terminate(proc_t p) { - int error = 0; + int sig; - unsigned int bucket_index = 0; + proc_list_lock(); - if (priority == -1) { - /* Use as shorthand for default priority */ - bucket_index = JETSAM_PRIORITY_DEFAULT; + p->p_memstat_dirty |= P_DIRTY_TERMINATED; + + if ((p->p_memstat_dirty & (P_DIRTY_TRACK | P_DIRTY_IS_DIRTY)) == P_DIRTY_TRACK) { + /* Clean; mark as terminated and issue SIGKILL */ + sig = SIGKILL; } else { - bucket_index = (unsigned int)priority; + /* Dirty, terminated, or state tracking is unsupported; issue SIGTERM to allow cleanup */ + sig = SIGTERM; } - error = memorystatus_sort_bucket(bucket_index, sort_order); + proc_list_unlock(); - return error; + return sig; } -#endif /* DEVELOPMENT || DEBUG */ - -/* - * Prepare the process to be killed (set state, update snapshot) and kill it. - */ -static uint64_t memorystatus_purge_before_jetsam_success = 0; - -static boolean_t -memorystatus_kill_proc(proc_t p, uint32_t cause, os_reason_t jetsam_reason, boolean_t *killed) +void +memorystatus_on_suspend(proc_t p) { - pid_t aPid = 0; - uint32_t aPid_ep = 0; +#if CONFIG_FREEZE + uint32_t pages; + memorystatus_get_task_page_counts(p->task, &pages, NULL, NULL); +#endif + proc_list_lock(); +#if CONFIG_FREEZE + memorystatus_suspended_count++; +#endif + p->p_memstat_state |= P_MEMSTAT_SUSPENDED; + proc_list_unlock(); +} - uint64_t killtime = 0; - clock_sec_t tv_sec; - clock_usec_t tv_usec; - uint32_t tv_msec; - boolean_t retval = FALSE; - uint64_t num_pages_purged = 0; +void +memorystatus_on_resume(proc_t p) +{ +#if CONFIG_FREEZE + boolean_t frozen; + pid_t pid; +#endif - aPid = p->p_pid; - aPid_ep = p->p_memstat_effectivepriority; + proc_list_lock(); - if (cause != kMemorystatusKilledVnodes && cause != kMemorystatusKilledZoneMapExhaustion) { +#if CONFIG_FREEZE + frozen = (p->p_memstat_state & P_MEMSTAT_FROZEN); + if (frozen) { /* - * Genuine memory pressure and not other (vnode/zone) resource exhaustion. + * Now that we don't _thaw_ a process completely, + * resuming it (and having some on-demand swapins) + * shouldn't preclude it from being counted as frozen. + * + * memorystatus_frozen_count--; + * + * We preserve the P_MEMSTAT_FROZEN state since the process + * could have state on disk AND so will deserve some protection + * in the jetsam bands. */ - boolean_t success = FALSE; + if ((p->p_memstat_state & P_MEMSTAT_REFREEZE_ELIGIBLE) == 0) { + p->p_memstat_state |= P_MEMSTAT_REFREEZE_ELIGIBLE; + memorystatus_refreeze_eligible_count++; + } + p->p_memstat_thaw_count++; - networking_memstatus_callout(p, cause); - num_pages_purged = vm_purgeable_purge_task_owned(p->task); + memorystatus_thaw_count++; + } - if (num_pages_purged) { - /* - * We actually purged something and so let's - * check if we need to continue with the kill. - */ - if (cause == kMemorystatusKilledHiwat) { - uint64_t footprint_in_bytes = get_task_phys_footprint(p->task); - uint64_t memlimit_in_bytes = (((uint64_t)p->p_memstat_memlimit) * 1024ULL * 1024ULL); /* convert MB to bytes */ - success = (footprint_in_bytes <= memlimit_in_bytes); - } else { - success = (memorystatus_avail_pages_below_pressure() == FALSE); - } + memorystatus_suspended_count--; - if (success) { - memorystatus_purge_before_jetsam_success++; + pid = p->p_pid; +#endif + + /* + * P_MEMSTAT_FROZEN will remain unchanged. This used to be: + * p->p_memstat_state &= ~(P_MEMSTAT_SUSPENDED | P_MEMSTAT_FROZEN); + */ + p->p_memstat_state &= ~P_MEMSTAT_SUSPENDED; - os_log_with_startup_serial(OS_LOG_DEFAULT, "memorystatus: purged %llu pages from pid %d [%s] and avoided %s\n", - num_pages_purged, aPid, (*p->p_name ? p->p_name : "unknown"), memorystatus_kill_cause_name[cause]); + proc_list_unlock(); - *killed = FALSE; +#if CONFIG_FREEZE + if (frozen) { + memorystatus_freeze_entry_t data = { pid, FALSE, 0 }; + memorystatus_send_note(kMemorystatusFreezeNote, &data, sizeof(data)); + } +#endif +} - return TRUE; - } - } +void +memorystatus_on_inactivity(proc_t p) +{ +#pragma unused(p) +#if CONFIG_FREEZE + /* Wake the freeze thread */ + thread_wakeup((event_t)&memorystatus_freeze_wakeup); +#endif +} + +/* + * The proc_list_lock is held by the caller. + */ +static uint32_t +memorystatus_build_state(proc_t p) +{ + uint32_t snapshot_state = 0; + + /* General */ + if (p->p_memstat_state & P_MEMSTAT_SUSPENDED) { + snapshot_state |= kMemorystatusSuspended; + } + if (p->p_memstat_state & P_MEMSTAT_FROZEN) { + snapshot_state |= kMemorystatusFrozen; + } + if (p->p_memstat_state & P_MEMSTAT_REFREEZE_ELIGIBLE) { + snapshot_state |= kMemorystatusWasThawed; + } + if (p->p_memstat_state & P_MEMSTAT_PRIORITY_ASSERTION) { + snapshot_state |= kMemorystatusAssertion; + } + + /* Tracking */ + if (p->p_memstat_dirty & P_DIRTY_TRACK) { + snapshot_state |= kMemorystatusTracked; + } + if ((p->p_memstat_dirty & P_DIRTY_IDLE_EXIT_ENABLED) == P_DIRTY_IDLE_EXIT_ENABLED) { + snapshot_state |= kMemorystatusSupportsIdleExit; + } + if (p->p_memstat_dirty & P_DIRTY_IS_DIRTY) { + snapshot_state |= kMemorystatusDirty; } -#if CONFIG_JETSAM && (DEVELOPMENT || DEBUG) - MEMORYSTATUS_DEBUG(1, "jetsam: %s pid %d [%s] - %lld Mb > 1 (%d Mb)\n", - (memorystatus_jetsam_policy & kPolicyDiagnoseActive) ? "suspending": "killing", - aPid, (*p->p_name ? p->p_name : "unknown"), - (footprint_in_bytes / (1024ULL * 1024ULL)), /* converted bytes to MB */ - p->p_memstat_memlimit); -#endif /* CONFIG_JETSAM && (DEVELOPMENT || DEBUG) */ + return snapshot_state; +} - killtime = mach_absolute_time(); - absolutetime_to_microtime(killtime, &tv_sec, &tv_usec); - tv_msec = tv_usec / 1000; +static boolean_t +kill_idle_exit_proc(void) +{ + proc_t p, victim_p = PROC_NULL; + uint64_t current_time, footprint_of_killed_proc; + boolean_t killed = FALSE; + unsigned int i = 0; + os_reason_t jetsam_reason = OS_REASON_NULL; -#if CONFIG_JETSAM && (DEVELOPMENT || DEBUG) - if (memorystatus_jetsam_policy & kPolicyDiagnoseActive) { - if (cause == kMemorystatusKilledHiwat) { - MEMORYSTATUS_DEBUG(1, "jetsam: suspending pid %d [%s] for diagnosis - memorystatus_available_pages: %d\n", - aPid, (*p->p_name ? p->p_name: "(unknown)"), memorystatus_available_pages); - } else { - int activeProcess = p->p_memstat_state & P_MEMSTAT_FOREGROUND; - if (activeProcess) { - MEMORYSTATUS_DEBUG(1, "jetsam: suspending pid %d [%s] (active) for diagnosis - memorystatus_available_pages: %d\n", - aPid, (*p->p_name ? p->p_name: "(unknown)"), memorystatus_available_pages); - - if (memorystatus_jetsam_policy & kPolicyDiagnoseFirst) { - jetsam_diagnostic_suspended_one_active_proc = 1; - printf("jetsam: returning after suspending first active proc - %d\n", aPid); - } - } - } + /* Pick next idle exit victim. */ + current_time = mach_absolute_time(); - proc_list_lock(); - /* This diagnostic code is going away soon. Ignore the kMemorystatusInvalid cause here. */ - memorystatus_update_jetsam_snapshot_entry_locked(p, kMemorystatusInvalid, killtime); - proc_list_unlock(); + jetsam_reason = os_reason_create(OS_REASON_JETSAM, JETSAM_REASON_MEMORY_IDLE_EXIT); + if (jetsam_reason == OS_REASON_NULL) { + printf("kill_idle_exit_proc: failed to allocate jetsam reason\n"); + } - p->p_memstat_state |= P_MEMSTAT_DIAG_SUSPENDED; + proc_list_lock(); - if (p) { - task_suspend(p->task); - *killed = TRUE; + p = memorystatus_get_first_proc_locked(&i, FALSE); + while (p) { + /* No need to look beyond the idle band */ + if (p->p_memstat_effectivepriority != JETSAM_PRIORITY_IDLE) { + break; } - } else -#endif /* CONFIG_JETSAM && (DEVELOPMENT || DEBUG) */ - { - proc_list_lock(); - memorystatus_update_jetsam_snapshot_entry_locked(p, cause, killtime); - proc_list_unlock(); - char kill_reason_string[128]; - - if (cause == kMemorystatusKilledHiwat) { - strlcpy(kill_reason_string, "killing_highwater_process", 128); - } else { - if (aPid_ep == JETSAM_PRIORITY_IDLE) { - strlcpy(kill_reason_string, "killing_idle_process", 128); - } else { - strlcpy(kill_reason_string, "killing_top_process", 128); + if ((p->p_memstat_dirty & (P_DIRTY_ALLOW_IDLE_EXIT | P_DIRTY_IS_DIRTY | P_DIRTY_TERMINATED)) == (P_DIRTY_ALLOW_IDLE_EXIT)) { + if (current_time >= p->p_memstat_idledeadline) { + p->p_memstat_dirty |= P_DIRTY_TERMINATED; + victim_p = proc_ref_locked(p); + break; } } - os_log_with_startup_serial(OS_LOG_DEFAULT, "%lu.%03d memorystatus: %s pid %d [%s] (%s %d) - memorystatus_available_pages: %llu\n", - (unsigned long)tv_sec, tv_msec, kill_reason_string, - aPid, (*p->p_name ? p->p_name : "unknown"), - memorystatus_kill_cause_name[cause], aPid_ep, (uint64_t)memorystatus_available_pages); - - /* - * memorystatus_do_kill drops a reference, so take another one so we can - * continue to use this exit reason even after memorystatus_do_kill() - * returns - */ - os_reason_ref(jetsam_reason); + p = memorystatus_get_next_proc_locked(&i, p, FALSE); + } - retval = memorystatus_do_kill(p, cause, jetsam_reason); + proc_list_unlock(); - *killed = retval; + if (victim_p) { + printf("memorystatus: killing_idle_process pid %d [%s] jetsam_reason->osr_code: %llu\n", victim_p->p_pid, (*victim_p->p_name ? victim_p->p_name : "unknown"), jetsam_reason->osr_code); + killed = memorystatus_do_kill(victim_p, kMemorystatusKilledIdleExit, jetsam_reason, &footprint_of_killed_proc); + proc_rele(victim_p); + } else { + os_reason_free(jetsam_reason); } - return retval; + return killed; } -/* - * Jetsam the first process in the queue. - */ -static boolean_t -memorystatus_kill_top_process(boolean_t any, boolean_t sort_flag, uint32_t cause, os_reason_t jetsam_reason, - int32_t *priority, uint32_t *errors) +static void +memorystatus_thread_wake(void) { - pid_t aPid; - proc_t p = PROC_NULL, next_p = PROC_NULL; - boolean_t new_snapshot = FALSE, force_new_snapshot = FALSE, killed = FALSE, freed_mem = FALSE; - unsigned int i = 0; - uint32_t aPid_ep; - int32_t local_max_kill_prio = JETSAM_PRIORITY_IDLE; - -#ifndef CONFIG_FREEZE -#pragma unused(any) -#endif - - KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_JETSAM) | DBG_FUNC_START, - memorystatus_available_pages, 0, 0, 0, 0); - + int thr_id = 0; + int active_thr = atomic_load(&active_jetsam_threads); -#if CONFIG_JETSAM - if (sort_flag == TRUE) { - (void)memorystatus_sort_bucket(JETSAM_PRIORITY_FOREGROUND, JETSAM_SORT_DEFAULT); + /* Wakeup all the jetsam threads */ + for (thr_id = 0; thr_id < active_thr; thr_id++) { + thread_wakeup((event_t)&jetsam_threads[thr_id].memorystatus_wakeup); } +} - local_max_kill_prio = max_kill_priority; - - force_new_snapshot = FALSE; - -#else /* CONFIG_JETSAM */ - - if (sort_flag == TRUE) { - (void)memorystatus_sort_bucket(JETSAM_PRIORITY_IDLE, JETSAM_SORT_DEFAULT); - } +#if CONFIG_JETSAM - /* - * On macos, we currently only have 2 reasons to be here: - * - * kMemorystatusKilledZoneMapExhaustion - * AND - * kMemorystatusKilledVMCompressorSpaceShortage - * - * If we are here because of kMemorystatusKilledZoneMapExhaustion, we will consider - * any and all processes as eligible kill candidates since we need to avoid a panic. - * - * Since this function can be called async. it is harder to toggle the max_kill_priority - * value before and after a call. And so we use this local variable to set the upper band - * on the eligible kill bands. - */ - if (cause == kMemorystatusKilledZoneMapExhaustion) { - local_max_kill_prio = JETSAM_PRIORITY_MAX; - } else { - local_max_kill_prio = max_kill_priority; - } +static void +memorystatus_thread_pool_max() +{ + /* Increase the jetsam thread pool to max_jetsam_threads */ + int max_threads = max_jetsam_threads; + printf("Expanding memorystatus pool to %d!\n", max_threads); + atomic_store(&active_jetsam_threads, max_threads); +} - /* - * And, because we are here under extreme circumstances, we force a snapshot even for - * IDLE kills. - */ - force_new_snapshot = TRUE; +static void +memorystatus_thread_pool_default() +{ + /* Restore the jetsam thread pool to a single thread */ + printf("Reverting memorystatus pool back to 1\n"); + atomic_store(&active_jetsam_threads, 1); +} #endif /* CONFIG_JETSAM */ - proc_list_lock(); - - next_p = memorystatus_get_first_proc_locked(&i, TRUE); - while (next_p && (next_p->p_memstat_effectivepriority <= local_max_kill_prio)) { -#if DEVELOPMENT || DEBUG - int procSuspendedForDiagnosis; -#endif /* DEVELOPMENT || DEBUG */ +extern void vm_pressure_response(void); - p = next_p; - next_p = memorystatus_get_next_proc_locked(&i, p, TRUE); +static int +memorystatus_thread_block(uint32_t interval_ms, thread_continue_t continuation) +{ + struct jetsam_thread_state *jetsam_thread = jetsam_current_thread(); -#if DEVELOPMENT || DEBUG - procSuspendedForDiagnosis = p->p_memstat_state & P_MEMSTAT_DIAG_SUSPENDED; -#endif /* DEVELOPMENT || DEBUG */ + assert(jetsam_thread != NULL); + if (interval_ms) { + assert_wait_timeout(&jetsam_thread->memorystatus_wakeup, THREAD_UNINT, interval_ms, NSEC_PER_MSEC); + } else { + assert_wait(&jetsam_thread->memorystatus_wakeup, THREAD_UNINT); + } - aPid = p->p_pid; - aPid_ep = p->p_memstat_effectivepriority; + return thread_block(continuation); +} - if (p->p_memstat_state & (P_MEMSTAT_ERROR | P_MEMSTAT_TERMINATED)) { - continue; /* with lock held */ - } +static boolean_t +memorystatus_avail_pages_below_pressure(void) +{ +#if CONFIG_EMBEDDED +/* + * Instead of CONFIG_EMBEDDED for these *avail_pages* routines, we should + * key off of the system having dynamic swap support. With full swap support, + * the system shouldn't really need to worry about various page thresholds. + */ + return memorystatus_available_pages <= memorystatus_available_pages_pressure; +#else /* CONFIG_EMBEDDED */ + return FALSE; +#endif /* CONFIG_EMBEDDED */ +} -#if CONFIG_JETSAM && (DEVELOPMENT || DEBUG) - if ((memorystatus_jetsam_policy & kPolicyDiagnoseActive) && procSuspendedForDiagnosis) { - printf("jetsam: continuing after ignoring proc suspended already for diagnosis - %d\n", aPid); - continue; - } -#endif /* CONFIG_JETSAM && (DEVELOPMENT || DEBUG) */ +static boolean_t +memorystatus_avail_pages_below_critical(void) +{ +#if CONFIG_EMBEDDED + return memorystatus_available_pages <= memorystatus_available_pages_critical; +#else /* CONFIG_EMBEDDED */ + return FALSE; +#endif /* CONFIG_EMBEDDED */ +} - if (cause == kMemorystatusKilledVnodes) { - /* - * If the system runs out of vnodes, we systematically jetsam - * processes in hopes of stumbling onto a vnode gain that helps - * the system recover. The process that happens to trigger - * this path has no known relationship to the vnode shortage. - * Deadlock avoidance: attempt to safeguard the caller. - */ +static boolean_t +memorystatus_post_snapshot(int32_t priority, uint32_t cause) +{ + boolean_t is_idle_priority; - if (p == current_proc()) { - /* do not jetsam the current process */ - continue; - } - } + if (jetsam_aging_policy == kJetsamAgingPolicyLegacy) { + is_idle_priority = (priority == JETSAM_PRIORITY_IDLE); + } else { + is_idle_priority = (priority == JETSAM_PRIORITY_IDLE || priority == JETSAM_PRIORITY_IDLE_DEFERRED); + } +#if CONFIG_EMBEDDED +#pragma unused(cause) + /* + * Don't generate logs for steady-state idle-exit kills, + * unless it is overridden for debug or by the device + * tree. + */ -#if CONFIG_FREEZE - boolean_t skip; - boolean_t reclaim_proc = !(p->p_memstat_state & P_MEMSTAT_LOCKED); - if (any || reclaim_proc) { - skip = FALSE; - } else { - skip = TRUE; - } + return !is_idle_priority || memorystatus_idle_snapshot; - if (skip) { - continue; - } else -#endif - { - if (proc_ref_locked(p) == p) { - /* - * Mark as terminated so that if exit1() indicates success, but the process (for example) - * is blocked in task_exception_notify(), it'll be skipped if encountered again - see - * . This is cheaper than examining P_LEXIT, which requires the - * acquisition of the proc lock. - */ - p->p_memstat_state |= P_MEMSTAT_TERMINATED; - } else { - /* - * We need to restart the search again because - * proc_ref_locked _can_ drop the proc_list lock - * and we could have lost our stored next_p via - * an exit() on another core. - */ - i = 0; - next_p = memorystatus_get_first_proc_locked(&i, TRUE); - continue; - } +#else /* CONFIG_EMBEDDED */ + /* + * Don't generate logs for steady-state idle-exit kills, + * unless + * - it is overridden for debug or by the device + * tree. + * OR + * - the kill causes are important i.e. not kMemorystatusKilledIdleExit + */ - /* - * Capture a snapshot if none exists and: - * - we are forcing a new snapshot creation, either because: - * - on a particular platform we need these snapshots every time, OR - * - a boot-arg/embedded device tree property has been set. - * - priority was not requested (this is something other than an ambient kill) - * - the priority was requested *and* the targeted process is not at idle priority - */ - if ((memorystatus_jetsam_snapshot_count == 0) && - (force_new_snapshot || memorystatus_idle_snapshot || ((!priority) || (priority && (aPid_ep != JETSAM_PRIORITY_IDLE))))) { - memorystatus_init_jetsam_snapshot_locked(NULL, 0); - new_snapshot = TRUE; - } + boolean_t snapshot_eligible_kill_cause = (is_reason_thrashing(cause) || is_reason_zone_map_exhaustion(cause)); + return !is_idle_priority || memorystatus_idle_snapshot || snapshot_eligible_kill_cause; +#endif /* CONFIG_EMBEDDED */ +} - proc_list_unlock(); +static boolean_t +memorystatus_action_needed(void) +{ +#if CONFIG_EMBEDDED + return is_reason_thrashing(kill_under_pressure_cause) || + is_reason_zone_map_exhaustion(kill_under_pressure_cause) || + memorystatus_available_pages <= memorystatus_available_pages_pressure; +#else /* CONFIG_EMBEDDED */ + return is_reason_thrashing(kill_under_pressure_cause) || + is_reason_zone_map_exhaustion(kill_under_pressure_cause); +#endif /* CONFIG_EMBEDDED */ +} - freed_mem = memorystatus_kill_proc(p, cause, jetsam_reason, &killed); /* purged and/or killed 'p' */ - /* Success? */ - if (freed_mem) { - if (killed) { - if (priority) { - *priority = aPid_ep; - } - } else { - /* purged */ - proc_list_lock(); - p->p_memstat_state &= ~P_MEMSTAT_TERMINATED; - proc_list_unlock(); - } - proc_rele(p); - goto exit; - } +static boolean_t +memorystatus_act_on_hiwat_processes(uint32_t *errors, uint32_t *hwm_kill, boolean_t *post_snapshot, __unused boolean_t *is_critical, uint64_t *memory_reclaimed) +{ + boolean_t purged = FALSE, killed = FALSE; - /* - * Failure - first unwind the state, - * then fall through to restart the search. - */ - proc_list_lock(); - proc_rele_locked(p); - p->p_memstat_state &= ~P_MEMSTAT_TERMINATED; - p->p_memstat_state |= P_MEMSTAT_ERROR; - *errors += 1; + *memory_reclaimed = 0; + killed = memorystatus_kill_hiwat_proc(errors, &purged, memory_reclaimed); - i = 0; - next_p = memorystatus_get_first_proc_locked(&i, TRUE); + if (killed) { + *hwm_kill = *hwm_kill + 1; + *post_snapshot = TRUE; + return TRUE; + } else { + if (purged == FALSE) { + /* couldn't purge and couldn't kill */ + memorystatus_hwm_candidates = FALSE; } } - proc_list_unlock(); +#if CONFIG_JETSAM + /* No highwater processes to kill. Continue or stop for now? */ + if (!is_reason_thrashing(kill_under_pressure_cause) && + !is_reason_zone_map_exhaustion(kill_under_pressure_cause) && + (memorystatus_available_pages > memorystatus_available_pages_critical)) { + /* + * We are _not_ out of pressure but we are above the critical threshold and there's: + * - no compressor thrashing + * - enough zone memory + * - no more HWM processes left. + * For now, don't kill any other processes. + */ -exit: - os_reason_free(jetsam_reason); + if (*hwm_kill == 0) { + memorystatus_thread_wasted_wakeup++; + } - /* Clear snapshot if freshly captured and no target was found */ - if (new_snapshot && !killed) { - proc_list_lock(); - memorystatus_jetsam_snapshot->entry_count = memorystatus_jetsam_snapshot_count = 0; - proc_list_unlock(); - } + *is_critical = FALSE; - KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_JETSAM) | DBG_FUNC_END, - memorystatus_available_pages, killed ? aPid : 0, 0, 0, 0); + return TRUE; + } +#endif /* CONFIG_JETSAM */ - return killed; + return FALSE; } /* - * Jetsam aggressively + * kJetsamHighRelaunchCandidatesThreshold defines the percentage of candidates + * in the idle & deferred bands that need to be bad candidates in order to trigger + * aggressive jetsam. */ +#define kJetsamHighRelaunchCandidatesThreshold (100) + +/* kJetsamMinCandidatesThreshold defines the minimum number of candidates in the + * idle/deferred bands to trigger aggressive jetsam. This value basically decides + * how much memory the system is ready to hold in the lower bands without triggering + * aggressive jetsam. This number should ideally be tuned based on the memory config + * of the device. + */ +#define kJetsamMinCandidatesThreshold (5) + static boolean_t -memorystatus_kill_top_process_aggressive(uint32_t cause, int aggr_count, - int32_t priority_max, uint32_t *errors) +memorystatus_aggressive_jetsam_needed_sysproc_aging(__unused int jld_eval_aggressive_count, __unused int *jld_idle_kills, __unused int jld_idle_kill_candidates, int *total_candidates, int *elevated_bucket_count) { - pid_t aPid; - proc_t p = PROC_NULL, next_p = PROC_NULL; - boolean_t new_snapshot = FALSE, killed = FALSE; - int kill_count = 0; - unsigned int i = 0; - int32_t aPid_ep = 0; - unsigned int memorystatus_level_snapshot = 0; - uint64_t killtime = 0; - clock_sec_t tv_sec; - clock_usec_t tv_usec; - uint32_t tv_msec; - os_reason_t jetsam_reason = OS_REASON_NULL; + boolean_t aggressive_jetsam_needed = false; - KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_JETSAM) | DBG_FUNC_START, - memorystatus_available_pages, priority_max, 0, 0, 0); + /* + * For the kJetsamAgingPolicySysProcsReclaimedFirst aging policy, we maintain the jetsam + * relaunch behavior for all daemons. Also, daemons and apps are aged in deferred bands on + * every dirty->clean transition. For this aging policy, the best way to determine if + * aggressive jetsam is needed, is to see if the kill candidates are mostly bad candidates. + * If yes, then we need to go to higher bands to reclaim memory. + */ + proc_list_lock(); + /* Get total candidate counts for idle and idle deferred bands */ + *total_candidates = memstat_bucket[JETSAM_PRIORITY_IDLE].count + memstat_bucket[system_procs_aging_band].count; + /* Get counts of bad kill candidates in idle and idle deferred bands */ + int bad_candidates = memstat_bucket[JETSAM_PRIORITY_IDLE].relaunch_high_count + memstat_bucket[system_procs_aging_band].relaunch_high_count; - memorystatus_sort_bucket(JETSAM_PRIORITY_FOREGROUND, JETSAM_SORT_DEFAULT); + *elevated_bucket_count = memstat_bucket[JETSAM_PRIORITY_ELEVATED_INACTIVE].count; - jetsam_reason = os_reason_create(OS_REASON_JETSAM, cause); - if (jetsam_reason == OS_REASON_NULL) { - printf("memorystatus_kill_top_process_aggressive: failed to allocate exit reason\n"); - } + proc_list_unlock(); - proc_list_lock(); + /* Check if the number of bad candidates is greater than kJetsamHighRelaunchCandidatesThreshold % */ + aggressive_jetsam_needed = (((bad_candidates * 100) / *total_candidates) >= kJetsamHighRelaunchCandidatesThreshold); + + /* + * Since the new aging policy bases the aggressive jetsam trigger on percentage of + * bad candidates, it is prone to being overly aggressive. In order to mitigate that, + * make sure the system is really under memory pressure before triggering aggressive + * jetsam. + */ + if (memorystatus_available_pages > memorystatus_sysproc_aging_aggr_pages) { + aggressive_jetsam_needed = false; + } - next_p = memorystatus_get_first_proc_locked(&i, TRUE); - while (next_p) { #if DEVELOPMENT || DEBUG - int activeProcess; - int procSuspendedForDiagnosis; + printf("memorystatus: aggressive%d: [%s] Bad Candidate Threshold Check (total: %d, bad: %d, threshold: %d %%); Memory Pressure Check (available_pgs: %llu, threshold_pgs: %llu)\n", + jld_eval_aggressive_count, aggressive_jetsam_needed ? "PASSED" : "FAILED", *total_candidates, bad_candidates, + kJetsamHighRelaunchCandidatesThreshold, (uint64_t)memorystatus_available_pages, (uint64_t)memorystatus_sysproc_aging_aggr_pages); #endif /* DEVELOPMENT || DEBUG */ + return aggressive_jetsam_needed; +} - if (((next_p->p_listflag & P_LIST_EXITED) != 0) || - ((unsigned int)(next_p->p_memstat_effectivepriority) != i)) { - /* - * We have raced with next_p running on another core. - * It may be exiting or it may have moved to a different - * jetsam priority band. This means we have lost our - * place in line while traversing the jetsam list. We - * attempt to recover by rewinding to the beginning of the band - * we were already traversing. By doing this, we do not guarantee - * that no process escapes this aggressive march, but we can make - * skipping an entire range of processes less likely. (PR-21069019) - */ - - MEMORYSTATUS_DEBUG(1, "memorystatus: aggressive%d: rewinding band %d, %s(%d) moved or exiting.\n", - aggr_count, i, (*next_p->p_name ? next_p->p_name : "unknown"), next_p->p_pid); +static boolean_t +memorystatus_aggressive_jetsam_needed_default(__unused int jld_eval_aggressive_count, int *jld_idle_kills, int jld_idle_kill_candidates, int *total_candidates, int *elevated_bucket_count) +{ + boolean_t aggressive_jetsam_needed = false; + /* Jetsam Loop Detection - locals */ + memstat_bucket_t *bucket; + int jld_bucket_count = 0; - next_p = memorystatus_get_first_proc_locked(&i, TRUE); - continue; - } + proc_list_lock(); + switch (jetsam_aging_policy) { + case kJetsamAgingPolicyLegacy: + bucket = &memstat_bucket[JETSAM_PRIORITY_IDLE]; + jld_bucket_count = bucket->count; + bucket = &memstat_bucket[JETSAM_PRIORITY_AGING_BAND1]; + jld_bucket_count += bucket->count; + break; + case kJetsamAgingPolicyAppsReclaimedFirst: + bucket = &memstat_bucket[JETSAM_PRIORITY_IDLE]; + jld_bucket_count = bucket->count; + bucket = &memstat_bucket[system_procs_aging_band]; + jld_bucket_count += bucket->count; + bucket = &memstat_bucket[applications_aging_band]; + jld_bucket_count += bucket->count; + break; + case kJetsamAgingPolicyNone: + default: + bucket = &memstat_bucket[JETSAM_PRIORITY_IDLE]; + jld_bucket_count = bucket->count; + break; + } - p = next_p; - next_p = memorystatus_get_next_proc_locked(&i, p, TRUE); + bucket = &memstat_bucket[JETSAM_PRIORITY_ELEVATED_INACTIVE]; + *elevated_bucket_count = bucket->count; + *total_candidates = jld_bucket_count; + proc_list_unlock(); - if (p->p_memstat_effectivepriority > priority_max) { - /* - * Bail out of this killing spree if we have - * reached beyond the priority_max jetsam band. - * That is, we kill up to and through the - * priority_max jetsam band. - */ - proc_list_unlock(); - goto exit; - } + aggressive_jetsam_needed = (*jld_idle_kills > jld_idle_kill_candidates); #if DEVELOPMENT || DEBUG - activeProcess = p->p_memstat_state & P_MEMSTAT_FOREGROUND; - procSuspendedForDiagnosis = p->p_memstat_state & P_MEMSTAT_DIAG_SUSPENDED; + if (aggressive_jetsam_needed) { + printf("memorystatus: aggressive%d: idle candidates: %d, idle kills: %d\n", + jld_eval_aggressive_count, + jld_idle_kill_candidates, + *jld_idle_kills); + } #endif /* DEVELOPMENT || DEBUG */ + return aggressive_jetsam_needed; +} - aPid = p->p_pid; - aPid_ep = p->p_memstat_effectivepriority; - - if (p->p_memstat_state & (P_MEMSTAT_ERROR | P_MEMSTAT_TERMINATED)) { - continue; - } - -#if CONFIG_JETSAM && (DEVELOPMENT || DEBUG) - if ((memorystatus_jetsam_policy & kPolicyDiagnoseActive) && procSuspendedForDiagnosis) { - printf("jetsam: continuing after ignoring proc suspended already for diagnosis - %d\n", aPid); - continue; - } -#endif /* CONFIG_JETSAM && (DEVELOPMENT || DEBUG) */ - - /* - * Capture a snapshot if none exists. - */ - if (memorystatus_jetsam_snapshot_count == 0) { - memorystatus_init_jetsam_snapshot_locked(NULL, 0); - new_snapshot = TRUE; - } +static boolean_t +memorystatus_act_aggressive(uint32_t cause, os_reason_t jetsam_reason, int *jld_idle_kills, boolean_t *corpse_list_purged, boolean_t *post_snapshot, uint64_t *memory_reclaimed) +{ + boolean_t aggressive_jetsam_needed = false; + boolean_t killed; + uint32_t errors = 0; + uint64_t footprint_of_killed_proc = 0; + int elevated_bucket_count = 0; + int total_candidates = 0; + *memory_reclaimed = 0; - /* - * Mark as terminated so that if exit1() indicates success, but the process (for example) - * is blocked in task_exception_notify(), it'll be skipped if encountered again - see - * . This is cheaper than examining P_LEXIT, which requires the - * acquisition of the proc lock. - */ - p->p_memstat_state |= P_MEMSTAT_TERMINATED; + /* + * The aggressive jetsam logic looks at the number of times it has been in the + * aggressive loop to determine the max priority band it should kill upto. The + * static variables below are used to track that property. + * + * To reset those values, the implementation checks if it has been + * memorystatus_jld_eval_period_msecs since the parameters were reset. + */ + static int jld_eval_aggressive_count = 0; + static int32_t jld_priority_band_max = JETSAM_PRIORITY_UI_SUPPORT; + static uint64_t jld_timestamp_msecs = 0; + static int jld_idle_kill_candidates = 0; - killtime = mach_absolute_time(); - absolutetime_to_microtime(killtime, &tv_sec, &tv_usec); - tv_msec = tv_usec / 1000; + if (memorystatus_jld_enabled == FALSE) { + /* If aggressive jetsam is disabled, nothing to do here */ + return FALSE; + } - /* Shift queue, update stats */ - memorystatus_update_jetsam_snapshot_entry_locked(p, cause, killtime); + /* Get current timestamp (msecs only) */ + struct timeval jld_now_tstamp = {0, 0}; + uint64_t jld_now_msecs = 0; + microuptime(&jld_now_tstamp); + jld_now_msecs = (jld_now_tstamp.tv_sec * 1000); + /* + * The aggressive jetsam logic looks at the number of candidates and their + * properties to decide if aggressive jetsam should be engaged. + */ + if (jetsam_aging_policy == kJetsamAgingPolicySysProcsReclaimedFirst) { /* - * In order to kill the target process, we will drop the proc_list_lock. - * To guaranteee that p and next_p don't disappear out from under the lock, - * we must take a ref on both. - * If we cannot get a reference, then it's likely we've raced with - * that process exiting on another core. - */ - if (proc_ref_locked(p) == p) { - if (next_p) { - while (next_p && (proc_ref_locked(next_p) != next_p)) { - proc_t temp_p; - - /* - * We must have raced with next_p exiting on another core. - * Recover by getting the next eligible process in the band. - */ - - MEMORYSTATUS_DEBUG(1, "memorystatus: aggressive%d: skipping %d [%s] (exiting?)\n", - aggr_count, next_p->p_pid, (*next_p->p_name ? next_p->p_name : "(unknown)")); - - temp_p = next_p; - next_p = memorystatus_get_next_proc_locked(&i, temp_p, TRUE); - } - } - proc_list_unlock(); - - printf("%lu.%03d memorystatus: %s%d pid %d [%s] (%s %d) - memorystatus_available_pages: %llu\n", - (unsigned long)tv_sec, tv_msec, - ((aPid_ep == JETSAM_PRIORITY_IDLE) ? "killing_idle_process_aggressive" : "killing_top_process_aggressive"), - aggr_count, aPid, (*p->p_name ? p->p_name : "unknown"), - memorystatus_kill_cause_name[cause], aPid_ep, (uint64_t)memorystatus_available_pages); - - memorystatus_level_snapshot = memorystatus_level; + * For the kJetsamAgingPolicySysProcsReclaimedFirst aging policy, the logic looks at the number of + * candidates in the idle and deferred band and how many out of them are marked as high relaunch + * probability. + */ + aggressive_jetsam_needed = memorystatus_aggressive_jetsam_needed_sysproc_aging(jld_eval_aggressive_count, + jld_idle_kills, jld_idle_kill_candidates, &total_candidates, &elevated_bucket_count); + } else { + /* + * The other aging policies look at number of candidate processes over a specific time window and + * evaluate if the system is in a jetsam loop. If yes, aggressive jetsam is triggered. + */ + aggressive_jetsam_needed = memorystatus_aggressive_jetsam_needed_default(jld_eval_aggressive_count, + jld_idle_kills, jld_idle_kill_candidates, &total_candidates, &elevated_bucket_count); + } - /* - * memorystatus_do_kill() drops a reference, so take another one so we can - * continue to use this exit reason even after memorystatus_do_kill() - * returns. - */ - os_reason_ref(jetsam_reason); - killed = memorystatus_do_kill(p, cause, jetsam_reason); + /* + * Check if its been really long since the aggressive jetsam evaluation + * parameters have been refreshed. This logic also resets the jld_eval_aggressive_count + * counter to make sure we reset the aggressive jetsam severity. + */ + boolean_t param_reval = false; - /* Success? */ - if (killed) { - proc_rele(p); - kill_count++; - p = NULL; - killed = FALSE; + if ((total_candidates == 0) || + (jld_now_msecs > (jld_timestamp_msecs + memorystatus_jld_eval_period_msecs))) { + jld_timestamp_msecs = jld_now_msecs; + jld_idle_kill_candidates = total_candidates; + *jld_idle_kills = 0; + jld_eval_aggressive_count = 0; + jld_priority_band_max = JETSAM_PRIORITY_UI_SUPPORT; + param_reval = true; + } - /* - * Continue the killing spree. - */ - proc_list_lock(); - if (next_p) { - proc_rele_locked(next_p); - } + /* + * If the parameters have been updated, re-evaluate the aggressive_jetsam_needed condition for + * the non kJetsamAgingPolicySysProcsReclaimedFirst policy since its based on jld_idle_kill_candidates etc. + */ + if ((param_reval == true) && (jetsam_aging_policy != kJetsamAgingPolicySysProcsReclaimedFirst)) { + aggressive_jetsam_needed = (*jld_idle_kills > jld_idle_kill_candidates); + } - if (aPid_ep == JETSAM_PRIORITY_FOREGROUND && memorystatus_aggressive_jetsam_lenient == TRUE) { - if (memorystatus_level > memorystatus_level_snapshot && ((memorystatus_level - memorystatus_level_snapshot) >= AGGRESSIVE_JETSAM_LENIENT_MODE_THRESHOLD)) { + /* + * It is also possible that the system is down to a very small number of processes in the candidate + * bands. In that case, the decisions made by the memorystatus_aggressive_jetsam_needed_* routines + * would not be useful. In that case, do not trigger aggressive jetsam. + */ + if (total_candidates < kJetsamMinCandidatesThreshold) { #if DEVELOPMENT || DEBUG - printf("Disabling Lenient mode after one-time deployment.\n"); + printf("memorystatus: aggressive: [FAILED] Low Candidate Count (current: %d, threshold: %d)\n", total_candidates, kJetsamMinCandidatesThreshold); #endif /* DEVELOPMENT || DEBUG */ - memorystatus_aggressive_jetsam_lenient = FALSE; - break; - } - } + aggressive_jetsam_needed = false; + } - continue; - } + if (aggressive_jetsam_needed == false) { + /* Either the aging policy or the candidate count decided that aggressive jetsam is not needed. Nothing more to do here. */ + return FALSE; + } - /* - * Failure - first unwind the state, - * then fall through to restart the search. - */ - proc_list_lock(); - proc_rele_locked(p); - if (next_p) { - proc_rele_locked(next_p); - } - p->p_memstat_state &= ~P_MEMSTAT_TERMINATED; - p->p_memstat_state |= P_MEMSTAT_ERROR; - *errors += 1; - p = NULL; - } + /* Looks like aggressive jetsam is needed */ + jld_eval_aggressive_count++; + + if (jld_eval_aggressive_count == memorystatus_jld_eval_aggressive_count) { + memorystatus_issue_fg_band_notify(); /* - * Failure - restart the search at the beginning of - * the band we were already traversing. - * - * We might have raced with "p" exiting on another core, resulting in no - * ref on "p". Or, we may have failed to kill "p". - * - * Either way, we fall thru to here, leaving the proc in the - * P_MEMSTAT_TERMINATED or P_MEMSTAT_ERROR state. - * - * And, we hold the the proc_list_lock at this point. + * If we reach this aggressive cycle, corpses might be causing memory pressure. + * So, in an effort to avoid jetsams in the FG band, we will attempt to purge + * corpse memory prior to this final march through JETSAM_PRIORITY_UI_SUPPORT. */ - - next_p = memorystatus_get_first_proc_locked(&i, TRUE); + if (total_corpses_count() > 0 && !*corpse_list_purged) { + task_purge_all_corpses(); + *corpse_list_purged = TRUE; + } + } else if (jld_eval_aggressive_count > memorystatus_jld_eval_aggressive_count) { + /* + * Bump up the jetsam priority limit (eg: the bucket index) + * Enforce bucket index sanity. + */ + if ((memorystatus_jld_eval_aggressive_priority_band_max < 0) || + (memorystatus_jld_eval_aggressive_priority_band_max >= MEMSTAT_BUCKET_COUNT)) { + /* + * Do nothing. Stick with the default level. + */ + } else { + jld_priority_band_max = memorystatus_jld_eval_aggressive_priority_band_max; + } } - proc_list_unlock(); + /* Visit elevated processes first */ + while (elevated_bucket_count) { + elevated_bucket_count--; -exit: - os_reason_free(jetsam_reason); + /* + * memorystatus_kill_elevated_process() drops a reference, + * so take another one so we can continue to use this exit reason + * even after it returns. + */ - /* Clear snapshot if freshly captured and no target was found */ - if (new_snapshot && (kill_count == 0)) { - proc_list_lock(); - memorystatus_jetsam_snapshot->entry_count = memorystatus_jetsam_snapshot_count = 0; - proc_list_unlock(); + os_reason_ref(jetsam_reason); + killed = memorystatus_kill_elevated_process( + cause, + jetsam_reason, + JETSAM_PRIORITY_ELEVATED_INACTIVE, + jld_eval_aggressive_count, + &errors, &footprint_of_killed_proc); + if (killed) { + *post_snapshot = TRUE; + *memory_reclaimed += footprint_of_killed_proc; + if (memorystatus_avail_pages_below_pressure()) { + /* + * Still under pressure. + * Find another pinned processes. + */ + continue; + } else { + return TRUE; + } + } else { + /* + * No pinned processes left to kill. + * Abandon elevated band. + */ + break; + } } - KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_JETSAM) | DBG_FUNC_END, - memorystatus_available_pages, killed ? aPid : 0, kill_count, 0, 0); + /* + * memorystatus_kill_processes_aggressive() allocates its own + * jetsam_reason so the kMemorystatusKilledProcThrashing cause + * is consistent throughout the aggressive march. + */ + killed = memorystatus_kill_processes_aggressive( + kMemorystatusKilledProcThrashing, + jld_eval_aggressive_count, + jld_priority_band_max, + &errors, &footprint_of_killed_proc); - if (kill_count > 0) { + if (killed) { + /* Always generate logs after aggressive kill */ + *post_snapshot = TRUE; + *memory_reclaimed += footprint_of_killed_proc; + *jld_idle_kills = 0; return TRUE; - } else { - return FALSE; } + + return FALSE; } -static boolean_t -memorystatus_kill_hiwat_proc(uint32_t *errors, boolean_t *purged) + +static void +memorystatus_thread(void *param __unused, wait_result_t wr __unused) { - pid_t aPid = 0; - proc_t p = PROC_NULL, next_p = PROC_NULL; - boolean_t new_snapshot = FALSE, killed = FALSE, freed_mem = FALSE; - unsigned int i = 0; - uint32_t aPid_ep; - os_reason_t jetsam_reason = OS_REASON_NULL; - KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_JETSAM_HIWAT) | DBG_FUNC_START, - memorystatus_available_pages, 0, 0, 0, 0); + boolean_t post_snapshot = FALSE; + uint32_t errors = 0; + uint32_t hwm_kill = 0; + boolean_t sort_flag = TRUE; + boolean_t corpse_list_purged = FALSE; + int jld_idle_kills = 0; + struct jetsam_thread_state *jetsam_thread = jetsam_current_thread(); + uint64_t total_memory_reclaimed = 0; - jetsam_reason = os_reason_create(OS_REASON_JETSAM, JETSAM_REASON_MEMORY_HIGHWATER); - if (jetsam_reason == OS_REASON_NULL) { - printf("memorystatus_kill_hiwat_proc: failed to allocate exit reason\n"); + assert(jetsam_thread != NULL); + if (jetsam_thread->inited == FALSE) { + /* + * It's the first time the thread has run, so just mark the thread as privileged and block. + * This avoids a spurious pass with unset variables, as set out in . + */ + + char name[32]; + thread_wire(host_priv_self(), current_thread(), TRUE); + snprintf(name, 32, "VM_memorystatus_%d", jetsam_thread->index + 1); + + /* Limit all but one thread to the lower jetsam bands, as that's where most of the victims are. */ + if (jetsam_thread->index == 0) { + if (vm_pageout_state.vm_restricted_to_single_processor == TRUE) { + thread_vm_bind_group_add(); + } + jetsam_thread->limit_to_low_bands = FALSE; + } else { + jetsam_thread->limit_to_low_bands = TRUE; + } + thread_set_thread_name(current_thread(), name); + jetsam_thread->inited = TRUE; + memorystatus_thread_block(0, memorystatus_thread); } - proc_list_lock(); + KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_SCAN) | DBG_FUNC_START, + memorystatus_available_pages, memorystatus_jld_enabled, memorystatus_jld_eval_period_msecs, memorystatus_jld_eval_aggressive_count, 0); - next_p = memorystatus_get_first_proc_locked(&i, TRUE); - while (next_p) { - uint64_t footprint_in_bytes = 0; - uint64_t memlimit_in_bytes = 0; - boolean_t skip = 0; + /* + * Jetsam aware version. + * + * The VM pressure notification thread is working it's way through clients in parallel. + * + * So, while the pressure notification thread is targeting processes in order of + * increasing jetsam priority, we can hopefully reduce / stop it's work by killing + * any processes that have exceeded their highwater mark. + * + * If we run out of HWM processes and our available pages drops below the critical threshold, then, + * we target the least recently used process in order of increasing jetsam priority (exception: the FG band). + */ + while (memorystatus_action_needed()) { + boolean_t killed; + int32_t priority; + uint32_t cause; + uint64_t memory_reclaimed = 0; + uint64_t jetsam_reason_code = JETSAM_REASON_INVALID; + os_reason_t jetsam_reason = OS_REASON_NULL; - p = next_p; - next_p = memorystatus_get_next_proc_locked(&i, p, TRUE); + cause = kill_under_pressure_cause; + switch (cause) { + case kMemorystatusKilledFCThrashing: + jetsam_reason_code = JETSAM_REASON_MEMORY_FCTHRASHING; + break; + case kMemorystatusKilledVMCompressorThrashing: + jetsam_reason_code = JETSAM_REASON_MEMORY_VMCOMPRESSOR_THRASHING; + break; + case kMemorystatusKilledVMCompressorSpaceShortage: + jetsam_reason_code = JETSAM_REASON_MEMORY_VMCOMPRESSOR_SPACE_SHORTAGE; + break; + case kMemorystatusKilledZoneMapExhaustion: + jetsam_reason_code = JETSAM_REASON_ZONE_MAP_EXHAUSTION; + break; + case kMemorystatusKilledVMPageShortage: + /* falls through */ + default: + jetsam_reason_code = JETSAM_REASON_MEMORY_VMPAGESHORTAGE; + cause = kMemorystatusKilledVMPageShortage; + break; + } - aPid = p->p_pid; - aPid_ep = p->p_memstat_effectivepriority; + /* Highwater */ + boolean_t is_critical = TRUE; + if (memorystatus_act_on_hiwat_processes(&errors, &hwm_kill, &post_snapshot, &is_critical, &memory_reclaimed)) { + total_memory_reclaimed += memory_reclaimed; + if (is_critical == FALSE) { + /* + * For now, don't kill any other processes. + */ + break; + } else { + goto done; + } + } - if (p->p_memstat_state & (P_MEMSTAT_ERROR | P_MEMSTAT_TERMINATED)) { - continue; + jetsam_reason = os_reason_create(OS_REASON_JETSAM, jetsam_reason_code); + if (jetsam_reason == OS_REASON_NULL) { + printf("memorystatus_thread: failed to allocate jetsam reason\n"); } - /* skip if no limit set */ - if (p->p_memstat_memlimit <= 0) { - continue; + /* Only unlimited jetsam threads should act aggressive */ + if (!jetsam_thread->limit_to_low_bands && + memorystatus_act_aggressive(cause, jetsam_reason, &jld_idle_kills, &corpse_list_purged, &post_snapshot, &memory_reclaimed)) { + total_memory_reclaimed += memory_reclaimed; + goto done; } - footprint_in_bytes = get_task_phys_footprint(p->task); - memlimit_in_bytes = (((uint64_t)p->p_memstat_memlimit) * 1024ULL * 1024ULL); /* convert MB to bytes */ - skip = (footprint_in_bytes <= memlimit_in_bytes); + /* + * memorystatus_kill_top_process() drops a reference, + * so take another one so we can continue to use this exit reason + * even after it returns + */ + os_reason_ref(jetsam_reason); -#if CONFIG_JETSAM && (DEVELOPMENT || DEBUG) - if (!skip && (memorystatus_jetsam_policy & kPolicyDiagnoseActive)) { - if (p->p_memstat_state & P_MEMSTAT_DIAG_SUSPENDED) { - continue; + /* LRU */ + killed = memorystatus_kill_top_process(TRUE, sort_flag, cause, jetsam_reason, &priority, &errors, &memory_reclaimed); + sort_flag = FALSE; + + if (killed) { + total_memory_reclaimed += memory_reclaimed; + if (memorystatus_post_snapshot(priority, cause) == TRUE) { + post_snapshot = TRUE; + } + + /* Jetsam Loop Detection */ + if (memorystatus_jld_enabled == TRUE) { + if ((priority == JETSAM_PRIORITY_IDLE) || (priority == system_procs_aging_band) || (priority == applications_aging_band)) { + jld_idle_kills++; + } else { + /* + * We've reached into bands beyond idle deferred. + * We make no attempt to monitor them + */ + } } - } -#endif /* CONFIG_JETSAM && (DEVELOPMENT || DEBUG) */ -#if CONFIG_FREEZE - if (!skip) { - if (p->p_memstat_state & P_MEMSTAT_LOCKED) { - skip = TRUE; - } else { - skip = FALSE; + /* + * If we have jetsammed a process in or above JETSAM_PRIORITY_UI_SUPPORT + * then we attempt to relieve pressure by purging corpse memory and notifying + * anybody wanting to know this. + */ + if (priority >= JETSAM_PRIORITY_UI_SUPPORT) { + memorystatus_issue_fg_band_notify(); + if (total_corpses_count() > 0 && !corpse_list_purged) { + task_purge_all_corpses(); + corpse_list_purged = TRUE; + } } + goto done; } -#endif - if (skip) { - continue; - } else { - if (memorystatus_jetsam_snapshot_count == 0) { - memorystatus_init_jetsam_snapshot_locked(NULL, 0); - new_snapshot = TRUE; + if (memorystatus_avail_pages_below_critical()) { + /* + * Still under pressure and unable to kill a process - purge corpse memory + */ + if (total_corpses_count() > 0) { + task_purge_all_corpses(); + corpse_list_purged = TRUE; } - if (proc_ref_locked(p) == p) { - /* - * Mark as terminated so that if exit1() indicates success, but the process (for example) - * is blocked in task_exception_notify(), it'll be skipped if encountered again - see - * . This is cheaper than examining P_LEXIT, which requires the - * acquisition of the proc lock. - */ - p->p_memstat_state |= P_MEMSTAT_TERMINATED; - - proc_list_unlock(); - } else { + if (!jetsam_thread->limit_to_low_bands && memorystatus_avail_pages_below_critical()) { /* - * We need to restart the search again because - * proc_ref_locked _can_ drop the proc_list lock - * and we could have lost our stored next_p via - * an exit() on another core. + * Still under pressure and unable to kill a process - panic */ - i = 0; - next_p = memorystatus_get_first_proc_locked(&i, TRUE); - continue; + panic("memorystatus_jetsam_thread: no victim! available pages:%llu\n", (uint64_t)memorystatus_available_pages); } + } - freed_mem = memorystatus_kill_proc(p, kMemorystatusKilledHiwat, jetsam_reason, &killed); /* purged and/or killed 'p' */ - - /* Success? */ - if (freed_mem) { - if (killed == FALSE) { - /* purged 'p'..don't reset HWM candidate count */ - *purged = TRUE; - - proc_list_lock(); - p->p_memstat_state &= ~P_MEMSTAT_TERMINATED; - proc_list_unlock(); - } - proc_rele(p); - goto exit; - } - /* - * Failure - first unwind the state, - * then fall through to restart the search. - */ - proc_list_lock(); - proc_rele_locked(p); - p->p_memstat_state &= ~P_MEMSTAT_TERMINATED; - p->p_memstat_state |= P_MEMSTAT_ERROR; - *errors += 1; +done: - i = 0; - next_p = memorystatus_get_first_proc_locked(&i, TRUE); + /* + * We do not want to over-kill when thrashing has been detected. + * To avoid that, we reset the flag here and notify the + * compressor. + */ + if (is_reason_thrashing(kill_under_pressure_cause)) { + kill_under_pressure_cause = 0; +#if CONFIG_JETSAM + vm_thrashing_jetsam_done(); +#endif /* CONFIG_JETSAM */ + } else if (is_reason_zone_map_exhaustion(kill_under_pressure_cause)) { + kill_under_pressure_cause = 0; } + + os_reason_free(jetsam_reason); } - proc_list_unlock(); + kill_under_pressure_cause = 0; -exit: - os_reason_free(jetsam_reason); + if (errors) { + memorystatus_clear_errors(); + } - /* Clear snapshot if freshly captured and no target was found */ - if (new_snapshot && !killed) { + if (post_snapshot) { proc_list_lock(); - memorystatus_jetsam_snapshot->entry_count = memorystatus_jetsam_snapshot_count = 0; - proc_list_unlock(); + size_t snapshot_size = sizeof(memorystatus_jetsam_snapshot_t) + + sizeof(memorystatus_jetsam_snapshot_entry_t) * (memorystatus_jetsam_snapshot_count); + uint64_t timestamp_now = mach_absolute_time(); + memorystatus_jetsam_snapshot->notification_time = timestamp_now; + memorystatus_jetsam_snapshot->js_gencount++; + if (memorystatus_jetsam_snapshot_count > 0 && (memorystatus_jetsam_snapshot_last_timestamp == 0 || + timestamp_now > memorystatus_jetsam_snapshot_last_timestamp + memorystatus_jetsam_snapshot_timeout)) { + proc_list_unlock(); + int ret = memorystatus_send_note(kMemorystatusSnapshotNote, &snapshot_size, sizeof(snapshot_size)); + if (!ret) { + proc_list_lock(); + memorystatus_jetsam_snapshot_last_timestamp = timestamp_now; + proc_list_unlock(); + } + } else { + proc_list_unlock(); + } } - KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_JETSAM_HIWAT) | DBG_FUNC_END, - memorystatus_available_pages, killed ? aPid : 0, 0, 0, 0); + KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_SCAN) | DBG_FUNC_END, + memorystatus_available_pages, total_memory_reclaimed, 0, 0, 0); - return killed; + memorystatus_thread_block(0, memorystatus_thread); } /* - * Jetsam a process pinned in the elevated band. - * - * Return: true -- at least one pinned process was jetsammed - * false -- no pinned process was jetsammed + * Returns TRUE: + * when an idle-exitable proc was killed + * Returns FALSE: + * when there are no more idle-exitable procs found + * when the attempt to kill an idle-exitable proc failed */ -static boolean_t -memorystatus_kill_elevated_process(uint32_t cause, os_reason_t jetsam_reason, unsigned int band, int aggr_count, uint32_t *errors) +boolean_t +memorystatus_idle_exit_from_VM(void) { - pid_t aPid = 0; - proc_t p = PROC_NULL, next_p = PROC_NULL; - boolean_t new_snapshot = FALSE, killed = FALSE; - int kill_count = 0; - uint32_t aPid_ep; - uint64_t killtime = 0; - clock_sec_t tv_sec; - clock_usec_t tv_usec; - uint32_t tv_msec; - - - KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_JETSAM) | DBG_FUNC_START, - memorystatus_available_pages, 0, 0, 0, 0); - -#if CONFIG_FREEZE - boolean_t consider_frozen_only = FALSE; - - if (band == (unsigned int) memorystatus_freeze_jetsam_band) { - consider_frozen_only = TRUE; - } -#endif /* CONFIG_FREEZE */ + /* + * This routine should no longer be needed since we are + * now using jetsam bands on all platforms and so will deal + * with IDLE processes within the memorystatus thread itself. + * + * But we still use it because we observed that macos systems + * started heavy compression/swapping with a bunch of + * idle-exitable processes alive and doing nothing. We decided + * to rather kill those processes than start swapping earlier. + */ - proc_list_lock(); + return kill_idle_exit_proc(); +} - next_p = memorystatus_get_first_proc_locked(&band, FALSE); - while (next_p) { - p = next_p; - next_p = memorystatus_get_next_proc_locked(&band, p, FALSE); +/* + * Callback invoked when allowable physical memory footprint exceeded + * (dirty pages + IOKit mappings) + * + * This is invoked for both advisory, non-fatal per-task high watermarks, + * as well as the fatal task memory limits. + */ +void +memorystatus_on_ledger_footprint_exceeded(boolean_t warning, boolean_t memlimit_is_active, boolean_t memlimit_is_fatal) +{ + os_reason_t jetsam_reason = OS_REASON_NULL; - aPid = p->p_pid; - aPid_ep = p->p_memstat_effectivepriority; + proc_t p = current_proc(); +#if VM_PRESSURE_EVENTS + if (warning == TRUE) { /* - * Only pick a process pinned in this elevated band - */ - if (!(p->p_memstat_state & P_MEMSTAT_USE_ELEVATED_INACTIVE_BAND)) { - continue; - } - - if (p->p_memstat_state & (P_MEMSTAT_ERROR | P_MEMSTAT_TERMINATED)) { - continue; - } - -#if CONFIG_FREEZE - if (consider_frozen_only && !(p->p_memstat_state & P_MEMSTAT_FROZEN)) { - continue; - } - - if (p->p_memstat_state & P_MEMSTAT_LOCKED) { - continue; - } -#endif /* CONFIG_FREEZE */ - -#if DEVELOPMENT || DEBUG - MEMORYSTATUS_DEBUG(1, "jetsam: elevated%d process pid %d [%s] - memorystatus_available_pages: %d\n", - aggr_count, - aPid, (*p->p_name ? p->p_name : "unknown"), - memorystatus_available_pages); -#endif /* DEVELOPMENT || DEBUG */ - - if (memorystatus_jetsam_snapshot_count == 0) { - memorystatus_init_jetsam_snapshot_locked(NULL, 0); - new_snapshot = TRUE; - } - - p->p_memstat_state |= P_MEMSTAT_TERMINATED; - - killtime = mach_absolute_time(); - absolutetime_to_microtime(killtime, &tv_sec, &tv_usec); - tv_msec = tv_usec / 1000; - - memorystatus_update_jetsam_snapshot_entry_locked(p, cause, killtime); - - if (proc_ref_locked(p) == p) { - proc_list_unlock(); - - os_log_with_startup_serial(OS_LOG_DEFAULT, "%lu.%03d memorystatus: killing_top_process_elevated%d pid %d [%s] (%s %d) - memorystatus_available_pages: %llu\n", - (unsigned long)tv_sec, tv_msec, - aggr_count, - aPid, (*p->p_name ? p->p_name : "unknown"), - memorystatus_kill_cause_name[cause], aPid_ep, (uint64_t)memorystatus_available_pages); - - /* - * memorystatus_do_kill drops a reference, so take another one so we can - * continue to use this exit reason even after memorystatus_do_kill() - * returns - */ - os_reason_ref(jetsam_reason); - killed = memorystatus_do_kill(p, cause, jetsam_reason); - - /* Success? */ - if (killed) { - proc_rele(p); - kill_count++; - goto exit; - } - - /* - * Failure - first unwind the state, - * then fall through to restart the search. - */ - proc_list_lock(); - proc_rele_locked(p); - p->p_memstat_state &= ~P_MEMSTAT_TERMINATED; - p->p_memstat_state |= P_MEMSTAT_ERROR; - *errors += 1; + * This is a warning path which implies that the current process is close, but has + * not yet exceeded its per-process memory limit. + */ + if (memorystatus_warn_process(p->p_pid, memlimit_is_active, memlimit_is_fatal, FALSE /* not exceeded */) != TRUE) { + /* Print warning, since it's possible that task has not registered for pressure notifications */ + os_log(OS_LOG_DEFAULT, "memorystatus_on_ledger_footprint_exceeded: failed to warn the current task (%d exiting, or no handler registered?).\n", p->p_pid); } + return; + } +#endif /* VM_PRESSURE_EVENTS */ + if (memlimit_is_fatal) { /* - * Failure - restart the search. - * - * We might have raced with "p" exiting on another core, resulting in no - * ref on "p". Or, we may have failed to kill "p". - * - * Either way, we fall thru to here, leaving the proc in the - * P_MEMSTAT_TERMINATED state or P_MEMSTAT_ERROR state. - * - * And, we hold the the proc_list_lock at this point. + * If this process has no high watermark or has a fatal task limit, then we have been invoked because the task + * has violated either the system-wide per-task memory limit OR its own task limit. */ + jetsam_reason = os_reason_create(OS_REASON_JETSAM, JETSAM_REASON_MEMORY_PERPROCESSLIMIT); + if (jetsam_reason == NULL) { + printf("task_exceeded footprint: failed to allocate jetsam reason\n"); + } else if (corpse_for_fatal_memkill != 0 && proc_send_synchronous_EXC_RESOURCE(p) == FALSE) { + /* Set OS_REASON_FLAG_GENERATE_CRASH_REPORT to generate corpse */ + jetsam_reason->osr_flags |= OS_REASON_FLAG_GENERATE_CRASH_REPORT; + } - next_p = memorystatus_get_first_proc_locked(&band, FALSE); - } - - proc_list_unlock(); + if (memorystatus_kill_process_sync(p->p_pid, kMemorystatusKilledPerProcessLimit, jetsam_reason) != TRUE) { + printf("task_exceeded_footprint: failed to kill the current task (exiting?).\n"); + } + } else { + /* + * HWM offender exists. Done without locks or synchronization. + * See comment near its declaration for more details. + */ + memorystatus_hwm_candidates = TRUE; -exit: - os_reason_free(jetsam_reason); +#if VM_PRESSURE_EVENTS + /* + * The current process is not in the warning path. + * This path implies the current process has exceeded a non-fatal (soft) memory limit. + * Failure to send note is ignored here. + */ + (void)memorystatus_warn_process(p->p_pid, memlimit_is_active, memlimit_is_fatal, TRUE /* exceeded */); - /* Clear snapshot if freshly captured and no target was found */ - if (new_snapshot && (kill_count == 0)) { - proc_list_lock(); - memorystatus_jetsam_snapshot->entry_count = memorystatus_jetsam_snapshot_count = 0; - proc_list_unlock(); +#endif /* VM_PRESSURE_EVENTS */ } - - KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_JETSAM) | DBG_FUNC_END, - memorystatus_available_pages, killed ? aPid : 0, kill_count, 0, 0); - - return killed; } -static boolean_t -memorystatus_kill_process_async(pid_t victim_pid, uint32_t cause) +void +memorystatus_log_exception(const int max_footprint_mb, boolean_t memlimit_is_active, boolean_t memlimit_is_fatal) { + proc_t p = current_proc(); + /* - * TODO: allow a general async path - * - * NOTE: If a new async kill cause is added, make sure to update memorystatus_thread() to - * add the appropriate exit reason code mapping. + * The limit violation is logged here, but only once per process per limit. + * Soft memory limit is a non-fatal high-water-mark + * Hard memory limit is a fatal custom-task-limit or system-wide per-task memory limit. */ - if ((victim_pid != -1) || - (cause != kMemorystatusKilledVMPageShortage && - cause != kMemorystatusKilledVMCompressorThrashing && - cause != kMemorystatusKilledVMCompressorSpaceShortage && - cause != kMemorystatusKilledFCThrashing && - cause != kMemorystatusKilledZoneMapExhaustion)) { - return FALSE; - } - kill_under_pressure_cause = cause; - memorystatus_thread_wake(); - return TRUE; + os_log_with_startup_serial(OS_LOG_DEFAULT, "EXC_RESOURCE -> %s[%d] exceeded mem limit: %s%s %d MB (%s)\n", + ((p && *p->p_name) ? p->p_name : "unknown"), (p ? p->p_pid : -1), (memlimit_is_active ? "Active" : "Inactive"), + (memlimit_is_fatal ? "Hard" : "Soft"), max_footprint_mb, + (memlimit_is_fatal ? "fatal" : "non-fatal")); + + return; } -boolean_t -memorystatus_kill_on_VM_compressor_space_shortage(boolean_t async) -{ - if (async) { - return memorystatus_kill_process_async(-1, kMemorystatusKilledVMCompressorSpaceShortage); - } else { - os_reason_t jetsam_reason = os_reason_create(OS_REASON_JETSAM, JETSAM_REASON_MEMORY_VMCOMPRESSOR_SPACE_SHORTAGE); - if (jetsam_reason == OS_REASON_NULL) { - printf("memorystatus_kill_on_VM_compressor_space_shortage -- sync: failed to allocate jetsam reason\n"); - } - return memorystatus_kill_process_sync(-1, kMemorystatusKilledVMCompressorSpaceShortage, jetsam_reason); - } -} +/* + * Description: + * Evaluates process state to determine which limit + * should be applied (active vs. inactive limit). + * + * Processes that have the 'elevated inactive jetsam band' attribute + * are first evaluated based on their current priority band. + * presently elevated ==> active + * + * Processes that opt into dirty tracking are evaluated + * based on clean vs dirty state. + * dirty ==> active + * clean ==> inactive + * + * Process that do not opt into dirty tracking are + * evalulated based on priority level. + * Foreground or above ==> active + * Below Foreground ==> inactive + * + * Return: TRUE if active + * False if inactive + */ -#if CONFIG_JETSAM -boolean_t -memorystatus_kill_on_VM_compressor_thrashing(boolean_t async) +static boolean_t +proc_jetsam_state_is_active_locked(proc_t p) { - if (async) { - return memorystatus_kill_process_async(-1, kMemorystatusKilledVMCompressorThrashing); - } else { - os_reason_t jetsam_reason = os_reason_create(OS_REASON_JETSAM, JETSAM_REASON_MEMORY_VMCOMPRESSOR_THRASHING); - if (jetsam_reason == OS_REASON_NULL) { - printf("memorystatus_kill_on_VM_compressor_thrashing -- sync: failed to allocate jetsam reason\n"); + if ((p->p_memstat_state & P_MEMSTAT_USE_ELEVATED_INACTIVE_BAND) && + (p->p_memstat_effectivepriority == JETSAM_PRIORITY_ELEVATED_INACTIVE)) { + /* + * process has the 'elevated inactive jetsam band' attribute + * and process is present in the elevated band + * implies active state + */ + return TRUE; + } else if (p->p_memstat_dirty & P_DIRTY_TRACK) { + /* + * process has opted into dirty tracking + * active state is based on dirty vs. clean + */ + if (p->p_memstat_dirty & P_DIRTY_IS_DIRTY) { + /* + * process is dirty + * implies active state + */ + return TRUE; + } else { + /* + * process is clean + * implies inactive state + */ + return FALSE; } - - return memorystatus_kill_process_sync(-1, kMemorystatusKilledVMCompressorThrashing, jetsam_reason); + } else if (p->p_memstat_effectivepriority >= JETSAM_PRIORITY_FOREGROUND) { + /* + * process is Foreground or higher + * implies active state + */ + return TRUE; + } else { + /* + * process found below Foreground + * implies inactive state + */ + return FALSE; } } -boolean_t -memorystatus_kill_on_VM_page_shortage(boolean_t async) +static boolean_t +memorystatus_kill_process_sync(pid_t victim_pid, uint32_t cause, os_reason_t jetsam_reason) { - if (async) { - return memorystatus_kill_process_async(-1, kMemorystatusKilledVMPageShortage); - } else { - os_reason_t jetsam_reason = os_reason_create(OS_REASON_JETSAM, JETSAM_REASON_MEMORY_VMPAGESHORTAGE); - if (jetsam_reason == OS_REASON_NULL) { - printf("memorystatus_kill_on_VM_page_shortage -- sync: failed to allocate jetsam reason\n"); - } + boolean_t res; - return memorystatus_kill_process_sync(-1, kMemorystatusKilledVMPageShortage, jetsam_reason); - } -} + uint32_t errors = 0; + uint64_t memory_reclaimed = 0; -boolean_t -memorystatus_kill_on_FC_thrashing(boolean_t async) -{ - if (async) { - return memorystatus_kill_process_async(-1, kMemorystatusKilledFCThrashing); + if (victim_pid == -1) { + /* No pid, so kill first process */ + res = memorystatus_kill_top_process(TRUE, TRUE, cause, jetsam_reason, NULL, &errors, &memory_reclaimed); } else { - os_reason_t jetsam_reason = os_reason_create(OS_REASON_JETSAM, JETSAM_REASON_MEMORY_FCTHRASHING); - if (jetsam_reason == OS_REASON_NULL) { - printf("memorystatus_kill_on_FC_thrashing -- sync: failed to allocate jetsam reason\n"); - } + res = memorystatus_kill_specific_process(victim_pid, cause, jetsam_reason); + } - return memorystatus_kill_process_sync(-1, kMemorystatusKilledFCThrashing, jetsam_reason); + if (errors) { + memorystatus_clear_errors(); } -} -boolean_t -memorystatus_kill_on_vnode_limit(void) -{ - os_reason_t jetsam_reason = os_reason_create(OS_REASON_JETSAM, JETSAM_REASON_VNODE); - if (jetsam_reason == OS_REASON_NULL) { - printf("memorystatus_kill_on_vnode_limit: failed to allocate jetsam reason\n"); + if (res == TRUE) { + /* Fire off snapshot notification */ + proc_list_lock(); + size_t snapshot_size = sizeof(memorystatus_jetsam_snapshot_t) + + sizeof(memorystatus_jetsam_snapshot_entry_t) * memorystatus_jetsam_snapshot_count; + uint64_t timestamp_now = mach_absolute_time(); + memorystatus_jetsam_snapshot->notification_time = timestamp_now; + if (memorystatus_jetsam_snapshot_count > 0 && (memorystatus_jetsam_snapshot_last_timestamp == 0 || + timestamp_now > memorystatus_jetsam_snapshot_last_timestamp + memorystatus_jetsam_snapshot_timeout)) { + proc_list_unlock(); + int ret = memorystatus_send_note(kMemorystatusSnapshotNote, &snapshot_size, sizeof(snapshot_size)); + if (!ret) { + proc_list_lock(); + memorystatus_jetsam_snapshot_last_timestamp = timestamp_now; + proc_list_unlock(); + } + } else { + proc_list_unlock(); + } } - return memorystatus_kill_process_sync(-1, kMemorystatusKilledVnodes, jetsam_reason); + return res; } -#endif /* CONFIG_JETSAM */ - -boolean_t -memorystatus_kill_on_zone_map_exhaustion(pid_t pid) +/* + * Jetsam a specific process. + */ +static boolean_t +memorystatus_kill_specific_process(pid_t victim_pid, uint32_t cause, os_reason_t jetsam_reason) { - boolean_t res = FALSE; - if (pid == -1) { - res = memorystatus_kill_process_async(-1, kMemorystatusKilledZoneMapExhaustion); - } else { - os_reason_t jetsam_reason = os_reason_create(OS_REASON_JETSAM, JETSAM_REASON_ZONE_MAP_EXHAUSTION); - if (jetsam_reason == OS_REASON_NULL) { - printf("memorystatus_kill_on_zone_map_exhaustion: failed to allocate jetsam reason\n"); - } + boolean_t killed; + proc_t p; + uint64_t killtime = 0; + uint64_t footprint_of_killed_proc; + clock_sec_t tv_sec; + clock_usec_t tv_usec; + uint32_t tv_msec; + + /* TODO - add a victim queue and push this into the main jetsam thread */ - res = memorystatus_kill_process_sync(pid, kMemorystatusKilledZoneMapExhaustion, jetsam_reason); + p = proc_find(victim_pid); + if (!p) { + os_reason_free(jetsam_reason); + return FALSE; } - return res; -} -#if CONFIG_FREEZE + proc_list_lock(); -__private_extern__ void -memorystatus_freeze_init(void) -{ - kern_return_t result; - thread_t thread; + if (memorystatus_jetsam_snapshot_count == 0) { + memorystatus_init_jetsam_snapshot_locked(NULL, 0); + } - freezer_lck_grp_attr = lck_grp_attr_alloc_init(); - freezer_lck_grp = lck_grp_alloc_init("freezer", freezer_lck_grp_attr); + killtime = mach_absolute_time(); + absolutetime_to_microtime(killtime, &tv_sec, &tv_usec); + tv_msec = tv_usec / 1000; - lck_mtx_init(&freezer_mutex, freezer_lck_grp, NULL); + memorystatus_update_jetsam_snapshot_entry_locked(p, cause, killtime); - /* - * This is just the default value if the underlying - * storage device doesn't have any specific budget. - * We check with the storage layer in memorystatus_freeze_update_throttle() - * before we start our freezing the first time. - */ - memorystatus_freeze_budget_pages_remaining = (memorystatus_freeze_daily_mb_max * 1024 * 1024) / PAGE_SIZE; + proc_list_unlock(); - result = kernel_thread_start(memorystatus_freeze_thread, NULL, &thread); - if (result == KERN_SUCCESS) { - proc_set_thread_policy(thread, TASK_POLICY_INTERNAL, TASK_POLICY_IO, THROTTLE_LEVEL_COMPRESSOR_TIER2); - proc_set_thread_policy(thread, TASK_POLICY_INTERNAL, TASK_POLICY_PASSIVE_IO, TASK_POLICY_ENABLE); - thread_set_thread_name(thread, "VM_freezer"); + killed = memorystatus_do_kill(p, cause, jetsam_reason, &footprint_of_killed_proc); - thread_deallocate(thread); - } else { - panic("Could not create memorystatus_freeze_thread"); - } + os_log_with_startup_serial(OS_LOG_DEFAULT, "%lu.%03d memorystatus: killing_specific_process pid %d [%s] (%s %d) %lluKB - memorystatus_available_pages: %llu\n", + (unsigned long)tv_sec, tv_msec, victim_pid, ((p && *p->p_name) ? p->p_name : "unknown"), + memorystatus_kill_cause_name[cause], (p ? p->p_memstat_effectivepriority: -1), + footprint_of_killed_proc >> 10, (uint64_t)memorystatus_available_pages); + + proc_rele(p); + + return killed; } -static boolean_t -memorystatus_is_process_eligible_for_freeze(proc_t p) + +/* + * Toggle the P_MEMSTAT_TERMINATED state. + * Takes the proc_list_lock. + */ +void +proc_memstat_terminated(proc_t p, boolean_t set) { +#if DEVELOPMENT || DEBUG + if (p) { + proc_list_lock(); + if (set == TRUE) { + p->p_memstat_state |= P_MEMSTAT_TERMINATED; + } else { + p->p_memstat_state &= ~P_MEMSTAT_TERMINATED; + } + proc_list_unlock(); + } +#else +#pragma unused(p, set) /* - * Called with proc_list_lock held. + * do nothing */ +#endif /* DEVELOPMENT || DEBUG */ + return; +} - LCK_MTX_ASSERT(proc_list_mlock, LCK_MTX_ASSERT_OWNED); - boolean_t should_freeze = FALSE; - uint32_t state = 0, entry_count = 0, pages = 0, i = 0; - int probability_of_use = 0; +#if CONFIG_JETSAM +/* + * This is invoked when cpulimits have been exceeded while in fatal mode. + * The jetsam_flags do not apply as those are for memory related kills. + * We call this routine so that the offending process is killed with + * a non-zero exit status. + */ +void +jetsam_on_ledger_cpulimit_exceeded(void) +{ + int retval = 0; + int jetsam_flags = 0; /* make it obvious */ + proc_t p = current_proc(); + os_reason_t jetsam_reason = OS_REASON_NULL; + + printf("task_exceeded_cpulimit: killing pid %d [%s]\n", + p->p_pid, (*p->p_name ? p->p_name : "(unknown)")); - if (isApp(p) == FALSE) { - goto out; + jetsam_reason = os_reason_create(OS_REASON_JETSAM, JETSAM_REASON_CPULIMIT); + if (jetsam_reason == OS_REASON_NULL) { + printf("task_exceeded_cpulimit: unable to allocate memory for jetsam reason\n"); } - state = p->p_memstat_state; + retval = jetsam_do_kill(p, jetsam_flags, jetsam_reason); - if ((state & (P_MEMSTAT_TERMINATED | P_MEMSTAT_LOCKED | P_MEMSTAT_FREEZE_DISABLED | P_MEMSTAT_FREEZE_IGNORE)) || - !(state & P_MEMSTAT_SUSPENDED)) { - goto out; + if (retval) { + printf("task_exceeded_cpulimit: failed to kill current task (exiting?).\n"); } +} - /* Only freeze processes meeting our minimum resident page criteria */ - memorystatus_get_task_page_counts(p->task, &pages, NULL, NULL); - if (pages < memorystatus_freeze_pages_min) { - goto out; - } +#endif /* CONFIG_JETSAM */ - entry_count = (memorystatus_global_probabilities_size / sizeof(memorystatus_internal_probabilities_t)); +static void +memorystatus_get_task_memory_region_count(task_t task, uint64_t *count) +{ + assert(task); + assert(count); - if (entry_count) { - for (i = 0; i < entry_count; i++) { - if (strncmp(memorystatus_global_probabilities_table[i].proc_name, - p->p_name, - MAXCOMLEN + 1) == 0) { - probability_of_use = memorystatus_global_probabilities_table[i].use_probability; - break; - } - } + *count = get_task_memory_region_count(task); +} - if (probability_of_use == 0) { - goto out; - } - } - should_freeze = TRUE; -out: - return should_freeze; -} +#define MEMORYSTATUS_VM_MAP_FORK_ALLOWED 0x100000000 +#define MEMORYSTATUS_VM_MAP_FORK_NOT_ALLOWED 0x200000000 + +#if DEVELOPMENT || DEBUG /* - * Synchronously freeze the passed proc. Called with a reference to the proc held. - * - * Doesn't deal with re-freezing because this is called on a specific process and - * not by the freezer thread. If that changes, we'll have to teach it about - * refreezing a frozen process. + * Sysctl only used to test memorystatus_allowed_vm_map_fork() path. + * set a new pidwatch value + * or + * get the current pidwatch value * - * Returns EINVAL or the value returned by task_freeze(). + * The pidwatch_val starts out with a PID to watch for in the map_fork path. + * Its value is: + * - OR'd with MEMORYSTATUS_VM_MAP_FORK_ALLOWED if we allow the map_fork. + * - OR'd with MEMORYSTATUS_VM_MAP_FORK_NOT_ALLOWED if we disallow the map_fork. + * - set to -1ull if the map_fork() is aborted for other reasons. */ -int -memorystatus_freeze_process_sync(proc_t p) -{ - int ret = EINVAL; - pid_t aPid = 0; - boolean_t memorystatus_freeze_swap_low = FALSE; - int freezer_error_code = 0; - lck_mtx_lock(&freezer_mutex); +uint64_t memorystatus_vm_map_fork_pidwatch_val = 0; - if (p == NULL) { - printf("memorystatus_freeze_process_sync: Invalid process\n"); - goto exit; - } +static int sysctl_memorystatus_vm_map_fork_pidwatch SYSCTL_HANDLER_ARGS { +#pragma unused(oidp, arg1, arg2) - if (memorystatus_freeze_enabled == FALSE) { - printf("memorystatus_freeze_process_sync: Freezing is DISABLED\n"); - goto exit; - } + uint64_t new_value = 0; + uint64_t old_value = 0; + int error = 0; - if (!memorystatus_can_freeze(&memorystatus_freeze_swap_low)) { - printf("memorystatus_freeze_process_sync: Low compressor and/or low swap space...skipping freeze\n"); - goto exit; - } + /* + * The pid is held in the low 32 bits. + * The 'allowed' flags are in the upper 32 bits. + */ + old_value = memorystatus_vm_map_fork_pidwatch_val; - memorystatus_freeze_update_throttle(&memorystatus_freeze_budget_pages_remaining); - if (!memorystatus_freeze_budget_pages_remaining) { - printf("memorystatus_freeze_process_sync: exit with NO available budget\n"); - goto exit; + error = sysctl_io_number(req, old_value, sizeof(old_value), &new_value, NULL); + + if (error || !req->newptr) { + /* + * No new value passed in. + */ + return error; } - proc_list_lock(); + /* + * A new pid was passed in via req->newptr. + * Ignore any attempt to set the higher order bits. + */ + memorystatus_vm_map_fork_pidwatch_val = new_value & 0xFFFFFFFF; + printf("memorystatus: pidwatch old_value = 0x%llx, new_value = 0x%llx \n", old_value, new_value); - if (p != NULL) { - uint32_t purgeable, wired, clean, dirty, shared; - uint32_t max_pages, i; + return error; +} - aPid = p->p_pid; +SYSCTL_PROC(_kern, OID_AUTO, memorystatus_vm_map_fork_pidwatch, CTLTYPE_QUAD | CTLFLAG_RW | CTLFLAG_LOCKED | CTLFLAG_MASKED, + 0, 0, sysctl_memorystatus_vm_map_fork_pidwatch, "Q", "get/set pid watched for in vm_map_fork"); - /* Ensure the process is eligible for freezing */ - if (memorystatus_is_process_eligible_for_freeze(p) == FALSE) { - proc_list_unlock(); - goto exit; - } - if (VM_CONFIG_FREEZER_SWAP_IS_ACTIVE) { - max_pages = MIN(memorystatus_freeze_pages_max, memorystatus_freeze_budget_pages_remaining); - } else { - /* - * We only have the compressor without any swap. - */ - max_pages = UINT32_MAX - 1; +/* + * Record if a watched process fails to qualify for a vm_map_fork(). + */ +void +memorystatus_abort_vm_map_fork(task_t task) +{ + if (memorystatus_vm_map_fork_pidwatch_val != 0) { + proc_t p = get_bsdtask_info(task); + if (p != NULL && memorystatus_vm_map_fork_pidwatch_val == (uint64_t)p->p_pid) { + memorystatus_vm_map_fork_pidwatch_val = -1ull; } + } +} - /* Mark as locked temporarily to avoid kill */ - p->p_memstat_state |= P_MEMSTAT_LOCKED; - proc_list_unlock(); - - KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_FREEZE) | DBG_FUNC_START, - memorystatus_available_pages, 0, 0, 0, 0); - - ret = task_freeze(p->task, &purgeable, &wired, &clean, &dirty, max_pages, &shared, &freezer_error_code, FALSE /* eval only */); +static void +set_vm_map_fork_pidwatch(task_t task, uint64_t x) +{ + if (memorystatus_vm_map_fork_pidwatch_val != 0) { + proc_t p = get_bsdtask_info(task); + if (p && (memorystatus_vm_map_fork_pidwatch_val == (uint64_t)p->p_pid)) { + memorystatus_vm_map_fork_pidwatch_val |= x; + } + } +} - KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_FREEZE) | DBG_FUNC_END, - memorystatus_available_pages, aPid, 0, 0, 0); +#else /* DEVELOPMENT || DEBUG */ - DTRACE_MEMORYSTATUS6(memorystatus_freeze, proc_t, p, unsigned int, memorystatus_available_pages, boolean_t, purgeable, unsigned int, wired, uint32_t, clean, uint32_t, dirty); - MEMORYSTATUS_DEBUG(1, "memorystatus_freeze_process_sync: task_freeze %s for pid %d [%s] - " - "memorystatus_pages: %d, purgeable: %d, wired: %d, clean: %d, dirty: %d, max_pages %d, shared %d\n", - (ret == KERN_SUCCESS) ? "SUCCEEDED" : "FAILED", aPid, (*p->p_name ? p->p_name : "(unknown)"), - memorystatus_available_pages, purgeable, wired, clean, dirty, max_pages, shared); +static void +set_vm_map_fork_pidwatch(task_t task, uint64_t x) +{ +#pragma unused(task) +#pragma unused(x) +} - proc_list_lock(); +#endif /* DEVELOPMENT || DEBUG */ - if (ret == KERN_SUCCESS) { - os_log_with_startup_serial(OS_LOG_DEFAULT, "memorystatus: freezing (specific) pid %d [%s]...done", - aPid, (*p->p_name ? p->p_name : "unknown")); +/* + * Called during EXC_RESOURCE handling when a process exceeds a soft + * memory limit. This is the corpse fork path and here we decide if + * vm_map_fork will be allowed when creating the corpse. + * The task being considered is suspended. + * + * By default, a vm_map_fork is allowed to proceed. + * + * A few simple policy assumptions: + * Desktop platform is not considered in this path. + * The vm_map_fork is always allowed. + * + * If the device has a zero system-wide task limit, + * then the vm_map_fork is allowed. + * + * And if a process's memory footprint calculates less + * than or equal to half of the system-wide task limit, + * then the vm_map_fork is allowed. This calculation + * is based on the assumption that a process can + * munch memory up to the system-wide task limit. + */ +boolean_t +memorystatus_allowed_vm_map_fork(task_t task) +{ + boolean_t is_allowed = TRUE; /* default */ - memorystatus_freeze_entry_t data = { aPid, TRUE, dirty }; +#if CONFIG_EMBEDDED - p->p_memstat_freeze_sharedanon_pages += shared; + uint64_t footprint_in_bytes; + uint64_t max_allowed_bytes; - memorystatus_frozen_shared_mb += shared; + if (max_task_footprint_mb == 0) { + set_vm_map_fork_pidwatch(task, MEMORYSTATUS_VM_MAP_FORK_ALLOWED); + return is_allowed; + } - if ((p->p_memstat_state & P_MEMSTAT_FROZEN) == 0) { - p->p_memstat_state |= P_MEMSTAT_FROZEN; - memorystatus_frozen_count++; - } + footprint_in_bytes = get_task_phys_footprint(task); - p->p_memstat_frozen_count++; + /* + * Maximum is 1/4 of the system-wide task limit. + */ + max_allowed_bytes = ((uint64_t)max_task_footprint_mb * 1024 * 1024) >> 2; - /* - * Still keeping the P_MEMSTAT_LOCKED bit till we are actually done elevating this frozen process - * to its higher jetsam band. - */ - proc_list_unlock(); + if (footprint_in_bytes > max_allowed_bytes) { + printf("memorystatus disallowed vm_map_fork %lld %lld\n", footprint_in_bytes, max_allowed_bytes); + set_vm_map_fork_pidwatch(task, MEMORYSTATUS_VM_MAP_FORK_NOT_ALLOWED); + return !is_allowed; + } +#endif /* CONFIG_EMBEDDED */ - memorystatus_send_note(kMemorystatusFreezeNote, &data, sizeof(data)); + set_vm_map_fork_pidwatch(task, MEMORYSTATUS_VM_MAP_FORK_ALLOWED); + return is_allowed; +} - if (VM_CONFIG_FREEZER_SWAP_IS_ACTIVE) { - ret = memorystatus_update_inactive_jetsam_priority_band(p->p_pid, MEMORYSTATUS_CMD_ELEVATED_INACTIVEJETSAMPRIORITY_ENABLE, - memorystatus_freeze_jetsam_band, TRUE); +void +memorystatus_get_task_page_counts(task_t task, uint32_t *footprint, uint32_t *max_footprint_lifetime, uint32_t *purgeable_pages) +{ + assert(task); + assert(footprint); - if (ret) { - printf("Elevating the frozen process failed with %d\n", ret); - /* not fatal */ - ret = 0; - } + uint64_t pages; - proc_list_lock(); + pages = (get_task_phys_footprint(task) / PAGE_SIZE_64); + assert(((uint32_t)pages) == pages); + *footprint = (uint32_t)pages; - /* Update stats */ - for (i = 0; i < sizeof(throttle_intervals) / sizeof(struct throttle_interval_t); i++) { - throttle_intervals[i].pageouts += dirty; - } - } else { - proc_list_lock(); - } + if (max_footprint_lifetime) { + pages = (get_task_phys_footprint_lifetime_max(task) / PAGE_SIZE_64); + assert(((uint32_t)pages) == pages); + *max_footprint_lifetime = (uint32_t)pages; + } + if (purgeable_pages) { + pages = (get_task_purgeable_size(task) / PAGE_SIZE_64); + assert(((uint32_t)pages) == pages); + *purgeable_pages = (uint32_t)pages; + } +} - memorystatus_freeze_pageouts += dirty; +static void +memorystatus_get_task_phys_footprint_page_counts(task_t task, + uint64_t *internal_pages, uint64_t *internal_compressed_pages, + uint64_t *purgeable_nonvolatile_pages, uint64_t *purgeable_nonvolatile_compressed_pages, + uint64_t *alternate_accounting_pages, uint64_t *alternate_accounting_compressed_pages, + uint64_t *iokit_mapped_pages, uint64_t *page_table_pages) +{ + assert(task); - if (memorystatus_frozen_count == (memorystatus_frozen_processes_max - 1)) { - /* - * Add some eviction logic here? At some point should we - * jetsam a process to get back its swap space so that we - * can freeze a more eligible process at this moment in time? - */ - } - } else { - char reason[128]; - if (freezer_error_code == FREEZER_ERROR_EXCESS_SHARED_MEMORY) { - strlcpy(reason, "too much shared memory", 128); - } + if (internal_pages) { + *internal_pages = (get_task_internal(task) / PAGE_SIZE_64); + } - if (freezer_error_code == FREEZER_ERROR_LOW_PRIVATE_SHARED_RATIO) { - strlcpy(reason, "low private-shared pages ratio", 128); - } + if (internal_compressed_pages) { + *internal_compressed_pages = (get_task_internal_compressed(task) / PAGE_SIZE_64); + } - if (freezer_error_code == FREEZER_ERROR_NO_COMPRESSOR_SPACE) { - strlcpy(reason, "no compressor space", 128); - } + if (purgeable_nonvolatile_pages) { + *purgeable_nonvolatile_pages = (get_task_purgeable_nonvolatile(task) / PAGE_SIZE_64); + } - if (freezer_error_code == FREEZER_ERROR_NO_SWAP_SPACE) { - strlcpy(reason, "no swap space", 128); - } + if (purgeable_nonvolatile_compressed_pages) { + *purgeable_nonvolatile_compressed_pages = (get_task_purgeable_nonvolatile_compressed(task) / PAGE_SIZE_64); + } - os_log_with_startup_serial(OS_LOG_DEFAULT, "memorystatus: freezing (specific) pid %d [%s]...skipped (%s)", - aPid, (*p->p_name ? p->p_name : "unknown"), reason); - p->p_memstat_state |= P_MEMSTAT_FREEZE_IGNORE; - } + if (alternate_accounting_pages) { + *alternate_accounting_pages = (get_task_alternate_accounting(task) / PAGE_SIZE_64); + } - p->p_memstat_state &= ~P_MEMSTAT_LOCKED; - proc_list_unlock(); + if (alternate_accounting_compressed_pages) { + *alternate_accounting_compressed_pages = (get_task_alternate_accounting_compressed(task) / PAGE_SIZE_64); } -exit: - lck_mtx_unlock(&freezer_mutex); + if (iokit_mapped_pages) { + *iokit_mapped_pages = (get_task_iokit_mapped(task) / PAGE_SIZE_64); + } - return ret; + if (page_table_pages) { + *page_table_pages = (get_task_page_table(task) / PAGE_SIZE_64); + } } -static int -memorystatus_freeze_top_process(void) +/* + * This routine only acts on the global jetsam event snapshot. + * Updating the process's entry can race when the memorystatus_thread + * has chosen to kill a process that is racing to exit on another core. + */ +static void +memorystatus_update_jetsam_snapshot_entry_locked(proc_t p, uint32_t kill_cause, uint64_t killtime) { - pid_t aPid = 0; - int ret = -1; - proc_t p = PROC_NULL, next_p = PROC_NULL; - unsigned int i = 0; - unsigned int band = JETSAM_PRIORITY_IDLE; - boolean_t refreeze_processes = FALSE; + memorystatus_jetsam_snapshot_entry_t *entry = NULL; + memorystatus_jetsam_snapshot_t *snapshot = NULL; + memorystatus_jetsam_snapshot_entry_t *snapshot_list = NULL; - proc_list_lock(); + unsigned int i; + + LCK_MTX_ASSERT(proc_list_mlock, LCK_MTX_ASSERT_OWNED); - if (memorystatus_frozen_count >= memorystatus_frozen_processes_max) { + if (memorystatus_jetsam_snapshot_count == 0) { /* - * Freezer is already full but we are here and so let's - * try to refreeze any processes we might have thawed - * in the past and push out their compressed state out. + * No active snapshot. + * Nothing to do. */ - refreeze_processes = TRUE; - band = (unsigned int) memorystatus_freeze_jetsam_band; + return; } -freeze_process: - - next_p = memorystatus_get_first_proc_locked(&band, FALSE); - while (next_p) { - kern_return_t kr; - uint32_t purgeable, wired, clean, dirty, shared; - uint32_t max_pages = 0; - int freezer_error_code = 0; - - p = next_p; - next_p = memorystatus_get_next_proc_locked(&band, p, FALSE); + /* + * Sanity check as this routine should only be called + * from a jetsam kill path. + */ + assert(kill_cause != 0 && killtime != 0); - aPid = p->p_pid; + snapshot = memorystatus_jetsam_snapshot; + snapshot_list = memorystatus_jetsam_snapshot->entries; - if (p->p_memstat_effectivepriority != (int32_t) band) { - /* - * We shouldn't be freezing processes outside the - * prescribed band. - */ - break; - } + for (i = 0; i < memorystatus_jetsam_snapshot_count; i++) { + if (snapshot_list[i].pid == p->p_pid) { + entry = &snapshot_list[i]; - /* Ensure the process is eligible for (re-)freezing */ - if (refreeze_processes) { - /* - * Has to have been frozen once before. - */ - if ((p->p_memstat_state & P_MEMSTAT_FROZEN) == FALSE) { - continue; + if (entry->killed || entry->jse_killtime) { + /* + * We apparently raced on the exit path + * for this process, as it's snapshot entry + * has already recorded a kill. + */ + assert(entry->killed && entry->jse_killtime); + break; } /* - * Has to have been resumed once before. + * Update the entry we just found in the snapshot. */ - if ((p->p_memstat_state & P_MEMSTAT_REFREEZE_ELIGIBLE) == FALSE) { - continue; - } - /* - * Not currently being looked at for something. - */ - if (p->p_memstat_state & P_MEMSTAT_LOCKED) { - continue; - } + entry->killed = kill_cause; + entry->jse_killtime = killtime; + entry->jse_gencount = snapshot->js_gencount; + entry->jse_idle_delta = p->p_memstat_idle_delta; +#if CONFIG_FREEZE + entry->jse_thaw_count = p->p_memstat_thaw_count; +#else /* CONFIG_FREEZE */ + entry->jse_thaw_count = 0; +#endif /* CONFIG_FREEZE */ /* - * We are going to try and refreeze and so re-evaluate - * the process. We don't want to double count the shared - * memory. So deduct the old snapshot here. + * If a process has moved between bands since snapshot was + * initialized, then likely these fields changed too. */ - memorystatus_frozen_shared_mb -= p->p_memstat_freeze_sharedanon_pages; - p->p_memstat_freeze_sharedanon_pages = 0; - - p->p_memstat_state &= ~P_MEMSTAT_REFREEZE_ELIGIBLE; - memorystatus_refreeze_eligible_count--; - } else { - if (memorystatus_is_process_eligible_for_freeze(p) == FALSE) { - continue; // with lock held + if (entry->priority != p->p_memstat_effectivepriority) { + strlcpy(entry->name, p->p_name, sizeof(entry->name)); + entry->priority = p->p_memstat_effectivepriority; + entry->state = memorystatus_build_state(p); + entry->user_data = p->p_memstat_userdata; + entry->fds = p->p_fd->fd_nfiles; } - } - - if (VM_CONFIG_FREEZER_SWAP_IS_ACTIVE) { - /* - * Freezer backed by the compressor and swap file(s) - * will hold compressed data. - */ - max_pages = MIN(memorystatus_freeze_pages_max, memorystatus_freeze_budget_pages_remaining); - } else { /* - * We only have the compressor pool. + * Always update the page counts on a kill. */ - max_pages = UINT32_MAX - 1; - } - - /* Mark as locked temporarily to avoid kill */ - p->p_memstat_state |= P_MEMSTAT_LOCKED; - - p = proc_ref_locked(p); - if (!p) { - break; - } - - proc_list_unlock(); - - KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_FREEZE) | DBG_FUNC_START, - memorystatus_available_pages, 0, 0, 0, 0); - kr = task_freeze(p->task, &purgeable, &wired, &clean, &dirty, max_pages, &shared, &freezer_error_code, FALSE /* eval only */); - - KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_FREEZE) | DBG_FUNC_END, - memorystatus_available_pages, aPid, 0, 0, 0); - - MEMORYSTATUS_DEBUG(1, "memorystatus_freeze_top_process: task_freeze %s for pid %d [%s] - " - "memorystatus_pages: %d, purgeable: %d, wired: %d, clean: %d, dirty: %d, max_pages %d, shared %d\n", - (kr == KERN_SUCCESS) ? "SUCCEEDED" : "FAILED", aPid, (*p->p_name ? p->p_name : "(unknown)"), - memorystatus_available_pages, purgeable, wired, clean, dirty, max_pages, shared); - - proc_list_lock(); + uint32_t pages = 0; + uint32_t max_pages_lifetime = 0; + uint32_t purgeable_pages = 0; - /* Success? */ - if (KERN_SUCCESS == kr) { - if (refreeze_processes) { - os_log_with_startup_serial(OS_LOG_DEFAULT, "memorystatus: Refreezing (general) pid %d [%s]...done", - aPid, (*p->p_name ? p->p_name : "unknown")); - } else { - os_log_with_startup_serial(OS_LOG_DEFAULT, "memorystatus: freezing (general) pid %d [%s]...done", - aPid, (*p->p_name ? p->p_name : "unknown")); - } + memorystatus_get_task_page_counts(p->task, &pages, &max_pages_lifetime, &purgeable_pages); + entry->pages = (uint64_t)pages; + entry->max_pages_lifetime = (uint64_t)max_pages_lifetime; + entry->purgeable_pages = (uint64_t)purgeable_pages; - memorystatus_freeze_entry_t data = { aPid, TRUE, dirty }; + uint64_t internal_pages = 0; + uint64_t internal_compressed_pages = 0; + uint64_t purgeable_nonvolatile_pages = 0; + uint64_t purgeable_nonvolatile_compressed_pages = 0; + uint64_t alternate_accounting_pages = 0; + uint64_t alternate_accounting_compressed_pages = 0; + uint64_t iokit_mapped_pages = 0; + uint64_t page_table_pages = 0; - p->p_memstat_freeze_sharedanon_pages += shared; + memorystatus_get_task_phys_footprint_page_counts(p->task, &internal_pages, &internal_compressed_pages, + &purgeable_nonvolatile_pages, &purgeable_nonvolatile_compressed_pages, + &alternate_accounting_pages, &alternate_accounting_compressed_pages, + &iokit_mapped_pages, &page_table_pages); - memorystatus_frozen_shared_mb += shared; + entry->jse_internal_pages = internal_pages; + entry->jse_internal_compressed_pages = internal_compressed_pages; + entry->jse_purgeable_nonvolatile_pages = purgeable_nonvolatile_pages; + entry->jse_purgeable_nonvolatile_compressed_pages = purgeable_nonvolatile_compressed_pages; + entry->jse_alternate_accounting_pages = alternate_accounting_pages; + entry->jse_alternate_accounting_compressed_pages = alternate_accounting_compressed_pages; + entry->jse_iokit_mapped_pages = iokit_mapped_pages; + entry->jse_page_table_pages = page_table_pages; - if ((p->p_memstat_state & P_MEMSTAT_FROZEN) == 0) { - p->p_memstat_state |= P_MEMSTAT_FROZEN; - memorystatus_frozen_count++; - } + uint64_t region_count = 0; + memorystatus_get_task_memory_region_count(p->task, ®ion_count); + entry->jse_memory_region_count = region_count; - p->p_memstat_frozen_count++; + goto exit; + } + } + if (entry == NULL) { + /* + * The entry was not found in the snapshot, so the process must have + * launched after the snapshot was initialized. + * Let's try to append the new entry. + */ + if (memorystatus_jetsam_snapshot_count < memorystatus_jetsam_snapshot_max) { /* - * Still keeping the P_MEMSTAT_LOCKED bit till we are actually done elevating this frozen process - * to its higher jetsam band. + * A populated snapshot buffer exists + * and there is room to init a new entry. */ - proc_list_unlock(); - - memorystatus_send_note(kMemorystatusFreezeNote, &data, sizeof(data)); - - if (VM_CONFIG_FREEZER_SWAP_IS_ACTIVE) { - ret = memorystatus_update_inactive_jetsam_priority_band(p->p_pid, MEMORYSTATUS_CMD_ELEVATED_INACTIVEJETSAMPRIORITY_ENABLE, memorystatus_freeze_jetsam_band, TRUE); - - if (ret) { - printf("Elevating the frozen process failed with %d\n", ret); - /* not fatal */ - ret = 0; - } - - proc_list_lock(); - - /* Update stats */ - for (i = 0; i < sizeof(throttle_intervals) / sizeof(struct throttle_interval_t); i++) { - throttle_intervals[i].pageouts += dirty; - } - } else { - proc_list_lock(); - } - - memorystatus_freeze_pageouts += dirty; - - if (memorystatus_frozen_count == (memorystatus_frozen_processes_max - 1)) { - /* - * Add some eviction logic here? At some point should we - * jetsam a process to get back its swap space so that we - * can freeze a more eligible process at this moment in time? - */ - } - - /* Return KERN_SUCCESS */ - ret = kr; + assert(memorystatus_jetsam_snapshot_count == snapshot->entry_count); - p->p_memstat_state &= ~P_MEMSTAT_LOCKED; - proc_rele_locked(p); + unsigned int next = memorystatus_jetsam_snapshot_count; - /* - * We froze a process successfully. We can stop now - * and see if that helped. - */ + if (memorystatus_init_jetsam_snapshot_entry_locked(p, &snapshot_list[next], (snapshot->js_gencount)) == TRUE) { + entry = &snapshot_list[next]; + entry->killed = kill_cause; + entry->jse_killtime = killtime; - break; - } else { - p->p_memstat_state &= ~P_MEMSTAT_LOCKED; + snapshot->entry_count = ++next; + memorystatus_jetsam_snapshot_count = next; - if (refreeze_processes == TRUE) { - if ((freezer_error_code == FREEZER_ERROR_EXCESS_SHARED_MEMORY) || - (freezer_error_code == FREEZER_ERROR_LOW_PRIVATE_SHARED_RATIO)) { + if (memorystatus_jetsam_snapshot_count >= memorystatus_jetsam_snapshot_max) { /* - * Keeping this prior-frozen process in this high band when - * we failed to re-freeze it due to bad shared memory usage - * could cause excessive pressure on the lower bands. - * We need to demote it for now. It'll get re-evaluated next - * time because we don't set the P_MEMSTAT_FREEZE_IGNORE - * bit. + * We just used the last slot in the snapshot buffer. + * We only want to log it once... so we do it here + * when we notice we've hit the max. */ - - p->p_memstat_state &= ~P_MEMSTAT_USE_ELEVATED_INACTIVE_BAND; - memorystatus_invalidate_idle_demotion_locked(p, TRUE); - memorystatus_update_priority_locked(p, JETSAM_PRIORITY_IDLE, TRUE, TRUE); + printf("memorystatus: WARNING snapshot buffer is full, count %d\n", + memorystatus_jetsam_snapshot_count); } - } else { - p->p_memstat_state |= P_MEMSTAT_FREEZE_IGNORE; } + } + } - proc_rele_locked(p); - - char reason[128]; - if (freezer_error_code == FREEZER_ERROR_EXCESS_SHARED_MEMORY) { - strlcpy(reason, "too much shared memory", 128); - } +exit: + if (entry == NULL) { + /* + * If we reach here, the snapshot buffer could not be updated. + * Most likely, the buffer is full, in which case we would have + * logged a warning in the previous call. + * + * For now, we will stop appending snapshot entries. + * When the buffer is consumed, the snapshot state will reset. + */ - if (freezer_error_code == FREEZER_ERROR_LOW_PRIVATE_SHARED_RATIO) { - strlcpy(reason, "low private-shared pages ratio", 128); - } + MEMORYSTATUS_DEBUG(4, "memorystatus_update_jetsam_snapshot_entry_locked: failed to update pid %d, priority %d, count %d\n", + p->p_pid, p->p_memstat_effectivepriority, memorystatus_jetsam_snapshot_count); + } - if (freezer_error_code == FREEZER_ERROR_NO_COMPRESSOR_SPACE) { - strlcpy(reason, "no compressor space", 128); - } + return; +} - if (freezer_error_code == FREEZER_ERROR_NO_SWAP_SPACE) { - strlcpy(reason, "no swap space", 128); - } +#if CONFIG_JETSAM +void +memorystatus_pages_update(unsigned int pages_avail) +{ + memorystatus_available_pages = pages_avail; - os_log_with_startup_serial(OS_LOG_DEFAULT, "memorystatus: freezing (general) pid %d [%s]...skipped (%s)", - aPid, (*p->p_name ? p->p_name : "unknown"), reason); +#if VM_PRESSURE_EVENTS + /* + * Since memorystatus_available_pages changes, we should + * re-evaluate the pressure levels on the system and + * check if we need to wake the pressure thread. + * We also update memorystatus_level in that routine. + */ + vm_pressure_response(); - if (vm_compressor_low_on_space() || vm_swap_low_on_space()) { - break; - } + if (memorystatus_available_pages <= memorystatus_available_pages_pressure) { + if (memorystatus_hwm_candidates || (memorystatus_available_pages <= memorystatus_available_pages_critical)) { + memorystatus_thread_wake(); } } +#if CONFIG_FREEZE + /* + * We can't grab the freezer_mutex here even though that synchronization would be correct to inspect + * the # of frozen processes and wakeup the freezer thread. Reason being that we come here into this + * code with (possibly) the page-queue locks held and preemption disabled. So trying to grab a mutex here + * will result in the "mutex with preemption disabled" panic. + */ - if ((ret == -1) && - (memorystatus_refreeze_eligible_count >= MIN_THAW_REFREEZE_THRESHOLD) && - (refreeze_processes == FALSE)) { + if (memorystatus_freeze_thread_should_run() == TRUE) { /* - * We failed to freeze a process from the IDLE - * band AND we have some thawed processes - * AND haven't tried refreezing as yet. - * Let's try and re-freeze processes in the - * frozen band that have been resumed in the past - * and so have brought in state from disk. + * The freezer thread is usually woken up by some user-space call i.e. pid_hibernate(any process). + * That trigger isn't invoked often enough and so we are enabling this explicit wakeup here. */ + if (VM_CONFIG_FREEZER_SWAP_IS_ACTIVE) { + thread_wakeup((event_t)&memorystatus_freeze_wakeup); + } + } +#endif /* CONFIG_FREEZE */ - band = (unsigned int) memorystatus_freeze_jetsam_band; +#else /* VM_PRESSURE_EVENTS */ - refreeze_processes = TRUE; + boolean_t critical, delta; - goto freeze_process; + if (!memorystatus_delta) { + return; } - proc_list_unlock(); + critical = (pages_avail < memorystatus_available_pages_critical) ? TRUE : FALSE; + delta = ((pages_avail >= (memorystatus_available_pages + memorystatus_delta)) + || (memorystatus_available_pages >= (pages_avail + memorystatus_delta))) ? TRUE : FALSE; - return ret; + if (critical || delta) { + unsigned int total_pages; + + total_pages = (unsigned int) atop_64(max_mem); +#if CONFIG_SECLUDED_MEMORY + total_pages -= vm_page_secluded_count; +#endif /* CONFIG_SECLUDED_MEMORY */ + memorystatus_level = memorystatus_available_pages * 100 / total_pages; + memorystatus_thread_wake(); + } +#endif /* VM_PRESSURE_EVENTS */ } +#endif /* CONFIG_JETSAM */ -static inline boolean_t -memorystatus_can_freeze_processes(void) +static boolean_t +memorystatus_init_jetsam_snapshot_entry_locked(proc_t p, memorystatus_jetsam_snapshot_entry_t *entry, uint64_t gencount) { - boolean_t ret; + clock_sec_t tv_sec; + clock_usec_t tv_usec; + uint32_t pages = 0; + uint32_t max_pages_lifetime = 0; + uint32_t purgeable_pages = 0; + uint64_t internal_pages = 0; + uint64_t internal_compressed_pages = 0; + uint64_t purgeable_nonvolatile_pages = 0; + uint64_t purgeable_nonvolatile_compressed_pages = 0; + uint64_t alternate_accounting_pages = 0; + uint64_t alternate_accounting_compressed_pages = 0; + uint64_t iokit_mapped_pages = 0; + uint64_t page_table_pages = 0; + uint64_t region_count = 0; + uint64_t cids[COALITION_NUM_TYPES]; - proc_list_lock(); + memset(entry, 0, sizeof(memorystatus_jetsam_snapshot_entry_t)); - if (memorystatus_suspended_count) { - memorystatus_freeze_suspended_threshold = MIN(memorystatus_freeze_suspended_threshold, FREEZE_SUSPENDED_THRESHOLD_DEFAULT); + entry->pid = p->p_pid; + strlcpy(&entry->name[0], p->p_name, sizeof(entry->name)); + entry->priority = p->p_memstat_effectivepriority; - if ((memorystatus_suspended_count - memorystatus_frozen_count) > memorystatus_freeze_suspended_threshold) { - ret = TRUE; - } else { - ret = FALSE; - } - } else { - ret = FALSE; - } + memorystatus_get_task_page_counts(p->task, &pages, &max_pages_lifetime, &purgeable_pages); + entry->pages = (uint64_t)pages; + entry->max_pages_lifetime = (uint64_t)max_pages_lifetime; + entry->purgeable_pages = (uint64_t)purgeable_pages; - proc_list_unlock(); + memorystatus_get_task_phys_footprint_page_counts(p->task, &internal_pages, &internal_compressed_pages, + &purgeable_nonvolatile_pages, &purgeable_nonvolatile_compressed_pages, + &alternate_accounting_pages, &alternate_accounting_compressed_pages, + &iokit_mapped_pages, &page_table_pages); - return ret; -} + entry->jse_internal_pages = internal_pages; + entry->jse_internal_compressed_pages = internal_compressed_pages; + entry->jse_purgeable_nonvolatile_pages = purgeable_nonvolatile_pages; + entry->jse_purgeable_nonvolatile_compressed_pages = purgeable_nonvolatile_compressed_pages; + entry->jse_alternate_accounting_pages = alternate_accounting_pages; + entry->jse_alternate_accounting_compressed_pages = alternate_accounting_compressed_pages; + entry->jse_iokit_mapped_pages = iokit_mapped_pages; + entry->jse_page_table_pages = page_table_pages; -static boolean_t -memorystatus_can_freeze(boolean_t *memorystatus_freeze_swap_low) -{ - boolean_t can_freeze = TRUE; + memorystatus_get_task_memory_region_count(p->task, ®ion_count); + entry->jse_memory_region_count = region_count; - /* Only freeze if we're sufficiently low on memory; this holds off freeze right - * after boot, and is generally is a no-op once we've reached steady state. */ - if (memorystatus_available_pages > memorystatus_freeze_threshold) { - return FALSE; - } + entry->state = memorystatus_build_state(p); + entry->user_data = p->p_memstat_userdata; + memcpy(&entry->uuid[0], &p->p_uuid[0], sizeof(p->p_uuid)); + entry->fds = p->p_fd->fd_nfiles; - /* Check minimum suspended process threshold. */ - if (!memorystatus_can_freeze_processes()) { - return FALSE; - } - assert(VM_CONFIG_COMPRESSOR_IS_PRESENT); + absolutetime_to_microtime(get_task_cpu_time(p->task), &tv_sec, &tv_usec); + entry->cpu_time.tv_sec = (int64_t)tv_sec; + entry->cpu_time.tv_usec = (int64_t)tv_usec; - if (!VM_CONFIG_FREEZER_SWAP_IS_ACTIVE) { - /* - * In-core compressor used for freezing WITHOUT on-disk swap support. - */ - if (vm_compressor_low_on_space()) { - if (*memorystatus_freeze_swap_low) { - *memorystatus_freeze_swap_low = TRUE; - } + assert(p->p_stats != NULL); + entry->jse_starttime = p->p_stats->ps_start; /* abstime process started */ + entry->jse_killtime = 0; /* abstime jetsam chose to kill process */ + entry->killed = 0; /* the jetsam kill cause */ + entry->jse_gencount = gencount; /* indicates a pass through jetsam thread, when process was targeted to be killed */ - can_freeze = FALSE; - } else { - if (*memorystatus_freeze_swap_low) { - *memorystatus_freeze_swap_low = FALSE; - } + entry->jse_idle_delta = p->p_memstat_idle_delta; /* Most recent timespan spent in idle-band */ - can_freeze = TRUE; - } +#if CONFIG_FREEZE + entry->jse_thaw_count = p->p_memstat_thaw_count; +#else /* CONFIG_FREEZE */ + entry->jse_thaw_count = 0; +#endif /* CONFIG_FREEZE */ + + proc_coalitionids(p, cids); + entry->jse_coalition_jetsam_id = cids[COALITION_TYPE_JETSAM]; + + return TRUE; +} + +static void +memorystatus_init_snapshot_vmstats(memorystatus_jetsam_snapshot_t *snapshot) +{ + kern_return_t kr = KERN_SUCCESS; + mach_msg_type_number_t count = HOST_VM_INFO64_COUNT; + vm_statistics64_data_t vm_stat; + + if ((kr = host_statistics64(host_self(), HOST_VM_INFO64, (host_info64_t)&vm_stat, &count)) != KERN_SUCCESS) { + printf("memorystatus_init_jetsam_snapshot_stats: host_statistics64 failed with %d\n", kr); + memset(&snapshot->stats, 0, sizeof(snapshot->stats)); } else { - /* - * Freezing WITH on-disk swap support. - * - * In-core compressor fronts the swap. - */ - if (vm_swap_low_on_space()) { - if (*memorystatus_freeze_swap_low) { - *memorystatus_freeze_swap_low = TRUE; - } + snapshot->stats.free_pages = vm_stat.free_count; + snapshot->stats.active_pages = vm_stat.active_count; + snapshot->stats.inactive_pages = vm_stat.inactive_count; + snapshot->stats.throttled_pages = vm_stat.throttled_count; + snapshot->stats.purgeable_pages = vm_stat.purgeable_count; + snapshot->stats.wired_pages = vm_stat.wire_count; - can_freeze = FALSE; - } + snapshot->stats.speculative_pages = vm_stat.speculative_count; + snapshot->stats.filebacked_pages = vm_stat.external_page_count; + snapshot->stats.anonymous_pages = vm_stat.internal_page_count; + snapshot->stats.compressions = vm_stat.compressions; + snapshot->stats.decompressions = vm_stat.decompressions; + snapshot->stats.compressor_pages = vm_stat.compressor_page_count; + snapshot->stats.total_uncompressed_pages_in_compressor = vm_stat.total_uncompressed_pages_in_compressor; } - return can_freeze; + get_zone_map_size(&snapshot->stats.zone_map_size, &snapshot->stats.zone_map_capacity); + + bzero(snapshot->stats.largest_zone_name, sizeof(snapshot->stats.largest_zone_name)); + get_largest_zone_info(snapshot->stats.largest_zone_name, sizeof(snapshot->stats.largest_zone_name), + &snapshot->stats.largest_zone_size); } /* - * This function evaluates if the currently frozen processes deserve - * to stay in the higher jetsam band. If the # of thaws of a process - * is below our threshold, then we will demote that process into the IDLE - * band and put it at the head. We don't immediately kill the process here - * because it already has state on disk and so it might be worth giving - * it another shot at getting thawed/resumed and used. + * Collect vm statistics at boot. + * Called only once (see kern_exec.c) + * Data can be consumed at any time. */ +void +memorystatus_init_at_boot_snapshot() +{ + memorystatus_init_snapshot_vmstats(&memorystatus_at_boot_snapshot); + memorystatus_at_boot_snapshot.entry_count = 0; + memorystatus_at_boot_snapshot.notification_time = 0; /* updated when consumed */ + memorystatus_at_boot_snapshot.snapshot_time = mach_absolute_time(); +} + static void -memorystatus_demote_frozen_processes(void) +memorystatus_init_jetsam_snapshot_locked(memorystatus_jetsam_snapshot_t *od_snapshot, uint32_t ods_list_count ) { - unsigned int band = (unsigned int) memorystatus_freeze_jetsam_band; - unsigned int demoted_proc_count = 0; - proc_t p = PROC_NULL, next_p = PROC_NULL; + proc_t p, next_p; + unsigned int b = 0, i = 0; - proc_list_lock(); + memorystatus_jetsam_snapshot_t *snapshot = NULL; + memorystatus_jetsam_snapshot_entry_t *snapshot_list = NULL; + unsigned int snapshot_max = 0; + + LCK_MTX_ASSERT(proc_list_mlock, LCK_MTX_ASSERT_OWNED); - if (memorystatus_freeze_enabled == FALSE) { + if (od_snapshot) { /* - * Freeze has been disabled likely to - * reclaim swap space. So don't change - * any state on the frozen processes. + * This is an on_demand snapshot */ - proc_list_unlock(); - return; + snapshot = od_snapshot; + snapshot_list = od_snapshot->entries; + snapshot_max = ods_list_count; + } else { + /* + * This is a jetsam event snapshot + */ + snapshot = memorystatus_jetsam_snapshot; + snapshot_list = memorystatus_jetsam_snapshot->entries; + snapshot_max = memorystatus_jetsam_snapshot_max; } - next_p = memorystatus_get_first_proc_locked(&band, FALSE); + /* + * Init the snapshot header information + */ + memorystatus_init_snapshot_vmstats(snapshot); + snapshot->snapshot_time = mach_absolute_time(); + snapshot->notification_time = 0; + snapshot->js_gencount = 0; + + next_p = memorystatus_get_first_proc_locked(&b, TRUE); while (next_p) { p = next_p; - next_p = memorystatus_get_next_proc_locked(&band, p, FALSE); - - if ((p->p_memstat_state & P_MEMSTAT_FROZEN) == FALSE) { - continue; - } + next_p = memorystatus_get_next_proc_locked(&b, p, TRUE); - if (p->p_memstat_state & P_MEMSTAT_LOCKED) { + if (FALSE == memorystatus_init_jetsam_snapshot_entry_locked(p, &snapshot_list[i], snapshot->js_gencount)) { continue; } - if (p->p_memstat_thaw_count < memorystatus_thaw_count_demotion_threshold) { - p->p_memstat_state &= ~P_MEMSTAT_USE_ELEVATED_INACTIVE_BAND; - memorystatus_invalidate_idle_demotion_locked(p, TRUE); - - memorystatus_update_priority_locked(p, JETSAM_PRIORITY_IDLE, TRUE, TRUE); -#if DEVELOPMENT || DEBUG - os_log_with_startup_serial(OS_LOG_DEFAULT, "memorystatus_demote_frozen_process pid %d [%s]", - p->p_pid, (*p->p_name ? p->p_name : "unknown")); -#endif /* DEVELOPMENT || DEBUG */ - - /* - * The freezer thread will consider this a normal app to be frozen - * because it is in the IDLE band. So we don't need the - * P_MEMSTAT_REFREEZE_ELIGIBLE state here. Also, if it gets resumed - * we'll correctly count it as eligible for re-freeze again. - * - * We don't drop the frozen count because this process still has - * state on disk. So there's a chance it gets resumed and then it - * should land in the higher jetsam band. For that it needs to - * remain marked frozen. - */ - if (p->p_memstat_state & P_MEMSTAT_REFREEZE_ELIGIBLE) { - p->p_memstat_state &= ~P_MEMSTAT_REFREEZE_ELIGIBLE; - memorystatus_refreeze_eligible_count--; - } - - demoted_proc_count++; - } + MEMORYSTATUS_DEBUG(0, "jetsam snapshot pid %d, uuid = %02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x\n", + p->p_pid, + p->p_uuid[0], p->p_uuid[1], p->p_uuid[2], p->p_uuid[3], p->p_uuid[4], p->p_uuid[5], p->p_uuid[6], p->p_uuid[7], + p->p_uuid[8], p->p_uuid[9], p->p_uuid[10], p->p_uuid[11], p->p_uuid[12], p->p_uuid[13], p->p_uuid[14], p->p_uuid[15]); - if (demoted_proc_count == memorystatus_max_frozen_demotions_daily) { + if (++i == snapshot_max) { break; } } - memorystatus_thaw_count = 0; - proc_list_unlock(); + snapshot->entry_count = i; + + if (!od_snapshot) { + /* update the system buffer count */ + memorystatus_jetsam_snapshot_count = i; + } } +#if DEVELOPMENT || DEBUG -/* - * This function will do 4 things: - * - * 1) check to see if we are currently in a degraded freezer mode, and if so: - * - check to see if our window has expired and we should exit this mode, OR, - * - return a budget based on the degraded throttle window's max. pageouts vs current pageouts. - * - * 2) check to see if we are in a NEW normal window and update the normal throttle window's params. - * - * 3) check what the current normal window allows for a budget. - * - * 4) calculate the current rate of pageouts for DEGRADED_WINDOW_MINS duration. If that rate is below - * what we would normally expect, then we are running low on our daily budget and need to enter - * degraded perf. mode. - */ - -static void -memorystatus_freeze_update_throttle(uint64_t *budget_pages_allowed) +#if CONFIG_JETSAM +static int +memorystatus_cmd_set_panic_bits(user_addr_t buffer, uint32_t buffer_size) { - clock_sec_t sec; - clock_nsec_t nsec; - mach_timespec_t ts; + int ret; + memorystatus_jetsam_panic_options_t debug; - unsigned int freeze_daily_pageouts_max = 0; + if (buffer_size != sizeof(memorystatus_jetsam_panic_options_t)) { + return EINVAL; + } -#if DEVELOPMENT || DEBUG - if (!memorystatus_freeze_throttle_enabled) { - /* - * No throttling...we can use the full budget everytime. - */ - *budget_pages_allowed = UINT64_MAX; - return; + ret = copyin(buffer, &debug, buffer_size); + if (ret) { + return ret; } -#endif - clock_get_system_nanotime(&sec, &nsec); - ts.tv_sec = sec; - ts.tv_nsec = nsec; + /* Panic bits match kMemorystatusKilled* enum */ + memorystatus_jetsam_panic_debug = (memorystatus_jetsam_panic_debug & ~debug.mask) | (debug.data & debug.mask); + + /* Copyout new value */ + debug.data = memorystatus_jetsam_panic_debug; + ret = copyout(&debug, buffer, sizeof(memorystatus_jetsam_panic_options_t)); + + return ret; +} +#endif /* CONFIG_JETSAM */ - struct throttle_interval_t *interval = NULL; +/* + * Triggers a sort_order on a specified jetsam priority band. + * This is for testing only, used to force a path through the sort + * function. + */ +static int +memorystatus_cmd_test_jetsam_sort(int priority, int sort_order) +{ + int error = 0; - if (memorystatus_freeze_degradation == TRUE) { - interval = degraded_throttle_window; + unsigned int bucket_index = 0; - if (CMP_MACH_TIMESPEC(&ts, &interval->ts) >= 0) { - memorystatus_freeze_degradation = FALSE; - interval->pageouts = 0; - interval->max_pageouts = 0; - } else { - *budget_pages_allowed = interval->max_pageouts - interval->pageouts; - } + if (priority == -1) { + /* Use as shorthand for default priority */ + bucket_index = JETSAM_PRIORITY_DEFAULT; + } else { + bucket_index = (unsigned int)priority; } - interval = normal_throttle_window; + error = memorystatus_sort_bucket(bucket_index, sort_order); - if (CMP_MACH_TIMESPEC(&ts, &interval->ts) >= 0) { - /* - * New throttle window. - * Rollover any unused budget. - * Also ask the storage layer what the new budget needs to be. - */ - uint64_t freeze_daily_budget = 0; - unsigned int daily_budget_pageouts = 0; + return error; +} - if (vm_swap_max_budget(&freeze_daily_budget)) { - memorystatus_freeze_daily_mb_max = (freeze_daily_budget / (1024 * 1024)); - os_log_with_startup_serial(OS_LOG_DEFAULT, "memorystatus: memorystatus_freeze_daily_mb_max set to %dMB\n", memorystatus_freeze_daily_mb_max); - } +#endif /* DEVELOPMENT || DEBUG */ - freeze_daily_pageouts_max = memorystatus_freeze_daily_mb_max * (1024 * 1024 / PAGE_SIZE); +/* + * Prepare the process to be killed (set state, update snapshot) and kill it. + */ +static uint64_t memorystatus_purge_before_jetsam_success = 0; - daily_budget_pageouts = (interval->burst_multiple * (((uint64_t)interval->mins * freeze_daily_pageouts_max) / NORMAL_WINDOW_MINS)); - interval->max_pageouts = (interval->max_pageouts - interval->pageouts) + daily_budget_pageouts; +static boolean_t +memorystatus_kill_proc(proc_t p, uint32_t cause, os_reason_t jetsam_reason, boolean_t *killed, uint64_t *footprint_of_killed_proc) +{ + pid_t aPid = 0; + uint32_t aPid_ep = 0; - interval->ts.tv_sec = interval->mins * 60; - interval->ts.tv_nsec = 0; - ADD_MACH_TIMESPEC(&interval->ts, &ts); - /* Since we update the throttle stats pre-freeze, adjust for overshoot here */ - if (interval->pageouts > interval->max_pageouts) { - interval->pageouts -= interval->max_pageouts; - } else { - interval->pageouts = 0; - } - *budget_pages_allowed = interval->max_pageouts; + uint64_t killtime = 0; + clock_sec_t tv_sec; + clock_usec_t tv_usec; + uint32_t tv_msec; + boolean_t retval = FALSE; - memorystatus_demote_frozen_processes(); - } else { - /* - * Current throttle window. - * Deny freezing if we have no budget left. - * Try graceful degradation if we are within 25% of: - * - the daily budget, and - * - the current budget left is below our normal budget expectations. - */ + aPid = p->p_pid; + aPid_ep = p->p_memstat_effectivepriority; -#if DEVELOPMENT || DEBUG + if (cause != kMemorystatusKilledVnodes && cause != kMemorystatusKilledZoneMapExhaustion) { /* - * This can only happen in the INTERNAL configs because we allow modifying the daily budget for testing. + * Genuine memory pressure and not other (vnode/zone) resource exhaustion. */ + boolean_t success = FALSE; + uint64_t num_pages_purged; + uint64_t num_pages_reclaimed = 0; + uint64_t num_pages_unsecluded = 0; - if (freeze_daily_pageouts_max > interval->max_pageouts) { + networking_memstatus_callout(p, cause); + num_pages_purged = vm_purgeable_purge_task_owned(p->task); + num_pages_reclaimed += num_pages_purged; +#if CONFIG_SECLUDED_MEMORY + if (cause == kMemorystatusKilledVMPageShortage && + vm_page_secluded_count > 0 && + task_can_use_secluded_mem(p->task, FALSE)) { /* - * We just bumped the daily budget. Re-evaluate our normal window params. + * We're about to kill a process that has access + * to the secluded pool. Drain that pool into the + * free or active queues to make these pages re-appear + * as "available", which might make us no longer need + * to kill that process. + * Since the secluded pool does not get refilled while + * a process has access to it, it should remain + * drained. */ - interval->max_pageouts = (interval->burst_multiple * (((uint64_t)interval->mins * freeze_daily_pageouts_max) / NORMAL_WINDOW_MINS)); - memorystatus_freeze_degradation = FALSE; //we'll re-evaluate this below... + num_pages_unsecluded = vm_page_secluded_drain(); + num_pages_reclaimed += num_pages_unsecluded; } -#endif /* DEVELOPMENT || DEBUG */ +#endif /* CONFIG_SECLUDED_MEMORY */ - if (memorystatus_freeze_degradation == FALSE) { - if (interval->pageouts >= interval->max_pageouts) { - *budget_pages_allowed = 0; + if (num_pages_reclaimed) { + /* + * We actually reclaimed something and so let's + * check if we need to continue with the kill. + */ + if (cause == kMemorystatusKilledHiwat) { + uint64_t footprint_in_bytes = get_task_phys_footprint(p->task); + uint64_t memlimit_in_bytes = (((uint64_t)p->p_memstat_memlimit) * 1024ULL * 1024ULL); /* convert MB to bytes */ + success = (footprint_in_bytes <= memlimit_in_bytes); } else { - int budget_left = interval->max_pageouts - interval->pageouts; - int budget_threshold = (freeze_daily_pageouts_max * FREEZE_DEGRADATION_BUDGET_THRESHOLD) / 100; - - mach_timespec_t time_left = {0, 0}; - - time_left.tv_sec = interval->ts.tv_sec; - time_left.tv_nsec = 0; - - SUB_MACH_TIMESPEC(&time_left, &ts); - - if (budget_left <= budget_threshold) { + success = (memorystatus_avail_pages_below_pressure() == FALSE); +#if CONFIG_SECLUDED_MEMORY + if (!success && num_pages_unsecluded) { /* - * For the current normal window, calculate how much we would pageout in a DEGRADED_WINDOW_MINS duration. - * And also calculate what we would pageout for the same DEGRADED_WINDOW_MINS duration if we had the full - * daily pageout budget. + * We just drained the secluded pool + * because we're about to kill a + * process that has access to it. + * This is an important process and + * we'd rather not kill it unless + * absolutely necessary, so declare + * success even if draining the pool + * did not quite get us out of the + * "pressure" level but still got + * us out of the "critical" level. */ + success = (memorystatus_avail_pages_below_critical() == FALSE); + } +#endif /* CONFIG_SECLUDED_MEMORY */ + } - unsigned int current_budget_rate_allowed = ((budget_left / time_left.tv_sec) / 60) * DEGRADED_WINDOW_MINS; - unsigned int normal_budget_rate_allowed = (freeze_daily_pageouts_max / NORMAL_WINDOW_MINS) * DEGRADED_WINDOW_MINS; - - /* - * The current rate of pageouts is below what we would expect for - * the normal rate i.e. we have below normal budget left and so... - */ + if (success) { + memorystatus_purge_before_jetsam_success++; - if (current_budget_rate_allowed < normal_budget_rate_allowed) { - memorystatus_freeze_degradation = TRUE; - degraded_throttle_window->max_pageouts = current_budget_rate_allowed; - degraded_throttle_window->pageouts = 0; + os_log_with_startup_serial(OS_LOG_DEFAULT, "memorystatus: reclaimed %llu pages (%llu purged, %llu unsecluded) from pid %d [%s] and avoided %s\n", + num_pages_reclaimed, num_pages_purged, num_pages_unsecluded, aPid, ((p && *p->p_name) ? p->p_name : "unknown"), memorystatus_kill_cause_name[cause]); - /* - * Switch over to the degraded throttle window so the budget - * doled out is based on that window. - */ - interval = degraded_throttle_window; - } - } + *killed = FALSE; - *budget_pages_allowed = interval->max_pageouts - interval->pageouts; + return TRUE; } } } - MEMORYSTATUS_DEBUG(1, "memorystatus_freeze_update_throttle_interval: throttle updated - %d frozen (%d max) within %dm; %dm remaining; throttle %s\n", - interval->pageouts, interval->max_pageouts, interval->mins, (interval->ts.tv_sec - ts->tv_sec) / 60, - interval->throttle ? "on" : "off"); -} +#if CONFIG_JETSAM && (DEVELOPMENT || DEBUG) + MEMORYSTATUS_DEBUG(1, "jetsam: killing pid %d [%s] - %lld Mb > 1 (%d Mb)\n", + aPid, (*p->p_name ? p->p_name : "unknown"), + (footprint_in_bytes / (1024ULL * 1024ULL)), /* converted bytes to MB */ + p->p_memstat_memlimit); +#endif /* CONFIG_JETSAM && (DEVELOPMENT || DEBUG) */ -static void -memorystatus_freeze_thread(void *param __unused, wait_result_t wr __unused) -{ - static boolean_t memorystatus_freeze_swap_low = FALSE; + killtime = mach_absolute_time(); + absolutetime_to_microtime(killtime, &tv_sec, &tv_usec); + tv_msec = tv_usec / 1000; - lck_mtx_lock(&freezer_mutex); + proc_list_lock(); + memorystatus_update_jetsam_snapshot_entry_locked(p, cause, killtime); + proc_list_unlock(); - if (memorystatus_freeze_enabled) { - if ((memorystatus_frozen_count < memorystatus_frozen_processes_max) || - (memorystatus_refreeze_eligible_count >= MIN_THAW_REFREEZE_THRESHOLD)) { - if (memorystatus_can_freeze(&memorystatus_freeze_swap_low)) { - /* Only freeze if we've not exceeded our pageout budgets.*/ - memorystatus_freeze_update_throttle(&memorystatus_freeze_budget_pages_remaining); + char kill_reason_string[128]; - if (memorystatus_freeze_budget_pages_remaining) { - memorystatus_freeze_top_process(); - } - } + if (cause == kMemorystatusKilledHiwat) { + strlcpy(kill_reason_string, "killing_highwater_process", 128); + } else { + if (aPid_ep == JETSAM_PRIORITY_IDLE) { + strlcpy(kill_reason_string, "killing_idle_process", 128); + } else { + strlcpy(kill_reason_string, "killing_top_process", 128); } } /* - * We use memorystatus_apps_idle_delay_time because if/when we adopt aging for applications, - * it'll tie neatly into running the freezer once we age an application. - * - * Till then, it serves as a good interval that can be tuned via a sysctl too. + * memorystatus_do_kill drops a reference, so take another one so we can + * continue to use this exit reason even after memorystatus_do_kill() + * returns */ - memorystatus_freezer_thread_next_run_ts = mach_absolute_time() + memorystatus_apps_idle_delay_time; + os_reason_ref(jetsam_reason); - assert_wait((event_t) &memorystatus_freeze_wakeup, THREAD_UNINT); - lck_mtx_unlock(&freezer_mutex); + retval = memorystatus_do_kill(p, cause, jetsam_reason, footprint_of_killed_proc); + *killed = retval; - thread_block((thread_continue_t) memorystatus_freeze_thread); + os_log_with_startup_serial(OS_LOG_DEFAULT, "%lu.%03d memorystatus: %s pid %d [%s] (%s %d) %lluKB - memorystatus_available_pages: %llu", + (unsigned long)tv_sec, tv_msec, kill_reason_string, + aPid, ((p && *p->p_name) ? p->p_name : "unknown"), + memorystatus_kill_cause_name[cause], aPid_ep, + (*footprint_of_killed_proc) >> 10, (uint64_t)memorystatus_available_pages); + + return retval; } +/* + * Jetsam the first process in the queue. + */ static boolean_t -memorystatus_freeze_thread_should_run(void) +memorystatus_kill_top_process(boolean_t any, boolean_t sort_flag, uint32_t cause, os_reason_t jetsam_reason, + int32_t *priority, uint32_t *errors, uint64_t *memory_reclaimed) { - /* - * No freezer_mutex held here...see why near call-site - * within memorystatus_pages_update(). - */ - - boolean_t should_run = FALSE; - - if (memorystatus_freeze_enabled == FALSE) { - goto out; - } - - if (memorystatus_available_pages > memorystatus_freeze_threshold) { - goto out; - } + pid_t aPid; + proc_t p = PROC_NULL, next_p = PROC_NULL; + boolean_t new_snapshot = FALSE, force_new_snapshot = FALSE, killed = FALSE, freed_mem = FALSE; + unsigned int i = 0; + uint32_t aPid_ep; + int32_t local_max_kill_prio = JETSAM_PRIORITY_IDLE; + uint64_t footprint_of_killed_proc = 0; - if ((memorystatus_frozen_count >= memorystatus_frozen_processes_max) && - (memorystatus_refreeze_eligible_count < MIN_THAW_REFREEZE_THRESHOLD)) { - goto out; - } +#ifndef CONFIG_FREEZE +#pragma unused(any) +#endif - if (memorystatus_frozen_shared_mb_max && (memorystatus_frozen_shared_mb >= memorystatus_frozen_shared_mb_max)) { - goto out; - } + KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_JETSAM) | DBG_FUNC_START, + memorystatus_available_pages, 0, 0, 0, 0); - uint64_t curr_time = mach_absolute_time(); - if (curr_time < memorystatus_freezer_thread_next_run_ts) { - goto out; +#if CONFIG_JETSAM + if (sort_flag == TRUE) { + (void)memorystatus_sort_bucket(JETSAM_PRIORITY_FOREGROUND, JETSAM_SORT_DEFAULT); } - should_run = TRUE; + local_max_kill_prio = max_kill_priority; -out: - return should_run; -} + force_new_snapshot = FALSE; -static int -sysctl_memorystatus_do_fastwake_warmup_all SYSCTL_HANDLER_ARGS -{ -#pragma unused(oidp, req, arg1, arg2) +#else /* CONFIG_JETSAM */ - /* Need to be root or have entitlement */ - if (!kauth_cred_issuser(kauth_cred_get()) && !IOTaskHasEntitlement(current_task(), MEMORYSTATUS_ENTITLEMENT)) { - return EPERM; + if (sort_flag == TRUE) { + (void)memorystatus_sort_bucket(JETSAM_PRIORITY_IDLE, JETSAM_SORT_DEFAULT); } - if (memorystatus_freeze_enabled == FALSE) { - return ENOTSUP; + /* + * On macos, we currently only have 2 reasons to be here: + * + * kMemorystatusKilledZoneMapExhaustion + * AND + * kMemorystatusKilledVMCompressorSpaceShortage + * + * If we are here because of kMemorystatusKilledZoneMapExhaustion, we will consider + * any and all processes as eligible kill candidates since we need to avoid a panic. + * + * Since this function can be called async. it is harder to toggle the max_kill_priority + * value before and after a call. And so we use this local variable to set the upper band + * on the eligible kill bands. + */ + if (cause == kMemorystatusKilledZoneMapExhaustion) { + local_max_kill_prio = JETSAM_PRIORITY_MAX; + } else { + local_max_kill_prio = max_kill_priority; } - do_fastwake_warmup_all(); - - return 0; -} - -SYSCTL_PROC(_kern, OID_AUTO, memorystatus_do_fastwake_warmup_all, CTLTYPE_INT | CTLFLAG_WR | CTLFLAG_LOCKED | CTLFLAG_MASKED, - 0, 0, &sysctl_memorystatus_do_fastwake_warmup_all, "I", ""); - -#endif /* CONFIG_FREEZE */ - -#if VM_PRESSURE_EVENTS - -#if CONFIG_MEMORYSTATUS - -static int -memorystatus_send_note(int event_code, void *data, size_t data_length) -{ - int ret; - struct kev_msg ev_msg; - - ev_msg.vendor_code = KEV_VENDOR_APPLE; - ev_msg.kev_class = KEV_SYSTEM_CLASS; - ev_msg.kev_subclass = KEV_MEMORYSTATUS_SUBCLASS; - - ev_msg.event_code = event_code; + /* + * And, because we are here under extreme circumstances, we force a snapshot even for + * IDLE kills. + */ + force_new_snapshot = TRUE; - ev_msg.dv[0].data_length = data_length; - ev_msg.dv[0].data_ptr = data; - ev_msg.dv[1].data_length = 0; +#endif /* CONFIG_JETSAM */ - ret = kev_post_msg(&ev_msg); - if (ret) { - printf("%s: kev_post_msg() failed, err %d\n", __func__, ret); + if (cause != kMemorystatusKilledZoneMapExhaustion && + jetsam_current_thread() != NULL && + jetsam_current_thread()->limit_to_low_bands && + local_max_kill_prio > JETSAM_PRIORITY_BACKGROUND) { + local_max_kill_prio = JETSAM_PRIORITY_BACKGROUND; } - return ret; -} + proc_list_lock(); -boolean_t -memorystatus_warn_process(pid_t pid, __unused boolean_t is_active, __unused boolean_t is_fatal, boolean_t limit_exceeded) -{ - boolean_t ret = FALSE; - boolean_t found_knote = FALSE; - struct knote *kn = NULL; - int send_knote_count = 0; + next_p = memorystatus_get_first_proc_locked(&i, TRUE); + while (next_p && (next_p->p_memstat_effectivepriority <= local_max_kill_prio)) { + p = next_p; + next_p = memorystatus_get_next_proc_locked(&i, p, TRUE); - /* - * See comment in sysctl_memorystatus_vm_pressure_send. - */ - memorystatus_klist_lock(); + aPid = p->p_pid; + aPid_ep = p->p_memstat_effectivepriority; - SLIST_FOREACH(kn, &memorystatus_klist, kn_selnext) { - proc_t knote_proc = knote_get_kq(kn)->kq_p; - pid_t knote_pid = knote_proc->p_pid; + if (p->p_memstat_state & (P_MEMSTAT_ERROR | P_MEMSTAT_TERMINATED)) { + continue; /* with lock held */ + } - if (knote_pid == pid) { + if (cause == kMemorystatusKilledVnodes) { /* - * By setting the "fflags" here, we are forcing - * a process to deal with the case where it's - * bumping up into its memory limits. If we don't - * do this here, we will end up depending on the - * system pressure snapshot evaluation in - * filt_memorystatus(). + * If the system runs out of vnodes, we systematically jetsam + * processes in hopes of stumbling onto a vnode gain that helps + * the system recover. The process that happens to trigger + * this path has no known relationship to the vnode shortage. + * Deadlock avoidance: attempt to safeguard the caller. */ -#if CONFIG_EMBEDDED - if (!limit_exceeded) { + if (p == current_proc()) { + /* do not jetsam the current process */ + continue; + } + } + +#if CONFIG_FREEZE + boolean_t skip; + boolean_t reclaim_proc = !(p->p_memstat_state & P_MEMSTAT_LOCKED); + if (any || reclaim_proc) { + skip = FALSE; + } else { + skip = TRUE; + } + + if (skip) { + continue; + } else +#endif + { + if (proc_ref_locked(p) == p) { /* - * Intentionally set either the unambiguous limit warning, - * the system-wide critical or the system-wide warning - * notification bit. + * Mark as terminated so that if exit1() indicates success, but the process (for example) + * is blocked in task_exception_notify(), it'll be skipped if encountered again - see + * . This is cheaper than examining P_LEXIT, which requires the + * acquisition of the proc lock. */ - - if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_WARN) { - kn->kn_fflags = NOTE_MEMORYSTATUS_PROC_LIMIT_WARN; - found_knote = TRUE; - send_knote_count++; - } else if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PRESSURE_CRITICAL) { - kn->kn_fflags = NOTE_MEMORYSTATUS_PRESSURE_CRITICAL; - found_knote = TRUE; - send_knote_count++; - } else if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PRESSURE_WARN) { - kn->kn_fflags = NOTE_MEMORYSTATUS_PRESSURE_WARN; - found_knote = TRUE; - send_knote_count++; - } + p->p_memstat_state |= P_MEMSTAT_TERMINATED; } else { /* - * Send this notification when a process has exceeded a soft limit. + * We need to restart the search again because + * proc_ref_locked _can_ drop the proc_list lock + * and we could have lost our stored next_p via + * an exit() on another core. */ - if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL) { - kn->kn_fflags = NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL; - found_knote = TRUE; - send_knote_count++; - } + i = 0; + next_p = memorystatus_get_first_proc_locked(&i, TRUE); + continue; } -#else /* CONFIG_EMBEDDED */ - if (!limit_exceeded) { - /* - * Processes on desktop are not expecting to handle a system-wide - * critical or system-wide warning notification from this path. - * Intentionally set only the unambiguous limit warning here. - * - * If the limit is soft, however, limit this to one notification per - * active/inactive limit (per each registered listener). - */ - if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_WARN) { - found_knote = TRUE; - if (!is_fatal) { - /* - * Restrict proc_limit_warn notifications when - * non-fatal (soft) limit is at play. - */ - if (is_active) { - if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_WARN_ACTIVE) { - /* - * Mark this knote for delivery. - */ - kn->kn_fflags = NOTE_MEMORYSTATUS_PROC_LIMIT_WARN; - /* - * And suppress it from future notifications. - */ - kn->kn_sfflags &= ~NOTE_MEMORYSTATUS_PROC_LIMIT_WARN_ACTIVE; - send_knote_count++; - } - } else { - if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_WARN_INACTIVE) { - /* - * Mark this knote for delivery. - */ - kn->kn_fflags = NOTE_MEMORYSTATUS_PROC_LIMIT_WARN; - /* - * And suppress it from future notifications. - */ - kn->kn_sfflags &= ~NOTE_MEMORYSTATUS_PROC_LIMIT_WARN_INACTIVE; - send_knote_count++; - } - } - } else { - /* - * No restriction on proc_limit_warn notifications when - * fatal (hard) limit is at play. - */ - kn->kn_fflags = NOTE_MEMORYSTATUS_PROC_LIMIT_WARN; - send_knote_count++; - } - } - } else { - /* - * Send this notification when a process has exceeded a soft limit, - */ + /* + * Capture a snapshot if none exists and: + * - we are forcing a new snapshot creation, either because: + * - on a particular platform we need these snapshots every time, OR + * - a boot-arg/embedded device tree property has been set. + * - priority was not requested (this is something other than an ambient kill) + * - the priority was requested *and* the targeted process is not at idle priority + */ + if ((memorystatus_jetsam_snapshot_count == 0) && + (force_new_snapshot || memorystatus_idle_snapshot || ((!priority) || (priority && (aPid_ep != JETSAM_PRIORITY_IDLE))))) { + memorystatus_init_jetsam_snapshot_locked(NULL, 0); + new_snapshot = TRUE; + } - if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL) { - found_knote = TRUE; - if (!is_fatal) { - /* - * Restrict critical notifications for soft limits. - */ + proc_list_unlock(); - if (is_active) { - if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL_ACTIVE) { - /* - * Suppress future proc_limit_critical notifications - * for the active soft limit. - */ - kn->kn_sfflags &= ~NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL_ACTIVE; - kn->kn_fflags = NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL; - send_knote_count++; - } - } else { - if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL_INACTIVE) { - /* - * Suppress future proc_limit_critical_notifications - * for the inactive soft limit. - */ - kn->kn_sfflags &= ~NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL_INACTIVE; - kn->kn_fflags = NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL; - send_knote_count++; - } - } - } else { - /* - * We should never be trying to send a critical notification for - * a hard limit... the process would be killed before it could be - * received. - */ - panic("Caught sending pid %d a critical warning for a fatal limit.\n", pid); + freed_mem = memorystatus_kill_proc(p, cause, jetsam_reason, &killed, &footprint_of_killed_proc); /* purged and/or killed 'p' */ + /* Success? */ + if (freed_mem) { + if (killed) { + *memory_reclaimed = footprint_of_killed_proc; + if (priority) { + *priority = aPid_ep; } + } else { + /* purged */ + proc_list_lock(); + p->p_memstat_state &= ~P_MEMSTAT_TERMINATED; + proc_list_unlock(); } + proc_rele(p); + goto exit; } -#endif /* CONFIG_EMBEDDED */ - } - } - if (found_knote) { - if (send_knote_count > 0) { - KNOTE(&memorystatus_klist, 0); + /* + * Failure - first unwind the state, + * then fall through to restart the search. + */ + proc_list_lock(); + proc_rele_locked(p); + p->p_memstat_state &= ~P_MEMSTAT_TERMINATED; + p->p_memstat_state |= P_MEMSTAT_ERROR; + *errors += 1; + + i = 0; + next_p = memorystatus_get_first_proc_locked(&i, TRUE); } - ret = TRUE; } - memorystatus_klist_unlock(); - - return ret; -} - -/* - * Can only be set by the current task on itself. - */ -int -memorystatus_low_mem_privileged_listener(uint32_t op_flags) -{ - boolean_t set_privilege = FALSE; - /* - * Need an entitlement check here? - */ - if (op_flags == MEMORYSTATUS_CMD_PRIVILEGED_LISTENER_ENABLE) { - set_privilege = TRUE; - } else if (op_flags == MEMORYSTATUS_CMD_PRIVILEGED_LISTENER_DISABLE) { - set_privilege = FALSE; - } else { - return EINVAL; - } + proc_list_unlock(); - return task_low_mem_privileged_listener(current_task(), set_privilege, NULL); -} +exit: + os_reason_free(jetsam_reason); -int -memorystatus_send_pressure_note(pid_t pid) -{ - MEMORYSTATUS_DEBUG(1, "memorystatus_send_pressure_note(): pid %d\n", pid); - return memorystatus_send_note(kMemorystatusPressureNote, &pid, sizeof(pid)); -} + if (!killed) { + *memory_reclaimed = 0; -void -memorystatus_send_low_swap_note(void) -{ - struct knote *kn = NULL; - - memorystatus_klist_lock(); - SLIST_FOREACH(kn, &memorystatus_klist, kn_selnext) { - /* We call is_knote_registered_modify_task_pressure_bits to check if the sfflags for the - * current note contain NOTE_MEMORYSTATUS_LOW_SWAP. Once we find one note in the memorystatus_klist - * that has the NOTE_MEMORYSTATUS_LOW_SWAP flags in its sfflags set, we call KNOTE with - * kMemoryStatusLowSwap as the hint to process and update all knotes on the memorystatus_klist accordingly. */ - if (is_knote_registered_modify_task_pressure_bits(kn, NOTE_MEMORYSTATUS_LOW_SWAP, NULL, 0, 0) == TRUE) { - KNOTE(&memorystatus_klist, kMemorystatusLowSwap); - break; + /* Clear snapshot if freshly captured and no target was found */ + if (new_snapshot) { + proc_list_lock(); + memorystatus_jetsam_snapshot->entry_count = memorystatus_jetsam_snapshot_count = 0; + proc_list_unlock(); } } - memorystatus_klist_unlock(); + KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_JETSAM) | DBG_FUNC_END, + memorystatus_available_pages, killed ? aPid : 0, killed, *memory_reclaimed, 0); + + return killed; } -boolean_t -memorystatus_bg_pressure_eligible(proc_t p) +/* + * Jetsam aggressively + */ +static boolean_t +memorystatus_kill_processes_aggressive(uint32_t cause, int aggr_count, + int32_t priority_max, uint32_t *errors, uint64_t *memory_reclaimed) { - boolean_t eligible = FALSE; - - proc_list_lock(); + pid_t aPid; + proc_t p = PROC_NULL, next_p = PROC_NULL; + boolean_t new_snapshot = FALSE, killed = FALSE; + int kill_count = 0; + unsigned int i = 0; + int32_t aPid_ep = 0; + unsigned int memorystatus_level_snapshot = 0; + uint64_t killtime = 0; + clock_sec_t tv_sec; + clock_usec_t tv_usec; + uint32_t tv_msec; + os_reason_t jetsam_reason = OS_REASON_NULL; + uint64_t footprint_of_killed_proc = 0; - MEMORYSTATUS_DEBUG(1, "memorystatus_bg_pressure_eligible: pid %d, state 0x%x\n", p->p_pid, p->p_memstat_state); + *memory_reclaimed = 0; - /* Foreground processes have already been dealt with at this point, so just test for eligibility */ - if (!(p->p_memstat_state & (P_MEMSTAT_TERMINATED | P_MEMSTAT_LOCKED | P_MEMSTAT_SUSPENDED | P_MEMSTAT_FROZEN))) { - eligible = TRUE; - } + KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_JETSAM) | DBG_FUNC_START, + memorystatus_available_pages, priority_max, 0, 0, 0); - if (p->p_memstat_effectivepriority < JETSAM_PRIORITY_BACKGROUND_OPPORTUNISTIC) { + if (priority_max >= JETSAM_PRIORITY_FOREGROUND) { /* - * IDLE and IDLE_DEFERRED bands contain processes - * that have dropped memory to be under their inactive - * memory limits. And so they can't really give back - * anything. + * Check if aggressive jetsam has been asked to kill upto or beyond the + * JETSAM_PRIORITY_FOREGROUND bucket. If yes, sort the FG band based on + * coalition footprint. */ - eligible = FALSE; + memorystatus_sort_bucket(JETSAM_PRIORITY_FOREGROUND, JETSAM_SORT_DEFAULT); } - proc_list_unlock(); - - return eligible; -} - -boolean_t -memorystatus_is_foreground_locked(proc_t p) -{ - return (p->p_memstat_effectivepriority == JETSAM_PRIORITY_FOREGROUND) || - (p->p_memstat_effectivepriority == JETSAM_PRIORITY_FOREGROUND_SUPPORT); -} - -/* - * This is meant for stackshot and kperf -- it does not take the proc_list_lock - * to access the p_memstat_dirty field. - */ -void -memorystatus_proc_flags_unsafe(void * v, boolean_t *is_dirty, boolean_t *is_dirty_tracked, boolean_t *allow_idle_exit) -{ - if (!v) { - *is_dirty = FALSE; - *is_dirty_tracked = FALSE; - *allow_idle_exit = FALSE; - } else { - proc_t p = (proc_t)v; - *is_dirty = (p->p_memstat_dirty & P_DIRTY_IS_DIRTY) != 0; - *is_dirty_tracked = (p->p_memstat_dirty & P_DIRTY_TRACK) != 0; - *allow_idle_exit = (p->p_memstat_dirty & P_DIRTY_ALLOW_IDLE_EXIT) != 0; + jetsam_reason = os_reason_create(OS_REASON_JETSAM, cause); + if (jetsam_reason == OS_REASON_NULL) { + printf("memorystatus_kill_processes_aggressive: failed to allocate exit reason\n"); } -} - -#endif /* CONFIG_MEMORYSTATUS */ - -/* - * Trigger levels to test the mechanism. - * Can be used via a sysctl. - */ -#define TEST_LOW_MEMORY_TRIGGER_ONE 1 -#define TEST_LOW_MEMORY_TRIGGER_ALL 2 -#define TEST_PURGEABLE_TRIGGER_ONE 3 -#define TEST_PURGEABLE_TRIGGER_ALL 4 -#define TEST_LOW_MEMORY_PURGEABLE_TRIGGER_ONE 5 -#define TEST_LOW_MEMORY_PURGEABLE_TRIGGER_ALL 6 - -boolean_t memorystatus_manual_testing_on = FALSE; -vm_pressure_level_t memorystatus_manual_testing_level = kVMPressureNormal; - -extern struct knote * -vm_pressure_select_optimal_candidate_to_notify(struct klist *, int, boolean_t); + proc_list_lock(); -#define VM_PRESSURE_NOTIFY_WAIT_PERIOD 10000 /* milliseconds */ - -#if DEBUG -#define VM_PRESSURE_DEBUG(cond, format, ...) \ -do { \ - if (cond) { printf(format, ##__VA_ARGS__); } \ -} while(0) -#else -#define VM_PRESSURE_DEBUG(cond, format, ...) -#endif - -#define INTER_NOTIFICATION_DELAY (250000) /* .25 second */ - -void -memorystatus_on_pageout_scan_end(void) -{ - /* No-op */ -} + next_p = memorystatus_get_first_proc_locked(&i, TRUE); + while (next_p) { + if (((next_p->p_listflag & P_LIST_EXITED) != 0) || + ((unsigned int)(next_p->p_memstat_effectivepriority) != i)) { + /* + * We have raced with next_p running on another core. + * It may be exiting or it may have moved to a different + * jetsam priority band. This means we have lost our + * place in line while traversing the jetsam list. We + * attempt to recover by rewinding to the beginning of the band + * we were already traversing. By doing this, we do not guarantee + * that no process escapes this aggressive march, but we can make + * skipping an entire range of processes less likely. (PR-21069019) + */ -/* - * kn_max - knote - * - * knote_pressure_level - to check if the knote is registered for this notification level. - * - * task - task whose bits we'll be modifying - * - * pressure_level_to_clear - if the task has been notified of this past level, clear that notification bit so that if/when we revert to that level, the task will be notified again. - * - * pressure_level_to_set - the task is about to be notified of this new level. Update the task's bit notification information appropriately. - * - */ + MEMORYSTATUS_DEBUG(1, "memorystatus: aggressive%d: rewinding band %d, %s(%d) moved or exiting.\n", + aggr_count, i, (*next_p->p_name ? next_p->p_name : "unknown"), next_p->p_pid); -boolean_t -is_knote_registered_modify_task_pressure_bits(struct knote *kn_max, int knote_pressure_level, task_t task, vm_pressure_level_t pressure_level_to_clear, vm_pressure_level_t pressure_level_to_set) -{ - if (kn_max->kn_sfflags & knote_pressure_level) { - if (pressure_level_to_clear && task_has_been_notified(task, pressure_level_to_clear) == TRUE) { - task_clear_has_been_notified(task, pressure_level_to_clear); + next_p = memorystatus_get_first_proc_locked(&i, TRUE); + continue; } - task_mark_has_been_notified(task, pressure_level_to_set); - return TRUE; - } - - return FALSE; -} - -void -memorystatus_klist_reset_all_for_level(vm_pressure_level_t pressure_level_to_clear) -{ - struct knote *kn = NULL; - - memorystatus_klist_lock(); - SLIST_FOREACH(kn, &memorystatus_klist, kn_selnext) { - proc_t p = PROC_NULL; - struct task* t = TASK_NULL; + p = next_p; + next_p = memorystatus_get_next_proc_locked(&i, p, TRUE); - p = knote_get_kq(kn)->kq_p; - proc_list_lock(); - if (p != proc_ref_locked(p)) { - p = PROC_NULL; + if (p->p_memstat_effectivepriority > priority_max) { + /* + * Bail out of this killing spree if we have + * reached beyond the priority_max jetsam band. + * That is, we kill up to and through the + * priority_max jetsam band. + */ proc_list_unlock(); - continue; + goto exit; } - proc_list_unlock(); - - t = (struct task *)(p->task); - - task_clear_has_been_notified(t, pressure_level_to_clear); - - proc_rele(p); - } - - memorystatus_klist_unlock(); -} - -extern kern_return_t vm_pressure_notify_dispatch_vm_clients(boolean_t target_foreground_process); - -struct knote * -vm_pressure_select_optimal_candidate_to_notify(struct klist *candidate_list, int level, boolean_t target_foreground_process); - -/* - * Used by the vm_pressure_thread which is - * signalled from within vm_pageout_scan(). - */ -static void vm_dispatch_memory_pressure(void); -void consider_vm_pressure_events(void); -void -consider_vm_pressure_events(void) -{ - vm_dispatch_memory_pressure(); -} -static void -vm_dispatch_memory_pressure(void) -{ - memorystatus_update_vm_pressure(FALSE); -} - -extern vm_pressure_level_t -convert_internal_pressure_level_to_dispatch_level(vm_pressure_level_t); + aPid = p->p_pid; + aPid_ep = p->p_memstat_effectivepriority; -struct knote * -vm_pressure_select_optimal_candidate_to_notify(struct klist *candidate_list, int level, boolean_t target_foreground_process) -{ - struct knote *kn = NULL, *kn_max = NULL; - uint64_t resident_max = 0; /* MB */ - struct timeval curr_tstamp = {0, 0}; - int elapsed_msecs = 0; - int selected_task_importance = 0; - static int pressure_snapshot = -1; - boolean_t pressure_increase = FALSE; + if (p->p_memstat_state & (P_MEMSTAT_ERROR | P_MEMSTAT_TERMINATED)) { + continue; + } - if (pressure_snapshot == -1) { /* - * Initial snapshot. + * Capture a snapshot if none exists. */ - pressure_snapshot = level; - pressure_increase = TRUE; - } else { - if (level && (level >= pressure_snapshot)) { - pressure_increase = TRUE; - } else { - pressure_increase = FALSE; + if (memorystatus_jetsam_snapshot_count == 0) { + memorystatus_init_jetsam_snapshot_locked(NULL, 0); + new_snapshot = TRUE; } - pressure_snapshot = level; - } - - if (pressure_increase == TRUE) { - /* - * We'll start by considering the largest - * unimportant task in our list. - */ - selected_task_importance = INT_MAX; - } else { /* - * We'll start by considering the largest - * important task in our list. + * Mark as terminated so that if exit1() indicates success, but the process (for example) + * is blocked in task_exception_notify(), it'll be skipped if encountered again - see + * . This is cheaper than examining P_LEXIT, which requires the + * acquisition of the proc lock. */ - selected_task_importance = 0; - } + p->p_memstat_state |= P_MEMSTAT_TERMINATED; - microuptime(&curr_tstamp); + killtime = mach_absolute_time(); + absolutetime_to_microtime(killtime, &tv_sec, &tv_usec); + tv_msec = tv_usec / 1000; - SLIST_FOREACH(kn, candidate_list, kn_selnext) { - uint64_t resident_size = 0; /* MB */ - proc_t p = PROC_NULL; - struct task* t = TASK_NULL; - int curr_task_importance = 0; - boolean_t consider_knote = FALSE; - boolean_t privileged_listener = FALSE; + /* Shift queue, update stats */ + memorystatus_update_jetsam_snapshot_entry_locked(p, cause, killtime); - p = knote_get_kq(kn)->kq_p; - proc_list_lock(); - if (p != proc_ref_locked(p)) { - p = PROC_NULL; - proc_list_unlock(); - continue; - } - proc_list_unlock(); + /* + * In order to kill the target process, we will drop the proc_list_lock. + * To guaranteee that p and next_p don't disappear out from under the lock, + * we must take a ref on both. + * If we cannot get a reference, then it's likely we've raced with + * that process exiting on another core. + */ + if (proc_ref_locked(p) == p) { + if (next_p) { + while (next_p && (proc_ref_locked(next_p) != next_p)) { + proc_t temp_p; -#if CONFIG_MEMORYSTATUS - if (target_foreground_process == TRUE && !memorystatus_is_foreground_locked(p)) { - /* - * Skip process not marked foreground. - */ - proc_rele(p); - continue; - } -#endif /* CONFIG_MEMORYSTATUS */ + /* + * We must have raced with next_p exiting on another core. + * Recover by getting the next eligible process in the band. + */ - t = (struct task *)(p->task); + MEMORYSTATUS_DEBUG(1, "memorystatus: aggressive%d: skipping %d [%s] (exiting?)\n", + aggr_count, next_p->p_pid, (*next_p->p_name ? next_p->p_name : "(unknown)")); - timevalsub(&curr_tstamp, &p->vm_pressure_last_notify_tstamp); - elapsed_msecs = curr_tstamp.tv_sec * 1000 + curr_tstamp.tv_usec / 1000; + temp_p = next_p; + next_p = memorystatus_get_next_proc_locked(&i, temp_p, TRUE); + } + } + proc_list_unlock(); - vm_pressure_level_t dispatch_level = convert_internal_pressure_level_to_dispatch_level(level); + printf("%lu.%03d memorystatus: %s%d pid %d [%s] (%s %d) - memorystatus_available_pages: %llu\n", + (unsigned long)tv_sec, tv_msec, + ((aPid_ep == JETSAM_PRIORITY_IDLE) ? "killing_idle_process_aggressive" : "killing_top_process_aggressive"), + aggr_count, aPid, (*p->p_name ? p->p_name : "unknown"), + memorystatus_kill_cause_name[cause], aPid_ep, (uint64_t)memorystatus_available_pages); - if ((kn->kn_sfflags & dispatch_level) == 0) { - proc_rele(p); - continue; - } + memorystatus_level_snapshot = memorystatus_level; -#if CONFIG_MEMORYSTATUS - if (target_foreground_process == FALSE && !memorystatus_bg_pressure_eligible(p)) { - VM_PRESSURE_DEBUG(1, "[vm_pressure] skipping process %d\n", p->p_pid); - proc_rele(p); - continue; - } -#endif /* CONFIG_MEMORYSTATUS */ + /* + * memorystatus_do_kill() drops a reference, so take another one so we can + * continue to use this exit reason even after memorystatus_do_kill() + * returns. + */ + os_reason_ref(jetsam_reason); + killed = memorystatus_do_kill(p, cause, jetsam_reason, &footprint_of_killed_proc); -#if CONFIG_EMBEDDED - curr_task_importance = p->p_memstat_effectivepriority; -#else /* CONFIG_EMBEDDED */ - curr_task_importance = task_importance_estimate(t); -#endif /* CONFIG_EMBEDDED */ + /* Success? */ + if (killed) { + *memory_reclaimed += footprint_of_killed_proc; + proc_rele(p); + kill_count++; + p = NULL; + killed = FALSE; - /* - * Privileged listeners are only considered in the multi-level pressure scheme - * AND only if the pressure is increasing. - */ - if (level > 0) { - if (task_has_been_notified(t, level) == FALSE) { /* - * Is this a privileged listener? + * Continue the killing spree. */ - if (task_low_mem_privileged_listener(t, FALSE, &privileged_listener) == 0) { - if (privileged_listener) { - kn_max = kn; - proc_rele(p); - goto done_scanning; + proc_list_lock(); + if (next_p) { + proc_rele_locked(next_p); + } + + if (aPid_ep == JETSAM_PRIORITY_FOREGROUND && memorystatus_aggressive_jetsam_lenient == TRUE) { + if (memorystatus_level > memorystatus_level_snapshot && ((memorystatus_level - memorystatus_level_snapshot) >= AGGRESSIVE_JETSAM_LENIENT_MODE_THRESHOLD)) { +#if DEVELOPMENT || DEBUG + printf("Disabling Lenient mode after one-time deployment.\n"); +#endif /* DEVELOPMENT || DEBUG */ + memorystatus_aggressive_jetsam_lenient = FALSE; + break; } } - } else { - proc_rele(p); + continue; } - } else if (level == 0) { + /* - * Task wasn't notified when the pressure was increasing and so - * no need to notify it that the pressure is decreasing. + * Failure - first unwind the state, + * then fall through to restart the search. */ - if ((task_has_been_notified(t, kVMPressureWarning) == FALSE) && (task_has_been_notified(t, kVMPressureCritical) == FALSE)) { - proc_rele(p); - continue; + proc_list_lock(); + proc_rele_locked(p); + if (next_p) { + proc_rele_locked(next_p); } + p->p_memstat_state &= ~P_MEMSTAT_TERMINATED; + p->p_memstat_state |= P_MEMSTAT_ERROR; + *errors += 1; + p = NULL; } /* - * We don't want a small process to block large processes from - * being notified again. + * Failure - restart the search at the beginning of + * the band we were already traversing. + * + * We might have raced with "p" exiting on another core, resulting in no + * ref on "p". Or, we may have failed to kill "p". + * + * Either way, we fall thru to here, leaving the proc in the + * P_MEMSTAT_TERMINATED or P_MEMSTAT_ERROR state. + * + * And, we hold the the proc_list_lock at this point. */ - resident_size = (get_task_phys_footprint(t)) / (1024 * 1024ULL); /* MB */ - if (resident_size >= vm_pressure_task_footprint_min) { - if (level > 0) { - /* - * Warning or Critical Pressure. - */ - if (pressure_increase) { - if ((curr_task_importance < selected_task_importance) || - ((curr_task_importance == selected_task_importance) && (resident_size > resident_max))) { - /* - * We have found a candidate process which is: - * a) at a lower importance than the current selected process - * OR - * b) has importance equal to that of the current selected process but is larger - */ + next_p = memorystatus_get_first_proc_locked(&i, TRUE); + } - consider_knote = TRUE; - } - } else { - if ((curr_task_importance > selected_task_importance) || - ((curr_task_importance == selected_task_importance) && (resident_size > resident_max))) { - /* - * We have found a candidate process which is: - * a) at a higher importance than the current selected process - * OR - * b) has importance equal to that of the current selected process but is larger - */ + proc_list_unlock(); - consider_knote = TRUE; - } - } - } else if (level == 0) { - /* - * Pressure back to normal. - */ - if ((curr_task_importance > selected_task_importance) || - ((curr_task_importance == selected_task_importance) && (resident_size > resident_max))) { - consider_knote = TRUE; - } - } +exit: + os_reason_free(jetsam_reason); - if (consider_knote) { - resident_max = resident_size; - kn_max = kn; - selected_task_importance = curr_task_importance; - consider_knote = FALSE; /* reset for the next candidate */ - } - } else { - /* There was no candidate with enough resident memory to scavenge */ - VM_PRESSURE_DEBUG(0, "[vm_pressure] threshold failed for pid %d with %llu resident...\n", p->p_pid, resident_size); - } - proc_rele(p); + /* Clear snapshot if freshly captured and no target was found */ + if (new_snapshot && (kill_count == 0)) { + proc_list_lock(); + memorystatus_jetsam_snapshot->entry_count = memorystatus_jetsam_snapshot_count = 0; + proc_list_unlock(); } -done_scanning: - if (kn_max) { - VM_DEBUG_CONSTANT_EVENT(vm_pressure_event, VM_PRESSURE_EVENT, DBG_FUNC_NONE, knote_get_kq(kn_max)->kq_p->p_pid, resident_max, 0, 0); - VM_PRESSURE_DEBUG(1, "[vm_pressure] sending event to pid %d with %llu resident\n", knote_get_kq(kn_max)->kq_p->p_pid, resident_max); - } + KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_JETSAM) | DBG_FUNC_END, + memorystatus_available_pages, 0, kill_count, *memory_reclaimed, 0); - return kn_max; + if (kill_count > 0) { + return TRUE; + } else { + return FALSE; + } } -#define VM_PRESSURE_DECREASED_SMOOTHING_PERIOD 5000 /* milliseconds */ -#define WARNING_NOTIFICATION_RESTING_PERIOD 25 /* seconds */ -#define CRITICAL_NOTIFICATION_RESTING_PERIOD 25 /* seconds */ +static boolean_t +memorystatus_kill_hiwat_proc(uint32_t *errors, boolean_t *purged, uint64_t *memory_reclaimed) +{ + pid_t aPid = 0; + proc_t p = PROC_NULL, next_p = PROC_NULL; + boolean_t new_snapshot = FALSE, killed = FALSE, freed_mem = FALSE; + unsigned int i = 0; + uint32_t aPid_ep; + os_reason_t jetsam_reason = OS_REASON_NULL; + KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_JETSAM_HIWAT) | DBG_FUNC_START, + memorystatus_available_pages, 0, 0, 0, 0); -uint64_t next_warning_notification_sent_at_ts = 0; -uint64_t next_critical_notification_sent_at_ts = 0; + jetsam_reason = os_reason_create(OS_REASON_JETSAM, JETSAM_REASON_MEMORY_HIGHWATER); + if (jetsam_reason == OS_REASON_NULL) { + printf("memorystatus_kill_hiwat_proc: failed to allocate exit reason\n"); + } -kern_return_t -memorystatus_update_vm_pressure(boolean_t target_foreground_process) -{ - struct knote *kn_max = NULL; - struct knote *kn_cur = NULL, *kn_temp = NULL; /* for safe list traversal */ - pid_t target_pid = -1; - struct klist dispatch_klist = { NULL }; - proc_t target_proc = PROC_NULL; - struct task *task = NULL; - boolean_t found_candidate = FALSE; + proc_list_lock(); - static vm_pressure_level_t level_snapshot = kVMPressureNormal; - static vm_pressure_level_t prev_level_snapshot = kVMPressureNormal; - boolean_t smoothing_window_started = FALSE; - struct timeval smoothing_window_start_tstamp = {0, 0}; - struct timeval curr_tstamp = {0, 0}; - int elapsed_msecs = 0; - uint64_t curr_ts = mach_absolute_time(); + next_p = memorystatus_get_first_proc_locked(&i, TRUE); + while (next_p) { + uint64_t footprint_in_bytes = 0; + uint64_t memlimit_in_bytes = 0; + boolean_t skip = 0; -#if !CONFIG_JETSAM -#define MAX_IDLE_KILLS 100 /* limit the number of idle kills allowed */ + p = next_p; + next_p = memorystatus_get_next_proc_locked(&i, p, TRUE); - int idle_kill_counter = 0; + aPid = p->p_pid; + aPid_ep = p->p_memstat_effectivepriority; - /* - * On desktop we take this opportunity to free up memory pressure - * by immediately killing idle exitable processes. We use a delay - * to avoid overkill. And we impose a max counter as a fail safe - * in case daemons re-launch too fast. - */ - while ((memorystatus_vm_pressure_level != kVMPressureNormal) && (idle_kill_counter < MAX_IDLE_KILLS)) { - if (memorystatus_idle_exit_from_VM() == FALSE) { - /* No idle exitable processes left to kill */ - break; + if (p->p_memstat_state & (P_MEMSTAT_ERROR | P_MEMSTAT_TERMINATED)) { + continue; } - idle_kill_counter++; - if (memorystatus_manual_testing_on == TRUE) { - /* - * Skip the delay when testing - * the pressure notification scheme. - */ - } else { - delay(1000000); /* 1 second */ + /* skip if no limit set */ + if (p->p_memstat_memlimit <= 0) { + continue; } - } -#endif /* !CONFIG_JETSAM */ - if (level_snapshot != kVMPressureNormal) { - /* - * Check to see if we are still in the 'resting' period - * after having notified all clients interested in - * a particular pressure level. - */ + footprint_in_bytes = get_task_phys_footprint(p->task); + memlimit_in_bytes = (((uint64_t)p->p_memstat_memlimit) * 1024ULL * 1024ULL); /* convert MB to bytes */ + skip = (footprint_in_bytes <= memlimit_in_bytes); + +#if CONFIG_FREEZE + if (!skip) { + if (p->p_memstat_state & P_MEMSTAT_LOCKED) { + skip = TRUE; + } else { + skip = FALSE; + } + } +#endif + + if (skip) { + continue; + } else { + if (memorystatus_jetsam_snapshot_count == 0) { + memorystatus_init_jetsam_snapshot_locked(NULL, 0); + new_snapshot = TRUE; + } + + if (proc_ref_locked(p) == p) { + /* + * Mark as terminated so that if exit1() indicates success, but the process (for example) + * is blocked in task_exception_notify(), it'll be skipped if encountered again - see + * . This is cheaper than examining P_LEXIT, which requires the + * acquisition of the proc lock. + */ + p->p_memstat_state |= P_MEMSTAT_TERMINATED; + + proc_list_unlock(); + } else { + /* + * We need to restart the search again because + * proc_ref_locked _can_ drop the proc_list lock + * and we could have lost our stored next_p via + * an exit() on another core. + */ + i = 0; + next_p = memorystatus_get_first_proc_locked(&i, TRUE); + continue; + } - level_snapshot = memorystatus_vm_pressure_level; + footprint_in_bytes = 0; + freed_mem = memorystatus_kill_proc(p, kMemorystatusKilledHiwat, jetsam_reason, &killed, &footprint_in_bytes); /* purged and/or killed 'p' */ + + /* Success? */ + if (freed_mem) { + if (killed == FALSE) { + /* purged 'p'..don't reset HWM candidate count */ + *purged = TRUE; - if (level_snapshot == kVMPressureWarning || level_snapshot == kVMPressureUrgent) { - if (next_warning_notification_sent_at_ts) { - if (curr_ts < next_warning_notification_sent_at_ts) { - delay(INTER_NOTIFICATION_DELAY * 4 /* 1 sec */); - return KERN_SUCCESS; + proc_list_lock(); + p->p_memstat_state &= ~P_MEMSTAT_TERMINATED; + proc_list_unlock(); + } else { + *memory_reclaimed = footprint_in_bytes; } + proc_rele(p); + goto exit; + } + /* + * Failure - first unwind the state, + * then fall through to restart the search. + */ + proc_list_lock(); + proc_rele_locked(p); + p->p_memstat_state &= ~P_MEMSTAT_TERMINATED; + p->p_memstat_state |= P_MEMSTAT_ERROR; + *errors += 1; - next_warning_notification_sent_at_ts = 0; - memorystatus_klist_reset_all_for_level(kVMPressureWarning); - } - } else if (level_snapshot == kVMPressureCritical) { - if (next_critical_notification_sent_at_ts) { - if (curr_ts < next_critical_notification_sent_at_ts) { - delay(INTER_NOTIFICATION_DELAY * 4 /* 1 sec */); - return KERN_SUCCESS; - } - next_critical_notification_sent_at_ts = 0; - memorystatus_klist_reset_all_for_level(kVMPressureCritical); - } + i = 0; + next_p = memorystatus_get_first_proc_locked(&i, TRUE); } } - while (1) { - /* - * There is a race window here. But it's not clear - * how much we benefit from having extra synchronization. - */ - level_snapshot = memorystatus_vm_pressure_level; + proc_list_unlock(); - if (prev_level_snapshot > level_snapshot) { - /* - * Pressure decreased? Let's take a little breather - * and see if this condition stays. - */ - if (smoothing_window_started == FALSE) { - smoothing_window_started = TRUE; - microuptime(&smoothing_window_start_tstamp); - } +exit: + os_reason_free(jetsam_reason); - microuptime(&curr_tstamp); - timevalsub(&curr_tstamp, &smoothing_window_start_tstamp); - elapsed_msecs = curr_tstamp.tv_sec * 1000 + curr_tstamp.tv_usec / 1000; + if (!killed) { + *memory_reclaimed = 0; - if (elapsed_msecs < VM_PRESSURE_DECREASED_SMOOTHING_PERIOD) { - delay(INTER_NOTIFICATION_DELAY); - continue; - } + /* Clear snapshot if freshly captured and no target was found */ + if (new_snapshot) { + proc_list_lock(); + memorystatus_jetsam_snapshot->entry_count = memorystatus_jetsam_snapshot_count = 0; + proc_list_unlock(); } + } + + KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_JETSAM_HIWAT) | DBG_FUNC_END, + memorystatus_available_pages, killed ? aPid : 0, killed, *memory_reclaimed, 0); - prev_level_snapshot = level_snapshot; - smoothing_window_started = FALSE; + return killed; +} - memorystatus_klist_lock(); - kn_max = vm_pressure_select_optimal_candidate_to_notify(&memorystatus_klist, level_snapshot, target_foreground_process); +/* + * Jetsam a process pinned in the elevated band. + * + * Return: true -- a pinned process was jetsammed + * false -- no pinned process was jetsammed + */ +boolean_t +memorystatus_kill_elevated_process(uint32_t cause, os_reason_t jetsam_reason, unsigned int band, int aggr_count, uint32_t *errors, uint64_t *memory_reclaimed) +{ + pid_t aPid = 0; + proc_t p = PROC_NULL, next_p = PROC_NULL; + boolean_t new_snapshot = FALSE, killed = FALSE; + int kill_count = 0; + uint32_t aPid_ep; + uint64_t killtime = 0; + clock_sec_t tv_sec; + clock_usec_t tv_usec; + uint32_t tv_msec; + uint64_t footprint_of_killed_proc = 0; - if (kn_max == NULL) { - memorystatus_klist_unlock(); - /* - * No more level-based clients to notify. - * - * Start the 'resting' window within which clients will not be re-notified. - */ + KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_JETSAM) | DBG_FUNC_START, + memorystatus_available_pages, 0, 0, 0, 0); - if (level_snapshot != kVMPressureNormal) { - if (level_snapshot == kVMPressureWarning || level_snapshot == kVMPressureUrgent) { - nanoseconds_to_absolutetime(WARNING_NOTIFICATION_RESTING_PERIOD * NSEC_PER_SEC, &curr_ts); +#if CONFIG_FREEZE + boolean_t consider_frozen_only = FALSE; - /* Next warning notification (if nothing changes) won't be sent before...*/ - next_warning_notification_sent_at_ts = mach_absolute_time() + curr_ts; - } + if (band == (unsigned int) memorystatus_freeze_jetsam_band) { + consider_frozen_only = TRUE; + } +#endif /* CONFIG_FREEZE */ - if (level_snapshot == kVMPressureCritical) { - nanoseconds_to_absolutetime(CRITICAL_NOTIFICATION_RESTING_PERIOD * NSEC_PER_SEC, &curr_ts); + proc_list_lock(); - /* Next critical notification (if nothing changes) won't be sent before...*/ - next_critical_notification_sent_at_ts = mach_absolute_time() + curr_ts; - } - } - return KERN_FAILURE; - } + next_p = memorystatus_get_first_proc_locked(&band, FALSE); + while (next_p) { + p = next_p; + next_p = memorystatus_get_next_proc_locked(&band, p, FALSE); - target_proc = knote_get_kq(kn_max)->kq_p; + aPid = p->p_pid; + aPid_ep = p->p_memstat_effectivepriority; - proc_list_lock(); - if (target_proc != proc_ref_locked(target_proc)) { - target_proc = PROC_NULL; - proc_list_unlock(); - memorystatus_klist_unlock(); + /* + * Only pick a process pinned in this elevated band + */ + if (!(p->p_memstat_state & P_MEMSTAT_USE_ELEVATED_INACTIVE_BAND)) { continue; } - proc_list_unlock(); - - target_pid = target_proc->p_pid; - - task = (struct task *)(target_proc->task); - - if (level_snapshot != kVMPressureNormal) { - if (level_snapshot == kVMPressureWarning || level_snapshot == kVMPressureUrgent) { - if (is_knote_registered_modify_task_pressure_bits(kn_max, NOTE_MEMORYSTATUS_PRESSURE_WARN, task, 0, kVMPressureWarning) == TRUE) { - found_candidate = TRUE; - } - } else { - if (level_snapshot == kVMPressureCritical) { - if (is_knote_registered_modify_task_pressure_bits(kn_max, NOTE_MEMORYSTATUS_PRESSURE_CRITICAL, task, 0, kVMPressureCritical) == TRUE) { - found_candidate = TRUE; - } - } - } - } else { - if (kn_max->kn_sfflags & NOTE_MEMORYSTATUS_PRESSURE_NORMAL) { - task_clear_has_been_notified(task, kVMPressureWarning); - task_clear_has_been_notified(task, kVMPressureCritical); - found_candidate = TRUE; - } + if (p->p_memstat_state & (P_MEMSTAT_ERROR | P_MEMSTAT_TERMINATED)) { + continue; } - if (found_candidate == FALSE) { - proc_rele(target_proc); - memorystatus_klist_unlock(); +#if CONFIG_FREEZE + if (consider_frozen_only && !(p->p_memstat_state & P_MEMSTAT_FROZEN)) { continue; } - SLIST_FOREACH_SAFE(kn_cur, &memorystatus_klist, kn_selnext, kn_temp) { - int knote_pressure_level = convert_internal_pressure_level_to_dispatch_level(level_snapshot); - - if (is_knote_registered_modify_task_pressure_bits(kn_cur, knote_pressure_level, task, 0, level_snapshot) == TRUE) { - proc_t knote_proc = knote_get_kq(kn_cur)->kq_p; - pid_t knote_pid = knote_proc->p_pid; - if (knote_pid == target_pid) { - KNOTE_DETACH(&memorystatus_klist, kn_cur); - KNOTE_ATTACH(&dispatch_klist, kn_cur); - } - } + if (p->p_memstat_state & P_MEMSTAT_LOCKED) { + continue; } +#endif /* CONFIG_FREEZE */ - KNOTE(&dispatch_klist, (level_snapshot != kVMPressureNormal) ? kMemorystatusPressure : kMemorystatusNoPressure); +#if DEVELOPMENT || DEBUG + MEMORYSTATUS_DEBUG(1, "jetsam: elevated%d process pid %d [%s] - memorystatus_available_pages: %d\n", + aggr_count, + aPid, (*p->p_name ? p->p_name : "unknown"), + memorystatus_available_pages); +#endif /* DEVELOPMENT || DEBUG */ - SLIST_FOREACH_SAFE(kn_cur, &dispatch_klist, kn_selnext, kn_temp) { - KNOTE_DETACH(&dispatch_klist, kn_cur); - KNOTE_ATTACH(&memorystatus_klist, kn_cur); + if (memorystatus_jetsam_snapshot_count == 0) { + memorystatus_init_jetsam_snapshot_locked(NULL, 0); + new_snapshot = TRUE; } - memorystatus_klist_unlock(); + p->p_memstat_state |= P_MEMSTAT_TERMINATED; + + killtime = mach_absolute_time(); + absolutetime_to_microtime(killtime, &tv_sec, &tv_usec); + tv_msec = tv_usec / 1000; - microuptime(&target_proc->vm_pressure_last_notify_tstamp); - proc_rele(target_proc); + memorystatus_update_jetsam_snapshot_entry_locked(p, cause, killtime); - if (memorystatus_manual_testing_on == TRUE && target_foreground_process == TRUE) { - break; - } + if (proc_ref_locked(p) == p) { + proc_list_unlock(); - if (memorystatus_manual_testing_on == TRUE) { /* - * Testing out the pressure notification scheme. - * No need for delays etc. + * memorystatus_do_kill drops a reference, so take another one so we can + * continue to use this exit reason even after memorystatus_do_kill() + * returns */ - } else { - uint32_t sleep_interval = INTER_NOTIFICATION_DELAY; -#if CONFIG_JETSAM - unsigned int page_delta = 0; - unsigned int skip_delay_page_threshold = 0; - - assert(memorystatus_available_pages_pressure >= memorystatus_available_pages_critical_base); + os_reason_ref(jetsam_reason); + killed = memorystatus_do_kill(p, cause, jetsam_reason, &footprint_of_killed_proc); - page_delta = (memorystatus_available_pages_pressure - memorystatus_available_pages_critical_base) / 2; - skip_delay_page_threshold = memorystatus_available_pages_pressure - page_delta; + os_log_with_startup_serial(OS_LOG_DEFAULT, "%lu.%03d memorystatus: killing_top_process_elevated%d pid %d [%s] (%s %d) %lluKB - memorystatus_available_pages: %llu\n", + (unsigned long)tv_sec, tv_msec, + aggr_count, + aPid, ((p && *p->p_name) ? p->p_name : "unknown"), + memorystatus_kill_cause_name[cause], aPid_ep, + footprint_of_killed_proc >> 10, (uint64_t)memorystatus_available_pages); - if (memorystatus_available_pages <= skip_delay_page_threshold) { - /* - * We are nearing the critcal mark fast and can't afford to wait between - * notifications. - */ - sleep_interval = 0; + /* Success? */ + if (killed) { + *memory_reclaimed = footprint_of_killed_proc; + proc_rele(p); + kill_count++; + goto exit; } -#endif /* CONFIG_JETSAM */ - if (sleep_interval) { - delay(sleep_interval); - } + /* + * Failure - first unwind the state, + * then fall through to restart the search. + */ + proc_list_lock(); + proc_rele_locked(p); + p->p_memstat_state &= ~P_MEMSTAT_TERMINATED; + p->p_memstat_state |= P_MEMSTAT_ERROR; + *errors += 1; } - } - - return KERN_SUCCESS; -} -vm_pressure_level_t -convert_internal_pressure_level_to_dispatch_level(vm_pressure_level_t internal_pressure_level) -{ - vm_pressure_level_t dispatch_level = NOTE_MEMORYSTATUS_PRESSURE_NORMAL; + /* + * Failure - restart the search. + * + * We might have raced with "p" exiting on another core, resulting in no + * ref on "p". Or, we may have failed to kill "p". + * + * Either way, we fall thru to here, leaving the proc in the + * P_MEMSTAT_TERMINATED state or P_MEMSTAT_ERROR state. + * + * And, we hold the the proc_list_lock at this point. + */ - switch (internal_pressure_level) { - case kVMPressureNormal: - { - dispatch_level = NOTE_MEMORYSTATUS_PRESSURE_NORMAL; - break; + next_p = memorystatus_get_first_proc_locked(&band, FALSE); } - case kVMPressureWarning: - case kVMPressureUrgent: - { - dispatch_level = NOTE_MEMORYSTATUS_PRESSURE_WARN; - break; - } + proc_list_unlock(); - case kVMPressureCritical: - { - dispatch_level = NOTE_MEMORYSTATUS_PRESSURE_CRITICAL; - break; - } +exit: + os_reason_free(jetsam_reason); - default: - break; + if (kill_count == 0) { + *memory_reclaimed = 0; + + /* Clear snapshot if freshly captured and no target was found */ + if (new_snapshot) { + proc_list_lock(); + memorystatus_jetsam_snapshot->entry_count = memorystatus_jetsam_snapshot_count = 0; + proc_list_unlock(); + } } - return dispatch_level; + KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_JETSAM) | DBG_FUNC_END, + memorystatus_available_pages, killed ? aPid : 0, kill_count, *memory_reclaimed, 0); + + return killed; } -static int -sysctl_memorystatus_vm_pressure_level SYSCTL_HANDLER_ARGS +static boolean_t +memorystatus_kill_process_async(pid_t victim_pid, uint32_t cause) { -#pragma unused(arg1, arg2, oidp) -#if CONFIG_EMBEDDED - int error = 0; - - error = priv_check_cred(kauth_cred_get(), PRIV_VM_PRESSURE, 0); - if (error) { - return error; + /* + * TODO: allow a general async path + * + * NOTE: If a new async kill cause is added, make sure to update memorystatus_thread() to + * add the appropriate exit reason code mapping. + */ + if ((victim_pid != -1) || + (cause != kMemorystatusKilledVMPageShortage && + cause != kMemorystatusKilledVMCompressorThrashing && + cause != kMemorystatusKilledVMCompressorSpaceShortage && + cause != kMemorystatusKilledFCThrashing && + cause != kMemorystatusKilledZoneMapExhaustion)) { + return FALSE; } -#endif /* CONFIG_EMBEDDED */ - vm_pressure_level_t dispatch_level = convert_internal_pressure_level_to_dispatch_level(memorystatus_vm_pressure_level); - - return SYSCTL_OUT(req, &dispatch_level, sizeof(dispatch_level)); + kill_under_pressure_cause = cause; + memorystatus_thread_wake(); + return TRUE; } -#if DEBUG || DEVELOPMENT - -SYSCTL_PROC(_kern, OID_AUTO, memorystatus_vm_pressure_level, CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_LOCKED, - 0, 0, &sysctl_memorystatus_vm_pressure_level, "I", ""); - -#else /* DEBUG || DEVELOPMENT */ - -SYSCTL_PROC(_kern, OID_AUTO, memorystatus_vm_pressure_level, CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_LOCKED | CTLFLAG_MASKED, - 0, 0, &sysctl_memorystatus_vm_pressure_level, "I", ""); - -#endif /* DEBUG || DEVELOPMENT */ - - -static int -sysctl_memorypressure_manual_trigger SYSCTL_HANDLER_ARGS +boolean_t +memorystatus_kill_on_VM_compressor_space_shortage(boolean_t async) { -#pragma unused(arg1, arg2) - - int level = 0; - int error = 0; - int pressure_level = 0; - int trigger_request = 0; - int force_purge; + if (async) { + return memorystatus_kill_process_async(-1, kMemorystatusKilledVMCompressorSpaceShortage); + } else { + os_reason_t jetsam_reason = os_reason_create(OS_REASON_JETSAM, JETSAM_REASON_MEMORY_VMCOMPRESSOR_SPACE_SHORTAGE); + if (jetsam_reason == OS_REASON_NULL) { + printf("memorystatus_kill_on_VM_compressor_space_shortage -- sync: failed to allocate jetsam reason\n"); + } - error = sysctl_handle_int(oidp, &level, 0, req); - if (error || !req->newptr) { - return error; + return memorystatus_kill_process_sync(-1, kMemorystatusKilledVMCompressorSpaceShortage, jetsam_reason); } +} - memorystatus_manual_testing_on = TRUE; - - trigger_request = (level >> 16) & 0xFFFF; - pressure_level = (level & 0xFFFF); +#if CONFIG_JETSAM +boolean_t +memorystatus_kill_on_VM_compressor_thrashing(boolean_t async) +{ + if (async) { + return memorystatus_kill_process_async(-1, kMemorystatusKilledVMCompressorThrashing); + } else { + os_reason_t jetsam_reason = os_reason_create(OS_REASON_JETSAM, JETSAM_REASON_MEMORY_VMCOMPRESSOR_THRASHING); + if (jetsam_reason == OS_REASON_NULL) { + printf("memorystatus_kill_on_VM_compressor_thrashing -- sync: failed to allocate jetsam reason\n"); + } - if (trigger_request < TEST_LOW_MEMORY_TRIGGER_ONE || - trigger_request > TEST_LOW_MEMORY_PURGEABLE_TRIGGER_ALL) { - return EINVAL; - } - switch (pressure_level) { - case NOTE_MEMORYSTATUS_PRESSURE_NORMAL: - case NOTE_MEMORYSTATUS_PRESSURE_WARN: - case NOTE_MEMORYSTATUS_PRESSURE_CRITICAL: - break; - default: - return EINVAL; + return memorystatus_kill_process_sync(-1, kMemorystatusKilledVMCompressorThrashing, jetsam_reason); } +} - /* - * The pressure level is being set from user-space. - * And user-space uses the constants in sys/event.h - * So we translate those events to our internal levels here. - */ - if (pressure_level == NOTE_MEMORYSTATUS_PRESSURE_NORMAL) { - memorystatus_manual_testing_level = kVMPressureNormal; - force_purge = 0; - } else if (pressure_level == NOTE_MEMORYSTATUS_PRESSURE_WARN) { - memorystatus_manual_testing_level = kVMPressureWarning; - force_purge = vm_pageout_state.memorystatus_purge_on_warning; - } else if (pressure_level == NOTE_MEMORYSTATUS_PRESSURE_CRITICAL) { - memorystatus_manual_testing_level = kVMPressureCritical; - force_purge = vm_pageout_state.memorystatus_purge_on_critical; - } - - memorystatus_vm_pressure_level = memorystatus_manual_testing_level; - - /* purge according to the new pressure level */ - switch (trigger_request) { - case TEST_PURGEABLE_TRIGGER_ONE: - case TEST_LOW_MEMORY_PURGEABLE_TRIGGER_ONE: - if (force_purge == 0) { - /* no purging requested */ - break; - } - vm_purgeable_object_purge_one_unlocked(force_purge); - break; - case TEST_PURGEABLE_TRIGGER_ALL: - case TEST_LOW_MEMORY_PURGEABLE_TRIGGER_ALL: - if (force_purge == 0) { - /* no purging requested */ - break; - } - while (vm_purgeable_object_purge_one_unlocked(force_purge)) { - ; +boolean_t +memorystatus_kill_on_VM_page_shortage(boolean_t async) +{ + if (async) { + return memorystatus_kill_process_async(-1, kMemorystatusKilledVMPageShortage); + } else { + os_reason_t jetsam_reason = os_reason_create(OS_REASON_JETSAM, JETSAM_REASON_MEMORY_VMPAGESHORTAGE); + if (jetsam_reason == OS_REASON_NULL) { + printf("memorystatus_kill_on_VM_page_shortage -- sync: failed to allocate jetsam reason\n"); } - break; - } - if ((trigger_request == TEST_LOW_MEMORY_TRIGGER_ONE) || - (trigger_request == TEST_LOW_MEMORY_PURGEABLE_TRIGGER_ONE)) { - memorystatus_update_vm_pressure(TRUE); + return memorystatus_kill_process_sync(-1, kMemorystatusKilledVMPageShortage, jetsam_reason); } +} - if ((trigger_request == TEST_LOW_MEMORY_TRIGGER_ALL) || - (trigger_request == TEST_LOW_MEMORY_PURGEABLE_TRIGGER_ALL)) { - while (memorystatus_update_vm_pressure(FALSE) == KERN_SUCCESS) { - continue; +boolean_t +memorystatus_kill_on_FC_thrashing(boolean_t async) +{ + if (async) { + return memorystatus_kill_process_async(-1, kMemorystatusKilledFCThrashing); + } else { + os_reason_t jetsam_reason = os_reason_create(OS_REASON_JETSAM, JETSAM_REASON_MEMORY_FCTHRASHING); + if (jetsam_reason == OS_REASON_NULL) { + printf("memorystatus_kill_on_FC_thrashing -- sync: failed to allocate jetsam reason\n"); } + + return memorystatus_kill_process_sync(-1, kMemorystatusKilledFCThrashing, jetsam_reason); } +} - if (pressure_level == NOTE_MEMORYSTATUS_PRESSURE_NORMAL) { - memorystatus_manual_testing_on = FALSE; +boolean_t +memorystatus_kill_on_vnode_limit(void) +{ + os_reason_t jetsam_reason = os_reason_create(OS_REASON_JETSAM, JETSAM_REASON_VNODE); + if (jetsam_reason == OS_REASON_NULL) { + printf("memorystatus_kill_on_vnode_limit: failed to allocate jetsam reason\n"); } - return 0; + return memorystatus_kill_process_sync(-1, kMemorystatusKilledVnodes, jetsam_reason); } -SYSCTL_PROC(_kern, OID_AUTO, memorypressure_manual_trigger, CTLTYPE_INT | CTLFLAG_WR | CTLFLAG_LOCKED | CTLFLAG_MASKED, - 0, 0, &sysctl_memorypressure_manual_trigger, "I", ""); - +#endif /* CONFIG_JETSAM */ -SYSCTL_INT(_kern, OID_AUTO, memorystatus_purge_on_warning, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_pageout_state.memorystatus_purge_on_warning, 0, ""); -SYSCTL_INT(_kern, OID_AUTO, memorystatus_purge_on_urgent, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &vm_pageout_state.memorystatus_purge_on_urgent, 0, ""); -SYSCTL_INT(_kern, OID_AUTO, memorystatus_purge_on_critical, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &vm_pageout_state.memorystatus_purge_on_critical, 0, ""); +boolean_t +memorystatus_kill_on_zone_map_exhaustion(pid_t pid) +{ + boolean_t res = FALSE; + if (pid == -1) { + res = memorystatus_kill_process_async(-1, kMemorystatusKilledZoneMapExhaustion); + } else { + os_reason_t jetsam_reason = os_reason_create(OS_REASON_JETSAM, JETSAM_REASON_ZONE_MAP_EXHAUSTION); + if (jetsam_reason == OS_REASON_NULL) { + printf("memorystatus_kill_on_zone_map_exhaustion: failed to allocate jetsam reason\n"); + } -#if DEBUG || DEVELOPMENT -SYSCTL_UINT(_kern, OID_AUTO, memorystatus_vm_pressure_events_enabled, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_pressure_events_enabled, 0, ""); -#endif + res = memorystatus_kill_process_sync(pid, kMemorystatusKilledZoneMapExhaustion, jetsam_reason); + } + return res; +} -#endif /* VM_PRESSURE_EVENTS */ +void +memorystatus_on_pageout_scan_end(void) +{ + /* No-op */ +} /* Return both allocated and actual size, since there's a race between allocation and list compilation */ static int @@ -8586,6 +6127,7 @@ memorystatus_get_priority_pid(pid_t pid, user_addr_t buffer, size_t buffer_size) { int error = 0; memorystatus_priority_entry_t mp_entry; + kern_return_t ret; /* Validate inputs */ if ((pid == 0) || (buffer == USER_ADDR_NULL) || (buffer_size != sizeof(memorystatus_priority_entry_t))) { @@ -8603,7 +6145,11 @@ memorystatus_get_priority_pid(pid_t pid, user_addr_t buffer, size_t buffer_size) mp_entry.priority = p->p_memstat_effectivepriority; mp_entry.user_data = p->p_memstat_userdata; if (p->p_memstat_memlimit <= 0) { - task_get_phys_footprint_limit(p->task, &mp_entry.limit); + ret = task_get_phys_footprint_limit(p->task, &mp_entry.limit); + if (ret != KERN_SUCCESS) { + proc_rele(p); + return EINVAL; + } } else { mp_entry.limit = p->p_memstat_memlimit; } @@ -8701,19 +6247,6 @@ memorystatus_update_levels_locked(boolean_t critical_only) } } -#if DEBUG || DEVELOPMENT - if (memorystatus_jetsam_policy & kPolicyDiagnoseActive) { - memorystatus_available_pages_critical += memorystatus_jetsam_policy_offset_pages_diagnostic; - - if (memorystatus_available_pages_critical > memorystatus_available_pages_pressure) { - /* - * The critical threshold must never exceed the pressure threshold - */ - memorystatus_available_pages_critical = memorystatus_available_pages_pressure; - } - } -#endif /* DEBUG || DEVELOPMENT */ - if (memorystatus_jetsam_policy & kPolicyMoreFree) { memorystatus_available_pages_critical += memorystatus_policy_more_free_offset_pages; } @@ -8724,11 +6257,6 @@ memorystatus_update_levels_locked(boolean_t critical_only) #if VM_PRESSURE_EVENTS memorystatus_available_pages_pressure = (pressure_threshold_percentage / delta_percentage) * memorystatus_delta; -#if DEBUG || DEVELOPMENT - if (memorystatus_jetsam_policy & kPolicyDiagnoseActive) { - memorystatus_available_pages_pressure += memorystatus_jetsam_policy_offset_pages_diagnostic; - } -#endif #endif } @@ -9236,6 +6764,9 @@ out: return error; } +memorystatus_internal_probabilities_t *memorystatus_global_probabilities_table = NULL; +size_t memorystatus_global_probabilities_size = 0; + static int memorystatus_cmd_grp_set_probabilities(user_addr_t buffer, size_t buffer_size) { @@ -9353,12 +6884,24 @@ memorystatus_cmd_grp_set_properties(int32_t flags, user_addr_t buffer, size_t bu * This routine is used to update a process's jetsam priority position and stored user_data. * It is not used for the setting of memory limits, which is why the last 6 args to the * memorystatus_update() call are 0 or FALSE. + * + * Flags passed into this call are used to distinguish the motivation behind a jetsam priority + * transition. By default, the kernel updates the process's original requested priority when + * no flag is passed. But when the MEMORYSTATUS_SET_PRIORITY_ASSERTION flag is used, the kernel + * updates the process's assertion driven priority. + * + * The assertion flag was introduced for use by the device's assertion mediator (eg: runningboardd). + * When an assertion is controlling a process's jetsam priority, it may conflict with that process's + * dirty/clean (active/inactive) jetsam state. The kernel attempts to resolve a priority transition + * conflict by reviewing the process state and then choosing the maximum jetsam band at play, + * eg: requested priority versus assertion priority. */ static int -memorystatus_cmd_set_priority_properties(pid_t pid, user_addr_t buffer, size_t buffer_size, __unused int32_t *retval) +memorystatus_cmd_set_priority_properties(pid_t pid, uint32_t flags, user_addr_t buffer, size_t buffer_size, __unused int32_t *retval) { int error = 0; + boolean_t is_assertion = FALSE; /* priority is driven by an assertion */ memorystatus_priority_properties_t mpp_entry; /* Validate inputs */ @@ -9366,6 +6909,22 @@ memorystatus_cmd_set_priority_properties(pid_t pid, user_addr_t buffer, size_t b return EINVAL; } + /* Validate flags */ + if (flags == 0) { + /* + * Default. This path updates requestedpriority. + */ + } else { + if (flags & ~(MEMORYSTATUS_SET_PRIORITY_ASSERTION)) { + /* + * Unsupported bit set in flag. + */ + return EINVAL; + } else if (flags & MEMORYSTATUS_SET_PRIORITY_ASSERTION) { + is_assertion = TRUE; + } + } + error = copyin(buffer, &mpp_entry, buffer_size); if (error == 0) { @@ -9381,7 +6940,12 @@ memorystatus_cmd_set_priority_properties(pid_t pid, user_addr_t buffer, size_t b return EPERM; } - error = memorystatus_update(p, mpp_entry.priority, mpp_entry.user_data, FALSE, FALSE, 0, 0, FALSE, FALSE); + if (is_assertion) { + os_log(OS_LOG_DEFAULT, "memorystatus: set assertion priority(%d) target %s:%d\n", + mpp_entry.priority, (*p->p_name ? p->p_name : "unknown"), p->p_pid); + } + + error = memorystatus_update(p, mpp_entry.priority, mpp_entry.user_data, is_assertion, FALSE, FALSE, 0, 0, FALSE, FALSE); proc_rele(p); } @@ -9408,6 +6972,34 @@ memorystatus_cmd_set_memlimit_properties(pid_t pid, user_addr_t buffer, size_t b return error; } +static void +memorystatus_get_memlimit_properties_internal(proc_t p, memorystatus_memlimit_properties_t* p_entry) +{ + memset(p_entry, 0, sizeof(memorystatus_memlimit_properties_t)); + + if (p->p_memstat_memlimit_active > 0) { + p_entry->memlimit_active = p->p_memstat_memlimit_active; + } else { + task_convert_phys_footprint_limit(-1, &p_entry->memlimit_active); + } + + if (p->p_memstat_state & P_MEMSTAT_MEMLIMIT_ACTIVE_FATAL) { + p_entry->memlimit_active_attr |= MEMORYSTATUS_MEMLIMIT_ATTR_FATAL; + } + + /* + * Get the inactive limit and attributes + */ + if (p->p_memstat_memlimit_inactive <= 0) { + task_convert_phys_footprint_limit(-1, &p_entry->memlimit_inactive); + } else { + p_entry->memlimit_inactive = p->p_memstat_memlimit_inactive; + } + if (p->p_memstat_state & P_MEMSTAT_MEMLIMIT_INACTIVE_FATAL) { + p_entry->memlimit_inactive_attr |= MEMORYSTATUS_MEMLIMIT_ATTR_FATAL; + } +} + /* * When getting the memlimit settings, we can't simply call task_get_phys_footprint_limit(). * That gets the proc's cached memlimit and there is no guarantee that the active/inactive @@ -9418,15 +7010,16 @@ memorystatus_cmd_set_memlimit_properties(pid_t pid, user_addr_t buffer, size_t b static int memorystatus_cmd_get_memlimit_properties(pid_t pid, user_addr_t buffer, size_t buffer_size, __unused int32_t *retval) { - int error = 0; - memorystatus_memlimit_properties_t mmp_entry; + memorystatus_memlimit_properties2_t mmp_entry; /* Validate inputs */ - if ((pid == 0) || (buffer == USER_ADDR_NULL) || (buffer_size != sizeof(memorystatus_memlimit_properties_t))) { + if ((pid == 0) || (buffer == USER_ADDR_NULL) || + ((buffer_size != sizeof(memorystatus_memlimit_properties_t)) && + (buffer_size != sizeof(memorystatus_memlimit_properties2_t)))) { return EINVAL; } - memset(&mmp_entry, 0, sizeof(memorystatus_memlimit_properties_t)); + memset(&mmp_entry, 0, sizeof(memorystatus_memlimit_properties2_t)); proc_t p = proc_find(pid); if (!p) { @@ -9438,30 +7031,21 @@ memorystatus_cmd_get_memlimit_properties(pid_t pid, user_addr_t buffer, size_t b * No locks taken since we hold a reference to the proc. */ - if (p->p_memstat_memlimit_active > 0) { - mmp_entry.memlimit_active = p->p_memstat_memlimit_active; - } else { - task_convert_phys_footprint_limit(-1, &mmp_entry.memlimit_active); - } - - if (p->p_memstat_state & P_MEMSTAT_MEMLIMIT_ACTIVE_FATAL) { - mmp_entry.memlimit_active_attr |= MEMORYSTATUS_MEMLIMIT_ATTR_FATAL; - } + memorystatus_get_memlimit_properties_internal(p, &mmp_entry.v1); +#if CONFIG_JETSAM +#if DEVELOPMENT || DEBUG /* - * Get the inactive limit and attributes + * Get the limit increased via SPI */ - if (p->p_memstat_memlimit_inactive <= 0) { - task_convert_phys_footprint_limit(-1, &mmp_entry.memlimit_inactive); - } else { - mmp_entry.memlimit_inactive = p->p_memstat_memlimit_inactive; - } - if (p->p_memstat_state & P_MEMSTAT_MEMLIMIT_INACTIVE_FATAL) { - mmp_entry.memlimit_inactive_attr |= MEMORYSTATUS_MEMLIMIT_ATTR_FATAL; - } + mmp_entry.memlimit_increase = roundToNearestMB(p->p_memlimit_increase); + mmp_entry.memlimit_increase_bytes = p->p_memlimit_increase; +#endif /* DEVELOPMENT || DEBUG */ +#endif /* CONFIG_JETSAM */ + proc_rele(p); - error = copyout(&mmp_entry, buffer, buffer_size); + int error = copyout(&mmp_entry, buffer, buffer_size); return error; } @@ -9586,87 +7170,21 @@ memorystatus_cmd_set_jetsam_memory_limit(pid_t pid, int32_t high_water_mark, __u #endif /* CONFIG_JETSAM */ static int -memorystatus_set_memlimit_properties(pid_t pid, memorystatus_memlimit_properties_t *entry) +memorystatus_set_memlimit_properties_internal(proc_t p, memorystatus_memlimit_properties_t *p_entry) { - int32_t memlimit_active; - boolean_t memlimit_active_is_fatal; - int32_t memlimit_inactive; - boolean_t memlimit_inactive_is_fatal; - uint32_t valid_attrs = 0; - int error = 0; - - proc_t p = proc_find(pid); - if (!p) { - return ESRCH; - } - - /* - * Check for valid attribute flags. - */ - valid_attrs |= (MEMORYSTATUS_MEMLIMIT_ATTR_FATAL); - if ((entry->memlimit_active_attr & (~valid_attrs)) != 0) { - proc_rele(p); - return EINVAL; - } - if ((entry->memlimit_inactive_attr & (~valid_attrs)) != 0) { - proc_rele(p); - return EINVAL; - } - - /* - * Setup the active memlimit properties - */ - memlimit_active = entry->memlimit_active; - if (entry->memlimit_active_attr & MEMORYSTATUS_MEMLIMIT_ATTR_FATAL) { - memlimit_active_is_fatal = TRUE; - } else { - memlimit_active_is_fatal = FALSE; - } - - /* - * Setup the inactive memlimit properties - */ - memlimit_inactive = entry->memlimit_inactive; - if (entry->memlimit_inactive_attr & MEMORYSTATUS_MEMLIMIT_ATTR_FATAL) { - memlimit_inactive_is_fatal = TRUE; - } else { - memlimit_inactive_is_fatal = FALSE; - } - - /* - * Setting a limit of <= 0 implies that the process has no - * high-water-mark and has no per-task-limit. That means - * the system_wide task limit is in place, which by the way, - * is always fatal. - */ - - if (memlimit_active <= 0) { - /* - * Enforce the fatal system_wide task limit while process is active. - */ - memlimit_active = -1; - memlimit_active_is_fatal = TRUE; - } - - if (memlimit_inactive <= 0) { - /* - * Enforce the fatal system_wide task limit while process is inactive. - */ - memlimit_inactive = -1; - memlimit_inactive_is_fatal = TRUE; - } + int error = 0; - proc_list_lock(); + LCK_MTX_ASSERT(proc_list_mlock, LCK_MTX_ASSERT_OWNED); /* * Store the active limit variants in the proc. */ - SET_ACTIVE_LIMITS_LOCKED(p, memlimit_active, memlimit_active_is_fatal); + SET_ACTIVE_LIMITS_LOCKED(p, p_entry->memlimit_active, p_entry->memlimit_active_attr); /* * Store the inactive limit variants in the proc. */ - SET_INACTIVE_LIMITS_LOCKED(p, memlimit_inactive, memlimit_inactive_is_fatal); + SET_INACTIVE_LIMITS_LOCKED(p, p_entry->memlimit_inactive, p_entry->memlimit_inactive_attr); /* * Enforce appropriate limit variant by updating the cached values @@ -9696,84 +7214,116 @@ memorystatus_set_memlimit_properties(pid_t pid, memorystatus_memlimit_properties DTRACE_MEMORYSTATUS2(memorystatus_set_memlimit, proc_t, p, int32_t, (p->p_memstat_memlimit > 0 ? p->p_memstat_memlimit : -1)); } - proc_list_unlock(); - proc_rele(p); - return error; } -/* - * Returns the jetsam priority (effective or requested) of the process - * associated with this task. - */ -int -proc_get_memstat_priority(proc_t p, boolean_t effective_priority) -{ - if (p) { - if (effective_priority) { - return p->p_memstat_effectivepriority; - } else { - return p->p_memstat_requestedpriority; - } - } - return 0; -} - static int -memorystatus_get_process_is_managed(pid_t pid, int *is_managed) +memorystatus_set_memlimit_properties(pid_t pid, memorystatus_memlimit_properties_t *entry) { - proc_t p = NULL; - - /* Validate inputs */ - if (pid == 0) { - return EINVAL; - } + memorystatus_memlimit_properties_t set_entry; - p = proc_find(pid); + proc_t p = proc_find(pid); if (!p) { return ESRCH; } - proc_list_lock(); - *is_managed = ((p->p_memstat_state & P_MEMSTAT_MANAGED) ? 1 : 0); - proc_rele_locked(p); - proc_list_unlock(); + /* + * Check for valid attribute flags. + */ + const uint32_t valid_attrs = MEMORYSTATUS_MEMLIMIT_ATTR_FATAL; + if ((entry->memlimit_active_attr & (~valid_attrs)) != 0) { + proc_rele(p); + return EINVAL; + } + if ((entry->memlimit_inactive_attr & (~valid_attrs)) != 0) { + proc_rele(p); + return EINVAL; + } - return 0; -} + /* + * Setup the active memlimit properties + */ + set_entry.memlimit_active = entry->memlimit_active; + set_entry.memlimit_active_attr = entry->memlimit_active_attr & MEMORYSTATUS_MEMLIMIT_ATTR_FATAL; -static int -memorystatus_set_process_is_managed(pid_t pid, boolean_t set_managed) -{ - proc_t p = NULL; + /* + * Setup the inactive memlimit properties + */ + set_entry.memlimit_inactive = entry->memlimit_inactive; + set_entry.memlimit_inactive_attr = entry->memlimit_inactive_attr & MEMORYSTATUS_MEMLIMIT_ATTR_FATAL; - /* Validate inputs */ - if (pid == 0) { - return EINVAL; - } + /* + * Setting a limit of <= 0 implies that the process has no + * high-water-mark and has no per-task-limit. That means + * the system_wide task limit is in place, which by the way, + * is always fatal. + */ - p = proc_find(pid); - if (!p) { - return ESRCH; + if (set_entry.memlimit_active <= 0) { + /* + * Enforce the fatal system_wide task limit while process is active. + */ + set_entry.memlimit_active = -1; + set_entry.memlimit_active_attr = MEMORYSTATUS_MEMLIMIT_ATTR_FATAL; + } +#if CONFIG_JETSAM +#if DEVELOPMENT || DEBUG + else { + /* add the current increase to it, for roots */ + set_entry.memlimit_active += roundToNearestMB(p->p_memlimit_increase); } +#endif /* DEVELOPMENT || DEBUG */ +#endif /* CONFIG_JETSAM */ - proc_list_lock(); - if (set_managed == TRUE) { - p->p_memstat_state |= P_MEMSTAT_MANAGED; - } else { - p->p_memstat_state &= ~P_MEMSTAT_MANAGED; + if (set_entry.memlimit_inactive <= 0) { + /* + * Enforce the fatal system_wide task limit while process is inactive. + */ + set_entry.memlimit_inactive = -1; + set_entry.memlimit_inactive_attr = MEMORYSTATUS_MEMLIMIT_ATTR_FATAL; } - proc_rele_locked(p); +#if CONFIG_JETSAM +#if DEVELOPMENT || DEBUG + else { + /* add the current increase to it, for roots */ + set_entry.memlimit_inactive += roundToNearestMB(p->p_memlimit_increase); + } +#endif /* DEVELOPMENT || DEBUG */ +#endif /* CONFIG_JETSAM */ + + proc_list_lock(); + + int error = memorystatus_set_memlimit_properties_internal(p, &set_entry); + proc_list_unlock(); + proc_rele(p); + + return error; +} +/* + * Returns the jetsam priority (effective or requested) of the process + * associated with this task. + */ +int +proc_get_memstat_priority(proc_t p, boolean_t effective_priority) +{ + if (p) { + if (effective_priority) { + return p->p_memstat_effectivepriority; + } else { + return p->p_memstat_requestedpriority; + } + } return 0; } static int -memorystatus_get_process_is_freezable(pid_t pid, int *is_freezable) +memorystatus_get_process_is_managed(pid_t pid, int *is_managed) { - proc_t p = PROC_NULL; + proc_t p = NULL; + /* Validate inputs */ if (pid == 0) { return EINVAL; } @@ -9783,17 +7333,8 @@ memorystatus_get_process_is_freezable(pid_t pid, int *is_freezable) return ESRCH; } - /* - * Only allow this on the current proc for now. - * We can check for privileges and allow targeting another process in the future. - */ - if (p != current_proc()) { - proc_rele(p); - return EPERM; - } - proc_list_lock(); - *is_freezable = ((p->p_memstat_state & P_MEMSTAT_FREEZE_DISABLED) ? 0 : 1); + *is_managed = ((p->p_memstat_state & P_MEMSTAT_MANAGED) ? 1 : 0); proc_rele_locked(p); proc_list_unlock(); @@ -9801,10 +7342,11 @@ memorystatus_get_process_is_freezable(pid_t pid, int *is_freezable) } static int -memorystatus_set_process_is_freezable(pid_t pid, boolean_t is_freezable) +memorystatus_set_process_is_managed(pid_t pid, boolean_t set_managed) { - proc_t p = PROC_NULL; + proc_t p = NULL; + /* Validate inputs */ if (pid == 0) { return EINVAL; } @@ -9814,25 +7356,17 @@ memorystatus_set_process_is_freezable(pid_t pid, boolean_t is_freezable) return ESRCH; } - /* - * Only allow this on the current proc for now. - * We can check for privileges and allow targeting another process in the future. - */ - if (p != current_proc()) { - proc_rele(p); - return EPERM; - } - proc_list_lock(); - if (is_freezable == FALSE) { - /* Freeze preference set to FALSE. Set the P_MEMSTAT_FREEZE_DISABLED bit. */ - p->p_memstat_state |= P_MEMSTAT_FREEZE_DISABLED; - printf("memorystatus_set_process_is_freezable: disabling freeze for pid %d [%s]\n", - p->p_pid, (*p->p_name ? p->p_name : "unknown")); - } else { + if (set_managed == TRUE) { + p->p_memstat_state |= P_MEMSTAT_MANAGED; + /* + * The P_MEMSTAT_MANAGED bit is set by assertiond for Apps. + * Also opt them in to being frozen (they might have started + * off with the P_MEMSTAT_FREEZE_DISABLED bit set.) + */ p->p_memstat_state &= ~P_MEMSTAT_FREEZE_DISABLED; - printf("memorystatus_set_process_is_freezable: enabling freeze for pid %d [%s]\n", - p->p_pid, (*p->p_name ? p->p_name : "unknown")); + } else { + p->p_memstat_state &= ~P_MEMSTAT_MANAGED; } proc_rele_locked(p); proc_list_unlock(); @@ -9848,8 +7382,8 @@ memorystatus_control(struct proc *p __unused, struct memorystatus_control_args * os_reason_t jetsam_reason = OS_REASON_NULL; #if !CONFIG_JETSAM - #pragma unused(ret) - #pragma unused(jetsam_reason) + #pragma unused(ret) + #pragma unused(jetsam_reason) #endif /* We don't need entitlements if we're setting/ querying the freeze preference for a process. Skip the check below. */ @@ -9879,7 +7413,7 @@ memorystatus_control(struct proc *p __unused, struct memorystatus_control_args * error = memorystatus_cmd_get_priority_list(args->pid, args->buffer, args->buffersize, ret); break; case MEMORYSTATUS_CMD_SET_PRIORITY_PROPERTIES: - error = memorystatus_cmd_set_priority_properties(args->pid, args->buffer, args->buffersize, ret); + error = memorystatus_cmd_set_priority_properties(args->pid, args->flags, args->buffer, args->buffersize, ret); break; case MEMORYSTATUS_CMD_SET_MEMLIMIT_PROPERTIES: error = memorystatus_cmd_set_memlimit_properties(args->pid, args->buffer, args->buffersize, ret); @@ -9957,6 +7491,10 @@ memorystatus_control(struct proc *p __unused, struct memorystatus_control_args * memorystatus_aggressive_jetsam_lenient = FALSE; error = 0; break; + case MEMORYSTATUS_CMD_GET_AGGRESSIVE_JETSAM_LENIENT_MODE: + *ret = (memorystatus_aggressive_jetsam_lenient ? 1 : 0); + error = 0; + break; case MEMORYSTATUS_CMD_PRIVILEGED_LISTENER_ENABLE: case MEMORYSTATUS_CMD_PRIVILEGED_LISTENER_DISABLE: error = memorystatus_low_mem_privileged_listener(args->command); @@ -9974,6 +7512,7 @@ memorystatus_control(struct proc *p __unused, struct memorystatus_control_args * error = memorystatus_get_process_is_managed(args->pid, ret); break; +#if CONFIG_FREEZE case MEMORYSTATUS_CMD_SET_PROCESS_IS_FREEZABLE: error = memorystatus_set_process_is_freezable(args->pid, args->flags ? TRUE : FALSE); break; @@ -9982,7 +7521,6 @@ memorystatus_control(struct proc *p __unused, struct memorystatus_control_args * error = memorystatus_get_process_is_freezable(args->pid, ret); break; -#if CONFIG_FREEZE #if DEVELOPMENT || DEBUG case MEMORYSTATUS_CMD_FREEZER_CONTROL: error = memorystatus_freezer_control(args->flags, args->buffer, args->buffersize, ret); @@ -9990,6 +7528,14 @@ memorystatus_control(struct proc *p __unused, struct memorystatus_control_args * #endif /* DEVELOPMENT || DEBUG */ #endif /* CONFIG_FREEZE */ +#if CONFIG_JETSAM +#if DEVELOPMENT || DEBUG + case MEMORYSTATUS_CMD_INCREASE_JETSAM_TASK_LIMIT: + error = memorystatus_cmd_increase_jetsam_task_limit(args->pid, args->flags); + break; +#endif /* DEVELOPMENT */ +#endif /* CONFIG_JETSAM */ + default: break; } @@ -9998,263 +7544,6 @@ out: return error; } - -static int -filt_memorystatusattach(struct knote *kn, __unused struct kevent_internal_s *kev) -{ - int error; - - kn->kn_flags |= EV_CLEAR; - error = memorystatus_knote_register(kn); - if (error) { - kn->kn_flags = EV_ERROR; - kn->kn_data = error; - } - return 0; -} - -static void -filt_memorystatusdetach(struct knote *kn) -{ - memorystatus_knote_unregister(kn); -} - -static int -filt_memorystatus(struct knote *kn __unused, long hint) -{ - if (hint) { - switch (hint) { - case kMemorystatusNoPressure: - if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PRESSURE_NORMAL) { - kn->kn_fflags = NOTE_MEMORYSTATUS_PRESSURE_NORMAL; - } - break; - case kMemorystatusPressure: - if (memorystatus_vm_pressure_level == kVMPressureWarning || memorystatus_vm_pressure_level == kVMPressureUrgent) { - if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PRESSURE_WARN) { - kn->kn_fflags = NOTE_MEMORYSTATUS_PRESSURE_WARN; - } - } else if (memorystatus_vm_pressure_level == kVMPressureCritical) { - if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PRESSURE_CRITICAL) { - kn->kn_fflags = NOTE_MEMORYSTATUS_PRESSURE_CRITICAL; - } - } - break; - case kMemorystatusLowSwap: - if (kn->kn_sfflags & NOTE_MEMORYSTATUS_LOW_SWAP) { - kn->kn_fflags = NOTE_MEMORYSTATUS_LOW_SWAP; - } - break; - - case kMemorystatusProcLimitWarn: - if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_WARN) { - kn->kn_fflags = NOTE_MEMORYSTATUS_PROC_LIMIT_WARN; - } - break; - - case kMemorystatusProcLimitCritical: - if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL) { - kn->kn_fflags = NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL; - } - break; - - default: - break; - } - } - -#if 0 - if (kn->kn_fflags != 0) { - proc_t knote_proc = knote_get_kq(kn)->kq_p; - pid_t knote_pid = knote_proc->p_pid; - - printf("filt_memorystatus: sending kn 0x%lx (event 0x%x) for pid (%d)\n", - (unsigned long)kn, kn->kn_fflags, knote_pid); - } -#endif - - return kn->kn_fflags != 0; -} - -static int -filt_memorystatustouch(struct knote *kn, struct kevent_internal_s *kev) -{ - int res; - int prev_kn_sfflags = 0; - - memorystatus_klist_lock(); - - /* - * copy in new kevent settings - * (saving the "desired" data and fflags). - */ - - prev_kn_sfflags = kn->kn_sfflags; - kn->kn_sfflags = (kev->fflags & EVFILT_MEMORYSTATUS_ALL_MASK); - -#if !CONFIG_EMBEDDED - /* - * Only on desktop do we restrict notifications to - * one per active/inactive state (soft limits only). - */ - if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_WARN) { - /* - * Is there previous state to preserve? - */ - if (prev_kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_WARN) { - /* - * This knote was previously interested in proc_limit_warn, - * so yes, preserve previous state. - */ - if (prev_kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_WARN_ACTIVE) { - kn->kn_sfflags |= NOTE_MEMORYSTATUS_PROC_LIMIT_WARN_ACTIVE; - } - if (prev_kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_WARN_INACTIVE) { - kn->kn_sfflags |= NOTE_MEMORYSTATUS_PROC_LIMIT_WARN_INACTIVE; - } - } else { - /* - * This knote was not previously interested in proc_limit_warn, - * but it is now. Set both states. - */ - kn->kn_sfflags |= NOTE_MEMORYSTATUS_PROC_LIMIT_WARN_ACTIVE; - kn->kn_sfflags |= NOTE_MEMORYSTATUS_PROC_LIMIT_WARN_INACTIVE; - } - } - - if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL) { - /* - * Is there previous state to preserve? - */ - if (prev_kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL) { - /* - * This knote was previously interested in proc_limit_critical, - * so yes, preserve previous state. - */ - if (prev_kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL_ACTIVE) { - kn->kn_sfflags |= NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL_ACTIVE; - } - if (prev_kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL_INACTIVE) { - kn->kn_sfflags |= NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL_INACTIVE; - } - } else { - /* - * This knote was not previously interested in proc_limit_critical, - * but it is now. Set both states. - */ - kn->kn_sfflags |= NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL_ACTIVE; - kn->kn_sfflags |= NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL_INACTIVE; - } - } -#endif /* !CONFIG_EMBEDDED */ - - /* - * reset the output flags based on a - * combination of the old events and - * the new desired event list. - */ - //kn->kn_fflags &= kn->kn_sfflags; - - res = (kn->kn_fflags != 0); - - memorystatus_klist_unlock(); - - return res; -} - -static int -filt_memorystatusprocess(struct knote *kn, struct filt_process_s *data, struct kevent_internal_s *kev) -{ -#pragma unused(data) - int res; - - memorystatus_klist_lock(); - res = (kn->kn_fflags != 0); - if (res) { - *kev = kn->kn_kevent; - kn->kn_flags |= EV_CLEAR; /* automatic */ - kn->kn_fflags = 0; - kn->kn_data = 0; - } - memorystatus_klist_unlock(); - - return res; -} - -static void -memorystatus_klist_lock(void) -{ - lck_mtx_lock(&memorystatus_klist_mutex); -} - -static void -memorystatus_klist_unlock(void) -{ - lck_mtx_unlock(&memorystatus_klist_mutex); -} - -void -memorystatus_kevent_init(lck_grp_t *grp, lck_attr_t *attr) -{ - lck_mtx_init(&memorystatus_klist_mutex, grp, attr); - klist_init(&memorystatus_klist); -} - -int -memorystatus_knote_register(struct knote *kn) -{ - int error = 0; - - memorystatus_klist_lock(); - - /* - * Support only userspace visible flags. - */ - if ((kn->kn_sfflags & EVFILT_MEMORYSTATUS_ALL_MASK) == (unsigned int) kn->kn_sfflags) { -#if !CONFIG_EMBEDDED - if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_WARN) { - kn->kn_sfflags |= NOTE_MEMORYSTATUS_PROC_LIMIT_WARN_ACTIVE; - kn->kn_sfflags |= NOTE_MEMORYSTATUS_PROC_LIMIT_WARN_INACTIVE; - } - - if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL) { - kn->kn_sfflags |= NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL_ACTIVE; - kn->kn_sfflags |= NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL_INACTIVE; - } -#endif /* !CONFIG_EMBEDDED */ - - KNOTE_ATTACH(&memorystatus_klist, kn); - } else { - error = ENOTSUP; - } - - memorystatus_klist_unlock(); - - return error; -} - -void -memorystatus_knote_unregister(struct knote *kn __unused) -{ - memorystatus_klist_lock(); - KNOTE_DETACH(&memorystatus_klist, kn); - memorystatus_klist_unlock(); -} - - -#if 0 -#if CONFIG_JETSAM && VM_PRESSURE_EVENTS -static boolean_t -memorystatus_issue_pressure_kevent(boolean_t pressured) -{ - memorystatus_klist_lock(); - KNOTE(&memorystatus_klist, pressured ? kMemorystatusPressure : kMemorystatusNoPressure); - memorystatus_klist_unlock(); - return TRUE; -} -#endif /* CONFIG_JETSAM && VM_PRESSURE_EVENTS */ -#endif /* 0 */ - /* Coalition support */ /* sorting info for a particular priority bucket */ @@ -10322,7 +7611,8 @@ memorystatus_sort_by_largest_coalition_locked(unsigned int bucket_index, int coa p = memorystatus_get_first_proc_locked(&b, FALSE); while (p) { - if (coalition_is_leader(p->task, COALITION_TYPE_JETSAM, &coal)) { + coal = task_get_coalition(p->task, COALITION_TYPE_JETSAM); + if (coalition_is_leader(p->task, coal)) { if (nleaders < MAX_COAL_LEADERS) { int coal_ntasks = 0; uint64_t coal_page_count = coalition_get_page_count(coal, &coal_ntasks); @@ -10610,10 +7900,14 @@ memorystatus_update_priority_for_appnap(proc_t p, boolean_t is_appnap) TAILQ_REMOVE(¤t_bucket->list, p, p_memstat_list); current_bucket->count--; - + if (p->p_memstat_relaunch_flags & (P_MEMSTAT_RELAUNCH_HIGH)) { + current_bucket->relaunch_high_count--; + } TAILQ_INSERT_TAIL(&new_bucket->list, p, p_memstat_list); new_bucket->count++; - + if (p->p_memstat_relaunch_flags & (P_MEMSTAT_RELAUNCH_HIGH)) { + new_bucket->relaunch_high_count++; + } /* * Record idle start or idle delta. */ @@ -10655,3 +7949,95 @@ memorystatus_update_priority_for_appnap(proc_t p, boolean_t is_appnap) return -1; #endif /* !CONFIG_JETSAM */ } + +uint64_t +memorystatus_available_memory_internal(proc_t p) +{ +#ifdef XNU_TARGET_OS_OSX + #pragma unused(p) + return 0; +#else + const uint64_t footprint_in_bytes = get_task_phys_footprint(p->task); + int32_t memlimit_mb; + int64_t memlimit_bytes; + int64_t rc; + + if (isApp(p) == FALSE) { + return 0; + } + + if (p->p_memstat_memlimit > 0) { + memlimit_mb = p->p_memstat_memlimit; + } else if (task_convert_phys_footprint_limit(-1, &memlimit_mb) != KERN_SUCCESS) { + return 0; + } + + if (memlimit_mb <= 0) { + memlimit_bytes = INT_MAX & ~((1 << 20) - 1); + } else { + memlimit_bytes = ((int64_t) memlimit_mb) << 20; + } + + rc = memlimit_bytes - footprint_in_bytes; + + return (rc >= 0) ? rc : 0; +#endif +} + +int +memorystatus_available_memory(struct proc *p, __unused struct memorystatus_available_memory_args *args, uint64_t *ret) +{ + *ret = memorystatus_available_memory_internal(p); + + return 0; +} + +#if CONFIG_JETSAM +#if DEVELOPMENT || DEBUG +static int +memorystatus_cmd_increase_jetsam_task_limit(pid_t pid, uint32_t byte_increase) +{ + memorystatus_memlimit_properties_t mmp_entry; + + /* Validate inputs */ + if ((pid == 0) || (byte_increase == 0)) { + return EINVAL; + } + + proc_t p = proc_find(pid); + + if (!p) { + return ESRCH; + } + + const uint32_t current_memlimit_increase = roundToNearestMB(p->p_memlimit_increase); + const uint32_t page_aligned_increase = round_page(p->p_memlimit_increase + byte_increase); /* round to page */ + + proc_list_lock(); + + memorystatus_get_memlimit_properties_internal(p, &mmp_entry); + + if (mmp_entry.memlimit_active > 0) { + mmp_entry.memlimit_active -= current_memlimit_increase; + mmp_entry.memlimit_active += roundToNearestMB(page_aligned_increase); + } + + if (mmp_entry.memlimit_inactive > 0) { + mmp_entry.memlimit_inactive -= current_memlimit_increase; + mmp_entry.memlimit_inactive += roundToNearestMB(page_aligned_increase); + } + + /* + * Store the updated delta limit in the proc. + */ + p->p_memlimit_increase = page_aligned_increase; + + int error = memorystatus_set_memlimit_properties_internal(p, &mmp_entry); + + proc_list_unlock(); + proc_rele(p); + + return error; +} +#endif /* DEVELOPMENT */ +#endif /* CONFIG_JETSAM */ diff --git a/bsd/kern/kern_memorystatus_freeze.c b/bsd/kern/kern_memorystatus_freeze.c new file mode 100644 index 000000000..c83a80d72 --- /dev/null +++ b/bsd/kern/kern_memorystatus_freeze.c @@ -0,0 +1,2196 @@ +/* + * Copyright (c) 2006-2018 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#if CONFIG_FREEZE +#include +#endif /* CONFIG_FREEZE */ + +#include +#include +#include + +#if CONFIG_JETSAM + +extern unsigned int memorystatus_available_pages; +extern unsigned int memorystatus_available_pages_pressure; +extern unsigned int memorystatus_available_pages_critical; +extern unsigned int memorystatus_available_pages_critical_base; +extern unsigned int memorystatus_available_pages_critical_idle_offset; + +#else /* CONFIG_JETSAM */ + +extern uint64_t memorystatus_available_pages; +extern uint64_t memorystatus_available_pages_pressure; +extern uint64_t memorystatus_available_pages_critical; + +#endif /* CONFIG_JETSAM */ + +unsigned int memorystatus_frozen_count = 0; +unsigned int memorystatus_suspended_count = 0; +unsigned long freeze_threshold_percentage = 50; + +#if CONFIG_FREEZE + +lck_grp_attr_t *freezer_lck_grp_attr; +lck_grp_t *freezer_lck_grp; +static lck_mtx_t freezer_mutex; + +/* Thresholds */ +unsigned int memorystatus_freeze_threshold = 0; +unsigned int memorystatus_freeze_pages_min = 0; +unsigned int memorystatus_freeze_pages_max = 0; +unsigned int memorystatus_freeze_suspended_threshold = FREEZE_SUSPENDED_THRESHOLD_DEFAULT; +unsigned int memorystatus_freeze_daily_mb_max = FREEZE_DAILY_MB_MAX_DEFAULT; +uint64_t memorystatus_freeze_budget_pages_remaining = 0; //remaining # of pages that can be frozen to disk +boolean_t memorystatus_freeze_degradation = FALSE; //protected by the freezer mutex. Signals we are in a degraded freeze mode. + +unsigned int memorystatus_max_frozen_demotions_daily = 0; +unsigned int memorystatus_thaw_count_demotion_threshold = 0; + +boolean_t memorystatus_freeze_enabled = FALSE; +int memorystatus_freeze_wakeup = 0; +int memorystatus_freeze_jetsam_band = 0; /* the jetsam band which will contain P_MEMSTAT_FROZEN processes */ + +#define MAX_XPC_SERVICE_PIDS 10 /* Max. # of XPC services per coalition we'll consider freezing. */ + +#ifdef XNU_KERNEL_PRIVATE + +unsigned int memorystatus_frozen_processes_max = 0; +unsigned int memorystatus_frozen_shared_mb = 0; +unsigned int memorystatus_frozen_shared_mb_max = 0; +unsigned int memorystatus_freeze_shared_mb_per_process_max = 0; /* Max. MB allowed per process to be freezer-eligible. */ +unsigned int memorystatus_freeze_private_shared_pages_ratio = 2; /* Ratio of private:shared pages for a process to be freezer-eligible. */ +unsigned int memorystatus_thaw_count = 0; +unsigned int memorystatus_refreeze_eligible_count = 0; /* # of processes currently thawed i.e. have state on disk & in-memory */ + +#endif /* XNU_KERNEL_PRIVATE */ + +static inline boolean_t memorystatus_can_freeze_processes(void); +static boolean_t memorystatus_can_freeze(boolean_t *memorystatus_freeze_swap_low); +static boolean_t memorystatus_is_process_eligible_for_freeze(proc_t p); +static void memorystatus_freeze_thread(void *param __unused, wait_result_t wr __unused); + +void memorystatus_disable_freeze(void); + +/* Stats */ +static uint64_t memorystatus_freeze_pageouts = 0; + +/* Throttling */ +#define DEGRADED_WINDOW_MINS (30) +#define NORMAL_WINDOW_MINS (24 * 60) + +static throttle_interval_t throttle_intervals[] = { + { DEGRADED_WINDOW_MINS, 1, 0, 0, { 0, 0 }}, + { NORMAL_WINDOW_MINS, 1, 0, 0, { 0, 0 }}, +}; +throttle_interval_t *degraded_throttle_window = &throttle_intervals[0]; +throttle_interval_t *normal_throttle_window = &throttle_intervals[1]; + +extern uint64_t vm_swap_get_free_space(void); +extern boolean_t vm_swap_max_budget(uint64_t *); +extern int i_coal_jetsam_get_taskrole(coalition_t coal, task_t task); + +static void memorystatus_freeze_update_throttle(uint64_t *budget_pages_allowed); +static void memorystatus_demote_frozen_processes(boolean_t force_one); + +static uint64_t memorystatus_freezer_thread_next_run_ts = 0; + +/* Sysctls needed for aggd stats */ + +SYSCTL_UINT(_kern, OID_AUTO, memorystatus_freeze_count, CTLFLAG_RD | CTLFLAG_LOCKED, &memorystatus_frozen_count, 0, ""); +SYSCTL_UINT(_kern, OID_AUTO, memorystatus_thaw_count, CTLFLAG_RD | CTLFLAG_LOCKED, &memorystatus_thaw_count, 0, ""); +SYSCTL_QUAD(_kern, OID_AUTO, memorystatus_freeze_pageouts, CTLFLAG_RD | CTLFLAG_LOCKED, &memorystatus_freeze_pageouts, ""); +SYSCTL_QUAD(_kern, OID_AUTO, memorystatus_freeze_budget_pages_remaining, CTLFLAG_RD | CTLFLAG_LOCKED, &memorystatus_freeze_budget_pages_remaining, ""); + + +#if DEVELOPMENT || DEBUG + +SYSCTL_UINT(_kern, OID_AUTO, memorystatus_freeze_jetsam_band, CTLFLAG_RW | CTLFLAG_LOCKED, &memorystatus_freeze_jetsam_band, 0, ""); +SYSCTL_UINT(_kern, OID_AUTO, memorystatus_freeze_daily_mb_max, CTLFLAG_RW | CTLFLAG_LOCKED, &memorystatus_freeze_daily_mb_max, 0, ""); +SYSCTL_UINT(_kern, OID_AUTO, memorystatus_freeze_degraded_mode, CTLFLAG_RD | CTLFLAG_LOCKED, &memorystatus_freeze_degradation, 0, ""); +SYSCTL_UINT(_kern, OID_AUTO, memorystatus_freeze_threshold, CTLFLAG_RW | CTLFLAG_LOCKED, &memorystatus_freeze_threshold, 0, ""); +SYSCTL_UINT(_kern, OID_AUTO, memorystatus_freeze_pages_min, CTLFLAG_RW | CTLFLAG_LOCKED, &memorystatus_freeze_pages_min, 0, ""); +SYSCTL_UINT(_kern, OID_AUTO, memorystatus_freeze_pages_max, CTLFLAG_RW | CTLFLAG_LOCKED, &memorystatus_freeze_pages_max, 0, ""); +SYSCTL_UINT(_kern, OID_AUTO, memorystatus_refreeze_eligible_count, CTLFLAG_RD | CTLFLAG_LOCKED, &memorystatus_refreeze_eligible_count, 0, ""); +SYSCTL_UINT(_kern, OID_AUTO, memorystatus_freeze_processes_max, CTLFLAG_RW | CTLFLAG_LOCKED, &memorystatus_frozen_processes_max, 0, ""); + +/* + * Max. shared-anonymous memory in MB that can be held by frozen processes in the high jetsam band. + * "0" means no limit. + * Default is 10% of system-wide task limit. + */ + +SYSCTL_UINT(_kern, OID_AUTO, memorystatus_freeze_shared_mb_max, CTLFLAG_RW | CTLFLAG_LOCKED, &memorystatus_frozen_shared_mb_max, 0, ""); +SYSCTL_UINT(_kern, OID_AUTO, memorystatus_freeze_shared_mb, CTLFLAG_RD | CTLFLAG_LOCKED, &memorystatus_frozen_shared_mb, 0, ""); + +SYSCTL_UINT(_kern, OID_AUTO, memorystatus_freeze_shared_mb_per_process_max, CTLFLAG_RW | CTLFLAG_LOCKED, &memorystatus_freeze_shared_mb_per_process_max, 0, ""); +SYSCTL_UINT(_kern, OID_AUTO, memorystatus_freeze_private_shared_pages_ratio, CTLFLAG_RW | CTLFLAG_LOCKED, &memorystatus_freeze_private_shared_pages_ratio, 0, ""); + +SYSCTL_UINT(_kern, OID_AUTO, memorystatus_freeze_min_processes, CTLFLAG_RW | CTLFLAG_LOCKED, &memorystatus_freeze_suspended_threshold, 0, ""); + +/* + * max. # of frozen process demotions we will allow in our daily cycle. + */ +SYSCTL_UINT(_kern, OID_AUTO, memorystatus_max_freeze_demotions_daily, CTLFLAG_RW | CTLFLAG_LOCKED, &memorystatus_max_frozen_demotions_daily, 0, ""); +/* + * min # of thaws needed by a process to protect it from getting demoted into the IDLE band. + */ +SYSCTL_UINT(_kern, OID_AUTO, memorystatus_thaw_count_demotion_threshold, CTLFLAG_RW | CTLFLAG_LOCKED, &memorystatus_thaw_count_demotion_threshold, 0, ""); + +boolean_t memorystatus_freeze_throttle_enabled = TRUE; +SYSCTL_UINT(_kern, OID_AUTO, memorystatus_freeze_throttle_enabled, CTLFLAG_RW | CTLFLAG_LOCKED, &memorystatus_freeze_throttle_enabled, 0, ""); + +/* + * When set to true, this keeps frozen processes in the compressor pool in memory, instead of swapping them out to disk. + * Exposed via the sysctl kern.memorystatus_freeze_to_memory. + */ +boolean_t memorystatus_freeze_to_memory = FALSE; +SYSCTL_UINT(_kern, OID_AUTO, memorystatus_freeze_to_memory, CTLFLAG_RW | CTLFLAG_LOCKED, &memorystatus_freeze_to_memory, 0, ""); + +#define VM_PAGES_FOR_ALL_PROCS (2) +/* + * Manual trigger of freeze and thaw for dev / debug kernels only. + */ +static int +sysctl_memorystatus_freeze SYSCTL_HANDLER_ARGS +{ +#pragma unused(arg1, arg2) + int error, pid = 0; + proc_t p; + int freezer_error_code = 0; + pid_t pid_list[MAX_XPC_SERVICE_PIDS]; + int ntasks = 0; + coalition_t coal = COALITION_NULL; + + if (memorystatus_freeze_enabled == FALSE) { + printf("sysctl_freeze: Freeze is DISABLED\n"); + return ENOTSUP; + } + + error = sysctl_handle_int(oidp, &pid, 0, req); + if (error || !req->newptr) { + return error; + } + + if (pid == VM_PAGES_FOR_ALL_PROCS) { + vm_pageout_anonymous_pages(); + + return 0; + } + + lck_mtx_lock(&freezer_mutex); + +again: + p = proc_find(pid); + if (p != NULL) { + uint32_t purgeable, wired, clean, dirty, shared; + uint32_t max_pages = 0, state = 0; + + if (VM_CONFIG_FREEZER_SWAP_IS_ACTIVE) { + /* + * Freezer backed by the compressor and swap file(s) + * will hold compressed data. + * + * Set the sysctl kern.memorystatus_freeze_to_memory to true to keep compressed data from + * being swapped out to disk. Note that this disables freezer swap support globally, + * not just for the process being frozen. + * + * + * We don't care about the global freezer budget or the process's (min/max) budget here. + * The freeze sysctl is meant to force-freeze a process. + * + * We also don't update any global or process stats on this path, so that the jetsam/ freeze + * logic remains unaffected. The tasks we're performing here are: freeze the process, set the + * P_MEMSTAT_FROZEN bit, and elevate the process to a higher band (if the freezer is active). + */ + max_pages = memorystatus_freeze_pages_max; + } else { + /* + * We only have the compressor without any swap. + */ + max_pages = UINT32_MAX - 1; + } + + proc_list_lock(); + state = p->p_memstat_state; + proc_list_unlock(); + + /* + * The jetsam path also verifies that the process is a suspended App. We don't care about that here. + * We simply ensure that jetsam is not already working on the process and that the process has not + * explicitly disabled freezing. + */ + if (state & (P_MEMSTAT_TERMINATED | P_MEMSTAT_LOCKED | P_MEMSTAT_FREEZE_DISABLED)) { + printf("sysctl_freeze: p_memstat_state check failed, process is%s%s%s\n", + (state & P_MEMSTAT_TERMINATED) ? " terminated" : "", + (state & P_MEMSTAT_LOCKED) ? " locked" : "", + (state & P_MEMSTAT_FREEZE_DISABLED) ? " unfreezable" : ""); + + proc_rele(p); + lck_mtx_unlock(&freezer_mutex); + return EPERM; + } + + error = task_freeze(p->task, &purgeable, &wired, &clean, &dirty, max_pages, &shared, &freezer_error_code, FALSE /* eval only */); + + if (error) { + char reason[128]; + if (freezer_error_code == FREEZER_ERROR_EXCESS_SHARED_MEMORY) { + strlcpy(reason, "too much shared memory", 128); + } + + if (freezer_error_code == FREEZER_ERROR_LOW_PRIVATE_SHARED_RATIO) { + strlcpy(reason, "low private-shared pages ratio", 128); + } + + if (freezer_error_code == FREEZER_ERROR_NO_COMPRESSOR_SPACE) { + strlcpy(reason, "no compressor space", 128); + } + + if (freezer_error_code == FREEZER_ERROR_NO_SWAP_SPACE) { + strlcpy(reason, "no swap space", 128); + } + + printf("sysctl_freeze: task_freeze failed: %s\n", reason); + + if (error == KERN_NO_SPACE) { + /* Make it easy to distinguish between failures due to low compressor/ swap space and other failures. */ + error = ENOSPC; + } else { + error = EIO; + } + } else { + proc_list_lock(); + if ((p->p_memstat_state & P_MEMSTAT_FROZEN) == 0) { + p->p_memstat_state |= P_MEMSTAT_FROZEN; + memorystatus_frozen_count++; + } + p->p_memstat_frozen_count++; + + + proc_list_unlock(); + + if (VM_CONFIG_FREEZER_SWAP_IS_ACTIVE) { + /* + * We elevate only if we are going to swap out the data. + */ + error = memorystatus_update_inactive_jetsam_priority_band(pid, MEMORYSTATUS_CMD_ELEVATED_INACTIVEJETSAMPRIORITY_ENABLE, + memorystatus_freeze_jetsam_band, TRUE); + + if (error) { + printf("sysctl_freeze: Elevating frozen process to higher jetsam band failed with %d\n", error); + } + } + } + + if ((error == 0) && (coal == NULL)) { + /* + * We froze a process and so we check to see if it was + * a coalition leader and if it has XPC services that + * might need freezing. + * Only one leader can be frozen at a time and so we shouldn't + * enter this block more than once per call. Hence the + * check that 'coal' has to be NULL. We should make this an + * assert() or panic() once we have a much more concrete way + * to detect an app vs a daemon. + */ + + task_t curr_task = NULL; + + curr_task = proc_task(p); + coal = task_get_coalition(curr_task, COALITION_TYPE_JETSAM); + if (coalition_is_leader(curr_task, coal)) { + ntasks = coalition_get_pid_list(coal, COALITION_ROLEMASK_XPC, + COALITION_SORT_DEFAULT, pid_list, MAX_XPC_SERVICE_PIDS); + + if (ntasks > MAX_XPC_SERVICE_PIDS) { + ntasks = MAX_XPC_SERVICE_PIDS; + } + } + } + + proc_rele(p); + + while (ntasks) { + pid = pid_list[--ntasks]; + goto again; + } + + lck_mtx_unlock(&freezer_mutex); + return error; + } else { + printf("sysctl_freeze: Invalid process\n"); + } + + + lck_mtx_unlock(&freezer_mutex); + return EINVAL; +} + +SYSCTL_PROC(_kern, OID_AUTO, memorystatus_freeze, CTLTYPE_INT | CTLFLAG_WR | CTLFLAG_LOCKED | CTLFLAG_MASKED, + 0, 0, &sysctl_memorystatus_freeze, "I", ""); + +/* + * Manual trigger of agressive frozen demotion for dev / debug kernels only. + */ +static int +sysctl_memorystatus_demote_frozen_process SYSCTL_HANDLER_ARGS +{ +#pragma unused(arg1, arg2, oidp, req) + memorystatus_demote_frozen_processes(false); + return 0; +} + +SYSCTL_PROC(_kern, OID_AUTO, memorystatus_demote_frozen_processes, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED | CTLFLAG_MASKED, 0, 0, &sysctl_memorystatus_demote_frozen_process, "I", ""); + +static int +sysctl_memorystatus_available_pages_thaw SYSCTL_HANDLER_ARGS +{ +#pragma unused(arg1, arg2) + + int error, pid = 0; + proc_t p; + + if (memorystatus_freeze_enabled == FALSE) { + return ENOTSUP; + } + + error = sysctl_handle_int(oidp, &pid, 0, req); + if (error || !req->newptr) { + return error; + } + + if (pid == VM_PAGES_FOR_ALL_PROCS) { + do_fastwake_warmup_all(); + return 0; + } else { + p = proc_find(pid); + if (p != NULL) { + error = task_thaw(p->task); + + if (error) { + error = EIO; + } else { + /* + * task_thaw() succeeded. + * + * We increment memorystatus_frozen_count on the sysctl freeze path. + * And so we need the P_MEMSTAT_FROZEN to decrement the frozen count + * when this process exits. + * + * proc_list_lock(); + * p->p_memstat_state &= ~P_MEMSTAT_FROZEN; + * proc_list_unlock(); + */ + } + proc_rele(p); + return error; + } + } + + return EINVAL; +} + +SYSCTL_PROC(_kern, OID_AUTO, memorystatus_thaw, CTLTYPE_INT | CTLFLAG_WR | CTLFLAG_LOCKED | CTLFLAG_MASKED, + 0, 0, &sysctl_memorystatus_available_pages_thaw, "I", ""); + + +typedef struct _global_freezable_status { + boolean_t freeze_pages_threshold_crossed; + boolean_t freeze_eligible_procs_available; + boolean_t freeze_scheduled_in_future; +}global_freezable_status_t; + +typedef struct _proc_freezable_status { + boolean_t freeze_has_memstat_state; + boolean_t freeze_has_pages_min; + int freeze_has_probability; + int freeze_leader_eligible; + boolean_t freeze_attempted; + uint32_t p_memstat_state; + uint32_t p_pages; + int p_freeze_error_code; + int p_pid; + int p_leader_pid; + char p_name[MAXCOMLEN + 1]; +}proc_freezable_status_t; + +#define MAX_FREEZABLE_PROCESSES 200 /* Total # of processes in band 0 that we evaluate for freezability */ + +/* + * For coalition based freezing evaluations, we proceed as follows: + * - detect that the process is a coalition member and a XPC service + * - mark its 'freeze_leader_eligible' field with FREEZE_PROC_LEADER_FREEZABLE_UNKNOWN + * - continue its freezability evaluation assuming its leader will be freezable too + * + * Once we are done evaluating all processes, we do a quick run thru all + * processes and for a coalition member XPC service we look up the 'freezable' + * status of its leader and iff: + * - the xpc service is freezable i.e. its individual freeze evaluation worked + * - and, its leader is also marked freezable + * we update its 'freeze_leader_eligible' to FREEZE_PROC_LEADER_FREEZABLE_SUCCESS. + */ + +#define FREEZE_PROC_LEADER_FREEZABLE_UNKNOWN (-1) +#define FREEZE_PROC_LEADER_FREEZABLE_SUCCESS (1) +#define FREEZE_PROC_LEADER_FREEZABLE_FAILURE (2) + +static int +memorystatus_freezer_get_status(user_addr_t buffer, size_t buffer_size, int32_t *retval) +{ + uint32_t proc_count = 0, freeze_eligible_proc_considered = 0, band = 0, xpc_index = 0, leader_index = 0; + global_freezable_status_t *list_head; + proc_freezable_status_t *list_entry, *list_entry_start; + size_t list_size = 0; + proc_t p, leader_proc; + memstat_bucket_t *bucket; + uint32_t state = 0, pages = 0, entry_count = 0; + boolean_t try_freeze = TRUE, xpc_skip_size_probability_check = FALSE; + int error = 0, probability_of_use = 0; + pid_t leader_pid = 0; + + + if (VM_CONFIG_FREEZER_SWAP_IS_ACTIVE == FALSE) { + return ENOTSUP; + } + + list_size = sizeof(global_freezable_status_t) + (sizeof(proc_freezable_status_t) * MAX_FREEZABLE_PROCESSES); + + if (buffer_size < list_size) { + return EINVAL; + } + + list_head = (global_freezable_status_t*)kalloc(list_size); + if (list_head == NULL) { + return ENOMEM; + } + + memset(list_head, 0, list_size); + + list_size = sizeof(global_freezable_status_t); + + proc_list_lock(); + + uint64_t curr_time = mach_absolute_time(); + + list_head->freeze_pages_threshold_crossed = (memorystatus_available_pages < memorystatus_freeze_threshold); + list_head->freeze_eligible_procs_available = ((memorystatus_suspended_count - memorystatus_frozen_count) > memorystatus_freeze_suspended_threshold); + list_head->freeze_scheduled_in_future = (curr_time < memorystatus_freezer_thread_next_run_ts); + + list_entry_start = (proc_freezable_status_t*) ((uintptr_t)list_head + sizeof(global_freezable_status_t)); + list_entry = list_entry_start; + + bucket = &memstat_bucket[JETSAM_PRIORITY_IDLE]; + + entry_count = (memorystatus_global_probabilities_size / sizeof(memorystatus_internal_probabilities_t)); + + p = memorystatus_get_first_proc_locked(&band, FALSE); + proc_count++; + + while ((proc_count <= MAX_FREEZABLE_PROCESSES) && + (p) && + (list_size < buffer_size)) { + if (isSysProc(p)) { + /* + * Daemon:- We will consider freezing it iff: + * - it belongs to a coalition and the leader is freeze-eligible (delayed evaluation) + * - its role in the coalition is XPC service. + * + * We skip memory size requirements in this case. + */ + + coalition_t coal = COALITION_NULL; + task_t leader_task = NULL, curr_task = NULL; + int task_role_in_coalition = 0; + + curr_task = proc_task(p); + coal = task_get_coalition(curr_task, COALITION_TYPE_JETSAM); + + if (coal == COALITION_NULL || coalition_is_leader(curr_task, coal)) { + /* + * By default, XPC services without an app + * will be the leader of their own single-member + * coalition. + */ + goto skip_ineligible_xpc; + } + + leader_task = coalition_get_leader(coal); + if (leader_task == TASK_NULL) { + /* + * This jetsam coalition is currently leader-less. + * This could happen if the app died, but XPC services + * have not yet exited. + */ + goto skip_ineligible_xpc; + } + + leader_proc = (proc_t)get_bsdtask_info(leader_task); + task_deallocate(leader_task); + + if (leader_proc == PROC_NULL) { + /* leader task is exiting */ + goto skip_ineligible_xpc; + } + + task_role_in_coalition = i_coal_jetsam_get_taskrole(coal, curr_task); + + if (task_role_in_coalition == COALITION_TASKROLE_XPC) { + xpc_skip_size_probability_check = TRUE; + leader_pid = leader_proc->p_pid; + goto continue_eval; + } + +skip_ineligible_xpc: + p = memorystatus_get_next_proc_locked(&band, p, FALSE); + proc_count++; + continue; + } + +continue_eval: + strlcpy(list_entry->p_name, p->p_name, MAXCOMLEN + 1); + + list_entry->p_pid = p->p_pid; + + state = p->p_memstat_state; + + if ((state & (P_MEMSTAT_TERMINATED | P_MEMSTAT_LOCKED | P_MEMSTAT_FREEZE_DISABLED | P_MEMSTAT_FREEZE_IGNORE)) || + !(state & P_MEMSTAT_SUSPENDED)) { + try_freeze = list_entry->freeze_has_memstat_state = FALSE; + } else { + try_freeze = list_entry->freeze_has_memstat_state = TRUE; + } + + list_entry->p_memstat_state = state; + + if (xpc_skip_size_probability_check == TRUE) { + /* + * Assuming the coalition leader is freezable + * we don't care re. minimum pages and probability + * as long as the process isn't marked P_MEMSTAT_FREEZE_DISABLED. + * XPC services have to be explicity opted-out of the disabled + * state. And we checked that state above. + */ + list_entry->freeze_has_pages_min = TRUE; + list_entry->p_pages = -1; + list_entry->freeze_has_probability = -1; + + list_entry->freeze_leader_eligible = FREEZE_PROC_LEADER_FREEZABLE_UNKNOWN; + list_entry->p_leader_pid = leader_pid; + + xpc_skip_size_probability_check = FALSE; + } else { + list_entry->freeze_leader_eligible = FREEZE_PROC_LEADER_FREEZABLE_SUCCESS; /* Apps are freeze eligible and their own leaders. */ + list_entry->p_leader_pid = 0; /* Setting this to 0 signifies this isn't a coalition driven freeze. */ + + memorystatus_get_task_page_counts(p->task, &pages, NULL, NULL); + if (pages < memorystatus_freeze_pages_min) { + try_freeze = list_entry->freeze_has_pages_min = FALSE; + } else { + list_entry->freeze_has_pages_min = TRUE; + } + + list_entry->p_pages = pages; + + if (entry_count) { + uint32_t j = 0; + for (j = 0; j < entry_count; j++) { + if (strncmp(memorystatus_global_probabilities_table[j].proc_name, + p->p_name, + MAXCOMLEN + 1) == 0) { + probability_of_use = memorystatus_global_probabilities_table[j].use_probability; + break; + } + } + + list_entry->freeze_has_probability = probability_of_use; + + try_freeze = ((probability_of_use > 0) && try_freeze); + } else { + list_entry->freeze_has_probability = -1; + } + } + + if (try_freeze) { + uint32_t purgeable, wired, clean, dirty, shared; + uint32_t max_pages = 0; + int freezer_error_code = 0; + + error = task_freeze(p->task, &purgeable, &wired, &clean, &dirty, max_pages, &shared, &freezer_error_code, TRUE /* eval only */); + + if (error) { + list_entry->p_freeze_error_code = freezer_error_code; + } + + list_entry->freeze_attempted = TRUE; + } + + list_entry++; + freeze_eligible_proc_considered++; + + list_size += sizeof(proc_freezable_status_t); + + p = memorystatus_get_next_proc_locked(&band, p, FALSE); + proc_count++; + } + + proc_list_unlock(); + + list_entry = list_entry_start; + + for (xpc_index = 0; xpc_index < freeze_eligible_proc_considered; xpc_index++) { + if (list_entry[xpc_index].freeze_leader_eligible == FREEZE_PROC_LEADER_FREEZABLE_UNKNOWN) { + leader_pid = list_entry[xpc_index].p_leader_pid; + + leader_proc = proc_find(leader_pid); + + if (leader_proc) { + if (leader_proc->p_memstat_state & P_MEMSTAT_FROZEN) { + /* + * Leader has already been frozen. + */ + list_entry[xpc_index].freeze_leader_eligible = FREEZE_PROC_LEADER_FREEZABLE_SUCCESS; + proc_rele(leader_proc); + continue; + } + proc_rele(leader_proc); + } + + for (leader_index = 0; leader_index < freeze_eligible_proc_considered; leader_index++) { + if (list_entry[leader_index].p_pid == leader_pid) { + if (list_entry[leader_index].freeze_attempted && list_entry[leader_index].p_freeze_error_code == 0) { + list_entry[xpc_index].freeze_leader_eligible = FREEZE_PROC_LEADER_FREEZABLE_SUCCESS; + } else { + list_entry[xpc_index].freeze_leader_eligible = FREEZE_PROC_LEADER_FREEZABLE_FAILURE; + list_entry[xpc_index].p_freeze_error_code = FREEZER_ERROR_GENERIC; + } + break; + } + } + + /* + * Didn't find the leader entry. This might be likely because + * the leader never made it down to band 0. + */ + if (leader_index == freeze_eligible_proc_considered) { + list_entry[xpc_index].freeze_leader_eligible = FREEZE_PROC_LEADER_FREEZABLE_FAILURE; + list_entry[xpc_index].p_freeze_error_code = FREEZER_ERROR_GENERIC; + } + } + } + + buffer_size = list_size; + + error = copyout(list_head, buffer, buffer_size); + if (error == 0) { + *retval = buffer_size; + } else { + *retval = 0; + } + + list_size = sizeof(global_freezable_status_t) + (sizeof(proc_freezable_status_t) * MAX_FREEZABLE_PROCESSES); + kfree(list_head, list_size); + + MEMORYSTATUS_DEBUG(1, "memorystatus_freezer_get_status: returning %d (%lu - size)\n", error, (unsigned long)*list_size); + + return error; +} + +int +memorystatus_freezer_control(int32_t flags, user_addr_t buffer, size_t buffer_size, int32_t *retval) +{ + int err = ENOTSUP; + + if (flags == FREEZER_CONTROL_GET_STATUS) { + err = memorystatus_freezer_get_status(buffer, buffer_size, retval); + } + + return err; +} + +#endif /* DEVELOPMENT || DEBUG */ + +extern void vm_swap_consider_defragmenting(int); +extern boolean_t memorystatus_kill_elevated_process(uint32_t, os_reason_t, unsigned int, int, uint32_t *, uint64_t *); + +/* + * This routine will _jetsam_ all frozen processes + * and reclaim the swap space immediately. + * + * So freeze has to be DISABLED when we call this routine. + */ + +void +memorystatus_disable_freeze(void) +{ + memstat_bucket_t *bucket; + int bucket_count = 0, retries = 0; + boolean_t retval = FALSE, killed = FALSE; + uint32_t errors = 0, errors_over_prev_iteration = 0; + os_reason_t jetsam_reason = 0; + unsigned int band = 0; + proc_t p = PROC_NULL, next_p = PROC_NULL; + uint64_t memory_reclaimed = 0, footprint = 0; + + KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_FREEZE_DISABLE) | DBG_FUNC_START, + memorystatus_available_pages, 0, 0, 0, 0); + + assert(memorystatus_freeze_enabled == FALSE); + + jetsam_reason = os_reason_create(OS_REASON_JETSAM, JETSAM_REASON_MEMORY_DISK_SPACE_SHORTAGE); + if (jetsam_reason == OS_REASON_NULL) { + printf("memorystatus_disable_freeze: failed to allocate jetsam reason\n"); + } + + /* + * Let's relocate all frozen processes into band 8. Demoted frozen processes + * are sitting in band 0 currently and it's possible to have a frozen process + * in the FG band being actively used. We don't reset its frozen state when + * it is resumed because it has state on disk. + * + * We choose to do this relocation rather than implement a new 'kill frozen' + * process function for these reasons: + * - duplication of code: too many kill functions exist and we need to rework them better. + * - disk-space-shortage kills are rare + * - not having the 'real' jetsam band at time of the this frozen kill won't preclude us + * from answering any imp. questions re. jetsam policy/effectiveness. + * + * This is essentially what memorystatus_update_inactive_jetsam_priority_band() does while + * avoiding the application of memory limits. + */ + +again: + proc_list_lock(); + + band = JETSAM_PRIORITY_IDLE; + p = PROC_NULL; + next_p = PROC_NULL; + + next_p = memorystatus_get_first_proc_locked(&band, TRUE); + while (next_p) { + p = next_p; + next_p = memorystatus_get_next_proc_locked(&band, p, TRUE); + + if (p->p_memstat_effectivepriority > JETSAM_PRIORITY_FOREGROUND) { + break; + } + + if ((p->p_memstat_state & P_MEMSTAT_FROZEN) == FALSE) { + continue; + } + + if (p->p_memstat_state & P_MEMSTAT_ERROR) { + p->p_memstat_state &= ~P_MEMSTAT_ERROR; + } + + if (p->p_memstat_effectivepriority == memorystatus_freeze_jetsam_band) { + continue; + } + + /* + * We explicitly add this flag here so the process looks like a normal + * frozen process i.e. P_MEMSTAT_FROZEN and P_MEMSTAT_USE_ELEVATED_INACTIVE_BAND. + * We don't bother with assigning the 'active' memory + * limits at this point because we are going to be killing it soon below. + */ + p->p_memstat_state |= P_MEMSTAT_USE_ELEVATED_INACTIVE_BAND; + memorystatus_invalidate_idle_demotion_locked(p, TRUE); + + memorystatus_update_priority_locked(p, memorystatus_freeze_jetsam_band, FALSE, TRUE); + } + + bucket = &memstat_bucket[memorystatus_freeze_jetsam_band]; + bucket_count = bucket->count; + proc_list_unlock(); + + /* + * Bucket count is already stale at this point. But, we don't expect + * freezing to continue since we have already disabled the freeze functionality. + * However, an existing freeze might be in progress. So we might miss that process + * in the first go-around. We hope to catch it in the next. + */ + + errors_over_prev_iteration = 0; + while (bucket_count) { + bucket_count--; + + /* + * memorystatus_kill_elevated_process() drops a reference, + * so take another one so we can continue to use this exit reason + * even after it returns. + */ + + os_reason_ref(jetsam_reason); + retval = memorystatus_kill_elevated_process( + kMemorystatusKilledDiskSpaceShortage, + jetsam_reason, + memorystatus_freeze_jetsam_band, + 0, /* the iteration of aggressive jetsam..ignored here */ + &errors, + &footprint); + + if (errors > 0) { + printf("memorystatus_disable_freeze: memorystatus_kill_elevated_process returned %d error(s)\n", errors); + errors_over_prev_iteration += errors; + errors = 0; + } + + if (retval == 0) { + /* + * No frozen processes left to kill. + */ + break; + } + + killed = TRUE; + memory_reclaimed += footprint; + } + + proc_list_lock(); + + if (memorystatus_frozen_count) { + /* + * A frozen process snuck in and so + * go back around to kill it. That + * process may have been resumed and + * put into the FG band too. So we + * have to do the relocation again. + */ + assert(memorystatus_freeze_enabled == FALSE); + + retries++; + if (retries < 3) { + proc_list_unlock(); + goto again; + } +#if DEVELOPMENT || DEBUG + panic("memorystatus_disable_freeze: Failed to kill all frozen processes, memorystatus_frozen_count = %d, errors = %d", + memorystatus_frozen_count, errors_over_prev_iteration); +#endif /* DEVELOPMENT || DEBUG */ + } + proc_list_unlock(); + + os_reason_free(jetsam_reason); + + if (killed) { + vm_swap_consider_defragmenting(VM_SWAP_FLAGS_FORCE_DEFRAG | VM_SWAP_FLAGS_FORCE_RECLAIM); + + proc_list_lock(); + size_t snapshot_size = sizeof(memorystatus_jetsam_snapshot_t) + + sizeof(memorystatus_jetsam_snapshot_entry_t) * (memorystatus_jetsam_snapshot_count); + uint64_t timestamp_now = mach_absolute_time(); + memorystatus_jetsam_snapshot->notification_time = timestamp_now; + memorystatus_jetsam_snapshot->js_gencount++; + if (memorystatus_jetsam_snapshot_count > 0 && (memorystatus_jetsam_snapshot_last_timestamp == 0 || + timestamp_now > memorystatus_jetsam_snapshot_last_timestamp + memorystatus_jetsam_snapshot_timeout)) { + proc_list_unlock(); + int ret = memorystatus_send_note(kMemorystatusSnapshotNote, &snapshot_size, sizeof(snapshot_size)); + if (!ret) { + proc_list_lock(); + memorystatus_jetsam_snapshot_last_timestamp = timestamp_now; + proc_list_unlock(); + } + } else { + proc_list_unlock(); + } + } + + KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_FREEZE_DISABLE) | DBG_FUNC_END, + memorystatus_available_pages, memory_reclaimed, 0, 0, 0); + + return; +} + +__private_extern__ void +memorystatus_freeze_init(void) +{ + kern_return_t result; + thread_t thread; + + freezer_lck_grp_attr = lck_grp_attr_alloc_init(); + freezer_lck_grp = lck_grp_alloc_init("freezer", freezer_lck_grp_attr); + + lck_mtx_init(&freezer_mutex, freezer_lck_grp, NULL); + + /* + * This is just the default value if the underlying + * storage device doesn't have any specific budget. + * We check with the storage layer in memorystatus_freeze_update_throttle() + * before we start our freezing the first time. + */ + memorystatus_freeze_budget_pages_remaining = (memorystatus_freeze_daily_mb_max * 1024 * 1024) / PAGE_SIZE; + + result = kernel_thread_start(memorystatus_freeze_thread, NULL, &thread); + if (result == KERN_SUCCESS) { + proc_set_thread_policy(thread, TASK_POLICY_INTERNAL, TASK_POLICY_IO, THROTTLE_LEVEL_COMPRESSOR_TIER2); + proc_set_thread_policy(thread, TASK_POLICY_INTERNAL, TASK_POLICY_PASSIVE_IO, TASK_POLICY_ENABLE); + thread_set_thread_name(thread, "VM_freezer"); + + thread_deallocate(thread); + } else { + panic("Could not create memorystatus_freeze_thread"); + } +} + +static boolean_t +memorystatus_is_process_eligible_for_freeze(proc_t p) +{ + /* + * Called with proc_list_lock held. + */ + + LCK_MTX_ASSERT(proc_list_mlock, LCK_MTX_ASSERT_OWNED); + + boolean_t should_freeze = FALSE; + uint32_t state = 0, entry_count = 0, pages = 0, i = 0; + int probability_of_use = 0; + + state = p->p_memstat_state; + + if (state & (P_MEMSTAT_TERMINATED | P_MEMSTAT_LOCKED | P_MEMSTAT_FREEZE_DISABLED | P_MEMSTAT_FREEZE_IGNORE)) { + goto out; + } + + if (isSysProc(p)) { + /* + * Daemon:- We consider freezing it if: + * - it belongs to a coalition and the leader is frozen, and, + * - its role in the coalition is XPC service. + * + * We skip memory size requirements in this case. + */ + + coalition_t coal = COALITION_NULL; + task_t leader_task = NULL, curr_task = NULL; + proc_t leader_proc = NULL; + int task_role_in_coalition = 0; + + curr_task = proc_task(p); + coal = task_get_coalition(curr_task, COALITION_TYPE_JETSAM); + + if (coal == NULL || coalition_is_leader(curr_task, coal)) { + /* + * By default, XPC services without an app + * will be the leader of their own single-member + * coalition. + */ + goto out; + } + + leader_task = coalition_get_leader(coal); + if (leader_task == TASK_NULL) { + /* + * This jetsam coalition is currently leader-less. + * This could happen if the app died, but XPC services + * have not yet exited. + */ + goto out; + } + + leader_proc = (proc_t)get_bsdtask_info(leader_task); + task_deallocate(leader_task); + + if (leader_proc == PROC_NULL) { + /* leader task is exiting */ + goto out; + } + + if (!(leader_proc->p_memstat_state & P_MEMSTAT_FROZEN)) { + goto out; + } + + task_role_in_coalition = i_coal_jetsam_get_taskrole(coal, curr_task); + + if (task_role_in_coalition == COALITION_TASKROLE_XPC) { + should_freeze = TRUE; + } + + goto out; + } else { + /* + * Application. In addition to the above states we need to make + * sure we only consider suspended applications for freezing. + */ + if (!(state & P_MEMSTAT_SUSPENDED)) { + goto out; + } + } + + + /* Only freeze applications meeting our minimum resident page criteria */ + memorystatus_get_task_page_counts(p->task, &pages, NULL, NULL); + if (pages < memorystatus_freeze_pages_min) { + goto out; + } + + /* Don't freeze processes that are already exiting on core. It may have started exiting + * after we chose it for freeze, but before we obtained the proc_list_lock. + * NB: This is only possible if we're coming in from memorystatus_freeze_process_sync. + * memorystatus_freeze_top_process holds the proc_list_lock while it traverses the bands. + */ + if ((p->p_listflag & P_LIST_EXITED) != 0) { + goto out; + } + + entry_count = (memorystatus_global_probabilities_size / sizeof(memorystatus_internal_probabilities_t)); + + if (entry_count) { + for (i = 0; i < entry_count; i++) { + if (strncmp(memorystatus_global_probabilities_table[i].proc_name, + p->p_name, + MAXCOMLEN + 1) == 0) { + probability_of_use = memorystatus_global_probabilities_table[i].use_probability; + break; + } + } + + if (probability_of_use == 0) { + goto out; + } + } + + should_freeze = TRUE; +out: + return should_freeze; +} + +/* + * Synchronously freeze the passed proc. Called with a reference to the proc held. + * + * Doesn't deal with: + * - re-freezing because this is called on a specific process and + * not by the freezer thread. If that changes, we'll have to teach it about + * refreezing a frozen process. + * + * - grouped/coalition freezing because we are hoping to deprecate this + * interface as it was used by user-space to freeze particular processes. But + * we have moved away from that approach to having the kernel choose the optimal + * candidates to be frozen. + * + * Returns EINVAL or the value returned by task_freeze(). + */ +int +memorystatus_freeze_process_sync(proc_t p) +{ + int ret = EINVAL; + pid_t aPid = 0; + boolean_t memorystatus_freeze_swap_low = FALSE; + int freezer_error_code = 0; + + lck_mtx_lock(&freezer_mutex); + + if (p == NULL) { + printf("memorystatus_freeze_process_sync: Invalid process\n"); + goto exit; + } + + if (memorystatus_freeze_enabled == FALSE) { + printf("memorystatus_freeze_process_sync: Freezing is DISABLED\n"); + goto exit; + } + + if (!memorystatus_can_freeze(&memorystatus_freeze_swap_low)) { + printf("memorystatus_freeze_process_sync: Low compressor and/or low swap space...skipping freeze\n"); + goto exit; + } + + memorystatus_freeze_update_throttle(&memorystatus_freeze_budget_pages_remaining); + if (!memorystatus_freeze_budget_pages_remaining) { + printf("memorystatus_freeze_process_sync: exit with NO available budget\n"); + goto exit; + } + + proc_list_lock(); + + if (p != NULL) { + uint32_t purgeable, wired, clean, dirty, shared; + uint32_t max_pages, i; + + aPid = p->p_pid; + + /* Ensure the process is eligible for freezing */ + if (memorystatus_is_process_eligible_for_freeze(p) == FALSE) { + proc_list_unlock(); + goto exit; + } + + if (VM_CONFIG_FREEZER_SWAP_IS_ACTIVE) { + max_pages = MIN(memorystatus_freeze_pages_max, memorystatus_freeze_budget_pages_remaining); + } else { + /* + * We only have the compressor without any swap. + */ + max_pages = UINT32_MAX - 1; + } + + /* Mark as locked temporarily to avoid kill */ + p->p_memstat_state |= P_MEMSTAT_LOCKED; + proc_list_unlock(); + + KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_FREEZE) | DBG_FUNC_START, + memorystatus_available_pages, 0, 0, 0, 0); + + ret = task_freeze(p->task, &purgeable, &wired, &clean, &dirty, max_pages, &shared, &freezer_error_code, FALSE /* eval only */); + + KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_FREEZE) | DBG_FUNC_END, + memorystatus_available_pages, aPid, 0, 0, 0); + + DTRACE_MEMORYSTATUS6(memorystatus_freeze, proc_t, p, unsigned int, memorystatus_available_pages, boolean_t, purgeable, unsigned int, wired, uint32_t, clean, uint32_t, dirty); + + MEMORYSTATUS_DEBUG(1, "memorystatus_freeze_process_sync: task_freeze %s for pid %d [%s] - " + "memorystatus_pages: %d, purgeable: %d, wired: %d, clean: %d, dirty: %d, max_pages %d, shared %d\n", + (ret == KERN_SUCCESS) ? "SUCCEEDED" : "FAILED", aPid, (*p->p_name ? p->p_name : "(unknown)"), + memorystatus_available_pages, purgeable, wired, clean, dirty, max_pages, shared); + + proc_list_lock(); + + if (ret == KERN_SUCCESS) { + memorystatus_freeze_entry_t data = { aPid, TRUE, dirty }; + + p->p_memstat_freeze_sharedanon_pages += shared; + + memorystatus_frozen_shared_mb += shared; + + if ((p->p_memstat_state & P_MEMSTAT_FROZEN) == 0) { + p->p_memstat_state |= P_MEMSTAT_FROZEN; + memorystatus_frozen_count++; + } + + p->p_memstat_frozen_count++; + + /* + * Still keeping the P_MEMSTAT_LOCKED bit till we are actually done elevating this frozen process + * to its higher jetsam band. + */ + proc_list_unlock(); + + memorystatus_send_note(kMemorystatusFreezeNote, &data, sizeof(data)); + + if (VM_CONFIG_FREEZER_SWAP_IS_ACTIVE) { + ret = memorystatus_update_inactive_jetsam_priority_band(p->p_pid, MEMORYSTATUS_CMD_ELEVATED_INACTIVEJETSAMPRIORITY_ENABLE, + memorystatus_freeze_jetsam_band, TRUE); + + if (ret) { + printf("Elevating the frozen process failed with %d\n", ret); + /* not fatal */ + ret = 0; + } + + proc_list_lock(); + + /* Update stats */ + for (i = 0; i < sizeof(throttle_intervals) / sizeof(struct throttle_interval_t); i++) { + throttle_intervals[i].pageouts += dirty; + } + } else { + proc_list_lock(); + } + + memorystatus_freeze_pageouts += dirty; + + if (memorystatus_frozen_count == (memorystatus_frozen_processes_max - 1)) { + /* + * Add some eviction logic here? At some point should we + * jetsam a process to get back its swap space so that we + * can freeze a more eligible process at this moment in time? + */ + } + + memorystatus_freeze_update_throttle(&memorystatus_freeze_budget_pages_remaining); + os_log_with_startup_serial(OS_LOG_DEFAULT, "memorystatus: freezing (specific) pid %d [%s] done memorystatus_freeze_budget_pages_remaining %llu froze %u pages", + aPid, ((p && *p->p_name) ? p->p_name : "unknown"), memorystatus_freeze_budget_pages_remaining, dirty); + } else { + char reason[128]; + if (freezer_error_code == FREEZER_ERROR_EXCESS_SHARED_MEMORY) { + strlcpy(reason, "too much shared memory", 128); + } + + if (freezer_error_code == FREEZER_ERROR_LOW_PRIVATE_SHARED_RATIO) { + strlcpy(reason, "low private-shared pages ratio", 128); + } + + if (freezer_error_code == FREEZER_ERROR_NO_COMPRESSOR_SPACE) { + strlcpy(reason, "no compressor space", 128); + } + + if (freezer_error_code == FREEZER_ERROR_NO_SWAP_SPACE) { + strlcpy(reason, "no swap space", 128); + } + + os_log_with_startup_serial(OS_LOG_DEFAULT, "memorystatus: freezing (specific) pid %d [%s]...skipped (%s)", + aPid, ((p && *p->p_name) ? p->p_name : "unknown"), reason); + p->p_memstat_state |= P_MEMSTAT_FREEZE_IGNORE; + } + + p->p_memstat_state &= ~P_MEMSTAT_LOCKED; + wakeup(&p->p_memstat_state); + proc_list_unlock(); + } + +exit: + lck_mtx_unlock(&freezer_mutex); + + return ret; +} + +static int +memorystatus_freeze_top_process(void) +{ + pid_t aPid = 0, coal_xpc_pid = 0; + int ret = -1; + proc_t p = PROC_NULL, next_p = PROC_NULL; + unsigned int i = 0; + unsigned int band = JETSAM_PRIORITY_IDLE; + boolean_t refreeze_processes = FALSE; + task_t curr_task = NULL; + coalition_t coal = COALITION_NULL; + pid_t pid_list[MAX_XPC_SERVICE_PIDS]; + unsigned int ntasks = 0; + + KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_FREEZE_SCAN) | DBG_FUNC_START, memorystatus_available_pages, 0, 0, 0, 0); + + proc_list_lock(); + + if (memorystatus_frozen_count >= memorystatus_frozen_processes_max) { + /* + * Freezer is already full but we are here and so let's + * try to refreeze any processes we might have thawed + * in the past and push out their compressed state out. + */ + refreeze_processes = TRUE; + band = (unsigned int) memorystatus_freeze_jetsam_band; + } + +freeze_process: + + next_p = memorystatus_get_first_proc_locked(&band, FALSE); + while (next_p) { + kern_return_t kr; + uint32_t purgeable, wired, clean, dirty, shared; + uint32_t max_pages = 0; + int freezer_error_code = 0; + + p = next_p; + + if (coal == NULL) { + next_p = memorystatus_get_next_proc_locked(&band, p, FALSE); + } else { + /* + * We have frozen a coalition leader and now are + * dealing with its XPC services. We get our + * next_p for each XPC service from the pid_list + * acquired after a successful task_freeze call + * on the coalition leader. + */ + + if (ntasks > 0) { + coal_xpc_pid = pid_list[--ntasks]; + next_p = proc_findinternal(coal_xpc_pid, 1 /* proc_list_lock held */); + /* + * We grab a reference when we are about to freeze the process. So, drop + * the reference that proc_findinternal() grabbed for us. + * We also have the proc_list_lock and so this process is stable. + */ + if (next_p) { + proc_rele_locked(next_p); + } + } else { + next_p = NULL; + } + } + + aPid = p->p_pid; + + if (p->p_memstat_effectivepriority != (int32_t) band) { + /* + * We shouldn't be freezing processes outside the + * prescribed band. + */ + break; + } + + /* Ensure the process is eligible for (re-)freezing */ + if (refreeze_processes) { + /* + * Has to have been frozen once before. + */ + if ((p->p_memstat_state & P_MEMSTAT_FROZEN) == FALSE) { + continue; + } + + /* + * Has to have been resumed once before. + */ + if ((p->p_memstat_state & P_MEMSTAT_REFREEZE_ELIGIBLE) == FALSE) { + continue; + } + + /* + * Not currently being looked at for something. + */ + if (p->p_memstat_state & P_MEMSTAT_LOCKED) { + continue; + } + + /* + * We are going to try and refreeze and so re-evaluate + * the process. We don't want to double count the shared + * memory. So deduct the old snapshot here. + */ + memorystatus_frozen_shared_mb -= p->p_memstat_freeze_sharedanon_pages; + p->p_memstat_freeze_sharedanon_pages = 0; + + p->p_memstat_state &= ~P_MEMSTAT_REFREEZE_ELIGIBLE; + memorystatus_refreeze_eligible_count--; + } else { + if (memorystatus_is_process_eligible_for_freeze(p) == FALSE) { + continue; // with lock held + } + } + + if (VM_CONFIG_FREEZER_SWAP_IS_ACTIVE) { + /* + * Freezer backed by the compressor and swap file(s) + * will hold compressed data. + */ + + max_pages = MIN(memorystatus_freeze_pages_max, memorystatus_freeze_budget_pages_remaining); + } else { + /* + * We only have the compressor pool. + */ + max_pages = UINT32_MAX - 1; + } + + /* Mark as locked temporarily to avoid kill */ + p->p_memstat_state |= P_MEMSTAT_LOCKED; + + p = proc_ref_locked(p); + if (!p) { + break; + } + + proc_list_unlock(); + + KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_FREEZE) | DBG_FUNC_START, + memorystatus_available_pages, 0, 0, 0, 0); + + kr = task_freeze(p->task, &purgeable, &wired, &clean, &dirty, max_pages, &shared, &freezer_error_code, FALSE /* eval only */); + + KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_FREEZE) | DBG_FUNC_END, + memorystatus_available_pages, aPid, 0, 0, 0); + + MEMORYSTATUS_DEBUG(1, "memorystatus_freeze_top_process: task_freeze %s for pid %d [%s] - " + "memorystatus_pages: %d, purgeable: %d, wired: %d, clean: %d, dirty: %d, max_pages %d, shared %d\n", + (kr == KERN_SUCCESS) ? "SUCCEEDED" : "FAILED", aPid, (*p->p_name ? p->p_name : "(unknown)"), + memorystatus_available_pages, purgeable, wired, clean, dirty, max_pages, shared); + + proc_list_lock(); + + /* Success? */ + if (KERN_SUCCESS == kr) { + memorystatus_freeze_entry_t data = { aPid, TRUE, dirty }; + + p->p_memstat_freeze_sharedanon_pages += shared; + + memorystatus_frozen_shared_mb += shared; + + if ((p->p_memstat_state & P_MEMSTAT_FROZEN) == 0) { + p->p_memstat_state |= P_MEMSTAT_FROZEN; + memorystatus_frozen_count++; + } + + p->p_memstat_frozen_count++; + + /* + * Still keeping the P_MEMSTAT_LOCKED bit till we are actually done elevating this frozen process + * to its higher jetsam band. + */ + proc_list_unlock(); + + memorystatus_send_note(kMemorystatusFreezeNote, &data, sizeof(data)); + + if (VM_CONFIG_FREEZER_SWAP_IS_ACTIVE) { + ret = memorystatus_update_inactive_jetsam_priority_band(p->p_pid, MEMORYSTATUS_CMD_ELEVATED_INACTIVEJETSAMPRIORITY_ENABLE, memorystatus_freeze_jetsam_band, TRUE); + + if (ret) { + printf("Elevating the frozen process failed with %d\n", ret); + /* not fatal */ + ret = 0; + } + + proc_list_lock(); + + /* Update stats */ + for (i = 0; i < sizeof(throttle_intervals) / sizeof(struct throttle_interval_t); i++) { + throttle_intervals[i].pageouts += dirty; + } + } else { + proc_list_lock(); + } + + memorystatus_freeze_pageouts += dirty; + + if (memorystatus_frozen_count == (memorystatus_frozen_processes_max - 1)) { + /* + * Add some eviction logic here? At some point should we + * jetsam a process to get back its swap space so that we + * can freeze a more eligible process at this moment in time? + */ + } + + memorystatus_freeze_update_throttle(&memorystatus_freeze_budget_pages_remaining); + os_log_with_startup_serial(OS_LOG_DEFAULT, "memorystatus: %sfreezing (%s) pid %d [%s] done, memorystatus_freeze_budget_pages_remaining %llu %sfroze %u pages\n", + refreeze_processes? "re" : "", (coal == NULL ? "general" : "coalition-driven"), aPid, ((p && *p->p_name) ? p->p_name : "unknown"), memorystatus_freeze_budget_pages_remaining, refreeze_processes? "Re" : "", dirty); + + /* Return KERN_SUCCESS */ + ret = kr; + + /* + * We froze a process successfully. We can stop now + * and see if that helped if this process isn't part + * of a coalition. + * + * Else: + * - if it is a leader, get the list of XPC services + * that need to be frozen. + * - if it is a XPC service whose leader was frozen + * here, continue on to the next XPC service in the list. + */ + + if (coal == NULL) { + curr_task = proc_task(p); + coal = task_get_coalition(curr_task, COALITION_TYPE_JETSAM); + if (coalition_is_leader(curr_task, coal)) { + ntasks = coalition_get_pid_list(coal, COALITION_ROLEMASK_XPC, + COALITION_SORT_DEFAULT, pid_list, MAX_XPC_SERVICE_PIDS); + + if (ntasks > MAX_XPC_SERVICE_PIDS) { + ntasks = MAX_XPC_SERVICE_PIDS; + } + } + + next_p = NULL; + + if (ntasks > 0) { + /* + * Start off with our first next_p in this list. + */ + coal_xpc_pid = pid_list[--ntasks]; + next_p = proc_findinternal(coal_xpc_pid, 1 /* proc_list_lock held */); + + /* + * We grab a reference when we are about to freeze the process. So drop + * the reference that proc_findinternal() grabbed for us. + * We also have the proc_list_lock and so this process is stable. + */ + if (next_p) { + proc_rele_locked(next_p); + } + } + } + + p->p_memstat_state &= ~P_MEMSTAT_LOCKED; + wakeup(&p->p_memstat_state); + proc_rele_locked(p); + + if (coal && next_p) { + continue; + } + + /* + * No coalition leader was frozen. So we don't + * need to evaluate any XPC services. + * + * OR + * + * We have frozen all eligible XPC services for + * the current coalition leader. + * + * Either way, we can break here and see if freezing + * helped. + */ + + break; + } else { + p->p_memstat_state &= ~P_MEMSTAT_LOCKED; + wakeup(&p->p_memstat_state); + + if (refreeze_processes == TRUE) { + if ((freezer_error_code == FREEZER_ERROR_EXCESS_SHARED_MEMORY) || + (freezer_error_code == FREEZER_ERROR_LOW_PRIVATE_SHARED_RATIO)) { + /* + * Keeping this prior-frozen process in this high band when + * we failed to re-freeze it due to bad shared memory usage + * could cause excessive pressure on the lower bands. + * We need to demote it for now. It'll get re-evaluated next + * time because we don't set the P_MEMSTAT_FREEZE_IGNORE + * bit. + */ + + p->p_memstat_state &= ~P_MEMSTAT_USE_ELEVATED_INACTIVE_BAND; + memorystatus_invalidate_idle_demotion_locked(p, TRUE); + memorystatus_update_priority_locked(p, JETSAM_PRIORITY_IDLE, TRUE, TRUE); + } + } else { + p->p_memstat_state |= P_MEMSTAT_FREEZE_IGNORE; + } + + char reason[128]; + if (freezer_error_code == FREEZER_ERROR_EXCESS_SHARED_MEMORY) { + strlcpy(reason, "too much shared memory", 128); + } + + if (freezer_error_code == FREEZER_ERROR_LOW_PRIVATE_SHARED_RATIO) { + strlcpy(reason, "low private-shared pages ratio", 128); + } + + if (freezer_error_code == FREEZER_ERROR_NO_COMPRESSOR_SPACE) { + strlcpy(reason, "no compressor space", 128); + } + + if (freezer_error_code == FREEZER_ERROR_NO_SWAP_SPACE) { + strlcpy(reason, "no swap space", 128); + } + + os_log_with_startup_serial(OS_LOG_DEFAULT, "memorystatus: freezing (%s) pid %d [%s]...skipped (%s)\n", + (coal == NULL ? "general" : "coalition-driven"), aPid, ((p && *p->p_name) ? p->p_name : "unknown"), reason); + + proc_rele_locked(p); + + if (vm_compressor_low_on_space() || vm_swap_low_on_space()) { + break; + } + } + } + + if ((ret == -1) && + (memorystatus_refreeze_eligible_count >= MIN_THAW_REFREEZE_THRESHOLD) && + (refreeze_processes == FALSE)) { + /* + * We failed to freeze a process from the IDLE + * band AND we have some thawed processes + * AND haven't tried refreezing as yet. + * Let's try and re-freeze processes in the + * frozen band that have been resumed in the past + * and so have brought in state from disk. + */ + + band = (unsigned int) memorystatus_freeze_jetsam_band; + + refreeze_processes = TRUE; + + goto freeze_process; + } + + proc_list_unlock(); + + KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_FREEZE_SCAN) | DBG_FUNC_END, memorystatus_available_pages, aPid, 0, 0, 0); + + return ret; +} + +static inline boolean_t +memorystatus_can_freeze_processes(void) +{ + boolean_t ret; + + proc_list_lock(); + + if (memorystatus_suspended_count) { + memorystatus_freeze_suspended_threshold = MIN(memorystatus_freeze_suspended_threshold, FREEZE_SUSPENDED_THRESHOLD_DEFAULT); + + if ((memorystatus_suspended_count - memorystatus_frozen_count) > memorystatus_freeze_suspended_threshold) { + ret = TRUE; + } else { + ret = FALSE; + } + } else { + ret = FALSE; + } + + proc_list_unlock(); + + return ret; +} + +static boolean_t +memorystatus_can_freeze(boolean_t *memorystatus_freeze_swap_low) +{ + boolean_t can_freeze = TRUE; + + /* Only freeze if we're sufficiently low on memory; this holds off freeze right + * after boot, and is generally is a no-op once we've reached steady state. */ + if (memorystatus_available_pages > memorystatus_freeze_threshold) { + return FALSE; + } + + /* Check minimum suspended process threshold. */ + if (!memorystatus_can_freeze_processes()) { + return FALSE; + } + assert(VM_CONFIG_COMPRESSOR_IS_PRESENT); + + if (!VM_CONFIG_FREEZER_SWAP_IS_ACTIVE) { + /* + * In-core compressor used for freezing WITHOUT on-disk swap support. + */ + if (vm_compressor_low_on_space()) { + if (*memorystatus_freeze_swap_low) { + *memorystatus_freeze_swap_low = TRUE; + } + + can_freeze = FALSE; + } else { + if (*memorystatus_freeze_swap_low) { + *memorystatus_freeze_swap_low = FALSE; + } + + can_freeze = TRUE; + } + } else { + /* + * Freezing WITH on-disk swap support. + * + * In-core compressor fronts the swap. + */ + if (vm_swap_low_on_space()) { + if (*memorystatus_freeze_swap_low) { + *memorystatus_freeze_swap_low = TRUE; + } + + can_freeze = FALSE; + } + } + + return can_freeze; +} + +/* + * This function evaluates if the currently frozen processes deserve + * to stay in the higher jetsam band. There are 2 modes: + * - 'force one == TRUE': (urgent mode) + * We are out of budget and can't refreeze a process. The process's + * state, if it was resumed, will stay in compressed memory. If we let it + * remain up in the higher frozen jetsam band, it'll put a lot of pressure on + * the lower bands. So we force-demote the least-recently-used-and-thawed + * process. + * + * - 'force_one == FALSE': (normal mode) + * If the # of thaws of a process is below our threshold, then we + * will demote that process into the IDLE band. + * We don't immediately kill the process here because it already has + * state on disk and so it might be worth giving it another shot at + * getting thawed/resumed and used. + */ +static void +memorystatus_demote_frozen_processes(boolean_t force_one) +{ + unsigned int band = (unsigned int) memorystatus_freeze_jetsam_band; + unsigned int demoted_proc_count = 0; + proc_t p = PROC_NULL, next_p = PROC_NULL; + /* We demote to IDLE unless someone has asserted a higher priority on this process. */ + int maxpriority = JETSAM_PRIORITY_IDLE; + + proc_list_lock(); + + if (memorystatus_freeze_enabled == FALSE) { + /* + * Freeze has been disabled likely to + * reclaim swap space. So don't change + * any state on the frozen processes. + */ + proc_list_unlock(); + return; + } + + next_p = memorystatus_get_first_proc_locked(&band, FALSE); + while (next_p) { + p = next_p; + next_p = memorystatus_get_next_proc_locked(&band, p, FALSE); + + if ((p->p_memstat_state & P_MEMSTAT_FROZEN) == FALSE) { + continue; + } + + if (p->p_memstat_state & P_MEMSTAT_LOCKED) { + continue; + } + + if (force_one == TRUE) { + if ((p->p_memstat_state & P_MEMSTAT_REFREEZE_ELIGIBLE) == 0) { + /* + * This process hasn't been thawed recently and so most of + * its state sits on NAND and so we skip it -- jetsamming it + * won't help with memory pressure. + */ + continue; + } + } else { + if (p->p_memstat_thaw_count >= memorystatus_thaw_count_demotion_threshold) { + /* + * This process has met / exceeded our thaw count demotion threshold + * and so we let it live in the higher bands. + */ + continue; + } + } + + p->p_memstat_state &= ~P_MEMSTAT_USE_ELEVATED_INACTIVE_BAND; + memorystatus_invalidate_idle_demotion_locked(p, TRUE); + + maxpriority = MAX(p->p_memstat_assertionpriority, maxpriority); + memorystatus_update_priority_locked(p, maxpriority, FALSE, FALSE); +#if DEVELOPMENT || DEBUG + os_log_with_startup_serial(OS_LOG_DEFAULT, "memorystatus_demote_frozen_process(%s) pid %d [%s]", + (force_one ? "urgent" : "normal"), (p ? p->p_pid : -1), ((p && *p->p_name) ? p->p_name : "unknown")); +#endif /* DEVELOPMENT || DEBUG */ + + /* + * The freezer thread will consider this a normal app to be frozen + * because it is in the IDLE band. So we don't need the + * P_MEMSTAT_REFREEZE_ELIGIBLE state here. Also, if it gets resumed + * we'll correctly count it as eligible for re-freeze again. + * + * We don't drop the frozen count because this process still has + * state on disk. So there's a chance it gets resumed and then it + * should land in the higher jetsam band. For that it needs to + * remain marked frozen. + */ + if (p->p_memstat_state & P_MEMSTAT_REFREEZE_ELIGIBLE) { + p->p_memstat_state &= ~P_MEMSTAT_REFREEZE_ELIGIBLE; + memorystatus_refreeze_eligible_count--; + } + + demoted_proc_count++; + + if ((force_one == TRUE) || (demoted_proc_count == memorystatus_max_frozen_demotions_daily)) { + break; + } + } + + if (force_one == FALSE) { + /* + * We use this counter to track daily thaws. + * So we only reset it to 0 under the normal + * mode. + */ + memorystatus_thaw_count = 0; + } + + proc_list_unlock(); +} + + +/* + * This function will do 4 things: + * + * 1) check to see if we are currently in a degraded freezer mode, and if so: + * - check to see if our window has expired and we should exit this mode, OR, + * - return a budget based on the degraded throttle window's max. pageouts vs current pageouts. + * + * 2) check to see if we are in a NEW normal window and update the normal throttle window's params. + * + * 3) check what the current normal window allows for a budget. + * + * 4) calculate the current rate of pageouts for DEGRADED_WINDOW_MINS duration. If that rate is below + * what we would normally expect, then we are running low on our daily budget and need to enter + * degraded perf. mode. + */ + +static void +memorystatus_freeze_update_throttle(uint64_t *budget_pages_allowed) +{ + clock_sec_t sec; + clock_nsec_t nsec; + mach_timespec_t ts; + + unsigned int freeze_daily_pageouts_max = 0; + +#if DEVELOPMENT || DEBUG + if (!memorystatus_freeze_throttle_enabled) { + /* + * No throttling...we can use the full budget everytime. + */ + *budget_pages_allowed = UINT64_MAX; + return; + } +#endif + + clock_get_system_nanotime(&sec, &nsec); + ts.tv_sec = sec; + ts.tv_nsec = nsec; + + struct throttle_interval_t *interval = NULL; + + if (memorystatus_freeze_degradation == TRUE) { + interval = degraded_throttle_window; + + if (CMP_MACH_TIMESPEC(&ts, &interval->ts) >= 0) { + memorystatus_freeze_degradation = FALSE; + interval->pageouts = 0; + interval->max_pageouts = 0; + } else { + *budget_pages_allowed = interval->max_pageouts - interval->pageouts; + } + } + + interval = normal_throttle_window; + + if (CMP_MACH_TIMESPEC(&ts, &interval->ts) >= 0) { + /* + * New throttle window. + * Rollover any unused budget. + * Also ask the storage layer what the new budget needs to be. + */ + uint64_t freeze_daily_budget = 0; + unsigned int daily_budget_pageouts = 0; + + if (vm_swap_max_budget(&freeze_daily_budget)) { + memorystatus_freeze_daily_mb_max = (freeze_daily_budget / (1024 * 1024)); + os_log_with_startup_serial(OS_LOG_DEFAULT, "memorystatus: memorystatus_freeze_daily_mb_max set to %dMB\n", memorystatus_freeze_daily_mb_max); + } + + freeze_daily_pageouts_max = memorystatus_freeze_daily_mb_max * (1024 * 1024 / PAGE_SIZE); + + daily_budget_pageouts = (interval->burst_multiple * (((uint64_t)interval->mins * freeze_daily_pageouts_max) / NORMAL_WINDOW_MINS)); + interval->max_pageouts = (interval->max_pageouts - interval->pageouts) + daily_budget_pageouts; + + interval->ts.tv_sec = interval->mins * 60; + interval->ts.tv_nsec = 0; + ADD_MACH_TIMESPEC(&interval->ts, &ts); + /* Since we update the throttle stats pre-freeze, adjust for overshoot here */ + if (interval->pageouts > interval->max_pageouts) { + interval->pageouts -= interval->max_pageouts; + } else { + interval->pageouts = 0; + } + *budget_pages_allowed = interval->max_pageouts; + + memorystatus_demote_frozen_processes(FALSE); /* normal mode...don't force a demotion */ + } else { + /* + * Current throttle window. + * Deny freezing if we have no budget left. + * Try graceful degradation if we are within 25% of: + * - the daily budget, and + * - the current budget left is below our normal budget expectations. + */ + +#if DEVELOPMENT || DEBUG + /* + * This can only happen in the INTERNAL configs because we allow modifying the daily budget for testing. + */ + + if (freeze_daily_pageouts_max > interval->max_pageouts) { + /* + * We just bumped the daily budget. Re-evaluate our normal window params. + */ + interval->max_pageouts = (interval->burst_multiple * (((uint64_t)interval->mins * freeze_daily_pageouts_max) / NORMAL_WINDOW_MINS)); + memorystatus_freeze_degradation = FALSE; //we'll re-evaluate this below... + } +#endif /* DEVELOPMENT || DEBUG */ + + if (memorystatus_freeze_degradation == FALSE) { + if (interval->pageouts >= interval->max_pageouts) { + *budget_pages_allowed = 0; + } else { + int budget_left = interval->max_pageouts - interval->pageouts; + int budget_threshold = (freeze_daily_pageouts_max * FREEZE_DEGRADATION_BUDGET_THRESHOLD) / 100; + + mach_timespec_t time_left = {0, 0}; + + time_left.tv_sec = interval->ts.tv_sec; + time_left.tv_nsec = 0; + + SUB_MACH_TIMESPEC(&time_left, &ts); + + if (budget_left <= budget_threshold) { + /* + * For the current normal window, calculate how much we would pageout in a DEGRADED_WINDOW_MINS duration. + * And also calculate what we would pageout for the same DEGRADED_WINDOW_MINS duration if we had the full + * daily pageout budget. + */ + + unsigned int current_budget_rate_allowed = ((budget_left / time_left.tv_sec) / 60) * DEGRADED_WINDOW_MINS; + unsigned int normal_budget_rate_allowed = (freeze_daily_pageouts_max / NORMAL_WINDOW_MINS) * DEGRADED_WINDOW_MINS; + + /* + * The current rate of pageouts is below what we would expect for + * the normal rate i.e. we have below normal budget left and so... + */ + + if (current_budget_rate_allowed < normal_budget_rate_allowed) { + memorystatus_freeze_degradation = TRUE; + degraded_throttle_window->max_pageouts = current_budget_rate_allowed; + degraded_throttle_window->pageouts = 0; + + /* + * Switch over to the degraded throttle window so the budget + * doled out is based on that window. + */ + interval = degraded_throttle_window; + } + } + + *budget_pages_allowed = interval->max_pageouts - interval->pageouts; + } + } + } + + MEMORYSTATUS_DEBUG(1, "memorystatus_freeze_update_throttle_interval: throttle updated - %d frozen (%d max) within %dm; %dm remaining; throttle %s\n", + interval->pageouts, interval->max_pageouts, interval->mins, (interval->ts.tv_sec - ts->tv_sec) / 60, + interval->throttle ? "on" : "off"); +} + +static void +memorystatus_freeze_thread(void *param __unused, wait_result_t wr __unused) +{ + static boolean_t memorystatus_freeze_swap_low = FALSE; + + lck_mtx_lock(&freezer_mutex); + + if (memorystatus_freeze_enabled) { + if ((memorystatus_frozen_count < memorystatus_frozen_processes_max) || + (memorystatus_refreeze_eligible_count >= MIN_THAW_REFREEZE_THRESHOLD)) { + if (memorystatus_can_freeze(&memorystatus_freeze_swap_low)) { + /* Only freeze if we've not exceeded our pageout budgets.*/ + memorystatus_freeze_update_throttle(&memorystatus_freeze_budget_pages_remaining); + + if (memorystatus_freeze_budget_pages_remaining) { + memorystatus_freeze_top_process(); + } else { + memorystatus_demote_frozen_processes(TRUE); /* urgent mode..force one demotion */ + } + } + } + } + + /* + * We use memorystatus_apps_idle_delay_time because if/when we adopt aging for applications, + * it'll tie neatly into running the freezer once we age an application. + * + * Till then, it serves as a good interval that can be tuned via a sysctl too. + */ + memorystatus_freezer_thread_next_run_ts = mach_absolute_time() + memorystatus_apps_idle_delay_time; + + assert_wait((event_t) &memorystatus_freeze_wakeup, THREAD_UNINT); + lck_mtx_unlock(&freezer_mutex); + + thread_block((thread_continue_t) memorystatus_freeze_thread); +} + +boolean_t +memorystatus_freeze_thread_should_run(void) +{ + /* + * No freezer_mutex held here...see why near call-site + * within memorystatus_pages_update(). + */ + + boolean_t should_run = FALSE; + + if (memorystatus_freeze_enabled == FALSE) { + goto out; + } + + if (memorystatus_available_pages > memorystatus_freeze_threshold) { + goto out; + } + + if ((memorystatus_frozen_count >= memorystatus_frozen_processes_max) && + (memorystatus_refreeze_eligible_count < MIN_THAW_REFREEZE_THRESHOLD)) { + goto out; + } + + if (memorystatus_frozen_shared_mb_max && (memorystatus_frozen_shared_mb >= memorystatus_frozen_shared_mb_max)) { + goto out; + } + + uint64_t curr_time = mach_absolute_time(); + + if (curr_time < memorystatus_freezer_thread_next_run_ts) { + goto out; + } + + should_run = TRUE; + +out: + return should_run; +} + +int +memorystatus_get_process_is_freezable(pid_t pid, int *is_freezable) +{ + proc_t p = PROC_NULL; + + if (pid == 0) { + return EINVAL; + } + + p = proc_find(pid); + if (!p) { + return ESRCH; + } + + /* + * Only allow this on the current proc for now. + * We can check for privileges and allow targeting another process in the future. + */ + if (p != current_proc()) { + proc_rele(p); + return EPERM; + } + + proc_list_lock(); + *is_freezable = ((p->p_memstat_state & P_MEMSTAT_FREEZE_DISABLED) ? 0 : 1); + proc_rele_locked(p); + proc_list_unlock(); + + return 0; +} + +int +memorystatus_set_process_is_freezable(pid_t pid, boolean_t is_freezable) +{ + proc_t p = PROC_NULL; + + if (pid == 0) { + return EINVAL; + } + + /* + * To enable freezable status, you need to be root or an entitlement. + */ + if (is_freezable && + !kauth_cred_issuser(kauth_cred_get()) && + !IOTaskHasEntitlement(current_task(), MEMORYSTATUS_ENTITLEMENT)) { + return EPERM; + } + + p = proc_find(pid); + if (!p) { + return ESRCH; + } + + /* + * A process can change its own status. A coalition leader can + * change the status of coalition members. + */ + if (p != current_proc()) { + coalition_t coal = task_get_coalition(proc_task(p), COALITION_TYPE_JETSAM); + if (!coalition_is_leader(proc_task(current_proc()), coal)) { + proc_rele(p); + return EPERM; + } + } + + proc_list_lock(); + if (is_freezable == FALSE) { + /* Freeze preference set to FALSE. Set the P_MEMSTAT_FREEZE_DISABLED bit. */ + p->p_memstat_state |= P_MEMSTAT_FREEZE_DISABLED; + printf("memorystatus_set_process_is_freezable: disabling freeze for pid %d [%s]\n", + p->p_pid, (*p->p_name ? p->p_name : "unknown")); + } else { + p->p_memstat_state &= ~P_MEMSTAT_FREEZE_DISABLED; + printf("memorystatus_set_process_is_freezable: enabling freeze for pid %d [%s]\n", + p->p_pid, (*p->p_name ? p->p_name : "unknown")); + } + proc_rele_locked(p); + proc_list_unlock(); + + return 0; +} + +static int +sysctl_memorystatus_do_fastwake_warmup_all SYSCTL_HANDLER_ARGS +{ +#pragma unused(oidp, arg1, arg2) + + if (!req->newptr) { + return EINVAL; + } + + /* Need to be root or have entitlement */ + if (!kauth_cred_issuser(kauth_cred_get()) && !IOTaskHasEntitlement(current_task(), MEMORYSTATUS_ENTITLEMENT)) { + return EPERM; + } + + if (memorystatus_freeze_enabled == FALSE) { + return ENOTSUP; + } + + do_fastwake_warmup_all(); + + return 0; +} + +SYSCTL_PROC(_kern, OID_AUTO, memorystatus_do_fastwake_warmup_all, CTLTYPE_INT | CTLFLAG_WR | CTLFLAG_LOCKED | CTLFLAG_MASKED, + 0, 0, &sysctl_memorystatus_do_fastwake_warmup_all, "I", ""); + +#endif /* CONFIG_FREEZE */ diff --git a/bsd/kern/kern_memorystatus_notify.c b/bsd/kern/kern_memorystatus_notify.c new file mode 100644 index 000000000..c5be3d0d8 --- /dev/null +++ b/bsd/kern/kern_memorystatus_notify.c @@ -0,0 +1,1585 @@ +/* + * Copyright (c) 2006-2018 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#if CONFIG_FREEZE +#include +#endif /* CONFIG_FREEZE */ + +#include +#include + +/* + * Memorystatus klist structures + */ +struct klist memorystatus_klist; +static lck_mtx_t memorystatus_klist_mutex; +static void memorystatus_klist_lock(void); +static void memorystatus_klist_unlock(void); + +/* + * Memorystatus kevent filter routines + */ +static int filt_memorystatusattach(struct knote *kn, struct kevent_qos_s *kev); +static void filt_memorystatusdetach(struct knote *kn); +static int filt_memorystatus(struct knote *kn, long hint); +static int filt_memorystatustouch(struct knote *kn, struct kevent_qos_s *kev); +static int filt_memorystatusprocess(struct knote *kn, struct kevent_qos_s *kev); + +SECURITY_READ_ONLY_EARLY(struct filterops) memorystatus_filtops = { + .f_attach = filt_memorystatusattach, + .f_detach = filt_memorystatusdetach, + .f_event = filt_memorystatus, + .f_touch = filt_memorystatustouch, + .f_process = filt_memorystatusprocess, +}; + +/* + * Memorystatus notification events + */ +enum { + kMemorystatusNoPressure = 0x1, + kMemorystatusPressure = 0x2, + kMemorystatusLowSwap = 0x4, + kMemorystatusProcLimitWarn = 0x8, + kMemorystatusProcLimitCritical = 0x10 +}; + +#define INTER_NOTIFICATION_DELAY (250000) /* .25 second */ +#define VM_PRESSURE_DECREASED_SMOOTHING_PERIOD 5000 /* milliseconds */ +#define WARNING_NOTIFICATION_RESTING_PERIOD 25 /* seconds */ +#define CRITICAL_NOTIFICATION_RESTING_PERIOD 25 /* seconds */ + +/* + * Memorystatus notification helper routines + */ +static vm_pressure_level_t convert_internal_pressure_level_to_dispatch_level(vm_pressure_level_t); +static boolean_t is_knote_registered_modify_task_pressure_bits(struct knote*, int, task_t, vm_pressure_level_t, vm_pressure_level_t); +static void memorystatus_klist_reset_all_for_level(vm_pressure_level_t pressure_level_to_clear); +static struct knote *vm_pressure_select_optimal_candidate_to_notify(struct klist *candidate_list, int level, boolean_t target_foreground_process); +static void vm_dispatch_memory_pressure(void); +kern_return_t memorystatus_update_vm_pressure(boolean_t target_foreground_process); + +#if VM_PRESSURE_EVENTS + +/* + * This value is the threshold that a process must meet to be considered for scavenging. + */ +#if CONFIG_EMBEDDED +#define VM_PRESSURE_MINIMUM_RSIZE 6 /* MB */ +#else /* CONFIG_EMBEDDED */ +#define VM_PRESSURE_MINIMUM_RSIZE 10 /* MB */ +#endif /* CONFIG_EMBEDDED */ + +static uint32_t vm_pressure_task_footprint_min = VM_PRESSURE_MINIMUM_RSIZE; + +#if DEVELOPMENT || DEBUG +SYSCTL_UINT(_kern, OID_AUTO, memorystatus_vm_pressure_task_footprint_min, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_pressure_task_footprint_min, 0, ""); +#endif /* DEVELOPMENT || DEBUG */ + +vm_pressure_level_t memorystatus_vm_pressure_level = kVMPressureNormal; + +/* + * We use this flag to signal if we have any HWM offenders + * on the system. This way we can reduce the number of wakeups + * of the memorystatus_thread when the system is between the + * "pressure" and "critical" threshold. + * + * The (re-)setting of this variable is done without any locks + * or synchronization simply because it is not possible (currently) + * to keep track of HWM offenders that drop down below their memory + * limit and/or exit. So, we choose to burn a couple of wasted wakeups + * by allowing the unguarded modification of this variable. + */ +boolean_t memorystatus_hwm_candidates = 0; + +#endif /* VM_PRESSURE_EVENTS */ + +#if CONFIG_JETSAM + +extern unsigned int memorystatus_available_pages; +extern unsigned int memorystatus_available_pages_pressure; +extern unsigned int memorystatus_available_pages_critical; +extern unsigned int memorystatus_available_pages_critical_base; +extern unsigned int memorystatus_available_pages_critical_idle_offset; + +#else /* CONFIG_JETSAM */ + +extern uint64_t memorystatus_available_pages; +extern uint64_t memorystatus_available_pages_pressure; +extern uint64_t memorystatus_available_pages_critical; + +#endif /* CONFIG_JETSAM */ + +extern lck_mtx_t memorystatus_jetsam_fg_band_lock; +uint32_t memorystatus_jetsam_fg_band_waiters = 0; +static uint64_t memorystatus_jetsam_fg_band_timestamp_ns = 0; /* nanosec */ +static uint64_t memorystatus_jetsam_fg_band_delay_ns = 5ull * 1000 * 1000 * 1000; /* nanosec */ + +extern boolean_t(*volatile consider_buffer_cache_collect)(int); + +#if DEVELOPMENT || DEBUG +SYSCTL_QUAD(_kern, OID_AUTO, memorystatus_jetsam_fg_band_delay_ns, CTLFLAG_RW | CTLFLAG_LOCKED, + &memorystatus_jetsam_fg_band_delay_ns, ""); +#endif + +static int +filt_memorystatusattach(struct knote *kn, __unused struct kevent_qos_s *kev) +{ + int error; + + kn->kn_flags |= EV_CLEAR; /* automatically set */ + kn->kn_sdata = 0; /* incoming data is ignored */ + + error = memorystatus_knote_register(kn); + if (error) { + knote_set_error(kn, error); + } + return 0; +} + +static void +filt_memorystatusdetach(struct knote *kn) +{ + memorystatus_knote_unregister(kn); +} + +static int +filt_memorystatus(struct knote *kn __unused, long hint) +{ + if (hint) { + switch (hint) { + case kMemorystatusNoPressure: + if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PRESSURE_NORMAL) { + kn->kn_fflags = NOTE_MEMORYSTATUS_PRESSURE_NORMAL; + } + break; + case kMemorystatusPressure: + if (memorystatus_vm_pressure_level == kVMPressureWarning || memorystatus_vm_pressure_level == kVMPressureUrgent) { + if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PRESSURE_WARN) { + kn->kn_fflags = NOTE_MEMORYSTATUS_PRESSURE_WARN; + } + } else if (memorystatus_vm_pressure_level == kVMPressureCritical) { + if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PRESSURE_CRITICAL) { + kn->kn_fflags = NOTE_MEMORYSTATUS_PRESSURE_CRITICAL; + } + } + break; + case kMemorystatusLowSwap: + if (kn->kn_sfflags & NOTE_MEMORYSTATUS_LOW_SWAP) { + kn->kn_fflags = NOTE_MEMORYSTATUS_LOW_SWAP; + } + break; + + case kMemorystatusProcLimitWarn: + if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_WARN) { + kn->kn_fflags = NOTE_MEMORYSTATUS_PROC_LIMIT_WARN; + } + break; + + case kMemorystatusProcLimitCritical: + if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL) { + kn->kn_fflags = NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL; + } + break; + + default: + break; + } + } + +#if 0 + if (kn->kn_fflags != 0) { + proc_t knote_proc = knote_get_kq(kn)->kq_p; + pid_t knote_pid = knote_proc->p_pid; + + printf("filt_memorystatus: sending kn 0x%lx (event 0x%x) for pid (%d)\n", + (unsigned long)kn, kn->kn_fflags, knote_pid); + } +#endif + + return kn->kn_fflags != 0; +} + +static int +filt_memorystatustouch(struct knote *kn, struct kevent_qos_s *kev) +{ + int res; + int prev_kn_sfflags = 0; + + memorystatus_klist_lock(); + + /* + * copy in new kevent settings + * (saving the "desired" data and fflags). + */ + + prev_kn_sfflags = kn->kn_sfflags; + kn->kn_sfflags = (kev->fflags & EVFILT_MEMORYSTATUS_ALL_MASK); + +#if !CONFIG_EMBEDDED + /* + * Only on desktop do we restrict notifications to + * one per active/inactive state (soft limits only). + */ + if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_WARN) { + /* + * Is there previous state to preserve? + */ + if (prev_kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_WARN) { + /* + * This knote was previously interested in proc_limit_warn, + * so yes, preserve previous state. + */ + if (prev_kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_WARN_ACTIVE) { + kn->kn_sfflags |= NOTE_MEMORYSTATUS_PROC_LIMIT_WARN_ACTIVE; + } + if (prev_kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_WARN_INACTIVE) { + kn->kn_sfflags |= NOTE_MEMORYSTATUS_PROC_LIMIT_WARN_INACTIVE; + } + } else { + /* + * This knote was not previously interested in proc_limit_warn, + * but it is now. Set both states. + */ + kn->kn_sfflags |= NOTE_MEMORYSTATUS_PROC_LIMIT_WARN_ACTIVE; + kn->kn_sfflags |= NOTE_MEMORYSTATUS_PROC_LIMIT_WARN_INACTIVE; + } + } + + if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL) { + /* + * Is there previous state to preserve? + */ + if (prev_kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL) { + /* + * This knote was previously interested in proc_limit_critical, + * so yes, preserve previous state. + */ + if (prev_kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL_ACTIVE) { + kn->kn_sfflags |= NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL_ACTIVE; + } + if (prev_kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL_INACTIVE) { + kn->kn_sfflags |= NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL_INACTIVE; + } + } else { + /* + * This knote was not previously interested in proc_limit_critical, + * but it is now. Set both states. + */ + kn->kn_sfflags |= NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL_ACTIVE; + kn->kn_sfflags |= NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL_INACTIVE; + } + } +#endif /* !CONFIG_EMBEDDED */ + + /* + * reset the output flags based on a + * combination of the old events and + * the new desired event list. + */ + //kn->kn_fflags &= kn->kn_sfflags; + + res = (kn->kn_fflags != 0); + + memorystatus_klist_unlock(); + + return res; +} + +static int +filt_memorystatusprocess(struct knote *kn, struct kevent_qos_s *kev) +{ + int res = 0; + + memorystatus_klist_lock(); + if (kn->kn_fflags) { + knote_fill_kevent(kn, kev, 0); + res = 1; + } + memorystatus_klist_unlock(); + + return res; +} + +static void +memorystatus_klist_lock(void) +{ + lck_mtx_lock(&memorystatus_klist_mutex); +} + +static void +memorystatus_klist_unlock(void) +{ + lck_mtx_unlock(&memorystatus_klist_mutex); +} + +void +memorystatus_kevent_init(lck_grp_t *grp, lck_attr_t *attr) +{ + lck_mtx_init(&memorystatus_klist_mutex, grp, attr); + klist_init(&memorystatus_klist); +} + +int +memorystatus_knote_register(struct knote *kn) +{ + int error = 0; + + memorystatus_klist_lock(); + + /* + * Support only userspace visible flags. + */ + if ((kn->kn_sfflags & EVFILT_MEMORYSTATUS_ALL_MASK) == (unsigned int) kn->kn_sfflags) { +#if !CONFIG_EMBEDDED + if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_WARN) { + kn->kn_sfflags |= NOTE_MEMORYSTATUS_PROC_LIMIT_WARN_ACTIVE; + kn->kn_sfflags |= NOTE_MEMORYSTATUS_PROC_LIMIT_WARN_INACTIVE; + } + + if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL) { + kn->kn_sfflags |= NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL_ACTIVE; + kn->kn_sfflags |= NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL_INACTIVE; + } +#endif /* !CONFIG_EMBEDDED */ + + KNOTE_ATTACH(&memorystatus_klist, kn); + } else { + error = ENOTSUP; + } + + memorystatus_klist_unlock(); + + return error; +} + +void +memorystatus_knote_unregister(struct knote *kn __unused) +{ + memorystatus_klist_lock(); + KNOTE_DETACH(&memorystatus_klist, kn); + memorystatus_klist_unlock(); +} + +#if VM_PRESSURE_EVENTS + +#if CONFIG_MEMORYSTATUS + +int +memorystatus_send_note(int event_code, void *data, size_t data_length) +{ + int ret; + struct kev_msg ev_msg; + + ev_msg.vendor_code = KEV_VENDOR_APPLE; + ev_msg.kev_class = KEV_SYSTEM_CLASS; + ev_msg.kev_subclass = KEV_MEMORYSTATUS_SUBCLASS; + + ev_msg.event_code = event_code; + + ev_msg.dv[0].data_length = data_length; + ev_msg.dv[0].data_ptr = data; + ev_msg.dv[1].data_length = 0; + + ret = kev_post_msg(&ev_msg); + if (ret) { + printf("%s: kev_post_msg() failed, err %d\n", __func__, ret); + } + + return ret; +} + +boolean_t +memorystatus_warn_process(pid_t pid, __unused boolean_t is_active, __unused boolean_t is_fatal, boolean_t limit_exceeded) +{ + boolean_t ret = FALSE; + boolean_t found_knote = FALSE; + struct knote *kn = NULL; + int send_knote_count = 0; + + /* + * See comment in sysctl_memorystatus_vm_pressure_send. + */ + + memorystatus_klist_lock(); + + SLIST_FOREACH(kn, &memorystatus_klist, kn_selnext) { + proc_t knote_proc = knote_get_kq(kn)->kq_p; + pid_t knote_pid = knote_proc->p_pid; + + if (knote_pid == pid) { + /* + * By setting the "fflags" here, we are forcing + * a process to deal with the case where it's + * bumping up into its memory limits. If we don't + * do this here, we will end up depending on the + * system pressure snapshot evaluation in + * filt_memorystatus(). + */ + +#if CONFIG_EMBEDDED + if (!limit_exceeded) { + /* + * Intentionally set either the unambiguous limit warning, + * the system-wide critical or the system-wide warning + * notification bit. + */ + + if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_WARN) { + kn->kn_fflags = NOTE_MEMORYSTATUS_PROC_LIMIT_WARN; + found_knote = TRUE; + send_knote_count++; + } else if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PRESSURE_CRITICAL) { + kn->kn_fflags = NOTE_MEMORYSTATUS_PRESSURE_CRITICAL; + found_knote = TRUE; + send_knote_count++; + } else if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PRESSURE_WARN) { + kn->kn_fflags = NOTE_MEMORYSTATUS_PRESSURE_WARN; + found_knote = TRUE; + send_knote_count++; + } + } else { + /* + * Send this notification when a process has exceeded a soft limit. + */ + if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL) { + kn->kn_fflags = NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL; + found_knote = TRUE; + send_knote_count++; + } + } +#else /* CONFIG_EMBEDDED */ + if (!limit_exceeded) { + /* + * Processes on desktop are not expecting to handle a system-wide + * critical or system-wide warning notification from this path. + * Intentionally set only the unambiguous limit warning here. + * + * If the limit is soft, however, limit this to one notification per + * active/inactive limit (per each registered listener). + */ + + if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_WARN) { + found_knote = TRUE; + if (!is_fatal) { + /* + * Restrict proc_limit_warn notifications when + * non-fatal (soft) limit is at play. + */ + if (is_active) { + if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_WARN_ACTIVE) { + /* + * Mark this knote for delivery. + */ + kn->kn_fflags = NOTE_MEMORYSTATUS_PROC_LIMIT_WARN; + /* + * And suppress it from future notifications. + */ + kn->kn_sfflags &= ~NOTE_MEMORYSTATUS_PROC_LIMIT_WARN_ACTIVE; + send_knote_count++; + } + } else { + if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_WARN_INACTIVE) { + /* + * Mark this knote for delivery. + */ + kn->kn_fflags = NOTE_MEMORYSTATUS_PROC_LIMIT_WARN; + /* + * And suppress it from future notifications. + */ + kn->kn_sfflags &= ~NOTE_MEMORYSTATUS_PROC_LIMIT_WARN_INACTIVE; + send_knote_count++; + } + } + } else { + /* + * No restriction on proc_limit_warn notifications when + * fatal (hard) limit is at play. + */ + kn->kn_fflags = NOTE_MEMORYSTATUS_PROC_LIMIT_WARN; + send_knote_count++; + } + } + } else { + /* + * Send this notification when a process has exceeded a soft limit, + */ + + if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL) { + found_knote = TRUE; + if (!is_fatal) { + /* + * Restrict critical notifications for soft limits. + */ + + if (is_active) { + if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL_ACTIVE) { + /* + * Suppress future proc_limit_critical notifications + * for the active soft limit. + */ + kn->kn_sfflags &= ~NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL_ACTIVE; + kn->kn_fflags = NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL; + send_knote_count++; + } + } else { + if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL_INACTIVE) { + /* + * Suppress future proc_limit_critical_notifications + * for the inactive soft limit. + */ + kn->kn_sfflags &= ~NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL_INACTIVE; + kn->kn_fflags = NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL; + send_knote_count++; + } + } + } else { + /* + * We should never be trying to send a critical notification for + * a hard limit... the process would be killed before it could be + * received. + */ + panic("Caught sending pid %d a critical warning for a fatal limit.\n", pid); + } + } + } +#endif /* CONFIG_EMBEDDED */ + } + } + + if (found_knote) { + if (send_knote_count > 0) { + KNOTE(&memorystatus_klist, 0); + } + ret = TRUE; + } + + memorystatus_klist_unlock(); + + return ret; +} + +/* + * Can only be set by the current task on itself. + */ +int +memorystatus_low_mem_privileged_listener(uint32_t op_flags) +{ + boolean_t set_privilege = FALSE; + /* + * Need an entitlement check here? + */ + if (op_flags == MEMORYSTATUS_CMD_PRIVILEGED_LISTENER_ENABLE) { + set_privilege = TRUE; + } else if (op_flags == MEMORYSTATUS_CMD_PRIVILEGED_LISTENER_DISABLE) { + set_privilege = FALSE; + } else { + return EINVAL; + } + + return task_low_mem_privileged_listener(current_task(), set_privilege, NULL); +} + +int +memorystatus_send_pressure_note(pid_t pid) +{ + MEMORYSTATUS_DEBUG(1, "memorystatus_send_pressure_note(): pid %d\n", pid); + return memorystatus_send_note(kMemorystatusPressureNote, &pid, sizeof(pid)); +} + +boolean_t +memorystatus_is_foreground_locked(proc_t p) +{ + return (p->p_memstat_effectivepriority == JETSAM_PRIORITY_FOREGROUND) || + (p->p_memstat_effectivepriority == JETSAM_PRIORITY_FOREGROUND_SUPPORT); +} + +/* + * This is meant for stackshot and kperf -- it does not take the proc_list_lock + * to access the p_memstat_dirty field. + */ +void +memorystatus_proc_flags_unsafe(void * v, boolean_t *is_dirty, boolean_t *is_dirty_tracked, boolean_t *allow_idle_exit) +{ + if (!v) { + *is_dirty = FALSE; + *is_dirty_tracked = FALSE; + *allow_idle_exit = FALSE; + } else { + proc_t p = (proc_t)v; + *is_dirty = (p->p_memstat_dirty & P_DIRTY_IS_DIRTY) != 0; + *is_dirty_tracked = (p->p_memstat_dirty & P_DIRTY_TRACK) != 0; + *allow_idle_exit = (p->p_memstat_dirty & P_DIRTY_ALLOW_IDLE_EXIT) != 0; + } +} + +boolean_t +memorystatus_bg_pressure_eligible(proc_t p) +{ + boolean_t eligible = FALSE; + + proc_list_lock(); + + MEMORYSTATUS_DEBUG(1, "memorystatus_bg_pressure_eligible: pid %d, state 0x%x\n", p->p_pid, p->p_memstat_state); + + /* Foreground processes have already been dealt with at this point, so just test for eligibility */ + if (!(p->p_memstat_state & (P_MEMSTAT_TERMINATED | P_MEMSTAT_LOCKED | P_MEMSTAT_SUSPENDED | P_MEMSTAT_FROZEN))) { + eligible = TRUE; + } + + if (p->p_memstat_effectivepriority < JETSAM_PRIORITY_BACKGROUND_OPPORTUNISTIC) { + /* + * IDLE and IDLE_DEFERRED bands contain processes + * that have dropped memory to be under their inactive + * memory limits. And so they can't really give back + * anything. + */ + eligible = FALSE; + } + + proc_list_unlock(); + + return eligible; +} + +void +memorystatus_send_low_swap_note(void) +{ + struct knote *kn = NULL; + + memorystatus_klist_lock(); + SLIST_FOREACH(kn, &memorystatus_klist, kn_selnext) { + /* We call is_knote_registered_modify_task_pressure_bits to check if the sfflags for the + * current note contain NOTE_MEMORYSTATUS_LOW_SWAP. Once we find one note in the memorystatus_klist + * that has the NOTE_MEMORYSTATUS_LOW_SWAP flags in its sfflags set, we call KNOTE with + * kMemoryStatusLowSwap as the hint to process and update all knotes on the memorystatus_klist accordingly. */ + if (is_knote_registered_modify_task_pressure_bits(kn, NOTE_MEMORYSTATUS_LOW_SWAP, NULL, 0, 0) == TRUE) { + KNOTE(&memorystatus_klist, kMemorystatusLowSwap); + break; + } + } + + memorystatus_klist_unlock(); +} + +#endif /* CONFIG_MEMORYSTATUS */ + +/* + * kn_max - knote + * + * knote_pressure_level - to check if the knote is registered for this notification level. + * + * task - task whose bits we'll be modifying + * + * pressure_level_to_clear - if the task has been notified of this past level, clear that notification bit so that if/when we revert to that level, the task will be notified again. + * + * pressure_level_to_set - the task is about to be notified of this new level. Update the task's bit notification information appropriately. + * + */ + +static boolean_t +is_knote_registered_modify_task_pressure_bits(struct knote *kn_max, int knote_pressure_level, task_t task, vm_pressure_level_t pressure_level_to_clear, vm_pressure_level_t pressure_level_to_set) +{ + if (kn_max->kn_sfflags & knote_pressure_level) { + if (pressure_level_to_clear && task_has_been_notified(task, pressure_level_to_clear) == TRUE) { + task_clear_has_been_notified(task, pressure_level_to_clear); + } + + task_mark_has_been_notified(task, pressure_level_to_set); + return TRUE; + } + + return FALSE; +} + +static void +memorystatus_klist_reset_all_for_level(vm_pressure_level_t pressure_level_to_clear) +{ + struct knote *kn = NULL; + + memorystatus_klist_lock(); + SLIST_FOREACH(kn, &memorystatus_klist, kn_selnext) { + proc_t p = PROC_NULL; + struct task* t = TASK_NULL; + + p = knote_get_kq(kn)->kq_p; + proc_list_lock(); + if (p != proc_ref_locked(p)) { + p = PROC_NULL; + proc_list_unlock(); + continue; + } + proc_list_unlock(); + + t = (struct task *)(p->task); + + task_clear_has_been_notified(t, pressure_level_to_clear); + + proc_rele(p); + } + + memorystatus_klist_unlock(); +} + +/* + * Used by the vm_pressure_thread which is + * signalled from within vm_pageout_scan(). + */ + +void +consider_vm_pressure_events(void) +{ + vm_dispatch_memory_pressure(); +} + +static void +vm_dispatch_memory_pressure(void) +{ + memorystatus_update_vm_pressure(FALSE); +} + +static struct knote * +vm_pressure_select_optimal_candidate_to_notify(struct klist *candidate_list, int level, boolean_t target_foreground_process) +{ + struct knote *kn = NULL, *kn_max = NULL; + uint64_t resident_max = 0;/* MB */ + struct timeval curr_tstamp = {0, 0}; + int elapsed_msecs = 0; + int selected_task_importance = 0; + static int pressure_snapshot = -1; + boolean_t pressure_increase = FALSE; + + if (pressure_snapshot == -1) { + /* + * Initial snapshot. + */ + pressure_snapshot = level; + pressure_increase = TRUE; + } else { + if (level && (level >= pressure_snapshot)) { + pressure_increase = TRUE; + } else { + pressure_increase = FALSE; + } + + pressure_snapshot = level; + } + + if (pressure_increase == TRUE) { + /* + * We'll start by considering the largest + * unimportant task in our list. + */ + selected_task_importance = INT_MAX; + } else { + /* + * We'll start by considering the largest + * important task in our list. + */ + selected_task_importance = 0; + } + + microuptime(&curr_tstamp); + + SLIST_FOREACH(kn, candidate_list, kn_selnext) { + uint64_t resident_size = 0;/* MB */ + proc_t p = PROC_NULL; + struct task* t = TASK_NULL; + int curr_task_importance = 0; + boolean_t consider_knote = FALSE; + boolean_t privileged_listener = FALSE; + + p = knote_get_kq(kn)->kq_p; + proc_list_lock(); + if (p != proc_ref_locked(p)) { + p = PROC_NULL; + proc_list_unlock(); + continue; + } + proc_list_unlock(); + +#if CONFIG_MEMORYSTATUS + if (target_foreground_process == TRUE && !memorystatus_is_foreground_locked(p)) { + /* + * Skip process not marked foreground. + */ + proc_rele(p); + continue; + } +#endif /* CONFIG_MEMORYSTATUS */ + + t = (struct task *)(p->task); + + timevalsub(&curr_tstamp, &p->vm_pressure_last_notify_tstamp); + elapsed_msecs = curr_tstamp.tv_sec * 1000 + curr_tstamp.tv_usec / 1000; + + vm_pressure_level_t dispatch_level = convert_internal_pressure_level_to_dispatch_level(level); + + if ((kn->kn_sfflags & dispatch_level) == 0) { + proc_rele(p); + continue; + } + +#if CONFIG_MEMORYSTATUS + if (target_foreground_process == FALSE && !memorystatus_bg_pressure_eligible(p)) { + VM_PRESSURE_DEBUG(1, "[vm_pressure] skipping process %d\n", p->p_pid); + proc_rele(p); + continue; + } +#endif /* CONFIG_MEMORYSTATUS */ + +#if CONFIG_EMBEDDED + curr_task_importance = p->p_memstat_effectivepriority; +#else /* CONFIG_EMBEDDED */ + curr_task_importance = task_importance_estimate(t); +#endif /* CONFIG_EMBEDDED */ + + /* + * Privileged listeners are only considered in the multi-level pressure scheme + * AND only if the pressure is increasing. + */ + if (level > 0) { + if (task_has_been_notified(t, level) == FALSE) { + /* + * Is this a privileged listener? + */ + if (task_low_mem_privileged_listener(t, FALSE, &privileged_listener) == 0) { + if (privileged_listener) { + kn_max = kn; + proc_rele(p); + goto done_scanning; + } + } + } else { + proc_rele(p); + continue; + } + } else if (level == 0) { + /* + * Task wasn't notified when the pressure was increasing and so + * no need to notify it that the pressure is decreasing. + */ + if ((task_has_been_notified(t, kVMPressureWarning) == FALSE) && (task_has_been_notified(t, kVMPressureCritical) == FALSE)) { + proc_rele(p); + continue; + } + } + + /* + * We don't want a small process to block large processes from + * being notified again. + */ + resident_size = (get_task_phys_footprint(t)) / (1024 * 1024ULL); /* MB */ + + if (resident_size >= vm_pressure_task_footprint_min) { + if (level > 0) { + /* + * Warning or Critical Pressure. + */ + if (pressure_increase) { + if ((curr_task_importance < selected_task_importance) || + ((curr_task_importance == selected_task_importance) && (resident_size > resident_max))) { + /* + * We have found a candidate process which is: + * a) at a lower importance than the current selected process + * OR + * b) has importance equal to that of the current selected process but is larger + */ + + consider_knote = TRUE; + } + } else { + if ((curr_task_importance > selected_task_importance) || + ((curr_task_importance == selected_task_importance) && (resident_size > resident_max))) { + /* + * We have found a candidate process which is: + * a) at a higher importance than the current selected process + * OR + * b) has importance equal to that of the current selected process but is larger + */ + + consider_knote = TRUE; + } + } + } else if (level == 0) { + /* + * Pressure back to normal. + */ + if ((curr_task_importance > selected_task_importance) || + ((curr_task_importance == selected_task_importance) && (resident_size > resident_max))) { + consider_knote = TRUE; + } + } + + if (consider_knote) { + resident_max = resident_size; + kn_max = kn; + selected_task_importance = curr_task_importance; + consider_knote = FALSE; /* reset for the next candidate */ + } + } else { + /* There was no candidate with enough resident memory to scavenge */ + VM_PRESSURE_DEBUG(0, "[vm_pressure] threshold failed for pid %d with %llu resident...\n", p->p_pid, resident_size); + } + proc_rele(p); + } + +done_scanning: + if (kn_max) { + VM_DEBUG_CONSTANT_EVENT(vm_pressure_event, VM_PRESSURE_EVENT, DBG_FUNC_NONE, knote_get_kq(kn_max)->kq_p->p_pid, resident_max, 0, 0); + VM_PRESSURE_DEBUG(1, "[vm_pressure] sending event to pid %d with %llu resident\n", knote_get_kq(kn_max)->kq_p->p_pid, resident_max); + } + + return kn_max; +} + +static uint64_t next_warning_notification_sent_at_ts = 0; +static uint64_t next_critical_notification_sent_at_ts = 0; + +boolean_t memorystatus_manual_testing_on = FALSE; +vm_pressure_level_t memorystatus_manual_testing_level = kVMPressureNormal; + +kern_return_t +memorystatus_update_vm_pressure(boolean_t target_foreground_process) +{ + struct knote *kn_max = NULL; + struct knote *kn_cur = NULL, *kn_temp = NULL;/* for safe list traversal */ + pid_t target_pid = -1; + struct klist dispatch_klist = { NULL }; + proc_t target_proc = PROC_NULL; + struct task *task = NULL; + boolean_t found_candidate = FALSE; + + static vm_pressure_level_t level_snapshot = kVMPressureNormal; + static vm_pressure_level_t prev_level_snapshot = kVMPressureNormal; + boolean_t smoothing_window_started = FALSE; + struct timeval smoothing_window_start_tstamp = {0, 0}; + struct timeval curr_tstamp = {0, 0}; + int elapsed_msecs = 0; + uint64_t curr_ts = mach_absolute_time(); + +#if !CONFIG_JETSAM +#define MAX_IDLE_KILLS 100 /* limit the number of idle kills allowed */ + + int idle_kill_counter = 0; + + /* + * On desktop we take this opportunity to free up memory pressure + * by immediately killing idle exitable processes. We use a delay + * to avoid overkill. And we impose a max counter as a fail safe + * in case daemons re-launch too fast. + */ + while ((memorystatus_vm_pressure_level != kVMPressureNormal) && (idle_kill_counter < MAX_IDLE_KILLS)) { + if (memorystatus_idle_exit_from_VM() == FALSE) { + /* No idle exitable processes left to kill */ + break; + } + idle_kill_counter++; + + if (memorystatus_manual_testing_on == TRUE) { + /* + * Skip the delay when testing + * the pressure notification scheme. + */ + } else { + delay(1000000); /* 1 second */ + } + } +#endif /* !CONFIG_JETSAM */ + + if (level_snapshot != kVMPressureNormal) { + /* + * Check to see if we are still in the 'resting' period + * after having notified all clients interested in + * a particular pressure level. + */ + + level_snapshot = memorystatus_vm_pressure_level; + + if (level_snapshot == kVMPressureWarning || level_snapshot == kVMPressureUrgent) { + if (next_warning_notification_sent_at_ts) { + if (curr_ts < next_warning_notification_sent_at_ts) { + delay(INTER_NOTIFICATION_DELAY * 4 /* 1 sec */); + return KERN_SUCCESS; + } + + next_warning_notification_sent_at_ts = 0; + memorystatus_klist_reset_all_for_level(kVMPressureWarning); + } + } else if (level_snapshot == kVMPressureCritical) { + if (next_critical_notification_sent_at_ts) { + if (curr_ts < next_critical_notification_sent_at_ts) { + delay(INTER_NOTIFICATION_DELAY * 4 /* 1 sec */); + return KERN_SUCCESS; + } + next_critical_notification_sent_at_ts = 0; + memorystatus_klist_reset_all_for_level(kVMPressureCritical); + } + } + } + + while (1) { + /* + * There is a race window here. But it's not clear + * how much we benefit from having extra synchronization. + */ + level_snapshot = memorystatus_vm_pressure_level; + + if (prev_level_snapshot > level_snapshot) { + /* + * Pressure decreased? Let's take a little breather + * and see if this condition stays. + */ + if (smoothing_window_started == FALSE) { + smoothing_window_started = TRUE; + microuptime(&smoothing_window_start_tstamp); + } + + microuptime(&curr_tstamp); + timevalsub(&curr_tstamp, &smoothing_window_start_tstamp); + elapsed_msecs = curr_tstamp.tv_sec * 1000 + curr_tstamp.tv_usec / 1000; + + if (elapsed_msecs < VM_PRESSURE_DECREASED_SMOOTHING_PERIOD) { + delay(INTER_NOTIFICATION_DELAY); + continue; + } + } + + prev_level_snapshot = level_snapshot; + smoothing_window_started = FALSE; + + memorystatus_klist_lock(); + kn_max = vm_pressure_select_optimal_candidate_to_notify(&memorystatus_klist, level_snapshot, target_foreground_process); + + if (kn_max == NULL) { + memorystatus_klist_unlock(); + + /* + * No more level-based clients to notify. + * + * Start the 'resting' window within which clients will not be re-notified. + */ + + if (level_snapshot != kVMPressureNormal) { + if (level_snapshot == kVMPressureWarning || level_snapshot == kVMPressureUrgent) { + nanoseconds_to_absolutetime(WARNING_NOTIFICATION_RESTING_PERIOD * NSEC_PER_SEC, &curr_ts); + + /* Next warning notification (if nothing changes) won't be sent before...*/ + next_warning_notification_sent_at_ts = mach_absolute_time() + curr_ts; + } + + if (level_snapshot == kVMPressureCritical) { + nanoseconds_to_absolutetime(CRITICAL_NOTIFICATION_RESTING_PERIOD * NSEC_PER_SEC, &curr_ts); + + /* Next critical notification (if nothing changes) won't be sent before...*/ + next_critical_notification_sent_at_ts = mach_absolute_time() + curr_ts; + } + } + return KERN_FAILURE; + } + + target_proc = knote_get_kq(kn_max)->kq_p; + + proc_list_lock(); + if (target_proc != proc_ref_locked(target_proc)) { + target_proc = PROC_NULL; + proc_list_unlock(); + memorystatus_klist_unlock(); + continue; + } + proc_list_unlock(); + + target_pid = target_proc->p_pid; + + task = (struct task *)(target_proc->task); + + if (level_snapshot != kVMPressureNormal) { + if (level_snapshot == kVMPressureWarning || level_snapshot == kVMPressureUrgent) { + if (is_knote_registered_modify_task_pressure_bits(kn_max, NOTE_MEMORYSTATUS_PRESSURE_WARN, task, 0, kVMPressureWarning) == TRUE) { + found_candidate = TRUE; + } + } else { + if (level_snapshot == kVMPressureCritical) { + if (is_knote_registered_modify_task_pressure_bits(kn_max, NOTE_MEMORYSTATUS_PRESSURE_CRITICAL, task, 0, kVMPressureCritical) == TRUE) { + found_candidate = TRUE; + } + } + } + } else { + if (kn_max->kn_sfflags & NOTE_MEMORYSTATUS_PRESSURE_NORMAL) { + task_clear_has_been_notified(task, kVMPressureWarning); + task_clear_has_been_notified(task, kVMPressureCritical); + + found_candidate = TRUE; + } + } + + if (found_candidate == FALSE) { + proc_rele(target_proc); + memorystatus_klist_unlock(); + continue; + } + + SLIST_FOREACH_SAFE(kn_cur, &memorystatus_klist, kn_selnext, kn_temp) { + int knote_pressure_level = convert_internal_pressure_level_to_dispatch_level(level_snapshot); + + if (is_knote_registered_modify_task_pressure_bits(kn_cur, knote_pressure_level, task, 0, level_snapshot) == TRUE) { + proc_t knote_proc = knote_get_kq(kn_cur)->kq_p; + pid_t knote_pid = knote_proc->p_pid; + if (knote_pid == target_pid) { + KNOTE_DETACH(&memorystatus_klist, kn_cur); + KNOTE_ATTACH(&dispatch_klist, kn_cur); + } + } + } + + KNOTE(&dispatch_klist, (level_snapshot != kVMPressureNormal) ? kMemorystatusPressure : kMemorystatusNoPressure); + + SLIST_FOREACH_SAFE(kn_cur, &dispatch_klist, kn_selnext, kn_temp) { + KNOTE_DETACH(&dispatch_klist, kn_cur); + KNOTE_ATTACH(&memorystatus_klist, kn_cur); + } + + memorystatus_klist_unlock(); + + microuptime(&target_proc->vm_pressure_last_notify_tstamp); + proc_rele(target_proc); + + if (memorystatus_manual_testing_on == TRUE && target_foreground_process == TRUE) { + break; + } + + if (memorystatus_manual_testing_on == TRUE) { + /* + * Testing out the pressure notification scheme. + * No need for delays etc. + */ + } else { + uint32_t sleep_interval = INTER_NOTIFICATION_DELAY; +#if CONFIG_JETSAM + unsigned int page_delta = 0; + unsigned int skip_delay_page_threshold = 0; + + assert(memorystatus_available_pages_pressure >= memorystatus_available_pages_critical_base); + + page_delta = (memorystatus_available_pages_pressure - memorystatus_available_pages_critical_base) / 2; + skip_delay_page_threshold = memorystatus_available_pages_pressure - page_delta; + + if (memorystatus_available_pages <= skip_delay_page_threshold) { + /* + * We are nearing the critcal mark fast and can't afford to wait between + * notifications. + */ + sleep_interval = 0; + } +#endif /* CONFIG_JETSAM */ + + if (sleep_interval) { + delay(sleep_interval); + } + } + } + + return KERN_SUCCESS; +} + +static uint32_t +convert_internal_pressure_level_to_dispatch_level(vm_pressure_level_t internal_pressure_level) +{ + uint32_t dispatch_level = NOTE_MEMORYSTATUS_PRESSURE_NORMAL; + + switch (internal_pressure_level) { + case kVMPressureNormal: + { + dispatch_level = NOTE_MEMORYSTATUS_PRESSURE_NORMAL; + break; + } + + case kVMPressureWarning: + case kVMPressureUrgent: + { + dispatch_level = NOTE_MEMORYSTATUS_PRESSURE_WARN; + break; + } + + case kVMPressureCritical: + { + dispatch_level = NOTE_MEMORYSTATUS_PRESSURE_CRITICAL; + break; + } + + default: + break; + } + + return dispatch_level; +} + +/* + * Notify any kexts that are waiting for notification that jetsam + * is approaching the foreground bands. They should use this notification + * to free cached memory. + */ +void +memorystatus_issue_fg_band_notify(void) +{ + uint64_t now; + + lck_mtx_lock(&memorystatus_jetsam_fg_band_lock); + absolutetime_to_nanoseconds(mach_absolute_time(), &now); + if (now - memorystatus_jetsam_fg_band_timestamp_ns < memorystatus_jetsam_fg_band_delay_ns) { + lck_mtx_unlock(&memorystatus_jetsam_fg_band_lock); + return; + } + + if (memorystatus_jetsam_fg_band_waiters > 0) { + thread_wakeup(&memorystatus_jetsam_fg_band_waiters); + memorystatus_jetsam_fg_band_waiters = 0; + memorystatus_jetsam_fg_band_timestamp_ns = now; + } + lck_mtx_unlock(&memorystatus_jetsam_fg_band_lock); + + /* Notify the buffer cache, file systems, etc. to jetison everything they can. */ + if (consider_buffer_cache_collect != NULL) { + (void)(*consider_buffer_cache_collect)(1); + } +} + + +/* + * Memorystatus notification debugging support + */ + +static int +sysctl_memorystatus_vm_pressure_level SYSCTL_HANDLER_ARGS +{ +#pragma unused(arg1, arg2, oidp) +#if CONFIG_EMBEDDED + int error = 0; + + error = priv_check_cred(kauth_cred_get(), PRIV_VM_PRESSURE, 0); + if (error) { + return error; + } + +#endif /* CONFIG_EMBEDDED */ + uint32_t dispatch_level = convert_internal_pressure_level_to_dispatch_level(memorystatus_vm_pressure_level); + + return SYSCTL_OUT(req, &dispatch_level, sizeof(dispatch_level)); +} + +#if DEBUG || DEVELOPMENT + +SYSCTL_PROC(_kern, OID_AUTO, memorystatus_vm_pressure_level, CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_LOCKED, + 0, 0, &sysctl_memorystatus_vm_pressure_level, "I", ""); + +#else /* DEBUG || DEVELOPMENT */ + +SYSCTL_PROC(_kern, OID_AUTO, memorystatus_vm_pressure_level, CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_LOCKED | CTLFLAG_MASKED, + 0, 0, &sysctl_memorystatus_vm_pressure_level, "I", ""); + +#endif /* DEBUG || DEVELOPMENT */ + +/* + * Trigger levels to test the mechanism. + * Can be used via a sysctl. + */ +#define TEST_LOW_MEMORY_TRIGGER_ONE 1 +#define TEST_LOW_MEMORY_TRIGGER_ALL 2 +#define TEST_PURGEABLE_TRIGGER_ONE 3 +#define TEST_PURGEABLE_TRIGGER_ALL 4 +#define TEST_LOW_MEMORY_PURGEABLE_TRIGGER_ONE 5 +#define TEST_LOW_MEMORY_PURGEABLE_TRIGGER_ALL 6 + +static int +sysctl_memorypressure_manual_trigger SYSCTL_HANDLER_ARGS +{ +#pragma unused(arg1, arg2) + + int level = 0; + int error = 0; + int pressure_level = 0; + int trigger_request = 0; + int force_purge; + + error = sysctl_handle_int(oidp, &level, 0, req); + if (error || !req->newptr) { + return error; + } + + memorystatus_manual_testing_on = TRUE; + + trigger_request = (level >> 16) & 0xFFFF; + pressure_level = (level & 0xFFFF); + + if (trigger_request < TEST_LOW_MEMORY_TRIGGER_ONE || + trigger_request > TEST_LOW_MEMORY_PURGEABLE_TRIGGER_ALL) { + return EINVAL; + } + switch (pressure_level) { + case NOTE_MEMORYSTATUS_PRESSURE_NORMAL: + case NOTE_MEMORYSTATUS_PRESSURE_WARN: + case NOTE_MEMORYSTATUS_PRESSURE_CRITICAL: + break; + default: + return EINVAL; + } + + /* + * The pressure level is being set from user-space. + * And user-space uses the constants in sys/event.h + * So we translate those events to our internal levels here. + */ + if (pressure_level == NOTE_MEMORYSTATUS_PRESSURE_NORMAL) { + memorystatus_manual_testing_level = kVMPressureNormal; + force_purge = 0; + } else if (pressure_level == NOTE_MEMORYSTATUS_PRESSURE_WARN) { + memorystatus_manual_testing_level = kVMPressureWarning; + force_purge = vm_pageout_state.memorystatus_purge_on_warning; + } else if (pressure_level == NOTE_MEMORYSTATUS_PRESSURE_CRITICAL) { + memorystatus_manual_testing_level = kVMPressureCritical; + force_purge = vm_pageout_state.memorystatus_purge_on_critical; + } + + memorystatus_vm_pressure_level = memorystatus_manual_testing_level; + + /* purge according to the new pressure level */ + switch (trigger_request) { + case TEST_PURGEABLE_TRIGGER_ONE: + case TEST_LOW_MEMORY_PURGEABLE_TRIGGER_ONE: + if (force_purge == 0) { + /* no purging requested */ + break; + } + vm_purgeable_object_purge_one_unlocked(force_purge); + break; + case TEST_PURGEABLE_TRIGGER_ALL: + case TEST_LOW_MEMORY_PURGEABLE_TRIGGER_ALL: + if (force_purge == 0) { + /* no purging requested */ + break; + } + while (vm_purgeable_object_purge_one_unlocked(force_purge)) { + ; + } + break; + } + + if ((trigger_request == TEST_LOW_MEMORY_TRIGGER_ONE) || + (trigger_request == TEST_LOW_MEMORY_PURGEABLE_TRIGGER_ONE)) { + memorystatus_update_vm_pressure(TRUE); + } + + if ((trigger_request == TEST_LOW_MEMORY_TRIGGER_ALL) || + (trigger_request == TEST_LOW_MEMORY_PURGEABLE_TRIGGER_ALL)) { + while (memorystatus_update_vm_pressure(FALSE) == KERN_SUCCESS) { + continue; + } + } + + if (pressure_level == NOTE_MEMORYSTATUS_PRESSURE_NORMAL) { + memorystatus_manual_testing_on = FALSE; + } + + return 0; +} + +SYSCTL_PROC(_kern, OID_AUTO, memorypressure_manual_trigger, CTLTYPE_INT | CTLFLAG_WR | CTLFLAG_LOCKED | CTLFLAG_MASKED, + 0, 0, &sysctl_memorypressure_manual_trigger, "I", ""); + + +SYSCTL_INT(_kern, OID_AUTO, memorystatus_purge_on_warning, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_pageout_state.memorystatus_purge_on_warning, 0, ""); +SYSCTL_INT(_kern, OID_AUTO, memorystatus_purge_on_urgent, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &vm_pageout_state.memorystatus_purge_on_urgent, 0, ""); +SYSCTL_INT(_kern, OID_AUTO, memorystatus_purge_on_critical, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &vm_pageout_state.memorystatus_purge_on_critical, 0, ""); + +#if DEBUG || DEVELOPMENT +SYSCTL_UINT(_kern, OID_AUTO, memorystatus_vm_pressure_events_enabled, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_pressure_events_enabled, 0, ""); + +#if 0 +#if CONFIG_JETSAM && VM_PRESSURE_EVENTS +static boolean_t +memorystatus_issue_pressure_kevent(boolean_t pressured) +{ + memorystatus_klist_lock(); + KNOTE(&memorystatus_klist, pressured ? kMemorystatusPressure : kMemorystatusNoPressure); + memorystatus_klist_unlock(); + return TRUE; +} +#endif /* CONFIG_JETSAM && VM_PRESSURE_EVENTS */ +#endif /* 0 */ + +/* + * This routine is used for targeted notifications regardless of system memory pressure + * and regardless of whether or not the process has already been notified. + * It bypasses and has no effect on the only-one-notification per soft-limit policy. + * + * "memnote" is the current user. + */ + +static int +sysctl_memorystatus_vm_pressure_send SYSCTL_HANDLER_ARGS +{ +#pragma unused(arg1, arg2) + /* Need to be root or have memorystatus entitlement */ + if (!kauth_cred_issuser(kauth_cred_get()) && !IOTaskHasEntitlement(current_task(), MEMORYSTATUS_ENTITLEMENT)) { + return EPERM; + } + + int error = 0, pid = 0; + struct knote *kn = NULL; + boolean_t found_knote = FALSE; + int fflags = 0; /* filter flags for EVFILT_MEMORYSTATUS */ + uint64_t value = 0; + + error = sysctl_handle_quad(oidp, &value, 0, req); + if (error || !req->newptr) { + return error; + } + + /* + * Find the pid in the low 32 bits of value passed in. + */ + pid = (int)(value & 0xFFFFFFFF); + + /* + * Find notification in the high 32 bits of the value passed in. + */ + fflags = (int)((value >> 32) & 0xFFFFFFFF); + + /* + * For backwards compatibility, when no notification is + * passed in, default to the NOTE_MEMORYSTATUS_PRESSURE_WARN + */ + if (fflags == 0) { + fflags = NOTE_MEMORYSTATUS_PRESSURE_WARN; + // printf("memorystatus_vm_pressure_send: using default notification [0x%x]\n", fflags); + } + + /* wake up everybody waiting for kVMPressureJetsam */ + if (fflags == NOTE_MEMORYSTATUS_JETSAM_FG_BAND) { + memorystatus_issue_fg_band_notify(); + return error; + } + + /* + * See event.h ... fflags for EVFILT_MEMORYSTATUS + */ + if (!((fflags == NOTE_MEMORYSTATUS_PRESSURE_NORMAL) || + (fflags == NOTE_MEMORYSTATUS_PRESSURE_WARN) || + (fflags == NOTE_MEMORYSTATUS_PRESSURE_CRITICAL) || + (fflags == NOTE_MEMORYSTATUS_LOW_SWAP) || + (fflags == NOTE_MEMORYSTATUS_PROC_LIMIT_WARN) || + (fflags == NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL) || + (((fflags & NOTE_MEMORYSTATUS_MSL_STATUS) != 0 && + ((fflags & ~NOTE_MEMORYSTATUS_MSL_STATUS) == 0))))) { + printf("memorystatus_vm_pressure_send: notification [0x%x] not supported \n", fflags); + error = 1; + return error; + } + + /* + * Forcibly send pid a memorystatus notification. + */ + + memorystatus_klist_lock(); + + SLIST_FOREACH(kn, &memorystatus_klist, kn_selnext) { + proc_t knote_proc = knote_get_kq(kn)->kq_p; + pid_t knote_pid = knote_proc->p_pid; + + if (knote_pid == pid) { + /* + * Forcibly send this pid a memorystatus notification. + */ + kn->kn_fflags = fflags; + found_knote = TRUE; + } + } + + if (found_knote) { + KNOTE(&memorystatus_klist, 0); + printf("memorystatus_vm_pressure_send: (value 0x%llx) notification [0x%x] sent to process [%d] \n", value, fflags, pid); + error = 0; + } else { + printf("memorystatus_vm_pressure_send: (value 0x%llx) notification [0x%x] not sent to process [%d] (none registered?)\n", value, fflags, pid); + error = 1; + } + + memorystatus_klist_unlock(); + + return error; +} + +SYSCTL_PROC(_kern, OID_AUTO, memorystatus_vm_pressure_send, CTLTYPE_QUAD | CTLFLAG_WR | CTLFLAG_LOCKED | CTLFLAG_MASKED | CTLFLAG_ANYBODY, + 0, 0, &sysctl_memorystatus_vm_pressure_send, "Q", ""); + +#endif /* DEBUG || DEVELOPMENT */ + +#endif /* VM_PRESSURE_EVENTS */ diff --git a/bsd/kern/kern_mib.c b/bsd/kern/kern_mib.c index dd0cc669b..667b17a7d 100644 --- a/bsd/kern/kern_mib.c +++ b/bsd/kern/kern_mib.c @@ -101,6 +101,7 @@ #include #include #include +#include extern vm_map_t bsd_pageable_map; @@ -135,6 +136,16 @@ static int cputype, cpusubtype, cputhreadtype, cpufamily, cpu64bit; static uint64_t cacheconfig[10], cachesize[10]; static int packages; +static char * osenvironment; +static uint32_t osenvironment_size = 0; +static uint32_t ephemeral_storage = 0; +static uint32_t use_recovery_securityd = 0; + +static struct { + uint32_t ephemeral_storage:1; + uint32_t use_recovery_securityd:1; +} property_existence = {0, 0}; + SYSCTL_NODE(, 0, sysctl, CTLFLAG_RW | CTLFLAG_LOCKED, 0, "Sysctl internal magic"); SYSCTL_NODE(, CTL_KERN, kern, CTLFLAG_RW | CTLFLAG_LOCKED, 0, @@ -163,8 +174,8 @@ SYSCTL_NODE(_kern, OID_AUTO, bridge, CTLFLAG_RW | CTLFLAG_LOCKED, 0, * hw.* MIB */ -#define CTLHW_RETQUAD (1 << 31) -#define CTLHW_LOCAL (1 << 30) +#define CTLHW_RETQUAD (1U << 31) +#define CTLHW_LOCAL (1U << 30) #define HW_LOCAL_CPUTHREADTYPE (1 | CTLHW_LOCAL) #define HW_LOCAL_PHYSICALCPU (2 | CTLHW_LOCAL) @@ -366,6 +377,83 @@ sysctl_tbfrequency return sysctl_io_number(req, l, sizeof(l), NULL, NULL); } +/* + * Create sysctl entries coming from device tree. + * + * Entries from device tree are loaded here because DTLookupEntry() only works before + * PE_init_iokit(). Doing this also avoids the extern-C hackery to access these entries + * from IORegistry (which requires C++). + */ +void +sysctl_load_devicetree_entries(void) +{ + DTEntry chosen; + void *value; + unsigned int size; + + if (kSuccess != DTLookupEntry(0, "/chosen", &chosen)) { + return; + } + + /* load osenvironment */ + if (kSuccess == DTGetProperty(chosen, "osenvironment", (void **) &value, &size)) { + MALLOC(osenvironment, char *, size, M_TEMP, M_WAITOK); + if (osenvironment) { + memcpy(osenvironment, value, size); + osenvironment_size = size; + } + } + + /* load ephemeral_storage */ + if (kSuccess == DTGetProperty(chosen, "ephemeral-storage", (void **) &value, &size)) { + if (size == sizeof(uint32_t)) { + ephemeral_storage = *(uint32_t *)value; + property_existence.ephemeral_storage = 1; + } + } + + /* load use_recovery_securityd */ + if (kSuccess == DTGetProperty(chosen, "use-recovery-securityd", (void **) &value, &size)) { + if (size == sizeof(uint32_t)) { + use_recovery_securityd = *(uint32_t *)value; + property_existence.use_recovery_securityd = 1; + } + } +} + +static int +sysctl_osenvironment +(__unused struct sysctl_oid *oidp, __unused void *arg1, __unused int arg2, struct sysctl_req *req) +{ + if (osenvironment_size > 0) { + return SYSCTL_OUT(req, osenvironment, osenvironment_size); + } else { + return EINVAL; + } +} + +static int +sysctl_ephemeral_storage +(__unused struct sysctl_oid *oidp, __unused void *arg1, __unused int arg2, struct sysctl_req *req) +{ + if (property_existence.ephemeral_storage) { + return SYSCTL_OUT(req, &ephemeral_storage, sizeof(ephemeral_storage)); + } else { + return EINVAL; + } +} + +static int +sysctl_use_recovery_securityd +(__unused struct sysctl_oid *oidp, __unused void *arg1, __unused int arg2, struct sysctl_req *req) +{ + if (property_existence.use_recovery_securityd) { + return SYSCTL_OUT(req, &use_recovery_securityd, sizeof(use_recovery_securityd)); + } else { + return EINVAL; + } +} + /* * hw.* MIB variables. */ @@ -409,6 +497,9 @@ SYSCTL_QUAD(_hw, OID_AUTO, fixfrequency, CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LOC SYSCTL_PROC(_hw, OID_AUTO, tbfrequency, CTLTYPE_QUAD | CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LOCKED, 0, 0, sysctl_tbfrequency, "Q", ""); SYSCTL_QUAD(_hw, HW_MEMSIZE, memsize, CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LOCKED, &max_mem, ""); SYSCTL_INT(_hw, OID_AUTO, packages, CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LOCKED, &packages, 0, ""); +SYSCTL_PROC(_hw, OID_AUTO, osenvironment, CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LOCKED, 0, 0, sysctl_osenvironment, "A", ""); +SYSCTL_PROC(_hw, OID_AUTO, ephemeral_storage, CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LOCKED, 0, 0, sysctl_ephemeral_storage, "I", ""); +SYSCTL_PROC(_hw, OID_AUTO, use_recovery_securityd, CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LOCKED, 0, 0, sysctl_use_recovery_securityd, "I", ""); /* * Optional CPU features can register nodes below hw.optional. @@ -512,6 +603,7 @@ int gNeonHpfp = -1; int gNeonFp16 = -1; int gARMv81Atomics = 0; int gARMv8Crc32 = 0; +int gARMv82FHM = 0; #if defined (__arm__) int arm64_flag = 0; @@ -528,6 +620,7 @@ SYSCTL_INT(_hw_optional, OID_AUTO, neon_hpfp, CTLFLAG_RD | CTLFLAG_KERN | CTLFLA SYSCTL_INT(_hw_optional, OID_AUTO, neon_fp16, CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LOCKED, &gNeonFp16, 0, ""); SYSCTL_INT(_hw_optional, OID_AUTO, armv8_1_atomics, CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LOCKED, &gARMv81Atomics, 0, ""); SYSCTL_INT(_hw_optional, OID_AUTO, armv8_crc32, CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LOCKED, &gARMv8Crc32, 0, ""); +SYSCTL_INT(_hw_optional, OID_AUTO, armv8_2_fhm, CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LOCKED, &gARMv82FHM, 0, ""); /* * Without this little ifdef dance, the preprocessor replaces "arm64" with "1", @@ -627,6 +720,7 @@ sysctl_mib_init(void) cachesize[4] = 0; packages = 1; + #else #error unknown architecture #endif /* !__i386__ && !__x86_64 && !__arm__ && !__arm64__ */ diff --git a/bsd/kern/kern_mman.c b/bsd/kern/kern_mman.c index 29853fd43..0fd0cc336 100644 --- a/bsd/kern/kern_mman.c +++ b/bsd/kern/kern_mman.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2007 Apple Inc. All Rights Reserved. + * Copyright (c) 2007-2019 Apple Inc. All Rights Reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -131,6 +131,13 @@ #if CONFIG_MACF #include #endif +#include + +#ifndef CONFIG_EMBEDDED +#include /* for IOTaskHasEntitlement */ +#include /* for csr_check */ +#define MAP_32BIT_ENTITLEMENT "com.apple.security.mmap-map-32bit" +#endif /* * XXX Internally, we use VM_PROT_* somewhat interchangeably, but the correct @@ -151,6 +158,7 @@ mmap(proc_t p, struct mmap_args *uap, user_addr_t *retval) vm_map_t user_map; kern_return_t result; vm_map_offset_t user_addr; + vm_map_offset_t sum; vm_map_size_t user_size; vm_object_offset_t pageoff; vm_object_offset_t file_pos; @@ -183,6 +191,9 @@ mmap(proc_t p, struct mmap_args *uap, user_addr_t *retval) AUDIT_ARG(len, user_size); AUDIT_ARG(fd, uap->fd); + if (vm_map_range_overflows(user_addr, user_size)) { + return EINVAL; + } prot = (uap->prot & VM_PROT_ALL); #if 3777787 /* @@ -200,7 +211,7 @@ mmap(proc_t p, struct mmap_args *uap, user_addr_t *retval) vp = NULLVP; /* - * The vm code does not have prototypes & compiler doesn't do the' + * The vm code does not have prototypes & compiler doesn't do * the right thing when you cast 64bit value and pass it in function * call. So here it is. */ @@ -208,7 +219,7 @@ mmap(proc_t p, struct mmap_args *uap, user_addr_t *retval) /* make sure mapping fits into numeric range etc */ - if (file_pos + user_size > (vm_object_offset_t)-PAGE_SIZE_64) { + if (os_add3_overflow(file_pos, user_size, PAGE_SIZE_64 - 1, &sum)) { return EINVAL; } @@ -241,10 +252,31 @@ mmap(proc_t p, struct mmap_args *uap, user_addr_t *retval) (flags & MAP_JIT)) { return EINVAL; } + } + if (flags & MAP_RESILIENT_CODESIGN) { if (prot & (VM_PROT_WRITE | VM_PROT_EXECUTE)) { return EPERM; } } + if (flags & MAP_SHARED) { + /* + * MAP_RESILIENT_MEDIA is not valid with MAP_SHARED because + * there is no place to inject zero-filled pages without + * actually adding them to the file. + * Since we didn't reject that combination before, there might + * already be callers using it and getting a valid MAP_SHARED + * mapping but without the resilience. + * For backwards compatibility's sake, let's keep ignoring + * MAP_RESILIENT_MEDIA in that case. + */ + flags &= ~MAP_RESILIENT_MEDIA; + } + if (flags & MAP_RESILIENT_MEDIA) { + if ((flags & MAP_ANON) || + (flags & MAP_SHARED)) { + return EINVAL; + } + } /* * Check for illegal addresses. Watch out for address wrap... Note @@ -450,7 +482,21 @@ mmap(proc_t p, struct mmap_args *uap, user_addr_t *retval) goto bad; } #endif /* MAC */ + /* + * Consult the file system to determine if this + * particular file object can be mapped. + */ + error = VNOP_MMAP_CHECK(vp, prot, ctx); + if (error) { + (void)vnode_put(vp); + goto bad; + } } + + /* + * No copy-on-read for mmap() mappings themselves. + */ + vmk_flags.vmkf_no_copy_on_read = 1; } if (user_size == 0) { @@ -514,6 +560,21 @@ mmap(proc_t p, struct mmap_args *uap, user_addr_t *retval) if (flags & MAP_RESILIENT_CODESIGN) { alloc_flags |= VM_FLAGS_RESILIENT_CODESIGN; } + if (flags & MAP_RESILIENT_MEDIA) { + alloc_flags |= VM_FLAGS_RESILIENT_MEDIA; + } + +#ifndef CONFIG_EMBEDDED + if (flags & MAP_32BIT) { + if (csr_check(CSR_ALLOW_UNTRUSTED_KEXTS) == 0 || + IOTaskHasEntitlement(current_task(), MAP_32BIT_ENTITLEMENT)) { + vmk_flags.vmkf_32bit_map_va = TRUE; + } else { + error = EPERM; + goto bad; + } + } +#endif /* * Lookup/allocate object. @@ -616,8 +677,7 @@ map_anon_retry: #endif /* radar 3777787 */ map_file_retry: - if ((flags & MAP_RESILIENT_CODESIGN) || - (flags & MAP_RESILIENT_MEDIA)) { + if (flags & MAP_RESILIENT_CODESIGN) { if (prot & (VM_PROT_WRITE | VM_PROT_EXECUTE)) { assert(!mapanon); vnode_put(vp); @@ -716,10 +776,13 @@ msync_nocancel(__unused proc_t p, struct msync_nocancel_args *uap, __unused int3 user_map = current_map(); addr = (mach_vm_offset_t) uap->addr; - size = (mach_vm_size_t)uap->len; + size = (mach_vm_size_t) uap->len; #ifndef CONFIG_EMBEDDED KERNEL_DEBUG_CONSTANT((BSDDBG_CODE(DBG_BSD_SC_EXTENDED_INFO, SYS_msync) | DBG_FUNC_NONE), (uint32_t)(addr >> 32), (uint32_t)(size >> 32), 0, 0, 0); #endif + if (mach_vm_range_overflows(addr, size)) { + return EINVAL; + } if (addr & vm_map_page_mask(user_map)) { /* UNIX SPEC: user address is not page-aligned, return EINVAL */ return EINVAL; @@ -797,7 +860,7 @@ munmap(__unused proc_t p, struct munmap_args *uap, __unused int32_t *retval) return EINVAL; } - if (user_addr + user_size < user_addr) { + if (mach_vm_range_overflows(user_addr, user_size)) { return EINVAL; } @@ -834,6 +897,9 @@ mprotect(__unused proc_t p, struct mprotect_args *uap, __unused int32_t *retval) user_size = (mach_vm_size_t) uap->len; prot = (vm_prot_t)(uap->prot & (VM_PROT_ALL | VM_PROT_TRUSTED | VM_PROT_STRIP_READ)); + if (mach_vm_range_overflows(user_addr, user_size)) { + return EINVAL; + } if (user_addr & vm_map_page_mask(user_map)) { /* UNIX SPEC: user address is not page-aligned, return EINVAL */ return EINVAL; @@ -939,7 +1005,9 @@ minherit(__unused proc_t p, struct minherit_args *uap, __unused int32_t *retval) addr = (mach_vm_offset_t)uap->addr; size = (mach_vm_size_t)uap->len; inherit = uap->inherit; - + if (mach_vm_range_overflows(addr, size)) { + return EINVAL; + } user_map = current_map(); result = mach_vm_inherit(user_map, addr, size, inherit); @@ -1009,7 +1077,9 @@ madvise(__unused proc_t p, struct madvise_args *uap, __unused int32_t *retval) start = (mach_vm_offset_t) uap->addr; size = (mach_vm_size_t) uap->len; - + if (mach_vm_range_overflows(start, size)) { + return EINVAL; + } #if __arm64__ if (start == 0 && size != 0 && @@ -1203,8 +1273,7 @@ mlock(__unused proc_t p, struct mlock_args *uap, __unused int32_t *retvalval) addr = (vm_map_offset_t) uap->addr; size = (vm_map_size_t)uap->len; - /* disable wrap around */ - if (addr + size < addr) { + if (vm_map_range_overflows(addr, size)) { return EINVAL; } @@ -1240,12 +1309,14 @@ munlock(__unused proc_t p, struct munlock_args *uap, __unused int32_t *retval) kern_return_t result; AUDIT_ARG(addr, uap->addr); - AUDIT_ARG(addr, uap->len); + AUDIT_ARG(len, uap->len); addr = (mach_vm_offset_t) uap->addr; size = (mach_vm_size_t)uap->len; user_map = current_map(); - + if (mach_vm_range_overflows(addr, size)) { + return EINVAL; + } /* JMM - need to remove all wirings by spec - this just removes one */ result = mach_vm_wire_kernel(host_priv_self(), user_map, addr, size, VM_PROT_NONE, VM_KERN_MEMORY_MLOCK); return result == KERN_SUCCESS ? 0 : ENOMEM; @@ -1295,6 +1366,9 @@ mremap_encrypted(__unused struct proc *p, struct mremap_encrypted_args *uap, __u cputype = uap->cputype; cpusubtype = uap->cpusubtype; + if (mach_vm_range_overflows(user_addr, user_size)) { + return EINVAL; + } if (user_addr & vm_map_page_mask(user_map)) { /* UNIX SPEC: user address is not page-aligned, return EINVAL */ return EINVAL; diff --git a/bsd/kern/kern_newsysctl.c b/bsd/kern/kern_newsysctl.c index 746752d34..07cd0e082 100644 --- a/bsd/kern/kern_newsysctl.c +++ b/bsd/kern/kern_newsysctl.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2013 Apple Inc. All rights reserved. + * Copyright (c) 2000-2019 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -84,6 +84,9 @@ #include #endif +#if defined(HAS_APPLE_PAC) +#include +#endif /* defined(HAS_APPLE_PAC) */ lck_grp_t * sysctl_lock_group = NULL; lck_rw_t * sysctl_geometry_lock = NULL; @@ -209,13 +212,43 @@ sysctl_register_oid(struct sysctl_oid *new_oidp) } } +#if defined(HAS_APPLE_PAC) + if (oidp->oid_handler) { + /* + * Dereference function-pointer-signed oid_handler to prevent an + * attacker with the ability to observe the result of the + * auth_and_resign below from trying all possible inputs until an auth + * succeeds. + */ + if (__builtin_expect(!*(uintptr_t*)ptrauth_auth_data((void*) + oidp->oid_handler, ptrauth_key_function_pointer, 0), 0)) { + /* + * This is necessary to force the dereference but will never + * actually be reached, dereferencing an invalidly signed pointer + * will trap before getting here (and the codegen is nicer than + * with a panic). + */ + __builtin_trap(); + } + /* + * Sign oid_handler address-discriminated upon installation to make it + * harder to replace with an arbitrary function pointer. + */ + oidp->oid_handler = ptrauth_auth_and_resign(oidp->oid_handler, + ptrauth_key_function_pointer, 0, ptrauth_key_function_pointer, + ptrauth_blend_discriminator(&oidp->oid_handler, + ptrauth_string_discriminator("oid_handler"))); + } +#endif /* defined(HAS_APPLE_PAC) */ /* * Insert the oid into the parent's list in order. */ q = NULL; SLIST_FOREACH(p, parent, oid_link) { - if (oidp->oid_number < p->oid_number) { + if (oidp->oid_number == p->oid_number) { + panic("attempting to register a sysctl at previously registered slot : %d", oidp->oid_number); + } else if (oidp->oid_number < p->oid_number) { break; } q = p; @@ -269,6 +302,34 @@ sysctl_unregister_oid(struct sysctl_oid *oidp) } } +#if defined(HAS_APPLE_PAC) + if (removed_oidp && removed_oidp->oid_handler && old_oidp == NULL) { + /* + * Revert address-discriminated signing performed by + * sysctl_register_oid() (in case this oid is registered again). + */ + removed_oidp->oid_handler = ptrauth_auth_function(removed_oidp->oid_handler, + ptrauth_key_function_pointer, + ptrauth_blend_discriminator(&removed_oidp->oid_handler, + ptrauth_string_discriminator("oid_handler"))); + /* + * Dereference the function-pointer-signed result to prevent an + * attacker with the ability to observe the result of the + * auth_and_resign above from trying all possible inputs until an auth + * succeeds. + */ + if (__builtin_expect(!*(uintptr_t*)ptrauth_auth_data((void*) + removed_oidp->oid_handler, ptrauth_key_function_pointer, 0), 0)) { + /* + * This is necessary to force the dereference but will never + * actually be reached, dereferencing an invalidly signed pointer + * will trap before getting here (and the codegen is nicer than + * with a panic). + */ + __builtin_trap(); + } + } +#endif /* defined(HAS_APPLE_PAC) */ /* * We've removed it from the list at this point, but we don't want @@ -349,6 +410,7 @@ sysctl_early_init(void) sysctl_unlocked_node_lock = lck_mtx_alloc_init(sysctl_lock_group, NULL); sysctl_register_set("__sysctl_set"); + sysctl_load_devicetree_entries(); } /* @@ -441,10 +503,10 @@ sysctl_io_string(struct sysctl_req *req, char *pValue, size_t valueSize, int tru * returned string to the buffer size. This preserves the semantics * of some library routines implemented via sysctl, which truncate * their returned data, rather than simply returning an error. The - * returned string is always NUL terminated. */ + * returned string is always nul (ascii '\0') terminated. */ error = SYSCTL_OUT(req, pValue, req->oldlen - 1); if (!error) { - char c = 0; + char c = '\0'; error = SYSCTL_OUT(req, &c, 1); } } else { @@ -467,7 +529,7 @@ sysctl_io_string(struct sysctl_req *req, char *pValue, size_t valueSize, int tru return EINVAL; } - /* copy the string in and force NUL termination */ + /* copy the string in and force nul termination */ error = SYSCTL_IN(req, pValue, req->newlen); pValue[req->newlen] = '\0'; @@ -1589,6 +1651,15 @@ found: lck_mtx_lock(sysctl_unlocked_node_lock); } +#if defined(HAS_APPLE_PAC) + /* + * oid_handler is signed address-discriminated by sysctl_register_oid(). + */ + oid_handler = ptrauth_auth_function(oid_handler, + ptrauth_key_function_pointer, + ptrauth_blend_discriminator(&oid->oid_handler, + ptrauth_string_discriminator("oid_handler"))); +#endif /* defined(HAS_APPLE_PAC) */ if ((oid->oid_kind & CTLTYPE) == CTLTYPE_NODE) { i = oid_handler(oid, name + indx, namelen - indx, req); @@ -1656,7 +1727,7 @@ sysctl_create_user_req(struct sysctl_req *req, struct proc *p, user_addr_t oldp, int sysctl(proc_t p, struct sysctl_args *uap, __unused int32_t *retval) { - int error; + int error, new_error; size_t oldlen = 0, newlen; int name[CTL_MAXNAME]; struct sysctl_req req; @@ -1721,16 +1792,25 @@ sysctl(proc_t p, struct sysctl_args *uap, __unused int32_t *retval) err: if (uap->oldlenp != USER_ADDR_NULL) { - error = suulong(uap->oldlenp, oldlen); + /* + * Only overwrite the old error value on a new error + */ + new_error = suulong(uap->oldlenp, oldlen); + + if (new_error) { + error = new_error; + } } return error; } +// sysctlbyname is also exported as KPI to kexts +// and the syscall name cannot conflict with it int -sysctlbyname(proc_t p, struct sysctlbyname_args *uap, __unused int32_t *retval) +sys_sysctlbyname(proc_t p, struct sysctlbyname_args *uap, __unused int32_t *retval) { - int error; + int error, new_error; size_t oldlen = 0, newlen; char *name; size_t namelen = 0; @@ -1788,7 +1868,14 @@ sysctlbyname(proc_t p, struct sysctlbyname_args *uap, __unused int32_t *retval) } if (uap->oldlenp != USER_ADDR_NULL) { - error = suulong(uap->oldlenp, oldlen); + /* + * Only overwrite the old error value on a new error + */ + new_error = suulong(uap->oldlenp, oldlen); + + if (new_error) { + error = new_error; + } } return error; diff --git a/bsd/kern/kern_ntptime.c b/bsd/kern/kern_ntptime.c index 589ff9d97..2ad397e74 100644 --- a/bsd/kern/kern_ntptime.c +++ b/bsd/kern/kern_ntptime.c @@ -334,7 +334,7 @@ ntp_adjtime(struct proc *p, struct ntp_adjtime_args *uap, int32_t *retval) return error; } -#if DEVELOPEMNT || DEBUG +#if DEVELOPMENT || DEBUG if (g_should_log_clock_adjustments) { os_log(OS_LOG_DEFAULT, "%s: BEFORE modes %u offset %ld freq %ld status %d constant %ld time_adjtime %lld\n", __func__, ntv.modes, ntv.offset, ntv.freq, ntv.status, ntv.constant, time_adjtime); @@ -438,7 +438,7 @@ ntp_adjtime(struct proc *p, struct ntp_adjtime_args *uap, int32_t *retval) ret = ntp_is_time_error(time_status) ? TIME_ERROR : time_state; -#if DEVELOPEMNT || DEBUG +#if DEVELOPMENT || DEBUG if (g_should_log_clock_adjustments) { os_log(OS_LOG_DEFAULT, "%s: AFTER modes %u offset %lld freq %lld status %d constant %ld time_adjtime %lld\n", __func__, modes, time_offset, time_freq, time_status, time_constant, time_adjtime); @@ -572,7 +572,7 @@ ntp_update_second(int64_t *adjustment, clock_sec_t secs) updated = 0; } -#if DEVELOPEMNT || DEBUG +#if DEVELOPMENT || DEBUG if (g_should_log_clock_adjustments) { int64_t nano = (time_adj > 0)? time_adj >> 32 : -((-time_adj) >> 32); int64_t frac = (time_adj > 0)? ((uint32_t) time_adj) : -((uint32_t) (-time_adj)); @@ -675,7 +675,7 @@ kern_adjtime(struct timeval *delta) NTP_LOCK(enable); ltr = time_adjtime; time_adjtime = ltw; -#if DEVELOPEMNT || DEBUG +#if DEVELOPMENT || DEBUG if (g_should_log_clock_adjustments) { os_log(OS_LOG_DEFAULT, "%s:AFTER offset %lld freq %lld status %d constant %ld time_adjtime %lld\n", __func__, time_offset, time_freq, time_status, time_constant, time_adjtime); diff --git a/bsd/kern/kern_pcsamples.c b/bsd/kern/kern_pcsamples.c deleted file mode 100644 index 69694bc5b..000000000 --- a/bsd/kern/kern_pcsamples.c +++ /dev/null @@ -1,424 +0,0 @@ -/* - * Copyright (c) 2000-2012 Apple Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -vm_offset_t pc_buftomem = 0; -unsigned int * pc_buffer = 0; /* buffer that holds each pc */ -unsigned int * pc_bufptr = 0; -unsigned int * pc_buflast = 0; -unsigned int npcbufs = 8192; /* number of pc entries in buffer */ -unsigned int pc_bufsize = 0; -unsigned int pcsample_flags = 0; -unsigned int pcsample_enable = 0; - -pid_t pc_sample_pid = 0; -boolean_t pc_trace_frameworks = FALSE; - -char pcsample_comm[MAXCOMLEN + 1]; - -/* Set the default framework boundaries */ -unsigned int pcsample_beg = 0; -unsigned int pcsample_end = 0; - -static pid_t global_state_pid = -1; /* Used to control exclusive use of pc_buffer */ - -extern unsigned int pc_trace_buf[]; -extern int pc_trace_cnt; - -void add_pcbuffer(void); -int branch_tracing_enabled(void); -int disable_branch_tracing(void); -int enable_branch_tracing(void); -int pcsamples_bootstrap(void); -void pcsamples_clear(void); -int pcsamples_control(int *name, u_int namelen, user_addr_t where, size_t *sizep); -int pcsamples_read(user_addr_t buffer, size_t *number); -int pcsamples_reinit(void); - -int -enable_branch_tracing(void) -{ - struct proc *p; - if (-1 != pc_sample_pid) { - p = proc_find(pc_sample_pid); - if (p) { - p->p_btrace = 1; - proc_rele(p); - } - } else { - pc_trace_frameworks = TRUE; - } - - return 1; -} - -int -disable_branch_tracing(void) -{ - struct proc *p; - switch (pc_sample_pid) { - case -1: - pc_trace_frameworks = FALSE; - break; - case 0: - break; - default: - p = proc_find(pc_sample_pid); - if (p) { - p->p_btrace = 0; - proc_rele(p); - } - break; - } - clr_be_bit(); - return 1; -} - -/* - * this only works for the current proc as it - * is called from context_switch in the scheduler - */ -int -branch_tracing_enabled(void) -{ - struct proc *p = current_proc(); - if (TRUE == pc_trace_frameworks) { - return TRUE; - } - if (p) { - return p->p_btrace; - } - return 0; -} - - -void -add_pcbuffer(void) -{ - int i; - unsigned int pc; - - if (!pcsample_enable) { - return; - } - - for (i = 0; i < pc_trace_cnt; i++) { - pc = pc_trace_buf[i]; - - if ((pcsample_beg <= pc) && (pc < pcsample_end)) { - if (pc_bufptr > pc_buffer) { - if ((*(pc_bufptr - 1)) == pc) { - continue; /* Ignore, probably spinning */ - } - } - - /* Then the sample is in our range */ - *pc_bufptr = pc; - pc_bufptr++; - } - } - - /* We never wrap the buffer */ - if ((pc_bufptr + pc_trace_cnt) >= pc_buflast) { - pcsample_enable = 0; - (void)disable_branch_tracing(); - wakeup(&pcsample_enable); - } - return; -} - -int -pcsamples_bootstrap(void) -{ - if (!disable_branch_tracing()) { - return ENOTSUP; - } - - pc_bufsize = npcbufs * sizeof(*pc_buffer); - if (kmem_alloc(kernel_map, &pc_buftomem, - (vm_size_t)pc_bufsize) == KERN_SUCCESS) { - pc_buffer = (unsigned int *) pc_buftomem; - } else { - pc_buffer = NULL; - } - - if (pc_buffer) { - pc_bufptr = pc_buffer; - pc_buflast = &pc_bufptr[npcbufs]; - pcsample_enable = 0; - return 0; - } else { - pc_bufsize = 0; - return EINVAL; - } -} - -int -pcsamples_reinit(void) -{ - int ret = 0; - - pcsample_enable = 0; - - if (pc_bufsize && pc_buffer) { - kmem_free(kernel_map, (vm_offset_t)pc_buffer, pc_bufsize); - } - - ret = pcsamples_bootstrap(); - return ret; -} - -void -pcsamples_clear(void) -{ - /* Clean up the sample buffer, set defaults */ - global_state_pid = -1; - pcsample_enable = 0; - if (pc_bufsize && pc_buffer) { - kmem_free(kernel_map, (vm_offset_t)pc_buffer, pc_bufsize); - } - pc_buffer = NULL; - pc_bufptr = NULL; - pc_buflast = NULL; - pc_bufsize = 0; - pcsample_beg = 0; - pcsample_end = 0; - bzero((void *)pcsample_comm, sizeof(pcsample_comm)); - (void)disable_branch_tracing(); - pc_sample_pid = 0; - pc_trace_frameworks = FALSE; -} - -int -pcsamples_control(int *name, __unused u_int namelen, user_addr_t where, size_t *sizep) -{ - int ret = 0; - size_t size = *sizep; - int value = name[1]; - pcinfo_t pc_bufinfo = {}; - pid_t *pidcheck; - - pid_t curpid; - struct proc *p, *curproc; - - if (name[0] != PCSAMPLE_GETNUMBUF) { - curproc = current_proc(); - if (curproc) { - curpid = curproc->p_pid; - } else { - return ESRCH; - } - - if (global_state_pid == -1) { - global_state_pid = curpid; - } else if (global_state_pid != curpid) { - if ((p = proc_find(global_state_pid)) == NULL) { - /* The global pid no longer exists */ - global_state_pid = curpid; - } else { - proc_rele(p); - /* The global pid exists, deny this request */ - return EBUSY; - } - } - } - - - switch (name[0]) { - case PCSAMPLE_DISABLE: /* used to disable */ - pcsample_enable = 0; - break; - case PCSAMPLE_SETNUMBUF: - /* The buffer size is bounded by a min and max number of samples */ - if (value < pc_trace_cnt) { - ret = EINVAL; - break; - } - if (value <= MAX_PCSAMPLES) { - /* npcbufs = value & ~(PC_TRACE_CNT-1); */ - npcbufs = value; - } else { - npcbufs = MAX_PCSAMPLES; - } - break; - case PCSAMPLE_GETNUMBUF: - if (size < sizeof(pc_bufinfo)) { - ret = EINVAL; - break; - } - pc_bufinfo.npcbufs = npcbufs; - pc_bufinfo.bufsize = pc_bufsize; - pc_bufinfo.enable = pcsample_enable; - pc_bufinfo.pcsample_beg = pcsample_beg; - pc_bufinfo.pcsample_end = pcsample_end; - if (copyout(&pc_bufinfo, where, sizeof(pc_bufinfo))) { - ret = EINVAL; - } - break; - case PCSAMPLE_SETUP: - ret = pcsamples_reinit(); - break; - case PCSAMPLE_REMOVE: - pcsamples_clear(); - break; - case PCSAMPLE_READBUF: - /* A nonzero value says enable and wait on the buffer */ - /* A zero value says read up the buffer immediately */ - if (value == 0) { - /* Do not wait on the buffer */ - pcsample_enable = 0; - (void)disable_branch_tracing(); - ret = pcsamples_read(where, sizep); - break; - } else if ((pc_bufsize <= 0) || (!pc_buffer)) { - /* enable only if buffer is initialized */ - ret = EINVAL; - break; - } - - /* Turn on branch tracing */ - if (!enable_branch_tracing()) { - ret = ENOTSUP; - break; - } - - /* Enable sampling */ - pcsample_enable = 1; - - ret = tsleep(&pcsample_enable, PRIBIO | PCATCH, "pcsample", 0); - pcsample_enable = 0; - (void)disable_branch_tracing(); - - if (ret) { - /* Eventually fix this... if (ret != EINTR) */ - if (ret) { - /* On errors, except EINTR, we want to cleanup buffer ptrs */ - /* pc_bufptr = pc_buffer; */ - *sizep = 0; - } - } else { - /* The only way to get here is if the buffer is full */ - ret = pcsamples_read(where, sizep); - } - - break; - case PCSAMPLE_SETREG: - if (size < sizeof(pc_bufinfo)) { - ret = EINVAL; - break; - } - if (copyin(where, &pc_bufinfo, sizeof(pc_bufinfo))) { - ret = EINVAL; - break; - } - - pcsample_beg = pc_bufinfo.pcsample_beg; - pcsample_end = pc_bufinfo.pcsample_end; - break; - case PCSAMPLE_COMM: - if (!(sizeof(pcsample_comm) > size)) { - ret = EINVAL; - break; - } - bzero((void *)pcsample_comm, sizeof(pcsample_comm)); - if (copyin(where, pcsample_comm, size)) { - ret = EINVAL; - break; - } - - /* Check for command name or pid */ - if (pcsample_comm[0] != '\0') { - ret = ENOTSUP; - break; - } else { - if (size != (2 * sizeof(pid_t))) { - ret = EINVAL; - break; - } else { - pidcheck = (pid_t *)pcsample_comm; - pc_sample_pid = pidcheck[1]; - } - } - break; - default: - ret = ENOTSUP; - break; - } - return ret; -} - - -/* - * This buffer must be read up in one call. - * If the buffer isn't big enough to hold - * all the samples, it will copy up enough - * to fill the buffer and throw the rest away. - * This buffer never wraps. - */ -int -pcsamples_read(user_addr_t buffer, size_t *number) -{ - size_t count = 0; - size_t copycount; - - count = (*number) / sizeof(*pc_buffer); - - if (count && pc_bufsize && pc_buffer) { - copycount = pc_bufptr - pc_buffer; - - if (copycount <= 0) { - *number = 0; - return 0; - } - - if (copycount > count) { - copycount = count; - } - - /* We actually have data to send up */ - if (copyout(pc_buffer, buffer, copycount * sizeof(*pc_buffer))) { - *number = 0; - return EINVAL; - } - *number = copycount; - pc_bufptr = pc_buffer; - return 0; - } else { - *number = 0; - return 0; - } -} diff --git a/bsd/kern/kern_persona.c b/bsd/kern/kern_persona.c index c9c846717..7fc207026 100644 --- a/bsd/kern/kern_persona.c +++ b/bsd/kern/kern_persona.c @@ -28,12 +28,17 @@ #include #include #include +#include #if CONFIG_PERSONAS +#include + #include #include #include #include +#include +#include #include #include @@ -52,9 +57,6 @@ #define FIRST_PERSONA_ID 501 #define PERSONA_ID_STEP 10 -#define PERSONA_SYSTEM_UID ((uid_t)99) -#define PERSONA_SYSTEM_LOGIN "system" - #define PERSONA_ALLOC_TOKEN (0x7a0000ae) #define PERSONA_INIT_TOKEN (0x7500005e) #define PERSONA_MAGIC (0x0aa55aa0) @@ -65,8 +67,13 @@ static LIST_HEAD(personalist, persona) all_personas; static uint32_t g_total_personas; uint32_t g_max_personas = MAX_PERSONAS; - -struct persona *g_system_persona = NULL; +struct persona *system_persona = NULL; +struct persona *proxy_system_persona = NULL; +#if CONFIG_EMBEDDED +int unique_persona = 1; +#else +int unique_persona = 0; +#endif static uid_t g_next_persona_id; @@ -80,17 +87,23 @@ os_refgrp_decl(static, persona_refgrp, "persona", NULL); static zone_t persona_zone; kauth_cred_t g_default_persona_cred; +extern struct auditinfo_addr *audit_default_aia_p; #define lock_personas() lck_mtx_lock(&all_personas_lock) #define unlock_personas() lck_mtx_unlock(&all_personas_lock) - extern void mach_kauth_cred_uthread_update(void); +extern kern_return_t bank_get_bank_ledger_thread_group_and_persona(void *voucher, + void *bankledger, void **banktg, uint32_t *persona_id); +void +ipc_voucher_release(void *voucher); + void personas_bootstrap(void) { struct posix_cred pcred; + int unique_persona_bootarg; persona_dbg("Initializing persona subsystem"); LIST_INIT(&all_personas); @@ -126,20 +139,17 @@ personas_bootstrap(void) if (!g_default_persona_cred) { panic("couldn't create default persona credentials!"); } - - g_system_persona = persona_alloc(PERSONA_SYSTEM_UID, - PERSONA_SYSTEM_LOGIN, - PERSONA_SYSTEM, NULL); - int err = persona_init_begin(g_system_persona); - assert(err == 0); - - persona_init_end(g_system_persona, err); - - assert(g_system_persona != NULL); +#if CONFIG_AUDIT + /* posix_cred_create() sets this value to NULL */ + g_default_persona_cred->cr_audit.as_aia_p = audit_default_aia_p; +#endif + if (PE_parse_boot_argn("unique_persona", &unique_persona_bootarg, sizeof(unique_persona_bootarg))) { + unique_persona = !!unique_persona_bootarg; + } } struct persona * -persona_alloc(uid_t id, const char *login, int type, int *error) +persona_alloc(uid_t id, const char *login, int type, char *path, int *error) { struct persona *persona; int err = 0; @@ -170,7 +180,7 @@ persona_alloc(uid_t id, const char *login, int type, int *error) bzero(persona, sizeof(*persona)); - if (hw_atomic_add(&g_total_personas, 1) > MAX_PERSONAS) { + if (os_atomic_inc(&g_total_personas, relaxed) > MAX_PERSONAS) { /* too many personas! */ pna_err("too many active personas!"); err = EBUSY; @@ -199,6 +209,7 @@ persona_alloc(uid_t id, const char *login, int type, int *error) persona->pna_type = type; persona->pna_id = id; persona->pna_valid = PERSONA_ALLOC_TOKEN; + persona->pna_path = path; /* * NOTE: this persona has not been fully initialized. A subsequent @@ -211,7 +222,7 @@ persona_alloc(uid_t id, const char *login, int type, int *error) return persona; out_error: - (void)hw_atomic_add(&g_total_personas, -1); + os_atomic_dec(&g_total_personas, relaxed); zfree(persona_zone, persona); if (error) { *error = err; @@ -375,7 +386,7 @@ persona_init_end(struct persona *persona, int error) if (error != 0 || persona->pna_valid == PERSONA_ALLOC_TOKEN) { persona_dbg("ERROR:%d after initialization of %d (%s)", error, persona->pna_id, persona->pna_login); /* remove this persona from the global count */ - (void)hw_atomic_add(&g_total_personas, -1); + os_atomic_dec(&g_total_personas, relaxed); } else if (error == 0 && persona->pna_valid == PERSONA_INIT_TOKEN) { persona->pna_valid = PERSONA_MAGIC; @@ -386,6 +397,76 @@ persona_init_end(struct persona *persona, int error) unlock_personas(); } +/** + * persona_verify_and_set_uniqueness + * + * This function checks the persona, if the one being spawned is of type + * PERSONA_SYSTEM or PERSONA_SYSTEM_PROXY, is unique. + * + * Conditions: + * global persona list is locked on entry and return. + * + * Returns: + * EEXIST: if persona is system/system-proxy and is not unique. + * 0: Otherwise. + */ +int +persona_verify_and_set_uniqueness(struct persona *persona) +{ + if (persona == NULL) { + return EINVAL; + } + + if (!unique_persona) { + return 0; + } + + if (persona->pna_type == PERSONA_SYSTEM) { + if (system_persona != NULL) { + return EEXIST; + } + system_persona = persona; + return 0; + } + + if (persona->pna_type == PERSONA_SYSTEM_PROXY) { + if (proxy_system_persona != NULL) { + return EEXIST; + } + proxy_system_persona = persona; + return 0; + } + return 0; +} + +/** + * persona_is_unique + * + * This function checks if the persona spawned is unique. + * + * Returns: + * TRUE: if unique. + * FALSE: otherwise. + */ +boolean_t +persona_is_unique(struct persona *persona) +{ + if (persona == NULL) { + return FALSE; + } + + if (!unique_persona) { + return FALSE; + } + + if (persona->pna_type == PERSONA_SYSTEM || + persona->pna_type == PERSONA_SYSTEM_PROXY) { + return TRUE; + } + + return FALSE; +} + static struct persona * persona_get_locked(struct persona *persona) { @@ -438,11 +519,14 @@ persona_put(struct persona *persona) persona_lock(persona); if (persona_valid(persona)) { LIST_REMOVE(persona, pna_list); - if (hw_atomic_add(&g_total_personas, -1) == UINT_MAX) { + if (os_atomic_dec_orig(&g_total_personas, relaxed) == 0) { panic("persona count underflow!\n"); } persona_mkinvalid(persona); } + if (persona->pna_path != NULL) { + FREE_ZONE(persona->pna_path, MAXPATHLEN, M_NAMEI); + } persona_unlock(persona); unlock_personas(); @@ -497,11 +581,11 @@ persona_lookup_and_invalidate(uid_t id) LIST_FOREACH_SAFE(entry, &all_personas, pna_list, tmp) { persona_lock(entry); if (entry->pna_id == id) { - if (persona_valid(entry)) { + if (persona_valid(entry) && !persona_is_unique(entry)) { persona = persona_get_locked(entry); assert(persona != NULL); LIST_REMOVE(persona, pna_list); - if (hw_atomic_add(&g_total_personas, -1) == UINT_MAX) { + if (os_atomic_dec_orig(&g_total_personas, relaxed) == 0) { panic("persona ref count underflow!\n"); } persona_mkinvalid(persona); @@ -516,9 +600,22 @@ persona_lookup_and_invalidate(uid_t id) return persona; } +int +persona_find_by_type(int persona_type, struct persona **persona, size_t *plen) +{ + return persona_find_all(NULL, PERSONA_ID_NONE, persona_type, persona, plen); +} + int persona_find(const char *login, uid_t uid, struct persona **persona, size_t *plen) +{ + return persona_find_all(login, uid, PERSONA_INVALID, persona, plen); +} + +int +persona_find_all(const char *login, uid_t uid, int persona_type, + struct persona **persona, size_t *plen) { struct persona *tmp; int match = 0; @@ -530,6 +627,11 @@ persona_find(const char *login, uid_t uid, if (uid != PERSONA_ID_NONE) { match++; } + if ((persona_type > PERSONA_INVALID) && (persona_type <= PERSONA_TYPE_MAX)) { + match++; + } else if (persona_type != PERSONA_INVALID) { + return EINVAL; + } if (match == 0) { return EINVAL; @@ -548,6 +650,9 @@ persona_find(const char *login, uid_t uid, if (uid != PERSONA_ID_NONE && uid == tmp->pna_id) { m++; } + if (persona_type != PERSONA_INVALID && persona_type == tmp->pna_type) { + m++; + } if (m == match) { if (persona && *plen > found) { persona[found] = persona_get_locked(tmp); @@ -593,13 +698,29 @@ persona_proc_get(pid_t pid) struct persona * current_persona_get(void) { - proc_t p = current_proc(); - struct persona *persona; - - proc_lock(p); - persona = persona_get(p->p_persona); - proc_unlock(p); + struct persona *persona = NULL; + uid_t current_persona_id = PERSONA_ID_NONE; + ipc_voucher_t voucher; + thread_get_mach_voucher(current_thread(), 0, &voucher); + /* returns a voucher ref */ + if (voucher != IPC_VOUCHER_NULL) { + /* + * If the voucher doesn't contain a bank attribute, it uses + * the default bank task value to determine the persona id + * which is the same as the proc's persona id + */ + bank_get_bank_ledger_thread_group_and_persona(voucher, NULL, + NULL, ¤t_persona_id); + ipc_voucher_release(voucher); + persona = persona_lookup(current_persona_id); + } else { + /* Fallback - get the proc's persona */ + proc_t p = current_proc(); + proc_lock(p); + persona = persona_get(p->p_persona); + proc_unlock(p); + } return persona; } @@ -852,7 +973,6 @@ persona_proc_adopt(proc_t p, struct persona *persona, kauth_cred_t auth_override { int error; struct persona *old_persona; - struct session * sessp; if (!persona) { return EINVAL; @@ -886,15 +1006,21 @@ persona_proc_adopt(proc_t p, struct persona *persona, kauth_cred_t auth_override enterpgrp(p, persona->pna_pgid, persona->pna_pgid == uid); } + /* Only Multiuser Mode needs to update the session login name to the persona name */ +#if (TARGET_OS_IPHONE && !TARGET_OS_SIMULATOR) + volatile uint32_t *multiuser_flag_address = (volatile uint32_t *)(uintptr_t)(_COMM_PAGE_MULTIUSER_CONFIG); + uint32_t multiuser_flags = *multiuser_flag_address; /* set the login name of the session */ - sessp = proc_session(p); - if (sessp != SESSION_NULL) { - session_lock(sessp); - bcopy(persona->pna_login, sessp->s_login, MAXLOGNAME); - session_unlock(sessp); - session_rele(sessp); + if (multiuser_flags) { + struct session * sessp = proc_session(p); + if (sessp != SESSION_NULL) { + session_lock(sessp); + bcopy(persona->pna_login, sessp->s_login, MAXLOGNAME); + session_unlock(sessp); + session_rele(sessp); + } } - +#endif persona_unlock(persona); set_security_token(p); @@ -1259,8 +1385,6 @@ persona_get_login(struct persona *persona, char login[MAXLOGNAME + 1]) out_unlock: persona_unlock(persona); - login[MAXLOGNAME] = 0; - return ret; } @@ -1270,6 +1394,10 @@ out_unlock: * symbol exports for kext compatibility */ +struct persona *system_persona = NULL; +struct persona *proxy_system_persona = NULL; +int unique_persona = 0; + uid_t persona_get_id(__unused struct persona *persona) { @@ -1303,6 +1431,20 @@ persona_find(__unused const char *login, return ENOTSUP; } +int +persona_find_by_type(__unused int persona_type, + __unused struct persona **persona, + __unused size_t *plen) +{ + return ENOTSUP; +} + +struct persona * +persona_proc_get(__unused pid_t pid) +{ + return NULL; +} + struct persona * current_persona_get(void) { diff --git a/bsd/kern/kern_proc.c b/bsd/kern/kern_proc.c index 1d7689232..1efcb5674 100644 --- a/bsd/kern/kern_proc.c +++ b/bsd/kern/kern_proc.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2007 Apple Inc. All rights reserved. + * Copyright (c) 2000-2019 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -111,6 +111,8 @@ #include #include #include +#include +#include #ifdef CONFIG_32BIT_TELEMETRY #include @@ -162,6 +164,10 @@ extern struct tty cons; extern int cs_debug; +#if DEVELOPMENT || DEBUG +int syscallfilter_disable = 0; +#endif // DEVELOPMENT || DEBUG + #if DEBUG #define __PROC_INTERNAL_DEBUG 1 #endif @@ -184,6 +190,7 @@ typedef uint64_t unaligned_u64 __attribute__((aligned(1))); static void orphanpg(struct pgrp * pg); void proc_name_kdp(task_t t, char * buf, int size); +boolean_t proc_binary_uuid_kdp(task_t task, uuid_t uuid); int proc_threadname_kdp(void * uth, char * buf, size_t size); void proc_starttime_kdp(void * p, unaligned_u64 *tv_sec, unaligned_u64 *tv_usec, unaligned_u64 *abstime); char * proc_name_address(void * p); @@ -463,13 +470,12 @@ record_procref(proc_t p __unused, int count) return; } - if (count == 1) { - if (uth->uu_pindex < NUM_PROC_REFS_TO_TRACK) { - backtrace((uintptr_t *) &uth->uu_proc_pcs[uth->uu_pindex], PROC_REF_STACK_DEPTH); + if (uth->uu_pindex < NUM_PROC_REFS_TO_TRACK) { + backtrace((uintptr_t *) &uth->uu_proc_pcs[uth->uu_pindex], + PROC_REF_STACK_DEPTH, NULL); - uth->uu_proc_ps[uth->uu_pindex] = p; - uth->uu_pindex++; - } + uth->uu_proc_ps[uth->uu_pindex] = p; + uth->uu_pindex++; } #endif } @@ -808,6 +814,15 @@ proc_ppid(proc_t p) return -1; } +int +proc_original_ppid(proc_t p) +{ + if (p != NULL) { + return p->p_original_ppid; + } + return -1; +} + int proc_selfpid(void) { @@ -826,6 +841,24 @@ proc_selfcsflags(void) return current_proc()->p_csflags; } +uint32_t +proc_platform(proc_t p) +{ + if (p != NULL) { + return p->p_platform; + } + return (uint32_t)-1; +} + +uint32_t +proc_sdk(proc_t p) +{ + if (p != NULL) { + return p->p_sdk; + } + return (uint32_t)-1; +} + #if CONFIG_DTRACE static proc_t dtrace_current_proc_vforking(void) @@ -923,6 +956,19 @@ proc_name_kdp(task_t t, char * buf, int size) } } +boolean_t +proc_binary_uuid_kdp(task_t task, uuid_t uuid) +{ + proc_t p = get_bsdtask_info(task); + if (p == PROC_NULL) { + return FALSE; + } + + proc_getexecutableuuid(p, uuid, sizeof(uuid_t)); + + return TRUE; +} + int proc_threadname_kdp(void * uth, char * buf, size_t size) { @@ -1191,6 +1237,12 @@ proc_getcdhash(proc_t p, unsigned char *cdhash) return vn_getcdhash(p->p_textvp, p->p_textoff, cdhash); } +int +proc_exitstatus(proc_t p) +{ + return p->p_xstat & 0xffff; +} + void proc_getexecutableuuid(proc_t p, unsigned char *uuidbuf, unsigned long size) { @@ -1214,6 +1266,49 @@ proc_getexecutablevnode(proc_t p) return NULLVP; } +int +proc_selfexecutableargs(uint8_t *buf, size_t *buflen) +{ + proc_t p = current_proc(); + + // buflen must always be provided + if (buflen == NULL) { + return EINVAL; + } + + // If a buf is provided, there must be at least enough room to fit argc + if (buf && *buflen < sizeof(p->p_argc)) { + return EINVAL; + } + + if (!p->user_stack) { + return EINVAL; + } + + if (buf == NULL) { + *buflen = p->p_argslen + sizeof(p->p_argc); + return 0; + } + + // Copy in argc to the first 4 bytes + memcpy(buf, &p->p_argc, sizeof(p->p_argc)); + + if (*buflen > sizeof(p->p_argc) && p->p_argslen > 0) { + // See memory layout comment in kern_exec.c:exec_copyout_strings() + // We want to copy starting from `p_argslen` bytes away from top of stack + return copyin(p->user_stack - p->p_argslen, + buf + sizeof(p->p_argc), + MIN(p->p_argslen, *buflen - sizeof(p->p_argc))); + } else { + return 0; + } +} + +off_t +proc_getexecutableoffset(proc_t p) +{ + return p->p_textoff; +} void bsd_set_dependency_capable(task_t task) @@ -1387,6 +1482,7 @@ pinsertchild(proc_t parent, proc_t child) TAILQ_INIT(&child->p_evlist); child->p_pptr = parent; child->p_ppid = parent->p_pid; + child->p_original_ppid = parent->p_pid; child->p_puniqueid = parent->p_uniqueid; child->p_xhighbits = 0; @@ -1765,6 +1861,95 @@ fixjobc(proc_t p, struct pgrp *pgrp, int entering) proc_childrenwalk(p, fixjob_callback, &fjarg); } +/* + * The pidlist_* routines support the functions in this file that + * walk lists of processes applying filters and callouts to the + * elements of the list. + * + * A prior implementation used a single linear array, which can be + * tricky to allocate on large systems. This implementation creates + * an SLIST of modestly sized arrays of PIDS_PER_ENTRY elements. + * + * The array should be sized large enough to keep the overhead of + * walking the list low, but small enough that blocking allocations of + * pidlist_entry_t structures always succeed. + */ + +#define PIDS_PER_ENTRY 1021 + +typedef struct pidlist_entry { + SLIST_ENTRY(pidlist_entry) pe_link; + u_int pe_nused; + pid_t pe_pid[PIDS_PER_ENTRY]; +} pidlist_entry_t; + +typedef struct { + SLIST_HEAD(, pidlist_entry) pl_head; + struct pidlist_entry *pl_active; + u_int pl_nalloc; +} pidlist_t; + +static __inline__ pidlist_t * +pidlist_init(pidlist_t *pl) +{ + SLIST_INIT(&pl->pl_head); + pl->pl_active = NULL; + pl->pl_nalloc = 0; + return pl; +} + +static u_int +pidlist_alloc(pidlist_t *pl, u_int needed) +{ + while (pl->pl_nalloc < needed) { + pidlist_entry_t *pe = kalloc(sizeof(*pe)); + if (NULL == pe) { + panic("no space for pidlist entry"); + } + pe->pe_nused = 0; + SLIST_INSERT_HEAD(&pl->pl_head, pe, pe_link); + pl->pl_nalloc += (sizeof(pe->pe_pid) / sizeof(pe->pe_pid[0])); + } + return pl->pl_nalloc; +} + +static void +pidlist_free(pidlist_t *pl) +{ + pidlist_entry_t *pe; + while (NULL != (pe = SLIST_FIRST(&pl->pl_head))) { + SLIST_FIRST(&pl->pl_head) = SLIST_NEXT(pe, pe_link); + kfree(pe, sizeof(*pe)); + } + pl->pl_nalloc = 0; +} + +static __inline__ void +pidlist_set_active(pidlist_t *pl) +{ + pl->pl_active = SLIST_FIRST(&pl->pl_head); + assert(pl->pl_active); +} + +static void +pidlist_add_pid(pidlist_t *pl, pid_t pid) +{ + pidlist_entry_t *pe = pl->pl_active; + if (pe->pe_nused >= sizeof(pe->pe_pid) / sizeof(pe->pe_pid[0])) { + if (NULL == (pe = SLIST_NEXT(pe, pe_link))) { + panic("pidlist allocation exhausted"); + } + pl->pl_active = pe; + } + pe->pe_pid[pe->pe_nused++] = pid; +} + +static __inline__ u_int +pidlist_nalloc(const pidlist_t *pl) +{ + return pl->pl_nalloc; +} + /* * A process group has become orphaned; if there are any stopped processes in * the group, hang-up all process in that group. @@ -1772,14 +1957,9 @@ fixjobc(proc_t p, struct pgrp *pgrp, int entering) static void orphanpg(struct pgrp *pgrp) { - pid_t *pid_list; + pidlist_t pid_list, *pl = pidlist_init(&pid_list); + u_int pid_count_available = 0; proc_t p; - vm_size_t pid_list_size = 0; - vm_size_t pid_list_size_needed = 0; - int pid_count = 0; - int pid_count_available = 0; - - assert(pgrp != NULL); /* allocate outside of the pgrp_lock */ for (;;) { @@ -1790,71 +1970,52 @@ orphanpg(struct pgrp *pgrp) PGMEMBERS_FOREACH(pgrp, p) { pid_count_available++; - if (p->p_stat == SSTOP) { should_iterate = TRUE; } } - if (pid_count_available == 0 || !should_iterate) { pgrp_unlock(pgrp); - return; + goto out; /* no orphaned processes OR nothing stopped */ } - - pid_list_size_needed = pid_count_available * sizeof(pid_t); - if (pid_list_size >= pid_list_size_needed) { + if (pidlist_nalloc(pl) >= pid_count_available) { break; } pgrp_unlock(pgrp); - if (pid_list_size != 0) { - kfree(pid_list, pid_list_size); - } - pid_list = kalloc(pid_list_size_needed); - if (!pid_list) { - return; - } - pid_list_size = pid_list_size_needed; - } - - /* no orphaned processes */ - if (pid_list_size == 0) { - pgrp_unlock(pgrp); - return; + pidlist_alloc(pl, pid_count_available); } + pidlist_set_active(pl); + u_int pid_count = 0; PGMEMBERS_FOREACH(pgrp, p) { - pid_list[pid_count++] = proc_pid(p); - if (pid_count >= pid_count_available) { + pidlist_add_pid(pl, proc_pid(p)); + if (++pid_count >= pid_count_available) { break; } } pgrp_unlock(pgrp); - if (pid_count == 0) { - goto out; - } - - for (int i = 0; i < pid_count; i++) { - /* do not handle kernproc */ - if (pid_list[i] == 0) { - continue; - } - p = proc_find(pid_list[i]); - if (!p) { - continue; + const pidlist_entry_t *pe; + SLIST_FOREACH(pe, &(pl->pl_head), pe_link) { + for (u_int i = 0; i < pe->pe_nused; i++) { + const pid_t pid = pe->pe_pid[i]; + if (0 == pid) { + continue; /* skip kernproc */ + } + p = proc_find(pid); + if (!p) { + continue; + } + proc_transwait(p, 0); + pt_setrunnable(p); + psignal(p, SIGHUP); + psignal(p, SIGCONT); + proc_rele(p); } - - proc_transwait(p, 0); - pt_setrunnable(p); - psignal(p, SIGHUP); - psignal(p, SIGCONT); - proc_rele(p); } - out: - kfree(pid_list, pid_list_size); - return; + pidlist_free(pl); } int @@ -2344,7 +2505,7 @@ out: return error; } -int +void proc_iterate( unsigned int flags, proc_iterate_fn_t callout, @@ -2352,40 +2513,28 @@ proc_iterate( proc_iterate_fn_t filterfn, void *filterarg) { - pid_t *pid_list = NULL; - vm_size_t pid_list_size = 0; - vm_size_t pid_list_size_needed = 0; - int pid_count = 0; - int pid_count_available = 0; + pidlist_t pid_list, *pl = pidlist_init(&pid_list); + u_int pid_count_available = 0; assert(callout != NULL); /* allocate outside of the proc_list_lock */ for (;;) { proc_list_lock(); - - pid_count_available = nprocs + 1 /* kernel_task not counted in nprocs */; + pid_count_available = nprocs + 1; /* kernel_task not counted in nprocs */ assert(pid_count_available > 0); - - pid_list_size_needed = pid_count_available * sizeof(pid_t); - if (pid_list_size >= pid_list_size_needed) { + if (pidlist_nalloc(pl) > pid_count_available) { break; } proc_list_unlock(); - if (pid_list_size != 0) { - kfree(pid_list, pid_list_size); - } - pid_list = kalloc(pid_list_size_needed); - if (!pid_list) { - return 1; - } - pid_list_size = pid_list_size_needed; + pidlist_alloc(pl, pid_count_available); } - assert(pid_list != NULL); + pidlist_set_active(pl); - /* filter pids into pid_list */ + /* filter pids into the pid_list */ + u_int pid_count = 0; if (flags & PROC_ALLPROCLIST) { proc_t p; ALLPROC_FOREACH(p) { @@ -2396,9 +2545,8 @@ proc_iterate( if ((filterfn != NULL) && (filterfn(p, filterarg) == 0)) { continue; } - - pid_list[pid_count++] = proc_pid(p); - if (pid_count >= pid_count_available) { + pidlist_add_pid(pl, proc_pid(p)); + if (++pid_count >= pid_count_available) { break; } } @@ -2411,9 +2559,8 @@ proc_iterate( if ((filterfn != NULL) && (filterfn(p, filterarg) == 0)) { continue; } - - pid_list[pid_count++] = proc_pid(p); - if (pid_count >= pid_count_available) { + pidlist_add_pid(pl, proc_pid(p)); + if (++pid_count >= pid_count_available) { break; } } @@ -2423,63 +2570,63 @@ proc_iterate( /* call callout on processes in the pid_list */ - for (int i = 0; i < pid_count; i++) { - proc_t p = proc_find(pid_list[i]); - if (p) { - if ((flags & PROC_NOWAITTRANS) == 0) { - proc_transwait(p, 0); - } - int callout_ret = callout(p, arg); - - switch (callout_ret) { - case PROC_RETURNED_DONE: - proc_rele(p); - /* FALLTHROUGH */ - case PROC_CLAIMED_DONE: - goto out; - - case PROC_RETURNED: - proc_rele(p); - /* FALLTHROUGH */ - case PROC_CLAIMED: - break; - - default: - panic("proc_iterate: callout returned %d for pid %d", - callout_ret, pid_list[i]); - break; - } - } else if (flags & PROC_ZOMBPROCLIST) { - p = proc_find_zombref(pid_list[i]); - if (!p) { - continue; - } - int callout_ret = callout(p, arg); - - switch (callout_ret) { - case PROC_RETURNED_DONE: - proc_drop_zombref(p); - /* FALLTHROUGH */ - case PROC_CLAIMED_DONE: - goto out; - - case PROC_RETURNED: - proc_drop_zombref(p); - /* FALLTHROUGH */ - case PROC_CLAIMED: - break; - - default: - panic("proc_iterate: callout returned %d for zombie pid %d", - callout_ret, pid_list[i]); - break; + const pidlist_entry_t *pe; + SLIST_FOREACH(pe, &(pl->pl_head), pe_link) { + for (u_int i = 0; i < pe->pe_nused; i++) { + const pid_t pid = pe->pe_pid[i]; + proc_t p = proc_find(pid); + if (p) { + if ((flags & PROC_NOWAITTRANS) == 0) { + proc_transwait(p, 0); + } + const int callout_ret = callout(p, arg); + + switch (callout_ret) { + case PROC_RETURNED_DONE: + proc_rele(p); + /* FALLTHROUGH */ + case PROC_CLAIMED_DONE: + goto out; + + case PROC_RETURNED: + proc_rele(p); + /* FALLTHROUGH */ + case PROC_CLAIMED: + break; + default: + panic("%s: callout =%d for pid %d", + __func__, callout_ret, pid); + break; + } + } else if (flags & PROC_ZOMBPROCLIST) { + p = proc_find_zombref(pid); + if (!p) { + continue; + } + const int callout_ret = callout(p, arg); + + switch (callout_ret) { + case PROC_RETURNED_DONE: + proc_drop_zombref(p); + /* FALLTHROUGH */ + case PROC_CLAIMED_DONE: + goto out; + + case PROC_RETURNED: + proc_drop_zombref(p); + /* FALLTHROUGH */ + case PROC_CLAIMED: + break; + default: + panic("%s: callout =%d for zombie %d", + __func__, callout_ret, pid); + break; + } } } } - out: - kfree(pid_list, pid_list_size); - return 0; + pidlist_free(pl); } void @@ -2520,93 +2667,82 @@ restart_foreach: proc_list_unlock(); } -int +void proc_childrenwalk( proc_t parent, proc_iterate_fn_t callout, void *arg) { - pid_t *pid_list; - vm_size_t pid_list_size = 0; - vm_size_t pid_list_size_needed = 0; - int pid_count = 0; - int pid_count_available = 0; + pidlist_t pid_list, *pl = pidlist_init(&pid_list); + u_int pid_count_available = 0; assert(parent != NULL); assert(callout != NULL); for (;;) { proc_list_lock(); - pid_count_available = parent->p_childrencnt; if (pid_count_available == 0) { proc_list_unlock(); - return 0; + goto out; } - - pid_list_size_needed = pid_count_available * sizeof(pid_t); - if (pid_list_size >= pid_list_size_needed) { + if (pidlist_nalloc(pl) > pid_count_available) { break; } proc_list_unlock(); - if (pid_list_size != 0) { - kfree(pid_list, pid_list_size); - } - pid_list = kalloc(pid_list_size_needed); - if (!pid_list) { - return 1; - } - pid_list_size = pid_list_size_needed; + pidlist_alloc(pl, pid_count_available); } + pidlist_set_active(pl); + u_int pid_count = 0; proc_t p; PCHILDREN_FOREACH(parent, p) { if (p->p_stat == SIDL) { continue; } - - pid_list[pid_count++] = proc_pid(p); - if (pid_count >= pid_count_available) { + pidlist_add_pid(pl, proc_pid(p)); + if (++pid_count >= pid_count_available) { break; } } proc_list_unlock(); - for (int i = 0; i < pid_count; i++) { - p = proc_find(pid_list[i]); - if (!p) { - continue; - } - - int callout_ret = callout(p, arg); + const pidlist_entry_t *pe; + SLIST_FOREACH(pe, &(pl->pl_head), pe_link) { + for (u_int i = 0; i < pe->pe_nused; i++) { + const pid_t pid = pe->pe_pid[i]; + p = proc_find(pid); + if (!p) { + continue; + } + const int callout_ret = callout(p, arg); - switch (callout_ret) { - case PROC_RETURNED_DONE: - proc_rele(p); - /* FALLTHROUGH */ - case PROC_CLAIMED_DONE: - goto out; + switch (callout_ret) { + case PROC_RETURNED_DONE: + proc_rele(p); + /* FALLTHROUGH */ + case PROC_CLAIMED_DONE: + goto out; - case PROC_RETURNED: - proc_rele(p); - /* FALLTHROUGH */ - case PROC_CLAIMED: - break; - default: - panic("proc_childrenwalk: callout returned %d for pid %d", - callout_ret, pid_list[i]); - break; + case PROC_RETURNED: + proc_rele(p); + /* FALLTHROUGH */ + case PROC_CLAIMED: + break; + default: + panic("%s: callout =%d for pid %d", + __func__, callout_ret, pid); + break; + } } } - out: - kfree(pid_list, pid_list_size); - return 0; + pidlist_free(pl); } -int +void pgrp_iterate( struct pgrp *pgrp, unsigned int flags, @@ -2615,51 +2751,40 @@ pgrp_iterate( proc_iterate_fn_t filterfn, void * filterarg) { - pid_t *pid_list; - proc_t p; - vm_size_t pid_list_size = 0; - vm_size_t pid_list_size_needed = 0; - int pid_count = 0; - int pid_count_available = 0; - - pid_t pgid; + pidlist_t pid_list, *pl = pidlist_init(&pid_list); + u_int pid_count_available = 0; assert(pgrp != NULL); assert(callout != NULL); for (;;) { pgrp_lock(pgrp); - pid_count_available = pgrp->pg_membercnt; if (pid_count_available == 0) { pgrp_unlock(pgrp); - return 0; + if (flags & PGRP_DROPREF) { + pg_rele(pgrp); + } + goto out; } - - pid_list_size_needed = pid_count_available * sizeof(pid_t); - if (pid_list_size >= pid_list_size_needed) { + if (pidlist_nalloc(pl) > pid_count_available) { break; } pgrp_unlock(pgrp); - if (pid_list_size != 0) { - kfree(pid_list, pid_list_size); - } - pid_list = kalloc(pid_list_size_needed); - if (!pid_list) { - return 1; - } - pid_list_size = pid_list_size_needed; + pidlist_alloc(pl, pid_count_available); } + pidlist_set_active(pl); - pgid = pgrp->pg_id; - + const pid_t pgid = pgrp->pg_id; + u_int pid_count = 0; + proc_t p; PGMEMBERS_FOREACH(pgrp, p) { if ((filterfn != NULL) && (filterfn(p, filterarg) == 0)) { continue;; } - pid_list[pid_count++] = proc_pid(p); - if (pid_count >= pid_count_available) { + pidlist_add_pid(pl, proc_pid(p)); + if (++pid_count >= pid_count_available) { break; } } @@ -2670,44 +2795,44 @@ pgrp_iterate( pg_rele(pgrp); } - for (int i = 0; i < pid_count; i++) { - /* do not handle kernproc */ - if (pid_list[i] == 0) { - continue; - } - p = proc_find(pid_list[i]); - if (!p) { - continue; - } - if (p->p_pgrpid != pgid) { - proc_rele(p); - continue; - } - - int callout_ret = callout(p, arg); - - switch (callout_ret) { - case PROC_RETURNED: - proc_rele(p); - /* FALLTHROUGH */ - case PROC_CLAIMED: - break; + const pidlist_entry_t *pe; + SLIST_FOREACH(pe, &(pl->pl_head), pe_link) { + for (u_int i = 0; i < pe->pe_nused; i++) { + const pid_t pid = pe->pe_pid[i]; + if (0 == pid) { + continue; /* skip kernproc */ + } + p = proc_find(pid); + if (!p) { + continue; + } + if (p->p_pgrpid != pgid) { + proc_rele(p); + continue; + } + const int callout_ret = callout(p, arg); - case PROC_RETURNED_DONE: - proc_rele(p); - /* FALLTHROUGH */ - case PROC_CLAIMED_DONE: - goto out; + switch (callout_ret) { + case PROC_RETURNED: + proc_rele(p); + /* FALLTHROUGH */ + case PROC_CLAIMED: + break; + case PROC_RETURNED_DONE: + proc_rele(p); + /* FALLTHROUGH */ + case PROC_CLAIMED_DONE: + goto out; - default: - panic("pgrp_iterate: callout returned %d for pid %d", - callout_ret, pid_list[i]); + default: + panic("%s: callout =%d for pid %d", + __func__, callout_ret, pid); + } } } out: - kfree(pid_list, pid_list_size); - return 0; + pidlist_free(pl); } static void @@ -3110,7 +3235,7 @@ proc_knote_drain(struct proc *p) */ proc_klist_lock(); while ((kn = SLIST_FIRST(&p->p_klist))) { - kn->kn_ptr.p_proc = PROC_NULL; + kn->kn_proc = PROC_NULL; KNOTE_DETACH(&p->p_klist, kn); } proc_klist_unlock(); @@ -3138,6 +3263,20 @@ proc_pgrpid(proc_t p) return p->p_pgrpid; } +pid_t +proc_sessionid(proc_t p) +{ + pid_t sid = -1; + struct session * sessp = proc_session(p); + + if (sessp != SESSION_NULL) { + sid = sessp->s_sid; + session_rele(sessp); + } + + return sid; +} + pid_t proc_selfpgrpid() { @@ -3167,6 +3306,7 @@ int proc_dopcontrol(proc_t p) { int pcontrol; + os_reason_t kill_reason; proc_lock(p); @@ -3191,7 +3331,8 @@ proc_dopcontrol(proc_t p) PROC_SETACTION_STATE(p); proc_unlock(p); printf("low swap: killing pid %d (%s)\n", p->p_pid, p->p_comm); - psignal(p, SIGKILL); + kill_reason = os_reason_create(OS_REASON_JETSAM, JETSAM_REASON_LOWSWAP); + psignal_with_reason(p, SIGKILL, kill_reason); break; default: @@ -3348,7 +3489,7 @@ proc_pcontrol_null(__unused proc_t p, __unused void *arg) extern uint64_t vm_compressor_pages_compressed(void); -struct timeval last_no_space_action = {0, 0}; +struct timeval last_no_space_action = {.tv_sec = 0, .tv_usec = 0}; #if DEVELOPMENT || DEBUG extern boolean_t kill_on_no_paging_space; @@ -3366,6 +3507,7 @@ no_paging_space_action() proc_t p; struct no_paging_space nps; struct timeval now; + os_reason_t kill_reason; /* * Throttle how often we come through here. Once every 5 seconds should be plenty. @@ -3413,7 +3555,8 @@ no_paging_space_action() last_no_space_action = now; printf("low swap: killing largest compressed process with pid %d (%s) and size %llu MB\n", p->p_pid, p->p_comm, (nps.pcs_max_size / MB_SIZE)); - psignal(p, SIGKILL); + kill_reason = os_reason_create(OS_REASON_JETSAM, JETSAM_REASON_LOWSWAP); + psignal_with_reason(p, SIGKILL, kill_reason); proc_rele(p); @@ -3575,6 +3718,36 @@ proc_send_synchronous_EXC_RESOURCE(proc_t p) return FALSE; } +size_t +proc_get_syscall_filter_mask_size(int which) +{ + if (which == SYSCALL_MASK_UNIX) { + return nsysent; + } + + return 0; +} + +int +proc_set_syscall_filter_mask(proc_t p, int which, unsigned char *maskptr, size_t masklen) +{ +#if DEVELOPMENT || DEBUG + if (syscallfilter_disable) { + printf("proc_set_syscall_filter_mask: attempt to set policy for pid %d, but disabled by boot-arg\n", proc_pid(p)); + return KERN_SUCCESS; + } +#endif // DEVELOPMENT || DEBUG + + if (which != SYSCALL_MASK_UNIX || + (maskptr != NULL && masklen != nsysent)) { + return EINVAL; + } + + p->syscall_filter_mask = maskptr; + + return KERN_SUCCESS; +} + #ifdef CONFIG_32BIT_TELEMETRY void proc_log_32bit_telemetry(proc_t p) diff --git a/bsd/kern/kern_prot.c b/bsd/kern/kern_prot.c index 7840d8a4b..4a4a662ed 100644 --- a/bsd/kern/kern_prot.c +++ b/bsd/kern/kern_prot.c @@ -501,6 +501,27 @@ getwgroups(__unused proc_t p, __unused struct getwgroups_args *uap, __unused int return ENOTSUP; } +/* + * setsid_internal + * + * Description: Core implementation of setsid(). + */ +int +setsid_internal(proc_t p) +{ + struct pgrp * pg = PGRP_NULL; + + if (p->p_pgrpid == p->p_pid || (pg = pgfind(p->p_pid)) || p->p_lflag & P_LINVFORK) { + if (pg != PGRP_NULL) { + pg_rele(pg); + } + return EPERM; + } else { + /* enter pgrp works with its own pgrp refcount */ + (void)enterpgrp(p, p->p_pid, 1); + return 0; + } +} /* * setsid @@ -529,19 +550,11 @@ getwgroups(__unused proc_t p, __unused struct getwgroups_args *uap, __unused int int setsid(proc_t p, __unused struct setsid_args *uap, int32_t *retval) { - struct pgrp * pg = PGRP_NULL; - - if (p->p_pgrpid == p->p_pid || (pg = pgfind(p->p_pid)) || p->p_lflag & P_LINVFORK) { - if (pg != PGRP_NULL) { - pg_rele(pg); - } - return EPERM; - } else { - /* enter pgrp works with its own pgrp refcount */ - (void)enterpgrp(p, p->p_pid, 1); + int rc = setsid_internal(p); + if (rc == 0) { *retval = p->p_pid; - return 0; } + return rc; } @@ -1640,30 +1653,34 @@ settid_with_pid(proc_t p, struct settid_with_pid_args *uap, __unused int32_t *re * flag the process as having set privilege since the last exec. */ static int -setgroups1(proc_t p, u_int gidsetsize, user_addr_t gidset, uid_t gmuid, __unused int32_t *retval) +setgroups1(proc_t p, u_int ngrp, user_addr_t gidset, uid_t gmuid, __unused int32_t *retval) { - u_int ngrp; gid_t newgroups[NGROUPS] = { 0 }; int error; - kauth_cred_t my_cred, my_new_cred; - struct uthread *uthread = get_bsdthread_info(current_thread()); - DEBUG_CRED_ENTER("setgroups1 (%d/%d): %d 0x%016x %d\n", p->p_pid, (p->p_pptr ? p->p_pptr->p_pid : 0), gidsetsize, gidset, gmuid); + DEBUG_CRED_ENTER("setgroups1 (%d/%d): %d 0x%016x %d\n", p->p_pid, + (p->p_pptr ? p->p_pptr->p_pid : 0), ngrp, gidset, gmuid); - ngrp = gidsetsize; if (ngrp > NGROUPS) { return EINVAL; } - if (ngrp < 1) { - ngrp = 1; - } else { + if (ngrp >= 1) { error = copyin(gidset, (caddr_t)newgroups, ngrp * sizeof(gid_t)); if (error) { return error; } } + return setgroups_internal(p, ngrp, newgroups, gmuid); +} + +int +setgroups_internal(proc_t p, u_int ngrp, gid_t *newgroups, uid_t gmuid) +{ + struct uthread *uthread = get_bsdthread_info(current_thread()); + kauth_cred_t my_cred, my_new_cred; + int error; my_cred = kauth_cred_proc_ref(p); if ((error = suser(my_cred, &p->p_acflag))) { @@ -1671,6 +1688,11 @@ setgroups1(proc_t p, u_int gidsetsize, user_addr_t gidset, uid_t gmuid, __unused return error; } + if (ngrp < 1) { + ngrp = 1; + newgroups[0] = 0; + } + if ((uthread->uu_flag & UT_SETUID) != 0) { #if DEBUG_CRED int my_cred_flags = uthread->uu_ucred->cr_flags; @@ -1942,6 +1964,18 @@ getlogin(proc_t p, struct getlogin_args *uap, __unused int32_t *retval) return copyout((caddr_t)buffer, uap->namebuf, uap->namelen); } +void +setlogin_internal(proc_t p, const char login[static MAXLOGNAME]) +{ + struct session *sessp = proc_session(p); + + if (sessp != SESSION_NULL) { + session_lock(sessp); + bcopy(login, sessp->s_login, MAXLOGNAME); + session_unlock(sessp); + session_rele(sessp); + } +} /* * setlogin @@ -1965,7 +1999,6 @@ setlogin(proc_t p, struct setlogin_args *uap, __unused int32_t *retval) int error; size_t dummy = 0; char buffer[MAXLOGNAME + 1]; - struct session * sessp; if ((error = proc_suser(p))) { return error; @@ -1978,15 +2011,7 @@ setlogin(proc_t p, struct setlogin_args *uap, __unused int32_t *retval) (caddr_t) &buffer[0], MAXLOGNAME - 1, (size_t *)&dummy); - sessp = proc_session(p); - - if (sessp != SESSION_NULL) { - session_lock(sessp); - bcopy(buffer, sessp->s_login, MAXLOGNAME); - session_unlock(sessp); - session_rele(sessp); - } - + setlogin_internal(p, buffer); if (!error) { AUDIT_ARG(text, buffer); diff --git a/bsd/kern/kern_resource.c b/bsd/kern/kern_resource.c index 839a190b5..7978cff02 100644 --- a/bsd/kern/kern_resource.c +++ b/bsd/kern/kern_resource.c @@ -133,7 +133,7 @@ int fill_task_rusage(task_t task, rusage_info_current *ri); void fill_task_billed_usage(task_t task, rusage_info_current *ri); int fill_task_io_rusage(task_t task, rusage_info_current *ri); int fill_task_qos_rusage(task_t task, rusage_info_current *ri); -uint64_t get_task_logical_writes(task_t task); +uint64_t get_task_logical_writes(task_t task, boolean_t external); void fill_task_monotonic_rusage(task_t task, rusage_info_current *ri); int proc_get_rusage(proc_t p, int flavor, user_addr_t buffer, __unused int is_zombie); @@ -780,7 +780,11 @@ do_background_socket(struct proc *p, thread_t thread) #if SOCKETS struct filedesc *fdp; struct fileproc *fp; - int i, background; + int i = 0; + int background = false; +#if NECP + int update_necp = false; +#endif /* NECP */ proc_fdlock(p); @@ -811,7 +815,9 @@ do_background_socket(struct proc *p, thread_t thread) } #if NECP else if (FILEGLOB_DTYPE(fp->f_fglob) == DTYPE_NETPOLICY) { - necp_set_client_as_background(p, fp, background); + if (necp_set_client_as_background(p, fp, background)) { + update_necp = true; + } } #endif /* NECP */ } @@ -841,13 +847,21 @@ do_background_socket(struct proc *p, thread_t thread) } #if NECP else if (FILEGLOB_DTYPE(fp->f_fglob) == DTYPE_NETPOLICY) { - necp_set_client_as_background(p, fp, background); + if (necp_set_client_as_background(p, fp, background)) { + update_necp = true; + } } #endif /* NECP */ } } proc_fdunlock(p); + +#if NECP + if (update_necp) { + necp_update_all_clients(); + } +#endif /* NECP */ #else #pragma unused(p, thread) #endif @@ -1480,6 +1494,10 @@ static int iopolicysys_vfs_hfs_case_sensitivity(struct proc *p, int cmd, int scope, int policy, struct _iopol_param_t *iop_param); static int iopolicysys_vfs_atime_updates(struct proc *p, int cmd, int scope, int policy, struct _iopol_param_t *iop_param); +static int +iopolicysys_vfs_materialize_dataless_files(struct proc *p, int cmd, int scope, int policy, struct _iopol_param_t *iop_param); +static int +iopolicysys_vfs_statfs_no_data_volume(struct proc *p, int cmd, int scope, int policy, struct _iopol_param_t *iop_param); /* * iopolicysys @@ -1526,6 +1544,17 @@ iopolicysys(struct proc *p, struct iopolicysys_args *uap, int32_t *retval) goto out; } break; + case IOPOL_TYPE_VFS_MATERIALIZE_DATALESS_FILES: + error = iopolicysys_vfs_materialize_dataless_files(p, uap->cmd, iop_param.iop_scope, iop_param.iop_policy, &iop_param); + if (error) { + goto out; + } + break; + case IOPOL_TYPE_VFS_STATFS_NO_DATA_VOLUME: + error = iopolicysys_vfs_statfs_no_data_volume(p, uap->cmd, iop_param.iop_scope, iop_param.iop_policy, &iop_param); + if (error) { + goto out; + } default: error = EINVAL; goto out; @@ -1823,6 +1852,184 @@ out: return error; } +static inline int +get_thread_materialize_policy(struct uthread *ut) +{ + if (ut->uu_flag & UT_NSPACE_NODATALESSFAULTS) { + return IOPOL_MATERIALIZE_DATALESS_FILES_OFF; + } else if (ut->uu_flag & UT_NSPACE_FORCEDATALESSFAULTS) { + return IOPOL_MATERIALIZE_DATALESS_FILES_ON; + } + /* Default thread behavior is "inherit process behavior". */ + return IOPOL_MATERIALIZE_DATALESS_FILES_DEFAULT; +} + +static inline void +set_thread_materialize_policy(struct uthread *ut, int policy) +{ + if (policy == IOPOL_MATERIALIZE_DATALESS_FILES_OFF) { + ut->uu_flag &= ~UT_NSPACE_FORCEDATALESSFAULTS; + ut->uu_flag |= UT_NSPACE_NODATALESSFAULTS; + } else if (policy == IOPOL_MATERIALIZE_DATALESS_FILES_ON) { + ut->uu_flag &= ~UT_NSPACE_NODATALESSFAULTS; + ut->uu_flag |= UT_NSPACE_FORCEDATALESSFAULTS; + } else { + ut->uu_flag &= ~(UT_NSPACE_NODATALESSFAULTS | UT_NSPACE_FORCEDATALESSFAULTS); + } +} + +static inline void +set_proc_materialize_policy(struct proc *p, int policy) +{ + if (policy == IOPOL_MATERIALIZE_DATALESS_FILES_DEFAULT) { + /* + * Caller has specified "use the default policy". + * The default policy is to NOT materialize dataless + * files. + */ + policy = IOPOL_MATERIALIZE_DATALESS_FILES_OFF; + } + if (policy == IOPOL_MATERIALIZE_DATALESS_FILES_ON) { + OSBitOrAtomic16((uint16_t)P_VFS_IOPOLICY_MATERIALIZE_DATALESS_FILES, &p->p_vfs_iopolicy); + } else { + OSBitAndAtomic16(~((uint16_t)P_VFS_IOPOLICY_MATERIALIZE_DATALESS_FILES), &p->p_vfs_iopolicy); + } +} + +static int +get_proc_materialize_policy(struct proc *p) +{ + return (p->p_vfs_iopolicy & P_VFS_IOPOLICY_MATERIALIZE_DATALESS_FILES) ? IOPOL_MATERIALIZE_DATALESS_FILES_ON : IOPOL_MATERIALIZE_DATALESS_FILES_OFF; +} + +static int +iopolicysys_vfs_materialize_dataless_files(struct proc *p __unused, int cmd, int scope, int policy, struct _iopol_param_t *iop_param) +{ + int error = 0; + thread_t thread; + + /* Validate scope */ + switch (scope) { + case IOPOL_SCOPE_THREAD: + thread = current_thread(); + break; + case IOPOL_SCOPE_PROCESS: + thread = THREAD_NULL; + break; + default: + error = EINVAL; + goto out; + } + + /* Validate policy */ + if (cmd == IOPOL_CMD_SET) { + switch (policy) { + case IOPOL_MATERIALIZE_DATALESS_FILES_DEFAULT: + case IOPOL_MATERIALIZE_DATALESS_FILES_OFF: + case IOPOL_MATERIALIZE_DATALESS_FILES_ON: + break; + default: + error = EINVAL; + goto out; + } + } + + /* Perform command */ + switch (cmd) { + case IOPOL_CMD_SET: + if (thread != THREAD_NULL) { + set_thread_materialize_policy(get_bsdthread_info(thread), policy); + } else { + set_proc_materialize_policy(p, policy); + } + break; + case IOPOL_CMD_GET: + if (thread != THREAD_NULL) { + policy = get_thread_materialize_policy(get_bsdthread_info(thread)); + } else { + policy = get_proc_materialize_policy(p); + } + iop_param->iop_policy = policy; + break; + default: + error = EINVAL; /* unknown command */ + break; + } + +out: + return error; +} + +static int +iopolicysys_vfs_statfs_no_data_volume(struct proc *p __unused, int cmd, + int scope, int policy, struct _iopol_param_t *iop_param) +{ + int error = 0; + + /* Validate scope */ + switch (scope) { + case IOPOL_SCOPE_PROCESS: + /* Only process OK */ + break; + default: + error = EINVAL; + goto out; + } + + /* Validate policy */ + if (cmd == IOPOL_CMD_SET) { + switch (policy) { + case IOPOL_VFS_STATFS_NO_DATA_VOLUME_DEFAULT: + /* fall-through */ + case IOPOL_VFS_STATFS_FORCE_NO_DATA_VOLUME: + /* These policies are OK */ + break; + default: + error = EINVAL; + goto out; + } + } + + /* Perform command */ + switch (cmd) { + case IOPOL_CMD_SET: + if (0 == kauth_cred_issuser(kauth_cred_get())) { + /* If it's a non-root process, it needs to have the entitlement to set the policy */ + boolean_t entitled = FALSE; + entitled = IOTaskHasEntitlement(current_task(), "com.apple.private.iopol.case_sensitivity"); + if (!entitled) { + error = EPERM; + goto out; + } + } + + switch (policy) { + case IOPOL_VFS_STATFS_NO_DATA_VOLUME_DEFAULT: + OSBitAndAtomic16(~((uint32_t)P_VFS_IOPOLICY_STATFS_NO_DATA_VOLUME), &p->p_vfs_iopolicy); + break; + case IOPOL_VFS_STATFS_FORCE_NO_DATA_VOLUME: + OSBitOrAtomic16((uint32_t)P_VFS_IOPOLICY_STATFS_NO_DATA_VOLUME, &p->p_vfs_iopolicy); + break; + default: + error = EINVAL; + goto out; + } + + break; + case IOPOL_CMD_GET: + iop_param->iop_policy = (p->p_vfs_iopolicy & P_VFS_IOPOLICY_STATFS_NO_DATA_VOLUME) + ? IOPOL_VFS_STATFS_FORCE_NO_DATA_VOLUME + : IOPOL_VFS_STATFS_NO_DATA_VOLUME_DEFAULT; + break; + default: + error = EINVAL; /* unknown command */ + break; + } + +out: + return error; +} + /* BSD call back function for task_policy networking changes */ void proc_apply_task_networkbg(void * bsd_info, thread_t thread) @@ -1850,7 +2057,7 @@ gather_rusage_info(proc_t p, rusage_info_current *ru, int flavor) memset(ru, 0, sizeof(*ru)); switch (flavor) { case RUSAGE_INFO_V4: - ru->ri_logical_writes = get_task_logical_writes(p->task); + ru->ri_logical_writes = get_task_logical_writes(p->task, FALSE); ru->ri_lifetime_max_phys_footprint = get_task_phys_footprint_lifetime_max(p->task); #if CONFIG_LEDGER_INTERVAL_MAX ru->ri_interval_max_phys_footprint = get_task_phys_footprint_interval_max(p->task, FALSE); diff --git a/bsd/kern/kern_shutdown.c b/bsd/kern/kern_shutdown.c index c9b334c8d..1e0027f95 100644 --- a/bsd/kern/kern_shutdown.c +++ b/bsd/kern/kern_shutdown.c @@ -145,6 +145,7 @@ get_system_inshutdown() return system_inshutdown; } +__abortlike static void panic_kernel(int howto, char *message) { @@ -154,6 +155,11 @@ panic_kernel(int howto, char *message) panic("userspace panic: %s", message); } +extern boolean_t compressor_store_stop_compaction; +extern lck_mtx_t vm_swap_data_lock; +extern int vm_swapfile_create_thread_running; +extern int vm_swapfile_gc_thread_running; + int reboot_kernel(int howto, char *message) { @@ -170,6 +176,25 @@ reboot_kernel(int howto, char *message) } return EBUSY; } + + lck_mtx_lock(&vm_swap_data_lock); + + /* Turn OFF future swapfile reclaimation / compaction etc.*/ + compressor_store_stop_compaction = TRUE; + + /* wait for any current swapfile work to end */ + while (vm_swapfile_create_thread_running || vm_swapfile_gc_thread_running) { + assert_wait((event_t)&compressor_store_stop_compaction, THREAD_UNINT); + + lck_mtx_unlock(&vm_swap_data_lock); + + thread_block(THREAD_CONTINUE_NULL); + + lck_mtx_lock(&vm_swap_data_lock); + } + + lck_mtx_unlock(&vm_swap_data_lock); + /* * Notify the power management root domain that the system will shut down. */ @@ -263,9 +288,6 @@ force_reboot: panic_kernel(howto, message); } - if (howto & RB_POWERDOWN) { - hostboot_option = HOST_REBOOT_HALT; - } if (howto & RB_HALT) { hostboot_option = HOST_REBOOT_HALT; } diff --git a/bsd/kern/kern_sig.c b/bsd/kern/kern_sig.c index 254f60066..dc26af6f3 100644 --- a/bsd/kern/kern_sig.c +++ b/bsd/kern/kern_sig.c @@ -127,7 +127,6 @@ * +++ */ extern int thread_enable_fpe(thread_t act, int onoff); -extern thread_t port_name_to_thread(mach_port_name_t port_name); extern kern_return_t get_signalact(task_t, thread_t *, int); extern unsigned int get_useraddr(void); extern boolean_t task_did_exec(task_t task); @@ -154,11 +153,11 @@ kern_return_t semaphore_timedwait_trap_internal(mach_port_name_t, unsigned int, kern_return_t semaphore_wait_signal_trap_internal(mach_port_name_t, mach_port_name_t, void (*)(kern_return_t)); kern_return_t semaphore_wait_trap_internal(mach_port_name_t, void (*)(kern_return_t)); -static int filt_sigattach(struct knote *kn, struct kevent_internal_s *kev); +static int filt_sigattach(struct knote *kn, struct kevent_qos_s *kev); static void filt_sigdetach(struct knote *kn); static int filt_signal(struct knote *kn, long hint); -static int filt_signaltouch(struct knote *kn, struct kevent_internal_s *kev); -static int filt_signalprocess(struct knote *kn, struct filt_process_s *data, struct kevent_internal_s *kev); +static int filt_signaltouch(struct knote *kn, struct kevent_qos_s *kev); +static int filt_signalprocess(struct knote *kn, struct kevent_qos_s *kev); SECURITY_READ_ONLY_EARLY(struct filterops) sig_filtops = { .f_attach = filt_sigattach, @@ -996,7 +995,8 @@ __pthread_markcancel(__unused proc_t p, int error = 0; struct uthread *uth; - target_act = (thread_act_t)port_name_to_thread(uap->thread_port); + target_act = (thread_act_t)port_name_to_thread(uap->thread_port, + PORT_TO_THREAD_IN_CURRENT_TASK); if (target_act == THR_ACT_NULL) { return ESRCH; @@ -1264,7 +1264,8 @@ __pthread_kill(__unused proc_t p, struct __pthread_kill_args *uap, int signum = uap->sig; struct uthread *uth; - target_act = (thread_t)port_name_to_thread(uap->thread_port); + target_act = (thread_t)port_name_to_thread(uap->thread_port, + PORT_TO_THREAD_NONE); if (target_act == THREAD_NULL) { return ESRCH; @@ -1281,6 +1282,11 @@ __pthread_kill(__unused proc_t p, struct __pthread_kill_args *uap, goto out; } + if ((thread_get_tag(target_act) & THREAD_TAG_WORKQUEUE) && !uth->uu_workq_pthread_kill_allowed) { + error = ENOTSUP; + goto out; + } + if (signum) { psignal_uthread(target_act, signum); } @@ -2048,6 +2054,7 @@ get_signalthread(proc_t p, int signum, thread_t * thr) thread_t sig_thread; struct task * sig_task = p->task; kern_return_t kret; + bool skip_wqthreads = true; *thr = THREAD_NULL; @@ -2062,15 +2069,25 @@ get_signalthread(proc_t p, int signum, thread_t * thr) } } +again: TAILQ_FOREACH(uth, &p->p_uthlist, uu_list) { if (((uth->uu_flag & UT_NO_SIGMASK) == 0) && (((uth->uu_sigmask & mask) == 0) || (uth->uu_sigwait & mask))) { - if (check_actforsig(p->task, uth->uu_context.vc_thread, 1) == KERN_SUCCESS) { - *thr = uth->uu_context.vc_thread; + thread_t th = uth->uu_context.vc_thread; + if (skip_wqthreads && (thread_get_tag(th) & THREAD_TAG_WORKQUEUE)) { + /* Workqueue threads may be parked in the kernel unable to + * deliver signals for an extended period of time, so skip them + * in favor of pthreads in a first pass. (rdar://50054475). */ + } else if (check_actforsig(p->task, th, 1) == KERN_SUCCESS) { + *thr = th; return KERN_SUCCESS; } } } + if (skip_wqthreads) { + skip_wqthreads = false; + goto again; + } if (get_signalact(p->task, thr, 1) == KERN_SUCCESS) { return KERN_SUCCESS; } @@ -2689,6 +2706,12 @@ psignal_with_reason(proc_t p, int signum, struct os_reason *signal_reason) psignal_internal(p, NULL, NULL, 0, signum, signal_reason); } +void +psignal_sigkill_with_reason(proc_t p, struct os_reason *signal_reason) +{ + psignal_internal(p, NULL, NULL, 0, SIGKILL, signal_reason); +} + void psignal_locked(proc_t p, int signum) { @@ -3269,6 +3292,7 @@ postsig_locked(int signum) if ((ps->ps_signodefer & mask) == 0) { ut->uu_sigmask |= mask; } + sigset_t siginfo = ps->ps_siginfo; if ((signum != SIGILL) && (signum != SIGTRAP) && (ps->ps_sigreset & mask)) { if ((signum != SIGCONT) && (sigprop[signum] & SA_IGNORE)) { p->p_sigignore |= mask; @@ -3285,7 +3309,7 @@ postsig_locked(int signum) ps->ps_code = 0; } OSIncrementAtomicLong(&p->p_stats->p_ru.ru_nsignals); - sendsig(p, catcher, signum, returnmask, code); + sendsig(p, catcher, signum, returnmask, code, siginfo); } proc_signalend(p, 1); } @@ -3299,13 +3323,15 @@ postsig_locked(int signum) */ static int -filt_sigattach(struct knote *kn, __unused struct kevent_internal_s *kev) +filt_sigattach(struct knote *kn, __unused struct kevent_qos_s *kev) { proc_t p = current_proc(); /* can attach only to oneself */ proc_klist_lock(); - kn->kn_ptr.p_proc = p; + kn->kn_proc = p; + kn->kn_flags |= EV_CLEAR; /* automatically set */ + kn->kn_sdata = 0; /* incoming data is ignored */ KNOTE_ATTACH(&p->p_klist, kn); @@ -3323,10 +3349,10 @@ filt_sigattach(struct knote *kn, __unused struct kevent_internal_s *kev) static void filt_sigdetach(struct knote *kn) { - proc_t p = kn->kn_ptr.p_proc; + proc_t p = kn->kn_proc; proc_klist_lock(); - kn->kn_ptr.p_proc = NULL; + kn->kn_proc = NULL; KNOTE_DETACH(&p->p_klist, kn); proc_klist_unlock(); } @@ -3347,19 +3373,17 @@ filt_signal(struct knote *kn, long hint) hint &= ~NOTE_SIGNAL; if (kn->kn_id == (unsigned int)hint) { - kn->kn_data++; + kn->kn_hook32++; } } else if (hint & NOTE_EXIT) { panic("filt_signal: detected NOTE_EXIT event"); } - return kn->kn_data != 0; + return kn->kn_hook32 != 0; } static int -filt_signaltouch( - struct knote *kn, - struct kevent_internal_s *kev) +filt_signaltouch(struct knote *kn, struct kevent_qos_s *kev) { #pragma unused(kev) @@ -3370,7 +3394,7 @@ filt_signaltouch( /* * No data to save - just capture if it is already fired */ - res = (kn->kn_data > 0); + res = (kn->kn_hook32 > 0); proc_klist_unlock(); @@ -3378,29 +3402,22 @@ filt_signaltouch( } static int -filt_signalprocess( - struct knote *kn, - __unused struct filt_process_s *data, - struct kevent_internal_s *kev) +filt_signalprocess(struct knote *kn, struct kevent_qos_s *kev) { - proc_klist_lock(); - - if (kn->kn_data == 0) { - proc_klist_unlock(); - return 0; - } + int res = 0; /* * Snapshot the event data. - * All signal events are EV_CLEAR, so - * add that and clear out the data field. */ - *kev = kn->kn_kevent; - kev->flags |= EV_CLEAR; - kn->kn_data = 0; + proc_klist_lock(); + if (kn->kn_hook32) { + knote_fill_kevent(kn, kev, kn->kn_hook32); + kn->kn_hook32 = 0; + res = 1; + } proc_klist_unlock(); - return 1; + return res; } void @@ -3409,7 +3426,6 @@ bsd_ast(thread_t thread) proc_t p = current_proc(); struct uthread *ut = get_bsdthread_info(thread); int signum; - user_addr_t pc; static int bsd_init_done = 0; if (p == NULL) { @@ -3421,12 +3437,6 @@ bsd_ast(thread_t thread) return; } - if ((p->p_flag & P_OWEUPC) && (p->p_flag & P_PROFIL)) { - pc = get_useraddr(); - addupc_task(p, pc, 1); - OSBitAndAtomic(~((uint32_t)P_OWEUPC), &p->p_flag); - } - if (timerisset(&p->p_vtimer_user.it_value)) { uint32_t microsecs; diff --git a/bsd/kern/kern_subr.c b/bsd/kern/kern_subr.c index 62c599072..9988a3a3d 100644 --- a/bsd/kern/kern_subr.c +++ b/bsd/kern/kern_subr.c @@ -66,6 +66,8 @@ * @(#)kern_subr.c 8.3 (Berkeley) 1/21/94 */ +#include + #include #include #include @@ -601,7 +603,7 @@ uio_create( int a_iovcount, /* number of iovecs */ /* leave a note that we allocated this uio_t */ my_uio->uio_flags |= UIO_FLAGS_WE_ALLOCED; #if DEBUG - (void)hw_atomic_add(&uio_t_count, 1); + os_atomic_inc(&uio_t_count, relaxed); #endif } @@ -826,7 +828,7 @@ uio_free( uio_t a_uio ) if (a_uio != NULL && (a_uio->uio_flags & UIO_FLAGS_WE_ALLOCED) != 0) { #if DEBUG - if (hw_atomic_sub(&uio_t_count, 1) == UINT_MAX) { + if (os_atomic_dec_orig(&uio_t_count, relaxed) == 0) { panic("%s :%d - uio_t_count underflow\n", __FILE__, __LINE__); } #endif @@ -843,12 +845,20 @@ uio_free( uio_t a_uio ) int uio_addiov( uio_t a_uio, user_addr_t a_baseaddr, user_size_t a_length ) { - int i; + int i; + user_size_t resid; if (a_uio == NULL) { #if DEBUG panic("%s :%d - invalid uio_t\n", __FILE__, __LINE__); -#endif /* LP64_DEBUG */ +#endif + return -1; + } + + if (os_add_overflow(a_length, a_uio->uio_resid_64, &resid)) { +#if DEBUG + panic("%s :%d - invalid length %lu\n", __FILE__, __LINE__, (unsigned long)a_length); +#endif return -1; } @@ -858,7 +868,7 @@ uio_addiov( uio_t a_uio, user_addr_t a_baseaddr, user_size_t a_length ) a_uio->uio_iovs.uiovp[i].iov_len = a_length; a_uio->uio_iovs.uiovp[i].iov_base = a_baseaddr; a_uio->uio_iovcnt++; - a_uio->uio_resid_64 += a_length; + a_uio->uio_resid_64 = resid; return 0; } } @@ -868,7 +878,7 @@ uio_addiov( uio_t a_uio, user_addr_t a_baseaddr, user_size_t a_length ) a_uio->uio_iovs.kiovp[i].iov_len = (u_int64_t)a_length; a_uio->uio_iovs.kiovp[i].iov_base = (u_int64_t)a_baseaddr; a_uio->uio_iovcnt++; - a_uio->uio_resid_64 += a_length; + a_uio->uio_resid_64 = resid; return 0; } } @@ -1161,7 +1171,7 @@ uio_duplicate( uio_t a_uio ) my_uio->uio_flags = UIO_FLAGS_WE_ALLOCED | UIO_FLAGS_INITED; #if DEBUG - (void)hw_atomic_add(&uio_t_count, 1); + os_atomic_inc(&uio_t_count, relaxed); #endif diff --git a/bsd/kern/kern_symfile.c b/bsd/kern/kern_symfile.c index 2ec92f29a..94b6a8975 100644 --- a/bsd/kern/kern_symfile.c +++ b/bsd/kern/kern_symfile.c @@ -62,8 +62,9 @@ #include #include -#define HIBERNATE_MIN_PHYSICAL_LBA (34) -#define HIBERNATE_MIN_FILE_SIZE (1024*1024) +#define HIBERNATE_MIN_PHYSICAL_LBA_512 (34) +#define HIBERNATE_MIN_PHYSICAL_LBA_4096 (6) +#define HIBERNATE_MIN_FILE_SIZE (1024*1024) /* This function is called from kern_sysctl in the current process context; * it is exported with the System6.0.exports, but this appears to be a legacy @@ -379,7 +380,11 @@ kern_open_file_for_direct_io(const char * name, goto out; } - minoffset = HIBERNATE_MIN_PHYSICAL_LBA * ref->blksize; + if (ref->blksize == 4096) { + minoffset = HIBERNATE_MIN_PHYSICAL_LBA_4096 * ref->blksize; + } else { + minoffset = HIBERNATE_MIN_PHYSICAL_LBA_512 * ref->blksize; + } if (ref->vp->v_type != VREG) { error = do_ioctl(p1, p2, DKIOCGETBLOCKCOUNT, (caddr_t) &fileblk); diff --git a/bsd/kern/kern_sysctl.c b/bsd/kern/kern_sysctl.c index 566ab2606..ff6629238 100644 --- a/bsd/kern/kern_sysctl.c +++ b/bsd/kern/kern_sysctl.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2011 Apple Inc. All rights reserved. + * Copyright (c) 2000-2019 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -131,6 +131,7 @@ #include #include +#include #include #include @@ -196,6 +197,10 @@ extern uint32_t vm_page_creation_throttled_hard; extern uint32_t vm_page_creation_throttled_soft; #endif /* DEVELOPMENT || DEBUG */ +#if CONFIG_LOCKERBOOT +extern const char kernel_protoboot_mount[]; +#endif + /* * Conditionally allow dtrace to see these functions for debugging purposes. */ @@ -232,9 +237,6 @@ extern int netboot_root(void); #endif int -pcsamples_ops(int *name, u_int namelen, user_addr_t where, size_t *sizep, - proc_t p); -int sysctl_procargs(int *name, u_int namelen, user_addr_t where, size_t *sizep, proc_t cur_proc); STATIC int @@ -402,14 +404,19 @@ sysctl_handle_kern_threadname( __unused struct sysctl_oid *oidp, __unused void return ENAMETOOLONG; } if (!ut->pth_name) { - ut->pth_name = (char*)kalloc( MAXTHREADNAMESIZE ); - if (!ut->pth_name) { + char *tmp_pth_name = (char *)kalloc(MAXTHREADNAMESIZE); + if (!tmp_pth_name) { return ENOMEM; } + bzero(tmp_pth_name, MAXTHREADNAMESIZE); + if (!OSCompareAndSwapPtr(NULL, tmp_pth_name, &ut->pth_name)) { + kfree(tmp_pth_name, MAXTHREADNAMESIZE); + return EBUSY; + } } else { kernel_debug_string_simple(TRACE_STRING_THREADNAME_PREV, ut->pth_name); + bzero(ut->pth_name, MAXTHREADNAMESIZE); } - bzero(ut->pth_name, MAXTHREADNAMESIZE); error = copyin(newp, ut->pth_name, newlen); if (error) { return error; @@ -502,7 +509,7 @@ extern int get_kernel_symfile(proc_t, char **); #if COUNT_SYSCALLS #define KERN_COUNT_SYSCALLS (KERN_OSTYPE + 1000) -extern unsigned int nsysent; +extern const unsigned int nsysent; extern int syscalls_log[]; extern const char *syscallnames[]; @@ -790,7 +797,6 @@ sysctl_prochandle SYSCTL_HANDLER_ARGS int uidcheck = 0; int ruidcheck = 0; int ttycheck = 0; - int success = 0; if (namelen != 1 && !(namelen == 0 && cmd == KERN_PROC_ALL)) { return EINVAL; @@ -848,18 +854,9 @@ sysctl_prochandle SYSCTL_HANDLER_ARGS args.uidval = name[0]; } - success = proc_iterate((PROC_ALLPROCLIST | PROC_ZOMBPROCLIST), + proc_iterate((PROC_ALLPROCLIST | PROC_ZOMBPROCLIST), sysdoproc_callback, &args, filterfn, name); - /* - * rdar://problem/28433391: if we can't iterate over the processes, - * make sure to return an error. - */ - - if (success != 0) { - return ENOMEM; - } - if (error) { return error; } @@ -1732,6 +1729,33 @@ SYSCTL_STRING(_kern, OID_AUTO, osbuildconfig, CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LOCKED | CTLFLAG_MASKED, &osbuild_config[0], 0, ""); +STATIC int +sysctl_protoboot(__unused struct sysctl_oid *oidp, + __unused void *arg1, __unused int arg2, struct sysctl_req *req) +{ + int error = -1; +#if CONFIG_LOCKERBOOT + char protoboot_buff[24]; + size_t protoboot_len = sizeof(protoboot_buff); + + if (vnode_tag(rootvnode) == VT_LOCKERFS) { + strlcpy(protoboot_buff, kernel_protoboot_mount, protoboot_len); + error = sysctl_io_string(req, protoboot_buff, protoboot_len, 0, NULL); + } else { + error = EFTYPE; + } + +#else +#pragma unused(req) + error = ENOTSUP; +#endif + + return error; +} + +SYSCTL_PROC(_kern, OID_AUTO, protoboot, + CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_LOCKED, + 0, 0, sysctl_protoboot, "A", ""); #if DEBUG #ifndef DKPR @@ -1827,6 +1851,28 @@ SYSCTL_PROC(_kern, OID_AUTO, osproductversion, osproductversion, sizeof(osproductversion), sysctl_osproductversion, "A", "The ProductVersion from SystemVersion.plist"); +static uint64_t iossupportversion_string[48]; + +STATIC int +sysctl_iossupportversion(__unused struct sysctl_oid *oidp, void *arg1, int arg2, struct sysctl_req *req) +{ + if (req->newptr != 0) { + /* + * Can only ever be set by launchd, and only once at boot. + */ + if (req->p->p_pid != 1 || iossupportversion_string[0] != '\0') { + return EPERM; + } + } + + return sysctl_handle_string(oidp, arg1, arg2, req); +} + +SYSCTL_PROC(_kern, OID_AUTO, iossupportversion, + CTLFLAG_RW | CTLFLAG_KERN | CTLTYPE_STRING | CTLFLAG_LOCKED, + iossupportversion_string, sizeof(iossupportversion_string), + sysctl_iossupportversion, "A", "The iOSSupportVersion from SystemVersion.plist"); + static uint64_t osvariant_status = 0; STATIC int @@ -1849,6 +1895,32 @@ SYSCTL_PROC(_kern, OID_AUTO, osvariant_status, &osvariant_status, sizeof(osvariant_status), sysctl_osvariant_status, "Q", "Opaque flags used to cache OS variant information"); +extern void commpage_update_dyld_flags(uint64_t); +static uint64_t dyld_system_flags = 0; + +STATIC int +sysctl_dyld_system_flags(__unused struct sysctl_oid *oidp, void *arg1, int arg2, struct sysctl_req *req) +{ + /* + * Can only ever be set by launchd, possibly several times + * as dyld may change its mind after a userspace reboot. + */ + if (req->newptr != 0 && req->p->p_pid != 1) { + return EPERM; + } + + int res = sysctl_handle_quad(oidp, arg1, arg2, req); + if (req->newptr && res == 0) { + commpage_update_dyld_flags(osvariant_status); + } + return res; +} + +SYSCTL_PROC(_kern, OID_AUTO, dyld_system_flags, + CTLFLAG_RW | CTLTYPE_QUAD | CTLFLAG_LOCKED | CTLFLAG_MASKED, + &dyld_system_flags, sizeof(dyld_system_flags), + sysctl_dyld_system_flags, "Q", "Opaque flags used to cache dyld system-wide configuration"); + #if defined(XNU_TARGET_OS_BRIDGE) char macosproductversion[MACOS_VERS_LEN] = { '\0' }; @@ -1868,16 +1940,10 @@ sysctl_sysctl_bootargs (__unused struct sysctl_oid *oidp, __unused void *arg1, __unused int arg2, struct sysctl_req *req) { int error; - /* BOOT_LINE_LENGTH */ -#if CONFIG_EMBEDDED - size_t boot_args_len = 256; -#else - size_t boot_args_len = 1024; -#endif - char buf[boot_args_len]; + char buf[BOOT_LINE_LENGTH]; - strlcpy(buf, PE_boot_args(), boot_args_len); - error = sysctl_io_string(req, buf, boot_args_len, 0, NULL); + strlcpy(buf, PE_boot_args(), BOOT_LINE_LENGTH); + error = sysctl_io_string(req, buf, BOOT_LINE_LENGTH, 0, NULL); return error; } @@ -2067,6 +2133,14 @@ SYSCTL_PROC(_kern_perfcontrol_callout, OID_AUTO, update_cycles, sysctl_perfcontrol_callout_stat, "I", ""); #endif /* __arm__ || __arm64__ */ + +#if __arm64__ +extern int legacy_footprint_entitlement_mode; +SYSCTL_INT(_kern, OID_AUTO, legacy_footprint_entitlement_mode, + CTLFLAG_KERN | CTLFLAG_RD | CTLFLAG_LOCKED, + &legacy_footprint_entitlement_mode, 0, ""); +#endif /* __arm64__ */ + #endif /* (DEVELOPMENT || DEBUG) */ STATIC int @@ -2097,9 +2171,17 @@ sysctl_domainname (__unused struct sysctl_oid *oidp, __unused void *arg1, __unused int arg2, struct sysctl_req *req) { int error, changed; - error = sysctl_io_string(req, domainname, sizeof(domainname), 0, &changed); - if (changed) { - domainnamelen = strlen(domainname); + char tmpname[MAXHOSTNAMELEN] = {}; + + lck_mtx_lock(&domainname_lock); + strlcpy(tmpname, domainname, sizeof(tmpname)); + lck_mtx_unlock(&domainname_lock); + + error = sysctl_io_string(req, tmpname, sizeof(tmpname), 0, &changed); + if (!error && changed) { + lck_mtx_lock(&hostname_lock); + strlcpy(domainname, tmpname, sizeof(domainname)); + lck_mtx_unlock(&hostname_lock); } return error; } @@ -2117,14 +2199,21 @@ sysctl_hostname (__unused struct sysctl_oid *oidp, __unused void *arg1, __unused int arg2, struct sysctl_req *req) { int error, changed; - error = sysctl_io_string(req, hostname, sizeof(hostname), 1, &changed); - if (changed) { - hostnamelen = req->newlen; + char tmpname[MAXHOSTNAMELEN] = {}; + + lck_mtx_lock(&hostname_lock); + strlcpy(tmpname, hostname, sizeof(tmpname)); + lck_mtx_unlock(&hostname_lock); + + error = sysctl_io_string(req, tmpname, sizeof(tmpname), 1, &changed); + if (!error && changed) { + lck_mtx_lock(&hostname_lock); + strlcpy(hostname, tmpname, sizeof(hostname)); + lck_mtx_unlock(&hostname_lock); } return error; } - SYSCTL_PROC(_kern, KERN_HOSTNAME, hostname, CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_LOCKED, 0, 0, sysctl_hostname, "A", ""); @@ -2571,7 +2660,7 @@ sysctl_rage_vnode error = sysctl_io_number(req, old_value, sizeof(int), &new_value, &changed); - if (error == 0) { + if ((error == 0) && (changed != 0)) { switch (new_value) { case KERN_RAGE_PROC: proc_lock(p); @@ -2600,6 +2689,21 @@ SYSCTL_PROC(_kern, KERN_RAGEVNODE, rage_vnode, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED, 0, 0, sysctl_rage_vnode, "I", ""); +/* XXX until filecoordinationd fixes a bit of inverted logic. */ +STATIC int +sysctl_vfsnspace +(__unused struct sysctl_oid *oidp, __unused void *arg1, __unused int arg2, struct sysctl_req *req) +{ + int old_value = 0, new_value, changed; + + return sysctl_io_number(req, old_value, sizeof(int), &new_value, + &changed); +} + +SYSCTL_PROC(_kern, OID_AUTO, vfsnspace, + CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED, + 0, 0, sysctl_vfsnspace, "I", ""); + /* XXX move this interface into libproc and remove this sysctl */ STATIC int sysctl_setthread_cpupercent @@ -2658,7 +2762,7 @@ sysctl_kern_check_openevt error = sysctl_io_number(req, old_value, sizeof(int), &new_value, &changed); - if (error == 0) { + if ((error == 0) && (changed != 0)) { switch (new_value) { case KERN_OPENEVT_PROC: OSBitOrAtomic(P_CHECKOPENEVT, &p->p_flag); @@ -3675,6 +3779,7 @@ extern uint32_t vm_pageout_memorystatus_fb_factor_dr; SYSCTL_INT(_vm, OID_AUTO, vm_pageout_memorystatus_fb_factor_nr, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_pageout_memorystatus_fb_factor_nr, 0, ""); SYSCTL_INT(_vm, OID_AUTO, vm_pageout_memorystatus_fb_factor_dr, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_pageout_memorystatus_fb_factor_dr, 0, ""); +extern uint32_t vm_grab_anon_nops; SYSCTL_INT(_vm, OID_AUTO, vm_grab_anon_overrides, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_pageout_debug.vm_grab_anon_overrides, 0, ""); SYSCTL_INT(_vm, OID_AUTO, vm_grab_anon_nops, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_pageout_debug.vm_grab_anon_nops, 0, ""); @@ -3756,12 +3861,14 @@ sysctl_cpu_quiescent_counter_interval SYSCTL_HANDLER_ARGS { #pragma unused(arg1, arg2) - int error = sysctl_handle_int(oidp, &cpu_checkin_min_interval_us, 0, req); + uint32_t local_min_interval_us = cpu_quiescent_counter_get_min_interval_us(); + + int error = sysctl_handle_int(oidp, &local_min_interval_us, 0, req); if (error || !req->newptr) { return error; } - cpu_quiescent_counter_set_min_interval_us(cpu_checkin_min_interval_us); + cpu_quiescent_counter_set_min_interval_us(local_min_interval_us); return 0; } @@ -4130,7 +4237,7 @@ sysctl_debugger_test SYSCTL_HANDLER_ARGS return rval; } -decl_lck_spin_data(, spinlock_panic_test_lock) +decl_lck_spin_data(, spinlock_panic_test_lock); __attribute__((noreturn)) static void @@ -4258,7 +4365,7 @@ sysctl_grade_cputype SYSCTL_HANDLER_ARGS return error; } - return_value = grade_binary(type_tuple[0], type_tuple[1]); + return_value = grade_binary(type_tuple[0], type_tuple[1], FALSE); error = SYSCTL_OUT(req, &return_value, sizeof(return_value)); @@ -4295,11 +4402,9 @@ unwedge_thread SYSCTL_HANDLER_ARGS SYSCTL_PROC(_kern, OID_AUTO, unwedge_thread, CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED, 0, 0, unwedge_thread, "I", "unwedge the thread wedged by kern.wedge_thread"); -extern uintptr_t phys_carveout_pa; SYSCTL_LONG(_kern, OID_AUTO, phys_carveout_pa, CTLFLAG_RD | CTLFLAG_LOCKED, &phys_carveout_pa, "base physical address of the phys_carveout_mb boot-arg region"); -extern size_t phys_carveout_size; SYSCTL_LONG(_kern, OID_AUTO, phys_carveout_size, CTLFLAG_RD | CTLFLAG_LOCKED, &phys_carveout_size, "size in bytes of the phys_carveout_mb boot-arg region"); @@ -4337,9 +4442,6 @@ tstile_test_prim_lock(boolean_t use_hashtable); int tstile_test_prim_unlock(boolean_t use_hashtable); -#define SYSCTL_TURNSTILE_TEST_DEFAULT 1 -#define SYSCTL_TURNSTILE_TEST_GLOBAL_HASHTABLE 2 - static int sysctl_turnstile_test_prim_lock SYSCTL_HANDLER_ARGS { @@ -4349,8 +4451,15 @@ sysctl_turnstile_test_prim_lock SYSCTL_HANDLER_ARGS if (error || val == 0) { return error; } - boolean_t use_hashtable = (val == SYSCTL_TURNSTILE_TEST_GLOBAL_HASHTABLE) ? true : false; - return tstile_test_prim_lock(use_hashtable); + switch (val) { + case SYSCTL_TURNSTILE_TEST_USER_DEFAULT: + case SYSCTL_TURNSTILE_TEST_USER_HASHTABLE: + case SYSCTL_TURNSTILE_TEST_KERNEL_DEFAULT: + case SYSCTL_TURNSTILE_TEST_KERNEL_HASHTABLE: + return tstile_test_prim_lock(val); + default: + return error; + } } static int @@ -4362,8 +4471,15 @@ sysctl_turnstile_test_prim_unlock SYSCTL_HANDLER_ARGS if (error || val == 0) { return error; } - boolean_t use_hashtable = (val == SYSCTL_TURNSTILE_TEST_GLOBAL_HASHTABLE) ? true : false; - return tstile_test_prim_unlock(use_hashtable); + switch (val) { + case SYSCTL_TURNSTILE_TEST_USER_DEFAULT: + case SYSCTL_TURNSTILE_TEST_USER_HASHTABLE: + case SYSCTL_TURNSTILE_TEST_KERNEL_DEFAULT: + case SYSCTL_TURNSTILE_TEST_KERNEL_HASHTABLE: + return tstile_test_prim_unlock(val); + default: + return error; + } } SYSCTL_PROC(_kern, OID_AUTO, turnstiles_test_lock, CTLFLAG_WR | CTLFLAG_ANYBODY | CTLFLAG_KERN | CTLFLAG_LOCKED, @@ -4408,42 +4524,6 @@ SYSCTL_QUAD(_kern, OID_AUTO, thread_block_count_on_reg_waitq, CTLFLAG_RD | CTLFLAG_ANYBODY | CTLFLAG_KERN | CTLFLAG_LOCKED, &thread_block_on_regular_waitq_count, "thread blocked on regular waitq count"); -static int -sysctl_lck_mtx_test_lock SYSCTL_HANDLER_ARGS -{ -#pragma unused(arg1, arg2) - int error, val = 0; - error = sysctl_handle_int(oidp, &val, 0, req); - if (error || val == 0) { - return error; - } - - if (val == 1) { - lck_mtx_test_init(); - lck_mtx_test_lock(); - } - - return 0; -} - -static int -sysctl_lck_mtx_test_unlock SYSCTL_HANDLER_ARGS -{ -#pragma unused(arg1, arg2) - int error, val = 0; - error = sysctl_handle_int(oidp, &val, 0, req); - if (error || val == 0) { - return error; - } - - if (val == 1) { - lck_mtx_test_init(); - lck_mtx_test_unlock(); - } - - return 0; -} - static int sysctl_erase_all_test_mtx_stats SYSCTL_HANDLER_ARGS { @@ -4512,7 +4592,12 @@ sysctl_test_mtx_uncontended SYSCTL_HANDLER_ARGS } input_val[req->newlen] = '\0'; - sscanf(input_val, "%d", &iter); + iter = 0; + error = sscanf(input_val, "%d", &iter); + if (error != 1) { + printf("%s invalid input\n", __func__); + return EINVAL; + } if (iter <= 0) { printf("%s requested %d iterations, not starting the test\n", __func__, iter); @@ -4551,8 +4636,6 @@ sysctl_test_mtx_contended SYSCTL_HANDLER_ARGS int buffer_size, offset, error, iter; char input_val[40]; - printf("%s called\n", __func__); - if (!req->newptr) { return 0; } @@ -4571,7 +4654,12 @@ sysctl_test_mtx_contended SYSCTL_HANDLER_ARGS } input_val[req->newlen] = '\0'; - sscanf(input_val, "%d", &iter); + iter = 0; + error = sscanf(input_val, "%d", &iter); + if (error != 1) { + printf("%s invalid input\n", __func__); + return EINVAL; + } if (iter <= 0) { printf("%s requested %d iterations, not starting the test\n", __func__, iter); @@ -4582,7 +4670,7 @@ sysctl_test_mtx_contended SYSCTL_HANDLER_ARGS erase_all_test_mtx_stats(); - buffer_size = 1000; + buffer_size = 2000; offset = 0; buffer = kalloc(buffer_size); if (!buffer) { @@ -4590,29 +4678,34 @@ sysctl_test_mtx_contended SYSCTL_HANDLER_ARGS } memset(buffer, 0, buffer_size); - printf("%s starting contended mutex test with %d iterations\n", __func__, iter); + printf("%s starting contended mutex test with %d iterations FULL_CONTENDED\n", __func__, iter); offset = snprintf(buffer, buffer_size, "STATS INNER LOOP"); - offset += lck_mtx_test_mtx_contended(iter, &buffer[offset], buffer_size - offset); + offset += lck_mtx_test_mtx_contended(iter, &buffer[offset], buffer_size - offset, FULL_CONTENDED); + + printf("%s starting contended mutex loop test with %d iterations FULL_CONTENDED\n", __func__, iter); + + offset += snprintf(&buffer[offset], buffer_size - offset, "\nSTATS OUTER LOOP"); + offset += lck_mtx_test_mtx_contended_loop_time(iter, &buffer[offset], buffer_size - offset, FULL_CONTENDED); + + printf("%s starting contended mutex test with %d iterations HALF_CONTENDED\n", __func__, iter); + + offset += snprintf(&buffer[offset], buffer_size - offset, "STATS INNER LOOP"); + offset += lck_mtx_test_mtx_contended(iter, &buffer[offset], buffer_size - offset, HALF_CONTENDED); - printf("%s starting contended mutex loop test with %d iterations\n", __func__, iter); + printf("%s starting contended mutex loop test with %d iterations HALF_CONTENDED\n", __func__, iter); offset += snprintf(&buffer[offset], buffer_size - offset, "\nSTATS OUTER LOOP"); - offset += lck_mtx_test_mtx_contended_loop_time(iter, &buffer[offset], buffer_size - offset); + offset += lck_mtx_test_mtx_contended_loop_time(iter, &buffer[offset], buffer_size - offset, HALF_CONTENDED); error = SYSCTL_OUT(req, buffer, offset); + printf("\n%s\n", buffer); kfree(buffer, buffer_size); return error; } -SYSCTL_PROC(_kern, OID_AUTO, lck_mtx_test_lock, CTLFLAG_WR | CTLFLAG_MASKED | CTLFLAG_ANYBODY | CTLFLAG_KERN | CTLFLAG_LOCKED, - 0, 0, sysctl_lck_mtx_test_lock, "I", "lck mtx test lock"); - -SYSCTL_PROC(_kern, OID_AUTO, lck_mtx_test_unlock, CTLFLAG_WR | CTLFLAG_MASKED | CTLFLAG_ANYBODY | CTLFLAG_KERN | CTLFLAG_LOCKED, - 0, 0, sysctl_lck_mtx_test_unlock, "I", "lck mtx test unlock"); - SYSCTL_PROC(_kern, OID_AUTO, erase_all_test_mtx_stats, CTLFLAG_WR | CTLFLAG_MASKED | CTLFLAG_ANYBODY | CTLFLAG_KERN | CTLFLAG_LOCKED, 0, 0, sysctl_erase_all_test_mtx_stats, "I", "erase test_mtx statistics"); @@ -4706,4 +4799,78 @@ sysctl_test_panic_with_thread SYSCTL_HANDLER_ARGS SYSCTL_PROC(_kern, OID_AUTO, test_panic_with_thread, CTLFLAG_MASKED | CTLFLAG_KERN | CTLFLAG_LOCKED | CTLFLAG_WR | CTLTYPE_STRING, 0, 0, sysctl_test_panic_with_thread, "A", "test panic flow for backtracing a different thread"); #endif /* defined (__x86_64__) */ + #endif /* DEVELOPMENT || DEBUG */ + +static int +sysctl_get_owned_vmobjects SYSCTL_HANDLER_ARGS +{ +#pragma unused(oidp, arg1, arg2) + + /* validate */ + if (req->newlen != sizeof(mach_port_name_t) || req->newptr == USER_ADDR_NULL || + req->oldidx != 0 || req->newidx != 0 || req->p == NULL) { + return EINVAL; + } + + int error; + mach_port_name_t task_port_name; + task_t task; + int buffer_size = (req->oldptr != USER_ADDR_NULL) ? req->oldlen : 0; + vmobject_list_output_t buffer; + size_t output_size; + + if (buffer_size) { + const int min_size = sizeof(vm_object_query_data_t) + sizeof(int64_t); + + if (buffer_size < min_size) { + buffer_size = min_size; + } + + buffer = kalloc(buffer_size); + + if (!buffer) { + error = ENOMEM; + goto sysctl_get_vmobject_list_exit; + } + } else { + buffer = NULL; + } + + /* we have a "newptr" (for write) we get a task port name from the caller. */ + error = SYSCTL_IN(req, &task_port_name, sizeof(mach_port_name_t)); + + if (error != 0) { + goto sysctl_get_vmobject_list_exit; + } + + task = port_name_to_task(task_port_name); + if (task == TASK_NULL) { + error = ESRCH; + goto sysctl_get_vmobject_list_exit; + } + + /* copy the vmobjects and vmobject data out of the task */ + if (buffer_size == 0) { + int64_t __size; + task_copy_vmobjects(task, NULL, 0, &__size); + output_size = (__size > 0) ? __size * sizeof(vm_object_query_data_t) + sizeof(int64_t) : 0; + } else { + task_copy_vmobjects(task, &buffer->data[0], buffer_size - sizeof(int64_t), &buffer->entries); + output_size = buffer->entries * sizeof(vm_object_query_data_t) + sizeof(int64_t); + } + + task_deallocate(task); + + error = SYSCTL_OUT(req, (char*) buffer, output_size); + +sysctl_get_vmobject_list_exit: + if (buffer) { + kfree(buffer, buffer_size); + } + + return error; +} + +SYSCTL_PROC(_vm, OID_AUTO, get_owned_vmobjects, CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_WR | CTLFLAG_MASKED | CTLFLAG_KERN | CTLFLAG_LOCKED | CTLFLAG_ANYBODY, + 0, 0, sysctl_get_owned_vmobjects, "A", "get owned vmobjects in task"); diff --git a/bsd/kern/kern_xxx.c b/bsd/kern/kern_xxx.c index 386d06971..ee65deb6c 100644 --- a/bsd/kern/kern_xxx.c +++ b/bsd/kern/kern_xxx.c @@ -104,8 +104,10 @@ reboot(struct proc *p, struct reboot_args *uap, __unused int32_t *retval) if ((error = suser(kauth_cred_get(), &p->p_acflag))) { #if (DEVELOPMENT || DEBUG) - /* allow non-root user to call panic on dev/debug kernels */ - if (!(uap->opt & RB_PANIC)) { + if (uap->opt & RB_PANIC) { + /* clear 'error' to allow non-root users to call panic on dev/debug kernels */ + error = 0; + } else { return error; } #else @@ -113,10 +115,6 @@ reboot(struct proc *p, struct reboot_args *uap, __unused int32_t *retval) #endif } - if (uap->opt & RB_COMMAND) { - return ENOSYS; - } - if (uap->opt & RB_PANIC && uap->msg != USER_ADDR_NULL) { if (copyinstr(uap->msg, (void *)message, sizeof(message), (size_t *)&dummy)) { strncpy(message, "user space RB_PANIC message copyin failed", sizeof(message) - 1); diff --git a/bsd/kern/kpi_mbuf.c b/bsd/kern/kpi_mbuf.c index 7fb3d23d1..27d01a4a7 100644 --- a/bsd/kern/kpi_mbuf.c +++ b/bsd/kern/kpi_mbuf.c @@ -528,9 +528,13 @@ errno_t mbuf_copydata(const mbuf_t m0, size_t off, size_t len, void *out_data) { /* Copied m_copydata, added error handling (don't just panic) */ - int count; + size_t count; mbuf_t m = m0; + if (off >= INT_MAX || len >= INT_MAX) { + return EINVAL; + } + while (off > 0) { if (m == 0) { return EINVAL; @@ -2014,3 +2018,31 @@ m_do_tx_compl_callback(struct mbuf *m, struct ifnet *ifp) } #endif /* (DEBUG || DEVELOPMENT) */ } + +errno_t +mbuf_get_keepalive_flag(mbuf_t m, boolean_t *is_keepalive) +{ + if (m == NULL || is_keepalive == NULL || !(m->m_flags & M_PKTHDR)) { + return EINVAL; + } + + *is_keepalive = (m->m_pkthdr.pkt_flags & PKTF_KEEPALIVE); + + return 0; +} + +errno_t +mbuf_set_keepalive_flag(mbuf_t m, boolean_t is_keepalive) +{ + if (m == NULL || !(m->m_flags & M_PKTHDR)) { + return EINVAL; + } + + if (is_keepalive) { + m->m_pkthdr.pkt_flags |= PKTF_KEEPALIVE; + } else { + m->m_pkthdr.pkt_flags &= ~PKTF_KEEPALIVE; + } + + return 0; +} diff --git a/bsd/kern/kpi_socketfilter.c b/bsd/kern/kpi_socketfilter.c index 597045a88..4492cf15f 100644 --- a/bsd/kern/kpi_socketfilter.c +++ b/bsd/kern/kpi_socketfilter.c @@ -106,21 +106,40 @@ errno_t sflt_register(const struct sflt_filter *filter, int domain, __private_extern__ int sflt_permission_check(struct inpcb *inp) { - /* - * All these permissions only apply to the co-processor interface, - * so ignore IPv4. - */ - if (!(inp->inp_vflag & INP_IPV6)) { + /* Only IPv4 or IPv6 sockets can bypass filters */ + if (!(inp->inp_vflag & INP_IPV4) && + !(inp->inp_vflag & INP_IPV6)) { return 0; } /* Sockets that have this entitlement bypass socket filters. */ if (INP_INTCOPROC_ALLOWED(inp)) { return 1; } + /* Sockets bound to an intcoproc interface bypass socket filters. */ if ((inp->inp_flags & INP_BOUND_IF) && IFNET_IS_INTCOPROC(inp->inp_boundifp)) { return 1; } +#if NECP + /* + * Make sure that the NECP policy is populated. + * If result is not populated, the policy ID will be + * NECP_KERNEL_POLICY_ID_NONE. Note that if the result + * is populated, but there was no match, it will be + * NECP_KERNEL_POLICY_ID_NO_MATCH. + * Do not call inp_update_necp_policy() to avoid scoping + * a socket prior to calls to bind(). + */ + if (inp->inp_policyresult.policy_id == NECP_KERNEL_POLICY_ID_NONE) { + necp_socket_find_policy_match(inp, NULL, NULL, 0); + } + + /* If the filter unit is marked to be "no filter", bypass filters */ + if (inp->inp_policyresult.results.filter_control_unit == + NECP_FILTER_UNIT_NO_FILTER) { + return 1; + } +#endif /* NECP */ return 0; } diff --git a/bsd/kern/mach_fat.c b/bsd/kern/mach_fat.c index ffb26e9bf..ce9ab133d 100644 --- a/bsd/kern/mach_fat.c +++ b/bsd/kern/mach_fat.c @@ -50,6 +50,7 @@ * req_cpu_type: The required cpu type. * mask_bits: Bits to mask from the sub-image type when * grading it vs. the req_cpu_type +* imgp: Image params * archret (out): Pointer to fat_arch structure to hold * the results. * @@ -58,11 +59,12 @@ **********************************************************************/ static load_return_t fatfile_getarch( - vm_offset_t data_ptr, - vm_size_t data_size, - cpu_type_t req_cpu_type, - cpu_type_t mask_bits, - struct fat_arch *archret) + vm_offset_t data_ptr, + vm_size_t data_size, + cpu_type_t req_cpu_type, + cpu_type_t mask_bits, + struct image_params *imgp, + struct fat_arch *archret) { load_return_t lret; struct fat_arch *arch; @@ -106,7 +108,7 @@ fatfile_getarch( /* * Get the grade of the cpu subtype (without feature flags) */ - grade = grade_binary(testtype, testsubtype); + grade = grade_binary(testtype, testsubtype, TRUE); /* * Remember it if it's the best we've seen. @@ -117,6 +119,18 @@ fatfile_getarch( } } + /* On X86_64, allow 32 bit exec only for simulator binaries. + * Failing here without re-running the grading algorithm is safe because i386 + * has the lowest possible grade value (so there can't be a lower best grade + * that would be allowed if this check denied the i386 slice). */ + if (best_arch != NULL && + validate_potential_simulator_binary(OSSwapBigToHostInt32(best_arch->cputype), + imgp, OSSwapBigToHostInt32(best_arch->offset), + OSSwapBigToHostInt32(best_arch->size)) != LOAD_SUCCESS) { + best_arch = NULL; + best_grade = 0; + } + /* * Return our results. */ @@ -147,13 +161,14 @@ load_return_t fatfile_getbestarch( vm_offset_t data_ptr, vm_size_t data_size, + struct image_params *imgp, struct fat_arch *archret) { /* * Ignore all architectural bits when determining if an image * in a fat file should be skipped or graded. */ - return fatfile_getarch(data_ptr, data_size, cpu_type(), CPU_ARCH_MASK, archret); + return fatfile_getarch(data_ptr, data_size, cpu_type(), CPU_ARCH_MASK, imgp, archret); } load_return_t @@ -161,12 +176,13 @@ fatfile_getbestarch_for_cputype( cpu_type_t cputype, vm_offset_t data_ptr, vm_size_t data_size, + struct image_params *imgp, struct fat_arch *archret) { /* * Scan the fat_arch array for exact matches for this cpu_type_t only */ - return fatfile_getarch(data_ptr, data_size, cputype, 0, archret); + return fatfile_getarch(data_ptr, data_size, cputype, 0, imgp, archret); } /********************************************************************** @@ -187,7 +203,7 @@ fatfile_getbestarch_for_cputype( load_return_t fatfile_getarch_with_bits( integer_t archbits, - vm_offset_t data_ptr, + vm_offset_t data_ptr, vm_size_t data_size, struct fat_arch *archret) { @@ -195,7 +211,7 @@ fatfile_getarch_with_bits( * Scan the fat_arch array for matches with the requested * architectural bits set, and for the current hardware cpu CPU. */ - return fatfile_getarch(data_ptr, data_size, (archbits & CPU_ARCH_MASK) | (cpu_type() & ~CPU_ARCH_MASK), 0, archret); + return fatfile_getarch(data_ptr, data_size, (archbits & CPU_ARCH_MASK) | (cpu_type() & ~CPU_ARCH_MASK), 0, NULL, archret); } /* diff --git a/bsd/kern/mach_fat.h b/bsd/kern/mach_fat.h index 6d108d1ec..885fb32ee 100644 --- a/bsd/kern/mach_fat.h +++ b/bsd/kern/mach_fat.h @@ -36,9 +36,9 @@ load_return_t fatfile_validate_fatarches(vm_offset_t data_ptr, vm_size_t data_size); -load_return_t fatfile_getbestarch(vm_offset_t data_ptr, vm_size_t data_size, struct fat_arch *archret); +load_return_t fatfile_getbestarch(vm_offset_t data_ptr, vm_size_t data_size, struct image_params *imgp, struct fat_arch *archret); load_return_t fatfile_getbestarch_for_cputype(cpu_type_t cputype, - vm_offset_t data_ptr, vm_size_t data_size, struct fat_arch *archret); + vm_offset_t data_ptr, vm_size_t data_size, struct image_params *imgp, struct fat_arch *archret); load_return_t fatfile_getarch_with_bits(integer_t archbits, vm_offset_t data_ptr, vm_size_t data_size, struct fat_arch *archret); diff --git a/bsd/kern/mach_loader.c b/bsd/kern/mach_loader.c index 82ee10f0f..d51e05c70 100644 --- a/bsd/kern/mach_loader.c +++ b/bsd/kern/mach_loader.c @@ -88,16 +88,12 @@ #include -#if __x86_64__ -extern int bootarg_no32exec; /* bsd_init.c */ -#endif - /* * XXX vm/pmap.h should not treat these prototypes as MACH_KERNEL_PRIVATE * when KERNEL is defined. */ -extern pmap_t pmap_create(ledger_t ledger, vm_map_size_t size, - boolean_t is_64bit); +extern pmap_t pmap_create_options(ledger_t ledger, vm_map_size_t size, + unsigned int flags); /* XXX should have prototypes in a shared header file */ extern int get_map_nentries(vm_map_t); @@ -173,6 +169,13 @@ load_uuid( load_result_t *result ); +static load_return_t +load_version( + struct version_min_command *vmc, + boolean_t *found_version_cmd, + load_result_t *result + ); + static load_return_t load_code_signature( struct linkedit_data_command *lcp, @@ -205,6 +208,14 @@ load_main( load_result_t *result ); +static +load_return_t +setup_driver_main( + thread_t thread, + int64_t slide, + load_result_t *result + ); + static load_return_t load_unixthread( struct thread_command *tcp, @@ -251,6 +262,15 @@ load_dylinker( struct image_params *imgp ); +#if __x86_64__ +extern int bootarg_no32exec; +static boolean_t +check_if_simulator_binary( + struct image_params *imgp, + off_t file_offset, + off_t macho_size); +#endif + struct macho_data; static load_return_t @@ -341,12 +361,12 @@ load_machfile( boolean_t enforce_hard_pagezero = TRUE; int in_exec = (imgp->ip_flags & IMGPF_EXEC); task_t task = current_task(); - proc_t p = current_proc(); int64_t aslr_page_offset = 0; int64_t dyld_aslr_page_offset = 0; int64_t aslr_section_size = 0; int64_t aslr_section_offset = 0; kern_return_t kret; + unsigned int pmap_flags = 0; if (macho_size > file_size) { return LOAD_BADMACHO; @@ -354,6 +374,10 @@ load_machfile( result->is_64bit_addr = ((imgp->ip_flags & IMGPF_IS_64BIT_ADDR) == IMGPF_IS_64BIT_ADDR); result->is_64bit_data = ((imgp->ip_flags & IMGPF_IS_64BIT_DATA) == IMGPF_IS_64BIT_DATA); +#if defined(HAS_APPLE_PAC) + pmap_flags |= (imgp->ip_flags & IMGPF_NOJOP) ? PMAP_CREATE_DISABLE_JOP : 0; +#endif /* defined(HAS_APPLE_PAC) */ + pmap_flags |= result->is_64bit_addr ? PMAP_CREATE_64BIT : 0; task_t ledger_task; if (imgp->ip_new_thread) { @@ -361,9 +385,12 @@ load_machfile( } else { ledger_task = task; } - pmap = pmap_create(get_task_ledger(ledger_task), + pmap = pmap_create_options(get_task_ledger(ledger_task), (vm_map_size_t) 0, - result->is_64bit_addr); + pmap_flags); + if (pmap == NULL) { + return LOAD_RESOURCE; + } map = vm_map_create(pmap, 0, vm_compute_max_offset(result->is_64bit_addr), @@ -497,6 +524,7 @@ load_machfile( * task is not yet running, and it makes no sense. */ if (in_exec) { + proc_t p = vfs_context_proc(imgp->ip_vfs_context); /* * Mark the task as halting and start the other * threads towards terminating themselves. Then @@ -597,14 +625,17 @@ parse_machfile( size_t offset; size_t oldoffset; /* for overflow check */ int pass; - proc_t p = current_proc(); /* XXXX */ + proc_t p = vfs_context_proc(imgp->ip_vfs_context); int error; int resid = 0; + int spawn = (imgp->ip_flags & IMGPF_SPAWN); + int vfexec = (imgp->ip_flags & IMGPF_VFORK_EXEC); size_t mach_header_sz = sizeof(struct mach_header); boolean_t abi64; boolean_t got_code_signatures = FALSE; boolean_t found_header_segment = FALSE; boolean_t found_xhdr = FALSE; + boolean_t found_version_cmd = FALSE; int64_t slide = 0; boolean_t dyld_no_load_addr = FALSE; boolean_t is_dyld = FALSE; @@ -637,16 +668,10 @@ parse_machfile( */ if (((cpu_type_t)(header->cputype & ~CPU_ARCH_MASK) != (cpu_type() & ~CPU_ARCH_MASK)) || !grade_binary(header->cputype, - header->cpusubtype & ~CPU_SUBTYPE_MASK)) { + header->cpusubtype & ~CPU_SUBTYPE_MASK, TRUE)) { return LOAD_BADARCH; } -#if __x86_64__ - if (bootarg_no32exec && (header->cputype == CPU_TYPE_X86)) { - return LOAD_BADARCH_X86; - } -#endif - abi64 = ((header->cputype & CPU_ARCH_ABI64) == CPU_ARCH_ABI64); switch (header->filetype) { @@ -702,7 +727,7 @@ parse_machfile( } error = vn_rdwr(UIO_READ, vp, addr, alloc_size, file_offset, - UIO_SYSSPACE, 0, kauth_cred_get(), &resid, p); + UIO_SYSSPACE, 0, vfs_context_ucred(imgp->ip_vfs_context), &resid, p); if (error) { kfree(addr, alloc_size); return LOAD_IOERROR; @@ -811,10 +836,22 @@ parse_machfile( /* * Check that the entry point is contained in an executable segments */ - if ((pass == 3) && (!result->using_lcmain && result->validentry == 0)) { - thread_state_initialize(thread); - ret = LOAD_FAILURE; - break; + if (pass == 3) { + if (depth == 1 && imgp && (imgp->ip_flags & IMGPF_DRIVER)) { + /* Driver binaries must have driverkit platform */ + if (result->ip_platform == PLATFORM_DRIVERKIT) { + /* Driver binaries have no entry point */ + ret = setup_driver_main(thread, slide, result); + } else { + ret = LOAD_FAILURE; + } + } else if (!result->using_lcmain && result->validentry == 0) { + ret = LOAD_FAILURE; + } + if (ret != KERN_SUCCESS) { + thread_state_initialize(thread); + break; + } } /* @@ -866,10 +903,17 @@ parse_machfile( /* * Act on struct load_command's for which kernel * intervention is required. + * Note that each load command implementation is expected to validate + * that lcp->cmdsize is large enough to fit its specific struct type + * before dereferencing fields not covered by struct load_command. */ switch (lcp->cmd) { case LC_SEGMENT: { struct segment_command *scp = (struct segment_command *) lcp; + if (scp->cmdsize < sizeof(*scp)) { + ret = LOAD_BADMACHO; + break; + } if (pass == 0) { if (is_dyld && scp->vmaddr == 0 && scp->fileoff == 0) { dyld_no_load_addr = TRUE; @@ -948,7 +992,10 @@ parse_machfile( } case LC_SEGMENT_64: { struct segment_command_64 *scp64 = (struct segment_command_64 *) lcp; - + if (scp64->cmdsize < sizeof(*scp64)) { + ret = LOAD_BADMACHO; + break; + } if (pass == 0) { if (is_dyld && scp64->vmaddr == 0 && scp64->fileoff == 0) { dyld_no_load_addr = TRUE; @@ -1142,27 +1189,56 @@ parse_machfile( load_failure_reason = os_reason_create(OS_REASON_EXEC, EXEC_EXIT_REASON_DECRYPT); } - assert(load_failure_reason != OS_REASON_NULL); - psignal_with_reason(p, SIGKILL, load_failure_reason); + /* + * Don't signal the process if it was forked and in a partially constructed + * state as part of a spawn -- it will just be torn down when the exec fails. + */ + if (!spawn) { + assert(load_failure_reason != OS_REASON_NULL); + if (vfexec) { + psignal_vfork_with_reason(p, get_threadtask(imgp->ip_new_thread), imgp->ip_new_thread, SIGKILL, load_failure_reason); + load_failure_reason = OS_REASON_NULL; + } else { + psignal_with_reason(p, SIGKILL, load_failure_reason); + load_failure_reason = OS_REASON_NULL; + } + } else { + os_reason_free(load_failure_reason); + load_failure_reason = OS_REASON_NULL; + } } break; #endif -#if __arm64__ - case LC_VERSION_MIN_IPHONEOS: { + case LC_VERSION_MIN_IPHONEOS: + case LC_VERSION_MIN_MACOSX: + case LC_VERSION_MIN_WATCHOS: + case LC_VERSION_MIN_TVOS: { struct version_min_command *vmc; - if (pass != 1) { + if (depth != 1 || pass != 1) { break; } vmc = (struct version_min_command *) lcp; - if (vmc->sdk < (12 << 16)) { - /* app built with a pre-iOS12 SDK: apply legacy footprint mitigation */ - result->legacy_footprint = TRUE; + ret = load_version(vmc, &found_version_cmd, result); + break; + } + case LC_BUILD_VERSION: { + if (depth != 1 || pass != 1) { + break; + } + struct build_version_command* bvc = (struct build_version_command*)lcp; + if (bvc->cmdsize < sizeof(*bvc)) { + ret = LOAD_BADMACHO; + break; } -// printf("FBDP %s:%d vp %p (%s) sdk %d.%d.%d -> legacy_footprint=%d\n", __FUNCTION__, __LINE__, vp, vp->v_name, (vmc->sdk >> 16), ((vmc->sdk & 0xFF00) >> 8), (vmc->sdk & 0xFF), result->legacy_footprint); + if (found_version_cmd == TRUE) { + ret = LOAD_BADMACHO; + break; + } + result->ip_platform = bvc->platform; + found_version_cmd = TRUE; break; } -#endif /* __arm64__ */ default: /* Other commands are ignored by the kernel */ ret = LOAD_SUCCESS; @@ -1217,6 +1293,190 @@ parse_machfile( return ret; } +load_return_t +validate_potential_simulator_binary( + cpu_type_t exectype __unused, + struct image_params *imgp __unused, + off_t file_offset __unused, + off_t macho_size __unused) +{ +#if __x86_64__ + /* Allow 32 bit exec only for simulator binaries */ + if (bootarg_no32exec && imgp != NULL && exectype == CPU_TYPE_X86) { + if (imgp->ip_simulator_binary == IMGPF_SB_DEFAULT) { + boolean_t simulator_binary = check_if_simulator_binary(imgp, file_offset, macho_size); + imgp->ip_simulator_binary = simulator_binary ? IMGPF_SB_TRUE : IMGPF_SB_FALSE; + } + + if (imgp->ip_simulator_binary != IMGPF_SB_TRUE) { + return LOAD_BADARCH; + } + } +#endif + return LOAD_SUCCESS; +} + +#if __x86_64__ +static boolean_t +check_if_simulator_binary( + struct image_params *imgp, + off_t file_offset, + off_t macho_size) +{ + struct mach_header *header; + char *ip_vdata = NULL; + kauth_cred_t cred = NULL; + uint32_t ncmds; + struct load_command *lcp; + boolean_t simulator_binary = FALSE; + void * addr = NULL; + vm_size_t alloc_size, cmds_size; + size_t offset; + proc_t p = current_proc(); /* XXXX */ + int error; + int resid = 0; + size_t mach_header_sz = sizeof(struct mach_header); + + + cred = kauth_cred_proc_ref(p); + + /* Allocate page to copyin mach header */ + ip_vdata = kalloc(PAGE_SIZE); + if (ip_vdata == NULL) { + goto bad; + } + + /* Read the Mach-O header */ + error = vn_rdwr(UIO_READ, imgp->ip_vp, ip_vdata, + PAGE_SIZE, file_offset, + UIO_SYSSPACE, (IO_UNIT | IO_NODELOCKED), + cred, &resid, p); + if (error) { + goto bad; + } + + header = (struct mach_header *)ip_vdata; + + if (header->magic == MH_MAGIC_64 || + header->magic == MH_CIGAM_64) { + mach_header_sz = sizeof(struct mach_header_64); + } + + /* ensure header + sizeofcmds falls within the file */ + if (os_add_overflow(mach_header_sz, header->sizeofcmds, &cmds_size) || + (off_t)cmds_size > macho_size || + round_page_overflow(cmds_size, &alloc_size)) { + goto bad; + } + + /* + * Map the load commands into kernel memory. + */ + addr = kalloc(alloc_size); + if (addr == NULL) { + goto bad; + } + + error = vn_rdwr(UIO_READ, imgp->ip_vp, addr, alloc_size, file_offset, + UIO_SYSSPACE, IO_NODELOCKED, cred, &resid, p); + if (error) { + goto bad; + } + + if (resid) { + /* We must be able to read in as much as the mach_header indicated */ + goto bad; + } + + /* + * Loop through each of the load_commands indicated by the + * Mach-O header; if an absurd value is provided, we just + * run off the end of the reserved section by incrementing + * the offset too far, so we are implicitly fail-safe. + */ + offset = mach_header_sz; + ncmds = header->ncmds; + + while (ncmds--) { + /* ensure enough space for a minimal load command */ + if (offset + sizeof(struct load_command) > cmds_size) { + break; + } + + /* + * Get a pointer to the command. + */ + lcp = (struct load_command *)(addr + offset); + + /* + * Perform prevalidation of the struct load_command + * before we attempt to use its contents. Invalid + * values are ones which result in an overflow, or + * which can not possibly be valid commands, or which + * straddle or exist past the reserved section at the + * start of the image. + */ + if (os_add_overflow(offset, lcp->cmdsize, &offset) || + lcp->cmdsize < sizeof(struct load_command) || + offset > cmds_size) { + break; + } + + /* Check if its a simulator binary. */ + switch (lcp->cmd) { + case LC_VERSION_MIN_WATCHOS: + simulator_binary = TRUE; + break; + + case LC_BUILD_VERSION: { + struct build_version_command *bvc; + + bvc = (struct build_version_command *) lcp; + if (bvc->cmdsize < sizeof(*bvc)) { + /* unsafe to use this command struct if cmdsize + * validated above is too small for it to fit */ + break; + } + if (bvc->platform == PLATFORM_IOSSIMULATOR || + bvc->platform == PLATFORM_WATCHOSSIMULATOR) { + simulator_binary = TRUE; + } + + break; + } + + case LC_VERSION_MIN_IPHONEOS: { + simulator_binary = TRUE; + break; + } + + default: + /* ignore other load commands */ + break; + } + + if (simulator_binary == TRUE) { + break; + } + } + +bad: + if (ip_vdata) { + kfree(ip_vdata, PAGE_SIZE); + } + + if (cred) { + kauth_cred_unref(&cred); + } + + if (addr) { + kfree(addr, alloc_size); + } + + return simulator_binary; +} +#endif /* __x86_64__ */ + #if CONFIG_CODE_DECRYPTION #define APPLE_UNPROTECTED_HEADER_SIZE (3 * 4096) @@ -1390,6 +1650,8 @@ map_segment( cur_end = vm_start + (file_end - file_start); } if (control != MEMORY_OBJECT_CONTROL_NULL) { + /* no copy-on-read for mapped binaries */ + vmk_flags.vmkf_no_copy_on_read = 1; ret = vm_map_enter_mem_object_control( map, &cur_start, @@ -1463,6 +1725,8 @@ map_segment( file_start), effective_page_mask); if (control != MEMORY_OBJECT_CONTROL_NULL) { + /* no copy-on-read for mapped binaries */ + cur_vmk_flags.vmkf_no_copy_on_read = 1; ret = vm_map_enter_mem_object_control( map, &cur_start, @@ -1507,6 +1771,8 @@ map_segment( /* one 4K pager for the last page */ cur_end = vm_start + (file_end - file_start); if (control != MEMORY_OBJECT_CONTROL_NULL) { + /* no copy-on-read for mapped binaries */ + vmk_flags.vmkf_no_copy_on_read = 1; ret = vm_map_enter_mem_object_control( map, &cur_start, @@ -1687,7 +1953,13 @@ load_segment( return LOAD_BADMACHO; } - vm_offset = scp->vmaddr + slide; + if (os_add_overflow(scp->vmaddr, slide, &vm_offset)) { + if (cs_debug) { + printf("vmaddr too large\n"); + } + return LOAD_BADMACHO; + } + vm_size = scp->vmsize; if (vm_size == 0) { @@ -1973,6 +2245,68 @@ load_uuid( return LOAD_SUCCESS; } +static +load_return_t +load_version( + struct version_min_command *vmc, + boolean_t *found_version_cmd, + load_result_t *result + ) +{ + uint32_t platform = 0; + uint32_t sdk; + + if (vmc->cmdsize < sizeof(*vmc)) { + return LOAD_BADMACHO; + } + if (*found_version_cmd == TRUE) { + return LOAD_BADMACHO; + } + *found_version_cmd = TRUE; + sdk = vmc->sdk; + switch (vmc->cmd) { + case LC_VERSION_MIN_MACOSX: + platform = PLATFORM_MACOS; + break; +#if __x86_64__ /* __x86_64__ */ + case LC_VERSION_MIN_IPHONEOS: + platform = PLATFORM_IOSSIMULATOR; + break; + case LC_VERSION_MIN_WATCHOS: + platform = PLATFORM_WATCHOSSIMULATOR; + break; + case LC_VERSION_MIN_TVOS: + platform = PLATFORM_TVOSSIMULATOR; + break; +#else + case LC_VERSION_MIN_IPHONEOS: { +#if __arm64__ + extern int legacy_footprint_entitlement_mode; + if (vmc->sdk < (12 << 16)) { + /* app built with a pre-iOS12 SDK: apply legacy footprint mitigation */ + result->legacy_footprint = TRUE; + } +#endif /* __arm64__ */ + platform = PLATFORM_IOS; + break; + } + case LC_VERSION_MIN_WATCHOS: + platform = PLATFORM_WATCHOS; + break; + case LC_VERSION_MIN_TVOS: + platform = PLATFORM_TVOS; + break; +#endif /* __x86_64__ */ + /* All LC_VERSION_MIN_* load commands are legacy and we will not be adding any more */ + default: + sdk = (uint32_t)-1; + __builtin_unreachable(); + } + result->ip_platform = platform; + result->lr_sdk = sdk; + return LOAD_SUCCESS; +} + static load_return_t load_main( @@ -2049,6 +2383,52 @@ load_main( return LOAD_SUCCESS; } +static +load_return_t +setup_driver_main( + thread_t thread, + int64_t slide, + load_result_t *result + ) +{ + mach_vm_offset_t addr; + kern_return_t ret; + + /* Driver binaries have no LC_MAIN, use defaults */ + + if (thread == THREAD_NULL) { + return LOAD_SUCCESS; + } + + result->user_stack_alloc_size = MAXSSIZ; + + /* use default location for stack */ + ret = thread_userstackdefault(&addr, result->is_64bit_addr); + if (ret != KERN_SUCCESS) { + return LOAD_FAILURE; + } + + /* The stack slides down from the default location */ + result->user_stack = addr; + result->user_stack -= slide; + + if (result->using_lcmain || result->entry_point != MACH_VM_MIN_ADDRESS) { + /* Already processed LC_MAIN or LC_UNIXTHREAD */ + return LOAD_FAILURE; + } + + result->needs_dynlinker = TRUE; + + ret = thread_state_initialize( thread ); + if (ret != KERN_SUCCESS) { + return LOAD_FAILURE; + } + + result->unixproc = TRUE; + result->thread_count++; + + return LOAD_SUCCESS; +} static load_return_t @@ -2426,12 +2806,18 @@ load_code_signature( struct cs_blob *blob; int error; vm_size_t blob_size; + uint32_t sum; addr = 0; blob = NULL; - if (lcp->cmdsize != sizeof(struct linkedit_data_command) || - lcp->dataoff + lcp->datasize > macho_size) { + if (lcp->cmdsize != sizeof(struct linkedit_data_command)) { + ret = LOAD_BADMACHO; + goto out; + } + + sum = 0; + if (os_add_overflow(lcp->dataoff, lcp->datasize, &sum) || sum > macho_size) { ret = LOAD_BADMACHO; goto out; } diff --git a/bsd/kern/mach_loader.h b/bsd/kern/mach_loader.h index 5a0f66ceb..606d733fb 100644 --- a/bsd/kern/mach_loader.h +++ b/bsd/kern/mach_loader.h @@ -86,6 +86,8 @@ typedef struct _load_result { off_t cs_end_offset; void *threadstate; size_t threadstate_sz; + uint32_t ip_platform; + uint32_t lr_sdk; } load_result_t; struct image_params; @@ -96,6 +98,13 @@ load_return_t load_machfile( vm_map_t *mapp, load_result_t *result); +load_return_t +validate_potential_simulator_binary( + cpu_type_t exectype, + struct image_params *imgp, + off_t file_offset, + off_t macho_size); + #define LOAD_SUCCESS 0 #define LOAD_BADARCH 1 /* CPU type/subtype not found */ #define LOAD_BADMACHO 2 /* malformed mach-o file */ diff --git a/bsd/kern/mach_process.c b/bsd/kern/mach_process.c index bce1b9784..915a8cb45 100644 --- a/bsd/kern/mach_process.c +++ b/bsd/kern/mach_process.c @@ -107,7 +107,6 @@ int get_task_userstop(task_t); #define CLR(t, f) (t) &= ~(f) #define ISSET(t, f) ((t) & (f)) -extern thread_t port_name_to_thread(mach_port_name_t port_name); extern thread_t get_firstthread(task_t); @@ -451,7 +450,8 @@ resume: error = EINVAL; goto out; } - th_act = port_name_to_thread(CAST_MACH_PORT_TO_NAME(uap->addr)); + th_act = port_name_to_thread(CAST_MACH_PORT_TO_NAME(uap->addr), + PORT_TO_THREAD_NONE); if (th_act == THREAD_NULL) { error = ESRCH; goto out; diff --git a/bsd/kern/makesyscalls.sh b/bsd/kern/makesyscalls.sh index 411d5ce61..ffdacf957 100755 --- a/bsd/kern/makesyscalls.sh +++ b/bsd/kern/makesyscalls.sh @@ -409,6 +409,9 @@ s/\$//g current_field++ funcname = $current_field argalias = funcname "_args" + if (substr(argalias, 1, 4) == "sys_") { + argalias = substr(argalias, 5) + } current_field++ # bump past function name if ($current_field != "(") @@ -681,6 +684,9 @@ s/\$//g # output to syscalls.c if (add_sysnames_entry == 1) { tempname = funcname + if (substr(tempname, 1, 4) == "sys_") { + tempname = substr(tempname, 5) + } if (funcname == "nosys" || funcname == "enosys") { if (syscall_num == 0) tempname = "syscall" @@ -701,6 +707,9 @@ s/\$//g # output to syscalls.h if (add_sysheader_entry == 1) { tempname = funcname + if (substr(tempname, 1, 4) == "sys_") { + tempname = substr(tempname, 5) + } if (syscall_num == 0) { tempname = "syscall" } @@ -762,7 +771,7 @@ s/\$//g printf("\n#endif /* !%s */\n", sysproto_h) > sysprotoend printf("};\n") > sysent - printf("unsigned int nsysent = sizeof(sysent) / sizeof(sysent[0]);\n") > sysent + printf("const unsigned int nsysent = sizeof(sysent) / sizeof(sysent[0]);\n") > sysent printf("};\n") > syscallnamestempfile printf("#define\t%sMAXSYSCALL\t%d\n", syscallprefix, syscall_num) \ diff --git a/bsd/kern/mcache.c b/bsd/kern/mcache.c index a2263417d..03326e009 100644 --- a/bsd/kern/mcache.c +++ b/bsd/kern/mcache.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2006-2014 Apple Inc. All rights reserved. + * Copyright (c) 2006-2019 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -146,13 +146,11 @@ static unsigned int mcache_slab_alloc(void *, mcache_obj_t ***, static void mcache_slab_free(void *, mcache_obj_t *, boolean_t); static void mcache_slab_audit(void *, mcache_obj_t *, boolean_t); static void mcache_cpu_refill(mcache_cpu_t *, mcache_bkt_t *, int); -static mcache_bkt_t *mcache_bkt_alloc(mcache_t *, mcache_bktlist_t *, - mcache_bkttype_t **); +static mcache_bkt_t *mcache_bkt_alloc(mcache_t *, mcache_bktlist_t *); static void mcache_bkt_free(mcache_t *, mcache_bktlist_t *, mcache_bkt_t *); static void mcache_cache_bkt_enable(mcache_t *); static void mcache_bkt_purge(mcache_t *); -static void mcache_bkt_destroy(mcache_t *, mcache_bkttype_t *, - mcache_bkt_t *, int); +static void mcache_bkt_destroy(mcache_t *, mcache_bkt_t *, int); static void mcache_bkt_ws_update(mcache_t *); static void mcache_bkt_ws_zero(mcache_t *); static void mcache_bkt_ws_reap(mcache_t *); @@ -201,12 +199,16 @@ mcache_init(void) mcache_update_tcall = thread_call_allocate(mcache_update, NULL); if (mcache_reap_tcall == NULL || mcache_update_tcall == NULL) { panic("mcache_init: thread_call_allocate failed"); + /* NOTREACHED */ + __builtin_unreachable(); } mcache_zone = zinit(MCACHE_ALLOC_SIZE, 256 * MCACHE_ALLOC_SIZE, PAGE_SIZE, "mcache"); if (mcache_zone == NULL) { panic("mcache_init: failed to allocate mcache zone\n"); + /* NOTREACHED */ + __builtin_unreachable(); } zone_change(mcache_zone, Z_CALLERACCT, FALSE); @@ -346,6 +348,8 @@ mcache_create_common(const char *name, size_t bufsize, size_t align, if ((align & (align - 1)) != 0) { panic("mcache_create: bad alignment %lu", align); + /* NOTREACHED */ + __builtin_unreachable(); } cp->mc_align = align; @@ -548,7 +552,7 @@ retry_alloc: * bucket from the bucket layer. Upon success, refill this * CPU and place any empty bucket into the empty list. */ - bkt = mcache_bkt_alloc(cp, &cp->mc_full, NULL); + bkt = mcache_bkt_alloc(cp, &cp->mc_full); if (bkt != NULL) { if (ccp->cc_pfilled != NULL) { mcache_bkt_free(cp, &cp->mc_empty, @@ -616,6 +620,8 @@ debug_alloc: panic("mcache_alloc_ext: %s cp %p corrupted list " "(got %d actual %d)\n", cp->mc_name, (void *)cp, num - need, n); + /* NOTREACHED */ + __builtin_unreachable(); } } @@ -810,7 +816,7 @@ mcache_free_ext(mcache_t *cp, mcache_obj_t *list) * bucket from the bucket layer. Upon success, empty this * CPU and place any full bucket into the full list. */ - bkt = mcache_bkt_alloc(cp, &cp->mc_empty, &btp); + bkt = mcache_bkt_alloc(cp, &cp->mc_empty); if (bkt != NULL) { if (ccp->cc_pfilled != NULL) { mcache_bkt_free(cp, &cp->mc_full, @@ -819,6 +825,7 @@ mcache_free_ext(mcache_t *cp, mcache_obj_t *list) mcache_cpu_refill(ccp, bkt, 0); continue; } + btp = cp->cache_bkttype; /* * We need an empty bucket to put our freed objects into @@ -844,6 +851,14 @@ mcache_free_ext(mcache_t *cp, mcache_obj_t *list) continue; } + /* + * Store it in the bucket object since we'll + * need to refer to it during bucket destroy; + * we can't safely refer to cache_bkttype as + * the bucket lock may not be acquired then. + */ + bkt->bkt_type = btp; + /* * We have an empty bucket of the right size; * add it to the bucket layer and try again. @@ -1082,7 +1097,7 @@ mcache_cpu_refill(mcache_cpu_t *ccp, mcache_bkt_t *bkt, int objs) * Allocate a bucket from the bucket layer. */ static mcache_bkt_t * -mcache_bkt_alloc(mcache_t *cp, mcache_bktlist_t *blp, mcache_bkttype_t **btp) +mcache_bkt_alloc(mcache_t *cp, mcache_bktlist_t *blp) { mcache_bkt_t *bkt; @@ -1104,10 +1119,6 @@ mcache_bkt_alloc(mcache_t *cp, mcache_bktlist_t *blp, mcache_bkttype_t **btp) blp->bl_alloc++; } - if (btp != NULL) { - *btp = cp->cache_bkttype; - } - MCACHE_UNLOCK(&cp->mc_bkt_lock); return bkt; @@ -1157,7 +1168,6 @@ mcache_bkt_purge(mcache_t *cp) { mcache_cpu_t *ccp; mcache_bkt_t *bp, *pbp; - mcache_bkttype_t *btp; int cpu, objs, pobjs; for (cpu = 0; cpu < ncpu; cpu++) { @@ -1165,7 +1175,6 @@ mcache_bkt_purge(mcache_t *cp) MCACHE_LOCK(&ccp->cc_lock); - btp = cp->cache_bkttype; bp = ccp->cc_filled; pbp = ccp->cc_pfilled; objs = ccp->cc_objs; @@ -1179,10 +1188,10 @@ mcache_bkt_purge(mcache_t *cp) MCACHE_UNLOCK(&ccp->cc_lock); if (bp != NULL) { - mcache_bkt_destroy(cp, btp, bp, objs); + mcache_bkt_destroy(cp, bp, objs); } if (pbp != NULL) { - mcache_bkt_destroy(cp, btp, pbp, pobjs); + mcache_bkt_destroy(cp, pbp, pobjs); } } @@ -1195,8 +1204,7 @@ mcache_bkt_purge(mcache_t *cp) * and also free the bucket itself. */ static void -mcache_bkt_destroy(mcache_t *cp, mcache_bkttype_t *btp, mcache_bkt_t *bkt, - int nobjs) +mcache_bkt_destroy(mcache_t *cp, mcache_bkt_t *bkt, int nobjs) { if (nobjs > 0) { mcache_obj_t *top = bkt->bkt_obj[nobjs - 1]; @@ -1219,6 +1227,8 @@ mcache_bkt_destroy(mcache_t *cp, mcache_bkttype_t *btp, mcache_bkt_t *bkt, "list in bkt %p (nobjs %d actual %d)\n", cp->mc_name, (void *)cp, (void *)bkt, nobjs, cnt); + /* NOTREACHED */ + __builtin_unreachable(); } } @@ -1226,7 +1236,7 @@ mcache_bkt_destroy(mcache_t *cp, mcache_bkttype_t *btp, mcache_bkt_t *bkt, (*cp->mc_slab_free)(cp->mc_private, top, (cp->mc_flags & MCF_DEBUG) || cp->mc_purge_cnt); } - mcache_free(btp->bt_cache, bkt); + mcache_free(bkt->bkt_type->bt_cache, bkt); } /* @@ -1269,18 +1279,17 @@ mcache_bkt_ws_reap(mcache_t *cp) { long reap; mcache_bkt_t *bkt; - mcache_bkttype_t *btp; reap = MIN(cp->mc_full.bl_reaplimit, cp->mc_full.bl_min); while (reap-- && - (bkt = mcache_bkt_alloc(cp, &cp->mc_full, &btp)) != NULL) { - mcache_bkt_destroy(cp, btp, bkt, btp->bt_bktsize); + (bkt = mcache_bkt_alloc(cp, &cp->mc_full)) != NULL) { + mcache_bkt_destroy(cp, bkt, bkt->bkt_type->bt_bktsize); } reap = MIN(cp->mc_empty.bl_reaplimit, cp->mc_empty.bl_min); while (reap-- && - (bkt = mcache_bkt_alloc(cp, &cp->mc_empty, &btp)) != NULL) { - mcache_bkt_destroy(cp, btp, bkt, 0); + (bkt = mcache_bkt_alloc(cp, &cp->mc_empty)) != NULL) { + mcache_bkt_destroy(cp, bkt, 0); } } @@ -1487,7 +1496,7 @@ __private_extern__ void mcache_buffer_log(mcache_audit_t *mca, void *addr, mcache_t *cp, struct timeval *base_ts) { - struct timeval now, base = { 0, 0 }; + struct timeval now, base = { .tv_sec = 0, .tv_usec = 0 }; void *stack[MCACHE_STACK_DEPTH + 1]; struct mca_trn *transaction; @@ -1670,17 +1679,21 @@ mcache_audit_panic(mcache_audit_t *mca, void *addr, size_t offset, "offset 0x%lx (0x%llx instead of 0x%llx)\n", addr, offset, got, expected); /* NOTREACHED */ + __builtin_unreachable(); } panic("mcache_audit: buffer %p modified after free at offset 0x%lx " "(0x%llx instead of 0x%llx)\n%s\n", addr, offset, got, expected, mcache_dump_mca(mca)); /* NOTREACHED */ + __builtin_unreachable(); } +__attribute__((noinline, cold, not_tail_called, noreturn)) __private_extern__ int assfail(const char *a, const char *f, int l) { panic("assertion failed: %s, file: %s, line: %d", a, f, l); - return 0; + /* NOTREACHED */ + __builtin_unreachable(); } diff --git a/bsd/kern/netboot.c b/bsd/kern/netboot.c index 71362c2f2..f0cfb1037 100644 --- a/bsd/kern/netboot.c +++ b/bsd/kern/netboot.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2001-2013 Apple Inc. All rights reserved. + * Copyright (c) 2001-2019 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -561,7 +561,7 @@ static int default_route_add(struct in_addr router, boolean_t proxy_arp) { uint32_t flags = 0; - struct in_addr zeroes = { 0 }; + struct in_addr zeroes = { .s_addr = 0 }; if (proxy_arp == FALSE) { flags |= RTF_GATEWAY; @@ -572,7 +572,7 @@ default_route_add(struct in_addr router, boolean_t proxy_arp) static int host_route_delete(struct in_addr host, unsigned int ifscope) { - struct in_addr zeroes = { 0 }; + struct in_addr zeroes = { .s_addr = 0 }; return route_cmd(RTM_DELETE, host, zeroes, zeroes, RTF_HOST, ifscope); } @@ -599,11 +599,11 @@ find_interface(void) } static const struct sockaddr_in blank_sin = { - sizeof(struct sockaddr_in), - AF_INET, - 0, - { 0 }, - { 0, 0, 0, 0, 0, 0, 0, 0 } + .sin_len = sizeof(struct sockaddr_in), + .sin_family = AF_INET, + .sin_port = 0, + .sin_addr = { .s_addr = 0 }, + .sin_zero = { 0, 0, 0, 0, 0, 0, 0, 0 } }; static int @@ -636,12 +636,12 @@ int netboot_mountroot(void) { int error = 0; - struct in_addr iaddr = { 0 }; + struct in_addr iaddr = { .s_addr = 0 }; struct ifreq ifr; struct ifnet * ifp; - struct in_addr netmask = { 0 }; + struct in_addr netmask = { .s_addr = 0 }; proc_t procp = current_proc(); - struct in_addr router = { 0 }; + struct in_addr router = { .s_addr = 0 }; struct socket * so = NULL; unsigned int try; @@ -770,11 +770,11 @@ netboot_setup() goto done; } printf("netboot_setup: calling imageboot_mount_image\n"); - error = imageboot_mount_image(S_netboot_info_p->image_path, -1); + error = imageboot_mount_image(S_netboot_info_p->image_path, -1, IMAGEBOOT_DMG); if (error != 0) { printf("netboot: failed to mount root image, %d\n", error); } else if (S_netboot_info_p->second_image_path != NULL) { - error = imageboot_mount_image(S_netboot_info_p->second_image_path, 0); + error = imageboot_mount_image(S_netboot_info_p->second_image_path, 0, IMAGEBOOT_DMG); if (error != 0) { printf("netboot: failed to mount second root image, %d\n", error); } diff --git a/bsd/kern/policy_check.c b/bsd/kern/policy_check.c index 06ea2dcfc..de77a23be 100644 --- a/bsd/kern/policy_check.c +++ b/bsd/kern/policy_check.c @@ -121,7 +121,7 @@ common_hook(void) return rv; } -#if (MAC_POLICY_OPS_VERSION != 55) +#if (MAC_POLICY_OPS_VERSION != 58) # error "struct mac_policy_ops doesn't match definition in mac_policy.h" #endif /* @@ -271,8 +271,8 @@ const static struct mac_policy_ops policy_ops = { CHECK_SET_HOOK(vnode_check_rename) CHECK_SET_HOOK(kext_check_query) CHECK_SET_HOOK(proc_notify_exec_complete) - .mpo_reserved5 = (mpo_reserved_hook_t *)common_hook, - .mpo_reserved6 = (mpo_reserved_hook_t *)common_hook, + .mpo_reserved4 = (mpo_reserved_hook_t *)common_hook, + CHECK_SET_HOOK(proc_check_syscall_unix) CHECK_SET_HOOK(proc_check_expose_task) CHECK_SET_HOOK(proc_check_set_host_special_port) CHECK_SET_HOOK(proc_check_set_host_exception_port) @@ -284,9 +284,9 @@ const static struct mac_policy_ops policy_ops = { CHECK_SET_HOOK(exc_action_label_update) CHECK_SET_HOOK(vnode_check_trigger_resolve) + CHECK_SET_HOOK(mount_check_mount_late) .mpo_reserved1 = (mpo_reserved_hook_t *)common_hook, .mpo_reserved2 = (mpo_reserved_hook_t *)common_hook, - .mpo_reserved3 = (mpo_reserved_hook_t *)common_hook, CHECK_SET_HOOK(skywalk_flow_check_connect) CHECK_SET_HOOK(skywalk_flow_check_listen) @@ -322,8 +322,9 @@ const static struct mac_policy_ops policy_ops = { CHECK_SET_HOOK(proc_check_setlcid) CHECK_SET_HOOK(proc_check_signal) CHECK_SET_HOOK(proc_check_wait) - CHECK_SET_HOOK(proc_label_destroy) - CHECK_SET_HOOK(proc_label_init) + + .mpo_reserved5 = (mpo_reserved_hook_t *)common_hook, + .mpo_reserved6 = (mpo_reserved_hook_t *)common_hook, CHECK_SET_HOOK(socket_check_accept) CHECK_SET_HOOK(socket_check_accepted) @@ -473,6 +474,8 @@ const static struct mac_policy_ops policy_ops = { CHECK_SET_HOOK(iokit_check_set_properties) + .mpo_reserved3 = (mpo_reserved_hook_t *)common_hook, + CHECK_SET_HOOK(vnode_check_searchfs) CHECK_SET_HOOK(priv_check) diff --git a/bsd/kern/posix_sem.c b/bsd/kern/posix_sem.c index 5aa96d0f1..b1119d153 100644 --- a/bsd/kern/posix_sem.c +++ b/bsd/kern/posix_sem.c @@ -171,28 +171,18 @@ static int psem_cache_search(struct pseminfo **, struct psemname *, struct psemcache **); static int psem_delete(struct pseminfo * pinfo); -static int psem_read(struct fileproc *fp, struct uio *uio, - int flags, vfs_context_t ctx); -static int psem_write(struct fileproc *fp, struct uio *uio, - int flags, vfs_context_t ctx); -static int psem_ioctl(struct fileproc *fp, u_long com, - caddr_t data, vfs_context_t ctx); -static int psem_select(struct fileproc *fp, int which, void *wql, vfs_context_t ctx); static int psem_closefile(struct fileglob *fp, vfs_context_t ctx); static int psem_unlink_internal(struct pseminfo *pinfo, struct psemcache *pcache); -static int psem_kqfilter(struct fileproc *fp, struct knote *kn, - struct kevent_internal_s *kev, vfs_context_t ctx); - static const struct fileops psemops = { - .fo_type = DTYPE_PSXSEM, - .fo_read = psem_read, - .fo_write = psem_write, - .fo_ioctl = psem_ioctl, - .fo_select = psem_select, - .fo_close = psem_closefile, - .fo_kqfilter = psem_kqfilter, - .fo_drain = NULL, + .fo_type = DTYPE_PSXSEM, + .fo_read = fo_no_read, + .fo_write = fo_no_write, + .fo_ioctl = fo_no_ioctl, + .fo_select = fo_no_select, + .fo_close = psem_closefile, + .fo_drain = fo_no_drain, + .fo_kqfilter = fo_no_kqfilter, }; static lck_grp_t *psx_sem_subsys_lck_grp; @@ -797,7 +787,7 @@ sem_unlink(__unused proc_t p, struct sem_unlink_args *uap, __unused int32_t *ret if (error != PSEMCACHE_FOUND) { PSEM_SUBSYS_UNLOCK(); - error = EINVAL; + error = ENOENT; goto bad; } @@ -842,6 +832,8 @@ sem_close(proc_t p, struct sem_close_args *uap, __unused int32_t *retval) return EBADF; } procfdtbl_markclosefd(p, fd); + /* release the ref returned from fp_lookup before calling drain */ + (void) os_ref_release_locked(&fp->f_iocount); fileproc_drain(p, fp); fdrelse(p, fd); error = closef_locked(fp, fp->f_fglob, p); @@ -1117,43 +1109,6 @@ psem_delete(struct pseminfo * pinfo) } } -static int -psem_read(__unused struct fileproc *fp, __unused struct uio *uio, - __unused int flags, __unused vfs_context_t ctx) -{ - return ENOTSUP; -} - -static int -psem_write(__unused struct fileproc *fp, __unused struct uio *uio, - __unused int flags, __unused vfs_context_t ctx) -{ - return ENOTSUP; -} - -static int -psem_ioctl(__unused struct fileproc *fp, __unused u_long com, - __unused caddr_t data, __unused vfs_context_t ctx) -{ - return ENOTSUP; -} - -static int -psem_select(__unused struct fileproc *fp, __unused int which, - __unused void *wql, __unused vfs_context_t ctx) -{ - return ENOTSUP; -} - -static int -psem_kqfilter(__unused struct fileproc *fp, struct knote *kn, - __unused struct kevent_internal_s *kev, __unused vfs_context_t ctx) -{ - kn->kn_flags = EV_ERROR; - kn->kn_data = ENOTSUP; - return 0; -} - int fill_pseminfo(struct psemnode *pnode, struct psem_info * info) { diff --git a/bsd/kern/posix_shm.c b/bsd/kern/posix_shm.c index 3cd6aebd1..29c89efb9 100644 --- a/bsd/kern/posix_shm.c +++ b/bsd/kern/posix_shm.c @@ -187,29 +187,22 @@ static pshm_info_t *pshm_cache_search(pshm_info_t * look); static void pshm_cache_add(pshm_info_t *entry); static void pshm_cache_delete(pshm_info_t *entry); -static int pshm_read(struct fileproc *fp, struct uio *uio, int flags, vfs_context_t ctx); -static int pshm_write(struct fileproc *fp, struct uio *uio, int flags, vfs_context_t ctx); -static int pshm_ioctl(struct fileproc *fp, u_long com, caddr_t data, vfs_context_t ctx); -static int pshm_select(struct fileproc *fp, int which, void *wql, vfs_context_t ctx); static int pshm_closefile(struct fileglob *fg, vfs_context_t ctx); -static int pshm_kqfilter(struct fileproc *fp, struct knote *kn, - struct kevent_internal_s *kev, vfs_context_t ctx); - static int pshm_access(pshm_info_t *pinfo, int mode, kauth_cred_t cred, proc_t p); int pshm_cache_purge_all(proc_t p); static int pshm_unlink_internal(pshm_info_t *pinfo); static const struct fileops pshmops = { - .fo_type = DTYPE_PSXSHM, - .fo_read = pshm_read, - .fo_write = pshm_write, - .fo_ioctl = pshm_ioctl, - .fo_select = pshm_select, - .fo_close = pshm_closefile, - .fo_kqfilter = pshm_kqfilter, - .fo_drain = NULL, + .fo_type = DTYPE_PSXSHM, + .fo_read = fo_no_read, + .fo_write = fo_no_write, + .fo_ioctl = fo_no_ioctl, + .fo_select = fo_no_select, + .fo_close = pshm_closefile, + .fo_drain = fo_no_drain, + .fo_kqfilter = fo_no_kqfilter, }; /* @@ -1128,43 +1121,6 @@ pshm_closefile(struct fileglob *fg, __unused vfs_context_t ctx) return error; } -static int -pshm_read(__unused struct fileproc *fp, __unused struct uio *uio, - __unused int flags, __unused vfs_context_t ctx) -{ - return ENOTSUP; -} - -static int -pshm_write(__unused struct fileproc *fp, __unused struct uio *uio, - __unused int flags, __unused vfs_context_t ctx) -{ - return ENOTSUP; -} - -static int -pshm_ioctl(__unused struct fileproc *fp, __unused u_long com, - __unused caddr_t data, __unused vfs_context_t ctx) -{ - return ENOTSUP; -} - -static int -pshm_select(__unused struct fileproc *fp, __unused int which, __unused void *wql, - __unused vfs_context_t ctx) -{ - return ENOTSUP; -} - -static int -pshm_kqfilter(__unused struct fileproc *fp, struct knote *kn, - __unused struct kevent_internal_s *kev, __unused vfs_context_t ctx) -{ - kn->kn_flags = EV_ERROR; - kn->kn_data = ENOTSUP; - return 0; -} - int fill_pshminfo(pshmnode_t * pshm, struct pshm_info * info) { diff --git a/bsd/kern/proc_info.c b/bsd/kern/proc_info.c index 8d026e5db..d4bc5e794 100644 --- a/bsd/kern/proc_info.c +++ b/bsd/kern/proc_info.c @@ -106,6 +106,7 @@ struct atalk; uint64_t get_dispatchqueue_offset_from_proc(void *); uint64_t get_dispatchqueue_serialno_offset_from_proc(void *); +uint64_t get_dispatchqueue_label_offset_from_proc(void *p); uint64_t get_return_to_kernel_offset_from_proc(void *p); int proc_info_internal(int callnum, int pid, int flavor, uint64_t arg, user_addr_t buffer, uint32_t buffersize, int32_t * retval); @@ -174,6 +175,8 @@ int __attribute__ ((noinline)) proc_pidexitreasoninfo(proc_t p, struct proc_exit int __attribute__ ((noinline)) proc_pidoriginatorpid_uuid(uuid_t uuid, uint32_t buffersize, pid_t *pid); int __attribute__ ((noinline)) proc_pidlistuptrs(proc_t p, user_addr_t buffer, uint32_t buffersize, int32_t *retval); int __attribute__ ((noinline)) proc_piddynkqueueinfo(pid_t pid, int flavor, kqueue_id_t id, user_addr_t buffer, uint32_t buffersize, int32_t *retval); +int __attribute__ ((noinline)) proc_pidregionpath(proc_t p, uint64_t arg, user_addr_t buffer, __unused uint32_t buffersize, int32_t *retval); +int __attribute__ ((noinline)) proc_pidipctableinfo(proc_t p, struct proc_ipctableinfo *table_info); #if !CONFIG_EMBEDDED int __attribute__ ((noinline)) proc_udata_info(pid_t pid, int flavor, user_addr_t buffer, uint32_t buffersize, int32_t *retval); @@ -192,7 +195,7 @@ int __attribute__ ((noinline)) pid_atalkinfo(struct atalk * at, struct fileproc /* protos for misc */ -int fill_vnodeinfo(vnode_t vp, struct vnode_info *vinfo); +int fill_vnodeinfo(vnode_t vp, struct vnode_info *vinfo, boolean_t check_fsgetpath); void fill_fileinfo(struct fileproc * fp, proc_t proc, int fd, struct proc_fileinfo * finfo); int proc_security_policy(proc_t targetp, int callnum, int flavor, boolean_t check_same_user); static void munge_vinfo_stat(struct stat64 *sbp, struct vinfo_stat *vsbp); @@ -227,6 +230,17 @@ get_dispatchqueue_serialno_offset_from_proc(void *p) } } +uint64_t +get_dispatchqueue_label_offset_from_proc(void *p) +{ + if (p != NULL) { + proc_t pself = (proc_t)p; + return pself->p_dispatchqueue_label_offset; + } else { + return (uint64_t)0; + } +} + uint64_t get_return_to_kernel_offset_from_proc(void *p) { @@ -968,7 +982,7 @@ proc_pidthreadpathinfo(proc_t p, uint64_t arg, struct proc_threadwithpathinfo *p } if ((vp != NULLVP) && ((vnode_getwithvid(vp, vid)) == 0)) { - error = fill_vnodeinfo(vp, &pinfo->pvip.vip_vi); + error = fill_vnodeinfo(vp, &pinfo->pvip.vip_vi, FALSE); if (error == 0) { count = MAXPATHLEN; vn_getpath(vp, &pinfo->pvip.vip_path[0], &count); @@ -1057,7 +1071,7 @@ proc_pidregionpathinfo(proc_t p, uint64_t arg, user_addr_t buffer, __unused uint vp = (vnode_t)vnodeaddr; if ((vnode_getwithvid(vp, vnodeid)) == 0) { /* FILL THE VNODEINFO */ - error = fill_vnodeinfo(vp, &preginfo.prp_vip.vip_vi); + error = fill_vnodeinfo(vp, &preginfo.prp_vip.vip_vi, FALSE); count = MAXPATHLEN; vn_getpath(vp, &preginfo.prp_vip.vip_path[0], &count); /* Always make sure it is null terminated */ @@ -1095,7 +1109,7 @@ proc_pidregionpathinfo2(proc_t p, uint64_t arg, user_addr_t buffer, __unused uin vp = (vnode_t)vnodeaddr; if ((vnode_getwithvid(vp, vnodeid)) == 0) { /* FILL THE VNODEINFO */ - error = fill_vnodeinfo(vp, &preginfo.prp_vip.vip_vi); + error = fill_vnodeinfo(vp, &preginfo.prp_vip.vip_vi, FALSE); count = MAXPATHLEN; vn_getpath(vp, &preginfo.prp_vip.vip_path[0], &count); /* Always make sure it is null terminated */ @@ -1112,6 +1126,45 @@ proc_pidregionpathinfo2(proc_t p, uint64_t arg, user_addr_t buffer, __unused uin return error; } +int +proc_pidregionpath(proc_t p, uint64_t arg, user_addr_t buffer, __unused uint32_t buffersize, int32_t *retval) +{ + struct proc_regionpath path; + int ret, error = 0; + uintptr_t vnodeaddr = 0; + uint32_t vnodeid = 0; + vnode_t vp; + + bzero(&path, sizeof(struct proc_regionpath)); + + ret = find_region_details(p->task, (vm_map_offset_t) arg, + (uintptr_t *)&vnodeaddr, (uint32_t *)&vnodeid, + &path.prpo_addr, &path.prpo_regionlength); + if (ret == 0) { + return EINVAL; + } + if (!vnodeaddr) { + return EINVAL; + } + + vp = (vnode_t)vnodeaddr; + if ((vnode_getwithvid(vp, vnodeid)) == 0) { + int count = MAXPATHLEN; + vn_getpath(vp, &path.prpo_path[0], &count); + /* Always make sure it is null terminated */ + path.prpo_path[MAXPATHLEN - 1] = 0; + vnode_put(vp); + } else { + return EINVAL; + } + + error = copyout(&path, buffer, sizeof(struct proc_regionpath)); + if (error == 0) { + *retval = sizeof(struct proc_regionpath); + } + return error; +} + int proc_pidregionpathinfo3(proc_t p, uint64_t arg, user_addr_t buffer, __unused uint32_t buffersize, int32_t *retval) { @@ -1155,7 +1208,7 @@ proc_pidregionpathinfo3(proc_t p, uint64_t arg, user_addr_t buffer, __unused uin if (vnode_get_va_fsid(&va) == arg) { /* FILL THE VNODEINFO */ - error = fill_vnodeinfo(vp, &preginfo.prp_vip.vip_vi); + error = fill_vnodeinfo(vp, &preginfo.prp_vip.vip_vi, FALSE); count = MAXPATHLEN; vn_getpath(vp, &preginfo.prp_vip.vip_path[0], &count); /* Always make sure it is null terminated */ @@ -1209,7 +1262,7 @@ proc_pidvnodepathinfo(proc_t p, __unused uint64_t arg, user_addr_t buffer, __unu if (vncdirvp != NULLVP) { if ((error = vnode_getwithvid(vncdirvp, vncdirid)) == 0) { /* FILL THE VNODEINFO */ - error = fill_vnodeinfo(vncdirvp, &pvninfo.pvi_cdir.vip_vi); + error = fill_vnodeinfo(vncdirvp, &pvninfo.pvi_cdir.vip_vi, TRUE); if (error == 0) { count = MAXPATHLEN; vn_getpath(vncdirvp, &pvninfo.pvi_cdir.vip_path[0], &count); @@ -1224,7 +1277,7 @@ proc_pidvnodepathinfo(proc_t p, __unused uint64_t arg, user_addr_t buffer, __unu if ((error == 0) && (vnrdirvp != NULLVP)) { if ((error = vnode_getwithvid(vnrdirvp, vnrdirid)) == 0) { /* FILL THE VNODEINFO */ - error = fill_vnodeinfo(vnrdirvp, &pvninfo.pvi_rdir.vip_vi); + error = fill_vnodeinfo(vnrdirvp, &pvninfo.pvi_rdir.vip_vi, TRUE); if (error == 0) { count = MAXPATHLEN; vn_getpath(vnrdirvp, &pvninfo.pvi_rdir.vip_path[0], &count); @@ -1403,6 +1456,27 @@ proc_pidoriginatoruuid(uuid_t uuid, uint32_t buffersize) return proc_pidoriginatorpid_uuid(uuid, buffersize, &originator_pid); } +/* + * Function to get the task ipc table size. + */ +int +proc_pidipctableinfo(proc_t p, struct proc_ipctableinfo *table_info) +{ + task_t task; + int error = 0; + + task = p->task; + + bzero(table_info, sizeof(struct proc_ipctableinfo)); + error = fill_taskipctableinfo(task, &(table_info->table_size), &(table_info->table_free)); + + if (error) { + error = EINVAL; + } + + return error; +} + /***************************** proc_pidoriginatorinfo ***************************/ int @@ -1628,10 +1702,10 @@ proc_can_use_foreground_hw(int pid, user_addr_t u_reason, uint32_t reasonsize, i } task = p->task; - task_reference(task); - if (coalition_is_leader(task, COALITION_TYPE_JETSAM, &coal) == FALSE) { + if (coalition_is_leader(task, task_get_coalition(task, COALITION_TYPE_JETSAM))) { + task_reference(task); + } else { /* current task is not a coalition leader: find the leader */ - task_deallocate(task); task = coalition_get_leader(coal); } @@ -1892,6 +1966,16 @@ proc_pidinfo(int pid, int flavor, uint64_t arg, user_addr_t buffer, uint32_t bu size = 0; } break; + case PROC_PIDPLATFORMINFO: + size = PROC_PIDPLATFORMINFO_SIZE; + findzomb = 1; + break; + case PROC_PIDREGIONPATH: + size = PROC_PIDREGIONPATH_SIZE; + break; + case PROC_PIDIPCTABLEINFO: + size = PROC_PIDIPCTABLEINFO_SIZE; + break; default: return EINVAL; } @@ -1931,6 +2015,7 @@ proc_pidinfo(int pid, int flavor, uint64_t arg, user_addr_t buffer, uint32_t bu case PROC_PIDUNIQIDENTIFIERINFO: case PROC_PIDPATHINFO: case PROC_PIDCOALITIONINFO: + case PROC_PIDPLATFORMINFO: check_same_user = NO_CHECK_SAME_USER; break; default: @@ -2232,6 +2317,31 @@ proc_pidinfo(int pid, int flavor, uint64_t arg, user_addr_t buffer, uint32_t bu kfree(vmrtfbuf, kbufsz); } break; + case PROC_PIDPLATFORMINFO: { + proc_lock(p); + uint32_t platform = p->p_platform; + proc_unlock(p); + error = copyout(&platform, buffer, sizeof(uint32_t)); + if (error == 0) { + *retval = sizeof(uint32_t); + } + } break; + case PROC_PIDREGIONPATH: { + error = proc_pidregionpath(p, arg, buffer, buffersize, retval); + } + break; + case PROC_PIDIPCTABLEINFO: { + struct proc_ipctableinfo table_info; + + error = proc_pidipctableinfo(p, &table_info); + if (error == 0) { + error = copyout(&table_info, buffer, sizeof(struct proc_ipctableinfo)); + if (error == 0) { + *retval = sizeof(struct proc_ipctableinfo); + } + } + } + break; default: error = ENOTSUP; break; @@ -2258,7 +2368,7 @@ pid_vnodeinfo(vnode_t vp, uint32_t vid, struct fileproc * fp, proc_t proc, int f } bzero(&vfi, sizeof(struct vnode_fdinfo)); fill_fileinfo(fp, proc, fd, &vfi.pfi); - error = fill_vnodeinfo(vp, &vfi.pvi); + error = fill_vnodeinfo(vp, &vfi.pvi, FALSE); vnode_put(vp); if (error == 0) { error = copyout((caddr_t)&vfi, buffer, sizeof(struct vnode_fdinfo)); @@ -2280,7 +2390,7 @@ pid_vnodeinfopath(vnode_t vp, uint32_t vid, struct fileproc * fp, proc_t proc, i } bzero(&vfip, sizeof(struct vnode_fdinfowithpath)); fill_fileinfo(fp, proc, fd, &vfip.pfi); - error = fill_vnodeinfo(vp, &vfip.pvip.vip_vi); + error = fill_vnodeinfo(vp, &vfip.pvip.vip_vi, TRUE); if (error == 0) { count = MAXPATHLEN; vn_getpath(vp, &vfip.pvip.vip_path[0], &count); @@ -2335,7 +2445,7 @@ fill_fileinfo(struct fileproc * fp, proc_t proc, int fd, struct proc_fileinfo * int -fill_vnodeinfo(vnode_t vp, struct vnode_info *vinfo) +fill_vnodeinfo(vnode_t vp, struct vnode_info *vinfo, __unused boolean_t check_fsgetpath) { vfs_context_t context; struct stat64 sb; @@ -2343,11 +2453,17 @@ fill_vnodeinfo(vnode_t vp, struct vnode_info *vinfo) bzero(&sb, sizeof(struct stat64)); context = vfs_context_create((vfs_context_t)0); - error = vn_stat(vp, &sb, NULL, 1, context); +#if CONFIG_MACF + /* Called when vnode info is used by the caller to get vnode's path */ + if (check_fsgetpath) { + error = mac_vnode_check_fsgetpath(context, vp); + } +#endif + if (!error) { + error = vn_stat(vp, &sb, NULL, 1, 0, context); + munge_vinfo_stat(&sb, &vinfo->vi_stat); + } (void)vfs_context_rele(context); - - munge_vinfo_stat(&sb, &vinfo->vi_stat); - if (error != 0) { goto out; } @@ -2598,36 +2714,36 @@ proc_pidfdinfo(int pid, int flavor, int fd, user_addr_t buffer, uint32_t buffers break; case PROC_PIDFDKQUEUEINFO: { - struct kqueue * kq; + kqueue_t kqu; if (fd == -1) { - if ((kq = p->p_fd->fd_wqkqueue) == NULL) { + if ((kqu.kqwq = p->p_fd->fd_wqkqueue) == NULL) { /* wqkqueue is initialized on-demand */ error = 0; break; } - } else if ((error = fp_getfkq(p, fd, &fp, &kq)) != 0) { + } else if ((error = fp_getfkq(p, fd, &fp, &kqu.kq)) != 0) { goto out1; } /* no need to be under the fdlock */ - error = pid_kqueueinfo(kq, fp, p, fd, buffer, buffersize, retval); + error = pid_kqueueinfo(kqu.kq, fp, p, fd, buffer, buffersize, retval); } break; case PROC_PIDFDKQUEUE_EXTINFO: { - struct kqueue * kq; + kqueue_t kqu; if (fd == -1) { - if ((kq = p->p_fd->fd_wqkqueue) == NULL) { + if ((kqu.kqwq = p->p_fd->fd_wqkqueue) == NULL) { /* wqkqueue is initialized on-demand */ error = 0; break; } - } else if ((error = fp_getfkq(p, fd, &fp, &kq)) != 0) { + } else if ((error = fp_getfkq(p, fd, &fp, &kqu.kq)) != 0) { goto out1; } - error = pid_kqueue_extinfo(p, kq, buffer, buffersize, retval); + error = pid_kqueue_extinfo(p, kqu.kq, buffer, buffersize, retval); } break; @@ -3041,7 +3157,7 @@ proc_dirtycontrol(int pid, int flavor, uint64_t arg, int32_t *retval) case PROC_DIRTYCONTROL_GET: { /* No permissions check - dirty state is freely available */ if (retval) { - *retval = memorystatus_dirty_get(target_p); + *retval = memorystatus_dirty_get(target_p, FALSE); } else { error = EINVAL; } diff --git a/bsd/kern/stackshot.c b/bsd/kern/stackshot.c index 3ef4acd7b..d30037c20 100644 --- a/bsd/kern/stackshot.c +++ b/bsd/kern/stackshot.c @@ -179,8 +179,8 @@ kern_stack_snapshot_with_reason(__unused char *reason) config.sc_pid = -1; config.sc_flags = (STACKSHOT_SAVE_LOADINFO | STACKSHOT_GET_GLOBAL_MEM_STATS | STACKSHOT_SAVE_IN_KERNEL_BUFFER | - STACKSHOT_KCDATA_FORMAT | STACKSHOT_ENABLE_UUID_FAULTING | STACKSHOT_THREAD_WAITINFO | - STACKSHOT_NO_IO_STATS); + STACKSHOT_KCDATA_FORMAT | STACKSHOT_ENABLE_UUID_FAULTING | STACKSHOT_ENABLE_BT_FAULTING | STACKSHOT_THREAD_WAITINFO | + STACKSHOT_NO_IO_STATS | STACKSHOT_COLLECT_SHAREDCACHE_LAYOUT); config.sc_delta_timestamp = 0; config.sc_out_buffer_addr = 0; config.sc_out_size_addr = 0; diff --git a/bsd/kern/subr_eventhandler.c b/bsd/kern/subr_eventhandler.c index 41c57380f..0fd805173 100644 --- a/bsd/kern/subr_eventhandler.c +++ b/bsd/kern/subr_eventhandler.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2017 Apple Inc. All rights reserved. + * Copyright (c) 2016-2019 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -72,7 +72,7 @@ SYSCTL_NODE(_kern, OID_AUTO, eventhandler, CTLFLAG_RW | CTLFLAG_LOCKED, SYSCTL_INT(_kern_eventhandler, OID_AUTO, debug, CTLFLAG_RW | CTLFLAG_LOCKED, &evh_debug, 0, "Eventhandler debug mode"); -struct eventhandler_entry_arg eventhandler_entry_dummy_arg = { { 0 }, { 0 } }; +struct eventhandler_entry_arg eventhandler_entry_dummy_arg = { .ee_fm_uuid = { 0 }, .ee_fr_uuid = { 0 } }; /* List of 'slow' lists */ static struct eventhandler_lists_ctxt evthdlr_lists_ctxt_glb; @@ -177,6 +177,11 @@ eventhandler_register_internal( if (list == NULL) { lck_mtx_convert_spin(&evthdlr_lists_ctxt->eventhandler_mutex); new_list = mcache_alloc(el_cache, MCR_SLEEP); + if (new_list == NULL) { + evhlog((LOG_DEBUG, "%s: Can't allocate list \"%s\"", __func__, name)); + lck_mtx_unlock(&evthdlr_lists_ctxt->eventhandler_mutex); + return NULL; + } bzero(new_list, el_size); evhlog((LOG_DEBUG, "%s: creating list \"%s\"", __func__, name)); list = new_list; @@ -224,6 +229,11 @@ eventhandler_register(struct eventhandler_lists_ctxt *evthdlr_lists_ctxt, /* allocate an entry for this handler, populate it */ eg = mcache_alloc(eg_cache, MCR_SLEEP); + if (eg == NULL) { + evhlog((LOG_DEBUG, "%s: Can't allocate entry to register for event list " + "\"%s\"", __func__, name)); + return NULL; + } bzero(eg, eg_size); eg->func = func; eg->ee.ee_arg = arg; diff --git a/bsd/kern/subr_log.c b/bsd/kern/subr_log.c index 46475b683..b5a78b6b9 100644 --- a/bsd/kern/subr_log.c +++ b/bsd/kern/subr_log.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2016 Apple, Inc. All rights reserved. + * Copyright (c) 2000-2019 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -65,6 +65,7 @@ * Error log buffer for kernel printf's. */ +#include #include #include #include @@ -120,7 +121,7 @@ extern uint32_t oslog_s_error_count; /* All globals should be accessed under LOG_LOCK() */ static char amsg_bufc[1024]; -static struct msgbuf aslbuf = {MSG_MAGIC, sizeof(amsg_bufc), 0, 0, amsg_bufc}; +static struct msgbuf aslbuf = {.msg_magic = MSG_MAGIC, .msg_size = sizeof(amsg_bufc), .msg_bufx = 0, .msg_bufr = 0, .msg_bufc = amsg_bufc}; struct msgbuf *aslbufp __attribute__((used)) = &aslbuf; /* logsoftc only valid while log_open=1 */ @@ -144,8 +145,8 @@ struct firehose_chunk_s oslog_boot_buf = { }, }; /* static buffer */ firehose_chunk_t firehose_boot_chunk = &oslog_boot_buf; -struct msgbuf msgbuf = {MSG_MAGIC, sizeof(smsg_bufc), 0, 0, smsg_bufc}; -struct msgbuf oslog_stream_buf = {MSG_MAGIC, 0, 0, 0, NULL}; +struct msgbuf msgbuf = {.msg_magic = MSG_MAGIC, .msg_size = sizeof(smsg_bufc), .msg_bufx = 0, .msg_bufr = 0, .msg_bufc = smsg_bufc}; +struct msgbuf oslog_stream_buf = {.msg_magic = MSG_MAGIC, .msg_size = 0, .msg_bufx = 0, .msg_bufr = 0, .msg_bufc = NULL}; struct msgbuf *msgbufp __attribute__((used)) = &msgbuf; struct msgbuf *oslog_streambufp __attribute__((used)) = &oslog_stream_buf; @@ -195,7 +196,7 @@ void bsd_log_init(void); * Ideally this file would define this lock, but bsd doesn't have the definition * for lock groups. */ -decl_lck_spin_data(extern, oslog_stream_lock) +decl_lck_spin_data(extern, oslog_stream_lock); #define stream_lock() lck_spin_lock(&oslog_stream_lock) #define stream_unlock() lck_spin_unlock(&oslog_stream_lock) @@ -609,7 +610,7 @@ oslog_streamread(__unused dev_t dev, struct uio *uio, int flag) if (copy_size != 0) { error = uiomove((caddr_t)logline, copy_size, uio); } - (void)hw_atomic_add(&oslog_s_streamed_msgcount, 1); + os_atomic_inc(&oslog_s_streamed_msgcount, relaxed); return error; } @@ -1057,7 +1058,7 @@ oslog_streamwrite_locked(firehose_tracepoint_id_u ftid, mbp = oslog_streambufp; if (ft_length > mbp->msg_size) { - (void)hw_atomic_add(&oslog_s_error_count, 1); + os_atomic_inc(&oslog_s_error_count, relaxed); return; } diff --git a/bsd/kern/subr_prf.c b/bsd/kern/subr_prf.c index d090a2429..a7f73781a 100644 --- a/bsd/kern/subr_prf.c +++ b/bsd/kern/subr_prf.c @@ -394,13 +394,16 @@ putchar(int c, void *arg) } int -vprintf_log_locked(const char *fmt, va_list ap) +vprintf_log_locked(const char *fmt, va_list ap, bool addcr) { struct putchar_args pca; pca.flags = TOLOGLOCKED; pca.tty = NULL; __doprnt(fmt, ap, putchar, &pca, 10, TRUE); + if (addcr) { + putchar('\n', &pca); + } return 0; } diff --git a/bsd/kern/subr_prof.c b/bsd/kern/subr_prof.c deleted file mode 100644 index a638d8780..000000000 --- a/bsd/kern/subr_prof.c +++ /dev/null @@ -1,425 +0,0 @@ -/* - * Copyright (c) 2000-2008 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */ -/*- - * Copyright (c) 1982, 1986, 1993 - * The Regents of the University of California. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * This product includes software developed by the University of - * California, Berkeley and its contributors. - * 4. Neither the name of the University nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * @(#)subr_prof.c 8.3 (Berkeley) 9/23/93 - */ - -#ifdef GPROF -#include -#endif - -#include -#include -#include -#include -#include -#include - -#include -#include - -#include -#include -#include -#include - -#ifdef GPROF -#include -#include - -extern int sysctl_doprof(int *, u_int, user_addr_t, size_t *, - user_addr_t, size_t newlen); -extern int sysctl_struct(user_addr_t, size_t *, - user_addr_t, size_t, void *, int); - -lck_spin_t * mcount_lock; -lck_grp_t * mcount_lock_grp; -lck_attr_t * mcount_lock_attr; - -/* - * Froms is actually a bunch of unsigned shorts indexing tos - */ -struct gmonparam _gmonparam = { .state = GMON_PROF_OFF }; - -/* - * This code uses 32 bit mach object segment information from the currently - * running kernel. - */ -void -kmstartup(void) -{ - tostruct_t *cp; - kernel_segment_command_t *sgp; /* 32 bit mach object file segment */ - struct gmonparam *p = &_gmonparam; - - sgp = getsegbyname("__TEXT"); - p->lowpc = (u_int32_t)sgp->vmaddr; - p->highpc = (u_int32_t)(sgp->vmaddr + sgp->vmsize); - - /* - * Round lowpc and highpc to multiples of the density we're using - * so the rest of the scaling (here and in gprof) stays in ints. - */ - p->lowpc = ROUNDDOWN(p->lowpc, HISTFRACTION * sizeof(HISTCOUNTER)); - p->highpc = ROUNDUP(p->highpc, HISTFRACTION * sizeof(HISTCOUNTER)); - p->textsize = p->highpc - p->lowpc; - printf("Profiling kernel, textsize=%lu [0x%016lx..0x%016lx]\n", - p->textsize, p->lowpc, p->highpc); - p->kcountsize = p->textsize / HISTFRACTION; - p->hashfraction = HASHFRACTION; - p->fromssize = p->textsize / HASHFRACTION; - p->tolimit = p->textsize * ARCDENSITY / 100; - if (p->tolimit < MINARCS) { - p->tolimit = MINARCS; - } else if (p->tolimit > MAXARCS) { - p->tolimit = MAXARCS; - } - p->tossize = p->tolimit * sizeof(tostruct_t); - /* Why not use MALLOC with M_GPROF ? */ - cp = (tostruct_t *)kalloc(p->kcountsize + p->fromssize + p->tossize); - if (cp == 0) { - printf("No memory for profiling.\n"); - return; - } - bzero(cp, p->kcountsize + p->tossize + p->fromssize); - p->tos = cp; - cp = (tostruct_t *)((vm_offset_t)cp + p->tossize); - p->kcount = (u_short *)cp; - cp = (tostruct_t *)((vm_offset_t)cp + p->kcountsize); - p->froms = (u_short *)cp; - - mcount_lock_grp = lck_grp_alloc_init("MCOUNT", LCK_GRP_ATTR_NULL); - mcount_lock_attr = lck_attr_alloc_init(); - mcount_lock = lck_spin_alloc_init(mcount_lock_grp, mcount_lock_attr); -} - -/* - * XXX These should be broken out into per-argument OID values, - * XXX since there are no sub-OID parameter values, but unfortunately - * XXX there is barely enough time for an initial conversion. - * - * Note: These items appear to be read/write. - */ -STATIC int -sysctl_doprofhandle SYSCTL_HANDLER_ARGS -{ - sysctl_doprof(int *name, u_int namelen, user_addr_t oldp, size_t * oldlenp, - user_addr_t newp, size_t newlen) - { - __unused int cmd = oidp->oid_arg2; /* subcommand*/ - int *name = arg1; /* oid element argument vector */ - int namelen = arg2; /* number of oid element arguments */ - user_addr_t oldp = req->oldptr; /* user buffer copy out address */ - size_t *oldlenp = req->oldlen; /* user buffer copy out size */ - user_addr_t newp = req->newptr; /* user buffer copy in address */ - size_t newlen = req->newlen; /* user buffer copy in size */ - - struct gmonparam *gp = &_gmonparam; - int error = 0; - - /* all sysctl names at this level are terminal */ - if (namelen != 1) { - return ENOTDIR; /* overloaded */ - } - switch (name[0]) { - case GPROF_STATE: - error = sysctl_int(oldp, oldlenp, newp, newlen, &gp->state); - if (error) { - break; - } - if (gp->state == GMON_PROF_OFF) { - stopprofclock(kernproc); - } else { - startprofclock(kernproc); - } - break; - case GPROF_COUNT: - error = sysctl_struct(oldp, oldlenp, newp, newlen, - gp->kcount, gp->kcountsize); - break; - case GPROF_FROMS: - error = sysctl_struct(oldp, oldlenp, newp, newlen, - gp->froms, gp->fromssize); - break; - case GPROF_TOS: - error = sysctl_struct(oldp, oldlenp, newp, newlen, - gp->tos, gp->tossize); - break; - case GPROF_GMONPARAM: - error = sysctl_rdstruct(oldp, oldlenp, newp, gp, sizeof *gp); - break; - default: - error = ENOTSUP; - break; - } - - /* adjust index so we return the right required/consumed amount */ - if (!error) { - req->oldidx += req->oldlen; - } - - return error; - } - SYSCTL_PROC(_kern, KERN_PROF, prof, STLFLAG_NODE | CTLFLAG_RW | CTLFLAG_LOCKED, - 0, /* Pointer argument (arg1) */ - 0, /* Integer argument (arg2) */ - sysctl_doprofhandle, /* Handler function */ - NULL, /* No explicit data */ - ""); - - -/* - * mcount() called with interrupts disabled. - */ - void - mcount( - uintptr_t frompc, - uintptr_t selfpc - ) - { - unsigned short *frompcindex; - tostruct_t *top, *prevtop; - struct gmonparam *p = &_gmonparam; - long toindex; - - /* - * check that we are profiling - * and that we aren't recursively invoked. - */ - if (p->state != GMON_PROF_ON) { - return; - } - - lck_spin_lock(mcount_lock); - - /* - * check that frompcindex is a reasonable pc value. - * for example: signal catchers get called from the stack, - * not from text space. too bad. - */ - frompc -= p->lowpc; - if (frompc > p->textsize) { - goto done; - } - - frompcindex = &p->froms[frompc / (p->hashfraction * sizeof(*p->froms))]; - toindex = *frompcindex; - if (toindex == 0) { - /* - * first time traversing this arc - */ - toindex = ++p->tos[0].link; - if (toindex >= p->tolimit) { - /* halt further profiling */ - goto overflow; - } - *frompcindex = toindex; - top = &p->tos[toindex]; - top->selfpc = selfpc; - top->count = 1; - top->link = 0; - goto done; - } - top = &p->tos[toindex]; - if (top->selfpc == selfpc) { - /* - * arc at front of chain; usual case. - */ - top->count++; - goto done; - } - /* - * have to go looking down chain for it. - * top points to what we are looking at, - * prevtop points to previous top. - * we know it is not at the head of the chain. - */ - for (; /* goto done */;) { - if (top->link == 0) { - /* - * top is end of the chain and none of the chain - * had top->selfpc == selfpc. - * so we allocate a new tostruct - * and link it to the head of the chain. - */ - toindex = ++p->tos[0].link; - if (toindex >= p->tolimit) { - goto overflow; - } - top = &p->tos[toindex]; - top->selfpc = selfpc; - top->count = 1; - top->link = *frompcindex; - *frompcindex = toindex; - goto done; - } - /* - * otherwise, check the next arc on the chain. - */ - prevtop = top; - top = &p->tos[top->link]; - if (top->selfpc == selfpc) { - /* - * there it is. - * increment its count - * move it to the head of the chain. - */ - top->count++; - toindex = prevtop->link; - prevtop->link = top->link; - top->link = *frompcindex; - *frompcindex = toindex; - goto done; - } - } -done: - lck_spin_unlock(mcount_lock); - return; - -overflow: - p->state = GMON_PROF_ERROR; - lck_spin_unlock(mcount_lock); - printf("mcount: tos overflow\n"); - return; - } - -#endif /* GPROF */ - -#define PROFILE_LOCK(x) -#define PROFILE_UNLOCK(x) - - -/* - * Scale is a fixed-point number with the binary point 16 bits - * into the value, and is <= 1.0. pc is at most 32 bits, so the - * intermediate result is at most 48 bits. - */ -//K64todo - this doesn't fit into 64 bit any more, it needs 64+16 -#define PC_TO_INDEX(pc, prof) \ - ((user_addr_t)(((u_quad_t)((pc) - (prof)->pr_off) * \ - (u_quad_t)((prof)->pr_scale)) >> 16) & ~1) - -/* - * Collect user-level profiling statistics; called on a profiling tick, - * when a process is running in user-mode. We use - * an AST that will vector us to trap() with a context in which copyin - * and copyout will work. Trap will then call addupc_task(). - * - * Note that we may (rarely) not get around to the AST soon enough, and - * lose profile ticks when the next tick overwrites this one, but in this - * case the system is overloaded and the profile is probably already - * inaccurate. - * - * We can afford to take faults here. If the - * update fails, we simply turn off profiling. - */ -void -addupc_task(struct proc *p, user_addr_t pc, u_int ticks) -{ - user_addr_t off; - u_short count; - - /* Testing P_PROFIL may be unnecessary, but is certainly safe. */ - if ((p->p_flag & P_PROFIL) == 0 || ticks == 0) { - return; - } - - if (proc_is64bit(p)) { - struct user_uprof *prof; - user_addr_t cell; - - for (prof = &p->p_stats->user_p_prof; prof; prof = prof->pr_next) { - off = PC_TO_INDEX(pc, prof); - cell = (prof->pr_base + off); - if (cell >= prof->pr_base && - cell < (prof->pr_size + prof->pr_base)) { - if (copyin(cell, (caddr_t) &count, sizeof(count)) == 0) { - count += ticks; - if (copyout((caddr_t) &count, cell, sizeof(count)) == 0) { - return; - } - } - p->p_stats->user_p_prof.pr_scale = 0; - stopprofclock(p); - break; - } - } - } else { - struct uprof *prof; - short *cell; - - for (prof = &p->p_stats->p_prof; prof; prof = prof->pr_next) { - off = PC_TO_INDEX(pc, prof); - cell = (short *)(prof->pr_base + off); - if (cell >= (short *)prof->pr_base && - cell < (short*)(prof->pr_size + prof->pr_base)) { - if (copyin(CAST_USER_ADDR_T(cell), (caddr_t) &count, sizeof(count)) == 0) { - count += ticks; - if (copyout((caddr_t) &count, CAST_USER_ADDR_T(cell), sizeof(count)) == 0) { - return; - } - } - p->p_stats->p_prof.pr_scale = 0; - stopprofclock(p); - break; - } - } - } -} diff --git a/bsd/kern/subr_xxx.c b/bsd/kern/subr_xxx.c index 879963f19..c3f69f22f 100644 --- a/bsd/kern/subr_xxx.c +++ b/bsd/kern/subr_xxx.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2018 Apple Computer, Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -73,10 +73,6 @@ #include /* for psignal() */ #include -#ifdef GPROF -#include -#endif - #if DEVELOPMENT || DEBUG bool send_sigsys = true; #else @@ -192,17 +188,6 @@ nosys(__unused struct proc *p, __unused struct nosys_args *args, __unused int32_ return ENOSYS; } -#ifdef GPROF -/* - * Stub routine in case it is ever possible to free space. - */ -void -cfreemem(caddr_t cp, int size) -{ - printf("freeing %p, size %d\n", cp, size); -} -#endif - #if !CRYPTO #include diff --git a/bsd/kern/sys_coalition.c b/bsd/kern/sys_coalition.c index bfbd9c9ca..5b1d7d7ac 100644 --- a/bsd/kern/sys_coalition.c +++ b/bsd/kern/sys_coalition.c @@ -7,6 +7,7 @@ #include #include +#include #include #include #include @@ -260,6 +261,27 @@ coalition_info_efficiency(coalition_t coal, user_addr_t buffer, user_size_t bufs return error; } +static int +coalition_ledger_logical_writes_limit(coalition_t coal, user_addr_t buffer, user_size_t bufsize) +{ + int error = 0; + int64_t limit = 0; + + if (coalition_type(coal) != COALITION_TYPE_RESOURCE) { + error = EINVAL; + goto out; + } + error = copyin(buffer, &limit, MIN(bufsize, sizeof(limit))); + if (error) { + goto out; + } + + + error = coalition_ledger_set_logical_writes_limit(coal, limit); +out: + return error; +} + int coalition_info(proc_t p, struct coalition_info_args *uap, __unused int32_t *retval) { @@ -315,6 +337,60 @@ bad: return error; } +int +coalition_ledger(__unused proc_t p, __unused struct coalition_ledger_args *uap, __unused int32_t *retval) +{ + user_addr_t cidp = uap->cid; + user_addr_t buffer = uap->buffer; + user_addr_t bufsizep = uap->bufsize; + user_size_t bufsize; + uint32_t operation = uap->operation; + int error; + uint64_t cid; + coalition_t coal = COALITION_NULL; + + if (!kauth_cred_issuser(kauth_cred_get())) { + error = EPERM; + goto out; + } + + error = copyin(cidp, &cid, sizeof(cid)); + if (error) { + goto out; + } + + coal = coalition_find_by_id(cid); + if (coal == COALITION_NULL) { + error = ESRCH; + goto out; + } + + if (IS_64BIT_PROCESS(p)) { + user64_size_t size64; + error = copyin(bufsizep, &size64, sizeof(size64)); + bufsize = (user_size_t)size64; + } else { + user32_size_t size32; + error = copyin(bufsizep, &size32, sizeof(size32)); + bufsize = (user_size_t)size32; + } + if (error) { + goto out; + } + + switch (operation) { + case COALITION_LEDGER_SET_LOGICAL_WRITES_LIMIT: + error = coalition_ledger_logical_writes_limit(coal, buffer, bufsize); + break; + default: + error = EINVAL; + } +out: + if (coal != COALITION_NULL) { + coalition_release(coal); + } + return error; +} #if DEVELOPMENT || DEBUG static int sysctl_coalition_get_ids SYSCTL_HANDLER_ARGS { @@ -418,8 +494,7 @@ static int sysctl_coalition_get_page_count SYSCTL_HANDLER_ARGS memset(pgcount, 0, sizeof(pgcount)); for (int t = 0; t < COALITION_NUM_TYPES; t++) { - coal = COALITION_NULL; - coalition_is_leader(tproc->task, t, &coal); + coal = task_get_coalition(tproc->task, t); if (coal != COALITION_NULL) { int ntasks = 0; pgcount[t] = coalition_get_page_count(coal, &ntasks); @@ -484,7 +559,7 @@ static int sysctl_coalition_get_pid_list SYSCTL_HANDLER_ARGS return ESRCH; } - (void)coalition_is_leader(tproc->task, type, &coal); + coal = task_get_coalition(tproc->task, type); if (coal == COALITION_NULL) { goto out; } diff --git a/bsd/kern/sys_generic.c b/bsd/kern/sys_generic.c index ec07b11fc..d9fb9d1f9 100644 --- a/bsd/kern/sys_generic.c +++ b/bsd/kern/sys_generic.c @@ -109,6 +109,7 @@ #include #include #include +#include #include #include @@ -142,11 +143,16 @@ #include /* for remote time api*/ #include +#include +#include #if CONFIG_MACF #include #endif +/* for entitlement check */ +#include + /* XXX should be in a header file somewhere */ void evsofree(struct socket *); void evpipefree(struct pipe *); @@ -353,7 +359,7 @@ dofileread(vfs_context_t ctx, struct fileproc *fp, { uio_t auio; user_ssize_t bytecnt; - long error = 0; + int error = 0; char uio_buf[UIO_SIZEOF(1)]; if (nbyte > INT_MAX) { @@ -367,7 +373,10 @@ dofileread(vfs_context_t ctx, struct fileproc *fp, auio = uio_createwithbuffer(1, offset, UIO_USERSPACE32, UIO_READ, &uio_buf[0], sizeof(uio_buf)); } - uio_addiov(auio, bufp, nbyte); + if (uio_addiov(auio, bufp, nbyte) != 0) { + *retval = 0; + return EINVAL; + } bytecnt = nbyte; @@ -590,7 +599,7 @@ dofilewrite(vfs_context_t ctx, struct fileproc *fp, user_ssize_t *retval) { uio_t auio; - long error = 0; + int error = 0; user_ssize_t bytecnt; char uio_buf[UIO_SIZEOF(1)]; @@ -606,7 +615,10 @@ dofilewrite(vfs_context_t ctx, struct fileproc *fp, auio = uio_createwithbuffer(1, offset, UIO_USERSPACE32, UIO_WRITE, &uio_buf[0], sizeof(uio_buf)); } - uio_addiov(auio, bufp, nbyte); + if (uio_addiov(auio, bufp, nbyte) != 0) { + *retval = 0; + return EINVAL; + } bytecnt = nbyte; if ((error = fo_write(fp, auio, flags, ctx))) { @@ -911,7 +923,7 @@ ioctl(struct proc *p, struct ioctl_args *uap, __unused int32_t *retval) break; } if (fp->f_type == DTYPE_PIPE) { - error = fo_ioctl(fp, (int)TIOCSPGRP, (caddr_t)&tmp, &context); + error = fo_ioctl(fp, TIOCSPGRP, (caddr_t)&tmp, &context); break; } if (tmp <= 0) { @@ -925,7 +937,7 @@ ioctl(struct proc *p, struct ioctl_args *uap, __unused int32_t *retval) tmp = p1->p_pgrpid; proc_rele(p1); } - error = fo_ioctl(fp, (int)TIOCSPGRP, (caddr_t)&tmp, &context); + error = fo_ioctl(fp, TIOCSPGRP, (caddr_t)&tmp, &context); break; case FIOGETOWN: @@ -1623,7 +1635,7 @@ selscan(struct proc *p, struct _select *sel, struct _select_data * seldata, bits = iptr[i / NFDBITS]; while ((j = ffs(bits)) && (fd = i + --j) < nfd) { - bits &= ~(1 << j); + bits &= ~(1U << j); if (fd < fdp->fd_nfiles) { fp = fdp->fd_ofiles[fd]; @@ -1667,7 +1679,7 @@ selscan(struct proc *p, struct _select *sel, struct _select_data * seldata, /* The select; set the bit, if true */ if (fp->f_ops && fp->f_type && fo_select(fp, flag[msk], rl_ptr, &context)) { - optr[fd / NFDBITS] |= (1 << (fd % NFDBITS)); + optr[fd / NFDBITS] |= (1U << (fd % NFDBITS)); n++; } if (sel_pass == SEL_FIRSTPASS) { @@ -1699,13 +1711,7 @@ selscan(struct proc *p, struct _select *sel, struct _select_data * seldata, return 0; } -int poll_callback(struct kqueue *, struct kevent_internal_s *, void *); - -struct poll_continue_args { - user_addr_t pca_fds; - u_int pca_nfds; - u_int pca_rfds; -}; +static int poll_callback(struct kevent_qos_s *, kevent_ctx_t); int poll(struct proc *p, struct poll_args *uap, int32_t *retval) @@ -1718,15 +1724,11 @@ poll(struct proc *p, struct poll_args *uap, int32_t *retval) int poll_nocancel(struct proc *p, struct poll_nocancel_args *uap, int32_t *retval) { - struct poll_continue_args *cont; - struct pollfd *fds; - struct kqueue *kq; - struct timeval atv; + struct pollfd *fds = NULL; + struct kqueue *kq = NULL; int ncoll, error = 0; u_int nfds = uap->nfds; u_int rfds = 0; - u_int i; - size_t ni; /* * This is kinda bogus. We have fd limits, but that is not @@ -1740,46 +1742,30 @@ poll_nocancel(struct proc *p, struct poll_nocancel_args *uap, int32_t *retval) return EINVAL; } - kq = kqueue_alloc(p, 0); + kq = kqueue_alloc(p); if (kq == NULL) { return EAGAIN; } - ni = nfds * sizeof(struct pollfd) + sizeof(struct poll_continue_args); - MALLOC(cont, struct poll_continue_args *, ni, M_TEMP, M_WAITOK); - if (NULL == cont) { - error = EAGAIN; - goto out; - } - - fds = (struct pollfd *)&cont[1]; - error = copyin(uap->fds, fds, nfds * sizeof(struct pollfd)); - if (error) { - goto out; - } - - if (uap->timeout != -1) { - struct timeval rtv; + if (nfds) { + size_t ni = nfds * sizeof(struct pollfd); + MALLOC(fds, struct pollfd *, ni, M_TEMP, M_WAITOK); + if (NULL == fds) { + error = EAGAIN; + goto out; + } - atv.tv_sec = uap->timeout / 1000; - atv.tv_usec = (uap->timeout % 1000) * 1000; - if (itimerfix(&atv)) { - error = EINVAL; + error = copyin(uap->fds, fds, nfds * sizeof(struct pollfd)); + if (error) { goto out; } - getmicrouptime(&rtv); - timevaladd(&atv, &rtv); - } else { - atv.tv_sec = 0; - atv.tv_usec = 0; } /* JMM - all this P_SELECT stuff is bogus */ ncoll = nselcoll; OSBitOrAtomic(P_SELECT, &p->p_flag); - for (i = 0; i < nfds; i++) { + for (u_int i = 0; i < nfds; i++) { short events = fds[i].events; - KNOTE_LOCK_CTX(knlc); __assert_only int rc; /* per spec, ignore fd values below zero */ @@ -1789,7 +1775,7 @@ poll_nocancel(struct proc *p, struct poll_nocancel_args *uap, int32_t *retval) } /* convert the poll event into a kqueue kevent */ - struct kevent_internal_s kev = { + struct kevent_qos_s kev = { .ident = fds[i].fd, .flags = EV_ADD | EV_ONESHOT | EV_POLL, .udata = CAST_USER_ADDR_T(&fds[i]) @@ -1801,7 +1787,7 @@ poll_nocancel(struct proc *p, struct poll_nocancel_args *uap, int32_t *retval) if (events & (POLLPRI | POLLRDBAND)) { kev.flags |= EV_OOBAND; } - rc = kevent_register(kq, &kev, &knlc); + rc = kevent_register(kq, &kev, NULL); assert((rc & FILTER_REGISTER_WAIT) == 0); } @@ -1809,7 +1795,7 @@ poll_nocancel(struct proc *p, struct poll_nocancel_args *uap, int32_t *retval) if ((kev.flags & EV_ERROR) == 0 && (events & (POLLOUT | POLLWRNORM | POLLWRBAND))) { kev.filter = EVFILT_WRITE; - rc = kevent_register(kq, &kev, &knlc); + rc = kevent_register(kq, &kev, NULL); assert((rc & FILTER_REGISTER_WAIT) == 0); } @@ -1830,7 +1816,7 @@ poll_nocancel(struct proc *p, struct poll_nocancel_args *uap, int32_t *retval) if (events & POLLWRITE) { kev.fflags |= NOTE_WRITE; } - rc = kevent_register(kq, &kev, &knlc); + rc = kevent_register(kq, &kev, NULL); assert((rc & FILTER_REGISTER_WAIT) == 0); } @@ -1854,21 +1840,27 @@ poll_nocancel(struct proc *p, struct poll_nocancel_args *uap, int32_t *retval) goto done; } + /* scan for, and possibly wait for, the kevents to trigger */ + kevent_ctx_t kectx = kevent_get_context(current_thread()); + *kectx = (struct kevent_ctx_s){ + .kec_process_noutputs = rfds, + .kec_process_flags = KEVENT_FLAG_POLL, + .kec_deadline = 0, /* wait forever */ + }; + /* * If any events have trouble registering, an event has fired and we - * shouldn't wait for events in kqueue_scan -- use the current time as - * the deadline. + * shouldn't wait for events in kqueue_scan. */ if (rfds) { - getmicrouptime(&atv); + kectx->kec_process_flags |= KEVENT_FLAG_IMMEDIATE; + } else if (uap->timeout != -1) { + clock_interval_to_deadline(uap->timeout, NSEC_PER_MSEC, + &kectx->kec_deadline); } - /* scan for, and possibly wait for, the kevents to trigger */ - cont->pca_fds = uap->fds; - cont->pca_nfds = nfds; - cont->pca_rfds = rfds; - error = kqueue_scan(kq, poll_callback, NULL, cont, NULL, &atv, p); - rfds = cont->pca_rfds; + error = kqueue_scan(kq, kectx->kec_process_flags, kectx, poll_callback); + rfds = kectx->kec_process_noutputs; done: OSBitAndAtomic(~((uint32_t)P_SELECT), &p->p_flag); @@ -1876,27 +1868,23 @@ done: if (error == ERESTART) { error = EINTR; } - if (error == EWOULDBLOCK) { - error = 0; - } if (error == 0) { error = copyout(fds, uap->fds, nfds * sizeof(struct pollfd)); *retval = rfds; } out: - if (NULL != cont) { - FREE(cont, M_TEMP); + if (NULL != fds) { + FREE(fds, M_TEMP); } kqueue_dealloc(kq); return error; } -int -poll_callback(__unused struct kqueue *kq, struct kevent_internal_s *kevp, void *data) +static int +poll_callback(struct kevent_qos_s *kevp, kevent_ctx_t kectx) { - struct poll_continue_args *cont = (struct poll_continue_args *)data; struct pollfd *fds = CAST_DOWN(struct pollfd *, kevp->udata); short prev_revents = fds->revents; short mask = 0; @@ -1945,7 +1933,7 @@ poll_callback(__unused struct kqueue *kq, struct kevent_internal_s *kevp, void * } if (fds->revents != 0 && prev_revents == 0) { - cont->pca_rfds++; + kectx->kec_process_noutputs++; } return 0; @@ -2011,7 +1999,7 @@ selcount(struct proc *p, u_int32_t *ibits, int nfd, int *countp) for (i = 0; i < nfd; i += NFDBITS) { bits = iptr[i / NFDBITS]; while ((j = ffs(bits)) && (fd = i + --j) < nfd) { - bits &= ~(1 << j); + bits &= ~(1U << j); if (fd < fdp->fd_nfiles) { fp = fdp->fd_ofiles[fd]; @@ -2025,7 +2013,7 @@ selcount(struct proc *p, u_int32_t *ibits, int nfd, int *countp) error = EBADF; goto bad; } - fp->f_iocount++; + os_ref_retain_locked(&fp->f_iocount); n++; } } @@ -2111,7 +2099,7 @@ seldrop_locked(struct proc *p, u_int32_t *ibits, int nfd, int lim, int *need_wak for (i = 0; i < nfd; i += NFDBITS) { bits = iptr[i / NFDBITS]; while ((j = ffs(bits)) && (fd = i + --j) < nfd) { - bits &= ~(1 << j); + bits &= ~(1U << j); fp = fdp->fd_ofiles[fd]; /* * If we've already dropped as many as were @@ -2138,12 +2126,12 @@ seldrop_locked(struct proc *p, u_int32_t *ibits, int nfd, int lim, int *need_wak continue; } - fp->f_iocount--; - if (fp->f_iocount < 0) { + const os_ref_count_t refc = os_ref_release_locked(&fp->f_iocount); + if (0 == refc) { panic("f_iocount overdecrement!"); } - if (fp->f_iocount == 0) { + if (1 == refc) { /* * The last iocount is responsible for clearing * selconfict flag - even if we didn't set it - @@ -3184,7 +3172,6 @@ waitevent_close(struct proc *p, struct fileproc *fp) * * Parameters: uuid_buf Pointer to buffer to receive UUID * timeout Timespec for timout - * spi SPI, skip sandbox check (temporary) * * Returns: 0 Success * EWOULDBLOCK Timeout is too short @@ -3202,7 +3189,8 @@ gethostuuid(struct proc *p, struct gethostuuid_args *uap, __unused int32_t *retv mach_timespec_t mach_ts; /* for IOKit call */ __darwin_uuid_t uuid_kern = {}; /* for IOKit call */ - if (!uap->spi) { + /* Check entitlement */ + if (!IOTaskHasEntitlement(current_task(), "com.apple.private.getprivatesysid")) { #if CONFIG_EMBEDDED #if CONFIG_MACF if ((error = mac_system_check_info(kauth_cred_get(), "hw.uuid")) != 0) { @@ -3403,6 +3391,86 @@ telemetry(__unused struct proc *p, struct telemetry_args *args, __unused int32_t return error; } +/* + * Logging + * + * Description: syscall to access kernel logging from userspace + * + * Args: + * tag - used for syncing with userspace on the version. + * flags - flags used by the syscall. + * buffer - userspace address of string to copy. + * size - size of buffer. + */ +int +log_data(__unused struct proc *p, struct log_data_args *args, int *retval) +{ + unsigned int tag = args->tag; + unsigned int flags = args->flags; + user_addr_t buffer = args->buffer; + unsigned int size = args->size; + int ret = 0; + char *log_msg = NULL; + int error; + *retval = 0; + + /* + * Tag synchronize the syscall version with userspace. + * Tag == 0 => flags == OS_LOG_TYPE + */ + if (tag != 0) { + return EINVAL; + } + + /* + * OS_LOG_TYPE are defined in libkern/os/log.h + * In userspace they are defined in libtrace/os/log.h + */ + if (flags != OS_LOG_TYPE_DEFAULT && + flags != OS_LOG_TYPE_INFO && + flags != OS_LOG_TYPE_DEBUG && + flags != OS_LOG_TYPE_ERROR && + flags != OS_LOG_TYPE_FAULT) { + return EINVAL; + } + + if (size == 0) { + return EINVAL; + } + + /* truncate to OS_LOG_DATA_MAX_SIZE */ + if (size > OS_LOG_DATA_MAX_SIZE) { + printf("%s: WARNING msg is going to be truncated from %u to %u\n", __func__, size, OS_LOG_DATA_MAX_SIZE); + size = OS_LOG_DATA_MAX_SIZE; + } + + log_msg = kalloc(size); + if (!log_msg) { + return ENOMEM; + } + + error = copyin(buffer, log_msg, size); + if (error) { + ret = EFAULT; + goto out; + } + log_msg[size - 1] = '\0'; + + /* + * This will log to dmesg and logd. + * The call will fail if the current + * process is not a driverKit process. + */ + os_log_driverKit(&ret, OS_LOG_DEFAULT, flags, "%s", log_msg); + +out: + if (log_msg != NULL) { + kfree(log_msg, size); + } + + return ret; +} + #if DEVELOPMENT || DEBUG #if CONFIG_WAITQ_DEBUG static uint64_t g_wqset_num = 0; @@ -3835,6 +3903,30 @@ SYSCTL_PROC(_kern, OID_AUTO, n_ltable_entries, CTLFLAG_RD | CTLFLAG_LOCKED, 0, 0, sysctl_waitq_set_nelem, "I", "ltable elementis currently used"); +static int +sysctl_mpsc_test_pingpong SYSCTL_HANDLER_ARGS +{ +#pragma unused(oidp, arg1, arg2) + uint64_t value = 0; + int error; + + error = SYSCTL_IN(req, &value, sizeof(value)); + if (error) { + return error; + } + + if (error == 0 && req->newptr) { + error = mpsc_test_pingpong(value, &value); + if (error == 0) { + error = SYSCTL_OUT(req, &value, sizeof(value)); + } + } + + return error; +} +SYSCTL_PROC(_kern, OID_AUTO, mpsc_test_pingpong, CTLTYPE_QUAD | CTLFLAG_RW | CTLFLAG_LOCKED, + 0, 0, sysctl_mpsc_test_pingpong, "Q", "MPSC tests: pingpong"); + #endif /* DEVELOPMENT || DEBUG */ /*Remote Time api*/ @@ -3858,7 +3950,7 @@ static int sysctl_mach_bridge_timer_enable SYSCTL_HANDLER_ARGS req->oldidx = sizeof(value); return 0; } - if (bt_init_flag) { + if (os_atomic_load(&bt_init_flag, acquire)) { if (req->newptr) { int new_value = 0; error = SYSCTL_IN(req, &new_value, sizeof(new_value)); @@ -3931,6 +4023,13 @@ SYSCTL_PROC(_machdep_remotetime, OID_AUTO, conversion_params, #endif /* CONFIG_MACH_BRIDGE_RECV_TIME */ +#if DEVELOPMENT || DEBUG +#endif /* DEVELOPMENT || DEBUG */ + +extern uint32_t task_exc_guard_default; + +SYSCTL_INT(_kern, OID_AUTO, task_exc_guard_default, + CTLFLAG_RD | CTLFLAG_LOCKED, &task_exc_guard_default, 0, ""); static int @@ -4022,4 +4121,4 @@ sysctl_kern_sched_thread_set_no_smt(__unused struct sysctl_oid *oidp, __unused v SYSCTL_PROC(_kern, OID_AUTO, sched_thread_set_no_smt, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED | CTLFLAG_ANYBODY, 0, 0, sysctl_kern_sched_thread_set_no_smt, "I", ""); -#endif +#endif /* DEVELOPMENT || DEBUG */ diff --git a/bsd/kern/sys_persona.c b/bsd/kern/sys_persona.c index e2964c118..186f82993 100644 --- a/bsd/kern/sys_persona.c +++ b/bsd/kern/sys_persona.c @@ -35,7 +35,16 @@ #include #include +#include +#include +#include +#include + #include +#include + +extern kern_return_t bank_get_bank_ledger_thread_group_and_persona(void *voucher, + void *bankledger, void **banktg, uint32_t *persona_id); static int kpersona_copyin(user_addr_t infop, struct kpersona_info *kinfo) @@ -84,19 +93,16 @@ kpersona_copyout(struct kpersona_info *kinfo, user_addr_t infop) static int -kpersona_alloc_syscall(user_addr_t infop, user_addr_t idp) +kpersona_alloc_syscall(user_addr_t infop, user_addr_t idp, user_addr_t path) { int error; struct kpersona_info kinfo; - struct persona *persona; + struct persona *persona = NULL; uid_t id = PERSONA_ID_NONE; const char *login; + char *pna_path = NULL; - /* - * TODO: rdar://problem/19981151 - * Add entitlement check! - */ - if (!kauth_cred_issuser(kauth_cred_get())) { + if (!IOTaskHasEntitlement(current_task(), PERSONA_MGMT_ENTITLEMENT)) { return EPERM; } @@ -110,12 +116,31 @@ kpersona_alloc_syscall(user_addr_t infop, user_addr_t idp) id = kinfo.persona_id; } + if (path) { + MALLOC_ZONE(pna_path, caddr_t, MAXPATHLEN, M_NAMEI, M_WAITOK | M_ZERO); + if (pna_path == NULL) { + return ENOMEM; + } + size_t pathlen; + error = copyinstr(path, (void *)pna_path, MAXPATHLEN, &pathlen); + if (error) { + FREE_ZONE(pna_path, MAXPATHLEN, M_NAMEI); + return error; + } + } + error = 0; - persona = persona_alloc(id, login, kinfo.persona_type, &error); + persona = persona_alloc(id, login, kinfo.persona_type, pna_path, &error); if (!persona) { + if (pna_path != NULL) { + FREE_ZONE(pna_path, MAXPATHLEN, M_NAMEI); + } return error; } + /* persona struct contains a reference to pna_path */ + pna_path = NULL; + error = persona_init_begin(persona); if (error) { goto out_persona_err; @@ -153,6 +178,11 @@ kpersona_alloc_syscall(user_addr_t infop, user_addr_t idp) goto out_persona_err; } + error = persona_verify_and_set_uniqueness(persona); + if (error) { + goto out_persona_err; + } + persona_init_end(persona, error); /* @@ -182,7 +212,7 @@ kpersona_dealloc_syscall(user_addr_t idp) uid_t persona_id; struct persona *persona; - if (!kauth_cred_issuser(kauth_cred_get())) { + if (!IOTaskHasEntitlement(current_task(), PERSONA_MGMT_ENTITLEMENT)) { return EPERM; } @@ -211,7 +241,9 @@ static int kpersona_get_syscall(user_addr_t idp) { int error; - struct persona *persona = current_persona_get(); + struct persona *persona; + + persona = current_persona_get(); if (!persona) { return ESRCH; @@ -223,10 +255,60 @@ kpersona_get_syscall(user_addr_t idp) return error; } +static int +kpersona_getpath_syscall(user_addr_t idp, user_addr_t path) +{ + int error; + uid_t persona_id; + struct persona *persona; + size_t pathlen; + uid_t current_persona_id = PERSONA_ID_NONE; + + if (!path) { + return EINVAL; + } + + error = copyin(idp, &persona_id, sizeof(persona_id)); + if (error) { + return error; + } + + /* Get current thread's persona id to compare if the + * input persona_id matches the current persona id + */ + persona = current_persona_get(); + if (persona) { + current_persona_id = persona->pna_id; + } + + if (persona_id && persona_id != current_persona_id) { + /* Release the reference on the current persona id's persona */ + persona_put(persona); + if (!kauth_cred_issuser(kauth_cred_get()) && + !IOTaskHasEntitlement(current_task(), PERSONA_MGMT_ENTITLEMENT)) { + return EPERM; + } + persona = persona_lookup(persona_id); + } + + if (!persona) { + return ESRCH; + } + + if (persona->pna_path) { + error = copyoutstr((void *)persona->pna_path, path, MAXPATHLEN, &pathlen); + } + + persona_put(persona); + + return error; +} + static int kpersona_info_syscall(user_addr_t idp, user_addr_t infop) { int error; + uid_t current_persona_id = PERSONA_ID_NONE; uid_t persona_id; struct persona *persona; struct kpersona_info kinfo; @@ -236,12 +318,24 @@ kpersona_info_syscall(user_addr_t idp, user_addr_t infop) return error; } - /* - * TODO: rdar://problem/19981151 - * Add entitlement check! + /* Get current thread's persona id to compare if the + * input persona_id matches the current persona id */ + persona = current_persona_get(); + if (persona) { + current_persona_id = persona->pna_id; + } + + if (persona_id && persona_id != current_persona_id) { + /* Release the reference on the current persona id's persona */ + persona_put(persona); + if (!kauth_cred_issuser(kauth_cred_get()) && + !IOTaskHasEntitlement(current_task(), PERSONA_MGMT_ENTITLEMENT)) { + return EPERM; + } + persona = persona_lookup(persona_id); + } - persona = persona_lookup(persona_id); if (!persona) { return ESRCH; } @@ -350,7 +444,7 @@ kpersona_find_syscall(user_addr_t infop, user_addr_t idp, user_addr_t idlenp) } k_idlen = u_idlen; - error = persona_find(login, kinfo.persona_id, persona, &k_idlen); + error = persona_find_all(login, kinfo.persona_id, kinfo.persona_type, persona, &k_idlen); if (error) { goto out; } @@ -381,7 +475,6 @@ out: return error; } - /* * Syscall entry point / demux. */ @@ -393,10 +486,14 @@ persona(__unused proc_t p, struct persona_args *pargs, __unused int32_t *retval) /* uint32_t flags = pargs->flags; */ user_addr_t infop = pargs->info; user_addr_t idp = pargs->id; + user_addr_t path = pargs->path; switch (op) { case PERSONA_OP_ALLOC: - error = kpersona_alloc_syscall(infop, idp); + error = kpersona_alloc_syscall(infop, idp, USER_ADDR_NULL); + break; + case PERSONA_OP_PALLOC: + error = kpersona_alloc_syscall(infop, idp, path); break; case PERSONA_OP_DEALLOC: error = kpersona_dealloc_syscall(idp); @@ -404,6 +501,9 @@ persona(__unused proc_t p, struct persona_args *pargs, __unused int32_t *retval) case PERSONA_OP_GET: error = kpersona_get_syscall(idp); break; + case PERSONA_OP_GETPATH: + error = kpersona_getpath_syscall(idp, path); + break; case PERSONA_OP_INFO: error = kpersona_info_syscall(idp, infop); break; @@ -411,6 +511,7 @@ persona(__unused proc_t p, struct persona_args *pargs, __unused int32_t *retval) error = kpersona_pidinfo_syscall(idp, infop); break; case PERSONA_OP_FIND: + case PERSONA_OP_FIND_BY_TYPE: error = kpersona_find_syscall(infop, idp, pargs->idlen); break; default: diff --git a/bsd/kern/sys_pipe.c b/bsd/kern/sys_pipe.c index cf0e5f2b0..ef7dcbab1 100644 --- a/bsd/kern/sys_pipe.c +++ b/bsd/kern/sys_pipe.c @@ -159,6 +159,15 @@ #define f_offset f_fglob->fg_offset #define f_data f_fglob->fg_data +struct pipepair { + lck_mtx_t pp_mtx; + struct pipe pp_rpipe; + struct pipe pp_wpipe; +}; + +#define PIPE_PAIR(pipe) \ + __container_of(PIPE_MTX(pipe), struct pipepair, pp_mtx) + /* * interfaces to the outside world exported through file operations */ @@ -170,45 +179,57 @@ static int pipe_close(struct fileglob *fg, vfs_context_t ctx); static int pipe_select(struct fileproc *fp, int which, void * wql, vfs_context_t ctx); static int pipe_kqfilter(struct fileproc *fp, struct knote *kn, - struct kevent_internal_s *kev, vfs_context_t ctx); + struct kevent_qos_s *kev); static int pipe_ioctl(struct fileproc *fp, u_long cmd, caddr_t data, vfs_context_t ctx); static int pipe_drain(struct fileproc *fp, vfs_context_t ctx); static const struct fileops pipeops = { - .fo_type = DTYPE_PIPE, - .fo_read = pipe_read, - .fo_write = pipe_write, - .fo_ioctl = pipe_ioctl, - .fo_select = pipe_select, - .fo_close = pipe_close, + .fo_type = DTYPE_PIPE, + .fo_read = pipe_read, + .fo_write = pipe_write, + .fo_ioctl = pipe_ioctl, + .fo_select = pipe_select, + .fo_close = pipe_close, + .fo_drain = pipe_drain, .fo_kqfilter = pipe_kqfilter, - .fo_drain = pipe_drain, }; static void filt_pipedetach(struct knote *kn); +static int filt_pipenotsup(struct knote *kn, long hint); +static int filt_pipenotsuptouch(struct knote *kn, struct kevent_qos_s *kev); +static int filt_pipenotsupprocess(struct knote *kn, struct kevent_qos_s *kev); + static int filt_piperead(struct knote *kn, long hint); -static int filt_pipereadtouch(struct knote *kn, struct kevent_internal_s *kev); -static int filt_pipereadprocess(struct knote *kn, struct filt_process_s *data, struct kevent_internal_s *kev); +static int filt_pipereadtouch(struct knote *kn, struct kevent_qos_s *kev); +static int filt_pipereadprocess(struct knote *kn, struct kevent_qos_s *kev); static int filt_pipewrite(struct knote *kn, long hint); -static int filt_pipewritetouch(struct knote *kn, struct kevent_internal_s *kev); -static int filt_pipewriteprocess(struct knote *kn, struct filt_process_s *data, struct kevent_internal_s *kev); +static int filt_pipewritetouch(struct knote *kn, struct kevent_qos_s *kev); +static int filt_pipewriteprocess(struct knote *kn, struct kevent_qos_s *kev); + +SECURITY_READ_ONLY_EARLY(struct filterops) pipe_nfiltops = { + .f_isfd = 1, + .f_detach = filt_pipedetach, + .f_event = filt_pipenotsup, + .f_touch = filt_pipenotsuptouch, + .f_process = filt_pipenotsupprocess, +}; SECURITY_READ_ONLY_EARLY(struct filterops) pipe_rfiltops = { - .f_isfd = 1, - .f_detach = filt_pipedetach, - .f_event = filt_piperead, - .f_touch = filt_pipereadtouch, + .f_isfd = 1, + .f_detach = filt_pipedetach, + .f_event = filt_piperead, + .f_touch = filt_pipereadtouch, .f_process = filt_pipereadprocess, }; SECURITY_READ_ONLY_EARLY(struct filterops) pipe_wfiltops = { - .f_isfd = 1, - .f_detach = filt_pipedetach, - .f_event = filt_pipewrite, - .f_touch = filt_pipewritetouch, + .f_isfd = 1, + .f_detach = filt_pipedetach, + .f_event = filt_pipewrite, + .f_touch = filt_pipewritetouch, .f_process = filt_pipewriteprocess, }; @@ -235,9 +256,9 @@ SYSCTL_INT(_kern_ipc, OID_AUTO, pipekvawired, CTLFLAG_RD | CTLFLAG_LOCKED, &amountpipekvawired, 0, "Pipe wired KVA usage"); #endif +static int pipepair_alloc(struct pipe **rpipe, struct pipe **wpipe); static void pipeclose(struct pipe *cpipe); static void pipe_free_kmem(struct pipe *cpipe); -static int pipe_create(struct pipe **cpipep); static int pipespace(struct pipe *cpipe, int size); static int choose_pipespace(unsigned long current, unsigned long expected); static int expand_pipespace(struct pipe *p, int target_size); @@ -256,23 +277,6 @@ static zone_t pipe_zone; #define MAX_PIPESIZE(pipe) ( MAX(PIPE_SIZE, (pipe)->pipe_buffer.size) ) -#define PIPE_GARBAGE_AGE_LIMIT 5000 /* In milliseconds */ -#define PIPE_GARBAGE_QUEUE_LIMIT 32000 - -struct pipe_garbage { - struct pipe *pg_pipe; - struct pipe_garbage *pg_next; - uint64_t pg_timestamp; -}; - -static zone_t pipe_garbage_zone; -static struct pipe_garbage *pipe_garbage_head = NULL; -static struct pipe_garbage *pipe_garbage_tail = NULL; -static uint64_t pipe_garbage_age_limit = PIPE_GARBAGE_AGE_LIMIT; -static int pipe_garbage_count = 0; -static lck_mtx_t *pipe_garbage_lock; -static void pipe_garbage_collect(struct pipe *cpipe); - SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_ANY, pipeinit, NULL); /* initial setup done at time of sysinit */ @@ -282,8 +286,8 @@ pipeinit(void) nbigpipe = 0; vm_size_t zone_size; - zone_size = 8192 * sizeof(struct pipe); - pipe_zone = zinit(sizeof(struct pipe), zone_size, 4096, "pipe zone"); + zone_size = 8192 * sizeof(struct pipepair); + pipe_zone = zinit(sizeof(struct pipepair), zone_size, 4096, "pipe zone"); /* allocate lock group attribute and group for pipe mutexes */ @@ -292,15 +296,6 @@ pipeinit(void) /* allocate the lock attribute for pipe mutexes */ pipe_mtx_attr = lck_attr_alloc_init(); - - /* - * Set up garbage collection for dead pipes - */ - zone_size = (PIPE_GARBAGE_QUEUE_LIMIT + 20) * - sizeof(struct pipe_garbage); - pipe_garbage_zone = (zone_t)zinit(sizeof(struct pipe_garbage), - zone_size, 4096, "pipe garbage zone"); - pipe_garbage_lock = lck_mtx_alloc_init(pipe_mtx_grp, pipe_mtx_attr); } #ifndef CONFIG_EMBEDDED @@ -422,46 +417,27 @@ pipe(proc_t p, __unused struct pipe_args *uap, int32_t *retval) { struct fileproc *rf, *wf; struct pipe *rpipe, *wpipe; - lck_mtx_t *pmtx; - int fd, error; + int error; - if ((pmtx = lck_mtx_alloc_init(pipe_mtx_grp, pipe_mtx_attr)) == NULL) { - return ENOMEM; + error = pipepair_alloc(&rpipe, &wpipe); + if (error) { + return error; } - rpipe = wpipe = NULL; - if (pipe_create(&rpipe) || pipe_create(&wpipe)) { - error = ENFILE; - goto freepipes; - } /* - * allocate the space for the normal I/O direction up - * front... we'll delay the allocation for the other - * direction until a write actually occurs (most likely it won't)... + * for now we'll create half-duplex pipes(refer returns section above). + * this is what we've always supported.. */ - error = pipespace(rpipe, choose_pipespace(rpipe->pipe_buffer.size, 0)); - if (error) { - goto freepipes; - } - - TAILQ_INIT(&rpipe->pipe_evlist); - TAILQ_INIT(&wpipe->pipe_evlist); - error = falloc(p, &rf, &fd, vfs_context_current()); + error = falloc(p, &rf, &retval[0], vfs_context_current()); if (error) { goto freepipes; } - retval[0] = fd; - - /* - * for now we'll create half-duplex pipes(refer returns section above). - * this is what we've always supported.. - */ rf->f_flag = FREAD; rf->f_data = (caddr_t)rpipe; rf->f_ops = &pipeops; - error = falloc(p, &wf, &fd, vfs_context_current()); + error = falloc(p, &wf, &retval[1], vfs_context_current()); if (error) { fp_free(p, retval[0], rf); goto freepipes; @@ -472,10 +448,7 @@ pipe(proc_t p, __unused struct pipe_args *uap, int32_t *retval) rpipe->pipe_peer = wpipe; wpipe->pipe_peer = rpipe; - /* both structures share the same mutex */ - rpipe->pipe_mtxp = wpipe->pipe_mtxp = pmtx; - retval[1] = fd; #if CONFIG_MACF /* * XXXXXXXX SHOULD NOT HOLD FILE_LOCK() XXXXXXXXXXXX @@ -495,15 +468,11 @@ pipe(proc_t p, __unused struct pipe_args *uap, int32_t *retval) fp_drop(p, retval[0], rf, 1); fp_drop(p, retval[1], wf, 1); proc_fdunlock(p); - - return 0; freepipes: pipeclose(rpipe); pipeclose(wpipe); - lck_mtx_free(pmtx, pipe_mtx_grp); - return error; } @@ -577,7 +546,7 @@ pipe_stat(struct pipe *cpipe, void *ub, int isstat64) * address of this pipe's struct pipe. This number may be recycled * relatively quickly. */ - sb64->st_ino = (ino64_t)VM_KERNEL_ADDRPERM((uintptr_t)cpipe); + sb64->st_ino = (ino64_t)VM_KERNEL_ADDRHASH((uintptr_t)cpipe); } else { sb = (struct stat *)ub; @@ -604,7 +573,7 @@ pipe_stat(struct pipe *cpipe, void *ub, int isstat64) * address of this pipe's struct pipe. This number may be recycled * relatively quickly. */ - sb->st_ino = (ino_t)VM_KERNEL_ADDRPERM((uintptr_t)cpipe); + sb->st_ino = (ino_t)VM_KERNEL_ADDRHASH((uintptr_t)cpipe); } PIPE_UNLOCK(cpipe); @@ -657,12 +626,13 @@ pipespace(struct pipe *cpipe, int size) * initialize and allocate VM and memory for pipe */ static int -pipe_create(struct pipe **cpipep) +pipepair_alloc(struct pipe **rp_out, struct pipe **wp_out) { - struct pipe *cpipe; - cpipe = (struct pipe *)zalloc(pipe_zone); + struct pipepair *pp = zalloc(pipe_zone); + struct pipe *rpipe = &pp->pp_rpipe; + struct pipe *wpipe = &pp->pp_wpipe; - if ((*cpipep = cpipe) == NULL) { + if (pp == NULL) { return ENOMEM; } @@ -670,15 +640,61 @@ pipe_create(struct pipe **cpipep) * protect so pipespace or pipeclose don't follow a junk pointer * if pipespace() fails. */ - bzero(cpipe, sizeof *cpipe); + bzero(pp, sizeof(struct pipepair)); + lck_mtx_init(&pp->pp_mtx, pipe_mtx_grp, pipe_mtx_attr); + + rpipe->pipe_mtxp = &pp->pp_mtx; + wpipe->pipe_mtxp = &pp->pp_mtx; + + TAILQ_INIT(&rpipe->pipe_evlist); + TAILQ_INIT(&wpipe->pipe_evlist); #ifndef CONFIG_EMBEDDED /* Initial times are all the time of creation of the pipe */ - pipe_touch(cpipe, PIPE_ATIME | PIPE_MTIME | PIPE_CTIME); + pipe_touch(rpipe, PIPE_ATIME | PIPE_MTIME | PIPE_CTIME); + pipe_touch(wpipe, PIPE_ATIME | PIPE_MTIME | PIPE_CTIME); #endif + + /* + * allocate the space for the normal I/O direction up + * front... we'll delay the allocation for the other + * direction until a write actually occurs (most likely it won't)... + */ + int error = pipespace(rpipe, choose_pipespace(rpipe->pipe_buffer.size, 0)); + if (__improbable(error)) { + lck_mtx_destroy(&pp->pp_mtx, pipe_mtx_grp); + zfree(pipe_zone, pp); + return error; + } + + *rp_out = rpipe; + *wp_out = wpipe; return 0; } +static void +pipepair_destroy_pipe(struct pipepair *pp, struct pipe *cpipe) +{ + bool can_free; + + pipe_free_kmem(cpipe); + + lck_mtx_lock(&pp->pp_mtx); + if (__improbable(cpipe->pipe_state & PIPE_DEAD)) { + panic("double free of pipe %p in pair %p", cpipe, pp); + } + + cpipe->pipe_state |= PIPE_DEAD; + + can_free = (pp->pp_rpipe.pipe_state & PIPE_DEAD) && + (pp->pp_wpipe.pipe_state & PIPE_DEAD); + lck_mtx_unlock(&pp->pp_mtx); + + if (can_free) { + lck_mtx_destroy(&pp->pp_mtx, pipe_mtx_grp); + zfree(pipe_zone, pp); + } +} /* * lock a pipe for I/O, blocking other access @@ -722,9 +738,8 @@ pipeselwakeup(struct pipe *cpipe, struct pipe *spipe) cpipe->pipe_state &= ~PIPE_SEL; selwakeup(&cpipe->pipe_sel); } - if (cpipe->pipe_state & PIPE_KNOTE) { - KNOTE(&cpipe->pipe_sel.si_note, 1); - } + + KNOTE(&cpipe->pipe_sel.si_note, 1); postpipeevent(cpipe, EV_RWBYTES); @@ -817,7 +832,8 @@ pipe_read(struct fileproc *fp, struct uio *uio, __unused int flags, * detect EOF condition * read returns 0 on EOF, no need to set error */ - if (rpipe->pipe_state & (PIPE_DRAIN | PIPE_EOF)) { + if ((rpipe->pipe_state & (PIPE_DRAIN | PIPE_EOF)) || + (fileproc_get_vflags(fp) & FPV_DRAIN)) { break; } @@ -923,7 +939,8 @@ pipe_write(struct fileproc *fp, struct uio *uio, __unused int flags, /* * detect loss of pipe read side, issue SIGPIPE if lost. */ - if (wpipe == NULL || (wpipe->pipe_state & (PIPE_DRAIN | PIPE_EOF))) { + if (wpipe == NULL || (wpipe->pipe_state & (PIPE_DRAIN | PIPE_EOF)) || + (fileproc_get_vflags(fp) & FPV_DRAIN)) { PIPE_UNLOCK(rpipe); return EPIPE; } @@ -999,7 +1016,8 @@ retrywrite: int size; /* Transfer size */ int segsize; /* first segment to transfer */ - if (wpipe->pipe_state & (PIPE_DRAIN | PIPE_EOF)) { + if ((wpipe->pipe_state & (PIPE_DRAIN | PIPE_EOF)) || + (fileproc_get_vflags(fp) & FPV_DRAIN)) { pipeio_unlock(wpipe); error = EPIPE; break; @@ -1099,21 +1117,23 @@ retrywrite: wpipe->pipe_state &= ~PIPE_WANTR; wakeup(wpipe); } + /* - * don't block on non-blocking I/O - * we'll do the pipeselwakeup on the way out + * If read side wants to go away, we just issue a signal + * to ourselves. */ - if (fp->f_flag & FNONBLOCK) { - error = EAGAIN; + if ((wpipe->pipe_state & (PIPE_DRAIN | PIPE_EOF)) || + (fileproc_get_vflags(fp) & FPV_DRAIN)) { + error = EPIPE; break; } /* - * If read side wants to go away, we just issue a signal - * to ourselves. + * don't block on non-blocking I/O + * we'll do the pipeselwakeup on the way out */ - if (wpipe->pipe_state & (PIPE_DRAIN | PIPE_EOF)) { - error = EPIPE; + if (fp->f_flag & FNONBLOCK) { + error = EAGAIN; break; } @@ -1254,7 +1274,8 @@ pipe_select(struct fileproc *fp, int which, void *wql, vfs_context_t ctx) case FREAD: if ((rpipe->pipe_state & PIPE_DIRECTW) || (rpipe->pipe_buffer.cnt > 0) || - (rpipe->pipe_state & (PIPE_DRAIN | PIPE_EOF))) { + (rpipe->pipe_state & (PIPE_DRAIN | PIPE_EOF)) || + (fileproc_get_vflags(fp) & FPV_DRAIN)) { retnum = 1; } else { rpipe->pipe_state |= PIPE_SEL; @@ -1267,6 +1288,7 @@ pipe_select(struct fileproc *fp, int which, void *wql, vfs_context_t ctx) wpipe->pipe_state |= PIPE_WSELECT; } if (wpipe == NULL || (wpipe->pipe_state & (PIPE_DRAIN | PIPE_EOF)) || + (fileproc_get_vflags(fp) & FPV_DRAIN) || (((wpipe->pipe_state & PIPE_DIRECTW) == 0) && (MAX_PIPESIZE(wpipe) - wpipe->pipe_buffer.cnt) >= PIPE_BUF)) { retnum = 1; @@ -1324,14 +1346,7 @@ pipeclose(struct pipe *cpipe) { struct pipe *ppipe; - if (cpipe == NULL) { - return; - } - /* partially created pipes won't have a valid mutex. */ - if (PIPE_MTX(cpipe) != NULL) { - PIPE_LOCK(cpipe); - } - + PIPE_LOCK(cpipe); /* * If the other side is blocked, wake it up saying that @@ -1367,9 +1382,7 @@ pipeclose(struct pipe *cpipe) pipeselwakeup(ppipe, ppipe); wakeup(ppipe); - if (cpipe->pipe_state & PIPE_KNOTE) { - KNOTE(&ppipe->pipe_sel.si_note, 1); - } + KNOTE(&ppipe->pipe_sel.si_note, 1); postpipeevent(ppipe, EV_RCLOSED); @@ -1380,76 +1393,114 @@ pipeclose(struct pipe *cpipe) /* * free resources */ - if (PIPE_MTX(cpipe) != NULL) { - if (ppipe != NULL) { - /* - * since the mutex is shared and the peer is still - * alive, we need to release the mutex, not free it - */ - PIPE_UNLOCK(cpipe); - } else { - /* - * peer is gone, so we're the sole party left with - * interest in this mutex... unlock and free it - */ - PIPE_UNLOCK(cpipe); - lck_mtx_free(PIPE_MTX(cpipe), pipe_mtx_grp); - } + + PIPE_UNLOCK(cpipe); + + pipepair_destroy_pipe(PIPE_PAIR(cpipe), cpipe); +} + +static int64_t +filt_pipelowwat(struct knote *kn, struct pipe *rpipe, int64_t def_lowwat) +{ + if ((kn->kn_sfflags & NOTE_LOWAT) == 0) { + return def_lowwat; } - pipe_free_kmem(cpipe); - if (cpipe->pipe_state & PIPE_WSELECT) { - pipe_garbage_collect(cpipe); - } else { - zfree(pipe_zone, cpipe); - pipe_garbage_collect(NULL); + if (rpipe->pipe_buffer.size && kn->kn_sdata > MAX_PIPESIZE(rpipe)) { + return MAX_PIPESIZE(rpipe); } + return MAX(kn->kn_sdata, def_lowwat); } -/*ARGSUSED*/ static int -filt_piperead_common(struct knote *kn, struct pipe *rpipe) +filt_pipe_draincommon(struct knote *kn, struct pipe *rpipe) { - struct pipe *wpipe; - int retval; - - /* - * we're being called back via the KNOTE post - * we made in pipeselwakeup, and we already hold the mutex... - */ + struct pipe *wpipe = rpipe->pipe_peer; - wpipe = rpipe->pipe_peer; - kn->kn_data = rpipe->pipe_buffer.cnt; if ((rpipe->pipe_state & (PIPE_DRAIN | PIPE_EOF)) || (wpipe == NULL) || (wpipe->pipe_state & (PIPE_DRAIN | PIPE_EOF))) { kn->kn_flags |= EV_EOF; - retval = 1; + return 1; + } + + return 0; +} + +static int +filt_pipenotsup(struct knote *kn, long hint) +{ +#pragma unused(hint) + struct pipe *rpipe = kn->kn_hook; + + return filt_pipe_draincommon(kn, rpipe); +} + +static int +filt_pipenotsuptouch(struct knote *kn, struct kevent_qos_s *kev) +{ + struct pipe *rpipe = kn->kn_hook; + int res; + + PIPE_LOCK(rpipe); + + /* accept new kevent data (and save off lowat threshold and flag) */ + kn->kn_sfflags = kev->fflags; + kn->kn_sdata = kev->data; + + /* determine if any event is now deemed fired */ + res = filt_pipe_draincommon(kn, rpipe); + + PIPE_UNLOCK(rpipe); + + return res; +} + +static int +filt_pipenotsupprocess(struct knote *kn, struct kevent_qos_s *kev) +{ + struct pipe *rpipe = kn->kn_hook; + int res; + + PIPE_LOCK(rpipe); + res = filt_pipe_draincommon(kn, rpipe); + if (res) { + knote_fill_kevent(kn, kev, 0); + } + PIPE_UNLOCK(rpipe); + + return res; +} + +/*ARGSUSED*/ +static int +filt_piperead_common(struct knote *kn, struct kevent_qos_s *kev, struct pipe *rpipe) +{ + int64_t data = rpipe->pipe_buffer.cnt; + int res = 0; + + if (filt_pipe_draincommon(kn, rpipe)) { + res = 1; } else { - int64_t lowwat = 1; - if (kn->kn_sfflags & NOTE_LOWAT) { - if (rpipe->pipe_buffer.size && kn->kn_sdata > MAX_PIPESIZE(rpipe)) { - lowwat = MAX_PIPESIZE(rpipe); - } else if (kn->kn_sdata > lowwat) { - lowwat = kn->kn_sdata; - } - } - retval = kn->kn_data >= lowwat; + res = data >= filt_pipelowwat(kn, rpipe, 1); } - return retval; + if (res && kev) { + knote_fill_kevent(kn, kev, data); + } + return res; } static int filt_piperead(struct knote *kn, long hint) { #pragma unused(hint) - struct pipe *rpipe = (struct pipe *)kn->kn_fp->f_data; + struct pipe *rpipe = kn->kn_hook; - return filt_piperead_common(kn, rpipe); + return filt_piperead_common(kn, NULL, rpipe); } static int -filt_pipereadtouch(struct knote *kn, struct kevent_internal_s *kev) +filt_pipereadtouch(struct knote *kn, struct kevent_qos_s *kev) { - struct pipe *rpipe = (struct pipe *)kn->kn_fp->f_data; + struct pipe *rpipe = kn->kn_hook; int retval; PIPE_LOCK(rpipe); @@ -1459,7 +1510,7 @@ filt_pipereadtouch(struct knote *kn, struct kevent_internal_s *kev) kn->kn_sfflags = kev->fflags; /* identify if any events are now fired */ - retval = filt_piperead_common(kn, rpipe); + retval = filt_piperead_common(kn, NULL, rpipe); PIPE_UNLOCK(rpipe); @@ -1467,21 +1518,13 @@ filt_pipereadtouch(struct knote *kn, struct kevent_internal_s *kev) } static int -filt_pipereadprocess(struct knote *kn, struct filt_process_s *data, struct kevent_internal_s *kev) +filt_pipereadprocess(struct knote *kn, struct kevent_qos_s *kev) { -#pragma unused(data) - struct pipe *rpipe = (struct pipe *)kn->kn_fp->f_data; + struct pipe *rpipe = kn->kn_hook; int retval; PIPE_LOCK(rpipe); - retval = filt_piperead_common(kn, rpipe); - if (retval) { - *kev = kn->kn_kevent; - if (kn->kn_flags & EV_CLEAR) { - kn->kn_fflags = 0; - kn->kn_data = 0; - } - } + retval = filt_piperead_common(kn, kev, rpipe); PIPE_UNLOCK(rpipe); return retval; @@ -1489,33 +1532,21 @@ filt_pipereadprocess(struct knote *kn, struct filt_process_s *data, struct keven /*ARGSUSED*/ static int -filt_pipewrite_common(struct knote *kn, struct pipe *rpipe) +filt_pipewrite_common(struct knote *kn, struct kevent_qos_s *kev, struct pipe *rpipe) { - struct pipe *wpipe; - - /* - * we're being called back via the KNOTE post - * we made in pipeselwakeup, and we already hold the mutex... - */ - wpipe = rpipe->pipe_peer; + int64_t data = 0; + int res = 0; - if ((wpipe == NULL) || (wpipe->pipe_state & (PIPE_DRAIN | PIPE_EOF))) { - kn->kn_data = 0; - kn->kn_flags |= EV_EOF; - return 1; + if (filt_pipe_draincommon(kn, rpipe)) { + res = 1; + } else { + data = MAX_PIPESIZE(rpipe) - rpipe->pipe_buffer.cnt; + res = data >= filt_pipelowwat(kn, rpipe, PIPE_BUF); } - kn->kn_data = MAX_PIPESIZE(wpipe) - wpipe->pipe_buffer.cnt; - - int64_t lowwat = PIPE_BUF; - if (kn->kn_sfflags & NOTE_LOWAT) { - if (wpipe->pipe_buffer.size && kn->kn_sdata > MAX_PIPESIZE(wpipe)) { - lowwat = MAX_PIPESIZE(wpipe); - } else if (kn->kn_sdata > lowwat) { - lowwat = kn->kn_sdata; - } + if (res && kev) { + knote_fill_kevent(kn, kev, data); } - - return kn->kn_data >= lowwat; + return res; } /*ARGSUSED*/ @@ -1523,16 +1554,16 @@ static int filt_pipewrite(struct knote *kn, long hint) { #pragma unused(hint) - struct pipe *rpipe = (struct pipe *)kn->kn_fp->f_data; + struct pipe *rpipe = kn->kn_hook; - return filt_pipewrite_common(kn, rpipe); + return filt_pipewrite_common(kn, NULL, rpipe); } static int -filt_pipewritetouch(struct knote *kn, struct kevent_internal_s *kev) +filt_pipewritetouch(struct knote *kn, struct kevent_qos_s *kev) { - struct pipe *rpipe = (struct pipe *)kn->kn_fp->f_data; + struct pipe *rpipe = kn->kn_hook; int res; PIPE_LOCK(rpipe); @@ -1542,7 +1573,7 @@ filt_pipewritetouch(struct knote *kn, struct kevent_internal_s *kev) kn->kn_sdata = kev->data; /* determine if any event is now deemed fired */ - res = filt_pipewrite_common(kn, rpipe); + res = filt_pipewrite_common(kn, NULL, rpipe); PIPE_UNLOCK(rpipe); @@ -1550,21 +1581,13 @@ filt_pipewritetouch(struct knote *kn, struct kevent_internal_s *kev) } static int -filt_pipewriteprocess(struct knote *kn, struct filt_process_s *data, struct kevent_internal_s *kev) +filt_pipewriteprocess(struct knote *kn, struct kevent_qos_s *kev) { -#pragma unused(data) - struct pipe *rpipe = (struct pipe *)kn->kn_fp->f_data; + struct pipe *rpipe = kn->kn_hook; int res; PIPE_LOCK(rpipe); - res = filt_pipewrite_common(kn, rpipe); - if (res) { - *kev = kn->kn_kevent; - if (kn->kn_flags & EV_CLEAR) { - kn->kn_fflags = 0; - kn->kn_data = 0; - } - } + res = filt_pipewrite_common(kn, kev, rpipe); PIPE_UNLOCK(rpipe); return res; @@ -1572,10 +1595,11 @@ filt_pipewriteprocess(struct knote *kn, struct filt_process_s *data, struct keve /*ARGSUSED*/ static int -pipe_kqfilter(__unused struct fileproc *fp, struct knote *kn, - __unused struct kevent_internal_s *kev, __unused vfs_context_t ctx) +pipe_kqfilter(struct fileproc *fp, struct knote *kn, + __unused struct kevent_qos_s *kev) { - struct pipe *cpipe = (struct pipe *)kn->kn_fp->f_data; + struct pipe *cpipe = (struct pipe *)fp->f_data; + struct pipe *rpipe = &PIPE_PAIR(cpipe)->pp_rpipe; int res; PIPE_LOCK(cpipe); @@ -1585,51 +1609,56 @@ pipe_kqfilter(__unused struct fileproc *fp, struct knote *kn, * XXX process credential should have a persistent reference on it * XXX before being passed in here. */ - if (mac_pipe_check_kqfilter(vfs_context_ucred(ctx), kn, cpipe) != 0) { + kauth_cred_t cred = vfs_context_ucred(vfs_context_current()); + if (mac_pipe_check_kqfilter(cred, kn, cpipe) != 0) { PIPE_UNLOCK(cpipe); - kn->kn_flags = EV_ERROR; - kn->kn_data = EPERM; + knote_set_error(kn, EPERM); return 0; } #endif + /* + * FreeBSD will fail the attach with EPIPE if the peer pipe is detached, + * however, this isn't a programming error as the other side closing + * could race with the kevent registration. + * + * Attach should only fail for programming mistakes else it will break + * libdispatch. + * + * Like FreeBSD, have a "Neutered" filter that will not fire until + * the pipe dies if the wrong filter is attached to the wrong end. + * + * Knotes are always attached to the "rpipe". + */ switch (kn->kn_filter) { case EVFILT_READ: - kn->kn_filtid = EVFILTID_PIPE_R; - - /* determine initial state */ - res = filt_piperead_common(kn, cpipe); + if (fp->f_flag & FREAD) { + kn->kn_filtid = EVFILTID_PIPE_R; + res = filt_piperead_common(kn, NULL, rpipe); + } else { + kn->kn_filtid = EVFILTID_PIPE_N; + res = filt_pipe_draincommon(kn, rpipe); + } break; case EVFILT_WRITE: - kn->kn_filtid = EVFILTID_PIPE_W; - - if (cpipe->pipe_peer == NULL) { - /* - * other end of pipe has been closed - */ - PIPE_UNLOCK(cpipe); - kn->kn_flags = EV_ERROR; - kn->kn_data = EPIPE; - return 0; - } - if (cpipe->pipe_peer) { - cpipe = cpipe->pipe_peer; + if (fp->f_flag & FWRITE) { + kn->kn_filtid = EVFILTID_PIPE_W; + res = filt_pipewrite_common(kn, NULL, rpipe); + } else { + kn->kn_filtid = EVFILTID_PIPE_N; + res = filt_pipe_draincommon(kn, rpipe); } - - /* determine inital state */ - res = filt_pipewrite_common(kn, cpipe); break; + default: PIPE_UNLOCK(cpipe); - kn->kn_flags = EV_ERROR; - kn->kn_data = EINVAL; + knote_set_error(kn, EINVAL); return 0; } - if (KNOTE_ATTACH(&cpipe->pipe_sel.si_note, kn)) { - cpipe->pipe_state |= PIPE_KNOTE; - } + kn->kn_hook = rpipe; + KNOTE_ATTACH(&rpipe->pipe_sel.si_note, kn); PIPE_UNLOCK(cpipe); return res; @@ -1639,21 +1668,10 @@ static void filt_pipedetach(struct knote *kn) { struct pipe *cpipe = (struct pipe *)kn->kn_fp->f_data; + struct pipe *rpipe = &PIPE_PAIR(cpipe)->pp_rpipe; PIPE_LOCK(cpipe); - - if (kn->kn_filter == EVFILT_WRITE) { - if (cpipe->pipe_peer == NULL) { - PIPE_UNLOCK(cpipe); - return; - } - cpipe = cpipe->pipe_peer; - } - if (cpipe->pipe_state & PIPE_KNOTE) { - if (KNOTE_DETACH(&cpipe->pipe_sel.si_note, kn)) { - cpipe->pipe_state &= ~PIPE_KNOTE; - } - } + KNOTE_DETACH(&rpipe->pipe_sel.si_note, kn); PIPE_UNLOCK(cpipe); } @@ -1734,8 +1752,8 @@ fill_pipeinfo(struct pipe * cpipe, struct pipe_info * pinfo) * XXX (st_dev, st_ino) should be unique. */ - pinfo->pipe_handle = (uint64_t)VM_KERNEL_ADDRPERM((uintptr_t)cpipe); - pinfo->pipe_peerhandle = (uint64_t)VM_KERNEL_ADDRPERM((uintptr_t)(cpipe->pipe_peer)); + pinfo->pipe_handle = (uint64_t)VM_KERNEL_ADDRHASH((uintptr_t)cpipe); + pinfo->pipe_peerhandle = (uint64_t)VM_KERNEL_ADDRHASH((uintptr_t)(cpipe->pipe_peer)); pinfo->pipe_status = cpipe->pipe_state; PIPE_UNLOCK(cpipe); @@ -1749,17 +1767,30 @@ pipe_drain(struct fileproc *fp, __unused vfs_context_t ctx) { /* Note: fdlock already held */ struct pipe *ppipe, *cpipe = (struct pipe *)(fp->f_fglob->fg_data); + boolean_t drain_pipe = FALSE; + + /* Check if the pipe is going away */ + lck_mtx_lock_spin(&fp->f_fglob->fg_lock); + if (fp->f_fglob->fg_count == 1) { + drain_pipe = TRUE; + } + lck_mtx_unlock(&fp->f_fglob->fg_lock); if (cpipe) { PIPE_LOCK(cpipe); - cpipe->pipe_state |= PIPE_DRAIN; - cpipe->pipe_state &= ~(PIPE_WANTR | PIPE_WANTW); + + if (drain_pipe) { + cpipe->pipe_state |= PIPE_DRAIN; + cpipe->pipe_state &= ~(PIPE_WANTR | PIPE_WANTW); + } wakeup(cpipe); /* Must wake up peer: a writer sleeps on the read side */ if ((ppipe = cpipe->pipe_peer)) { - ppipe->pipe_state |= PIPE_DRAIN; - ppipe->pipe_state &= ~(PIPE_WANTR | PIPE_WANTW); + if (drain_pipe) { + ppipe->pipe_state |= PIPE_DRAIN; + ppipe->pipe_state &= ~(PIPE_WANTR | PIPE_WANTW); + } wakeup(ppipe); } @@ -1769,80 +1800,3 @@ pipe_drain(struct fileproc *fp, __unused vfs_context_t ctx) return 1; } - - -/* - * When a thread sets a write-select on a pipe, it creates an implicit, - * untracked dependency between that thread and the peer of the pipe - * on which the select is set. If the peer pipe is closed and freed - * before the select()ing thread wakes up, the system will panic as - * it attempts to unwind the dangling select(). To avoid that panic, - * we notice whenever a dangerous select() is set on a pipe, and - * defer the final deletion of the pipe until that select()s are all - * resolved. Since we can't currently detect exactly when that - * resolution happens, we use a simple garbage collection queue to - * reap the at-risk pipes 'later'. - */ -static void -pipe_garbage_collect(struct pipe *cpipe) -{ - uint64_t old, now; - struct pipe_garbage *pgp; - - /* Convert msecs to nsecs and then to abstime */ - old = pipe_garbage_age_limit * 1000000; - nanoseconds_to_absolutetime(old, &old); - - lck_mtx_lock(pipe_garbage_lock); - - /* Free anything that's been on the queue for seconds */ - now = mach_absolute_time(); - old = now - old; - while ((pgp = pipe_garbage_head) && pgp->pg_timestamp < old) { - pipe_garbage_head = pgp->pg_next; - if (pipe_garbage_head == NULL) { - pipe_garbage_tail = NULL; - } - pipe_garbage_count--; - zfree(pipe_zone, pgp->pg_pipe); - zfree(pipe_garbage_zone, pgp); - } - - /* Add the new pipe (if any) to the tail of the garbage queue */ - if (cpipe) { - cpipe->pipe_state = PIPE_DEAD; - pgp = (struct pipe_garbage *)zalloc(pipe_garbage_zone); - if (pgp == NULL) { - /* - * We're too low on memory to garbage collect the - * pipe. Freeing it runs the risk of panicing the - * system. All we can do is leak it and leave - * a breadcrumb behind. The good news, such as it - * is, is that this will probably never happen. - * We will probably hit the panic below first. - */ - printf("Leaking pipe %p - no room left in the queue", - cpipe); - lck_mtx_unlock(pipe_garbage_lock); - return; - } - - pgp->pg_pipe = cpipe; - pgp->pg_timestamp = now; - pgp->pg_next = NULL; - - if (pipe_garbage_tail) { - pipe_garbage_tail->pg_next = pgp; - } - pipe_garbage_tail = pgp; - if (pipe_garbage_head == NULL) { - pipe_garbage_head = pipe_garbage_tail; - } - - if (pipe_garbage_count++ >= PIPE_GARBAGE_QUEUE_LIMIT) { - panic("Length of pipe garbage queue exceeded %d", - PIPE_GARBAGE_QUEUE_LIMIT); - } - } - lck_mtx_unlock(pipe_garbage_lock); -} diff --git a/bsd/kern/sys_reason.c b/bsd/kern/sys_reason.c index 518df53ab..70493f974 100644 --- a/bsd/kern/sys_reason.c +++ b/bsd/kern/sys_reason.c @@ -50,6 +50,8 @@ lck_grp_attr_t *os_reason_lock_grp_attr; lck_grp_t *os_reason_lock_grp; lck_attr_t *os_reason_lock_attr; +os_refgrp_decl(static, os_reason_refgrp, "os_reason", NULL); + #define OS_REASON_RESERVE_COUNT 100 #define OS_REASON_MAX_COUNT (maxproc + 100) @@ -131,7 +133,7 @@ os_reason_create(uint32_t osr_namespace, uint64_t osr_code) new_reason->osr_kcd_buf = NULL; lck_mtx_init(&new_reason->osr_lock, os_reason_lock_grp, os_reason_lock_attr); - new_reason->osr_refcount = 1; + os_ref_init(&new_reason->osr_refcount, &os_reason_refgrp); return new_reason; } @@ -276,14 +278,8 @@ os_reason_ref(os_reason_t cur_reason) } lck_mtx_lock(&cur_reason->osr_lock); - - assert(cur_reason->osr_refcount > 0); - if (os_add_overflow(cur_reason->osr_refcount, 1, &cur_reason->osr_refcount)) { - panic("os reason refcount overflow"); - } - + os_ref_retain_locked(&cur_reason->osr_refcount); lck_mtx_unlock(&cur_reason->osr_lock); - return; } @@ -300,12 +296,7 @@ os_reason_free(os_reason_t cur_reason) lck_mtx_lock(&cur_reason->osr_lock); - if (cur_reason->osr_refcount == 0) { - panic("os_reason_free called on reason with zero refcount"); - } - - cur_reason->osr_refcount--; - if (cur_reason->osr_refcount != 0) { + if (os_ref_release_locked(&cur_reason->osr_refcount) > 0) { lck_mtx_unlock(&cur_reason->osr_lock); return; } @@ -317,3 +308,44 @@ os_reason_free(os_reason_t cur_reason) zfree(os_reason_zone, cur_reason); } + +/* + * Sets flags on the passed reason. + */ +void +os_reason_set_flags(os_reason_t cur_reason, uint64_t flags) +{ + if (cur_reason == OS_REASON_NULL) { + return; + } + + lck_mtx_lock(&cur_reason->osr_lock); + cur_reason->osr_flags = flags; + lck_mtx_unlock(&cur_reason->osr_lock); +} + +/* + * Allocates space and sets description data in kcd_descriptor on the passed reason. + */ +void +os_reason_set_description_data(os_reason_t cur_reason, uint32_t type, void *reason_data, uint32_t reason_data_len) +{ + mach_vm_address_t osr_data_addr = 0; + + if (cur_reason == OS_REASON_NULL) { + return; + } + + if (0 != os_reason_alloc_buffer(cur_reason, kcdata_estimate_required_buffer_size(1, reason_data_len))) { + panic("os_reason failed to allocate"); + } + + lck_mtx_lock(&cur_reason->osr_lock); + if (KERN_SUCCESS != kcdata_get_memory_addr(&cur_reason->osr_kcd_descriptor, type, reason_data_len, &osr_data_addr)) { + panic("os_reason failed to get data address"); + } + if (KERN_SUCCESS != kcdata_memcpy(&cur_reason->osr_kcd_descriptor, osr_data_addr, reason_data, reason_data_len)) { + panic("os_reason failed to copy description data"); + } + lck_mtx_unlock(&cur_reason->osr_lock); +} diff --git a/bsd/kern/sys_socket.c b/bsd/kern/sys_socket.c index 9988ba7c6..53e8f07b5 100644 --- a/bsd/kern/sys_socket.c +++ b/bsd/kern/sys_socket.c @@ -98,14 +98,14 @@ static int soo_close(struct fileglob *, vfs_context_t ctx); static int soo_drain(struct fileproc *, vfs_context_t ctx); const struct fileops socketops = { - .fo_type = DTYPE_SOCKET, - .fo_read = soo_read, - .fo_write = soo_write, - .fo_ioctl = soo_ioctl, - .fo_select = soo_select, - .fo_close = soo_close, + .fo_type = DTYPE_SOCKET, + .fo_read = soo_read, + .fo_write = soo_write, + .fo_ioctl = soo_ioctl, + .fo_select = soo_select, + .fo_close = soo_close, + .fo_drain = soo_drain, .fo_kqfilter = soo_kqfilter, - .fo_drain = soo_drain, }; /* ARGSUSED */ diff --git a/bsd/kern/sys_ulock.c b/bsd/kern/sys_ulock.c index b8046c66e..dce4c3aec 100644 --- a/bsd/kern/sys_ulock.c +++ b/bsd/kern/sys_ulock.c @@ -26,6 +26,8 @@ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ +#include + #include #include #include @@ -101,16 +103,44 @@ typedef lck_spin_t ull_lock_t; #define ULOCK_TO_EVENT(ull) ((event_t)ull) #define EVENT_TO_ULOCK(event) ((ull_t *)event) -typedef struct __attribute__((packed)) { - user_addr_t ulk_addr; - pid_t ulk_pid; +typedef enum { + ULK_INVALID = 0, + ULK_UADDR, + ULK_XPROC, +} ulk_type; + +typedef struct { + union { + struct __attribute__((packed)) { + user_addr_t ulk_addr; + pid_t ulk_pid; + }; + struct __attribute__((packed)) { + uint64_t ulk_object; + uint64_t ulk_offset; + }; + }; + ulk_type ulk_key_type; } ulk_t; +#define ULK_UADDR_LEN (sizeof(user_addr_t) + sizeof(pid_t)) +#define ULK_XPROC_LEN (sizeof(uint64_t) + sizeof(uint64_t)) + inline static bool ull_key_match(ulk_t *a, ulk_t *b) { - return (a->ulk_pid == b->ulk_pid) && - (a->ulk_addr == b->ulk_addr); + if (a->ulk_key_type != b->ulk_key_type) { + return false; + } + + if (a->ulk_key_type == ULK_UADDR) { + return (a->ulk_pid == b->ulk_pid) && + (a->ulk_addr == b->ulk_addr); + } + + assert(a->ulk_key_type == ULK_XPROC); + return (a->ulk_object == b->ulk_object) && + (a->ulk_offset == b->ulk_offset); } typedef struct ull { @@ -120,11 +150,9 @@ typedef struct ull { */ thread_t ull_owner; /* holds +1 thread reference */ ulk_t ull_key; - ulk_t ull_saved_key; ull_lock_t ull_lock; uint ull_bucket_index; int32_t ull_nwaiters; - int32_t ull_max_nwaiters; int32_t ull_refcount; uint8_t ull_opcode; struct turnstile *ull_turnstile; @@ -134,9 +162,13 @@ typedef struct ull { extern void ulock_initialize(void); #define ULL_MUST_EXIST 0x0001 -static ull_t *ull_get(ulk_t *, uint32_t, ull_t **); static void ull_put(ull_t *); +static uint32_t ulock_adaptive_spin_usecs = 20; + +SYSCTL_INT(_kern, OID_AUTO, ulock_adaptive_spin_usecs, CTLFLAG_RW | CTLFLAG_LOCKED, + &ulock_adaptive_spin_usecs, 0, "ulock adaptive spin duration"); + #if DEVELOPMENT || DEBUG static int ull_simulate_copyin_fault = 0; @@ -144,12 +176,22 @@ static void ull_dump(ull_t *ull) { kprintf("ull\t%p\n", ull); - kprintf("ull_key.ulk_pid\t%d\n", ull->ull_key.ulk_pid); - kprintf("ull_key.ulk_addr\t%p\n", (void *)(ull->ull_key.ulk_addr)); - kprintf("ull_saved_key.ulk_pid\t%d\n", ull->ull_saved_key.ulk_pid); - kprintf("ull_saved_key.ulk_addr\t%p\n", (void *)(ull->ull_saved_key.ulk_addr)); + switch (ull->ull_key.ulk_key_type) { + case ULK_UADDR: + kprintf("ull_key.ulk_key_type\tULK_UADDR\n"); + kprintf("ull_key.ulk_pid\t%d\n", ull->ull_key.ulk_pid); + kprintf("ull_key.ulk_addr\t%p\n", (void *)(ull->ull_key.ulk_addr)); + break; + case ULK_XPROC: + kprintf("ull_key.ulk_key_type\tULK_XPROC\n"); + kprintf("ull_key.ulk_object\t%p\n", (void *)(ull->ull_key.ulk_object)); + kprintf("ull_key.ulk_offset\t%p\n", (void *)(ull->ull_key.ulk_offset)); + break; + default: + kprintf("ull_key.ulk_key_type\tUNKNOWN %d\n", ull->ull_key.ulk_key_type); + break; + } kprintf("ull_nwaiters\t%d\n", ull->ull_nwaiters); - kprintf("ull_max_nwaiters\t%d\n", ull->ull_max_nwaiters); kprintf("ull_refcount\t%d\n", ull->ull_refcount); kprintf("ull_opcode\t%d\n\n", ull->ull_opcode); kprintf("ull_owner\t0x%llx\n\n", thread_tid(ull->ull_owner)); @@ -180,13 +222,7 @@ ull_hash_index(const void *key, size_t length) return hash; } -/* Ensure that the key structure is packed, - * so that no undefined memory is passed to - * ull_hash_index() - */ -static_assert(sizeof(ulk_t) == sizeof(user_addr_t) + sizeof(pid_t)); - -#define ULL_INDEX(keyp) ull_hash_index(keyp, sizeof *keyp) +#define ULL_INDEX(keyp) ull_hash_index(keyp, keyp->ulk_key_type == ULK_UADDR ? ULK_UADDR_LEN : ULK_XPROC_LEN) void ulock_initialize(void) @@ -215,6 +251,7 @@ ulock_initialize(void) 0, "ulocks"); zone_change(ull_zone, Z_NOENCRYPT, TRUE); + zone_change(ull_zone, Z_CACHING_ENABLED, TRUE); } #if DEVELOPMENT || DEBUG @@ -237,7 +274,7 @@ ull_hash_dump(pid_t pid) kprintf("%s>index %d:\n", __FUNCTION__, i); } qe_foreach_element(elem, &ull_bucket[i].ulb_head, ull_hash_link) { - if ((pid == 0) || (pid == elem->ull_key.ulk_pid)) { + if ((pid == 0) || ((elem->ull_key.ulk_key_type == ULK_UADDR) && (pid == elem->ull_key.ulk_pid))) { ull_dump(elem); count++; } @@ -261,10 +298,8 @@ ull_alloc(ulk_t *key) ull->ull_refcount = 1; ull->ull_key = *key; - ull->ull_saved_key = *key; ull->ull_bucket_index = ULL_INDEX(key); ull->ull_nwaiters = 0; - ull->ull_max_nwaiters = 0; ull->ull_opcode = 0; ull->ull_owner = THREAD_NULL; @@ -351,7 +386,7 @@ ull_put(ull_t *ull) { ull_assert_owned(ull); int refcount = --ull->ull_refcount; - assert(refcount == 0 ? (ull->ull_key.ulk_pid == 0 && ull->ull_key.ulk_addr == 0) : 1); + assert(refcount == 0 ? (ull->ull_key.ulk_key_type == ULK_INVALID) : 1); ull_unlock(ull); if (refcount > 0) { @@ -365,6 +400,31 @@ ull_put(ull_t *ull) ull_free(ull); } +extern kern_return_t vm_map_page_info(vm_map_t map, vm_map_offset_t offset, vm_page_info_flavor_t flavor, vm_page_info_t info, mach_msg_type_number_t *count); +extern vm_map_t current_map(void); +extern boolean_t machine_thread_on_core(thread_t thread); + +static int +uaddr_findobj(user_addr_t uaddr, uint64_t *objectp, uint64_t *offsetp) +{ + kern_return_t ret; + vm_page_info_basic_data_t info; + mach_msg_type_number_t count = VM_PAGE_INFO_BASIC_COUNT; + ret = vm_map_page_info(current_map(), uaddr, VM_PAGE_INFO_BASIC, (vm_page_info_t)&info, &count); + if (ret != KERN_SUCCESS) { + return EINVAL; + } + + if (objectp != NULL) { + *objectp = (uint64_t)info.object_id; + } + if (offsetp != NULL) { + *offsetp = (uint64_t)info.offset; + } + + return 0; +} + static void ulock_wait_continue(void *, wait_result_t); static void ulock_wait_cleanup(ull_t *, thread_t, thread_t, int32_t *); @@ -389,6 +449,24 @@ wait_result_to_return_code(wait_result_t wr) return ret; } +static int +ulock_resolve_owner(uint32_t value, thread_t *owner) +{ + mach_port_name_t owner_name = ulock_owner_value_to_port_name(value); + + *owner = port_name_to_thread(owner_name, + PORT_TO_THREAD_IN_CURRENT_TASK | + PORT_TO_THREAD_NOT_CURRENT_THREAD); + if (*owner == THREAD_NULL) { + /* + * Translation failed - even though the lock value is up to date, + * whatever was stored in the lock wasn't actually a thread port. + */ + return owner_name == MACH_PORT_DEAD ? ESRCH : EOWNERDEAD; + } + return 0; +} + int ulock_wait(struct proc *p, struct ulock_wait_args *args, int32_t *retval) { @@ -414,29 +492,96 @@ ulock_wait(struct proc *p, struct ulock_wait_args *args, int32_t *retval) goto munge_retval; } - boolean_t set_owner = FALSE; + bool set_owner = false; + bool xproc = false; + size_t lock_size = sizeof(uint32_t); + int copy_ret; switch (opcode) { case UL_UNFAIR_LOCK: - set_owner = TRUE; + set_owner = true; break; case UL_COMPARE_AND_WAIT: break; + case UL_COMPARE_AND_WAIT64: + lock_size = sizeof(uint64_t); + break; + case UL_COMPARE_AND_WAIT_SHARED: + xproc = true; + break; + case UL_COMPARE_AND_WAIT64_SHARED: + xproc = true; + lock_size = sizeof(uint64_t); + break; default: ret = EINVAL; goto munge_retval; } - /* 32-bit lock type for UL_COMPARE_AND_WAIT and UL_UNFAIR_LOCK */ - uint32_t value = 0; + uint64_t value = 0; - if ((args->addr == 0) || (args->addr % _Alignof(_Atomic(typeof(value))))) { + if ((args->addr == 0) || (args->addr & (lock_size - 1))) { ret = EINVAL; goto munge_retval; } - key.ulk_pid = p->p_pid; - key.ulk_addr = args->addr; + if (xproc) { + uint64_t object = 0; + uint64_t offset = 0; + + ret = uaddr_findobj(args->addr, &object, &offset); + if (ret) { + ret = EINVAL; + goto munge_retval; + } + key.ulk_key_type = ULK_XPROC; + key.ulk_object = object; + key.ulk_offset = offset; + } else { + key.ulk_key_type = ULK_UADDR; + key.ulk_pid = p->p_pid; + key.ulk_addr = args->addr; + } + + if ((flags & ULF_WAIT_ADAPTIVE_SPIN) && set_owner) { + /* + * Attempt the copyin outside of the lock once, + * + * If it doesn't match (which is common), return right away. + * + * If it matches, resolve the current owner, and if it is on core, + * spin a bit waiting for the value to change. If the owner isn't on + * core, or if the value stays stable, then go on with the regular + * blocking code. + */ + uint64_t end = 0; + uint32_t u32; + + ret = copyin_atomic32(args->addr, &u32); + if (ret || u32 != args->value) { + goto munge_retval; + } + for (;;) { + if (owner_thread == NULL && ulock_resolve_owner(u32, &owner_thread) != 0) { + break; + } + + /* owner_thread may have a +1 starting here */ + + if (!machine_thread_on_core(owner_thread)) { + break; + } + if (end == 0) { + clock_interval_to_deadline(ulock_adaptive_spin_usecs, + NSEC_PER_USEC, &end); + } else if (mach_absolute_time() > end) { + break; + } + if (copyin_atomic32_wait_if_equals(args->addr, u32) != 0) { + goto munge_retval; + } + } + } ull_t *ull = ull_get(&key, 0, &unused_ull); if (ull == NULL) { @@ -447,10 +592,6 @@ ulock_wait(struct proc *p, struct ulock_wait_args *args, int32_t *retval) ull->ull_nwaiters++; - if (ull->ull_nwaiters > ull->ull_max_nwaiters) { - ull->ull_max_nwaiters = ull->ull_nwaiters; - } - if (ull->ull_opcode == 0) { ull->ull_opcode = opcode; } else if (ull->ull_opcode != opcode) { @@ -466,17 +607,22 @@ ulock_wait(struct proc *p, struct ulock_wait_args *args, int32_t *retval) * holding the ull spinlock across copyin forces any * vm_fault we encounter to fail. */ - uint64_t val64; /* copyin_word always zero-extends to 64-bits */ - int copy_ret = copyin_word(args->addr, &val64, sizeof(value)); + /* copyin_atomicXX always checks alignment */ - value = (uint32_t)val64; + if (lock_size == 4) { + uint32_t u32; + copy_ret = copyin_atomic32(args->addr, &u32); + value = u32; + } else { + copy_ret = copyin_atomic64(args->addr, &value); + } #if DEVELOPMENT || DEBUG /* Occasionally simulate copyin finding the user address paged out */ if (((ull_simulate_copyin_fault == p->p_pid) || (ull_simulate_copyin_fault == 1)) && (copy_ret == 0)) { static _Atomic int fault_inject = 0; - if (__c11_atomic_fetch_add(&fault_inject, 1, __ATOMIC_RELAXED) % 73 == 0) { + if (os_atomic_inc_orig(&fault_inject, relaxed) % 73 == 0) { copy_ret = EFAULT; } } @@ -495,17 +641,17 @@ ulock_wait(struct proc *p, struct ulock_wait_args *args, int32_t *retval) } if (set_owner) { - mach_port_name_t owner_name = ulock_owner_value_to_port_name(args->value); - owner_thread = port_name_to_thread_for_ulock(owner_name); - - /* HACK: don't bail on MACH_PORT_DEAD, to avoid blowing up the no-tsd pthread lock */ - if (owner_name != MACH_PORT_DEAD && owner_thread == THREAD_NULL) { - /* - * Translation failed - even though the lock value is up to date, - * whatever was stored in the lock wasn't actually a thread port. - */ - ret = EOWNERDEAD; - goto out_locked; + if (owner_thread == THREAD_NULL) { + ret = ulock_resolve_owner(args->value, &owner_thread); + if (ret == EOWNERDEAD) { + /* + * Translation failed - even though the lock value is up to date, + * whatever was stored in the lock wasn't actually a thread port. + */ + goto out_locked; + } + /* HACK: don't bail on MACH_PORT_DEAD, to avoid blowing up the no-tsd pthread lock */ + ret = 0; } /* owner_thread has a +1 reference */ @@ -584,10 +730,11 @@ ulock_wait(struct proc *p, struct ulock_wait_args *args, int32_t *retval) ret = wait_result_to_return_code(wr); ull_lock(ull); - turnstile_complete((uintptr_t)ull, &ull->ull_turnstile, NULL); + turnstile_complete((uintptr_t)ull, &ull->ull_turnstile, NULL, TURNSTILE_ULOCK); out_locked: ulock_wait_cleanup(ull, owner_thread, old_owner, retval); + owner_thread = NULL; if (unused_ull) { ull_free(unused_ull); @@ -597,6 +744,12 @@ out_locked: assert(*retval >= 0); munge_retval: + if (owner_thread) { + thread_deallocate(owner_thread); + } + if (ret == ESTALE) { + ret = 0; + } if ((flags & ULF_NO_ERRNO) && (ret != 0)) { *retval = -ret; ret = 0; @@ -624,8 +777,7 @@ ulock_wait_cleanup(ull_t *ull, thread_t owner_thread, thread_t old_owner, int32_ old_lingering_owner = ull->ull_owner; ull->ull_owner = THREAD_NULL; - ull->ull_key.ulk_pid = 0; - ull->ull_key.ulk_addr = 0; + memset(&ull->ull_key, 0, sizeof ull->ull_key); ull->ull_refcount--; assert(ull->ull_refcount > 0); } @@ -666,7 +818,7 @@ ulock_wait_continue(void * parameter, wait_result_t wr) ret = wait_result_to_return_code(wr); ull_lock(ull); - turnstile_complete((uintptr_t)ull, &ull->ull_turnstile, NULL); + turnstile_complete((uintptr_t)ull, &ull->ull_turnstile, NULL, TURNSTILE_ULOCK); ulock_wait_cleanup(ull, owner_thread, old_owner, retval); @@ -688,12 +840,6 @@ ulock_wake(struct proc *p, struct ulock_wake_args *args, __unused int32_t *retva /* involved threads - each variable holds +1 ref if not null */ thread_t wake_thread = THREAD_NULL; - thread_t old_owner = THREAD_NULL; - - if ((flags & ULF_WAKE_MASK) != flags) { - ret = EINVAL; - goto munge_retval; - } #if DEVELOPMENT || DEBUG if (opcode == UL_DEBUG_HASH_DUMP_PID) { @@ -708,120 +854,159 @@ ulock_wake(struct proc *p, struct ulock_wake_args *args, __unused int32_t *retva } #endif + bool set_owner = false; + bool xproc = false; + + switch (opcode) { + case UL_UNFAIR_LOCK: + set_owner = true; + break; + case UL_COMPARE_AND_WAIT: + case UL_COMPARE_AND_WAIT64: + break; + case UL_COMPARE_AND_WAIT_SHARED: + case UL_COMPARE_AND_WAIT64_SHARED: + xproc = true; + break; + default: + ret = EINVAL; + goto munge_retval; + } + + if ((flags & ULF_WAKE_MASK) != flags) { + ret = EINVAL; + goto munge_retval; + } + + if ((flags & ULF_WAKE_THREAD) && ((flags & ULF_WAKE_ALL) || set_owner)) { + ret = EINVAL; + goto munge_retval; + } + if (args->addr == 0) { ret = EINVAL; goto munge_retval; } - if (flags & ULF_WAKE_THREAD) { - if (flags & ULF_WAKE_ALL) { + if (xproc) { + uint64_t object = 0; + uint64_t offset = 0; + + ret = uaddr_findobj(args->addr, &object, &offset); + if (ret) { ret = EINVAL; goto munge_retval; } + key.ulk_key_type = ULK_XPROC; + key.ulk_object = object; + key.ulk_offset = offset; + } else { + key.ulk_key_type = ULK_UADDR; + key.ulk_pid = p->p_pid; + key.ulk_addr = args->addr; + } + + if (flags & ULF_WAKE_THREAD) { mach_port_name_t wake_thread_name = (mach_port_name_t)(args->wake_value); - wake_thread = port_name_to_thread_for_ulock(wake_thread_name); + wake_thread = port_name_to_thread(wake_thread_name, + PORT_TO_THREAD_IN_CURRENT_TASK | + PORT_TO_THREAD_NOT_CURRENT_THREAD); if (wake_thread == THREAD_NULL) { ret = ESRCH; goto munge_retval; } } - key.ulk_pid = p->p_pid; - key.ulk_addr = args->addr; - ull_t *ull = ull_get(&key, ULL_MUST_EXIST, NULL); + thread_t new_owner = THREAD_NULL; + struct turnstile *ts = TURNSTILE_NULL; + thread_t cleanup_thread = THREAD_NULL; + if (ull == NULL) { - if (wake_thread != THREAD_NULL) { - thread_deallocate(wake_thread); - } ret = ENOENT; goto munge_retval; } /* ull is locked */ - boolean_t clear_owner = FALSE; /* need to reset owner */ - - switch (opcode) { - case UL_UNFAIR_LOCK: - clear_owner = TRUE; - break; - case UL_COMPARE_AND_WAIT: - break; - default: - ret = EINVAL; - goto out_locked; - } - if (opcode != ull->ull_opcode) { ret = EDOM; - goto out_locked; + goto out_ull_put; } - if (!clear_owner) { + if (set_owner) { + if (ull->ull_owner != current_thread()) { + /* + * If the current thread isn't the known owner, + * then this wake call was late to the party, + * and the kernel already knows who owns the lock. + * + * This current owner already knows the lock is contended + * and will redrive wakes, just bail out. + */ + goto out_ull_put; + } + } else { assert(ull->ull_owner == THREAD_NULL); } - struct turnstile *ts; ts = turnstile_prepare((uintptr_t)ull, &ull->ull_turnstile, TURNSTILE_NULL, TURNSTILE_ULOCK); + assert(ts != TURNSTILE_NULL); - if (flags & ULF_WAKE_ALL) { - waitq_wakeup64_all(&ts->ts_waitq, CAST_EVENT64_T(ULOCK_TO_EVENT(ull)), - THREAD_AWAKENED, 0); - } else if (flags & ULF_WAKE_THREAD) { - kern_return_t kr = waitq_wakeup64_thread(&ts->ts_waitq, CAST_EVENT64_T(ULOCK_TO_EVENT(ull)), + if (flags & ULF_WAKE_THREAD) { + kern_return_t kr = waitq_wakeup64_thread(&ts->ts_waitq, + CAST_EVENT64_T(ULOCK_TO_EVENT(ull)), wake_thread, THREAD_AWAKENED); if (kr != KERN_SUCCESS) { assert(kr == KERN_NOT_WAITING); ret = EALREADY; } - } else { + } else if (flags & ULF_WAKE_ALL) { + if (set_owner) { + turnstile_update_inheritor(ts, THREAD_NULL, + TURNSTILE_IMMEDIATE_UPDATE | TURNSTILE_INHERITOR_THREAD); + } + waitq_wakeup64_all(&ts->ts_waitq, CAST_EVENT64_T(ULOCK_TO_EVENT(ull)), + THREAD_AWAKENED, 0); + } else if (set_owner) { /* - * TODO: WAITQ_SELECT_MAX_PRI forces a linear scan of the (hashed) global waitq. - * Move to a ulock-private, priority sorted waitq (i.e. SYNC_POLICY_FIXED_PRIORITY) to avoid that. - * - * TODO: 'owner is not current_thread (or null)' likely means we can avoid this wakeup - * + * The turnstile waitq is priority ordered, + * and will wake up the highest priority waiter + * and set it as the inheritor for us. */ + new_owner = waitq_wakeup64_identify(&ts->ts_waitq, + CAST_EVENT64_T(ULOCK_TO_EVENT(ull)), + THREAD_AWAKENED, WAITQ_PROMOTE_ON_WAKE); + } else { waitq_wakeup64_one(&ts->ts_waitq, CAST_EVENT64_T(ULOCK_TO_EVENT(ull)), - THREAD_AWAKENED, WAITQ_SELECT_MAX_PRI); + THREAD_AWAKENED, WAITQ_ALL_PRIORITIES); } - /* - * Reaching this point means I previously moved the lock to 'unowned' state in userspace. - * Therefore I need to relinquish my promotion. - * - * However, someone else could have locked it after I unlocked, and then had a third thread - * block on the lock, causing a promotion of some other owner. - * - * I don't want to stomp over that, so only remove the promotion if I'm the current owner. - */ - - if (ull->ull_owner == current_thread()) { - turnstile_update_inheritor(ts, THREAD_NULL, - (TURNSTILE_IMMEDIATE_UPDATE | TURNSTILE_INHERITOR_THREAD)); + if (set_owner) { turnstile_update_inheritor_complete(ts, TURNSTILE_INTERLOCK_HELD); - old_owner = ull->ull_owner; - ull->ull_owner = THREAD_NULL; + cleanup_thread = ull->ull_owner; + ull->ull_owner = new_owner; } - turnstile_complete((uintptr_t)ull, &ull->ull_turnstile, NULL); + turnstile_complete((uintptr_t)ull, &ull->ull_turnstile, NULL, TURNSTILE_ULOCK); -out_locked: +out_ull_put: ull_put(ull); - /* Need to be called after dropping the interlock */ - turnstile_cleanup(); - - if (wake_thread != THREAD_NULL) { - thread_deallocate(wake_thread); + if (ts != TURNSTILE_NULL) { + /* Need to be called after dropping the interlock */ + turnstile_cleanup(); } - if (old_owner != THREAD_NULL) { - thread_deallocate(old_owner); + if (cleanup_thread != THREAD_NULL) { + thread_deallocate(cleanup_thread); } munge_retval: + if (wake_thread != THREAD_NULL) { + thread_deallocate(wake_thread); + } + if ((flags & ULF_NO_ERRNO) && (ret != 0)) { *retval = -ret; ret = 0; @@ -835,14 +1020,22 @@ kdp_ulock_find_owner(__unused struct waitq * waitq, event64_t event, thread_wait ull_t *ull = EVENT_TO_ULOCK(event); assert(kdp_is_in_zone(ull, "ulocks")); - if (ull->ull_opcode == UL_UNFAIR_LOCK) {// owner is only set if it's an os_unfair_lock - waitinfo->owner = thread_tid(ull->ull_owner); + switch (ull->ull_opcode) { + case UL_UNFAIR_LOCK: + case UL_UNFAIR_LOCK64_SHARED: + waitinfo->owner = thread_tid(ull->ull_owner); waitinfo->context = ull->ull_key.ulk_addr; - } else if (ull->ull_opcode == UL_COMPARE_AND_WAIT) { // otherwise, this is a spinlock - waitinfo->owner = 0; + break; + case UL_COMPARE_AND_WAIT: + case UL_COMPARE_AND_WAIT64: + case UL_COMPARE_AND_WAIT_SHARED: + case UL_COMPARE_AND_WAIT64_SHARED: + waitinfo->owner = 0; waitinfo->context = ull->ull_key.ulk_addr; - } else { + break; + default: panic("%s: Invalid ulock opcode %d addr %p", __FUNCTION__, ull->ull_opcode, (void*)ull); + break; } return; } diff --git a/bsd/kern/syscalls.master b/bsd/kern/syscalls.master index 811d42826..6878545e7 100644 --- a/bsd/kern/syscalls.master +++ b/bsd/kern/syscalls.master @@ -215,7 +215,7 @@ 139 AUE_FUTIMES ALL { int futimes(int fd, struct timeval *tptr); } 140 AUE_ADJTIME ALL { int adjtime(struct timeval *delta, struct timeval *olddelta); } 141 AUE_NULL ALL { int nosys(void); } { old getpeername } -142 AUE_SYSCTL ALL { int gethostuuid(unsigned char *uuid_buf, const struct timespec *timeoutp, int spi) NO_SYSCALL_STUB; } +142 AUE_SYSCTL ALL { int gethostuuid(unsigned char *uuid_buf, const struct timespec *timeoutp) NO_SYSCALL_STUB; } 143 AUE_NULL ALL { int nosys(void); } { old sethostid } 144 AUE_NULL ALL { int nosys(void); } { old getrlimit } 145 AUE_NULL ALL { int nosys(void); } { old setrlimit } @@ -314,7 +314,7 @@ ; 216-> 219 used to be mkcomplex and {f,l}statv variants. They are gone now. 216 AUE_NULL ALL { int open_dprotected_np(user_addr_t path, int flags, int class, int dpflags, int mode) NO_SYSCALL_STUB; } -217 AUE_NULL ALL { int nosys(void); } { old statv } +217 AUE_FSGETPATH_EXTENDED ALL { user_ssize_t fsgetpath_ext(user_addr_t buf, size_t bufsize, user_addr_t fsid, uint64_t objid, uint32_t options); } 218 AUE_NULL ALL { int nosys(void); } { old lstatv } 219 AUE_NULL ALL { int nosys(void); } { old fstatv } 220 AUE_GETATTRLIST ALL { int getattrlist(const char *path, struct attrlist *alist, void *attributeBuffer, size_t bufferSize, u_long options) NO_SYSCALL_STUB; } @@ -414,7 +414,7 @@ 271 AUE_SEMWAIT ALL { int sem_wait(sem_t *sem); } 272 AUE_SEMTRYWAIT ALL { int sem_trywait(sem_t *sem); } 273 AUE_SEMPOST ALL { int sem_post(sem_t *sem); } -274 AUE_SYSCTL ALL { int sysctlbyname(const char *name, size_t namelen, void *old, size_t *oldlenp, void *new, size_t newlen) NO_SYSCALL_STUB; } +274 AUE_SYSCTL ALL { int sys_sysctlbyname(const char *name, size_t namelen, void *old, size_t *oldlenp, void *new, size_t newlen) NO_SYSCALL_STUB; } 275 AUE_NULL ALL { int enosys(void); } { old sem_init } 276 AUE_NULL ALL { int enosys(void); } { old sem_destroy } 277 AUE_OPEN_EXTENDED_RWTC ALL { int open_extended(user_addr_t path, int flags, uid_t uid, gid_t gid, int mode, user_addr_t xsecurity) NO_SYSCALL_STUB; } @@ -787,7 +787,7 @@ 493 AUE_NULL ALL { int enosys(void); } #endif #if CONFIG_PERSONAS -494 AUE_PERSONA ALL { int persona(uint32_t operation, uint32_t flags, struct kpersona_info *info, uid_t *id, size_t *idlen) NO_SYSCALL_STUB; } +494 AUE_PERSONA ALL { int persona(uint32_t operation, uint32_t flags, struct kpersona_info *info, uid_t *id, size_t *idlen, char *path) NO_SYSCALL_STUB; } #else 494 AUE_NULL ALL { int enosys(void); } #endif @@ -842,3 +842,10 @@ 530 AUE_NULL ALL { int enosys(void); } #endif // CONFIG_WORKQUEUE 531 AUE_NULL ALL { uint64_t __mach_bridge_remote_time(uint64_t local_timestamp); } +#if CONFIG_COALITIONS +532 AUE_NULL ALL { int coalition_ledger(uint32_t operation, uint64_t *cid, void *buffer, size_t *bufsize) NO_SYSCALL_STUB; } +#else +532 AUE_NULL ALL { int enosys(void); } +#endif // CONFIG_COALITIONS +533 AUE_NULL ALL { int log_data(unsigned int tag, unsigned int flags, void *buffer, unsigned int size) NO_SYSCALL_STUB; } +534 AUE_NULL ALL { uint64_t memorystatus_available_memory(void) NO_SYSCALL_STUB; } diff --git a/bsd/kern/sysv_msg.c b/bsd/kern/sysv_msg.c index 4e141439d..c7352a6ce 100644 --- a/bsd/kern/sysv_msg.c +++ b/bsd/kern/sysv_msg.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2007 Apple Inc. All rights reserved. + * Copyright (c) 2000-2019 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -90,7 +90,7 @@ static void msg_freehdr(struct msg *msghdr); typedef int sy_call_t(struct proc *, void *, int *); /* XXX casting to (sy_call_t *) is bogus, as usual. */ -static sy_call_t *msgcalls[] = { +static sy_call_t* const msgcalls[] = { (sy_call_t *)msgctl, (sy_call_t *)msgget, (sy_call_t *)msgsnd, (sy_call_t *)msgrcv }; @@ -122,12 +122,12 @@ int msgmax, /* max chars in a message */ msgssz, /* size of a message segment (see notes above) */ msgseg; /* number of message segments */ struct msginfo msginfo = { - MSGMAX, /* = (MSGSSZ*MSGSEG) : max chars in a message */ - MSGMNI, /* = 40 : max message queue identifiers */ - MSGMNB, /* = 2048 : max chars in a queue */ - MSGTQL, /* = 40 : max messages in system */ - MSGSSZ, /* = 8 : size of a message segment (2^N long) */ - MSGSEG /* = 2048 : number of message segments */ + .msgmax = MSGMAX, /* = (MSGSSZ*MSGSEG) : max chars in a message */ + .msgmni = MSGMNI, /* = 40 : max message queue identifiers */ + .msgmnb = MSGMNB, /* = 2048 : max chars in a queue */ + .msgtql = MSGTQL, /* = 40 : max messages in system */ + .msgssz = MSGSSZ, /* = 8 : size of a message segment (2^N long) */ + .msgseg = MSGSEG /* = 2048 : number of message segments */ }; #endif /* __APPLE_API_PRIVATE */ diff --git a/bsd/kern/sysv_sem.c b/bsd/kern/sysv_sem.c index 795fd6d02..b6cad0b1b 100644 --- a/bsd/kern/sysv_sem.c +++ b/bsd/kern/sysv_sem.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2007 Apple Inc. All rights reserved. + * Copyright (c) 2000-2019 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -85,16 +85,16 @@ * These are not needed if we can make the semaphore pages swappable. */ static struct seminfo limitseminfo = { - SEMMAP, /* # of entries in semaphore map */ - SEMMNI, /* # of semaphore identifiers */ - SEMMNS, /* # of semaphores in system */ - SEMMNU, /* # of undo structures in system */ - SEMMSL, /* max # of semaphores per id */ - SEMOPM, /* max # of operations per semop call */ - SEMUME, /* max # of undo entries per process */ - SEMUSZ, /* size in bytes of undo structure */ - SEMVMX, /* semaphore maximum value */ - SEMAEM /* adjust on exit max value */ + .semmap = SEMMAP, /* # of entries in semaphore map */ + .semmni = SEMMNI, /* # of semaphore identifiers */ + .semmns = SEMMNS, /* # of semaphores in system */ + .semmnu = SEMMNU, /* # of undo structures in system */ + .semmsl = SEMMSL, /* max # of semaphores per id */ + .semopm = SEMOPM, /* max # of operations per semop call */ + .semume = SEMUME, /* max # of undo entries per process */ + .semusz = SEMUSZ, /* size in bytes of undo structure */ + .semvmx = SEMVMX, /* semaphore maximum value */ + .semaem = SEMAEM /* adjust on exit max value */ }; /* Current system allocations. We use this structure to track how many @@ -102,16 +102,16 @@ static struct seminfo limitseminfo = { * and not allocate the memory for them up front. */ struct seminfo seminfo = { - SEMMAP, /* Unused, # of entries in semaphore map */ - 0, /* # of semaphore identifiers */ - 0, /* # of semaphores in system */ - 0, /* # of undo entries in system */ - SEMMSL, /* max # of semaphores per id */ - SEMOPM, /* max # of operations per semop call */ - SEMUME, /* max # of undo entries per process */ - SEMUSZ, /* size in bytes of undo structure */ - SEMVMX, /* semaphore maximum value */ - SEMAEM /* adjust on exit max value */ + .semmap = SEMMAP, /* Unused, # of entries in semaphore map */ + .semmni = 0, /* # of semaphore identifiers */ + .semmns = 0, /* # of semaphores in system */ + .semmnu = 0, /* # of undo entries in system */ + .semmsl = SEMMSL, /* max # of semaphores per id */ + .semopm = SEMOPM, /* max # of operations per semop call */ + .semume = SEMUME, /* max # of undo entries per process */ + .semusz = SEMUSZ, /* size in bytes of undo structure */ + .semvmx = SEMVMX, /* semaphore maximum value */ + .semaem = SEMAEM /* adjust on exit max value */ }; @@ -121,7 +121,7 @@ static int semundo_adjust(struct proc *p, int *supidx, static void semundo_clear(int semid, int semnum); /* XXX casting to (sy_call_t *) is bogus, as usual. */ -static sy_call_t *semcalls[] = { +static sy_call_t* const semcalls[] = { (sy_call_t *)semctl, (sy_call_t *)semget, (sy_call_t *)semop }; diff --git a/bsd/kern/sysv_shm.c b/bsd/kern/sysv_shm.c index 99ad6602e..d31d1f57b 100644 --- a/bsd/kern/sysv_shm.c +++ b/bsd/kern/sysv_shm.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2007 Apple Inc. All rights reserved. + * Copyright (c) 2000-2019 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -124,7 +124,7 @@ static void shmid_ds_64to32(struct user_shmid_ds *in, struct user32_shmid_ds *ou static void shmid_ds_32to64(struct user32_shmid_ds *in, struct user_shmid_ds *out); /* XXX casting to (sy_call_t *) is bogus, as usual. */ -static sy_call_t *shmcalls[] = { +static sy_call_t* const shmcalls[] = { (sy_call_t *)shmat, (sy_call_t *)oshmctl, (sy_call_t *)shmdt, (sy_call_t *)shmget, (sy_call_t *)shmctl @@ -170,11 +170,11 @@ static int shm_delete_mapping(struct proc *, struct shmmap_state *, int); #define DEFAULT_SHMALL 1024 struct shminfo shminfo = { - DEFAULT_SHMMAX, - DEFAULT_SHMMIN, - DEFAULT_SHMMNI, - DEFAULT_SHMSEG, - DEFAULT_SHMALL + .shmmax = DEFAULT_SHMMAX, + .shmmin = DEFAULT_SHMMIN, + .shmmni = DEFAULT_SHMMNI, + .shmseg = DEFAULT_SHMSEG, + .shmall = DEFAULT_SHMALL }; #define SHMID_IS_VALID(x) ((x) >= 0) diff --git a/bsd/kern/trace_codes b/bsd/kern/trace_codes index 3cbac2c73..2da5b2b91 100644 --- a/bsd/kern/trace_codes +++ b/bsd/kern/trace_codes @@ -97,7 +97,7 @@ 0x10c0094 MSC_semaphore_wait_signal_trap 0x10c0098 MSC_semaphore_timedwait_trap 0x10c009c MSC_semaphore_timedwait_signal_trap -0x10c00a0 MSC_kern_invalid_#40 +0x10c00a0 MSC_mach_port_get_attributes_trap 0x10c00a4 MSC_mach_port_guard_trap 0x10c00a8 MSC_mach_port_unguard_trap 0x10c00ac MSC_mach_generate_activity_id @@ -133,8 +133,8 @@ 0x10c0124 MSC_kern_invalid_#73 0x10c0128 MSC_kern_invalid_#74 0x10c012c MSC_kern_invalid_#75 -0x10c0130 MSC_kern_invalid_#76 -0x10c0134 MSC_kern_invalid_#77 +0x10c0130 MSC_mach_port_type_trap +0x10c0134 MSC_mach_port_request_notification_trap 0x10c0138 MSC_kern_invalid_#78 0x10c013c MSC_kern_invalid_#79 0x10c0140 MSC_kern_invalid_#80 @@ -314,6 +314,9 @@ 0x14000D8 MACH_UNPROMOTED 0x14000DC MACH_PROMOTED_UPDATE 0x14000E0 MACH_QUIESCENT_COUNTER +0x14000E4 MACH_TURNSTILE_USER_CHANGE +0x14000E8 MACH_AMP_RECOMMENDATION_CHANGE +0x1400100 MACH_TURNSTILE_KERNEL_CHANGE 0x1500000 MACH_MSGID_INVALID 0x1600000 MTX_SLEEP 0x1600004 MTX_SLEEP_DEADLINE @@ -894,6 +897,98 @@ 0x3120008 DECMPFS_fetch_uncmp_data 0x3120010 DECMPFS_free_cmp_data 0x3120014 DECMPFS_file_is_cmp +0x3130000 VFS_devfsdirent_label_alloc +0x3130004 VFS_mount_label_alloc +0x3130008 VFS_label_alloc +0x313000C VFS_devfs_label_free +0x3130010 VFS_mount_label_free +0x3130014 VFS_label_free +0x3130018 VFS_label_copy +0x313001C VFS_devfs_label_copy +0x3130020 VFS_devfs_label_update +0x3130024 VFS_label_associate_devfs +0x3130028 VFS_label_associate_extattr +0x313002C VFS_label_associate_singlelabel +0x3130030 VFS_notify_create +0x3130034 VFS_notify_rename +0x3130038 VFS_notify_open +0x313003C VFS_notify_link +0x3130040 VFS_notify_deleteextattr +0x3130044 VFS_notify_setacl +0x3130048 VFS_notify_setattrlist +0x313004C VFS_notify_setextattr +0x3130050 VFS_notify_setflags +0x3130054 VFS_notify_setmode +0x3130058 VFS_notify_setowner +0x313005C VFS_notify_setutimes +0x3130060 VFS_notify_truncate +0x3130064 VFS_label_update_extattr +0x3130068 VFS_label_store +0x313006C VFS_cred_label_update_execve +0x3130070 VFS_cred_check_label_update_execve +0x3130074 VFS_check_access +0x3130078 VFS_check_chdir +0x313007C VFS_check_chroot +0x3130080 VFS_check_clone +0x3130084 VFS_check_create +0x3130088 VFS_check_unlink +0x313008C VFS_check_deleteacl +0x3130090 VFS_check_deleteextattr +0x3130094 VFS_check_exchangedata +0x3130098 VFS_check_getacl +0x313009C VFS_check_getattr +0x31300A0 VFS_check_getattrlist +0x31300A4 VFS_check_exec +0x31300A8 VFS_check_fsgetpath +0x31300AC VFS_check_signature +0x31300B0 VFS_check_getacl +0x31300B4 VFS_check_getextattr +0x31300B8 VFS_check_ioctl +0x31300BC VFS_check_kqfilter +0x31300C0 VFS_check_link +0x31300C4 VFS_check_listextattr +0x31300C8 VFS_check_lookup_preflight +0x31300CC VFS_check_lookup +0x31300D0 VFS_check_open +0x31300D4 VFS_check_read +0x31300D8 VFS_check_readdir +0x31300DC VFS_check_readlink +0x31300E0 VFS_check_label_update +0x31300E4 VFS_check_rename +0x31300E8 VFS_check_revoke +0x31300EC VFS_check_searchfs +0x31300F0 VFS_check_select +0x31300F4 VFS_check_setacl +0x31300F8 VFS_check_setattrlist +0x31300FC VFS_check_setextattr +0x3130100 VFS_check_setflags +0x3130104 VFS_check_setmode +0x3130108 VFS_check_setowner +0x313010C VFS_check_setutimes +0x3130110 VFS_check_stat +0x3130114 VFS_check_trigger_resolve +0x3130118 VFS_check_truncate +0x313011C VFS_check_write +0x3130120 VFS_check_uipc_bind +0x3130124 VFS_check_uipc_connect +0x3130128 VFS_label_update +0x313012C VFS_find_sigs +0x3130130 VFS_mount_label_associate +0x3130134 VFS_mount_check_mount +0x3130138 VFS_mount_check_mount_late +0x313013C VFS_mount_check_snapshot_create +0x3130140 VFS_mount_check_snapshot_delete +0x3130144 VFS_mount_check_snapshot_revert +0x3130148 VFS_mount_check_remount +0x313014C VFS_mount_check_umount +0x3130150 VFS_mount_check_getattr +0x3130154 VFS_mount_check_setattr +0x3130158 VFS_mount_check_stat +0x313015C VFS_mount_check_label_update +0x3130160 VFS_mount_check_fsctl +0x3130164 VFS_devfs_label_associate_device +0x3130168 VFS_devfs_label_associate_directory +0x313016C VFS_label_associate_fdesc 0x3CF0000 CP_OFFSET_IO 0x4010004 proc_exit 0x4010008 force_exit @@ -904,7 +999,7 @@ 0x4020008 MEMSTAT_jetsam 0x402000C MEMSTAT_jetsam_hiwat 0x4020010 MEMSTAT_freeze -0x4020014 MEMSTAT_latency_coalesce +0x4020014 MEMSTAT_freeze_scan 0x4020018 MEMSTAT_update 0x402001C MEMSTAT_idle_demote 0x4020020 MEMSTAT_clear_errors @@ -915,6 +1010,8 @@ 0x4020034 MEMSTAT_do_kill 0x4020038 MEMSTAT_change_priority 0x402003C MEMSTAT_fast_jetsam +0x4020040 MEMSTAT_compactor_run +0x4020044 MEMSTAT_freeze_disable 0x4030004 KEVENT_kq_processing_begin 0x4030008 KEVENT_kq_processing_end 0x403000c KEVENT_kqwq_processing_begin @@ -1634,6 +1731,7 @@ 0x263b0028 imp_thread_qos_workq_override 0x263c0028 imp_thread_qos_promote 0x263d0028 imp_thread_qos_ipc_override +0x263e0028 imp_thread_qos_servicer_override 0x27000000 PERF_PCEVENT 0x27001000 PERF_CPU_IDLE 0x27001100 PERF_CPU_IDLE_TIMER diff --git a/bsd/kern/tty.c b/bsd/kern/tty.c index 417357add..814898803 100644 --- a/bsd/kern/tty.c +++ b/bsd/kern/tty.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 1997-2017 Apple Inc. All rights reserved. + * Copyright (c) 1997-2019 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -1581,6 +1581,12 @@ ttioctl_locked(struct tty *tp, u_long cmd, caddr_t data, int flag, proc_t p) case TIOCGDRAINWAIT: *(int *)data = tp->t_timeout / hz; break; + case TIOCREVOKE: + if (ISSET(tp->t_state, TS_PGRPHUP)) { + tp->t_gen++; + wakeup(TSA_HUP_OR_INPUT(tp)); + } + break; default: error = ttcompat(tp, cmd, data, flag, p); goto out; @@ -2147,7 +2153,7 @@ loop: int m = cc[VMIN]; long t = cc[VTIME]; struct timeval timecopy; - struct timeval etime = {0, 0}; /* protected by !has_etime */ + struct timeval etime = {.tv_sec = 0, .tv_usec = 0}; /* protected by !has_etime */ /* * Check each of the four combinations. @@ -2806,6 +2812,16 @@ ttyecho(int c, struct tty *tp) (void)ttyoutput(c, tp); } +static void +ttwakeup_knote(struct selinfo *sip, long hint) +{ + if ((sip->si_flags & SI_KNPOSTING) == 0) { + sip->si_flags |= SI_KNPOSTING; + KNOTE(&sip->si_note, hint); + sip->si_flags &= ~SI_KNPOSTING; + } +} + /* * Wake up any readers on a tty. @@ -2818,7 +2834,7 @@ ttwakeup(struct tty *tp) TTY_LOCK_OWNED(tp); /* debug assert */ selwakeup(&tp->t_rsel); - KNOTE(&tp->t_rsel.si_note, 1); + ttwakeup_knote(&tp->t_rsel, 0); if (ISSET(tp->t_state, TS_ASYNC)) { /* * XXX: Callers may not revalidate it the tty is closed @@ -2850,7 +2866,7 @@ ttwwakeup(struct tty *tp) if (tp->t_outq.c_cc <= tp->t_lowat) { selwakeup(&tp->t_wsel); - KNOTE(&tp->t_wsel.si_note, 1); + ttwakeup_knote(&tp->t_wsel, 0); } if (ISSET(tp->t_state, TS_BUSY | TS_SO_OCOMPLETE) == TS_SO_OCOMPLETE && tp->t_outq.c_cc == 0) { @@ -3030,7 +3046,6 @@ ttyinfo_locked(struct tty *tp) break; } calcru(pick, &utime, &stime, NULL); - proc_rele(pick); /* Print command, pid, state, utime, and stime */ ttyprintf(tp, " cmd: %s %d %s %ld.%02du %ld.%02ds\n", @@ -3039,6 +3054,8 @@ ttyinfo_locked(struct tty *tp) state, (long)utime.tv_sec, utime.tv_usec / 10000, (long)stime.tv_sec, stime.tv_usec / 10000); + + proc_rele(pick); tp->t_rocount = 0; } @@ -3311,11 +3328,11 @@ isctty_sp(proc_t p, struct tty *tp, struct session *sessp) } -static int filt_ttyattach(struct knote *kn, struct kevent_internal_s *kev); +static int filt_ttyattach(struct knote *kn, struct kevent_qos_s *kev); static void filt_ttydetach(struct knote *kn); static int filt_ttyevent(struct knote *kn, long hint); -static int filt_ttytouch(struct knote *kn, struct kevent_internal_s *kev); -static int filt_ttyprocess(struct knote *kn, struct filt_process_s *data, struct kevent_internal_s *kev); +static int filt_ttytouch(struct knote *kn, struct kevent_qos_s *kev); +static int filt_ttyprocess(struct knote *kn, struct kevent_qos_s *kev); SECURITY_READ_ONLY_EARLY(struct filterops) tty_filtops = { .f_isfd = 1, @@ -3331,31 +3348,35 @@ SECURITY_READ_ONLY_EARLY(struct filterops) tty_filtops = { * or written. */ static int -filt_tty_common(struct knote *kn, struct tty *tp) +filt_tty_common(struct knote *kn, struct kevent_qos_s *kev, struct tty *tp) { int retval = 0; + int64_t data = 0; TTY_LOCK_OWNED(tp); /* debug assert */ - if (tp->t_state & TS_ZOMBIE) { - kn->kn_flags |= EV_EOF; - return 1; - } - - switch (knote_get_seltype(kn)) { - case FREAD: - retval = ttnread(tp); + switch (kn->kn_filter) { + case EVFILT_READ: + /* + * ttnread can change the tty state, + * hence must be done upfront, before any other check. + */ + data = ttnread(tp); + retval = (data != 0); break; - case FWRITE: + case EVFILT_WRITE: if ((tp->t_outq.c_cc <= tp->t_lowat) && (tp->t_state & TS_CONNECTED)) { - retval = tp->t_hiwat - tp->t_outq.c_cc; + data = tp->t_hiwat - tp->t_outq.c_cc; + retval = (data != 0); } break; + default: + panic("tty kevent: unexpected filter: %d, kn = %p, tty = %p", + kn->kn_filter, kn, tp); + break; } - kn->kn_data = retval; - /* * TODO(mwidmann, jandrus): For native knote low watermark support, * check the kn_sfflags for NOTE_LOWAT and check against kn_sdata. @@ -3364,6 +3385,16 @@ filt_tty_common(struct knote *kn, struct tty *tp) * (kn->kn_data >= kn->kn_sdata) : kn->kn_data; */ + if (tp->t_state & TS_ZOMBIE) { + kn->kn_flags |= EV_EOF; + } + if (kn->kn_flags & EV_EOF) { + retval = 1; + } + if (retval && kev) { + knote_fill_kevent(kn, kev, data); + } + return retval; } @@ -3415,24 +3446,6 @@ tty_from_knote(struct knote *kn) return (struct tty *)kn->kn_hook; } -/* - * Try to lock the TTY structure associated with a knote. - * - * On success, this function returns a locked TTY structure. Otherwise, NULL is - * returned. - */ -__attribute__((warn_unused_result)) -static struct tty * -tty_lock_from_knote(struct knote *kn) -{ - struct tty *tp = tty_from_knote(kn); - if (tp) { - tty_lock(tp); - } - - return tp; -} - /* * Set the knote's struct tty to the kn_hook field. * @@ -3538,7 +3551,7 @@ out: } static int -filt_ttyattach(struct knote *kn, __unused struct kevent_internal_s *kev) +filt_ttyattach(struct knote *kn, __unused struct kevent_qos_s *kev) { int selres = 0; struct tty *tp; @@ -3566,19 +3579,18 @@ filt_ttyattach(struct knote *kn, __unused struct kevent_internal_s *kev) /* * Attach the knote to selinfo's klist. */ - tp = tty_lock_from_knote(kn); - if (!tp) { - knote_set_error(kn, ENOENT); - return 0; - } + tp = tty_from_knote(kn); + tty_lock(tp); - switch (knote_get_seltype(kn)) { - case FREAD: + switch (kn->kn_filter) { + case EVFILT_READ: KNOTE_ATTACH(&tp->t_rsel.si_note, kn); break; - case FWRITE: + case EVFILT_WRITE: KNOTE_ATTACH(&tp->t_wsel.si_note, kn); break; + default: + panic("invalid knote %p attach, filter: %d", kn, kn->kn_filter); } tty_unlock(tp); @@ -3589,28 +3601,22 @@ filt_ttyattach(struct knote *kn, __unused struct kevent_internal_s *kev) static void filt_ttydetach(struct knote *kn) { - struct tty *tp; + struct tty *tp = tty_from_knote(kn); - tp = tty_lock_from_knote(kn); - if (!tp) { - knote_set_error(kn, ENOENT); - return; - } + tty_lock(tp); - struct selinfo *si = NULL; - switch (knote_get_seltype(kn)) { - case FREAD: - si = &tp->t_rsel; + switch (kn->kn_filter) { + case EVFILT_READ: + KNOTE_DETACH(&tp->t_rsel.si_note, kn); break; - case FWRITE: - si = &tp->t_wsel; + case EVFILT_WRITE: + KNOTE_DETACH(&tp->t_wsel.si_note, kn); + break; + default: + panic("invalid knote %p detach, filter: %d", kn, kn->kn_filter); break; - /* knote_get_seltype will panic on default */ } - KNOTE_DETACH(&si->si_note, kn); - kn->kn_hook = NULL; - tty_unlock(tp); ttyfree(tp); } @@ -3618,52 +3624,34 @@ filt_ttydetach(struct knote *kn) static int filt_ttyevent(struct knote *kn, long hint) { + struct tty *tp = tty_from_knote(kn); int ret; - struct tty *tp; - bool revoked = hint & NOTE_REVOKE; - hint &= ~NOTE_REVOKE; - - tp = tty_from_knote(kn); - if (!tp) { - knote_set_error(kn, ENOENT); - return 0; - } - if (!hint) { - tty_lock(tp); - } + TTY_LOCK_OWNED(tp); - if (revoked) { + if (hint & NOTE_REVOKE) { kn->kn_flags |= EV_EOF | EV_ONESHOT; ret = 1; } else { - ret = filt_tty_common(kn, tp); - } - - if (!hint) { - tty_unlock(tp); + ret = filt_tty_common(kn, NULL, tp); } return ret; } static int -filt_ttytouch(struct knote *kn, struct kevent_internal_s *kev) +filt_ttytouch(struct knote *kn, struct kevent_qos_s *kev) { - struct tty *tp; + struct tty *tp = tty_from_knote(kn); int res = 0; - tp = tty_lock_from_knote(kn); - if (!tp) { - knote_set_error(kn, ENOENT); - return 0; - } + tty_lock(tp); kn->kn_sdata = kev->data; kn->kn_sfflags = kev->fflags; if (kn->kn_vnode_kqok) { - res = filt_tty_common(kn, tp); + res = filt_tty_common(kn, NULL, tp); } tty_unlock(tp); @@ -3672,26 +3660,14 @@ filt_ttytouch(struct knote *kn, struct kevent_internal_s *kev) } static int -filt_ttyprocess(struct knote *kn, __unused struct filt_process_s *data, struct kevent_internal_s *kev) +filt_ttyprocess(struct knote *kn, struct kevent_qos_s *kev) { - struct tty *tp; + struct tty *tp = tty_from_knote(kn); int res; - tp = tty_lock_from_knote(kn); - if (!tp) { - knote_set_error(kn, ENOENT); - return 0; - } - - res = filt_tty_common(kn, tp); + tty_lock(tp); - if (res) { - *kev = kn->kn_kevent; - if (kn->kn_flags & EV_CLEAR) { - kn->kn_fflags = 0; - kn->kn_data = 0; - } - } + res = filt_tty_common(kn, kev, tp); tty_unlock(tp); diff --git a/bsd/kern/tty_compat.c b/bsd/kern/tty_compat.c index 4bd2d0273..ac452a3d9 100644 --- a/bsd/kern/tty_compat.c +++ b/bsd/kern/tty_compat.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2019 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -99,25 +99,25 @@ static int ttcompatspeedtab(int speed, struct speedtab *table); */ static struct speedtab compatspeeds[] = { #define MAX_SPEED 17 - { 115200, 17 }, - { 57600, 16 }, - { 38400, 15 }, - { 19200, 14 }, - { 9600, 13 }, - { 4800, 12 }, - { 2400, 11 }, - { 1800, 10 }, - { 1200, 9 }, - { 600, 8 }, - { 300, 7 }, - { 200, 6 }, - { 150, 5 }, - { 134, 4 }, - { 110, 3 }, - { 75, 2 }, - { 50, 1 }, - { 0, 0 }, - { -1, -1 }, + { .sp_speed = 115200, .sp_code = 17 }, + { .sp_speed = 57600, .sp_code = 16 }, + { .sp_speed = 38400, .sp_code = 15 }, + { .sp_speed = 19200, .sp_code = 14 }, + { .sp_speed = 9600, .sp_code = 13 }, + { .sp_speed = 4800, .sp_code = 12 }, + { .sp_speed = 2400, .sp_code = 11 }, + { .sp_speed = 1800, .sp_code = 10 }, + { .sp_speed = 1200, .sp_code = 9 }, + { .sp_speed = 600, .sp_code = 8 }, + { .sp_speed = 300, .sp_code = 7 }, + { .sp_speed = 200, .sp_code = 6 }, + { .sp_speed = 150, .sp_code = 5 }, + { .sp_speed = 134, .sp_code = 4 }, + { .sp_speed = 110, .sp_code = 3 }, + { .sp_speed = 75, .sp_code = 2 }, + { .sp_speed = 50, .sp_code = 1 }, + { .sp_speed = 0, .sp_code = 0 }, + { .sp_speed = -1, .sp_code = -1 }, }; static int compatspcodes[] = { 0, 50, 75, 110, 134, 150, 200, 300, 600, 1200, diff --git a/bsd/kern/tty_dev.c b/bsd/kern/tty_dev.c index ccfd752fb..302b76aa5 100644 --- a/bsd/kern/tty_dev.c +++ b/bsd/kern/tty_dev.c @@ -506,6 +506,16 @@ out: return; } +static void +ptcwakeup_knote(struct selinfo *sip, long hint) +{ + if ((sip->si_flags & SI_KNPOSTING) == 0) { + sip->si_flags |= SI_KNPOSTING; + KNOTE(&sip->si_note, hint); + sip->si_flags &= ~SI_KNPOSTING; + } +} + /* * Locks: Assumes tty_lock() is held over this call. */ @@ -520,12 +530,12 @@ ptcwakeup(struct tty *tp, int flag) if (flag & FREAD) { selwakeup(&pti->pt_selr); wakeup(TSA_PTC_READ(tp)); - KNOTE(&pti->pt_selr.si_note, 1); + ptcwakeup_knote(&pti->pt_selr, 1); } if (flag & FWRITE) { selwakeup(&pti->pt_selw); wakeup(TSA_PTC_WRITE(tp)); - KNOTE(&pti->pt_selw.si_note, 1); + ptcwakeup_knote(&pti->pt_selw, 1); } } @@ -1011,6 +1021,9 @@ block: goto again; } +/* + * ptyioctl: Assumes dev was opened and lock was initilized + */ __private_extern__ int ptyioctl(dev_t dev, u_long cmd, caddr_t data, int flag, struct proc *p) { @@ -1020,9 +1033,10 @@ ptyioctl(dev_t dev, u_long cmd, caddr_t data, int flag, struct proc *p) int stop, error = 0; int allow_ext_ioctl = 1; - if (pti == NULL) { + if (pti == NULL || pti->pt_tty == NULL) { return ENXIO; } + tp = pti->pt_tty; tty_lock(tp); diff --git a/bsd/kern/tty_ptmx.c b/bsd/kern/tty_ptmx.c index 6da6e641a..8f0ba28e4 100644 --- a/bsd/kern/tty_ptmx.c +++ b/bsd/kern/tty_ptmx.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 1997-2013 Apple Inc. All rights reserved. + * Copyright (c) 1997-2019 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -118,18 +118,38 @@ extern d_select_t ptcselect; static int ptmx_major; /* dynamically assigned major number */ static struct cdevsw ptmx_cdev = { - ptcopen, ptcclose, ptcread, ptcwrite, - ptyioctl, ptcstop, ptcreset, 0, - ptcselect, eno_mmap, eno_strat, eno_getc, - eno_putc, D_TTY + .d_open = ptcopen, + .d_close = ptcclose, + .d_read = ptcread, + .d_write = ptcwrite, + .d_ioctl = ptyioctl, + .d_stop = ptcstop, + .d_reset = ptcreset, + .d_ttys = NULL, + .d_select = ptcselect, + .d_mmap = eno_mmap, + .d_strategy = eno_strat, + .d_reserved_1 = eno_getc, + .d_reserved_2 = eno_putc, + .d_type = D_TTY }; static int ptsd_major; /* dynamically assigned major number */ static struct cdevsw ptsd_cdev = { - ptsopen, ptsclose, ptsread, ptswrite, - ptyioctl, ptsstop, ptsreset, 0, - ptsselect, eno_mmap, eno_strat, eno_getc, - eno_putc, D_TTY + .d_open = ptsopen, + .d_close = ptsclose, + .d_read = ptsread, + .d_write = ptswrite, + .d_ioctl = ptyioctl, + .d_stop = ptsstop, + .d_reset = ptsreset, + .d_ttys = NULL, + .d_select = ptsselect, + .d_mmap = eno_mmap, + .d_strategy = eno_strat, + .d_reserved_1 = eno_getc, + .d_reserved_2 = eno_putc, + .d_type = D_TTY }; /* @@ -467,8 +487,8 @@ ptmx_clone(__unused dev_t dev, int action) int ptsd_kqfilter(dev_t dev, struct knote *kn); static void ptsd_kqops_detach(struct knote *); static int ptsd_kqops_event(struct knote *, long); -static int ptsd_kqops_touch(struct knote *kn, struct kevent_internal_s *kev); -static int ptsd_kqops_process(struct knote *kn, struct filt_process_s *data, struct kevent_internal_s *kev); +static int ptsd_kqops_touch(struct knote *kn, struct kevent_qos_s *kev); +static int ptsd_kqops_process(struct knote *kn, struct kevent_qos_s *kev); SECURITY_READ_ONLY_EARLY(struct filterops) ptsd_kqops = { .f_isfd = 1, @@ -491,10 +511,7 @@ SECURITY_READ_ONLY_EARLY(struct filterops) ptsd_kqops = { static void ptsd_kqops_detach(struct knote *kn) { - struct tty *tp; - - tp = kn->kn_hook; - assert(tp != NULL); + struct tty *tp = kn->kn_hook; tty_lock(tp); @@ -507,42 +524,41 @@ ptsd_kqops_detach(struct knote *kn) case EVFILT_READ: KNOTE_DETACH(&tp->t_rsel.si_note, kn); break; - case EVFILT_WRITE: KNOTE_DETACH(&tp->t_wsel.si_note, kn); break; - default: panic("invalid knote %p detach, filter: %d", kn, kn->kn_filter); break; } } - kn->kn_hook = NULL; tty_unlock(tp); - ttyfree(tp); } static int -ptsd_kqops_common(struct knote *kn, struct tty *tp) +ptsd_kqops_common(struct knote *kn, struct kevent_qos_s *kev, struct tty *tp) { int retval = 0; + int64_t data = 0; TTY_LOCK_OWNED(tp); switch (kn->kn_filter) { case EVFILT_READ: - kn->kn_data = ttnread(tp); - if (kn->kn_data > 0) { - retval = 1; - } + /* + * ttnread can change the tty state, + * hence must be done upfront, before any other check. + */ + data = ttnread(tp); + retval = (data > 0); break; case EVFILT_WRITE: if ((tp->t_outq.c_cc <= tp->t_lowat) && (tp->t_state & TS_CONNECTED)) { - kn->kn_data = tp->t_outq.c_cn - tp->t_outq.c_cc; + data = tp->t_outq.c_cn - tp->t_outq.c_cc; retval = 1; } break; @@ -555,9 +571,13 @@ ptsd_kqops_common(struct knote *kn, struct tty *tp) if (tp->t_state & TS_ZOMBIE) { kn->kn_flags |= EV_EOF; + } + if (kn->kn_flags & EV_EOF) { retval = 1; } - + if (retval && kev) { + knote_fill_kevent(kn, kev, data); + } return retval; } @@ -566,35 +586,25 @@ ptsd_kqops_event(struct knote *kn, long hint) { struct tty *tp = kn->kn_hook; int ret; - bool revoked = hint & NOTE_REVOKE; - hint &= ~NOTE_REVOKE; - if (!hint) { - tty_lock(tp); - } + TTY_LOCK_OWNED(tp); - if (revoked) { + if (hint & NOTE_REVOKE) { kn->kn_flags |= EV_EOF | EV_ONESHOT; ret = 1; } else { - ret = ptsd_kqops_common(kn, tp); - } - - if (!hint) { - tty_unlock(tp); + ret = ptsd_kqops_common(kn, NULL, tp); } return ret; } static int -ptsd_kqops_touch(struct knote *kn, struct kevent_internal_s *kev) +ptsd_kqops_touch(struct knote *kn, struct kevent_qos_s *kev) { - struct tty *tp; + struct tty *tp = kn->kn_hook; int ret; - tp = kn->kn_hook; - tty_lock(tp); /* accept new kevent state */ @@ -602,7 +612,7 @@ ptsd_kqops_touch(struct knote *kn, struct kevent_internal_s *kev) kn->kn_sdata = kev->data; /* recapture fired state of knote */ - ret = ptsd_kqops_common(kn, tp); + ret = ptsd_kqops_common(kn, NULL, tp); tty_unlock(tp); @@ -610,21 +620,13 @@ ptsd_kqops_touch(struct knote *kn, struct kevent_internal_s *kev) } static int -ptsd_kqops_process(struct knote *kn, __unused struct filt_process_s *data, - struct kevent_internal_s *kev) +ptsd_kqops_process(struct knote *kn, struct kevent_qos_s *kev) { struct tty *tp = kn->kn_hook; int ret; tty_lock(tp); - ret = ptsd_kqops_common(kn, tp); - if (ret) { - *kev = kn->kn_kevent; - if (kn->kn_flags & EV_CLEAR) { - kn->kn_fflags = 0; - kn->kn_data = 0; - } - } + ret = ptsd_kqops_common(kn, kev, tp); tty_unlock(tp); return ret; @@ -672,7 +674,7 @@ ptsd_kqfilter(dev_t dev, struct knote *kn) } /* capture current event state */ - ret = ptsd_kqops_common(kn, tp); + ret = ptsd_kqops_common(kn, NULL, tp); tty_unlock(tp); @@ -688,10 +690,12 @@ ptsd_revoke_knotes(__unused int minor, struct tty *tp) tty_lock(tp); ttwakeup(tp); - KNOTE(&tp->t_rsel.si_note, NOTE_REVOKE | 1 /* the lock is already held */); + assert((tp->t_rsel.si_flags & SI_KNPOSTING) == 0); + KNOTE(&tp->t_rsel.si_note, NOTE_REVOKE); ttwwakeup(tp); - KNOTE(&tp->t_wsel.si_note, NOTE_REVOKE | 1); + assert((tp->t_wsel.si_flags & SI_KNPOSTING) == 0); + KNOTE(&tp->t_wsel.si_note, NOTE_REVOKE); tty_unlock(tp); } @@ -706,9 +710,10 @@ ptsd_revoke_knotes(__unused int minor, struct tty *tp) int ptmx_kqfilter(dev_t dev, struct knote *kn); static void ptmx_kqops_detach(struct knote *); static int ptmx_kqops_event(struct knote *, long); -static int ptmx_kqops_touch(struct knote *kn, struct kevent_internal_s *kev); -static int ptmx_kqops_process(struct knote *kn, struct filt_process_s *data, struct kevent_internal_s *kev); -static int ptmx_kqops_common(struct knote *kn, struct ptmx_ioctl *pti, struct tty *tp); +static int ptmx_kqops_touch(struct knote *kn, struct kevent_qos_s *kev); +static int ptmx_kqops_process(struct knote *kn, struct kevent_qos_s *kev); +static int ptmx_kqops_common(struct knote *kn, struct kevent_qos_s *kev, + struct ptmx_ioctl *pti, struct tty *tp); SECURITY_READ_ONLY_EARLY(struct filterops) ptmx_kqops = { .f_isfd = 1, @@ -728,8 +733,7 @@ ptmx_knote_ioctl(struct knote *kn) static struct tty * ptmx_knote_tty(struct knote *kn) { - struct ptmx_ioctl *pti = kn->kn_hook; - return pti->pt_tty; + return ptmx_knote_ioctl(kn)->pt_tty; } int @@ -754,6 +758,8 @@ ptmx_kqfilter(dev_t dev, struct knote *kn) tty_lock(tp); kn->kn_filtid = EVFILTID_PTMX; + /* the tty will be freed when detaching the knote */ + ttyhold(tp); kn->kn_hook = pti; /* @@ -775,10 +781,8 @@ ptmx_kqfilter(dev_t dev, struct knote *kn) } /* capture current event state */ - ret = ptmx_kqops_common(kn, pti, tp); + ret = ptmx_kqops_common(kn, NULL, pti, tp); - /* take a reference on the TTY */ - ttyhold(tp); tty_unlock(tp); return ret; @@ -790,49 +794,39 @@ ptmx_kqops_detach(struct knote *kn) struct ptmx_ioctl *pti = kn->kn_hook; struct tty *tp = pti->pt_tty; - assert(tp != NULL); - tty_lock(tp); switch (kn->kn_filter) { case EVFILT_READ: KNOTE_DETACH(&pti->pt_selr.si_note, kn); break; - case EVFILT_WRITE: KNOTE_DETACH(&pti->pt_selw.si_note, kn); break; - default: panic("invalid knote %p detach, filter: %d", kn, kn->kn_filter); break; } - kn->kn_hook = NULL; tty_unlock(tp); - ttyfree(tp); } static int -ptmx_kqops_common(struct knote *kn, struct ptmx_ioctl *pti, struct tty *tp) +ptmx_kqops_common(struct knote *kn, struct kevent_qos_s *kev, + struct ptmx_ioctl *pti, struct tty *tp) { int retval = 0; + int64_t data = 0; TTY_LOCK_OWNED(tp); - /* disconnects should force a wakeup (EOF) */ - if (!(tp->t_state & TS_CONNECTED)) { - kn->kn_flags |= EV_EOF; - return 1; - } - switch (kn->kn_filter) { case EVFILT_READ: /* there's data on the TTY and it's not stopped */ if (tp->t_outq.c_cc && !(tp->t_state & TS_TTSTOP)) { - retval = tp->t_outq.c_cc; - kn->kn_data = retval; + data = tp->t_outq.c_cc; + retval = data > 0; } else if (((pti->pt_flags & PF_PKT) && pti->pt_send) || ((pti->pt_flags & PF_UCNTL) && pti->pt_ucntl)) { retval = 1; @@ -861,11 +855,16 @@ ptmx_kqops_common(struct knote *kn, struct ptmx_ioctl *pti, struct tty *tp) break; } - if (tp->t_state & TS_ZOMBIE) { + /* disconnects should force a wakeup (EOF) */ + if (!(tp->t_state & TS_CONNECTED) || (tp->t_state & TS_ZOMBIE)) { kn->kn_flags |= EV_EOF; + } + if (kn->kn_flags & EV_EOF) { retval = 1; } - + if (retval && kev) { + knote_fill_kevent(kn, kev, data); + } return retval; } @@ -875,29 +874,21 @@ ptmx_kqops_event(struct knote *kn, long hint) struct ptmx_ioctl *pti = ptmx_knote_ioctl(kn); struct tty *tp = ptmx_knote_tty(kn); int ret; - bool revoked = hint & NOTE_REVOKE; - hint &= ~NOTE_REVOKE; - if (!hint) { - tty_lock(tp); - } + TTY_LOCK_OWNED(tp); - if (revoked) { + if (hint & NOTE_REVOKE) { kn->kn_flags |= EV_EOF | EV_ONESHOT; ret = 1; } else { - ret = ptmx_kqops_common(kn, pti, tp); - } - - if (!hint) { - tty_unlock(tp); + ret = ptmx_kqops_common(kn, NULL, pti, tp); } return ret; } static int -ptmx_kqops_touch(struct knote *kn, struct kevent_internal_s *kev) +ptmx_kqops_touch(struct knote *kn, struct kevent_qos_s *kev) { struct ptmx_ioctl *pti = ptmx_knote_ioctl(kn); struct tty *tp = ptmx_knote_tty(kn); @@ -910,7 +901,7 @@ ptmx_kqops_touch(struct knote *kn, struct kevent_internal_s *kev) kn->kn_sdata = kev->data; /* recapture fired state of knote */ - ret = ptmx_kqops_common(kn, pti, tp); + ret = ptmx_kqops_common(kn, NULL, pti, tp); tty_unlock(tp); @@ -918,22 +909,14 @@ ptmx_kqops_touch(struct knote *kn, struct kevent_internal_s *kev) } static int -ptmx_kqops_process(struct knote *kn, __unused struct filt_process_s *data, - struct kevent_internal_s *kev) +ptmx_kqops_process(struct knote *kn, struct kevent_qos_s *kev) { struct ptmx_ioctl *pti = ptmx_knote_ioctl(kn); struct tty *tp = ptmx_knote_tty(kn); int ret; tty_lock(tp); - ret = ptmx_kqops_common(kn, pti, tp); - if (ret) { - *kev = kn->kn_kevent; - if (kn->kn_flags & EV_CLEAR) { - kn->kn_fflags = 0; - kn->kn_data = 0; - } - } + ret = ptmx_kqops_common(kn, kev, pti, tp); tty_unlock(tp); return ret; diff --git a/bsd/kern/ubc_subr.c b/bsd/kern/ubc_subr.c index 01c615b28..cc16291df 100644 --- a/bsd/kern/ubc_subr.c +++ b/bsd/kern/ubc_subr.c @@ -82,6 +82,8 @@ extern kern_return_t memory_object_pages_resident(memory_object_control_t, extern kern_return_t memory_object_signed(memory_object_control_t control, boolean_t is_signed); extern boolean_t memory_object_is_signed(memory_object_control_t); +extern void memory_object_mark_trusted( + memory_object_control_t control); /* XXX Same for those. */ @@ -1937,6 +1939,33 @@ ubc_map(vnode_t vp, int flags) if (vnode_ref_ext(vp, 0, VNODE_REF_FORCE)) { panic("%s : VNODE_REF_FORCE failed\n", __FUNCTION__); } + /* + * Vnodes that are on "unreliable" media (like disk + * images, network filesystems, 3rd-party filesystems, + * and possibly external devices) could see their + * contents be changed via the backing store without + * triggering copy-on-write, so we can't fully rely + * on copy-on-write and might have to resort to + * copy-on-read to protect "privileged" processes and + * prevent privilege escalation. + * + * The root filesystem is considered "reliable" because + * there's not much point in trying to protect + * ourselves from such a vulnerability and the extra + * cost of copy-on-read (CPU time and memory pressure) + * could result in some serious regressions. + */ + if (vp->v_mount != NULL && + ((vp->v_mount->mnt_flag & MNT_ROOTFS) || + vnode_on_reliable_media(vp))) { + /* + * This vnode is deemed "reliable" so mark + * its VM object as "trusted". + */ + memory_object_mark_trusted(uip->ui_control); + } else { +// printf("BUGGYCOW: %s:%d vp %p \"%s\" in mnt %p \"%s\" is untrusted\n", __FUNCTION__, __LINE__, vp, vp->v_name, vp->v_mount, vp->v_mount->mnt_vnodecovered->v_name); + } } } return error; diff --git a/bsd/kern/uipc_domain.c b/bsd/kern/uipc_domain.c index 6b798e832..9321399dc 100644 --- a/bsd/kern/uipc_domain.c +++ b/bsd/kern/uipc_domain.c @@ -107,6 +107,7 @@ decl_lck_mtx_data(static, domain_proto_mtx); decl_lck_mtx_data(static, domain_timeout_mtx); u_int64_t _net_uptime; +u_int64_t _net_uptime_ms; #if (DEVELOPMENT || DEBUG) @@ -1003,6 +1004,10 @@ net_update_uptime_with_time(const struct timeval *tvp) if (tvp->tv_usec > 500000) { _net_uptime++; } + + /* update milliseconds variant */ + _net_uptime_ms = (((u_int64_t)tvp->tv_sec * 1000) + + ((u_int64_t)tvp->tv_usec / 1000)); } void @@ -1044,6 +1049,16 @@ net_uptime(void) return _net_uptime; } +u_int64_t +net_uptime_ms(void) +{ + if (_net_uptime_ms == 0) { + net_update_uptime(); + } + + return _net_uptime_ms; +} + void domain_proto_mtx_lock_assert_held(void) { diff --git a/bsd/kern/uipc_mbuf.c b/bsd/kern/uipc_mbuf.c index c5b86b7b8..db7acd2ed 100644 --- a/bsd/kern/uipc_mbuf.c +++ b/bsd/kern/uipc_mbuf.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 1998-2018 Apple Inc. All rights reserved. + * Copyright (c) 1998-2019 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -998,7 +998,7 @@ struct omb_stat *omb_stat; /* For backwards compatibility */ #define MB_STAT_SIZE(n) \ __builtin_offsetof(mb_stat_t, mbs_class[n]) #define OMB_STAT_SIZE(n) \ - ((size_t)(&((struct omb_stat *)0)->mbs_class[n])) + __builtin_offsetof(struct omb_stat, mbs_class[n]) /* * The legacy structure holding all of the mbuf allocation statistics. @@ -1038,7 +1038,7 @@ typedef struct { static mbuf_mtypes_t *mbuf_mtypes; /* per-CPU statistics */ #define MBUF_MTYPES_SIZE(n) \ - ((size_t)(&((mbuf_mtypes_t *)0)->mbs_cpu[n])) + __builtin_offsetof(mbuf_mtypes_t, mbs_cpu[n]) #define MTYPES_CPU(p) \ ((mtypes_cpu_t *)(void *)((char *)(p) + MBUF_MTYPES_SIZE(cpu_number()))) @@ -1268,7 +1268,7 @@ m_incref(struct mbuf *m) do { old = *addr; new = old + 1; - ASSERT(new != 0); + VERIFY(new != 0); } while (!OSCompareAndSwap16(old, new, addr)); /* @@ -1290,7 +1290,7 @@ m_decref(struct mbuf *m) do { old = *addr; new = old - 1; - ASSERT(old != 0); + VERIFY(old != 0); } while (!OSCompareAndSwap16(old, new, addr)); return new; @@ -4686,7 +4686,7 @@ fail: mcache_free_ext(rcp, rmp_list); } if (wantall && top != NULL) { - m_freem(top); + m_freem_list(top); return NULL; } *numlist = num; @@ -5576,6 +5576,8 @@ m_copyup(struct mbuf *n, int len, int dstoff) struct mbuf *m; int count, space; + VERIFY(len >= 0 && dstoff >= 0); + if (len > (MHLEN - dstoff)) { goto bad; } @@ -6348,6 +6350,9 @@ m_dup(struct mbuf *m, int how) (void) m_free(n); goto nospace; } + } else { + VERIFY((copyhdr == 1 && m->m_len <= MHLEN) || + (copyhdr == 0 && m->m_len <= MLEN)); } *np = n; if (copyhdr) { @@ -7455,6 +7460,7 @@ mcl_audit_scratch(mcache_audit_t *mca) } } +__abortlike static void mcl_audit_mcheck_panic(struct mbuf *m) { @@ -7535,7 +7541,7 @@ mleak_logger(u_int32_t num, mcache_obj_t *addr, boolean_t alloc) if ((temp % mleak_table.mleak_sample_factor) == 0 && addr != NULL) { uintptr_t bt[MLEAK_STACK_DEPTH]; - int logged = backtrace(bt, MLEAK_STACK_DEPTH); + int logged = backtrace(bt, MLEAK_STACK_DEPTH, NULL); mleak_log(bt, addr, logged, num); } } @@ -8800,7 +8806,7 @@ mtracelarge_register(size_t size) uintptr_t bt[MLEAK_STACK_DEPTH]; unsigned int depth; - depth = backtrace(bt, MLEAK_STACK_DEPTH); + depth = backtrace(bt, MLEAK_STACK_DEPTH, NULL); /* Check if this entry is already on the list. */ for (i = 0; i < MTRACELARGE_NUM_TRACES; i++) { trace = &mtracelarge_table[i]; diff --git a/bsd/kern/uipc_mbuf2.c b/bsd/kern/uipc_mbuf2.c index 90bef57a0..2efefbc33 100644 --- a/bsd/kern/uipc_mbuf2.c +++ b/bsd/kern/uipc_mbuf2.c @@ -133,12 +133,10 @@ m_pulldown(struct mbuf *m, int off, int len, int *offp) struct mbuf *n = NULL, *o = NULL; int hlen = 0, tlen = 0, olen = 0; int sharedcluster = 0; -#if defined(PULLDOWN_STAT) && INET6 - static struct mbuf *prev = NULL; - int prevlen = 0, prevmlen = 0; -#endif /* check invalid arguments. */ + VERIFY(len >= 0 && off >= 0); + if (m == NULL) { panic("m == NULL in m_pulldown()"); } @@ -146,73 +144,12 @@ m_pulldown(struct mbuf *m, int off, int len, int *offp) m_freem(m); return NULL; /* impossible */ } - -#if defined(PULLDOWN_STAT) && INET6 - ip6stat.ip6s_pulldown++; -#endif - -#if defined(PULLDOWN_STAT) && INET6 - /* statistics for m_pullup */ - ip6stat.ip6s_pullup++; - if (off + len > MHLEN) { - ip6stat.ip6s_pullup_fail++; - } else { - int dlen, mlen; - - dlen = (prev == m) ? prevlen : m->m_len; - mlen = (prev == m) ? prevmlen : m->m_len + M_TRAILINGSPACE(m); - - if (dlen >= off + len) { - ip6stat.ip6s_pullup--; /* call will not be made! */ - } else if ((m->m_flags & M_EXT) != 0) { - ip6stat.ip6s_pullup_alloc++; - ip6stat.ip6s_pullup_copy++; - } else { - if (mlen >= off + len) { - ip6stat.ip6s_pullup_copy++; - } else { - ip6stat.ip6s_pullup_alloc++; - ip6stat.ip6s_pullup_copy++; - } - } - - prevlen = off + len; - prevmlen = MHLEN; - } - - /* statistics for m_pullup2 */ - ip6stat.ip6s_pullup2++; - if (off + len > MCLBYTES) { - ip6stat.ip6s_pullup2_fail++; - } else { - int dlen, mlen; - - dlen = (prev == m) ? prevlen : m->m_len; - mlen = (prev == m) ? prevmlen : m->m_len + M_TRAILINGSPACE(m); - prevlen = off + len; - prevmlen = mlen; - - if (dlen >= off + len) { - ip6stat.ip6s_pullup2--; /* call will not be made! */ - } else if ((m->m_flags & M_EXT) != 0) { - ip6stat.ip6s_pullup2_alloc++; - ip6stat.ip6s_pullup2_copy++; - prevmlen = (off + len > MHLEN) ? MCLBYTES : MHLEN; - } else { - if (mlen >= off + len) { - ip6stat.ip6s_pullup2_copy++; - } else { - ip6stat.ip6s_pullup2_alloc++; - ip6stat.ip6s_pullup2_copy++; - prevmlen = (off + len > MHLEN) ? MCLBYTES - : MHLEN; - } - } + int tmp_len = 0; + if (os_add_overflow(off, len, &tmp_len)) { + m_free(m); + return NULL; } - prev = m; -#endif - #ifdef PULLDOWN_DEBUG { struct mbuf *t; @@ -267,10 +204,6 @@ m_pulldown(struct mbuf *m, int off, int len, int *offp) goto ok; } -#if defined(PULLDOWN_STAT) && INET6 - ip6stat.ip6s_pulldown_copy++; -#endif - /* * when len <= n->m_len - off and off != 0, it is a special case. * len bytes from sits in single mbuf, but the caller does @@ -364,9 +297,6 @@ m_pulldown(struct mbuf *m, int off, int len, int *offp) * now, we need to do the hard way. don't m_copy as there's no room * on both end. */ -#if defined(PULLDOWN_STAT) && INET6 - ip6stat.ip6s_pulldown_alloc++; -#endif MGET(o, M_DONTWAIT, m->m_type); if (o == NULL) { m_freem(m); diff --git a/bsd/kern/uipc_socket.c b/bsd/kern/uipc_socket.c index 4d6e12a70..dc2bd511c 100644 --- a/bsd/kern/uipc_socket.c +++ b/bsd/kern/uipc_socket.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 1998-2018 Apple Inc. All rights reserved. + * Copyright (c) 1998-2019 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -104,6 +104,7 @@ #include #include #include +#include #include #include #include @@ -159,23 +160,23 @@ static lck_mtx_t *so_cache_mtx; #include -static int filt_sorattach(struct knote *kn, struct kevent_internal_s *kev); +static int filt_sorattach(struct knote *kn, struct kevent_qos_s *kev); static void filt_sordetach(struct knote *kn); static int filt_soread(struct knote *kn, long hint); -static int filt_sortouch(struct knote *kn, struct kevent_internal_s *kev); -static int filt_sorprocess(struct knote *kn, struct filt_process_s *data, struct kevent_internal_s *kev); +static int filt_sortouch(struct knote *kn, struct kevent_qos_s *kev); +static int filt_sorprocess(struct knote *kn, struct kevent_qos_s *kev); -static int filt_sowattach(struct knote *kn, struct kevent_internal_s *kev); +static int filt_sowattach(struct knote *kn, struct kevent_qos_s *kev); static void filt_sowdetach(struct knote *kn); static int filt_sowrite(struct knote *kn, long hint); -static int filt_sowtouch(struct knote *kn, struct kevent_internal_s *kev); -static int filt_sowprocess(struct knote *kn, struct filt_process_s *data, struct kevent_internal_s *kev); +static int filt_sowtouch(struct knote *kn, struct kevent_qos_s *kev); +static int filt_sowprocess(struct knote *kn, struct kevent_qos_s *kev); -static int filt_sockattach(struct knote *kn, struct kevent_internal_s *kev); +static int filt_sockattach(struct knote *kn, struct kevent_qos_s *kev); static void filt_sockdetach(struct knote *kn); static int filt_sockev(struct knote *kn, long hint); -static int filt_socktouch(struct knote *kn, struct kevent_internal_s *kev); -static int filt_sockprocess(struct knote *kn, struct filt_process_s *data, struct kevent_internal_s *kev); +static int filt_socktouch(struct knote *kn, struct kevent_qos_s *kev); +static int filt_sockprocess(struct knote *kn, struct kevent_qos_s *kev); static int sooptcopyin_timeval(struct sockopt *, struct timeval *); static int sooptcopyout_timeval(struct sockopt *, const struct timeval *); @@ -550,6 +551,9 @@ so_update_last_owner_locked(struct socket *so, proc_t self) so->last_pid = proc_pid(self); proc_getexecutableuuid(self, so->last_uuid, sizeof(so->last_uuid)); + if (so->so_proto != NULL && so->so_proto->pr_update_last_owner != NULL) { + (*so->so_proto->pr_update_last_owner)(so, self, NULL); + } } proc_pidoriginatoruuid(so->so_vuuid, sizeof(so->so_vuuid)); } @@ -736,7 +740,7 @@ socreate_internal(int dom, struct socket **aso, int type, int proto, break; } - if (flags & SOCF_ASYNC) { + if (flags & SOCF_MPTCP) { so->so_state |= SS_NBIO; } @@ -791,6 +795,13 @@ socreate_internal(int dom, struct socket **aso, int type, int proto, return error; } + /* + * Note: needs so_pcb to be set after pru_attach + */ + if (prp->pr_update_last_owner != NULL) { + (*prp->pr_update_last_owner)(so, p, ep); + } + atomic_add_32(&prp->pr_domain->dom_refs, 1); TAILQ_INIT(&so->so_evlist); @@ -807,8 +818,8 @@ socreate_internal(int dom, struct socket **aso, int type, int proto, * If this thread or task is marked to create backgrounded sockets, * mark the socket as background. */ - if (proc_get_effective_thread_policy(current_thread(), - TASK_POLICY_NEW_SOCKETS_BG)) { + if (!(flags & SOCF_MPTCP) && + proc_get_effective_thread_policy(current_thread(), TASK_POLICY_NEW_SOCKETS_BG)) { socket_set_traffic_mgt_flags(so, TRAFFIC_MGT_SO_BACKGROUND); so->so_background_thread = current_thread(); } @@ -1979,7 +1990,7 @@ defunct: !(so->so_flags1 & SOF1_PRECONNECT_DATA)) { return ENOTCONN; } - } else if (addr == 0 && !(flags & MSG_HOLD)) { + } else if (addr == 0) { return (so->so_proto->pr_flags & PR_CONNREQUIRED) ? ENOTCONN : EDESTADDRREQ; } @@ -2049,10 +2060,6 @@ defunct: * Returns nonzero on error, timeout or signal; callers * must check for short counts if EINTR/ERESTART are returned. * Data and control buffers are freed on return. - * Experiment: - * MSG_HOLD: go thru most of sosend(), but just enqueue the mbuf - * MSG_SEND: go thru as for MSG_HOLD on current fragment, then - * point at the mbuf chain being constructed and go from there. * * Returns: 0 Success * EOPNOTSUPP @@ -2446,29 +2453,6 @@ sosend(struct socket *so, struct sockaddr *addr, struct uio *uio, } } - if (flags & (MSG_HOLD | MSG_SEND)) { - /* Enqueue for later, go away if HOLD */ - struct mbuf *mb1; - if (so->so_temp && (flags & MSG_FLUSH)) { - m_freem(so->so_temp); - so->so_temp = NULL; - } - if (so->so_temp) { - so->so_tail->m_next = top; - } else { - so->so_temp = top; - } - mb1 = top; - while (mb1->m_next) { - mb1 = mb1->m_next; - } - so->so_tail = mb1; - if (flags & MSG_HOLD) { - top = NULL; - goto out_locked; - } - top = so->so_temp; - } if (dontroute) { so->so_options |= SO_DONTROUTE; } @@ -2531,10 +2515,6 @@ sosend(struct socket *so, struct sockaddr *addr, struct uio *uio, error = (*so->so_proto->pr_usrreqs->pru_send) (so, sendflags, top, addr, control, p); - if (flags & MSG_SEND) { - so->so_temp = NULL; - } - if (dontroute) { so->so_options &= ~SO_DONTROUTE; } @@ -2587,7 +2567,7 @@ out_locked: int sosend_reinject(struct socket *so, struct sockaddr *addr, struct mbuf *top, struct mbuf *control, uint32_t sendflags) { - struct mbuf *m0, *control_end; + struct mbuf *m0 = NULL, *control_end = NULL; socket_lock_assert_owned(so); @@ -4566,6 +4546,33 @@ out: return error; } +static int +so_statistics_event_to_nstat_event(int64_t *input_options, + uint64_t *nstat_event) +{ + int error = 0; + switch (*input_options) { + case SO_STATISTICS_EVENT_ENTER_CELLFALLBACK: + *nstat_event = NSTAT_EVENT_SRC_ENTER_CELLFALLBACK; + break; + case SO_STATISTICS_EVENT_EXIT_CELLFALLBACK: + *nstat_event = NSTAT_EVENT_SRC_EXIT_CELLFALLBACK; + break; +#if (DEBUG || DEVELOPMENT) + case SO_STATISTICS_EVENT_RESERVED_1: + *nstat_event = NSTAT_EVENT_SRC_RESERVED_1; + break; + case SO_STATISTICS_EVENT_RESERVED_2: + *nstat_event = NSTAT_EVENT_SRC_RESERVED_2; + break; +#endif /* (DEBUG || DEVELOPMENT) */ + default: + error = EINVAL; + break; + } + return error; +} + /* * Returns: 0 Success * EINVAL @@ -4906,14 +4913,15 @@ sooptcopyin_timeval(struct sockopt *sopt, struct timeval *tv_p) } int -soopt_cred_check(struct socket *so, int priv, boolean_t allow_root) +soopt_cred_check(struct socket *so, int priv, boolean_t allow_root, + boolean_t ignore_delegate) { kauth_cred_t cred = NULL; proc_t ep = PROC_NULL; uid_t uid; int error = 0; - if (so->so_flags & SOF_DELEGATED) { + if (ignore_delegate == false && so->so_flags & SOF_DELEGATED) { ep = proc_find(so->e_pid); if (ep) { cred = kauth_cred_proc_ref(ep); @@ -4960,6 +4968,7 @@ int sosetoptlock(struct socket *so, struct sockopt *sopt, int dolock) { int error, optval; + int64_t long_optval; struct linger l; struct timeval tv; #if CONFIG_MACF_SOCKET @@ -5240,7 +5249,7 @@ sosetoptlock(struct socket *so, struct sockopt *sopt, int dolock) } if (optval != 0) { error = soopt_cred_check(so, - PRIV_NET_RESTRICTED_AWDL, false); + PRIV_NET_RESTRICTED_AWDL, false, false); if (error == 0) { inp_set_awdl_unrestricted( sotoinpcb(so)); @@ -5262,7 +5271,7 @@ sosetoptlock(struct socket *so, struct sockopt *sopt, int dolock) if (optval != 0 && inp_get_intcoproc_allowed(sotoinpcb(so)) == FALSE) { error = soopt_cred_check(so, - PRIV_NET_RESTRICTED_INTCOPROC, false); + PRIV_NET_RESTRICTED_INTCOPROC, false, false); if (error == 0) { inp_set_intcoproc_allowed( sotoinpcb(so)); @@ -5524,7 +5533,7 @@ sosetoptlock(struct socket *so, struct sockopt *sopt, int dolock) break; } - error = so_set_effective_pid(so, optval, sopt->sopt_p); + error = so_set_effective_pid(so, optval, sopt->sopt_p, true); break; case SO_DELEGATED_UUID: { @@ -5535,7 +5544,7 @@ sosetoptlock(struct socket *so, struct sockopt *sopt, int dolock) break; } - error = so_set_effective_uuid(so, euuid, sopt->sopt_p); + error = so_set_effective_uuid(so, euuid, sopt->sopt_p, true); break; } @@ -5544,7 +5553,7 @@ sosetoptlock(struct socket *so, struct sockopt *sopt, int dolock) error = necp_set_socket_attributes(so, sopt); break; - case SO_NECP_CLIENTUUID: + case SO_NECP_CLIENTUUID: { if (SOCK_DOM(so) == PF_MULTIPATH) { /* Handled by MPTCP itself */ break; @@ -5572,7 +5581,8 @@ sosetoptlock(struct socket *so, struct sockopt *sopt, int dolock) goto out; } - error = necp_client_register_socket_flow(so->last_pid, + pid_t current_pid = proc_pid(current_proc()); + error = necp_client_register_socket_flow(current_pid, inp->necp_client_uuid, inp); if (error != 0) { uuid_clear(inp->necp_client_uuid); @@ -5580,12 +5590,48 @@ sosetoptlock(struct socket *so, struct sockopt *sopt, int dolock) } if (inp->inp_lport != 0) { - // There is bound local port, so this is not + // There is a bound local port, so this is not // a fresh socket. Assign to the client. - necp_client_assign_from_socket(so->last_pid, inp->necp_client_uuid, inp); + necp_client_assign_from_socket(current_pid, inp->necp_client_uuid, inp); } break; + } + case SO_NECP_LISTENUUID: { + if (SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) { + error = EINVAL; + goto out; + } + + struct inpcb *inp = sotoinpcb(so); + if (!uuid_is_null(inp->necp_client_uuid)) { + error = EINVAL; + goto out; + } + + error = sooptcopyin(sopt, &inp->necp_client_uuid, + sizeof(uuid_t), sizeof(uuid_t)); + if (error != 0) { + goto out; + } + + if (uuid_is_null(inp->necp_client_uuid)) { + error = EINVAL; + goto out; + } + + error = necp_client_register_socket_listener(proc_pid(current_proc()), + inp->necp_client_uuid, inp); + if (error != 0) { + uuid_clear(inp->necp_client_uuid); + goto out; + } + + // Mark that the port registration is held by NECP + inp->inp_flags2 |= INP2_EXTERNAL_PORT; + + break; + } #endif /* NECP */ case SO_EXTENDED_BK_IDLE: @@ -5613,6 +5659,21 @@ sosetoptlock(struct socket *so, struct sockopt *sopt, int dolock) } break; + case SO_STATISTICS_EVENT: + error = sooptcopyin(sopt, &long_optval, + sizeof(long_optval), sizeof(long_optval)); + if (error != 0) { + goto out; + } + u_int64_t nstat_event = 0; + error = so_statistics_event_to_nstat_event( + &long_optval, &nstat_event); + if (error != 0) { + goto out; + } + nstat_pcb_event(sotoinpcb(so), nstat_event); + break; + case SO_NET_SERVICE_TYPE: { error = sooptcopyin(sopt, &optval, sizeof(optval), sizeof(optval)); @@ -5641,6 +5702,24 @@ sosetoptlock(struct socket *so, struct sockopt *sopt, int dolock) } break; + case SO_MPKL_SEND_INFO: { + struct so_mpkl_send_info so_mpkl_send_info; + + error = sooptcopyin(sopt, &so_mpkl_send_info, + sizeof(struct so_mpkl_send_info), sizeof(struct so_mpkl_send_info)); + if (error != 0) { + goto out; + } + uuid_copy(so->so_mpkl_send_uuid, so_mpkl_send_info.mpkl_uuid); + so->so_mpkl_send_proto = so_mpkl_send_info.mpkl_proto; + + if (uuid_is_null(so->so_mpkl_send_uuid) && so->so_mpkl_send_proto == 0) { + so->so_flags1 &= ~SOF1_MPKL_SEND_INFO; + } else { + so->so_flags1 |= SOF1_MPKL_SEND_INFO; + } + break; + } default: error = ENOPROTOOPT; break; @@ -5837,17 +5916,13 @@ integer: m1 = so->so_rcv.sb_mb; while (m1 != NULL) { - if (m1->m_type == MT_DATA || - m1->m_type == MT_HEADER || - m1->m_type == MT_OOBDATA) { - cnt += 1; - } + cnt += 1; m1 = m1->m_nextpkt; } optval = cnt; goto integer; } else { - error = EINVAL; + error = ENOPROTOOPT; break; } @@ -6050,8 +6125,7 @@ integer: error = necp_get_socket_attributes(so, sopt); break; - case SO_NECP_CLIENTUUID: - { + case SO_NECP_CLIENTUUID: { uuid_t *ncu; if (SOCK_DOM(so) == PF_MULTIPATH) { @@ -6066,6 +6140,25 @@ integer: error = sooptcopyout(sopt, ncu, sizeof(uuid_t)); break; } + + case SO_NECP_LISTENUUID: { + uuid_t *nlu; + + if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) { + if (sotoinpcb(so)->inp_flags2 & INP2_EXTERNAL_PORT) { + nlu = &sotoinpcb(so)->necp_client_uuid; + } else { + error = ENOENT; + goto out; + } + } else { + error = EINVAL; + goto out; + } + + error = sooptcopyout(sopt, nlu, sizeof(uuid_t)); + break; + } #endif /* NECP */ #if CONTENT_FILTER @@ -6099,6 +6192,15 @@ integer: optval = so_get_netsvc_marking_level(so); goto integer; + case SO_MPKL_SEND_INFO: { + struct so_mpkl_send_info so_mpkl_send_info; + + uuid_copy(so_mpkl_send_info.mpkl_uuid, so->so_mpkl_send_uuid); + so_mpkl_send_info.mpkl_proto = so->so_mpkl_send_proto; + error = sooptcopyout(sopt, &so_mpkl_send_info, + sizeof(struct so_mpkl_send_info)); + break; + } default: error = ENOPROTOOPT; break; @@ -6312,14 +6414,9 @@ sopoll(struct socket *so, int events, kauth_cred_t cred, void * wql) } int -soo_kqfilter(struct fileproc *fp, struct knote *kn, - struct kevent_internal_s *kev, vfs_context_t ctx) +soo_kqfilter(struct fileproc *fp, struct knote *kn, struct kevent_qos_s *kev) { -#pragma unused(fp) -#if !CONFIG_MACF_SOCKET -#pragma unused(ctx) -#endif /* MAC_SOCKET */ - struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data; + struct socket *so = (struct socket *)fp->f_fglob->fg_data; int result; socket_lock(so, 1); @@ -6327,11 +6424,10 @@ soo_kqfilter(struct fileproc *fp, struct knote *kn, so_update_policy(so); #if CONFIG_MACF_SOCKET - if (mac_socket_check_kqfilter(proc_ucred(vfs_context_proc(ctx)), - kn, so) != 0) { + proc_t p = knote_get_kq(kn)->kq_p; + if (mac_socket_check_kqfilter(proc_ucred(p), kn, so) != 0) { socket_unlock(so, 1); - kn->kn_flags = EV_ERROR; - kn->kn_data = EPERM; + knote_set_error(kn, EPERM); return 0; } #endif /* MAC_SOCKET */ @@ -6351,8 +6447,7 @@ soo_kqfilter(struct fileproc *fp, struct knote *kn, break; default: socket_unlock(so, 1); - kn->kn_flags = EV_ERROR; - kn->kn_data = EINVAL; + knote_set_error(kn, EINVAL); return 0; } @@ -6368,21 +6463,21 @@ soo_kqfilter(struct fileproc *fp, struct knote *kn, } static int -filt_soread_common(struct knote *kn, struct socket *so) +filt_soread_common(struct knote *kn, struct kevent_qos_s *kev, struct socket *so) { - if (so->so_options & SO_ACCEPTCONN) { - int is_not_empty; + int retval = 0; + int64_t data = 0; + if (so->so_options & SO_ACCEPTCONN) { /* * Radar 6615193 handle the listen case dynamically * for kqueue read filter. This allows to call listen() * after registering the kqueue EVFILT_READ. */ - kn->kn_data = so->so_qlen; - is_not_empty = !TAILQ_EMPTY(&so->so_comp); - - return is_not_empty; + retval = !TAILQ_EMPTY(&so->so_comp); + data = so->so_qlen; + goto out; } /* socket isn't a listener */ @@ -6391,13 +6486,14 @@ filt_soread_common(struct knote *kn, struct socket *so) * the bytes of protocol data. We therefore exclude any * control bytes. */ - kn->kn_data = so->so_rcv.sb_cc - so->so_rcv.sb_ctl; + data = so->so_rcv.sb_cc - so->so_rcv.sb_ctl; if (kn->kn_sfflags & NOTE_OOB) { if (so->so_oobmark || (so->so_state & SS_RCVATMARK)) { kn->kn_fflags |= NOTE_OOB; - kn->kn_data -= so->so_oobmark; - return 1; + data -= so->so_oobmark; + retval = 1; + goto out; } } @@ -6408,11 +6504,13 @@ filt_soread_common(struct knote *kn, struct socket *so) ) { kn->kn_flags |= EV_EOF; kn->kn_fflags = so->so_error; - return 1; + retval = 1; + goto out; } if (so->so_error) { /* temporary udp error */ - return 1; + retval = 1; + goto out; } int64_t lowwat = so->so_rcv.sb_lowat; @@ -6429,20 +6527,17 @@ filt_soread_common(struct knote *kn, struct socket *so) } } - /* - * The order below is important. Since NOTE_LOWAT - * overrides sb_lowat, check for NOTE_LOWAT case - * first. - */ - if (kn->kn_sfflags & NOTE_LOWAT) { - return kn->kn_data >= lowwat; - } + retval = (data >= lowwat); - return so->so_rcv.sb_cc >= lowwat; +out: + if (retval && kev) { + knote_fill_kevent(kn, kev, data); + } + return retval; } static int -filt_sorattach(struct knote *kn, __unused struct kevent_internal_s *kev) +filt_sorattach(struct knote *kn, __unused struct kevent_qos_s *kev) { struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data; @@ -6456,16 +6551,16 @@ filt_sorattach(struct knote *kn, __unused struct kevent_internal_s *kev) if (kn->kn_filter == EVFILT_READ && kn->kn_flags & EV_OOBAND) { kn->kn_flags &= ~EV_OOBAND; - kn->kn_hookid = EV_OOBAND; + kn->kn_hook32 = EV_OOBAND; } else { - kn->kn_hookid = 0; + kn->kn_hook32 = 0; } if (KNOTE_ATTACH(&so->so_rcv.sb_sel.si_note, kn)) { so->so_rcv.sb_flags |= SB_KNOTE; } /* indicate if event is already fired */ - return filt_soread_common(kn, so); + return filt_soread_common(kn, NULL, so); } static void @@ -6493,7 +6588,7 @@ filt_soread(struct knote *kn, long hint) socket_lock(so, 1); } - retval = filt_soread_common(kn, so); + retval = filt_soread_common(kn, NULL, so); if ((hint & SO_FILT_HINT_LOCKED) == 0) { socket_unlock(so, 1); @@ -6503,7 +6598,7 @@ filt_soread(struct knote *kn, long hint) } static int -filt_sortouch(struct knote *kn, struct kevent_internal_s *kev) +filt_sortouch(struct knote *kn, struct kevent_qos_s *kev) { struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data; int retval; @@ -6515,7 +6610,7 @@ filt_sortouch(struct knote *kn, struct kevent_internal_s *kev) kn->kn_sdata = kev->data; /* determine if changes result in fired events */ - retval = filt_soread_common(kn, so); + retval = filt_soread_common(kn, NULL, so); socket_unlock(so, 1); @@ -6523,21 +6618,13 @@ filt_sortouch(struct knote *kn, struct kevent_internal_s *kev) } static int -filt_sorprocess(struct knote *kn, struct filt_process_s *data, struct kevent_internal_s *kev) +filt_sorprocess(struct knote *kn, struct kevent_qos_s *kev) { -#pragma unused(data) struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data; int retval; socket_lock(so, 1); - retval = filt_soread_common(kn, so); - if (retval) { - *kev = kn->kn_kevent; - if (kn->kn_flags & EV_CLEAR) { - kn->kn_fflags = 0; - kn->kn_data = 0; - } - } + retval = filt_soread_common(kn, kev, so); socket_unlock(so, 1); return retval; @@ -6557,26 +6644,35 @@ so_wait_for_if_feedback(struct socket *so) } static int -filt_sowrite_common(struct knote *kn, struct socket *so) +filt_sowrite_common(struct knote *kn, struct kevent_qos_s *kev, struct socket *so) { int ret = 0; + int64_t data = sbspace(&so->so_snd); - kn->kn_data = sbspace(&so->so_snd); if (so->so_state & SS_CANTSENDMORE) { kn->kn_flags |= EV_EOF; kn->kn_fflags = so->so_error; - return 1; + ret = 1; + goto out; } + if (so->so_error) { /* temporary udp error */ - return 1; + ret = 1; + goto out; } + if (!socanwrite(so)) { - return 0; + ret = 0; + goto out; } + if (so->so_flags1 & SOF1_PRECONNECT_DATA) { - return 1; + ret = 1; + goto out; } + int64_t lowwat = so->so_snd.sb_lowat; + if (kn->kn_sfflags & NOTE_LOWAT) { if (kn->kn_sdata > so->so_snd.sb_hiwat) { lowwat = so->so_snd.sb_hiwat; @@ -6584,7 +6680,8 @@ filt_sowrite_common(struct knote *kn, struct socket *so) lowwat = kn->kn_sdata; } } - if (kn->kn_data >= lowwat) { + + if (data >= lowwat) { if ((so->so_flags & SOF_NOTSENT_LOWAT) #if (DEBUG || DEVELOPMENT) && so_notsent_lowat_check == 1 @@ -6602,7 +6699,8 @@ filt_sowrite_common(struct knote *kn, struct socket *so) } #endif else { - return 1; + ret = 1; + goto out; } } else { ret = 1; @@ -6611,11 +6709,16 @@ filt_sowrite_common(struct knote *kn, struct socket *so) if (so_wait_for_if_feedback(so)) { ret = 0; } + +out: + if (ret && kev) { + knote_fill_kevent(kn, kev, data); + } return ret; } static int -filt_sowattach(struct knote *kn, __unused struct kevent_internal_s *kev) +filt_sowattach(struct knote *kn, __unused struct kevent_qos_s *kev) { struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data; @@ -6625,7 +6728,7 @@ filt_sowattach(struct knote *kn, __unused struct kevent_internal_s *kev) } /* determine if its already fired */ - return filt_sowrite_common(kn, so); + return filt_sowrite_common(kn, NULL, so); } static void @@ -6653,7 +6756,7 @@ filt_sowrite(struct knote *kn, long hint) socket_lock(so, 1); } - ret = filt_sowrite_common(kn, so); + ret = filt_sowrite_common(kn, NULL, so); if ((hint & SO_FILT_HINT_LOCKED) == 0) { socket_unlock(so, 1); @@ -6663,7 +6766,7 @@ filt_sowrite(struct knote *kn, long hint) } static int -filt_sowtouch(struct knote *kn, struct kevent_internal_s *kev) +filt_sowtouch(struct knote *kn, struct kevent_qos_s *kev) { struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data; int ret; @@ -6675,7 +6778,7 @@ filt_sowtouch(struct knote *kn, struct kevent_internal_s *kev) kn->kn_sdata = kev->data; /* determine if these changes result in a triggered event */ - ret = filt_sowrite_common(kn, so); + ret = filt_sowrite_common(kn, NULL, so); socket_unlock(so, 1); @@ -6683,29 +6786,24 @@ filt_sowtouch(struct knote *kn, struct kevent_internal_s *kev) } static int -filt_sowprocess(struct knote *kn, struct filt_process_s *data, struct kevent_internal_s *kev) +filt_sowprocess(struct knote *kn, struct kevent_qos_s *kev) { -#pragma unused(data) struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data; int ret; socket_lock(so, 1); - ret = filt_sowrite_common(kn, so); - if (ret) { - *kev = kn->kn_kevent; - if (kn->kn_flags & EV_CLEAR) { - kn->kn_fflags = 0; - kn->kn_data = 0; - } - } + ret = filt_sowrite_common(kn, kev, so); socket_unlock(so, 1); + return ret; } static int -filt_sockev_common(struct knote *kn, struct socket *so, long ev_hint) +filt_sockev_common(struct knote *kn, struct kevent_qos_s *kev, + struct socket *so, long ev_hint) { int ret = 0; + int64_t data = 0; uint32_t level_trigger = 0; if (ev_hint & SO_FILT_HINT_CONNRESET) { @@ -6770,7 +6868,7 @@ filt_sockev_common(struct knote *kn, struct socket *so, long ev_hint) kn->kn_fflags &= ~(NOTE_SUSPEND | NOTE_RESUME); /* If resume event was delivered before, reset it */ - kn->kn_hookid &= ~NOTE_RESUME; + kn->kn_hook32 &= ~NOTE_RESUME; kn->kn_fflags |= NOTE_SUSPEND; level_trigger |= NOTE_SUSPEND; @@ -6781,7 +6879,7 @@ filt_sockev_common(struct knote *kn, struct socket *so, long ev_hint) kn->kn_fflags &= ~(NOTE_SUSPEND | NOTE_RESUME); /* If suspend event was delivered before, reset it */ - kn->kn_hookid &= ~NOTE_SUSPEND; + kn->kn_hook32 &= ~NOTE_SUSPEND; kn->kn_fflags |= NOTE_RESUME; level_trigger |= NOTE_RESUME; @@ -6789,10 +6887,12 @@ filt_sockev_common(struct knote *kn, struct socket *so, long ev_hint) if (so->so_error != 0) { ret = 1; - kn->kn_data = so->so_error; + data = so->so_error; kn->kn_flags |= EV_EOF; } else { - get_sockev_state(so, (u_int32_t *)&(kn->kn_data)); + u_int32_t data32; + get_sockev_state(so, &data32); + data = data32; } /* Reset any events that are not requested on this knote */ @@ -6800,7 +6900,7 @@ filt_sockev_common(struct knote *kn, struct socket *so, long ev_hint) level_trigger &= (kn->kn_sfflags & EVFILT_SOCK_ALL_MASK); /* Find the level triggerred events that are already delivered */ - level_trigger &= kn->kn_hookid; + level_trigger &= kn->kn_hook32; level_trigger &= EVFILT_SOCK_LEVEL_TRIGGER_MASK; /* Do not deliver level triggerred events more than once */ @@ -6808,22 +6908,48 @@ filt_sockev_common(struct knote *kn, struct socket *so, long ev_hint) ret = 1; } + if (ret && kev) { + /* + * Store the state of the events being delivered. This + * state can be used to deliver level triggered events + * ateast once and still avoid waking up the application + * multiple times as long as the event is active. + */ + if (kn->kn_fflags != 0) { + kn->kn_hook32 |= (kn->kn_fflags & + EVFILT_SOCK_LEVEL_TRIGGER_MASK); + } + + /* + * NOTE_RESUME and NOTE_SUSPEND are an exception, deliver + * only one of them and remember the last one that was + * delivered last + */ + if (kn->kn_fflags & NOTE_SUSPEND) { + kn->kn_hook32 &= ~NOTE_RESUME; + } + if (kn->kn_fflags & NOTE_RESUME) { + kn->kn_hook32 &= ~NOTE_SUSPEND; + } + + knote_fill_kevent(kn, kev, data); + } return ret; } static int -filt_sockattach(struct knote *kn, __unused struct kevent_internal_s *kev) +filt_sockattach(struct knote *kn, __unused struct kevent_qos_s *kev) { struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data; /* socket locked */ - kn->kn_hookid = 0; + kn->kn_hook32 = 0; if (KNOTE_ATTACH(&so->so_klist, kn)) { so->so_flags |= SOF_KNOTE; } /* determine if event already fired */ - return filt_sockev_common(kn, so, 0); + return filt_sockev_common(kn, NULL, so, 0); } static void @@ -6852,7 +6978,7 @@ filt_sockev(struct knote *kn, long hint) locked = 1; } - ret = filt_sockev_common(kn, so, ev_hint); + ret = filt_sockev_common(kn, NULL, so, ev_hint); if (locked) { socket_unlock(so, 1); @@ -6869,7 +6995,7 @@ filt_sockev(struct knote *kn, long hint) static int filt_socktouch( struct knote *kn, - struct kevent_internal_s *kev) + struct kevent_qos_s *kev) { struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data; uint32_t changed_flags; @@ -6878,7 +7004,7 @@ filt_socktouch( socket_lock(so, 1); /* save off the [result] data and fflags */ - changed_flags = (kn->kn_sfflags ^ kn->kn_hookid); + changed_flags = (kn->kn_sfflags ^ kn->kn_hook32); /* save off the new input fflags and data */ kn->kn_sfflags = kev->fflags; @@ -6896,11 +7022,10 @@ filt_socktouch( * delivered, if any of those events are not requested * anymore the state related to them can be reset */ - kn->kn_hookid &= - ~(changed_flags & EVFILT_SOCK_LEVEL_TRIGGER_MASK); + kn->kn_hook32 &= ~(changed_flags & EVFILT_SOCK_LEVEL_TRIGGER_MASK); /* determine if we have events to deliver */ - ret = filt_sockev_common(kn, so, 0); + ret = filt_sockev_common(kn, NULL, so, 0); socket_unlock(so, 1); @@ -6911,50 +7036,14 @@ filt_socktouch( * filt_sockprocess - query event fired state and return data */ static int -filt_sockprocess( - struct knote *kn, - struct filt_process_s *data, - struct kevent_internal_s *kev) +filt_sockprocess(struct knote *kn, struct kevent_qos_s *kev) { -#pragma unused(data) - struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data; int ret = 0; socket_lock(so, 1); - ret = filt_sockev_common(kn, so, 0); - if (ret) { - *kev = kn->kn_kevent; - - /* - * Store the state of the events being delivered. This - * state can be used to deliver level triggered events - * ateast once and still avoid waking up the application - * multiple times as long as the event is active. - */ - if (kn->kn_fflags != 0) { - kn->kn_hookid |= (kn->kn_fflags & - EVFILT_SOCK_LEVEL_TRIGGER_MASK); - } - - /* - * NOTE_RESUME and NOTE_SUSPEND are an exception, deliver - * only one of them and remember the last one that was - * delivered last - */ - if (kn->kn_fflags & NOTE_SUSPEND) { - kn->kn_hookid &= ~NOTE_RESUME; - } - if (kn->kn_fflags & NOTE_RESUME) { - kn->kn_hookid &= ~NOTE_SUSPEND; - } - - if (kn->kn_flags & EV_CLEAR) { - kn->kn_data = 0; - kn->kn_fflags = 0; - } - } + ret = filt_sockev_common(kn, kev, so, 0); socket_unlock(so, 1); @@ -7003,6 +7092,16 @@ solockhistory_nr(struct socket *so) return lock_history_str; } +lck_mtx_t * +socket_getlock(struct socket *so, int flags) +{ + if (so->so_proto->pr_getlock != NULL) { + return (*so->so_proto->pr_getlock)(so, flags); + } else { + return so->so_proto->pr_domain->dom_mtx; + } +} + void socket_lock(struct socket *so, int refcount) { @@ -7062,12 +7161,12 @@ socket_unlock(struct socket *so, int refcount) lr_saved = __builtin_return_address(0); - if (so->so_proto == NULL) { + if (so == NULL || so->so_proto == NULL) { panic("%s: null so_proto so=%p\n", __func__, so); /* NOTREACHED */ } - if (so && so->so_proto->pr_unlock) { + if (so->so_proto->pr_unlock) { (*so->so_proto->pr_unlock)(so, refcount, lr_saved); } else { mutex_held = so->so_proto->pr_domain->dom_mtx; @@ -7625,6 +7724,7 @@ so_set_restrictions(struct socket *so, uint32_t vals) { int nocell_old, nocell_new; int noexpensive_old, noexpensive_new; + int noconstrained_old, noconstrained_new; /* * Deny-type restrictions are trapdoors; once set they cannot be @@ -7641,15 +7741,18 @@ so_set_restrictions(struct socket *so, uint32_t vals) */ nocell_old = (so->so_restrictions & SO_RESTRICT_DENY_CELLULAR); noexpensive_old = (so->so_restrictions & SO_RESTRICT_DENY_EXPENSIVE); + noconstrained_old = (so->so_restrictions & SO_RESTRICT_DENY_CONSTRAINED); so->so_restrictions |= (vals & (SO_RESTRICT_DENY_IN | SO_RESTRICT_DENY_OUT | SO_RESTRICT_DENY_CELLULAR | - SO_RESTRICT_DENY_EXPENSIVE)); + SO_RESTRICT_DENY_EXPENSIVE | SO_RESTRICT_DENY_CONSTRAINED)); nocell_new = (so->so_restrictions & SO_RESTRICT_DENY_CELLULAR); noexpensive_new = (so->so_restrictions & SO_RESTRICT_DENY_EXPENSIVE); + noconstrained_new = (so->so_restrictions & SO_RESTRICT_DENY_CONSTRAINED); /* we can only set, not clear restrictions */ if ((nocell_new - nocell_old) == 0 && - (noexpensive_new - noexpensive_old) == 0) { + (noexpensive_new - noexpensive_old) == 0 && + (noconstrained_new - noconstrained_old) == 0) { return 0; } #if INET6 @@ -7667,6 +7770,9 @@ so_set_restrictions(struct socket *so, uint32_t vals) if (noexpensive_new - noexpensive_old != 0) { inp_set_noexpensive(sotoinpcb(so)); } + if (noconstrained_new - noconstrained_old != 0) { + inp_set_noconstrained(sotoinpcb(so)); + } } if (SOCK_DOM(so) == PF_MULTIPATH) { @@ -7685,7 +7791,7 @@ so_get_restrictions(struct socket *so) } int -so_set_effective_pid(struct socket *so, int epid, struct proc *p) +so_set_effective_pid(struct socket *so, int epid, struct proc *p, boolean_t check_cred) { struct proc *ep = PROC_NULL; int error = 0; @@ -7712,7 +7818,7 @@ so_set_effective_pid(struct socket *so, int epid, struct proc *p) * the process's own pid, then proceed. Otherwise ensure * that the issuing process has the necessary privileges. */ - if (epid != so->last_pid || epid != proc_pid(p)) { + if (check_cred && (epid != so->last_pid || epid != proc_pid(p))) { if ((error = priv_check_cred(kauth_cred_get(), PRIV_NET_PRIVILEGED_SOCKET_DELEGATE, 0))) { error = EACCES; @@ -7747,6 +7853,9 @@ so_set_effective_pid(struct socket *so, int epid, struct proc *p) so->e_pid = proc_pid(ep); proc_getexecutableuuid(ep, so->e_uuid, sizeof(so->e_uuid)); } + if (so->so_proto != NULL && so->so_proto->pr_update_last_owner != NULL) { + (*so->so_proto->pr_update_last_owner)(so, NULL, ep); + } done: if (error == 0 && net_io_policy_log) { uuid_string_t buf; @@ -7784,7 +7893,7 @@ done: } int -so_set_effective_uuid(struct socket *so, uuid_t euuid, struct proc *p) +so_set_effective_uuid(struct socket *so, uuid_t euuid, struct proc *p, boolean_t check_cred) { uuid_string_t buf; uuid_t uuid; @@ -7815,8 +7924,9 @@ so_set_effective_uuid(struct socket *so, uuid_t euuid, struct proc *p) * the process's own uuid, then proceed. Otherwise ensure * that the issuing process has the necessary privileges. */ - if (uuid_compare(euuid, so->last_uuid) != 0 || - uuid_compare(euuid, uuid) != 0) { + if (check_cred && + (uuid_compare(euuid, so->last_uuid) != 0 || + uuid_compare(euuid, uuid) != 0)) { if ((error = priv_check_cred(kauth_cred_get(), PRIV_NET_PRIVILEGED_SOCKET_DELEGATE, 0))) { error = EACCES; @@ -7851,7 +7961,13 @@ so_set_effective_uuid(struct socket *so, uuid_t euuid, struct proc *p) so->e_pid = so->last_pid; uuid_copy(so->e_uuid, euuid); } - + /* + * The following will clear the effective process name as it's the same + * as the real process + */ + if (so->so_proto != NULL && so->so_proto->pr_update_last_owner != NULL) { + (*so->so_proto->pr_update_last_owner)(so, NULL, NULL); + } done: if (error == 0 && net_io_policy_log) { uuid_unparse(so->e_uuid, buf); diff --git a/bsd/kern/uipc_socket2.c b/bsd/kern/uipc_socket2.c index 34b295590..cc3c37a52 100644 --- a/bsd/kern/uipc_socket2.c +++ b/bsd/kern/uipc_socket2.c @@ -83,6 +83,7 @@ #include #include #include +#include #include #include #include @@ -185,6 +186,15 @@ soisconnecting(struct socket *so) void soisconnected(struct socket *so) { + /* + * If socket is subject to filter and is pending initial verdict, + * delay marking socket as connected and do not present the connected + * socket to user just yet. + */ + if (cfil_sock_connected_pending_verdict(so)) { + return; + } + so->so_state &= ~(SS_ISCONNECTING | SS_ISDISCONNECTING | SS_ISCONFIRMING); so->so_state |= SS_ISCONNECTED; @@ -381,6 +391,7 @@ sonewconn_internal(struct socket *head, int connstatus) SOF_NOTIFYCONFLICT | SOF_BINDRANDOMPORT | SOF_NPX_SETOPTSHUT | SOF_NODEFUNCT | SOF_PRIVILEGED_TRAFFIC_CLASS | SOF_NOTSENT_LOWAT | SOF_USELRO | SOF_DELEGATED); + so->so_flags1 |= SOF1_INBOUND; so->so_usecount = 1; so->next_lock_lr = 0; so->next_unlock_lr = 0; @@ -395,9 +406,11 @@ sonewconn_internal(struct socket *head, int connstatus) /* inherit traffic management properties of listener */ so->so_flags1 |= - head->so_flags1 & (SOF1_TRAFFIC_MGT_SO_BACKGROUND); + head->so_flags1 & (SOF1_TRAFFIC_MGT_SO_BACKGROUND | SOF1_TC_NET_SERV_TYPE | + SOF1_QOSMARKING_ALLOWED | SOF1_QOSMARKING_POLICY_OVERRIDE); so->so_background_thread = head->so_background_thread; so->so_traffic_class = head->so_traffic_class; + so->so_netsvctype = head->so_netsvctype; if (soreserve(so, head->so_snd.sb_hiwat, head->so_rcv.sb_hiwat)) { sodealloc(so); @@ -434,6 +447,9 @@ sonewconn_internal(struct socket *head, int connstatus) } } + if (so->so_proto->pr_copy_last_owner != NULL) { + (*so->so_proto->pr_copy_last_owner)(so, head); + } atomic_add_32(&so->so_proto->pr_domain->dom_refs, 1); /* Insert in head appropriate lists */ @@ -605,7 +621,7 @@ sbwakeup(struct sockbuf *sb) * if the socket has the SS_ASYNC flag set. */ void -sowakeup(struct socket *so, struct sockbuf *sb) +sowakeup(struct socket *so, struct sockbuf *sb, struct socket *so2) { if (so->so_flags & SOF_DEFUNCT) { SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] si 0x%x, " @@ -640,11 +656,42 @@ sowakeup(struct socket *so, struct sockbuf *sb) so->so_upcallusecount++; if (lock) { + if (so2) { + struct unpcb *unp = sotounpcb(so2); + unp->unp_flags |= UNP_DONTDISCONNECT; + unp->rw_thrcount++; + + socket_unlock(so2, 0); + } socket_unlock(so, 0); } (*sb_upcall)(so, sb_upcallarg, M_DONTWAIT); if (lock) { + if (so2 && so > so2) { + struct unpcb *unp; + socket_lock(so2, 0); + + unp = sotounpcb(so2); + unp->rw_thrcount--; + if (unp->rw_thrcount == 0) { + unp->unp_flags &= ~UNP_DONTDISCONNECT; + wakeup(unp); + } + } + socket_lock(so, 0); + + if (so2 && so < so2) { + struct unpcb *unp; + socket_lock(so2, 0); + + unp = sotounpcb(so2); + unp->rw_thrcount--; + if (unp->rw_thrcount == 0) { + unp->unp_flags &= ~UNP_DONTDISCONNECT; + wakeup(unp); + } + } } so->so_upcallusecount--; @@ -1086,82 +1133,6 @@ sbappendrecord(struct sockbuf *sb, struct mbuf *m0) return 1; } -/* - * As above except that OOB data - * is inserted at the beginning of the sockbuf, - * but after any other OOB data. - */ -int -sbinsertoob(struct sockbuf *sb, struct mbuf *m0) -{ - struct mbuf *m; - struct mbuf **mp; - - if (m0 == 0) { - return 0; - } - - SBLASTRECORDCHK(sb, "sbinsertoob 1"); - - if ((sb->sb_flags & SB_RECV && !(m0->m_flags & M_SKIPCFIL)) != 0) { - int error = sflt_data_in(sb->sb_so, NULL, &m0, NULL, - sock_data_filt_flag_oob); - - SBLASTRECORDCHK(sb, "sbinsertoob 2"); - -#if CONTENT_FILTER - if (error == 0) { - error = cfil_sock_data_in(sb->sb_so, NULL, m0, NULL, 0); - } -#endif /* CONTENT_FILTER */ - - if (error) { - if (error != EJUSTRETURN) { - m_freem(m0); - } - return 0; - } - } else if (m0) { - m0->m_flags &= ~M_SKIPCFIL; - } - - for (mp = &sb->sb_mb; *mp; mp = &((*mp)->m_nextpkt)) { - m = *mp; -again: - switch (m->m_type) { - case MT_OOBDATA: - continue; /* WANT next train */ - - case MT_CONTROL: - m = m->m_next; - if (m) { - goto again; /* inspect THIS train further */ - } - } - break; - } - /* - * Put the first mbuf on the queue. - * Note this permits zero length records. - */ - sballoc(sb, m0); - m0->m_nextpkt = *mp; - if (*mp == NULL) { - /* m0 is actually the new tail */ - sb->sb_lastrecord = m0; - } - *mp = m0; - m = m0->m_next; - m0->m_next = 0; - if (m && (m0->m_flags & M_EOR)) { - m0->m_flags &= ~M_EOR; - m->m_flags |= M_EOR; - } - sbcompress(sb, m, m0); - SBLASTRECORDCHK(sb, "sbinsertoob 3"); - return 1; -} - /* * Concatenate address (optional), control (optional) and data into one * single mbuf chain. If sockbuf *sb is passed in, space check will be @@ -2845,21 +2816,25 @@ sbunlock(struct sockbuf *sb, boolean_t keeplocked) } if (!keeplocked) { /* unlock on exit */ - lck_mtx_t *mutex_held; - - if (so->so_proto->pr_getlock != NULL) { - mutex_held = (*so->so_proto->pr_getlock)(so, PR_F_WILLUNLOCK); + if (so->so_flags & SOF_MP_SUBFLOW || SOCK_DOM(so) == PF_MULTIPATH) { + (*so->so_proto->pr_unlock)(so, 1, lr_saved); } else { - mutex_held = so->so_proto->pr_domain->dom_mtx; - } + lck_mtx_t *mutex_held; - LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED); + if (so->so_proto->pr_getlock != NULL) { + mutex_held = (*so->so_proto->pr_getlock)(so, PR_F_WILLUNLOCK); + } else { + mutex_held = so->so_proto->pr_domain->dom_mtx; + } - VERIFY(so->so_usecount > 0); - so->so_usecount--; - so->unlock_lr[so->next_unlock_lr] = lr_saved; - so->next_unlock_lr = (so->next_unlock_lr + 1) % SO_LCKDBG_MAX; - lck_mtx_unlock(mutex_held); + LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED); + + VERIFY(so->so_usecount > 0); + so->so_usecount--; + so->unlock_lr[so->next_unlock_lr] = lr_saved; + so->next_unlock_lr = (so->next_unlock_lr + 1) % SO_LCKDBG_MAX; + lck_mtx_unlock(mutex_held); + } } } @@ -2867,7 +2842,7 @@ void sorwakeup(struct socket *so) { if (sb_notify(&so->so_rcv)) { - sowakeup(so, &so->so_rcv); + sowakeup(so, &so->so_rcv, NULL); } } @@ -2875,7 +2850,7 @@ void sowwakeup(struct socket *so) { if (sb_notify(&so->so_snd)) { - sowakeup(so, &so->so_snd); + sowakeup(so, &so->so_snd, NULL); } } @@ -2895,7 +2870,8 @@ soevent(struct socket *so, long hint) if ((hint & SO_FILT_HINT_IFDENIED) && !(so->so_flags & SOF_MP_SUBFLOW) && !(so->so_restrictions & SO_RESTRICT_DENY_CELLULAR) && - !(so->so_restrictions & SO_RESTRICT_DENY_EXPENSIVE)) { + !(so->so_restrictions & SO_RESTRICT_DENY_EXPENSIVE) && + !(so->so_restrictions & SO_RESTRICT_DENY_CONSTRAINED)) { soevent_ifdenied(so); } } diff --git a/bsd/kern/uipc_syscalls.c b/bsd/kern/uipc_syscalls.c index b89e2cc49..b903e4a18 100644 --- a/bsd/kern/uipc_syscalls.c +++ b/bsd/kern/uipc_syscalls.c @@ -100,6 +100,8 @@ #include #include +#include + #if CONFIG_MACF_SOCKET_SUBSET #include #endif /* MAC_SOCKET_SUBSET */ @@ -1012,6 +1014,12 @@ connectitx(struct socket *so, struct sockaddr *src, if ((error = mac_socket_check_connect(kauth_cred_get(), so, dst)) != 0) { return error; } + + if (auio != NULL) { + if ((error = mac_socket_check_send(kauth_cred_get(), so, dst)) != 0) { + return error; + } + } #endif /* MAC_SOCKET_SUBSET */ socket_lock(so, 1); @@ -1816,8 +1824,8 @@ copyout_control(struct proc *p, struct mbuf *m, user_addr_t control, if (proc_is64bit(p)) { struct user64_timeval *tv64 = (struct user64_timeval *)(void *)CMSG_DATA(tmp_cp); - tv64->tv_sec = tv->tv_sec; - tv64->tv_usec = tv->tv_usec; + os_unaligned_deref(&tv64->tv_sec) = tv->tv_sec; + os_unaligned_deref(&tv64->tv_usec) = tv->tv_usec; tmp_cp->cmsg_len = CMSG_LEN(sizeof(struct user64_timeval)); tmp_space = CMSG_SPACE(sizeof(struct user64_timeval)); @@ -3440,9 +3448,11 @@ sendfile(struct proc *p, struct sendfile_args *uap, __unused int *retval) /* * Get number of bytes to send * Should it applies to size of header and trailer? - * JMM - error handling? */ - copyin(uap->nbytes, &nbytes, sizeof(off_t)); + error = copyin(uap->nbytes, &nbytes, sizeof(off_t)); + if (error) { + goto done2; + } /* * If specified, get the pointer to the sf_hdtr struct for diff --git a/bsd/kern/uipc_usrreq.c b/bsd/kern/uipc_usrreq.c index 2925a6fee..2c4434dc2 100644 --- a/bsd/kern/uipc_usrreq.c +++ b/bsd/kern/uipc_usrreq.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2015 Apple Inc. All rights reserved. + * Copyright (c) 2000-2019 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -161,7 +161,7 @@ struct mdns_ipc_msg_hdr { * need a proper out-of-band * lock pushdown */ -static struct sockaddr sun_noname = { sizeof(sun_noname), AF_LOCAL, { 0 } }; +static struct sockaddr sun_noname = { .sa_len = sizeof(sun_noname), .sa_family = AF_LOCAL, .sa_data = { 0 } }; static ino_t unp_ino; /* prototype for fake inode numbers */ static int unp_attach(struct socket *); @@ -392,7 +392,9 @@ uipc_rcvd(struct socket *so, __unused int flags) unp->unp_mbcnt = rcv->sb_mbcnt; snd->sb_hiwat += unp->unp_cc - rcv->sb_cc; unp->unp_cc = rcv->sb_cc; - sowwakeup(so2); + if (sb_notify(&so2->so_snd)) { + sowakeup(so2, &so2->so_snd, so); + } socket_unlock(so2, 1); @@ -495,7 +497,9 @@ uipc_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *nam, */ if (sbappendaddr(&so2->so_rcv, from, m, control, &error)) { control = NULL; - sorwakeup(so2); + if (sb_notify(&so2->so_rcv)) { + sowakeup(so2, &so2->so_rcv, so); + } } else if (control != NULL && error == 0) { /* A socket filter took control; don't touch it */ control = NULL; @@ -587,7 +591,9 @@ uipc_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *nam, unp->unp_conn->unp_cc = rcv->sb_cc; if (didreceive) { control = NULL; - sorwakeup(so2); + if (sb_notify(&so2->so_rcv)) { + sowakeup(so2, &so2->so_rcv, so); + } } else if (control != NULL && error == 0) { /* A socket filter took control; don't touch it */ control = NULL; @@ -1736,12 +1742,12 @@ unp_pcblist SYSCTL_HANDLER_ARGS * connect/disconnect races for SMP. */ if (unp->unp_addr) { - bcopy(unp->unp_addr, &xu.xu_addr, + bcopy(unp->unp_addr, &xu.xu_au, unp->unp_addr->sun_len); } if (unp->unp_conn && unp->unp_conn->unp_addr) { bcopy(unp->unp_conn->unp_addr, - &xu.xu_caddr, + &xu.xu_cau, unp->unp_conn->unp_addr->sun_len); } unpcb_to_compat(unp, &xu.xu_unp); @@ -1890,12 +1896,12 @@ unp_pcblist64 SYSCTL_HANDLER_ARGS * connect/disconnect races for SMP. */ if (unp->unp_addr) { - bcopy(unp->unp_addr, &xu.xunp_addr, + bcopy(unp->unp_addr, &xu.xu_au, unp->unp_addr->sun_len); } if (unp->unp_conn && unp->unp_conn->unp_addr) { bcopy(unp->unp_conn->unp_addr, - &xu.xunp_caddr, + &xu.xu_cau, unp->unp_conn->unp_addr->sun_len); } @@ -2019,14 +2025,13 @@ unp_externalize(struct mbuf *rights) if (fp == NULL) { panic("unp_externalize: MALLOC_ZONE"); } - fp->f_iocount = 0; fp->f_fglob = rp[i]; if (fg_removeuipc_mark(rp[i])) { /* * Take an iocount on the fp for completing the * removal from the global msg queue */ - fp->f_iocount++; + os_ref_retain_locked(&fp->f_iocount); fileproc_l[i] = fp; } else { fileproc_l[i] = NULL; diff --git a/bsd/libkern/copyio.h b/bsd/libkern/copyio.h index 8162ded60..fc782cf05 100644 --- a/bsd/libkern/copyio.h +++ b/bsd/libkern/copyio.h @@ -32,13 +32,13 @@ __BEGIN_DECLS -int copyin(const user_addr_t uaddr, void *kaddr, size_t len); +int copyin(const user_addr_t uaddr, void *kaddr, size_t len) OS_WARN_RESULT; int copyout(const void *kaddr, user_addr_t udaddr, size_t len); #if defined (_FORTIFY_SOURCE) && _FORTIFY_SOURCE == 0 /* FORTIFY_SOURCE disabled */ #else -__attribute__((always_inline)) static inline int +OS_ALWAYS_INLINE OS_WARN_RESULT static inline int __copyin_chk(const user_addr_t uaddr, void *kaddr, size_t len, size_t chk_size) { if (chk_size < len) { @@ -47,7 +47,7 @@ __copyin_chk(const user_addr_t uaddr, void *kaddr, size_t len, size_t chk_size) return copyin(uaddr, kaddr, len); } -__attribute__((always_inline)) static inline int +OS_ALWAYS_INLINE static inline int __copyout_chk(const void *kaddr, user_addr_t uaddr, size_t len, size_t chk_size) { if (chk_size < len) { diff --git a/bsd/libkern/libkern.h b/bsd/libkern/libkern.h index 9a4ecb489..ab4dfd3ee 100644 --- a/bsd/libkern/libkern.h +++ b/bsd/libkern/libkern.h @@ -144,9 +144,9 @@ ulmin(u_int32_t a, u_int32_t b) /* Prototypes for non-quad routines. */ -extern int ffs(int); +extern int ffs(unsigned int); extern int ffsll(unsigned long long); -extern int fls(int); +extern int fls(unsigned int); extern int flsll(unsigned long long); extern u_int32_t random(void); extern int scanc(u_int, u_char *, const u_char *, int); @@ -194,10 +194,15 @@ __nosan_crc16(uint16_t crc, const void *bufp, size_t len) #endif int copystr(const void *kfaddr, void *kdaddr, size_t len, size_t *done); -int copyinstr(const user_addr_t uaddr, void *kaddr, size_t len, size_t *done); +int copyinstr(const user_addr_t uaddr, void *kaddr, size_t len, size_t *done) OS_WARN_RESULT; int copyoutstr(const void *kaddr, user_addr_t udaddr, size_t len, size_t *done); #if XNU_KERNEL_PRIVATE -extern int copyin_word(const user_addr_t user_addr, uint64_t *kernel_addr, vm_size_t nbytes); +int copyin_atomic32(const user_addr_t user_addr, uint32_t *u32); +int copyin_atomic32_wait_if_equals(const user_addr_t user_addr, uint32_t u32); +int copyin_atomic64(const user_addr_t user_addr, uint64_t *u64); +int copyout_atomic32(uint32_t u32, user_addr_t user_addr); +int copyout_atomic64(uint64_t u64, user_addr_t user_addr); +int copyoutstr_prevalidate(const void *kaddr, user_addr_t uaddr, size_t len); #endif int vsscanf(const char *, char const *, va_list); @@ -206,7 +211,7 @@ extern int vprintf(const char *, va_list) __printflike(1, 0); extern int vsnprintf(char *, size_t, const char *, va_list) __printflike(3, 0); #if XNU_KERNEL_PRIVATE -extern int vprintf_log_locked(const char *, va_list) __printflike(1, 0); +extern int vprintf_log_locked(const char *, va_list, bool addcr) __printflike(1, 0); extern void osobject_retain(void * object); extern void osobject_release(void * object); #endif diff --git a/bsd/machine/Makefile b/bsd/machine/Makefile index bffe6a814..cd91a9858 100644 --- a/bsd/machine/Makefile +++ b/bsd/machine/Makefile @@ -13,6 +13,9 @@ DATAFILES = \ vmparam.h _types.h _limits.h _param.h \ _mcontext.h +DRIVERKIT_DATAFILES = \ + limits.h types.h _types.h + PRIVATE_DATAFILES = \ disklabel.h @@ -25,6 +28,7 @@ KERNELFILES = \ _mcontext.h INSTALL_MI_LIST = ${DATAFILES} +INSTALL_DRIVERKIT_MI_LIST = ${DRIVERKIT_DATAFILES} INSTALL_MI_LCL_LIST = ${PRIVATE_DATAFILES} INSTALL_MI_DIR = machine diff --git a/bsd/machine/exec.h b/bsd/machine/exec.h index f93f56bcf..cdbf3a2e3 100644 --- a/bsd/machine/exec.h +++ b/bsd/machine/exec.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2007 Apple Inc. All rights reserved. + * Copyright (c) 2000-2018 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -32,6 +32,7 @@ #define _BSD_MACHINE_EXEC_H_ #include +#include struct exec_info { char path[MAXPATHLEN]; @@ -41,15 +42,7 @@ struct exec_info { char **ev; }; -int grade_binary(cpu_type_t, cpu_subtype_t); +int grade_binary(cpu_type_t, cpu_subtype_t, bool allow_simulator_binary); boolean_t pie_required(cpu_type_t, cpu_subtype_t); -#if defined (__i386__) || defined(__x86_64__) -#include "i386/exec.h" -#elif defined (__arm__) || defined (__arm64__) -#include "arm/exec.h" -#else -#error architecture not supported -#endif - #endif /* _BSD_MACHINE_EXEC_H_ */ diff --git a/bsd/machine/reboot.h b/bsd/machine/reboot.h deleted file mode 100644 index ae3b8bfb0..000000000 --- a/bsd/machine/reboot.h +++ /dev/null @@ -1,39 +0,0 @@ -/* - * Copyright (c) 2000-2007 Apple Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -#ifndef _BSD_MACHINE_REBOOT_H_ -#define _BSD_MACHINE_REBOOT_H_ - -#if defined (__i386__) || defined(__x86_64__) -#include "i386/reboot.h" -#elif defined (__arm__) || defined (__arm64__) -#include "arm/reboot.h" -#else -#error architecture not supported -#endif - -#endif /* _BSD_MACHINE_REBOOT_H_ */ diff --git a/bsd/man/man2/access.2 b/bsd/man/man2/access.2 index c4b2f92f6..703ce97af 100644 --- a/bsd/man/man2/access.2 +++ b/bsd/man/man2/access.2 @@ -109,6 +109,12 @@ The checks for accessibility are performed using the effective user and group IDs instead of the real user and group ID as required in a call to .Fn access . .El +.Bl -tag -width indent +.It Dv AT_SYMLINK_NOFOLLOW +If +.Fa path +names a symbolic link, the status of the symbolic link is returned. +.El .Pp Even if a process has appropriate privileges and indicates success for .Dv X_OK , diff --git a/bsd/man/man2/chflags.2 b/bsd/man/man2/chflags.2 index 3463d7180..6bea184c4 100644 --- a/bsd/man/man2/chflags.2 +++ b/bsd/man/man2/chflags.2 @@ -77,6 +77,11 @@ The file has been archived. The file may not be changed. .It SF_APPEND The file may only be appended to. +.It SF_DATALESSFAULT +The file is a dataless placeholder. +The system will attempt to materialize it when accessed according to the dataless file materialization policy of the accessing thread or process. +See +.Xr getiopolicy_np 3 . .El .Pp The @@ -93,6 +98,10 @@ The and .Dq SF_APPEND flags may only be set or unset by the super-user. +.Pp +The +.Dq SF_DATALESSFAULT +flag is an internal flag and may not be set from user space. .Sh RETURN VALUES Upon successful completion, a value of 0 is returned. Otherwise, -1 is returned and the global variable diff --git a/bsd/man/man2/fcntl.2 b/bsd/man/man2/fcntl.2 index abafe4017..7390b18e3 100644 --- a/bsd/man/man2/fcntl.2 +++ b/bsd/man/man2/fcntl.2 @@ -146,7 +146,9 @@ as negative, otherwise .Fa arg is interpreted as a process ID. .It Dv F_GETPATH -Get the path of the file descriptor +Get the path of the file descriptor +.It Dv F_GETPATH_NOFIRMLINK +Get the non firmlinked path of the file descriptor .Fa Fildes . The argument must be a buffer of size .Sy MAXPATHLEN @@ -176,11 +178,6 @@ disables read ahead. A non-zero value in .Fa arg turns read ahead on. -.It Dv F_READBOOTSTRAP -Read bootstrap from disk. -.It Dv F_WRITEBOOTSTRAP -Write bootstrap on disk. -The calling process must have root privileges. .It Dv F_NOCACHE Turns data caching off/on. A non-zero value in .Fa arg @@ -199,6 +196,29 @@ to change. .It Dv F_LOG2PHYS_EXT Variant of F_LOG2PHYS that uses the passed in file offset and length. +.It Dv F_BARRIERFSYNC +Does the same thing as +.Xr fsync 2 +then issues a barrier command to the drive +.Fa ( arg +is ignored). +The barrier applies to I/O that have been flushed with +.Xr fsync 2 +on the same device before. These operations are guaranteed to +be persisted before any other I/O that would follow the barrier, +although no assumption should be made on what has been persisted +or not when this call returns. After the barrier has been issued, +operations on other FDs that have been fsync'd before can still be +re-ordered by the device, but not after the barrier. This is +typically useful to guarantee valid state on disk when ordering is a +concern but durability is not. A barrier can be used to order two phases of operations on +a set of file descriptors and ensure that no file can possibly get persisted +with the effect of the second phase without the effect of the first one. To do so, +execute operations of phase one, then +.Xr fsync 2 +each FD and issue a single barrier. Finally execute operations of phase two. +This is currently implemented on HFS and APFS. It requires hardware support, +which Apple SSDs are guaranteed to provide. .It Dv F_FULLFSYNC Does the same thing as .Xr fsync 2 @@ -207,8 +227,11 @@ flush all buffered data to the permanent storage device .Fa ( arg is ignored). +As this drains the entire queue of the device and acts as a +barrier, data that had been fsync'd on the same device before +is guaranteed to be persisted when this call returns. This is currently implemented on HFS, MS-DOS (FAT), -and Universal Disk Format (UDF) file systems. +Universal Disk Format (UDF) and APFS file systems. The operation may take quite a while to complete. Certain FireWire drives have also been known to ignore the request to flush their buffered data. diff --git a/bsd/man/man2/fs_snapshot_create.2 b/bsd/man/man2/fs_snapshot_create.2 index 6407f428e..6218679fe 100644 --- a/bsd/man/man2/fs_snapshot_create.2 +++ b/bsd/man/man2/fs_snapshot_create.2 @@ -196,8 +196,12 @@ The , .Fn fs_snapshot_delete , -.Fn fs_snapshot_delete -and .Fn fs_snapshot_list +, +.Fn fs_snapshot_mount +, +.Fn fs_snapshot_rename +and +.Fn fs_snapshot_revert function calls appeared in macOS version 10.13 . diff --git a/bsd/man/man2/fsgetpath.2 b/bsd/man/man2/fsgetpath.2 index 317c45cb7..e8b71b76b 100644 --- a/bsd/man/man2/fsgetpath.2 +++ b/bsd/man/man2/fsgetpath.2 @@ -67,7 +67,7 @@ multiple paths to that filesystem object may be returned. .Sh RETURN VALUES Upon successful completion, .Fn fsgetpath -returns the path length. Otherwise, a value of -1 is returned and errno is set to indicate the error. +returns the length of the path including the null terminator. Otherwise, a value of -1 is returned and errno is set to indicate the error. .Pp .Sh COMPATIBILITY Not all volumes support diff --git a/bsd/man/man2/getattrlist.2 b/bsd/man/man2/getattrlist.2 index b790fe013..f37a137aa 100644 --- a/bsd/man/man2/getattrlist.2 +++ b/bsd/man/man2/getattrlist.2 @@ -245,6 +245,12 @@ and can be requested. When this option is used, forkattrs are reinterpreted as a set of extended common attributes. . +.It FSOPT_RETURN_REALDEV +If this is bit is set, then ATTR_CMN_DEVID and ATTR_CMN_FSID will return +the values corresponding to the physical volume they are on. When a +filesystem supports VOL_CAP_INT_VOLUME_GROUPS, it is possible that the +filesystem may return a common logical value for these attributes otherwise. +. .El . .Sh ATTRIBUTE BUFFER @@ -433,7 +439,7 @@ in An .Vt fsobj_id_t structure that uniquely identifies the file system object within a mounted -volume for the duration of it's mount; this identifier is not guaranteed to be +volume for the duration of its mount; this identifier is not guaranteed to be persistent for the volume and may change every time the volume is mounted. .Pp On HFS+ volumes, the ATTR_CMN_OBJID of a file system object is distinct from @@ -1181,6 +1187,41 @@ system object. Although the ATTR_CMNEXT_LINKID of a file system object may appea similar (in whole or in part) to its ATTR_CMN_FILEID (see description of ATTR_CMN_FILEID above), \fBno relation between the two attributes should ever be implied.\fP . +.It ATTR_CMNEXT_NOFIRMLINKPATH +An +.Vt attrreference +structure containing a path that does not have firmlinks of +the file system object as +a UTF-8 encoded, null terminated C string. +The attribute data length will not be greater than +.Dv PATH_MAX. +Inconsistent behavior may be observed when this attribute is requested on +hard-linked items, particularly when the file system does not support +ATTR_CMN_PARENTID natively. Callers should be aware of this when requesting the +canonical path of a hard-linked item. +.It ATTR_CMNEXT_REALDEVID +A +.Vt dev_t +containing the real device number of the device on which this +file system object's volume is mounted. +Equivalent to the +.Fa st_dev +field of the +.Vt stat +structure returned by +.Xr stat 2 . +. +.It ATTR_CMNEXT_REALFSID +An +.Vt fsid_t +structure containing the real file system identifier for the volume on which +the file system object resides. +Equivalent to the +.Fa f_fsid +field of the +.Vt statfs +structure returned by +.Xr statfs 2 . .El . .Sh VOLUME CAPABILITIES @@ -1424,6 +1465,14 @@ See ATTR_CMN_FLAGS for more details. If this bit is set, the volume format does not support setting file permissions. See ATTR_CMN_USERACCESS for more details. +.It VOL_CAP_FMT_SHARED_SPACE +If this bit is set, the volume format supports having multiple logical filesystems +in a single "partition" which share space. +.It VOL_CAP_FMT_VOL_GROUPS +If this bit is set, the volume format supports having multiple logical filesystems +which may be mounted and unmounted together and may present common filesystem +identifier information. +. . .El .Pp @@ -1563,6 +1612,13 @@ operation. See .Xr rename 2 for more details. . +.It VOL_CAP_INT_RENAME_OPENFAIL +If this bit is set, the file system may fail a rename operation +of a directory if one of its descendents is open. +See +.Xr rename 2 +for more details. +. .El .Pp . diff --git a/bsd/man/man2/getattrlistbulk.2 b/bsd/man/man2/getattrlistbulk.2 index aaf91dd5a..60f7e4566 100644 --- a/bsd/man/man2/getattrlistbulk.2 +++ b/bsd/man/man2/getattrlistbulk.2 @@ -86,7 +86,7 @@ structure must be set. Volume attributes cannot be requested but all other supported getattrlist attributes can be used. For this function, .Dv ATTR_CMN_NAME and -.Dv ATRR_CMN_RETURNED_ATTRS +.Dv ATTR_CMN_RETURNED_ATTRS are required and the absence of these attributes in the attrList parameter results in an error. Note that not all attributes supported by .Fn getattrlist @@ -160,8 +160,23 @@ attributes and then use the value of the .Dv ATTR_CMN_OBJTYPE attribute to parse the resulting attribute buffer. .Pp -A directory which is a mount point for a file system, will have a value of "DIR_MNTSTATUS_MNTPOINT" set for it's the -ATTR_DIR_MOUNTSTATUS attribute entry. However the attributes for the mount point will be those from the (underlying) file system. The only way to get the attributes of mounted root directory is to call getattrlist(2) on the mount point. +A directory which is a mount point for a file system, will have a value of +.Dq DIR_MNTSTATUS_MNTPOINT +set for its ATTR_DIR_MOUNTSTATUS attribute entry. +However the attributes for the mount point will be those from the (underlying) file system. +To get the attributes of the mounted root directory, call +.Xr getattrlist 2 +on the mount point. +.Pp +A directory which is a firmlink will have the +.Dq SF_FIRMLINK +flag set in its ATTR_CMN_FLAGS attribute entry. +However, the attributes returned by +.Fn getattrlistbulk +will be those from the firmlink, not the firmlink's target. +To get the attribute of the firmlink's target, call +.Xr getattrlist 2 +on the firmlink. . .Sh RETURN VALUES Upon successful completion the numbers of entries successfully read diff --git a/bsd/man/man2/getdirentriesattr.2 b/bsd/man/man2/getdirentriesattr.2 index 6be39ee87..cc6c35ec7 100644 --- a/bsd/man/man2/getdirentriesattr.2 +++ b/bsd/man/man2/getdirentriesattr.2 @@ -254,7 +254,7 @@ you should be careful to support the behaviour specified by this document. . .Pp If the directory contains a mount point, then -.Dv DIR_MNTSTATUS_MNTPOINT +.Dq DIR_MNTSTATUS_MNTPOINT will be set in the .Dv ATTR_DIR_MOUNTSTATUS for that entry; all other attributes for that entry, however, @@ -262,6 +262,17 @@ will be for the underlying file system (as opposed to the mounted file system). .Xr getattrlist 2 should be used to get the attributes for the mount point. +.Pp +A directory which is a firmlink will have the +.Dq SF_FIRMLINK +flag set in its +ATTR_CMN_FLAGS attribute entry. +However the attributes returned by +.Fn getdirentriesattr +will be those from the firmlink, not the firmlink's target. +To get the attributes of the firmlink's target, call +.Xr getattrlist 2 +on the firmlink. .Sh ERRORS .Fn getdirentriesattr will fail if: diff --git a/bsd/man/man2/kqueue.2 b/bsd/man/man2/kqueue.2 index d1a535542..be1a477c7 100644 --- a/bsd/man/man2/kqueue.2 +++ b/bsd/man/man2/kqueue.2 @@ -145,6 +145,7 @@ The argument gives the size of .Fa changelist . +.Pp The .Fa eventlist argument @@ -158,11 +159,9 @@ The .Fa nevents argument determines the size of .Fa eventlist . -If the KEVENT_FLAG_STACK_EVENTS flag is provided on the system call, -the eventlist array is filled in in stack order (starting in the -highest available index) instead of typical array order. +.Pp The -.Fa out_data +.Fa data_out argument provides space for extra out data provided by specific filters. The .Fa data_available @@ -170,6 +169,7 @@ argument's contents specified the space available in the data pool on input, and contains the amount still remaining on output. If the KEVENT_FLAG_STACK_DATA flag is specified on the system call, the data is allocated from the pool in stack order instead of typical heap order. +.Pp If .Fa timeout is a non-NULL pointer, it specifies a maximum interval to wait diff --git a/bsd/man/man2/mkdir.2 b/bsd/man/man2/mkdir.2 index 249662258..acb59588e 100644 --- a/bsd/man/man2/mkdir.2 +++ b/bsd/man/man2/mkdir.2 @@ -188,6 +188,8 @@ argument is not an absolute path and is neither .Dv AT_FDCWD nor a file descriptor associated with a directory. +.It Bq Eq EILSEQ +The filename does not match the encoding rules. .El .Sh EXAMPLE .Bd -literal -offset indent diff --git a/bsd/man/man2/mkfifo.2 b/bsd/man/man2/mkfifo.2 index 69ecfd616..29e2d9240 100644 --- a/bsd/man/man2/mkfifo.2 +++ b/bsd/man/man2/mkfifo.2 @@ -112,6 +112,8 @@ error occurred while reading from or writing to the file system. .It Bq Er EFAULT .Fa Path points outside the process's allocated address space. +.It Bq Eq EILSEQ +The filename does not match the encoding rules. .El .Sh SEE ALSO .Xr chmod 2 , diff --git a/bsd/man/man2/open.2 b/bsd/man/man2/open.2 index 40a94d7d1..73eb670f7 100644 --- a/bsd/man/man2/open.2 +++ b/bsd/man/man2/open.2 @@ -129,14 +129,19 @@ and the behavior is identical to a call to The flags specified for the .Fa oflag -argument are formed by -.Em or Ns 'ing -the following values: +argument must include exactly one of the following file access modes: .Pp .Bd -literal -offset indent -compact O_RDONLY open for reading only O_WRONLY open for writing only O_RDWR open for reading and writing +.Ed +.Pp +In addition any combination of the following values can be +.Em or Ns 'ed in +.Fa oflag: +.Pp +.Bd -literal -offset indent -compact O_NONBLOCK do not block on open or for data to become available O_APPEND append on each write O_CREAT create file if it does not exist @@ -421,6 +426,8 @@ argument is not an absolute path and is neither .Dv AT_FDCWD nor a file descriptor associated with a directory. +.It Bq Eq EILSEQ +The filename does not match the encoding rules. .El .Sh COMPATIBILITY .Fn open diff --git a/bsd/man/man2/read.2 b/bsd/man/man2/read.2 index d8df563cc..cdf753cac 100644 --- a/bsd/man/man2/read.2 +++ b/bsd/man/man2/read.2 @@ -205,6 +205,15 @@ An action is requested of a device that does not exist.. .\" =========== .It Bq Er ENXIO A requested action cannot be performed by the device. +.\" =========== +.It Bq Er ESTALE +An attempt to read a remote file through NFS that has already been deleted in +the server. +.\" =========== +.It Bq Er ETIMEDOUT +The connection timed out while reading a remote file from a soft mounted NFS +volume (see +.Xr mount_nfs 8 ) . .El .Pp The diff --git a/bsd/man/man2/rename.2 b/bsd/man/man2/rename.2 index 49a562912..5151ce64c 100644 --- a/bsd/man/man2/rename.2 +++ b/bsd/man/man2/rename.2 @@ -204,6 +204,14 @@ The requested operation requires writing in a directory .Fa new , new/.., or old/..) whose modes disallow this. .\" =========== +.It Bq Er EACCES +.Fa old +is a directory and it, or some descendent in the namespace, is open +and the file system format does does not support renaming a directory +with open descendents (see +.Xr getattrlist 2 +.Dv VOL_CAP_INT_RENAME_OPENFAIL Ns ). +.\" =========== .It Bq Er EDQUOT The directory in which the entry for the new name is being placed cannot be extended because the diff --git a/bsd/man/man2/shmctl.2 b/bsd/man/man2/shmctl.2 index 0af2016ed..6f5c6e4bb 100644 --- a/bsd/man/man2/shmctl.2 +++ b/bsd/man/man2/shmctl.2 @@ -60,7 +60,7 @@ This structure is defined as follows in .Bd -literal struct shmid_ds { struct ipc_perm shm_perm; /* operation permissions */ - int shm_segsz; /* size of segment in bytes */ + size_t shm_segsz; /* size of segment in bytes */ pid_t shm_lpid; /* pid of last shm op */ pid_t shm_cpid; /* pid of creator */ short shm_nattch; /* # of current attaches */ diff --git a/bsd/man/man2/stat.2 b/bsd/man/man2/stat.2 index e6ed48bf0..197313465 100644 --- a/bsd/man/man2/stat.2 +++ b/bsd/man/man2/stat.2 @@ -324,8 +324,6 @@ and in .Fa struct statfs . Please refer to -.Xr stat 2 -and .Xr dir 5 for more detail on the specific changes to the other affected data structures. .Pp diff --git a/bsd/man/man2/symlink.2 b/bsd/man/man2/symlink.2 index ae6f2ad6b..0dd37fee0 100644 --- a/bsd/man/man2/symlink.2 +++ b/bsd/man/man2/symlink.2 @@ -203,6 +203,8 @@ argument is not an absolute path and is neither .Dv AT_FDCWD nor a file descriptor associated with a directory. +.It Bq Eq EILSEQ +The filename does not match the encoding rules. .El .Sh SEE ALSO .Xr ln 1 , diff --git a/bsd/man/man2/vfork.2 b/bsd/man/man2/vfork.2 index 3a07e9965..16e3dcdbc 100644 --- a/bsd/man/man2/vfork.2 +++ b/bsd/man/man2/vfork.2 @@ -1,5 +1,3 @@ -.\" $NetBSD: vfork.2,v 1.6 1995/02/27 12:39:30 cgd Exp $ -.\" .\" Copyright (c) 1980, 1991, 1993 .\" The Regents of the University of California. All rights reserved. .\" @@ -11,11 +9,7 @@ .\" 2. Redistributions in binary form must reproduce the above copyright .\" notice, this list of conditions and the following disclaimer in the .\" documentation and/or other materials provided with the distribution. -.\" 3. All advertising materials mentioning features or use of this software -.\" must display the following acknowledgement: -.\" This product includes software developed by the University of -.\" California, Berkeley and its contributors. -.\" 4. Neither the name of the University nor the names of its contributors +.\" 3. Neither the name of the University nor the names of its contributors .\" may be used to endorse or promote products derived from this software .\" without specific prior written permission. .\" @@ -32,71 +26,101 @@ .\" SUCH DAMAGE. .\" .\" @(#)vfork.2 8.1 (Berkeley) 6/4/93 +.\" $FreeBSD$ .\" -.Dd June 4, 1993 +.Dd May 22, 2016 .Dt VFORK 2 -.Os BSD 4 +.Os .Sh NAME .Nm vfork -.Nd spawn new process in a virtual memory efficient way +.Nd create a new process without copying the address space +.Sh LIBRARY +.Lb libc .Sh SYNOPSIS -.Fd #include +.In unistd.h .Ft pid_t -.Fo vfork -.Fa void -.Fc +.Fn vfork void .Sh DESCRIPTION +.Bf -symbolic +Since this function is hard to use correctly from application software, +it is recommended to use +.Xr posix_spawn 3 +or +.Xr fork 2 +instead. +.Ef +.Pp +The .Fn vfork +system call can be used to create new processes without fully copying the address -space of the old process, which is horrendously inefficient in a paged -environment. It is useful when the purpose of +space of the old process, which is inefficient in a paged +environment. +It is useful when the purpose of .Xr fork 2 would have been to create a new system context for an -.Xr execve . +.Xr execve 2 . +The .Fn vfork +system call differs from -.Xr fork -in that the child borrows the parent's memory and thread of -control until a call to +.Xr fork 2 +in that the child borrows the parent process's address space and the +calling thread's stack +until a call to .Xr execve 2 or an exit (either by a call to -.Xr exit 2 -or abnormally.) -The parent process is suspended while the child is using its resources. +.Xr _exit 2 +or abnormally). +The calling thread is suspended while the child is using its resources. +Other threads continue to run. .Pp +The .Fn vfork +system call returns 0 in the child's context and (later) the pid of the child in the parent's context. .Pp -.Fn vfork -can normally be used just like -.Xr fork . -It does not work, however, to return while running in the childs context +Many problems can occur when replacing +.Xr fork 2 +with +.Fn vfork . +For example, it does not work to return while running in the child's context from the procedure that called .Fn vfork since the eventual return from .Fn vfork would then return to a no longer existent stack frame. +Also, changing process state which is partially implemented in user space +such as signal handlers with +.Xr libthr 3 +will corrupt the parent's state. +.Pp Be careful, also, to call -.Xr _exit +.Xr _exit 2 rather than -.Xr exit -if you can't -.Xr execve , +.Xr exit 3 +if you cannot +.Xr execve 2 , since -.Xr exit +.Xr exit 3 will flush and close standard I/O channels, and thereby mess up the parent processes standard I/O data structures. (Even with -.Xr fork +.Xr fork 2 it is wrong to call -.Xr exit +.Xr exit 3 since buffered data would then be flushed twice.) -.Sh SEE ALSO -.Xr execve 2 , +.Pp +Unlike .Xr fork 2 , -.Xr sigaction 2 , -.Xr wait 2 +.Fn vfork +does not run +.Xr pthread_atfork 3 +handlers. +.Sh RETURN VALUES +Same as for +.Xr fork 2 . .Sh ERRORS The .Fn vfork @@ -117,15 +141,20 @@ is called following calling a .Fn vfork call. .El +.Sh SEE ALSO +.Xr _exit 2 , +.Xr execve 2 , +.Xr fork 2 , +.Xr sigaction 2 , +.Xr wait 2 , +.Xr exit 3 , +.Xr posix_spawn 3 +.Sh HISTORY +The +.Fn vfork +system call appeared in +.Bx 3 . .Sh BUGS -This system call will be eliminated when proper system sharing -mechanisms are implemented. -Users should not depend on the memory -sharing semantics of -.Xr vfork -as it will, in that case, be made synonymous to -.Xr fork . -.Pp To avoid a possible deadlock situation, processes that are children in the middle of a @@ -140,8 +169,15 @@ output or calls are allowed and input attempts result in an end-of-file indication. -.Sh HISTORY -The +.Sh CAVEATS +There are limits to what you can do in the child process. +To be totally safe you should restrict yourself to only +executing async-signal safe operations until such time +as one of the exec functions is called. All APIs, including +global data symbols, in any framework or library should be +assumed to be unsafe after a .Fn vfork -function call appeared in -.Bx 3.0 . +unless explicitly documented to be safe or async-signal +safe. If you need to use these frameworks in the child +process, you must exec. In this situation it is reasonable +to exec yourself. diff --git a/bsd/man/man3/Makefile b/bsd/man/man3/Makefile index fb44344cb..d9eeb14aa 100644 --- a/bsd/man/man3/Makefile +++ b/bsd/man/man3/Makefile @@ -47,6 +47,9 @@ INSTALL_MAN_LINKS = \ queue.3 TAILQ_REMOVE.3 \ posix_spawn_file_actions_addclose.3 posix_spawn_file_actions_addopen.3 \ posix_spawn_file_actions_addclose.3 posix_spawn_file_actions_adddup2.3 \ + posix_spawn_file_actions_addclose.3 posix_spawn_file_actions_addinherit_np.3 \ + posix_spawn_file_actions_addclose.3 posix_spawn_file_actions_addchdir_np.3 \ + posix_spawn_file_actions_addclose.3 posix_spawn_file_actions_addfchdir_np.3 \ posix_spawn_file_actions_init.3 posix_spawn_file_actions_destroy.3 \ posix_spawnattr_init.3 posix_spawnattr_destroy.3 \ posix_spawnattr_setbinpref_np.3 posix_spawnattr_getbinpref_np.3 \ diff --git a/bsd/man/man3/getiopolicy_np.3 b/bsd/man/man3/getiopolicy_np.3 index 8c7e69743..20e6f7543 100644 --- a/bsd/man/man3/getiopolicy_np.3 +++ b/bsd/man/man3/getiopolicy_np.3 @@ -1,4 +1,4 @@ -.Dd April 30, 2013 +.Dd February 11, 2019 .Dt getiopolicy_np 3 .Os .Sh NAME @@ -110,7 +110,7 @@ This lets users change the access time updates policy for the files accessed by the current thread or process. .Pp -IOPOL_TYPE_VFS_ATIME_UPDATES supports following values for +IOPOL_TYPE_VFS_ATIME_UPDATES supports the following values for .Fa policy: .Bl -tag -width IOPOL_ATIME_UPDATES_DEFAULT .It IOPOL_ATIME_UPDATES_OFF @@ -120,12 +120,37 @@ to reduce the metadata I/O writes. .It IOPOL_ATIME_UPDATES_DEFAULT This is the default I/O policy for new threads. .El -.El .Pp Like with IOPOL_TYPE_DISK, the I/O policy of a newly created process is inherited from its parent process. Access time updates are turned off if the I/O policy is set to IOPOL_ATIME_UPDATES_OFF for the current thread or current process. +.It IOPOL_TYPE_VFS_MATERIALIZE_DATALESS_FILES +This +.Fa iotype +lets users change the materialization policy for dataless files accessed +by the current thread or process. +.Pp +IOPOL_TYPE_VFS_MATERIALIZE_DATALESS_FILES supports the following values for +.Fa policy: +.Bl -tag -width IOPOL_MATERIALIZE_DATALESS +.It IOPOL_MATERIALIZE_DATALESS_FILES_DEFAULT +Selects the default materialization policy. +For IOPOL_SCOPE_THREAD, all accesses by the current thread will follow the +process policy. +For IOPOL_SCOPE_PROCESS, all accesses will follow the system default +policy +.Pq IOPOL_MATERIALIZE_DATALESS_FILES_OFF . +.It IOPOL_MATERIALIZE_DATALESS_FILES_OFF +Disables materialization of dataless files by the current thread or +process. +.It IOPOL_MATERIALIZE_DATALESS_FILES_ON +Enables materialization of dataless files by the current thread or +process. +.El +.Pp +New processes inherit the policy of their parent process. +.El .Sh RETURN VALUES The .Fn getiopolicy_np diff --git a/bsd/man/man3/posix_spawn_file_actions_addclose.3 b/bsd/man/man3/posix_spawn_file_actions_addclose.3 index 36c64715d..e2915720a 100644 --- a/bsd/man/man3/posix_spawn_file_actions_addclose.3 +++ b/bsd/man/man3/posix_spawn_file_actions_addclose.3 @@ -61,6 +61,16 @@ .Fa "posix_spawn_file_actions_t *file_actions" .Fa "int filedes" .Fc +.Ft int +.Fo posix_spawn_file_actions_addchdir_np +.Fa "posix_spawn_file_actions_t *file_actions" +.Fa "const char *restrict path" +.Fc +.Ft int +.Fo posix_spawn_file_actions_addfchdir_np +.Fa "posix_spawn_file_actions_t *file_actions" +.Fa "int filedes" +.Fc .Sh DESCRIPTION The .Fn posix_spawn_file_actions_addclose @@ -156,6 +166,45 @@ are made available in the spawned process. In that case, can be used to make specific pre-existing file descriptors from the parent process be available in the spawned process. +.Pp +The +.Fn posix_spawn_file_actions_addchdir +function adds an chdir operation to the list of operations associated with +the object referenced by +.Em file_actions , +for subsequent use in a call to +.Xr posix_spawn 2 +or +.Xr posix_spawnp 2 . +The current working directory will be set as if +.Fn chdir +had been called with +.Em path +prior to the new child process starting execution. +.Pp +The +.Fn posix_spawn_file_actions_addfchdir +function adds a fchdir operation to the list of operations associated with +the object referenced by +.Em file_actions , +for subsequent use in a call to +.Xr posix_spawn 2 +or +.Xr posix_spawnp 2 . +The current working directory will be set as if +.Fn fchdir +had been called with +.Em filedes +prior to the new child process starting execution. +When +.Em POSIX_SPAWN_CLOEXEC_DEFAULT +is set, the file descriptor +.Em filedes +will not be automatically inherited unless an explicit +.Fn posix_spawn_file_actions_addinherit_np +action for +.Em filedes +has been added. .Sh RETURN VALUES On success, these functions return 0; on failure they return an error number from @@ -175,6 +224,12 @@ The value of .Fa file_actions is invalid. .\" ========== +.It Bq Er ENAMETOOLONG +The length of the value specified by +.Fa path +exceeds +.Dv PATH_MAX. +.\" ========== .It Bq Er ENOMEM Insufficient memory was available to add the new action to .Fa file_actions . diff --git a/bsd/man/man3/posix_spawnattr_setflags.3 b/bsd/man/man3/posix_spawnattr_setflags.3 index 21666ad8d..0077fa622 100644 --- a/bsd/man/man3/posix_spawnattr_setflags.3 +++ b/bsd/man/man3/posix_spawnattr_setflags.3 @@ -125,7 +125,7 @@ To resume the child process, it must be sent a signal. .It Dv POSIX_SPAWN_CLOEXEC_DEFAULT .Em Apple Extension : -If this bit is set, then only file descriptors explicitly described by the +If this bit is set, then only file descriptors explicitly created by the .Fa file_actions argument are available in the spawned process; all of the other file descriptors are automatically closed in the spawned process. diff --git a/bsd/miscfs/deadfs/dead_vnops.c b/bsd/miscfs/deadfs/dead_vnops.c index 18eade2fa..c3efcf70f 100644 --- a/bsd/miscfs/deadfs/dead_vnops.c +++ b/bsd/miscfs/deadfs/dead_vnops.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2006 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2019 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -113,46 +113,46 @@ int dead_blockmap(struct vnop_blockmap_args *); #define VOPFUNC int (*)(void *) int(**dead_vnodeop_p)(void *); -struct vnodeopv_entry_desc dead_vnodeop_entries[] = { - { &vnop_default_desc, (VOPFUNC)vn_default_error }, - { &vnop_lookup_desc, (VOPFUNC)dead_lookup }, /* lookup */ - { &vnop_create_desc, (VOPFUNC)dead_create }, /* create */ - { &vnop_open_desc, (VOPFUNC)dead_open }, /* open */ - { &vnop_mknod_desc, (VOPFUNC)dead_mknod }, /* mknod */ - { &vnop_close_desc, (VOPFUNC)dead_close }, /* close */ - { &vnop_access_desc, (VOPFUNC)dead_access }, /* access */ - { &vnop_getattr_desc, (VOPFUNC)dead_getattr }, /* getattr */ - { &vnop_setattr_desc, (VOPFUNC)dead_setattr }, /* setattr */ - { &vnop_read_desc, (VOPFUNC)dead_read }, /* read */ - { &vnop_write_desc, (VOPFUNC)dead_write }, /* write */ - { &vnop_ioctl_desc, (VOPFUNC)dead_ioctl }, /* ioctl */ - { &vnop_select_desc, (VOPFUNC)dead_select }, /* select */ - { &vnop_mmap_desc, (VOPFUNC)dead_mmap }, /* mmap */ - { &vnop_fsync_desc, (VOPFUNC)dead_fsync }, /* fsync */ - { &vnop_remove_desc, (VOPFUNC)dead_remove }, /* remove */ - { &vnop_link_desc, (VOPFUNC)dead_link }, /* link */ - { &vnop_rename_desc, (VOPFUNC)dead_rename }, /* rename */ - { &vnop_mkdir_desc, (VOPFUNC)dead_mkdir }, /* mkdir */ - { &vnop_rmdir_desc, (VOPFUNC)dead_rmdir }, /* rmdir */ - { &vnop_symlink_desc, (VOPFUNC)dead_symlink }, /* symlink */ - { &vnop_readdir_desc, (VOPFUNC)dead_readdir }, /* readdir */ - { &vnop_readlink_desc, (VOPFUNC)dead_readlink }, /* readlink */ - { &vnop_inactive_desc, (VOPFUNC)dead_inactive }, /* inactive */ - { &vnop_reclaim_desc, (VOPFUNC)dead_reclaim }, /* reclaim */ - { &vnop_strategy_desc, (VOPFUNC)dead_strategy }, /* strategy */ - { &vnop_pathconf_desc, (VOPFUNC)dead_pathconf }, /* pathconf */ - { &vnop_advlock_desc, (VOPFUNC)dead_advlock }, /* advlock */ - { &vnop_bwrite_desc, (VOPFUNC)dead_bwrite }, /* bwrite */ - { &vnop_pagein_desc, (VOPFUNC)err_pagein }, /* Pagein */ - { &vnop_pageout_desc, (VOPFUNC)err_pageout }, /* Pageout */ - { &vnop_copyfile_desc, (VOPFUNC)err_copyfile }, /* Copyfile */ - { &vnop_blktooff_desc, (VOPFUNC)dead_blktooff }, /* blktooff */ - { &vnop_offtoblk_desc, (VOPFUNC)dead_offtoblk }, /* offtoblk */ - { &vnop_blockmap_desc, (VOPFUNC)dead_blockmap }, /* blockmap */ - { (struct vnodeop_desc*)NULL, (VOPFUNC)NULL } +const struct vnodeopv_entry_desc dead_vnodeop_entries[] = { + { .opve_op = &vnop_default_desc, .opve_impl = (VOPFUNC)vn_default_error }, + { .opve_op = &vnop_lookup_desc, .opve_impl = (VOPFUNC)dead_lookup }, /* lookup */ + { .opve_op = &vnop_create_desc, .opve_impl = (VOPFUNC)dead_create }, /* create */ + { .opve_op = &vnop_open_desc, .opve_impl = (VOPFUNC)dead_open }, /* open */ + { .opve_op = &vnop_mknod_desc, .opve_impl = (VOPFUNC)dead_mknod }, /* mknod */ + { .opve_op = &vnop_close_desc, .opve_impl = (VOPFUNC)dead_close }, /* close */ + { .opve_op = &vnop_access_desc, .opve_impl = (VOPFUNC)dead_access }, /* access */ + { .opve_op = &vnop_getattr_desc, .opve_impl = (VOPFUNC)dead_getattr }, /* getattr */ + { .opve_op = &vnop_setattr_desc, .opve_impl = (VOPFUNC)dead_setattr }, /* setattr */ + { .opve_op = &vnop_read_desc, .opve_impl = (VOPFUNC)dead_read }, /* read */ + { .opve_op = &vnop_write_desc, .opve_impl = (VOPFUNC)dead_write }, /* write */ + { .opve_op = &vnop_ioctl_desc, .opve_impl = (VOPFUNC)dead_ioctl }, /* ioctl */ + { .opve_op = &vnop_select_desc, .opve_impl = (VOPFUNC)dead_select }, /* select */ + { .opve_op = &vnop_mmap_desc, .opve_impl = (VOPFUNC)dead_mmap }, /* mmap */ + { .opve_op = &vnop_fsync_desc, .opve_impl = (VOPFUNC)dead_fsync }, /* fsync */ + { .opve_op = &vnop_remove_desc, .opve_impl = (VOPFUNC)dead_remove }, /* remove */ + { .opve_op = &vnop_link_desc, .opve_impl = (VOPFUNC)dead_link }, /* link */ + { .opve_op = &vnop_rename_desc, .opve_impl = (VOPFUNC)dead_rename }, /* rename */ + { .opve_op = &vnop_mkdir_desc, .opve_impl = (VOPFUNC)dead_mkdir }, /* mkdir */ + { .opve_op = &vnop_rmdir_desc, .opve_impl = (VOPFUNC)dead_rmdir }, /* rmdir */ + { .opve_op = &vnop_symlink_desc, .opve_impl = (VOPFUNC)dead_symlink }, /* symlink */ + { .opve_op = &vnop_readdir_desc, .opve_impl = (VOPFUNC)dead_readdir }, /* readdir */ + { .opve_op = &vnop_readlink_desc, .opve_impl = (VOPFUNC)dead_readlink }, /* readlink */ + { .opve_op = &vnop_inactive_desc, .opve_impl = (VOPFUNC)dead_inactive }, /* inactive */ + { .opve_op = &vnop_reclaim_desc, .opve_impl = (VOPFUNC)dead_reclaim }, /* reclaim */ + { .opve_op = &vnop_strategy_desc, .opve_impl = (VOPFUNC)dead_strategy }, /* strategy */ + { .opve_op = &vnop_pathconf_desc, .opve_impl = (VOPFUNC)dead_pathconf }, /* pathconf */ + { .opve_op = &vnop_advlock_desc, .opve_impl = (VOPFUNC)dead_advlock }, /* advlock */ + { .opve_op = &vnop_bwrite_desc, .opve_impl = (VOPFUNC)dead_bwrite }, /* bwrite */ + { .opve_op = &vnop_pagein_desc, .opve_impl = (VOPFUNC)err_pagein }, /* Pagein */ + { .opve_op = &vnop_pageout_desc, .opve_impl = (VOPFUNC)err_pageout }, /* Pageout */ + { .opve_op = &vnop_copyfile_desc, .opve_impl = (VOPFUNC)err_copyfile }, /* Copyfile */ + { .opve_op = &vnop_blktooff_desc, .opve_impl = (VOPFUNC)dead_blktooff }, /* blktooff */ + { .opve_op = &vnop_offtoblk_desc, .opve_impl = (VOPFUNC)dead_offtoblk }, /* offtoblk */ + { .opve_op = &vnop_blockmap_desc, .opve_impl = (VOPFUNC)dead_blockmap }, /* blockmap */ + { .opve_op = (struct vnodeop_desc*)NULL, .opve_impl = (VOPFUNC)NULL } }; -struct vnodeopv_desc dead_vnodeop_opv_desc = -{ &dead_vnodeop_p, dead_vnodeop_entries }; +const struct vnodeopv_desc dead_vnodeop_opv_desc = +{ .opv_desc_vector_p = &dead_vnodeop_p, .opv_desc_ops = dead_vnodeop_entries }; /* * Trivial lookup routine that always fails. diff --git a/bsd/miscfs/devfs/devfs_fdesc_support.c b/bsd/miscfs/devfs/devfs_fdesc_support.c index 28806babe..b83108b52 100644 --- a/bsd/miscfs/devfs/devfs_fdesc_support.c +++ b/bsd/miscfs/devfs/devfs_fdesc_support.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2006 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2019 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -736,45 +736,45 @@ fdesc_badop(void) #define fdesc_blockmap (int (*) (struct vnop_blockmap_args *))eopnotsupp int(**fdesc_vnodeop_p)(void *); -struct vnodeopv_entry_desc devfs_fdesc_vnodeop_entries[] = { - { &vnop_default_desc, (VOPFUNC)vn_default_error }, - { &vnop_lookup_desc, (VOPFUNC)vn_default_error}, /* lookup */ - { &vnop_create_desc, (VOPFUNC)fdesc_create }, /* create */ - { &vnop_mknod_desc, (VOPFUNC)fdesc_mknod }, /* mknod */ - { &vnop_open_desc, (VOPFUNC)fdesc_open }, /* open */ - { &vnop_close_desc, (VOPFUNC)fdesc_close }, /* close */ - { &vnop_access_desc, (VOPFUNC)fdesc_access }, /* access */ - { &vnop_getattr_desc, (VOPFUNC)fdesc_getattr }, /* getattr */ - { &vnop_setattr_desc, (VOPFUNC)fdesc_setattr }, /* setattr */ - { &vnop_read_desc, (VOPFUNC)fdesc_read }, /* read */ - { &vnop_write_desc, (VOPFUNC)fdesc_write }, /* write */ - { &vnop_ioctl_desc, (VOPFUNC)fdesc_ioctl }, /* ioctl */ - { &vnop_select_desc, (VOPFUNC)fdesc_select }, /* select */ - { &vnop_revoke_desc, (VOPFUNC)fdesc_revoke }, /* revoke */ - { &vnop_mmap_desc, (VOPFUNC)fdesc_mmap }, /* mmap */ - { &vnop_fsync_desc, (VOPFUNC)fdesc_fsync }, /* fsync */ - { &vnop_remove_desc, (VOPFUNC)fdesc_remove }, /* remove */ - { &vnop_link_desc, (VOPFUNC)fdesc_link }, /* link */ - { &vnop_rename_desc, (VOPFUNC)fdesc_rename }, /* rename */ - { &vnop_mkdir_desc, (VOPFUNC)fdesc_mkdir }, /* mkdir */ - { &vnop_rmdir_desc, (VOPFUNC)fdesc_rmdir }, /* rmdir */ - { &vnop_symlink_desc, (VOPFUNC)fdesc_symlink }, /* symlink */ - { &vnop_readdir_desc, (VOPFUNC)vn_default_error},/* readdir */ - { &vnop_readlink_desc, (VOPFUNC)err_readlink}, /* readlink */ - { &vnop_inactive_desc, (VOPFUNC)fdesc_inactive },/* inactive */ - { &vnop_reclaim_desc, (VOPFUNC)fdesc_reclaim }, /* reclaim */ - { &vnop_strategy_desc, (VOPFUNC)fdesc_strategy }, /* strategy */ - { &vnop_pathconf_desc, (VOPFUNC)fdesc_pathconf }, /* pathconf */ - { &vnop_advlock_desc, (VOPFUNC)fdesc_advlock }, /* advlock */ - { &vnop_bwrite_desc, (VOPFUNC)fdesc_bwrite }, /* bwrite */ - { &vnop_pagein_desc, (VOPFUNC)err_pagein }, /* pagein */ - { &vnop_pageout_desc, (VOPFUNC)err_pageout }, /* pageout */ - { &vnop_copyfile_desc, (VOPFUNC)err_copyfile }, /* Copyfile */ - { &vnop_blktooff_desc, (VOPFUNC)fdesc_blktooff }, /* blktooff */ - { &vnop_blktooff_desc, (VOPFUNC)fdesc_offtoblk }, /* offtoblk */ - { &vnop_blockmap_desc, (VOPFUNC)fdesc_blockmap }, /* blockmap */ - { (struct vnodeop_desc*)NULL, (VOPFUNC)NULL } +const struct vnodeopv_entry_desc devfs_fdesc_vnodeop_entries[] = { + { .opve_op = &vnop_default_desc, .opve_impl = (VOPFUNC)vn_default_error }, + { .opve_op = &vnop_lookup_desc, .opve_impl = (VOPFUNC)vn_default_error}, /* lookup */ + { .opve_op = &vnop_create_desc, .opve_impl = (VOPFUNC)fdesc_create }, /* create */ + { .opve_op = &vnop_mknod_desc, .opve_impl = (VOPFUNC)fdesc_mknod }, /* mknod */ + { .opve_op = &vnop_open_desc, .opve_impl = (VOPFUNC)fdesc_open }, /* open */ + { .opve_op = &vnop_close_desc, .opve_impl = (VOPFUNC)fdesc_close }, /* close */ + { .opve_op = &vnop_access_desc, .opve_impl = (VOPFUNC)fdesc_access }, /* access */ + { .opve_op = &vnop_getattr_desc, .opve_impl = (VOPFUNC)fdesc_getattr }, /* getattr */ + { .opve_op = &vnop_setattr_desc, .opve_impl = (VOPFUNC)fdesc_setattr }, /* setattr */ + { .opve_op = &vnop_read_desc, .opve_impl = (VOPFUNC)fdesc_read }, /* read */ + { .opve_op = &vnop_write_desc, .opve_impl = (VOPFUNC)fdesc_write }, /* write */ + { .opve_op = &vnop_ioctl_desc, .opve_impl = (VOPFUNC)fdesc_ioctl }, /* ioctl */ + { .opve_op = &vnop_select_desc, .opve_impl = (VOPFUNC)fdesc_select }, /* select */ + { .opve_op = &vnop_revoke_desc, .opve_impl = (VOPFUNC)fdesc_revoke }, /* revoke */ + { .opve_op = &vnop_mmap_desc, .opve_impl = (VOPFUNC)fdesc_mmap }, /* mmap */ + { .opve_op = &vnop_fsync_desc, .opve_impl = (VOPFUNC)fdesc_fsync }, /* fsync */ + { .opve_op = &vnop_remove_desc, .opve_impl = (VOPFUNC)fdesc_remove }, /* remove */ + { .opve_op = &vnop_link_desc, .opve_impl = (VOPFUNC)fdesc_link }, /* link */ + { .opve_op = &vnop_rename_desc, .opve_impl = (VOPFUNC)fdesc_rename }, /* rename */ + { .opve_op = &vnop_mkdir_desc, .opve_impl = (VOPFUNC)fdesc_mkdir }, /* mkdir */ + { .opve_op = &vnop_rmdir_desc, .opve_impl = (VOPFUNC)fdesc_rmdir }, /* rmdir */ + { .opve_op = &vnop_symlink_desc, .opve_impl = (VOPFUNC)fdesc_symlink }, /* symlink */ + { .opve_op = &vnop_readdir_desc, .opve_impl = (VOPFUNC)vn_default_error},/* readdir */ + { .opve_op = &vnop_readlink_desc, .opve_impl = (VOPFUNC)err_readlink}, /* readlink */ + { .opve_op = &vnop_inactive_desc, .opve_impl = (VOPFUNC)fdesc_inactive },/* inactive */ + { .opve_op = &vnop_reclaim_desc, .opve_impl = (VOPFUNC)fdesc_reclaim }, /* reclaim */ + { .opve_op = &vnop_strategy_desc, .opve_impl = (VOPFUNC)fdesc_strategy }, /* strategy */ + { .opve_op = &vnop_pathconf_desc, .opve_impl = (VOPFUNC)fdesc_pathconf }, /* pathconf */ + { .opve_op = &vnop_advlock_desc, .opve_impl = (VOPFUNC)fdesc_advlock }, /* advlock */ + { .opve_op = &vnop_bwrite_desc, .opve_impl = (VOPFUNC)fdesc_bwrite }, /* bwrite */ + { .opve_op = &vnop_pagein_desc, .opve_impl = (VOPFUNC)err_pagein }, /* pagein */ + { .opve_op = &vnop_pageout_desc, .opve_impl = (VOPFUNC)err_pageout }, /* pageout */ + { .opve_op = &vnop_copyfile_desc, .opve_impl = (VOPFUNC)err_copyfile }, /* Copyfile */ + { .opve_op = &vnop_blktooff_desc, .opve_impl = (VOPFUNC)fdesc_blktooff }, /* blktooff */ + { .opve_op = &vnop_blktooff_desc, .opve_impl = (VOPFUNC)fdesc_offtoblk }, /* offtoblk */ + { .opve_op = &vnop_blockmap_desc, .opve_impl = (VOPFUNC)fdesc_blockmap }, /* blockmap */ + { .opve_op = (struct vnodeop_desc*)NULL, .opve_impl = (VOPFUNC)NULL } }; -struct vnodeopv_desc devfs_fdesc_vnodeop_opv_desc = -{ &fdesc_vnodeop_p, devfs_fdesc_vnodeop_entries }; +const struct vnodeopv_desc devfs_fdesc_vnodeop_opv_desc = +{ .opv_desc_vector_p = &fdesc_vnodeop_p, .opv_desc_ops = devfs_fdesc_vnodeop_entries }; diff --git a/bsd/miscfs/devfs/devfs_tree.c b/bsd/miscfs/devfs/devfs_tree.c index fe32a3c68..589472d57 100644 --- a/bsd/miscfs/devfs/devfs_tree.c +++ b/bsd/miscfs/devfs/devfs_tree.c @@ -99,6 +99,7 @@ #include #include #include +#include #define BSD_KERNEL_PRIVATE 1 /* devfs_make_link() prototype */ #include "devfs.h" #include "devfsdefs.h" @@ -150,6 +151,8 @@ lck_attr_t * devfs_lck_attr; lck_mtx_t devfs_mutex; lck_mtx_t devfs_attr_mutex; +os_refgrp_decl(static, devfs_refgrp, "devfs", NULL); + devdirent_t * dev_root = NULL; /* root of backing tree */ struct devfs_stats devfs_stats; /* hold stats */ @@ -515,6 +518,7 @@ dev_add_node(int entrytype, devnode_type_t * typeinfo, devnode_t * proto, devnode_t * *dn_pp, struct devfsmount *dvm) { devnode_t * dnp = NULL; + int error = 0; #if defined SPLIT_DEVS /* @@ -587,7 +591,9 @@ dev_add_node(int entrytype, devnode_type_t * typeinfo, devnode_t * proto, #endif } dnp->dn_dvm = dvm; - dnp->dn_refcount = 0; + + /* Note: this inits the reference count to 1, this is considered unreferenced */ + os_ref_init_raw(&dnp->dn_refcount, &devfs_refgrp); dnp->dn_ino = devfs_unique_fileno; devfs_unique_fileno++; @@ -627,8 +633,8 @@ dev_add_node(int entrytype, devnode_type_t * typeinfo, devnode_t * proto, typeinfo->Slnk.namelen + 1, M_DEVFSNODE, M_WAITOK); if (!dnp->dn_typeinfo.Slnk.name) { - FREE(dnp, M_DEVFSNODE); - return ENOMEM; + error = ENOMEM; + break; } strlcpy(dnp->dn_typeinfo.Slnk.name, typeinfo->Slnk.name, typeinfo->Slnk.namelen + 1); @@ -656,12 +662,17 @@ dev_add_node(int entrytype, devnode_type_t * typeinfo, devnode_t * proto, #endif /* FDESC */ default: - return EINVAL; + error = EINVAL; } - *dn_pp = dnp; - DEVFS_INCR_NODES(); - return 0; + if (error) { + FREE(dnp, M_DEVFSNODE); + } else { + *dn_pp = dnp; + DEVFS_INCR_NODES(); + } + + return error; } @@ -698,7 +709,10 @@ devfs_dn_free(devnode_t * dnp) } /* Can only free if there are no references; otherwise, wait for last vnode to be reclaimed */ - if (dnp->dn_refcount == 0) { + os_ref_count_t rc = os_ref_get_count_raw(&dnp->dn_refcount); + if (rc == 1) { + /* release final reference from dev_add_node */ + (void) os_ref_release_locked_raw(&dnp->dn_refcount, &devfs_refgrp); devnode_free(dnp); } else { dnp->dn_lflags |= DN_DELETE; @@ -1362,20 +1376,22 @@ out: void devfs_ref_node(devnode_t *dnp) { - dnp->dn_refcount++; + os_ref_retain_locked_raw(&dnp->dn_refcount, &devfs_refgrp); } /* * Release a reference on a devnode. If the devnode is marked for - * free and the refcount is dropped to zero, do the free. + * free and the refcount is dropped to one, do the free. */ void devfs_rele_node(devnode_t *dnp) { - dnp->dn_refcount--; - if (dnp->dn_refcount < 0) { - panic("devfs_rele_node: devnode with a negative refcount!\n"); - } else if ((dnp->dn_refcount == 0) && (dnp->dn_lflags & DN_DELETE)) { + os_ref_count_t rc = os_ref_release_locked_raw(&dnp->dn_refcount, &devfs_refgrp); + if (rc < 1) { + panic("devfs_rele_node: devnode without a refcount!\n"); + } else if ((rc == 1) && (dnp->dn_lflags & DN_DELETE)) { + /* release final reference from dev_add_node */ + (void) os_ref_release_locked_raw(&dnp->dn_refcount, &devfs_refgrp); devnode_free(dnp); } } diff --git a/bsd/miscfs/devfs/devfs_vfsops.c b/bsd/miscfs/devfs/devfs_vfsops.c index ab3986460..a1392ce2a 100644 --- a/bsd/miscfs/devfs/devfs_vfsops.c +++ b/bsd/miscfs/devfs/devfs_vfsops.c @@ -504,7 +504,7 @@ devfs_kernel_mount(char * mntname) return 0; } -struct vfsops devfs_vfsops = { +const struct vfsops devfs_vfsops = { .vfs_mount = devfs_mount, .vfs_start = devfs_start, .vfs_unmount = devfs_unmount, diff --git a/bsd/miscfs/devfs/devfs_vnops.c b/bsd/miscfs/devfs/devfs_vnops.c index 3377d3a35..b9de4b101 100644 --- a/bsd/miscfs/devfs/devfs_vnops.c +++ b/bsd/miscfs/devfs/devfs_vnops.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2016 Apple Inc. All rights reserved. + * Copyright (c) 2000-2019 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -1583,118 +1583,118 @@ devfs_update(struct vnode *vp, struct timeval *access, struct timeval *modify) /* The following ops are used by directories and symlinks */ int(**devfs_vnodeop_p)(void *); -static struct vnodeopv_entry_desc devfs_vnodeop_entries[] = { - { &vnop_default_desc, (VOPFUNC)vn_default_error }, - { &vnop_lookup_desc, (VOPFUNC)devfs_lookup }, /* lookup */ - { &vnop_create_desc, (VOPFUNC)err_create }, /* create */ - { &vnop_whiteout_desc, (VOPFUNC)err_whiteout }, /* whiteout */ - { &vnop_mknod_desc, (VOPFUNC)devfs_mknod }, /* mknod */ - { &vnop_open_desc, (VOPFUNC)nop_open }, /* open */ - { &vnop_close_desc, (VOPFUNC)devfs_close }, /* close */ - { &vnop_getattr_desc, (VOPFUNC)devfs_getattr }, /* getattr */ - { &vnop_setattr_desc, (VOPFUNC)devfs_setattr }, /* setattr */ - { &vnop_read_desc, (VOPFUNC)devfs_read }, /* read */ - { &vnop_write_desc, (VOPFUNC)devfs_write }, /* write */ - { &vnop_ioctl_desc, (VOPFUNC)err_ioctl }, /* ioctl */ - { &vnop_select_desc, (VOPFUNC)err_select }, /* select */ - { &vnop_revoke_desc, (VOPFUNC)err_revoke }, /* revoke */ - { &vnop_mmap_desc, (VOPFUNC)err_mmap }, /* mmap */ - { &vnop_fsync_desc, (VOPFUNC)nop_fsync }, /* fsync */ - { &vnop_remove_desc, (VOPFUNC)devfs_vnop_remove }, /* remove */ - { &vnop_link_desc, (VOPFUNC)devfs_link }, /* link */ - { &vnop_rename_desc, (VOPFUNC)devfs_rename }, /* rename */ - { &vnop_mkdir_desc, (VOPFUNC)devfs_mkdir }, /* mkdir */ - { &vnop_rmdir_desc, (VOPFUNC)devfs_rmdir }, /* rmdir */ - { &vnop_symlink_desc, (VOPFUNC)devfs_symlink }, /* symlink */ - { &vnop_readdir_desc, (VOPFUNC)devfs_readdir }, /* readdir */ - { &vnop_readlink_desc, (VOPFUNC)devfs_readlink }, /* readlink */ - { &vnop_inactive_desc, (VOPFUNC)devfs_inactive }, /* inactive */ - { &vnop_reclaim_desc, (VOPFUNC)devfs_reclaim }, /* reclaim */ - { &vnop_strategy_desc, (VOPFUNC)err_strategy }, /* strategy */ - { &vnop_pathconf_desc, (VOPFUNC)devs_vnop_pathconf }, /* pathconf */ - { &vnop_advlock_desc, (VOPFUNC)err_advlock }, /* advlock */ - { &vnop_bwrite_desc, (VOPFUNC)err_bwrite }, - { &vnop_pagein_desc, (VOPFUNC)err_pagein }, /* Pagein */ - { &vnop_pageout_desc, (VOPFUNC)err_pageout }, /* Pageout */ - { &vnop_copyfile_desc, (VOPFUNC)err_copyfile }, /* Copyfile */ - { &vnop_blktooff_desc, (VOPFUNC)err_blktooff }, /* blktooff */ - { &vnop_offtoblk_desc, (VOPFUNC)err_offtoblk }, /* offtoblk */ - { &vnop_blockmap_desc, (VOPFUNC)err_blockmap }, /* blockmap */ +const static struct vnodeopv_entry_desc devfs_vnodeop_entries[] = { + { .opve_op = &vnop_default_desc, .opve_impl = (VOPFUNC)vn_default_error }, + { .opve_op = &vnop_lookup_desc, .opve_impl = (VOPFUNC)devfs_lookup }, /* lookup */ + { .opve_op = &vnop_create_desc, .opve_impl = (VOPFUNC)err_create }, /* create */ + { .opve_op = &vnop_whiteout_desc, .opve_impl = (VOPFUNC)err_whiteout }, /* whiteout */ + { .opve_op = &vnop_mknod_desc, .opve_impl = (VOPFUNC)devfs_mknod }, /* mknod */ + { .opve_op = &vnop_open_desc, .opve_impl = (VOPFUNC)nop_open }, /* open */ + { .opve_op = &vnop_close_desc, .opve_impl = (VOPFUNC)devfs_close }, /* close */ + { .opve_op = &vnop_getattr_desc, .opve_impl = (VOPFUNC)devfs_getattr }, /* getattr */ + { .opve_op = &vnop_setattr_desc, .opve_impl = (VOPFUNC)devfs_setattr }, /* setattr */ + { .opve_op = &vnop_read_desc, .opve_impl = (VOPFUNC)devfs_read }, /* read */ + { .opve_op = &vnop_write_desc, .opve_impl = (VOPFUNC)devfs_write }, /* write */ + { .opve_op = &vnop_ioctl_desc, .opve_impl = (VOPFUNC)err_ioctl }, /* ioctl */ + { .opve_op = &vnop_select_desc, .opve_impl = (VOPFUNC)err_select }, /* select */ + { .opve_op = &vnop_revoke_desc, .opve_impl = (VOPFUNC)err_revoke }, /* revoke */ + { .opve_op = &vnop_mmap_desc, .opve_impl = (VOPFUNC)err_mmap }, /* mmap */ + { .opve_op = &vnop_fsync_desc, .opve_impl = (VOPFUNC)nop_fsync }, /* fsync */ + { .opve_op = &vnop_remove_desc, .opve_impl = (VOPFUNC)devfs_vnop_remove }, /* remove */ + { .opve_op = &vnop_link_desc, .opve_impl = (VOPFUNC)devfs_link }, /* link */ + { .opve_op = &vnop_rename_desc, .opve_impl = (VOPFUNC)devfs_rename }, /* rename */ + { .opve_op = &vnop_mkdir_desc, .opve_impl = (VOPFUNC)devfs_mkdir }, /* mkdir */ + { .opve_op = &vnop_rmdir_desc, .opve_impl = (VOPFUNC)devfs_rmdir }, /* rmdir */ + { .opve_op = &vnop_symlink_desc, .opve_impl = (VOPFUNC)devfs_symlink }, /* symlink */ + { .opve_op = &vnop_readdir_desc, .opve_impl = (VOPFUNC)devfs_readdir }, /* readdir */ + { .opve_op = &vnop_readlink_desc, .opve_impl = (VOPFUNC)devfs_readlink }, /* readlink */ + { .opve_op = &vnop_inactive_desc, .opve_impl = (VOPFUNC)devfs_inactive }, /* inactive */ + { .opve_op = &vnop_reclaim_desc, .opve_impl = (VOPFUNC)devfs_reclaim }, /* reclaim */ + { .opve_op = &vnop_strategy_desc, .opve_impl = (VOPFUNC)err_strategy }, /* strategy */ + { .opve_op = &vnop_pathconf_desc, .opve_impl = (VOPFUNC)devs_vnop_pathconf }, /* pathconf */ + { .opve_op = &vnop_advlock_desc, .opve_impl = (VOPFUNC)err_advlock }, /* advlock */ + { .opve_op = &vnop_bwrite_desc, .opve_impl = (VOPFUNC)err_bwrite }, + { .opve_op = &vnop_pagein_desc, .opve_impl = (VOPFUNC)err_pagein }, /* Pagein */ + { .opve_op = &vnop_pageout_desc, .opve_impl = (VOPFUNC)err_pageout }, /* Pageout */ + { .opve_op = &vnop_copyfile_desc, .opve_impl = (VOPFUNC)err_copyfile }, /* Copyfile */ + { .opve_op = &vnop_blktooff_desc, .opve_impl = (VOPFUNC)err_blktooff }, /* blktooff */ + { .opve_op = &vnop_offtoblk_desc, .opve_impl = (VOPFUNC)err_offtoblk }, /* offtoblk */ + { .opve_op = &vnop_blockmap_desc, .opve_impl = (VOPFUNC)err_blockmap }, /* blockmap */ #if CONFIG_MACF - { &vnop_setlabel_desc, (VOPFUNC)devfs_setlabel }, /* setlabel */ + { .opve_op = &vnop_setlabel_desc, .opve_impl = (VOPFUNC)devfs_setlabel }, /* setlabel */ #endif - { (struct vnodeop_desc*)NULL, (int (*)(void *))NULL } + { .opve_op = (struct vnodeop_desc*)NULL, .opve_impl = (int (*)(void *))NULL } }; -struct vnodeopv_desc devfs_vnodeop_opv_desc = -{ &devfs_vnodeop_p, devfs_vnodeop_entries }; +const struct vnodeopv_desc devfs_vnodeop_opv_desc = +{ .opv_desc_vector_p = &devfs_vnodeop_p, .opv_desc_ops = devfs_vnodeop_entries }; /* The following ops are used by the device nodes */ int(**devfs_spec_vnodeop_p)(void *); -static struct vnodeopv_entry_desc devfs_spec_vnodeop_entries[] = { - { &vnop_default_desc, (VOPFUNC)vn_default_error }, - { &vnop_lookup_desc, (VOPFUNC)spec_lookup }, /* lookup */ - { &vnop_create_desc, (VOPFUNC)spec_create }, /* create */ - { &vnop_mknod_desc, (VOPFUNC)spec_mknod }, /* mknod */ - { &vnop_open_desc, (VOPFUNC)spec_open }, /* open */ - { &vnop_close_desc, (VOPFUNC)devfsspec_close }, /* close */ - { &vnop_getattr_desc, (VOPFUNC)devfs_getattr }, /* getattr */ - { &vnop_setattr_desc, (VOPFUNC)devfs_setattr }, /* setattr */ - { &vnop_read_desc, (VOPFUNC)devfsspec_read }, /* read */ - { &vnop_write_desc, (VOPFUNC)devfsspec_write }, /* write */ - { &vnop_ioctl_desc, (VOPFUNC)spec_ioctl }, /* ioctl */ - { &vnop_select_desc, (VOPFUNC)spec_select }, /* select */ - { &vnop_revoke_desc, (VOPFUNC)spec_revoke }, /* revoke */ - { &vnop_mmap_desc, (VOPFUNC)spec_mmap }, /* mmap */ - { &vnop_fsync_desc, (VOPFUNC)spec_fsync }, /* fsync */ - { &vnop_remove_desc, (VOPFUNC)devfs_vnop_remove }, /* remove */ - { &vnop_link_desc, (VOPFUNC)devfs_link }, /* link */ - { &vnop_rename_desc, (VOPFUNC)spec_rename }, /* rename */ - { &vnop_mkdir_desc, (VOPFUNC)spec_mkdir }, /* mkdir */ - { &vnop_rmdir_desc, (VOPFUNC)spec_rmdir }, /* rmdir */ - { &vnop_symlink_desc, (VOPFUNC)spec_symlink }, /* symlink */ - { &vnop_readdir_desc, (VOPFUNC)spec_readdir }, /* readdir */ - { &vnop_readlink_desc, (VOPFUNC)spec_readlink }, /* readlink */ - { &vnop_inactive_desc, (VOPFUNC)devfs_inactive }, /* inactive */ - { &vnop_reclaim_desc, (VOPFUNC)devfs_reclaim }, /* reclaim */ - { &vnop_strategy_desc, (VOPFUNC)spec_strategy }, /* strategy */ - { &vnop_pathconf_desc, (VOPFUNC)spec_pathconf }, /* pathconf */ - { &vnop_advlock_desc, (VOPFUNC)spec_advlock }, /* advlock */ - { &vnop_bwrite_desc, (VOPFUNC)vn_bwrite }, - { &vnop_pagein_desc, (VOPFUNC)err_pagein }, /* Pagein */ - { &vnop_pageout_desc, (VOPFUNC)err_pageout }, /* Pageout */ - { &vnop_copyfile_desc, (VOPFUNC)err_copyfile }, /* Copyfile */ - { &vnop_blktooff_desc, (VOPFUNC)spec_blktooff }, /* blktooff */ - { &vnop_blktooff_desc, (VOPFUNC)spec_offtoblk }, /* blkofftoblk */ - { &vnop_blockmap_desc, (VOPFUNC)spec_blockmap }, /* blockmap */ +const static struct vnodeopv_entry_desc devfs_spec_vnodeop_entries[] = { + { .opve_op = &vnop_default_desc, .opve_impl = (VOPFUNC)vn_default_error }, + { .opve_op = &vnop_lookup_desc, .opve_impl = (VOPFUNC)spec_lookup }, /* lookup */ + { .opve_op = &vnop_create_desc, .opve_impl = (VOPFUNC)spec_create }, /* create */ + { .opve_op = &vnop_mknod_desc, .opve_impl = (VOPFUNC)spec_mknod }, /* mknod */ + { .opve_op = &vnop_open_desc, .opve_impl = (VOPFUNC)spec_open }, /* open */ + { .opve_op = &vnop_close_desc, .opve_impl = (VOPFUNC)devfsspec_close }, /* close */ + { .opve_op = &vnop_getattr_desc, .opve_impl = (VOPFUNC)devfs_getattr }, /* getattr */ + { .opve_op = &vnop_setattr_desc, .opve_impl = (VOPFUNC)devfs_setattr }, /* setattr */ + { .opve_op = &vnop_read_desc, .opve_impl = (VOPFUNC)devfsspec_read }, /* read */ + { .opve_op = &vnop_write_desc, .opve_impl = (VOPFUNC)devfsspec_write }, /* write */ + { .opve_op = &vnop_ioctl_desc, .opve_impl = (VOPFUNC)spec_ioctl }, /* ioctl */ + { .opve_op = &vnop_select_desc, .opve_impl = (VOPFUNC)spec_select }, /* select */ + { .opve_op = &vnop_revoke_desc, .opve_impl = (VOPFUNC)spec_revoke }, /* revoke */ + { .opve_op = &vnop_mmap_desc, .opve_impl = (VOPFUNC)spec_mmap }, /* mmap */ + { .opve_op = &vnop_fsync_desc, .opve_impl = (VOPFUNC)spec_fsync }, /* fsync */ + { .opve_op = &vnop_remove_desc, .opve_impl = (VOPFUNC)devfs_vnop_remove }, /* remove */ + { .opve_op = &vnop_link_desc, .opve_impl = (VOPFUNC)devfs_link }, /* link */ + { .opve_op = &vnop_rename_desc, .opve_impl = (VOPFUNC)spec_rename }, /* rename */ + { .opve_op = &vnop_mkdir_desc, .opve_impl = (VOPFUNC)spec_mkdir }, /* mkdir */ + { .opve_op = &vnop_rmdir_desc, .opve_impl = (VOPFUNC)spec_rmdir }, /* rmdir */ + { .opve_op = &vnop_symlink_desc, .opve_impl = (VOPFUNC)spec_symlink }, /* symlink */ + { .opve_op = &vnop_readdir_desc, .opve_impl = (VOPFUNC)spec_readdir }, /* readdir */ + { .opve_op = &vnop_readlink_desc, .opve_impl = (VOPFUNC)spec_readlink }, /* readlink */ + { .opve_op = &vnop_inactive_desc, .opve_impl = (VOPFUNC)devfs_inactive }, /* inactive */ + { .opve_op = &vnop_reclaim_desc, .opve_impl = (VOPFUNC)devfs_reclaim }, /* reclaim */ + { .opve_op = &vnop_strategy_desc, .opve_impl = (VOPFUNC)spec_strategy }, /* strategy */ + { .opve_op = &vnop_pathconf_desc, .opve_impl = (VOPFUNC)spec_pathconf }, /* pathconf */ + { .opve_op = &vnop_advlock_desc, .opve_impl = (VOPFUNC)spec_advlock }, /* advlock */ + { .opve_op = &vnop_bwrite_desc, .opve_impl = (VOPFUNC)vn_bwrite }, + { .opve_op = &vnop_pagein_desc, .opve_impl = (VOPFUNC)err_pagein }, /* Pagein */ + { .opve_op = &vnop_pageout_desc, .opve_impl = (VOPFUNC)err_pageout }, /* Pageout */ + { .opve_op = &vnop_copyfile_desc, .opve_impl = (VOPFUNC)err_copyfile }, /* Copyfile */ + { .opve_op = &vnop_blktooff_desc, .opve_impl = (VOPFUNC)spec_blktooff }, /* blktooff */ + { .opve_op = &vnop_blktooff_desc, .opve_impl = (VOPFUNC)spec_offtoblk }, /* blkofftoblk */ + { .opve_op = &vnop_blockmap_desc, .opve_impl = (VOPFUNC)spec_blockmap }, /* blockmap */ #if CONFIG_MACF - { &vnop_setlabel_desc, (VOPFUNC)devfs_setlabel }, /* setlabel */ + { .opve_op = &vnop_setlabel_desc, .opve_impl = (VOPFUNC)devfs_setlabel }, /* setlabel */ #endif - { (struct vnodeop_desc*)NULL, (int (*)(void *))NULL } + { .opve_op = (struct vnodeop_desc*)NULL, .opve_impl = (int (*)(void *))NULL } }; -struct vnodeopv_desc devfs_spec_vnodeop_opv_desc = -{ &devfs_spec_vnodeop_p, devfs_spec_vnodeop_entries }; +const struct vnodeopv_desc devfs_spec_vnodeop_opv_desc = +{ .opv_desc_vector_p = &devfs_spec_vnodeop_p, .opv_desc_ops = devfs_spec_vnodeop_entries }; #if FDESC int(**devfs_devfd_vnodeop_p)(void*); -static struct vnodeopv_entry_desc devfs_devfd_vnodeop_entries[] = { - { &vnop_default_desc, (VOPFUNC)vn_default_error }, - { &vnop_lookup_desc, (VOPFUNC)devfs_devfd_lookup}, /* lookup */ - { &vnop_open_desc, (VOPFUNC)nop_open }, /* open */ - { &vnop_close_desc, (VOPFUNC)devfs_close }, /* close */ - { &vnop_getattr_desc, (VOPFUNC)devfs_getattr }, /* getattr */ - { &vnop_setattr_desc, (VOPFUNC)devfs_setattr }, /* setattr */ - { &vnop_revoke_desc, (VOPFUNC)err_revoke }, /* revoke */ - { &vnop_fsync_desc, (VOPFUNC)nop_fsync }, /* fsync */ - { &vnop_readdir_desc, (VOPFUNC)devfs_devfd_readdir}, /* readdir */ - { &vnop_inactive_desc, (VOPFUNC)devfs_inactive }, /* inactive */ - { &vnop_reclaim_desc, (VOPFUNC)devfs_reclaim }, /* reclaim */ - { &vnop_pathconf_desc, (VOPFUNC)devs_vnop_pathconf }, /* pathconf */ +const static struct vnodeopv_entry_desc devfs_devfd_vnodeop_entries[] = { + { .opve_op = &vnop_default_desc, .opve_impl = (VOPFUNC)vn_default_error }, + { .opve_op = &vnop_lookup_desc, .opve_impl = (VOPFUNC)devfs_devfd_lookup}, /* lookup */ + { .opve_op = &vnop_open_desc, .opve_impl = (VOPFUNC)nop_open }, /* open */ + { .opve_op = &vnop_close_desc, .opve_impl = (VOPFUNC)devfs_close }, /* close */ + { .opve_op = &vnop_getattr_desc, .opve_impl = (VOPFUNC)devfs_getattr }, /* getattr */ + { .opve_op = &vnop_setattr_desc, .opve_impl = (VOPFUNC)devfs_setattr }, /* setattr */ + { .opve_op = &vnop_revoke_desc, .opve_impl = (VOPFUNC)err_revoke }, /* revoke */ + { .opve_op = &vnop_fsync_desc, .opve_impl = (VOPFUNC)nop_fsync }, /* fsync */ + { .opve_op = &vnop_readdir_desc, .opve_impl = (VOPFUNC)devfs_devfd_readdir}, /* readdir */ + { .opve_op = &vnop_inactive_desc, .opve_impl = (VOPFUNC)devfs_inactive }, /* inactive */ + { .opve_op = &vnop_reclaim_desc, .opve_impl = (VOPFUNC)devfs_reclaim }, /* reclaim */ + { .opve_op = &vnop_pathconf_desc, .opve_impl = (VOPFUNC)devs_vnop_pathconf }, /* pathconf */ #if CONFIG_MACF - { &vnop_setlabel_desc, (VOPFUNC)devfs_setlabel }, /* setlabel */ + { .opve_op = &vnop_setlabel_desc, .opve_impl = (VOPFUNC)devfs_setlabel }, /* setlabel */ #endif - { (struct vnodeop_desc*)NULL, (int (*)(void *))NULL } + { .opve_op = (struct vnodeop_desc*)NULL, .opve_impl = (int (*)(void *))NULL } }; -struct vnodeopv_desc devfs_devfd_vnodeop_opv_desc = -{ &devfs_devfd_vnodeop_p, devfs_devfd_vnodeop_entries}; +const struct vnodeopv_desc devfs_devfd_vnodeop_opv_desc = +{ .opv_desc_vector_p = &devfs_devfd_vnodeop_p, .opv_desc_ops = devfs_devfd_vnodeop_entries}; #endif /* FDESC */ diff --git a/bsd/miscfs/devfs/devfsdefs.h b/bsd/miscfs/devfs/devfsdefs.h index 502e4daa5..e2dee3842 100644 --- a/bsd/miscfs/devfs/devfsdefs.h +++ b/bsd/miscfs/devfs/devfsdefs.h @@ -89,7 +89,7 @@ typedef enum { extern int(**devfs_vnodeop_p)(void *); /* our own vector array for dirs */ extern int(**devfs_spec_vnodeop_p)(void *); /* our own vector array for devs */ -extern struct vfsops devfs_vfsops; +extern const struct vfsops devfs_vfsops; typedef struct devnode devnode_t; typedef struct devdirent devdirent_t; @@ -132,7 +132,7 @@ struct devnode { * make sure that a deferred delete eventually happens if it is * blocked behind that reference. */ - int dn_refcount; + os_ref_atomic_t dn_refcount; u_short dn_mode; uid_t dn_uid; gid_t dn_gid; diff --git a/bsd/miscfs/fifofs/fifo_vnops.c b/bsd/miscfs/fifofs/fifo_vnops.c index 956824de4..df13d0d96 100644 --- a/bsd/miscfs/fifofs/fifo_vnops.c +++ b/bsd/miscfs/fifofs/fifo_vnops.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2006 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2019 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -80,47 +80,47 @@ #define VOPFUNC int (*)(void *) int(**fifo_vnodeop_p)(void *); -struct vnodeopv_entry_desc fifo_vnodeop_entries[] = { - { &vnop_default_desc, (VOPFUNC)vn_default_error }, - { &vnop_lookup_desc, (VOPFUNC)fifo_lookup }, /* lookup */ - { &vnop_create_desc, (VOPFUNC)err_create }, /* create */ - { &vnop_mknod_desc, (VOPFUNC)err_mknod }, /* mknod */ - { &vnop_open_desc, (VOPFUNC)fifo_open }, /* open */ - { &vnop_close_desc, (VOPFUNC)fifo_close }, /* close */ - { &vnop_access_desc, (VOPFUNC)fifo_access }, /* access */ - { &vnop_getattr_desc, (VOPFUNC)fifo_getattr }, /* getattr */ - { &vnop_setattr_desc, (VOPFUNC)fifo_setattr }, /* setattr */ - { &vnop_read_desc, (VOPFUNC)fifo_read }, /* read */ - { &vnop_write_desc, (VOPFUNC)fifo_write }, /* write */ - { &vnop_ioctl_desc, (VOPFUNC)fifo_ioctl }, /* ioctl */ - { &vnop_select_desc, (VOPFUNC)fifo_select }, /* select */ - { &vnop_revoke_desc, (VOPFUNC)fifo_revoke }, /* revoke */ - { &vnop_mmap_desc, (VOPFUNC)err_mmap }, /* mmap */ - { &vnop_fsync_desc, (VOPFUNC)fifo_fsync }, /* fsync */ - { &vnop_remove_desc, (VOPFUNC)err_remove }, /* remove */ - { &vnop_link_desc, (VOPFUNC)err_link }, /* link */ - { &vnop_rename_desc, (VOPFUNC)err_rename }, /* rename */ - { &vnop_mkdir_desc, (VOPFUNC)err_mkdir }, /* mkdir */ - { &vnop_rmdir_desc, (VOPFUNC)err_rmdir }, /* rmdir */ - { &vnop_symlink_desc, (VOPFUNC)err_symlink }, /* symlink */ - { &vnop_readdir_desc, (VOPFUNC)err_readdir }, /* readdir */ - { &vnop_readlink_desc, (VOPFUNC)err_readlink }, /* readlink */ - { &vnop_inactive_desc, (VOPFUNC)fifo_inactive }, /* inactive */ - { &vnop_reclaim_desc, (VOPFUNC)fifo_reclaim }, /* reclaim */ - { &vnop_strategy_desc, (VOPFUNC)err_strategy }, /* strategy */ - { &vnop_pathconf_desc, (VOPFUNC)fifo_pathconf }, /* pathconf */ - { &vnop_advlock_desc, (VOPFUNC)fifo_advlock }, /* advlock */ - { &vnop_bwrite_desc, (VOPFUNC)fifo_bwrite }, /* bwrite */ - { &vnop_pagein_desc, (VOPFUNC)err_pagein }, /* Pagein */ - { &vnop_pageout_desc, (VOPFUNC)err_pageout }, /* Pageout */ - { &vnop_copyfile_desc, (VOPFUNC)err_copyfile }, /* Copyfile */ - { &vnop_blktooff_desc, (VOPFUNC)err_blktooff }, /* blktooff */ - { &vnop_offtoblk_desc, (VOPFUNC)err_offtoblk }, /* offtoblk */ - { &vnop_blockmap_desc, (VOPFUNC)err_blockmap }, /* blockmap */ - { (struct vnodeop_desc*)NULL, (int (*)(void *))NULL } +const struct vnodeopv_entry_desc fifo_vnodeop_entries[] = { + { .opve_op = &vnop_default_desc, .opve_impl = (VOPFUNC)vn_default_error }, + { .opve_op = &vnop_lookup_desc, .opve_impl = (VOPFUNC)fifo_lookup }, /* lookup */ + { .opve_op = &vnop_create_desc, .opve_impl = (VOPFUNC)err_create }, /* create */ + { .opve_op = &vnop_mknod_desc, .opve_impl = (VOPFUNC)err_mknod }, /* mknod */ + { .opve_op = &vnop_open_desc, .opve_impl = (VOPFUNC)fifo_open }, /* open */ + { .opve_op = &vnop_close_desc, .opve_impl = (VOPFUNC)fifo_close }, /* close */ + { .opve_op = &vnop_access_desc, .opve_impl = (VOPFUNC)fifo_access }, /* access */ + { .opve_op = &vnop_getattr_desc, .opve_impl = (VOPFUNC)fifo_getattr }, /* getattr */ + { .opve_op = &vnop_setattr_desc, .opve_impl = (VOPFUNC)fifo_setattr }, /* setattr */ + { .opve_op = &vnop_read_desc, .opve_impl = (VOPFUNC)fifo_read }, /* read */ + { .opve_op = &vnop_write_desc, .opve_impl = (VOPFUNC)fifo_write }, /* write */ + { .opve_op = &vnop_ioctl_desc, .opve_impl = (VOPFUNC)fifo_ioctl }, /* ioctl */ + { .opve_op = &vnop_select_desc, .opve_impl = (VOPFUNC)fifo_select }, /* select */ + { .opve_op = &vnop_revoke_desc, .opve_impl = (VOPFUNC)fifo_revoke }, /* revoke */ + { .opve_op = &vnop_mmap_desc, .opve_impl = (VOPFUNC)err_mmap }, /* mmap */ + { .opve_op = &vnop_fsync_desc, .opve_impl = (VOPFUNC)fifo_fsync }, /* fsync */ + { .opve_op = &vnop_remove_desc, .opve_impl = (VOPFUNC)err_remove }, /* remove */ + { .opve_op = &vnop_link_desc, .opve_impl = (VOPFUNC)err_link }, /* link */ + { .opve_op = &vnop_rename_desc, .opve_impl = (VOPFUNC)err_rename }, /* rename */ + { .opve_op = &vnop_mkdir_desc, .opve_impl = (VOPFUNC)err_mkdir }, /* mkdir */ + { .opve_op = &vnop_rmdir_desc, .opve_impl = (VOPFUNC)err_rmdir }, /* rmdir */ + { .opve_op = &vnop_symlink_desc, .opve_impl = (VOPFUNC)err_symlink }, /* symlink */ + { .opve_op = &vnop_readdir_desc, .opve_impl = (VOPFUNC)err_readdir }, /* readdir */ + { .opve_op = &vnop_readlink_desc, .opve_impl = (VOPFUNC)err_readlink }, /* readlink */ + { .opve_op = &vnop_inactive_desc, .opve_impl = (VOPFUNC)fifo_inactive }, /* inactive */ + { .opve_op = &vnop_reclaim_desc, .opve_impl = (VOPFUNC)fifo_reclaim }, /* reclaim */ + { .opve_op = &vnop_strategy_desc, .opve_impl = (VOPFUNC)err_strategy }, /* strategy */ + { .opve_op = &vnop_pathconf_desc, .opve_impl = (VOPFUNC)fifo_pathconf }, /* pathconf */ + { .opve_op = &vnop_advlock_desc, .opve_impl = (VOPFUNC)fifo_advlock }, /* advlock */ + { .opve_op = &vnop_bwrite_desc, .opve_impl = (VOPFUNC)fifo_bwrite }, /* bwrite */ + { .opve_op = &vnop_pagein_desc, .opve_impl = (VOPFUNC)err_pagein }, /* Pagein */ + { .opve_op = &vnop_pageout_desc, .opve_impl = (VOPFUNC)err_pageout }, /* Pageout */ + { .opve_op = &vnop_copyfile_desc, .opve_impl = (VOPFUNC)err_copyfile }, /* Copyfile */ + { .opve_op = &vnop_blktooff_desc, .opve_impl = (VOPFUNC)err_blktooff }, /* blktooff */ + { .opve_op = &vnop_offtoblk_desc, .opve_impl = (VOPFUNC)err_offtoblk }, /* offtoblk */ + { .opve_op = &vnop_blockmap_desc, .opve_impl = (VOPFUNC)err_blockmap }, /* blockmap */ + { .opve_op = (struct vnodeop_desc*)NULL, .opve_impl = (int (*)(void *))NULL } }; -struct vnodeopv_desc fifo_vnodeop_opv_desc = -{ &fifo_vnodeop_p, fifo_vnodeop_entries }; +const struct vnodeopv_desc fifo_vnodeop_opv_desc = +{ .opv_desc_vector_p = &fifo_vnodeop_p, .opv_desc_ops = fifo_vnodeop_entries }; /* * Trivial lookup routine that always fails. diff --git a/bsd/miscfs/mockfs/mockfs_vnops.c b/bsd/miscfs/mockfs/mockfs_vnops.c index 406eddfc2..5dbbbc648 100644 --- a/bsd/miscfs/mockfs/mockfs_vnops.c +++ b/bsd/miscfs/mockfs/mockfs_vnops.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2012 Apple Inc. All rights reserved. + * Copyright (c) 2019 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -407,46 +407,46 @@ mockfs_blockmap(struct vnop_blockmap_args * ap) } int(**mockfs_vnodeop_p)(void *); -struct vnodeopv_entry_desc mockfs_vnodeop_entries[] = { - { &vnop_default_desc, (VOPFUNC) vn_default_error }, /* default */ - { &vnop_lookup_desc, (VOPFUNC) mockfs_lookup }, /* lookup */ - { &vnop_create_desc, (VOPFUNC) err_create },/* create */ - { &vnop_open_desc, (VOPFUNC) err_open }, /* open */ - { &vnop_mknod_desc, (VOPFUNC) err_mknod }, /* mknod */ - { &vnop_close_desc, (VOPFUNC) err_close }, /* close */ - { &vnop_access_desc, (VOPFUNC) err_access }, /* access */ - { &vnop_getattr_desc, (VOPFUNC) mockfs_getattr }, /* getattr */ - { &vnop_setattr_desc, (VOPFUNC) err_setattr }, /* setattr */ - { &vnop_read_desc, (VOPFUNC) mockfs_read }, /* read */ - { &vnop_write_desc, (VOPFUNC) err_write }, /* write */ - { &vnop_ioctl_desc, (VOPFUNC) err_ioctl }, /* ioctl */ - { &vnop_select_desc, (VOPFUNC) err_select }, /* select */ - { &vnop_mmap_desc, (VOPFUNC) err_mmap }, /* mmap */ - { &vnop_fsync_desc, (VOPFUNC) nop_fsync }, /* fsync */ - { &vnop_remove_desc, (VOPFUNC) err_remove }, /* remove */ - { &vnop_link_desc, (VOPFUNC) err_link }, /* link */ - { &vnop_rename_desc, (VOPFUNC) err_rename }, /* rename */ - { &vnop_mkdir_desc, (VOPFUNC) err_mkdir }, /* mkdir */ - { &vnop_rmdir_desc, (VOPFUNC) err_rmdir }, /* rmdir */ - { &vnop_symlink_desc, (VOPFUNC) err_symlink }, /* symlink */ - { &vnop_readdir_desc, (VOPFUNC) err_readdir }, /* readdir */ - { &vnop_readlink_desc, (VOPFUNC) err_readlink }, /* readlink */ - { &vnop_inactive_desc, (VOPFUNC) err_inactive }, /* inactive */ - { &vnop_reclaim_desc, (VOPFUNC) mockfs_reclaim }, /* reclaim */ - { &vnop_strategy_desc, (VOPFUNC) mockfs_strategy }, /* strategy */ - { &vnop_pathconf_desc, (VOPFUNC) err_pathconf }, /* pathconf */ - { &vnop_advlock_desc, (VOPFUNC) err_advlock }, /* advlock */ - { &vnop_bwrite_desc, (VOPFUNC) err_bwrite }, /* bwrite */ - { &vnop_pagein_desc, (VOPFUNC) mockfs_pagein }, /* pagein */ - { &vnop_pageout_desc, (VOPFUNC) err_pageout }, /* pageout */ - { &vnop_copyfile_desc, (VOPFUNC) err_copyfile }, /* copyfile */ - { &vnop_blktooff_desc, (VOPFUNC) err_blktooff }, /* blktooff */ - { &vnop_offtoblk_desc, (VOPFUNC) err_offtoblk }, /* offtoblk */ - { &vnop_blockmap_desc, (VOPFUNC) mockfs_blockmap }, /* blockmap */ - { (struct vnodeop_desc *) NULL, (VOPFUNC) NULL } +const struct vnodeopv_entry_desc mockfs_vnodeop_entries[] = { + { .opve_op = &vnop_default_desc, .opve_impl = (VOPFUNC) vn_default_error }, /* default */ + { .opve_op = &vnop_lookup_desc, .opve_impl = (VOPFUNC) mockfs_lookup }, /* lookup */ + { .opve_op = &vnop_create_desc, .opve_impl = (VOPFUNC) err_create },/* create */ + { .opve_op = &vnop_open_desc, .opve_impl = (VOPFUNC) err_open }, /* open */ + { .opve_op = &vnop_mknod_desc, .opve_impl = (VOPFUNC) err_mknod }, /* mknod */ + { .opve_op = &vnop_close_desc, .opve_impl = (VOPFUNC) err_close }, /* close */ + { .opve_op = &vnop_access_desc, .opve_impl = (VOPFUNC) err_access }, /* access */ + { .opve_op = &vnop_getattr_desc, .opve_impl = (VOPFUNC) mockfs_getattr }, /* getattr */ + { .opve_op = &vnop_setattr_desc, .opve_impl = (VOPFUNC) err_setattr }, /* setattr */ + { .opve_op = &vnop_read_desc, .opve_impl = (VOPFUNC) mockfs_read }, /* read */ + { .opve_op = &vnop_write_desc, .opve_impl = (VOPFUNC) err_write }, /* write */ + { .opve_op = &vnop_ioctl_desc, .opve_impl = (VOPFUNC) err_ioctl }, /* ioctl */ + { .opve_op = &vnop_select_desc, .opve_impl = (VOPFUNC) err_select }, /* select */ + { .opve_op = &vnop_mmap_desc, .opve_impl = (VOPFUNC) err_mmap }, /* mmap */ + { .opve_op = &vnop_fsync_desc, .opve_impl = (VOPFUNC) nop_fsync }, /* fsync */ + { .opve_op = &vnop_remove_desc, .opve_impl = (VOPFUNC) err_remove }, /* remove */ + { .opve_op = &vnop_link_desc, .opve_impl = (VOPFUNC) err_link }, /* link */ + { .opve_op = &vnop_rename_desc, .opve_impl = (VOPFUNC) err_rename }, /* rename */ + { .opve_op = &vnop_mkdir_desc, .opve_impl = (VOPFUNC) err_mkdir }, /* mkdir */ + { .opve_op = &vnop_rmdir_desc, .opve_impl = (VOPFUNC) err_rmdir }, /* rmdir */ + { .opve_op = &vnop_symlink_desc, .opve_impl = (VOPFUNC) err_symlink }, /* symlink */ + { .opve_op = &vnop_readdir_desc, .opve_impl = (VOPFUNC) err_readdir }, /* readdir */ + { .opve_op = &vnop_readlink_desc, .opve_impl = (VOPFUNC) err_readlink }, /* readlink */ + { .opve_op = &vnop_inactive_desc, .opve_impl = (VOPFUNC) err_inactive }, /* inactive */ + { .opve_op = &vnop_reclaim_desc, .opve_impl = (VOPFUNC) mockfs_reclaim }, /* reclaim */ + { .opve_op = &vnop_strategy_desc, .opve_impl = (VOPFUNC) mockfs_strategy }, /* strategy */ + { .opve_op = &vnop_pathconf_desc, .opve_impl = (VOPFUNC) err_pathconf }, /* pathconf */ + { .opve_op = &vnop_advlock_desc, .opve_impl = (VOPFUNC) err_advlock }, /* advlock */ + { .opve_op = &vnop_bwrite_desc, .opve_impl = (VOPFUNC) err_bwrite }, /* bwrite */ + { .opve_op = &vnop_pagein_desc, .opve_impl = (VOPFUNC) mockfs_pagein }, /* pagein */ + { .opve_op = &vnop_pageout_desc, .opve_impl = (VOPFUNC) err_pageout }, /* pageout */ + { .opve_op = &vnop_copyfile_desc, .opve_impl = (VOPFUNC) err_copyfile }, /* copyfile */ + { .opve_op = &vnop_blktooff_desc, .opve_impl = (VOPFUNC) err_blktooff }, /* blktooff */ + { .opve_op = &vnop_offtoblk_desc, .opve_impl = (VOPFUNC) err_offtoblk }, /* offtoblk */ + { .opve_op = &vnop_blockmap_desc, .opve_impl = (VOPFUNC) mockfs_blockmap }, /* blockmap */ + { .opve_op = (struct vnodeop_desc *) NULL, .opve_impl = (VOPFUNC) NULL } }; -struct vnodeopv_desc mockfs_vnodeop_opv_desc = { - &mockfs_vnodeop_p, - mockfs_vnodeop_entries +const struct vnodeopv_desc mockfs_vnodeop_opv_desc = { + .opv_desc_vector_p = &mockfs_vnodeop_p, + .opv_desc_ops = mockfs_vnodeop_entries }; diff --git a/bsd/miscfs/nullfs/null_vfsops.c b/bsd/miscfs/nullfs/null_vfsops.c index 0e8330da7..8305be1c6 100644 --- a/bsd/miscfs/nullfs/null_vfsops.c +++ b/bsd/miscfs/nullfs/null_vfsops.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016 Apple Inc. All rights reserved. + * Copyright (c) 2019 Apple Inc. All rights reserved. * * @APPLE_LICENSE_HEADER_START@ * @@ -408,7 +408,7 @@ nullfs_vfs_getattr(struct mount * mp, struct vfs_attr * vfap, vfs_context_t ctx) vol_capabilities_attr_t capabilities; struct vfsstatfs * sp = vfs_statfs(mp); - struct timespec tzero = {0, 0}; + struct timespec tzero = {.tv_sec = 0, .tv_nsec = 0}; NULLFSDEBUG("%s\n", __FUNCTION__); @@ -549,9 +549,9 @@ nullfs_vfs_start(__unused struct mount * mp, __unused int flags, __unused vfs_co return 0; } -extern struct vnodeopv_desc nullfs_vnodeop_opv_desc; +extern const struct vnodeopv_desc nullfs_vnodeop_opv_desc; -struct vnodeopv_desc * nullfs_vnodeopv_descs[] = { +const struct vnodeopv_desc * nullfs_vnodeopv_descs[] = { &nullfs_vnodeop_opv_desc, }; diff --git a/bsd/miscfs/nullfs/null_vnops.c b/bsd/miscfs/nullfs/null_vnops.c index ebfe2e7e1..6afadbfab 100644 --- a/bsd/miscfs/nullfs/null_vnops.c +++ b/bsd/miscfs/nullfs/null_vnops.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016 Apple Inc. All rights reserved. + * Copyright (c) 2019 Apple Inc. All rights reserved. * * @APPLE_LICENSE_HEADER_START@ * @@ -832,7 +832,7 @@ nullfs_getxattr(struct vnop_getxattr_args * args) NULLFSDEBUG("%s %p\n", __FUNCTION__, args->a_vp); if (nullfs_checkspecialvp(args->a_vp)) { - return 0; /* nothing extra needed */ + return ENOATTR; /* no xattrs on the special vnodes */ } vp = args->a_vp; @@ -855,7 +855,7 @@ nullfs_listxattr(struct vnop_listxattr_args * args) NULLFSDEBUG("%s %p\n", __FUNCTION__, args->a_vp); if (nullfs_checkspecialvp(args->a_vp)) { - return 0; /* nothing extra needed */ + return 0; /* no xattrs on the special vnodes */ } vp = args->a_vp; @@ -1017,19 +1017,19 @@ end: * Global vfs data structures */ -static struct vnodeopv_entry_desc nullfs_vnodeop_entries[] = { - {&vnop_default_desc, (vop_t)nullfs_default}, {&vnop_getattr_desc, (vop_t)nullfs_getattr}, - {&vnop_open_desc, (vop_t)nullfs_open}, {&vnop_close_desc, (vop_t)nullfs_close}, - {&vnop_inactive_desc, (vop_t)null_inactive}, {&vnop_reclaim_desc, (vop_t)null_reclaim}, - {&vnop_lookup_desc, (vop_t)null_lookup}, {&vnop_readdir_desc, (vop_t)nullfs_readdir}, - {&vnop_readlink_desc, (vop_t)nullfs_readlink}, {&vnop_pathconf_desc, (vop_t)nullfs_pathconf}, - {&vnop_fsync_desc, (vop_t)nullfs_fsync}, {&vnop_mmap_desc, (vop_t)nullfs_mmap}, - {&vnop_mnomap_desc, (vop_t)nullfs_mnomap}, {&vnop_getxattr_desc, (vop_t)nullfs_getxattr}, - {&vnop_pagein_desc, (vop_t)nullfs_pagein}, {&vnop_read_desc, (vop_t)nullfs_read}, - {&vnop_listxattr_desc, (vop_t)nullfs_listxattr}, {NULL, NULL}, +static const struct vnodeopv_entry_desc nullfs_vnodeop_entries[] = { + {.opve_op = &vnop_default_desc, .opve_impl = (vop_t)nullfs_default}, {.opve_op = &vnop_getattr_desc, .opve_impl = (vop_t)nullfs_getattr}, + {.opve_op = &vnop_open_desc, .opve_impl = (vop_t)nullfs_open}, {.opve_op = &vnop_close_desc, .opve_impl = (vop_t)nullfs_close}, + {.opve_op = &vnop_inactive_desc, .opve_impl = (vop_t)null_inactive}, {.opve_op = &vnop_reclaim_desc, .opve_impl = (vop_t)null_reclaim}, + {.opve_op = &vnop_lookup_desc, .opve_impl = (vop_t)null_lookup}, {.opve_op = &vnop_readdir_desc, .opve_impl = (vop_t)nullfs_readdir}, + {.opve_op = &vnop_readlink_desc, .opve_impl = (vop_t)nullfs_readlink}, {.opve_op = &vnop_pathconf_desc, .opve_impl = (vop_t)nullfs_pathconf}, + {.opve_op = &vnop_fsync_desc, .opve_impl = (vop_t)nullfs_fsync}, {.opve_op = &vnop_mmap_desc, .opve_impl = (vop_t)nullfs_mmap}, + {.opve_op = &vnop_mnomap_desc, .opve_impl = (vop_t)nullfs_mnomap}, {.opve_op = &vnop_getxattr_desc, .opve_impl = (vop_t)nullfs_getxattr}, + {.opve_op = &vnop_pagein_desc, .opve_impl = (vop_t)nullfs_pagein}, {.opve_op = &vnop_read_desc, .opve_impl = (vop_t)nullfs_read}, + {.opve_op = &vnop_listxattr_desc, .opve_impl = (vop_t)nullfs_listxattr}, {.opve_op = NULL, .opve_impl = NULL}, }; -struct vnodeopv_desc nullfs_vnodeop_opv_desc = {&nullfs_vnodeop_p, nullfs_vnodeop_entries}; +const struct vnodeopv_desc nullfs_vnodeop_opv_desc = {.opv_desc_vector_p = &nullfs_vnodeop_p, .opv_desc_ops = nullfs_vnodeop_entries}; //NULLFS Specific helper function diff --git a/bsd/miscfs/nullfs/nullfs.h b/bsd/miscfs/nullfs/nullfs.h index 766194f6d..38b55fd53 100644 --- a/bsd/miscfs/nullfs/nullfs.h +++ b/bsd/miscfs/nullfs/nullfs.h @@ -148,7 +148,7 @@ int nullfs_getbackingvnode(vnode_t in_vp, vnode_t* out_vpp); #define NULLVPTOLOWERVID(vp) (VTONULL(vp)->null_lowervid) #define NULLVPTOMYVID(vp) (VTONULL(vp)->null_myvid) -extern struct vnodeopv_desc nullfs_vnodeop_opv_desc; +extern const struct vnodeopv_desc nullfs_vnodeop_opv_desc; extern vop_t * nullfs_vnodeop_p; diff --git a/bsd/miscfs/routefs/routefs_ops.c b/bsd/miscfs/routefs/routefs_ops.c index d066326e1..664ae0e16 100644 --- a/bsd/miscfs/routefs/routefs_ops.c +++ b/bsd/miscfs/routefs/routefs_ops.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015 Apple Inc. All rights reserved. + * Copyright (c) 2019 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -487,7 +487,7 @@ out: return error; } -struct vfsops routefs_vfsops = { +const struct vfsops routefs_vfsops = { .vfs_mount = routefs_mount, .vfs_start = routefs_start, .vfs_unmount = routefs_unmount, @@ -518,47 +518,48 @@ routefserr_setlabel(__unused struct vnop_setlabel_args * args) /* The following ops are used by directories and symlinks */ int(**routefs_vnodeop_p)(void *); -static struct vnodeopv_entry_desc routefs_vnodeop_entries[] = { - { &vnop_default_desc, (VOPFUNC)vn_default_error }, - { &vnop_lookup_desc, (VOPFUNC)routefserr_lookup }, /* lookup */ - { &vnop_create_desc, (VOPFUNC)err_create }, /* create */ - { &vnop_whiteout_desc, (VOPFUNC)err_whiteout }, /* whiteout */ - { &vnop_mknod_desc, (VOPFUNC)err_mknod }, /* mknod */ - { &vnop_open_desc, (VOPFUNC)err_open }, /* open */ - { &vnop_close_desc, (VOPFUNC)err_close }, /* close */ - { &vnop_getattr_desc, (VOPFUNC)err_getattr }, /* getattr */ - { &vnop_setattr_desc, (VOPFUNC)err_setattr }, /* setattr */ - { &vnop_read_desc, (VOPFUNC)err_read }, /* read */ - { &vnop_write_desc, (VOPFUNC)err_write }, /* write */ - { &vnop_ioctl_desc, (VOPFUNC)err_ioctl }, /* ioctl */ - { &vnop_select_desc, (VOPFUNC)err_select }, /* select */ - { &vnop_revoke_desc, (VOPFUNC)err_revoke }, /* revoke */ - { &vnop_mmap_desc, (VOPFUNC)err_mmap }, /* mmap */ - { &vnop_fsync_desc, (VOPFUNC)nop_fsync }, /* fsync */ - { &vnop_remove_desc, (VOPFUNC)err_remove }, /* remove */ - { &vnop_link_desc, (VOPFUNC)err_link }, /* link */ - { &vnop_rename_desc, (VOPFUNC)err_rename }, /* rename */ - { &vnop_mkdir_desc, (VOPFUNC)err_mkdir }, /* mkdir */ - { &vnop_rmdir_desc, (VOPFUNC)err_rmdir }, /* rmdir */ - { &vnop_symlink_desc, (VOPFUNC)err_symlink }, /* symlink */ - { &vnop_readdir_desc, (VOPFUNC)err_readdir }, /* readdir */ - { &vnop_readlink_desc, (VOPFUNC)err_readlink }, /* readlink */ - { &vnop_inactive_desc, (VOPFUNC)err_inactive }, /* inactive */ - { &vnop_reclaim_desc, (VOPFUNC)err_reclaim }, /* reclaim */ - { &vnop_strategy_desc, (VOPFUNC)err_strategy }, /* strategy */ - { &vnop_pathconf_desc, (VOPFUNC)err_pathconf }, /* pathconf */ - { &vnop_advlock_desc, (VOPFUNC)err_advlock }, /* advlock */ - { &vnop_bwrite_desc, (VOPFUNC)err_bwrite }, - { &vnop_pagein_desc, (VOPFUNC)err_pagein }, /* Pagein */ - { &vnop_pageout_desc, (VOPFUNC)err_pageout }, /* Pageout */ - { &vnop_copyfile_desc, (VOPFUNC)err_copyfile }, /* Copyfile */ - { &vnop_blktooff_desc, (VOPFUNC)err_blktooff }, /* blktooff */ - { &vnop_offtoblk_desc, (VOPFUNC)err_offtoblk }, /* offtoblk */ - { &vnop_blockmap_desc, (VOPFUNC)err_blockmap }, /* blockmap */ +static const struct vnodeopv_entry_desc routefs_vnodeop_entries[] = { + { .opve_op = &vnop_default_desc, .opve_impl = (VOPFUNC)vn_default_error }, + { .opve_op = &vnop_lookup_desc, .opve_impl = (VOPFUNC)routefserr_lookup }, /* lookup */ + { .opve_op = &vnop_create_desc, .opve_impl = (VOPFUNC)err_create }, /* create */ + { .opve_op = &vnop_whiteout_desc, .opve_impl = (VOPFUNC)err_whiteout }, /* whiteout */ + { .opve_op = &vnop_mknod_desc, .opve_impl = (VOPFUNC)err_mknod }, /* mknod */ + { .opve_op = &vnop_open_desc, .opve_impl = (VOPFUNC)err_open }, /* open */ + { .opve_op = &vnop_close_desc, .opve_impl = (VOPFUNC)err_close }, /* close */ + { .opve_op = &vnop_getattr_desc, .opve_impl = (VOPFUNC)err_getattr }, /* getattr */ + { .opve_op = &vnop_setattr_desc, .opve_impl = (VOPFUNC)err_setattr }, /* setattr */ + { .opve_op = &vnop_read_desc, .opve_impl = (VOPFUNC)err_read }, /* read */ + { .opve_op = &vnop_write_desc, .opve_impl = (VOPFUNC)err_write }, /* write */ + { .opve_op = &vnop_ioctl_desc, .opve_impl = (VOPFUNC)err_ioctl }, /* ioctl */ + { .opve_op = &vnop_select_desc, .opve_impl = (VOPFUNC)err_select }, /* select */ + { .opve_op = &vnop_revoke_desc, .opve_impl = (VOPFUNC)err_revoke }, /* revoke */ + { .opve_op = &vnop_mmap_desc, .opve_impl = (VOPFUNC)err_mmap }, /* mmap */ + { .opve_op = &vnop_fsync_desc, .opve_impl = (VOPFUNC)nop_fsync }, /* fsync */ + { .opve_op = &vnop_remove_desc, .opve_impl = (VOPFUNC)err_remove }, /* remove */ + { .opve_op = &vnop_link_desc, .opve_impl = (VOPFUNC)err_link }, /* link */ + { .opve_op = &vnop_rename_desc, .opve_impl = (VOPFUNC)err_rename }, /* rename */ + { .opve_op = &vnop_mkdir_desc, .opve_impl = (VOPFUNC)err_mkdir }, /* mkdir */ + { .opve_op = &vnop_rmdir_desc, .opve_impl = (VOPFUNC)err_rmdir }, /* rmdir */ + { .opve_op = &vnop_symlink_desc, .opve_impl = (VOPFUNC)err_symlink }, /* symlink */ + { .opve_op = &vnop_readdir_desc, .opve_impl = (VOPFUNC)err_readdir }, /* readdir */ + { .opve_op = &vnop_readlink_desc, .opve_impl = (VOPFUNC)err_readlink }, /* readlink */ + { .opve_op = &vnop_inactive_desc, .opve_impl = (VOPFUNC)err_inactive }, /* inactive */ + { .opve_op = &vnop_reclaim_desc, .opve_impl = (VOPFUNC)err_reclaim }, /* reclaim */ + { .opve_op = &vnop_strategy_desc, .opve_impl = (VOPFUNC)err_strategy }, /* strategy */ + { .opve_op = &vnop_pathconf_desc, .opve_impl = (VOPFUNC)err_pathconf }, /* pathconf */ + { .opve_op = &vnop_advlock_desc, .opve_impl = (VOPFUNC)err_advlock }, /* advlock */ + { .opve_op = &vnop_bwrite_desc, .opve_impl = (VOPFUNC)err_bwrite }, + { .opve_op = &vnop_pagein_desc, .opve_impl = (VOPFUNC)err_pagein }, /* Pagein */ + { .opve_op = &vnop_pageout_desc, .opve_impl = (VOPFUNC)err_pageout }, /* Pageout */ + { .opve_op = &vnop_copyfile_desc, .opve_impl = (VOPFUNC)err_copyfile }, /* Copyfile */ + { .opve_op = &vnop_blktooff_desc, .opve_impl = (VOPFUNC)err_blktooff }, /* blktooff */ + { .opve_op = &vnop_offtoblk_desc, .opve_impl = (VOPFUNC)err_offtoblk }, /* offtoblk */ + { .opve_op = &vnop_blockmap_desc, .opve_impl = (VOPFUNC)err_blockmap }, /* blockmap */ #if CONFIG_MACF - { &vnop_setlabel_desc, (VOPFUNC)routefserr_setlabel }, /* setlabel */ + { .opve_op = &vnop_setlabel_desc, .opve_impl = (VOPFUNC)routefserr_setlabel }, /* setlabel */ #endif - { (struct vnodeop_desc*)NULL, (int (*)(void *))NULL } + { .opve_op = (struct vnodeop_desc*)NULL, .opve_impl = (int (*)(void *))NULL } }; -struct vnodeopv_desc routefs_vnodeop_opv_desc = -{ &routefs_vnodeop_p, routefs_vnodeop_entries }; + +const struct vnodeopv_desc routefs_vnodeop_opv_desc = +{ .opv_desc_vector_p = &routefs_vnodeop_p, .opv_desc_ops = routefs_vnodeop_entries }; diff --git a/bsd/miscfs/specfs/spec_vnops.c b/bsd/miscfs/specfs/spec_vnops.c index 7b6f18d9c..042363b41 100644 --- a/bsd/miscfs/specfs/spec_vnops.c +++ b/bsd/miscfs/specfs/spec_vnops.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2016 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2019 Apple Computer, Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -102,7 +102,7 @@ /* XXX following three prototypes should be in a header file somewhere */ extern dev_t chrtoblk(dev_t dev); extern boolean_t iskmemdev(dev_t dev); -extern int bpfkqfilter(dev_t dev, struct knote *kn); +extern int bpfkqfilter(dev_t dev, struct knote *kn); extern int ptsd_kqfilter(dev_t, struct knote *); extern int ptmx_kqfilter(dev_t, struct knote *); @@ -120,47 +120,47 @@ char devcls[] = "devcls"; #define VOPFUNC int (*)(void *) int(**spec_vnodeop_p)(void *); -struct vnodeopv_entry_desc spec_vnodeop_entries[] = { - { &vnop_default_desc, (VOPFUNC)vn_default_error }, - { &vnop_lookup_desc, (VOPFUNC)spec_lookup }, /* lookup */ - { &vnop_create_desc, (VOPFUNC)err_create }, /* create */ - { &vnop_mknod_desc, (VOPFUNC)err_mknod }, /* mknod */ - { &vnop_open_desc, (VOPFUNC)spec_open }, /* open */ - { &vnop_close_desc, (VOPFUNC)spec_close }, /* close */ - { &vnop_access_desc, (VOPFUNC)spec_access }, /* access */ - { &vnop_getattr_desc, (VOPFUNC)spec_getattr }, /* getattr */ - { &vnop_setattr_desc, (VOPFUNC)spec_setattr }, /* setattr */ - { &vnop_read_desc, (VOPFUNC)spec_read }, /* read */ - { &vnop_write_desc, (VOPFUNC)spec_write }, /* write */ - { &vnop_ioctl_desc, (VOPFUNC)spec_ioctl }, /* ioctl */ - { &vnop_select_desc, (VOPFUNC)spec_select }, /* select */ - { &vnop_revoke_desc, (VOPFUNC)nop_revoke }, /* revoke */ - { &vnop_mmap_desc, (VOPFUNC)err_mmap }, /* mmap */ - { &vnop_fsync_desc, (VOPFUNC)spec_fsync }, /* fsync */ - { &vnop_remove_desc, (VOPFUNC)err_remove }, /* remove */ - { &vnop_link_desc, (VOPFUNC)err_link }, /* link */ - { &vnop_rename_desc, (VOPFUNC)err_rename }, /* rename */ - { &vnop_mkdir_desc, (VOPFUNC)err_mkdir }, /* mkdir */ - { &vnop_rmdir_desc, (VOPFUNC)err_rmdir }, /* rmdir */ - { &vnop_symlink_desc, (VOPFUNC)err_symlink }, /* symlink */ - { &vnop_readdir_desc, (VOPFUNC)err_readdir }, /* readdir */ - { &vnop_readlink_desc, (VOPFUNC)err_readlink }, /* readlink */ - { &vnop_inactive_desc, (VOPFUNC)nop_inactive }, /* inactive */ - { &vnop_reclaim_desc, (VOPFUNC)nop_reclaim }, /* reclaim */ - { &vnop_strategy_desc, (VOPFUNC)spec_strategy }, /* strategy */ - { &vnop_pathconf_desc, (VOPFUNC)spec_pathconf }, /* pathconf */ - { &vnop_advlock_desc, (VOPFUNC)err_advlock }, /* advlock */ - { &vnop_bwrite_desc, (VOPFUNC)spec_bwrite }, /* bwrite */ - { &vnop_pagein_desc, (VOPFUNC)err_pagein }, /* Pagein */ - { &vnop_pageout_desc, (VOPFUNC)err_pageout }, /* Pageout */ - { &vnop_copyfile_desc, (VOPFUNC)err_copyfile }, /* Copyfile */ - { &vnop_blktooff_desc, (VOPFUNC)spec_blktooff }, /* blktooff */ - { &vnop_offtoblk_desc, (VOPFUNC)spec_offtoblk }, /* offtoblk */ - { &vnop_blockmap_desc, (VOPFUNC)spec_blockmap }, /* blockmap */ - { (struct vnodeop_desc*)NULL, (int (*)(void *))NULL } +const struct vnodeopv_entry_desc spec_vnodeop_entries[] = { + { .opve_op = &vnop_default_desc, .opve_impl = (VOPFUNC)vn_default_error }, + { .opve_op = &vnop_lookup_desc, .opve_impl = (VOPFUNC)spec_lookup }, /* lookup */ + { .opve_op = &vnop_create_desc, .opve_impl = (VOPFUNC)err_create }, /* create */ + { .opve_op = &vnop_mknod_desc, .opve_impl = (VOPFUNC)err_mknod }, /* mknod */ + { .opve_op = &vnop_open_desc, .opve_impl = (VOPFUNC)spec_open }, /* open */ + { .opve_op = &vnop_close_desc, .opve_impl = (VOPFUNC)spec_close }, /* close */ + { .opve_op = &vnop_access_desc, .opve_impl = (VOPFUNC)spec_access }, /* access */ + { .opve_op = &vnop_getattr_desc, .opve_impl = (VOPFUNC)spec_getattr }, /* getattr */ + { .opve_op = &vnop_setattr_desc, .opve_impl = (VOPFUNC)spec_setattr }, /* setattr */ + { .opve_op = &vnop_read_desc, .opve_impl = (VOPFUNC)spec_read }, /* read */ + { .opve_op = &vnop_write_desc, .opve_impl = (VOPFUNC)spec_write }, /* write */ + { .opve_op = &vnop_ioctl_desc, .opve_impl = (VOPFUNC)spec_ioctl }, /* ioctl */ + { .opve_op = &vnop_select_desc, .opve_impl = (VOPFUNC)spec_select }, /* select */ + { .opve_op = &vnop_revoke_desc, .opve_impl = (VOPFUNC)nop_revoke }, /* revoke */ + { .opve_op = &vnop_mmap_desc, .opve_impl = (VOPFUNC)err_mmap }, /* mmap */ + { .opve_op = &vnop_fsync_desc, .opve_impl = (VOPFUNC)spec_fsync }, /* fsync */ + { .opve_op = &vnop_remove_desc, .opve_impl = (VOPFUNC)err_remove }, /* remove */ + { .opve_op = &vnop_link_desc, .opve_impl = (VOPFUNC)err_link }, /* link */ + { .opve_op = &vnop_rename_desc, .opve_impl = (VOPFUNC)err_rename }, /* rename */ + { .opve_op = &vnop_mkdir_desc, .opve_impl = (VOPFUNC)err_mkdir }, /* mkdir */ + { .opve_op = &vnop_rmdir_desc, .opve_impl = (VOPFUNC)err_rmdir }, /* rmdir */ + { .opve_op = &vnop_symlink_desc, .opve_impl = (VOPFUNC)err_symlink }, /* symlink */ + { .opve_op = &vnop_readdir_desc, .opve_impl = (VOPFUNC)err_readdir }, /* readdir */ + { .opve_op = &vnop_readlink_desc, .opve_impl = (VOPFUNC)err_readlink }, /* readlink */ + { .opve_op = &vnop_inactive_desc, .opve_impl = (VOPFUNC)nop_inactive }, /* inactive */ + { .opve_op = &vnop_reclaim_desc, .opve_impl = (VOPFUNC)nop_reclaim }, /* reclaim */ + { .opve_op = &vnop_strategy_desc, .opve_impl = (VOPFUNC)spec_strategy }, /* strategy */ + { .opve_op = &vnop_pathconf_desc, .opve_impl = (VOPFUNC)spec_pathconf }, /* pathconf */ + { .opve_op = &vnop_advlock_desc, .opve_impl = (VOPFUNC)err_advlock }, /* advlock */ + { .opve_op = &vnop_bwrite_desc, .opve_impl = (VOPFUNC)spec_bwrite }, /* bwrite */ + { .opve_op = &vnop_pagein_desc, .opve_impl = (VOPFUNC)err_pagein }, /* Pagein */ + { .opve_op = &vnop_pageout_desc, .opve_impl = (VOPFUNC)err_pageout }, /* Pageout */ + { .opve_op = &vnop_copyfile_desc, .opve_impl = (VOPFUNC)err_copyfile }, /* Copyfile */ + { .opve_op = &vnop_blktooff_desc, .opve_impl = (VOPFUNC)spec_blktooff }, /* blktooff */ + { .opve_op = &vnop_offtoblk_desc, .opve_impl = (VOPFUNC)spec_offtoblk }, /* offtoblk */ + { .opve_op = &vnop_blockmap_desc, .opve_impl = (VOPFUNC)spec_blockmap }, /* blockmap */ + { .opve_op = (struct vnodeop_desc*)NULL, .opve_impl = (int (*)(void *))NULL } }; -struct vnodeopv_desc spec_vnodeop_opv_desc = -{ &spec_vnodeop_p, spec_vnodeop_entries }; +const struct vnodeopv_desc spec_vnodeop_opv_desc = +{ .opv_desc_vector_p = &spec_vnodeop_p, .opv_desc_ops = spec_vnodeop_entries }; static void set_blocksize(vnode_t, dev_t); @@ -315,6 +315,7 @@ spec_open(struct vnop_open_args *ap) return ENXIO; } if (cred != FSCRED && (ap->a_mode & FWRITE)) { +#if 0 /* * When running in very secure mode, do not allow * opens for writing of any disk character devices. @@ -322,6 +323,7 @@ spec_open(struct vnop_open_args *ap) if (securelevel >= 2 && isdisk(dev, VCHR)) { return EPERM; } +#endif /* Never allow writing to /dev/mem or /dev/kmem */ if (iskmemdev(dev)) { @@ -485,13 +487,49 @@ spec_read(struct vnop_read_args *ap) { struct _throttle_io_info_t *throttle_info = NULL; int thread_throttle_level; - if (cdevsw[major(vp->v_rdev)].d_type == D_DISK && vp->v_un.vu_specinfo->si_throttleable) { + uint64_t blkno = 0; + uint32_t iolen = 0; + int ddisk = 0; + int ktrace_code = DKIO_READ; + devBlockSize = vp->v_specsize; + uintptr_t our_id; + + if (cdevsw[major(vp->v_rdev)].d_type == D_DISK) { + ddisk = 1; + } + + if (ddisk && vp->v_un.vu_specinfo->si_throttleable) { throttle_info = &_throttle_io_info[vp->v_un.vu_specinfo->si_devbsdunit]; thread_throttle_level = throttle_info_update_internal(throttle_info, NULL, 0, vp->v_un.vu_specinfo->si_isssd, TRUE, NULL); } + + if (kdebug_enable && ddisk) { + if (devBlockSize == 0) { + devBlockSize = 512; // default sector size + } + + if (uio_offset(uio) && devBlockSize) { + blkno = ((uint64_t) uio_offset(uio) / ((uint64_t)devBlockSize)); + } + iolen = (int) uio_resid(uio); + our_id = (uintptr_t)thread_tid(current_thread()); + KERNEL_DEBUG_CONSTANT_IST(KDEBUG_COMMON, + (FSDBG_CODE(DBG_DKRW, ktrace_code)) | DBG_FUNC_NONE, our_id, + vp->v_rdev, blkno, iolen, 0); + } + error = (*cdevsw[major(vp->v_rdev)].d_read) (vp->v_rdev, uio, ap->a_ioflag); + + if (kdebug_enable && ddisk) { + uint32_t residual = (uint32_t)uio_resid(uio); + ktrace_code |= DKIO_DONE; + KERNEL_DEBUG_CONSTANT_IST(KDEBUG_COMMON, + (FSDBG_CODE(DBG_DKRW, ktrace_code)) | DBG_FUNC_NONE, our_id, + (uintptr_t)VM_KERNEL_ADDRPERM(vp), residual, error, 0); + } + if (throttle_info) { throttle_info_end_io_internal(throttle_info, thread_throttle_level); } @@ -589,16 +627,51 @@ spec_write(struct vnop_write_args *ap) { struct _throttle_io_info_t *throttle_info = NULL; int thread_throttle_level; - if (cdevsw[major(vp->v_rdev)].d_type == D_DISK && vp->v_un.vu_specinfo->si_throttleable) { + dev = vp->v_rdev; + devBlockSize = vp->v_specsize; + uint32_t iolen = 0; + uint64_t blkno = 0; + int ddisk = 0; + int ktrace_code = 0; // write is implied; read must be OR'd in. + uintptr_t our_id; + + if (cdevsw[major(dev)].d_type == D_DISK) { + ddisk = 1; + } + + if (ddisk && vp->v_un.vu_specinfo->si_throttleable) { throttle_info = &_throttle_io_info[vp->v_un.vu_specinfo->si_devbsdunit]; thread_throttle_level = throttle_info_update_internal(throttle_info, NULL, 0, vp->v_un.vu_specinfo->si_isssd, TRUE, NULL); microuptime(&throttle_info->throttle_last_write_timestamp); } + + if (kdebug_enable && ddisk) { + if (devBlockSize == 0) { + devBlockSize = 512; // default sector size + } + if ((uio_offset(uio) != 0) && devBlockSize) { + blkno = ((uint64_t)uio_offset(uio)) / ((uint64_t)devBlockSize); + } + iolen = (int)uio_resid(uio); + our_id = (uintptr_t)thread_tid(current_thread()); + KERNEL_DEBUG_CONSTANT_IST(KDEBUG_COMMON, + (FSDBG_CODE(DBG_DKRW, ktrace_code)) | DBG_FUNC_NONE, our_id, + vp->v_rdev, blkno, iolen, 0); + } error = (*cdevsw[major(vp->v_rdev)].d_write) (vp->v_rdev, uio, ap->a_ioflag); + if (kdebug_enable && ddisk) { + //emit the I/O completion + uint32_t residual = (uint32_t)uio_resid(uio); + ktrace_code |= DKIO_DONE; + KERNEL_DEBUG_CONSTANT_IST(KDEBUG_COMMON, + (FSDBG_CODE(DBG_DKRW, ktrace_code)) | DBG_FUNC_NONE, our_id, + (uintptr_t)VM_KERNEL_ADDRPERM(vp), residual, error, 0); + } + if (throttle_info) { throttle_info_end_io_internal(throttle_info, thread_throttle_level); } @@ -746,10 +819,10 @@ spec_select(struct vnop_select_args *ap) } } -static int filt_specattach(struct knote *kn, struct kevent_internal_s *kev); +static int filt_specattach(struct knote *kn, struct kevent_qos_s *kev); int -spec_kqfilter(vnode_t vp, struct knote *kn, struct kevent_internal_s *kev) +spec_kqfilter(vnode_t vp, struct knote *kn, struct kevent_qos_s *kev) { dev_t dev; @@ -765,7 +838,7 @@ spec_kqfilter(vnode_t vp, struct knote *kn, struct kevent_internal_s *kev) * other attaches. */ int32_t tmp_flags = kn->kn_flags; - int64_t tmp_data = kn->kn_data; + int64_t tmp_sdata = kn->kn_sdata; int res; res = bpfkqfilter(dev, kn); @@ -773,7 +846,7 @@ spec_kqfilter(vnode_t vp, struct knote *kn, struct kevent_internal_s *kev) return res; } kn->kn_flags = tmp_flags; - kn->kn_data = tmp_data; + kn->kn_sdata = tmp_sdata; #endif if (major(dev) > nchrdev) { @@ -1975,6 +2048,50 @@ done: return sleep_cnt; } +/* + * returns TRUE if the throttle_lowpri_io called with the same sleep_amount would've slept + * This function mimics the most of the throttle_lowpri_io checks but without actual sleeping + */ +int +throttle_lowpri_io_will_be_throttled(int sleep_amount) +{ + if (sleep_amount == 0) { + return FALSE; + } + + uthread_t ut = get_bsdthread_info(current_thread()); + if (ut->uu_lowpri_window == 0) { + return FALSE; + } + + struct _throttle_io_info_t *info = ut->uu_throttle_info; + if (info == NULL) { + return FALSE; + } + + lck_mtx_lock(&info->throttle_lock); + assert(ut->uu_on_throttlelist < THROTTLE_LEVEL_THROTTLED); + + if (sleep_amount == 1 && !ut->uu_throttle_bc) { + sleep_amount = 0; + } + + int result = FALSE; + + int throttle_type = throttle_io_will_be_throttled_internal(info, NULL, NULL); + if (throttle_type > THROTTLE_DISENGAGED) { + result = TRUE; + if ((throttle_type == THROTTLE_ENGAGED) && (sleep_amount == 0)) { + result = FALSE; + } + } + + lck_mtx_unlock(&info->throttle_lock); + + return result; +} + + /* * KPI routine * @@ -2379,7 +2496,7 @@ spec_strategy(struct vnop_strategy_args *ap) * For metadata writes, unconditionally mark them as IOSCHED_METADATA_TIER and passive */ if (bap->ba_flags & BA_META) { - if (mp && (mp->mnt_ioflags & MNT_IOFLAGS_IOSCHED_SUPPORTED)) { + if ((mp && (mp->mnt_ioflags & MNT_IOFLAGS_IOSCHED_SUPPORTED)) || (bap->ba_flags & BA_IO_SCHEDULED)) { if (bp->b_flags & B_READ) { if (io_tier > IOSCHED_METADATA_TIER) { io_tier = IOSCHED_METADATA_TIER; @@ -2748,8 +2865,8 @@ spec_offtoblk(struct vnop_offtoblk_args *ap) static void filt_specdetach(struct knote *kn); static int filt_specevent(struct knote *kn, long hint); -static int filt_spectouch(struct knote *kn, struct kevent_internal_s *kev); -static int filt_specprocess(struct knote *kn, struct filt_process_s *data, struct kevent_internal_s *kev); +static int filt_spectouch(struct knote *kn, struct kevent_qos_s *kev); +static int filt_specprocess(struct knote *kn, struct kevent_qos_s *kev); static int filt_specpeek(struct knote *kn); SECURITY_READ_ONLY_EARLY(struct filterops) spec_filtops = { @@ -2789,7 +2906,7 @@ spec_knote_select_and_link(struct knote *kn) ctx = vfs_context_current(); vp = (vnode_t)kn->kn_fp->f_fglob->fg_data; - int error = vnode_getwithvid(vp, kn->kn_hookid); + int error = vnode_getwithvid(vp, vnode_vid(vp)); if (error != 0) { knote_set_error(kn, ENOENT); return 0; @@ -2798,7 +2915,7 @@ spec_knote_select_and_link(struct knote *kn) /* * This function may be called many times to link or re-link the * underlying vnode to the kqueue. If we've already linked the two, - * we will have a valid kn_hook_data which ties us to the underlying + * we will have a valid kn_hook64 which ties us to the underlying * device's waitq via a the waitq's prepost table object. However, * devices can abort any select action by calling selthreadclear(). * This is OK because the table object will be invalidated by the @@ -2868,13 +2985,13 @@ spec_knote_select_and_link(struct knote *kn) * the table object's ID to us. It will also set the * waitq_prepost_id field within the waitq structure. * - * We can just overwrite kn_hook_data because it's simply a + * We can just overwrite kn_hook64 because it's simply a * table ID used to grab a reference when needed. * * We have a reference on the vnode, so we know that the * device won't go away while we get this ID. */ - kn->kn_hook_data = waitq_get_prepost_id(wq); + kn->kn_hook64 = waitq_get_prepost_id(wq); } else if (selres == 0) { /* * The device indicated that there's no data to read, but didn't call @@ -2890,22 +3007,33 @@ spec_knote_select_and_link(struct knote *kn) return selres; } -static void -filt_spec_common(struct knote *kn, int selres) +static int +filt_spec_common(struct knote *kn, struct kevent_qos_s *kev, int selres) { + int64_t data; + int ret; + if (kn->kn_vnode_use_ofst) { if (kn->kn_fp->f_fglob->fg_offset >= (uint32_t)selres) { - kn->kn_data = 0; + data = 0; } else { - kn->kn_data = ((uint32_t)selres) - kn->kn_fp->f_fglob->fg_offset; + data = ((uint32_t)selres) - kn->kn_fp->f_fglob->fg_offset; } } else { - kn->kn_data = selres; + data = selres; + } + + ret = data >= knote_low_watermark(kn); + + if (ret && kev) { + knote_fill_kevent(kn, kev, data); } + + return ret; } static int -filt_specattach(struct knote *kn, __unused struct kevent_internal_s *kev) +filt_specattach(struct knote *kn, __unused struct kevent_qos_s *kev) { vnode_t vp; dev_t dev; @@ -2941,8 +3069,7 @@ filt_specattach(struct knote *kn, __unused struct kevent_internal_s *kev) } kn->kn_filtid = EVFILTID_SPEC; - kn->kn_hook_data = 0; - kn->kn_hookid = vnode_vid(vp); + kn->kn_hook64 = 0; knote_markstayactive(kn); return spec_knote_select_and_link(kn); @@ -2957,7 +3084,7 @@ filt_specdetach(struct knote *kn) * This is potentially tricky: the device's selinfo waitq that was * tricked into being part of this knote's waitq set may not be a part * of any other set, and the device itself may have revoked the memory - * in which the waitq was held. We use the knote's kn_hook_data field + * in which the waitq was held. We use the knote's kn_hook64 field * to keep the ID of the waitq's prepost table object. This * object keeps a pointer back to the waitq, and gives us a safe way * to decouple the dereferencing of driver allocated memory: if the @@ -2965,9 +3092,9 @@ filt_specdetach(struct knote *kn) * object will be invalidated. The waitq details are handled in the * waitq API invoked here. */ - if (kn->kn_hook_data) { - waitq_unlink_by_prepost_id(kn->kn_hook_data, &(knote_get_kq(kn)->kq_wqs)); - kn->kn_hook_data = 0; + if (kn->kn_hook64) { + waitq_unlink_by_prepost_id(kn->kn_hook64, &(knote_get_kq(kn)->kq_wqs)); + kn->kn_hook64 = 0; } } @@ -2982,7 +3109,7 @@ filt_specevent(struct knote *kn, __unused long hint) } static int -filt_spectouch(struct knote *kn, struct kevent_internal_s *kev) +filt_spectouch(struct knote *kn, struct kevent_qos_s *kev) { kn->kn_sdata = kev->data; kn->kn_sfflags = kev->fflags; @@ -2995,9 +3122,8 @@ filt_spectouch(struct knote *kn, struct kevent_internal_s *kev) } static int -filt_specprocess(struct knote *kn, struct filt_process_s *data, struct kevent_internal_s *kev) +filt_specprocess(struct knote *kn, struct kevent_qos_s *kev) { -#pragma unused(data) vnode_t vp; uthread_t uth; vfs_context_t ctx; @@ -3009,29 +3135,18 @@ filt_specprocess(struct knote *kn, struct filt_process_s *data, struct kevent_in ctx = vfs_context_current(); vp = (vnode_t)kn->kn_fp->f_fglob->fg_data; - error = vnode_getwithvid(vp, kn->kn_hookid); + error = vnode_getwithvid(vp, vnode_vid(vp)); if (error != 0) { kn->kn_flags |= (EV_EOF | EV_ONESHOT); - *kev = kn->kn_kevent; + knote_fill_kevent(kn, kev, 0); return 1; } selres = spec_knote_select_and_link(kn); - filt_spec_common(kn, selres); + res = filt_spec_common(kn, kev, selres); vnode_put(vp); - res = ((kn->kn_sfflags & NOTE_LOWAT) != 0) ? - (kn->kn_data >= kn->kn_sdata) : kn->kn_data; - - if (res) { - *kev = kn->kn_kevent; - if (kn->kn_flags & EV_CLEAR) { - kn->kn_fflags = 0; - kn->kn_data = 0; - } - } - return res; } @@ -3041,7 +3156,5 @@ filt_specpeek(struct knote *kn) int selres = 0; selres = spec_knote_select_and_link(kn); - filt_spec_common(kn, selres); - - return kn->kn_data != 0; + return filt_spec_common(kn, NULL, selres); } diff --git a/bsd/miscfs/specfs/specdev.h b/bsd/miscfs/specfs/specdev.h index ae4e79ae0..3a4b2aa5c 100644 --- a/bsd/miscfs/specfs/specdev.h +++ b/bsd/miscfs/specfs/specdev.h @@ -133,9 +133,9 @@ __BEGIN_DECLS #ifdef BSD_KERNEL_PRIVATE int spec_blktooff(struct vnop_blktooff_args *); int spec_offtoblk(struct vnop_offtoblk_args *); -int spec_fsync_internal(vnode_t, int, vfs_context_t); +int spec_fsync_internal(vnode_t, int, vfs_context_t); int spec_blockmap(struct vnop_blockmap_args *); -int spec_kqfilter(vnode_t vp, struct knote *kn, struct kevent_internal_s *kev); +int spec_kqfilter(vnode_t vp, struct knote *kn, struct kevent_qos_s *kev); #endif /* BSD_KERNEL_PRIVATE */ int spec_ebadf(void *); diff --git a/bsd/net/Makefile b/bsd/net/Makefile index 60c66a58b..fb8e28cd1 100644 --- a/bsd/net/Makefile +++ b/bsd/net/Makefile @@ -52,11 +52,14 @@ PRIVATE_DATAFILES = \ if_var.h \ if_vlan_var.h \ if_fake_var.h \ + if_6lowpan_var.h \ iptap.h \ lacp.h \ + multi_layer_pkt_log.h \ ndrv_var.h \ necp.h \ net_api_stats.h \ + net_log_common.h \ netsrc.h \ network_agent.h \ ntstat.h \ @@ -70,6 +73,8 @@ PRIVATE_DATAFILES = \ route.h \ net_perf.h \ net_kev.h \ + sixxlowpan.h \ + frame802154.h \ nat464_utils.h PRIVATE_KERNELFILES = $(filter-out radix.h,${KERNELFILES}) \ diff --git a/bsd/net/bpf.c b/bsd/net/bpf.c index 52047280c..b855f3a48 100644 --- a/bsd/net/bpf.c +++ b/bsd/net/bpf.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2018 Apple Inc. All rights reserved. + * Copyright (c) 2000-2019 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -156,9 +156,15 @@ typedef void (*pktcopyfunc_t)(const void *, void *, size_t); static unsigned int bpf_bufsize = BPF_BUFSIZE; SYSCTL_INT(_debug, OID_AUTO, bpf_bufsize, CTLFLAG_RW | CTLFLAG_LOCKED, &bpf_bufsize, 0, ""); + +static int sysctl_bpf_maxbufsize SYSCTL_HANDLER_ARGS; +extern const int copysize_limit_panic; +#define BPF_MAXSIZE_CAP (copysize_limit_panic >> 1) __private_extern__ unsigned int bpf_maxbufsize = BPF_MAXBUFSIZE; -SYSCTL_INT(_debug, OID_AUTO, bpf_maxbufsize, CTLFLAG_RW | CTLFLAG_LOCKED, - &bpf_maxbufsize, 0, ""); +SYSCTL_PROC(_debug, OID_AUTO, bpf_maxbufsize, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, + &bpf_maxbufsize, 0, + sysctl_bpf_maxbufsize, "I", "Default BPF max buffer size"); + static unsigned int bpf_maxdevices = 256; SYSCTL_UINT(_debug, OID_AUTO, bpf_maxdevices, CTLFLAG_RW | CTLFLAG_LOCKED, &bpf_maxdevices, 0, ""); @@ -248,20 +254,20 @@ select_fcn_t bpfselect; /* Darwin's cdevsw struct differs slightly from BSDs */ #define CDEV_MAJOR 23 static struct cdevsw bpf_cdevsw = { - /* open */ bpfopen, - /* close */ bpfclose, - /* read */ bpfread, - /* write */ bpfwrite, - /* ioctl */ bpfioctl, - /* stop */ eno_stop, - /* reset */ eno_reset, - /* tty */ NULL, - /* select */ bpfselect, - /* mmap */ eno_mmap, - /* strategy */ eno_strat, - /* getc */ eno_getc, - /* putc */ eno_putc, - /* type */ 0 + .d_open = bpfopen, + .d_close = bpfclose, + .d_read = bpfread, + .d_write = bpfwrite, + .d_ioctl = bpfioctl, + .d_stop = eno_stop, + .d_reset = eno_reset, + .d_ttys = NULL, + .d_select = bpfselect, + .d_mmap = eno_mmap, + .d_strategy = eno_strat, + .d_reserved_1 = eno_getc, + .d_reserved_2 = eno_putc, + .d_type = 0 }; #define SOCKADDR_HDR_LEN offsetof(struct sockaddr, sa_data) @@ -1221,8 +1227,7 @@ bpfread(dev_t dev, struct uio *uio, int ioflag) } if (found == 1) { ehp->bh_pid = soprocinfo.spi_pid; - proc_name(ehp->bh_pid, ehp->bh_comm, - MAXCOMLEN); + strlcpy(&ehp->bh_comm[0], &soprocinfo.spi_proc_name[0], sizeof(ehp->bh_comm)); } ehp->bh_flowid = 0; } @@ -2526,9 +2531,8 @@ bpfselect(dev_t dev, int which, void * wql, struct proc *p) int bpfkqfilter(dev_t dev, struct knote *kn); static void filt_bpfdetach(struct knote *); static int filt_bpfread(struct knote *, long); -static int filt_bpftouch(struct knote *kn, struct kevent_internal_s *kev); -static int filt_bpfprocess(struct knote *kn, struct filt_process_s *data, - struct kevent_internal_s *kev); +static int filt_bpftouch(struct knote *kn, struct kevent_qos_s *kev); +static int filt_bpfprocess(struct knote *kn, struct kevent_qos_s *kev); SECURITY_READ_ONLY_EARLY(struct filterops) bpfread_filtops = { .f_isfd = 1, @@ -2539,9 +2543,10 @@ SECURITY_READ_ONLY_EARLY(struct filterops) bpfread_filtops = { }; static int -filt_bpfread_common(struct knote *kn, struct bpf_d *d) +filt_bpfread_common(struct knote *kn, struct kevent_qos_s *kev, struct bpf_d *d) { int ready = 0; + int64_t data = 0; if (d->bd_immediate) { /* @@ -2558,17 +2563,13 @@ filt_bpfread_common(struct knote *kn, struct bpf_d *d) * If there's no data in either buffer, we're not * ready to read. */ - kn->kn_data = (d->bd_hlen == 0 || d->bd_hbuf_read != 0 ? + data = (d->bd_hlen == 0 || d->bd_hbuf_read != 0 ? d->bd_slen : d->bd_hlen); - int64_t lowwat = 1; - if (kn->kn_sfflags & NOTE_LOWAT) { - if (kn->kn_sdata > d->bd_bufsize) { - lowwat = d->bd_bufsize; - } else if (kn->kn_sdata > lowwat) { - lowwat = kn->kn_sdata; - } + int64_t lowwat = knote_low_watermark(kn); + if (lowwat > d->bd_bufsize) { + lowwat = d->bd_bufsize; } - ready = (kn->kn_data >= lowwat); + ready = (data >= lowwat); } else { /* * If there's data in the hold buffer, it's the @@ -2585,12 +2586,14 @@ filt_bpfread_common(struct knote *kn, struct bpf_d *d) * no data in the hold buffer and the timer hasn't * expired, we're not ready to read. */ - kn->kn_data = ((d->bd_hlen == 0 || d->bd_hbuf_read != 0) && + data = ((d->bd_hlen == 0 || d->bd_hbuf_read != 0) && d->bd_state == BPF_TIMED_OUT ? d->bd_slen : d->bd_hlen); - ready = (kn->kn_data > 0); + ready = (data > 0); } if (!ready) { bpf_start_timer(d); + } else if (kev) { + knote_fill_kevent(kn, kev, data); } return ready; @@ -2605,10 +2608,8 @@ bpfkqfilter(dev_t dev, struct knote *kn) /* * Is this device a bpf? */ - if (major(dev) != CDEV_MAJOR || - kn->kn_filter != EVFILT_READ) { - kn->kn_flags = EV_ERROR; - kn->kn_data = EINVAL; + if (major(dev) != CDEV_MAJOR || kn->kn_filter != EVFILT_READ) { + knote_set_error(kn, EINVAL); return 0; } @@ -2620,8 +2621,7 @@ bpfkqfilter(dev_t dev, struct knote *kn) (d->bd_flags & BPF_CLOSING) != 0 || d->bd_bif == NULL) { lck_mtx_unlock(bpf_mlock); - kn->kn_flags = EV_ERROR; - kn->kn_data = ENXIO; + knote_set_error(kn, ENXIO); return 0; } @@ -2631,7 +2631,7 @@ bpfkqfilter(dev_t dev, struct knote *kn) d->bd_flags |= BPF_KNOTE; /* capture the current state */ - res = filt_bpfread_common(kn, d); + res = filt_bpfread_common(kn, NULL, d); lck_mtx_unlock(bpf_mlock); @@ -2657,11 +2657,11 @@ filt_bpfread(struct knote *kn, long hint) #pragma unused(hint) struct bpf_d *d = (struct bpf_d *)kn->kn_hook; - return filt_bpfread_common(kn, d); + return filt_bpfread_common(kn, NULL, d); } static int -filt_bpftouch(struct knote *kn, struct kevent_internal_s *kev) +filt_bpftouch(struct knote *kn, struct kevent_qos_s *kev) { struct bpf_d *d = (struct bpf_d *)kn->kn_hook; int res; @@ -2673,7 +2673,7 @@ filt_bpftouch(struct knote *kn, struct kevent_internal_s *kev) kn->kn_sfflags = kev->fflags; /* output data will be re-generated here */ - res = filt_bpfread_common(kn, d); + res = filt_bpfread_common(kn, NULL, d); lck_mtx_unlock(bpf_mlock); @@ -2681,18 +2681,13 @@ filt_bpftouch(struct knote *kn, struct kevent_internal_s *kev) } static int -filt_bpfprocess(struct knote *kn, struct filt_process_s *data, - struct kevent_internal_s *kev) +filt_bpfprocess(struct knote *kn, struct kevent_qos_s *kev) { -#pragma unused(data) struct bpf_d *d = (struct bpf_d *)kn->kn_hook; int res; lck_mtx_lock(bpf_mlock); - res = filt_bpfread_common(kn, d); - if (res) { - *kev = kn->kn_kevent; - } + res = filt_bpfread_common(kn, kev, d); lck_mtx_unlock(bpf_mlock); return res; @@ -3233,7 +3228,7 @@ get_pkt_trunc_len(u_char *p, u_int len) * pre is the offset to the L3 header after the bpfp_header, or length * of L2 header after bpfp_header, if present. */ - uint32_t pre = pktap->pth_frame_pre_length - + int32_t pre = pktap->pth_frame_pre_length - (pkt->bpfp_header_length - pktap->pth_length); /* Length of the input packet starting from L3 header */ @@ -3242,7 +3237,7 @@ get_pkt_trunc_len(u_char *p, u_int len) pktap->pth_protocol_family == AF_INET6) { /* Contains L2 header */ if (pre > 0) { - if (pre < sizeof(struct ether_header)) { + if (pre < (int32_t)sizeof(struct ether_header)) { goto too_short; } @@ -3720,7 +3715,7 @@ bpf_init(__unused void *unused) } #ifndef __APPLE__ -SYSINIT(bpfdev, SI_SUB_DRIVERS, SI_ORDER_MIDDLE + CDEV_MAJOR, bpf_drvinit, NULL) +SYSINIT(bpfdev, SI_SUB_DRIVERS, SI_ORDER_MIDDLE + CDEV_MAJOR, bpf_drvinit, NULL); #endif #if CONFIG_MACF_NET @@ -3736,3 +3731,24 @@ mac_bpfdesc_label_set(struct bpf_d *d, struct label *label) d->bd_label = label; } #endif + +static int +sysctl_bpf_maxbufsize SYSCTL_HANDLER_ARGS +{ +#pragma unused(arg1, arg2) + int i, err; + + i = bpf_maxbufsize; + + err = sysctl_handle_int(oidp, &i, 0, req); + if (err != 0 || req->newptr == USER_ADDR_NULL) { + return err; + } + + if (i < 0 || i > BPF_MAXSIZE_CAP) { + i = BPF_MAXSIZE_CAP; + } + + bpf_maxbufsize = i; + return err; +} diff --git a/bsd/net/bpf_filter.c b/bsd/net/bpf_filter.c index 12d3a6e37..6d5bcc587 100644 --- a/bsd/net/bpf_filter.c +++ b/bsd/net/bpf_filter.c @@ -470,10 +470,16 @@ bpf_filter(const struct bpf_insn *pc, u_char *p, u_int wirelen, u_int buflen) continue; case BPF_LD | BPF_MEM: + if (pc->k >= BPF_MEMWORDS) { + return 0; + } A = mem[pc->k]; continue; case BPF_LDX | BPF_MEM: + if (pc->k >= BPF_MEMWORDS) { + return 0; + } X = mem[pc->k]; continue; diff --git a/bsd/net/cc.h b/bsd/net/cc.h new file mode 100644 index 000000000..c5113b408 --- /dev/null +++ b/bsd/net/cc.h @@ -0,0 +1,186 @@ +/* + * Copyright (c) 2017 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ +/* + * Copyright (c) 2003, Adam Dunkels. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials provided + * with the distribution. + * 3. The name of the author may not be used to endorse or promote + * products derived from this software without specific prior + * written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS + * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE + * GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * This file is part of the Contiki desktop OS + * + * + */ + +/** + * \file + * Default definitions of C compiler quirk work-arounds. + * \author Adam Dunkels + * + * This file is used for making use of extra functionality of some C + * compilers used for Contiki, and defining work-arounds for various + * quirks and problems with some other C compilers. + */ + +#ifndef CC_H_ +#define CC_H_ + +#include "contiki-conf.h" + +/** + * Configure if the C compiler supports the "register" keyword for + * function arguments. + */ +#if CC_CONF_REGISTER_ARGS +#define CC_REGISTER_ARG register +#else /* CC_CONF_REGISTER_ARGS */ +#define CC_REGISTER_ARG +#endif /* CC_CONF_REGISTER_ARGS */ + +/** + * Configure if the C compiler supports the arguments for function + * pointers. + */ +#if CC_CONF_FUNCTION_POINTER_ARGS +#define CC_FUNCTION_POINTER_ARGS 1 +#else /* CC_CONF_FUNCTION_POINTER_ARGS */ +#define CC_FUNCTION_POINTER_ARGS 0 +#endif /* CC_CONF_FUNCTION_POINTER_ARGS */ + +/** + * Configure if the C compiler supports fastcall function + * declarations. + */ +#ifdef CC_CONF_FASTCALL +#define CC_FASTCALL CC_CONF_FASTCALL +#else /* CC_CONF_FASTCALL */ +#define CC_FASTCALL +#endif /* CC_CONF_FASTCALL */ + +/** + * Configure if the C compiler have problems with const function pointers + */ +#ifdef CC_CONF_CONST_FUNCTION_BUG +#define CC_CONST_FUNCTION +#else /* CC_CONF_FASTCALL */ +#define CC_CONST_FUNCTION const +#endif /* CC_CONF_FASTCALL */ + +/** + * Configure work-around for unsigned char bugs with sdcc. + */ +#if CC_CONF_UNSIGNED_CHAR_BUGS +#define CC_UNSIGNED_CHAR_BUGS 1 +#else /* CC_CONF_UNSIGNED_CHAR_BUGS */ +#define CC_UNSIGNED_CHAR_BUGS 0 +#endif /* CC_CONF_UNSIGNED_CHAR_BUGS */ + +/** + * Configure if C compiler supports double hash marks in C macros. + */ +#if CC_CONF_DOUBLE_HASH +#define CC_DOUBLE_HASH 1 +#else /* CC_CONF_DOUBLE_HASH */ +#define CC_DOUBLE_HASH 0 +#endif /* CC_CONF_DOUBLE_HASH */ + +#ifdef CC_CONF_INLINE +#define CC_INLINE CC_CONF_INLINE +#else /* CC_CONF_INLINE */ +#define CC_INLINE +#endif /* CC_CONF_INLINE */ + +/** + * Configure if the C compiler supports the assignment of struct value. + */ +#ifdef CC_CONF_ASSIGN_AGGREGATE +#define CC_ASSIGN_AGGREGATE(dest, src) CC_CONF_ASSIGN_AGGREGATE(dest, src) +#else /* CC_CONF_ASSIGN_AGGREGATE */ +#define CC_ASSIGN_AGGREGATE(dest, src) *dest = *src +#endif /* CC_CONF_ASSIGN_AGGREGATE */ + +#if CC_CONF_NO_VA_ARGS +#define CC_NO_VA_ARGS CC_CONF_VA_ARGS +#endif + +#ifndef NULL +#define NULL 0 +#endif /* NULL */ + +#ifndef MAX +#define MAX(n, m) (((n) < (m)) ? (m) : (n)) +#endif + +#ifndef MIN +#define MIN(n, m) (((n) < (m)) ? (n) : (m)) +#endif + +#ifndef ABS +#define ABS(n) (((n) < 0) ? -(n) : (n)) +#endif + + +#define CC_CONCAT2(s1, s2) s1##s2 +/** + * A C preprocessing macro for concatenating two preprocessor tokens. + * + * We need use two macros (CC_CONCAT and CC_CONCAT2) in order to allow + * concatenation of two \#defined macros. + */ +#define CC_CONCAT(s1, s2) CC_CONCAT2(s1, s2) +#define CC_CONCAT_EXT_2(s1, s2) CC_CONCAT2(s1, s2) + +/** + * A C preprocessing macro for concatenating three preprocessor tokens. + */ +#define CC_CONCAT3(s1, s2, s3) s1##s2##s3 +#define CC_CONCAT_EXT_3(s1, s2, s3) CC_CONCAT3(s1, s2, s3) + +#endif /* CC_H_ */ diff --git a/bsd/net/classq/classq.c b/bsd/net/classq/classq.c index 434a18dfa..dad2552c3 100644 --- a/bsd/net/classq/classq.c +++ b/bsd/net/classq/classq.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2007-2017 Apple Inc. All rights reserved. + * Copyright (c) 2007-2019 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -105,13 +105,15 @@ _qinit(class_queue_t *q, int type, int lim, classq_pkt_type_t ptype) /* add a packet at the tail of the queue */ void -_addq(class_queue_t *q, void *pkt) +_addq(class_queue_t *q, classq_pkt_t *pkt) { uint32_t size = 0; + ASSERT(pkt->cp_ptype == qptype(q)); + switch (qptype(q)) { case QP_MBUF: { - struct mbuf *m = pkt; + struct mbuf *m = pkt->cp_mbuf; MBUFQ_ENQUEUE(&qmbufq(q), m); size = m_length(m); break; @@ -121,6 +123,7 @@ _addq(class_queue_t *q, void *pkt) default: VERIFY(0); /* NOTREACHED */ + __builtin_unreachable(); } qlen(q)++; @@ -130,13 +133,15 @@ _addq(class_queue_t *q, void *pkt) /* add one or more packets at the tail of the queue */ void -_addq_multi(class_queue_t *q, void *pkt_head, void *pkt_tail, +_addq_multi(class_queue_t *q, classq_pkt_t *pkt_head, classq_pkt_t *pkt_tail, u_int32_t cnt, u_int32_t size) { + ASSERT(pkt_head->cp_ptype == qptype(q)); + ASSERT(pkt_tail->cp_ptype == qptype(q)); switch (qptype(q)) { case QP_MBUF: { - struct mbuf *m_head = pkt_head; - struct mbuf *m_tail = pkt_tail; + struct mbuf *m_head = pkt_head->cp_mbuf; + struct mbuf *m_tail = pkt_tail->cp_mbuf; MBUFQ_ENQUEUE_MULTI(&qmbufq(q), m_head, m_tail); break; } @@ -145,6 +150,7 @@ _addq_multi(class_queue_t *q, void *pkt_head, void *pkt_tail, default: VERIFY(0); /* NOTREACHED */ + __builtin_unreachable(); } qlen(q) += cnt; @@ -152,19 +158,17 @@ _addq_multi(class_queue_t *q, void *pkt_head, void *pkt_tail, } /* get a packet at the head of the queue */ -void * -_getq(class_queue_t *q) +void +_getq(class_queue_t *q, classq_pkt_t *pkt) { - void *pkt = NULL; uint32_t pkt_len; switch (qptype(q)) { case QP_MBUF: { - struct mbuf *m; - MBUFQ_DEQUEUE(&qmbufq(q), m); - if (m != NULL) { - pkt_len = m_length(m); - pkt = m; + MBUFQ_DEQUEUE(&qmbufq(q), pkt->cp_mbuf); + if (__probable(pkt->cp_mbuf != NULL)) { + CLASSQ_PKT_INIT_MBUF(pkt, pkt->cp_mbuf); + pkt_len = m_length(pkt->cp_mbuf); } break; } @@ -173,14 +177,15 @@ _getq(class_queue_t *q) default: VERIFY(0); /* NOTREACHED */ + __builtin_unreachable(); } - if (pkt == NULL) { + if (pkt->cp_mbuf == NULL) { VERIFY(qlen(q) == 0); if (qsize(q) > 0) { qsize(q) = 0; } - return NULL; + return; } VERIFY(qlen(q) > 0); qlen(q)--; @@ -191,14 +196,12 @@ _getq(class_queue_t *q) } else if (qsize(q) != 0) { qsize(q) = 0; } - - return pkt; } -static void * -_getq_flow_or_scidx(class_queue_t *q, u_int32_t val, boolean_t isflowid) +static void +_getq_flow_or_scidx(class_queue_t *q, classq_pkt_t *pkt, u_int32_t val, + boolean_t isflowid) { - void *pkt = NULL; uint32_t pkt_len; switch (qptype(q)) { @@ -217,8 +220,8 @@ _getq_flow_or_scidx(class_queue_t *q, u_int32_t val, boolean_t isflowid) break; } } - if (m != NULL) { - pkt = m; + if (__probable(m != NULL)) { + CLASSQ_PKT_INIT_MBUF(pkt, m); pkt_len = m_length(m); } break; @@ -228,9 +231,10 @@ _getq_flow_or_scidx(class_queue_t *q, u_int32_t val, boolean_t isflowid) default: VERIFY(0); /* NOTREACHED */ + __builtin_unreachable(); } - if (pkt != NULL) { + if (pkt->cp_mbuf != NULL) { VERIFY(qlen(q) > 0); qlen(q)--; @@ -241,36 +245,38 @@ _getq_flow_or_scidx(class_queue_t *q, u_int32_t val, boolean_t isflowid) qsize(q) = 0; } } - - return pkt; } /* get a packet of a specific flow beginning from the head of the queue */ -void * -_getq_flow(class_queue_t *q, u_int32_t flow) +void +_getq_flow(class_queue_t *q, classq_pkt_t *pkt, u_int32_t flow) { - return _getq_flow_or_scidx(q, flow, TRUE); + return _getq_flow_or_scidx(q, pkt, flow, TRUE); } /* Get a packet whose MBUF_SCIDX() < scidx from head of queue */ -void * -_getq_scidx_lt(class_queue_t *q, u_int32_t scidx) +void +_getq_scidx_lt(class_queue_t *q, classq_pkt_t *pkt, u_int32_t scidx) { - return _getq_flow_or_scidx(q, scidx, FALSE); + return _getq_flow_or_scidx(q, pkt, scidx, FALSE); } /* get all packets (chained) starting from the head of the queue */ -void * -_getq_all(class_queue_t *q, void **last, u_int32_t *qlenp, - u_int64_t *qsizep) +void +_getq_all(class_queue_t *q, classq_pkt_t *first, classq_pkt_t *last, + u_int32_t *qlenp, u_int64_t *qsizep) { - void *pkt = NULL; - switch (qptype(q)) { case QP_MBUF: - pkt = MBUFQ_FIRST(&qmbufq(q)); + first->cp_mbuf = MBUFQ_FIRST(&qmbufq(q)); + if (__probable(first->cp_mbuf != NULL)) { + CLASSQ_PKT_INIT_MBUF(first, first->cp_mbuf); + } if (last != NULL) { - *last = MBUFQ_LAST(&qmbufq(q)); + last->cp_mbuf = MBUFQ_LAST(&qmbufq(q)); + if (__probable(last->cp_mbuf != NULL)) { + CLASSQ_PKT_INIT_MBUF(last, last->cp_mbuf); + } } MBUFQ_INIT(&qmbufq(q)); break; @@ -279,6 +285,7 @@ _getq_all(class_queue_t *q, void **last, u_int32_t *qlenp, default: VERIFY(0); /* NOTREACHED */ + __builtin_unreachable(); } if (qlenp != NULL) { @@ -290,8 +297,6 @@ _getq_all(class_queue_t *q, void **last, u_int32_t *qlenp, qlen(q) = 0; qsize(q) = 0; - - return pkt; } static inline struct mbuf * @@ -335,22 +340,22 @@ _getq_tail_mbuf(class_queue_t *q) } /* drop a packet at the tail of the queue */ -void * -_getq_tail(class_queue_t *q) +void +_getq_tail(class_queue_t *q, classq_pkt_t *pkt) { - void *t = NULL; - switch (qptype(q)) { case QP_MBUF: - t = _getq_tail_mbuf(q); + pkt->cp_mbuf = _getq_tail_mbuf(q); + if (__probable(pkt->cp_mbuf != NULL)) { + CLASSQ_PKT_INIT_MBUF(pkt, pkt->cp_mbuf); + } break; default: VERIFY(0); /* NOTREACHED */ + __builtin_unreachable(); } - - return t; } static inline struct mbuf * @@ -415,22 +420,22 @@ _getq_random_mbuf(class_queue_t *q) } /* randomly select a packet in the queue */ -void * -_getq_random(class_queue_t *q) +void +_getq_random(class_queue_t *q, classq_pkt_t *pkt) { - void *r = NULL; - switch (qptype(q)) { case QP_MBUF: - r = _getq_random_mbuf(q); + pkt->cp_mbuf = _getq_random_mbuf(q); + if (__probable(pkt->cp_mbuf != NULL)) { + CLASSQ_PKT_INIT_MBUF(pkt, pkt->cp_mbuf); + } break; default: VERIFY(0); /* NOTREACHED */ + __builtin_unreachable(); } - - return r; } static inline void @@ -445,12 +450,13 @@ _removeq_mbuf(class_queue_t *q, struct mbuf *m) } if (m0 != m) { - while (MBUFQ_NEXT(m0) != m) { - if (m0 == NULL) { - return; - } + while (m0 != NULL && MBUFQ_NEXT(m0) != m) { m0 = MBUFQ_NEXT(m0); } + if (m0 == NULL) { + return; + } + mtail = &MBUFQ_NEXT(m0); } else { mtail = &MBUFQ_FIRST(head); @@ -476,16 +482,18 @@ _removeq_mbuf(class_queue_t *q, struct mbuf *m) /* remove a packet from the queue */ void -_removeq(class_queue_t *q, void *pkt) +_removeq(class_queue_t *q, classq_pkt_t *pkt) { switch (qptype(q)) { case QP_MBUF: - _removeq_mbuf(q, pkt); + ASSERT(pkt->cp_ptype == QP_MBUF); + _removeq_mbuf(q, pkt->cp_mbuf); break; default: VERIFY(0); /* NOTREACHED */ + __builtin_unreachable(); } } @@ -545,6 +553,7 @@ _flushq_flow_mbuf(class_queue_t *q, u_int32_t flow, u_int32_t *cnt, } } + void _flushq_flow(class_queue_t *q, u_int32_t flow, u_int32_t *cnt, u_int32_t *len) { @@ -556,5 +565,6 @@ _flushq_flow(class_queue_t *q, u_int32_t flow, u_int32_t *cnt, u_int32_t *len) default: VERIFY(0); /* NOTREACHED */ + __builtin_unreachable(); } } diff --git a/bsd/net/classq/classq.h b/bsd/net/classq/classq.h index 93ded92bc..33d64b75d 100644 --- a/bsd/net/classq/classq.h +++ b/bsd/net/classq/classq.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2007-2016 Apple Inc. All rights reserved. + * Copyright (c) 2007-2018 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -80,6 +80,25 @@ typedef enum classq_pkt_type { QP_MBUF, /* mbuf packet */ } classq_pkt_type_t; +/* + * Packet + */ +typedef struct classq_pkt { + union { + struct mbuf *cp_mbuf; /* mbuf packet */ + }; + classq_pkt_type_t cp_ptype; +} classq_pkt_t; + +#define CLASSQ_PKT_INITIALIZER(_p) \ + (classq_pkt_t){ .cp_mbuf = NULL, .cp_ptype = QP_INVALID } + +#define CLASSQ_PKT_INIT_MBUF(_p, _m) do { \ + (_p)->cp_ptype = QP_MBUF; \ + (_p)->cp_mbuf = (_m); \ +} while (0) + + /* * Packet Queue types */ @@ -168,15 +187,17 @@ extern u_int32_t classq_verbose; SYSCTL_DECL(_net_classq); extern void _qinit(class_queue_t *, int, int, classq_pkt_type_t); -extern void _addq(class_queue_t *, void *); -extern void _addq_multi(class_queue_t *, void *, void *, u_int32_t, u_int32_t); -extern void *_getq(class_queue_t *); -extern void *_getq_all(class_queue_t *, void **, u_int32_t *, u_int64_t *); -extern void *_getq_tail(class_queue_t *); -extern void *_getq_random(class_queue_t *); -extern void *_getq_flow(class_queue_t *, u_int32_t); -extern void *_getq_scidx_lt(class_queue_t *, u_int32_t); -extern void _removeq(class_queue_t *, void *); +extern void _addq(class_queue_t *, classq_pkt_t *); +extern void _addq_multi(class_queue_t *, classq_pkt_t *, classq_pkt_t *, + u_int32_t, u_int32_t); +extern void _getq(class_queue_t *, classq_pkt_t *); +extern void _getq_all(class_queue_t *, classq_pkt_t *, classq_pkt_t *, + u_int32_t *, u_int64_t *); +extern void _getq_tail(class_queue_t *, classq_pkt_t *); +extern void _getq_random(class_queue_t *, classq_pkt_t *); +extern void _getq_flow(class_queue_t *, classq_pkt_t *, u_int32_t); +extern void _getq_scidx_lt(class_queue_t *, classq_pkt_t *, u_int32_t); +extern void _removeq(class_queue_t *, classq_pkt_t *); extern void _flushq(class_queue_t *); extern void _flushq_flow(class_queue_t *, u_int32_t, u_int32_t *, u_int32_t *); diff --git a/bsd/net/classq/classq_fq_codel.c b/bsd/net/classq/classq_fq_codel.c index 75a568d2c..912302beb 100644 --- a/bsd/net/classq/classq_fq_codel.c +++ b/bsd/net/classq/classq_fq_codel.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2017 Apple Inc. All rights reserved. + * Copyright (c) 2016-2018 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -69,6 +69,7 @@ fq_codel_init(void) if (flowq_cache == NULL) { panic("%s: failed to allocate flowq_cache", __func__); /* NOTREACHED */ + __builtin_unreachable(); } } @@ -130,12 +131,13 @@ void fq_head_drop(fq_if_t *fqs, fq_t *fq) { pktsched_pkt_t pkt; - uint32_t *pkt_flags; + volatile uint32_t *pkt_flags; uint64_t *pkt_timestamp; struct ifclassq *ifq = fqs->fqs_ifq; _PKTSCHED_PKT_INIT(&pkt); - if (fq_getq_flow_internal(fqs, fq, &pkt) == NULL) { + fq_getq_flow_internal(fqs, fq, &pkt); + if (pkt.pktsched_pkt_mbuf == NULL) { return; } @@ -143,8 +145,14 @@ fq_head_drop(fq_if_t *fqs, fq_t *fq) NULL, NULL); *pkt_timestamp = 0; - if (pkt.pktsched_ptype == QP_MBUF) { + switch (pkt.pktsched_ptype) { + case QP_MBUF: *pkt_flags &= ~PKTF_PRIV_GUARDED; + break; + default: + VERIFY(0); + /* NOTREACHED */ + __builtin_unreachable(); } IFCQ_DROP_ADD(ifq, 1, pktsched_get_pkt_len(&pkt)); @@ -159,17 +167,23 @@ fq_addq(fq_if_t *fqs, pktsched_pkt_t *pkt, fq_if_classq_t *fq_cl) u_int64_t now; fq_t *fq = NULL; uint64_t *pkt_timestamp; - uint32_t *pkt_flags; + volatile uint32_t *pkt_flags; uint32_t pkt_flowid, pkt_tx_start_seq; uint8_t pkt_proto, pkt_flowsrc; pktsched_get_pkt_vars(pkt, &pkt_flags, &pkt_timestamp, &pkt_flowid, &pkt_flowsrc, &pkt_proto, &pkt_tx_start_seq); - if (pkt->pktsched_ptype == QP_MBUF) { + switch (pkt->pktsched_ptype) { + case QP_MBUF: /* See comments in */ VERIFY(!(*pkt_flags & PKTF_PRIV_GUARDED)); *pkt_flags |= PKTF_PRIV_GUARDED; + break; + default: + VERIFY(0); + /* NOTREACHED */ + __builtin_unreachable(); } if (*pkt_timestamp > 0) { @@ -200,9 +214,10 @@ fq_addq(fq_if_t *fqs, pktsched_pkt_t *pkt, fq_if_classq_t *fq_cl) fc_adv = 1; /* * If the flow is suspended or it is not - * TCP, drop the packet + * TCP/QUIC, drop the packet */ - if (pkt_proto != IPPROTO_TCP) { + if ((pkt_proto != IPPROTO_TCP) && + (pkt_proto != IPPROTO_QUIC)) { droptype = DTYPE_EARLY; fq_cl->fcl_stat.fcl_drop_early++; } @@ -312,20 +327,21 @@ fq_addq(fq_if_t *fqs, pktsched_pkt_t *pkt, fq_if_classq_t *fq_cl) return ret; } -void * +void fq_getq_flow_internal(fq_if_t *fqs, fq_t *fq, pktsched_pkt_t *pkt) { - void *p; + classq_pkt_t p = CLASSQ_PKT_INITIALIZER(p); uint32_t plen; fq_if_classq_t *fq_cl; struct ifclassq *ifq = fqs->fqs_ifq; - fq_dequeue(fq, p); - if (p == NULL) { - return NULL; + fq_dequeue(fq, &p); + if (p.cp_ptype == QP_INVALID) { + VERIFY(p.cp_mbuf == NULL); + return; } - pktsched_pkt_encap(pkt, fq->fq_ptype, p); + pktsched_pkt_encap(pkt, &p); plen = pktsched_get_pkt_len(pkt); VERIFY(fq->fq_bytes >= plen); @@ -341,24 +357,23 @@ fq_getq_flow_internal(fq_if_t *fqs, fq_t *fq, pktsched_pkt_t *pkt) if (fq_empty(fq)) { fq->fq_getqtime = 0; } - - return p; } -void * +void fq_getq_flow(fq_if_t *fqs, fq_t *fq, pktsched_pkt_t *pkt) { - void *p; fq_if_classq_t *fq_cl; u_int64_t now; int64_t qdelay = 0; struct timespec now_ts; - uint32_t *pkt_flags, pkt_tx_start_seq; + volatile uint32_t *pkt_flags; + uint32_t pkt_tx_start_seq; uint64_t *pkt_timestamp; - p = fq_getq_flow_internal(fqs, fq, pkt); - if (p == NULL) { - return NULL; + fq_getq_flow_internal(fqs, fq, pkt); + if (pkt->pktsched_ptype == QP_INVALID) { + VERIFY(pkt->pktsched_pkt_mbuf == NULL); + return; } pktsched_get_pkt_vars(pkt, &pkt_flags, &pkt_timestamp, NULL, NULL, @@ -385,8 +400,6 @@ fq_getq_flow(fq_if_t *fqs, fq_t *fq, pktsched_pkt_t *pkt) } else { FQ_CLEAR_DELAY_HIGH(fq); } - - /* Reset measured queue delay and update time */ fq->fq_updatetime = now + fqs->fqs_update_interval; fq->fq_min_qdelay = 0; @@ -407,9 +420,13 @@ fq_getq_flow(fq_if_t *fqs, fq_t *fq, pktsched_pkt_t *pkt) fq_if_is_flow_heavy(fqs, fq); *pkt_timestamp = 0; - if (pkt->pktsched_ptype == QP_MBUF) { + switch (pkt->pktsched_ptype) { + case QP_MBUF: *pkt_flags &= ~PKTF_PRIV_GUARDED; + break; + default: + VERIFY(0); + /* NOTREACHED */ + __builtin_unreachable(); } - - return p; } diff --git a/bsd/net/classq/classq_fq_codel.h b/bsd/net/classq/classq_fq_codel.h index 29b81a9db..582e4a899 100644 --- a/bsd/net/classq/classq_fq_codel.h +++ b/bsd/net/classq/classq_fq_codel.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2017 Apple Inc. All rights reserved. + * Copyright (c) 2016-2018 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -74,12 +74,13 @@ typedef struct flowq { #define fq_empty(_q) MBUFQ_EMPTY(&(_q)->fq_mbufq) -#define fq_enqueue(_q, _p) MBUFQ_ENQUEUE(&(_q)->fq_mbufq, (mbuf_t)_p) +#define fq_enqueue(_q, _p) MBUFQ_ENQUEUE(&(_q)->fq_mbufq, _p.cp_mbuf) #define fq_dequeue(_q, _p) do { \ - mbuf_t _m; \ - MBUFQ_DEQUEUE(&(_q)->fq_mbufq, _m); \ - (_p) = _m; \ + MBUFQ_DEQUEUE(&(_q)->fq_mbufq, (_p)->cp_mbuf); \ + if (__probable((_p)->cp_mbuf != NULL)) { \ + CLASSQ_PKT_INIT_MBUF((_p), (_p)->cp_mbuf); \ + } \ } while (0) struct fq_codel_sched_data; @@ -92,9 +93,9 @@ extern fq_t *fq_alloc(classq_pkt_type_t); extern void fq_destroy(fq_t *); extern int fq_addq(struct fq_codel_sched_data *, pktsched_pkt_t *, struct fq_if_classq *); -extern void *fq_getq_flow(struct fq_codel_sched_data *, fq_t *, +extern void fq_getq_flow(struct fq_codel_sched_data *, fq_t *, pktsched_pkt_t *); -extern void *fq_getq_flow_internal(struct fq_codel_sched_data *, +extern void fq_getq_flow_internal(struct fq_codel_sched_data *, fq_t *, pktsched_pkt_t *); extern void fq_head_drop(struct fq_codel_sched_data *, fq_t *); diff --git a/bsd/net/classq/classq_sfb.c b/bsd/net/classq/classq_sfb.c index 73f0ca03d..1e5963e5a 100644 --- a/bsd/net/classq/classq_sfb.c +++ b/bsd/net/classq/classq_sfb.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2011-2017 Apple Inc. All rights reserved. + * Copyright (c) 2011-2019 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -206,8 +206,6 @@ #define DEQUEUE_SPIKE(_new, _old) \ ((u_int64_t)ABS((int64_t)(_new) - (int64_t)(_old)) > ((_old) << 11)) -#define ABS(v) (((v) > 0) ? (v) : -(v)) - #define SFB_ZONE_MAX 32 /* maximum elements in zone */ #define SFB_ZONE_NAME "classq_sfb" /* zone name */ @@ -301,13 +299,13 @@ struct sfb_time_tbl { }; static struct sfb_time_tbl sfb_ttbl[] = { - { 1 * MBPS, HOLDTIME_BASE * 1000, PBOXTIME_BASE * 1000 }, - { 10 * MBPS, HOLDTIME_BASE * 100, PBOXTIME_BASE * 100 }, - { 100 * MBPS, HOLDTIME_BASE * 10, PBOXTIME_BASE * 10 }, - { 1 * GBPS, HOLDTIME_BASE, PBOXTIME_BASE }, - { 10 * GBPS, HOLDTIME_BASE / 10, PBOXTIME_BASE / 10 }, - { 100 * GBPS, HOLDTIME_BASE / 100, PBOXTIME_BASE / 100 }, - { 0, 0, 0 } + { .speed = 1 * MBPS, .holdtime = HOLDTIME_BASE * 1000, .pboxtime = PBOXTIME_BASE * 1000}, + { .speed = 10 * MBPS, .holdtime = HOLDTIME_BASE * 100, .pboxtime = PBOXTIME_BASE * 100 }, + { .speed = 100 * MBPS, .holdtime = HOLDTIME_BASE * 10, .pboxtime = PBOXTIME_BASE * 10 }, + { .speed = 1 * GBPS, .holdtime = HOLDTIME_BASE, .pboxtime = PBOXTIME_BASE }, + { .speed = 10 * GBPS, .holdtime = HOLDTIME_BASE / 10, .pboxtime = PBOXTIME_BASE / 10 }, + { .speed = 100 * GBPS, .holdtime = HOLDTIME_BASE / 100, .pboxtime = PBOXTIME_BASE / 100 }, + { .speed = 0, .holdtime = 0, .pboxtime = 0 } }; void @@ -326,7 +324,7 @@ sfb_init(void) zone_change(sfb_zone, Z_EXPAND, TRUE); zone_change(sfb_zone, Z_CALLERACCT, TRUE); - sfb_bins_size = sizeof(*((struct sfb *)0)->sfb_bins); + sfb_bins_size = sizeof(struct sfb_bins); sfb_bins_zone = zinit(sfb_bins_size, SFB_BINS_ZONE_MAX * sfb_bins_size, 0, SFB_BINS_ZONE_NAME); if (sfb_bins_zone == NULL) { @@ -336,7 +334,7 @@ sfb_init(void) zone_change(sfb_bins_zone, Z_EXPAND, TRUE); zone_change(sfb_bins_zone, Z_CALLERACCT, TRUE); - sfb_fcl_size = sizeof(*((struct sfb *)0)->sfb_fc_lists); + sfb_fcl_size = sizeof(struct sfb_fcl); sfb_fcl_zone = zinit(sfb_fcl_size, SFB_FCL_ZONE_MAX * sfb_fcl_size, 0, SFB_FCL_ZONE_NAME); if (sfb_fcl_zone == NULL) { @@ -722,7 +720,7 @@ static int sfb_penalize(struct sfb *sp, uint32_t pkt_sfb_hash, uint32_t *pkt_sfb_flags, struct timespec *now) { - struct timespec delta = { 0, 0 }; + struct timespec delta = { .tv_sec = 0, .tv_nsec = 0 }; uint8_t *pkt_sfb_hash8 = (uint8_t *)&pkt_sfb_hash; /* If minimum pmark of current bins is < SFB_PMARK_TH, we're done */ @@ -1149,7 +1147,7 @@ sfb_addq(struct sfb *sp, class_queue_t *q, pktsched_pkt_t *pkt, uint16_t *pkt_sfb_hash16; uint32_t *pkt_sfb_flags; uint32_t pkt_flowid; - uint32_t *pkt_flags; + volatile uint32_t *pkt_flags; uint8_t pkt_proto, pkt_flowsrc; s = sp->sfb_current; @@ -1160,10 +1158,16 @@ sfb_addq(struct sfb *sp, class_queue_t *q, pktsched_pkt_t *pkt, pkt_sfb_hash = pktsched_get_pkt_sfb_vars(pkt, &pkt_sfb_flags); pkt_sfb_hash16 = (uint16_t *)pkt_sfb_hash; - if (pkt->pktsched_ptype == QP_MBUF) { + switch (pkt->pktsched_ptype) { + case QP_MBUF: /* See comments in */ VERIFY(!(*pkt_flags & PKTF_PRIV_GUARDED)); *pkt_flags |= PKTF_PRIV_GUARDED; + break; + default: + VERIFY(0); + /* NOTREACHED */ + __builtin_unreachable(); } if (*pkt_timestamp > 0) { @@ -1294,7 +1298,7 @@ sfb_addq(struct sfb *sp, class_queue_t *q, pktsched_pkt_t *pkt, /* if successful enqueue this packet, else drop it */ if (droptype == DTYPE_NODROP) { VERIFY(pkt->pktsched_ptype == qptype(q)); - _addq(q, pkt->pktsched_pkt); + _addq(q, &pkt->pktsched_pkt); } else { IFCQ_CONVERT_LOCK(&sp->sfb_ifp->if_snd); return (ret != CLASSQEQ_SUCCESS) ? ret : CLASSQEQ_DROP; @@ -1316,12 +1320,11 @@ sfb_getq_flow(struct sfb *sp, class_queue_t *q, u_int32_t flow, boolean_t purge, pktsched_pkt_t *pkt) { struct timespec now; - classq_pkt_type_t ptype; uint64_t *pkt_timestamp; - uint32_t *pkt_flags; + volatile uint32_t *pkt_flags; uint32_t *pkt_sfb_flags; uint32_t *pkt_sfb_hash; - void *p; + classq_pkt_t p = CLASSQ_PKT_INITIALIZER(p); if (!purge && (sp->sfb_flags & SFBF_SUSPENDED)) { return NULL; @@ -1330,22 +1333,33 @@ sfb_getq_flow(struct sfb *sp, class_queue_t *q, u_int32_t flow, boolean_t purge, nanouptime(&now); /* flow of 0 means head of queue */ - if ((p = ((flow == 0) ? _getq(q) : _getq_flow(q, flow))) == NULL) { + if (flow == 0) { + _getq(q, &p); + } else { + _getq_flow(q, &p, flow); + } + + if (p.cp_ptype == QP_INVALID) { if (!purge) { net_timerclear(&sp->sfb_getqtime); } return NULL; } - ptype = qptype(q); - pktsched_pkt_encap(pkt, ptype, p); + pktsched_pkt_encap(pkt, &p); pktsched_get_pkt_vars(pkt, &pkt_flags, &pkt_timestamp, NULL, NULL, NULL, NULL); pkt_sfb_hash = pktsched_get_pkt_sfb_vars(pkt, &pkt_sfb_flags); /* See comments in */ - if (ptype == QP_MBUF) { + switch (p.cp_ptype) { + case QP_MBUF: VERIFY(*pkt_flags & PKTF_PRIV_GUARDED); + break; + default: + VERIFY(0); + /* NOTREACHED */ + __builtin_unreachable(); } if (!purge) { @@ -1424,9 +1438,15 @@ sfb_getq_flow(struct sfb *sp, class_queue_t *q, u_int32_t flow, boolean_t purge, &now, qsize(q)); } - /* See comments in */ - if (ptype == QP_MBUF) { + switch (p.cp_ptype) { + case QP_MBUF: + /* See comments in */ *pkt_flags &= ~PKTF_PRIV_GUARDED; + break; + default: + VERIFY(0); + /* NOTREACHED */ + __builtin_unreachable(); } /* @@ -1440,7 +1460,7 @@ sfb_getq_flow(struct sfb *sp, class_queue_t *q, u_int32_t flow, boolean_t purge, net_timerclear(&sp->sfb_update_time); net_timerclear(&sp->sfb_getqtime); } - return p; + return pkt->pktsched_pkt_mbuf; } void diff --git a/bsd/net/classq/classq_subr.c b/bsd/net/classq/classq_subr.c index 1256b3c3e..d5af79b4e 100644 --- a/bsd/net/classq/classq_subr.c +++ b/bsd/net/classq/classq_subr.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2011-2017 Apple Inc. All rights reserved. + * Copyright (c) 2011-2019 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -49,10 +49,10 @@ static errno_t ifclassq_dequeue_common(struct ifclassq *, mbuf_svc_class_t, - u_int32_t, u_int32_t, void **, void **, u_int32_t *, u_int32_t *, - boolean_t, classq_pkt_type_t *); -static void *ifclassq_tbr_dequeue_common(struct ifclassq *, mbuf_svc_class_t, - boolean_t, classq_pkt_type_t *); + u_int32_t, u_int32_t, classq_pkt_t *, classq_pkt_t *, u_int32_t *, + u_int32_t *, boolean_t); +static void ifclassq_tbr_dequeue_common(struct ifclassq *, mbuf_svc_class_t, + boolean_t, classq_pkt_t *); static u_int64_t ifclassq_target_qdelay = 0; SYSCTL_QUAD(_net_classq, OID_AUTO, target_qdelay, CTLFLAG_RW | CTLFLAG_LOCKED, @@ -141,7 +141,7 @@ ifclassq_teardown(struct ifnet *ifp) if (IFCQ_IS_READY(ifq)) { if (IFCQ_TBR_IS_ENABLED(ifq)) { - struct tb_profile tb = { 0, 0, 0 }; + struct tb_profile tb = { .rate = 0, .percent = 0, .depth = 0 }; (void) ifclassq_tbr_set(ifq, &tb, FALSE); } (void) pktsched_teardown(ifq); @@ -244,20 +244,21 @@ ifclassq_get_len(struct ifclassq *ifq, mbuf_svc_class_t sc, u_int32_t *packets, } IFCQ_UNLOCK(ifq); + return err; } inline void ifclassq_set_packet_metadata(struct ifclassq *ifq, struct ifnet *ifp, - void *p, classq_pkt_type_t ptype) + classq_pkt_t *p) { if (!IFNET_IS_CELLULAR(ifp)) { return; } - switch (ptype) { + switch (p->cp_ptype) { case QP_MBUF: { - struct mbuf *m = p; + struct mbuf *m = p->cp_mbuf; m->m_pkthdr.pkt_flags |= PKTF_VALID_UNSENT_DATA; m->m_pkthdr.bufstatus_if = IFCQ_BYTES(ifq); m->m_pkthdr.bufstatus_sndbuf = ifp->if_sndbyte_unsent; @@ -268,16 +269,16 @@ ifclassq_set_packet_metadata(struct ifclassq *ifq, struct ifnet *ifp, default: VERIFY(0); /* NOTREACHED */ + __builtin_unreachable(); } } errno_t -ifclassq_enqueue(struct ifclassq *ifq, void *p, classq_pkt_type_t ptype, - boolean_t *pdrop) +ifclassq_enqueue(struct ifclassq *ifq, classq_pkt_t *p, boolean_t *pdrop) { errno_t err; - switch (ptype) { + switch (p->cp_ptype) { case QP_MBUF: IFCQ_LOCK_SPIN(ifq); break; @@ -287,43 +288,41 @@ ifclassq_enqueue(struct ifclassq *ifq, void *p, classq_pkt_type_t ptype, break; } - IFCQ_ENQUEUE(ifq, p, ptype, err, pdrop); + IFCQ_ENQUEUE(ifq, p, err, pdrop); IFCQ_UNLOCK(ifq); return err; } errno_t ifclassq_dequeue(struct ifclassq *ifq, u_int32_t pkt_limit, - u_int32_t byte_limit, void **head, void **tail, - u_int32_t *cnt, u_int32_t *len, classq_pkt_type_t *ptype) + u_int32_t byte_limit, classq_pkt_t *head, classq_pkt_t *tail, + u_int32_t *cnt, u_int32_t *len) { return ifclassq_dequeue_common(ifq, MBUF_SC_UNSPEC, pkt_limit, - byte_limit, head, tail, cnt, len, FALSE, ptype); + byte_limit, head, tail, cnt, len, FALSE); } errno_t ifclassq_dequeue_sc(struct ifclassq *ifq, mbuf_svc_class_t sc, - u_int32_t pkt_limit, u_int32_t byte_limit, void **head, void **tail, - u_int32_t *cnt, u_int32_t *len, classq_pkt_type_t *ptype) + u_int32_t pkt_limit, u_int32_t byte_limit, classq_pkt_t *head, + classq_pkt_t *tail, u_int32_t *cnt, u_int32_t *len) { return ifclassq_dequeue_common(ifq, sc, pkt_limit, byte_limit, - head, tail, cnt, len, TRUE, ptype); + head, tail, cnt, len, TRUE); } static errno_t -ifclassq_dequeue_common(struct ifclassq *ifq, mbuf_svc_class_t sc, - u_int32_t pkt_limit, u_int32_t byte_limit, void **head, - void **tail, u_int32_t *cnt, u_int32_t *len, boolean_t drvmgt, - classq_pkt_type_t *ptype) +ifclassq_dequeue_common_default(struct ifclassq *ifq, mbuf_svc_class_t sc, + u_int32_t pkt_limit, u_int32_t byte_limit, classq_pkt_t *head, + classq_pkt_t *tail, u_int32_t *cnt, u_int32_t *len, boolean_t drvmgt) { struct ifnet *ifp = ifq->ifcq_ifp; u_int32_t i = 0, l = 0, lock_spin = 1; - void **first, *last; + classq_pkt_t first = CLASSQ_PKT_INITIALIZER(first); + classq_pkt_t last = CLASSQ_PKT_INITIALIZER(last); VERIFY(!drvmgt || MBUF_VALID_SC(sc)); - *ptype = 0; - if (IFCQ_TBR_IS_ENABLED(ifq)) { goto dequeue_loop; @@ -342,10 +341,10 @@ ifclassq_dequeue_common(struct ifclassq *ifq, mbuf_svc_class_t sc, IFCQ_LOCK(ifq); } err = ifq->ifcq_dequeue_sc_multi(ifq, sc, pkt_limit, - byte_limit, head, tail, cnt, len, ptype); + byte_limit, head, tail, cnt, len); IFCQ_UNLOCK(ifq); - if (err == 0 && (*head) == NULL) { + if (err == 0 && head->cp_mbuf == NULL) { err = EAGAIN; } return err; @@ -359,19 +358,16 @@ ifclassq_dequeue_common(struct ifclassq *ifq, mbuf_svc_class_t sc, } err = ifq->ifcq_dequeue_multi(ifq, pkt_limit, byte_limit, - head, tail, cnt, len, ptype); + head, tail, cnt, len); IFCQ_UNLOCK(ifq); - if (err == 0 && (*head) == NULL) { + if (err == 0 && head->cp_mbuf == NULL) { err = EAGAIN; } return err; } dequeue_loop: - *head = NULL; - first = &(*head); - last = NULL; if (lock_spin) { IFCQ_LOCK_SPIN(ifq); @@ -380,42 +376,46 @@ dequeue_loop: } while (i < pkt_limit && l < byte_limit) { - classq_pkt_type_t tmp_ptype; if (drvmgt) { if (IFCQ_TBR_IS_ENABLED(ifq)) { - IFCQ_TBR_DEQUEUE_SC(ifq, sc, *head, &tmp_ptype); + IFCQ_TBR_DEQUEUE_SC(ifq, sc, head); } else { - IFCQ_DEQUEUE_SC(ifq, sc, *head, &tmp_ptype); + IFCQ_DEQUEUE_SC(ifq, sc, head); } } else { if (IFCQ_TBR_IS_ENABLED(ifq)) { - IFCQ_TBR_DEQUEUE(ifq, *head, &tmp_ptype); + IFCQ_TBR_DEQUEUE(ifq, head); } else { - IFCQ_DEQUEUE(ifq, *head, &tmp_ptype); + IFCQ_DEQUEUE(ifq, head); } } - if (*head == NULL) { + if (head->cp_mbuf == NULL) { break; } - switch (tmp_ptype) { + if (first.cp_mbuf == NULL) { + first = *head; + } + + switch (head->cp_ptype) { case QP_MBUF: - (*((mbuf_t *)head))->m_nextpkt = NULL; - last = *head; - l += (*((mbuf_t *)head))->m_pkthdr.len; - ifclassq_set_packet_metadata(ifq, ifp, (*head), - QP_MBUF); - head = (void **)&(*((mbuf_t *)head))->m_nextpkt; + head->cp_mbuf->m_nextpkt = NULL; + l += head->cp_mbuf->m_pkthdr.len; + ifclassq_set_packet_metadata(ifq, ifp, head); + if (last.cp_mbuf != NULL) { + last.cp_mbuf->m_nextpkt = head->cp_mbuf; + } break; default: VERIFY(0); /* NOTREACHED */ + __builtin_unreachable(); } - *ptype = tmp_ptype; + last = *head; i++; } @@ -431,7 +431,17 @@ dequeue_loop: *len = l; } - return (*first != NULL) ? 0 : EAGAIN; + *head = first; + return (first.cp_mbuf != NULL) ? 0 : EAGAIN; +} + +static errno_t +ifclassq_dequeue_common(struct ifclassq *ifq, mbuf_svc_class_t sc, + u_int32_t pkt_limit, u_int32_t byte_limit, classq_pkt_t *head, + classq_pkt_t *tail, u_int32_t *cnt, u_int32_t *len, boolean_t drvmgt) +{ + return ifclassq_dequeue_common_default(ifq, sc, + pkt_limit, byte_limit, head, tail, cnt, len, drvmgt); } void @@ -570,25 +580,24 @@ ifclassq_ev2str(cqev_t ev) #define TBR_SCALE(x) ((int64_t)(x) << TBR_SHIFT) #define TBR_UNSCALE(x) ((x) >> TBR_SHIFT) -void * -ifclassq_tbr_dequeue(struct ifclassq *ifq, classq_pkt_type_t *ptype) +void +ifclassq_tbr_dequeue(struct ifclassq *ifq, classq_pkt_t *pkt) { - return ifclassq_tbr_dequeue_common(ifq, MBUF_SC_UNSPEC, FALSE, ptype); + ifclassq_tbr_dequeue_common(ifq, MBUF_SC_UNSPEC, FALSE, pkt); } -void * +void ifclassq_tbr_dequeue_sc(struct ifclassq *ifq, mbuf_svc_class_t sc, - classq_pkt_type_t *ptype) + classq_pkt_t *pkt) { - return ifclassq_tbr_dequeue_common(ifq, sc, TRUE, ptype); + ifclassq_tbr_dequeue_common(ifq, sc, TRUE, pkt); } -static void * +static void ifclassq_tbr_dequeue_common(struct ifclassq *ifq, mbuf_svc_class_t sc, - boolean_t drvmgt, classq_pkt_type_t *ptype) + boolean_t drvmgt, classq_pkt_t *pkt) { struct tb_regulator *tbr; - void *p; int64_t interval; u_int64_t now; @@ -597,6 +606,7 @@ ifclassq_tbr_dequeue_common(struct ifclassq *ifq, mbuf_svc_class_t sc, VERIFY(!drvmgt || MBUF_VALID_SC(sc)); VERIFY(IFCQ_TBR_IS_ENABLED(ifq)); + *pkt = CLASSQ_PKT_INITIALIZER(*pkt); tbr = &ifq->ifcq_tbr; /* update token only when it is negative */ if (tbr->tbr_token <= 0) { @@ -614,7 +624,7 @@ ifclassq_tbr_dequeue_common(struct ifclassq *ifq, mbuf_svc_class_t sc, } /* if token is still negative, don't allow dequeue */ if (tbr->tbr_token <= 0) { - return NULL; + return; } /* @@ -622,15 +632,15 @@ ifclassq_tbr_dequeue_common(struct ifclassq *ifq, mbuf_svc_class_t sc, * ifcq_drain count is adjusted by the caller. */ if (drvmgt) { - IFCQ_DEQUEUE_SC(ifq, sc, p, ptype); + IFCQ_DEQUEUE_SC(ifq, sc, pkt); } else { - IFCQ_DEQUEUE(ifq, p, ptype); + IFCQ_DEQUEUE(ifq, pkt); } - if (p != NULL) { - switch (*ptype) { + if (pkt->cp_mbuf != NULL) { + switch (pkt->cp_ptype) { case QP_MBUF: - tbr->tbr_token -= TBR_SCALE(m_pktlen((mbuf_t)p)); + tbr->tbr_token -= TBR_SCALE(m_pktlen(pkt->cp_mbuf)); break; @@ -639,8 +649,6 @@ ifclassq_tbr_dequeue_common(struct ifclassq *ifq, mbuf_svc_class_t sc, /* NOTREACHED */ } } - - return p; } /* @@ -678,7 +686,7 @@ ifclassq_tbr_set(struct ifclassq *ifq, struct tb_profile *profile, if (rate == 0) { if (!IFCQ_TBR_IS_ENABLED(ifq)) { - return ENOENT; + return 0; } if (pktsched_verbose) { diff --git a/bsd/net/classq/if_classq.h b/bsd/net/classq/if_classq.h index 98f019796..2de9ac9b1 100644 --- a/bsd/net/classq/if_classq.h +++ b/bsd/net/classq/if_classq.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2011-2017 Apple Inc. All rights reserved. + * Copyright (c) 2011-2018 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -100,16 +100,16 @@ struct ifclassq; enum cqdq_op; enum cqrq; -typedef int (*ifclassq_enq_func)(struct ifclassq *, void *, classq_pkt_type_t, +typedef int (*ifclassq_enq_func)(struct ifclassq *, classq_pkt_t *, boolean_t *); -typedef void *(*ifclassq_deq_func)(struct ifclassq *, classq_pkt_type_t *); -typedef void *(*ifclassq_deq_sc_func)(struct ifclassq *, - mbuf_svc_class_t, classq_pkt_type_t *); +typedef void (*ifclassq_deq_func)(struct ifclassq *, classq_pkt_t *); +typedef void (*ifclassq_deq_sc_func)(struct ifclassq *, mbuf_svc_class_t, + classq_pkt_t *); typedef int (*ifclassq_deq_multi_func)(struct ifclassq *, u_int32_t, - u_int32_t, void **, void **, u_int32_t *, u_int32_t *, classq_pkt_type_t *); + u_int32_t, classq_pkt_t *, classq_pkt_t *, u_int32_t *, u_int32_t *); typedef int (*ifclassq_deq_sc_multi_func)(struct ifclassq *, - mbuf_svc_class_t, u_int32_t, u_int32_t, void **, void **, - u_int32_t *, u_int32_t *, classq_pkt_type_t *); + mbuf_svc_class_t, u_int32_t, u_int32_t, classq_pkt_t *, classq_pkt_t *, + u_int32_t *, u_int32_t *); typedef int (*ifclassq_req_func)(struct ifclassq *, enum cqrq, void *); /* @@ -249,24 +249,24 @@ struct if_ifclassq_stats { /* * For ifclassq operations */ -#define IFCQ_ENQUEUE(_ifq, _p, _t, _err, _drop) do { \ - (_err) = (*(_ifq)->ifcq_enqueue)(_ifq, _p, _t, _drop); \ +#define IFCQ_ENQUEUE(_ifq, _p, _err, _drop) do { \ + (_err) = (*(_ifq)->ifcq_enqueue)(_ifq, _p, _drop); \ } while (0) -#define IFCQ_DEQUEUE(_ifq, _p, _t) do { \ - (_p) = (*(_ifq)->ifcq_dequeue)(_ifq, _t); \ +#define IFCQ_DEQUEUE(_ifq, _p) do { \ + (*(_ifq)->ifcq_dequeue)(_ifq, _p); \ } while (0) -#define IFCQ_DEQUEUE_SC(_ifq, _sc, _p, _t) do { \ - (_p) = (*(_ifq)->ifcq_dequeue_sc)(_ifq, _sc, _t); \ +#define IFCQ_DEQUEUE_SC(_ifq, _sc, _p) do { \ + (*(_ifq)->ifcq_dequeue_sc)(_ifq, _sc, _p); \ } while (0) -#define IFCQ_TBR_DEQUEUE(_ifcq, _p, _t) do { \ - (_p) = ifclassq_tbr_dequeue(_ifcq, _t); \ +#define IFCQ_TBR_DEQUEUE(_ifcq, _p) do { \ + ifclassq_tbr_dequeue(_ifcq, _p); \ } while (0) -#define IFCQ_TBR_DEQUEUE_SC(_ifcq, _sc, _p, _t) do { \ - (_p) = ifclassq_tbr_dequeue_sc(_ifcq, _sc, _t); \ +#define IFCQ_TBR_DEQUEUE_SC(_ifcq, _sc, _p) do { \ + ifclassq_tbr_dequeue_sc(_ifcq, _sc, _p); \ } while (0) #define IFCQ_PURGE(_ifq) do { \ @@ -338,13 +338,12 @@ extern void ifclassq_set_maxlen(struct ifclassq *, u_int32_t); extern u_int32_t ifclassq_get_maxlen(struct ifclassq *); extern int ifclassq_get_len(struct ifclassq *, mbuf_svc_class_t, u_int32_t *, u_int32_t *); -extern errno_t ifclassq_enqueue(struct ifclassq *, void *, classq_pkt_type_t, - boolean_t *); +extern errno_t ifclassq_enqueue(struct ifclassq *, classq_pkt_t *, boolean_t *); extern errno_t ifclassq_dequeue(struct ifclassq *, u_int32_t, u_int32_t, - void **, void **, u_int32_t *, u_int32_t *, classq_pkt_type_t *); + classq_pkt_t *, classq_pkt_t *, u_int32_t *, u_int32_t *); extern errno_t ifclassq_dequeue_sc(struct ifclassq *, mbuf_svc_class_t, - u_int32_t, u_int32_t, void **, void **, u_int32_t *, u_int32_t *, - classq_pkt_type_t *); + u_int32_t, u_int32_t, classq_pkt_t *, classq_pkt_t *, u_int32_t *, + u_int32_t *); extern void *ifclassq_poll(struct ifclassq *, classq_pkt_type_t *); extern void *ifclassq_poll_sc(struct ifclassq *, mbuf_svc_class_t, classq_pkt_type_t *); @@ -357,14 +356,14 @@ extern int ifclassq_getqstats(struct ifclassq *, u_int32_t, void *, u_int32_t *); extern const char *ifclassq_ev2str(cqev_t); extern int ifclassq_tbr_set(struct ifclassq *, struct tb_profile *, boolean_t); -extern void *ifclassq_tbr_dequeue(struct ifclassq *, classq_pkt_type_t *); -extern void *ifclassq_tbr_dequeue_sc(struct ifclassq *, mbuf_svc_class_t, - classq_pkt_type_t *); +extern void ifclassq_tbr_dequeue(struct ifclassq *, classq_pkt_t *); +extern void ifclassq_tbr_dequeue_sc(struct ifclassq *, mbuf_svc_class_t, + classq_pkt_t *); extern void ifclassq_calc_target_qdelay(struct ifnet *ifp, u_int64_t *if_target_qdelay); extern void ifclassq_calc_update_interval(u_int64_t *update_interval); extern void ifclassq_set_packet_metadata(struct ifclassq *ifq, - struct ifnet *ifp, void *p, classq_pkt_type_t ptype); + struct ifnet *ifp, classq_pkt_t *p); extern void ifclassq_reap_caches(boolean_t); #endif /* BSD_KERNEL_PRIVATE */ diff --git a/bsd/net/content_filter.c b/bsd/net/content_filter.c index 49f16a7df..62988b66b 100644 --- a/bsd/net/content_filter.c +++ b/bsd/net/content_filter.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2013-2018 Apple Inc. All rights reserved. + * Copyright (c) 2013-2019 Apple Inc. All rights reserved. * * @APPLE_LICENSE_HEADER_START@ * @@ -312,6 +312,7 @@ #include #include +#include #include #include @@ -322,8 +323,14 @@ #include #include #include +#include +#include +#if !TARGET_OS_OSX && !defined(XNU_TARGET_OS_OSX) #define MAX_CONTENT_FILTER 2 +#else +#define MAX_CONTENT_FILTER 8 +#endif struct cfil_entry; @@ -340,6 +347,8 @@ struct content_filter { uint32_t cf_sock_count; TAILQ_HEAD(, cfil_entry) cf_sock_entries; + + cfil_crypto_state_t cf_crypto_state; }; #define CFF_ACTIVE 0x01 @@ -391,6 +400,7 @@ struct cfil_queue { */ struct cfil_entry { TAILQ_ENTRY(cfil_entry) cfe_link; + SLIST_ENTRY(cfil_entry) cfe_order_link; struct content_filter *cfe_filter; struct cfil_info *cfe_cfil_info; @@ -452,7 +462,14 @@ struct cfil_info { uint32_t cfi_op_list_ctr; uint32_t cfi_op_time[CFI_MAX_TIME_LOG_ENTRY]; /* time interval in microseconds since first event */ unsigned char cfi_op_list[CFI_MAX_TIME_LOG_ENTRY]; + union sockaddr_in_4_6 cfi_so_attach_faddr; /* faddr at the time of attach */ + union sockaddr_in_4_6 cfi_so_attach_laddr; /* laddr at the time of attach */ + int cfi_dir; + uint64_t cfi_byte_inbound_count; + uint64_t cfi_byte_outbound_count; + + boolean_t cfi_isSignatureLatest; /* Indicates if signature covers latest flow attributes */ struct cfi_buf { /* * cfi_pending_first and cfi_pending_last describe the total @@ -479,6 +496,7 @@ struct cfil_info { struct cfil_entry cfi_entries[MAX_CONTENT_FILTER]; struct cfil_hash_entry *cfi_hash_entry; + SLIST_HEAD(, cfil_entry) cfi_ordered_entries; } __attribute__((aligned(8))); #define CFIF_DROP 0x0001 /* drop action applied */ @@ -488,12 +506,16 @@ struct cfil_info { #define CFIF_RETRY_INJECT_OUT 0x0020 /* inject out failed */ #define CFIF_SHUT_WR 0x0040 /* shutdown write */ #define CFIF_SHUT_RD 0x0080 /* shutdown read */ +#define CFIF_SOCKET_CONNECTED 0x0100 /* socket is connected */ +#define CFIF_INITIAL_VERDICT 0x0200 /* received initial verdict */ #define CFI_MASK_GENCNT 0xFFFFFFFF00000000 /* upper 32 bits */ #define CFI_SHIFT_GENCNT 32 #define CFI_MASK_FLOWHASH 0x00000000FFFFFFFF /* lower 32 bits */ #define CFI_SHIFT_FLOWHASH 0 +#define CFI_ENTRY_KCUNIT(i, e) (((e) - &((i)->cfi_entries[0])) + 1) + TAILQ_HEAD(cfil_sock_head, cfil_info) cfil_sock_head; #define CFIL_QUEUE_VERIFY(x) if (cfil_debug) cfil_queue_verify(x) @@ -505,12 +527,16 @@ TAILQ_HEAD(cfil_sock_head, cfil_info) cfil_sock_head; LIST_HEAD(cfilhashhead, cfil_hash_entry); #define CFILHASHSIZE 16 #define CFIL_HASH(laddr, faddr, lport, fport) ((faddr) ^ ((laddr) >> 16) ^ (fport) ^ (lport)) -#define IS_UDP(so) (so && so->so_proto->pr_type == SOCK_DGRAM && so->so_proto->pr_protocol == IPPROTO_UDP) +#define IS_UDP(so) (so && so->so_proto && so->so_proto->pr_type == SOCK_DGRAM && so->so_proto->pr_protocol == IPPROTO_UDP) #define UNCONNECTED(inp) (inp && (((inp->inp_vflag & INP_IPV4) && (inp->inp_faddr.s_addr == INADDR_ANY)) || \ ((inp->inp_vflag & INP_IPV6) && IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr)))) #define IS_ENTRY_ATTACHED(cfil_info, kcunit) (cfil_info != NULL && (kcunit <= MAX_CONTENT_FILTER) && \ cfil_info->cfi_entries[kcunit - 1].cfe_filter != NULL) #define IS_DNS(local, remote) (check_port(local, 53) || check_port(remote, 53) || check_port(local, 5353) || check_port(remote, 5353)) +#define IS_INITIAL_TFO_DATA(so) (so && (so->so_flags1 & SOF1_PRECONNECT_DATA) && (so->so_state & SS_ISCONNECTING)) +#define NULLADDRESS(addr) ((addr.sa.sa_len == 0) || \ + (addr.sa.sa_family == AF_INET && addr.sin.sin_addr.s_addr == 0) || \ + (addr.sa.sa_family == AF_INET6 && IN6_IS_ADDR_UNSPECIFIED(&addr.sin6.sin6_addr))) /* * UDP Garbage Collection: @@ -657,6 +683,7 @@ static int cfil_action_data_pass(struct socket *, struct cfil_info *, uint32_t, uint64_t, uint64_t); static int cfil_action_drop(struct socket *, struct cfil_info *, uint32_t); static int cfil_action_bless_client(uint32_t, struct cfil_msg_hdr *); +static int cfil_action_set_crypto_key(uint32_t, struct cfil_msg_hdr *); static int cfil_dispatch_closed_event(struct socket *, struct cfil_info *, int); static int cfil_data_common(struct socket *, struct cfil_info *, int, struct sockaddr *, struct mbuf *, struct mbuf *, uint32_t); @@ -666,8 +693,8 @@ static void fill_ip_sockaddr_4_6(union sockaddr_in_4_6 *, struct in_addr, u_int16_t); static void fill_ip6_sockaddr_4_6(union sockaddr_in_4_6 *, struct in6_addr *, u_int16_t); -; -static int cfil_dispatch_attach_event(struct socket *, struct cfil_info *, uint32_t); + +static int cfil_dispatch_attach_event(struct socket *, struct cfil_info *, uint32_t, int); static void cfil_info_free(struct cfil_info *); static struct cfil_info * cfil_info_alloc(struct socket *, struct cfil_hash_entry *); static int cfil_info_attach_unit(struct socket *, uint32_t, struct cfil_info *); @@ -722,6 +749,11 @@ bool cfil_info_buffer_threshold_exceeded(struct cfil_info *); struct m_tag *cfil_udp_save_socket_state(struct cfil_info *, struct mbuf *); static void cfil_udp_gc_thread_func(void *, wait_result_t); static void cfil_info_udp_expire(void *, wait_result_t); +static bool fill_cfil_hash_entry_from_address(struct cfil_hash_entry *, bool, struct sockaddr *); +static void cfil_sock_received_verdict(struct socket *so); +static void cfil_fill_event_msg_addresses(struct cfil_hash_entry *, struct inpcb *, + union sockaddr_in_4_6 *, union sockaddr_in_4_6 *, + boolean_t, boolean_t); bool check_port(struct sockaddr *, u_short); @@ -1059,7 +1091,6 @@ cfil_info_buf_verify(struct cfi_buf *cfi_buf) CFIL_QUEUE_VERIFY(&cfi_buf->cfi_inject_q); VERIFY(cfi_buf->cfi_pending_first <= cfi_buf->cfi_pending_last); - VERIFY(cfi_buf->cfi_pending_mbcnt >= 0); } static void @@ -1312,6 +1343,11 @@ release: cfil_active_count--; cfil_rw_unlock_exclusive(&cfil_lck_rw); + if (cfc->cf_crypto_state != NULL) { + cfil_crypto_cleanup_state(cfc->cf_crypto_state); + cfc->cf_crypto_state = NULL; + } + zfree(content_filter_zone, cfc); done: if (error == 0) { @@ -1569,6 +1605,7 @@ cfil_ctl_send(kern_ctl_ref kctlref, u_int32_t kcunit, void *unitinfo, mbuf_t m, struct cfil_msg_action *action_msg; struct cfil_entry *entry; struct cfil_info *cfil_info = NULL; + unsigned int data_len = 0; CFIL_LOG(LOG_INFO, ""); @@ -1583,9 +1620,15 @@ cfil_ctl_send(kern_ctl_ref kctlref, u_int32_t kcunit, void *unitinfo, mbuf_t m, error = EINVAL; goto done; } + if (m == NULL) { + CFIL_LOG(LOG_ERR, "null mbuf"); + error = EINVAL; + goto done; + } + data_len = m_length(m); - if (m_length(m) < sizeof(struct cfil_msg_hdr)) { - CFIL_LOG(LOG_ERR, "too short %u", m_length(m)); + if (data_len < sizeof(struct cfil_msg_hdr)) { + CFIL_LOG(LOG_ERR, "too short %u", data_len); error = EINVAL; goto done; } @@ -1600,6 +1643,12 @@ cfil_ctl_send(kern_ctl_ref kctlref, u_int32_t kcunit, void *unitinfo, mbuf_t m, error = EINVAL; goto done; } + if (msghdr->cfm_len > data_len) { + CFIL_LOG(LOG_ERR, "bad length %u", msghdr->cfm_len); + error = EINVAL; + goto done; + } + /* Validate action operation */ switch (msghdr->cfm_op) { case CFM_OP_DATA_UPDATE: @@ -1620,6 +1669,17 @@ cfil_ctl_send(kern_ctl_ref kctlref, u_int32_t kcunit, void *unitinfo, mbuf_t m, } error = cfil_action_bless_client(kcunit, msghdr); goto done; + case CFM_OP_SET_CRYPTO_KEY: + if (msghdr->cfm_len != sizeof(struct cfil_msg_set_crypto_key)) { + OSIncrementAtomic(&cfil_stats.cfs_ctl_action_bad_len); + error = EINVAL; + CFIL_LOG(LOG_ERR, "bad len: %u for op %u", + msghdr->cfm_len, + msghdr->cfm_op); + goto done; + } + error = cfil_action_set_crypto_key(kcunit, msghdr); + goto done; default: OSIncrementAtomic(&cfil_stats.cfs_ctl_action_bad_op); CFIL_LOG(LOG_ERR, "bad op %u", msghdr->cfm_op); @@ -1699,6 +1759,13 @@ cfil_ctl_send(kern_ctl_ref kctlref, u_int32_t kcunit, void *unitinfo, mbuf_t m, action_msg->cfa_in_peek_offset, action_msg->cfa_in_pass_offset, action_msg->cfa_out_peek_offset, action_msg->cfa_out_pass_offset); #endif + /* + * Received verdict, at this point we know this + * socket connection is allowed. Unblock thread + * immediately before proceeding to process the verdict. + */ + cfil_sock_received_verdict(so); + if (action_msg->cfa_out_peek_offset != 0 || action_msg->cfa_out_pass_offset != 0) { error = cfil_action_data_pass(so, cfil_info, kcunit, 1, @@ -1723,7 +1790,15 @@ cfil_ctl_send(kern_ctl_ref kctlref, u_int32_t kcunit, void *unitinfo, mbuf_t m, break; case CFM_OP_DROP: +#if VERDICT_DEBUG + CFIL_LOG(LOG_ERR, "CFIL: VERDICT DROP RECEIVED: ", + (uint64_t)VM_KERNEL_ADDRPERM(so), + cfil_info->cfi_sock_id, + action_msg->cfa_in_peek_offset, action_msg->cfa_in_pass_offset, + action_msg->cfa_out_peek_offset, action_msg->cfa_out_pass_offset); +#endif error = cfil_action_drop(so, cfil_info, kcunit); + cfil_sock_received_verdict(so); break; default: @@ -1852,7 +1927,7 @@ cfil_ctl_getopt(kern_ctl_ref kctlref, u_int32_t kcunit, void *unitinfo, fill_ip6_sockaddr_4_6(&sock_info->cfs_local, laddr, lport); fill_ip6_sockaddr_4_6(&sock_info->cfs_remote, faddr, fport); } else if (inp->inp_vflag & INP_IPV4) { - struct in_addr laddr = {0}, faddr = {0}; + struct in_addr laddr = {.s_addr = 0}, faddr = {.s_addr = 0}; u_int16_t lport = 0, fport = 0; cfil_get_flow_address(cfil_info->cfi_hash_entry, inp, @@ -2291,6 +2366,7 @@ cfil_info_alloc(struct socket *so, struct cfil_hash_entry *hash_entry) } TAILQ_INSERT_TAIL(&cfil_sock_head, cfil_info, cfi_link); + SLIST_INIT(&cfil_info->cfi_ordered_entries); cfil_sock_attached_count++; @@ -2323,24 +2399,41 @@ cfil_info_attach_unit(struct socket *so, uint32_t filter_control_unit, struct cf kcunit++) { struct content_filter *cfc = content_filters[kcunit - 1]; struct cfil_entry *entry; + struct cfil_entry *iter_entry; + struct cfil_entry *iter_prev; if (cfc == NULL) { continue; } - if (cfc->cf_necp_control_unit != filter_control_unit) { + if (!(cfc->cf_necp_control_unit & filter_control_unit)) { continue; } entry = &cfil_info->cfi_entries[kcunit - 1]; entry->cfe_filter = cfc; - entry->cfe_necp_control_unit = filter_control_unit; + entry->cfe_necp_control_unit = cfc->cf_necp_control_unit; TAILQ_INSERT_TAIL(&cfc->cf_sock_entries, entry, cfe_link); cfc->cf_sock_count++; + + /* Insert the entry into the list ordered by control unit */ + iter_prev = NULL; + SLIST_FOREACH(iter_entry, &cfil_info->cfi_ordered_entries, cfe_order_link) { + if (entry->cfe_necp_control_unit < iter_entry->cfe_necp_control_unit) { + break; + } + iter_prev = iter_entry; + } + + if (iter_prev == NULL) { + SLIST_INSERT_HEAD(&cfil_info->cfi_ordered_entries, entry, cfe_order_link); + } else { + SLIST_INSERT_AFTER(iter_prev, entry, cfe_order_link); + } + verify_content_filter(cfc); attached = 1; entry->cfe_flags |= CFEF_CFIL_ATTACHED; - break; } cfil_rw_unlock_exclusive(&cfil_lck_rw); @@ -2417,12 +2510,69 @@ cfil_info_free(struct cfil_info *cfil_info) zfree(cfil_info_zone, cfil_info); } +/* + * Received a verdict from userspace for a socket. + * Perform any delayed operation if needed. + */ +static void +cfil_sock_received_verdict(struct socket *so) +{ + if (so == NULL || so->so_cfil == NULL) { + return; + } + + so->so_cfil->cfi_flags |= CFIF_INITIAL_VERDICT; + + /* + * If socket has already been connected, trigger + * soisconnected now. + */ + if (so->so_cfil->cfi_flags & CFIF_SOCKET_CONNECTED) { + so->so_cfil->cfi_flags &= ~CFIF_SOCKET_CONNECTED; + soisconnected(so); + return; + } +} + +/* + * Entry point from Sockets layer + * The socket is locked. + * + * Checks if a connected socket is subject to filter and + * pending the initial verdict. + */ +boolean_t +cfil_sock_connected_pending_verdict(struct socket *so) +{ + if (so == NULL || so->so_cfil == NULL) { + return false; + } + + if (so->so_cfil->cfi_flags & CFIF_INITIAL_VERDICT) { + return false; + } else { + /* + * Remember that this protocol is already connected, so + * we will trigger soisconnected() upon receipt of + * initial verdict later. + */ + so->so_cfil->cfi_flags |= CFIF_SOCKET_CONNECTED; + return true; + } +} + +boolean_t +cfil_filter_present(void) +{ + return cfil_active_count > 0; +} + /* * Entry point from Sockets layer * The socket is locked. */ errno_t -cfil_sock_attach(struct socket *so) +cfil_sock_attach(struct socket *so, struct sockaddr *local, struct sockaddr *remote, int dir) { errno_t error = 0; uint32_t filter_control_unit; @@ -2444,6 +2594,9 @@ cfil_sock_attach(struct socket *so) goto done; } + if (filter_control_unit == NECP_FILTER_UNIT_NO_FILTER) { + goto done; + } if ((filter_control_unit & NECP_MASK_USERSPACE_ONLY) != 0) { OSIncrementAtomic(&cfil_stats.cfs_sock_userspace_only); goto done; @@ -2462,6 +2615,7 @@ cfil_sock_attach(struct socket *so) OSIncrementAtomic(&cfil_stats.cfs_sock_attach_no_mem); goto done; } + so->so_cfil->cfi_dir = dir; } if (cfil_info_attach_unit(so, filter_control_unit, so->so_cfil) == 0) { CFIL_LOG(LOG_ERR, "cfil_info_attach_unit(%u) failed", @@ -2479,7 +2633,18 @@ cfil_sock_attach(struct socket *so) /* Hold a reference on the socket */ so->so_usecount++; - error = cfil_dispatch_attach_event(so, so->so_cfil, filter_control_unit); + /* + * Save passed addresses for attach event msg (in case resend + * is needed. + */ + if (remote != NULL) { + memcpy(&so->so_cfil->cfi_so_attach_faddr, remote, remote->sa_len); + } + if (local != NULL) { + memcpy(&so->so_cfil->cfi_so_attach_laddr, local, local->sa_len); + } + + error = cfil_dispatch_attach_event(so, so->so_cfil, 0, dir); /* We can recover from flow control or out of memory errors */ if (error == ENOBUFS || error == ENOMEM) { error = 0; @@ -2517,14 +2682,215 @@ cfil_sock_detach(struct socket *so) return 0; } +/* + * Fill in the address info of an event message from either + * the socket or passed in address info. + */ +static void +cfil_fill_event_msg_addresses(struct cfil_hash_entry *entry, struct inpcb *inp, + union sockaddr_in_4_6 *sin_src, union sockaddr_in_4_6 *sin_dst, + boolean_t isIPv4, boolean_t outgoing) +{ + if (isIPv4) { + struct in_addr laddr = {0}, faddr = {0}; + u_int16_t lport = 0, fport = 0; + + cfil_get_flow_address(entry, inp, &laddr, &faddr, &lport, &fport); + + if (outgoing) { + fill_ip_sockaddr_4_6(sin_src, laddr, lport); + fill_ip_sockaddr_4_6(sin_dst, faddr, fport); + } else { + fill_ip_sockaddr_4_6(sin_src, faddr, fport); + fill_ip_sockaddr_4_6(sin_dst, laddr, lport); + } + } else { + struct in6_addr *laddr = NULL, *faddr = NULL; + u_int16_t lport = 0, fport = 0; + + cfil_get_flow_address_v6(entry, inp, &laddr, &faddr, &lport, &fport); + if (outgoing) { + fill_ip6_sockaddr_4_6(sin_src, laddr, lport); + fill_ip6_sockaddr_4_6(sin_dst, faddr, fport); + } else { + fill_ip6_sockaddr_4_6(sin_src, faddr, fport); + fill_ip6_sockaddr_4_6(sin_dst, laddr, lport); + } + } +} + +static boolean_t +cfil_dispatch_attach_event_sign(cfil_crypto_state_t crypto_state, + struct cfil_info *cfil_info, + struct cfil_msg_sock_attached *msg) +{ + struct cfil_crypto_data data = {}; + + if (crypto_state == NULL || msg == NULL || cfil_info == NULL) { + return false; + } + + data.sock_id = msg->cfs_msghdr.cfm_sock_id; + data.direction = msg->cfs_conn_dir; + + data.pid = msg->cfs_pid; + data.effective_pid = msg->cfs_e_pid; + uuid_copy(data.uuid, msg->cfs_uuid); + uuid_copy(data.effective_uuid, msg->cfs_e_uuid); + data.socketProtocol = msg->cfs_sock_protocol; + if (data.direction == CFS_CONNECTION_DIR_OUT) { + data.remote.sin6 = msg->cfs_dst.sin6; + data.local.sin6 = msg->cfs_src.sin6; + } else { + data.remote.sin6 = msg->cfs_src.sin6; + data.local.sin6 = msg->cfs_dst.sin6; + } + + // At attach, if local address is already present, no need to re-sign subsequent data messages. + if (!NULLADDRESS(data.local)) { + cfil_info->cfi_isSignatureLatest = true; + } + + msg->cfs_signature_length = sizeof(cfil_crypto_signature); + if (cfil_crypto_sign_data(crypto_state, &data, msg->cfs_signature, &msg->cfs_signature_length) != 0) { + msg->cfs_signature_length = 0; + CFIL_LOG(LOG_ERR, "CFIL: Failed to sign attached msg ", + msg->cfs_msghdr.cfm_sock_id); + return false; + } + + return true; +} + +static boolean_t +cfil_dispatch_data_event_sign(cfil_crypto_state_t crypto_state, + struct socket *so, struct cfil_info *cfil_info, + struct cfil_msg_data_event *msg) +{ + struct cfil_crypto_data data = {}; + + if (crypto_state == NULL || msg == NULL || + so == NULL || cfil_info == NULL) { + return false; + } + + data.sock_id = cfil_info->cfi_sock_id; + data.direction = cfil_info->cfi_dir; + data.pid = so->last_pid; + memcpy(data.uuid, so->last_uuid, sizeof(uuid_t)); + if (so->so_flags & SOF_DELEGATED) { + data.effective_pid = so->e_pid; + memcpy(data.effective_uuid, so->e_uuid, sizeof(uuid_t)); + } else { + data.effective_pid = so->last_pid; + memcpy(data.effective_uuid, so->last_uuid, sizeof(uuid_t)); + } + data.socketProtocol = so->so_proto->pr_protocol; + + if (data.direction == CFS_CONNECTION_DIR_OUT) { + data.remote.sin6 = msg->cfc_dst.sin6; + data.local.sin6 = msg->cfc_src.sin6; + } else { + data.remote.sin6 = msg->cfc_src.sin6; + data.local.sin6 = msg->cfc_dst.sin6; + } + + // At first data, local address may show up for the first time, update address cache and + // no need to re-sign subsequent data messages anymore. + if (!NULLADDRESS(data.local)) { + memcpy(&cfil_info->cfi_so_attach_laddr, &data.local, data.local.sa.sa_len); + cfil_info->cfi_isSignatureLatest = true; + } + + msg->cfd_signature_length = sizeof(cfil_crypto_signature); + if (cfil_crypto_sign_data(crypto_state, &data, msg->cfd_signature, &msg->cfd_signature_length) != 0) { + msg->cfd_signature_length = 0; + CFIL_LOG(LOG_ERR, "CFIL: Failed to sign data msg ", + msg->cfd_msghdr.cfm_sock_id); + return false; + } + + return true; +} + +static boolean_t +cfil_dispatch_closed_event_sign(cfil_crypto_state_t crypto_state, + struct socket *so, struct cfil_info *cfil_info, + struct cfil_msg_sock_closed *msg) +{ + struct cfil_crypto_data data = {}; + struct cfil_hash_entry hash_entry = {}; + struct cfil_hash_entry *hash_entry_ptr = NULL; + struct inpcb *inp = (struct inpcb *)so->so_pcb; + + if (crypto_state == NULL || msg == NULL || + so == NULL || inp == NULL || cfil_info == NULL) { + return false; + } + + data.sock_id = cfil_info->cfi_sock_id; + data.direction = cfil_info->cfi_dir; + + data.pid = so->last_pid; + memcpy(data.uuid, so->last_uuid, sizeof(uuid_t)); + if (so->so_flags & SOF_DELEGATED) { + data.effective_pid = so->e_pid; + memcpy(data.effective_uuid, so->e_uuid, sizeof(uuid_t)); + } else { + data.effective_pid = so->last_pid; + memcpy(data.effective_uuid, so->last_uuid, sizeof(uuid_t)); + } + data.socketProtocol = so->so_proto->pr_protocol; + + /* + * Fill in address info: + * For UDP, use the cfil_info hash entry directly. + * For TCP, compose an hash entry with the saved addresses. + */ + if (cfil_info->cfi_hash_entry != NULL) { + hash_entry_ptr = cfil_info->cfi_hash_entry; + } else if (cfil_info->cfi_so_attach_faddr.sa.sa_len > 0 || + cfil_info->cfi_so_attach_laddr.sa.sa_len > 0) { + fill_cfil_hash_entry_from_address(&hash_entry, TRUE, &cfil_info->cfi_so_attach_laddr.sa); + fill_cfil_hash_entry_from_address(&hash_entry, FALSE, &cfil_info->cfi_so_attach_faddr.sa); + hash_entry_ptr = &hash_entry; + } + if (hash_entry_ptr != NULL) { + boolean_t outgoing = (cfil_info->cfi_dir == CFS_CONNECTION_DIR_OUT); + union sockaddr_in_4_6 *src = outgoing ? &data.local : &data.remote; + union sockaddr_in_4_6 *dst = outgoing ? &data.remote : &data.local; + cfil_fill_event_msg_addresses(hash_entry_ptr, inp, src, dst, inp->inp_vflag & INP_IPV4, outgoing); + } + + data.byte_count_in = cfil_info->cfi_byte_inbound_count; + data.byte_count_out = cfil_info->cfi_byte_outbound_count; + + msg->cfc_signature_length = sizeof(cfil_crypto_signature); + if (cfil_crypto_sign_data(crypto_state, &data, msg->cfc_signature, &msg->cfc_signature_length) != 0) { + msg->cfc_signature_length = 0; + CFIL_LOG(LOG_ERR, "CFIL: Failed to sign closed msg ", + msg->cfc_msghdr.cfm_sock_id); + return false; + } + + return true; +} + static int -cfil_dispatch_attach_event(struct socket *so, struct cfil_info *cfil_info, uint32_t filter_control_unit) +cfil_dispatch_attach_event(struct socket *so, struct cfil_info *cfil_info, + uint32_t kcunit, int conn_dir) { errno_t error = 0; struct cfil_entry *entry = NULL; struct cfil_msg_sock_attached msg_attached; - uint32_t kcunit; struct content_filter *cfc = NULL; + struct inpcb *inp = (struct inpcb *)so->so_pcb; + struct cfil_hash_entry *hash_entry_ptr = NULL; + struct cfil_hash_entry hash_entry; + + memset(&hash_entry, 0, sizeof(struct cfil_hash_entry)); + proc_t p = PROC_NULL; + task_t t = TASK_NULL; socket_lock_assert_owned(so); @@ -2534,29 +2900,19 @@ cfil_dispatch_attach_event(struct socket *so, struct cfil_info *cfil_info, uint3 error = EINVAL; goto done; } - /* - * Find the matching filter unit - */ - for (kcunit = 1; kcunit <= MAX_CONTENT_FILTER; kcunit++) { - cfc = content_filters[kcunit - 1]; - if (cfc == NULL) { - continue; - } - if (cfc->cf_necp_control_unit != filter_control_unit) { - continue; - } + if (kcunit == 0) { + entry = SLIST_FIRST(&cfil_info->cfi_ordered_entries); + } else { entry = &cfil_info->cfi_entries[kcunit - 1]; - if (entry->cfe_filter == NULL) { - continue; - } - - VERIFY(cfc == entry->cfe_filter); + } - break; + if (entry == NULL) { + goto done; } - if (entry == NULL || entry->cfe_filter == NULL) { + cfc = entry->cfe_filter; + if (cfc == NULL) { goto done; } @@ -2564,8 +2920,12 @@ cfil_dispatch_attach_event(struct socket *so, struct cfil_info *cfil_info, uint3 goto done; } + if (kcunit == 0) { + kcunit = CFI_ENTRY_KCUNIT(cfil_info, entry); + } + CFIL_LOG(LOG_INFO, "so %llx filter_control_unit %u kcunit %u", - (uint64_t)VM_KERNEL_ADDRPERM(so), filter_control_unit, kcunit); + (uint64_t)VM_KERNEL_ADDRPERM(so), entry->cfe_necp_control_unit, kcunit); /* Would be wasteful to try when flow controlled */ if (cfc->cf_flags & CFF_FLOW_CONTROLLED) { @@ -2593,6 +2953,46 @@ cfil_dispatch_attach_event(struct socket *so, struct cfil_info *cfil_info, uint3 memcpy(msg_attached.cfs_e_uuid, so->last_uuid, sizeof(uuid_t)); } + /* + * Fill in address info: + * For UDP, use the cfil_info hash entry directly. + * For TCP, compose an hash entry with the saved addresses. + */ + if (cfil_info->cfi_hash_entry != NULL) { + hash_entry_ptr = cfil_info->cfi_hash_entry; + } else if (cfil_info->cfi_so_attach_faddr.sa.sa_len > 0 || + cfil_info->cfi_so_attach_laddr.sa.sa_len > 0) { + fill_cfil_hash_entry_from_address(&hash_entry, TRUE, &cfil_info->cfi_so_attach_laddr.sa); + fill_cfil_hash_entry_from_address(&hash_entry, FALSE, &cfil_info->cfi_so_attach_faddr.sa); + hash_entry_ptr = &hash_entry; + } + if (hash_entry_ptr != NULL) { + cfil_fill_event_msg_addresses(hash_entry_ptr, inp, + &msg_attached.cfs_src, &msg_attached.cfs_dst, + inp->inp_vflag & INP_IPV4, conn_dir == CFS_CONNECTION_DIR_OUT); + } + msg_attached.cfs_conn_dir = conn_dir; + + if (msg_attached.cfs_e_pid != 0) { + p = proc_find(msg_attached.cfs_e_pid); + if (p != PROC_NULL) { + t = proc_task(p); + if (t != TASK_NULL) { + audit_token_t audit_token; + mach_msg_type_number_t count = TASK_AUDIT_TOKEN_COUNT; + if (task_info(t, TASK_AUDIT_TOKEN, (task_info_t)&audit_token, &count) == KERN_SUCCESS) { + memcpy(&msg_attached.cfs_audit_token, &audit_token, sizeof(msg_attached.cfs_audit_token)); + } else { + CFIL_LOG(LOG_ERR, "CFIL: Failed to get process audit token ", + entry->cfe_cfil_info->cfi_sock_id); + } + } + proc_rele(p); + } + } + + cfil_dispatch_attach_event_sign(entry->cfe_filter->cf_crypto_state, cfil_info, &msg_attached); + #if LIFECYCLE_DEBUG CFIL_LOG(LOG_DEBUG, "CFIL: LIFECYCLE: SENDING ATTACH UP ", entry->cfe_cfil_info->cfi_sock_id); @@ -2800,6 +3200,10 @@ cfil_dispatch_closed_event(struct socket *so, struct cfil_info *cfil_info, int k memcpy(msg_closed.cfc_op_time, cfil_info->cfi_op_time, sizeof(uint32_t) * CFI_MAX_TIME_LOG_ENTRY); memcpy(msg_closed.cfc_op_list, cfil_info->cfi_op_list, sizeof(unsigned char) * CFI_MAX_TIME_LOG_ENTRY); msg_closed.cfc_op_list_ctr = cfil_info->cfi_op_list_ctr; + msg_closed.cfc_byte_inbound_count = cfil_info->cfi_byte_inbound_count; + msg_closed.cfc_byte_outbound_count = cfil_info->cfi_byte_outbound_count; + + cfil_dispatch_closed_event_sign(entry->cfe_filter->cf_crypto_state, so, cfil_info, &msg_closed); #if LIFECYCLE_DEBUG CFIL_LOG(LOG_ERR, "CFIL: LIFECYCLE: SENDING CLOSED UP: op ctr %d, start time %llu.%llu", msg_closed.cfc_msghdr.cfm_sock_id, cfil_info->cfi_op_list_ctr, cfil_info->cfi_first_event.tv_sec, cfil_info->cfi_first_event.tv_usec); @@ -2998,37 +3402,16 @@ cfil_dispatch_data_event(struct socket *so, struct cfil_info *cfil_info, uint32_ data_req->cfd_end_offset = entrybuf->cfe_peeked + copylen; /* - * TBD: + * Copy address/port into event msg. * For non connected sockets need to copy addresses from passed * parameters */ - if (inp->inp_vflag & INP_IPV6) { - struct in6_addr *laddr = NULL, *faddr = NULL; - u_int16_t lport = 0, fport = 0; + cfil_fill_event_msg_addresses(cfil_info->cfi_hash_entry, inp, + &data_req->cfc_src, &data_req->cfc_dst, + inp->inp_vflag & INP_IPV4, outgoing); - cfil_get_flow_address_v6(cfil_info->cfi_hash_entry, inp, - &laddr, &faddr, &lport, &fport); - if (outgoing) { - fill_ip6_sockaddr_4_6(&data_req->cfc_src, laddr, lport); - fill_ip6_sockaddr_4_6(&data_req->cfc_dst, faddr, fport); - } else { - fill_ip6_sockaddr_4_6(&data_req->cfc_src, faddr, fport); - fill_ip6_sockaddr_4_6(&data_req->cfc_dst, laddr, lport); - } - } else if (inp->inp_vflag & INP_IPV4) { - struct in_addr laddr = {0}, faddr = {0}; - u_int16_t lport = 0, fport = 0; - - cfil_get_flow_address(cfil_info->cfi_hash_entry, inp, - &laddr, &faddr, &lport, &fport); - - if (outgoing) { - fill_ip_sockaddr_4_6(&data_req->cfc_src, laddr, lport); - fill_ip_sockaddr_4_6(&data_req->cfc_dst, faddr, fport); - } else { - fill_ip_sockaddr_4_6(&data_req->cfc_src, faddr, fport); - fill_ip_sockaddr_4_6(&data_req->cfc_dst, laddr, lport); - } + if (cfil_info->cfi_isSignatureLatest == false) { + cfil_dispatch_data_event_sign(entry->cfe_filter->cf_crypto_state, so, cfil_info, data_req); } microuptime(&tv); @@ -3105,7 +3488,8 @@ cfil_data_service_ctl_q(struct socket *so, struct cfil_info *cfil_info, uint32_t /* Send attached message if not yet done */ if ((entry->cfe_flags & CFEF_SENT_SOCK_ATTACHED) == 0) { - error = cfil_dispatch_attach_event(so, cfil_info, kcunit); + error = cfil_dispatch_attach_event(so, cfil_info, CFI_ENTRY_KCUNIT(cfil_info, entry), + outgoing ? CFS_CONNECTION_DIR_OUT : CFS_CONNECTION_DIR_IN); if (error != 0) { /* We can recover from flow control */ if (error == ENOBUFS || error == ENOMEM) { @@ -3566,6 +3950,7 @@ cfil_service_pending_queue(struct socket *so, struct cfil_info *cfil_info, uint3 */ curlen = 0; while ((data = cfil_queue_first(pending_q)) != NULL) { + struct cfil_entry *iter_entry; datalen = cfil_data_length(data, NULL, NULL); #if DATA_DEBUG @@ -3583,10 +3968,10 @@ cfil_service_pending_queue(struct socket *so, struct cfil_info *cfil_info, uint3 curlen += datalen; - for (kcunit += 1; - kcunit <= MAX_CONTENT_FILTER; - kcunit++) { - error = cfil_data_filter(so, cfil_info, kcunit, outgoing, + for (iter_entry = SLIST_NEXT(entry, cfe_order_link); + iter_entry != NULL; + iter_entry = SLIST_NEXT(iter_entry, cfe_order_link)) { + error = cfil_data_filter(so, cfil_info, CFI_ENTRY_KCUNIT(cfil_info, iter_entry), outgoing, data, datalen); /* 0 means passed so we can continue */ if (error != 0) { @@ -3967,6 +4352,7 @@ cfil_action_bless_client(uint32_t kcunit, struct cfil_msg_hdr *msghdr) cfil_info->cfi_sock_id); } #endif + cfil_sock_received_verdict(so); (void)cfil_action_data_pass(so, cfil_info, kcunit, 1, CFM_MAX_OFFSET, CFM_MAX_OFFSET); (void)cfil_action_data_pass(so, cfil_info, kcunit, 0, CFM_MAX_OFFSET, CFM_MAX_OFFSET); } else { @@ -3978,6 +4364,51 @@ cfil_action_bless_client(uint32_t kcunit, struct cfil_msg_hdr *msghdr) return error; } +int +cfil_action_set_crypto_key(uint32_t kcunit, struct cfil_msg_hdr *msghdr) +{ + struct content_filter *cfc = NULL; + cfil_crypto_state_t crypto_state = NULL; + struct cfil_msg_set_crypto_key *keymsg = (struct cfil_msg_set_crypto_key *)msghdr; + + CFIL_LOG(LOG_NOTICE, ""); + + if (content_filters == NULL) { + CFIL_LOG(LOG_ERR, "no content filter"); + return EINVAL; + } + if (kcunit > MAX_CONTENT_FILTER) { + CFIL_LOG(LOG_ERR, "kcunit %u > MAX_CONTENT_FILTER (%d)", + kcunit, MAX_CONTENT_FILTER); + return EINVAL; + } + crypto_state = cfil_crypto_init_client((uint8_t *)keymsg->crypto_key); + if (crypto_state == NULL) { + CFIL_LOG(LOG_ERR, "failed to initialize crypto state for unit %u)", + kcunit); + return EINVAL; + } + + cfil_rw_lock_exclusive(&cfil_lck_rw); + + cfc = content_filters[kcunit - 1]; + if (cfc->cf_kcunit != kcunit) { + CFIL_LOG(LOG_ERR, "bad unit info %u)", + kcunit); + cfil_rw_unlock_exclusive(&cfil_lck_rw); + cfil_crypto_cleanup_state(crypto_state); + return EINVAL; + } + if (cfc->cf_crypto_state != NULL) { + cfil_crypto_cleanup_state(cfc->cf_crypto_state); + cfc->cf_crypto_state = NULL; + } + cfc->cf_crypto_state = crypto_state; + + cfil_rw_unlock_exclusive(&cfil_lck_rw); + return 0; +} + static int cfil_update_entry_offsets(struct socket *so, struct cfil_info *cfil_info, int outgoing, unsigned int datalen) { @@ -4047,8 +4478,10 @@ cfil_data_common(struct socket *so, struct cfil_info *cfil_info, int outgoing, s if (outgoing) { cfi_buf = &cfil_info->cfi_snd; + cfil_info->cfi_byte_outbound_count += datalen; } else { cfi_buf = &cfil_info->cfi_rcv; + cfil_info->cfi_byte_inbound_count += datalen; } cfi_buf->cfi_pending_last += datalen; @@ -4085,10 +4518,12 @@ cfil_data_common(struct socket *so, struct cfil_info *cfil_info, int outgoing, s CFIL_LOG(LOG_DEBUG, "CFIL: QUEUEING DATA: FAST PATH"); #endif } else { - for (kcunit = 1; kcunit <= MAX_CONTENT_FILTER; kcunit++) { + struct cfil_entry *iter_entry; + SLIST_FOREACH(iter_entry, &cfil_info->cfi_ordered_entries, cfe_order_link) { // Is cfil attached to this filter? + kcunit = CFI_ENTRY_KCUNIT(cfil_info, iter_entry); if (IS_ENTRY_ATTACHED(cfil_info, kcunit)) { - if (IS_UDP(so)) { + if (IS_UDP(so) && chain == NULL) { /* UDP only: * Chain addr (incoming only TDB), control (optional) and data into one chain. * This full chain will be reinjected into socket after recieving verdict. @@ -4140,6 +4575,13 @@ cfil_sock_data_out(struct socket *so, struct sockaddr *to, return 0; } + /* + * Pass initial data for TFO. + */ + if (IS_INITIAL_TFO_DATA(so)) { + return 0; + } + socket_lock_assert_owned(so); if (so->so_cfil->cfi_flags & CFIF_DROP) { @@ -4188,6 +4630,13 @@ cfil_sock_data_in(struct socket *so, struct sockaddr *from, return 0; } + /* + * Pass initial data for TFO. + */ + if (IS_INITIAL_TFO_DATA(so)) { + return 0; + } + socket_lock_assert_owned(so); if (so->so_cfil->cfi_flags & CFIF_DROP) { @@ -5311,7 +5760,7 @@ cfil_db_get_cfil_info(struct cfil_db *db, cfil_sock_id_t id) if (db == NULL || id == 0) { CFIL_LOG(LOG_DEBUG, "CFIL: UDP NULL DB ", - (uint64_t)VM_KERNEL_ADDRPERM(db->cfdb_so), id); + db ? (uint64_t)VM_KERNEL_ADDRPERM(db->cfdb_so) : 0, id); return NULL; } @@ -5331,7 +5780,6 @@ cfil_db_get_cfil_info(struct cfil_db *db, cfil_sock_id_t id) struct cfil_hash_entry * cfil_sock_udp_get_flow(struct socket *so, uint32_t filter_control_unit, bool outgoing, struct sockaddr *local, struct sockaddr *remote) { -#pragma unused(so, filter_control_unit, outgoing, local, remote) struct cfil_hash_entry *hash_entry = NULL; errno_t error = 0; @@ -5364,6 +5812,7 @@ cfil_sock_udp_get_flow(struct socket *so, uint32_t filter_control_unit, bool out OSIncrementAtomic(&cfil_stats.cfs_sock_attach_no_mem); return NULL; } + hash_entry->cfentry_cfil->cfi_dir = outgoing ? CFS_CONNECTION_DIR_OUT : CFS_CONNECTION_DIR_IN; #if LIFECYCLE_DEBUG cfil_info_log(LOG_ERR, hash_entry->cfentry_cfil, "CFIL: LIFECYCLE: ADDED"); @@ -5387,7 +5836,8 @@ cfil_sock_udp_get_flow(struct socket *so, uint32_t filter_control_unit, bool out /* Hold a reference on the socket for each flow */ so->so_usecount++; - error = cfil_dispatch_attach_event(so, hash_entry->cfentry_cfil, filter_control_unit); + error = cfil_dispatch_attach_event(so, hash_entry->cfentry_cfil, 0, + outgoing ? CFS_CONNECTION_DIR_OUT : CFS_CONNECTION_DIR_IN); /* We can recover from flow control or out of memory errors */ if (error != 0 && error != ENOBUFS && error != ENOMEM) { return NULL; @@ -5416,12 +5866,21 @@ cfil_sock_udp_handle_data(bool outgoing, struct socket *so, return error; } + // Socket has been blessed + if ((so->so_flags1 & SOF1_CONTENT_FILTER_SKIP) != 0) { + return error; + } + filter_control_unit = necp_socket_get_content_filter_control_unit(so); if (filter_control_unit == 0) { CFIL_LOG(LOG_DEBUG, "CFIL: UDP failed to get control unit"); return error; } + if (filter_control_unit == NECP_FILTER_UNIT_NO_FILTER) { + return error; + } + if ((filter_control_unit & NECP_MASK_USERSPACE_ONLY) != 0) { CFIL_LOG(LOG_DEBUG, "CFIL: UDP user space only"); OSIncrementAtomic(&cfil_stats.cfs_sock_userspace_only); diff --git a/bsd/net/content_filter.h b/bsd/net/content_filter.h index 6af66bb70..b4f4485c5 100644 --- a/bsd/net/content_filter.h +++ b/bsd/net/content_filter.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2013-2017 Apple Inc. All rights reserved. + * Copyright (c) 2013-2019 Apple Inc. All rights reserved. * * @APPLE_LICENSE_HEADER_START@ * @@ -31,6 +31,7 @@ #include #include #include +#include #ifdef BSD_KERNEL_PRIVATE #include @@ -91,7 +92,42 @@ struct cfil_opt_sock_info { /* * How many filter may be active simultaneously */ +#if !TARGET_OS_OSX && !defined(XNU_TARGET_OS_OSX) #define CFIL_MAX_FILTER_COUNT 2 +#else +#define CFIL_MAX_FILTER_COUNT 8 +#endif + + +/* + * Crypto Support + */ +#define CFIL_CRYPTO 1 +#define CFIL_CRYPTO_SIGNATURE_SIZE 32 +#define CFIL_CRYPTO_DATA_EVENT 1 + +typedef uint8_t cfil_crypto_key[CCSHA256_OUTPUT_SIZE]; +typedef uint8_t cfil_crypto_signature[CFIL_CRYPTO_SIGNATURE_SIZE]; + +typedef struct cfil_crypto_state { + const struct ccdigest_info *digest_info; + cfil_crypto_key key; +} *cfil_crypto_state_t; + +typedef struct cfil_crypto_data { + uuid_t flow_id; + u_int64_t sock_id; + u_int32_t direction; + union sockaddr_in_4_6 remote; + union sockaddr_in_4_6 local; + u_int32_t socketProtocol; + pid_t pid; + pid_t effective_pid; + uuid_t uuid; + uuid_t effective_uuid; + u_int64_t byte_count_in; + u_int64_t byte_count_out; +} *cfil_crypto_data_t; /* * Types of messages @@ -120,6 +156,7 @@ struct cfil_opt_sock_info { #define CFM_OP_DATA_UPDATE 16 /* update pass or peek offsets */ #define CFM_OP_DROP 17 /* shutdown socket, no more data */ #define CFM_OP_BLESS_CLIENT 18 /* mark a client flow as already filtered, passes a uuid */ +#define CFM_OP_SET_CRYPTO_KEY 19 /* assign client crypto key for message signing */ /* * struct cfil_msg_hdr @@ -136,6 +173,14 @@ struct cfil_msg_hdr { #define CFM_VERSION_CURRENT 1 +/* + * Connection Direction + */ +#define CFS_CONNECTION_DIR_IN 0 +#define CFS_CONNECTION_DIR_OUT 1 + +#define CFS_AUDIT_TOKEN 1 + /* * struct cfil_msg_sock_attached * @@ -158,6 +203,12 @@ struct cfil_msg_sock_attached { pid_t cfs_e_pid; uuid_t cfs_uuid; uuid_t cfs_e_uuid; + union sockaddr_in_4_6 cfs_src; + union sockaddr_in_4_6 cfs_dst; + int cfs_conn_dir; + unsigned int cfs_audit_token[8]; /* Must match audit_token_t */ + cfil_crypto_signature cfs_signature; + uint32_t cfs_signature_length; }; /* @@ -181,6 +232,8 @@ struct cfil_msg_data_event { union sockaddr_in_4_6 cfc_dst; uint64_t cfd_start_offset; uint64_t cfd_end_offset; + cfil_crypto_signature cfd_signature; + uint32_t cfd_signature_length; /* Actual content data immediatly follows */ }; @@ -203,6 +256,10 @@ struct cfil_msg_sock_closed { uint32_t cfc_op_list_ctr; uint32_t cfc_op_time[CFI_MAX_TIME_LOG_ENTRY]; /* time interval in microseconds since first event */ unsigned char cfc_op_list[CFI_MAX_TIME_LOG_ENTRY]; + uint64_t cfc_byte_inbound_count; + uint64_t cfc_byte_outbound_count; + cfil_crypto_signature cfc_signature; + uint32_t cfc_signature_length; } __attribute__((aligned(8))); /* @@ -244,6 +301,20 @@ struct cfil_msg_bless_client { uuid_t cfb_client_uuid; }; +/* + * struct cfil_msg_set_crypto_key + * + * Filter assigning client crypto key to CFIL for message signing + * + * Valid Type: CFM_TYPE_ACTION + * + * Valid Ops: CFM_OP_SET_CRYPTO_KEY + */ +struct cfil_msg_set_crypto_key { + struct cfil_msg_hdr cfb_msghdr; + cfil_crypto_key crypto_key; +}; + #define CFM_MAX_OFFSET UINT64_MAX /* @@ -400,7 +471,10 @@ do { \ extern void cfil_init(void); -extern errno_t cfil_sock_attach(struct socket *so); +extern boolean_t cfil_filter_present(void); +extern boolean_t cfil_sock_connected_pending_verdict(struct socket *so); +extern errno_t cfil_sock_attach(struct socket *so, + struct sockaddr *local, struct sockaddr *remote, int dir); extern errno_t cfil_sock_detach(struct socket *so); extern int cfil_sock_data_out(struct socket *so, struct sockaddr *to, diff --git a/bsd/net/content_filter_crypto.c b/bsd/net/content_filter_crypto.c new file mode 100644 index 000000000..a0d8e6475 --- /dev/null +++ b/bsd/net/content_filter_crypto.c @@ -0,0 +1,169 @@ +/* + * Copyright (c) 2019 Apple Inc. + * All rights reserved. + */ + +#include +#include +#include +#include +#include + +extern int cfil_log_level; + +#define CFIL_CRYPTO_LOG(level, fmt, ...) \ +do { \ + if (cfil_log_level >= level) \ + printf("%s:%d " fmt "\n",\ + __FUNCTION__, __LINE__, ##__VA_ARGS__); \ +} while (0) + +#define CFIL_CRYPTO_LOG_4BYTES(name) \ + CFIL_CRYPTO_LOG(LOG_DEBUG, \ + "%s \t%s: %hhX %hhX %hhX %hhX", \ + prefix, name, ptr[0], ptr[1], ptr[2], ptr[3]) + +#define CFIL_CRYPTO_LOG_8BYTES(name) \ + CFIL_CRYPTO_LOG(LOG_DEBUG, \ + "%s \t%s: %hhX %hhX %hhX %hhX %hhX %hhX %hhX %hhX", \ + prefix, name, ptr[0], ptr[1], ptr[2], ptr[3], ptr[4], ptr[5], ptr[6], ptr[7]) + +#define CFIL_CRYPTO_LOG_16BYTES(name) \ + CFIL_CRYPTO_LOG(LOG_DEBUG, \ + "%s \t%s: %hhX %hhX %hhX %hhX %hhX %hhX %hhX %hhX %hhX %hhX %hhX %hhX %hhX %hhX %hhX %hhX", \ + prefix, name, ptr[0], ptr[1], ptr[2], ptr[3], ptr[4], ptr[5], ptr[6], ptr[7], ptr[8], ptr[9], ptr[10], ptr[11], ptr[12], ptr[13], ptr[14], ptr[15]) + +#define CFIL_CRYPTO_LOG_28BYTES(name) \ + CFIL_CRYPTO_LOG(LOG_DEBUG, \ + "%s \t%s: %hhX %hhX %hhX %hhX %hhX %hhX %hhX %hhX %hhX %hhX %hhX %hhX %hhX %hhX %hhX %hhX %hhX %hhX %hhX %hhX %hhX %hhX %hhX %hhX %hhX %hhX %hhX %hhX", \ + prefix, name, ptr[0], ptr[1], ptr[2], ptr[3], ptr[4], ptr[5], ptr[6], ptr[7], ptr[8], ptr[9], ptr[10], ptr[11], ptr[12], ptr[13], ptr[14], ptr[15], ptr[16], ptr[17], ptr[18], ptr[19], ptr[20], ptr[21], ptr[22], ptr[23], ptr[24], ptr[25], ptr[26], ptr[27]) + +#define CFIL_CRYPTO_LOG_32BYTES(name, prefix) \ + CFIL_CRYPTO_LOG(LOG_DEBUG, \ + "%s \t%s: %hhX %hhX %hhX %hhX %hhX %hhX %hhX %hhX %hhX %hhX %hhX %hhX %hhX %hhX %hhX %hhX %hhX %hhX %hhX %hhX %hhX %hhX %hhX %hhX %hhX %hhX %hhX %hhX %hhX %hhX %hhX %hhX", \ + prefix, name, ptr[0], ptr[1], ptr[2], ptr[3], ptr[4], ptr[5], ptr[6], ptr[7], ptr[8], ptr[9], ptr[10], ptr[11], ptr[12], ptr[13], ptr[14], ptr[15], ptr[16], ptr[17], ptr[18], ptr[19], ptr[20], ptr[21], ptr[22], ptr[23], ptr[24], ptr[25], ptr[26], ptr[27], ptr[28], ptr[29], ptr[30], ptr[31]) + +static void +cfil_crypto_print_data(cfil_crypto_data_t data, const char *prefix) +{ + u_int8_t *ptr = NULL; + CFIL_CRYPTO_LOG(LOG_DEBUG, "%s NE Filter crypto data:", prefix); + + ptr = (u_int8_t *)&data->flow_id; + CFIL_CRYPTO_LOG_16BYTES("flow_id"); + + ptr = (u_int8_t *)&data->sock_id; + CFIL_CRYPTO_LOG_8BYTES("sock_id"); + + ptr = (u_int8_t *)&data->direction; + CFIL_CRYPTO_LOG_4BYTES("direction"); + + ptr = (u_int8_t *)&data->remote; + CFIL_CRYPTO_LOG_28BYTES("remote"); + ptr = (u_int8_t *)&data->local; + CFIL_CRYPTO_LOG_28BYTES("local"); + + ptr = (u_int8_t *)&data->socketProtocol; + CFIL_CRYPTO_LOG_4BYTES("socketProtocol"); + + ptr = (u_int8_t *)&data->pid; + CFIL_CRYPTO_LOG_4BYTES("pid"); + + ptr = (u_int8_t *)&data->effective_pid; + CFIL_CRYPTO_LOG_4BYTES("effective_pid"); + + ptr = (u_int8_t *)&data->uuid; + CFIL_CRYPTO_LOG_16BYTES("uuid"); + ptr = (u_int8_t *)&data->effective_uuid; + CFIL_CRYPTO_LOG_16BYTES("effective_uuid"); + + ptr = (u_int8_t *)&data->byte_count_in; + CFIL_CRYPTO_LOG_8BYTES("byte_count_in"); + + ptr = (u_int8_t *)&data->byte_count_out; + CFIL_CRYPTO_LOG_8BYTES("byte_count_out"); +} + +cfil_crypto_state_t +cfil_crypto_init_client(cfil_crypto_key client_key) +{ + if (client_key == NULL) { + return NULL; + } + + struct cfil_crypto_state *state; + MALLOC(state, struct cfil_crypto_state *, sizeof(struct cfil_crypto_state), + M_TEMP, M_WAITOK | M_ZERO); + if (state == NULL) { + return NULL; + } + + memcpy(state->key, client_key, sizeof(cfil_crypto_key)); + state->digest_info = ccsha256_di(); + + CFIL_CRYPTO_LOG(LOG_DEBUG, "Inited client key"); + return state; +} + +void +cfil_crypto_cleanup_state(cfil_crypto_state_t state) +{ + if (state != NULL) { + FREE(state, M_TEMP); + } +} + +static void +cfil_crypto_update_context(const struct ccdigest_info *di, + cchmac_ctx_t ctx, + cfil_crypto_data_t data) +{ + const uint8_t context[32] = {[0 ... 31] = 0x20}; // 0x20 repeated 32 times + const char *context_string = "NEFilterCrypto"; + uint8_t separator = 0; + cchmac_update(di, ctx, sizeof(context), context); + cchmac_update(di, ctx, strlen(context_string), context_string); + cchmac_update(di, ctx, sizeof(separator), &separator); + cchmac_update(di, ctx, sizeof(struct cfil_crypto_data), data); +} + +int +cfil_crypto_sign_data(cfil_crypto_state_t state, cfil_crypto_data_t data, + cfil_crypto_signature signature, u_int32_t *signature_length) +{ + u_int8_t *ptr = NULL; + + if (state->digest_info == NULL) { + return EINVAL; + } + + if (data == NULL || + signature == NULL || + signature_length == NULL) { + return EINVAL; + } + + size_t required_tag_length = state->digest_info->output_size; + if (*signature_length < required_tag_length) { + return ERANGE; + } + + *signature_length = (u_int32_t)required_tag_length; + + cchmac_ctx_decl(state->digest_info->state_size, + state->digest_info->block_size, ctx); + cchmac_init(state->digest_info, ctx, + sizeof(state->key), + state->key); + cfil_crypto_update_context(state->digest_info, ctx, data); + cchmac_final(state->digest_info, ctx, signature); + + if (cfil_log_level >= LOG_DEBUG) { + cfil_crypto_print_data(data, "SIGN"); + CFIL_CRYPTO_LOG(LOG_DEBUG, "Signed data: datalen %lu", sizeof(struct cfil_crypto_data)); + ptr = (u_int8_t *)signature; + CFIL_CRYPTO_LOG_32BYTES("Signature", "SIGN"); + } + + return 0; +} diff --git a/bsd/net/content_filter_crypto.h b/bsd/net/content_filter_crypto.h new file mode 100644 index 000000000..fd56c0a38 --- /dev/null +++ b/bsd/net/content_filter_crypto.h @@ -0,0 +1,21 @@ +/* + * Copyright (c) 2019 Apple Inc. + * All rights reserved. + */ + +#ifndef __content_filter_crypto_h +#define __content_filter_crypto_h + +#include + +extern cfil_crypto_state_t +cfil_crypto_init_client(cfil_crypto_key client_key); + +extern void +cfil_crypto_cleanup_state(cfil_crypto_state_t state); + +extern int +cfil_crypto_sign_data(cfil_crypto_state_t state, cfil_crypto_data_t data, + cfil_crypto_signature signature, u_int32_t *signature_length); + +#endif // __content_filter_crypto_h diff --git a/bsd/net/contiki-conf.h b/bsd/net/contiki-conf.h new file mode 100644 index 000000000..2ee32b4ff --- /dev/null +++ b/bsd/net/contiki-conf.h @@ -0,0 +1,33 @@ +/* + * Copyright (c) 2017 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ +#ifndef contiki_conf_h +#define contiki_conf_h + +#define NETSTACK_CONF_NETWORK sicslowpan_driver + +#endif /* contiki_conf_h */ diff --git a/bsd/net/contiki-default-conf.h b/bsd/net/contiki-default-conf.h new file mode 100644 index 000000000..9cf10f7e9 --- /dev/null +++ b/bsd/net/contiki-default-conf.h @@ -0,0 +1,287 @@ +/* + * Copyright (c) 2017 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ +/* + * Copyright (c) 2012, Thingsquare, http://www.thingsquare.com/. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED + * OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +#ifndef CONTIKI_DEFAULT_CONF_H +#define CONTIKI_DEFAULT_CONF_H + +/*---------------------------------------------------------------------------*/ +/* Netstack configuration + * + * The netstack configuration is typically overridden by the platform + * configuration, as defined in contiki-conf.h + */ + +/* NETSTACK_CONF_RADIO specifies the radio driver. The radio driver + * typically depends on the radio used on the target hardware. */ +#ifndef NETSTACK_CONF_RADIO +#define NETSTACK_CONF_RADIO nullradio_driver +/* #define NETSTACK_CONF_RADIO cc2420_driver */ +#endif /* NETSTACK_CONF_RADIO */ + +/* NETSTACK_CONF_FRAMER specifies the over-the-air frame format used + * by Contiki radio packets. For IEEE 802.15.4 radios, use the + * framer_802154 driver. */ +#ifndef NETSTACK_CONF_FRAMER +#define NETSTACK_CONF_FRAMER framer_nullmac +/* #define NETSTACK_CONF_FRAMER framer_802154 */ +#endif /* NETSTACK_CONF_FRAMER */ + +/* NETSTACK_CONF_RDC specifies the Radio Duty Cycling (RDC) layer. The + * nullrdc_driver never turns the radio off and is compatible with all + * radios, but consumes a lot of power. The contikimac_driver is + * highly power-efficent and allows sleepy routers, but is not + * compatible with all radios. */ +#ifndef NETSTACK_CONF_RDC +#define NETSTACK_CONF_RDC nullrdc_driver +/* #define NETSTACK_CONF_RDC contikimac_driver */ +#endif /* NETSTACK_CONF_RDC */ + +/* NETSTACK_CONF_MAC specifies the Medium Access Control (MAC) + * layer. The nullmac_driver does not provide any MAC + * functionality. The csma_driver is the default CSMA MAC layer, but + * is not compatible with all radios. */ +#ifndef NETSTACK_CONF_MAC +#define NETSTACK_CONF_MAC nullmac_driver +/* #define NETSTACK_CONF_MAC csma_driver */ +#endif /* NETSTACK_CONF_MAC */ + +/* NETSTACK_CONF_LLSEC specifies the link layer security driver. */ +#ifndef NETSTACK_CONF_LLSEC +#define NETSTACK_CONF_LLSEC nullsec_driver +#endif /* NETSTACK_CONF_LLSEC */ + +/* To avoid unnecessary complexity, we assume the common case of + * a constant LoWPAN-wide IEEE 802.15.4 security level, which + * can be specified by defining LLSEC802154_CONF_SECURITY_LEVEL. */ +#ifndef LLSEC802154_CONF_SECURITY_LEVEL +#define LLSEC802154_CONF_SECURITY_LEVEL 0 +#endif /* LLSEC802154_CONF_SECURITY_LEVEL */ + +/* NETSTACK_CONF_NETWORK specifies the network layer and can be either + * sicslowpan_driver, for IPv6 networking, or rime_driver, for the + * custom Rime network stack. */ +#ifndef NETSTACK_CONF_NETWORK +#define NETSTACK_CONF_NETWORK rime_driver +/* #define NETSTACK_CONF_NETWORK sicslowpan_driver */ +#endif /* NETSTACK_CONF_NETWORK */ + +/* NETSTACK_CONF_RDC_CHANNEL_CHECK_RATE specifies the channel check + * rate of the RDC layer. This defines how often the RDC will wake up + * and check for radio channel activity. A higher check rate results + * in higher communication performance at the cost of a higher power + * consumption. */ +#ifndef NETSTACK_CONF_RDC_CHANNEL_CHECK_RATE +#define NETSTACK_CONF_RDC_CHANNEL_CHECK_RATE 8 +#endif /* NETSTACK_CONF_RDC_CHANNEL_CHECK_RATE */ + +/*---------------------------------------------------------------------------*/ +/* Packet buffer size options. + * + * The packet buffer size options can be tweaked on a per-project + * basis to reduce memory consumption. + */ + +/* QUEUEBUF_CONF_NUM specifies the number of queue buffers. Queue + * buffers are used throughout the Contiki netstack but the + * configuration option can be tweaked to save memory. Performance can + * suffer with a too low number of queue buffers though. */ +#ifndef QUEUEBUF_CONF_NUM +#define QUEUEBUF_CONF_NUM 8 +#endif /* QUEUEBUF_CONF_NUM */ +/*---------------------------------------------------------------------------*/ +/* uIPv6 configuration options. + * + * Many of the uIPv6 configuration options can be overriden by a + * project-specific configuration to save memory. + */ + +/* NETSTACK_CONF_WITH_IPV6 specifies whether or not IPv6 should be used. If IPv6 + * is not used, IPv4 is used instead. */ +#ifndef NETSTACK_CONF_WITH_IPV6 +#define NETSTACK_CONF_WITH_IPV6 0 +#endif /* NETSTACK_CONF_WITH_IPV6 */ + +/* UIP_CONF_BUFFER_SIZE specifies how much memory should be reserved + * for the uIP packet buffer. This sets an upper bound on the largest + * IP packet that can be received by the system. */ +#ifndef UIP_CONF_BUFFER_SIZE +#define UIP_CONF_BUFFER_SIZE 128 +#endif /* UIP_CONF_BUFFER_SIZE */ + +/* UIP_CONF_ROUTER specifies if the IPv6 node should be a router or + * not. By default, all Contiki nodes are routers. */ +#ifndef UIP_CONF_ROUTER +#define UIP_CONF_ROUTER 1 +#endif /* UIP_CONF_ROUTER */ + +/* UIP_CONF_IPV6_RPL specifies if RPL is to be used for IPv6 + * routing. */ +#ifndef UIP_CONF_IPV6_RPL +#define UIP_CONF_IPV6_RPL 1 +#endif /* UIP_CONF_IPV6_RPL */ + +/* UIP_CONF_MAX_ROUTES specifies the maximum number of routes that each + * node will be able to handle. */ +#ifndef UIP_CONF_MAX_ROUTES +#define UIP_CONF_MAX_ROUTES 20 +#endif /* UIP_CONF_MAX_ROUTES */ + +/* UIP_CONF_UDP specifies if UDP support should be included or + * not. Disabling UDP saves memory but breaks a lot of stuff. */ +#ifndef UIP_CONF_UDP +#define UIP_CONF_UDP 1 +#endif /* UIP_CONF_UDP */ + +/* UIP_CONF_MAX_CONNECTIONS specifies the maximum number of + * simultaneous TCP connections. */ +#ifndef UIP_CONF_MAX_CONNECTIONS +#define UIP_CONF_MAX_CONNECTIONS 8 +#endif /* UIP_CONF_MAX_CONNECTIONS */ + +/* UIP_CONF_TCP specifies if TCP support should be included or + * not. Disabling TCP saves memory. */ +#ifndef UIP_CONF_TCP +#define UIP_CONF_TCP 1 +#endif /* UIP_CONF_TCP */ + +/* UIP_CONF_MAX_CONNECTIONS specifies the maximum number of + * simultaneous TCP connections. */ +#ifndef UIP_CONF_MAX_CONNECTIONS +#define UIP_CONF_MAX_CONNECTIONS 8 +#endif /* UIP_CONF_MAX_CONNECTIONS */ + + +/* UIP_CONF_TCP_SPLIT enables a performance optimization hack, where + * each maximum-sized TCP segment is split into two, to avoid the + * performance degradation that is caused by delayed ACKs. */ +#ifndef UIP_CONF_TCP_SPLIT +#define UIP_CONF_TCP_SPLIT 0 +#endif /* UIP_CONF_TCP_SPLIT */ + +/* NBR_TABLE_CONF_MAX_NEIGHBORS specifies the maximum number of neighbors + * that each node will be able to handle. */ +#ifndef NBR_TABLE_CONF_MAX_NEIGHBORS +#define NBR_TABLE_CONF_MAX_NEIGHBORS 8 +#endif /* NBR_TABLE_CONF_MAX_NEIGHBORS */ + +/* UIP_CONF_ND6_SEND_NA enables standard IPv6 Neighbor Discovery Protocol. + * This is unneeded when RPL is used. Disable to save ROM and a little RAM. */ +#ifndef UIP_CONF_ND6_SEND_NA +#define UIP_CONF_ND6_SEND_NA 1 +#endif /* UIP_CONF_ND6_SEND_NA */ + +/*---------------------------------------------------------------------------*/ +/* 6lowpan configuration options. + * + * These options change the behavior of the 6lowpan header compression + * code (sicslowpan). They typically depend on the type of radio used + * on the target platform, and are therefore platform-specific. + */ + +/* SICSLOWPAN_CONF_MAX_MAC_TRANSMISSIONS specifies how many times the + * MAC layer should resend packets if no link-layer ACK was + * received. This only makes sense with the csma_driver + * NETSTACK_CONF_MAC. */ +#ifndef SICSLOWPAN_CONF_MAX_MAC_TRANSMISSIONS +#define SICSLOWPAN_CONF_MAX_MAC_TRANSMISSIONS 4 +#endif /* SICSLOWPAN_CONF_MAX_MAC_TRANSMISSIONS */ + +/* SICSLOWPAN_CONF_FRAG specifies if 6lowpan fragmentation should be + * used or not. Fragmentation is on by default. */ +#ifndef SICSLOWPAN_CONF_FRAG +#define SICSLOWPAN_CONF_FRAG 1 +#endif /* SICSLOWPAN_CONF_FRAG */ + +/* SICSLOWPAN_CONF_MAC_MAX_PAYLOAD is the maximum available size for + * frame headers, link layer security-related overhead, as well as + * 6LoWPAN payload. By default, SICSLOWPAN_CONF_MAC_MAX_PAYLOAD is + * 127 bytes (MTU of 802.15.4) - 2 bytes (Footer of 802.15.4). */ +#ifndef SICSLOWPAN_CONF_MAC_MAX_PAYLOAD +#define SICSLOWPAN_CONF_MAC_MAX_PAYLOAD (127 - 2) +#endif /* SICSLOWPAN_CONF_MAC_MAX_PAYLOAD */ + +/* SICSLOWPAN_CONF_COMPRESSION_THRESHOLD sets a lower threshold for + * when packets should not be compressed. This is used by ContikiMAC, + * which requires packets to be larger than a given minimum size. */ +#ifndef SICSLOWPAN_CONF_COMPRESSION_THRESHOLD +#define SICSLOWPAN_CONF_COMPRESSION_THRESHOLD 0 +/* #define SICSLOWPAN_CONF_COMPRESSION_THRESHOLD 63 */ +#endif /* SICSLOWPAN_CONF_COMPRESSION_THRESHOLD */ + +/* SICSLOWPAN_CONF_COMPRESSION specifies what 6lowpan compression + * mechanism to be used. 6lowpan hc06 is the default in Contiki. */ +#ifndef SICSLOWPAN_CONF_COMPRESSION +#define SICSLOWPAN_CONF_COMPRESSION SICSLOWPAN_COMPRESSION_HC06 +#endif /* SICSLOWPAN_CONF_COMPRESSION */ + +/*---------------------------------------------------------------------------*/ +/* ContikiMAC configuration options. + * + * These are typically configured on a per-platform basis. + */ + +/* CONTIKIMAC_CONF_WITH_PHASE_OPTIMIZATION specifies if ContikiMAC + * should optimize for the phase of neighbors. The phase optimization + * may reduce power consumption but is not compatible with all timer + * settings and is therefore off by default. */ +#ifndef CONTIKIMAC_CONF_WITH_PHASE_OPTIMIZATION +#define CONTIKIMAC_CONF_WITH_PHASE_OPTIMIZATION 0 +#endif /* CONTIKIMAC_CONF_WITH_PHASE_OPTIMIZATION */ + + +#endif /* CONTIKI_DEFAULT_CONF_H */ diff --git a/bsd/net/contiki-lib.h b/bsd/net/contiki-lib.h new file mode 100644 index 000000000..23e4daf24 --- /dev/null +++ b/bsd/net/contiki-lib.h @@ -0,0 +1,70 @@ +/* + * Copyright (c) 2017 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ +/* + * Copyright (c) 2005, Swedish Institute of Computer Science. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the Institute nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE INSTITUTE AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE INSTITUTE OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * This file is part of the Contiki operating system. + * + * Author: Adam Dunkels + * + */ +#ifndef CONTIKI_LIB_H_ +#define CONTIKI_LIB_H_ + +#include "contiki.h" +#include "lib/list.h" +#include "lib/memb.h" +#include "lib/mmem.h" +#include "lib/random.h" + +#endif /* CONTIKI_LIB_H_ */ diff --git a/bsd/net/contiki-net.h b/bsd/net/contiki-net.h new file mode 100644 index 000000000..4b6735133 --- /dev/null +++ b/bsd/net/contiki-net.h @@ -0,0 +1,92 @@ +/* + * Copyright (c) 2017 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ +/* + * Copyright (c) 2005, Swedish Institute of Computer Science. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the Institute nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE INSTITUTE AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE INSTITUTE OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * This file is part of the Contiki operating system. + * + * Author: Adam Dunkels + * + */ +#ifndef CONTIKI_NET_H_ +#define CONTIKI_NET_H_ + +#include "contiki.h" + +#include "net/ip/tcpip.h" +#include "net/ip/uip.h" +#include "net/ipv4/uip-fw.h" +#include "net/ipv4/uip-fw-drv.h" +#include "net/ipv4/uip_arp.h" +#include "net/ip/uiplib.h" +#include "net/ip/uip-udp-packet.h" +#include "net/ip/simple-udp.h" +#include "net/ip/uip-nameserver.h" + +#if NETSTACK_CONF_WITH_IPV6 +#include "net/ipv6/uip-icmp6.h" +#include "net/ipv6/uip-ds6.h" +#endif /* NETSTACK_CONF_WITH_IPV6 */ + +#include "net/ip/resolv.h" + +#include "net/ip/psock.h" + +#include "net/ip/udp-socket.h" +#include "net/ip/tcp-socket.h" + +#include "net/rime/rime.h" + +#include "net/netstack.h" + +#endif /* CONTIKI_NET_H_ */ diff --git a/bsd/net/contiki-version.h b/bsd/net/contiki-version.h new file mode 100644 index 000000000..318d9eef1 --- /dev/null +++ b/bsd/net/contiki-version.h @@ -0,0 +1,68 @@ +/* + * Copyright (c) 2017 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ +/* + * Copyright (c) 2004, Swedish Institute of Computer Science. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the Institute nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE INSTITUTE AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE INSTITUTE OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * This file is part of the Contiki operating system. + * + * Author: Adam Dunkels + * + */ +#ifndef __CONTIKI_VERSION__ +#define __CONTIKI_VERSION__ + +#ifndef CONTIKI_VERSION_STRING +#define CONTIKI_VERSION_STRING "Contiki 3.x" +#endif /* CONTIKI_VERSION_STRING */ + +#endif /* __CONTIKI_VERSION__ */ diff --git a/bsd/net/contiki.h b/bsd/net/contiki.h new file mode 100644 index 000000000..3cc2488f2 --- /dev/null +++ b/bsd/net/contiki.h @@ -0,0 +1,85 @@ +/* + * Copyright (c) 2017 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ +/* + * Copyright (c) 2004, Swedish Institute of Computer Science. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the Institute nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE INSTITUTE AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE INSTITUTE OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * This file is part of the Contiki operating system. + * + * Author: Adam Dunkels + * + */ +#ifndef CONTIKI_H_ +#define CONTIKI_H_ + +#include "contiki-version.h" +#include "contiki-conf.h" +#include "contiki-default-conf.h" + +#include "sys/process.h" +#include "sys/autostart.h" + +#include "sys/timer.h" +#include "sys/ctimer.h" +#include "sys/etimer.h" +#include "sys/rtimer.h" + +#include "sys/pt.h" + +#include "sys/procinit.h" + +#include "sys/loader.h" +#include "sys/clock.h" + +#include "sys/energest.h" + +#endif /* CONTIKI_H_ */ diff --git a/bsd/net/dlil.c b/bsd/net/dlil.c index 39ce6ac9e..7a119911e 100644 --- a/bsd/net/dlil.c +++ b/bsd/net/dlil.c @@ -125,12 +125,15 @@ #include #endif /* PF */ #include +#include #if NECP #include #endif /* NECP */ +#include + #define DBG_LAYER_BEG DLILDBG_CODE(DBG_DLIL_STATIC, 0) #define DBG_LAYER_END DLILDBG_CODE(DBG_DLIL_STATIC, 2) #define DBG_FNC_DLIL_INPUT DLILDBG_CODE(DBG_DLIL_STATIC, (1 << 8)) @@ -392,13 +395,20 @@ static __inline void ifp_inc_traffic_class_in(struct ifnet *, struct mbuf *); static __inline void ifp_inc_traffic_class_out(struct ifnet *, struct mbuf *); static void dlil_main_input_thread_func(void *, wait_result_t); +static void dlil_main_input_thread_cont(void *, wait_result_t); + static void dlil_input_thread_func(void *, wait_result_t); +static void dlil_input_thread_cont(void *, wait_result_t); + static void dlil_rxpoll_input_thread_func(void *, wait_result_t); +static void dlil_rxpoll_input_thread_cont(void *, wait_result_t); + static int dlil_create_input_thread(ifnet_t, struct dlil_threading_info *); static void dlil_terminate_input_thread(struct dlil_threading_info *); static void dlil_input_stats_add(const struct ifnet_stat_increment_param *, - struct dlil_threading_info *, boolean_t); -static void dlil_input_stats_sync(struct ifnet *, struct dlil_threading_info *); + struct dlil_threading_info *, struct ifnet *, boolean_t); +static boolean_t dlil_input_stats_sync(struct ifnet *, + struct dlil_threading_info *); static void dlil_input_packet_list_common(struct ifnet *, struct mbuf *, u_int32_t, ifnet_model_t, boolean_t); static errno_t ifnet_input_common(struct ifnet *, struct mbuf *, struct mbuf *, @@ -414,17 +424,23 @@ static void dlil_output_cksum_dbg(struct ifnet *, struct mbuf *, uint32_t, static void dlil_input_cksum_dbg(struct ifnet *, struct mbuf *, char *, protocol_family_t); +static void dlil_incr_pending_thread_count(void); +static void dlil_decr_pending_thread_count(void); + static void ifnet_detacher_thread_func(void *, wait_result_t); static int ifnet_detacher_thread_cont(int); static void ifnet_detach_final(struct ifnet *); static void ifnet_detaching_enqueue(struct ifnet *); static struct ifnet *ifnet_detaching_dequeue(void); -static void ifnet_start_thread_fn(void *, wait_result_t); -static void ifnet_poll_thread_fn(void *, wait_result_t); -static void ifnet_poll(struct ifnet *); -static errno_t ifnet_enqueue_common(struct ifnet *, void *, - classq_pkt_type_t, boolean_t, boolean_t *); +static void ifnet_start_thread_func(void *, wait_result_t); +static void ifnet_start_thread_cont(void *, wait_result_t); + +static void ifnet_poll_thread_func(void *, wait_result_t); +static void ifnet_poll_thread_cont(void *, wait_result_t); + +static errno_t ifnet_enqueue_common(struct ifnet *, classq_pkt_t *, + boolean_t, boolean_t *); static void ifp_src_route_copyout(struct ifnet *, struct route *); static void ifp_src_route_copyin(struct ifnet *, struct route *); @@ -526,7 +542,7 @@ int dlil_verbose = 0; static u_int32_t dlil_input_sanity_check = 0; #endif /* IFNET_INPUT_SANITY_CHK */ /* rate limit debug messages */ -struct timespec dlil_dbgrate = { 1, 0 }; +struct timespec dlil_dbgrate = { .tv_sec = 1, .tv_nsec = 0 }; SYSCTL_DECL(_net_link_generic_system); @@ -547,7 +563,7 @@ SYSCTL_PROC(_net_link_generic_system, OID_AUTO, rcvq_maxlen, sysctl_rcvq_maxlen, "I", "Default receive queue max length"); #define IF_RXPOLL_DECAY 2 /* ilog2 of EWMA decay rate (4) */ -static u_int32_t if_rxpoll_decay = IF_RXPOLL_DECAY; +u_int32_t if_rxpoll_decay = IF_RXPOLL_DECAY; SYSCTL_UINT(_net_link_generic_system, OID_AUTO, rxpoll_decay, CTLFLAG_RW | CTLFLAG_LOCKED, &if_rxpoll_decay, IF_RXPOLL_DECAY, "ilog2 of EWMA decay rate of avg inbound packets"); @@ -568,8 +584,6 @@ SYSCTL_PROC(_net_link_generic_system, OID_AUTO, rxpoll_sample_time, IF_RXPOLL_SAMPLETIME, sysctl_rxpoll_sample_holdtime, "Q", "input poll sampling time"); -#define IF_RXPOLL_INTERVALTIME_MIN (1ULL * 1000) /* 1 us */ -#define IF_RXPOLL_INTERVALTIME (1ULL * 1000 * 1000) /* 1 ms */ static u_int64_t if_rxpoll_interval_time = IF_RXPOLL_INTERVALTIME; SYSCTL_PROC(_net_link_generic_system, OID_AUTO, rxpoll_interval_time, CTLTYPE_QUAD | CTLFLAG_RW | CTLFLAG_LOCKED, &if_rxpoll_interval_time, @@ -577,22 +591,22 @@ SYSCTL_PROC(_net_link_generic_system, OID_AUTO, rxpoll_interval_time, "Q", "input poll interval (time)"); #define IF_RXPOLL_INTERVAL_PKTS 0 /* 0 (disabled) */ -static u_int32_t if_rxpoll_interval_pkts = IF_RXPOLL_INTERVAL_PKTS; +u_int32_t if_rxpoll_interval_pkts = IF_RXPOLL_INTERVAL_PKTS; SYSCTL_UINT(_net_link_generic_system, OID_AUTO, rxpoll_interval_pkts, CTLFLAG_RW | CTLFLAG_LOCKED, &if_rxpoll_interval_pkts, IF_RXPOLL_INTERVAL_PKTS, "input poll interval (packets)"); #define IF_RXPOLL_WLOWAT 10 -static u_int32_t if_rxpoll_wlowat = IF_RXPOLL_WLOWAT; +static u_int32_t if_sysctl_rxpoll_wlowat = IF_RXPOLL_WLOWAT; SYSCTL_PROC(_net_link_generic_system, OID_AUTO, rxpoll_wakeups_lowat, - CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &if_rxpoll_wlowat, + CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &if_sysctl_rxpoll_wlowat, IF_RXPOLL_WLOWAT, sysctl_rxpoll_wlowat, "I", "input poll wakeup low watermark"); #define IF_RXPOLL_WHIWAT 100 -static u_int32_t if_rxpoll_whiwat = IF_RXPOLL_WHIWAT; +static u_int32_t if_sysctl_rxpoll_whiwat = IF_RXPOLL_WHIWAT; SYSCTL_PROC(_net_link_generic_system, OID_AUTO, rxpoll_wakeups_hiwat, - CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &if_rxpoll_whiwat, + CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &if_sysctl_rxpoll_whiwat, IF_RXPOLL_WHIWAT, sysctl_rxpoll_whiwat, "I", "input poll wakeup high watermark"); @@ -601,7 +615,7 @@ SYSCTL_UINT(_net_link_generic_system, OID_AUTO, rxpoll_max, CTLFLAG_RW | CTLFLAG_LOCKED, &if_rxpoll_max, 0, "max packets per poll call"); -static u_int32_t if_rxpoll = 1; +u_int32_t if_rxpoll = 1; SYSCTL_PROC(_net_link_generic_system, OID_AUTO, rxpoll, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &if_rxpoll, 0, sysctl_rxpoll, "I", "enable opportunistic input polling"); @@ -770,20 +784,16 @@ static lck_attr_t *dlil_lck_attributes = NULL; /* DLIL data threshold thread call */ static void dlil_dt_tcall_fn(thread_call_param_t, thread_call_param_t); -static void dlil_mit_tcall_fn(thread_call_param_t, thread_call_param_t); - -uint32_t dlil_rcv_mit_pkts_min = 5; -uint32_t dlil_rcv_mit_pkts_max = 64; -uint32_t dlil_rcv_mit_interval = (500 * 1000); - -#if (DEVELOPMENT || DEBUG) -SYSCTL_UINT(_net_link_generic_system, OID_AUTO, rcv_mit_pkts_min, - CTLFLAG_RW | CTLFLAG_LOCKED, &dlil_rcv_mit_pkts_min, 0, ""); -SYSCTL_UINT(_net_link_generic_system, OID_AUTO, rcv_mit_pkts_max, - CTLFLAG_RW | CTLFLAG_LOCKED, &dlil_rcv_mit_pkts_max, 0, ""); -SYSCTL_UINT(_net_link_generic_system, OID_AUTO, rcv_mit_interval, - CTLFLAG_RW | CTLFLAG_LOCKED, &dlil_rcv_mit_interval, 0, ""); -#endif /* DEVELOPMENT || DEBUG */ +void +ifnet_filter_update_tso(boolean_t filter_enable) +{ + /* + * update filter count and route_generation ID to let TCP + * know it should reevalute doing TSO or not + */ + OSAddAtomic(filter_enable ? 1 : -1, &dlil_filter_disable_tso_count); + routegenid_update(); +} #define DLIL_INPUT_CHECK(m, ifp) { \ @@ -816,14 +826,38 @@ struct rxpoll_time_tbl { }; static struct rxpoll_time_tbl rxpoll_tbl[] = { - { 10 * MBPS, 2, 8, (1 * 1024), (6 * 1024) }, - { 100 * MBPS, 10, 40, (4 * 1024), (64 * 1024) }, - { 1 * GBPS, 10, 40, (4 * 1024), (64 * 1024) }, - { 10 * GBPS, 10, 40, (4 * 1024), (64 * 1024) }, - { 100 * GBPS, 10, 40, (4 * 1024), (64 * 1024) }, - { 0, 0, 0, 0, 0 } + { .speed = 10 * MBPS, .plowat = 2, .phiwat = 8, .blowat = (1 * 1024), .bhiwat = (6 * 1024) }, + { .speed = 100 * MBPS, .plowat = 10, .phiwat = 40, .blowat = (4 * 1024), .bhiwat = (64 * 1024) }, + { .speed = 1 * GBPS, .plowat = 10, .phiwat = 40, .blowat = (4 * 1024), .bhiwat = (64 * 1024) }, + { .speed = 10 * GBPS, .plowat = 10, .phiwat = 40, .blowat = (4 * 1024), .bhiwat = (64 * 1024) }, + { .speed = 100 * GBPS, .plowat = 10, .phiwat = 40, .blowat = (4 * 1024), .bhiwat = (64 * 1024) }, + { .speed = 0, .plowat = 0, .phiwat = 0, .blowat = 0, .bhiwat = 0 } }; +decl_lck_mtx_data(static, dlil_thread_sync_lock); +static uint32_t dlil_pending_thread_cnt = 0; +static void +dlil_incr_pending_thread_count(void) +{ + LCK_MTX_ASSERT(&dlil_thread_sync_lock, LCK_MTX_ASSERT_NOTOWNED); + lck_mtx_lock(&dlil_thread_sync_lock); + dlil_pending_thread_cnt++; + lck_mtx_unlock(&dlil_thread_sync_lock); +} + +static void +dlil_decr_pending_thread_count(void) +{ + LCK_MTX_ASSERT(&dlil_thread_sync_lock, LCK_MTX_ASSERT_NOTOWNED); + lck_mtx_lock(&dlil_thread_sync_lock); + VERIFY(dlil_pending_thread_cnt > 0); + dlil_pending_thread_cnt--; + if (dlil_pending_thread_cnt == 0) { + wakeup(&dlil_pending_thread_cnt); + } + lck_mtx_unlock(&dlil_thread_sync_lock); +} + int proto_hash_value(u_int32_t protocol_family) { @@ -839,9 +873,11 @@ proto_hash_value(u_int32_t protocol_family) return 1; case PF_VLAN: return 2; + case PF_802154: + return 3; case PF_UNSPEC: default: - return 3; + return 4; } } @@ -1107,7 +1143,7 @@ if_free_protolist(u_int32_t *list) _FREE(list, M_TEMP); } -__private_extern__ void +__private_extern__ int dlil_post_msg(struct ifnet *ifp, u_int32_t event_subclass, u_int32_t event_code, struct net_event_data *event_data, u_int32_t event_data_len) @@ -1140,12 +1176,23 @@ dlil_post_msg(struct ifnet *ifp, u_int32_t event_subclass, ev_msg.dv[0].data_ptr = event_data; ev_msg.dv[1].data_length = 0; - /* Don't update interface generation for quality and RRC state changess */ - bool update_generation = (event_subclass != KEV_DL_SUBCLASS || - (event_code != KEV_DL_LINK_QUALITY_METRIC_CHANGED && - event_code != KEV_DL_RRC_STATE_CHANGED)); + bool update_generation = true; + if (event_subclass == KEV_DL_SUBCLASS) { + /* Don't update interface generation for frequent link quality and state changes */ + switch (event_code) { + case KEV_DL_LINK_QUALITY_METRIC_CHANGED: + case KEV_DL_RRC_STATE_CHANGED: + case KEV_DL_NODE_PRESENCE: + case KEV_DL_NODE_ABSENCE: + case KEV_DL_MASTER_ELECTED: + update_generation = false; + break; + default: + break; + } + } - dlil_event_internal(ifp, &ev_msg, update_generation); + return dlil_event_internal(ifp, &ev_msg, update_generation); } __private_extern__ int @@ -1227,7 +1274,7 @@ dlil_alloc_local_stats(struct ifnet *ifp) } } end: - if (ret != 0) { + if (ifp != NULL && ret != 0) { if (ifp->if_tcp_stat != NULL) { pbuf = (void **) ((intptr_t)ifp->if_tcp_stat - sizeof(void *)); @@ -1253,20 +1300,43 @@ end: return ret; } +static void +dlil_reset_rxpoll_params(ifnet_t ifp) +{ + ASSERT(ifp != NULL); + ifnet_set_poll_cycle(ifp, NULL); + ifp->if_poll_update = 0; + ifp->if_poll_flags = 0; + ifp->if_poll_req = 0; + ifp->if_poll_mode = IFNET_MODEL_INPUT_POLL_OFF; + bzero(&ifp->if_poll_tstats, sizeof(ifp->if_poll_tstats)); + bzero(&ifp->if_poll_pstats, sizeof(ifp->if_poll_pstats)); + bzero(&ifp->if_poll_sstats, sizeof(ifp->if_poll_sstats)); + net_timerclear(&ifp->if_poll_mode_holdtime); + net_timerclear(&ifp->if_poll_mode_lasttime); + net_timerclear(&ifp->if_poll_sample_holdtime); + net_timerclear(&ifp->if_poll_sample_lasttime); + net_timerclear(&ifp->if_poll_dbg_lasttime); +} + static int dlil_create_input_thread(ifnet_t ifp, struct dlil_threading_info *inp) { + boolean_t dlil_rxpoll_input; thread_continue_t func; u_int32_t limit; int error; + dlil_rxpoll_input = (ifp != NULL && net_rxpoll && + (ifp->if_eflags & IFEF_RXPOLL) && (ifp->if_xflags & IFXF_LEGACY)); + /* NULL ifp indicates the main input thread, called at dlil_init time */ if (ifp == NULL) { func = dlil_main_input_thread_func; VERIFY(inp == dlil_main_input_thread); (void) strlcat(inp->input_name, "main_input", DLIL_THREADNAME_LEN); - } else if (net_rxpoll && (ifp->if_eflags & IFEF_RXPOLL)) { + } else if (dlil_rxpoll_input) { func = dlil_rxpoll_input_thread_func; VERIFY(inp != dlil_main_input_thread); (void) snprintf(inp->input_name, DLIL_THREADNAME_LEN, @@ -1282,15 +1352,7 @@ dlil_create_input_thread(ifnet_t ifp, struct dlil_threading_info *inp) inp->lck_grp = lck_grp_alloc_init(inp->input_name, dlil_grp_attributes); lck_mtx_init(&inp->input_lck, inp->lck_grp, dlil_lck_attributes); - inp->mode = IFNET_MODEL_INPUT_POLL_OFF; - inp->ifp = ifp; /* NULL for main input thread */ - - net_timerclear(&inp->mode_holdtime); - net_timerclear(&inp->mode_lasttime); - net_timerclear(&inp->sample_holdtime); - net_timerclear(&inp->sample_lasttime); - net_timerclear(&inp->dbg_lasttime); - + inp->ifp = ifp; /* NULL for main input thread */ /* * For interfaces that support opportunistic polling, set the * low and high watermarks for outstanding inbound packets/bytes. @@ -1299,7 +1361,9 @@ dlil_create_input_thread(ifnet_t ifp, struct dlil_threading_info *inp) */ if (ifp != NULL && net_rxpoll && (ifp->if_eflags & IFEF_RXPOLL)) { limit = MAX(if_rcvq_maxlen, IF_RCVQ_MINLEN); - (void) dlil_rxpoll_set_params(ifp, NULL, FALSE); + if (ifp->if_xflags & IFXF_LEGACY) { + (void) dlil_rxpoll_set_params(ifp, NULL, FALSE); + } } else { limit = (u_int32_t)-1; } @@ -1390,18 +1454,6 @@ dlil_clean_threading_info(struct dlil_threading_info *inp) VERIFY(inp->wloop_thr == THREAD_NULL); VERIFY(inp->poll_thr == THREAD_NULL); VERIFY(inp->tag == 0); - - inp->mode = IFNET_MODEL_INPUT_POLL_OFF; - bzero(&inp->tstats, sizeof(inp->tstats)); - bzero(&inp->pstats, sizeof(inp->pstats)); - bzero(&inp->sstats, sizeof(inp->sstats)); - - net_timerclear(&inp->mode_holdtime); - net_timerclear(&inp->mode_lasttime); - net_timerclear(&inp->sample_holdtime); - net_timerclear(&inp->sample_lasttime); - net_timerclear(&inp->dbg_lasttime); - #if IFNET_INPUT_SANITY_CHK inp->input_mbuf_cnt = 0; #endif /* IFNET_INPUT_SANITY_CHK */ @@ -1411,6 +1463,7 @@ static void dlil_terminate_input_thread(struct dlil_threading_info *inp) { struct ifnet *ifp = inp->ifp; + classq_pkt_t pkt = CLASSQ_PKT_INITIALIZER(pkt); VERIFY(current_thread() == inp->input_thr); VERIFY(inp != dlil_main_input_thread); @@ -1425,21 +1478,27 @@ dlil_terminate_input_thread(struct dlil_threading_info *inp) i++) { v = (i + 1) * v; } - printf("the value is %d\n", v); + DLIL_PRINTF("the value is %d\n", v); } #endif /* TEST_INPUT_THREAD_TERMINATION */ lck_mtx_lock_spin(&inp->input_lck); + _getq_all(&inp->rcvq_pkts, &pkt, NULL, NULL, NULL); VERIFY((inp->input_waiting & DLIL_INPUT_TERMINATE) != 0); inp->input_waiting |= DLIL_INPUT_TERMINATE_COMPLETE; wakeup_one((caddr_t)&inp->input_waiting); lck_mtx_unlock(&inp->input_lck); + /* free up pending packets */ + if (pkt.cp_mbuf != NULL) { + mbuf_freem_list(pkt.cp_mbuf); + } + /* for the extra refcnt from kernel_thread_start() */ thread_deallocate(current_thread()); if (dlil_verbose) { - printf("%s: input thread terminated\n", + DLIL_PRINTF("%s: input thread terminated\n", if_name(ifp)); } @@ -1563,6 +1622,9 @@ dlil_init(void) _CASSERT(IFRTYPE_FAMILY_FIREWIRE == IFNET_FAMILY_FIREWIRE); _CASSERT(IFRTYPE_FAMILY_BOND == IFNET_FAMILY_BOND); _CASSERT(IFRTYPE_FAMILY_CELLULAR == IFNET_FAMILY_CELLULAR); + _CASSERT(IFRTYPE_FAMILY_6LOWPAN == IFNET_FAMILY_6LOWPAN); + _CASSERT(IFRTYPE_FAMILY_UTUN == IFNET_FAMILY_UTUN); + _CASSERT(IFRTYPE_FAMILY_IPSEC == IFNET_FAMILY_IPSEC); _CASSERT(IFRTYPE_SUBFAMILY_ANY == IFNET_SUBFAMILY_ANY); _CASSERT(IFRTYPE_SUBFAMILY_USB == IFNET_SUBFAMILY_USB); @@ -1571,6 +1633,8 @@ dlil_init(void) _CASSERT(IFRTYPE_SUBFAMILY_THUNDERBOLT == IFNET_SUBFAMILY_THUNDERBOLT); _CASSERT(IFRTYPE_SUBFAMILY_RESERVED == IFNET_SUBFAMILY_RESERVED); _CASSERT(IFRTYPE_SUBFAMILY_INTCOPROC == IFNET_SUBFAMILY_INTCOPROC); + _CASSERT(IFRTYPE_SUBFAMILY_QUICKRELAY == IFNET_SUBFAMILY_QUICKRELAY); + _CASSERT(IFRTYPE_SUBFAMILY_DEFAULT == IFNET_SUBFAMILY_DEFAULT); _CASSERT(DLIL_MODIDLEN == IFNET_MODIDLEN); _CASSERT(DLIL_MODARGLEN == IFNET_MODARGLEN); @@ -1584,6 +1648,7 @@ dlil_init(void) PE_parse_boot_argn("ifnet_debug", &ifnet_debug, sizeof(ifnet_debug)); + VERIFY(dlil_pending_thread_cnt == 0); dlif_size = (ifnet_debug == 0) ? sizeof(struct dlil_ifnet) : sizeof(struct dlil_ifnet_dbg); /* Enforce 64-bit alignment for dlil_ifnet structure */ @@ -1696,6 +1761,7 @@ dlil_init(void) lck_rw_init(&ifnet_head_lock, ifnet_head_lock_group, dlil_lck_attributes); lck_mtx_init(&dlil_ifnet_lock, dlil_lock_group, dlil_lck_attributes); + lck_mtx_init(&dlil_thread_sync_lock, dlil_lock_group, dlil_lck_attributes); /* Setup interface flow control related items */ lck_mtx_init(&ifnet_fc_lock, dlil_lock_group, dlil_lck_attributes); @@ -1752,14 +1818,39 @@ dlil_init(void) * Create and start up the main DLIL input thread and the interface * detacher threads once everything is initialized. */ + dlil_incr_pending_thread_count(); dlil_create_input_thread(NULL, dlil_main_input_thread); + /* + * Create ifnet detacher thread. + * When an interface gets detached, part of the detach processing + * is delayed. The interface is added to delayed detach list + * and this thread is woken up to call ifnet_detach_final + * on these interfaces. + */ + dlil_incr_pending_thread_count(); if (kernel_thread_start(ifnet_detacher_thread_func, NULL, &thread) != KERN_SUCCESS) { panic_plain("%s: couldn't create detacher thread", __func__); /* NOTREACHED */ } thread_deallocate(thread); + + /* + * Wait for the created kernel threads for dlil to get + * scheduled and run at least once before we proceed + */ + lck_mtx_lock(&dlil_thread_sync_lock); + while (dlil_pending_thread_cnt != 0) { + DLIL_PRINTF("%s: Waiting for all the create dlil kernel threads " + "to get scheduled at least once.\n", __func__); + (void) msleep(&dlil_pending_thread_cnt, &dlil_thread_sync_lock, (PZERO - 1), + __func__, NULL); + LCK_MTX_ASSERT(&dlil_thread_sync_lock, LCK_ASSERT_OWNED); + } + lck_mtx_unlock(&dlil_thread_sync_lock); + DLIL_PRINTF("%s: All the created dlil kernel threads have been scheduled " + "at least once. Proceeding.\n", __func__); } static void @@ -1858,8 +1949,7 @@ dlil_attach_filter(struct ifnet *ifp, const struct iff_filter *if_filter, * know it shouldn't do TSO on this connection */ if ((filter->filt_flags & DLIL_IFF_TSO) == 0) { - OSAddAtomic(1, &dlil_filter_disable_tso_count); - routegenid_update(); + ifnet_filter_update_tso(TRUE); } OSIncrementAtomic64(&net_api_stats.nas_iflt_attach_count); INC_ATOMIC_INT64_LIM(net_api_stats.nas_iflt_attach_total); @@ -1867,7 +1957,7 @@ dlil_attach_filter(struct ifnet *ifp, const struct iff_filter *if_filter, INC_ATOMIC_INT64_LIM(net_api_stats.nas_iflt_attach_os_total); } if (dlil_verbose) { - printf("%s: %s filter attached\n", if_name(ifp), + DLIL_PRINTF("%s: %s filter attached\n", if_name(ifp), if_filter->iff_name); } done: @@ -1923,7 +2013,7 @@ dlil_detach_filter_internal(interface_filter_t filter, int detached) if_flt_monitor_leave(ifp); lck_mtx_unlock(&ifp->if_flt_lock); if (dlil_verbose) { - printf("%s: %s filter detached\n", + DLIL_PRINTF("%s: %s filter detached\n", if_name(ifp), filter->filt_name); } goto destroy; @@ -1938,7 +2028,7 @@ dlil_detach_filter_internal(interface_filter_t filter, int detached) } if (dlil_verbose) { - printf("%s filter detached\n", filter->filt_name); + DLIL_PRINTF("%s filter detached\n", filter->filt_name); } destroy: @@ -1953,8 +2043,7 @@ destroy: * know it should reevalute doing TSO or not */ if ((filter->filt_flags & DLIL_IFF_TSO) == 0) { - OSAddAtomic(-1, &dlil_filter_disable_tso_count); - routegenid_update(); + ifnet_filter_update_tso(FALSE); } VERIFY(OSDecrementAtomic64(&net_api_stats.nas_iflt_attach_count) > 0); @@ -1980,6 +2069,27 @@ dlil_detach_filter(interface_filter_t filter) dlil_detach_filter_internal(filter, 0); } +__attribute__((noreturn)) +static void +dlil_main_input_thread_func(void *v, wait_result_t w) +{ +#pragma unused(w) + struct dlil_threading_info *inp = v; + + VERIFY(inp == dlil_main_input_thread); + VERIFY(inp->ifp == NULL); + VERIFY(current_thread() == inp->input_thr); + + dlil_decr_pending_thread_count(); + lck_mtx_lock(&inp->input_lck); + VERIFY(!(inp->input_waiting & DLIL_INPUT_RUNNING)); + (void) assert_wait(&inp->input_waiting, THREAD_UNINT); + lck_mtx_unlock(&inp->input_lck); + (void) thread_block_parameter(dlil_main_input_thread_cont, inp); + /* NOTREACHED */ + __builtin_unreachable(); +} + /* * Main input thread: * @@ -1992,46 +2102,38 @@ dlil_detach_filter(interface_filter_t filter) */ __attribute__((noreturn)) static void -dlil_main_input_thread_func(void *v, wait_result_t w) +dlil_main_input_thread_cont(void *v, wait_result_t wres) { -#pragma unused(w) struct dlil_main_threading_info *inpm = v; struct dlil_threading_info *inp = v; - VERIFY(inp == dlil_main_input_thread); - VERIFY(inp->ifp == NULL); - VERIFY(inp->mode == IFNET_MODEL_INPUT_POLL_OFF); + /* main input thread is uninterruptible */ + VERIFY(wres != THREAD_INTERRUPTED); + lck_mtx_lock_spin(&inp->input_lck); + VERIFY(!(inp->input_waiting & (DLIL_INPUT_TERMINATE | + DLIL_INPUT_RUNNING))); + inp->input_waiting |= DLIL_INPUT_RUNNING; while (1) { struct mbuf *m = NULL, *m_loop = NULL; u_int32_t m_cnt, m_cnt_loop; + classq_pkt_t pkt = CLASSQ_PKT_INITIALIZER(pkt); boolean_t proto_req; - lck_mtx_lock_spin(&inp->input_lck); - - /* Wait until there is work to be done */ - while (!(inp->input_waiting & ~DLIL_INPUT_RUNNING)) { - inp->input_waiting &= ~DLIL_INPUT_RUNNING; - (void) msleep(&inp->input_waiting, &inp->input_lck, - (PZERO - 1) | PSPIN, inp->input_name, NULL); - } - - inp->input_waiting |= DLIL_INPUT_RUNNING; inp->input_waiting &= ~DLIL_INPUT_WAITING; - /* Main input thread cannot be terminated */ - VERIFY(!(inp->input_waiting & DLIL_INPUT_TERMINATE)); - proto_req = (inp->input_waiting & (DLIL_PROTO_WAITING | DLIL_PROTO_REGISTER)); /* Packets for non-dedicated interfaces other than lo0 */ m_cnt = qlen(&inp->rcvq_pkts); - m = _getq_all(&inp->rcvq_pkts, NULL, NULL, NULL); + _getq_all(&inp->rcvq_pkts, &pkt, NULL, NULL, NULL); + m = pkt.cp_mbuf; /* Packets exclusive to lo0 */ m_cnt_loop = qlen(&inpm->lo_rcvq_pkts); - m_loop = _getq_all(&inpm->lo_rcvq_pkts, NULL, NULL, NULL); + _getq_all(&inpm->lo_rcvq_pkts, &pkt, NULL, NULL, NULL); + m_loop = pkt.cp_mbuf; inp->wtot = 0; @@ -2044,26 +2146,41 @@ dlil_main_input_thread_func(void *v, wait_result_t w) */ if (m_loop != NULL) { dlil_input_packet_list_extended(lo_ifp, m_loop, - m_cnt_loop, inp->mode); + m_cnt_loop, IFNET_MODEL_INPUT_POLL_OFF); } if (m != NULL) { dlil_input_packet_list_extended(NULL, m, - m_cnt, inp->mode); + m_cnt, IFNET_MODEL_INPUT_POLL_OFF); } if (proto_req) { proto_input_run(); } + + lck_mtx_lock_spin(&inp->input_lck); + VERIFY(inp->input_waiting & DLIL_INPUT_RUNNING); + /* main input thread cannot be terminated */ + VERIFY(!(inp->input_waiting & DLIL_INPUT_TERMINATE)); + if (!(inp->input_waiting & ~DLIL_INPUT_RUNNING)) { + break; + } } - /* NOTREACHED */ + inp->input_waiting &= ~DLIL_INPUT_RUNNING; + (void) assert_wait(&inp->input_waiting, THREAD_UNINT); + lck_mtx_unlock(&inp->input_lck); + (void) thread_block_parameter(dlil_main_input_thread_cont, inp); + VERIFY(0); /* we should never get here */ + /* NOTREACHED */ + __builtin_unreachable(); } /* * Input thread for interfaces with legacy input model. */ +__attribute__((noreturn)) static void dlil_input_thread_func(void *v, wait_result_t w) { @@ -2072,30 +2189,52 @@ dlil_input_thread_func(void *v, wait_result_t w) struct dlil_threading_info *inp = v; struct ifnet *ifp = inp->ifp; - /* Construct the name for this thread, and then apply it. */ + VERIFY(inp != dlil_main_input_thread); + VERIFY(ifp != NULL); + VERIFY(!(ifp->if_eflags & IFEF_RXPOLL) || !net_rxpoll || + !(ifp->if_xflags & IFXF_LEGACY)); + VERIFY(ifp->if_poll_mode == IFNET_MODEL_INPUT_POLL_OFF || + !(ifp->if_xflags & IFXF_LEGACY)); + VERIFY(current_thread() == inp->input_thr); + + /* construct the name for this thread, and then apply it */ bzero(thread_name, sizeof(thread_name)); - snprintf(thread_name, sizeof(thread_name), "dlil_input_%s", ifp->if_xname); + (void) snprintf(thread_name, sizeof(thread_name), + "dlil_input_%s", ifp->if_xname); thread_set_thread_name(inp->input_thr, thread_name); + ifnet_decr_pending_thread_count(ifp); - VERIFY(inp != dlil_main_input_thread); - VERIFY(ifp != NULL); - VERIFY(!(ifp->if_eflags & IFEF_RXPOLL) || !net_rxpoll); - VERIFY(inp->mode == IFNET_MODEL_INPUT_POLL_OFF); + lck_mtx_lock(&inp->input_lck); + VERIFY(!(inp->input_waiting & DLIL_INPUT_RUNNING)); + (void) assert_wait(&inp->input_waiting, THREAD_UNINT); + lck_mtx_unlock(&inp->input_lck); + (void) thread_block_parameter(dlil_input_thread_cont, inp); + /* NOTREACHED */ + __builtin_unreachable(); +} + +__attribute__((noreturn)) +static void +dlil_input_thread_cont(void *v, wait_result_t wres) +{ + struct dlil_threading_info *inp = v; + struct ifnet *ifp = inp->ifp; + + lck_mtx_lock_spin(&inp->input_lck); + if (__improbable(wres == THREAD_INTERRUPTED || + (inp->input_waiting & DLIL_INPUT_TERMINATE))) { + goto terminate; + } + + VERIFY(!(inp->input_waiting & DLIL_INPUT_RUNNING)); + inp->input_waiting |= DLIL_INPUT_RUNNING; while (1) { struct mbuf *m = NULL; + classq_pkt_t pkt = CLASSQ_PKT_INITIALIZER(pkt); + boolean_t notify = FALSE; u_int32_t m_cnt; - lck_mtx_lock_spin(&inp->input_lck); - - /* Wait until there is work to be done */ - while (!(inp->input_waiting & ~DLIL_INPUT_RUNNING)) { - inp->input_waiting &= ~DLIL_INPUT_RUNNING; - (void) msleep(&inp->input_waiting, &inp->input_lck, - (PZERO - 1) | PSPIN, inp->input_name, NULL); - } - - inp->input_waiting |= DLIL_INPUT_RUNNING; inp->input_waiting &= ~DLIL_INPUT_WAITING; /* @@ -2110,27 +2249,19 @@ dlil_input_thread_func(void *v, wait_result_t w) /* Packets for this interface */ m_cnt = qlen(&inp->rcvq_pkts); - m = _getq_all(&inp->rcvq_pkts, NULL, NULL, NULL); - - if (inp->input_waiting & DLIL_INPUT_TERMINATE) { - lck_mtx_unlock(&inp->input_lck); - - /* Free up pending packets */ - if (m != NULL) { - mbuf_freem_list(m); - } - - dlil_terminate_input_thread(inp); - /* NOTREACHED */ - return; - } + _getq_all(&inp->rcvq_pkts, &pkt, NULL, NULL, NULL); + m = pkt.cp_mbuf; inp->wtot = 0; - dlil_input_stats_sync(ifp, inp); + notify = dlil_input_stats_sync(ifp, inp); lck_mtx_unlock(&inp->input_lck); + if (notify) { + ifnet_notify_data_threshold(ifp); + } + /* * NOTE warning %%% attention !!!! * We should think about putting some thread starvation @@ -2138,38 +2269,97 @@ dlil_input_thread_func(void *v, wait_result_t w) */ if (m != NULL) { dlil_input_packet_list_extended(NULL, m, - m_cnt, inp->mode); + m_cnt, ifp->if_poll_mode); + } + + lck_mtx_lock_spin(&inp->input_lck); + VERIFY(inp->input_waiting & DLIL_INPUT_RUNNING); + if (!(inp->input_waiting & ~DLIL_INPUT_RUNNING)) { + break; } } - /* NOTREACHED */ + inp->input_waiting &= ~DLIL_INPUT_RUNNING; + + if (__improbable(inp->input_waiting & DLIL_INPUT_TERMINATE)) { +terminate: + lck_mtx_unlock(&inp->input_lck); + dlil_terminate_input_thread(inp); + /* NOTREACHED */ + } else { + (void) assert_wait(&inp->input_waiting, THREAD_UNINT); + lck_mtx_unlock(&inp->input_lck); + (void) thread_block_parameter(dlil_input_thread_cont, inp); + /* NOTREACHED */ + } + VERIFY(0); /* we should never get here */ + /* NOTREACHED */ + __builtin_unreachable(); } /* * Input thread for interfaces with opportunistic polling input model. */ +__attribute__((noreturn)) static void dlil_rxpoll_input_thread_func(void *v, wait_result_t w) { #pragma unused(w) + char thread_name[MAXTHREADNAMESIZE]; struct dlil_threading_info *inp = v; struct ifnet *ifp = inp->ifp; - struct timespec ts; VERIFY(inp != dlil_main_input_thread); - VERIFY(ifp != NULL && (ifp->if_eflags & IFEF_RXPOLL)); + VERIFY(ifp != NULL && (ifp->if_eflags & IFEF_RXPOLL) && + (ifp->if_xflags & IFXF_LEGACY)); + VERIFY(current_thread() == inp->input_thr); + + /* construct the name for this thread, and then apply it */ + bzero(thread_name, sizeof(thread_name)); + (void) snprintf(thread_name, sizeof(thread_name), + "dlil_input_poll_%s", ifp->if_xname); + thread_set_thread_name(inp->input_thr, thread_name); + ifnet_decr_pending_thread_count(ifp); + + lck_mtx_lock(&inp->input_lck); + VERIFY(!(inp->input_waiting & DLIL_INPUT_RUNNING)); + (void) assert_wait(&inp->input_waiting, THREAD_UNINT); + lck_mtx_unlock(&inp->input_lck); + (void) thread_block_parameter(dlil_rxpoll_input_thread_cont, inp); + /* NOTREACHED */ + __builtin_unreachable(); +} + +__attribute__((noreturn)) +static void +dlil_rxpoll_input_thread_cont(void *v, wait_result_t wres) +{ + struct dlil_threading_info *inp = v; + struct ifnet *ifp = inp->ifp; + struct timespec ts; + + lck_mtx_lock_spin(&inp->input_lck); + if (__improbable(wres == THREAD_INTERRUPTED || + (inp->input_waiting & DLIL_INPUT_TERMINATE))) { + goto terminate; + } + + VERIFY(!(inp->input_waiting & DLIL_INPUT_RUNNING)); + inp->input_waiting |= DLIL_INPUT_RUNNING; while (1) { struct mbuf *m = NULL; u_int32_t m_cnt, m_size, poll_req = 0; ifnet_model_t mode; struct timespec now, delta; + classq_pkt_t pkt = CLASSQ_PKT_INITIALIZER(pkt); + boolean_t notify; u_int64_t ival; - lck_mtx_lock_spin(&inp->input_lck); + inp->input_waiting &= ~DLIL_INPUT_WAITING; - if ((ival = inp->rxpoll_ival) < IF_RXPOLL_INTERVALTIME_MIN) { + if ((ival = ifp->if_rxpoll_ival) < IF_RXPOLL_INTERVALTIME_MIN) { ival = IF_RXPOLL_INTERVALTIME_MIN; } @@ -2180,17 +2370,7 @@ dlil_rxpoll_input_thread_func(void *v, wait_result_t w) } /* Current operating mode */ - mode = inp->mode; - - /* Wait until there is work to be done */ - while (!(inp->input_waiting & ~DLIL_INPUT_RUNNING)) { - inp->input_waiting &= ~DLIL_INPUT_RUNNING; - (void) msleep(&inp->input_waiting, &inp->input_lck, - (PZERO - 1) | PSPIN, inp->input_name, NULL); - } - - inp->input_waiting |= DLIL_INPUT_RUNNING; - inp->input_waiting &= ~DLIL_INPUT_WAITING; + mode = ifp->if_poll_mode; /* * Protocol registration and injection must always use @@ -2202,22 +2382,6 @@ dlil_rxpoll_input_thread_func(void *v, wait_result_t w) VERIFY(!(inp->input_waiting & (DLIL_PROTO_WAITING | DLIL_PROTO_REGISTER))); - if (inp->input_waiting & DLIL_INPUT_TERMINATE) { - /* Free up pending packets */ - lck_mtx_convert_spin(&inp->input_lck); - _flushq(&inp->rcvq_pkts); - if (inp->input_mit_tcall != NULL) { - if (thread_call_isactive(inp->input_mit_tcall)) { - thread_call_cancel(inp->input_mit_tcall); - } - } - lck_mtx_unlock(&inp->input_lck); - - dlil_terminate_input_thread(inp); - /* NOTREACHED */ - return; - } - /* Total count of all packets */ m_cnt = qlen(&inp->rcvq_pkts); @@ -2225,116 +2389,121 @@ dlil_rxpoll_input_thread_func(void *v, wait_result_t w) m_size = qsize(&inp->rcvq_pkts); /* Packets for this interface */ - m = _getq_all(&inp->rcvq_pkts, NULL, NULL, NULL); + _getq_all(&inp->rcvq_pkts, &pkt, NULL, NULL, NULL); + m = pkt.cp_mbuf; VERIFY(m != NULL || m_cnt == 0); nanouptime(&now); - if (!net_timerisset(&inp->sample_lasttime)) { - *(&inp->sample_lasttime) = *(&now); + if (!net_timerisset(&ifp->if_poll_sample_lasttime)) { + *(&ifp->if_poll_sample_lasttime) = *(&now); } - net_timersub(&now, &inp->sample_lasttime, &delta); - if (if_rxpoll && net_timerisset(&inp->sample_holdtime)) { + net_timersub(&now, &ifp->if_poll_sample_lasttime, &delta); + if (if_rxpoll && net_timerisset(&ifp->if_poll_sample_holdtime)) { u_int32_t ptot, btot; /* Accumulate statistics for current sampling */ - PKTCNTR_ADD(&inp->sstats, m_cnt, m_size); + PKTCNTR_ADD(&ifp->if_poll_sstats, m_cnt, m_size); - if (net_timercmp(&delta, &inp->sample_holdtime, <)) { + if (net_timercmp(&delta, &ifp->if_poll_sample_holdtime, <)) { goto skip; } - *(&inp->sample_lasttime) = *(&now); + *(&ifp->if_poll_sample_lasttime) = *(&now); /* Calculate min/max of inbound bytes */ - btot = (u_int32_t)inp->sstats.bytes; - if (inp->rxpoll_bmin == 0 || inp->rxpoll_bmin > btot) { - inp->rxpoll_bmin = btot; + btot = (u_int32_t)ifp->if_poll_sstats.bytes; + if (ifp->if_rxpoll_bmin == 0 || ifp->if_rxpoll_bmin > btot) { + ifp->if_rxpoll_bmin = btot; } - if (btot > inp->rxpoll_bmax) { - inp->rxpoll_bmax = btot; + if (btot > ifp->if_rxpoll_bmax) { + ifp->if_rxpoll_bmax = btot; } /* Calculate EWMA of inbound bytes */ - DLIL_EWMA(inp->rxpoll_bavg, btot, if_rxpoll_decay); + DLIL_EWMA(ifp->if_rxpoll_bavg, btot, if_rxpoll_decay); /* Calculate min/max of inbound packets */ - ptot = (u_int32_t)inp->sstats.packets; - if (inp->rxpoll_pmin == 0 || inp->rxpoll_pmin > ptot) { - inp->rxpoll_pmin = ptot; + ptot = (u_int32_t)ifp->if_poll_sstats.packets; + if (ifp->if_rxpoll_pmin == 0 || ifp->if_rxpoll_pmin > ptot) { + ifp->if_rxpoll_pmin = ptot; } - if (ptot > inp->rxpoll_pmax) { - inp->rxpoll_pmax = ptot; + if (ptot > ifp->if_rxpoll_pmax) { + ifp->if_rxpoll_pmax = ptot; } /* Calculate EWMA of inbound packets */ - DLIL_EWMA(inp->rxpoll_pavg, ptot, if_rxpoll_decay); + DLIL_EWMA(ifp->if_rxpoll_pavg, ptot, if_rxpoll_decay); /* Reset sampling statistics */ - PKTCNTR_CLEAR(&inp->sstats); + PKTCNTR_CLEAR(&ifp->if_poll_sstats); /* Calculate EWMA of wakeup requests */ - DLIL_EWMA(inp->rxpoll_wavg, inp->wtot, if_rxpoll_decay); + DLIL_EWMA(ifp->if_rxpoll_wavg, inp->wtot, if_rxpoll_decay); inp->wtot = 0; if (dlil_verbose) { - if (!net_timerisset(&inp->dbg_lasttime)) { - *(&inp->dbg_lasttime) = *(&now); + if (!net_timerisset(&ifp->if_poll_dbg_lasttime)) { + *(&ifp->if_poll_dbg_lasttime) = *(&now); } - net_timersub(&now, &inp->dbg_lasttime, &delta); + net_timersub(&now, &ifp->if_poll_dbg_lasttime, &delta); if (net_timercmp(&delta, &dlil_dbgrate, >=)) { - *(&inp->dbg_lasttime) = *(&now); - printf("%s: [%s] pkts avg %d max %d " + *(&ifp->if_poll_dbg_lasttime) = *(&now); + DLIL_PRINTF("%s: [%s] pkts avg %d max %d " "limits [%d/%d], wreq avg %d " "limits [%d/%d], bytes avg %d " "limits [%d/%d]\n", if_name(ifp), - (inp->mode == + (ifp->if_poll_mode == IFNET_MODEL_INPUT_POLL_ON) ? - "ON" : "OFF", inp->rxpoll_pavg, - inp->rxpoll_pmax, - inp->rxpoll_plowat, - inp->rxpoll_phiwat, - inp->rxpoll_wavg, - inp->rxpoll_wlowat, - inp->rxpoll_whiwat, - inp->rxpoll_bavg, - inp->rxpoll_blowat, - inp->rxpoll_bhiwat); + "ON" : "OFF", ifp->if_rxpoll_pavg, + ifp->if_rxpoll_pmax, + ifp->if_rxpoll_plowat, + ifp->if_rxpoll_phiwat, + ifp->if_rxpoll_wavg, + ifp->if_rxpoll_wlowat, + ifp->if_rxpoll_whiwat, + ifp->if_rxpoll_bavg, + ifp->if_rxpoll_blowat, + ifp->if_rxpoll_bhiwat); } } /* Perform mode transition, if necessary */ - if (!net_timerisset(&inp->mode_lasttime)) { - *(&inp->mode_lasttime) = *(&now); + if (!net_timerisset(&ifp->if_poll_mode_lasttime)) { + *(&ifp->if_poll_mode_lasttime) = *(&now); } - net_timersub(&now, &inp->mode_lasttime, &delta); - if (net_timercmp(&delta, &inp->mode_holdtime, <)) { + net_timersub(&now, &ifp->if_poll_mode_lasttime, &delta); + if (net_timercmp(&delta, &ifp->if_poll_mode_holdtime, <)) { goto skip; } - if (inp->rxpoll_pavg <= inp->rxpoll_plowat && - inp->rxpoll_bavg <= inp->rxpoll_blowat && - inp->mode != IFNET_MODEL_INPUT_POLL_OFF) { + if (ifp->if_rxpoll_pavg <= ifp->if_rxpoll_plowat && + ifp->if_rxpoll_bavg <= ifp->if_rxpoll_blowat && + ifp->if_poll_mode != IFNET_MODEL_INPUT_POLL_OFF) { mode = IFNET_MODEL_INPUT_POLL_OFF; - } else if (inp->rxpoll_pavg >= inp->rxpoll_phiwat && - (inp->rxpoll_bavg >= inp->rxpoll_bhiwat || - inp->rxpoll_wavg >= inp->rxpoll_whiwat) && - inp->mode != IFNET_MODEL_INPUT_POLL_ON) { + } else if (ifp->if_rxpoll_pavg >= ifp->if_rxpoll_phiwat && + (ifp->if_rxpoll_bavg >= ifp->if_rxpoll_bhiwat || + ifp->if_rxpoll_wavg >= ifp->if_rxpoll_whiwat) && + ifp->if_poll_mode != IFNET_MODEL_INPUT_POLL_ON) { mode = IFNET_MODEL_INPUT_POLL_ON; } - if (mode != inp->mode) { - inp->mode = mode; - *(&inp->mode_lasttime) = *(&now); + if (mode != ifp->if_poll_mode) { + ifp->if_poll_mode = mode; + *(&ifp->if_poll_mode_lasttime) = *(&now); poll_req++; } } skip: - dlil_input_stats_sync(ifp, inp); + notify = dlil_input_stats_sync(ifp, inp); lck_mtx_unlock(&inp->input_lck); + if (notify) { + ifnet_notify_data_threshold(ifp); + } + /* * If there's a mode change and interface is still attached, * perform a downcall to the driver for the new mode. Also @@ -2342,27 +2511,29 @@ skip: * being detached (will be release below.) */ if (poll_req != 0 && ifnet_is_attached(ifp, 1)) { - struct ifnet_model_params p = { mode, { 0 } }; + struct ifnet_model_params p = { + .model = mode, .reserved = { 0 } + }; errno_t err; if (dlil_verbose) { - printf("%s: polling is now %s, " + DLIL_PRINTF("%s: polling is now %s, " "pkts avg %d max %d limits [%d/%d], " "wreq avg %d limits [%d/%d], " "bytes avg %d limits [%d/%d]\n", if_name(ifp), (mode == IFNET_MODEL_INPUT_POLL_ON) ? - "ON" : "OFF", inp->rxpoll_pavg, - inp->rxpoll_pmax, inp->rxpoll_plowat, - inp->rxpoll_phiwat, inp->rxpoll_wavg, - inp->rxpoll_wlowat, inp->rxpoll_whiwat, - inp->rxpoll_bavg, inp->rxpoll_blowat, - inp->rxpoll_bhiwat); + "ON" : "OFF", ifp->if_rxpoll_pavg, + ifp->if_rxpoll_pmax, ifp->if_rxpoll_plowat, + ifp->if_rxpoll_phiwat, ifp->if_rxpoll_wavg, + ifp->if_rxpoll_wlowat, ifp->if_rxpoll_whiwat, + ifp->if_rxpoll_bavg, ifp->if_rxpoll_blowat, + ifp->if_rxpoll_bhiwat); } if ((err = ((*ifp->if_input_ctl)(ifp, IFNET_CTL_SET_INPUT_MODEL, sizeof(p), &p))) != 0) { - printf("%s: error setting polling mode " + DLIL_PRINTF("%s: error setting polling mode " "to %s (%d)\n", if_name(ifp), (mode == IFNET_MODEL_INPUT_POLL_ON) ? "ON" : "OFF", err); @@ -2371,9 +2542,9 @@ skip: switch (mode) { case IFNET_MODEL_INPUT_POLL_OFF: ifnet_set_poll_cycle(ifp, NULL); - inp->rxpoll_offreq++; + ifp->if_rxpoll_offreq++; if (err != 0) { - inp->rxpoll_offerr++; + ifp->if_rxpoll_offerr++; } break; @@ -2381,9 +2552,9 @@ skip: net_nsectimer(&ival, &ts); ifnet_set_poll_cycle(ifp, &ts); ifnet_poll(ifp); - inp->rxpoll_onreq++; + ifp->if_rxpoll_onreq++; if (err != 0) { - inp->rxpoll_onerr++; + ifp->if_rxpoll_onerr++; } break; @@ -2404,28 +2575,37 @@ skip: if (m != NULL) { dlil_input_packet_list_extended(NULL, m, m_cnt, mode); } + + lck_mtx_lock_spin(&inp->input_lck); + VERIFY(inp->input_waiting & DLIL_INPUT_RUNNING); + if (!(inp->input_waiting & ~DLIL_INPUT_RUNNING)) { + break; + } + } + + inp->input_waiting &= ~DLIL_INPUT_RUNNING; + + if (__improbable(inp->input_waiting & DLIL_INPUT_TERMINATE)) { +terminate: + lck_mtx_unlock(&inp->input_lck); + dlil_terminate_input_thread(inp); + /* NOTREACHED */ + } else { + (void) assert_wait(&inp->input_waiting, THREAD_UNINT); + lck_mtx_unlock(&inp->input_lck); + (void) thread_block_parameter(dlil_rxpoll_input_thread_cont, + inp); + /* NOTREACHED */ } - /* NOTREACHED */ VERIFY(0); /* we should never get here */ + /* NOTREACHED */ + __builtin_unreachable(); } -/* - * Must be called on an attached ifnet (caller is expected to check.) - * Caller may pass NULL for poll parameters to indicate "auto-tuning." - */ errno_t -dlil_rxpoll_set_params(struct ifnet *ifp, struct ifnet_poll_params *p, - boolean_t locked) +dlil_rxpoll_validate_params(struct ifnet_poll_params *p) { - struct dlil_threading_info *inp; - u_int64_t sample_holdtime, inbw; - - VERIFY(ifp != NULL); - if (!(ifp->if_eflags & IFEF_RXPOLL) || (inp = ifp->if_inp) == NULL) { - return ENXIO; - } - if (p != NULL) { if ((p->packets_lowat == 0 && p->packets_hiwat != 0) || (p->packets_lowat != 0 && p->packets_hiwat == 0)) { @@ -2448,33 +2628,22 @@ dlil_rxpoll_set_params(struct ifnet *ifp, struct ifnet_poll_params *p, p->interval_time = IF_RXPOLL_INTERVALTIME_MIN; } } + return 0; +} - if (!locked) { - lck_mtx_lock(&inp->input_lck); - } - - LCK_MTX_ASSERT(&inp->input_lck, LCK_MTX_ASSERT_OWNED); - - /* - * Normally, we'd reset the parameters to the auto-tuned values - * if the the input thread detects a change in link rate. If the - * driver provides its own parameters right after a link rate - * changes, but before the input thread gets to run, we want to - * make sure to keep the driver's values. Clearing if_poll_update - * will achieve that. - */ - if (p != NULL && !locked && ifp->if_poll_update != 0) { - ifp->if_poll_update = 0; - } +void +dlil_rxpoll_update_params(struct ifnet *ifp, struct ifnet_poll_params *p) +{ + u_int64_t sample_holdtime, inbw; if ((inbw = ifnet_input_linkrate(ifp)) == 0 && p == NULL) { sample_holdtime = 0; /* polling is disabled */ - inp->rxpoll_wlowat = inp->rxpoll_plowat = - inp->rxpoll_blowat = 0; - inp->rxpoll_whiwat = inp->rxpoll_phiwat = - inp->rxpoll_bhiwat = (u_int32_t)-1; - inp->rxpoll_plim = 0; - inp->rxpoll_ival = IF_RXPOLL_INTERVALTIME_MIN; + ifp->if_rxpoll_wlowat = ifp->if_rxpoll_plowat = + ifp->if_rxpoll_blowat = 0; + ifp->if_rxpoll_whiwat = ifp->if_rxpoll_phiwat = + ifp->if_rxpoll_bhiwat = (u_int32_t)-1; + ifp->if_rxpoll_plim = 0; + ifp->if_rxpoll_ival = IF_RXPOLL_INTERVALTIME_MIN; } else { u_int32_t plowat, phiwat, blowat, bhiwat, plim; u_int64_t ival; @@ -2505,33 +2674,71 @@ dlil_rxpoll_set_params(struct ifnet *ifp, struct ifnet_poll_params *p, VERIFY(ival >= IF_RXPOLL_INTERVALTIME_MIN); sample_holdtime = if_rxpoll_sample_holdtime; - inp->rxpoll_wlowat = if_rxpoll_wlowat; - inp->rxpoll_whiwat = if_rxpoll_whiwat; - inp->rxpoll_plowat = plowat; - inp->rxpoll_phiwat = phiwat; - inp->rxpoll_blowat = blowat; - inp->rxpoll_bhiwat = bhiwat; - inp->rxpoll_plim = plim; - inp->rxpoll_ival = ival; + ifp->if_rxpoll_wlowat = if_sysctl_rxpoll_wlowat; + ifp->if_rxpoll_whiwat = if_sysctl_rxpoll_whiwat; + ifp->if_rxpoll_plowat = plowat; + ifp->if_rxpoll_phiwat = phiwat; + ifp->if_rxpoll_blowat = blowat; + ifp->if_rxpoll_bhiwat = bhiwat; + ifp->if_rxpoll_plim = plim; + ifp->if_rxpoll_ival = ival; } - net_nsectimer(&if_rxpoll_mode_holdtime, &inp->mode_holdtime); - net_nsectimer(&sample_holdtime, &inp->sample_holdtime); + net_nsectimer(&if_rxpoll_mode_holdtime, &ifp->if_poll_mode_holdtime); + net_nsectimer(&sample_holdtime, &ifp->if_poll_sample_holdtime); if (dlil_verbose) { - printf("%s: speed %llu bps, sample per %llu nsec, " + DLIL_PRINTF("%s: speed %llu bps, sample per %llu nsec, " "poll interval %llu nsec, pkts per poll %u, " "pkt limits [%u/%u], wreq limits [%u/%u], " "bytes limits [%u/%u]\n", if_name(ifp), - inbw, sample_holdtime, inp->rxpoll_ival, inp->rxpoll_plim, - inp->rxpoll_plowat, inp->rxpoll_phiwat, inp->rxpoll_wlowat, - inp->rxpoll_whiwat, inp->rxpoll_blowat, inp->rxpoll_bhiwat); + inbw, sample_holdtime, ifp->if_rxpoll_ival, + ifp->if_rxpoll_plim, ifp->if_rxpoll_plowat, + ifp->if_rxpoll_phiwat, ifp->if_rxpoll_wlowat, + ifp->if_rxpoll_whiwat, ifp->if_rxpoll_blowat, + ifp->if_rxpoll_bhiwat); + } +} + +/* + * Must be called on an attached ifnet (caller is expected to check.) + * Caller may pass NULL for poll parameters to indicate "auto-tuning." + */ +errno_t +dlil_rxpoll_set_params(struct ifnet *ifp, struct ifnet_poll_params *p, + boolean_t locked) +{ + errno_t err; + struct dlil_threading_info *inp; + + VERIFY(ifp != NULL); + if (!(ifp->if_eflags & IFEF_RXPOLL) || (inp = ifp->if_inp) == NULL) { + return ENXIO; + } + err = dlil_rxpoll_validate_params(p); + if (err != 0) { + return err; } + if (!locked) { + lck_mtx_lock(&inp->input_lck); + } + LCK_MTX_ASSERT(&inp->input_lck, LCK_MTX_ASSERT_OWNED); + /* + * Normally, we'd reset the parameters to the auto-tuned values + * if the the input thread detects a change in link rate. If the + * driver provides its own parameters right after a link rate + * changes, but before the input thread gets to run, we want to + * make sure to keep the driver's values. Clearing if_poll_update + * will achieve that. + */ + if (p != NULL && !locked && ifp->if_poll_update != 0) { + ifp->if_poll_update = 0; + } + dlil_rxpoll_update_params(ifp, p); if (!locked) { lck_mtx_unlock(&inp->input_lck); } - return 0; } @@ -2551,12 +2758,12 @@ dlil_rxpoll_get_params(struct ifnet *ifp, struct ifnet_poll_params *p) bzero(p, sizeof(*p)); lck_mtx_lock(&inp->input_lck); - p->packets_limit = inp->rxpoll_plim; - p->packets_lowat = inp->rxpoll_plowat; - p->packets_hiwat = inp->rxpoll_phiwat; - p->bytes_lowat = inp->rxpoll_blowat; - p->bytes_hiwat = inp->rxpoll_bhiwat; - p->interval_time = inp->rxpoll_ival; + p->packets_limit = ifp->if_rxpoll_plim; + p->packets_lowat = ifp->if_rxpoll_plowat; + p->packets_hiwat = ifp->if_rxpoll_phiwat; + p->bytes_lowat = ifp->if_rxpoll_blowat; + p->bytes_hiwat = ifp->if_rxpoll_bhiwat; + p->interval_time = ifp->if_rxpoll_ival; lck_mtx_unlock(&inp->input_lck); return 0; @@ -2576,6 +2783,14 @@ ifnet_input_extended(struct ifnet *ifp, struct mbuf *m_head, return ifnet_input_common(ifp, m_head, m_tail, s, TRUE, FALSE); } +errno_t +ifnet_input_poll(struct ifnet *ifp, struct mbuf *m_head, + struct mbuf *m_tail, const struct ifnet_stat_increment_param *s) +{ + return ifnet_input_common(ifp, m_head, m_tail, s, + (m_head != NULL), TRUE); +} + static errno_t ifnet_input_common(struct ifnet *ifp, struct mbuf *m_head, struct mbuf *m_tail, const struct ifnet_stat_increment_param *s, boolean_t ext, boolean_t poll) @@ -2602,7 +2817,7 @@ ifnet_input_common(struct ifnet *ifp, struct mbuf *m_head, struct mbuf *m_tail, * interface is no longer attached; else hold an IO refcnt to * prevent it from being detached (will be released below.) */ - if (ifp == NULL || (ifp != lo_ifp && !ifnet_is_attached(ifp, 1))) { + if (ifp == NULL || (ifp != lo_ifp && !ifnet_datamov_begin(ifp))) { if (m_head != NULL) { mbuf_freem_list(m_head); } @@ -2685,7 +2900,7 @@ ifnet_input_common(struct ifnet *ifp, struct mbuf *m_head, struct mbuf *m_tail, if (ifp != lo_ifp) { /* Release the IO refcnt */ - ifnet_decr_iorefcnt(ifp); + ifnet_datamov_end(ifp); } return err; @@ -2706,6 +2921,7 @@ dlil_input_handler(struct ifnet *ifp, struct mbuf *m_head, struct dlil_threading_info *inp; u_int32_t m_cnt = s->packets_in; u_int32_t m_size = s->bytes_in; + boolean_t notify = FALSE; if ((inp = ifp->if_inp) == NULL) { inp = dlil_main_input_thread; @@ -2753,13 +2969,16 @@ dlil_input_handler(struct ifnet *ifp, struct mbuf *m_head, * dedicated input threads go to the regular list. */ if (m_head != NULL) { + classq_pkt_t head, tail; + CLASSQ_PKT_INIT_MBUF(&head, m_head); + CLASSQ_PKT_INIT_MBUF(&tail, m_tail); if (inp == dlil_main_input_thread && ifp == lo_ifp) { struct dlil_main_threading_info *inpm = (struct dlil_main_threading_info *)inp; - _addq_multi(&inpm->lo_rcvq_pkts, m_head, m_tail, + _addq_multi(&inpm->lo_rcvq_pkts, &head, &tail, m_cnt, m_size); } else { - _addq_multi(&inp->rcvq_pkts, m_head, m_tail, + _addq_multi(&inp->rcvq_pkts, &head, &tail, m_cnt, m_size); } } @@ -2784,7 +3003,7 @@ dlil_input_handler(struct ifnet *ifp, struct mbuf *m_head, } #endif /* IFNET_INPUT_SANITY_CHK */ - dlil_input_stats_add(s, inp, poll); + dlil_input_stats_add(s, inp, ifp, poll); /* * If we're using the main input thread, synchronize the * stats now since we have the interface context. All @@ -2792,31 +3011,20 @@ dlil_input_handler(struct ifnet *ifp, struct mbuf *m_head, * have their stats synchronized there. */ if (inp == dlil_main_input_thread) { - dlil_input_stats_sync(ifp, inp); - } - - if (inp->input_mit_tcall && - qlen(&inp->rcvq_pkts) >= dlil_rcv_mit_pkts_min && - qlen(&inp->rcvq_pkts) < dlil_rcv_mit_pkts_max && - (ifp->if_family == IFNET_FAMILY_ETHERNET || - ifp->if_type == IFT_CELLULAR) - ) { - if (!thread_call_isactive(inp->input_mit_tcall)) { - uint64_t deadline; - clock_interval_to_deadline(dlil_rcv_mit_interval, - 1, &deadline); - (void) thread_call_enter_delayed( - inp->input_mit_tcall, deadline); - } - } else { - inp->input_waiting |= DLIL_INPUT_WAITING; - if (!(inp->input_waiting & DLIL_INPUT_RUNNING)) { - inp->wtot++; - wakeup_one((caddr_t)&inp->input_waiting); - } + notify = dlil_input_stats_sync(ifp, inp); + } + + inp->input_waiting |= DLIL_INPUT_WAITING; + if (!(inp->input_waiting & DLIL_INPUT_RUNNING)) { + inp->wtot++; + wakeup_one((caddr_t)&inp->input_waiting); } lck_mtx_unlock(&inp->input_lck); + if (notify) { + ifnet_notify_data_threshold(ifp); + } + return 0; } @@ -2858,22 +3066,20 @@ ifnet_start(struct ifnet *ifp) ifnet_start_common(ifp, FALSE); } +__attribute__((noreturn)) static void -ifnet_start_thread_fn(void *v, wait_result_t w) +ifnet_start_thread_func(void *v, wait_result_t w) { #pragma unused(w) struct ifnet *ifp = v; - char ifname[IFNAMSIZ + 1]; char thread_name[MAXTHREADNAMESIZE]; - struct timespec *ts = NULL; - struct ifclassq *ifq = &ifp->if_snd; - struct timespec delay_start_ts; /* Construct the name for this thread, and then apply it. */ bzero(thread_name, sizeof(thread_name)); (void) snprintf(thread_name, sizeof(thread_name), "ifnet_start_%s", ifp->if_xname); - thread_set_thread_name(ifp->if_start_thread, thread_name); + ASSERT(ifp->if_start_thread == current_thread()); + thread_set_thread_name(current_thread(), thread_name); /* * Treat the dedicated starter thread for lo0 as equivalent to @@ -2901,86 +3107,89 @@ ifnet_start_thread_fn(void *v, wait_result_t w) lck_mtx_unlock(&inp->input_lck); } } + ifnet_decr_pending_thread_count(ifp); - (void) snprintf(ifname, sizeof(ifname), "%s_starter", if_name(ifp)); + lck_mtx_lock(&ifp->if_start_lock); + VERIFY(!ifp->if_start_active); + (void) assert_wait(&ifp->if_start_thread, THREAD_UNINT); + lck_mtx_unlock(&ifp->if_start_lock); + (void) thread_block_parameter(ifnet_start_thread_cont, ifp); + /* NOTREACHED */ + __builtin_unreachable(); +} - lck_mtx_lock_spin(&ifp->if_start_lock); +__attribute__((noreturn)) +static void +ifnet_start_thread_cont(void *v, wait_result_t wres) +{ + struct ifnet *ifp = v; + struct ifclassq *ifq = &ifp->if_snd; - for (;;) { - if (ifp->if_start_thread != NULL) { - (void) msleep(&ifp->if_start_thread, - &ifp->if_start_lock, - (PZERO - 1) | PSPIN, ifname, ts); - } - /* interface is detached? */ - if (ifp->if_start_thread == THREAD_NULL) { - ifnet_set_start_cycle(ifp, NULL); - lck_mtx_unlock(&ifp->if_start_lock); - ifnet_purge(ifp); + lck_mtx_lock(&ifp->if_start_lock); + if (__improbable(wres == THREAD_INTERRUPTED || + ifp->if_start_thread == THREAD_NULL)) { + goto terminate; + } - if (dlil_verbose) { - printf("%s: starter thread terminated\n", - if_name(ifp)); - } + ifp->if_start_active = 1; - /* for the extra refcnt from kernel_thread_start() */ - thread_deallocate(current_thread()); - /* this is the end */ - thread_terminate(current_thread()); - /* NOTREACHED */ - return; + /* + * Keep on servicing until no more request. + */ + for (;;) { + u_int32_t req = ifp->if_start_req; + if (!IFCQ_IS_EMPTY(ifq) && + (ifp->if_eflags & IFEF_ENQUEUE_MULTI) && + ifp->if_start_delayed == 0 && + IFCQ_LEN(ifq) < ifp->if_start_delay_qlen && + (ifp->if_eflags & IFEF_DELAY_START)) { + ifp->if_start_delayed = 1; + ifnet_start_delayed++; + break; + } else { + ifp->if_start_delayed = 0; } + lck_mtx_unlock(&ifp->if_start_lock); - ifp->if_start_active = 1; + /* + * If no longer attached, don't call start because ifp + * is being destroyed; else hold an IO refcnt to + * prevent the interface from being detached (will be + * released below.) + */ + if (!ifnet_datamov_begin(ifp)) { + lck_mtx_lock_spin(&ifp->if_start_lock); + break; + } - for (;;) { - u_int32_t req = ifp->if_start_req; - if (!IFCQ_IS_EMPTY(ifq) && - (ifp->if_eflags & IFEF_ENQUEUE_MULTI) && - ifp->if_start_delayed == 0 && - IFCQ_LEN(ifq) < ifp->if_start_delay_qlen && - (ifp->if_eflags & IFEF_DELAY_START)) { - ifp->if_start_delayed = 1; - ifnet_start_delayed++; - break; - } else { - ifp->if_start_delayed = 0; - } - lck_mtx_unlock(&ifp->if_start_lock); + /* invoke the driver's start routine */ + ((*ifp->if_start)(ifp)); - /* - * If no longer attached, don't call start because ifp - * is being destroyed; else hold an IO refcnt to - * prevent the interface from being detached (will be - * released below.) - */ - if (!ifnet_is_attached(ifp, 1)) { - lck_mtx_lock_spin(&ifp->if_start_lock); - break; - } + /* + * Release the io ref count taken above. + */ + ifnet_datamov_end(ifp); - /* invoke the driver's start routine */ - ((*ifp->if_start)(ifp)); + lck_mtx_lock_spin(&ifp->if_start_lock); - /* - * Release the io ref count taken by ifnet_is_attached. - */ - ifnet_decr_iorefcnt(ifp); + /* + * If there's no pending request or if the + * interface has been disabled, we're done. + */ + if (req == ifp->if_start_req || + (ifp->if_start_flags & IFSF_FLOW_CONTROLLED)) { + break; + } + } - lck_mtx_lock_spin(&ifp->if_start_lock); + ifp->if_start_req = 0; + ifp->if_start_active = 0; - /* - * If there's no pending request or if the - * interface has been disabled, we're done. - */ - if (req == ifp->if_start_req || - (ifp->if_start_flags & IFSF_FLOW_CONTROLLED)) { - break; - } - } - ifp->if_start_req = 0; - ifp->if_start_active = 0; + if (__probable(ifp->if_start_thread != THREAD_NULL)) { + uint64_t deadline = TIMEOUT_WAIT_FOREVER; + struct timespec delay_start_ts; + struct timespec *ts; /* * Wakeup N ns from now if rate-controlled by TBR, and if @@ -3000,9 +3209,40 @@ ifnet_start_thread_fn(void *v, wait_result_t w) if (ts != NULL && ts->tv_sec == 0 && ts->tv_nsec == 0) { ts = NULL; } + + if (__improbable(ts != NULL)) { + clock_interval_to_deadline((ts->tv_nsec + + (ts->tv_sec * NSEC_PER_SEC)), 1, &deadline); + } + + (void) assert_wait_deadline(&ifp->if_start_thread, + THREAD_UNINT, deadline); + lck_mtx_unlock(&ifp->if_start_lock); + (void) thread_block_parameter(ifnet_start_thread_cont, ifp); + /* NOTREACHED */ + } else { +terminate: + /* interface is detached? */ + ifnet_set_start_cycle(ifp, NULL); + lck_mtx_unlock(&ifp->if_start_lock); + ifnet_purge(ifp); + + if (dlil_verbose) { + DLIL_PRINTF("%s: starter thread terminated\n", + if_name(ifp)); + } + + /* for the extra refcnt from kernel_thread_start() */ + thread_deallocate(current_thread()); + /* this is the end */ + thread_terminate(current_thread()); + /* NOTREACHED */ } + /* must never get here */ + VERIFY(0); /* NOTREACHED */ + __builtin_unreachable(); } void @@ -3015,12 +3255,12 @@ ifnet_set_start_cycle(struct ifnet *ifp, struct timespec *ts) } if (ts != NULL && ts->tv_nsec != 0 && dlil_verbose) { - printf("%s: restart interval set to %lu nsec\n", + DLIL_PRINTF("%s: restart interval set to %lu nsec\n", if_name(ifp), ts->tv_nsec); } } -static void +void ifnet_poll(struct ifnet *ifp) { /* @@ -3028,134 +3268,149 @@ ifnet_poll(struct ifnet *ifp) */ lck_mtx_lock_spin(&ifp->if_poll_lock); ifp->if_poll_req++; - if (!ifp->if_poll_active && ifp->if_poll_thread != THREAD_NULL) { + if (!(ifp->if_poll_flags & IF_POLLF_RUNNING) && + ifp->if_poll_thread != THREAD_NULL) { wakeup_one((caddr_t)&ifp->if_poll_thread); } lck_mtx_unlock(&ifp->if_poll_lock); } +__attribute__((noreturn)) static void -ifnet_poll_thread_fn(void *v, wait_result_t w) +ifnet_poll_thread_func(void *v, wait_result_t w) { #pragma unused(w) + char thread_name[MAXTHREADNAMESIZE]; + struct ifnet *ifp = v; + + VERIFY(ifp->if_eflags & IFEF_RXPOLL); + VERIFY(current_thread() == ifp->if_poll_thread); + + /* construct the name for this thread, and then apply it */ + bzero(thread_name, sizeof(thread_name)); + (void) snprintf(thread_name, sizeof(thread_name), + "ifnet_poller_%s", ifp->if_xname); + thread_set_thread_name(ifp->if_poll_thread, thread_name); + ifnet_decr_pending_thread_count(ifp); + + lck_mtx_lock(&ifp->if_poll_lock); + (void) assert_wait(&ifp->if_poll_thread, THREAD_UNINT); + lck_mtx_unlock(&ifp->if_poll_lock); + (void) thread_block_parameter(ifnet_poll_thread_cont, ifp); + /* NOTREACHED */ + __builtin_unreachable(); +} + +__attribute__((noreturn)) +static void +ifnet_poll_thread_cont(void *v, wait_result_t wres) +{ struct dlil_threading_info *inp; struct ifnet *ifp = v; - char ifname[IFNAMSIZ + 1]; - struct timespec *ts = NULL; struct ifnet_stat_increment_param s; + struct timespec start_time; + + VERIFY(ifp->if_eflags & IFEF_RXPOLL); - snprintf(ifname, sizeof(ifname), "%s_poller", - if_name(ifp)); bzero(&s, sizeof(s)); + net_timerclear(&start_time); lck_mtx_lock_spin(&ifp->if_poll_lock); + if (__improbable(wres == THREAD_INTERRUPTED || + ifp->if_poll_thread == THREAD_NULL)) { + goto terminate; + } inp = ifp->if_inp; VERIFY(inp != NULL); + ifp->if_poll_flags |= IF_POLLF_RUNNING; + + /* + * Keep on servicing until no more request. + */ for (;;) { - if (ifp->if_poll_thread != THREAD_NULL) { - (void) msleep(&ifp->if_poll_thread, &ifp->if_poll_lock, - (PZERO - 1) | PSPIN, ifname, ts); - } + struct mbuf *m_head, *m_tail; + u_int32_t m_lim, m_cnt, m_totlen; + u_int16_t req = ifp->if_poll_req; - /* interface is detached (maybe while asleep)? */ - if (ifp->if_poll_thread == THREAD_NULL) { - ifnet_set_poll_cycle(ifp, NULL); - lck_mtx_unlock(&ifp->if_poll_lock); + m_lim = (ifp->if_rxpoll_plim != 0) ? ifp->if_rxpoll_plim : + MAX((qlimit(&inp->rcvq_pkts)), (ifp->if_rxpoll_phiwat << 2)); + lck_mtx_unlock(&ifp->if_poll_lock); - if (dlil_verbose) { - printf("%s: poller thread terminated\n", - if_name(ifp)); - } + /* + * If no longer attached, there's nothing to do; + * else hold an IO refcnt to prevent the interface + * from being detached (will be released below.) + */ + if (!ifnet_is_attached(ifp, 1)) { + lck_mtx_lock_spin(&ifp->if_poll_lock); + break; + } - /* for the extra refcnt from kernel_thread_start() */ - thread_deallocate(current_thread()); - /* this is the end */ - thread_terminate(current_thread()); - /* NOTREACHED */ - return; + if (dlil_verbose > 1) { + DLIL_PRINTF("%s: polling up to %d pkts, " + "pkts avg %d max %d, wreq avg %d, " + "bytes avg %d\n", + if_name(ifp), m_lim, + ifp->if_rxpoll_pavg, ifp->if_rxpoll_pmax, + ifp->if_rxpoll_wavg, ifp->if_rxpoll_bavg); } - ifp->if_poll_active = 1; - for (;;) { - struct mbuf *m_head, *m_tail; - u_int32_t m_lim, m_cnt, m_totlen; - u_int16_t req = ifp->if_poll_req; + /* invoke the driver's input poll routine */ + ((*ifp->if_input_poll)(ifp, 0, m_lim, &m_head, &m_tail, + &m_cnt, &m_totlen)); - lck_mtx_unlock(&ifp->if_poll_lock); + if (m_head != NULL) { + VERIFY(m_tail != NULL && m_cnt > 0); - /* - * If no longer attached, there's nothing to do; - * else hold an IO refcnt to prevent the interface - * from being detached (will be released below.) - */ - if (!ifnet_is_attached(ifp, 1)) { - lck_mtx_lock_spin(&ifp->if_poll_lock); - break; + if (dlil_verbose > 1) { + DLIL_PRINTF("%s: polled %d pkts, " + "pkts avg %d max %d, wreq avg %d, " + "bytes avg %d\n", + if_name(ifp), m_cnt, + ifp->if_rxpoll_pavg, ifp->if_rxpoll_pmax, + ifp->if_rxpoll_wavg, ifp->if_rxpoll_bavg); } - m_lim = (inp->rxpoll_plim != 0) ? inp->rxpoll_plim : - MAX((qlimit(&inp->rcvq_pkts)), - (inp->rxpoll_phiwat << 2)); + /* stats are required for extended variant */ + s.packets_in = m_cnt; + s.bytes_in = m_totlen; + (void) ifnet_input_common(ifp, m_head, m_tail, + &s, TRUE, TRUE); + } else { if (dlil_verbose > 1) { - printf("%s: polling up to %d pkts, " + DLIL_PRINTF("%s: no packets, " "pkts avg %d max %d, wreq avg %d, " "bytes avg %d\n", - if_name(ifp), m_lim, - inp->rxpoll_pavg, inp->rxpoll_pmax, - inp->rxpoll_wavg, inp->rxpoll_bavg); + if_name(ifp), ifp->if_rxpoll_pavg, + ifp->if_rxpoll_pmax, ifp->if_rxpoll_wavg, + ifp->if_rxpoll_bavg); } - /* invoke the driver's input poll routine */ - ((*ifp->if_input_poll)(ifp, 0, m_lim, &m_head, &m_tail, - &m_cnt, &m_totlen)); - - if (m_head != NULL) { - VERIFY(m_tail != NULL && m_cnt > 0); - - if (dlil_verbose > 1) { - printf("%s: polled %d pkts, " - "pkts avg %d max %d, wreq avg %d, " - "bytes avg %d\n", - if_name(ifp), m_cnt, - inp->rxpoll_pavg, inp->rxpoll_pmax, - inp->rxpoll_wavg, inp->rxpoll_bavg); - } - - /* stats are required for extended variant */ - s.packets_in = m_cnt; - s.bytes_in = m_totlen; + (void) ifnet_input_common(ifp, NULL, NULL, + NULL, FALSE, TRUE); + } - (void) ifnet_input_common(ifp, m_head, m_tail, - &s, TRUE, TRUE); - } else { - if (dlil_verbose > 1) { - printf("%s: no packets, " - "pkts avg %d max %d, wreq avg %d, " - "bytes avg %d\n", - if_name(ifp), inp->rxpoll_pavg, - inp->rxpoll_pmax, inp->rxpoll_wavg, - inp->rxpoll_bavg); - } + /* Release the io ref count */ + ifnet_decr_iorefcnt(ifp); - (void) ifnet_input_common(ifp, NULL, NULL, - NULL, FALSE, TRUE); - } + lck_mtx_lock_spin(&ifp->if_poll_lock); - /* Release the io ref count */ - ifnet_decr_iorefcnt(ifp); + /* if there's no pending request, we're done */ + if (req == ifp->if_poll_req || + ifp->if_poll_thread == THREAD_NULL) { + break; + } + } - lck_mtx_lock_spin(&ifp->if_poll_lock); + ifp->if_poll_req = 0; + ifp->if_poll_flags &= ~IF_POLLF_RUNNING; - /* if there's no pending request, we're done */ - if (req == ifp->if_poll_req) { - break; - } - } - ifp->if_poll_req = 0; - ifp->if_poll_active = 0; + if (ifp->if_poll_thread != THREAD_NULL) { + uint64_t deadline = TIMEOUT_WAIT_FOREVER; + struct timespec *ts; /* * Wakeup N ns from now, else sleep indefinitely (ts = NULL) @@ -3165,9 +3420,39 @@ ifnet_poll_thread_fn(void *v, wait_result_t w) if (ts->tv_sec == 0 && ts->tv_nsec == 0) { ts = NULL; } + + if (ts != NULL) { + clock_interval_to_deadline((ts->tv_nsec + + (ts->tv_sec * NSEC_PER_SEC)), 1, &deadline); + } + + (void) assert_wait_deadline(&ifp->if_poll_thread, + THREAD_UNINT, deadline); + lck_mtx_unlock(&ifp->if_poll_lock); + (void) thread_block_parameter(ifnet_poll_thread_cont, ifp); + /* NOTREACHED */ + } else { +terminate: + /* interface is detached (maybe while asleep)? */ + ifnet_set_poll_cycle(ifp, NULL); + lck_mtx_unlock(&ifp->if_poll_lock); + + if (dlil_verbose) { + DLIL_PRINTF("%s: poller thread terminated\n", + if_name(ifp)); + } + + /* for the extra refcnt from kernel_thread_start() */ + thread_deallocate(current_thread()); + /* this is the end */ + thread_terminate(current_thread()); + /* NOTREACHED */ } + /* must never get here */ + VERIFY(0); /* NOTREACHED */ + __builtin_unreachable(); } void @@ -3180,7 +3465,7 @@ ifnet_set_poll_cycle(struct ifnet *ifp, struct timespec *ts) } if (ts != NULL && ts->tv_nsec != 0 && dlil_verbose) { - printf("%s: poll interval set to %lu nsec\n", + DLIL_PRINTF("%s: poll interval set to %lu nsec\n", if_name(ifp), ts->tv_nsec); } } @@ -3203,8 +3488,10 @@ ifnet_update_sndq(struct ifclassq *ifq, cqev_t ev) } if (IFCQ_TBR_IS_ENABLED(ifq)) { - struct tb_profile tb = { ifq->ifcq_tbr.tbr_rate_raw, - ifq->ifcq_tbr.tbr_percent, 0 }; + struct tb_profile tb = { + .rate = ifq->ifcq_tbr.tbr_rate_raw, + .percent = ifq->ifcq_tbr.tbr_percent, .depth = 0 + }; (void) ifclassq_tbr_set(ifq, &tb, FALSE); } @@ -3375,16 +3662,78 @@ ifnet_enqueue_multi_setup(struct ifnet *ifp, uint16_t delay_qlen, } } +/* + * This function clears the DSCP bits in the IPV4/V6 header pointed to by buf. + * While it's ok for buf to be not 32 bit aligned, the caller must ensure that + * buf holds the full header. + */ +static __attribute__((noinline)) void +ifnet_mcast_clear_dscp(uint8_t *buf, uint8_t ip_ver) +{ + struct ip *ip; + struct ip6_hdr *ip6; + uint8_t lbuf[64] __attribute__((aligned(8))); + uint8_t *p = buf; + + if (ip_ver == IPVERSION) { + uint8_t old_tos; + uint32_t sum; + + if (__improbable(!IP_HDR_ALIGNED_P(p))) { + DTRACE_IP1(not__aligned__v4, uint8_t *, buf); + bcopy(buf, lbuf, sizeof(struct ip)); + p = lbuf; + } + ip = (struct ip *)(void *)p; + if (__probable((ip->ip_tos & ~IPTOS_ECN_MASK) == 0)) { + return; + } + + DTRACE_IP1(clear__v4, struct ip *, ip); + old_tos = ip->ip_tos; + ip->ip_tos &= IPTOS_ECN_MASK; + sum = ip->ip_sum + htons(old_tos) - htons(ip->ip_tos); + sum = (sum >> 16) + (sum & 0xffff); + ip->ip_sum = (uint16_t)(sum & 0xffff); + + if (__improbable(p == lbuf)) { + bcopy(lbuf, buf, sizeof(struct ip)); + } + } else { + uint32_t flow; + ASSERT(ip_ver == IPV6_VERSION); + + if (__improbable(!IP_HDR_ALIGNED_P(p))) { + DTRACE_IP1(not__aligned__v6, uint8_t *, buf); + bcopy(buf, lbuf, sizeof(struct ip6_hdr)); + p = lbuf; + } + ip6 = (struct ip6_hdr *)(void *)p; + flow = ntohl(ip6->ip6_flow); + if (__probable((flow & IP6FLOW_DSCP_MASK) == 0)) { + return; + } + + DTRACE_IP1(clear__v6, struct ip6_hdr *, ip6); + ip6->ip6_flow = htonl(flow & ~IP6FLOW_DSCP_MASK); + + if (__improbable(p == lbuf)) { + bcopy(lbuf, buf, sizeof(struct ip6_hdr)); + } + } +} + static inline errno_t -ifnet_enqueue_common(struct ifnet *ifp, void *p, classq_pkt_type_t ptype, - boolean_t flush, boolean_t *pdrop) +ifnet_enqueue_ifclassq(struct ifnet *ifp, classq_pkt_t *p, boolean_t flush, + boolean_t *pdrop) { volatile uint64_t *fg_ts = NULL; volatile uint64_t *rt_ts = NULL; - struct mbuf *m = p; struct timespec now; u_int64_t now_nsec = 0; int error = 0; + uint8_t *mcast_buf = NULL; + uint8_t ip_ver; ASSERT(ifp->if_eflags & IFEF_TXSTART); @@ -3394,44 +3743,110 @@ ifnet_enqueue_common(struct ifnet *ifp, void *p, classq_pkt_type_t ptype, * PKTF_TS_VALID is always cleared prior to entering classq, i.e. * the timestamp value is used internally there. */ - switch (ptype) { + switch (p->cp_ptype) { case QP_MBUF: - ASSERT(m->m_flags & M_PKTHDR); - ASSERT(m->m_nextpkt == NULL); + ASSERT(p->cp_mbuf->m_flags & M_PKTHDR); + ASSERT(p->cp_mbuf->m_nextpkt == NULL); - if (!(m->m_pkthdr.pkt_flags & PKTF_TS_VALID) || - m->m_pkthdr.pkt_timestamp == 0) { + if (!(p->cp_mbuf->m_pkthdr.pkt_flags & PKTF_TS_VALID) || + p->cp_mbuf->m_pkthdr.pkt_timestamp == 0) { nanouptime(&now); net_timernsec(&now, &now_nsec); - m->m_pkthdr.pkt_timestamp = now_nsec; + p->cp_mbuf->m_pkthdr.pkt_timestamp = now_nsec; } - m->m_pkthdr.pkt_flags &= ~PKTF_TS_VALID; + p->cp_mbuf->m_pkthdr.pkt_flags &= ~PKTF_TS_VALID; /* * If the packet service class is not background, * update the timestamp to indicate recent activity * on a foreground socket. */ - if ((m->m_pkthdr.pkt_flags & PKTF_FLOW_ID) && - m->m_pkthdr.pkt_flowsrc == FLOWSRC_INPCB) { - if (!(m->m_pkthdr.pkt_flags & PKTF_SO_BACKGROUND)) { + if ((p->cp_mbuf->m_pkthdr.pkt_flags & PKTF_FLOW_ID) && + p->cp_mbuf->m_pkthdr.pkt_flowsrc == FLOWSRC_INPCB) { + if (!(p->cp_mbuf->m_pkthdr.pkt_flags & + PKTF_SO_BACKGROUND)) { ifp->if_fg_sendts = _net_uptime; if (fg_ts != NULL) { *fg_ts = _net_uptime; } } - if (m->m_pkthdr.pkt_flags & PKTF_SO_REALTIME) { + if (p->cp_mbuf->m_pkthdr.pkt_flags & PKTF_SO_REALTIME) { ifp->if_rt_sendts = _net_uptime; if (rt_ts != NULL) { *rt_ts = _net_uptime; } } } + + /* + * Some Wi-Fi AP implementations do not correctly handle + * multicast IP packets with DSCP bits set (radr://9331522). + * As a workaround we clear the DSCP bits and set the service + * class to BE. + */ + if ((p->cp_mbuf->m_flags & M_MCAST) != 0 && + IFNET_IS_WIFI_INFRA(ifp)) { + size_t len = mbuf_len(p->cp_mbuf), hlen; + struct ether_header *eh; + boolean_t pullup = FALSE; + uint16_t etype; + + if (__improbable(len < sizeof(struct ether_header))) { + DTRACE_IP1(small__ether, size_t, len); + if ((p->cp_mbuf = m_pullup(p->cp_mbuf, + sizeof(struct ether_header))) == NULL) { + return ENOMEM; + } + } + eh = (struct ether_header *)mbuf_data(p->cp_mbuf); + etype = ntohs(eh->ether_type); + if (etype == ETHERTYPE_IP) { + hlen = sizeof(struct ether_header) + + sizeof(struct ip); + if (len < hlen) { + DTRACE_IP1(small__v4, size_t, len); + pullup = TRUE; + } + ip_ver = IPVERSION; + } else if (etype == ETHERTYPE_IPV6) { + hlen = sizeof(struct ether_header) + + sizeof(struct ip6_hdr); + if (len < hlen) { + DTRACE_IP1(small__v6, size_t, len); + pullup = TRUE; + } + ip_ver = IPV6_VERSION; + } else { + DTRACE_IP1(invalid__etype, uint16_t, etype); + break; + } + if (pullup) { + if ((p->cp_mbuf = m_pullup(p->cp_mbuf, hlen)) == + NULL) { + return ENOMEM; + } + + eh = (struct ether_header *)mbuf_data( + p->cp_mbuf); + } + mbuf_set_service_class(p->cp_mbuf, MBUF_SC_BE); + mcast_buf = (uint8_t *)(eh + 1); + /* + * ifnet_mcast_clear_dscp() will finish the work below. + * Note that the pullups above ensure that mcast_buf + * points to a full IP header. + */ + } break; default: VERIFY(0); /* NOTREACHED */ + __builtin_unreachable(); + } + + if (mcast_buf != NULL) { + ifnet_mcast_clear_dscp(mcast_buf, ip_ver); } if (ifp->if_eflags & IFEF_ENQUEUE_MULTI) { @@ -3474,7 +3889,8 @@ ifnet_enqueue_common(struct ifnet *ifp, void *p, classq_pkt_type_t ptype, ifp->if_start_delay_idle = 0; } else { if (ifp->if_start_delay_idle >= 10) { - ifp->if_eflags &= ~(IFEF_DELAY_START); + ifp->if_eflags &= + ~(IFEF_DELAY_START); ifnet_delay_start_disabled++; } else { ifp->if_start_delay_idle++; @@ -3493,17 +3909,8 @@ ifnet_enqueue_common(struct ifnet *ifp, void *p, classq_pkt_type_t ptype, ifp->if_eflags &= ~(IFEF_DELAY_START); } - switch (ptype) { - case QP_MBUF: - /* enqueue the packet (caller consumes object) */ - error = ifclassq_enqueue(&ifp->if_snd, m, QP_MBUF, pdrop); - m = NULL; - break; - - - default: - break; - } + /* enqueue the packet (caller consumes object) */ + error = ifclassq_enqueue(&ifp->if_snd, p, pdrop); /* * Tell the driver to start dequeueing; do this even when the queue @@ -3515,7 +3922,36 @@ ifnet_enqueue_common(struct ifnet *ifp, void *p, classq_pkt_type_t ptype, ifnet_start(ifp); } - return error; + return error; +} + +int +ifnet_enqueue_netem(void *handle, pktsched_pkt_t *pkts, uint32_t n_pkts) +{ + struct ifnet *ifp = handle; + boolean_t pdrop; /* dummy */ + uint32_t i; + + ASSERT(n_pkts >= 1); + for (i = 0; i < n_pkts - 1; i++) { + (void) ifnet_enqueue_ifclassq(ifp, &pkts[i].pktsched_pkt, + FALSE, &pdrop); + } + /* flush with the last packet */ + (void) ifnet_enqueue_ifclassq(ifp, &pkts[i].pktsched_pkt, TRUE, &pdrop); + + return 0; +} + +static inline errno_t +ifnet_enqueue_common(struct ifnet *ifp, classq_pkt_t *pkt, boolean_t flush, + boolean_t *pdrop) +{ + if (ifp->if_output_netem != NULL) { + return netem_enqueue(ifp->if_output_netem, pkt, pdrop); + } else { + return ifnet_enqueue_ifclassq(ifp, pkt, flush, pdrop); + } } errno_t @@ -3529,6 +3965,8 @@ errno_t ifnet_enqueue_mbuf(struct ifnet *ifp, struct mbuf *m, boolean_t flush, boolean_t *pdrop) { + classq_pkt_t pkt; + if (ifp == NULL || m == NULL || !(m->m_flags & M_PKTHDR) || m->m_nextpkt != NULL) { if (m != NULL) { @@ -3548,7 +3986,8 @@ ifnet_enqueue_mbuf(struct ifnet *ifp, struct mbuf *m, boolean_t flush, return ENETDOWN; } - return ifnet_enqueue_common(ifp, m, QP_MBUF, flush, pdrop); + CLASSQ_PKT_INIT_MBUF(&pkt, m); + return ifnet_enqueue_common(ifp, &pkt, flush, pdrop); } @@ -3556,7 +3995,8 @@ errno_t ifnet_dequeue(struct ifnet *ifp, struct mbuf **mp) { errno_t rc; - classq_pkt_type_t ptype; + classq_pkt_t pkt = CLASSQ_PKT_INITIALIZER(pkt); + if (ifp == NULL || mp == NULL) { return EINVAL; } else if (!(ifp->if_eflags & IFEF_TXSTART) || @@ -3568,10 +4008,10 @@ ifnet_dequeue(struct ifnet *ifp, struct mbuf **mp) } rc = ifclassq_dequeue(&ifp->if_snd, 1, CLASSQ_DEQUEUE_MAX_BYTE_LIMIT, - (void **)mp, NULL, NULL, NULL, &ptype); - VERIFY((*mp == NULL) || (ptype == QP_MBUF)); + &pkt, NULL, NULL, NULL); + VERIFY((pkt.cp_ptype == QP_MBUF) || (pkt.cp_mbuf == NULL)); ifnet_decr_iorefcnt(ifp); - + *mp = pkt.cp_mbuf; return rc; } @@ -3580,7 +4020,8 @@ ifnet_dequeue_service_class(struct ifnet *ifp, mbuf_svc_class_t sc, struct mbuf **mp) { errno_t rc; - classq_pkt_type_t ptype; + classq_pkt_t pkt = CLASSQ_PKT_INITIALIZER(pkt); + if (ifp == NULL || mp == NULL || !MBUF_VALID_SC(sc)) { return EINVAL; } else if (!(ifp->if_eflags & IFEF_TXSTART) || @@ -3592,10 +4033,10 @@ ifnet_dequeue_service_class(struct ifnet *ifp, mbuf_svc_class_t sc, } rc = ifclassq_dequeue_sc(&ifp->if_snd, sc, 1, - CLASSQ_DEQUEUE_MAX_BYTE_LIMIT, (void **)mp, NULL, NULL, - NULL, &ptype); - VERIFY((*mp == NULL) || (ptype == QP_MBUF)); + CLASSQ_DEQUEUE_MAX_BYTE_LIMIT, &pkt, NULL, NULL, NULL); + VERIFY((pkt.cp_ptype == QP_MBUF) || (pkt.cp_mbuf == NULL)); ifnet_decr_iorefcnt(ifp); + *mp = pkt.cp_mbuf; return rc; } @@ -3604,7 +4045,9 @@ ifnet_dequeue_multi(struct ifnet *ifp, u_int32_t pkt_limit, struct mbuf **head, struct mbuf **tail, u_int32_t *cnt, u_int32_t *len) { errno_t rc; - classq_pkt_type_t ptype; + classq_pkt_t pkt_head = CLASSQ_PKT_INITIALIZER(pkt_head); + classq_pkt_t pkt_tail = CLASSQ_PKT_INITIALIZER(pkt_tail); + if (ifp == NULL || head == NULL || pkt_limit < 1) { return EINVAL; } else if (!(ifp->if_eflags & IFEF_TXSTART) || @@ -3616,10 +4059,13 @@ ifnet_dequeue_multi(struct ifnet *ifp, u_int32_t pkt_limit, } rc = ifclassq_dequeue(&ifp->if_snd, pkt_limit, - CLASSQ_DEQUEUE_MAX_BYTE_LIMIT, (void **)head, (void **)tail, cnt, - len, &ptype); - VERIFY((*head == NULL) || (ptype == QP_MBUF)); + CLASSQ_DEQUEUE_MAX_BYTE_LIMIT, &pkt_head, &pkt_tail, cnt, len); + VERIFY((pkt_head.cp_ptype == QP_MBUF) || (pkt_head.cp_mbuf == NULL)); ifnet_decr_iorefcnt(ifp); + *head = pkt_head.cp_mbuf; + if (tail != NULL) { + *tail = pkt_tail.cp_mbuf; + } return rc; } @@ -3628,7 +4074,9 @@ ifnet_dequeue_multi_bytes(struct ifnet *ifp, u_int32_t byte_limit, struct mbuf **head, struct mbuf **tail, u_int32_t *cnt, u_int32_t *len) { errno_t rc; - classq_pkt_type_t ptype; + classq_pkt_t pkt_head = CLASSQ_PKT_INITIALIZER(pkt_head); + classq_pkt_t pkt_tail = CLASSQ_PKT_INITIALIZER(pkt_tail); + if (ifp == NULL || head == NULL || byte_limit < 1) { return EINVAL; } else if (!(ifp->if_eflags & IFEF_TXSTART) || @@ -3640,9 +4088,13 @@ ifnet_dequeue_multi_bytes(struct ifnet *ifp, u_int32_t byte_limit, } rc = ifclassq_dequeue(&ifp->if_snd, CLASSQ_DEQUEUE_MAX_PKT_LIMIT, - byte_limit, (void **)head, (void **)tail, cnt, len, &ptype); - VERIFY((*head == NULL) || (ptype == QP_MBUF)); + byte_limit, &pkt_head, &pkt_tail, cnt, len); + VERIFY((pkt_head.cp_ptype == QP_MBUF) || (pkt_head.cp_mbuf == NULL)); ifnet_decr_iorefcnt(ifp); + *head = pkt_head.cp_mbuf; + if (tail != NULL) { + *tail = pkt_tail.cp_mbuf; + } return rc; } @@ -3652,7 +4104,9 @@ ifnet_dequeue_service_class_multi(struct ifnet *ifp, mbuf_svc_class_t sc, u_int32_t *len) { errno_t rc; - classq_pkt_type_t ptype; + classq_pkt_t pkt_head = CLASSQ_PKT_INITIALIZER(pkt_head); + classq_pkt_t pkt_tail = CLASSQ_PKT_INITIALIZER(pkt_tail); + if (ifp == NULL || head == NULL || pkt_limit < 1 || !MBUF_VALID_SC(sc)) { return EINVAL; @@ -3665,10 +4119,14 @@ ifnet_dequeue_service_class_multi(struct ifnet *ifp, mbuf_svc_class_t sc, } rc = ifclassq_dequeue_sc(&ifp->if_snd, sc, pkt_limit, - CLASSQ_DEQUEUE_MAX_BYTE_LIMIT, (void **)head, - (void **)tail, cnt, len, &ptype); - VERIFY((*head == NULL) || (ptype == QP_MBUF)); + CLASSQ_DEQUEUE_MAX_BYTE_LIMIT, &pkt_head, &pkt_tail, + cnt, len); + VERIFY((pkt_head.cp_ptype == QP_MBUF) || (pkt_head.cp_mbuf == NULL)); ifnet_decr_iorefcnt(ifp); + *head = pkt_head.cp_mbuf; + if (tail != NULL) { + *tail = pkt_tail.cp_mbuf; + } return rc; } @@ -3689,11 +4147,30 @@ ifnet_framer_stub(struct ifnet *ifp, struct mbuf **m, } #endif /* !CONFIG_EMBEDDED */ +static boolean_t +packet_has_vlan_tag(struct mbuf * m) +{ + u_int tag = 0; + + if ((m->m_pkthdr.csum_flags & CSUM_VLAN_TAG_VALID) != 0) { + tag = EVL_VLANOFTAG(m->m_pkthdr.vlan_tag); + if (tag == 0) { + /* the packet is just priority-tagged, clear the bit */ + m->m_pkthdr.csum_flags &= ~CSUM_VLAN_TAG_VALID; + } + } + return tag != 0; +} + static int dlil_interface_filters_input(struct ifnet *ifp, struct mbuf **m_p, char **frame_header_p, protocol_family_t protocol_family) { - struct ifnet_filter *filter; + boolean_t is_vlan_packet = FALSE; + struct ifnet_filter *filter; + struct mbuf *m = *m_p; + + is_vlan_packet = packet_has_vlan_tag(m); /* * Pass the inbound packet to the interface filters @@ -3704,6 +4181,12 @@ dlil_interface_filters_input(struct ifnet *ifp, struct mbuf **m_p, TAILQ_FOREACH(filter, &ifp->if_flt_head, filt_next) { int result; + /* exclude VLAN packets from external filters PR-3586856 */ + if (is_vlan_packet && + (filter->filt_flags & DLIL_IFF_INTERNAL) == 0) { + continue; + } + if (!filter->filt_skip && filter->filt_input != NULL && (filter->filt_protocol == 0 || filter->filt_protocol == protocol_family)) { @@ -3740,7 +4223,11 @@ static int dlil_interface_filters_output(struct ifnet *ifp, struct mbuf **m_p, protocol_family_t protocol_family) { - struct ifnet_filter *filter; + boolean_t is_vlan_packet; + struct ifnet_filter *filter; + struct mbuf *m = *m_p; + + is_vlan_packet = packet_has_vlan_tag(m); /* * Pass the outbound packet to the interface filters @@ -3751,6 +4238,12 @@ dlil_interface_filters_output(struct ifnet *ifp, struct mbuf **m_p, TAILQ_FOREACH(filter, &ifp->if_flt_head, filt_next) { int result; + /* exclude VLAN packets from external filters PR-3586856 */ + if (is_vlan_packet && + (filter->filt_flags & DLIL_IFF_INTERNAL) == 0) { + continue; + } + if (!filter->filt_skip && filter->filt_output != NULL && (filter->filt_protocol == 0 || filter->filt_protocol == protocol_family)) { @@ -3809,7 +4302,7 @@ dlil_ifproto_input(struct if_proto * ifproto, mbuf_t m) static void dlil_input_stats_add(const struct ifnet_stat_increment_param *s, - struct dlil_threading_info *inp, boolean_t poll) + struct dlil_threading_info *inp, struct ifnet *ifp, boolean_t poll) { struct ifnet_stat_increment_param *d = &inp->stats; @@ -3841,11 +4334,11 @@ dlil_input_stats_add(const struct ifnet_stat_increment_param *s, } if (poll) { - PKTCNTR_ADD(&inp->tstats, s->packets_in, s->bytes_in); + PKTCNTR_ADD(&ifp->if_poll_tstats, s->packets_in, s->bytes_in); } } -static void +static boolean_t dlil_input_stats_sync(struct ifnet *ifp, struct dlil_threading_info *inp) { struct ifnet_stat_increment_param *s = &inp->stats; @@ -3889,23 +4382,20 @@ dlil_input_stats_sync(struct ifnet *ifp, struct dlil_threading_info *inp) s->dropped = 0; } - if (ifp->if_data_threshold != 0) { - lck_mtx_convert_spin(&inp->input_lck); - ifnet_notify_data_threshold(ifp); - } - /* * No need for atomic operations as they are modified here * only from within the DLIL input thread context. */ - if (inp->tstats.packets != 0) { - inp->pstats.ifi_poll_packets += inp->tstats.packets; - inp->tstats.packets = 0; + if (ifp->if_poll_tstats.packets != 0) { + ifp->if_poll_pstats.ifi_poll_packets += ifp->if_poll_tstats.packets; + ifp->if_poll_tstats.packets = 0; } - if (inp->tstats.bytes != 0) { - inp->pstats.ifi_poll_bytes += inp->tstats.bytes; - inp->tstats.bytes = 0; + if (ifp->if_poll_tstats.bytes != 0) { + ifp->if_poll_pstats.ifi_poll_bytes += ifp->if_poll_tstats.bytes; + ifp->if_poll_tstats.bytes = 0; } + + return ifp->if_data_threshold != 0; } __private_extern__ void @@ -3952,7 +4442,8 @@ dlil_input_packet_list_common(struct ifnet *ifp_param, struct mbuf *m, ifp = m->m_pkthdr.rcvif; } - if ((ifp->if_eflags & IFEF_RXPOLL) && poll_thresh != 0 && + if ((ifp->if_eflags & IFEF_RXPOLL) && + (ifp->if_xflags & IFXF_LEGACY) && poll_thresh != 0 && poll_ival > 0 && (--poll_thresh % poll_ival) == 0) { ifnet_poll(ifp); } @@ -3971,7 +4462,7 @@ dlil_input_packet_list_common(struct ifnet *ifp_param, struct mbuf *m, * away, so optimize for that. */ if (ifp != lo_ifp) { - if (!ifnet_is_attached(ifp, 1)) { + if (!ifnet_datamov_begin(ifp)) { m_freem(m); goto next; } @@ -4076,7 +4567,6 @@ skip_clat: dlil_input_cksum_dbg(ifp, m, frame_header, protocol_family); } - /* * For partial checksum offload, we expect the driver to * set the start offset indicating the start of the span @@ -4084,11 +4574,14 @@ skip_clat: * adjust this start offset accordingly because the data * pointer has been advanced beyond the link-layer header. * - * Don't adjust if the interface is a bridge member, as - * the adjustment will occur from the context of the - * bridge interface during input. + * Virtual lan types (bridge, vlan, bond) can call + * dlil_input_packet_list() with the same packet with the + * checksum flags set. Set a flag indicating that the + * adjustment has already been done. */ - if (ifp->if_bridge == NULL && (m->m_pkthdr.csum_flags & + if ((m->m_pkthdr.csum_flags & CSUM_ADJUST_DONE) != 0) { + /* adjustment has already been done */ + } else if ((m->m_pkthdr.csum_flags & (CSUM_DATA_VALID | CSUM_PARTIAL)) == (CSUM_DATA_VALID | CSUM_PARTIAL)) { int adj; @@ -4103,8 +4596,9 @@ skip_clat: } else { m->m_pkthdr.csum_rx_start -= adj; } + /* make sure we don't adjust more than once */ + m->m_pkthdr.csum_flags |= CSUM_ADJUST_DONE; } - if (clat_debug) { pktap_input(ifp, protocol_family, m, frame_header); } @@ -4113,18 +4607,16 @@ skip_clat: atomic_add_64(&ifp->if_imcasts, 1); } - /* run interface filters, exclude VLAN packets PR-3586856 */ - if ((m->m_pkthdr.csum_flags & CSUM_VLAN_TAG_VALID) == 0) { - error = dlil_interface_filters_input(ifp, &m, - &frame_header, protocol_family); - if (error != 0) { - if (error != EJUSTRETURN) { - m_freem(m); - } - goto next; + /* run interface filters */ + error = dlil_interface_filters_input(ifp, &m, + &frame_header, protocol_family); + if (error != 0) { + if (error != EJUSTRETURN) { + m_freem(m); } + goto next; } - if (error != 0 || ((m->m_flags & M_PROMISC) != 0)) { + if ((m->m_flags & M_PROMISC) != 0) { m_freem(m); goto next; } @@ -4187,7 +4679,7 @@ next: ifp->if_updatemcasts = 0; } if (iorefcnt == 1) { - ifnet_decr_iorefcnt(ifp); + ifnet_datamov_end(ifp); } } @@ -4203,7 +4695,7 @@ if_mcasts_update(struct ifnet *ifp) if (err == EAFNOSUPPORT) { err = 0; } - printf("%s: %s %d suspended link-layer multicast membership(s) " + DLIL_PRINTF("%s: %s %d suspended link-layer multicast membership(s) " "(err=%d)\n", if_name(ifp), (err == 0 ? "successfully restored" : "failed to restore"), ifp->if_updatemcasts, err); @@ -4492,7 +4984,9 @@ dlil_output(ifnet_t ifp, protocol_family_t proto_family, mbuf_t packetlist, u_int64_t now_nsec; boolean_t did_clat46 = FALSE; protocol_family_t old_proto_family = proto_family; + struct sockaddr_in6 dest6; struct rtentry *rt = NULL; + u_int32_t m_loop_set = 0; KERNEL_DEBUG(DBG_FNC_DLIL_OUTPUT | DBG_FUNC_START, 0, 0, 0, 0, 0); @@ -4500,7 +4994,7 @@ dlil_output(ifnet_t ifp, protocol_family_t proto_family, mbuf_t packetlist, * Get an io refcnt if the interface is attached to prevent ifnet_detach * from happening while this operation is in progress */ - if (!ifnet_is_attached(ifp, 1)) { + if (!ifnet_datamov_begin(ifp)) { retval = ENXIO; goto cleanup; } @@ -4564,7 +5058,6 @@ preout_again: * performed address family translation. */ if (!did_clat46 && proto_family == PF_INET6) { - struct sockaddr_in6 dest6; did_clat46 = TRUE; if (proto != NULL) { @@ -4700,7 +5193,7 @@ preout_again: m->m_pkthdr.rcvif = ifp; rcvif_set = 1; } - + m_loop_set = m->m_flags & M_LOOP; retval = ifp->if_framer(ifp, &m, dest, dst_linkaddr, frame_type, &pre, &post); if (retval != 0) { @@ -4743,16 +5236,12 @@ preout_again: /* * Let interface filters (if any) do their thing ... */ - /* Do not pass VLAN tagged packets to filters PR-3586856 */ - if ((m->m_pkthdr.csum_flags & CSUM_VLAN_TAG_VALID) == 0) { - retval = dlil_interface_filters_output(ifp, - &m, proto_family); - if (retval != 0) { - if (retval != EJUSTRETURN) { - m_freem(m); - } - goto next; + retval = dlil_interface_filters_output(ifp, &m, proto_family); + if (retval != 0) { + if (retval != EJUSTRETURN) { + m_freem(m); } + goto next; } /* * Strip away M_PROTO1 bit prior to sending packet @@ -4850,7 +5339,7 @@ preout_again: fpkts++; } if (retval != 0 && dlil_verbose) { - printf("%s: output error on %s retval = %d\n", + DLIL_PRINTF("%s: output error on %s retval = %d\n", __func__, if_name(ifp), retval); } @@ -4862,6 +5351,7 @@ preout_again: next: m = packetlist; if (m != NULL) { + m->m_flags |= m_loop_set; packetlist = packetlist->m_nextpkt; m->m_nextpkt = NULL; } @@ -4889,7 +5379,7 @@ next: fpkts++; } if (retval != 0 && dlil_verbose) { - printf("%s: output error on %s retval = %d\n", + DLIL_PRINTF("%s: output error on %s retval = %d\n", __func__, if_name(ifp), retval); } } else { @@ -4916,7 +5406,7 @@ next: } } if (retval != 0 && dlil_verbose) { - printf("%s: output error on %s " + DLIL_PRINTF("%s: output error on %s " "retval = %d\n", __func__, if_name(ifp), retval); } @@ -4948,7 +5438,7 @@ cleanup: retval = 0; } if (iorefcnt == 1) { - ifnet_decr_iorefcnt(ifp); + ifnet_datamov_end(ifp); } if (rt != NULL) { rtfree(rt); @@ -5593,9 +6083,10 @@ static __inline__ int _is_announcement(const struct sockaddr_in * sender_sin, const struct sockaddr_in * target_sin) { - if (sender_sin == NULL) { + if (target_sin == NULL || sender_sin == NULL) { return FALSE; } + return sender_sin->sin_addr.s_addr == target_sin->sin_addr.s_addr; } @@ -5610,8 +6101,11 @@ dlil_send_arp(ifnet_t ifp, u_short arpop, const struct sockaddr_dl *sender_hw, struct sockaddr_inarp target_proto_sinarp; struct sockaddr *target_proto = (void *)(uintptr_t)target_proto0; - if (target_proto == NULL || (sender_proto != NULL && - sender_proto->sa_family != target_proto->sa_family)) { + if (target_proto == NULL || sender_proto == NULL) { + return EINVAL; + } + + if (sender_proto->sa_family != target_proto->sa_family) { return EINVAL; } @@ -5637,7 +6131,7 @@ dlil_send_arp(ifnet_t ifp, u_short arpop, const struct sockaddr_dl *sender_hw, if (target_proto->sa_family == AF_INET && IN_LINKLOCAL(ntohl(target_sin->sin_addr.s_addr)) && ipv4_ll_arp_aware != 0 && arpop == ARPOP_REQUEST && - !_is_announcement(target_sin, sender_sin)) { + !_is_announcement(sender_sin, target_sin)) { ifnet_t *ifp_list; u_int32_t count; u_int32_t ifp_on; @@ -5749,10 +6243,30 @@ ifnet_is_attached(struct ifnet *ifp, int refio) return ret; } +void +ifnet_incr_pending_thread_count(struct ifnet *ifp) +{ + lck_mtx_lock_spin(&ifp->if_ref_lock); + ifp->if_threads_pending++; + lck_mtx_unlock(&ifp->if_ref_lock); +} + +void +ifnet_decr_pending_thread_count(struct ifnet *ifp) +{ + lck_mtx_lock_spin(&ifp->if_ref_lock); + VERIFY(ifp->if_threads_pending > 0); + ifp->if_threads_pending--; + if (ifp->if_threads_pending == 0) { + wakeup(&ifp->if_threads_pending); + } + lck_mtx_unlock(&ifp->if_ref_lock); +} + /* * Caller must ensure the interface is attached; the assumption is that * there is at least an outstanding IO reference count held already. - * Most callers would call ifnet_is_attached() instead. + * Most callers would call ifnet_is_{attached,data_ready}() instead. */ void ifnet_incr_iorefcnt(struct ifnet *ifp) @@ -5764,13 +6278,17 @@ ifnet_incr_iorefcnt(struct ifnet *ifp) lck_mtx_unlock(&ifp->if_ref_lock); } -void -ifnet_decr_iorefcnt(struct ifnet *ifp) +__attribute__((always_inline)) +static void +ifnet_decr_iorefcnt_locked(struct ifnet *ifp) { - lck_mtx_lock_spin(&ifp->if_ref_lock); + LCK_MTX_ASSERT(&ifp->if_ref_lock, LCK_MTX_ASSERT_OWNED); + VERIFY(ifp->if_refio > 0); VERIFY(ifp->if_refflags & (IFRF_ATTACHED | IFRF_DETACHING)); + ifp->if_refio--; + VERIFY(ifp->if_refio != 0 || ifp->if_datamov == 0); /* * if there are no more outstanding io references, wakeup the @@ -5779,7 +6297,95 @@ ifnet_decr_iorefcnt(struct ifnet *ifp) if (ifp->if_refio == 0 && (ifp->if_refflags & IFRF_DETACHING)) { wakeup(&(ifp->if_refio)); } +} + +void +ifnet_decr_iorefcnt(struct ifnet *ifp) +{ + lck_mtx_lock_spin(&ifp->if_ref_lock); + ifnet_decr_iorefcnt_locked(ifp); + lck_mtx_unlock(&ifp->if_ref_lock); +} + +boolean_t +ifnet_datamov_begin(struct ifnet *ifp) +{ + boolean_t ret; + + lck_mtx_lock_spin(&ifp->if_ref_lock); + if ((ret = IF_FULLY_ATTACHED_AND_READY(ifp))) { + ifp->if_refio++; + ifp->if_datamov++; + } + lck_mtx_unlock(&ifp->if_ref_lock); + + return ret; +} + +void +ifnet_datamov_end(struct ifnet *ifp) +{ + lck_mtx_lock_spin(&ifp->if_ref_lock); + VERIFY(ifp->if_datamov > 0); + /* + * if there's no more thread moving data, wakeup any + * drainers that's blocked waiting for this. + */ + if (--ifp->if_datamov == 0 && ifp->if_drainers > 0) { + wakeup(&(ifp->if_datamov)); + } + ifnet_decr_iorefcnt_locked(ifp); + lck_mtx_unlock(&ifp->if_ref_lock); +} + +void +ifnet_datamov_suspend(struct ifnet *ifp) +{ + lck_mtx_lock_spin(&ifp->if_ref_lock); + VERIFY(ifp->if_refflags & (IFRF_ATTACHED | IFRF_DETACHING)); + ifp->if_refio++; + if (ifp->if_suspend++ == 0) { + VERIFY(ifp->if_refflags & IFRF_READY); + ifp->if_refflags &= ~IFRF_READY; + } + lck_mtx_unlock(&ifp->if_ref_lock); +} + +void +ifnet_datamov_drain(struct ifnet *ifp) +{ + lck_mtx_lock(&ifp->if_ref_lock); + VERIFY(ifp->if_refflags & (IFRF_ATTACHED | IFRF_DETACHING)); + /* data movement must already be suspended */ + VERIFY(ifp->if_suspend > 0); + VERIFY(!(ifp->if_refflags & IFRF_READY)); + ifp->if_drainers++; + while (ifp->if_datamov != 0) { + (void) msleep(&(ifp->if_datamov), &ifp->if_ref_lock, + (PZERO - 1), __func__, NULL); + } + VERIFY(!(ifp->if_refflags & IFRF_READY)); + VERIFY(ifp->if_drainers > 0); + ifp->if_drainers--; + lck_mtx_unlock(&ifp->if_ref_lock); + + /* purge the interface queues */ + if ((ifp->if_eflags & IFEF_TXSTART) != 0) { + if_qflush(ifp, 0); + } +} +void +ifnet_datamov_resume(struct ifnet *ifp) +{ + lck_mtx_lock(&ifp->if_ref_lock); + /* data movement must already be suspended */ + VERIFY(ifp->if_suspend > 0); + if (--ifp->if_suspend == 0) { + VERIFY(!(ifp->if_refflags & IFRF_READY)); + ifp->if_refflags |= IFRF_READY; + } + ifnet_decr_iorefcnt_locked(ifp); lck_mtx_unlock(&ifp->if_ref_lock); } @@ -5976,13 +6582,13 @@ ifnet_attach_protocol(ifnet_t ifp, protocol_family_t protocol, &proto_count); end: - if (retval != 0 && retval != EEXIST && ifp != NULL) { + if (retval != 0 && retval != EEXIST) { DLIL_PRINTF("%s: failed to attach v1 protocol %d (err=%d)\n", - if_name(ifp), protocol, retval); + ifp != NULL ? if_name(ifp) : "N/A", protocol, retval); } else { if (dlil_verbose) { - printf("%s: attached v1 protocol %d (count = %d)\n", - if_name(ifp), + DLIL_PRINTF("%s: attached v1 protocol %d (count = %d)\n", + ifp != NULL ? if_name(ifp) : "N/A", protocol, proto_count); } } @@ -6045,13 +6651,13 @@ ifnet_attach_protocol_v2(ifnet_t ifp, protocol_family_t protocol, &proto_count); end: - if (retval != 0 && retval != EEXIST && ifp != NULL) { + if (retval != 0 && retval != EEXIST) { DLIL_PRINTF("%s: failed to attach v2 protocol %d (err=%d)\n", - if_name(ifp), protocol, retval); + ifp != NULL ? if_name(ifp) : "N/A", protocol, retval); } else { if (dlil_verbose) { - printf("%s: attached v2 protocol %d (count = %d)\n", - if_name(ifp), + DLIL_PRINTF("%s: attached v2 protocol %d (count = %d)\n", + ifp != NULL ? if_name(ifp) : "N/A", protocol, proto_count); } } @@ -6118,7 +6724,7 @@ ifnet_detach_protocol(ifnet_t ifp, protocol_family_t proto_family) ifnet_lock_done(ifp); if (dlil_verbose) { - printf("%s: detached %s protocol %d\n", if_name(ifp), + DLIL_PRINTF("%s: detached %s protocol %d\n", if_name(ifp), (proto->proto_kpi == kProtoKPI_v1) ? "v1" : "v2", proto_family); } @@ -6243,6 +6849,7 @@ ifnet_attach(ifnet_t ifp, const struct sockaddr_dl *ll_addr) /* Sanity check */ VERIFY(ifp->if_detaching_link.tqe_next == NULL); VERIFY(ifp->if_detaching_link.tqe_prev == NULL); + VERIFY(ifp->if_threads_pending == 0); if (ll_addr != NULL) { if (ifp->if_addrlen == 0) { @@ -6396,23 +7003,19 @@ ifnet_attach(ifnet_t ifp, const struct sockaddr_dl *ll_addr) VERIFY(dl_inp->wloop_thr == THREAD_NULL); VERIFY(dl_inp->poll_thr == THREAD_NULL); VERIFY(dl_inp->tag == 0); - VERIFY(dl_inp->mode == IFNET_MODEL_INPUT_POLL_OFF); - bzero(&dl_inp->tstats, sizeof(dl_inp->tstats)); - bzero(&dl_inp->pstats, sizeof(dl_inp->pstats)); - bzero(&dl_inp->sstats, sizeof(dl_inp->sstats)); + #if IFNET_INPUT_SANITY_CHK VERIFY(dl_inp->input_mbuf_cnt == 0); #endif /* IFNET_INPUT_SANITY_CHK */ + VERIFY(ifp->if_poll_thread == THREAD_NULL); + dlil_reset_rxpoll_params(ifp); /* - * A specific DLIL input thread is created per Ethernet/cellular - * interface or for an interface which supports opportunistic - * input polling. Pseudo interfaces or other types of interfaces - * use the main input thread instead. + * A specific DLIL input thread is created per non-loopback interface. */ - if ((net_rxpoll && (ifp->if_eflags & IFEF_RXPOLL)) || - ifp->if_type == IFT_ETHER || ifp->if_type == IFT_CELLULAR) { + if (ifp->if_family != IFNET_FAMILY_LOOPBACK) { ifp->if_inp = dl_inp; + ifnet_incr_pending_thread_count(ifp); err = dlil_create_input_thread(ifp, ifp->if_inp); if (err != 0) { panic_plain("%s: ifp=%p couldn't get an input thread; " @@ -6420,13 +7023,6 @@ ifnet_attach(ifnet_t ifp, const struct sockaddr_dl *ll_addr) /* NOTREACHED */ } } - - if (ifp->if_inp != NULL && ifp->if_inp->input_mit_tcall == NULL) { - ifp->if_inp->input_mit_tcall = - thread_call_allocate_with_priority(dlil_mit_tcall_fn, - ifp, THREAD_CALL_PRIORITY_KERNEL); - } - /* * If the driver supports the new transmit model, calculate flow hash * and create a workloop starter thread to invoke the if_start callback @@ -6442,7 +7038,8 @@ ifnet_attach(ifnet_t ifp, const struct sockaddr_dl *ll_addr) ifp->if_start_req = 0; ifp->if_start_flags = 0; VERIFY(ifp->if_start != NULL); - if ((err = kernel_thread_start(ifnet_start_thread_fn, + ifnet_incr_pending_thread_count(ifp); + if ((err = kernel_thread_start(ifnet_start_thread_func, ifp, &ifp->if_start_thread)) != KERN_SUCCESS) { panic_plain("%s: " "ifp=%p couldn't get a start thread; " @@ -6455,21 +7052,25 @@ ifnet_attach(ifnet_t ifp, const struct sockaddr_dl *ll_addr) ifp->if_flowhash = 0; } + /* Reset polling parameters */ + ifnet_set_poll_cycle(ifp, NULL); + ifp->if_poll_update = 0; + ifp->if_poll_flags = 0; + ifp->if_poll_req = 0; + VERIFY(ifp->if_poll_thread == THREAD_NULL); + /* * If the driver supports the new receive model, create a poller * thread to invoke if_input_poll callback where the packets may * be dequeued from the driver and processed for reception. + * if the interface is netif compat then the poller thread is managed by netif. */ - if (ifp->if_eflags & IFEF_RXPOLL) { + if (net_rxpoll && (ifp->if_eflags & IFEF_RXPOLL) && + (ifp->if_xflags & IFXF_LEGACY)) { VERIFY(ifp->if_input_poll != NULL); VERIFY(ifp->if_input_ctl != NULL); - VERIFY(ifp->if_poll_thread == THREAD_NULL); - - ifnet_set_poll_cycle(ifp, NULL); - ifp->if_poll_update = 0; - ifp->if_poll_active = 0; - ifp->if_poll_req = 0; - if ((err = kernel_thread_start(ifnet_poll_thread_fn, ifp, + ifnet_incr_pending_thread_count(ifp); + if ((err = kernel_thread_start(ifnet_poll_thread_func, ifp, &ifp->if_poll_thread)) != KERN_SUCCESS) { panic_plain("%s: ifp=%p couldn't get a poll thread; " "err=%d", __func__, ifp, err); @@ -6498,7 +7099,7 @@ ifnet_attach(ifnet_t ifp, const struct sockaddr_dl *ll_addr) IFMA_UNLOCK(ifma); } - printf("%s: attached with %d suspended link-layer multicast " + DLIL_PRINTF("%s: attached with %d suspended link-layer multicast " "membership(s)\n", if_name(ifp), ifp->if_updatemcasts); } @@ -6515,6 +7116,7 @@ ifnet_attach(ifnet_t ifp, const struct sockaddr_dl *ll_addr) VERIFY(ifp->if_delegated.family == 0); VERIFY(ifp->if_delegated.subfamily == 0); VERIFY(ifp->if_delegated.expensive == 0); + VERIFY(ifp->if_delegated.constrained == 0); VERIFY(ifp->if_agentids == NULL); VERIFY(ifp->if_agentcount == 0); @@ -6553,12 +7155,12 @@ ifnet_attach(ifnet_t ifp, const struct sockaddr_dl *ll_addr) error = if_set_qosmarking_mode(ifp, IFRTYPE_QOSMARKING_FASTLANE); if (error != 0) { - printf("%s if_set_qosmarking_mode(%s) error %d\n", + DLIL_PRINTF("%s if_set_qosmarking_mode(%s) error %d\n", __func__, ifp->if_xname, error); } else { ifp->if_eflags |= IFEF_QOSMARKING_ENABLED; #if (DEVELOPMENT || DEBUG) - printf("%s fastlane enabled on %s\n", + DLIL_PRINTF("%s fastlane enabled on %s\n", __func__, ifp->if_xname); #endif /* (DEVELOPMENT || DEBUG) */ } @@ -6614,12 +7216,28 @@ ifnet_attach(ifnet_t ifp, const struct sockaddr_dl *ll_addr) VERIFY(ifp->if_dt_tcall != NULL); /* - * Finally, mark this ifnet as attached. + * Wait for the created kernel threads for I/O to get + * scheduled and run at least once before we proceed + * to mark interface as attached. */ + lck_mtx_lock(&ifp->if_ref_lock); + while (ifp->if_threads_pending != 0) { + DLIL_PRINTF("%s: Waiting for all kernel threads created for " + "interface %s to get scheduled at least once.\n", + __func__, ifp->if_xname); + (void) msleep(&ifp->if_threads_pending, &ifp->if_ref_lock, (PZERO - 1), + __func__, NULL); + LCK_MTX_ASSERT(&ifp->if_ref_lock, LCK_ASSERT_OWNED); + } + lck_mtx_unlock(&ifp->if_ref_lock); + DLIL_PRINTF("%s: All kernel threads created for interface %s have been scheduled " + "at least once. Proceeding.\n", __func__, ifp->if_xname); + + /* Final mark this ifnet as attached. */ lck_mtx_lock(rnh_lock); ifnet_lock_exclusive(ifp); lck_mtx_lock_spin(&ifp->if_ref_lock); - ifp->if_refflags = IFRF_ATTACHED; /* clears embryonic */ + ifp->if_refflags = (IFRF_ATTACHED | IFRF_READY); /* clears embryonic */ lck_mtx_unlock(&ifp->if_ref_lock); if (net_rtref) { /* boot-args override; enable idle notification */ @@ -6644,7 +7262,7 @@ ifnet_attach(ifnet_t ifp, const struct sockaddr_dl *ll_addr) dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_IF_ATTACHED, NULL, 0); if (dlil_verbose) { - printf("%s: attached%s\n", if_name(ifp), + DLIL_PRINTF("%s: attached%s\n", if_name(ifp), (dl_if->dl_if_flags & DLIF_REUSE) ? " (recycled)" : ""); } @@ -6802,6 +7420,11 @@ ifnet_detach(ifnet_t ifp) ifnet_head_lock_exclusive(); ifnet_lock_exclusive(ifp); + if (ifp->if_output_netem != NULL) { + netem_destroy(ifp->if_output_netem); + ifp->if_output_netem = NULL; + } + /* * Check to see if this interface has previously triggered * aggressive protocol draining; if so, decrement the global @@ -6832,7 +7455,7 @@ ifnet_detach(ifnet_t ifp) lck_mtx_unlock(&ifp->if_ref_lock); if (dlil_verbose) { - printf("%s: detaching\n", if_name(ifp)); + DLIL_PRINTF("%s: detaching\n", if_name(ifp)); } /* clean up flow control entry object if there's any */ @@ -6847,6 +7470,17 @@ ifnet_detach(ifnet_t ifp) /* Reset CLAT46 flag */ ifp->if_eflags &= ~IFEF_CLAT46; + /* + * We do not reset the TCP keep alive counters in case + * a TCP connection stays connection after the interface + * went down + */ + if (ifp->if_tcp_kao_cnt > 0) { + os_log(OS_LOG_DEFAULT, "%s %s tcp_kao_cnt %u not zero", + __func__, if_name(ifp), ifp->if_tcp_kao_cnt); + } + ifp->if_tcp_kao_max = 0; + /* * Remove ifnet from the ifnet_head, ifindex2ifnet[]; it will * no longer be visible during lookups from this point. @@ -7009,6 +7643,8 @@ ifnet_detacher_thread_cont(int err) /* NOTREACHED */ } + net_update_uptime(); + VERIFY(TAILQ_FIRST(&ifnet_detaching_head) != NULL); /* Take care of detaching ifnet */ @@ -7021,10 +7657,12 @@ ifnet_detacher_thread_cont(int err) } } +__dead2 static void ifnet_detacher_thread_func(void *v, wait_result_t w) { #pragma unused(v, w) + dlil_decr_pending_thread_count(); dlil_if_lock(); (void) msleep0(&ifnet_delayed_run, &dlil_ifnet_lock, (PZERO - 1), "ifnet_detacher", 0, ifnet_detacher_thread_cont); @@ -7059,11 +7697,16 @@ ifnet_detach_final(struct ifnet *ifp) * common case, so block without using a continuation. */ while (ifp->if_refio > 0) { - printf("%s: Waiting for IO references on %s interface " + DLIL_PRINTF("%s: Waiting for IO references on %s interface " "to be released\n", __func__, if_name(ifp)); (void) msleep(&(ifp->if_refio), &ifp->if_ref_lock, (PZERO - 1), "ifnet_ioref_wait", NULL); } + + VERIFY(ifp->if_datamov == 0); + VERIFY(ifp->if_drainers == 0); + VERIFY(ifp->if_suspend == 0); + ifp->if_refflags &= ~IFRF_READY; lck_mtx_unlock(&ifp->if_ref_lock); /* Drain and destroy send queue */ @@ -7181,6 +7824,7 @@ ifnet_detach_final(struct ifnet *ifp) /* Tear down poll thread affinity */ if (ptp != NULL) { VERIFY(ifp->if_eflags & IFEF_RXPOLL); + VERIFY(ifp->if_xflags & IFXF_LEGACY); (void) dlil_affinity_set(ptp, THREAD_AFFINITY_TAG_NULL); thread_deallocate(ptp); @@ -7222,6 +7866,9 @@ ifnet_detach_final(struct ifnet *ifp) /* clean-up input thread state */ dlil_clean_threading_info(inp); + /* clean-up poll parameters */ + VERIFY(ifp->if_poll_thread == THREAD_NULL); + dlil_reset_rxpoll_params(ifp); } /* The driver might unload, so point these to ourselves */ @@ -7257,6 +7904,7 @@ ifnet_detach_final(struct ifnet *ifp) VERIFY(ifp->if_delegated.family == 0); VERIFY(ifp->if_delegated.subfamily == 0); VERIFY(ifp->if_delegated.expensive == 0); + VERIFY(ifp->if_delegated.constrained == 0); /* QoS marking get cleared */ ifp->if_eflags &= ~IFEF_QOSMARKING_ENABLED; @@ -7317,7 +7965,7 @@ ifnet_detach_final(struct ifnet *ifp) } if (dlil_verbose) { - printf("%s: detached\n", if_name(ifp)); + DLIL_PRINTF("%s: detached\n", if_name(ifp)); } /* Release reference held during ifnet attach */ @@ -7475,6 +8123,7 @@ dlil_if_acquire(u_int32_t family, const void *uniqueid, { struct ifnet *ifp1 = NULL; struct dlil_ifnet *dlifp1 = NULL; + struct dlil_ifnet *dlifp1_saved = NULL; void *buf, *base, **pbuf; int ret = 0; @@ -7513,10 +8162,10 @@ dlil_if_acquire(u_int32_t family, const void *uniqueid, ret = EBUSY; goto end; } else { - dlifp1->dl_if_flags |= (DLIF_INUSE | DLIF_REUSE); /* Cache the first interface that can be recycled */ if (*ifp == NULL) { *ifp = ifp1; + dlifp1_saved = dlifp1; } /* * XXX Do not break or jump to end as we have to traverse @@ -7530,6 +8179,12 @@ dlil_if_acquire(u_int32_t family, const void *uniqueid, /* If there's an interface that can be recycled, use that */ if (*ifp != NULL) { + if (dlifp1_saved != NULL) { + lck_mtx_lock(&dlifp1_saved->dl_if_lock); + dlifp1_saved->dl_if_flags |= (DLIF_INUSE | DLIF_REUSE); + lck_mtx_unlock(&dlifp1_saved->dl_if_lock); + dlifp1_saved = NULL; + } goto end; } @@ -7985,7 +8640,12 @@ if_state_update(struct ifnet *ifp, if (ifp->if_interface_state.interface_availability == IF_INTERFACE_STATE_INTERFACE_AVAILABLE) { + os_log(OS_LOG_DEFAULT, "%s: interface %s (%u) available\n", + __func__, if_name(ifp), ifp->if_index); if_index_available = ifp->if_index; + } else { + os_log(OS_LOG_DEFAULT, "%s: interface %s (%u) unavailable)\n", + __func__, if_name(ifp), ifp->if_index); } } ifnet_lock_done(ifp); @@ -7993,8 +8653,8 @@ if_state_update(struct ifnet *ifp, /* * Check if the TCP connections going on this interface should be * forced to send probe packets instead of waiting for TCP timers - * to fire. This will be done when there is an explicit - * notification that the interface became available. + * to fire. This is done on an explicit notification such as + * SIOCSIFINTERFACESTATE which marks the interface as available. */ if (if_index_available > 0) { tcp_interface_send_probe(if_index_available); @@ -8060,30 +8720,76 @@ if_probe_connectivity(struct ifnet *ifp, u_int32_t conn_probe) } /* for uuid.c */ -int -uuid_get_ethernet(u_int8_t *node) +static int +get_ether_index(int * ret_other_index) { struct ifnet *ifp; - struct sockaddr_dl *sdl; + int en0_index = 0; + int other_en_index = 0; + int any_ether_index = 0; + short best_unit = 0; - ifnet_head_lock_shared(); + *ret_other_index = 0; TAILQ_FOREACH(ifp, &ifnet_head, if_link) { + /* + * find en0, or if not en0, the lowest unit en*, and if not + * that, any ethernet + */ ifnet_lock_shared(ifp); - IFA_LOCK_SPIN(ifp->if_lladdr); - sdl = (struct sockaddr_dl *)(void *)ifp->if_lladdr->ifa_addr; - if (sdl->sdl_type == IFT_ETHER) { - memcpy(node, LLADDR(sdl), ETHER_ADDR_LEN); - IFA_UNLOCK(ifp->if_lladdr); - ifnet_lock_done(ifp); - ifnet_head_done(); - return 0; + if (strcmp(ifp->if_name, "en") == 0) { + if (ifp->if_unit == 0) { + /* found en0, we're done */ + en0_index = ifp->if_index; + ifnet_lock_done(ifp); + break; + } + if (other_en_index == 0 || ifp->if_unit < best_unit) { + other_en_index = ifp->if_index; + best_unit = ifp->if_unit; + } + } else if (ifp->if_type == IFT_ETHER && any_ether_index == 0) { + any_ether_index = ifp->if_index; } - IFA_UNLOCK(ifp->if_lladdr); ifnet_lock_done(ifp); } - ifnet_head_done(); + if (en0_index == 0) { + if (other_en_index != 0) { + *ret_other_index = other_en_index; + } else if (any_ether_index != 0) { + *ret_other_index = any_ether_index; + } + } + return en0_index; +} + +int +uuid_get_ethernet(u_int8_t *node) +{ + static int en0_index; + struct ifnet *ifp; + int other_index = 0; + int the_index = 0; + int ret; - return -1; + ifnet_head_lock_shared(); + if (en0_index == 0 || ifindex2ifnet[en0_index] == NULL) { + en0_index = get_ether_index(&other_index); + } + if (en0_index != 0) { + the_index = en0_index; + } else if (other_index != 0) { + the_index = other_index; + } + if (the_index != 0) { + ifp = ifindex2ifnet[the_index]; + VERIFY(ifp != NULL); + memcpy(node, IF_LLADDR(ifp), ETHER_ADDR_LEN); + ret = 0; + } else { + ret = -1; + } + ifnet_head_done(); + return ret; } static int @@ -8184,18 +8890,18 @@ sysctl_rxpoll_wlowat SYSCTL_HANDLER_ARGS uint32_t i; int err; - i = if_rxpoll_wlowat; + i = if_sysctl_rxpoll_wlowat; err = sysctl_handle_int(oidp, &i, 0, req); if (err != 0 || req->newptr == USER_ADDR_NULL) { return err; } - if (i == 0 || i >= if_rxpoll_whiwat) { + if (i == 0 || i >= if_sysctl_rxpoll_whiwat) { return EINVAL; } - if_rxpoll_wlowat = i; + if_sysctl_rxpoll_wlowat = i; return err; } @@ -8206,18 +8912,18 @@ sysctl_rxpoll_whiwat SYSCTL_HANDLER_ARGS uint32_t i; int err; - i = if_rxpoll_whiwat; + i = if_sysctl_rxpoll_whiwat; err = sysctl_handle_int(oidp, &i, 0, req); if (err != 0 || req->newptr == USER_ADDR_NULL) { return err; } - if (i <= if_rxpoll_wlowat) { + if (i <= if_sysctl_rxpoll_wlowat) { return EINVAL; } - if_rxpoll_whiwat = i; + if_sysctl_rxpoll_whiwat = i; return err; } @@ -8263,13 +8969,14 @@ sysctl_rcvq_maxlen SYSCTL_HANDLER_ARGS return err; } -void +int dlil_node_present(struct ifnet *ifp, struct sockaddr *sa, int32_t rssi, int lqm, int npm, u_int8_t srvinfo[48]) { struct kev_dl_node_presence kev; struct sockaddr_dl *sdl; struct sockaddr_in6 *sin6; + int ret = 0; VERIFY(ifp); VERIFY(sa); @@ -8284,32 +8991,97 @@ dlil_node_present(struct ifnet *ifp, struct sockaddr *sa, kev.node_proximity_metric = npm; bcopy(srvinfo, kev.node_service_info, sizeof(kev.node_service_info)); - nd6_alt_node_present(ifp, sin6, sdl, rssi, lqm, npm); - dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_NODE_PRESENCE, - &kev.link_data, sizeof(kev)); + ret = nd6_alt_node_present(ifp, sin6, sdl, rssi, lqm, npm); + if (ret == 0) { + int err = dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_NODE_PRESENCE, + &kev.link_data, sizeof(kev)); + if (err != 0) { + log(LOG_ERR, "%s: Post DL_NODE_PRESENCE failed with" + "error %d\n", __func__, err); + } + } + return ret; } void dlil_node_absent(struct ifnet *ifp, struct sockaddr *sa) { - struct kev_dl_node_absence kev; - struct sockaddr_in6 *sin6; - struct sockaddr_dl *sdl; + struct kev_dl_node_absence kev = {}; + struct sockaddr_in6 *kev_sin6 = NULL; + struct sockaddr_dl *kev_sdl = NULL; - VERIFY(ifp); - VERIFY(sa); + VERIFY(ifp != NULL); + VERIFY(sa != NULL); VERIFY(sa->sa_family == AF_LINK || sa->sa_family == AF_INET6); - bzero(&kev, sizeof(kev)); - sin6 = &kev.sin6_node_address; - sdl = &kev.sdl_node_address; - nd6_alt_node_addr_decompose(ifp, sa, sdl, sin6); + kev_sin6 = &kev.sin6_node_address; + kev_sdl = &kev.sdl_node_address; + + if (sa->sa_family == AF_INET6) { + /* + * If IPv6 address is given, get the link layer + * address from what was cached in the neighbor cache + */ + VERIFY(sa->sa_len <= sizeof(*kev_sin6)); + bcopy(sa, kev_sin6, sa->sa_len); + nd6_alt_node_absent(ifp, kev_sin6, kev_sdl); + } else { + /* + * If passed address is AF_LINK type, derive the address + * based on the link address. + */ + nd6_alt_node_addr_decompose(ifp, sa, kev_sdl, kev_sin6); + nd6_alt_node_absent(ifp, kev_sin6, NULL); + } + + kev_sdl->sdl_type = ifp->if_type; + kev_sdl->sdl_index = ifp->if_index; - nd6_alt_node_absent(ifp, sin6); dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_NODE_ABSENCE, &kev.link_data, sizeof(kev)); } +int +dlil_node_present_v2(struct ifnet *ifp, struct sockaddr *sa, struct sockaddr_dl *sdl, + int32_t rssi, int lqm, int npm, u_int8_t srvinfo[48]) +{ + struct kev_dl_node_presence kev = {}; + struct sockaddr_dl *kev_sdl = NULL; + struct sockaddr_in6 *kev_sin6 = NULL; + int ret = 0; + + VERIFY(ifp != NULL); + VERIFY(sa != NULL && sdl != NULL); + VERIFY(sa->sa_family == AF_INET6 && sdl->sdl_family == AF_LINK); + + kev_sin6 = &kev.sin6_node_address; + kev_sdl = &kev.sdl_node_address; + + VERIFY(sdl->sdl_len <= sizeof(*kev_sdl)); + bcopy(sdl, kev_sdl, sdl->sdl_len); + kev_sdl->sdl_type = ifp->if_type; + kev_sdl->sdl_index = ifp->if_index; + + VERIFY(sa->sa_len <= sizeof(*kev_sin6)); + bcopy(sa, kev_sin6, sa->sa_len); + + kev.rssi = rssi; + kev.link_quality_metric = lqm; + kev.node_proximity_metric = npm; + bcopy(srvinfo, kev.node_service_info, sizeof(kev.node_service_info)); + + ret = nd6_alt_node_present(ifp, SIN6(sa), sdl, rssi, lqm, npm); + if (ret == 0) { + int err = dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_NODE_PRESENCE, + &kev.link_data, sizeof(kev)); + if (err != 0) { + log(LOG_ERR, "%s: Post DL_NODE_PRESENCE failed with", + "error %d\n", __func__, err); + } + } + return ret; +} + const void * dlil_ifaddr_bytes(const struct sockaddr_dl *sdl, size_t *sizep, kauth_cred_t *credp) @@ -8484,8 +9256,11 @@ ifnet_set_throttle(struct ifnet *ifp, u_int32_t level) IFCQ_UNLOCK(ifq); if (err == 0) { - printf("%s: throttling level set to %d\n", if_name(ifp), + DLIL_PRINTF("%s: throttling level set to %d\n", if_name(ifp), level); +#if NECP + necp_update_all_clients(); +#endif /* NECP */ if (level == IFNET_THROTTLE_OFF) { ifnet_start(ifp); } @@ -9133,7 +9908,7 @@ dlil_input_cksum_dbg(struct ifnet *ifp, struct mbuf *m, char *frame_header, if (frame_header == NULL || frame_header < (char *)mbuf_datastart(m) || frame_header > (char *)m->m_data) { - printf("%s: frame header pointer 0x%llx out of range " + DLIL_PRINTF("%s: frame header pointer 0x%llx out of range " "[0x%llx,0x%llx] for mbuf 0x%llx\n", if_name(ifp), (uint64_t)VM_KERNEL_ADDRPERM(frame_header), (uint64_t)VM_KERNEL_ADDRPERM(mbuf_datastart(m)), @@ -9193,7 +9968,7 @@ dlil_input_cksum_dbg(struct ifnet *ifp, struct mbuf *m, char *frame_header, if (hlen > rxoff) { hwcksum_dbg_bad_rxoff++; if (dlil_verbose) { - printf("%s: partial cksum start offset %d " + DLIL_PRINTF("%s: partial cksum start offset %d " "is less than frame header length %d for " "mbuf 0x%llx\n", if_name(ifp), rxoff, hlen, (uint64_t)VM_KERNEL_ADDRPERM(m)); @@ -9214,7 +9989,7 @@ dlil_input_cksum_dbg(struct ifnet *ifp, struct mbuf *m, char *frame_header, if (sum != m->m_pkthdr.csum_rx_val) { hwcksum_dbg_bad_cksum++; if (dlil_verbose) { - printf("%s: bad partial cksum value " + DLIL_PRINTF("%s: bad partial cksum value " "0x%x (expected 0x%x) for mbuf " "0x%llx [rx_start %d]\n", if_name(ifp), @@ -9653,7 +10428,7 @@ sysctl_get_kao_frames SYSCTL_HANDLER_ARGS error = ifnet_get_keepalive_offload_frames(ifp, frames_array, frames_array_count, frame_data_offset, &used_frames_count); if (error != 0) { - printf("%s: ifnet_get_keepalive_offload_frames error %d\n", + DLIL_PRINTF("%s: ifnet_get_keepalive_offload_frames error %d\n", __func__, error); goto done; } @@ -9679,27 +10454,3 @@ ifnet_update_stats_per_flow(struct ifnet_stats_per_flow *ifs, { tcp_update_stats_per_flow(ifs, ifp); } - -static void -dlil_mit_tcall_fn(thread_call_param_t arg0, thread_call_param_t arg1) -{ -#pragma unused(arg1) - struct ifnet *ifp = (struct ifnet *)arg0; - struct dlil_threading_info *inp = ifp->if_inp; - - ifnet_lock_shared(ifp); - if (!IF_FULLY_ATTACHED(ifp) || inp == NULL) { - ifnet_lock_done(ifp); - return; - } - - lck_mtx_lock_spin(&inp->input_lck); - inp->input_waiting |= DLIL_INPUT_WAITING; - if (!(inp->input_waiting & DLIL_INPUT_RUNNING) || - !qempty(&inp->rcvq_pkts)) { - inp->wtot++; - wakeup_one((caddr_t)&inp->input_waiting); - } - lck_mtx_unlock(&inp->input_lck); - ifnet_lock_done(ifp); -} diff --git a/bsd/net/dlil.h b/bsd/net/dlil.h index 881cf0505..7f2753cba 100644 --- a/bsd/net/dlil.h +++ b/bsd/net/dlil.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 1999-2017 Apple Inc. All rights reserved. + * Copyright (c) 1999-2019 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -97,6 +97,12 @@ enum { } \ } while (0) +#define net_timerusec(tvp, nsp) do { \ + *(nsp) = (tvp)->tv_nsec / NSEC_PER_USEC; \ + if ((tvp)->tv_sec > 0) \ + *(nsp) += ((tvp)->tv_sec * USEC_PER_SEC); \ +} while (0) + #define net_timernsec(tvp, nsp) do { \ *(nsp) = (tvp)->tv_nsec; \ if ((tvp)->tv_sec > 0) \ @@ -165,44 +171,12 @@ struct dlil_threading_info { struct thread *wloop_thr; /* workloop thread */ struct thread *poll_thr; /* poll thread */ u_int32_t tag; /* affinity tag */ - /* - * Opportunistic polling. - */ - ifnet_model_t mode; /* current mode */ - struct pktcntr tstats; /* incremental polling statistics */ - struct if_rxpoll_stats pstats; /* polling statistics */ -#define rxpoll_offreq pstats.ifi_poll_off_req -#define rxpoll_offerr pstats.ifi_poll_off_err -#define rxpoll_onreq pstats.ifi_poll_on_req -#define rxpoll_onerr pstats.ifi_poll_on_err -#define rxpoll_wavg pstats.ifi_poll_wakeups_avg -#define rxpoll_wlowat pstats.ifi_poll_wakeups_lowat -#define rxpoll_whiwat pstats.ifi_poll_wakeups_hiwat -#define rxpoll_pavg pstats.ifi_poll_packets_avg -#define rxpoll_pmin pstats.ifi_poll_packets_min -#define rxpoll_pmax pstats.ifi_poll_packets_max -#define rxpoll_plowat pstats.ifi_poll_packets_lowat -#define rxpoll_phiwat pstats.ifi_poll_packets_hiwat -#define rxpoll_bavg pstats.ifi_poll_bytes_avg -#define rxpoll_bmin pstats.ifi_poll_bytes_min -#define rxpoll_bmax pstats.ifi_poll_bytes_max -#define rxpoll_blowat pstats.ifi_poll_bytes_lowat -#define rxpoll_bhiwat pstats.ifi_poll_bytes_hiwat -#define rxpoll_plim pstats.ifi_poll_packets_limit -#define rxpoll_ival pstats.ifi_poll_interval_time - struct pktcntr sstats; /* packets and bytes per sampling */ - struct timespec mode_holdtime; /* mode holdtime in nsec */ - struct timespec mode_lasttime; /* last mode change time in nsec */ - struct timespec sample_holdtime; /* sampling holdtime in nsec */ - struct timespec sample_lasttime; /* last sampling time in nsec */ - struct timespec dbg_lasttime; /* last debug message time in nsec */ #if IFNET_INPUT_SANITY_CHK /* * For debugging. */ u_int64_t input_mbuf_cnt; /* total # of packets processed */ #endif - thread_call_t input_mit_tcall; /* coalescing input processing */ }; /* @@ -230,11 +204,20 @@ struct dlil_main_threading_info { #define DLIL_IFF_TSO 0x01 /* Interface filter supports TSO */ #define DLIL_IFF_INTERNAL 0x02 /* Apple internal -- do not count towards stats */ +/* Input poll interval definitions */ +#define IF_RXPOLL_INTERVALTIME_MIN (1ULL * 1000) /* 1 us */ +#define IF_RXPOLL_INTERVALTIME (1ULL * 1000 * 1000) /* 1 ms */ + extern int dlil_verbose; extern uint32_t hwcksum_dbg; extern uint32_t hwcksum_tx; extern uint32_t hwcksum_rx; extern struct dlil_threading_info *dlil_main_input_thread; +extern unsigned int net_rxpoll; +extern uint32_t if_rxpoll; +extern uint32_t if_rxpoll_decay; +extern uint32_t if_rxpoll_interval_pkts; +extern uint32_t if_rcvq_maxlen; extern void dlil_init(void); @@ -323,7 +306,7 @@ extern void dlil_detach_filter(interface_filter_t); extern void dlil_proto_unplumb_all(ifnet_t); -extern void dlil_post_msg(struct ifnet *, u_int32_t, u_int32_t, +extern int dlil_post_msg(struct ifnet *, u_int32_t, u_int32_t, struct net_event_data *, u_int32_t); extern void dlil_post_sifflags_msg(struct ifnet *); @@ -332,6 +315,14 @@ extern int dlil_post_complete_msg(struct ifnet *, struct kev_msg *); extern int dlil_alloc_local_stats(struct ifnet *); +extern void ifnet_filter_update_tso(boolean_t filter_enable); +extern errno_t dlil_rxpoll_validate_params(struct ifnet_poll_params *); +extern void dlil_rxpoll_update_params(struct ifnet *, + struct ifnet_poll_params *); +extern void ifnet_poll(struct ifnet *); +extern errno_t ifnet_input_poll(struct ifnet *, struct mbuf *, + struct mbuf *, const struct ifnet_stat_increment_param *); + /* * dlil_if_acquire is obsolete. Use ifnet_allocate. @@ -346,9 +337,11 @@ extern void dlil_if_release(struct ifnet *ifp); extern errno_t dlil_if_ref(struct ifnet *); extern errno_t dlil_if_free(struct ifnet *); -extern void dlil_node_present(struct ifnet *, struct sockaddr *, int32_t, int, +extern int dlil_node_present(struct ifnet *, struct sockaddr *, int32_t, int, int, u_int8_t[48]); extern void dlil_node_absent(struct ifnet *, struct sockaddr *); +extern int dlil_node_present_v2(struct ifnet *, struct sockaddr *, struct sockaddr_dl *, int32_t, int, + int, u_int8_t[48]); extern const void *dlil_ifaddr_bytes(const struct sockaddr_dl *, size_t *, kauth_cred_t *); @@ -356,7 +349,7 @@ extern const void *dlil_ifaddr_bytes(const struct sockaddr_dl *, size_t *, extern void dlil_report_issues(struct ifnet *, u_int8_t[DLIL_MODIDLEN], u_int8_t[DLIL_MODARGLEN]); -#define PROTO_HASH_SLOTS 4 +#define PROTO_HASH_SLOTS 5 extern int proto_hash_value(u_int32_t); diff --git a/bsd/net/ether_if_module.c b/bsd/net/ether_if_module.c index 35acf1064..f26aec76c 100644 --- a/bsd/net/ether_if_module.c +++ b/bsd/net/ether_if_module.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2013 Apple Inc. All rights reserved. + * Copyright (c) 2000-2017 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -96,6 +96,7 @@ #include #include #include +#include #if BOND #include #endif /* BOND */ @@ -105,6 +106,9 @@ #if IF_FAKE #include #endif /* IF_FAKE */ +#if IF_HEADLESS +extern void if_headless_init(void); +#endif /* IF_HEADLESS */ #include @@ -377,12 +381,6 @@ ether_demux(ifnet_t ifp, mbuf_t m, char *frame_header, m->m_flags &= ~M_HASFCS; } - if (ifp->if_eflags & IFEF_BOND) { - /* if we're bonded, bond "protocol" gets all the packets */ - *protocol_family = PF_BOND; - return 0; - } - if ((eh->ether_dhost[0] & 1) == 0) { /* * When the driver is put into promiscuous mode we may receive @@ -396,6 +394,12 @@ ether_demux(ifnet_t ifp, mbuf_t m, char *frame_header, } } + /* check for IEEE 802.15.4 */ + if (ether_type == htons(ETHERTYPE_IEEE802154)) { + *protocol_family = PF_802154; + return 0; + } + /* check for VLAN */ if ((m->m_pkthdr.csum_flags & CSUM_VLAN_TAG_VALID) != 0) { if (EVL_VLANOFTAG(m->m_pkthdr.vlan_tag) != 0) { @@ -655,7 +659,12 @@ ether_family_init(void) #if IF_FAKE if_fake_init(); #endif /* IF_FAKE */ - +#if IF_HEADLESS + if_headless_init(); +#endif /* IF_HEADLESS */ +#if SIXLOWPAN + sixlowpan_family_init(); +#endif /* VLAN */ done: return error; diff --git a/bsd/net/ethernet.h b/bsd/net/ethernet.h index 9dda79efb..c4df50720 100644 --- a/bsd/net/ethernet.h +++ b/bsd/net/ethernet.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2010 Apple Inc. All rights reserved. + * Copyright (c) 2000-2017 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -108,6 +108,7 @@ typedef struct ether_addr { #define ETHERTYPE_PTP 0x88f7 /* IEEE 1588 Precision Time Protocol */ #define ETHERTYPE_LOOPBACK 0x9000 /* used to test interfaces */ /* XXX - add more useful types here */ +#define ETHERTYPE_IEEE802154 0x0809 /* 802.15.4 */ /* * The ETHERTYPE_NTRAILER packet types starting at ETHERTYPE_TRAIL have diff --git a/bsd/net/flowadv.c b/bsd/net/flowadv.c index 020081305..2d6a41ecb 100644 --- a/bsd/net/flowadv.c +++ b/bsd/net/flowadv.c @@ -246,6 +246,7 @@ flowadv_thread_cont(int err) } } +__dead2 static void flowadv_thread_func(void *v, wait_result_t w) { diff --git a/bsd/net/frame802154.c b/bsd/net/frame802154.c new file mode 100644 index 000000000..e5f2e9355 --- /dev/null +++ b/bsd/net/frame802154.c @@ -0,0 +1,422 @@ +/* + * Copyright (c) 2017 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ +/* + * + * Copyright (c) 2008, Swedish Institute of Computer Science + * All rights reserved. + * + * Additional fixes for AVR contributed by: + * + * Colin O'Flynn coflynn@newae.com + * Eric Gnoske egnoske@gmail.com + * Blake Leverett bleverett@gmail.com + * Mike Vidales mavida404@gmail.com + * Kevin Brown kbrown3@uccs.edu + * Nate Bohlmann nate@elfwerks.com + * + * Additional fixes for MSP430 contributed by: + * Joakim Eriksson + * Niclas Finne + * Nicolas Tsiftes + * + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of the copyright holders nor the names of + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + */ +/* + * \brief This file is where the main functions that relate to frame + * manipulation will reside. + */ + +/** + * \file + * \brief 802.15.4 frame creation and parsing functions + * + * This file converts to and from a structure to a packed 802.15.4 + * frame. + */ + +/** + * \addtogroup frame802154 + * @{ + */ + +#include "cc.h" +#include "frame802154.h" +//#include "net/llsec/llsec802154.h" +#include "linkaddr.h" +#include + +/** + * \brief Structure that contains the lengths of the various addressing and security fields + * in the 802.15.4 header. This structure is used in \ref frame802154_create() + */ +typedef struct { + uint8_t dest_pid_len; /**< Length (in bytes) of destination PAN ID field */ + uint8_t dest_addr_len; /**< Length (in bytes) of destination address field */ + uint8_t src_pid_len; /**< Length (in bytes) of source PAN ID field */ + uint8_t src_addr_len; /**< Length (in bytes) of source address field */ + uint8_t aux_sec_len; /**< Length (in bytes) of aux security header field */ +} field_length_t; + +/*----------------------------------------------------------------------------*/ +CC_INLINE static uint8_t +addr_len(uint8_t mode) +{ + switch (mode) { + case FRAME802154_SHORTADDRMODE: /* 16-bit address */ + return 2; + case FRAME802154_LONGADDRMODE: /* 64-bit address */ + return 8; + default: + return 0; + } +} +/*----------------------------------------------------------------------------*/ +#if LLSEC802154_USES_EXPLICIT_KEYS +static uint8_t +get_key_id_len(uint8_t key_id_mode) +{ + switch (key_id_mode) { + case FRAME802154_1_BYTE_KEY_ID_MODE: + return 1; + case FRAME802154_5_BYTE_KEY_ID_MODE: + return 5; + case FRAME802154_9_BYTE_KEY_ID_MODE: + return 9; + default: + return 0; + } +} +#endif /* LLSEC802154_USES_EXPLICIT_KEYS */ +/*----------------------------------------------------------------------------*/ +static void +field_len(frame802154_t *p, field_length_t *flen) +{ + /* init flen to zeros */ + memset(flen, 0, sizeof(field_length_t)); + + /* Determine lengths of each field based on fcf and other args */ + if (p->fcf.dest_addr_mode & 3) { + flen->dest_pid_len = 2; + } + if (p->fcf.src_addr_mode & 3) { + flen->src_pid_len = 2; + } + + /* Set PAN ID compression bit if src pan id matches dest pan id. */ + if (p->fcf.dest_addr_mode & 3 && p->fcf.src_addr_mode & 3 && + p->src_pid == p->dest_pid) { + p->fcf.panid_compression = 1; + + /* compressed header, only do dest pid */ + flen->src_pid_len = 0; + } else { + p->fcf.panid_compression = 0; + } + + /* determine address lengths */ + flen->dest_addr_len = addr_len(p->fcf.dest_addr_mode & 3); + flen->src_addr_len = addr_len(p->fcf.src_addr_mode & 3); + +#if LLSEC802154_SECURITY_LEVEL + /* Aux security header */ + if (p->fcf.security_enabled & 1) { + flen->aux_sec_len = 5 +#if LLSEC802154_USES_EXPLICIT_KEYS + + get_key_id_len(p->aux_hdr.security_control.key_id_mode); +#endif /* LLSEC802154_USES_EXPLICIT_KEYS */ + ; + } +#endif /* LLSEC802154_SECURITY_LEVEL */ +} +/*----------------------------------------------------------------------------*/ +/** + * \brief Calculates the length of the frame header. This function is + * meant to be called by a higher level function, that interfaces to a MAC. + * + * \param p Pointer to frame802154_t_t struct, which specifies the + * frame to send. + * + * \return The length of the frame header. + */ +int +frame802154_hdrlen(frame802154_t *p) +{ + field_length_t flen; + field_len(p, &flen); + return 3 + flen.dest_pid_len + flen.dest_addr_len + + flen.src_pid_len + flen.src_addr_len + flen.aux_sec_len; +} +/*----------------------------------------------------------------------------*/ +/** + * \brief Creates a frame for transmission over the air. This function is + * meant to be called by a higher level function, that interfaces to a MAC. + * + * \param p Pointer to frame802154_t struct, which specifies the + * frame to send. + * + * \param buf Pointer to the buffer to use for the frame. + * + * \return The length of the frame header + */ +int +frame802154_create(frame802154_t *p, uint8_t *buf) +{ + int c; + field_length_t flen; + uint8_t pos; +#if LLSEC802154_USES_EXPLICIT_KEYS + uint8_t key_id_mode; +#endif /* LLSEC802154_USES_EXPLICIT_KEYS */ + + field_len(p, &flen); + + /* OK, now we have field lengths. Time to actually construct */ + /* the outgoing frame, and store it in buf */ + buf[0] = (p->fcf.frame_type & 7) | + ((p->fcf.security_enabled & 1) << 3) | + ((p->fcf.frame_pending & 1) << 4) | + ((p->fcf.ack_required & 1) << 5) | + ((p->fcf.panid_compression & 1) << 6); + buf[1] = ((p->fcf.dest_addr_mode & 3) << 2) | + ((p->fcf.frame_version & 3) << 4) | + ((p->fcf.src_addr_mode & 3) << 6); + + /* sequence number */ + buf[2] = p->seq; + pos = 3; + + /* Destination PAN ID */ + if (flen.dest_pid_len == 2) { + buf[pos++] = p->dest_pid & 0xff; + buf[pos++] = (p->dest_pid >> 8) & 0xff; + } + + /* Destination address */ + for (c = flen.dest_addr_len; c > 0; c--) { + buf[pos++] = p->dest_addr[c - 1]; + } + + /* Source PAN ID */ + if (flen.src_pid_len == 2) { + buf[pos++] = p->src_pid & 0xff; + buf[pos++] = (p->src_pid >> 8) & 0xff; + } + + /* Source address */ + for (c = flen.src_addr_len; c > 0; c--) { + buf[pos++] = p->src_addr[c - 1]; + } + +#if LLSEC802154_SECURITY_LEVEL + /* Aux header */ + if (flen.aux_sec_len) { + buf[pos++] = p->aux_hdr.security_control.security_level +#if LLSEC802154_USES_EXPLICIT_KEYS + | (p->aux_hdr.security_control.key_id_mode << 3) +#endif /* LLSEC802154_USES_EXPLICIT_KEYS */ + ; + memcpy(buf + pos, p->aux_hdr.frame_counter.u8, 4); + pos += 4; + +#if LLSEC802154_USES_EXPLICIT_KEYS + key_id_mode = p->aux_hdr.security_control.key_id_mode; + if (key_id_mode) { + c = (key_id_mode - 1) * 4; + memcpy(buf + pos, p->aux_hdr.key_source.u8, c); + pos += c; + buf[pos++] = p->aux_hdr.key_index; + } +#endif /* LLSEC802154_USES_EXPLICIT_KEYS */ + } +#endif /* LLSEC802154_SECURITY_LEVEL */ + + return (int)pos; +} +/*----------------------------------------------------------------------------*/ +/** + * \brief Parses an input frame. Scans the input frame to find each + * section, and stores the information of each section in a + * frame802154_t structure. + * + * \param data The input data from the radio chip. + * \param len The size of the input data + * \param pf The frame802154_t struct to store the parsed frame information. + */ +int +frame802154_parse(uint8_t *data, int len, frame802154_t *pf, uint8_t **payload) +{ + uint8_t *p; + frame802154_fcf_t fcf; + int c; +#if LLSEC802154_USES_EXPLICIT_KEYS + uint8_t key_id_mode; +#endif /* LLSEC802154_USES_EXPLICIT_KEYS */ + + if (len < 3) { + return 0; + } + + p = data; + + /* decode the FCF */ + fcf.frame_type = p[0] & 7; + fcf.security_enabled = (p[0] >> 3) & 1; + fcf.frame_pending = (p[0] >> 4) & 1; + fcf.ack_required = (p[0] >> 5) & 1; + fcf.panid_compression = (p[0] >> 6) & 1; + + fcf.dest_addr_mode = (p[1] >> 2) & 3; + fcf.frame_version = (p[1] >> 4) & 3; + fcf.src_addr_mode = (p[1] >> 6) & 3; + + /* copy fcf and seqNum */ + memcpy(&pf->fcf, &fcf, sizeof(frame802154_fcf_t)); + pf->seq = p[2]; + p += 3; /* Skip first three bytes */ + + /* Destination address, if any */ + if (fcf.dest_addr_mode) { + /* Destination PAN */ + pf->dest_pid = p[0] + (p[1] << 8); + p += 2; + + /* Destination address */ + /* l = addr_len(fcf.dest_addr_mode); */ + /* for(c = 0; c < l; c++) { */ + /* pf->dest_addr.u8[c] = p[l - c - 1]; */ + /* } */ + /* p += l; */ + if (fcf.dest_addr_mode == FRAME802154_SHORTADDRMODE) { + linkaddr_copy((linkaddr_t *)(uintptr_t)&(pf->dest_addr), &linkaddr_null); + pf->dest_addr[0] = p[1]; + pf->dest_addr[1] = p[0]; + p += 2; + } else if (fcf.dest_addr_mode == FRAME802154_LONGADDRMODE) { + for (c = 0; c < 8; c++) { + pf->dest_addr[c] = p[7 - c]; + } + p += 8; + } + } else { + linkaddr_copy((linkaddr_t *)(uintptr_t)&(pf->dest_addr), &linkaddr_null); + pf->dest_pid = 0; + } + + /* Source address, if any */ + if (fcf.src_addr_mode) { + /* Source PAN */ + if (!fcf.panid_compression) { + pf->src_pid = p[0] + (p[1] << 8); + p += 2; + } else { + pf->src_pid = pf->dest_pid; + } + + /* Source address */ + /* l = addr_len(fcf.src_addr_mode); */ + /* for(c = 0; c < l; c++) { */ + /* pf->src_addr.u8[c] = p[l - c - 1]; */ + /* } */ + /* p += l; */ + if (fcf.src_addr_mode == FRAME802154_SHORTADDRMODE) { + linkaddr_copy((linkaddr_t *)(uintptr_t)&(pf->src_addr), &linkaddr_null); + pf->src_addr[0] = p[1]; + pf->src_addr[1] = p[0]; + p += 2; + } else if (fcf.src_addr_mode == FRAME802154_LONGADDRMODE) { + for (c = 0; c < 8; c++) { + pf->src_addr[c] = p[7 - c]; + } + p += 8; + } + } else { + linkaddr_copy((linkaddr_t *)(uintptr_t)&(pf->src_addr), &linkaddr_null); + pf->src_pid = 0; + } + +#if LLSEC802154_SECURITY_LEVEL + if (fcf.security_enabled) { + pf->aux_hdr.security_control.security_level = p[0] & 7; +#if LLSEC802154_USES_EXPLICIT_KEYS + pf->aux_hdr.security_control.key_id_mode = (p[0] >> 3) & 3; +#endif /* LLSEC802154_USES_EXPLICIT_KEYS */ + p += 1; + + memcpy(pf->aux_hdr.frame_counter.u8, p, 4); + p += 4; + +#if LLSEC802154_USES_EXPLICIT_KEYS + key_id_mode = pf->aux_hdr.security_control.key_id_mode; + if (key_id_mode) { + c = (key_id_mode - 1) * 4; + memcpy(pf->aux_hdr.key_source.u8, p, c); + p += c; + pf->aux_hdr.key_index = p[0]; + p += 1; + } +#endif /* LLSEC802154_USES_EXPLICIT_KEYS */ + } +#endif /* LLSEC802154_SECURITY_LEVEL */ + + /* header length */ + c = p - data; + /* payload length */ + pf->payload_len = (len - c); + /* payload */ + *payload = p; + + /* return header length if successful */ + return c > len ? 0 : c; +} +/** \} */ diff --git a/bsd/net/frame802154.h b/bsd/net/frame802154.h new file mode 100644 index 000000000..fbdb29cab --- /dev/null +++ b/bsd/net/frame802154.h @@ -0,0 +1,228 @@ +/* + * Copyright (c) 2017 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ +/* + * Copyright (c) 2008, Swedish Institute of Computer Science + * All rights reserved. + * + * Additional fixes for AVR contributed by: + * Colin O'Flynn coflynn@newae.com + * Eric Gnoske egnoske@gmail.com + * Blake Leverett bleverett@gmail.com + * Mike Vidales mavida404@gmail.com + * Kevin Brown kbrown3@uccs.edu + * Nate Bohlmann nate@elfwerks.com + * + * Additional fixes for MSP430 contributed by: + * Joakim Eriksson + * Niclas Finne + * Nicolas Tsiftes + * + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of the copyright holders nor the names of + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** + * \addtogroup net + * @{ + */ + +/** + * \defgroup frame802154 802.15.4 frame creation and parsing + * @{ + */ +/** + * \file + * \brief 802.15.4 frame creation and parsing functions + * + * This file converts to and from a structure to a packed 802.15.4 + * frame. + * + */ + +/* Includes */ +#ifndef FRAME_802154_H +#define FRAME_802154_H + +#include "contiki-conf.h" + +#include + +#ifdef IEEE802154_CONF_PANID +#define IEEE802154_PANID IEEE802154_CONF_PANID +#else /* IEEE802154_CONF_PANID */ +#define IEEE802154_PANID 0xABCD +#endif /* IEEE802154_CONF_PANID */ + +/* Macros & Defines */ + +/** \brief These are some definitions of values used in the FCF. See the 802.15.4 spec for details. + * \name FCF element values definitions + * @{ + */ +#define FRAME802154_BEACONFRAME (0x00) +#define FRAME802154_DATAFRAME (0x01) +#define FRAME802154_ACKFRAME (0x02) +#define FRAME802154_CMDFRAME (0x03) + +#define FRAME802154_BEACONREQ (0x07) + +#define FRAME802154_IEEERESERVED (0x00) +#define FRAME802154_NOADDR (0x00) /**< Only valid for ACK or Beacon frames. */ +#define FRAME802154_SHORTADDRMODE (0x02) +#define FRAME802154_LONGADDRMODE (0x03) + +#define FRAME802154_NOBEACONS (0x0F) + +#define FRAME802154_BROADCASTADDR (0xFFFF) +#define FRAME802154_BROADCASTPANDID (0xFFFF) + +#define FRAME802154_IEEE802154_2003 (0x00) +#define FRAME802154_IEEE802154_2006 (0x01) + +#define FRAME802154_SECURITY_LEVEL_NONE (0) +#define FRAME802154_SECURITY_LEVEL_MIC_32 (1) +#define FRAME802154_SECURITY_LEVEL_MIC_64 (2) +#define FRAME802154_SECURITY_LEVEL_MIC_128 (3) +#define FRAME802154_SECURITY_LEVEL_ENC (4) +#define FRAME802154_SECURITY_LEVEL_ENC_MIC_32 (5) +#define FRAME802154_SECURITY_LEVEL_ENC_MIC_64 (6) +#define FRAME802154_SECURITY_LEVEL_ENC_MIC_128 (7) + +#define FRAME802154_IMPLICIT_KEY (0) +#define FRAME802154_1_BYTE_KEY_ID_MODE (1) +#define FRAME802154_5_BYTE_KEY_ID_MODE (2) +#define FRAME802154_9_BYTE_KEY_ID_MODE (3) + +/** + * @brief The IEEE 802.15.4 frame has a number of constant/fixed fields that + * can be counted to make frame construction and max payload + * calculations easier. + * + * These include: + * 1. FCF - 2 bytes - Fixed + * 2. Sequence number - 1 byte - Fixed + * 3. Addressing fields - 4 - 20 bytes - Variable + * 4. Aux security header - 0 - 14 bytes - Variable + * 5. CRC - 2 bytes - Fixed + */ + +/** + * \brief Defines the bitfields of the frame control field (FCF). + */ +typedef struct { + uint8_t frame_type; /**< 3 bit. Frame type field, see 802.15.4 */ + uint8_t security_enabled; /**< 1 bit. True if security is used in this frame */ + uint8_t frame_pending; /**< 1 bit. True if sender has more data to send */ + uint8_t ack_required; /**< 1 bit. Is an ack frame required? */ + uint8_t panid_compression; /**< 1 bit. Is this a compressed header? */ + /* uint8_t reserved; */ /**< 3 bit. Unused bits */ + uint8_t dest_addr_mode; /**< 2 bit. Destination address mode, see 802.15.4 */ + uint8_t frame_version; /**< 2 bit. 802.15.4 frame version */ + uint8_t src_addr_mode; /**< 2 bit. Source address mode, see 802.15.4 */ +} frame802154_fcf_t; + +/** \brief 802.15.4 security control bitfield. See section 7.6.2.2.1 in 802.15.4 specification */ +typedef struct { + uint8_t security_level; /**< 3 bit. security level */ + uint8_t key_id_mode; /**< 2 bit. Key identifier mode */ + uint8_t reserved; /**< 3 bit. Reserved bits */ +} frame802154_scf_t; + +typedef union { + uint32_t u32; + uint16_t u16[2]; + uint8_t u8[4]; +} frame802154_frame_counter_t; + +typedef union { + uint16_t u16[4]; + uint8_t u8[8]; +} frame802154_key_source_t; + +/** \brief 802.15.4 Aux security header */ +typedef struct { + frame802154_scf_t security_control; /**< Security control bitfield */ + frame802154_frame_counter_t frame_counter; /**< Frame counter, used for security */ + frame802154_key_source_t key_source; /**< Key Source subfield */ + uint8_t key_index; /**< Key Index subfield */ +} frame802154_aux_hdr_t; + +/** \brief Parameters used by the frame802154_create() function. These + * parameters are used in the 802.15.4 frame header. See the 802.15.4 + * specification for details. + */ +struct frame802154 { + /* The fields dest_addr and src_addr must come first to ensure they are aligned to the + * CPU word size. Needed as they are accessed directly as linkaddr_t*. Note we cannot use + * the type linkaddr_t directly here, as we always need 8 bytes, not LINKADDR_SIZE bytes. */ + uint8_t dest_addr[8]; /**< Destination address */ + uint8_t src_addr[8]; /**< Source address */ + frame802154_fcf_t fcf; /**< Frame control field */ + uint8_t seq; /**< Sequence number */ + uint16_t dest_pid; /**< Destination PAN ID */ + uint16_t src_pid; /**< Source PAN ID */ + frame802154_aux_hdr_t aux_hdr; /**< Aux security header */ + //uint8_t *payload; /**< Pointer to 802.15.4 payload */ + int payload_len; /**< Length of payload field */ +}; +typedef struct frame802154 frame802154_t; + +/* Prototypes */ + +int frame802154_hdrlen(frame802154_t *p); +int frame802154_create(frame802154_t *p, uint8_t *buf); +int frame802154_parse(uint8_t *data, int length, frame802154_t *pf, uint8_t **payload); + +/** @} */ +#endif /* FRAME_802154_H */ +/** @} */ +/** @} */ diff --git a/bsd/net/if.c b/bsd/net/if.c index 79921cbdb..dcb728807 100644 --- a/bsd/net/if.c +++ b/bsd/net/if.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2018 Apple Inc. All rights reserved. + * Copyright (c) 2000-2019 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -100,6 +100,7 @@ #include #include #include +#include #include #include #include @@ -132,6 +133,7 @@ #include #endif + #include /* @@ -247,6 +249,14 @@ static uint32_t if_verbose = 0; SYSCTL_INT(_net_link_generic_system, OID_AUTO, if_verbose, CTLFLAG_RW | CTLFLAG_LOCKED, &if_verbose, 0, ""); +#if (DEBUG || DEVELOPMENT) +static uint32_t default_tcp_kao_max = 0; +SYSCTL_INT(_net_link_generic_system, OID_AUTO, default_tcp_kao_max, + CTLFLAG_RW | CTLFLAG_LOCKED, &default_tcp_kao_max, 0, ""); +#else +static const uint32_t default_tcp_kao_max = 0; +#endif /* (DEBUG || DEVELOPMENT) */ + boolean_t intcoproc_unrestricted; /* Eventhandler context for interface events */ @@ -393,7 +403,7 @@ if_detach_ifa_common(struct ifnet *ifp, struct ifaddr *ifa, int link) panic("%s: unexpected (missing) refcnt ifa=%p", __func__, ifa); /* NOTREACHED */ } - ifa->ifa_debug &= ~(IFD_ATTACHED | IFD_DETACHING); + ifa->ifa_debug &= ~IFD_ATTACHED; if (ifa->ifa_detached != NULL) { (*ifa->ifa_detached)(ifa); @@ -795,11 +805,15 @@ u_int32_t if_functional_type(struct ifnet *ifp, bool exclude_delegate) { u_int32_t ret = IFRTYPE_FUNCTIONAL_UNKNOWN; + if (ifp != NULL) { if (ifp->if_flags & IFF_LOOPBACK) { ret = IFRTYPE_FUNCTIONAL_LOOPBACK; + } else if (IFNET_IS_COMPANION_LINK(ifp)) { + ret = IFRTYPE_FUNCTIONAL_COMPANIONLINK; } else if ((exclude_delegate && - (ifp->if_subfamily == IFNET_SUBFAMILY_WIFI)) || + (ifp->if_family == IFNET_FAMILY_ETHERNET && + ifp->if_subfamily == IFNET_SUBFAMILY_WIFI)) || (!exclude_delegate && IFNET_IS_WIFI(ifp))) { if (ifp->if_eflags & IFEF_AWDL) { ret = IFRTYPE_FUNCTIONAL_WIFI_AWDL; @@ -1806,13 +1820,23 @@ ifioctl_linkparams(struct ifnet *ifp, u_long cmd, caddr_t data, struct proc *p) switch (cmd) { case SIOCSIFLINKPARAMS: { /* struct if_linkparamsreq */ - struct tb_profile tb = { 0, 0, 0 }; + struct tb_profile tb = { .rate = 0, .percent = 0, .depth = 0 }; if ((error = proc_suser(p)) != 0) { break; } + char netem_name[32]; + (void) snprintf(netem_name, sizeof(netem_name), + "if_output_netem_%s", if_name(ifp)); + error = netem_config(&ifp->if_output_netem, netem_name, + &iflpr->iflpr_output_netem, (void *)ifp, + ifnet_enqueue_netem, NETEM_MAX_BATCH_SIZE); + if (error != 0) { + break; + } + IFCQ_LOCK(ifq); if (!IFCQ_IS_READY(ifq)) { error = ENXIO; @@ -1864,6 +1888,12 @@ ifioctl_linkparams(struct ifnet *ifp, u_long cmd, caddr_t data, struct proc *p) sizeof(iflpr->iflpr_output_lt)); bcopy(&ifp->if_input_lt, &iflpr->iflpr_input_lt, sizeof(iflpr->iflpr_input_lt)); + + if (ifp->if_output_netem != NULL) { + netem_get_params(ifp->if_output_netem, + &iflpr->iflpr_output_netem); + } + break; } @@ -1980,9 +2010,11 @@ ifioctl_getnetagents(struct ifnet *ifp, u_int32_t *count, user_addr_t uuid_p) #define IF_MAXAGENTS 64 #define IF_AGENT_INCREMENT 8 -static int +int if_add_netagent_locked(struct ifnet *ifp, uuid_t new_agent_uuid) { + VERIFY(ifp != NULL); + uuid_t *first_empty_slot = NULL; u_int32_t index = 0; bool already_added = FALSE; @@ -2290,14 +2322,10 @@ if_set_qosmarking_mode(struct ifnet *ifp, u_int32_t mode) switch (mode) { case IFRTYPE_QOSMARKING_MODE_NONE: ifp->if_qosmarking_mode = IFRTYPE_QOSMARKING_MODE_NONE; - ifp->if_eflags &= ~IFEF_QOSMARKING_CAPABLE; break; case IFRTYPE_QOSMARKING_FASTLANE: - ifp->if_qosmarking_mode = IFRTYPE_QOSMARKING_FASTLANE; - ifp->if_eflags |= IFEF_QOSMARKING_CAPABLE; - if (net_qos_policy_capable_enabled != 0) { - ifp->if_eflags |= IFEF_QOSMARKING_ENABLED; - } + case IFRTYPE_QOSMARKING_RFC4594: + ifp->if_qosmarking_mode = mode; break; default: error = EINVAL; @@ -2305,7 +2333,7 @@ if_set_qosmarking_mode(struct ifnet *ifp, u_int32_t mode) } if (error == 0 && old_mode != ifp->if_qosmarking_mode) { dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_QOS_MODE_CHANGED, - NULL, sizeof(struct kev_dl_rrc_state)); + NULL, 0); } return error; } @@ -2360,10 +2388,12 @@ ifioctl_iforder(u_long cmd, caddr_t data) if (found_duplicate) { break; } - } - - error = ifnet_reset_order(ordered_indices, ifo->ifo_count); + error = ifnet_reset_order(ordered_indices, ifo->ifo_count); + } else { + // Clear the list + error = ifnet_reset_order(NULL, 0); + } break; } @@ -2640,6 +2670,7 @@ ifioctl_restrict_intcoproc(unsigned long cmd, const char *ifname, case SIOCGIFNETMASK_IN6: case SIOCGIFPROTOLIST32: case SIOCGIFPROTOLIST64: + case SIOCGIFXFLAGS: return false; default: #if (DEBUG || DEVELOPMENT) @@ -2900,6 +2931,15 @@ ifioctl(struct socket *so, u_long cmd, caddr_t data, struct proc *p) case SIOCGIFLOWINTERNET: /* struct ifreq */ case SIOCGIFLOWPOWER: /* struct ifreq */ case SIOCSIFLOWPOWER: /* struct ifreq */ + case SIOCSIF6LOWPAN: /* struct ifreq */ + case SIOCGIF6LOWPAN: /* struct ifreq */ + case SIOCGIFMPKLOG: /* struct ifreq */ + case SIOCSIFMPKLOG: /* struct ifreq */ + case SIOCGIFCONSTRAINED: /* struct ifreq */ + case SIOCSIFCONSTRAINED: /* struct ifreq */ + case SIOCGIFXFLAGS: /* struct ifreq */ + case SIOCGIFNOACKPRIO: /* struct ifreq */ + case SIOCSIFNOACKPRIO: /* struct ifreq */ { /* struct ifreq */ struct ifreq ifr; bcopy(data, &ifr, sizeof(ifr)); @@ -2924,20 +2964,20 @@ ifioctl(struct socket *so, u_long cmd, caddr_t data, struct proc *p) case SIOCSIFPHYADDR: /* struct {if,in_}aliasreq */ bcopy(((struct in_aliasreq *)(void *)data)->ifra_name, ifname, IFNAMSIZ); - ifp = ifunit(ifname); + ifp = ifunit_ref(ifname); break; #if INET6 case SIOCSIFPHYADDR_IN6_32: /* struct in6_aliasreq_32 */ bcopy(((struct in6_aliasreq_32 *)(void *)data)->ifra_name, ifname, IFNAMSIZ); - ifp = ifunit(ifname); + ifp = ifunit_ref(ifname); break; case SIOCSIFPHYADDR_IN6_64: /* struct in6_aliasreq_64 */ bcopy(((struct in6_aliasreq_64 *)(void *)data)->ifra_name, ifname, IFNAMSIZ); - ifp = ifunit(ifname); + ifp = ifunit_ref(ifname); break; #endif /* INET6 */ @@ -2951,48 +2991,48 @@ ifioctl(struct socket *so, u_long cmd, caddr_t data, struct proc *p) bcopy(data, ifs, sizeof(*ifs)); ifs->ifs_name[IFNAMSIZ - 1] = '\0'; bcopy(ifs->ifs_name, ifname, IFNAMSIZ); - ifp = ifunit(ifname); + ifp = ifunit_ref(ifname); break; case SIOCGIFMEDIA32: /* struct ifmediareq32 */ case SIOCGIFXMEDIA32: /* struct ifmediareq32 */ bcopy(((struct ifmediareq32 *)(void *)data)->ifm_name, ifname, IFNAMSIZ); - ifp = ifunit(ifname); + ifp = ifunit_ref(ifname); break; case SIOCGIFMEDIA64: /* struct ifmediareq64 */ case SIOCGIFXMEDIA64: /* struct ifmediareq64 */ bcopy(((struct ifmediareq64 *)(void *)data)->ifm_name, ifname, IFNAMSIZ); - ifp = ifunit(ifname); + ifp = ifunit_ref(ifname); break; case SIOCSIFDESC: /* struct if_descreq */ case SIOCGIFDESC: /* struct if_descreq */ bcopy(((struct if_descreq *)(void *)data)->ifdr_name, ifname, IFNAMSIZ); - ifp = ifunit(ifname); + ifp = ifunit_ref(ifname); break; case SIOCSIFLINKPARAMS: /* struct if_linkparamsreq */ case SIOCGIFLINKPARAMS: /* struct if_linkparamsreq */ bcopy(((struct if_linkparamsreq *)(void *)data)->iflpr_name, ifname, IFNAMSIZ); - ifp = ifunit(ifname); + ifp = ifunit_ref(ifname); break; case SIOCGIFQUEUESTATS: /* struct if_qstatsreq */ bcopy(((struct if_qstatsreq *)(void *)data)->ifqr_name, ifname, IFNAMSIZ); - ifp = ifunit(ifname); + ifp = ifunit_ref(ifname); break; case SIOCSIFTHROTTLE: /* struct if_throttlereq */ case SIOCGIFTHROTTLE: /* struct if_throttlereq */ bcopy(((struct if_throttlereq *)(void *)data)->ifthr_name, ifname, IFNAMSIZ); - ifp = ifunit(ifname); + ifp = ifunit_ref(ifname); break; case SIOCAIFAGENTID: /* struct if_agentidreq */ @@ -3001,21 +3041,21 @@ ifioctl(struct socket *so, u_long cmd, caddr_t data, struct proc *p) case SIOCGIFAGENTIDS64: /* struct if_agentidsreq64 */ bcopy(((struct if_agentidreq *)(void *)data)->ifar_name, ifname, IFNAMSIZ); - ifp = ifunit(ifname); + ifp = ifunit_ref(ifname); break; case SIOCSIFNETSIGNATURE: /* struct if_nsreq */ case SIOCGIFNETSIGNATURE: /* struct if_nsreq */ bcopy(((struct if_nsreq *)(void *)data)->ifnsr_name, ifname, IFNAMSIZ); - ifp = ifunit(ifname); + ifp = ifunit_ref(ifname); break; case SIOCGIFPROTOLIST32: /* struct if_protolistreq32 */ case SIOCGIFPROTOLIST64: /* struct if_protolistreq64 */ bcopy(((struct if_protolistreq *)(void *)data)->ifpl_name, ifname, IFNAMSIZ); - ifp = ifunit(ifname); + ifp = ifunit_ref(ifname); break; default: /* @@ -3024,7 +3064,7 @@ ifioctl(struct socket *so, u_long cmd, caddr_t data, struct proc *p) */ bcopy(((struct ifreq *)(void *)data)->ifr_name, ifname, IFNAMSIZ); - ifp = ifunit(ifname); + ifp = ifunit_ref(ifname); break; } dlil_if_unlock(); @@ -3066,8 +3106,8 @@ ifioctl(struct socket *so, u_long cmd, caddr_t data, struct proc *p) bcopy(ifs, data, sizeof(*ifs)); break; - case SIOCGIFMEDIA32: /* struct ifmediareq32 */ - case SIOCGIFMEDIA64: /* struct ifmediareq64 */ + case SIOCGIFMEDIA32: /* struct ifmediareq32 */ + case SIOCGIFMEDIA64: /* struct ifmediareq64 */ case SIOCGIFXMEDIA32: /* struct ifmediareq32 */ case SIOCGIFXMEDIA64: /* struct ifmediareq64 */ error = ifioctl_get_media(ifp, so, cmd, data); @@ -3171,6 +3211,9 @@ done: } } + if (ifp != NULL) { + ifnet_decr_iorefcnt(ifp); + } return error; } @@ -3228,6 +3271,12 @@ ifioctl_ifreq(struct socket *so, u_long cmd, struct ifreq *ifr, struct proc *p) ifnet_lock_done(ifp); break; + case SIOCGIFXFLAGS: + ifnet_lock_shared(ifp); + ifr->ifr_xflags = ifp->if_xflags; + ifnet_lock_done(ifp); + break; + case SIOCGIFCAP: ifnet_lock_shared(ifp); ifr->ifr_reqcap = ifp->if_capabilities; @@ -3495,6 +3544,7 @@ ifioctl_ifreq(struct socket *so, u_long cmd, struct ifreq *ifr, struct proc *p) case SIOCSIFALTMTU: case SIOCSIFVLAN: case SIOCSIFBOND: + case SIOCSIF6LOWPAN: error = proc_suser(p); if (error != 0) { break; @@ -3545,6 +3595,7 @@ ifioctl_ifreq(struct socket *so, u_long cmd, struct ifreq *ifr, struct proc *p) case SIOCGIFDEVMTU: case SIOCGIFVLAN: case SIOCGIFBOND: + case SIOCGIF6LOWPAN: error = ifnet_ioctl(ifp, SOCK_DOM(so), cmd, (caddr_t)ifr); break; @@ -3617,6 +3668,7 @@ ifioctl_ifreq(struct socket *so, u_long cmd, struct ifreq *ifr, struct proc *p) } else { ifp->if_eflags &= ~IFEF_EXPENSIVE; } + ifnet_increment_generation(ifp); ifnet_lock_done(ifp); /* * Update the expensive bit in the delegated interface @@ -3628,10 +3680,57 @@ ifioctl_ifreq(struct socket *so, u_long cmd, struct ifreq *ifr, struct proc *p) if (difp->if_delegated.ifp == ifp) { difp->if_delegated.expensive = ifp->if_eflags & IFEF_EXPENSIVE ? 1 : 0; + ifnet_increment_generation(difp); } ifnet_lock_done(difp); } ifnet_head_done(); + necp_update_all_clients(); + break; + } + + case SIOCGIFCONSTRAINED: + ifnet_lock_shared(ifp); + if (ifp->if_xflags & IFXF_CONSTRAINED) { + ifr->ifr_constrained = 1; + } else { + ifr->ifr_constrained = 0; + } + ifnet_lock_done(ifp); + break; + + case SIOCSIFCONSTRAINED: + { + struct ifnet *difp; + + if ((error = priv_check_cred(kauth_cred_get(), + PRIV_NET_INTERFACE_CONTROL, 0)) != 0) { + return error; + } + ifnet_lock_exclusive(ifp); + if (ifr->ifr_constrained) { + ifp->if_xflags |= IFXF_CONSTRAINED; + } else { + ifp->if_xflags &= ~IFXF_CONSTRAINED; + } + ifnet_increment_generation(ifp); + ifnet_lock_done(ifp); + /* + * Update the constrained bit in the delegated interface + * structure. + */ + ifnet_head_lock_shared(); + TAILQ_FOREACH(difp, &ifnet_head, if_link) { + ifnet_lock_exclusive(difp); + if (difp->if_delegated.ifp == ifp) { + difp->if_delegated.constrained = + ifp->if_xflags & IFXF_CONSTRAINED ? 1 : 0; + ifnet_increment_generation(difp); + } + ifnet_lock_done(difp); + } + ifnet_head_done(); + necp_update_all_clients(); break; } @@ -3794,6 +3893,7 @@ ifioctl_ifreq(struct socket *so, u_long cmd, struct ifreq *ifr, struct proc *p) error = EINVAL; } break; + case SIOCSIFTIMESTAMPENABLE: case SIOCSIFTIMESTAMPDISABLE: error = proc_suser(p); @@ -3875,6 +3975,15 @@ ifioctl_ifreq(struct socket *so, u_long cmd, struct ifreq *ifr, struct proc *p) error = EINVAL; #endif /* (DEBUG || DEVELOPMENT) */ break; + + case SIOCSIFSUBFAMILY: + if ((error = priv_check_cred(kauth_cred_get(), + PRIV_NET_INTERFACE_CONTROL, 0)) != 0) { + return error; + } + error = ifnet_ioctl(ifp, SOCK_DOM(so), cmd, (caddr_t)ifr); + break; + case SIOCSIFLOWINTERNET: if ((error = priv_check_cred(kauth_cred_get(), PRIV_NET_INTERFACE_CONTROL, 0)) != 0) { @@ -3918,6 +4027,41 @@ ifioctl_ifreq(struct socket *so, u_long cmd, struct ifreq *ifr, struct proc *p) error = EOPNOTSUPP; #endif /* DEVELOPMENT || DEBUG */ break; + + case SIOCGIFMPKLOG: + ifr->ifr_mpk_log = !!(ifp->if_xflags & IFXF_MPK_LOG); + break; + case SIOCSIFMPKLOG: + if (ifr->ifr_mpk_log) { + ifp->if_xflags |= IFXF_MPK_LOG; + } else { + ifp->if_xflags &= ~IFXF_MPK_LOG; + } + break; + case SIOCGIFNOACKPRIO: + ifnet_lock_shared(ifp); + if (ifp->if_eflags & IFEF_NOACKPRI) { + ifr->ifr_noack_prio = 1; + } else { + ifr->ifr_noack_prio = 0; + } + ifnet_lock_done(ifp); + break; + + case SIOCSIFNOACKPRIO: + if ((error = priv_check_cred(kauth_cred_get(), + PRIV_NET_INTERFACE_CONTROL, 0)) != 0) { + return error; + } + ifnet_lock_exclusive(ifp); + if (ifr->ifr_noack_prio) { + ifp->if_eflags |= IFEF_NOACKPRI; + } else { + ifp->if_eflags &= ~IFEF_NOACKPRI; + } + ifnet_lock_done(ifp); + break; + default: VERIFY(0); /* NOTREACHED */ @@ -5216,15 +5360,18 @@ if_copy_rxpoll_stats(struct ifnet *ifp, struct if_rxpoll_stats *if_rs) if (!(ifp->if_eflags & IFEF_RXPOLL) || !ifnet_is_attached(ifp, 1)) { return; } - - /* by now, ifnet will stay attached so if_inp must be valid */ - VERIFY(ifp->if_inp != NULL); - bcopy(&ifp->if_inp->pstats, if_rs, sizeof(*if_rs)); - + bcopy(&ifp->if_poll_pstats, if_rs, sizeof(*if_rs)); /* Release the IO refcnt */ ifnet_decr_iorefcnt(ifp); } +void +if_copy_netif_stats(struct ifnet *ifp, struct if_netif_stats *if_ns) +{ + bzero(if_ns, sizeof(*if_ns)); +#pragma unused(ifp) +} + struct ifaddr * ifa_remref(struct ifaddr *ifa, int locked) { @@ -5544,6 +5691,8 @@ ifioctl_cassert(void) case SIOCSIFDISABLEOUTPUT: + case SIOCSIFSUBFAMILY: + case SIOCGIFAGENTLIST32: case SIOCGIFAGENTLIST64: @@ -5560,8 +5709,22 @@ ifioctl_cassert(void) case SIOCGIFPROTOLIST32: case SIOCGIFPROTOLIST64: + case SIOCSIF6LOWPAN: + case SIOCGIF6LOWPAN: + case SIOCGIFLOWPOWER: case SIOCSIFLOWPOWER: + + case SIOCGIFMPKLOG: + case SIOCSIFMPKLOG: + + case SIOCGIFCONSTRAINED: + case SIOCSIFCONSTRAINED: + + case SIOCGIFXFLAGS: + + case SIOCGIFNOACKPRIO: + case SIOCSIFNOACKPRIO: ; } } @@ -5617,3 +5780,25 @@ intf_event_enqueue_nwk_wq_entry(struct ifnet *ifp, struct sockaddr *addrp, p_intf_ev->nwk_wqe.arg = &p_intf_ev->intf_ev_arg; nwk_wq_enqueue((struct nwk_wq_entry*)p_intf_ev); } + +int +if_get_tcp_kao_max(struct ifnet *ifp) +{ + int error = 0; + + if (ifp->if_tcp_kao_max == 0) { + struct ifreq ifr; + + memset(&ifr, 0, sizeof(struct ifreq)); + error = ifnet_ioctl(ifp, 0, SIOCGIFTCPKAOMAX, &ifr); + + ifnet_lock_exclusive(ifp); + if (error == 0) { + ifp->if_tcp_kao_max = ifr.ifr_tcp_kao_max; + } else if (error == EOPNOTSUPP) { + ifp->if_tcp_kao_max = default_tcp_kao_max; + } + ifnet_lock_done(ifp); + } + return error; +} diff --git a/bsd/net/if.h b/bsd/net/if.h index 4189a82cc..0e516a42e 100644 --- a/bsd/net/if.h +++ b/bsd/net/if.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2018 Apple Inc. All rights reserved. + * Copyright (c) 2000-2019 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -129,7 +129,7 @@ struct if_clonereq32 { #define IFEF_ENQUEUE_MULTI 0x00000002 /* enqueue multiple packets at once */ #define IFEF_DELAY_START 0x00000004 /* delay start callback */ #define IFEF_PROBE_CONNECTIVITY 0x00000008 /* Probe connections going over this interface */ -#define IFEF_QOSMARKING_CAPABLE 0x00000010 /* XXX Obsolete, to be removed */ +#define IFEF_ADV_REPORT 0x00000010 /* Supports interface advisory report */ #define IFEF_IPV6_DISABLED 0x00000020 /* coupled to ND6_IFF_IFDISABLED */ #define IFEF_ACCEPT_RTADV 0x00000040 /* accepts IPv6 RA on the interface */ #define IFEF_TXSTART 0x00000080 /* has start callback */ @@ -178,12 +178,14 @@ struct if_clonereq32 { #define IFXF_WAKE_ON_MAGIC_PACKET 0x00000001 /* wake on magic packet */ #define IFXF_TIMESTAMP_ENABLED 0x00000002 /* time stamping enabled */ #define IFXF_NX_NOAUTO 0x00000004 /* no auto config nexus */ -#define IFXF_MULTISTACK_BPF_TAP 0x00000008 /* multistack bpf tap */ +#define IFXF_LEGACY 0x00000008 /* legacy (non-netif) mode */ #define IFXF_LOW_INTERNET_UL 0x00000010 /* Uplink Low Internet is confirmed */ #define IFXF_LOW_INTERNET_DL 0x00000020 /* Downlink Low Internet is confirmed */ #define IFXF_ALLOC_KPI 0x00000040 /* Allocated via the ifnet_alloc KPI */ #define IFXF_LOW_POWER 0x00000080 /* Low Power Mode */ - +#define IFXF_MPK_LOG 0x00000100 /* Multi-layer Packet Logging */ +#define IFXF_CONSTRAINED 0x00000200 /* Constrained - Save Data Mode */ +#define IFXF_LOW_LATENCY 0x00000400 /* Low latency interface */ /* * Current requirements for an AWDL interface. Setting/clearing IFEF_AWDL * will also trigger the setting/clearing of the rest of the flags. Once @@ -433,6 +435,7 @@ struct ifreq { uint32_t ifo_inuse; } ifru_opportunistic; u_int64_t ifru_eflags; + u_int64_t ifru_xflags; struct { int32_t ifl_level; uint32_t ifl_flags; @@ -466,6 +469,9 @@ struct ifreq { #define IFRTYPE_FAMILY_FIREWIRE 13 #define IFRTYPE_FAMILY_BOND 14 #define IFRTYPE_FAMILY_CELLULAR 15 +#define IFRTYPE_FAMILY_6LOWPAN 16 +#define IFRTYPE_FAMILY_UTUN 17 +#define IFRTYPE_FAMILY_IPSEC 18 uint32_t ift_subfamily; #define IFRTYPE_SUBFAMILY_ANY 0 #define IFRTYPE_SUBFAMILY_USB 1 @@ -474,19 +480,23 @@ struct ifreq { #define IFRTYPE_SUBFAMILY_THUNDERBOLT 4 #define IFRTYPE_SUBFAMILY_RESERVED 5 #define IFRTYPE_SUBFAMILY_INTCOPROC 6 +#define IFRTYPE_SUBFAMILY_QUICKRELAY 7 +#define IFRTYPE_SUBFAMILY_DEFAULT 8 } ifru_type; #endif /* PRIVATE */ u_int32_t ifru_functional_type; -#define IFRTYPE_FUNCTIONAL_UNKNOWN 0 -#define IFRTYPE_FUNCTIONAL_LOOPBACK 1 -#define IFRTYPE_FUNCTIONAL_WIRED 2 -#define IFRTYPE_FUNCTIONAL_WIFI_INFRA 3 -#define IFRTYPE_FUNCTIONAL_WIFI_AWDL 4 -#define IFRTYPE_FUNCTIONAL_CELLULAR 5 -#define IFRTYPE_FUNCTIONAL_INTCOPROC 6 -#define IFRTYPE_FUNCTIONAL_LAST 6 +#define IFRTYPE_FUNCTIONAL_UNKNOWN 0 +#define IFRTYPE_FUNCTIONAL_LOOPBACK 1 +#define IFRTYPE_FUNCTIONAL_WIRED 2 +#define IFRTYPE_FUNCTIONAL_WIFI_INFRA 3 +#define IFRTYPE_FUNCTIONAL_WIFI_AWDL 4 +#define IFRTYPE_FUNCTIONAL_CELLULAR 5 +#define IFRTYPE_FUNCTIONAL_INTCOPROC 6 +#define IFRTYPE_FUNCTIONAL_COMPANIONLINK 7 +#define IFRTYPE_FUNCTIONAL_LAST 7 #ifdef PRIVATE u_int32_t ifru_expensive; + u_int32_t ifru_constrained; u_int32_t ifru_2kcl; struct { u_int32_t qlen; @@ -500,7 +510,8 @@ struct ifreq { #define IFRTYPE_ECN_DISABLE 2 u_int32_t ifru_qosmarking_mode; #define IFRTYPE_QOSMARKING_MODE_NONE 0 -#define IFRTYPE_QOSMARKING_FASTLANE 1 +#define IFRTYPE_QOSMARKING_FASTLANE 1 /* supported: socket/channel */ +#define IFRTYPE_QOSMARKING_RFC4594 2 /* supported: channel only */ u_int32_t ifru_qosmarking_enabled; u_int32_t ifru_disable_output; u_int32_t ifru_low_internet; @@ -508,6 +519,9 @@ struct ifreq { #define IFRTYPE_LOW_INTERNET_ENABLE_UL 0x0001 #define IFRTYPE_LOW_INTERNET_ENABLE_DL 0x0002 int ifru_low_power_mode; + u_int32_t ifru_tcp_kao_max; + int ifru_mpk_log; /* Multi Layer Packet Log */ + u_int32_t ifru_noack_prio; #endif /* PRIVATE */ } ifr_ifru; #define ifr_addr ifr_ifru.ifru_addr /* address */ @@ -540,9 +554,11 @@ struct ifreq { #ifdef PRIVATE #define ifr_opportunistic ifr_ifru.ifru_opportunistic #define ifr_eflags ifr_ifru.ifru_eflags /* extended flags */ +#define ifr_xflags ifr_ifru.ifru_xflags /* extra flags */ #define ifr_log ifr_ifru.ifru_log /* logging level/flags */ #define ifr_delegated ifr_ifru.ifru_delegated /* delegated interface index */ #define ifr_expensive ifr_ifru.ifru_expensive +#define ifr_constrained ifr_ifru.ifru_constrained #define ifr_type ifr_ifru.ifru_type /* interface type */ #define ifr_functional_type ifr_ifru.ifru_functional_type #define ifr_2kcl ifr_ifru.ifru_2kcl @@ -558,6 +574,9 @@ struct ifreq { #define ifr_disable_output ifr_ifru.ifru_disable_output #define ifr_low_internet ifr_ifru.ifru_low_internet #define ifr_low_power_mode ifr_ifru.ifru_low_power_mode +#define ifr_tcp_kao_max ifr_ifru.ifru_tcp_kao_max +#define ifr_mpk_log ifr_ifru.ifru_mpk_log +#define ifr_noack_prio ifr_ifru.ifru_noack_prio #endif /* PRIVATE */ }; @@ -794,6 +813,8 @@ struct if_linkparamsreq { struct if_bandwidths iflpr_input_bw; struct if_latencies iflpr_output_lt; struct if_latencies iflpr_input_lt; + struct if_netem_params iflpr_input_netem; + struct if_netem_params iflpr_output_netem; }; /* @@ -912,7 +933,7 @@ struct if_nexusreq { char ifnr_name[IFNAMSIZ]; /* interface name */ uint64_t ifnr_flags; /* unused, must be zero */ uuid_t ifnr_netif; /* netif nexus instance UUID */ - uuid_t ifnr_multistack; /* multistack nexus UUID */ + uuid_t ifnr_flowswitch; /* flowswitch nexus UUID */ uint64_t ifnr_reserved[5]; }; diff --git a/bsd/net/if_6lowpan.c b/bsd/net/if_6lowpan.c new file mode 100644 index 000000000..92eebb0fa --- /dev/null +++ b/bsd/net/if_6lowpan.c @@ -0,0 +1,1095 @@ +/* + * Copyright (c) 2017-2019 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ +/* + * Copyright 1998 Massachusetts Institute of Technology + * + * Permission to use, copy, modify, and distribute this software and + * its documentation for any purpose and without fee is hereby + * granted, provided that both the above copyright notice and this + * permission notice appear in all copies, that both the above + * copyright notice and this permission notice appear in all + * supporting documentation, and that the name of M.I.T. not be used + * in advertising or publicity pertaining to distribution of the + * software without specific, written prior permission. M.I.T. makes + * no representations about the suitability of this software for any + * purpose. It is provided "as is" without express or implied + * warranty. + * + * THIS SOFTWARE IS PROVIDED BY M.I.T. ``AS IS''. M.I.T. DISCLAIMS + * ALL EXPRESS OR IMPLIED WARRANTIES WITH REGARD TO THIS SOFTWARE, + * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. IN NO EVENT + * SHALL M.I.T. BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF + * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT + * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + */ +/* + * if_6lowpan.c - pseudo-device driver for IEEE 802.15.4 . + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include + +#include + +#ifdef INET +#include +#include +#endif + +#include +#include +#include + +#define SIXLOWPANNAME "6lowpan" + +struct ifnet *p_6lowpan_ifnet = NULL; + +extern errno_t nd6_lookup_ipv6(ifnet_t interface, + const struct sockaddr_in6 *ip6_dest, struct sockaddr_dl *ll_dest, + size_t ll_dest_len, route_t hint, mbuf_t packet); + + +typedef int (bpf_callback_func)(struct ifnet *, struct mbuf *); +typedef int (if_set_bpf_tap_func)(struct ifnet *ifp, int mode, bpf_callback_func * func); + +static __inline__ lck_grp_t * +my_lck_grp_alloc_init(const char * grp_name) +{ + lck_grp_t * grp; + lck_grp_attr_t * grp_attrs; + + grp_attrs = lck_grp_attr_alloc_init(); + grp = lck_grp_alloc_init(grp_name, grp_attrs); + lck_grp_attr_free(grp_attrs); + return grp; +} + +static __inline__ lck_mtx_t * +my_lck_mtx_alloc_init(lck_grp_t * lck_grp) +{ + lck_attr_t * lck_attrs; + lck_mtx_t * lck_mtx; + + lck_attrs = lck_attr_alloc_init(); + lck_mtx = lck_mtx_alloc_init(lck_grp, lck_attrs); + lck_attr_free(lck_attrs); + return lck_mtx; +} + +static lck_mtx_t *sixlowpan_lck_mtx; + +static __inline__ void +sixlowpan_lock_init(void) +{ + lck_grp_t *lck_grp; + + lck_grp = my_lck_grp_alloc_init("if_6lowpan"); + sixlowpan_lck_mtx = my_lck_mtx_alloc_init(lck_grp); +} + +static __inline__ void +sixlowpan_assert_lock_held(void) +{ + lck_mtx_assert(sixlowpan_lck_mtx, LCK_MTX_ASSERT_OWNED); + return; +} + +#ifdef __UNUSED__ +static __inline__ void +sixlowpan_assert_lock_not_held(void) +{ + lck_mtx_assert(sixlowpan_lck_mtx, LCK_MTX_ASSERT_NOTOWNED); + return; +} +#endif + +static __inline__ void +sixlowpan_lock(void) +{ + lck_mtx_lock(sixlowpan_lck_mtx); + return; +} + +static __inline__ void +sixlowpan_unlock(void) +{ + lck_mtx_unlock(sixlowpan_lck_mtx); + return; +} + +struct if6lpan; +LIST_HEAD(if6lpan_list, if6lpan); + +typedef LIST_ENTRY(if6lpan) +if6lpan_entry; + +#define IF6LPAN_SIGNATURE 0x6666face +struct if6lpan { + if6lpan_entry if6lpan_list; + char if6lpan_name[IFNAMSIZ]; /* our unique id */ + char if6lpan_addr[IEEE802154_ADDR_LEN]; /* our LL address */ + struct ifnet * if6lpan_ifp; /* our interface */ + struct ifnet * if6lpan_pifp; /* parent interface */ +#define IF6LPANF_DETACHING 0x1 /* interface is detaching */ +#define IF6LPANF_READY 0x2 /* interface is ready */ + u_int32_t if6lpan_flags; + bpf_packet_func if6lpan_bpf_input; + bpf_packet_func if6lpan_bpf_output; + int32_t if6lpan_retain_count; + u_int32_t if6lpan_signature; /* IF6LPAN_SIGNATURE */ + u_int8_t if6lpan_ieee802154_seq; +}; + +typedef struct if6lpan * if6lpan_ref; + +static __inline__ int +if6lpan_flags_ready(if6lpan_ref ifl) +{ + return (ifl->if6lpan_flags & IF6LPANF_READY) != 0; +} + +static __inline__ void +if6lpan_flags_set_ready(if6lpan_ref ifl) +{ + ifl->if6lpan_flags |= IF6LPANF_READY; + return; +} + +static __inline__ void +if6lpan_set_addr(if6lpan_ref ifl, caddr_t ether_addr) +{ + ifl->if6lpan_addr[0] = 0x66; + ifl->if6lpan_addr[1] = 0x66; + bcopy(ether_addr, &ifl->if6lpan_addr[2], ETHER_ADDR_LEN); + return; +} + +#ifdef __UNUSED__ +static __inline__ u_int8_t* +if6lpan_get_addr(if6lpan_ref ifl) +{ + return ifl->ifl6lpan_addr; +} +#endif + +static __inline__ int +if6lpan_flags_detaching(if6lpan_ref ifl) +{ + return (ifl->if6lpan_flags & IF6LPANF_DETACHING) != 0; +} + +static __inline__ void +if6lpan_flags_set_detaching(if6lpan_ref ifl) +{ + ifl->if6lpan_flags |= IF6LPANF_DETACHING; + return; +} + +static int sixlowpan_clone_create(struct if_clone *, u_int32_t, void *); +static int sixlowpan_clone_destroy(struct ifnet *); +static int sixlowpan_input(ifnet_t ifp, protocol_family_t protocol, + mbuf_t m, char *frame_header); +static int sixlowpan_output(struct ifnet *ifp, struct mbuf *m); +static int sixlowpan_ioctl(ifnet_t ifp, u_long cmd, void *addr); +static int sixlowpan_set_bpf_tap(ifnet_t ifp, bpf_tap_mode mode, + bpf_packet_func func); +static int sixlowpan_attach_protocol(struct ifnet *ifp); +static int sixlowpan_detach_protocol(struct ifnet *ifp); +static int sixlowpan_unconfig(if6lpan_ref ifl); +static int sixlowpan_config(struct ifnet *ifp, struct ifnet *p); +static void sixlowpan_if_free(struct ifnet *ifp); +static int sixlowpan_remove(if6lpan_ref ifl); +static int sixlowpan_framer_extended(struct ifnet *ifp, struct mbuf **m, + const struct sockaddr *ndest, const char *edst, + const char *ether_type, u_int32_t *prepend_len, u_int32_t *postpend_len); + +#define SIXLOWPAN_MAXUNIT IF_MAXUNIT +#define SIXLOWPAN_ZONE_MAX_ELEM MIN(IFNETS_MAX, SIXLOWPAN_MAXUNIT) + +static struct if_clone sixlowpan_cloner = IF_CLONE_INITIALIZER(SIXLOWPANNAME, + sixlowpan_clone_create, + sixlowpan_clone_destroy, + 0, + SIXLOWPAN_MAXUNIT, + SIXLOWPAN_ZONE_MAX_ELEM, + sizeof(struct if6lpan)); + +/** +** if6lpan_ref routines +**/ +static void +if6lpan_retain(if6lpan_ref ifl) +{ + if (ifl->if6lpan_signature != IF6LPAN_SIGNATURE) { + panic("if6lpan_retain: bad signature\n"); + } + if (ifl->if6lpan_retain_count == 0) { + panic("if6lpan_retain: retain count is 0\n"); + } + OSIncrementAtomic(&ifl->if6lpan_retain_count); +} + +static void +if6lpan_release(if6lpan_ref ifl) +{ + u_int32_t old_retain_count; + + if (ifl->if6lpan_signature != IF6LPAN_SIGNATURE) { + panic("if6lpan_release: bad signature\n"); + } + old_retain_count = OSDecrementAtomic(&ifl->if6lpan_retain_count); + switch (old_retain_count) { + case 0: + panic("if6lpan_release: retain count is 0\n"); + break; + case 1: + ifl->if6lpan_signature = 0; + if_clone_softc_deallocate(&sixlowpan_cloner, ifl); + break; + default: + break; + } + return; +} + +static if6lpan_ref +ifnet_get_if6lpan(struct ifnet * ifp) +{ + if6lpan_ref ifl; + + ifl = (if6lpan_ref)ifnet_softc(ifp); + return ifl; +} + +static if6lpan_ref +ifnet_get_if6lpan_retained(struct ifnet * ifp) +{ + if6lpan_ref ifl; + + ifl = ifnet_get_if6lpan(ifp); + if (ifl == NULL) { + return NULL; + } + if (if6lpan_flags_detaching(ifl)) { + return NULL; + } + if6lpan_retain(ifl); + return ifl; +} + +static int +sixlowpan_clone_attach(void) +{ + int error; + + error = if_clone_attach(&sixlowpan_cloner); + if (error != 0) { + return error; + } + sixlowpan_lock_init(); + return 0; +} + +static int +sixlowpan_demux( + __unused ifnet_t ifp, + __unused mbuf_t m, + __unused char *frame_header, + protocol_family_t *protocol_family) +{ + *protocol_family = PF_INET6; + return 0; +} + +static errno_t +sixlowpan_add_proto(__unused ifnet_t interface, protocol_family_t protocol, + __unused const struct ifnet_demux_desc *demux_array, + __unused u_int32_t demux_count) +{ + if (protocol == PF_INET6) { + return 0; + } + return ENOPROTOOPT; +} + +static errno_t +sixlowpan_del_proto(__unused ifnet_t interface, __unused protocol_family_t protocol) +{ + return 0; +} + +static int +sixlowpan_clone_create(struct if_clone *ifc, u_int32_t unit, __unused void *params) +{ + int error; + if6lpan_ref ifl; + ifnet_t ifp; + struct ifnet_init_eparams if_epraram; + + ifl = if_clone_softc_allocate(&sixlowpan_cloner); + if (ifl == NULL) { + return ENOBUFS; + } + ifl->if6lpan_retain_count = 1; + ifl->if6lpan_signature = IF6LPAN_SIGNATURE; + + /* use the interface name as the unique id for ifp recycle */ + if ((unsigned int) + snprintf(ifl->if6lpan_name, sizeof(ifl->if6lpan_name), "%s%d", + ifc->ifc_name, unit) >= sizeof(ifl->if6lpan_name)) { + if6lpan_release(ifl); + return EINVAL; + } + + bzero(&if_epraram, sizeof(if_epraram)); + if_epraram.ver = IFNET_INIT_CURRENT_VERSION; + if_epraram.len = sizeof(if_epraram); + if_epraram.flags = IFNET_INIT_LEGACY; + if_epraram.uniqueid = ifl->if6lpan_name; + if_epraram.uniqueid_len = strlen(ifl->if6lpan_name); + if_epraram.name = ifc->ifc_name; + if_epraram.unit = unit; + if_epraram.family = IFNET_FAMILY_6LOWPAN; + if_epraram.type = IFT_6LOWPAN; + if_epraram.output = sixlowpan_output; + if_epraram.demux = sixlowpan_demux; + if_epraram.add_proto = sixlowpan_add_proto; + if_epraram.del_proto = sixlowpan_del_proto; + if_epraram.framer_extended = sixlowpan_framer_extended; + if_epraram.softc = ifl; + if_epraram.ioctl = sixlowpan_ioctl; + if_epraram.set_bpf_tap = sixlowpan_set_bpf_tap; + if_epraram.detach = sixlowpan_if_free; + error = ifnet_allocate_extended(&if_epraram, &ifp); + + if (error) { + if6lpan_release(ifl); + return error; + } + + ifnet_set_offload(ifp, 0); + ifnet_set_addrlen(ifp, IEEE802154_ADDR_LEN); + ifnet_set_baudrate(ifp, 0); + // TODO: ifnet_set_hdrlen(ifp, IEEE802154_ENCAP_LEN); + + error = ifnet_attach(ifp, NULL); + if (error) { + ifnet_release(ifp); + if6lpan_release(ifl); + return error; + } + ifl->if6lpan_ifp = ifp; + + p_6lowpan_ifnet = ifp; + /* TODO: attach as IEEE 802.15.4 with no FCS */ + bpfattach(ifp, DLT_IEEE802_15_4_NOFCS, IEEE802154_ENCAP_LEN); + return 0; +} + +static int +sixlowpan_remove(if6lpan_ref ifl) +{ + sixlowpan_assert_lock_held(); + if (if6lpan_flags_detaching(ifl)) { + return 0; + } + if6lpan_flags_set_detaching(ifl); + sixlowpan_unconfig(ifl); + return 1; +} + + +static int +sixlowpan_clone_destroy(struct ifnet *ifp) +{ + if6lpan_ref ifl; + + sixlowpan_lock(); + ifl = ifnet_get_if6lpan_retained(ifp); + if (ifl == NULL) { + sixlowpan_unlock(); + return 0; + } + if (sixlowpan_remove(ifl) == 0) { + sixlowpan_unlock(); + if6lpan_release(ifl); + return 0; + } + sixlowpan_unlock(); + if6lpan_release(ifl); + ifnet_detach(ifp); + p_6lowpan_ifnet = NULL; + return 0; +} + +static int +sixlowpan_set_bpf_tap(ifnet_t ifp, bpf_tap_mode mode, bpf_packet_func func) +{ + if6lpan_ref ifl; + + sixlowpan_lock(); + ifl = ifnet_get_if6lpan_retained(ifp); + if (ifl == NULL) { + sixlowpan_unlock(); + return ENODEV; + } + switch (mode) { + case BPF_TAP_DISABLE: + ifl->if6lpan_bpf_input = ifl->if6lpan_bpf_output = NULL; + break; + + case BPF_TAP_INPUT: + ifl->if6lpan_bpf_input = func; + break; + + case BPF_TAP_OUTPUT: + ifl->if6lpan_bpf_output = func; + break; + + case BPF_TAP_INPUT_OUTPUT: + ifl->if6lpan_bpf_input = ifl->if6lpan_bpf_output = func; + break; + default: + break; + } + sixlowpan_unlock(); + if6lpan_release(ifl); + return 0; +} + +/* + * 6lowpan output routine. + * Header compression on the protocol payload + * Frame the compressed payload in 802.15.4 Data Frame + * Encapsulate the 802.15.4 frame in an Ethernet frame. + */ +static int +sixlowpan_output(struct ifnet * ifp, struct mbuf * m) +{ + struct ifnet *p_intf = NULL; + if6lpan_ref ifl = NULL; + struct flowadv adv = { .code = FADV_SUCCESS }; + int err = 0; + char link_layer_dest[ETHER_ADDR_LEN]; + bpf_packet_func bpf_func; + + u_int16_t ethertype = htons(ETHERTYPE_IEEE802154); + memset(link_layer_dest, 0xff, ETHER_ADDR_LEN); + + if (m == 0) { + return 0; + } + if ((m->m_flags & M_PKTHDR) == 0) { + m_freem_list(m); + return 0; + } + + sixlowpan_lock(); + ifl = ifnet_get_if6lpan_retained(ifp); + + if (ifl == NULL || if6lpan_flags_ready(ifl) == 0) { + goto unlock_done; + } + + /* XXX parent interface equivalent? */ + p_intf = ifl->if6lpan_pifp; + bpf_func = ifl->if6lpan_bpf_output; + + sixlowpan_unlock(); + if6lpan_release(ifl); + + (void)ifnet_stat_increment_out(ifp, 1, m->m_pkthdr.len, 0); + + /* + * We added a 2 byte length before the 802.15.4 data frame + * We can play just with the length of the first mbuf in the + * chain because bpf_tap_imp() disregards the packet length + * of the mbuf packet header. + */ + if (bpf_func && (mbuf_setdata(m, m->m_data + 2, m->m_len - 2) == 0)) { + bpf_func(ifp, m); + mbuf_setdata(m, m->m_data - 2, m->m_len + 2); + } + + /* Append ethernet header */ + if ((err = ether_frameout_extended(p_intf, &m, NULL, + link_layer_dest, (const char *)ðertype, + NULL, NULL))) { + return err; + } + + err = dlil_output(p_intf, PF_802154, m, NULL, NULL, 1, &adv); + + if (err == 0) { + if (adv.code == FADV_FLOW_CONTROLLED) { + err = EQFULL; + } else if (adv.code == FADV_SUSPENDED) { + err = EQSUSPENDED; + } + } + return err; + +unlock_done: + sixlowpan_unlock(); + if (ifl != NULL) { + if6lpan_release(ifl); + } + m_freem(m); + return err; +} + +/* + * 6lowpan input routine. + * Decapsulate the 802.15.4 Data Frame + * Header decompression on the payload + * Pass the mbuf to the IPV6 protocol stack using proto_input() + */ +static int +sixlowpan_input(ifnet_t p, __unused protocol_family_t protocol, + mbuf_t m, __unused char *frame_header) +{ + frame802154_t ieee02154hdr; + u_int8_t *payload = NULL; + if6lpan_ref ifl = NULL; + bpf_packet_func bpf_func; + mbuf_t mc, m_temp; + int off, err = 0; + u_int16_t len; + + /* Allocate an mbuf cluster for the 802.15.4 frame and uncompressed payload */ + mc = m_getcl(M_WAITOK, MT_DATA, M_PKTHDR); + if (mc == NULL) { + err = -1; + goto err_out; + } + + memcpy(&len, mtod(m, u_int8_t *), sizeof(u_int16_t)); + len = ntohs(len); + m_adj(m, sizeof(u_int16_t)); + /* Copy the compressed 802.15.4 payload from source mbuf to allocated cluster mbuf */ + for (m_temp = m, off = 0; m_temp != NULL; m_temp = m_temp->m_next) { + if (m_temp->m_len > 0) { + m_copyback(mc, off, m_temp->m_len, mtod(m_temp, void *)); + off += m_temp->m_len; + } + } + + p = p_6lowpan_ifnet; + mc->m_pkthdr.rcvif = p; + + sixlowpan_lock(); + ifl = ifnet_get_if6lpan_retained(p); + + if (ifl == NULL) { + sixlowpan_unlock(); + err = -1; + goto err_out; + } + + if (if6lpan_flags_ready(ifl) == 0) { + if6lpan_release(ifl); + sixlowpan_unlock(); + err = -1; + goto err_out; + } + + bpf_func = ifl->if6lpan_bpf_input; + sixlowpan_unlock(); + if6lpan_release(ifl); + + if (bpf_func) { + bpf_func(p, mc); + } + + /* Parse the 802.15.4 frame header */ + bzero(&ieee02154hdr, sizeof(ieee02154hdr)); + frame802154_parse(mtod(mc, uint8_t *), len, &ieee02154hdr, &payload); + + /* XXX Add check for your link layer address being dest */ + sixxlowpan_input(&ieee02154hdr, payload); + + if (mbuf_setdata(mc, payload, ieee02154hdr.payload_len)) { + err = -1; + goto err_out; + } + mbuf_pkthdr_setlen(mc, ieee02154hdr.payload_len); + + /* Post decompression */ + if (proto_input(PF_INET6, mc) != 0) { + ifnet_stat_increment_in(p, 0, 0, 1); + err = -1; + goto err_out; + } else { + ifnet_stat_increment_in(p, 1, mc->m_pkthdr.len, 0); + } + +err_out: + if (err && mc) { + m_freem(mc); + } + if (!err) { + m_freem(m); + } + return err; +} + +#define SIXLOWPAN_IFMTU 1280 + +static int +sixlowpan_config(struct ifnet *ifp, struct ifnet *p) +{ + if6lpan_ref ifl; + u_int16_t parent_flags; + sixlowpan_lock(); + ifl = ifnet_get_if6lpan_retained(ifp); + if (ifl == NULL || ifl->if6lpan_pifp != NULL) { + sixlowpan_unlock(); + if (ifl != NULL) { + if6lpan_release(ifl); + } + return EBUSY; + } + sixlowpan_attach_protocol(p); + + /* set our LL address derived from that of the parent */ + if6lpan_set_addr(ifl, IF_LLADDR(p)); + ifnet_set_lladdr_and_type(ifp, ifl->if6lpan_addr, IEEE802154_ADDR_LEN, IFT_6LOWPAN); + + ifl->if6lpan_pifp = p; + ifl->if6lpan_flags = 0; + ifnet_set_mtu(ifp, SIXLOWPAN_IFMTU); + parent_flags = ifnet_flags(p) & (IFF_BROADCAST | IFF_MULTICAST | IFF_SIMPLEX); + ifnet_set_flags(ifp, parent_flags, IFF_BROADCAST | IFF_MULTICAST | IFF_SIMPLEX); + ifnet_set_flags(ifp, IFF_RUNNING, IFF_RUNNING); + ifnet_set_eflags(ifp, IFEF_NOAUTOIPV6LL, IFEF_NOAUTOIPV6LL); + if6lpan_flags_set_ready(ifl); + if6lpan_release(ifl); + sixlowpan_unlock(); + return 0; +} + +static int +sixlowpan_unconfig(if6lpan_ref ifl) +{ + struct ifnet *ifp = ifl->if6lpan_ifp; + + sixlowpan_assert_lock_held(); + /* Clear our MAC address. */ + ifnet_set_lladdr_and_type(ifp, NULL, 0, IFT_6LOWPAN); + sixlowpan_detach_protocol(ifl->if6lpan_pifp); + ifnet_set_mtu(ifp, 0); + ifnet_set_flags(ifp, 0, + IFF_BROADCAST | IFF_MULTICAST | IFF_SIMPLEX | IFF_RUNNING); + ifnet_set_eflags(ifp, 0, IFEF_NOAUTOIPV6LL); + ifl->if6lpan_flags = 0; + + return 0; +} + +static int +sixlowpan_ioctl(ifnet_t ifp, u_long cmd, void * data) +{ + int error = 0; + struct ifreq * ifr = NULL; + struct ifnet * p = NULL; + struct sixlowpanreq req = {}; + user_addr_t user_addr = 0; + if6lpan_ref ifl = NULL; + + if (ifnet_type(ifp) != IFT_6LOWPAN) { + return EOPNOTSUPP; + } + ifr = (struct ifreq *)data; + + switch (cmd) { + case SIOCSIFADDR: + ifnet_set_flags(ifp, IFF_UP, IFF_UP); + break; + + case SIOCSIF6LOWPAN: + user_addr = proc_is64bit(current_proc()) + ? ifr->ifr_data64 : CAST_USER_ADDR_T(ifr->ifr_data); + error = copyin(user_addr, &req, sizeof(req)); + req.parent[IFNAMSIZ - 1] = '\0'; + if (error) { + break; + } + if (req.parent[0] != '\0') { + p = ifunit(req.parent); + if (p == NULL) { + error = ENXIO; + break; + } + if (ifnet_type(p) != IFT_ETHER + && ifnet_type(p) != IFT_IEEE8023ADLAG) { + error = EPROTONOSUPPORT; + break; + } + error = sixlowpan_config(ifp, p); + if (error) { + break; + } + } + break; + + case SIOCGIF6LOWPAN: + bzero(&req, sizeof req); + sixlowpan_lock(); + ifl = (if6lpan_ref)ifnet_softc(ifp); + if (ifl == NULL || if6lpan_flags_detaching(ifl)) { + sixlowpan_unlock(); + return ifl == NULL ? EOPNOTSUPP : EBUSY; + } + p = ifl->if6lpan_pifp; + sixlowpan_unlock(); + if (p != NULL) { + snprintf(req.parent, sizeof(req.parent), + "%s%d", ifnet_name(p), ifnet_unit(p)); + } + user_addr = proc_is64bit(current_proc()) + ? ifr->ifr_data64 : CAST_USER_ADDR_T(ifr->ifr_data); + error = copyout(&req, user_addr, sizeof(req)); + break; + +#ifdef SIOCSIFMTU /* xxx */ + case SIOCGIFMTU: + break; + + case SIOCSIFMTU: + ifnet_set_mtu(ifp, ifr->ifr_mtu); + break; +#endif /* SIOCSIFMTU */ + + default: + error = EOPNOTSUPP; + } + return error; +} + +static void +sixlowpan_if_free(struct ifnet * ifp) +{ + if6lpan_ref ifl; + + if (ifp == NULL) { + return; + } + ifl = (if6lpan_ref)ifnet_softc(ifp); + if (ifl == NULL) { + return; + } + if6lpan_release(ifl); + ifnet_release(ifp); + return; +} + +static errno_t +sixlowpan_detached(ifnet_t p, __unused protocol_family_t protocol) +{ + if (ifnet_is_attached(p, 0) == 0) { + // TODO: Find ifp from the parent p + // sixlowpan_if_free(ifp); + } + return 0; +} + +/* + * Function: sixlowpan_attach_protocol + * Purpose: + * Attach a DLIL protocol to the interface + * The ethernet demux actually special cases 802.15.4. + * The demux here isn't used. The demux will return PF_802154 for the + * appropriate packets and our sixlowpan_input function will be called. + */ +static int +sixlowpan_attach_protocol(struct ifnet *ifp) +{ + int error; + struct ifnet_attach_proto_param reg; + + bzero(®, sizeof(reg)); + reg.input = sixlowpan_input; + reg.detached = sixlowpan_detached; + error = ifnet_attach_protocol(ifp, PF_802154, ®); + if (error) { + printf("%s(%s%d) ifnet_attach_protocol failed, %d\n", + __func__, ifnet_name(ifp), ifnet_unit(ifp), error); + } + return error; +} + +/* + * Function: sixlowpan_detach_protocol + * Purpose: + * Detach our DLIL protocol from an interface + */ +static int +sixlowpan_detach_protocol(struct ifnet *ifp) +{ + int error; + + error = ifnet_detach_protocol(ifp, PF_802154); + if (error) { + printf("(%s%d) ifnet_detach_protocol failed, %d\n", + ifnet_name(ifp), ifnet_unit(ifp), error); + } + + return error; +} + +static errno_t +sixlowpan_proto_pre_output(ifnet_t ifp, + __unused protocol_family_t protocol_family, + mbuf_t *m0, + const struct sockaddr *dest, + void *route, + char *type, + char *ll_dest) +{ +#pragma unused(protocol_family) + errno_t result = 0; + struct sockaddr_dl sdl; + struct sockaddr_in6 *dest6 = (struct sockaddr_in6 *)(uintptr_t)(size_t)dest; + + if (!IN6_IS_ADDR_MULTICAST(&dest6->sin6_addr)) { + result = nd6_lookup_ipv6(ifp, dest6, &sdl, sizeof(sdl), route, *m0); + if (result == 0) { + bcopy(LLADDR(&sdl), ll_dest, sdl.sdl_alen); + } + } else { + /* map multicast address */ + ll_dest[0] = (dest6->sin6_addr.s6_addr8[14] & 0x1f) | 0x80; + ll_dest[1] = dest6->sin6_addr.s6_addr8[15]; + } + + /* + * XXX This should be generic to the underlying hardware type + */ + if (result == 0) { + u_int16_t ethertype = htons(ETHERTYPE_IEEE802154); + bcopy(ðertype, type, sizeof(ethertype)); + } + + return result; +} + +static int +sixlowpan_framer_extended(struct ifnet *ifp, struct mbuf **m, + const struct sockaddr *ndest, const char *edst, + const char *ether_type, u_int32_t *prepend_len, u_int32_t *postpend_len) +{ +#pragma unused(ndest) +#pragma unused(ether_type) + char buf[IEEE802154_ENCAP_LEN] = {0}; + int buflen = 0, err = 0; + frame802154_t ieee02154hdr; + if6lpan_ref ifl = NULL; + u_int8_t *payload = NULL; + struct mbuf *mc = NULL; + u_int16_t len; + struct sockaddr_in6 *dest6 = (struct sockaddr_in6 *)(uintptr_t)(size_t)ndest; + + /* Initialize 802.15.4 frame header */ + bzero(&ieee02154hdr, sizeof(ieee02154hdr)); + if (!IN6_IS_ADDR_MULTICAST(&dest6->sin6_addr)) { + bcopy(edst, ieee02154hdr.dest_addr, sizeof(ieee02154hdr.dest_addr)); + ieee02154hdr.fcf.dest_addr_mode = FRAME802154_LONGADDRMODE; + } else { + bcopy(edst, ieee02154hdr.dest_addr, 2); + ieee02154hdr.fcf.dest_addr_mode = FRAME802154_SHORTADDRMODE; + } + + /* Allocate a contiguous buffer for IPv6 header & payload */ + /* + * XXX As of now either we compress or we don't compress at all + * adding another byte of dispatch to communicate that there's no + * compression. + * + * Allocate for the worst case. + */ + payload = _MALLOC(m_pktlen(*m) + 1, M_TEMP, M_WAITOK | M_ZERO); + if (payload == NULL) { + err = -1; + goto err_out; + } + + /* Copy the IPv6 header & payload */ + if (mbuf_copydata(*m, 0, m_pktlen(*m), payload)) { + err = -1; + goto err_out; + } + + /* Allocate an mbuf cluster for the 802.15.4 frame and compressed payload */ + mc = m_getcl(M_WAITOK, MT_DATA, M_PKTHDR); + if (mc == NULL) { + err = -1; + goto err_out; + } + + sixlowpan_lock(); + ifl = ifnet_get_if6lpan_retained(ifp); + if (ifl == NULL || if6lpan_flags_ready(ifl) == 0) { + if (ifl != NULL) { + if6lpan_release(ifl); + } + sixlowpan_unlock(); + err = -1; + goto err_out; + } + bcopy(ifl->if6lpan_addr, ieee02154hdr.src_addr, sizeof(ieee02154hdr.src_addr)); + ieee02154hdr.seq = ifl->if6lpan_ieee802154_seq++; /**< Sequence number */ + if6lpan_release(ifl); + sixlowpan_unlock(); + + /* Initialize frame control field */ + ieee02154hdr.fcf.frame_type = FRAME802154_DATAFRAME; /**< 3 bit. Frame type field, see 802.15.4 */ + ieee02154hdr.fcf.security_enabled = 0; /**< 1 bit. True if security is used in this frame */ + ieee02154hdr.fcf.frame_pending = 0; /**< 1 bit. True if sender has more data to send */ + ieee02154hdr.fcf.ack_required = 0; /**< 1 bit. Is an ack frame required? */ + ieee02154hdr.fcf.panid_compression = 0; /**< 1 bit. Is this a compressed header? */ + ieee02154hdr.fcf.frame_version = FRAME802154_IEEE802154_2006; /**< 2 bit. 802.15.4 frame version */ + ieee02154hdr.fcf.src_addr_mode = FRAME802154_LONGADDRMODE; /**< 2 bit. Source address mode, see 802.15.4 */ + ieee02154hdr.dest_pid = IEEE802154_PANID; /**< Destination PAN ID */ + ieee02154hdr.src_pid = IEEE802154_PANID; /**< Source PAN ID */ + ieee02154hdr.payload_len = m_pktlen(*m); /**< Length of payload field */ + + /* Create an 802.15.4 Data header frame */ + buflen = frame802154_create(&ieee02154hdr, (uint8_t *)buf); + + /* Perform inline compression of the IPv6 hdr & payload */ + sixxlowpan_output(&ieee02154hdr, payload); + + /* + * Add 2 bytes at the front of the frame indicating the total payload + * length + */ + len = htons(buflen + ieee02154hdr.payload_len); + m_copyback(mc, 0, sizeof(len), &len); + /* Copy back the 802.15.4 Data frame header into mbuf */ + m_copyback(mc, sizeof(len), buflen, buf); + /* Copy back the compressed payload into mbuf */ + m_copyback(mc, buflen + sizeof(len), ieee02154hdr.payload_len, payload); + + if (prepend_len != NULL) { + *prepend_len = buflen; + } + if (postpend_len != NULL) { + *postpend_len = 0; + } + +err_out: + if (payload != NULL) { + _FREE(payload, M_TEMP); + } + m_freem(*m); + *m = mc; + return err; +} + + +static errno_t +sixlowpan_attach_inet6(struct ifnet *ifp, protocol_family_t protocol_family) +{ + struct ifnet_attach_proto_param proto; + errno_t error; + + bzero(&proto, sizeof(proto)); + proto.pre_output = sixlowpan_proto_pre_output; + + error = ifnet_attach_protocol(ifp, protocol_family, &proto); + if (error && error != EEXIST) { + printf("WARNING: %s can't attach ipv6 to %s\n", __func__, + if_name(ifp)); + } + return error; +} + +static void +sixlowpan_detach_inet6(struct ifnet *ifp, protocol_family_t protocol_family) +{ + (void) ifnet_detach_protocol(ifp, protocol_family); +} + +#if INET6 +__private_extern__ int +sixlowpan_family_init(void) +{ + int error = 0; + + error = proto_register_plumber(PF_INET6, IFNET_FAMILY_6LOWPAN, + sixlowpan_attach_inet6, sixlowpan_detach_inet6); + if (error != 0) { + printf("6lowpan: proto_register_plumber failed for AF_INET6 error=%d\n", + error); + goto done; + } + + error = sixlowpan_clone_attach(); + if (error != 0) { + printf("6lowpan: proto_register_plumber failed sixlowpan_clone_attach error=%d\n", + error); + goto done; + } + + +done: + return error; +} +#endif diff --git a/bsd/net/if_6lowpan_var.h b/bsd/net/if_6lowpan_var.h new file mode 100644 index 000000000..494db0573 --- /dev/null +++ b/bsd/net/if_6lowpan_var.h @@ -0,0 +1,46 @@ +/* + * Copyright (c) 2017 Apple Computer, Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#ifndef _NET_IF_6LOWPAN_VAR_H_ +#define _NET_IF_6LOWPAN_VAR_H_ 1 + +#define IEEE802154_ADDR_LEN 8 +#define IEEE802154_ENCAP_LEN 25 /* len of 802.15.4 Frame header */ +#define IEEE802154_FRAME_LEN 127 + +/* + * Configuration structure for SIOCSET6LOWPAN and SIOCGET6LOWPAN ioctls. + */ +struct sixlowpanreq { + char parent[IFNAMSIZ]; +}; + +#ifdef KERNEL_PRIVATE +int sixlowpan_family_init(void); +#endif /* KERNEL_PRIVATE */ +#endif /* _NET_IF_6LOWPAN_VAR_H_ */ diff --git a/bsd/net/if_bond.c b/bsd/net/if_bond.c index 116148967..754923927 100644 --- a/bsd/net/if_bond.c +++ b/bsd/net/if_bond.c @@ -53,6 +53,7 @@ #include #include #include +#include #include #include #include @@ -65,7 +66,7 @@ #include #include #include - +#include #include #include #include @@ -79,8 +80,16 @@ #include #include +SYSCTL_DECL(_net_link); +SYSCTL_NODE(_net_link, OID_AUTO, bond, CTLFLAG_RW | CTLFLAG_LOCKED, 0, + "Bond interface"); + +static int if_bond_debug = 0; +SYSCTL_INT(_net_link_bond, OID_AUTO, debug, CTLFLAG_RW | CTLFLAG_LOCKED, + &if_bond_debug, 0, "Bond interface debug logs"); + static struct ether_addr slow_proto_multicast = { - IEEE8023AD_SLOW_PROTO_MULTICAST + .octet = IEEE8023AD_SLOW_PROTO_MULTICAST }; typedef struct ifbond_s ifbond, * ifbond_ref; @@ -269,6 +278,13 @@ enum { typedef u_char MuxState; +#define PORT_CONTROL_FLAGS_IN_LIST 0x01 +#define PORT_CONTROL_FLAGS_PROTO_ATTACHED 0x02 +#define PORT_CONTROL_FLAGS_FILTER_ATTACHED 0x04 +#define PORT_CONTROL_FLAGS_LLADDR_SET 0x08 +#define PORT_CONTROL_FLAGS_MTU_SET 0x10 +#define PORT_CONTROL_FLAGS_PROMISCUOUS_SET 0x20 + struct bondport_s { TAILQ_ENTRY(bondport_s) po_port_list; ifbond_ref po_bond; @@ -278,6 +294,8 @@ struct bondport_s { int po_enabled; char po_name[IFNAMSIZ]; struct ifdevmtu po_devmtu; + uint32_t po_control_flags; + interface_filter_t po_filter; /* LACP */ TAILQ_ENTRY(bondport_s) po_lag_port_list; @@ -296,6 +314,7 @@ struct bondport_s { SelectedState po_selected; int32_t po_last_transmit_secs; struct media_info po_media_info; + uint64_t po_force_link_event_time; LAG_ref po_lag; }; @@ -462,7 +481,6 @@ typedef struct bond_globals_s { struct ifbond_list ifbond_list; lacp_system system; lacp_system_priority system_priority; - int verbose; } * bond_globals_ref; static bond_globals_ref g_bond; @@ -566,6 +584,8 @@ LAG_get_aggregatable_port_count(LAG_ref lag, int * active_media); static int ifbond_selection(ifbond_ref bond); +static void +bond_handle_event(struct ifnet * port_ifp, int event_code); /** ** bondport @@ -621,19 +641,20 @@ bondport_collecting(bondport_ref p) **/ static int bond_clone_create(struct if_clone *, u_int32_t, void *); static int bond_clone_destroy(struct ifnet *); -static int bond_input(ifnet_t ifp, protocol_family_t protocol, mbuf_t m, - char *frame_header); static int bond_output(struct ifnet *ifp, struct mbuf *m); static int bond_ioctl(struct ifnet *ifp, u_long cmd, void * addr); static int bond_set_bpf_tap(struct ifnet * ifp, bpf_tap_mode mode, bpf_packet_func func); static int bond_attach_protocol(struct ifnet *ifp); static int bond_detach_protocol(struct ifnet *ifp); +static errno_t bond_iff_input(void *cookie, ifnet_t ifp, + protocol_family_t protocol, mbuf_t *data, char **frame_ptr); +static int bond_attach_filter(struct ifnet *ifp, interface_filter_t * filter_p); static int bond_setmulti(struct ifnet *ifp); static int bond_add_interface(struct ifnet * ifp, struct ifnet * port_ifp); static int bond_remove_interface(ifbond_ref ifb, struct ifnet * port_ifp); static void bond_if_free(struct ifnet * ifp); -static void interface_link_event(struct ifnet * ifp, u_int32_t event_code); +static void interface_link_event(struct ifnet * ifp, u_int32_t event_code); static struct if_clone bond_cloner = IF_CLONE_INITIALIZER(BONDNAME, bond_clone_create, @@ -687,11 +708,11 @@ ifbond_release(ifbond_ref ifb) return; } - if (g_bond->verbose) { + if (if_bond_debug) { printf("ifbond_release(%s)\n", ifb->ifb_name); } if (ifb->ifb_ifma_slow_proto != NULL) { - if (g_bond->verbose) { + if (if_bond_debug) { printf("ifbond_release(%s) removing multicast\n", ifb->ifb_name); } @@ -732,7 +753,7 @@ ifbond_wait(ifbond_ref ifb, const char * msg) /* other add/remove in progress */ while (ifbond_flags_change_in_progress(ifb)) { - if (g_bond->verbose) { + if (if_bond_debug) { printf("%s: %s msleep\n", ifb->ifb_name, msg); } waited = 1; @@ -740,7 +761,7 @@ ifbond_wait(ifbond_ref ifb, const char * msg) } /* prevent other bond list remove/add from taking place */ ifbond_flags_set_change_in_progress(ifb); - if (g_bond->verbose && waited) { + if (if_bond_debug && waited) { printf("%s: %s woke up\n", ifb->ifb_name, msg); } return; @@ -761,7 +782,7 @@ ifbond_signal(ifbond_ref ifb, const char * msg) { ifbond_flags_clear_change_in_progress(ifb); wakeup((caddr_t)ifb); - if (g_bond->verbose) { + if (if_bond_debug) { printf("%s: %s wakeup\n", ifb->ifb_name, msg); } return; @@ -775,6 +796,10 @@ static int link_speed(int active) { switch (IFM_SUBTYPE(active)) { + case IFM_AUTO: + case IFM_MANUAL: + case IFM_NONE: + return 0; case IFM_10_T: case IFM_10_2: case IFM_10_5: @@ -795,7 +820,7 @@ link_speed(int active) case IFM_1000_KX: return 1000; case IFM_HPNA_1: - return 0; + return 1; default: /* assume that new defined types are going to be at least 10GigE */ case IFM_10G_SR: @@ -851,6 +876,30 @@ media_full_duplex(const struct media_info * mi) return (mi->mi_active & IFM_FDX) != 0; } +static __inline__ int +media_type_unknown(const struct media_info * mi) +{ + int unknown; + + switch (IFM_SUBTYPE(mi->mi_active)) { + case IFM_AUTO: + case IFM_MANUAL: + case IFM_NONE: + unknown = 1; + break; + default: + unknown = 0; + break; + } + return unknown; +} + +static __inline__ int +media_ok(const struct media_info * mi) +{ + return media_full_duplex(mi) || media_type_unknown(mi); +} + static __inline__ int media_speed(const struct media_info * mi) { @@ -1067,7 +1116,7 @@ bond_setmulti(struct ifnet * ifp) } bond_lock(); signal_done: - ifbond_signal(ifb, "bond_setmulti"); + ifbond_signal(ifb, __func__); bond_unlock(); ifbond_release(ifb); return result; @@ -1171,7 +1220,6 @@ bond_clone_create(struct if_clone * ifc, u_int32_t unit, __unused void *params) ifnet_set_offload(ifp, 0); ifnet_set_addrlen(ifp, ETHER_ADDR_LEN); /* XXX ethernet specific */ ifnet_set_flags(ifp, IFF_BROADCAST | IFF_MULTICAST | IFF_SIMPLEX, 0xffff); - ifnet_set_baudrate(ifp, 0); ifnet_set_mtu(ifp, ETHERMTU); error = ifnet_attach(ifp, NULL); @@ -1473,7 +1521,7 @@ bond_output(struct ifnet * ifp, struct mbuf * m) ifbond_ref ifb; struct ifnet * port_ifp = NULL; int err; - struct flowadv adv = { FADV_SUCCESS }; + struct flowadv adv = { .code = FADV_SUCCESS }; if (m == 0) { return 0; @@ -1571,6 +1619,7 @@ bond_receive_lacpdu(struct mbuf * m, struct ifnet * port_ifp) struct ifnet * bond_ifp = NULL; ifbond_ref ifb; int event_code = 0; + bool need_link_update = false; bondport_ref p; bond_lock(); @@ -1588,6 +1637,24 @@ bond_receive_lacpdu(struct mbuf * m, struct ifnet * port_ifp) if (ifb->ifb_mode != IF_BOND_MODE_LACP) { goto done; } + /* + * Work-around for rdar://problem/51372042 + * Sometimes, the link comes up but the driver doesn't report the + * negotiated medium at that time. When we receive an LACPDU packet, + * and the medium is unknown, force a link status check. Don't force + * the link status check more often than _FORCE_LINK_EVENT_INTERVAL + * seconds. + */ +#define _FORCE_LINK_EVENT_INTERVAL 1 + if (media_type_unknown(&p->po_media_info)) { + uint64_t now = net_uptime(); + + if ((now - p->po_force_link_event_time) >= + _FORCE_LINK_EVENT_INTERVAL) { + need_link_update = true; + p->po_force_link_event_time = now; + } + } bondport_receive_lacpdu(p, (lacpdu_ref)m->m_data); if (ifbond_selection(ifb)) { event_code = (ifb->ifb_active_lag == NULL) @@ -1601,7 +1668,7 @@ bond_receive_lacpdu(struct mbuf * m, struct ifnet * port_ifp) ? KEV_DL_LINK_OFF : KEV_DL_LINK_ON; if (event_code != ifb->ifb_last_link_event) { - if (g_bond->verbose) { + if (if_bond_debug) { timestamp_printf("%s: (receive) generating LINK event\n", ifb->ifb_name); } @@ -1616,6 +1683,12 @@ done: interface_link_event(bond_ifp, event_code); } m_freem(m); + if (need_link_update) { + if (if_bond_debug != 0) { + printf("bond: simulating link status changed event"); + } + bond_handle_event(port_ifp, KEV_DL_LINK_ON); + } return; } @@ -1651,9 +1724,8 @@ failed: return; } -static int -bond_input(ifnet_t port_ifp, __unused protocol_family_t protocol, mbuf_t m, - char * frame_header) +static void +bond_input(ifnet_t port_ifp, mbuf_t m, char *frame_header) { bpf_packet_func bpf_func; const struct ether_header * eh_p; @@ -1671,17 +1743,17 @@ bond_input(ifnet_t port_ifp, __unused protocol_family_t protocol, mbuf_t m, if (subtype == IEEE8023AD_SLOW_PROTO_SUBTYPE_LACP) { if (m->m_pkthdr.len < (int)offsetof(lacpdu, la_reserved)) { m_freem(m); - return 0; + return; } /* send to lacp */ if (m->m_len < (int)offsetof(lacpdu, la_reserved)) { m = m_pullup(m, offsetof(lacpdu, la_reserved)); if (m == NULL) { - return 0; + return; } } bond_receive_lacpdu(m, port_ifp); - return 0; + return; } else if (subtype == IEEE8023AD_SLOW_PROTO_SUBTYPE_LA_MARKER_PROTOCOL) { int min_size; @@ -1692,23 +1764,23 @@ bond_input(ifnet_t port_ifp, __unused protocol_family_t protocol, mbuf_t m, min_size = ETHER_HDR_LEN + offsetof(la_marker_pdu, lm_reserved); if (m->m_pkthdr.len < min_size) { m_freem(m); - return 0; + return; } /* send to lacp */ if (m->m_len < min_size) { m = m_pullup(m, min_size); if (m == NULL) { - return 0; + return; } } /* send to marker responder */ bond_receive_la_marker_pdu(m, port_ifp); - return 0; + return; } else if (subtype == 0 || subtype > IEEE8023AD_SLOW_PROTO_SUBTYPE_RESERVED_END) { /* invalid subtype, discard the frame */ m_freem(m); - return 0; + return; } } bond_lock(); @@ -1720,12 +1792,19 @@ bond_input(ifnet_t port_ifp, __unused protocol_family_t protocol, mbuf_t m, goto done; } - /* make the packet appear as if it arrived on the bonded interface */ ifb = p->po_bond; ifp = ifb->ifb_ifp; bpf_func = ifb->ifb_bpf_input; bond_unlock(); + /* + * Need to clear the promiscous flags otherwise it will be + * dropped by DLIL after processing filters + */ + if ((mbuf_flags(m) & MBUF_PROMISC)) { + mbuf_setflags_mask(m, 0, MBUF_PROMISC); + } + if (m->m_pkthdr.csum_flags & CSUM_VLAN_TAG_VALID) { (void)ifnet_stat_increment_in(ifp, 1, (m->m_pkthdr.len + ETHER_HDR_LEN @@ -1734,16 +1813,31 @@ bond_input(ifnet_t port_ifp, __unused protocol_family_t protocol, mbuf_t m, (void)ifnet_stat_increment_in(ifp, 1, (m->m_pkthdr.len + ETHER_HDR_LEN), 0); } + + /* make the packet appear as if it arrived on the bonded interface */ m->m_pkthdr.rcvif = ifp; bond_bpf_input(ifp, m, eh_p, bpf_func); m->m_pkthdr.pkt_hdr = frame_header; dlil_input_packet_list(ifp, m); - return 0; + return; done: bond_unlock(); m_freem(m); - return 0; + return; +} + +static errno_t +bond_iff_input(void *cookie, ifnet_t port_ifp, protocol_family_t protocol, + mbuf_t *data, char **frame_header_ptr) +{ +#pragma unused(cookie) +#pragma unused(protocol) + mbuf_t m = *data; + char * frame_header = *frame_header_ptr; + + bond_input(port_ifp, m, frame_header); + return EJUSTRETURN; } static __inline__ const char * @@ -1807,7 +1901,7 @@ bondport_timer_process_func(devtimer_ref timer, ? KEV_DL_LINK_OFF : KEV_DL_LINK_ON; if (event_code != p->po_bond->ifb_last_link_event) { - if (g_bond->verbose) { + if (if_bond_debug) { timestamp_printf("%s: (timer) generating LINK event\n", p->po_bond->ifb_name); } @@ -1952,11 +2046,6 @@ bondport_free(bondport_ref p) return; } -#define BOND_ADD_PROGRESS_IN_LIST 0x1 -#define BOND_ADD_PROGRESS_PROTO_ATTACHED 0x2 -#define BOND_ADD_PROGRESS_LLADDR_SET 0x4 -#define BOND_ADD_PROGRESS_MTU_SET 0x8 - static __inline__ int bond_device_mtu(struct ifnet * ifp, ifbond_ref ifb) { @@ -1967,15 +2056,16 @@ bond_device_mtu(struct ifnet * ifp, ifbond_ref ifb) static int bond_add_interface(struct ifnet * ifp, struct ifnet * port_ifp) { + uint32_t control_flags = 0; int devmtu; int error = 0; int event_code = 0; + interface_filter_t filter = NULL; int first = FALSE; ifbond_ref ifb; bondport_ref * new_array = NULL; bondport_ref * old_array = NULL; bondport_ref p; - int progress = 0; if (IFNET_IS_INTCOPROC(port_ifp)) { return EINVAL; @@ -2009,7 +2099,7 @@ bond_add_interface(struct ifnet * ifp, struct ifnet * port_ifp) ifbond_retain(ifb); /* wait for other add or remove to complete */ - ifbond_wait(ifb, "bond_add_interface"); + ifbond_wait(ifb, __func__); if (ifbond_flags_if_detaching(ifb)) { /* someone destroyed the bond while we were waiting */ @@ -2050,8 +2140,9 @@ bond_add_interface(struct ifnet * ifp, struct ifnet * port_ifp) ifnet_offload_t offload; offload = ifp_offload & port_ifp_offload; - printf("bond_add_interface(%s, %s) " + printf("%s(%s, %s) " "hwassist values don't match 0x%x != 0x%x, using 0x%x instead\n", + __func__, ifb->ifb_name, bondport_get_name(p), ifp_offload, port_ifp_offload, offload); /* @@ -2080,7 +2171,7 @@ bond_add_interface(struct ifnet * ifp, struct ifnet * port_ifp) IFT_ETHER); } - progress |= BOND_ADD_PROGRESS_IN_LIST; + control_flags |= PORT_CONTROL_FLAGS_IN_LIST; /* allocate a larger distributing array */ new_array = (bondport_ref *) @@ -2095,24 +2186,32 @@ bond_add_interface(struct ifnet * ifp, struct ifnet * port_ifp) if (error) { goto failed; } - progress |= BOND_ADD_PROGRESS_PROTO_ATTACHED; + control_flags |= PORT_CONTROL_FLAGS_PROTO_ATTACHED; + + /* attach our BOND interface filter */ + error = bond_attach_filter(port_ifp, &filter); + if (error != 0) { + goto failed; + } + control_flags |= PORT_CONTROL_FLAGS_FILTER_ATTACHED; /* set the interface MTU */ devmtu = bond_device_mtu(ifp, ifb); error = siocsifmtu(port_ifp, devmtu); if (error != 0) { - printf("bond_add_interface(%s, %s):" + printf("%s(%s, %s):" " SIOCSIFMTU %d failed %d\n", + __func__, ifb->ifb_name, bondport_get_name(p), devmtu, error); goto failed; } - progress |= BOND_ADD_PROGRESS_MTU_SET; + control_flags |= PORT_CONTROL_FLAGS_MTU_SET; /* program the port with our multicast addresses */ error = multicast_list_program(&p->po_multicast, ifp, port_ifp); if (error) { - printf("bond_add_interface(%s, %s):" - " multicast_list_program failed %d\n", + printf("%s(%s, %s): multicast_list_program failed %d\n", + __func__, ifb->ifb_name, bondport_get_name(p), error); goto failed; } @@ -2122,7 +2221,8 @@ bond_add_interface(struct ifnet * ifp, struct ifnet * port_ifp) error = ifnet_ioctl(port_ifp, 0, SIOCSIFFLAGS, NULL); if (error != 0) { - printf("bond_add_interface(%s, %s): SIOCSIFFLAGS failed %d\n", + printf("%s(%s, %s): SIOCSIFFLAGS failed %d\n", + __func__, ifb->ifb_name, bondport_get_name(p), error); goto failed; } @@ -2130,18 +2230,36 @@ bond_add_interface(struct ifnet * ifp, struct ifnet * port_ifp) /* re-program the port's ethernet address */ error = if_siflladdr(port_ifp, (const struct ether_addr *)IF_LLADDR(ifp)); + if (error == 0) { + if (memcmp(IF_LLADDR(ifp), IF_LLADDR(port_ifp), ETHER_ADDR_LEN) + != 0) { + /* it lied, it really doesn't support setting lladdr */ + error = EOPNOTSUPP; + } + } if (error != 0) { /* port doesn't support setting the link address */ - printf("bond_add_interface(%s, %s): if_siflladdr failed %d\n", + printf("%s(%s, %s): if_siflladdr failed %d\n", + __func__, ifb->ifb_name, bondport_get_name(p), error); - goto failed; + error = ifnet_set_promiscuous(port_ifp, 1); + if (error != 0) { + /* port doesn't support setting promiscuous mode */ + printf("%s(%s, %s): set promiscuous failed %d\n", + __func__, + ifb->ifb_name, bondport_get_name(p), error); + goto failed; + } + control_flags |= PORT_CONTROL_FLAGS_PROMISCUOUS_SET; + } else { + control_flags |= PORT_CONTROL_FLAGS_LLADDR_SET; } - progress |= BOND_ADD_PROGRESS_LLADDR_SET; bond_lock(); /* no failures past this point */ p->po_enabled = 1; + p->po_control_flags = control_flags; /* copy the contents of the existing distributing array */ if (ifb->ifb_distributing_count) { @@ -2172,8 +2290,10 @@ bond_add_interface(struct ifnet * ifp, struct ifnet * port_ifp) bondport_disable_distributing(p); } } + p->po_filter = filter; + /* clear the busy state, and wakeup anyone waiting */ - ifbond_signal(ifb, "bond_add_interface"); + ifbond_signal(ifb, __func__); bond_unlock(); if (event_code != 0) { interface_link_event(ifp, event_code); @@ -2194,30 +2314,45 @@ failed: if (new_array != NULL) { FREE(new_array, M_BOND); } - if ((progress & BOND_ADD_PROGRESS_LLADDR_SET) != 0) { + if ((control_flags & PORT_CONTROL_FLAGS_LLADDR_SET) != 0) { int error1; error1 = if_siflladdr(port_ifp, &p->po_saved_addr); if (error1 != 0) { - printf("bond_add_interface(%s, %s): if_siflladdr failed %d\n", + printf("%s(%s, %s): if_siflladdr restore failed %d\n", + __func__, + ifb->ifb_name, bondport_get_name(p), error1); + } + } + if ((control_flags & PORT_CONTROL_FLAGS_PROMISCUOUS_SET) != 0) { + int error1; + + error1 = ifnet_set_promiscuous(port_ifp, 0); + if (error1 != 0) { + printf("%s(%s, %s): promiscous mode disable failed %d\n", + __func__, ifb->ifb_name, bondport_get_name(p), error1); } } - if ((progress & BOND_ADD_PROGRESS_PROTO_ATTACHED) != 0) { + if ((control_flags & PORT_CONTROL_FLAGS_PROTO_ATTACHED) != 0) { (void)bond_detach_protocol(port_ifp); } - if ((progress & BOND_ADD_PROGRESS_MTU_SET) != 0) { + if ((control_flags & PORT_CONTROL_FLAGS_FILTER_ATTACHED) != 0) { + iflt_detach(filter); + } + if ((control_flags & PORT_CONTROL_FLAGS_MTU_SET) != 0) { int error1; error1 = siocsifmtu(port_ifp, p->po_devmtu.ifdm_current); if (error1 != 0) { - printf("bond_add_interface(%s, %s): SIOCSIFMTU %d failed %d\n", + printf("%s(%s, %s): SIOCSIFMTU %d failed %d\n", + __func__, ifb->ifb_name, bondport_get_name(p), p->po_devmtu.ifdm_current, error1); } } bond_lock(); - if ((progress & BOND_ADD_PROGRESS_IN_LIST) != 0) { + if ((control_flags & PORT_CONTROL_FLAGS_IN_LIST) != 0) { TAILQ_REMOVE(&ifb->ifb_port_list, p, po_port_list); ifb->ifb_port_count--; } @@ -2229,7 +2364,7 @@ failed: } signal_done: - ifbond_signal(ifb, "bond_add_interface"); + ifbond_signal(ifb, __func__); bond_unlock(); ifbond_release(ifb); bondport_free(p); @@ -2244,6 +2379,7 @@ bond_remove_interface(ifbond_ref ifb, struct ifnet * port_ifp) int event_code = 0; bondport_ref head_port; struct ifnet * ifp; + interface_filter_t filter; int last = FALSE; int new_link_address = FALSE; bondport_ref p; @@ -2315,7 +2451,7 @@ bond_remove_interface(ifbond_ref ifb, struct ifnet * port_ifp) ifb->ifb_last_link_event = event_code = KEV_DL_LINK_OFF; } } - + filter = p->po_filter; bond_unlock(); if (last) { @@ -2335,11 +2471,17 @@ bond_remove_interface(ifbond_ref ifb, struct ifnet * port_ifp) TAILQ_FOREACH(scan_port, &ifb->ifb_port_list, po_port_list) { scan_ifp = scan_port->po_ifp; + if ((scan_port->po_control_flags & + PORT_CONTROL_FLAGS_LLADDR_SET) == 0) { + /* port doesn't support setting lladdr */ + continue; + } error = if_siflladdr(scan_ifp, (const struct ether_addr *) IF_LLADDR(ifp)); if (error != 0) { - printf("bond_remove_interface(%s, %s): " + printf("%s(%s, %s): " "if_siflladdr (%s) failed %d\n", + __func__, ifb->ifb_name, bondport_get_name(p), bondport_get_name(scan_port), error); } @@ -2347,16 +2489,30 @@ bond_remove_interface(ifbond_ref ifb, struct ifnet * port_ifp) } /* restore the port's ethernet address */ - error = if_siflladdr(port_ifp, &p->po_saved_addr); - if (error != 0) { - printf("bond_remove_interface(%s, %s): if_siflladdr failed %d\n", - ifb->ifb_name, bondport_get_name(p), error); + if ((p->po_control_flags & PORT_CONTROL_FLAGS_LLADDR_SET) != 0) { + error = if_siflladdr(port_ifp, &p->po_saved_addr); + if (error != 0) { + printf("%s(%s, %s): if_siflladdr failed %d\n", + __func__, + ifb->ifb_name, bondport_get_name(p), error); + } + } + + /* disable promiscous mode (if we enabled it) */ + if ((p->po_control_flags & PORT_CONTROL_FLAGS_PROMISCUOUS_SET) != 0) { + error = ifnet_set_promiscuous(port_ifp, 0); + if (error != 0) { + printf("%s(%s, %s): disable promiscuous failed %d\n", + __func__, + ifb->ifb_name, bondport_get_name(p), error); + } } /* restore the port's MTU */ error = siocsifmtu(port_ifp, p->po_devmtu.ifdm_current); if (error != 0) { - printf("bond_remove_interface(%s, %s): SIOCSIFMTU %d failed %d\n", + printf("%s(%s, %s): SIOCSIFMTU %d failed %d\n", + __func__, ifb->ifb_name, bondport_get_name(p), p->po_devmtu.ifdm_current, error); } @@ -2364,6 +2520,11 @@ bond_remove_interface(ifbond_ref ifb, struct ifnet * port_ifp) /* remove the bond "protocol" */ bond_detach_protocol(port_ifp); + /* detach the filter */ + if (filter != NULL) { + iflt_detach(filter); + } + /* generate link event */ if (event_code != 0) { interface_link_event(ifp, event_code); @@ -2376,7 +2537,7 @@ bond_remove_interface(ifbond_ref ifb, struct ifnet * port_ifp) ifbond_release(ifb); signal_done: - ifbond_signal(ifb, "bond_remove_interface"); + ifbond_signal(ifb, __func__); ifbond_release(ifb); return error; } @@ -2474,7 +2635,7 @@ bond_set_mode(struct ifnet * ifp, int mode) ifb->ifb_last_link_event = event_code; signal_done: - ifbond_signal(ifb, "bond_set_mode"); + ifbond_signal(ifb, __func__); bond_unlock(); ifbond_release(ifb); @@ -2665,7 +2826,7 @@ bond_set_mtu(struct ifnet * ifp, int mtu, int isdevmtu) } signal_done: - ifbond_signal(ifb, "bond_set_mtu"); + ifbond_signal(ifb, __func__); ifbond_release(ifb); done: @@ -2816,12 +2977,7 @@ bond_ioctl(struct ifnet *ifp, u_long cmd, void * data) break; case IF_BOND_OP_SET_VERBOSE: bond_lock(); - if (g_bond == NULL) { - bond_unlock(); - error = ENXIO; - break; - } - g_bond->verbose = ibr.ibr_ibru.ibru_int_val; + if_bond_debug = ibr.ibr_ibru.ibru_int_val; bond_unlock(); break; case IF_BOND_OP_SET_MODE: @@ -2920,10 +3076,11 @@ bond_handle_event(struct ifnet * port_ifp, int event_code) ifbond_ref ifb; int old_distributing_count; bondport_ref p; - struct media_info media_info = { 0, 0}; + struct media_info media_info = { .mi_active = 0, .mi_status = 0 }; switch (event_code) { case KEV_DL_IF_DETACHED: + case KEV_DL_IF_DETACHING: break; case KEV_DL_LINK_OFF: case KEV_DL_LINK_ON: @@ -2942,6 +3099,7 @@ bond_handle_event(struct ifnet * port_ifp, int event_code) old_distributing_count = ifb->ifb_distributing_count; switch (event_code) { case KEV_DL_IF_DETACHED: + case KEV_DL_IF_DETACHING: bond_remove_interface(ifb, p->po_ifp); break; case KEV_DL_LINK_OFF: @@ -2966,7 +3124,7 @@ bond_handle_event(struct ifnet * port_ifp, int event_code) ? KEV_DL_LINK_OFF : KEV_DL_LINK_ON; if (event_code != ifb->ifb_last_link_event) { - if (g_bond->verbose) { + if (if_bond_debug) { timestamp_printf("%s: (event) generating LINK event\n", ifb->ifb_name); } @@ -3000,8 +3158,9 @@ bond_handle_event(struct ifnet * port_ifp, int event_code) } static void -bond_event(struct ifnet * port_ifp, __unused protocol_family_t protocol, - const struct kev_msg * event) +bond_iff_event(__unused void *cookie, ifnet_t port_ifp, + __unused protocol_family_t protocol, + const struct kev_msg *event) { int event_code; @@ -3014,7 +3173,8 @@ bond_event(struct ifnet * port_ifp, __unused protocol_family_t protocol, switch (event_code) { case KEV_DL_LINK_OFF: case KEV_DL_LINK_ON: - /* we only care about link status changes */ + case KEV_DL_IF_DETACHING: + case KEV_DL_IF_DETACHED: bond_handle_event(port_ifp, event_code); break; default: @@ -3023,11 +3183,11 @@ bond_event(struct ifnet * port_ifp, __unused protocol_family_t protocol, return; } -static errno_t -bond_detached(ifnet_t port_ifp, __unused protocol_family_t protocol) +static void +bond_iff_detached(__unused void *cookie, ifnet_t port_ifp) { bond_handle_event(port_ifp, KEV_DL_IF_DETACHED); - return 0; + return; } static void @@ -3052,6 +3212,19 @@ interface_link_event(struct ifnet * ifp, u_int32_t event_code) return; } +static errno_t +bond_proto_input(ifnet_t ifp, protocol_family_t protocol, mbuf_t packet, + char *header) +{ +#pragma unused(protocol, packet, header) + if (if_bond_debug != 0) { + printf("%s: unexpected packet from %s\n", __func__, + ifp->if_xname); + } + return 0; +} + + /* * Function: bond_attach_protocol * Purpose: @@ -3069,9 +3242,7 @@ bond_attach_protocol(struct ifnet *ifp) struct ifnet_attach_proto_param reg; bzero(®, sizeof(reg)); - reg.input = bond_input; - reg.event = bond_event; - reg.detached = bond_detached; + reg.input = bond_proto_input; error = ifnet_attach_protocol(ifp, PF_BOND, ®); if (error) { @@ -3099,6 +3270,33 @@ bond_detach_protocol(struct ifnet *ifp) return error; } +/* + * Function: bond_attach_filter + * Purpose: + * Attach our DLIL interface filter. + */ +static int +bond_attach_filter(struct ifnet *ifp, interface_filter_t * filter_p) +{ + int error; + struct iff_filter iff; + + /* + * install an interface filter + */ + memset(&iff, 0, sizeof(struct iff_filter)); + iff.iff_name = "com.apple.kernel.bsd.net.if_bond"; + iff.iff_input = bond_iff_input; + iff.iff_event = bond_iff_event; + iff.iff_detached = bond_iff_detached; + error = iflt_attach_internal(ifp, &iff, filter_p); + if (error != 0) { + printf("%s: iflt_attach_internal failed %d\n", __func__, error); + } + return error; +} + + /* * DLIL interface family functions */ @@ -3178,7 +3376,7 @@ ifbond_list_find_moved_port(bondport_ref rx_port, if (ps->ps_port == lacp_actor_partner_tlv_get_port(atlv) && bcmp(&ps_li->li_system, atlv->lap_system, sizeof(ps_li->li_system)) == 0) { - if (g_bond->verbose) { + if (if_bond_debug) { timestamp_printf("System " EA_FORMAT " Port 0x%x moved from %s to %s\n", EA_LIST(&ps_li->li_system), ps->ps_port, @@ -3219,7 +3417,7 @@ ifbond_selection(ifbond_ref bond) lag_changed = 1; } else if (lag != NULL) { if (lag->lag_active_media != active_media) { - if (g_bond->verbose) { + if (if_bond_debug) { timestamp_printf("LAG PORT SPEED CHANGED from %d to %d\n", link_speed(lag->lag_active_media), link_speed(active_media)); @@ -3484,15 +3682,24 @@ bondport_link_status_changed(bondport_ref p) { ifbond_ref bond = p->po_bond; - if (g_bond->verbose) { + if (if_bond_debug) { if (media_active(&p->po_media_info)) { + const char * duplex_string; + + if (media_full_duplex(&p->po_media_info)) { + duplex_string = "full"; + } else if (media_type_unknown(&p->po_media_info)) { + duplex_string = "unknown"; + } else { + duplex_string = "half"; + } timestamp_printf("[%s] Link UP %d Mbit/s %s duplex\n", bondport_get_name(p), media_speed(&p->po_media_info), - media_full_duplex(&p->po_media_info) - ? "full" : "half"); + duplex_string); } else { - timestamp_printf("[%s] Link DOWN\n", bondport_get_name(p)); + timestamp_printf("[%s] Link DOWN\n", + bondport_get_name(p)); } } if (bond->ifb_mode == IF_BOND_MODE_LACP) { @@ -3501,7 +3708,7 @@ bondport_link_status_changed(bondport_ref p) && p->po_lag == bond->ifb_active_lag && p->po_selected != SelectedState_UNSELECTED) { if (media_speed(&p->po_media_info) != p->po_lag->lag_active_media) { - if (g_bond->verbose) { + if (if_bond_debug) { timestamp_printf("[%s] Port speed %d differs from LAG %d\n", bondport_get_name(p), media_speed(&p->po_media_info), @@ -3538,7 +3745,7 @@ bondport_aggregatable(bondport_ref p) } switch (p->po_receive_state) { default: - if (g_bond->verbose) { + if (if_bond_debug) { timestamp_printf("[%s] Port is not selectable\n", bondport_get_name(p)); } @@ -3581,7 +3788,7 @@ bondport_remove_from_LAG(bondport_ref p) return 0; } TAILQ_REMOVE(&lag->lag_port_list, p, po_lag_port_list); - if (g_bond->verbose) { + if (if_bond_debug) { timestamp_printf("[%s] Removed from LAG (0x%04x," EA_FORMAT ",0x%04x)\n", bondport_get_name(p), @@ -3594,7 +3801,7 @@ bondport_remove_from_LAG(bondport_ref p) if (lag->lag_port_count > 0) { return bond->ifb_active_lag == lag; } - if (g_bond->verbose) { + if (if_bond_debug) { timestamp_printf("Key 0x%04x: LAG Released (%04x," EA_FORMAT ",0x%04x)\n", bond->ifb_key, @@ -3617,7 +3824,7 @@ bondport_add_to_LAG(bondport_ref p, LAG_ref lag) TAILQ_INSERT_TAIL(&lag->lag_port_list, p, po_lag_port_list); p->po_lag = lag; lag->lag_port_count++; - if (g_bond->verbose) { + if (if_bond_debug) { timestamp_printf("[%s] Added to LAG (0x%04x," EA_FORMAT "0x%04x)\n", bondport_get_name(p), lag->lag_info.li_system_priority, @@ -3656,7 +3863,7 @@ bondport_assign_to_LAG(bondport_ref p) lag->lag_selected_port_count = 0; lag->lag_info = p->po_partner_state.ps_lag_info; TAILQ_INSERT_TAIL(&bond->ifb_lag_list, lag, lag_list); - if (g_bond->verbose) { + if (if_bond_debug) { timestamp_printf("Key 0x%04x: LAG Created (0x%04x," EA_FORMAT ",0x%04x)\n", bond->ifb_key, @@ -3699,7 +3906,7 @@ bondport_set_selected(bondport_ref p, SelectedState s) } else if (s == SelectedState_SELECTED) { lag->lag_selected_port_count++; } - if (g_bond->verbose) { + if (if_bond_debug) { timestamp_printf("[%s] SetSelected: %s (was %s)\n", bondport_get_name(p), SelectedStateString(s), @@ -3753,7 +3960,7 @@ bondport_UpdateSelected(bondport_ref p, lacpdu_ref lacpdu_p) || (lacp_actor_partner_state_aggregatable(actor->lap_state) != lacp_actor_partner_state_aggregatable(ps->ps_state))) { bondport_set_selected(p, SelectedState_UNSELECTED); - if (g_bond->verbose) { + if (if_bond_debug) { timestamp_printf("[%s] updateSelected UNSELECTED\n", bondport_get_name(p)); } @@ -3791,7 +3998,7 @@ bondport_RecordPDU(bondport_ref p, lacpdu_ref lacpdu_p) if (lacp_actor_partner_state_active_lacp(ps->ps_state) || (lacp_actor_partner_state_active_lacp(p->po_actor_state) && lacp_actor_partner_state_active_lacp(partner->lap_state))) { - if (g_bond->verbose) { + if (if_bond_debug) { timestamp_printf("[%s] recordPDU: LACP will maintain\n", bondport_get_name(p)); } @@ -3810,7 +4017,7 @@ bondport_RecordPDU(bondport_ref p, lacpdu_ref lacpdu_p) && lacp_actor_partner_state_in_sync(actor->lap_state) && lacp_maintain) { ps->ps_state = lacp_actor_partner_state_set_in_sync(ps->ps_state); - if (g_bond->verbose) { + if (if_bond_debug) { timestamp_printf("[%s] recordPDU: LACP partner in sync\n", bondport_get_name(p)); } @@ -3818,7 +4025,7 @@ bondport_RecordPDU(bondport_ref p, lacpdu_ref lacpdu_p) && lacp_actor_partner_state_in_sync(actor->lap_state) && lacp_maintain) { ps->ps_state = lacp_actor_partner_state_set_in_sync(ps->ps_state); - if (g_bond->verbose) { + if (if_bond_debug) { timestamp_printf("[%s] recordPDU: LACP partner in sync (ind)\n", bondport_get_name(p)); } @@ -3853,7 +4060,7 @@ bondport_UpdateNTT(bondport_ref p, lacpdu_ref lacpdu_p) || (updateNTTBits(partner->lap_state) != updateNTTBits(p->po_actor_state))) { bondport_flags_set_ntt(p); - if (g_bond->verbose) { + if (if_bond_debug) { timestamp_printf("[%s] updateNTT: Need To Transmit\n", bondport_get_name(p)); } @@ -3865,7 +4072,7 @@ static void bondport_AttachMuxToAggregator(bondport_ref p) { if (bondport_flags_mux_attached(p) == 0) { - if (g_bond->verbose) { + if (if_bond_debug) { timestamp_printf("[%s] Attached Mux To Aggregator\n", bondport_get_name(p)); } @@ -3878,7 +4085,7 @@ static void bondport_DetachMuxFromAggregator(bondport_ref p) { if (bondport_flags_mux_attached(p)) { - if (g_bond->verbose) { + if (if_bond_debug) { timestamp_printf("[%s] Detached Mux From Aggregator\n", bondport_get_name(p)); } @@ -3894,7 +4101,7 @@ bondport_enable_distributing(bondport_ref p) ifbond_ref bond = p->po_bond; bond->ifb_distributing_array[bond->ifb_distributing_count++] = p; - if (g_bond->verbose) { + if (if_bond_debug) { timestamp_printf("[%s] Distribution Enabled\n", bondport_get_name(p)); } @@ -3926,7 +4133,7 @@ bondport_disable_distributing(bondport_ref p) } } bond->ifb_distributing_count--; - if (g_bond->verbose) { + if (if_bond_debug) { timestamp_printf("[%s] Distribution Disabled\n", bondport_get_name(p)); } @@ -4027,7 +4234,7 @@ bondport_receive_machine_initialize(bondport_ref p, LAEvent event, switch (event) { case LAEventStart: devtimer_cancel(p->po_current_while_timer); - if (g_bond->verbose) { + if (if_bond_debug) { timestamp_printf("[%s] Receive INITIALIZE\n", bondport_get_name(p)); } @@ -4053,7 +4260,7 @@ bondport_receive_machine_port_disabled(bondport_ref p, LAEvent event, switch (event) { case LAEventStart: devtimer_cancel(p->po_current_while_timer); - if (g_bond->verbose) { + if (if_bond_debug) { timestamp_printf("[%s] Receive PORT_DISABLED\n", bondport_get_name(p)); } @@ -4063,7 +4270,7 @@ bondport_receive_machine_port_disabled(bondport_ref p, LAEvent event, /* FALL THROUGH */ case LAEventMediaChange: if (media_active(&p->po_media_info)) { - if (media_full_duplex(&p->po_media_info)) { + if (media_ok(&p->po_media_info)) { bondport_receive_machine_expired(p, LAEventStart, NULL); } else { bondport_receive_machine_lacp_disabled(p, LAEventStart, NULL); @@ -4071,7 +4278,7 @@ bondport_receive_machine_port_disabled(bondport_ref p, LAEvent event, } else if (p->po_selected == SelectedState_SELECTED) { struct timeval tv; - if (g_bond->verbose) { + if (if_bond_debug) { timestamp_printf("[%s] Receive PORT_DISABLED: " "link timer started\n", bondport_get_name(p)); @@ -4088,7 +4295,7 @@ bondport_receive_machine_port_disabled(bondport_ref p, LAEvent event, break; case LAEventTimeout: if (p->po_selected == SelectedState_SELECTED) { - if (g_bond->verbose) { + if (if_bond_debug) { timestamp_printf("[%s] Receive PORT_DISABLED: " "link timer completed, marking UNSELECTED\n", bondport_get_name(p)); @@ -4115,7 +4322,7 @@ bondport_receive_machine_expired(bondport_ref p, LAEvent event, switch (event) { case LAEventStart: devtimer_cancel(p->po_current_while_timer); - if (g_bond->verbose) { + if (if_bond_debug) { timestamp_printf("[%s] Receive EXPIRED\n", bondport_get_name(p)); } @@ -4152,7 +4359,7 @@ bondport_receive_machine_lacp_disabled(bondport_ref p, LAEvent event, switch (event) { case LAEventStart: devtimer_cancel(p->po_current_while_timer); - if (g_bond->verbose) { + if (if_bond_debug) { timestamp_printf("[%s] Receive LACP_DISABLED\n", bondport_get_name(p)); } @@ -4177,7 +4384,7 @@ bondport_receive_machine_defaulted(bondport_ref p, LAEvent event, switch (event) { case LAEventStart: devtimer_cancel(p->po_current_while_timer); - if (g_bond->verbose) { + if (if_bond_debug) { timestamp_printf("[%s] Receive DEFAULTED\n", bondport_get_name(p)); } @@ -4203,7 +4410,7 @@ bondport_receive_machine_current(bondport_ref p, LAEvent event, switch (event) { case LAEventPacket: devtimer_cancel(p->po_current_while_timer); - if (g_bond->verbose) { + if (if_bond_debug) { timestamp_printf("[%s] Receive CURRENT\n", bondport_get_name(p)); } @@ -4250,7 +4457,7 @@ bondport_periodic_transmit_machine(bondport_ref p, LAEvent event, switch (event) { case LAEventStart: - if (g_bond->verbose) { + if (if_bond_debug) { timestamp_printf("[%s] periodic_transmit Start\n", bondport_get_name(p)); } @@ -4259,7 +4466,7 @@ bondport_periodic_transmit_machine(bondport_ref p, LAEvent event, devtimer_cancel(p->po_periodic_timer); p->po_periodic_interval = 0; if (media_active(&p->po_media_info) == 0 - || media_full_duplex(&p->po_media_info) == 0) { + || media_ok(&p->po_media_info) == 0) { break; } case LAEventPacket: @@ -4280,7 +4487,7 @@ bondport_periodic_transmit_machine(bondport_ref p, LAEvent event, if (p->po_periodic_interval != interval) { if (interval == LACP_FAST_PERIODIC_TIME && p->po_periodic_interval == LACP_SLOW_PERIODIC_TIME) { - if (g_bond->verbose) { + if (if_bond_debug) { timestamp_printf("[%s] periodic_transmit:" " Need To Transmit\n", bondport_get_name(p)); @@ -4294,7 +4501,7 @@ bondport_periodic_transmit_machine(bondport_ref p, LAEvent event, (devtimer_timeout_func) bondport_periodic_transmit_machine, (void *)LAEventTimeout, NULL); - if (g_bond->verbose) { + if (if_bond_debug) { timestamp_printf("[%s] Periodic Transmission Timer: %d secs\n", bondport_get_name(p), p->po_periodic_interval); @@ -4308,7 +4515,7 @@ bondport_periodic_transmit_machine(bondport_ref p, LAEvent event, devtimer_set_relative(p->po_periodic_timer, tv, (devtimer_timeout_func) bondport_periodic_transmit_machine, (void *)LAEventTimeout, NULL); - if (g_bond->verbose > 1) { + if (if_bond_debug > 1) { timestamp_printf("[%s] Periodic Transmission Timer: %d secs\n", bondport_get_name(p), p->po_periodic_interval); } @@ -4346,7 +4553,7 @@ bondport_transmit_machine(bondport_ref p, LAEvent event, { lacp_actor_partner_tlv_ref aptlv; lacp_collector_tlv_ref ctlv; - struct timeval next_tick_time = {0, 0}; + struct timeval next_tick_time = {.tv_sec = 0, .tv_usec = 0}; lacpdu_ref out_lacpdu_p; packet_buffer_ref pkt; partner_state_ref ps; @@ -4363,7 +4570,7 @@ bondport_transmit_machine(bondport_ref p, LAEvent event, } else if (bondport_can_transmit(p, devtimer_current_secs(), &next_tick_time.tv_sec) == 0) { if (devtimer_enabled(p->po_transmit_timer)) { - if (g_bond->verbose > 0) { + if (if_bond_debug > 0) { timestamp_printf("[%s] Transmit Timer Already Set\n", bondport_get_name(p)); } @@ -4372,7 +4579,7 @@ bondport_transmit_machine(bondport_ref p, LAEvent event, (devtimer_timeout_func) bondport_transmit_machine, (void *)LAEventTimeout, NULL); - if (g_bond->verbose > 0) { + if (if_bond_debug > 0) { timestamp_printf("[%s] Transmit Timer Deadline %d secs\n", bondport_get_name(p), (int)next_tick_time.tv_sec); @@ -4380,7 +4587,7 @@ bondport_transmit_machine(bondport_ref p, LAEvent event, } break; } - if (g_bond->verbose > 0) { + if (if_bond_debug > 0) { if (event == LAEventTimeout) { timestamp_printf("[%s] Transmit Timer Complete\n", bondport_get_name(p)); @@ -4430,7 +4637,7 @@ bondport_transmit_machine(bondport_ref p, LAEvent event, bondport_slow_proto_transmit(p, pkt); bondport_flags_clear_ntt(p); - if (g_bond->verbose > 0) { + if (if_bond_debug > 0) { timestamp_printf("[%s] Transmit Packet %d\n", bondport_get_name(p), p->po_n_transmit); } @@ -4493,7 +4700,7 @@ bondport_mux_machine_detached(bondport_ref p, LAEvent event, switch (event) { case LAEventStart: devtimer_cancel(p->po_wait_while_timer); - if (g_bond->verbose) { + if (if_bond_debug) { timestamp_printf("[%s] Mux DETACHED\n", bondport_get_name(p)); } @@ -4531,7 +4738,7 @@ bondport_mux_machine_waiting(bondport_ref p, LAEvent event, switch (event) { case LAEventStart: devtimer_cancel(p->po_wait_while_timer); - if (g_bond->verbose) { + if (if_bond_debug) { timestamp_printf("[%s] Mux WAITING\n", bondport_get_name(p)); } @@ -4546,21 +4753,21 @@ bondport_mux_machine_waiting(bondport_ref p, LAEvent event, if (p->po_selected == SelectedState_STANDBY) { devtimer_cancel(p->po_wait_while_timer); /* wait until state changes to SELECTED */ - if (g_bond->verbose) { + if (if_bond_debug) { timestamp_printf("[%s] Mux WAITING: Standby\n", bondport_get_name(p)); } break; } if (bondport_flags_ready(p)) { - if (g_bond->verbose) { + if (if_bond_debug) { timestamp_printf("[%s] Mux WAITING: Port is already ready\n", bondport_get_name(p)); } break; } if (devtimer_enabled(p->po_wait_while_timer)) { - if (g_bond->verbose) { + if (if_bond_debug) { timestamp_printf("[%s] Mux WAITING: Timer already set\n", bondport_get_name(p)); } @@ -4568,14 +4775,14 @@ bondport_mux_machine_waiting(bondport_ref p, LAEvent event, } if (ifbond_all_ports_attached(p->po_bond, p)) { devtimer_cancel(p->po_wait_while_timer); - if (g_bond->verbose) { + if (if_bond_debug) { timestamp_printf("[%s] Mux WAITING: No waiting\n", bondport_get_name(p)); } bondport_flags_set_ready(p); goto no_waiting; } - if (g_bond->verbose) { + if (if_bond_debug) { timestamp_printf("[%s] Mux WAITING: 2 seconds\n", bondport_get_name(p)); } @@ -4587,7 +4794,7 @@ bondport_mux_machine_waiting(bondport_ref p, LAEvent event, (void *)LAEventTimeout, NULL); break; case LAEventTimeout: - if (g_bond->verbose) { + if (if_bond_debug) { timestamp_printf("[%s] Mux WAITING: Ready\n", bondport_get_name(p)); } @@ -4596,7 +4803,7 @@ bondport_mux_machine_waiting(bondport_ref p, LAEvent event, case LAEventReady: no_waiting: if (bondport_flags_ready(p)) { - if (g_bond->verbose) { + if (if_bond_debug) { timestamp_printf("[%s] Mux WAITING: All Ports Ready\n", bondport_get_name(p)); } @@ -4617,7 +4824,7 @@ bondport_mux_machine_attached(bondport_ref p, LAEvent event, switch (event) { case LAEventStart: devtimer_cancel(p->po_wait_while_timer); - if (g_bond->verbose) { + if (if_bond_debug) { timestamp_printf("[%s] Mux ATTACHED\n", bondport_get_name(p)); } @@ -4659,7 +4866,7 @@ bondport_mux_machine_collecting_distributing(bondport_ref p, switch (event) { case LAEventStart: devtimer_cancel(p->po_wait_while_timer); - if (g_bond->verbose) { + if (if_bond_debug) { timestamp_printf("[%s] Mux COLLECTING_DISTRIBUTING\n", bondport_get_name(p)); } diff --git a/bsd/net/if_bridge.c b/bsd/net/if_bridge.c index ca97c63dd..a3c90194d 100644 --- a/bsd/net/if_bridge.c +++ b/bsd/net/if_bridge.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2004-2018 Apple Inc. All rights reserved. + * Copyright (c) 2004-2019 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -187,6 +187,7 @@ #define BR_DBGF_MBUF 0x0040 #define BR_DBGF_MCAST 0x0080 #define BR_DBGF_HOSTFILTER 0x0100 +#define BR_DBGF_CHECKSUM 0x0200 #endif /* BRIDGE_DEBUG */ #define _BRIDGE_LOCK(_sc) lck_mtx_lock(&(_sc)->sc_mtx) @@ -323,6 +324,7 @@ struct bridge_iflist { #define BIFF_HOST_FILTER 0x10 /* host filter enabled */ #define BIFF_HF_HWSRC 0x20 /* host filter source MAC is set */ #define BIFF_HF_IPSRC 0x40 /* host filter source IP is set */ +#define BIFF_INPUT_BROADCAST 0x80 /* send broadcast packets in */ /* * Bridge route node. @@ -355,44 +357,26 @@ struct bridge_delayed_call { #define BDCF_OUTSTANDING 0x01 /* Delayed call has been scheduled */ #define BDCF_CANCELLING 0x02 /* May be waiting for call completion */ - /* * Software state for each bridge. */ LIST_HEAD(_bridge_rtnode_list, bridge_rtnode); -typedef struct { - struct _bridge_rtnode_list *bb_rthash; /* our forwarding table */ - struct _bridge_rtnode_list bb_rtlist; /* list version of above */ - uint32_t bb_rthash_key; /* key for hash */ - uint32_t bb_rthash_size; /* size of the hash table */ - struct bridge_delayed_call bb_aging_timer; - struct bridge_delayed_call bb_resize_call; - TAILQ_HEAD(, bridge_iflist) bb_spanlist; /* span ports list */ - struct bstp_state bb_stp; /* STP state */ - bpf_packet_func bb_bpf_input; - bpf_packet_func bb_bpf_output; -} bridge_bsd, *bridge_bsd_t; - -#define sc_rthash sc_u.scu_bsd.bb_rthash -#define sc_rtlist sc_u.scu_bsd.bb_rtlist -#define sc_rthash_key sc_u.scu_bsd.bb_rthash_key -#define sc_rthash_size sc_u.scu_bsd.bb_rthash_size -#define sc_aging_timer sc_u.scu_bsd.bb_aging_timer -#define sc_resize_call sc_u.scu_bsd.bb_resize_call -#define sc_spanlist sc_u.scu_bsd.bb_spanlist -#define sc_stp sc_u.scu_bsd.bb_stp -#define sc_bpf_input sc_u.scu_bsd.bb_bpf_input -#define sc_bpf_output sc_u.scu_bsd.bb_bpf_output - struct bridge_softc { struct ifnet *sc_ifp; /* make this an interface */ u_int32_t sc_flags; - union { - bridge_bsd scu_bsd; - } sc_u; LIST_ENTRY(bridge_softc) sc_list; decl_lck_mtx_data(, sc_mtx); + struct _bridge_rtnode_list *sc_rthash; /* our forwarding table */ + struct _bridge_rtnode_list sc_rtlist; /* list version of above */ + uint32_t sc_rthash_key; /* key for hash */ + uint32_t sc_rthash_size; /* size of the hash table */ + struct bridge_delayed_call sc_aging_timer; + struct bridge_delayed_call sc_resize_call; + TAILQ_HEAD(, bridge_iflist) sc_spanlist; /* span ports list */ + struct bstp_state sc_stp; /* STP state */ + bpf_packet_func sc_bpf_input; + bpf_packet_func sc_bpf_output; void *sc_cv; uint32_t sc_brtmax; /* max # of addresses */ uint32_t sc_brtcnt; /* cur. # of addresses */ @@ -420,19 +404,13 @@ struct bridge_softc { #define SCF_DETACHING 0x01 #define SCF_RESIZING 0x02 #define SCF_MEDIA_ACTIVE 0x04 -#define SCF_BSD_MODE 0x08 - -static inline void -bridge_set_bsd_mode(struct bridge_softc * sc) -{ - sc->sc_flags |= SCF_BSD_MODE; -} -static inline boolean_t -bridge_in_bsd_mode(const struct bridge_softc * sc) -{ - return (sc->sc_flags & SCF_BSD_MODE) != 0; -} +typedef enum { + kChecksumOperationNone = 0, + kChecksumOperationClear = 1, + kChecksumOperationFinalize = 2, + kChecksumOperationCompute = 3, +} ChecksumOperation; struct bridge_hostfilter_stats bridge_hostfilter_stats; @@ -452,7 +430,8 @@ static void bridge_set_ifcap(struct bridge_softc *, struct bridge_iflist *, int); #endif static errno_t bridge_set_tso(struct bridge_softc *); -__private_extern__ void bridge_ifdetach(struct bridge_iflist *, struct ifnet *); +static void bridge_ifdetach(struct ifnet *); +static void bridge_proto_attach_changed(struct ifnet *); static int bridge_init(struct ifnet *); #if HAS_BRIDGE_DUMMYNET static void bridge_dummynet(struct mbuf *, struct ifnet *); @@ -462,14 +441,13 @@ static int bridge_output(struct ifnet *, struct mbuf *); static void bridge_finalize_cksum(struct ifnet *, struct mbuf *); static void bridge_start(struct ifnet *); __private_extern__ errno_t bridge_input(struct ifnet *, struct mbuf *, void *); -#if BRIDGE_MEMBER_OUT_FILTER -static errno_t bridge_iff_output(void *, ifnet_t, protocol_family_t, +static errno_t bridge_iff_output(void *, ifnet_t, protocol_family_t, mbuf_t *); -static int bridge_member_output(struct ifnet *, struct mbuf *, - struct sockaddr *, struct rtentry *); -#endif +static errno_t bridge_member_output(struct bridge_softc *sc, ifnet_t ifp, + mbuf_t m); + static int bridge_enqueue(struct bridge_softc *, struct ifnet *, - struct mbuf *); + struct ifnet *, struct mbuf *, ChecksumOperation); static void bridge_rtdelete(struct bridge_softc *, struct ifnet *ifp, int); static void bridge_forward(struct bridge_softc *, struct bridge_iflist *, @@ -586,7 +564,6 @@ static void bridge_cancel_delayed_call(struct bridge_delayed_call *); static void bridge_cleanup_delayed_call(struct bridge_delayed_call *); static int bridge_host_filter(struct bridge_iflist *, struct mbuf *); - #define m_copypacket(m, how) m_copym(m, 0, M_COPYALL, how) /* The default bridge vlan is 1 (IEEE 802.1Q-2003 Table 9-2) */ @@ -634,14 +611,6 @@ SYSCTL_INT(_net_link_bridge, OID_AUTO, delayed_callback_delay, "Delay before calling delayed function"); #endif -static int bridge_bsd_mode = 1; -#if (DEVELOPMENT || DEBUG) -SYSCTL_INT(_net_link_bridge, OID_AUTO, bsd_mode, - CTLFLAG_RW | CTLFLAG_LOCKED, - &bridge_bsd_mode, 0, - "Bridge using bsd mode"); -#endif /* (DEVELOPMENT || DEBUG) */ - SYSCTL_STRUCT(_net_link_bridge, OID_AUTO, hostfilterstats, CTLFLAG_RD | CTLFLAG_LOCKED, &bridge_hostfilter_stats, bridge_hostfilter_stats, ""); @@ -684,199 +653,199 @@ struct bridge_control { #define BC_F_SUSER 0x04 /* do super-user check */ static const struct bridge_control bridge_control_table32[] = { - { bridge_ioctl_add, sizeof(struct ifbreq), /* 0 */ - BC_F_COPYIN | BC_F_SUSER }, - { bridge_ioctl_del, sizeof(struct ifbreq), - BC_F_COPYIN | BC_F_SUSER }, + { .bc_func = bridge_ioctl_add, .bc_argsize = sizeof(struct ifbreq), /* 0 */ + .bc_flags = BC_F_COPYIN | BC_F_SUSER }, + { .bc_func = bridge_ioctl_del, .bc_argsize = sizeof(struct ifbreq), + .bc_flags = BC_F_COPYIN | BC_F_SUSER }, - { bridge_ioctl_gifflags, sizeof(struct ifbreq), - BC_F_COPYIN | BC_F_COPYOUT }, - { bridge_ioctl_sifflags, sizeof(struct ifbreq), - BC_F_COPYIN | BC_F_SUSER }, + { .bc_func = bridge_ioctl_gifflags, .bc_argsize = sizeof(struct ifbreq), + .bc_flags = BC_F_COPYIN | BC_F_COPYOUT }, + { .bc_func = bridge_ioctl_sifflags, .bc_argsize = sizeof(struct ifbreq), + .bc_flags = BC_F_COPYIN | BC_F_SUSER }, - { bridge_ioctl_scache, sizeof(struct ifbrparam), - BC_F_COPYIN | BC_F_SUSER }, - { bridge_ioctl_gcache, sizeof(struct ifbrparam), - BC_F_COPYOUT }, + { .bc_func = bridge_ioctl_scache, .bc_argsize = sizeof(struct ifbrparam), + .bc_flags = BC_F_COPYIN | BC_F_SUSER }, + { .bc_func = bridge_ioctl_gcache, .bc_argsize = sizeof(struct ifbrparam), + .bc_flags = BC_F_COPYOUT }, - { bridge_ioctl_gifs32, sizeof(struct ifbifconf32), - BC_F_COPYIN | BC_F_COPYOUT }, - { bridge_ioctl_rts32, sizeof(struct ifbaconf32), - BC_F_COPYIN | BC_F_COPYOUT }, + { .bc_func = bridge_ioctl_gifs32, .bc_argsize = sizeof(struct ifbifconf32), + .bc_flags = BC_F_COPYIN | BC_F_COPYOUT }, + { .bc_func = bridge_ioctl_rts32, .bc_argsize = sizeof(struct ifbaconf32), + .bc_flags = BC_F_COPYIN | BC_F_COPYOUT }, - { bridge_ioctl_saddr32, sizeof(struct ifbareq32), - BC_F_COPYIN | BC_F_SUSER }, + { .bc_func = bridge_ioctl_saddr32, .bc_argsize = sizeof(struct ifbareq32), + .bc_flags = BC_F_COPYIN | BC_F_SUSER }, - { bridge_ioctl_sto, sizeof(struct ifbrparam), - BC_F_COPYIN | BC_F_SUSER }, - { bridge_ioctl_gto, sizeof(struct ifbrparam), /* 10 */ - BC_F_COPYOUT }, + { .bc_func = bridge_ioctl_sto, .bc_argsize = sizeof(struct ifbrparam), + .bc_flags = BC_F_COPYIN | BC_F_SUSER }, + { .bc_func = bridge_ioctl_gto, .bc_argsize = sizeof(struct ifbrparam), /* 10 */ + .bc_flags = BC_F_COPYOUT }, - { bridge_ioctl_daddr32, sizeof(struct ifbareq32), - BC_F_COPYIN | BC_F_SUSER }, + { .bc_func = bridge_ioctl_daddr32, .bc_argsize = sizeof(struct ifbareq32), + .bc_flags = BC_F_COPYIN | BC_F_SUSER }, - { bridge_ioctl_flush, sizeof(struct ifbreq), - BC_F_COPYIN | BC_F_SUSER }, + { .bc_func = bridge_ioctl_flush, .bc_argsize = sizeof(struct ifbreq), + .bc_flags = BC_F_COPYIN | BC_F_SUSER }, - { bridge_ioctl_gpri, sizeof(struct ifbrparam), - BC_F_COPYOUT }, - { bridge_ioctl_spri, sizeof(struct ifbrparam), - BC_F_COPYIN | BC_F_SUSER }, + { .bc_func = bridge_ioctl_gpri, .bc_argsize = sizeof(struct ifbrparam), + .bc_flags = BC_F_COPYOUT }, + { .bc_func = bridge_ioctl_spri, .bc_argsize = sizeof(struct ifbrparam), + .bc_flags = BC_F_COPYIN | BC_F_SUSER }, - { bridge_ioctl_ght, sizeof(struct ifbrparam), - BC_F_COPYOUT }, - { bridge_ioctl_sht, sizeof(struct ifbrparam), - BC_F_COPYIN | BC_F_SUSER }, + { .bc_func = bridge_ioctl_ght, .bc_argsize = sizeof(struct ifbrparam), + .bc_flags = BC_F_COPYOUT }, + { .bc_func = bridge_ioctl_sht, .bc_argsize = sizeof(struct ifbrparam), + .bc_flags = BC_F_COPYIN | BC_F_SUSER }, - { bridge_ioctl_gfd, sizeof(struct ifbrparam), - BC_F_COPYOUT }, - { bridge_ioctl_sfd, sizeof(struct ifbrparam), - BC_F_COPYIN | BC_F_SUSER }, + { .bc_func = bridge_ioctl_gfd, .bc_argsize = sizeof(struct ifbrparam), + .bc_flags = BC_F_COPYOUT }, + { .bc_func = bridge_ioctl_sfd, .bc_argsize = sizeof(struct ifbrparam), + .bc_flags = BC_F_COPYIN | BC_F_SUSER }, - { bridge_ioctl_gma, sizeof(struct ifbrparam), - BC_F_COPYOUT }, - { bridge_ioctl_sma, sizeof(struct ifbrparam), /* 20 */ - BC_F_COPYIN | BC_F_SUSER }, + { .bc_func = bridge_ioctl_gma, .bc_argsize = sizeof(struct ifbrparam), + .bc_flags = BC_F_COPYOUT }, + { .bc_func = bridge_ioctl_sma, .bc_argsize = sizeof(struct ifbrparam), /* 20 */ + .bc_flags = BC_F_COPYIN | BC_F_SUSER }, - { bridge_ioctl_sifprio, sizeof(struct ifbreq), - BC_F_COPYIN | BC_F_SUSER }, + { .bc_func = bridge_ioctl_sifprio, .bc_argsize = sizeof(struct ifbreq), + .bc_flags = BC_F_COPYIN | BC_F_SUSER }, - { bridge_ioctl_sifcost, sizeof(struct ifbreq), - BC_F_COPYIN | BC_F_SUSER }, + { .bc_func = bridge_ioctl_sifcost, .bc_argsize = sizeof(struct ifbreq), + .bc_flags = BC_F_COPYIN | BC_F_SUSER }, - { bridge_ioctl_gfilt, sizeof(struct ifbrparam), - BC_F_COPYOUT }, - { bridge_ioctl_sfilt, sizeof(struct ifbrparam), - BC_F_COPYIN | BC_F_SUSER }, + { .bc_func = bridge_ioctl_gfilt, .bc_argsize = sizeof(struct ifbrparam), + .bc_flags = BC_F_COPYOUT }, + { .bc_func = bridge_ioctl_sfilt, .bc_argsize = sizeof(struct ifbrparam), + .bc_flags = BC_F_COPYIN | BC_F_SUSER }, - { bridge_ioctl_purge, sizeof(struct ifbreq), - BC_F_COPYIN | BC_F_SUSER }, + { .bc_func = bridge_ioctl_purge, .bc_argsize = sizeof(struct ifbreq), + .bc_flags = BC_F_COPYIN | BC_F_SUSER }, - { bridge_ioctl_addspan, sizeof(struct ifbreq), - BC_F_COPYIN | BC_F_SUSER }, - { bridge_ioctl_delspan, sizeof(struct ifbreq), - BC_F_COPYIN | BC_F_SUSER }, + { .bc_func = bridge_ioctl_addspan, .bc_argsize = sizeof(struct ifbreq), + .bc_flags = BC_F_COPYIN | BC_F_SUSER }, + { .bc_func = bridge_ioctl_delspan, .bc_argsize = sizeof(struct ifbreq), + .bc_flags = BC_F_COPYIN | BC_F_SUSER }, - { bridge_ioctl_gbparam32, sizeof(struct ifbropreq32), - BC_F_COPYOUT }, + { .bc_func = bridge_ioctl_gbparam32, .bc_argsize = sizeof(struct ifbropreq32), + .bc_flags = BC_F_COPYOUT }, - { bridge_ioctl_grte, sizeof(struct ifbrparam), - BC_F_COPYOUT }, + { .bc_func = bridge_ioctl_grte, .bc_argsize = sizeof(struct ifbrparam), + .bc_flags = BC_F_COPYOUT }, - { bridge_ioctl_gifsstp32, sizeof(struct ifbpstpconf32), /* 30 */ - BC_F_COPYIN | BC_F_COPYOUT }, + { .bc_func = bridge_ioctl_gifsstp32, .bc_argsize = sizeof(struct ifbpstpconf32), /* 30 */ + .bc_flags = BC_F_COPYIN | BC_F_COPYOUT }, - { bridge_ioctl_sproto, sizeof(struct ifbrparam), - BC_F_COPYIN | BC_F_SUSER }, + { .bc_func = bridge_ioctl_sproto, .bc_argsize = sizeof(struct ifbrparam), + .bc_flags = BC_F_COPYIN | BC_F_SUSER }, - { bridge_ioctl_stxhc, sizeof(struct ifbrparam), - BC_F_COPYIN | BC_F_SUSER }, + { .bc_func = bridge_ioctl_stxhc, .bc_argsize = sizeof(struct ifbrparam), + .bc_flags = BC_F_COPYIN | BC_F_SUSER }, - { bridge_ioctl_sifmaxaddr, sizeof(struct ifbreq), - BC_F_COPYIN | BC_F_SUSER }, + { .bc_func = bridge_ioctl_sifmaxaddr, .bc_argsize = sizeof(struct ifbreq), + .bc_flags = BC_F_COPYIN | BC_F_SUSER }, - { bridge_ioctl_ghostfilter, sizeof(struct ifbrhostfilter), - BC_F_COPYIN | BC_F_COPYOUT }, - { bridge_ioctl_shostfilter, sizeof(struct ifbrhostfilter), - BC_F_COPYIN | BC_F_SUSER }, + { .bc_func = bridge_ioctl_ghostfilter, .bc_argsize = sizeof(struct ifbrhostfilter), + .bc_flags = BC_F_COPYIN | BC_F_COPYOUT }, + { .bc_func = bridge_ioctl_shostfilter, .bc_argsize = sizeof(struct ifbrhostfilter), + .bc_flags = BC_F_COPYIN | BC_F_SUSER }, }; static const struct bridge_control bridge_control_table64[] = { - { bridge_ioctl_add, sizeof(struct ifbreq), /* 0 */ - BC_F_COPYIN | BC_F_SUSER }, - { bridge_ioctl_del, sizeof(struct ifbreq), - BC_F_COPYIN | BC_F_SUSER }, + { .bc_func = bridge_ioctl_add, .bc_argsize = sizeof(struct ifbreq), /* 0 */ + .bc_flags = BC_F_COPYIN | BC_F_SUSER }, + { .bc_func = bridge_ioctl_del, .bc_argsize = sizeof(struct ifbreq), + .bc_flags = BC_F_COPYIN | BC_F_SUSER }, - { bridge_ioctl_gifflags, sizeof(struct ifbreq), - BC_F_COPYIN | BC_F_COPYOUT }, - { bridge_ioctl_sifflags, sizeof(struct ifbreq), - BC_F_COPYIN | BC_F_SUSER }, + { .bc_func = bridge_ioctl_gifflags, .bc_argsize = sizeof(struct ifbreq), + .bc_flags = BC_F_COPYIN | BC_F_COPYOUT }, + { .bc_func = bridge_ioctl_sifflags, .bc_argsize = sizeof(struct ifbreq), + .bc_flags = BC_F_COPYIN | BC_F_SUSER }, - { bridge_ioctl_scache, sizeof(struct ifbrparam), - BC_F_COPYIN | BC_F_SUSER }, - { bridge_ioctl_gcache, sizeof(struct ifbrparam), - BC_F_COPYOUT }, + { .bc_func = bridge_ioctl_scache, .bc_argsize = sizeof(struct ifbrparam), + .bc_flags = BC_F_COPYIN | BC_F_SUSER }, + { .bc_func = bridge_ioctl_gcache, .bc_argsize = sizeof(struct ifbrparam), + .bc_flags = BC_F_COPYOUT }, - { bridge_ioctl_gifs64, sizeof(struct ifbifconf64), - BC_F_COPYIN | BC_F_COPYOUT }, - { bridge_ioctl_rts64, sizeof(struct ifbaconf64), - BC_F_COPYIN | BC_F_COPYOUT }, + { .bc_func = bridge_ioctl_gifs64, .bc_argsize = sizeof(struct ifbifconf64), + .bc_flags = BC_F_COPYIN | BC_F_COPYOUT }, + { .bc_func = bridge_ioctl_rts64, .bc_argsize = sizeof(struct ifbaconf64), + .bc_flags = BC_F_COPYIN | BC_F_COPYOUT }, - { bridge_ioctl_saddr64, sizeof(struct ifbareq64), - BC_F_COPYIN | BC_F_SUSER }, + { .bc_func = bridge_ioctl_saddr64, .bc_argsize = sizeof(struct ifbareq64), + .bc_flags = BC_F_COPYIN | BC_F_SUSER }, - { bridge_ioctl_sto, sizeof(struct ifbrparam), - BC_F_COPYIN | BC_F_SUSER }, - { bridge_ioctl_gto, sizeof(struct ifbrparam), /* 10 */ - BC_F_COPYOUT }, + { .bc_func = bridge_ioctl_sto, .bc_argsize = sizeof(struct ifbrparam), + .bc_flags = BC_F_COPYIN | BC_F_SUSER }, + { .bc_func = bridge_ioctl_gto, .bc_argsize = sizeof(struct ifbrparam), /* 10 */ + .bc_flags = BC_F_COPYOUT }, - { bridge_ioctl_daddr64, sizeof(struct ifbareq64), - BC_F_COPYIN | BC_F_SUSER }, + { .bc_func = bridge_ioctl_daddr64, .bc_argsize = sizeof(struct ifbareq64), + .bc_flags = BC_F_COPYIN | BC_F_SUSER }, - { bridge_ioctl_flush, sizeof(struct ifbreq), - BC_F_COPYIN | BC_F_SUSER }, + { .bc_func = bridge_ioctl_flush, .bc_argsize = sizeof(struct ifbreq), + .bc_flags = BC_F_COPYIN | BC_F_SUSER }, - { bridge_ioctl_gpri, sizeof(struct ifbrparam), - BC_F_COPYOUT }, - { bridge_ioctl_spri, sizeof(struct ifbrparam), - BC_F_COPYIN | BC_F_SUSER }, + { .bc_func = bridge_ioctl_gpri, .bc_argsize = sizeof(struct ifbrparam), + .bc_flags = BC_F_COPYOUT }, + { .bc_func = bridge_ioctl_spri, .bc_argsize = sizeof(struct ifbrparam), + .bc_flags = BC_F_COPYIN | BC_F_SUSER }, - { bridge_ioctl_ght, sizeof(struct ifbrparam), - BC_F_COPYOUT }, - { bridge_ioctl_sht, sizeof(struct ifbrparam), - BC_F_COPYIN | BC_F_SUSER }, + { .bc_func = bridge_ioctl_ght, .bc_argsize = sizeof(struct ifbrparam), + .bc_flags = BC_F_COPYOUT }, + { .bc_func = bridge_ioctl_sht, .bc_argsize = sizeof(struct ifbrparam), + .bc_flags = BC_F_COPYIN | BC_F_SUSER }, - { bridge_ioctl_gfd, sizeof(struct ifbrparam), - BC_F_COPYOUT }, - { bridge_ioctl_sfd, sizeof(struct ifbrparam), - BC_F_COPYIN | BC_F_SUSER }, + { .bc_func = bridge_ioctl_gfd, .bc_argsize = sizeof(struct ifbrparam), + .bc_flags = BC_F_COPYOUT }, + { .bc_func = bridge_ioctl_sfd, .bc_argsize = sizeof(struct ifbrparam), + .bc_flags = BC_F_COPYIN | BC_F_SUSER }, - { bridge_ioctl_gma, sizeof(struct ifbrparam), - BC_F_COPYOUT }, - { bridge_ioctl_sma, sizeof(struct ifbrparam), /* 20 */ - BC_F_COPYIN | BC_F_SUSER }, + { .bc_func = bridge_ioctl_gma, .bc_argsize = sizeof(struct ifbrparam), + .bc_flags = BC_F_COPYOUT }, + { .bc_func = bridge_ioctl_sma, .bc_argsize = sizeof(struct ifbrparam), /* 20 */ + .bc_flags = BC_F_COPYIN | BC_F_SUSER }, - { bridge_ioctl_sifprio, sizeof(struct ifbreq), - BC_F_COPYIN | BC_F_SUSER }, + { .bc_func = bridge_ioctl_sifprio, .bc_argsize = sizeof(struct ifbreq), + .bc_flags = BC_F_COPYIN | BC_F_SUSER }, - { bridge_ioctl_sifcost, sizeof(struct ifbreq), - BC_F_COPYIN | BC_F_SUSER }, + { .bc_func = bridge_ioctl_sifcost, .bc_argsize = sizeof(struct ifbreq), + .bc_flags = BC_F_COPYIN | BC_F_SUSER }, - { bridge_ioctl_gfilt, sizeof(struct ifbrparam), - BC_F_COPYOUT }, - { bridge_ioctl_sfilt, sizeof(struct ifbrparam), - BC_F_COPYIN | BC_F_SUSER }, + { .bc_func = bridge_ioctl_gfilt, .bc_argsize = sizeof(struct ifbrparam), + .bc_flags = BC_F_COPYOUT }, + { .bc_func = bridge_ioctl_sfilt, .bc_argsize = sizeof(struct ifbrparam), + .bc_flags = BC_F_COPYIN | BC_F_SUSER }, - { bridge_ioctl_purge, sizeof(struct ifbreq), - BC_F_COPYIN | BC_F_SUSER }, + { .bc_func = bridge_ioctl_purge, .bc_argsize = sizeof(struct ifbreq), + .bc_flags = BC_F_COPYIN | BC_F_SUSER }, - { bridge_ioctl_addspan, sizeof(struct ifbreq), - BC_F_COPYIN | BC_F_SUSER }, - { bridge_ioctl_delspan, sizeof(struct ifbreq), - BC_F_COPYIN | BC_F_SUSER }, + { .bc_func = bridge_ioctl_addspan, .bc_argsize = sizeof(struct ifbreq), + .bc_flags = BC_F_COPYIN | BC_F_SUSER }, + { .bc_func = bridge_ioctl_delspan, .bc_argsize = sizeof(struct ifbreq), + .bc_flags = BC_F_COPYIN | BC_F_SUSER }, - { bridge_ioctl_gbparam64, sizeof(struct ifbropreq64), - BC_F_COPYOUT }, + { .bc_func = bridge_ioctl_gbparam64, .bc_argsize = sizeof(struct ifbropreq64), + .bc_flags = BC_F_COPYOUT }, - { bridge_ioctl_grte, sizeof(struct ifbrparam), - BC_F_COPYOUT }, + { .bc_func = bridge_ioctl_grte, .bc_argsize = sizeof(struct ifbrparam), + .bc_flags = BC_F_COPYOUT }, - { bridge_ioctl_gifsstp64, sizeof(struct ifbpstpconf64), /* 30 */ - BC_F_COPYIN | BC_F_COPYOUT }, + { .bc_func = bridge_ioctl_gifsstp64, .bc_argsize = sizeof(struct ifbpstpconf64), /* 30 */ + .bc_flags = BC_F_COPYIN | BC_F_COPYOUT }, - { bridge_ioctl_sproto, sizeof(struct ifbrparam), - BC_F_COPYIN | BC_F_SUSER }, + { .bc_func = bridge_ioctl_sproto, .bc_argsize = sizeof(struct ifbrparam), + .bc_flags = BC_F_COPYIN | BC_F_SUSER }, - { bridge_ioctl_stxhc, sizeof(struct ifbrparam), - BC_F_COPYIN | BC_F_SUSER }, + { .bc_func = bridge_ioctl_stxhc, .bc_argsize = sizeof(struct ifbrparam), + .bc_flags = BC_F_COPYIN | BC_F_SUSER }, - { bridge_ioctl_sifmaxaddr, sizeof(struct ifbreq), - BC_F_COPYIN | BC_F_SUSER }, + { .bc_func = bridge_ioctl_sifmaxaddr, .bc_argsize = sizeof(struct ifbreq), + .bc_flags = BC_F_COPYIN | BC_F_SUSER }, - { bridge_ioctl_ghostfilter, sizeof(struct ifbrhostfilter), - BC_F_COPYIN | BC_F_COPYOUT }, - { bridge_ioctl_shostfilter, sizeof(struct ifbrhostfilter), - BC_F_COPYIN | BC_F_SUSER }, + { .bc_func = bridge_ioctl_ghostfilter, .bc_argsize = sizeof(struct ifbrhostfilter), + .bc_flags = BC_F_COPYIN | BC_F_COPYOUT }, + { .bc_func = bridge_ioctl_shostfilter, .bc_argsize = sizeof(struct ifbrhostfilter), + .bc_flags = BC_F_COPYIN | BC_F_SUSER }, }; static const unsigned int bridge_control_table_size = @@ -1279,10 +1248,6 @@ bridge_clone_create(struct if_clone *ifc, uint32_t unit, void *params) sc->sc_filter_flags &= ~IFBF_FILT_USEIPF; #endif - if (bridge_bsd_mode != 0) { - bridge_set_bsd_mode(sc); - } - TAILQ_INIT(&sc->sc_iflist); /* use the interface name as the unique id for ifp recycle */ @@ -1291,23 +1256,21 @@ bridge_clone_create(struct if_clone *ifc, uint32_t unit, void *params) bzero(&init_params, sizeof(init_params)); init_params.ver = IFNET_INIT_CURRENT_VERSION; init_params.len = sizeof(init_params); - if (bridge_in_bsd_mode(sc)) { - /* Initialize our routing table. */ - error = bridge_rtable_init(sc); - if (error != 0) { - printf("%s: bridge_rtable_init failed %d\n", - __func__, error); - goto done; - } - TAILQ_INIT(&sc->sc_spanlist); - if (if_bridge_txstart) { - init_params.start = bridge_start; - } else { - init_params.flags = IFNET_INIT_LEGACY; - init_params.output = bridge_output; - } - init_params.set_bpf_tap = bridge_set_bpf_tap; + /* Initialize our routing table. */ + error = bridge_rtable_init(sc); + if (error != 0) { + printf("%s: bridge_rtable_init failed %d\n", + __func__, error); + goto done; + } + TAILQ_INIT(&sc->sc_spanlist); + if (if_bridge_txstart) { + init_params.start = bridge_start; + } else { + init_params.flags = IFNET_INIT_LEGACY; + init_params.output = bridge_output; } + init_params.set_bpf_tap = bridge_set_bpf_tap; init_params.uniqueid = sc->sc_if_xname; init_params.uniqueid_len = strlen(sc->sc_if_xname); init_params.sndq_maxlen = IFQ_MAXLEN; @@ -1326,22 +1289,19 @@ bridge_clone_create(struct if_clone *ifc, uint32_t unit, void *params) init_params.broadcast_addr = etherbroadcastaddr; init_params.broadcast_len = ETHER_ADDR_LEN; - if (bridge_in_bsd_mode(sc)) { - error = ifnet_allocate_extended(&init_params, &ifp); - if (error != 0) { - printf("%s: ifnet_allocate failed %d\n", - __func__, error); - goto done; - } - sc->sc_ifp = ifp; - error = bridge_ifnet_set_attrs(ifp); - if (error != 0) { - printf("%s: bridge_ifnet_set_attrs failed %d\n", - __func__, error); - goto done; - } + error = ifnet_allocate_extended(&init_params, &ifp); + if (error != 0) { + printf("%s: ifnet_allocate failed %d\n", + __func__, error); + goto done; + } + sc->sc_ifp = ifp; + error = bridge_ifnet_set_attrs(ifp); + if (error != 0) { + printf("%s: bridge_ifnet_set_attrs failed %d\n", + __func__, error); + goto done; } - /* * Generate an ethernet address with a locally administered address. * @@ -1397,12 +1357,10 @@ bridge_clone_create(struct if_clone *ifc, uint32_t unit, void *params) link_print(sc); } #endif - if (bridge_in_bsd_mode(sc)) { - error = ifnet_attach(ifp, NULL); - if (error != 0) { - printf("%s: ifnet_attach failed %d\n", __func__, error); - goto done; - } + error = ifnet_attach(ifp, NULL); + if (error != 0) { + printf("%s: ifnet_attach failed %d\n", __func__, error); + goto done; } error = ifnet_set_lladdr_and_type(ifp, sc->sc_defaddr, ETHER_ADDR_LEN, @@ -1413,20 +1371,18 @@ bridge_clone_create(struct if_clone *ifc, uint32_t unit, void *params) goto done; } - if (bridge_in_bsd_mode(sc)) { - ifnet_set_offload(ifp, - IFNET_CSUM_IP | IFNET_CSUM_TCP | IFNET_CSUM_UDP | - IFNET_CSUM_TCPIPV6 | IFNET_CSUM_UDPIPV6 | IFNET_MULTIPAGES); - error = bridge_set_tso(sc); - if (error != 0) { - printf("%s: bridge_set_tso failed %d\n", - __func__, error); - goto done; - } + ifnet_set_offload(ifp, + IFNET_CSUM_IP | IFNET_CSUM_TCP | IFNET_CSUM_UDP | + IFNET_CSUM_TCPIPV6 | IFNET_CSUM_UDPIPV6 | IFNET_MULTIPAGES); + error = bridge_set_tso(sc); + if (error != 0) { + printf("%s: bridge_set_tso failed %d\n", + __func__, error); + goto done; + } #if BRIDGESTP - bstp_attach(&sc->sc_stp, &bridge_ops); + bstp_attach(&sc->sc_stp, &bridge_ops); #endif /* BRIDGESTP */ - } lck_mtx_lock(&bridge_list_mtx); LIST_INSERT_HEAD(&bridge_list, sc, sc_list); @@ -1466,12 +1422,10 @@ bridge_clone_destroy(struct ifnet *ifp) bridge_ifstop(ifp, 1); - if (bridge_in_bsd_mode(sc)) { - bridge_cancel_delayed_call(&sc->sc_resize_call); + bridge_cancel_delayed_call(&sc->sc_resize_call); - bridge_cleanup_delayed_call(&sc->sc_resize_call); - bridge_cleanup_delayed_call(&sc->sc_aging_timer); - } + bridge_cleanup_delayed_call(&sc->sc_resize_call); + bridge_cleanup_delayed_call(&sc->sc_aging_timer); error = ifnet_set_flags(ifp, 0, IFF_UP); if (error != 0) { @@ -1482,12 +1436,10 @@ bridge_clone_destroy(struct ifnet *ifp) bridge_delete_member(sc, bif, 0); } - if (bridge_in_bsd_mode(sc)) { - while ((bif = TAILQ_FIRST(&sc->sc_spanlist)) != NULL) { - bridge_delete_span(sc, bif); - } - BRIDGE_UNLOCK(sc); + while ((bif = TAILQ_FIRST(&sc->sc_spanlist)) != NULL) { + bridge_delete_span(sc, bif); } + BRIDGE_UNLOCK(sc); error = ifnet_detach(ifp); if (error != 0) { @@ -1995,7 +1947,6 @@ out: return error; } -#if BRIDGE_MEMBER_OUT_FILTER static errno_t bridge_iff_output(void *cookie, ifnet_t ifp, protocol_family_t protocol, mbuf_t *data) @@ -2020,17 +1971,15 @@ bridge_iff_output(void *cookie, ifnet_t ifp, protocol_family_t protocol, #endif /* BRIDGE_DEBUG */ error = bridge_member_output(sc, ifp, m); - if (error != 0) { + if (error != 0 && error != EJUSTRETURN) { printf("%s: bridge_member_output failed error %d\n", __func__, error); } - out: BRIDGE_LOCK_ASSERT_NOTHELD(sc); return error; } -#endif /* BRIDGE_MEMBER_OUT_FILTER */ static void bridge_iff_event(void *cookie, ifnet_t ifp, protocol_family_t protocol, @@ -2054,7 +2003,7 @@ bridge_iff_event(void *cookie, ifnet_t ifp, protocol_family_t protocol, switch (event_msg->event_code) { case KEV_DL_IF_DETACHING: case KEV_DL_IF_DETACHED: { - bridge_ifdetach(bif, ifp); + bridge_ifdetach(ifp); break; } case KEV_DL_LINK_OFF: @@ -2089,6 +2038,11 @@ bridge_iff_event(void *cookie, ifnet_t ifp, protocol_family_t protocol, BRIDGE_UNLOCK(sc); break; } + case KEV_DL_PROTO_DETACHED: + case KEV_DL_PROTO_ATTACHED: { + bridge_proto_attach_changed(ifp); + break; + } default: break; } @@ -2112,7 +2066,7 @@ bridge_iff_detached(void *cookie, ifnet_t ifp) } #endif /* BRIDGE_DEBUG */ - bridge_ifdetach(bif, ifp); + bridge_ifdetach(ifp); _FREE(bif, M_DEVBUF); } @@ -2185,13 +2139,10 @@ bridge_delete_member(struct bridge_softc *sc, struct bridge_iflist *bif, int lladdr_changed = 0, error, filt_attached; uint8_t eaddr[ETHER_ADDR_LEN]; u_int32_t event_code = 0; - boolean_t bsd_mode; BRIDGE_LOCK_ASSERT_HELD(sc); VERIFY(ifs != NULL); - bsd_mode = bridge_in_bsd_mode(sc); - /* * First, remove the member from the list first so it cannot be found anymore * when we release the bridge lock below @@ -2239,7 +2190,7 @@ bridge_delete_member(struct bridge_softc *sc, struct bridge_iflist *bif, BRIDGE_LOCK(sc); } #if BRIDGESTP - if (bsd_mode && (bif->bif_ifflags & IFBIF_STP) != 0) { + if ((bif->bif_ifflags & IFBIF_STP) != 0) { bstp_disable(&bif->bif_stp); } #endif /* BRIDGESTP */ @@ -2273,9 +2224,7 @@ bridge_delete_member(struct bridge_softc *sc, struct bridge_iflist *bif, printf("%s: bridge_set_tso failed %d\n", __func__, error); } - if (bsd_mode) { - bridge_rtdelete(sc, ifs, IFBF_FLUSHALL); - } + bridge_rtdelete(sc, ifs, IFBF_FLUSHALL); KASSERT(bif->bif_addrcnt == 0, ("%s: %d bridge routes referenced", __func__, bif->bif_addrcnt)); @@ -2287,9 +2236,8 @@ bridge_delete_member(struct bridge_softc *sc, struct bridge_iflist *bif, */ event_code = bridge_updatelinkstatus(sc); - if (bsd_mode) { - BRIDGE_UNLOCK(sc); - } + BRIDGE_UNLOCK(sc); + if (lladdr_changed && (error = ifnet_set_lladdr(bifp, eaddr, ETHER_ADDR_LEN)) != 0) { @@ -2301,9 +2249,7 @@ bridge_delete_member(struct bridge_softc *sc, struct bridge_iflist *bif, } #if BRIDGESTP - if (bsd_mode) { - bstp_destroy(&bif->bif_stp); /* prepare to free */ - } + bstp_destroy(&bif->bif_stp); /* prepare to free */ #endif /* BRIDGESTP */ if (filt_attached) { @@ -2347,7 +2293,6 @@ bridge_ioctl_add(struct bridge_softc *sc, void *arg) uint8_t eaddr[ETHER_ADDR_LEN]; struct iff_filter iff; u_int32_t event_code = 0; - boolean_t bsd_mode = bridge_in_bsd_mode(sc); ifs = ifunit(req->ifbr_ifsname); if (ifs == NULL) { @@ -2361,12 +2306,10 @@ bridge_ioctl_add(struct bridge_softc *sc, void *arg) return EINVAL; } - if (bsd_mode) { - /* If it's in the span list, it can't be a member. */ - TAILQ_FOREACH(bif, &sc->sc_spanlist, bif_next) - if (ifs == bif->bif_ifp) { - return EBUSY; - } + /* If it's in the span list, it can't be a member. */ + TAILQ_FOREACH(bif, &sc->sc_spanlist, bif_next) + if (ifs == bif->bif_ifp) { + return EBUSY; } if (ifs->if_bridge == sc) { @@ -2427,9 +2370,7 @@ bridge_ioctl_add(struct bridge_softc *sc, void *arg) ifs->if_bridge = sc; #if BRIDGESTP - if (bsd_mode) { - bstp_create(&sc->sc_stp, &bif->bif_stp, bif->bif_ifp); - } + bstp_create(&sc->sc_stp, &bif->bif_stp, bif->bif_ifp); #endif /* BRIDGESTP */ /* @@ -2481,9 +2422,8 @@ bridge_ioctl_add(struct bridge_softc *sc, void *arg) /* * Respect lock ordering with DLIL lock for the following operations */ - if (bsd_mode) { - BRIDGE_UNLOCK(sc); - } + BRIDGE_UNLOCK(sc); + /* * install an interface filter @@ -2491,12 +2431,8 @@ bridge_ioctl_add(struct bridge_softc *sc, void *arg) memset(&iff, 0, sizeof(struct iff_filter)); iff.iff_cookie = bif; iff.iff_name = "com.apple.kernel.bsd.net.if_bridge"; - if (bsd_mode) { - iff.iff_input = bridge_iff_input; -#if BRIDGE_MEMBER_OUT_FILTER - iff.iff_output = bridge_iff_output; -#endif /* BRIDGE_MEMBER_OUT_FILTER */ - } + iff.iff_input = bridge_iff_input; + iff.iff_output = bridge_iff_output; iff.iff_event = bridge_iff_event; iff.iff_detached = bridge_iff_detached; error = dlil_attach_filter(ifs, &iff, &bif->bif_iff_ref, @@ -2506,10 +2442,12 @@ bridge_ioctl_add(struct bridge_softc *sc, void *arg) BRIDGE_LOCK(sc); goto out; } + BRIDGE_LOCK(sc); bif->bif_flags |= BIFF_FILTER_ATTACHED; + BRIDGE_UNLOCK(sc); /* - * install an dummy "bridge" protocol + * install a dummy "bridge" protocol */ if ((error = bridge_attach_protocol(ifs)) != 0) { if (error != 0) { @@ -2519,7 +2457,9 @@ bridge_ioctl_add(struct bridge_softc *sc, void *arg) goto out; } } + BRIDGE_LOCK(sc); bif->bif_flags |= BIFF_PROTO_ATTACHED; + BRIDGE_UNLOCK(sc); if (lladdr_changed && (error = ifnet_set_lladdr(bifp, eaddr, ETHER_ADDR_LEN)) != 0) { @@ -2574,36 +2514,35 @@ bridge_ioctl_gifflags(struct bridge_softc *sc, void *arg) return ENOENT; } - if (bridge_in_bsd_mode(sc)) { - struct bstp_port *bp; + struct bstp_port *bp; - bp = &bif->bif_stp; - req->ifbr_state = bp->bp_state; - req->ifbr_priority = bp->bp_priority; - req->ifbr_path_cost = bp->bp_path_cost; - req->ifbr_proto = bp->bp_protover; - req->ifbr_role = bp->bp_role; - req->ifbr_stpflags = bp->bp_flags; - /* Copy STP state options as flags */ - if (bp->bp_operedge) { - req->ifbr_ifsflags |= IFBIF_BSTP_EDGE; - } - if (bp->bp_flags & BSTP_PORT_AUTOEDGE) { - req->ifbr_ifsflags |= IFBIF_BSTP_AUTOEDGE; - } - if (bp->bp_ptp_link) { - req->ifbr_ifsflags |= IFBIF_BSTP_PTP; - } - if (bp->bp_flags & BSTP_PORT_AUTOPTP) { - req->ifbr_ifsflags |= IFBIF_BSTP_AUTOPTP; - } - if (bp->bp_flags & BSTP_PORT_ADMEDGE) { - req->ifbr_ifsflags |= IFBIF_BSTP_ADMEDGE; - } - if (bp->bp_flags & BSTP_PORT_ADMCOST) { - req->ifbr_ifsflags |= IFBIF_BSTP_ADMCOST; - } + bp = &bif->bif_stp; + req->ifbr_state = bp->bp_state; + req->ifbr_priority = bp->bp_priority; + req->ifbr_path_cost = bp->bp_path_cost; + req->ifbr_proto = bp->bp_protover; + req->ifbr_role = bp->bp_role; + req->ifbr_stpflags = bp->bp_flags; + /* Copy STP state options as flags */ + if (bp->bp_operedge) { + req->ifbr_ifsflags |= IFBIF_BSTP_EDGE; + } + if (bp->bp_flags & BSTP_PORT_AUTOEDGE) { + req->ifbr_ifsflags |= IFBIF_BSTP_AUTOEDGE; + } + if (bp->bp_ptp_link) { + req->ifbr_ifsflags |= IFBIF_BSTP_PTP; } + if (bp->bp_flags & BSTP_PORT_AUTOPTP) { + req->ifbr_ifsflags |= IFBIF_BSTP_AUTOPTP; + } + if (bp->bp_flags & BSTP_PORT_ADMEDGE) { + req->ifbr_ifsflags |= IFBIF_BSTP_ADMEDGE; + } + if (bp->bp_flags & BSTP_PORT_ADMCOST) { + req->ifbr_ifsflags |= IFBIF_BSTP_ADMCOST; + } + req->ifbr_ifsflags = bif->bif_ifflags; req->ifbr_portno = bif->bif_ifp->if_index & 0xfff; req->ifbr_addrcnt = bif->bif_addrcnt; @@ -2623,10 +2562,6 @@ bridge_ioctl_sifflags(struct bridge_softc *sc, void *arg) int error; #endif /* BRIDGESTP */ - if (!bridge_in_bsd_mode(sc)) { - return EINVAL; - } - bif = bridge_lookup_member(sc, req->ifbr_ifsname); if (bif == NULL) { return ENOENT; @@ -2677,9 +2612,7 @@ bridge_ioctl_scache(struct bridge_softc *sc, void *arg) struct ifbrparam *param = arg; sc->sc_brtmax = param->ifbrp_csize; - if (bridge_in_bsd_mode(sc)) { - bridge_rttrim(sc); - } + bridge_rttrim(sc); return 0; } @@ -2702,10 +2635,8 @@ bridge_ioctl_gcache(struct bridge_softc *sc, void *arg) count = 0; \ TAILQ_FOREACH(bif, &sc->sc_iflist, bif_next) \ count++; \ - if (bridge_in_bsd_mode(sc)) { \ - TAILQ_FOREACH(bif, &sc->sc_spanlist, bif_next) \ - count++; \ - } \ + TAILQ_FOREACH(bif, &sc->sc_spanlist, bif_next) \ + count++; \ \ buflen = sizeof (breq) * count; \ if (bifc->ifbic_len == 0) { \ @@ -2735,22 +2666,20 @@ bridge_ioctl_gcache(struct bridge_softc *sc, void *arg) buf += sizeof (breq); \ len -= sizeof (breq); \ } \ - if (bridge_in_bsd_mode(sc)) { \ - TAILQ_FOREACH(bif, &sc->sc_spanlist, bif_next) { \ - if (len < sizeof (breq)) \ - break; \ + TAILQ_FOREACH(bif, &sc->sc_spanlist, bif_next) { \ + if (len < sizeof (breq)) \ + break; \ \ - snprintf(breq.ifbr_ifsname, \ - sizeof (breq.ifbr_ifsname), \ - "%s", bif->bif_ifp->if_xname); \ - breq.ifbr_ifsflags = bif->bif_ifflags; \ - breq.ifbr_portno \ - = bif->bif_ifp->if_index & 0xfff; \ - memcpy(buf, &breq, sizeof (breq)); \ - count++; \ - buf += sizeof (breq); \ - len -= sizeof (breq); \ - } \ + snprintf(breq.ifbr_ifsname, \ + sizeof (breq.ifbr_ifsname), \ + "%s", bif->bif_ifp->if_xname); \ + breq.ifbr_ifsflags = bif->bif_ifflags; \ + breq.ifbr_portno \ + = bif->bif_ifp->if_index & 0xfff; \ + memcpy(buf, &breq, sizeof (breq)); \ + count++; \ + buf += sizeof (breq); \ + len -= sizeof (breq); \ } \ \ BRIDGE_UNLOCK(sc); \ @@ -2794,9 +2723,6 @@ bridge_ioctl_gifs32(struct bridge_softc *sc, void *arg) \ bzero(&bareq, sizeof (bareq)); \ count = 0; \ - if (!bridge_in_bsd_mode(sc)) { \ - goto out; \ - } \ LIST_FOREACH(brt, &sc->sc_rtlist, brt_list) \ count++; \ buflen = sizeof (bareq) * count; \ @@ -2869,10 +2795,6 @@ bridge_ioctl_saddr32(struct bridge_softc *sc, void *arg) struct bridge_iflist *bif; int error; - if (!bridge_in_bsd_mode(sc)) { - return 0; - } - bif = bridge_lookup_member(sc, req->ifba_ifsname); if (bif == NULL) { return ENOENT; @@ -2891,10 +2813,6 @@ bridge_ioctl_saddr64(struct bridge_softc *sc, void *arg) struct bridge_iflist *bif; int error; - if (!bridge_in_bsd_mode(sc)) { - return 0; - } - bif = bridge_lookup_member(sc, req->ifba_ifsname); if (bif == NULL) { return ENOENT; @@ -2929,9 +2847,6 @@ bridge_ioctl_daddr32(struct bridge_softc *sc, void *arg) { struct ifbareq32 *req = arg; - if (!bridge_in_bsd_mode(sc)) { - return 0; - } return bridge_rtdaddr(sc, req->ifba_dst, req->ifba_vlan); } @@ -2940,9 +2855,6 @@ bridge_ioctl_daddr64(struct bridge_softc *sc, void *arg) { struct ifbareq64 *req = arg; - if (!bridge_in_bsd_mode(sc)) { - return 0; - } return bridge_rtdaddr(sc, req->ifba_dst, req->ifba_vlan); } @@ -2951,9 +2863,6 @@ bridge_ioctl_flush(struct bridge_softc *sc, void *arg) { struct ifbreq *req = arg; - if (!bridge_in_bsd_mode(sc)) { - return 0; - } bridge_rtflush(sc, req->ifbr_ifsflags); return 0; } @@ -2964,9 +2873,6 @@ bridge_ioctl_gpri(struct bridge_softc *sc, void *arg) struct ifbrparam *param = arg; struct bstp_state *bs = &sc->sc_stp; - if (!bridge_in_bsd_mode(sc)) { - return 0; - } param->ifbrp_prio = bs->bs_bridge_priority; return 0; } @@ -2977,9 +2883,6 @@ bridge_ioctl_spri(struct bridge_softc *sc, void *arg) #if BRIDGESTP struct ifbrparam *param = arg; - if (!bridge_in_bsd_mode(sc)) { - return EOPNOTSUPP; - } return bstp_set_priority(&sc->sc_stp, param->ifbrp_prio); #else /* !BRIDGESTP */ #pragma unused(sc, arg) @@ -2993,9 +2896,6 @@ bridge_ioctl_ght(struct bridge_softc *sc, void *arg) struct ifbrparam *param = arg; struct bstp_state *bs = &sc->sc_stp; - if (!bridge_in_bsd_mode(sc)) { - return 0; - } param->ifbrp_hellotime = bs->bs_bridge_htime >> 8; return 0; } @@ -3006,9 +2906,6 @@ bridge_ioctl_sht(struct bridge_softc *sc, void *arg) #if BRIDGESTP struct ifbrparam *param = arg; - if (!bridge_in_bsd_mode(sc)) { - return EOPNOTSUPP; - } return bstp_set_htime(&sc->sc_stp, param->ifbrp_hellotime); #else /* !BRIDGESTP */ #pragma unused(sc, arg) @@ -3022,9 +2919,6 @@ bridge_ioctl_gfd(struct bridge_softc *sc, void *arg) struct ifbrparam *param; struct bstp_state *bs; - if (!bridge_in_bsd_mode(sc)) { - return 0; - } param = arg; bs = &sc->sc_stp; param->ifbrp_fwddelay = bs->bs_bridge_fdelay >> 8; @@ -3037,9 +2931,6 @@ bridge_ioctl_sfd(struct bridge_softc *sc, void *arg) #if BRIDGESTP struct ifbrparam *param = arg; - if (!bridge_in_bsd_mode(sc)) { - return EOPNOTSUPP; - } return bstp_set_fdelay(&sc->sc_stp, param->ifbrp_fwddelay); #else /* !BRIDGESTP */ #pragma unused(sc, arg) @@ -3053,9 +2944,6 @@ bridge_ioctl_gma(struct bridge_softc *sc, void *arg) struct ifbrparam *param; struct bstp_state *bs; - if (!bridge_in_bsd_mode(sc)) { - return EOPNOTSUPP; - } param = arg; bs = &sc->sc_stp; param->ifbrp_maxage = bs->bs_bridge_max_age >> 8; @@ -3068,9 +2956,6 @@ bridge_ioctl_sma(struct bridge_softc *sc, void *arg) #if BRIDGESTP struct ifbrparam *param = arg; - if (!bridge_in_bsd_mode(sc)) { - return EOPNOTSUPP; - } return bstp_set_maxage(&sc->sc_stp, param->ifbrp_maxage); #else /* !BRIDGESTP */ #pragma unused(sc, arg) @@ -3085,9 +2970,6 @@ bridge_ioctl_sifprio(struct bridge_softc *sc, void *arg) struct ifbreq *req = arg; struct bridge_iflist *bif; - if (!bridge_in_bsd_mode(sc)) { - return EOPNOTSUPP; - } bif = bridge_lookup_member(sc, req->ifbr_ifsname); if (bif == NULL) { return ENOENT; @@ -3107,9 +2989,6 @@ bridge_ioctl_sifcost(struct bridge_softc *sc, void *arg) struct ifbreq *req = arg; struct bridge_iflist *bif; - if (!bridge_in_bsd_mode(sc)) { - return EOPNOTSUPP; - } bif = bridge_lookup_member(sc, req->ifbr_ifsname); if (bif == NULL) { return ENOENT; @@ -3174,9 +3053,6 @@ bridge_ioctl_addspan(struct bridge_softc *sc, void *arg) struct bridge_iflist *bif = NULL; struct ifnet *ifs; - if (!bridge_in_bsd_mode(sc)) { - return EOPNOTSUPP; - } ifs = ifunit(req->ifbr_ifsname); if (ifs == NULL) { return ENOENT; @@ -3228,9 +3104,6 @@ bridge_ioctl_delspan(struct bridge_softc *sc, void *arg) struct bridge_iflist *bif; struct ifnet *ifs; - if (!bridge_in_bsd_mode(sc)) { - return EOPNOTSUPP; - } ifs = ifunit(req->ifbr_ifsname); if (ifs == NULL) { return ENOENT; @@ -3280,9 +3153,7 @@ bridge_ioctl_gbparam32(struct bridge_softc *sc, void *arg) { struct ifbropreq32 *req = arg; - if (bridge_in_bsd_mode(sc)) { - BRIDGE_IOCTL_GBPARAM; - } + BRIDGE_IOCTL_GBPARAM; return 0; } @@ -3291,9 +3162,7 @@ bridge_ioctl_gbparam64(struct bridge_softc *sc, void *arg) { struct ifbropreq64 *req = arg; - if (bridge_in_bsd_mode(sc)) { - BRIDGE_IOCTL_GBPARAM; - } + BRIDGE_IOCTL_GBPARAM; return 0; } @@ -3368,9 +3237,7 @@ bridge_ioctl_gifsstp32(struct bridge_softc *sc, void *arg) struct ifbpstpconf32 *bifstp = arg; int error = 0; - if (bridge_in_bsd_mode(sc)) { - BRIDGE_IOCTL_GIFSSTP; - } + BRIDGE_IOCTL_GIFSSTP; return error; } @@ -3380,9 +3247,7 @@ bridge_ioctl_gifsstp64(struct bridge_softc *sc, void *arg) struct ifbpstpconf64 *bifstp = arg; int error = 0; - if (bridge_in_bsd_mode(sc)) { - BRIDGE_IOCTL_GIFSSTP; - } + BRIDGE_IOCTL_GIFSSTP; return error; } @@ -3392,9 +3257,6 @@ bridge_ioctl_sproto(struct bridge_softc *sc, void *arg) #if BRIDGESTP struct ifbrparam *param = arg; - if (!bridge_in_bsd_mode(sc)) { - return EOPNOTSUPP; - } return bstp_set_protocol(&sc->sc_stp, param->ifbrp_proto); #else /* !BRIDGESTP */ #pragma unused(sc, arg) @@ -3408,9 +3270,6 @@ bridge_ioctl_stxhc(struct bridge_softc *sc, void *arg) #if BRIDGESTP struct ifbrparam *param = arg; - if (!bridge_in_bsd_mode(sc)) { - return EOPNOTSUPP; - } return bstp_set_holdcount(&sc->sc_stp, param->ifbrp_txhc); #else /* !BRIDGESTP */ #pragma unused(sc, arg) @@ -3491,9 +3350,10 @@ bridge_ioctl_shostfilter(struct bridge_softc *sc, void *arg) * Detach an interface from a bridge. Called when a member * interface is detaching. */ -__private_extern__ void -bridge_ifdetach(struct bridge_iflist *bif, struct ifnet *ifp) +static void +bridge_ifdetach(struct ifnet *ifp) { + struct bridge_iflist *bif; struct bridge_softc *sc = ifp->if_bridge; #if BRIDGE_DEBUG @@ -3515,19 +3375,68 @@ bridge_ifdetach(struct bridge_iflist *bif, struct ifnet *ifp) /* Check if the interface is a span port */ lck_mtx_lock(&bridge_list_mtx); LIST_FOREACH(sc, &bridge_list, sc_list) { - if (bridge_in_bsd_mode(sc)) { - BRIDGE_LOCK(sc); - TAILQ_FOREACH(bif, &sc->sc_spanlist, bif_next) - if (ifp == bif->bif_ifp) { - bridge_delete_span(sc, bif); - break; - } - BRIDGE_UNLOCK(sc); + BRIDGE_LOCK(sc); + TAILQ_FOREACH(bif, &sc->sc_spanlist, bif_next) + if (ifp == bif->bif_ifp) { + bridge_delete_span(sc, bif); + break; } + BRIDGE_UNLOCK(sc); } lck_mtx_unlock(&bridge_list_mtx); } +/* + * bridge_proto_attach_changed + * + * Called when protocol attachment on the interface changes. + */ +static void +bridge_proto_attach_changed(struct ifnet *ifp) +{ + boolean_t changed = FALSE; + struct bridge_iflist *bif; + boolean_t input_broadcast; + struct bridge_softc *sc = ifp->if_bridge; + +#if BRIDGE_DEBUG + if (if_bridge_debug & BR_DBGF_LIFECYCLE) { + printf("%s: %s\n", __func__, ifp->if_xname); + } +#endif /* BRIDGE_DEBUG */ + if (sc == NULL) { + return; + } + /* + * Selectively enable input broadcast only when necessary. + * The bridge interface itself attaches a fake protocol + * so checking for at least two protocols means that the + * interface is being used for something besides bridging. + */ + input_broadcast = if_get_protolist(ifp, NULL, 0) >= 2; + BRIDGE_LOCK(sc); + bif = bridge_lookup_member_if(sc, ifp); + if (bif != NULL) { + if (input_broadcast) { + if ((bif->bif_flags & BIFF_INPUT_BROADCAST) == 0) { + bif->bif_flags |= BIFF_INPUT_BROADCAST; + changed = TRUE; + } + } else if ((bif->bif_flags & BIFF_INPUT_BROADCAST) != 0) { + changed = TRUE; + bif->bif_flags &= ~BIFF_INPUT_BROADCAST; + } + } + BRIDGE_UNLOCK(sc); +#if BRIDGE_DEBUG + if ((if_bridge_debug & BR_DBGF_LIFECYCLE) != 0 && changed) { + printf("%s: input broadcast %s", ifp->if_xname, + input_broadcast ? "ENABLED" : "DISABLED"); + } +#endif /* BRIDGE_DEBUG */ + return; +} + /* * interface_media_active: * @@ -3827,18 +3736,16 @@ bridge_init(struct ifnet *ifp) error = ifnet_set_flags(ifp, IFF_RUNNING, IFF_RUNNING); - if (bridge_in_bsd_mode(sc)) { - /* - * Calling bridge_aging_timer() is OK as there are no entries to - * age so we're just going to arm the timer - */ - bridge_aging_timer(sc); + /* + * Calling bridge_aging_timer() is OK as there are no entries to + * age so we're just going to arm the timer + */ + bridge_aging_timer(sc); #if BRIDGESTP - if (error == 0) { - bstp_init(&sc->sc_stp); /* Initialize Spanning Tree */ - } -#endif /* BRIDGESTP */ + if (error == 0) { + bstp_init(&sc->sc_stp); /* Initialize Spanning Tree */ } +#endif /* BRIDGESTP */ return error; } @@ -3859,18 +3766,70 @@ bridge_ifstop(struct ifnet *ifp, int disable) return; } - if (bridge_in_bsd_mode(sc)) { - bridge_cancel_delayed_call(&sc->sc_aging_timer); + bridge_cancel_delayed_call(&sc->sc_aging_timer); #if BRIDGESTP - bstp_stop(&sc->sc_stp); + bstp_stop(&sc->sc_stp); #endif /* BRIDGESTP */ - bridge_rtflush(sc, IFBF_FLUSHDYN); - } + bridge_rtflush(sc, IFBF_FLUSHDYN); (void) ifnet_set_flags(ifp, 0, IFF_RUNNING); } +/* + * bridge_compute_cksum: + * + * If the packet has checksum flags, compare the hardware checksum + * capabilities of the source and destination interfaces. If they + * are the same, there's nothing to do. If they are different, + * finalize the checksum so that it can be sent on the destination + * interface. + */ +static void +bridge_compute_cksum(struct ifnet *src_if, struct ifnet *dst_if, struct mbuf *m) +{ + uint32_t csum_flags; + uint16_t dst_hw_csum; + uint32_t did_sw; + struct ether_header *eh; + uint16_t src_hw_csum; + + csum_flags = m->m_pkthdr.csum_flags & IF_HWASSIST_CSUM_MASK; + if (csum_flags == 0) { + /* no checksum offload */ + return; + } + + /* + * if destination/source differ in checksum offload + * capabilities, finalize/compute the checksum + */ + dst_hw_csum = IF_HWASSIST_CSUM_FLAGS(dst_if->if_hwassist); + src_hw_csum = IF_HWASSIST_CSUM_FLAGS(src_if->if_hwassist); + if (dst_hw_csum == src_hw_csum) { + return; + } + eh = mtod(m, struct ether_header *); + switch (ntohs(eh->ether_type)) { + case ETHERTYPE_IP: + did_sw = in_finalize_cksum(m, sizeof(*eh), csum_flags); + break; +#if INET6 + case ETHERTYPE_IPV6: + did_sw = in6_finalize_cksum(m, sizeof(*eh), -1, -1, csum_flags); + break; +#endif /* INET6 */ + } +#if BRIDGE_DEBUG + if (if_bridge_debug & BR_DBGF_CHECKSUM) { + printf("%s: [%s -> %s] before 0x%x did 0x%x after 0x%x\n", + __func__, + src_if->if_xname, dst_if->if_xname, csum_flags, did_sw, + m->m_pkthdr.csum_flags); + } +#endif /* BRIDGE_DEBUG */ +} + /* * bridge_enqueue: * @@ -3878,11 +3837,11 @@ bridge_ifstop(struct ifnet *ifp, int disable) * */ static int -bridge_enqueue(struct bridge_softc *sc, struct ifnet *dst_ifp, struct mbuf *m) +bridge_enqueue(struct bridge_softc *sc, struct ifnet *src_ifp, + struct ifnet *dst_ifp, struct mbuf *m, ChecksumOperation cksum_op) { int len, error = 0; - short mflags; - struct mbuf *m0; + struct mbuf *next_m; VERIFY(dst_ifp != NULL); @@ -3891,19 +3850,30 @@ bridge_enqueue(struct bridge_softc *sc, struct ifnet *dst_ifp, struct mbuf *m) * * NOTE: bridge_fragment() is called only when PFIL_HOOKS is enabled. */ - for (; m; m = m0) { + for (; m; m = next_m) { errno_t _error; - struct flowadv adv = { FADV_SUCCESS }; + struct flowadv adv = { .code = FADV_SUCCESS }; - m0 = m->m_nextpkt; + next_m = m->m_nextpkt; m->m_nextpkt = NULL; len = m->m_pkthdr.len; - mflags = m->m_flags; m->m_flags |= M_PROTO1; /* set to avoid loops */ - bridge_finalize_cksum(dst_ifp, m); - + switch (cksum_op) { + case kChecksumOperationClear: + m->m_pkthdr.csum_flags = 0; + break; + case kChecksumOperationFinalize: + /* the checksum might not be correct, finalize now */ + bridge_finalize_cksum(dst_ifp, m); + break; + case kChecksumOperationCompute: + bridge_compute_cksum(src_ifp, dst_ifp, m); + break; + default: + break; + } #if HAS_IF_CAP /* * If underlying interface can not do VLAN tag insertion itself @@ -3963,7 +3933,7 @@ bridge_dummynet(struct mbuf *m, struct ifnet *ifp) sc = ifp->if_bridge; /* - * The packet didnt originate from a member interface. This should only + * The packet didn't originate from a member interface. This should only * ever happen if a member interface is removed while packets are * queued for it. */ @@ -3981,11 +3951,10 @@ bridge_dummynet(struct mbuf *m, struct ifnet *ifp) } } - (void) bridge_enqueue(sc, ifp, m); + (void) bridge_enqueue(sc, NULL, ifp, m, kChecksumOperationNone); } #endif /* HAS_BRIDGE_DUMMYNET */ -#if BRIDGE_MEMBER_OUT_FILTER /* * bridge_member_output: * @@ -3993,17 +3962,13 @@ bridge_dummynet(struct mbuf *m, struct ifnet *ifp) * performs the bridging function for locally originated * packets. * - * The mbuf has the Ethernet header already attached. We must - * enqueue or free the mbuf before returning. + * The mbuf has the Ethernet header already attached. */ -static int -bridge_member_output(struct ifnet *ifp, struct mbuf *m, struct sockaddr *sa, - struct rtentry *rt) +static errno_t +bridge_member_output(struct bridge_softc *sc, ifnet_t ifp, mbuf_t m) { -#pragma unused(sa, rt) struct ether_header *eh; struct ifnet *dst_if; - struct bridge_softc *sc; uint16_t vlan; #if BRIDGE_DEBUG @@ -4015,12 +3980,11 @@ bridge_member_output(struct ifnet *ifp, struct mbuf *m, struct sockaddr *sa, if (m->m_len < ETHER_HDR_LEN) { m = m_pullup(m, ETHER_HDR_LEN); if (m == NULL) { - return 0; + return ENOBUFS; } } eh = mtod(m, struct ether_header *); - sc = ifp->if_bridge; vlan = VLANTAGOF(m); BRIDGE_LOCK(sc); @@ -4057,17 +4021,23 @@ bridge_member_output(struct ifnet *ifp, struct mbuf *m, struct sockaddr *sa, if (dst_if == NULL) { struct bridge_iflist *bif; struct mbuf *mc; - int error = 0, used = 0; + int used = 0; + errno_t error; + bridge_span(sc, m); BRIDGE_LOCK2REF(sc, error); - if (error) { + if (error != 0) { m_freem(m); - return 0; + return error; } TAILQ_FOREACH(bif, &sc->sc_iflist, bif_next) { + /* skip interface with inactive link status */ + if ((bif->bif_flags & BIFF_MEDIA_ACTIVE) == 0) { + continue; + } dst_if = bif->bif_ifp; if (dst_if->if_type == IFT_GIF) { @@ -4087,26 +4057,25 @@ bridge_member_output(struct ifnet *ifp, struct mbuf *m, struct sockaddr *sa, bif->bif_stp.bp_state == BSTP_IFSTATE_DISCARDING) { continue; } - - if (LIST_NEXT(bif, bif_next) == NULL) { + if (TAILQ_NEXT(bif, bif_next) == NULL) { used = 1; mc = m; } else { - mc = m_copypacket(m, M_DONTWAIT); + mc = m_dup(m, M_DONTWAIT); if (mc == NULL) { (void) ifnet_stat_increment_out( sc->sc_ifp, 0, 0, 1); continue; } } - - (void) bridge_enqueue(sc, dst_if, mc); + (void) bridge_enqueue(sc, ifp, dst_if, mc, + kChecksumOperationCompute); } if (used == 0) { m_freem(m); } BRIDGE_UNREF(sc); - return 0; + return EJUSTRETURN; } sendunicast: @@ -4118,14 +4087,18 @@ sendunicast: if ((dst_if->if_flags & IFF_RUNNING) == 0) { m_freem(m); BRIDGE_UNLOCK(sc); - return 0; + return EJUSTRETURN; } BRIDGE_UNLOCK(sc); - (void) bridge_enqueue(sc, dst_if, m); - return 0; + if (dst_if == ifp) { + /* just let the packet continue on its way */ + return 0; + } + (void) bridge_enqueue(sc, ifp, dst_if, m, + kChecksumOperationCompute); + return EJUSTRETURN; } -#endif /* BRIDGE_MEMBER_OUT_FILTER */ /* * Output callback. @@ -4145,7 +4118,6 @@ bridge_output(struct ifnet *ifp, struct mbuf *m) dst_if = NULL; BRIDGE_LOCK(sc); - ASSERT(bridge_in_bsd_mode(sc)); if (!(m->m_flags & (M_BCAST | M_MCAST))) { dst_if = bridge_rtlookup(sc, eh->ether_dhost, 0); @@ -4161,10 +4133,11 @@ bridge_output(struct ifnet *ifp, struct mbuf *m) if (dst_if == NULL) { /* callee will unlock */ - bridge_broadcast(sc, ifp, m, 0); + bridge_broadcast(sc, NULL, m, 0); } else { BRIDGE_UNLOCK(sc); - error = bridge_enqueue(sc, dst_if, m); + error = bridge_enqueue(sc, NULL, dst_if, m, + kChecksumOperationFinalize); } return error; @@ -4176,6 +4149,7 @@ bridge_finalize_cksum(struct ifnet *ifp, struct mbuf *m) struct ether_header *eh = mtod(m, struct ether_header *); uint32_t sw_csum, hwcap; + if (ifp != NULL) { hwcap = (ifp->if_hwassist | CSUM_DATA_VALID); } else { @@ -4277,7 +4251,6 @@ bridge_forward(struct bridge_softc *sc, struct bridge_iflist *sbif, int error; BRIDGE_LOCK_ASSERT_HELD(sc); - ASSERT(bridge_in_bsd_mode(sc)); #if BRIDGE_DEBUG if (if_bridge_debug & BR_DBGF_OUTPUT) { @@ -4329,11 +4302,14 @@ bridge_forward(struct bridge_softc *sc, struct bridge_iflist *sbif, * "this" side of the bridge, drop it. */ if ((m->m_flags & (M_BCAST | M_MCAST)) == 0) { + /* unicast */ dst_if = bridge_rtlookup(sc, dst, vlan); if (src_if == dst_if) { goto drop; } } else { + /* broadcast/multicast */ + /* * Check if its a reserved multicast address, any address * listed in 802.1D section 7.12.6 may not be forwarded by the @@ -4390,6 +4366,9 @@ bridge_forward(struct bridge_softc *sc, struct bridge_iflist *sbif, return; } + /* + * Unicast. + */ /* * At this point, we're dealing with a unicast frame * going to a different interface. @@ -4438,7 +4417,14 @@ bridge_forward(struct bridge_softc *sc, struct bridge_iflist *sbif, } #endif /* PFIL_HOOKS */ - (void) bridge_enqueue(sc, dst_if, m); + /* + * This is an inbound packet where the checksum + * (if applicable) is already present/valid. Since + * we are just doing layer 2 forwarding (not IP + * forwarding), there's no need to validate the checksum. + * Clear the checksum offload flags and send it along. + */ + (void) bridge_enqueue(sc, NULL, dst_if, m, kChecksumOperationClear); return; drop: @@ -4478,7 +4464,6 @@ bridge_input(struct ifnet *ifp, struct mbuf *m, void *frame_header) uint16_t vlan; int error; - ASSERT(bridge_in_bsd_mode(sc)); #if BRIDGE_DEBUG if (if_bridge_debug & BR_DBGF_INPUT) { printf("%s: %s from %s m 0x%llx data 0x%llx\n", __func__, @@ -4695,30 +4680,40 @@ bridge_input(struct ifnet *ifp, struct mbuf *m, void *frame_header) if ((iface)->if_type == IFT_GIF) \ continue; \ /* It is destined for us. */ \ - if (memcmp(IF_LLADDR((iface)), eh->ether_dhost, \ + if (memcmp(IF_LLADDR((iface)), eh->ether_dhost, \ ETHER_ADDR_LEN) == 0 || CARP_CHECK_WE_ARE_DST((iface))) { \ if ((iface)->if_type == IFT_BRIDGE) { \ BRIDGE_BPF_MTAP_INPUT(sc, m); \ /* Filter on the physical interface. */ \ PFIL_PHYS(sc, iface, m); \ + } else { \ + bpf_tap_in(iface, DLT_EN10MB, m, NULL, 0); \ } \ if (bif->bif_ifflags & IFBIF_LEARNING) { \ error = bridge_rtupdate(sc, eh->ether_shost, \ vlan, bif, 0, IFBAF_DYNAMIC); \ if (error && bif->bif_addrmax) { \ BRIDGE_UNLOCK(sc); \ + m_freem(m); \ return (EJUSTRETURN); \ } \ } \ - m->m_pkthdr.rcvif = iface; \ BRIDGE_UNLOCK(sc); \ - return (0); \ + mbuf_pkthdr_setrcvif(m, iface); \ + mbuf_pkthdr_setheader(m, mbuf_data(m)); \ + mbuf_setdata(m, (char *)mbuf_data(m) + ETHER_HDR_LEN, \ + mbuf_len(m) - ETHER_HDR_LEN); \ + mbuf_pkthdr_adjustlen(m, -ETHER_HDR_LEN); \ + m->m_flags |= M_PROTO1; /* set to avoid loops */ \ + dlil_input_packet_list(iface, m); \ + return (EJUSTRETURN); \ } \ \ /* We just received a packet that we sent out. */ \ - if (memcmp(IF_LLADDR((iface)), eh->ether_shost, \ + if (memcmp(IF_LLADDR((iface)), eh->ether_shost, \ ETHER_ADDR_LEN) == 0 || CARP_CHECK_WE_ARE_SRC((iface))) { \ BRIDGE_UNLOCK(sc); \ + m_freem(m); \ return (EJUSTRETURN); \ } @@ -4787,13 +4782,16 @@ bridge_input(struct ifnet *ifp, struct mbuf *m, void *frame_header) printf("%s: not forwarding packet bound for member " "interface\n", __func__); #endif + BRIDGE_UNLOCK(sc); return 0; } - /* Now check the all bridge members. */ + /* Now check the remaining bridge members. */ TAILQ_FOREACH(bif2, &sc->sc_iflist, bif_next) { - GRAB_OUR_PACKETS(bif2->bif_ifp) + if (bif2->bif_ifp != ifp) { + GRAB_OUR_PACKETS(bif2->bif_ifp); + } } #undef CARP_CHECK_WE_ARE_DST @@ -4828,10 +4826,25 @@ bridge_broadcast(struct bridge_softc *sc, struct ifnet *src_if, #endif struct bridge_iflist *dbif, *sbif; struct mbuf *mc; + struct mbuf *mc_in; struct ifnet *dst_if; int error = 0, used = 0; + boolean_t is_output; + ChecksumOperation cksum_op; - sbif = bridge_lookup_member_if(sc, src_if); + if (src_if != NULL) { + is_output = FALSE; + cksum_op = kChecksumOperationClear; + sbif = bridge_lookup_member_if(sc, src_if); + } else { + /* + * src_if is NULL when the bridge interface calls + * bridge_broadcast(). + */ + is_output = TRUE; + cksum_op = kChecksumOperationFinalize; + sbif = NULL; + } BRIDGE_LOCK2REF(sc, error); if (error) { @@ -4854,11 +4867,12 @@ bridge_broadcast(struct bridge_softc *sc, struct ifnet *src_if, TAILQ_FOREACH(dbif, &sc->sc_iflist, bif_next) { dst_if = dbif->bif_ifp; if (dst_if == src_if) { + /* skip the interface that the packet came in on */ continue; } /* Private segments can not talk to each other */ - if (sbif && + if (sbif != NULL && (sbif->bif_ifflags & dbif->bif_ifflags & IFBIF_PRIVATE)) { continue; } @@ -4893,6 +4907,18 @@ bridge_broadcast(struct bridge_softc *sc, struct ifnet *src_if, } } + /* + * If broadcast input is enabled, do so only if this + * is an input packet. + */ + if (!is_output && + (dbif->bif_flags & BIFF_INPUT_BROADCAST) != 0) { + mc_in = m_dup(mc, M_DONTWAIT); + /* this could fail, but we continue anyways */ + } else { + mc_in = NULL; + } + #ifdef PFIL_HOOKS /* * Filter on the output interface. Pass a NULL bridge interface @@ -4908,19 +4934,42 @@ bridge_broadcast(struct bridge_softc *sc, struct ifnet *src_if, if (mc == NULL) { (void) ifnet_stat_increment_out( sc->sc_ifp, 0, 0, 1); + if (mc_in != NULL) { + m_freem(mc_in); + } continue; } } if (bridge_pfil(&mc, NULL, dst_if, PFIL_OUT) != 0) { + if (mc_in != NULL) { + m_freem(mc_in); + } continue; } if (mc == NULL) { + if (mc_in != NULL) { + m_freem(mc_in); + } continue; } } #endif /* PFIL_HOOKS */ - (void) bridge_enqueue(sc, dst_if, mc); + /* out */ + (void) bridge_enqueue(sc, NULL, dst_if, mc, cksum_op); + + /* in */ + if (mc_in == NULL) { + continue; + } + bpf_tap_in(dst_if, DLT_EN10MB, mc_in, NULL, 0); + mbuf_pkthdr_setrcvif(mc_in, dst_if); + mbuf_pkthdr_setheader(mc_in, mbuf_data(mc_in)); + mbuf_setdata(mc_in, (char *)mbuf_data(mc_in) + ETHER_HDR_LEN, + mbuf_len(mc_in) - ETHER_HDR_LEN); + mbuf_pkthdr_adjustlen(mc_in, -ETHER_HDR_LEN); + mc_in->m_flags |= M_PROTO1; /* set to avoid loops */ + dlil_input_packet_list(dst_if, mc_in); } if (used == 0) { m_freem(m); @@ -4963,7 +5012,8 @@ bridge_span(struct bridge_softc *sc, struct mbuf *m) continue; } - (void) bridge_enqueue(sc, dst_if, mc); + (void) bridge_enqueue(sc, NULL, dst_if, mc, + kChecksumOperationNone); } } @@ -4981,7 +5031,6 @@ bridge_rtupdate(struct bridge_softc *sc, const uint8_t *dst, uint16_t vlan, int error; BRIDGE_LOCK_ASSERT_HELD(sc); - ASSERT(bridge_in_bsd_mode(sc)); /* Check the source address is valid and not multicast. */ if (ETHER_IS_MULTICAST(dst) || @@ -5243,8 +5292,6 @@ bridge_rtable_init(struct bridge_softc *sc) { u_int32_t i; - ASSERT(bridge_in_bsd_mode(sc)); - sc->sc_rthash = _MALLOC(sizeof(*sc->sc_rthash) * BRIDGE_RTHASH_SIZE, M_DEVBUF, M_WAITOK | M_ZERO); if (sc->sc_rthash == NULL) { @@ -5465,7 +5512,6 @@ bridge_rtnode_lookup(struct bridge_softc *sc, const uint8_t *addr, int dir; BRIDGE_LOCK_ASSERT_HELD(sc); - ASSERT(bridge_in_bsd_mode(sc)); hash = bridge_rthash(sc, addr); LIST_FOREACH(brt, &sc->sc_rthash[hash], brt_hash) { @@ -6197,7 +6243,6 @@ bridge_set_bpf_tap(ifnet_t ifp, bpf_tap_mode mode, bpf_packet_func bpf_callback) if (sc == NULL || (sc->sc_flags & SCF_DETACHING)) { return ENODEV; } - ASSERT(bridge_in_bsd_mode(sc)); switch (mode) { case BPF_TAP_DISABLE: sc->sc_bpf_input = sc->sc_bpf_output = NULL; @@ -6236,10 +6281,8 @@ bridge_detach(ifnet_t ifp) bstp_detach(&sc->sc_stp); #endif /* BRIDGESTP */ - if (bridge_in_bsd_mode(sc)) { - /* Tear down the routing table. */ - bridge_rtable_fini(sc); - } + /* Tear down the routing table. */ + bridge_rtable_fini(sc); lck_mtx_lock(&bridge_list_mtx); LIST_REMOVE(sc, sc_list); @@ -6261,7 +6304,6 @@ bridge_bpf_input(ifnet_t ifp, struct mbuf *m) { struct bridge_softc *sc = (struct bridge_softc *)ifnet_softc(ifp); - ASSERT(bridge_in_bsd_mode(sc)); if (sc->sc_bpf_input) { if (mbuf_pkthdr_rcvif(m) != ifp) { printf("%s: rcvif: 0x%llx != ifp 0x%llx\n", __func__, @@ -6283,7 +6325,6 @@ bridge_bpf_output(ifnet_t ifp, struct mbuf *m) { struct bridge_softc *sc = (struct bridge_softc *)ifnet_softc(ifp); - ASSERT(bridge_in_bsd_mode(sc)); if (sc->sc_bpf_output) { (*sc->sc_bpf_output)(ifp, m); } @@ -6603,5 +6644,3 @@ done: } return error; } - - diff --git a/bsd/net/if_fake.c b/bsd/net/if_fake.c index 88bbab707..06baffe5e 100644 --- a/bsd/net/if_fake.c +++ b/bsd/net/if_fake.c @@ -81,6 +81,12 @@ #include #include +static boolean_t +is_power_of_two(unsigned int val) +{ + return (val & (val - 1)) == 0; +} + #define FAKE_ETHER_NAME "feth" SYSCTL_DECL(_net_link); @@ -111,6 +117,204 @@ static int if_fake_wmm_mode = 0; SYSCTL_INT(_net_link_fake, OID_AUTO, wmm_mode, CTLFLAG_RW | CTLFLAG_LOCKED, &if_fake_wmm_mode, 0, "Fake interface in 802.11 WMM mode"); +static int if_fake_multibuflet = 0; +SYSCTL_INT(_net_link_fake, OID_AUTO, multibuflet, CTLFLAG_RW | CTLFLAG_LOCKED, + &if_fake_multibuflet, 0, "Fake interface using multi-buflet packets"); + +static int if_fake_copypkt_mode = 0; +SYSCTL_INT(_net_link_fake, OID_AUTO, copypkt_mode, CTLFLAG_RW | CTLFLAG_LOCKED, + &if_fake_copypkt_mode, 0, "Fake interface copying packet to peer"); + +/* sysctl net.link.fake.tx_headroom */ +#define FETH_TX_HEADROOM_MAX 32 +static unsigned int if_fake_tx_headroom = 0; + +static int +feth_tx_headroom_sysctl SYSCTL_HANDLER_ARGS +{ +#pragma unused(oidp, arg1, arg2) + unsigned int new_value; + int changed; + int error; + + error = sysctl_io_number(req, if_fake_tx_headroom, + sizeof(if_fake_tx_headroom), &new_value, &changed); + if (error == 0 && changed != 0) { + if (new_value > FETH_TX_HEADROOM_MAX || + (new_value % 8) != 0) { + return EINVAL; + } + if_fake_tx_headroom = new_value; + } + return 0; +} + +SYSCTL_PROC(_net_link_fake, OID_AUTO, tx_headroom, + CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, + 0, 0, feth_tx_headroom_sysctl, "IU", "Fake ethernet Tx headroom"); + + +/* sysctl net.link.fake.max_mtu */ +#define FETH_MAX_MTU_DEFAULT 2048 +#define FETH_MAX_MTU_MAX ((16 * 1024) - ETHER_HDR_LEN) + +static unsigned int if_fake_max_mtu = FETH_MAX_MTU_DEFAULT; + +/* sysctl net.link.fake.buflet_size */ +#define FETH_BUFLET_SIZE_MIN 512 +#define FETH_BUFLET_SIZE_MAX 2048 + +static unsigned int if_fake_buflet_size = FETH_BUFLET_SIZE_MIN; + +static int +feth_max_mtu_sysctl SYSCTL_HANDLER_ARGS +{ +#pragma unused(oidp, arg1, arg2) + unsigned int new_value; + int changed; + int error; + + error = sysctl_io_number(req, if_fake_max_mtu, + sizeof(if_fake_max_mtu), &new_value, &changed); + if (error == 0 && changed != 0) { + if (new_value > FETH_MAX_MTU_MAX || + new_value < ETHERMTU || + new_value <= if_fake_buflet_size) { + return EINVAL; + } + if_fake_max_mtu = new_value; + } + return 0; +} + +SYSCTL_PROC(_net_link_fake, OID_AUTO, max_mtu, + CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, + 0, 0, feth_max_mtu_sysctl, "IU", "Fake interface maximum MTU"); + +static int +feth_buflet_size_sysctl SYSCTL_HANDLER_ARGS +{ +#pragma unused(oidp, arg1, arg2) + unsigned int new_value; + int changed; + int error; + + error = sysctl_io_number(req, if_fake_buflet_size, + sizeof(if_fake_buflet_size), &new_value, &changed); + if (error == 0 && changed != 0) { + /* must be a power of 2 between min and max */ + if (new_value > FETH_BUFLET_SIZE_MAX || + new_value < FETH_BUFLET_SIZE_MIN || + !is_power_of_two(new_value) || + new_value >= if_fake_max_mtu) { + return EINVAL; + } + if_fake_buflet_size = new_value; + } + return 0; +} + +SYSCTL_PROC(_net_link_fake, OID_AUTO, buflet_size, + CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, + 0, 0, feth_buflet_size_sysctl, "IU", "Fake interface buflet size"); + +static unsigned int if_fake_user_access = 0; + +static int +feth_user_access_sysctl SYSCTL_HANDLER_ARGS +{ +#pragma unused(oidp, arg1, arg2) + unsigned int new_value; + int changed; + int error; + + error = sysctl_io_number(req, if_fake_user_access, + sizeof(if_fake_user_access), &new_value, &changed); + if (error == 0 && changed != 0) { + if (new_value != 0) { + if (new_value != 1) { + return EINVAL; + } + /* + * copypkt mode requires a kernel only buffer pool so + * it is incompatible with user access mode. + */ + if (if_fake_copypkt_mode != 0) { + return ENOTSUP; + } + } + if_fake_user_access = new_value; + } + return 0; +} + +SYSCTL_PROC(_net_link_fake, OID_AUTO, user_access, + CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, + 0, 0, feth_user_access_sysctl, "IU", "Fake interface user access"); + +/* sysctl net.link.fake.if_adv_intvl (unit: millisecond) */ +#define FETH_IF_ADV_INTVL_MIN 10 +#define FETH_IF_ADV_INTVL_MAX INT_MAX + +static int if_fake_if_adv_interval = 0; /* no interface advisory */ +static int +feth_if_adv_interval_sysctl SYSCTL_HANDLER_ARGS +{ +#pragma unused(oidp, arg1, arg2) + unsigned int new_value; + int changed; + int error; + + error = sysctl_io_number(req, if_fake_if_adv_interval, + sizeof(if_fake_if_adv_interval), &new_value, &changed); + if (error == 0 && changed != 0) { + if ((new_value != 0) && (new_value > FETH_IF_ADV_INTVL_MAX || + new_value < FETH_IF_ADV_INTVL_MIN)) { + return EINVAL; + } + if_fake_if_adv_interval = new_value; + } + return 0; +} + +SYSCTL_PROC(_net_link_fake, OID_AUTO, if_adv_intvl, + CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, 0, 0, + feth_if_adv_interval_sysctl, "IU", + "Fake interface will generate interface advisories reports at the specified interval in ms"); + +/* sysctl net.link.fake.tx_drops */ +/* + * Fake ethernet will drop packet on the transmit path at the specified + * rate, i.e drop one in every if_fake_tx_drops number of packets. + */ +#define FETH_TX_DROPS_MIN 0 +#define FETH_TX_DROPS_MAX INT_MAX +static int if_fake_tx_drops = 0; /* no packets are dropped */ +static int +feth_fake_tx_drops_sysctl SYSCTL_HANDLER_ARGS +{ +#pragma unused(oidp, arg1, arg2) + unsigned int new_value; + int changed; + int error; + + error = sysctl_io_number(req, if_fake_tx_drops, + sizeof(if_fake_tx_drops), &new_value, &changed); + if (error == 0 && changed != 0) { + if (new_value > FETH_TX_DROPS_MAX || + new_value < FETH_TX_DROPS_MIN) { + return EINVAL; + } + if_fake_tx_drops = new_value; + } + return 0; +} + +SYSCTL_PROC(_net_link_fake, OID_AUTO, tx_drops, + CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, 0, 0, + feth_fake_tx_drops_sysctl, "IU", + "Fake interface will intermittently drop packets on Tx path"); + /** ** virtual ethernet structures, types **/ @@ -125,6 +329,8 @@ typedef uint16_t iff_flags_t; #define IFF_FLAGS_BSD_MODE 0x0002 #define IFF_FLAGS_DETACHING 0x0004 #define IFF_FLAGS_WMM_MODE 0x0008 +#define IFF_FLAGS_MULTIBUFLETS 0x0010 +#define IFF_FLAGS_COPYPKT_MODE 0x0020 struct if_fake { @@ -139,6 +345,7 @@ struct if_fake { int iff_media_list[IF_FAKE_MEDIA_LIST_MAX]; struct mbuf * iff_pending_tx_packet; boolean_t iff_start_busy; + unsigned int iff_max_mtu; }; typedef struct if_fake * if_fake_ref; @@ -288,12 +495,35 @@ feth_unlock(void) } static inline int -feth_max_mtu(void) +get_max_mtu(int bsd_mode, unsigned int max_mtu) { - if (njcl > 0) { - return M16KCLBYTES - ETHER_HDR_LEN; + unsigned int mtu; + + if (bsd_mode != 0) { + mtu = (njcl > 0) ? (M16KCLBYTES - ETHER_HDR_LEN) + : MBIGCLBYTES - ETHER_HDR_LEN; + if (mtu > max_mtu) { + mtu = max_mtu; + } + } else { + mtu = max_mtu; } - return MBIGCLBYTES - ETHER_HDR_LEN; + return mtu; +} + +static inline unsigned int +feth_max_mtu(ifnet_t ifp) +{ + if_fake_ref fakeif; + unsigned int max_mtu = ETHERMTU; + + feth_lock(); + fakeif = ifnet_get_if_fake(ifp); + if (fakeif != NULL) { + max_mtu = fakeif->iff_max_mtu; + } + feth_unlock(); + return max_mtu; } static void @@ -406,6 +636,7 @@ feth_clone_create(struct if_clone *ifc, u_int32_t unit, __unused void *params) if (if_fake_hwcsum != 0) { fakeif->iff_flags |= IFF_FLAGS_HWCSUM; } + fakeif->iff_max_mtu = get_max_mtu(if_fake_bsd_mode, if_fake_max_mtu); /* use the interface name as the unique id for ifp recycle */ if ((unsigned int) @@ -598,15 +829,19 @@ feth_start(ifnet_t ifp) feth_lock(); fakeif = ifnet_get_if_fake(ifp); + if (fakeif == NULL) { + feth_unlock(); + return; + } + if (fakeif->iff_start_busy) { feth_unlock(); printf("if_fake: start is busy\n"); return; } - if (fakeif != NULL) { - peer = fakeif->iff_peer; - flags = fakeif->iff_flags; - } + + peer = fakeif->iff_peer; + flags = fakeif->iff_flags; /* check for pending TX */ m = fakeif->iff_pending_tx_packet; @@ -888,7 +1123,7 @@ feth_get_drvspec(ifnet_t ifp, u_int32_t cmd, u_int32_t len, break; } feth_lock(); - fakeif = (if_fake_ref)ifnet_softc(ifp); + fakeif = ifnet_get_if_fake(ifp); if (fakeif == NULL) { feth_unlock(); error = EOPNOTSUPP; @@ -941,7 +1176,7 @@ feth_ioctl(ifnet_t ifp, u_long cmd, void * data) case SIOCGIFMEDIA32: case SIOCGIFMEDIA64: feth_lock(); - fakeif = (if_fake_ref)ifnet_softc(ifp); + fakeif = ifnet_get_if_fake(ifp); if (fakeif == NULL) { feth_unlock(); return EOPNOTSUPP; @@ -973,12 +1208,13 @@ feth_ioctl(ifnet_t ifp, u_long cmd, void * data) case SIOCGIFDEVMTU: devmtu_p = &ifr->ifr_devmtu; devmtu_p->ifdm_current = ifnet_mtu(ifp); - devmtu_p->ifdm_max = feth_max_mtu(); + devmtu_p->ifdm_max = feth_max_mtu(ifp); devmtu_p->ifdm_min = IF_MINMTU; break; case SIOCSIFMTU: - if (ifr->ifr_mtu > feth_max_mtu() || ifr->ifr_mtu < IF_MINMTU) { + if ((unsigned int)ifr->ifr_mtu > feth_max_mtu(ifp) || + ifr->ifr_mtu < IF_MINMTU) { error = EINVAL; } else { error = ifnet_set_mtu(ifp, ifr->ifr_mtu); diff --git a/bsd/net/if_headless.c b/bsd/net/if_headless.c new file mode 100644 index 000000000..f7ebb1776 --- /dev/null +++ b/bsd/net/if_headless.c @@ -0,0 +1,32 @@ +/* + * Copyright (c) 2019 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ +void +if_headless_init(void) +{ + /* nothing here */ +} diff --git a/bsd/net/if_ipsec.c b/bsd/net/if_ipsec.c index eb32af709..e967cad2c 100644 --- a/bsd/net/if_ipsec.c +++ b/bsd/net/if_ipsec.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2012-2018 Apple Inc. All rights reserved. + * Copyright (c) 2012-2019 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -54,6 +54,7 @@ #include #include #include +#include #define IPSEC_NEXUS 0 @@ -95,7 +96,6 @@ static errno_t ipsec_proto_pre_output(ifnet_t interface, protocol_family_t proto char *frame_type, char *link_layer_dest); static kern_ctl_ref ipsec_kctlref; -static u_int32_t ipsec_family; static lck_attr_t *ipsec_lck_attr; static lck_grp_attr_t *ipsec_lck_grp_attr; static lck_grp_t *ipsec_lck_grp; @@ -116,12 +116,23 @@ SYSCTL_INT(_net_ipsec, OID_AUTO, verify_interface_creation, CTLFLAG_RW | CTLFLAG #define IPSEC_IF_DEFAULT_RX_FSW_RING_SIZE 128 #define IPSEC_IF_DEFAULT_BUF_SEG_SIZE skmem_usr_buf_seg_size -#define IPSEC_IF_MIN_RING_SIZE 16 +#define IPSEC_IF_WMM_RING_COUNT NEXUS_NUM_WMM_QUEUES +#define IPSEC_IF_MAX_RING_COUNT IPSEC_IF_WMM_RING_COUNT +#define IPSEC_NETIF_WMM_TX_RING_COUNT IPSEC_IF_WMM_RING_COUNT +#define IPSEC_NETIF_WMM_RX_RING_COUNT 1 +#define IPSEC_NETIF_MAX_TX_RING_COUNT IPSEC_NETIF_WMM_TX_RING_COUNT +#define IPSEC_NETIF_MAX_RX_RING_COUNT IPSEC_NETIF_WMM_RX_RING_COUNT + +#define IPSEC_IF_MIN_RING_SIZE 8 #define IPSEC_IF_MAX_RING_SIZE 1024 #define IPSEC_IF_MIN_SLOT_SIZE 1024 #define IPSEC_IF_MAX_SLOT_SIZE 4096 +#define IPSEC_DEFAULT_MAX_PENDING_INPUT_COUNT 512 + +static int if_ipsec_max_pending_input = IPSEC_DEFAULT_MAX_PENDING_INPUT_COUNT; + static int sysctl_if_ipsec_ring_size SYSCTL_HANDLER_ARGS; static int sysctl_if_ipsec_tx_fsw_ring_size SYSCTL_HANDLER_ARGS; static int sysctl_if_ipsec_rx_fsw_ring_size SYSCTL_HANDLER_ARGS; @@ -130,6 +141,7 @@ static int if_ipsec_ring_size = IPSEC_IF_DEFAULT_RING_SIZE; static int if_ipsec_tx_fsw_ring_size = IPSEC_IF_DEFAULT_TX_FSW_RING_SIZE; static int if_ipsec_rx_fsw_ring_size = IPSEC_IF_DEFAULT_RX_FSW_RING_SIZE; +SYSCTL_INT(_net_ipsec, OID_AUTO, max_pending_input, CTLFLAG_LOCKED | CTLFLAG_RW, &if_ipsec_max_pending_input, 0, ""); SYSCTL_PROC(_net_ipsec, OID_AUTO, ring_size, CTLTYPE_INT | CTLFLAG_LOCKED | CTLFLAG_RW, &if_ipsec_ring_size, IPSEC_IF_DEFAULT_RING_SIZE, &sysctl_if_ipsec_ring_size, "I", ""); SYSCTL_PROC(_net_ipsec, OID_AUTO, tx_fsw_ring_size, CTLTYPE_INT | CTLFLAG_LOCKED | CTLFLAG_RW, @@ -137,17 +149,20 @@ SYSCTL_PROC(_net_ipsec, OID_AUTO, tx_fsw_ring_size, CTLTYPE_INT | CTLFLAG_LOCKED SYSCTL_PROC(_net_ipsec, OID_AUTO, rx_fsw_ring_size, CTLTYPE_INT | CTLFLAG_LOCKED | CTLFLAG_RW, &if_ipsec_rx_fsw_ring_size, IPSEC_IF_DEFAULT_RX_FSW_RING_SIZE, &sysctl_if_ipsec_rx_fsw_ring_size, "I", ""); +static int if_ipsec_debug = 0; +SYSCTL_INT(_net_ipsec, OID_AUTO, debug, CTLFLAG_LOCKED | CTLFLAG_RW, &if_ipsec_debug, 0, ""); + static errno_t ipsec_register_nexus(void); typedef struct ipsec_nx { uuid_t if_provider; uuid_t if_instance; - uuid_t ms_provider; - uuid_t ms_instance; - uuid_t ms_device; - uuid_t ms_host; - uuid_t ms_agent; + uuid_t fsw_provider; + uuid_t fsw_instance; + uuid_t fsw_device; + uuid_t fsw_host; + uuid_t fsw_agent; } *ipsec_nx_t; static nexus_controller_t ipsec_ncd; @@ -160,47 +175,97 @@ static uuid_t ipsec_kpipe_uuid; struct ipsec_pcb { TAILQ_ENTRY(ipsec_pcb) ipsec_chain; kern_ctl_ref ipsec_ctlref; - ifnet_t ipsec_ifp; - u_int32_t ipsec_unit; - u_int32_t ipsec_unique_id; - u_int32_t ipsec_flags; - u_int32_t ipsec_input_frag_size; - bool ipsec_frag_size_set; - int ipsec_ext_ifdata_stats; + ifnet_t ipsec_ifp; + u_int32_t ipsec_unit; + u_int32_t ipsec_unique_id; + // These external flags can be set with IPSEC_OPT_FLAGS + u_int32_t ipsec_external_flags; + // These internal flags are only used within this driver + u_int32_t ipsec_internal_flags; + u_int32_t ipsec_input_frag_size; + bool ipsec_frag_size_set; + int ipsec_ext_ifdata_stats; mbuf_svc_class_t ipsec_output_service_class; - char ipsec_if_xname[IFXNAMSIZ]; - char ipsec_unique_name[IFXNAMSIZ]; - // PCB lock protects state fields, like ipsec_kpipe_enabled + char ipsec_if_xname[IFXNAMSIZ]; + char ipsec_unique_name[IFXNAMSIZ]; + // PCB lock protects state fields, like ipsec_kpipe_count decl_lck_rw_data(, ipsec_pcb_lock); + // lock to protect ipsec_pcb_data_move & ipsec_pcb_drainers + decl_lck_mtx_data(, ipsec_pcb_data_move_lock); + u_int32_t ipsec_pcb_data_move; /* number of data moving contexts */ + u_int32_t ipsec_pcb_drainers; /* number of threads waiting to drain */ + u_int32_t ipsec_pcb_data_path_state; /* internal state of interface data path */ #if IPSEC_NEXUS - lck_mtx_t ipsec_input_chain_lock; + lck_mtx_t ipsec_input_chain_lock; + lck_mtx_t ipsec_kpipe_encrypt_lock; + lck_mtx_t ipsec_kpipe_decrypt_lock; struct mbuf * ipsec_input_chain; struct mbuf * ipsec_input_chain_last; + u_int32_t ipsec_input_chain_count; // Input chain lock protects the list of input mbufs // The input chain lock must be taken AFTER the PCB lock if both are held struct ipsec_nx ipsec_nx; - int ipsec_kpipe_enabled; - uuid_t ipsec_kpipe_uuid; - void * ipsec_kpipe_rxring; - void * ipsec_kpipe_txring; - kern_pbufpool_t ipsec_kpipe_pp; + u_int32_t ipsec_kpipe_count; + pid_t ipsec_kpipe_pid; + uuid_t ipsec_kpipe_uuid[IPSEC_IF_MAX_RING_COUNT]; + void * ipsec_kpipe_rxring[IPSEC_IF_MAX_RING_COUNT]; + void * ipsec_kpipe_txring[IPSEC_IF_MAX_RING_COUNT]; + kern_pbufpool_t ipsec_kpipe_pp; + u_int32_t ipsec_kpipe_tx_ring_size; + u_int32_t ipsec_kpipe_rx_ring_size; kern_nexus_t ipsec_netif_nexus; - kern_pbufpool_t ipsec_netif_pp; - void * ipsec_netif_rxring; - void * ipsec_netif_txring; - uint64_t ipsec_netif_txring_size; - - u_int32_t ipsec_slot_size; - u_int32_t ipsec_netif_ring_size; - u_int32_t ipsec_tx_fsw_ring_size; - u_int32_t ipsec_rx_fsw_ring_size; - bool ipsec_use_netif; - bool ipsec_needs_netagent; + kern_pbufpool_t ipsec_netif_pp; + void * ipsec_netif_rxring[IPSEC_NETIF_MAX_RX_RING_COUNT]; + void * ipsec_netif_txring[IPSEC_NETIF_MAX_TX_RING_COUNT]; + uint64_t ipsec_netif_txring_size; + + u_int32_t ipsec_slot_size; + u_int32_t ipsec_netif_ring_size; + u_int32_t ipsec_tx_fsw_ring_size; + u_int32_t ipsec_rx_fsw_ring_size; + bool ipsec_use_netif; + bool ipsec_needs_netagent; #endif // IPSEC_NEXUS }; +/* These are internal flags not exposed outside this file */ +#define IPSEC_FLAGS_KPIPE_ALLOCATED 1 + +/* data movement refcounting functions */ +static boolean_t ipsec_data_move_begin(struct ipsec_pcb *pcb); +static void ipsec_data_move_end(struct ipsec_pcb *pcb); +static void ipsec_wait_data_move_drain(struct ipsec_pcb *pcb); + +/* Data path states */ +#define IPSEC_PCB_DATA_PATH_READY 0x1 + +/* Macros to set/clear/test data path states */ +#define IPSEC_SET_DATA_PATH_READY(_pcb) ((_pcb)->ipsec_pcb_data_path_state |= IPSEC_PCB_DATA_PATH_READY) +#define IPSEC_CLR_DATA_PATH_READY(_pcb) ((_pcb)->ipsec_pcb_data_path_state &= ~IPSEC_PCB_DATA_PATH_READY) +#define IPSEC_IS_DATA_PATH_READY(_pcb) (((_pcb)->ipsec_pcb_data_path_state & IPSEC_PCB_DATA_PATH_READY) != 0) + +#if IPSEC_NEXUS +/* Macros to clear/set/test flags. */ +static inline void +ipsec_flag_set(struct ipsec_pcb *pcb, uint32_t flag) +{ + pcb->ipsec_internal_flags |= flag; +} +static inline void +ipsec_flag_clr(struct ipsec_pcb *pcb, uint32_t flag) +{ + pcb->ipsec_internal_flags &= ~flag; +} + +static inline bool +ipsec_flag_isset(struct ipsec_pcb *pcb, uint32_t flag) +{ + return !!(pcb->ipsec_internal_flags & flag); +} +#endif // IPSEC_NEXUS + TAILQ_HEAD(ipsec_list, ipsec_pcb) ipsec_head; #define IPSEC_PCB_ZONE_MAX 32 @@ -274,6 +339,14 @@ sysctl_if_ipsec_rx_fsw_ring_size SYSCTL_HANDLER_ARGS return 0; } + + +static inline bool +ipsec_in_wmm_mode(struct ipsec_pcb *pcb) +{ + return pcb->ipsec_kpipe_count == IPSEC_IF_WMM_RING_COUNT; +} + #endif // IPSEC_NEXUS errno_t @@ -282,19 +355,12 @@ ipsec_register_control(void) struct kern_ctl_reg kern_ctl; errno_t result = 0; - /* Find a unique value for our interface family */ - result = mbuf_tag_id_find(IPSEC_CONTROL_NAME, &ipsec_family); - if (result != 0) { - printf("ipsec_register_control - mbuf_tag_id_find_internal failed: %d\n", result); - return result; - } - ipsec_pcb_size = sizeof(struct ipsec_pcb); ipsec_pcb_zone = zinit(ipsec_pcb_size, IPSEC_PCB_ZONE_MAX * ipsec_pcb_size, 0, IPSEC_PCB_ZONE_NAME); if (ipsec_pcb_zone == NULL) { - printf("ipsec_register_control - zinit(ipsec_pcb) failed"); + os_log_error(OS_LOG_DEFAULT, "ipsec_register_control - zinit(ipsec_pcb) failed"); return ENOMEM; } @@ -319,26 +385,26 @@ ipsec_register_control(void) result = ctl_register(&kern_ctl, &ipsec_kctlref); if (result != 0) { - printf("ipsec_register_control - ctl_register failed: %d\n", result); + os_log_error(OS_LOG_DEFAULT, "ipsec_register_control - ctl_register failed: %d\n", result); return result; } /* Register the protocol plumbers */ - if ((result = proto_register_plumber(PF_INET, ipsec_family, + if ((result = proto_register_plumber(PF_INET, IFNET_FAMILY_IPSEC, ipsec_attach_proto, NULL)) != 0) { - printf("ipsec_register_control - proto_register_plumber(PF_INET, %d) failed: %d\n", - ipsec_family, result); + os_log_error(OS_LOG_DEFAULT, "ipsec_register_control - proto_register_plumber(PF_INET, IFNET_FAMILY_IPSEC) failed: %d\n", + result); ctl_deregister(ipsec_kctlref); return result; } /* Register the protocol plumbers */ - if ((result = proto_register_plumber(PF_INET6, ipsec_family, + if ((result = proto_register_plumber(PF_INET6, IFNET_FAMILY_IPSEC, ipsec_attach_proto, NULL)) != 0) { - proto_unregister_plumber(PF_INET, ipsec_family); + proto_unregister_plumber(PF_INET, IFNET_FAMILY_IPSEC); ctl_deregister(ipsec_kctlref); - printf("ipsec_register_control - proto_register_plumber(PF_INET6, %d) failed: %d\n", - ipsec_family, result); + os_log_error(OS_LOG_DEFAULT, "ipsec_register_control - proto_register_plumber(PF_INET6, IFNET_FAMILY_IPSEC) failed: %d\n", + result); return result; } @@ -449,7 +515,7 @@ ipsec_register_nexus(void) &dp_init, sizeof(dp_init), &ipsec_nx_dom_prov); if (err != 0) { - printf("%s: failed to register domain provider\n", __func__); + os_log_error(OS_LOG_DEFAULT, "%s: failed to register domain provider\n", __func__); return err; } return 0; @@ -480,6 +546,12 @@ ipsec_nexus_connected(kern_nexus_provider_t nxprov, kern_nexus_t nexus, #pragma unused(nxprov, channel) struct ipsec_pcb *pcb = kern_nexus_get_context(nexus); boolean_t ok = ifnet_is_attached(pcb->ipsec_ifp, 1); + /* Mark the data path as ready */ + if (ok) { + lck_mtx_lock(&pcb->ipsec_pcb_data_move_lock); + IPSEC_SET_DATA_PATH_READY(pcb); + lck_mtx_unlock(&pcb->ipsec_pcb_data_move_lock); + } return ok ? 0 : ENXIO; } @@ -487,14 +559,24 @@ static void ipsec_nexus_pre_disconnect(kern_nexus_provider_t nxprov, kern_nexus_t nexus, kern_channel_t channel) { -#pragma unused(nxprov, nexus, channel) +#pragma unused(nxprov, channel) + struct ipsec_pcb *pcb = kern_nexus_get_context(nexus); + + VERIFY(pcb->ipsec_kpipe_count != 0); + + /* Wait until all threads in the data paths are done. */ + ipsec_wait_data_move_drain(pcb); } static void ipsec_netif_pre_disconnect(kern_nexus_provider_t nxprov, kern_nexus_t nexus, kern_channel_t channel) { -#pragma unused(nxprov, nexus, channel) +#pragma unused(nxprov, channel) + struct ipsec_pcb *pcb = kern_nexus_get_context(nexus); + + /* Wait until all threads in the data paths are done. */ + ipsec_wait_data_move_drain(pcb); } static void @@ -516,14 +598,30 @@ ipsec_kpipe_ring_init(kern_nexus_provider_t nxprov, kern_nexus_t nexus, { #pragma unused(nxprov) #pragma unused(channel) -#pragma unused(ring_ctx) struct ipsec_pcb *pcb = kern_nexus_get_context(nexus); + uint8_t ring_idx; + + for (ring_idx = 0; ring_idx < pcb->ipsec_kpipe_count; ring_idx++) { + if (!uuid_compare(channel->ch_info->cinfo_nx_uuid, pcb->ipsec_kpipe_uuid[ring_idx])) { + break; + } + } + + if (ring_idx == pcb->ipsec_kpipe_count) { + uuid_string_t uuidstr; + uuid_unparse(channel->ch_info->cinfo_nx_uuid, uuidstr); + os_log_error(OS_LOG_DEFAULT, "%s: %s cannot find channel %s\n", __func__, pcb->ipsec_if_xname, uuidstr); + return ENOENT; + } + + *ring_ctx = (void *)(uintptr_t)ring_idx; + if (!is_tx_ring) { - VERIFY(pcb->ipsec_kpipe_rxring == NULL); - pcb->ipsec_kpipe_rxring = ring; + VERIFY(pcb->ipsec_kpipe_rxring[ring_idx] == NULL); + pcb->ipsec_kpipe_rxring[ring_idx] = ring; } else { - VERIFY(pcb->ipsec_kpipe_txring == NULL); - pcb->ipsec_kpipe_txring = ring; + VERIFY(pcb->ipsec_kpipe_txring[ring_idx] == NULL); + pcb->ipsec_kpipe_txring[ring_idx] = ring; } return 0; } @@ -533,12 +631,19 @@ ipsec_kpipe_ring_fini(kern_nexus_provider_t nxprov, kern_nexus_t nexus, kern_channel_ring_t ring) { #pragma unused(nxprov) + bool found = false; struct ipsec_pcb *pcb = kern_nexus_get_context(nexus); - if (pcb->ipsec_kpipe_rxring == ring) { - pcb->ipsec_kpipe_rxring = NULL; - } else if (pcb->ipsec_kpipe_txring == ring) { - pcb->ipsec_kpipe_txring = NULL; + + for (unsigned int i = 0; i < pcb->ipsec_kpipe_count; i++) { + if (pcb->ipsec_kpipe_rxring[i] == ring) { + pcb->ipsec_kpipe_rxring[i] = NULL; + found = true; + } else if (pcb->ipsec_kpipe_txring[i] == ring) { + pcb->ipsec_kpipe_txring[i] = NULL; + found = true; + } } + VERIFY(found); } static errno_t @@ -549,27 +654,38 @@ ipsec_kpipe_sync_tx(kern_nexus_provider_t nxprov, kern_nexus_t nexus, #pragma unused(flags) struct ipsec_pcb *pcb = kern_nexus_get_context(nexus); + if (!ipsec_data_move_begin(pcb)) { + os_log_info(OS_LOG_DEFAULT, "%s: data path stopped for %s\n", __func__, if_name(pcb->ipsec_ifp)); + return 0; + } + lck_rw_lock_shared(&pcb->ipsec_pcb_lock); - int channel_enabled = pcb->ipsec_kpipe_enabled; - if (!channel_enabled) { + + if (!ipsec_flag_isset(pcb, IPSEC_FLAGS_KPIPE_ALLOCATED)) { lck_rw_unlock_shared(&pcb->ipsec_pcb_lock); + ipsec_data_move_end(pcb); return 0; } + VERIFY(pcb->ipsec_kpipe_count); + kern_channel_slot_t tx_slot = kern_channel_get_next_slot(tx_ring, NULL, NULL); if (tx_slot == NULL) { // Nothing to write, bail lck_rw_unlock_shared(&pcb->ipsec_pcb_lock); + ipsec_data_move_end(pcb); return 0; } // Signal the netif ring to read - kern_channel_ring_t rx_ring = pcb->ipsec_netif_rxring; + kern_channel_ring_t rx_ring = pcb->ipsec_netif_rxring[0]; lck_rw_unlock_shared(&pcb->ipsec_pcb_lock); if (rx_ring != NULL) { kern_channel_notify(rx_ring, 0); } + + ipsec_data_move_end(pcb); return 0; } @@ -613,7 +729,7 @@ ipsec_encrypt_mbuf(ifnet_t interface, data = ipsec_state.m; if (error || data == NULL) { if (error) { - printf("ipsec_encrypt_mbuf: ipsec4_output error %d\n", error); + os_log_error(OS_LOG_DEFAULT, "ipsec_encrypt_mbuf: ipsec4_output error %d\n", error); } goto ipsec_output_err; } @@ -624,7 +740,7 @@ ipsec_encrypt_mbuf(ifnet_t interface, data = ipsec6_splithdr(data); if (data == NULL) { - printf("ipsec_encrypt_mbuf: ipsec6_splithdr returned NULL\n"); + os_log_error(OS_LOG_DEFAULT, "ipsec_encrypt_mbuf: ipsec6_splithdr returned NULL\n"); goto ipsec_output_err; } @@ -645,14 +761,14 @@ ipsec_encrypt_mbuf(ifnet_t interface, data = ipsec_state.m; if (error || data == NULL) { if (error) { - printf("ipsec_encrypt_mbuf: ipsec6_output error %d\n", error); + os_log_error(OS_LOG_DEFAULT, "ipsec_encrypt_mbuf: ipsec6_output error %d\n", error); } goto ipsec_output_err; } goto done; } default: { - printf("ipsec_encrypt_mbuf: Received unknown packet version %d\n", ip_version); + os_log_error(OS_LOG_DEFAULT, "ipsec_encrypt_mbuf: Received unknown packet version %d\n", ip_version); error = -1; goto ipsec_output_err; } @@ -676,28 +792,43 @@ ipsec_kpipe_sync_rx(kern_nexus_provider_t nxprov, kern_nexus_t nexus, #pragma unused(flags) struct ipsec_pcb *pcb = kern_nexus_get_context(nexus); struct kern_channel_ring_stat_increment rx_ring_stats; + uint8_t ring_idx = (uint8_t)(uintptr_t)kern_channel_ring_get_context(rx_ring); + + if (!ipsec_data_move_begin(pcb)) { + os_log_error(OS_LOG_DEFAULT, "%s: data path stopped for %s\n", __func__, if_name(pcb->ipsec_ifp)); + return 0; + } lck_rw_lock_shared(&pcb->ipsec_pcb_lock); - int channel_enabled = pcb->ipsec_kpipe_enabled; - if (!channel_enabled) { + if (!ipsec_flag_isset(pcb, IPSEC_FLAGS_KPIPE_ALLOCATED)) { lck_rw_unlock_shared(&pcb->ipsec_pcb_lock); + ipsec_data_move_end(pcb); return 0; } + VERIFY(pcb->ipsec_kpipe_count); + VERIFY(ring_idx <= pcb->ipsec_kpipe_count); + // Reclaim user-released slots (void) kern_channel_reclaim(rx_ring); uint32_t avail = kern_channel_available_slot_count(rx_ring); if (avail == 0) { lck_rw_unlock_shared(&pcb->ipsec_pcb_lock); + os_log_error(OS_LOG_DEFAULT, "%s: %s ring %s index %d no room in rx_ring\n", __func__, + pcb->ipsec_if_xname, rx_ring->ckr_name, ring_idx); + ipsec_data_move_end(pcb); return 0; } - kern_channel_ring_t tx_ring = pcb->ipsec_netif_txring; + kern_channel_ring_t tx_ring = pcb->ipsec_netif_txring[ring_idx]; if (tx_ring == NULL) { // Net-If TX ring not set up yet, nothing to read lck_rw_unlock_shared(&pcb->ipsec_pcb_lock); + os_log_error(OS_LOG_DEFAULT, "%s: %s ring %s index %d bad netif_txring 1\n", __func__, + pcb->ipsec_if_xname, rx_ring->ckr_name, ring_idx); + ipsec_data_move_end(pcb); return 0; } @@ -710,15 +841,17 @@ ipsec_kpipe_sync_rx(kern_nexus_provider_t nxprov, kern_nexus_t nexus, // Lock again after entering and validate lck_rw_lock_shared(&pcb->ipsec_pcb_lock); - if (tx_ring != pcb->ipsec_netif_txring) { + if (tx_ring != pcb->ipsec_netif_txring[ring_idx]) { // Ring no longer valid // Unlock first, then exit ring lck_rw_unlock_shared(&pcb->ipsec_pcb_lock); kr_exit(tx_ring); + os_log_error(OS_LOG_DEFAULT, "%s: %s ring %s index %d bad netif_txring 2\n", __func__, + pcb->ipsec_if_xname, rx_ring->ckr_name, ring_idx); + ipsec_data_move_end(pcb); return 0; } - struct kern_channel_ring_stat_increment tx_ring_stats; bzero(&tx_ring_stats, sizeof(tx_ring_stats)); kern_channel_slot_t tx_pslot = NULL; @@ -728,6 +861,7 @@ ipsec_kpipe_sync_rx(kern_nexus_provider_t nxprov, kern_nexus_t nexus, // Unlock first, then exit ring lck_rw_unlock_shared(&pcb->ipsec_pcb_lock); kr_exit(tx_ring); + ipsec_data_move_end(pcb); return 0; } @@ -746,7 +880,7 @@ ipsec_kpipe_sync_rx(kern_nexus_provider_t nxprov, kern_nexus_t nexus, kern_packet_t rx_ph = 0; error = kern_pbufpool_alloc_nosleep(rx_pp, 1, &rx_ph); if (__improbable(error != 0)) { - printf("ipsec_kpipe_sync_rx %s: failed to allocate packet\n", + os_log_error(OS_LOG_DEFAULT, "ipsec_kpipe_sync_rx %s: failed to allocate packet\n", pcb->ipsec_ifp->if_xname); break; } @@ -783,27 +917,29 @@ ipsec_kpipe_sync_rx(kern_nexus_provider_t nxprov, kern_nexus_t nexus, error = mbuf_copyback(data, 0, length, tx_baddr, MBUF_DONTWAIT); if (error == 0) { // Encrypt and send packet + lck_mtx_lock(&pcb->ipsec_kpipe_encrypt_lock); data = ipsec_encrypt_mbuf(pcb->ipsec_ifp, data); + lck_mtx_unlock(&pcb->ipsec_kpipe_encrypt_lock); } else { - printf("ipsec_kpipe_sync_rx %s - mbuf_copyback(%zu) error %d\n", pcb->ipsec_ifp->if_xname, length, error); - STATS_INC(nifs, NETIF_STATS_NOMEM_MBUF); - STATS_INC(nifs, NETIF_STATS_DROPPED); + os_log_error(OS_LOG_DEFAULT, "ipsec_kpipe_sync_rx %s - mbuf_copyback(%zu) error %d\n", pcb->ipsec_ifp->if_xname, length, error); + STATS_INC(nifs, NETIF_STATS_DROP_NOMEM_MBUF); + STATS_INC(nifs, NETIF_STATS_DROP); mbuf_freem(data); data = NULL; } } else { - printf("ipsec_kpipe_sync_rx %s - mbuf_gethdr error %d\n", pcb->ipsec_ifp->if_xname, error); - STATS_INC(nifs, NETIF_STATS_NOMEM_MBUF); - STATS_INC(nifs, NETIF_STATS_DROPPED); + os_log_error(OS_LOG_DEFAULT, "ipsec_kpipe_sync_rx %s - mbuf_gethdr error %d\n", pcb->ipsec_ifp->if_xname, error); + STATS_INC(nifs, NETIF_STATS_DROP_NOMEM_MBUF); + STATS_INC(nifs, NETIF_STATS_DROP); } } else { - printf("ipsec_kpipe_sync_rx %s - 0 length packet\n", pcb->ipsec_ifp->if_xname); - STATS_INC(nifs, NETIF_STATS_BADLEN); - STATS_INC(nifs, NETIF_STATS_DROPPED); + os_log_error(OS_LOG_DEFAULT, "ipsec_kpipe_sync_rx %s - 0 length packet\n", pcb->ipsec_ifp->if_xname); + STATS_INC(nifs, NETIF_STATS_DROP_BADLEN); + STATS_INC(nifs, NETIF_STATS_DROP); } if (data == NULL) { - printf("ipsec_kpipe_sync_rx %s: no encrypted packet to send\n", pcb->ipsec_ifp->if_xname); + os_log_error(OS_LOG_DEFAULT, "ipsec_kpipe_sync_rx %s: no encrypted packet to send\n", pcb->ipsec_ifp->if_xname); kern_pbufpool_free(rx_pp, rx_ph); break; } @@ -813,7 +949,7 @@ ipsec_kpipe_sync_rx(kern_nexus_provider_t nxprov, kern_nexus_t nexus, // Flush data mbuf_freem(data); kern_pbufpool_free(rx_pp, rx_ph); - printf("ipsec_kpipe_sync_rx %s: encrypted packet length %zu > %u\n", + os_log_error(OS_LOG_DEFAULT, "ipsec_kpipe_sync_rx %s: encrypted packet length %zu > %u\n", pcb->ipsec_ifp->if_xname, length, rx_pp->pp_buflet_size); continue; } @@ -838,8 +974,8 @@ ipsec_kpipe_sync_rx(kern_nexus_provider_t nxprov, kern_nexus_t nexus, error = kern_channel_slot_attach_packet(rx_ring, rx_slot, rx_ph); VERIFY(error == 0); - STATS_INC(nifs, NETIF_STATS_TXPKTS); - STATS_INC(nifs, NETIF_STATS_TXCOPY_DIRECT); + STATS_INC(nifs, NETIF_STATS_TX_PACKETS); + STATS_INC(nifs, NETIF_STATS_TX_COPY_DIRECT); rx_ring_stats.kcrsi_slots_transferred++; rx_ring_stats.kcrsi_bytes_transferred += length; @@ -868,7 +1004,7 @@ ipsec_kpipe_sync_rx(kern_nexus_provider_t nxprov, kern_nexus_t nexus, /* always reenable output */ errno_t error = ifnet_enable_output(pcb->ipsec_ifp); if (error != 0) { - printf("ipsec_kpipe_sync_rx: ifnet_enable_output returned error %d\n", error); + os_log_error(OS_LOG_DEFAULT, "ipsec_kpipe_sync_rx: ifnet_enable_output returned error %d\n", error); } // Unlock first, then exit ring @@ -879,9 +1015,33 @@ ipsec_kpipe_sync_rx(kern_nexus_provider_t nxprov, kern_nexus_t nexus, } kr_exit(tx_ring); + ipsec_data_move_end(pcb); return 0; } +static uint8_t +ipsec_find_tx_ring_by_svc(kern_packet_svc_class_t svc_class) +{ + switch (svc_class) { + case KPKT_SC_VO: { + return 0; + } + case KPKT_SC_VI: { + return 1; + } + case KPKT_SC_BE: { + return 2; + } + case KPKT_SC_BK: { + return 3; + } + default: { + VERIFY(0); + return 0; + } + } +} + static errno_t ipsec_netif_ring_init(kern_nexus_provider_t nxprov, kern_nexus_t nexus, kern_channel_t channel, kern_channel_ring_t ring, boolean_t is_tx_ring, @@ -889,14 +1049,26 @@ ipsec_netif_ring_init(kern_nexus_provider_t nxprov, kern_nexus_t nexus, { #pragma unused(nxprov) #pragma unused(channel) -#pragma unused(ring_ctx) struct ipsec_pcb *pcb = kern_nexus_get_context(nexus); + if (!is_tx_ring) { - VERIFY(pcb->ipsec_netif_rxring == NULL); - pcb->ipsec_netif_rxring = ring; + VERIFY(pcb->ipsec_netif_rxring[0] == NULL); + pcb->ipsec_netif_rxring[0] = ring; } else { - VERIFY(pcb->ipsec_netif_txring == NULL); - pcb->ipsec_netif_txring = ring; + uint8_t ring_idx = 0; + if (ipsec_in_wmm_mode(pcb)) { + int err; + kern_packet_svc_class_t svc_class; + err = kern_channel_get_service_class(ring, &svc_class); + VERIFY(err == 0); + ring_idx = ipsec_find_tx_ring_by_svc(svc_class); + VERIFY(ring_idx < IPSEC_IF_WMM_RING_COUNT); + } + + *ring_ctx = (void *)(uintptr_t)ring_idx; + + VERIFY(pcb->ipsec_netif_txring[ring_idx] == NULL); + pcb->ipsec_netif_txring[ring_idx] = ring; } return 0; } @@ -907,11 +1079,23 @@ ipsec_netif_ring_fini(kern_nexus_provider_t nxprov, kern_nexus_t nexus, { #pragma unused(nxprov) struct ipsec_pcb *pcb = kern_nexus_get_context(nexus); - if (pcb->ipsec_netif_rxring == ring) { - pcb->ipsec_netif_rxring = NULL; - } else if (pcb->ipsec_netif_txring == ring) { - pcb->ipsec_netif_txring = NULL; + bool found = false; + + for (int i = 0; i < IPSEC_NETIF_MAX_RX_RING_COUNT; i++) { + if (pcb->ipsec_netif_rxring[i] == ring) { + pcb->ipsec_netif_rxring[i] = NULL; + VERIFY(!found); + found = true; + } + } + for (int i = 0; i < IPSEC_NETIF_MAX_TX_RING_COUNT; i++) { + if (pcb->ipsec_netif_txring[i] == ring) { + pcb->ipsec_netif_txring[i] = NULL; + VERIFY(!found); + found = true; + } } + VERIFY(found); } static bool @@ -935,12 +1119,12 @@ ipsec_netif_check_policy(mbuf_t data) u_int ip_version = ip->ip_v; switch (ip_version) { case 4: { - necp_matched_policy_id = necp_ip_output_find_policy_match(data, 0, NULL, + necp_matched_policy_id = necp_ip_output_find_policy_match(data, 0, NULL, NULL, &necp_result, &necp_result_parameter); break; } case 6: { - necp_matched_policy_id = necp_ip6_output_find_policy_match(data, 0, NULL, + necp_matched_policy_id = necp_ip6_output_find_policy_match(data, 0, NULL, NULL, &necp_result, &necp_result_parameter); break; } @@ -969,6 +1153,11 @@ ipsec_netif_sync_tx(kern_nexus_provider_t nxprov, kern_nexus_t nexus, struct netif_stats *nifs = &NX_NETIF_PRIVATE(nexus)->nif_stats; + if (!ipsec_data_move_begin(pcb)) { + os_log_error(OS_LOG_DEFAULT, "%s: data path stopped for %s\n", __func__, if_name(pcb->ipsec_ifp)); + return 0; + } + lck_rw_lock_shared(&pcb->ipsec_pcb_lock); struct kern_channel_ring_stat_increment tx_ring_stats; @@ -976,22 +1165,31 @@ ipsec_netif_sync_tx(kern_nexus_provider_t nxprov, kern_nexus_t nexus, kern_channel_slot_t tx_pslot = NULL; kern_channel_slot_t tx_slot = kern_channel_get_next_slot(tx_ring, NULL, NULL); - STATS_INC(nifs, NETIF_STATS_TXSYNC); + STATS_INC(nifs, NETIF_STATS_TX_SYNC); if (tx_slot == NULL) { // Nothing to write, don't bother signalling lck_rw_unlock_shared(&pcb->ipsec_pcb_lock); + ipsec_data_move_end(pcb); return 0; } - if (pcb->ipsec_kpipe_enabled) { - kern_channel_ring_t rx_ring = pcb->ipsec_kpipe_rxring; + if (pcb->ipsec_kpipe_count && + ipsec_flag_isset(pcb, IPSEC_FLAGS_KPIPE_ALLOCATED)) { + // Select the corresponding kpipe rx ring + uint8_t ring_idx = (uint8_t)(uintptr_t)kern_channel_ring_get_context(tx_ring); + VERIFY(ring_idx < IPSEC_IF_MAX_RING_COUNT); + kern_channel_ring_t rx_ring = pcb->ipsec_kpipe_rxring[ring_idx]; + + // Unlock while calling notify lck_rw_unlock_shared(&pcb->ipsec_pcb_lock); // Signal the kernel pipe ring to read if (rx_ring != NULL) { kern_channel_notify(rx_ring, 0); } + + ipsec_data_move_end(pcb); return 0; } @@ -1032,42 +1230,42 @@ ipsec_netif_sync_tx(kern_nexus_provider_t nxprov, kern_nexus_t nexus, // Check policy with NECP if (!ipsec_netif_check_policy(data)) { - printf("ipsec_netif_sync_tx %s - failed policy check\n", pcb->ipsec_ifp->if_xname); - STATS_INC(nifs, NETIF_STATS_DROPPED); + os_log_error(OS_LOG_DEFAULT, "ipsec_netif_sync_tx %s - failed policy check\n", pcb->ipsec_ifp->if_xname); + STATS_INC(nifs, NETIF_STATS_DROP); mbuf_freem(data); data = NULL; } else { // Send through encryption error = ipsec_output(pcb->ipsec_ifp, data); if (error != 0) { - printf("ipsec_netif_sync_tx %s - ipsec_output error %d\n", pcb->ipsec_ifp->if_xname, error); + os_log_error(OS_LOG_DEFAULT, "ipsec_netif_sync_tx %s - ipsec_output error %d\n", pcb->ipsec_ifp->if_xname, error); } } } else { - printf("ipsec_netif_sync_tx %s - mbuf_copyback(%zu) error %d\n", pcb->ipsec_ifp->if_xname, length, error); - STATS_INC(nifs, NETIF_STATS_NOMEM_MBUF); - STATS_INC(nifs, NETIF_STATS_DROPPED); + os_log_error(OS_LOG_DEFAULT, "ipsec_netif_sync_tx %s - mbuf_copyback(%zu) error %d\n", pcb->ipsec_ifp->if_xname, length, error); + STATS_INC(nifs, NETIF_STATS_DROP_NOMEM_MBUF); + STATS_INC(nifs, NETIF_STATS_DROP); mbuf_freem(data); data = NULL; } } else { - printf("ipsec_netif_sync_tx %s - mbuf_gethdr error %d\n", pcb->ipsec_ifp->if_xname, error); - STATS_INC(nifs, NETIF_STATS_NOMEM_MBUF); - STATS_INC(nifs, NETIF_STATS_DROPPED); + os_log_error(OS_LOG_DEFAULT, "ipsec_netif_sync_tx %s - mbuf_gethdr error %d\n", pcb->ipsec_ifp->if_xname, error); + STATS_INC(nifs, NETIF_STATS_DROP_NOMEM_MBUF); + STATS_INC(nifs, NETIF_STATS_DROP); } } else { - printf("ipsec_netif_sync_tx %s - 0 length packet\n", pcb->ipsec_ifp->if_xname); - STATS_INC(nifs, NETIF_STATS_BADLEN); - STATS_INC(nifs, NETIF_STATS_DROPPED); + os_log_error(OS_LOG_DEFAULT, "ipsec_netif_sync_tx %s - 0 length packet\n", pcb->ipsec_ifp->if_xname); + STATS_INC(nifs, NETIF_STATS_DROP_BADLEN); + STATS_INC(nifs, NETIF_STATS_DROP); } if (data == NULL) { - printf("ipsec_netif_sync_tx %s: no encrypted packet to send\n", pcb->ipsec_ifp->if_xname); + os_log_error(OS_LOG_DEFAULT, "ipsec_netif_sync_tx %s: no encrypted packet to send\n", pcb->ipsec_ifp->if_xname); break; } - STATS_INC(nifs, NETIF_STATS_TXPKTS); - STATS_INC(nifs, NETIF_STATS_TXCOPY_MBUF); + STATS_INC(nifs, NETIF_STATS_TX_PACKETS); + STATS_INC(nifs, NETIF_STATS_TX_COPY_MBUF); tx_ring_stats.kcrsi_slots_transferred++; tx_ring_stats.kcrsi_bytes_transferred += length; @@ -1080,19 +1278,22 @@ ipsec_netif_sync_tx(kern_nexus_provider_t nxprov, kern_nexus_t nexus, } lck_rw_unlock_shared(&pcb->ipsec_pcb_lock); + ipsec_data_move_end(pcb); return 0; } static errno_t -ipsec_netif_tx_doorbell(kern_nexus_provider_t nxprov, kern_nexus_t nexus, - kern_channel_ring_t ring, __unused uint32_t flags) +ipsec_netif_tx_doorbell_one(kern_nexus_provider_t nxprov, kern_nexus_t nexus, + kern_channel_ring_t ring, uint32_t flags, uint8_t ring_idx) { #pragma unused(nxprov) struct ipsec_pcb *pcb = kern_nexus_get_context(nexus); boolean_t more = false; errno_t rc = 0; + VERIFY((flags & KERN_NEXUS_TXDOORBELLF_ASYNC_REFILL) == 0); + /* * Refill and sync the ring; we may be racing against another thread doing * an RX sync that also wants to do kr_enter(), and so use the blocking @@ -1100,26 +1301,35 @@ ipsec_netif_tx_doorbell(kern_nexus_provider_t nxprov, kern_nexus_t nexus, */ rc = kern_channel_tx_refill_canblock(ring, UINT32_MAX, UINT32_MAX, true, &more); if (rc != 0 && rc != EAGAIN && rc != EBUSY) { - printf("%s, tx refill failed %d\n", __func__, rc); + os_log_error(OS_LOG_DEFAULT, "%s: %s ring %s tx refill failed %d\n", __func__, + pcb->ipsec_if_xname, ring->ckr_name, rc); } (void) kr_enter(ring, TRUE); lck_rw_lock_shared(&pcb->ipsec_pcb_lock); + if (ring != pcb->ipsec_netif_txring[ring_idx]) { + // ring no longer valid + lck_rw_unlock_shared(&pcb->ipsec_pcb_lock); + kr_exit(ring); + os_log_error(OS_LOG_DEFAULT, "%s: %s ring %s index %d bad netif_txring 3\n", __func__, + pcb->ipsec_if_xname, ring->ckr_name, ring_idx); + return ENXIO; + } - if (pcb->ipsec_kpipe_enabled) { + if (pcb->ipsec_kpipe_count) { uint32_t tx_available = kern_channel_available_slot_count(ring); if (pcb->ipsec_netif_txring_size > 0 && tx_available >= pcb->ipsec_netif_txring_size - 1) { // No room left in tx ring, disable output for now errno_t error = ifnet_disable_output(pcb->ipsec_ifp); if (error != 0) { - printf("ipsec_netif_tx_doorbell: ifnet_disable_output returned error %d\n", error); + os_log_error(OS_LOG_DEFAULT, "ipsec_netif_tx_doorbell: ifnet_disable_output returned error %d\n", error); } } } - if (pcb->ipsec_kpipe_enabled) { - kern_channel_ring_t rx_ring = pcb->ipsec_kpipe_rxring; + if (pcb->ipsec_kpipe_count) { + kern_channel_ring_t rx_ring = pcb->ipsec_kpipe_rxring[ring_idx]; // Unlock while calling notify lck_rw_unlock_shared(&pcb->ipsec_pcb_lock); @@ -1136,6 +1346,34 @@ ipsec_netif_tx_doorbell(kern_nexus_provider_t nxprov, kern_nexus_t nexus, return 0; } +static errno_t +ipsec_netif_tx_doorbell(kern_nexus_provider_t nxprov, kern_nexus_t nexus, + kern_channel_ring_t ring, __unused uint32_t flags) +{ + errno_t ret = 0; + struct ipsec_pcb *pcb = kern_nexus_get_context(nexus); + + if (!ipsec_data_move_begin(pcb)) { + os_log_error(OS_LOG_DEFAULT, "%s: data path stopped for %s\n", __func__, if_name(pcb->ipsec_ifp)); + return 0; + } + + if (ipsec_in_wmm_mode(pcb)) { + for (uint8_t i = 0; i < IPSEC_IF_WMM_RING_COUNT; i++) { + kern_channel_ring_t nring = pcb->ipsec_netif_txring[i]; + ret = ipsec_netif_tx_doorbell_one(nxprov, nexus, nring, flags, i); + if (ret) { + break; + } + } + } else { + ret = ipsec_netif_tx_doorbell_one(nxprov, nexus, ring, flags, 0); + } + + ipsec_data_move_end(pcb); + return ret; +} + static errno_t ipsec_netif_sync_rx(kern_nexus_provider_t nxprov, kern_nexus_t nexus, kern_channel_ring_t rx_ring, uint32_t flags) @@ -1147,16 +1385,22 @@ ipsec_netif_sync_rx(kern_nexus_provider_t nxprov, kern_nexus_t nexus, struct netif_stats *nifs = &NX_NETIF_PRIVATE(nexus)->nif_stats; + if (!ipsec_data_move_begin(pcb)) { + os_log_error(OS_LOG_DEFAULT, "%s: data path stopped for %s\n", __func__, if_name(pcb->ipsec_ifp)); + return 0; + } + lck_rw_lock_shared(&pcb->ipsec_pcb_lock); // Reclaim user-released slots (void) kern_channel_reclaim(rx_ring); - STATS_INC(nifs, NETIF_STATS_RXSYNC); + STATS_INC(nifs, NETIF_STATS_RX_SYNC); uint32_t avail = kern_channel_available_slot_count(rx_ring); if (avail == 0) { lck_rw_unlock_shared(&pcb->ipsec_pcb_lock); + ipsec_data_move_end(pcb); return 0; } @@ -1179,13 +1423,16 @@ ipsec_netif_sync_rx(kern_nexus_provider_t nxprov, kern_nexus_t nexus, kern_packet_t rx_ph = 0; errno_t error = kern_pbufpool_alloc_nosleep(rx_pp, 1, &rx_ph); if (__improbable(error != 0)) { - STATS_INC(nifs, NETIF_STATS_NOMEM_PKT); - STATS_INC(nifs, NETIF_STATS_DROPPED); + STATS_INC(nifs, NETIF_STATS_DROP_NOMEM_PKT); + STATS_INC(nifs, NETIF_STATS_DROP); lck_mtx_unlock(&pcb->ipsec_input_chain_lock); break; } // Advance waiting packets + if (pcb->ipsec_input_chain_count > 0) { + pcb->ipsec_input_chain_count--; + } pcb->ipsec_input_chain = data->m_nextpkt; data->m_nextpkt = NULL; if (pcb->ipsec_input_chain == NULL) { @@ -1199,9 +1446,9 @@ ipsec_netif_sync_rx(kern_nexus_provider_t nxprov, kern_nexus_t nexus, // Flush data mbuf_freem(data); kern_pbufpool_free(rx_pp, rx_ph); - STATS_INC(nifs, NETIF_STATS_BADLEN); - STATS_INC(nifs, NETIF_STATS_DROPPED); - printf("ipsec_netif_sync_rx %s: legacy decrypted packet length cannot hold IP %zu < %zu\n", + STATS_INC(nifs, NETIF_STATS_DROP_BADLEN); + STATS_INC(nifs, NETIF_STATS_DROP); + os_log_error(OS_LOG_DEFAULT, "ipsec_netif_sync_rx %s: legacy decrypted packet length cannot hold IP %zu < %zu\n", pcb->ipsec_ifp->if_xname, length, sizeof(struct ip)); continue; } @@ -1219,7 +1466,7 @@ ipsec_netif_sync_rx(kern_nexus_provider_t nxprov, kern_nexus_t nexus, break; } default: { - printf("ipsec_netif_sync_rx %s: legacy unknown ip version %u\n", + os_log_error(OS_LOG_DEFAULT, "ipsec_netif_sync_rx %s: legacy unknown ip version %u\n", pcb->ipsec_ifp->if_xname, ip_version); break; } @@ -1246,9 +1493,9 @@ ipsec_netif_sync_rx(kern_nexus_provider_t nxprov, kern_nexus_t nexus, if (fragment_error == 0 && data != NULL) { fragment_chain = data; } else { - STATS_INC(nifs, NETIF_STATS_BADLEN); - STATS_INC(nifs, NETIF_STATS_DROPPED); - printf("ipsec_netif_sync_rx %s: failed to fragment IPv4 packet of length %zu (%d)\n", + STATS_INC(nifs, NETIF_STATS_DROP_BADLEN); + STATS_INC(nifs, NETIF_STATS_DROP); + os_log_error(OS_LOG_DEFAULT, "ipsec_netif_sync_rx %s: failed to fragment IPv4 packet of length %zu (%d)\n", pcb->ipsec_ifp->if_xname, length, fragment_error); } break; @@ -1256,25 +1503,23 @@ ipsec_netif_sync_rx(kern_nexus_provider_t nxprov, kern_nexus_t nexus, case AF_INET6: { if (length < sizeof(struct ip6_hdr)) { mbuf_freem(data); - STATS_INC(nifs, NETIF_STATS_BADLEN); - STATS_INC(nifs, NETIF_STATS_DROPPED); - printf("ipsec_netif_sync_rx %s: failed to fragment IPv6 packet of length %zu < %zu\n", + STATS_INC(nifs, NETIF_STATS_DROP_BADLEN); + STATS_INC(nifs, NETIF_STATS_DROP); + os_log_error(OS_LOG_DEFAULT, "ipsec_netif_sync_rx %s: failed to fragment IPv6 packet of length %zu < %zu\n", pcb->ipsec_ifp->if_xname, length, sizeof(struct ip6_hdr)); } else { // ip6_do_fragmentation will free the original data on success only struct ip6_hdr *ip6 = mtod(data, struct ip6_hdr *); - struct ip6_exthdrs exthdrs; - memset(&exthdrs, 0, sizeof(exthdrs)); int fragment_error = ip6_do_fragmentation(&data, 0, pcb->ipsec_ifp, sizeof(struct ip6_hdr), - ip6, &exthdrs, fragment_mtu, ip6->ip6_nxt); + ip6, NULL, fragment_mtu, ip6->ip6_nxt, htonl(ip6_randomid())); if (fragment_error == 0 && data != NULL) { fragment_chain = data; } else { mbuf_freem(data); - STATS_INC(nifs, NETIF_STATS_BADLEN); - STATS_INC(nifs, NETIF_STATS_DROPPED); - printf("ipsec_netif_sync_rx %s: failed to fragment IPv6 packet of length %zu (%d)\n", + STATS_INC(nifs, NETIF_STATS_DROP_BADLEN); + STATS_INC(nifs, NETIF_STATS_DROP); + os_log_error(OS_LOG_DEFAULT, "ipsec_netif_sync_rx %s: failed to fragment IPv6 packet of length %zu (%d)\n", pcb->ipsec_ifp->if_xname, length, fragment_error); } } @@ -1283,9 +1528,9 @@ ipsec_netif_sync_rx(kern_nexus_provider_t nxprov, kern_nexus_t nexus, default: { // Cannot fragment unknown families mbuf_freem(data); - STATS_INC(nifs, NETIF_STATS_BADLEN); - STATS_INC(nifs, NETIF_STATS_DROPPED); - printf("ipsec_netif_sync_rx %s: uknown legacy decrypted packet length %zu > %u\n", + STATS_INC(nifs, NETIF_STATS_DROP_BADLEN); + STATS_INC(nifs, NETIF_STATS_DROP); + os_log_error(OS_LOG_DEFAULT, "ipsec_netif_sync_rx %s: uknown legacy decrypted packet length %zu > %u\n", pcb->ipsec_ifp->if_xname, length, rx_pp->pp_buflet_size); break; } @@ -1299,9 +1544,11 @@ ipsec_netif_sync_rx(kern_nexus_provider_t nxprov, kern_nexus_t nexus, } else { pcb->ipsec_input_chain = fragment_chain; } + pcb->ipsec_input_chain_count++; while (fragment_chain->m_nextpkt) { VERIFY(fragment_chain != fragment_chain->m_nextpkt); fragment_chain = fragment_chain->m_nextpkt; + pcb->ipsec_input_chain_count++; } pcb->ipsec_input_chain_last = fragment_chain; lck_mtx_unlock(&pcb->ipsec_input_chain_lock); @@ -1330,17 +1577,15 @@ ipsec_netif_sync_rx(kern_nexus_provider_t nxprov, kern_nexus_t nexus, VERIFY(error == 0); error = kern_buflet_set_data_length(rx_buf, length); VERIFY(error == 0); - error = kern_packet_set_link_header_offset(rx_ph, 0); - VERIFY(error == 0); - error = kern_packet_set_network_header_offset(rx_ph, 0); + error = kern_packet_set_headroom(rx_ph, 0); VERIFY(error == 0); error = kern_packet_finalize(rx_ph); VERIFY(error == 0); error = kern_channel_slot_attach_packet(rx_ring, rx_slot, rx_ph); VERIFY(error == 0); - STATS_INC(nifs, NETIF_STATS_RXPKTS); - STATS_INC(nifs, NETIF_STATS_RXCOPY_MBUF); + STATS_INC(nifs, NETIF_STATS_RX_PACKETS); + STATS_INC(nifs, NETIF_STATS_RX_COPY_MBUF); bpf_tap_packet_in(pcb->ipsec_ifp, DLT_RAW, rx_ph, NULL, 0); rx_ring_stats.kcrsi_slots_transferred++; @@ -1357,234 +1602,244 @@ ipsec_netif_sync_rx(kern_nexus_provider_t nxprov, kern_nexus_t nexus, rx_slot = kern_channel_get_next_slot(rx_ring, rx_slot, NULL); } - struct kern_channel_ring_stat_increment tx_ring_stats; - bzero(&tx_ring_stats, sizeof(tx_ring_stats)); - kern_channel_ring_t tx_ring = pcb->ipsec_kpipe_txring; - kern_channel_slot_t tx_pslot = NULL; - kern_channel_slot_t tx_slot = NULL; - if (tx_ring == NULL) { - // Net-If TX ring not set up yet, nothing to read - goto done; - } - - - // Unlock ipsec before entering ring - lck_rw_unlock_shared(&pcb->ipsec_pcb_lock); + for (uint8_t ring_idx = 0; ring_idx < pcb->ipsec_kpipe_count; ring_idx++) { + struct kern_channel_ring_stat_increment tx_ring_stats; + bzero(&tx_ring_stats, sizeof(tx_ring_stats)); + kern_channel_ring_t tx_ring = pcb->ipsec_kpipe_txring[ring_idx]; + kern_channel_slot_t tx_pslot = NULL; + kern_channel_slot_t tx_slot = NULL; + if (tx_ring == NULL) { + // Net-If TX ring not set up yet, nothing to read + goto done; + } - (void)kr_enter(tx_ring, TRUE); - // Lock again after entering and validate - lck_rw_lock_shared(&pcb->ipsec_pcb_lock); + // Unlock ipsec before entering ring + lck_rw_unlock_shared(&pcb->ipsec_pcb_lock); - if (tx_ring != pcb->ipsec_kpipe_txring) { - goto done; - } + (void)kr_enter(tx_ring, TRUE); - tx_slot = kern_channel_get_next_slot(tx_ring, NULL, NULL); - if (tx_slot == NULL) { - // Nothing to read, don't bother signalling - goto done; - } + // Lock again after entering and validate + lck_rw_lock_shared(&pcb->ipsec_pcb_lock); - while (rx_slot != NULL && tx_slot != NULL) { - size_t length = 0; - mbuf_t data = NULL; - errno_t error = 0; - uint32_t af; + if (tx_ring != pcb->ipsec_kpipe_txring[ring_idx]) { + goto done; + } - // Allocate rx packet - kern_packet_t rx_ph = 0; - error = kern_pbufpool_alloc_nosleep(rx_pp, 1, &rx_ph); - if (__improbable(error != 0)) { - STATS_INC(nifs, NETIF_STATS_NOMEM_PKT); - STATS_INC(nifs, NETIF_STATS_DROPPED); - break; + tx_slot = kern_channel_get_next_slot(tx_ring, NULL, NULL); + if (tx_slot == NULL) { + // Nothing to read, don't bother signalling + goto done; } - kern_packet_t tx_ph = kern_channel_slot_get_packet(tx_ring, tx_slot); + while (rx_slot != NULL && tx_slot != NULL) { + size_t length = 0; + mbuf_t data = NULL; + errno_t error = 0; + uint32_t af; + + // Allocate rx packet + kern_packet_t rx_ph = 0; + error = kern_pbufpool_alloc_nosleep(rx_pp, 1, &rx_ph); + if (__improbable(error != 0)) { + STATS_INC(nifs, NETIF_STATS_DROP_NOMEM_PKT); + STATS_INC(nifs, NETIF_STATS_DROP); + break; + } - // Advance TX ring - tx_pslot = tx_slot; - tx_slot = kern_channel_get_next_slot(tx_ring, tx_slot, NULL); + kern_packet_t tx_ph = kern_channel_slot_get_packet(tx_ring, tx_slot); - if (tx_ph == 0) { - kern_pbufpool_free(rx_pp, rx_ph); - continue; - } + // Advance TX ring + tx_pslot = tx_slot; + tx_slot = kern_channel_get_next_slot(tx_ring, tx_slot, NULL); - kern_buflet_t tx_buf = kern_packet_get_next_buflet(tx_ph, NULL); - VERIFY(tx_buf != NULL); - uint8_t *tx_baddr = kern_buflet_get_object_address(tx_buf); - VERIFY(tx_baddr != 0); - tx_baddr += kern_buflet_get_data_offset(tx_buf); + if (tx_ph == 0) { + kern_pbufpool_free(rx_pp, rx_ph); + continue; + } - length = MIN(kern_packet_get_data_length(tx_ph), - pcb->ipsec_slot_size); + kern_buflet_t tx_buf = kern_packet_get_next_buflet(tx_ph, NULL); + VERIFY(tx_buf != NULL); + uint8_t *tx_baddr = kern_buflet_get_object_address(tx_buf); + VERIFY(tx_baddr != 0); + tx_baddr += kern_buflet_get_data_offset(tx_buf); - // Increment TX stats - tx_ring_stats.kcrsi_slots_transferred++; - tx_ring_stats.kcrsi_bytes_transferred += length; + length = MIN(kern_packet_get_data_length(tx_ph), + pcb->ipsec_slot_size); - if (length >= sizeof(struct ip)) { - error = mbuf_gethdr(MBUF_DONTWAIT, MBUF_TYPE_HEADER, &data); - if (error == 0) { - error = mbuf_copyback(data, 0, length, tx_baddr, MBUF_DONTWAIT); + // Increment TX stats + tx_ring_stats.kcrsi_slots_transferred++; + tx_ring_stats.kcrsi_bytes_transferred += length; + + if (length >= sizeof(struct ip)) { + error = mbuf_gethdr(MBUF_DONTWAIT, MBUF_TYPE_HEADER, &data); if (error == 0) { - struct ip *ip = mtod(data, struct ip *); - u_int ip_version = ip->ip_v; - switch (ip_version) { - case 4: { - af = AF_INET; - ip->ip_len = ntohs(ip->ip_len) - sizeof(struct ip); - ip->ip_off = ntohs(ip->ip_off); - - if (length < ip->ip_len) { - printf("ipsec_netif_sync_rx %s: IPv4 packet length too short (%zu < %u)\n", - pcb->ipsec_ifp->if_xname, length, ip->ip_len); - STATS_INC(nifs, NETIF_STATS_BADLEN); - STATS_INC(nifs, NETIF_STATS_DROPPED); - mbuf_freem(data); - data = NULL; - } else { - data = esp4_input_extended(data, sizeof(struct ip), pcb->ipsec_ifp); + error = mbuf_copyback(data, 0, length, tx_baddr, MBUF_DONTWAIT); + if (error == 0) { + lck_mtx_lock(&pcb->ipsec_kpipe_decrypt_lock); + struct ip *ip = mtod(data, struct ip *); + u_int ip_version = ip->ip_v; + switch (ip_version) { + case 4: { + af = AF_INET; + ip->ip_len = ntohs(ip->ip_len) - sizeof(struct ip); + ip->ip_off = ntohs(ip->ip_off); + + if (length < ip->ip_len) { + os_log_error(OS_LOG_DEFAULT, "ipsec_netif_sync_rx %s: IPv4 packet length too short (%zu < %u)\n", + pcb->ipsec_ifp->if_xname, length, ip->ip_len); + STATS_INC(nifs, NETIF_STATS_DROP_BADLEN); + STATS_INC(nifs, NETIF_STATS_DROP); + mbuf_freem(data); + data = NULL; + } else { + data = esp4_input_extended(data, sizeof(struct ip), pcb->ipsec_ifp); + } + break; } - break; - } - case 6: { - if (length < sizeof(struct ip6_hdr)) { - printf("ipsec_netif_sync_rx %s: IPv6 packet length too short for header %zu\n", - pcb->ipsec_ifp->if_xname, length); - STATS_INC(nifs, NETIF_STATS_BADLEN); - STATS_INC(nifs, NETIF_STATS_DROPPED); - mbuf_freem(data); - data = NULL; - } else { - af = AF_INET6; - struct ip6_hdr *ip6 = mtod(data, struct ip6_hdr *); - const size_t ip6_len = sizeof(*ip6) + ntohs(ip6->ip6_plen); - if (length < ip6_len) { - printf("ipsec_netif_sync_rx %s: IPv6 packet length too short (%zu < %zu)\n", - pcb->ipsec_ifp->if_xname, length, ip6_len); - STATS_INC(nifs, NETIF_STATS_BADLEN); - STATS_INC(nifs, NETIF_STATS_DROPPED); + case 6: { + if (length < sizeof(struct ip6_hdr)) { + os_log_error(OS_LOG_DEFAULT, "ipsec_netif_sync_rx %s: IPv6 packet length too short for header %zu\n", + pcb->ipsec_ifp->if_xname, length); + STATS_INC(nifs, NETIF_STATS_DROP_BADLEN); + STATS_INC(nifs, NETIF_STATS_DROP); mbuf_freem(data); data = NULL; } else { - int offset = sizeof(struct ip6_hdr); - esp6_input_extended(&data, &offset, ip6->ip6_nxt, pcb->ipsec_ifp); + af = AF_INET6; + struct ip6_hdr *ip6 = mtod(data, struct ip6_hdr *); + const size_t ip6_len = sizeof(*ip6) + ntohs(ip6->ip6_plen); + if (length < ip6_len) { + os_log_error(OS_LOG_DEFAULT, "ipsec_netif_sync_rx %s: IPv6 packet length too short (%zu < %zu)\n", + pcb->ipsec_ifp->if_xname, length, ip6_len); + STATS_INC(nifs, NETIF_STATS_DROP_BADLEN); + STATS_INC(nifs, NETIF_STATS_DROP); + mbuf_freem(data); + data = NULL; + } else { + int offset = sizeof(struct ip6_hdr); + esp6_input_extended(&data, &offset, ip6->ip6_nxt, pcb->ipsec_ifp); + } } + break; } - break; - } - default: { - printf("ipsec_netif_sync_rx %s: unknown ip version %u\n", - pcb->ipsec_ifp->if_xname, ip_version); - STATS_INC(nifs, NETIF_STATS_DROPPED); + default: { + os_log_error(OS_LOG_DEFAULT, "ipsec_netif_sync_rx %s: unknown ip version %u\n", + pcb->ipsec_ifp->if_xname, ip_version); + STATS_INC(nifs, NETIF_STATS_DROP); + mbuf_freem(data); + data = NULL; + break; + } + } + lck_mtx_unlock(&pcb->ipsec_kpipe_decrypt_lock); + } else { + os_log_error(OS_LOG_DEFAULT, "ipsec_netif_sync_rx %s - mbuf_copyback(%zu) error %d\n", pcb->ipsec_ifp->if_xname, length, error); + STATS_INC(nifs, NETIF_STATS_DROP_NOMEM_MBUF); + STATS_INC(nifs, NETIF_STATS_DROP); mbuf_freem(data); data = NULL; - break; - } } } else { - printf("ipsec_netif_sync_rx %s - mbuf_copyback(%zu) error %d\n", pcb->ipsec_ifp->if_xname, length, error); - STATS_INC(nifs, NETIF_STATS_NOMEM_MBUF); - STATS_INC(nifs, NETIF_STATS_DROPPED); - mbuf_freem(data); - data = NULL; + os_log_error(OS_LOG_DEFAULT, "ipsec_netif_sync_rx %s - mbuf_gethdr error %d\n", pcb->ipsec_ifp->if_xname, error); + STATS_INC(nifs, NETIF_STATS_DROP_NOMEM_MBUF); + STATS_INC(nifs, NETIF_STATS_DROP); } } else { - printf("ipsec_netif_sync_rx %s - mbuf_gethdr error %d\n", pcb->ipsec_ifp->if_xname, error); - STATS_INC(nifs, NETIF_STATS_NOMEM_MBUF); - STATS_INC(nifs, NETIF_STATS_DROPPED); + os_log_error(OS_LOG_DEFAULT, "ipsec_netif_sync_rx %s - bad packet length %zu\n", pcb->ipsec_ifp->if_xname, length); + STATS_INC(nifs, NETIF_STATS_DROP_BADLEN); + STATS_INC(nifs, NETIF_STATS_DROP); } - } else { - printf("ipsec_netif_sync_rx %s - bad packet length %zu\n", pcb->ipsec_ifp->if_xname, length); - STATS_INC(nifs, NETIF_STATS_BADLEN); - STATS_INC(nifs, NETIF_STATS_DROPPED); - } - - if (data == NULL) { - // Failed to get decrypted data data - kern_pbufpool_free(rx_pp, rx_ph); - continue; - } - length = mbuf_pkthdr_len(data); - if (length > rx_pp->pp_buflet_size) { - // Flush data - mbuf_freem(data); - kern_pbufpool_free(rx_pp, rx_ph); - STATS_INC(nifs, NETIF_STATS_BADLEN); - STATS_INC(nifs, NETIF_STATS_DROPPED); - printf("ipsec_netif_sync_rx %s: decrypted packet length %zu > %u\n", - pcb->ipsec_ifp->if_xname, length, rx_pp->pp_buflet_size); - continue; - } - - mbuf_pkthdr_setrcvif(data, pcb->ipsec_ifp); - - // Fillout rx packet - kern_buflet_t rx_buf = kern_packet_get_next_buflet(rx_ph, NULL); - VERIFY(rx_buf != NULL); - void *rx_baddr = kern_buflet_get_object_address(rx_buf); - VERIFY(rx_baddr != NULL); + if (data == NULL) { + // Failed to get decrypted data data + kern_pbufpool_free(rx_pp, rx_ph); + continue; + } - // Copy-in data from mbuf to buflet - mbuf_copydata(data, 0, length, (void *)rx_baddr); - kern_packet_clear_flow_uuid(rx_ph); // Zero flow id + length = mbuf_pkthdr_len(data); + if (length > rx_pp->pp_buflet_size) { + // Flush data + mbuf_freem(data); + kern_pbufpool_free(rx_pp, rx_ph); + STATS_INC(nifs, NETIF_STATS_DROP_BADLEN); + STATS_INC(nifs, NETIF_STATS_DROP); + os_log_error(OS_LOG_DEFAULT, "ipsec_netif_sync_rx %s: decrypted packet length %zu > %u\n", + pcb->ipsec_ifp->if_xname, length, rx_pp->pp_buflet_size); + continue; + } - // Finalize and attach the packet - error = kern_buflet_set_data_offset(rx_buf, 0); - VERIFY(error == 0); - error = kern_buflet_set_data_length(rx_buf, length); - VERIFY(error == 0); - error = kern_packet_set_link_header_offset(rx_ph, 0); - VERIFY(error == 0); - error = kern_packet_set_network_header_offset(rx_ph, 0); - VERIFY(error == 0); - error = kern_packet_finalize(rx_ph); - VERIFY(error == 0); - error = kern_channel_slot_attach_packet(rx_ring, rx_slot, rx_ph); - VERIFY(error == 0); + mbuf_pkthdr_setrcvif(data, pcb->ipsec_ifp); + + // Fillout rx packet + kern_buflet_t rx_buf = kern_packet_get_next_buflet(rx_ph, NULL); + VERIFY(rx_buf != NULL); + void *rx_baddr = kern_buflet_get_object_address(rx_buf); + VERIFY(rx_baddr != NULL); + + // Copy-in data from mbuf to buflet + mbuf_copydata(data, 0, length, (void *)rx_baddr); + kern_packet_clear_flow_uuid(rx_ph); // Zero flow id + + // Finalize and attach the packet + error = kern_buflet_set_data_offset(rx_buf, 0); + VERIFY(error == 0); + error = kern_buflet_set_data_length(rx_buf, length); + VERIFY(error == 0); + error = kern_packet_set_link_header_offset(rx_ph, 0); + VERIFY(error == 0); + error = kern_packet_set_network_header_offset(rx_ph, 0); + VERIFY(error == 0); + error = kern_packet_finalize(rx_ph); + VERIFY(error == 0); + error = kern_channel_slot_attach_packet(rx_ring, rx_slot, rx_ph); + VERIFY(error == 0); + + STATS_INC(nifs, NETIF_STATS_RX_PACKETS); + STATS_INC(nifs, NETIF_STATS_RX_COPY_DIRECT); + bpf_tap_packet_in(pcb->ipsec_ifp, DLT_RAW, rx_ph, NULL, 0); + + rx_ring_stats.kcrsi_slots_transferred++; + rx_ring_stats.kcrsi_bytes_transferred += length; + + if (!pcb->ipsec_ext_ifdata_stats) { + ifnet_stat_increment_in(pcb->ipsec_ifp, 1, length, 0); + } - STATS_INC(nifs, NETIF_STATS_RXPKTS); - STATS_INC(nifs, NETIF_STATS_RXCOPY_DIRECT); - bpf_tap_packet_in(pcb->ipsec_ifp, DLT_RAW, rx_ph, NULL, 0); + mbuf_freem(data); - rx_ring_stats.kcrsi_slots_transferred++; - rx_ring_stats.kcrsi_bytes_transferred += length; + rx_pslot = rx_slot; + rx_slot = kern_channel_get_next_slot(rx_ring, rx_slot, NULL); + } - if (!pcb->ipsec_ext_ifdata_stats) { - ifnet_stat_increment_in(pcb->ipsec_ifp, 1, length, 0); +done: + if (tx_pslot) { + kern_channel_advance_slot(tx_ring, tx_pslot); + kern_channel_increment_ring_net_stats(tx_ring, pcb->ipsec_ifp, &tx_ring_stats); + (void)kern_channel_reclaim(tx_ring); } - mbuf_freem(data); + // Unlock first, then exit ring + lck_rw_unlock_shared(&pcb->ipsec_pcb_lock); + if (tx_ring != NULL) { + if (tx_pslot != NULL) { + kern_channel_notify(tx_ring, 0); + } + kr_exit(tx_ring); + } - rx_pslot = rx_slot; - rx_slot = kern_channel_get_next_slot(rx_ring, rx_slot, NULL); + lck_rw_lock_shared(&pcb->ipsec_pcb_lock); } -done: if (rx_pslot) { kern_channel_advance_slot(rx_ring, rx_pslot); kern_channel_increment_ring_net_stats(rx_ring, pcb->ipsec_ifp, &rx_ring_stats); } - if (tx_pslot) { - kern_channel_advance_slot(tx_ring, tx_pslot); - kern_channel_increment_ring_net_stats(tx_ring, pcb->ipsec_ifp, &tx_ring_stats); - (void)kern_channel_reclaim(tx_ring); - } - // Unlock first, then exit ring lck_rw_unlock_shared(&pcb->ipsec_pcb_lock); - if (tx_ring != NULL) { - if (tx_pslot != NULL) { - kern_channel_notify(tx_ring, 0); - } - kr_exit(tx_ring); - } + ipsec_data_move_end(pcb); return 0; } @@ -1622,7 +1877,7 @@ ipsec_nexus_ifattach(struct ipsec_pcb *pcb, err = kern_nexus_attr_create(&nxa); IPSEC_IF_VERIFY(err == 0); if (err != 0) { - printf("%s: kern_nexus_attr_create failed: %d\n", + os_log_error(OS_LOG_DEFAULT, "%s: kern_nexus_attr_create failed: %d\n", __func__, err); goto failed; } @@ -1638,20 +1893,45 @@ ipsec_nexus_ifattach(struct ipsec_pcb *pcb, err = kern_nexus_attr_set(nxa, NEXUS_ATTR_RX_SLOTS, ring_size); VERIFY(err == 0); + assert(err == 0); + + if (ipsec_in_wmm_mode(pcb)) { + os_log(OS_LOG_DEFAULT, "%s: %s enabling wmm mode\n", + __func__, pcb->ipsec_if_xname); + + init_params->output_sched_model = IFNET_SCHED_MODEL_DRIVER_MANAGED; + + err = kern_nexus_attr_set(nxa, NEXUS_ATTR_TX_RINGS, + IPSEC_NETIF_WMM_TX_RING_COUNT); + VERIFY(err == 0); + err = kern_nexus_attr_set(nxa, NEXUS_ATTR_RX_RINGS, + IPSEC_NETIF_WMM_RX_RING_COUNT); + VERIFY(err == 0); + + err = kern_nexus_attr_set(nxa, NEXUS_ATTR_QMAP, NEXUS_QMAP_TYPE_WMM); + VERIFY(err == 0); + } + pcb->ipsec_netif_txring_size = ring_size; bzero(&pp_init, sizeof(pp_init)); pp_init.kbi_version = KERN_PBUFPOOL_CURRENT_VERSION; - pp_init.kbi_packets = pcb->ipsec_netif_ring_size * 2; + pp_init.kbi_flags |= KBIF_VIRTUAL_DEVICE; + // Note: we need more packets than can be held in the tx and rx rings because + // packets can also be in the AQM queue(s) + pp_init.kbi_packets = pcb->ipsec_netif_ring_size * (2 * pcb->ipsec_kpipe_count + 1); pp_init.kbi_bufsize = pcb->ipsec_slot_size; pp_init.kbi_buf_seg_size = IPSEC_IF_DEFAULT_BUF_SEG_SIZE; pp_init.kbi_max_frags = 1; (void) snprintf((char *)pp_init.kbi_name, sizeof(pp_init.kbi_name), "%s", provider_name); + pp_init.kbi_ctx = NULL; + pp_init.kbi_ctx_retain = NULL; + pp_init.kbi_ctx_release = NULL; - err = kern_pbufpool_create(&pp_init, &pp_init, &pcb->ipsec_netif_pp, NULL); + err = kern_pbufpool_create(&pp_init, &pcb->ipsec_netif_pp, NULL); if (err != 0) { - printf("%s pbufbool create failed, error %d\n", __func__, err); + os_log_error(OS_LOG_DEFAULT, "%s pbufbool create failed, error %d\n", __func__, err); goto failed; } @@ -1664,7 +1944,7 @@ ipsec_nexus_ifattach(struct ipsec_pcb *pcb, &pcb->ipsec_nx.if_provider); IPSEC_IF_VERIFY(err == 0); if (err != 0) { - printf("%s register provider failed, error %d\n", + os_log_error(OS_LOG_DEFAULT, "%s register provider failed, error %d\n", __func__, err); goto failed; } @@ -1684,7 +1964,7 @@ ipsec_nexus_ifattach(struct ipsec_pcb *pcb, ifp); IPSEC_IF_VERIFY(err == 0); if (err != 0) { - printf("%s alloc_net_provider_instance failed, %d\n", + os_log_error(OS_LOG_DEFAULT, "%s alloc_net_provider_instance failed, %d\n", __func__, err); kern_nexus_controller_deregister_provider(controller, pcb->ipsec_nx.if_provider); @@ -1713,7 +1993,7 @@ ipsec_detach_provider_and_instance(uuid_t provider, uuid_t instance) err = kern_nexus_controller_free_provider_instance(controller, instance); if (err != 0) { - printf("%s free_provider_instance failed %d\n", + os_log_error(OS_LOG_DEFAULT, "%s free_provider_instance failed %d\n", __func__, err); } uuid_clear(instance); @@ -1722,7 +2002,7 @@ ipsec_detach_provider_and_instance(uuid_t provider, uuid_t instance) err = kern_nexus_controller_deregister_provider(controller, provider); if (err != 0) { - printf("%s deregister_provider %d\n", __func__, err); + os_log_error(OS_LOG_DEFAULT, "%s deregister_provider %d\n", __func__, err); } uuid_clear(provider); } @@ -1736,30 +2016,30 @@ ipsec_nexus_detach(struct ipsec_pcb *pcb) nexus_controller_t controller = kern_nexus_shared_controller(); errno_t err; - if (!uuid_is_null(nx->ms_host)) { + if (!uuid_is_null(nx->fsw_host)) { err = kern_nexus_ifdetach(controller, - nx->ms_instance, - nx->ms_host); + nx->fsw_instance, + nx->fsw_host); if (err != 0) { - printf("%s: kern_nexus_ifdetach ms host failed %d\n", + os_log_error(OS_LOG_DEFAULT, "%s: kern_nexus_ifdetach ms host failed %d\n", __func__, err); } } - if (!uuid_is_null(nx->ms_device)) { + if (!uuid_is_null(nx->fsw_device)) { err = kern_nexus_ifdetach(controller, - nx->ms_instance, - nx->ms_device); + nx->fsw_instance, + nx->fsw_device); if (err != 0) { - printf("%s: kern_nexus_ifdetach ms device failed %d\n", + os_log_error(OS_LOG_DEFAULT, "%s: kern_nexus_ifdetach ms device failed %d\n", __func__, err); } } ipsec_detach_provider_and_instance(nx->if_provider, nx->if_instance); - ipsec_detach_provider_and_instance(nx->ms_provider, - nx->ms_instance); + ipsec_detach_provider_and_instance(nx->fsw_provider, + nx->fsw_instance); if (pcb->ipsec_netif_pp != NULL) { kern_pbufpool_destroy(pcb->ipsec_netif_pp); @@ -1770,7 +2050,7 @@ ipsec_nexus_detach(struct ipsec_pcb *pcb) static errno_t ipsec_create_fs_provider_and_instance(struct ipsec_pcb *pcb, - uint32_t subtype, const char *type_name, + const char *type_name, const char *ifname, uuid_t *provider, uuid_t *instance) { @@ -1781,11 +2061,11 @@ ipsec_create_fs_provider_and_instance(struct ipsec_pcb *pcb, struct kern_nexus_init init; nexus_name_t provider_name; - err = kern_nexus_get_builtin_domain_provider(NEXUS_TYPE_FLOW_SWITCH, + err = kern_nexus_get_default_domain_provider(NEXUS_TYPE_FLOW_SWITCH, &dom_prov); IPSEC_IF_VERIFY(err == 0); if (err != 0) { - printf("%s can't get %s provider, error %d\n", + os_log_error(OS_LOG_DEFAULT, "%s can't get %s provider, error %d\n", __func__, type_name, err); goto failed; } @@ -1793,14 +2073,11 @@ ipsec_create_fs_provider_and_instance(struct ipsec_pcb *pcb, err = kern_nexus_attr_create(&attr); IPSEC_IF_VERIFY(err == 0); if (err != 0) { - printf("%s: kern_nexus_attr_create failed: %d\n", + os_log_error(OS_LOG_DEFAULT, "%s: kern_nexus_attr_create failed: %d\n", __func__, err); goto failed; } - err = kern_nexus_attr_set(attr, NEXUS_ATTR_EXTENSIONS, subtype); - VERIFY(err == 0); - uint64_t slot_buffer_size = pcb->ipsec_slot_size; err = kern_nexus_attr_set(attr, NEXUS_ATTR_SLOT_BUF_SIZE, slot_buffer_size); VERIFY(err == 0); @@ -1826,7 +2103,7 @@ ipsec_create_fs_provider_and_instance(struct ipsec_pcb *pcb, attr = NULL; IPSEC_IF_VERIFY(err == 0); if (err != 0) { - printf("%s register %s provider failed, error %d\n", + os_log_error(OS_LOG_DEFAULT, "%s register %s provider failed, error %d\n", __func__, type_name, err); goto failed; } @@ -1838,7 +2115,7 @@ ipsec_create_fs_provider_and_instance(struct ipsec_pcb *pcb, instance, &init); IPSEC_IF_VERIFY(err == 0); if (err != 0) { - printf("%s alloc_provider_instance %s failed, %d\n", + os_log_error(OS_LOG_DEFAULT, "%s alloc_provider_instance %s failed, %d\n", __func__, type_name, err); kern_nexus_controller_deregister_provider(controller, *provider); @@ -1849,62 +2126,56 @@ failed: } static errno_t -ipsec_multistack_attach(struct ipsec_pcb *pcb) +ipsec_flowswitch_attach(struct ipsec_pcb *pcb) { nexus_controller_t controller = kern_nexus_shared_controller(); errno_t err = 0; ipsec_nx_t nx = &pcb->ipsec_nx; - // Allocate multistack flowswitch + // Allocate flowswitch err = ipsec_create_fs_provider_and_instance(pcb, - NEXUS_EXTENSION_FSW_TYPE_MULTISTACK, - "multistack", + "flowswitch", pcb->ipsec_ifp->if_xname, - &nx->ms_provider, - &nx->ms_instance); + &nx->fsw_provider, + &nx->fsw_instance); if (err != 0) { - printf("%s: failed to create bridge provider and instance\n", + os_log_error(OS_LOG_DEFAULT, "%s: failed to create bridge provider and instance\n", __func__); goto failed; } - // Attach multistack to device port - err = kern_nexus_ifattach(controller, nx->ms_instance, + // Attach flowswitch to device port + err = kern_nexus_ifattach(controller, nx->fsw_instance, NULL, nx->if_instance, - FALSE, &nx->ms_device); + FALSE, &nx->fsw_device); if (err != 0) { - printf("%s kern_nexus_ifattach ms device %d\n", __func__, err); + os_log_error(OS_LOG_DEFAULT, "%s kern_nexus_ifattach ms device %d\n", __func__, err); goto failed; } - // Attach multistack to host port - err = kern_nexus_ifattach(controller, nx->ms_instance, + // Attach flowswitch to host port + err = kern_nexus_ifattach(controller, nx->fsw_instance, NULL, nx->if_instance, - TRUE, &nx->ms_host); + TRUE, &nx->fsw_host); if (err != 0) { - printf("%s kern_nexus_ifattach ms host %d\n", __func__, err); + os_log_error(OS_LOG_DEFAULT, "%s kern_nexus_ifattach ms host %d\n", __func__, err); goto failed; } // Extract the agent UUID and save for later - struct kern_nexus *multistack_nx = nx_find(nx->ms_instance, false); - if (multistack_nx != NULL) { - struct nx_flowswitch *flowswitch = NX_FSW_PRIVATE(multistack_nx); + struct kern_nexus *flowswitch_nx = nx_find(nx->fsw_instance, false); + if (flowswitch_nx != NULL) { + struct nx_flowswitch *flowswitch = NX_FSW_PRIVATE(flowswitch_nx); if (flowswitch != NULL) { FSW_RLOCK(flowswitch); - struct fsw_ms_context *ms_context = (struct fsw_ms_context *)flowswitch->fsw_ops_private; - if (ms_context != NULL) { - uuid_copy(nx->ms_agent, ms_context->mc_agent_uuid); - } else { - printf("ipsec_multistack_attach - fsw_ms_context is NULL\n"); - } + uuid_copy(nx->fsw_agent, flowswitch->fsw_agent_uuid); FSW_UNLOCK(flowswitch); } else { - printf("ipsec_multistack_attach - flowswitch is NULL\n"); + os_log_error(OS_LOG_DEFAULT, "ipsec_flowswitch_attach - flowswitch is NULL\n"); } - nx_release(multistack_nx); + nx_release(flowswitch_nx); } else { - printf("ipsec_multistack_attach - unable to find multistack nexus\n"); + os_log_error(OS_LOG_DEFAULT, "ipsec_flowswitch_attach - unable to find flowswitch nexus\n"); } return 0; @@ -1914,7 +2185,7 @@ failed: errno_t detach_error = 0; if ((detach_error = ifnet_detach(pcb->ipsec_ifp)) != 0) { - panic("ipsec_multistack_attach - ifnet_detach failed: %d\n", detach_error); + panic("ipsec_flowswitch_attach - ifnet_detach failed: %d\n", detach_error); /* NOT REACHED */ } @@ -1924,7 +2195,7 @@ failed: #pragma mark Kernel Pipe Nexus static errno_t -ipsec_register_kernel_pipe_nexus(void) +ipsec_register_kernel_pipe_nexus(struct ipsec_pcb *pcb) { nexus_attr_t nxa = NULL; errno_t result; @@ -1937,16 +2208,16 @@ ipsec_register_kernel_pipe_nexus(void) result = kern_nexus_controller_create(&ipsec_ncd); if (result) { - printf("%s: kern_nexus_controller_create failed: %d\n", + os_log_error(OS_LOG_DEFAULT, "%s: kern_nexus_controller_create failed: %d\n", __FUNCTION__, result); goto done; } uuid_t dom_prov; - result = kern_nexus_get_builtin_domain_provider( + result = kern_nexus_get_default_domain_provider( NEXUS_TYPE_KERNEL_PIPE, &dom_prov); if (result) { - printf("%s: kern_nexus_get_builtin_domain_provider failed: %d\n", + os_log_error(OS_LOG_DEFAULT, "%s: kern_nexus_get_default_domain_provider failed: %d\n", __FUNCTION__, result); goto done; } @@ -1969,7 +2240,7 @@ ipsec_register_kernel_pipe_nexus(void) result = kern_nexus_attr_create(&nxa); if (result) { - printf("%s: kern_nexus_attr_create failed: %d\n", + os_log_error(OS_LOG_DEFAULT, "%s: kern_nexus_attr_create failed: %d\n", __FUNCTION__, result); goto done; } @@ -1979,9 +2250,19 @@ ipsec_register_kernel_pipe_nexus(void) VERIFY(result == 0); // Reset ring size for kernel pipe nexus to limit memory usage - uint64_t ring_size = if_ipsec_ring_size; + // Note: It's better to have less on slots on the kpipe TX ring than the netif + // so back pressure is applied at the AQM layer + uint64_t ring_size = + pcb->ipsec_kpipe_tx_ring_size != 0 ? pcb->ipsec_kpipe_tx_ring_size : + pcb->ipsec_netif_ring_size != 0 ? pcb->ipsec_netif_ring_size : + if_ipsec_ring_size; result = kern_nexus_attr_set(nxa, NEXUS_ATTR_TX_SLOTS, ring_size); VERIFY(result == 0); + + ring_size = + pcb->ipsec_kpipe_rx_ring_size != 0 ? pcb->ipsec_kpipe_rx_ring_size : + pcb->ipsec_netif_ring_size != 0 ? pcb->ipsec_netif_ring_size : + if_ipsec_ring_size; result = kern_nexus_attr_set(nxa, NEXUS_ATTR_RX_SLOTS, ring_size); VERIFY(result == 0); @@ -1993,7 +2274,7 @@ ipsec_register_kernel_pipe_nexus(void) nxa, &ipsec_kpipe_uuid); if (result) { - printf("%s: kern_nexus_controller_register_provider failed: %d\n", + os_log_error(OS_LOG_DEFAULT, "%s: kern_nexus_controller_register_provider failed: %d\n", __FUNCTION__, result); goto done; } @@ -2031,41 +2312,74 @@ ipsec_unregister_kernel_pipe_nexus(void) lck_mtx_unlock(&ipsec_lock); } -// For use by socket option, not internally -static errno_t -ipsec_disable_channel(struct ipsec_pcb *pcb) -{ - errno_t result; - int enabled; - uuid_t uuid; +/* This structure only holds onto kpipe channels that need to be + * freed in the future, but are cleared from the pcb under lock + */ +struct ipsec_detached_channels { + int count; + kern_pbufpool_t pp; + uuid_t uuids[IPSEC_IF_MAX_RING_COUNT]; +}; - lck_rw_lock_exclusive(&pcb->ipsec_pcb_lock); +static void +ipsec_detach_channels(struct ipsec_pcb *pcb, struct ipsec_detached_channels *dc) +{ + LCK_RW_ASSERT(&pcb->ipsec_pcb_lock, LCK_RW_TYPE_EXCLUSIVE); - enabled = pcb->ipsec_kpipe_enabled; - uuid_copy(uuid, pcb->ipsec_kpipe_uuid); + if (!ipsec_flag_isset(pcb, IPSEC_FLAGS_KPIPE_ALLOCATED)) { + for (int i = 0; i < IPSEC_IF_MAX_RING_COUNT; i++) { + VERIFY(uuid_is_null(pcb->ipsec_kpipe_uuid[i])); + } + dc->count = 0; + return; + } - VERIFY(uuid_is_null(pcb->ipsec_kpipe_uuid) == !enabled); + dc->count = pcb->ipsec_kpipe_count; - pcb->ipsec_kpipe_enabled = 0; - uuid_clear(pcb->ipsec_kpipe_uuid); + VERIFY(dc->count >= 0); + VERIFY(dc->count <= IPSEC_IF_MAX_RING_COUNT); - lck_rw_unlock_exclusive(&pcb->ipsec_pcb_lock); + for (int i = 0; i < dc->count; i++) { + VERIFY(!uuid_is_null(pcb->ipsec_kpipe_uuid[i])); + uuid_copy(dc->uuids[i], pcb->ipsec_kpipe_uuid[i]); + uuid_clear(pcb->ipsec_kpipe_uuid[i]); + } + for (int i = dc->count; i < IPSEC_IF_MAX_RING_COUNT; i++) { + VERIFY(uuid_is_null(pcb->ipsec_kpipe_uuid[i])); + } - if (enabled) { - result = kern_nexus_controller_free_provider_instance(ipsec_ncd, uuid); + if (dc->count) { + VERIFY(pcb->ipsec_kpipe_pp); } else { - result = ENXIO; + VERIFY(!pcb->ipsec_kpipe_pp); } - if (!result) { - if (pcb->ipsec_kpipe_pp != NULL) { - kern_pbufpool_destroy(pcb->ipsec_kpipe_pp); - pcb->ipsec_kpipe_pp = NULL; - } - ipsec_unregister_kernel_pipe_nexus(); + dc->pp = pcb->ipsec_kpipe_pp; + + pcb->ipsec_kpipe_pp = NULL; + + ipsec_flag_clr(pcb, IPSEC_FLAGS_KPIPE_ALLOCATED); +} + +static void +ipsec_free_channels(struct ipsec_detached_channels *dc) +{ + if (!dc->count) { + return; } - return result; + for (int i = 0; i < dc->count; i++) { + errno_t result; + result = kern_nexus_controller_free_provider_instance(ipsec_ncd, dc->uuids[i]); + VERIFY(!result); + } + + VERIFY(dc->pp); + kern_pbufpool_destroy(dc->pp); + + ipsec_unregister_kernel_pipe_nexus(); + + memset(dc, 0, sizeof(*dc)); } static errno_t @@ -2081,65 +2395,76 @@ ipsec_enable_channel(struct ipsec_pcb *pcb, struct proc *proc) return result; } - result = ipsec_register_kernel_pipe_nexus(); - if (result) { - return result; - } + VERIFY(pcb->ipsec_kpipe_count); + VERIFY(!ipsec_flag_isset(pcb, IPSEC_FLAGS_KPIPE_ALLOCATED)); - VERIFY(ipsec_ncd); + result = ipsec_register_kernel_pipe_nexus(pcb); lck_rw_lock_exclusive(&pcb->ipsec_pcb_lock); - /* ipsec driver doesn't support channels without a netif */ - if (!pcb->ipsec_use_netif) { - result = EOPNOTSUPP; + if (result) { + os_log_error(OS_LOG_DEFAULT, "%s: %s failed to register kernel pipe nexus\n", + __func__, pcb->ipsec_if_xname); goto done; } - if (pcb->ipsec_kpipe_enabled) { - result = EEXIST; // return success instead? - goto done; - } + VERIFY(ipsec_ncd); bzero(&pp_init, sizeof(pp_init)); pp_init.kbi_version = KERN_PBUFPOOL_CURRENT_VERSION; - pp_init.kbi_packets = pcb->ipsec_netif_ring_size * 2; + pp_init.kbi_flags |= KBIF_VIRTUAL_DEVICE; + // Note: We only needs are many packets as can be held in the tx and rx rings + pp_init.kbi_packets = pcb->ipsec_netif_ring_size * 2 * pcb->ipsec_kpipe_count; pp_init.kbi_bufsize = pcb->ipsec_slot_size; pp_init.kbi_buf_seg_size = IPSEC_IF_DEFAULT_BUF_SEG_SIZE; pp_init.kbi_max_frags = 1; pp_init.kbi_flags |= KBIF_QUANTUM; (void) snprintf((char *)pp_init.kbi_name, sizeof(pp_init.kbi_name), "com.apple.kpipe.%s", pcb->ipsec_if_xname); + pp_init.kbi_ctx = NULL; + pp_init.kbi_ctx_retain = NULL; + pp_init.kbi_ctx_release = NULL; - result = kern_pbufpool_create(&pp_init, &pp_init, &pcb->ipsec_kpipe_pp, + result = kern_pbufpool_create(&pp_init, &pcb->ipsec_kpipe_pp, NULL); if (result != 0) { - printf("%s pbufbool create failed, error %d\n", __func__, result); + os_log_error(OS_LOG_DEFAULT, "%s: %s pbufbool create failed, error %d\n", + __func__, pcb->ipsec_if_xname, result); goto done; } - VERIFY(uuid_is_null(pcb->ipsec_kpipe_uuid)); bzero(&init, sizeof(init)); init.nxi_version = KERN_NEXUS_CURRENT_VERSION; init.nxi_tx_pbufpool = pcb->ipsec_kpipe_pp; - result = kern_nexus_controller_alloc_provider_instance(ipsec_ncd, - ipsec_kpipe_uuid, pcb, &pcb->ipsec_kpipe_uuid, &init); - if (result) { - goto done; - } - nexus_port_t port = NEXUS_PORT_KERNEL_PIPE_CLIENT; - result = kern_nexus_controller_bind_provider_instance(ipsec_ncd, - pcb->ipsec_kpipe_uuid, &port, - proc_pid(proc), NULL, NULL, 0, NEXUS_BIND_PID); - if (result) { - kern_nexus_controller_free_provider_instance(ipsec_ncd, - pcb->ipsec_kpipe_uuid); - uuid_clear(pcb->ipsec_kpipe_uuid); - goto done; - } + for (unsigned int i = 0; i < pcb->ipsec_kpipe_count; i++) { + VERIFY(uuid_is_null(pcb->ipsec_kpipe_uuid[i])); + result = kern_nexus_controller_alloc_provider_instance(ipsec_ncd, + ipsec_kpipe_uuid, pcb, &pcb->ipsec_kpipe_uuid[i], &init); + + if (result == 0) { + nexus_port_t port = NEXUS_PORT_KERNEL_PIPE_CLIENT; + pid_t pid = pcb->ipsec_kpipe_pid; + if (!pid) { + pid = proc_pid(proc); + } + result = kern_nexus_controller_bind_provider_instance(ipsec_ncd, + pcb->ipsec_kpipe_uuid[i], &port, + pid, NULL, NULL, 0, NEXUS_BIND_PID); + } - pcb->ipsec_kpipe_enabled = 1; + if (result) { + /* Unwind all of them on error */ + for (int j = 0; j < IPSEC_IF_MAX_RING_COUNT; j++) { + if (!uuid_is_null(pcb->ipsec_kpipe_uuid[j])) { + kern_nexus_controller_free_provider_instance(ipsec_ncd, + pcb->ipsec_kpipe_uuid[j]); + uuid_clear(pcb->ipsec_kpipe_uuid[j]); + } + } + goto done; + } + } done: lck_rw_unlock_exclusive(&pcb->ipsec_pcb_lock); @@ -2150,6 +2475,8 @@ done: pcb->ipsec_kpipe_pp = NULL; } ipsec_unregister_kernel_pipe_nexus(); + } else { + ipsec_flag_set(pcb, IPSEC_FLAGS_KPIPE_ALLOCATED); } return result; @@ -2165,8 +2492,12 @@ ipsec_free_pcb(struct ipsec_pcb *pcb, bool in_list) { #if IPSEC_NEXUS mbuf_freem_list(pcb->ipsec_input_chain); + pcb->ipsec_input_chain_count = 0; lck_mtx_destroy(&pcb->ipsec_input_chain_lock, ipsec_lck_grp); + lck_mtx_destroy(&pcb->ipsec_kpipe_encrypt_lock, ipsec_lck_grp); + lck_mtx_destroy(&pcb->ipsec_kpipe_decrypt_lock, ipsec_lck_grp); #endif // IPSEC_NEXUS + lck_mtx_destroy(&pcb->ipsec_pcb_data_move_lock, ipsec_lck_grp); lck_rw_destroy(&pcb->ipsec_pcb_lock, ipsec_lck_grp); if (in_list) { lck_mtx_lock(&ipsec_lock); @@ -2193,14 +2524,18 @@ ipsec_ctl_bind(kern_ctl_ref kctlref, #if IPSEC_NEXUS pcb->ipsec_use_netif = false; pcb->ipsec_slot_size = IPSEC_IF_DEFAULT_SLOT_SIZE; - pcb->ipsec_netif_ring_size = IPSEC_IF_DEFAULT_RING_SIZE; - pcb->ipsec_tx_fsw_ring_size = IPSEC_IF_DEFAULT_TX_FSW_RING_SIZE; - pcb->ipsec_rx_fsw_ring_size = IPSEC_IF_DEFAULT_RX_FSW_RING_SIZE; + pcb->ipsec_netif_ring_size = if_ipsec_ring_size; + pcb->ipsec_tx_fsw_ring_size = if_ipsec_tx_fsw_ring_size; + pcb->ipsec_rx_fsw_ring_size = if_ipsec_rx_fsw_ring_size; #endif // IPSEC_NEXUS lck_rw_init(&pcb->ipsec_pcb_lock, ipsec_lck_grp, ipsec_lck_attr); + lck_mtx_init(&pcb->ipsec_pcb_data_move_lock, ipsec_lck_grp, ipsec_lck_attr); #if IPSEC_NEXUS + pcb->ipsec_input_chain_count = 0; lck_mtx_init(&pcb->ipsec_input_chain_lock, ipsec_lck_grp, ipsec_lck_attr); + lck_mtx_init(&pcb->ipsec_kpipe_encrypt_lock, ipsec_lck_grp, ipsec_lck_attr); + lck_mtx_init(&pcb->ipsec_kpipe_decrypt_lock, ipsec_lck_grp, ipsec_lck_attr); #endif // IPSEC_NEXUS return 0; @@ -2259,7 +2594,7 @@ ipsec_ctl_connect(kern_ctl_ref kctlref, snprintf(pcb->ipsec_if_xname, sizeof(pcb->ipsec_if_xname), "ipsec%d", pcb->ipsec_unit - 1); snprintf(pcb->ipsec_unique_name, sizeof(pcb->ipsec_unique_name), "ipsecid%d", pcb->ipsec_unique_id - 1); - printf("ipsec_ctl_connect: creating interface %s (id %s)\n", pcb->ipsec_if_xname, pcb->ipsec_unique_name); + os_log(OS_LOG_DEFAULT, "ipsec_ctl_connect: creating interface %s (id %s)\n", pcb->ipsec_if_xname, pcb->ipsec_unique_name); /* Create the interface */ bzero(&ipsec_init, sizeof(ipsec_init)); @@ -2279,8 +2614,7 @@ ipsec_ctl_connect(kern_ctl_ref kctlref, ipsec_init.unit = pcb->ipsec_unit - 1; ipsec_init.uniqueid = pcb->ipsec_unique_name; ipsec_init.uniqueid_len = strlen(pcb->ipsec_unique_name); - ipsec_init.family = ipsec_family; - ipsec_init.subfamily = IFNET_SUBFAMILY_IPSEC; + ipsec_init.family = IFNET_FAMILY_IPSEC; ipsec_init.type = IFT_OTHER; ipsec_init.demux = ipsec_demux; ipsec_init.add_proto = ipsec_add_proto; @@ -2290,18 +2624,52 @@ ipsec_ctl_connect(kern_ctl_ref kctlref, ipsec_init.detach = ipsec_detached; #if IPSEC_NEXUS + /* We don't support kpipes without a netif */ + if (pcb->ipsec_kpipe_count && !pcb->ipsec_use_netif) { + result = ENOTSUP; + os_log_error(OS_LOG_DEFAULT, "ipsec_ctl_connect - kpipe requires netif: failed %d\n", result); + ipsec_free_pcb(pcb, true); + *unitinfo = NULL; + return result; + } + + if (if_ipsec_debug != 0) { + printf("%s: %s%d use_netif %d kpipe_count %d slot_size %u ring_size %u " + "kpipe_tx_ring_size %u kpipe_rx_ring_size %u\n", + __func__, + ipsec_init.name, ipsec_init.unit, + pcb->ipsec_use_netif, + pcb->ipsec_kpipe_count, + pcb->ipsec_slot_size, + pcb->ipsec_netif_ring_size, + pcb->ipsec_kpipe_tx_ring_size, + pcb->ipsec_kpipe_rx_ring_size); + } if (pcb->ipsec_use_netif) { + if (pcb->ipsec_kpipe_count) { + result = ipsec_enable_channel(pcb, current_proc()); + if (result) { + os_log_error(OS_LOG_DEFAULT, "%s: %s failed to enable channels\n", + __func__, pcb->ipsec_if_xname); + ipsec_free_pcb(pcb, true); + *unitinfo = NULL; + return result; + } + } + result = ipsec_nexus_ifattach(pcb, &ipsec_init, &pcb->ipsec_ifp); if (result != 0) { - printf("ipsec_ctl_connect - ipsec_nexus_ifattach failed: %d\n", result); + os_log_error(OS_LOG_DEFAULT, "ipsec_ctl_connect - ipsec_nexus_ifattach failed: %d\n", result); ipsec_free_pcb(pcb, true); *unitinfo = NULL; return result; } - result = ipsec_multistack_attach(pcb); + result = ipsec_flowswitch_attach(pcb); if (result != 0) { - printf("ipsec_ctl_connect - ipsec_multistack_attach failed: %d\n", result); + os_log_error(OS_LOG_DEFAULT, "ipsec_ctl_connect - ipsec_flowswitch_attach failed: %d\n", result); + // Do not call ipsec_free_pcb(). We will be attached already, and will be freed later + // in ipsec_detached(). *unitinfo = NULL; return result; } @@ -2313,7 +2681,7 @@ ipsec_ctl_connect(kern_ctl_ref kctlref, { result = ifnet_allocate_extended(&ipsec_init, &pcb->ipsec_ifp); if (result != 0) { - printf("ipsec_ctl_connect - ifnet_allocate failed: %d\n", result); + os_log_error(OS_LOG_DEFAULT, "ipsec_ctl_connect - ifnet_allocate failed: %d\n", result); ipsec_free_pcb(pcb, true); *unitinfo = NULL; return result; @@ -2323,7 +2691,7 @@ ipsec_ctl_connect(kern_ctl_ref kctlref, /* Attach the interface */ result = ifnet_attach(pcb->ipsec_ifp, NULL); if (result != 0) { - printf("ipsec_ctl_connect - ifnet_attach failed: %d\n", result); + os_log_error(OS_LOG_DEFAULT, "ipsec_ctl_connect - ifnet_attach failed: %d\n", result); ifnet_release(pcb->ipsec_ifp); ipsec_free_pcb(pcb, true); *unitinfo = NULL; @@ -2334,6 +2702,16 @@ ipsec_ctl_connect(kern_ctl_ref kctlref, bpfattach(pcb->ipsec_ifp, DLT_NULL, 0); } + /* + * Mark the data path as ready. + * If kpipe nexus is being used then the data path is marked ready only when a kpipe channel is connected. + */ + if (pcb->ipsec_kpipe_count == 0) { + lck_mtx_lock(&pcb->ipsec_pcb_data_move_lock); + IPSEC_SET_DATA_PATH_READY(pcb); + lck_mtx_unlock(&pcb->ipsec_pcb_data_move_lock); + } + /* The interfaces resoures allocated, mark it as running */ ifnet_set_flags(pcb->ipsec_ifp, IFF_RUNNING, IFF_RUNNING); @@ -2386,11 +2764,11 @@ ipsec_remove_address(ifnet_t interface, ifnet_name(interface), ifnet_unit(interface)); result = ifaddr_address(address, &ifr.ifr_addr, sizeof(ifr.ifr_addr)); if (result != 0) { - printf("ipsec_remove_address - ifaddr_address failed: %d", result); + os_log_error(OS_LOG_DEFAULT, "ipsec_remove_address - ifaddr_address failed: %d", result); } else { result = sock_ioctl(pf_socket, SIOCDIFADDR, &ifr); if (result != 0) { - printf("ipsec_remove_address - SIOCDIFADDR failed: %d", result); + os_log_error(OS_LOG_DEFAULT, "ipsec_remove_address - SIOCDIFADDR failed: %d", result); } } } else if (protocol == PF_INET6) { @@ -2402,12 +2780,12 @@ ipsec_remove_address(ifnet_t interface, result = ifaddr_address(address, (struct sockaddr*)&ifr6.ifr_addr, sizeof(ifr6.ifr_addr)); if (result != 0) { - printf("ipsec_remove_address - ifaddr_address failed (v6): %d", + os_log_error(OS_LOG_DEFAULT, "ipsec_remove_address - ifaddr_address failed (v6): %d", result); } else { result = sock_ioctl(pf_socket, SIOCDIFADDR_IN6, &ifr6); if (result != 0) { - printf("ipsec_remove_address - SIOCDIFADDR_IN6 failed: %d", + os_log_error(OS_LOG_DEFAULT, "ipsec_remove_address - SIOCDIFADDR_IN6 failed: %d", result); } } @@ -2424,7 +2802,7 @@ ipsec_cleanup_family(ifnet_t interface, int i; if (protocol != PF_INET && protocol != PF_INET6) { - printf("ipsec_cleanup_family - invalid protocol family %d\n", protocol); + os_log_error(OS_LOG_DEFAULT, "ipsec_cleanup_family - invalid protocol family %d\n", protocol); return; } @@ -2432,7 +2810,7 @@ ipsec_cleanup_family(ifnet_t interface, result = sock_socket(protocol, SOCK_DGRAM, 0, NULL, NULL, &pf_socket); if (result != 0) { if (result != EAFNOSUPPORT) { - printf("ipsec_cleanup_family - failed to create %s socket: %d\n", + os_log_error(OS_LOG_DEFAULT, "ipsec_cleanup_family - failed to create %s socket: %d\n", protocol == PF_INET ? "IP" : "IPv6", result); } goto cleanup; @@ -2447,7 +2825,7 @@ ipsec_cleanup_family(ifnet_t interface, goto cleanup; } else if (result != EBUSY) { /* Uh, not really sure what happened here... */ - printf("ipsec_cleanup_family - ipsec_detach_ip failed: %d\n", result); + os_log_error(OS_LOG_DEFAULT, "ipsec_cleanup_family - ipsec_detach_ip failed: %d\n", result); goto cleanup; } @@ -2457,7 +2835,7 @@ ipsec_cleanup_family(ifnet_t interface, */ result = ifnet_get_address_list_family(interface, &addresses, protocol); if (result != 0) { - printf("fnet_get_address_list_family(%s%d, 0xblah, %s) - failed: %d\n", + os_log_error(OS_LOG_DEFAULT, "fnet_get_address_list_family(%s%d, 0xblah, %s) - failed: %d\n", ifnet_name(interface), ifnet_unit(interface), protocol == PF_INET ? "PF_INET" : "PF_INET6", result); goto cleanup; @@ -2474,7 +2852,7 @@ ipsec_cleanup_family(ifnet_t interface, */ result = ipsec_detach_ip(interface, protocol, pf_socket); if (result != 0 && result != ENXIO) { - printf("ipsec_cleanup_family - ipsec_detach_ip failed: %d\n", result); + os_log_error(OS_LOG_DEFAULT, "ipsec_cleanup_family - ipsec_detach_ip failed: %d\n", result); } cleanup: @@ -2500,6 +2878,9 @@ ipsec_ctl_disconnect(__unused kern_ctl_ref kctlref, return EINVAL; } + /* Wait until all threads in the data paths are done. */ + ipsec_wait_data_move_drain(pcb); + #if IPSEC_NEXUS // Tell the nexus to stop all rings if (pcb->ipsec_netif_nexus != NULL) { @@ -2510,10 +2891,13 @@ ipsec_ctl_disconnect(__unused kern_ctl_ref kctlref, lck_rw_lock_exclusive(&pcb->ipsec_pcb_lock); #if IPSEC_NEXUS - uuid_t kpipe_uuid; - uuid_copy(kpipe_uuid, pcb->ipsec_kpipe_uuid); - uuid_clear(pcb->ipsec_kpipe_uuid); - pcb->ipsec_kpipe_enabled = FALSE; + if (if_ipsec_debug != 0) { + printf("ipsec_ctl_disconnect: detaching interface %s (id %s)\n", + pcb->ipsec_if_xname, pcb->ipsec_unique_name); + } + + struct ipsec_detached_channels dc; + ipsec_detach_channels(pcb, &dc); #endif // IPSEC_NEXUS pcb->ipsec_ctlref = NULL; @@ -2547,15 +2931,8 @@ ipsec_ctl_disconnect(__unused kern_ctl_ref kctlref, lck_rw_unlock_exclusive(&pcb->ipsec_pcb_lock); - if (!uuid_is_null(kpipe_uuid)) { - if (kern_nexus_controller_free_provider_instance(ipsec_ncd, kpipe_uuid) == 0) { - if (pcb->ipsec_kpipe_pp != NULL) { - kern_pbufpool_destroy(pcb->ipsec_kpipe_pp); - pcb->ipsec_kpipe_pp = NULL; - } - ipsec_unregister_kernel_pipe_nexus(); - } - } + ipsec_free_channels(&dc); + ipsec_nexus_detach(pcb); /* Decrement refcnt to finish detaching and freeing */ @@ -2566,15 +2943,7 @@ ipsec_ctl_disconnect(__unused kern_ctl_ref kctlref, lck_rw_unlock_exclusive(&pcb->ipsec_pcb_lock); #if IPSEC_NEXUS - if (!uuid_is_null(kpipe_uuid)) { - if (kern_nexus_controller_free_provider_instance(ipsec_ncd, kpipe_uuid) == 0) { - if (pcb->ipsec_kpipe_pp != NULL) { - kern_pbufpool_destroy(pcb->ipsec_kpipe_pp); - pcb->ipsec_kpipe_pp = NULL; - } - ipsec_unregister_kernel_pipe_nexus(); - } - } + ipsec_free_channels(&dc); #endif // IPSEC_NEXUS /* @@ -2594,7 +2963,7 @@ ipsec_ctl_disconnect(__unused kern_ctl_ref kctlref, * ifnet_release(). */ if ((result = ifnet_detach(ifp)) != 0) { - printf("ipsec_ctl_disconnect - ifnet_detach failed: %d\n", result); + os_log_error(OS_LOG_DEFAULT, "ipsec_ctl_disconnect - ifnet_detach failed: %d\n", result); } } } else { @@ -2642,15 +3011,16 @@ ipsec_ctl_setopt(__unused kern_ctl_ref kctlref, } switch (opt) { - case IPSEC_OPT_FLAGS: + case IPSEC_OPT_FLAGS: { if (len != sizeof(u_int32_t)) { result = EMSGSIZE; } else { - pcb->ipsec_flags = *(u_int32_t *)data; + pcb->ipsec_external_flags = *(u_int32_t *)data; } break; + } - case IPSEC_OPT_EXT_IFDATA_STATS: + case IPSEC_OPT_EXT_IFDATA_STATS: { if (len != sizeof(int)) { result = EMSGSIZE; break; @@ -2662,6 +3032,7 @@ ipsec_ctl_setopt(__unused kern_ctl_ref kctlref, } pcb->ipsec_ext_ifdata_stats = (*(int *)data) ? 1 : 0; break; + } case IPSEC_OPT_INC_IFDATA_STATS_IN: case IPSEC_OPT_INC_IFDATA_STATS_OUT: { @@ -2691,8 +3062,8 @@ ipsec_ctl_setopt(__unused kern_ctl_ref kctlref, } case IPSEC_OPT_SET_DELEGATE_INTERFACE: { - ifnet_t del_ifp = NULL; - char name[IFNAMSIZ]; + ifnet_t del_ifp = NULL; + char name[IFNAMSIZ]; if (len > IFNAMSIZ - 1) { result = EMSGSIZE; @@ -2703,13 +3074,13 @@ ipsec_ctl_setopt(__unused kern_ctl_ref kctlref, result = EINVAL; break; } - if (len != 0) { /* if len==0, del_ifp will be NULL causing the delegate to be removed */ + if (len != 0) { /* if len==0, del_ifp will be NULL causing the delegate to be removed */ bcopy(data, name, len); name[len] = 0; result = ifnet_find_by_name(name, &del_ifp); } if (result == 0) { - printf("%s IPSEC_OPT_SET_DELEGATE_INTERFACE %s to %s\n", + os_log_error(OS_LOG_DEFAULT, "%s IPSEC_OPT_SET_DELEGATE_INTERFACE %s to %s\n", __func__, pcb->ipsec_ifp->if_xname, del_ifp ? del_ifp->if_xname : "NULL"); @@ -2737,7 +3108,7 @@ ipsec_ctl_setopt(__unused kern_ctl_ref kctlref, } else { pcb->ipsec_output_service_class = output_service_class; } - printf("%s IPSEC_OPT_OUTPUT_TRAFFIC_CLASS %s svc %d\n", + os_log_error(OS_LOG_DEFAULT, "%s IPSEC_OPT_OUTPUT_TRAFFIC_CLASS %s svc %d\n", __func__, pcb->ipsec_ifp->if_xname, pcb->ipsec_output_service_class); break; @@ -2749,16 +3120,36 @@ ipsec_ctl_setopt(__unused kern_ctl_ref kctlref, result = EMSGSIZE; break; } - if (pcb->ipsec_ifp == NULL) { - // Only can set after connecting + if (pcb->ipsec_ifp != NULL) { + // Only can set before connecting result = EINVAL; break; } - if (*(int *)data) { - result = ipsec_enable_channel(pcb, current_proc()); - } else { - result = ipsec_disable_channel(pcb); + if ((*(int *)data) != 0 && + (*(int *)data) != 1 && + (*(int *)data) != IPSEC_IF_WMM_RING_COUNT) { + result = EINVAL; + break; } + lck_rw_lock_exclusive(&pcb->ipsec_pcb_lock); + pcb->ipsec_kpipe_count = *(int *)data; + lck_rw_unlock_exclusive(&pcb->ipsec_pcb_lock); + break; + } + + case IPSEC_OPT_CHANNEL_BIND_PID: { + if (len != sizeof(pid_t)) { + result = EMSGSIZE; + break; + } + if (pcb->ipsec_ifp != NULL) { + // Only can set before connecting + result = EINVAL; + break; + } + lck_rw_lock_exclusive(&pcb->ipsec_pcb_lock); + pcb->ipsec_kpipe_pid = *(pid_t *)data; + lck_rw_unlock_exclusive(&pcb->ipsec_pcb_lock); break; } @@ -2772,21 +3163,27 @@ ipsec_ctl_setopt(__unused kern_ctl_ref kctlref, result = EINVAL; break; } - if (!if_is_netagent_enabled()) { + if (!if_is_fsw_transport_netagent_enabled()) { result = ENOTSUP; break; } - if (uuid_is_null(pcb->ipsec_nx.ms_agent)) { + if (uuid_is_null(pcb->ipsec_nx.fsw_agent)) { result = ENOENT; break; } + uint32_t flags = netagent_get_flags(pcb->ipsec_nx.fsw_agent); + if (*(int *)data) { - if_add_netagent(pcb->ipsec_ifp, pcb->ipsec_nx.ms_agent); + flags |= (NETAGENT_FLAG_NEXUS_PROVIDER | + NETAGENT_FLAG_NEXUS_LISTENER); + result = netagent_set_flags(pcb->ipsec_nx.fsw_agent, flags); pcb->ipsec_needs_netagent = true; } else { pcb->ipsec_needs_netagent = false; - if_delete_netagent(pcb->ipsec_ifp, pcb->ipsec_nx.ms_agent); + flags &= ~(NETAGENT_FLAG_NEXUS_PROVIDER | + NETAGENT_FLAG_NEXUS_LISTENER); + result = netagent_set_flags(pcb->ipsec_nx.fsw_agent, flags); } break; } @@ -2801,7 +3198,6 @@ ipsec_ctl_setopt(__unused kern_ctl_ref kctlref, pcb->ipsec_frag_size_set = FALSE; pcb->ipsec_input_frag_size = 0; } else { - printf("SET FRAG SIZE TO %u\n", input_frag_size); pcb->ipsec_frag_size_set = TRUE; pcb->ipsec_input_frag_size = input_frag_size; } @@ -2838,6 +3234,9 @@ ipsec_ctl_setopt(__unused kern_ctl_ref kctlref, return EINVAL; } pcb->ipsec_slot_size = slot_size; + if (if_ipsec_debug != 0) { + printf("%s: IPSEC_OPT_SLOT_SIZE %u\n", __func__, slot_size); + } break; } case IPSEC_OPT_NETIF_RING_SIZE: { @@ -2856,6 +3255,9 @@ ipsec_ctl_setopt(__unused kern_ctl_ref kctlref, return EINVAL; } pcb->ipsec_netif_ring_size = ring_size; + if (if_ipsec_debug != 0) { + printf("%s: IPSEC_OPT_NETIF_RING_SIZE %u\n", __func__, ring_size); + } break; } case IPSEC_OPT_TX_FSW_RING_SIZE: { @@ -2874,6 +3276,9 @@ ipsec_ctl_setopt(__unused kern_ctl_ref kctlref, return EINVAL; } pcb->ipsec_tx_fsw_ring_size = ring_size; + if (if_ipsec_debug != 0) { + printf("%s: IPSEC_OPT_TX_FSW_RING_SIZE %u\n", __func__, ring_size); + } break; } case IPSEC_OPT_RX_FSW_RING_SIZE: { @@ -2892,15 +3297,61 @@ ipsec_ctl_setopt(__unused kern_ctl_ref kctlref, return EINVAL; } pcb->ipsec_rx_fsw_ring_size = ring_size; + if (if_ipsec_debug != 0) { + printf("%s: IPSEC_OPT_TX_FSW_RING_SIZE %u\n", __func__, ring_size); + } + break; + } + case IPSEC_OPT_KPIPE_TX_RING_SIZE: { + if (len != sizeof(u_int32_t)) { + result = EMSGSIZE; + break; + } + if (pcb->ipsec_ifp != NULL) { + // Only can set before connecting + result = EINVAL; + break; + } + u_int32_t ring_size = *(u_int32_t *)data; + if (ring_size < IPSEC_IF_MIN_RING_SIZE || + ring_size > IPSEC_IF_MAX_RING_SIZE) { + return EINVAL; + } + pcb->ipsec_kpipe_tx_ring_size = ring_size; + if (if_ipsec_debug != 0) { + printf("%s: IPSEC_OPT_KPIPE_TX_RING_SIZE %u\n", __func__, ring_size); + } + break; + } + case IPSEC_OPT_KPIPE_RX_RING_SIZE: { + if (len != sizeof(u_int32_t)) { + result = EMSGSIZE; + break; + } + if (pcb->ipsec_ifp != NULL) { + // Only can set before connecting + result = EINVAL; + break; + } + u_int32_t ring_size = *(u_int32_t *)data; + if (ring_size < IPSEC_IF_MIN_RING_SIZE || + ring_size > IPSEC_IF_MAX_RING_SIZE) { + return EINVAL; + } + pcb->ipsec_kpipe_rx_ring_size = ring_size; + if (if_ipsec_debug != 0) { + printf("%s: IPSEC_OPT_KPIPE_RX_RING_SIZE %u\n", __func__, ring_size); + } break; } #endif // IPSEC_NEXUS - default: + default: { result = ENOPROTOOPT; break; } + } return result; } @@ -2921,7 +3372,7 @@ ipsec_ctl_getopt(__unused kern_ctl_ref kctlref, if (*len != sizeof(u_int32_t)) { result = EMSGSIZE; } else { - *(u_int32_t *)data = pcb->ipsec_flags; + *(u_int32_t *)data = pcb->ipsec_external_flags; } break; } @@ -2965,7 +3416,18 @@ ipsec_ctl_getopt(__unused kern_ctl_ref kctlref, result = EMSGSIZE; } else { lck_rw_lock_shared(&pcb->ipsec_pcb_lock); - *(int *)data = pcb->ipsec_kpipe_enabled; + *(int *)data = pcb->ipsec_kpipe_count; + lck_rw_unlock_shared(&pcb->ipsec_pcb_lock); + } + break; + } + + case IPSEC_OPT_CHANNEL_BIND_PID: { + if (*len != sizeof(pid_t)) { + result = EMSGSIZE; + } else { + lck_rw_lock_shared(&pcb->ipsec_pcb_lock); + *(pid_t *)data = pcb->ipsec_kpipe_pid; lck_rw_unlock_shared(&pcb->ipsec_pcb_lock); } break; @@ -2975,7 +3437,7 @@ ipsec_ctl_getopt(__unused kern_ctl_ref kctlref, if (*len != sizeof(int)) { result = EMSGSIZE; } else { - *(int *)data = if_check_netagent(pcb->ipsec_ifp, pcb->ipsec_nx.ms_agent); + *(int *)data = if_check_netagent(pcb->ipsec_ifp, pcb->ipsec_nx.fsw_agent); } break; } @@ -2993,12 +3455,14 @@ ipsec_ctl_getopt(__unused kern_ctl_ref kctlref, case IPSEC_OPT_GET_CHANNEL_UUID: { lck_rw_lock_shared(&pcb->ipsec_pcb_lock); - if (uuid_is_null(pcb->ipsec_kpipe_uuid)) { + if (!ipsec_flag_isset(pcb, IPSEC_FLAGS_KPIPE_ALLOCATED)) { result = ENXIO; - } else if (*len != sizeof(uuid_t)) { + } else if (*len != sizeof(uuid_t) * pcb->ipsec_kpipe_count) { result = EMSGSIZE; } else { - uuid_copy(data, pcb->ipsec_kpipe_uuid); + for (unsigned int i = 0; i < pcb->ipsec_kpipe_count; i++) { + uuid_copy(((uuid_t *)data)[i], pcb->ipsec_kpipe_uuid[i]); + } } lck_rw_unlock_shared(&pcb->ipsec_pcb_lock); break; @@ -3044,6 +3508,22 @@ ipsec_ctl_getopt(__unused kern_ctl_ref kctlref, } break; } + case IPSEC_OPT_KPIPE_TX_RING_SIZE: { + if (*len != sizeof(u_int32_t)) { + result = EMSGSIZE; + } else { + *(u_int32_t *)data = pcb->ipsec_kpipe_tx_ring_size; + } + break; + } + case IPSEC_OPT_KPIPE_RX_RING_SIZE: { + if (*len != sizeof(u_int32_t)) { + result = EMSGSIZE; + } else { + *(u_int32_t *)data = pcb->ipsec_kpipe_rx_ring_size; + } + break; + } #endif // IPSEC_NEXUS @@ -3112,7 +3592,7 @@ ipsec_output(ifnet_t interface, data = ipsec_state.m; if (error || data == NULL) { if (error) { - printf("ipsec_output: ipsec4_output error %d.\n", error); + os_log_error(OS_LOG_DEFAULT, "ipsec_output: ipsec4_output error %d.\n", error); } goto ipsec_output_err; } @@ -3171,7 +3651,7 @@ ipsec_output(ifnet_t interface, data = ipsec6_splithdr(data); if (data == NULL) { - printf("ipsec_output: ipsec6_splithdr returned NULL\n"); + os_log_error(OS_LOG_DEFAULT, "ipsec_output: ipsec6_splithdr returned NULL\n"); goto ipsec_output_err; } @@ -3189,7 +3669,7 @@ ipsec_output(ifnet_t interface, data = ipsec_state.m; if (error || data == NULL) { if (error) { - printf("ipsec_output: ipsec6_output error %d\n", error); + os_log_error(OS_LOG_DEFAULT, "ipsec_output: ipsec6_output error %d\n", error); } goto ipsec_output_err; } @@ -3232,7 +3712,7 @@ ipsec_output(ifnet_t interface, goto done; } default: { - printf("ipsec_output: Received unknown packet version %d.\n", ip_version); + os_log_error(OS_LOG_DEFAULT, "ipsec_output: Received unknown packet version %d.\n", ip_version); error = EINVAL; goto ipsec_output_err; } @@ -3357,6 +3837,30 @@ ipsec_ioctl(ifnet_t interface, /* ifioctl() takes care of it */ break; + case SIOCSIFSUBFAMILY: { + uint32_t subfamily; + + subfamily = ((struct ifreq*)data)->ifr_type.ift_subfamily; + switch (subfamily) { + case IFRTYPE_SUBFAMILY_BLUETOOTH: + interface->if_subfamily = IFNET_SUBFAMILY_BLUETOOTH; + break; + case IFRTYPE_SUBFAMILY_WIFI: + interface->if_subfamily = IFNET_SUBFAMILY_WIFI; + break; + case IFRTYPE_SUBFAMILY_QUICKRELAY: + interface->if_subfamily = IFNET_SUBFAMILY_QUICKRELAY; + break; + case IFRTYPE_SUBFAMILY_DEFAULT: + interface->if_subfamily = IFNET_SUBFAMILY_DEFAULT; + break; + default: + result = EINVAL; + break; + } + break; + } + default: result = EOPNOTSUPP; } @@ -3368,6 +3872,7 @@ static void ipsec_detached(ifnet_t interface) { struct ipsec_pcb *pcb = ifnet_softc(interface); + (void)ifnet_release(interface); ipsec_free_pcb(pcb, true); } @@ -3435,7 +3940,7 @@ ipsec_attach_proto(ifnet_t interface, result = ifnet_attach_protocol(interface, protocol, &proto); if (result != 0 && result != EEXIST) { - printf("ipsec_attach_inet - ifnet_attach_protocol %d failed: %d\n", + os_log_error(OS_LOG_DEFAULT, "ipsec_attach_inet - ifnet_attach_protocol %d failed: %d\n", protocol, result); } @@ -3450,28 +3955,45 @@ ipsec_inject_inbound_packet(ifnet_t interface, struct ipsec_pcb *pcb = ifnet_softc(interface); if (pcb->ipsec_use_netif) { + if (!ipsec_data_move_begin(pcb)) { + os_log_info(OS_LOG_DEFAULT, "%s: data path stopped for %s\n", __func__, + if_name(pcb->ipsec_ifp)); + return ENXIO; + } + lck_rw_lock_shared(&pcb->ipsec_pcb_lock); lck_mtx_lock(&pcb->ipsec_input_chain_lock); + + if (pcb->ipsec_input_chain_count > (u_int32_t)if_ipsec_max_pending_input) { + lck_mtx_unlock(&pcb->ipsec_input_chain_lock); + lck_rw_unlock_shared(&pcb->ipsec_pcb_lock); + ipsec_data_move_end(pcb); + return ENOSPC; + } + if (pcb->ipsec_input_chain != NULL) { pcb->ipsec_input_chain_last->m_nextpkt = packet; } else { pcb->ipsec_input_chain = packet; } + pcb->ipsec_input_chain_count++; while (packet->m_nextpkt) { VERIFY(packet != packet->m_nextpkt); packet = packet->m_nextpkt; + pcb->ipsec_input_chain_count++; } pcb->ipsec_input_chain_last = packet; lck_mtx_unlock(&pcb->ipsec_input_chain_lock); - kern_channel_ring_t rx_ring = pcb->ipsec_netif_rxring; + kern_channel_ring_t rx_ring = pcb->ipsec_netif_rxring[0]; lck_rw_unlock_shared(&pcb->ipsec_pcb_lock); if (rx_ring != NULL) { kern_channel_notify(rx_ring, 0); } + ipsec_data_move_end(pcb); return 0; } else #endif // IPSEC_NEXUS @@ -3551,3 +4073,63 @@ ipsec_set_ip6oa_for_interface(ifnet_t interface, struct ip6_out_args *ip6oa) ip6oa->ip6oa_sotc = SO_TC_VO; } } + +static boolean_t +ipsec_data_move_begin(struct ipsec_pcb *pcb) +{ + boolean_t ret = 0; + + lck_mtx_lock_spin(&pcb->ipsec_pcb_data_move_lock); + if ((ret = IPSEC_IS_DATA_PATH_READY(pcb))) { + pcb->ipsec_pcb_data_move++; + } + lck_mtx_unlock(&pcb->ipsec_pcb_data_move_lock); + + return ret; +} + +static void +ipsec_data_move_end(struct ipsec_pcb *pcb) +{ + lck_mtx_lock_spin(&pcb->ipsec_pcb_data_move_lock); + VERIFY(pcb->ipsec_pcb_data_move > 0); + /* + * if there's no more thread moving data, wakeup any + * drainers that's blocked waiting for this. + */ + if (--pcb->ipsec_pcb_data_move == 0 && pcb->ipsec_pcb_drainers > 0) { + wakeup(&(pcb->ipsec_pcb_data_move)); + } + lck_mtx_unlock(&pcb->ipsec_pcb_data_move_lock); +} + +static void +ipsec_data_move_drain(struct ipsec_pcb *pcb) +{ + lck_mtx_lock(&pcb->ipsec_pcb_data_move_lock); + /* data path must already be marked as not ready */ + VERIFY(!IPSEC_IS_DATA_PATH_READY(pcb)); + pcb->ipsec_pcb_drainers++; + while (pcb->ipsec_pcb_data_move != 0) { + (void)msleep(&(pcb->ipsec_pcb_data_move), &pcb->ipsec_pcb_data_move_lock, + (PZERO - 1), __func__, NULL); + } + VERIFY(!IPSEC_IS_DATA_PATH_READY(pcb)); + VERIFY(pcb->ipsec_pcb_drainers > 0); + pcb->ipsec_pcb_drainers--; + lck_mtx_unlock(&pcb->ipsec_pcb_data_move_lock); +} + +static void +ipsec_wait_data_move_drain(struct ipsec_pcb *pcb) +{ + /* + * Mark the data path as not usable. + */ + lck_mtx_lock(&pcb->ipsec_pcb_data_move_lock); + IPSEC_CLR_DATA_PATH_READY(pcb); + lck_mtx_unlock(&pcb->ipsec_pcb_data_move_lock); + + /* Wait until all threads in the data paths are done. */ + ipsec_data_move_drain(pcb); +} diff --git a/bsd/net/if_ipsec.h b/bsd/net/if_ipsec.h index 3c0fcbd2b..39e4f35d1 100644 --- a/bsd/net/if_ipsec.h +++ b/bsd/net/if_ipsec.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2012-2014 Apple Inc. All rights reserved. + * Copyright (c) 2012-2019 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -40,7 +40,6 @@ errno_t ipsec_register_control(void); /* Helpers */ int ipsec_interface_isvalid(ifnet_t interface); -boolean_t ipsec_interface_needs_netagent(ifnet_t interface); errno_t ipsec_inject_inbound_packet(ifnet_t interface, mbuf_t packet); @@ -61,23 +60,27 @@ void ipsec_set_ip6oa_for_interface(ifnet_t interface, struct ip6_out_args *ip6oa /* * Socket option names to manage ipsec */ -#define IPSEC_OPT_FLAGS 1 -#define IPSEC_OPT_IFNAME 2 -#define IPSEC_OPT_EXT_IFDATA_STATS 3 /* get|set (type int) */ +#define IPSEC_OPT_FLAGS 1 +#define IPSEC_OPT_IFNAME 2 +#define IPSEC_OPT_EXT_IFDATA_STATS 3 /* get|set (type int) */ #define IPSEC_OPT_INC_IFDATA_STATS_IN 4 /* set to increment stat counters (type struct ipsec_stats_param) */ #define IPSEC_OPT_INC_IFDATA_STATS_OUT 5 /* set to increment stat counters (type struct ipsec_stats_param) */ #define IPSEC_OPT_SET_DELEGATE_INTERFACE 6 /* set the delegate interface (char[]) */ #define IPSEC_OPT_OUTPUT_TRAFFIC_CLASS 7 /* set the traffic class for packets leaving the interface, see sys/socket.h */ -#define IPSEC_OPT_ENABLE_CHANNEL 8 /* enable a kernel pipe nexus that allows the owner to open a channel to act as a driver */ -#define IPSEC_OPT_GET_CHANNEL_UUID 9 /* get the uuid of the kernel pipe nexus instance */ -#define IPSEC_OPT_ENABLE_FLOWSWITCH 10 /* enable a flowswitch nexus that clients can use */ -#define IPSEC_OPT_INPUT_FRAG_SIZE 11 /* set the maximum size of input packets before fragmenting as a uint32_t */ - -#define IPSEC_OPT_ENABLE_NETIF 12 /* Must be set before connecting */ -#define IPSEC_OPT_SLOT_SIZE 13 /* Must be set before connecting */ -#define IPSEC_OPT_NETIF_RING_SIZE 14 /* Must be set before connecting */ -#define IPSEC_OPT_TX_FSW_RING_SIZE 15 /* Must be set before connecting */ -#define IPSEC_OPT_RX_FSW_RING_SIZE 16 /* Must be set before connecting */ +#define IPSEC_OPT_ENABLE_CHANNEL 8 /* enable a kernel pipe nexus that allows the owner to open a channel to act as a driver, + * Must be set before connecting */ +#define IPSEC_OPT_GET_CHANNEL_UUID 9 /* get the uuid of the kernel pipe nexus instance */ +#define IPSEC_OPT_ENABLE_FLOWSWITCH 10 /* enable a flowswitch nexus that clients can use */ +#define IPSEC_OPT_INPUT_FRAG_SIZE 11 /* set the maximum size of input packets before fragmenting as a uint32_t */ + +#define IPSEC_OPT_ENABLE_NETIF 12 /* Must be set before connecting */ +#define IPSEC_OPT_SLOT_SIZE 13 /* Must be set before connecting */ +#define IPSEC_OPT_NETIF_RING_SIZE 14 /* Must be set before connecting */ +#define IPSEC_OPT_TX_FSW_RING_SIZE 15 /* Must be set before connecting */ +#define IPSEC_OPT_RX_FSW_RING_SIZE 16 /* Must be set before connecting */ +#define IPSEC_OPT_CHANNEL_BIND_PID 17 /* Must be set before connecting */ +#define IPSEC_OPT_KPIPE_TX_RING_SIZE 18 /* Must be set before connecting */ +#define IPSEC_OPT_KPIPE_RX_RING_SIZE 19 /* Must be set before connecting */ /* * ipsec stats parameter structure diff --git a/bsd/net/if_low_power_mode.c b/bsd/net/if_low_power_mode.c index 3f9e3e89b..b93387b15 100644 --- a/bsd/net/if_low_power_mode.c +++ b/bsd/net/if_low_power_mode.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018 Apple Inc. All rights reserved. + * Copyright (c) 2018-2019 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -135,7 +135,7 @@ if_low_power_evhdlr_init(void) { eventhandler_lists_ctxt_init(&if_low_power_evhdlr_ctx); - (void) EVENTHANDLER_REGISTER(&if_low_power_evhdlr_ctx, + (void)EVENTHANDLER_REGISTER(&if_low_power_evhdlr_ctx, if_low_power_event, if_low_power_evhdlr_callback, eventhandler_entry_dummy_arg, diff --git a/bsd/net/if_media.h b/bsd/net/if_media.h index 55fd50f64..f1ee7273a 100644 --- a/bsd/net/if_media.h +++ b/bsd/net/if_media.h @@ -25,12 +25,12 @@ * * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ -/* $NetBSD: if_media.h,v 1.3 1997/03/26 01:19:27 thorpej Exp $ */ +/* $NetBSD: if_media.h,v 1.3 1997/03/26 01:19:27 thorpej Exp $ */ /* $FreeBSD: src/sys/net/if_media.h,v 1.9.2.1 2001/07/04 00:12:38 brooks Exp $ */ /* * Copyright (c) 1997 - * Jonathan Stone and Jason R. Thorpe. All rights reserved. + * Jonathan Stone and Jason R. Thorpe. All rights reserved. * * This software is derived from information provided by Matt Thomas. * @@ -44,8 +44,8 @@ * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: - * This product includes software developed by Jonathan Stone - * and Jason R. Thorpe for the NetBSD Project. + * This product includes software developed by Jonathan Stone + * and Jason R. Thorpe for the NetBSD Project. * 4. The names of the authors may not be used to endorse or promote products * derived from this software without specific prior written permission. * @@ -88,7 +88,7 @@ * Bits Use * ---- ------- * 0-4 Media variant - * 5-7 Media type + * 5-7 Media type * 8-15 Type specific options * 16-19 Extended media variant bits * 20-27 Shared (global) options @@ -298,12 +298,12 @@ /* * Masks */ -#define IFM_NMASK 0x000000e0 /* Network type */ +#define IFM_NMASK 0x000000e0 /* Network type */ #define IFM_TMASK (IFM_TMASK_COMPAT|IFM_TMASK_EXT) /* Media sub-type */ -#define IFM_IMASK 0xf0000000 /* Instance */ -#define IFM_ISHIFT 28 /* Instance shift */ -#define IFM_OMASK 0x0000ff00 /* Type specific options */ -#define IFM_GMASK 0x0ff00000 /* Global options */ +#define IFM_IMASK 0xf0000000 /* Instance */ +#define IFM_ISHIFT 28 /* Instance shift */ +#define IFM_OMASK 0x0000ff00 /* Type specific options */ +#define IFM_GMASK 0x0ff00000 /* Global options */ /* * Status bits @@ -315,12 +315,11 @@ /* * Macros to extract various bits of information from the media word. */ - #define IFM_TYPE(x) ((x) & IFM_NMASK) #define IFM_SUBTYPE(x) ((x) & IFM_TMASK) #define IFM_TYPE_OPTIONS(x) ((x) & IFM_OMASK) #define IFM_INST(x) (((x) & IFM_IMASK) >> IFM_ISHIFT) -#define IFM_OPTIONS(x) ((x) & (IFM_OMASK|IFM_GMASK)) +#define IFM_OPTIONS(x) ((x) & (IFM_OMASK|IFM_GMASK)) #define IFM_INST_MAX IFM_INST(IFM_IMASK) @@ -350,21 +349,21 @@ struct ifmedia_description { { 0, NULL }, \ } -#define IFM_SUBTYPE_ETHERNET_DESCRIPTIONS { \ - { IFM_10_T, "10baseT/UTP" }, \ - { IFM_10_2, "10base2/BNC" }, \ - { IFM_10_5, "10base5/AUI" }, \ - { IFM_100_TX, "100baseTX" }, \ - { IFM_100_FX, "100baseFX" }, \ - { IFM_100_T4, "100baseT4" }, \ - { IFM_100_VG, "100baseVG" }, \ - { IFM_100_T2, "100baseT2" }, \ - { IFM_10_STP, "10baseSTP" }, \ - { IFM_10_FL, "10baseFL" }, \ +#define IFM_SUBTYPE_ETHERNET_DESCRIPTIONS { \ + { IFM_10_T, "10baseT/UTP" }, \ + { IFM_10_2, "10base2/BNC" }, \ + { IFM_10_5, "10base5/AUI" }, \ + { IFM_100_TX, "100baseTX" }, \ + { IFM_100_FX, "100baseFX" }, \ + { IFM_100_T4, "100baseT4" }, \ + { IFM_100_VG, "100baseVG" }, \ + { IFM_100_T2, "100baseT2" }, \ + { IFM_10_STP, "10baseSTP" }, \ + { IFM_10_FL, "10baseFL" }, \ { IFM_1000_SX, "1000baseSX" }, \ - { IFM_1000_LX, "1000baseLX" }, \ - { IFM_1000_CX, "1000baseCX" }, \ - { IFM_1000_T, "1000baseT" }, \ + { IFM_1000_LX, "1000baseLX" }, \ + { IFM_1000_CX, "1000baseCX" }, \ + { IFM_1000_T, "1000baseT" }, \ { IFM_HPNA_1, "homePNA" }, \ { IFM_10G_LR, "10Gbase-LR" }, \ { IFM_10G_SR, "10Gbase-SR" }, \ @@ -377,11 +376,11 @@ struct ifmedia_description { { IFM_40G_CR4, "40Gbase-CR4" }, \ { IFM_40G_SR4, "40Gbase-SR4" }, \ { IFM_40G_LR4, "40Gbase-LR4" }, \ - { IFM_1000_KX, "1000Base-KX" }, \ + { IFM_1000_KX, "1000Base-KX" }, \ { IFM_OTHER, "Other" }, \ - { IFM_10G_KX4, "10GBase-KX4" }, \ - { IFM_10G_KR, "10GBase-KR" }, \ - { IFM_10G_CR1, "10GBase-CR1" }, \ + { IFM_10G_KX4, "10GBase-KX4" }, \ + { IFM_10G_KR, "10GBase-KR" }, \ + { IFM_10G_CR1, "10GBase-CR1" }, \ { IFM_20G_KR2, "20GBase-KR2" }, \ { IFM_2500_KX, "2500Base-KX" }, \ { IFM_2500_T, "2500Base-T" }, \ @@ -393,18 +392,18 @@ struct ifmedia_description { { IFM_40G_XLPPI, "40GBase-XLPPI" }, \ { IFM_1000_CX_SGMII, "1000Base-CX-SGMII" }, \ { IFM_40G_KR4, "40GBase-KR4" }, \ - { IFM_10G_ER, "10GBase-ER" }, \ + { IFM_10G_ER, "10GBase-ER" }, \ { IFM_100G_CR4, "100GBase-CR4" }, \ { IFM_100G_SR4, "100GBase-SR4" }, \ { IFM_100G_KR4, "100GBase-KR4" }, \ { IFM_100G_LR4, "100GBase-LR4" }, \ { IFM_56G_R4, "56GBase-R4" }, \ { IFM_100_T, "100BaseT" }, \ - { IFM_25G_CR, "25GBase-CR" }, \ - { IFM_25G_KR, "25GBase-KR" }, \ - { IFM_25G_SR, "25GBase-SR" }, \ - { IFM_50G_CR2, "50GBase-CR2" }, \ - { IFM_50G_KR2, "50GBase-KR2" }, \ + { IFM_25G_CR, "25GBase-CR" }, \ + { IFM_25G_KR, "25GBase-KR" }, \ + { IFM_25G_SR, "25GBase-SR" }, \ + { IFM_50G_CR2, "50GBase-CR2" }, \ + { IFM_50G_KR2, "50GBase-KR2" }, \ { IFM_25G_LR, "25GBase-LR" }, \ { IFM_10G_AOC, "10GBase-AOC" }, \ { IFM_25G_ACC, "25GBase-ACC" }, \ @@ -422,8 +421,8 @@ struct ifmedia_description { { IFM_40G_XLAUI, "40G-XLAUI" }, \ { IFM_40G_XLAUI_AC, "40G-XLAUI-AC" }, \ { IFM_40G_ER4, "40GBase-ER4" }, \ - { IFM_50G_SR2, "50GBase-SR2" }, \ - { IFM_50G_LR2, "50GBase-LR2" }, \ + { IFM_50G_SR2, "50GBase-SR2" }, \ + { IFM_50G_LR2, "50GBase-LR2" }, \ { IFM_50G_LAUI2_AC, "50G-LAUI2-AC" }, \ { IFM_50G_LAUI2, "50G-LAUI2" }, \ { IFM_50G_AUI2_AC, "50G-AUI2-AC" }, \ @@ -465,7 +464,7 @@ struct ifmedia_description { { IFM_400G_DR4, "400GBase-DR4" }, \ { IFM_400G_AUI8_AC, "400G-AUI8-AC" }, \ { IFM_400G_AUI8, "400G-AUI8" }, \ - { 0, NULL }, \ + { 0, NULL }, \ } #define IFM_SUBTYPE_ETHERNET_ALIASES { \ @@ -574,7 +573,7 @@ struct ifmedia_description { { IFM_FDX, "full-duplex" }, \ { IFM_HDX, "half-duplex" }, \ { IFM_FLOW, "flow-control" }, \ - { IFM_EEE, "energy-efficient-ethernet" }, \ + { IFM_EEE, "energy-efficient-ethernet" }, \ { IFM_FLAG0, "flag0" }, \ { IFM_FLAG1, "flag1" }, \ { IFM_FLAG2, "flag2" }, \ diff --git a/bsd/net/if_mib.c b/bsd/net/if_mib.c index 6c0f94ef8..93bd59ce7 100644 --- a/bsd/net/if_mib.c +++ b/bsd/net/if_mib.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2013 Apple Inc. All rights reserved. + * Copyright (c) 2000-2019 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -192,6 +192,7 @@ make_ifmibdata(struct ifnet *ifp, int *name, struct sysctl_req *req) if_copy_data_extended(ifp, &ifmd_supp->ifmd_data_extended); if_copy_packet_stats(ifp, &ifmd_supp->ifmd_packet_stats); if_copy_rxpoll_stats(ifp, &ifmd_supp->ifmd_rxpoll_stats); + if_copy_netif_stats(ifp, &ifmd_supp->ifmd_netif_stats); if (req->oldptr == USER_ADDR_NULL) { req->oldlen = sizeof(*ifmd_supp); diff --git a/bsd/net/if_mib.h b/bsd/net/if_mib.h index 0cec12310..b0c74483a 100644 --- a/bsd/net/if_mib.h +++ b/bsd/net/if_mib.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2011 Apple Inc. All rights reserved. + * Copyright (c) 2000-2019 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -77,6 +77,7 @@ struct ifmibdata_supplemental { struct if_data_extended ifmd_data_extended; struct if_packet_stats ifmd_packet_stats; struct if_rxpoll_stats ifmd_rxpoll_stats; + struct if_netif_stats ifmd_netif_stats; }; #endif /* PRIVATE */ diff --git a/bsd/net/if_types.h b/bsd/net/if_types.h index 2d95e9eeb..e4d3bcf5f 100644 --- a/bsd/net/if_types.h +++ b/bsd/net/if_types.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2012 Apple Inc. All rights reserved. + * Copyright (c) 2000-2017 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -128,12 +128,13 @@ #define IFT_PROPVIRTUAL 0x35 /* Proprietary Virtual/internal */ #define IFT_PROPMUX 0x36 /* Proprietary Multiplexing */ /* - * IFT_GIF, IFT_FAITH and IFT_FAITH are not based on IANA assignments. + * IFT_GIF, IFT_FAITH and IFT_6LOWPAN are not based on IANA assignments. * Note: IFT_STF has a defined ifType: 0xd7 (215), but we use 0x39. */ #define IFT_GIF 0x37 /*0xf0*/ #define IFT_FAITH 0x38 /*0xf2*/ #define IFT_STF 0x39 /*0xf3*/ +#define IFT_6LOWPAN 0x40 /* IETF RFC 6282 */ #define IFT_L2VLAN 0x87 /* Layer 2 Virtual LAN using 802.1Q */ #define IFT_IEEE8023ADLAG 0x88 /* IEEE802.3ad Link Aggregate */ diff --git a/bsd/net/if_utun.c b/bsd/net/if_utun.c index 416ef1537..d29785b8b 100644 --- a/bsd/net/if_utun.c +++ b/bsd/net/if_utun.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2008-2018 Apple Inc. All rights reserved. + * Copyright (c) 2008-2019 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -54,6 +54,7 @@ #include #include #include +#include #define UTUN_NEXUS 0 @@ -66,11 +67,11 @@ static uuid_t utun_nx_dom_prov; typedef struct utun_nx { uuid_t if_provider; uuid_t if_instance; - uuid_t ms_provider; - uuid_t ms_instance; - uuid_t ms_device; - uuid_t ms_host; - uuid_t ms_agent; + uuid_t fsw_provider; + uuid_t fsw_instance; + uuid_t fsw_device; + uuid_t fsw_host; + uuid_t fsw_agent; } *utun_nx_t; #endif // UTUN_NEXUS @@ -91,6 +92,7 @@ struct utun_pcb { decl_lck_rw_data(, utun_pcb_lock); struct mbuf * utun_input_chain; struct mbuf * utun_input_chain_last; + u_int32_t utun_input_chain_count; // Input chain lock protects the list of input mbufs // The input chain lock must be taken AFTER the PCB lock if both are held lck_mtx_t utun_input_chain_lock; @@ -102,6 +104,8 @@ struct utun_pcb { void * utun_kpipe_rxring; void * utun_kpipe_txring; kern_pbufpool_t utun_kpipe_pp; + u_int32_t utun_kpipe_tx_ring_size; + u_int32_t utun_kpipe_rx_ring_size; kern_nexus_t utun_netif_nexus; kern_pbufpool_t utun_netif_pp; @@ -113,6 +117,10 @@ struct utun_pcb { u_int32_t utun_netif_ring_size; u_int32_t utun_tx_fsw_ring_size; u_int32_t utun_rx_fsw_ring_size; + // Auto attach flowswitch when netif is enabled. When set to false, + // it allows userspace nexus controller to attach and own flowswitch. + bool utun_attach_fsw; + bool utun_netif_connected; bool utun_use_netif; bool utun_needs_netagent; #endif // UTUN_NEXUS @@ -167,12 +175,16 @@ static errno_t utun_pkt_input(struct utun_pcb *pcb, mbuf_t m); #define UTUN_IF_DEFAULT_BUF_SEG_SIZE skmem_usr_buf_seg_size #define UTUN_IF_HEADROOM_SIZE 32 -#define UTUN_IF_MIN_RING_SIZE 16 +#define UTUN_IF_MIN_RING_SIZE 8 #define UTUN_IF_MAX_RING_SIZE 1024 #define UTUN_IF_MIN_SLOT_SIZE 1024 #define UTUN_IF_MAX_SLOT_SIZE 4096 +#define UTUN_DEFAULT_MAX_PENDING_INPUT_COUNT 512 + +static int if_utun_max_pending_input = UTUN_DEFAULT_MAX_PENDING_INPUT_COUNT; + static int sysctl_if_utun_ring_size SYSCTL_HANDLER_ARGS; static int sysctl_if_utun_tx_fsw_ring_size SYSCTL_HANDLER_ARGS; static int sysctl_if_utun_rx_fsw_ring_size SYSCTL_HANDLER_ARGS; @@ -184,6 +196,7 @@ static int if_utun_rx_fsw_ring_size = UTUN_IF_DEFAULT_RX_FSW_RING_SIZE; SYSCTL_DECL(_net_utun); SYSCTL_NODE(_net, OID_AUTO, utun, CTLFLAG_RW | CTLFLAG_LOCKED, 0, "UTun"); +SYSCTL_INT(_net_utun, OID_AUTO, max_pending_input, CTLFLAG_LOCKED | CTLFLAG_RW, &if_utun_max_pending_input, 0, ""); SYSCTL_PROC(_net_utun, OID_AUTO, ring_size, CTLTYPE_INT | CTLFLAG_LOCKED | CTLFLAG_RW, &if_utun_ring_size, UTUN_IF_DEFAULT_RING_SIZE, &sysctl_if_utun_ring_size, "I", ""); SYSCTL_PROC(_net_utun, OID_AUTO, tx_fsw_ring_size, CTLTYPE_INT | CTLFLAG_LOCKED | CTLFLAG_RW, @@ -231,7 +244,6 @@ utun_kpipe_sync_rx(kern_nexus_provider_t nxprov, kern_nexus_t nexus, #define UTUN_HEADER_SIZE(_pcb) (sizeof(u_int32_t) + (((_pcb)->utun_flags & UTUN_FLAGS_ENABLE_PROC_UUID) ? sizeof(uuid_t) : 0)) static kern_ctl_ref utun_kctlref; -static u_int32_t utun_family; static lck_attr_t *utun_lck_attr; static lck_grp_attr_t *utun_lck_grp_attr; static lck_grp_t *utun_lck_grp; @@ -359,7 +371,7 @@ utun_netif_sync_tx(kern_nexus_provider_t nxprov, kern_nexus_t nexus, kern_channel_slot_t tx_pslot = NULL; kern_channel_slot_t tx_slot = kern_channel_get_next_slot(tx_ring, NULL, NULL); - STATS_INC(nifs, NETIF_STATS_TXSYNC); + STATS_INC(nifs, NETIF_STATS_TX_SYNC); if (tx_slot == NULL) { // Nothing to write, don't bother signalling @@ -426,7 +438,7 @@ utun_netif_sync_tx(kern_nexus_provider_t nxprov, kern_nexus_t nexus, break; } default: { - printf("utun_netif_sync_tx %s: unknown ip version %u vhl %u tx_offset %u len %u header_size %zu\n", + os_log_error(OS_LOG_DEFAULT, "utun_netif_sync_tx %s: unknown ip version %u vhl %u tx_offset %u len %u header_size %zu\n", pcb->utun_ifp->if_xname, ip_version, vhl, tx_offset, tx_length, UTUN_HEADER_SIZE(pcb)); break; @@ -452,24 +464,24 @@ utun_netif_sync_tx(kern_nexus_provider_t nxprov, kern_nexus_t nexus, if (error == 0) { error = utun_output(pcb->utun_ifp, data); if (error != 0) { - printf("utun_netif_sync_tx %s - utun_output error %d\n", pcb->utun_ifp->if_xname, error); + os_log_error(OS_LOG_DEFAULT, "utun_netif_sync_tx %s - utun_output error %d\n", pcb->utun_ifp->if_xname, error); } } else { - printf("utun_netif_sync_tx %s - mbuf_copyback(%zu) error %d\n", pcb->utun_ifp->if_xname, length, error); - STATS_INC(nifs, NETIF_STATS_NOMEM_MBUF); - STATS_INC(nifs, NETIF_STATS_DROPPED); + os_log_error(OS_LOG_DEFAULT, "utun_netif_sync_tx %s - mbuf_copyback(%zu) error %d\n", pcb->utun_ifp->if_xname, length, error); + STATS_INC(nifs, NETIF_STATS_DROP_NOMEM_MBUF); + STATS_INC(nifs, NETIF_STATS_DROP); mbuf_freem(data); data = NULL; } } else { - printf("utun_netif_sync_tx %s - mbuf_gethdr error %d\n", pcb->utun_ifp->if_xname, error); - STATS_INC(nifs, NETIF_STATS_NOMEM_MBUF); - STATS_INC(nifs, NETIF_STATS_DROPPED); + os_log_error(OS_LOG_DEFAULT, "utun_netif_sync_tx %s - mbuf_gethdr error %d\n", pcb->utun_ifp->if_xname, error); + STATS_INC(nifs, NETIF_STATS_DROP_NOMEM_MBUF); + STATS_INC(nifs, NETIF_STATS_DROP); } } else { - printf("utun_netif_sync_tx %s - 0 length packet\n", pcb->utun_ifp->if_xname); - STATS_INC(nifs, NETIF_STATS_NOMEM_MBUF); - STATS_INC(nifs, NETIF_STATS_DROPPED); + os_log_error(OS_LOG_DEFAULT, "utun_netif_sync_tx %s - 0 length packet\n", pcb->utun_ifp->if_xname); + STATS_INC(nifs, NETIF_STATS_DROP_NOMEM_MBUF); + STATS_INC(nifs, NETIF_STATS_DROP); } kern_pbufpool_free(tx_ring->ckr_pp, tx_ph); @@ -478,8 +490,8 @@ utun_netif_sync_tx(kern_nexus_provider_t nxprov, kern_nexus_t nexus, continue; } - STATS_INC(nifs, NETIF_STATS_TXPKTS); - STATS_INC(nifs, NETIF_STATS_TXCOPY_MBUF); + STATS_INC(nifs, NETIF_STATS_TX_PACKETS); + STATS_INC(nifs, NETIF_STATS_TX_COPY_MBUF); tx_ring_stats.kcrsi_slots_transferred++; tx_ring_stats.kcrsi_bytes_transferred += length; @@ -512,7 +524,7 @@ utun_netif_tx_doorbell(kern_nexus_provider_t nxprov, kern_nexus_t nexus, */ rc = kern_channel_tx_refill_canblock(ring, UINT32_MAX, UINT32_MAX, true, &more); if (rc != 0 && rc != EAGAIN && rc != EBUSY) { - printf("%s, tx refill failed %d\n", __func__, rc); + os_log_error(OS_LOG_DEFAULT, "%s, tx refill failed %d\n", __func__, rc); } (void) kr_enter(ring, TRUE); @@ -525,7 +537,7 @@ utun_netif_tx_doorbell(kern_nexus_provider_t nxprov, kern_nexus_t nexus, // No room left in tx ring, disable output for now errno_t error = ifnet_disable_output(pcb->utun_ifp); if (error != 0) { - printf("utun_netif_tx_doorbell: ifnet_disable_output returned error %d\n", error); + os_log_error(OS_LOG_DEFAULT, "utun_netif_tx_doorbell: ifnet_disable_output returned error %d\n", error); } } } @@ -564,7 +576,7 @@ utun_netif_sync_rx(kern_nexus_provider_t nxprov, kern_nexus_t nexus, // Reclaim user-released slots (void) kern_channel_reclaim(rx_ring); - STATS_INC(nifs, NETIF_STATS_RXSYNC); + STATS_INC(nifs, NETIF_STATS_RX_SYNC); uint32_t avail = kern_channel_available_slot_count(rx_ring); if (avail == 0) { @@ -591,13 +603,16 @@ utun_netif_sync_rx(kern_nexus_provider_t nxprov, kern_nexus_t nexus, kern_packet_t rx_ph = 0; errno_t error = kern_pbufpool_alloc_nosleep(rx_pp, 1, &rx_ph); if (__improbable(error != 0)) { - STATS_INC(nifs, NETIF_STATS_NOMEM_PKT); - STATS_INC(nifs, NETIF_STATS_DROPPED); + STATS_INC(nifs, NETIF_STATS_DROP_NOMEM_PKT); + STATS_INC(nifs, NETIF_STATS_DROP); lck_mtx_unlock(&pcb->utun_input_chain_lock); break; } // Advance waiting packets + if (pcb->utun_input_chain_count > 0) { + pcb->utun_input_chain_count--; + } pcb->utun_input_chain = data->m_nextpkt; data->m_nextpkt = NULL; if (pcb->utun_input_chain == NULL) { @@ -612,9 +627,9 @@ utun_netif_sync_rx(kern_nexus_provider_t nxprov, kern_nexus_t nexus, // mbuf is too small mbuf_freem(data); kern_pbufpool_free(rx_pp, rx_ph); - STATS_INC(nifs, NETIF_STATS_BADLEN); - STATS_INC(nifs, NETIF_STATS_DROPPED); - printf("utun_netif_sync_rx %s: legacy packet length too short for header %zu < %zu\n", + STATS_INC(nifs, NETIF_STATS_DROP_BADLEN); + STATS_INC(nifs, NETIF_STATS_DROP); + os_log_error(OS_LOG_DEFAULT, "utun_netif_sync_rx %s: legacy packet length too short for header %zu < %zu\n", pcb->utun_ifp->if_xname, length, header_offset); continue; } @@ -624,9 +639,9 @@ utun_netif_sync_rx(kern_nexus_provider_t nxprov, kern_nexus_t nexus, // Flush data mbuf_freem(data); kern_pbufpool_free(rx_pp, rx_ph); - STATS_INC(nifs, NETIF_STATS_BADLEN); - STATS_INC(nifs, NETIF_STATS_DROPPED); - printf("utun_netif_sync_rx %s: legacy packet length %zu > %u\n", + STATS_INC(nifs, NETIF_STATS_DROP_BADLEN); + STATS_INC(nifs, NETIF_STATS_DROP); + os_log_error(OS_LOG_DEFAULT, "utun_netif_sync_rx %s: legacy packet length %zu > %u\n", pcb->utun_ifp->if_xname, length, rx_pp->pp_buflet_size); continue; } @@ -648,17 +663,15 @@ utun_netif_sync_rx(kern_nexus_provider_t nxprov, kern_nexus_t nexus, VERIFY(error == 0); error = kern_buflet_set_data_length(rx_buf, length); VERIFY(error == 0); - error = kern_packet_set_link_header_offset(rx_ph, 0); - VERIFY(error == 0); - error = kern_packet_set_network_header_offset(rx_ph, 0); + error = kern_packet_set_headroom(rx_ph, 0); VERIFY(error == 0); error = kern_packet_finalize(rx_ph); VERIFY(error == 0); error = kern_channel_slot_attach_packet(rx_ring, rx_slot, rx_ph); VERIFY(error == 0); - STATS_INC(nifs, NETIF_STATS_RXPKTS); - STATS_INC(nifs, NETIF_STATS_RXCOPY_MBUF); + STATS_INC(nifs, NETIF_STATS_RX_PACKETS); + STATS_INC(nifs, NETIF_STATS_RX_COPY_MBUF); bpf_tap_packet_in(pcb->utun_ifp, DLT_RAW, rx_ph, NULL, 0); rx_ring_stats.kcrsi_slots_transferred++; @@ -717,8 +730,8 @@ utun_netif_sync_rx(kern_nexus_provider_t nxprov, kern_nexus_t nexus, */ errno_t error = kern_pbufpool_alloc_nosleep(rx_pp, 1, &rx_ph); if (__improbable(error != 0)) { - STATS_INC(nifs, NETIF_STATS_NOMEM_PKT); - STATS_INC(nifs, NETIF_STATS_DROPPED); + STATS_INC(nifs, NETIF_STATS_DROP_NOMEM_PKT); + STATS_INC(nifs, NETIF_STATS_DROP); break; } @@ -734,9 +747,9 @@ utun_netif_sync_rx(kern_nexus_provider_t nxprov, kern_nexus_t nexus, if (tx_length < header_offset) { // Packet is too small kern_pbufpool_free(rx_pp, rx_ph); - STATS_INC(nifs, NETIF_STATS_BADLEN); - STATS_INC(nifs, NETIF_STATS_DROPPED); - printf("utun_netif_sync_rx %s: packet length too short for header %u < %zu\n", + STATS_INC(nifs, NETIF_STATS_DROP_BADLEN); + STATS_INC(nifs, NETIF_STATS_DROP); + os_log_error(OS_LOG_DEFAULT, "utun_netif_sync_rx %s: packet length too short for header %u < %zu\n", pcb->utun_ifp->if_xname, tx_length, header_offset); continue; } @@ -762,17 +775,15 @@ utun_netif_sync_rx(kern_nexus_provider_t nxprov, kern_nexus_t nexus, VERIFY(error == 0); error = kern_buflet_set_data_length(rx_buf, length); VERIFY(error == 0); - error = kern_packet_set_link_header_offset(rx_ph, 0); - VERIFY(error == 0); - error = kern_packet_set_network_header_offset(rx_ph, 0); + error = kern_packet_set_headroom(rx_ph, 0); VERIFY(error == 0); error = kern_packet_finalize(rx_ph); VERIFY(error == 0); error = kern_channel_slot_attach_packet(rx_ring, rx_slot, rx_ph); VERIFY(error == 0); - STATS_INC(nifs, NETIF_STATS_RXPKTS); - STATS_INC(nifs, NETIF_STATS_RXCOPY_DIRECT); + STATS_INC(nifs, NETIF_STATS_RX_PACKETS); + STATS_INC(nifs, NETIF_STATS_RX_COPY_DIRECT); bpf_tap_packet_in(pcb->utun_ifp, DLT_RAW, rx_ph, NULL, 0); rx_ring_stats.kcrsi_slots_transferred++; @@ -839,7 +850,7 @@ utun_nexus_ifattach(struct utun_pcb *pcb, nexus_attr_t nxa = NULL; err = kern_nexus_attr_create(&nxa); if (err != 0) { - printf("%s: kern_nexus_attr_create failed: %d\n", + os_log_error(OS_LOG_DEFAULT, "%s: kern_nexus_attr_create failed: %d\n", __func__, err); goto failed; } @@ -859,16 +870,20 @@ utun_nexus_ifattach(struct utun_pcb *pcb, bzero(&pp_init, sizeof(pp_init)); pp_init.kbi_version = KERN_PBUFPOOL_CURRENT_VERSION; + pp_init.kbi_flags |= KBIF_VIRTUAL_DEVICE; pp_init.kbi_packets = pcb->utun_netif_ring_size * 2; pp_init.kbi_bufsize = pcb->utun_slot_size; pp_init.kbi_buf_seg_size = UTUN_IF_DEFAULT_BUF_SEG_SIZE; pp_init.kbi_max_frags = 1; (void) snprintf((char *)pp_init.kbi_name, sizeof(pp_init.kbi_name), "%s", provider_name); + pp_init.kbi_ctx = NULL; + pp_init.kbi_ctx_retain = NULL; + pp_init.kbi_ctx_release = NULL; - err = kern_pbufpool_create(&pp_init, &pp_init, &pcb->utun_netif_pp, NULL); + err = kern_pbufpool_create(&pp_init, &pcb->utun_netif_pp, NULL); if (err != 0) { - printf("%s pbufbool create failed, error %d\n", __func__, err); + os_log_error(OS_LOG_DEFAULT, "%s pbufbool create failed, error %d\n", __func__, err); goto failed; } @@ -880,7 +895,7 @@ utun_nexus_ifattach(struct utun_pcb *pcb, nxa, &pcb->utun_nx.if_provider); if (err != 0) { - printf("%s register provider failed, error %d\n", + os_log_error(OS_LOG_DEFAULT, "%s register provider failed, error %d\n", __func__, err); goto failed; } @@ -899,7 +914,7 @@ utun_nexus_ifattach(struct utun_pcb *pcb, &net_init, ifp); if (err != 0) { - printf("%s alloc_net_provider_instance failed, %d\n", + os_log_error(OS_LOG_DEFAULT, "%s alloc_net_provider_instance failed, %d\n", __func__, err); kern_nexus_controller_deregister_provider(controller, pcb->utun_nx.if_provider); @@ -928,7 +943,7 @@ utun_detach_provider_and_instance(uuid_t provider, uuid_t instance) err = kern_nexus_controller_free_provider_instance(controller, instance); if (err != 0) { - printf("%s free_provider_instance failed %d\n", + os_log_error(OS_LOG_DEFAULT, "%s free_provider_instance failed %d\n", __func__, err); } uuid_clear(instance); @@ -937,7 +952,7 @@ utun_detach_provider_and_instance(uuid_t provider, uuid_t instance) err = kern_nexus_controller_deregister_provider(controller, provider); if (err != 0) { - printf("%s deregister_provider %d\n", __func__, err); + os_log_error(OS_LOG_DEFAULT, "%s deregister_provider %d\n", __func__, err); } uuid_clear(provider); } @@ -951,30 +966,30 @@ utun_nexus_detach(struct utun_pcb *pcb) nexus_controller_t controller = kern_nexus_shared_controller(); errno_t err; - if (!uuid_is_null(nx->ms_host)) { + if (!uuid_is_null(nx->fsw_host)) { err = kern_nexus_ifdetach(controller, - nx->ms_instance, - nx->ms_host); + nx->fsw_instance, + nx->fsw_host); if (err != 0) { - printf("%s: kern_nexus_ifdetach ms host failed %d\n", + os_log_error(OS_LOG_DEFAULT, "%s: kern_nexus_ifdetach ms host failed %d\n", __func__, err); } } - if (!uuid_is_null(nx->ms_device)) { + if (!uuid_is_null(nx->fsw_device)) { err = kern_nexus_ifdetach(controller, - nx->ms_instance, - nx->ms_device); + nx->fsw_instance, + nx->fsw_device); if (err != 0) { - printf("%s: kern_nexus_ifdetach ms device failed %d\n", + os_log_error(OS_LOG_DEFAULT, "%s: kern_nexus_ifdetach ms device failed %d\n", __func__, err); } } utun_detach_provider_and_instance(nx->if_provider, nx->if_instance); - utun_detach_provider_and_instance(nx->ms_provider, - nx->ms_instance); + utun_detach_provider_and_instance(nx->fsw_provider, + nx->fsw_instance); if (pcb->utun_netif_pp != NULL) { kern_pbufpool_destroy(pcb->utun_netif_pp); @@ -985,7 +1000,7 @@ utun_nexus_detach(struct utun_pcb *pcb) static errno_t utun_create_fs_provider_and_instance(struct utun_pcb *pcb, - uint32_t subtype, const char *type_name, + const char *type_name, const char *ifname, uuid_t *provider, uuid_t *instance) { @@ -996,24 +1011,21 @@ utun_create_fs_provider_and_instance(struct utun_pcb *pcb, struct kern_nexus_init init; nexus_name_t provider_name; - err = kern_nexus_get_builtin_domain_provider(NEXUS_TYPE_FLOW_SWITCH, + err = kern_nexus_get_default_domain_provider(NEXUS_TYPE_FLOW_SWITCH, &dom_prov); if (err != 0) { - printf("%s can't get %s provider, error %d\n", + os_log_error(OS_LOG_DEFAULT, "%s can't get %s provider, error %d\n", __func__, type_name, err); goto failed; } err = kern_nexus_attr_create(&attr); if (err != 0) { - printf("%s: kern_nexus_attr_create failed: %d\n", + os_log_error(OS_LOG_DEFAULT, "%s: kern_nexus_attr_create failed: %d\n", __func__, err); goto failed; } - err = kern_nexus_attr_set(attr, NEXUS_ATTR_EXTENSIONS, subtype); - VERIFY(err == 0); - uint64_t slot_buffer_size = pcb->utun_slot_size; err = kern_nexus_attr_set(attr, NEXUS_ATTR_SLOT_BUF_SIZE, slot_buffer_size); VERIFY(err == 0); @@ -1038,7 +1050,7 @@ utun_create_fs_provider_and_instance(struct utun_pcb *pcb, kern_nexus_attr_destroy(attr); attr = NULL; if (err != 0) { - printf("%s register %s provider failed, error %d\n", + os_log_error(OS_LOG_DEFAULT, "%s register %s provider failed, error %d\n", __func__, type_name, err); goto failed; } @@ -1049,7 +1061,7 @@ utun_create_fs_provider_and_instance(struct utun_pcb *pcb, NULL, instance, &init); if (err != 0) { - printf("%s alloc_provider_instance %s failed, %d\n", + os_log_error(OS_LOG_DEFAULT, "%s alloc_provider_instance %s failed, %d\n", __func__, type_name, err); kern_nexus_controller_deregister_provider(controller, *provider); @@ -1060,62 +1072,56 @@ failed: } static errno_t -utun_multistack_attach(struct utun_pcb *pcb) +utun_flowswitch_attach(struct utun_pcb *pcb) { nexus_controller_t controller = kern_nexus_shared_controller(); errno_t err = 0; utun_nx_t nx = &pcb->utun_nx; - // Allocate multistack flowswitch + // Allocate flowswitch err = utun_create_fs_provider_and_instance(pcb, - NEXUS_EXTENSION_FSW_TYPE_MULTISTACK, - "multistack", + "flowswitch", pcb->utun_ifp->if_xname, - &nx->ms_provider, - &nx->ms_instance); + &nx->fsw_provider, + &nx->fsw_instance); if (err != 0) { - printf("%s: failed to create bridge provider and instance\n", + os_log_error(OS_LOG_DEFAULT, "%s: failed to create bridge provider and instance\n", __func__); goto failed; } - // Attach multistack to device port - err = kern_nexus_ifattach(controller, nx->ms_instance, + // Attach flowswitch to device port + err = kern_nexus_ifattach(controller, nx->fsw_instance, NULL, nx->if_instance, - FALSE, &nx->ms_device); + FALSE, &nx->fsw_device); if (err != 0) { - printf("%s kern_nexus_ifattach ms device %d\n", __func__, err); + os_log_error(OS_LOG_DEFAULT, "%s kern_nexus_ifattach ms device %d\n", __func__, err); goto failed; } - // Attach multistack to host port - err = kern_nexus_ifattach(controller, nx->ms_instance, + // Attach flowswitch to host port + err = kern_nexus_ifattach(controller, nx->fsw_instance, NULL, nx->if_instance, - TRUE, &nx->ms_host); + TRUE, &nx->fsw_host); if (err != 0) { - printf("%s kern_nexus_ifattach ms host %d\n", __func__, err); + os_log_error(OS_LOG_DEFAULT, "%s kern_nexus_ifattach ms host %d\n", __func__, err); goto failed; } // Extract the agent UUID and save for later - struct kern_nexus *multistack_nx = nx_find(nx->ms_instance, false); - if (multistack_nx != NULL) { - struct nx_flowswitch *flowswitch = NX_FSW_PRIVATE(multistack_nx); + struct kern_nexus *flowswitch_nx = nx_find(nx->fsw_instance, false); + if (flowswitch_nx != NULL) { + struct nx_flowswitch *flowswitch = NX_FSW_PRIVATE(flowswitch_nx); if (flowswitch != NULL) { FSW_RLOCK(flowswitch); - struct fsw_ms_context *ms_context = (struct fsw_ms_context *)flowswitch->fsw_ops_private; - if (ms_context != NULL) { - uuid_copy(nx->ms_agent, ms_context->mc_agent_uuid); - } else { - printf("utun_multistack_attach - fsw_ms_context is NULL\n"); - } + uuid_copy(nx->fsw_agent, flowswitch->fsw_agent_uuid); FSW_UNLOCK(flowswitch); } else { - printf("utun_multistack_attach - flowswitch is NULL\n"); + os_log_error(OS_LOG_DEFAULT, "utun_flowswitch_attach - flowswitch is NULL\n"); } - nx_release(multistack_nx); + nx_release(flowswitch_nx); } else { - printf("utun_multistack_attach - unable to find multistack nexus\n"); + os_log_error(OS_LOG_DEFAULT, "utun_flowswitch_attach - unable to find flowswitch nexus\n"); } return 0; @@ -1125,7 +1131,7 @@ failed: errno_t detach_error = 0; if ((detach_error = ifnet_detach(pcb->utun_ifp)) != 0) { - panic("utun_multistack_attach - ifnet_detach failed: %d\n", detach_error); + panic("utun_flowswitch_attach - ifnet_detach failed: %d\n", detach_error); /* NOT REACHED */ } @@ -1133,7 +1139,7 @@ failed: } static errno_t -utun_register_kernel_pipe_nexus(void) +utun_register_kernel_pipe_nexus(struct utun_pcb *pcb) { nexus_attr_t nxa = NULL; errno_t result; @@ -1146,16 +1152,16 @@ utun_register_kernel_pipe_nexus(void) result = kern_nexus_controller_create(&utun_ncd); if (result) { - printf("%s: kern_nexus_controller_create failed: %d\n", + os_log_error(OS_LOG_DEFAULT, "%s: kern_nexus_controller_create failed: %d\n", __FUNCTION__, result); goto done; } uuid_t dom_prov; - result = kern_nexus_get_builtin_domain_provider( + result = kern_nexus_get_default_domain_provider( NEXUS_TYPE_KERNEL_PIPE, &dom_prov); if (result) { - printf("%s: kern_nexus_get_builtin_domain_provider failed: %d\n", + os_log_error(OS_LOG_DEFAULT, "%s: kern_nexus_get_default_domain_provider failed: %d\n", __FUNCTION__, result); goto done; } @@ -1178,7 +1184,7 @@ utun_register_kernel_pipe_nexus(void) result = kern_nexus_attr_create(&nxa); if (result) { - printf("%s: kern_nexus_attr_create failed: %d\n", + os_log_error(OS_LOG_DEFAULT, "%s: kern_nexus_attr_create failed: %d\n", __FUNCTION__, result); goto done; } @@ -1188,9 +1194,15 @@ utun_register_kernel_pipe_nexus(void) VERIFY(result == 0); // Reset ring size for kernel pipe nexus to limit memory usage - uint64_t ring_size = if_utun_ring_size; + uint64_t ring_size = + pcb->utun_kpipe_tx_ring_size != 0 ? pcb->utun_kpipe_tx_ring_size : + if_utun_ring_size; result = kern_nexus_attr_set(nxa, NEXUS_ATTR_TX_SLOTS, ring_size); VERIFY(result == 0); + + ring_size = + pcb->utun_kpipe_rx_ring_size != 0 ? pcb->utun_kpipe_rx_ring_size : + if_utun_ring_size; result = kern_nexus_attr_set(nxa, NEXUS_ATTR_RX_SLOTS, ring_size); VERIFY(result == 0); @@ -1202,7 +1214,7 @@ utun_register_kernel_pipe_nexus(void) nxa, &utun_kpipe_uuid); if (result) { - printf("%s: kern_nexus_controller_register_provider failed: %d\n", + os_log_error(OS_LOG_DEFAULT, "%s: kern_nexus_controller_register_provider failed: %d\n", __FUNCTION__, result); goto done; } @@ -1290,7 +1302,7 @@ utun_enable_channel(struct utun_pcb *pcb, struct proc *proc) return result; } - result = utun_register_kernel_pipe_nexus(); + result = utun_register_kernel_pipe_nexus(pcb); if (result) { return result; } @@ -1315,6 +1327,7 @@ utun_enable_channel(struct utun_pcb *pcb, struct proc *proc) bzero(&pp_init, sizeof(pp_init)); pp_init.kbi_version = KERN_PBUFPOOL_CURRENT_VERSION; + pp_init.kbi_flags |= KBIF_VIRTUAL_DEVICE; pp_init.kbi_packets = pcb->utun_netif_ring_size * 2; pp_init.kbi_bufsize = pcb->utun_slot_size; pp_init.kbi_buf_seg_size = UTUN_IF_DEFAULT_BUF_SEG_SIZE; @@ -1322,11 +1335,14 @@ utun_enable_channel(struct utun_pcb *pcb, struct proc *proc) pp_init.kbi_flags |= KBIF_QUANTUM; (void) snprintf((char *)pp_init.kbi_name, sizeof(pp_init.kbi_name), "com.apple.kpipe.%s", pcb->utun_if_xname); + pp_init.kbi_ctx = NULL; + pp_init.kbi_ctx_retain = NULL; + pp_init.kbi_ctx_release = NULL; - result = kern_pbufpool_create(&pp_init, &pp_init, &pcb->utun_kpipe_pp, + result = kern_pbufpool_create(&pp_init, &pcb->utun_kpipe_pp, NULL); if (result != 0) { - printf("%s pbufbool create failed, error %d\n", __func__, result); + os_log_error(OS_LOG_DEFAULT, "%s pbufbool create failed, error %d\n", __func__, result); goto done; } @@ -1375,19 +1391,12 @@ utun_register_control(void) struct kern_ctl_reg kern_ctl; errno_t result = 0; - /* Find a unique value for our interface family */ - result = mbuf_tag_id_find(UTUN_CONTROL_NAME, &utun_family); - if (result != 0) { - printf("utun_register_control - mbuf_tag_id_find_internal failed: %d\n", result); - return result; - } - utun_pcb_size = sizeof(struct utun_pcb); utun_pcb_zone = zinit(utun_pcb_size, UTUN_PCB_ZONE_MAX * utun_pcb_size, 0, UTUN_PCB_ZONE_NAME); if (utun_pcb_zone == NULL) { - printf("utun_register_control - zinit(utun_pcb) failed"); + os_log_error(OS_LOG_DEFAULT, "utun_register_control - zinit(utun_pcb) failed"); return ENOMEM; } @@ -1413,26 +1422,26 @@ utun_register_control(void) result = ctl_register(&kern_ctl, &utun_kctlref); if (result != 0) { - printf("utun_register_control - ctl_register failed: %d\n", result); + os_log_error(OS_LOG_DEFAULT, "utun_register_control - ctl_register failed: %d\n", result); return result; } /* Register the protocol plumbers */ - if ((result = proto_register_plumber(PF_INET, utun_family, + if ((result = proto_register_plumber(PF_INET, IFNET_FAMILY_UTUN, utun_attach_proto, NULL)) != 0) { - printf("utun_register_control - proto_register_plumber(PF_INET, %d) failed: %d\n", - utun_family, result); + os_log_error(OS_LOG_DEFAULT, "utun_register_control - proto_register_plumber(PF_INET, IFNET_FAMILY_UTUN) failed: %d\n", + result); ctl_deregister(utun_kctlref); return result; } /* Register the protocol plumbers */ - if ((result = proto_register_plumber(PF_INET6, utun_family, + if ((result = proto_register_plumber(PF_INET6, IFNET_FAMILY_UTUN, utun_attach_proto, NULL)) != 0) { - proto_unregister_plumber(PF_INET, utun_family); + proto_unregister_plumber(PF_INET, IFNET_FAMILY_UTUN); ctl_deregister(utun_kctlref); - printf("utun_register_control - proto_register_plumber(PF_INET6, %d) failed: %d\n", - utun_family, result); + os_log_error(OS_LOG_DEFAULT, "utun_register_control - proto_register_plumber(PF_INET6, IFNET_FAMILY_UTUN) failed: %d\n", + result); return result; } @@ -1452,6 +1461,7 @@ utun_free_pcb(struct utun_pcb *pcb, bool in_list) { #ifdef UTUN_NEXUS mbuf_freem_list(pcb->utun_input_chain); + pcb->utun_input_chain_count = 0; lck_mtx_destroy(&pcb->utun_input_chain_lock, utun_lck_grp); #endif // UTUN_NEXUS lck_rw_destroy(&pcb->utun_pcb_lock, utun_lck_grp); @@ -1478,13 +1488,16 @@ utun_ctl_bind(kern_ctl_ref kctlref, #if UTUN_NEXUS pcb->utun_use_netif = false; + pcb->utun_attach_fsw = true; + pcb->utun_netif_connected = false; pcb->utun_slot_size = UTUN_IF_DEFAULT_SLOT_SIZE; - pcb->utun_netif_ring_size = UTUN_IF_DEFAULT_RING_SIZE; - pcb->utun_tx_fsw_ring_size = UTUN_IF_DEFAULT_TX_FSW_RING_SIZE; - pcb->utun_rx_fsw_ring_size = UTUN_IF_DEFAULT_RX_FSW_RING_SIZE; + pcb->utun_netif_ring_size = if_utun_ring_size; + pcb->utun_tx_fsw_ring_size = if_utun_tx_fsw_ring_size; + pcb->utun_rx_fsw_ring_size = if_utun_rx_fsw_ring_size; + pcb->utun_input_chain_count = 0; + lck_mtx_init(&pcb->utun_input_chain_lock, utun_lck_grp, utun_lck_attr); #endif // UTUN_NEXUS - lck_mtx_init(&pcb->utun_input_chain_lock, utun_lck_grp, utun_lck_attr); lck_rw_init(&pcb->utun_pcb_lock, utun_lck_grp, utun_lck_attr); return 0; @@ -1543,7 +1556,7 @@ utun_ctl_connect(kern_ctl_ref kctlref, snprintf(pcb->utun_if_xname, sizeof(pcb->utun_if_xname), "utun%d", pcb->utun_unit - 1); snprintf(pcb->utun_unique_name, sizeof(pcb->utun_unique_name), "utunid%d", pcb->utun_unique_id - 1); - printf("utun_ctl_connect: creating interface %s (id %s)\n", pcb->utun_if_xname, pcb->utun_unique_name); + os_log(OS_LOG_DEFAULT, "utun_ctl_connect: creating interface %s (id %s)\n", pcb->utun_if_xname, pcb->utun_unique_name); /* Create the interface */ bzero(&utun_init, sizeof(utun_init)); @@ -1565,8 +1578,7 @@ utun_ctl_connect(kern_ctl_ref kctlref, utun_init.unit = pcb->utun_unit - 1; utun_init.uniqueid = pcb->utun_unique_name; utun_init.uniqueid_len = strlen(pcb->utun_unique_name); - utun_init.family = utun_family; - utun_init.subfamily = IFNET_SUBFAMILY_UTUN; + utun_init.family = IFNET_FAMILY_UTUN; utun_init.type = IFT_OTHER; utun_init.demux = utun_demux; utun_init.add_proto = utun_add_proto; @@ -1579,17 +1591,19 @@ utun_ctl_connect(kern_ctl_ref kctlref, if (pcb->utun_use_netif) { result = utun_nexus_ifattach(pcb, &utun_init, &pcb->utun_ifp); if (result != 0) { - printf("utun_ctl_connect - utun_nexus_ifattach failed: %d\n", result); + os_log_error(OS_LOG_DEFAULT, "utun_ctl_connect - utun_nexus_ifattach failed: %d\n", result); utun_free_pcb(pcb, true); *unitinfo = NULL; return result; } - result = utun_multistack_attach(pcb); - if (result != 0) { - printf("utun_ctl_connect - utun_multistack_attach failed: %d\n", result); - *unitinfo = NULL; - return result; + if (pcb->utun_attach_fsw) { + result = utun_flowswitch_attach(pcb); + if (result != 0) { + os_log_error(OS_LOG_DEFAULT, "utun_ctl_connect - utun_flowswitch_attach failed: %d\n", result); + *unitinfo = NULL; + return result; + } } /* Attach to bpf */ @@ -1603,7 +1617,7 @@ utun_ctl_connect(kern_ctl_ref kctlref, */ result = ifnet_allocate_extended(&utun_init, &pcb->utun_ifp); if (result != 0) { - printf("utun_ctl_connect - ifnet_allocate failed: %d\n", result); + os_log_error(OS_LOG_DEFAULT, "utun_ctl_connect - ifnet_allocate failed: %d\n", result); utun_free_pcb(pcb, true); *unitinfo = NULL; return result; @@ -1626,7 +1640,7 @@ utun_ctl_connect(kern_ctl_ref kctlref, /* Attach the interface */ result = ifnet_attach(pcb->utun_ifp, NULL); if (result != 0) { - printf("utun_ctl_connect - ifnet_attach failed: %d\n", result); + os_log_error(OS_LOG_DEFAULT, "utun_ctl_connect - ifnet_attach failed: %d\n", result); /* Release reference now since attach failed */ ifnet_release(pcb->utun_ifp); utun_free_pcb(pcb, true); @@ -1690,11 +1704,11 @@ utun_remove_address(ifnet_t interface, ifnet_name(interface), ifnet_unit(interface)); result = ifaddr_address(address, &ifr.ifr_addr, sizeof(ifr.ifr_addr)); if (result != 0) { - printf("utun_remove_address - ifaddr_address failed: %d", result); + os_log_error(OS_LOG_DEFAULT, "utun_remove_address - ifaddr_address failed: %d", result); } else { result = sock_ioctl(pf_socket, SIOCDIFADDR, &ifr); if (result != 0) { - printf("utun_remove_address - SIOCDIFADDR failed: %d", result); + os_log_error(OS_LOG_DEFAULT, "utun_remove_address - SIOCDIFADDR failed: %d", result); } } } else if (protocol == PF_INET6) { @@ -1706,12 +1720,12 @@ utun_remove_address(ifnet_t interface, result = ifaddr_address(address, (struct sockaddr*)&ifr6.ifr_addr, sizeof(ifr6.ifr_addr)); if (result != 0) { - printf("utun_remove_address - ifaddr_address failed (v6): %d", + os_log_error(OS_LOG_DEFAULT, "utun_remove_address - ifaddr_address failed (v6): %d", result); } else { result = sock_ioctl(pf_socket, SIOCDIFADDR_IN6, &ifr6); if (result != 0) { - printf("utun_remove_address - SIOCDIFADDR_IN6 failed: %d", + os_log_error(OS_LOG_DEFAULT, "utun_remove_address - SIOCDIFADDR_IN6 failed: %d", result); } } @@ -1728,7 +1742,7 @@ utun_cleanup_family(ifnet_t interface, int i; if (protocol != PF_INET && protocol != PF_INET6) { - printf("utun_cleanup_family - invalid protocol family %d\n", protocol); + os_log_error(OS_LOG_DEFAULT, "utun_cleanup_family - invalid protocol family %d\n", protocol); return; } @@ -1736,7 +1750,7 @@ utun_cleanup_family(ifnet_t interface, result = sock_socket(protocol, SOCK_DGRAM, 0, NULL, NULL, &pf_socket); if (result != 0) { if (result != EAFNOSUPPORT) { - printf("utun_cleanup_family - failed to create %s socket: %d\n", + os_log_error(OS_LOG_DEFAULT, "utun_cleanup_family - failed to create %s socket: %d\n", protocol == PF_INET ? "IP" : "IPv6", result); } goto cleanup; @@ -1751,7 +1765,7 @@ utun_cleanup_family(ifnet_t interface, goto cleanup; } else if (result != EBUSY) { /* Uh, not really sure what happened here... */ - printf("utun_cleanup_family - utun_detach_ip failed: %d\n", result); + os_log_error(OS_LOG_DEFAULT, "utun_cleanup_family - utun_detach_ip failed: %d\n", result); goto cleanup; } @@ -1761,7 +1775,7 @@ utun_cleanup_family(ifnet_t interface, */ result = ifnet_get_address_list_family(interface, &addresses, protocol); if (result != 0) { - printf("fnet_get_address_list_family(%s%d, 0xblah, %s) - failed: %d\n", + os_log_error(OS_LOG_DEFAULT, "fnet_get_address_list_family(%s%d, 0xblah, %s) - failed: %d\n", ifnet_name(interface), ifnet_unit(interface), protocol == PF_INET ? "PF_INET" : "PF_INET6", result); goto cleanup; @@ -1778,7 +1792,7 @@ utun_cleanup_family(ifnet_t interface, */ result = utun_detach_ip(interface, protocol, pf_socket); if (result != 0 && result != ENXIO) { - printf("utun_cleanup_family - utun_detach_ip failed: %d\n", result); + os_log_error(OS_LOG_DEFAULT, "utun_cleanup_family - utun_detach_ip failed: %d\n", result); } cleanup: @@ -1806,7 +1820,7 @@ utun_ctl_disconnect(__unused kern_ctl_ref kctlref, #if UTUN_NEXUS // Tell the nexus to stop all rings - if (pcb->utun_netif_nexus != NULL) { + if (pcb->utun_netif_nexus != NULL && pcb->utun_netif_connected) { kern_nexus_stop(pcb->utun_netif_nexus); } #endif // UTUN_NEXUS @@ -1894,7 +1908,7 @@ utun_ctl_disconnect(__unused kern_ctl_ref kctlref, * ifnet_release(). */ if ((result = ifnet_detach(ifp)) != 0) { - printf("utun_ctl_disconnect - ifnet_detach failed: %d\n", result); + os_log_error(OS_LOG_DEFAULT, "utun_ctl_disconnect - ifnet_detach failed: %d\n", result); } } } else { @@ -1920,7 +1934,7 @@ utun_ctl_send(__unused kern_ctl_ref kctlref, if (m_pktlen(m) >= (int32_t)UTUN_HEADER_SIZE((struct utun_pcb *)unitinfo)) { *(protocol_family_t *)mbuf_data(m) = ntohl(*(protocol_family_t *)mbuf_data(m)); } else { - printf("%s - unexpected short mbuf pkt len %d\n", __func__, m_pktlen(m)); + os_log_error(OS_LOG_DEFAULT, "%s - unexpected short mbuf pkt len %d\n", __func__, m_pktlen(m)); } return utun_pkt_input((struct utun_pcb *)unitinfo, m); @@ -2081,24 +2095,45 @@ utun_ctl_setopt(__unused kern_ctl_ref kctlref, result = EINVAL; break; } - if (!if_is_netagent_enabled()) { + if (!if_is_fsw_transport_netagent_enabled()) { result = ENOTSUP; break; } - if (uuid_is_null(pcb->utun_nx.ms_agent)) { + if (uuid_is_null(pcb->utun_nx.fsw_agent)) { result = ENOENT; break; } + uint32_t flags = netagent_get_flags(pcb->utun_nx.fsw_agent); + if (*(int *)data) { - if_add_netagent(pcb->utun_ifp, pcb->utun_nx.ms_agent); pcb->utun_needs_netagent = true; + flags |= (NETAGENT_FLAG_NEXUS_PROVIDER | + NETAGENT_FLAG_NEXUS_LISTENER); + result = netagent_set_flags(pcb->utun_nx.fsw_agent, flags); } else { + flags &= ~(NETAGENT_FLAG_NEXUS_PROVIDER | + NETAGENT_FLAG_NEXUS_LISTENER); + result = netagent_set_flags(pcb->utun_nx.fsw_agent, flags); pcb->utun_needs_netagent = false; - if_delete_netagent(pcb->utun_ifp, pcb->utun_nx.ms_agent); } break; } + case UTUN_OPT_ATTACH_FLOWSWITCH: { + if (len != sizeof(int)) { + result = EMSGSIZE; + break; + } + if (pcb->utun_ifp != NULL) { + // Only can set before connecting + result = EINVAL; + break; + } + lck_rw_lock_exclusive(&pcb->utun_pcb_lock); + pcb->utun_attach_fsw = !!(*(int *)data); + lck_rw_unlock_exclusive(&pcb->utun_pcb_lock); + break; + } case UTUN_OPT_ENABLE_NETIF: { if (len != sizeof(int)) { result = EMSGSIZE; @@ -2186,6 +2221,42 @@ utun_ctl_setopt(__unused kern_ctl_ref kctlref, pcb->utun_rx_fsw_ring_size = ring_size; break; } + case UTUN_OPT_KPIPE_TX_RING_SIZE: { + if (len != sizeof(u_int32_t)) { + result = EMSGSIZE; + break; + } + if (pcb->utun_ifp != NULL) { + // Only can set before connecting + result = EINVAL; + break; + } + u_int32_t ring_size = *(u_int32_t *)data; + if (ring_size < UTUN_IF_MIN_RING_SIZE || + ring_size > UTUN_IF_MAX_RING_SIZE) { + return EINVAL; + } + pcb->utun_kpipe_tx_ring_size = ring_size; + break; + } + case UTUN_OPT_KPIPE_RX_RING_SIZE: { + if (len != sizeof(u_int32_t)) { + result = EMSGSIZE; + break; + } + if (pcb->utun_ifp != NULL) { + // Only can set before connecting + result = EINVAL; + break; + } + u_int32_t ring_size = *(u_int32_t *)data; + if (ring_size < UTUN_IF_MIN_RING_SIZE || + ring_size > UTUN_IF_MAX_RING_SIZE) { + return EINVAL; + } + pcb->utun_kpipe_rx_ring_size = ring_size; + break; + } #endif // UTUN_NEXUS default: { result = ENOPROTOOPT; @@ -2262,7 +2333,7 @@ utun_ctl_getopt(__unused kern_ctl_ref kctlref, if (*len != sizeof(int)) { result = EMSGSIZE; } else { - *(int *)data = if_check_netagent(pcb->utun_ifp, pcb->utun_nx.ms_agent); + *(int *)data = if_check_netagent(pcb->utun_ifp, pcb->utun_nx.fsw_agent); } break; } @@ -2322,6 +2393,22 @@ utun_ctl_getopt(__unused kern_ctl_ref kctlref, } break; } + case UTUN_OPT_KPIPE_TX_RING_SIZE: { + if (*len != sizeof(u_int32_t)) { + result = EMSGSIZE; + } else { + *(u_int32_t *)data = pcb->utun_kpipe_tx_ring_size; + } + break; + } + case UTUN_OPT_KPIPE_RX_RING_SIZE: { + if (*len != sizeof(u_int32_t)) { + result = EMSGSIZE; + } else { + *(u_int32_t *)data = pcb->utun_kpipe_rx_ring_size; + } + break; + } #endif // UTUN_NEXUS default: @@ -2346,7 +2433,7 @@ utun_ctl_rcvd(kern_ctl_ref kctlref, u_int32_t unit, void *unitinfo, int flags) u_int32_t utun_packet_cnt; errno_t error_pc = ctl_getenqueuepacketcount(kctlref, unit, &utun_packet_cnt); if (error_pc != 0) { - printf("utun_ctl_rcvd: ctl_getenqueuepacketcount returned error %d\n", error_pc); + os_log_error(OS_LOG_DEFAULT, "utun_ctl_rcvd: ctl_getenqueuepacketcount returned error %d\n", error_pc); utun_packet_cnt = 0; } @@ -2357,7 +2444,7 @@ utun_ctl_rcvd(kern_ctl_ref kctlref, u_int32_t unit, void *unitinfo, int flags) if (reenable_output) { errno_t error = ifnet_enable_output(pcb->utun_ifp); if (error != 0) { - printf("utun_ctl_rcvd: ifnet_enable_output returned error %d\n", error); + os_log_error(OS_LOG_DEFAULT, "utun_ctl_rcvd: ifnet_enable_output returned error %d\n", error); } } ifnet_lock_done(pcb->utun_ifp); @@ -2394,7 +2481,7 @@ utun_start(ifnet_t interface) u_int32_t utun_packet_cnt; errno_t error_pc = ctl_getenqueuepacketcount(pcb->utun_ctlref, pcb->utun_unit, &utun_packet_cnt); if (error_pc != 0) { - printf("utun_start: ctl_getenqueuepacketcount returned error %d\n", error_pc); + os_log_error(OS_LOG_DEFAULT, "utun_start: ctl_getenqueuepacketcount returned error %d\n", error_pc); utun_packet_cnt = 0; } @@ -2412,7 +2499,7 @@ utun_start(ifnet_t interface) if (!can_accept_packets) { errno_t error = ifnet_disable_output(interface); if (error != 0) { - printf("utun_start: ifnet_disable_output returned error %d\n", error); + os_log_error(OS_LOG_DEFAULT, "utun_start: ifnet_disable_output returned error %d\n", error); } ifnet_lock_done(pcb->utun_ifp); break; @@ -2466,7 +2553,7 @@ utun_output(ifnet_t interface, result = ctl_enqueuembuf(pcb->utun_ctlref, pcb->utun_unit, data, CTL_DATA_EOR); if (result != 0) { mbuf_freem(data); - printf("utun_output - ctl_enqueuembuf failed: %d\n", result); + os_log_error(OS_LOG_DEFAULT, "utun_output - ctl_enqueuembuf failed: %d\n", result); #if UTUN_NEXUS if (!pcb->utun_use_netif) #endif // UTUN_NEXUS @@ -2549,7 +2636,7 @@ utun_framer(ifnet_t interface, u_int32_t header_length = UTUN_HEADER_SIZE(pcb); if (mbuf_prepend(packet, header_length, MBUF_DONTWAIT) != 0) { - printf("utun_framer - ifnet_output prepend failed\n"); + os_log_error(OS_LOG_DEFAULT, "utun_framer - ifnet_output prepend failed\n"); ifnet_stat_increment_out(interface, 0, 0, 1); @@ -2704,7 +2791,7 @@ utun_attach_proto(ifnet_t interface, errno_t result = ifnet_attach_protocol(interface, protocol, &proto); if (result != 0 && result != EEXIST) { - printf("utun_attach_inet - ifnet_attach_protocol %d failed: %d\n", + os_log_error(OS_LOG_DEFAULT, "utun_attach_inet - ifnet_attach_protocol %d failed: %d\n", protocol, result); } @@ -2719,14 +2806,23 @@ utun_pkt_input(struct utun_pcb *pcb, mbuf_t packet) lck_rw_lock_shared(&pcb->utun_pcb_lock); lck_mtx_lock(&pcb->utun_input_chain_lock); + + if (pcb->utun_input_chain_count > (u_int32_t)if_utun_max_pending_input) { + lck_mtx_unlock(&pcb->utun_input_chain_lock); + lck_rw_unlock_shared(&pcb->utun_pcb_lock); + return ENOSPC; + } + if (pcb->utun_input_chain != NULL) { pcb->utun_input_chain_last->m_nextpkt = packet; } else { pcb->utun_input_chain = packet; } + pcb->utun_input_chain_count++; while (packet->m_nextpkt) { VERIFY(packet != packet->m_nextpkt); packet = packet->m_nextpkt; + pcb->utun_input_chain_count++; } pcb->utun_input_chain_last = packet; lck_mtx_unlock(&pcb->utun_input_chain_lock); @@ -2740,7 +2836,7 @@ utun_pkt_input(struct utun_pcb *pcb, mbuf_t packet) return 0; } else -#endif // IPSEC_NEXUS +#endif // UTUN_NEXUS { mbuf_pkthdr_setrcvif(packet, pcb->utun_ifp); @@ -2765,7 +2861,7 @@ utun_pkt_input(struct utun_pcb *pcb, mbuf_t packet) if (result != 0) { ifnet_stat_increment_in(pcb->utun_ifp, 0, 0, 1); - printf("%s - ifnet_input failed: %d\n", __FUNCTION__, result); + os_log_error(OS_LOG_DEFAULT, "%s - ifnet_input failed: %d\n", __FUNCTION__, result); mbuf_freem(packet); } @@ -2804,7 +2900,7 @@ utun_register_nexus(void) &dp_init, sizeof(dp_init), &utun_nx_dom_prov); if (err != 0) { - printf("%s: failed to register domain provider\n", __func__); + os_log_error(OS_LOG_DEFAULT, "%s: failed to register domain provider\n", __func__); return err; } return 0; @@ -2867,6 +2963,9 @@ utun_nexus_connected(kern_nexus_provider_t nxprov, kern_nexus_t nexus, #pragma unused(nxprov, channel) struct utun_pcb *pcb = kern_nexus_get_context(nexus); boolean_t ok = ifnet_is_attached(pcb->utun_ifp, 1); + if (pcb->utun_netif_nexus == nexus) { + pcb->utun_netif_connected = true; + } return ok ? 0 : ENXIO; } @@ -2891,7 +2990,11 @@ utun_nexus_disconnected(kern_nexus_provider_t nxprov, kern_nexus_t nexus, #pragma unused(nxprov, channel) struct utun_pcb *pcb = kern_nexus_get_context(nexus); if (pcb->utun_netif_nexus == nexus) { - pcb->utun_netif_nexus = NULL; + pcb->utun_netif_connected = false; + if (pcb->utun_attach_fsw) { + // disconnected by flowswitch that was attached by us + pcb->utun_netif_nexus = NULL; + } } ifnet_decr_iorefcnt(pcb->utun_ifp); } @@ -3111,7 +3214,7 @@ utun_kpipe_sync_rx(kern_nexus_provider_t nxprov, kern_nexus_t nexus, kern_packet_t rx_ph = 0; errno_t error = kern_pbufpool_alloc_nosleep(rx_pp, 1, &rx_ph); if (__improbable(error != 0)) { - printf("utun_kpipe_sync_rx %s: failed to allocate packet\n", + os_log_error(OS_LOG_DEFAULT, "utun_kpipe_sync_rx %s: failed to allocate packet\n", pcb->utun_ifp->if_xname); break; } @@ -3136,10 +3239,10 @@ utun_kpipe_sync_rx(kern_nexus_provider_t nxprov, kern_nexus_t nexus, (pcb->utun_flags & UTUN_FLAGS_NO_OUTPUT)) { /* flush data */ kern_pbufpool_free(rx_pp, rx_ph); - printf("utun_kpipe_sync_rx %s: invalid length %zu header_size %zu\n", + os_log_error(OS_LOG_DEFAULT, "utun_kpipe_sync_rx %s: invalid length %zu header_size %zu\n", pcb->utun_ifp->if_xname, length, UTUN_HEADER_SIZE(pcb)); - STATS_INC(nifs, NETIF_STATS_BADLEN); - STATS_INC(nifs, NETIF_STATS_DROPPED); + STATS_INC(nifs, NETIF_STATS_DROP_BADLEN); + STATS_INC(nifs, NETIF_STATS_DROP); continue; } @@ -3163,7 +3266,7 @@ utun_kpipe_sync_rx(kern_nexus_provider_t nxprov, kern_nexus_t nexus, break; } default: { - printf("utun_kpipe_sync_rx %s: unknown ip version %u vhl %u header_size %zu\n", + os_log_error(OS_LOG_DEFAULT, "utun_kpipe_sync_rx %s: unknown ip version %u vhl %u header_size %zu\n", pcb->utun_ifp->if_xname, ip_version, vhl, UTUN_HEADER_SIZE(pcb)); break; } @@ -3190,8 +3293,8 @@ utun_kpipe_sync_rx(kern_nexus_provider_t nxprov, kern_nexus_t nexus, error = kern_channel_slot_attach_packet(rx_ring, rx_slot, rx_ph); VERIFY(error == 0); - STATS_INC(nifs, NETIF_STATS_TXPKTS); - STATS_INC(nifs, NETIF_STATS_TXCOPY_DIRECT); + STATS_INC(nifs, NETIF_STATS_TX_PACKETS); + STATS_INC(nifs, NETIF_STATS_TX_COPY_DIRECT); rx_ring_stats.kcrsi_slots_transferred++; rx_ring_stats.kcrsi_bytes_transferred += length; @@ -3214,7 +3317,7 @@ utun_kpipe_sync_rx(kern_nexus_provider_t nxprov, kern_nexus_t nexus, /* just like utun_ctl_rcvd(), always reenable output */ errno_t error = ifnet_enable_output(pcb->utun_ifp); if (error != 0) { - printf("utun_kpipe_sync_rx: ifnet_enable_output returned error %d\n", error); + os_log_error(OS_LOG_DEFAULT, "utun_kpipe_sync_rx: ifnet_enable_output returned error %d\n", error); } // Unlock first, then exit ring @@ -3267,7 +3370,7 @@ utun_kpipe_sync_rx(kern_nexus_provider_t nxprov, kern_nexus_t nexus, kern_packet_t rx_ph = 0; errno_t error = kern_pbufpool_alloc_nosleep(rx_pp, 1, &rx_ph); if (__improbable(error != 0)) { - printf("utun_kpipe_sync_rx %s: failed to allocate packet\n", + os_log_error(OS_LOG_DEFAULT, "utun_kpipe_sync_rx %s: failed to allocate packet\n", pcb->utun_ifp->if_xname); break; } diff --git a/bsd/net/if_utun.h b/bsd/net/if_utun.h index 0a8f9f967..22dc69265 100644 --- a/bsd/net/if_utun.h +++ b/bsd/net/if_utun.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2008-2016 Apple Inc. All rights reserved. + * Copyright (c) 2008-2019 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -37,7 +37,6 @@ void* utun_alloc(size_t size); void utun_free(void *ptr); errno_t utun_register_control(void); -boolean_t utun_interface_needs_netagent(ifnet_t interface); #endif @@ -49,30 +48,33 @@ boolean_t utun_interface_needs_netagent(ifnet_t interface); /* * Socket option names to manage utun */ -#define UTUN_OPT_FLAGS 1 -#define UTUN_OPT_IFNAME 2 -#define UTUN_OPT_EXT_IFDATA_STATS 3 /* get|set (type int) */ +#define UTUN_OPT_FLAGS 1 +#define UTUN_OPT_IFNAME 2 +#define UTUN_OPT_EXT_IFDATA_STATS 3 /* get|set (type int) */ #define UTUN_OPT_INC_IFDATA_STATS_IN 4 /* set to increment stat counters (type struct utun_stats_param) */ #define UTUN_OPT_INC_IFDATA_STATS_OUT 5 /* set to increment stat counters (type struct utun_stats_param) */ #define UTUN_OPT_SET_DELEGATE_INTERFACE 15 /* set the delegate interface (char[]) */ #define UTUN_OPT_MAX_PENDING_PACKETS 16 /* the number of packets that can be waiting to be read - * from the control socket at a time */ + * from the control socket at a time */ #define UTUN_OPT_ENABLE_CHANNEL 17 #define UTUN_OPT_GET_CHANNEL_UUID 18 #define UTUN_OPT_ENABLE_FLOWSWITCH 19 -#define UTUN_OPT_ENABLE_NETIF 20 /* Must be set before connecting */ -#define UTUN_OPT_SLOT_SIZE 21 /* Must be set before connecting */ -#define UTUN_OPT_NETIF_RING_SIZE 22 /* Must be set before connecting */ -#define UTUN_OPT_TX_FSW_RING_SIZE 23 /* Must be set before connecting */ -#define UTUN_OPT_RX_FSW_RING_SIZE 24 /* Must be set before connecting */ +#define UTUN_OPT_ENABLE_NETIF 20 /* Must be set before connecting */ +#define UTUN_OPT_SLOT_SIZE 21 /* Must be set before connecting */ +#define UTUN_OPT_NETIF_RING_SIZE 22 /* Must be set before connecting */ +#define UTUN_OPT_TX_FSW_RING_SIZE 23 /* Must be set before connecting */ +#define UTUN_OPT_RX_FSW_RING_SIZE 24 /* Must be set before connecting */ +#define UTUN_OPT_KPIPE_TX_RING_SIZE 25 /* Must be set before connecting */ +#define UTUN_OPT_KPIPE_RX_RING_SIZE 26 /* Must be set before connecting */ +#define UTUN_OPT_ATTACH_FLOWSWITCH 27 /* Must be set before connecting */ /* * Flags for by UTUN_OPT_FLAGS */ #define UTUN_FLAGS_NO_OUTPUT 0x0001 -#define UTUN_FLAGS_NO_INPUT 0x0002 +#define UTUN_FLAGS_NO_INPUT 0x0002 #define UTUN_FLAGS_ENABLE_PROC_UUID 0x0004 /* diff --git a/bsd/net/if_var.h b/bsd/net/if_var.h index c980cd7e3..cd7010f73 100644 --- a/bsd/net/if_var.h +++ b/bsd/net/if_var.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2018 Apple Inc. All rights reserved. + * Copyright (c) 2000-2019 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -75,7 +75,7 @@ #ifdef PRIVATE #include #endif -#ifdef BSD_KERNEL_PRIVATE +#ifdef BSD_KERN_PRIVATE #include #endif @@ -98,6 +98,10 @@ #define APPLE_IF_FAM_STF 12 #define APPLE_IF_FAM_FIREWIRE 13 #define APPLE_IF_FAM_BOND 14 +#define APPLE_IF_FAM_CELLULAR 15 +#define APPLE_IF_FAM_6LOWPAN 16 +#define APPLE_IF_FAM_UTUN 17 +#define APPLE_IF_FAM_IPSEC 18 #endif /* __APPLE__ */ /* @@ -302,6 +306,37 @@ struct if_latencies { u_int64_t max_lt; /* maximum theoretical latency */ }; +#define IF_NETEM_PARAMS_PSCALE 100000 +struct if_netem_params { + /* bandwidth limit */ + uint64_t ifnetem_bandwidth_bps; + + /* latency (normal distribution with jitter as stdev) */ + uint32_t ifnetem_latency_ms; + uint32_t ifnetem_jitter_ms; + + /* + * NetEm probabilistic model parameters has a scaling factor of 100,000 + * for 5 digits precision. For instance, probability 12.345% is + * expressed as uint32_t fixed point 12345 in ifnet_*_p variable below. + */ + /* random packet corruption */ + uint32_t ifnetem_corruption_p; + + /* random packet duplication */ + uint32_t ifnetem_duplication_p; + + /* 4 state Markov loss model */ + uint32_t ifnetem_loss_p_gr_gl; /* P( gap_loss | gap_rx ) */ + uint32_t ifnetem_loss_p_gr_bl; /* P( burst_loss | gap_rx ) */ + uint32_t ifnetem_loss_p_bl_br; /* P( burst_rx | burst_loss ) */ + uint32_t ifnetem_loss_p_bl_gr; /* P( gap_rx | burst_loss ) */ + uint32_t ifnetem_loss_p_br_bl; /* P( burst_loss | burst_rx ) */ + + /* random packet reordering */ + uint32_t ifnetem_reordering_p; +}; + struct if_rxpoll_stats { u_int32_t ifi_poll_off_req; /* total # of POLL_OFF reqs */ u_int32_t ifi_poll_off_err; /* total # of POLL_OFF errors */ @@ -330,6 +365,23 @@ struct if_rxpoll_stats { u_int64_t ifi_poll_interval_time; /* poll interval (nsec) */ }; +struct if_netif_stats { + u_int64_t ifn_rx_mit_interval; /* rx mitigation ival (nsec) */ + u_int32_t ifn_rx_mit_mode; /* 0: static, 1: dynamic */ + u_int32_t ifn_rx_mit_packets_avg; /* average # of packets */ + u_int32_t ifn_rx_mit_packets_min; /* smallest # of packets */ + u_int32_t ifn_rx_mit_packets_max; /* largest # of packets */ + u_int32_t ifn_rx_mit_bytes_avg; /* average # of bytes */ + u_int32_t ifn_rx_mit_bytes_min; /* smallest # of bytes */ + u_int32_t ifn_rx_mit_bytes_max; /* largest # of bytes */ + u_int32_t ifn_rx_mit_cfg_idx; /* current config selector */ + u_int32_t ifn_rx_mit_cfg_packets_lowat; /* pkts low watermark */ + u_int32_t ifn_rx_mit_cfg_packets_hiwat; /* pkts high watermark */ + u_int32_t ifn_rx_mit_cfg_bytes_lowat; /* bytes low watermark */ + u_int32_t ifn_rx_mit_cfg_bytes_hiwat; /* bytes high watermark */ + u_int32_t ifn_rx_mit_cfg_interval; /* delay interval (nsec) */ +}; + struct if_tcp_ecn_perf_stat { u_int64_t total_txpkts; u_int64_t total_rxmitpkts; @@ -633,6 +685,77 @@ struct chain_len_stats { uint64_t cls_five_or_more; } __attribute__((__aligned__(sizeof(uint64_t)))); +/* + * This structure is used to define the parameters for advisory notifications + * on an interface. + */ +#pragma pack(push, 1) +struct ifnet_interface_advisory { + /* The current structure version */ + uint8_t version; +#define IF_INTERFACE_ADVISORY_VERSION_1 0x1 +#define IF_INTERFACE_ADVISORY_VERSION_CURRENT IF_INTERFACE_ADVISORY_VERSION_1 + /* Specifies if the advisory is for transmit or receive path */ + uint8_t direction; +#define IF_INTERFACE_ADVISORY_DIRECTION_TX 0x1 +#define IF_INTERFACE_ADVISORY_DIRECTION_RX 0x2 + /* reserved for future use */ + uint16_t _reserved; + /* + * suggestion for data rate change to keep the latency low. + * unit: bits per second (bps) + * NOTE: if the interface cannot provide suggestions in terms of bps, + * it should use the following values: + * INT32_MAX : ramp up + * INT32_MIN : ramp down + * 0 : neutral + */ +#define IF_INTERFACE_ADVISORY_RATE_SUGGESTION_RAMP_UP INT32_MAX +#define IF_INTERFACE_ADVISORY_RATE_SUGGESTION_RAMP_DOWN INT32_MIN +#define IF_INTERFACE_ADVISORY_RATE_SUGGESTION_RAMP_NEUTRAL 0 + int32_t rate_trend_suggestion; + /* + * Time of the issue of advisory. + * Timestamp should be in the host domain. + * unit: mach absolute time + */ + uint64_t timestamp; + /* + * Maximum theoretical bandwidth of the interface. + * unit: bits per second (bps) + */ + uint64_t max_bandwidth; + /* + * Total bytes sent or received on the interface. + * wrap around possible and the application should account for that. + * unit: byte + */ + uint64_t total_byte_count; + /* + * average throughput observed at the driver stack. + * unit: bits per second (bps) + */ + uint64_t average_throughput; + /* + * flushable queue size at the driver. + * should be set to UINT32_MAX if not available. + * unit: byte + */ + uint32_t flushable_queue_size; + /* + * non flushable queue size at the driver. + * should be set to UINT32_MAX if not available. + * unit: byte + */ + uint32_t non_flushable_queue_size; + /* + * average delay observed at the interface. + * unit: milliseconds (ms) + */ + uint32_t average_delay; +} __attribute__((aligned(sizeof(uint64_t)))); +#pragma pack(pop) + #endif /* PRIVATE */ #pragma pack() @@ -837,9 +960,13 @@ struct ifnet { TAILQ_ENTRY(ifnet) if_detaching_link; /* list of detaching ifnets */ TAILQ_ENTRY(ifnet) if_ordered_link; /* list of ordered ifnets */ - decl_lck_mtx_data(, if_ref_lock) + decl_lck_mtx_data(, if_ref_lock); u_int32_t if_refflags; /* see IFRF flags below */ u_int32_t if_refio; /* number of io ops to the underlying driver */ + u_int32_t if_threads_pending; /* Threads created but waiting for first run */ + u_int32_t if_datamov; /* number of threads moving data */ + u_int32_t if_drainers; /* number of draining threads */ + u_int32_t if_suspend; /* number of suspend requests */ #define if_list if_link struct ifaddrhead if_addrhead; /* linked list of addresses per if */ @@ -913,7 +1040,7 @@ struct ifnet { struct if_latencies if_output_lt; struct if_latencies if_input_lt; - decl_lck_mtx_data(, if_flt_lock) + decl_lck_mtx_data(, if_flt_lock); u_int32_t if_flt_busy; u_int32_t if_flt_waiters; struct ifnet_filter_head if_flt_head; @@ -924,12 +1051,64 @@ struct ifnet { decl_lck_mtx_data(, if_addrconfig_lock); /* for serializing addr config */ struct in_multi *if_allhostsinm; /* store all-hosts inm for this ifp */ + /* + * Opportunistic polling parameters. + */ decl_lck_mtx_data(, if_poll_lock); - u_int16_t if_poll_req; - u_int16_t if_poll_update; /* link update */ - u_int32_t if_poll_active; /* polling is active */ - struct timespec if_poll_cycle; /* poll interval */ - struct thread *if_poll_thread; + struct if_poll_params { + u_int16_t poll_req; + u_int16_t poll_update; /* link update */ + u_int32_t poll_flags; +#define IF_POLLF_READY 0x1 /* poll thread is ready */ +#define IF_POLLF_RUNNING 0x2 /* poll thread is running/active */ + struct timespec poll_cycle; /* poll interval */ + struct thread *poll_thread; + + ifnet_model_t poll_mode; /* current mode */ + struct pktcntr poll_tstats; /* incremental polling statistics */ + struct if_rxpoll_stats poll_pstats; /* polling statistics */ + struct pktcntr poll_sstats; /* packets and bytes per sampling */ + struct timespec poll_mode_holdtime; /* mode holdtime in nsec */ + struct timespec poll_mode_lasttime; /* last mode change time in nsec */ + struct timespec poll_sample_holdtime; /* sampling holdtime in nsec */ + struct timespec poll_sample_lasttime; /* last sampling time in nsec */ + struct timespec poll_dbg_lasttime; /* last debug message time in nsec */ + } rxpoll_params; +#define if_poll_req rxpoll_params.poll_req +#define if_poll_update rxpoll_params.poll_update +#define if_poll_flags rxpoll_params.poll_flags +#define if_poll_cycle rxpoll_params.poll_cycle +#define if_poll_thread rxpoll_params.poll_thread +#define if_poll_mode rxpoll_params.poll_mode +#define if_poll_tstats rxpoll_params.poll_tstats +#define if_poll_sstats rxpoll_params.poll_sstats +#define if_poll_pstats rxpoll_params.poll_pstats + +#define if_poll_mode_holdtime rxpoll_params.poll_mode_holdtime +#define if_poll_mode_lasttime rxpoll_params.poll_mode_lasttime +#define if_poll_sample_holdtime rxpoll_params.poll_sample_holdtime +#define if_poll_sample_lasttime rxpoll_params.poll_sample_lasttime +#define if_poll_dbg_lasttime rxpoll_params.poll_dbg_lasttime + +#define if_rxpoll_offreq rxpoll_params.poll_pstats.ifi_poll_off_req +#define if_rxpoll_offerr rxpoll_params.poll_pstats.ifi_poll_off_err +#define if_rxpoll_onreq rxpoll_params.poll_pstats.ifi_poll_on_req +#define if_rxpoll_onerr rxpoll_params.poll_pstats.ifi_poll_on_err +#define if_rxpoll_wavg rxpoll_params.poll_pstats.ifi_poll_wakeups_avg +#define if_rxpoll_wlowat rxpoll_params.poll_pstats.ifi_poll_wakeups_lowat +#define if_rxpoll_whiwat rxpoll_params.poll_pstats.ifi_poll_wakeups_hiwat +#define if_rxpoll_pavg rxpoll_params.poll_pstats.ifi_poll_packets_avg +#define if_rxpoll_pmin rxpoll_params.poll_pstats.ifi_poll_packets_min +#define if_rxpoll_pmax rxpoll_params.poll_pstats.ifi_poll_packets_max +#define if_rxpoll_plowat rxpoll_params.poll_pstats.ifi_poll_packets_lowat +#define if_rxpoll_phiwat rxpoll_params.poll_pstats.ifi_poll_packets_hiwat +#define if_rxpoll_bavg rxpoll_params.poll_pstats.ifi_poll_bytes_avg +#define if_rxpoll_bmin rxpoll_params.poll_pstats.ifi_poll_bytes_min +#define if_rxpoll_bmax rxpoll_params.poll_pstats.ifi_poll_bytes_max +#define if_rxpoll_blowat rxpoll_params.poll_pstats.ifi_poll_bytes_lowat +#define if_rxpoll_bhiwat rxpoll_params.poll_pstats.ifi_poll_bytes_hiwat +#define if_rxpoll_plim rxpoll_params.poll_pstats.ifi_poll_packets_limit +#define if_rxpoll_ival rxpoll_params.poll_pstats.ifi_poll_interval_time struct dlil_threading_info *if_inp; @@ -992,7 +1171,8 @@ struct ifnet { u_int32_t type; /* delegated i/f type */ u_int32_t family; /* delegated i/f family */ u_int32_t subfamily; /* delegated i/f sub-family */ - uint32_t expensive:1; /* delegated i/f expensive? */ + uint32_t expensive:1, /* delegated i/f expensive? */ + constrained:1; /* delegated i/f constrained? */ } if_delegated; uuid_t *if_agentids; /* network agents attached to interface */ @@ -1027,6 +1207,11 @@ struct ifnet { struct if_tcp_ecn_stat *if_ipv6_stat; struct if_lim_perf_stat if_lim_stat; + + uint32_t if_tcp_kao_max; + uint32_t if_tcp_kao_cnt; + + struct netem *if_output_netem; }; /* Interface event handling declarations */ @@ -1062,11 +1247,16 @@ EVENTHANDLER_DECLARE(ifnet_event, ifnet_event_fn); #define IFRF_EMBRYONIC 0x1 /* ifnet is allocated; awaiting attach */ #define IFRF_ATTACHED 0x2 /* ifnet attach is completely done */ #define IFRF_DETACHING 0x4 /* detach has been requested */ +#define IFRF_READY 0x8 /* data path is ready */ + #define IFRF_ATTACH_MASK \ (IFRF_EMBRYONIC|IFRF_ATTACHED|IFRF_DETACHING) #define IF_FULLY_ATTACHED(_ifp) \ (((_ifp)->if_refflags & IFRF_ATTACH_MASK) == IFRF_ATTACHED) + +#define IF_FULLY_ATTACHED_AND_READY(_ifp) \ + (IF_FULLY_ATTACHED(_ifp) && ((_ifp)->if_refflags & IFRF_READY)) /* * Valid values for if_start_flags */ @@ -1203,6 +1393,8 @@ struct ifaddr { (struct ifaddr *, int); void (*ifa_attached)(struct ifaddr *); /* callback fn for attaching */ void (*ifa_detached)(struct ifaddr *); /* callback fn for detaching */ + void *ifa_del_wc; /* Wait channel to avoid address deletion races */ + int ifa_del_waiters; /* Threads in wait to delete the address */ #if __arm__ && (__BIGGEST_ALIGNMENT__ > 4) /* For the newer ARMv7k ABI where 64-bit types are 64-bit aligned, but pointers * are 32-bit: @@ -1360,8 +1552,10 @@ struct ifmultiaddr { * IFNET_FAMILY_ETHERNET (as well as type to IFT_ETHER) which is too generic. */ #define IFNET_IS_WIFI(_ifp) \ - ((_ifp)->if_subfamily == IFNET_SUBFAMILY_WIFI || \ - (_ifp)->if_delegated.subfamily == IFNET_SUBFAMILY_WIFI) + (((_ifp)->if_family == IFNET_FAMILY_ETHERNET && \ + (_ifp)->if_subfamily == IFNET_SUBFAMILY_WIFI) || \ + ((_ifp)->if_delegated.family == IFNET_FAMILY_ETHERNET && \ + (_ifp)->if_delegated.subfamily == IFNET_SUBFAMILY_WIFI)) /* * Indicate whether or not the immediate interface, or the interface delegated @@ -1388,6 +1582,17 @@ struct ifmultiaddr { (_ifp)->if_subfamily == IFNET_SUBFAMILY_WIFI && \ !((_ifp)->if_eflags & IFEF_AWDL)) +/* + * Indicate whether or not the immediate interface is a companion link + * interface. + */ +#define IFNET_IS_COMPANION_LINK(_ifp) \ + ((_ifp)->if_family == IFNET_FAMILY_IPSEC && \ + ((_ifp)->if_subfamily == IFNET_SUBFAMILY_BLUETOOTH || \ + (_ifp)->if_subfamily == IFNET_SUBFAMILY_WIFI || \ + (_ifp)->if_subfamily == IFNET_SUBFAMILY_QUICKRELAY || \ + (_ifp)->if_subfamily == IFNET_SUBFAMILY_DEFAULT)) + /* * Indicate whether or not the immediate interface, or the interface delegated * by it, is marked as expensive. The delegated interface is set/cleared @@ -1397,7 +1602,7 @@ struct ifmultiaddr { * Note that this is meant to be used only for policy purposes. */ #define IFNET_IS_EXPENSIVE(_ifp) \ - ((_ifp)->if_eflags & IFEF_EXPENSIVE || \ + ((_ifp)->if_eflags & IFEF_EXPENSIVE || \ (_ifp)->if_delegated.expensive) #define IFNET_IS_LOW_POWER(_ifp) \ @@ -1406,6 +1611,10 @@ struct ifmultiaddr { ((_ifp)->if_delegated.ifp != NULL && \ ((_ifp)->if_delegated.ifp->if_xflags & IFXF_LOW_POWER))) +#define IFNET_IS_CONSTRAINED(_ifp) \ + ((_ifp)->if_xflags & IFXF_CONSTRAINED || \ + (_ifp)->if_delegated.constrained) + /* * We don't support AWDL interface delegation. */ @@ -1499,8 +1708,15 @@ __private_extern__ void ifnet_head_assert_exclusive(void); __private_extern__ errno_t ifnet_set_idle_flags_locked(ifnet_t, u_int32_t, u_int32_t); __private_extern__ int ifnet_is_attached(struct ifnet *, int refio); +__private_extern__ void ifnet_incr_pending_thread_count(struct ifnet *); +__private_extern__ void ifnet_decr_pending_thread_count(struct ifnet *); __private_extern__ void ifnet_incr_iorefcnt(struct ifnet *); __private_extern__ void ifnet_decr_iorefcnt(struct ifnet *); +__private_extern__ boolean_t ifnet_datamov_begin(struct ifnet *); +__private_extern__ void ifnet_datamov_end(struct ifnet *); +__private_extern__ void ifnet_datamov_suspend(struct ifnet *); +__private_extern__ void ifnet_datamov_drain(struct ifnet *); +__private_extern__ void ifnet_datamov_resume(struct ifnet *); __private_extern__ void ifnet_set_start_cycle(struct ifnet *, struct timespec *); __private_extern__ void ifnet_set_poll_cycle(struct ifnet *, @@ -1743,6 +1959,8 @@ __private_extern__ void if_copy_packet_stats(struct ifnet *ifp, struct if_packet_stats *if_ps); __private_extern__ void if_copy_rxpoll_stats(struct ifnet *ifp, struct if_rxpoll_stats *if_rs); +__private_extern__ void if_copy_netif_stats(struct ifnet *ifp, + struct if_netif_stats *if_ns); __private_extern__ struct rtentry *ifnet_cached_rtlookup_inet(struct ifnet *, struct in_addr); @@ -1801,6 +2019,7 @@ __private_extern__ u_int32_t ifnet_get_generation(struct ifnet *); /* Adding and deleting netagents will take ifnet lock */ __private_extern__ int if_add_netagent(struct ifnet *, uuid_t); +__private_extern__ int if_add_netagent_locked(struct ifnet *, uuid_t); __private_extern__ int if_delete_netagent(struct ifnet *, uuid_t); __private_extern__ boolean_t if_check_netagent(struct ifnet *, uuid_t); @@ -1811,6 +2030,7 @@ __private_extern__ void intf_event_enqueue_nwk_wq_entry(struct ifnet *ifp, struct sockaddr *addrp, uint32_t intf_event_code); __private_extern__ void ifnet_update_stats_per_flow(struct ifnet_stats_per_flow *, struct ifnet *); +__private_extern__ int if_get_tcp_kao_max(struct ifnet *); #if !CONFIG_EMBEDDED __private_extern__ errno_t ifnet_framer_stub(struct ifnet *, struct mbuf **, const struct sockaddr *, const char *, const char *, u_int32_t *, @@ -1820,6 +2040,8 @@ __private_extern__ void ifnet_enqueue_multi_setup(struct ifnet *, uint16_t, uint16_t); __private_extern__ errno_t ifnet_enqueue_mbuf(struct ifnet *, struct mbuf *, boolean_t, boolean_t *); +__private_extern__ int ifnet_enqueue_netem(void *handle, pktsched_pkt_t *pkts, + uint32_t n_pkts); extern int if_low_power_verbose; extern int if_low_power_restricted; diff --git a/bsd/net/if_vlan.c b/bsd/net/if_vlan.c index b4a922301..8509cca99 100644 --- a/bsd/net/if_vlan.c +++ b/bsd/net/if_vlan.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2003-2018 Apple Inc. All rights reserved. + * Copyright (c) 2003-2019 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -1053,7 +1053,7 @@ vlan_output(struct ifnet * ifp, struct mbuf * m) u_short tag; vlan_parent_ref vlp = NULL; int err; - struct flowadv adv = { FADV_SUCCESS }; + struct flowadv adv = { .code = FADV_SUCCESS }; if (m == 0) { return 0; @@ -1129,6 +1129,13 @@ vlan_output(struct ifnet * ifp, struct mbuf * m) evl->evl_proto = evl->evl_encap_proto; evl->evl_encap_proto = htons(ETHERTYPE_VLAN); evl->evl_tag = htons(tag); + + /* adjust partial checksum offload offsets */ + if ((m->m_pkthdr.csum_flags & (CSUM_DATA_VALID | + CSUM_PARTIAL)) == (CSUM_DATA_VALID | CSUM_PARTIAL)) { + m->m_pkthdr.csum_tx_start += ETHER_VLAN_ENCAP_LEN; + m->m_pkthdr.csum_tx_stuff += ETHER_VLAN_ENCAP_LEN; + } } err = dlil_output(p, PF_VLAN, m, NULL, NULL, 1, &adv); @@ -1176,6 +1183,7 @@ vlan_input(ifnet_t p, __unused protocol_family_t protocol, soft_vlan = 1; switch (ifnet_type(p)) { case IFT_ETHER: + case IFT_IEEE8023ADLAG: if (m->m_len < ETHER_VLAN_ENCAP_LEN) { m_freem(m); return 0; diff --git a/bsd/net/kpi_interface.c b/bsd/net/kpi_interface.c index 22281c271..4e849c3e5 100644 --- a/bsd/net/kpi_interface.c +++ b/bsd/net/kpi_interface.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2004-2018 Apple Inc. All rights reserved. + * Copyright (c) 2004-2019 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -206,7 +206,6 @@ ifnet_allocate_extended(const struct ifnet_init_eparams *einit0, (einit.flags & IFNET_INIT_INPUT_POLL)) { return EINVAL; } - einit.pre_enqueue = NULL; einit.start = NULL; einit.output_ctl = NULL; @@ -233,7 +232,6 @@ ifnet_allocate_extended(const struct ifnet_init_eparams *einit0, } } - /* Initialize external name (name + unit) */ (void) snprintf(if_xname, sizeof(if_xname), "%s%d", einit.name, einit.unit); @@ -417,6 +415,8 @@ ifnet_allocate_extended(const struct ifnet_init_eparams *einit0, } ifp->if_xflags = 0; + /* legacy interface */ + ifp->if_xflags |= IFXF_LEGACY; /* * output target queue delay is specified in millisecond @@ -635,6 +635,14 @@ ifnet_set_eflags(ifnet_t interface, u_int32_t new_flags, u_int32_t mask) ifnet_lock_done(interface); return EINVAL; } + /* + * Currently Interface advisory reporting is supported only for + * skywalk interface. + */ + if ((((new_flags & mask) & IFEF_ADV_REPORT) != 0) && + ((interface->if_eflags & IFEF_SKYWALK_NATIVE) == 0)) { + return EINVAL; + } oeflags = interface->if_eflags; interface->if_eflags = (new_flags & mask) | (interface->if_eflags & ~mask); @@ -2229,7 +2237,24 @@ ifnet_add_multicast(ifnet_t interface, const struct sockaddr *maddr, } /* Don't let users screw up protocols' entries. */ - if (maddr->sa_family != AF_UNSPEC && maddr->sa_family != AF_LINK) { + switch (maddr->sa_family) { + case AF_LINK: { + const struct sockaddr_dl *sdl = + (const struct sockaddr_dl *)(uintptr_t)maddr; + if (sdl->sdl_len < sizeof(struct sockaddr_dl) || + (sdl->sdl_nlen + sdl->sdl_alen + sdl->sdl_slen + + offsetof(struct sockaddr_dl, sdl_data) > sdl->sdl_len)) { + return EINVAL; + } + break; + } + case AF_UNSPEC: + if (maddr->sa_len < ETHER_ADDR_LEN + + offsetof(struct sockaddr, sa_data)) { + return EINVAL; + } + break; + default: return EINVAL; } @@ -2870,8 +2895,34 @@ ifnet_notice_node_presence(ifnet_t ifp, struct sockaddr *sa, int32_t rssi, return EINVAL; } - dlil_node_present(ifp, sa, rssi, lqm, npm, srvinfo); - return 0; + return dlil_node_present(ifp, sa, rssi, lqm, npm, srvinfo); +} + +errno_t +ifnet_notice_node_presence_v2(ifnet_t ifp, struct sockaddr *sa, struct sockaddr_dl *sdl, + int32_t rssi, int lqm, int npm, u_int8_t srvinfo[48]) +{ + /* Support older version if sdl is NULL */ + if (sdl == NULL) { + return ifnet_notice_node_presence(ifp, sa, rssi, lqm, npm, srvinfo); + } + + if (ifp == NULL || sa == NULL || srvinfo == NULL) { + return EINVAL; + } + if (sa->sa_len > sizeof(struct sockaddr_storage)) { + return EINVAL; + } + + if (sa->sa_family != AF_INET6) { + return EINVAL; + } + + if (sdl->sdl_family != AF_LINK) { + return EINVAL; + } + + return dlil_node_present_v2(ifp, sa, sdl, rssi, lqm, npm, srvinfo); } errno_t @@ -2970,6 +3021,8 @@ ifnet_set_delegate(ifnet_t ifp, ifnet_t delegated_ifp) ifp->if_delegated.subfamily = delegated_ifp->if_subfamily; ifp->if_delegated.expensive = delegated_ifp->if_eflags & IFEF_EXPENSIVE ? 1 : 0; + ifp->if_delegated.constrained = + delegated_ifp->if_xflags & IFXF_CONSTRAINED ? 1 : 0; /* * Propogate flags related to ECN from delegated interface @@ -3061,7 +3114,7 @@ ifnet_get_keepalive_offload_frames(ifnet_t ifp, bzero(frame, sizeof(struct ifnet_keepalive_offload_frame)); } - /* First collect IPSec related keep-alive frames */ + /* First collect IPsec related keep-alive frames */ *used_frames_count = key_fill_offload_frames_for_savs(ifp, frames_array, frames_array_count, frame_data_offset); @@ -3084,6 +3137,32 @@ ifnet_get_keepalive_offload_frames(ifnet_t ifp, return 0; } +errno_t +ifnet_notify_tcp_keepalive_offload_timeout(ifnet_t ifp, + struct ifnet_keepalive_offload_frame *frame) +{ + errno_t error = 0; + + if (ifp == NULL || frame == NULL) { + return EINVAL; + } + + if (frame->type != IFNET_KEEPALIVE_OFFLOAD_FRAME_TCP) { + return EINVAL; + } + if (frame->ether_type != IFNET_KEEPALIVE_OFFLOAD_FRAME_ETHERTYPE_IPV4 && + frame->ether_type != IFNET_KEEPALIVE_OFFLOAD_FRAME_ETHERTYPE_IPV6) { + return EINVAL; + } + if (frame->local_port == 0 || frame->remote_port == 0) { + return EINVAL; + } + + error = tcp_notify_kao_timeout(ifp, frame); + + return error; +} + errno_t ifnet_link_status_report(ifnet_t ifp, const void *buffer, size_t buffer_len) @@ -3161,7 +3240,7 @@ ifnet_link_status_report(ifnet_t ifp, const void *buffer, ifp->if_link_status->ifsr_len = ifsr->ifsr_len; if_cell_sr->valid_bitmask = 0; bcopy(new_cell_sr, if_cell_sr, sizeof(*if_cell_sr)); - } else if (ifp->if_subfamily == IFNET_SUBFAMILY_WIFI) { + } else if (IFNET_IS_WIFI(ifp)) { struct if_wifi_status_v1 *if_wifi_sr, *new_wifi_sr; /* Check version */ @@ -3252,7 +3331,7 @@ ifnet_get_fastlane_capable(ifnet_t interface, boolean_t *capable) if (interface == NULL || capable == NULL) { return EINVAL; } - if (interface->if_eflags & IFEF_QOSMARKING_CAPABLE) { + if (interface->if_qosmarking_mode == IFRTYPE_QOSMARKING_FASTLANE) { *capable = true; } else { *capable = false; @@ -3356,3 +3435,17 @@ ifnet_get_low_power_mode(ifnet_t ifp, boolean_t *on) return 0; } + +/*************************************************************************/ +/* Interface advisory notifications */ +/*************************************************************************/ +errno_t +ifnet_interface_advisory_report(ifnet_t ifp, + const struct ifnet_interface_advisory *advisory) +{ + +#pragma unused(ifp) +#pragma unused(advisory) + return ENOTSUP; + +} diff --git a/bsd/net/kpi_interface.h b/bsd/net/kpi_interface.h index 3d71fdeec..f131c5746 100644 --- a/bsd/net/kpi_interface.h +++ b/bsd/net/kpi_interface.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2004-2018 Apple Inc. All rights reserved. + * Copyright (c) 2004-2019 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -43,6 +43,7 @@ #ifdef KERNEL_PRIVATE struct if_interface_state; +struct ifnet_interface_advisory; #include #endif /* KERNEL_PRIVATE */ @@ -55,7 +56,7 @@ struct if_interface_state; #define KPI_INTERFACE_EMBEDDED 0 #endif #else -#if TARGET_OS_EMBEDDED +#if (TARGET_OS_IPHONE && !TARGET_OS_SIMULATOR) #define KPI_INTERFACE_EMBEDDED 1 #else #define KPI_INTERFACE_EMBEDDED 0 @@ -88,6 +89,9 @@ struct ifnet_demux_desc; * @constant IFNET_FAMILY_FIREWIRE An IEEE 1394 [Firewire] interface. * @constant IFNET_FAMILY_BOND A virtual bonded interface. * @constant IFNET_FAMILY_CELLULAR A cellular interface. + * @constant IFNET_FAMILY_6LOWPAN A 6LoWPAN interface. + * @constant IFNET_FAMILY_UTUN A utun interface. + * @constant IFNET_FAMILY_IPSEC An IPsec interface. */ enum { IFNET_FAMILY_ANY = 0, @@ -105,7 +109,10 @@ enum { IFNET_FAMILY_STF = 12, IFNET_FAMILY_FIREWIRE = 13, IFNET_FAMILY_BOND = 14, - IFNET_FAMILY_CELLULAR = 15 + IFNET_FAMILY_CELLULAR = 15, + IFNET_FAMILY_6LOWPAN = 16, + IFNET_FAMILY_UTUN = 17, + IFNET_FAMILY_IPSEC = 18 }; /*! @@ -131,8 +138,8 @@ enum { IFNET_SUBFAMILY_THUNDERBOLT = 4, IFNET_SUBFAMILY_RESERVED = 5, IFNET_SUBFAMILY_INTCOPROC = 6, - IFNET_SUBFAMILY_UTUN = 7, - IFNET_SUBFAMILY_IPSEC = 8, + IFNET_SUBFAMILY_QUICKRELAY = 7, + IFNET_SUBFAMILY_DEFAULT = 8, }; /* @@ -3326,12 +3333,36 @@ ifnet_notice_node_presence(ifnet_t ifp, struct sockaddr *sa, int32_t rssi, * system that the absence of the specified node has been detected. * @param ifp The interface attached to the link where the absence of the * specified node has been detected. - * @param sa The AF_LINK family address of the node whose absence has been - * detected. + * @param sa The AF_INET6 or AF_LINK family address of the node whose absence has been + * detected. If AF_LINK is specified, AF_INET6 address is derived from the + * AF_LINK address. * @result Returns 0 on success, or EINVAL if arguments are invalid. */ extern errno_t ifnet_notice_node_absence(ifnet_t ifp, struct sockaddr *sa); +/* + * @function ifnet_notice_node_presence_v2 + * @discussion Provided for network interface drivers to notify the + * system of a change detected in the presence of the specified + * node. + * @param ifp The interface attached to the link where the specified node + * is present. + * @param sa The AF_INET6 family address of the node whose presence is + * changing. + * @param sdl The AF_LINK family address of the node whose presence is + * changing. + * @param rssi The received signal strength indication as measured in + * dBm by a radio receiver. + * @param lqm A link quality metric associated with the specified node. + * @param npm A node proximity metric associated with the specified node. + * @param srvinfo A fixed-size array of octets containing opaque service + * information data used by the mDNS responder subsystem. + * @result Returns 0 on success, or EINVAL if arguments are invalid. + */ +extern errno_t +ifnet_notice_node_presence_v2(ifnet_t ifp, struct sockaddr *sa, struct sockaddr_dl *sdl, int32_t rssi, + int lqm, int npm, u_int8_t srvinfo[48]); + /* * @function ifnet_notice_master_elected * @discussion Provided for network interface drivers to notify the system @@ -3392,7 +3423,7 @@ ifnet_get_delegate(ifnet_t ifp, ifnet_t *pdelegated_ifp); * @struct ifnet_keepalive_offload_frame * @discussion This structure is used to define various opportunistic * polling parameters for an interface. - * For IPSec and AirPlay UDP keep alive only a subset of the + * For IPsec and AirPlay UDP keep alive only a subset of the * fields are relevant. * An incoming TCP keep alive probe has the sequence number * in the TCP header equal to "remote_seq" and the @@ -3415,6 +3446,7 @@ ifnet_get_delegate(ifnet_t ifp, ifnet_t *pdelegated_ifp); * @field keep_retry Interval before retrying if previous probe was not answered (TCP only) * @field reply_length The length of the frame in the reply_data field (TCP only) * @field addr_length Length in bytes of local_addr and remote_addr (TCP only) + * @field flags Flags (TCP only) * @field reply_data Keep alive reply to be sent to incoming probe (TCP only) * @field local_addr Local address: 4 bytes IPv4 or 16 bytes IPv6 address (TCP only) * @field remote_addr Remote address: 4 bytes IPv4 or 16 bytes IPv6 address (TCP only) @@ -3442,7 +3474,9 @@ struct ifnet_keepalive_offload_frame { u_int16_t keep_retry; /* interval before retrying if previous probe was not answered */ u_int8_t reply_length; /* Length of valid reply_data bytes including offset */ u_int8_t addr_length; /* Length of valid bytes in local_addr and remote_addr */ - u_int8_t reserved[2]; +#define IFNET_KEEPALIVE_OFFLOAD_FLAG_NOWAKEFROMSLEEP 0x01 + u_int8_t flags; + u_int8_t reserved[1]; u_int8_t reply_data[IFNET_KEEPALIVE_OFFLOAD_FRAME_DATA_SIZE]; /* Response packet */ u_int8_t local_addr[IFNET_KEEPALIVE_OFFLOAD_MAX_ADDR_SIZE]; /* in network byte order */ u_int8_t remote_addr[IFNET_KEEPALIVE_OFFLOAD_MAX_ADDR_SIZE]; /* in network byte order */ @@ -3457,13 +3491,13 @@ struct ifnet_keepalive_offload_frame { * @discussion Fills out frames_array with IP packets to send at * periodic intervals as Keep-alive or heartbeat messages. * This can be used to offload keep alives for UDP or TCP. - * Note: The frames are returned in this order: first the IPSec + * Note: The frames are returned in this order: first the IPsec * frames, then the AirPlay frames and finally the TCP frames. * If a device does not support one kind of keep alive frames_array * it should provide a frames_array large enough to accomodate * the other frames * @param ifp The interface to send frames out on. This is used to - * select which sockets or IPSec SAs should generate the + * select which sockets or IPsec SAs should generate the * packets. * @param frames_array An array of ifnet_keepalive_offload_frame * structs. This is allocated by the caller, and has @@ -3481,6 +3515,28 @@ extern errno_t ifnet_get_keepalive_offload_frames(ifnet_t ifp, u_int32_t frames_array_count, size_t frame_data_offset, u_int32_t *used_frames_count); + +/* + * @function ifnet_notify_tcp_keepalive_offload_timeout + * @discussion Used by an interface to notify a TCP connection whose + * keep alive was offloaded did experience a timeout. + * @param ifp The interface for which the TCP keep alive offload timed out + * @param frame The ifnet_keepalive_offload_frame structure that identifies + * the TCP connection that experienced the timeout. + * All the fields must be zeroed by the caller except for: + * - type: must be IFNET_KEEPALIVE_OFFLOAD_FRAME_TCP + * and for the fields identifying the 5-tup;e of the + * TCP connection: + * - ether_type + * - local_addr + * - remote_addr + * - local_port + * - remote_port + * @result Returns 0 on success, error number otherwise. + */ +extern errno_t ifnet_notify_tcp_keepalive_offload_timeout(ifnet_t ifp, + struct ifnet_keepalive_offload_frame *frame); + /*************************************************************************/ /* Link level notifications */ /*************************************************************************/ @@ -3594,6 +3650,21 @@ extern errno_t ifnet_touch_lastupdown(ifnet_t interface); */ extern errno_t ifnet_updown_delta(ifnet_t interface, struct timeval *updown_delta); +/*************************************************************************/ +/* Interface advisory notifications */ +/*************************************************************************/ +/*! + * @function ifnet_interface_advisory_report + * @discussion KPI to let the driver provide interface advisory + * notifications to the user space. + * @param ifp The interface that is generating the advisory report. + * @param advisory structure containing the advisory notification + * information. + * @result Returns 0 on success, error number otherwise. + */ +extern errno_t ifnet_interface_advisory_report(ifnet_t ifp, + const struct ifnet_interface_advisory *advisory); + #endif /* KERNEL_PRIVATE */ __END_DECLS diff --git a/bsd/net/linkaddr.c b/bsd/net/linkaddr.c new file mode 100644 index 000000000..d8cc855a4 --- /dev/null +++ b/bsd/net/linkaddr.c @@ -0,0 +1,104 @@ +/* + * Copyright (c) 2017 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ +/* + * Copyright (c) 2007, Swedish Institute of Computer Science. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the Institute nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE INSTITUTE AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE INSTITUTE OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * This file is part of the Contiki operating system. + * + */ + +/** + * \file + * Functions for manipulating Rime addresses + * \author + * Adam Dunkels + */ + +/** + * \addtogroup linkaddr + * @{ + */ + +#include "linkaddr.h" +#include + +linkaddr_t linkaddr_node_addr; +#if LINKADDR_SIZE == 2 +const linkaddr_t linkaddr_null = { { 0, 0 } }; +#else /*LINKADDR_SIZE == 2*/ +#if LINKADDR_SIZE == 8 +const linkaddr_t linkaddr_null = { { 0, 0, 0, 0, 0, 0, 0, 0 } }; +#endif /*LINKADDR_SIZE == 8*/ +#endif /*LINKADDR_SIZE == 2*/ + + +/*---------------------------------------------------------------------------*/ +void +linkaddr_copy(linkaddr_t *dest, const linkaddr_t *src) +{ + memcpy(dest, src, LINKADDR_SIZE); +} +/*---------------------------------------------------------------------------*/ +int +linkaddr_cmp(const linkaddr_t *addr1, const linkaddr_t *addr2) +{ + return memcmp(addr1, addr2, LINKADDR_SIZE) == 0; +} +/*---------------------------------------------------------------------------*/ +void +linkaddr_set_node_addr(linkaddr_t *t) +{ + linkaddr_copy(&linkaddr_node_addr, t); +} +/*---------------------------------------------------------------------------*/ +/** @} */ diff --git a/bsd/net/linkaddr.h b/bsd/net/linkaddr.h new file mode 100644 index 000000000..aed3471b2 --- /dev/null +++ b/bsd/net/linkaddr.h @@ -0,0 +1,167 @@ +/* + * Copyright (c) 2017 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ +/* + * Copyright (c) 2007, Swedish Institute of Computer Science. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the Institute nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE INSTITUTE AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE INSTITUTE OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * This file is part of the Contiki operating system. + * + */ + +/** + * \file + * Header file for the Rime address representation + * \author + * Adam Dunkels + */ + +/** + * \addtogroup rime + * @{ + */ + +/** + * \defgroup linkaddr Rime addresses + * @{ + * + * The linkaddr module is an abstract representation of addresses in + * Rime. + * + */ + +#ifndef LINKADDR_H_ +#define LINKADDR_H_ + +#include "contiki-conf.h" + +#include + +#ifdef LINKADDR_CONF_SIZE +#define LINKADDR_SIZE LINKADDR_CONF_SIZE +#else /* LINKADDR_SIZE */ +#define LINKADDR_SIZE 2 +#endif /* LINKADDR_SIZE */ + +typedef union { + unsigned char u8[LINKADDR_SIZE]; +#if LINKADDR_SIZE == 2 + uint16_t u16; +#endif /* LINKADDR_SIZE == 2 */ +} linkaddr_t; + +typedef union { + uint8_t u8[8]; + uint16_t u16[4]; +} linkaddr_extended_t; + +/** + * \brief Copy a Rime address + * \param dest The destination + * \param from The source + * + * This function copies a Rime address from one location + * to another. + * + */ +void linkaddr_copy(linkaddr_t *dest, const linkaddr_t *from); + +/** + * \brief Compare two Rime addresses + * \param addr1 The first address + * \param addr2 The second address + * \return Non-zero if the addresses are the same, zero if they are different + * + * This function compares two Rime addresses and returns + * the result of the comparison. The function acts like + * the '==' operator and returns non-zero if the addresses + * are the same, and zero if the addresses are different. + * + */ +int linkaddr_cmp(const linkaddr_t *addr1, const linkaddr_t *addr2); + + +/** + * \brief Set the address of the current node + * \param addr The address + * + * This function sets the Rime address of the node. + * + */ +void linkaddr_set_node_addr(linkaddr_t *addr); + +/** + * \brief The Rime address of the node + * + * This variable contains the Rime address of the + * node. This variable should not be changed directly; + * rather, the linkaddr_set_node_addr() function should be + * used. + * + */ +extern linkaddr_t linkaddr_node_addr; + +/** + * \brief The null Rime address + * + * This variable contains the null Rime address. The null + * address is used in route tables to indicate that the + * table entry is unused. Nodes with no configured address + * has the null address. Nodes with their node address set + * to the null address will have problems communicating + * with other nodes. + * + */ +extern const linkaddr_t linkaddr_null; + +#endif /* LINKADDR_H_ */ +/** @} */ +/** @} */ diff --git a/bsd/net/multi_layer_pkt_log.c b/bsd/net/multi_layer_pkt_log.c new file mode 100644 index 000000000..b6af63d42 --- /dev/null +++ b/bsd/net/multi_layer_pkt_log.c @@ -0,0 +1,95 @@ +/* + * Copyright (c) 2018-2019 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + + +#include +#include +#include + +SYSCTL_NODE(_net, OID_AUTO, mpklog, + CTLFLAG_RW | CTLFLAG_LOCKED, 0, "Multi-layer packet logging"); + +/* + * Note: net_mpklog_enabled allows to override the interface flags IFXF_MPK_LOG + */ +int net_mpklog_enabled = 1; +static int sysctl_net_mpklog_enabled SYSCTL_HANDLER_ARGS; +SYSCTL_PROC(_net_mpklog, OID_AUTO, enabled, CTLTYPE_INT | CTLFLAG_LOCKED | CTLFLAG_RW, + 0, 0, &sysctl_net_mpklog_enabled, "I", "Multi-layer packet logging enabled"); + +static int sysctl_net_mpklog_type SYSCTL_HANDLER_ARGS; +int net_mpklog_type = OS_LOG_TYPE_DEFAULT; +SYSCTL_PROC(_net_mpklog, OID_AUTO, type, CTLTYPE_INT | CTLFLAG_LOCKED | CTLFLAG_RW, + 0, 0, &sysctl_net_mpklog_type, "I", "Multi-layer packet logging type"); + +SYSCTL_INT(_net_mpklog, OID_AUTO, version, CTLFLAG_RD | CTLFLAG_LOCKED, + (int *)NULL, MPKL_VERSION, "Multi-layer packet logging version"); + +static int +sysctl_net_mpklog_enabled SYSCTL_HANDLER_ARGS +{ +#pragma unused(arg1, arg2) + int value = net_mpklog_enabled; + + int error = sysctl_handle_int(oidp, &value, 0, req); + if (error || !req->newptr) { + return error; + } + + net_mpklog_enabled = (value == 0) ? 0 : 1; + + os_log(OS_LOG_DEFAULT, "%s:%d set net_mpklog_enabled to %d", + proc_best_name(current_proc()), proc_selfpid(), net_mpklog_enabled); + + return 0; +} + +static int +sysctl_net_mpklog_type SYSCTL_HANDLER_ARGS +{ +#pragma unused(arg1, arg2) + int value = net_mpklog_type; + + int error = sysctl_handle_int(oidp, &value, 0, req); + if (error || !req->newptr) { + return error; + } + + if (value != OS_LOG_TYPE_DEFAULT && + value != OS_LOG_TYPE_INFO) { + return EINVAL; + } + + net_mpklog_type = value; + + os_log(OS_LOG_DEFAULT, "%s:%d set net_mpklog_type to %d (%s)", + proc_best_name(current_proc()), proc_selfpid(), net_mpklog_type, + net_mpklog_type == OS_LOG_TYPE_DEFAULT ? "default" : "info"); + + return 0; +} diff --git a/bsd/net/multi_layer_pkt_log.h b/bsd/net/multi_layer_pkt_log.h new file mode 100644 index 000000000..ef3a3eac1 --- /dev/null +++ b/bsd/net/multi_layer_pkt_log.h @@ -0,0 +1,463 @@ +/* + * Copyright (c) 2019 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#ifndef _NET_MULTI_LAYER_PKT_LOG_H_ +#define _NET_MULTI_LAYER_PKT_LOG_H_ + +#include + +/* + * Bump this version whenever the format of a log is modified + */ +#define MPKL_VERSION 1 + +/* + * Protocol ID, use to track inter-layer transitions and direction of data flow. + * Watch transport physical layer has lowest numeric value, increases to the highest layer in the system. + * Direction is to physical layer, or away from physical layer. + * + */ + +#define MPKL_PROTOCOL_PHYSICAL ((uint8_t)0) /* (OTA/serial-port/etc..) */ + +#define MPKL_PROTOCOL_BT ((uint8_t)20) +#define MPKL_PROTOCOL_WIFI ((uint8_t)30) +#define MPKL_PROTOCOL_CELLULAR ((uint8_t)40) +#define MPKL_PROTOCOL_TERMINUS ((uint8_t)60) +#define MPKL_PROTOCOL_IPSEC ((uint8_t)80) +#define MPKL_PROTOCOL_TCP ((uint8_t)100) +#define MPKL_PROTOCOL_IDS ((uint8_t)120) +#define MPKL_PROTOCOL_LIBNETCORE ((uint8_t)140) +#define MPKL_PROTOCOL_CFNETWORK ((uint8_t)160) +#define MPKL_PROTOCOL_REMOTE_CONNECTION ((uint8_t)200) + +#define MPKL_TOPMOST_LAYER ((uint8_t)255) /* Top-most layer */ + + +/*! + * @macro MPKL_CREATE_LOGOBJECT + * @discussion Creates a log object with input category name for the transportpacketlog subsystem + * + * @param Name string name of os_log_t category + * + * @return os_log_t object + * + */ +#define MPKL_CREATE_LOGOBJECT(Name) os_log_create("com.apple.magnetpacketlog", Name) + +/* + * Cross-layer association APIs + * + */ + +/*! + * @macro MPKL_UUID_UUID_ASSOCIATE_PREV + * @discussion Associate current layer's packet UUID to previous layer's packet UUID, data is flowing into the current layer + * + * @param LOGOBJECT os_log_t object to write data into + * @param CUR_PROTOCOL_ID uint8_t ID of current layer from MPKL_PROTOCOL_XXX defines above + * @param PREV_PROTOCOL_ID uint8_t ID of previous layer being associated + * @param CUR_UUID uuid_t Current layer 16-byte UUID of packet + * @param PREV_UUID uuid_t Previous layer 16-byte UUID of packet + * @param CUR_LEN uint16_t Current layer packet length + * @param LOG_SEQ uint8_t Incrementing sequence number to detect logging system drop of messages + */ + +#define MPKL_UUID_UUID_ASSOCIATE_PREV(LOGOBJECT, CUR_PROTOCOL_ID, PREV_PROTOCOL_ID, CUR_UUID, PREV_UUID, CUR_LEN, LOG_SEQN) \ +os_log(LOGOBJECT, "1 {curProtocol: %hhu, prevProtocol: %hhu, curUUID: %{public,uuid_t}.16P, prevUUID: %{public,uuid_t}.16P, curPktLen: %hu, logSeqn: %hhu}", CUR_PROTOCOL_ID, PREV_PROTOCOL_ID, CUR_UUID, PREV_UUID, CUR_LEN, LOG_SEQN) + +/*! + * @macro MPKL_UUID_UUID_ASSOCIATE_NEXT + * @discussion Associate current layer's packet UUID to next layer's packet UUID, data is flowing out of the current layer + * + * @param LOGOBJECT os_log_t object to write data into + * @param LABEL string optional layer-specific label for readability/debugability, this is ignored by the parser. Can not contain {} + * @param CUR_PROTOCOL_ID uint8_t ID of current layer from MPKL_PROTOCOL_XXX defines above + * @param NEXT_PROTOCOL_ID uint8_t ID of next layer being associated + * @param CUR_UUID uuid_t Current layer 16-byte UUID of packet + * @param NEXT_UUID uuid_t Next layer 16-byte UUID of packet + * @param CUR_LEN uint16_t Current layer packet length + * @param LOG_SEQ uint8_t Incrementing sequence number to detect logging system drop of messages + */ + +#define MPKL_UUID_UUID_ASSOCIATE_NEXT(LOGOBJECT, CUR_PROTOCOL_ID, NEXT_PROTOCOL_ID, CUR_UUID, NEXT_UUID, CUR_LEN, LOG_SEQN) \ +os_log(LOGOBJECT, "2 {curProtocol: %hhu, nextProtocol: %hhu, curUUID: %{public,uuid_t}.16P, nextUUID: %{public,uuid_t}.16P, curPktLen: %hu, logSeqn: %hhu}", CUR_PROTOCOL_ID, NEXT_PROTOCOL_ID, CUR_UUID, NEXT_UUID, CUR_LEN, LOG_SEQN) + +/*! + * @macro MPKL_SEQRANGE_UUID_ASSOCIATE + * @discussion Associate previous layer's byte sequence range (start/end) to current layer's packet UUID + * + * @param LOGOBJECT os_log_t object to write data into + * @param LABEL string optional layer-specific label for readability/debugability, this is ignored by the parser. Can not contain {} + * @param CUR_PROTOCOL_ID uint8_t ID of current layer from MPKL_PROTOCOL_XXX defines above + * @param PREV_PROTOCOL_ID uint8_t ID of other layer being associated + * @param PREV_RANGE_START uint32_t Previous layer byte sequence range start + * @param PREV_RANGE_END uint32_t Previous layer byte sequence range end + * @param CUR_UUID uuid_t Other layer 16-byte UUID of packet + * @param CUR_LEN uint16_t Current layer packet length + * @param LOG_SEQN uint8_t Incrementing sequence number to detect logging system drop of messages + */ + +#define MPKL_SEQRANGE_UUID_ASSOCIATE(LOGOBJECT, CUR_PROTOCOL_ID, PREV_PROTOCOL_ID, PREV_RANGE_START, PREV_RANGE_END, CUR_UUID, CUR_LEN, LOG_SEQN) \ +os_log(LOGOBJECT, "3 {curProtocol: %hhu, prevProtocol: %hhu, prevStart: %u, prevEnd: %u, curUUID: %{public,uuid_t}.16P, curPktLen: %hu, logSeqn: %hhu}", CUR_PROTOCOL_ID, PREV_PROTOCOL_ID, PREV_RANGE_START, PREV_RANGE_END, CUR_UUID, CUR_LEN, LOG_SEQN) + +/*! + * @macro MPKL_UUID_SEQRANGE_ASSOCIATE + * @discussion Associate previous layer's packet UUID to current layer's byte sequence range (start/end) + * + * @param LOGOBJECT os_log_t object to write data into + * @param LABEL string optional layer-specific label for readability/debugability, this is ignored by the parser. Can not contain {} + * @param CUR_PROTOCOL_ID uint8_t ID of current layer from MPKL_PROTOCOL_XXX defines above + * @param PREV_PROTOCOL_ID uint8_t ID of other layer being associated + * @param PREV_UUID uuid_t Previous layer 16-byte UUID of packet + * @param CUR_RANGE_START uint16_t Current layer byte sequence range start + * @param CUR_RANGE_END uint16_t Current layer byte sequence range end + * @param PREV_LEN uint16_t PRevious layer message length + * @param LOG_SEQN uint8_t Incrementing sequence number to detect logging system drop of messages + */ + +#define MPKL_UUID_SEQRANGE_ASSOCIATE(LOGOBJECT, CUR_PROTOCOL_ID, PREV_PROTOCOL_ID, PREV_UUID, CUR_RANGE_START, CUR_RANGE_END, PREV_LEN, LOG_SEQN) \ +os_log(LOGOBJECT, "4 {curProtocol: %hhu, prevProtocol: %hhu, prevUUID: %{public,uuid_t}.16P, curStart: %u, curEnd: %u, prevPktLen: %hu, logSeqn: %hhu}", CUR_PROTOCOL_ID, PREV_PROTOCOL_ID, PREV_UUID, CUR_RANGE_START, CUR_RANGE_END, PREV_LEN, LOG_SEQN) + + +/*! + * @macro MPKL_BUNDLEID_UUID_ASSOCIATE + * @discussion Associate previous layer's packet BUNDLEID to current layer's UUID + * + * @param LOGOBJECT os_log_t object to write data into + * @param LABEL string optional layer-specific label for readability/debugability, this is ignored by the parser. Can not contain {} + * @param CUR_PROTOCOL_ID uint8_t ID of current layer from MPKL_PROTOCOL_XXX defines above + * @param PREV_PROTOCOL_ID uint8_t ID of other layer being associated + * @param PREV_BUNDLE_ID NSString BundleID of previous layer + * @param CUR_UUID uuid_t Current layer 16-byte UUID of packet + * @param CUR_LEN uint32_t Current layer packet length + * @param LOG_SEQ uint8_t Incrementing sequence number to detect logging system drop of messages + */ + +#define MPKL_BUNDLEID_UUID_ASSOCIATE(LOGOBJECT, CUR_PROTOCOL_ID, PREV_PROTOCOL_ID, PREV_BUNDLE_ID, CUR_UUID, CUR_LEN, LOG_SEQN) \ +os_log(LOGOBJECT, "5 {curProtocol: %hhu, prevProtocol: %hhu, prevBundleID: %@, curUUID: %{public,uuid_t}.16P, curPktLen: %u, logSeqn: %hhu}", CUR_PROTOCOL_ID, PREV_PROTOCOL_ID, PREV_BUNDLE_ID, CUR_UUID, CUR_LEN, LOG_SEQN) + + +/*! + * @macro MPKL_SEQRANGE_UUID_ASSOCIATE_W_BUNDLEID + * @discussion Associate previous layer's packet byte sequence range to to current layer's UUID and client's bundle id + * + * @param LOGOBJECT os_log_t object to write data into + * @param LABEL string optional layer-specific label for readability/debugability, this is ignored by the parser. Can not contain {} + * @param CUR_PROTOCOL_ID uint8_t ID of current layer from MPKL_PROTOCOL_XXX defines above + * @param PREV_PROTOCOL_ID uint8_t ID of other layer being associated + * @param PREV_RANGE_START uint32_t Previous layer byte sequence range start + * @param PREV_RANGE_END uint32_t Previous layer byte sequence range end + * @param CUR_UUID uuid_t Current layer 16-byte UUID of packet + * @param PREV_BUNDLE_ID NSString BundleID of previous layer + * @param CUR_LEN uint16_t Current layer packet length + * @param LOG_SEQ uint8_t Incrementing sequence number to detect logging system drop of messages + */ + +#define MPKL_SEQRANGE_UUID_ASSOCIATE_W_BUNDLEID(LOGOBJECT, CUR_PROTOCOL_ID, PREV_PROTOCOL_ID, PREV_RANGE_START, PREV_RANGE_END, CUR_UUID, CLIENT_BUNDLE_ID, CUR_LEN, LOG_SEQN) \ +os_log(LOGOBJECT, "6 {curProtocol: %hhu, prevProtocol: %hhu, prevStart: %u, prevEnd: %u, curUUID: %{public,uuid_t}.16P, curBundleID: %@, curPktLen: %hu, logSeqn: %hhu}", CUR_PROTOCOL_ID, PREV_PROTOCOL_ID, PREV_RANGE_START, PREV_RANGE_END, CUR_UUID, CLIENT_BUNDLE_ID, CUR_LEN, LOG_SEQN) + + +/*! + * @macro MPKL_SEQN_UUID_ASSOCIATE_PREV + * @discussion Associate current layer's packet unique protocol sequenceNumber to another layer's message UUID + * Support fragmentation and re-assembly (for layers like BT), map byte-sequence range (2 byte) of current and other layer data + * + * @param LOGOBJECT os_log_t object to write data into + * @param LABEL string optional layer-specific label for readability/debugability, this is ignored by the parser. Can not contain {} + * @param CUR_PROTOCOL_ID uint8_t ID of current layer from MPKL_PROTOCOL_XXX defines above + * @param PREV_PROTOCOL_ID uint8_t ID of other layer being associated + * @param PREV_UUID uuid_t Other layer 16-byte UUID of message + * @param PREV_RANGE_START uint16_t Current layer byte sequence range start + * @param PREV_RANGE_END uint16_t Current layer byte sequence range end + * @param PREV_LEN uint16_t PRevious layer message length + * @param CUR_SEQ_N uint16_t Current layer message length + * @param CUR_RANGE_START uint16_t Current layer byte sequence range start + * @param CUR_RANGE_END uint16_t Current layer byte sequence range end + * @param LOG_SEQN uint8_t Incrementing sequence number to detect logging system drop of messages + */ + +#define MPKL_SEQN_UUID_ASSOCIATE_PREV(LOGOBJECT, CUR_PROTOCOL_ID, PREV_PROTOCOL_ID, PREV_UUID, PREV_RANGE_START, PREV_RANGE_END, PREV_LEN, CUR_SEQ_N, CUR_RANGE_START, CUR_RANGE_END, LOG_SEQN) \ +os_log(LOGOBJECT, "7 {Send, curProtocol: %hhu, prevProtocol: %hhu, prevUUID: %{public,uuid_t}.16P, prevStart: %hu, prevEnd: %hu, prevPktLen %hu, curSeqN: %hu, curStart: %hu, curEnd: %hu, logSeqn: %hhu}", CUR_PROTOCOL_ID, PREV_PROTOCOL_ID, PREV_UUID, PREV_RANGE_START, PREV_RANGE_END, PREV_LEN, CUR_SEQ_N, CUR_RANGE_START, CUR_RANGE_END, LOG_SEQN) + +/*! + * @macro MPKL_SEQN_UUID_ASSOCIATE_NEXT + * @discussion Associate current layer's packet unique protocol sequenceNumber to another layer's message UUID + * Support fragmentation and re-assembly (for layers like BT), map byte-sequence range (2 byte) of current and other layer data + * + * @param LOGOBJECT os_log_t object to write data into + * @param LABEL string optional layer-specific label for readability/debugability, this is ignored by the parser. Can not contain {} + * @param CUR_PROTOCOL_ID uint8_t ID of current layer from MPKL_PROTOCOL_XXX defines above + * @param NEXT_PROTOCOL_ID uint8_t ID of other layer being associated + * @param NEXT_UUID uuid_t Other layer 16-byte UUID of message + * @param NEXT_RANGE_START uint16_t Current layer byte sequence range start + * @param NEXT_RANGE_END uint16_t Current layer byte sequence range end + * @param NEXT_LEN uint16_t Current layer message length + * @param CUR_SEQ_N uint16_t Current layer message length + * @param CUR_RANGE_START uint16_t Current layer byte sequence range start + * @param CUR_RANGE_END uint16_t Current layer byte sequence range end + * @param LOG_SEQN uint8_t Incrementing sequence number to detect logging system drop of messages + */ + +#define MPKL_SEQN_UUID_ASSOCIATE_NEXT(LOGOBJECT, CUR_PROTOCOL_ID, NEXT_PROTOCOL_ID, NEXT_UUID, NEXT_RANGE_START, NEXT_RANGE_END, NEXT_LEN, CUR_SEQ_N, CUR_RANGE_START, CUR_RANGE_END, LOG_SEQN) \ +os_log(LOGOBJECT, "8 {Receive, curProtocol: %hhu, nextProtocol: %hhu, nextUUID: %{public,uuid_t}.16P, nextStart: %hu, nextEnd: %hu, nextPktLen %hu, curSeqN: %hu, curStart: %hu, curEnd: %hu, logSeqn: %hhu}", CUR_PROTOCOL_ID, NEXT_PROTOCOL_ID, NEXT_UUID, NEXT_RANGE_START, NEXT_RANGE_END, NEXT_LEN, CUR_SEQ_N, CUR_RANGE_START, CUR_RANGE_END, LOG_SEQN) + +/* + * APIs to indicate transitioning of messages; example in/out of a layer + */ + +/*! + * @macro MPKL_UUID_NEXT + * @discussion Log the transition of current layer's message with UUID to next layer + * + * @param LOGOBJECT os_log_t object to write data into + * @param LABEL string optional layer-specific label for readability/debugability, this is ignored by the parser. Can not contain {} + * @param CUR_PROTOCOL_ID uint8_t ID of current layer from MPKL_PROTOCOL_XXX defines above + * @param NEXT_PROTOCOL_ID uint8_t ID of next layer + * @param CUR_UUID uuid_t Current layer 16-byte UUID of message + * @param CUR_LEN uint32_t Current layer message length + * @param LOG_SEQN uint8_t Incrementing sequence number to detect logging system drop of messages + */ + +#define MPKL_UUID_NEXT(LOGOBJECT, CUR_PROTOCOL_ID, NEXT_PROTOCOL_ID, CUR_UUID, CUR_LEN, LOG_SEQN) \ +os_log(LOGOBJECT, "9 {curProtocol: %hhu, nextProtocol: %hhu, curUUID: %{public,uuid_t}.16P, curPktLen: %u, logSeqn: %hhu}", CUR_PROTOCOL_ID, NEXT_PROTOCOL_ID, CUR_UUID, CUR_LEN, LOG_SEQN) + +/*! + * @macro MPKL_SEQRANGE_NEXT + * @discussion Log the transition of current layer's message with UUID to next layer + * + * @param LOGOBJECT os_log_t object to write data into + * @param LABEL string optional layer-specific label for readability/debugability, this is ignored by the parser. Can not contain {} + * @param CUR_PROTOCOL_ID uint8_t ID of current layer from MPKL_PROTOCOL_XXX defines above + * @param NEXT_PROTOCOL_ID uint8_t ID of next layer + * @param CUR_RANGE_START uint16_t Current layer byte sequence range start + * @param CUR_RANGE_END uint16_t Current layer byte sequence range end + * @param LOG_SEQN uint8_t Incrementing sequence number to detect logging system drop of messages + */ + +#define MPKL_SEQRANGE_NEXT(LOGOBJECT, CUR_PROTOCOL_ID, NEXT_PROTOCOL_ID, CUR_UUID, CUR_LEN, LOG_SEQN) \ +os_log(LOGOBJECT, "10 {curProtocol: %hhu, nextProtocol: %hhu, curUUID: %{public,uuid_t}.16P, curPktLen: %hu, logSeqn: %hhu}", CUR_PROTOCOL_ID, NEXT_PROTOCOL_ID, CUR_UUID, CUR_LEN, LOG_SEQN) + + +/*! + * @macro MPKL_UUID_PREV + * @discussion Log the transition of previous layer's message with UUID to current layer + * + * @param LOGOBJECT os_log_t object to write data into + * @param LABEL string optional layer-specific label for readability/debugability, this is ignored by the parser. Can not contain {} + * @param CUR_PROTOCOL_ID uint8_t ID of current layer from MPKL_PROTOCOL_XXX defines above + * @param PREV_PROTOCOL_ID uint8_t ID of other layer being associated + * @param PREV_UUID uuid_t Previous layer 16-byte UUID of message + * @param PREV_LEN uint16_t Previous layer message length + * @param LOG_SEQN uint8_t Incrementing sequence number to detect logging system drop of messages + */ + +#define MPKL_UUID_PREV(LOGOBJECT, CUR_PROTOCOL_ID, PREV_PROTOCOL_ID, PREV_UUID, PREV_LEN, LOG_SEQN) \ +os_log(LOGOBJECT, "11 {curProtocol: %hhu, prevProtocol: %hhu, prevUUID: %{public,uuid_t}.16P, prevPktLen: %hu, logSeqn: %hhu}", CUR_PROTOCOL_ID, PREV_PROTOCOL_ID, PREV_UUID, PREV_LEN, LOG_SEQN) + +/* + * APIs to indicate a Task Start/End + */ + +/*! + * @macro MPKL_TASK_START + * @discussion Log the start of a task + * + * @param LOGOBJECT os_log_t object to write data into + * @param LABEL string optional layer-specific label for readability/debugability, this is ignored by the parser. Can not contain {} + * @param CLIENT_BUNDLE_ID NSString bundleID of the client + * @param TASK_UUID uuid_t 16-byte UUID of NSURL task + * @param CONN_UUID uuid_t 16-byte UUID of associated libnetcore connection + * @param LOG_SEQN uint8_t Incrementing sequence number to detect logging system drop of messages + */ + +#define MPKL_TASK_START(LOGOBJECT, CLIENT_BUNDLE_ID, TASK_UUID, CONN_UUID, LOG_SEQN) \ +os_log(LOGOBJECT, "12 {startBundleID: %@, taskUUID: %{public,uuid_t}.16P, connUUID: %{public,uuid_t}.16P, logSeqn: %hhu}", CLIENT_BUNDLE_ID, TASK_UUID, CONN_UUID, LOG_SEQN) + +/*! + * @macro MPKL_TASK_START + * @discussion Log the end of a task + * + * @param LOGOBJECT os_log_t object to write data into + * @param LABEL string optional layer-specific label for readability/debugability, this is ignored by the parser. Can not contain {} + * @param CLIENT_BUNDLE_ID NSString bundleID of the client + * @param TASK_UUID uuid_t 16-byte UUID of NSURL task + * @param CONN_UUID uuid_t 16-byte UUID of associated libnetcore connection + * @param LOG_SEQN uint8_t Incrementing sequence number to detect logging system drop of messages + */ + +#define MPKL_TASK_END(LOGOBJECT, CLIENT_BUNDLE_ID, TASK_UUID, CONN_UUID, LOG_SEQN) \ +os_log(LOGOBJECT, "13 {endBundleID: %@, taskUUID: %{public,uuid_t}.16P, connUUID: %{public,uuid_t}.16P, logSeqn: %hhu}", CLIENT_BUNDLE_ID, TASK_UUID, CONN_UUID, LOG_SEQN) + +/*! + * @macro MPKL_SEQN_INCOMPLETE_PREV + * @discussion An incomplete packet was sent with a given protocol sequence number and couldn't be associated to another protocol. + * The incomplete packet is saved, its byte sequence range is logged and it is associated once more data arrives. + * + */ + +#define MPKL_SEQN_INCOMPLETE_PREV(LOGOBJECT, CUR_PROTOCOL_ID, PREV_PROTOCOL_ID, CUR_SEQ_N, CUR_RANGE_START, CUR_RANGE_END, PREV_RANGE_START, PREV_RANGE_END, LOG_SEQN) \ +os_log(LOGOBJECT, "14 {Send Incomplete. curProtocol: %hhu, prevProtocol: %hhu, curSeqN: %hu, curStart: %hu, curEnd: %hu, prevStart: %hu, prevEnd: %hu, logSeqn: %hhu}", CUR_PROTOCOL_ID, PREV_PROTOCOL_ID, CUR_SEQ_N, CUR_RANGE_START, CUR_RANGE_END, PREV_RANGE_START, PREV_RANGE_END, LOG_SEQN) + +/*! + * @macro MPKL_SEQN_INCOMPLETE_NEXT + * @discussion An incomplete packet was sent with a given protocol sequence number and couldn't be associated to another protocol. + * The incomplete packet is saved, its byte sequence range is logged and it is associated once more data arrives. + * + */ + +#define MPKL_SEQN_INCOMPLETE_NEXT(LOGOBJECT, CUR_PROTOCOL_ID, NEXT_PROTOCOL_ID, CUR_SEQ_N, CUR_RANGE_START, CUR_RANGE_END, NEXT_RANGE_START, NEXT_RANGE_END, LOG_SEQN) \ +os_log(LOGOBJECT, "15 {Receive Incomplete. curProtocol: %hhu, nextProtocol: %hhu, curSeqN: %hu, curStart: %hu, curEnd: %hu, nextStart: %hu, nextEnd: %hu, logSeqn: %hhu}", CUR_PROTOCOL_ID, NEXT_PROTOCOL_ID, CUR_SEQ_N, CUR_RANGE_START, CUR_RANGE_END, NEXT_RANGE_START, NEXT_RANGE_END, LOG_SEQN) + +#ifdef KERNEL +/*! + * @macro MPKL_TCP_SEND + * @discussion Associate data sent by a process with a TCP connection + * + * @param LOGOBJECT os_log_t object to write data into + * @param PREV_PROTOCOL_ID uint8_t Protocol identifier passed by the process (may be 0) + * @param PREV_UUID uuid_t UUID passed by the process (may be null UUID) + * @param LOCAL_PORT uint16_t Local port of the TCP connection + * @param REMOTE_PORT uint16_t Remote port of the TCP connection + * @param TCP_SEQ uint32_t TCP sequence number of the first byte of the data being sent by the process + * @param TCP_LEN uint32_t Length of the data + * @param PID uint16_t pid of the process using the TCP connection + * @param LOG_SEQN uint8_t Incrementing sequence number to detect logging system drop of messages + */ +#define MPKL_TCP_SEND(LOGOBJECT, PREV_PROTOCOL_ID, PREV_UUID, LOCAL_PORT, REMOTE_PORT, TCP_SEQ, TCP_LEN, PID, LOG_SEQN) \ + os_log_with_type(LOGOBJECT, net_mpklog_type, \ + "16 {curProtocol: 100, prevProtocol: %hhu, " \ + "prevUUID: " \ + "%02X%02X%02X%02X-%02X%02X-%02X%02X-%02X%02X-%02X%02X%02X%02X%02X%02X, " \ + "localPort: %hu, remotePort: %hu, tcpSeq: %u, length: %u, " \ + "pid: %hu, logSeqn: %hhu}", \ + PREV_PROTOCOL_ID, \ + PREV_UUID[0], PREV_UUID[1], PREV_UUID[2], PREV_UUID[3], PREV_UUID[4], PREV_UUID[5], PREV_UUID[6], PREV_UUID[7], \ + PREV_UUID[8], PREV_UUID[9], PREV_UUID[10], PREV_UUID[11], PREV_UUID[12], PREV_UUID[13], PREV_UUID[14], PREV_UUID[15], \ + LOCAL_PORT, REMOTE_PORT, TCP_SEQ, TCP_LEN, \ + (uint16_t)PID, LOG_SEQN) + +/*! + * @macro MPKL_TCP_INPUT + * @discussion Associate TCP segment being with a packet received to a TCP connection + * + * @param LOGOBJECT os_log_t object to write data into + * @param LOCAL_PORT uint16_t Local port in the TCP header of the segment + * @param REMOTE_PORT uint16_t Remote port in the TCP header of the segment + * @param TCP_SEQ uint32_t Sequence number in the TCP header of the segment + * @param TCP_ACK uint32_t Acknowledgement number in the TCP header of the segment + * @param TCP_LEN uint16_t Length in the TCP header of the segment + * @param TCP_FLAGS uint8_t Flags of the TCP header of the segment + * @param PID uint16_t pid of the process using the TCP connection + * @param LOG_SEQN uint8_t Incrementing sequence number to detect logging system drop of messages + */ +#define MPKL_TCP_INPUT(LOGOBJECT, LOCAL_PORT, REMOTE_PORT, TCP_SEQ, TCP_ACK, TCP_LEN, TCP_FLAGS, PID, LOG_SEQN) \ + os_log_with_type(LOGOBJECT, net_mpklog_type, \ + "17 {curProtocol: 100, prevProtocol: 80, " \ + "localPort: %hu, remotePort: %hu, tcpSeq: %u, tcpAck: %u, tcpLen: %hu, tcpFlags: 0x%02x, " \ + "pid: %hu, logSeqn: %hhu}", \ + LOCAL_PORT, REMOTE_PORT, TCP_SEQ, TCP_ACK, (uint16_t)TCP_LEN, TCP_FLAGS, \ + (uint16_t)PID, LOG_SEQN) + +/*! + * @macro MPKL_ESP_OUTPUT_TCP + * @discussion Associate a packet with a TCP segment being sent to an ESP packet + * + * @param LOGOBJECT os_log_t object to write data into + * @param SPI uint32_t SPI field in the ESP header + * @param ESP_SEQ uint32_t Sequence number field in the ESP header + * @param LOCAL_PORT uint16_t Local port of the TCP connection + * @param REMOTE_PORT uint16_t Remote port of the TCP connection + * @param TCP_SEQ uint32_t Sequence number in the TCP header of the segment + * @param TCP_ACK uint32_t Acknowledgement number in the TCP header of the segment + * @param TCP_LEN uint16_t Length in the TCP header of the segment + * @param TCP_FLAGS uint8_t Flags of the TCP header of the segment + */ +#define MPKL_ESP_OUTPUT_TCP(LOGOBJECT, SPI, ESP_SEQ, LOCAL_PORT, REMOTE_PORT, TCP_SEQ, TCP_ACK, TCP_LEN, TCP_FLAGS) \ + os_log_with_type(LOGOBJECT, net_mpklog_type, \ + "18 {curProtocol: 80, spi: 0x%X, espSeq: %u, PayloadProtocol: 100, " \ + "localPort: %hu, remotePort: %hu, tcpSeq: %u, tcpAck: %u, tcpLen: %hu, tcpFlags: 0x%02x}", \ + SPI, ESP_SEQ, \ + LOCAL_PORT, REMOTE_PORT, TCP_SEQ, TCP_ACK, (uint16_t)TCP_LEN, TCP_FLAGS) + +/*! + * @macro MPKL_ESP_INPUT_TCP + * @discussion Associate an ESP packet for TCP to the TCP segment + * + * @param LOGOBJECT os_log_t object to write data into + * @param SPI uint32_t SPI field in the ESP header + * @param ESP_SEQ uint32_t Sequence number field in the ESP header + * @param LOCAL_PORT uint16_t Local port of the TCP connection + * @param REMOTE_PORT uint16_t Remote port of the TCP connection + * @param TCP_SEQ uint32_t Sequence number in the TCP header of the segment + * @param TCP_LEN uint16_t Length in the TCP header of the segment + */ +#define MPKL_ESP_INPUT_TCP(LOGOBJECT, SPI, ESP_SEQ, LOCAL_PORT, REMOTE_PORT, TCP_SEQ, TCP_LEN) \ + os_log_with_type(LOGOBJECT, net_mpklog_type, \ + "19 {curProtocol: 80 spi: 0x%X, espSeq: %u, PayloadProtocol: 100, " \ + "localPort: %hu, remotePort: %hu, tcpSeq: %u, tcpLen: %hu}", \ + SPI, ESP_SEQ, \ + LOCAL_PORT, REMOTE_PORT, TCP_SEQ, (uint16_t)TCP_LEN) +#endif /* KERNEL */ + +/*! + * @macro MPKL_BYTERANGE_UUID_ASSOCIATE + * @discussion Associate current layer's byte range (start/end) to current layer's UUID + * + * @param LOGOBJECT os_log_t object to write data into + * @param CUR_PROTOCOL_ID uint8_t ID of current layer from MPKL_PROTOCOL_XXX defines above + * @param NEXT_PROTOCOL_ID uint8_t ID of other layer being associated + * @param CUR_UUID uuid_t Current layer 16-byte UUID of endpoint handler + * @param CUR_RANGE_START uint64_t Current layer byte range start + * @param CUR_RANGE_END uint64_t Current layer byte range end + * @param LOG_SEQN uint8_t Incrementing sequence number to detect logging system drop of messages + */ +#define MPKL_BYTERANGE_UUID_ASSOCIATE(LOGOBJECT, CUR_PROTOCOL_ID, NEXT_PROTOCOL_ID, CUR_UUID, CUR_RANGE_START, CUR_RANGE_END, LOG_SEQN) \ +os_log(LOGOBJECT, "32 {curProtocol: %hhu, nextProtocol: %hhu, curUUID: %{public}.16P, curStart: %llu, curEnd: %llu, logSeqn: %hhu}", CUR_PROTOCOL_ID, NEXT_PROTOCOL_ID, CUR_UUID, CUR_RANGE_START, CUR_RANGE_END, LOG_SEQN) + +/*! + * @macro MPKL_UUID_ONLY_ASSOCIATE_NEXT + * @discussion Associate current layer's UUID to next layer's UUID + * + * @param LOGOBJECT os_log_t object to write data into + * @param LABEL string optional layer-specific label for readability/debugability, this is ignored by the parser. Can not contain {} + * @param CUR_PROTOCOL_ID uint8_t ID of current layer from MPKL_PROTOCOL_XXX defines above + * @param NEXT_PROTOCOL_ID uint8_t ID of next layer being associated + * @param CUR_UUID uuid_t Current layer 16-byte UUID + * @param NEXT_UUID uuid_t Next layer 16-byte UUID + * @param LOG_SEQ uint8_t Incrementing sequence number to detect logging system drop of messages + */ +#define MPKL_UUID_ONLY_ASSOCIATE_NEXT(LOGOBJECT, CUR_PROTOCOL_ID, NEXT_PROTOCOL_ID, CUR_UUID, NEXT_UUID, LOG_SEQN) \ +os_log(LOGOBJECT, "33 {curProtocol: %hhu, nextProtocol: %hhu, curUUID: %{public}.16P, nextUUID: %{public}.16P, logSeqn: %hhu}", CUR_PROTOCOL_ID, NEXT_PROTOCOL_ID, CUR_UUID, NEXT_UUID, LOG_SEQN) + +#ifdef KERNEL_PRIVATE +extern int net_mpklog_enabled; +extern int net_mpklog_type; +#endif /* KERNEL_PRIVATE */ + +#endif /* _NET_MULTI_LAYER_PKT_LOG_H_ */ diff --git a/bsd/net/nat464_utils.c b/bsd/net/nat464_utils.c index fd99d34b5..30370a84e 100644 --- a/bsd/net/nat464_utils.c +++ b/bsd/net/nat464_utils.c @@ -808,9 +808,13 @@ nat464_translate_proto(pbuf_t *pbuf, struct nat464_addr *osrc, proto = &ip6h->ip6_nxt; break; } + default: + return NT_DROP; /* We should never come here */ } - VERIFY(*proto == oproto); + if (*proto != oproto) { + return NT_DROP; + } /* * We may want to manipulate csum flags in some cases diff --git a/bsd/net/necp.c b/bsd/net/necp.c index 513cea4ae..18c84be93 100644 --- a/bsd/net/necp.c +++ b/bsd/net/necp.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2013-2018 Apple Inc. All rights reserved. + * Copyright (c) 2013-2019 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -43,6 +43,9 @@ #include #include #include +#include +#include +#include #include #include #include @@ -61,6 +64,10 @@ #include #include #include +#include +#include +#include +#include #include #include @@ -136,6 +143,14 @@ u_int32_t necp_drop_all_level = 0; u_int32_t necp_pass_loopback = 1; // 0=Off, 1=On u_int32_t necp_pass_keepalives = 1; // 0=Off, 1=On +u_int32_t necp_pass_interpose = 1; // 0=Off, 1=On + +u_int32_t necp_drop_unentitled_order = 0; +#ifdef XNU_TARGET_OS_WATCH +u_int32_t necp_drop_unentitled_level = NECP_SESSION_PRIORITY_CONTROL + 1; // Block all unentitled traffic from policies below control level +#else // XNU_TARGET_OS_WATCH +u_int32_t necp_drop_unentitled_level = 0; +#endif // XNU_TARGET_OS_WATCH u_int32_t necp_debug = 0; // 0=None, 1=Basic, 2=EveryMatch @@ -182,6 +197,9 @@ u_int32_t necp_session_count = 0; #define IS_NECP_ROUTE_RULE_ALLOW_OR_DENY(x) ((x) == NECP_ROUTE_RULE_DENY_INTERFACE || (x) == NECP_ROUTE_RULE_ALLOW_INTERFACE) +#define IS_NECP_DEST_IN_LOCAL_NETWORKS(rt) \ + ((rt) != NULL && !((rt)->rt_flags & RTF_GATEWAY) && ((rt)->rt_ifa && (rt)->rt_ifa->ifa_ifp && !((rt)->rt_ifa->ifa_ifp->if_flags & IFF_POINTOPOINT))) + #define NECP_KERNEL_CONDITION_ALL_INTERFACES 0x000001 #define NECP_KERNEL_CONDITION_BOUND_INTERFACE 0x000002 #define NECP_KERNEL_CONDITION_PROTOCOL 0x000004 @@ -203,6 +221,12 @@ u_int32_t necp_session_count = 0; #define NECP_KERNEL_CONDITION_ENTITLEMENT 0x040000 #define NECP_KERNEL_CONDITION_CUSTOM_ENTITLEMENT 0x080000 #define NECP_KERNEL_CONDITION_AGENT_TYPE 0x100000 +#define NECP_KERNEL_CONDITION_HAS_CLIENT 0x200000 +#define NECP_KERNEL_CONDITION_LOCAL_NETWORKS 0x400000 +#define NECP_KERNEL_CONDITION_CLIENT_FLAGS 0x800000 +#define NECP_KERNEL_CONDITION_LOCAL_EMPTY 0x1000000 +#define NECP_KERNEL_CONDITION_REMOTE_EMPTY 0x2000000 +#define NECP_KERNEL_CONDITION_PLATFORM_BINARY 0x4000000 #define NECP_MAX_POLICY_RESULT_SIZE 512 #define NECP_MAX_ROUTE_RULES_ARRAY_SIZE 1024 @@ -256,8 +280,13 @@ struct necp_socket_info { u_int32_t application_id; u_int32_t real_application_id; u_int32_t account_id; + u_int32_t drop_order; + u_int32_t client_flags; char *domain; errno_t cred_result; + unsigned has_client : 1; + unsigned is_platform_binary : 1; + unsigned __pad_bits : 6; }; static kern_ctl_ref necp_kctlref; @@ -273,6 +302,8 @@ static lck_attr_t *necp_route_rule_mtx_attr = NULL; static lck_grp_t *necp_route_rule_mtx_grp = NULL; decl_lck_rw_data(static, necp_route_rule_lock); +os_refgrp_decl(static, necp_refgrp, "NECPRefGroup", NULL); + /* * On modification, invalidate cached lookups by bumping the generation count. * Other calls will need to take the slowpath of taking @@ -285,6 +316,22 @@ static volatile int32_t necp_kernel_socket_policies_gencount; } \ } while (0) +/* + * Drop-all Bypass: + * Allow priviledged processes to bypass the default drop-all + * via entitlement check. For OSX, since entitlement check is + * not supported for configd, configd signing identity is checked + * instead. + */ +#define SIGNING_ID_CONFIGD "com.apple.configd" +#define SIGNING_ID_CONFIGD_LEN (sizeof(SIGNING_ID_CONFIGD) - 1) + +typedef enum { + NECP_DROP_ALL_BYPASS_CHECK_RESULT_NONE = 0, + NECP_DROP_ALL_BYPASS_CHECK_RESULT_TRUE = 1, + NECP_DROP_ALL_BYPASS_CHECK_RESULT_FALSE = 2, +} necp_drop_all_bypass_check_result_t; + static u_int32_t necp_kernel_application_policies_condition_mask; static size_t necp_kernel_application_policies_count; static u_int32_t necp_kernel_socket_policies_condition_mask; @@ -310,6 +357,11 @@ static LIST_HEAD(_necpkernelipoutputpolicies, necp_kernel_ip_output_policy) necp #define NECP_KERNEL_IP_OUTPUT_POLICIES_MAP_NUM_ID_BUCKETS 5 #define NECP_IP_OUTPUT_MAP_ID_TO_BUCKET(id) (id ? (id%(NECP_KERNEL_IP_OUTPUT_POLICIES_MAP_NUM_ID_BUCKETS - 1) + 1) : 0) static struct necp_kernel_ip_output_policy **necp_kernel_ip_output_policies_map[NECP_KERNEL_IP_OUTPUT_POLICIES_MAP_NUM_ID_BUCKETS]; +static struct necp_kernel_socket_policy pass_policy = +{ + .id = NECP_KERNEL_POLICY_ID_NO_MATCH, + .result = NECP_KERNEL_POLICY_RESULT_PASS, +}; static struct necp_session *necp_create_session(void); static void necp_delete_session(struct necp_session *session); @@ -338,11 +390,11 @@ static bool necp_policy_mark_all_for_deletion(struct necp_session *session); static bool necp_policy_delete(struct necp_session *session, struct necp_session_policy *policy); static void necp_policy_apply_all(struct necp_session *session); -static necp_kernel_policy_id necp_kernel_socket_policy_add(necp_policy_order order, u_int32_t session_order, int session_pid, u_int32_t condition_mask, u_int32_t condition_negated_mask, necp_app_id cond_app_id, necp_app_id cond_real_app_id, char *cond_custom_entitlement, u_int32_t cond_account_id, char *domain, pid_t cond_pid, uid_t cond_uid, ifnet_t cond_bound_interface, struct necp_policy_condition_tc_range cond_traffic_class, u_int16_t cond_protocol, union necp_sockaddr_union *cond_local_start, union necp_sockaddr_union *cond_local_end, u_int8_t cond_local_prefix, union necp_sockaddr_union *cond_remote_start, union necp_sockaddr_union *cond_remote_end, u_int8_t cond_remote_prefix, struct necp_policy_condition_agent_type *cond_agent_type, necp_kernel_policy_result result, necp_kernel_policy_result_parameter result_parameter); +static necp_kernel_policy_id necp_kernel_socket_policy_add(necp_policy_order order, u_int32_t session_order, int session_pid, u_int32_t condition_mask, u_int32_t condition_negated_mask, necp_app_id cond_app_id, necp_app_id cond_real_app_id, char *cond_custom_entitlement, u_int32_t cond_account_id, char *domain, pid_t cond_pid, uid_t cond_uid, ifnet_t cond_bound_interface, struct necp_policy_condition_tc_range cond_traffic_class, u_int16_t cond_protocol, union necp_sockaddr_union *cond_local_start, union necp_sockaddr_union *cond_local_end, u_int8_t cond_local_prefix, union necp_sockaddr_union *cond_remote_start, union necp_sockaddr_union *cond_remote_end, u_int8_t cond_remote_prefix, struct necp_policy_condition_agent_type *cond_agent_type, u_int32_t cond_client_flags, necp_kernel_policy_result result, necp_kernel_policy_result_parameter result_parameter); static bool necp_kernel_socket_policy_delete(necp_kernel_policy_id policy_id); static bool necp_kernel_socket_policies_reprocess(void); static bool necp_kernel_socket_policies_update_uuid_table(void); -static inline struct necp_kernel_socket_policy *necp_socket_find_policy_match_with_info_locked(struct necp_kernel_socket_policy **policy_search_array, struct necp_socket_info *info, necp_kernel_policy_filter *return_filter, u_int32_t *return_route_rule_id, necp_kernel_policy_result *return_service_action, necp_kernel_policy_service *return_service, u_int32_t *return_netagent_array, u_int32_t *return_netagent_use_flags_array, size_t netagent_array_count, struct necp_client_parameter_netagent_type *required_agent_types, u_int32_t num_required_agent_types, proc_t proc, necp_kernel_policy_id *skip_policy_id); +static inline struct necp_kernel_socket_policy *necp_socket_find_policy_match_with_info_locked(struct necp_kernel_socket_policy **policy_search_array, struct necp_socket_info *info, necp_kernel_policy_filter *return_filter, u_int32_t *return_route_rule_id_array, size_t *return_route_rule_id_array_count, size_t route_rule_id_array_count, necp_kernel_policy_result *return_service_action, necp_kernel_policy_service *return_service, u_int32_t *return_netagent_array, u_int32_t *return_netagent_use_flags_array, size_t netagent_array_count, struct necp_client_parameter_netagent_type *required_agent_types, u_int32_t num_required_agent_types, proc_t proc, necp_kernel_policy_id *skip_policy_id, struct rtentry *rt, necp_kernel_policy_result *return_drop_dest_policy_result, necp_drop_all_bypass_check_result_t *return_drop_all_bypass); static necp_kernel_policy_id necp_kernel_ip_output_policy_add(necp_policy_order order, necp_policy_order suborder, u_int32_t session_order, int session_pid, u_int32_t condition_mask, u_int32_t condition_negated_mask, necp_kernel_policy_id cond_policy_id, ifnet_t cond_bound_interface, u_int32_t cond_last_interface_index, u_int16_t cond_protocol, union necp_sockaddr_union *cond_local_start, union necp_sockaddr_union *cond_local_end, u_int8_t cond_local_prefix, union necp_sockaddr_union *cond_remote_start, union necp_sockaddr_union *cond_remote_end, u_int8_t cond_remote_prefix, necp_kernel_policy_result result, necp_kernel_policy_result_parameter result_parameter); static bool necp_kernel_ip_output_policy_delete(necp_kernel_policy_id policy_id); @@ -353,6 +405,7 @@ static bool necp_is_range_in_range(struct sockaddr *inner_range_start, struct so static bool necp_is_addr_in_subnet(struct sockaddr *addr, struct sockaddr *subnet_addr, u_int8_t subnet_prefix); static int necp_addr_compare(struct sockaddr *sa1, struct sockaddr *sa2, int check_port); static bool necp_buffer_compare_with_bit_prefix(u_int8_t *p1, u_int8_t *p2, u_int32_t bits); +static bool necp_addr_is_empty(struct sockaddr *addr); static bool necp_is_loopback(struct sockaddr *local_addr, struct sockaddr *remote_addr, struct inpcb *inp, struct mbuf *packet); static bool necp_is_intcoproc(struct inpcb *inp, struct mbuf *packet); @@ -360,8 +413,8 @@ struct necp_uuid_id_mapping { LIST_ENTRY(necp_uuid_id_mapping) chain; uuid_t uuid; u_int32_t id; - u_int32_t refcount; - u_int32_t table_refcount; // Add to UUID policy table count + os_refcnt_t refcount; + u_int32_t table_usecount; // Add to UUID policy table count }; static size_t necp_num_uuid_app_id_mappings; static bool necp_uuid_app_id_mappings_dirty; @@ -383,7 +436,7 @@ struct necp_string_id_mapping { LIST_ENTRY(necp_string_id_mapping) chain; char *string; necp_app_id id; - u_int32_t refcount; + os_refcnt_t refcount; }; static LIST_HEAD(necp_string_id_mapping_list, necp_string_id_mapping) necp_account_id_list; static u_int32_t necp_create_string_to_id_mapping(struct necp_string_id_mapping_list *list, char *domain); @@ -412,9 +465,10 @@ struct necp_route_rule { u_int8_t wifi_action; u_int8_t wired_action; u_int8_t expensive_action; + u_int8_t constrained_action; u_int exception_if_indices[MAX_ROUTE_RULE_INTERFACES]; u_int8_t exception_if_actions[MAX_ROUTE_RULE_INTERFACES]; - u_int32_t refcount; + os_refcnt_t refcount; }; static LIST_HEAD(necp_route_rule_list, necp_route_rule) necp_route_rules; static u_int32_t necp_create_route_rule(struct necp_route_rule_list *list, u_int8_t *route_rules_array, u_int32_t route_rules_array_size); @@ -434,17 +488,30 @@ static u_int32_t necp_create_aggregate_route_rule(u_int32_t *rule_ids); // Sysctl definitions static int sysctl_handle_necp_level SYSCTL_HANDLER_ARGS; +static int sysctl_handle_necp_unentitled_level SYSCTL_HANDLER_ARGS; SYSCTL_NODE(_net, OID_AUTO, necp, CTLFLAG_RW | CTLFLAG_LOCKED, 0, "NECP"); SYSCTL_INT(_net_necp, NECPCTL_PASS_LOOPBACK, pass_loopback, CTLFLAG_LOCKED | CTLFLAG_RW, &necp_pass_loopback, 0, ""); SYSCTL_INT(_net_necp, NECPCTL_PASS_KEEPALIVES, pass_keepalives, CTLFLAG_LOCKED | CTLFLAG_RW, &necp_pass_keepalives, 0, ""); +SYSCTL_INT(_net_necp, NECPCTL_PASS_INTERPOSE, pass_interpose, CTLFLAG_LOCKED | CTLFLAG_RW, &necp_pass_interpose, 0, ""); SYSCTL_INT(_net_necp, NECPCTL_DEBUG, debug, CTLFLAG_LOCKED | CTLFLAG_RW, &necp_debug, 0, ""); +SYSCTL_PROC(_net_necp, NECPCTL_DROP_UNENTITLED_LEVEL, drop_unentitled_level, CTLTYPE_INT | CTLFLAG_LOCKED | CTLFLAG_RW, &necp_drop_unentitled_level, 0, &sysctl_handle_necp_unentitled_level, "IU", ""); SYSCTL_PROC(_net_necp, NECPCTL_DROP_ALL_LEVEL, drop_all_level, CTLTYPE_INT | CTLFLAG_LOCKED | CTLFLAG_RW, &necp_drop_all_level, 0, &sysctl_handle_necp_level, "IU", ""); SYSCTL_LONG(_net_necp, NECPCTL_SOCKET_POLICY_COUNT, socket_policy_count, CTLFLAG_LOCKED | CTLFLAG_RD, &necp_kernel_socket_policies_count, ""); SYSCTL_LONG(_net_necp, NECPCTL_SOCKET_NON_APP_POLICY_COUNT, socket_non_app_policy_count, CTLFLAG_LOCKED | CTLFLAG_RD, &necp_kernel_socket_policies_non_app_count, ""); SYSCTL_LONG(_net_necp, NECPCTL_IP_POLICY_COUNT, ip_policy_count, CTLFLAG_LOCKED | CTLFLAG_RD, &necp_kernel_ip_output_policies_count, ""); SYSCTL_INT(_net_necp, NECPCTL_SESSION_COUNT, session_count, CTLFLAG_LOCKED | CTLFLAG_RD, &necp_session_count, 0, ""); +static struct necp_drop_dest_policy necp_drop_dest_policy; +static int necp_drop_dest_debug = 0; // 0: off, 1: match, >1: every evaluation +SYSCTL_INT(_net_necp, OID_AUTO, drop_dest_debug, CTLFLAG_LOCKED | CTLFLAG_RW, &necp_drop_dest_debug, 0, ""); + +static int sysctl_handle_necp_drop_dest_level SYSCTL_HANDLER_ARGS; +SYSCTL_PROC(_net_necp, OID_AUTO, drop_dest_level, CTLTYPE_STRUCT | CTLFLAG_LOCKED | CTLFLAG_ANYBODY | CTLFLAG_RW, + 0, 0, &sysctl_handle_necp_drop_dest_level, "S,necp_drop_dest_level", ""); + +static bool necp_address_matches_drop_dest_policy(union necp_sockaddr_union *, u_int32_t); + // Session order allocation static u_int32_t necp_allocate_new_session_order(u_int32_t priority, u_int32_t control_unit) @@ -465,6 +532,9 @@ necp_allocate_new_session_order(u_int32_t priority, u_int32_t control_unit) static inline u_int32_t necp_get_first_order_for_priority(u_int32_t priority) { + if (priority == 0) { + return 0; + } return ((priority - 1) * 1000) + 1; } @@ -474,72 +544,76 @@ sysctl_handle_necp_level SYSCTL_HANDLER_ARGS { #pragma unused(arg1, arg2) int error = sysctl_handle_int(oidp, oidp->oid_arg1, oidp->oid_arg2, req); - if (necp_drop_all_level == 0) { - necp_drop_all_order = 0; + necp_drop_all_order = necp_get_first_order_for_priority(necp_drop_all_level); + return error; +} + +static int +sysctl_handle_necp_unentitled_level SYSCTL_HANDLER_ARGS +{ +#pragma unused(arg1, arg2) + int error = sysctl_handle_int(oidp, oidp->oid_arg1, oidp->oid_arg2, req); + necp_drop_unentitled_order = necp_get_first_order_for_priority(necp_drop_unentitled_level); + return error; +} + +// Use a macro here to avoid computing the kauth_cred_t when necp_drop_unentitled_level is 0 +static inline u_int32_t +_necp_process_drop_order_inner(kauth_cred_t cred) +{ + if (priv_check_cred(cred, PRIV_NET_PRIVILEGED_CLIENT_ACCESS, 0) != 0 && + priv_check_cred(cred, PRIV_NET_PRIVILEGED_SERVER_ACCESS, 0) != 0) { + return necp_drop_unentitled_order; } else { - necp_drop_all_order = necp_get_first_order_for_priority(necp_drop_all_level); + return 0; } - return error; } +#define necp_process_drop_order(_cred) (necp_drop_unentitled_order != 0 ? _necp_process_drop_order_inner(_cred) : necp_drop_unentitled_order) +#pragma GCC poison _necp_process_drop_order_inner + // Session fd -static int noop_read(struct fileproc *, struct uio *, int, vfs_context_t); -static int noop_write(struct fileproc *, struct uio *, int, vfs_context_t); -static int noop_ioctl(struct fileproc *, unsigned long, caddr_t, - vfs_context_t); -static int noop_select(struct fileproc *, int, void *, vfs_context_t); static int necp_session_op_close(struct fileglob *, vfs_context_t); -static int noop_kqfilter(struct fileproc *, struct knote *, - struct kevent_internal_s *, vfs_context_t); static const struct fileops necp_session_fd_ops = { - .fo_type = DTYPE_NETPOLICY, - .fo_read = noop_read, - .fo_write = noop_write, - .fo_ioctl = noop_ioctl, - .fo_select = noop_select, - .fo_close = necp_session_op_close, - .fo_kqfilter = noop_kqfilter, - .fo_drain = NULL, + .fo_type = DTYPE_NETPOLICY, + .fo_read = fo_no_read, + .fo_write = fo_no_write, + .fo_ioctl = fo_no_ioctl, + .fo_select = fo_no_select, + .fo_close = necp_session_op_close, + .fo_drain = fo_no_drain, + .fo_kqfilter = fo_no_kqfilter, }; -static int -noop_read(struct fileproc *fp, struct uio *uio, int flags, vfs_context_t ctx) -{ -#pragma unused(fp, uio, flags, ctx) - return ENXIO; -} - -static int -noop_write(struct fileproc *fp, struct uio *uio, int flags, - vfs_context_t ctx) -{ -#pragma unused(fp, uio, flags, ctx) - return ENXIO; -} - -static int -noop_ioctl(struct fileproc *fp, unsigned long com, caddr_t data, - vfs_context_t ctx) +static inline necp_drop_all_bypass_check_result_t +necp_check_drop_all_bypass_result(proc_t proc) { -#pragma unused(fp, com, data, ctx) - return ENOTTY; -} + if (proc == NULL) { + proc = current_proc(); + if (proc == NULL) { + return NECP_DROP_ALL_BYPASS_CHECK_RESULT_FALSE; + } + } -static int -noop_select(struct fileproc *fp, int which, void *wql, vfs_context_t ctx) -{ -#pragma unused(fp, which, wql, ctx) - return ENXIO; -} +#if defined(XNU_TARGET_OS_OSX) + const char *signing_id = NULL; + const bool isConfigd = (csproc_get_platform_binary(proc) && + (signing_id = cs_identity_get(proc)) && + (strlen(signing_id) == SIGNING_ID_CONFIGD_LEN) && + (memcmp(signing_id, SIGNING_ID_CONFIGD, SIGNING_ID_CONFIGD_LEN) == 0)); + if (isConfigd) { + return NECP_DROP_ALL_BYPASS_CHECK_RESULT_TRUE; + } +#endif -static int -noop_kqfilter(struct fileproc *fp, struct knote *kn, - struct kevent_internal_s *kev, vfs_context_t ctx) -{ -#pragma unused(fp, kn, kev, ctx) - return ENXIO; + const task_t task = proc_task(proc); + if (task == NULL || !IOTaskHasEntitlement(task, "com.apple.private.necp.drop_all_bypass")) { + return NECP_DROP_ALL_BYPASS_CHECK_RESULT_FALSE; + } else { + return NECP_DROP_ALL_BYPASS_CHECK_RESULT_TRUE; + } } int @@ -1124,6 +1198,117 @@ static errno_t necp_ctl_setopt(kern_ctl_ref kctlref, u_int32_t unit, void *uniti static bool necp_send_ctl_data(struct necp_session *session, u_int8_t *buffer, size_t buffer_size); +struct necp_resolver_key_state { + const struct ccdigest_info *digest_info; + uint8_t key[CCSHA256_OUTPUT_SIZE]; +}; +static struct necp_resolver_key_state s_necp_resolver_key_state; + +static void +necp_generate_resolver_key(void) +{ + s_necp_resolver_key_state.digest_info = ccsha256_di(); + cc_rand_generate(s_necp_resolver_key_state.key, sizeof(s_necp_resolver_key_state.key)); +} + +static void +necp_sign_update_context(const struct ccdigest_info *di, + cchmac_ctx_t ctx, + uuid_t client_id, + u_int8_t *query, + u_int32_t query_length, + u_int8_t *answer, + u_int32_t answer_length) +{ + const uint8_t context[32] = {[0 ... 31] = 0x20}; // 0x20 repeated 32 times + const char *context_string = "NECP Resolver Binder"; + uint8_t separator = 0; + cchmac_update(di, ctx, sizeof(context), context); + cchmac_update(di, ctx, strlen(context_string), context_string); + cchmac_update(di, ctx, sizeof(separator), &separator); + cchmac_update(di, ctx, sizeof(uuid_t), client_id); + cchmac_update(di, ctx, sizeof(query_length), &query_length); + cchmac_update(di, ctx, query_length, query); + cchmac_update(di, ctx, sizeof(answer_length), &answer_length); + cchmac_update(di, ctx, answer_length, answer); +} + +int +necp_sign_resolver_answer(uuid_t client_id, u_int8_t *query, u_int32_t query_length, + u_int8_t *answer, u_int32_t answer_length, + u_int8_t *tag, u_int32_t *out_tag_length) +{ + if (s_necp_resolver_key_state.digest_info == NULL) { + return EINVAL; + } + + if (query == NULL || + query_length == 0 || + answer == NULL || + answer_length == 0 || + tag == NULL || + out_tag_length == NULL) { + return EINVAL; + } + + size_t required_tag_length = s_necp_resolver_key_state.digest_info->output_size; + if (*out_tag_length < required_tag_length) { + return ERANGE; + } + + *out_tag_length = required_tag_length; + + cchmac_ctx_decl(s_necp_resolver_key_state.digest_info->state_size, + s_necp_resolver_key_state.digest_info->block_size, ctx); + cchmac_init(s_necp_resolver_key_state.digest_info, ctx, + sizeof(s_necp_resolver_key_state.key), + s_necp_resolver_key_state.key); + necp_sign_update_context(s_necp_resolver_key_state.digest_info, + ctx, client_id, query, query_length, + answer, answer_length); + cchmac_final(s_necp_resolver_key_state.digest_info, ctx, tag); + + return 0; +} + +bool +necp_validate_resolver_answer(uuid_t client_id, u_int8_t *query, u_int32_t query_length, + u_int8_t *answer, u_int32_t answer_length, + u_int8_t *tag, u_int32_t tag_length) +{ + if (s_necp_resolver_key_state.digest_info == NULL) { + return false; + } + + if (query == NULL || + query_length == 0 || + answer == NULL || + answer_length == 0 || + tag == NULL || + tag_length == 0) { + return false; + } + + size_t required_tag_length = s_necp_resolver_key_state.digest_info->output_size; + if (tag_length != required_tag_length) { + return false; + } + + uint8_t actual_tag[required_tag_length]; + + cchmac_ctx_decl(s_necp_resolver_key_state.digest_info->state_size, + s_necp_resolver_key_state.digest_info->block_size, ctx); + cchmac_init(s_necp_resolver_key_state.digest_info, ctx, + sizeof(s_necp_resolver_key_state.key), + s_necp_resolver_key_state.key); + necp_sign_update_context(s_necp_resolver_key_state.digest_info, + ctx, client_id, query, query_length, + answer, answer_length); + cchmac_final(s_necp_resolver_key_state.digest_info, ctx, actual_tag); + + return cc_cmp_safe(s_necp_resolver_key_state.digest_info->output_size, tag, actual_tag) == 0; +} + errno_t necp_init(void) { @@ -1196,6 +1381,8 @@ necp_init(void) LIST_INIT(&necp_route_rules); LIST_INIT(&necp_aggregate_route_rules); + necp_generate_resolver_key(); + necp_uuid_app_id_hashtbl = hashinit(NECP_UUID_APP_ID_HASH_SIZE, M_NECP, &necp_uuid_app_id_hash_mask); necp_uuid_app_id_hash_num_buckets = necp_uuid_app_id_hash_mask + 1; necp_num_uuid_app_id_mappings = 0; @@ -1217,6 +1404,8 @@ necp_init(void) memset(&necp_kernel_ip_output_policies_map, 0, sizeof(necp_kernel_ip_output_policies_map)); necp_kernel_socket_policies_app_layer_map = NULL; + necp_drop_unentitled_order = necp_get_first_order_for_priority(necp_drop_unentitled_level); + done: if (result != 0) { if (necp_kernel_policy_mtx_attr != NULL) { @@ -1442,7 +1631,8 @@ necp_buffer_write_tlv_if_different(u_int8_t *cursor, u_int8_t type, u_int8_t *buffer, u_int32_t buffer_length) { if (!necp_buffer_write_tlv_validate(cursor, type, length, buffer, buffer_length)) { - return NULL; + // If we can't fit this TLV, return the current cursor + return cursor; } u_int8_t *next_tlv = (u_int8_t *)(cursor + sizeof(type) + sizeof(length) + length); if (*updated || *(u_int8_t *)(cursor) != type) { @@ -1524,9 +1714,15 @@ necp_buffer_get_tlv_value(u_int8_t *buffer, int tlv_offset, u_int32_t *value_siz } int -necp_buffer_find_tlv(u_int8_t *buffer, u_int32_t buffer_length, int offset, u_int8_t type, int next) +necp_buffer_find_tlv(u_int8_t *buffer, u_int32_t buffer_length, int offset, u_int8_t type, int *err, int next) { + if (err != NULL) { + *err = ENOENT; + } if (offset < 0) { + if (err != NULL) { + *err = EINVAL; + } return -1; } int cursor = offset; @@ -1553,6 +1749,9 @@ necp_buffer_find_tlv(u_int8_t *buffer, u_int32_t buffer_length, int offset, u_in if (curr_type == type) { // check if entire TLV fits inside buffer if (((u_int32_t)next_cursor) <= buffer_length) { + if (err != NULL) { + *err = 0; + } return cursor; } else { return -1; @@ -1569,7 +1768,7 @@ necp_find_tlv(mbuf_t packet, u_int8_t *buffer, u_int32_t buffer_length, int offs if (packet != NULL) { cursor = necp_packet_find_tlv(packet, offset, type, err, next); } else if (buffer != NULL) { - cursor = necp_buffer_find_tlv(buffer, buffer_length, offset, type, next); + cursor = necp_buffer_find_tlv(buffer, buffer_length, offset, type, err, next); } return cursor; } @@ -1981,7 +2180,8 @@ necp_policy_result_is_valid(u_int8_t *buffer, u_int32_t length) case NECP_POLICY_RESULT_PASS: case NECP_POLICY_RESULT_DROP: case NECP_POLICY_RESULT_ROUTE_RULES: - case NECP_POLICY_RESULT_SCOPED_DIRECT: { + case NECP_POLICY_RESULT_SCOPED_DIRECT: + case NECP_POLICY_RESULT_ALLOW_UNENTITLED: { validated = TRUE; break; } @@ -2099,7 +2299,8 @@ necp_policy_condition_is_valid(u_int8_t *buffer, u_int32_t length, u_int8_t poli policy_result_type == NECP_POLICY_RESULT_ROUTE_RULES || policy_result_type == NECP_POLICY_RESULT_USE_NETAGENT || policy_result_type == NECP_POLICY_RESULT_NETAGENT_SCOPED || - policy_result_type == NECP_POLICY_RESULT_SCOPED_DIRECT) ? TRUE : FALSE; + policy_result_type == NECP_POLICY_RESULT_SCOPED_DIRECT || + policy_result_type == NECP_POLICY_RESULT_ALLOW_UNENTITLED) ? TRUE : FALSE; u_int32_t condition_length = necp_policy_condition_get_value_length_from_buffer(buffer, length); u_int8_t *condition_value = necp_policy_condition_get_value_pointer_from_buffer(buffer, length); u_int8_t type = necp_policy_condition_get_type_from_buffer(buffer, length); @@ -2131,7 +2332,10 @@ necp_policy_condition_is_valid(u_int8_t *buffer, u_int32_t length, u_int8_t poli } case NECP_POLICY_CONDITION_DEFAULT: case NECP_POLICY_CONDITION_ALL_INTERFACES: - case NECP_POLICY_CONDITION_ENTITLEMENT: { + case NECP_POLICY_CONDITION_ENTITLEMENT: + case NECP_POLICY_CONDITION_PLATFORM_BINARY: + case NECP_POLICY_CONDITION_HAS_CLIENT: + case NECP_POLICY_CONDITION_LOCAL_NETWORKS: { if (!(flags & NECP_POLICY_CONDITION_FLAGS_NEGATIVE)) { validated = TRUE; } @@ -2181,6 +2385,43 @@ necp_policy_condition_is_valid(u_int8_t *buffer, u_int32_t length, u_int8_t poli } break; } + case NECP_POLICY_CONDITION_FLOW_IP_PROTOCOL: { + if (condition_length >= sizeof(u_int16_t)) { + validated = TRUE; + } + break; + } + case NECP_POLICY_CONDITION_FLOW_LOCAL_ADDR: + case NECP_POLICY_CONDITION_FLOW_REMOTE_ADDR: { + if (condition_length >= sizeof(struct necp_policy_condition_addr) && + necp_address_is_valid(&((struct necp_policy_condition_addr *)(void *)condition_value)->address.sa)) { + validated = TRUE; + } + break; + } + case NECP_POLICY_CONDITION_FLOW_LOCAL_ADDR_RANGE: + case NECP_POLICY_CONDITION_FLOW_REMOTE_ADDR_RANGE: { + if (condition_length >= sizeof(struct necp_policy_condition_addr_range) && + necp_address_is_valid(&((struct necp_policy_condition_addr_range *)(void *)condition_value)->start_address.sa) && + necp_address_is_valid(&((struct necp_policy_condition_addr_range *)(void *)condition_value)->end_address.sa)) { + validated = TRUE; + } + break; + } + case NECP_POLICY_CONDITION_CLIENT_FLAGS: { + if (condition_length == 0 || condition_length >= sizeof(u_int32_t)) { + validated = TRUE; + } + break; + } + case NECP_POLICY_CONDITION_FLOW_LOCAL_ADDR_EMPTY: { + validated = TRUE; + break; + } + case NECP_POLICY_CONDITION_FLOW_REMOTE_ADDR_EMPTY: { + validated = TRUE; + break; + } default: { validated = FALSE; break; @@ -2454,6 +2695,11 @@ necp_handle_policy_add(struct necp_session *session, u_int32_t message_id, mbuf_ // Read policy result cursor = necp_find_tlv(packet, tlv_buffer, tlv_buffer_length, offset, NECP_TLV_POLICY_RESULT, &error, 0); + if (error || cursor < 0) { + NECPLOG(LOG_ERR, "Failed to find policy result TLV: %d", error); + response_error = NECP_ERROR_INVALID_TLV; + goto fail; + } error = necp_get_tlv_at_offset(packet, tlv_buffer, tlv_buffer_length, cursor, 0, NULL, &policy_result_size); if (error || policy_result_size == 0) { NECPLOG(LOG_ERR, "Failed to get policy result length: %d", error); @@ -2490,8 +2736,12 @@ necp_handle_policy_add(struct necp_session *session, u_int32_t message_id, mbuf_ cursor = necp_find_tlv(packet, tlv_buffer, tlv_buffer_length, cursor, NECP_TLV_ROUTE_RULE, &error, 1)) { u_int32_t route_rule_size = 0; necp_get_tlv_at_offset(packet, tlv_buffer, tlv_buffer_length, cursor, 0, NULL, &route_rule_size); - if (route_rule_size > 0) { - route_rules_array_size += (sizeof(u_int8_t) + sizeof(u_int32_t) + route_rule_size); + if (os_add_overflow(route_rules_array_size, + (sizeof(u_int8_t) + sizeof(u_int32_t) + route_rule_size), + &route_rules_array_size)) { + NECPLOG0(LOG_ERR, "Route rules size overflowed, too large"); + response_error = NECP_ERROR_INVALID_TLV; + goto fail; } } @@ -2519,7 +2769,8 @@ necp_handle_policy_add(struct necp_session *session, u_int32_t message_id, mbuf_ u_int8_t route_rule_type = NECP_TLV_ROUTE_RULE; u_int32_t route_rule_size = 0; necp_get_tlv_at_offset(packet, tlv_buffer, tlv_buffer_length, cursor, 0, NULL, &route_rule_size); - if (route_rule_size > 0 && route_rule_size <= (route_rules_array_size - route_rules_array_cursor)) { + if (route_rule_size > 0 && + (sizeof(route_rule_type) + sizeof(route_rule_size) + route_rule_size) <= (route_rules_array_size - route_rules_array_cursor)) { // Add type memcpy((route_rules_array + route_rules_array_cursor), &route_rule_type, sizeof(route_rule_type)); route_rules_array_cursor += sizeof(route_rule_type); @@ -2559,7 +2810,13 @@ necp_handle_policy_add(struct necp_session *session, u_int32_t message_id, mbuf_ necp_get_tlv_at_offset(packet, tlv_buffer, tlv_buffer_length, cursor, 0, NULL, &condition_size); if (condition_size > 0) { - conditions_array_size += (sizeof(u_int8_t) + sizeof(u_int32_t) + condition_size); + if (os_add_overflow(conditions_array_size, + (sizeof(u_int8_t) + sizeof(u_int32_t) + condition_size), + &conditions_array_size)) { + NECPLOG0(LOG_ERR, "Conditions size overflowed, too large"); + response_error = NECP_ERROR_INVALID_TLV; + goto fail; + } } } @@ -2587,7 +2844,8 @@ necp_handle_policy_add(struct necp_session *session, u_int32_t message_id, mbuf_ u_int8_t condition_type = NECP_TLV_POLICY_CONDITION; u_int32_t condition_size = 0; necp_get_tlv_at_offset(packet, tlv_buffer, tlv_buffer_length, cursor, 0, NULL, &condition_size); - if (condition_size > 0 && condition_size <= (conditions_array_size - conditions_array_cursor)) { + if (condition_size > 0 && + (sizeof(condition_type) + sizeof(condition_size) + condition_size) <= (conditions_array_size - conditions_array_cursor)) { // Add type memcpy((conditions_array + conditions_array_cursor), &condition_type, sizeof(condition_type)); conditions_array_cursor += sizeof(condition_type); @@ -3035,6 +3293,9 @@ necp_handle_policy_dump_all(struct necp_session *session, u_int32_t message_id, if (condition_mask & NECP_KERNEL_CONDITION_ALL_INTERFACES) { num_conditions++; } + if (condition_mask & NECP_KERNEL_CONDITION_HAS_CLIENT) { + num_conditions++; + } if (condition_mask & NECP_KERNEL_CONDITION_BOUND_INTERFACE) { snprintf(if_name, IFXNAMSIZ, "%s%d", ifnet_name(policy->cond_bound_interface), ifnet_unit(policy->cond_bound_interface)); condition_tlv_length += strlen(if_name) + 1; @@ -3086,6 +3347,12 @@ necp_handle_policy_dump_all(struct necp_session *session, u_int32_t message_id, condition_tlv_length += entitlement_len; num_conditions++; } + if (condition_mask & NECP_KERNEL_CONDITION_PLATFORM_BINARY) { + num_conditions++; + } + if (condition_mask & NECP_KERNEL_CONDITION_LOCAL_NETWORKS) { + num_conditions++; + } if (condition_mask & NECP_KERNEL_CONDITION_LOCAL_START) { if (condition_mask & NECP_KERNEL_CONDITION_LOCAL_END) { condition_tlv_length += sizeof(struct necp_policy_condition_addr_range); @@ -3106,6 +3373,16 @@ necp_handle_policy_dump_all(struct necp_session *session, u_int32_t message_id, condition_tlv_length += sizeof(struct necp_policy_condition_agent_type); num_conditions++; } + if (condition_mask & NECP_KERNEL_CONDITION_CLIENT_FLAGS) { + condition_tlv_length += sizeof(u_int32_t); + num_conditions++; + } + if (condition_mask & NECP_KERNEL_CONDITION_LOCAL_EMPTY) { + num_conditions++; + } + if (condition_mask & NECP_KERNEL_CONDITION_REMOTE_EMPTY) { + num_conditions++; + } } condition_tlv_length += num_conditions * (sizeof(u_int8_t) + sizeof(u_int32_t)); // These are for the condition TLVs. The space for "value" is already accounted for above. @@ -3148,6 +3425,12 @@ necp_handle_policy_dump_all(struct necp_session *session, u_int32_t message_id, if (condition_mask & NECP_KERNEL_CONDITION_ALL_INTERFACES) { cond_buf_cursor = necp_buffer_write_tlv(cond_buf_cursor, NECP_POLICY_CONDITION_ALL_INTERFACES, 0, "", cond_buf, condition_tlv_length); } + if (condition_mask & NECP_KERNEL_CONDITION_HAS_CLIENT) { + cond_buf_cursor = necp_buffer_write_tlv(cond_buf_cursor, NECP_POLICY_CONDITION_HAS_CLIENT, 0, "", cond_buf, condition_tlv_length); + } + if (condition_mask & NECP_KERNEL_CONDITION_LOCAL_NETWORKS) { + cond_buf_cursor = necp_buffer_write_tlv(cond_buf_cursor, NECP_POLICY_CONDITION_LOCAL_NETWORKS, 0, "", cond_buf, condition_tlv_length); + } if (condition_mask & NECP_KERNEL_CONDITION_BOUND_INTERFACE) { cond_buf_cursor = necp_buffer_write_tlv(cond_buf_cursor, NECP_POLICY_CONDITION_BOUND_INTERFACE, strlen(if_name) + 1, if_name, cond_buf, condition_tlv_length); @@ -3200,6 +3483,9 @@ necp_handle_policy_dump_all(struct necp_session *session, u_int32_t message_id, cond_buf_cursor = necp_buffer_write_tlv(cond_buf_cursor, NECP_POLICY_CONDITION_ENTITLEMENT, strlen(policy->cond_custom_entitlement) + 1, policy->cond_custom_entitlement, cond_buf, condition_tlv_length); } + if (condition_mask & NECP_KERNEL_CONDITION_PLATFORM_BINARY) { + cond_buf_cursor = necp_buffer_write_tlv(cond_buf_cursor, NECP_POLICY_CONDITION_PLATFORM_BINARY, 0, "", cond_buf, condition_tlv_length); + } if (condition_mask & NECP_KERNEL_CONDITION_LOCAL_START) { if (condition_mask & NECP_KERNEL_CONDITION_LOCAL_END) { struct necp_policy_condition_addr_range range; @@ -3235,6 +3521,15 @@ necp_handle_policy_dump_all(struct necp_session *session, u_int32_t message_id, sizeof(policy->cond_agent_type), &policy->cond_agent_type, cond_buf, condition_tlv_length); } + if (condition_mask & NECP_KERNEL_CONDITION_CLIENT_FLAGS) { + cond_buf_cursor = necp_buffer_write_tlv(cond_buf_cursor, NECP_POLICY_CONDITION_CLIENT_FLAGS, sizeof(policy->cond_client_flags), &policy->cond_client_flags, cond_buf, condition_tlv_length); + } + if (condition_mask & NECP_KERNEL_CONDITION_LOCAL_EMPTY) { + cond_buf_cursor = necp_buffer_write_tlv(cond_buf_cursor, NECP_POLICY_CONDITION_FLOW_LOCAL_ADDR_EMPTY, 0, "", cond_buf, condition_tlv_length); + } + if (condition_mask & NECP_KERNEL_CONDITION_REMOTE_EMPTY) { + cond_buf_cursor = necp_buffer_write_tlv(cond_buf_cursor, NECP_POLICY_CONDITION_FLOW_REMOTE_ADDR_EMPTY, 0, "", cond_buf, condition_tlv_length); + } } cursor = necp_buffer_write_tlv(cursor, NECP_TLV_POLICY_CONDITION, cond_buf_cursor - cond_buf, cond_buf, tlv_buffer, total_allocated_bytes); @@ -3606,6 +3901,7 @@ necp_policy_apply(struct necp_session *session, struct necp_session_policy *poli union necp_sockaddr_union cond_remote_start; union necp_sockaddr_union cond_remote_end; u_int8_t cond_remote_prefix = 0; + u_int32_t cond_client_flags = 0; u_int32_t offset = 0; u_int8_t ultimate_result = 0; u_int32_t secondary_result = 0; @@ -3642,6 +3938,11 @@ necp_policy_apply(struct necp_session *session, struct necp_session_policy *poli socket_ip_conditions = TRUE; break; } + case NECP_POLICY_CONDITION_HAS_CLIENT: { + master_condition_mask |= NECP_KERNEL_CONDITION_HAS_CLIENT; + socket_only_conditions = TRUE; + break; + } case NECP_POLICY_CONDITION_ENTITLEMENT: { if (condition_length > 0) { if (cond_custom_entitlement == NULL) { @@ -3657,6 +3958,11 @@ necp_policy_apply(struct necp_session *session, struct necp_session_policy *poli } break; } + case NECP_POLICY_CONDITION_PLATFORM_BINARY: { + master_condition_mask |= NECP_KERNEL_CONDITION_PLATFORM_BINARY; + socket_only_conditions = TRUE; + break; + } case NECP_POLICY_CONDITION_DOMAIN: { // Make sure there is only one such rule if (condition_length > 0 && cond_domain == NULL) { @@ -3781,18 +4087,29 @@ necp_policy_apply(struct necp_session *session, struct necp_session_policy *poli } break; } - case NECP_POLICY_CONDITION_IP_PROTOCOL: { + case NECP_POLICY_CONDITION_IP_PROTOCOL: + case NECP_POLICY_CONDITION_FLOW_IP_PROTOCOL: { if (condition_length >= sizeof(u_int16_t)) { master_condition_mask |= NECP_KERNEL_CONDITION_PROTOCOL; if (condition_is_negative) { master_condition_negated_mask |= NECP_KERNEL_CONDITION_PROTOCOL; } memcpy(&cond_protocol, condition_value, sizeof(cond_protocol)); - socket_ip_conditions = TRUE; + if (condition_type == NECP_POLICY_CONDITION_FLOW_IP_PROTOCOL) { + socket_only_conditions = TRUE; + } else { + socket_ip_conditions = TRUE; + } } break; } - case NECP_POLICY_CONDITION_LOCAL_ADDR: { + case NECP_POLICY_CONDITION_LOCAL_NETWORKS: { + master_condition_mask |= NECP_KERNEL_CONDITION_LOCAL_NETWORKS; + socket_ip_conditions = TRUE; + break; + } + case NECP_POLICY_CONDITION_LOCAL_ADDR: + case NECP_POLICY_CONDITION_FLOW_LOCAL_ADDR: { struct necp_policy_condition_addr *address_struct = (struct necp_policy_condition_addr *)(void *)condition_value; if (!necp_address_is_valid(&address_struct->address.sa)) { break; @@ -3806,10 +4123,15 @@ necp_policy_apply(struct necp_session *session, struct necp_session_policy *poli master_condition_negated_mask |= NECP_KERNEL_CONDITION_LOCAL_START; master_condition_negated_mask |= NECP_KERNEL_CONDITION_LOCAL_PREFIX; } - socket_ip_conditions = TRUE; + if (condition_type == NECP_POLICY_CONDITION_FLOW_LOCAL_ADDR) { + socket_only_conditions = TRUE; + } else { + socket_ip_conditions = TRUE; + } break; } - case NECP_POLICY_CONDITION_REMOTE_ADDR: { + case NECP_POLICY_CONDITION_REMOTE_ADDR: + case NECP_POLICY_CONDITION_FLOW_REMOTE_ADDR: { struct necp_policy_condition_addr *address_struct = (struct necp_policy_condition_addr *)(void *)condition_value; if (!necp_address_is_valid(&address_struct->address.sa)) { break; @@ -3823,10 +4145,15 @@ necp_policy_apply(struct necp_session *session, struct necp_session_policy *poli master_condition_negated_mask |= NECP_KERNEL_CONDITION_REMOTE_START; master_condition_negated_mask |= NECP_KERNEL_CONDITION_REMOTE_PREFIX; } - socket_ip_conditions = TRUE; + if (condition_type == NECP_POLICY_CONDITION_FLOW_REMOTE_ADDR) { + socket_only_conditions = TRUE; + } else { + socket_ip_conditions = TRUE; + } break; } - case NECP_POLICY_CONDITION_LOCAL_ADDR_RANGE: { + case NECP_POLICY_CONDITION_LOCAL_ADDR_RANGE: + case NECP_POLICY_CONDITION_FLOW_LOCAL_ADDR_RANGE: { struct necp_policy_condition_addr_range *address_struct = (struct necp_policy_condition_addr_range *)(void *)condition_value; if (!necp_address_is_valid(&address_struct->start_address.sa) || !necp_address_is_valid(&address_struct->end_address.sa)) { @@ -3841,10 +4168,15 @@ necp_policy_apply(struct necp_session *session, struct necp_session_policy *poli master_condition_negated_mask |= NECP_KERNEL_CONDITION_LOCAL_START; master_condition_negated_mask |= NECP_KERNEL_CONDITION_LOCAL_END; } - socket_ip_conditions = TRUE; + if (condition_type == NECP_POLICY_CONDITION_FLOW_LOCAL_ADDR_RANGE) { + socket_only_conditions = TRUE; + } else { + socket_ip_conditions = TRUE; + } break; } - case NECP_POLICY_CONDITION_REMOTE_ADDR_RANGE: { + case NECP_POLICY_CONDITION_REMOTE_ADDR_RANGE: + case NECP_POLICY_CONDITION_FLOW_REMOTE_ADDR_RANGE: { struct necp_policy_condition_addr_range *address_struct = (struct necp_policy_condition_addr_range *)(void *)condition_value; if (!necp_address_is_valid(&address_struct->start_address.sa) || !necp_address_is_valid(&address_struct->end_address.sa)) { @@ -3859,7 +4191,11 @@ necp_policy_apply(struct necp_session *session, struct necp_session_policy *poli master_condition_negated_mask |= NECP_KERNEL_CONDITION_REMOTE_START; master_condition_negated_mask |= NECP_KERNEL_CONDITION_REMOTE_END; } - socket_ip_conditions = TRUE; + if (condition_type == NECP_POLICY_CONDITION_FLOW_REMOTE_ADDR_RANGE) { + socket_only_conditions = TRUE; + } else { + socket_ip_conditions = TRUE; + } break; } case NECP_POLICY_CONDITION_AGENT_TYPE: { @@ -3870,6 +4206,36 @@ necp_policy_apply(struct necp_session *session, struct necp_session_policy *poli } break; } + case NECP_POLICY_CONDITION_CLIENT_FLAGS: { + if (condition_is_negative) { + master_condition_negated_mask |= NECP_KERNEL_CONDITION_CLIENT_FLAGS; + } + master_condition_mask |= NECP_KERNEL_CONDITION_CLIENT_FLAGS; + socket_only_conditions = TRUE; + if (condition_length >= sizeof(u_int32_t)) { + memcpy(&cond_client_flags, condition_value, sizeof(cond_client_flags)); + } else { + // Empty means match on fallback traffic + cond_client_flags = NECP_CLIENT_PARAMETER_FLAG_FALLBACK_TRAFFIC; + } + break; + } + case NECP_POLICY_CONDITION_FLOW_LOCAL_ADDR_EMPTY: { + master_condition_mask |= NECP_KERNEL_CONDITION_LOCAL_EMPTY; + if (condition_is_negative) { + master_condition_negated_mask |= NECP_KERNEL_CONDITION_LOCAL_EMPTY; + } + socket_only_conditions = TRUE; + break; + } + case NECP_POLICY_CONDITION_FLOW_REMOTE_ADDR_EMPTY: { + master_condition_mask |= NECP_KERNEL_CONDITION_REMOTE_EMPTY; + if (condition_is_negative) { + master_condition_negated_mask |= NECP_KERNEL_CONDITION_REMOTE_EMPTY; + } + socket_only_conditions = TRUE; + break; + } default: { break; } @@ -4017,13 +4383,23 @@ necp_policy_apply(struct necp_session *session, struct necp_session_policy *poli socket_layer_non_id_conditions = TRUE; break; } + case NECP_POLICY_RESULT_ALLOW_UNENTITLED: { + socket_layer_non_id_conditions = TRUE; + break; + } case NECP_POLICY_RESULT_ROUTE_RULES: { if (policy->route_rules != NULL && policy->route_rules_size > 0) { u_int32_t route_rule_id = necp_create_route_rule(&necp_route_rules, policy->route_rules, policy->route_rules_size); if (route_rule_id > 0) { policy->applied_route_rules_id = route_rule_id; ultimate_result_parameter.route_rule_id = route_rule_id; - socket_layer_non_id_conditions = TRUE; + if (socket_only_conditions) { // socket_ip_conditions can be TRUE or FALSE + socket_layer_non_id_conditions = TRUE; + } else if (socket_ip_conditions) { + socket_layer_non_id_conditions = TRUE; + ip_output_layer_non_id_conditions = TRUE; + ip_output_layer_non_id_only = TRUE; // Only apply route rules to packets that didn't go through socket layer + } } } break; @@ -4034,7 +4410,7 @@ necp_policy_apply(struct necp_session *session, struct necp_session_policy *poli } if (socket_layer_non_id_conditions) { - necp_kernel_policy_id policy_id = necp_kernel_socket_policy_add(policy->order, session->session_order, session->proc_pid, master_condition_mask, master_condition_negated_mask, cond_app_id, cond_real_app_id, cond_custom_entitlement, cond_account_id, cond_domain, cond_pid, cond_uid, cond_bound_interface, cond_traffic_class, cond_protocol, &cond_local_start, &cond_local_end, cond_local_prefix, &cond_remote_start, &cond_remote_end, cond_remote_prefix, &cond_agent_type, ultimate_result, ultimate_result_parameter); + necp_kernel_policy_id policy_id = necp_kernel_socket_policy_add(policy->order, session->session_order, session->proc_pid, master_condition_mask, master_condition_negated_mask, cond_app_id, cond_real_app_id, cond_custom_entitlement, cond_account_id, cond_domain, cond_pid, cond_uid, cond_bound_interface, cond_traffic_class, cond_protocol, &cond_local_start, &cond_local_end, cond_local_prefix, &cond_remote_start, &cond_remote_end, cond_remote_prefix, &cond_agent_type, cond_client_flags, ultimate_result, ultimate_result_parameter); if (policy_id == 0) { NECPLOG0(LOG_DEBUG, "Error applying socket kernel policy"); @@ -4050,6 +4426,7 @@ necp_policy_apply(struct necp_session *session, struct necp_session_policy *poli if (ip_output_layer_non_id_only) { condition_mask |= NECP_KERNEL_CONDITION_POLICY_ID; } + necp_kernel_policy_id policy_id = necp_kernel_ip_output_policy_add(policy->order, NECP_KERNEL_POLICY_SUBORDER_NON_ID_CONDITIONS, session->session_order, session->proc_pid, condition_mask, master_condition_negated_mask, NECP_KERNEL_POLICY_ID_NONE, cond_bound_interface, 0, cond_protocol, &cond_local_start, &cond_local_end, cond_local_prefix, &cond_remote_start, &cond_remote_end, cond_remote_prefix, ultimate_result, ultimate_result_parameter); if (policy_id == 0) { @@ -4202,9 +4579,10 @@ necp_kernel_policy_get_new_id(bool socket_level) return newid; } -#define NECP_KERNEL_VALID_SOCKET_CONDITIONS (NECP_KERNEL_CONDITION_APP_ID | NECP_KERNEL_CONDITION_REAL_APP_ID | NECP_KERNEL_CONDITION_DOMAIN | NECP_KERNEL_CONDITION_ACCOUNT_ID | NECP_KERNEL_CONDITION_PID | NECP_KERNEL_CONDITION_UID | NECP_KERNEL_CONDITION_ALL_INTERFACES | NECP_KERNEL_CONDITION_BOUND_INTERFACE | NECP_KERNEL_CONDITION_TRAFFIC_CLASS | NECP_KERNEL_CONDITION_PROTOCOL | NECP_KERNEL_CONDITION_LOCAL_START | NECP_KERNEL_CONDITION_LOCAL_END | NECP_KERNEL_CONDITION_LOCAL_PREFIX | NECP_KERNEL_CONDITION_REMOTE_START | NECP_KERNEL_CONDITION_REMOTE_END | NECP_KERNEL_CONDITION_REMOTE_PREFIX | NECP_KERNEL_CONDITION_ENTITLEMENT | NECP_KERNEL_CONDITION_CUSTOM_ENTITLEMENT | NECP_KERNEL_CONDITION_AGENT_TYPE) +#define NECP_KERNEL_VALID_SOCKET_CONDITIONS (NECP_KERNEL_CONDITION_APP_ID | NECP_KERNEL_CONDITION_REAL_APP_ID | NECP_KERNEL_CONDITION_DOMAIN | NECP_KERNEL_CONDITION_ACCOUNT_ID | NECP_KERNEL_CONDITION_PID | NECP_KERNEL_CONDITION_UID | NECP_KERNEL_CONDITION_ALL_INTERFACES | NECP_KERNEL_CONDITION_BOUND_INTERFACE | NECP_KERNEL_CONDITION_TRAFFIC_CLASS | NECP_KERNEL_CONDITION_PROTOCOL | NECP_KERNEL_CONDITION_LOCAL_START | NECP_KERNEL_CONDITION_LOCAL_END | NECP_KERNEL_CONDITION_LOCAL_PREFIX | NECP_KERNEL_CONDITION_REMOTE_START | NECP_KERNEL_CONDITION_REMOTE_END | NECP_KERNEL_CONDITION_REMOTE_PREFIX | NECP_KERNEL_CONDITION_ENTITLEMENT | NECP_KERNEL_CONDITION_CUSTOM_ENTITLEMENT | NECP_KERNEL_CONDITION_AGENT_TYPE | NECP_KERNEL_CONDITION_HAS_CLIENT | NECP_KERNEL_CONDITION_LOCAL_NETWORKS | NECP_KERNEL_CONDITION_CLIENT_FLAGS | NECP_KERNEL_CONDITION_LOCAL_EMPTY | NECP_KERNEL_CONDITION_REMOTE_EMPTY | NECP_KERNEL_CONDITION_PLATFORM_BINARY) + static necp_kernel_policy_id -necp_kernel_socket_policy_add(necp_policy_order order, u_int32_t session_order, int session_pid, u_int32_t condition_mask, u_int32_t condition_negated_mask, necp_app_id cond_app_id, necp_app_id cond_real_app_id, char *cond_custom_entitlement, u_int32_t cond_account_id, char *cond_domain, pid_t cond_pid, uid_t cond_uid, ifnet_t cond_bound_interface, struct necp_policy_condition_tc_range cond_traffic_class, u_int16_t cond_protocol, union necp_sockaddr_union *cond_local_start, union necp_sockaddr_union *cond_local_end, u_int8_t cond_local_prefix, union necp_sockaddr_union *cond_remote_start, union necp_sockaddr_union *cond_remote_end, u_int8_t cond_remote_prefix, struct necp_policy_condition_agent_type *cond_agent_type, necp_kernel_policy_result result, necp_kernel_policy_result_parameter result_parameter) +necp_kernel_socket_policy_add(necp_policy_order order, u_int32_t session_order, int session_pid, u_int32_t condition_mask, u_int32_t condition_negated_mask, necp_app_id cond_app_id, necp_app_id cond_real_app_id, char *cond_custom_entitlement, u_int32_t cond_account_id, char *cond_domain, pid_t cond_pid, uid_t cond_uid, ifnet_t cond_bound_interface, struct necp_policy_condition_tc_range cond_traffic_class, u_int16_t cond_protocol, union necp_sockaddr_union *cond_local_start, union necp_sockaddr_union *cond_local_end, u_int8_t cond_local_prefix, union necp_sockaddr_union *cond_remote_start, union necp_sockaddr_union *cond_remote_end, u_int8_t cond_remote_prefix, struct necp_policy_condition_agent_type *cond_agent_type, u_int32_t cond_client_flags, necp_kernel_policy_result result, necp_kernel_policy_result_parameter result_parameter) { struct necp_kernel_socket_policy *new_kernel_policy = NULL; struct necp_kernel_socket_policy *tmp_kernel_policy = NULL; @@ -4237,6 +4615,12 @@ necp_kernel_socket_policy_add(necp_policy_order order, u_int32_t session_order, if ((new_kernel_policy->condition_mask & NECP_KERNEL_CONDITION_REMOTE_END) && (new_kernel_policy->condition_mask & NECP_KERNEL_CONDITION_REMOTE_PREFIX)) { new_kernel_policy->condition_mask &= ~NECP_KERNEL_CONDITION_REMOTE_PREFIX; } + if (new_kernel_policy->condition_mask & NECP_KERNEL_CONDITION_LOCAL_EMPTY) { + new_kernel_policy->condition_mask &= ~(NECP_KERNEL_CONDITION_LOCAL_PREFIX | NECP_KERNEL_CONDITION_LOCAL_END); + } + if ((new_kernel_policy->condition_mask & NECP_KERNEL_CONDITION_REMOTE_EMPTY)) { + new_kernel_policy->condition_mask &= ~(NECP_KERNEL_CONDITION_REMOTE_PREFIX | NECP_KERNEL_CONDITION_REMOTE_END); + } new_kernel_policy->condition_negated_mask = condition_negated_mask & new_kernel_policy->condition_mask; // Set condition values @@ -4296,6 +4680,9 @@ necp_kernel_socket_policy_add(necp_policy_order order, u_int32_t session_order, if (new_kernel_policy->condition_mask & NECP_KERNEL_CONDITION_AGENT_TYPE) { memcpy(&new_kernel_policy->cond_agent_type, cond_agent_type, sizeof(*cond_agent_type)); } + if (new_kernel_policy->condition_mask & NECP_KERNEL_CONDITION_CLIENT_FLAGS) { + new_kernel_policy->cond_client_flags = cond_client_flags; + } new_kernel_policy->result = result; memcpy(&new_kernel_policy->result_parameter, &result_parameter, sizeof(result_parameter)); @@ -4407,9 +4794,13 @@ necp_get_result_description(char *result_string, necp_kernel_policy_result resul snprintf(result_string, MAX_RESULT_STRING_LEN, "ScopedDirect"); break; } + case NECP_KERNEL_POLICY_RESULT_ALLOW_UNENTITLED: { + snprintf(result_string, MAX_RESULT_STRING_LEN, "AllowUnentitled"); + break; + } case NECP_KERNEL_POLICY_RESULT_ROUTE_RULES: { int index = 0; - char interface_names[IFXNAMSIZ][MAX_ROUTE_RULE_INTERFACES]; + char interface_names[MAX_ROUTE_RULE_INTERFACES][IFXNAMSIZ]; struct necp_route_rule *route_rule = necp_lookup_route_rule_locked(&necp_route_rules, result_parameter.route_rule_id); if (route_rule != NULL) { for (index = 0; index < MAX_ROUTE_RULE_INTERFACES; index++) { @@ -4422,11 +4813,12 @@ necp_get_result_description(char *result_string, necp_kernel_policy_result resul } switch (route_rule->default_action) { case NECP_ROUTE_RULE_DENY_INTERFACE: - snprintf(result_string, MAX_RESULT_STRING_LEN, "RouteRules (Only %s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s)", + snprintf(result_string, MAX_RESULT_STRING_LEN, "RouteRules (Only %s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s)", (route_rule->cellular_action == NECP_ROUTE_RULE_ALLOW_INTERFACE) ? "Cell " : "", (route_rule->wifi_action == NECP_ROUTE_RULE_ALLOW_INTERFACE) ? "WiFi " : "", (route_rule->wired_action == NECP_ROUTE_RULE_ALLOW_INTERFACE) ? "Wired " : "", (route_rule->expensive_action == NECP_ROUTE_RULE_ALLOW_INTERFACE) ? "Exp " : "", + (route_rule->constrained_action == NECP_ROUTE_RULE_ALLOW_INTERFACE) ? "Constrained " : "", (route_rule->exception_if_actions[0] == NECP_ROUTE_RULE_ALLOW_INTERFACE) ? interface_names[0] : "", (route_rule->exception_if_actions[0] == NECP_ROUTE_RULE_ALLOW_INTERFACE) ? " " : "", (route_rule->exception_if_actions[1] == NECP_ROUTE_RULE_ALLOW_INTERFACE) ? interface_names[1] : "", @@ -4448,11 +4840,12 @@ necp_get_result_description(char *result_string, necp_kernel_policy_result resul (route_rule->exception_if_actions[9] == NECP_ROUTE_RULE_ALLOW_INTERFACE) ? interface_names[9] : ""); break; case NECP_ROUTE_RULE_ALLOW_INTERFACE: - snprintf(result_string, MAX_RESULT_STRING_LEN, "RouteRules (%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s)", + snprintf(result_string, MAX_RESULT_STRING_LEN, "RouteRules (%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s)", (route_rule->cellular_action == NECP_ROUTE_RULE_DENY_INTERFACE) ? "!Cell " : "", (route_rule->wifi_action == NECP_ROUTE_RULE_DENY_INTERFACE) ? "!WiFi " : "", (route_rule->wired_action == NECP_ROUTE_RULE_DENY_INTERFACE) ? "!Wired " : "", (route_rule->expensive_action == NECP_ROUTE_RULE_DENY_INTERFACE) ? "!Exp " : "", + (route_rule->constrained_action == NECP_ROUTE_RULE_DENY_INTERFACE) ? "!Constrained " : "", (route_rule->exception_if_actions[0] == NECP_ROUTE_RULE_DENY_INTERFACE) ? "!" : "", (route_rule->exception_if_actions[0] == NECP_ROUTE_RULE_DENY_INTERFACE) ? interface_names[0] : "", (route_rule->exception_if_actions[1] == NECP_ROUTE_RULE_DENY_INTERFACE) ? "!" : "", @@ -4475,11 +4868,12 @@ necp_get_result_description(char *result_string, necp_kernel_policy_result resul (route_rule->exception_if_actions[9] == NECP_ROUTE_RULE_DENY_INTERFACE) ? interface_names[9] : ""); break; case NECP_ROUTE_RULE_QOS_MARKING: - snprintf(result_string, MAX_RESULT_STRING_LEN, "RouteRules (QoSMarking %s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s)", + snprintf(result_string, MAX_RESULT_STRING_LEN, "RouteRules (QoSMarking %s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s)", (route_rule->cellular_action == NECP_ROUTE_RULE_QOS_MARKING) ? "Cell " : "", (route_rule->wifi_action == NECP_ROUTE_RULE_QOS_MARKING) ? "WiFi " : "", (route_rule->wired_action == NECP_ROUTE_RULE_QOS_MARKING) ? "Wired " : "", (route_rule->expensive_action == NECP_ROUTE_RULE_QOS_MARKING) ? "Exp " : "", + (route_rule->constrained_action == NECP_ROUTE_RULE_QOS_MARKING) ? "Constrained " : "", (route_rule->exception_if_actions[0] == NECP_ROUTE_RULE_QOS_MARKING) ? interface_names[0] : "", (route_rule->exception_if_actions[0] == NECP_ROUTE_RULE_QOS_MARKING) ? " " : "", (route_rule->exception_if_actions[1] == NECP_ROUTE_RULE_QOS_MARKING) ? interface_names[1] : "", @@ -4627,7 +5021,8 @@ necp_kernel_socket_policy_results_overlap(struct necp_kernel_socket_policy *uppe } else if (upper_policy->result == NECP_KERNEL_POLICY_RESULT_SOCKET_FILTER || upper_policy->result == NECP_KERNEL_POLICY_RESULT_ROUTE_RULES || upper_policy->result == NECP_KERNEL_POLICY_RESULT_USE_NETAGENT || - upper_policy->result == NECP_KERNEL_POLICY_RESULT_NETAGENT_SCOPED) { + upper_policy->result == NECP_KERNEL_POLICY_RESULT_NETAGENT_SCOPED || + upper_policy->result == NECP_KERNEL_POLICY_RESULT_ALLOW_UNENTITLED) { // Filters and route rules never cancel out lower policies return FALSE; } else if (necp_kernel_socket_result_is_trigger_service_type(upper_policy)) { @@ -4699,6 +5094,11 @@ necp_kernel_socket_policy_is_unnecessary(struct necp_kernel_socket_policy *polic continue; } + // If new policy matches Local Networks, compared policy must also + if ((policy->condition_mask & NECP_KERNEL_CONDITION_LOCAL_NETWORKS) && !(compared_policy->condition_mask & NECP_KERNEL_CONDITION_LOCAL_NETWORKS)) { + continue; + } + // Default makes lower policies unecessary always if (compared_policy->condition_mask == 0) { return TRUE; @@ -4764,6 +5164,11 @@ necp_kernel_socket_policy_is_unnecessary(struct necp_kernel_socket_policy *polic continue; } + if (compared_policy->condition_mask & NECP_KERNEL_CONDITION_CLIENT_FLAGS && + compared_policy->cond_client_flags != policy->cond_client_flags) { + continue; + } + if (compared_policy->condition_mask & NECP_KERNEL_CONDITION_TRAFFIC_CLASS && !(compared_policy->cond_traffic_class.start_tc <= policy->cond_traffic_class.start_tc && compared_policy->cond_traffic_class.end_tc >= policy->cond_traffic_class.end_tc)) { @@ -5020,7 +5425,7 @@ necp_create_string_to_id_mapping(struct necp_string_id_mapping_list *list, char existing_mapping = necp_lookup_string_to_id_locked(list, string); if (existing_mapping != NULL) { string_id = existing_mapping->id; - existing_mapping->refcount++; + os_ref_retain_locked(&existing_mapping->refcount); } else { struct necp_string_id_mapping *new_mapping = NULL; MALLOC(new_mapping, struct necp_string_id_mapping *, sizeof(struct necp_string_id_mapping), M_NECP, M_WAITOK); @@ -5032,7 +5437,7 @@ necp_create_string_to_id_mapping(struct necp_string_id_mapping_list *list, char if (new_mapping->string != NULL) { memcpy(new_mapping->string, string, length); new_mapping->id = necp_get_new_string_id(); - new_mapping->refcount = 1; + os_ref_init(&new_mapping->refcount, &necp_refgrp); LIST_INSERT_HEAD(list, new_mapping, chain); string_id = new_mapping->id; } else { @@ -5053,7 +5458,7 @@ necp_remove_string_to_id_mapping(struct necp_string_id_mapping_list *list, char existing_mapping = necp_lookup_string_to_id_locked(list, string); if (existing_mapping != NULL) { - if (--existing_mapping->refcount == 0) { + if (os_ref_release_locked(&existing_mapping->refcount) == 0) { LIST_REMOVE(existing_mapping, chain); FREE(existing_mapping->string, M_NECP); FREE(existing_mapping, M_NECP); @@ -5138,7 +5543,7 @@ necp_lookup_route_rule_locked(struct necp_route_rule_list *list, u_int32_t route } static struct necp_route_rule * -necp_lookup_route_rule_by_contents_locked(struct necp_route_rule_list *list, u_int32_t default_action, u_int8_t cellular_action, u_int8_t wifi_action, u_int8_t wired_action, u_int8_t expensive_action, u_int32_t *if_indices, u_int8_t *if_actions) +necp_lookup_route_rule_by_contents_locked(struct necp_route_rule_list *list, u_int32_t default_action, u_int8_t cellular_action, u_int8_t wifi_action, u_int8_t wired_action, u_int8_t expensive_action, u_int8_t constrained_action, u_int32_t *if_indices, u_int8_t *if_actions) { struct necp_route_rule *searchentry = NULL; struct necp_route_rule *foundentry = NULL; @@ -5148,7 +5553,8 @@ necp_lookup_route_rule_by_contents_locked(struct necp_route_rule_list *list, u_i searchentry->cellular_action == cellular_action && searchentry->wifi_action == wifi_action && searchentry->wired_action == wired_action && - searchentry->expensive_action == expensive_action) { + searchentry->expensive_action == expensive_action && + searchentry->constrained_action == constrained_action) { bool match_failed = FALSE; size_t index_a = 0; size_t index_b = 0; @@ -5199,6 +5605,7 @@ necp_create_route_rule(struct necp_route_rule_list *list, u_int8_t *route_rules_ u_int8_t wifi_action = NECP_ROUTE_RULE_NONE; u_int8_t wired_action = NECP_ROUTE_RULE_NONE; u_int8_t expensive_action = NECP_ROUTE_RULE_NONE; + u_int8_t constrained_action = NECP_ROUTE_RULE_NONE; u_int32_t if_indices[MAX_ROUTE_RULE_INTERFACES]; size_t num_valid_indices = 0; memset(&if_indices, 0, sizeof(if_indices)); @@ -5241,6 +5648,9 @@ necp_create_route_rule(struct necp_route_rule_list *list, u_int8_t *route_rules_ if (rule_flags & NECP_ROUTE_RULE_FLAG_EXPENSIVE) { expensive_action = rule_type; } + if (rule_flags & NECP_ROUTE_RULE_FLAG_CONSTRAINED) { + constrained_action = rule_type; + } if (rule_flags == 0) { default_action = rule_type; } @@ -5265,10 +5675,10 @@ necp_create_route_rule(struct necp_route_rule_list *list, u_int8_t *route_rules_ offset += sizeof(u_int8_t) + sizeof(u_int32_t) + length; } - existing_rule = necp_lookup_route_rule_by_contents_locked(list, default_action, cellular_action, wifi_action, wired_action, expensive_action, if_indices, if_actions); + existing_rule = necp_lookup_route_rule_by_contents_locked(list, default_action, cellular_action, wifi_action, wired_action, expensive_action, constrained_action, if_indices, if_actions); if (existing_rule != NULL) { route_rule_id = existing_rule->id; - existing_rule->refcount++; + os_ref_retain_locked(&existing_rule->refcount); } else { struct necp_route_rule *new_rule = NULL; MALLOC(new_rule, struct necp_route_rule *, sizeof(struct necp_route_rule), M_NECP, M_WAITOK); @@ -5280,9 +5690,10 @@ necp_create_route_rule(struct necp_route_rule_list *list, u_int8_t *route_rules_ new_rule->wifi_action = wifi_action; new_rule->wired_action = wired_action; new_rule->expensive_action = expensive_action; + new_rule->constrained_action = constrained_action; memcpy(&new_rule->exception_if_indices, &if_indices, sizeof(if_indices)); memcpy(&new_rule->exception_if_actions, &if_actions, sizeof(if_actions)); - new_rule->refcount = 1; + os_ref_init(&new_rule->refcount, &necp_refgrp); LIST_INSERT_HEAD(list, new_rule, chain); } } @@ -5323,7 +5734,7 @@ necp_remove_route_rule(struct necp_route_rule_list *list, u_int32_t route_rule_i existing_rule = necp_lookup_route_rule_locked(list, route_rule_id); if (existing_rule != NULL) { - if (--existing_rule->refcount == 0) { + if (os_ref_release_locked(&existing_rule->refcount) == 0) { necp_remove_aggregate_route_rule_for_id(existing_rule->id); LIST_REMOVE(existing_rule, chain); FREE(existing_rule, M_NECP); @@ -5361,16 +5772,10 @@ necp_create_aggregate_route_rule(u_int32_t *rule_ids) struct necp_aggregate_route_rule *new_rule = NULL; struct necp_aggregate_route_rule *existing_rule = NULL; - LIST_FOREACH(existing_rule, &necp_aggregate_route_rules, chain) { - if (memcmp(existing_rule->rule_ids, rule_ids, (sizeof(u_int32_t) * MAX_AGGREGATE_ROUTE_RULES)) == 0) { - return existing_rule->id; - } - } - lck_rw_lock_exclusive(&necp_route_rule_lock); + // Check if the rule already exists LIST_FOREACH(existing_rule, &necp_aggregate_route_rules, chain) { - // Re-check, in case something else created the rule while we are waiting to lock if (memcmp(existing_rule->rule_ids, rule_ids, (sizeof(u_int32_t) * MAX_AGGREGATE_ROUTE_RULES)) == 0) { lck_rw_done(&necp_route_rule_lock); return existing_rule->id; @@ -5494,9 +5899,9 @@ necp_create_uuid_app_id_mapping(uuid_t uuid, bool *allocated_mapping, bool uuid_ existing_mapping = necp_uuid_lookup_app_id_locked(uuid); if (existing_mapping != NULL) { local_id = existing_mapping->id; - existing_mapping->refcount++; + os_ref_retain_locked(&existing_mapping->refcount); if (uuid_policy_table) { - existing_mapping->table_refcount++; + existing_mapping->table_usecount++; } } else { struct necp_uuid_id_mapping *new_mapping = NULL; @@ -5504,11 +5909,11 @@ necp_create_uuid_app_id_mapping(uuid_t uuid, bool *allocated_mapping, bool uuid_ if (new_mapping != NULL) { uuid_copy(new_mapping->uuid, uuid); new_mapping->id = necp_get_new_uuid_id(false); - new_mapping->refcount = 1; + os_ref_init(&new_mapping->refcount, &necp_refgrp); if (uuid_policy_table) { - new_mapping->table_refcount = 1; + new_mapping->table_usecount = 1; } else { - new_mapping->table_refcount = 0; + new_mapping->table_usecount = 0; } LIST_INSERT_HEAD(APPUUIDHASH(uuid), new_mapping, chain); @@ -5538,9 +5943,9 @@ necp_remove_uuid_app_id_mapping(uuid_t uuid, bool *removed_mapping, bool uuid_po existing_mapping = necp_uuid_lookup_app_id_locked(uuid); if (existing_mapping != NULL) { if (uuid_policy_table) { - existing_mapping->table_refcount--; + existing_mapping->table_usecount--; } - if (--existing_mapping->refcount == 0) { + if (os_ref_release_locked(&existing_mapping->refcount) == 0) { LIST_REMOVE(existing_mapping, chain); FREE(existing_mapping, M_NECP); if (removed_mapping) { @@ -5618,14 +6023,14 @@ necp_create_uuid_service_id_mapping(uuid_t uuid) existing_mapping = necp_uuid_lookup_service_id_locked(uuid); if (existing_mapping != NULL) { local_id = existing_mapping->id; - existing_mapping->refcount++; + os_ref_retain_locked(&existing_mapping->refcount); } else { struct necp_uuid_id_mapping *new_mapping = NULL; MALLOC(new_mapping, struct necp_uuid_id_mapping *, sizeof(*new_mapping), M_NECP, M_WAITOK); if (new_mapping != NULL) { uuid_copy(new_mapping->uuid, uuid); new_mapping->id = necp_get_new_uuid_id(true); - new_mapping->refcount = 1; + os_ref_init(&new_mapping->refcount, &necp_refgrp); LIST_INSERT_HEAD(&necp_uuid_service_id_list, new_mapping, chain); @@ -5647,9 +6052,9 @@ necp_remove_uuid_service_id_mapping(uuid_t uuid) LCK_RW_ASSERT(&necp_kernel_policy_lock, LCK_RW_ASSERT_EXCLUSIVE); - existing_mapping = necp_uuid_lookup_app_id_locked(uuid); + existing_mapping = necp_uuid_lookup_service_id_locked(uuid); if (existing_mapping != NULL) { - if (--existing_mapping->refcount == 0) { + if (os_ref_release_locked(&existing_mapping->refcount) == 0) { LIST_REMOVE(existing_mapping, chain); FREE(existing_mapping, M_NECP); } @@ -5676,7 +6081,7 @@ necp_kernel_socket_policies_update_uuid_table(void) for (uuid_list_head = &necp_uuid_app_id_hashtbl[necp_uuid_app_id_hash_num_buckets - 1]; uuid_list_head >= necp_uuid_app_id_hashtbl; uuid_list_head--) { struct necp_uuid_id_mapping *mapping = NULL; LIST_FOREACH(mapping, uuid_list_head, chain) { - if (mapping->table_refcount > 0 && + if (mapping->table_usecount > 0 && proc_uuid_policy_kernel(PROC_UUID_POLICY_OPERATION_ADD, mapping->uuid, PROC_UUID_NECP_APP_POLICY) < 0) { NECPLOG0(LOG_DEBUG, "Error adding uuid to policy table\n"); } @@ -5690,7 +6095,7 @@ necp_kernel_socket_policies_update_uuid_table(void) return TRUE; } -#define NECP_KERNEL_VALID_IP_OUTPUT_CONDITIONS (NECP_KERNEL_CONDITION_ALL_INTERFACES | NECP_KERNEL_CONDITION_BOUND_INTERFACE | NECP_KERNEL_CONDITION_PROTOCOL | NECP_KERNEL_CONDITION_LOCAL_START | NECP_KERNEL_CONDITION_LOCAL_END | NECP_KERNEL_CONDITION_LOCAL_PREFIX | NECP_KERNEL_CONDITION_REMOTE_START | NECP_KERNEL_CONDITION_REMOTE_END | NECP_KERNEL_CONDITION_REMOTE_PREFIX | NECP_KERNEL_CONDITION_POLICY_ID | NECP_KERNEL_CONDITION_LAST_INTERFACE) +#define NECP_KERNEL_VALID_IP_OUTPUT_CONDITIONS (NECP_KERNEL_CONDITION_ALL_INTERFACES | NECP_KERNEL_CONDITION_BOUND_INTERFACE | NECP_KERNEL_CONDITION_PROTOCOL | NECP_KERNEL_CONDITION_LOCAL_START | NECP_KERNEL_CONDITION_LOCAL_END | NECP_KERNEL_CONDITION_LOCAL_PREFIX | NECP_KERNEL_CONDITION_REMOTE_START | NECP_KERNEL_CONDITION_REMOTE_END | NECP_KERNEL_CONDITION_REMOTE_PREFIX | NECP_KERNEL_CONDITION_POLICY_ID | NECP_KERNEL_CONDITION_LAST_INTERFACE | NECP_KERNEL_CONDITION_LOCAL_NETWORKS) static necp_kernel_policy_id necp_kernel_ip_output_policy_add(necp_policy_order order, necp_policy_order suborder, u_int32_t session_order, int session_pid, u_int32_t condition_mask, u_int32_t condition_negated_mask, necp_kernel_policy_id cond_policy_id, ifnet_t cond_bound_interface, u_int32_t cond_last_interface_index, u_int16_t cond_protocol, union necp_sockaddr_union *cond_local_start, union necp_sockaddr_union *cond_local_end, u_int8_t cond_local_prefix, union necp_sockaddr_union *cond_remote_start, union necp_sockaddr_union *cond_remote_end, u_int8_t cond_remote_prefix, necp_kernel_policy_result result, necp_kernel_policy_result_parameter result_parameter) { @@ -5905,6 +6310,11 @@ necp_kernel_ip_output_policy_is_unnecessary(struct necp_kernel_ip_output_policy continue; } + // If new policy matches Local Networks, compared policy must also + if ((policy->condition_mask & NECP_KERNEL_CONDITION_LOCAL_NETWORKS) && !(compared_policy->condition_mask & NECP_KERNEL_CONDITION_LOCAL_NETWORKS)) { + continue; + } + // Default makes lower policies unecessary always if (compared_policy->condition_mask == 0) { return TRUE; @@ -5999,8 +6409,10 @@ necp_kernel_ip_output_policies_reprocess(void) /* Update bucket counts: * Non-id and SKIP policies will be added to all buckets + * Add local networks policy to all buckets for incoming IP */ if (!(kernel_policy->condition_mask & NECP_KERNEL_CONDITION_POLICY_ID) || + (kernel_policy->condition_mask & NECP_KERNEL_CONDITION_LOCAL_NETWORKS) || kernel_policy->result == NECP_KERNEL_POLICY_RESULT_SKIP) { for (i = 0; i < NECP_KERNEL_IP_OUTPUT_POLICIES_MAP_NUM_ID_BUCKETS; i++) { bucket_allocation_counts[i]++; @@ -6030,6 +6442,7 @@ necp_kernel_ip_output_policies_reprocess(void) LIST_FOREACH(kernel_policy, &necp_kernel_ip_output_policies, chain) { // Insert pointers into map if (!(kernel_policy->condition_mask & NECP_KERNEL_CONDITION_POLICY_ID) || + (kernel_policy->condition_mask & NECP_KERNEL_CONDITION_LOCAL_NETWORKS) || kernel_policy->result == NECP_KERNEL_POLICY_RESULT_SKIP) { for (i = 0; i < NECP_KERNEL_IP_OUTPUT_POLICIES_MAP_NUM_ID_BUCKETS; i++) { if (!necp_kernel_ip_output_policy_is_unnecessary(kernel_policy, necp_kernel_ip_output_policies_map[i], bucket_current_free_index[i])) { @@ -6167,11 +6580,35 @@ necp_hostname_matches_domain(struct substring hostname_substring, u_int8_t hostn return FALSE; } +bool +net_domain_contains_hostname(char *hostname_string, char *domain_string) +{ + if (hostname_string == NULL || + domain_string == NULL) { + return false; + } + + struct substring hostname_substring; + hostname_substring.string = hostname_string; + hostname_substring.length = strlen(hostname_string); + + return necp_hostname_matches_domain(hostname_substring, + necp_count_dots(hostname_string, hostname_substring.length), + domain_string, + necp_count_dots(domain_string, strlen(domain_string))); +} + +#define NECP_MAX_STRING_LEN 1024 + static char * necp_copy_string(char *string, size_t length) { char *copied_string = NULL; + if (length > NECP_MAX_STRING_LEN) { + return NULL; + } + MALLOC(copied_string, char *, length + 1, M_NECP, M_WAITOK); if (copied_string == NULL) { return NULL; @@ -6208,34 +6645,31 @@ static inline void necp_get_parent_cred_result(proc_t proc, struct necp_socket_info *info) { task_t task = proc_task(proc ? proc : current_proc()); - coalition_t coal = COALITION_NULL; - Boolean is_leader = coalition_is_leader(task, COALITION_TYPE_JETSAM, &coal); + coalition_t coal = task_get_coalition(task, COALITION_TYPE_JETSAM); - if (is_leader == TRUE) { + if (coal == COALITION_NULL || coalition_is_leader(task, coal)) { // No parent, nothing to do return; } - if (coal != NULL) { - task_t lead_task = coalition_get_leader(coal); - if (lead_task != NULL) { - proc_t lead_proc = get_bsdtask_info(lead_task); - if (lead_proc != NULL) { - kauth_cred_t lead_cred = kauth_cred_proc_ref(lead_proc); - if (lead_cred != NULL) { - errno_t cred_result = priv_check_cred(lead_cred, PRIV_NET_PRIVILEGED_NECP_MATCH, 0); - kauth_cred_unref(&lead_cred); - info->cred_result = cred_result; - } + task_t lead_task = coalition_get_leader(coal); + if (lead_task != NULL) { + proc_t lead_proc = get_bsdtask_info(lead_task); + if (lead_proc != NULL) { + kauth_cred_t lead_cred = kauth_cred_proc_ref(lead_proc); + if (lead_cred != NULL) { + errno_t cred_result = priv_check_cred(lead_cred, PRIV_NET_PRIVILEGED_NECP_MATCH, 0); + kauth_cred_unref(&lead_cred); + info->cred_result = cred_result; } - task_deallocate(lead_task); } + task_deallocate(lead_task); } } -#define NECP_KERNEL_ADDRESS_TYPE_CONDITIONS (NECP_KERNEL_CONDITION_LOCAL_START | NECP_KERNEL_CONDITION_LOCAL_END | NECP_KERNEL_CONDITION_LOCAL_PREFIX | NECP_KERNEL_CONDITION_REMOTE_START | NECP_KERNEL_CONDITION_REMOTE_END | NECP_KERNEL_CONDITION_REMOTE_PREFIX) +#define NECP_KERNEL_ADDRESS_TYPE_CONDITIONS (NECP_KERNEL_CONDITION_LOCAL_START | NECP_KERNEL_CONDITION_LOCAL_END | NECP_KERNEL_CONDITION_LOCAL_PREFIX | NECP_KERNEL_CONDITION_REMOTE_START | NECP_KERNEL_CONDITION_REMOTE_END | NECP_KERNEL_CONDITION_REMOTE_PREFIX | NECP_KERNEL_CONDITION_LOCAL_EMPTY | NECP_KERNEL_CONDITION_REMOTE_EMPTY | NECP_KERNEL_CONDITION_LOCAL_NETWORKS) static void -necp_application_fillout_info_locked(uuid_t application_uuid, uuid_t real_application_uuid, char *account, char *domain, pid_t pid, uid_t uid, u_int16_t protocol, u_int32_t bound_interface_index, u_int32_t traffic_class, union necp_sockaddr_union *local_addr, union necp_sockaddr_union *remote_addr, proc_t proc, struct necp_socket_info *info) +necp_application_fillout_info_locked(uuid_t application_uuid, uuid_t real_application_uuid, char *account, char *domain, pid_t pid, uid_t uid, u_int16_t protocol, u_int32_t bound_interface_index, u_int32_t traffic_class, union necp_sockaddr_union *local_addr, union necp_sockaddr_union *remote_addr, u_int16_t local_port, u_int16_t remote_port, bool has_client, proc_t proc, u_int32_t drop_order, u_int32_t client_flags, struct necp_socket_info *info) { memset(info, 0, sizeof(struct necp_socket_info)); @@ -6244,6 +6678,9 @@ necp_application_fillout_info_locked(uuid_t application_uuid, uuid_t real_applic info->protocol = protocol; info->bound_interface_index = bound_interface_index; info->traffic_class = traffic_class; + info->has_client = has_client; + info->drop_order = drop_order; + info->client_flags = client_flags; if (necp_kernel_application_policies_condition_mask & NECP_KERNEL_CONDITION_ENTITLEMENT && proc != NULL) { info->cred_result = priv_check_cred(proc_ucred(proc), PRIV_NET_PRIVILEGED_NECP_MATCH, 0); @@ -6253,6 +6690,10 @@ necp_application_fillout_info_locked(uuid_t application_uuid, uuid_t real_applic } } + if (necp_kernel_application_policies_condition_mask & NECP_KERNEL_CONDITION_PLATFORM_BINARY && proc != NULL) { + info->is_platform_binary = csproc_get_platform_binary(proc) ? true : false; + } + if (necp_kernel_application_policies_condition_mask & NECP_KERNEL_CONDITION_APP_ID && !uuid_is_null(application_uuid)) { struct necp_uuid_id_mapping *existing_mapping = necp_uuid_lookup_app_id_locked(application_uuid); if (existing_mapping) { @@ -6285,9 +6726,23 @@ necp_application_fillout_info_locked(uuid_t application_uuid, uuid_t real_applic if (necp_kernel_application_policies_condition_mask & NECP_KERNEL_ADDRESS_TYPE_CONDITIONS) { if (local_addr && local_addr->sa.sa_len > 0) { memcpy(&info->local_addr, local_addr, local_addr->sa.sa_len); + if (local_port != 0) { + info->local_addr.sin6.sin6_port = local_port; + } + } else if (local_port != 0) { + info->local_addr.sin6.sin6_len = sizeof(struct sockaddr_in6); + info->local_addr.sin6.sin6_family = AF_INET6; + info->local_addr.sin6.sin6_port = local_port; } if (remote_addr && remote_addr->sa.sa_len > 0) { memcpy(&info->remote_addr, remote_addr, remote_addr->sa.sa_len); + if (remote_port != 0) { + info->remote_addr.sin6.sin6_port = remote_port; + } + } else if (remote_port != 0) { + info->remote_addr.sin6.sin6_len = sizeof(struct sockaddr_in6); + info->remote_addr.sin6.sin6_family = AF_INET6; + info->remote_addr.sin6.sin6_port = remote_port; } } } @@ -6325,10 +6780,14 @@ necp_application_find_policy_match_internal(proc_t proc, u_int32_t parameters_size, struct necp_aggregate_result *returned_result, u_int32_t *flags, + u_int32_t *reason, u_int required_interface_index, const union necp_sockaddr_union *override_local_addr, const union necp_sockaddr_union *override_remote_addr, - struct rtentry **returned_route, bool ignore_address) + struct necp_client_endpoint *returned_v4_gateway, + struct necp_client_endpoint *returned_v6_gateway, + struct rtentry **returned_route, bool ignore_address, + bool has_client) { int error = 0; size_t offset = 0; @@ -6336,7 +6795,6 @@ necp_application_find_policy_match_internal(proc_t proc, struct necp_kernel_socket_policy *matched_policy = NULL; struct necp_socket_info info; necp_kernel_policy_filter filter_control_unit = 0; - u_int32_t route_rule_id = 0; necp_kernel_policy_result service_action = 0; necp_kernel_policy_service service = { 0, 0 }; @@ -6349,6 +6807,9 @@ necp_application_find_policy_match_internal(proc_t proc, bool no_remote_addr = FALSE; u_int8_t remote_family = 0; bool no_local_addr = FALSE; + u_int16_t local_port = 0; + u_int16_t remote_port = 0; + necp_drop_all_bypass_check_result_t drop_all_bypass = NECP_DROP_ALL_BYPASS_CHECK_RESULT_NONE; if (override_local_addr) { memcpy(&local_addr, override_local_addr, sizeof(local_addr)); @@ -6392,11 +6853,23 @@ necp_application_find_policy_match_internal(proc_t proc, return EINVAL; } + if (returned_v4_gateway != NULL) { + memset(returned_v4_gateway, 0, sizeof(struct necp_client_endpoint)); + } + + if (returned_v6_gateway != NULL) { + memset(returned_v6_gateway, 0, sizeof(struct necp_client_endpoint)); + } + memset(returned_result, 0, sizeof(struct necp_aggregate_result)); + u_int32_t drop_order = necp_process_drop_order(proc_ucred(proc)); + + necp_kernel_policy_result drop_dest_policy_result = NECP_KERNEL_POLICY_RESULT_NONE; + lck_rw_lock_shared(&necp_kernel_policy_lock); if (necp_kernel_application_policies_count == 0) { - if (necp_drop_all_order > 0) { + if (necp_drop_all_order > 0 || drop_order > 0) { returned_result->routing_result = NECP_KERNEL_POLICY_RESULT_DROP; lck_rw_done(&necp_kernel_policy_lock); return 0; @@ -6489,6 +6962,8 @@ necp_application_find_policy_match_internal(proc_t proc, case NECP_CLIENT_PARAMETER_IP_PROTOCOL: { if (length >= sizeof(u_int16_t)) { memcpy(&protocol, value, sizeof(u_int16_t)); + } else if (length >= sizeof(u_int8_t)) { + memcpy(&protocol, value, sizeof(u_int8_t)); } break; } @@ -6506,7 +6981,7 @@ necp_application_find_policy_match_internal(proc_t proc, break; } case NECP_CLIENT_PARAMETER_LOCAL_ADDRESS: { - if (ignore_address) { + if (ignore_address || override_local_addr) { break; } @@ -6519,7 +6994,7 @@ necp_application_find_policy_match_internal(proc_t proc, break; } case NECP_CLIENT_PARAMETER_REMOTE_ADDRESS: { - if (ignore_address) { + if (ignore_address || override_remote_addr) { break; } @@ -6531,18 +7006,49 @@ necp_application_find_policy_match_internal(proc_t proc, } break; } - case NECP_CLIENT_PARAMETER_FLAGS: { - if (length >= sizeof(client_flags)) { - memcpy(&client_flags, value, sizeof(client_flags)); + case NECP_CLIENT_PARAMETER_LOCAL_ENDPOINT: { + if (ignore_address || override_local_addr) { + break; + } + + if (length >= sizeof(struct necp_client_endpoint)) { + struct necp_client_endpoint *endpoint = (struct necp_client_endpoint *)(void *)value; + if (endpoint->u.endpoint.endpoint_family == AF_UNSPEC && + endpoint->u.endpoint.endpoint_port != 0) { + // Save port + local_port = endpoint->u.endpoint.endpoint_port; + } } break; } - case NECP_CLIENT_PARAMETER_REQUIRE_AGENT_TYPE: { - if (num_required_agent_types >= NECP_MAX_REQUIRED_AGENTS) { + case NECP_CLIENT_PARAMETER_REMOTE_ENDPOINT: { + if (ignore_address || override_remote_addr) { break; } - if (length >= sizeof(struct necp_client_parameter_netagent_type)) { - memcpy(&required_agent_types[num_required_agent_types], value, sizeof(struct necp_client_parameter_netagent_type)); + + if (length >= sizeof(struct necp_client_endpoint)) { + struct necp_client_endpoint *endpoint = (struct necp_client_endpoint *)(void *)value; + if (endpoint->u.endpoint.endpoint_family == AF_UNSPEC && + endpoint->u.endpoint.endpoint_port != 0) { + // Save port + remote_port = endpoint->u.endpoint.endpoint_port; + } + } + break; + } + case NECP_CLIENT_PARAMETER_FLAGS: { + if (length >= sizeof(client_flags)) { + memcpy(&client_flags, value, sizeof(client_flags)); + } + break; + } + case NECP_CLIENT_PARAMETER_REQUIRE_AGENT_TYPE: + case NECP_CLIENT_PARAMETER_PREFER_AGENT_TYPE: { + if (num_required_agent_types >= NECP_MAX_REQUIRED_AGENTS) { + break; + } + if (length >= sizeof(struct necp_client_parameter_netagent_type)) { + memcpy(&required_agent_types[num_required_agent_types], value, sizeof(struct necp_client_parameter_netagent_type)); num_required_agent_types++; } break; @@ -6560,21 +7066,36 @@ necp_application_find_policy_match_internal(proc_t proc, // Lock lck_rw_lock_shared(&necp_kernel_policy_lock); - necp_application_fillout_info_locked(application_uuid, real_application_uuid, account, domain, pid, uid, protocol, bound_interface_index, traffic_class, &local_addr, &remote_addr, proc, &info); - matched_policy = necp_socket_find_policy_match_with_info_locked(necp_kernel_socket_policies_app_layer_map, &info, &filter_control_unit, &route_rule_id, &service_action, &service, netagent_ids, netagent_use_flags, NECP_MAX_NETAGENTS, required_agent_types, num_required_agent_types, proc, NULL); + u_int32_t route_rule_id_array[MAX_AGGREGATE_ROUTE_RULES]; + size_t route_rule_id_array_count = 0; + necp_application_fillout_info_locked(application_uuid, real_application_uuid, account, domain, pid, uid, protocol, bound_interface_index, traffic_class, &local_addr, &remote_addr, local_port, remote_port, has_client, proc, drop_order, client_flags, &info); + matched_policy = necp_socket_find_policy_match_with_info_locked(necp_kernel_socket_policies_app_layer_map, &info, &filter_control_unit, route_rule_id_array, &route_rule_id_array_count, MAX_AGGREGATE_ROUTE_RULES, &service_action, &service, netagent_ids, netagent_use_flags, NECP_MAX_NETAGENTS, required_agent_types, num_required_agent_types, proc, NULL, NULL, &drop_dest_policy_result, &drop_all_bypass); if (matched_policy) { returned_result->policy_id = matched_policy->id; returned_result->routing_result = matched_policy->result; memcpy(&returned_result->routing_result_parameter, &matched_policy->result_parameter, sizeof(returned_result->routing_result_parameter)); - } else if (necp_drop_all_order > 0) { - // Mark socket as a drop if drop_all is set - returned_result->policy_id = NECP_KERNEL_POLICY_ID_NO_MATCH; - returned_result->routing_result = NECP_KERNEL_POLICY_RESULT_DROP; } else { - returned_result->policy_id = 0; - returned_result->routing_result = NECP_KERNEL_POLICY_RESULT_NONE; + bool drop_all = false; + if (necp_drop_all_order > 0 || info.drop_order > 0 || drop_dest_policy_result == NECP_KERNEL_POLICY_RESULT_DROP) { + // Mark socket as a drop if drop_all is set + drop_all = true; + if (drop_all_bypass == NECP_DROP_ALL_BYPASS_CHECK_RESULT_NONE) { + drop_all_bypass = necp_check_drop_all_bypass_result(proc); + } + } + if (drop_all && drop_all_bypass == NECP_DROP_ALL_BYPASS_CHECK_RESULT_FALSE) { + returned_result->policy_id = NECP_KERNEL_POLICY_ID_NO_MATCH; + returned_result->routing_result = NECP_KERNEL_POLICY_RESULT_DROP; + } else { + returned_result->policy_id = 0; + returned_result->routing_result = NECP_KERNEL_POLICY_RESULT_NONE; + } + } + if (filter_control_unit == NECP_FILTER_UNIT_NO_FILTER) { + returned_result->filter_control_unit = 0; + } else { + returned_result->filter_control_unit = filter_control_unit; } - returned_result->filter_control_unit = filter_control_unit; returned_result->service_action = service_action; // Handle trigger service @@ -6671,7 +7192,7 @@ necp_application_find_policy_match_internal(proc_t proc, returned_result->routed_interface_index = 0; } - if (no_remote_addr && remote_family == 0 && + if (no_remote_addr && remote_family == AF_UNSPEC && (rt == NULL || rt->rt_ifp == NULL)) { // Route lookup for default IPv4 failed, try IPv6 @@ -6834,13 +7355,28 @@ necp_application_find_policy_match_internal(proc_t proc, } // Check QoS marking (fastlane) - if (necp_update_qos_marking(rt->rt_ifp, route_rule_id)) { - *flags |= NECP_CLIENT_RESULT_FLAG_ALLOW_QOS_MARKING; + for (size_t route_rule_index = 0; route_rule_index < route_rule_id_array_count; route_rule_index++) { + if (necp_update_qos_marking(rt->rt_ifp, route_rule_id_array[route_rule_index])) { + *flags |= NECP_CLIENT_RESULT_FLAG_ALLOW_QOS_MARKING; + // If the route can use QoS markings, stop iterating route rules + break; + } } if (IFNET_IS_LOW_POWER(rt->rt_ifp)) { *flags |= NECP_CLIENT_RESULT_FLAG_INTERFACE_LOW_POWER; } + + if (traffic_class == SO_TC_BK_SYS) { + // Block BK_SYS traffic if interface is throttled + u_int32_t throttle_level = 0; + if (ifnet_get_throttle(rt->rt_ifp, &throttle_level) == 0) { + if (throttle_level == IFNET_THROTTLE_OPPORTUNISTIC) { + returned_result->routing_result = NECP_KERNEL_POLICY_RESULT_DROP; + memset(&returned_result->routing_result_parameter, 0, sizeof(returned_result->routing_result_parameter)); + } + } + } } } @@ -6867,6 +7403,12 @@ necp_application_find_policy_match_internal(proc_t proc, if (v4Route->rt_ifp != NULL && !IS_INTF_CLAT46(v4Route->rt_ifp)) { *flags |= NECP_CLIENT_RESULT_FLAG_HAS_IPV4; } + if (returned_v4_gateway != NULL && + v4Route->rt_gateway != NULL && + v4Route->rt_gateway->sa_len == sizeof(returned_v4_gateway->u.sin)) { + memcpy(&returned_v4_gateway->u.sin, v4Route->rt_gateway, sizeof(returned_v4_gateway->u.sin)); + memset(&returned_v4_gateway->u.sin.sin_zero, 0, sizeof(returned_v4_gateway->u.sin.sin_zero)); + } rtfree(v4Route); v4Route = NULL; } @@ -6879,21 +7421,56 @@ necp_application_find_policy_match_internal(proc_t proc, *flags |= NECP_CLIENT_RESULT_FLAG_HAS_NAT64; } } + if (returned_v6_gateway != NULL && + v6Route->rt_gateway != NULL && + v6Route->rt_gateway->sa_len == sizeof(returned_v6_gateway->u.sin6)) { + memcpy(&returned_v6_gateway->u.sin6, v6Route->rt_gateway, sizeof(returned_v6_gateway->u.sin6)); + } rtfree(v6Route); v6Route = NULL; } } } - u_int32_t interface_type_denied = IFRTYPE_FUNCTIONAL_UNKNOWN; - bool route_is_allowed = necp_route_is_allowed(rt, NULL, route_rule_id, &interface_type_denied); - if (!route_is_allowed) { - // If the route is blocked, treat the lookup as a drop - returned_result->routing_result = NECP_KERNEL_POLICY_RESULT_DROP; - memset(&returned_result->routing_result_parameter, 0, sizeof(returned_result->routing_result_parameter)); + for (size_t route_rule_index = 0; route_rule_index < route_rule_id_array_count; route_rule_index++) { + u_int32_t interface_type_denied = IFRTYPE_FUNCTIONAL_UNKNOWN; + bool route_is_allowed = necp_route_is_allowed(rt, NULL, route_rule_id_array[route_rule_index], &interface_type_denied); + if (!route_is_allowed) { + // If the route is blocked, treat the lookup as a drop + returned_result->routing_result = NECP_KERNEL_POLICY_RESULT_DROP; + memset(&returned_result->routing_result_parameter, 0, sizeof(returned_result->routing_result_parameter)); + + if (interface_type_denied != IFRTYPE_FUNCTIONAL_UNKNOWN) { + if (reason != NULL) { + if (interface_type_denied == IFRTYPE_FUNCTIONAL_CELLULAR) { + *reason = NECP_CLIENT_RESULT_REASON_CELLULAR_DENIED; + } else if (interface_type_denied == IFRTYPE_FUNCTIONAL_WIFI_INFRA) { + *reason = NECP_CLIENT_RESULT_REASON_WIFI_DENIED; + } + } + necp_send_application_interface_denied_event(pid, application_uuid, interface_type_denied); + } + // If the route gets denied, stop matching rules + break; + } + } - if (interface_type_denied != IFRTYPE_FUNCTIONAL_UNKNOWN) { - necp_send_application_interface_denied_event(pid, application_uuid, interface_type_denied); + if (rt != NULL && rt->rt_ifp != NULL) { + const bool expensive_prohibited = ((client_flags & NECP_CLIENT_PARAMETER_FLAG_PROHIBIT_EXPENSIVE) && + IFNET_IS_EXPENSIVE(rt->rt_ifp)); + const bool constrained_prohibited = ((client_flags & NECP_CLIENT_PARAMETER_FLAG_PROHIBIT_CONSTRAINED) && + IFNET_IS_CONSTRAINED(rt->rt_ifp)); + if (reason != NULL) { + if (expensive_prohibited) { + *reason = NECP_CLIENT_RESULT_REASON_EXPENSIVE_PROHIBITED; + } else if (constrained_prohibited) { + *reason = NECP_CLIENT_RESULT_REASON_CONSTRAINED_PROHIBITED; + } + } + if (expensive_prohibited || constrained_prohibited) { + // If the client flags prohibited a property of the interface, treat it as a drop + returned_result->routing_result = NECP_KERNEL_POLICY_RESULT_DROP; + memset(&returned_result->routing_result_parameter, 0, sizeof(returned_result->routing_result_parameter)); } } @@ -6912,7 +7489,58 @@ necp_application_find_policy_match_internal(proc_t proc, } static bool -necp_socket_check_policy(struct necp_kernel_socket_policy *kernel_policy, necp_app_id app_id, necp_app_id real_app_id, errno_t cred_result, u_int32_t account_id, struct substring domain, u_int8_t domain_dot_count, pid_t pid, uid_t uid, u_int32_t bound_interface_index, u_int32_t traffic_class, u_int16_t protocol, union necp_sockaddr_union *local, union necp_sockaddr_union *remote, struct necp_client_parameter_netagent_type *required_agent_types, u_int32_t num_required_agent_types, proc_t proc) +necp_is_route_local(union necp_sockaddr_union *remote_addr) +{ + bool no_remote_addr = FALSE; + u_int8_t remote_family = 0; + struct rtentry *rt = NULL; + bool is_local = FALSE; + + if (remote_addr == NULL) { + return NULL; + } + + if (remote_addr->sa.sa_len == 0 || + (remote_addr->sa.sa_family == AF_INET && remote_addr->sin.sin_addr.s_addr == 0) || + (remote_addr->sa.sa_family == AF_INET6 && IN6_IS_ADDR_UNSPECIFIED(&remote_addr->sin6.sin6_addr))) { + no_remote_addr = TRUE; + remote_family = remote_addr->sa.sa_family; + } + + if (no_remote_addr) { + memset(remote_addr, 0, sizeof(union necp_sockaddr_union)); + if (remote_family == AF_INET6) { + // Reset address to :: + remote_addr->sa.sa_family = AF_INET6; + remote_addr->sa.sa_len = sizeof(struct sockaddr_in6); + } else { + // Reset address to 0.0.0.0 + remote_addr->sa.sa_family = AF_INET; + remote_addr->sa.sa_len = sizeof(struct sockaddr_in); + } + } + + // Lookup route regardless of the scoped interface to check if + // remote address is in a local network. + rt = rtalloc1_scoped((struct sockaddr *)remote_addr, 0, 0, 0); + + if (rt == NULL) { + goto done; + } + if (remote_addr->sa.sa_family == AF_INET && IS_INTF_CLAT46(rt->rt_ifp)) { + goto free_rt; + } + is_local = IS_NECP_DEST_IN_LOCAL_NETWORKS(rt); + +free_rt: + rtfree(rt); + +done: + return is_local; +} + +static bool +necp_socket_check_policy(struct necp_kernel_socket_policy *kernel_policy, necp_app_id app_id, necp_app_id real_app_id, errno_t cred_result, u_int32_t account_id, struct substring domain, u_int8_t domain_dot_count, pid_t pid, uid_t uid, u_int32_t bound_interface_index, u_int32_t traffic_class, u_int16_t protocol, union necp_sockaddr_union *local, union necp_sockaddr_union *remote, struct necp_client_parameter_netagent_type *required_agent_types, u_int32_t num_required_agent_types, bool has_client, uint32_t client_flags, int is_platform_binary, proc_t proc, struct rtentry *rt) { if (!(kernel_policy->condition_mask & NECP_KERNEL_CONDITION_ALL_INTERFACES)) { if (kernel_policy->condition_mask & NECP_KERNEL_CONDITION_BOUND_INTERFACE) { @@ -6968,6 +7596,12 @@ necp_socket_check_policy(struct necp_kernel_socket_policy *kernel_policy, necp_a } } + if (kernel_policy->condition_mask & NECP_KERNEL_CONDITION_HAS_CLIENT) { + if (!has_client) { + return FALSE; + } + } + if (kernel_policy->condition_mask & NECP_KERNEL_CONDITION_ENTITLEMENT) { if (cred_result != 0) { // Process is missing entitlement @@ -6975,6 +7609,13 @@ necp_socket_check_policy(struct necp_kernel_socket_policy *kernel_policy, necp_a } } + if (kernel_policy->condition_mask & NECP_KERNEL_CONDITION_PLATFORM_BINARY) { + if (is_platform_binary == 0) { + // Process is not platform binary + return FALSE; + } + } + if (kernel_policy->condition_mask & NECP_KERNEL_CONDITION_CUSTOM_ENTITLEMENT) { if (kernel_policy->cond_custom_entitlement_matched == necp_boolean_state_false) { // Process is missing entitlement based on previous check @@ -7103,6 +7744,21 @@ necp_socket_check_policy(struct necp_kernel_socket_policy *kernel_policy, necp_a } } + if (kernel_policy->condition_mask & NECP_KERNEL_CONDITION_LOCAL_NETWORKS) { + bool is_local = FALSE; + + if (rt != NULL) { + is_local = IS_NECP_DEST_IN_LOCAL_NETWORKS(rt); + } else { + is_local = necp_is_route_local(remote); + } + + if (!is_local) { + // Either no route to validate or no match for local networks + return FALSE; + } + } + if (kernel_policy->condition_mask & NECP_KERNEL_CONDITION_LOCAL_START) { if (kernel_policy->condition_mask & NECP_KERNEL_CONDITION_LOCAL_END) { bool inRange = necp_is_addr_in_range((struct sockaddr *)local, (struct sockaddr *)&kernel_policy->cond_local_start, (struct sockaddr *)&kernel_policy->cond_local_end); @@ -7155,6 +7811,46 @@ necp_socket_check_policy(struct necp_kernel_socket_policy *kernel_policy, necp_a } } + if (kernel_policy->condition_mask & NECP_KERNEL_CONDITION_CLIENT_FLAGS) { + if (kernel_policy->condition_negated_mask & NECP_KERNEL_CONDITION_CLIENT_FLAGS) { + if ((client_flags & kernel_policy->cond_client_flags) == kernel_policy->cond_client_flags) { + // Flags do match, and condition is negative, fail. + return FALSE; + } + } else { + if ((client_flags & kernel_policy->cond_client_flags) != kernel_policy->cond_client_flags) { + // Flags do not match, fail. + return FALSE; + } + } + } + + if (kernel_policy->condition_mask & NECP_KERNEL_CONDITION_LOCAL_EMPTY) { + bool isEmpty = necp_addr_is_empty((struct sockaddr *)local); + if (kernel_policy->condition_negated_mask & NECP_KERNEL_CONDITION_LOCAL_EMPTY) { + if (isEmpty) { + return FALSE; + } + } else { + if (!isEmpty) { + return FALSE; + } + } + } + + if (kernel_policy->condition_mask & NECP_KERNEL_CONDITION_REMOTE_EMPTY) { + bool isEmpty = necp_addr_is_empty((struct sockaddr *)remote); + if (kernel_policy->condition_negated_mask & NECP_KERNEL_CONDITION_REMOTE_EMPTY) { + if (isEmpty) { + return FALSE; + } + } else { + if (!isEmpty) { + return FALSE; + } + } + } + return TRUE; } @@ -7165,7 +7861,7 @@ necp_socket_calc_flowhash_locked(struct necp_socket_info *info) } static void -necp_socket_fillout_info_locked(struct inpcb *inp, struct sockaddr *override_local_addr, struct sockaddr *override_remote_addr, u_int32_t override_bound_interface, struct necp_socket_info *info) +necp_socket_fillout_info_locked(struct inpcb *inp, struct sockaddr *override_local_addr, struct sockaddr *override_remote_addr, u_int32_t override_bound_interface, u_int32_t drop_order, struct necp_socket_info *info) { struct socket *so = NULL; @@ -7173,6 +7869,8 @@ necp_socket_fillout_info_locked(struct inpcb *inp, struct sockaddr *override_loc so = inp->inp_socket; + info->drop_order = drop_order; + if (necp_kernel_socket_policies_condition_mask & NECP_KERNEL_CONDITION_PID) { info->pid = ((so->so_flags & SOF_DELEGATED) ? so->e_pid : so->last_pid); } @@ -7185,6 +7883,30 @@ necp_socket_fillout_info_locked(struct inpcb *inp, struct sockaddr *override_loc info->traffic_class = so->so_traffic_class; } + if (necp_kernel_socket_policies_condition_mask & NECP_KERNEL_CONDITION_HAS_CLIENT) { + info->has_client = !uuid_is_null(inp->necp_client_uuid); + } + + if (necp_kernel_socket_policies_condition_mask & NECP_KERNEL_CONDITION_CLIENT_FLAGS) { + info->client_flags = 0; + if (INP_NO_CONSTRAINED(inp)) { + info->client_flags |= NECP_CLIENT_PARAMETER_FLAG_PROHIBIT_CONSTRAINED; + } + if (INP_NO_EXPENSIVE(inp)) { + info->client_flags |= NECP_CLIENT_PARAMETER_FLAG_PROHIBIT_EXPENSIVE; + } + if (inp->inp_socket->so_flags1 & SOF1_CELLFALLBACK) { + info->client_flags |= NECP_CLIENT_PARAMETER_FLAG_FALLBACK_TRAFFIC; + } + if (inp->inp_socket->so_flags1 & SOF1_INBOUND) { + info->client_flags |= NECP_CLIENT_PARAMETER_FLAG_INBOUND; + } + if (inp->inp_socket->so_options & SO_ACCEPTCONN || + inp->inp_flags2 & INP2_EXTERNAL_PORT) { + info->client_flags |= NECP_CLIENT_PARAMETER_FLAG_LISTENER; + } + } + if (necp_kernel_socket_policies_condition_mask & NECP_KERNEL_CONDITION_PROTOCOL) { if (inp->inp_ip_p) { info->protocol = inp->inp_ip_p; @@ -7215,6 +7937,10 @@ necp_socket_fillout_info_locked(struct inpcb *inp, struct sockaddr *override_loc necp_get_parent_cred_result(NULL, info); } } + + if (necp_kernel_socket_policies_condition_mask & NECP_KERNEL_CONDITION_PLATFORM_BINARY) { + info->is_platform_binary = csproc_get_platform_binary(current_proc()) ? true : false; + } } if (necp_kernel_socket_policies_condition_mask & NECP_KERNEL_CONDITION_ACCOUNT_ID && inp->inp_necp_attributes.inp_account != NULL) { @@ -7237,45 +7963,51 @@ necp_socket_fillout_info_locked(struct inpcb *inp, struct sockaddr *override_loc } if (necp_kernel_socket_policies_condition_mask & NECP_KERNEL_ADDRESS_TYPE_CONDITIONS) { - if (inp->inp_vflag & INP_IPV4) { - if (override_local_addr) { - if (override_local_addr->sa_len <= sizeof(struct sockaddr_in)) { - memcpy(&info->local_addr, override_local_addr, override_local_addr->sa_len); - } - } else { + if (override_local_addr != NULL) { + if (override_local_addr->sa_family == AF_INET6 && override_local_addr->sa_len <= sizeof(struct sockaddr_in6)) { + memcpy(&info->local_addr, override_local_addr, override_local_addr->sa_len); + if (IN6_IS_ADDR_V4MAPPED(&(info->local_addr.sin6.sin6_addr))) { + struct sockaddr_in sin; + in6_sin6_2_sin(&sin, &(info->local_addr.sin6)); + memset(&info->local_addr, 0, sizeof(union necp_sockaddr_union)); + memcpy(&info->local_addr, &sin, sin.sin_len); + } + } else if (override_local_addr->sa_family == AF_INET && override_local_addr->sa_len <= sizeof(struct sockaddr_in)) { + memcpy(&info->local_addr, override_local_addr, override_local_addr->sa_len); + } + } else { + if (inp->inp_vflag & INP_IPV4) { ((struct sockaddr_in *)&info->local_addr)->sin_family = AF_INET; ((struct sockaddr_in *)&info->local_addr)->sin_len = sizeof(struct sockaddr_in); ((struct sockaddr_in *)&info->local_addr)->sin_port = inp->inp_lport; memcpy(&((struct sockaddr_in *)&info->local_addr)->sin_addr, &inp->inp_laddr, sizeof(struct in_addr)); - } - - if (override_remote_addr) { - if (override_remote_addr->sa_len <= sizeof(struct sockaddr_in)) { - memcpy(&info->remote_addr, override_remote_addr, override_remote_addr->sa_len); - } - } else { - ((struct sockaddr_in *)&info->remote_addr)->sin_family = AF_INET; - ((struct sockaddr_in *)&info->remote_addr)->sin_len = sizeof(struct sockaddr_in); - ((struct sockaddr_in *)&info->remote_addr)->sin_port = inp->inp_fport; - memcpy(&((struct sockaddr_in *)&info->remote_addr)->sin_addr, &inp->inp_faddr, sizeof(struct in_addr)); - } - } else if (inp->inp_vflag & INP_IPV6) { - if (override_local_addr) { - if (override_local_addr->sa_len <= sizeof(struct sockaddr_in6)) { - memcpy(&info->local_addr, override_local_addr, override_local_addr->sa_len); - } - } else { + } else if (inp->inp_vflag & INP_IPV6) { ((struct sockaddr_in6 *)&info->local_addr)->sin6_family = AF_INET6; ((struct sockaddr_in6 *)&info->local_addr)->sin6_len = sizeof(struct sockaddr_in6); ((struct sockaddr_in6 *)&info->local_addr)->sin6_port = inp->inp_lport; memcpy(&((struct sockaddr_in6 *)&info->local_addr)->sin6_addr, &inp->in6p_laddr, sizeof(struct in6_addr)); } + } - if (override_remote_addr) { - if (override_remote_addr->sa_len <= sizeof(struct sockaddr_in6)) { - memcpy(&info->remote_addr, override_remote_addr, override_remote_addr->sa_len); + if (override_remote_addr != NULL) { + if (override_remote_addr->sa_family == AF_INET6 && override_remote_addr->sa_len <= sizeof(struct sockaddr_in6)) { + memcpy(&info->remote_addr, override_remote_addr, override_remote_addr->sa_len); + if (IN6_IS_ADDR_V4MAPPED(&(info->remote_addr.sin6.sin6_addr))) { + struct sockaddr_in sin; + in6_sin6_2_sin(&sin, &(info->remote_addr.sin6)); + memset(&info->remote_addr, 0, sizeof(union necp_sockaddr_union)); + memcpy(&info->remote_addr, &sin, sin.sin_len); } - } else { + } else if (override_remote_addr->sa_family == AF_INET && override_remote_addr->sa_len <= sizeof(struct sockaddr_in)) { + memcpy(&info->remote_addr, override_remote_addr, override_remote_addr->sa_len); + } + } else { + if (inp->inp_vflag & INP_IPV4) { + ((struct sockaddr_in *)&info->remote_addr)->sin_family = AF_INET; + ((struct sockaddr_in *)&info->remote_addr)->sin_len = sizeof(struct sockaddr_in); + ((struct sockaddr_in *)&info->remote_addr)->sin_port = inp->inp_fport; + memcpy(&((struct sockaddr_in *)&info->remote_addr)->sin_addr, &inp->inp_faddr, sizeof(struct in_addr)); + } else if (inp->inp_vflag & INP_IPV6) { ((struct sockaddr_in6 *)&info->remote_addr)->sin6_family = AF_INET6; ((struct sockaddr_in6 *)&info->remote_addr)->sin6_len = sizeof(struct sockaddr_in6); ((struct sockaddr_in6 *)&info->remote_addr)->sin6_port = inp->inp_fport; @@ -7287,45 +8019,75 @@ necp_socket_fillout_info_locked(struct inpcb *inp, struct sockaddr *override_loc static inline struct necp_kernel_socket_policy * necp_socket_find_policy_match_with_info_locked(struct necp_kernel_socket_policy **policy_search_array, struct necp_socket_info *info, - necp_kernel_policy_filter *return_filter, u_int32_t *return_route_rule_id, + necp_kernel_policy_filter *return_filter, + u_int32_t *return_route_rule_id_array, size_t *return_route_rule_id_array_count, size_t route_rule_id_array_count, necp_kernel_policy_result *return_service_action, necp_kernel_policy_service *return_service, u_int32_t *return_netagent_array, u_int32_t *return_netagent_use_flags_array, size_t netagent_array_count, struct necp_client_parameter_netagent_type *required_agent_types, - u_int32_t num_required_agent_types, proc_t proc, necp_kernel_policy_id *skip_policy_id) + u_int32_t num_required_agent_types, proc_t proc, necp_kernel_policy_id *skip_policy_id, struct rtentry *rt, + necp_kernel_policy_result *return_drop_dest_policy_result, necp_drop_all_bypass_check_result_t *return_drop_all_bypass) { struct necp_kernel_socket_policy *matched_policy = NULL; u_int32_t skip_order = 0; u_int32_t skip_session_order = 0; - u_int32_t route_rule_id_array[MAX_AGGREGATE_ROUTE_RULES]; size_t route_rule_id_count = 0; int i; size_t netagent_cursor = 0; + necp_drop_all_bypass_check_result_t drop_all_bypass = NECP_DROP_ALL_BYPASS_CHECK_RESULT_NONE; + if (return_drop_all_bypass != NULL) { + *return_drop_all_bypass = drop_all_bypass; + } // Pre-process domain for quick matching struct substring domain_substring = necp_trim_dots_and_stars(info->domain, info->domain ? strlen(info->domain) : 0); u_int8_t domain_dot_count = necp_count_dots(domain_substring.string, domain_substring.length); - if (return_filter) { + if (return_filter != NULL) { *return_filter = 0; } - if (return_route_rule_id) { - *return_route_rule_id = 0; + if (return_route_rule_id_array_count != NULL) { + *return_route_rule_id_array_count = 0; } - if (return_service_action) { + if (return_service_action != NULL) { *return_service_action = 0; } - if (return_service) { + if (return_service != NULL) { return_service->identifier = 0; return_service->data = 0; } + // Do not subject layer-2 filter to NECP policies, return a PASS policy + if (necp_pass_interpose > 0 && info->client_flags & NECP_CLIENT_PARAMETER_FLAG_INTERPOSE) { + return &pass_policy; + } + + *return_drop_dest_policy_result = NECP_KERNEL_POLICY_RESULT_NONE; + if (policy_search_array != NULL) { for (i = 0; policy_search_array[i] != NULL; i++) { if (necp_drop_all_order != 0 && policy_search_array[i]->session_order >= necp_drop_all_order) { // We've hit a drop all rule + if (drop_all_bypass == NECP_DROP_ALL_BYPASS_CHECK_RESULT_NONE) { + drop_all_bypass = necp_check_drop_all_bypass_result(proc); + if (return_drop_all_bypass != NULL) { + *return_drop_all_bypass = drop_all_bypass; + } + } + if (drop_all_bypass == NECP_DROP_ALL_BYPASS_CHECK_RESULT_FALSE) { + break; + } + } + if (necp_drop_dest_policy.entry_count != 0 && + necp_address_matches_drop_dest_policy(&info->remote_addr, policy_search_array[i]->session_order)) { + // We've hit a drop by destination address rule + *return_drop_dest_policy_result = NECP_KERNEL_POLICY_RESULT_DROP; + break; + } + if (info->drop_order != 0 && policy_search_array[i]->session_order >= info->drop_order) { + // We've hit a drop order for this socket break; } if (skip_session_order && policy_search_array[i]->session_order >= skip_session_order) { @@ -7346,18 +8108,24 @@ necp_socket_find_policy_match_with_info_locked(struct necp_kernel_socket_policy // Skip this policy continue; } - if (necp_socket_check_policy(policy_search_array[i], info->application_id, info->real_application_id, info->cred_result, info->account_id, domain_substring, domain_dot_count, info->pid, info->uid, info->bound_interface_index, info->traffic_class, info->protocol, &info->local_addr, &info->remote_addr, required_agent_types, num_required_agent_types, proc)) { + + if (necp_socket_check_policy(policy_search_array[i], info->application_id, info->real_application_id, info->cred_result, info->account_id, domain_substring, domain_dot_count, info->pid, info->uid, info->bound_interface_index, info->traffic_class, info->protocol, &info->local_addr, &info->remote_addr, required_agent_types, num_required_agent_types, info->has_client, info->client_flags, info->is_platform_binary, proc, rt)) { if (policy_search_array[i]->result == NECP_KERNEL_POLICY_RESULT_SOCKET_FILTER) { - if (return_filter && *return_filter == 0) { - *return_filter = policy_search_array[i]->result_parameter.filter_control_unit; + if (return_filter && *return_filter != NECP_FILTER_UNIT_NO_FILTER) { + necp_kernel_policy_filter control_unit = policy_search_array[i]->result_parameter.filter_control_unit; + if (control_unit == NECP_FILTER_UNIT_NO_FILTER) { + *return_filter = control_unit; + } else { + *return_filter |= control_unit; + } if (necp_debug > 1) { NECPLOG(LOG_DEBUG, "Socket Policy: (Application %d Real Application %d BoundInterface %d Proto %d) Filter %d", info->application_id, info->real_application_id, info->bound_interface_index, info->protocol, policy_search_array[i]->result_parameter.filter_control_unit); } } continue; } else if (policy_search_array[i]->result == NECP_KERNEL_POLICY_RESULT_ROUTE_RULES) { - if (return_route_rule_id && route_rule_id_count < MAX_AGGREGATE_ROUTE_RULES) { - route_rule_id_array[route_rule_id_count++] = policy_search_array[i]->result_parameter.route_rule_id; + if (return_route_rule_id_array && route_rule_id_count < route_rule_id_array_count) { + return_route_rule_id_array[route_rule_id_count++] = policy_search_array[i]->result_parameter.route_rule_id; if (necp_debug > 1) { NECPLOG(LOG_DEBUG, "Socket Policy: (Application %d Real Application %d BoundInterface %d Proto %d) Route Rule %d", info->application_id, info->real_application_id, info->bound_interface_index, info->protocol, policy_search_array[i]->result_parameter.route_rule_id); } @@ -7408,6 +8176,12 @@ necp_socket_find_policy_match_with_info_locked(struct necp_kernel_socket_policy continue; } + // Matched an allow unentitled, which clears any drop order + if (policy_search_array[i]->result == NECP_KERNEL_POLICY_RESULT_ALLOW_UNENTITLED) { + info->drop_order = 0; + continue; + } + // Passed all tests, found a match matched_policy = policy_search_array[i]; break; @@ -7415,10 +8189,8 @@ necp_socket_find_policy_match_with_info_locked(struct necp_kernel_socket_policy } } - if (route_rule_id_count == 1) { - *return_route_rule_id = route_rule_id_array[0]; - } else if (route_rule_id_count > 1) { - *return_route_rule_id = necp_create_aggregate_route_rule(route_rule_id_array); + if (return_route_rule_id_array_count != NULL) { + *return_route_rule_id_array_count = route_rule_id_count; } return matched_policy; } @@ -7495,11 +8267,12 @@ necp_socket_find_policy_match(struct inpcb *inp, struct sockaddr *override_local { struct socket *so = NULL; necp_kernel_policy_filter filter_control_unit = 0; - u_int32_t route_rule_id = 0; struct necp_kernel_socket_policy *matched_policy = NULL; necp_kernel_policy_id matched_policy_id = NECP_KERNEL_POLICY_ID_NONE; necp_kernel_policy_result service_action = 0; necp_kernel_policy_service service = { 0, 0 }; + u_int32_t drop_dest_policy_result = NECP_KERNEL_POLICY_RESULT_NONE; + necp_drop_all_bypass_check_result_t drop_all_bypass = NECP_DROP_ALL_BYPASS_CHECK_RESULT_NONE; u_int32_t netagent_ids[NECP_MAX_NETAGENTS]; memset(&netagent_ids, 0, sizeof(netagent_ids)); @@ -7523,10 +8296,12 @@ necp_socket_find_policy_match(struct inpcb *inp, struct sockaddr *override_local so = inp->inp_socket; + u_int32_t drop_order = necp_process_drop_order(so->so_cred); + // Don't lock. Possible race condition, but we don't want the performance hit. if (necp_kernel_socket_policies_count == 0 || (!(inp->inp_flags2 & INP2_WANT_APP_POLICY) && necp_kernel_socket_policies_non_app_count == 0)) { - if (necp_drop_all_order > 0) { + if (necp_drop_all_order > 0 || drop_order > 0) { inp->inp_policyresult.policy_id = NECP_KERNEL_POLICY_ID_NO_MATCH; inp->inp_policyresult.skip_policy_id = NECP_KERNEL_POLICY_ID_NO_MATCH; inp->inp_policyresult.policy_gencount = 0; @@ -7560,7 +8335,7 @@ necp_socket_find_policy_match(struct inpcb *inp, struct sockaddr *override_local // Lock lck_rw_lock_shared(&necp_kernel_policy_lock); - necp_socket_fillout_info_locked(inp, override_local_addr, override_remote_addr, override_bound_interface, &info); + necp_socket_fillout_info_locked(inp, override_local_addr, override_remote_addr, override_bound_interface, drop_order, &info); inp->inp_policyresult.app_id = info.application_id; // Check info @@ -7578,7 +8353,10 @@ necp_socket_find_policy_match(struct inpcb *inp, struct sockaddr *override_local // Match socket to policy necp_kernel_policy_id skip_policy_id; - matched_policy = necp_socket_find_policy_match_with_info_locked(necp_kernel_socket_policies_map[NECP_SOCKET_MAP_APP_ID_TO_BUCKET(info.application_id)], &info, &filter_control_unit, &route_rule_id, &service_action, &service, netagent_ids, NULL, NECP_MAX_NETAGENTS, NULL, 0, current_proc(), &skip_policy_id); + u_int32_t route_rule_id_array[MAX_AGGREGATE_ROUTE_RULES]; + size_t route_rule_id_array_count = 0; + matched_policy = necp_socket_find_policy_match_with_info_locked(necp_kernel_socket_policies_map[NECP_SOCKET_MAP_APP_ID_TO_BUCKET(info.application_id)], &info, &filter_control_unit, route_rule_id_array, &route_rule_id_array_count, MAX_AGGREGATE_ROUTE_RULES, &service_action, &service, netagent_ids, NULL, NECP_MAX_NETAGENTS, NULL, 0, current_proc(), &skip_policy_id, inp->inp_route.ro_rt, &drop_dest_policy_result, &drop_all_bypass); + // If the socket matched a scoped service policy, mark as Drop if not registered. // This covers the cases in which a service is required (on demand) but hasn't started yet. if ((service_action == NECP_KERNEL_POLICY_RESULT_TRIGGER_SCOPED || @@ -7655,6 +8433,15 @@ necp_socket_find_policy_match(struct inpcb *inp, struct sockaddr *override_local } } } + + u_int32_t route_rule_id = 0; + if (route_rule_id_array_count == 1) { + route_rule_id = route_rule_id_array[0]; + } else if (route_rule_id_array_count > 1) { + route_rule_id = necp_create_aggregate_route_rule(route_rule_id_array); + } + + bool reset_tcp_mss = false; if (matched_policy) { matched_policy_id = matched_policy->id; inp->inp_policyresult.policy_id = matched_policy->id; @@ -7677,40 +8464,54 @@ necp_socket_find_policy_match(struct inpcb *inp, struct sockaddr *override_local matched_policy->result == NECP_KERNEL_POLICY_RESULT_IP_TUNNEL && info.protocol == IPPROTO_TCP) { // Reset MSS on TCP socket if tunnel policy changes - tcp_mtudisc(inp, 0); + reset_tcp_mss = true; } if (necp_debug > 1) { NECPLOG(LOG_DEBUG, "Socket Policy: %p (BoundInterface %d Proto %d) Policy %d Result %d Parameter %d", inp->inp_socket, info.bound_interface_index, info.protocol, matched_policy->id, matched_policy->result, matched_policy->result_parameter.tunnel_interface_index); } - } else if (necp_drop_all_order > 0) { - // Mark socket as a drop if set - inp->inp_policyresult.policy_id = NECP_KERNEL_POLICY_ID_NO_MATCH; - inp->inp_policyresult.skip_policy_id = NECP_KERNEL_POLICY_ID_NO_MATCH; - inp->inp_policyresult.policy_gencount = necp_kernel_socket_policies_gencount; - inp->inp_policyresult.flowhash = flowhash; - inp->inp_policyresult.results.filter_control_unit = 0; - inp->inp_policyresult.results.route_rule_id = 0; - inp->inp_policyresult.results.result = NECP_KERNEL_POLICY_RESULT_DROP; } else { - // Mark non-matching socket so we don't re-check it - inp->inp_policyresult.policy_id = NECP_KERNEL_POLICY_ID_NO_MATCH; - inp->inp_policyresult.skip_policy_id = NECP_KERNEL_POLICY_ID_NO_MATCH; - inp->inp_policyresult.policy_gencount = necp_kernel_socket_policies_gencount; - inp->inp_policyresult.flowhash = flowhash; - inp->inp_policyresult.results.filter_control_unit = filter_control_unit; // We may have matched a filter, so mark it! - inp->inp_policyresult.results.route_rule_id = route_rule_id; // We may have matched a route rule, so mark it! - inp->inp_policyresult.results.result = NECP_KERNEL_POLICY_RESULT_NONE; + bool drop_all = false; + if (necp_drop_all_order > 0 || info.drop_order > 0 || drop_dest_policy_result == NECP_KERNEL_POLICY_RESULT_DROP) { + // Mark socket as a drop if set + drop_all = true; + if (drop_all_bypass == NECP_DROP_ALL_BYPASS_CHECK_RESULT_NONE) { + drop_all_bypass = necp_check_drop_all_bypass_result(NULL); + } + } + if (drop_all && drop_all_bypass == NECP_DROP_ALL_BYPASS_CHECK_RESULT_FALSE) { + inp->inp_policyresult.policy_id = NECP_KERNEL_POLICY_ID_NO_MATCH; + inp->inp_policyresult.skip_policy_id = NECP_KERNEL_POLICY_ID_NO_MATCH; + inp->inp_policyresult.policy_gencount = necp_kernel_socket_policies_gencount; + inp->inp_policyresult.flowhash = flowhash; + inp->inp_policyresult.results.filter_control_unit = 0; + inp->inp_policyresult.results.route_rule_id = 0; + inp->inp_policyresult.results.result = NECP_KERNEL_POLICY_RESULT_DROP; + } else { + // Mark non-matching socket so we don't re-check it + inp->inp_policyresult.policy_id = NECP_KERNEL_POLICY_ID_NO_MATCH; + inp->inp_policyresult.skip_policy_id = NECP_KERNEL_POLICY_ID_NO_MATCH; + inp->inp_policyresult.policy_gencount = necp_kernel_socket_policies_gencount; + inp->inp_policyresult.flowhash = flowhash; + inp->inp_policyresult.results.filter_control_unit = filter_control_unit; // We may have matched a filter, so mark it! + inp->inp_policyresult.results.route_rule_id = route_rule_id; // We may have matched a route rule, so mark it! + inp->inp_policyresult.results.result = NECP_KERNEL_POLICY_RESULT_NONE; + } } // Unlock lck_rw_done(&necp_kernel_policy_lock); + if (reset_tcp_mss) { + // Update MSS when not holding the policy lock to avoid recursive locking + tcp_mtudisc(inp, 0); + } + return matched_policy_id; } static bool -necp_ip_output_check_policy(struct necp_kernel_ip_output_policy *kernel_policy, necp_kernel_policy_id socket_policy_id, necp_kernel_policy_id socket_skip_policy_id, u_int32_t bound_interface_index, u_int32_t last_interface_index, u_int16_t protocol, union necp_sockaddr_union *local, union necp_sockaddr_union *remote) +necp_ip_output_check_policy(struct necp_kernel_ip_output_policy *kernel_policy, necp_kernel_policy_id socket_policy_id, necp_kernel_policy_id socket_skip_policy_id, u_int32_t bound_interface_index, u_int32_t last_interface_index, u_int16_t protocol, union necp_sockaddr_union *local, union necp_sockaddr_union *remote, struct rtentry *rt) { if (!(kernel_policy->condition_mask & NECP_KERNEL_CONDITION_ALL_INTERFACES)) { if (kernel_policy->condition_mask & NECP_KERNEL_CONDITION_BOUND_INTERFACE) { @@ -7767,6 +8568,21 @@ necp_ip_output_check_policy(struct necp_kernel_ip_output_policy *kernel_policy, } } + if (kernel_policy->condition_mask & NECP_KERNEL_CONDITION_LOCAL_NETWORKS) { + bool is_local = FALSE; + + if (rt != NULL) { + is_local = IS_NECP_DEST_IN_LOCAL_NETWORKS(rt); + } else { + is_local = necp_is_route_local(remote); + } + + if (!is_local) { + // Either no route to validate or no match for local networks + return FALSE; + } + } + if (kernel_policy->condition_mask & NECP_KERNEL_CONDITION_LOCAL_START) { if (kernel_policy->condition_mask & NECP_KERNEL_CONDITION_LOCAL_END) { bool inRange = necp_is_addr_in_range((struct sockaddr *)local, (struct sockaddr *)&kernel_policy->cond_local_start, (struct sockaddr *)&kernel_policy->cond_local_end); @@ -7823,17 +8639,43 @@ necp_ip_output_check_policy(struct necp_kernel_ip_output_policy *kernel_policy, } static inline struct necp_kernel_ip_output_policy * -necp_ip_output_find_policy_match_locked(necp_kernel_policy_id socket_policy_id, necp_kernel_policy_id socket_skip_policy_id, u_int32_t bound_interface_index, u_int32_t last_interface_index, u_int16_t protocol, union necp_sockaddr_union *local_addr, union necp_sockaddr_union *remote_addr) +necp_ip_output_find_policy_match_locked(necp_kernel_policy_id socket_policy_id, necp_kernel_policy_id socket_skip_policy_id, u_int32_t bound_interface_index, u_int32_t last_interface_index, u_int16_t protocol, union necp_sockaddr_union *local_addr, union necp_sockaddr_union *remote_addr, struct rtentry *rt, u_int32_t *return_route_rule_id, necp_kernel_policy_result *return_drop_dest_policy_result, necp_drop_all_bypass_check_result_t *return_drop_all_bypass) { u_int32_t skip_order = 0; u_int32_t skip_session_order = 0; - int i; struct necp_kernel_ip_output_policy *matched_policy = NULL; struct necp_kernel_ip_output_policy **policy_search_array = necp_kernel_ip_output_policies_map[NECP_IP_OUTPUT_MAP_ID_TO_BUCKET(socket_policy_id)]; + u_int32_t route_rule_id_array[MAX_AGGREGATE_ROUTE_RULES]; + size_t route_rule_id_count = 0; + necp_drop_all_bypass_check_result_t drop_all_bypass = NECP_DROP_ALL_BYPASS_CHECK_RESULT_NONE; + if (return_drop_all_bypass != NULL) { + *return_drop_all_bypass = drop_all_bypass; + } + + if (return_route_rule_id != NULL) { + *return_route_rule_id = 0; + } + + *return_drop_dest_policy_result = NECP_KERNEL_POLICY_RESULT_NONE; + if (policy_search_array != NULL) { - for (i = 0; policy_search_array[i] != NULL; i++) { + for (int i = 0; policy_search_array[i] != NULL; i++) { if (necp_drop_all_order != 0 && policy_search_array[i]->session_order >= necp_drop_all_order) { // We've hit a drop all rule + if (drop_all_bypass == NECP_DROP_ALL_BYPASS_CHECK_RESULT_NONE) { + drop_all_bypass = necp_check_drop_all_bypass_result(NULL); + if (return_drop_all_bypass != NULL) { + *return_drop_all_bypass = drop_all_bypass; + } + } + if (drop_all_bypass == NECP_DROP_ALL_BYPASS_CHECK_RESULT_FALSE) { + break; + } + } + if (necp_drop_dest_policy.entry_count > 0 && + necp_address_matches_drop_dest_policy(remote_addr, policy_search_array[i]->session_order)) { + // We've hit a drop by destination address rule + *return_drop_dest_policy_result = NECP_KERNEL_POLICY_RESULT_DROP; break; } if (skip_session_order && policy_search_array[i]->session_order >= skip_session_order) { @@ -7854,21 +8696,32 @@ necp_ip_output_find_policy_match_locked(necp_kernel_policy_id socket_policy_id, // Skip this policy continue; } - if (necp_ip_output_check_policy(policy_search_array[i], socket_policy_id, socket_skip_policy_id, bound_interface_index, last_interface_index, protocol, local_addr, remote_addr)) { - // Passed all tests, found a match - matched_policy = policy_search_array[i]; - if (policy_search_array[i]->result == NECP_KERNEL_POLICY_RESULT_SKIP) { + if (necp_ip_output_check_policy(policy_search_array[i], socket_policy_id, socket_skip_policy_id, bound_interface_index, last_interface_index, protocol, local_addr, remote_addr, rt)) { + if (policy_search_array[i]->result == NECP_KERNEL_POLICY_RESULT_ROUTE_RULES) { + if (return_route_rule_id != NULL && route_rule_id_count < MAX_AGGREGATE_ROUTE_RULES) { + route_rule_id_array[route_rule_id_count++] = policy_search_array[i]->result_parameter.route_rule_id; + } + continue; + } else if (policy_search_array[i]->result == NECP_KERNEL_POLICY_RESULT_SKIP) { skip_order = policy_search_array[i]->result_parameter.skip_policy_order; skip_session_order = policy_search_array[i]->session_order + 1; continue; } + // Passed all tests, found a match + matched_policy = policy_search_array[i]; break; } } } + if (route_rule_id_count == 1) { + *return_route_rule_id = route_rule_id_array[0]; + } else if (route_rule_id_count > 1) { + *return_route_rule_id = necp_create_aggregate_route_rule(route_rule_id_array); + } + return matched_policy; } @@ -7888,7 +8741,8 @@ necp_output_bypass(struct mbuf *packet) } necp_kernel_policy_id -necp_ip_output_find_policy_match(struct mbuf *packet, int flags, struct ip_out_args *ipoa, necp_kernel_policy_result *result, necp_kernel_policy_result_parameter *result_parameter) +necp_ip_output_find_policy_match(struct mbuf *packet, int flags, struct ip_out_args *ipoa, struct rtentry *rt, + necp_kernel_policy_result *result, necp_kernel_policy_result_parameter *result_parameter) { struct ip *ip = NULL; int hlen = sizeof(struct ip); @@ -7901,6 +8755,8 @@ necp_ip_output_find_policy_match(struct mbuf *packet, int flags, struct ip_out_a u_int32_t last_interface_index = 0; union necp_sockaddr_union local_addr; union necp_sockaddr_union remote_addr; + u_int32_t drop_dest_policy_result = NECP_KERNEL_POLICY_RESULT_NONE; + necp_drop_all_bypass_check_result_t drop_all_bypass = NECP_DROP_ALL_BYPASS_CHECK_RESULT_NONE; if (result) { *result = 0; @@ -7920,7 +8776,7 @@ necp_ip_output_find_policy_match(struct mbuf *packet, int flags, struct ip_out_a // Exit early for an empty list // Don't lock. Possible race condition, but we don't want the performance hit. if (necp_kernel_ip_output_policies_count == 0 || - ((socket_policy_id == NECP_KERNEL_POLICY_ID_NONE) && necp_kernel_ip_output_policies_non_id_count == 0)) { + (socket_policy_id == NECP_KERNEL_POLICY_ID_NONE && necp_kernel_ip_output_policies_non_id_count == 0 && necp_drop_dest_policy.entry_count == 0)) { if (necp_drop_all_order > 0) { matched_policy_id = NECP_KERNEL_POLICY_ID_NO_MATCH; if (result) { @@ -7998,7 +8854,8 @@ necp_ip_output_find_policy_match(struct mbuf *packet, int flags, struct ip_out_a // Match packet to policy lck_rw_lock_shared(&necp_kernel_policy_lock); - matched_policy = necp_ip_output_find_policy_match_locked(socket_policy_id, socket_skip_policy_id, bound_interface_index, last_interface_index, protocol, &local_addr, &remote_addr); + u_int32_t route_rule_id = 0; + matched_policy = necp_ip_output_find_policy_match_locked(socket_policy_id, socket_skip_policy_id, bound_interface_index, last_interface_index, protocol, &local_addr, &remote_addr, rt, &route_rule_id, &drop_dest_policy_result, &drop_all_bypass); if (matched_policy) { matched_policy_id = matched_policy->id; if (result) { @@ -8009,13 +8866,36 @@ necp_ip_output_find_policy_match(struct mbuf *packet, int flags, struct ip_out_a memcpy(result_parameter, &matched_policy->result_parameter, sizeof(matched_policy->result_parameter)); } + if (route_rule_id != 0 && + packet->m_pkthdr.necp_mtag.necp_route_rule_id == 0) { + packet->m_pkthdr.necp_mtag.necp_route_rule_id = route_rule_id; + } + if (necp_debug > 1) { - NECPLOG(LOG_DEBUG, "IP Output: (ID %d BoundInterface %d LastInterface %d Proto %d) Policy %d Result %d Parameter %d", socket_policy_id, bound_interface_index, last_interface_index, protocol, matched_policy->id, matched_policy->result, matched_policy->result_parameter.tunnel_interface_index); + NECPLOG(LOG_DEBUG, "IP Output: (ID %d BoundInterface %d LastInterface %d Proto %d) Policy %d Result %d Parameter %d Route Rule %u", socket_policy_id, bound_interface_index, last_interface_index, protocol, matched_policy->id, matched_policy->result, matched_policy->result_parameter.tunnel_interface_index, route_rule_id); } - } else if (necp_drop_all_order > 0) { - matched_policy_id = NECP_KERNEL_POLICY_ID_NO_MATCH; - if (result) { - *result = NECP_KERNEL_POLICY_RESULT_DROP; + } else { + bool drop_all = false; + /* + * Apply drop-all only to packets which have never matched a primary policy (check + * if the packet saved policy id is none or falls within the socket policy id range). + */ + if (socket_policy_id < NECP_KERNEL_POLICY_ID_FIRST_VALID_IP && + (necp_drop_all_order > 0 || drop_dest_policy_result == NECP_KERNEL_POLICY_RESULT_DROP)) { + drop_all = true; + if (drop_all_bypass == NECP_DROP_ALL_BYPASS_CHECK_RESULT_NONE) { + drop_all_bypass = necp_check_drop_all_bypass_result(NULL); + } + } + if (drop_all && drop_all_bypass == NECP_DROP_ALL_BYPASS_CHECK_RESULT_FALSE) { + matched_policy_id = NECP_KERNEL_POLICY_ID_NO_MATCH; + if (result) { + *result = NECP_KERNEL_POLICY_RESULT_DROP; + } + } else if (route_rule_id != 0 && + packet->m_pkthdr.necp_mtag.necp_route_rule_id == 0) { + // If we matched a route rule, mark it + packet->m_pkthdr.necp_mtag.necp_route_rule_id = route_rule_id; } } @@ -8025,7 +8905,8 @@ necp_ip_output_find_policy_match(struct mbuf *packet, int flags, struct ip_out_a } necp_kernel_policy_id -necp_ip6_output_find_policy_match(struct mbuf *packet, int flags, struct ip6_out_args *ip6oa, necp_kernel_policy_result *result, necp_kernel_policy_result_parameter *result_parameter) +necp_ip6_output_find_policy_match(struct mbuf *packet, int flags, struct ip6_out_args *ip6oa, struct rtentry *rt, + necp_kernel_policy_result *result, necp_kernel_policy_result_parameter *result_parameter) { struct ip6_hdr *ip6 = NULL; int next = -1; @@ -8039,6 +8920,8 @@ necp_ip6_output_find_policy_match(struct mbuf *packet, int flags, struct ip6_out u_int32_t last_interface_index = 0; union necp_sockaddr_union local_addr; union necp_sockaddr_union remote_addr; + u_int32_t drop_dest_policy_result = NECP_KERNEL_POLICY_RESULT_NONE; + necp_drop_all_bypass_check_result_t drop_all_bypass = NECP_DROP_ALL_BYPASS_CHECK_RESULT_NONE; if (result) { *result = 0; @@ -8058,7 +8941,7 @@ necp_ip6_output_find_policy_match(struct mbuf *packet, int flags, struct ip6_out // Exit early for an empty list // Don't lock. Possible race condition, but we don't want the performance hit. if (necp_kernel_ip_output_policies_count == 0 || - ((socket_policy_id == NECP_KERNEL_POLICY_ID_NONE) && necp_kernel_ip_output_policies_non_id_count == 0)) { + (socket_policy_id == NECP_KERNEL_POLICY_ID_NONE && necp_kernel_ip_output_policies_non_id_count == 0 && necp_drop_dest_policy.entry_count == 0)) { if (necp_drop_all_order > 0) { matched_policy_id = NECP_KERNEL_POLICY_ID_NO_MATCH; if (result) { @@ -8133,7 +9016,8 @@ necp_ip6_output_find_policy_match(struct mbuf *packet, int flags, struct ip6_out // Match packet to policy lck_rw_lock_shared(&necp_kernel_policy_lock); - matched_policy = necp_ip_output_find_policy_match_locked(socket_policy_id, socket_skip_policy_id, bound_interface_index, last_interface_index, protocol, &local_addr, &remote_addr); + u_int32_t route_rule_id = 0; + matched_policy = necp_ip_output_find_policy_match_locked(socket_policy_id, socket_skip_policy_id, bound_interface_index, last_interface_index, protocol, &local_addr, &remote_addr, rt, &route_rule_id, &drop_dest_policy_result, &drop_all_bypass); if (matched_policy) { matched_policy_id = matched_policy->id; if (result) { @@ -8144,13 +9028,36 @@ necp_ip6_output_find_policy_match(struct mbuf *packet, int flags, struct ip6_out memcpy(result_parameter, &matched_policy->result_parameter, sizeof(matched_policy->result_parameter)); } + if (route_rule_id != 0 && + packet->m_pkthdr.necp_mtag.necp_route_rule_id == 0) { + packet->m_pkthdr.necp_mtag.necp_route_rule_id = route_rule_id; + } + if (necp_debug > 1) { - NECPLOG(LOG_DEBUG, "IP6 Output: (ID %d BoundInterface %d LastInterface %d Proto %d) Policy %d Result %d Parameter %d", socket_policy_id, bound_interface_index, last_interface_index, protocol, matched_policy->id, matched_policy->result, matched_policy->result_parameter.tunnel_interface_index); + NECPLOG(LOG_DEBUG, "IP6 Output: (ID %d BoundInterface %d LastInterface %d Proto %d) Policy %d Result %d Parameter %d Route Rule %u", socket_policy_id, bound_interface_index, last_interface_index, protocol, matched_policy->id, matched_policy->result, matched_policy->result_parameter.tunnel_interface_index, route_rule_id); } - } else if (necp_drop_all_order > 0) { - matched_policy_id = NECP_KERNEL_POLICY_ID_NO_MATCH; - if (result) { - *result = NECP_KERNEL_POLICY_RESULT_DROP; + } else { + bool drop_all = false; + /* + * Apply drop-all only to packets which have never matched a primary policy (check + * if the packet saved policy id is none or falls within the socket policy id range). + */ + if (socket_policy_id < NECP_KERNEL_POLICY_ID_FIRST_VALID_IP && + (necp_drop_all_order > 0 || drop_dest_policy_result == NECP_KERNEL_POLICY_RESULT_DROP)) { + drop_all = true; + if (drop_all_bypass == NECP_DROP_ALL_BYPASS_CHECK_RESULT_NONE) { + drop_all_bypass = necp_check_drop_all_bypass_result(NULL); + } + } + if (drop_all && drop_all_bypass == NECP_DROP_ALL_BYPASS_CHECK_RESULT_FALSE) { + matched_policy_id = NECP_KERNEL_POLICY_ID_NO_MATCH; + if (result) { + *result = NECP_KERNEL_POLICY_RESULT_DROP; + } + } else if (route_rule_id != 0 && + packet->m_pkthdr.necp_mtag.necp_route_rule_id == 0) { + // If we matched a route rule, mark it + packet->m_pkthdr.necp_mtag.necp_route_rule_id = route_rule_id; } } @@ -8360,6 +9267,54 @@ necp_buffer_compare_with_bit_prefix(u_int8_t *p1, u_int8_t *p2, u_int32_t bits) return TRUE; } +static bool +necp_addr_is_empty(struct sockaddr *addr) +{ + if (addr == NULL) { + return TRUE; + } + + if (addr->sa_len == 0) { + return TRUE; + } + + switch (addr->sa_family) { + case AF_INET: { + static struct sockaddr_in ipv4_empty_address = { + .sin_len = sizeof(struct sockaddr_in), + .sin_family = AF_INET, + .sin_port = 0, + .sin_addr = { .s_addr = 0 }, // 0.0.0.0 + .sin_zero = {0}, + }; + if (necp_addr_compare(addr, (struct sockaddr *)&ipv4_empty_address, 0) == 0) { + return TRUE; + } else { + return FALSE; + } + } + case AF_INET6: { + static struct sockaddr_in6 ipv6_empty_address = { + .sin6_len = sizeof(struct sockaddr_in6), + .sin6_family = AF_INET6, + .sin6_port = 0, + .sin6_flowinfo = 0, + .sin6_addr = IN6ADDR_ANY_INIT, // :: + .sin6_scope_id = 0, + }; + if (necp_addr_compare(addr, (struct sockaddr *)&ipv6_empty_address, 0) == 0) { + return TRUE; + } else { + return FALSE; + } + } + default: + return FALSE; + } + + return FALSE; +} + static bool necp_update_qos_marking(struct ifnet *ifp, u_int32_t route_rule_id) { @@ -8399,12 +9354,13 @@ necp_update_qos_marking(struct ifnet *ifp, u_int32_t route_rule_id) if ((route_rule->cellular_action == NECP_ROUTE_RULE_QOS_MARKING && IFNET_IS_CELLULAR(ifp)) || (route_rule->wifi_action == NECP_ROUTE_RULE_QOS_MARKING && IFNET_IS_WIFI(ifp)) || (route_rule->wired_action == NECP_ROUTE_RULE_QOS_MARKING && IFNET_IS_WIRED(ifp)) || - (route_rule->expensive_action == NECP_ROUTE_RULE_QOS_MARKING && IFNET_IS_EXPENSIVE(ifp))) { + (route_rule->expensive_action == NECP_ROUTE_RULE_QOS_MARKING && IFNET_IS_EXPENSIVE(ifp)) || + (route_rule->constrained_action == NECP_ROUTE_RULE_QOS_MARKING && IFNET_IS_CONSTRAINED(ifp))) { qos_marking = TRUE; if (necp_debug > 2) { - NECPLOG(LOG_DEBUG, "QoS Marking: C:%d WF:%d W:%d E:%d for Rule %d Allowed %d", + NECPLOG(LOG_DEBUG, "QoS Marking: C:%d WF:%d W:%d E:%d Cn:%d for Rule %d Allowed %d", route_rule->cellular_action, route_rule->wifi_action, route_rule->wired_action, - route_rule->expensive_action, route_rule_id, qos_marking); + route_rule->expensive_action, route_rule->constrained_action, route_rule_id, qos_marking); } goto done; } @@ -8630,6 +9586,22 @@ necp_route_is_allowed_inner(struct rtentry *route, struct ifnet *ifp, u_int32_t } } + if (IFNET_IS_CONSTRAINED(ifp)) { + if (route_rule->constrained_action == NECP_ROUTE_RULE_DENY_LQM_ABORT) { + if (necp_route_is_lqm_abort(ifp, delegated_ifp)) { + // Mark aggregate action as deny + type_aggregate_action = NECP_ROUTE_RULE_DENY_INTERFACE; + } + } else if (IS_NECP_ROUTE_RULE_ALLOW_OR_DENY(route_rule->constrained_action)) { + if (type_aggregate_action == NECP_ROUTE_RULE_NONE || + (type_aggregate_action == NECP_ROUTE_RULE_ALLOW_INTERFACE && + route_rule->constrained_action == NECP_ROUTE_RULE_DENY_INTERFACE)) { + // Deny wins if there is a conflict + type_aggregate_action = route_rule->constrained_action; + } + } + } + if (type_aggregate_action != NECP_ROUTE_RULE_NONE) { if (necp_debug > 1) { NECPLOG(LOG_DEBUG, "Route Allowed: C:%d WF:%d W:%d E:%d for Rule %d Allowed %d", route_rule->cellular_action, route_rule->wifi_action, route_rule->wired_action, route_rule->expensive_action, route_rule_id, ((type_aggregate_action == NECP_ROUTE_RULE_DENY_INTERFACE) ? FALSE : TRUE)); @@ -8726,7 +9698,8 @@ necp_socket_is_allowed_to_send_recv_internal(struct inpcb *inp, struct sockaddr u_int32_t route_rule_id = 0; struct rtentry *route = NULL; u_int32_t interface_type_denied = IFRTYPE_FUNCTIONAL_UNKNOWN; - + necp_kernel_policy_result drop_dest_policy_result = NECP_KERNEL_POLICY_RESULT_NONE; + necp_drop_all_bypass_check_result_t drop_all_bypass = NECP_DROP_ALL_BYPASS_CHECK_RESULT_NONE; u_int32_t netagent_ids[NECP_MAX_NETAGENTS]; memset(&netagent_ids, 0, sizeof(netagent_ids)); @@ -8746,10 +9719,14 @@ necp_socket_is_allowed_to_send_recv_internal(struct inpcb *inp, struct sockaddr route = inp->inp_route.ro_rt; + struct socket *so = inp->inp_socket; + + u_int32_t drop_order = necp_process_drop_order(so->so_cred); + // Don't lock. Possible race condition, but we don't want the performance hit. if (necp_kernel_socket_policies_count == 0 || (!(inp->inp_flags2 & INP2_WANT_APP_POLICY) && necp_kernel_socket_policies_non_app_count == 0)) { - if (necp_drop_all_order > 0) { + if (necp_drop_all_order > 0 || drop_order > 0) { if (necp_socket_bypass(override_local_addr, override_remote_addr, inp)) { allowed_to_receive = TRUE; } else { @@ -8806,7 +9783,7 @@ necp_socket_is_allowed_to_send_recv_internal(struct inpcb *inp, struct sockaddr // Actually calculate policy result lck_rw_lock_shared(&necp_kernel_policy_lock); - necp_socket_fillout_info_locked(inp, override_local_addr, override_remote_addr, 0, &info); + necp_socket_fillout_info_locked(inp, override_local_addr, override_remote_addr, 0, drop_order, &info); flowhash = necp_socket_calc_flowhash_locked(&info); if (inp->inp_policyresult.policy_id != NECP_KERNEL_POLICY_ID_NONE && @@ -8834,7 +9811,16 @@ necp_socket_is_allowed_to_send_recv_internal(struct inpcb *inp, struct sockaddr goto done; } - struct necp_kernel_socket_policy *matched_policy = necp_socket_find_policy_match_with_info_locked(necp_kernel_socket_policies_map[NECP_SOCKET_MAP_APP_ID_TO_BUCKET(info.application_id)], &info, NULL, &route_rule_id, &service_action, &service, netagent_ids, NULL, NECP_MAX_NETAGENTS, NULL, 0, current_proc(), return_skip_policy_id); + u_int32_t route_rule_id_array[MAX_AGGREGATE_ROUTE_RULES]; + size_t route_rule_id_array_count = 0; + struct necp_kernel_socket_policy *matched_policy = necp_socket_find_policy_match_with_info_locked(necp_kernel_socket_policies_map[NECP_SOCKET_MAP_APP_ID_TO_BUCKET(info.application_id)], &info, NULL, route_rule_id_array, &route_rule_id_array_count, MAX_AGGREGATE_ROUTE_RULES, &service_action, &service, netagent_ids, NULL, NECP_MAX_NETAGENTS, NULL, 0, current_proc(), return_skip_policy_id, inp->inp_route.ro_rt, &drop_dest_policy_result, &drop_all_bypass); + + if (route_rule_id_array_count == 1) { + route_rule_id = route_rule_id_array[0]; + } else if (route_rule_id_array_count > 1) { + route_rule_id = necp_create_aggregate_route_rule(route_rule_id_array); + } + if (matched_policy != NULL) { if (matched_policy->result == NECP_KERNEL_POLICY_RESULT_DROP || matched_policy->result == NECP_KERNEL_POLICY_RESULT_SOCKET_DIVERT || @@ -8861,14 +9847,23 @@ necp_socket_is_allowed_to_send_recv_internal(struct inpcb *inp, struct sockaddr NECPLOG(LOG_DEBUG, "Socket Send/Recv Policy: Policy %d Allowed %d", return_policy_id ? *return_policy_id : 0, allowed_to_receive); } goto done; - } else if (necp_drop_all_order > 0) { - allowed_to_receive = FALSE; } else { - if (return_policy_id) { - *return_policy_id = NECP_KERNEL_POLICY_ID_NO_MATCH; + bool drop_all = false; + if (necp_drop_all_order > 0 || info.drop_order > 0 || drop_dest_policy_result == NECP_KERNEL_POLICY_RESULT_DROP) { + drop_all = true; + if (drop_all_bypass == NECP_DROP_ALL_BYPASS_CHECK_RESULT_NONE) { + drop_all_bypass = necp_check_drop_all_bypass_result(NULL); + } } - if (return_route_rule_id) { - *return_route_rule_id = route_rule_id; + if (drop_all && drop_all_bypass == NECP_DROP_ALL_BYPASS_CHECK_RESULT_FALSE) { + allowed_to_receive = FALSE; + } else { + if (return_policy_id) { + *return_policy_id = NECP_KERNEL_POLICY_ID_NO_MATCH; + } + if (return_route_rule_id) { + *return_route_rule_id = route_rule_id; + } } } @@ -8915,10 +9910,13 @@ necp_socket_is_allowed_to_send_recv_v6(struct inpcb *inp, u_int16_t local_port, } bool -necp_socket_is_allowed_to_send_recv(struct inpcb *inp, necp_kernel_policy_id *return_policy_id, u_int32_t *return_route_rule_id, +necp_socket_is_allowed_to_send_recv(struct inpcb *inp, ifnet_t interface, necp_kernel_policy_id *return_policy_id, + u_int32_t *return_route_rule_id, necp_kernel_policy_id *return_skip_policy_id) { - return necp_socket_is_allowed_to_send_recv_internal(inp, NULL, NULL, NULL, return_policy_id, return_route_rule_id, return_skip_policy_id); + return necp_socket_is_allowed_to_send_recv_internal(inp, NULL, NULL, interface, + return_policy_id, return_route_rule_id, + return_skip_policy_id); } int @@ -8946,8 +9944,18 @@ necp_mark_packet_from_socket(struct mbuf *packet, struct inpcb *inp, necp_kernel } packet->m_pkthdr.necp_mtag.necp_app_id = inp->inp_policyresult.app_id; - if (skip_policy_id != NECP_KERNEL_POLICY_ID_NONE) { + if (skip_policy_id != NECP_KERNEL_POLICY_ID_NONE && + skip_policy_id != NECP_KERNEL_POLICY_ID_NO_MATCH) { + // Only mark the skip policy if it is a valid policy ID packet->m_pkthdr.necp_mtag.necp_skip_policy_id = skip_policy_id; + } else if (inp->inp_policyresult.results.filter_control_unit == NECP_FILTER_UNIT_NO_FILTER) { + // Overload the meaning of "NECP_KERNEL_POLICY_ID_NO_MATCH" + // to indicate that NECP_FILTER_UNIT_NO_FILTER was set + // See necp_get_skip_policy_id_from_packet() and + // necp_packet_should_skip_filters(). + packet->m_pkthdr.necp_mtag.necp_skip_policy_id = NECP_KERNEL_POLICY_ID_NO_MATCH; + } else { + packet->m_pkthdr.necp_mtag.necp_skip_policy_id = NECP_KERNEL_POLICY_ID_NONE; } return 0; @@ -9018,9 +10026,25 @@ necp_get_skip_policy_id_from_packet(struct mbuf *packet) return NECP_KERNEL_POLICY_ID_NONE; } + // Check for overloaded value. See necp_mark_packet_from_socket(). + if (packet->m_pkthdr.necp_mtag.necp_skip_policy_id == NECP_KERNEL_POLICY_ID_NO_MATCH) { + return NECP_KERNEL_POLICY_ID_NONE; + } + return packet->m_pkthdr.necp_mtag.necp_skip_policy_id; } +bool +necp_packet_should_skip_filters(struct mbuf *packet) +{ + if (packet == NULL || !(packet->m_flags & M_PKTHDR)) { + return false; + } + + // Check for overloaded value. See necp_mark_packet_from_socket(). + return packet->m_pkthdr.necp_mtag.necp_skip_policy_id == NECP_KERNEL_POLICY_ID_NO_MATCH; +} + u_int32_t necp_get_last_interface_index_from_packet(struct mbuf *packet) { @@ -9332,7 +10356,17 @@ static bool necp_is_intcoproc(struct inpcb *inp, struct mbuf *packet) { if (inp != NULL) { - return sflt_permission_check(inp) ? true : false; + if (!(inp->inp_vflag & INP_IPV6)) { + return false; + } + if (INP_INTCOPROC_ALLOWED(inp)) { + return true; + } + if ((inp->inp_flags & INP_BOUND_IF) && + IFNET_IS_INTCOPROC(inp->inp_boundifp)) { + return true; + } + return false; } if (packet != NULL) { struct ip6_hdr *ip6 = mtod(packet, struct ip6_hdr *); @@ -9346,3 +10380,176 @@ necp_is_intcoproc(struct inpcb *inp, struct mbuf *packet) return false; } + +static bool +necp_address_matches_drop_dest_policy(union necp_sockaddr_union *sau, u_int32_t session_order) +{ + char dest_str[MAX_IPv6_STR_LEN]; + + if (necp_drop_dest_debug > 0) { + if (sau->sa.sa_family == AF_INET) { + (void) inet_ntop(AF_INET, &sau->sin.sin_addr, dest_str, sizeof(dest_str)); + } else if (sau->sa.sa_family == AF_INET6) { + (void) inet_ntop(AF_INET6, &sau->sin6.sin6_addr, dest_str, sizeof(dest_str)); + } else { + dest_str[0] = 0; + } + } + for (u_int32_t i = 0; i < necp_drop_dest_policy.entry_count; i++) { + struct necp_drop_dest_entry *necp_drop_dest_entry = &necp_drop_dest_policy.entries[i]; + struct necp_policy_condition_addr *npca = &necp_drop_dest_entry->cond_addr; + + if (session_order >= necp_drop_dest_entry->order && necp_is_addr_in_subnet(&sau->sa, &npca->address.sa, npca->prefix)) { + if (necp_drop_dest_debug > 0) { + char subnet_str[MAX_IPv6_STR_LEN]; + struct proc *p = current_proc(); + pid_t pid = proc_pid(p); + + if (sau->sa.sa_family == AF_INET) { + (void) inet_ntop(AF_INET, &npca->address.sin, subnet_str, sizeof(subnet_str)); + os_log(OS_LOG_DEFAULT, "%s (process %s:%u) %s matches %s/%u", __func__, proc_best_name(p), pid, dest_str, subnet_str, npca->prefix); + } else if (sau->sa.sa_family == AF_INET6) { + (void) inet_ntop(AF_INET6, &npca->address.sin6, subnet_str, sizeof(subnet_str)); + os_log(OS_LOG_DEFAULT, "%s (process %s:%u) %s matches %s/%u", __func__, proc_best_name(p), pid, dest_str, subnet_str, npca->prefix); + } + } + return true; + } + } + if (necp_drop_dest_debug > 1) { + struct proc *p = current_proc(); + pid_t pid = proc_pid(p); + + os_log(OS_LOG_DEFAULT, "%s (process %s:%u) %s no match", __func__, proc_best_name(p), pid, dest_str); + } + return false; +} + +static int +sysctl_handle_necp_drop_dest_level SYSCTL_HANDLER_ARGS +{ +#pragma unused(arg1, arg2, oidp) + int changed = 0; + int error = 0; + struct necp_drop_dest_policy tmp_drop_dest_policy; + struct proc *p = current_proc(); + pid_t pid = proc_pid(p); + + if (req->newptr != USER_ADDR_NULL && proc_suser(current_proc()) != 0 && + priv_check_cred(kauth_cred_get(), PRIV_NET_PRIVILEGED_NECP_POLICIES, 0) != 0) { + NECPLOG(LOG_ERR, "%s (process %s:%u) not permitted", __func__, proc_best_name(p), pid); + return EPERM; + } + if (req->newptr != USER_ADDR_NULL && req->newlen != sizeof(struct necp_drop_dest_policy)) { + NECPLOG(LOG_ERR, "%s (process %s:%u) bad newlen %lu", __func__, proc_best_name(p), pid, req->newlen); + return EINVAL; + } + + memcpy(&tmp_drop_dest_policy, &necp_drop_dest_policy, sizeof(struct necp_drop_dest_policy)); + error = sysctl_io_opaque(req, &tmp_drop_dest_policy, sizeof(struct necp_drop_dest_policy), &changed); + if (error != 0) { + NECPLOG(LOG_ERR, "%s (process %s:%u) sysctl_io_opaque() error %d", __func__, proc_best_name(p), pid, error); + return error; + } + if (changed == 0 || req->newptr == USER_ADDR_NULL) { + return error; + } + + // + // Validate the passed parameters + // + if (tmp_drop_dest_policy.entry_count >= MAX_NECP_DROP_DEST_LEVEL_ADDRS) { + NECPLOG(LOG_ERR, "%s (process %s:%u) bad entry_count %u", __func__, proc_best_name(p), pid, tmp_drop_dest_policy.entry_count); + return EINVAL; + } + for (u_int32_t i = 0; i < tmp_drop_dest_policy.entry_count; i++) { + struct necp_drop_dest_entry *tmp_drop_dest_entry = &tmp_drop_dest_policy.entries[i]; + struct necp_policy_condition_addr *npca = &tmp_drop_dest_entry->cond_addr; + + switch (tmp_drop_dest_entry->level) { + case NECP_SESSION_PRIORITY_UNKNOWN: + if (tmp_drop_dest_policy.entry_count != 0) { + NECPLOG(LOG_ERR, "%s (process %s:%u) NECP_SESSION_PRIORITY_UNKNOWN bad entry_count %u", __func__, proc_best_name(p), pid, tmp_drop_dest_policy.entry_count); + return EINVAL; + } + break; + case NECP_SESSION_PRIORITY_CONTROL: + case NECP_SESSION_PRIORITY_PRIVILEGED_TUNNEL: + case NECP_SESSION_PRIORITY_HIGH: + case NECP_SESSION_PRIORITY_DEFAULT: + case NECP_SESSION_PRIORITY_LOW: + if (tmp_drop_dest_policy.entry_count == 0) { + NECPLOG(LOG_ERR, "%s (process %s:%u) priority %u entry_count 0", __func__, proc_best_name(p), pid, tmp_drop_dest_entry->level); + return EINVAL; + } + break; + default: { + NECPLOG(LOG_ERR, "%s (process %s:%u) bad level %u", __func__, proc_best_name(p), pid, tmp_drop_dest_entry->level); + return EINVAL; + } + } + + switch (npca->address.sa.sa_family) { + case AF_INET: { + if (npca->prefix > 32) { + NECPLOG(LOG_ERR, "%s (process %s:%u) AF_INET bad prefix %u", __func__, proc_best_name(p), pid, npca->prefix); + return EINVAL; + } + if (npca->address.sin.sin_len != sizeof(struct sockaddr_in)) { + NECPLOG(LOG_ERR, "%s (process %s:%u) AF_INET bad sin_len %u", __func__, proc_best_name(p), pid, npca->address.sin.sin_len); + return EINVAL; + } + if (npca->address.sin.sin_port != 0) { + NECPLOG(LOG_ERR, "%s (process %s:%u) AF_INET bad sin_port %u, not zero", __func__, proc_best_name(p), pid, npca->address.sin.sin_port); + return EINVAL; + } + break; + } + case AF_INET6: { + if (npca->prefix > 128) { + NECPLOG(LOG_ERR, "%s (process %s:%u) AF_INET6 bad prefix %u", __func__, proc_best_name(p), pid, npca->prefix); + return EINVAL; + } + if (npca->address.sin6.sin6_len != sizeof(struct sockaddr_in6)) { + NECPLOG(LOG_ERR, "%s (process %s:%u) AF_INET6 bad sin6_len %u", __func__, proc_best_name(p), pid, npca->address.sin6.sin6_len); + return EINVAL; + } + if (npca->address.sin6.sin6_port != 0) { + NECPLOG(LOG_ERR, "%s (process %s:%u) AF_INET6 bad sin6_port %u, not zero", __func__, proc_best_name(p), pid, npca->address.sin6.sin6_port); + return EINVAL; + } + if (npca->address.sin6.sin6_flowinfo != 0) { + NECPLOG(LOG_ERR, "%s (process %s:%u) AF_INET6 bad sin6_flowinfo %u, not zero", __func__, proc_best_name(p), pid, npca->address.sin6.sin6_flowinfo); + return EINVAL; + } + if (npca->address.sin6.sin6_scope_id != 0) { + NECPLOG(LOG_ERR, "%s (process %s:%u) AF_INET6 bad sin6_scope_id %u, not zero", __func__, proc_best_name(p), pid, npca->address.sin6.sin6_scope_id); + return EINVAL; + } + break; + } + default: { + return EINVAL; + } + } + } + + // + // Commit the changed policy + // + lck_rw_lock_exclusive(&necp_kernel_policy_lock); + memset(&necp_drop_dest_policy, 0, sizeof(struct necp_drop_dest_policy)); + + necp_drop_dest_policy.entry_count = tmp_drop_dest_policy.entry_count; + for (u_int32_t i = 0; i < tmp_drop_dest_policy.entry_count; i++) { + struct necp_drop_dest_entry *tmp_drop_dest_entry = &tmp_drop_dest_policy.entries[i]; + struct necp_drop_dest_entry *necp_drop_dest_entry = &necp_drop_dest_policy.entries[i]; + + memcpy(necp_drop_dest_entry, tmp_drop_dest_entry, sizeof(struct necp_drop_dest_entry)); + + necp_drop_dest_entry->order = necp_get_first_order_for_priority(necp_drop_dest_entry->level); + } + lck_rw_done(&necp_kernel_policy_lock); + + return 0; +} diff --git a/bsd/net/necp.h b/bsd/net/necp.h index 5ae4af20d..b6f9db0af 100644 --- a/bsd/net/necp.h +++ b/bsd/net/necp.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2013-2018 Apple Inc. All rights reserved. + * Copyright (c) 2013-2019 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -130,12 +130,25 @@ struct necp_packet_header { #define NECP_POLICY_CONDITION_BOUND_INTERFACE 9 // String #define NECP_POLICY_CONDITION_TRAFFIC_CLASS 10 // necp_policy_condition_tc_range // Socket/IP conditions -#define NECP_POLICY_CONDITION_IP_PROTOCOL 11 // u_int8_t +#define NECP_POLICY_CONDITION_IP_PROTOCOL 11 // u_int16_t #define NECP_POLICY_CONDITION_LOCAL_ADDR 12 // necp_policy_condition_addr #define NECP_POLICY_CONDITION_REMOTE_ADDR 13 // necp_policy_condition_addr #define NECP_POLICY_CONDITION_LOCAL_ADDR_RANGE 14 // necp_policy_condition_addr_range #define NECP_POLICY_CONDITION_REMOTE_ADDR_RANGE 15 // necp_policy_condition_addr_range #define NECP_POLICY_CONDITION_AGENT_TYPE 16 // struct necp_policy_condition_agent_type +#define NECP_POLICY_CONDITION_HAS_CLIENT 17 // N/A +#define NECP_POLICY_CONDITION_LOCAL_NETWORKS 18 // Matches all local networks +// Socket-only conditions +#define NECP_POLICY_CONDITION_FLOW_IP_PROTOCOL 19 // u_int16_t +#define NECP_POLICY_CONDITION_FLOW_LOCAL_ADDR 20 // necp_policy_condition_addr +#define NECP_POLICY_CONDITION_FLOW_REMOTE_ADDR 21 // necp_policy_condition_addr +#define NECP_POLICY_CONDITION_FLOW_LOCAL_ADDR_RANGE 22 // necp_policy_condition_addr_range +#define NECP_POLICY_CONDITION_FLOW_REMOTE_ADDR_RANGE 23 // necp_policy_condition_addr_range +// Socket/Application conditions, continued +#define NECP_POLICY_CONDITION_CLIENT_FLAGS 24 // u_int32_t, values from NECP_CLIENT_PARAMETER_FLAG_* +#define NECP_POLICY_CONDITION_FLOW_LOCAL_ADDR_EMPTY 25 // N/A +#define NECP_POLICY_CONDITION_FLOW_REMOTE_ADDR_EMPTY 26 // N/A +#define NECP_POLICY_CONDITION_PLATFORM_BINARY 27 // N/A /* * Results @@ -156,8 +169,9 @@ struct necp_packet_header { #define NECP_POLICY_RESULT_USE_NETAGENT 14 // netagent uuid_t #define NECP_POLICY_RESULT_NETAGENT_SCOPED 15 // netagent uuid_t #define NECP_POLICY_RESULT_SCOPED_DIRECT 16 // N/A, scopes to primary physical interface +#define NECP_POLICY_RESULT_ALLOW_UNENTITLED 17 // N/A -#define NECP_POLICY_RESULT_MAX NECP_POLICY_RESULT_SCOPED_DIRECT +#define NECP_POLICY_RESULT_MAX NECP_POLICY_RESULT_ALLOW_UNENTITLED /* * Route Rules @@ -170,9 +184,10 @@ struct necp_packet_header { #define NECP_ROUTE_RULE_DENY_LQM_ABORT 4 // String, or empty to match all #define NECP_ROUTE_RULE_FLAG_CELLULAR 0x01 -#define NECP_ROUTE_RULE_FLAG_WIFI 0x02 -#define NECP_ROUTE_RULE_FLAG_WIRED 0x04 +#define NECP_ROUTE_RULE_FLAG_WIFI 0x02 +#define NECP_ROUTE_RULE_FLAG_WIRED 0x04 #define NECP_ROUTE_RULE_FLAG_EXPENSIVE 0x08 +#define NECP_ROUTE_RULE_FLAG_CONSTRAINED 0x10 /* * Error types @@ -303,7 +318,7 @@ struct necp_basic_metadata { u_int32_t rcvbufused; }; -struct necp_tcp_probe_status { +struct necp_connection_probe_status { unsigned int probe_activated : 1; unsigned int write_probe_failed : 1; unsigned int read_probe_failed : 1; @@ -311,7 +326,7 @@ struct necp_tcp_probe_status { }; struct necp_extra_tcp_metadata { - struct necp_tcp_probe_status probestatus; + struct necp_connection_probe_status probestatus; u_int32_t sndbufsize; u_int32_t sndbufused; @@ -323,7 +338,6 @@ struct necp_extra_tcp_metadata { u_int32_t traffic_mgt_flags; u_int32_t cc_alg_index; u_int32_t state; - activity_bitmap_t activity_bitmap; }; struct necp_stats_hdr { @@ -334,11 +348,15 @@ struct necp_stats_hdr { #define NECP_CLIENT_STATISTICS_TYPE_TCP 1 // Identifies use of necp_tcp_stats #define NECP_CLIENT_STATISTICS_TYPE_UDP 2 // Identifies use of necp_udp_stats +#define NECP_CLIENT_STATISTICS_TYPE_QUIC 3 // Identifies use of necp_quic_stats + #define NECP_CLIENT_STATISTICS_TYPE_TCP_VER_1 1 // Currently supported version for TCP #define NECP_CLIENT_STATISTICS_TYPE_UDP_VER_1 1 // Currently supported version for UDP +#define NECP_CLIENT_STATISTICS_TYPE_QUIC_VER_1 1 // Currently supported version for QUIC #define NECP_CLIENT_STATISTICS_TYPE_TCP_CURRENT_VER NECP_CLIENT_STATISTICS_TYPE_TCP_VER_1 #define NECP_CLIENT_STATISTICS_TYPE_UDP_CURRENT_VER NECP_CLIENT_STATISTICS_TYPE_UDP_VER_1 +#define NECP_CLIENT_STATISTICS_TYPE_QUIC_CURRENT_VER NECP_CLIENT_STATISTICS_TYPE_QUIC_VER_1 #define NECP_CLIENT_STATISTICS_EVENT_INIT 0x00000000 // Register the flow #define NECP_CLIENT_STATISTICS_EVENT_TIME_WAIT 0x00000001 // The flow is effectively finished but waiting on timer @@ -356,10 +374,42 @@ struct necp_udp_stats { struct necp_basic_metadata necp_udp_basic; }; + +/* + * The following reflects the special case for QUIC. + * It is a streaming protocol built on top of UDP. + * Therefore QUIC stats are defined as basic UDP stats + * with some extra meta data. + * TODO: For now the extra metadata is an exact replica + * of the metadata for TCP. However keeping that separate allows + * the structures to diverge later as new stats are added. + */ +#define QUIC_STATELESS_RESET_TOKEN_SIZE 16 +struct necp_extra_quic_metadata { + u_int32_t sndbufsize; + u_int32_t sndbufused; + u_int32_t txunacked; + u_int32_t txwindow; + u_int32_t txcwindow; + u_int32_t traffic_mgt_flags; + u_int32_t cc_alg_index; + u_int32_t state; + u_int8_t ssr_token[QUIC_STATELESS_RESET_TOKEN_SIZE]; +}; + +#define necp_quic_hdr necp_quic_udp_stats.necp_udp_hdr +#define necp_quic_counts necp_quic_udp_stats.necp_udp_counts +#define necp_quic_basic necp_quic_udp_stats.necp_udp_basic +struct necp_quic_stats { + struct necp_udp_stats necp_quic_udp_stats; + struct necp_extra_quic_metadata necp_quic_extra; +}; + typedef struct necp_all_stats { union { struct necp_tcp_stats tcp_stats; struct necp_udp_stats udp_stats; + struct necp_quic_stats quic_stats; } all_stats_u; } necp_all_stats; @@ -448,7 +498,8 @@ typedef struct necp_cache_buffer { #define NECP_CLIENT_ACTION_COPY_UPDATED_RESULT 16 // Copy client result only if changed. Input: client_id; Output: result in buffer #define NECP_CLIENT_ACTION_ADD_FLOW 17 // Add a flow. Input: client_id; Output: struct necp_client_add_flow #define NECP_CLIENT_ACTION_REMOVE_FLOW 18 // Remove a flow. Input: flow_id, optional struct ifnet_stats_per_flow - +#define NECP_CLIENT_ACTION_CLAIM 19 // Claim a client that has been added for this unique PID. Input: client_id +#define NECP_CLIENT_ACTION_SIGN 20 // Sign a resolver answer. Input: struct necp_client_resolver_answer; Output: signed tag, expected to be 32 bytes #define NECP_CLIENT_PARAMETER_APPLICATION NECP_POLICY_CONDITION_APPLICATION // Requires entitlement #define NECP_CLIENT_PARAMETER_REAL_APPLICATION NECP_POLICY_CONDITION_REAL_APPLICATION // Requires entitlement @@ -487,9 +538,24 @@ typedef struct necp_cache_buffer { #define NECP_CLIENT_PARAMETER_ASSERT_AGENT 131 // uuid_t, network agent UUID #define NECP_CLIENT_PARAMETER_UNASSERT_AGENT 132 // uuid_t, network agent UUID +#define NECP_CLIENT_PARAMETER_PARENT_ID 150 // uuid_t, client UUID + #define NECP_CLIENT_PARAMETER_LOCAL_ENDPOINT 200 // struct necp_client_endpoint #define NECP_CLIENT_PARAMETER_REMOTE_ENDPOINT 201 // struct necp_client_endpoint -#define NECP_CLIENT_PARAMETER_BROWSE_CATEGORY 202 // struct necp_client_endpoint +#define NECP_CLIENT_PARAMETER_BROWSE_DESCRIPTOR 202 // struct necp_client_endpoint +#define NECP_CLIENT_PARAMETER_RESOLVER_TAG 203 // Tag as bytes, expected to be 32 bytes +#define NECP_CLIENT_PARAMETER_ADVERTISE_DESCRIPTOR 204 // struct necp_client_endpoint + +#define NECP_CLIENT_PARAMETER_DELEGATED_UPID 210 // u_int64_t, requires entitlement + +#define NECP_CLIENT_PARAMETER_ETHERTYPE 220 // u_int16_t, ethertype +#define NECP_CLIENT_PARAMETER_TRANSPORT_PROTOCOL 221 // u_int8_t, IPPROTO_ + +#define NECP_CLIENT_PARAMETER_LOCAL_ADDRESS_PREFERENCE 230 // u_int8_t, NECP_CLIENT_PARAMETER_LOCAL_ADDRESS_PREFERENCE_ + +#define NECP_CLIENT_PARAMETER_LOCAL_ADDRESS_PREFERENCE_DEFAULT 0 +#define NECP_CLIENT_PARAMETER_LOCAL_ADDRESS_PREFERENCE_TEMPORARY 1 +#define NECP_CLIENT_PARAMETER_LOCAL_ADDRESS_PREFERENCE_STABLE 2 #define NECP_CLIENT_PARAMETER_FLAGS 250 // u_int32_t, see NECP_CLIENT_PAREMETER_FLAG_* values @@ -501,8 +567,13 @@ typedef struct necp_cache_buffer { #define NECP_CLIENT_PARAMETER_FLAG_ECN_ENABLE 0x0020 // Client is requesting to enable ECN #define NECP_CLIENT_PARAMETER_FLAG_ECN_DISABLE 0x0040 // Client is requesting to disable ECN #define NECP_CLIENT_PARAMETER_FLAG_TFO_ENABLE 0x0080 // Client is requesting to enable TFO -#define NECP_CLIENT_PARAMETER_FLAG_ONLY_PRIMARY_REQUIRES_TYPE 0x0100 // Interpret NECP_CLIENT_PARAMETER_REQUIRE_IF_TYPE only for primary -// interface, and allow exceptions for multipath or listeners +#define NECP_CLIENT_PARAMETER_FLAG_ONLY_PRIMARY_REQUIRES_TYPE 0x0100 // Interpret NECP_CLIENT_PARAMETER_REQUIRE_IF_TYPE only for primary interface, and allow exceptions for multipath or listeners +#define NECP_CLIENT_PARAMETER_FLAG_CUSTOM_ETHER 0x0200 // Client expects to open a custom ethernet channel +#define NECP_CLIENT_PARAMETER_FLAG_CUSTOM_IP 0x0400 // Client expects to open a custom IP protocol channel +#define NECP_CLIENT_PARAMETER_FLAG_INTERPOSE 0x0800 // Client expects to open an interpose filter channel +#define NECP_CLIENT_PARAMETER_FLAG_PROHIBIT_CONSTRAINED 0x1000 // Prohibit constrained interfaces +#define NECP_CLIENT_PARAMETER_FLAG_FALLBACK_TRAFFIC 0x2000 // Fallback traffic +#define NECP_CLIENT_PARAMETER_FLAG_INBOUND 0x4000 // Flow is inbound (passive) #define NECP_CLIENT_RESULT_CLIENT_ID 1 // uuid_t #define NECP_CLIENT_RESULT_POLICY_RESULT 2 // u_int32_t @@ -521,6 +592,7 @@ typedef struct necp_cache_buffer { #define NECP_CLIENT_RESULT_RECOMMENDED_MSS 15 // u_int8_t #define NECP_CLIENT_RESULT_FLOW_ID 16 // uuid_t #define NECP_CLIENT_RESULT_INTERFACE_TIME_DELTA 17 // u_int32_t, seconds since interface up/down +#define NECP_CLIENT_RESULT_REASON 18 // u_int32_t, see NECP_CLIENT_RESULT_REASON_* values #define NECP_CLIENT_RESULT_NEXUS_INSTANCE 100 // uuid_t #define NECP_CLIENT_RESULT_NEXUS_PORT 101 // u_int16_t @@ -531,8 +603,12 @@ typedef struct necp_cache_buffer { #define NECP_CLIENT_RESULT_LOCAL_ENDPOINT 200 // struct necp_client_endpoint #define NECP_CLIENT_RESULT_REMOTE_ENDPOINT 201 // struct necp_client_endpoint #define NECP_CLIENT_RESULT_DISCOVERED_ENDPOINT 202 // struct necp_client_endpoint, result of browse +#define NECP_CLIENT_RESULT_RESOLVED_ENDPOINT 203 // struct necp_client_endpoint, result of resolve +#define NECP_CLIENT_RESULT_LOCAL_ETHER_ADDR 204 // struct ether_addr +#define NECP_CLIENT_RESULT_REMOTE_ETHER_ADDR 205 // struct ether_addr #define NECP_CLIENT_RESULT_EFFECTIVE_TRAFFIC_CLASS 210 // u_int32_t #define NECP_CLIENT_RESULT_TRAFFIC_MGMT_BG 211 // u_int32_t, 1: background, 0: not background +#define NECP_CLIENT_RESULT_GATEWAY 212 // struct necp_client_endpoint #define NECP_CLIENT_RESULT_FLAG_IS_LOCAL 0x0001 // Routes to this device #define NECP_CLIENT_RESULT_FLAG_IS_DIRECT 0x0002 // Routes to directly accessible peer @@ -549,6 +625,7 @@ typedef struct necp_cache_buffer { #define NECP_CLIENT_RESULT_FLAG_ALLOW_QOS_MARKING 0x1000 // QoS marking is allowed #define NECP_CLIENT_RESULT_FLAG_HAS_NAT64 0x2000 // Has NAT64 prefix #define NECP_CLIENT_RESULT_FLAG_INTERFACE_LOW_POWER 0x4000 // Interface is in low-power mode +#define NECP_CLIENT_RESULT_FLAG_SPECIFIC_LISTENER 0x8000 // Listener should not listen on all interfaces #define NECP_CLIENT_RESULT_FLAG_FORCE_UPDATE (NECP_CLIENT_RESULT_FLAG_HAS_IPV4 | NECP_CLIENT_RESULT_FLAG_HAS_IPV6 | NECP_CLIENT_RESULT_FLAG_HAS_NAT64 | NECP_CLIENT_RESULT_FLAG_INTERFACE_LOW_POWER) @@ -559,6 +636,11 @@ typedef struct necp_cache_buffer { #define NECP_CLIENT_RESULT_RECOMMENDED_MSS_LOW 0x02 #define NECP_CLIENT_RESULT_RECOMMENDED_MSS_MEDIUM 0x04 +#define NECP_CLIENT_RESULT_REASON_EXPENSIVE_PROHIBITED 1 // Expensive networks were prohibited +#define NECP_CLIENT_RESULT_REASON_CONSTRAINED_PROHIBITED 2 // Constrained networks were prohibited +#define NECP_CLIENT_RESULT_REASON_CELLULAR_DENIED 3 // Denied by a cellular route rule +#define NECP_CLIENT_RESULT_REASON_WIFI_DENIED 4 // Denied by a wifi route rule + struct necp_interface_signature { u_int8_t signature[IFNET_SIGNATURELEN]; u_int8_t signature_len; @@ -574,6 +656,8 @@ struct necp_interface_details { u_int32_t mtu; struct necp_interface_signature ipv4_signature; struct necp_interface_signature ipv6_signature; + u_int32_t ipv4_netmask; + u_int32_t ipv4_broadcast; }; #define NECP_INTERFACE_FLAG_EXPENSIVE 0x0001 @@ -581,6 +665,10 @@ struct necp_interface_details { #define NECP_INTERFACE_FLAG_NOACKPRI 0x0004 #define NECP_INTERFACE_FLAG_3CARRIERAGG 0x0008 #define NECP_INTERFACE_FLAG_IS_LOW_POWER 0x0010 +#define NECP_INTERFACE_FLAG_MPK_LOG 0x0020 // Multi-layer Packet Logging +#define NECP_INTERFACE_FLAG_CONSTRAINED 0x0040 +#define NECP_INTERFACE_FLAG_HAS_NETMASK 0x0080 +#define NECP_INTERFACE_FLAG_HAS_BROADCAST 0x0100 struct necp_client_parameter_netagent_type { char netagent_domain[32]; @@ -629,6 +717,8 @@ struct kev_necp_policies_changed_data { #define NECP_CLIENT_FLOW_FLAGS_ALLOW_NEXUS 0x01 // Request a nexus instance upon adding a flow #define NECP_CLIENT_FLOW_FLAGS_USE_CLIENT_ID 0x02 // Register the client ID rather than the flow registration ID with network agents +#define NECP_CLIENT_FLOW_FLAGS_BROWSE 0x04 // Create request with a browse agent +#define NECP_CLIENT_FLOW_FLAGS_RESOLVE 0x08 // Create request with a resolution agent struct necp_client_flow_stats { u_int32_t stats_type; // NECP_CLIENT_STATISTICS_TYPE_* @@ -666,6 +756,41 @@ struct necp_client_observer_update { u_int8_t tlv_buffer[0]; // Parameters or result as TLVs, based on type }; +#define NECP_CLIENT_SIGN_TYPE_RESOLVER_ANSWER 1 + +struct necp_client_signable { + uuid_t client_id; + u_int32_t sign_type; +} __attribute__((__packed__)); + +struct necp_client_resolver_answer { + uuid_t client_id; + u_int32_t sign_type; + union sockaddr_in_4_6 address_answer; + u_int32_t hostname_length; + // hostname +} __attribute__((__packed__)); + +#define NECP_FILTER_UNIT_NO_FILTER UINT32_MAX // Reserved filter unit value that prohibits all filters and socket filters + +/* + * The sysctl "net.necp.necp_drop_dest_level" controls the global drop rule policy for + * a set of destinations addresses at the given level -- the drop rule is the last one + * to be evaluated at this level. + */ +#define MAX_NECP_DROP_DEST_LEVEL_ADDRS 8 + +struct necp_drop_dest_entry { + u_int32_t level; // priority level + u_int32_t order; // session order (read only via sysctl) + struct necp_policy_condition_addr cond_addr; +}; + +struct necp_drop_dest_policy { + u_int32_t entry_count; + struct necp_drop_dest_entry entries[MAX_NECP_DROP_DEST_LEVEL_ADDRS]; +}; + #ifdef BSD_KERNEL_PRIVATE #include #include @@ -675,6 +800,8 @@ struct necp_client_observer_update { #include #include #include +#include + SYSCTL_DECL(_net_necp); @@ -713,10 +840,13 @@ struct necp_all_kstats { extern errno_t necp_client_init(void); extern int necp_application_find_policy_match_internal(proc_t proc, u_int8_t *parameters, u_int32_t parameters_size, struct necp_aggregate_result *returned_result, - u_int32_t *flags, u_int required_interface_index, + u_int32_t *flags, u_int32_t *reason, u_int required_interface_index, const union necp_sockaddr_union *override_local_addr, const union necp_sockaddr_union *override_remote_addr, - struct rtentry **returned_route, bool ignore_address); + struct necp_client_endpoint *returned_v4_gateway, + struct necp_client_endpoint *returned_v6_gateway, + struct rtentry **returned_route, bool ignore_address, + bool has_client); /* * TLV utilities * @@ -736,7 +866,7 @@ extern u_int8_t *necp_buffer_write_tlv_if_different(u_int8_t *cursor, u_int8_t t extern u_int8_t necp_buffer_get_tlv_type(u_int8_t *buffer, int tlv_offset); extern u_int32_t necp_buffer_get_tlv_length(u_int8_t *buffer, int tlv_offset); extern u_int8_t *necp_buffer_get_tlv_value(u_int8_t *buffer, int tlv_offset, u_int32_t *value_size); -extern int necp_buffer_find_tlv(u_int8_t *buffer, u_int32_t buffer_length, int offset, u_int8_t type, int next); +extern int necp_buffer_find_tlv(u_int8_t *buffer, u_int32_t buffer_length, int offset, u_int8_t type, int *err, int next); #define NECPCTL_DROP_ALL_LEVEL 1 /* Drop all packets if no policy matches above this level */ #define NECPCTL_DEBUG 2 /* Log all kernel policy matches */ @@ -755,13 +885,16 @@ extern int necp_buffer_find_tlv(u_int8_t *buffer, u_int32_t buffer_length, int o #define NECPCTL_OBSERVER_FD_COUNT 15 /* Count of NECP observer fds */ #define NECPCTL_OBSERVER_MESSAGE_LIMIT 16 /* Number of of NECP observer messages allowed to be queued */ #define NECPCTL_SYSCTL_ARENA_COUNT 17 /* Count of sysctl arenas */ +#define NECPCTL_DROP_UNENTITLED_LEVEL 18 /* Drop unentitled process traffic above this level */ +#define NECPCTL_PASS_INTERPOSE 19 /* Pass interpose */ #define NECPCTL_NAMES { \ { 0, 0 }, \ { "drop_all_level", CTLTYPE_INT }, \ { "debug", CTLTYPE_INT }, \ { "pass_loopback", CTLTYPE_INT }, \ - { "pass_keepalives", CTLTYPE_INT }, \ + { "pass_keepalives", CTLTYPE_INT }, \ + { "pass_interpose", CTLTYPE_INT }, \ } typedef u_int32_t necp_kernel_policy_id; @@ -789,6 +922,7 @@ typedef u_int32_t necp_app_id; #define NECP_KERNEL_POLICY_RESULT_USE_NETAGENT NECP_POLICY_RESULT_USE_NETAGENT #define NECP_KERNEL_POLICY_RESULT_NETAGENT_SCOPED NECP_POLICY_RESULT_NETAGENT_SCOPED #define NECP_KERNEL_POLICY_RESULT_SCOPED_DIRECT NECP_POLICY_RESULT_SCOPED_DIRECT +#define NECP_KERNEL_POLICY_RESULT_ALLOW_UNENTITLED NECP_POLICY_RESULT_ALLOW_UNENTITLED typedef struct { u_int32_t identifier; @@ -821,6 +955,7 @@ struct necp_kernel_socket_policy { u_int32_t condition_mask; u_int32_t condition_negated_mask; + u_int32_t cond_client_flags; necp_kernel_policy_id cond_policy_id; u_int32_t cond_app_id; // Locally assigned ID value stored u_int32_t cond_real_app_id; // Locally assigned ID value stored @@ -932,7 +1067,10 @@ extern bool necp_socket_should_rescope(struct inpcb *inp); extern u_int necp_socket_get_rescope_if_index(struct inpcb *inp); extern u_int32_t necp_socket_get_effective_mtu(struct inpcb *inp, u_int32_t current_mtu); -extern bool necp_socket_is_allowed_to_send_recv(struct inpcb *inp, necp_kernel_policy_id *return_policy_id, +extern bool necp_socket_is_allowed_to_recv_on_interface(struct inpcb *inp, ifnet_t interface); + +extern bool necp_socket_is_allowed_to_send_recv(struct inpcb *inp, ifnet_t interface, + necp_kernel_policy_id *return_policy_id, u_int32_t *return_route_rule_id, necp_kernel_policy_id *return_skip_policy_id); extern bool necp_socket_is_allowed_to_send_recv_v4(struct inpcb *inp, u_int16_t local_port, @@ -950,6 +1088,7 @@ extern int necp_mark_packet_from_socket(struct mbuf *packet, struct inpcb *inp, u_int32_t route_rule_id, necp_kernel_policy_id skip_policy_id); extern necp_kernel_policy_id necp_get_policy_id_from_packet(struct mbuf *packet); extern necp_kernel_policy_id necp_get_skip_policy_id_from_packet(struct mbuf *packet); +extern bool necp_packet_should_skip_filters(struct mbuf *packet); extern u_int32_t necp_get_last_interface_index_from_packet(struct mbuf *packet); extern u_int32_t necp_get_route_rule_id_from_packet(struct mbuf *packet); extern int necp_get_app_uuid_from_packet(struct mbuf *packet, @@ -958,9 +1097,11 @@ extern int necp_get_app_uuid_from_packet(struct mbuf *packet, extern necp_kernel_policy_id necp_socket_find_policy_match(struct inpcb *inp, struct sockaddr *override_local_addr, struct sockaddr *override_remote_addr, u_int32_t override_bound_interface); extern necp_kernel_policy_id necp_ip_output_find_policy_match(struct mbuf *packet, int flags, struct ip_out_args *ipoa, + struct rtentry *rt, necp_kernel_policy_result *result, necp_kernel_policy_result_parameter *result_parameter); extern necp_kernel_policy_id necp_ip6_output_find_policy_match(struct mbuf *packet, int flags, struct ip6_out_args *ip6oa, + struct rtentry *rt, necp_kernel_policy_result *result, necp_kernel_policy_result_parameter *result_parameter); @@ -975,11 +1116,20 @@ extern bool necp_packet_is_allowed_over_interface(struct mbuf *packet, struct if extern int necp_mark_packet_as_keepalive(struct mbuf *packet, bool is_keepalive); extern bool necp_get_is_keepalive_from_packet(struct mbuf *packet); +extern int necp_sign_resolver_answer(uuid_t client_id, u_int8_t *query, u_int32_t query_length, + u_int8_t *answer, u_int32_t answer_length, + u_int8_t *tag, u_int32_t *out_tag_length); + +extern bool necp_validate_resolver_answer(uuid_t client_id, u_int8_t *query, u_int32_t query_length, + u_int8_t *answer, u_int32_t answer_length, + u_int8_t *tag, u_int32_t tag_length); + extern void necp_update_all_clients(void); // Handle general re-evaluate event +extern void necp_update_all_clients_immediately_if_needed(bool should_update_immediately); // Handle general re-evaluate event extern void necp_force_update_client(uuid_t client_id, uuid_t remove_netagent_uuid, u_int32_t agent_generation); // Cause a single client to get an update event -extern void necp_set_client_as_background(proc_t proc, struct fileproc *fp, bool background); // Set all clients for an fp as background or not +extern bool necp_set_client_as_background(proc_t proc, struct fileproc *fp, bool background); // Set all clients for an fp as background or not struct necp_fd_data; extern void necp_fd_memstatus(proc_t proc, uint32_t status, struct necp_fd_data *client_fd); // Purge memory of clients for the process @@ -987,6 +1137,9 @@ extern void necp_fd_defunct(proc_t proc, struct necp_fd_data *client_fd); // Set extern int necp_client_register_socket_flow(pid_t pid, uuid_t client_id, struct inpcb *inp); +extern int necp_client_register_socket_listener(pid_t pid, uuid_t client_id, struct inpcb *inp); + + extern int necp_client_assert_bb_radio_manager(uuid_t client_id, bool assert); extern int necp_client_assign_from_socket(pid_t pid, uuid_t client_id, struct inpcb *inp); @@ -1007,6 +1160,7 @@ necp_update_flow_protoctl_event(uuid_t netagent_uuid, uuid_t client_id, #define NECP_FLOWADV_IDX_INVALID UINT32_MAX extern void *necp_create_nexus_assign_message(uuid_t nexus_instance, u_int32_t nexus_port, void *key, uint32_t key_length, struct necp_client_endpoint *local_endpoint, struct necp_client_endpoint *remote_endpoint, + struct ether_addr *local_ether_addr, u_int32_t flow_adv_index, void *flow_stats, size_t *message_length); struct necp_client_nexus_parameters { @@ -1015,11 +1169,24 @@ struct necp_client_nexus_parameters { uuid_t euuid; union necp_sockaddr_union local_addr; union necp_sockaddr_union remote_addr; - u_int16_t ip_protocol; + u_int8_t ip_protocol; + u_int8_t transport_protocol; + u_int16_t ethertype; u_int32_t traffic_class; necp_policy_id policy_id; unsigned is_listener:1; + unsigned is_interpose:1; + unsigned is_custom_ether:1; unsigned allow_qos_marking:1; + unsigned override_address_selection:1; + unsigned use_stable_address:1; // Used if override_address_selection is set +}; + +struct necp_client_agent_parameters { + union { + struct necp_client_nexus_parameters nexus_request; + u_int8_t close_token[QUIC_STATELESS_RESET_TOKEN_SIZE]; + } u; }; #define NECP_CLIENT_CBACTION_NONVIABLE 1 @@ -1040,6 +1207,13 @@ extern void necp_client_reap_caches(boolean_t purge); #endif /* BSD_KERNEL_PRIVATE */ + +#ifdef KERNEL +#ifdef KERNEL_PRIVATE +extern bool net_domain_contains_hostname(char *hostname_string, char *domain_string); +#endif /* KERNEL_PRIVATE */ +#endif /* KERNEL */ + #ifndef KERNEL extern int necp_match_policy(const uint8_t *parameters, size_t parameters_size, struct necp_aggregate_result *returned_result); diff --git a/bsd/net/necp_client.c b/bsd/net/necp_client.c index 893ce06f0..ec1fd72f0 100644 --- a/bsd/net/necp_client.c +++ b/bsd/net/necp_client.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2018 Apple Inc. All rights reserved. + * Copyright (c) 2015-2019 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -41,6 +41,7 @@ #include #include +#include #include #include #include @@ -66,6 +67,8 @@ #include #include +#include + /* * NECP Client Architecture @@ -145,14 +148,9 @@ extern u_int32_t necp_debug; -static int noop_read(struct fileproc *, struct uio *, int, vfs_context_t); -static int noop_write(struct fileproc *, struct uio *, int, vfs_context_t); -static int noop_ioctl(struct fileproc *, unsigned long, caddr_t, - vfs_context_t); static int necpop_select(struct fileproc *, int, void *, vfs_context_t); static int necpop_close(struct fileglob *, vfs_context_t); -static int necpop_kqfilter(struct fileproc *, struct knote *, - struct kevent_internal_s *kev, vfs_context_t); +static int necpop_kqfilter(struct fileproc *, struct knote *, struct kevent_qos_s *); // Timer functions static int necp_timeout_microseconds = 1000 * 100; // 100ms @@ -165,6 +163,8 @@ static int necp_socket_flow_count = 0; static int necp_if_flow_count = 0; static int necp_observer_message_limit = 256; +os_refgrp_decl(static, necp_client_refgrp, "NECPClientRefGroup", NULL); + SYSCTL_INT(_net_necp, NECPCTL_CLIENT_FD_COUNT, client_fd_count, CTLFLAG_LOCKED | CTLFLAG_RD, &necp_client_fd_count, 0, ""); SYSCTL_INT(_net_necp, NECPCTL_OBSERVER_FD_COUNT, observer_fd_count, CTLFLAG_LOCKED | CTLFLAG_RD, &necp_observer_fd_count, 0, ""); SYSCTL_INT(_net_necp, NECPCTL_CLIENT_COUNT, client_count, CTLFLAG_LOCKED | CTLFLAG_RD, &necp_client_count, 0, ""); @@ -199,26 +199,36 @@ extern unsigned int get_maxmtu(struct rtentry *); #define NECP_PARSED_PARAMETERS_FIELD_EFFECTIVE_UUID 0x20000 #define NECP_PARSED_PARAMETERS_FIELD_TRAFFIC_CLASS 0x40000 #define NECP_PARSED_PARAMETERS_FIELD_LOCAL_PORT 0x80000 +#define NECP_PARSED_PARAMETERS_FIELD_DELEGATED_UPID 0x100000 +#define NECP_PARSED_PARAMETERS_FIELD_ETHERTYPE 0x200000 +#define NECP_PARSED_PARAMETERS_FIELD_TRANSPORT_PROTOCOL 0x400000 +#define NECP_PARSED_PARAMETERS_FIELD_LOCAL_ADDR_PREFERENCE 0x800000 -#define NECP_MAX_PARSED_PARAMETERS 16 + +#define NECP_MAX_INTERFACE_PARAMETERS 16 +#define NECP_MAX_AGENT_PARAMETERS 4 struct necp_client_parsed_parameters { u_int32_t valid_fields; u_int32_t flags; + u_int64_t delegated_upid; union necp_sockaddr_union local_addr; union necp_sockaddr_union remote_addr; u_int32_t required_interface_index; - char prohibited_interfaces[IFXNAMSIZ][NECP_MAX_PARSED_PARAMETERS]; + char prohibited_interfaces[NECP_MAX_INTERFACE_PARAMETERS][IFXNAMSIZ]; u_int8_t required_interface_type; - u_int8_t prohibited_interface_types[NECP_MAX_PARSED_PARAMETERS]; - struct necp_client_parameter_netagent_type required_netagent_types[NECP_MAX_PARSED_PARAMETERS]; - struct necp_client_parameter_netagent_type prohibited_netagent_types[NECP_MAX_PARSED_PARAMETERS]; - struct necp_client_parameter_netagent_type preferred_netagent_types[NECP_MAX_PARSED_PARAMETERS]; - struct necp_client_parameter_netagent_type avoided_netagent_types[NECP_MAX_PARSED_PARAMETERS]; - uuid_t required_netagents[NECP_MAX_PARSED_PARAMETERS]; - uuid_t prohibited_netagents[NECP_MAX_PARSED_PARAMETERS]; - uuid_t preferred_netagents[NECP_MAX_PARSED_PARAMETERS]; - uuid_t avoided_netagents[NECP_MAX_PARSED_PARAMETERS]; - u_int16_t ip_protocol; + u_int8_t local_address_preference; + u_int8_t prohibited_interface_types[NECP_MAX_INTERFACE_PARAMETERS]; + struct necp_client_parameter_netagent_type required_netagent_types[NECP_MAX_AGENT_PARAMETERS]; + struct necp_client_parameter_netagent_type prohibited_netagent_types[NECP_MAX_AGENT_PARAMETERS]; + struct necp_client_parameter_netagent_type preferred_netagent_types[NECP_MAX_AGENT_PARAMETERS]; + struct necp_client_parameter_netagent_type avoided_netagent_types[NECP_MAX_AGENT_PARAMETERS]; + uuid_t required_netagents[NECP_MAX_AGENT_PARAMETERS]; + uuid_t prohibited_netagents[NECP_MAX_AGENT_PARAMETERS]; + uuid_t preferred_netagents[NECP_MAX_AGENT_PARAMETERS]; + uuid_t avoided_netagents[NECP_MAX_AGENT_PARAMETERS]; + u_int8_t ip_protocol; + u_int8_t transport_protocol; + u_int16_t ethertype; pid_t effective_pid; uuid_t effective_uuid; u_int32_t traffic_class; @@ -234,18 +244,20 @@ necp_ifnet_matches_local_address(struct ifnet *ifp, struct sockaddr *sa); static bool necp_ifnet_matches_parameters(struct ifnet *ifp, struct necp_client_parsed_parameters *parsed_parameters, + u_int32_t override_flags, u_int32_t *preferred_count, - bool secondary_interface); + bool secondary_interface, + bool require_scoped_field); static const struct fileops necp_fd_ops = { - .fo_type = DTYPE_NETPOLICY, - .fo_read = noop_read, - .fo_write = noop_write, - .fo_ioctl = noop_ioctl, - .fo_select = necpop_select, - .fo_close = necpop_close, + .fo_type = DTYPE_NETPOLICY, + .fo_read = fo_no_read, + .fo_write = fo_no_write, + .fo_ioctl = fo_no_ioctl, + .fo_select = necpop_select, + .fo_close = necpop_close, + .fo_drain = fo_no_drain, .fo_kqfilter = necpop_kqfilter, - .fo_drain = NULL, }; struct necp_client_assertion { @@ -338,15 +350,13 @@ struct necp_client { decl_lck_mtx_data(, lock); decl_lck_mtx_data(, route_lock); - uint32_t reference_count; + os_refcnt_t reference_count; uuid_t client_id; unsigned result_read : 1; unsigned allow_multiple_flows : 1; unsigned legacy_client_is_flow : 1; - unsigned background : 1; - unsigned background_update : 1; unsigned platform_binary : 1; size_t result_length; @@ -354,9 +364,11 @@ struct necp_client { necp_policy_id policy_id; - u_int16_t ip_protocol; + u_int8_t ip_protocol; int proc_pid; + u_int64_t delegated_upid; + struct _necp_client_flow_tree flow_registrations; LIST_HEAD(_necp_client_assertion_list, necp_client_assertion) assertion_list; @@ -370,6 +382,7 @@ struct necp_client { void *agent_handle; + size_t parameters_length; u_int8_t parameters[0]; }; @@ -383,8 +396,9 @@ struct necp_client { #define NECP_CLIENT_ROUTE_UNLOCK(_c) lck_mtx_unlock(&_c->route_lock) static void necp_client_retain_locked(struct necp_client *client); -static void necp_client_retain(struct necp_client *client); + static bool necp_client_release_locked(struct necp_client *client); +static bool necp_client_release(struct necp_client *client); static void necp_client_add_assertion(struct necp_client *client, uuid_t netagent_uuid); @@ -402,6 +416,9 @@ struct necp_flow_defunct { uuid_t nexus_agent; void *agent_handle; int proc_pid; + u_int32_t flags; + struct necp_client_agent_parameters close_parameters; + bool has_close_parameters; }; LIST_HEAD(_necp_flow_defunct_list, necp_flow_defunct); @@ -449,6 +466,9 @@ struct necp_fd_data { TAILQ_HEAD(_necp_client_update_list, necp_client_update) update_list; int update_count; int flags; + + unsigned background : 1; + int proc_pid; decl_lck_mtx_data(, fd_lock); struct selinfo si; @@ -537,29 +557,6 @@ static thread_call_t necp_client_update_tcall; /// NECP file descriptor functions -static int -noop_read(struct fileproc *fp, struct uio *uio, int flags, vfs_context_t ctx) -{ -#pragma unused(fp, uio, flags, ctx) - return ENXIO; -} - -static int -noop_write(struct fileproc *fp, struct uio *uio, int flags, - vfs_context_t ctx) -{ -#pragma unused(fp, uio, flags, ctx) - return ENXIO; -} - -static int -noop_ioctl(struct fileproc *fp, unsigned long com, caddr_t data, - vfs_context_t ctx) -{ -#pragma unused(fp, com, data, ctx) - return ENOTTY; -} - static void necp_fd_notify(struct necp_fd_data *fd_data, bool locked) { @@ -793,9 +790,8 @@ necp_fd_knread(struct knote *kn, long hint) } static int -necp_fd_knrprocess(struct knote *kn, struct filt_process_s *data, struct kevent_internal_s *kev) +necp_fd_knrprocess(struct knote *kn, struct kevent_qos_s *kev) { -#pragma unused(data) struct necp_fd_data *fd_data; int revents; int res; @@ -806,14 +802,14 @@ necp_fd_knrprocess(struct knote *kn, struct filt_process_s *data, struct kevent_ revents = necp_fd_poll(fd_data, POLLIN, NULL, current_proc(), 1); res = ((revents & POLLIN) != 0); if (res) { - *kev = kn->kn_kevent; + knote_fill_kevent(kn, kev, 0); } NECP_FD_UNLOCK(fd_data); return res; } static int -necp_fd_knrtouch(struct knote *kn, struct kevent_internal_s *kev) +necp_fd_knrtouch(struct knote *kn, struct kevent_qos_s *kev) { #pragma unused(kev) struct necp_fd_data *fd_data; @@ -838,24 +834,21 @@ SECURITY_READ_ONLY_EARLY(struct filterops) necp_fd_rfiltops = { static int necpop_kqfilter(struct fileproc *fp, struct knote *kn, - __unused struct kevent_internal_s *kev, vfs_context_t ctx) + __unused struct kevent_qos_s *kev) { -#pragma unused(fp, ctx) struct necp_fd_data *fd_data = NULL; int revents; if (kn->kn_filter != EVFILT_READ) { NECPLOG(LOG_ERR, "bad filter request %d", kn->kn_filter); - kn->kn_flags = EV_ERROR; - kn->kn_data = EINVAL; + knote_set_error(kn, EINVAL); return 0; } - fd_data = (struct necp_fd_data *)kn->kn_fp->f_fglob->fg_data; + fd_data = (struct necp_fd_data *)fp->f_fglob->fg_data; if (fd_data == NULL) { NECPLOG0(LOG_ERR, "No channel for kqfilter"); - kn->kn_flags = EV_ERROR; - kn->kn_data = ENOENT; + knote_set_error(kn, ENOENT); return 0; } @@ -908,7 +901,7 @@ necp_defunct_flow_registration(struct necp_client *client, flow_registration->registration_id)); flow_defunct->proc_pid = client->proc_pid; flow_defunct->agent_handle = client->agent_handle; - + flow_defunct->flags = flow_registration->flags; // Add to the list provided by caller LIST_INSERT_HEAD(defunct_list, flow_defunct, chain); } @@ -958,33 +951,35 @@ necp_client_retain_locked(struct necp_client *client) { NECP_CLIENT_ASSERT_LOCKED(client); - client->reference_count++; - ASSERT(client->reference_count != 0); + os_ref_retain_locked(&client->reference_count); } -static void -necp_client_retain(struct necp_client *client) -{ - NECP_CLIENT_LOCK(client); - necp_client_retain_locked(client); - NECP_CLIENT_UNLOCK(client); -} static bool necp_client_release_locked(struct necp_client *client) { NECP_CLIENT_ASSERT_LOCKED(client); - uint32_t old_ref = client->reference_count; - - ASSERT(client->reference_count != 0); - if (--client->reference_count == 0) { + os_ref_count_t count = os_ref_release_locked(&client->reference_count); + if (count == 0) { necp_client_free(client); } - return old_ref == 1; + return count == 0; } +static bool +necp_client_release(struct necp_client *client) +{ + bool last_ref; + + NECP_CLIENT_LOCK(client); + if (!(last_ref = necp_client_release_locked(client))) { + NECP_CLIENT_UNLOCK(client); + } + + return last_ref; +} static void necp_client_update_observer_add_internal(struct necp_fd_data *observer_fd, struct necp_client *client) @@ -1127,6 +1122,9 @@ necp_destroy_client_flow_registration(struct necp_client *client, { NECP_CLIENT_ASSERT_LOCKED(client); + bool has_close_parameters = false; + struct necp_client_agent_parameters close_parameters = {}; + memset(close_parameters.u.close_token, 0, sizeof(close_parameters.u.close_token)); struct necp_client_flow *search_flow = NULL; struct necp_client_flow *temp_flow = NULL; @@ -1134,15 +1132,23 @@ necp_destroy_client_flow_registration(struct necp_client *client, if (search_flow->nexus && !uuid_is_null(search_flow->u.nexus_agent)) { // Note that if we had defuncted the client earlier, this would result in a harmless ENOENT - int netagent_error = netagent_client_message(search_flow->u.nexus_agent, + u_int8_t message_type = (abort ? NETAGENT_MESSAGE_TYPE_ABORT_NEXUS : + NETAGENT_MESSAGE_TYPE_CLOSE_NEXUS); + if (((flow_registration->flags & NECP_CLIENT_FLOW_FLAGS_BROWSE) || + (flow_registration->flags & NECP_CLIENT_FLOW_FLAGS_RESOLVE)) && + !(flow_registration->flags & NECP_CLIENT_FLOW_FLAGS_ALLOW_NEXUS)) { + message_type = NETAGENT_MESSAGE_TYPE_CLIENT_UNASSERT; + } + int netagent_error = netagent_client_message_with_params(search_flow->u.nexus_agent, ((flow_registration->flags & NECP_CLIENT_FLOW_FLAGS_USE_CLIENT_ID) ? client->client_id : flow_registration->registration_id), pid, client->agent_handle, - (abort ? NETAGENT_MESSAGE_TYPE_ABORT_NEXUS : - NETAGENT_MESSAGE_TYPE_CLOSE_NEXUS)); + message_type, + has_close_parameters ? &close_parameters : NULL, + NULL, 0); if (netagent_error != 0 && netagent_error != ENOENT) { - NECPLOG(LOG_ERR, "necp_client_remove close nexus error (%d)", netagent_error); + NECPLOG(LOG_ERR, "necp_client_remove close nexus error (%d) MESSAGE TYPE %u", netagent_error, message_type); } uuid_clear(search_flow->u.nexus_agent); } @@ -1189,6 +1195,7 @@ necp_destroy_client(struct necp_client *client, pid_t pid, bool abort) necp_destroy_client_flow_registration(client, flow_registration, pid, abort); } + // Remove agent assertions struct necp_client_assertion *search_assertion = NULL; struct necp_client_assertion *temp_assertion = NULL; @@ -1439,6 +1446,8 @@ necp_client_add_interface_option_if_needed(struct necp_client *client, option->interface_generation = interface_generation; if (nexus_agent != NULL) { uuid_copy(option->nexus_agent, *nexus_agent); + } else { + uuid_clear(option->nexus_agent); } client->interface_option_count++; } else { @@ -1452,6 +1461,8 @@ necp_client_add_interface_option_if_needed(struct necp_client *client, option->interface_generation = interface_generation; if (nexus_agent != NULL) { uuid_copy(option->nexus_agent, *nexus_agent); + } else { + uuid_clear(option->nexus_agent); } client->interface_option_count++; } @@ -1468,9 +1479,10 @@ necp_client_flow_is_viable(proc_t proc, struct necp_client *client, flow->necp_flow_flags = 0; int error = necp_application_find_policy_match_internal(proc, client->parameters, (u_int32_t)client->parameters_length, - &result, &flow->necp_flow_flags, + &result, &flow->necp_flow_flags, NULL, flow->interface_index, - &flow->local_addr, &flow->remote_addr, NULL, ignore_address); + &flow->local_addr, &flow->remote_addr, NULL, NULL, + NULL, ignore_address, true); return error == 0 && result.routed_interface_index != IFSCOPE_NONE && @@ -1547,14 +1559,14 @@ necp_client_update_flows(proc_t proc, if (flow->viable && client_updated && (flow->socket || (!flow->socket && !flow->nexus)) && flow->u.cb) { bool flow_viable = flow->viable; - flow->u.cb(flow->u.socket_handle, NECP_CLIENT_CBACTION_VIABLE, flow->interface_index, flow->necp_flow_flags, &viable); + flow->u.cb(flow->u.socket_handle, NECP_CLIENT_CBACTION_VIABLE, flow->interface_index, flow->necp_flow_flags, &flow_viable); flow->viable = flow_viable; } if (!flow->viable || flow->invalid) { if (client_updated && (flow->socket || (!flow->socket && !flow->nexus)) && flow->u.cb) { bool flow_viable = flow->viable; - flow->u.cb(flow->u.socket_handle, NECP_CLIENT_CBACTION_NONVIABLE, flow->interface_index, flow->necp_flow_flags, &viable); + flow->u.cb(flow->u.socket_handle, NECP_CLIENT_CBACTION_NONVIABLE, flow->interface_index, flow->necp_flow_flags, &flow_viable); flow->viable = flow_viable; } // The callback might change the viable-flag of the @@ -1619,12 +1631,44 @@ necp_netagent_applies_to_client(struct necp_client *client, return applies; } - if (!allow_nexus && - (flags & NETAGENT_FLAG_NEXUS_PROVIDER)) { - // Hide nexus providers unless allowed - // Direct interfaces and direct policies are allowed to use a nexus - // Delegate interfaces or re-scoped interfaces are not allowed - return applies; + const bool is_nexus_agent = ((flags & NETAGENT_FLAG_NEXUS_PROVIDER) || + (flags & NETAGENT_FLAG_NEXUS_LISTENER) || + (flags & NETAGENT_FLAG_CUSTOM_ETHER_NEXUS) || + (flags & NETAGENT_FLAG_CUSTOM_IP_NEXUS) || + (flags & NETAGENT_FLAG_INTERPOSE_NEXUS)); + if (is_nexus_agent) { + if (!allow_nexus) { + // Hide nexus providers unless allowed + // Direct interfaces and direct policies are allowed to use a nexus + // Delegate interfaces or re-scoped interfaces are not allowed + return applies; + } + + if ((parameters->flags & NECP_CLIENT_PARAMETER_FLAG_CUSTOM_ETHER) && + !(flags & NETAGENT_FLAG_CUSTOM_ETHER_NEXUS)) { + // Client requested a custom ether nexus, but this nexus isn't one + return applies; + } + + if ((parameters->flags & NECP_CLIENT_PARAMETER_FLAG_CUSTOM_IP) && + !(flags & NETAGENT_FLAG_CUSTOM_IP_NEXUS)) { + // Client requested a custom IP nexus, but this nexus isn't one + return applies; + } + + if ((parameters->flags & NECP_CLIENT_PARAMETER_FLAG_INTERPOSE) && + !(flags & NETAGENT_FLAG_INTERPOSE_NEXUS)) { + // Client requested an interpose nexus, but this nexus isn't one + return applies; + } + + if (!(parameters->flags & NECP_CLIENT_PARAMETER_FLAG_CUSTOM_ETHER) && + !(parameters->flags & NECP_CLIENT_PARAMETER_FLAG_CUSTOM_IP) && + !(parameters->flags & NECP_CLIENT_PARAMETER_FLAG_INTERPOSE) && + !(flags & NETAGENT_FLAG_NEXUS_PROVIDER)) { + // Client requested default parameters, but this nexus isn't generic + return applies; + } } if (uuid_compare(client->failed_trigger_agent.netagent_uuid, *netagent_uuid) == 0) { @@ -1643,7 +1687,7 @@ necp_netagent_applies_to_client(struct necp_client *client, bool required = FALSE; if (parameters != NULL) { // Check required agent UUIDs - for (int i = 0; i < NECP_MAX_PARSED_PARAMETERS; i++) { + for (int i = 0; i < NECP_MAX_AGENT_PARAMETERS; i++) { if (uuid_is_null(parameters->required_netagents[i])) { break; } @@ -1661,7 +1705,7 @@ necp_netagent_applies_to_client(struct necp_client *client, memset(&netagent_domain, 0, NETAGENT_DOMAINSIZE); memset(&netagent_type, 0, NETAGENT_TYPESIZE); - for (int i = 0; i < NECP_MAX_PARSED_PARAMETERS; i++) { + for (int i = 0; i < NECP_MAX_AGENT_PARAMETERS; i++) { if (strlen(parameters->required_netagent_types[i].netagent_domain) == 0 || strlen(parameters->required_netagent_types[i].netagent_type) == 0) { break; @@ -1724,6 +1768,23 @@ necp_client_address_is_valid(struct sockaddr *address) } } +static inline bool +necp_client_endpoint_is_unspecified(struct necp_client_endpoint *endpoint) +{ + if (necp_client_address_is_valid(&endpoint->u.sa)) { + if (endpoint->u.sa.sa_family == AF_INET) { + return endpoint->u.sin.sin_addr.s_addr == INADDR_ANY; + } else if (endpoint->u.sa.sa_family == AF_INET6) { + return IN6_IS_ADDR_UNSPECIFIED(&endpoint->u.sin6.sin6_addr); + } else { + return TRUE; + } + } else { + return TRUE; + } +} + + static int necp_client_parse_parameters(u_int8_t *parameters, u_int32_t parameters_size, @@ -1742,6 +1803,11 @@ necp_client_parse_parameters(u_int8_t *parameters, u_int32_t num_prohibited_agent_types = 0; u_int32_t num_preferred_agent_types = 0; u_int32_t num_avoided_agent_types = 0; + u_int8_t *resolver_tag = NULL; + u_int32_t resolver_tag_length = 0; + u_int8_t *client_hostname = NULL; + u_int32_t hostname_length = 0; + uuid_t parent_id = {}; if (parsed_parameters == NULL) { return EINVAL; @@ -1830,7 +1896,7 @@ necp_client_parse_parameters(u_int8_t *parameters, break; } case NECP_CLIENT_PARAMETER_PROHIBIT_INTERFACE: { - if (num_prohibited_interfaces >= NECP_MAX_PARSED_PARAMETERS) { + if (num_prohibited_interfaces >= NECP_MAX_INTERFACE_PARAMETERS) { break; } if (length <= IFXNAMSIZ && length > 0) { @@ -1854,7 +1920,7 @@ necp_client_parse_parameters(u_int8_t *parameters, break; } case NECP_CLIENT_PARAMETER_PROHIBIT_IF_TYPE: { - if (num_prohibited_interface_types >= NECP_MAX_PARSED_PARAMETERS) { + if (num_prohibited_interface_types >= NECP_MAX_INTERFACE_PARAMETERS) { break; } if (length >= sizeof(u_int8_t)) { @@ -1865,7 +1931,7 @@ necp_client_parse_parameters(u_int8_t *parameters, break; } case NECP_CLIENT_PARAMETER_REQUIRE_AGENT: { - if (num_required_agents >= NECP_MAX_PARSED_PARAMETERS) { + if (num_required_agents >= NECP_MAX_AGENT_PARAMETERS) { break; } if (length >= sizeof(uuid_t)) { @@ -1876,7 +1942,7 @@ necp_client_parse_parameters(u_int8_t *parameters, break; } case NECP_CLIENT_PARAMETER_PROHIBIT_AGENT: { - if (num_prohibited_agents >= NECP_MAX_PARSED_PARAMETERS) { + if (num_prohibited_agents >= NECP_MAX_AGENT_PARAMETERS) { break; } if (length >= sizeof(uuid_t)) { @@ -1887,7 +1953,7 @@ necp_client_parse_parameters(u_int8_t *parameters, break; } case NECP_CLIENT_PARAMETER_PREFER_AGENT: { - if (num_preferred_agents >= NECP_MAX_PARSED_PARAMETERS) { + if (num_preferred_agents >= NECP_MAX_AGENT_PARAMETERS) { break; } if (length >= sizeof(uuid_t)) { @@ -1898,7 +1964,7 @@ necp_client_parse_parameters(u_int8_t *parameters, break; } case NECP_CLIENT_PARAMETER_AVOID_AGENT: { - if (num_avoided_agents >= NECP_MAX_PARSED_PARAMETERS) { + if (num_avoided_agents >= NECP_MAX_AGENT_PARAMETERS) { break; } if (length >= sizeof(uuid_t)) { @@ -1909,7 +1975,7 @@ necp_client_parse_parameters(u_int8_t *parameters, break; } case NECP_CLIENT_PARAMETER_REQUIRE_AGENT_TYPE: { - if (num_required_agent_types >= NECP_MAX_PARSED_PARAMETERS) { + if (num_required_agent_types >= NECP_MAX_AGENT_PARAMETERS) { break; } if (length >= sizeof(struct necp_client_parameter_netagent_type)) { @@ -1920,7 +1986,7 @@ necp_client_parse_parameters(u_int8_t *parameters, break; } case NECP_CLIENT_PARAMETER_PROHIBIT_AGENT_TYPE: { - if (num_prohibited_agent_types >= NECP_MAX_PARSED_PARAMETERS) { + if (num_prohibited_agent_types >= NECP_MAX_AGENT_PARAMETERS) { break; } if (length >= sizeof(struct necp_client_parameter_netagent_type)) { @@ -1931,7 +1997,7 @@ necp_client_parse_parameters(u_int8_t *parameters, break; } case NECP_CLIENT_PARAMETER_PREFER_AGENT_TYPE: { - if (num_preferred_agent_types >= NECP_MAX_PARSED_PARAMETERS) { + if (num_preferred_agent_types >= NECP_MAX_AGENT_PARAMETERS) { break; } if (length >= sizeof(struct necp_client_parameter_netagent_type)) { @@ -1942,7 +2008,7 @@ necp_client_parse_parameters(u_int8_t *parameters, break; } case NECP_CLIENT_PARAMETER_AVOID_AGENT_TYPE: { - if (num_avoided_agent_types >= NECP_MAX_PARSED_PARAMETERS) { + if (num_avoided_agent_types >= NECP_MAX_AGENT_PARAMETERS) { break; } if (length >= sizeof(struct necp_client_parameter_netagent_type)) { @@ -1960,12 +2026,24 @@ necp_client_parse_parameters(u_int8_t *parameters, break; } case NECP_CLIENT_PARAMETER_IP_PROTOCOL: { - if (length >= sizeof(parsed_parameters->ip_protocol)) { + if (length == sizeof(u_int16_t)) { + u_int16_t large_ip_protocol = 0; + memcpy(&large_ip_protocol, value, sizeof(large_ip_protocol)); + parsed_parameters->ip_protocol = (u_int8_t)large_ip_protocol; + parsed_parameters->valid_fields |= NECP_PARSED_PARAMETERS_FIELD_IP_PROTOCOL; + } else if (length >= sizeof(parsed_parameters->ip_protocol)) { memcpy(&parsed_parameters->ip_protocol, value, sizeof(parsed_parameters->ip_protocol)); parsed_parameters->valid_fields |= NECP_PARSED_PARAMETERS_FIELD_IP_PROTOCOL; } break; } + case NECP_CLIENT_PARAMETER_TRANSPORT_PROTOCOL: { + if (length >= sizeof(parsed_parameters->transport_protocol)) { + memcpy(&parsed_parameters->transport_protocol, value, sizeof(parsed_parameters->transport_protocol)); + parsed_parameters->valid_fields |= NECP_PARSED_PARAMETERS_FIELD_TRANSPORT_PROTOCOL; + } + break; + } case NECP_CLIENT_PARAMETER_PID: { if (length >= sizeof(parsed_parameters->effective_pid)) { memcpy(&parsed_parameters->effective_pid, value, sizeof(parsed_parameters->effective_pid)); @@ -1973,6 +2051,20 @@ necp_client_parse_parameters(u_int8_t *parameters, } break; } + case NECP_CLIENT_PARAMETER_DELEGATED_UPID: { + if (length >= sizeof(parsed_parameters->delegated_upid)) { + memcpy(&parsed_parameters->delegated_upid, value, sizeof(parsed_parameters->delegated_upid)); + parsed_parameters->valid_fields |= NECP_PARSED_PARAMETERS_FIELD_DELEGATED_UPID; + } + break; + } + case NECP_CLIENT_PARAMETER_ETHERTYPE: { + if (length >= sizeof(parsed_parameters->ethertype)) { + memcpy(&parsed_parameters->ethertype, value, sizeof(parsed_parameters->ethertype)); + parsed_parameters->valid_fields |= NECP_PARSED_PARAMETERS_FIELD_ETHERTYPE; + } + break; + } case NECP_CLIENT_PARAMETER_APPLICATION: { if (length >= sizeof(parsed_parameters->effective_uuid)) { memcpy(&parsed_parameters->effective_uuid, value, sizeof(parsed_parameters->effective_uuid)); @@ -1987,6 +2079,33 @@ necp_client_parse_parameters(u_int8_t *parameters, } break; } + case NECP_CLIENT_PARAMETER_RESOLVER_TAG: { + if (length > 0) { + resolver_tag = (u_int8_t *)value; + resolver_tag_length = length; + } + break; + } + case NECP_CLIENT_PARAMETER_DOMAIN: { + if (length > 0) { + client_hostname = (u_int8_t *)value; + hostname_length = length; + } + break; + } + case NECP_CLIENT_PARAMETER_PARENT_ID: { + if (length == sizeof(parent_id)) { + uuid_copy(parent_id, value); + } + break; + } + case NECP_CLIENT_PARAMETER_LOCAL_ADDRESS_PREFERENCE: { + if (length >= sizeof(parsed_parameters->local_address_preference)) { + memcpy(&parsed_parameters->local_address_preference, value, sizeof(parsed_parameters->local_address_preference)); + parsed_parameters->valid_fields |= NECP_PARSED_PARAMETERS_FIELD_LOCAL_ADDR_PREFERENCE; + } + break; + } default: { break; } @@ -1997,6 +2116,20 @@ necp_client_parse_parameters(u_int8_t *parameters, offset += sizeof(struct necp_tlv_header) + length; } + if (resolver_tag != NULL) { + union necp_sockaddr_union remote_addr; + memcpy(&remote_addr, &parsed_parameters->remote_addr, sizeof(remote_addr)); + remote_addr.sin.sin_port = 0; + const bool validated = necp_validate_resolver_answer(parent_id, + client_hostname, hostname_length, + (u_int8_t *)&remote_addr, sizeof(remote_addr), + resolver_tag, resolver_tag_length); + if (!validated) { + error = EAUTH; + NECPLOG(LOG_ERR, "Failed to validate answer for hostname %s", client_hostname); + } + } + return error; } @@ -2108,8 +2241,8 @@ necp_client_add_socket_flow(struct necp_client_flow_registration *flow_registrat LIST_INSERT_HEAD(&flow_registration->flow_list, new_flow, flow_chain); } -int -necp_client_register_socket_flow(pid_t pid, uuid_t client_id, struct inpcb *inp) +static int +necp_client_register_socket_inner(pid_t pid, uuid_t client_id, struct inpcb *inp, bool is_listener) { int error = 0; struct necp_fd_data *client_fd = NULL; @@ -2121,20 +2254,25 @@ necp_client_register_socket_flow(pid_t pid, uuid_t client_id, struct inpcb *inp) struct necp_client *client = necp_client_fd_find_client_and_lock(client_fd, client_id); if (client != NULL) { if (!pid || client->proc_pid == pid) { - struct necp_client_flow_registration *flow_registration = necp_client_find_flow(client, client_id); - if (flow_registration != NULL) { - // Found the right client and flow registration, add a new flow + if (is_listener) { found_client = TRUE; - necp_client_add_socket_flow(flow_registration, inp); - } else if (RB_EMPTY(&client->flow_registrations) && !necp_client_id_is_flow(client_id)) { - // No flows yet on this client, add a new registration - flow_registration = necp_client_create_flow_registration(client_fd, client); - if (flow_registration == NULL) { - error = ENOMEM; - } else { - // Add a new flow + } else { + // Find client flow and assign from socket + struct necp_client_flow_registration *flow_registration = necp_client_find_flow(client, client_id); + if (flow_registration != NULL) { + // Found the right client and flow registration, add a new flow found_client = TRUE; necp_client_add_socket_flow(flow_registration, inp); + } else if (RB_EMPTY(&client->flow_registrations) && !necp_client_id_is_flow(client_id)) { + // No flows yet on this client, add a new registration + flow_registration = necp_client_create_flow_registration(client_fd, client); + if (flow_registration == NULL) { + error = ENOMEM; + } else { + // Add a new flow + found_client = TRUE; + necp_client_add_socket_flow(flow_registration, inp); + } } } } @@ -2163,6 +2301,19 @@ necp_client_register_socket_flow(pid_t pid, uuid_t client_id, struct inpcb *inp) return error; } +int +necp_client_register_socket_flow(pid_t pid, uuid_t client_id, struct inpcb *inp) +{ + return necp_client_register_socket_inner(pid, client_id, inp, false); +} + +int +necp_client_register_socket_listener(pid_t pid, uuid_t client_id, struct inpcb *inp) +{ + return necp_client_register_socket_inner(pid, client_id, inp, true); +} + + static void necp_client_add_multipath_interface_flows(struct necp_client_flow_registration *flow_registration, struct necp_client *client, @@ -2250,7 +2401,7 @@ necp_client_lookup_bb_radio_manager(struct necp_client *client, } error = necp_application_find_policy_match_internal(proc, client->parameters, (u_int32_t)client->parameters_length, - &result, NULL, 0, NULL, NULL, NULL, true); + &result, NULL, NULL, 0, NULL, NULL, NULL, NULL, NULL, true, true); proc_rele(proc); proc = PROC_NULL; @@ -2530,7 +2681,7 @@ necp_client_assign_from_socket(pid_t pid, uuid_t client_id, struct inpcb *inp) flow->assigned_results = necp_create_nexus_assign_message(empty_uuid, 0, NULL, 0, (struct necp_client_endpoint *)&flow->local_addr, (struct necp_client_endpoint *)&flow->remote_addr, - 0, NULL, &flow->assigned_results_length); + NULL, 0, NULL, &flow->assigned_results_length); flow_registration->flow_result_read = FALSE; client_updated = TRUE; break; @@ -2565,6 +2716,57 @@ necp_client_assign_from_socket(pid_t pid, uuid_t client_id, struct inpcb *inp) return error; } +bool +necp_socket_is_allowed_to_recv_on_interface(struct inpcb *inp, ifnet_t interface) +{ + if (interface == NULL || + inp == NULL || + !(inp->inp_flags2 & INP2_EXTERNAL_PORT) || + uuid_is_null(inp->necp_client_uuid)) { + // If there's no interface or client ID to check, + // or if this is not a listener, pass. + // Outbound connections will have already been + // validated for policy. + return TRUE; + } + + // Only filter out listener sockets (no remote address specified) + if ((inp->inp_vflag & INP_IPV4) && + inp->inp_faddr.s_addr != INADDR_ANY) { + return TRUE; + } + if ((inp->inp_vflag & INP_IPV6) && + !IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr)) { + return TRUE; + } + + bool allowed = TRUE; + + NECP_CLIENT_TREE_LOCK_SHARED(); + + struct necp_client *client = necp_find_client_and_lock(inp->necp_client_uuid); + if (client != NULL) { + struct necp_client_parsed_parameters *parsed_parameters = NULL; + + MALLOC(parsed_parameters, struct necp_client_parsed_parameters *, sizeof(*parsed_parameters), M_NECP, (M_WAITOK | M_ZERO)); + if (parsed_parameters != NULL) { + int error = necp_client_parse_parameters(client->parameters, (u_int32_t)client->parameters_length, parsed_parameters); + if (error == 0) { + if (!necp_ifnet_matches_parameters(interface, parsed_parameters, 0, NULL, true, false)) { + allowed = FALSE; + } + } + FREE(parsed_parameters, M_NECP); + } + + NECP_CLIENT_UNLOCK(client); + } + + NECP_CLIENT_TREE_UNLOCK(); + + return allowed; +} + int necp_update_flow_protoctl_event(uuid_t netagent_uuid, uuid_t client_id, uint32_t protoctl_event_code, uint32_t protoctl_event_val, @@ -2765,7 +2967,7 @@ necp_update_parsed_parameters(struct necp_client_parsed_parameters *parsed_param // This is a scoped agent. Add it to the required agents. if (parsed_parameters->valid_fields & NECP_PARSED_PARAMETERS_FIELD_REQUIRED_AGENT) { // Already some required agents, add this at the end - for (int j = 0; j < NECP_MAX_PARSED_PARAMETERS; j++) { + for (int j = 0; j < NECP_MAX_AGENT_PARAMETERS; j++) { if (uuid_compare(parsed_parameters->required_netagents[j], result->netagents[i]) == 0) { // Already required, break break; @@ -2789,7 +2991,7 @@ necp_update_parsed_parameters(struct necp_client_parsed_parameters *parsed_param char remove_agent_domain[NETAGENT_DOMAINSIZE] = { 0 }; char remove_agent_type[NETAGENT_TYPESIZE] = { 0 }; if (netagent_get_agent_domain_and_type(result->netagents[i], remove_agent_domain, remove_agent_type)) { - for (int j = 0; j < NECP_MAX_PARSED_PARAMETERS; j++) { + for (int j = 0; j < NECP_MAX_AGENT_PARAMETERS; j++) { if (strlen(parsed_parameters->required_netagent_types[j].netagent_domain) == 0 && strlen(parsed_parameters->required_netagent_types[j].netagent_type) == 0) { break; @@ -2799,16 +3001,16 @@ necp_update_parsed_parameters(struct necp_client_parsed_parameters *parsed_param strncmp(parsed_parameters->required_netagent_types[j].netagent_type, remove_agent_type, NETAGENT_TYPESIZE) == 0) { updated = true; - if (j == NECP_MAX_PARSED_PARAMETERS - 1) { + if (j == NECP_MAX_AGENT_PARAMETERS - 1) { // Last field, just clear and break - memset(&parsed_parameters->required_netagent_types[NECP_MAX_PARSED_PARAMETERS - 1], 0, sizeof(struct necp_client_parameter_netagent_type)); + memset(&parsed_parameters->required_netagent_types[NECP_MAX_AGENT_PARAMETERS - 1], 0, sizeof(struct necp_client_parameter_netagent_type)); break; } else { // Move the parameters down, clear the last entry memmove(&parsed_parameters->required_netagent_types[j], &parsed_parameters->required_netagent_types[j + 1], - sizeof(struct necp_client_parameter_netagent_type) * (NECP_MAX_PARSED_PARAMETERS - (j + 1))); - memset(&parsed_parameters->required_netagent_types[NECP_MAX_PARSED_PARAMETERS - 1], 0, sizeof(struct necp_client_parameter_netagent_type)); + sizeof(struct necp_client_parameter_netagent_type) * (NECP_MAX_AGENT_PARAMETERS - (j + 1))); + memset(&parsed_parameters->required_netagent_types[NECP_MAX_AGENT_PARAMETERS - 1], 0, sizeof(struct necp_client_parameter_netagent_type)); // Continue, don't increment but look at the new shifted item instead continue; } @@ -2847,7 +3049,10 @@ necp_calculate_client_result(proc_t proc, struct necp_client *client, struct necp_client_parsed_parameters *parsed_parameters, struct necp_aggregate_result *result, - u_int32_t *flags) + u_int32_t *flags, + u_int32_t *reason, + struct necp_client_endpoint *v4_gateway, + struct necp_client_endpoint *v6_gateway) { struct rtentry *route = NULL; @@ -2862,8 +3067,10 @@ necp_calculate_client_result(proc_t proc, memset(result, 0, sizeof(*result)); int error = necp_application_find_policy_match_internal(proc, client->parameters, (u_int32_t)client->parameters_length, - result, flags, matching_if_index, - NULL, NULL, &route, false); + result, flags, reason, matching_if_index, + NULL, NULL, + v4_gateway, v6_gateway, + &route, false, true); if (error != 0) { if (route != NULL) { rtfree(route); @@ -2874,7 +3081,7 @@ necp_calculate_client_result(proc_t proc, if (validate_agents) { bool requirement_failed = FALSE; if (parsed_parameters->valid_fields & NECP_PARSED_PARAMETERS_FIELD_REQUIRED_AGENT) { - for (int i = 0; i < NECP_MAX_PARSED_PARAMETERS; i++) { + for (int i = 0; i < NECP_MAX_AGENT_PARAMETERS; i++) { if (uuid_is_null(parsed_parameters->required_netagents[i])) { break; } @@ -2899,7 +3106,7 @@ necp_calculate_client_result(proc_t proc, } if (!requirement_failed && parsed_parameters->valid_fields & NECP_PARSED_PARAMETERS_FIELD_REQUIRED_AGENT_TYPE) { - for (int i = 0; i < NECP_MAX_PARSED_PARAMETERS; i++) { + for (int i = 0; i < NECP_MAX_AGENT_PARAMETERS; i++) { if (strlen(parsed_parameters->required_netagent_types[i].netagent_domain) == 0 && strlen(parsed_parameters->required_netagent_types[i].netagent_type) == 0) { break; @@ -2956,6 +3163,11 @@ necp_calculate_client_result(proc_t proc, return TRUE; } +#define NECP_PARSED_PARAMETERS_REQUIRED_FIELDS (NECP_PARSED_PARAMETERS_FIELD_REQUIRED_IF | \ + NECP_PARSED_PARAMETERS_FIELD_REQUIRED_IFTYPE | \ + NECP_PARSED_PARAMETERS_FIELD_REQUIRED_AGENT | \ + NECP_PARSED_PARAMETERS_FIELD_REQUIRED_AGENT_TYPE) + static bool necp_update_client_result(proc_t proc, struct necp_fd_data *client_fd, @@ -2966,6 +3178,7 @@ necp_update_client_result(proc_t proc, struct necp_aggregate_result result; struct necp_client_parsed_parameters *parsed_parameters = NULL; u_int32_t flags = 0; + u_int32_t reason = 0; NECP_CLIENT_ASSERT_LOCKED(client); @@ -2988,19 +3201,28 @@ necp_update_client_result(proc_t proc, client->ip_protocol = parsed_parameters->ip_protocol; // Calculate the policy result - if (!necp_calculate_client_result(proc, client, parsed_parameters, &result, &flags)) { + struct necp_client_endpoint v4_gateway = {}; + struct necp_client_endpoint v6_gateway = {}; + if (!necp_calculate_client_result(proc, client, parsed_parameters, &result, &flags, &reason, &v4_gateway, &v6_gateway)) { FREE(parsed_parameters, M_NECP); return FALSE; } if (necp_update_parsed_parameters(parsed_parameters, &result)) { // Changed the parameters based on result, try again (only once) - if (!necp_calculate_client_result(proc, client, parsed_parameters, &result, &flags)) { + if (!necp_calculate_client_result(proc, client, parsed_parameters, &result, &flags, &reason, &v4_gateway, &v6_gateway)) { FREE(parsed_parameters, M_NECP); return FALSE; } } + if ((parsed_parameters->flags & NECP_CLIENT_PARAMETER_FLAG_LISTENER) && + parsed_parameters->required_interface_index != IFSCOPE_NONE && + (parsed_parameters->valid_fields & NECP_PARSED_PARAMETERS_FIELD_REQUIRED_IF) == 0) { + // Listener should not apply required interface index if + parsed_parameters->required_interface_index = IFSCOPE_NONE; + } + // Save the last policy id on the client client->policy_id = result.policy_id; @@ -3041,6 +3263,9 @@ necp_update_client_result(proc_t proc, bool updated = FALSE; u_int8_t *cursor = client->result; cursor = necp_buffer_write_tlv_if_different(cursor, NECP_CLIENT_RESULT_FLAGS, sizeof(flags), &flags, &updated, client->result, sizeof(client->result)); + if (reason != 0) { + cursor = necp_buffer_write_tlv_if_different(cursor, NECP_CLIENT_RESULT_REASON, sizeof(reason), &reason, &updated, client->result, sizeof(client->result)); + } cursor = necp_buffer_write_tlv_if_different(cursor, NECP_CLIENT_RESULT_CLIENT_ID, sizeof(uuid_t), client->client_id, &updated, client->result, sizeof(client->result)); cursor = necp_buffer_write_tlv_if_different(cursor, NECP_CLIENT_RESULT_POLICY_RESULT, sizeof(result.routing_result), &result.routing_result, &updated, @@ -3058,6 +3283,7 @@ necp_update_client_result(proc_t proc, if (result.routed_interface_index != 0) { u_int routed_interface_index = result.routed_interface_index; if (result.routing_result == NECP_KERNEL_POLICY_RESULT_IP_TUNNEL && + (parsed_parameters->valid_fields & NECP_PARSED_PARAMETERS_REQUIRED_FIELDS) && parsed_parameters->required_interface_index != IFSCOPE_NONE && parsed_parameters->required_interface_index != result.routed_interface_index) { routed_interface_index = parsed_parameters->required_interface_index; @@ -3073,15 +3299,31 @@ necp_update_client_result(proc_t proc, sizeof(effective_traffic_class), &effective_traffic_class, &updated, client->result, sizeof(client->result)); } - if (client->background_update) { - u_int32_t background = client->background; - cursor = necp_buffer_write_tlv_if_different(cursor, NECP_CLIENT_RESULT_TRAFFIC_MGMT_BG, - sizeof(background), &background, &updated, - client->result, sizeof(client->result)); - if (updated) { - client->background_update = 0; + + if (client_fd->background) { + bool has_assigned_flow = FALSE; + struct necp_client_flow_registration *flow_registration = NULL; + struct necp_client_flow *search_flow = NULL; + RB_FOREACH(flow_registration, _necp_client_flow_tree, &client->flow_registrations) { + LIST_FOREACH(search_flow, &flow_registration->flow_list, flow_chain) { + if (search_flow->assigned) { + has_assigned_flow = TRUE; + break; + } + } + } + + if (has_assigned_flow) { + u_int32_t background = client_fd->background; + cursor = necp_buffer_write_tlv_if_different(cursor, NECP_CLIENT_RESULT_TRAFFIC_MGMT_BG, + sizeof(background), &background, &updated, + client->result, sizeof(client->result)); } } + + bool write_v4_gateway = !necp_client_endpoint_is_unspecified(&v4_gateway); + bool write_v6_gateway = !necp_client_endpoint_is_unspecified(&v6_gateway); + NECP_CLIENT_ROUTE_LOCK(client); if (client->current_route != NULL) { const u_int32_t route_mtu = get_maxmtu(client->current_route); @@ -3090,9 +3332,29 @@ necp_update_client_result(proc_t proc, sizeof(route_mtu), &route_mtu, &updated, client->result, sizeof(client->result)); } + bool has_remote_addr = parsed_parameters->valid_fields & NECP_PARSED_PARAMETERS_FIELD_REMOTE_ADDR; + if (has_remote_addr && client->current_route->rt_gateway != NULL) { + if (client->current_route->rt_gateway->sa_family == AF_INET) { + write_v6_gateway = false; + } else if (client->current_route->rt_gateway->sa_family == AF_INET6) { + write_v4_gateway = false; + } + } } NECP_CLIENT_ROUTE_UNLOCK(client); + if (write_v4_gateway) { + cursor = necp_buffer_write_tlv_if_different(cursor, NECP_CLIENT_RESULT_GATEWAY, + sizeof(struct necp_client_endpoint), &v4_gateway, &updated, + client->result, sizeof(client->result)); + } + + if (write_v6_gateway) { + cursor = necp_buffer_write_tlv_if_different(cursor, NECP_CLIENT_RESULT_GATEWAY, + sizeof(struct necp_client_endpoint), &v6_gateway, &updated, + client->result, sizeof(client->result)); + } + if (result.mss_recommended != 0) { cursor = necp_buffer_write_tlv_if_different(cursor, NECP_CLIENT_RESULT_RECOMMENDED_MSS, sizeof(result.mss_recommended), &result.mss_recommended, &updated, @@ -3131,6 +3393,7 @@ necp_update_client_result(proc_t proc, delegate_interface = direct_interface->if_delegated.ifp; } if (result.routing_result == NECP_KERNEL_POLICY_RESULT_IP_TUNNEL && + (parsed_parameters->valid_fields & NECP_PARSED_PARAMETERS_REQUIRED_FIELDS) && parsed_parameters->required_interface_index != IFSCOPE_NONE && parsed_parameters->required_interface_index != result.routing_result_parameter.tunnel_interface_index && parsed_parameters->required_interface_index <= (u_int32_t)if_index) { @@ -3174,7 +3437,7 @@ necp_update_client_result(proc_t proc, // Get multipath interface options from ordered list struct ifnet *multi_interface = NULL; TAILQ_FOREACH(multi_interface, &ifnet_ordered_head, if_ordered_link) { - if (necp_ifnet_matches_parameters(multi_interface, parsed_parameters, NULL, true)) { + if (necp_ifnet_matches_parameters(multi_interface, parsed_parameters, 0, NULL, true, false)) { // Add multipath interface flows for kernel MPTCP necp_client_add_interface_option_if_needed(client, multi_interface->if_index, ifnet_get_generation(multi_interface), NULL); @@ -3183,14 +3446,26 @@ necp_update_client_result(proc_t proc, necp_client_add_agent_interface_options(client, parsed_parameters, multi_interface); } } - } else if ((parsed_parameters->flags & NECP_CLIENT_PARAMETER_FLAG_LISTENER) && - result.routing_result != NECP_KERNEL_POLICY_RESULT_SOCKET_SCOPED) { - // Get listener interface options from global list - struct ifnet *listen_interface = NULL; - TAILQ_FOREACH(listen_interface, &ifnet_head, if_link) { - if (necp_ifnet_matches_parameters(listen_interface, parsed_parameters, NULL, true)) { + } else if (parsed_parameters->flags & NECP_CLIENT_PARAMETER_FLAG_LISTENER) { + if (result.routing_result == NECP_KERNEL_POLICY_RESULT_SOCKET_SCOPED) { + if (direct_interface != NULL) { + // If scoped, only listen on that interface // Add nexus agents for listeners - necp_client_add_agent_interface_options(client, parsed_parameters, listen_interface); + necp_client_add_agent_interface_options(client, parsed_parameters, direct_interface); + + // Add interface option in case it is not a nexus + necp_client_add_interface_option_if_needed(client, direct_interface->if_index, + ifnet_get_generation(direct_interface), NULL); + } + } else { + // Get listener interface options from global list + struct ifnet *listen_interface = NULL; + TAILQ_FOREACH(listen_interface, &ifnet_head, if_link) { + if ((listen_interface->if_flags & (IFF_UP | IFF_RUNNING)) && + necp_ifnet_matches_parameters(listen_interface, parsed_parameters, 0, NULL, true, false)) { + // Add nexus agents for listeners + necp_client_add_agent_interface_options(client, parsed_parameters, listen_interface); + } } } } @@ -3305,17 +3580,15 @@ necp_defunct_client_fd_locked(struct necp_fd_data *client_fd, struct _necp_flow_ LIST_FOREACH(search_flow, &flow_registration->flow_list, flow_chain) { if (search_flow->nexus && !uuid_is_null(search_flow->u.nexus_agent)) { - struct necp_flow_defunct *flow_defunct; - // Sleeping alloc won't fail; copy only what's necessary - flow_defunct = _MALLOC(sizeof(struct necp_flow_defunct), M_NECP, M_WAITOK | M_ZERO); + struct necp_flow_defunct *flow_defunct = _MALLOC(sizeof(struct necp_flow_defunct), M_NECP, M_WAITOK | M_ZERO); uuid_copy(flow_defunct->nexus_agent, search_flow->u.nexus_agent); uuid_copy(flow_defunct->flow_id, ((flow_registration->flags & NECP_CLIENT_FLOW_FLAGS_USE_CLIENT_ID) ? client->client_id : flow_registration->registration_id)); flow_defunct->proc_pid = client->proc_pid; flow_defunct->agent_handle = client->agent_handle; - + flow_defunct->flags = flow_registration->flags; // Add to the list provided by caller LIST_INSERT_HEAD(defunct_list, flow_defunct, chain); @@ -3391,11 +3664,19 @@ necp_update_all_clients_callout(__unused thread_call_param_t dummy, // For each newly defunct client, send a message to the nexus to remove the flow LIST_FOREACH_SAFE(flow_defunct, &defunct_list, chain, temp_flow_defunct) { if (!uuid_is_null(flow_defunct->nexus_agent)) { - int netagent_error = netagent_client_message(flow_defunct->nexus_agent, + u_int8_t message_type = NETAGENT_MESSAGE_TYPE_ABORT_NEXUS; + if (((flow_defunct->flags & NECP_CLIENT_FLOW_FLAGS_BROWSE) || + (flow_defunct->flags & NECP_CLIENT_FLOW_FLAGS_RESOLVE)) && + !(flow_defunct->flags & NECP_CLIENT_FLOW_FLAGS_ALLOW_NEXUS)) { + message_type = NETAGENT_MESSAGE_TYPE_CLIENT_UNASSERT; + } + int netagent_error = netagent_client_message_with_params(flow_defunct->nexus_agent, flow_defunct->flow_id, flow_defunct->proc_pid, flow_defunct->agent_handle, - NETAGENT_MESSAGE_TYPE_ABORT_NEXUS); + message_type, + flow_defunct->has_close_parameters ? &flow_defunct->close_parameters : NULL, + NULL, 0); if (netagent_error != 0) { char namebuf[MAXCOMLEN + 1]; (void) strlcpy(namebuf, "unknown", sizeof(namebuf)); @@ -3412,6 +3693,12 @@ necp_update_all_clients_callout(__unused thread_call_param_t dummy, void necp_update_all_clients(void) +{ + necp_update_all_clients_immediately_if_needed(false); +} + +void +necp_update_all_clients_immediately_if_needed(bool should_update_immediately) { if (necp_client_update_tcall == NULL) { // Don't try to update clients if the module is not initialized @@ -3420,72 +3707,51 @@ necp_update_all_clients(void) uint64_t deadline = 0; uint64_t leeway = 0; - clock_interval_to_deadline(necp_timeout_microseconds, NSEC_PER_USEC, &deadline); - clock_interval_to_absolutetime_interval(necp_timeout_leeway_microseconds, NSEC_PER_USEC, &leeway); + + uint32_t timeout_to_use = necp_timeout_microseconds; + uint32_t leeway_to_use = necp_timeout_leeway_microseconds; + if (should_update_immediately) { + timeout_to_use = 1000 * 10; // 10ms + leeway_to_use = 1000 * 10; // 10ms; + } + + clock_interval_to_deadline(timeout_to_use, NSEC_PER_USEC, &deadline); + clock_interval_to_absolutetime_interval(leeway_to_use, NSEC_PER_USEC, &leeway); thread_call_enter_delayed_with_leeway(necp_client_update_tcall, NULL, deadline, leeway, THREAD_CALL_DELAY_LEEWAY); } -void +bool necp_set_client_as_background(proc_t proc, struct fileproc *fp, bool background) { - bool updated_result = FALSE; - struct necp_client *client = NULL; - if (proc == PROC_NULL) { NECPLOG0(LOG_ERR, "NULL proc"); - return; + return FALSE; } if (fp == NULL) { NECPLOG0(LOG_ERR, "NULL fp"); - return; + return FALSE; } struct necp_fd_data *client_fd = (struct necp_fd_data *)fp->f_fglob->fg_data; if (client_fd == NULL) { NECPLOG0(LOG_ERR, "Could not find client structure for backgrounded client"); - return; + return FALSE; } if (client_fd->necp_fd_type != necp_fd_type_client) { // Not a client fd, ignore NECPLOG0(LOG_ERR, "Not a client fd, ignore"); - return; + return FALSE; } - NECP_FD_LOCK(client_fd); - - RB_FOREACH(client, _necp_client_tree, &client_fd->clients) { - NECP_CLIENT_LOCK(client); + client_fd->background = background; - bool has_assigned_flow = FALSE; - struct necp_client_flow_registration *flow_registration = NULL; - struct necp_client_flow *search_flow = NULL; - RB_FOREACH(flow_registration, _necp_client_flow_tree, &client->flow_registrations) { - LIST_FOREACH(search_flow, &flow_registration->flow_list, flow_chain) { - if (search_flow->assigned) { - has_assigned_flow = TRUE; - break; - } - } - } - - if (has_assigned_flow) { - client->background = background; - client->background_update = TRUE; - updated_result = TRUE; - } - - NECP_CLIENT_UNLOCK(client); - } - if (updated_result) { - necp_update_client_fd_locked(client_fd, proc, NULL); - } - NECP_FD_UNLOCK(client_fd); + return TRUE; } void @@ -3528,11 +3794,19 @@ necp_fd_defunct(proc_t proc, struct necp_fd_data *client_fd) // For each defunct client, remove flow from the nexus LIST_FOREACH_SAFE(flow_defunct, &defunct_list, chain, temp_flow_defunct) { if (!uuid_is_null(flow_defunct->nexus_agent)) { - int netagent_error = netagent_client_message(flow_defunct->nexus_agent, + u_int8_t message_type = NETAGENT_MESSAGE_TYPE_ABORT_NEXUS; + if (((flow_defunct->flags & NECP_CLIENT_FLOW_FLAGS_BROWSE) || + (flow_defunct->flags & NECP_CLIENT_FLOW_FLAGS_RESOLVE)) && + !(flow_defunct->flags & NECP_CLIENT_FLOW_FLAGS_ALLOW_NEXUS)) { + message_type = NETAGENT_MESSAGE_TYPE_CLIENT_UNASSERT; + } + int netagent_error = netagent_client_message_with_params(flow_defunct->nexus_agent, flow_defunct->flow_id, flow_defunct->proc_pid, flow_defunct->agent_handle, - NETAGENT_MESSAGE_TYPE_ABORT_NEXUS); + message_type, + flow_defunct->has_close_parameters ? &flow_defunct->close_parameters : NULL, + NULL, 0); if (netagent_error != 0) { NECPLOG((netagent_error == ENOENT ? LOG_DEBUG : LOG_ERR), "necp_defunct_client abort nexus error (%d)", netagent_error); } @@ -3632,10 +3906,8 @@ necp_force_update_client(uuid_t client_id, uuid_t remove_netagent_uuid, u_int32_ NECP_PARSED_PARAMETERS_FIELD_REQUIRED_IFTYPE | \ NECP_PARSED_PARAMETERS_FIELD_REQUIRED_AGENT | \ NECP_PARSED_PARAMETERS_FIELD_PREFERRED_AGENT | \ - NECP_PARSED_PARAMETERS_FIELD_AVOIDED_AGENT | \ NECP_PARSED_PARAMETERS_FIELD_REQUIRED_AGENT_TYPE | \ - NECP_PARSED_PARAMETERS_FIELD_PREFERRED_AGENT_TYPE | \ - NECP_PARSED_PARAMETERS_FIELD_AVOIDED_AGENT_TYPE) + NECP_PARSED_PARAMETERS_FIELD_PREFERRED_AGENT_TYPE) #define NECP_PARSED_PARAMETERS_SCOPED_IFNET_FIELDS (NECP_PARSED_PARAMETERS_FIELD_LOCAL_ADDR | \ NECP_PARSED_PARAMETERS_FIELD_REQUIRED_IFTYPE) @@ -3785,23 +4057,51 @@ necp_interface_type_is_primary_eligible(u_int8_t interface_type) static bool necp_ifnet_matches_parameters(struct ifnet *ifp, struct necp_client_parsed_parameters *parsed_parameters, + u_int32_t override_flags, u_int32_t *preferred_count, - bool secondary_interface) + bool secondary_interface, + bool require_scoped_field) { + bool matched_some_scoped_field = FALSE; + if (preferred_count) { *preferred_count = 0; } + if (parsed_parameters->valid_fields & NECP_PARSED_PARAMETERS_FIELD_REQUIRED_IF) { + if (parsed_parameters->required_interface_index != ifp->if_index) { + return FALSE; + } + } + if (parsed_parameters->valid_fields & NECP_PARSED_PARAMETERS_FIELD_LOCAL_ADDR) { if (!necp_ifnet_matches_local_address(ifp, &parsed_parameters->local_addr.sa)) { return FALSE; } + if (require_scoped_field) { + matched_some_scoped_field = TRUE; + } } if (parsed_parameters->valid_fields & NECP_PARSED_PARAMETERS_FIELD_FLAGS) { - if ((parsed_parameters->flags & NECP_CLIENT_PARAMETER_FLAG_PROHIBIT_EXPENSIVE) && - IFNET_IS_EXPENSIVE(ifp)) { - return FALSE; + if (override_flags != 0) { + if ((override_flags & NECP_CLIENT_PARAMETER_FLAG_PROHIBIT_EXPENSIVE) && + IFNET_IS_EXPENSIVE(ifp)) { + return FALSE; + } + if ((override_flags & NECP_CLIENT_PARAMETER_FLAG_PROHIBIT_CONSTRAINED) && + IFNET_IS_CONSTRAINED(ifp)) { + return FALSE; + } + } else { + if ((parsed_parameters->flags & NECP_CLIENT_PARAMETER_FLAG_PROHIBIT_EXPENSIVE) && + IFNET_IS_EXPENSIVE(ifp)) { + return FALSE; + } + if ((parsed_parameters->flags & NECP_CLIENT_PARAMETER_FLAG_PROHIBIT_CONSTRAINED) && + IFNET_IS_CONSTRAINED(ifp)) { + return FALSE; + } } } @@ -3813,8 +4113,14 @@ necp_ifnet_matches_parameters(struct ifnet *ifp, return FALSE; } + if (parsed_parameters->valid_fields & NECP_PARSED_PARAMETERS_FIELD_REQUIRED_IFTYPE) { + if (require_scoped_field) { + matched_some_scoped_field = TRUE; + } + } + if (parsed_parameters->valid_fields & NECP_PARSED_PARAMETERS_FIELD_PROHIBITED_IFTYPE) { - for (int i = 0; i < NECP_MAX_PARSED_PARAMETERS; i++) { + for (int i = 0; i < NECP_MAX_INTERFACE_PARAMETERS; i++) { if (parsed_parameters->prohibited_interface_types[i] == 0) { break; } @@ -3826,7 +4132,7 @@ necp_ifnet_matches_parameters(struct ifnet *ifp, } if (parsed_parameters->valid_fields & NECP_PARSED_PARAMETERS_FIELD_PROHIBITED_IF) { - for (int i = 0; i < NECP_MAX_PARSED_PARAMETERS; i++) { + for (int i = 0; i < NECP_MAX_INTERFACE_PARAMETERS; i++) { if (strlen(parsed_parameters->prohibited_interfaces[i]) == 0) { break; } @@ -3838,7 +4144,7 @@ necp_ifnet_matches_parameters(struct ifnet *ifp, } if (parsed_parameters->valid_fields & NECP_PARSED_PARAMETERS_FIELD_REQUIRED_AGENT) { - for (int i = 0; i < NECP_MAX_PARSED_PARAMETERS; i++) { + for (int i = 0; i < NECP_MAX_AGENT_PARAMETERS; i++) { if (uuid_is_null(parsed_parameters->required_netagents[i])) { break; } @@ -3846,11 +4152,15 @@ necp_ifnet_matches_parameters(struct ifnet *ifp, if (!necp_ifnet_matches_agent(ifp, &parsed_parameters->required_netagents[i], FALSE)) { return FALSE; } + + if (require_scoped_field) { + matched_some_scoped_field = TRUE; + } } } if (parsed_parameters->valid_fields & NECP_PARSED_PARAMETERS_FIELD_PROHIBITED_AGENT) { - for (int i = 0; i < NECP_MAX_PARSED_PARAMETERS; i++) { + for (int i = 0; i < NECP_MAX_AGENT_PARAMETERS; i++) { if (uuid_is_null(parsed_parameters->prohibited_netagents[i])) { break; } @@ -3862,7 +4172,7 @@ necp_ifnet_matches_parameters(struct ifnet *ifp, } if (parsed_parameters->valid_fields & NECP_PARSED_PARAMETERS_FIELD_REQUIRED_AGENT_TYPE) { - for (int i = 0; i < NECP_MAX_PARSED_PARAMETERS; i++) { + for (int i = 0; i < NECP_MAX_AGENT_PARAMETERS; i++) { if (strlen(parsed_parameters->required_netagent_types[i].netagent_domain) == 0 && strlen(parsed_parameters->required_netagent_types[i].netagent_type) == 0) { break; @@ -3871,11 +4181,15 @@ necp_ifnet_matches_parameters(struct ifnet *ifp, if (!necp_ifnet_matches_agent_type(ifp, parsed_parameters->required_netagent_types[i].netagent_domain, parsed_parameters->required_netagent_types[i].netagent_type, FALSE)) { return FALSE; } + + if (require_scoped_field) { + matched_some_scoped_field = TRUE; + } } } if (parsed_parameters->valid_fields & NECP_PARSED_PARAMETERS_FIELD_PROHIBITED_AGENT_TYPE) { - for (int i = 0; i < NECP_MAX_PARSED_PARAMETERS; i++) { + for (int i = 0; i < NECP_MAX_AGENT_PARAMETERS; i++) { if (strlen(parsed_parameters->prohibited_netagent_types[i].netagent_domain) == 0 && strlen(parsed_parameters->prohibited_netagent_types[i].netagent_type) == 0) { break; @@ -3890,19 +4204,22 @@ necp_ifnet_matches_parameters(struct ifnet *ifp, // Checked preferred properties if (preferred_count) { if (parsed_parameters->valid_fields & NECP_PARSED_PARAMETERS_FIELD_PREFERRED_AGENT) { - for (int i = 0; i < NECP_MAX_PARSED_PARAMETERS; i++) { + for (int i = 0; i < NECP_MAX_AGENT_PARAMETERS; i++) { if (uuid_is_null(parsed_parameters->preferred_netagents[i])) { break; } if (necp_ifnet_matches_agent(ifp, &parsed_parameters->preferred_netagents[i], TRUE)) { (*preferred_count)++; + if (require_scoped_field) { + matched_some_scoped_field = TRUE; + } } } } if (parsed_parameters->valid_fields & NECP_PARSED_PARAMETERS_FIELD_PREFERRED_AGENT_TYPE) { - for (int i = 0; i < NECP_MAX_PARSED_PARAMETERS; i++) { + for (int i = 0; i < NECP_MAX_AGENT_PARAMETERS; i++) { if (strlen(parsed_parameters->preferred_netagent_types[i].netagent_domain) == 0 && strlen(parsed_parameters->preferred_netagent_types[i].netagent_type) == 0) { break; @@ -3910,12 +4227,15 @@ necp_ifnet_matches_parameters(struct ifnet *ifp, if (necp_ifnet_matches_agent_type(ifp, parsed_parameters->preferred_netagent_types[i].netagent_domain, parsed_parameters->preferred_netagent_types[i].netagent_type, TRUE)) { (*preferred_count)++; + if (require_scoped_field) { + matched_some_scoped_field = TRUE; + } } } } if (parsed_parameters->valid_fields & NECP_PARSED_PARAMETERS_FIELD_AVOIDED_AGENT) { - for (int i = 0; i < NECP_MAX_PARSED_PARAMETERS; i++) { + for (int i = 0; i < NECP_MAX_AGENT_PARAMETERS; i++) { if (uuid_is_null(parsed_parameters->avoided_netagents[i])) { break; } @@ -3927,7 +4247,7 @@ necp_ifnet_matches_parameters(struct ifnet *ifp, } if (parsed_parameters->valid_fields & NECP_PARSED_PARAMETERS_FIELD_AVOIDED_AGENT_TYPE) { - for (int i = 0; i < NECP_MAX_PARSED_PARAMETERS; i++) { + for (int i = 0; i < NECP_MAX_AGENT_PARAMETERS; i++) { if (strlen(parsed_parameters->avoided_netagent_types[i].netagent_domain) == 0 && strlen(parsed_parameters->avoided_netagent_types[i].netagent_type) == 0) { break; @@ -3941,6 +4261,10 @@ necp_ifnet_matches_parameters(struct ifnet *ifp, } } + if (require_scoped_field) { + return matched_some_scoped_field; + } + return TRUE; } @@ -3958,7 +4282,18 @@ necp_find_matching_interface_index(struct necp_client_parsed_parameters *parsed_ return TRUE; } - if (!(parsed_parameters->valid_fields & NECP_PARSED_PARAMETERS_INTERESTING_IFNET_FIELDS)) { + // Check and save off flags + u_int32_t flags = 0; + bool has_prohibit_flags = FALSE; + if (parsed_parameters->valid_fields & NECP_PARSED_PARAMETERS_FIELD_FLAGS) { + flags = parsed_parameters->flags; + has_prohibit_flags = (parsed_parameters->flags & + (NECP_CLIENT_PARAMETER_FLAG_PROHIBIT_EXPENSIVE | + NECP_CLIENT_PARAMETER_FLAG_PROHIBIT_CONSTRAINED)); + } + + if (!(parsed_parameters->valid_fields & NECP_PARSED_PARAMETERS_INTERESTING_IFNET_FIELDS) && + !has_prohibit_flags) { return TRUE; } @@ -3967,11 +4302,12 @@ necp_find_matching_interface_index(struct necp_client_parsed_parameters *parsed_ // We have interesting parameters to parse and find a matching interface ifnet_head_lock_shared(); - if (!(parsed_parameters->valid_fields & NECP_PARSED_PARAMETERS_SCOPED_FIELDS)) { + if (!(parsed_parameters->valid_fields & NECP_PARSED_PARAMETERS_SCOPED_FIELDS) && + !has_preferred_fields) { // We do have fields to match, but they are only prohibitory // If the first interface in the list matches, or there are no ordered interfaces, we don't need to scope ifp = TAILQ_FIRST(&ifnet_ordered_head); - if (ifp == NULL || necp_ifnet_matches_parameters(ifp, parsed_parameters, NULL, false)) { + if (ifp == NULL || necp_ifnet_matches_parameters(ifp, parsed_parameters, 0, NULL, false, false)) { // Don't set return_ifindex, so the client doesn't need to scope ifnet_head_done(); return TRUE; @@ -3981,7 +4317,7 @@ necp_find_matching_interface_index(struct necp_client_parsed_parameters *parsed_ // First check the ordered interface list TAILQ_FOREACH(ifp, &ifnet_ordered_head, if_ordered_link) { u_int32_t preferred_count = 0; - if (necp_ifnet_matches_parameters(ifp, parsed_parameters, &preferred_count, false)) { + if (necp_ifnet_matches_parameters(ifp, parsed_parameters, flags, &preferred_count, false, false)) { if (preferred_count > best_preferred_count || *return_ifindex == 0) { // Everything matched, and is most preferred. Return this interface. @@ -3993,20 +4329,34 @@ necp_find_matching_interface_index(struct necp_client_parsed_parameters *parsed_ } } } + + if (has_prohibit_flags && + ifp == TAILQ_FIRST(&ifnet_ordered_head)) { + // This was the first interface. From here on, if the + // client prohibited either expensive or constrained, + // don't allow either as a secondary interface option. + flags |= (NECP_CLIENT_PARAMETER_FLAG_PROHIBIT_EXPENSIVE | + NECP_CLIENT_PARAMETER_FLAG_PROHIBIT_CONSTRAINED); + } } + bool is_listener = ((parsed_parameters->valid_fields & NECP_PARSED_PARAMETERS_FIELD_FLAGS) && + (parsed_parameters->flags & NECP_CLIENT_PARAMETER_FLAG_LISTENER)); + // Then check the remaining interfaces if ((parsed_parameters->valid_fields & NECP_PARSED_PARAMETERS_SCOPED_FIELDS) && ((!(parsed_parameters->valid_fields & NECP_PARSED_PARAMETERS_FIELD_REQUIRED_IFTYPE)) || - !necp_interface_type_is_primary_eligible(parsed_parameters->required_interface_type)) && - *return_ifindex == 0) { + !necp_interface_type_is_primary_eligible(parsed_parameters->required_interface_type) || + (parsed_parameters->valid_fields & NECP_PARSED_PARAMETERS_FIELD_LOCAL_ADDR) || + is_listener) && + (*return_ifindex == 0 || has_preferred_fields)) { TAILQ_FOREACH(ifp, &ifnet_head, if_link) { u_int32_t preferred_count = 0; if (NECP_IFP_IS_ON_ORDERED_LIST(ifp)) { // This interface was in the ordered list, skip continue; } - if (necp_ifnet_matches_parameters(ifp, parsed_parameters, &preferred_count, false)) { + if (necp_ifnet_matches_parameters(ifp, parsed_parameters, flags, &preferred_count, false, true)) { if (preferred_count > best_preferred_count || *return_ifindex == 0) { // Everything matched, and is most preferred. Return this interface. @@ -4143,7 +4493,12 @@ done: return error; } -static int +// All functions called directly from necp_client_action() to handle one of the +// types should be marked with NECP_CLIENT_ACTION_FUNCTION. This ensures that +// necp_client_action() does not inline all the actions into a single function. +#define NECP_CLIENT_ACTION_FUNCTION __attribute__((noinline)) + +static NECP_CLIENT_ACTION_FUNCTION int necp_client_add(struct proc *p, struct necp_fd_data *fd_data, struct necp_client_action_args *uap, int *retval) { int error = 0; @@ -4173,7 +4528,8 @@ necp_client_add(struct proc *p, struct necp_fd_data *fd_data, struct necp_client lck_mtx_init(&client->lock, necp_fd_mtx_grp, necp_fd_mtx_attr); lck_mtx_init(&client->route_lock, necp_fd_mtx_grp, necp_fd_mtx_attr); - necp_client_retain(client); // Hold our reference until close + + os_ref_init(&client->reference_count, &necp_client_refgrp); // Hold our reference until close client->parameters_length = uap->buffer_size; client->proc_pid = fd_data->proc_pid; // Save off proc pid in case the client will persist past fd @@ -4190,6 +4546,7 @@ necp_client_add(struct proc *p, struct necp_fd_data *fd_data, struct necp_client goto done; } + necp_client_update_observer_add(client); NECP_FD_LOCK(fd_data); @@ -4216,7 +4573,74 @@ done: return error; } -static int +static NECP_CLIENT_ACTION_FUNCTION int +necp_client_claim(struct proc *p, struct necp_fd_data *fd_data, struct necp_client_action_args *uap, int *retval) +{ + int error = 0; + uuid_t client_id = {}; + struct necp_client *client = NULL; + + if (uap->client_id == 0 || uap->client_id_len != sizeof(uuid_t)) { + error = EINVAL; + goto done; + } + + error = copyin(uap->client_id, client_id, sizeof(uuid_t)); + if (error) { + NECPLOG(LOG_ERR, "necp_client_claim copyin client_id error (%d)", error); + goto done; + } + + u_int64_t upid = proc_uniqueid(p); + + NECP_FD_LIST_LOCK_SHARED(); + + struct necp_fd_data *find_fd = NULL; + LIST_FOREACH(find_fd, &necp_fd_list, chain) { + NECP_FD_LOCK(find_fd); + struct necp_client *find_client = necp_client_fd_find_client_and_lock(find_fd, client_id); + if (find_client != NULL) { + if (find_client->delegated_upid == upid) { + // Matched the client to claim; remove from the old fd + client = find_client; + RB_REMOVE(_necp_client_tree, &find_fd->clients, client); + necp_client_retain_locked(client); + } + NECP_CLIENT_UNLOCK(find_client); + } + NECP_FD_UNLOCK(find_fd); + + if (client != NULL) { + break; + } + } + + NECP_FD_LIST_UNLOCK(); + + if (client == NULL) { + error = ENOENT; + goto done; + } + + client->proc_pid = fd_data->proc_pid; // Transfer client to claiming pid + + // Add matched client to our fd and re-run result + NECP_FD_LOCK(fd_data); + RB_INSERT(_necp_client_tree, &fd_data->clients, client); + NECP_CLIENT_LOCK(client); + (void)necp_update_client_result(current_proc(), fd_data, client, NULL); + NECP_CLIENT_UNLOCK(client); + NECP_FD_UNLOCK(fd_data); + + necp_client_release(client); + +done: + *retval = error; + + return error; +} + +static NECP_CLIENT_ACTION_FUNCTION int necp_client_remove(struct necp_fd_data *fd_data, struct necp_client_action_args *uap, int *retval) { int error = 0; @@ -4286,7 +4710,8 @@ done: } -static int +// Don't inline the function since it includes necp_client_parsed_parameters on the stack +static __attribute__((noinline)) int necp_client_check_tcp_heuristics(struct necp_client *client, struct necp_client_flow *flow, u_int32_t *flags, u_int8_t *tfo_cookie, u_int8_t *tfo_cookie_len) { struct necp_client_parsed_parameters parsed_parameters; @@ -4656,7 +5081,7 @@ necp_client_copy_internal(struct necp_client *client, uuid_t client_id, bool cli return 0; } -static int +static NECP_CLIENT_ACTION_FUNCTION int necp_client_copy(struct necp_fd_data *fd_data, struct necp_client_action_args *uap, int *retval) { int error = 0; @@ -4752,7 +5177,7 @@ necp_client_copy(struct necp_fd_data *fd_data, struct necp_client_action_args *u return error; } -static int +static NECP_CLIENT_ACTION_FUNCTION int necp_client_copy_client_update(struct necp_fd_data *fd_data, struct necp_client_action_args *uap, int *retval) { int error = 0; @@ -4828,9 +5253,17 @@ necp_client_copy_parameters_locked(struct necp_client *client, memcpy(¶meters->local_addr, &parsed_parameters.local_addr, sizeof(parameters->local_addr)); memcpy(¶meters->remote_addr, &parsed_parameters.remote_addr, sizeof(parameters->remote_addr)); parameters->ip_protocol = parsed_parameters.ip_protocol; + if (parsed_parameters.valid_fields & NECP_PARSED_PARAMETERS_FIELD_TRANSPORT_PROTOCOL) { + parameters->transport_protocol = parsed_parameters.transport_protocol; + } else { + parameters->transport_protocol = parsed_parameters.ip_protocol; + } + parameters->ethertype = parsed_parameters.ethertype; parameters->traffic_class = parsed_parameters.traffic_class; uuid_copy(parameters->euuid, parsed_parameters.effective_uuid); parameters->is_listener = (parsed_parameters.flags & NECP_CLIENT_PARAMETER_FLAG_LISTENER) ? 1 : 0; + parameters->is_interpose = (parsed_parameters.flags & NECP_CLIENT_PARAMETER_FLAG_INTERPOSE) ? 1 : 0; + parameters->is_custom_ether = (parsed_parameters.flags & NECP_CLIENT_PARAMETER_FLAG_CUSTOM_ETHER) ? 1 : 0; parameters->policy_id = client->policy_id; // parse client result flag @@ -4843,10 +5276,24 @@ necp_client_copy_parameters_locked(struct necp_client *client, } parameters->allow_qos_marking = (client_result_flags & NECP_CLIENT_RESULT_FLAG_ALLOW_QOS_MARKING) ? 1 : 0; + if (parsed_parameters.valid_fields & NECP_PARSED_PARAMETERS_FIELD_LOCAL_ADDR_PREFERENCE) { + if (parsed_parameters.local_address_preference == NECP_CLIENT_PARAMETER_LOCAL_ADDRESS_PREFERENCE_DEFAULT) { + parameters->override_address_selection = false; + } else if (parsed_parameters.local_address_preference == NECP_CLIENT_PARAMETER_LOCAL_ADDRESS_PREFERENCE_TEMPORARY) { + parameters->override_address_selection = true; + parameters->use_stable_address = false; + } else if (parsed_parameters.local_address_preference == NECP_CLIENT_PARAMETER_LOCAL_ADDRESS_PREFERENCE_STABLE) { + parameters->override_address_selection = true; + parameters->use_stable_address = true; + } + } else { + parameters->override_address_selection = false; + } + return error; } -static int +static NECP_CLIENT_ACTION_FUNCTION int necp_client_list(struct necp_fd_data *fd_data, struct necp_client_action_args *uap, int *retval) { int error = 0; @@ -4975,7 +5422,7 @@ necp_client_remove_assertion(struct necp_client *client, uuid_t netagent_uuid) return true; } -static int +static NECP_CLIENT_ACTION_FUNCTION int necp_client_agent_action(struct necp_fd_data *fd_data, struct necp_client_action_args *uap, int *retval) { int error = 0; @@ -5064,7 +5511,7 @@ necp_client_agent_action(struct necp_fd_data *fd_data, struct necp_client_action fd_data->proc_pid, client->agent_handle, netagent_message_type, - &parsed_parameters, + (struct necp_client_agent_parameters *)&parsed_parameters, NULL, NULL); if (error == 0) { acted_on_agent = TRUE; @@ -5100,7 +5547,7 @@ done: return error; } -static int +static NECP_CLIENT_ACTION_FUNCTION int necp_client_copy_agent(__unused struct necp_fd_data *fd_data, struct necp_client_action_args *uap, int *retval) { int error = 0; @@ -5130,7 +5577,7 @@ done: return error; } -static int +static NECP_CLIENT_ACTION_FUNCTION int necp_client_agent_use(struct necp_fd_data *fd_data, struct necp_client_action_args *uap, int *retval) { int error = 0; @@ -5181,15 +5628,28 @@ done: return error; } -static int +struct necp_interface_details_legacy { + char name[IFXNAMSIZ]; + u_int32_t index; + u_int32_t generation; + u_int32_t functional_type; + u_int32_t delegate_index; + u_int32_t flags; // see NECP_INTERFACE_FLAG_* + u_int32_t mtu; + struct necp_interface_signature ipv4_signature; + struct necp_interface_signature ipv6_signature; +}; + +static NECP_CLIENT_ACTION_FUNCTION int necp_client_copy_interface(__unused struct necp_fd_data *fd_data, struct necp_client_action_args *uap, int *retval) { int error = 0; u_int32_t interface_index = 0; - struct necp_interface_details interface_details; + struct necp_interface_details interface_details = {}; if (uap->client_id == 0 || uap->client_id_len != sizeof(u_int32_t) || - uap->buffer_size < sizeof(interface_details) || uap->buffer == 0) { + uap->buffer_size < sizeof(struct necp_interface_details_legacy) || + uap->buffer == 0) { NECPLOG0(LOG_ERR, "necp_client_copy_interface bad input"); error = EINVAL; goto done; @@ -5207,8 +5667,6 @@ necp_client_copy_interface(__unused struct necp_fd_data *fd_data, struct necp_cl goto done; } - memset(&interface_details, 0, sizeof(interface_details)); - ifnet_head_lock_shared(); ifnet_t interface = NULL; if (interface_index != IFSCOPE_NONE && interface_index <= (u_int32_t)if_index) { @@ -5228,6 +5686,9 @@ necp_client_copy_interface(__unused struct necp_fd_data *fd_data, struct necp_cl if (IFNET_IS_EXPENSIVE(interface)) { interface_details.flags |= NECP_INTERFACE_FLAG_EXPENSIVE; } + if (IFNET_IS_CONSTRAINED(interface)) { + interface_details.flags |= NECP_INTERFACE_FLAG_CONSTRAINED; + } if ((interface->if_eflags & IFEF_TXSTART) == IFEF_TXSTART) { interface_details.flags |= NECP_INTERFACE_FLAG_TXSTART; } @@ -5240,6 +5701,9 @@ necp_client_copy_interface(__unused struct necp_fd_data *fd_data, struct necp_cl if (IFNET_IS_LOW_POWER(interface)) { interface_details.flags |= NECP_INTERFACE_FLAG_IS_LOW_POWER; } + if (interface->if_xflags & IFXF_MPK_LOG) { + interface_details.flags |= NECP_INTERFACE_FLAG_MPK_LOG; + } interface_details.mtu = interface->if_mtu; u_int8_t ipv4_signature_len = sizeof(interface_details.ipv4_signature.signature); @@ -5257,11 +5721,32 @@ necp_client_copy_interface(__unused struct necp_fd_data *fd_data, struct necp_cl ipv6_signature_len = 0; } interface_details.ipv6_signature.signature_len = ipv6_signature_len; + + ifnet_lock_shared(interface); + struct ifaddr *ifa = NULL; + TAILQ_FOREACH(ifa, &interface->if_addrhead, ifa_link) { + IFA_LOCK(ifa); + if (ifa->ifa_addr->sa_family == AF_INET) { + interface_details.flags |= NECP_INTERFACE_FLAG_HAS_NETMASK; + interface_details.ipv4_netmask = ((struct in_ifaddr *)ifa)->ia_sockmask.sin_addr.s_addr; + if (interface->if_flags & IFF_BROADCAST) { + interface_details.flags |= NECP_INTERFACE_FLAG_HAS_BROADCAST; + interface_details.ipv4_broadcast = ((struct in_ifaddr *)ifa)->ia_broadaddr.sin_addr.s_addr; + } + } + IFA_UNLOCK(ifa); + } + ifnet_lock_done(interface); } ifnet_head_done(); - error = copyout(&interface_details, uap->buffer, sizeof(interface_details)); + // If the client is using an older version of the struct, copy that length + size_t copy_length = sizeof(interface_details); + if (uap->buffer_size < sizeof(struct necp_interface_details_legacy)) { + copy_length = sizeof(struct necp_interface_details_legacy); + } + error = copyout(&interface_details, uap->buffer, copy_length); if (error) { NECPLOG(LOG_ERR, "necp_client_copy_interface copyout error (%d)", error); goto done; @@ -5273,7 +5758,7 @@ done: } -static int +static NECP_CLIENT_ACTION_FUNCTION int necp_client_copy_route_statistics(__unused struct necp_fd_data *fd_data, struct necp_client_action_args *uap, int *retval) { int error = 0; @@ -5337,7 +5822,7 @@ done: return error; } -static int +static NECP_CLIENT_ACTION_FUNCTION int necp_client_update_cache(struct necp_fd_data *fd_data, struct necp_client_action_args *uap, int *retval) { int error = 0; @@ -5460,6 +5945,119 @@ done: return error; } +#define NECP_CLIENT_ACTION_SIGN_DEFAULT_HOSTNAME_LENGTH 64 +#define NECP_CLIENT_ACTION_SIGN_MAX_HOSTNAME_LENGTH 1024 + +#define NECP_CLIENT_ACTION_SIGN_TAG_LENGTH 32 + +static NECP_CLIENT_ACTION_FUNCTION int +necp_client_sign(__unused struct necp_fd_data *fd_data, struct necp_client_action_args *uap, int *retval) +{ + int error = 0; + u_int32_t hostname_length = 0; + u_int8_t tag[NECP_CLIENT_ACTION_SIGN_TAG_LENGTH] = {}; + struct necp_client_signable signable = {}; + union necp_sockaddr_union address_answer = {}; + u_int8_t *client_hostname = NULL; + u_int8_t *allocated_hostname = NULL; + u_int8_t default_hostname[NECP_CLIENT_ACTION_SIGN_DEFAULT_HOSTNAME_LENGTH] = ""; + uint32_t tag_size = sizeof(tag); + + *retval = 0; + + const bool has_resolver_entitlement = (priv_check_cred(kauth_cred_get(), PRIV_NET_VALIDATED_RESOLVER, 0) == 0); + if (!has_resolver_entitlement) { + NECPLOG0(LOG_ERR, "Process does not hold the necessary entitlement to sign resolver answers"); + error = EPERM; + goto done; + } + + if (uap->client_id == 0 || uap->client_id_len < sizeof(struct necp_client_signable)) { + error = EINVAL; + goto done; + } + + if (uap->buffer == 0 || uap->buffer_size != NECP_CLIENT_ACTION_SIGN_TAG_LENGTH) { + error = EINVAL; + goto done; + } + + error = copyin(uap->client_id, &signable, sizeof(signable)); + if (error) { + NECPLOG(LOG_ERR, "necp_client_sign copyin signable error (%d)", error); + goto done; + } + + if (signable.sign_type != NECP_CLIENT_SIGN_TYPE_RESOLVER_ANSWER) { + NECPLOG(LOG_ERR, "necp_client_sign unknown signable type (%u)", signable.sign_type); + error = EINVAL; + goto done; + } + + if (uap->client_id_len < sizeof(struct necp_client_resolver_answer)) { + error = EINVAL; + goto done; + } + + error = copyin(uap->client_id + sizeof(signable), &address_answer, sizeof(address_answer)); + if (error) { + NECPLOG(LOG_ERR, "necp_client_sign copyin address_answer error (%d)", error); + goto done; + } + + error = copyin(uap->client_id + sizeof(signable) + sizeof(address_answer), &hostname_length, sizeof(hostname_length)); + if (error) { + NECPLOG(LOG_ERR, "necp_client_sign copyin hostname_length error (%d)", error); + goto done; + } + + if (hostname_length > NECP_CLIENT_ACTION_SIGN_MAX_HOSTNAME_LENGTH) { + error = EINVAL; + goto done; + } + + if (hostname_length > NECP_CLIENT_ACTION_SIGN_DEFAULT_HOSTNAME_LENGTH) { + if ((allocated_hostname = _MALLOC(hostname_length, M_NECP, M_WAITOK | M_ZERO)) == NULL) { + NECPLOG(LOG_ERR, "necp_client_sign malloc hostname %u failed", hostname_length); + error = ENOMEM; + goto done; + } + + client_hostname = allocated_hostname; + } else { + client_hostname = default_hostname; + } + + error = copyin(uap->client_id + sizeof(signable) + sizeof(address_answer) + sizeof(hostname_length), client_hostname, hostname_length); + if (error) { + NECPLOG(LOG_ERR, "necp_client_sign copyin hostname error (%d)", error); + goto done; + } + + address_answer.sin.sin_port = 0; + error = necp_sign_resolver_answer(signable.client_id, client_hostname, hostname_length, + (u_int8_t *)&address_answer, sizeof(address_answer), + tag, &tag_size); + if (tag_size != sizeof(tag)) { + NECPLOG(LOG_ERR, "necp_client_sign unexpected tag size %u", tag_size); + error = EINVAL; + goto done; + } + error = copyout(tag, uap->buffer, tag_size); + if (error) { + NECPLOG(LOG_ERR, "necp_client_sign copyout error (%d)", error); + goto done; + } + +done: + if (allocated_hostname != NULL) { + FREE(allocated_hostname, M_NECP); + allocated_hostname = NULL; + } + *retval = error; + return error; +} + int necp_client_action(struct proc *p, struct necp_client_action_args *uap, int *retval) { @@ -5479,6 +6077,10 @@ necp_client_action(struct proc *p, struct necp_client_action_args *uap, int *ret return_value = necp_client_add(p, fd_data, uap, retval); break; } + case NECP_CLIENT_ACTION_CLAIM: { + return_value = necp_client_claim(p, fd_data, uap, retval); + break; + } case NECP_CLIENT_ACTION_REMOVE: { return_value = necp_client_remove(fd_data, uap, retval); break; @@ -5521,6 +6123,10 @@ necp_client_action(struct proc *p, struct necp_client_action_args *uap, int *ret return_value = necp_client_copy_client_update(fd_data, uap, retval); break; } + case NECP_CLIENT_ACTION_SIGN: { + return_value = necp_client_sign(fd_data, uap, retval); + break; + } default: { NECPLOG(LOG_ERR, "necp_client_action unknown action (%u)", action); return_value = EINVAL; @@ -5565,7 +6171,7 @@ necp_match_policy(struct proc *p, struct necp_match_policy_args *uap, int32_t *r } error = necp_application_find_policy_match_internal(p, parameters, uap->parameters_size, - &returned_result, NULL, 0, NULL, NULL, NULL, false); + &returned_result, NULL, NULL, 0, NULL, NULL, NULL, NULL, NULL, false, false); if (error) { goto done; } @@ -5594,7 +6200,7 @@ necp_set_socket_attribute(u_int8_t *buffer, size_t buffer_length, u_int8_t type, char *local_string = NULL; u_int8_t *value = NULL; - cursor = necp_buffer_find_tlv(buffer, buffer_length, 0, type, 0); + cursor = necp_buffer_find_tlv(buffer, buffer_length, 0, type, NULL, 0); if (cursor < 0) { // This will clear out the parameter goto done; @@ -5752,7 +6358,7 @@ done: void * necp_create_nexus_assign_message(uuid_t nexus_instance, u_int32_t nexus_port, void *key, uint32_t key_length, - struct necp_client_endpoint *local_endpoint, struct necp_client_endpoint *remote_endpoint, + struct necp_client_endpoint *local_endpoint, struct necp_client_endpoint *remote_endpoint, struct ether_addr *local_ether_addr, u_int32_t flow_adv_index, void *flow_stats, size_t *message_length) { u_int8_t *buffer = NULL; @@ -5760,7 +6366,6 @@ necp_create_nexus_assign_message(uuid_t nexus_instance, u_int32_t nexus_port, vo size_t valsize = 0; bool has_nexus_assignment = FALSE; - if (!uuid_is_null(nexus_instance)) { has_nexus_assignment = TRUE; valsize += sizeof(struct necp_tlv_header) + sizeof(uuid_t); @@ -5778,6 +6383,9 @@ necp_create_nexus_assign_message(uuid_t nexus_instance, u_int32_t nexus_port, vo if (remote_endpoint != NULL) { valsize += sizeof(struct necp_tlv_header) + sizeof(struct necp_client_endpoint); } + if (local_ether_addr != NULL) { + valsize += sizeof(struct necp_tlv_header) + sizeof(struct ether_addr); + } if (flow_stats != NULL) { valsize += sizeof(struct necp_tlv_header) + sizeof(void *); } @@ -5807,6 +6415,9 @@ necp_create_nexus_assign_message(uuid_t nexus_instance, u_int32_t nexus_port, vo if (remote_endpoint != NULL) { cursor = necp_buffer_write_tlv(cursor, NECP_CLIENT_RESULT_REMOTE_ENDPOINT, sizeof(struct necp_client_endpoint), remote_endpoint, buffer, valsize); } + if (local_ether_addr != NULL) { + cursor = necp_buffer_write_tlv(cursor, NECP_CLIENT_RESULT_LOCAL_ETHER_ADDR, sizeof(struct ether_addr), local_ether_addr, buffer, valsize); + } if (flow_stats != NULL) { cursor = necp_buffer_write_tlv(cursor, NECP_CLIENT_RESULT_NEXUS_FLOW_STATS, sizeof(void *), &flow_stats, buffer, valsize); } diff --git a/bsd/net/net_log_common.h b/bsd/net/net_log_common.h new file mode 100644 index 000000000..d6b8a19de --- /dev/null +++ b/bsd/net/net_log_common.h @@ -0,0 +1,39 @@ +/* + * Copyright (c) 2019 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#ifndef _NET_LOG_COMMON_H_ +#define _NET_LOG_COMMON_H_ + +#include + +#define NET_LOG_SUBSYSTEM_PREFIX "com.apple.xnu.net" + +#define NET_LOG_SUBSYSTEM_MPTCP NET_LOG_SUBSYSTEM_PREFIX ".mptcp" +#define NET_LOG_SUBSYSTEM_TCP NET_LOG_SUBSYSTEM_PREFIX ".tcp" + +#endif /* _NET_LOG_COMMON_H_ */ diff --git a/bsd/net/net_str_id.c b/bsd/net/net_str_id.c index 26f008ade..637006974 100644 --- a/bsd/net/net_str_id.c +++ b/bsd/net/net_str_id.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2008,2011 Apple Inc. All rights reserved. + * Copyright (c) 2008,2011,2019 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -42,7 +42,7 @@ #include "net/net_str_id.h" #define NET_ID_STR_ENTRY_SIZE(__str) \ - ((size_t)&(((struct net_str_id_entry*)0)->nsi_string[0]) + \ + (__builtin_offsetof(struct net_str_id_entry, nsi_string[0]) + \ strlen(__str) + 1) #define FIRST_NET_STR_ID 1000 diff --git a/bsd/net/net_stubs.c b/bsd/net/net_stubs.c index 27c2f1e03..169575c24 100644 --- a/bsd/net/net_stubs.c +++ b/bsd/net/net_stubs.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2012-2018 Apple Inc. All rights reserved. + * Copyright (c) 2012-2019 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -258,6 +258,8 @@ STUB(mbuf_get_timestamp_requested); STUB(mbuf_set_timestamp_requested); STUB(mbuf_register_tx_compl_callback); STUB(mbuf_unregister_tx_compl_callback); +STUB(mbuf_get_keepalive_flag); +STUB(mbuf_set_keepalive_flag); STUB(net_init_add); STUB(proto_inject); STUB(proto_input); @@ -326,6 +328,7 @@ STUB(ifnet_link_quality); STUB(ifnet_notice_master_elected); STUB(ifnet_notice_node_absence); STUB(ifnet_notice_node_presence); +STUB(ifnet_notice_node_presence_v2); STUB(ifnet_poll_params); STUB(ifnet_purge); STUB(ifnet_report_issues); @@ -351,6 +354,8 @@ STUB(ifnet_get_unsent_bytes); STUB(ifnet_get_buffer_status); STUB(ifnet_normalise_unsent_data); STUB(ifnet_set_low_power_mode); +STUB(ifnet_notify_tcp_keepalive_offload_timeout); +STUB(ifnet_interface_advisory_report); STUB(in6_localaddr); STUB(in_localaddr); STUB(in6addr_local); @@ -388,6 +393,7 @@ STUB(net_del_domain); STUB(net_del_domain_old); STUB(net_del_proto); STUB(net_del_proto_old); +STUB(net_domain_contains_hostname); STUB(pffinddomain); STUB(pffinddomain_old); STUB(pffindproto); diff --git a/bsd/net/netsrc.c b/bsd/net/netsrc.c index 1b53940ae..4a10ea965 100644 --- a/bsd/net/netsrc.c +++ b/bsd/net/netsrc.c @@ -137,7 +137,7 @@ netsrc_common(struct rtentry *rt, struct netsrc_rep *reply) } reply->nrp_ifindex = rt->rt_ifp ? rt->rt_ifp->if_index : 0; - if (rt->rt_ifp->if_eflags & IFEF_AWDL) { + if (rt->rt_ifp != NULL && (rt->rt_ifp->if_eflags & IFEF_AWDL)) { reply->nrp_flags |= NETSRC_FLAG_AWDL; } if (rt->rt_flags & RTF_LOCAL) { diff --git a/bsd/net/network_agent.c b/bsd/net/network_agent.c index d54016311..a7d27aed8 100644 --- a/bsd/net/network_agent.c +++ b/bsd/net/network_agent.c @@ -44,6 +44,7 @@ #include #include #include +#include u_int32_t netagent_debug = LOG_NOTICE; // 0=None, 1=Basic @@ -58,14 +59,24 @@ static int netagent_active_count = 0; SYSCTL_INT(_net_netagent, OID_AUTO, active_count, CTLFLAG_RD | CTLFLAG_LOCKED, &netagent_active_count, 0, ""); -#define NETAGENTLOG(level, format, ...) do { \ - if (level <= netagent_debug) \ - log((level > LOG_NOTICE ? LOG_NOTICE : level), "%s: " format "\n", __FUNCTION__, __VA_ARGS__); \ +#define NETAGENTLOG(level, format, ...) do { \ + if (level <= netagent_debug) { \ + if (level == LOG_ERR) { \ + os_log_error(OS_LOG_DEFAULT, "%s: " format "\n", __FUNCTION__, __VA_ARGS__); \ + } else { \ + os_log(OS_LOG_DEFAULT, "%s: " format "\n", __FUNCTION__, __VA_ARGS__); \ + } \ + } \ } while (0) -#define NETAGENTLOG0(level, msg) do { \ - if (level <= netagent_debug) \ - log((level > LOG_NOTICE ? LOG_NOTICE : level), "%s: %s\n", __FUNCTION__, msg); \ +#define NETAGENTLOG0(level, msg) do { \ + if (level <= netagent_debug) { \ + if (level == LOG_ERR) { \ + os_log_error(OS_LOG_DEFAULT, "%s: %s\n", __FUNCTION__, msg); \ + } else { \ + os_log(OS_LOG_DEFAULT, "%s: %s\n", __FUNCTION__, msg); \ + } \ + } \ } while (0) struct netagent_client { @@ -285,10 +296,10 @@ netagent_ctl_disconnect(kern_ctl_ref kctlref, u_int32_t unit, void *unitinfo) // Kernel events static void -netagent_post_event(uuid_t agent_uuid, u_int32_t event_code, bool update_necp) +netagent_post_event(uuid_t agent_uuid, u_int32_t event_code, bool update_necp, bool should_update_immediately) { if (update_necp) { - necp_update_all_clients(); + necp_update_all_clients_immediately_if_needed(should_update_immediately); } struct kev_msg ev_msg; @@ -678,7 +689,7 @@ netagent_unregister_session_wrapper(struct netagent_session *session) if (unregistered) { ifnet_clear_netagent(unregistered_uuid); - netagent_post_event(unregistered_uuid, KEV_NETAGENT_UNREGISTERED, TRUE); + netagent_post_event(unregistered_uuid, KEV_NETAGENT_UNREGISTERED, TRUE, false); } } @@ -777,7 +788,7 @@ netagent_register(netagent_session_t _session, struct netagent *agent) } memset(new_wrapper, 0, sizeof(*new_wrapper) + data_size); - memcpy(&new_wrapper->netagent, agent, sizeof(struct netagent) + data_size); + __nochk_memcpy(&new_wrapper->netagent, agent, sizeof(struct netagent) + data_size); int error = netagent_handle_register_inner(session, new_wrapper); if (error != 0) { @@ -786,7 +797,7 @@ netagent_register(netagent_session_t _session, struct netagent *agent) } NETAGENTLOG0(LOG_DEBUG, "Registered new agent"); - netagent_post_event(new_wrapper->netagent.netagent_uuid, KEV_NETAGENT_REGISTERED, TRUE); + netagent_post_event(new_wrapper->netagent.netagent_uuid, KEV_NETAGENT_REGISTERED, TRUE, false); return 0; } @@ -846,7 +857,7 @@ netagent_handle_register_setopt(struct netagent_session *session, u_int8_t *payl } memset(new_wrapper, 0, sizeof(*new_wrapper) + data_size); - memcpy(&new_wrapper->netagent, register_netagent, sizeof(struct netagent) + data_size); + __nochk_memcpy(&new_wrapper->netagent, register_netagent, sizeof(struct netagent) + data_size); response_error = netagent_handle_register_inner(session, new_wrapper); if (response_error != 0) { @@ -855,7 +866,7 @@ netagent_handle_register_setopt(struct netagent_session *session, u_int8_t *payl } NETAGENTLOG0(LOG_DEBUG, "Registered new agent"); - netagent_post_event(new_wrapper->netagent.netagent_uuid, KEV_NETAGENT_REGISTERED, TRUE); + netagent_post_event(new_wrapper->netagent.netagent_uuid, KEV_NETAGENT_REGISTERED, TRUE, false); done: return response_error; @@ -921,7 +932,7 @@ netagent_handle_register_message(struct netagent_session *session, u_int32_t mes NETAGENTLOG0(LOG_DEBUG, "Registered new agent"); netagent_send_success_response(session, NETAGENT_MESSAGE_TYPE_REGISTER, message_id); - netagent_post_event(new_wrapper->netagent.netagent_uuid, KEV_NETAGENT_REGISTERED, TRUE); + netagent_post_event(new_wrapper->netagent.netagent_uuid, KEV_NETAGENT_REGISTERED, TRUE, false); return; fail: netagent_send_error_response(session, NETAGENT_MESSAGE_TYPE_REGISTER, message_id, response_error); @@ -1121,11 +1132,12 @@ netagent_update(netagent_session_t _session, struct netagent *agent) } memset(new_wrapper, 0, sizeof(*new_wrapper) + data_size); - memcpy(&new_wrapper->netagent, agent, sizeof(struct netagent) + data_size); + __nochk_memcpy(&new_wrapper->netagent, agent, sizeof(struct netagent) + data_size); int error = netagent_handle_update_inner(session, new_wrapper, data_size, &agent_changed, kNetagentErrorDomainPOSIX); if (error == 0) { - netagent_post_event(session->wrapper->netagent.netagent_uuid, KEV_NETAGENT_UPDATED, agent_changed); + bool should_update_immediately = (NETAGENT_FLAG_UPDATE_IMMEDIATELY == (session->wrapper->netagent.netagent_flags & NETAGENT_FLAG_UPDATE_IMMEDIATELY)); + netagent_post_event(session->wrapper->netagent.netagent_uuid, KEV_NETAGENT_UPDATED, agent_changed, should_update_immediately); if (agent_changed == FALSE) { // The session wrapper does not need the "new_wrapper" as nothing changed FREE(new_wrapper, M_NETAGENT); @@ -1193,11 +1205,12 @@ netagent_handle_update_setopt(struct netagent_session *session, u_int8_t *payloa } memset(new_wrapper, 0, sizeof(*new_wrapper) + data_size); - memcpy(&new_wrapper->netagent, update_netagent, sizeof(struct netagent) + data_size); + __nochk_memcpy(&new_wrapper->netagent, update_netagent, sizeof(struct netagent) + data_size); response_error = netagent_handle_update_inner(session, new_wrapper, data_size, &agent_changed, kNetagentErrorDomainPOSIX); if (response_error == 0) { - netagent_post_event(session->wrapper->netagent.netagent_uuid, KEV_NETAGENT_UPDATED, agent_changed); + bool should_update_immediately = (NETAGENT_FLAG_UPDATE_IMMEDIATELY == (session->wrapper->netagent.netagent_flags & NETAGENT_FLAG_UPDATE_IMMEDIATELY)); + netagent_post_event(session->wrapper->netagent.netagent_uuid, KEV_NETAGENT_UPDATED, agent_changed, should_update_immediately); if (agent_changed == FALSE) { // The session wrapper does not need the "new_wrapper" as nothing changed FREE(new_wrapper, M_NETAGENT); @@ -1271,7 +1284,8 @@ netagent_handle_update_message(struct netagent_session *session, u_int32_t messa } netagent_send_success_response(session, NETAGENT_MESSAGE_TYPE_UPDATE, message_id); - netagent_post_event(session->wrapper->netagent.netagent_uuid, KEV_NETAGENT_UPDATED, agent_changed); + bool should_update_immediately = (NETAGENT_FLAG_UPDATE_IMMEDIATELY == (session->wrapper->netagent.netagent_flags & NETAGENT_FLAG_UPDATE_IMMEDIATELY)); + netagent_post_event(session->wrapper->netagent.netagent_uuid, KEV_NETAGENT_UPDATED, agent_changed, should_update_immediately); if (agent_changed == FALSE) { // The session wrapper does not need the "new_wrapper" as nothing changed @@ -1624,7 +1638,7 @@ netagent_post_updated_interfaces(uuid_t uuid) lck_rw_done(&netagent_lock); if (wrapper != NULL) { - netagent_post_event(uuid, KEV_NETAGENT_UPDATED_INTERFACES, TRUE); + netagent_post_event(uuid, KEV_NETAGENT_UPDATED_INTERFACES, TRUE, false); } else { NETAGENTLOG0(LOG_DEBUG, "Interface event with no associated agent"); } @@ -1802,6 +1816,24 @@ netagent_get_flags(uuid_t uuid) return flags; } +errno_t +netagent_set_flags(uuid_t uuid, u_int32_t flags) +{ + errno_t error = 0; + lck_rw_lock_exclusive(&netagent_lock); + struct netagent_wrapper *wrapper = netagent_find_agent_with_uuid(uuid); + if (wrapper != NULL) { + wrapper->netagent.netagent_flags = flags; + } else { + NETAGENTLOG0(LOG_DEBUG, + "Attempt to set flags for invalid netagent"); + error = ENOENT; + } + lck_rw_done(&netagent_lock); + + return error; +} + u_int32_t netagent_get_generation(uuid_t uuid) { @@ -1881,7 +1913,7 @@ netagent_client_message_with_params(uuid_t agent_uuid, pid_t pid, void *handle, u_int8_t message_type, - struct necp_client_nexus_parameters *parameters, + struct necp_client_agent_parameters *parameters, void **assigned_results, size_t *assigned_results_length) { @@ -1916,8 +1948,8 @@ netagent_client_message_with_params(uuid_t agent_uuid, pid_t report_pid = 0; uuid_t report_proc_uuid = {}; if (parameters != NULL) { - report_pid = parameters->epid; - uuid_copy(report_proc_uuid, parameters->euuid); + report_pid = parameters->u.nexus_request.epid; + uuid_copy(report_proc_uuid, parameters->u.nexus_request.euuid); } else { struct proc *p = current_proc(); if (p != NULL) { @@ -1931,7 +1963,13 @@ netagent_client_message_with_params(uuid_t agent_uuid, } else if (message_type == NETAGENT_MESSAGE_TYPE_REQUEST_NEXUS || message_type == NETAGENT_MESSAGE_TYPE_CLOSE_NEXUS || message_type == NETAGENT_MESSAGE_TYPE_ABORT_NEXUS) { - if ((wrapper->netagent.netagent_flags & NETAGENT_FLAG_NEXUS_PROVIDER) == 0) { + bool is_nexus_agent = ((wrapper->netagent.netagent_flags & + (NETAGENT_FLAG_NEXUS_PROVIDER | + NETAGENT_FLAG_NEXUS_LISTENER | + NETAGENT_FLAG_CUSTOM_IP_NEXUS | + NETAGENT_FLAG_CUSTOM_ETHER_NEXUS | + NETAGENT_FLAG_INTERPOSE_NEXUS)) != 0); + if (!is_nexus_agent) { NETAGENTLOG0(LOG_ERR, "Requested netagent for nexus instance is not a nexus provider"); // Agent is not a nexus provider error = EINVAL; @@ -1979,8 +2017,8 @@ netagent_client_message_with_params(uuid_t agent_uuid, } else { uuid_copy(new_pending_client->client_id, necp_client_uuid); if (parameters != NULL) { - new_pending_client->client_pid = parameters->epid; - uuid_copy(new_pending_client->client_proc_uuid, parameters->euuid); + new_pending_client->client_pid = parameters->u.nexus_request.epid; + uuid_copy(new_pending_client->client_proc_uuid, parameters->u.nexus_request.euuid); } else { struct proc *p = current_proc(); if (p != NULL) { @@ -2082,8 +2120,8 @@ netagent_trigger(struct proc *p, struct netagent_trigger_args *uap, int32_t *ret if (uap->agent_uuid) { if (uap->agent_uuidlen != sizeof(uuid_t)) { - NETAGENTLOG(LOG_ERR, "Incorrect length (got %llu, expected %lu)", - uap->agent_uuidlen, sizeof(uuid_t)); + NETAGENTLOG(LOG_ERR, "Incorrect length (got %zu, expected %lu)", + (size_t)uap->agent_uuidlen, sizeof(uuid_t)); return ERANGE; } diff --git a/bsd/net/network_agent.h b/bsd/net/network_agent.h index d51352628..3afa6624a 100644 --- a/bsd/net/network_agent.h +++ b/bsd/net/network_agent.h @@ -108,15 +108,22 @@ struct netagent_assign_nexus_message { #define NETAGENT_MAX_DATA_SIZE 4096 -#define NETAGENT_FLAG_REGISTERED 0x0001 // Agent is registered -#define NETAGENT_FLAG_ACTIVE 0x0002 // Agent is active -#define NETAGENT_FLAG_KERNEL_ACTIVATED 0x0004 // Agent can be activated by kernel activity -#define NETAGENT_FLAG_USER_ACTIVATED 0x0008 // Agent can be activated by system call (netagent_trigger) -#define NETAGENT_FLAG_VOLUNTARY 0x0010 // Use of agent is optional -#define NETAGENT_FLAG_SPECIFIC_USE_ONLY 0x0020 // Agent should only be used and activated when specifically required +#define NETAGENT_FLAG_REGISTERED 0x0001 // Agent is registered +#define NETAGENT_FLAG_ACTIVE 0x0002 // Agent is active +#define NETAGENT_FLAG_KERNEL_ACTIVATED 0x0004 // Agent can be activated by kernel activity +#define NETAGENT_FLAG_USER_ACTIVATED 0x0008 // Agent can be activated by system call (netagent_trigger) +#define NETAGENT_FLAG_VOLUNTARY 0x0010 // Use of agent is optional +#define NETAGENT_FLAG_SPECIFIC_USE_ONLY 0x0020 // Agent should only be used and activated when specifically required #define NETAGENT_FLAG_NETWORK_PROVIDER 0x0040 // Agent provides network access #define NETAGENT_FLAG_NEXUS_PROVIDER 0x0080 // Agent provides a skywalk nexus #define NETAGENT_FLAG_SUPPORTS_BROWSE 0x0100 // Assertions will cause agent to fill in browse endpoints +#define NETAGENT_FLAG_REQUIRES_ASSERT 0x0200 // Assertions are expected to be taken against this agent +#define NETAGENT_FLAG_NEXUS_LISTENER 0x0400 // Nexus supports listeners +#define NETAGENT_FLAG_UPDATE_IMMEDIATELY 0x0800 // Updates the clients without waiting for a leeway +#define NETAGENT_FLAG_CUSTOM_ETHER_NEXUS 0x2000 // Agent provides a custom ethertype nexus +#define NETAGENT_FLAG_CUSTOM_IP_NEXUS 0x4000 // Agent provides a custom IP nexus +#define NETAGENT_FLAG_INTERPOSE_NEXUS 0x8000 // Agent provides an interpose nexus +#define NETAGENT_FLAG_SUPPORTS_RESOLVE 0x10000 // Assertions will cause agent to fill in resolved endpoints #define NETAGENT_NEXUS_MAX_REQUEST_TYPES 16 #define NETAGENT_NEXUS_MAX_RESOLUTION_TYPE_PAIRS 16 @@ -130,9 +137,11 @@ struct netagent_assign_nexus_message { #define NETAGENT_NEXUS_ENDPOINT_TYPE_ADDRESS 1 #define NETAGENT_NEXUS_ENDPOINT_TYPE_HOST 2 #define NETAGENT_NEXUS_ENDPOINT_TYPE_BONJOUR 3 +#define NETAGENT_NEXUS_ENDPOINT_TYPE_SRV 5 #define NETAGENT_NEXUS_FLAG_SUPPORTS_USER_PACKET_POOL 0x1 #define NETAGENT_NEXUS_FLAG_ASSERT_UNSUPPORTED 0x2 // No calls to assert the agent are required +#define NETAGENT_NEXUS_FLAG_SHOULD_USE_EVENT_RING 0x4 // indicates that nexus agent should use event rings struct netagent_nexus { u_int32_t frame_type; @@ -206,13 +215,15 @@ struct netagentlist_req64 { user64_addr_t data __attribute__((aligned(8))); }; -struct necp_client_nexus_parameters; +struct necp_client_agent_parameters; // Kernel accessors extern void netagent_post_updated_interfaces(uuid_t uuid); // To be called from interface ioctls extern u_int32_t netagent_get_flags(uuid_t uuid); +extern errno_t netagent_set_flags(uuid_t uuid, u_int32_t flags); + extern u_int32_t netagent_get_generation(uuid_t uuid); extern bool netagent_get_agent_domain_and_type(uuid_t uuid, char *domain, char *type); @@ -226,7 +237,7 @@ extern int netagent_client_message_with_params(uuid_t agent_uuid, pid_t pid, void *handle, u_int8_t message_type, - struct necp_client_nexus_parameters *parameters, + struct necp_client_agent_parameters *parameters, void **assigned_results, size_t *assigned_results_length); @@ -249,7 +260,7 @@ struct netagent_nexus_agent { #define NETAGENT_EVENT_NEXUS_FLOW_REMOVE NETAGENT_MESSAGE_TYPE_CLOSE_NEXUS #define NETAGENT_EVENT_NEXUS_FLOW_ABORT NETAGENT_MESSAGE_TYPE_ABORT_NEXUS -typedef errno_t (*netagent_event_f)(u_int8_t event, uuid_t necp_client_uuid, pid_t pid, void *necp_handle, void *context, struct necp_client_nexus_parameters *parameters, void **assigned_results, size_t *assigned_results_length); +typedef errno_t (*netagent_event_f)(u_int8_t event, uuid_t necp_client_uuid, pid_t pid, void *necp_handle, void *context, struct necp_client_agent_parameters *parameters, void **assigned_results, size_t *assigned_results_length); extern netagent_session_t netagent_create(netagent_event_f event_handler, void *handle); diff --git a/bsd/net/ntstat.c b/bsd/net/ntstat.c index 7a33d6832..cd6c3dacb 100644 --- a/bsd/net/ntstat.c +++ b/bsd/net/ntstat.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2010-2017 Apple Inc. All rights reserved. + * Copyright (c) 2010-2019 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -139,9 +139,9 @@ SYSCTL_UINT(_net_stats, OID_AUTO, api_report_interval, #endif /* DEBUG || DEVELOPMENT */ enum{ - NSTAT_FLAG_CLEANUP = (1 << 0), - NSTAT_FLAG_REQCOUNTS = (1 << 1), - NSTAT_FLAG_SUPPORTS_UPDATES = (1 << 2), + NSTAT_FLAG_CLEANUP = (1 << 0), + NSTAT_FLAG_REQCOUNTS = (1 << 1), + NSTAT_FLAG_SUPPORTS_UPDATES = (1 << 2), NSTAT_FLAG_SYSINFO_SUBSCRIBED = (1 << 3), }; @@ -151,8 +151,8 @@ enum{ #define QUERY_CONTINUATION_SRC_COUNT 100 #endif -typedef TAILQ_HEAD(, nstat_src) tailq_head_nstat_src; -typedef TAILQ_ENTRY(nstat_src) tailq_entry_nstat_src; +typedef TAILQ_HEAD(, nstat_src) tailq_head_nstat_src; +typedef TAILQ_ENTRY(nstat_src) tailq_entry_nstat_src; typedef struct nstat_provider_filter { u_int64_t npf_flags; @@ -164,36 +164,36 @@ typedef struct nstat_provider_filter { typedef struct nstat_control_state { struct nstat_control_state *ncs_next; - u_int32_t ncs_watching; + u_int32_t ncs_watching; decl_lck_mtx_data(, ncs_mtx); - kern_ctl_ref ncs_kctl; - u_int32_t ncs_unit; - nstat_src_ref_t ncs_next_srcref; + kern_ctl_ref ncs_kctl; + u_int32_t ncs_unit; + nstat_src_ref_t ncs_next_srcref; tailq_head_nstat_src ncs_src_queue; - mbuf_t ncs_accumulated; - u_int32_t ncs_flags; + mbuf_t ncs_accumulated; + u_int32_t ncs_flags; nstat_provider_filter ncs_provider_filters[NSTAT_PROVIDER_COUNT]; /* state maintained for partial query requests */ - u_int64_t ncs_context; - u_int64_t ncs_seq; + u_int64_t ncs_context; + u_int64_t ncs_seq; } nstat_control_state; typedef struct nstat_provider { struct nstat_provider *next; - nstat_provider_id_t nstat_provider_id; - size_t nstat_descriptor_length; - errno_t (*nstat_lookup)(const void *data, u_int32_t length, nstat_provider_cookie_t *out_cookie); - int (*nstat_gone)(nstat_provider_cookie_t cookie); - errno_t (*nstat_counts)(nstat_provider_cookie_t cookie, struct nstat_counts *out_counts, int *out_gone); - errno_t (*nstat_watcher_add)(nstat_control_state *state, nstat_msg_add_all_srcs *req); - void (*nstat_watcher_remove)(nstat_control_state *state); - errno_t (*nstat_copy_descriptor)(nstat_provider_cookie_t cookie, void *data, u_int32_t len); - void (*nstat_release)(nstat_provider_cookie_t cookie, boolean_t locked); - bool (*nstat_reporting_allowed)(nstat_provider_cookie_t cookie, nstat_provider_filter *filter); + nstat_provider_id_t nstat_provider_id; + size_t nstat_descriptor_length; + errno_t (*nstat_lookup)(const void *data, u_int32_t length, nstat_provider_cookie_t *out_cookie); + int (*nstat_gone)(nstat_provider_cookie_t cookie); + errno_t (*nstat_counts)(nstat_provider_cookie_t cookie, struct nstat_counts *out_counts, int *out_gone); + errno_t (*nstat_watcher_add)(nstat_control_state *state, nstat_msg_add_all_srcs *req); + void (*nstat_watcher_remove)(nstat_control_state *state); + errno_t (*nstat_copy_descriptor)(nstat_provider_cookie_t cookie, void *data, u_int32_t len); + void (*nstat_release)(nstat_provider_cookie_t cookie, boolean_t locked); + bool (*nstat_reporting_allowed)(nstat_provider_cookie_t cookie, nstat_provider_filter *filter); } nstat_provider; -typedef STAILQ_HEAD(, nstat_src) stailq_head_nstat_src; -typedef STAILQ_ENTRY(nstat_src) stailq_entry_nstat_src; +typedef STAILQ_HEAD(, nstat_src) stailq_head_nstat_src; +typedef STAILQ_ENTRY(nstat_src) stailq_entry_nstat_src; typedef TAILQ_HEAD(, nstat_tu_shadow) tailq_head_tu_shadow; typedef TAILQ_ENTRY(nstat_tu_shadow) tailq_entry_tu_shadow; @@ -203,31 +203,31 @@ typedef TAILQ_ENTRY(nstat_procdetails) tailq_entry_procdetails; typedef struct nstat_src { tailq_entry_nstat_src ns_control_link; // All sources for the nstat_control_state, for iterating over. - nstat_control_state *ns_control; // The nstat_control_state that this is a source for - nstat_src_ref_t srcref; - nstat_provider *provider; - nstat_provider_cookie_t cookie; - uint32_t filter; - uint64_t seq; + nstat_control_state *ns_control; // The nstat_control_state that this is a source for + nstat_src_ref_t srcref; + nstat_provider *provider; + nstat_provider_cookie_t cookie; + uint32_t filter; + uint64_t seq; } nstat_src; -static errno_t nstat_control_send_counts(nstat_control_state *, - nstat_src *, unsigned long long, u_int16_t, int *); -static int nstat_control_send_description(nstat_control_state *state, nstat_src *src, u_int64_t context, u_int16_t hdr_flags); -static int nstat_control_send_update(nstat_control_state *state, nstat_src *src, u_int64_t context, u_int16_t hdr_flags, int *gone); -static errno_t nstat_control_send_removed(nstat_control_state *, nstat_src *); -static errno_t nstat_control_send_goodbye(nstat_control_state *state, nstat_src *src); -static void nstat_control_cleanup_source(nstat_control_state *state, nstat_src *src, boolean_t); -static bool nstat_control_reporting_allowed(nstat_control_state *state, nstat_src *src); -static boolean_t nstat_control_begin_query(nstat_control_state *state, const nstat_msg_hdr *hdrp); -static u_int16_t nstat_control_end_query(nstat_control_state *state, nstat_src *last_src, boolean_t partial); -static void nstat_ifnet_report_ecn_stats(void); -static void nstat_ifnet_report_lim_stats(void); -static void nstat_net_api_report_stats(void); -static errno_t nstat_set_provider_filter( nstat_control_state *state, nstat_msg_add_all_srcs *req); - -static u_int32_t nstat_udp_watchers = 0; -static u_int32_t nstat_tcp_watchers = 0; +static errno_t nstat_control_send_counts(nstat_control_state *, nstat_src *, unsigned long long, u_int16_t, int *); +static int nstat_control_send_description(nstat_control_state *state, nstat_src *src, u_int64_t context, u_int16_t hdr_flags); +static int nstat_control_send_update(nstat_control_state *state, nstat_src *src, u_int64_t context, u_int64_t event, u_int16_t hdr_flags, int *gone); +static errno_t nstat_control_send_removed(nstat_control_state *, nstat_src *); +static errno_t nstat_control_send_goodbye(nstat_control_state *state, nstat_src *src); +static void nstat_control_cleanup_source(nstat_control_state *state, nstat_src *src, boolean_t); +static bool nstat_control_reporting_allowed(nstat_control_state *state, nstat_src *src); +static boolean_t nstat_control_begin_query(nstat_control_state *state, const nstat_msg_hdr *hdrp); +static u_int16_t nstat_control_end_query(nstat_control_state *state, nstat_src *last_src, boolean_t partial); +static void nstat_ifnet_report_ecn_stats(void); +static void nstat_ifnet_report_lim_stats(void); +static void nstat_net_api_report_stats(void); +static errno_t nstat_set_provider_filter( nstat_control_state *state, nstat_msg_add_all_srcs *req); +static errno_t nstat_control_send_event(nstat_control_state *state, nstat_src *src, u_int64_t event); + +static u_int32_t nstat_udp_watchers = 0; +static u_int32_t nstat_tcp_watchers = 0; static void nstat_control_register(void); @@ -273,9 +273,9 @@ nstat_copy_sa_out( static void nstat_ip_to_sockaddr( const struct in_addr *ip, - u_int16_t port, - struct sockaddr_in *sin, - u_int32_t maxlen) + u_int16_t port, + struct sockaddr_in *sin, + u_int32_t maxlen) { if (maxlen < sizeof(struct sockaddr_in)) { return; @@ -318,11 +318,17 @@ nstat_ifnet_to_flags( case IFRTYPE_FUNCTIONAL_CELLULAR: flags |= NSTAT_IFNET_IS_CELLULAR; break; + case IFRTYPE_FUNCTIONAL_COMPANIONLINK: + flags |= NSTAT_IFNET_IS_COMPANIONLINK; + break; } if (IFNET_IS_EXPENSIVE(ifp)) { flags |= NSTAT_IFNET_IS_EXPENSIVE; } + if (IFNET_IS_CONSTRAINED(ifp)) { + flags |= NSTAT_IFNET_IS_CONSTRAINED; + } return flags; } @@ -333,20 +339,27 @@ nstat_inpcb_to_flags( { u_int16_t flags = 0; - if ((inp != NULL) && (inp->inp_last_outifp != NULL)) { - struct ifnet *ifp = inp->inp_last_outifp; - flags = nstat_ifnet_to_flags(ifp); + if (inp != NULL) { + if (inp->inp_last_outifp != NULL) { + struct ifnet *ifp = inp->inp_last_outifp; + flags = nstat_ifnet_to_flags(ifp); - if (flags & NSTAT_IFNET_IS_CELLULAR) { - if (inp->inp_socket != NULL && - (inp->inp_socket->so_flags1 & SOF1_CELLFALLBACK)) { - flags |= NSTAT_IFNET_VIA_CELLFALLBACK; + struct tcpcb *tp = intotcpcb(inp); + if (tp) { + if (tp->t_flags & TF_LOCAL) { + flags |= NSTAT_IFNET_IS_LOCAL; + } else { + flags |= NSTAT_IFNET_IS_NON_LOCAL; + } } + } else { + flags = NSTAT_IFNET_IS_UNKNOWN_TYPE; + } + if (inp->inp_socket != NULL && + (inp->inp_socket->so_flags1 & SOF1_CELLFALLBACK)) { + flags |= NSTAT_IFNET_VIA_CELLFALLBACK; } - } else { - flags = NSTAT_IFNET_IS_UNKNOWN_TYPE; } - return flags; } @@ -372,10 +385,10 @@ nstat_find_provider_by_id( static errno_t nstat_lookup_entry( - nstat_provider_id_t id, - const void *data, - u_int32_t length, - nstat_provider **out_provider, + nstat_provider_id_t id, + const void *data, + u_int32_t length, + nstat_provider **out_provider, nstat_provider_cookie_t *out_cookie) { *out_provider = nstat_find_provider_by_id(id); @@ -426,14 +439,14 @@ nstat_malloc_aligned( OSMallocTag tag) { struct align_header *hdr = NULL; - u_int32_t size = length + sizeof(*hdr) + alignment - 1; + u_int32_t size = length + sizeof(*hdr) + alignment - 1; - u_int8_t *buffer = OSMalloc(size, tag); + u_int8_t *buffer = OSMalloc(size, tag); if (buffer == NULL) { return NULL; } - u_int8_t *aligned = buffer + sizeof(*hdr); + u_int8_t *aligned = buffer + sizeof(*hdr); aligned = (u_int8_t*)P2ROUNDUP(aligned, alignment); hdr = (struct align_header*)(void *)(aligned - sizeof(*hdr)); @@ -458,8 +471,8 @@ static nstat_provider nstat_route_provider; static errno_t nstat_route_lookup( - const void *data, - u_int32_t length, + const void *data, + u_int32_t length, nstat_provider_cookie_t *out_cookie) { // rt_lookup doesn't take const params but it doesn't modify the parameters for @@ -523,8 +536,8 @@ nstat_route_gone( static errno_t nstat_route_counts( nstat_provider_cookie_t cookie, - struct nstat_counts *out_counts, - int *out_gone) + struct nstat_counts *out_counts, + int *out_gone) { struct rtentry *rt = (struct rtentry*)cookie; struct nstat_counts *rt_stats = rt->rt_stats; @@ -566,7 +579,7 @@ nstat_route_release( rtfree((struct rtentry*)cookie); } -static u_int32_t nstat_route_watchers = 0; +static u_int32_t nstat_route_watchers = 0; static int nstat_route_walktree_add( @@ -607,7 +620,7 @@ nstat_route_walktree_add( static errno_t nstat_route_add_watcher( - nstat_control_state *state, + nstat_control_state *state, nstat_msg_add_all_srcs *req) { int i; @@ -678,8 +691,8 @@ nstat_route_remove_watcher( static errno_t nstat_route_copy_descriptor( nstat_provider_cookie_t cookie, - void *data, - u_int32_t len) + void *data, + u_int32_t len) { nstat_route_descriptor *desc = (nstat_route_descriptor*)data; if (len < sizeof(*desc)) { @@ -828,9 +841,9 @@ nstat_route_connect_success( __private_extern__ void nstat_route_tx( struct rtentry *rte, - u_int32_t packets, - u_int32_t bytes, - u_int32_t flags) + u_int32_t packets, + u_int32_t bytes, + u_int32_t flags) { while (rte) { struct nstat_counts* stats = nstat_route_attach(rte); @@ -850,9 +863,9 @@ nstat_route_tx( __private_extern__ void nstat_route_rx( struct rtentry *rte, - u_int32_t packets, - u_int32_t bytes, - u_int32_t flags) + u_int32_t packets, + u_int32_t bytes, + u_int32_t flags) { while (rte) { struct nstat_counts* stats = nstat_route_attach(rte); @@ -1154,8 +1167,8 @@ nstat_tcpudp_lookup( static errno_t nstat_tcp_lookup( - const void *data, - u_int32_t length, + const void *data, + u_int32_t length, nstat_provider_cookie_t *out_cookie) { return nstat_tcpudp_lookup(&tcbinfo, data, length, out_cookie); @@ -1178,8 +1191,8 @@ nstat_tcp_gone( static errno_t nstat_tcp_counts( nstat_provider_cookie_t cookie, - struct nstat_counts *out_counts, - int *out_gone) + struct nstat_counts *out_counts, + int *out_gone) { struct nstat_tucookie *tucookie = (struct nstat_tucookie *)cookie; @@ -1242,7 +1255,7 @@ nstat_tcp_release( static errno_t nstat_tcp_add_watcher( nstat_control_state *state, - nstat_msg_add_all_srcs *req) + nstat_msg_add_all_srcs *req) { // There is a tricky issue around getting all TCP sockets added once // and only once. nstat_tcp_new_pcb() is called prior to the new item @@ -1369,6 +1382,46 @@ nstat_pcb_detach(struct inpcb *inp) } } +__private_extern__ void +nstat_pcb_event(struct inpcb *inp, u_int64_t event) +{ + nstat_control_state *state; + nstat_src *src; + struct nstat_tucookie *tucookie; + errno_t result; + nstat_provider_id_t provider_id; + + if (inp == NULL || (nstat_tcp_watchers == 0 && nstat_udp_watchers == 0)) { + return; + } + + lck_mtx_lock(&nstat_mtx); + for (state = nstat_controls; state; state = state->ncs_next) { + if (((state->ncs_provider_filters[NSTAT_PROVIDER_TCP_KERNEL].npf_events & event) == 0) && + ((state->ncs_provider_filters[NSTAT_PROVIDER_UDP_KERNEL].npf_events & event) == 0)) { + continue; + } + lck_mtx_lock(&state->ncs_mtx); + TAILQ_FOREACH(src, &state->ncs_src_queue, ns_control_link) + { + provider_id = src->provider->nstat_provider_id; + if (provider_id == NSTAT_PROVIDER_TCP_KERNEL || provider_id == NSTAT_PROVIDER_UDP_KERNEL) { + tucookie = (struct nstat_tucookie *)src->cookie; + if (tucookie->inp == inp) { + break; + } + } + } + + if (src && ((state->ncs_provider_filters[provider_id].npf_events & event) != 0)) { + result = nstat_control_send_event(state, src, event); + } + lck_mtx_unlock(&state->ncs_mtx); + } + lck_mtx_unlock(&nstat_mtx); +} + + __private_extern__ void nstat_pcb_cache(struct inpcb *inp) { @@ -1639,8 +1692,8 @@ static nstat_provider nstat_udp_provider; static errno_t nstat_udp_lookup( - const void *data, - u_int32_t length, + const void *data, + u_int32_t length, nstat_provider_cookie_t *out_cookie) { return nstat_tcpudp_lookup(&udbinfo, data, length, out_cookie); @@ -1710,7 +1763,7 @@ nstat_udp_release( static errno_t nstat_udp_add_watcher( nstat_control_state *state, - nstat_msg_add_all_srcs *req) + nstat_msg_add_all_srcs *req) { // There is a tricky issue around getting all UDP sockets added once // and only once. nstat_udp_new_pcb() is called prior to the new item @@ -1798,8 +1851,8 @@ nstat_udp_new_pcb( static errno_t nstat_udp_copy_descriptor( nstat_provider_cookie_t cookie, - void *data, - u_int32_t len) + void *data, + u_int32_t len) { if (len < sizeof(nstat_udp_descriptor)) { return EINVAL; @@ -1811,8 +1864,8 @@ nstat_udp_copy_descriptor( struct nstat_tucookie *tucookie = (struct nstat_tucookie *)cookie; - nstat_udp_descriptor *desc = (nstat_udp_descriptor*)data; - struct inpcb *inp = tucookie->inp; + nstat_udp_descriptor *desc = (nstat_udp_descriptor*)data; + struct inpcb *inp = tucookie->inp; bzero(desc, sizeof(*desc)); @@ -2208,7 +2261,7 @@ nstat_ifnet_copy_link_status( cell_status->valid_bitmask |= NSTAT_IFNET_DESC_CELL_MSS_RECOMMENDED_VALID; cell_status->mss_recommended = if_cell_sr->mss_recommended; } - } else if (ifp->if_subfamily == IFNET_SUBFAMILY_WIFI) { + } else if (IFNET_IS_WIFI(ifp)) { nstat_ifnet_desc_wifi_status *wifi_status = &link_status->u.wifi; struct if_wifi_status_v1 *if_wifi_sr = &ifsr->ifsr_u.ifsr_wifi.if_wifi_u.if_status_v1; @@ -3550,6 +3603,33 @@ nstat_enqueue_success( return result; } +static errno_t +nstat_control_send_event( + nstat_control_state *state, + nstat_src *src, + u_int64_t event) +{ + errno_t result = 0; + int failed = 0; + + if (nstat_control_reporting_allowed(state, src)) { + if ((state->ncs_flags & NSTAT_FLAG_SUPPORTS_UPDATES) != 0) { + result = nstat_control_send_update(state, src, 0, event, 0, NULL); + if (result != 0) { + failed = 1; + if (nstat_debug != 0) { + printf("%s - nstat_control_send_event() %d\n", __func__, result); + } + } + } else { + if (nstat_debug != 0) { + printf("%s - nstat_control_send_event() used when updates not supported\n", __func__); + } + } + } + return result; +} + static errno_t nstat_control_send_goodbye( nstat_control_state *state, @@ -3560,7 +3640,7 @@ nstat_control_send_goodbye( if (nstat_control_reporting_allowed(state, src)) { if ((state->ncs_flags & NSTAT_FLAG_SUPPORTS_UPDATES) != 0) { - result = nstat_control_send_update(state, src, 0, NSTAT_MSG_HDR_FLAG_CLOSING, NULL); + result = nstat_control_send_update(state, src, 0, 0, NSTAT_MSG_HDR_FLAG_CLOSING, NULL); if (result != 0) { failed = 1; if (nstat_debug != 0) { @@ -3677,7 +3757,7 @@ nstat_idle_check( __unused thread_call_param_t p1) { nstat_control_state *control; - nstat_src *src, *tmpsrc; + nstat_src *src, *tmpsrc; tailq_head_nstat_src dead_list; TAILQ_INIT(&dead_list); @@ -3785,20 +3865,18 @@ nstat_control_reporting_allowed( return TRUE; } - return - src->provider->nstat_reporting_allowed(src->cookie, - &state->ncs_provider_filters[src->provider->nstat_provider_id]) - ; + return src->provider->nstat_reporting_allowed(src->cookie, + &state->ncs_provider_filters[src->provider->nstat_provider_id]); } static errno_t nstat_control_connect( - kern_ctl_ref kctl, - struct sockaddr_ctl *sac, - void **uinfo) + kern_ctl_ref kctl, + struct sockaddr_ctl *sac, + void **uinfo) { - nstat_control_state *state = OSMalloc(sizeof(*state), nstat_malloc_tag); + nstat_control_state *state = OSMalloc(sizeof(*state), nstat_malloc_tag); if (state == NULL) { return ENOMEM; } @@ -3827,11 +3905,11 @@ nstat_control_connect( static errno_t nstat_control_disconnect( __unused kern_ctl_ref kctl, - __unused u_int32_t unit, - void *uinfo) + __unused u_int32_t unit, + void *uinfo) { - u_int32_t watching; - nstat_control_state *state = (nstat_control_state*)uinfo; + u_int32_t watching; + nstat_control_state *state = (nstat_control_state*)uinfo; tailq_head_nstat_src cleanup_list; nstat_src *src; @@ -3892,11 +3970,11 @@ nstat_control_next_src_ref( static errno_t nstat_control_send_counts( - nstat_control_state *state, - nstat_src *src, - unsigned long long context, - u_int16_t hdr_flags, - int *gone) + nstat_control_state *state, + nstat_src *src, + unsigned long long context, + u_int16_t hdr_flags, + int *gone) { nstat_msg_src_counts counts; errno_t result = 0; @@ -3933,9 +4011,9 @@ nstat_control_send_counts( static errno_t nstat_control_append_counts( - nstat_control_state *state, - nstat_src *src, - int *gone) + nstat_control_state *state, + nstat_src *src, + int *gone) { /* Some providers may not have any counts to send */ if (!src->provider->nstat_counts) { @@ -3965,10 +4043,10 @@ nstat_control_append_counts( static int nstat_control_send_description( - nstat_control_state *state, - nstat_src *src, - u_int64_t context, - u_int16_t hdr_flags) + nstat_control_state *state, + nstat_src *src, + u_int64_t context, + u_int16_t hdr_flags) { // Provider doesn't support getting the descriptor? Done. if (src->provider->nstat_descriptor_length == 0 || @@ -3977,14 +4055,14 @@ nstat_control_send_description( } // Allocate storage for the descriptor message - mbuf_t msg; + mbuf_t msg; unsigned int one = 1; - u_int32_t size = offsetof(nstat_msg_src_description, data) + src->provider->nstat_descriptor_length; + u_int32_t size = offsetof(nstat_msg_src_description, data) + src->provider->nstat_descriptor_length; if (mbuf_allocpacket(MBUF_DONTWAIT, size, &one, &msg) != 0) { return ENOMEM; } - nstat_msg_src_description *desc = (nstat_msg_src_description*)mbuf_data(msg); + nstat_msg_src_description *desc = (nstat_msg_src_description*)mbuf_data(msg); bzero(desc, size); mbuf_setlen(msg, size); mbuf_pkthdr_setlen(msg, mbuf_len(msg)); @@ -4016,8 +4094,8 @@ nstat_control_send_description( static errno_t nstat_control_append_description( - nstat_control_state *state, - nstat_src *src) + nstat_control_state *state, + nstat_src *src) { size_t size = offsetof(nstat_msg_src_description, data) + src->provider->nstat_descriptor_length; if (size > 512 || src->provider->nstat_descriptor_length == 0 || @@ -4029,7 +4107,7 @@ nstat_control_append_description( u_int64_t buffer[size / sizeof(u_int64_t) + 1]; // u_int64_t to ensure alignment bzero(buffer, size); - nstat_msg_src_description *desc = (nstat_msg_src_description*)buffer; + nstat_msg_src_description *desc = (nstat_msg_src_description*)buffer; desc->hdr.type = NSTAT_MSG_TYPE_SRC_DESC; desc->hdr.length = size; desc->srcref = src->srcref; @@ -4050,11 +4128,12 @@ nstat_control_append_description( static int nstat_control_send_update( - nstat_control_state *state, - nstat_src *src, - u_int64_t context, - u_int16_t hdr_flags, - int *gone) + nstat_control_state *state, + nstat_src *src, + u_int64_t context, + u_int64_t event, + u_int16_t hdr_flags, + int *gone) { // Provider doesn't support getting the descriptor or counts? Done. if ((src->provider->nstat_descriptor_length == 0 || @@ -4064,22 +4143,22 @@ nstat_control_send_update( } // Allocate storage for the descriptor message - mbuf_t msg; + mbuf_t msg; unsigned int one = 1; - u_int32_t size = offsetof(nstat_msg_src_update, data) + + u_int32_t size = offsetof(nstat_msg_src_update, data) + src->provider->nstat_descriptor_length; if (mbuf_allocpacket(MBUF_DONTWAIT, size, &one, &msg) != 0) { return ENOMEM; } - nstat_msg_src_update *desc = (nstat_msg_src_update*)mbuf_data(msg); + nstat_msg_src_update *desc = (nstat_msg_src_update*)mbuf_data(msg); bzero(desc, size); desc->hdr.context = context; desc->hdr.type = NSTAT_MSG_TYPE_SRC_UPDATE; desc->hdr.length = size; desc->hdr.flags = hdr_flags; desc->srcref = src->srcref; - desc->event_flags = 0; + desc->event_flags = event; desc->provider = src->provider->nstat_provider_id; mbuf_setlen(msg, size); @@ -4118,9 +4197,9 @@ nstat_control_send_update( static errno_t nstat_control_append_update( - nstat_control_state *state, - nstat_src *src, - int *gone) + nstat_control_state *state, + nstat_src *src, + int *gone) { size_t size = offsetof(nstat_msg_src_update, data) + src->provider->nstat_descriptor_length; if (size > 512 || ((src->provider->nstat_descriptor_length == 0 || @@ -4176,8 +4255,8 @@ nstat_control_append_update( static errno_t nstat_control_send_removed( - nstat_control_state *state, - nstat_src *src) + nstat_control_state *state, + nstat_src *src) { nstat_msg_src_removed removed; errno_t result; @@ -4198,8 +4277,8 @@ nstat_control_send_removed( static errno_t nstat_control_handle_add_request( - nstat_control_state *state, - mbuf_t m) + nstat_control_state *state, + mbuf_t m) { errno_t result; @@ -4214,7 +4293,7 @@ nstat_control_handle_add_request( return EINVAL; } - nstat_provider *provider = NULL; + nstat_provider *provider = NULL; nstat_provider_cookie_t cookie = NULL; nstat_msg_add_src_req *req = mbuf_data(m); if (mbuf_pkthdr_len(m) > mbuf_len(m)) { @@ -4248,7 +4327,7 @@ nstat_control_handle_add_request( static errno_t nstat_set_provider_filter( nstat_control_state *state, - nstat_msg_add_all_srcs *req) + nstat_msg_add_all_srcs *req) { nstat_provider_id_t provider_id = req->provider; @@ -4269,7 +4348,7 @@ nstat_set_provider_filter( static errno_t nstat_control_handle_add_all( nstat_control_state *state, - mbuf_t m) + mbuf_t m) { errno_t result = 0; @@ -4283,7 +4362,7 @@ nstat_control_handle_add_all( return ENOENT; } - nstat_provider *provider = nstat_find_provider_by_id(req->provider); + nstat_provider *provider = nstat_find_provider_by_id(req->provider); if (!provider) { return ENOENT; @@ -4323,10 +4402,10 @@ nstat_control_handle_add_all( static errno_t nstat_control_source_add( - u_int64_t context, - nstat_control_state *state, - nstat_provider *provider, - nstat_provider_cookie_t cookie) + u_int64_t context, + nstat_control_state *state, + nstat_provider *provider, + nstat_provider_cookie_t cookie) { // Fill out source added message if appropriate mbuf_t msg = NULL; @@ -4416,8 +4495,8 @@ nstat_control_source_add( static errno_t nstat_control_handle_remove_request( - nstat_control_state *state, - mbuf_t m) + nstat_control_state *state, + mbuf_t m) { nstat_src_ref_t srcref = NSTAT_SRC_REF_INVALID; nstat_src *src; @@ -4450,8 +4529,8 @@ nstat_control_handle_remove_request( static errno_t nstat_control_handle_query_request( - nstat_control_state *state, - mbuf_t m) + nstat_control_state *state, + mbuf_t m) { // TBD: handle this from another thread so we can enqueue a lot of data // As written, if a client requests query all, this function will be @@ -4583,8 +4662,8 @@ nstat_control_handle_query_request( static errno_t nstat_control_handle_get_src_description( - nstat_control_state *state, - mbuf_t m) + nstat_control_state *state, + mbuf_t m) { nstat_msg_get_src_description req; errno_t result = ENOENT; @@ -4664,8 +4743,8 @@ nstat_control_handle_get_src_description( static errno_t nstat_control_handle_set_filter( - nstat_control_state *state, - mbuf_t m) + nstat_control_state *state, + mbuf_t m) { nstat_msg_set_filter req; nstat_src *src; @@ -4798,7 +4877,7 @@ nstat_control_handle_get_update( TAILQ_FOREACH_SAFE(src, &state->ncs_src_queue, ns_control_link, tmpsrc) { - int gone; + int gone; gone = 0; if (nstat_control_reporting_allowed(state, src)) { @@ -4824,7 +4903,7 @@ nstat_control_handle_get_update( src_count++; } } else if (src->srcref == req.srcref) { - result = nstat_control_send_update(state, src, req.hdr.context, 0, &gone); + result = nstat_control_send_update(state, src, req.hdr.context, 0, 0, &gone); } } @@ -4890,9 +4969,9 @@ nstat_control_handle_subscribe_sysinfo( static errno_t nstat_control_send( kern_ctl_ref kctl, - u_int32_t unit, - void *uinfo, - mbuf_t m, + u_int32_t unit, + void *uinfo, + mbuf_t m, __unused int flags) { nstat_control_state *state = (nstat_control_state*)uinfo; @@ -5001,7 +5080,7 @@ nstat_control_send( static int -tcp_progress_indicators_for_interface(unsigned int ifindex, uint64_t recentflow_maxduration, struct xtcpprogress_indicators *indicators) +tcp_progress_indicators_for_interface(unsigned int ifindex, uint64_t recentflow_maxduration, uint16_t filter_flags, struct xtcpprogress_indicators *indicators) { int error = 0; struct inpcb *inp; @@ -5020,7 +5099,9 @@ tcp_progress_indicators_for_interface(unsigned int ifindex, uint64_t recentflow_ if (tp && inp->inp_last_outifp && inp->inp_last_outifp->if_index == ifindex && inp->inp_state != INPCB_STATE_DEAD && - !(tp->t_flags & TF_LOCAL)) { + ((filter_flags == 0) || + ((filter_flags & NSTAT_IFNET_IS_NON_LOCAL) && !(tp->t_flags & TF_LOCAL)) || + ((filter_flags & NSTAT_IFNET_IS_LOCAL) && (tp->t_flags & TF_LOCAL)))) { struct tcp_conn_status connstatus; indicators->xp_numflows++; tcp_get_connectivity_status(tp, &connstatus); @@ -5077,7 +5158,7 @@ ntstat_tcp_progress_indicators(struct sysctl_req *req) if (error != 0) { return error; } - error = tcp_progress_indicators_for_interface(requested.ifindex, requested.recentflow_maxduration, &indicators); + error = tcp_progress_indicators_for_interface(requested.ifindex, requested.recentflow_maxduration, (uint16_t)requested.filter_flags, &indicators); if (error != 0) { return error; } diff --git a/bsd/net/ntstat.h b/bsd/net/ntstat.h index 7f204fcf3..a5b976061 100644 --- a/bsd/net/ntstat.h +++ b/bsd/net/ntstat.h @@ -45,16 +45,22 @@ typedef u_int64_t nstat_event_flags_t; // The following event definitions are very provisional.. enum{ - NSTAT_EVENT_SRC_ADDED = 0x00000001 - , NSTAT_EVENT_SRC_REMOVED = 0x00000002 - , NSTAT_EVENT_SRC_QUERIED = 0x00000004 - , NSTAT_EVENT_SRC_QUERIED_ALL = 0x00000008 - , NSTAT_EVENT_SRC_WILL_CHANGE_STATE = 0x00000010 - , NSTAT_EVENT_SRC_DID_CHANGE_STATE = 0x00000020 - , NSTAT_EVENT_SRC_WILL_CHANGE_OWNER = 0x00000040 - , NSTAT_EVENT_SRC_DID_CHANGE_OWNER = 0x00000080 + NSTAT_EVENT_SRC_ADDED = 0x00000001 + , NSTAT_EVENT_SRC_REMOVED = 0x00000002 + , NSTAT_EVENT_SRC_QUERIED = 0x00000004 + , NSTAT_EVENT_SRC_QUERIED_ALL = 0x00000008 + , NSTAT_EVENT_SRC_WILL_CHANGE_STATE = 0x00000010 + , NSTAT_EVENT_SRC_DID_CHANGE_STATE = 0x00000020 + , NSTAT_EVENT_SRC_WILL_CHANGE_OWNER = 0x00000040 + , NSTAT_EVENT_SRC_DID_CHANGE_OWNER = 0x00000080 , NSTAT_EVENT_SRC_WILL_CHANGE_PROPERTY = 0x00000100 , NSTAT_EVENT_SRC_DID_CHANGE_PROPERTY = 0x00000200 + , NSTAT_EVENT_SRC_ENTER_CELLFALLBACK = 0x00000400 + , NSTAT_EVENT_SRC_EXIT_CELLFALLBACK = 0x00000800 +#if (DEBUG || DEVELOPMENT) + , NSTAT_EVENT_SRC_RESERVED_1 = 0x00001000 + , NSTAT_EVENT_SRC_RESERVED_2 = 0x00002000 +#endif /* (DEBUG || DEVELOPMENT) */ }; typedef struct nstat_counts { @@ -110,7 +116,7 @@ typedef struct nstat_sysinfo_counts { } nstat_sysinfo_counts; enum{ - NSTAT_SYSINFO_KEY_MBUF_256B_TOTAL = 1 + NSTAT_SYSINFO_KEY_MBUF_256B_TOTAL = 1 , NSTAT_SYSINFO_KEY_MBUF_2KB_TOTAL = 2 , NSTAT_SYSINFO_KEY_MBUF_4KB_TOTAL = 3 , NSTAT_SYSINFO_KEY_SOCK_MBCNT = 4 @@ -325,31 +331,38 @@ enum{ // Interface properties -#define NSTAT_IFNET_IS_UNKNOWN_TYPE 0x01 -#define NSTAT_IFNET_IS_LOOPBACK 0x02 -#define NSTAT_IFNET_IS_CELLULAR 0x04 -#define NSTAT_IFNET_IS_WIFI 0x08 -#define NSTAT_IFNET_IS_WIRED 0x10 -#define NSTAT_IFNET_IS_AWDL 0x20 -#define NSTAT_IFNET_IS_EXPENSIVE 0x40 -#define NSTAT_IFNET_IS_VPN 0x80 -#define NSTAT_IFNET_VIA_CELLFALLBACK 0x100 +#define NSTAT_IFNET_IS_UNKNOWN_TYPE 0x0001 +#define NSTAT_IFNET_IS_LOOPBACK 0x0002 +#define NSTAT_IFNET_IS_CELLULAR 0x0004 +#define NSTAT_IFNET_IS_WIFI 0x0008 +#define NSTAT_IFNET_IS_WIRED 0x0010 +#define NSTAT_IFNET_IS_AWDL 0x0020 +#define NSTAT_IFNET_IS_EXPENSIVE 0x0040 +#define NSTAT_IFNET_IS_VPN 0x0080 +#define NSTAT_IFNET_VIA_CELLFALLBACK 0x0100 +#define NSTAT_IFNET_IS_COMPANIONLINK 0x0200 +#define NSTAT_IFNET_IS_CONSTRAINED 0x0400 +// The following local and non-local flags are set only if fully known +// They are mutually exclusive but there is no guarantee that one or the other will be set +#define NSTAT_IFNET_IS_LOCAL 0x0800 +#define NSTAT_IFNET_IS_NON_LOCAL 0x1000 // Temporary properties of use for bringing up userland providers -#define NSTAT_IFNET_ROUTE_VALUE_UNOBTAINABLE 0x1000 -#define NSTAT_IFNET_FLOWSWITCH_VALUE_UNOBTAINABLE 0x2000 +#define NSTAT_IFNET_ROUTE_VALUE_UNOBTAINABLE 0x2000 +#define NSTAT_IFNET_FLOWSWITCH_VALUE_UNOBTAINABLE 0x4000 -enum{ - NSTAT_PROVIDER_NONE = 0 - , NSTAT_PROVIDER_ROUTE = 1 - , NSTAT_PROVIDER_TCP_KERNEL = 2 +typedef enum { + NSTAT_PROVIDER_NONE = 0 + , NSTAT_PROVIDER_ROUTE = 1 + , NSTAT_PROVIDER_TCP_KERNEL = 2 , NSTAT_PROVIDER_TCP_USERLAND = 3 - , NSTAT_PROVIDER_UDP_KERNEL = 4 + , NSTAT_PROVIDER_UDP_KERNEL = 4 , NSTAT_PROVIDER_UDP_USERLAND = 5 - , NSTAT_PROVIDER_IFNET = 6 - , NSTAT_PROVIDER_SYSINFO = 7 -}; -#define NSTAT_PROVIDER_LAST NSTAT_PROVIDER_SYSINFO + , NSTAT_PROVIDER_IFNET = 6 + , NSTAT_PROVIDER_SYSINFO = 7 + , NSTAT_PROVIDER_QUIC_USERLAND = 8 +} nstat_provider_type_t; +#define NSTAT_PROVIDER_LAST NSTAT_PROVIDER_QUIC_USERLAND #define NSTAT_PROVIDER_COUNT (NSTAT_PROVIDER_LAST+1) typedef struct nstat_route_add_param { @@ -463,6 +476,15 @@ typedef struct nstat_udp_descriptor { u_int8_t reserved[6]; } nstat_udp_descriptor; +/* + * XXX For now just typedef'ing TCP Nstat descriptor to nstat_quic_descriptor + * as for now they report very similar data. + * Later when we extend the QUIC descriptor we can just declare its own + * descriptor struct. + */ +typedef struct nstat_tcp_add_param nstat_quic_add_param; +typedef struct nstat_tcp_descriptor nstat_quic_descriptor; + typedef struct nstat_route_descriptor { u_int64_t id __attribute__((aligned(sizeof(u_int64_t)))); u_int64_t parent_id __attribute__((aligned(sizeof(u_int64_t)))); @@ -666,26 +688,26 @@ typedef struct nstat_sysinfo_add_param { enum{ // generic response messages - NSTAT_MSG_TYPE_SUCCESS = 0 - , NSTAT_MSG_TYPE_ERROR = 1 + NSTAT_MSG_TYPE_SUCCESS = 0 + , NSTAT_MSG_TYPE_ERROR = 1 // Requests - , NSTAT_MSG_TYPE_ADD_SRC = 1001 - , NSTAT_MSG_TYPE_ADD_ALL_SRCS = 1002 - , NSTAT_MSG_TYPE_REM_SRC = 1003 - , NSTAT_MSG_TYPE_QUERY_SRC = 1004 - , NSTAT_MSG_TYPE_GET_SRC_DESC = 1005 - , NSTAT_MSG_TYPE_SET_FILTER = 1006 - , NSTAT_MSG_TYPE_GET_UPDATE = 1007 - , NSTAT_MSG_TYPE_SUBSCRIBE_SYSINFO = 1008 + , NSTAT_MSG_TYPE_ADD_SRC = 1001 + , NSTAT_MSG_TYPE_ADD_ALL_SRCS = 1002 + , NSTAT_MSG_TYPE_REM_SRC = 1003 + , NSTAT_MSG_TYPE_QUERY_SRC = 1004 + , NSTAT_MSG_TYPE_GET_SRC_DESC = 1005 + , NSTAT_MSG_TYPE_SET_FILTER = 1006 + , NSTAT_MSG_TYPE_GET_UPDATE = 1007 + , NSTAT_MSG_TYPE_SUBSCRIBE_SYSINFO = 1008 // Responses/Notfications - , NSTAT_MSG_TYPE_SRC_ADDED = 10001 - , NSTAT_MSG_TYPE_SRC_REMOVED = 10002 - , NSTAT_MSG_TYPE_SRC_DESC = 10003 - , NSTAT_MSG_TYPE_SRC_COUNTS = 10004 - , NSTAT_MSG_TYPE_SYSINFO_COUNTS = 10005 - , NSTAT_MSG_TYPE_SRC_UPDATE = 10006 + , NSTAT_MSG_TYPE_SRC_ADDED = 10001 + , NSTAT_MSG_TYPE_SRC_REMOVED = 10002 + , NSTAT_MSG_TYPE_SRC_DESC = 10003 + , NSTAT_MSG_TYPE_SRC_COUNTS = 10004 + , NSTAT_MSG_TYPE_SYSINFO_COUNTS = 10005 + , NSTAT_MSG_TYPE_SRC_UPDATE = 10006 }; enum{ @@ -700,7 +722,7 @@ enum{ /* Provider-level filters */ enum{ - NSTAT_FILTER_ACCEPT_UNKNOWN = 0x00000001 + NSTAT_FILTER_ACCEPT_UNKNOWN = 0x00000001 , NSTAT_FILTER_ACCEPT_LOOPBACK = 0x00000002 , NSTAT_FILTER_ACCEPT_CELLULAR = 0x00000004 , NSTAT_FILTER_ACCEPT_WIFI = 0x00000008 @@ -708,13 +730,15 @@ enum{ , NSTAT_FILTER_ACCEPT_AWDL = 0x00000020 , NSTAT_FILTER_ACCEPT_EXPENSIVE = 0x00000040 , NSTAT_FILTER_ACCEPT_CELLFALLBACK = 0x00000100 - , NSTAT_FILTER_IFNET_FLAGS = 0x00000FFF + , NSTAT_FILTER_ACCEPT_COMPANIONLINK = 0x00000200 + , NSTAT_FILTER_ACCEPT_IS_CONSTRAINED = 0x00000400 + , NSTAT_FILTER_ACCEPT_IS_LOCAL = 0x00000800 + , NSTAT_FILTER_ACCEPT_IS_NON_LOCAL = 0x00001000 + , NSTAT_FILTER_IFNET_FLAGS = 0x00001FFF - , NSTAT_FILTER_TCP_NO_LISTENER = 0x00001000 - , NSTAT_FILTER_TCP_ONLY_LISTENER = 0x00002000 , NSTAT_FILTER_TCP_INTERFACE_ATTACH = 0x00004000 , NSTAT_FILTER_TCP_NO_EARLY_CLOSE = 0x00008000 - , NSTAT_FILTER_TCP_FLAGS = 0x0000F000 + , NSTAT_FILTER_TCP_FLAGS = 0x0000C000 , NSTAT_FILTER_UDP_INTERFACE_ATTACH = 0x00010000 , NSTAT_FILTER_UDP_FLAGS = 0x000F0000 @@ -778,18 +802,18 @@ typedef struct nstat_msg_add_src_convenient { typedef struct nstat_msg_add_all_srcs { nstat_msg_hdr hdr; - u_int64_t filter __attribute__((aligned(sizeof(u_int64_t)))); + u_int64_t filter __attribute__((aligned(sizeof(u_int64_t)))); nstat_event_flags_t events __attribute__((aligned(sizeof(u_int64_t)))); nstat_provider_id_t provider; - pid_t target_pid; - uuid_t target_uuid; + pid_t target_pid; + uuid_t target_uuid; } nstat_msg_add_all_srcs; typedef struct nstat_msg_src_added { nstat_msg_hdr hdr; nstat_src_ref_t srcref __attribute__((aligned(sizeof(u_int64_t)))); nstat_provider_id_t provider; - u_int8_t reserved[4]; + u_int8_t reserved[4]; } nstat_msg_src_added; typedef struct nstat_msg_rem_src { @@ -805,8 +829,8 @@ typedef struct nstat_msg_get_src_description { typedef struct nstat_msg_set_filter { nstat_msg_hdr hdr; nstat_src_ref_t srcref __attribute__((aligned(sizeof(u_int64_t)))); - u_int32_t filter; - u_int8_t reserved[4]; + u_int32_t filter; + u_int8_t reserved[4]; } nstat_msg_set_filter; #define NSTAT_SRC_DESCRIPTION_FIELDS \ @@ -826,13 +850,14 @@ typedef struct nstat_msg_src_description_header { } nstat_msg_src_description_header; typedef struct nstat_msg_src_description_convenient { - nstat_msg_src_description_header hdr; + nstat_msg_src_description_header hdr; union { - nstat_tcp_descriptor tcp; - nstat_udp_descriptor udp; - nstat_route_descriptor route; - nstat_ifnet_descriptor ifnet; - nstat_sysinfo_descriptor sysinfo; + nstat_tcp_descriptor tcp; + nstat_udp_descriptor udp; + nstat_route_descriptor route; + nstat_ifnet_descriptor ifnet; + nstat_sysinfo_descriptor sysinfo; + nstat_quic_descriptor quic; }; } nstat_msg_src_description_convenient; @@ -875,6 +900,7 @@ typedef struct nstat_msg_src_update_convenient { nstat_route_descriptor route; nstat_ifnet_descriptor ifnet; nstat_sysinfo_descriptor sysinfo; + nstat_quic_descriptor quic; }; } nstat_msg_src_update_convenient; @@ -920,91 +946,91 @@ struct nstat_stats { #pragma mark -- System Information Internal Support -- typedef struct nstat_sysinfo_mbuf_stats { - u_int32_t total_256b; /* Peak usage, 256B pool */ - u_int32_t total_2kb; /* Peak usage, 2KB pool */ - u_int32_t total_4kb; /* Peak usage, 4KB pool */ - u_int32_t total_16kb; /* Peak usage, 16KB pool */ - u_int32_t sbmb_total; /* Total mbufs in sock buffer pool */ - u_int32_t sb_atmbuflimit; /* Memory limit reached for socket buffer autoscaling */ - u_int32_t draincnt; /* Number of times mbuf pool has been drained under memory pressure */ - u_int32_t memreleased; /* Memory (bytes) released from mbuf pool to VM */ - u_int32_t sbmb_floor; /* Lowest mbufs in sock buffer pool */ + u_int32_t total_256b; /* Peak usage, 256B pool */ + u_int32_t total_2kb; /* Peak usage, 2KB pool */ + u_int32_t total_4kb; /* Peak usage, 4KB pool */ + u_int32_t total_16kb; /* Peak usage, 16KB pool */ + u_int32_t sbmb_total; /* Total mbufs in sock buffer pool */ + u_int32_t sb_atmbuflimit; /* Memory limit reached for socket buffer autoscaling */ + u_int32_t draincnt; /* Number of times mbuf pool has been drained under memory pressure */ + u_int32_t memreleased; /* Memory (bytes) released from mbuf pool to VM */ + u_int32_t sbmb_floor; /* Lowest mbufs in sock buffer pool */ } nstat_sysinfo_mbuf_stats; typedef struct nstat_sysinfo_tcp_stats { /* When adding/removing here, also adjust NSTAT_SYSINFO_TCP_STATS_COUNT */ - u_int32_t ipv4_avgrtt; /* Average RTT for IPv4 */ - u_int32_t ipv6_avgrtt; /* Average RTT for IPv6 */ - u_int32_t send_plr; /* Average uplink packet loss rate */ - u_int32_t recv_plr; /* Average downlink packet loss rate */ - u_int32_t send_tlrto_rate; /* Average rxt timeout after tail loss */ - u_int32_t send_reorder_rate; /* Average packet reordering rate */ - u_int32_t connection_attempts; /* TCP client connection attempts */ - u_int32_t connection_accepts; /* TCP server connection accepts */ - u_int32_t ecn_client_enabled; /* Global setting for ECN client side */ - u_int32_t ecn_server_enabled; /* Global setting for ECN server side */ - u_int32_t ecn_client_setup; /* Attempts to setup TCP client connection with ECN */ - u_int32_t ecn_server_setup; /* Attempts to setup TCP server connection with ECN */ - u_int32_t ecn_client_success; /* Number of successful negotiations of ECN for a client connection */ - u_int32_t ecn_server_success; /* Number of successful negotiations of ECN for a server connection */ - u_int32_t ecn_not_supported; /* Number of falbacks to Non-ECN, no support from peer */ - u_int32_t ecn_lost_syn; /* Number of SYNs lost with ECN bits */ - u_int32_t ecn_lost_synack; /* Number of SYN-ACKs lost with ECN bits */ - u_int32_t ecn_recv_ce; /* Number of CEs received from network */ - u_int32_t ecn_recv_ece; /* Number of ECEs received from receiver */ - u_int32_t ecn_sent_ece; /* Number of ECEs sent in response to CE */ - u_int32_t ecn_conn_recv_ce; /* Number of connections using ECN received CE at least once */ - u_int32_t ecn_conn_recv_ece; /* Number of connections using ECN received ECE at least once */ - u_int32_t ecn_conn_plnoce; /* Number of connections using ECN seen packet loss but never received CE */ - u_int32_t ecn_conn_pl_ce; /* Number of connections using ECN seen packet loss and CE */ - u_int32_t ecn_conn_nopl_ce; /* Number of connections using ECN with no packet loss but received CE */ - u_int32_t ecn_fallback_synloss; /* Number of times we did fall back due to SYN-Loss */ - u_int32_t ecn_fallback_reorder; /* Number of times we fallback because we detected the PAWS-issue */ - u_int32_t ecn_fallback_ce; /* Number of times we fallback because we received too many CEs */ - u_int32_t tfo_syn_data_rcv; /* Number of SYN+data received with valid cookie */ - u_int32_t tfo_cookie_req_rcv;/* Number of TFO cookie-requests received */ - u_int32_t tfo_cookie_sent; /* Number of TFO-cookies offered to the client */ - u_int32_t tfo_cookie_invalid;/* Number of invalid TFO-cookies received */ - u_int32_t tfo_cookie_req; /* Number of SYNs with cookie request received*/ - u_int32_t tfo_cookie_rcv; /* Number of SYN/ACKs with Cookie received */ - u_int32_t tfo_syn_data_sent; /* Number of SYNs+data+cookie sent */ - u_int32_t tfo_syn_data_acked;/* Number of times our SYN+data has been acknowledged */ - u_int32_t tfo_syn_loss; /* Number of times SYN+TFO has been lost and we fallback */ - u_int32_t tfo_blackhole; /* Number of times SYN+TFO has been lost and we fallback */ - u_int32_t tfo_cookie_wrong; /* TFO-cookie we sent was wrong */ - u_int32_t tfo_no_cookie_rcv; /* We asked for a cookie but didn't get one */ - u_int32_t tfo_heuristics_disable; /* TFO got disabled due to heuristics */ - u_int32_t tfo_sndblackhole; /* TFO got blackholed in the sending direction */ - u_int32_t mptcp_handover_attempt; /* Total number of MPTCP-attempts using handover mode */ - u_int32_t mptcp_interactive_attempt; /* Total number of MPTCP-attempts using interactive mode */ - u_int32_t mptcp_aggregate_attempt; /* Total number of MPTCP-attempts using aggregate mode */ - u_int32_t mptcp_fp_handover_attempt; /* Same as previous three but only for first-party apps */ - u_int32_t mptcp_fp_interactive_attempt; - u_int32_t mptcp_fp_aggregate_attempt; - u_int32_t mptcp_heuristic_fallback; /* Total number of MPTCP-connections that fell back due to heuristics */ - u_int32_t mptcp_fp_heuristic_fallback; /* Same as previous but for first-party apps */ - u_int32_t mptcp_handover_success_wifi; /* Total number of successfull handover-mode connections that *started* on WiFi */ - u_int32_t mptcp_handover_success_cell; /* Total number of successfull handover-mode connections that *started* on Cell */ - u_int32_t mptcp_interactive_success; /* Total number of interactive-mode connections that negotiated MPTCP */ - u_int32_t mptcp_aggregate_success; /* Same as previous but for aggregate */ - u_int32_t mptcp_fp_handover_success_wifi; /* Same as previous four, but for first-party apps */ - u_int32_t mptcp_fp_handover_success_cell; - u_int32_t mptcp_fp_interactive_success; - u_int32_t mptcp_fp_aggregate_success; - u_int32_t mptcp_handover_cell_from_wifi; /* Total number of connections that use cell in handover-mode (coming from WiFi) */ - u_int32_t mptcp_handover_wifi_from_cell; /* Total number of connections that use WiFi in handover-mode (coming from cell) */ - u_int32_t mptcp_interactive_cell_from_wifi; /* Total number of connections that use cell in interactive mode (coming from WiFi) */ - u_int32_t mptcp_back_to_wifi; /* Total number of connections that succeed to move traffic away from cell (when starting on cell) */ - u_int64_t mptcp_handover_cell_bytes; /* Total number of bytes sent on cell in handover-mode (on new subflows, ignoring initial one) */ - u_int64_t mptcp_interactive_cell_bytes; /* Same as previous but for interactive */ - u_int64_t mptcp_aggregate_cell_bytes; - u_int64_t mptcp_handover_all_bytes; /* Total number of bytes sent in handover */ - u_int64_t mptcp_interactive_all_bytes; - u_int64_t mptcp_aggregate_all_bytes; - u_int32_t mptcp_wifi_proxy; /* Total number of new subflows that fell back to regular TCP on cell */ - u_int32_t mptcp_cell_proxy; /* Total number of new subflows that fell back to regular TCP on WiFi */ - u_int32_t mptcp_triggered_cell; /* Total number of times an MPTCP-connection triggered cell bringup */ - u_int32_t _padding; + u_int32_t ipv4_avgrtt; /* Average RTT for IPv4 */ + u_int32_t ipv6_avgrtt; /* Average RTT for IPv6 */ + u_int32_t send_plr; /* Average uplink packet loss rate */ + u_int32_t recv_plr; /* Average downlink packet loss rate */ + u_int32_t send_tlrto_rate; /* Average rxt timeout after tail loss */ + u_int32_t send_reorder_rate; /* Average packet reordering rate */ + u_int32_t connection_attempts; /* TCP client connection attempts */ + u_int32_t connection_accepts; /* TCP server connection accepts */ + u_int32_t ecn_client_enabled; /* Global setting for ECN client side */ + u_int32_t ecn_server_enabled; /* Global setting for ECN server side */ + u_int32_t ecn_client_setup; /* Attempts to setup TCP client connection with ECN */ + u_int32_t ecn_server_setup; /* Attempts to setup TCP server connection with ECN */ + u_int32_t ecn_client_success; /* Number of successful negotiations of ECN for a client connection */ + u_int32_t ecn_server_success; /* Number of successful negotiations of ECN for a server connection */ + u_int32_t ecn_not_supported; /* Number of falbacks to Non-ECN, no support from peer */ + u_int32_t ecn_lost_syn; /* Number of SYNs lost with ECN bits */ + u_int32_t ecn_lost_synack; /* Number of SYN-ACKs lost with ECN bits */ + u_int32_t ecn_recv_ce; /* Number of CEs received from network */ + u_int32_t ecn_recv_ece; /* Number of ECEs received from receiver */ + u_int32_t ecn_sent_ece; /* Number of ECEs sent in response to CE */ + u_int32_t ecn_conn_recv_ce; /* Number of connections using ECN received CE at least once */ + u_int32_t ecn_conn_recv_ece; /* Number of connections using ECN received ECE at least once */ + u_int32_t ecn_conn_plnoce; /* Number of connections using ECN seen packet loss but never received CE */ + u_int32_t ecn_conn_pl_ce; /* Number of connections using ECN seen packet loss and CE */ + u_int32_t ecn_conn_nopl_ce; /* Number of connections using ECN with no packet loss but received CE */ + u_int32_t ecn_fallback_synloss; /* Number of times we did fall back due to SYN-Loss */ + u_int32_t ecn_fallback_reorder; /* Number of times we fallback because we detected the PAWS-issue */ + u_int32_t ecn_fallback_ce; /* Number of times we fallback because we received too many CEs */ + u_int32_t tfo_syn_data_rcv; /* Number of SYN+data received with valid cookie */ + u_int32_t tfo_cookie_req_rcv;/* Number of TFO cookie-requests received */ + u_int32_t tfo_cookie_sent; /* Number of TFO-cookies offered to the client */ + u_int32_t tfo_cookie_invalid;/* Number of invalid TFO-cookies received */ + u_int32_t tfo_cookie_req; /* Number of SYNs with cookie request received*/ + u_int32_t tfo_cookie_rcv; /* Number of SYN/ACKs with Cookie received */ + u_int32_t tfo_syn_data_sent; /* Number of SYNs+data+cookie sent */ + u_int32_t tfo_syn_data_acked;/* Number of times our SYN+data has been acknowledged */ + u_int32_t tfo_syn_loss; /* Number of times SYN+TFO has been lost and we fallback */ + u_int32_t tfo_blackhole; /* Number of times SYN+TFO has been lost and we fallback */ + u_int32_t tfo_cookie_wrong; /* TFO-cookie we sent was wrong */ + u_int32_t tfo_no_cookie_rcv; /* We asked for a cookie but didn't get one */ + u_int32_t tfo_heuristics_disable; /* TFO got disabled due to heuristics */ + u_int32_t tfo_sndblackhole; /* TFO got blackholed in the sending direction */ + u_int32_t mptcp_handover_attempt; /* Total number of MPTCP-attempts using handover mode */ + u_int32_t mptcp_interactive_attempt; /* Total number of MPTCP-attempts using interactive mode */ + u_int32_t mptcp_aggregate_attempt; /* Total number of MPTCP-attempts using aggregate mode */ + u_int32_t mptcp_fp_handover_attempt; /* Same as previous three but only for first-party apps */ + u_int32_t mptcp_fp_interactive_attempt; + u_int32_t mptcp_fp_aggregate_attempt; + u_int32_t mptcp_heuristic_fallback; /* Total number of MPTCP-connections that fell back due to heuristics */ + u_int32_t mptcp_fp_heuristic_fallback; /* Same as previous but for first-party apps */ + u_int32_t mptcp_handover_success_wifi; /* Total number of successfull handover-mode connections that *started* on WiFi */ + u_int32_t mptcp_handover_success_cell; /* Total number of successfull handover-mode connections that *started* on Cell */ + u_int32_t mptcp_interactive_success; /* Total number of interactive-mode connections that negotiated MPTCP */ + u_int32_t mptcp_aggregate_success; /* Same as previous but for aggregate */ + u_int32_t mptcp_fp_handover_success_wifi; /* Same as previous four, but for first-party apps */ + u_int32_t mptcp_fp_handover_success_cell; + u_int32_t mptcp_fp_interactive_success; + u_int32_t mptcp_fp_aggregate_success; + u_int32_t mptcp_handover_cell_from_wifi; /* Total number of connections that use cell in handover-mode (coming from WiFi) */ + u_int32_t mptcp_handover_wifi_from_cell; /* Total number of connections that use WiFi in handover-mode (coming from cell) */ + u_int32_t mptcp_interactive_cell_from_wifi; /* Total number of connections that use cell in interactive mode (coming from WiFi) */ + u_int32_t mptcp_back_to_wifi; /* Total number of connections that succeed to move traffic away from cell (when starting on cell) */ + u_int64_t mptcp_handover_cell_bytes; /* Total number of bytes sent on cell in handover-mode (on new subflows, ignoring initial one) */ + u_int64_t mptcp_interactive_cell_bytes; /* Same as previous but for interactive */ + u_int64_t mptcp_aggregate_cell_bytes; + u_int64_t mptcp_handover_all_bytes; /* Total number of bytes sent in handover */ + u_int64_t mptcp_interactive_all_bytes; + u_int64_t mptcp_aggregate_all_bytes; + u_int32_t mptcp_wifi_proxy; /* Total number of new subflows that fell back to regular TCP on cell */ + u_int32_t mptcp_cell_proxy; /* Total number of new subflows that fell back to regular TCP on WiFi */ + u_int32_t mptcp_triggered_cell; /* Total number of times an MPTCP-connection triggered cell bringup */ + u_int32_t _padding; /* When adding/removing here, also adjust NSTAT_SYSINFO_TCP_STATS_COUNT */ } nstat_sysinfo_tcp_stats; #define NSTAT_SYSINFO_TCP_STATS_COUNT 71 @@ -1021,25 +1047,25 @@ enum { }; typedef struct nstat_sysinfo_ifnet_ecn_stats { - u_int32_t ifnet_proto; - u_int32_t ifnet_type; - struct if_tcp_ecn_stat ecn_stat; + u_int32_t ifnet_proto; + u_int32_t ifnet_type; + struct if_tcp_ecn_stat ecn_stat; } nstat_sysinfo_ifnet_ecn_stats; /* Total number of Low Internet stats that will be reported */ #define NSTAT_LIM_STAT_KEYVAL_COUNT 12 typedef struct nstat_sysinfo_lim_stats { - u_int8_t ifnet_signature[NSTAT_SYSINFO_KEYVAL_STRING_MAXSIZE]; - u_int32_t ifnet_siglen; - u_int32_t ifnet_type; - struct if_lim_perf_stat lim_stat; + u_int8_t ifnet_signature[NSTAT_SYSINFO_KEYVAL_STRING_MAXSIZE]; + u_int32_t ifnet_siglen; + u_int32_t ifnet_type; + struct if_lim_perf_stat lim_stat; } nstat_sysinfo_lim_stats; #define NSTAT_NET_API_STAT_KEYVAL_COUNT (NSTAT_SYSINFO_API_LAST - NSTAT_SYSINFO_API_FIRST + 1) typedef struct nstat_sysinfo_net_api_stats { - u_int32_t report_interval; - u_int32_t _padding; - struct net_api_stats net_api_stats; + u_int32_t report_interval; + u_int32_t _padding; + struct net_api_stats net_api_stats; } nstat_sysinfo_net_api_stats; typedef struct nstat_sysinfo_data { @@ -1094,6 +1120,7 @@ void nstat_tcp_new_pcb(struct inpcb *inp); void nstat_udp_new_pcb(struct inpcb *inp); void nstat_route_new_entry(struct rtentry *rt); void nstat_pcb_detach(struct inpcb *inp); +void nstat_pcb_event(struct inpcb *inp, u_int64_t event); void nstat_pcb_cache(struct inpcb *inp); void nstat_pcb_invalidate_cache(struct inpcb *inp); diff --git a/bsd/net/nwk_wq.c b/bsd/net/nwk_wq.c index 9eaa778e5..e0e973862 100644 --- a/bsd/net/nwk_wq.c +++ b/bsd/net/nwk_wq.c @@ -112,6 +112,7 @@ nwk_wq_thread_cont(int err) } } +__dead2 static void nwk_wq_thread_func(void *v, wait_result_t w) { diff --git a/bsd/net/packet_mangler.c b/bsd/net/packet_mangler.c index db7b4e643..8e7f41be0 100644 --- a/bsd/net/packet_mangler.c +++ b/bsd/net/packet_mangler.c @@ -50,6 +50,7 @@ #include #include #include +#include #include #include #include @@ -840,6 +841,7 @@ static errno_t pktmnglr_ipfilter_input(void *cookie, mbuf_t *data, int offset, u_int8_t protocol) { struct packet_mangler *p_pkt_mnglr = (struct packet_mangler *)cookie; + struct ip6_hdr ip6; struct ip ip; struct tcphdr tcp; int ip_pld_len; @@ -871,6 +873,14 @@ pktmnglr_ipfilter_input(void *cookie, mbuf_t *data, int offset, u_int8_t protoco goto input_done; } + if (ip.ip_v == 6) { + error = mbuf_copydata(*data, 0, sizeof(ip6), &ip6); + if (error) { + PKT_MNGLR_LOG(LOG_ERR, "Could not make local IPv6 header copy"); + goto input_done; + } + } + if ((p_pkt_mnglr->lsaddr.ss_family == AF_INET6) && (ip.ip_v == 4)) { PKT_MNGLR_LOG(LOG_INFO, "Skipping filtering as address family of packet is IPv4 but local " "address is set to IPv6"); @@ -888,6 +898,11 @@ pktmnglr_ipfilter_input(void *cookie, mbuf_t *data, int offset, u_int8_t protoco if (ip.ip_dst.s_addr != laddr.sin_addr.s_addr) { goto input_done; } + } else if (p_pkt_mnglr->lsaddr.ss_family == AF_INET6) { + struct sockaddr_in6 laddr = *(struct sockaddr_in6 *)(&(p_pkt_mnglr->lsaddr)); + if (!IN6_ARE_ADDR_EQUAL(&ip6.ip6_dst, &laddr.sin6_addr)) { + goto input_done; + } } if (p_pkt_mnglr->rsaddr.ss_family == AF_INET) { @@ -898,13 +913,25 @@ pktmnglr_ipfilter_input(void *cookie, mbuf_t *data, int offset, u_int8_t protoco PKT_MNGLR_LOG(LOG_INFO, "Remote IP: %x Source IP: %x in input path", raddr.sin_addr.s_addr, ip.ip_src.s_addr); + } else if (p_pkt_mnglr->rsaddr.ss_family == AF_INET6) { + struct sockaddr_in6 raddr = *(struct sockaddr_in6 *)(&(p_pkt_mnglr->rsaddr)); + if (!IN6_ARE_ADDR_EQUAL(&ip6.ip6_src, &raddr.sin6_addr)) { + goto input_done; + } } - if (ip.ip_v != 4) { + if (ip.ip_v == 4) { + ip_pld_len = ntohs(ip.ip_len) - (ip.ip_hl << 2); + } else if (ip.ip_v == 6) { + if (ip6.ip6_nxt != p_pkt_mnglr->proto) { + /* Don't support IPv6 extension headers */ + goto input_done; + } + ip_pld_len = ntohs(ip6.ip6_plen) + sizeof(struct ip6_hdr); + } else { goto input_done; } - ip_pld_len = ntohs(ip.ip_len) - (ip.ip_hl << 2); if (protocol != p_pkt_mnglr->proto) { PKT_MNGLR_LOG(LOG_INFO, "Skip: Protocol mismatch"); diff --git a/bsd/net/pf.c b/bsd/net/pf.c index e9fa2a37d..87a392533 100644 --- a/bsd/net/pf.c +++ b/bsd/net/pf.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2007-2018 Apple Inc. All rights reserved. + * Copyright (c) 2007-2019 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -328,12 +328,12 @@ extern struct pool pfr_kentry_pl; extern int path_mtu_discovery; struct pf_pool_limit pf_pool_limits[PF_LIMIT_MAX] = { - { &pf_state_pl, PFSTATE_HIWAT }, - { &pf_app_state_pl, PFAPPSTATE_HIWAT }, - { &pf_src_tree_pl, PFSNODE_HIWAT }, - { &pf_frent_pl, PFFRAG_FRENT_HIWAT }, - { &pfr_ktable_pl, PFR_KTABLE_HIWAT }, - { &pfr_kentry_pl, PFR_KENTRY_HIWAT }, + { .pp = &pf_state_pl, .limit = PFSTATE_HIWAT }, + { .pp = &pf_app_state_pl, .limit = PFAPPSTATE_HIWAT }, + { .pp = &pf_src_tree_pl, .limit = PFSNODE_HIWAT }, + { .pp = &pf_frent_pl, .limit = PFFRAG_FRENT_HIWAT }, + { .pp = &pfr_ktable_pl, .limit = PFR_KTABLE_HIWAT }, + { .pp = &pfr_kentry_pl, .limit = PFR_KENTRY_HIWAT }, }; void * @@ -2381,8 +2381,10 @@ pf_change_addr(struct pf_addr *a, u_int16_t *c, struct pf_addr *an, u_int8_t u, { struct pf_addr ao; - PF_ACPY(&ao, a, af); - PF_ACPY(a, an, afn); + if (af != afn) { + PF_ACPY(&ao, a, af); + PF_ACPY(a, an, afn); + } switch (af) { case AF_INET: @@ -6081,24 +6083,6 @@ pf_is_dummynet_enabled(void) #endif /* DUMMYNET */ } -boolean_t -pf_is_nlc_enabled(void) -{ -#if DUMMYNET - if (__probable(!pf_is_dummynet_enabled())) { - return FALSE; - } - - if (__probable(!is_nlc_enabled_glb)) { - return FALSE; - } - - return TRUE; -#else - return FALSE; -#endif /* DUMMYNET */ -} - #if DUMMYNET /* * When pf_test_dummynet() returns PF_PASS, the rule matching parameter "rm" @@ -6347,7 +6331,6 @@ pf_test_dummynet(struct pf_rule **rm, int direction, struct pfi_kif *kif, dnflow.fwa_ro6_pmtu = fwa->fwa_ro6_pmtu; dnflow.fwa_origifp = fwa->fwa_origifp; dnflow.fwa_mtu = fwa->fwa_mtu; - dnflow.fwa_alwaysfrag = fwa->fwa_alwaysfrag; dnflow.fwa_unfragpartlen = fwa->fwa_unfragpartlen; dnflow.fwa_exthdrs = fwa->fwa_exthdrs; } @@ -7826,7 +7809,7 @@ pf_test_state_icmp(struct pf_state **state, int direction, struct pfi_kif *kif, pf_change_a6(saddr, &pd->hdr.icmp6->icmp6_cksum, &sk->gwy.addr, 0); - if (pf_lazy_makewritable(pd, NULL, + if (pf_lazy_makewritable(pd, pbuf, off + sizeof(struct icmp6_hdr)) == NULL) { return PF_DROP; @@ -9340,6 +9323,7 @@ pf_route6(pbuf_t **pbufp, struct pf_rule *r, int dir, struct ifnet *oifp, struct pf_addr naddr; struct pf_src_node *sn = NULL; int error = 0; + struct pf_mtag *pf_mtag; if (pbufp == NULL || !pbuf_is_valid(*pbufp) || r == NULL || (dir != PF_IN && dir != PF_OUT) || oifp == NULL) { @@ -9388,11 +9372,8 @@ pf_route6(pbuf_t **pbufp, struct pf_rule *r, int dir, struct ifnet *oifp, /* Cheat. XXX why only in the v6addr case??? */ if (r->rt == PF_FASTROUTE) { - struct pf_mtag *pf_mtag; - - if ((pf_mtag = pf_get_mtag(m0)) == NULL) { - goto bad; - } + pf_mtag = pf_get_mtag(m0); + ASSERT(pf_mtag != NULL); pf_mtag->pftag_flags |= PF_TAG_GENERATED; ip6_output(m0, NULL, NULL, 0, NULL, NULL, NULL); return; @@ -9433,6 +9414,24 @@ pf_route6(pbuf_t **pbufp, struct pf_rule *r, int dir, struct ifnet *oifp, "< sizeof (struct ip6_hdr)\n")); goto bad; } + pf_mtag = pf_get_mtag(m0); + /* + * send refragmented packets. + */ + if ((pf_mtag->pftag_flags & PF_TAG_REFRAGMENTED) != 0) { + pf_mtag->pftag_flags &= ~PF_TAG_REFRAGMENTED; + /* + * nd6_output() frees packet chain in both success and + * failure cases. + */ + error = nd6_output(ifp, ifp, m0, dst, NULL, NULL); + m0 = NULL; + if (error) { + DPFPRINTF(PF_DEBUG_URGENT, ("pf_route6:" + "dropped refragmented packet\n")); + } + goto done; + } ip6 = mtod(m0, struct ip6_hdr *); } @@ -9460,6 +9459,7 @@ done: bad: if (m0) { m_freem(m0); + m0 = NULL; } goto done; } @@ -9648,6 +9648,11 @@ pf_test(int dir, struct ifnet *ifp, pbuf_t **pbufp, return PF_PASS; } + if (pbuf->pb_packet_len < (int)sizeof(*h)) { + REASON_SET(&reason, PFRES_SHORT); + return PF_DROP; + } + /* initialize enough of pd for the done label */ h = pbuf->pb_data; pd.mp = pbuf; @@ -9666,13 +9671,6 @@ pf_test(int dir, struct ifnet *ifp, pbuf_t **pbufp, pd.tot_len = ntohs(h->ip_len); pd.eh = eh; - if (pbuf->pb_packet_len < (int)sizeof(*h)) { - action = PF_DROP; - REASON_SET(&reason, PFRES_SHORT); - log = 1; - goto done; - } - #if DUMMYNET if (fwa != NULL && fwa->fwa_pf_rule != NULL) { goto nonormalize; @@ -10209,9 +10207,15 @@ pf_test6(int dir, struct ifnet *ifp, pbuf_t **pbufp, struct pf_pdesc pd; int off, terminal = 0, dirndx, rh_cnt = 0; u_int8_t nxt; + boolean_t fwd = FALSE; LCK_MTX_ASSERT(pf_lock, LCK_MTX_ASSERT_OWNED); + ASSERT(ifp != NULL); + if ((dir == PF_OUT) && (pbuf->pb_ifp) && (ifp != pbuf->pb_ifp)) { + fwd = TRUE; + } + if (!pf_status.running) { return PF_PASS; } @@ -10239,8 +10243,12 @@ pf_test6(int dir, struct ifnet *ifp, pbuf_t **pbufp, return PF_PASS; } - h = pbuf->pb_data; + if (pbuf->pb_packet_len < (int)sizeof(*h)) { + REASON_SET(&reason, PFRES_SHORT); + return PF_DROP; + } + h = pbuf->pb_data; nxt = h->ip6_nxt; off = ((caddr_t)h - (caddr_t)pbuf->pb_data) + sizeof(struct ip6_hdr); pd.mp = pbuf; @@ -10266,13 +10274,6 @@ pf_test6(int dir, struct ifnet *ifp, pbuf_t **pbufp, pd.pktflags = (*pbuf->pb_flags & PKTF_FLOW_MASK); } - if (pbuf->pb_packet_len < (int)sizeof(*h)) { - action = PF_DROP; - REASON_SET(&reason, PFRES_SHORT); - log = 1; - goto done; - } - #if DUMMYNET if (fwa != NULL && fwa->fwa_pf_rule != NULL) { goto nonormalize; @@ -10302,7 +10303,6 @@ nonormalize: goto done; } #endif - pd.src = (struct pf_addr *)(uintptr_t)&h->ip6_src; pd.dst = (struct pf_addr *)(uintptr_t)&h->ip6_dst; PF_ACPY(&pd.baddr, pd.src, AF_INET6); @@ -10322,7 +10322,7 @@ nonormalize: pd.pf_mtag = pf_get_mtag_pbuf(pbuf); do { - switch (nxt) { + switch (pd.proto) { case IPPROTO_FRAGMENT: { struct ip6_frag ip6f; @@ -10336,7 +10336,7 @@ nonormalize: log = 1; goto done; } - pd.proto = nxt = ip6f.ip6f_nxt; + pd.proto = ip6f.ip6f_nxt; #if DUMMYNET /* Traffic goes through dummynet first */ action = pf_test_dummynet(&r, dir, kif, &pbuf, &pd, @@ -10377,7 +10377,7 @@ nonormalize: } else { off += (opt6.ip6e_len + 1) * 8; } - nxt = opt6.ip6e_nxt; + pd.proto = opt6.ip6e_nxt; /* goto the next header */ break; } @@ -10800,11 +10800,17 @@ done: *pbufp = NULL; action = PF_PASS; } else if (r->rt) { - /* pf_route6 can free the mbuf causing *m0 to become NULL */ + /* pf_route6 can free the mbuf causing *pbufp to become NULL */ pf_route6(pbufp, r, dir, kif->pfik_ifp, s, &pd); } #endif /* 0 */ + /* if reassembled packet passed, create new fragments */ + struct pf_fragment_tag *ftag = NULL; + if ((action == PF_PASS) && (*pbufp != NULL) && (fwd) && + ((ftag = pf_find_fragment_tag_pbuf(*pbufp)) != NULL)) { + action = pf_refragment6(ifp, pbufp, ftag); + } return action; } #endif /* INET6 */ @@ -10909,6 +10915,51 @@ pf_get_mtag_pbuf(pbuf_t *pbuf) return pf_find_mtag_pbuf(pbuf); } +struct pf_fragment_tag * +pf_copy_fragment_tag(struct mbuf *m, struct pf_fragment_tag *ftag, int how) +{ + struct m_tag *tag; + struct pf_mtag *pftag = pf_find_mtag(m); + + tag = m_tag_create(KERNEL_MODULE_TAG_ID, KERNEL_TAG_TYPE_PF_REASS, + sizeof(*ftag), how, m); + if (tag == NULL) { + return NULL; + } else { + m_tag_prepend(m, tag); + tag = tag + 1; + } + bcopy(ftag, tag, sizeof(*ftag)); + pftag->pftag_flags |= PF_TAG_REASSEMBLED; + return (struct pf_fragment_tag *)tag; +} + +struct pf_fragment_tag * +pf_find_fragment_tag(struct mbuf *m) +{ + struct m_tag *tag; + struct pf_fragment_tag *ftag; + struct pf_mtag *pftag = pf_find_mtag(m); + + tag = m_tag_locate(m, KERNEL_MODULE_TAG_ID, KERNEL_TAG_TYPE_PF_REASS, + NULL); + VERIFY((tag == NULL) || (pftag->pftag_flags & PF_TAG_REASSEMBLED)); + if (tag != NULL) { + tag = tag + 1; + } + ftag = (struct pf_fragment_tag *)tag; + return ftag; +} + +struct pf_fragment_tag * +pf_find_fragment_tag_pbuf(pbuf_t *pbuf) +{ + struct pf_mtag *mtag = pf_find_mtag_pbuf(pbuf); + + return (mtag->pftag_flags & PF_TAG_REASSEMBLED) ? + pbuf->pb_pf_fragtag : NULL; +} + uint64_t pf_time_second(void) { diff --git a/bsd/net/pf_if.c b/bsd/net/pf_if.c index fad147b5a..4afbfed57 100644 --- a/bsd/net/pf_if.c +++ b/bsd/net/pf_if.c @@ -481,12 +481,19 @@ pfi_instance_add(struct ifnet *ifp, int net, int flags) IFA_UNLOCK(ia); continue; } - if ((flags & PFI_AFLAG_NETWORK) && af == AF_INET6 && + if ((af == AF_INET6) && IN6_IS_ADDR_LINKLOCAL(&((struct sockaddr_in6 *) (void *)ia->ifa_addr)->sin6_addr)) { IFA_UNLOCK(ia); continue; } + if ((af == AF_INET6) && + (((struct in6_ifaddr *)ia)->ia6_flags & + (IN6_IFF_ANYCAST | IN6_IFF_NOTREADY | IN6_IFF_DETACHED | + IN6_IFF_CLAT46 | IN6_IFF_TEMPORARY | IN6_IFF_DEPRECATED))) { + IFA_UNLOCK(ia); + continue; + } if (flags & PFI_AFLAG_NOALIAS) { if (af == AF_INET && got4) { IFA_UNLOCK(ia); diff --git a/bsd/net/pf_ioctl.c b/bsd/net/pf_ioctl.c index 43a8e23c7..560dee4ab 100644 --- a/bsd/net/pf_ioctl.c +++ b/bsd/net/pf_ioctl.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2007-2015 Apple Inc. All rights reserved. + * Copyright (c) 2007-2019 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -80,6 +80,7 @@ #include #include #include +#include #include @@ -189,20 +190,20 @@ static void pf_deleterule_anchor_step_out(struct pf_ruleset **, #define PF_CDEV_MAJOR (-1) static struct cdevsw pf_cdevsw = { - /* open */ pfopen, - /* close */ pfclose, - /* read */ eno_rdwrt, - /* write */ eno_rdwrt, - /* ioctl */ pfioctl, - /* stop */ eno_stop, - /* reset */ eno_reset, - /* tty */ NULL, - /* select */ eno_select, - /* mmap */ eno_mmap, - /* strategy */ eno_strat, - /* getc */ eno_getc, - /* putc */ eno_putc, - /* type */ 0 + .d_open = pfopen, + .d_close = pfclose, + .d_read = eno_rdwrt, + .d_write = eno_rdwrt, + .d_ioctl = pfioctl, + .d_stop = eno_stop, + .d_reset = eno_reset, + .d_ttys = NULL, + .d_select = eno_select, + .d_mmap = eno_mmap, + .d_strategy = eno_strat, + .d_reserved_1 = eno_getc, + .d_reserved_2 = eno_putc, + .d_type = 0 }; static void pf_attach_hooks(void); @@ -224,6 +225,8 @@ int16_t pf_nat64_configured = 0; /* * These are the pf enabled reference counting variables */ +#define NR_TOKENS_LIMIT (INT_MAX / sizeof(struct pfioc_token)) + static u_int64_t pf_enabled_ref_count; static u_int32_t nr_tokens = 0; static u_int64_t pffwrules; @@ -344,6 +347,11 @@ generate_token(struct proc *p) u_int64_t token_value; struct pfioc_kernel_token *new_token; + if (nr_tokens + 1 > NR_TOKENS_LIMIT) { + os_log_error(OS_LOG_DEFAULT, "%s: NR_TOKENS_LIMIT reached", __func__); + return 0; + } + new_token = _MALLOC(sizeof(struct pfioc_kernel_token), M_TEMP, M_WAITOK | M_ZERO); @@ -351,7 +359,7 @@ generate_token(struct proc *p) if (new_token == NULL) { /* malloc failed! bail! */ - printf("%s: unable to allocate pf token structure!", __func__); + os_log_error(OS_LOG_DEFAULT, "%s: unable to allocate pf token structure!", __func__); return 0; } @@ -2292,6 +2300,11 @@ pfioctl_ioc_tokens(u_long cmd, struct pfioc_tokens_32 *tok32, } size = sizeof(struct pfioc_token) * nr_tokens; + if (size / nr_tokens != sizeof(struct pfioc_token)) { + os_log_error(OS_LOG_DEFAULT, "%s: size overflows", __func__); + error = ERANGE; + break; + } ocnt = cnt = (p64 ? tok64->size : tok32->size); if (cnt == 0) { if (p64) { diff --git a/bsd/net/pf_norm.c b/bsd/net/pf_norm.c index 9c28415de..96b85f462 100644 --- a/bsd/net/pf_norm.c +++ b/bsd/net/pf_norm.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2007-2016 Apple Inc. All rights reserved. + * Copyright (c) 2007-2018 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -84,6 +84,7 @@ #if INET6 #include +#include #endif /* INET6 */ #include @@ -98,7 +99,8 @@ struct pf_frent { struct ip6_hdr *fru_ipv6; } fr_u; struct ip6_frag fr_ip6f_opt; - int fr_ip6f_hlen; + uint16_t fr_ip6f_hlen; /* total header length */ + uint16_t fr_ip6f_extoff; /* last extension header offset or 0 */ }; struct pf_frcache { @@ -136,6 +138,7 @@ struct pf_fragment { } fr_u; uint32_t fr_csum_flags; /* checksum flags */ uint32_t fr_csum; /* partial checksum value */ + uint16_t fr_ip6_maxlen; /* maximum length of a single fragment in IPv6 */ }; static TAILQ_HEAD(pf_fragqueue, pf_fragment) pf_fragqueue; @@ -159,19 +162,21 @@ static struct pf_fragment *pf_find_fragment_by_key(struct pf_fragment *, struct pf_frag_tree *); static __inline struct pf_fragment * pf_find_fragment_by_ipv4_header(struct ip *, struct pf_frag_tree *); -static __inline struct pf_fragment * -pf_find_fragment_by_ipv6_header(struct ip6_hdr *, struct ip6_frag *, - struct pf_frag_tree *); static struct mbuf *pf_reassemble(struct mbuf *, struct pf_fragment **, struct pf_frent *, int); static struct mbuf *pf_fragcache(struct mbuf **, struct ip *, struct pf_fragment **, int, int, int *); +static int pf_normalize_tcpopt(struct pf_rule *, int, struct pfi_kif *, + struct pf_pdesc *, pbuf_t *, struct tcphdr *, int, int *); +#if INET6 +static __inline struct pf_fragment * +pf_find_fragment_by_ipv6_header(struct ip6_hdr *, struct ip6_frag *, + struct pf_frag_tree *); static struct mbuf *pf_reassemble6(struct mbuf **, struct pf_fragment **, struct pf_frent *, int); static struct mbuf *pf_frag6cache(struct mbuf **, struct ip6_hdr*, struct ip6_frag *, struct pf_fragment **, int, int, int, int *); -static int pf_normalize_tcpopt(struct pf_rule *, int, struct pfi_kif *, - struct pf_pdesc *, pbuf_t *, struct tcphdr *, int, int *); +#endif /* INET6 */ #define DPFPRINTF(x) do { \ if (pf_status.debug >= PF_DEBUG_MISC) { \ @@ -483,17 +488,7 @@ pf_find_fragment_by_ipv4_header(struct ip *ip, struct pf_frag_tree *tree) return pf_find_fragment_by_key(&key, tree); } -static __inline struct pf_fragment * -pf_find_fragment_by_ipv6_header(struct ip6_hdr *ip6, struct ip6_frag *fh, - struct pf_frag_tree *tree) -{ - struct pf_fragment key; - pf_ip6hdr2key(&key, ip6, fh); - return pf_find_fragment_by_key(&key, tree); -} - /* Removes a fragment from the fragment queue and frees the fragment */ - static void pf_remove_fragment(struct pf_fragment *frag) { @@ -773,7 +768,6 @@ insert: (m->m_pkthdr.pkt_flags & PKTF_LOOP)) { /* loopback checksums are always OK */ m->m_pkthdr.csum_data = 0xffff; - m->m_pkthdr.csum_flags &= ~CSUM_PARTIAL; m->m_pkthdr.csum_flags = CSUM_DATA_VALID | CSUM_PSEUDO_HDR | CSUM_IP_CHECKED | CSUM_IP_VALID; @@ -1120,6 +1114,7 @@ drop_fragment: return NULL; } +#if INET6 #define FR_IP6_OFF(fr) \ (ntohs((fr)->fr_ip6f_opt.ip6f_offlg & IP6F_OFF_MASK)) #define FR_IP6_PLEN(fr) (ntohs((fr)->fr_ip6->ip6_plen)) @@ -1131,7 +1126,7 @@ pf_reassemble6(struct mbuf **m0, struct pf_fragment **frag, struct pf_frent *frea, *frep, *next; struct ip6_hdr *ip6; struct ip6_frag *ip6f; - int plen, off, fr_max; + int plen, off, fr_max, pktlen; uint32_t uoff, csum, csum_flags; VERIFY(*frag == NULL || BUFFER_FRAGMENTS(*frag)); @@ -1142,7 +1137,8 @@ pf_reassemble6(struct mbuf **m0, struct pf_fragment **frag, off = FR_IP6_OFF(frent); uoff = frent->fr_ip6f_hlen; plen = FR_IP6_PLEN(frent); - fr_max = off + plen - (frent->fr_ip6f_hlen - sizeof *ip6); + fr_max = off + plen - (frent->fr_ip6f_hlen - sizeof(*ip6)); + pktlen = plen + sizeof(*ip6); DPFPRINTF(("0x%llx IPv6 frag plen %u off %u fr_ip6f_hlen %u " "fr_max %u m_len %u\n", (uint64_t)VM_KERNEL_ADDRPERM(m), plen, off, @@ -1230,6 +1226,7 @@ pf_reassemble6(struct mbuf **m0, struct pf_fragment **frag, (*frag)->fr_flags = 0; (*frag)->fr_max = 0; + (*frag)->fr_ip6_maxlen = pktlen; (*frag)->fr_af = AF_INET6; (*frag)->fr_srcx.v6addr = frent->fr_ip6->ip6_src; (*frag)->fr_dstx.v6addr = frent->fr_ip6->ip6_dst; @@ -1250,6 +1247,10 @@ pf_reassemble6(struct mbuf **m0, struct pf_fragment **frag, goto insert; } + /* Remember maximum fragment len for refragmentation */ + if (pktlen > (*frag)->fr_ip6_maxlen) { + (*frag)->fr_ip6_maxlen = pktlen; + } /* * If this fragment contains similar checksum offload info * as that of the existing ones, accumulate checksum. Otherwise, @@ -1369,6 +1370,31 @@ insert: return NULL; } + ASSERT(*frag != NULL); + ASSERT(frent != NULL); + next = LIST_NEXT(frent, fr_next); + if (next == NULL) { + DPFPRINTF(("drop: atomic fragment\n")); + pf_free_fragment(*frag); + *frag = NULL; + return NULL; + } + + /* retrieve the values to be filled in to reassembled tag */ + uint16_t hdrlen, unfragpartlen, extoff, maxlen; + uint32_t id; + + /* Get total extension header length from the first fragment */ + hdrlen = frent->fr_ip6f_hlen - sizeof(struct ip6_frag); + /* + * Get total extension header length of per-fragment headers from the + * subsequent fragment. + */ + unfragpartlen = next->fr_ip6f_hlen - sizeof(struct ip6_frag); + extoff = frent->fr_ip6f_extoff; + maxlen = (*frag)->fr_ip6_maxlen; + id = (*frag)->fr_id6; + ip6 = frent->fr_ip6; ip6->ip6_nxt = (*frag)->fr_p; ip6->ip6_plen = htons(off); @@ -1387,7 +1413,6 @@ insert: (m->m_pkthdr.pkt_flags & PKTF_LOOP)) { /* loopback checksums are always OK */ m->m_pkthdr.csum_data = 0xffff; - m->m_pkthdr.csum_flags &= ~CSUM_PARTIAL; m->m_pkthdr.csum_flags = CSUM_DATA_VALID | CSUM_PSEUDO_HDR; } @@ -1414,17 +1439,38 @@ insert: /* XXX this should be done elsewhere */ if (m->m_flags & M_PKTHDR) { - int pktlen = 0; + int len = 0; for (m2 = m; m2; m2 = m2->m_next) { - pktlen += m2->m_len; + len += m2->m_len; } - m->m_pkthdr.len = pktlen; + m->m_pkthdr.len = len; } DPFPRINTF(("complete: 0x%llx ip6_plen %d m_pkthdr.len %d\n", (uint64_t)VM_KERNEL_ADDRPERM(m), ntohs(ip6->ip6_plen), m->m_pkthdr.len)); + /* Add the reassembled tag */ + struct m_tag *mtag; + struct pf_fragment_tag *ftag; + mtag = m_tag_create(KERNEL_MODULE_TAG_ID, KERNEL_TAG_TYPE_PF_REASS, + sizeof(*ftag), M_NOWAIT, m); + if (mtag == NULL) { + /* XXX: add stats */ + m_freem(m); + return NULL; + } + ftag = (struct pf_fragment_tag *)(mtag + 1); + ftag->ft_hdrlen = hdrlen; + ftag->ft_unfragpartlen = unfragpartlen; + ftag->ft_extoff = extoff; + ftag->ft_maxlen = maxlen; + ftag->ft_id = id; + m_tag_prepend(m, mtag); + + struct pf_mtag *pftag = pf_get_mtag(m); + ASSERT(pftag != NULL); + pftag->pftag_flags |= PF_TAG_REASSEMBLED; return m; drop_fragment: @@ -1771,6 +1817,99 @@ drop_fragment: return NULL; } +int +pf_refragment6(struct ifnet *ifp, pbuf_t **pbufp, struct pf_fragment_tag *ftag) +{ + struct mbuf *m; + uint32_t frag_id; + uint16_t hdrlen, extoff, maxlen, unfragpartlen; + uint8_t proto; + int error, action; + uint8_t *lexthdrsp; + struct route_in6 ip6route; + struct route_in6 *ro; + struct sockaddr_in6 *dst; + struct ip6_hdr *hdr; + struct pf_mtag *mtag; + struct m_tag *tag; + + if (pbufp == NULL || !pbuf_is_valid(*pbufp) || ftag == NULL) { + panic("pf_route6: invalid parameters"); + /* NOT REACHED */ + } + m = pbuf_to_mbuf(*pbufp, FALSE); + hdr = mtod(m, struct ip6_hdr *); + mtag = pf_find_mtag(m); + hdrlen = ftag->ft_hdrlen - sizeof(struct ip6_hdr); + extoff = ftag->ft_extoff; + maxlen = ftag->ft_maxlen; + frag_id = ftag->ft_id; + unfragpartlen = ftag->ft_unfragpartlen; + tag = (struct m_tag *)(void *)ftag; + tag = tag - 1; + m_tag_delete(m, tag); + ftag = NULL; + tag = NULL; + mtag->pftag_flags &= ~PF_TAG_REASSEMBLED; + ro = &ip6route; + bzero((caddr_t)ro, sizeof(*ro)); + dst = (struct sockaddr_in6 *)&ro->ro_dst; + dst->sin6_family = AF_INET6; + dst->sin6_len = sizeof(*dst); + dst->sin6_addr = hdr->ip6_dst; + + if (extoff) { + int off; + struct mbuf *mexthdr; + + /* Use protocol from next field of last extension header */ + mexthdr = m_getptr(m, extoff + + offsetof(struct ip6_ext, ip6e_nxt), &off); + ASSERT(mexthdr != NULL); + lexthdrsp = (mtod(mexthdr, uint8_t *) + off); + proto = *lexthdrsp; + if (proto == IPPROTO_DSTOPTS) { + struct ip6_ext ext; + if (!pf_pull_hdr(*pbufp, off, &ext, sizeof(ext), NULL, + NULL, AF_INET6)) { + DPFPRINTF(("pkt too short")); + action = PF_DROP; + goto done; + } + proto = ext.ip6e_nxt; + } + } else { + lexthdrsp = NULL; + proto = hdr->ip6_nxt; + } + + /* + * The MTU must be a multiple of 8 bytes, or we risk doing the + * fragmentation wrong. + */ + maxlen = maxlen & ~7; + + error = ip6_do_fragmentation(&m, hdrlen, NULL, unfragpartlen, + hdr, lexthdrsp, maxlen, proto, frag_id); + + if (error == 0) { + /* + * PF_TAG_REFRAGMENTED flag set to indicate ip6_forward() + * and pf_route6() that the mbuf contains a chain of fragments. + */ + mtag->pftag_flags |= PF_TAG_REFRAGMENTED; + action = PF_PASS; + pbuf_init_mbuf(*pbufp, m, ifp); + } else { + DPFPRINTF(("refragment error %d", error)); + action = PF_DROP; + goto done; + } +done: + return action; +} +#endif /* INET6 */ + int pf_normalize_ip(pbuf_t *pbuf, int dir, struct pfi_kif *kif, u_short *reason, struct pf_pdesc *pd) @@ -2093,22 +2232,29 @@ bad: } #if INET6 +static __inline struct pf_fragment * +pf_find_fragment_by_ipv6_header(struct ip6_hdr *ip6, struct ip6_frag *fh, + struct pf_frag_tree *tree) +{ + struct pf_fragment key; + pf_ip6hdr2key(&key, ip6, fh); + return pf_find_fragment_by_key(&key, tree); +} + int pf_normalize_ip6(pbuf_t *pbuf, int dir, struct pfi_kif *kif, u_short *reason, struct pf_pdesc *pd) { - struct mbuf *m; + struct mbuf *m = NULL; struct pf_rule *r; struct ip6_hdr *h = pbuf->pb_data; + int extoff; int off; struct ip6_ext ext; -/* adi XXX */ -#if 0 struct ip6_opt opt; struct ip6_opt_jumbo jumbo; int optend; int ooff; -#endif struct ip6_frag frag; u_int32_t jumbolen = 0, plen; u_int16_t fragoff = 0; @@ -2172,6 +2318,7 @@ pf_normalize_ip6(pbuf_t *pbuf, int dir, struct pfi_kif *kif, goto drop; } + extoff = 0; off = sizeof(struct ip6_hdr); proto = h->ip6_nxt; terminal = 0; @@ -2187,6 +2334,7 @@ pf_normalize_ip6(pbuf_t *pbuf, int dir, struct pfi_kif *kif, NULL, AF_INET6)) { goto shortpkt; } + extoff = off; /* * * Multiple routing headers not allowed. @@ -2209,16 +2357,15 @@ pf_normalize_ip6(pbuf_t *pbuf, int dir, struct pfi_kif *kif, proto = ext.ip6e_nxt; break; case IPPROTO_HOPOPTS: -/* adi XXX */ -#if 0 - if (!pf_pull_hdr(m, off, &ext, sizeof(ext), NULL, + if (!pf_pull_hdr(pbuf, off, &ext, sizeof(ext), NULL, NULL, AF_INET6)) { goto shortpkt; } + extoff = off; optend = off + (ext.ip6e_len + 1) * 8; ooff = off + sizeof(ext); do { - if (!pf_pull_hdr(m, ooff, &opt.ip6o_type, + if (!pf_pull_hdr(pbuf, ooff, &opt.ip6o_type, sizeof(opt.ip6o_type), NULL, NULL, AF_INET6)) { goto shortpkt; @@ -2227,11 +2374,12 @@ pf_normalize_ip6(pbuf_t *pbuf, int dir, struct pfi_kif *kif, ooff++; continue; } - if (!pf_pull_hdr(m, ooff, &opt, sizeof(opt), + if (!pf_pull_hdr(pbuf, ooff, &opt, sizeof(opt), NULL, NULL, AF_INET6)) { goto shortpkt; } - if (ooff + sizeof(opt) + opt.ip6o_len > optend) { + if ((ooff + (int) sizeof(opt) + opt.ip6o_len) > + optend) { goto drop; } switch (opt.ip6o_type) { @@ -2239,7 +2387,7 @@ pf_normalize_ip6(pbuf_t *pbuf, int dir, struct pfi_kif *kif, if (h->ip6_plen != 0) { goto drop; } - if (!pf_pull_hdr(m, ooff, &jumbo, + if (!pf_pull_hdr(pbuf, ooff, &jumbo, sizeof(jumbo), NULL, NULL, AF_INET6)) { goto shortpkt; @@ -2250,8 +2398,8 @@ pf_normalize_ip6(pbuf_t *pbuf, int dir, struct pfi_kif *kif, if (jumbolen <= IPV6_MAXPACKET) { goto drop; } - if (sizeof(struct ip6_hdr) + - jumbolen != m->m_pkthdr.len) { + if ((sizeof(struct ip6_hdr) + + jumbolen) != pbuf->pb_packet_len) { goto drop; } break; @@ -2264,7 +2412,6 @@ pf_normalize_ip6(pbuf_t *pbuf, int dir, struct pfi_kif *kif, off = optend; proto = ext.ip6e_nxt; break; -#endif default: terminal = 1; break; @@ -2292,10 +2439,11 @@ pf_normalize_ip6(pbuf_t *pbuf, int dir, struct pfi_kif *kif, return PF_PASS; fragment: - if (ntohs(h->ip6_plen) == 0 || jumbolen) { + plen = ntohs(h->ip6_plen); + /* Jumbo payload packets cannot be fragmented */ + if (plen == 0 || jumbolen) { goto drop; } - plen = ntohs(h->ip6_plen); if (!pf_pull_hdr(pbuf, off, &frag, sizeof(frag), NULL, NULL, AF_INET6)) { goto shortpkt; @@ -2303,7 +2451,7 @@ fragment: fragoff = ntohs(frag.ip6f_offlg & IP6F_OFF_MASK); pd->proto = frag.ip6f_nxt; mff = ntohs(frag.ip6f_offlg & IP6F_MORE_FRAG); - off += sizeof frag; + off += sizeof(frag); if (fragoff + (plen - off) > IPV6_MAXPACKET) { goto badfrag; } @@ -2346,7 +2494,16 @@ fragment: frent->fr_ip6 = h; frent->fr_m = m; frent->fr_ip6f_opt = frag; + frent->fr_ip6f_extoff = extoff; frent->fr_ip6f_hlen = off; + /* account for 2nd Destination Options header if present */ + if (pd->proto == IPPROTO_DSTOPTS) { + if (!pf_pull_hdr(pbuf, off, &ext, sizeof(ext), NULL, + NULL, AF_INET6)) { + goto shortpkt; + } + frent->fr_ip6f_hlen += (ext.ip6e_len + 1) * 8; + } /* Might return a completely reassembled mbuf, or NULL */ DPFPRINTF(("reass IPv6 frag %d @ %d-%d\n", @@ -2363,7 +2520,8 @@ fragment: if (pff != NULL && (pff->fr_flags & PFFRAG_DROP)) { goto drop; } - } else if (dir == PF_IN || !(pd->pf_mtag->pftag_flags & PF_TAG_FRAGCACHE)) { + } else if (dir == PF_IN || + !(pd->pf_mtag->pftag_flags & PF_TAG_FRAGCACHE)) { /* non-buffering fragment cache (overlaps: see RFC 5722) */ int nomem = 0; diff --git a/bsd/net/pf_pbuf.c b/bsd/net/pf_pbuf.c index be08224fa..5352a4922 100644 --- a/bsd/net/pf_pbuf.c +++ b/bsd/net/pf_pbuf.c @@ -28,6 +28,7 @@ #include #include #include +#include #include void @@ -90,6 +91,9 @@ pbuf_sync(pbuf_t *pbuf) pbuf->pb_flowid = &m->m_pkthdr.pkt_flowid; pbuf->pb_flags = &m->m_pkthdr.pkt_flags; pbuf->pb_pftag = m_pftag(m); + pbuf->pb_pf_fragtag = pf_find_fragment_tag(m); + ASSERT((pbuf->pb_pf_fragtag == NULL) || + (pbuf->pb_pftag->pftag_flags & PF_TAG_REASSEMBLED)); } else if (pbuf->pb_type == PBUF_TYPE_MEMORY) { struct pbuf_memory *nm = &pbuf->pb_memory; @@ -109,6 +113,7 @@ pbuf_sync(pbuf_t *pbuf) pbuf->pb_flowid = &nm->pm_flowid; pbuf->pb_flags = &nm->pm_flags; pbuf->pb_pftag = &nm->pm_pftag; + pbuf->pb_pf_fragtag = &nm->pm_pf_fragtag; } else { panic("%s: bad pb_type: %d", __func__, pbuf->pb_type); } @@ -125,9 +130,10 @@ pbuf_to_mbuf(pbuf_t *pbuf, boolean_t release_ptr) m = pbuf->pb_mbuf; if (release_ptr) { pbuf->pb_mbuf = NULL; - pbuf_destroy(pbuf); } } else if (pbuf->pb_type == PBUF_TYPE_MEMORY) { + boolean_t fragtag = FALSE; + if (pbuf->pb_packet_len > (u_int)MHLEN) { if (pbuf->pb_packet_len > (u_int)MCLBYTES) { printf("%s: packet too big for cluster (%u)\n", @@ -139,7 +145,7 @@ pbuf_to_mbuf(pbuf_t *pbuf, boolean_t release_ptr) m = m_gethdr(M_DONTWAIT, MT_DATA); } if (m == NULL) { - return NULL; + goto done; } m_copyback(m, 0, pbuf->pb_packet_len, pbuf->pb_data); @@ -153,16 +159,26 @@ pbuf_to_mbuf(pbuf_t *pbuf, boolean_t release_ptr) if (pbuf->pb_pftag != NULL) { struct pf_mtag *pftag = m_pftag(m); - if (pftag != NULL) { - *pftag = *pbuf->pb_pftag; - } + ASSERT(pftag != NULL); + *pftag = *pbuf->pb_pftag; + fragtag = + ((pftag->pftag_flags & PF_TAG_REASSEMBLED) != 0); } - if (release_ptr) { - pbuf_destroy(pbuf); + if (fragtag && pbuf->pb_pf_fragtag != NULL) { + if (pf_copy_fragment_tag(m, pbuf->pb_pf_fragtag, + M_NOWAIT) == NULL) { + m_freem(m); + m = NULL; + goto done; + } } } +done: + if (release_ptr) { + pbuf_destroy(pbuf); + } return m; } @@ -335,7 +351,7 @@ pbuf_copy_back(pbuf_t *pbuf, int off, int len, void *src) if (pbuf->pb_type == PBUF_TYPE_MBUF) { m_copyback(pbuf->pb_mbuf, off, len, src); - } else if (pbuf->pb_type == PBUF_TYPE_MBUF) { + } else if (pbuf->pb_type == PBUF_TYPE_MEMORY) { if (len) { memcpy(&((uint8_t *)pbuf->pb_data)[off], src, len); } @@ -353,7 +369,7 @@ pbuf_copy_data(pbuf_t *pbuf, int off, int len, void *dst) if (pbuf->pb_type == PBUF_TYPE_MBUF) { m_copydata(pbuf->pb_mbuf, off, len, dst); - } else if (pbuf->pb_type == PBUF_TYPE_MBUF) { + } else if (pbuf->pb_type == PBUF_TYPE_MEMORY) { if (len) { memcpy(dst, &((uint8_t *)pbuf->pb_data)[off], len); } diff --git a/bsd/net/pf_pbuf.h b/bsd/net/pf_pbuf.h index fd8f7dd57..232b9b2a1 100644 --- a/bsd/net/pf_pbuf.h +++ b/bsd/net/pf_pbuf.h @@ -51,6 +51,7 @@ struct pbuf_memory { uint32_t pm_flowid; uint32_t pm_flags; struct pf_mtag pm_pftag; + struct pf_fragment_tag pm_pf_fragtag; int (*pm_action)(struct pbuf_memory *, enum pbuf_action); void *pm_action_cookie; }; @@ -74,6 +75,7 @@ typedef struct pbuf { uint32_t *pb_flowid; uint32_t *pb_flags; struct pf_mtag *pb_pftag; + struct pf_fragment_tag *pb_pf_fragtag; struct ifnet *pb_ifp; struct pbuf *pb_next; } pbuf_t; diff --git a/bsd/net/pf_table.c b/bsd/net/pf_table.c index c66741be2..2d172aacc 100644 --- a/bsd/net/pf_table.c +++ b/bsd/net/pf_table.c @@ -1199,13 +1199,10 @@ pfr_walktree(struct radix_node *rn, void *arg) if (w->pfrw_free-- > 0) { struct pfr_astats as; + bzero(&as, sizeof(as)); + pfr_copyout_addr(&as.pfras_a, ke); -#if !defined(__LP64__) - /* Initialized to avoid potential info leak to - * userspace */ - as._pad = 0; -#endif bcopy(ke->pfrke_packets, as.pfras_packets, sizeof(as.pfras_packets)); bcopy(ke->pfrke_bytes, as.pfras_bytes, diff --git a/bsd/net/pfkeyv2.h b/bsd/net/pfkeyv2.h index d10af5141..7a354d17c 100644 --- a/bsd/net/pfkeyv2.h +++ b/bsd/net/pfkeyv2.h @@ -145,10 +145,9 @@ struct sadb_sa_2 { u_int16_t sadb_sa_natt_interval; }; - union { - u_int32_t sadb_reserved1; - u_int16_t sadb_sa_natt_offload_interval; - }; + u_int16_t sadb_sa_natt_offload_interval; +#define SADB_SA_NATT_SRC_PORT 1 + u_int16_t sadb_sa_natt_src_port; }; #endif /* PRIVATE */ @@ -293,9 +292,9 @@ struct sadb_x_policy { * = (sadb_x_policy_len * sizeof(uint64_t) - sizeof(struct sadb_x_policy)) */ #ifdef PRIVATE -/* IPSec Interface Extension: - * IPSec interface can be specified alone, or all three - * of internal, outgoing, and IPSec interfaces must be +/* IPsec Interface Extension: + * IPsec interface can be specified alone, or all three + * of internal, outgoing, and IPsec interfaces must be * specified. */ struct sadb_x_ipsecif { @@ -492,6 +491,7 @@ struct sadb_sastat { #ifdef PRIVATE #define SADB_X_EXT_SA2_DELETE_ON_DETACH 0x0001 +#define SADB_X_EXT_SA2_SEQ_PER_TRAFFIC_CLASS 0x0002 #endif /* SPI size for PF_KEYv2 */ diff --git a/bsd/net/pfvar.h b/bsd/net/pfvar.h index 2eed3f3c3..c82e61d02 100644 --- a/bsd/net/pfvar.h +++ b/bsd/net/pfvar.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2007-2017 Apple Inc. All rights reserved. + * Copyright (c) 2007-2019 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -634,8 +634,8 @@ struct pf_os_fingerprint { #define PF_OSFP_TCPOPT_TS 0x4 /* TCP timestamp option */ #define PF_OSFP_TCPOPT_BITS 3 /* bits used by each option */ #define PF_OSFP_MAX_OPTS \ - (sizeof(((struct pf_os_fingerprint *)0)->fp_tcpopts) * 8) \ - / PF_OSFP_TCPOPT_BITS + ((sizeof(pf_tcpopts_t) * 8) \ + / PF_OSFP_TCPOPT_BITS) SLIST_ENTRY(pf_os_fingerprint) fp_next; }; @@ -1432,8 +1432,7 @@ struct pf_pdesc { struct pf_rule *nat_rule; /* nat/rdr rule applied to packet */ struct pf_addr *src; struct pf_addr *dst; - struct ether_header - *eh; + struct ether_header *eh; pbuf_t *mp; int lmw; /* lazy writable offset */ struct pf_mtag *pf_mtag; @@ -2186,7 +2185,7 @@ extern struct pool pf_app_state_pl; extern struct thread *pf_purge_thread; __private_extern__ void pfinit(void); -__private_extern__ void pf_purge_thread_fn(void *, wait_result_t); +__private_extern__ void pf_purge_thread_fn(void *, wait_result_t) __dead2; __private_extern__ void pf_purge_expired_src_nodes(void); __private_extern__ void pf_purge_expired_states(u_int32_t); __private_extern__ void pf_unlink_state(struct pf_state *); @@ -2212,7 +2211,6 @@ __private_extern__ void pf_rm_rule(struct pf_rulequeue *, struct pf_rule *); struct ip_fw_args; extern boolean_t is_nlc_enabled_glb; -extern boolean_t pf_is_nlc_enabled(void); #if INET __private_extern__ int pf_test(int, struct ifnet *, pbuf_t **, @@ -2229,6 +2227,10 @@ __private_extern__ int pf_test6_mbuf(int, struct ifnet *, struct mbuf **, __private_extern__ void pf_poolmask(struct pf_addr *, struct pf_addr *, struct pf_addr *, struct pf_addr *, u_int8_t); __private_extern__ void pf_addr_inc(struct pf_addr *, sa_family_t); +__private_extern__ int pf_normalize_ip6(pbuf_t *, int, struct pfi_kif *, + u_short *, struct pf_pdesc *); +__private_extern__ int pf_refragment6(struct ifnet *, pbuf_t **, + struct pf_fragment_tag *); #endif /* INET6 */ __private_extern__ void *pf_lazy_makewritable(struct pf_pdesc *, @@ -2254,8 +2256,6 @@ __private_extern__ void pf_normalize_init(void); __private_extern__ int pf_normalize_isempty(void); __private_extern__ int pf_normalize_ip(pbuf_t *, int, struct pfi_kif *, u_short *, struct pf_pdesc *); -__private_extern__ int pf_normalize_ip6(pbuf_t *, int, struct pfi_kif *, - u_short *, struct pf_pdesc *); __private_extern__ int pf_normalize_tcp(int, struct pfi_kif *, pbuf_t *, int, int, void *, struct pf_pdesc *); __private_extern__ void pf_normalize_tcp_cleanup(struct pf_state *); @@ -2413,6 +2413,10 @@ __private_extern__ struct pf_mtag *pf_find_mtag(struct mbuf *); __private_extern__ struct pf_mtag *pf_find_mtag_pbuf(pbuf_t *); __private_extern__ struct pf_mtag *pf_get_mtag(struct mbuf *); __private_extern__ struct pf_mtag *pf_get_mtag_pbuf(pbuf_t *); +__private_extern__ struct pf_fragment_tag * pf_find_fragment_tag_pbuf(pbuf_t *); +__private_extern__ struct pf_fragment_tag * pf_find_fragment_tag(struct mbuf *); +__private_extern__ struct pf_fragment_tag * pf_copy_fragment_tag(struct mbuf *, + struct pf_fragment_tag *, int); #else /* !KERNEL */ extern struct pf_anchor_global pf_anchors; extern struct pf_anchor pf_main_anchor; diff --git a/bsd/net/pktap.c b/bsd/net/pktap.c index da700024f..02340a977 100644 --- a/bsd/net/pktap.c +++ b/bsd/net/pktap.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2012-2018 Apple Inc. All rights reserved. + * Copyright (c) 2012-2019 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -252,11 +252,9 @@ pktap_clone_create(struct if_clone *ifc, u_int32_t unit, __unused void *params) pktap->pktp_filters[1].filter_param_if_type = IFT_IEEE1394; #endif /* CONFIG_EMBEDDED */ -#if (DEVELOPMENT || DEBUG) pktap->pktp_filters[2].filter_op = PKTAP_FILTER_OP_PASS; pktap->pktp_filters[2].filter_param = PKTAP_FILTER_PARAM_IF_TYPE; pktap->pktp_filters[2].filter_param_if_type = IFT_OTHER; -#endif /* DEVELOPMENT || DEBUG */ /* * We do not use a set_bpf_tap() function as we rather rely on the more @@ -786,6 +784,8 @@ pktap_set_procinfo(struct pktap_header *hdr, struct so_procinfo *soprocinfo) if (hdr->pth_comm[0] == 0) { proc_name(soprocinfo->spi_pid, hdr->pth_comm, MAXCOMLEN); } + strlcpy(&hdr->pth_comm[0], &soprocinfo->spi_proc_name[0], sizeof(hdr->pth_comm)); + if (soprocinfo->spi_pid != 0) { uuid_copy(hdr->pth_uuid, soprocinfo->spi_uuid); } @@ -793,9 +793,7 @@ pktap_set_procinfo(struct pktap_header *hdr, struct so_procinfo *soprocinfo) if (soprocinfo->spi_delegated != 0) { hdr->pth_flags |= PTH_FLAG_PROC_DELEGATED; hdr->pth_epid = soprocinfo->spi_epid; - if (hdr->pth_ecomm[0] == 0) { - proc_name(soprocinfo->spi_epid, hdr->pth_ecomm, MAXCOMLEN); - } + strlcpy(&hdr->pth_ecomm[0], &soprocinfo->spi_e_proc_name[0], sizeof(hdr->pth_ecomm)); uuid_copy(hdr->pth_euuid, soprocinfo->spi_euuid); } } @@ -837,8 +835,7 @@ pktap_v2_set_procinfo(struct pktap_v2_hdr *pktap_v2_hdr, char *ptr = ((char *)pktap_v2_hdr) + pktap_v2_hdr->pth_comm_offset; - proc_name(soprocinfo->spi_pid, - ptr, PKTAP_MAX_COMM_SIZE); + strlcpy(ptr, &soprocinfo->spi_proc_name[0], PKTAP_MAX_COMM_SIZE); } if (pktap_v2_hdr->pth_uuid_offset != 0) { uuid_t *ptr = (uuid_t *) (((char *)pktap_v2_hdr) + @@ -864,8 +861,7 @@ pktap_v2_set_procinfo(struct pktap_v2_hdr *pktap_v2_hdr, char *ptr = ((char *)pktap_v2_hdr) + pktap_v2_hdr->pth_e_comm_offset; - proc_name(soprocinfo->spi_epid, - ptr, PKTAP_MAX_COMM_SIZE); + strlcpy(ptr, &soprocinfo->spi_e_proc_name[0], PKTAP_MAX_COMM_SIZE); } if (pktap_v2_hdr->pth_e_uuid_offset != 0) { uuid_t *ptr = (uuid_t *) (((char *)pktap_v2_hdr) + @@ -1213,8 +1209,8 @@ pktap_bpf_tap(struct ifnet *ifp, protocol_family_t proto, struct mbuf *m, hdr->pth_dlt = DLT_APPLE_IP_OVER_IEEE1394; break; case IFT_OTHER: - if (ifp->if_subfamily == IFNET_SUBFAMILY_IPSEC || - ifp->if_subfamily == IFNET_SUBFAMILY_UTUN) { + if (ifp->if_family == IFNET_FAMILY_IPSEC || + ifp->if_family == IFNET_FAMILY_UTUN) { /* * For utun: * - incoming packets do not have the prefix set to four diff --git a/bsd/net/pktsched/pktsched.c b/bsd/net/pktsched/pktsched.c index 4c0d3b7fe..f08febca3 100644 --- a/bsd/net/pktsched/pktsched.c +++ b/bsd/net/pktsched/pktsched.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2011-2017 Apple Inc. All rights reserved. + * Copyright (c) 2011-2019 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -37,6 +37,7 @@ #include #include +#include #include #include #include @@ -46,13 +47,14 @@ #include #include #include +#include #include u_int32_t machclk_freq = 0; u_int64_t machclk_per_sec = 0; -u_int32_t pktsched_verbose; /* more noise if greater than 1 */ +u_int32_t pktsched_verbose = 0; /* more noise if greater than 1 */ static void init_machclk(void); @@ -72,6 +74,7 @@ pktsched_init(void) tcq_init(); qfq_init(); + netem_init(); } static void @@ -225,47 +228,102 @@ pktsched_getqstats(struct ifclassq *ifq, u_int32_t qid, } void -pktsched_pkt_encap(pktsched_pkt_t *pkt, classq_pkt_type_t ptype, void *pp) +pktsched_pkt_encap(pktsched_pkt_t *pkt, classq_pkt_t *cpkt) { - pkt->pktsched_ptype = ptype; - pkt->pktsched_pkt = pp; + pkt->pktsched_pkt = *cpkt; - switch (ptype) { + switch (cpkt->cp_ptype) { case QP_MBUF: pkt->pktsched_plen = - (uint32_t)m_pktlen((struct mbuf *)pkt->pktsched_pkt); + (uint32_t)m_pktlen(pkt->pktsched_pkt_mbuf); break; default: VERIFY(0); /* NOTREACHED */ + __builtin_unreachable(); } } +int +pktsched_clone_pkt(pktsched_pkt_t *pkt1, pktsched_pkt_t *pkt2) +{ + struct mbuf *m1, *m2; + + ASSERT(pkt1 != NULL); + ASSERT(pkt1->pktsched_pkt_mbuf != NULL); + /* allow in place clone, but make sure pkt2->pktsched_pkt won't leak */ + ASSERT((pkt1 == pkt2 && pkt1->pktsched_pkt_mbuf == + pkt2->pktsched_pkt_mbuf) || (pkt1 != pkt2 && + pkt2->pktsched_pkt_mbuf == NULL)); + + switch (pkt1->pktsched_ptype) { + case QP_MBUF: + m1 = (struct mbuf *)pkt1->pktsched_pkt_mbuf; + m2 = m_dup(m1, M_NOWAIT); + if (__improbable(m2 == NULL)) { + return ENOBUFS; + } + pkt2->pktsched_pkt_mbuf = m2; + break; + + + default: + VERIFY(0); + /* NOTREACHED */ + __builtin_unreachable(); + } + + pkt2->pktsched_plen = pkt1->pktsched_plen; + pkt2->pktsched_ptype = pkt1->pktsched_ptype; + return 0; +} + +void +pktsched_corrupt_packet(pktsched_pkt_t *pkt) +{ + struct mbuf *m = NULL; + uint8_t *data = NULL; + uint32_t data_len = 0; + uint32_t rand32, rand_off, rand_bit; + + switch (pkt->pktsched_ptype) { + case QP_MBUF: + m = pkt->pktsched_pkt_mbuf; + data = mtod(m, uint8_t *); + data_len = m->m_pkthdr.len; + break; + + default: + /* NOTREACHED */ + VERIFY(0); + __builtin_unreachable(); + } + + read_frandom(&rand32, sizeof(rand32)); + rand_bit = rand32 & 0x8; + rand_off = (rand32 >> 3) % data_len; + data[rand_off] ^= 1 << rand_bit; +} + void pktsched_free_pkt(pktsched_pkt_t *pkt) { switch (pkt->pktsched_ptype) { case QP_MBUF: - m_freem(pkt->pktsched_pkt); + m_freem(pkt->pktsched_pkt_mbuf); break; default: VERIFY(0); /* NOTREACHED */ + __builtin_unreachable(); } - pkt->pktsched_pkt = NULL; + pkt->pktsched_pkt = CLASSQ_PKT_INITIALIZER(pkt->pktsched_pkt); pkt->pktsched_plen = 0; - pkt->pktsched_ptype = 0; -} - -uint32_t -pktsched_get_pkt_len(pktsched_pkt_t *pkt) -{ - return pkt->pktsched_plen; } mbuf_svc_class_t @@ -275,27 +333,27 @@ pktsched_get_pkt_svc(pktsched_pkt_t *pkt) switch (pkt->pktsched_ptype) { case QP_MBUF: - svc = m_get_service_class((mbuf_t)pkt->pktsched_pkt); + svc = m_get_service_class(pkt->pktsched_pkt_mbuf); break; default: VERIFY(0); /* NOTREACHED */ + __builtin_unreachable(); } return svc; } void -pktsched_get_pkt_vars(pktsched_pkt_t *pkt, uint32_t **flags, +pktsched_get_pkt_vars(pktsched_pkt_t *pkt, volatile uint32_t **flags, uint64_t **timestamp, uint32_t *flowid, uint8_t *flowsrc, uint8_t *proto, uint32_t *tcp_start_seq) { switch (pkt->pktsched_ptype) { case QP_MBUF: { - struct mbuf *m = (struct mbuf *)pkt->pktsched_pkt; - struct pkthdr *pkth = &m->m_pkthdr; + struct pkthdr *pkth = &(pkt->pktsched_pkt_mbuf->m_pkthdr); if (flags != NULL) { *flags = &pkth->pkt_flags; @@ -327,6 +385,7 @@ pktsched_get_pkt_vars(pktsched_pkt_t *pkt, uint32_t **flags, default: VERIFY(0); /* NOTREACHED */ + __builtin_unreachable(); } } @@ -338,7 +397,7 @@ pktsched_alloc_fcentry(pktsched_pkt_t *pkt, struct ifnet *ifp, int how) switch (pkt->pktsched_ptype) { case QP_MBUF: { - struct mbuf *m = (struct mbuf *)pkt->pktsched_pkt; + struct mbuf *m = pkt->pktsched_pkt_mbuf; fce = flowadv_alloc_entry(how); if (fce == NULL) { @@ -357,6 +416,7 @@ pktsched_alloc_fcentry(pktsched_pkt_t *pkt, struct ifnet *ifp, int how) default: VERIFY(0); /* NOTREACHED */ + __builtin_unreachable(); } return fce; @@ -369,12 +429,10 @@ pktsched_get_pkt_sfb_vars(pktsched_pkt_t *pkt, uint32_t **sfb_flags) switch (pkt->pktsched_ptype) { case QP_MBUF: { - struct mbuf *m = (struct mbuf *)pkt->pktsched_pkt; - struct pkthdr *pkth = &m->m_pkthdr; + struct pkthdr *pkth = &(pkt->pktsched_pkt_mbuf->m_pkthdr); _CASSERT(sizeof(pkth->pkt_mpriv_hash) == sizeof(uint32_t)); _CASSERT(sizeof(pkth->pkt_mpriv_flags) == sizeof(uint32_t)); - *sfb_flags = &pkth->pkt_mpriv_flags; hashp = &pkth->pkt_mpriv_hash; break; @@ -384,6 +442,7 @@ pktsched_get_pkt_sfb_vars(pktsched_pkt_t *pkt, uint32_t **sfb_flags) default: VERIFY(0); /* NOTREACHED */ + __builtin_unreachable(); } return hashp; diff --git a/bsd/net/pktsched/pktsched.h b/bsd/net/pktsched/pktsched.h index b094eb623..624e2e58d 100644 --- a/bsd/net/pktsched/pktsched.h +++ b/bsd/net/pktsched/pktsched.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2011-2016 Apple Inc. All rights reserved. + * Copyright (c) 2011-2018 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -58,18 +58,18 @@ extern "C" { #define PKTSCHEDF_QALG_DRIVER_MANAGED 0x10 /* driver managed */ typedef struct _pktsched_pkt_ { - classq_pkt_type_t __ptype; + classq_pkt_t __pkt; uint32_t __plen; - void *__pkt; -#define pktsched_ptype __ptype +#define pktsched_ptype __pkt.cp_ptype #define pktsched_plen __plen #define pktsched_pkt __pkt +#define pktsched_pkt_mbuf __pkt.cp_mbuf +#define pktsched_pkt_kpkt __pkt.cp_kpkt } pktsched_pkt_t; -#define _PKTSCHED_PKT_INIT(_p) do { \ - (_p)->pktsched_ptype = QP_INVALID; \ - (_p)->pktsched_plen = 0; \ - (_p)->pktsched_pkt = NULL; \ +#define _PKTSCHED_PKT_INIT(_p) do { \ + (_p)->pktsched_pkt = CLASSQ_PKT_INITIALIZER((_p)->pktsched_pkt);\ + (_p)->pktsched_plen = 0; \ } while (0) /* macro for timeout/untimeout */ @@ -137,6 +137,12 @@ __fls(pktsched_bitmap_t word) return pktsched_fls(word) - 1; } +static inline uint32_t +pktsched_get_pkt_len(pktsched_pkt_t *pkt) +{ + return pkt->pktsched_plen; +} + /* * We can use mach_absolute_time which returns a 64-bit value with * granularity less than a microsecond even on the slowest processor. @@ -164,11 +170,12 @@ extern int pktsched_getqstats(struct ifclassq *, u_int32_t, extern u_int64_t pktsched_abs_to_nsecs(u_int64_t); extern u_int64_t pktsched_nsecs_to_abstime(u_int64_t); extern void pktsched_free_pkt(pktsched_pkt_t *); -extern uint32_t pktsched_get_pkt_len(pktsched_pkt_t *); -extern void pktsched_get_pkt_vars(pktsched_pkt_t *, uint32_t **, uint64_t **, - uint32_t *, uint8_t *, uint8_t *, uint32_t *); +extern int pktsched_clone_pkt(pktsched_pkt_t *, pktsched_pkt_t *); +extern void pktsched_corrupt_packet(pktsched_pkt_t *pkt); +extern void pktsched_get_pkt_vars(pktsched_pkt_t *, volatile uint32_t **, + uint64_t **, uint32_t *, uint8_t *, uint8_t *, uint32_t *); extern uint32_t *pktsched_get_pkt_sfb_vars(pktsched_pkt_t *, uint32_t **); -extern void pktsched_pkt_encap(pktsched_pkt_t *, classq_pkt_type_t, void *); +extern void pktsched_pkt_encap(pktsched_pkt_t *, classq_pkt_t *); extern mbuf_svc_class_t pktsched_get_pkt_svc(pktsched_pkt_t *); extern struct flowadv_fcentry *pktsched_alloc_fcentry(pktsched_pkt_t *, struct ifnet *, int); diff --git a/bsd/net/pktsched/pktsched_fq_codel.c b/bsd/net/pktsched/pktsched_fq_codel.c index b6cd0c67f..e523e8096 100644 --- a/bsd/net/pktsched/pktsched_fq_codel.c +++ b/bsd/net/pktsched/pktsched_fq_codel.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2017 Apple Inc. All rights reserved. + * Copyright (c) 2016-2019 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -42,19 +42,18 @@ static fq_if_t *fq_if_alloc(struct ifnet *, classq_pkt_type_t); static void fq_if_destroy(fq_if_t *fqs); static void fq_if_classq_init(fq_if_t *fqs, u_int32_t priority, u_int32_t quantum, u_int32_t drr_max, u_int32_t svc_class); -static int fq_if_enqueue_classq(struct ifclassq *ifq, void *p, - classq_pkt_type_t ptype, boolean_t *pdrop); -static void *fq_if_dequeue_classq(struct ifclassq *, classq_pkt_type_t *); +static int fq_if_enqueue_classq(struct ifclassq *, classq_pkt_t *, boolean_t *); +static void fq_if_dequeue_classq(struct ifclassq *, classq_pkt_t *); static int fq_if_dequeue_classq_multi(struct ifclassq *, u_int32_t, - u_int32_t, void **, void **, u_int32_t *, u_int32_t *, classq_pkt_type_t *); -static void *fq_if_dequeue_sc_classq(struct ifclassq *, mbuf_svc_class_t, - classq_pkt_type_t *); + u_int32_t, classq_pkt_t *, classq_pkt_t *, u_int32_t *, u_int32_t *); +static void fq_if_dequeue_sc_classq(struct ifclassq *, mbuf_svc_class_t, + classq_pkt_t *); static int fq_if_dequeue_sc_classq_multi(struct ifclassq *, - mbuf_svc_class_t, u_int32_t, u_int32_t, void **, - void **, u_int32_t *, u_int32_t *, classq_pkt_type_t *); + mbuf_svc_class_t, u_int32_t, u_int32_t, classq_pkt_t *, + classq_pkt_t *, u_int32_t *, u_int32_t *); static void fq_if_dequeue(fq_if_t *, fq_if_classq_t *, u_int32_t, - u_int32_t, void **, void **, u_int32_t *, u_int32_t *, - boolean_t drvmgmt, classq_pkt_type_t *); + u_int32_t, classq_pkt_t *, classq_pkt_t *, u_int32_t *, + u_int32_t *, boolean_t drvmgmt); static int fq_if_request_classq(struct ifclassq *ifq, cqrq_t op, void *arg); void fq_if_stat_sc(fq_if_t *fqs, cqrq_stat_sc_t *stat); static void fq_if_purge(fq_if_t *); @@ -75,26 +74,25 @@ static void fq_if_empty_old_flow(fq_if_t *fqs, fq_if_classq_t *fq_cl, (STAILQ_EMPTY(&(_fcl_)->fcl_new_flows) && \ STAILQ_EMPTY(&(_fcl_)->fcl_old_flows)) -typedef void (* fq_if_append_pkt_t)(void *, void *); +typedef void (* fq_if_append_pkt_t)(classq_pkt_t *, classq_pkt_t *); typedef boolean_t (* fq_getq_flow_t)(fq_if_t *, fq_if_classq_t *, fq_t *, - u_int32_t, u_int32_t, void **, void **, u_int32_t *, u_int32_t *, - boolean_t *, u_int32_t); + u_int32_t, u_int32_t, classq_pkt_t *, classq_pkt_t *, u_int32_t *, + u_int32_t *, boolean_t *, u_int32_t); static void -fq_if_append_mbuf(void *pkt, void *next_pkt) +fq_if_append_mbuf(classq_pkt_t *pkt, classq_pkt_t *next_pkt) { - ((mbuf_t)pkt)->m_nextpkt = (mbuf_t)next_pkt; + pkt->cp_mbuf->m_nextpkt = next_pkt->cp_mbuf; } static boolean_t fq_getq_flow_mbuf(fq_if_t *fqs, fq_if_classq_t *fq_cl, fq_t *fq, - u_int32_t byte_limit, u_int32_t pkt_limit, void **top, void **last, - u_int32_t *byte_cnt, u_int32_t *pkt_cnt, boolean_t *qempty, - u_int32_t pflags) + u_int32_t byte_limit, u_int32_t pkt_limit, classq_pkt_t *top, + classq_pkt_t *last, u_int32_t *byte_cnt, u_int32_t *pkt_cnt, + boolean_t *qempty, u_int32_t pflags) { - struct mbuf *m; u_int32_t plen; pktsched_pkt_t pkt; boolean_t limit_reached = FALSE; @@ -104,28 +102,28 @@ fq_getq_flow_mbuf(fq_if_t *fqs, fq_if_classq_t *fq_cl, fq_t *fq, while (fq->fq_deficit > 0 && limit_reached == FALSE && !MBUFQ_EMPTY(&fq->fq_mbufq)) { _PKTSCHED_PKT_INIT(&pkt); - m = fq_getq_flow(fqs, fq, &pkt); + fq_getq_flow(fqs, fq, &pkt); ASSERT(pkt.pktsched_ptype == QP_MBUF); plen = pktsched_get_pkt_len(&pkt); fq->fq_deficit -= plen; - m->m_pkthdr.pkt_flags |= pflags; + pkt.pktsched_pkt_mbuf->m_pkthdr.pkt_flags |= pflags; - if (*top == NULL) { - *top = m; + if (top->cp_mbuf == NULL) { + *top = pkt.pktsched_pkt; } else { - ASSERT(*last != NULL); - ASSERT((*(struct mbuf **)last)->m_nextpkt == NULL); - (*(struct mbuf **)last)->m_nextpkt = m; + ASSERT(last->cp_mbuf != NULL); + ASSERT(last->cp_mbuf->m_nextpkt == NULL); + last->cp_mbuf->m_nextpkt = pkt.pktsched_pkt_mbuf; } - *last = m; - (*(mbuf_t *)last)->m_nextpkt = NULL; + *last = pkt.pktsched_pkt; + last->cp_mbuf->m_nextpkt = NULL; fq_cl->fcl_stat.fcl_dequeue++; fq_cl->fcl_stat.fcl_dequeue_bytes += plen; *pkt_cnt += 1; *byte_cnt += plen; - ifclassq_set_packet_metadata(ifq, ifp, m, QP_MBUF); + ifclassq_set_packet_metadata(ifq, ifp, &pkt.pktsched_pkt); /* Check if the limit is reached */ if (*pkt_cnt >= pkt_limit || *byte_cnt >= byte_limit) { @@ -267,11 +265,10 @@ fq_if_classq_init(fq_if_t *fqs, u_int32_t pri, u_int32_t quantum, u_int32_t drr_max, u_int32_t svc_class) { fq_if_classq_t *fq_cl; - + VERIFY(pri < FQ_IF_MAX_CLASSES); fq_cl = &fqs->fqs_classq[pri]; - VERIFY(pri >= 0 && pri < FQ_IF_MAX_CLASSES && - fq_cl->fcl_quantum == 0); + VERIFY(fq_cl->fcl_quantum == 0); fq_cl->fcl_quantum = quantum; fq_cl->fcl_pri = pri; fq_cl->fcl_drr_max = drr_max; @@ -281,8 +278,7 @@ fq_if_classq_init(fq_if_t *fqs, u_int32_t pri, u_int32_t quantum, } int -fq_if_enqueue_classq(struct ifclassq *ifq, void *p, classq_pkt_type_t ptype, - boolean_t *pdrop) +fq_if_enqueue_classq(struct ifclassq *ifq, classq_pkt_t *p, boolean_t *pdrop) { u_int32_t pri; fq_if_t *fqs; @@ -292,18 +288,19 @@ fq_if_enqueue_classq(struct ifclassq *ifq, void *p, classq_pkt_type_t ptype, pktsched_pkt_t pkt; IFCQ_LOCK_ASSERT_HELD(ifq); - if ((ptype == QP_MBUF) && !(((mbuf_t)p)->m_flags & M_PKTHDR)) { + if ((p->cp_ptype == QP_MBUF) && !(p->cp_mbuf->m_flags & M_PKTHDR)) { IFCQ_CONVERT_LOCK(ifq); - m_freem((mbuf_t)p); + m_freem(p->cp_mbuf); + *p = CLASSQ_PKT_INITIALIZER(*p); *pdrop = TRUE; return ENOBUFS; } - pktsched_pkt_encap(&pkt, ptype, p); + pktsched_pkt_encap(&pkt, p); fqs = (fq_if_t *)ifq->ifcq_disc; svc = pktsched_get_pkt_svc(&pkt); pri = fq_if_service_to_priority(fqs, svc); - VERIFY(pri >= 0 && pri < FQ_IF_MAX_CLASSES); + VERIFY(pri < FQ_IF_MAX_CLASSES); fq_cl = &fqs->fqs_classq[pri]; if (svc == MBUF_SC_BK_SYS && fqs->fqs_throttle == 1) { @@ -357,21 +354,17 @@ fq_if_enqueue_classq(struct ifclassq *ifq, void *p, classq_pkt_type_t ptype, return ret; } -static void * -fq_if_dequeue_classq(struct ifclassq *ifq, classq_pkt_type_t *ptype) +static void +fq_if_dequeue_classq(struct ifclassq *ifq, classq_pkt_t *pkt) { - void *top; - (void) fq_if_dequeue_classq_multi(ifq, 1, - CLASSQ_DEQUEUE_MAX_BYTE_LIMIT, &top, NULL, NULL, NULL, ptype); - return top; + CLASSQ_DEQUEUE_MAX_BYTE_LIMIT, pkt, NULL, NULL, NULL); } -static void * +static void fq_if_dequeue_sc_classq(struct ifclassq *ifq, mbuf_svc_class_t svc, - classq_pkt_type_t *ptype) + classq_pkt_t *pkt) { - void *top; fq_if_t *fqs = (fq_if_t *)ifq->ifcq_disc; fq_if_classq_t *fq_cl; u_int32_t pri; @@ -380,22 +373,23 @@ fq_if_dequeue_sc_classq(struct ifclassq *ifq, mbuf_svc_class_t svc, fq_cl = &fqs->fqs_classq[pri]; fq_if_dequeue(fqs, fq_cl, 1, CLASSQ_DEQUEUE_MAX_BYTE_LIMIT, - &top, NULL, NULL, NULL, TRUE, ptype); - return top; + pkt, NULL, NULL, NULL, TRUE); } int fq_if_dequeue_classq_multi(struct ifclassq *ifq, u_int32_t maxpktcnt, - u_int32_t maxbytecnt, void **first_packet, - void **last_packet, u_int32_t *retpktcnt, u_int32_t *retbytecnt, - classq_pkt_type_t *ptype) + u_int32_t maxbytecnt, classq_pkt_t *first_packet, + classq_pkt_t *last_packet, u_int32_t *retpktcnt, + u_int32_t *retbytecnt) { - void *top = NULL, *tail = NULL, *first, *last; - u_int32_t pktcnt = 0, bytecnt = 0, total_pktcnt, total_bytecnt; - fq_if_t *fqs; + u_int32_t pktcnt = 0, bytecnt = 0, total_pktcnt = 0, total_bytecnt = 0; + classq_pkt_t first = CLASSQ_PKT_INITIALIZER(fisrt); + classq_pkt_t last = CLASSQ_PKT_INITIALIZER(last); + classq_pkt_t tmp = CLASSQ_PKT_INITIALIZER(tmp); + fq_if_append_pkt_t append_pkt; fq_if_classq_t *fq_cl; + fq_if_t *fqs; int pri; - fq_if_append_pkt_t append_pkt; IFCQ_LOCK_ASSERT_HELD(ifq); @@ -410,14 +404,13 @@ fq_if_dequeue_classq_multi(struct ifclassq *ifq, u_int32_t maxpktcnt, default: VERIFY(0); /* NOTREACHED */ + __builtin_unreachable(); } - first = last = NULL; - total_pktcnt = total_bytecnt = 0; - *ptype = fqs->fqs_ptype; - for (;;) { - classq_pkt_type_t tmp_ptype; + classq_pkt_t top = CLASSQ_PKT_INITIALIZER(top); + classq_pkt_t tail = CLASSQ_PKT_INITIALIZER(tail); + if (fqs->fqs_bitmaps[FQ_IF_ER] == 0 && fqs->fqs_bitmaps[FQ_IF_EB] == 0) { fqs->fqs_bitmaps[FQ_IF_EB] = fqs->fqs_bitmaps[FQ_IF_IB]; @@ -454,22 +447,21 @@ fq_if_dequeue_classq_multi(struct ifclassq *ifq, u_int32_t maxpktcnt, } fq_if_dequeue(fqs, fq_cl, (maxpktcnt - total_pktcnt), (maxbytecnt - total_bytecnt), &top, &tail, &pktcnt, - &bytecnt, FALSE, &tmp_ptype); - if (top != NULL) { - ASSERT(tmp_ptype == *ptype); + &bytecnt, FALSE); + if (top.cp_mbuf != NULL) { ASSERT(pktcnt > 0 && bytecnt > 0); - if (first == NULL) { + if (first.cp_mbuf == NULL) { first = top; - last = tail; total_pktcnt = pktcnt; total_bytecnt = bytecnt; } else { - append_pkt(last, top); - last = tail; + ASSERT(last.cp_mbuf != NULL); + append_pkt(&last, &top); total_pktcnt += pktcnt; total_bytecnt += bytecnt; } - append_pkt(last, NULL); + last = tail; + append_pkt(&last, &tmp); fq_cl->fcl_budget -= bytecnt; pktcnt = 0; bytecnt = 0; @@ -498,49 +490,35 @@ state_change: break; } } - if (first != NULL) { - if (first_packet != NULL) { - *first_packet = first; - } - if (last_packet != NULL) { - *last_packet = last; - } - if (retpktcnt != NULL) { - *retpktcnt = total_pktcnt; - } - if (retbytecnt != NULL) { - *retbytecnt = total_bytecnt; - } - IFCQ_XMIT_ADD(ifq, total_pktcnt, total_bytecnt); - } else { - if (first_packet != NULL) { - *first_packet = NULL; - } - if (last_packet != NULL) { - *last_packet = NULL; - } - if (retpktcnt != NULL) { - *retpktcnt = 0; - } - if (retbytecnt != NULL) { - *retbytecnt = 0; - } + + if (__probable(first_packet != NULL)) { + *first_packet = first; + } + if (last_packet != NULL) { + *last_packet = last; } + if (retpktcnt != NULL) { + *retpktcnt = total_pktcnt; + } + if (retbytecnt != NULL) { + *retbytecnt = total_bytecnt; + } + + IFCQ_XMIT_ADD(ifq, total_pktcnt, total_bytecnt); return 0; } int fq_if_dequeue_sc_classq_multi(struct ifclassq *ifq, mbuf_svc_class_t svc, - u_int32_t maxpktcnt, u_int32_t maxbytecnt, void **first_packet, - void **last_packet, u_int32_t *retpktcnt, u_int32_t *retbytecnt, - classq_pkt_type_t *ptype) + u_int32_t maxpktcnt, u_int32_t maxbytecnt, classq_pkt_t *first_packet, + classq_pkt_t *last_packet, u_int32_t *retpktcnt, u_int32_t *retbytecnt) { -#pragma unused(maxpktcnt, maxbytecnt, first_packet, last_packet, retpktcnt, retbytecnt) fq_if_t *fqs = (fq_if_t *)ifq->ifcq_disc; u_int32_t pri; u_int32_t total_pktcnt = 0, total_bytecnt = 0; fq_if_classq_t *fq_cl; - void *first = NULL, *last = NULL; + classq_pkt_t first = CLASSQ_PKT_INITIALIZER(fisrt); + classq_pkt_t last = CLASSQ_PKT_INITIALIZER(last); fq_if_append_pkt_t append_pkt; switch (fqs->fqs_ptype) { @@ -552,11 +530,11 @@ fq_if_dequeue_sc_classq_multi(struct ifclassq *ifq, mbuf_svc_class_t svc, default: VERIFY(0); /* NOTREACHED */ + __builtin_unreachable(); } pri = fq_if_service_to_priority(fqs, svc); fq_cl = &fqs->fqs_classq[pri]; - /* * Now we have the queue for a particular service class. We need * to dequeue as many packets as needed, first from the new flows @@ -564,49 +542,41 @@ fq_if_dequeue_sc_classq_multi(struct ifclassq *ifq, mbuf_svc_class_t svc, */ while (total_pktcnt < maxpktcnt && total_bytecnt < maxbytecnt && fq_cl->fcl_stat.fcl_pkt_cnt > 0) { - void *top, *tail; + classq_pkt_t top = CLASSQ_PKT_INITIALIZER(top); + classq_pkt_t tail = CLASSQ_PKT_INITIALIZER(tail); u_int32_t pktcnt = 0, bytecnt = 0; + fq_if_dequeue(fqs, fq_cl, (maxpktcnt - total_pktcnt), (maxbytecnt - total_bytecnt), &top, &tail, &pktcnt, - &bytecnt, TRUE, ptype); - if (first == NULL) { - first = top; - total_pktcnt = pktcnt; - total_bytecnt = bytecnt; - } else { - append_pkt(last, top); - total_pktcnt += pktcnt; - total_bytecnt += bytecnt; + &bytecnt, TRUE); + if (top.cp_mbuf != NULL) { + if (first.cp_mbuf == NULL) { + first = top; + total_pktcnt = pktcnt; + total_bytecnt = bytecnt; + } else { + ASSERT(last.cp_mbuf != NULL); + append_pkt(&last, &top); + total_pktcnt += pktcnt; + total_bytecnt += bytecnt; + } + last = tail; } - last = tail; } - if (first != NULL) { - if (first_packet != NULL) { - *first_packet = first; - } - if (last_packet != NULL) { - *last_packet = last; - } - if (retpktcnt != NULL) { - *retpktcnt = total_pktcnt; - } - if (retbytecnt != NULL) { - *retbytecnt = total_bytecnt; - } - } else { - if (first_packet != NULL) { - *first_packet = NULL; - } - if (last_packet != NULL) { - *last_packet = NULL; - } - if (retpktcnt != NULL) { - *retpktcnt = 0; - } - if (retbytecnt != NULL) { - *retbytecnt = 0; - } + + if (__probable(first_packet != NULL)) { + *first_packet = first; + } + if (last_packet != NULL) { + *last_packet = last; } + if (retpktcnt != NULL) { + *retpktcnt = total_pktcnt; + } + if (retbytecnt != NULL) { + *retbytecnt = total_bytecnt; + } + return 0; } @@ -621,7 +591,12 @@ fq_if_purge_flow(fq_if_t *fqs, fq_t *fq, u_int32_t *pktsp, fq_cl = &fqs->fqs_classq[fq->fq_sc_index]; pkts = bytes = 0; _PKTSCHED_PKT_INIT(&pkt); - while (fq_getq_flow(fqs, fq, &pkt) != NULL) { + for (;;) { + fq_getq_flow(fqs, fq, &pkt); + if (pkt.pktsched_pkt_mbuf == NULL) { + VERIFY(pkt.pktsched_ptype == QP_INVALID); + break; + } pkts++; bytes += pktsched_get_pkt_len(&pkt); pktsched_free_pkt(&pkt); @@ -1007,7 +982,7 @@ fq_if_drop_packet(fq_if_t *fqs) fq_t *fq = fqs->fqs_large_flow; fq_if_classq_t *fq_cl; pktsched_pkt_t pkt; - uint32_t *pkt_flags; + volatile uint32_t *pkt_flags; uint64_t *pkt_timestamp; if (fq == NULL) { @@ -1018,15 +993,22 @@ fq_if_drop_packet(fq_if_t *fqs) fq_cl = &fqs->fqs_classq[fq->fq_sc_index]; _PKTSCHED_PKT_INIT(&pkt); - (void)fq_getq_flow_internal(fqs, fq, &pkt); + fq_getq_flow_internal(fqs, fq, &pkt); + ASSERT(pkt.pktsched_ptype != QP_INVALID); pktsched_get_pkt_vars(&pkt, &pkt_flags, &pkt_timestamp, NULL, NULL, NULL, NULL); IFCQ_CONVERT_LOCK(fqs->fqs_ifq); *pkt_timestamp = 0; - if (pkt.pktsched_ptype == QP_MBUF) { + switch (pkt.pktsched_ptype) { + case QP_MBUF: *pkt_flags &= ~PKTF_PRIV_GUARDED; + break; + default: + VERIFY(0); + /* NOTREACHED */ + __builtin_unreachable(); } if (fq_empty(fq)) { @@ -1115,15 +1097,14 @@ fq_if_flow_feedback(fq_if_t *fqs, fq_t *fq, fq_if_classq_t *fq_cl) void fq_if_dequeue(fq_if_t *fqs, fq_if_classq_t *fq_cl, u_int32_t pktlimit, - u_int32_t bytelimit, void **top, void **tail, - u_int32_t *retpktcnt, u_int32_t *retbytecnt, boolean_t drvmgmt, - classq_pkt_type_t *ptype) + u_int32_t bytelimit, classq_pkt_t *top, classq_pkt_t *tail, + u_int32_t *retpktcnt, u_int32_t *retbytecnt, boolean_t drvmgmt) { fq_t *fq = NULL, *tfq = NULL; flowq_stailq_t temp_stailq; u_int32_t pktcnt, bytecnt; boolean_t qempty, limit_reached = FALSE; - void *last = NULL; + classq_pkt_t last = CLASSQ_PKT_INITIALIZER(last); fq_getq_flow_t fq_getq_flow_fn; switch (fqs->fqs_ptype) { @@ -1135,6 +1116,7 @@ fq_if_dequeue(fq_if_t *fqs, fq_if_classq_t *fq_cl, u_int32_t pktlimit, default: VERIFY(0); /* NOTREACHED */ + __builtin_unreachable(); } /* @@ -1146,9 +1128,6 @@ fq_if_dequeue(fq_if_t *fqs, fq_if_classq_t *fq_cl, u_int32_t pktlimit, } VERIFY(pktlimit > 0 && bytelimit > 0 && top != NULL); - - *top = NULL; - *ptype = fqs->fqs_ptype; pktcnt = bytecnt = 0; STAILQ_INIT(&temp_stailq); @@ -1201,8 +1180,8 @@ done: fq_cl->fcl_old_flows = temp_stailq; } - if (last != NULL) { - VERIFY(*top != NULL); + if (last.cp_mbuf != NULL) { + VERIFY(top->cp_mbuf != NULL); if (tail != NULL) { *tail = last; } diff --git a/bsd/net/pktsched/pktsched_netem.c b/bsd/net/pktsched/pktsched_netem.c new file mode 100644 index 000000000..5344b01a0 --- /dev/null +++ b/bsd/net/pktsched/pktsched_netem.c @@ -0,0 +1,1523 @@ +/* + * Copyright (c) 2018 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#include +#include +#include + +#include +#include +#include +#include + +enum { + NETEM_LOG_ERROR = 0, + NETEM_LOG_INFO = 1, + NETEM_LOG_DEBUG = 2, + NETEM_LOG_HIDEBUG = 3, +}; + +#define NETEM_HEAP_SIZE 1024 +#define NETEM_PSCALE IF_NETEM_PARAMS_PSCALE + +#define netem_log(_level, _fmt, ...) \ + do { \ + if (pktsched_verbose > _level) { \ + log(LOG_DEBUG, "NETEM: %-30s "_fmt "\n",\ + __FUNCTION__, ##__VA_ARGS__); \ + } \ + } while (0); + +extern kern_return_t thread_terminate(thread_t); + +static lck_attr_t *netem_lock_attr; +static lck_grp_t *netem_lock_group; +static lck_grp_attr_t *netem_lock_group_attr; +static int __netem_inited = 0; + +static const int32_t NORM_DIST_SCALE = 8192; +/* normal distribution lookup table */ +static int32_t norm_dist_table[] = +{ + -32768, -28307, -26871, -25967, -25298, -24765, -24320, -23937, + -23600, -23298, -23025, -22776, -22546, -22333, -22133, -21946, + -21770, -21604, -21445, -21295, -21151, -21013, -20882, -20755, + -20633, -20516, -20403, -20293, -20187, -20084, -19984, -19887, + -19793, -19702, -19612, -19526, -19441, -19358, -19277, -19198, + -19121, -19045, -18971, -18899, -18828, -18758, -18690, -18623, + -18557, -18492, -18429, -18366, -18305, -18245, -18185, -18127, + -18070, -18013, -17957, -17902, -17848, -17794, -17741, -17690, + -17638, -17588, -17538, -17489, -17440, -17392, -17345, -17298, + -17252, -17206, -17160, -17116, -17071, -17028, -16984, -16942, + -16899, -16857, -16816, -16775, -16735, -16694, -16654, -16615, + -16576, -16538, -16499, -16461, -16424, -16386, -16350, -16313, + -16277, -16241, -16205, -16170, -16135, -16100, -16066, -16031, + -15998, -15964, -15931, -15897, -15865, -15832, -15800, -15768, + -15736, -15704, -15673, -15642, -15611, -15580, -15550, -15519, + -15489, -15460, -15430, -15401, -15371, -15342, -15313, -15285, + -15256, -15228, -15200, -15172, -15144, -15116, -15089, -15062, + -15035, -15008, -14981, -14954, -14928, -14902, -14875, -14850, + -14823, -14798, -14772, -14747, -14722, -14696, -14671, -14647, + -14622, -14597, -14573, -14549, -14524, -14500, -14476, -14453, + -14429, -14405, -14382, -14359, -14335, -14312, -14289, -14266, + -14243, -14221, -14198, -14176, -14153, -14131, -14109, -14087, + -14065, -14043, -14021, -14000, -13978, -13957, -13935, -13914, + -13893, -13872, -13851, -13830, -13809, -13788, -13768, -13747, + -13727, -13706, -13686, -13666, -13646, -13626, -13606, -13586, + -13566, -13547, -13527, -13507, -13488, -13468, -13449, -13430, + -13411, -13392, -13373, -13354, -13335, -13316, -13297, -13278, + -13260, -13242, -13223, -13204, -13186, -13168, -13150, -13131, + -13113, -13095, -13077, -13060, -13042, -13024, -13006, -12988, + -12971, -12954, -12936, -12918, -12901, -12884, -12867, -12850, + -12832, -12815, -12798, -12781, -12764, -12748, -12731, -12714, + -12697, -12681, -12664, -12648, -12631, -12615, -12598, -12582, + -12566, -12549, -12533, -12517, -12501, -12485, -12469, -12453, + -12437, -12422, -12406, -12390, -12374, -12358, -12343, -12327, + -12312, -12296, -12281, -12265, -12250, -12235, -12220, -12204, + -12189, -12174, -12159, -12144, -12129, -12114, -12099, -12084, + -12069, -12054, -12039, -12025, -12010, -11995, -11981, -11966, + -11952, -11937, -11923, -11908, -11894, -11879, -11865, -11851, + -11837, -11822, -11808, -11794, -11780, -11766, -11752, -11737, + -11724, -11710, -11696, -11682, -11668, -11654, -11640, -11627, + -11613, -11599, -11586, -11572, -11559, -11545, -11531, -11518, + -11504, -11491, -11478, -11464, -11451, -11438, -11425, -11411, + -11398, -11385, -11372, -11359, -11346, -11332, -11319, -11306, + -11293, -11280, -11268, -11255, -11242, -11229, -11216, -11203, + -11191, -11178, -11165, -11153, -11140, -11127, -11114, -11102, + -11090, -11077, -11065, -11052, -11040, -11027, -11015, -11002, + -10990, -10978, -10965, -10953, -10941, -10929, -10917, -10904, + -10892, -10880, -10868, -10856, -10844, -10832, -10820, -10808, + -10796, -10784, -10772, -10760, -10748, -10736, -10725, -10713, + -10701, -10689, -10677, -10666, -10654, -10643, -10631, -10619, + -10607, -10596, -10584, -10573, -10562, -10550, -10539, -10527, + -10516, -10504, -10493, -10481, -10470, -10459, -10447, -10436, + -10425, -10414, -10402, -10391, -10380, -10369, -10358, -10346, + -10335, -10324, -10313, -10302, -10291, -10280, -10269, -10258, + -10247, -10236, -10225, -10214, -10203, -10192, -10181, -10171, + -10160, -10149, -10138, -10127, -10117, -10106, -10095, -10085, + -10074, -10063, -10052, -10042, -10031, -10021, -10010, -10000, + -9989, -9978, -9968, -9957, -9947, -9936, -9926, -9916, + -9905, -9895, -9884, -9874, -9864, -9853, -9843, -9833, + -9822, -9812, -9802, -9791, -9781, -9771, -9761, -9751, + -9741, -9730, -9720, -9710, -9700, -9690, -9680, -9670, + -9660, -9650, -9640, -9630, -9619, -9610, -9600, -9590, + -9580, -9570, -9560, -9550, -9540, -9530, -9520, -9511, + -9501, -9491, -9481, -9472, -9462, -9452, -9442, -9432, + -9423, -9413, -9403, -9394, -9384, -9374, -9365, -9355, + -9345, -9336, -9326, -9317, -9307, -9298, -9288, -9278, + -9269, -9259, -9250, -9241, -9231, -9221, -9212, -9202, + -9193, -9184, -9175, -9165, -9156, -9146, -9137, -9128, + -9119, -9109, -9100, -9090, -9081, -9072, -9063, -9053, + -9044, -9035, -9026, -9017, -9008, -8998, -8989, -8980, + -8971, -8962, -8953, -8944, -8934, -8925, -8916, -8907, + -8898, -8889, -8880, -8871, -8862, -8853, -8844, -8835, + -8826, -8817, -8808, -8799, -8790, -8781, -8772, -8764, + -8755, -8746, -8737, -8728, -8719, -8711, -8702, -8693, + -8684, -8675, -8667, -8658, -8649, -8640, -8632, -8623, + -8614, -8605, -8597, -8588, -8579, -8570, -8562, -8553, + -8545, -8536, -8527, -8519, -8510, -8502, -8493, -8484, + -8476, -8467, -8459, -8450, -8442, -8433, -8425, -8416, + -8408, -8399, -8391, -8382, -8374, -8365, -8357, -8348, + -8340, -8332, -8323, -8315, -8306, -8298, -8290, -8281, + -8273, -8264, -8256, -8248, -8240, -8231, -8223, -8215, + -8206, -8198, -8190, -8182, -8174, -8165, -8157, -8149, + -8140, -8132, -8124, -8116, -8108, -8099, -8091, -8083, + -8075, -8067, -8059, -8051, -8042, -8034, -8027, -8018, + -8010, -8002, -7994, -7986, -7978, -7970, -7962, -7954, + -7946, -7938, -7930, -7922, -7913, -7906, -7897, -7890, + -7882, -7874, -7866, -7858, -7850, -7842, -7834, -7826, + -7818, -7810, -7802, -7795, -7787, -7779, -7771, -7763, + -7755, -7748, -7739, -7732, -7724, -7716, -7708, -7700, + -7693, -7685, -7677, -7669, -7662, -7654, -7646, -7638, + -7630, -7623, -7615, -7608, -7600, -7592, -7584, -7577, + -7569, -7561, -7553, -7546, -7538, -7530, -7523, -7515, + -7508, -7500, -7492, -7485, -7477, -7469, -7462, -7454, + -7447, -7439, -7432, -7424, -7417, -7409, -7401, -7394, + -7386, -7379, -7372, -7364, -7356, -7349, -7341, -7334, + -7327, -7319, -7311, -7304, -7297, -7289, -7281, -7274, + -7267, -7259, -7252, -7245, -7237, -7230, -7222, -7215, + -7208, -7200, -7193, -7186, -7178, -7171, -7163, -7156, + -7149, -7141, -7134, -7127, -7119, -7112, -7105, -7098, + -7090, -7083, -7075, -7068, -7061, -7054, -7046, -7039, + -7032, -7025, -7018, -7010, -7003, -6996, -6989, -6981, + -6974, -6967, -6960, -6953, -6946, -6938, -6931, -6924, + -6917, -6910, -6903, -6895, -6888, -6881, -6874, -6867, + -6860, -6853, -6845, -6838, -6831, -6824, -6817, -6810, + -6803, -6796, -6789, -6782, -6775, -6767, -6760, -6753, + -6747, -6740, -6732, -6725, -6718, -6711, -6704, -6697, + -6690, -6683, -6676, -6669, -6662, -6655, -6648, -6641, + -6634, -6627, -6620, -6613, -6607, -6600, -6593, -6586, + -6579, -6572, -6565, -6558, -6551, -6544, -6538, -6531, + -6524, -6517, -6510, -6503, -6496, -6489, -6482, -6476, + -6469, -6462, -6455, -6448, -6441, -6434, -6428, -6421, + -6414, -6407, -6400, -6394, -6387, -6380, -6373, -6366, + -6360, -6353, -6346, -6339, -6333, -6326, -6319, -6312, + -6306, -6299, -6292, -6286, -6279, -6272, -6265, -6259, + -6252, -6245, -6239, -6232, -6225, -6219, -6212, -6205, + -6198, -6192, -6185, -6178, -6172, -6165, -6158, -6152, + -6145, -6139, -6132, -6125, -6119, -6112, -6105, -6099, + -6092, -6085, -6079, -6072, -6066, -6059, -6053, -6046, + -6040, -6033, -6026, -6019, -6013, -6006, -6000, -5993, + -5987, -5980, -5974, -5967, -5961, -5954, -5948, -5941, + -5935, -5928, -5922, -5915, -5908, -5902, -5895, -5889, + -5883, -5876, -5870, -5863, -5857, -5850, -5844, -5837, + -5831, -5825, -5818, -5811, -5805, -5799, -5792, -5786, + -5779, -5773, -5766, -5760, -5754, -5747, -5741, -5734, + -5728, -5722, -5715, -5709, -5702, -5696, -5690, -5683, + -5677, -5671, -5664, -5658, -5651, -5645, -5639, -5632, + -5626, -5620, -5613, -5607, -5600, -5594, -5588, -5582, + -5575, -5569, -5563, -5556, -5550, -5544, -5537, -5531, + -5525, -5519, -5512, -5506, -5500, -5494, -5487, -5481, + -5475, -5468, -5462, -5456, -5450, -5443, -5437, -5431, + -5425, -5418, -5412, -5406, -5400, -5393, -5387, -5381, + -5375, -5369, -5362, -5356, -5350, -5344, -5337, -5331, + -5325, -5319, -5313, -5306, -5300, -5294, -5288, -5282, + -5276, -5270, -5263, -5257, -5251, -5245, -5239, -5233, + -5226, -5220, -5214, -5208, -5202, -5196, -5190, -5183, + -5177, -5171, -5165, -5159, -5153, -5147, -5140, -5135, + -5129, -5122, -5116, -5110, -5104, -5098, -5092, -5086, + -5080, -5074, -5068, -5061, -5055, -5050, -5043, -5037, + -5031, -5025, -5019, -5013, -5007, -5001, -4995, -4989, + -4983, -4977, -4971, -4965, -4959, -4953, -4947, -4941, + -4935, -4929, -4923, -4917, -4911, -4905, -4899, -4893, + -4887, -4881, -4875, -4869, -4863, -4857, -4851, -4845, + -4839, -4833, -4827, -4821, -4815, -4809, -4803, -4797, + -4791, -4785, -4779, -4773, -4767, -4762, -4755, -4750, + -4744, -4738, -4732, -4726, -4720, -4714, -4708, -4702, + -4696, -4690, -4685, -4678, -4673, -4667, -4661, -4655, + -4649, -4643, -4637, -4631, -4626, -4620, -4614, -4608, + -4602, -4596, -4590, -4585, -4579, -4573, -4567, -4561, + -4555, -4549, -4544, -4538, -4532, -4526, -4520, -4514, + -4508, -4503, -4497, -4491, -4485, -4479, -4474, -4468, + -4462, -4456, -4450, -4445, -4439, -4433, -4427, -4421, + -4415, -4410, -4404, -4398, -4392, -4386, -4381, -4375, + -4369, -4363, -4358, -4352, -4346, -4340, -4334, -4329, + -4323, -4317, -4311, -4306, -4300, -4294, -4289, -4283, + -4277, -4271, -4266, -4260, -4254, -4248, -4243, -4237, + -4231, -4225, -4220, -4214, -4208, -4202, -4197, -4191, + -4185, -4180, -4174, -4168, -4162, -4157, -4151, -4146, + -4140, -4134, -4128, -4123, -4117, -4111, -4105, -4100, + -4094, -4089, -4083, -4077, -4071, -4066, -4060, -4055, + -4049, -4043, -4037, -4032, -4026, -4021, -4015, -4009, + -4003, -3998, -3992, -3987, -3981, -3975, -3970, -3964, + -3958, -3953, -3947, -3942, -3936, -3930, -3925, -3919, + -3913, -3908, -3902, -3897, -3891, -3885, -3880, -3874, + -3869, -3863, -3857, -3852, -3846, -3840, -3835, -3829, + -3824, -3818, -3813, -3807, -3801, -3796, -3790, -3785, + -3779, -3774, -3768, -3762, -3757, -3751, -3746, -3740, + -3734, -3729, -3723, -3718, -3712, -3707, -3701, -3696, + -3690, -3684, -3679, -3673, -3668, -3662, -3657, -3651, + -3646, -3640, -3635, -3629, -3624, -3618, -3613, -3607, + -3602, -3596, -3591, -3585, -3579, -3574, -3568, -3563, + -3557, -3552, -3546, -3541, -3535, -3530, -3524, -3519, + -3514, -3508, -3502, -3497, -3491, -3486, -3480, -3475, + -3469, -3464, -3459, -3453, -3448, -3442, -3437, -3431, + -3425, -3420, -3415, -3409, -3404, -3398, -3393, -3387, + -3382, -3376, -3371, -3366, -3360, -3355, -3349, -3344, + -3338, -3333, -3328, -3322, -3317, -3311, -3305, -3300, + -3295, -3289, -3284, -3278, -3273, -3268, -3262, -3257, + -3251, -3246, -3240, -3235, -3230, -3224, -3219, -3213, + -3208, -3203, -3197, -3192, -3186, -3181, -3176, -3170, + -3165, -3159, -3154, -3149, -3143, -3138, -3132, -3127, + -3122, -3116, -3111, -3105, -3100, -3095, -3089, -3084, + -3079, -3073, -3068, -3062, -3057, -3052, -3046, -3041, + -3036, -3030, -3025, -3019, -3014, -3009, -3003, -2998, + -2993, -2987, -2982, -2977, -2971, -2966, -2961, -2955, + -2950, -2944, -2939, -2934, -2928, -2923, -2918, -2912, + -2907, -2902, -2896, -2891, -2886, -2880, -2875, -2870, + -2864, -2859, -2854, -2848, -2843, -2838, -2832, -2827, + -2822, -2816, -2811, -2806, -2800, -2795, -2790, -2784, + -2779, -2774, -2768, -2763, -2758, -2753, -2747, -2742, + -2737, -2732, -2726, -2721, -2716, -2710, -2705, -2700, + -2694, -2689, -2684, -2678, -2673, -2668, -2663, -2657, + -2652, -2647, -2642, -2636, -2631, -2626, -2620, -2615, + -2610, -2605, -2599, -2594, -2589, -2583, -2578, -2573, + -2568, -2562, -2557, -2552, -2546, -2542, -2536, -2531, + -2526, -2520, -2515, -2510, -2505, -2499, -2494, -2489, + -2483, -2478, -2473, -2468, -2463, -2457, -2452, -2447, + -2442, -2436, -2431, -2426, -2421, -2415, -2410, -2405, + -2400, -2395, -2389, -2384, -2379, -2374, -2368, -2363, + -2358, -2353, -2347, -2342, -2337, -2332, -2327, -2321, + -2316, -2311, -2306, -2300, -2295, -2290, -2285, -2279, + -2275, -2269, -2264, -2259, -2254, -2248, -2243, -2238, + -2233, -2227, -2222, -2217, -2212, -2207, -2202, -2196, + -2191, -2186, -2181, -2175, -2170, -2165, -2160, -2155, + -2150, -2144, -2139, -2134, -2129, -2124, -2118, -2113, + -2108, -2103, -2098, -2093, -2087, -2082, -2077, -2072, + -2067, -2062, -2056, -2051, -2046, -2041, -2036, -2030, + -2025, -2020, -2015, -2010, -2005, -2000, -1994, -1989, + -1984, -1979, -1974, -1969, -1963, -1958, -1953, -1948, + -1943, -1937, -1932, -1927, -1922, -1917, -1912, -1907, + -1901, -1896, -1891, -1886, -1881, -1876, -1871, -1865, + -1860, -1855, -1850, -1845, -1840, -1835, -1829, -1824, + -1819, -1814, -1809, -1804, -1799, -1794, -1788, -1783, + -1778, -1773, -1768, -1763, -1758, -1752, -1747, -1742, + -1737, -1732, -1727, -1722, -1717, -1711, -1706, -1701, + -1696, -1691, -1686, -1681, -1676, -1670, -1665, -1660, + -1655, -1650, -1645, -1640, -1635, -1629, -1624, -1619, + -1614, -1609, -1604, -1599, -1594, -1589, -1584, -1579, + -1573, -1568, -1563, -1558, -1553, -1548, -1543, -1538, + -1532, -1527, -1522, -1517, -1512, -1507, -1502, -1497, + -1492, -1486, -1482, -1477, -1471, -1466, -1461, -1456, + -1451, -1446, -1441, -1436, -1431, -1425, -1420, -1415, + -1410, -1405, -1400, -1395, -1390, -1385, -1380, -1375, + -1370, -1364, -1359, -1354, -1349, -1344, -1339, -1334, + -1329, -1324, -1319, -1314, -1309, -1303, -1298, -1294, + -1288, -1283, -1278, -1273, -1268, -1263, -1258, -1253, + -1248, -1243, -1237, -1232, -1228, -1222, -1217, -1212, + -1207, -1202, -1197, -1192, -1187, -1182, -1177, -1171, + -1167, -1162, -1156, -1151, -1146, -1141, -1136, -1131, + -1126, -1121, -1116, -1111, -1106, -1101, -1096, -1091, + -1085, -1081, -1076, -1070, -1065, -1060, -1055, -1050, + -1045, -1040, -1035, -1030, -1025, -1020, -1015, -1010, + -1005, -1000, -995, -990, -985, -979, -974, -970, + -964, -959, -954, -949, -944, -939, -934, -929, + -924, -919, -914, -909, -904, -899, -894, -889, + -884, -879, -874, -868, -863, -859, -853, -848, + -843, -838, -833, -828, -823, -818, -813, -808, + -803, -798, -793, -788, -783, -778, -773, -768, + -763, -758, -752, -748, -743, -738, -732, -727, + -723, -717, -712, -707, -702, -697, -692, -687, + -682, -677, -672, -667, -662, -657, -652, -647, + -642, -637, -632, -627, -622, -617, -612, -607, + -602, -597, -591, -587, -582, -577, -571, -566, + -562, -557, -551, -546, -541, -537, -531, -526, + -521, -516, -511, -506, -501, -496, -491, -486, + -481, -476, -471, -466, -461, -456, -451, -446, + -441, -436, -431, -426, -421, -416, -411, -406, + -401, -396, -391, -386, -381, -376, -371, -366, + -360, -356, -351, -346, -340, -335, -331, -326, + -320, -315, -310, -306, -300, -295, -290, -285, + -281, -275, -270, -265, -261, -255, -250, -245, + -240, -235, -230, -225, -220, -215, -210, -205, + -200, -195, -190, -185, -180, -175, -170, -165, + -160, -155, -150, -145, -140, -135, -130, -125, + -120, -115, -110, -105, -100, -95, -90, -85, + -80, -75, -70, -65, -60, -55, -50, -45, + -40, -35, -29, -25, -20, -15, -9, -5, + 0, 5, 11, 16, 20, 25, 30, 36, + 41, 45, 50, 56, 61, 66, 70, 76, + 81, 86, 91, 96, 101, 106, 111, 116, + 121, 126, 131, 136, 141, 146, 151, 156, + 161, 166, 171, 176, 181, 186, 191, 196, + 201, 206, 211, 216, 221, 226, 231, 236, + 241, 246, 251, 256, 261, 266, 271, 276, + 281, 286, 291, 296, 301, 306, 311, 316, + 322, 326, 331, 336, 342, 347, 351, 356, + 362, 367, 372, 376, 382, 387, 392, 396, + 402, 407, 412, 417, 422, 427, 432, 437, + 442, 447, 452, 457, 462, 467, 472, 477, + 482, 487, 492, 497, 502, 507, 512, 517, + 522, 527, 532, 537, 542, 547, 552, 557, + 562, 567, 572, 578, 582, 587, 593, 598, + 603, 607, 613, 618, 623, 628, 633, 638, + 643, 648, 653, 658, 663, 668, 673, 678, + 683, 688, 693, 698, 703, 708, 713, 718, + 723, 728, 733, 739, 743, 748, 754, 759, + 763, 768, 774, 779, 784, 789, 794, 799, + 804, 809, 814, 819, 824, 829, 834, 839, + 844, 849, 854, 859, 864, 869, 874, 879, + 884, 890, 895, 899, 905, 910, 915, 920, + 925, 930, 935, 940, 945, 950, 955, 960, + 965, 970, 975, 980, 985, 990, 995, 1001, + 1006, 1010, 1016, 1021, 1026, 1031, 1036, 1041, + 1046, 1051, 1056, 1061, 1066, 1071, 1076, 1081, + 1086, 1092, 1096, 1102, 1107, 1112, 1117, 1122, + 1127, 1132, 1137, 1142, 1147, 1152, 1157, 1162, + 1167, 1173, 1178, 1183, 1188, 1193, 1198, 1203, + 1208, 1213, 1218, 1223, 1228, 1233, 1238, 1244, + 1248, 1254, 1259, 1264, 1269, 1274, 1279, 1284, + 1289, 1294, 1299, 1304, 1309, 1314, 1320, 1325, + 1330, 1335, 1340, 1345, 1350, 1355, 1360, 1365, + 1371, 1375, 1381, 1386, 1391, 1396, 1401, 1406, + 1411, 1416, 1421, 1426, 1432, 1436, 1442, 1447, + 1452, 1457, 1462, 1467, 1472, 1477, 1482, 1488, + 1493, 1497, 1503, 1508, 1513, 1518, 1523, 1528, + 1534, 1538, 1543, 1549, 1554, 1559, 1564, 1569, + 1574, 1579, 1584, 1590, 1595, 1600, 1605, 1610, + 1615, 1620, 1625, 1630, 1636, 1640, 1646, 1651, + 1656, 1661, 1666, 1671, 1676, 1681, 1687, 1692, + 1697, 1702, 1707, 1712, 1717, 1722, 1728, 1733, + 1738, 1743, 1748, 1753, 1758, 1764, 1769, 1774, + 1779, 1784, 1789, 1794, 1799, 1805, 1810, 1815, + 1820, 1825, 1831, 1835, 1841, 1846, 1851, 1856, + 1861, 1866, 1871, 1877, 1882, 1887, 1892, 1897, + 1902, 1908, 1913, 1918, 1923, 1928, 1933, 1939, + 1944, 1949, 1954, 1959, 1964, 1969, 1975, 1980, + 1985, 1990, 1995, 2000, 2005, 2011, 2016, 2021, + 2026, 2031, 2037, 2042, 2047, 2052, 2057, 2062, + 2068, 2073, 2078, 2083, 2088, 2093, 2099, 2104, + 2109, 2114, 2119, 2125, 2130, 2135, 2140, 2145, + 2150, 2156, 2161, 2166, 2171, 2177, 2182, 2187, + 2192, 2197, 2202, 2208, 2213, 2218, 2223, 2229, + 2234, 2239, 2244, 2249, 2254, 2260, 2265, 2270, + 2275, 2281, 2286, 2291, 2296, 2302, 2306, 2312, + 2317, 2322, 2327, 2333, 2338, 2343, 2348, 2354, + 2359, 2364, 2369, 2374, 2380, 2385, 2390, 2395, + 2401, 2406, 2411, 2416, 2422, 2427, 2432, 2437, + 2442, 2448, 2453, 2458, 2463, 2469, 2474, 2479, + 2485, 2490, 2495, 2500, 2506, 2511, 2516, 2521, + 2526, 2532, 2537, 2542, 2548, 2553, 2558, 2563, + 2569, 2574, 2579, 2585, 2589, 2595, 2600, 2605, + 2611, 2616, 2621, 2627, 2632, 2637, 2642, 2648, + 2653, 2658, 2664, 2669, 2674, 2680, 2685, 2690, + 2695, 2700, 2706, 2711, 2716, 2722, 2727, 2732, + 2738, 2743, 2748, 2754, 2759, 2764, 2769, 2775, + 2780, 2785, 2791, 2796, 2801, 2807, 2812, 2817, + 2823, 2828, 2833, 2839, 2844, 2849, 2855, 2860, + 2865, 2870, 2876, 2881, 2886, 2892, 2897, 2902, + 2908, 2913, 2918, 2924, 2929, 2935, 2940, 2945, + 2951, 2956, 2961, 2967, 2972, 2977, 2983, 2988, + 2993, 2999, 3004, 3010, 3015, 3020, 3026, 3031, + 3036, 3042, 3047, 3052, 3058, 3063, 3069, 3074, + 3079, 3085, 3090, 3095, 3101, 3106, 3112, 3117, + 3122, 3128, 3133, 3139, 3144, 3149, 3155, 3160, + 3166, 3171, 3176, 3182, 3187, 3193, 3198, 3203, + 3209, 3214, 3220, 3225, 3231, 3236, 3242, 3247, + 3252, 3258, 3263, 3269, 3274, 3279, 3285, 3290, + 3296, 3301, 3307, 3312, 3317, 3323, 3328, 3334, + 3339, 3345, 3350, 3355, 3361, 3367, 3372, 3378, + 3383, 3388, 3394, 3399, 3405, 3410, 3416, 3421, + 3427, 3432, 3437, 3443, 3448, 3454, 3459, 3465, + 3471, 3476, 3481, 3487, 3492, 3498, 3503, 3509, + 3514, 3520, 3525, 3531, 3536, 3542, 3548, 3553, + 3558, 3564, 3569, 3575, 3580, 3586, 3591, 3597, + 3602, 3608, 3613, 3619, 3625, 3630, 3636, 3641, + 3647, 3652, 3658, 3663, 3669, 3675, 3680, 3686, + 3691, 3697, 3702, 3708, 3713, 3719, 3724, 3730, + 3736, 3741, 3747, 3752, 3758, 3763, 3769, 3774, + 3780, 3786, 3791, 3797, 3802, 3808, 3813, 3819, + 3825, 3830, 3836, 3842, 3847, 3853, 3858, 3864, + 3869, 3875, 3881, 3886, 3892, 3898, 3903, 3909, + 3915, 3920, 3926, 3931, 3937, 3942, 3948, 3954, + 3960, 3965, 3971, 3976, 3982, 3987, 3993, 3999, + 4005, 4010, 4016, 4021, 4027, 4033, 4039, 4044, + 4050, 4055, 4061, 4067, 4073, 4078, 4084, 4089, + 4095, 4101, 4107, 4112, 4118, 4123, 4129, 4135, + 4141, 4146, 4152, 4158, 4164, 4169, 4175, 4181, + 4187, 4192, 4198, 4203, 4209, 4215, 4221, 4226, + 4232, 4238, 4243, 4249, 4255, 4261, 4266, 4272, + 4278, 4284, 4289, 4295, 4301, 4307, 4313, 4318, + 4324, 4330, 4336, 4341, 4347, 4353, 4359, 4364, + 4370, 4376, 4382, 4388, 4393, 4399, 4405, 4411, + 4417, 4422, 4428, 4434, 4440, 4445, 4452, 4457, + 4463, 4469, 4474, 4481, 4486, 4492, 4498, 4504, + 4510, 4515, 4521, 4527, 4533, 4539, 4545, 4551, + 4556, 4562, 4568, 4574, 4580, 4585, 4592, 4597, + 4603, 4609, 4615, 4621, 4627, 4633, 4638, 4644, + 4650, 4656, 4662, 4668, 4674, 4680, 4686, 4692, + 4697, 4703, 4709, 4715, 4721, 4727, 4733, 4739, + 4745, 4751, 4757, 4762, 4769, 4774, 4780, 4786, + 4792, 4798, 4804, 4810, 4816, 4822, 4828, 4834, + 4840, 4846, 4852, 4858, 4864, 4870, 4876, 4882, + 4888, 4894, 4900, 4906, 4912, 4918, 4924, 4930, + 4936, 4942, 4948, 4954, 4960, 4966, 4972, 4978, + 4984, 4990, 4996, 5002, 5008, 5014, 5020, 5026, + 5032, 5038, 5045, 5050, 5057, 5063, 5069, 5075, + 5081, 5087, 5093, 5099, 5105, 5111, 5118, 5123, + 5129, 5136, 5142, 5148, 5154, 5160, 5166, 5172, + 5179, 5185, 5191, 5197, 5203, 5209, 5215, 5221, + 5227, 5233, 5240, 5246, 5252, 5258, 5265, 5271, + 5277, 5283, 5289, 5295, 5301, 5308, 5314, 5320, + 5326, 5333, 5339, 5345, 5351, 5357, 5363, 5369, + 5376, 5382, 5388, 5394, 5401, 5407, 5413, 5419, + 5426, 5432, 5438, 5444, 5451, 5457, 5463, 5469, + 5476, 5482, 5488, 5494, 5501, 5507, 5513, 5520, + 5526, 5532, 5539, 5545, 5551, 5557, 5564, 5570, + 5576, 5583, 5589, 5596, 5602, 5608, 5614, 5621, + 5627, 5634, 5640, 5646, 5652, 5659, 5665, 5672, + 5678, 5684, 5691, 5697, 5704, 5710, 5716, 5723, + 5729, 5736, 5742, 5748, 5755, 5761, 5768, 5774, + 5780, 5787, 5793, 5800, 5806, 5813, 5819, 5826, + 5832, 5838, 5845, 5852, 5858, 5864, 5871, 5877, + 5884, 5890, 5897, 5903, 5910, 5916, 5923, 5929, + 5936, 5942, 5949, 5956, 5962, 5968, 5975, 5981, + 5988, 5994, 6001, 6008, 6014, 6021, 6027, 6034, + 6041, 6047, 6054, 6060, 6067, 6074, 6080, 6087, + 6093, 6100, 6107, 6113, 6120, 6126, 6133, 6140, + 6146, 6153, 6160, 6167, 6173, 6180, 6186, 6193, + 6200, 6206, 6213, 6220, 6226, 6233, 6240, 6246, + 6253, 6260, 6266, 6273, 6280, 6287, 6294, 6300, + 6307, 6314, 6321, 6327, 6334, 6341, 6348, 6354, + 6361, 6368, 6375, 6382, 6388, 6395, 6402, 6409, + 6416, 6422, 6429, 6436, 6443, 6450, 6457, 6463, + 6470, 6477, 6484, 6491, 6497, 6504, 6511, 6518, + 6525, 6532, 6539, 6546, 6553, 6559, 6566, 6573, + 6580, 6587, 6594, 6601, 6608, 6615, 6622, 6629, + 6636, 6643, 6650, 6657, 6664, 6671, 6678, 6685, + 6692, 6699, 6706, 6713, 6719, 6727, 6734, 6741, + 6748, 6755, 6762, 6769, 6776, 6783, 6790, 6797, + 6804, 6811, 6818, 6826, 6833, 6840, 6847, 6854, + 6861, 6868, 6875, 6883, 6889, 6897, 6904, 6911, + 6918, 6925, 6932, 6939, 6947, 6954, 6961, 6969, + 6975, 6983, 6990, 6997, 7005, 7012, 7019, 7026, + 7033, 7041, 7048, 7055, 7062, 7070, 7077, 7084, + 7091, 7099, 7106, 7114, 7121, 7128, 7135, 7143, + 7150, 7157, 7165, 7172, 7179, 7187, 7194, 7202, + 7209, 7216, 7224, 7231, 7238, 7246, 7253, 7261, + 7268, 7276, 7283, 7290, 7298, 7306, 7313, 7320, + 7328, 7336, 7343, 7350, 7358, 7365, 7373, 7381, + 7388, 7395, 7403, 7410, 7418, 7426, 7433, 7441, + 7448, 7456, 7463, 7471, 7479, 7486, 7494, 7501, + 7509, 7517, 7524, 7532, 7540, 7547, 7555, 7563, + 7571, 7578, 7586, 7594, 7601, 7609, 7617, 7624, + 7632, 7640, 7648, 7655, 7663, 7671, 7679, 7687, + 7694, 7702, 7710, 7718, 7725, 7733, 7741, 7749, + 7757, 7765, 7773, 7780, 7788, 7796, 7804, 7812, + 7820, 7828, 7836, 7843, 7852, 7859, 7868, 7875, + 7883, 7891, 7899, 7907, 7915, 7923, 7931, 7939, + 7947, 7955, 7963, 7971, 7979, 7988, 7995, 8004, + 8012, 8020, 8028, 8036, 8044, 8052, 8061, 8069, + 8076, 8085, 8093, 8101, 8109, 8117, 8126, 8134, + 8142, 8150, 8158, 8167, 8175, 8183, 8192, 8200, + 8208, 8217, 8225, 8233, 8241, 8250, 8258, 8266, + 8275, 8283, 8292, 8300, 8308, 8317, 8325, 8333, + 8342, 8350, 8359, 8367, 8376, 8384, 8392, 8401, + 8409, 8418, 8426, 8435, 8443, 8452, 8461, 8469, + 8477, 8486, 8495, 8503, 8512, 8520, 8529, 8538, + 8546, 8555, 8564, 8573, 8581, 8590, 8598, 8607, + 8616, 8625, 8633, 8642, 8651, 8659, 8668, 8677, + 8686, 8695, 8704, 8712, 8721, 8730, 8739, 8748, + 8756, 8765, 8774, 8783, 8792, 8801, 8810, 8819, + 8828, 8837, 8846, 8855, 8864, 8873, 8882, 8891, + 8900, 8909, 8918, 8927, 8936, 8945, 8954, 8964, + 8973, 8982, 8991, 9000, 9009, 9019, 9028, 9037, + 9046, 9055, 9064, 9074, 9083, 9092, 9102, 9111, + 9120, 9130, 9139, 9148, 9157, 9167, 9176, 9186, + 9195, 9205, 9214, 9223, 9233, 9242, 9252, 9261, + 9271, 9280, 9290, 9300, 9309, 9318, 9328, 9338, + 9347, 9357, 9367, 9376, 9386, 9395, 9405, 9415, + 9424, 9434, 9444, 9454, 9464, 9473, 9483, 9493, + 9503, 9513, 9522, 9532, 9542, 9552, 9562, 9572, + 9582, 9592, 9602, 9612, 9622, 9632, 9642, 9652, + 9662, 9672, 9682, 9692, 9702, 9712, 9722, 9733, + 9743, 9753, 9763, 9773, 9783, 9794, 9804, 9814, + 9825, 9835, 9845, 9855, 9866, 9876, 9887, 9897, + 9907, 9918, 9928, 9939, 9949, 9960, 9970, 9981, + 9991, 10002, 10012, 10023, 10034, 10044, 10055, 10066, + 10076, 10087, 10097, 10108, 10119, 10130, 10140, 10152, + 10162, 10173, 10184, 10195, 10206, 10217, 10227, 10238, + 10249, 10260, 10271, 10282, 10293, 10304, 10315, 10326, + 10337, 10349, 10360, 10371, 10382, 10394, 10405, 10416, + 10427, 10438, 10450, 10461, 10472, 10484, 10495, 10507, + 10518, 10530, 10541, 10553, 10564, 10575, 10587, 10598, + 10610, 10622, 10633, 10645, 10657, 10668, 10680, 10692, + 10704, 10715, 10727, 10739, 10751, 10763, 10775, 10786, + 10798, 10811, 10822, 10834, 10847, 10858, 10870, 10883, + 10895, 10907, 10919, 10931, 10944, 10956, 10968, 10981, + 10993, 11005, 11017, 11030, 11042, 11055, 11067, 11080, + 11092, 11105, 11117, 11130, 11142, 11155, 11168, 11180, + 11193, 11206, 11219, 11232, 11245, 11257, 11270, 11283, + 11296, 11309, 11322, 11335, 11348, 11361, 11375, 11388, + 11401, 11414, 11427, 11441, 11454, 11467, 11481, 11494, + 11508, 11521, 11534, 11548, 11561, 11575, 11589, 11602, + 11616, 11630, 11644, 11657, 11671, 11685, 11699, 11713, + 11727, 11741, 11755, 11769, 11783, 11797, 11811, 11826, + 11839, 11854, 11868, 11882, 11897, 11911, 11926, 11940, + 11955, 11969, 11984, 11998, 12013, 12028, 12043, 12057, + 12072, 12087, 12102, 12117, 12132, 12147, 12162, 12177, + 12193, 12208, 12223, 12238, 12254, 12269, 12284, 12299, + 12315, 12331, 12346, 12362, 12378, 12393, 12409, 12425, + 12441, 12457, 12473, 12489, 12505, 12521, 12537, 12553, + 12569, 12586, 12602, 12619, 12635, 12651, 12668, 12684, + 12701, 12718, 12734, 12751, 12768, 12785, 12802, 12819, + 12836, 12853, 12870, 12888, 12905, 12922, 12940, 12957, + 12975, 12993, 13010, 13028, 13046, 13064, 13081, 13099, + 13117, 13135, 13154, 13172, 13190, 13209, 13227, 13246, + 13264, 13283, 13301, 13320, 13339, 13358, 13377, 13396, + 13415, 13434, 13454, 13473, 13492, 13512, 13532, 13551, + 13571, 13591, 13611, 13631, 13651, 13671, 13691, 13711, + 13732, 13752, 13773, 13793, 13814, 13835, 13856, 13877, + 13898, 13919, 13940, 13962, 13983, 14005, 14026, 14048, + 14070, 14092, 14114, 14136, 14159, 14181, 14203, 14226, + 14249, 14272, 14294, 14318, 14341, 14364, 14387, 14411, + 14434, 14458, 14482, 14506, 14530, 14554, 14578, 14603, + 14628, 14653, 14677, 14703, 14728, 14753, 14778, 14804, + 14830, 14855, 14882, 14908, 14934, 14961, 14987, 15014, + 15041, 15068, 15095, 15123, 15151, 15179, 15206, 15235, + 15263, 15291, 15320, 15349, 15378, 15408, 15437, 15466, + 15496, 15527, 15557, 15587, 15618, 15649, 15680, 15712, + 15743, 15775, 15808, 15840, 15872, 15906, 15939, 15972, + 16006, 16040, 16074, 16108, 16143, 16178, 16214, 16249, + 16285, 16322, 16358, 16395, 16433, 16470, 16508, 16547, + 16586, 16624, 16664, 16704, 16744, 16785, 16826, 16867, + 16910, 16952, 16995, 17038, 17082, 17126, 17171, 17217, + 17263, 17309, 17356, 17403, 17452, 17501, 17550, 17600, + 17651, 17702, 17754, 17807, 17861, 17915, 17970, 18026, + 18083, 18141, 18200, 18259, 18320, 18382, 18444, 18508, + 18573, 18639, 18706, 18775, 18845, 18917, 18989, 19064, + 19140, 19217, 19297, 19378, 19461, 19547, 19634, 19724, + 19816, 19911, 20009, 20109, 20213, 20319, 20430, 20544, + 20663, 20786, 20914, 21047, 21186, 21331, 21484, 21644, + 21813, 21991, 22181, 22384, 22601, 22836, 23091, 23370, + 23679, 24027, 24424, 24888, 25450, 26164, 27159, 28858, +}; +#define NORM_DIST_TABLE_SIZE \ + (sizeof (norm_dist_table) / sizeof (norm_dist_table[0])) + +struct heap_elem { + uint64_t key; + pktsched_pkt_t pkt; +}; + +struct heap { + uint32_t limit; /* max size */ + uint32_t size; /* current size */ + struct heap_elem p[0]; +}; + +static struct heap *heap_create(uint32_t size); +static int heap_insert(struct heap *h, uint64_t k, pktsched_pkt_t *p); +static int heap_peek(struct heap *h, uint64_t *k, pktsched_pkt_t *p); +static int heap_extract(struct heap *h, uint64_t *k, pktsched_pkt_t *p); + +struct netem { + decl_lck_mtx_data(, netem_lock); + + /* Init Time Constants */ + char netem_name[MAXTHREADNAMESIZE]; + uint32_t netem_flags; + struct thread *netem_output_thread; + + void *netem_output_handle; + int (*netem_output)(void *handle, pktsched_pkt_t *pkts, + uint32_t n_pkts); + uint32_t netem_output_max_batch_size; + + struct heap *netem_heap; + + /* Parameters variables */ + /* bandwidth token bucket limit */ +#define TOKEN_INVALID UINT64_MAX + struct token_bucket { + uint64_t depth; + uint64_t token; + uint64_t last; + uint64_t rate; + } netem_bandwidth_model; + + /* XXX (need correlated) naive corruption model */ + struct corruption { + uint32_t corruption_p; + } netem_corruption_model; + + /* naive duplication model */ + struct duplication { + uint32_t duplication_p; + } netem_duplication_model; + + /* latency (with jitter following random distribution) */ + struct latency { + uint32_t latency_ms; + uint32_t jitter_ms; + uint64_t last_time_to_send; + } netem_latency_model; + + /* 4 state Markov packet loss model */ + struct loss { + enum _4state_markov_packet_loss_state { + __NO_LOSS = 0, + GAP_RX = 1, + GAP_LOSS, + BURST_RX, + BURST_LOSS, + } state; + + uint32_t p_gr_gl; /* P( gap_loss | gap_rx ) */ + uint32_t p_gr_bl; /* P( burst_loss | gap_rx ) */ + uint32_t p_bl_br; /* P( burst_rx | burst_loss ) */ + uint32_t p_bl_gr; /* P( gap_rx | burst_loss ) */ + uint32_t p_br_bl; /* P( burst_loss | burst_rx ) */ + } netem_loss_model; + + /* + * Reordering Model -- + * randomly select packets and re-inject with additional delay + */ + struct reordering { + uint32_t reordering_p; + } netem_reordering_model; +}; + +#define NETEMF_INITIALIZED 0x00000001 /* has been initialized */ +#define NETEMF_RUNNING 0x00000002 /* thread is running */ +#define NETEMF_TERMINATEBLOCK 0x20000000 /* block waiting terminate */ +#define NETEMF_TERMINATING 0x40000000 /* thread is terminating */ +#define NETEMF_TERMINATED 0x80000000 /* thread is terminated */ + +#define NETEM_MTX_LOCK(_sch) \ + lck_mtx_lock(&(_sch)->netem_lock) +#define NETEM_MTX_LOCK_ASSERT_HELD(_sch) \ + LCK_MTX_ASSERT(&(_sch)->netem_lock, LCK_ASSERT_OWNED) +#define NETEM_MTX_LOCK_ASSERT_NOTHELD(_sch) \ + LCK_MTX_ASSERT(&(_sch)->netem_lock, LCK_ASSERT_NOTOWNED) +#define NETEM_MTX_UNLOCK(_sch) \ + lck_mtx_unlock(&(_sch)->netem_lock) + +static struct heap * +heap_create(uint32_t limit) +{ + struct heap *h = NULL; + + // verify limit + size_t size = sizeof(struct heap) + sizeof(struct heap_elem) * limit; + + h = _MALLOC(size, M_DEVBUF, M_WAITOK | M_ZERO); + if (h == NULL) { + return NULL; + } + + h->limit = limit; + h->size = 0; + + return h; +} + +static void +heap_destroy(struct heap *h) +{ + ASSERT(h->size == 0); + + _FREE(h, M_DEVBUF); +} + +#define HEAP_FATHER(child) (((child) - 1) / 2) +#define HEAP_SWAP(a, b, tmp) { tmp = a; a = b; b = tmp; } +#define HEAP_LEFT(x) (2 * (x) + 1) + +static int +heap_insert(struct heap *h, uint64_t key, pktsched_pkt_t *pkt) +{ + ASSERT(h != NULL); + + if (h->size == h->limit) { + return ENOMEM; + } + + uint32_t child, parent; + if (pkt == NULL) { + child = key; + ASSERT(child < h->size); + } else { + child = h->size; + h->p[child].key = key; + h->p[child].pkt = *pkt; + h->size++; + } + + while (child > 0) { + struct heap_elem tmp; + parent = HEAP_FATHER(child); + if (h->p[parent].key < h->p[child].key) { + break; + } + HEAP_SWAP(h->p[child], h->p[parent], tmp); + child = parent; + } + + return 0; +} + +static int +heap_peek(struct heap *h, uint64_t *key, pktsched_pkt_t *pkt) +{ + if (h->size == 0) { + return ENOENT; + } + + *key = h->p[0].key; + *pkt = h->p[0].pkt; + return 0; +} + +static int +heap_extract(struct heap *h, uint64_t *key, pktsched_pkt_t *pkt) +{ + uint32_t child, parent, max; + + if (h->size == 0) { + netem_log(NETEM_LOG_ERROR, "warning: extract from empty heap"); + return ENOENT; + } + + *key = h->p[0].key; + *pkt = h->p[0].pkt; + + /* re-heapify */ + parent = 0; + child = HEAP_LEFT(parent); /* start from left child */ + max = h->size - 1; + while (child <= max) { + if (child != max && h->p[child + 1].key < h->p[child].key) { + child = child + 1; /* right child */ + } + h->p[parent] = h->p[child]; + parent = child; + child = HEAP_LEFT(child); /* left child for next loop */ + } + + h->size--; + if (parent != max) { + /* Fill hole with last entry, bubble up reusing insert code */ + h->p[parent] = h->p[max]; + _PKTSCHED_PKT_INIT(&h->p[max].pkt); + heap_insert(h, parent, NULL); /* this one cannot fail */ + } + + return 0; +} + +static void +token_bucket_update(struct token_bucket *tb) +{ + uint64_t now, elapsed; + clock_sec_t sec; + clock_usec_t usec; + + if (tb->rate == 0) { + return; + } + + now = mach_absolute_time(); + elapsed = now - tb->last; + absolutetime_to_microtime(elapsed, &sec, &usec); + tb->token += ((sec * USEC_PER_SEC + usec) * tb->rate / USEC_PER_SEC); + if (__improbable(tb->token > tb->depth)) { + tb->token = tb->depth; + } + tb->last = now; +} + +static boolean_t +bandwidth_limited(struct netem *ne, uint32_t pkt_len) +{ + struct token_bucket *tb = &ne->netem_bandwidth_model; + + if (tb->rate == 0) { + return FALSE; + } + + if (tb->token < pkt_len * 8) { + netem_log(NETEM_LOG_DEBUG, "limited"); + return TRUE; + } + tb->token -= pkt_len * 8; + + netem_log(NETEM_LOG_DEBUG, "token left %llu", tb->token); + + return FALSE; +} + +static void +corruption_event(struct netem *ne, pktsched_pkt_t *pkt) +{ + struct corruption *corr = &ne->netem_corruption_model; + uint32_t rand; + + if (corr->corruption_p == 0) { + return; + } + + read_frandom(&rand, sizeof(rand)); + rand %= NETEM_PSCALE; + + if (rand < corr->corruption_p) { + netem_log(NETEM_LOG_ERROR, "\t corrupted"); + pktsched_corrupt_packet(pkt); + } +} + +static boolean_t +duplication_event(struct netem *ne) +{ + struct duplication *dup = &ne->netem_duplication_model; + uint32_t rand; + + if (dup->duplication_p == 0) { + return FALSE; + } + + read_frandom(&rand, sizeof(rand)); + rand %= NETEM_PSCALE; + + return rand < dup->duplication_p; +} + +static uint64_t +latency_event(struct netem *ne, boolean_t reordering) +{ + struct latency *l = &ne->netem_latency_model; + int32_t delay_ms = 0, jitter_ms = 0; + uint64_t time_to_send = 0; + + delay_ms = l->latency_ms; + if (l->jitter_ms != 0) { + int32_t rand, x, t, s = l->jitter_ms; + read_frandom(&rand, sizeof(rand)); + t = norm_dist_table[rand % NORM_DIST_TABLE_SIZE]; + x = (s % NORM_DIST_SCALE) * t; + if (x >= 0) { + x += NORM_DIST_SCALE / 2; + } else { + x -= NORM_DIST_SCALE / 2; + } + jitter_ms = x / NORM_DIST_SCALE + (s * t / NORM_DIST_SCALE); + } + + delay_ms += jitter_ms; + delay_ms = MAX(delay_ms, 0); + + netem_log(NETEM_LOG_DEBUG, "\tdelay %dms", delay_ms); + clock_interval_to_deadline(delay_ms, NSEC_PER_MSEC, &time_to_send); + + if (l->last_time_to_send != 0) { + if (reordering) { + /* reorder with last packet */ + time_to_send = l->last_time_to_send - 1; + } else { + /* make sure packet time to send is monotonic */ + if (time_to_send < l->last_time_to_send) { + /* send this one immediately afterwards */ + time_to_send = l->last_time_to_send + 1; + } + } + } + + l->last_time_to_send = time_to_send; + + return time_to_send; +} + +static boolean_t +loss_event(struct netem *ne) +{ + struct loss *loss = &ne->netem_loss_model; + uint32_t rand; + + if (loss->state == __NO_LOSS) { + return FALSE; + } + + read_frandom(&rand, sizeof(rand)); + rand %= NETEM_PSCALE; + + switch (loss->state) { + case GAP_RX: + if (rand < loss->p_gr_gl) { + loss->state = GAP_RX; + return TRUE; + } else if (loss->p_gr_gl < rand && + rand < loss->p_gr_gl + loss->p_gr_bl) { + loss->state = BURST_LOSS; + return TRUE; + } else { + loss->state = GAP_RX; + return FALSE; + } + case BURST_LOSS: + if (rand < loss->p_bl_br) { + loss->state = BURST_RX; + return FALSE; + } else if (loss->p_bl_br < rand && + rand < loss->p_bl_br + loss->p_bl_gr) { + loss->state = GAP_RX; + return FALSE; + } else { + loss->state = BURST_LOSS; + return TRUE; + } + case BURST_RX: + if (rand < loss->p_br_bl) { + loss->state = BURST_LOSS; + return TRUE; + } else { + loss->state = BURST_RX; + return FALSE; + } + case GAP_LOSS: + /* This is instantaneous (stateless), should not be reached */ + default: + VERIFY(0); + break; + } + + /* not reached */ + VERIFY(0); + return FALSE; +} + +static boolean_t +reordering_event(struct netem *ne) +{ + struct reordering *reord = &ne->netem_reordering_model; + uint32_t rand; + + if (reord->reordering_p == 0) { + return FALSE; + } + + read_frandom(&rand, sizeof(rand)); + rand %= NETEM_PSCALE; + + return rand < reord->reordering_p; +} + +static void +netem_update_locked(struct netem *ne) +{ + ASSERT(ne != NULL); + NETEM_MTX_LOCK_ASSERT_HELD(ne); + + token_bucket_update(&ne->netem_bandwidth_model); +} + +int +netem_enqueue(struct netem *ne, classq_pkt_t *p, boolean_t *pdrop) +{ + int ret = 0; + int pkt_count = 1; + uint64_t time_to_send; + pktsched_pkt_t pkt; + + pktsched_pkt_encap(&pkt, p); + + ASSERT(ne != NULL); + ASSERT(pdrop != NULL); + NETEM_MTX_LOCK(ne); + + netem_log(NETEM_LOG_DEBUG, "+ %p begin", p->cp_mbuf); + + if (loss_event(ne)) { + netem_log(NETEM_LOG_DEBUG, "\t lost"); + pkt_count--; + } + + if (duplication_event(ne)) { + netem_log(NETEM_LOG_DEBUG, "\t dup'ed"); + pkt_count++; + } + + if (pkt_count == 0) { + pktsched_free_pkt(&pkt); + *pdrop = TRUE; + goto done; + } + + do { + corruption_event(ne, &pkt); + + time_to_send = latency_event(ne, reordering_event(ne)); + + ret = heap_insert(ne->netem_heap, time_to_send, &pkt); + if (ret != 0) { + netem_log(NETEM_LOG_DEBUG, "\t%p err heap_insert %d", + p->cp_mbuf, ret); + pktsched_free_pkt(&pkt); + goto done; + } + netem_log(NETEM_LOG_DEBUG, "\t%p enqueued", + pkt.pktsched_pkt_mbuf); + } while (--pkt_count > 0 && + __probable((ret = pktsched_clone_pkt(&pkt, &pkt)) == 0)); + +done: + if (__probable(ne->netem_output_thread != THREAD_NULL)) { + if (!(ne->netem_flags & (NETEMF_RUNNING | + NETEMF_TERMINATING | NETEMF_TERMINATED))) { + netem_log(NETEM_LOG_DEBUG, "wakeup output thread"); + (void) thread_wakeup((caddr_t)&ne->netem_flags); + } + } + + NETEM_MTX_UNLOCK(ne); + netem_log(NETEM_LOG_DEBUG, "- %p end", p->cp_mbuf); + + return ret; +} + +static int +netem_dequeue_internal_locked(struct netem *ne, pktsched_pkt_t *pp, + boolean_t *ppending) +{ + int ret = 0; + uint64_t time_to_send; + pktsched_pkt_t pkt; + + ASSERT(ne != NULL); + NETEM_MTX_LOCK_ASSERT_HELD(ne); + + netem_log(NETEM_LOG_HIDEBUG, "+ begin"); + + ret = heap_peek(ne->netem_heap, &time_to_send, &pkt); + if (ret != 0) { + netem_log(NETEM_LOG_HIDEBUG, "\theap empty"); + ret = ENOENT; + goto done; + } + + /* latency limit */ + if (time_to_send > mach_absolute_time()) { + netem_log(NETEM_LOG_DEBUG, + "held back: time_to_send %llu now %llu", + time_to_send, mach_absolute_time()); + ret = EAGAIN; + goto done; + } + + /* bandwidth limited */ + if (bandwidth_limited(ne, pkt.pktsched_plen)) { + ret = EAGAIN; + goto done; + } + + ret = heap_extract(ne->netem_heap, &time_to_send, &pkt); + ASSERT(ret == 0); + *pp = pkt; + + netem_log(NETEM_LOG_HIDEBUG, "- %p end", pkt.pktsched_pkt_mbuf); + +done: + *ppending = (ret == EAGAIN) ? TRUE : FALSE; + + return ret; +} + +int +netem_dequeue(struct netem *ne, pktsched_pkt_t *p, + boolean_t *ppending) +{ + int ret; + + NETEM_MTX_LOCK(ne); + netem_update_locked(ne); + ret = netem_dequeue_internal_locked(ne, p, ppending); + NETEM_MTX_UNLOCK(ne); + + return ret; +} + +__attribute__((noreturn)) +static void +netem_output_thread_cont(void *v, wait_result_t w) +__attribute__((optnone)) +{ + struct netem *ne = v; + boolean_t pending = FALSE; + pktsched_pkt_t pkts[NETEM_MAX_BATCH_SIZE]; + uint32_t n_pkts = 0; + int ret; + + NETEM_MTX_LOCK(ne); + ASSERT(!(ne->netem_flags & NETEMF_TERMINATED)); + ne->netem_flags |= NETEMF_RUNNING; + + if (__improbable(w == THREAD_INTERRUPTED || + (ne->netem_flags & NETEMF_TERMINATING) != 0)) { + ASSERT(!(ne->netem_flags & NETEMF_TERMINATED)); + ne->netem_flags &= ~(NETEMF_RUNNING | NETEMF_TERMINATING); + ne->netem_flags |= NETEMF_TERMINATED; + + netem_log(NETEM_LOG_INFO, "%s output thread terminated", + ne->netem_name); + + if (ne->netem_flags & NETEMF_TERMINATEBLOCK) { + thread_wakeup((caddr_t)&ne->netem_output_thread); + } + + NETEM_MTX_UNLOCK(ne); + + /* for the extra refcnt from kernel_thread_start() */ + thread_deallocate(current_thread()); + /* this is the end */ + thread_terminate(current_thread()); + /* NOTREACHED */ + __builtin_unreachable(); + } + + ASSERT(ne->netem_output != NULL); + netem_update_locked(ne); + n_pkts = 0; + for (;;) { + ret = netem_dequeue_internal_locked(ne, &pkts[n_pkts], + &pending); + if (__probable(ret == 0 && + ++n_pkts < ne->netem_output_max_batch_size)) { + continue; + } + + if (__probable(n_pkts != 0)) { + NETEM_MTX_UNLOCK(ne); + (void) ne->netem_output(ne->netem_output_handle, + pkts, n_pkts); + NETEM_MTX_LOCK(ne); + n_pkts = 0; + } + if (ret != 0) { + break; + } + } + + uint64_t deadline = TIMEOUT_WAIT_FOREVER; + if (pending) { + clock_interval_to_deadline(1, NSEC_PER_MSEC, &deadline); + } + (void) assert_wait_deadline(&ne->netem_flags, THREAD_UNINT, deadline); + ne->netem_flags &= ~NETEMF_RUNNING; + NETEM_MTX_UNLOCK(ne); + (void) thread_block_parameter(netem_output_thread_cont, ne); + /* NOTREACHED */ + __builtin_unreachable(); +} + +__attribute__((noreturn)) +static void +netem_output_thread_func(void *v, wait_result_t w) +{ +#pragma unused(w) + struct netem *ne = v; + + ASSERT(ne->netem_output_thread == current_thread()); + thread_set_thread_name(current_thread(), ne->netem_name); + + NETEM_MTX_LOCK(ne); + VERIFY(!(ne->netem_flags & NETEMF_RUNNING)); + (void) assert_wait(&ne->netem_flags, THREAD_UNINT); + NETEM_MTX_UNLOCK(ne); + thread_block_parameter(netem_output_thread_cont, ne); + /* NOTREACHED */ + __builtin_unreachable(); +} + +int +netem_init(void) +{ + ASSERT(!__netem_inited); + __netem_inited = 1; + + netem_lock_attr = lck_attr_alloc_init(); + netem_lock_group_attr = lck_grp_attr_alloc_init(); + netem_lock_group = lck_grp_alloc_init("pktsched_netem_lock", + netem_lock_group_attr); + + return 0; +} + +static struct netem * +netem_create(const char *name, void *output_handle, + int (*output)(void *handle, pktsched_pkt_t *pkts, uint32_t n_pkts), + uint32_t output_max_batch_size) +{ + struct netem *ne; + + ne = _MALLOC(sizeof(struct netem), M_DEVBUF, M_WAITOK | M_ZERO); + + lck_mtx_init(&ne->netem_lock, netem_lock_group, netem_lock_attr); + + ne->netem_heap = heap_create(NETEM_HEAP_SIZE); + ne->netem_flags = NETEMF_INITIALIZED; + ne->netem_output_handle = output_handle; + ne->netem_output = output; + ne->netem_output_max_batch_size = + MIN(output_max_batch_size, NETEM_MAX_BATCH_SIZE); + ne->netem_output_thread = THREAD_NULL; + if (output != NULL) { + strlcpy(ne->netem_name, name, sizeof(ne->netem_name)); + if (kernel_thread_start(netem_output_thread_func, ne, + &ne->netem_output_thread) != KERN_SUCCESS) { + panic_plain("%s can't create thread", ne->netem_name); + } + } + + return ne; +} + +void +netem_destroy(struct netem *ne) +{ + uint64_t f = (1 * NSEC_PER_MSEC); /* 1 ms */ + uint64_t s = (1000 * NSEC_PER_MSEC); /* 1 sec */ + uint32_t i = 0; + int ret = 0; + uint64_t key = 0; + pktsched_pkt_t pkt; + + ASSERT(ne != NULL); + + if (ne->netem_output_thread != THREAD_NULL) { + ASSERT(ne->netem_flags & NETEMF_INITIALIZED); + /* signal thread to begin self-termination */ + NETEM_MTX_LOCK(ne); + ne->netem_flags |= NETEMF_TERMINATING; + + /* and wait for thread to terminate */ + while (!(ne->netem_flags & NETEMF_TERMINATED)) { + uint64_t t = 0; + nanoseconds_to_absolutetime((i++ == 0) ? f : s, &t); + clock_absolutetime_interval_to_deadline(t, &t); + ASSERT(t != 0); + + ne->netem_flags |= NETEMF_TERMINATEBLOCK; + if (!(ne->netem_flags & NETEMF_RUNNING)) { + thread_wakeup((caddr_t)&ne->netem_flags); + } + (void) assert_wait_deadline(&ne->netem_output_thread, + THREAD_UNINT, t); + NETEM_MTX_UNLOCK(ne); + (void) thread_block(THREAD_CONTINUE_NULL); + NETEM_MTX_LOCK(ne); + ne->netem_flags &= ~NETEMF_TERMINATEBLOCK; + } + ASSERT(ne->netem_flags & NETEMF_TERMINATED); + NETEM_MTX_UNLOCK(ne); + ne->netem_output_thread = THREAD_NULL; + } + ASSERT(ne->netem_output_thread == THREAD_NULL); + + lck_mtx_destroy(&ne->netem_lock, netem_lock_group); + + while ((ret = heap_extract(ne->netem_heap, &key, &pkt)) == 0) { + pktsched_free_pkt(&pkt); + } + heap_destroy(ne->netem_heap); + + _FREE(ne, M_DEVBUF); +} + +static int +netem_check_params(const struct if_netem_params *p) +{ + if (p->ifnetem_corruption_p > NETEM_PSCALE) { + netem_log(NETEM_LOG_ERROR, "error: corruption_p %d > %d", + p->ifnetem_corruption_p, NETEM_PSCALE); + return EINVAL; + } + + if (p->ifnetem_duplication_p > NETEM_PSCALE) { + netem_log(NETEM_LOG_ERROR, "error: duplication_p %d > %d", + p->ifnetem_duplication_p, NETEM_PSCALE); + return EINVAL; + } + + if (p->ifnetem_duplication_p > 0 && + p->ifnetem_latency_ms == 0) { + /* we need to insert dup'ed packet with latency */ + netem_log(NETEM_LOG_ERROR, + "error: duplication needs latency param"); + return EINVAL; + } + + if (p->ifnetem_latency_ms > 1000) { + netem_log(NETEM_LOG_ERROR, + "error: latency %d too big (> 1 sec)", + p->ifnetem_latency_ms); + return EINVAL; + } + + if (p->ifnetem_jitter_ms * 3 > p->ifnetem_latency_ms) { + netem_log(NETEM_LOG_ERROR, + "error: jitter %dms too big (latency %dms)", + p->ifnetem_jitter_ms, p->ifnetem_latency_ms); + return EINVAL; + } + + /* if gr_gl == 0 (no loss), other prob should all be zero */ + if (p->ifnetem_loss_p_gr_gl == 0 && + (p->ifnetem_loss_p_gr_bl != 0 || + p->ifnetem_loss_p_bl_br != 0 || + p->ifnetem_loss_p_bl_gr != 0 || + p->ifnetem_loss_p_br_bl != 0)) { + netem_log(NETEM_LOG_ERROR, + "error: loss params not all zero when p_gr_gl is zero"); + return EINVAL; + } + + /* check state machine transition prob integrity */ + if (p->ifnetem_loss_p_gr_gl > NETEM_PSCALE || + /* gr_gl = NETEM_PSCALE for total loss */ + p->ifnetem_loss_p_gr_bl > NETEM_PSCALE || + p->ifnetem_loss_p_bl_br > NETEM_PSCALE || + p->ifnetem_loss_p_bl_gr > NETEM_PSCALE || + p->ifnetem_loss_p_br_bl > NETEM_PSCALE || + p->ifnetem_loss_p_gr_gl + p->ifnetem_loss_p_gr_bl > NETEM_PSCALE || + p->ifnetem_loss_p_bl_br + p->ifnetem_loss_p_bl_gr > NETEM_PSCALE) { + netem_log(NETEM_LOG_ERROR, "error: loss params too big"); + return EINVAL; + } + + if (p->ifnetem_reordering_p > NETEM_PSCALE) { + netem_log(NETEM_LOG_ERROR, "error: reordering %d > %d", + p->ifnetem_reordering_p, NETEM_PSCALE); + return EINVAL; + } + + return 0; +} + +static void +netem_set_params(struct netem *ne, const struct if_netem_params *p) +{ + NETEM_MTX_LOCK(ne); + + struct token_bucket *tb = &ne->netem_bandwidth_model; + if (p->ifnetem_bandwidth_bps == 0) { + tb->depth = 0; + tb->rate = 0; + tb->token = 0; + tb->last = 0; + } else { + tb->depth = p->ifnetem_bandwidth_bps; + tb->rate = p->ifnetem_bandwidth_bps; + tb->token = p->ifnetem_bandwidth_bps / 2; + tb->last = mach_absolute_time(); + } + + struct corruption *corr = &ne->netem_corruption_model; + corr->corruption_p = p->ifnetem_corruption_p; + + struct duplication *dup = &ne->netem_duplication_model; + dup->duplication_p = p->ifnetem_duplication_p; + + struct latency *late = &ne->netem_latency_model; + late->latency_ms = p->ifnetem_latency_ms; + late->jitter_ms = p->ifnetem_jitter_ms; + + struct loss *loss = &ne->netem_loss_model; + loss->state = GAP_RX; + loss->p_gr_gl = p->ifnetem_loss_p_gr_gl; + loss->p_gr_bl = p->ifnetem_loss_p_gr_bl; + loss->p_bl_gr = p->ifnetem_loss_p_bl_gr; + loss->p_bl_br = p->ifnetem_loss_p_bl_br; + loss->p_br_bl = p->ifnetem_loss_p_br_bl; + + struct reordering *r = &ne->netem_reordering_model; + r->reordering_p = p->ifnetem_reordering_p; + + netem_log(NETEM_LOG_INFO, "success: bandwidth %d bps", tb->rate); + netem_log(NETEM_LOG_INFO, "success: corruption %d\%", + corr->corruption_p); + netem_log(NETEM_LOG_INFO, "success: duplication %d\%", + dup->duplication_p); + netem_log(NETEM_LOG_INFO, "success: latency_ms %d jitter_ms %d", + late->latency_ms, late->jitter_ms); + netem_log(NETEM_LOG_INFO, "changed loss p_gr_gl %d p_gr_bl %d " + "p_bl_gr %d p_bl_br %d p_br_bl %d", loss->p_gr_gl, loss->p_gr_bl, + loss->p_bl_gr, loss->p_bl_br, loss->p_br_bl); + netem_log(NETEM_LOG_DEBUG, "success: reordering %d\%", + r->reordering_p); + + NETEM_MTX_UNLOCK(ne); +} + +void +netem_get_params(struct netem *ne, struct if_netem_params *p) +{ + ASSERT(ne != NULL); + NETEM_MTX_LOCK(ne); + + struct token_bucket *tb = &ne->netem_bandwidth_model; + p->ifnetem_bandwidth_bps = tb->depth; + + struct corruption *corr = &ne->netem_corruption_model; + p->ifnetem_corruption_p = corr->corruption_p; + + struct duplication *dup = &ne->netem_duplication_model; + p->ifnetem_duplication_p = dup->duplication_p; + + struct latency *late = &ne->netem_latency_model; + p->ifnetem_latency_ms = late->latency_ms; + p->ifnetem_jitter_ms = late->jitter_ms; + + struct loss *loss = &ne->netem_loss_model; + p->ifnetem_loss_p_gr_gl = loss->p_gr_gl; + p->ifnetem_loss_p_gr_bl = loss->p_gr_bl; + p->ifnetem_loss_p_bl_gr = loss->p_bl_gr; + p->ifnetem_loss_p_bl_br = loss->p_bl_br; + p->ifnetem_loss_p_br_bl = loss->p_br_bl; + + struct reordering *r = &ne->netem_reordering_model; + p->ifnetem_reordering_p = r->reordering_p; + + NETEM_MTX_UNLOCK(ne); +} + +int +netem_config(struct netem **ne, const char *name, + const struct if_netem_params *p, void *output_handle, + int (*output_func)(void *handle, pktsched_pkt_t *pkts, uint32_t n_pkts), + uint32_t output_max_batch_size) +{ + struct netem *netem = NULL; + boolean_t enable = TRUE; + int ret = 0; + + if (p == NULL || ( + p->ifnetem_bandwidth_bps == 0 && + p->ifnetem_corruption_p == 0 && + p->ifnetem_duplication_p == 0 && + p->ifnetem_latency_ms == 0 && + p->ifnetem_jitter_ms == 0 && + p->ifnetem_loss_p_gr_gl == 0 && + p->ifnetem_loss_p_gr_bl == 0 && + p->ifnetem_loss_p_bl_br == 0 && + p->ifnetem_loss_p_bl_gr == 0 && + p->ifnetem_loss_p_br_bl == 0 && + p->ifnetem_reordering_p == 0)) { + enable = FALSE; + } + + ret = netem_check_params(p); + if (ret != 0) { + goto done; + } + + if (enable) { + if (*ne == NULL) { + netem_log(NETEM_LOG_INFO, "netem create %s", name); + netem = netem_create(name, output_handle, output_func, + output_max_batch_size); + if (netem == NULL) { + return ENOMEM; + } + atomic_set_ptr(ne, netem); + } + netem_set_params(*ne, p); + } else { + netem_log(NETEM_LOG_INFO, "netem disable %s", name); + if (*ne != NULL) { + netem = *ne; + atomic_set_ptr(ne, NULL); + netem_log(NETEM_LOG_INFO, "netem destroy %s", name); + netem_destroy(netem); + } + ret = 0; + } + +done: + netem_log(NETEM_LOG_INFO, "netem config ret %d", ret); + return ret; +} diff --git a/bsd/net/pktsched/pktsched_netem.h b/bsd/net/pktsched/pktsched_netem.h new file mode 100644 index 000000000..bcd8b5a89 --- /dev/null +++ b/bsd/net/pktsched/pktsched_netem.h @@ -0,0 +1,46 @@ +/* + * Copyright (c) 2018 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#define NETEM_MAX_BATCH_SIZE 32 + +__BEGIN_DECLS + +extern int netem_init(void); +extern void netem_fini(void); + +extern int netem_config(struct netem **ne, const char *name, + const struct if_netem_params *p, void *output_handle, + int (*output)(void *handle, pktsched_pkt_t *pkts, uint32_t n_pkts), + uint32_t output_max_batch_size); +extern void netem_get_params(struct netem *ne, struct if_netem_params *p); +extern void netem_destroy(struct netem *ne); +extern int netem_enqueue(struct netem *ne, classq_pkt_t *p, boolean_t *pdrop); +extern int netem_dequeue(struct netem *ne, pktsched_pkt_t *p, + boolean_t *ppending); + +__END_DECLS diff --git a/bsd/net/pktsched/pktsched_qfq.c b/bsd/net/pktsched/pktsched_qfq.c index e4c6c2d8c..7ed8559a2 100644 --- a/bsd/net/pktsched/pktsched_qfq.c +++ b/bsd/net/pktsched/pktsched_qfq.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2011-2017 Apple Inc. All rights reserved. + * Copyright (c) 2011-2019 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -82,9 +82,8 @@ /* * function prototypes */ -static int qfq_enqueue_ifclassq(struct ifclassq *, void *, classq_pkt_type_t, - boolean_t *); -static void *qfq_dequeue_ifclassq(struct ifclassq *, classq_pkt_type_t *); +static int qfq_enqueue_ifclassq(struct ifclassq *, classq_pkt_t *, boolean_t *); +static void qfq_dequeue_ifclassq(struct ifclassq *, classq_pkt_t *); static int qfq_request_ifclassq(struct ifclassq *, cqrq_t, void *); static int qfq_clear_interface(struct qfq_if *); static struct qfq_class *qfq_class_create(struct qfq_if *, u_int32_t, @@ -852,7 +851,7 @@ qfq_dequeue(struct qfq_if *qif, pktsched_pkt_t *pkt) IFCQ_LOCK_ASSERT_HELD(ifq); - pkt->pktsched_pkt = NULL; + _PKTSCHED_PKT_INIT(pkt); for (;;) { if (er_bits == 0) { @@ -880,7 +879,8 @@ qfq_dequeue(struct qfq_if *qif, pktsched_pkt_t *pkt) VERIFY(cl != NULL && !qempty(&cl->cl_q)); qfq_getq(cl, pkt); - VERIFY(pkt->pktsched_pkt != NULL); /* qalg must be work conserving */ + /* qalg must be work conserving */ + VERIFY(pkt->pktsched_ptype != QP_INVALID); len = pktsched_get_pkt_len(pkt); #if QFQ_DEBUG @@ -902,8 +902,8 @@ qfq_dequeue(struct qfq_if *qif, pktsched_pkt_t *pkt) log(LOG_DEBUG, "%s: %s qid=%d dequeue pkt=0x%llx F=0x%llx " "V=0x%llx", if_name(QFQIF_IFP(qif)), qfq_style(qif), cl->cl_handle, - (uint64_t)VM_KERNEL_ADDRPERM(pkt->pktsched_pkt), cl->cl_F, - qif->qif_V); + (uint64_t)VM_KERNEL_ADDRPERM(pkt->pktsched_pkt_mbuf), + cl->cl_F, qif->qif_V); } if (qfq_update_class(qif, grp, cl)) { @@ -1071,7 +1071,7 @@ qfq_enqueue(struct qfq_if *qif, struct qfq_class *cl, pktsched_pkt_t *pkt, log(LOG_DEBUG, "%s: %s qid=%d enqueue m=0x%llx state=%s 0x%x " "S=0x%llx F=0x%llx V=0x%llx\n", if_name(QFQIF_IFP(qif)), qfq_style(qif), cl->cl_handle, - (uint64_t)VM_KERNEL_ADDRPERM(pkt->pktsched_pkt), + (uint64_t)VM_KERNEL_ADDRPERM(pkt->pktsched_pkt_mbuf), qfq_state2str(s), qif->qif_bitmaps[s], cl->cl_S, cl->cl_F, qif->qif_V); } @@ -1273,20 +1273,23 @@ qfq_addq(struct qfq_class *cl, pktsched_pkt_t *pkt, struct pf_mtag *t) #endif /* PF_ECN */ VERIFY(pkt->pktsched_ptype == qptype(&cl->cl_q)); - _addq(&cl->cl_q, pkt->pktsched_pkt); + _addq(&cl->cl_q, &pkt->pktsched_pkt); return 0; } static inline void qfq_getq(struct qfq_class *cl, pktsched_pkt_t *pkt) { + classq_pkt_t p = CLASSQ_PKT_INITIALIZER(p); + IFCQ_LOCK_ASSERT_HELD(cl->cl_qif->qif_ifq); if (q_is_sfb(&cl->cl_q) && cl->cl_sfb != NULL) { return sfb_getq(cl->cl_sfb, &cl->cl_q, pkt); } - return pktsched_pkt_encap(pkt, qptype(&cl->cl_q), _getq(&cl->cl_q)); + _getq(&cl->cl_q, &p); + return pktsched_pkt_encap(pkt, &p); } static void @@ -1564,7 +1567,7 @@ qfq_dump_sched(struct qfq_if *qif, const char *msg) log(LOG_DEBUG, "%s: %s IB 0x%08x\n", if_name(QFQIF_IFP(qif)), qfq_style(qif), qif->qif_bitmaps[IB]); qfq_dump_groups(qif, 0xffffffff); -}; +} #endif /* QFQ_DEBUG */ /* @@ -1572,8 +1575,7 @@ qfq_dump_sched(struct qfq_if *qif, const char *msg) * (*ifcq_enqueue) in struct ifclassq. */ static int -qfq_enqueue_ifclassq(struct ifclassq *ifq, void *p, classq_pkt_type_t ptype, - boolean_t *pdrop) +qfq_enqueue_ifclassq(struct ifclassq *ifq, classq_pkt_t *p, boolean_t *pdrop) { u_int32_t i = 0; int ret; @@ -1582,15 +1584,16 @@ qfq_enqueue_ifclassq(struct ifclassq *ifq, void *p, classq_pkt_type_t ptype, IFCQ_LOCK_ASSERT_HELD(ifq); - switch (ptype) { + switch (p->cp_ptype) { case QP_MBUF: { - struct mbuf *m = p; + struct mbuf *m = p->cp_mbuf; if (!(m->m_flags & M_PKTHDR)) { /* should not happen */ log(LOG_ERR, "%s: packet does not have pkthdr\n", if_name(ifq->ifcq_ifp)); IFCQ_CONVERT_LOCK(ifq); m_freem(m); + *p = CLASSQ_PKT_INITIALIZER(*p); *pdrop = TRUE; return ENOBUFS; } @@ -1602,12 +1605,13 @@ qfq_enqueue_ifclassq(struct ifclassq *ifq, void *p, classq_pkt_type_t ptype, default: VERIFY(0); + __builtin_unreachable(); /* NOTREACHED */ } VERIFY((u_int32_t)i < IFCQ_SC_MAX); - pktsched_pkt_encap(&pkt, ptype, p); + pktsched_pkt_encap(&pkt, p); ret = qfq_enqueue(ifq->ifcq_disc, ifq->ifcq_disc_slots[i].cl, &pkt, t); @@ -1650,14 +1654,13 @@ qfq_enqueue_ifclassq(struct ifclassq *ifq, void *p, classq_pkt_type_t ptype, * CLASSQDQ_REMOVE must return the same packet if called immediately * after CLASSQDQ_POLL. */ -static void * -qfq_dequeue_ifclassq(struct ifclassq *ifq, classq_pkt_type_t *ptype) +static void +qfq_dequeue_ifclassq(struct ifclassq *ifq, classq_pkt_t *cpkt) { pktsched_pkt_t pkt; - bzero(&pkt, sizeof(pkt)); + _PKTSCHED_PKT_INIT(&pkt); qfq_dequeue(ifq->ifcq_disc, &pkt); - *ptype = pkt.pktsched_ptype; - return pkt.pktsched_pkt; + *cpkt = pkt.pktsched_pkt; } static int diff --git a/bsd/net/pktsched/pktsched_tcq.c b/bsd/net/pktsched/pktsched_tcq.c index d59bf0d5e..aa4ccec53 100644 --- a/bsd/net/pktsched/pktsched_tcq.c +++ b/bsd/net/pktsched/pktsched_tcq.c @@ -51,10 +51,9 @@ /* * function prototypes */ -static int tcq_enqueue_ifclassq(struct ifclassq *, void *, classq_pkt_type_t, - boolean_t *); -static void *tcq_dequeue_tc_ifclassq(struct ifclassq *, mbuf_svc_class_t, - classq_pkt_type_t *); +static int tcq_enqueue_ifclassq(struct ifclassq *, classq_pkt_t *, boolean_t *); +static void tcq_dequeue_tc_ifclassq(struct ifclassq *, mbuf_svc_class_t, + classq_pkt_t *); static int tcq_request_ifclassq(struct ifclassq *, cqrq_t, void *); static int tcq_clear_interface(struct tcq_if *); static struct tcq_class *tcq_class_create(struct tcq_if *, int, u_int32_t, @@ -489,24 +488,23 @@ tcq_dequeue_cl(struct tcq_if *tif, struct tcq_class *cl, mbuf_svc_class_t sc, uint32_t len; IFCQ_LOCK_ASSERT_HELD(ifq); + pkt->pktsched_pkt_mbuf = NULL; if (cl == NULL) { cl = tcq_clh_to_clp(tif, MBUF_SCIDX(sc)); if (cl == NULL) { - pkt->pktsched_pkt = NULL; return; } } if (qempty(&cl->cl_q)) { - pkt->pktsched_pkt = NULL; return; } VERIFY(!IFCQ_IS_EMPTY(ifq)); tcq_getq(cl, pkt); - if (pkt->pktsched_pkt != NULL) { + if (pkt->pktsched_pkt_mbuf != NULL) { len = pktsched_get_pkt_len(pkt); IFCQ_DEC_LEN(ifq); IFCQ_DEC_BYTES(ifq, len); @@ -578,7 +576,7 @@ tcq_addq(struct tcq_class *cl, pktsched_pkt_t *pkt, struct pf_mtag *t) #endif /* PF_ECN */ VERIFY(pkt->pktsched_ptype == qptype(&cl->cl_q)); - _addq(&cl->cl_q, pkt->pktsched_pkt); + _addq(&cl->cl_q, &pkt->pktsched_pkt); return 0; } @@ -586,13 +584,16 @@ tcq_addq(struct tcq_class *cl, pktsched_pkt_t *pkt, struct pf_mtag *t) static inline void tcq_getq(struct tcq_class *cl, pktsched_pkt_t *pkt) { + classq_pkt_t p = CLASSQ_PKT_INITIALIZER(p); + IFCQ_LOCK_ASSERT_HELD(cl->cl_tif->tif_ifq); if (q_is_sfb(&cl->cl_q) && cl->cl_sfb != NULL) { return sfb_getq(cl->cl_sfb, &cl->cl_q, pkt); } - return pktsched_pkt_encap(pkt, qptype(&cl->cl_q), _getq(&cl->cl_q)); + _getq(&cl->cl_q, &p); + return pktsched_pkt_encap(pkt, &p); } static void @@ -739,8 +740,7 @@ tcq_style(struct tcq_if *tif) * (*ifcq_enqueue) in struct ifclassq. */ static int -tcq_enqueue_ifclassq(struct ifclassq *ifq, void *p, classq_pkt_type_t ptype, - boolean_t *pdrop) +tcq_enqueue_ifclassq(struct ifclassq *ifq, classq_pkt_t *p, boolean_t *pdrop) { u_int32_t i = 0; int ret; @@ -749,14 +749,15 @@ tcq_enqueue_ifclassq(struct ifclassq *ifq, void *p, classq_pkt_type_t ptype, IFCQ_LOCK_ASSERT_HELD(ifq); - if (ptype == QP_MBUF) { - struct mbuf *m = p; + if (p->cp_ptype == QP_MBUF) { + struct mbuf *m = p->cp_mbuf; if (!(m->m_flags & M_PKTHDR)) { /* should not happen */ log(LOG_ERR, "%s: packet does not have pkthdr\n", if_name(ifq->ifcq_ifp)); IFCQ_CONVERT_LOCK(ifq); m_freem(m); + *p = CLASSQ_PKT_INITIALIZER(*p); *pdrop = TRUE; return ENOBUFS; } @@ -765,7 +766,7 @@ tcq_enqueue_ifclassq(struct ifclassq *ifq, void *p, classq_pkt_type_t ptype, } VERIFY((u_int32_t)i < IFCQ_SC_MAX); - pktsched_pkt_encap(&pkt, ptype, p); + pktsched_pkt_encap(&pkt, p); ret = tcq_enqueue(ifq->ifcq_disc, ifq->ifcq_disc_slots[i].cl, &pkt, t); @@ -795,6 +796,7 @@ tcq_enqueue_ifclassq(struct ifclassq *ifq, void *p, classq_pkt_type_t ptype, break; default: VERIFY(0); + __builtin_unreachable(); } return ret; } @@ -808,19 +810,18 @@ tcq_enqueue_ifclassq(struct ifclassq *ifq, void *p, classq_pkt_type_t ptype, * CLASSQDQ_REMOVE must return the same packet if called immediately * after CLASSQDQ_POLL. */ -static void * +static void tcq_dequeue_tc_ifclassq(struct ifclassq *ifq, mbuf_svc_class_t sc, - classq_pkt_type_t *ptype) + classq_pkt_t *cpkt) { pktsched_pkt_t pkt; u_int32_t i = MBUF_SCIDX(sc); VERIFY((u_int32_t)i < IFCQ_SC_MAX); - bzero(&pkt, sizeof(pkt)); + _PKTSCHED_PKT_INIT(&pkt); (tcq_dequeue_cl(ifq->ifcq_disc, ifq->ifcq_disc_slots[i].cl, sc, &pkt)); - *ptype = pkt.pktsched_ptype; - return pkt.pktsched_pkt; + *cpkt = pkt.pktsched_pkt; } static int diff --git a/bsd/net/restricted_in_port.c b/bsd/net/restricted_in_port.c new file mode 100644 index 000000000..7e39a0d12 --- /dev/null +++ b/bsd/net/restricted_in_port.c @@ -0,0 +1,482 @@ +/* + * Copyright (c) 2019 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * Entitlement required for using the port of the test entry + */ +#define ENTITLEMENT_TEST_PORT "com.apple.private.network.restricted.port.test" + +/* + * Entitlement required for setting the test sysctl variables + */ +#define ENTITLEMENT_TEST_CONTROL "com.apple.private.network.restricted.port.control" + +/* + * Use a single bitmap for quickly checking if a TCP or UDP port is restricted + */ +bitmap_t *restricted_port_bitmap = NULL; + +struct restricted_port_entry { + const char *rpe_entitlement; // entitlement to check for this port + in_port_t rpe_port; // restricted port number (host byte order) + uint16_t rpe_flags; // RPE_FLAG_xxx +}; + +/* + * Possible values for the field rpe_flags + */ +#define RPE_FLAG_SUPERUSER 0x01 // superuser can use the port +#define RPE_FLAG_ENTITLEMENT 0x02 // can use the port with the required entitlement +#define RPE_FLAG_TCP 0x04 // require entitlement for TCP +#define RPE_FLAG_UDP 0x08 // require entitlement for TCP +#define RPE_FLAG_TEST 0x10 // entry for testing + +static struct restricted_port_entry restricted_port_list[] = { +#if CONFIG_EMBEDDED + /* + * Network relay proxy + */ + { + .rpe_port = 62742, + .rpe_flags = RPE_FLAG_ENTITLEMENT | RPE_FLAG_TCP | RPE_FLAG_UDP, + .rpe_entitlement = "com.apple.private.network.restricted.port.nr_proxy", + }, + + /* + * Network relay control + */ + { + .rpe_port = 62743, + .rpe_flags = RPE_FLAG_ENTITLEMENT | RPE_FLAG_UDP, + .rpe_entitlement = "com.apple.private.network.restricted.port.nr_control", + }, + + /* + * Entries for identityservicesd + */ + { + .rpe_port = 61314, + .rpe_flags = RPE_FLAG_ENTITLEMENT | RPE_FLAG_TCP | RPE_FLAG_UDP, + .rpe_entitlement = "com.apple.private.network.restricted.port.ids_service_connector", + }, + { + .rpe_port = 61315, + .rpe_flags = RPE_FLAG_ENTITLEMENT | RPE_FLAG_TCP | RPE_FLAG_UDP, + .rpe_entitlement = "com.apple.private.network.restricted.port.ids_cloud_service_connector", + }, +#endif /* CONFIG_EMBEDDED */ + +#if (DEBUG || DEVELOPMENT) + /* + * Entries reserved for unit testing + */ + { + .rpe_port = 0, + .rpe_flags = RPE_FLAG_TCP | RPE_FLAG_TEST, + .rpe_entitlement = ENTITLEMENT_TEST_PORT, + }, + { + .rpe_port = 0, + .rpe_flags = RPE_FLAG_UDP | RPE_FLAG_TEST, + .rpe_entitlement = ENTITLEMENT_TEST_PORT, + }, +#endif /* (DEBUG || DEVELOPMENT) */ + + /* + * Sentinel to mark the actual end of the list (rpe_entitlement == NULL) + */ + { + .rpe_port = 0, + .rpe_flags = 0, + .rpe_entitlement = NULL, + } +}; + +#define RPE_ENTRY_COUNT (sizeof(restricted_port_list) / sizeof(restricted_port_list[0])) + +SYSCTL_NODE(_net, OID_AUTO, restricted_port, + CTLFLAG_RW | CTLFLAG_LOCKED, 0, "restricted port"); + +static int sysctl_restricted_port_bitmap SYSCTL_HANDLER_ARGS; +static int sysctl_restricted_port_enforced SYSCTL_HANDLER_ARGS; +static int sysctl_restricted_port_verbose SYSCTL_HANDLER_ARGS; + +SYSCTL_PROC(_net_restricted_port, OID_AUTO, bitmap, + CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED, + 0, 0, &sysctl_restricted_port_bitmap, "", ""); + +/* + * In order to set the following sysctl variables the process needs to run as superuser + * or have the entitlement ENTITLEMENT_TEST_CONTROL + */ +#if (DEBUG || DEVELOPMENT) +static int restricted_port_enforced = 1; +SYSCTL_PROC(_net_restricted_port, OID_AUTO, enforced, + CTLTYPE_INT | CTLFLAG_LOCKED | CTLFLAG_RW | CTLFLAG_ANYBODY, + 0, 0, &sysctl_restricted_port_enforced, "I", ""); +#else /* (DEBUG || DEVELOPMENT) */ +const int restricted_port_enforced = 1; +SYSCTL_PROC(_net_restricted_port, OID_AUTO, enforced, + CTLTYPE_INT | CTLFLAG_LOCKED | CTLFLAG_RD, + 0, 0, &sysctl_restricted_port_enforced, "I", ""); +#endif /* (DEBUG || DEVELOPMENT) */ + +static int restricted_port_verbose = 0; +SYSCTL_PROC(_net_restricted_port, OID_AUTO, verbose, + CTLTYPE_INT | CTLFLAG_LOCKED | CTLFLAG_RW | CTLFLAG_ANYBODY, + 0, 0, &sysctl_restricted_port_verbose, "I", ""); + +#if (DEBUG || DEVELOPMENT) + +/* + * Register dynamically a test port set by the unit test program to avoid conflict with + * a restricted port currently used by its legetimate process. + * The value must be passed is in host byte order. + */ +static uint16_t restricted_port_test = 0; + +static int sysctl_restricted_port_test_entitlement SYSCTL_HANDLER_ARGS; +static int sysctl_restricted_port_test_superuser SYSCTL_HANDLER_ARGS; + +SYSCTL_PROC(_net_restricted_port, OID_AUTO, test_entitlement, + CTLTYPE_INT | CTLFLAG_LOCKED | CTLFLAG_RW | CTLFLAG_ANYBODY, + 0, 0, &sysctl_restricted_port_test_entitlement, "UI", ""); + +SYSCTL_PROC(_net_restricted_port, OID_AUTO, test_superuser, + CTLTYPE_INT | CTLFLAG_LOCKED | CTLFLAG_RW | CTLFLAG_ANYBODY, + 0, 0, &sysctl_restricted_port_test_superuser, "UI", ""); +#endif /* (DEBUG || DEVELOPMENT) */ + +static int +sysctl_restricted_port_bitmap SYSCTL_HANDLER_ARGS +{ +#pragma unused(oidp, arg1, arg2) + + if (req->newptr) { + return EPERM; + } + int error = SYSCTL_OUT(req, restricted_port_bitmap, BITMAP_SIZE(UINT16_MAX)); + + return error; +} + +static int +sysctl_restricted_port_enforced SYSCTL_HANDLER_ARGS +{ +#pragma unused(arg1, arg2) + int old_value = restricted_port_enforced; + int value = old_value; + + int error = sysctl_handle_int(oidp, &value, 0, req); + if (error != 0 || !req->newptr) { + return error; + } +#if (DEBUG || DEVELOPMENT) + if (proc_suser(current_proc()) != 0 && + !IOTaskHasEntitlement(current_task(), ENTITLEMENT_TEST_CONTROL)) { + return EPERM; + } + restricted_port_enforced = value; + os_log(OS_LOG_DEFAULT, + "%s:%u sysctl net.restricted_port.enforced: %d -> %d", + proc_best_name(current_proc()), proc_selfpid(), + old_value, restricted_port_enforced); + return error; +#else + return EPERM; +#endif /* (DEBUG || DEVELOPMENT) */ +} + +static int +sysctl_restricted_port_verbose SYSCTL_HANDLER_ARGS +{ +#pragma unused(arg1, arg2) + int old_value = restricted_port_verbose; + int value = old_value; + + int error = sysctl_handle_int(oidp, &value, 0, req); + if (error != 0 || !req->newptr) { + return error; + } + if (proc_suser(current_proc()) != 0 && + !IOTaskHasEntitlement(current_task(), ENTITLEMENT_TEST_CONTROL)) { + return EPERM; + } + restricted_port_verbose = value; + os_log(OS_LOG_DEFAULT, + "%s:%u sysctl net.restricted_port.verbose: %d -> %d)", + proc_best_name(current_proc()), proc_selfpid(), + old_value, restricted_port_verbose); + + return error; +} + +#if (DEBUG || DEVELOPMENT) + +static int +sysctl_restricted_port_test_common(struct sysctl_oid *oidp, + struct sysctl_req *req, bool test_superuser) +{ + uint16_t old_value = restricted_port_test; + int value = old_value; + unsigned int i; + + int error = sysctl_handle_int(oidp, &value, 0, req); + if (error != 0 || !req->newptr) { + return error; + } + if (proc_suser(current_proc()) != 0 && + !IOTaskHasEntitlement(current_task(), ENTITLEMENT_TEST_CONTROL)) { + return EPERM; + } + if (value < 0 || value > UINT16_MAX) { + return EINVAL; + } + if (value == 0) { + /* + * Clear the current test port entries + */ + if (restricted_port_test != 0) { + for (i = 0; i < RPE_ENTRY_COUNT; i++) { + struct restricted_port_entry *rpe = &restricted_port_list[i]; + + if (rpe->rpe_entitlement == NULL) { + break; + } + if (!(rpe->rpe_flags & RPE_FLAG_TEST)) { + continue; + } + rpe->rpe_port = 0; + rpe->rpe_flags &= ~(RPE_FLAG_ENTITLEMENT | RPE_FLAG_SUPERUSER); + } + bitmap_clear(restricted_port_bitmap, restricted_port_test); + restricted_port_test = 0; + } + } else { + for (i = 0; i < RPE_ENTRY_COUNT; i++) { + struct restricted_port_entry *rpe = &restricted_port_list[i]; + + if (rpe->rpe_entitlement == NULL) { + break; + } + if (!(rpe->rpe_flags & RPE_FLAG_TEST)) { + continue; + } + rpe->rpe_port = value; + if (test_superuser) { + rpe->rpe_flags |= RPE_FLAG_SUPERUSER; + rpe->rpe_flags &= ~RPE_FLAG_ENTITLEMENT; + } else { + rpe->rpe_flags |= RPE_FLAG_ENTITLEMENT; + rpe->rpe_flags &= ~RPE_FLAG_SUPERUSER; + } + } + restricted_port_test = (uint16_t)value; + bitmap_set(restricted_port_bitmap, restricted_port_test); + } + + return 0; +} + +static int +sysctl_restricted_port_test_entitlement SYSCTL_HANDLER_ARGS +{ +#pragma unused(arg1, arg2) + uint16_t old_value = restricted_port_test; + int error; + + error = sysctl_restricted_port_test_common(oidp, req, false); + if (error == 0) { + os_log(OS_LOG_DEFAULT, + "%s:%u sysctl net.restricted_port.test_entitlement: %u -> %u)", + proc_best_name(current_proc()), proc_selfpid(), + old_value, restricted_port_test); + } + return error; +} + +static int +sysctl_restricted_port_test_superuser SYSCTL_HANDLER_ARGS +{ +#pragma unused(arg1, arg2) + uint16_t old_value = restricted_port_test; + int error; + + error = sysctl_restricted_port_test_common(oidp, req, true); + if (error == 0) { + os_log(OS_LOG_DEFAULT, + "%s:%u sysctl net.restricted_port.test_superuser: %u -> %u)", + proc_best_name(current_proc()), proc_selfpid(), + old_value, restricted_port_test); + } + return error; +} + +#endif /* (DEBUG || DEVELOPMENT) */ + +void +restricted_in_port_init(void) +{ + unsigned int i; + + + restricted_port_bitmap = bitmap_alloc(UINT16_MAX); + + if (restricted_port_bitmap == NULL) { + panic("restricted_port_init: bitmap allocation failed"); + } + + for (i = 0; i < RPE_ENTRY_COUNT; i++) { + struct restricted_port_entry *rpe = &restricted_port_list[i]; + + if (rpe->rpe_entitlement == NULL) { + break; + } + if (rpe->rpe_port == 0) { + continue; + } + bitmap_set(restricted_port_bitmap, rpe->rpe_port); + } +} + +static const char * +port_flag_str(uint32_t port_flags) +{ + switch (port_flags) { + case PORT_FLAGS_LISTENER: + return "listener"; + case PORT_FLAGS_BSD: + return "bsd"; + case PORT_FLAGS_PF: + return "pf"; + default: + break; + } + return "?"; +} + +/* + * The port is passed in network byte order + */ +bool +current_task_can_use_restricted_in_port(in_port_t port, uint8_t protocol, uint32_t port_flags) +{ + unsigned int i; + struct proc *p = current_proc(); + pid_t pid = proc_pid(p); + + /* + * Quick check that does not take in account the protocol + */ + if (!IS_RESTRICTED_IN_PORT(port) || restricted_port_enforced == 0) { + if (restricted_port_verbose > 1) { + os_log(OS_LOG_DEFAULT, + "port %u for protocol %u via %s can be used by process %s:%u", + ntohs(port), protocol, port_flag_str(port_flags), proc_best_name(p), pid); + } + return true; + } + + for (i = 0; i < RPE_ENTRY_COUNT; i++) { + struct restricted_port_entry *rpe = &restricted_port_list[i]; + + if (rpe->rpe_entitlement == NULL) { + break; + } + if (rpe->rpe_port == 0) { + continue; + } + if ((protocol == IPPROTO_TCP && !(rpe->rpe_flags & RPE_FLAG_TCP)) || + (protocol == IPPROTO_UDP && !(rpe->rpe_flags & RPE_FLAG_UDP))) { + continue; + } + if (rpe->rpe_port != ntohs(port)) { + continue; + } + /* + * Found an entry in the list of restricted ports + * + * A process can use a restricted port if it meets at least one of + * the following conditions: + * - The process has the required entitlement + * - The port is marked as usable by root + */ + task_t task = current_task(); + if (rpe->rpe_flags & RPE_FLAG_SUPERUSER) { + if (task == kernel_task || proc_suser(current_proc()) == 0) { + os_log(OS_LOG_DEFAULT, + "root restricted port %u for protocol %u via %s can be used by superuser process %s:%u", + ntohs(port), protocol, port_flag_str(port_flags), proc_best_name(p), pid); + return true; + } + } + if (rpe->rpe_flags & RPE_FLAG_ENTITLEMENT) { + /* + * Do not let the kernel use the port because there is + * no entitlement for kernel extensions + */ + if (task == kernel_task) { + os_log(OS_LOG_DEFAULT, + "entitlement restricted port %u for protocol %u via %s cannot be used by kernel", + ntohs(port), protocol, port_flag_str(port_flags)); + return false; + } + if (!IOTaskHasEntitlement(current_task(), rpe->rpe_entitlement)) { + os_log(OS_LOG_DEFAULT, + "entitlement restricted port %u for protocol %u via %s cannot be used by process %s:%u -- IOTaskHasEntitlement(%s) failed", + ntohs(port), protocol, port_flag_str(port_flags), proc_best_name(p), pid, rpe->rpe_entitlement); + return false; + } + os_log(OS_LOG_DEFAULT, + "entitlement restricted port %u for protocol %u via %s can be used by process %s:%u", + ntohs(port), protocol, port_flag_str(port_flags), proc_best_name(p), pid); + return true; + } + os_log(OS_LOG_DEFAULT, + "root restricted port %u for protocol %u via %s cannot be used by process %s:%u", + ntohs(port), protocol, port_flag_str(port_flags), proc_best_name(p), pid); + return false; + } + if (restricted_port_verbose > 1) { + os_log(OS_LOG_DEFAULT, + "port %u for protocol %u via %s can be used by process %s:%u", + ntohs(port), protocol, port_flag_str(port_flags), proc_best_name(p), pid); + } + return true; +} diff --git a/bsd/net/restricted_in_port.h b/bsd/net/restricted_in_port.h new file mode 100644 index 000000000..2520d9ba4 --- /dev/null +++ b/bsd/net/restricted_in_port.h @@ -0,0 +1,57 @@ +/* + * Copyright (c) 2019 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#ifndef _NETINET_IN_RESTRICTED_PORT_H_ +#define _NETINET_IN_RESTRICTED_PORT_H_ + +#ifdef BSD_KERNEL_PRIVATE + +#include + +#define PORT_FLAGS_LISTENER 0x00 +#define PORT_FLAGS_BSD 0x02 +#define PORT_FLAGS_PF 0x03 +#define PORT_FLAGS_MAX 0x03 + +/* + * the port in network byte order + */ +#define IS_RESTRICTED_IN_PORT(x) (bitmap_test(restricted_port_bitmap, ntohs((uint16_t)(x)))) + +extern bitmap_t *restricted_port_bitmap; + +extern void restricted_in_port_init(void); + +/* + * The port must be in network byte order + */ +extern bool current_task_can_use_restricted_in_port(in_port_t port, uint8_t protocol, uint32_t port_flags); + +#endif /* BSD_KERNEL_PRIVATE */ + +#endif /* _NETINET_IN_RESTRICTED_PORT_H_ */ diff --git a/bsd/net/route.c b/bsd/net/route.c index bba83ba46..b63ba0183 100644 --- a/bsd/net/route.c +++ b/bsd/net/route.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2018 Apple Inc. All rights reserved. + * Copyright (c) 2000-2019 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -76,6 +76,7 @@ #include #include #include +#include #include #include #include @@ -215,7 +216,14 @@ extern void kdp_set_gateway_mac(void *gatewaymac); -__private_extern__ struct rtstat rtstat = { 0, 0, 0, 0, 0, 0 }; +__private_extern__ struct rtstat rtstat = { + .rts_badredirect = 0, + .rts_dynamic = 0, + .rts_newgateway = 0, + .rts_unreach = 0, + .rts_wildcard = 0, + .rts_badrtgwroute = 0 +}; struct radix_node_head *rt_tables[AF_MAX+1]; decl_lck_mtx_data(, rnh_lock_data); /* global routing tables mutex */ @@ -231,6 +239,7 @@ static lck_grp_attr_t *rte_mtx_grp_attr; int rttrash = 0; /* routes not in table but not freed */ +boolean_t trigger_v6_defrtr_select = FALSE; unsigned int rte_debug = 0; /* Possible flags for rte_debug */ @@ -362,11 +371,18 @@ struct matchleaf_arg { * of sockaddr_in for convenience). */ static struct sockaddr sin_def = { - sizeof (struct sockaddr_in), AF_INET, { 0, } + .sa_len = sizeof (struct sockaddr_in), + .sa_family = AF_INET, + .sa_data = { 0, } }; static struct sockaddr_in6 sin6_def = { - sizeof (struct sockaddr_in6), AF_INET6, 0, 0, IN6ADDR_ANY_INIT, 0 + .sin6_len = sizeof (struct sockaddr_in6), + .sin6_family = AF_INET6, + .sin6_port = 0, + .sin6_flowinfo = 0, + .sin6_addr = IN6ADDR_ANY_INIT, + .sin6_scope_id = 0 }; /* @@ -1765,6 +1781,10 @@ rtrequest_common_locked(int req, struct sockaddr *dst0, #define senderr(x) { error = x; goto bad; } + DTRACE_ROUTE6(rtrequest, int, req, struct sockaddr *, dst0, + struct sockaddr *, gateway, struct sockaddr *, netmask, + int, flags, unsigned int, ifscope); + LCK_MTX_ASSERT(rnh_lock, LCK_MTX_ASSERT_OWNED); /* * Find the correct routing tree to use for this Address Family @@ -1930,6 +1950,10 @@ rtrequest_common_locked(int req, struct sockaddr *dst0, if (rt_primary_default(rt, rt_key(rt))) { set_primary_ifscope(rt_key(rt)->sa_family, IFSCOPE_NONE); + if ((rt->rt_flags & RTF_STATIC) && + rt_key(rt)->sa_family == PF_INET6) { + trigger_v6_defrtr_select = TRUE; + } } #if NECP @@ -2453,7 +2477,7 @@ delete_rt: * Round up sockaddr len to multiples of 32-bytes. This will reduce * or even eliminate the need to re-allocate the chunk of memory used * for rt_key and rt_gateway in the event the gateway portion changes. - * Certain code paths (e.g. IPSec) are notorious for caching the address + * Certain code paths (e.g. IPsec) are notorious for caching the address * of rt_gateway; this rounding-up would help ensure that the gateway * portion never gets deallocated (though it may change contents) and * thus greatly simplifies things. @@ -2823,7 +2847,7 @@ node_lookup(struct sockaddr *dst, struct sockaddr *netmask, struct radix_node *rn; struct sockaddr_storage ss, mask; int af = dst->sa_family; - struct matchleaf_arg ma = { ifscope }; + struct matchleaf_arg ma = { .ifscope = ifscope }; rn_matchf_t *f = rn_match_ifscope; void *w = &ma; @@ -4410,7 +4434,7 @@ route_op_entitlement_check(struct socket *so, * allowed accesses. */ if (soopt_cred_check(so, PRIV_NET_RESTRICTED_ROUTE_NC_READ, - allow_root) == 0) + allow_root, false) == 0) return (0); else return (-1); diff --git a/bsd/net/route.h b/bsd/net/route.h index 5b4ea82ed..d1406262f 100644 --- a/bsd/net/route.h +++ b/bsd/net/route.h @@ -107,6 +107,7 @@ struct route_old { #include #include +extern boolean_t trigger_v6_defrtr_select; /* * Kernel resident routing tables. * diff --git a/bsd/net/rtsock.c b/bsd/net/rtsock.c index 23d7bf201..a8b260159 100644 --- a/bsd/net/rtsock.c +++ b/bsd/net/rtsock.c @@ -94,9 +94,9 @@ static struct domain *routedomain = NULL; MALLOC_DEFINE(M_RTABLE, "routetbl", "routing tables"); -static struct sockaddr route_dst = { 2, PF_ROUTE, { 0, } }; -static struct sockaddr route_src = { 2, PF_ROUTE, { 0, } }; -static struct sockaddr sa_zero = { sizeof(sa_zero), AF_INET, { 0, } }; +static struct sockaddr route_dst = { .sa_len = 2, .sa_family = PF_ROUTE, .sa_data = { 0, } }; +static struct sockaddr route_src = { .sa_len = 2, .sa_family = PF_ROUTE, .sa_data = { 0, } }; +static struct sockaddr sa_zero = { .sa_len = sizeof(sa_zero), .sa_family = AF_INET, .sa_data = { 0, } }; struct route_cb { u_int32_t ip_count; /* attached w/ AF_INET */ @@ -160,6 +160,9 @@ SYSCTL_NODE(_net, OID_AUTO, route, CTLFLAG_RW | CTLFLAG_LOCKED, 0, "routing"); #define ADVANCE32(x, n) \ (x += ROUNDUP32((n)->sa_len)) +#define RT_HAS_IFADDR(rt) \ + ((rt)->rt_ifa != NULL && (rt)->rt_ifa->ifa_addr != NULL) + /* * It really doesn't make any sense at all for this code to share much * with raw_usrreq.c, since its functionality is so restricted. XXX @@ -383,7 +386,7 @@ route_output(struct mbuf *m, struct socket *so) } if (info.rti_info[RTAX_DST]->sa_family == AF_INET && - info.rti_info[RTAX_DST]->sa_len != sizeof(dst_in)) { + info.rti_info[RTAX_DST]->sa_len != sizeof(struct sockaddr_in)) { /* At minimum, we need up to sin_addr */ if (info.rti_info[RTAX_DST]->sa_len < offsetof(struct sockaddr_in, sin_zero)) { @@ -396,22 +399,29 @@ route_output(struct mbuf *m, struct socket *so) dst_in.sin_addr = SIN(info.rti_info[RTAX_DST])->sin_addr; info.rti_info[RTAX_DST] = (struct sockaddr *)&dst_in; dst_sa_family = info.rti_info[RTAX_DST]->sa_family; + } else if (info.rti_info[RTAX_DST]->sa_family == AF_INET6 && + info.rti_info[RTAX_DST]->sa_len < sizeof(struct sockaddr_in6)) { + senderr(EINVAL); } - if (info.rti_info[RTAX_GATEWAY] != NULL && - info.rti_info[RTAX_GATEWAY]->sa_family == AF_INET && - info.rti_info[RTAX_GATEWAY]->sa_len != sizeof(gate_in)) { - /* At minimum, we need up to sin_addr */ - if (info.rti_info[RTAX_GATEWAY]->sa_len < - offsetof(struct sockaddr_in, sin_zero)) { + if (info.rti_info[RTAX_GATEWAY] != NULL) { + if (info.rti_info[RTAX_GATEWAY]->sa_family == AF_INET && + info.rti_info[RTAX_GATEWAY]->sa_len != sizeof(struct sockaddr_in)) { + /* At minimum, we need up to sin_addr */ + if (info.rti_info[RTAX_GATEWAY]->sa_len < + offsetof(struct sockaddr_in, sin_zero)) { + senderr(EINVAL); + } + bzero(&gate_in, sizeof(gate_in)); + gate_in.sin_len = sizeof(gate_in); + gate_in.sin_family = AF_INET; + gate_in.sin_port = SIN(info.rti_info[RTAX_GATEWAY])->sin_port; + gate_in.sin_addr = SIN(info.rti_info[RTAX_GATEWAY])->sin_addr; + info.rti_info[RTAX_GATEWAY] = (struct sockaddr *)&gate_in; + } else if (info.rti_info[RTAX_GATEWAY]->sa_family == AF_INET6 && + info.rti_info[RTAX_GATEWAY]->sa_len < sizeof(struct sockaddr_in6)) { senderr(EINVAL); } - bzero(&gate_in, sizeof(gate_in)); - gate_in.sin_len = sizeof(gate_in); - gate_in.sin_family = AF_INET; - gate_in.sin_port = SIN(info.rti_info[RTAX_GATEWAY])->sin_port; - gate_in.sin_addr = SIN(info.rti_info[RTAX_GATEWAY])->sin_addr; - info.rti_info[RTAX_GATEWAY] = (struct sockaddr *)&gate_in; } if (info.rti_info[RTAX_GENMASK]) { @@ -755,7 +765,7 @@ flush: return error; } } else { - struct sockproto route_proto = { PF_ROUTE, 0 }; + struct sockproto route_proto = { .sp_family = PF_ROUTE, .sp_protocol = 0 }; if (rp != NULL) { rp->rcb_proto.sp_family = 0; /* Avoid us */ } @@ -1315,7 +1325,7 @@ rt_missmsg(int type, struct rt_addrinfo *rtinfo, int flags, int error) struct rt_msghdr *rtm; struct mbuf *m; struct sockaddr *sa = rtinfo->rti_info[RTAX_DST]; - struct sockproto route_proto = { PF_ROUTE, 0 }; + struct sockproto route_proto = { .sp_family = PF_ROUTE, .sp_protocol = 0 }; if (route_cb.any_count == 0) { return; @@ -1342,7 +1352,7 @@ rt_ifmsg(struct ifnet *ifp) struct if_msghdr *ifm; struct mbuf *m; struct rt_addrinfo info; - struct sockproto route_proto = { PF_ROUTE, 0 }; + struct sockproto route_proto = { .sp_family = PF_ROUTE, .sp_protocol = 0 }; if (route_cb.any_count == 0) { return; @@ -1379,7 +1389,7 @@ rt_newaddrmsg(int cmd, struct ifaddr *ifa, int error, struct rtentry *rt) int pass; struct mbuf *m = 0; struct ifnet *ifp = ifa->ifa_ifp; - struct sockproto route_proto = { PF_ROUTE, 0 }; + struct sockproto route_proto = { .sp_family = PF_ROUTE, .sp_protocol = 0 }; LCK_MTX_ASSERT(rnh_lock, LCK_MTX_ASSERT_OWNED); RT_LOCK_ASSERT_HELD(rt); @@ -1461,7 +1471,7 @@ rt_newmaddrmsg(int cmd, struct ifmultiaddr *ifma) struct mbuf *m = 0; struct ifnet *ifp = ifma->ifma_ifp; struct ifma_msghdr *ifmam; - struct sockproto route_proto = { PF_ROUTE, 0 }; + struct sockproto route_proto = { .sp_family = PF_ROUTE, .sp_protocol = 0 }; if (route_cb.any_count == 0) { return; @@ -1608,6 +1618,9 @@ sysctl_dumpentry(struct radix_node *rn, void *vw) info.rti_info[RTAX_GATEWAY] = rt->rt_gateway; info.rti_info[RTAX_NETMASK] = rt_mask(rt); info.rti_info[RTAX_GENMASK] = rt->rt_genmask; + if (RT_HAS_IFADDR(rt)) { + info.rti_info[RTAX_IFA] = rt->rt_ifa->ifa_addr; + } if (w->w_op != NET_RT_DUMP2) { size = rt_msg2(RTM_GET, &info, NULL, w, credp); diff --git a/bsd/net/sixxlowpan.c b/bsd/net/sixxlowpan.c new file mode 100644 index 000000000..8ccaab009 --- /dev/null +++ b/bsd/net/sixxlowpan.c @@ -0,0 +1,897 @@ +/* + * Copyright (c) 2017 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ +/* + * Copyright (c) 2008, Swedish Institute of Computer Science. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the Institute nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE INSTITUTE AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE INSTITUTE OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * This file is part of the Contiki operating system. + * + */ + +/** + * \file + * Header file for the 6lowpan implementation + * (RFC4944 and draft-hui-6lowpan-hc-01) + * \author Adam Dunkels + * \author Nicolas Tsiftes + * \author Niclas Finne + * \author Mathilde Durvy + * \author Julien Abeille + */ + + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +#include +#include + +errno_t +compress_hdr_hc1(struct frame802154 *, u_int8_t *, + long *, size_t *, u_int8_t *); +errno_t +uncompress_hdr_hc1(struct frame802154 *, u_int8_t *, + uint16_t, long *, size_t *, u_int8_t *); + + + +/** + * \addtogroup sicslowpan + * @{ + */ + +/** + * \name General sicslowpan defines + * @{ + */ +/* Min and Max compressible UDP ports - HC06 */ +#define SICSLOWPAN_UDP_PORT_MIN 0xF0B0 +#define SICSLOWPAN_UDP_PORT_MAX 0xF0BF /* F0B0 + 15 */ + +/** @} */ + +/** + * \name 6lowpan compressions + * @{ + */ +#define SICSLOWPAN_COMPRESSION_IPV6 0 +#define SICSLOWPAN_COMPRESSION_HC1 1 +#define SICSLOWPAN_COMPRESSION_HC06 2 +/** @} */ + +/** + * \name 6lowpan dispatches + * @{ + */ +#define SICSLOWPAN_DISPATCH_IPV6 0x41 /* 01000001 = 65 */ +#define SICSLOWPAN_DISPATCH_HC1 0x42 /* 01000010 = 66 */ +#define SICSLOWPAN_DISPATCH_IPHC 0x60 /* 011xxxxx = ... */ +#define SICSLOWPAN_DISPATCH_FRAG1 0xc0 /* 11000xxx */ +#define SICSLOWPAN_DISPATCH_FRAGN 0xe0 /* 11100xxx */ +/** @} */ + +/** \name HC1 encoding + * @{ + */ +#define SICSLOWPAN_HC1_NH_UDP 0x02 +#define SICSLOWPAN_HC1_NH_TCP 0x06 +#define SICSLOWPAN_HC1_NH_ICMP6 0x04 +/** @} */ + +/** \name HC_UDP encoding (works together with HC1) + * @{ + */ +#define SICSLOWPAN_HC_UDP_ALL_C 0xE0 +/** @} */ + +/** + * \name IPHC encoding + * @{ + */ +/* + * Values of fields within the IPHC encoding first byte + * (C stands for compressed and I for inline) + */ +#define SICSLOWPAN_IPHC_FL_C 0x10 +#define SICSLOWPAN_IPHC_TC_C 0x08 +#define SICSLOWPAN_IPHC_NH_C 0x04 +#define SICSLOWPAN_IPHC_TTL_1 0x01 +#define SICSLOWPAN_IPHC_TTL_64 0x02 +#define SICSLOWPAN_IPHC_TTL_255 0x03 +#define SICSLOWPAN_IPHC_TTL_I 0x00 + + +/* Values of fields within the IPHC encoding second byte */ +#define SICSLOWPAN_IPHC_CID 0x80 + +#define SICSLOWPAN_IPHC_SAC 0x40 +#define SICSLOWPAN_IPHC_SAM_00 0x00 +#define SICSLOWPAN_IPHC_SAM_01 0x10 +#define SICSLOWPAN_IPHC_SAM_10 0x20 +#define SICSLOWPAN_IPHC_SAM_11 0x30 + +#define SICSLOWPAN_IPHC_SAM_BIT 4 + +#define SICSLOWPAN_IPHC_M 0x08 +#define SICSLOWPAN_IPHC_DAC 0x04 +#define SICSLOWPAN_IPHC_DAM_00 0x00 +#define SICSLOWPAN_IPHC_DAM_01 0x01 +#define SICSLOWPAN_IPHC_DAM_10 0x02 +#define SICSLOWPAN_IPHC_DAM_11 0x03 + +#define SICSLOWPAN_IPHC_DAM_BIT 0 + +/* Link local context number */ +#define SICSLOWPAN_IPHC_ADDR_CONTEXT_LL 0 +/* 16-bit multicast addresses compression */ +#define SICSLOWPAN_IPHC_MCAST_RANGE 0xA0 +/** @} */ + +/* NHC_EXT_HDR */ +#define SICSLOWPAN_NHC_MASK 0xF0 +#define SICSLOWPAN_NHC_EXT_HDR 0xE0 + +/** + * \name LOWPAN_UDP encoding (works together with IPHC) + * @{ + */ +/** + * \name LOWPAN_UDP encoding (works together with IPHC) + * @{ + */ +#define SICSLOWPAN_NHC_UDP_MASK 0xF8 +#define SICSLOWPAN_NHC_UDP_ID 0xF0 +#define SICSLOWPAN_NHC_UDP_CHECKSUMC 0x04 +#define SICSLOWPAN_NHC_UDP_CHECKSUMI 0x00 +/* values for port compression, _with checksum_ ie bit 5 set to 0 */ +#define SICSLOWPAN_NHC_UDP_CS_P_00 0xF0 /* all inline */ +#define SICSLOWPAN_NHC_UDP_CS_P_01 0xF1 /* source 16bit inline, dest = 0xF0 + 8 bit inline */ +#define SICSLOWPAN_NHC_UDP_CS_P_10 0xF2 /* source = 0xF0 + 8bit inline, dest = 16 bit inline */ +#define SICSLOWPAN_NHC_UDP_CS_P_11 0xF3 /* source & dest = 0xF0B + 4bit inline */ +/** @} */ + + +/** + * \name The 6lowpan "headers" length + * @{ + */ + +#define SICSLOWPAN_IPV6_HDR_LEN 1 /*one byte*/ +#define SICSLOWPAN_HC1_HDR_LEN 3 +#define SICSLOWPAN_HC1_HC_UDP_HDR_LEN 7 +#define SICSLOWPAN_FRAG1_HDR_LEN 4 +#define SICSLOWPAN_FRAGN_HDR_LEN 5 + +// Minimum size of the compressed 6LoWPAN header length +#define SICSLOWPAN_MIN_COMP_HDR_LEN 7 + +// Minimum size of the uncompressed IPv6 header length +#define SICSLOWPAN_MIN_UNCOMP_HDR_LEN 40 + + +#define UIP_IPH_LEN 40 +#define UIP_UDPH_LEN 8 /* Size of UDP header */ +#define UIP_TCPH_LEN 20 /* Size of TCP header */ +#define UIP_ICMPH_LEN 4 /* Size of ICMP header */ + +/** @} */ + +/** + * \brief The header for fragments + * \note We do not define different structures for FRAG1 + * and FRAGN headers, which are different. For FRAG1, the + * offset field is just not used + */ +/* struct sicslowpan_frag_hdr { */ +/* uint16_t dispatch_size; */ +/* uint16_t tag; */ +/* uint8_t offset; */ +/* }; */ + +/** + * \brief The HC1 header when HC_UDP is not used + * + * When all fields are compressed and HC_UDP is not used, + * we use this structure. If HC_UDP is used, the ttl is + * in another spot, and we use the sicslowpan_hc1_hc_udp + * structure + */ +/* struct sicslowpan_hc1_hdr { */ +/* uint8_t dispatch; */ +/* uint8_t encoding; */ +/* uint8_t ttl; */ +/* }; */ + +/** + * \brief HC1 followed by HC_UDP + */ +/* struct sicslowpan_hc1_hc_udp_hdr { */ +/* uint8_t dispatch; */ +/* uint8_t hc1_encoding; */ +/* uint8_t hc_udp_encoding; */ +/* uint8_t ttl; */ +/* uint8_t ports; */ +/* uint16_t udpchksum; */ +/* }; */ + +/** + * \brief An address context for IPHC address compression + * each context can have upto 8 bytes + */ +struct sicslowpan_addr_context { + uint8_t used; /* possibly use as prefix-length */ + uint8_t number; + uint8_t prefix[8]; +}; + +/** + * \name Address compressibility test functions + * @{ + */ + +/** + * \brief check whether we can compress the IID in + * address 'a' to 16 bits. + * This is used for unicast addresses only, and is true + * if the address is on the format \::0000:00ff:fe00:XXXX + * NOTE: we currently assume 64-bits prefixes + */ +#define sicslowpan_is_iid_16_bit_compressable(a) \ +((((a)->u16[4]) == 0) && \ +(((a)->u8[10]) == 0)&& \ +(((a)->u8[11]) == 0xff)&& \ +(((a)->u8[12]) == 0xfe)&& \ +(((a)->u8[13]) == 0)) + +/** + * \brief check whether the 9-bit group-id of the + * compressed multicast address is known. It is true + * if the 9-bit group is the all nodes or all routers + * group. + * \param a is typed uint8_t * + */ +#define sicslowpan_is_mcast_addr_decompressable(a) \ +(((*a & 0x01) == 0) && \ +((*(a + 1) == 0x01) || (*(a + 1) == 0x02))) + +/** + * \brief check whether the 112-bit group-id of the + * multicast address is mappable to a 9-bit group-id + * It is true if the group is the all nodes or all + * routers group. + */ +#define sicslowpan_is_mcast_addr_compressable(a) \ +((((a)->u16[1]) == 0) && \ +(((a)->u16[2]) == 0) && \ +(((a)->u16[3]) == 0) && \ +(((a)->u16[4]) == 0) && \ +(((a)->u16[5]) == 0) && \ +(((a)->u16[6]) == 0) && \ +(((a)->u8[14]) == 0) && \ +((((a)->u8[15]) == 1) || (((a)->u8[15]) == 2))) + +/* FFXX::00XX:XXXX:XXXX */ +#define sicslowpan_is_mcast_addr_compressable48(a) \ +((((a)->u16[1]) == 0) && \ +(((a)->u16[2]) == 0) && \ +(((a)->u16[3]) == 0) && \ +(((a)->u16[4]) == 0) && \ +(((a)->u8[10]) == 0)) + +/* FFXX::00XX:XXXX */ +#define sicslowpan_is_mcast_addr_compressable32(a) \ +((((a)->u16[1]) == 0) && \ +(((a)->u16[2]) == 0) && \ +(((a)->u16[3]) == 0) && \ +(((a)->u16[4]) == 0) && \ +(((a)->u16[5]) == 0) && \ +(((a)->u8[12]) == 0)) + +/* FF02::00XX */ +#define sicslowpan_is_mcast_addr_compressable8(a) \ +((((a)->u8[1]) == 2) && \ +(((a)->u16[1]) == 0) && \ +(((a)->u16[2]) == 0) && \ +(((a)->u16[3]) == 0) && \ +(((a)->u16[4]) == 0) && \ +(((a)->u16[5]) == 0) && \ +(((a)->u16[6]) == 0) && \ +(((a)->u8[14]) == 0)) + +#define uip_is_addr_mac_addr_based(a, m) \ +((((a)->s6_addr[8]) == (((m)[0]) ^ 0x02)) && \ +(((a)->s6_addr[9]) == (m)[1]) && \ +(((a)->s6_addr[10]) == (m)[2]) && \ +(((a)->s6_addr[11]) == (m)[3]) && \ +(((a)->s6_addr[12]) == (m)[4]) && \ +(((a)->s6_addr[13]) == (m)[5]) && \ +(((a)->s6_addr[14]) == (m)[6]) && \ +(((a)->s6_addr[15]) == (m)[7])) + +/** + * Construct an IPv6 address from eight 16-bit words. + * + * This function constructs an IPv6 address. + * + * \hideinitializer + */ +#define uip_ip6addr(addr, addr0, addr1, addr2, addr3, addr4, addr5, addr6, addr7) do {\ +(addr)->s6_addr[0] = htons(addr0); \ +(addr)->s6_addr[1] = htons(addr1); \ +(addr)->s6_addr[2] = htons(addr2); \ +(addr)->s6_addr[3] = htons(addr3); \ +(addr)->s6_addr[4] = htons(addr4); \ +(addr)->s6_addr[5] = htons(addr5); \ +(addr)->s6_addr[6] = htons(addr6); \ +(addr)->s6_addr[7] = htons(addr7); \ +} while(0) + +/** + * Construct an IPv6 address from sixteen 8-bit words. + * + * This function constructs an IPv6 address. + * + * \hideinitializer + */ +#define uip_ip6addr_u8(addr, addr0, addr1, addr2, addr3, addr4, addr5, addr6, addr7, addr8, addr9, addr10, addr11, addr12, addr13, addr14, addr15) do {\ +(addr)->s6_addr[0] = addr0; \ +(addr)->s6_addr[1] = addr1; \ +(addr)->s6_addr[2] = addr2; \ +(addr)->s6_addr[3] = addr3; \ +(addr)->s6_addr[4] = addr4; \ +(addr)->s6_addr[5] = addr5; \ +(addr)->s6_addr[6] = addr6; \ +(addr)->s6_addr[7] = addr7; \ +(addr)->s6_addr[8] = addr8; \ +(addr)->s6_addr[9] = addr9; \ +(addr)->s6_addr[10] = addr10; \ +(addr)->s6_addr[11] = addr11; \ +(addr)->s6_addr[12] = addr12; \ +(addr)->s6_addr[13] = addr13; \ +(addr)->s6_addr[14] = addr14; \ +(addr)->s6_addr[15] = addr15; \ +} while(0) + + + +/** \brief 16 bit 802.15.4 address */ +typedef struct uip_802154_shortaddr { + uint8_t addr[2]; +} uip_802154_shortaddr; +/** \brief 64 bit 802.15.4 address */ +typedef struct uip_802154_longaddr { + uint8_t addr[8]; +} uip_802154_longaddr; + +/** \brief 802.11 address */ +typedef struct uip_80211_addr { + uint8_t addr[6]; +} uip_80211_addr; + +/** \brief 802.3 address */ +typedef struct uip_eth_addr { + uint8_t addr[6]; +} uip_eth_addr; +typedef uip_802154_longaddr uip_lladdr_t; + +#define UIP_802154_SHORTADDR_LEN 2 +#define UIP_802154_LONGADDR_LEN 8 +#define UIP_LLADDR_LEN UIP_802154_LONGADDR_LEN + + +#define GET16(ptr) (((uint16_t)(((u_int8_t *)ptr)[0] << 8)) | (((u_int8_t *)ptr)[1])) +#define SET16(ptr, value) do { \ +((u_int8_t *)ptr)[0] = ((value) >> 8) & 0xff; \ +((u_int8_t *)ptr)[1] = (value) & 0xff; \ +} while(0) + +/** \name Pointers in the packetbuf buffer + * @{ + */ +#define PACKETBUF_FRAG_DISPATCH_SIZE 0 /* 16 bit */ +#define PACKETBUF_FRAG_TAG 2 /* 16 bit */ +#define PACKETBUF_FRAG_OFFSET 4 /* 8 bit */ + +#define PACKETBUF_HC1_DISPATCH 0 /* 8 bit */ +#define PACKETBUF_HC1_ENCODING 1 /* 8 bit */ +#define PACKETBUF_HC1_TTL 2 /* 8 bit */ + +#define PACKETBUF_HC1_HC_UDP_DISPATCH 0 /* 8 bit */ +#define PACKETBUF_HC1_HC_UDP_HC1_ENCODING 1 /* 8 bit */ +#define PACKETBUF_HC1_HC_UDP_UDP_ENCODING 2 /* 8 bit */ +#define PACKETBUF_HC1_HC_UDP_TTL 3 /* 8 bit */ +#define PACKETBUF_HC1_HC_UDP_PORTS 4 /* 8 bit */ +#define PACKETBUF_HC1_HC_UDP_CHKSUM 5 /* 16 bit */ + + +#define LINKADDR_SIZE 8 +typedef union { + unsigned char u8[LINKADDR_SIZE]; + uint16_t u16; +} linkaddr_t; + +static void +uip_ds6_set_addr_iid(struct in6_addr *ipaddr, uip_lladdr_t *lladdr) +{ + /* We consider only links with IEEE EUI-64 identifier or + * IEEE 48-bit MAC addresses */ +#if (UIP_LLADDR_LEN == 8) + memcpy(ipaddr->s6_addr + 8, lladdr, UIP_LLADDR_LEN); + ipaddr->s6_addr[8] ^= 0x02; +#elif (UIP_LLADDR_LEN == 6) + memcpy(ipaddr->s6_addr + 8, lladdr, 3); + ipaddr->s6_addr[11] = 0xff; + ipaddr->s6_addr[12] = 0xfe; + memcpy(ipaddr->s6_addr + 13, (uint8_t *)lladdr + 3, 3); + ipaddr->s6_addr[8] ^= 0x02; +#else +#error uip-ds6.c cannot build interface address when UIP_LLADDR_LEN is not 6 or 8 +#endif +} + +static errno_t +compress_hdr_ipv6(__unused struct frame802154 *ieee02154hdr, + __unused u_int8_t *payload, + long *hdroffset, size_t *hdrlen, u_int8_t *hdrbuf) +{ + /* + * Negative offset: 6LoWPAN header needs to ve prepended to the data + */ + *hdroffset = -SICSLOWPAN_IPV6_HDR_LEN; + *hdrlen = SICSLOWPAN_IPV6_HDR_LEN; + hdrbuf[0] = SICSLOWPAN_DISPATCH_IPV6; + + return 0; +} + + +#if 0 +/*--------------------------------------------------------------------*/ +/** \name HC1 compression and uncompression functions + * @{ */ +/*--------------------------------------------------------------------*/ +/** + * \brief Compress IP/UDP header using HC1 and HC_UDP + * + * This function is called by the 6lowpan code to create a compressed + * 6lowpan packet in the packetbuf buffer from a full IPv6 packet in the + * uip_buf buffer. + * + * + * If we can compress everything, we use HC1 dispatch, if not we use + * IPv6 dispatch.\n + * We can compress everything if: + * - IP version is + * - Flow label and traffic class are 0 + * - Both src and dest ip addresses are link local + * - Both src and dest interface ID are recoverable from lower layer + * header + * - Next header is either ICMP, UDP or TCP + * Moreover, if next header is UDP, we try to compress it using HC_UDP. + * This is feasible is both ports are between F0B0 and F0B0 + 15\n\n + * + * Resulting header structure: + * - For ICMP, TCP, non compressed UDP\n + * HC1 encoding = 11111010 (UDP) 11111110 (TCP) 11111100 (ICMP)\n + * \verbatim + * 1 2 3 + * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * | LoWPAN HC1 Dsp | HC1 encoding | IPv6 Hop limit| L4 hdr + data| + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * | ... + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * \endverbatim + * + * - For compressed UDP + * HC1 encoding = 11111011, HC_UDP encoding = 11100000\n + * \verbatim + * 1 2 3 + * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * | LoWPAN HC1 Dsp| HC1 encoding | HC_UDP encod.| IPv6 Hop limit| + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * | src p.| dst p.| UDP checksum | L4 data... + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * \endverbatim + * + * \param link_destaddr L2 destination address, needed to compress the + * IP destination field + */ +#endif +errno_t +compress_hdr_hc1(struct frame802154 *ieee02154hdr, u_int8_t *payload, + long *hdroffset, size_t *hdrlen, u_int8_t *hdrbuf) +{ + struct ip6_hdr *ip6 = (struct ip6_hdr *)(payload); + + if (*hdrlen < SICSLOWPAN_MIN_COMP_HDR_LEN) { + return EINVAL; + } + + *hdroffset = 0; + + /* + * Check if all the assumptions for full compression + * are valid : + */ + if ((ip6->ip6_vfc & IPV6_VERSION_MASK) != IPV6_VERSION || + !IN6_IS_ADDR_LINKLOCAL(&ip6->ip6_src) || + !uip_is_addr_mac_addr_based(&ip6->ip6_src, ieee02154hdr->src_addr) || + !IN6_IS_ADDR_LINKLOCAL(&ip6->ip6_dst) || + !uip_is_addr_mac_addr_based(&ip6->ip6_dst, + ieee02154hdr->dest_addr) || + (ip6->ip6_nxt != IPPROTO_ICMPV6 && + ip6->ip6_nxt != IPPROTO_UDP && + ip6->ip6_nxt != IPPROTO_TCP)) { + /* + * IPV6 DISPATCH + * Something cannot be compressed, use IPV6 DISPATCH, + * compress nothing, copy IPv6 header in packetbuf buffer + */ + return compress_hdr_ipv6(ieee02154hdr, payload, hdroffset, hdrlen, hdrbuf); + } else { + /* + * HC1 DISPATCH + * maximum compresssion: + * All fields in the IP header but Hop Limit are elided + * If next header is UDP, we compress UDP header using HC2 + */ + hdrbuf[PACKETBUF_HC1_DISPATCH] = SICSLOWPAN_DISPATCH_HC1; + + switch (ip6->ip6_nxt) { + case IPPROTO_ICMPV6: + /* HC1 encoding and ttl */ + hdrbuf[PACKETBUF_HC1_ENCODING] = 0xFC; + hdrbuf[PACKETBUF_HC1_TTL] = ip6->ip6_hlim; + *hdrlen = SICSLOWPAN_HC1_HDR_LEN; + *hdroffset = sizeof(struct ip6_hdr); + break; + + case IPPROTO_TCP: + /* HC1 encoding and ttl */ + hdrbuf[PACKETBUF_HC1_ENCODING] = 0xFE; + hdrbuf[PACKETBUF_HC1_TTL] = ip6->ip6_hlim; + *hdrlen = SICSLOWPAN_HC1_HDR_LEN; + *hdroffset = sizeof(struct ip6_hdr); + break; + + case IPPROTO_UDP: { + struct udphdr *udp = (struct udphdr *)(uintptr_t)(ip6 + 1); + + /* + * try to compress UDP header (we do only full compression). + * This is feasible if both src and dest ports are between + * SICSLOWPAN_UDP_PORT_MIN and SICSLOWPAN_UDP_PORT_MIN + 15 + */ + printf("source/remote ports %u/%u\n", ntohs(udp->uh_sport), ntohs(udp->uh_dport)); + if (ntohs(udp->uh_sport) >= SICSLOWPAN_UDP_PORT_MIN && + ntohs(udp->uh_sport) < SICSLOWPAN_UDP_PORT_MAX && + ntohs(udp->uh_dport) >= SICSLOWPAN_UDP_PORT_MIN && + ntohs(udp->uh_dport) < SICSLOWPAN_UDP_PORT_MAX) { + /* HC1 encoding */ + hdrbuf[PACKETBUF_HC1_HC_UDP_HC1_ENCODING] = 0xFB; + + /* HC_UDP encoding, ttl, src and dest ports, checksum */ + hdrbuf[PACKETBUF_HC1_HC_UDP_UDP_ENCODING] = 0xE0; + hdrbuf[PACKETBUF_HC1_HC_UDP_TTL] = ip6->ip6_hlim; + + hdrbuf[PACKETBUF_HC1_HC_UDP_PORTS] = + (uint8_t)((ntohs(udp->uh_sport) - SICSLOWPAN_UDP_PORT_MIN) << 4) + + (uint8_t)((ntohs(udp->uh_dport) - SICSLOWPAN_UDP_PORT_MIN)); + + memcpy(&hdrbuf[PACKETBUF_HC1_HC_UDP_CHKSUM], &udp->uh_sum, 2); + *hdrlen = SICSLOWPAN_HC1_HC_UDP_HDR_LEN; + *hdroffset = sizeof(struct ip6_hdr) + sizeof(struct udphdr); + } else { + /* HC1 encoding and ttl */ + hdrbuf[PACKETBUF_HC1_ENCODING] = 0xFA; + hdrbuf[PACKETBUF_HC1_TTL] = ip6->ip6_hlim; + *hdrlen = SICSLOWPAN_HC1_HDR_LEN; + *hdroffset = sizeof(struct ip6_hdr); + } + break; + } + } + } + return 0; +} + + +/*--------------------------------------------------------------------*/ +/** + * \brief Uncompress HC1 (and HC_UDP) headers and put them in + * sicslowpan_buf + * + * This function is called by the input function when the dispatch is + * HC1. + * We %process the packet in the packetbuf buffer, uncompress the header + * fields, and copy the result in the sicslowpan buffer. + * At the end of the decompression, packetbuf_hdr_len and uncompressed_hdr_len + * are set to the appropriate values + * + * \param ip_len Equal to 0 if the packet is not a fragment (IP length + * is then inferred from the L2 length), non 0 if the packet is a 1st + * fragment. + */ +errno_t +uncompress_hdr_hc1(struct frame802154 *frame, u_int8_t *payload, + uint16_t ip_len, long *hdroffset, size_t *hdrlen, u_int8_t *hdrbuf) +{ + struct ip6_hdr *ip6 = (struct ip6_hdr *)hdrbuf; + + if (payload[PACKETBUF_HC1_DISPATCH] == SICSLOWPAN_DISPATCH_IPV6) { + *hdroffset = -SICSLOWPAN_IPV6_HDR_LEN; + *hdrlen = SICSLOWPAN_IPV6_HDR_LEN; + return 0; + } + + *hdroffset = 0; + + /* version, traffic class, flow label */ + ip6->ip6_flow = 0; + ip6->ip6_vfc = IPV6_VERSION; + + /* src and dest ip addresses */ + uip_ip6addr_u8(&ip6->ip6_src, 0xfe, 0x80, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); + uip_ds6_set_addr_iid(&ip6->ip6_src, + (uip_lladdr_t *)frame->src_addr); + + uip_ip6addr_u8(&ip6->ip6_dst, 0xfe, 0x80, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); + uip_ds6_set_addr_iid(&ip6->ip6_dst, + (uip_lladdr_t *)frame->dest_addr); + + *hdrlen = UIP_IPH_LEN; + + /* Next header field */ + switch (payload[PACKETBUF_HC1_ENCODING] & 0x06) { + case SICSLOWPAN_HC1_NH_ICMP6: + ip6->ip6_nxt = IPPROTO_ICMPV6; + ip6->ip6_hlim = payload[PACKETBUF_HC1_TTL]; + *hdroffset = SICSLOWPAN_HC1_HDR_LEN; + break; + + case SICSLOWPAN_HC1_NH_TCP: + ip6->ip6_nxt = IPPROTO_TCP; + ip6->ip6_hlim = payload[PACKETBUF_HC1_TTL]; + *hdroffset = SICSLOWPAN_HC1_HDR_LEN; + break; + + case SICSLOWPAN_HC1_NH_UDP: + ip6->ip6_nxt = IPPROTO_UDP; + if (payload[PACKETBUF_HC1_HC_UDP_HC1_ENCODING] & 0x01) { + struct udphdr *udp = (struct udphdr *)(uintptr_t)ip6; + + /* UDP header is compressed with HC_UDP */ + if (payload[PACKETBUF_HC1_HC_UDP_UDP_ENCODING] != + SICSLOWPAN_HC_UDP_ALL_C) { + printf("sicslowpan (uncompress_hdr), packet not supported"); + return EINVAL; + } + /* IP TTL */ + + ip6->ip6_hlim = payload[PACKETBUF_HC1_HC_UDP_TTL]; + /* UDP ports, len, checksum */ + udp->uh_sport = + htons(SICSLOWPAN_UDP_PORT_MIN + (payload[PACKETBUF_HC1_HC_UDP_PORTS] >> 4)); + udp->uh_dport = + htons(SICSLOWPAN_UDP_PORT_MIN + (payload[PACKETBUF_HC1_HC_UDP_PORTS] & 0x0F)); + + memcpy(&udp->uh_sum, &payload[PACKETBUF_HC1_HC_UDP_CHKSUM], 2); + *hdrlen += UIP_UDPH_LEN; + *hdroffset = SICSLOWPAN_HC1_HC_UDP_HDR_LEN; + } else { + ip6->ip6_hlim = payload[PACKETBUF_HC1_TTL]; + *hdroffset = SICSLOWPAN_HC1_HDR_LEN; + } + break; + + default: + /* this shouldn't happen, drop */ + return EINVAL; + } + + /* IP length field. */ + if (ip_len == 0) { + size_t len = frame->payload_len - *hdroffset + *hdrlen - sizeof(struct ip6_hdr); + + /* This is not a fragmented packet */ + SET16(&ip6->ip6_plen, len); + } else { + /* This is a 1st fragment */ + SET16(&ip6->ip6_plen, ip_len - UIP_IPH_LEN); + } + /* length field in UDP header */ + if (ip6->ip6_nxt == IPPROTO_UDP) { + struct udphdr *udp = (struct udphdr *)(uintptr_t)ip6; + + memcpy(&udp->uh_ulen, &ip6->ip6_plen, 2); + } + return 0; +} + +errno_t +sixxlowpan_compress(struct frame802154 *ieee02154hdr, u_int8_t *payload) +{ + long hdroffset; + size_t hdrlen; + u_int8_t hdrbuf[128]; + errno_t error; + + bzero(hdrbuf, sizeof(hdrbuf)); + hdrlen = sizeof(hdrbuf); + + error = compress_hdr_hc1(ieee02154hdr, payload, + &hdroffset, &hdrlen, hdrbuf); + if (error != 0) { + return error; + } + + if (hdroffset < 0) { + /* + * hdroffset negative means that we have to add + * hdrlen of extra stuff + */ + memmove(&payload[hdrlen], + &payload[0], + ieee02154hdr->payload_len); + memcpy(&payload[0], hdrbuf, hdrlen); + + ieee02154hdr->payload_len += hdrlen; + } else if (hdroffset > 0) { + /* + * hdroffset is the size of the compressed header + * + * hdrlen is the size of the data that has been compressed + * -- i.e. when the untouched data starts + */ + memmove(&payload[hdrlen], + &payload[hdroffset], + ieee02154hdr->payload_len - hdroffset); + memcpy(&payload[0], hdrbuf, hdrlen); + + ieee02154hdr->payload_len += hdrlen - hdroffset; + } + + return 0; +} + +errno_t +sixxlowpan_uncompress(struct frame802154 *ieee02154hdr, u_int8_t *payload) +{ + long hdroffset; + size_t hdrlen; + u_int8_t hdrbuf[128]; + errno_t error; + + bzero(hdrbuf, sizeof(hdrbuf)); + hdrlen = sizeof(hdrbuf); + + error = uncompress_hdr_hc1(ieee02154hdr, (u_int8_t *)payload, + 0, &hdroffset, &hdrlen, hdrbuf); + + if (error != 0) { + return error; + } + + if (hdroffset < 0) { + /* + * hdroffset negative means that we have to remove + * hdrlen of extra stuff + */ + memmove(&payload[0], + &payload[hdrlen], + ieee02154hdr->payload_len - hdrlen); + ieee02154hdr->payload_len -= hdrlen; + } else { + /* + * hdroffset is the size of the compressed header + * -- i.e. when the untouched data starts + * + * hdrlen is the size of the decompressed header + * that takes the place of compressed header of size hdroffset + */ + memmove(payload + hdrlen, + payload + hdroffset, + ieee02154hdr->payload_len - hdroffset); + memcpy(payload, hdrbuf, hdrlen); + ieee02154hdr->payload_len += hdrlen - hdroffset; + } + + return 0; +} + +errno_t +sixxlowpan_output(struct frame802154 *ieee02154hdr, u_int8_t *payload) +{ + errno_t error = 0; + + error = sixxlowpan_compress(ieee02154hdr, payload); + if (error != 0) { + goto done; + } + + /* + * TO DO: fragmentation + */ + +done: + return error; +} + +errno_t +sixxlowpan_input(struct frame802154 *ieee02154hdr, u_int8_t *payload) +{ + errno_t error = 0; + + error = sixxlowpan_uncompress(ieee02154hdr, payload); + if (error != 0) { + goto done; + } + + /* + * TO DO: fragmentation + */ + +done: + return error; +} diff --git a/bsd/net/sixxlowpan.h b/bsd/net/sixxlowpan.h new file mode 100644 index 000000000..3c5528e2d --- /dev/null +++ b/bsd/net/sixxlowpan.h @@ -0,0 +1,41 @@ +/* + * Copyright (c) 2017 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ +#ifndef sixxlowpan_h +#define sixxlowpan_h + +#include + +#include "frame802154.h" + +errno_t sixxlowpan_compress(struct frame802154 *, u_int8_t *); +errno_t sixxlowpan_uncompress(struct frame802154 *, u_int8_t *); + +errno_t sixxlowpan_output(struct frame802154 *, u_int8_t *); +errno_t sixxlowpan_input(struct frame802154 *, u_int8_t *); + +#endif /* sixxlowpan_h */ diff --git a/bsd/net/skywalk_stubs.c b/bsd/net/skywalk_stubs.c index a4425e741..f8af8c657 100644 --- a/bsd/net/skywalk_stubs.c +++ b/bsd/net/skywalk_stubs.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2018 Apple Inc. All rights reserved. + * Copyright (c) 2015-2019 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -49,6 +49,8 @@ STUB(kern_buflet_get_object_segment); STUB(kern_buflet_set_data_offset); STUB(kern_buflet_set_data_length); STUB(kern_buflet_get_data_limit); +STUB(kern_buflet_attach_buffer); +STUB(kern_buflet_attach_buffer_with_segment_info); STUB(kern_channel_advance_slot); STUB(kern_channel_available_slot_count); STUB(kern_channel_get_context); @@ -83,7 +85,7 @@ STUB(kern_nexus_controller_read_provider_attr); STUB(kern_nexus_controller_register_provider); STUB(kern_nexus_controller_unbind_provider_instance); STUB(kern_nexus_deregister_domain_provider); -STUB(kern_nexus_get_builtin_domain_provider); +STUB(kern_nexus_get_default_domain_provider); STUB(kern_nexus_get_context); STUB(kern_nexus_get_pbufpool); STUB(kern_nexus_register_domain_provider); @@ -91,12 +93,15 @@ STUB(kern_packet_clear_flow_uuid); STUB(kern_packet_get_euuid); STUB(kern_packet_finalize); STUB(kern_packet_get_buflet_count); +STUB(kern_packet_set_buflet_count); STUB(kern_packet_get_data_length); STUB(kern_packet_get_flow_uuid); STUB(kern_packet_get_inet_checksum); +STUB(kern_packet_get_headroom); STUB(kern_packet_get_link_broadcast); STUB(kern_packet_get_link_ethfcs); STUB(kern_packet_get_link_header_offset); +STUB(kern_packet_get_link_header_length); STUB(kern_packet_get_link_multicast); STUB(kern_packet_get_network_header_offset); STUB(kern_packet_get_next_buflet); @@ -114,8 +119,10 @@ STUB(kern_packet_get_transport_traffic_background) STUB(kern_packet_get_transport_traffic_realtime) STUB(kern_packet_set_flow_uuid); STUB(kern_packet_set_inet_checksum); +STUB(kern_packet_set_headroom); STUB(kern_packet_set_link_broadcast); STUB(kern_packet_set_link_header_offset); +STUB(kern_packet_set_link_header_length); STUB(kern_packet_set_link_multicast); STUB(kern_packet_set_link_ethfcs); STUB(kern_packet_set_network_header_offset); @@ -128,16 +135,34 @@ STUB(kern_packet_get_timestamp_requested); STUB(kern_packet_get_tx_completion_status); STUB(kern_packet_set_tx_completion_status); STUB(kern_packet_tx_completion); +STUB(kern_packet_set_group_start); +STUB(kern_packet_get_group_start); +STUB(kern_packet_set_group_end); +STUB(kern_packet_get_group_end); +STUB(kern_packet_set_expire_time); +STUB(kern_packet_get_expire_time); +STUB(kern_packet_set_token); +STUB(kern_packet_get_token); +STUB(kern_packet_get_packetid); +STUB(kern_packet_set_vlan_tag); +STUB(kern_packet_get_vlan_tag); +STUB(kern_packet_get_vlan_id); +STUB(kern_packet_get_vlan_priority); STUB(kern_pbufpool_alloc); STUB(kern_pbufpool_alloc_batch); +STUB(kern_pbufpool_alloc_batch_callback); STUB(kern_pbufpool_alloc_nosleep); STUB(kern_pbufpool_alloc_batch_nosleep); +STUB(kern_pbufpool_alloc_batch_nosleep_callback); STUB(kern_pbufpool_create); STUB(kern_pbufpool_destroy); STUB(kern_pbufpool_free); STUB(kern_pbufpool_free_batch); STUB(kern_pbufpool_get_context); STUB(kern_pbufpool_get_memory_info); +STUB(kern_pbufpool_alloc_buffer); +STUB(kern_pbufpool_alloc_buffer_nosleep); +STUB(kern_pbufpool_free_buffer); STUB(kern_segment_get_index); #undef STUB #endif /* !SKYWALK */ diff --git a/bsd/netinet/Makefile b/bsd/netinet/Makefile index 6eddebfe3..dc4f2c43b 100644 --- a/bsd/netinet/Makefile +++ b/bsd/netinet/Makefile @@ -32,6 +32,7 @@ PRIVATE_DATAFILES = \ ip_fw2.h \ mptcp_var.h \ tcp.h \ + tcp_cc.h \ tcp_debug.h \ tcp_var.h \ tcp_cache.h \ @@ -39,7 +40,7 @@ PRIVATE_DATAFILES = \ in_stat.h PRIVATE_KERNELFILES = ${KERNELFILES} \ - ip_ecn.h ip_encap.h + ip_ecn.h ip_encap.h tcp_log.h INSTALL_MI_LIST = ${DATAFILES} diff --git a/bsd/netinet/cbrtf.c b/bsd/netinet/cbrtf.c index f4f4bddf9..ff68ee31d 100644 --- a/bsd/netinet/cbrtf.c +++ b/bsd/netinet/cbrtf.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2013 Apple Inc. All rights reserved. + * Copyright (c) 2019 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -42,89 +42,117 @@ struct cbrt_table_entry { static const struct cbrt_table_entry cbrt_table[] = { /* mantissa = 0x1.00... */ - {0x1.0000000000000p+0, 0x1.0000000000000p+0, - 0x1.0000000000000p+0, 0x1.0000000000000p+0}, /* exponent = 0 */ - {0x1.037e200000000p+1, 0x1.4400000000000p+0, - 0x1.948b0fcd6e9e0p-1, 0x1.f91bd1b62b9cfp-2}, /* exponent = 1 */ - {0x1.0315800000000p+2, 0x1.9800000000000p+0, - 0x1.4141414141414p-1, 0x1.f9e7cba5753afp-3}, /* exponent = 2 */ + { + .x = 0x1.0000000000000p+0, + .cbrt_x = 0x1.0000000000000p+0, + .recip_cbrt_x = 0x1.0000000000000p+0, + .recip_x = 0x1.0000000000000p+0 + }, /* exponent = 0 */ + { + .x = 0x1.037e200000000p+1, + .cbrt_x = 0x1.4400000000000p+0, + .recip_cbrt_x = 0x1.948b0fcd6e9e0p-1, + .recip_x = 0x1.f91bd1b62b9cfp-2 + }, /* exponent = 1 */ + { + .x = 0x1.0315800000000p+2, + .cbrt_x = 0x1.9800000000000p+0, + .recip_cbrt_x = 0x1.4141414141414p-1, + .recip_x = 0x1.f9e7cba5753afp-3 + }, /* exponent = 2 */ /* mantissa = 0x1.04... */ - {0x1.060c080000000p+0, 0x1.0200000000000p+0, - 0x1.fc07f01fc07f0p-1, 0x1.f42f61dacddc6p-1}, /* exponent = 0 */ - {0x1.05ff4c356ff40p+1, 0x1.450a000000000p+0, - 0x1.933fff9b30002p-1, 0x1.f447b132ca3acp-2}, /* exponent = 1 */ - {0x1.06e9aa0000000p+2, 0x1.9a00000000000p+0, - 0x1.3fb013fb013fbp-1, 0x1.f289bb31fd41cp-3}, /* exponent = 2 */ + { + .x = 0x1.060c080000000p+0, + .cbrt_x = 0x1.0200000000000p+0, + 0x1.fc07f01fc07f0p-1, + 0x1.f42f61dacddc6p-1 + }, /* exponent = 0 */ + { + .x = 0x1.05ff4c356ff40p+1, + .cbrt_x = 0x1.450a000000000p+0, + 0x1.933fff9b30002p-1, + 0x1.f447b132ca3acp-2 + }, /* exponent = 1 */ + { + .x = 0x1.06e9aa0000000p+2, + .cbrt_x = 0x1.9a00000000000p+0, + 0x1.3fb013fb013fbp-1, + 0x1.f289bb31fd41cp-3 + }, /* exponent = 2 */ /* mantissa = 0x1.08...*/ - {0x1.09fe97c0b2e80p+0, 0x1.034a000000000p+0, + {.x = 0x1.09fe97c0b2e80p+0, .cbrt_x = 0x1.034a000000000p+0, 0x1.f9815c85b04a3p-1, 0x1.ecc3168ac46e4p-1}, // exponent = 0 - {0x1.0853ec0000000p+1, 0x1.4600000000000p+0, 0x1.920fb49d0e229p-1, 0x1.efde7dcdacefdp-2}, // exponent = 1 - {0x1.0ac7700000000p+2, 0x1.9c00000000000p+0, 0x1.3e22cbce4a902p-1, 0x1.eb501ca81bb3ep-3}, // exponent = 2 + {.x = 0x1.0853ec0000000p+1, .cbrt_x = 0x1.4600000000000p+0, + 0x1.920fb49d0e229p-1, 0x1.efde7dcdacefdp-2}, // exponent = 1 + {.x = 0x1.0ac7700000000p+2, .cbrt_x = 0x1.9c00000000000p+0, + 0x1.3e22cbce4a902p-1, 0x1.eb501ca81bb3ep-3}, // exponent = 2 /* mantissa = 0x1.0c...*/ - {0x1.0c30400000000p+0, 0x1.0400000000000p+0, 0x1.f81f81f81f820p-1, 0x1.e8bb1d5b6e585p-1}, // exponent = 0 - {0x1.0d39000000000p+1, 0x1.4800000000000p+0, 0x1.8f9c18f9c18fap-1, 0x1.e6da80ced1523p-2}, // exponent = 1 - {0x1.0eaede0000000p+2, 0x1.9e00000000000p+0, 0x1.3c995a47babe7p-1, 0x1.e43a0fc24fe4bp-3}, // exponent = 2 + {.x = 0x1.0c30400000000p+0, 0x1.0400000000000p+0, + 0x1.f81f81f81f820p-1, 0x1.e8bb1d5b6e585p-1}, // exponent = 0 + {.x = 0x1.0d39000000000p+1, 0x1.4800000000000p+0, + 0x1.8f9c18f9c18fap-1, 0x1.e6da80ced1523p-2}, // exponent = 1 + {.x = 0x1.0eaede0000000p+2, 0x1.9e00000000000p+0, 0x1.3c995a47babe7p-1, 0x1.e43a0fc24fe4bp-3}, // exponent = 2 /* mantissa = 0x1.10...*/ - {0x1.126cd80000000p+0, 0x1.0600000000000p+0, 0x1.f44659e4a4271p-1, 0x1.dd9fb30af3365p-1}, // exponent = 0 - {0x1.122d740000000p+1, 0x1.4a00000000000p+0, 0x1.8d3018d3018d3p-1, 0x1.de0e209af882ep-2}, // exponent = 1 - {0x1.12a0000000000p+2, 0x1.a000000000000p+0, 0x1.3b13b13b13b14p-1, 0x1.dd46baab49c24p-3}, // exponent = 2 + {.x = 0x1.126cd80000000p+0, 0x1.0600000000000p+0, 0x1.f44659e4a4271p-1, 0x1.dd9fb30af3365p-1}, // exponent = 0 + {.x = 0x1.122d740000000p+1, 0x1.4a00000000000p+0, 0x1.8d3018d3018d3p-1, 0x1.de0e209af882ep-2}, // exponent = 1 + {.x = 0x1.12a0000000000p+2, 0x1.a000000000000p+0, 0x1.3b13b13b13b14p-1, 0x1.dd46baab49c24p-3}, // exponent = 2 /* mantissa = 0x1.14...*/ - {0x1.15f9b5b480000p+0, 0x1.0720000000000p+0, 0x1.f222c82dba316p-1, 0x1.d786108fd7a9fp-1}, // exponent = 0 - {0x1.1731600000000p+1, 0x1.4c00000000000p+0, 0x1.8acb90f6bf3aap-1, 0x1.d577b2f5c6f87p-2}, // exponent = 1 - {0x1.169ae20000000p+2, 0x1.a200000000000p+0, 0x1.3991c2c187f63p-1, 0x1.d67549c6f9b67p-3}, // exponent = 2 + {.x = 0x1.15f9b5b480000p+0, 0x1.0720000000000p+0, 0x1.f222c82dba316p-1, 0x1.d786108fd7a9fp-1}, // exponent = 0 + {.x = 0x1.1731600000000p+1, 0x1.4c00000000000p+0, 0x1.8acb90f6bf3aap-1, 0x1.d577b2f5c6f87p-2}, // exponent = 1 + {.x = 0x1.169ae20000000p+2, 0x1.a200000000000p+0, 0x1.3991c2c187f63p-1, 0x1.d67549c6f9b67p-3}, // exponent = 2 /* mantissa = 0x1.18...*/ - {0x1.18c2000000000p+0, 0x1.0800000000000p+0, 0x1.f07c1f07c1f08p-1, 0x1.d2d9cbd756afdp-1}, // exponent = 0 - {0x1.19fb2ce620540p+1, 0x1.4d1a000000000p+0, 0x1.897d564f5cf98p-1, 0x1.d0d34ccd78141p-2}, // exponent = 1 - {0x1.1a9f900000000p+2, 0x1.a400000000000p+0, 0x1.3813813813814p-1, 0x1.cfc4ef7db5bffp-3}, // exponent = 2 + {.x = 0x1.18c2000000000p+0, 0x1.0800000000000p+0, 0x1.f07c1f07c1f08p-1, 0x1.d2d9cbd756afdp-1}, // exponent = 0 + {.x = 0x1.19fb2ce620540p+1, 0x1.4d1a000000000p+0, 0x1.897d564f5cf98p-1, 0x1.d0d34ccd78141p-2}, // exponent = 1 + {.x = 0x1.1a9f900000000p+2, 0x1.a400000000000p+0, 0x1.3813813813814p-1, 0x1.cfc4ef7db5bffp-3}, // exponent = 2 /* mantissa = 0x1.1c...*/ - {0x1.1f2fe80000000p+0, 0x1.0a00000000000p+0, 0x1.ecc07b301ecc0p-1, 0x1.c86636f753a66p-1}, // exponent = 0 - {0x1.1c44dc0000000p+1, 0x1.4e00000000000p+0, 0x1.886e5f0abb04ap-1, 0x1.cd159cdbba714p-2}, // exponent = 1 - {0x1.1eae160000000p+2, 0x1.a600000000000p+0, 0x1.3698df3de0748p-1, 0x1.c934e4095d202p-3}, // exponent = 2 + {.x = 0x1.1f2fe80000000p+0, 0x1.0a00000000000p+0, 0x1.ecc07b301ecc0p-1, 0x1.c86636f753a66p-1}, // exponent = 0 + {.x = 0x1.1c44dc0000000p+1, 0x1.4e00000000000p+0, 0x1.886e5f0abb04ap-1, 0x1.cd159cdbba714p-2}, // exponent = 1 + {.x = 0x1.1eae160000000p+2, 0x1.a600000000000p+0, 0x1.3698df3de0748p-1, 0x1.c934e4095d202p-3}, // exponent = 2 /* mantissa = 0x1.20...*/ - {0x1.21fac7ca59c00p+0, 0x1.0adc000000000p+0, 0x1.eb2a412496abdp-1, 0x1.c40112c606d3ep-1}, // exponent = 0 - {0x1.2168000000000p+1, 0x1.5000000000000p+0, 0x1.8618618618618p-1, 0x1.c4e651e0c37d7p-2}, // exponent = 1 - {0x1.22c6800000000p+2, 0x1.a800000000000p+0, 0x1.3521cfb2b78c1p-1, 0x1.c2c46544650c1p-3}, // exponent = 2 + {.x = 0x1.21fac7ca59c00p+0, 0x1.0adc000000000p+0, 0x1.eb2a412496abdp-1, 0x1.c40112c606d3ep-1}, // exponent = 0 + {.x = 0x1.2168000000000p+1, 0x1.5000000000000p+0, 0x1.8618618618618p-1, 0x1.c4e651e0c37d7p-2}, // exponent = 1 + {.x = 0x1.22c6800000000p+2, 0x1.a800000000000p+0, 0x1.3521cfb2b78c1p-1, 0x1.c2c46544650c1p-3}, // exponent = 2 /* mantissa = 0x1.24...*/ - {0x1.25b6c00000000p+0, 0x1.0c00000000000p+0, 0x1.e9131abf0b767p-1, 0x1.be41e7ee3f7edp-1}, // exponent = 0 - {0x1.269ae40000000p+1, 0x1.5200000000000p+0, 0x1.83c977ab2beddp-1, 0x1.bce853967753cp-2}, // exponent = 1 - {0x1.26e8da0000000p+2, 0x1.aa00000000000p+0, 0x1.33ae45b57bcb2p-1, 0x1.bc72b67ab9ce7p-3}, // exponent = 2 + {.x = 0x1.25b6c00000000p+0, 0x1.0c00000000000p+0, 0x1.e9131abf0b767p-1, 0x1.be41e7ee3f7edp-1}, // exponent = 0 + {.x = 0x1.269ae40000000p+1, 0x1.5200000000000p+0, 0x1.83c977ab2beddp-1, 0x1.bce853967753cp-2}, // exponent = 1 + {.x = 0x1.26e8da0000000p+2, 0x1.aa00000000000p+0, 0x1.33ae45b57bcb2p-1, 0x1.bc72b67ab9ce7p-3}, // exponent = 2 /* mantissa = 0x1.28...*/ - {0x1.29ff9aaaa2c00p+0, 0x1.0d4c000000000p+0, 0x1.e6b8275501adbp-1, 0x1.b7d7596e80007p-1}, // exponent = 0 - {0x1.2bdda00000000p+1, 0x1.5400000000000p+0, 0x1.8181818181818p-1, 0x1.b51a30f9739f8p-2}, // exponent = 1 - {0x1.2b15300000000p+2, 0x1.ac00000000000p+0, 0x1.323e34a2b10bfp-1, 0x1.b63f203c60c07p-3}, // exponent = 2 + {.x = 0x1.29ff9aaaa2c00p+0, 0x1.0d4c000000000p+0, 0x1.e6b8275501adbp-1, 0x1.b7d7596e80007p-1}, // exponent = 0 + {.x = 0x1.2bdda00000000p+1, 0x1.5400000000000p+0, 0x1.8181818181818p-1, 0x1.b51a30f9739f8p-2}, // exponent = 1 + {.x = 0x1.2b15300000000p+2, 0x1.ac00000000000p+0, 0x1.323e34a2b10bfp-1, 0x1.b63f203c60c07p-3}, // exponent = 2 /* mantissa = 0x1.2c...*/ - {0x1.2c56b80000000p+0, 0x1.0e00000000000p+0, 0x1.e573ac901e574p-1, 0x1.b469f4adc7794p-1}, // exponent = 0 - {0x1.2dfff74f29dc0p+1, 0x1.54ce000000000p+0, 0x1.80987c755886ap-1, 0x1.b203708429799p-2}, // exponent = 1 - {0x1.2f4b8e0000000p+2, 0x1.ae00000000000p+0, 0x1.30d190130d190p-1, 0x1.b028f031c8644p-3}, // exponent = 2 + {.x = 0x1.2c56b80000000p+0, 0x1.0e00000000000p+0, 0x1.e573ac901e574p-1, 0x1.b469f4adc7794p-1}, // exponent = 0 + {.x = 0x1.2dfff74f29dc0p+1, 0x1.54ce000000000p+0, 0x1.80987c755886ap-1, 0x1.b203708429799p-2}, // exponent = 1 + {.x = 0x1.2f4b8e0000000p+2, 0x1.ae00000000000p+0, 0x1.30d190130d190p-1, 0x1.b028f031c8644p-3}, // exponent = 2 /* mantissa = 0x1.30...*/ - {0x1.3310000000000p+0, 0x1.1000000000000p+0, 0x1.e1e1e1e1e1e1ep-1, 0x1.aadb93d39ae9cp-1}, // exponent = 0 - {0x1.31304c0000000p+1, 0x1.5600000000000p+0, 0x1.7f405fd017f40p-1, 0x1.ad7a85e593e54p-2}, // exponent = 1 - {0x1.338c000000000p+2, 0x1.b000000000000p+0, 0x1.2f684bda12f68p-1, 0x1.aa2f78f1b4cc6p-3}, // exponent = 2 + {.x = 0x1.3310000000000p+0, 0x1.1000000000000p+0, 0x1.e1e1e1e1e1e1ep-1, 0x1.aadb93d39ae9cp-1}, // exponent = 0 + {.x = 0x1.31304c0000000p+1, 0x1.5600000000000p+0, 0x1.7f405fd017f40p-1, 0x1.ad7a85e593e54p-2}, // exponent = 1 + {.x = 0x1.338c000000000p+2, 0x1.b000000000000p+0, 0x1.2f684bda12f68p-1, 0x1.aa2f78f1b4cc6p-3}, // exponent = 2 /* mantissa = 0x1.34... */ - {0x1.35fb6f4579c00p+0, 0x1.10dc000000000p+0, 0x1.e05d5a24448c5p-1, 0x1.a6d6548fa984dp-1}, // exponent = 0 - {0x1.3693000000000p+1, 0x1.5800000000000p+0, 0x1.7d05f417d05f4p-1, 0x1.a607fa909db1fp-2}, // exponent = 1 - {0x1.37d6920000000p+2, 0x1.b200000000000p+0, 0x1.2e025c04b8097p-1, 0x1.a45211d8b748ap-3}, // exponent = 2 + {.x = 0x1.35fb6f4579c00p+0, 0x1.10dc000000000p+0, 0x1.e05d5a24448c5p-1, 0x1.a6d6548fa984dp-1}, // exponent = 0 + {.x = 0x1.3693000000000p+1, 0x1.5800000000000p+0, 0x1.7d05f417d05f4p-1, 0x1.a607fa909db1fp-2}, // exponent = 1 + {.x = 0x1.37d6920000000p+2, 0x1.b200000000000p+0, 0x1.2e025c04b8097p-1, 0x1.a45211d8b748ap-3}, // exponent = 2 /* mantissa = 0x1.38... */ - {0x1.39e2c80000000p+0, 0x1.1200000000000p+0, 0x1.de5d6e3f8868ap-1, 0x1.a1941b013022dp-1}, // exponent = 0 - {0x1.39fe541ac7840p+1, 0x1.5942000000000p+0, 0x1.7ba298eae8947p-1, 0x1.a16f787114257p-2}, // exponent = 1 - {0x1.39ffaac000000p+2, 0x1.b300000000000p+0, 0x1.2d50a012d50a0p-1, 0x1.a16db0ec408b2p-3}, // exponent = 2 + {.x = 0x1.39e2c80000000p+0, 0x1.1200000000000p+0, 0x1.de5d6e3f8868ap-1, 0x1.a1941b013022dp-1}, // exponent = 0 + {.x = 0x1.39fe541ac7840p+1, 0x1.5942000000000p+0, 0x1.7ba298eae8947p-1, 0x1.a16f787114257p-2}, // exponent = 1 + {.x = 0x1.39ffaac000000p+2, 0x1.b300000000000p+0, 0x1.2d50a012d50a0p-1, 0x1.a16db0ec408b2p-3}, // exponent = 2 /* mantissa = 0x1.3c... */ - {0x1.3dfc1312b0000p+0, 0x1.1330000000000p+0, 0x1.dc4cfaf10eb5cp-1, 0x1.9c322b87f17e8p-1}, // exponent = 0 + {.x = 0x1.3dfc1312b0000p+0, 0x1.1330000000000p+0, 0x1.dc4cfaf10eb5cp-1, 0x1.9c322b87f17e8p-1}, // exponent = 0 {0x1.3c05d40000000p+1, 0x1.5a00000000000p+0, 0x1.7ad2208e0ecc3p-1, 0x1.9ec1430b0dfc7p-2}, // exponent = 1 {0x1.3c2b500000000p+2, 0x1.b400000000000p+0, 0x1.2c9fb4d812ca0p-1, 0x1.9e9016e2211b6p-3}, // exponent = 2 @@ -349,24 +377,24 @@ static const struct cbrt_table_entry cbrt_table[] = { {0x1.ee35ca0000000p+2, 0x1.fa00000000000p+0, 0x1.03091b51f5e1ap-1, 0x1.093712d33ff42p-3}, // exponent = 2 /* mantissa = 0x1.f0... */ - {0x1.f1fd112ab0c80p+0, 0x1.3f92000000000p+0, 0x1.9a2696dd75ba1p-1, 0x1.0733ed7907e73p-1}, // exponent = 0 - {0x1.f1fc8b255bc40p+1, 0x1.92a2000000000p+0, 0x1.45898cb57730cp-1, 0x1.0734344eaebefp-2}, // exponent = 1 - {0x1.f1ff2ff2d4ba0p+2, 0x1.fb4a000000000p+0, 0x1.02609989a73cfp-1, 0x1.0732ce999c3d1p-3}, // exponent = 2 + {.x = 0x1.f1fd112ab0c80p+0, 0x1.3f92000000000p+0, 0x1.9a2696dd75ba1p-1, 0x1.0733ed7907e73p-1}, // exponent = 0 + {.x = 0x1.f1fc8b255bc40p+1, 0x1.92a2000000000p+0, 0x1.45898cb57730cp-1, 0x1.0734344eaebefp-2}, // exponent = 1 + {.x = 0x1.f1ff2ff2d4ba0p+2, 0x1.fb4a000000000p+0, 0x1.02609989a73cfp-1, 0x1.0732ce999c3d1p-3}, // exponent = 2 /* mantissa = 0x1.f4... */ - {0x1.f400000000000p+0, 0x1.4000000000000p+0, 0x1.999999999999ap-1, 0x1.0624dd2f1a9fcp-1}, // exponent = 0 - {0x1.f713a00000000p+1, 0x1.9400000000000p+0, 0x1.446f86562d9fbp-1, 0x1.048a727489527p-2}, // exponent = 1 - {0x1.f417f00000000p+2, 0x1.fc00000000000p+0, 0x1.0204081020408p-1, 0x1.061850f2a7123p-3}, // exponent = 2 + {.x = 0x1.f400000000000p+0, 0x1.4000000000000p+0, 0x1.999999999999ap-1, 0x1.0624dd2f1a9fcp-1}, // exponent = 0 + {.x = 0x1.f713a00000000p+1, 0x1.9400000000000p+0, 0x1.446f86562d9fbp-1, 0x1.048a727489527p-2}, // exponent = 1 + {.x = 0x1.f417f00000000p+2, 0x1.fc00000000000p+0, 0x1.0204081020408p-1, 0x1.061850f2a7123p-3}, // exponent = 2 /* mantissa = 0x1.f8... */ - {0x1.f9fe36d7a7d80p+0, 0x1.4146000000000p+0, 0x1.97f9f956c92fdp-1, 0x1.030a055aebeddp-1}, // exponent = 0 - {0x1.f9f8b6ce70ec0p+1, 0x1.94c6000000000p+0, 0x1.43d0d2af8e146p-1, 0x1.030cd637fd65ep-2}, // exponent = 1 - {0x1.fa05fe0000000p+2, 0x1.fe00000000000p+0, 0x1.0101010101010p-1, 0x1.03060a0f151c2p-3}, // exponent = 2 + {.x = 0x1.f9fe36d7a7d80p+0, 0x1.4146000000000p+0, 0x1.97f9f956c92fdp-1, 0x1.030a055aebeddp-1}, // exponent = 0 + {.x = 0x1.f9f8b6ce70ec0p+1, 0x1.94c6000000000p+0, 0x1.43d0d2af8e146p-1, 0x1.030cd637fd65ep-2}, // exponent = 1 + {.x = 0x1.fa05fe0000000p+2, 0x1.fe00000000000p+0, 0x1.0101010101010p-1, 0x1.03060a0f151c2p-3}, // exponent = 2 /* mantissa = 0x1.fc... */ - {0x1.fd6f080000000p+0, 0x1.4200000000000p+0, 0x1.970e4f80cb872p-1, 0x1.014a239d8b1a9p-1}, // exponent = 0 - {0x1.fe95cc0000000p+1, 0x1.9600000000000p+0, 0x1.42d6625d51f87p-1, 0x1.00b59a78a8ffcp-2}, // exponent = 1 - {0x1.0000000000000p+3, 0x1.0000000000000p+1, 0x1.0000000000000p-1, 0x1.0000000000000p-3}, // exponent = 2 + {.x = 0x1.fd6f080000000p+0, 0x1.4200000000000p+0, 0x1.970e4f80cb872p-1, 0x1.014a239d8b1a9p-1}, // exponent = 0 + {.x = 0x1.fe95cc0000000p+1, 0x1.9600000000000p+0, 0x1.42d6625d51f87p-1, 0x1.00b59a78a8ffcp-2}, // exponent = 1 + {.x = 0x1.0000000000000p+3, 0x1.0000000000000p+1, 0x1.0000000000000p-1, 0x1.0000000000000p-3}, // exponent = 2 }; union floatdata { float f; int32_t x; }; diff --git a/bsd/netinet/dhcp_options.c b/bsd/netinet/dhcp_options.c index c38c6a6fa..2e2d4581d 100644 --- a/bsd/netinet/dhcp_options.c +++ b/bsd/netinet/dhcp_options.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2002-2016 Apple Inc. All rights reserved. + * Copyright (c) 2002-2019 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -457,20 +457,20 @@ struct test { }; struct test tests[] = { - { "empty", test_empty, sizeof(test_empty), TRUE }, - { "simple", test_simple, sizeof(test_simple), TRUE }, - { "vendor", test_vendor, sizeof(test_vendor), TRUE }, - { "no_end", test_no_end, sizeof(test_no_end), TRUE }, - { "no magic", test_no_magic, sizeof(test_no_magic), FALSE }, - { "short", test_short, sizeof(test_short), FALSE }, - { NULL, NULL, 0, FALSE }, + { .name = "empty", .data = test_empty, .len = sizeof(test_empty), .result = TRUE }, + { .name = "simple", .data = test_simple, .len = sizeof(test_simple), .result = TRUE }, + { .name = "vendor", .data = test_vendor, .len = sizeof(test_vendor), .result = TRUE }, + { .name = "no_end", .data = test_no_end, .len = sizeof(test_no_end), .result = TRUE }, + { .name = "no magic", .data = test_no_magic, .len = sizeof(test_no_magic), .result = FALSE }, + { .name = "short", .data = test_short, .len = sizeof(test_short), .result = FALSE }, + { .name = NULL, .data = NULL, .len = 0, .result = FALSE }, }; static char buf[2048]; int -main() +main(void) { int i; dhcpol_t options; diff --git a/bsd/netinet/flow_divert.c b/bsd/netinet/flow_divert.c index 13b9cad5d..ebd14a5b1 100644 --- a/bsd/netinet/flow_divert.c +++ b/bsd/netinet/flow_divert.c @@ -45,6 +45,8 @@ #include #include #include +#include +#include #include #include #include @@ -64,6 +66,10 @@ #include #include #include +#include +#if CONTENT_FILTER +#include +#endif /* CONTENT_FILTER */ #define FLOW_DIVERT_CONNECT_STARTED 0x00000001 #define FLOW_DIVERT_READ_CLOSED 0x00000002 @@ -472,13 +478,21 @@ flow_divert_packet_get_tlv(mbuf_t packet, int offset, uint8_t type, size_t buff_ length = ntohl(length); + uint32_t data_offset = tlv_offset + sizeof(type) + sizeof(length); + + if (length > (mbuf_pkthdr_len(packet) - data_offset)) { + FDLOG(LOG_ERR, &nil_pcb, "Length of %u TLV (%u) is larger than remaining packet data (%lu)", type, length, (mbuf_pkthdr_len(packet) - data_offset)); + return EINVAL; + } + if (val_size != NULL) { *val_size = length; } if (buff != NULL && buff_len > 0) { + memset(buff, 0, buff_len); size_t to_copy = (length < buff_len) ? length : buff_len; - error = mbuf_copydata(packet, tlv_offset + sizeof(type) + sizeof(length), to_copy, buff); + error = mbuf_copydata(packet, data_offset, to_copy, buff); if (error) { return error; } @@ -560,7 +574,7 @@ flow_divert_packet_verify_hmac(mbuf_t packet, uint32_t ctl_unit) goto done; } - if (memcmp(packet_hmac, computed_hmac, sizeof(packet_hmac))) { + if (cc_cmp_safe(sizeof(packet_hmac), packet_hmac, computed_hmac)) { FDLOG0(LOG_WARNING, &nil_pcb, "HMAC in token does not match computed HMAC"); error = EINVAL; goto done; @@ -631,6 +645,20 @@ flow_divert_check_no_expensive(struct flow_divert_pcb *fd_cb) return 0; } +static errno_t +flow_divert_check_no_constrained(struct flow_divert_pcb *fd_cb) +{ + struct inpcb *inp = NULL; + + inp = sotoinpcb(fd_cb->so); + if (inp && INP_NO_CONSTRAINED(inp) && inp->inp_last_outifp && + IFNET_IS_CONSTRAINED(inp->inp_last_outifp)) { + return EHOSTUNREACH; + } + + return 0; +} + static void flow_divert_update_closed_state(struct flow_divert_pcb *fd_cb, int how, Boolean tunnel) { @@ -1022,10 +1050,10 @@ flow_divert_create_connect_packet(struct flow_divert_pcb *fd_cb, struct sockaddr socket_unlock(so, 0); - if (signing_id == NULL) { - release_proc = flow_divert_get_src_proc(so, &src_proc); - if (src_proc != PROC_NULL) { - proc_lock(src_proc); + release_proc = flow_divert_get_src_proc(so, &src_proc); + if (src_proc != PROC_NULL) { + proc_lock(src_proc); + if (signing_id == NULL) { if (src_proc->p_csflags & (CS_VALID | CS_DEBUGGED)) { const char * cs_id; cs_id = cs_identity_get(src_proc); @@ -1033,11 +1061,9 @@ flow_divert_create_connect_packet(struct flow_divert_pcb *fd_cb, struct sockaddr } else { FDLOG0(LOG_WARNING, fd_cb, "Signature is invalid"); } - } else { - FDLOG0(LOG_WARNING, fd_cb, "Failed to determine the current proc"); } } else { - src_proc = PROC_NULL; + FDLOG0(LOG_WARNING, fd_cb, "Failed to determine the current proc"); } if (signing_id != NULL) { @@ -1080,6 +1106,27 @@ flow_divert_create_connect_packet(struct flow_divert_pcb *fd_cb, struct sockaddr } } + if (error == 0 && src_proc != PROC_NULL) { + task_t task = proc_task(src_proc); + if (task != TASK_NULL) { + audit_token_t audit_token; + mach_msg_type_number_t count = TASK_AUDIT_TOKEN_COUNT; + kern_return_t rc = task_info(task, TASK_AUDIT_TOKEN, (task_info_t)&audit_token, &count); + if (rc == KERN_SUCCESS) { + error = flow_divert_packet_append_tlv(connect_packet, + FLOW_DIVERT_TLV_APP_AUDIT_TOKEN, + sizeof(audit_token_t), + &audit_token); + if (error) { + FDLOG(LOG_ERR, fd_cb, "failed to append app audit token: %d", error); + error = 0; /* do not treat this as fatal error, proceed */ + } + } else { + FDLOG(LOG_ERR, fd_cb, "failed to retrieve app audit token: %d", rc); + } + } + } + if (src_proc != PROC_NULL) { proc_unlock(src_proc); if (release_proc) { @@ -1768,12 +1815,38 @@ flow_divert_handle_connect_result(struct flow_divert_pcb *fd_cb, mbuf_t packet, } fd_cb->local_address = dup_sockaddr((struct sockaddr *)&local_address, 1); } + if (flow_divert_is_sockaddr_valid((struct sockaddr *)&local_address)) { + if (inp->inp_vflag & INP_IPV4 && local_address.ss_family == AF_INET) { + struct sockaddr_in *local_in_address = (struct sockaddr_in *)&local_address; + inp->inp_lport = local_in_address->sin_port; + memcpy(&inp->inp_laddr, &local_in_address->sin_addr, sizeof(struct in_addr)); + } else if (inp->inp_vflag & INP_IPV6 && local_address.ss_family == AF_INET6) { + struct sockaddr_in6 *local_in6_address = (struct sockaddr_in6 *)&local_address; + inp->inp_lport = local_in6_address->sin6_port; + memcpy(&inp->in6p_laddr, &local_in6_address->sin6_addr, sizeof(struct in6_addr)); + } + } if (remote_address.ss_family != 0) { + if (fd_cb->remote_address != NULL) { + FREE(fd_cb->remote_address, M_SONAME); + fd_cb->remote_address = NULL; + } if (remote_address.ss_len > sizeof(remote_address)) { remote_address.ss_len = sizeof(remote_address); } fd_cb->remote_address = dup_sockaddr((struct sockaddr *)&remote_address, 1); + if (flow_divert_is_sockaddr_valid((struct sockaddr *)&remote_address)) { + if (inp->inp_vflag & INP_IPV4 && remote_address.ss_family == AF_INET) { + struct sockaddr_in *remote_in_address = (struct sockaddr_in *)&remote_address; + inp->inp_fport = remote_in_address->sin_port; + memcpy(&inp->inp_faddr, &remote_in_address->sin_addr, sizeof(struct in_addr)); + } else if (inp->inp_vflag & INP_IPV6 && remote_address.ss_family == AF_INET6) { + struct sockaddr_in6 *remote_in6_address = (struct sockaddr_in6 *)&remote_address; + inp->inp_fport = remote_in6_address->sin6_port; + memcpy(&inp->in6p_faddr, &remote_in6_address->sin6_addr, sizeof(struct in6_addr)); + } + } } else { error = EINVAL; goto set_socket_state; @@ -1857,6 +1930,15 @@ set_socket_state: } flow_divert_disconnect_socket(fd_cb->so); } else { +#if NECP + /* Update NECP client with connected five-tuple */ + if (!uuid_is_null(inp->necp_client_uuid)) { + socket_unlock(fd_cb->so, 0); + necp_client_assign_from_socket(fd_cb->so->last_pid, inp->necp_client_uuid, inp); + socket_lock(fd_cb->so, 0); + } +#endif /* NECP */ + flow_divert_send_buffered_data(fd_cb, FALSE); soisconnected(fd_cb->so); } @@ -1917,26 +1999,27 @@ flow_divert_handle_close(struct flow_divert_pcb *fd_cb, mbuf_t packet, int offse static mbuf_t flow_divert_get_control_mbuf(struct flow_divert_pcb *fd_cb) { - if (fd_cb->local_address != NULL) { - struct inpcb *inp = sotoinpcb(fd_cb->so); - if ((inp->inp_vflag & INP_IPV4) && - (inp->inp_flags & INP_RECVDSTADDR) && - fd_cb->local_address->sa_family == AF_INET && - fd_cb->local_address->sa_len >= sizeof(struct sockaddr_in)) { + struct inpcb *inp = sotoinpcb(fd_cb->so); + if ((inp->inp_vflag & INP_IPV4) && (inp->inp_flags & INP_RECVDSTADDR)) { + struct in_addr ia = { }; + + if (fd_cb->local_address != NULL && fd_cb->local_address->sa_family == AF_INET && fd_cb->local_address->sa_len >= sizeof(struct sockaddr_in)) { struct sockaddr_in *sin = (struct sockaddr_in *)(void *)fd_cb->local_address; + bcopy(&sin->sin_addr, &ia, sizeof(struct in_addr)); + } - return sbcreatecontrol((caddr_t) &sin->sin_addr, sizeof(struct in_addr), IP_RECVDSTADDR, IPPROTO_IP); - } else if ((inp->inp_vflag & INP_IPV6) && - (inp->inp_flags & IN6P_PKTINFO) && - fd_cb->local_address->sa_family == AF_INET6 && - fd_cb->local_address->sa_len >= sizeof(struct sockaddr_in6)) { - struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)(void *)fd_cb->local_address; - struct in6_pktinfo pi6; + return sbcreatecontrol((caddr_t)&ia, sizeof(ia), IP_RECVDSTADDR, IPPROTO_IP); + } else if ((inp->inp_vflag & INP_IPV6) && (inp->inp_flags & IN6P_PKTINFO)) { + struct in6_pktinfo pi6; + memset(&pi6, 0, sizeof(pi6)); + if (fd_cb->local_address != NULL && fd_cb->local_address->sa_family == AF_INET6 && fd_cb->local_address->sa_len >= sizeof(struct sockaddr_in6)) { + struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)(void *)fd_cb->local_address; bcopy(&sin6->sin6_addr, &pi6.ipi6_addr, sizeof(struct in6_addr)); pi6.ipi6_ifindex = 0; - return sbcreatecontrol((caddr_t)&pi6, sizeof(struct in6_pktinfo), IPV6_PKTINFO, IPPROTO_IPV6); } + + return sbcreatecontrol((caddr_t)&pi6, sizeof(pi6), IPV6_PKTINFO, IPPROTO_IPV6); } return NULL; } @@ -1981,7 +2064,8 @@ flow_divert_handle_data(struct flow_divert_pcb *fd_cb, mbuf_t packet, size_t off FDLOG(LOG_ERR, fd_cb, "mbuf_split failed: %d", error); } else { if (flow_divert_check_no_cellular(fd_cb) || - flow_divert_check_no_expensive(fd_cb)) { + flow_divert_check_no_expensive(fd_cb) || + flow_divert_check_no_constrained(fd_cb)) { flow_divert_update_closed_state(fd_cb, SHUT_RDWR, TRUE); flow_divert_send_close(fd_cb, SHUT_RDWR); flow_divert_disconnect_socket(fd_cb->so); @@ -2012,13 +2096,14 @@ flow_divert_handle_data(struct flow_divert_pcb *fd_cb, mbuf_t packet, size_t off } mctl = flow_divert_get_control_mbuf(fd_cb); - if (sbappendaddr(&fd_cb->so->so_rcv, append_sa, data, mctl, NULL)) { + int append_error = 0; + if (sbappendaddr(&fd_cb->so->so_rcv, append_sa, data, mctl, &append_error)) { fd_cb->bytes_received += data_size; flow_divert_add_data_statistics(fd_cb, data_size, FALSE); fd_cb->sb_size = fd_cb->so->so_rcv.sb_cc; sorwakeup(fd_cb->so); data = NULL; - } else { + } else if (append_error != EJUSTRETURN) { FDLOG0(LOG_ERR, fd_cb, "received data, but sbappendaddr failed"); } if (!error) { @@ -2082,6 +2167,11 @@ flow_divert_handle_group_init(struct flow_divert_group *group, mbuf_t packet, in lck_rw_lock_exclusive(&group->lck); + if (group->token_key != NULL) { + FREE(group->token_key, M_TEMP); + group->token_key = NULL; + } + MALLOC(group->token_key, uint8_t *, key_size, M_TEMP, M_WAITOK); error = flow_divert_packet_get_tlv(packet, offset, FLOW_DIVERT_TLV_TOKEN_KEY, key_size, group->token_key, NULL); if (error) { @@ -2554,6 +2644,12 @@ flow_divert_append_target_endpoint_tlv(mbuf_t connect_packet, struct sockaddr *t int error = 0; int port = 0; + if (!flow_divert_is_sockaddr_valid(toaddr)) { + FDLOG(LOG_ERR, &nil_pcb, "Invalid target address, family = %u, length = %u", toaddr->sa_family, toaddr->sa_len); + error = EINVAL; + goto done; + } + error = flow_divert_packet_append_tlv(connect_packet, FLOW_DIVERT_TLV_TARGET_ADDRESS, toaddr->sa_len, toaddr); if (error) { goto done; @@ -2594,13 +2690,13 @@ flow_divert_is_sockaddr_valid(struct sockaddr *addr) { switch (addr->sa_family) { case AF_INET: - if (addr->sa_len != sizeof(struct sockaddr_in)) { + if (addr->sa_len < sizeof(struct sockaddr_in)) { return FALSE; } break; #if INET6 case AF_INET6: - if (addr->sa_len != sizeof(struct sockaddr_in6)) { + if (addr->sa_len < sizeof(struct sockaddr_in6)) { return FALSE; } break; @@ -3095,7 +3191,8 @@ flow_divert_data_out(struct socket *so, int flags, mbuf_t data, struct sockaddr } error = flow_divert_check_no_cellular(fd_cb) || - flow_divert_check_no_expensive(fd_cb); + flow_divert_check_no_expensive(fd_cb) || + flow_divert_check_no_constrained(fd_cb); if (error) { goto done; } @@ -3103,6 +3200,21 @@ flow_divert_data_out(struct socket *so, int flags, mbuf_t data, struct sockaddr /* Implicit connect */ if (!(fd_cb->flags & FLOW_DIVERT_CONNECT_STARTED)) { FDLOG0(LOG_INFO, fd_cb, "implicit connect"); + +#if CONTENT_FILTER + /* + * If the socket is subject to a UDP Content Filter and no remote address is passed in, + * retrieve the CFIL saved remote address from the mbuf and use it. + */ + if (to == NULL && so->so_cfil_db) { + struct sockaddr *cfil_faddr = NULL; + struct m_tag *cfil_tag = cfil_udp_get_socket_state(data, NULL, NULL, &cfil_faddr); + if (cfil_tag) { + to = (struct sockaddr *)(void *)cfil_faddr; + } + FDLOG(LOG_INFO, fd_cb, "Using remote address from CFIL saved state: %p", to); + } +#endif error = flow_divert_connect_out(so, to, p); if (error) { goto done; @@ -3658,8 +3770,21 @@ flow_divert_kctl_disconnect(kern_ctl_ref kctlref __unused, uint32_t unit, void * panic("group with unit %d (%p) != unit info (%p)", unit, group, unitinfo); } + g_flow_divert_groups[unit] = NULL; + g_active_group_count--; + + if (g_active_group_count == 0) { + FREE(g_flow_divert_groups, M_TEMP); + g_flow_divert_groups = NULL; + } + + lck_rw_done(&g_flow_divert_group_lck); + if (group != NULL) { flow_divert_close_all(group); + + lck_rw_lock_exclusive(&group->lck); + if (group->token_key != NULL) { memset(group->token_key, 0, group->token_key_size); FREE(group->token_key, M_TEMP); @@ -3674,20 +3799,13 @@ flow_divert_kctl_disconnect(kern_ctl_ref kctlref __unused, uint32_t unit, void * memset(&group->signing_id_trie, 0, sizeof(group->signing_id_trie)); group->signing_id_trie.root = NULL_TRIE_IDX; + lck_rw_done(&group->lck); + FREE_ZONE(group, sizeof(*group), M_FLOW_DIVERT_GROUP); - g_flow_divert_groups[unit] = NULL; - g_active_group_count--; } else { error = EINVAL; } - if (g_active_group_count == 0) { - FREE(g_flow_divert_groups, M_TEMP); - g_flow_divert_groups = NULL; - } - - lck_rw_done(&g_flow_divert_group_lck); - return error; } diff --git a/bsd/netinet/flow_divert_proto.h b/bsd/netinet/flow_divert_proto.h index 84c39eb66..5961653b9 100644 --- a/bsd/netinet/flow_divert_proto.h +++ b/bsd/netinet/flow_divert_proto.h @@ -69,6 +69,7 @@ #define FLOW_DIVERT_TLV_FLAGS 29 #define FLOW_DIVERT_TLV_FLOW_TYPE 30 #define FLOW_DIVERT_TLV_APP_DATA 31 +#define FLOW_DIVERT_TLV_APP_AUDIT_TOKEN 32 #define FLOW_DIVERT_FLOW_TYPE_TCP 1 #define FLOW_DIVERT_FLOW_TYPE_UDP 3 diff --git a/bsd/netinet/igmp.c b/bsd/netinet/igmp.c index 6f4372935..4e7daeaf1 100644 --- a/bsd/netinet/igmp.c +++ b/bsd/netinet/igmp.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2016 Apple Inc. All rights reserved. + * Copyright (c) 2000-2019 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -202,7 +202,7 @@ static struct igmpstat_v3 igmpstat_v3 = { .igps_len = sizeof(struct igmpstat_v3), }; static struct igmpstat igmpstat; /* old IGMPv2 stats structure */ -static struct timeval igmp_gsrdelay = {10, 0}; +static struct timeval igmp_gsrdelay = {.tv_sec = 10, .tv_usec = 0}; static int igmp_recvifkludge = 1; static int igmp_sendra = 1; @@ -847,7 +847,7 @@ igmp_input_v1_query(struct ifnet *ifp, const struct ip *ip, struct igmp_ifinfo *igi; struct in_multi *inm; struct in_multistep step; - struct igmp_tparams itp = { 0, 0, 0, 0 }; + struct igmp_tparams itp = { .qpt = 0, .it = 0, .cst = 0, .sct = 0 }; IGMP_LOCK_ASSERT_NOTHELD(); @@ -937,7 +937,7 @@ igmp_input_v2_query(struct ifnet *ifp, const struct ip *ip, struct in_multi *inm; int is_general_query; uint16_t timer; - struct igmp_tparams itp = { 0, 0, 0, 0 }; + struct igmp_tparams itp = { .qpt = 0, .it = 0, .cst = 0, .sct = 0 }; IGMP_LOCK_ASSERT_NOTHELD(); @@ -1104,7 +1104,7 @@ igmp_input_v3_query(struct ifnet *ifp, const struct ip *ip, uint32_t maxresp, nsrc, qqi; uint16_t timer; uint8_t qrv; - struct igmp_tparams itp = { 0, 0, 0, 0 }; + struct igmp_tparams itp = { .qpt = 0, .it = 0, .cst = 0, .sct = 0 }; IGMP_LOCK_ASSERT_NOTHELD(); diff --git a/bsd/netinet/in.c b/bsd/netinet/in.c index 5f464c325..f51b22b4e 100644 --- a/bsd/netinet/in.c +++ b/bsd/netinet/in.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2016 Apple Inc. All rights reserved. + * Copyright (c) 2000-2019 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -77,6 +77,7 @@ #include #include +#include #include #include @@ -778,10 +779,6 @@ inctl_ifaddr(struct ifnet *ifp, struct in_ifaddr *ia, u_long cmd, error = 0; } if (error != 0) { - /* Reset the detaching flag */ - IFA_LOCK(&ia->ia_ifa); - ia->ia_ifa.ifa_debug &= ~IFD_DETACHING; - IFA_UNLOCK(&ia->ia_ifa); break; } @@ -1346,21 +1343,6 @@ in_control(struct socket *so, u_long cmd, caddr_t data, struct ifnet *ifp, if (iap->ia_ifp == ifp && iap->ia_addr.sin_addr.s_addr == sa->sin_addr.s_addr) { - /* - * Avoid the race condition seen when two - * threads process SIOCDIFADDR command - * at the same time (radar 28942007) - */ - if (cmd == SIOCDIFADDR) { - if (iap->ia_ifa.ifa_debug & - IFD_DETACHING) { - IFA_UNLOCK(&iap->ia_ifa); - continue; - } else { - iap->ia_ifa.ifa_debug |= - IFD_DETACHING; - } - } ia = iap; IFA_ADDREF_LOCKED(&iap->ia_ifa); IFA_UNLOCK(&iap->ia_ifa); @@ -1377,15 +1359,12 @@ in_control(struct socket *so, u_long cmd, caddr_t data, struct ifnet *ifp, IFA_LOCK(&iap->ia_ifa); if (iap->ia_addr.sin_family == AF_INET) { ia = iap; + IFA_ADDREF_LOCKED(&iap->ia_ifa); IFA_UNLOCK(&iap->ia_ifa); break; } IFA_UNLOCK(&iap->ia_ifa); } - /* take a reference on ia before releasing lock */ - if (ia != NULL) { - IFA_ADDREF(&ia->ia_ifa); - } ifnet_lock_done(ifp); } } @@ -1444,10 +1423,40 @@ in_control(struct socket *so, u_long cmd, caddr_t data, struct ifnet *ifp, error = EINVAL; goto done; } - if (cmd == SIOCDIFADDR && ia == NULL) { - error = EADDRNOTAVAIL; - goto done; + if (cmd == SIOCDIFADDR) { + if (ia == NULL) { + error = EADDRNOTAVAIL; + goto done; + } + + IFA_LOCK(&ia->ia_ifa); + /* + * Avoid the race condition seen when two + * threads process SIOCDIFADDR command + * at the same time. + */ + while (ia->ia_ifa.ifa_debug & IFD_DETACHING) { + os_log(OS_LOG_DEFAULT, + "Another thread is already attempting to " + "delete IPv4 address: %s on interface %s. " + "Go to sleep and check again after the operation is done", + inet_ntoa(sa->sin_addr), ia->ia_ifp->if_xname); + ia->ia_ifa.ifa_del_waiters++; + (void) msleep(ia->ia_ifa.ifa_del_wc, &ia->ia_ifa.ifa_lock, (PZERO - 1), + __func__, NULL); + IFA_LOCK_ASSERT_HELD(&ia->ia_ifa); + } + + if ((ia->ia_ifa.ifa_debug & IFD_ATTACHED) == 0) { + error = EADDRNOTAVAIL; + IFA_UNLOCK(&ia->ia_ifa); + goto done; + } + + ia->ia_ifa.ifa_debug |= IFD_DETACHING; + IFA_UNLOCK(&ia->ia_ifa); } + /* FALLTHROUGH */ case SIOCSIFADDR: /* struct ifreq */ case SIOCSIFDSTADDR: /* struct ifreq */ @@ -1543,8 +1552,18 @@ in_control(struct socket *so, u_long cmd, caddr_t data, struct ifnet *ifp, error = EOPNOTSUPP; break; } + done: if (ia != NULL) { + if (cmd == SIOCDIFADDR) { + IFA_LOCK(&ia->ia_ifa); + ia->ia_ifa.ifa_debug &= ~IFD_DETACHING; + if (ia->ia_ifa.ifa_del_waiters > 0) { + ia->ia_ifa.ifa_del_waiters = 0; + wakeup(ia->ia_ifa.ifa_del_wc); + } + IFA_UNLOCK(&ia->ia_ifa); + } IFA_REMREF(&ia->ia_ifa); } if (so_unlocked) { @@ -2036,6 +2055,8 @@ in_ifaddr_alloc(int how) bzero(inifa, inifa_size); inifa->ia_ifa.ifa_free = in_ifaddr_free; inifa->ia_ifa.ifa_debug |= IFD_ALLOC; + inifa->ia_ifa.ifa_del_wc = &inifa->ia_ifa.ifa_debug; + inifa->ia_ifa.ifa_del_waiters = 0; ifa_lock_init(&inifa->ia_ifa); if (inifa_debug != 0) { struct in_ifaddr_dbg *inifa_dbg = diff --git a/bsd/netinet/in.h b/bsd/netinet/in.h index 6be1d8272..66674c208 100644 --- a/bsd/netinet/in.h +++ b/bsd/netinet/in.h @@ -205,13 +205,16 @@ #define IPPROTO_ENCAP 98 /* encapsulation header */ #define IPPROTO_APES 99 /* any private encr. scheme */ #define IPPROTO_GMTP 100 /* GMTP*/ -/* 101-254: Partly Unassigned */ +/* 101-252: Partly Unassigned */ #define IPPROTO_PIM 103 /* Protocol Independent Mcast */ #define IPPROTO_IPCOMP 108 /* payload compression (IPComp) */ #define IPPROTO_PGM 113 /* PGM */ #define IPPROTO_SCTP 132 /* SCTP */ -/* 255: Reserved */ +/* 253-254: Experimentation and testing; 255: Reserved (RFC3692) */ /* BSD Private, local use, namespace incursion */ +#ifdef PRIVATE +#define IPPROTO_QUIC 253 /* QUIC protocol (Over UDP) */ +#endif /* PRIVATE */ #define IPPROTO_DIVERT 254 /* divert pseudo-protocol */ #endif /* (!_POSIX_C_SOURCE || _DARWIN_C_SOURCE) */ #define IPPROTO_RAW 255 /* raw IP packet */ @@ -814,7 +817,6 @@ union sockaddr_in_4_6 { struct sockaddr_in sin; struct sockaddr_in6 sin6; }; - #define CLAT46_HDR_EXPANSION_OVERHD (sizeof(struct ip6_hdr) - sizeof(struct ip)) /* diff --git a/bsd/netinet/in_arp.c b/bsd/netinet/in_arp.c index 1aec999a6..d13e22e9a 100644 --- a/bsd/netinet/in_arp.c +++ b/bsd/netinet/in_arp.c @@ -247,6 +247,11 @@ static int arp_verbose; SYSCTL_INT(_net_link_ether_inet, OID_AUTO, verbose, CTLFLAG_RW | CTLFLAG_LOCKED, &arp_verbose, 0, ""); +static uint32_t arp_maxhold_total = 1024; /* max total packets in the holdq */ +SYSCTL_INT(_net_link_ether_inet, OID_AUTO, maxhold_total, + CTLFLAG_RW | CTLFLAG_LOCKED, &arp_maxhold_total, 0, ""); + + /* * Generally protected by rnh_lock; use atomic operations on fields * that are also modified outside of that lock (if needed). @@ -324,15 +329,29 @@ arp_llinfo_free(void *arg) zfree(llinfo_arp_zone, la); } -static void +static bool arp_llinfo_addq(struct llinfo_arp *la, struct mbuf *m) { + classq_pkt_t pkt = CLASSQ_PKT_INITIALIZER(pkt); + + if (arpstat.held >= arp_maxhold_total) { + if (arp_verbose) { + log(LOG_DEBUG, + "%s: dropping packet due to maxhold_total\n", + __func__); + } + atomic_add_32(&arpstat.dropped, 1); + return false; + } + if (qlen(&la->la_holdq) >= qlimit(&la->la_holdq)) { struct mbuf *_m; /* prune less than CTL, else take what's at the head */ - _m = _getq_scidx_lt(&la->la_holdq, SCIDX_CTL); + _getq_scidx_lt(&la->la_holdq, &pkt, SCIDX_CTL); + _m = pkt.cp_mbuf; if (_m == NULL) { - _m = _getq(&la->la_holdq); + _getq(&la->la_holdq, &pkt); + _m = pkt.cp_mbuf; } VERIFY(_m != NULL); if (arp_verbose) { @@ -343,13 +362,16 @@ arp_llinfo_addq(struct llinfo_arp *la, struct mbuf *m) atomic_add_32(&arpstat.dropped, 1); atomic_add_32(&arpstat.held, -1); } - _addq(&la->la_holdq, m); + CLASSQ_PKT_INIT_MBUF(&pkt, m); + _addq(&la->la_holdq, &pkt); atomic_add_32(&arpstat.held, 1); if (arp_verbose) { log(LOG_DEBUG, "%s: enqueued packet (scidx %u), qlen now %u\n", __func__, MBUF_SCIDX(mbuf_get_service_class(m)), qlen(&la->la_holdq)); } + + return true; } static uint32_t @@ -1250,6 +1272,7 @@ arp_lookup_ip(ifnet_t ifp, const struct sockaddr_in *net_dest, uint32_t rtflags; struct sockaddr_dl sdl; boolean_t send_probe_notif = FALSE; + boolean_t enqueued = FALSE; if (ifp == NULL || net_dest == NULL) { return EINVAL; @@ -1455,7 +1478,7 @@ arp_lookup_ip(ifnet_t ifp, const struct sockaddr_in *net_dest, * we still hold the route's rt_lock. */ if (packet != NULL) { - arp_llinfo_addq(llinfo, packet); + enqueued = arp_llinfo_addq(llinfo, packet); } else { llinfo->la_prbreq_cnt++; } @@ -1545,14 +1568,15 @@ arp_lookup_ip(ifnet_t ifp, const struct sockaddr_in *net_dest, * from the time of _addq() above, this packet * must be at the tail. */ - if (packet != NULL) { - struct mbuf *_m = - _getq_tail(&llinfo->la_holdq); + if (packet != NULL && enqueued) { + classq_pkt_t pkt = + CLASSQ_PKT_INITIALIZER(pkt); + + _getq_tail(&llinfo->la_holdq, &pkt); atomic_add_32(&arpstat.held, -1); - VERIFY(_m == packet); + VERIFY(pkt.cp_mbuf == packet); } result = EHOSTUNREACH; - /* * Enqueue work item to invoke callback for this route entry */ @@ -1563,8 +1587,12 @@ arp_lookup_ip(ifnet_t ifp, const struct sockaddr_in *net_dest, } } - /* The packet is now held inside la_holdq */ + /* The packet is now held inside la_holdq or dropped */ result = EJUSTRETURN; + if (packet != NULL && !enqueued) { + mbuf_free(packet); + packet = NULL; + } release: if (result == EHOSTUNREACH) { @@ -1659,14 +1687,11 @@ arp_ip_handle_input(ifnet_t ifp, u_short arpop, /* * Determine if this ARP is for us - * For a bridge, we want to check the address irrespective - * of the receive interface. */ lck_rw_lock_shared(in_ifaddr_rwlock); TAILQ_FOREACH(ia, INADDR_HASH(target_ip->sin_addr.s_addr), ia_hash) { IFA_LOCK_SPIN(&ia->ia_ifa); - if (((bridged && ia->ia_ifp->if_bridge != NULL) || - (ia->ia_ifp == ifp)) && + if (ia->ia_ifp == ifp && ia->ia_addr.sin_addr.s_addr == target_ip->sin_addr.s_addr) { best_ia = ia; best_ia_sin = best_ia->ia_addr; @@ -1680,8 +1705,7 @@ arp_ip_handle_input(ifnet_t ifp, u_short arpop, TAILQ_FOREACH(ia, INADDR_HASH(sender_ip->sin_addr.s_addr), ia_hash) { IFA_LOCK_SPIN(&ia->ia_ifa); - if (((bridged && ia->ia_ifp->if_bridge != NULL) || - (ia->ia_ifp == ifp)) && + if (ia->ia_ifp == ifp && ia->ia_addr.sin_addr.s_addr == sender_ip->sin_addr.s_addr) { best_ia = ia; best_ia_sin = best_ia->ia_addr; @@ -2132,8 +2156,11 @@ match: if (!qempty(&llinfo->la_holdq)) { uint32_t held; - struct mbuf *m0 = - _getq_all(&llinfo->la_holdq, NULL, &held, NULL); + struct mbuf *m0; + classq_pkt_t pkt = CLASSQ_PKT_INITIALIZER(pkt); + + _getq_all(&llinfo->la_holdq, &pkt, NULL, &held, NULL); + m0 = pkt.cp_mbuf; if (arp_verbose) { log(LOG_DEBUG, "%s: sending %u held packets\n", __func__, held); diff --git a/bsd/netinet/in_mcast.c b/bsd/netinet/in_mcast.c index 03532eaa3..b4eed3bd8 100644 --- a/bsd/netinet/in_mcast.c +++ b/bsd/netinet/in_mcast.c @@ -2772,11 +2772,6 @@ inp_set_multicast_if(struct inpcb *inp, struct sockopt *sopt) return EADDRNOTAVAIL; } } - /* XXX remove? */ -#ifdef IGMP_DEBUG0 - IGMP_PRINTF(("%s: ifp = 0x%llx, addr = %s\n", __func__, - (uint64_t)VM_KERNEL_ADDRPERM(ifp), inet_ntoa(addr))); -#endif } /* Reject interfaces which do not support multicast. */ diff --git a/bsd/netinet/in_pcb.c b/bsd/netinet/in_pcb.c index 6f2754373..d1627fb39 100644 --- a/bsd/netinet/in_pcb.c +++ b/bsd/netinet/in_pcb.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2018 Apple Inc. All rights reserved. + * Copyright (c) 2000-2019 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -94,11 +94,13 @@ #include #include #include +#include #include #include #include #include + #if INET6 #include #include @@ -120,6 +122,10 @@ #include #include +#include + +extern const char *proc_name_address(struct proc *); + static lck_grp_t *inpcb_lock_grp; static lck_attr_t *inpcb_lock_attr; static lck_grp_attr_t *inpcb_lock_grp_attr; @@ -173,6 +179,20 @@ sysctl_net_ipport_check SYSCTL_HANDLER_ARGS { #pragma unused(arg1, arg2) int error; +#if (DEBUG | DEVELOPMENT) + int old_value = *(int *)oidp->oid_arg1; + /* + * For unit testing allow a non-superuser process with the + * proper entitlement to modify the variables + */ + if (req->newptr) { + if (proc_suser(current_proc()) != 0 && + (error = priv_check_cred(kauth_cred_get(), + PRIV_NETINET_RESERVEDPORT, 0))) { + return EPERM; + } + } +#endif /* (DEBUG | DEVELOPMENT) */ error = sysctl_handle_int(oidp, oidp->oid_arg1, oidp->oid_arg2, req); if (!error) { @@ -183,6 +203,14 @@ sysctl_net_ipport_check SYSCTL_HANDLER_ARGS RANGECHK(ipport_hifirstauto, IPPORT_RESERVED, USHRT_MAX); RANGECHK(ipport_hilastauto, IPPORT_RESERVED, USHRT_MAX); } + +#if (DEBUG | DEVELOPMENT) + os_log(OS_LOG_DEFAULT, + "%s:%u sysctl net.restricted_port.verbose: %d -> %d)", + proc_best_name(current_proc()), proc_selfpid(), + old_value, *(int *)oidp->oid_arg1); +#endif /* (DEBUG | DEVELOPMENT) */ + return error; } @@ -191,23 +219,29 @@ sysctl_net_ipport_check SYSCTL_HANDLER_ARGS SYSCTL_NODE(_net_inet_ip, IPPROTO_IP, portrange, CTLFLAG_RW | CTLFLAG_LOCKED, 0, "IP Ports"); +#if (DEBUG | DEVELOPMENT) +#define CTLFAGS_IP_PORTRANGE (CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED | CTLFLAG_ANYBODY) +#else +#define CTLFAGS_IP_PORTRANGE (CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED) +#endif /* (DEBUG | DEVELOPMENT) */ + SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, lowfirst, - CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, + CTLFAGS_IP_PORTRANGE, &ipport_lowfirstauto, 0, &sysctl_net_ipport_check, "I", ""); SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, lowlast, - CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, + CTLFAGS_IP_PORTRANGE, &ipport_lowlastauto, 0, &sysctl_net_ipport_check, "I", ""); SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, first, - CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, + CTLFAGS_IP_PORTRANGE, &ipport_firstauto, 0, &sysctl_net_ipport_check, "I", ""); SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, last, - CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, + CTLFAGS_IP_PORTRANGE, &ipport_lastauto, 0, &sysctl_net_ipport_check, "I", ""); SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, hifirst, - CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, + CTLFAGS_IP_PORTRANGE, &ipport_hifirstauto, 0, &sysctl_net_ipport_check, "I", ""); SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, hilast, - CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, + CTLFAGS_IP_PORTRANGE, &ipport_hilastauto, 0, &sysctl_net_ipport_check, "I", ""); static uint32_t apn_fallbk_debug = 0; @@ -652,7 +686,7 @@ in_pcballoc(struct socket *so, struct inpcbinfo *pcbinfo, struct proc *p) * in_pcblookup_local_and_cleanup does everything * in_pcblookup_local does but it checks for a socket * that's going away. Since we know that the lock is - * held read+write when this funciton is called, we + * held read+write when this function is called, we * can safely dispose of this socket like the slow * timer would usually do and return NULL. This is * great for bind. @@ -816,13 +850,16 @@ in_pcbbind(struct inpcb *inp, struct sockaddr *nam, struct proc *p) IFA_REMREF(ifa); } } + + if (lport != 0) { struct inpcb *t; uid_t u; #if !CONFIG_EMBEDDED if (ntohs(lport) < IPPORT_RESERVED && - SIN(nam)->sin_addr.s_addr != 0) { + SIN(nam)->sin_addr.s_addr != 0 && + !(inp->inp_flags2 & INP2_EXTERNAL_PORT)) { cred = kauth_cred_proc_ref(p); error = priv_check_cred(cred, PRIV_NETINET_RESERVEDPORT, 0); @@ -834,6 +871,16 @@ in_pcbbind(struct inpcb *inp, struct sockaddr *nam, struct proc *p) } } #endif /* !CONFIG_EMBEDDED */ + /* + * Check wether the process is allowed to bind to a restricted port + */ + if (!current_task_can_use_restricted_in_port(lport, + so->so_proto->pr_protocol, PORT_FLAGS_BSD)) { + lck_rw_done(pcbinfo->ipi_lock); + socket_lock(so, 0); + return EADDRINUSE; + } + if (!IN_MULTICAST(ntohl(SIN(nam)->sin_addr.s_addr)) && (u = kauth_cred_getuid(so->so_cred)) != 0 && (t = in_pcblookup_local_and_cleanup( @@ -845,7 +892,10 @@ in_pcbbind(struct inpcb *inp, struct sockaddr *nam, struct proc *p) (u != kauth_cred_getuid(t->inp_socket->so_cred)) && !(t->inp_socket->so_flags & SOF_REUSESHAREUID) && (SIN(nam)->sin_addr.s_addr != INADDR_ANY || - t->inp_laddr.s_addr != INADDR_ANY)) { + t->inp_laddr.s_addr != INADDR_ANY) && + (!(t->inp_flags2 & INP2_EXTERNAL_PORT) || + !(inp->inp_flags2 & INP2_EXTERNAL_PORT) || + uuid_compare(t->necp_client_uuid, inp->necp_client_uuid) != 0)) { if ((t->inp_socket->so_flags & SOF_NOTIFYCONFLICT) && !(so->so_flags & SOF_NOTIFYCONFLICT)) { @@ -864,7 +914,10 @@ in_pcbbind(struct inpcb *inp, struct sockaddr *nam, struct proc *p) t = in_pcblookup_local_and_cleanup(pcbinfo, SIN(nam)->sin_addr, lport, wild); if (t != NULL && - (reuseport & t->inp_socket->so_options) == 0) { + (reuseport & t->inp_socket->so_options) == 0 && + (!(t->inp_flags2 & INP2_EXTERNAL_PORT) || + !(inp->inp_flags2 & INP2_EXTERNAL_PORT) || + uuid_compare(t->necp_client_uuid, inp->necp_client_uuid) != 0)) { #if INET6 if (SIN(nam)->sin_addr.s_addr != INADDR_ANY || t->inp_laddr.s_addr != INADDR_ANY || @@ -895,6 +948,13 @@ in_pcbbind(struct inpcb *inp, struct sockaddr *nam, struct proc *p) int count; bool found; + /* + * Override wild = 1 for implicit bind (mainly used by connect) + * For implicit bind (lport == 0), we always use an unused port, + * so REUSEADDR|REUSEPORT don't apply + */ + wild = 1; + randomport = (so->so_flags & SOF_BINDRANDOMPORT) || (so->so_type == SOCK_STREAM ? tcp_use_randomport : udp_use_randomport); @@ -967,6 +1027,14 @@ in_pcbbind(struct inpcb *inp, struct sockaddr *nam, struct proc *p) } lport = htons(*lastport); + /* + * Skip if this is a restricted port as we do not want to + * restricted ports as ephemeral + */ + if (IS_RESTRICTED_IN_PORT(lport)) { + continue; + } + found = in_pcblookup_local_and_cleanup(pcbinfo, lookup_addr, lport, wild) == NULL; } while (!found); @@ -999,6 +1067,14 @@ in_pcbbind(struct inpcb *inp, struct sockaddr *nam, struct proc *p) } lport = htons(*lastport); + /* + * Skip if this is a restricted port as we do not want to + * restricted ports as ephemeral + */ + if (IS_RESTRICTED_IN_PORT(lport)) { + continue; + } + found = in_pcblookup_local_and_cleanup(pcbinfo, lookup_addr, lport, wild) == NULL; } while (!found); @@ -1159,7 +1235,7 @@ apn_fallback_required(proc_t proc, struct socket *so, struct sockaddr_in *p_dstv bzero(&sb, sizeof(struct stat64)); context = vfs_context_create(NULL); - vn_stat_error = vn_stat(proc->p_textvp, &sb, NULL, 1, context); + vn_stat_error = vn_stat(proc->p_textvp, &sb, NULL, 1, 0, context); (void)vfs_context_rele(context); if (vn_stat_error != 0 || @@ -2172,6 +2248,12 @@ in_pcblookup_hash_exists(struct inpcbinfo *pcbinfo, struct in_addr faddr, continue; } +#if NECP + if (!necp_socket_is_allowed_to_recv_on_interface(inp, ifp)) { + continue; + } +#endif /* NECP */ + if (inp->inp_faddr.s_addr == faddr.s_addr && inp->inp_laddr.s_addr == laddr.s_addr && inp->inp_fport == fport && @@ -2210,6 +2292,12 @@ in_pcblookup_hash_exists(struct inpcbinfo *pcbinfo, struct in_addr faddr, continue; } +#if NECP + if (!necp_socket_is_allowed_to_recv_on_interface(inp, ifp)) { + continue; + } +#endif /* NECP */ + if (inp->inp_faddr.s_addr == INADDR_ANY && inp->inp_lport == lport) { if (inp->inp_laddr.s_addr == laddr.s_addr) { @@ -2295,6 +2383,12 @@ in_pcblookup_hash(struct inpcbinfo *pcbinfo, struct in_addr faddr, continue; } +#if NECP + if (!necp_socket_is_allowed_to_recv_on_interface(inp, ifp)) { + continue; + } +#endif /* NECP */ + if (inp->inp_faddr.s_addr == faddr.s_addr && inp->inp_laddr.s_addr == laddr.s_addr && inp->inp_fport == fport && @@ -2334,6 +2428,12 @@ in_pcblookup_hash(struct inpcbinfo *pcbinfo, struct in_addr faddr, continue; } +#if NECP + if (!necp_socket_is_allowed_to_recv_on_interface(inp, ifp)) { + continue; + } +#endif /* NECP */ + if (inp->inp_faddr.s_addr == INADDR_ANY && inp->inp_lport == lport) { if (inp->inp_laddr.s_addr == laddr.s_addr) { @@ -2897,6 +2997,15 @@ inp_set_noexpensive(struct inpcb *inp) ROUTE_RELEASE(&inp->inp_route); } +void +inp_set_noconstrained(struct inpcb *inp) +{ + inp->inp_flags2 |= INP2_NO_IFF_CONSTRAINED; + + /* Blow away any cached route in the PCB */ + ROUTE_RELEASE(&inp->inp_route); +} + void inp_set_awdl_unrestricted(struct inpcb *inp) { @@ -3233,6 +3342,8 @@ inp_get_soprocinfo(struct inpcb *inp, struct so_procinfo *soprocinfo) struct socket *so = inp->inp_socket; soprocinfo->spi_pid = so->last_pid; + strlcpy(&soprocinfo->spi_proc_name[0], &inp->inp_last_proc_name[0], + sizeof(soprocinfo->spi_proc_name)); if (so->last_pid != 0) { uuid_copy(soprocinfo->spi_uuid, so->last_uuid); } @@ -3247,6 +3358,8 @@ inp_get_soprocinfo(struct inpcb *inp, struct so_procinfo *soprocinfo) soprocinfo->spi_delegated = 0; soprocinfo->spi_epid = so->last_pid; } + strlcpy(&soprocinfo->spi_e_proc_name[0], &inp->inp_e_proc_name[0], + sizeof(soprocinfo->spi_e_proc_name)); } int @@ -3479,6 +3592,10 @@ _inp_restricted_recv(struct inpcb *inp, struct ifnet *ifp) return TRUE; } + if (IFNET_IS_CONSTRAINED(ifp) && INP_NO_CONSTRAINED(inp)) { + return TRUE; + } + if (IFNET_IS_AWDL_RESTRICTED(ifp) && !INP_AWDL_UNRESTRICTED(inp)) { return TRUE; } @@ -3545,6 +3662,10 @@ _inp_restricted_send(struct inpcb *inp, struct ifnet *ifp) return TRUE; } + if (IFNET_IS_CONSTRAINED(ifp) && INP_NO_CONSTRAINED(inp)) { + return TRUE; + } + if (IFNET_IS_AWDL_RESTRICTED(ifp) && !INP_AWDL_UNRESTRICTED(inp)) { return TRUE; } @@ -3576,8 +3697,7 @@ inp_count_sndbytes(struct inpcb *inp, u_int32_t th_ack) struct ifnet *ifp = inp->inp_last_outifp; struct socket *so = inp->inp_socket; if (ifp != NULL && !(so->so_flags & SOF_MP_SUBFLOW) && - (ifp->if_type == IFT_CELLULAR || - ifp->if_subfamily == IFNET_SUBFAMILY_WIFI)) { + (ifp->if_type == IFT_CELLULAR || IFNET_IS_WIFI(ifp))) { int32_t unsent; so->so_snd.sb_flags |= SB_SNDBYTE_CNT; @@ -3636,13 +3756,13 @@ inp_incr_sndbytes_unsent(struct socket *so, int32_t len) inline void inp_decr_sndbytes_unsent(struct socket *so, int32_t len) { - struct inpcb *inp = (struct inpcb *)so->so_pcb; - struct ifnet *ifp = inp->inp_last_outifp; - if (so == NULL || !(so->so_snd.sb_flags & SB_SNDBYTE_CNT)) { return; } + struct inpcb *inp = (struct inpcb *)so->so_pcb; + struct ifnet *ifp = inp->inp_last_outifp; + if (ifp != NULL) { if (ifp->if_sndbyte_unsent >= len) { OSAddAtomic64(-len, &ifp->if_sndbyte_unsent); @@ -3677,3 +3797,40 @@ inp_get_activity_bitmap(struct inpcb *inp, activity_bitmap_t *ab) { bcopy(&inp->inp_nw_activity, ab, sizeof(*ab)); } + +void +inp_update_last_owner(struct socket *so, struct proc *p, struct proc *ep) +{ + struct inpcb *inp = (struct inpcb *)so->so_pcb; + + if (inp == NULL) { + return; + } + + if (p != NULL) { + strlcpy(&inp->inp_last_proc_name[0], proc_name_address(p), sizeof(inp->inp_last_proc_name)); + } + if (so->so_flags & SOF_DELEGATED) { + if (ep != NULL) { + strlcpy(&inp->inp_e_proc_name[0], proc_name_address(ep), sizeof(inp->inp_e_proc_name)); + } else { + inp->inp_e_proc_name[0] = 0; + } + } else { + inp->inp_e_proc_name[0] = 0; + } +} + +void +inp_copy_last_owner(struct socket *so, struct socket *head) +{ + struct inpcb *inp = (struct inpcb *)so->so_pcb; + struct inpcb *head_inp = (struct inpcb *)head->so_pcb; + + if (inp == NULL || head_inp == NULL) { + return; + } + + strlcpy(&inp->inp_last_proc_name[0], &head_inp->inp_last_proc_name[0], sizeof(inp->inp_last_proc_name)); + strlcpy(&inp->inp_e_proc_name[0], &head_inp->inp_e_proc_name[0], sizeof(inp->inp_e_proc_name)); +} diff --git a/bsd/netinet/in_pcb.h b/bsd/netinet/in_pcb.h index 90e0e0769..a5ec42ab2 100644 --- a/bsd/netinet/in_pcb.h +++ b/bsd/netinet/in_pcb.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2018 Apple Inc. All rights reserved. + * Copyright (c) 2000-2019 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -160,6 +160,7 @@ struct inpcb { u_int32_t inp_flow; /* IPv6 flow information */ u_char inp_sndinprog_cnt; /* outstanding send operations */ + uint32_t inp_sndingprog_waiters;/* waiters for outstanding send */ u_char inp_vflag; /* INP_IPV4 or INP_IPV6 */ u_char inp_ip_ttl; /* time to live proto */ @@ -212,7 +213,7 @@ struct inpcb { struct label *inp_label; /* MAC label */ #endif #if IPSEC - struct inpcbpolicy *inp_sp; /* for IPSec */ + struct inpcbpolicy *inp_sp; /* for IPsec */ #endif /* IPSEC */ #if NECP struct { @@ -238,6 +239,9 @@ struct inpcb { u_int8_t inp_Wstat_store[sizeof(struct inp_stat) + sizeof(u_int64_t)]; activity_bitmap_t inp_nw_activity; u_int64_t inp_start_timestamp; + + char inp_last_proc_name[MAXCOMLEN + 1]; + char inp_e_proc_name[MAXCOMLEN + 1]; }; #define INP_ADD_STAT(_inp, _cnt_cellular, _cnt_wifi, _cnt_wired, _a, _n) \ @@ -624,6 +628,8 @@ struct inpcbinfo { ((_inp)->inp_flags & INP_NO_IFT_CELLULAR) #define INP_NO_EXPENSIVE(_inp) \ ((_inp)->inp_flags2 & INP2_NO_IFF_EXPENSIVE) +#define INP_NO_CONSTRAINED(_inp) \ + ((_inp)->inp_flags2 & INP2_NO_IFF_CONSTRAINED) #define INP_AWDL_UNRESTRICTED(_inp) \ ((_inp)->inp_flags2 & INP2_AWDL_UNRESTRICTED) #define INP_INTCOPROC_ALLOWED(_inp) \ @@ -709,6 +715,8 @@ struct inpcbinfo { #define INP2_INTCOPROC_ALLOWED 0x00000080 /* Allow communication via internal co-processor interfaces */ #define INP2_CONNECT_IN_PROGRESS 0x00000100 /* A connect call is in progress, so binds are intermediate steps */ #define INP2_CLAT46_FLOW 0x00000200 /* The flow is going to use CLAT46 path */ +#define INP2_EXTERNAL_PORT 0x00000400 /* The port is registered externally, for NECP listeners */ +#define INP2_NO_IFF_CONSTRAINED 0x00000800 /* do not use constrained interface */ /* * Flags passed to in_pcblookup*() functions. @@ -807,6 +815,7 @@ extern int inp_bindif(struct inpcb *, unsigned int, struct ifnet **); extern void inp_set_nocellular(struct inpcb *); extern void inp_clear_nocellular(struct inpcb *); extern void inp_set_noexpensive(struct inpcb *); +extern void inp_set_noconstrained(struct inpcb *); extern void inp_set_awdl_unrestricted(struct inpcb *); extern boolean_t inp_get_awdl_unrestricted(struct inpcb *); extern void inp_clear_awdl_unrestricted(struct inpcb *); @@ -838,6 +847,8 @@ extern int32_t inp_get_sndbytes_allunsent(struct socket *, u_int32_t); extern void inp_decr_sndbytes_allunsent(struct socket *, u_int32_t); extern void inp_set_activity_bitmap(struct inpcb *inp); extern void inp_get_activity_bitmap(struct inpcb *inp, activity_bitmap_t *b); +extern void inp_update_last_owner(struct socket *so, struct proc *p, struct proc *ep); +extern void inp_copy_last_owner(struct socket *so, struct socket *head); #endif /* BSD_KERNEL_PRIVATE */ #ifdef KERNEL_PRIVATE /* exported for PPP */ diff --git a/bsd/netinet/in_pcblist.c b/bsd/netinet/in_pcblist.c index 73b55db69..dcd59d9c3 100644 --- a/bsd/netinet/in_pcblist.c +++ b/bsd/netinet/in_pcblist.c @@ -112,50 +112,54 @@ sotoxsocket_n(struct socket *so, struct xsocket_n *xso) xso->xso_len = sizeof(struct xsocket_n); xso->xso_kind = XSO_SOCKET; - if (so != NULL) { - xso->xso_so = (uint64_t)VM_KERNEL_ADDRPERM(so); - xso->so_type = so->so_type; - xso->so_options = so->so_options; - xso->so_linger = so->so_linger; - xso->so_state = so->so_state; - xso->so_pcb = (uint64_t)VM_KERNEL_ADDRPERM(so->so_pcb); - if (so->so_proto) { - xso->xso_protocol = SOCK_PROTO(so); - xso->xso_family = SOCK_DOM(so); - } else { - xso->xso_protocol = xso->xso_family = 0; - } - xso->so_qlen = so->so_qlen; - xso->so_incqlen = so->so_incqlen; - xso->so_qlimit = so->so_qlimit; - xso->so_timeo = so->so_timeo; - xso->so_error = so->so_error; - xso->so_pgid = so->so_pgid; - xso->so_oobmark = so->so_oobmark; - xso->so_uid = kauth_cred_getuid(so->so_cred); - xso->so_last_pid = so->last_pid; - xso->so_e_pid = so->e_pid; + if (so == NULL) { + return; + } + + xso->xso_so = (uint64_t)VM_KERNEL_ADDRPERM(so); + xso->so_type = so->so_type; + xso->so_options = so->so_options; + xso->so_linger = so->so_linger; + xso->so_state = so->so_state; + xso->so_pcb = (uint64_t)VM_KERNEL_ADDRPERM(so->so_pcb); + if (so->so_proto) { + xso->xso_protocol = SOCK_PROTO(so); + xso->xso_family = SOCK_DOM(so); + } else { + xso->xso_protocol = xso->xso_family = 0; } + xso->so_qlen = so->so_qlen; + xso->so_incqlen = so->so_incqlen; + xso->so_qlimit = so->so_qlimit; + xso->so_timeo = so->so_timeo; + xso->so_error = so->so_error; + xso->so_pgid = so->so_pgid; + xso->so_oobmark = so->so_oobmark; + xso->so_uid = kauth_cred_getuid(so->so_cred); + xso->so_last_pid = so->last_pid; + xso->so_e_pid = so->e_pid; } __private_extern__ void sbtoxsockbuf_n(struct sockbuf *sb, struct xsockbuf_n *xsb) { xsb->xsb_len = sizeof(struct xsockbuf_n); - xsb->xsb_kind = (sb->sb_flags & SB_RECV) ? XSO_RCVBUF : XSO_SNDBUF; - if (sb != NULL) { - xsb->sb_cc = sb->sb_cc; - xsb->sb_hiwat = sb->sb_hiwat; - xsb->sb_mbcnt = sb->sb_mbcnt; - xsb->sb_mbmax = sb->sb_mbmax; - xsb->sb_lowat = sb->sb_lowat; - xsb->sb_flags = sb->sb_flags; - xsb->sb_timeo = (short)(sb->sb_timeo.tv_sec * hz) + - sb->sb_timeo.tv_usec / tick; - if (xsb->sb_timeo == 0 && sb->sb_timeo.tv_usec != 0) { - xsb->sb_timeo = 1; - } + if (sb == NULL) { + return; + } + + xsb->xsb_kind = (sb->sb_flags & SB_RECV) ? XSO_RCVBUF : XSO_SNDBUF; + xsb->sb_cc = sb->sb_cc; + xsb->sb_hiwat = sb->sb_hiwat; + xsb->sb_mbcnt = sb->sb_mbcnt; + xsb->sb_mbmax = sb->sb_mbmax; + xsb->sb_lowat = sb->sb_lowat; + xsb->sb_flags = sb->sb_flags; + xsb->sb_timeo = (short)(sb->sb_timeo.tv_sec * hz) + + sb->sb_timeo.tv_usec / tick; + if (xsb->sb_timeo == 0 && sb->sb_timeo.tv_usec != 0) { + xsb->sb_timeo = 1; } } @@ -167,6 +171,10 @@ sbtoxsockstat_n(struct socket *so, struct xsockstat_n *xst) xst->xst_len = sizeof(struct xsockstat_n); xst->xst_kind = XSO_STATS; + if (so == NULL) { + return; + } + for (i = 0; i < SO_TC_STATS_MAX; i++) { xst->xst_tc_stats[i].rxpackets = so->so_tc_stats[i].rxpackets; xst->xst_tc_stats[i].rxbytes = so->so_tc_stats[i].rxbytes; diff --git a/bsd/netinet/in_proto.c b/bsd/netinet/in_proto.c index bc3ce2a5e..340f31f65 100644 --- a/bsd/netinet/in_proto.c +++ b/bsd/netinet/in_proto.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2013 Apple Inc. All rights reserved. + * Copyright (c) 2000-2019 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -102,7 +102,6 @@ #if IPSEC_ESP #include #endif -#include #endif /* IPSEC */ static void in_dinit(struct domain *); @@ -139,6 +138,8 @@ static struct protosw inetsw[] = { .pr_lock = udp_lock, .pr_unlock = udp_unlock, .pr_getlock = udp_getlock, + .pr_update_last_owner = inp_update_last_owner, + .pr_copy_last_owner = inp_copy_last_owner, }, { .pr_type = SOCK_STREAM, @@ -155,6 +156,8 @@ static struct protosw inetsw[] = { .pr_lock = tcp_lock, .pr_unlock = tcp_unlock, .pr_getlock = tcp_getlock, + .pr_update_last_owner = inp_update_last_owner, + .pr_copy_last_owner = inp_copy_last_owner, }, { .pr_type = SOCK_RAW, @@ -165,6 +168,8 @@ static struct protosw inetsw[] = { .pr_ctloutput = rip_ctloutput, .pr_usrreqs = &rip_usrreqs, .pr_unlock = rip_unlock, + .pr_update_last_owner = inp_update_last_owner, + .pr_copy_last_owner = inp_copy_last_owner, }, { .pr_type = SOCK_RAW, @@ -174,6 +179,8 @@ static struct protosw inetsw[] = { .pr_ctloutput = rip_ctloutput, .pr_usrreqs = &rip_usrreqs, .pr_unlock = rip_unlock, + .pr_update_last_owner = inp_update_last_owner, + .pr_copy_last_owner = inp_copy_last_owner, }, { .pr_type = SOCK_DGRAM, @@ -183,6 +190,8 @@ static struct protosw inetsw[] = { .pr_ctloutput = icmp_dgram_ctloutput, .pr_usrreqs = &icmp_dgram_usrreqs, .pr_unlock = rip_unlock, + .pr_update_last_owner = inp_update_last_owner, + .pr_copy_last_owner = inp_copy_last_owner, }, { .pr_type = SOCK_RAW, @@ -193,6 +202,8 @@ static struct protosw inetsw[] = { .pr_init = igmp_init, .pr_usrreqs = &rip_usrreqs, .pr_unlock = rip_unlock, + .pr_update_last_owner = inp_update_last_owner, + .pr_copy_last_owner = inp_copy_last_owner, }, { .pr_type = SOCK_RAW, @@ -203,6 +214,8 @@ static struct protosw inetsw[] = { .pr_ctloutput = rip_ctloutput, .pr_usrreqs = &rip_usrreqs, .pr_unlock = rip_unlock, + .pr_update_last_owner = inp_update_last_owner, + .pr_copy_last_owner = inp_copy_last_owner, }, #if IPSEC { @@ -221,14 +234,6 @@ static struct protosw inetsw[] = { .pr_usrreqs = &nousrreqs, }, #endif /* IPSEC_ESP */ - { - .pr_type = SOCK_RAW, - .pr_protocol = IPPROTO_IPCOMP, - .pr_flags = PR_ATOMIC | PR_ADDR | PR_PROTOLOCK, - .pr_input = ipcomp4_input, - .pr_init = ipcomp_init, - .pr_usrreqs = &nousrreqs, - }, #endif /* IPSEC */ { .pr_type = SOCK_RAW, @@ -239,6 +244,8 @@ static struct protosw inetsw[] = { .pr_init = encap4_init, .pr_usrreqs = &rip_usrreqs, .pr_unlock = rip_unlock, + .pr_update_last_owner = inp_update_last_owner, + .pr_copy_last_owner = inp_copy_last_owner, }, #if INET6 { @@ -250,6 +257,8 @@ static struct protosw inetsw[] = { .pr_init = encap4_init, .pr_usrreqs = &rip_usrreqs, .pr_unlock = rip_unlock, + .pr_update_last_owner = inp_update_last_owner, + .pr_copy_last_owner = inp_copy_last_owner, }, #endif /* INET6 */ #if IPDIVERT @@ -275,6 +284,8 @@ static struct protosw inetsw[] = { .pr_init = rip_init, .pr_usrreqs = &rip_usrreqs, .pr_unlock = rip_unlock, + .pr_update_last_owner = inp_update_last_owner, + .pr_copy_last_owner = inp_copy_last_owner, }, }; diff --git a/bsd/netinet/in_stat.c b/bsd/netinet/in_stat.c index 522bf5184..c47255185 100644 --- a/bsd/netinet/in_stat.c +++ b/bsd/netinet/in_stat.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017 Apple Inc. All rights reserved. + * Copyright (c) 2017-2018 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -41,7 +41,8 @@ in_stat_set_activity_bitmap(activity_bitmap_t *activity, uint64_t now) uint64_t elapsed_time, slot; uint64_t *bitmap; if (activity->start == 0) { - activity->start = now; + // Align all activity maps + activity->start = now - (now % IN_STAT_ACTIVITY_GRANULARITY); } elapsed_time = now - activity->start; diff --git a/bsd/netinet/in_systm.h b/bsd/netinet/in_systm.h index 638f14bbe..c191fed8d 100644 --- a/bsd/netinet/in_systm.h +++ b/bsd/netinet/in_systm.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2015 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2018 Apple Computer, Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -85,6 +85,8 @@ typedef __uint32_t n_long; /* long as received from the net */ typedef __uint32_t n_time; /* ms since 00:00 GMT, byte rev */ #ifdef BSD_KERNEL_PRIVATE +#define ABS(v) (((v) > 0) ? (v) : -(v)) + u_int32_t iptime(void); #endif /* BSD_KERNEL_PRIVATE */ #endif /* _NETINET_IN_SYSTM_H_ */ diff --git a/bsd/netinet/in_tclass.c b/bsd/netinet/in_tclass.c index 7d8f336bd..6c939c5aa 100644 --- a/bsd/netinet/in_tclass.c +++ b/bsd/netinet/in_tclass.c @@ -61,8 +61,13 @@ #include #include +struct net_qos_dscp_map { + uint8_t sotc_to_dscp[SO_TC_MAX]; + uint8_t netsvctype_to_dscp[_NET_SERVICE_TYPE_COUNT]; +}; + struct dcsp_msc_map { - u_int8_t dscp; + uint8_t dscp; mbuf_svc_class_t msc; }; static inline int so_throttle_best_effort(struct socket *, struct ifnet *); @@ -117,10 +122,6 @@ int net_qos_policy_wifi_enabled = 0; SYSCTL_INT(_net_qos_policy, OID_AUTO, wifi_enabled, CTLFLAG_RW | CTLFLAG_LOCKED, &net_qos_policy_wifi_enabled, 0, ""); -int net_qos_policy_none_wifi_enabled = 0; -SYSCTL_INT(_net_qos_policy, OID_AUTO, none_wifi_enabled, - CTLFLAG_RW | CTLFLAG_LOCKED, &net_qos_policy_none_wifi_enabled, 0, ""); - int net_qos_policy_capable_enabled = 0; SYSCTL_INT(_net_qos_policy, OID_AUTO, capable_enabled, CTLFLAG_RW | CTLFLAG_LOCKED, &net_qos_policy_capable_enabled, 0, ""); @@ -145,18 +146,36 @@ const int sotc_by_netservicetype[_NET_SERVICE_TYPE_COUNT] = { */ static const struct netsvctype_dscp_map fastlane_netsvctype_dscp_map[_NET_SERVICE_TYPE_COUNT] = { - { NET_SERVICE_TYPE_BE, _DSCP_DF }, - { NET_SERVICE_TYPE_BK, _DSCP_AF11 }, - { NET_SERVICE_TYPE_SIG, _DSCP_CS3 }, - { NET_SERVICE_TYPE_VI, _DSCP_AF41 }, - { NET_SERVICE_TYPE_VO, _DSCP_EF }, - { NET_SERVICE_TYPE_RV, _DSCP_CS4 }, - { NET_SERVICE_TYPE_AV, _DSCP_AF31 }, - { NET_SERVICE_TYPE_OAM, _DSCP_CS2 }, - { NET_SERVICE_TYPE_RD, _DSCP_AF21 }, + { .netsvctype = NET_SERVICE_TYPE_BE, .dscp = _DSCP_DF }, + { .netsvctype = NET_SERVICE_TYPE_BK, .dscp = _DSCP_AF11 }, + { .netsvctype = NET_SERVICE_TYPE_SIG, .dscp = _DSCP_CS3 }, + { .netsvctype = NET_SERVICE_TYPE_VI, .dscp = _DSCP_AF41 }, + { .netsvctype = NET_SERVICE_TYPE_VO, .dscp = _DSCP_EF }, + { .netsvctype = NET_SERVICE_TYPE_RV, .dscp = _DSCP_CS4 }, + { .netsvctype = NET_SERVICE_TYPE_AV, .dscp = _DSCP_AF31 }, + { .netsvctype = NET_SERVICE_TYPE_OAM, .dscp = _DSCP_CS2 }, + { .netsvctype = NET_SERVICE_TYPE_RD, .dscp = _DSCP_AF21 }, +}; + + +/* + * DSCP mappings for QoS RFC4594 as based on network service types + */ +static const +struct netsvctype_dscp_map rfc4594_netsvctype_dscp_map[_NET_SERVICE_TYPE_COUNT] = { + { .netsvctype = NET_SERVICE_TYPE_BE, .dscp = _DSCP_DF }, + { .netsvctype = NET_SERVICE_TYPE_BK, .dscp = _DSCP_CS1 }, + { .netsvctype = NET_SERVICE_TYPE_SIG, .dscp = _DSCP_CS5 }, + { .netsvctype = NET_SERVICE_TYPE_VI, .dscp = _DSCP_AF41 }, + { .netsvctype = NET_SERVICE_TYPE_VO, .dscp = _DSCP_EF }, + { .netsvctype = NET_SERVICE_TYPE_RV, .dscp = _DSCP_CS4 }, + { .netsvctype = NET_SERVICE_TYPE_AV, .dscp = _DSCP_AF31 }, + { .netsvctype = NET_SERVICE_TYPE_OAM, .dscp = _DSCP_CS2 }, + { .netsvctype = NET_SERVICE_TYPE_RD, .dscp = _DSCP_AF21 }, }; -static struct net_qos_dscp_map default_net_qos_dscp_map; +static struct net_qos_dscp_map fastlane_net_qos_dscp_map; +static struct net_qos_dscp_map rfc4594_net_qos_dscp_map; /* * The size is one more than the max because DSCP start at zero @@ -174,79 +193,79 @@ static struct net_qos_dscp_map default_net_qos_dscp_map; * option instead to select L2 QoS marking instead of IP_TOS or IPV6_TCLASS. */ static const struct dcsp_msc_map default_dscp_to_wifi_ac_map[] = { - { _DSCP_DF, MBUF_SC_BE }, /* RFC 2474 Standard */ - { 1, MBUF_SC_BE }, /* */ - { 2, MBUF_SC_BE }, /* */ - { 3, MBUF_SC_BE }, /* */ - { 4, MBUF_SC_BE }, /* */ - { 5, MBUF_SC_BE }, /* */ - { 6, MBUF_SC_BE }, /* */ - { 7, MBUF_SC_BE }, /* */ - - { _DSCP_CS1, MBUF_SC_BK }, /* RFC 3662 Low-Priority Data */ - { 9, MBUF_SC_BK }, /* */ - { _DSCP_AF11, MBUF_SC_BK }, /* RFC 2597 High-Throughput Data */ - { 11, MBUF_SC_BK }, /* */ - { _DSCP_AF12, MBUF_SC_BK }, /* RFC 2597 High-Throughput Data */ - { 13, MBUF_SC_BK }, /* */ - { _DSCP_AF13, MBUF_SC_BK }, /* RFC 2597 High-Throughput Data */ - { 15, MBUF_SC_BK }, /* */ - - { _DSCP_CS2, MBUF_SC_BK }, /* RFC 4594 OAM */ - { 17, MBUF_SC_BK }, /* */ - { _DSCP_AF21, MBUF_SC_BK }, /* RFC 2597 Low-Latency Data */ - { 19, MBUF_SC_BK }, /* */ - { _DSCP_AF22, MBUF_SC_BK }, /* RFC 2597 Low-Latency Data */ - { 21, MBUF_SC_BK }, /* */ - { _DSCP_AF23, MBUF_SC_BK }, /* RFC 2597 Low-Latency Data */ - { 23, MBUF_SC_BK }, /* */ - - { _DSCP_CS3, MBUF_SC_BE }, /* RFC 2474 Broadcast Video */ - { 25, MBUF_SC_BE }, /* */ - { _DSCP_AF31, MBUF_SC_BE }, /* RFC 2597 Multimedia Streaming */ - { 27, MBUF_SC_BE }, /* */ - { _DSCP_AF32, MBUF_SC_BE }, /* RFC 2597 Multimedia Streaming */ - { 29, MBUF_SC_BE }, /* */ - { _DSCP_AF33, MBUF_SC_BE }, /* RFC 2597 Multimedia Streaming */ - { 31, MBUF_SC_BE }, /* */ - - { _DSCP_CS4, MBUF_SC_VI }, /* RFC 2474 Real-Time Interactive */ - { 33, MBUF_SC_VI }, /* */ - { _DSCP_AF41, MBUF_SC_VI }, /* RFC 2597 Multimedia Conferencing */ - { 35, MBUF_SC_VI }, /* */ - { _DSCP_AF42, MBUF_SC_VI }, /* RFC 2597 Multimedia Conferencing */ - { 37, MBUF_SC_VI }, /* */ - { _DSCP_AF43, MBUF_SC_VI }, /* RFC 2597 Multimedia Conferencing */ - { 39, MBUF_SC_VI }, /* */ - - { _DSCP_CS5, MBUF_SC_VI }, /* RFC 2474 Signaling */ - { 41, MBUF_SC_VI }, /* */ - { 42, MBUF_SC_VI }, /* */ - { 43, MBUF_SC_VI }, /* */ - { _DSCP_VA, MBUF_SC_VI }, /* RFC 5865 VOICE-ADMIT */ - { 45, MBUF_SC_VI }, /* */ - { _DSCP_EF, MBUF_SC_VI }, /* RFC 3246 Telephony */ - { 47, MBUF_SC_VI }, /* */ - - { _DSCP_CS6, MBUF_SC_VO }, /* Wi-Fi WMM Certification: Chariot */ - { 49, MBUF_SC_VO }, /* */ - { 50, MBUF_SC_VO }, /* */ - { 51, MBUF_SC_VO }, /* */ - { 52, MBUF_SC_VO }, /* Wi-Fi WMM Certification: Sigma */ - { 53, MBUF_SC_VO }, /* */ - { 54, MBUF_SC_VO }, /* */ - { 55, MBUF_SC_VO }, /* */ - - { _DSCP_CS7, MBUF_SC_VO }, /* Wi-Fi WMM Certification: Chariot */ - { 57, MBUF_SC_VO }, /* */ - { 58, MBUF_SC_VO }, /* */ - { 59, MBUF_SC_VO }, /* */ - { 60, MBUF_SC_VO }, /* */ - { 61, MBUF_SC_VO }, /* */ - { 62, MBUF_SC_VO }, /* */ - { 63, MBUF_SC_VO }, /* */ - - { 255, MBUF_SC_UNSPEC } /* invalid DSCP to mark last entry */ + { .dscp = _DSCP_DF, .msc = MBUF_SC_BE }, /* RFC 2474 Standard */ + { .dscp = 1, .msc = MBUF_SC_BE }, /* */ + { .dscp = 2, .msc = MBUF_SC_BE }, /* */ + { .dscp = 3, .msc = MBUF_SC_BE }, /* */ + { .dscp = 4, .msc = MBUF_SC_BE }, /* */ + { .dscp = 5, .msc = MBUF_SC_BE }, /* */ + { .dscp = 6, .msc = MBUF_SC_BE }, /* */ + { .dscp = 7, .msc = MBUF_SC_BE }, /* */ + + { .dscp = _DSCP_CS1, .msc = MBUF_SC_BK }, /* RFC 3662 Low-Priority Data */ + { .dscp = 9, .msc = MBUF_SC_BK }, /* */ + { .dscp = _DSCP_AF11, .msc = MBUF_SC_BK }, /* RFC 2597 High-Throughput Data */ + { .dscp = 11, .msc = MBUF_SC_BK }, /* */ + { .dscp = _DSCP_AF12, .msc = MBUF_SC_BK }, /* RFC 2597 High-Throughput Data */ + { .dscp = 13, .msc = MBUF_SC_BK }, /* */ + { .dscp = _DSCP_AF13, .msc = MBUF_SC_BK }, /* RFC 2597 High-Throughput Data */ + { .dscp = 15, .msc = MBUF_SC_BK }, /* */ + + { .dscp = _DSCP_CS2, .msc = MBUF_SC_BK }, /* RFC 4594 OAM */ + { .dscp = 17, .msc = MBUF_SC_BK }, /* */ + { .dscp = _DSCP_AF21, .msc = MBUF_SC_BK }, /* RFC 2597 Low-Latency Data */ + { .dscp = 19, .msc = MBUF_SC_BK }, /* */ + { .dscp = _DSCP_AF22, .msc = MBUF_SC_BK }, /* RFC 2597 Low-Latency Data */ + { .dscp = 21, .msc = MBUF_SC_BK }, /* */ + { .dscp = _DSCP_AF23, .msc = MBUF_SC_BK }, /* RFC 2597 Low-Latency Data */ + { .dscp = 23, .msc = MBUF_SC_BK }, /* */ + + { .dscp = _DSCP_CS3, .msc = MBUF_SC_BE }, /* RFC 2474 Broadcast Video */ + { .dscp = 25, .msc = MBUF_SC_BE }, /* */ + { .dscp = _DSCP_AF31, .msc = MBUF_SC_BE }, /* RFC 2597 Multimedia Streaming */ + { .dscp = 27, .msc = MBUF_SC_BE }, /* */ + { .dscp = _DSCP_AF32, .msc = MBUF_SC_BE }, /* RFC 2597 Multimedia Streaming */ + { .dscp = 29, .msc = MBUF_SC_BE }, /* */ + { .dscp = _DSCP_AF33, .msc = MBUF_SC_BE }, /* RFC 2597 Multimedia Streaming */ + { .dscp = 31, .msc = MBUF_SC_BE }, /* */ + + { .dscp = _DSCP_CS4, .msc = MBUF_SC_VI }, /* RFC 2474 Real-Time Interactive */ + { .dscp = 33, .msc = MBUF_SC_VI }, /* */ + { .dscp = _DSCP_AF41, .msc = MBUF_SC_VI }, /* RFC 2597 Multimedia Conferencing */ + { .dscp = 35, .msc = MBUF_SC_VI }, /* */ + { .dscp = _DSCP_AF42, .msc = MBUF_SC_VI }, /* RFC 2597 Multimedia Conferencing */ + { .dscp = 37, .msc = MBUF_SC_VI }, /* */ + { .dscp = _DSCP_AF43, .msc = MBUF_SC_VI }, /* RFC 2597 Multimedia Conferencing */ + { .dscp = 39, .msc = MBUF_SC_VI }, /* */ + + { .dscp = _DSCP_CS5, .msc = MBUF_SC_VI }, /* RFC 2474 Signaling */ + { .dscp = 41, .msc = MBUF_SC_VI }, /* */ + { .dscp = 42, .msc = MBUF_SC_VI }, /* */ + { .dscp = 43, .msc = MBUF_SC_VI }, /* */ + { .dscp = _DSCP_VA, .msc = MBUF_SC_VI }, /* RFC 5865 VOICE-ADMIT */ + { .dscp = 45, .msc = MBUF_SC_VI }, /* */ + { .dscp = _DSCP_EF, .msc = MBUF_SC_VI }, /* RFC 3246 Telephony */ + { .dscp = 47, .msc = MBUF_SC_VI }, /* */ + + { .dscp = _DSCP_CS6, .msc = MBUF_SC_VO }, /* Wi-Fi WMM Certification: Chariot */ + { .dscp = 49, .msc = MBUF_SC_VO }, /* */ + { .dscp = 50, .msc = MBUF_SC_VO }, /* */ + { .dscp = 51, .msc = MBUF_SC_VO }, /* */ + { .dscp = 52, .msc = MBUF_SC_VO }, /* Wi-Fi WMM Certification: Sigma */ + { .dscp = 53, .msc = MBUF_SC_VO }, /* */ + { .dscp = 54, .msc = MBUF_SC_VO }, /* */ + { .dscp = 55, .msc = MBUF_SC_VO }, /* */ + + { .dscp = _DSCP_CS7, .msc = MBUF_SC_VO }, /* Wi-Fi WMM Certification: Chariot */ + { .dscp = 57, .msc = MBUF_SC_VO }, /* */ + { .dscp = 58, .msc = MBUF_SC_VO }, /* */ + { .dscp = 59, .msc = MBUF_SC_VO }, /* */ + { .dscp = 60, .msc = MBUF_SC_VO }, /* */ + { .dscp = 61, .msc = MBUF_SC_VO }, /* */ + { .dscp = 62, .msc = MBUF_SC_VO }, /* */ + { .dscp = 63, .msc = MBUF_SC_VO }, /* */ + + { .dscp = 255, .msc = MBUF_SC_UNSPEC } /* invalid DSCP to mark last entry */ }; mbuf_svc_class_t wifi_dscp_to_msc_array[DSCP_ARRAY_SIZE]; @@ -270,7 +289,7 @@ struct tclass_for_proc { int tfp_class; pid_t tfp_pid; char tfp_pname[(2 * MAXCOMLEN) + 1]; - u_int32_t tfp_qos_mode; + uint32_t tfp_qos_mode; }; static int get_pid_tclass(struct so_tcdbg *); @@ -873,9 +892,7 @@ so_get_netsvc_marking_level(struct socket *so) break; } if (ifp != NULL) { - if ((ifp->if_eflags & - (IFEF_QOSMARKING_ENABLED | IFEF_QOSMARKING_CAPABLE)) == - (IFEF_QOSMARKING_ENABLED | IFEF_QOSMARKING_CAPABLE)) { + if ((ifp->if_eflags & IFEF_QOSMARKING_ENABLED) != 0) { if ((so->so_flags1 & SOF1_QOSMARKING_ALLOWED)) { marking_level = NETSVC_MRKNG_LVL_L3L2_ALL; } else { @@ -1080,7 +1097,7 @@ so_inc_recv_data_stat(struct socket *so, size_t pkts, size_t bytes, static inline int so_throttle_best_effort(struct socket *so, struct ifnet *ifp) { - u_int32_t uptime = net_uptime(); + uint32_t uptime = net_uptime(); return soissrcbesteffort(so) && net_io_policy_throttle_best_effort == 1 && ifp->if_rt_sendts > 0 && @@ -1096,7 +1113,7 @@ set_tcp_stream_priority(struct socket *so) u_char old_cc = tp->tcp_cc_index; int recvbg = IS_TCP_RECV_BG(so); bool is_local = false, fg_active = false; - u_int32_t uptime; + uint32_t uptime; VERIFY((SOCK_CHECK_DOM(so, PF_INET) || SOCK_CHECK_DOM(so, PF_INET6)) && @@ -1210,7 +1227,7 @@ set_tcp_stream_priority(struct socket *so) */ __private_extern__ void set_packet_service_class(struct mbuf *m, struct socket *so, - int sotc, u_int32_t flags) + int sotc, uint32_t flags) { mbuf_svc_class_t msc = MBUF_SC_BE; /* Best effort by default */ struct inpcb *inp = sotoinpcb(so); /* in6pcb and inpcb are the same */ @@ -1473,28 +1490,160 @@ sotc_index(int sotc) return SIZE_T_MAX; } +uint8_t +fastlane_sc_to_dscp(uint32_t svc_class) +{ + uint8_t dscp = _DSCP_DF; + + switch (svc_class) { + case MBUF_SC_BK_SYS: + case MBUF_SC_BK: + dscp = _DSCP_AF11; + break; + + case MBUF_SC_BE: + dscp = _DSCP_DF; + break; + case MBUF_SC_RD: + dscp = _DSCP_AF21; + break; + case MBUF_SC_OAM: + dscp = _DSCP_CS2; + break; + + case MBUF_SC_AV: + dscp = _DSCP_AF31; + break; + case MBUF_SC_RV: + dscp = _DSCP_CS4; + break; + case MBUF_SC_VI: + dscp = _DSCP_AF41; + break; + case MBUF_SC_SIG: + dscp = _DSCP_CS3; + break; + + case MBUF_SC_VO: + dscp = _DSCP_EF; + break; + case MBUF_SC_CTL: + dscp = _DSCP_DF; + break; + default: + dscp = _DSCP_DF; + break; + } + + return dscp; +} + +uint8_t +rfc4594_sc_to_dscp(uint32_t svc_class) +{ + uint8_t dscp = _DSCP_DF; + + switch (svc_class) { + case MBUF_SC_BK_SYS: /* Low-Priority Data */ + case MBUF_SC_BK: + dscp = _DSCP_CS1; + break; + + case MBUF_SC_BE: /* Standard */ + dscp = _DSCP_DF; + break; + case MBUF_SC_RD: /* Low-Latency Data */ + dscp = _DSCP_AF21; + break; + + /* SVC_CLASS Not Defined: High-Throughput Data */ + + case MBUF_SC_OAM: /* OAM */ + dscp = _DSCP_CS2; + break; + + /* SVC_CLASS Not Defined: Broadcast Video */ + + case MBUF_SC_AV: /* Multimedia Streaming */ + dscp = _DSCP_AF31; + break; + case MBUF_SC_RV: /* Real-Time Interactive */ + dscp = _DSCP_CS4; + break; + case MBUF_SC_VI: /* Multimedia Conferencing */ + dscp = _DSCP_AF41; + break; + case MBUF_SC_SIG: /* Signaling */ + dscp = _DSCP_CS5; + break; + + case MBUF_SC_VO: /* Telephony */ + dscp = _DSCP_EF; + break; + case MBUF_SC_CTL: /* Network Control*/ + dscp = _DSCP_CS6; + break; + default: + dscp = _DSCP_DF; + break; + } + + return dscp; +} + +mbuf_traffic_class_t +rfc4594_dscp_to_tc(uint8_t dscp) +{ + mbuf_traffic_class_t tc = MBUF_TC_BE; + + switch (dscp) { + case _DSCP_CS1: + tc = MBUF_TC_BK; + break; + case _DSCP_DF: + case _DSCP_AF21: + case _DSCP_CS2: + tc = MBUF_TC_BE; + break; + case _DSCP_AF31: + case _DSCP_CS4: + case _DSCP_AF41: + case _DSCP_CS5: + tc = MBUF_TC_VI; + break; + case _DSCP_EF: + case _DSCP_CS6: + tc = MBUF_TC_VO; + break; + default: + tc = MBUF_TC_BE; + break; + } + + return tc; +} + /* * Pass NULL ifp for default map */ static errno_t -set_netsvctype_dscp_map(size_t in_count, +set_netsvctype_dscp_map(struct net_qos_dscp_map *net_qos_dscp_map, const struct netsvctype_dscp_map *netsvctype_dscp_map) { size_t i; - struct net_qos_dscp_map *net_qos_dscp_map = NULL; int netsvctype; /* * Do not accept more that max number of distinct DSCPs */ - if (in_count > _MAX_DSCP || netsvctype_dscp_map == NULL) { + if (net_qos_dscp_map == NULL || netsvctype_dscp_map == NULL) { return EINVAL; } /* * Validate input parameters */ - for (i = 0; i < in_count; i++) { + for (i = 0; i < _NET_SERVICE_TYPE_COUNT; i++) { if (!IS_VALID_NET_SERVICE_TYPE(netsvctype_dscp_map[i].netsvctype)) { return EINVAL; } @@ -1503,9 +1652,7 @@ set_netsvctype_dscp_map(size_t in_count, } } - net_qos_dscp_map = &default_net_qos_dscp_map; - - for (i = 0; i < in_count; i++) { + for (i = 0; i < _NET_SERVICE_TYPE_COUNT; i++) { netsvctype = netsvctype_dscp_map[i].netsvctype; net_qos_dscp_map->netsvctype_to_dscp[netsvctype] = @@ -1568,7 +1715,7 @@ get_netsvctype_dscp_map(size_t *out_count, return EINVAL; } - net_qos_dscp_map = &default_net_qos_dscp_map; + net_qos_dscp_map = &fastlane_net_qos_dscp_map; for (i = 0; i < MIN(_NET_SERVICE_TYPE_COUNT, *out_count); i++) { netsvctype_dscp_map[i].netsvctype = i; @@ -1584,17 +1731,13 @@ net_qos_map_init() { errno_t error; - /* - * By default use the Fastlane DSCP mappngs - */ - error = set_netsvctype_dscp_map(_NET_SERVICE_TYPE_COUNT, + error = set_netsvctype_dscp_map(&fastlane_net_qos_dscp_map, fastlane_netsvctype_dscp_map); ASSERT(error == 0); - /* - * No DSCP mapping for network control - */ - default_net_qos_dscp_map.sotc_to_dscp[SOTCIX_CTL] = _DSCP_DF; + error = set_netsvctype_dscp_map(&rfc4594_net_qos_dscp_map, + rfc4594_netsvctype_dscp_map); + ASSERT(error == 0); set_dscp_to_wifi_ac_map(default_dscp_to_wifi_ac_map, 1); } @@ -1604,8 +1747,6 @@ sysctl_default_netsvctype_to_dscp_map SYSCTL_HANDLER_ARGS { #pragma unused(oidp, arg1, arg2) int error = 0; - const size_t max_netsvctype_to_dscp_map_len = - _NET_SERVICE_TYPE_COUNT * sizeof(struct netsvctype_dscp_map); size_t len; struct netsvctype_dscp_map netsvctype_dscp_map[_NET_SERVICE_TYPE_COUNT] = {}; size_t count; @@ -1627,48 +1768,37 @@ sysctl_default_netsvctype_to_dscp_map SYSCTL_HANDLER_ARGS } } - if (req->newptr == USER_ADDR_NULL) { - goto done; + if (req->newptr != USER_ADDR_NULL) { + error = EPERM; } - - error = proc_suser(current_proc()); - if (error != 0) { - goto done; - } - - /* - * Check input length - */ - if (req->newlen > max_netsvctype_to_dscp_map_len) { - error = EINVAL; - goto done; - } - /* - * Cap the number of entries to copy from input buffer - */ - error = SYSCTL_IN(req, netsvctype_dscp_map, req->newlen); - if (error != 0) { - goto done; - } - - count = req->newlen / sizeof(struct netsvctype_dscp_map); - error = set_netsvctype_dscp_map(count, netsvctype_dscp_map); done: return error; } __private_extern__ errno_t set_packet_qos(struct mbuf *m, struct ifnet *ifp, boolean_t qos_allowed, - int sotc, int netsvctype, u_int8_t *dscp_inout) + int sotc, int netsvctype, uint8_t *dscp_inout) { if (ifp == NULL || dscp_inout == NULL) { return EINVAL; } - if ((ifp->if_eflags & - (IFEF_QOSMARKING_ENABLED | IFEF_QOSMARKING_CAPABLE)) == - (IFEF_QOSMARKING_ENABLED | IFEF_QOSMARKING_CAPABLE)) { - u_int8_t dscp; + if ((ifp->if_eflags & IFEF_QOSMARKING_ENABLED) != 0 && + ifp->if_qosmarking_mode != IFRTYPE_QOSMARKING_MODE_NONE) { + uint8_t dscp; + const struct net_qos_dscp_map *net_qos_dscp_map = NULL; + + switch (ifp->if_qosmarking_mode) { + case IFRTYPE_QOSMARKING_FASTLANE: + net_qos_dscp_map = &fastlane_net_qos_dscp_map; + break; + case IFRTYPE_QOSMARKING_RFC4594: + net_qos_dscp_map = &rfc4594_net_qos_dscp_map; + break; + default: + panic("invalid QoS marking type"); + /* NOTREACHED */ + } /* * When on a Fastlane network, IP_TOS/IPV6_TCLASS are no-ops @@ -1688,7 +1818,7 @@ set_packet_qos(struct mbuf *m, struct ifnet *ifp, boolean_t qos_allowed, */ if (IS_VALID_NET_SERVICE_TYPE(netsvctype) && netsvctype != NET_SERVICE_TYPE_BE) { - dscp = default_net_qos_dscp_map.netsvctype_to_dscp[netsvctype]; + dscp = net_qos_dscp_map->netsvctype_to_dscp[netsvctype]; if (qos_allowed == FALSE && netsvctype != NET_SERVICE_TYPE_BE && @@ -1701,7 +1831,7 @@ set_packet_qos(struct mbuf *m, struct ifnet *ifp, boolean_t qos_allowed, } else if (sotc != SO_TC_UNSPEC) { size_t sotcix = sotc_index(sotc); if (sotcix != SIZE_T_MAX) { - dscp = default_net_qos_dscp_map.sotc_to_dscp[sotcix]; + dscp = net_qos_dscp_map->sotc_to_dscp[sotcix]; if (qos_allowed == FALSE && sotc != SO_TC_BE && sotc != SO_TC_BK && sotc != SO_TC_BK_SYS && @@ -1790,7 +1920,7 @@ dscp_msc_map_from_netsvctype_dscp_map(struct netsvctype_dscp_map *netsvctype_dsc size_t count, struct dcsp_msc_map *dcsp_msc_map) { errno_t error = 0; - u_int32_t i; + uint32_t i; /* * Validate input parameters @@ -1825,7 +1955,7 @@ sysctl_dscp_to_wifi_ac_map SYSCTL_HANDLER_ARGS struct netsvctype_dscp_map netsvctype_dscp_map[DSCP_ARRAY_SIZE] = {}; struct dcsp_msc_map dcsp_msc_map[DSCP_ARRAY_SIZE]; size_t count; - u_int32_t i; + uint32_t i; if (req->oldptr == USER_ADDR_NULL) { req->oldidx = len; @@ -1967,6 +2097,15 @@ net_qos_guideline(struct proc *p, struct net_qos_guideline_args *arg, return 0; } } + if (ipv4_primary != NULL && IFNET_IS_CONSTRAINED(ipv4_primary) && + ipv6_primary != NULL && IFNET_IS_CONSTRAINED(ipv6_primary)) { + if (qos_arg.nq_use_constrained) { + return 0; + } else { + *retval = RETURN_USE_BK; + return 0; + } + } if (qos_arg.nq_transfer_size >= 5 * 1024 * 1024) { *retval = RETURN_USE_BK; return 0; diff --git a/bsd/netinet/in_tclass.h b/bsd/netinet/in_tclass.h index 1d8493b57..7f2c2600d 100644 --- a/bsd/netinet/in_tclass.h +++ b/bsd/netinet/in_tclass.h @@ -67,7 +67,8 @@ struct so_tcdbg { struct net_qos_param { u_int64_t nq_transfer_size; /* transfer size in bytes */ u_int32_t nq_use_expensive:1, /* allowed = 1 otherwise 0 */ - nq_uplink:1; /* uplink = 1 otherwise 0 */ + nq_uplink:1, /* uplink = 1 otherwise 0 */ + nq_use_constrained:1; /* allowed = 1 otherwise 0 */ u_int32_t nq_unused; /* for future expansion */ }; @@ -91,14 +92,18 @@ extern int net_qos_guideline(struct net_qos_param *param, size_t param_len); extern int net_qos_policy_restricted; extern int net_qos_policy_wifi_enabled; -extern int net_qos_policy_none_wifi_enabled; extern int net_qos_policy_capable_enabled; extern void net_qos_map_init(void); +extern void net_qos_map_change(uint32_t mode); extern errno_t set_packet_qos(struct mbuf *, struct ifnet *, boolean_t, int, int, u_int8_t *); extern int so_get_netsvc_marking_level(struct socket *); +extern uint8_t fastlane_sc_to_dscp(uint32_t svc_class); +extern uint8_t rfc4594_sc_to_dscp(uint32_t svc_class); +extern mbuf_traffic_class_t rfc4594_dscp_to_tc(uint8_t dscp); + #endif /* BSD_KERNEL_PRIVATE */ #endif /* PRIVATE */ diff --git a/bsd/netinet/in_var.h b/bsd/netinet/in_var.h index 18c55a17a..454a58288 100644 --- a/bsd/netinet/in_var.h +++ b/bsd/netinet/in_var.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2016 Apple Inc. All rights reserved. + * Copyright (c) 2000-2019 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -135,6 +135,20 @@ struct kev_in_arpalive { struct net_event_data link_data; /* link where ARP was received */ }; +#ifdef PRIVATE +/* + * Common structure for KEV_SOCKET_SUBCLASS + * Have to place here to avoid declaration dependencies. + */ +struct kev_socket_event_data { + union sockaddr_in_4_6 kev_sockname; + union sockaddr_in_4_6 kev_peername; +}; + +struct kev_socket_closed { + struct kev_socket_event_data ev_data; +}; +#endif /* PRIVATE */ #ifdef __APPLE_API_PRIVATE struct kev_in_portinuse { @@ -145,6 +159,10 @@ struct kev_in_portinuse { #endif /* __APPLE_API_PRIVATE */ #ifdef BSD_KERNEL_PRIVATE +extern void socket_post_kev_msg(uint32_t, struct kev_socket_event_data *, + uint32_t); +extern void socket_post_kev_msg_closed(struct socket *); + #include #include #include diff --git a/bsd/netinet/ip_divert.c b/bsd/netinet/ip_divert.c index 723b49961..aef9c2c11 100644 --- a/bsd/netinet/ip_divert.c +++ b/bsd/netinet/ip_divert.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2018 Apple Inc. All rights reserved. + * Copyright (c) 2000-2019 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -136,7 +136,13 @@ static u_int32_t div_sendspace = DIVSNDQ; /* XXX sysctl ? */ static u_int32_t div_recvspace = DIVRCVQ; /* XXX sysctl ? */ /* Optimization: have this preinitialized */ -static struct sockaddr_in divsrc = { sizeof(divsrc), AF_INET, 0, { 0 }, { 0, 0, 0, 0, 0, 0, 0, 0 } }; +static struct sockaddr_in divsrc = { + .sin_len = sizeof(divsrc), + .sin_family = AF_INET, + .sin_port = 0, + .sin_addr = { .s_addr = 0 }, + .sin_zero = { 0, 0, 0, 0, 0, 0, 0, 0 } +}; /* Internal functions */ static int div_output(struct socket *so, diff --git a/bsd/netinet/ip_dummynet.c b/bsd/netinet/ip_dummynet.c index c9f566822..3a854db05 100644 --- a/bsd/netinet/ip_dummynet.c +++ b/bsd/netinet/ip_dummynet.c @@ -1620,7 +1620,6 @@ dummynet_io(struct mbuf *m, int pipe_nr, int dir, struct ip_fw_args *fwa, int cl } pkt->dn_origifp = fwa->fwa_origifp; pkt->dn_mtu = fwa->fwa_mtu; - pkt->dn_alwaysfrag = fwa->fwa_alwaysfrag; pkt->dn_unfragpartlen = fwa->fwa_unfragpartlen; if (fwa->fwa_exthdrs) { bcopy (fwa->fwa_exthdrs, &pkt->dn_exthdrs, sizeof(pkt->dn_exthdrs)); diff --git a/bsd/netinet/ip_dummynet.h b/bsd/netinet/ip_dummynet.h index 884ce05da..fae71f8e0 100644 --- a/bsd/netinet/ip_dummynet.h +++ b/bsd/netinet/ip_dummynet.h @@ -109,12 +109,6 @@ typedef u_int64_t dn_key; /* sorting key */ * virtual time wraps every 15 days. */ -/* - * The OFFSET_OF macro is used to return the offset of a field within - * a structure. It is used by the heap management routines. - */ -#define OFFSET_OF(type, field) ((int)&( ((type *)0)->field) ) - /* * The maximum hash table size for queues. This value must be a power * of 2. @@ -188,7 +182,6 @@ struct dn_pkt_tag { struct route_in6 dn_ro6_pmtu; /* for ip6_output */ struct ifnet *dn_origifp; /* for ip6_output */ u_int32_t dn_mtu; /* for ip6_output */ - int dn_alwaysfrag; /* for ip6_output */ u_int32_t dn_unfragpartlen; /* for ip6_output */ struct ip6_exthdrs dn_exthdrs; /* for ip6_output */ int dn_flags; /* flags, for ip[6]_output */ diff --git a/bsd/netinet/ip_encap.c b/bsd/netinet/ip_encap.c index 02c4e8141..4aaa6fa38 100644 --- a/bsd/netinet/ip_encap.c +++ b/bsd/netinet/ip_encap.c @@ -76,7 +76,7 @@ * Well, what can I say. They impose different en/decapsulation mechanism * from each other, so they need separate protocol handler. The only one * we can easily determine by protocol # is IPsec, which always has - * AH/ESP/IPComp header right after outer IP header. + * AH/ESP header right after outer IP header. * * So, clearly good old protosw does not work for protocol #4 and #41. * The code will let you match protocol via src/dst address pair. diff --git a/bsd/netinet/ip_flowid.h b/bsd/netinet/ip_flowid.h index 4c7f8f371..3c68a809f 100644 --- a/bsd/netinet/ip_flowid.h +++ b/bsd/netinet/ip_flowid.h @@ -113,7 +113,6 @@ struct ip_fw_args { struct route_in6 *fwa_ro6_pmtu; /* for IPv6 output */ struct ifnet *fwa_origifp; /* for IPv6 output */ u_int32_t fwa_mtu; /* for IPv6 output */ - int fwa_alwaysfrag; /* for IPv6 output */ u_int32_t fwa_unfragpartlen; /* for IPv6 output */ struct ip6_exthdrs *fwa_exthdrs; /* for IPv6 output */ struct ip_flow_id fwa_id; /* grabbed from IP header */ diff --git a/bsd/netinet/ip_fw2.c b/bsd/netinet/ip_fw2.c index acbb060cb..f6f9baf01 100644 --- a/bsd/netinet/ip_fw2.c +++ b/bsd/netinet/ip_fw2.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2004-2016 Apple Inc. All rights reserved. + * Copyright (c) 2004-2019 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -2136,7 +2136,7 @@ ipfw_chk(struct ip_fw_args *args) */ u_int8_t proto; u_int16_t src_port = 0, dst_port = 0; /* NOTE: host format */ - struct in_addr src_ip = { 0 }, dst_ip = { 0 }; /* NOTE: network format */ + struct in_addr src_ip = { .s_addr = 0 }, dst_ip = { .s_addr = 0 }; /* NOTE: network format */ u_int16_t ip_len = 0; int pktlen; int dyn_dir = MATCH_UNKNOWN; diff --git a/bsd/netinet/ip_fw2_compat.c b/bsd/netinet/ip_fw2_compat.c index 2965c4adc..7360f9668 100644 --- a/bsd/netinet/ip_fw2_compat.c +++ b/bsd/netinet/ip_fw2_compat.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2004-2012 Apple Inc. All rights reserved. + * Copyright (c) 2004-2019 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -70,26 +70,26 @@ struct _s_x { #if FW2_DEBUG_VERBOSE static struct _s_x f_tcpflags[] = { - { "syn", TH_SYN }, - { "fin", TH_FIN }, - { "ack", TH_ACK }, - { "psh", TH_PUSH }, - { "rst", TH_RST }, - { "urg", TH_URG }, - { "tcp flag", 0 }, - { NULL, 0 } + { .s = "syn", .x = TH_SYN }, + { .s = "fin", .x = TH_FIN }, + { .s = "ack", .x = TH_ACK }, + { .s = "psh", .x = TH_PUSH }, + { .s = "rst", .x = TH_RST }, + { .s = "urg", .x = TH_URG }, + { .s = "tcp flag", .x = 0 }, + { .s = NULL, .x = 0 } }; static struct _s_x f_tcpopts[] = { - { "mss", IP_FW_TCPOPT_MSS }, - { "maxseg", IP_FW_TCPOPT_MSS }, - { "window", IP_FW_TCPOPT_WINDOW }, - { "sack", IP_FW_TCPOPT_SACK }, - { "ts", IP_FW_TCPOPT_TS }, - { "timestamp", IP_FW_TCPOPT_TS }, - { "cc", IP_FW_TCPOPT_CC }, - { "tcp option", 0 }, - { NULL, 0 } + { .s = "mss", .x = IP_FW_TCPOPT_MSS }, + { .s = "maxseg", .x = IP_FW_TCPOPT_MSS }, + { .s = "window", .x = IP_FW_TCPOPT_WINDOW }, + { .s = "sack", .x = IP_FW_TCPOPT_SACK }, + { .s = "ts", .x = IP_FW_TCPOPT_TS }, + { .s = "timestamp", .x = IP_FW_TCPOPT_TS }, + { .s = "cc", .x = IP_FW_TCPOPT_CC }, + { .s = "tcp option", .x = 0 }, + { .s = NULL, .x = 0 } }; @@ -98,32 +98,32 @@ static struct _s_x f_tcpopts[] = { * (though in fact only the low 5 bits are significant). */ static struct _s_x f_ipopts[] = { - { "ssrr", IP_FW_IPOPT_SSRR}, - { "lsrr", IP_FW_IPOPT_LSRR}, - { "rr", IP_FW_IPOPT_RR}, - { "ts", IP_FW_IPOPT_TS}, - { "ip option", 0 }, - { NULL, 0 } + { .s = "ssrr", .x = IP_FW_IPOPT_SSRR}, + { .s = "lsrr", .x = IP_FW_IPOPT_LSRR}, + { .s = "rr", .x = IP_FW_IPOPT_RR}, + { .s = "ts", .x = IP_FW_IPOPT_TS}, + { .s = "ip option", .x = 0 }, + { .s = NULL, .x = 0 } }; static struct _s_x f_iptos[] = { - { "lowdelay", IPTOS_LOWDELAY}, - { "throughput", IPTOS_THROUGHPUT}, - { "reliability", IPTOS_RELIABILITY}, - { "mincost", IPTOS_MINCOST}, - { "congestion", IPTOS_CE}, - { "ecntransport", IPTOS_ECT}, - { "ip tos option", 0}, - { NULL, 0 } + { .s = "lowdelay", .x = IPTOS_LOWDELAY}, + { .s = "throughput", .x = IPTOS_THROUGHPUT}, + { .s = "reliability", .x = IPTOS_RELIABILITY}, + { .s = "mincost", .x = IPTOS_MINCOST}, + { .s = "congestion", .x = IPTOS_CE}, + { .s = "ecntransport", .x = IPTOS_ECT}, + { .s = "ip tos option", .x = 0}, + { .s = NULL, .x = 0 } }; static struct _s_x limit_masks[] = { - {"all", DYN_SRC_ADDR | DYN_SRC_PORT | DYN_DST_ADDR | DYN_DST_PORT}, - {"src-addr", DYN_SRC_ADDR}, - {"src-port", DYN_SRC_PORT}, - {"dst-addr", DYN_DST_ADDR}, - {"dst-port", DYN_DST_PORT}, - {NULL, 0} + { .s = "all", .x = DYN_SRC_ADDR | DYN_SRC_PORT | DYN_DST_ADDR | DYN_DST_PORT}, + { .s = "src-addr", .x = DYN_SRC_ADDR}, + { .s = "src-port", .x = DYN_SRC_PORT}, + { .s = "dst-addr", .x = DYN_DST_ADDR}, + { .s = "dst-port", .x = DYN_DST_PORT}, + { .s = NULL, .x = 0} }; #endif /* !FW2_DEBUG_VERBOSE */ diff --git a/bsd/netinet/ip_icmp.c b/bsd/netinet/ip_icmp.c index 74b051aa1..44804c8a2 100644 --- a/bsd/netinet/ip_icmp.c +++ b/bsd/netinet/ip_icmp.c @@ -445,6 +445,7 @@ icmp_input(struct mbuf *m, int hlen) struct in_ifaddr *ia; void (*ctlfunc)(int, struct sockaddr *, void *, struct ifnet *); int code; + boolean_t should_log_redirect = false; /* Expect 32-bit aligned data pointer on strict-align platforms */ MBUF_STRICT_DATA_ALIGNMENT_CHECK_32(m); @@ -578,11 +579,15 @@ deliver: */ if (icmplen < ICMP_ADVLENMIN || icmplen < ICMP_ADVLEN(icp) || IP_VHL_HL(icp->icmp_ip.ip_vhl) < - (sizeof(struct ip) >> 2)) { + (sizeof(struct ip) >> 2) || + (m = m_pullup(m, hlen + ICMP_ADVLEN(icp))) == NULL) { icmpstat.icps_badlen++; goto freeit; } + ip = mtod(m, struct ip *); + icp = (struct icmp *)(void *)(mtod(m, uint8_t *) + hlen); + #if BYTE_ORDER != BIG_ENDIAN NTOHS(icp->icmp_ip.ip_len); #endif @@ -735,21 +740,6 @@ reflect: return; case ICMP_REDIRECT: - if (log_redirect) { - u_int32_t src, dst, gw; - - src = ntohl(ip->ip_src.s_addr); - dst = ntohl(icp->icmp_ip.ip_dst.s_addr); - gw = ntohl(icp->icmp_gwaddr.s_addr); - printf("icmp redirect from %d.%d.%d.%d: " - "%d.%d.%d.%d => %d.%d.%d.%d\n", - (int)(src >> 24), (int)((src >> 16) & 0xff), - (int)((src >> 8) & 0xff), (int)(src & 0xff), - (int)(dst >> 24), (int)((dst >> 16) & 0xff), - (int)((dst >> 8) & 0xff), (int)(dst & 0xff), - (int)(gw >> 24), (int)((gw >> 16) & 0xff), - (int)((gw >> 8) & 0xff), (int)(gw & 0xff)); - } if (drop_redirect) { break; } @@ -761,6 +751,12 @@ reflect: icmpstat.icps_badlen++; break; } + +#if (DEBUG | DEVELOPMENT) + should_log_redirect = log_redirect || (icmpprintfs > 0); +#else + should_log_redirect = log_redirect; +#endif /* * Short circuit routing redirects to force * immediate change in the kernel's routing @@ -770,16 +766,18 @@ reflect: */ icmpgw.sin_addr = ip->ip_src; icmpdst.sin_addr = icp->icmp_gwaddr; -#if (DEBUG | DEVELOPMENT) - if (icmpprintfs > 0) { + + if (should_log_redirect) { + char src_str[MAX_IPv4_STR_LEN]; char dst_str[MAX_IPv4_STR_LEN]; char gw_str[MAX_IPv4_STR_LEN]; + inet_ntop(AF_INET, &ip->ip_src, src_str, sizeof(src_str)); inet_ntop(AF_INET, &icp->icmp_ip.ip_dst, dst_str, sizeof(dst_str)); inet_ntop(AF_INET, &icp->icmp_gwaddr, gw_str, sizeof(gw_str)); - printf("%s: redirect dst %s to %s\n", __func__, dst_str, gw_str); + printf("%s: redirect dst %s to %s from %s\n", __func__, + dst_str, gw_str, src_str); } -#endif icmpsrc.sin_addr = icp->icmp_ip.ip_dst; rtredirect(m->m_pkthdr.rcvif, (struct sockaddr *)&icmpsrc, (struct sockaddr *)&icmpdst, NULL, RTF_GATEWAY | RTF_HOST, diff --git a/bsd/netinet/ip_input.c b/bsd/netinet/ip_input.c index 35ee30666..1cdd96aff 100644 --- a/bsd/netinet/ip_input.c +++ b/bsd/netinet/ip_input.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2018 Apple Inc. All rights reserved. + * Copyright (c) 2000-2019 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -1225,9 +1225,6 @@ ipfw_tags_done: ip_input_adjust(m, ip, inifp); } - /* for consistency */ - m->m_pkthdr.pkt_proto = ip->ip_p; - /* for netstat route statistics */ src_ip = ip->ip_src; len = m->m_pkthdr.len; @@ -2121,9 +2118,6 @@ tooshort: ip_input_adjust(m, ip, inifp); } - /* for consistency */ - m->m_pkthdr.pkt_proto = ip->ip_p; - #if DUMMYNET check_with_pf: #endif @@ -2732,6 +2726,8 @@ found: ASSERT(trailer >= 0); if ((start != 0 && start != hlen) || trailer != 0) { + uint32_t datalen = ip->ip_len - hlen; + #if BYTE_ORDER != BIG_ENDIAN if (start < hlen) { HTONS(ip->ip_len); @@ -2739,8 +2735,7 @@ found: } #endif /* BYTE_ORDER != BIG_ENDIAN */ /* callee folds in sum */ - csum = m_adj_sum16(m, start, hlen, - (ip->ip_len - hlen), csum); + csum = m_adj_sum16(m, start, hlen, datalen, csum); if (hlen > start) { swbytes += (hlen - start); } else { @@ -3053,7 +3048,6 @@ found: (m->m_pkthdr.pkt_flags & PKTF_LOOP)) { /* loopback checksums are always OK */ m->m_pkthdr.csum_data = 0xffff; - m->m_pkthdr.csum_flags &= ~CSUM_PARTIAL; m->m_pkthdr.csum_flags = CSUM_DATA_VALID | CSUM_PSEUDO_HDR | CSUM_IP_CHECKED | CSUM_IP_VALID; @@ -3308,7 +3302,11 @@ ip_dooptions(struct mbuf *m, int pass, struct sockaddr_in *next_hop) struct in_addr *sin, dst; u_int32_t ntime; struct sockaddr_in ipaddr = { - sizeof(ipaddr), AF_INET, 0, { 0 }, { 0, } + .sin_len = sizeof(ipaddr), + .sin_family = AF_INET, + .sin_port = 0, + .sin_addr = { .s_addr = 0 }, + .sin_zero = { 0, } }; /* Expect 32-bit aligned data pointer on strict-align platforms */ @@ -3822,6 +3820,24 @@ ip_stripoptions(struct mbuf *m) #endif /* BYTE_ORDER != BIG_ENDIAN */ ip->ip_len -= sizeof(struct ip); + + /* + * Given that we've just stripped IP options from the header, + * we need to adjust the start offset accordingly if this + * packet had gone thru partial checksum offload. + */ + if ((m->m_pkthdr.csum_flags & (CSUM_DATA_VALID | CSUM_PARTIAL)) == + (CSUM_DATA_VALID | CSUM_PARTIAL)) { + if (m->m_pkthdr.csum_rx_start >= (sizeof(struct ip) + olen)) { + /* most common case */ + m->m_pkthdr.csum_rx_start -= olen; + } else { + /* compute checksum in software instead */ + m->m_pkthdr.csum_flags &= ~CSUM_DATA_VALID; + m->m_pkthdr.csum_data = 0; + ipstat.ips_adj_hwcsum_clr++; + } + } } u_char inetctlerrmap[PRC_NCMDS] = { @@ -3829,7 +3845,7 @@ u_char inetctlerrmap[PRC_NCMDS] = { 0, EMSGSIZE, EHOSTDOWN, EHOSTUNREACH, ENETUNREACH, EHOSTUNREACH, ECONNREFUSED, ECONNREFUSED, EMSGSIZE, EHOSTUNREACH, 0, 0, - 0, 0, 0, 0, + 0, 0, EHOSTUNREACH, 0, ENOPROTOOPT, ECONNREFUSED }; diff --git a/bsd/netinet/ip_output.c b/bsd/netinet/ip_output.c index ee8eef60f..16782affc 100644 --- a/bsd/netinet/ip_output.c +++ b/bsd/netinet/ip_output.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2018 Apple Inc. All rights reserved. + * Copyright (c) 2000-2019 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -345,6 +345,7 @@ ip_output_list(struct mbuf *m0, int packetchain, struct mbuf *opt, boolean_t isbroadcast : 1; boolean_t didfilter : 1; boolean_t noexpensive : 1; /* set once */ + boolean_t noconstrained : 1; /* set once */ boolean_t awdl_unrestricted : 1; /* set once */ #if IPFIREWALL_FORWARD boolean_t fwd_rewrite_src : 1; @@ -362,7 +363,8 @@ ip_output_list(struct mbuf *m0, int packetchain, struct mbuf *opt, #define IP_CHECK_RESTRICTIONS(_ifp, _ipobf) \ (((_ipobf).nocell && IFNET_IS_CELLULAR(_ifp)) || \ ((_ipobf).noexpensive && IFNET_IS_EXPENSIVE(_ifp)) || \ - (IFNET_IS_INTCOPROC(_ifp)) || \ + ((_ipobf).noconstrained && IFNET_IS_CONSTRAINED(_ifp)) || \ + (IFNET_IS_INTCOPROC(_ifp)) || \ (!(_ipobf).awdl_unrestricted && IFNET_IS_AWDL_RESTRICTED(_ifp))) if (ip_output_measure) { @@ -497,6 +499,10 @@ ipfw_tags_done: ipobf.noexpensive = TRUE; ipf_pktopts.ippo_flags |= IPPOF_NO_IFF_EXPENSIVE; } + if (ipoa->ipoa_flags & IPOAF_NO_CONSTRAINED) { + ipobf.noconstrained = TRUE; + ipf_pktopts.ippo_flags |= IPPOF_NO_IFF_CONSTRAINED; + } if (ipoa->ipoa_flags & IPOAF_AWDL_UNRESTRICTED) { ipobf.awdl_unrestricted = TRUE; } @@ -1007,7 +1013,11 @@ loopit: * on the outgoing interface, and the caller did not * forbid loopback, loop back a copy. */ - if (!TAILQ_EMPTY(&ipv4_filters)) { + if (!TAILQ_EMPTY(&ipv4_filters) +#if NECP + && !necp_packet_should_skip_filters(m) +#endif // NECP + ) { struct ipfilter *filter; int seen = (inject_filter_ref == NULL); @@ -1186,7 +1196,12 @@ sendit: } } - if (!ipobf.didfilter && !TAILQ_EMPTY(&ipv4_filters)) { + if (!ipobf.didfilter && + !TAILQ_EMPTY(&ipv4_filters) +#if NECP + && !necp_packet_should_skip_filters(m) +#endif // NECP + ) { struct ipfilter *filter; int seen = (inject_filter_ref == NULL); ipf_pktopts.ippo_flags &= ~IPPOF_MCAST_OPTS; @@ -1241,7 +1256,7 @@ sendit: #if NECP /* Process Network Extension Policy. Will Pass, Drop, or Rebind packet. */ necp_matched_policy_id = necp_ip_output_find_policy_match(m, - flags, (flags & IP_OUTARGS) ? ipoa : NULL, &necp_result, &necp_result_parameter); + flags, (flags & IP_OUTARGS) ? ipoa : NULL, ro ? ro->ro_rt : NULL, &necp_result, &necp_result_parameter); if (necp_matched_policy_id) { necp_mark_packet_from_ip(m, necp_matched_policy_id); switch (necp_result) { @@ -1512,7 +1527,11 @@ sendit: 7, 0xff, 0xff, 0xff, 0xff); /* Pass to filters again */ - if (!TAILQ_EMPTY(&ipv4_filters)) { + if (!TAILQ_EMPTY(&ipv4_filters) +#if NECP + && !necp_packet_should_skip_filters(m) +#endif // NECP + ) { struct ipfilter *filter; ipf_pktopts.ippo_flags &= ~IPPOF_MCAST_OPTS; @@ -1850,16 +1869,6 @@ pass: } } - /* - * Some Wi-Fi AP implementations do not correctly handle multicast IP - * packets with DSCP bits set -- see radr://9331522 -- so as a - * workaround we clear the DSCP bits and set the service class to BE - */ - if (IN_MULTICAST(ntohl(pkt_dst.s_addr)) && IFNET_IS_WIFI_INFRA(ifp)) { - ip->ip_tos &= IPTOS_ECN_MASK; - mbuf_set_service_class(m, MBUF_SC_BE); - } - ip_output_checksum(ifp, m, (IP_VHL_HL(ip->ip_vhl) << 2), ip->ip_len, &sw_csum); @@ -2559,6 +2568,7 @@ ip_ctloutput(struct socket *so, struct sockopt *sopt) { struct inpcb *inp = sotoinpcb(so); int error, optval; + lck_mtx_t *mutex_held = NULL; error = optval = 0; if (sopt->sopt_level != IPPROTO_IP) { @@ -2567,6 +2577,21 @@ ip_ctloutput(struct socket *so, struct sockopt *sopt) switch (sopt->sopt_dir) { case SOPT_SET: + mutex_held = socket_getlock(so, PR_F_WILLUNLOCK); + /* + * Wait if we are in the middle of ip_output + * as we unlocked the socket there and don't + * want to overwrite the IP options + */ + if (inp->inp_sndinprog_cnt > 0) { + inp->inp_sndingprog_waiters++; + + while (inp->inp_sndinprog_cnt > 0) { + msleep(&inp->inp_sndinprog_cnt, mutex_held, + PSOCK | PCATCH, "inp_sndinprog_cnt", NULL); + } + inp->inp_sndingprog_waiters--; + } switch (sopt->sopt_name) { #ifdef notyet case IP_RETOPTS: diff --git a/bsd/netinet/ip_var.h b/bsd/netinet/ip_var.h index 9c2f73035..a9ecaa856 100644 --- a/bsd/netinet/ip_var.h +++ b/bsd/netinet/ip_var.h @@ -248,7 +248,7 @@ struct ip_moptions; /* flags passed to ip_output as last parameter */ #define IP_FORWARDING 0x1 /* most of ip header exists */ #define IP_RAWOUTPUT 0x2 /* raw ip header exists */ -#define IP_NOIPSEC 0x4 /* No IPSec processing */ +#define IP_NOIPSEC 0x4 /* No IPsec processing */ #define IP_ROUTETOIF SO_DONTROUTE /* bypass routing tables (0x0010) */ #define IP_ALLOWBROADCAST SO_BROADCAST /* can send broadcast pkts (0x0020) */ #define IP_OUTARGS 0x100 /* has ancillary output info */ @@ -297,6 +297,7 @@ struct ip_out_args { #define IPOAF_AWDL_UNRESTRICTED 0x00000040 /* can send over * AWDL_RESTRICTED */ #define IPOAF_QOSMARKING_ALLOWED 0x00000080 /* policy allows Fastlane DSCP marking */ +#define IPOAF_NO_CONSTRAINED 0x00000100 /* skip IFXF_CONSTRAINED */ u_int32_t ipoa_retflags; /* IPOARF return flags (see below) */ #define IPOARF_IFDENIED 0x00000001 /* denied access to interface */ int ipoa_sotc; /* traffic class for Fastlane DSCP mapping */ diff --git a/bsd/netinet/kpi_ipfilter.c b/bsd/netinet/kpi_ipfilter.c index 07ac30572..b783eb214 100644 --- a/bsd/netinet/kpi_ipfilter.c +++ b/bsd/netinet/kpi_ipfilter.c @@ -85,7 +85,9 @@ __private_extern__ void ipf_ref(void) { lck_mtx_lock(kipf_lock); - kipf_ref++; + if (os_inc_overflow(&kipf_ref)) { + panic("kipf_ref overflow"); + } lck_mtx_unlock(kipf_lock); } @@ -94,11 +96,10 @@ ipf_unref(void) { lck_mtx_lock(kipf_lock); - if (kipf_ref == 0) { - panic("ipf_unref: kipf_ref == 0\n"); + if (os_dec_overflow(&kipf_ref)) { + panic("kipf_ref underflow"); } - kipf_ref--; if (kipf_ref == 0 && kipf_delayed_remove != 0) { struct ipfilter *filter; @@ -434,6 +435,9 @@ ipf_injectv4_out(mbuf_t data, ipfilter_t filter_ref, ipf_pktopts_t options) if (options->ippo_flags & IPPOF_NO_IFF_EXPENSIVE) { ipoa.ipoa_flags |= IPOAF_NO_EXPENSIVE; } + if (options->ippo_flags & IPPOF_NO_IFF_CONSTRAINED) { + ipoa.ipoa_flags |= IPOAF_NO_CONSTRAINED; + } } bzero(&ro, sizeof(struct route)); @@ -521,6 +525,9 @@ ipf_injectv6_out(mbuf_t data, ipfilter_t filter_ref, ipf_pktopts_t options) if (options->ippo_flags & IPPOF_NO_IFF_EXPENSIVE) { ip6oa.ip6oa_flags |= IP6OAF_NO_EXPENSIVE; } + if (options->ippo_flags & IPPOF_NO_IFF_CONSTRAINED) { + ip6oa.ip6oa_flags |= IP6OAF_NO_CONSTRAINED; + } } bzero(&ro, sizeof(struct route_in6)); diff --git a/bsd/netinet/kpi_ipfilter.h b/bsd/netinet/kpi_ipfilter.h index 1739bc708..0aafb5c29 100644 --- a/bsd/netinet/kpi_ipfilter.h +++ b/bsd/netinet/kpi_ipfilter.h @@ -36,7 +36,12 @@ #ifndef __KPI_IPFILTER__ #define __KPI_IPFILTER__ -#include +#ifndef PRIVATE +#include +#define __NKE_API_DEPRECATED __API_DEPRECATED("Network Kernel Extension KPI is deprecated", macos(10.4, 10.15)) +#else +#define __NKE_API_DEPRECATED +#endif /* PRIVATE */ /* * ipf_pktopts @@ -50,14 +55,15 @@ struct ipf_pktopts { int ippo_mcast_loop; u_int8_t ippo_mcast_ttl; }; -#define IPPOF_MCAST_OPTS 0x1 +#define IPPOF_MCAST_OPTS 0x1 #ifdef PRIVATE -#define IPPOF_BOUND_IF 0x2 -#define IPPOF_NO_IFT_CELLULAR 0x4 -#define IPPOF_SELECT_SRCIF 0x8 -#define IPPOF_BOUND_SRCADDR 0x10 -#define IPPOF_SHIFT_IFSCOPE 16 -#define IPPOF_NO_IFF_EXPENSIVE 0x20 +#define IPPOF_BOUND_IF 0x2 +#define IPPOF_NO_IFT_CELLULAR 0x4 +#define IPPOF_SELECT_SRCIF 0x8 +#define IPPOF_BOUND_SRCADDR 0x10 +#define IPPOF_SHIFT_IFSCOPE 16 +#define IPPOF_NO_IFF_EXPENSIVE 0x20 +#define IPPOF_NO_IFF_CONSTRAINED 0x40 #endif /* PRIVATE */ typedef struct ipf_pktopts *ipf_pktopts_t; @@ -72,7 +78,7 @@ __BEGIN_DECLS * filter is called between when the general IP processing is * handled and when the packet is passed up to the next layer * protocol such as udp or tcp. In the case of encapsulation, such - * as UDP in ESP (IPSec), your filter will be called once for ESP + * as UDP in ESP (IPsec), your filter will be called once for ESP * and then again for UDP. This will give your filter an * opportunity to process the ESP header as well as the decrypted * packet. Offset and protocol are used to determine where in the @@ -101,7 +107,7 @@ typedef errno_t (*ipf_input_func)(void *cookie, mbuf_t *data, int offset, * * @discussion ipf_output_func is used to filter outbound ip packets. * The IP filter is called for packets to all interfaces. The - * filter is called before fragmentation and IPSec processing. If + * filter is called before fragmentation and IPsec processing. If * you need to change the destination IP address, call * ipf_inject_output and return EJUSTRETURN. * @param cookie The cookie specified when your filter was attached. @@ -164,7 +170,8 @@ extern errno_t ipf_addv4_internal(const struct ipf_filter *filter, ipf_addv4_internal((filter), (filter_ref)) #else extern errno_t ipf_addv4(const struct ipf_filter *filter, - ipfilter_t *filter_ref); + ipfilter_t *filter_ref) +__NKE_API_DEPRECATED; #endif /* KERNEL_PRIVATE */ /*! @@ -182,7 +189,8 @@ extern errno_t ipf_addv6_internal(const struct ipf_filter *filter, ipf_addv6_internal((filter), (filter_ref)) #else extern errno_t ipf_addv6(const struct ipf_filter *filter, - ipfilter_t *filter_ref); + ipfilter_t *filter_ref) +__NKE_API_DEPRECATED; #endif /* KERNEL_PRIVATE */ /*! @@ -192,7 +200,8 @@ extern errno_t ipf_addv6(const struct ipf_filter *filter, * ipf_addv6. * @result 0 on success otherwise the errno error. */ -extern errno_t ipf_remove(ipfilter_t filter_ref); +extern errno_t ipf_remove(ipfilter_t filter_ref) +__NKE_API_DEPRECATED; /*! * @function ipf_inject_input @@ -212,7 +221,8 @@ extern errno_t ipf_remove(ipfilter_t filter_ref); * @param filter_ref The reference to the filter injecting the data * @result 0 on success otherwise the errno error. */ -extern errno_t ipf_inject_input(mbuf_t data, ipfilter_t filter_ref); +extern errno_t ipf_inject_input(mbuf_t data, ipfilter_t filter_ref) +__NKE_API_DEPRECATED; /*! * @function ipf_inject_output @@ -231,7 +241,8 @@ extern errno_t ipf_inject_input(mbuf_t data, ipfilter_t filter_ref); * will always free the mbuf. */ extern errno_t ipf_inject_output(mbuf_t data, ipfilter_t filter_ref, - ipf_pktopts_t options); + ipf_pktopts_t options) +__NKE_API_DEPRECATED; __END_DECLS #endif /* __KPI_IPFILTER__ */ diff --git a/bsd/netinet/mp_pcb.c b/bsd/netinet/mp_pcb.c index 6aabe6b8d..581be9c54 100644 --- a/bsd/netinet/mp_pcb.c +++ b/bsd/netinet/mp_pcb.c @@ -59,6 +59,15 @@ static boolean_t mp_ticking; static void mp_sched_timeout(void); static void mp_timeout(void *); +static void +mpp_lock_assert_held(struct mppcb *mp) +{ +#if !MACH_ASSERT +#pragma unused(mp) +#endif + LCK_MTX_ASSERT(&mp->mpp_lock, LCK_MTX_ASSERT_OWNED); +} + void mp_pcbinit(void) { @@ -222,7 +231,7 @@ mp_pcballoc(struct socket *so, struct mppcbinfo *mppi) mpp->mpp_socket = so; so->so_pcb = mpp; - error = mptcp_sescreate(mpp); + error = mptcp_session_create(mpp); if (error) { lck_mtx_destroy(&mpp->mpp_lock, mppi->mppi_lock_grp); zfree(mppi->mppi_zone, mpp); @@ -233,6 +242,7 @@ mp_pcballoc(struct socket *so, struct mppcbinfo *mppi) mpp->mpp_flags |= MPP_ATTACHED; TAILQ_INSERT_TAIL(&mppi->mppi_pcbs, mpp, mpp_entry); mppi->mppi_count++; + lck_mtx_unlock(&mppi->mppi_lock); return 0; @@ -244,9 +254,6 @@ mp_pcbdetach(struct socket *mp_so) struct mppcb *mpp = mpsotomppcb(mp_so); mpp->mpp_state = MPPCB_STATE_DEAD; - if (!(mp_so->so_flags & SOF_PCBCLEARING)) { - mp_so->so_flags |= SOF_PCBCLEARING; - } mp_gc_sched(); } @@ -269,6 +276,16 @@ mp_pcbdispose(struct mppcb *mpp) VERIFY(mppi->mppi_count != 0); mppi->mppi_count--; + if (mppi->mppi_count == 0) { + if (mptcp_cellicon_refcount) { + os_log_error(mptcp_log_handle, "%s: No more MPTCP-flows, but cell icon counter is %u\n", + __func__, mptcp_cellicon_refcount); + mptcp_clear_cellicon(); + mptcp_cellicon_refcount = 0; + } + } + + VERIFY(mpp->mpp_inside == 0); mpp_unlock(mpp); #if NECP diff --git a/bsd/netinet/mp_pcb.h b/bsd/netinet/mp_pcb.h index 0fc2a103d..e2cce3f7b 100644 --- a/bsd/netinet/mp_pcb.h +++ b/bsd/netinet/mp_pcb.h @@ -54,6 +54,7 @@ struct mppcb { struct socket *mpp_socket; /* back pointer to socket */ uint32_t mpp_flags; /* PCB flags */ mppcb_state_t mpp_state; /* PCB state */ + int32_t mpp_inside; /* Indicates whether or not a thread is processing MPTCP */ #if NECP uuid_t necp_client_uuid; @@ -72,19 +73,17 @@ mpsotomppcb(struct socket *mp_so) #define MPP_ATTACHED 0x001 #define MPP_INSIDE_OUTPUT 0x002 /* MPTCP-stack is inside mptcp_subflow_output */ #define MPP_INSIDE_INPUT 0x004 /* MPTCP-stack is inside mptcp_subflow_input */ -#define MPP_RUPCALL 0x008 /* MPTCP-stack is handling a read upcall */ +#define MPP_INPUT_HANDLE 0x008 /* MPTCP-stack is handling input */ #define MPP_WUPCALL 0x010 /* MPTCP-stack is handling a read upcall */ #define MPP_SHOULD_WORKLOOP 0x020 /* MPTCP-stack should call the workloop function */ #define MPP_SHOULD_RWAKEUP 0x040 /* MPTCP-stack should call sorwakeup */ #define MPP_SHOULD_WWAKEUP 0x080 /* MPTCP-stack should call sowwakeup */ #define MPP_CREATE_SUBFLOWS 0x100 /* This connection needs to create subflows */ -#define MPP_SET_CELLICON 0x200 /* Set the cellicon (deferred) */ -#define MPP_UNSET_CELLICON 0x400 /* Unset the cellicon (deferred) */ static inline boolean_t mptcp_should_defer_upcall(struct mppcb *mpp) { - return !!(mpp->mpp_flags & (MPP_INSIDE_OUTPUT | MPP_INSIDE_INPUT | MPP_RUPCALL | MPP_WUPCALL)); + return !!(mpp->mpp_flags & (MPP_INSIDE_OUTPUT | MPP_INSIDE_INPUT | MPP_INPUT_HANDLE | MPP_WUPCALL)); } /* diff --git a/bsd/netinet/mptcp.c b/bsd/netinet/mptcp.c index c40a144a5..a2883309e 100644 --- a/bsd/netinet/mptcp.c +++ b/bsd/netinet/mptcp.c @@ -111,8 +111,12 @@ int mptcp_enable = 1; SYSCTL_INT(_net_inet_mptcp, OID_AUTO, enable, CTLFLAG_RW | CTLFLAG_LOCKED, &mptcp_enable, 0, "Enable Multipath TCP Support"); -/* Number of times to try negotiating MPTCP on SYN retransmissions */ -int mptcp_mpcap_retries = MPTCP_CAPABLE_RETRIES; +/* + * Number of times to try negotiating MPTCP on SYN retransmissions. + * We haven't seen any reports of a middlebox that is dropping all SYN-segments + * that have an MPTCP-option. Thus, let's be generous and retransmit it 4 times. + */ +int mptcp_mpcap_retries = 4; SYSCTL_INT(_net_inet_mptcp, OID_AUTO, mptcp_cap_retr, CTLFLAG_RW | CTLFLAG_LOCKED, &mptcp_mpcap_retries, 0, "Number of MP Capable SYN Retries"); @@ -178,7 +182,8 @@ static uint16_t mptcp_input_csum(struct tcpcb *, struct mbuf *, uint64_t, static int mptcp_reass_present(struct socket *mp_so) { - struct mptcb *mp_tp = mpsotomppcb(mp_so)->mpp_pcbe->mpte_mptcb; + struct mptses *mpte = mpsotompte(mp_so); + struct mptcb *mp_tp = mpte->mpte_mptcb; struct tseg_qent *q; int dowakeup = 0; int flags = 0; @@ -363,11 +368,11 @@ mptcp_input(struct mptses *mpte, struct mbuf *m) VERIFY(m->m_flags & M_PKTHDR); - mpte_lock_assert_held(mpte); /* same as MP socket lock */ - mp_so = mptetoso(mpte); mp_tp = mpte->mpte_mptcb; + socket_lock_assert_owned(mp_so); + DTRACE_MPTCP(input); mp_tp->mpt_rcvwnd = mptcp_sbspace(mp_tp); @@ -437,9 +442,6 @@ fallback: mptcp_close_fsm(mp_tp, MPCE_RECV_DATA_FIN); socantrcvmore(mp_so); } - - mptcplog((LOG_DEBUG, "%s: Fallback read %d bytes\n", __func__, - count), MPTCP_RECEIVER_DBG, MPTCP_LOGLVL_VERBOSE); return; } @@ -449,6 +451,8 @@ fallback: int64_t todrop; int mb_dfin = 0; + VERIFY(m->m_flags & M_PKTHDR); + /* If fallback occurs, mbufs will not have PKTF_MPTCP set */ if (!(m->m_pkthdr.pkt_flags & PKTF_MPTCP)) { goto fallback; @@ -482,6 +486,11 @@ fallback: if (todrop > 0) { tcpstat.tcps_mptcp_rcvpackafterwin++; + os_log_info(mptcp_log_handle, "%s - %lx: dropping dsn %u dlen %u rcvnxt %u rcvwnd %u todrop %lld\n", + __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), + (uint32_t)mb_dsn, mb_datalen, (uint32_t)mp_tp->mpt_rcvnxt, + mp_tp->mpt_rcvwnd, todrop); + if (todrop >= mb_datalen) { if (freelist == NULL) { freelist = m; @@ -501,6 +510,7 @@ fallback: } else { m_adj(m, -todrop); mb_datalen -= todrop; + m->m_pkthdr.mp_rlen -= todrop; } /* @@ -510,7 +520,6 @@ fallback: m->m_pkthdr.pkt_flags &= ~PKTF_MPTCP_DFIN; } - if (MPTCP_SEQ_LT(mb_dsn, mp_tp->mpt_rcvnxt)) { if (MPTCP_SEQ_LEQ((mb_dsn + mb_datalen), mp_tp->mpt_rcvnxt)) { @@ -531,10 +540,11 @@ fallback: continue; } else { m_adj(m, (mp_tp->mpt_rcvnxt - mb_dsn)); + mb_datalen -= (mp_tp->mpt_rcvnxt - mb_dsn); + mb_dsn = mp_tp->mpt_rcvnxt; + m->m_pkthdr.mp_rlen = mb_datalen; + m->m_pkthdr.mp_dsn = mb_dsn; } - mptcplog((LOG_INFO, "%s: Left Edge %llu\n", __func__, - mp_tp->mpt_rcvnxt), - MPTCP_RECEIVER_DBG, MPTCP_LOGLVL_VERBOSE); } if (MPTCP_SEQ_GT(mb_dsn, mp_tp->mpt_rcvnxt) || @@ -559,8 +569,6 @@ fallback: count = mp_so->so_rcv.sb_cc - count; tcpstat.tcps_mp_rcvtotal++; tcpstat.tcps_mp_rcvbytes += count; - mptcplog((LOG_DEBUG, "%s: Read %d bytes\n", __func__, count), - MPTCP_RECEIVER_DBG, MPTCP_LOGLVL_VERBOSE); mp_tp->mpt_rcvnxt += count; @@ -637,41 +645,29 @@ mptcp_output(struct mptses *mpte) uint64_t old_snd_nxt; int error = 0; - mpte_lock_assert_held(mpte); mp_so = mptetoso(mpte); + socket_lock_assert_owned(mp_so); mp_tp = mpte->mpte_mptcb; VERIFY(!(mpte->mpte_mppcb->mpp_flags & MPP_WUPCALL)); mpte->mpte_mppcb->mpp_flags |= MPP_WUPCALL; - mptcplog((LOG_DEBUG, "%s: snxt %u sndmax %u suna %u swnd %u reinjectq %u state %u\n", - __func__, (uint32_t)mp_tp->mpt_sndnxt, (uint32_t)mp_tp->mpt_sndmax, - (uint32_t)mp_tp->mpt_snduna, mp_tp->mpt_sndwnd, - mpte->mpte_reinjectq ? 1 : 0, - mp_tp->mpt_state), - MPTCP_SENDER_DBG, MPTCP_LOGLVL_VERBOSE); - old_snd_nxt = mp_tp->mpt_sndnxt; while (mptcp_can_send_more(mp_tp, FALSE)) { /* get the "best" subflow to be used for transmission */ - mpts = mptcp_get_subflow(mpte, NULL, &preferred_mpts); + mpts = mptcp_get_subflow(mpte, &preferred_mpts); if (mpts == NULL) { mptcplog((LOG_INFO, "%s: no subflow\n", __func__), MPTCP_SENDER_DBG, MPTCP_LOGLVL_LOG); break; } - mptcplog((LOG_DEBUG, "%s: using id %u\n", __func__, mpts->mpts_connid), - MPTCP_SENDER_DBG, MPTCP_LOGLVL_VERBOSE); - /* In case there's just one flow, we reattempt later */ if (mpts_tried != NULL && (mpts == mpts_tried || (mpts->mpts_flags & MPTSF_FAILINGOVER))) { mpts_tried->mpts_flags &= ~MPTSF_FAILINGOVER; mpts_tried->mpts_flags |= MPTSF_ACTIVE; mptcp_start_timer(mpte, MPTT_REXMT); - mptcplog((LOG_DEBUG, "%s: retry later\n", __func__), - MPTCP_SENDER_DBG, MPTCP_LOGLVL_VERBOSE); break; } @@ -691,11 +687,6 @@ mptcp_output(struct mptses *mpte) min(mp_so->so_snd.sb_hiwat + tcp_autosndbuf_inc, tcp_autosndbuf_max)) == 1) { mp_so->so_snd.sb_idealsize = mp_so->so_snd.sb_hiwat; - - mptcplog((LOG_DEBUG, "%s: increased snd hiwat to %u lowat %u\n", - __func__, mp_so->so_snd.sb_hiwat, - mp_so->so_snd.sb_lowat), - MPTCP_SENDER_DBG, MPTCP_LOGLVL_VERBOSE); } } } @@ -709,9 +700,9 @@ mptcp_output(struct mptses *mpte) mpts->mpts_flags &= ~MPTSF_ACTIVE; mpts_tried = mpts; if (error != ECANCELED) { - mptcplog((LOG_ERR, "%s: Error = %d mpts_flags %#x\n", __func__, - error, mpts->mpts_flags), - MPTCP_SENDER_DBG, MPTCP_LOGLVL_ERR); + os_log_error(mptcp_log_handle, "%s - %lx: Error = %d mpts_flags %#x\n", + __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), + error, mpts->mpts_flags); } break; } @@ -738,14 +729,6 @@ mptcp_output(struct mptses *mpte) if (mpte->mpte_active_sub == NULL) { mpte->mpte_active_sub = mpts; } else if (mpte->mpte_active_sub != mpts) { - struct tcpcb *tp = sototcpcb(mpts->mpts_socket); - struct tcpcb *acttp = sototcpcb(mpte->mpte_active_sub->mpts_socket); - - mptcplog((LOG_DEBUG, "%s: switch [%u, srtt %d] to [%u, srtt %d]\n", __func__, - mpte->mpte_active_sub->mpts_connid, acttp->t_srtt >> TCP_RTT_SHIFT, - mpts->mpts_connid, tp->t_srtt >> TCP_RTT_SHIFT), - (MPTCP_SENDER_DBG | MPTCP_SOCKET_DBG), MPTCP_LOGLVL_LOG); - mpte->mpte_active_sub->mpts_flags &= ~MPTSF_ACTIVE; mpte->mpte_active_sub = mpts; @@ -807,11 +790,25 @@ mptcp_return_subflow(struct mptsub *mpts) return mpts; } +static boolean_t +mptcp_subflow_is_slow(struct mptses *mpte, struct mptsub *mpts) +{ + struct tcpcb *tp = sototcpcb(mpts->mpts_socket); + int fail_thresh = mptcp_fail_thresh; + + if (mpte->mpte_svctype == MPTCP_SVCTYPE_HANDOVER) { + fail_thresh *= 2; + } + + return tp->t_rxtshift >= fail_thresh && + (mptetoso(mpte)->so_snd.sb_cc || mpte->mpte_reinjectq); +} + /* * Return the most eligible subflow to be used for sending data. */ struct mptsub * -mptcp_get_subflow(struct mptses *mpte, struct mptsub *ignore, struct mptsub **preferred) +mptcp_get_subflow(struct mptses *mpte, struct mptsub **preferred) { struct tcpcb *besttp, *secondtp; struct inpcb *bestinp, *secondinp; @@ -830,8 +827,8 @@ mptcp_get_subflow(struct mptses *mpte, struct mptsub *ignore, struct mptsub **pr struct tcpcb *tp = sototcpcb(so); struct inpcb *inp = sotoinpcb(so); - mptcplog((LOG_DEBUG, "%s mpts %u ignore %d, mpts_flags %#x, suspended %u sostate %#x tpstate %u cellular %d rtt %u rxtshift %u cheap %u exp %u cwnd %d\n", - __func__, mpts->mpts_connid, ignore ? ignore->mpts_connid : -1, mpts->mpts_flags, + mptcplog((LOG_DEBUG, "%s mpts %u mpts_flags %#x, suspended %u sostate %#x tpstate %u cellular %d rtt %u rxtshift %u cheap %u exp %u cwnd %d\n", + __func__, mpts->mpts_connid, mpts->mpts_flags, INP_WAIT_FOR_IF_FEEDBACK(inp), so->so_state, tp->t_state, inp->inp_last_outifp ? IFNET_IS_CELLULAR(inp->inp_last_outifp) : -1, tp->t_srtt, tp->t_rxtshift, cheap_rtt, exp_rtt, @@ -842,7 +839,7 @@ mptcp_get_subflow(struct mptses *mpte, struct mptsub *ignore, struct mptsub **pr * First, the hard conditions to reject subflows * (e.g., not connected,...) */ - if (mpts == ignore || inp->inp_last_outifp == NULL) { + if (inp->inp_last_outifp == NULL) { continue; } @@ -920,7 +917,7 @@ mptcp_get_subflow(struct mptses *mpte, struct mptsub *ignore, struct mptsub **pr * Only handover if Symptoms tells us to do so. */ if (!IFNET_IS_CELLULAR(bestinp->inp_last_outifp) && - mptcp_is_wifi_unusable(mpte) != 0 && mptcp_subflow_is_bad(mpte, best)) { + mptcp_is_wifi_unusable_for_session(mpte) != 0 && mptcp_subflow_is_slow(mpte, best)) { return mptcp_return_subflow(second_best); } @@ -931,7 +928,7 @@ mptcp_get_subflow(struct mptses *mpte, struct mptsub *ignore, struct mptsub **pr /* Adjust with symptoms information */ if (!IFNET_IS_CELLULAR(bestinp->inp_last_outifp) && - mptcp_is_wifi_unusable(mpte) != 0) { + mptcp_is_wifi_unusable_for_session(mpte) != 0) { rtt_thresh /= 2; rto_thresh /= 2; } @@ -948,7 +945,7 @@ mptcp_get_subflow(struct mptses *mpte, struct mptsub *ignore, struct mptsub **pr return mptcp_return_subflow(second_best); } - if (mptcp_subflow_is_bad(mpte, best) && + if (mptcp_subflow_is_slow(mpte, best) && secondtp->t_rxtshift == 0) { return mptcp_return_subflow(second_best); } @@ -972,7 +969,7 @@ mptcp_get_subflow(struct mptses *mpte, struct mptsub *ignore, struct mptsub **pr * has some space in the congestion-window. */ return mptcp_return_subflow(best); - } else if (mpte->mpte_svctype == MPTCP_SVCTYPE_AGGREGATE) { + } else if (mpte->mpte_svctype >= MPTCP_SVCTYPE_AGGREGATE) { struct mptsub *tmp; /* @@ -1062,7 +1059,10 @@ mptcp_state_to_str(mptcp_state_t state) void mptcp_close_fsm(struct mptcb *mp_tp, uint32_t event) { - mpte_lock_assert_held(mp_tp->mpt_mpte); + struct socket *mp_so = mptetoso(mp_tp->mpt_mpte); + + socket_lock_assert_owned(mp_so); + mptcp_state_t old_state = mp_tp->mpt_state; DTRACE_MPTCP2(state__change, struct mptcb *, mp_tp, @@ -1161,20 +1161,16 @@ mptcp_update_rcv_state_meat(struct mptcb *mp_tp, struct tcpcb *tp, uint16_t csum) { if (mdss_data_len == 0) { - mptcplog((LOG_INFO, "%s: Infinite Mapping.\n", __func__), - MPTCP_RECEIVER_DBG, MPTCP_LOGLVL_LOG); + os_log_error(mptcp_log_handle, "%s - %lx: Infinite Mapping.\n", + __func__, (unsigned long)VM_KERNEL_ADDRPERM(mp_tp->mpt_mpte)); if ((mp_tp->mpt_flags & MPTCPF_CHECKSUM) && (csum != 0)) { - mptcplog((LOG_ERR, "%s: Bad checksum %x \n", __func__, - csum), MPTCP_RECEIVER_DBG, MPTCP_LOGLVL_ERR); + os_log_error(mptcp_log_handle, "%s - %lx: Bad checksum %x \n", + __func__, (unsigned long)VM_KERNEL_ADDRPERM(mp_tp->mpt_mpte), csum); } mptcp_notify_mpfail(tp->t_inpcb->inp_socket); return; } - mptcplog((LOG_DEBUG, - "%s: seqn = %u len = %u full = %u rcvnxt = %u \n", __func__, - seqn, mdss_data_len, (uint32_t)full_dsn, (uint32_t)mp_tp->mpt_rcvnxt), - MPTCP_RECEIVER_DBG, MPTCP_LOGLVL_VERBOSE); mptcp_notify_mpready(tp->t_inpcb->inp_socket); @@ -1200,9 +1196,8 @@ mptcp_validate_dss_map(struct socket *so, struct tcpcb *tp, struct mbuf *m, /* unacceptable DSS option, fallback to TCP */ if (m->m_pkthdr.len > ((int) datalen + hdrlen)) { - mptcplog((LOG_ERR, "%s: mbuf len %d, MPTCP expected %d", - __func__, m->m_pkthdr.len, datalen), - MPTCP_RECEIVER_DBG, MPTCP_LOGLVL_LOG); + os_log_error(mptcp_log_handle, "%s - %lx: mbuf len %d, MPTCP expected %d", + __func__, (unsigned long)VM_KERNEL_ADDRPERM(tptomptp(tp)->mpt_mpte), m->m_pkthdr.len, datalen); } else { return 0; } @@ -1369,18 +1364,6 @@ mptcp_handle_deferred_upcalls(struct mppcb *mpp, uint32_t flag) sowwakeup(mpp->mpp_socket); } - - if (mpp->mpp_flags & MPP_SET_CELLICON) { - mpp->mpp_flags &= ~MPP_SET_CELLICON; - - mptcp_set_cellicon(mpp->mpp_pcbe); - } - - if (mpp->mpp_flags & MPP_UNSET_CELLICON) { - mpp->mpp_flags &= ~MPP_UNSET_CELLICON; - - mptcp_unset_cellicon(); - } } void @@ -1396,10 +1379,7 @@ mptcp_ask_for_nat64(struct ifnet *ifp) static void mptcp_reset_itfinfo(struct mpt_itf_info *info) { - info->ifindex = 0; - info->has_v4_conn = 0; - info->has_v6_conn = 0; - info->has_nat64_conn = 0; + memset(info, 0, sizeof(*info)); } void @@ -1425,8 +1405,10 @@ mptcp_session_necp_cb(void *handle, int action, uint32_t interface_index, return; } + mp_so = mptetoso(mpte); + if (action != NECP_CLIENT_CBACTION_INITIAL) { - mpte_lock(mpte); + socket_lock(mp_so, 1); locked = 1; /* Check again, because it might have changed while waiting */ @@ -1435,13 +1417,13 @@ mptcp_session_necp_cb(void *handle, int action, uint32_t interface_index, } } - mpte_lock_assert_held(mpte); + socket_lock_assert_owned(mp_so); mp_tp = mpte->mpte_mptcb; - mp_so = mptetoso(mpte); - os_log_info(mptcp_log_handle, "%s, action: %u ifindex %u usecount %u mpt_flags %#x state %u v4 %u v6 %u nat64 %u power %u\n", - __func__, action, ifindex, mp->mpp_socket->so_usecount, mp_tp->mpt_flags, mp_tp->mpt_state, + os_log_info(mptcp_log_handle, "%s - %lx: action: %u ifindex %u usecount %u mpt_flags %#x state %u v4 %u v6 %u nat64 %u power %u\n", + __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), action, ifindex, + mp->mpp_socket->so_usecount, mp_tp->mpt_flags, mp_tp->mpt_state, has_v4, has_v6, has_nat64, low_power); /* No need on fallen back sockets */ @@ -1472,6 +1454,7 @@ mptcp_session_necp_cb(void *handle, int action, uint32_t interface_index, } else if (action == NECP_CLIENT_CBACTION_VIABLE || action == NECP_CLIENT_CBACTION_INITIAL) { int found_slot = 0, slot_index = -1; + struct sockaddr *dst; struct ifnet *ifp; ifnet_head_lock_shared(); @@ -1487,6 +1470,11 @@ mptcp_session_necp_cb(void *handle, int action, uint32_t interface_index, goto out; } + if (IFNET_IS_CONSTRAINED(ifp) && + (mp_so->so_restrictions & SO_RESTRICT_DENY_CONSTRAINED)) { + goto out; + } + if (IFNET_IS_CELLULAR(ifp) && (mp_so->so_restrictions & SO_RESTRICT_DENY_CELLULAR)) { goto out; @@ -1526,8 +1514,9 @@ mptcp_session_necp_cb(void *handle, int action, uint32_t interface_index, } } - if ((mpte->mpte_dst.sa_family == AF_INET || mpte->mpte_dst.sa_family == 0) && - !has_nat64 && !has_v4) { + dst = mptcp_get_session_dst(mpte, has_v6, has_v4); + if (dst && (dst->sa_family == AF_INET || dst->sa_family == 0) && + has_v6 && !has_nat64 && !has_v4) { if (found_slot) { mpte->mpte_itfinfo[slot_index].has_v4_conn = has_v4; mpte->mpte_itfinfo[slot_index].has_v6_conn = has_v6; @@ -1542,8 +1531,8 @@ mptcp_session_necp_cb(void *handle, int action, uint32_t interface_index, struct mpt_itf_info *info = _MALLOC(sizeof(*info) * new_size, M_TEMP, M_ZERO); if (info == NULL) { - os_log_error(mptcp_log_handle, "%s malloc failed for %u\n", - __func__, new_size); + os_log_error(mptcp_log_handle, "%s - %lx: malloc failed for %u\n", + __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), new_size); goto out; } @@ -1571,7 +1560,7 @@ mptcp_session_necp_cb(void *handle, int action, uint32_t interface_index, out: if (locked) { - mpte_unlock(mpte); + socket_unlock(mp_so, 1); } } @@ -1581,7 +1570,7 @@ mptcp_set_restrictions(struct socket *mp_so) struct mptses *mpte = mpsotompte(mp_so); uint32_t i; - mpte_lock_assert_held(mpte); + socket_lock_assert_owned(mp_so); ifnet_head_lock_shared(); @@ -1604,6 +1593,11 @@ mptcp_set_restrictions(struct socket *mp_so) info->ifindex = IFSCOPE_NONE; } + if (IFNET_IS_CONSTRAINED(ifp) && + (mp_so->so_restrictions & SO_RESTRICT_DENY_CONSTRAINED)) { + info->ifindex = IFSCOPE_NONE; + } + if (IFNET_IS_CELLULAR(ifp) && (mp_so->so_restrictions & SO_RESTRICT_DENY_CELLULAR)) { info->ifindex = IFSCOPE_NONE; diff --git a/bsd/netinet/mptcp.h b/bsd/netinet/mptcp.h index d98f50e83..122476f65 100644 --- a/bsd/netinet/mptcp.h +++ b/bsd/netinet/mptcp.h @@ -64,13 +64,13 @@ * Used to establish an MPTCP connection and first subflow. */ struct mptcp_mpcapable_opt_common { - u_int8_t mmco_kind; - u_int8_t mmco_len; + uint8_t mmco_kind; + uint8_t mmco_len; #if BYTE_ORDER == LITTLE_ENDIAN - u_int8_t mmco_version:4, + uint8_t mmco_version:4, mmco_subtype:4; #else /* BIG_ENDIAN */ - u_int8_t mmco_subtype:4, + uint8_t mmco_subtype:4, mmco_version:4; #endif #define MPCAP_PROPOSAL_SBIT 0x01 /* SHA1 Algorithm */ @@ -79,11 +79,10 @@ struct mptcp_mpcapable_opt_common { #define MPCAP_FBIT 0x04 /* must be 0 */ #define MPCAP_EBIT 0x08 /* must be 0 */ #define MPCAP_DBIT 0x10 /* must be 0 */ -#define MPCAP_CBIT 0x20 /* must be 0 */ +#define MPCAP_UNICAST_IPBIT 0x20 /* Should MPTCP only use ADD_ADDR IPs for new subflows */ #define MPCAP_BBIT 0x40 /* Extensibility bit, must be 0 */ -#define MPCAP_ABIT 0x80 /* alias of MPCAP_CHECKSUM_CBIT */ #define MPCAP_CHECKSUM_CBIT 0x80 /* DSS Checksum bit */ - u_int8_t mmco_flags; + uint8_t mmco_flags; } __attribute__((__packed__)); struct mptcp_mpcapable_opt_rsp { @@ -105,86 +104,53 @@ struct mptcp_mpcapable_opt_rsp1 { /* MP_JOIN Option for SYN */ struct mptcp_mpjoin_opt_req { - u_int8_t mmjo_kind; - u_int8_t mmjo_len; + uint8_t mmjo_kind; + uint8_t mmjo_len; #define MPTCP_BACKUP 0x1 - u_int8_t mmjo_subtype_bkp; - u_int8_t mmjo_addr_id; - u_int32_t mmjo_peer_token; - u_int32_t mmjo_rand; + uint8_t mmjo_subtype_bkp; + uint8_t mmjo_addr_id; + uint32_t mmjo_peer_token; + uint32_t mmjo_rand; } __attribute__((__packed__)); /* MP_JOIN Option for SYN/ACK */ struct mptcp_mpjoin_opt_rsp { - u_int8_t mmjo_kind; - u_int8_t mmjo_len; + uint8_t mmjo_kind; + uint8_t mmjo_len; #define MPTCP_BACKUP 0x1 - u_int8_t mmjo_subtype_bkp; - u_int8_t mmjo_addr_id; - u_int64_t mmjo_mac; /* Truncated message auth code */ - u_int32_t mmjo_rand; + uint8_t mmjo_subtype_bkp; + uint8_t mmjo_addr_id; + uint64_t mmjo_mac; /* Truncated message auth code */ + uint32_t mmjo_rand; } __attribute__((__packed__)); /* MP_Join Option for ACK */ struct mptcp_mpjoin_opt_rsp2 { - u_int8_t mmjo_kind; - u_int8_t mmjo_len; + uint8_t mmjo_kind; + uint8_t mmjo_len; #if BYTE_ORDER == LITTLE_ENDIAN - u_int8_t mmjo_reserved1:4, + uint8_t mmjo_reserved1:4, mmjo_subtype:4; #else /* BIG_ENDIAN */ - u_int8_t mmjo_subtype:4, + uint8_t mmjo_subtype:4, mmjo_reserved1:4; #endif - u_int8_t mmjo_reserved2; - u_int8_t mmjo_mac[SHA1_RESULTLEN]; /* This is 160 bits HMAC SHA-1 per RFC */ -} __attribute__((__packed__)); - - -/* - * MPTCP ADD_ADDR and REMOVE_ADDR TCP Options - * - * ADD_ADDR option shall be ignored by this implementation - * REMOVE_ADDR option shall be sent to help flush dead subflows - */ - -/* Add Address Option */ -struct mptcp_addaddr_opt { - u_int8_t ma_kind; - u_int8_t ma_len; -#if BYTE_ORDER == LITTLE_ENDIAN - u_int8_t ma_ipver:4, - ma_subtype:4; -#else /* BIG_ENDIAN */ - u_int8_t ma_subtype:4, - ma_ipver:4; -#endif -#define MA_IPVer_V4 4 /* IPv4 Address tagged to the option */ -#define MA_IPVer_V6 6 /* IPv6 Address tagged to the option */ - u_int8_t ma_addr_id; -} __attribute__((__packed__)); - -/* Address sent in the ADD_ADDR option */ -struct mptcp_addr_family_val { - union { - struct in_addr ma_v4_addr; - struct in6_addr ma_v6_addr; - } ma_addr; - /* u_int16_t ma_ports; */ /* optional field */ + uint8_t mmjo_reserved2; + uint8_t mmjo_mac[SHA1_RESULTLEN]; /* This is 160 bits HMAC SHA-1 per RFC */ } __attribute__((__packed__)); /* Remove Address Option */ struct mptcp_remaddr_opt { - u_int8_t mr_kind; - u_int8_t mr_len; + uint8_t mr_kind; + uint8_t mr_len; #if BYTE_ORDER == LITTLE_ENDIAN - u_int8_t mr_rest:4, + uint8_t mr_rest:4, mr_subtype:4; #else /* BIG_ENDIAN */ - u_int8_t mr_subtype:4, + uint8_t mr_subtype:4, mr_rest:4; #endif - u_int8_t mr_addr_id; + uint8_t mr_addr_id; } __attribute__((__packed__)); /* @@ -205,85 +171,85 @@ struct mptcp_remaddr_opt { /* DSS fields common to all DSS option variants */ struct mptcp_dss_copt { - u_int8_t mdss_kind; - u_int8_t mdss_len; + uint8_t mdss_kind; + uint8_t mdss_len; #if BYTE_ORDER == LITTLE_ENDIAN - u_int8_t mdss_reserved1:4, + uint8_t mdss_reserved1:4, mdss_subtype:4; #else /* BIG_ENDIAN */ - u_int8_t mdss_subtype:4, + uint8_t mdss_subtype:4, mdss_reserved1:4; #endif - u_int8_t mdss_flags; + uint8_t mdss_flags; }__attribute__((__packed__)); /* 32-bit DSS option */ struct mptcp_dsn_opt { struct mptcp_dss_copt mdss_copt; - u_int32_t mdss_dsn; /* Data Sequence Number */ - u_int32_t mdss_subflow_seqn; /* Relative Subflow Seq Num */ - u_int16_t mdss_data_len; /* Data Length */ - /* u_int16_t mdss_xsum; */ /* Data checksum - optional */ + uint32_t mdss_dsn; /* Data Sequence Number */ + uint32_t mdss_subflow_seqn; /* Relative Subflow Seq Num */ + uint16_t mdss_data_len; /* Data Length */ + /* uint16_t mdss_xsum; */ /* Data checksum - optional */ }__attribute__((__packed__)); /* 64-bit DSS option */ struct mptcp_dsn64_opt { struct mptcp_dss_copt mdss_copt; - u_int64_t mdss_dsn; /* Data Sequence Number */ - u_int32_t mdss_subflow_seqn; /* Relative Subflow Seq Num */ - u_int16_t mdss_data_len; /* Data Length */ - /* u_int16_t mdss_xsum; */ /* Data checksum - optional */ + uint64_t mdss_dsn; /* Data Sequence Number */ + uint32_t mdss_subflow_seqn; /* Relative Subflow Seq Num */ + uint16_t mdss_data_len; /* Data Length */ + /* uint16_t mdss_xsum; */ /* Data checksum - optional */ }__attribute__((__packed__)); /* 32-bit DSS Data ACK option */ struct mptcp_data_ack_opt { struct mptcp_dss_copt mdss_copt; - u_int32_t mdss_ack; + uint32_t mdss_ack; }__attribute__((__packed__)); /* 64-bit DSS Data ACK option */ struct mptcp_data_ack64_opt { struct mptcp_dss_copt mdss_copt; - u_int64_t mdss_ack; + uint64_t mdss_ack; }__attribute__((__packed__)); /* 32-bit DSS+Data ACK option */ struct mptcp_dss_ack_opt { struct mptcp_dss_copt mdss_copt; - u_int32_t mdss_ack; /* Data ACK */ - u_int32_t mdss_dsn; /* Data Sequence Number */ - u_int32_t mdss_subflow_seqn; /* Relative Subflow Seq Num */ - u_int16_t mdss_data_len; /* Data Length */ - /* u_int16_t mdss_xsum; */ /* Data checksum - optional */ + uint32_t mdss_ack; /* Data ACK */ + uint32_t mdss_dsn; /* Data Sequence Number */ + uint32_t mdss_subflow_seqn; /* Relative Subflow Seq Num */ + uint16_t mdss_data_len; /* Data Length */ + /* uint16_t mdss_xsum; */ /* Data checksum - optional */ }__attribute__((__packed__)); /* 64-bit DSS+Data ACK option */ struct mptcp_dss64_ack64_opt { struct mptcp_dss_copt mdss_copt; - u_int64_t mdss_ack; /* Data ACK */ - u_int64_t mdss_dsn; /* Data Sequence Number */ - u_int32_t mdss_subflow_seqn; /* Relative Subflow Seq Num */ - u_int16_t mdss_data_len; /* Data Length */ - /* u_int16_t mdss_xsum; */ /* Data checksum - optional */ + uint64_t mdss_ack; /* Data ACK */ + uint64_t mdss_dsn; /* Data Sequence Number */ + uint32_t mdss_subflow_seqn; /* Relative Subflow Seq Num */ + uint16_t mdss_data_len; /* Data Length */ + /* uint16_t mdss_xsum; */ /* Data checksum - optional */ }__attribute__((__packed__)); /* DSS+Data ACK mixed option variants */ struct mptcp_dss32_ack64_opt { struct mptcp_dss_copt mdss_copt; - u_int64_t mdss_ack; /* Data ACK */ - u_int32_t mdss_dsn; /* Data Sequence Number */ - u_int32_t mdss_subflow_seqn; /* Relative Subflow Seq Num */ - u_int16_t mdss_data_len; /* Data Length */ - /* u_int16_t mdss_xsum; */ /* Data checksum - optional */ + uint64_t mdss_ack; /* Data ACK */ + uint32_t mdss_dsn; /* Data Sequence Number */ + uint32_t mdss_subflow_seqn; /* Relative Subflow Seq Num */ + uint16_t mdss_data_len; /* Data Length */ + /* uint16_t mdss_xsum; */ /* Data checksum - optional */ }__attribute__((__packed__)); struct mptcp_dss64_ack32_opt { struct mptcp_dss_copt mdss_copt; - u_int32_t mdss_ack; /* Data ACK */ - u_int64_t mdss_dsn; /* Data Sequence Number */ - u_int32_t mdss_subflow_seqn; /* Relative Subflow Seq Num */ - u_int16_t mdss_data_len; /* Data Length */ - /* u_int16_t mdss_xsum; */ /* Data checksum - optional */ + uint32_t mdss_ack; /* Data ACK */ + uint64_t mdss_dsn; /* Data Sequence Number */ + uint32_t mdss_subflow_seqn; /* Relative Subflow Seq Num */ + uint16_t mdss_data_len; /* Data Length */ + /* uint16_t mdss_xsum; */ /* Data checksum - optional */ }__attribute__((__packed__)); @@ -295,17 +261,17 @@ struct mptcp_dss64_ack32_opt { * API is supported. */ struct mptcp_fastclose_opt { - u_int8_t mfast_kind; - u_int8_t mfast_len; + uint8_t mfast_kind; + uint8_t mfast_len; #if BYTE_ORDER == LITTLE_ENDIAN - u_int8_t mfast_reserved:4, + uint8_t mfast_reserved:4, mfast_subtype:4; #else /* BIG_ENDIAN */ - u_int8_t mfast_subtype:4, + uint8_t mfast_subtype:4, mfast_reserved:4; #endif - u_int8_t mfast_reserved1; - u_int64_t mfast_key; /* Option receiver's key */ + uint8_t mfast_reserved1; + uint64_t mfast_key; /* Option receiver's key */ }__attribute__((__packed__)); /* @@ -316,19 +282,44 @@ struct mptcp_fastclose_opt { * option. */ struct mptcp_mpfail_opt { - u_int8_t mfail_kind; - u_int8_t mfail_len; + uint8_t mfail_kind; + uint8_t mfail_len; #if BYTE_ORDER == LITTLE_ENDIAN - u_int8_t mfail_reserved:4, + uint8_t mfail_reserved:4, mfail_subtype:4; #else /* BIG_ENDIAN */ - u_int8_t mfail_subtype:4, + uint8_t mfail_subtype:4, mfail_reserved:4; #endif - u_int8_t mfail_reserved1:8; - u_int64_t mfail_dsn; + uint8_t mfail_reserved1:8; + uint64_t mfail_dsn; +}__attribute__((__packed__)); + +struct mptcp_add_addr_opt { + uint8_t maddr_kind; + uint8_t maddr_len; +#if BYTE_ORDER == LITTLE_ENDIAN + uint8_t maddr_ipversion:4, + maddr_subtype:4; +#else /* BIG_ENDIAN */ + uint8_t maddr_subtype:4, + maddr_ipversion:4; +#endif + uint8_t maddr_addrid; + union { + struct { + struct in_addr maddr_addrv4; + uint32_t maddr_pad[3]; + }; + + struct { + struct in6_addr maddr_addrv6; + }; + } maddr_u; }__attribute__((__packed__)); +#define MPTCP_ADD_ADDR_OPT_LEN_V4 8 +#define MPTCP_ADD_ADDR_OPT_LEN_V6 20 /* * MPTCP MP_PRIO Option @@ -340,31 +331,31 @@ struct mptcp_mpfail_opt { /* Option to change priority of self */ struct mptcp_mpprio_opt { - u_int8_t mpprio_kind; - u_int8_t mpprio_len; + uint8_t mpprio_kind; + uint8_t mpprio_len; #define MPTCP_MPPRIO_BKP 0x1 #if BYTE_ORDER == LITTLE_ENDIAN - u_int8_t mpprio_flags:4, + uint8_t mpprio_flags:4, mpprio_subtype:4; #else /* BIG_ENDIAN */ - u_int8_t mpprio_subtype:4, + uint8_t mpprio_subtype:4, mpprio_flags:4; #endif }__attribute__((__packed__)); /* Option to change priority of some other subflow(s) using addr_id */ struct mptcp_mpprio_addr_opt { - u_int8_t mpprio_kind; - u_int8_t mpprio_len; + uint8_t mpprio_kind; + uint8_t mpprio_len; #define MPTCP_MPPRIO_BKP 0x1 #if BYTE_ORDER == LITTLE_ENDIAN - u_int8_t mpprio_flags:4, + uint8_t mpprio_flags:4, mpprio_subtype:4; #else /* BIG_ENDIAN */ - u_int8_t mpprio_subtype:4, + uint8_t mpprio_subtype:4, mpprio_flags:4; #endif - u_int8_t mpprio_addrid; + uint8_t mpprio_addrid; }__attribute__((__packed__)); /* @@ -372,10 +363,10 @@ struct mptcp_mpprio_addr_opt { * */ struct mptcp_pseudohdr { - u_int64_t mphdr_dsn; /* Data Sequence Number */ - u_int32_t mphdr_ssn; /* Subflow Sequence Number */ - u_int16_t mphdr_len; /* Data-Level Length */ - u_int16_t mphdr_xsum; /* MPTCP Level Checksum */ + uint64_t mphdr_dsn; /* Data Sequence Number */ + uint32_t mphdr_ssn; /* Subflow Sequence Number */ + uint16_t mphdr_len; /* Data-Level Length */ + uint16_t mphdr_xsum; /* MPTCP Level Checksum */ }__attribute__((__packed__)); #endif /* BSD_KERNEL_PRIVATE */ diff --git a/bsd/netinet/mptcp_opt.c b/bsd/netinet/mptcp_opt.c index 377f0d567..6b63ab6e0 100644 --- a/bsd/netinet/mptcp_opt.c +++ b/bsd/netinet/mptcp_opt.c @@ -67,13 +67,22 @@ mptcp_setup_first_subflow_syn_opts(struct socket *so, u_char *opt, unsigned optl struct mptcp_mpcapable_opt_common mptcp_opt; struct tcpcb *tp = sototcpcb(so); struct mptcb *mp_tp = tptomptp(tp); + int ret; - mpte_lock_assert_held(mp_tp->mpt_mpte); + ret = tcp_heuristic_do_mptcp(tp); + if (ret > 0) { + os_log_info(mptcp_log_handle, "%s - %lx: Not doing MPTCP due to heuristics", + __func__, (unsigned long)VM_KERNEL_ADDRPERM(mp_tp->mpt_mpte)); + mp_tp->mpt_flags |= MPTCPF_FALLBACK_HEURISTIC; + return optlen; + } /* * Avoid retransmitting the MP_CAPABLE option. */ - if (tp->t_rxtshift > mptcp_mpcap_retries) { + if (ret == 0 && + tp->t_rxtshift > mptcp_mpcap_retries && + !(tptomptp(tp)->mpt_mpte->mpte_flags & MPTE_FORCE_ENABLE)) { if (!(mp_tp->mpt_flags & (MPTCPF_FALLBACK_HEURISTIC | MPTCPF_HEURISTIC_TRAC))) { mp_tp->mpt_flags |= MPTCPF_HEURISTIC_TRAC; tcp_heuristic_mptcp_loss(tp); @@ -81,11 +90,6 @@ mptcp_setup_first_subflow_syn_opts(struct socket *so, u_char *opt, unsigned optl return optlen; } - if (!tcp_heuristic_do_mptcp(tp)) { - mp_tp->mpt_flags |= MPTCPF_FALLBACK_HEURISTIC; - return optlen; - } - bzero(&mptcp_opt, sizeof(struct mptcp_mpcapable_opt_common)); mptcp_opt.mmco_kind = TCPOPT_MULTIPATH; @@ -125,9 +129,6 @@ mptcp_setup_join_subflow_syn_opts(struct socket *so, u_char *opt, unsigned optle mpts = tp->t_mpsub; - VERIFY(tptomptp(tp)); - mpte_lock_assert_held(tptomptp(tp)->mpt_mpte); - bzero(&mpjoin_req, sizeof(mpjoin_req)); mpjoin_req.mmjo_kind = TCPOPT_MULTIPATH; mpjoin_req.mmjo_len = sizeof(mpjoin_req); @@ -136,7 +137,7 @@ mptcp_setup_join_subflow_syn_opts(struct socket *so, u_char *opt, unsigned optle if (tp->t_mpflags & TMPF_BACKUP_PATH) { mpjoin_req.mmjo_subtype_bkp |= MPTCP_BACKUP; } else if (inp->inp_boundifp && IFNET_IS_CELLULAR(inp->inp_boundifp) && - mpts->mpts_mpte->mpte_svctype != MPTCP_SVCTYPE_AGGREGATE) { + mpts->mpts_mpte->mpte_svctype < MPTCP_SVCTYPE_AGGREGATE) { mpjoin_req.mmjo_subtype_bkp |= MPTCP_BACKUP; tp->t_mpflags |= TMPF_BACKUP_PATH; } else { @@ -209,8 +210,6 @@ mptcp_send_mpfail(struct tcpcb *tp, u_char *opt, unsigned int optlen) return optlen; } - mpte_lock_assert_held(mp_tp->mpt_mpte); - /* if option space low give up */ if ((MAX_TCPOPTLEN - optlen) < sizeof(struct mptcp_mpfail_opt)) { tp->t_mpflags &= ~TMPF_SND_MPFAIL; @@ -251,8 +250,6 @@ mptcp_send_infinite_mapping(struct tcpcb *tp, u_char *opt, unsigned int optlen) return optlen; } - mpte_lock_assert_held(mp_tp->mpt_mpte); - if (mp_tp->mpt_flags & MPTCPF_CHECKSUM) { csum_len = 2; } @@ -326,8 +323,6 @@ mptcp_ok_to_fin(struct tcpcb *tp, u_int64_t dsn, u_int32_t datalen) { struct mptcb *mp_tp = tptomptp(tp); - mpte_lock_assert_held(mp_tp->mpt_mpte); - dsn = (mp_tp->mpt_sndmax & MPTCP_DATASEQ_LOW32_MASK) | dsn; if ((dsn + datalen) == mp_tp->mpt_sndmax) { return 1; @@ -354,7 +349,7 @@ mptcp_setup_opts(struct tcpcb *tp, int32_t off, u_char *opt, goto ret_optlen; } - mpte_lock_assert_held(mp_tp->mpt_mpte); + socket_lock_assert_owned(mptetoso(mp_tp->mpt_mpte)); if (mp_tp->mpt_flags & MPTCPF_CHECKSUM) { do_csum = TRUE; @@ -873,12 +868,6 @@ mptcp_sanitize_option(struct tcpcb *tp, int mptcp_subtype) struct mptcb *mp_tp = tptomptp(tp); int ret = 1; - if (mp_tp == NULL) { - mptcplog((LOG_ERR, "%s: NULL mpsocket \n", __func__), - MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR); - return 0; - } - switch (mptcp_subtype) { case MPO_CAPABLE: break; @@ -895,9 +884,8 @@ mptcp_sanitize_option(struct tcpcb *tp, int mptcp_subtype) break; default: ret = 0; - mptcplog((LOG_ERR, "%s: type = %d \n", __func__, - mptcp_subtype), - MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR); + os_log_error(mptcp_log_handle, "%s - %lx: type = %d \n", __func__, + (unsigned long)VM_KERNEL_ADDRPERM(mp_tp->mpt_mpte), mptcp_subtype); break; } return ret; @@ -915,7 +903,7 @@ mptcp_valid_mpcapable_common_opt(u_char *cp) return 0; } - if (rsp->mmco_flags & (MPCAP_BBIT | MPCAP_CBIT | MPCAP_DBIT | + if (rsp->mmco_flags & (MPCAP_BBIT | MPCAP_DBIT | MPCAP_EBIT | MPCAP_FBIT | MPCAP_GBIT)) { return 0; } @@ -930,8 +918,7 @@ mptcp_do_mpcapable_opt(struct tcpcb *tp, u_char *cp, struct tcphdr *th, { struct mptcp_mpcapable_opt_rsp *rsp = NULL; struct mptcb *mp_tp = tptomptp(tp); - - mpte_lock_assert_held(mp_tp->mpt_mpte); + struct mptses *mpte = mp_tp->mpt_mpte; /* Only valid on SYN/ACK */ if ((th->th_flags & (TH_SYN | TH_ACK)) != (TH_SYN | TH_ACK)) { @@ -952,10 +939,9 @@ mptcp_do_mpcapable_opt(struct tcpcb *tp, u_char *cp, struct tcphdr *th, /* A SYN/ACK contains peer's key and flags */ if (optlen != sizeof(struct mptcp_mpcapable_opt_rsp)) { /* complain */ - mptcplog((LOG_ERR, "%s: SYN_ACK optlen = %d, sizeof mp opt = %lu \n", - __func__, optlen, - sizeof(struct mptcp_mpcapable_opt_rsp)), - MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR); + os_log_error(mptcp_log_handle, "%s - %lx: SYN_ACK optlen = %d, sizeof mp opt = %lu \n", + __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), optlen, + sizeof(struct mptcp_mpcapable_opt_rsp)); tcpstat.tcps_invalid_mpcap++; return; } @@ -969,6 +955,11 @@ mptcp_do_mpcapable_opt(struct tcpcb *tp, u_char *cp, struct tcphdr *th, mp_tp->mpt_flags |= MPTCPF_CHECKSUM; } + if (((struct mptcp_mpcapable_opt_common *)cp)->mmco_flags & + MPCAP_UNICAST_IPBIT) { + mpte->mpte_flags |= MPTE_UNICAST_IP; + } + rsp = (struct mptcp_mpcapable_opt_rsp *)cp; mp_tp->mpt_remotekey = rsp->mmc_localkey; /* For now just downgrade to the peer's version */ @@ -990,7 +981,6 @@ static void mptcp_do_mpjoin_opt(struct tcpcb *tp, u_char *cp, struct tcphdr *th, int optlen) { #define MPTCP_JOPT_ERROR_PATH(tp) { \ - tp->t_mpflags |= TMPF_RESET; \ tcpstat.tcps_invalid_joins++; \ if (tp->t_inpcb->inp_socket != NULL) { \ soevent(tp->t_inpcb->inp_socket, \ @@ -1007,10 +997,9 @@ mptcp_do_mpjoin_opt(struct tcpcb *tp, u_char *cp, struct tcphdr *th, int optlen) } if (optlen != sizeof(struct mptcp_mpjoin_opt_rsp)) { - mptcplog((LOG_ERR, "%s: SYN_ACK: unexpected optlen = %d mp " - "option = %lu\n", __func__, optlen, - sizeof(struct mptcp_mpjoin_opt_rsp)), - MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR); + os_log_error(mptcp_log_handle, "%s - %lx: SYN_ACK: unexpected optlen = %d mp option = %lu\n", + __func__, (unsigned long)VM_KERNEL_ADDRPERM(tptomptp(tp)->mpt_mpte), + optlen, sizeof(struct mptcp_mpjoin_opt_rsp)); tp->t_mpflags &= ~TMPF_PREESTABLISHED; /* send RST and close */ MPTCP_JOPT_ERROR_PATH(tp); @@ -1022,8 +1011,9 @@ mptcp_do_mpjoin_opt(struct tcpcb *tp, u_char *cp, struct tcphdr *th, int optlen) error = mptcp_validate_join_hmac(tp, (u_char*)&join_rsp->mmjo_mac, SHA1_TRUNCATED); if (error) { - mptcplog((LOG_ERR, "%s: SYN_ACK error = %d \n", __func__, error), - MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR); + os_log_error(mptcp_log_handle, "%s - %lx: SYN_ACK error = %d \n", + __func__, (unsigned long)VM_KERNEL_ADDRPERM(tptomptp(tp)->mpt_mpte), + error); tp->t_mpflags &= ~TMPF_PREESTABLISHED; /* send RST and close */ MPTCP_JOPT_ERROR_PATH(tp); @@ -1039,8 +1029,6 @@ mptcp_validate_join_hmac(struct tcpcb *tp, u_char* hmac, int mac_len) struct mptcb *mp_tp = tptomptp(tp); u_int32_t rem_rand, loc_rand; - mpte_lock_assert_held(mp_tp->mpt_mpte); - rem_rand = loc_rand = 0; mptcp_get_rands(tp->t_local_aid, mp_tp, &loc_rand, &rem_rand); @@ -1068,7 +1056,7 @@ mptcp_validate_join_hmac(struct tcpcb *tp, u_char* hmac, int mac_len) void mptcp_data_ack_rcvd(struct mptcb *mp_tp, struct tcpcb *tp, u_int64_t full_dack) { - u_int64_t acked = full_dack - mp_tp->mpt_snduna; + uint64_t acked = full_dack - mp_tp->mpt_snduna; if (acked) { struct socket *mp_so = mptetoso(mp_tp->mpt_mpte); @@ -1076,11 +1064,11 @@ mptcp_data_ack_rcvd(struct mptcb *mp_tp, struct tcpcb *tp, u_int64_t full_dack) if (acked > mp_so->so_snd.sb_cc) { if (acked > mp_so->so_snd.sb_cc + 1 || mp_tp->mpt_state < MPTCPS_FIN_WAIT_1) { - mptcplog((LOG_ERR, "%s: acked %u, sb_cc %u full %u suna %u state %u\n", - __func__, (uint32_t)acked, mp_so->so_snd.sb_cc, + os_log_error(mptcp_log_handle, "%s - %lx: acked %u, sb_cc %u full %u suna %u state %u\n", + __func__, (unsigned long)VM_KERNEL_ADDRPERM(mp_tp->mpt_mpte), + (uint32_t)acked, mp_so->so_snd.sb_cc, (uint32_t)full_dack, (uint32_t)mp_tp->mpt_snduna, - mp_tp->mpt_state), - MPTCP_RECEIVER_DBG, MPTCP_LOGLVL_ERR); + mp_tp->mpt_state); } sbdrop(&mp_so->so_snd, (int)mp_so->so_snd.sb_cc); @@ -1116,7 +1104,7 @@ mptcp_update_window_wakeup(struct tcpcb *tp) { struct mptcb *mp_tp = tptomptp(tp); - mpte_lock_assert_held(mp_tp->mpt_mpte); + socket_lock_assert_owned(mptetoso(mp_tp->mpt_mpte)); if (mp_tp->mpt_flags & MPTCPF_FALLBACK_TO_TCP) { mp_tp->mpt_sndwnd = tp->snd_wnd; @@ -1130,9 +1118,9 @@ mptcp_update_window_wakeup(struct tcpcb *tp) static void mptcp_update_window(struct mptcb *mp_tp, u_int64_t ack, u_int64_t seq, u_int32_t tiwin) { - if (SEQ_LT(mp_tp->mpt_sndwl1, seq) || + if (MPTCP_SEQ_LT(mp_tp->mpt_sndwl1, seq) || (mp_tp->mpt_sndwl1 == seq && - (SEQ_LT(mp_tp->mpt_sndwl2, ack) || + (MPTCP_SEQ_LT(mp_tp->mpt_sndwl2, ack) || (mp_tp->mpt_sndwl2 == ack && tiwin > mp_tp->mpt_sndwnd)))) { mp_tp->mpt_sndwnd = tiwin; mp_tp->mpt_sndwl1 = seq; @@ -1163,12 +1151,6 @@ mptcp_do_dss_opt_ack_meat(u_int64_t full_dack, u_int64_t full_dsn, if (close_notify) { mptcp_notify_close(tp->t_inpcb->inp_socket); } - } else { - os_log_error(mptcp_log_handle, - "%s: unexpected dack %u snduna %u sndmax %u\n", - __func__, (u_int32_t)full_dack, - (u_int32_t)mp_tp->mpt_snduna, - (u_int32_t)mp_tp->mpt_sndmax); } mptcp_update_window(mp_tp, full_dack, full_dsn, tiwin); @@ -1414,27 +1396,21 @@ mptcp_do_dss_opt_meat(u_char *cp, struct tcpcb *tp, struct tcphdr *th) } static void -mptcp_do_dss_opt(struct tcpcb *tp, u_char *cp, struct tcphdr *th, int optlen) +mptcp_do_dss_opt(struct tcpcb *tp, u_char *cp, struct tcphdr *th) { -#pragma unused(optlen) + struct mptcp_dss_copt *dss_rsp = (struct mptcp_dss_copt *)cp; struct mptcb *mp_tp = tptomptp(tp); if (!mp_tp) { return; } - /* We may get Data ACKs just during fallback, so don't ignore those */ - if ((tp->t_mpflags & TMPF_MPTCP_TRUE) || - (tp->t_mpflags & TMPF_TCP_FALLBACK)) { - struct mptcp_dss_copt *dss_rsp = (struct mptcp_dss_copt *)cp; - - if (dss_rsp->mdss_subtype == MPO_DSS) { - if (dss_rsp->mdss_flags & MDSS_F) { - tp->t_rcv_map.mpt_dfin = 1; - } - - mptcp_do_dss_opt_meat(cp, tp, th); + if (dss_rsp->mdss_subtype == MPO_DSS) { + if (dss_rsp->mdss_flags & MDSS_F) { + tp->t_rcv_map.mpt_dfin = 1; } + + mptcp_do_dss_opt_meat(cp, tp, th); } } @@ -1473,7 +1449,7 @@ mptcp_do_fastclose_opt(struct tcpcb *tp, u_char *cp, struct tcphdr *th) } /* Reset this flow */ - tp->t_mpflags |= (TMPF_RESET | TMPF_FASTCLOSERCV); + tp->t_mpflags |= TMPF_FASTCLOSERCV; if (tp->t_inpcb->inp_socket != NULL) { soevent(tp->t_inpcb->inp_socket, @@ -1485,9 +1461,9 @@ mptcp_do_fastclose_opt(struct tcpcb *tp, u_char *cp, struct tcphdr *th) static void mptcp_do_mpfail_opt(struct tcpcb *tp, u_char *cp, struct tcphdr *th) { - struct mptcb *mp_tp = NULL; struct mptcp_mpfail_opt *fail_opt = (struct mptcp_mpfail_opt *)cp; u_int32_t mdss_subflow_seqn = 0; + struct mptcb *mp_tp; int error = 0; /* @@ -1521,6 +1497,96 @@ mptcp_do_mpfail_opt(struct tcpcb *tp, u_char *cp, struct tcphdr *th) mptcp_notify_mpfail(tp->t_inpcb->inp_socket); } +static void +mptcp_do_add_addr_opt(struct mptses *mpte, u_char *cp) +{ + struct mptcp_add_addr_opt *addr_opt = (struct mptcp_add_addr_opt *)cp; + + if (addr_opt->maddr_len != MPTCP_ADD_ADDR_OPT_LEN_V4 && + addr_opt->maddr_len != MPTCP_ADD_ADDR_OPT_LEN_V6) { + os_log_info(mptcp_log_handle, "%s - %lx: Wrong ADD_ADDR length %u\n", + __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), + addr_opt->maddr_len); + + return; + } + + if (addr_opt->maddr_len == MPTCP_ADD_ADDR_OPT_LEN_V4 && + addr_opt->maddr_ipversion != 4) { + os_log_info(mptcp_log_handle, "%s - %lx: ADD_ADDR length for v4 but version is %u\n", + __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), + addr_opt->maddr_ipversion); + + return; + } + + if (addr_opt->maddr_len == MPTCP_ADD_ADDR_OPT_LEN_V6 && + addr_opt->maddr_ipversion != 6) { + os_log_info(mptcp_log_handle, "%s - %lx: ADD_ADDR length for v6 but version is %u\n", + __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), + addr_opt->maddr_ipversion); + + return; + } + + if (addr_opt->maddr_len == MPTCP_ADD_ADDR_OPT_LEN_V4) { + struct sockaddr_in *dst = &mpte->mpte_dst_unicast_v4; + struct in_addr *addr = &addr_opt->maddr_u.maddr_addrv4; + in_addr_t haddr = ntohl(addr->s_addr); + + if (IN_ZERONET(haddr) || + IN_LOOPBACK(haddr) || + IN_LINKLOCAL(haddr) || + IN_DS_LITE(haddr) || + IN_6TO4_RELAY_ANYCAST(haddr) || + IN_MULTICAST(haddr) || + INADDR_BROADCAST == haddr || + IN_PRIVATE(haddr) || + IN_SHARED_ADDRESS_SPACE(haddr)) { + os_log_info(mptcp_log_handle, "%s - %lx: ADD_ADDR invalid addr: %x\n", + __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), + addr->s_addr); + + return; + } + + dst->sin_len = sizeof(*dst); + dst->sin_family = AF_INET; + dst->sin_port = mpte->__mpte_dst_v4.sin_port; + dst->sin_addr.s_addr = addr->s_addr; + } else { + struct sockaddr_in6 *dst = &mpte->mpte_dst_unicast_v6; + struct in6_addr *addr = &addr_opt->maddr_u.maddr_addrv6; + + if (IN6_IS_ADDR_LINKLOCAL(addr) || + IN6_IS_ADDR_MULTICAST(addr) || + IN6_IS_ADDR_UNSPECIFIED(addr) || + IN6_IS_ADDR_LOOPBACK(addr) || + IN6_IS_ADDR_V4COMPAT(addr) || + IN6_IS_ADDR_V4MAPPED(addr)) { + char dbuf[MAX_IPv6_STR_LEN]; + + inet_ntop(AF_INET6, &dst->sin6_addr, dbuf, sizeof(dbuf)); + os_log_info(mptcp_log_handle, "%s - %lx: ADD_ADDRv6 invalid addr: %s\n", + __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), + dbuf); + + return; + } + + dst->sin6_len = sizeof(*dst); + dst->sin6_family = AF_INET6; + dst->sin6_port = mpte->__mpte_dst_v6.sin6_port; + memcpy(&dst->sin6_addr, addr, sizeof(*addr)); + } + + os_log_info(mptcp_log_handle, "%s - %lx: Received ADD_ADDRv%u\n", + __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), + addr_opt->maddr_ipversion); + + mptcp_sched_create_subflows(mpte); +} + void tcp_do_mptcp_options(struct tcpcb *tp, u_char *cp, struct tcphdr *th, struct tcpopt *to, int optlen) @@ -1532,7 +1598,7 @@ tcp_do_mptcp_options(struct tcpcb *tp, u_char *cp, struct tcphdr *th, return; } - mpte_lock_assert_held(mp_tp->mpt_mpte); + socket_lock_assert_owned(mptetoso(mp_tp->mpt_mpte)); /* All MPTCP options have atleast 4 bytes */ if (optlen < 4) { @@ -1553,7 +1619,7 @@ tcp_do_mptcp_options(struct tcpcb *tp, u_char *cp, struct tcphdr *th, mptcp_do_mpjoin_opt(tp, cp, th, optlen); break; case MPO_DSS: - mptcp_do_dss_opt(tp, cp, th, optlen); + mptcp_do_dss_opt(tp, cp, th); break; case MPO_FASTCLOSE: mptcp_do_fastclose_opt(tp, cp, th); @@ -1561,7 +1627,9 @@ tcp_do_mptcp_options(struct tcpcb *tp, u_char *cp, struct tcphdr *th, case MPO_FAIL: mptcp_do_mpfail_opt(tp, cp, th); break; - case MPO_ADD_ADDR: /* fall through */ + case MPO_ADD_ADDR: + mptcp_do_add_addr_opt(mp_tp->mpt_mpte, cp); + break; case MPO_REMOVE_ADDR: /* fall through */ case MPO_PRIO: to->to_flags |= TOF_MPTCP; diff --git a/bsd/netinet/mptcp_opt.h b/bsd/netinet/mptcp_opt.h index 0a65d5651..5ca0e32e8 100644 --- a/bsd/netinet/mptcp_opt.h +++ b/bsd/netinet/mptcp_opt.h @@ -31,15 +31,6 @@ #ifdef BSD_KERNEL_PRIVATE -/* - * Try setting up an MPTCP connection by making atleast 3 attempts, - * that is 2 retransmissions - needed for Weak WiFi and long delay cellular. - * This number must be bumped higher when we are assured that middleboxes - * are not the reason for retries. Generally, on weak wifi and cold start - * cellular, more than 2 retries are necessary. - */ -#define MPTCP_CAPABLE_RETRIES (2) - __BEGIN_DECLS extern void mptcp_data_ack_rcvd(struct mptcb *mp_tp, struct tcpcb *tp, u_int64_t full_dack); extern void mptcp_update_window_wakeup(struct tcpcb *tp); diff --git a/bsd/netinet/mptcp_subr.c b/bsd/netinet/mptcp_subr.c index f21312da5..f7980b76c 100644 --- a/bsd/netinet/mptcp_subr.c +++ b/bsd/netinet/mptcp_subr.c @@ -117,8 +117,6 @@ static int mptcp_subflow_soreceive(struct socket *, struct sockaddr **, struct uio *, struct mbuf **, struct mbuf **, int *); static int mptcp_subflow_sosend(struct socket *, struct sockaddr *, struct uio *, struct mbuf *, struct mbuf *, int); -static void mptcp_subflow_rupcall(struct socket *, void *, int); -static void mptcp_subflow_input(struct mptses *, struct mptsub *); static void mptcp_subflow_wupcall(struct socket *, void *, int); static void mptcp_subflow_eupcall1(struct socket *, void *, uint32_t); static void mptcp_update_last_owner(struct socket *so, struct socket *mp_so); @@ -127,6 +125,9 @@ static void mptcp_drop_tfo_data(struct mptses *, struct mptsub *); static void mptcp_subflow_abort(struct mptsub *, int); static void mptcp_send_dfin(struct socket *so); +static void mptcp_set_cellicon(struct mptses *mpte, struct mptsub *mpts); +static void mptcp_unset_cellicon(struct mptses *mpte, struct mptsub *mpts, long val); +static int mptcp_freeq(struct mptcb *mp_tp); /* * Possible return values for subflow event handlers. Note that success @@ -142,7 +143,6 @@ typedef enum { MPTS_EVRET_DISCONNECT_FALLBACK = 4, /* abort all but preferred */ } ev_ret_t; -static ev_ret_t mptcp_subflow_events(struct mptses *, struct mptsub *, uint64_t *); static ev_ret_t mptcp_subflow_propagate_ev(struct mptses *, struct mptsub *, uint64_t *, uint64_t); static ev_ret_t mptcp_subflow_nosrcaddr_ev(struct mptses *, struct mptsub *, uint64_t *, uint64_t); static ev_ret_t mptcp_subflow_failover_ev(struct mptses *, struct mptsub *, uint64_t *, uint64_t); @@ -152,11 +152,10 @@ static ev_ret_t mptcp_subflow_disconnected_ev(struct mptses *, struct mptsub *, static ev_ret_t mptcp_subflow_mpstatus_ev(struct mptses *, struct mptsub *, uint64_t *, uint64_t); static ev_ret_t mptcp_subflow_mustrst_ev(struct mptses *, struct mptsub *, uint64_t *, uint64_t); static ev_ret_t mptcp_subflow_mpcantrcvmore_ev(struct mptses *, struct mptsub *, uint64_t *, uint64_t); +static ev_ret_t mptcp_subflow_mpsuberror_ev(struct mptses *, struct mptsub *, uint64_t *, uint64_t); static ev_ret_t mptcp_subflow_adaptive_rtimo_ev(struct mptses *, struct mptsub *, uint64_t *, uint64_t); static ev_ret_t mptcp_subflow_adaptive_wtimo_ev(struct mptses *, struct mptsub *, uint64_t *, uint64_t); -static const char *mptcp_evret2str(ev_ret_t); - static void mptcp_do_sha1(mptcp_key_t *, char *); static void mptcp_init_local_parms(struct mptses *); @@ -171,9 +170,6 @@ static struct zone *mpt_subauth_zone; /* zone of subf auth entry */ struct mppcbinfo mtcbinfo; -#define MPTCP_SUBFLOW_WRITELEN (8 * 1024) /* bytes to write each time */ -#define MPTCP_SUBFLOW_READLEN (8 * 1024) /* bytes to read each time */ - SYSCTL_DECL(_net_inet); SYSCTL_NODE(_net_inet, OID_AUTO, mptcp, CTLFLAG_RW | CTLFLAG_LOCKED, 0, "MPTCP"); @@ -212,8 +208,13 @@ typedef struct mptcp_subflow_event_entry { uint64_t event); } mptsub_ev_entry_t; -static uint8_t mptcp_cellicon_is_set; -static uint32_t mptcp_last_cellicon_set; +/* Using Symptoms Advisory to detect poor WiFi or poor Cell */ +static kern_ctl_ref mptcp_kern_ctrl_ref = NULL; +static uint32_t mptcp_kern_skt_inuse = 0; +static uint32_t mptcp_kern_skt_unit; +static symptoms_advisory_t mptcp_advisory; + +uint32_t mptcp_cellicon_refcount = 0; #define MPTCP_CELLICON_TOGGLE_RATE (5 * TCP_RETRANSHZ) /* Only toggle every 5 seconds */ /* @@ -221,6 +222,10 @@ static uint32_t mptcp_last_cellicon_set; * really important. Think twice before changing it. */ static mptsub_ev_entry_t mpsub_ev_entry_tbl[] = { + { + .sofilt_hint_mask = SO_FILT_HINT_MP_SUB_ERROR, + .sofilt_hint_ev_hdlr = mptcp_subflow_mpsuberror_ev, + }, { .sofilt_hint_mask = SO_FILT_HINT_MPCANTRCVMORE, .sofilt_hint_ev_hdlr = mptcp_subflow_mpcantrcvmore_ev, @@ -298,6 +303,8 @@ mptcp_init(struct protosw *pp, struct domain *dp) } mptcp_initialized = 1; + mptcp_advisory.sa_wifi_status = SYMPTOMS_ADVISORY_WIFI_OK; + /* * Since PF_MULTIPATH gets initialized after PF_INET/INET6, * we must be able to find IPPROTO_TCP entries for both. @@ -399,40 +406,51 @@ mptcp_init(struct protosw *pp, struct domain *dp) zone_change(mpt_subauth_zone, Z_CALLERACCT, FALSE); zone_change(mpt_subauth_zone, Z_EXPAND, TRUE); - mptcp_last_cellicon_set = tcp_now; - mptcp_log_handle = os_log_create("com.apple.xnu.net.mptcp", "mptcp"); } int -mptcp_get_statsindex(struct mptcp_itf_stats *stats, const struct mptsub *mpts) +mptcpstats_get_index_by_ifindex(struct mptcp_itf_stats *stats, int ifindex, boolean_t create) { - const struct ifnet *ifp = sotoinpcb(mpts->mpts_socket)->inp_last_outifp; - int i, index = -1; - if (ifp == NULL) { - mptcplog((LOG_ERR, "%s: no ifp on subflow\n", __func__), - MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR); - return -1; - } - for (i = 0; i < MPTCP_ITFSTATS_SIZE; i++) { - if (stats[i].ifindex == IFSCOPE_NONE) { + if (create && stats[i].ifindex == IFSCOPE_NONE) { if (index < 0) { index = i; } continue; } - if (stats[i].ifindex == ifp->if_index) { + if (stats[i].ifindex == ifindex) { index = i; return index; } } if (index != -1) { - stats[index].ifindex = ifp->if_index; + stats[index].ifindex = ifindex; + } + + return index; +} + +static int +mptcpstats_get_index(struct mptcp_itf_stats *stats, const struct mptsub *mpts) +{ + const struct ifnet *ifp = sotoinpcb(mpts->mpts_socket)->inp_last_outifp; + int index; + + if (ifp == NULL) { + os_log_error(mptcp_log_handle, "%s - %lx: no ifp on subflow, state %u flags %#x\n", + __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpts->mpts_mpte), + sototcpcb(mpts->mpts_socket)->t_state, mpts->mpts_flags); + return -1; + } + + index = mptcpstats_get_index_by_ifindex(stats, ifp->if_index, true); + + if (index != -1) { if (stats[index].is_expensive == 0) { stats[index].is_expensive = IFNET_IS_CELLULAR(ifp); } @@ -449,7 +467,7 @@ mptcpstats_inc_switch(struct mptses *mpte, const struct mptsub *mpts) tcpstat.tcps_mp_switches++; mpte->mpte_subflow_switches++; - index = mptcp_get_statsindex(mpte->mpte_itfstats, mpts); + index = mptcpstats_get_index(mpte->mpte_itfstats, mpts); if (index != -1) { mpte->mpte_itfstats[index].switches++; @@ -475,7 +493,7 @@ mptcp_flush_sopts(struct mptses *mpte) * Create an MPTCP session, called as a result of opening a MPTCP socket. */ int -mptcp_sescreate(struct mppcb *mpp) +mptcp_session_create(struct mppcb *mpp) { struct mppcbinfo *mppi; struct mptses *mpte; @@ -500,6 +518,8 @@ mptcp_sescreate(struct mppcb *mpp) mpte->mpte_associd = SAE_ASSOCID_ANY; mpte->mpte_connid_last = SAE_CONNID_ANY; + mptcp_init_urgency_timer(mpte); + mpte->mpte_itfinfo = &mpte->_mpte_itfinfo[0]; mpte->mpte_itfinfo_size = MPTE_ITFINFO_SIZE; @@ -507,6 +527,8 @@ mptcp_sescreate(struct mppcb *mpp) mpte->mpte_alternate_port = htons(mptcp_alternate_port); } + mpte->mpte_last_cellicon_set = tcp_now; + /* MPTCP Protocol Control Block */ bzero(mp_tp, sizeof(*mp_tp)); mp_tp->mpt_mpte = mpte; @@ -517,6 +539,36 @@ mptcp_sescreate(struct mppcb *mpp) return 0; } +struct sockaddr * +mptcp_get_session_dst(struct mptses *mpte, boolean_t ipv6, boolean_t ipv4) +{ + if (!(mpte->mpte_flags & MPTE_UNICAST_IP)) { + return &mpte->mpte_dst; + } + + if (ipv6 && mpte->mpte_dst_unicast_v6.sin6_family == AF_INET6) { + return (struct sockaddr *)&mpte->mpte_dst_unicast_v6; + } + + if (ipv4 && mpte->mpte_dst_unicast_v4.sin_family == AF_INET) { + return (struct sockaddr *)&mpte->mpte_dst_unicast_v4; + } + + /* The interface has neither IPv4 nor IPv6 routes. Give our best guess, + * meaning we prefer IPv6 over IPv4. + */ + if (mpte->mpte_dst_unicast_v6.sin6_family == AF_INET6) { + return (struct sockaddr *)&mpte->mpte_dst_unicast_v6; + } + + if (mpte->mpte_dst_unicast_v4.sin_family == AF_INET) { + return (struct sockaddr *)&mpte->mpte_dst_unicast_v4; + } + + /* We don't yet have a unicast IP */ + return NULL; +} + static void mptcpstats_get_bytes(struct mptses *mpte, boolean_t initial_cell, uint64_t *cellbytes, uint64_t *allbytes) @@ -537,12 +589,12 @@ mptcpstats_get_bytes(struct mptses *mpte, boolean_t initial_cell, if (initial_cell) { mycellbytes -= mpte->mpte_init_txbytes; - mycellbytes -= mpte->mpte_init_txbytes; + mycellbytes -= mpte->mpte_init_rxbytes; } if (mycellbytes < 0) { - mptcplog((LOG_ERR, "%s cellbytes is %d\n", __func__, mycellbytes), - MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR); + os_log_error(mptcp_log_handle, "%s - %lx: cellbytes is %lld\n", + __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mycellbytes); *cellbytes = 0; *allbytes = 0; } else { @@ -677,39 +729,27 @@ mptcpstats_session_wrapup(struct mptses *mpte) static void mptcp_session_destroy(struct mptses *mpte) { - struct mptcb *mp_tp; - - mpte_lock_assert_held(mpte); /* same as MP socket lock */ + struct mptcb *mp_tp = mpte->mpte_mptcb; - mp_tp = mpte->mpte_mptcb; VERIFY(mp_tp != NULL); + VERIFY(TAILQ_EMPTY(&mpte->mpte_subflows) && mpte->mpte_numflows == 0); mptcpstats_session_wrapup(mpte); - - mptcp_unset_cellicon(); - - /* - * MPTCP Multipath PCB Extension section - */ + mptcp_unset_cellicon(mpte, NULL, mpte->mpte_cellicon_increments); mptcp_flush_sopts(mpte); - VERIFY(TAILQ_EMPTY(&mpte->mpte_subflows) && mpte->mpte_numflows == 0); if (mpte->mpte_itfinfo_size > MPTE_ITFINFO_SIZE) { _FREE(mpte->mpte_itfinfo, M_TEMP); } - mpte->mpte_itfinfo = NULL; m_freem_list(mpte->mpte_reinjectq); - /* - * MPTCP Protocol Control Block section - */ - DTRACE_MPTCP2(session__destroy, struct mptses *, mpte, - struct mptcb *, mp_tp); + os_log(mptcp_log_handle, "%s - %lx: Destroying session\n", + __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte)); } -static boolean_t +boolean_t mptcp_ok_to_create_subflows(struct mptcb *mp_tp) { return mp_tp->mpt_state >= MPTCPS_ESTABLISHED && @@ -718,15 +758,16 @@ mptcp_ok_to_create_subflows(struct mptcb *mp_tp) } static int -mptcp_synthesize_nat64(struct in6_addr *addr, uint32_t len, struct in_addr *addrv4) +mptcp_synthesize_nat64(struct in6_addr *addr, uint32_t len, + const struct in_addr *addrv4) { static const struct in6_addr well_known_prefix = { .__u6_addr.__u6_addr8 = {0x00, 0x64, 0xff, 0x9b, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}, }; + const char *ptrv4 = (const char *)addrv4; char buf[MAX_IPv6_STR_LEN]; - char *ptrv4 = (char *)addrv4; char *ptr = (char *)addr; if (IN_ZERONET(ntohl(addrv4->s_addr)) || // 0.0.0.0/8 Source hosts on local network @@ -790,23 +831,44 @@ mptcp_trigger_cell_bringup(struct mptses *mpte) uuid_string_t uuidstr; int err; - mpte_unlock(mpte); + socket_unlock(mp_so, 0); err = necp_client_assert_bb_radio_manager(mpsotomppcb(mp_so)->necp_client_uuid, TRUE); - mpte_lock(mpte); + socket_lock(mp_so, 0); if (err == 0) { mpte->mpte_triggered_cell = 1; } uuid_unparse_upper(mpsotomppcb(mp_so)->necp_client_uuid, uuidstr); - os_log_info(mptcp_log_handle, "%s asked irat to bringup cell for uuid %s, err %d\n", - __func__, uuidstr, err); + os_log_info(mptcp_log_handle, "%s - %lx: asked irat to bringup cell for uuid %s, err %d\n", + __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), uuidstr, err); } else { - os_log_info(mptcp_log_handle, "%s UUID is already null\n", __func__); + os_log_info(mptcp_log_handle, "%s - %lx: UUID is already null\n", + __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte)); } } +static boolean_t +mptcp_subflow_disconnecting(struct mptsub *mpts) +{ + /* Split out in if-statements for readability. Compile should + * optimize that. + */ + if (mpts->mpts_socket->so_state & SS_ISDISCONNECTED) { + return true; + } + + if (mpts->mpts_flags & (MPTSF_DISCONNECTING | MPTSF_DISCONNECTED | MPTSF_CLOSE_REQD)) { + return true; + } + + if (sototcpcb(mpts->mpts_socket)->t_state == TCPS_CLOSED) { + return true; + } + + return false; +} void mptcp_check_subflows_and_add(struct mptses *mpte) @@ -817,24 +879,36 @@ mptcp_check_subflows_and_add(struct mptses *mpte) uint32_t i; if (!mptcp_ok_to_create_subflows(mp_tp)) { + os_log_debug(mptcp_log_handle, "%s - %lx: not a good time for subflows, state %u flags %#x", + __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mp_tp->mpt_state, mp_tp->mpt_flags); + return; + } + + if (mptcp_get_session_dst(mpte, false, false) == NULL) { return; } for (i = 0; i < mpte->mpte_itfinfo_size; i++) { + boolean_t need_to_ask_symptoms = FALSE, found = FALSE; struct mpt_itf_info *info; + struct sockaddr_in6 nat64pre; + struct sockaddr *dst; struct mptsub *mpts; struct ifnet *ifp; uint32_t ifindex; - int found = 0; info = &mpte->mpte_itfinfo[i]; - if (info->no_mptcp_support) { + ifindex = info->ifindex; + if (ifindex == IFSCOPE_NONE) { continue; } - ifindex = info->ifindex; - if (ifindex == IFSCOPE_NONE) { + os_log(mptcp_log_handle, "%s - %lx: itf %u no support %u hasv4 %u has v6 %u hasnat64 %u\n", + __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), info->ifindex, info->no_mptcp_support, + info->has_v4_conn, info->has_v6_conn, info->has_nat64_conn); + + if (info->no_mptcp_support) { continue; } @@ -852,11 +926,24 @@ mptcp_check_subflows_and_add(struct mptses *mpte) TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) { const struct ifnet *subifp = sotoinpcb(mpts->mpts_socket)->inp_last_outifp; + struct tcpcb *tp = sototcpcb(mpts->mpts_socket); if (subifp == NULL) { continue; } + /* + * If there is at least one functioning subflow on WiFi + * and we are checking for the cell interface, then + * we always need to ask symptoms for permission as + * cell is triggered even if WiFi is available. + */ + if (!IFNET_IS_CELLULAR(subifp) && + !mptcp_subflow_disconnecting(mpts) && + IFNET_IS_CELLULAR(ifp)) { + need_to_ask_symptoms = TRUE; + } + /* * In Handover mode, only create cell subflow if * 1. Wi-Fi Assist is active @@ -876,109 +963,140 @@ mptcp_check_subflows_and_add(struct mptses *mpte) */ if (mpte->mpte_svctype == MPTCP_SVCTYPE_HANDOVER && !IFNET_IS_CELLULAR(subifp) && - !(mpts->mpts_flags & (MPTSF_DISCONNECTING | MPTSF_DISCONNECTED | MPTSF_CLOSE_REQD)) && - (mptcp_is_wifi_unusable(mpte) == 0 || - (sototcpcb(mpts->mpts_socket)->t_rxtshift < mptcp_fail_thresh * 2 && - ((mpte->mpte_flags & MPTE_FIRSTPARTY) || mptetoso(mpte)->so_snd.sb_cc)))) { - os_log_debug(mptcp_log_handle, "%s handover, wifi state %d rxt %u first-party %u sb_cc %u ifindex %u this %u\n", - __func__, mptcp_is_wifi_unusable(mpte), - sototcpcb(mpts->mpts_socket)->t_rxtshift, + !mptcp_subflow_disconnecting(mpts) && + (mptcp_is_wifi_unusable_for_session(mpte) == 0 || + (tp->t_rxtshift < mptcp_fail_thresh * 2 && mptetoso(mpte)->so_snd.sb_cc))) { + os_log_debug(mptcp_log_handle, + "%s - %lx: handover, wifi state %d rxt %u first-party %u sb_cc %u ifindex %u this %u rtt %u rttvar %u rto %u\n", + __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), + mptcp_is_wifi_unusable_for_session(mpte), + tp->t_rxtshift, !!(mpte->mpte_flags & MPTE_FIRSTPARTY), mptetoso(mpte)->so_snd.sb_cc, - ifindex, subifp->if_index); - found = 1; + ifindex, subifp->if_index, + tp->t_srtt >> TCP_RTT_SHIFT, + tp->t_rttvar >> TCP_RTTVAR_SHIFT, + tp->t_rxtcur); + found = TRUE; /* We found a proper subflow on WiFi - no need for cell */ want_cellular = FALSE; break; + } else if (mpte->mpte_svctype == MPTCP_SVCTYPE_TARGET_BASED) { + uint64_t time_now = mach_continuous_time(); + + os_log(mptcp_log_handle, + "%s - %lx: target-based: %llu now %llu unusable? %d cell %u sostat %#x mpts_flags %#x tcp-state %u\n", + __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mpte->mpte_time_target, + time_now, mptcp_is_wifi_unusable_for_session(mpte), + IFNET_IS_CELLULAR(subifp), mpts->mpts_socket->so_state, + mpts->mpts_flags, sototcpcb(mpts->mpts_socket)->t_state); + + if (!IFNET_IS_CELLULAR(subifp) && + !mptcp_subflow_disconnecting(mpts) && + (mpte->mpte_time_target == 0 || + (int64_t)(mpte->mpte_time_target - time_now) > 0 || + !mptcp_is_wifi_unusable_for_session(mpte))) { + found = TRUE; + + want_cellular = FALSE; + break; + } } else { - os_log_debug(mptcp_log_handle, "%s svc %u cell %u flags %#x unusable %d rtx %u first %u sbcc %u\n", - __func__, mpte->mpte_svctype, IFNET_IS_CELLULAR(subifp), mpts->mpts_flags, - mptcp_is_wifi_unusable(mpte), sototcpcb(mpts->mpts_socket)->t_rxtshift, - !!(mpte->mpte_flags & MPTE_FIRSTPARTY), mptetoso(mpte)->so_snd.sb_cc); + os_log_debug(mptcp_log_handle, + "%s - %lx: svc %u cell %u flags %#x unusable %d rtx %u first %u sbcc %u rtt %u rttvar %u rto %u\n", + __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), + mpte->mpte_svctype, IFNET_IS_CELLULAR(subifp), mpts->mpts_flags, + mptcp_is_wifi_unusable_for_session(mpte), tp->t_rxtshift, + !!(mpte->mpte_flags & MPTE_FIRSTPARTY), mptetoso(mpte)->so_snd.sb_cc, + tp->t_srtt >> TCP_RTT_SHIFT, + tp->t_rttvar >> TCP_RTTVAR_SHIFT, + tp->t_rxtcur); } if (subifp->if_index == ifindex && - !(mpts->mpts_socket->so_state & SS_ISDISCONNECTED) && - sototcpcb(mpts->mpts_socket)->t_state != TCPS_CLOSED) { + !mptcp_subflow_disconnecting(mpts)) { /* * We found a subflow on this interface. * No need to create a new one. */ - found = 1; + found = TRUE; break; } } - if (!found && !(mpte->mpte_flags & MPTE_FIRSTPARTY) && + if (found) { + continue; + } + + if (need_to_ask_symptoms && + !(mpte->mpte_flags & MPTE_FIRSTPARTY) && !(mpte->mpte_flags & MPTE_ACCESS_GRANTED) && mptcp_developer_mode == 0) { mptcp_ask_symptoms(mpte); return; } - if (!found) { - struct sockaddr *dst = &mpte->mpte_dst; - struct sockaddr_in6 nat64pre; - - if (mpte->mpte_dst.sa_family == AF_INET && - !info->has_v4_conn && info->has_nat64_conn) { - struct ipv6_prefix nat64prefixes[NAT64_MAX_NUM_PREFIXES]; - int error, j; - - bzero(&nat64pre, sizeof(struct sockaddr_in6)); + dst = mptcp_get_session_dst(mpte, info->has_v6_conn, info->has_v4_conn); - error = ifnet_get_nat64prefix(ifp, nat64prefixes); - if (error) { - os_log_error(mptcp_log_handle, "%s: no NAT64-prefix on itf %s, error %d\n", - __func__, ifp->if_name, error); - continue; - } + if (dst->sa_family == AF_INET && + !info->has_v4_conn && info->has_nat64_conn) { + struct ipv6_prefix nat64prefixes[NAT64_MAX_NUM_PREFIXES]; + int error, j; - for (j = 0; j < NAT64_MAX_NUM_PREFIXES; j++) { - if (nat64prefixes[j].prefix_len != 0) { - break; - } - } + bzero(&nat64pre, sizeof(struct sockaddr_in6)); - VERIFY(j < NAT64_MAX_NUM_PREFIXES); + error = ifnet_get_nat64prefix(ifp, nat64prefixes); + if (error) { + os_log_error(mptcp_log_handle, "%s - %lx: no NAT64-prefix on itf %s, error %d\n", + __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), ifp->if_name, error); + continue; + } - error = mptcp_synthesize_nat64(&nat64prefixes[j].ipv6_prefix, - nat64prefixes[j].prefix_len, - &mpte->__mpte_dst_v4.sin_addr); - if (error != 0) { - os_log_info(mptcp_log_handle, "%s: cannot synthesize this addr\n", - __func__); - continue; + for (j = 0; j < NAT64_MAX_NUM_PREFIXES; j++) { + if (nat64prefixes[j].prefix_len != 0) { + break; } - - memcpy(&nat64pre.sin6_addr, - &nat64prefixes[j].ipv6_prefix, - sizeof(nat64pre.sin6_addr)); - nat64pre.sin6_len = sizeof(struct sockaddr_in6); - nat64pre.sin6_family = AF_INET6; - nat64pre.sin6_port = mpte->__mpte_dst_v6.sin6_port; - nat64pre.sin6_flowinfo = 0; - nat64pre.sin6_scope_id = 0; - - dst = (struct sockaddr *)&nat64pre; } - /* Initial subflow started on a NAT64'd address? */ - if (mpte->mpte_dst.sa_family == AF_INET6 && - mpte->mpte_dst_v4_nat64.sin_family == AF_INET) { - dst = (struct sockaddr *)&mpte->mpte_dst_v4_nat64; - } + VERIFY(j < NAT64_MAX_NUM_PREFIXES); - if (dst->sa_family == AF_INET && !info->has_v4_conn) { - continue; - } - if (dst->sa_family == AF_INET6 && !info->has_v6_conn) { + error = mptcp_synthesize_nat64(&nat64prefixes[j].ipv6_prefix, + nat64prefixes[j].prefix_len, + &((struct sockaddr_in *)(void *)dst)->sin_addr); + if (error != 0) { + os_log_info(mptcp_log_handle, "%s - %lx: cannot synthesize this addr\n", + __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte)); continue; } - mptcp_subflow_add(mpte, NULL, dst, ifindex, NULL); + memcpy(&nat64pre.sin6_addr, + &nat64prefixes[j].ipv6_prefix, + sizeof(nat64pre.sin6_addr)); + nat64pre.sin6_len = sizeof(struct sockaddr_in6); + nat64pre.sin6_family = AF_INET6; + nat64pre.sin6_port = ((struct sockaddr_in *)(void *)dst)->sin_port; + nat64pre.sin6_flowinfo = 0; + nat64pre.sin6_scope_id = 0; + + dst = (struct sockaddr *)&nat64pre; } + + /* Initial subflow started on a NAT64'd address? */ + if (!(mpte->mpte_flags & MPTE_UNICAST_IP) && + mpte->mpte_dst.sa_family == AF_INET6 && + mpte->mpte_dst_v4_nat64.sin_family == AF_INET) { + dst = (struct sockaddr *)&mpte->mpte_dst_v4_nat64; + } + + if (dst->sa_family == AF_INET && !info->has_v4_conn) { + continue; + } + if (dst->sa_family == AF_INET6 && !info->has_v6_conn) { + continue; + } + + mptcp_subflow_add(mpte, NULL, dst, ifindex, NULL); } if (!cellular_viable && want_cellular) { @@ -987,21 +1105,56 @@ mptcp_check_subflows_and_add(struct mptses *mpte) } } -/* - * Based on the MPTCP Service-type and the state of the subflows, we - * will destroy subflows here. - */ static void -mptcp_check_subflows_and_remove(struct mptses *mpte) +mptcp_remove_cell_subflows(struct mptses *mpte) { struct mptsub *mpts, *tmpts; - int found_working_subflow = 0, removed_some = 0; - int wifi_unusable = mptcp_is_wifi_unusable(mpte); + boolean_t found = false; + + TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) { + const struct ifnet *ifp = sotoinpcb(mpts->mpts_socket)->inp_last_outifp; - if (mpte->mpte_svctype != MPTCP_SVCTYPE_HANDOVER) { + if (ifp == NULL || IFNET_IS_CELLULAR(ifp)) { + continue; + } + + /* We have a functioning subflow on WiFi. No need for cell! */ + if (mpts->mpts_flags & MPTSF_CONNECTED && + !mptcp_subflow_disconnecting(mpts)) { + found = true; + } + } + + /* Didn't found functional sub on WiFi - stay on cell */ + if (!found) { return; } + TAILQ_FOREACH_SAFE(mpts, &mpte->mpte_subflows, mpts_entry, tmpts) { + const struct ifnet *ifp = sotoinpcb(mpts->mpts_socket)->inp_last_outifp; + + /* Only remove cellular subflows */ + if (ifp == NULL || !IFNET_IS_CELLULAR(ifp)) { + continue; + } + + os_log(mptcp_log_handle, "%s - %lx: removing cell subflow\n", + __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte)); + + soevent(mpts->mpts_socket, SO_FILT_HINT_LOCKED | SO_FILT_HINT_MUSTRST); + } + + return; +} + +/* Returns true if it removed a subflow on cell */ +static void +mptcp_handover_subflows_remove(struct mptses *mpte) +{ + int wifi_unusable = mptcp_is_wifi_unusable_for_session(mpte); + boolean_t found_working_subflow = false; + struct mptsub *mpts; + /* * Look for a subflow that is on a non-cellular interface * and actually works (aka, no retransmission timeout). @@ -1023,14 +1176,17 @@ mptcp_check_subflows_and_remove(struct mptses *mpte) continue; } + os_log_debug(mptcp_log_handle, "%s - %lx: rxt %u sb_cc %u unusable %d\n", + __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), tp->t_rxtshift, mptetoso(mpte)->so_snd.sb_cc, wifi_unusable); + /* Is this subflow in good condition? */ - if (tp->t_rxtshift == 0) { - found_working_subflow = 1; + if (tp->t_rxtshift == 0 && mptetoso(mpte)->so_snd.sb_cc) { + found_working_subflow = true; } /* Or WiFi is fine */ if (!wifi_unusable) { - found_working_subflow = 1; + found_working_subflow = true; } } @@ -1042,20 +1198,43 @@ mptcp_check_subflows_and_remove(struct mptses *mpte) return; } - TAILQ_FOREACH_SAFE(mpts, &mpte->mpte_subflows, mpts_entry, tmpts) { - const struct ifnet *ifp = sotoinpcb(mpts->mpts_socket)->inp_last_outifp; + mptcp_remove_cell_subflows(mpte); +} - /* Only remove cellular subflows */ - if (ifp == NULL || !IFNET_IS_CELLULAR(ifp)) { - continue; - } +static void +mptcp_targetbased_subflows_remove(struct mptses *mpte) +{ + uint64_t time_now = mach_continuous_time(); - soevent(mpts->mpts_socket, SO_FILT_HINT_LOCKED | SO_FILT_HINT_MUSTRST); - removed_some = 1; + if (mpte->mpte_time_target != 0 && + (int64_t)(mpte->mpte_time_target - time_now) <= 0 && + mptcp_is_wifi_unusable_for_session(mpte)) { + /* WiFi is bad and we are below the target - don't remove any subflows */ + return; } - if (removed_some) { - mptcp_unset_cellicon(); + mptcp_remove_cell_subflows(mpte); +} + +/* + * Based on the MPTCP Service-type and the state of the subflows, we + * will destroy subflows here. + */ +void +mptcp_check_subflows_and_remove(struct mptses *mpte) +{ + if (!mptcp_ok_to_create_subflows(mpte->mpte_mptcb)) { + return; + } + + socket_lock_assert_owned(mptetoso(mpte)); + + if (mpte->mpte_svctype == MPTCP_SVCTYPE_HANDOVER) { + mptcp_handover_subflows_remove(mpte); + } + + if (mpte->mpte_svctype == MPTCP_SVCTYPE_TARGET_BASED) { + mptcp_targetbased_subflows_remove(mpte); } } @@ -1064,10 +1243,63 @@ mptcp_remove_subflows(struct mptses *mpte) { struct mptsub *mpts, *tmpts; + if (!mptcp_ok_to_create_subflows(mpte->mpte_mptcb)) { + return; + } + TAILQ_FOREACH_SAFE(mpts, &mpte->mpte_subflows, mpts_entry, tmpts) { + const struct ifnet *ifp = sotoinpcb(mpts->mpts_socket)->inp_last_outifp; + boolean_t found = false; + uint32_t ifindex; + uint32_t i; + if (mpts->mpts_flags & MPTSF_CLOSE_REQD) { mpts->mpts_flags &= ~MPTSF_CLOSE_REQD; + os_log(mptcp_log_handle, "%s - %lx: itf %u close_reqd last itf %d\n", + __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mpts->mpts_ifscope, + ifp ? ifp->if_index : -1); + soevent(mpts->mpts_socket, + SO_FILT_HINT_LOCKED | SO_FILT_HINT_NOSRCADDR); + + continue; + } + + if (ifp == NULL && mpts->mpts_ifscope == IFSCOPE_NONE) { + continue; + } + + if (ifp) { + ifindex = ifp->if_index; + } else { + ifindex = mpts->mpts_ifscope; + } + + for (i = 0; i < mpte->mpte_itfinfo_size; i++) { + if (mpte->mpte_itfinfo[i].ifindex == IFSCOPE_NONE) { + continue; + } + + if (mpte->mpte_itfinfo[i].ifindex == ifindex) { + if (mpts->mpts_dst.sa_family == AF_INET6 && + (mpte->mpte_itfinfo[i].has_v6_conn || mpte->mpte_itfinfo[i].has_nat64_conn)) { + found = true; + break; + } + + if (mpts->mpts_dst.sa_family == AF_INET && + mpte->mpte_itfinfo[i].has_v4_conn) { + found = true; + break; + } + } + } + + if (!found) { + os_log(mptcp_log_handle, "%s - %lx: itf %u killing %#x\n", + __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), + ifindex, mpts->mpts_flags); + soevent(mpts->mpts_socket, SO_FILT_HINT_LOCKED | SO_FILT_HINT_NOSRCADDR); } @@ -1084,8 +1316,7 @@ mptcp_create_subflows(__unused void *arg) * while a new event comes in. */ if (OSTestAndClear(0x01, &mptcp_create_subflows_scheduled)) { - mptcplog((LOG_ERR, "%s: bit was already cleared!\n", __func__), - MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR); + os_log_error(mptcp_log_handle, "%s: bit was already cleared!\n", __func__); } /* Iterate over all MPTCP connections */ @@ -1093,27 +1324,23 @@ mptcp_create_subflows(__unused void *arg) lck_mtx_lock(&mtcbinfo.mppi_lock); TAILQ_FOREACH(mpp, &mtcbinfo.mppi_pcbs, mpp_entry) { - struct mptses *mpte; - struct socket *mp_so; + struct socket *mp_so = mpp->mpp_socket; + struct mptses *mpte = mpp->mpp_pcbe; if (!(mpp->mpp_flags & MPP_CREATE_SUBFLOWS)) { continue; } - mpp_lock(mpp); + socket_lock(mp_so, 1); + VERIFY(mp_so->so_usecount > 0); mpp->mpp_flags &= ~MPP_CREATE_SUBFLOWS; - mpte = mpp->mpp_pcbe; - mp_so = mpp->mpp_socket; - - VERIFY(mp_so->so_usecount > 0); - mptcp_check_subflows_and_add(mpte); mptcp_remove_subflows(mpte); mp_so->so_usecount--; /* See mptcp_sched_create_subflows */ - mpp_unlock(mpp); + socket_unlock(mp_so, 1); } lck_mtx_unlock(&mtcbinfo.mppi_lock); @@ -1136,9 +1363,8 @@ mptcp_sched_create_subflows(struct mptses *mpte) struct socket *mp_so = mpp->mpp_socket; if (!mptcp_ok_to_create_subflows(mp_tp)) { - mptcplog((LOG_DEBUG, "%s: not a good time for subflows, state %u flags %#x", - __func__, mp_tp->mpt_state, mp_tp->mpt_flags), - MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE); + os_log_debug(mptcp_log_handle, "%s - %lx: not a good time for subflows, state %u flags %#x", + __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mp_tp->mpt_state, mp_tp->mpt_flags); return; } @@ -1189,7 +1415,7 @@ mptcp_sopt_free(struct mptopt *mpo) void mptcp_sopt_insert(struct mptses *mpte, struct mptopt *mpo) { - mpte_lock_assert_held(mpte); /* same as MP socket lock */ + socket_lock_assert_owned(mptetoso(mpte)); mpo->mpo_flags |= MPOF_ATTACHED; TAILQ_INSERT_TAIL(&mpte->mpte_sopts, mpo, mpo_entry); } @@ -1200,7 +1426,7 @@ mptcp_sopt_insert(struct mptses *mpte, struct mptopt *mpo) void mptcp_sopt_remove(struct mptses *mpte, struct mptopt *mpo) { - mpte_lock_assert_held(mpte); /* same as MP socket lock */ + socket_lock_assert_owned(mptetoso(mpte)); VERIFY(mpo->mpo_flags & MPOF_ATTACHED); mpo->mpo_flags &= ~MPOF_ATTACHED; TAILQ_REMOVE(&mpte->mpte_sopts, mpo, mpo_entry); @@ -1214,7 +1440,7 @@ mptcp_sopt_find(struct mptses *mpte, struct sockopt *sopt) { struct mptopt *mpo; - mpte_lock_assert_held(mpte); /* same as MP socket lock */ + socket_lock_assert_owned(mptetoso(mpte)); TAILQ_FOREACH(mpo, &mpte->mpte_sopts, mpo_entry) { if (mpo->mpo_level == sopt->sopt_level && @@ -1338,28 +1564,30 @@ mptcp_subflow_necp_cb(void *handle, __unused int action, * The socket is being garbage-collected. There is nothing to be done * here. */ - if (so->so_usecount == 0) { + if (in_pcb_checkstate(inp, WNT_ACQUIRE, 0) == WNT_STOPUSING) { return; } socket_lock(so, 1); /* Check again after we acquired the lock. */ - if (so->so_usecount == 0) { + if (in_pcb_checkstate(inp, WNT_RELEASE, 1) == WNT_STOPUSING) { goto out; } mpte = tptomptp(sototcpcb(so))->mpt_mpte; mpts = sototcpcb(so)->t_mpsub; - os_log_debug(mptcp_log_handle, "%s Subflow on itf %u became non-viable, power %u", - __func__, mpts->mpts_ifscope, low_power); + os_log_debug(mptcp_log_handle, "%s - %lx: Subflow on itf %u became non-viable, power %u", + __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mpts->mpts_ifscope, low_power); mpts->mpts_flags |= MPTSF_CLOSE_REQD; mptcp_sched_create_subflows(mpte); - if (mpte->mpte_svctype == MPTCP_SVCTYPE_HANDOVER && viable != NULL) { + if ((mpte->mpte_svctype == MPTCP_SVCTYPE_HANDOVER || + mpte->mpte_svctype == MPTCP_SVCTYPE_TARGET_BASED) && + viable != NULL) { *viable = 1; } @@ -1381,13 +1609,13 @@ mptcp_subflow_socreate(struct mptses *mpte, struct mptsub *mpts, int dom, int error; *so = NULL; - mpte_lock_assert_held(mpte); /* same as MP socket lock */ + mp_so = mptetoso(mpte); p = proc_find(mp_so->last_pid); if (p == PROC_NULL) { - mptcplog((LOG_ERR, "%s: Couldn't find proc for pid %u\n", __func__, mp_so->last_pid), - MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR); + os_log_error(mptcp_log_handle, "%s - %lx: Couldn't find proc for pid %u\n", + __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mp_so->last_pid); return ESRCH; } @@ -1405,14 +1633,13 @@ mptcp_subflow_socreate(struct mptses *mpte, struct mptsub *mpts, int dom, * Unlock, because tcp_usr_attach ends up in in_pcballoc, which takes * the ipi-lock. We cannot hold the socket-lock at that point. */ - mpte_unlock(mpte); + socket_unlock(mp_so, 0); error = socreate_internal(dom, so, SOCK_STREAM, IPPROTO_TCP, p, - SOCF_ASYNC, PROC_NULL); - mpte_lock(mpte); + SOCF_MPTCP, PROC_NULL); + socket_lock(mp_so, 0); if (error) { - mptcplog((LOG_ERR, "%s: subflow socreate mp_so 0x%llx unable to create subflow socket error %d\n", - __func__, (u_int64_t)VM_KERNEL_ADDRPERM(mp_so), error), - MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR); + os_log_error(mptcp_log_handle, "%s - %lx: unable to create subflow socket error %d\n", + __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), error); proc_rele(p); @@ -1469,25 +1696,52 @@ mptcp_subflow_socreate(struct mptses *mpte, struct mptsub *mpts, int dom, * then socket-locks) is no more respected. So, we need to * unlock here. */ - mpte_unlock(mpte); + socket_unlock(mp_so, 0); error = necp_client_register_socket_flow(mp_so->last_pid, mpsotomppcb(mp_so)->necp_client_uuid, sotoinpcb(*so)); - mpte_lock(mpte); + socket_lock(mp_so, 0); if (error) { + os_log_error(mptcp_log_handle, "%s - %lx: necp_client_register_socket_flow failed with error %d\n", + __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), error); + goto out_err; } /* Possible state-change during the unlock above */ if (mp_tp->mpt_state >= MPTCPS_TIME_WAIT || (mp_tp->mpt_flags & MPTCPF_FALLBACK_TO_TCP)) { + os_log_error(mptcp_log_handle, "%s - %lx: state changed during unlock: %u flags %#x\n", + __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), + mp_tp->mpt_state, mp_tp->mpt_flags); + + error = EINVAL; goto out_err; } uuid_copy(sotoinpcb(*so)->necp_client_uuid, mpsotomppcb(mp_so)->necp_client_uuid); - } else { - mptcplog((LOG_NOTICE, "%s: uuid is not set!\n"), - MPTCP_SOCKET_DBG, MPTCP_LOGLVL_LOG); + } + + /* Needs to happen prior to the delegation! */ + (*so)->last_pid = mp_so->last_pid; + + if (mp_so->so_flags & SOF_DELEGATED) { + if (mpte->mpte_epid) { + error = so_set_effective_pid(*so, mpte->mpte_epid, p, false); + if (error) { + os_log_error(mptcp_log_handle, "%s - %lx: so_set_effective_pid failed with error %d\n", + __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), error); + goto out_err; + } + } + if (!uuid_is_null(mpte->mpte_euuid)) { + error = so_set_effective_uuid(*so, mpte->mpte_euuid, p, false); + if (error) { + os_log_error(mptcp_log_handle, "%s - %lx: so_set_effective_uuid failed with error %d\n", + __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), error); + goto out_err; + } + } } /* inherit the other socket options */ @@ -1508,19 +1762,6 @@ mptcp_subflow_socreate(struct mptses *mpte, struct mptsub *mpts, int dom, goto out_err; } - /* enable keepalive */ - smpo.mpo_name = SO_KEEPALIVE; - if ((error = mptcp_subflow_sosetopt(mpte, mpts, &smpo)) != 0) { - goto out_err; - } - - smpo.mpo_level = IPPROTO_TCP; - smpo.mpo_intval = mptcp_subflow_keeptime; - smpo.mpo_name = TCP_KEEPALIVE; - if ((error = mptcp_subflow_sosetopt(mpte, mpts, &smpo)) != 0) { - goto out_err; - } - if (mpte->mpte_mptcb->mpt_state >= MPTCPS_ESTABLISHED) { /* * On secondary subflows we might need to set the cell-fallback @@ -1556,12 +1797,10 @@ mptcp_subflow_socreate(struct mptses *mpte, struct mptsub *mpts, int dom, interim = (mpo->mpo_flags & MPOF_INTERIM); if (mptcp_subflow_sosetopt(mpte, mpts, mpo) != 0 && interim) { - mptcplog((LOG_ERR, "%s: subflow socreate mp_so 0x%llx" - " sopt %s val %d interim record removed\n", __func__, - (u_int64_t)VM_KERNEL_ADDRPERM(mp_so), + os_log_error(mptcp_log_handle, "%s - %lx: sopt %s val %d interim record removed\n", + __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mptcp_sopt2str(mpo->mpo_level, mpo->mpo_name), - mpo->mpo_intval), - MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR); + mpo->mpo_intval); mptcp_sopt_remove(mpte, mpo); mptcp_sopt_free(mpo); continue; @@ -1599,9 +1838,6 @@ out_err: proc_rele(p); - mptcplog((LOG_ERR, "%s: subflow socreate failed with error %d\n", - __func__, error), MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR); - return error; } @@ -1681,13 +1917,13 @@ mptcp_subflow_soconnectx(struct mptses *mpte, struct mptsub *mpts) } os_log_info(mptcp_log_handle, - "%s: ifindex %u dst %s:%d pended %u\n", __func__, mpts->mpts_ifscope, - dbuf, dport, !!(mpts->mpts_flags & MPTSF_CONNECT_PENDING)); + "%s - %lx: ifindex %u dst %s:%d pended %u\n", __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), + mpts->mpts_ifscope, dbuf, dport, !!(mpts->mpts_flags & MPTSF_CONNECT_PENDING)); p = proc_find(mp_so->last_pid); if (p == PROC_NULL) { - mptcplog((LOG_ERR, "%s: Couldn't find proc for pid %u\n", __func__, mp_so->last_pid), - MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR); + os_log_error(mptcp_log_handle, "%s - %lx: Couldn't find proc for pid %u\n", + __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mp_so->last_pid); return ESRCH; } @@ -1720,14 +1956,65 @@ mptcp_subflow_soconnectx(struct mptses *mpte, struct mptsub *mpts) DTRACE_MPTCP3(subflow__connect, struct mptses *, mpte, struct mptsub *, mpts, int, error); if (error) { - mptcplog((LOG_ERR, "%s: connectx failed with error %d ifscope %u\n", - __func__, error, mpts->mpts_ifscope), - MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR); + os_log_error(mptcp_log_handle, "%s - %lx: connectx failed with error %d ifscope %u\n", + __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), error, mpts->mpts_ifscope); } return error; } +static int +mptcp_adj_rmap(struct socket *so, struct mbuf *m, int off, uint64_t dsn, + uint32_t rseq, uint16_t dlen) +{ + struct mptsub *mpts = sototcpcb(so)->t_mpsub; + + if (m_pktlen(m) == 0) { + return 0; + } + + if ((m->m_flags & M_PKTHDR) && (m->m_pkthdr.pkt_flags & PKTF_MPTCP)) { + if (off && (dsn != m->m_pkthdr.mp_dsn || + rseq != m->m_pkthdr.mp_rseq || + dlen != m->m_pkthdr.mp_rlen)) { + os_log_error(mptcp_log_handle, "%s - %lx: Received incorrect second mapping: %u - %u , %u - %u, %u - %u\n", + __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpts->mpts_mpte), + (uint32_t)dsn, (uint32_t)m->m_pkthdr.mp_dsn, + rseq, m->m_pkthdr.mp_rseq, + dlen, m->m_pkthdr.mp_rlen); + + soevent(mpts->mpts_socket, SO_FILT_HINT_LOCKED | SO_FILT_HINT_MUSTRST); + return -1; + } + m->m_pkthdr.mp_dsn += off; + m->m_pkthdr.mp_rseq += off; + m->m_pkthdr.mp_rlen = m->m_pkthdr.len; + } else { + if (!(mpts->mpts_flags & MPTSF_FULLY_ESTABLISHED)) { + /* data arrived without an DSS option mapping */ + + /* initial subflow can fallback right after SYN handshake */ + if (mpts->mpts_flags & MPTSF_INITIAL_SUB) { + mptcp_notify_mpfail(so); + } else { + soevent(mpts->mpts_socket, SO_FILT_HINT_LOCKED | SO_FILT_HINT_MUSTRST); + + return -1; + } + } else if (m->m_flags & M_PKTHDR) { + /* We need to fake the DATA-mapping */ + m->m_pkthdr.pkt_flags |= PKTF_MPTCP; + m->m_pkthdr.mp_dsn = dsn + off; + m->m_pkthdr.mp_rseq = rseq + off; + m->m_pkthdr.mp_rlen = m->m_pkthdr.len; + } + } + + mpts->mpts_flags |= MPTSF_FULLY_ESTABLISHED; + + return 0; +} + /* * MPTCP subflow socket receive routine, derived from soreceive(). */ @@ -1742,7 +2029,6 @@ mptcp_subflow_soreceive(struct socket *so, struct sockaddr **psa, struct mbuf *m, **mp = mp0; boolean_t proc_held = FALSE; - mpte_lock_assert_held(tptomptp(sototcpcb(so))->mpt_mpte); VERIFY(so->so_proto->pr_flags & PR_CONNREQUIRED); #ifdef MORE_LOCKING_DEBUG @@ -1892,7 +2178,11 @@ mptcp_subflow_soreceive(struct socket *so, struct sockaddr **psa, csum = m->m_pkthdr.mp_csum; } else { /* We did fallback */ - mptcp_adj_rmap(so, m, 0, 0, 0, 0); + if (mptcp_adj_rmap(so, m, 0, 0, 0, 0)) { + error = EIO; + *mp0 = NULL; + goto release; + } sbfree(&so->so_rcv, m); @@ -1937,7 +2227,6 @@ mptcp_subflow_soreceive(struct socket *so, struct sockaddr **psa, error = EIO; dlen = 0; *mp0 = NULL; - mptcp_subflow_abort(sototcpcb(so)->t_mpsub, ECONNABORTED); break; } @@ -2032,7 +2321,7 @@ mptcp_subflow_sosend(struct socket *so, struct sockaddr *addr, struct uio *uio, en_tracing = TRUE; en_tracing_val = top->m_pkthdr.len; KERNEL_ENERGYTRACE(kEnTrActKernSockWrite, DBG_FUNC_START, - VM_KERNEL_ADDRPERM(so), + (unsigned long)VM_KERNEL_ADDRPERM(so), ((so->so_state & SS_NBIO) ? kEnTrFlagNonBlocking : 0), (int64_t)en_tracing_val); } @@ -2076,7 +2365,7 @@ out: if (en_tracing) { KERNEL_ENERGYTRACE(kEnTrActKernSockWrite, DBG_FUNC_END, - VM_KERNEL_ADDRPERM(so), + (unsigned long)VM_KERNEL_ADDRPERM(so), ((error == EWOULDBLOCK) ? kEnTrFlagNoWork : 0), (int64_t)en_tracing_val); } @@ -2097,22 +2386,23 @@ mptcp_subflow_add(struct mptses *mpte, struct sockaddr *src, struct mptsub *mpts = NULL; int af, error = 0; - mpte_lock_assert_held(mpte); /* same as MP socket lock */ mp_so = mptetoso(mpte); mp_tp = mpte->mpte_mptcb; + socket_lock_assert_owned(mp_so); + if (mp_tp->mpt_state >= MPTCPS_CLOSE_WAIT) { /* If the remote end sends Data FIN, refuse subflow adds */ - mptcplog((LOG_ERR, "%s state %u\n", __func__, mp_tp->mpt_state), - MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR); + os_log_error(mptcp_log_handle, "%s - %lx: state %u\n", + __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mp_tp->mpt_state); error = ENOTCONN; goto out_err; } mpts = mptcp_subflow_alloc(); if (mpts == NULL) { - mptcplog((LOG_ERR, "%s malloc subflow failed\n", __func__), - MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR); + os_log_error(mptcp_log_handle, "%s - %lx: malloc subflow failed\n", + __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte)); error = ENOMEM; goto out_err; } @@ -2161,7 +2451,7 @@ mptcp_subflow_add(struct mptses *mpte, struct sockaddr *src, goto out_err; } - memcpy(&mpts->mpts_dst, dst, dst->sa_len); + memcpy(&mpts->mpts_u_dst, dst, dst->sa_len); af = mpts->mpts_dst.sa_family; @@ -2214,7 +2504,7 @@ mptcp_subflow_add(struct mptses *mpte, struct sockaddr *src, } /* register for subflow socket read/write events */ - sock_setupcalls_locked(so, mptcp_subflow_rupcall, mpts, mptcp_subflow_wupcall, mpts, 1); + sock_setupcalls_locked(so, NULL, NULL, mptcp_subflow_wupcall, mpts, 1); /* Register for subflow socket control events */ sock_catchevents_locked(so, mptcp_subflow_eupcall1, mpts, @@ -2224,7 +2514,7 @@ mptcp_subflow_add(struct mptses *mpte, struct sockaddr *src, SO_FILT_HINT_DISCONNECTED | SO_FILT_HINT_MPFAILOVER | SO_FILT_HINT_MPSTATUS | SO_FILT_HINT_MUSTRST | SO_FILT_HINT_MPCANTRCVMORE | SO_FILT_HINT_ADAPTIVE_RTIMO | - SO_FILT_HINT_ADAPTIVE_WTIMO); + SO_FILT_HINT_ADAPTIVE_WTIMO | SO_FILT_HINT_MP_SUB_ERROR); /* sanity check */ VERIFY(!(mpts->mpts_flags & @@ -2256,25 +2546,6 @@ mptcp_subflow_add(struct mptses *mpte, struct sockaddr *src, mpts->mpts_flags |= MPTSF_CONNECTING; - if (af == AF_INET || af == AF_INET6) { - char dbuf[MAX_IPv6_STR_LEN]; - - mptcplog((LOG_DEBUG, "MPTCP Socket: %s " - "mp_so 0x%llx dst %s[%d] cid %d " - "[pending %s]\n", __func__, - (u_int64_t)VM_KERNEL_ADDRPERM(mp_so), - inet_ntop(af, ((af == AF_INET) ? - (void *)&SIN(&mpts->mpts_dst)->sin_addr.s_addr : - (void *)&SIN6(&mpts->mpts_dst)->sin6_addr), - dbuf, sizeof(dbuf)), ((af == AF_INET) ? - ntohs(SIN(&mpts->mpts_dst)->sin_port) : - ntohs(SIN6(&mpts->mpts_dst)->sin6_port)), - mpts->mpts_connid, - ((mpts->mpts_flags & MPTSF_CONNECT_PENDING) ? - "YES" : "NO")), - MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE); - } - /* connect right away if first attempt, or if join can be done now */ if (!(mpts->mpts_flags & MPTSF_CONNECT_PENDING)) { error = mptcp_subflow_soconnectx(mpte, mpts); @@ -2304,15 +2575,24 @@ out_err: } void -mptcpstats_update(struct mptcp_itf_stats *stats, struct mptsub *mpts) +mptcpstats_update(struct mptcp_itf_stats *stats, const struct mptsub *mpts) { - int index = mptcp_get_statsindex(stats, mpts); + int index = mptcpstats_get_index(stats, mpts); if (index != -1) { struct inpcb *inp = sotoinpcb(mpts->mpts_socket); stats[index].mpis_txbytes += inp->inp_stat->txbytes; stats[index].mpis_rxbytes += inp->inp_stat->rxbytes; + + stats[index].mpis_wifi_txbytes += inp->inp_wstat->txbytes; + stats[index].mpis_wifi_rxbytes += inp->inp_wstat->rxbytes; + + stats[index].mpis_wired_txbytes += inp->inp_Wstat->txbytes; + stats[index].mpis_wired_rxbytes += inp->inp_Wstat->rxbytes; + + stats[index].mpis_cell_txbytes += inp->inp_cstat->txbytes; + stats[index].mpis_cell_rxbytes += inp->inp_cstat->rxbytes; } } @@ -2328,19 +2608,16 @@ mptcp_subflow_del(struct mptses *mpte, struct mptsub *mpts) struct socket *so = mpts->mpts_socket; struct tcpcb *tp = sototcpcb(so); - mpte_lock_assert_held(mpte); /* same as MP socket lock */ + socket_lock_assert_owned(mp_so); VERIFY(mpts->mpts_mpte == mpte); VERIFY(mpts->mpts_flags & MPTSF_ATTACHED); VERIFY(mpte->mpte_numflows != 0); VERIFY(mp_so->so_usecount > 0); - mptcplog((LOG_DEBUG, "%s: mp_so 0x%llx [u=%d,r=%d] cid %d %x error %d\n", - __func__, (u_int64_t)VM_KERNEL_ADDRPERM(mp_so), - mp_so->so_usecount, mp_so->so_retaincnt, mpts->mpts_connid, - mpts->mpts_flags, mp_so->so_error), - MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE); - mptcpstats_update(mpte->mpte_itfstats, mpts); + + mptcp_unset_cellicon(mpte, mpts, 1); + mpte->mpte_init_rxbytes = sotoinpcb(so)->inp_stat->rxbytes; mpte->mpte_init_txbytes = sotoinpcb(so)->inp_stat->txbytes; @@ -2426,15 +2703,14 @@ mptcp_subflow_disconnect(struct mptses *mpte, struct mptsub *mpts) struct mptcb *mp_tp; int send_dfin = 0; - mpte_lock_assert_held(mpte); /* same as MP socket lock */ - - VERIFY(mpts->mpts_mpte == mpte); - VERIFY(mpts->mpts_socket != NULL); + socket_lock_assert_owned(mptetoso(mpte)); if (mpts->mpts_flags & (MPTSF_DISCONNECTING | MPTSF_DISCONNECTED)) { return; } + mptcp_unset_cellicon(mpte, mpts, 1); + mpts->mpts_flags |= MPTSF_DISCONNECTING; so = mpts->mpts_socket; @@ -2464,46 +2740,6 @@ mptcp_subflow_disconnect(struct mptses *mpte, struct mptsub *mpts) mptcp_subflow_eupcall1(so, mpts, SO_FILT_HINT_DISCONNECTED); } -/* - * Called when the associated subflow socket posted a read event. - */ -static void -mptcp_subflow_rupcall(struct socket *so, void *arg, int waitf) -{ -#pragma unused(so, waitf) - struct mptsub *mpts = arg, *tmpts; - struct mptses *mpte = mpts->mpts_mpte; - - VERIFY(mpte != NULL); - - if (mptcp_should_defer_upcall(mpte->mpte_mppcb)) { - if (!(mpte->mpte_mppcb->mpp_flags & MPP_RUPCALL)) { - mpte->mpte_mppcb->mpp_flags |= MPP_SHOULD_RWAKEUP; - } - return; - } - - mpte->mpte_mppcb->mpp_flags |= MPP_RUPCALL; - TAILQ_FOREACH_SAFE(mpts, &mpte->mpte_subflows, mpts_entry, tmpts) { - if (mpts->mpts_socket->so_usecount == 0) { - /* Will be removed soon by tcp_garbage_collect */ - continue; - } - - mptcp_subflow_addref(mpts); - mpts->mpts_socket->so_usecount++; - - mptcp_subflow_input(mpte, mpts); - - mptcp_subflow_remref(mpts); /* ours */ - - VERIFY(mpts->mpts_socket->so_usecount != 0); - mpts->mpts_socket->so_usecount--; - } - - mptcp_handle_deferred_upcalls(mpte->mpte_mppcb, MPP_RUPCALL); -} - /* * Subflow socket input. */ @@ -2529,9 +2765,8 @@ mptcp_subflow_input(struct mptses *mpte, struct mptsub *mpts) error = sock_receive_internal(so, NULL, &m, 0, NULL); if (error != 0 && error != EWOULDBLOCK) { - mptcplog((LOG_ERR, "%s: cid %d error %d\n", - __func__, mpts->mpts_connid, error), - MPTCP_RECEIVER_DBG, MPTCP_LOGLVL_ERR); + os_log_error(mptcp_log_handle, "%s - %lx: cid %d error %d\n", + __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mpts->mpts_connid, error); if (error == ENODATA) { /* * Don't ignore ENODATA so as to discover @@ -2558,11 +2793,17 @@ mptcp_subflow_input(struct mptses *mpte, struct mptsub *mpts) if (m != NULL) { if (IFNET_IS_CELLULAR(sotoinpcb(so)->inp_last_outifp)) { - mpte->mpte_mppcb->mpp_flags |= MPP_SET_CELLICON; + mptcp_set_cellicon(mpte, mpts); mpte->mpte_used_cell = 1; } else { - mpte->mpte_mppcb->mpp_flags |= MPP_UNSET_CELLICON; + /* + * If during the past MPTCP_CELLICON_TOGGLE_RATE seconds we didn't + * explicitly set the cellicon, then we unset it again. + */ + if (TSTMP_LT(mpte->mpte_last_cellicon_set + MPTCP_CELLICON_TOGGLE_RATE, tcp_now)) { + mptcp_unset_cellicon(mpte, NULL, 1); + } mpte->mpte_used_wifi = 1; } @@ -2570,18 +2811,55 @@ mptcp_subflow_input(struct mptses *mpte, struct mptsub *mpts) mptcp_input(mpte, m); } - /* notify protocol that we drained all the data */ - if (error == 0 && m != NULL && - (so->so_proto->pr_flags & PR_WANTRCVD) && so->so_pcb != NULL) { - (*so->so_proto->pr_usrreqs->pru_rcvd)(so, 0); - } - out: if (wakeup) { mpte->mpte_mppcb->mpp_flags |= MPP_SHOULD_RWAKEUP; } - mptcp_handle_deferred_upcalls(mpte->mpte_mppcb, MPP_INSIDE_INPUT); + mptcp_handle_deferred_upcalls(mpte->mpte_mppcb, MPP_INSIDE_INPUT); +} + +void +mptcp_handle_input(struct socket *so) +{ + struct mptsub *mpts, *tmpts; + struct mptses *mpte; + + if (!(so->so_flags & SOF_MP_SUBFLOW)) { + return; + } + + mpts = sototcpcb(so)->t_mpsub; + mpte = mpts->mpts_mpte; + + socket_lock_assert_owned(mptetoso(mpte)); + + if (mptcp_should_defer_upcall(mpte->mpte_mppcb)) { + if (!(mpte->mpte_mppcb->mpp_flags & MPP_INPUT_HANDLE)) { + mpte->mpte_mppcb->mpp_flags |= MPP_SHOULD_RWAKEUP; + } + return; + } + + mpte->mpte_mppcb->mpp_flags |= MPP_INPUT_HANDLE; + TAILQ_FOREACH_SAFE(mpts, &mpte->mpte_subflows, mpts_entry, tmpts) { + if (mpts->mpts_socket->so_usecount == 0) { + /* Will be removed soon by tcp_garbage_collect */ + continue; + } + + mptcp_subflow_addref(mpts); + mpts->mpts_socket->so_usecount++; + + mptcp_subflow_input(mpte, mpts); + + mptcp_subflow_remref(mpts); /* ours */ + + VERIFY(mpts->mpts_socket->so_usecount != 0); + mpts->mpts_socket->so_usecount--; + } + + mptcp_handle_deferred_upcalls(mpte->mpte_mppcb, MPP_INPUT_HANDLE); } /* @@ -2648,12 +2926,12 @@ mptcp_subflow_output(struct mptses *mpte, struct mptsub *mpts, int flags) uint16_t tot_sent = 0; boolean_t reinjected = FALSE; - mpte_lock_assert_held(mpte); - mp_so = mptetoso(mpte); so = mpts->mpts_socket; tp = sototcpcb(so); + socket_lock_assert_owned(mp_so); + VERIFY(!(mpte->mpte_mppcb->mpp_flags & MPP_INSIDE_OUTPUT)); mpte->mpte_mppcb->mpp_flags |= MPP_INSIDE_OUTPUT; @@ -2698,10 +2976,10 @@ mptcp_subflow_output(struct mptses *mpte, struct mptsub *mpts, int flags) } if (sb_mb == NULL) { - mptcplog((LOG_ERR, "%s: No data in MPTCP-sendbuffer! smax %u snxt %u suna %u state %u flags %#x\n", - __func__, (uint32_t)mp_tp->mpt_sndmax, (uint32_t)mp_tp->mpt_sndnxt, - (uint32_t)mp_tp->mpt_snduna, mp_tp->mpt_state, mp_so->so_flags1), - MPTCP_SENDER_DBG, MPTCP_LOGLVL_ERR); + os_log_error(mptcp_log_handle, "%s - %lx: No data in MPTCP-sendbuffer! smax %u snxt %u suna %u state %u flags %#x\n", + __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), + (uint32_t)mp_tp->mpt_sndmax, (uint32_t)mp_tp->mpt_sndnxt, + (uint32_t)mp_tp->mpt_snduna, mp_tp->mpt_state, mp_so->so_flags1); /* Fix it to prevent looping */ if (MPTCP_SEQ_LT(mp_tp->mpt_sndnxt, mp_tp->mpt_snduna)) { @@ -2723,11 +3001,10 @@ mptcp_subflow_output(struct mptses *mpte, struct mptsub *mpts, int flags) /* First, drop acknowledged data */ if (MPTCP_SEQ_LT(mpt_dsn, mp_tp->mpt_snduna)) { - mptcplog((LOG_ERR, "%s: dropping data, should have been done earlier " + os_log_error(mptcp_log_handle, "%s - %lx: dropping data, should have been done earlier " "dsn %u suna %u reinject? %u\n", - __func__, (uint32_t)mpt_dsn, - (uint32_t)mp_tp->mpt_snduna, !!mpte->mpte_reinjectq), - MPTCP_SENDER_DBG, MPTCP_LOGLVL_ERR); + __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), (uint32_t)mpt_dsn, + (uint32_t)mp_tp->mpt_snduna, !!mpte->mpte_reinjectq); if (mpte->mpte_reinjectq) { mptcp_clean_reinjectq(mpte); } else { @@ -2740,8 +3017,8 @@ mptcp_subflow_output(struct mptses *mpte, struct mptsub *mpts, int flags) /* Check again because of above sbdrop */ if (mp_so->so_snd.sb_mb == NULL && mpte->mpte_reinjectq == NULL) { - mptcplog((LOG_ERR, "%s send-buffer is empty\n", __func__), - MPTCP_SENDER_DBG, MPTCP_LOGLVL_ERR); + os_log_error(mptcp_log_handle, "%s - $%lx: send-buffer is empty\n", + __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte)); goto out; } @@ -2759,9 +3036,9 @@ mptcp_subflow_output(struct mptses *mpte, struct mptsub *mpts, int flags) sbdrop(&mp_so->so_snd, (int)len); wakeup = 1; - mptcplog((LOG_ERR, "%s: dropping data in degraded mode, should have been done earlier dsn %u sndnxt %u suna %u\n", - __func__, (uint32_t)mpt_dsn, (uint32_t)mp_tp->mpt_sndnxt, (uint32_t)mp_tp->mpt_snduna), - MPTCP_SENDER_DBG, MPTCP_LOGLVL_ERR); + os_log_error(mptcp_log_handle, "%s - %lx: dropping data in degraded mode, should have been done earlier dsn %u sndnxt %u suna %u\n", + __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), + (uint32_t)mpt_dsn, (uint32_t)mp_tp->mpt_sndnxt, (uint32_t)mp_tp->mpt_snduna); } } @@ -2787,8 +3064,8 @@ dont_reinject: sb_mb = mp_so->so_snd.sb_mb; } if (sb_mb == NULL) { - mptcplog((LOG_ERR, "%s send-buffer is still empty\n", __func__), - MPTCP_SENDER_DBG, MPTCP_LOGLVL_ERR); + os_log_error(mptcp_log_handle, "%s - %lx: send-buffer is still empty\n", __func__, + (unsigned long)VM_KERNEL_ADDRPERM(mpte)); goto out; } @@ -2821,10 +3098,9 @@ dont_reinject: off = mp_tp->mpt_sndnxt - mp_tp->mpt_snduna; sb_cc -= off; } else { - mptcplog((LOG_ERR, "%s this should not happen: sndnxt %u sndmax %u\n", - __func__, (uint32_t)mp_tp->mpt_sndnxt, - (uint32_t)mp_tp->mpt_sndmax), - MPTCP_SENDER_DBG, MPTCP_LOGLVL_ERR); + os_log_error(mptcp_log_handle, "%s - %lx: this should not happen: sndnxt %u sndmax %u\n", + __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), (uint32_t)mp_tp->mpt_sndnxt, + (uint32_t)mp_tp->mpt_sndmax); goto out; } @@ -2832,11 +3108,10 @@ dont_reinject: sb_cc = min(sb_cc, mptcp_subflow_cwnd_space(so)); if (sb_cc <= 0) { - mptcplog((LOG_ERR, "%s sb_cc is %d, mp_so->sb_cc %u, sndwnd %u,sndnxt %u sndmax %u cwnd %u\n", - __func__, sb_cc, mp_so->so_snd.sb_cc, mp_tp->mpt_sndwnd, + os_log_error(mptcp_log_handle, "%s - %lx: sb_cc is %d, mp_so->sb_cc %u, sndwnd %u,sndnxt %u sndmax %u cwnd %u\n", + __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), sb_cc, mp_so->so_snd.sb_cc, mp_tp->mpt_sndwnd, (uint32_t)mp_tp->mpt_sndnxt, (uint32_t)mp_tp->mpt_sndmax, - mptcp_subflow_cwnd_space(so)), - MPTCP_SENDER_DBG, MPTCP_LOGLVL_ERR); + mptcp_subflow_cwnd_space(so)); } sb_cc = min(sb_cc, UINT16_MAX); @@ -2877,10 +3152,9 @@ dont_reinject: mlen = min(mlen, sb_cc - tot_sent); if (mlen < 0) { - mptcplog((LOG_ERR, "%s mlen %d mp_rlen %u off %u sb_cc %u tot_sent %u\n", - __func__, (int)mlen, mpt_mbuf->m_pkthdr.mp_rlen, - (uint32_t)off, sb_cc, tot_sent), - MPTCP_SENDER_DBG, MPTCP_LOGLVL_ERR); + os_log_error(mptcp_log_handle, "%s - %lx: mlen %d mp_rlen %u off %u sb_cc %u tot_sent %u\n", + __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), (int)mlen, mpt_mbuf->m_pkthdr.mp_rlen, + (uint32_t)off, sb_cc, tot_sent); goto out; } @@ -2891,8 +3165,8 @@ dont_reinject: m = m_copym_mode(mpt_mbuf, (int)off, mlen, M_DONTWAIT, M_COPYM_MUST_COPY_HDR); if (m == NULL) { - mptcplog((LOG_ERR, "%s m_copym_mode failed\n", __func__), - MPTCP_SENDER_DBG, MPTCP_LOGLVL_ERR); + os_log_error(mptcp_log_handle, "%s - %lx: m_copym_mode failed\n", __func__, + (unsigned long)VM_KERNEL_ADDRPERM(mpte)); error = ENOBUFS; break; } @@ -3010,11 +3284,17 @@ done_sending: } if (IFNET_IS_CELLULAR(sotoinpcb(so)->inp_last_outifp)) { - mpte->mpte_mppcb->mpp_flags |= MPP_SET_CELLICON; + mptcp_set_cellicon(mpte, mpts); mpte->mpte_used_cell = 1; } else { - mpte->mpte_mppcb->mpp_flags |= MPP_UNSET_CELLICON; + /* + * If during the past MPTCP_CELLICON_TOGGLE_RATE seconds we didn't + * explicitly set the cellicon, then we unset it again. + */ + if (TSTMP_LT(mpte->mpte_last_cellicon_set + MPTCP_CELLICON_TOGGLE_RATE, tcp_now)) { + mptcp_unset_cellicon(mpte, NULL, 1); + } mpte->mpte_used_wifi = 1; } @@ -3025,9 +3305,8 @@ done_sending: */ error = 0; } else { - mptcplog((LOG_ERR, "%s: %u error %d len %d subflags %#x sostate %#x soerror %u hiwat %u lowat %u\n", - __func__, mpts->mpts_connid, error, tot_sent, so->so_flags, so->so_state, so->so_error, so->so_snd.sb_hiwat, so->so_snd.sb_lowat), - MPTCP_SENDER_DBG, MPTCP_LOGLVL_ERR); + os_log_error(mptcp_log_handle, "%s - %lx: %u error %d len %d subflags %#x sostate %#x soerror %u hiwat %u lowat %u\n", + __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mpts->mpts_connid, error, tot_sent, so->so_flags, so->so_state, so->so_error, so->so_snd.sb_hiwat, so->so_snd.sb_lowat); } out: @@ -3155,7 +3434,7 @@ mptcp_lookup_dsn(struct mptses *mpte, uint64_t dsn) } static struct mbuf * -mptcp_copy_mbuf_list(struct mbuf *m, int len) +mptcp_copy_mbuf_list(struct mptses *mpte, struct mbuf *m, int len) { struct mbuf *top = NULL, *tail = NULL; uint64_t dsn; @@ -3172,8 +3451,8 @@ mptcp_copy_mbuf_list(struct mbuf *m, int len) n = m_copym_mode(m, 0, m->m_len, M_DONTWAIT, M_COPYM_MUST_COPY_HDR); if (n == NULL) { - mptcplog((LOG_ERR, "%s m_copym_mode returned NULL\n", __func__), - MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR); + os_log_error(mptcp_log_handle, "%s - %lx: m_copym_mode returned NULL\n", + __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte)); goto err; } @@ -3251,7 +3530,7 @@ mptcp_reinject_mbufs(struct socket *so) } /* Copy the mbuf with headers (aka, DSN-numbers) */ - m = mptcp_copy_mbuf_list(m, m->m_pkthdr.mp_rlen); + m = mptcp_copy_mbuf_list(mpte, m, m->m_pkthdr.mp_rlen); if (m == NULL) { break; } @@ -3287,7 +3566,7 @@ mptcp_clean_reinjectq(struct mptses *mpte) { struct mptcb *mp_tp = mpte->mpte_mptcb; - mpte_lock_assert_held(mpte); + socket_lock_assert_owned(mptetoso(mpte)); while (mpte->mpte_reinjectq) { struct mbuf *m = mpte->mpte_reinjectq; @@ -3313,8 +3592,7 @@ mptcp_subflow_eupcall1(struct socket *so, void *arg, uint32_t events) struct mptsub *mpts = arg; struct mptses *mpte = mpts->mpts_mpte; - VERIFY(mpte != NULL); - mpte_lock_assert_held(mpte); + socket_lock_assert_owned(mptetoso(mpte)); if ((mpts->mpts_evctl & events) == events) { return; @@ -3343,8 +3621,6 @@ mptcp_subflow_events(struct mptses *mpte, struct mptsub *mpts, int i, mpsub_ev_entry_count = sizeof(mpsub_ev_entry_tbl) / sizeof(mpsub_ev_entry_tbl[0]); - mpte_lock_assert_held(mpte); /* same as MP socket lock */ - /* bail if there's nothing to process */ if (!mpts->mpts_evctl) { return ret; @@ -3388,10 +3664,10 @@ mptcp_subflow_events(struct mptses *mpte, struct mptsub *mpts, * so loudly complain if we have any unprocessed one(s). */ if (mpts->mpts_evctl || ret < MPTS_EVRET_OK) { - mptcplog((LOG_WARNING, "%s%s: cid %d evret %s (%d) unhandled events=%b\n", __func__, + mptcplog((LOG_WARNING, "%s%s: cid %d evret %d unhandled events=%b\n", __func__, (mpts->mpts_evctl && ret == MPTS_EVRET_OK) ? "MPTCP_ERROR " : "", mpts->mpts_connid, - mptcp_evret2str(ret), ret, mpts->mpts_evctl, SO_FILT_HINT_BITS), + ret, mpts->mpts_evctl, SO_FILT_HINT_BITS), MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG); } else { mptcplog((LOG_DEBUG, "%s: Done, events %b\n", __func__, @@ -3409,8 +3685,6 @@ mptcp_subflow_propagate_ev(struct mptses *mpte, struct mptsub *mpts, struct socket *mp_so, *so; struct mptcb *mp_tp; - mpte_lock_assert_held(mpte); /* same as MP socket lock */ - VERIFY(mpte->mpte_mppcb != NULL); mp_so = mptetoso(mpte); mp_tp = mpte->mpte_mptcb; so = mpts->mpts_socket; @@ -3424,6 +3698,7 @@ mptcp_subflow_propagate_ev(struct mptses *mpte, struct mptsub *mpts, * based on the state of the MPTCP connection. */ if (mp_tp->mpt_state < MPTCPS_ESTABLISHED || + (!(mp_tp->mpt_flags & MPTCPF_JOIN_READY) && !(mpts->mpts_flags & MPTSF_MP_READY)) || ((mp_tp->mpt_flags & MPTCPF_FALLBACK_TO_TCP) && (mpts->mpts_flags & MPTSF_ACTIVE))) { mp_so->so_error = so->so_error; *p_mpsofilt_hint |= event; @@ -3443,9 +3718,6 @@ mptcp_subflow_nosrcaddr_ev(struct mptses *mpte, struct mptsub *mpts, struct socket *mp_so; struct tcpcb *tp; - mpte_lock_assert_held(mpte); /* same as MP socket lock */ - - VERIFY(mpte->mpte_mppcb != NULL); mp_so = mptetoso(mpte); tp = intotcpcb(sotoinpcb(mpts->mpts_socket)); @@ -3471,6 +3743,31 @@ mptcp_subflow_nosrcaddr_ev(struct mptses *mpte, struct mptsub *mpts, return MPTS_EVRET_DELETE; } +static ev_ret_t +mptcp_subflow_mpsuberror_ev(struct mptses *mpte, struct mptsub *mpts, + uint64_t *p_mpsofilt_hint, uint64_t event) +{ +#pragma unused(event, p_mpsofilt_hint) + struct socket *so, *mp_so; + + so = mpts->mpts_socket; + + if (so->so_error != ENODATA) { + return MPTS_EVRET_OK; + } + + + mp_so = mptetoso(mpte); + + mp_so->so_error = ENODATA; + + sorwakeup(mp_so); + sowwakeup(mp_so); + + return MPTS_EVRET_OK; +} + + /* * Handle SO_FILT_HINT_MPCANTRCVMORE subflow socket event that * indicates that the remote side sent a Data FIN @@ -3480,10 +3777,7 @@ mptcp_subflow_mpcantrcvmore_ev(struct mptses *mpte, struct mptsub *mpts, uint64_t *p_mpsofilt_hint, uint64_t event) { #pragma unused(event) - struct mptcb *mp_tp; - - mpte_lock_assert_held(mpte); /* same as MP socket lock */ - mp_tp = mpte->mpte_mptcb; + struct mptcb *mp_tp = mpte->mpte_mptcb; mptcplog((LOG_DEBUG, "%s: cid %d\n", __func__, mpts->mpts_connid), MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG); @@ -3514,22 +3808,17 @@ mptcp_subflow_failover_ev(struct mptses *mpte, struct mptsub *mpts, struct socket *mp_so; int altpath_exists = 0; - mpte_lock_assert_held(mpte); mp_so = mptetoso(mpte); - mptcplog((LOG_NOTICE, "%s: mp_so 0x%llx\n", __func__, - (u_int64_t)VM_KERNEL_ADDRPERM(mp_so)), - MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG); + os_log_info(mptcp_log_handle, "%s - %lx\n", __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte)); mptcp_reinject_mbufs(mpts->mpts_socket); - mpts_alt = mptcp_get_subflow(mpte, mpts, NULL); - /* - * If there is no alternate eligible subflow, ignore the - * failover hint. - */ - if (mpts_alt == NULL) { - mptcplog((LOG_WARNING, "%s: no alternate path\n", __func__), - MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG); + mpts_alt = mptcp_get_subflow(mpte, NULL); + + /* If there is no alternate eligible subflow, ignore the failover hint. */ + if (mpts_alt == NULL || mpts_alt == mpts) { + os_log(mptcp_log_handle, "%s - %lx no alternate path\n", __func__, + (unsigned long)VM_KERNEL_ADDRPERM(mpte)); goto done; } @@ -3553,9 +3842,8 @@ mptcp_subflow_failover_ev(struct mptses *mpte, struct mptsub *mpts, mpts->mpts_flags |= MPTSF_FAILINGOVER; mpts->mpts_flags &= ~MPTSF_ACTIVE; - mptcplog((LOG_NOTICE, "%s: switched from %d to %d\n", - __func__, mpts->mpts_connid, mpts_alt->mpts_connid), - MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG); + os_log_info(mptcp_log_handle, "%s - %lx: switched from %d to %d\n", + __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mpts->mpts_connid, mpts_alt->mpts_connid); mptcpstats_inc_switch(mpte, mpts); @@ -3578,9 +3866,6 @@ static ev_ret_t mptcp_subflow_ifdenied_ev(struct mptses *mpte, struct mptsub *mpts, uint64_t *p_mpsofilt_hint, uint64_t event) { - mpte_lock_assert_held(mpte); /* same as MP socket lock */ - VERIFY(mpte->mpte_mppcb != NULL); - mptcplog((LOG_DEBUG, "%s: cid %d\n", __func__, mpts->mpts_connid), MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG); @@ -3653,6 +3938,13 @@ mptcp_handle_ipv6_connection(struct mptses *mpte, const struct mptsub *mpts) struct ifnet *ifp; int j; + /* Subflow IPs will be steered directly by the server - no need to + * desynthesize. + */ + if (mpte->mpte_flags & MPTE_UNICAST_IP) { + return; + } + ifp = sotoinpcb(so)->inp_last_outifp; if (ifnet_get_nat64prefix(ifp, nat64prefixes) == ENOENT) { @@ -3695,9 +3987,6 @@ mptcp_subflow_connected_ev(struct mptses *mpte, struct mptsub *mpts, int af; boolean_t mpok = FALSE; - mpte_lock_assert_held(mpte); /* same as MP socket lock */ - VERIFY(mpte->mpte_mppcb != NULL); - mp_so = mptetoso(mpte); mp_tp = mpte->mpte_mptcb; so = mpts->mpts_socket; @@ -3797,7 +4086,7 @@ mptcp_subflow_connected_ev(struct mptses *mpte, struct mptsub *mpts, mptcp_notify_mpfail(so); } else { if (IFNET_IS_CELLULAR(inp->inp_last_outifp) && - mpte->mpte_svctype != MPTCP_SVCTYPE_AGGREGATE) { + mpte->mpte_svctype < MPTCP_SVCTYPE_AGGREGATE) { tp->t_mpflags |= (TMPF_BACKUP_PATH | TMPF_SND_MPPRIO); } else { mpts->mpts_flags |= MPTSF_PREFERRED; @@ -3822,10 +4111,6 @@ mptcp_subflow_connected_ev(struct mptses *mpte, struct mptsub *mpts, mp_tp->mpt_sndwl1 = mp_tp->mpt_rcvnxt; mp_tp->mpt_sndwl2 = mp_tp->mpt_snduna; soisconnected(mp_so); - - mptcplog((LOG_DEBUG, "%s: MPTCPS_ESTABLISHED for mp_so 0x%llx mpok %u\n", - __func__, (u_int64_t)VM_KERNEL_ADDRPERM(mp_so), mpok), - MPTCP_STATE_DBG, MPTCP_LOGLVL_LOG); } else if (mpok) { /* * case (b) above @@ -3836,7 +4121,7 @@ mptcp_subflow_connected_ev(struct mptses *mpte, struct mptsub *mpts, */ if (IFNET_IS_CELLULAR(inp->inp_last_outifp) && !(tp->t_mpflags & TMPF_BACKUP_PATH) && - mpte->mpte_svctype != MPTCP_SVCTYPE_AGGREGATE) { + mpte->mpte_svctype < MPTCP_SVCTYPE_AGGREGATE) { tp->t_mpflags |= (TMPF_BACKUP_PATH | TMPF_SND_MPPRIO); mpts->mpts_flags &= ~MPTSF_PREFERRED; } else { @@ -3888,7 +4173,7 @@ mptcp_subflow_connected_ev(struct mptses *mpte, struct mptsub *mpts, } /* This call, just to "book" an entry in the stats-table for this ifindex */ - mptcp_get_statsindex(mpte->mpte_itfstats, mpts); + mptcpstats_get_index(mpte->mpte_itfstats, mpts); mptcp_output(mpte); @@ -3906,8 +4191,6 @@ mptcp_subflow_disconnected_ev(struct mptses *mpte, struct mptsub *mpts, struct socket *mp_so, *so; struct mptcb *mp_tp; - mpte_lock_assert_held(mpte); /* same as MP socket lock */ - VERIFY(mpte->mpte_mppcb != NULL); mp_so = mptetoso(mpte); mp_tp = mpte->mpte_mptcb; so = mpts->mpts_socket; @@ -3941,10 +4224,6 @@ mptcp_subflow_disconnected_ev(struct mptses *mpte, struct mptsub *mpts, mptcp_drop(mpte, mp_tp, so->so_error); } - if (sototcpcb(so)->t_mpflags & TMPF_FASTCLOSERCV) { - mptcp_drop(mpte, mp_tp, mp_so->so_error); - } - /* * Clear flags that are used by getconninfo to return state. * Retain like MPTSF_DELETEOK for internal purposes. @@ -3964,12 +4243,10 @@ mptcp_subflow_mpstatus_ev(struct mptses *mpte, struct mptsub *mpts, uint64_t *p_mpsofilt_hint, uint64_t event) { #pragma unused(event, p_mpsofilt_hint) + ev_ret_t ret = MPTS_EVRET_OK; struct socket *mp_so, *so; struct mptcb *mp_tp; - ev_ret_t ret = MPTS_EVRET_OK; - mpte_lock_assert_held(mpte); /* same as MP socket lock */ - VERIFY(mpte->mpte_mppcb != NULL); mp_so = mptetoso(mpte); mp_tp = mpte->mpte_mptcb; so = mpts->mpts_socket; @@ -4001,7 +4278,6 @@ mptcp_subflow_mpstatus_ev(struct mptses *mpte, struct mptsub *mpts, } if (mp_tp->mpt_flags & MPTCPF_FALLBACK_TO_TCP) { - VERIFY(!(mp_tp->mpt_flags & MPTCPF_JOIN_READY)); ret = MPTS_EVRET_DISCONNECT_FALLBACK; m_freem_list(mpte->mpte_reinjectq); @@ -4011,12 +4287,6 @@ mptcp_subflow_mpstatus_ev(struct mptses *mpte, struct mptsub *mpts, ret = MPTS_EVRET_CONNECT_PENDING; } - mptcplog((LOG_DEBUG, "%s: mp_so 0x%llx mpt_flags=%b cid %d mptsf=%b\n", - __func__, (u_int64_t)VM_KERNEL_ADDRPERM(mp_so), - mp_tp->mpt_flags, MPTCPF_BITS, mpts->mpts_connid, - mpts->mpts_flags, MPTSF_BITS), - MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG); - done: return ret; } @@ -4033,8 +4303,6 @@ mptcp_subflow_mustrst_ev(struct mptses *mpte, struct mptsub *mpts, struct mptcb *mp_tp; boolean_t is_fastclose; - mpte_lock_assert_held(mpte); /* same as MP socket lock */ - VERIFY(mpte->mpte_mppcb != NULL); mp_so = mptetoso(mpte); mp_tp = mpte->mpte_mptcb; so = mpts->mpts_socket; @@ -4049,6 +4317,8 @@ mptcp_subflow_mustrst_ev(struct mptses *mpte, struct mptsub *mpts, is_fastclose = !!(tp->t_mpflags & TMPF_FASTCLOSERCV); + tp->t_mpflags |= TMPF_RESET; + t_template = tcp_maketemplate(tp); if (t_template) { struct tcp_respond_args tra; @@ -4065,29 +4335,32 @@ mptcp_subflow_mustrst_ev(struct mptses *mpte, struct mptsub *mpts, &t_template->tt_t, (struct mbuf *)NULL, tp->rcv_nxt, tp->snd_una, TH_RST, &tra); (void) m_free(dtom(t_template)); - mptcplog((LOG_DEBUG, "MPTCP Events: " - "%s: mp_so 0x%llx cid %d \n", - __func__, (u_int64_t)VM_KERNEL_ADDRPERM(mp_so), - so, mpts->mpts_connid), - MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG); } - mptcp_subflow_abort(mpts, ECONNABORTED); if (!(mp_tp->mpt_flags & MPTCPF_FALLBACK_TO_TCP) && is_fastclose) { + struct mptsub *iter, *tmp; + *p_mpsofilt_hint |= SO_FILT_HINT_CONNRESET; - if (mp_tp->mpt_state < MPTCPS_ESTABLISHED) { - mp_so->so_error = ECONNABORTED; - } else { - mp_so->so_error = ECONNRESET; + mp_so->so_error = ECONNRESET; + + TAILQ_FOREACH_SAFE(iter, &mpte->mpte_subflows, mpts_entry, tmp) { + if (iter == mpts) { + continue; + } + mptcp_subflow_abort(iter, ECONNABORTED); } /* * mptcp_drop is being called after processing the events, to fully * close the MPTCP connection */ + mptcp_drop(mpte, mp_tp, mp_so->so_error); } + mptcp_subflow_abort(mpts, ECONNABORTED); + + if (mp_tp->mpt_gc_ticks == MPT_GC_TICKS) { mp_tp->mpt_gc_ticks = MPT_GC_TICKS_FAST; } @@ -4155,30 +4428,6 @@ mptcp_subflow_adaptive_wtimo_ev(struct mptses *mpte, struct mptsub *mpts, return MPTS_EVRET_OK; } -static const char * -mptcp_evret2str(ev_ret_t ret) -{ - const char *c = "UNKNOWN"; - - switch (ret) { - case MPTS_EVRET_DELETE: - c = "MPTS_EVRET_DELETE"; - break; - case MPTS_EVRET_CONNECT_PENDING: - c = "MPTS_EVRET_CONNECT_PENDING"; - break; - case MPTS_EVRET_DISCONNECT_FALLBACK: - c = "MPTS_EVRET_DISCONNECT_FALLBACK"; - break; - case MPTS_EVRET_OK: - c = "MPTS_EVRET_OK"; - break; - default: - break; - } - return c; -} - /* * Issues SOPT_SET on an MPTCP subflow socket; socket must already be locked, * caller must ensure that the option can be issued on subflow sockets, via @@ -4192,18 +4441,19 @@ mptcp_subflow_sosetopt(struct mptses *mpte, struct mptsub *mpts, struct mptopt * int error; VERIFY(mpo->mpo_flags & MPOF_SUBFLOW_OK); - mpte_lock_assert_held(mpte); mp_so = mptetoso(mpte); so = mpts->mpts_socket; + socket_lock_assert_owned(mp_so); + if (mpte->mpte_mptcb->mpt_state >= MPTCPS_ESTABLISHED && mpo->mpo_level == SOL_SOCKET && mpo->mpo_name == SO_MARK_CELLFALLBACK) { struct ifnet *ifp = ifindex2ifnet[mpts->mpts_ifscope]; mptcplog((LOG_DEBUG, "%s Setting CELL_FALLBACK, mpte_flags %#x, svctype %u wifi unusable %d lastcell? %d boundcell? %d\n", - __func__, mpte->mpte_flags, mpte->mpte_svctype, mptcp_is_wifi_unusable(mpte), + __func__, mpte->mpte_flags, mpte->mpte_svctype, mptcp_is_wifi_unusable_for_session(mpte), sotoinpcb(so)->inp_last_outifp ? IFNET_IS_CELLULAR(sotoinpcb(so)->inp_last_outifp) : -1, mpts->mpts_ifscope != IFSCOPE_NONE && ifp ? IFNET_IS_CELLULAR(ifp) : -1), MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE); @@ -4246,20 +4496,12 @@ mptcp_subflow_sosetopt(struct mptses *mpte, struct mptsub *mpts, struct mptopt * sopt.sopt_p = kernproc; error = sosetoptlock(so, &sopt, 0); - if (error == 0) { - mptcplog((LOG_INFO, "%s: mp_so 0x%llx sopt %s " - "val %d set successful\n", __func__, - (u_int64_t)VM_KERNEL_ADDRPERM(mp_so), - mptcp_sopt2str(mpo->mpo_level, mpo->mpo_name), - mpo->mpo_intval), - MPTCP_SOCKET_DBG, MPTCP_LOGLVL_LOG); - } else { - mptcplog((LOG_ERR, "%s:mp_so 0x%llx sopt %s " + if (error) { + os_log_error(mptcp_log_handle, "%s - %lx: sopt %s " "val %d set error %d\n", __func__, - (u_int64_t)VM_KERNEL_ADDRPERM(mp_so), + (unsigned long)VM_KERNEL_ADDRPERM(mpte), mptcp_sopt2str(mpo->mpo_level, mpo->mpo_name), - mpo->mpo_intval, error), - MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR); + mpo->mpo_intval, error); } return error; } @@ -4278,9 +4520,10 @@ mptcp_subflow_sogetopt(struct mptses *mpte, struct socket *so, int error; VERIFY(mpo->mpo_flags & MPOF_SUBFLOW_OK); - mpte_lock_assert_held(mpte); /* same as MP socket lock */ mp_so = mptetoso(mpte); + socket_lock_assert_owned(mp_so); + bzero(&sopt, sizeof(sopt)); sopt.sopt_dir = SOPT_GET; sopt.sopt_level = mpo->mpo_level; @@ -4290,20 +4533,11 @@ mptcp_subflow_sogetopt(struct mptses *mpte, struct socket *so, sopt.sopt_p = kernproc; error = sogetoptlock(so, &sopt, 0); /* already locked */ - if (error == 0) { - mptcplog((LOG_DEBUG, "MPTCP Socket: " - "%s: mp_so 0x%llx sopt %s " - "val %d get successful\n", __func__, - (u_int64_t)VM_KERNEL_ADDRPERM(mp_so), - mptcp_sopt2str(mpo->mpo_level, mpo->mpo_name), - mpo->mpo_intval), - MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE); - } else { - mptcplog((LOG_ERR, "MPTCP Socket: " - "%s: mp_so 0x%llx sopt %s get error %d\n", - __func__, (u_int64_t)VM_KERNEL_ADDRPERM(mp_so), - mptcp_sopt2str(mpo->mpo_level, mpo->mpo_name), error), - MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR); + if (error) { + os_log_error(mptcp_log_handle, + "%s - %lx: sopt %s get error %d\n", + __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), + mptcp_sopt2str(mpo->mpo_level, mpo->mpo_name), error); } return error; } @@ -4329,46 +4563,22 @@ mptcp_gc(struct mppcbinfo *mppi) struct mptses *mpte; struct mptcb *mp_tp; - VERIFY(mpp->mpp_flags & MPP_ATTACHED); mp_so = mpp->mpp_socket; - VERIFY(mp_so != NULL); mpte = mptompte(mpp); - VERIFY(mpte != NULL); mp_tp = mpte->mpte_mptcb; - VERIFY(mp_tp != NULL); - - mptcplog((LOG_DEBUG, "MPTCP Socket: " - "%s: mp_so 0x%llx found " - "(u=%d,r=%d,s=%d)\n", __func__, - (u_int64_t)VM_KERNEL_ADDRPERM(mp_so), mp_so->so_usecount, - mp_so->so_retaincnt, mpp->mpp_state), - MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE); - if (!mpte_try_lock(mpte)) { - mptcplog((LOG_DEBUG, "MPTCP Socket: " - "%s: mp_so 0x%llx skipped lock " - "(u=%d,r=%d)\n", __func__, - (u_int64_t)VM_KERNEL_ADDRPERM(mp_so), - mp_so->so_usecount, mp_so->so_retaincnt), - MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE); + if (!mpp_try_lock(mpp)) { active++; continue; } + VERIFY(mpp->mpp_flags & MPP_ATTACHED); + /* check again under the lock */ if (mp_so->so_usecount > 0) { boolean_t wakeup = FALSE; struct mptsub *mpts, *tmpts; - mptcplog((LOG_DEBUG, "MPTCP Socket: " - "%s: mp_so 0x%llx skipped usecount " - "[u=%d,r=%d] %d %d\n", __func__, - (u_int64_t)VM_KERNEL_ADDRPERM(mp_so), - mp_so->so_usecount, mp_so->so_retaincnt, - mp_tp->mpt_gc_ticks, - mp_tp->mpt_state), - MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE); - if (mp_tp->mpt_state >= MPTCPS_FIN_WAIT_1) { if (mp_tp->mpt_gc_ticks > 0) { mp_tp->mpt_gc_ticks--; @@ -4384,15 +4594,15 @@ mptcp_gc(struct mppcbinfo *mppi) mpts, SO_FILT_HINT_DISCONNECTED); } } - mpte_unlock(mpte); + socket_unlock(mp_so, 0); active++; continue; } if (mpp->mpp_state != MPPCB_STATE_DEAD) { - panic("MPTCP Socket: %s: mp_so 0x%llx skipped state " + panic("%s - %lx: skipped state " "[u=%d,r=%d,s=%d]\n", __func__, - (u_int64_t)VM_KERNEL_ADDRPERM(mp_so), + (unsigned long)VM_KERNEL_ADDRPERM(mpte), mp_so->so_usecount, mp_so->so_retaincnt, mpp->mpp_state); } @@ -4403,12 +4613,6 @@ mptcp_gc(struct mppcbinfo *mppi) mptcp_session_destroy(mpte); - mptcplog((LOG_DEBUG, "MPTCP Socket: " - "%s: mp_so 0x%llx destroyed [u=%d,r=%d]\n", - __func__, (u_int64_t)VM_KERNEL_ADDRPERM(mp_so), - mp_so->so_usecount, mp_so->so_retaincnt), - MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE); - DTRACE_MPTCP4(dispose, struct socket *, mp_so, struct sockbuf *, &mp_so->so_rcv, struct sockbuf *, &mp_so->so_snd, @@ -4427,11 +4631,11 @@ mptcp_gc(struct mppcbinfo *mppi) struct mptses * mptcp_drop(struct mptses *mpte, struct mptcb *mp_tp, int errno) { - struct socket *mp_so; + struct socket *mp_so = mptetoso(mpte); - mpte_lock_assert_held(mpte); /* same as MP socket lock */ VERIFY(mpte->mpte_mptcb == mp_tp); - mp_so = mptetoso(mpte); + + socket_lock_assert_owned(mp_so); DTRACE_MPTCP2(state__change, struct mptcb *, mp_tp, uint32_t, 0 /* event */); @@ -4450,12 +4654,11 @@ mptcp_drop(struct mptses *mpte, struct mptcb *mp_tp, int errno) struct mptses * mptcp_close(struct mptses *mpte, struct mptcb *mp_tp) { - struct socket *mp_so = NULL; struct mptsub *mpts = NULL, *tmpts = NULL; + struct socket *mp_so = mptetoso(mpte); - mpte_lock_assert_held(mpte); /* same as MP socket lock */ + socket_lock_assert_owned(mp_so); VERIFY(mpte->mpte_mptcb == mp_tp); - mp_so = mptetoso(mpte); mp_tp->mpt_state = MPTCPS_TERMINATE; @@ -4484,11 +4687,13 @@ void mptcp_subflow_workloop(struct mptses *mpte) { boolean_t connect_pending = FALSE, disconnect_fallback = FALSE; - uint64_t mpsofilt_hint_mask; + uint64_t mpsofilt_hint_mask = SO_FILT_HINT_LOCKED; struct mptsub *mpts, *tmpts; struct socket *mp_so; - mpte_lock_assert_held(mpte); + mp_so = mptetoso(mpte); + + socket_lock_assert_owned(mp_so); if (mpte->mpte_flags & MPTE_IN_WORKLOOP) { mpte->mpte_flags |= MPTE_WORKLOOP_RELAUNCH; @@ -4496,10 +4701,7 @@ mptcp_subflow_workloop(struct mptses *mpte) } mpte->mpte_flags |= MPTE_IN_WORKLOOP; - mp_so = mptetoso(mpte); - relaunch: - mpsofilt_hint_mask = SO_FILT_HINT_LOCKED; mpte->mpte_flags &= ~MPTE_WORKLOOP_RELAUNCH; TAILQ_FOREACH_SAFE(mpts, &mpte->mpte_subflows, mpts_entry, tmpts) { @@ -4555,6 +4757,11 @@ relaunch: if (mpsofilt_hint_mask != SO_FILT_HINT_LOCKED) { VERIFY(mpsofilt_hint_mask & SO_FILT_HINT_LOCKED); + if (mpsofilt_hint_mask & SO_FILT_HINT_CANTRCVMORE) { + mp_so->so_state |= SS_CANTRCVMORE; + sorwakeup(mp_so); + } + soevent(mp_so, mpsofilt_hint_mask); } @@ -4596,10 +4803,6 @@ relaunch: ~(TMPF_MPTCP_READY | TMPF_MPTCP_TRUE); tp->t_mpflags |= TMPF_TCP_FALLBACK; - if (mpts->mpts_flags & MPTSF_ACTIVE) { - continue; - } - tp->t_mpflags |= TMPF_RESET; soevent(so, SO_FILT_HINT_MUSTRST); } else if (connect_pending) { /* @@ -4656,6 +4859,7 @@ mptcp_lock(struct socket *mp_so, int refcount, void *lr) } if (refcount != 0) { mp_so->so_usecount++; + mpp->mpp_inside++; } mp_so->lock_lr[mp_so->next_lock_lr] = lr_saved; mp_so->next_lock_lr = (mp_so->next_lock_lr + 1) % SO_LCKDBG_MAX; @@ -4684,10 +4888,11 @@ mptcp_unlock(struct socket *mp_so, int refcount, void *lr) solockhistory_nr(mp_so)); /* NOTREACHED */ } - mpp_lock_assert_held(mpp); + socket_lock_assert_owned(mp_so); if (refcount != 0) { mp_so->so_usecount--; + mpp->mpp_inside--; } if (mp_so->so_usecount < 0) { @@ -4695,6 +4900,11 @@ mptcp_unlock(struct socket *mp_so, int refcount, void *lr) mp_so, mp_so->so_usecount, solockhistory_nr(mp_so)); /* NOTREACHED */ } + if (mpp->mpp_inside < 0) { + panic("%s: mpp=%p inside=%x lrh= %s\n", __func__, + mpp, mpp->mpp_inside, solockhistory_nr(mp_so)); + /* NOTREACHED */ + } mp_so->unlock_lr[mp_so->next_unlock_lr] = lr_saved; mp_so->next_unlock_lr = (mp_so->next_unlock_lr + 1) % SO_LCKDBG_MAX; mpp_unlock(mpp); @@ -4728,12 +4938,10 @@ mptcp_getlock(struct socket *mp_so, int flags) */ static void -mptcp_attach_to_subf(struct socket *so, struct mptcb *mp_tp, - uint8_t addr_id) +mptcp_attach_to_subf(struct socket *so, struct mptcb *mp_tp, uint8_t addr_id) { struct tcpcb *tp = sototcpcb(so); struct mptcp_subf_auth_entry *sauth_entry; - mpte_lock_assert_held(mp_tp->mpt_mpte); /* * The address ID of the first flow is implicitly 0. @@ -4789,7 +4997,6 @@ mptcp_get_rands(mptcp_addr_id addr_id, struct mptcb *mp_tp, u_int32_t *lrand, u_int32_t *rrand) { struct mptcp_subf_auth_entry *sauth_entry; - mpte_lock_assert_held(mp_tp->mpt_mpte); LIST_FOREACH(sauth_entry, &mp_tp->mpt_subauth_list, msae_next) { if (sauth_entry->msae_laddr_id == addr_id) { @@ -4809,26 +5016,23 @@ mptcp_set_raddr_rand(mptcp_addr_id laddr_id, struct mptcb *mp_tp, mptcp_addr_id raddr_id, u_int32_t raddr_rand) { struct mptcp_subf_auth_entry *sauth_entry; - mpte_lock_assert_held(mp_tp->mpt_mpte); LIST_FOREACH(sauth_entry, &mp_tp->mpt_subauth_list, msae_next) { if (sauth_entry->msae_laddr_id == laddr_id) { if ((sauth_entry->msae_raddr_id != 0) && (sauth_entry->msae_raddr_id != raddr_id)) { - mptcplog((LOG_ERR, "MPTCP Socket: %s mismatched" - " address ids %d %d \n", __func__, raddr_id, - sauth_entry->msae_raddr_id), - MPTCP_SOCKET_DBG, MPTCP_LOGLVL_LOG); + os_log_error(mptcp_log_handle, "%s - %lx: mismatched" + " address ids %d %d \n", __func__, (unsigned long)VM_KERNEL_ADDRPERM(mp_tp->mpt_mpte), + raddr_id, sauth_entry->msae_raddr_id); return; } sauth_entry->msae_raddr_id = raddr_id; if ((sauth_entry->msae_raddr_rand != 0) && (sauth_entry->msae_raddr_rand != raddr_rand)) { - mptcplog((LOG_ERR, "MPTCP Socket: " - "%s: dup SYN_ACK %d %d \n", - __func__, raddr_rand, - sauth_entry->msae_raddr_rand), - MPTCP_SOCKET_DBG, MPTCP_LOGLVL_LOG); + os_log_error(mptcp_log_handle, "%s - %lx: " + "dup SYN_ACK %d %d \n", + __func__, (unsigned long)VM_KERNEL_ADDRPERM(mp_tp->mpt_mpte), + raddr_rand, sauth_entry->msae_raddr_rand); return; } sauth_entry->msae_raddr_rand = raddr_rand; @@ -4908,8 +5112,6 @@ mptcp_get_hmac(mptcp_addr_id aid, struct mptcb *mp_tp, u_char *digest) { uint32_t lrand, rrand; - mpte_lock_assert_held(mp_tp->mpt_mpte); - lrand = rrand = 0; mptcp_get_rands(aid, mp_tp, &lrand, &rrand); mptcp_hmac_sha1(mp_tp->mpt_localkey, mp_tp->mpt_remotekey, lrand, rrand, @@ -4996,7 +5198,6 @@ int mptcp_init_remote_parms(struct mptcb *mp_tp) { char remote_digest[SHA1_RESULTLEN]; - mpte_lock_assert_held(mp_tp->mpt_mpte); /* Only Version 0 is supported for auth purposes */ if (mp_tp->mpt_version != MPTCP_STD_VERSION_0) { @@ -5010,6 +5211,7 @@ mptcp_init_remote_parms(struct mptcb *mp_tp) mptcp_generate_idsn(remote_digest, SHA1_RESULTLEN, (caddr_t)&mp_tp->mpt_remote_idsn, sizeof(u_int64_t)); mp_tp->mpt_rcvnxt = mp_tp->mpt_remote_idsn + 1; + mp_tp->mpt_rcvadv = mp_tp->mpt_rcvnxt + mp_tp->mpt_rcvwnd; return 0; } @@ -5048,7 +5250,6 @@ mptcp_insert_dsn(struct mppcb *mpp, struct mbuf *m) } __IGNORE_WCASTALIGN(mp_tp = &((struct mpp_mtp *)mpp)->mtcb); - mpte_lock_assert_held(mp_tp->mpt_mpte); while (m) { VERIFY(m->m_flags & M_PKTHDR); @@ -5108,7 +5309,15 @@ mptcp_fallback_sbdrop(struct socket *so, struct mbuf *m, int len) mptcplog((LOG_DEBUG, "%s inferred ack up to %u\n", __func__, (uint32_t)data_ack), MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE); - mptcp_data_ack_rcvd(mp_tp, sototcpcb(so), data_ack); + + /* We can have data in the subflow's send-queue that is being acked, + * while the DATA_ACK has already advanced. Thus, we should check whether + * or not the DATA_ACK is actually new here. + */ + if (MPTCP_SEQ_LEQ(data_ack, mp_tp->mpt_sndmax) && + MPTCP_SEQ_GEQ(data_ack, mp_tp->mpt_snduna)) { + mptcp_data_ack_rcvd(mp_tp, sototcpcb(so), data_ack); + } } void @@ -5259,44 +5468,6 @@ mptcp_insert_rmap(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th) } } -int -mptcp_adj_rmap(struct socket *so, struct mbuf *m, int off, uint64_t dsn, - uint32_t rseq, uint16_t dlen) -{ - struct mptsub *mpts = sototcpcb(so)->t_mpsub; - - if (m_pktlen(m) == 0) { - return 0; - } - - if ((m->m_flags & M_PKTHDR) && (m->m_pkthdr.pkt_flags & PKTF_MPTCP)) { - if (off && (dsn != m->m_pkthdr.mp_dsn || - rseq != m->m_pkthdr.mp_rseq || - dlen != m->m_pkthdr.mp_rlen)) { - mptcplog((LOG_ERR, "%s: Received incorrect second mapping: %llu - %llu , %u - %u, %u - %u\n", - __func__, dsn, m->m_pkthdr.mp_dsn, - rseq, m->m_pkthdr.mp_rseq, - dlen, m->m_pkthdr.mp_rlen), - MPTCP_RECEIVER_DBG, MPTCP_LOGLVL_ERR); - return -1; - } - m->m_pkthdr.mp_dsn += off; - m->m_pkthdr.mp_rseq += off; - m->m_pkthdr.mp_rlen = m->m_pkthdr.len; - } else { - if (!(mpts->mpts_flags & MPTSF_CONFIRMED)) { - /* data arrived without an DSS option mapping */ - - /* initial subflow can fallback right after SYN handshake */ - mptcp_notify_mpfail(so); - } - } - - mpts->mpts_flags |= MPTSF_CONFIRMED; - - return 0; -} - /* * Following routines help with failure detection and failover of data * transfer from one subflow to another. @@ -5361,9 +5532,7 @@ mptcp_get_map_for_dsn(struct socket *so, u_int64_t dsn_fail, u_int32_t *tcp_seq) * not much else to do. */ - mptcplog((LOG_ERR, "MPTCP Sender: " - "%s: %llu not found \n", __func__, dsn_fail), - MPTCP_SENDER_DBG, MPTCP_LOGLVL_LOG); + os_log_error(mptcp_log_handle, "%s: %llu not found \n", __func__, dsn_fail); return -1; } @@ -5576,7 +5745,7 @@ mptcp_sbspace(struct mptcb *mp_tp) int32_t space; int32_t pending = 0; - mpte_lock_assert_held(mp_tp->mpt_mpte); + socket_lock_assert_owned(mptetoso(mp_tp->mpt_mpte)); mptcp_sbrcv_grow_rwin(mp_tp, sb); @@ -5674,7 +5843,8 @@ boolean_t mptcp_ok_to_keepalive(struct mptcb *mp_tp) { boolean_t ret = 1; - mpte_lock_assert_held(mp_tp->mpt_mpte); + + socket_lock_assert_owned(mptetoso(mp_tp->mpt_mpte)); if (mp_tp->mpt_state >= MPTCPS_CLOSE_WAIT) { ret = 0; @@ -5703,7 +5873,7 @@ mptcp_adj_mss(struct tcpcb *tp, boolean_t mtudisc) return 0; } - mpte_lock_assert_held(mp_tp->mpt_mpte); + socket_lock_assert_owned(mptetoso(mp_tp->mpt_mpte)); /* * For the first subflow and subsequent subflows, adjust mss for @@ -5810,13 +5980,12 @@ mptcp_pcblist SYSCTL_HANDLER_ARGS } TAILQ_FOREACH(mpp, &mtcbinfo.mppi_pcbs, mpp_entry) { flows = NULL; - mpp_lock(mpp); + socket_lock(mpp->mpp_socket, 1); VERIFY(mpp->mpp_flags & MPP_ATTACHED); mpte = mptompte(mpp); - VERIFY(mpte != NULL); - mpte_lock_assert_held(mpte); + + socket_lock_assert_owned(mptetoso(mpte)); mp_tp = mpte->mpte_mptcb; - VERIFY(mp_tp != NULL); bzero(&mptcpci, sizeof(mptcpci)); mptcpci.mptcpci_state = mp_tp->mpt_state; @@ -5844,7 +6013,7 @@ mptcp_pcblist SYSCTL_HANDLER_ARGS if (mpte->mpte_numflows != 0) { flows = _MALLOC(len, M_TEMP, M_WAITOK | M_ZERO); if (flows == NULL) { - mpp_unlock(mpp); + socket_unlock(mpp->mpp_socket, 1); break; } mptcpci.mptcpci_len = sizeof(mptcpci) + @@ -5856,7 +6025,7 @@ mptcp_pcblist SYSCTL_HANDLER_ARGS error = SYSCTL_OUT(req, &mptcpci, sizeof(mptcpci)); } if (error) { - mpp_unlock(mpp); + socket_unlock(mpp->mpp_socket, 1); FREE(flows, M_TEMP); break; } @@ -5866,7 +6035,7 @@ mptcp_pcblist SYSCTL_HANDLER_ARGS fill_mptcp_subflow(so, &flows[f], mpts); f++; } - mpp_unlock(mpp); + socket_unlock(mpp->mpp_socket, 1); if (flows) { error = SYSCTL_OUT(req, flows, len); FREE(flows, M_TEMP); @@ -5938,7 +6107,7 @@ mptcp_notsent_lowat_check(struct socket *so) } mpte = mptompte(mpp); - mpte_lock_assert_held(mpte); + socket_lock_assert_owned(mptetoso(mpte)); mp_tp = mpte->mpte_mptcb; notsent = so->so_snd.sb_cc; @@ -5981,12 +6150,6 @@ mptcp_notsent_lowat_check(struct socket *so) return 0; } -/* Using Symptoms Advisory to detect poor WiFi or poor Cell */ -static kern_ctl_ref mptcp_kern_ctrl_ref = NULL; -static uint32_t mptcp_kern_skt_inuse = 0; -static uint32_t mptcp_kern_skt_unit; -symptoms_advisory_t mptcp_advisory; - static errno_t mptcp_symptoms_ctl_connect(kern_ctl_ref kctlref, struct sockaddr_ctl *sac, void **unitinfo) @@ -5994,7 +6157,7 @@ mptcp_symptoms_ctl_connect(kern_ctl_ref kctlref, struct sockaddr_ctl *sac, #pragma unused(kctlref, sac, unitinfo) if (OSIncrementAtomic(&mptcp_kern_skt_inuse) > 0) { - os_log_error(mptcp_log_handle, "%s MPTCP kernel-control socket for Symptoms already open!", __func__); + os_log_error(mptcp_log_handle, "%s: MPTCP kernel-control socket for Symptoms already open!", __func__); } mptcp_kern_skt_unit = sac->sc_unit; @@ -6003,7 +6166,7 @@ mptcp_symptoms_ctl_connect(kern_ctl_ref kctlref, struct sockaddr_ctl *sac, } static void -mptcp_allow_uuid(uuid_t uuid) +mptcp_allow_uuid(uuid_t uuid, int32_t rssi) { struct mppcb *mpp; @@ -6012,13 +6175,10 @@ mptcp_allow_uuid(uuid_t uuid) lck_mtx_lock(&mtcbinfo.mppi_lock); TAILQ_FOREACH(mpp, &mtcbinfo.mppi_pcbs, mpp_entry) { - struct mptses *mpte; - struct socket *mp_so; + struct socket *mp_so = mpp->mpp_socket; + struct mptses *mpte = mpp->mpp_pcbe; - mpp_lock(mpp); - - mpte = mpp->mpp_pcbe; - mp_so = mpp->mpp_socket; + socket_lock(mp_so, 1); if (mp_so->so_flags & SOF_DELEGATED && uuid_compare(uuid, mp_so->e_uuid)) { @@ -6028,18 +6188,22 @@ mptcp_allow_uuid(uuid_t uuid) goto next; } - os_log(mptcp_log_handle, "%s - %lx: Got allowance for useApp\n", - __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte)); + os_log(mptcp_log_handle, "%s - %lx: Got allowance for useApp with rssi %d\n", + __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), rssi); mpte->mpte_flags |= MPTE_ACCESS_GRANTED; + if (rssi > MPTCP_TARGET_BASED_RSSI_THRESHOLD) { + mpte->mpte_flags |= MPTE_CELL_PROHIBITED; + } + mptcp_check_subflows_and_add(mpte); mptcp_remove_subflows(mpte); - mpte->mpte_flags &= ~MPTE_ACCESS_GRANTED; + mpte->mpte_flags &= ~(MPTE_ACCESS_GRANTED | MPTE_CELL_PROHIBITED); next: - mpp_unlock(mpp); + socket_unlock(mp_so, 1); } lck_mtx_unlock(&mtcbinfo.mppi_lock); @@ -6055,16 +6219,14 @@ mptcp_wifi_status_changed(void) lck_mtx_lock(&mtcbinfo.mppi_lock); TAILQ_FOREACH(mpp, &mtcbinfo.mppi_pcbs, mpp_entry) { - struct mptses *mpte; - struct socket *mp_so; - - mpp_lock(mpp); + struct socket *mp_so = mpp->mpp_socket; + struct mptses *mpte = mpp->mpp_pcbe; - mpte = mpp->mpp_pcbe; - mp_so = mpp->mpp_socket; + socket_lock(mp_so, 1); - /* Only handover-mode is purely driven by Symptom's Wi-Fi status */ - if (mpte->mpte_svctype != MPTCP_SVCTYPE_HANDOVER) { + /* Only handover- and urgency-mode are purely driven by Symptom's Wi-Fi status */ + if (mpte->mpte_svctype != MPTCP_SVCTYPE_HANDOVER && + mpte->mpte_svctype != MPTCP_SVCTYPE_TARGET_BASED) { goto next; } @@ -6072,7 +6234,7 @@ mptcp_wifi_status_changed(void) mptcp_check_subflows_and_remove(mpte); next: - mpp_unlock(mpp); + socket_unlock(mp_so, 1); } lck_mtx_unlock(&mtcbinfo.mppi_lock); @@ -6087,7 +6249,8 @@ mptcp_ask_symptoms(struct mptses *mpte) int pid, prio, err; if (mptcp_kern_skt_unit == 0) { - os_log_error(mptcp_log_handle, "%s skt_unit is still 0\n", __func__); + os_log_error(mptcp_log_handle, "%s - %lx: skt_unit is still 0\n", + __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte)); return; } @@ -6101,7 +6264,8 @@ mptcp_ask_symptoms(struct mptses *mpte) p = proc_find(pid); if (p == PROC_NULL) { - os_log_error(mptcp_log_handle, "%s Couldn't find proc for pid %u\n", __func__, pid); + os_log_error(mptcp_log_handle, "%s - %lx: Couldn't find proc for pid %u\n", + __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), pid); return; } @@ -6115,7 +6279,8 @@ mptcp_ask_symptoms(struct mptses *mpte) prio = proc_get_effective_task_policy(proc_task(p), TASK_POLICY_ROLE); - if (prio == TASK_BACKGROUND_APPLICATION) { + if (prio == TASK_BACKGROUND_APPLICATION || prio == TASK_NONUI_APPLICATION || + prio == TASK_DARWINBG_APPLICATION) { ask.priority = MPTCP_SYMPTOMS_BACKGROUND; } else if (prio == TASK_FOREGROUND_APPLICATION) { ask.priority = MPTCP_SYMPTOMS_FOREGROUND; @@ -6126,8 +6291,8 @@ mptcp_ask_symptoms(struct mptses *mpte) err = ctl_enqueuedata(mptcp_kern_ctrl_ref, mptcp_kern_skt_unit, &ask, sizeof(ask), CTL_DATA_EOR); - os_log_debug(mptcp_log_handle, "%s asked symptoms about pid %u, prio %u, err %d\n", - __func__, pid, ask.priority, err); + os_log(mptcp_log_handle, "%s - %lx: asked symptoms about pid %u, taskprio %u, prio %u, err %d\n", + __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), pid, prio, ask.priority, err); proc_rele(p); @@ -6152,7 +6317,7 @@ mptcp_symptoms_ctl_send(kern_ctl_ref kctlref, u_int32_t kcunit, void *unitinfo, symptoms_advisory_t *sa = NULL; if (kcunit != mptcp_kern_skt_unit) { - os_log_error(mptcp_log_handle, "%s kcunit %u is different from expected one %u\n", + os_log_error(mptcp_log_handle, "%s: kcunit %u is different from expected one %u\n", __func__, kcunit, mptcp_kern_skt_unit); } @@ -6170,46 +6335,39 @@ mptcp_symptoms_ctl_send(kern_ctl_ref kctlref, u_int32_t kcunit, void *unitinfo, sa = mbuf_data(m); - if (sa->sa_nwk_status != SYMPTOMS_ADVISORY_NOCOMMENT && - sa->sa_nwk_status != SYMPTOMS_ADVISORY_USEAPP) { - uint8_t old_wifi_status = mptcp_advisory.sa_wifi_status; - - mptcplog((LOG_DEBUG, "%s: wifi %d,%d\n", - __func__, sa->sa_wifi_status, mptcp_advisory.sa_wifi_status), - MPTCP_EVENTS_DBG, MPTCP_LOGLVL_VERBOSE); + if (sa->sa_nwk_status != SYMPTOMS_ADVISORY_USEAPP) { + os_log(mptcp_log_handle, "%s: wifi new,old: %d,%d, cell new, old: %d,%d\n", __func__, + sa->sa_wifi_status, mptcp_advisory.sa_wifi_status, + sa->sa_cell_status, mptcp_advisory.sa_cell_status); - if ((sa->sa_wifi_status & - (SYMPTOMS_ADVISORY_WIFI_BAD | SYMPTOMS_ADVISORY_WIFI_OK)) != - (SYMPTOMS_ADVISORY_WIFI_BAD | SYMPTOMS_ADVISORY_WIFI_OK)) { + if (sa->sa_wifi_status != mptcp_advisory.sa_wifi_status) { mptcp_advisory.sa_wifi_status = sa->sa_wifi_status; - } - - if (old_wifi_status != mptcp_advisory.sa_wifi_status) { mptcp_wifi_status_changed(); } - } else if (sa->sa_nwk_status == SYMPTOMS_ADVISORY_NOCOMMENT) { - mptcplog((LOG_DEBUG, "%s: NOCOMMENT wifi %d\n", __func__, - mptcp_advisory.sa_wifi_status), - MPTCP_EVENTS_DBG, MPTCP_LOGLVL_VERBOSE); - } else if (sa->sa_nwk_status == SYMPTOMS_ADVISORY_USEAPP) { - uuid_t uuid; + } else { + struct mptcp_symptoms_answer answer; errno_t err; - if (mbuf_len(m) < sizeof(uuid_t) + sizeof(*sa)) { - os_log_error(mptcp_log_handle, "%s: mbuf is %lu but need %lu\n", - __func__, mbuf_len(m), sizeof(uuid_t) + sizeof(*sa)); + /* We temporarily allow different sizes for ease of submission */ + if (mbuf_len(m) != sizeof(uuid_t) + sizeof(*sa) && + mbuf_len(m) != sizeof(answer)) { + os_log_error(mptcp_log_handle, "%s: mbuf is %lu but need %lu or %lu\n", + __func__, mbuf_len(m), sizeof(uuid_t) + sizeof(*sa), + sizeof(answer)); mbuf_free(m); return EINVAL; } - err = mbuf_copydata(m, sizeof(*sa), sizeof(uuid_t), uuid); + memset(&answer, 0, sizeof(answer)); + + err = mbuf_copydata(m, 0, mbuf_len(m), &answer); if (err) { os_log_error(mptcp_log_handle, "%s: mbuf_copydata returned %d\n", __func__, err); mbuf_free(m); return err; } - mptcp_allow_uuid(uuid); + mptcp_allow_uuid(answer.uuid, answer.rssi); } mbuf_freem(m); @@ -6237,14 +6395,14 @@ mptcp_control_register(void) * Three return-values: * 1 : WiFi is bad * 0 : WiFi is good - * -1 : WiFi-state is unknown, use subflow-only heuristics + * -1 : WiFi-state is unknown */ int -mptcp_is_wifi_unusable(struct mptses *mpte) +mptcp_is_wifi_unusable_for_session(struct mptses *mpte) { if (mpte->mpte_flags & MPTE_FIRSTPARTY) { if (mptcp_advisory.sa_wifi_status) { - return (mptcp_advisory.sa_wifi_status & SYMPTOMS_ADVISORY_WIFI_BAD) ? 1 : 0; + return symptoms_is_wifi_lossy() ? 1 : 0; } /* @@ -6252,23 +6410,39 @@ mptcp_is_wifi_unusable(struct mptses *mpte) * about the Wi-Fi state, let's be pessimistic. */ return -1; - } + } else { + if (mptcp_advisory.sa_wifi_status & SYMPTOMS_ADVISORY_WIFI_BAD) { + return 1; + } - return (mptcp_advisory.sa_wifi_status & SYMPTOMS_ADVISORY_WIFI_BAD) ? 1 : 0; -} + /* + * If we are target-based (meaning, we allow to be more lax on + * the "unusable" target. We only *know* about the state once + * we got the allowance from Symptoms (MPTE_ACCESS_GRANTED). + * + * If RSSI is not bad enough, MPTE_CELL_PROHIBITED will then + * be set. + * + * In any other case (while in target-mode), consider WiFi bad + * and we are going to ask for allowance from Symptoms anyway. + */ + if (mpte->mpte_svctype == MPTCP_SVCTYPE_TARGET_BASED) { + if (mpte->mpte_flags & MPTE_ACCESS_GRANTED && + mpte->mpte_flags & MPTE_CELL_PROHIBITED) { + return 0; + } -boolean_t -mptcp_subflow_is_bad(struct mptses *mpte, struct mptsub *mpts) -{ - struct tcpcb *tp = sototcpcb(mpts->mpts_socket); - int fail_thresh = mptcp_fail_thresh; + return 1; + } - if (mpte->mpte_svctype == MPTCP_SVCTYPE_HANDOVER) { - fail_thresh *= 2; + return 0; } +} - return tp->t_rxtshift >= fail_thresh && - (mptetoso(mpte)->so_snd.sb_cc || mpte->mpte_reinjectq); +boolean_t +symptoms_is_wifi_lossy(void) +{ + return (mptcp_advisory.sa_wifi_status & SYMPTOMS_ADVISORY_WIFI_OK) ? false : true; } /* If TFO data is succesfully acked, it must be dropped from the mptcp so */ @@ -6308,10 +6482,6 @@ mptcp_drop_tfo_data(struct mptses *mpte, struct mptsub *mpts) VERIFY(mp_so->so_snd.sb_mb != NULL); sbdrop(&mp_so->so_snd, (int)mp_droplen); } - mptcplog((LOG_DEBUG, "%s: mp_so 0x%llx cid %d TFO tcp len %d mptcp len %d\n", - __func__, (u_int64_t)VM_KERNEL_ADDRPERM(mp_so), - mpts->mpts_connid, tcp_droplen, mp_droplen), - MPTCP_SENDER_DBG, MPTCP_LOGLVL_VERBOSE); } } @@ -6352,8 +6522,8 @@ mptcp_post_event(u_int32_t event_code, int value) return kev_post_msg(&ev_msg); } -void -mptcp_set_cellicon(struct mptses *mpte) +static void +mptcp_set_cellicon(struct mptses *mpte, struct mptsub *mpts) { int error; @@ -6362,54 +6532,124 @@ mptcp_set_cellicon(struct mptses *mpte) return; } - /* Remember the last time we set the cellicon (see mptcp_unset_cellicon) */ - mptcp_last_cellicon_set = tcp_now; + /* Subflow is disappearing - don't set it on this one */ + if (mpts->mpts_flags & (MPTSF_DISCONNECTING | MPTSF_DISCONNECTED)) { + return; + } + + /* Remember the last time we set the cellicon. Needed for debouncing */ + mpte->mpte_last_cellicon_set = tcp_now; + + if (mpts->mpts_flags & MPTSF_CELLICON_SET && + mpte->mpte_cellicon_increments != 0) { + if (mptcp_cellicon_refcount == 0) { + os_log_error(mptcp_log_handle, "%s - %lx: Cell should be set (count is %u), but it's zero!\n", + __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mpte->mpte_cellicon_increments); + + /* Continue, so that the icon gets set... */ + } else { + /* + * In this case, the cellicon is already set. No need to bump it + * even higher + */ + + return; + } + } + + /* When tearing down this subflow, we need to decrement the + * reference counter + */ + mpts->mpts_flags |= MPTSF_CELLICON_SET; + + /* This counter, so that when a session gets destroyed we decrement + * the reference counter by whatever is left + */ + mpte->mpte_cellicon_increments++; - /* If cellicon is already set, get out of here! */ - if (OSTestAndSet(7, &mptcp_cellicon_is_set)) { + if (OSIncrementAtomic(&mptcp_cellicon_refcount)) { + /* If cellicon is already set, get out of here! */ return; } error = mptcp_post_event(KEV_MPTCP_CELLUSE, 1); if (error) { - mptcplog((LOG_ERR, "%s: Setting cellicon failed with %d\n", - __func__, error), MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR); + os_log_error(mptcp_log_handle, "%s - %lx: Setting cellicon failed with %d\n", + __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), error); } else { - mptcplog((LOG_DEBUG, "%s successfully set the cellicon\n", __func__), - MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE); + os_log(mptcp_log_handle, "%s - %lx: successfully set the cellicon\n", + __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte)); } } void -mptcp_unset_cellicon(void) +mptcp_clear_cellicon(void) { - int error; + int error = mptcp_post_event(KEV_MPTCP_CELLUSE, 0); + + if (error) { + os_log_error(mptcp_log_handle, "%s: Unsetting cellicon failed with %d\n", + __func__, error); + } else { + os_log(mptcp_log_handle, "%s: successfully unset the cellicon\n", + __func__); + } +} + +/* + * Returns true if the icon has been flipped to WiFi. + */ +static boolean_t +__mptcp_unset_cellicon(long val) +{ + if (OSAddAtomic(-val, &mptcp_cellicon_refcount) != 1) { + return false; + } + + mptcp_clear_cellicon(); + + return true; +} - /* If cellicon is already unset, get out of here! */ - if (OSTestAndClear(7, &mptcp_cellicon_is_set)) { +static void +mptcp_unset_cellicon(struct mptses *mpte, struct mptsub *mpts, long val) +{ + /* First-party apps (Siri) don't flip the cellicon */ + if (mpte->mpte_flags & MPTE_FIRSTPARTY) { return; } - /* - * If during the past MPTCP_CELLICON_TOGGLE_RATE seconds we didn't - * explicitly set the cellicon (see mptcp_set_cellicon()), then we unset - * it again. - */ - if (TSTMP_GT(mptcp_last_cellicon_set + MPTCP_CELLICON_TOGGLE_RATE, - tcp_now)) { - OSTestAndSet(7, &mptcp_cellicon_is_set); + if (mpte->mpte_cellicon_increments == 0) { + /* This flow never used cell - get out of here! */ return; } - error = mptcp_post_event(KEV_MPTCP_CELLUSE, 0); + if (mptcp_cellicon_refcount == 0) { + os_log_error(mptcp_log_handle, "%s - %lx: Cell is off, but should be at least %u\n", + __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mpte->mpte_cellicon_increments); - if (error) { - mptcplog((LOG_ERR, "%s: Unsetting cellicon failed with %d\n", - __func__, error), MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR); - } else { - mptcplog((LOG_DEBUG, "%s successfully unset the cellicon\n", __func__), - MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE); + return; + } + + if (mpts) { + if (!(mpts->mpts_flags & MPTSF_CELLICON_SET)) { + return; + } + + mpts->mpts_flags &= ~MPTSF_CELLICON_SET; + } + + mpte->mpte_cellicon_increments--; + + if (__mptcp_unset_cellicon(val) == false) { + return; + } + + /* All flows are gone - our counter should be at zero too! */ + if (mpte->mpte_cellicon_increments != 0) { + os_log_error(mptcp_log_handle, "%s - %lx: Inconsistent state! Cell refcount is zero but increments are at %u\n", + __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mpte->mpte_cellicon_increments); } } diff --git a/bsd/netinet/mptcp_timer.c b/bsd/netinet/mptcp_timer.c index b376cea47..ac7595aea 100644 --- a/bsd/netinet/mptcp_timer.c +++ b/bsd/netinet/mptcp_timer.c @@ -66,6 +66,8 @@ SYSCTL_INT(_net_inet_mptcp, OID_AUTO, tw, CTLFLAG_RW | CTLFLAG_LOCKED, #define TIMEVAL_TO_HZ(_tv_) ((_tv_).tv_sec * hz + (_tv_).tv_usec / hz) +static int mptcp_cancel_urgency_timer(struct mptses *mpte); + static int mptcp_timer_demux(struct mptses *mpte, uint32_t now_msecs) { @@ -75,7 +77,6 @@ mptcp_timer_demux(struct mptses *mpte, uint32_t now_msecs) DTRACE_MPTCP2(timer, struct mptses *, mpte, struct mptcb *, mp_tp); - mpte_lock_assert_held(mpte); switch (mp_tp->mpt_timer_vals) { case MPTT_REXMT: if (mp_tp->mpt_rxtstart == 0) { @@ -144,16 +145,15 @@ mptcp_timer(struct mppcbinfo *mppi) struct mptses *mpte; mp_so = mpp->mpp_socket; - VERIFY(mp_so != NULL); mpte = mptompte(mpp); - VERIFY(mpte != NULL); - mpte_lock(mpte); + socket_lock(mp_so, 1); + VERIFY(mpp->mpp_flags & MPP_ATTACHED); if (mptcp_timer_demux(mpte, now_msecs)) { resched_timer = 1; } - mpte_unlock(mpte); + socket_unlock(mp_so, 1); } return resched_timer; @@ -171,7 +171,7 @@ mptcp_start_timer(struct mptses *mpte, int timer_type) mptcplog((LOG_DEBUG, "MPTCP Socket: %s: %d\n", __func__, timer_type), MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE); - mpte_lock_assert_held(mpte); + socket_lock_assert_owned(mptetoso(mpte)); switch (timer_type) { case MPTT_REXMT: @@ -198,8 +198,7 @@ mptcp_start_timer(struct mptses *mpte, int timer_type) void mptcp_cancel_timer(struct mptcb *mp_tp, int timer_type) { - mpte_lock_assert_held(mp_tp->mpt_mpte); - DTRACE_MPTCP2(cancel__timer, struct mptcb *, mp_tp, int, timer_type); + socket_lock_assert_owned(mptetoso(mp_tp->mpt_mpte)); switch (timer_type) { case MPTT_REXMT: @@ -221,7 +220,113 @@ mptcp_cancel_timer(struct mptcb *mp_tp, int timer_type) void mptcp_cancel_all_timers(struct mptcb *mp_tp) { + struct mptses *mpte = mp_tp->mpt_mpte; + + if (mpte->mpte_time_target) { + mptcp_cancel_urgency_timer(mpte); + } + mptcp_cancel_timer(mp_tp, MPTT_REXMT); mptcp_cancel_timer(mp_tp, MPTT_TW); mptcp_cancel_timer(mp_tp, MPTT_FASTCLOSE); } + +static void +mptcp_urgency_timer_locked(struct mptses *mpte) +{ + uint64_t time_now = mach_continuous_time(); + struct socket *mp_so = mptetoso(mpte); + + VERIFY(mp_so->so_usecount >= 0); + + os_log(mptcp_log_handle, "%s - %lx: timer at %llu now %llu usecount %u\n", + __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mpte->mpte_time_target, time_now, mp_so->so_usecount); + + mptcp_check_subflows_and_add(mpte); + + mp_so->so_usecount--; +} + +static void +mptcp_urgency_timer(void *param0, __unused void *param1) +{ + struct mptses *mpte = (struct mptses *)param0; + struct socket *mp_so = mptetoso(mpte); + + socket_lock(mp_so, 1); + + mptcp_urgency_timer_locked(mpte); + + socket_unlock(mp_so, 1); +} + +void +mptcp_init_urgency_timer(struct mptses *mpte) +{ + /* thread_call_allocate never fails */ + mpte->mpte_time_thread = thread_call_allocate(mptcp_urgency_timer, mpte); +} + +void +mptcp_set_urgency_timer(struct mptses *mpte) +{ + struct socket *mp_so = mptetoso(mpte); + uint64_t time_now = 0; + boolean_t ret = FALSE; + + socket_lock_assert_owned(mp_so); + + VERIFY(mp_so->so_usecount >= 0); + if (mp_so->so_usecount == 0) { + goto exit_log; + } + + if (mpte->mpte_time_target == 0) { + mptcp_cancel_urgency_timer(mpte); + + goto exit_log; + } + + time_now = mach_continuous_time(); + + if ((int64_t)(mpte->mpte_time_target - time_now) > 0) { + mptcp_check_subflows_and_remove(mpte); + + ret = thread_call_enter_delayed_with_leeway(mpte->mpte_time_thread, NULL, + mpte->mpte_time_target, 0, THREAD_CALL_CONTINUOUS); + + if (!ret) { + mp_so->so_usecount++; + } + } else if ((int64_t)(mpte->mpte_time_target - time_now) <= 0) { + mp_so->so_usecount++; + + /* Already passed the deadline, trigger subflows now */ + mptcp_urgency_timer_locked(mpte); + } + +exit_log: + os_log(mptcp_log_handle, "%s - %lx: timer at %llu now %llu usecount %u ret %u\n", + __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mpte->mpte_time_target, time_now, + mp_so->so_usecount, ret); +} + +static int +mptcp_cancel_urgency_timer(struct mptses *mpte) +{ + struct socket *mp_so = mptetoso(mpte); + boolean_t ret; + + ret = thread_call_cancel(mpte->mpte_time_thread); + + os_log(mptcp_log_handle, "%s - %lx: Canceled timer thread usecount %u ret %u\n", + __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mp_so->so_usecount, ret); + + mptcp_check_subflows_and_remove(mpte); + + if (ret) { + mp_so->so_usecount--; + } + + return 0; +} diff --git a/bsd/netinet/mptcp_timer.h b/bsd/netinet/mptcp_timer.h index cab9306d6..231cf201a 100644 --- a/bsd/netinet/mptcp_timer.h +++ b/bsd/netinet/mptcp_timer.h @@ -35,10 +35,12 @@ #define MPT_TIMEWAIT 1 /* timewait timer */ __BEGIN_DECLS -extern uint32_t mptcp_timer(struct mppcbinfo *); -extern void mptcp_start_timer(struct mptses *, int); -extern void mptcp_cancel_timer(struct mptcb *, int); -extern void mptcp_cancel_all_timers(struct mptcb *); +extern uint32_t mptcp_timer(struct mppcbinfo *mppi); +extern void mptcp_start_timer(struct mptses *mpte, int timer_type); +extern void mptcp_cancel_timer(struct mptcb *mp_tp, int timer_type); +extern void mptcp_cancel_all_timers(struct mptcb *mp_tp); +extern void mptcp_init_urgency_timer(struct mptses *mpte); +extern void mptcp_set_urgency_timer(struct mptses *mpte); __END_DECLS #endif /* BSD_KERNEL_PRIVATE */ diff --git a/bsd/netinet/mptcp_usrreq.c b/bsd/netinet/mptcp_usrreq.c index a73d3339f..a47b8a512 100644 --- a/bsd/netinet/mptcp_usrreq.c +++ b/bsd/netinet/mptcp_usrreq.c @@ -78,7 +78,6 @@ static int mptcp_usr_shutdown(struct socket *); static int mptcp_usr_sosend(struct socket *, struct sockaddr *, struct uio *, struct mbuf *, struct mbuf *, int); static int mptcp_usr_socheckopt(struct socket *, struct sockopt *); -static int mptcp_default_tcp_optval(struct mptses *, struct sockopt *, int *); static int mptcp_usr_preconnect(struct socket *so); struct pr_usrreqs mptcp_usrreqs = { @@ -110,6 +109,10 @@ int mptcp_developer_mode = 0; SYSCTL_INT(_net_inet_mptcp, OID_AUTO, allow_aggregate, CTLFLAG_RW | CTLFLAG_LOCKED, &mptcp_developer_mode, 0, "Allow the Multipath aggregation mode"); +static unsigned long mptcp_expected_progress_headstart = 5000; +SYSCTL_ULONG(_net_inet_mptcp, OID_AUTO, expected_progress_headstart, CTLFLAG_RW | CTLFLAG_LOCKED, + &mptcp_expected_progress_headstart, "Headstart to give MPTCP before meeting the progress deadline"); + /* * Attaches an MPTCP control block to a socket. @@ -148,9 +151,9 @@ mptcp_usr_detach(struct socket *mp_so) struct mppcb *mpp = mpsotomppcb(mp_so); if (mpp == NULL || mpp->mpp_state == MPPCB_STATE_DEAD) { - mptcplog((LOG_ERR, "%s state: %d\n", __func__, - mpp ? mpp->mpp_state : -1), - MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR); + os_log_error(mptcp_log_handle, "%s - %lx: state: %d\n", + __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), + mpp ? mpp->mpp_state : -1); return EINVAL; } @@ -199,22 +202,20 @@ mptcp_attach(struct socket *mp_so, struct proc *p) } /* - * MPTCP socket buffers cannot be compressed, due to the + * MPTCP send-socket buffers cannot be compressed, due to the * fact that each mbuf chained via m_next is a M_PKTHDR * which carries some MPTCP metadata. */ mp_so->so_snd.sb_flags |= SB_NOCOMPRESS; - mp_so->so_rcv.sb_flags |= SB_NOCOMPRESS; if ((error = mp_pcballoc(mp_so, &mtcbinfo)) != 0) { goto out; } mpp = mpsotomppcb(mp_so); - VERIFY(mpp != NULL); mpte = (struct mptses *)mpp->mpp_pcbe; - VERIFY(mpte != NULL); mp_tp = mpte->mpte_mptcb; + VERIFY(mp_tp != NULL); out: return error; @@ -225,39 +226,57 @@ mptcp_entitlement_check(struct socket *mp_so) { struct mptses *mpte = mpsotompte(mp_so); - if (soopt_cred_check(mp_so, PRIV_NET_RESTRICTED_MULTIPATH_EXTENDED, TRUE) == 0) { + /* First, check for mptcp_extended without delegation */ + if (soopt_cred_check(mp_so, PRIV_NET_RESTRICTED_MULTIPATH_EXTENDED, TRUE, FALSE) == 0) { + /* + * This means the app has the extended entitlement. Thus, + * it's a first party app and can run without restrictions. + */ + mpte->mpte_flags |= MPTE_FIRSTPARTY; + return 0; + } + + /* Now with delegation */ + if (mp_so->so_flags & SOF_DELEGATED && + soopt_cred_check(mp_so, PRIV_NET_RESTRICTED_MULTIPATH_EXTENDED, TRUE, TRUE) == 0) { /* * This means the app has the extended entitlement. Thus, * it's a first party app and can run without restrictions. */ mpte->mpte_flags |= MPTE_FIRSTPARTY; - goto grant; + return 0; } + /* Now, take a look at exceptions configured through sysctl */ #if (DEVELOPMENT || DEBUG) if (mptcp_disable_entitlements) { - goto grant; + return 0; } #endif - if (soopt_cred_check(mp_so, PRIV_NET_PRIVILEGED_MULTIPATH, TRUE)) { - mptcplog((LOG_NOTICE, "%s Multipath Capability needed\n", __func__), - MPTCP_SOCKET_DBG, MPTCP_LOGLVL_LOG); - return -1; + if (mpte->mpte_svctype == MPTCP_SVCTYPE_AGGREGATE) { + if (mptcp_developer_mode) { + return 0; + } + + goto deny; } - if (mpte->mpte_svctype > MPTCP_SVCTYPE_INTERACTIVE && - mptcp_developer_mode == 0) { - mptcplog((LOG_NOTICE, "%s need to set allow_aggregate sysctl\n", - __func__), MPTCP_SOCKET_DBG, MPTCP_LOGLVL_LOG); - return -1; + /* Second, check for regular users that are within the data-limits */ + if (soopt_cred_check(mp_so, PRIV_NET_PRIVILEGED_MULTIPATH, TRUE, FALSE) == 0) { + return 0; } -grant: - mptcplog((LOG_NOTICE, "%s entitlement granted for %u\n", __func__, mpte->mpte_svctype), - MPTCP_SOCKET_DBG, MPTCP_LOGLVL_LOG); + if (mp_so->so_flags & SOF_DELEGATED && + soopt_cred_check(mp_so, PRIV_NET_PRIVILEGED_MULTIPATH, TRUE, TRUE) == 0) { + return 0; + } - return 0; +deny: + os_log_error(mptcp_log_handle, "%s - %lx: MPTCP prohibited on svc %u\n", + __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mpte->mpte_svctype); + + return -1; } /* @@ -270,17 +289,11 @@ static int mptcp_connectx(struct mptses *mpte, struct sockaddr *src, struct sockaddr *dst, uint32_t ifscope, sae_connid_t *pcid) { - struct socket *mp_so = mptetoso(mpte); int error = 0; VERIFY(dst != NULL); VERIFY(pcid != NULL); - mptcplog((LOG_DEBUG, "%s: mp_so 0x%llx\n", __func__, - (u_int64_t)VM_KERNEL_ADDRPERM(mp_so)), - MPTCP_SOCKET_DBG, MPTCP_LOGLVL_LOG); - DTRACE_MPTCP2(connectx, struct mptses *, mpte, struct socket *, mp_so); - error = mptcp_subflow_add(mpte, src, dst, ifscope, pcid); return error; @@ -303,22 +316,18 @@ mptcp_usr_connectx(struct socket *mp_so, struct sockaddr *src, int error = 0; if (mpp == NULL || mpp->mpp_state == MPPCB_STATE_DEAD) { - mptcplog((LOG_ERR, "%s state %d\n", __func__, - mpp ? mpp->mpp_state : -1), - MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR); + os_log_error(mptcp_log_handle, "%s - %lx: state %d\n", + __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), + mpp ? mpp->mpp_state : -1); error = EINVAL; goto out; } mpte = mptompte(mpp); - VERIFY(mpte != NULL); - mpte_lock_assert_held(mpte); - mp_tp = mpte->mpte_mptcb; - VERIFY(mp_tp != NULL); if (mp_tp->mpt_flags & MPTCPF_FALLBACK_TO_TCP) { - mptcplog((LOG_ERR, "%s fell back to TCP\n", __func__), - MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR); + os_log_error(mptcp_log_handle, "%s - %lx: fell back to TCP\n", + __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte)); error = EINVAL; goto out; } @@ -330,18 +339,16 @@ mptcp_usr_connectx(struct socket *mp_so, struct sockaddr *src, if (dst->sa_family == AF_INET && dst->sa_len != sizeof(mpte->__mpte_dst_v4)) { - mptcplog((LOG_ERR, "%s IPv4 dst len %u\n", __func__, - dst->sa_len), - MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR); + os_log_error(mptcp_log_handle, "%s - %lx: IPv4 dst len %u\n", + __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), dst->sa_len); error = EINVAL; goto out; } if (dst->sa_family == AF_INET6 && dst->sa_len != sizeof(mpte->__mpte_dst_v6)) { - mptcplog((LOG_ERR, "%s IPv6 dst len %u\n", __func__, - dst->sa_len), - MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR); + os_log_error(mptcp_log_handle, "%s - %lx: IPv6 dst len %u\n", + __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), dst->sa_len); error = EINVAL; goto out; } @@ -356,7 +363,7 @@ mptcp_usr_connectx(struct socket *mp_so, struct sockaddr *src, } if ((mp_so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING)) == 0) { - memcpy(&mpte->mpte_dst, dst, dst->sa_len); + memcpy(&mpte->mpte_u_dst, dst, dst->sa_len); } if (src) { @@ -367,24 +374,22 @@ mptcp_usr_connectx(struct socket *mp_so, struct sockaddr *src, if (src->sa_family == AF_INET && src->sa_len != sizeof(mpte->__mpte_src_v4)) { - mptcplog((LOG_ERR, "%s IPv4 src len %u\n", __func__, - src->sa_len), - MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR); + os_log_error(mptcp_log_handle, "%s - %lx: IPv4 src len %u\n", + __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), src->sa_len); error = EINVAL; goto out; } if (src->sa_family == AF_INET6 && src->sa_len != sizeof(mpte->__mpte_src_v6)) { - mptcplog((LOG_ERR, "%s IPv6 src len %u\n", __func__, - src->sa_len), - MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR); + os_log_error(mptcp_log_handle, "%s - %lx: IPv6 src len %u\n", + __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), src->sa_len); error = EINVAL; goto out; } if ((mp_so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING)) == 0) { - memcpy(&mpte->mpte_src, src, src->sa_len); + memcpy(&mpte->mpte_u_src, src, src->sa_len); } } @@ -418,8 +423,6 @@ out: static int mptcp_getassocids(struct mptses *mpte, uint32_t *cnt, user_addr_t aidp) { - mpte_lock_assert_held(mpte); /* same as MP socket lock */ - /* MPTCP has at most 1 association */ *cnt = (mpte->mpte_associd != SAE_ASSOCID_ANY) ? 1 : 0; @@ -442,8 +445,6 @@ mptcp_getconnids(struct mptses *mpte, sae_associd_t aid, uint32_t *cnt, struct mptsub *mpts; int error = 0; - mpte_lock_assert_held(mpte); /* same as MP socket lock */ - if (aid != SAE_ASSOCID_ANY && aid != SAE_ASSOCID_ALL && aid != mpte->mpte_associd) { return EINVAL; @@ -477,20 +478,17 @@ mptcp_getconninfo(struct mptses *mpte, sae_connid_t *cid, uint32_t *flags, user_addr_t dst, socklen_t *dst_len, uint32_t *aux_type, user_addr_t aux_data, uint32_t *aux_len) { - struct socket *so; - struct inpcb *inp; - struct mptsub *mpts; - int error = 0; - *flags = 0; *aux_type = 0; *ifindex = 0; *soerror = 0; + /* MPTCP-level global stats */ if (*cid == SAE_CONNID_ALL) { struct socket *mp_so = mptetoso(mpte); struct mptcb *mp_tp = mpte->mpte_mptcb; struct conninfo_multipathtcp mptcp_ci; + int error = 0; if (*aux_len != 0 && *aux_len != sizeof(mptcp_ci)) { return EINVAL; @@ -522,8 +520,9 @@ mptcp_getconninfo(struct mptses *mpte, sae_connid_t *cid, uint32_t *flags, *aux_len = sizeof(mptcp_ci); if (aux_data != USER_ADDR_NULL) { - unsigned long i = 0; + const struct mptsub *mpts; int initial_info_set = 0; + unsigned long i = 0; bzero(&mptcp_ci, sizeof(mptcp_ci)); mptcp_ci.mptcpci_subflow_count = mpte->mpte_numflows; @@ -539,6 +538,8 @@ mptcp_getconninfo(struct mptses *mpte, sae_connid_t *cid, uint32_t *flags, mptcp_ci.mptcpci_subflow_connids[i] = mpts->mpts_connid; if (mpts->mpts_flags & MPTSF_INITIAL_SUB) { + const struct inpcb *inp; + inp = sotoinpcb(mpts->mpts_socket); mptcp_ci.mptcpci_init_rxbytes = inp->inp_stat->rxbytes; @@ -562,9 +563,8 @@ mptcp_getconninfo(struct mptses *mpte, sae_connid_t *cid, uint32_t *flags, error = copyout(&mptcp_ci, aux_data, sizeof(mptcp_ci)); if (error != 0) { - mptcplog((LOG_ERR, "%s copyout failed: %d\n", - __func__, error), - MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR); + os_log_error(mptcp_log_handle, "%s - %lx: copyout failed: %d\n", + __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), error); return error; } } @@ -572,51 +572,221 @@ mptcp_getconninfo(struct mptses *mpte, sae_connid_t *cid, uint32_t *flags, return 0; } - TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) { - if (mpts->mpts_connid == *cid || *cid == SAE_CONNID_ANY) { - break; + /* Any stats of any subflow */ + if (*cid == SAE_CONNID_ANY) { + const struct mptsub *mpts; + struct socket *so; + const struct inpcb *inp; + int error = 0; + + mpts = TAILQ_FIRST(&mpte->mpte_subflows); + if (mpts == NULL) { + return ENXIO; } - } - if (mpts == NULL) { - return (*cid == SAE_CONNID_ANY) ? ENXIO : EINVAL; - } - so = mpts->mpts_socket; - inp = sotoinpcb(so); + so = mpts->mpts_socket; + inp = sotoinpcb(so); + + if (inp->inp_vflag & INP_IPV4) { + error = in_getconninfo(so, SAE_CONNID_ANY, flags, ifindex, + soerror, src, src_len, dst, dst_len, + aux_type, aux_data, aux_len); + } else { + error = in6_getconninfo(so, SAE_CONNID_ANY, flags, ifindex, + soerror, src, src_len, dst, dst_len, + aux_type, aux_data, aux_len); + } + + if (error != 0) { + os_log_error(mptcp_log_handle, "%s - %lx:error from in_getconninfo %d\n", + __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), error); + return error; + } + + if (mpts->mpts_flags & MPTSF_MP_CAPABLE) { + *flags |= CIF_MP_CAPABLE; + } + if (mpts->mpts_flags & MPTSF_MP_DEGRADED) { + *flags |= CIF_MP_DEGRADED; + } + if (mpts->mpts_flags & MPTSF_MP_READY) { + *flags |= CIF_MP_READY; + } + if (mpts->mpts_flags & MPTSF_ACTIVE) { + *flags |= CIF_MP_ACTIVE; + } - if (inp->inp_vflag & INP_IPV4) { - error = in_getconninfo(so, SAE_CONNID_ANY, flags, ifindex, - soerror, src, src_len, dst, dst_len, - aux_type, aux_data, aux_len); + return 0; } else { - error = in6_getconninfo(so, SAE_CONNID_ANY, flags, ifindex, - soerror, src, src_len, dst, dst_len, - aux_type, aux_data, aux_len); - } + /* Per-interface stats */ + const struct mptsub *mpts, *orig_mpts; + struct conninfo_tcp tcp_ci; + const struct inpcb *inp; + struct socket *so; + int error = 0; + int index; - if (error != 0) { - mptcplog((LOG_ERR, "%s error from in_getconninfo %d\n", - __func__, error), - MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR); - return error; - } + bzero(&tcp_ci, sizeof(tcp_ci)); - if (mpts->mpts_flags & MPTSF_MP_CAPABLE) { - *flags |= CIF_MP_CAPABLE; - } - if (mpts->mpts_flags & MPTSF_MP_DEGRADED) { - *flags |= CIF_MP_DEGRADED; - } - if (mpts->mpts_flags & MPTSF_MP_READY) { - *flags |= CIF_MP_READY; - } - if (mpts->mpts_flags & MPTSF_ACTIVE) { - *flags |= CIF_MP_ACTIVE; - } + /* First, get a subflow to fill in the "regular" info. */ + TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) { + const struct ifnet *ifp = sotoinpcb(mpts->mpts_socket)->inp_last_outifp; - mptcplog((LOG_DEBUG, "%s: cid %d flags %x \n", __func__, - mpts->mpts_connid, mpts->mpts_flags), - MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE); + if (ifp && ifp->if_index == *cid) { + break; + } + } + + if (mpts == NULL) { + /* No subflow there - well, let's just get the basic itf-info */ + goto interface_info; + } + + so = mpts->mpts_socket; + inp = sotoinpcb(so); + + /* Give it USER_ADDR_NULL, because we are doing this on our own */ + if (inp->inp_vflag & INP_IPV4) { + error = in_getconninfo(so, SAE_CONNID_ANY, flags, ifindex, + soerror, src, src_len, dst, dst_len, + aux_type, USER_ADDR_NULL, aux_len); + } else { + error = in6_getconninfo(so, SAE_CONNID_ANY, flags, ifindex, + soerror, src, src_len, dst, dst_len, + aux_type, USER_ADDR_NULL, aux_len); + } + + if (error != 0) { + os_log_error(mptcp_log_handle, "%s - %lx:error from in_getconninfo %d\n", + __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), error); + return error; + } + + /* ToDo: Nobody is reading these flags on subflows. Why bother ? */ + if (mpts->mpts_flags & MPTSF_MP_CAPABLE) { + *flags |= CIF_MP_CAPABLE; + } + if (mpts->mpts_flags & MPTSF_MP_DEGRADED) { + *flags |= CIF_MP_DEGRADED; + } + if (mpts->mpts_flags & MPTSF_MP_READY) { + *flags |= CIF_MP_READY; + } + if (mpts->mpts_flags & MPTSF_ACTIVE) { + *flags |= CIF_MP_ACTIVE; + } + + /* + * Now, we gather the metrics (aka., tcp_info) and roll them in + * across all subflows of this interface to build an aggregated + * view. + * + * We take the TCP_INFO from the first subflow as the "master", + * feeding into those fields that we do not roll. + */ + if (aux_data != USER_ADDR_NULL) { + tcp_getconninfo(so, &tcp_ci); + + orig_mpts = mpts; + TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) { + const struct inpcb *mptsinp = sotoinpcb(mpts->mpts_socket); + const struct ifnet *ifp; + + ifp = mptsinp->inp_last_outifp; + + if (ifp == NULL || ifp->if_index != *cid || mpts == orig_mpts) { + continue; + } + + /* Roll the itf-stats into the tcp_info */ + tcp_ci.tcpci_tcp_info.tcpi_txbytes += + mptsinp->inp_stat->txbytes; + tcp_ci.tcpci_tcp_info.tcpi_rxbytes += + mptsinp->inp_stat->rxbytes; + + tcp_ci.tcpci_tcp_info.tcpi_wifi_txbytes += + mptsinp->inp_wstat->txbytes; + tcp_ci.tcpci_tcp_info.tcpi_wifi_rxbytes += + mptsinp->inp_wstat->rxbytes; + + tcp_ci.tcpci_tcp_info.tcpi_wired_txbytes += + mptsinp->inp_Wstat->txbytes; + tcp_ci.tcpci_tcp_info.tcpi_wired_rxbytes += + mptsinp->inp_Wstat->rxbytes; + + tcp_ci.tcpci_tcp_info.tcpi_cell_txbytes += + mptsinp->inp_cstat->txbytes; + tcp_ci.tcpci_tcp_info.tcpi_cell_rxbytes += + mptsinp->inp_cstat->rxbytes; + } + } + +interface_info: + *aux_type = CIAUX_TCP; + if (*aux_len == 0) { + *aux_len = sizeof(tcp_ci); + } else if (aux_data != USER_ADDR_NULL) { + boolean_t create; + + /* + * Finally, old subflows might have been closed - we + * want this data as well, so grab it from the interface + * stats. + */ + create = orig_mpts != NULL; + + /* + * When we found a subflow, we are willing to create a stats-index + * because we have some data to return. If there isn't a subflow, + * nor anything in the stats, return EINVAL. Because the + * ifindex belongs to something that doesn't exist. + */ + index = mptcpstats_get_index_by_ifindex(mpte->mpte_itfstats, *cid, false); + if (index == -1) { + os_log_error(mptcp_log_handle, + "%s - %lx: Asking for too many ifindex: %u subcount %u, mpts? %s\n", + __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), + *cid, mpte->mpte_numflows, + orig_mpts ? "yes" : "no"); + + if (orig_mpts == NULL) { + return EINVAL; + } + } else { + struct mptcp_itf_stats *stats; + + stats = &mpte->mpte_itfstats[index]; + + /* Roll the itf-stats into the tcp_info */ + tcp_ci.tcpci_tcp_info.tcpi_last_outif = *cid; + tcp_ci.tcpci_tcp_info.tcpi_txbytes += + stats->mpis_txbytes; + tcp_ci.tcpci_tcp_info.tcpi_rxbytes += + stats->mpis_rxbytes; + + tcp_ci.tcpci_tcp_info.tcpi_wifi_txbytes += + stats->mpis_wifi_txbytes; + tcp_ci.tcpci_tcp_info.tcpi_wifi_rxbytes += + stats->mpis_wifi_rxbytes; + + tcp_ci.tcpci_tcp_info.tcpi_wired_txbytes += + stats->mpis_wired_txbytes; + tcp_ci.tcpci_tcp_info.tcpi_wired_rxbytes += + stats->mpis_wired_rxbytes; + + tcp_ci.tcpci_tcp_info.tcpi_cell_txbytes += + stats->mpis_cell_txbytes; + tcp_ci.tcpci_tcp_info.tcpi_cell_rxbytes += + stats->mpis_cell_rxbytes; + } + + *aux_len = min(*aux_len, sizeof(tcp_ci)); + error = copyout(&tcp_ci, aux_data, *aux_len); + if (error != 0) { + return error; + } + } + } return 0; } @@ -638,9 +808,6 @@ mptcp_usr_control(struct socket *mp_so, u_long cmd, caddr_t data, goto out; } mpte = mptompte(mpp); - VERIFY(mpte != NULL); - - mpte_lock_assert_held(mpte); /* same as MP socket lock */ switch (cmd) { case SIOCGASSOCIDS32: { /* struct so_aidreq32 */ @@ -730,15 +897,9 @@ mptcp_disconnect(struct mptses *mpte) struct mptcb *mp_tp; int error = 0; - mpte_lock_assert_held(mpte); /* same as MP socket lock */ - mp_so = mptetoso(mpte); mp_tp = mpte->mpte_mptcb; - mptcplog((LOG_DEBUG, "%s: mp_so 0x%llx %d\n", __func__, - (u_int64_t)VM_KERNEL_ADDRPERM(mp_so), mp_so->so_error), - MPTCP_SOCKET_DBG, MPTCP_LOGLVL_LOG); - DTRACE_MPTCP3(disconnectx, struct mptses *, mpte, struct socket *, mp_so, struct mptcb *, mp_tp); @@ -856,14 +1017,23 @@ mptcp_usr_rcvd(struct socket *mp_so, int flags) #pragma unused(flags) struct mppcb *mpp = mpsotomppcb(mp_so); struct mptses *mpte; + struct mptsub *mpts; int error = 0; if (mpp == NULL || mpp->mpp_state == MPPCB_STATE_DEAD) { error = EINVAL; goto out; } + mpte = mptompte(mpp); - VERIFY(mpte != NULL); + + TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) { + struct socket *so = mpts->mpts_socket; + + if (so->so_proto->pr_flags & PR_WANTRCVD && so->so_pcb != NULL) { + (*so->so_proto->pr_usrreqs->pru_rcvd)(so, 0); + } + } error = mptcp_output(mpte); out: @@ -1096,7 +1266,7 @@ mptcp_usr_sosend(struct socket *mp_so, struct sockaddr *addr, struct uio *uio, VERIFY(mp_so->so_type == SOCK_STREAM); VERIFY(!(mp_so->so_flags & SOF_MP_SUBFLOW)); - if ((flags & (MSG_OOB | MSG_DONTROUTE | MSG_HOLD | MSG_SEND | MSG_FLUSH)) || + if ((flags & (MSG_OOB | MSG_DONTROUTE)) || (mp_so->so_flags & SOF_ENABLE_MSGS)) { error = EOPNOTSUPP; socket_unlock(mp_so, 1); @@ -1253,6 +1423,7 @@ mptcp_usr_socheckopt(struct socket *mp_so, struct sockopt *sopt) case SO_NECP_ATTRIBUTES: case SO_NECP_CLIENTUUID: #endif /* NECP */ + case SO_MPKL_SEND_INFO: /* * Tell the caller that these options are to be processed. */ @@ -1321,7 +1492,6 @@ mptcp_setopt_apply(struct mptses *mpte, struct mptopt *mpo) goto out; } - mpte_lock_assert_held(mpte); /* same as MP socket lock */ mp_so = mptetoso(mpte); /* @@ -1441,18 +1611,34 @@ mptcp_setopt(struct mptses *mpte, struct sockopt *sopt) rec = 0; break; - /* Next ones, record at MPTCP-level */ + /* Next ones, record at MPTCP-level */ + case SO_DELEGATED: + error = sooptcopyin(sopt, &mpte->mpte_epid, + sizeof(int), sizeof(int)); + if (error != 0) { + goto err_out; + } + + goto out; + case SO_DELEGATED_UUID: + error = sooptcopyin(sopt, &mpte->mpte_euuid, + sizeof(uuid_t), sizeof(uuid_t)); + if (error != 0) { + goto err_out; + } + + goto out; #if NECP case SO_NECP_CLIENTUUID: if (!uuid_is_null(mpsotomppcb(mp_so)->necp_client_uuid)) { error = EINVAL; - goto out; + goto err_out; } error = sooptcopyin(sopt, &mpsotomppcb(mp_so)->necp_client_uuid, sizeof(uuid_t), sizeof(uuid_t)); if (error != 0) { - goto out; + goto err_out; } mpsotomppcb(mp_so)->necp_cb = mptcp_session_necp_cb; @@ -1460,12 +1646,12 @@ mptcp_setopt(struct mptses *mpte, struct sockopt *sopt) mpsotomppcb(mp_so)->necp_client_uuid, mpsotomppcb(mp_so)); if (error) { - goto out; + goto err_out; } if (uuid_is_null(mpsotomppcb(mp_so)->necp_client_uuid)) { error = EINVAL; - goto out; + goto err_out; } goto out; @@ -1494,11 +1680,11 @@ mptcp_setopt(struct mptses *mpte, struct sockopt *sopt) error = sooptcopyin(sopt, &optval, sizeof(optval), sizeof(optval)); if (error) { - goto out; + goto err_out; } if (optval < 0) { error = EINVAL; - goto out; + goto err_out; } else { if (optval == 0) { mp_so->so_flags &= ~SOF_NOTSENT_LOWAT; @@ -1508,6 +1694,10 @@ mptcp_setopt(struct mptses *mpte, struct sockopt *sopt) error = mptcp_set_notsent_lowat(mpte, optval); } + + if (error) { + goto err_out; + } } goto out; case MPTCP_SERVICE_TYPE: @@ -1515,18 +1705,18 @@ mptcp_setopt(struct mptses *mpte, struct sockopt *sopt) error = sooptcopyin(sopt, &optval, sizeof(optval), sizeof(optval)); if (error) { - goto out; + goto err_out; } if (optval < 0 || optval >= MPTCP_SVCTYPE_MAX) { error = EINVAL; - goto out; + goto err_out; } mpte->mpte_svctype = optval; if (mptcp_entitlement_check(mp_so) < 0) { error = EACCES; - goto out; + goto err_out; } mpte->mpte_flags |= MPTE_SVCTYPE_CHECKED; @@ -1537,27 +1727,104 @@ mptcp_setopt(struct mptses *mpte, struct sockopt *sopt) error = sooptcopyin(sopt, &optval, sizeof(optval), sizeof(optval)); if (error) { - goto out; + goto err_out; } if (optval < 0 || optval > UINT16_MAX) { error = EINVAL; - goto out; + goto err_out; } mpte->mpte_alternate_port = optval; goto out; + case MPTCP_FORCE_ENABLE: + /* record at MPTCP level */ + error = sooptcopyin(sopt, &optval, sizeof(optval), + sizeof(optval)); + if (error) { + goto err_out; + } + + if (optval < 0 || optval > 1) { + error = EINVAL; + goto err_out; + } + + if (optval) { + mpte->mpte_flags |= MPTE_FORCE_ENABLE; + } else { + mpte->mpte_flags &= ~MPTE_FORCE_ENABLE; + } + + goto out; + case MPTCP_EXPECTED_PROGRESS_TARGET: + { + struct mptcb *mp_tp = mpte->mpte_mptcb; + uint64_t mach_time_target; + uint64_t nanoseconds; + + if (mpte->mpte_svctype != MPTCP_SVCTYPE_TARGET_BASED) { + os_log(mptcp_log_handle, "%s - %lx: Can't set urgent activity when svctype is %u\n", + __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mpte->mpte_svctype); + error = EINVAL; + goto err_out; + } + + error = sooptcopyin(sopt, &mach_time_target, sizeof(mach_time_target), sizeof(mach_time_target)); + if (error) { + goto err_out; + } + + if (!mptcp_ok_to_create_subflows(mp_tp)) { + os_log(mptcp_log_handle, "%s - %lx: Not ok to create subflows, state %u flags %#x\n", + __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mp_tp->mpt_state, mp_tp->mpt_flags); + error = EINVAL; + goto err_out; + } + + if (mach_time_target) { + uint64_t time_now = 0; + uint64_t time_now_nanoseconds; + + absolutetime_to_nanoseconds(mach_time_target, &nanoseconds); + nanoseconds = nanoseconds - (mptcp_expected_progress_headstart * NSEC_PER_MSEC); + + time_now = mach_continuous_time(); + absolutetime_to_nanoseconds(time_now, &time_now_nanoseconds); + + nanoseconds_to_absolutetime(nanoseconds, &mach_time_target); + /* If the timer is already running and it would + * fire in less than mptcp_expected_progress_headstart + * seconds, then it's not worth canceling it. + */ + if (mpte->mpte_time_target && + mpte->mpte_time_target < time_now && + time_now_nanoseconds > nanoseconds - (mptcp_expected_progress_headstart * NSEC_PER_MSEC)) { + os_log(mptcp_log_handle, "%s - %lx: Not rescheduling timer %llu now %llu target %llu\n", + __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), + mpte->mpte_time_target, + time_now, + mach_time_target); + goto out; + } + } + + mpte->mpte_time_target = mach_time_target; + mptcp_set_urgency_timer(mpte); + + goto out; + } default: /* not eligible */ error = ENOPROTOOPT; - goto out; + goto err_out; } } if ((error = sooptcopyin(sopt, &optval, sizeof(optval), sizeof(optval))) != 0) { - goto out; + goto err_out; } if (rec) { @@ -1568,14 +1835,8 @@ mptcp_setopt(struct mptses *mpte, struct sockopt *sopt) if (mpo == NULL) { error = ENOBUFS; + goto err_out; } else { - mptcplog((LOG_DEBUG, "%s: mp_so 0x%llx sopt %s val %d %s\n", - __func__, (u_int64_t)VM_KERNEL_ADDRPERM(mp_so), - mptcp_sopt2str(level, optname), optval, - (mpo->mpo_flags & MPOF_ATTACHED) ? - "updated" : "recorded"), - MPTCP_SOCKET_DBG, MPTCP_LOGLVL_LOG); - /* initialize or update, as needed */ mpo->mpo_intval = optval; if (!(mpo->mpo_flags & MPOF_ATTACHED)) { @@ -1596,34 +1857,154 @@ mptcp_setopt(struct mptses *mpte, struct sockopt *sopt) } /* issue this socket option on existing subflows */ - if (error == 0) { - error = mptcp_setopt_apply(mpte, mpo); - if (error != 0 && (mpo->mpo_flags & MPOF_ATTACHED)) { - VERIFY(mpo != &smpo); - mptcp_sopt_remove(mpte, mpo); - mptcp_sopt_free(mpo); - } - if (mpo == &smpo) { - mpo->mpo_flags &= ~MPOF_INTERIM; - } + error = mptcp_setopt_apply(mpte, mpo); + if (error != 0 && (mpo->mpo_flags & MPOF_ATTACHED)) { + VERIFY(mpo != &smpo); + mptcp_sopt_remove(mpte, mpo); + mptcp_sopt_free(mpo); } -out: - if (error == 0 && mpo != NULL) { - mptcplog((LOG_INFO, "%s: mp_so 0x%llx sopt %s val %d set %s\n", - __func__, (u_int64_t)VM_KERNEL_ADDRPERM(mp_so), - mptcp_sopt2str(level, optname), optval, - (mpo->mpo_flags & MPOF_INTERIM) ? - "pending" : "successful"), - MPTCP_SOCKET_DBG, MPTCP_LOGLVL_LOG); - } else if (error != 0) { - mptcplog((LOG_ERR, "%s: mp_so 0x%llx sopt %s (%d, %d) val %d can't be issued error %d\n", - __func__, (u_int64_t)VM_KERNEL_ADDRPERM(mp_so), - mptcp_sopt2str(level, optname), level, optname, optval, error), - MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR); + if (mpo == &smpo) { + mpo->mpo_flags &= ~MPOF_INTERIM; } + + if (error) { + goto err_out; + } + +out: + + return 0; + +err_out: + os_log_error(mptcp_log_handle, "%s - %lx: sopt %s (%d, %d) val %d can't be issued error %d\n", + __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), + mptcp_sopt2str(level, optname), level, optname, optval, error); return error; } +static void +mptcp_fill_info_bytestats(struct tcp_info *ti, struct mptses *mpte) +{ + struct mptsub *mpts; + int i; + + TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) { + const struct inpcb *inp = sotoinpcb(mpts->mpts_socket); + + if (inp == NULL) { + continue; + } + + ti->tcpi_txbytes += inp->inp_stat->txbytes; + ti->tcpi_rxbytes += inp->inp_stat->rxbytes; + ti->tcpi_cell_txbytes += inp->inp_cstat->txbytes; + ti->tcpi_cell_rxbytes += inp->inp_cstat->rxbytes; + ti->tcpi_wifi_txbytes += inp->inp_wstat->txbytes; + ti->tcpi_wifi_rxbytes += inp->inp_wstat->rxbytes; + ti->tcpi_wired_txbytes += inp->inp_Wstat->txbytes; + ti->tcpi_wired_rxbytes += inp->inp_Wstat->rxbytes; + } + + for (i = 0; i < MPTCP_ITFSTATS_SIZE; i++) { + struct mptcp_itf_stats *stats = &mpte->mpte_itfstats[i]; + + ti->tcpi_txbytes += stats->mpis_txbytes; + ti->tcpi_rxbytes += stats->mpis_rxbytes; + + ti->tcpi_wifi_txbytes += stats->mpis_wifi_txbytes; + ti->tcpi_wifi_rxbytes += stats->mpis_wifi_rxbytes; + + ti->tcpi_wired_txbytes += stats->mpis_wired_txbytes; + ti->tcpi_wired_rxbytes += stats->mpis_wired_rxbytes; + + ti->tcpi_cell_txbytes += stats->mpis_cell_txbytes; + ti->tcpi_cell_rxbytes += stats->mpis_cell_rxbytes; + } +} + +static void +mptcp_fill_info(struct mptses *mpte, struct tcp_info *ti) +{ + struct mptsub *actsub = mpte->mpte_active_sub; + struct mptcb *mp_tp = mpte->mpte_mptcb; + struct tcpcb *acttp = NULL; + + if (actsub) { + acttp = sototcpcb(actsub->mpts_socket); + } + + bzero(ti, sizeof(*ti)); + + ti->tcpi_state = mp_tp->mpt_state; + /* tcpi_options */ + /* tcpi_snd_wscale */ + /* tcpi_rcv_wscale */ + /* tcpi_flags */ + if (acttp) { + ti->tcpi_rto = acttp->t_timer[TCPT_REXMT] ? acttp->t_rxtcur : 0; + } + + /* tcpi_snd_mss */ + /* tcpi_rcv_mss */ + if (acttp) { + ti->tcpi_rttcur = acttp->t_rttcur; + ti->tcpi_srtt = acttp->t_srtt >> TCP_RTT_SHIFT; + ti->tcpi_rttvar = acttp->t_rttvar >> TCP_RTTVAR_SHIFT; + ti->tcpi_rttbest = acttp->t_rttbest >> TCP_RTT_SHIFT; + } + /* tcpi_snd_ssthresh */ + /* tcpi_snd_cwnd */ + /* tcpi_rcv_space */ + ti->tcpi_snd_wnd = mp_tp->mpt_sndwnd; + ti->tcpi_snd_nxt = mp_tp->mpt_sndnxt; + ti->tcpi_rcv_nxt = mp_tp->mpt_rcvnxt; + if (acttp) { + ti->tcpi_last_outif = (acttp->t_inpcb->inp_last_outifp == NULL) ? 0 : + acttp->t_inpcb->inp_last_outifp->if_index; + } + + mptcp_fill_info_bytestats(ti, mpte); + /* tcpi_txpackets */ + + /* tcpi_txretransmitbytes */ + /* tcpi_txunacked */ + /* tcpi_rxpackets */ + + /* tcpi_rxduplicatebytes */ + /* tcpi_rxoutoforderbytes */ + /* tcpi_snd_bw */ + /* tcpi_synrexmits */ + /* tcpi_unused1 */ + /* tcpi_unused2 */ + /* tcpi_cell_rxpackets */ + + /* tcpi_cell_txpackets */ + + /* tcpi_wifi_rxpackets */ + + /* tcpi_wifi_txpackets */ + + /* tcpi_wired_rxpackets */ + /* tcpi_wired_txpackets */ + /* tcpi_connstatus */ + /* TFO-stuff */ + /* ECN stuff */ + /* tcpi_ecn_recv_ce */ + /* tcpi_ecn_recv_cwr */ + if (acttp) { + ti->tcpi_rcvoopack = acttp->t_rcvoopack; + } + /* tcpi_pawsdrop */ + /* tcpi_sack_recovery_episode */ + /* tcpi_reordered_pkts */ + /* tcpi_dsack_sent */ + /* tcpi_dsack_recvd */ + /* tcpi_flowhash */ + if (acttp) { + ti->tcpi_txretransmitpackets = acttp->t_stat.rxmitpkts; + } +} + /* * Handle SOPT_GET for socket options issued on MP socket. */ @@ -1643,6 +2024,9 @@ mptcp_getopt(struct mptses *mpte, struct sockopt *sopt) } switch (sopt->sopt_name) { + case PERSIST_TIMEOUT: + /* Only case for which we have a non-zero default */ + optval = tcp_max_persist_timeout; case TCP_NODELAY: case TCP_RXT_FINDROP: case TCP_KEEPALIVE: @@ -1650,100 +2034,58 @@ mptcp_getopt(struct mptses *mpte, struct sockopt *sopt) case TCP_KEEPCNT: case TCP_CONNECTIONTIMEOUT: case TCP_RXT_CONNDROPTIME: - case PERSIST_TIMEOUT: case TCP_ADAPTIVE_READ_TIMEOUT: case TCP_ADAPTIVE_WRITE_TIMEOUT: - case TCP_NOTSENT_LOWAT: - case MPTCP_SERVICE_TYPE: - case MPTCP_ALTERNATE_PORT: - /* eligible; get the default value just in case */ - error = mptcp_default_tcp_optval(mpte, sopt, &optval); - break; - default: - /* not eligible */ - error = ENOPROTOOPT; + { + struct mptopt *mpo = mptcp_sopt_find(mpte, sopt); + + if (mpo != NULL) { + optval = mpo->mpo_intval; + } break; } - switch (sopt->sopt_name) { + /* The next ones are stored at the MPTCP-level */ case TCP_NOTSENT_LOWAT: if (mptetoso(mpte)->so_flags & SOF_NOTSENT_LOWAT) { optval = mptcp_get_notsent_lowat(mpte); } else { optval = 0; } - goto out; - case MPTCP_SERVICE_TYPE: - optval = mpte->mpte_svctype; - goto out; - case MPTCP_ALTERNATE_PORT: - optval = mpte->mpte_alternate_port; - goto out; - } + break; + case TCP_INFO: + { + struct tcp_info ti; - /* - * Search for a previously-issued TCP level socket option and - * return the recorded option value. This assumes that the - * value did not get modified by the lower layer after it was - * issued at setsockopt(2) time. If not found, we'll return - * the default value obtained ealier. - */ - if (error == 0) { - struct mptopt *mpo; + mptcp_fill_info(mpte, &ti); + error = sooptcopyout(sopt, &ti, sizeof(struct tcp_info)); - if ((mpo = mptcp_sopt_find(mpte, sopt)) != NULL) { - optval = mpo->mpo_intval; - } - - error = sooptcopyout(sopt, &optval, sizeof(int)); + goto out; } -out: - return error; -} - -/* - * Return default values for TCP socket options. Ideally we would query the - * subflow TCP socket, but that requires creating a subflow socket before - * connectx(2) time. To simplify things, just return the default values - * that we know of. - */ -static int -mptcp_default_tcp_optval(struct mptses *mpte, struct sockopt *sopt, int *optval) -{ - int error = 0; - - VERIFY(sopt->sopt_level == IPPROTO_TCP); - VERIFY(sopt->sopt_dir == SOPT_GET); - mpte_lock_assert_held(mpte); /* same as MP socket lock */ - - /* try to do what tcp_newtcpcb() does */ - switch (sopt->sopt_name) { - case TCP_NODELAY: - case TCP_RXT_FINDROP: - case TCP_KEEPINTVL: - case TCP_KEEPCNT: - case TCP_CONNECTIONTIMEOUT: - case TCP_RXT_CONNDROPTIME: - case TCP_NOTSENT_LOWAT: - case TCP_ADAPTIVE_READ_TIMEOUT: - case TCP_ADAPTIVE_WRITE_TIMEOUT: case MPTCP_SERVICE_TYPE: - case MPTCP_ALTERNATE_PORT: - *optval = 0; + optval = mpte->mpte_svctype; break; - - case TCP_KEEPALIVE: - *optval = mptcp_subflow_keeptime; + case MPTCP_ALTERNATE_PORT: + optval = mpte->mpte_alternate_port; break; - - case PERSIST_TIMEOUT: - *optval = tcp_max_persist_timeout; + case MPTCP_FORCE_ENABLE: + optval = !!(mpte->mpte_flags & MPTE_FORCE_ENABLE); break; + case MPTCP_EXPECTED_PROGRESS_TARGET: + error = sooptcopyout(sopt, &mpte->mpte_time_target, sizeof(mpte->mpte_time_target)); + goto out; default: + /* not eligible */ error = ENOPROTOOPT; break; } + + if (error == 0) { + error = sooptcopyout(sopt, &optval, sizeof(int)); + } + +out: return error; } @@ -1764,15 +2106,10 @@ mptcp_ctloutput(struct socket *mp_so, struct sockopt *sopt) goto out; } mpte = mptompte(mpp); - mpte_lock_assert_held(mpte); /* same as MP socket lock */ + socket_lock_assert_owned(mp_so); /* we only handle socket and TCP-level socket options for MPTCP */ if (sopt->sopt_level != SOL_SOCKET && sopt->sopt_level != IPPROTO_TCP) { - mptcplog((LOG_DEBUG, "MPTCP Socket: " - "%s: mp_so 0x%llx sopt %s level not " - "handled\n", __func__, (u_int64_t)VM_KERNEL_ADDRPERM(mp_so), - mptcp_sopt2str(sopt->sopt_level, sopt->sopt_name)), - MPTCP_SOCKET_DBG, MPTCP_LOGLVL_LOG); error = EINVAL; goto out; } @@ -1903,6 +2240,10 @@ mptcp_sopt2str(int level, int optname) return "MPTCP_SERVICE_TYPE"; case MPTCP_ALTERNATE_PORT: return "MPTCP_ALTERNATE_PORT"; + case MPTCP_FORCE_ENABLE: + return "MPTCP_FORCE_ENABLE"; + case MPTCP_EXPECTED_PROGRESS_TARGET: + return "MPTCP_EXPECTED_PROGRESS_TARGET"; } break; @@ -1922,14 +2263,11 @@ mptcp_usr_preconnect(struct socket *mp_so) int error; mpte = mptompte(mpp); - VERIFY(mpte != NULL); - mpte_lock_assert_held(mpte); /* same as MP socket lock */ - mpts = mptcp_get_subflow(mpte, NULL, NULL); + mpts = mptcp_get_subflow(mpte, NULL); if (mpts == NULL) { - mptcplog((LOG_ERR, "%s: mp_so 0x%llx invalid preconnect ", - __func__, (u_int64_t)VM_KERNEL_ADDRPERM(mp_so)), - MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR); + os_log_error(mptcp_log_handle, "%s - %lx: invalid preconnect ", + __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte)); return EINVAL; } mpts->mpts_flags &= ~MPTSF_TFO_REQD; diff --git a/bsd/netinet/mptcp_var.h b/bsd/netinet/mptcp_var.h index 5ae998afc..4c9037db7 100644 --- a/bsd/netinet/mptcp_var.h +++ b/bsd/netinet/mptcp_var.h @@ -67,32 +67,52 @@ struct mptses { sae_associd_t mpte_associd; /* MPTCP association ID */ sae_connid_t mpte_connid_last; /* last used connection ID */ + uint64_t mpte_time_target; + thread_call_t mpte_time_thread; + + uint32_t mpte_last_cellicon_set; + uint32_t mpte_cellicon_increments; + union { /* Source address of initial subflow */ - struct sockaddr mpte_src; - struct sockaddr_in __mpte_src_v4; - struct sockaddr_in6 __mpte_src_v6; - }; - + struct sockaddr _mpte_src; + struct sockaddr_in _mpte_src_v4; + struct sockaddr_in6 _mpte_src_v6; + } mpte_u_src; +#define mpte_src mpte_u_src._mpte_src +#define __mpte_src_v4 mpte_u_src._mpte_src_v4 +#define __mpte_src_v6 mpte_u_src._mpte_src_v6 union { /* Destination address of initial subflow */ - struct sockaddr mpte_dst; - struct sockaddr_in __mpte_dst_v4; - struct sockaddr_in6 __mpte_dst_v6; - }; + struct sockaddr _mpte_dst; + struct sockaddr_in _mpte_dst_v4; + struct sockaddr_in6 _mpte_dst_v6; + } mpte_u_dst; +#define mpte_dst mpte_u_dst._mpte_dst +#define __mpte_dst_v4 mpte_u_dst._mpte_dst_v4 +#define __mpte_dst_v6 mpte_u_dst._mpte_dst_v6 struct sockaddr_in mpte_dst_v4_nat64; + struct sockaddr_in mpte_dst_unicast_v4; + struct sockaddr_in6 mpte_dst_unicast_v6; + uint16_t mpte_alternate_port; /* Alternate port for subflow establishment (network-byte-order) */ + int mpte_epid; + uuid_t mpte_euuid; + struct mptsub *mpte_active_sub; /* ptr to last active subf */ - uint8_t mpte_flags; /* per mptcp session flags */ + uint16_t mpte_flags; /* per mptcp session flags */ #define MPTE_SND_REM_ADDR 0x01 /* Send Remove_addr option */ #define MPTE_SVCTYPE_CHECKED 0x02 /* Did entitlement-check for service-type */ #define MPTE_FIRSTPARTY 0x04 /* First-party app used multipath_extended entitlement */ #define MPTE_ACCESS_GRANTED 0x08 /* Access to cellular has been granted for this connection */ -#define MPTE_IN_WORKLOOP 0x10 /* Are we currently inside the workloop ? */ -#define MPTE_WORKLOOP_RELAUNCH 0x20 /* Another event got queued, we should restart the workloop */ +#define MPTE_FORCE_ENABLE 0x10 /* For MPTCP regardless of heuristics to detect middleboxes */ +#define MPTE_IN_WORKLOOP 0x20 /* Are we currently inside the workloop ? */ +#define MPTE_WORKLOOP_RELAUNCH 0x40 /* Another event got queued, we should restart the workloop */ +#define MPTE_UNICAST_IP 0x80 /* New subflows are only being established towards the unicast IP in the ADD_ADDR */ +#define MPTE_CELL_PROHIBITED 0x100 /* Cell access has been prohibited based on signal quality */ uint8_t mpte_svctype; /* MPTCP Service type */ uint8_t mpte_lost_aid; /* storing lost address id */ uint8_t mpte_addrid_last; /* storing address id parm */ @@ -135,24 +155,6 @@ mpsotompte(struct socket *so) return mptompte(mpsotomppcb(so)); } -static inline void -mpp_lock_assert_held(struct mppcb *mp) -{ -#if !MACH_ASSERT -#pragma unused(mp) -#endif - LCK_MTX_ASSERT(&mp->mpp_lock, LCK_MTX_ASSERT_OWNED); -} - -static inline void -mpp_lock_assert_notheld(struct mppcb *mp) -{ -#if !MACH_ASSERT -#pragma unused(mp) -#endif - LCK_MTX_ASSERT(&mp->mpp_lock, LCK_MTX_ASSERT_NOTOWNED); -} - static inline boolean_t mpp_try_lock(struct mppcb *mp) { @@ -193,42 +195,6 @@ mpp_getlock(struct mppcb *mp, int flags) return &mp->mpp_lock; } -static inline void -mpte_lock_assert_held(struct mptses *mpte) -{ - mpp_lock_assert_held(mpte->mpte_mppcb); -} - -static inline void -mpte_lock_assert_notheld(struct mptses *mpte) -{ - mpp_lock_assert_notheld(mpte->mpte_mppcb); -} - -static inline boolean_t -mpte_try_lock(struct mptses *mpte) -{ - return mpp_try_lock(mpte->mpte_mppcb); -} - -static inline void -mpte_lock(struct mptses *mpte) -{ - mpp_lock(mpte->mpte_mppcb); -} - -static inline void -mpte_unlock(struct mptses *mpte) -{ - mpp_unlock(mpte->mpte_mppcb); -} - -static inline lck_mtx_t * -mpte_getlock(struct mptses *mpte, int flags) -{ - return mpp_getlock(mpte->mpte_mppcb, flags); -} - static inline int mptcp_subflow_cwnd_space(struct socket *so) { @@ -272,11 +238,13 @@ struct mptsub { union { /* destination address */ - struct sockaddr mpts_dst; - struct sockaddr_in __mpts_dst_v4; - struct sockaddr_in6 __mpts_dst_v6; - }; - + struct sockaddr _mpts_dst; + struct sockaddr_in _mpts_dst_v4; + struct sockaddr_in6 _mpts_dst_v6; + } mpts_u_dst; +#define mpts_dst mpts_u_dst._mpts_dst +#define __mpts_dst_v4 mpts_u_dst._mpts_dst_v4 +#define __mpts_dst_v6 mpts_u_dst._mpts_dst_v6 u_int32_t mpts_rel_seq; /* running count of subflow # */ u_int32_t mpts_iss; /* Initial sequence number, taking TFO into account */ u_int32_t mpts_ifscope; /* scoped to the interface */ @@ -335,7 +303,11 @@ struct mptsub { #define MPTSF_INITIAL_SUB 0x00040000 /* This is the initial subflow */ #define MPTSF_READ_STALL 0x00080000 /* A read-stall has been detected */ #define MPTSF_WRITE_STALL 0x00100000 /* A write-stall has been detected */ -#define MPTSF_CONFIRMED 0x00200000 /* Subflow confirmed to be MPTCP-capable */ +#define MPTSF_FULLY_ESTABLISHED 0x00200000 /* Subflow is fully established and it has been confirmed + * whether or not it supports MPTCP. + * No need for further middlebox-detection. + */ +#define MPTSF_CELLICON_SET 0x00400000 /* This subflow set the cellicon */ #define MPTSF_BITS \ "\020\1ATTACHED\2CONNECTING\3PENDING\4CONNECTED\5DISCONNECTING" \ @@ -385,8 +357,8 @@ struct mptcp_subf_auth_entry { struct mptcb { struct mptses *mpt_mpte; /* back ptr to MPTCP session */ mptcp_state_t mpt_state; /* MPTCP state */ - u_int32_t mpt_flags; /* see flags below */ - u_int32_t mpt_version; /* MPTCP proto version */ + uint32_t mpt_flags; /* see flags below */ + uint32_t mpt_version; /* MPTCP proto version */ int mpt_softerror; /* error not yet reported */ /* * Authentication and metadata invariants @@ -401,32 +373,33 @@ struct mptcb { * Data ACKs do not. */ int mpt_rxtshift; /* num of consecutive retrans */ - u_int32_t mpt_rxtstart; /* time at which rxt started */ - u_int64_t mpt_rtseq; /* seq # being tracked */ - u_int32_t mpt_timer_vals; /* timer related values */ - u_int32_t mpt_timewait; /* timewait */ + uint32_t mpt_rxtstart; /* time at which rxt started */ + uint64_t mpt_rtseq; /* seq # being tracked */ + uint32_t mpt_timer_vals; /* timer related values */ + uint32_t mpt_timewait; /* timewait */ /* * Sending side */ - u_int64_t mpt_snduna; /* DSN of last unacked byte */ - u_int64_t mpt_sndnxt; /* DSN of next byte to send */ - u_int64_t mpt_sndmax; /* DSN of max byte sent */ - u_int64_t mpt_local_idsn; /* First byte's DSN */ - u_int32_t mpt_sndwnd; - u_int64_t mpt_sndwl1; - u_int64_t mpt_sndwl2; + uint64_t mpt_snduna; /* DSN of last unacked byte */ + uint64_t mpt_sndnxt; /* DSN of next byte to send */ + uint64_t mpt_sndmax; /* DSN of max byte sent */ + uint64_t mpt_local_idsn; /* First byte's DSN */ + uint32_t mpt_sndwnd; + uint64_t mpt_sndwl1; + uint64_t mpt_sndwl2; /* * Receiving side */ - u_int64_t mpt_rcvnxt; /* Next expected DSN */ - u_int64_t mpt_remote_idsn; /* Peer's IDSN */ - u_int32_t mpt_rcvwnd; + uint64_t mpt_rcvnxt; /* Next expected DSN */ + uint64_t mpt_remote_idsn; /* Peer's IDSN */ + uint32_t mpt_rcvwnd; + uint32_t mpt_rcvadv; LIST_HEAD(, mptcp_subf_auth_entry) mpt_subauth_list; /* address IDs */ /* * Fastclose */ - u_int64_t mpt_dsn_at_csum_fail; /* MPFail Opt DSN */ - u_int32_t mpt_ssn_at_csum_fail; /* MPFail Subflow Seq */ + uint64_t mpt_dsn_at_csum_fail; /* MPFail Opt DSN */ + uint32_t mpt_ssn_at_csum_fail; /* MPFail Subflow Seq */ /* * Zombie handling */ @@ -434,11 +407,11 @@ struct mptcb { #define MPT_GC_TICKS_FAST (10) int32_t mpt_gc_ticks; /* Used for zombie deletion */ - u_int32_t mpt_notsent_lowat; /* TCP_NOTSENT_LOWAT support */ - u_int32_t mpt_peer_version; /* Version from peer */ + uint32_t mpt_notsent_lowat; /* TCP_NOTSENT_LOWAT support */ + uint32_t mpt_peer_version; /* Version from peer */ struct tsegqe_head mpt_segq; - u_int16_t mpt_reassqlen; /* length of reassembly queue */ + uint16_t mpt_reassqlen; /* length of reassembly queue */ }; /* valid values for mpt_flags (see also notes on mpts_flags above) */ @@ -453,6 +426,7 @@ struct mptcb { #define MPTCPF_FALLBACK_HEURISTIC 0x100 /* Send SYN without MP_CAPABLE due to heuristic */ #define MPTCPF_HEURISTIC_TRAC 0x200 /* Tracked this connection in the heuristics as a failure */ #define MPTCPF_REASS_INPROG 0x400 /* Reassembly is in progress */ +#define MPTCPF_UNICAST_IP 0x800 #define MPTCPF_BITS \ "\020\1CHECKSUM\2FALLBACK_TO_TCP\3JOIN_READY\4RECVD_MPFAIL" \ @@ -544,7 +518,7 @@ extern os_log_t mptcp_log_handle; #define MPTCP_EXTEND_DSN(x, y, z) { \ if ((MPTCP_DATASEQ_LOW32(x) > y) && \ ((((u_int32_t)MPTCP_DATASEQ_LOW32(x)) - (u_int32_t)y) >= \ - (u_int32_t)(1 << 31))) { \ + (u_int32_t)(1U << 31))) { \ /* \ * y wrapped around and x and y are 2**31 bytes apart \ */ \ @@ -553,7 +527,7 @@ extern os_log_t mptcp_log_handle; } else if ((MPTCP_DATASEQ_LOW32(x) < y) && \ (((u_int32_t)y - \ ((u_int32_t)MPTCP_DATASEQ_LOW32(x))) >= \ - (u_int32_t)(1 << 31))) { \ + (u_int32_t)(1U << 31))) { \ /* \ * x wrapped around and x and y are 2**31 apart \ */ \ @@ -578,17 +552,20 @@ extern int mptcp_subflow_keeptime; /* Multipath subflow TCP_KEEPALIVE opt */ extern uint32_t mptcp_dbg_level; /* Multipath TCP debugging level */ extern uint32_t mptcp_dbg_area; /* Multipath TCP debugging area */ extern int mptcp_developer_mode; /* Allow aggregation mode */ +extern uint32_t mptcp_cellicon_refcount; extern int tcp_jack_rxmt; /* Join ACK retransmission value in msecs */ __BEGIN_DECLS extern void mptcp_init(struct protosw *, struct domain *); extern int mptcp_ctloutput(struct socket *, struct sockopt *); -extern int mptcp_sescreate(struct mppcb *); -extern void mptcp_check_subflows_and_add(struct mptses *); -extern int mptcp_get_statsindex(struct mptcp_itf_stats *stats, - const struct mptsub *mpts); -extern void mptcpstats_inc_switch(struct mptses *, const struct mptsub *); +extern int mptcp_session_create(struct mppcb *); +extern boolean_t mptcp_ok_to_create_subflows(struct mptcb *mp_tp); +extern void mptcp_check_subflows_and_add(struct mptses *mpte); +extern void mptcp_check_subflows_and_remove(struct mptses *mpte); +extern void mptcpstats_inc_switch(struct mptses *mpte, const struct mptsub *mpts); +extern void mptcpstats_update(struct mptcp_itf_stats *stats, const struct mptsub *mpts); +extern int mptcpstats_get_index_by_ifindex(struct mptcp_itf_stats *stats, int ifindex, boolean_t create); extern struct mptses *mptcp_drop(struct mptses *, struct mptcb *, int); extern struct mptses *mptcp_close(struct mptses *, struct mptcb *); extern int mptcp_lock(struct socket *, int, void *); @@ -608,9 +585,9 @@ extern struct mptopt *mptcp_sopt_find(struct mptses *, struct sockopt *); extern int mptcp_subflow_add(struct mptses *, struct sockaddr *, struct sockaddr *, uint32_t, sae_connid_t *); -extern void mptcpstats_update(struct mptcp_itf_stats *stats, struct mptsub *mpts); extern void mptcp_subflow_del(struct mptses *, struct mptsub *); +extern void mptcp_handle_input(struct socket *so); #define MPTCP_SUBOUT_PROBING 0x01 extern int mptcp_subflow_output(struct mptses *mpte, struct mptsub *mpts, int flags); extern void mptcp_clean_reinjectq(struct mptses *mpte); @@ -643,8 +620,7 @@ extern void mptcp_output_getm_dsnmap64(struct socket *so, int off, uint64_t *dsn, uint32_t *relseq, uint16_t *data_len, uint16_t *dss_csum); extern void mptcp_act_on_txfail(struct socket *); -extern struct mptsub *mptcp_get_subflow(struct mptses *, struct mptsub *, - struct mptsub **); +extern struct mptsub *mptcp_get_subflow(struct mptses *mpte, struct mptsub **preferred); extern int mptcp_get_map_for_dsn(struct socket *, u_int64_t, u_int32_t *); extern int32_t mptcp_adj_sendlen(struct socket *so, int32_t off); extern void mptcp_sbrcv_grow(struct mptcb *mp_tp); @@ -658,14 +634,14 @@ extern u_int32_t mptcp_get_notsent_lowat(struct mptses *mpte); extern int mptcp_notsent_lowat_check(struct socket *so); extern void mptcp_ask_symptoms(struct mptses *mpte); extern void mptcp_control_register(void); -extern int mptcp_is_wifi_unusable(struct mptses *mpte); -extern boolean_t mptcp_subflow_is_bad(struct mptses *mpte, struct mptsub *mpts); +extern int mptcp_is_wifi_unusable_for_session(struct mptses *mpte); +extern boolean_t symptoms_is_wifi_lossy(void); extern void mptcp_ask_for_nat64(struct ifnet *ifp); extern void mptcp_session_necp_cb(void *, int, uint32_t, uint32_t, bool *); +extern struct sockaddr *mptcp_get_session_dst(struct mptses *mpte, + boolean_t has_v6, boolean_t has_v4); extern void mptcp_set_restrictions(struct socket *mp_so); -extern int mptcp_freeq(struct mptcb *); -extern void mptcp_set_cellicon(struct mptses *mpte); -extern void mptcp_unset_cellicon(void); +extern void mptcp_clear_cellicon(void); extern void mptcp_reset_rexmit_state(struct tcpcb *tp); extern void mptcp_reset_keepalive(struct tcpcb *tp); extern int mptcp_validate_csum(struct tcpcb *tp, struct mbuf *m, uint64_t dsn, @@ -742,6 +718,13 @@ typedef struct symptoms_advisory { }; } symptoms_advisory_t; +#define MPTCP_TARGET_BASED_RSSI_THRESHOLD -75 +struct mptcp_symptoms_answer { + struct symptoms_advisory advisory; + uuid_t uuid; + int32_t rssi; +}; + struct mptcp_symptoms_ask_uuid { uint32_t cmd; #define MPTCP_SYMPTOMS_ASK_UUID 1 diff --git a/bsd/netinet/raw_ip.c b/bsd/netinet/raw_ip.c index d88d42b99..66b0102bb 100644 --- a/bsd/netinet/raw_ip.c +++ b/bsd/netinet/raw_ip.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2018 Apple Inc. All rights reserved. + * Copyright (c) 2000-2019 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -193,7 +193,14 @@ rip_init(struct protosw *pp, struct domain *dp) in_pcbinfo_attach(&ripcbinfo); } -static struct sockaddr_in ripsrc = { sizeof(ripsrc), AF_INET, 0, {0}, {0, 0, 0, 0, 0, 0, 0, 0, } }; +static struct sockaddr_in ripsrc = { + .sin_len = sizeof(ripsrc), + .sin_family = AF_INET, + .sin_port = 0, + .sin_addr = { .s_addr = 0 }, + .sin_zero = {0, 0, 0, 0, 0, 0, 0, 0, } +}; + /* * Setup generic address and protocol structures * for raw_input routine, then pass them along with @@ -410,6 +417,9 @@ rip_output( if (INP_NO_EXPENSIVE(inp)) { ipoa.ipoa_flags |= IPOAF_NO_EXPENSIVE; } + if (INP_NO_CONSTRAINED(inp)) { + ipoa.ipoa_flags |= IPOAF_NO_CONSTRAINED; + } if (INP_AWDL_UNRESTRICTED(inp)) { ipoa.ipoa_flags |= IPOAF_AWDL_UNRESTRICTED; } @@ -609,11 +619,11 @@ rip_output( } /* - * If output interface was cellular/expensive, and this socket is + * If output interface was cellular/expensive/constrained, and this socket is * denied access to it, generate an event. */ if (error != 0 && (ipoa.ipoa_retflags & IPOARF_IFDENIED) && - (INP_NO_CELLULAR(inp) || INP_NO_EXPENSIVE(inp))) { + (INP_NO_CELLULAR(inp) || INP_NO_EXPENSIVE(inp) || INP_NO_CONSTRAINED(inp))) { soevent(so, (SO_FILT_HINT_LOCKED | SO_FILT_HINT_IFDENIED)); } diff --git a/bsd/netinet/tcp.h b/bsd/netinet/tcp.h index 369a709d7..06e25d6bb 100644 --- a/bsd/netinet/tcp.h +++ b/bsd/netinet/tcp.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2016 Apple Inc. All rights reserved. + * Copyright (c) 2000-2018 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -288,7 +288,8 @@ struct tcp_notify_ack_complete { #define MPTCP_SVCTYPE_HANDOVER 0 /* Default 0 */ #define MPTCP_SVCTYPE_INTERACTIVE 1 #define MPTCP_SVCTYPE_AGGREGATE 2 -#define MPTCP_SVCTYPE_MAX 3 +#define MPTCP_SVCTYPE_TARGET_BASED 3 +#define MPTCP_SVCTYPE_MAX 4 /* * Specify minimum time in seconds before which an established * TCP connection will not be dropped when there is no response from the @@ -299,6 +300,9 @@ struct tcp_notify_ack_complete { #define TCP_RXT_MINIMUM_TIMEOUT_LIMIT (5 * 60) /* Limit is 5 minutes */ #define MPTCP_ALTERNATE_PORT 0x216 +#define MPTCP_FORCE_ENABLE 0x217 +#define TCP_FASTOPEN_FORCE_ENABLE 0x218 +#define MPTCP_EXPECTED_PROGRESS_TARGET 0x219 /* * The TCP_INFO socket option is a private API and is subject to change @@ -479,6 +483,12 @@ struct mptcp_itf_stats { uint32_t is_expensive:1; uint64_t mpis_txbytes __attribute__((aligned(8))); uint64_t mpis_rxbytes __attribute__((aligned(8))); + uint64_t mpis_wifi_txbytes __attribute__((aligned(8))); + uint64_t mpis_wifi_rxbytes __attribute__((aligned(8))); + uint64_t mpis_wired_txbytes __attribute__((aligned(8))); + uint64_t mpis_wired_rxbytes __attribute__((aligned(8))); + uint64_t mpis_cell_txbytes __attribute__((aligned(8))); + uint64_t mpis_cell_rxbytes __attribute__((aligned(8))); }; /* Version solely used to let libnetcore survive */ diff --git a/bsd/netinet/tcp_cache.c b/bsd/netinet/tcp_cache.c index deaa3bcb2..aad8ce29c 100644 --- a/bsd/netinet/tcp_cache.c +++ b/bsd/netinet/tcp_cache.c @@ -67,6 +67,7 @@ struct tcp_heuristic { uint8_t th_tfo_data_rst; /* The number of times a SYN+data has received a RST */ uint8_t th_tfo_req_rst; /* The number of times a SYN+cookie-req has received a RST */ uint8_t th_mptcp_loss; /* The number of times a SYN+MP_CAPABLE has been lost */ + uint8_t th_mptcp_success; /* The number of times MPTCP-negotiation has been successful */ uint8_t th_ecn_loss; /* The number of times a SYN+ecn has been lost */ uint8_t th_ecn_aggressive; /* The number of times we did an aggressive fallback */ uint8_t th_ecn_droprst; /* The number of times ECN connections received a RST after first data pkt */ @@ -79,7 +80,8 @@ struct tcp_heuristic { uint32_t th_ecn_backoff; /* Time until when we should not try out ECN */ uint8_t th_tfo_in_backoff:1, /* Are we avoiding TFO due to the backoff timer? */ - th_mptcp_in_backoff:1; /* Are we avoiding MPTCP due to the backoff timer? */ + th_mptcp_in_backoff:1, /* Are we avoiding MPTCP due to the backoff timer? */ + th_mptcp_heuristic_disabled:1; /* Are heuristics disabled? */ char th_val_end[0]; /* Marker for memsetting to 0 */ }; @@ -181,6 +183,7 @@ tcp_min_to_hz(uint32_t minutes) #define TFO_MAX_COOKIE_LOSS 2 #define ECN_MAX_SYN_LOSS 2 #define MPTCP_MAX_SYN_LOSS 2 +#define MPTCP_SUCCESS_TRIGGER 10 #define ECN_MAX_DROPRST 1 #define ECN_MAX_DROPRXMT 4 #define ECN_MAX_SYNRST 4 @@ -634,38 +637,67 @@ tcp_heuristic_reset_counters(struct tcp_cache_key_src *tcks, u_int8_t flags) struct tcp_heuristic *tpheur; /* - * Don't attempt to create it! Keep the heuristics clean if the - * server does not support TFO. This reduces the lookup-cost on - * our side. + * Always create heuristics here because MPTCP needs to write success + * into it. Thus, we always end up creating them. */ - tpheur = tcp_getheuristic_with_lock(tcks, 0, &head); + tpheur = tcp_getheuristic_with_lock(tcks, 1, &head); if (tpheur == NULL) { return; } if (flags & TCPCACHE_F_TFO_DATA) { + if (tpheur->th_tfo_data_loss >= TFO_MAX_COOKIE_LOSS) { + os_log(OS_LOG_DEFAULT, "%s: Resetting TFO-data loss to 0 from %u on heur %lx\n", + __func__, tpheur->th_tfo_data_loss, (unsigned long)VM_KERNEL_ADDRPERM(tpheur)); + } tpheur->th_tfo_data_loss = 0; } if (flags & TCPCACHE_F_TFO_REQ) { + if (tpheur->th_tfo_req_loss >= TFO_MAX_COOKIE_LOSS) { + os_log(OS_LOG_DEFAULT, "%s: Resetting TFO-req loss to 0 from %u on heur %lx\n", + __func__, tpheur->th_tfo_req_loss, (unsigned long)VM_KERNEL_ADDRPERM(tpheur)); + } tpheur->th_tfo_req_loss = 0; } if (flags & TCPCACHE_F_TFO_DATA_RST) { + if (tpheur->th_tfo_data_rst >= TFO_MAX_COOKIE_LOSS) { + os_log(OS_LOG_DEFAULT, "%s: Resetting TFO-data RST to 0 from %u on heur %lx\n", + __func__, tpheur->th_tfo_data_rst, (unsigned long)VM_KERNEL_ADDRPERM(tpheur)); + } tpheur->th_tfo_data_rst = 0; } if (flags & TCPCACHE_F_TFO_REQ_RST) { + if (tpheur->th_tfo_req_rst >= TFO_MAX_COOKIE_LOSS) { + os_log(OS_LOG_DEFAULT, "%s: Resetting TFO-req RST to 0 from %u on heur %lx\n", + __func__, tpheur->th_tfo_req_rst, (unsigned long)VM_KERNEL_ADDRPERM(tpheur)); + } tpheur->th_tfo_req_rst = 0; } if (flags & TCPCACHE_F_ECN) { + if (tpheur->th_ecn_loss >= ECN_MAX_SYN_LOSS || tpheur->th_ecn_synrst >= ECN_MAX_SYNRST) { + os_log(OS_LOG_DEFAULT, "%s: Resetting ECN-loss to 0 from %u and synrst from %u on heur %lx\n", + __func__, tpheur->th_ecn_loss, tpheur->th_ecn_synrst, (unsigned long)VM_KERNEL_ADDRPERM(tpheur)); + } tpheur->th_ecn_loss = 0; tpheur->th_ecn_synrst = 0; } if (flags & TCPCACHE_F_MPTCP) { tpheur->th_mptcp_loss = 0; + if (tpheur->th_mptcp_success < MPTCP_SUCCESS_TRIGGER) { + tpheur->th_mptcp_success++; + + if (tpheur->th_mptcp_success == MPTCP_SUCCESS_TRIGGER) { + os_log(mptcp_log_handle, "%s disabling heuristics for 12 hours", __func__); + tpheur->th_mptcp_heuristic_disabled = 1; + /* Disable heuristics for 12 hours */ + tpheur->th_mptcp_backoff = tcp_now + tcp_min_to_hz(tcp_ecn_timeout * 12); + } + } } tcp_heuristic_unlock(head); @@ -734,6 +766,9 @@ __tcp_heuristic_tfo_middlebox_common(struct tcp_heuristic *tpheur) if (tpheur->th_tfo_backoff > tcp_min_to_hz(tcp_backoff_maximum)) { tpheur->th_tfo_backoff = tcp_min_to_hz(tcp_ecn_timeout); } + + os_log(OS_LOG_DEFAULT, "%s disable TFO until %u now %u on %lx\n", __func__, + tpheur->th_tfo_backoff_until, tcp_now, (unsigned long)VM_KERNEL_ADDRPERM(tpheur)); } static void @@ -797,7 +832,9 @@ tcp_heuristic_inc_counters(struct tcp_cache_key_src *tcks, } } - if ((flags & TCPCACHE_F_ECN) && tpheur->th_ecn_loss < TCP_CACHE_OVERFLOW_PROTECT) { + if ((flags & TCPCACHE_F_ECN) && + tpheur->th_ecn_loss < TCP_CACHE_OVERFLOW_PROTECT && + TSTMP_LEQ(tpheur->th_ecn_backoff, tcp_now)) { tpheur->th_ecn_loss++; if (tpheur->th_ecn_loss >= ECN_MAX_SYN_LOSS) { tcpstat.tcps_ecn_fallback_synloss++; @@ -805,11 +842,16 @@ tcp_heuristic_inc_counters(struct tcp_cache_key_src *tcks, tpheur->th_ecn_backoff = tcp_now + (tcp_min_to_hz(tcp_ecn_timeout) << (tpheur->th_ecn_loss - ECN_MAX_SYN_LOSS)); + + os_log(OS_LOG_DEFAULT, "%s disable ECN until %u now %u on %lx for SYN-loss\n", + __func__, tpheur->th_ecn_backoff, tcp_now, + (unsigned long)VM_KERNEL_ADDRPERM(tpheur)); } } if ((flags & TCPCACHE_F_MPTCP) && - tpheur->th_mptcp_loss < TCP_CACHE_OVERFLOW_PROTECT) { + tpheur->th_mptcp_loss < TCP_CACHE_OVERFLOW_PROTECT && + tpheur->th_mptcp_heuristic_disabled == 0) { tpheur->th_mptcp_loss++; if (tpheur->th_mptcp_loss >= MPTCP_MAX_SYN_LOSS) { /* @@ -819,11 +861,17 @@ tcp_heuristic_inc_counters(struct tcp_cache_key_src *tcks, tpheur->th_mptcp_backoff = tcp_now + (tcp_min_to_hz(tcp_ecn_timeout) << (tpheur->th_mptcp_loss - MPTCP_MAX_SYN_LOSS)); + tpheur->th_mptcp_in_backoff = 1; + + os_log(OS_LOG_DEFAULT, "%s disable MPTCP until %u now %u on %lx\n", + __func__, tpheur->th_mptcp_backoff, tcp_now, + (unsigned long)VM_KERNEL_ADDRPERM(tpheur)); } } if ((flags & TCPCACHE_F_ECN_DROPRST) && - tpheur->th_ecn_droprst < TCP_CACHE_OVERFLOW_PROTECT) { + tpheur->th_ecn_droprst < TCP_CACHE_OVERFLOW_PROTECT && + TSTMP_LEQ(tpheur->th_ecn_backoff, tcp_now)) { tpheur->th_ecn_droprst++; if (tpheur->th_ecn_droprst >= ECN_MAX_DROPRST) { tcpstat.tcps_ecn_fallback_droprst++; @@ -832,11 +880,16 @@ tcp_heuristic_inc_counters(struct tcp_cache_key_src *tcks, tpheur->th_ecn_backoff = tcp_now + (tcp_min_to_hz(tcp_ecn_timeout) << (tpheur->th_ecn_droprst - ECN_MAX_DROPRST)); + + os_log(OS_LOG_DEFAULT, "%s disable ECN until %u now %u on %lx for drop-RST\n", + __func__, tpheur->th_ecn_backoff, tcp_now, + (unsigned long)VM_KERNEL_ADDRPERM(tpheur)); } } if ((flags & TCPCACHE_F_ECN_DROPRXMT) && - tpheur->th_ecn_droprxmt < TCP_CACHE_OVERFLOW_PROTECT) { + tpheur->th_ecn_droprxmt < TCP_CACHE_OVERFLOW_PROTECT && + TSTMP_LEQ(tpheur->th_ecn_backoff, tcp_now)) { tpheur->th_ecn_droprxmt++; if (tpheur->th_ecn_droprxmt >= ECN_MAX_DROPRXMT) { tcpstat.tcps_ecn_fallback_droprxmt++; @@ -845,6 +898,10 @@ tcp_heuristic_inc_counters(struct tcp_cache_key_src *tcks, tpheur->th_ecn_backoff = tcp_now + (tcp_min_to_hz(tcp_ecn_timeout) << (tpheur->th_ecn_droprxmt - ECN_MAX_DROPRXMT)); + + os_log(OS_LOG_DEFAULT, "%s disable ECN until %u now %u on %lx for drop-Rxmit\n", + __func__, tpheur->th_ecn_backoff, tcp_now, + (unsigned long)VM_KERNEL_ADDRPERM(tpheur)); } } if ((flags & TCPCACHE_F_ECN_SYNRST) && @@ -857,6 +914,10 @@ tcp_heuristic_inc_counters(struct tcp_cache_key_src *tcks, tpheur->th_ecn_backoff = tcp_now + (tcp_min_to_hz(tcp_ecn_timeout) << (tpheur->th_ecn_synrst - ECN_MAX_SYNRST)); + + os_log(OS_LOG_DEFAULT, "%s disable ECN until %u now %u on %lx for SYN-RST\n", + __func__, tpheur->th_ecn_backoff, tcp_now, + (unsigned long)VM_KERNEL_ADDRPERM(tpheur)); } } tcp_heuristic_unlock(head); @@ -868,6 +929,11 @@ tcp_heuristic_tfo_loss(struct tcpcb *tp) struct tcp_cache_key_src tcks; uint32_t flag = 0; + if (symptoms_is_wifi_lossy() && + IFNET_IS_WIFI(tp->t_inpcb->inp_last_outifp)) { + return; + } + tcp_cache_key_src_create(tp, &tcks); if (tp->t_tfo_stats & TFO_S_SYN_DATA_SENT) { @@ -903,6 +969,11 @@ tcp_heuristic_mptcp_loss(struct tcpcb *tp) { struct tcp_cache_key_src tcks; + if (symptoms_is_wifi_lossy() && + IFNET_IS_WIFI(tp->t_inpcb->inp_last_outifp)) { + return; + } + tcp_cache_key_src_create(tp, &tcks); tcp_heuristic_inc_counters(&tcks, TCPCACHE_F_MPTCP); @@ -913,6 +984,11 @@ tcp_heuristic_ecn_loss(struct tcpcb *tp) { struct tcp_cache_key_src tcks; + if (symptoms_is_wifi_lossy() && + IFNET_IS_WIFI(tp->t_inpcb->inp_last_outifp)) { + return; + } + tcp_cache_key_src_create(tp, &tcks); tcp_heuristic_inc_counters(&tcks, TCPCACHE_F_ECN); @@ -970,6 +1046,12 @@ tcp_heuristic_ecn_aggressive_common(struct tcp_cache_key_src *tcks) return; } + if (TSTMP_GT(tpheur->th_ecn_backoff, tcp_now)) { + /* We are already in aggressive mode */ + tcp_heuristic_unlock(head); + return; + } + /* Must be done before, otherwise we will start off with expo-backoff */ tpheur->th_ecn_backoff = tcp_now + (tcp_min_to_hz(tcp_ecn_timeout) << (tpheur->th_ecn_aggressive)); @@ -983,6 +1065,9 @@ tcp_heuristic_ecn_aggressive_common(struct tcp_cache_key_src *tcks) } tcp_heuristic_unlock(head); + + os_log(OS_LOG_DEFAULT, "%s disable ECN until %u now %u on %lx\n", __func__, + tpheur->th_ecn_backoff, tcp_now, (unsigned long)VM_KERNEL_ADDRPERM(tpheur)); } void @@ -1041,16 +1126,23 @@ tcp_heuristic_do_tfo(struct tcpcb *tp) return FALSE; } - -boolean_t +/* + * @return: + * 0 Enable MPTCP (we are still discovering middleboxes) + * -1 Enable MPTCP (heuristics have been temporarily disabled) + * 1 Disable MPTCP + */ +int tcp_heuristic_do_mptcp(struct tcpcb *tp) { struct tcp_cache_key_src tcks; struct tcp_heuristics_head *head = NULL; struct tcp_heuristic *tpheur; + int ret = 0; - if (disable_tcp_heuristics) { - return TRUE; + if (disable_tcp_heuristics || + (tptomptp(tp)->mpt_mpte->mpte_flags & MPTE_FORCE_ENABLE)) { + return 0; } tcp_cache_key_src_create(tp, &tcks); @@ -1058,16 +1150,32 @@ tcp_heuristic_do_mptcp(struct tcpcb *tp) /* Get the tcp-heuristic. */ tpheur = tcp_getheuristic_with_lock(&tcks, 0, &head); if (tpheur == NULL) { - return TRUE; + return 0; + } + + if (tpheur->th_mptcp_in_backoff == 0 || + tpheur->th_mptcp_heuristic_disabled == 1) { + goto mptcp_ok; } if (TSTMP_GT(tpheur->th_mptcp_backoff, tcp_now)) { goto fallback; } - tcp_heuristic_unlock(head); + tpheur->th_mptcp_in_backoff = 0; - return TRUE; +mptcp_ok: + if (tpheur->th_mptcp_heuristic_disabled) { + ret = -1; + + if (TSTMP_GT(tcp_now, tpheur->th_mptcp_backoff)) { + tpheur->th_mptcp_heuristic_disabled = 0; + tpheur->th_mptcp_success = 0; + } + } + + tcp_heuristic_unlock(head); + return ret; fallback: if (head) { @@ -1080,7 +1188,7 @@ fallback: tcpstat.tcps_mptcp_heuristic_fallback++; } - return FALSE; + return 1; } static boolean_t @@ -1113,6 +1221,9 @@ tcp_heuristic_do_ecn_common(struct tcp_cache_key_src *tcks) if (tpheur->th_ecn_synrst >= ECN_RETRY_LIMIT) { tpheur->th_ecn_synrst = 0; } + + /* Make sure it follows along */ + tpheur->th_ecn_backoff = tcp_now; } tcp_heuristic_unlock(head); diff --git a/bsd/netinet/tcp_cache.h b/bsd/netinet/tcp_cache.h index d9344c84e..9259076ca 100644 --- a/bsd/netinet/tcp_cache.h +++ b/bsd/netinet/tcp_cache.h @@ -51,7 +51,7 @@ extern void tcp_heuristic_tfo_success(struct tcpcb *tp); extern void tcp_heuristic_mptcp_success(struct tcpcb *tp); extern void tcp_heuristic_ecn_success(struct tcpcb *tp); extern boolean_t tcp_heuristic_do_tfo(struct tcpcb *tp); -extern boolean_t tcp_heuristic_do_mptcp(struct tcpcb *tp); +extern int tcp_heuristic_do_mptcp(struct tcpcb *tp); extern boolean_t tcp_heuristic_do_ecn(struct tcpcb *tp); extern void tcp_heuristic_ecn_droprst(struct tcpcb *tp); extern void tcp_heuristic_ecn_droprxmt(struct tcpcb *tp); diff --git a/bsd/netinet/tcp_cc.c b/bsd/netinet/tcp_cc.c index 3512bc9a1..2eb6faf90 100644 --- a/bsd/netinet/tcp_cc.c +++ b/bsd/netinet/tcp_cc.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2013-2017 Apple Inc. All rights reserved. + * Copyright (c) 2013-2018 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -42,40 +42,6 @@ #include #include -struct tcp_cc_debug_state { - u_int64_t ccd_tsns; - char ccd_srcaddr[INET6_ADDRSTRLEN]; - uint16_t ccd_srcport; - char ccd_destaddr[INET6_ADDRSTRLEN]; - uint16_t ccd_destport; - uint32_t ccd_snd_cwnd; - uint32_t ccd_snd_wnd; - uint32_t ccd_snd_ssthresh; - uint32_t ccd_pipeack; - uint32_t ccd_rttcur; - uint32_t ccd_rxtcur; - uint32_t ccd_srtt; - uint32_t ccd_event; - uint32_t ccd_sndcc; - uint32_t ccd_sndhiwat; - uint32_t ccd_bytes_acked; - u_int8_t ccd_cc_index; - u_int8_t ccd_unused_1__; - u_int16_t ccd_unused_2__; - union { - struct { - uint32_t ccd_last_max; - uint32_t ccd_tcp_win; - uint32_t ccd_target_win; - uint32_t ccd_avg_lastmax; - uint32_t ccd_mean_deviation; - } cubic_state; - struct { - u_int32_t led_base_rtt; - } ledbat_state; - } u; -}; - SYSCTL_SKMEM_TCP_INT(OID_AUTO, cc_debug, CTLFLAG_RW | CTLFLAG_LOCKED, int, tcp_cc_debug, 0, "Enable debug data collection"); @@ -113,8 +79,6 @@ SYSCTL_INT(_net_inet_tcp, OID_AUTO, cwnd_nonvalidated, struct tcp_cc_algo* tcp_cc_algo_list[TCP_CC_ALGO_COUNT]; struct zone *tcp_cc_zone; -/* Information for colelcting TCP debug information using control socket */ -#define TCP_CCDEBUG_CONTROL_NAME "com.apple.network.tcp_ccdebug" #define TCP_CCDBG_NOUNIT 0xffffffff static kern_ctl_ref tcp_ccdbg_ctlref = NULL; volatile UInt32 tcp_ccdbg_unit = TCP_CCDBG_NOUNIT; @@ -151,12 +115,13 @@ tcp_cc_control_register(void) errno_t err; bzero(&ccdbg_control, sizeof(ccdbg_control)); - strlcpy(ccdbg_control.ctl_name, TCP_CCDEBUG_CONTROL_NAME, + strlcpy(ccdbg_control.ctl_name, TCP_CC_CONTROL_NAME, sizeof(ccdbg_control.ctl_name)); ccdbg_control.ctl_connect = tcp_ccdbg_control_connect; ccdbg_control.ctl_disconnect = tcp_ccdbg_control_disconnect; ccdbg_control.ctl_flags |= CTL_FLAG_PRIVILEGED; ccdbg_control.ctl_flags |= CTL_FLAG_REG_SOCK_STREAM; + ccdbg_control.ctl_sendsize = 32 * 1024; err = ctl_register(&ccdbg_control, &tcp_ccdbg_ctlref); if (err != 0) { @@ -340,7 +305,7 @@ tcp_cc_cwnd_init_or_reset(struct tcpcb *tp) /* * Indicate whether this ack should be delayed. * Here is the explanation for different settings of tcp_delack_enabled: - * - when set to 1, the bhavior is same as when set to 2. We kept this + * - when set to 1, the behavior is same as when set to 2. We kept this * for binary compatibility. * - when set to 2, will "ack every other packet" * - if our last ack wasn't a 0-sized window. @@ -372,8 +337,8 @@ tcp_cc_delay_ack(struct tcpcb *tp, struct tcphdr *th) if ((tp->t_flags & TF_RXWIN0SENT) == 0 && (th->th_flags & TH_PUSH) == 0 && ((tp->t_unacksegs == 1) || - ((tp->t_flags & TF_STRETCHACK) != 0 && - tp->t_unacksegs < (maxseg_unacked)))) { + ((tp->t_flags & TF_STRETCHACK) && + tp->t_unacksegs < maxseg_unacked))) { return 1; } break; diff --git a/bsd/netinet/tcp_cc.h b/bsd/netinet/tcp_cc.h index 8a1f584ad..3f484dac0 100644 --- a/bsd/netinet/tcp_cc.h +++ b/bsd/netinet/tcp_cc.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2010-2014 Apple Inc. All rights reserved. + * Copyright (c) 2010-2018 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -61,18 +61,108 @@ #ifndef _NETINET_CC_H_ #define _NETINET_CC_H_ -#ifdef KERNEL -#include -#include -#include +#ifdef PRIVATE + +#include + +/* + * Data structure to collect and display congestion control debug information + */ +struct tcp_cc_debug_state { + u_int64_t ccd_tsns; + char ccd_srcaddr[INET6_ADDRSTRLEN]; + uint16_t ccd_srcport; + char ccd_destaddr[INET6_ADDRSTRLEN]; + uint16_t ccd_destport; + uint32_t ccd_snd_cwnd; + uint32_t ccd_snd_wnd; + uint32_t ccd_snd_ssthresh; + uint32_t ccd_pipeack; + uint32_t ccd_rttcur; + uint32_t ccd_rxtcur; + uint32_t ccd_srtt; + uint32_t ccd_event; + uint32_t ccd_sndcc; + uint32_t ccd_sndhiwat; + uint32_t ccd_bytes_acked; + u_int8_t ccd_cc_index; + u_int8_t ccd_unused_1__; + u_int16_t ccd_unused_2__; + union { + struct { + uint32_t ccd_last_max; + uint32_t ccd_tcp_win; + uint32_t ccd_target_win; + uint32_t ccd_avg_lastmax; + uint32_t ccd_mean_deviation; + } cubic_state; + struct { + u_int32_t led_base_rtt; + } ledbat_state; + } u; +}; +/* + * Values of ccd_cc_index + */ #define TCP_CC_ALGO_NONE 0 #define TCP_CC_ALGO_NEWRENO_INDEX 1 #define TCP_CC_ALGO_BACKGROUND_INDEX 2 /* CC for background transport */ #define TCP_CC_ALGO_CUBIC_INDEX 3 /* default CC algorithm */ #define TCP_CC_ALGO_COUNT 4 /* Count of CC algorithms */ -#define TCP_CA_NAME_MAX 16 /* Maximum characters in the name of a CC algorithm */ +/* + * Values of ccd_event + */ +#define TCP_CC_EVENT_LIST \ + X(TCP_CC_CWND_INIT) \ + X(TCP_CC_INSEQ_ACK_RCVD) \ + X(TCP_CC_ACK_RCVD) \ + X(TCP_CC_ENTER_FASTRECOVERY) \ + X(TCP_CC_IN_FASTRECOVERY) \ + X(TCP_CC_EXIT_FASTRECOVERY) \ + X(TCP_CC_PARTIAL_ACK) \ + X(TCP_CC_IDLE_TIMEOUT) \ + X(TCP_CC_REXMT_TIMEOUT) \ + X(TCP_CC_ECN_RCVD) \ + X(TCP_CC_BAD_REXMT_RECOVERY) \ + X(TCP_CC_OUTPUT_ERROR) \ + X(TCP_CC_CHANGE_ALGO) \ + X(TCP_CC_FLOW_CONTROL) \ + X(TCP_CC_SUSPEND) \ + X(TCP_CC_LIMITED_TRANSMIT) \ + X(TCP_CC_EARLY_RETRANSMIT) \ + X(TCP_CC_TLP_RECOVERY) \ + X(TCP_CC_TLP_RECOVER_LASTPACKET) \ + X(TCP_CC_DELAY_FASTRECOVERY) \ + X(TCP_CC_TLP_IN_FASTRECOVERY) \ + X(TCP_CC_DSACK_BAD_REXMT) \ + X(TCP_CC_FIRST_REXMT) \ + X(MAX_TCP_CC_EVENTS) + +enum tcp_cc_event { +#define X(name, ...) name, + TCP_CC_EVENT_LIST +#undef X +}; + +/* + * Kernel control ID + */ +#define TCP_CC_CONTROL_NAME "com.apple.network.tcp_ccdebug" + +#endif /* PRIVATE */ + +#ifdef KERNEL_PRIVATE + +#include +#include +#include + +/* + * Maximum characters in the name of a CC algorithm + */ +#define TCP_CA_NAME_MAX 16 extern int tcp_recv_bg; @@ -153,5 +243,5 @@ extern void tcp_cc_adjust_nonvalidated_cwnd(struct tcpcb *tp); extern u_int32_t tcp_get_max_pipeack(struct tcpcb *tp); extern void tcp_clear_pipeack_state(struct tcpcb *tp); -#endif /* KERNEL */ +#endif /* KERNEL_PRIVATE */ #endif /* _NETINET_CC_H_ */ diff --git a/bsd/netinet/tcp_cubic.c b/bsd/netinet/tcp_cubic.c index 2d1ad246b..a347a0dcb 100644 --- a/bsd/netinet/tcp_cubic.c +++ b/bsd/netinet/tcp_cubic.c @@ -85,9 +85,9 @@ struct tcp_cc_algo tcp_cc_cubic = { .switch_to = tcp_cubic_switch_cc }; -const float tcp_cubic_backoff = 0.2; /* multiplicative decrease factor */ -const float tcp_cubic_coeff = 0.4; -const float tcp_cubic_fast_convergence_factor = 0.875; +const float tcp_cubic_backoff = 0.2f; /* multiplicative decrease factor */ +const float tcp_cubic_coeff = 0.4f; +const float tcp_cubic_fast_convergence_factor = 0.875f; SYSCTL_SKMEM_TCP_INT(OID_AUTO, cubic_tcp_friendliness, CTLFLAG_RW | CTLFLAG_LOCKED, static int, tcp_cubic_tcp_friendliness, 0, "Enable TCP friendliness"); diff --git a/bsd/netinet/tcp_input.c b/bsd/netinet/tcp_input.c index 8990100b8..d7d04516a 100644 --- a/bsd/netinet/tcp_input.c +++ b/bsd/netinet/tcp_input.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2017 Apple Inc. All rights reserved. + * Copyright (c) 2000-2019 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -91,7 +91,9 @@ #include #include #include +#include #include +#include #include #include @@ -127,6 +129,7 @@ u_char tcp_saveipgen[40]; /* the size must be of max ip header, now IPv6 */ struct tcphdr tcp_savetcp; #endif /* TCPDEBUG */ +#include #if IPSEC #include @@ -291,7 +294,6 @@ SYSCTL_SKMEM_TCP_INT(OID_AUTO, do_rfc5961, CTLFLAG_RW | CTLFLAG_LOCKED, static int, tcp_do_rfc5961, 1, "Enable/Disable full RFC 5961 compliance"); -extern int tcp_TCPTV_MIN; extern int tcp_acc_iaj_high; extern int tcp_acc_iaj_react_limit; @@ -310,8 +312,6 @@ static void tcp_dooptions(struct tcpcb *, u_char *, int, struct tcphdr *, static void tcp_finalize_options(struct tcpcb *, struct tcpopt *, unsigned int); static void tcp_pulloutofband(struct socket *, struct tcphdr *, struct mbuf *, int); -static int tcp_reass(struct tcpcb *, struct tcphdr *, int *, struct mbuf *, - struct ifnet *); static void tcp_xmit_timer(struct tcpcb *, int, u_int32_t, tcp_seq); static inline unsigned int tcp_maxmtu(struct rtentry *); static inline int tcp_stretch_ack_enable(struct tcpcb *tp, int thflags); @@ -637,7 +637,7 @@ tcp_bwmeas_check(struct tcpcb *tp) static int tcp_reass(struct tcpcb *tp, struct tcphdr *th, int *tlenp, struct mbuf *m, - struct ifnet *ifp) + struct ifnet *ifp, int *dowakeup) { struct tseg_qent *q; struct tseg_qent *p = NULL; @@ -646,7 +646,6 @@ tcp_reass(struct tcpcb *tp, struct tcphdr *th, int *tlenp, struct mbuf *m, struct inpcb *inp = tp->t_inpcb; struct socket *so = inp->inp_socket; int flags = 0; - int dowakeup = 0; struct mbuf *oodata = NULL; int copy_oodata = 0; u_int16_t qlimit; @@ -896,6 +895,13 @@ present: if (so->so_state & SS_CANTRCVMORE) { m_freem(q->tqe_m); } else { + /* + * The mbuf may be freed after it has been added to the + * receive socket buffer so we reinitialize th to point + * to a safe copy of the TCP header + */ + struct tcphdr saved_tcphdr = {}; + so_recv_data_stat(so, q->tqe_m, 0); /* XXXX */ if (so->so_flags & SOF_ENABLE_MSGS) { /* @@ -911,10 +917,13 @@ present: copy_oodata = 0; } } + memcpy(&saved_tcphdr, th, sizeof(struct tcphdr)); if (sbappendstream_rcvdemux(so, q->tqe_m, q->tqe_th->th_seq - (tp->irs + 1), 0)) { - dowakeup = 1; + *dowakeup = 1; } + th = &saved_tcphdr; + if (tp->t_flagsext & TF_LRO_OFFLOADED) { tcp_update_lro_seq(tp->rcv_nxt, inp->inp_laddr, inp->inp_faddr, @@ -955,7 +964,7 @@ msg_unordered_delivery: if (oodata != NULL) { if (sbappendmsgstream_rcv(&so->so_rcv, oodata, te->tqe_th->th_seq - (tp->irs + 1), 1)) { - dowakeup = 1; + *dowakeup = 1; tcpstat.tcps_msg_unopkts++; } else { tcpstat.tcps_msg_unoappendfail++; @@ -963,9 +972,6 @@ msg_unordered_delivery: } } - if (dowakeup) { - sorwakeup(so); /* done with socket lock held */ - } return flags; } @@ -1186,7 +1192,7 @@ tcp_sbrcv_grow(struct tcpcb *tp, struct sockbuf *sbrcv, */ if (TSTMP_GEQ(tcp_now, tp->rfbuf_ts + TCPTV_RCVNOTS_QUANTUM)) { - if (tp->rfbuf_cnt >= TCP_RCVNOTS_BYTELEVEL) { + if (tp->rfbuf_cnt + pktlen >= TCP_RCVNOTS_BYTELEVEL) { tcp_sbrcv_reserve(tp, sbrcv, tcp_autorcvbuf_max, 0, tcp_autorcvbuf_max); @@ -1207,8 +1213,9 @@ tcp_sbrcv_grow(struct tcpcb *tp, struct sockbuf *sbrcv, * on the link. */ if (TSTMP_GEQ(to->to_tsecr, tp->rfbuf_ts)) { - if (tp->rfbuf_cnt > (sbrcv->sb_hiwat - + if (tp->rfbuf_cnt + pktlen > (sbrcv->sb_hiwat - (sbrcv->sb_hiwat >> 1))) { + tp->rfbuf_cnt += pktlen; int32_t rcvbuf_inc, min_incr; /* * Increment the receive window by a @@ -1238,7 +1245,7 @@ tcp_sbrcv_grow(struct tcpcb *tp, struct sockbuf *sbrcv, (rcvbuf_inc / tp->t_maxseg) * tp->t_maxseg; tcp_sbrcv_reserve(tp, sbrcv, sbrcv->sb_hiwat + rcvbuf_inc, - (tp->rfbuf_cnt * 2), rcvbuf_max); + (tp->rfbuf_cnt << 1), rcvbuf_max); } /* Measure instantaneous receive bandwidth */ if (tp->t_bwmeas != NULL && tp->rfbuf_cnt > 0 && @@ -1429,6 +1436,7 @@ tcp_reset_stretch_ack(struct tcpcb *tp) { tp->t_flags &= ~(TF_STRETCHACK | TF_STREAMING_ON); tp->rcv_by_unackwin = 0; + tp->rcv_by_unackhalfwin = 0; tp->rcv_unackwin = tcp_now + tcp_rcvunackwin; /* @@ -1802,7 +1810,8 @@ tcp_tfo_synack(struct tcpcb *tp, struct tcpopt *to) * rexmit the SYN. If that's the case, it's better to start * backing of TFO-cookie requests. */ - if (tp->t_tfo_flags & TFO_F_SYN_LOSS) { + if (!(tp->t_flagsext & TF_FASTOPEN_FORCE_ENABLE) && + tp->t_tfo_flags & TFO_F_SYN_LOSS) { tp->t_tfo_stats |= TFO_S_SYN_LOSS; tcpstat.tcps_tfo_syn_loss++; @@ -1892,6 +1901,17 @@ tcp_update_window(struct tcpcb *tp, int thflags, struct tcphdr * th, return false; } +static void +tcp_handle_wakeup(struct socket *so, int read_wakeup, int write_wakeup) +{ + if (read_wakeup != 0) { + sorwakeup(so); + } + if (write_wakeup != 0) { + sowwakeup(so); + } +} + void tcp_input(struct mbuf *m, int off0) { @@ -1906,6 +1926,8 @@ tcp_input(struct mbuf *m, int off0) int thflags; struct socket *so = 0; int todrop, acked, ourfinisacked, needoutput = 0; + int read_wakeup = 0; + int write_wakeup = 0; struct in_addr laddr; #if INET6 struct in6_addr laddr6; @@ -1936,7 +1958,22 @@ tcp_input(struct mbuf *m, int off0) boolean_t wired = (!wifi && IFNET_IS_WIRED(ifp)); boolean_t recvd_dsack = FALSE; struct tcp_respond_args tra; + int prev_t_state; + boolean_t check_cfil = cfil_filter_present(); bool findpcb_iterated = false; + /* + * The mbuf may be freed after it has been added to the receive socket + * buffer or the reassembly queue, so we reinitialize th to point to a + * safe copy of the TCP header + */ + struct tcphdr saved_tcphdr = {}; + /* + * Save copy of the IPv4/IPv6 header. + * Note: use array of uint32_t to silence compiler warning when casting + * to a struct ip6_hdr pointer. + */ +#define MAX_IPWORDS ((sizeof(struct ip) + MAX_IPOPTLEN) / sizeof(uint32_t)) + uint32_t saved_hdr[MAX_IPWORDS]; #define TCP_INC_VAR(stat, npkts) do { \ stat += npkts; \ @@ -1988,6 +2025,7 @@ tcp_input(struct mbuf *m, int off0) th = (struct tcphdr *)(void *)((caddr_t)ip6 + off0); if (tcp_input_checksum(AF_INET6, m, th, off0, tlen)) { + TCP_LOG_DROP_PKT(ip6, th, ifp, "IPv6 bad tcp checksum"); goto dropnosock; } @@ -2005,6 +2043,7 @@ tcp_input(struct mbuf *m, int off0) if (IN6_IS_ADDR_UNSPECIFIED(&ip6->ip6_src)) { /* XXX stat */ IF_TCP_STATINC(ifp, unspecv6); + TCP_LOG_DROP_PKT(ip6, th, ifp, "src IPv6 address unspecified"); goto dropnosock; } DTRACE_TCP5(receive, struct mbuf *, m, struct inpcb *, NULL, @@ -2038,6 +2077,7 @@ tcp_input(struct mbuf *m, int off0) tlen = ip->ip_len; if (tcp_input_checksum(AF_INET, m, th, off0, tlen)) { + TCP_LOG_DROP_PKT(ip, th, ifp, "IPv4 bad tcp checksum"); goto dropnosock; } @@ -2055,6 +2095,8 @@ tcp_input(struct mbuf *m, int off0) th->th_seq, th->th_ack, th->th_win); } +#define TCP_LOG_HDR (isipv6 ? (void *)ip6 : (void *)ip) + /* * Check that TCP offset makes sense, * pull out TCP options and adjust length. @@ -2063,6 +2105,7 @@ tcp_input(struct mbuf *m, int off0) if (off < sizeof(struct tcphdr) || off > tlen) { tcpstat.tcps_rcvbadoff++; IF_TCP_STATINC(ifp, badformat); + TCP_LOG_DROP_PKT(TCP_LOG_HDR, th, ifp, "bad tcp offset"); goto dropnosock; } tlen -= off; /* tlen is used instead of ti->ti_len */ @@ -2116,6 +2159,7 @@ tcp_input(struct mbuf *m, int off0) */ if (drop_synfin && (thflags & (TH_SYN | TH_FIN)) == (TH_SYN | TH_FIN)) { IF_TCP_STATINC(ifp, synfin); + TCP_LOG_DROP_PKT(TCP_LOG_HDR, th, ifp, "drop SYN FIN"); goto dropnosock; } #endif @@ -2286,18 +2330,22 @@ findpcb: switch (blackhole) { case 1: if (thflags & TH_SYN) { + TCP_LOG_DROP_PKT(TCP_LOG_HDR, th, ifp, "blackhole 1 syn for closed port"); goto dropnosock; } break; case 2: + TCP_LOG_DROP_PKT(TCP_LOG_HDR, th, ifp, "blackhole 2 closed port"); goto dropnosock; default: + TCP_LOG_DROP_PKT(TCP_LOG_HDR, th, ifp, "blackhole closed port"); goto dropnosock; } } } rstreason = BANDLIM_RST_CLOSEDPORT; IF_TCP_STATINC(ifp, noconnnolist); + TCP_LOG_DROP_PKT(TCP_LOG_HDR, th, ifp, "closed port"); goto dropwithresetnosock; } so = inp->inp_socket; @@ -2311,6 +2359,7 @@ findpcb: #if TEMPDEBUG printf("tcp_input: no more socket for inp=%x. This shouldn't happen\n", inp); #endif + TCP_LOG_DROP_PKT(TCP_LOG_HDR, th, ifp, "inp_socket NULL"); goto dropnosock; } @@ -2318,6 +2367,7 @@ findpcb: if (in_pcb_checkstate(inp, WNT_RELEASE, 1) == WNT_STOPUSING) { socket_unlock(so, 1); inp = NULL; // pretend we didn't find it + TCP_LOG_DROP_PKT(TCP_LOG_HDR, th, ifp, "inp state WNT_STOPUSING"); goto dropnosock; } @@ -2357,20 +2407,49 @@ findpcb: } } + tp = intotcpcb(inp); + if (tp == NULL) { + rstreason = BANDLIM_RST_CLOSEDPORT; + IF_TCP_STATINC(ifp, noconnlist); + TCP_LOG_DROP_PKT(TCP_LOG_HDR, th, ifp, "tp is NULL"); + goto dropwithreset; + } + + TCP_LOG_TH_FLAGS(TCP_LOG_HDR, th, tp, false, ifp); + + if (tp->t_state == TCPS_CLOSED) { + TCP_LOG_DROP_PCB(TCP_LOG_HDR, th, tp, false, "tp state TCPS_CLOSED"); + goto drop; + } + #if NECP if (so->so_state & SS_ISCONNECTED) { // Connected TCP sockets have a fully-bound local and remote, // so the policy check doesn't need to override addresses - if (!necp_socket_is_allowed_to_send_recv(inp, NULL, NULL, NULL)) { + if (!necp_socket_is_allowed_to_send_recv(inp, ifp, NULL, NULL, NULL)) { + TCP_LOG_DROP_NECP(TCP_LOG_HDR, th, intotcpcb(inp), false); IF_TCP_STATINC(ifp, badformat); goto drop; } } else { + /* + * If the proc_uuid_policy table has been updated since the last use + * of the listening socket (i.e., the proc_uuid_policy_table_gencount + * has been updated), the flags in the socket may be out of date. + * If INP2_WANT_APP_POLICY is stale, inbound packets may + * be dropped by NECP if the socket should now match a per-app + * exception policy. + * In order to avoid this refresh the proc_uuid_policy state to + * potentially recalculate the socket's flags before checking + * with NECP. + */ + (void) inp_update_policy(inp); #if INET6 if (isipv6) { if (!necp_socket_is_allowed_to_send_recv_v6(inp, th->th_dport, th->th_sport, &ip6->ip6_dst, &ip6->ip6_src, ifp, NULL, NULL, NULL)) { + TCP_LOG_DROP_NECP(TCP_LOG_HDR, th, intotcpcb(inp), false); IF_TCP_STATINC(ifp, badformat); goto drop; } @@ -2380,6 +2459,7 @@ findpcb: if (!necp_socket_is_allowed_to_send_recv_v4(inp, th->th_dport, th->th_sport, &ip->ip_dst, &ip->ip_src, ifp, NULL, NULL, NULL)) { + TCP_LOG_DROP_NECP(TCP_LOG_HDR, th, intotcpcb(inp), false); IF_TCP_STATINC(ifp, badformat); goto drop; } @@ -2387,18 +2467,11 @@ findpcb: } #endif /* NECP */ - tp = intotcpcb(inp); - if (tp == 0) { - rstreason = BANDLIM_RST_CLOSEDPORT; - IF_TCP_STATINC(ifp, noconnlist); - goto dropwithreset; - } - if (tp->t_state == TCPS_CLOSED) { - goto drop; - } + prev_t_state = tp->t_state; /* If none of the FIN|SYN|RST|ACK flag is set, drop */ if (tcp_do_rfc5961 && (thflags & TH_ACCEPT) == 0) { + TCP_LOG_DROP_PCB(TCP_LOG_HDR, th, tp, false, "rfc5961 TH_ACCEPT == 0"); goto drop; } @@ -2409,8 +2482,10 @@ findpcb: tiwin = th->th_win; } + #if CONFIG_MACF_NET if (mac_inpcb_check_deliver(inp, m, AF_INET, SOCK_STREAM)) { + TCP_LOG_DROP_PCB(TCP_LOG_HDR, th, tp, false, "mac_inpcb_check_deliver failed"); goto drop; } #endif @@ -2418,6 +2493,7 @@ findpcb: /* Avoid processing packets while closing a listen socket */ if (tp->t_state == TCPS_LISTEN && (so->so_options & SO_ACCEPTCONN) == 0) { + TCP_LOG_DROP_PCB(TCP_LOG_HDR, th, tp, false, "closing a listening socket"); goto drop; } @@ -2440,13 +2516,15 @@ findpcb: struct socket *so2; struct socket *oso; struct sockaddr_storage from; + struct sockaddr_storage to2; #if INET6 struct inpcb *oinp = sotoinpcb(so); #endif /* INET6 */ struct ifnet *head_ifscope; unsigned int head_nocell, head_recvanyif, head_noexpensive, head_awdl_unrestricted, - head_intcoproc_allowed; + head_intcoproc_allowed, head_external_port, + head_noconstrained; /* Get listener's bound-to-interface, if any */ head_ifscope = (inp->inp_flags & INP_BOUND_IF) ? @@ -2457,8 +2535,10 @@ findpcb: head_recvanyif = (inp->inp_flags & INP_RECV_ANYIF); /* Get listener's no-expensive information, if any */ head_noexpensive = INP_NO_EXPENSIVE(inp); + head_noconstrained = INP_NO_CONSTRAINED(inp); head_awdl_unrestricted = INP_AWDL_UNRESTRICTED(inp); head_intcoproc_allowed = INP_INTCOPROC_ALLOWED(inp); + head_external_port = (inp->inp_flags2 & INP2_EXTERNAL_PORT); /* * If the state is LISTEN then ignore segment if it contains an RST. @@ -2470,9 +2550,11 @@ findpcb: IF_TCP_STATINC(ifp, listbadsyn); if (thflags & TH_RST) { + TCP_LOG_DROP_PCB(TCP_LOG_HDR, th, tp, false, "SYN with RST"); goto drop; } if (thflags & TH_ACK) { + TCP_LOG_DROP_PCB(TCP_LOG_HDR, th, tp, false, "SYN with ACK"); tp = NULL; tcpstat.tcps_badsyn++; rstreason = BANDLIM_RST_OPENPORT; @@ -2481,6 +2563,7 @@ findpcb: /* We come here if there is no SYN set */ tcpstat.tcps_badsyn++; + TCP_LOG_DROP_PCB(TCP_LOG_HDR, th, tp, false, "bad SYN"); goto drop; } KERNEL_DEBUG(DBG_FNC_TCP_NEWCONN | DBG_FUNC_START, 0, 0, 0, 0, 0); @@ -2489,11 +2572,13 @@ findpcb: if (isipv6) { if (IN6_ARE_ADDR_EQUAL(&ip6->ip6_dst, &ip6->ip6_src)) { + TCP_LOG_DROP_PCB(TCP_LOG_HDR, th, tp, false, "bad tuple same port"); goto drop; } } else #endif /* INET6 */ if (ip->ip_dst.s_addr == ip->ip_src.s_addr) { + TCP_LOG_DROP_PCB(TCP_LOG_HDR, th, tp, false, "bad tuple same IPv4 address"); goto drop; } } @@ -2506,12 +2591,14 @@ findpcb: * be discarded. */ if (m->m_flags & (M_BCAST | M_MCAST)) { + TCP_LOG_DROP_PCB(TCP_LOG_HDR, th, tp, false, "mbuf M_BCAST | M_MCAST"); goto drop; } #if INET6 if (isipv6) { if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst) || IN6_IS_ADDR_MULTICAST(&ip6->ip6_src)) { + TCP_LOG_DROP_PCB(TCP_LOG_HDR, th, tp, false, "IN6_IS_ADDR_MULTICAST"); goto drop; } } else @@ -2520,6 +2607,7 @@ findpcb: IN_MULTICAST(ntohl(ip->ip_src.s_addr)) || ip->ip_src.s_addr == htonl(INADDR_BROADCAST) || in_broadcast(ip->ip_dst, m->m_pkthdr.rcvif)) { + TCP_LOG_DROP_PCB(TCP_LOG_HDR, th, tp, false, "multicast or broadcast address"); goto drop; } @@ -2549,12 +2637,13 @@ findpcb: tp = NULL; rstreason = BANDLIM_RST_OPENPORT; IF_TCP_STATINC(ifp, deprecate6); + TCP_LOG_DROP_PCB(TCP_LOG_HDR, th, tp, false, "deprecated IPv6 address"); goto dropwithreset; } } } #endif - if (so->so_filt) { + if (so->so_filt || check_cfil) { #if INET6 if (isipv6) { struct sockaddr_in6 *sin6 = (struct sockaddr_in6*)&from; @@ -2565,6 +2654,15 @@ findpcb: sin6->sin6_flowinfo = 0; sin6->sin6_addr = ip6->ip6_src; sin6->sin6_scope_id = 0; + + sin6 = (struct sockaddr_in6*)&to2; + + sin6->sin6_len = sizeof(struct sockaddr_in6); + sin6->sin6_family = AF_INET6; + sin6->sin6_port = th->th_dport; + sin6->sin6_flowinfo = 0; + sin6->sin6_addr = ip6->ip6_dst; + sin6->sin6_scope_id = 0; } else #endif { @@ -2574,7 +2672,17 @@ findpcb: sin->sin_family = AF_INET; sin->sin_port = th->th_sport; sin->sin_addr = ip->ip_src; + + sin = (struct sockaddr_in*)&to2; + + sin->sin_len = sizeof(struct sockaddr_in); + sin->sin_family = AF_INET; + sin->sin_port = th->th_dport; + sin->sin_addr = ip->ip_dst; } + } + + if (so->so_filt) { so2 = sonewconn(so, 0, (struct sockaddr*)&from); } else { so2 = sonewconn(so, 0, NULL); @@ -2589,6 +2697,7 @@ findpcb: } } if (!so2) { + TCP_LOG_DROP_PCB(TCP_LOG_HDR, th, tp, false, " listen drop"); goto drop; } } @@ -2635,6 +2744,9 @@ findpcb: if (head_noexpensive) { inp_set_noexpensive(inp); } + if (head_noconstrained) { + inp_set_noconstrained(inp); + } if (head_awdl_unrestricted) { inp_set_awdl_unrestricted(inp); } @@ -2649,6 +2761,10 @@ findpcb: } else { inp->inp_flags &= ~INP_RECV_ANYIF; } + + if (head_external_port) { + inp->inp_flags2 |= INP2_EXTERNAL_PORT; + } #if INET6 if (isipv6) { inp->in6p_laddr = ip6->ip6_dst; @@ -2675,6 +2791,7 @@ findpcb: inp->inp_lport = 0; socket_lock(oso, 0); /* release ref on parent */ socket_unlock(oso, 1); + TCP_LOG_DROP_PCB(TCP_LOG_HDR, th, tp, false, " in_pcbinshash failed"); goto drop; } #if INET6 @@ -2742,11 +2859,29 @@ findpcb: tcp_set_max_rwinscale(tp, so, ifp); +#if CONTENT_FILTER + if (check_cfil) { + int error = cfil_sock_attach(so2, (struct sockaddr*)&to2, (struct sockaddr*)&from, + CFS_CONNECTION_DIR_IN); + if (error != 0) { + TCP_LOG_DROP_PCB(TCP_LOG_HDR, th, tp, false, " cfil_sock_attach failed"); + goto drop; + } + } +#endif /* CONTENT_FILTER */ + KERNEL_DEBUG(DBG_FNC_TCP_NEWCONN | DBG_FUNC_END, 0, 0, 0, 0, 0); } } socket_lock_assert_owned(so); + if (net_mpklog_enabled && (m->m_pkthdr.rcvif->if_xflags & IFXF_MPK_LOG)) { + MPKL_TCP_INPUT(tcp_mpkl_log_object, + ntohs(tp->t_inpcb->inp_lport), ntohs(tp->t_inpcb->inp_fport), + th->th_seq, th->th_ack, tlen, thflags, + so->last_pid, so->so_log_seqn++); + } + if (tp->t_state == TCPS_ESTABLISHED && tlen > 0) { /* * Evaluate the rate of arrival of packets to see if the @@ -2764,11 +2899,13 @@ findpcb: } else { tp->t_flags &= ~(TF_STRETCHACK); } - if (TSTMP_GT(tp->rcv_unackwin, tcp_now)) { + if (TSTMP_GT(tp->rcv_unackwin - (tcp_rcvunackwin >> 1), tcp_now)) { + tp->rcv_by_unackhalfwin += (tlen + off); tp->rcv_by_unackwin += (tlen + off); } else { tp->rcv_unackwin = tcp_now + tcp_rcvunackwin; - tp->rcv_by_unackwin = tlen + off; + tp->rcv_by_unackwin = tp->rcv_by_unackhalfwin + tlen + off; + tp->rcv_by_unackhalfwin = tlen + off; } } @@ -2780,13 +2917,13 @@ findpcb: } /* * Explicit Congestion Notification - Flag that we need to send ECT if - * + The IP Congestion experienced flag was set. - * + Socket is in established state - * + We negotiated ECN in the TCP setup - * + This isn't a pure ack (tlen > 0) - * + The data is in the valid window + * + The IP Congestion experienced flag was set. + * + Socket is in established state + * + We negotiated ECN in the TCP setup + * + This isn't a pure ack (tlen > 0) + * + The data is in the valid window * - * TE_SENDECE will be cleared when we receive a packet with TH_CWR set. + * TE_SENDECE will be cleared when we receive a packet with TH_CWR set. */ if (ip_ecn == IPTOS_ECN_CE && tp->t_state == TCPS_ESTABLISHED && TCP_ECN_ENABLED(tp) && tlen > 0 && @@ -3100,7 +3237,7 @@ findpcb: tcp_bwmeas_check(tp); } - sowwakeup(so); /* has to be done with socket lock held */ + write_wakeup = 1; if (!SLIST_EMPTY(&tp->t_notify_ack)) { tcp_notify_acknowledgement(tp, so); } @@ -3112,6 +3249,9 @@ findpcb: tcp_tfo_rcv_ack(tp, th); tcp_check_timer_state(tp); + + tcp_handle_wakeup(so, read_wakeup, write_wakeup); + socket_unlock(so, 1); KERNEL_DEBUG(DBG_FNC_TCP_INPUT | DBG_FUNC_END, 0, 0, 0, 0, 0); return; @@ -3203,10 +3343,21 @@ findpcb: * this socket, deliver the packet received as an * in-order message with sequence number attached to it. */ + if (isipv6) { + memcpy(&saved_hdr, ip6, sizeof(struct ip6_hdr)); + ip6 = (struct ip6_hdr *)&saved_hdr[0]; + } else { + memcpy(&saved_hdr, ip, ip->ip_hl << 2); + ip = (struct ip *)&saved_hdr[0]; + } + memcpy(&saved_tcphdr, th, sizeof(struct tcphdr)); if (sbappendstream_rcvdemux(so, m, th->th_seq - (tp->irs + 1), 0)) { - sorwakeup(so); + mptcp_handle_input(so); + read_wakeup = 1; } + th = &saved_tcphdr; + #if INET6 if (isipv6) { KERNEL_DEBUG(DBG_LAYER_END, ((th->th_dport << 16) | th->th_sport), @@ -3237,6 +3388,9 @@ findpcb: } tcp_check_timer_state(tp); + + tcp_handle_wakeup(so, read_wakeup, write_wakeup); + socket_unlock(so, 1); KERNEL_DEBUG(DBG_FNC_TCP_INPUT | DBG_FUNC_END, 0, 0, 0, 0, 0); return; @@ -3266,9 +3420,10 @@ findpcb: */ if ((tp->t_mpflags & TMPF_MPTCP_TRUE) && (mp_tp = tptomptp(tp))) { - mpte_lock_assert_held(mp_tp->mpt_mpte); - if (tp->rcv_wnd > mp_tp->mpt_rcvwnd) { - tp->rcv_wnd = imax(mp_tp->mpt_rcvwnd, (int)(tp->rcv_adv - tp->rcv_nxt)); + socket_lock_assert_owned(mptetoso(mp_tp->mpt_mpte)); + + if (tp->rcv_wnd > (int)(mp_tp->mpt_rcvadv - (uint32_t)mp_tp->mpt_rcvnxt)) { + tp->rcv_wnd = mp_tp->mpt_rcvadv - (uint32_t)mp_tp->mpt_rcvnxt; tcpstat.tcps_mp_reducedwin++; } } @@ -3291,11 +3446,17 @@ findpcb: #endif socket_lock_assert_owned(so); + + /* Clear the logging flags inherited from the listening socket */ + tp->t_log_flags = 0; + tp->t_flagsext &= ~TF_LOGGED_CONN_SUMMARY; + #if INET6 if (isipv6) { MALLOC(sin6, struct sockaddr_in6 *, sizeof *sin6, M_SONAME, M_NOWAIT); if (sin6 == NULL) { + TCP_LOG_DROP_PCB(TCP_LOG_HDR, th, tp, false, "LISTEN malloc M_SONAME failed"); goto drop; } bzero(sin6, sizeof(*sin6)); @@ -3311,6 +3472,7 @@ findpcb: proc0)) { inp->in6p_laddr = laddr6; FREE(sin6, M_SONAME); + TCP_LOG_DROP_PCB(TCP_LOG_HDR, th, tp, false, " LISTEN in6_pcbconnect failed"); goto drop; } FREE(sin6, M_SONAME); @@ -3321,6 +3483,7 @@ findpcb: MALLOC(sin, struct sockaddr_in *, sizeof *sin, M_SONAME, M_NOWAIT); if (sin == NULL) { + TCP_LOG_DROP_PCB(TCP_LOG_HDR, th, tp, false, "LISTEN malloc M_SONAME failed"); goto drop; } sin->sin_family = AF_INET; @@ -3336,6 +3499,7 @@ findpcb: IFSCOPE_NONE, NULL)) { inp->inp_laddr = laddr; FREE(sin, M_SONAME); + TCP_LOG_DROP_PCB(TCP_LOG_HDR, th, tp, false, " LISTEN in_pcbconnect failed"); goto drop; } FREE(sin, M_SONAME); @@ -3371,6 +3535,7 @@ findpcb: tp->t_state = TCPS_SYN_RECEIVED; tp->t_timer[TCPT_KEEP] = OFFSET_FROM_START(tp, TCP_CONN_KEEPINIT(tp)); + tp->t_connect_time = tcp_now; dropsocket = 0; /* committed to socket */ if (inp->inp_flowhash == 0) { @@ -3394,6 +3559,11 @@ findpcb: tp->ecn_flags |= (TE_SETUPRECEIVED | TE_SENDIPECT); } + /* + * The address and connection state are finalized + */ + TCP_LOG_CONNECT(tp, false, 0); + goto trimthenstep6; } @@ -3407,6 +3577,7 @@ findpcb: SEQ_GT(th->th_ack, tp->snd_max))) { rstreason = BANDLIM_RST_OPENPORT; IF_TCP_STATINC(ifp, ooopacket); + TCP_LOG_DROP_PCB(TCP_LOG_HDR, th, tp, false, "SYN_RECEIVED bad ACK"); goto dropwithreset; } @@ -3441,11 +3612,13 @@ findpcb: SEQ_GT(th->th_ack, tp->snd_max))) { rstreason = BANDLIM_UNLIMITED; IF_TCP_STATINC(ifp, ooopacket); + TCP_LOG_DROP_PCB(TCP_LOG_HDR, th, tp, false, "SYN_SENT bad ACK"); goto dropwithreset; } if (thflags & TH_RST) { if ((thflags & TH_ACK) != 0) { - if (tfo_enabled(tp)) { + if (tfo_enabled(tp) && + !(tp->t_flagsext & TF_FASTOPEN_FORCE_ENABLE)) { tcp_heuristic_tfo_rst(tp); } if ((tp->ecn_flags & (TE_SETUPSENT | TE_RCVD_SYN_RST)) == TE_SETUPSENT) { @@ -3467,9 +3640,11 @@ findpcb: tp = tcp_drop(tp, ECONNREFUSED); postevent(so, 0, EV_RESET); } + TCP_LOG_DROP_PCB(TCP_LOG_HDR, th, tp, false, "SYN_SENT got RST"); goto drop; } if ((thflags & TH_SYN) == 0) { + TCP_LOG_DROP_PCB(TCP_LOG_HDR, th, tp, false, "SYN_SENT no SYN"); goto drop; } tp->snd_wnd = th->th_win; /* initial send window */ @@ -3531,12 +3706,15 @@ findpcb: * There is a middlebox that acks all but one * byte and still drops the data. */ - if ((tp->t_tfo_stats & TFO_S_SYN_DATA_SENT) && + if (!(tp->t_flagsext & TF_FASTOPEN_FORCE_ENABLE) && + (tp->t_tfo_stats & TFO_S_SYN_DATA_SENT) && tp->snd_max == th->th_ack + 1 && tp->snd_max > tp->snd_una + 1) { tcp_heuristic_tfo_middlebox(tp); so->so_error = ENODATA; + soevent(so, + (SO_FILT_HINT_LOCKED | SO_FILT_HINT_MP_SUB_ERROR)); tp->t_tfo_stats |= TFO_S_ONE_BYTE_PROXY; } @@ -3573,6 +3751,8 @@ findpcb: tp->t_state = TCPS_FIN_WAIT_1; tp->t_flags &= ~TF_NEEDFIN; thflags &= ~TH_SYN; + + TCP_LOG_CONNECTION_SUMMARY(tp); } else { DTRACE_TCP4(state__change, void, NULL, struct inpcb *, inp, struct tcpcb *, @@ -3692,10 +3872,12 @@ trimthenstep6: if (thflags & TH_SYN) { /* Drop the packet silently if we have reached the limit */ if (tcp_do_rfc5961 && tcp_is_ack_ratelimited(tp)) { + TCP_LOG_DROP_PCB(TCP_LOG_HDR, th, tp, false, "ESTABLISHED rfc5961 rate limited"); goto drop; } else { /* Send challenge ACK */ tcpstat.tcps_synchallenge++; + TCP_LOG_DROP_PCB(TCP_LOG_HDR, th, tp, false, "ESTABLISHED rfc5961 challenge ACK"); goto dropafterack; } } @@ -3791,6 +3973,7 @@ trimthenstep6: case TCPS_ESTABLISHED: if (tcp_do_rfc5961 == 0 && tp->last_ack_sent != th->th_seq) { tcpstat.tcps_badrst++; + TCP_LOG_DROP_PCB(TCP_LOG_HDR, th, tp, false, "ESTABLISHED rfc5961 bad RST"); goto drop; } if (TCP_ECN_ENABLED(tp) && @@ -3832,10 +4015,12 @@ close: tcpstat.tcps_badrst++; /* Drop if we have reached the ACK limit */ if (tcp_is_ack_ratelimited(tp)) { + TCP_LOG_DROP_PCB(TCP_LOG_HDR, th, tp, false, "ESTABLISHED rfc5961 rate limited"); goto drop; } else { /* Send challenge ACK */ tcpstat.tcps_rstchallenge++; + TCP_LOG_DROP_PCB(TCP_LOG_HDR, th, tp, false, "ESTABLISHED rfc5961 challenge ACK"); goto dropafterack; } } @@ -3911,6 +4096,7 @@ close: if (tp->t_state == TCPS_SYN_RECEIVED && SEQ_LT(th->th_seq, tp->irs)) { rstreason = BANDLIM_RST_OPENPORT; IF_TCP_STATINC(ifp, dospacket); + TCP_LOG_DROP_PCB(TCP_LOG_HDR, th, tp, false, "SYN_RECEIVED bad SEQ"); goto dropwithreset; } @@ -4018,15 +4204,18 @@ close: if (!(so->so_flags & SOF_MP_SUBFLOW) && (so->so_state & SS_NOFDREF) && tp->t_state > TCPS_CLOSE_WAIT) { + TCP_LOG_DROP_PCB(TCP_LOG_HDR, th, tp, false, "SS_NOFDREF"); close_it = TRUE; } if ((so->so_flags & SOF_MP_SUBFLOW) && (mptetoso(tptomptp(tp)->mpt_mpte)->so_state & SS_NOFDREF) && tp->t_state > TCPS_CLOSE_WAIT) { + TCP_LOG_DROP_PCB(TCP_LOG_HDR, th, tp, false, "SOF_MP_SUBFLOW SS_NOFDREF"); close_it = TRUE; } if ((so->so_flags & SOF_DEFUNCT) && tp->t_state > TCPS_FIN_WAIT_1) { + TCP_LOG_DROP_PCB(TCP_LOG_HDR, th, tp, false, "SOF_DEFUNCT"); close_it = TRUE; } @@ -4122,10 +4311,12 @@ close: tcpstat.tcps_badsyn++; /* Drop if we have reached ACK limit */ if (tcp_is_ack_ratelimited(tp)) { + TCP_LOG_DROP_PCB(TCP_LOG_HDR, th, tp, false, "rfc5961 bad SYN rate limited"); goto drop; } else { /* Send challenge ACK */ tcpstat.tcps_synchallenge++; + TCP_LOG_DROP_PCB(TCP_LOG_HDR, th, tp, false, "rfc5961 bad SYN challenge ack"); goto dropafterack; } } else { @@ -4133,6 +4324,7 @@ close: rstreason = BANDLIM_UNLIMITED; postevent(so, 0, EV_RESET); IF_TCP_STATINC(ifp, synwindow); + TCP_LOG_DROP_PCB(TCP_LOG_HDR, th, tp, false, "bad SYN"); goto dropwithreset; } } @@ -4173,8 +4365,10 @@ close: goto step6; } else if (tp->t_flags & TF_ACKNOW) { + TCP_LOG_DROP_PCB(TCP_LOG_HDR, th, tp, false, "bad ACK"); goto dropafterack; } else { + TCP_LOG_DROP_PCB(TCP_LOG_HDR, th, tp, false, "bad ACK"); goto drop; } } @@ -4214,6 +4408,8 @@ close: struct tcpcb *, tp, int32_t, TCPS_FIN_WAIT_1); tp->t_state = TCPS_FIN_WAIT_1; tp->t_flags &= ~TF_NEEDFIN; + + TCP_LOG_CONNECTION_SUMMARY(tp); } else { DTRACE_TCP4(state__change, void, NULL, struct inpcb *, inp, @@ -4237,8 +4433,17 @@ close: * later; if not, do so now to pass queued data to user. */ if (tlen == 0 && (thflags & TH_FIN) == 0) { + if (isipv6) { + memcpy(&saved_hdr, ip6, sizeof(struct ip6_hdr)); + ip6 = (struct ip6_hdr *)&saved_hdr[0]; + } else { + memcpy(&saved_hdr, ip, ip->ip_hl << 2); + ip = (struct ip *)&saved_hdr[0]; + } + memcpy(&saved_tcphdr, th, sizeof(struct tcphdr)); (void) tcp_reass(tp, (struct tcphdr *)0, &tlen, - NULL, ifp); + NULL, ifp, &read_wakeup); + th = &saved_tcphdr; } tp->snd_wl1 = th->th_seq - 1; @@ -4323,6 +4528,7 @@ close: if (SEQ_GT(th->th_ack, tp->snd_max)) { tcpstat.tcps_rcvacktoomuch++; if (tcp_do_rfc5961 && tcp_is_ack_ratelimited(tp)) { + TCP_LOG_DROP_PCB(TCP_LOG_HDR, th, tp, false, "rfc5961 rcvacktoomuch"); goto drop; } else { goto dropafterack; @@ -4330,6 +4536,7 @@ close: } if (tcp_do_rfc5961 && SEQ_LT(th->th_ack, tp->snd_una - tp->max_sndwnd)) { if (tcp_is_ack_ratelimited(tp)) { + TCP_LOG_DROP_PCB(TCP_LOG_HDR, th, tp, false, "rfc5961 bad ACK"); goto drop; } else { goto dropafterack; @@ -4366,10 +4573,6 @@ close: ~TMPF_PREESTABLISHED; tp->t_mpflags |= TMPF_MPTCP_TRUE; - mptcplog((LOG_DEBUG, "MPTCP " - "Sockets: %s \n", __func__), - MPTCP_SOCKET_DBG, - MPTCP_LOGLVL_LOG); tp->t_timer[TCPT_JACK_RXMT] = 0; tp->t_mprxtshift = 0; @@ -4793,6 +4996,12 @@ process_ACK: tp->t_rxtcur); } + if ((prev_t_state == TCPS_SYN_SENT || + prev_t_state == TCPS_SYN_RECEIVED) && + tp->t_state == TCPS_ESTABLISHED) { + TCP_LOG_RTT_INFO(tp); + } + /* * If no data (only SYN) was ACK'd, skip rest of ACK * processing. @@ -4899,12 +5108,7 @@ process_ACK: tcp_bwmeas_check(tp); } - /* - * sowwakeup must happen after snd_una, et al. are - * updated so that the sequence numbers are in sync with - * so_snd - */ - sowwakeup(so); + write_wakeup = 1; if (!SLIST_EMPTY(&tp->t_notify_ack)) { tcp_notify_acknowledgement(tp, so); @@ -5100,6 +5304,7 @@ dodata: */ if (inp->inp_state == INPCB_STATE_DEAD) { /* Just drop the packet that we are processing and return */ + TCP_LOG_DROP_PCB(TCP_LOG_HDR, th, tp, false, "INPCB_STATE_DEAD"); goto drop; } @@ -5175,12 +5380,30 @@ dodata: TCP_AUTORCVBUF_MAX(ifp)); so_recv_data_stat(so, m, drop_hdrlen); + if (isipv6) { + memcpy(&saved_hdr, ip6, sizeof(struct ip6_hdr)); + ip6 = (struct ip6_hdr *)&saved_hdr[0]; + } else { + memcpy(&saved_hdr, ip, ip->ip_hl << 2); + ip = (struct ip *)&saved_hdr[0]; + } + memcpy(&saved_tcphdr, th, sizeof(struct tcphdr)); if (sbappendstream_rcvdemux(so, m, th->th_seq - (tp->irs + 1), 0)) { - sorwakeup(so); + read_wakeup = 1; } + th = &saved_tcphdr; } else { - thflags = tcp_reass(tp, th, &tlen, m, ifp); + if (isipv6) { + memcpy(&saved_hdr, ip6, sizeof(struct ip6_hdr)); + ip6 = (struct ip6_hdr *)&saved_hdr[0]; + } else { + memcpy(&saved_hdr, ip, ip->ip_hl << 2); + ip = (struct ip *)&saved_hdr[0]; + } + memcpy(&saved_tcphdr, th, sizeof(struct tcphdr)); + thflags = tcp_reass(tp, th, &tlen, m, ifp, &read_wakeup); + th = &saved_tcphdr; tp->t_flags |= TF_ACKNOW; } @@ -5309,6 +5532,10 @@ dodata: } #endif + if (read_wakeup) { + mptcp_handle_input(so); + } + /* * Return any desired output. */ @@ -5318,6 +5545,7 @@ dodata: tcp_check_timer_state(tp); + tcp_handle_wakeup(so, read_wakeup, write_wakeup); socket_unlock(so, 1); KERNEL_DEBUG(DBG_FNC_TCP_INPUT | DBG_FUNC_END, 0, 0, 0, 0, 0); @@ -5354,8 +5582,11 @@ dropafterack: #endif m_freem(m); tp->t_flags |= TF_ACKNOW; + (void) tcp_output(tp); + tcp_handle_wakeup(so, read_wakeup, write_wakeup); + /* Don't need to check timer state as we should have done it during tcp_output */ socket_unlock(so, 1); KERNEL_DEBUG(DBG_FNC_TCP_INPUT | DBG_FUNC_END, 0, 0, 0, 0, 0); @@ -5423,6 +5654,8 @@ dropwithreset: (void) soabort(so); socket_unlock(so, 1); } else if ((inp != NULL) && (nosock == 0)) { + tcp_handle_wakeup(so, read_wakeup, write_wakeup); + socket_unlock(so, 1); } KERNEL_DEBUG(DBG_FNC_TCP_INPUT | DBG_FUNC_END, 0, 0, 0, 0, 0); @@ -5445,6 +5678,8 @@ drop: (void) soabort(so); socket_unlock(so, 1); } else if (nosock == 0) { + tcp_handle_wakeup(so, read_wakeup, write_wakeup); + socket_unlock(so, 1); } KERNEL_DEBUG(DBG_FNC_TCP_INPUT | DBG_FUNC_END, 0, 0, 0, 0, 0); @@ -5781,6 +6016,9 @@ tcp_xmit_timer(struct tcpcb *tp, int rtt, u_int32_t tsecr, tcp_seq th_ack) { int delta; + int old_srtt = tp->t_srtt; + int old_rttvar = tp->t_rttvar; + bool log_rtt = false; /* * On AWDL interface, the initial RTT measurement on SYN @@ -5908,6 +6146,12 @@ compute_rto: * and the return path might not be symmetrical). */ tp->t_softerror = 0; + + if (log_rtt) { + TCP_LOG_RTT_INFO(tp); + } + + TCP_LOG_RTT_CHANGE(tp, old_srtt, old_rttvar); } static inline unsigned int @@ -6026,6 +6270,15 @@ tcp_mss(struct tcpcb *tp, int offer, unsigned int input_ifscope) #endif inp = tp->t_inpcb; + + so = inp->inp_socket; + /* + * Nothing left to send after the socket is defunct or TCP is in the closed state + */ + if ((so->so_state & SS_DEFUNCT) || tp->t_state == TCPS_CLOSED) { + return; + } + #if INET6 isipv6 = ((inp->inp_vflag & INP_IPV6) != 0) ? 1 : 0; min_protoh = isipv6 ? sizeof(struct ip6_hdr) + sizeof(struct tcphdr) @@ -6064,7 +6317,6 @@ tcp_mss(struct tcpcb *tp, int offer, unsigned int input_ifscope) ifp->if_baudrate > 9600 && ifp->if_baudrate <= 128000) { tp->t_flags |= TF_SLOWLINK; } - so = inp->inp_socket; taop = rmx_taop(rt->rt_rmx); /* @@ -6663,41 +6915,8 @@ tcp_getstat SYSCTL_HANDLER_ARGS struct tcpstat *stat; stat = &tcpstat; #if !CONFIG_EMBEDDED - proc_t caller = PROC_NULL; - proc_t caller_parent = PROC_NULL; - char command_name[MAXCOMLEN + 1] = ""; - char parent_name[MAXCOMLEN + 1] = ""; struct tcpstat zero_stat; - if ((caller = proc_self()) != PROC_NULL) { - /* get process name */ - strlcpy(command_name, caller->p_comm, sizeof(command_name)); - - /* get parent process name if possible */ - if ((caller_parent = proc_find(caller->p_ppid)) != PROC_NULL) { - strlcpy(parent_name, caller_parent->p_comm, - sizeof(parent_name)); - proc_rele(caller_parent); - } - - if ((escape_str(command_name, strlen(command_name) + 1, - sizeof(command_name)) == 0) && - (escape_str(parent_name, strlen(parent_name) + 1, - sizeof(parent_name)) == 0)) { - kern_asl_msg(LOG_DEBUG, "messagetracer", - 5, - "com.apple.message.domain", - "com.apple.kernel.tcpstat", /* 1 */ - "com.apple.message.signature", - "tcpstat", /* 2 */ - "com.apple.message.signature2", command_name, /* 3 */ - "com.apple.message.signature3", parent_name, /* 4 */ - "com.apple.message.summarize", "YES", /* 5 */ - NULL); - } - } - if (caller != PROC_NULL) { - proc_rele(caller); - } + if (tcp_disable_access_to_stats && !kauth_cred_issuser(kauth_cred_get())) { bzero(&zero_stat, sizeof(zero_stat)); diff --git a/bsd/netinet/tcp_log.c b/bsd/netinet/tcp_log.c new file mode 100644 index 000000000..0fcdb4807 --- /dev/null +++ b/bsd/netinet/tcp_log.c @@ -0,0 +1,938 @@ +/* + * Copyright (c) 2018-2019 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#include +#include +#include +#include + +#include +#if INET6 +#include +#endif /* INET6 */ + +#if !TCPDEBUG +#define TCPSTATES +#endif /* TCPDEBUG */ +#include + +#include + +SYSCTL_NODE(_net_inet_tcp, OID_AUTO, log, CTLFLAG_RW | CTLFLAG_LOCKED, 0, + "TCP logs"); + +static int tcp_log_level_info = 0; +SYSCTL_INT(_net_inet_tcp_log, OID_AUTO, level_info, + CTLFLAG_RW | CTLFLAG_LOCKED, &tcp_log_level_info, 0, ""); + +#if (DEVELOPMENT || DEBUG) +#if defined(XNU_TARGET_OS_OSX) +/* + * Log less on macOS as sockets are more prevalent than channels + */ +#define TCP_LOG_ENABLE_DEFAULT \ + (TLEF_CONNECTION | TLEF_DST_LOCAL | TLEF_DST_GW | \ + TLEF_DROP_NECP) +#else /* XNU_TARGET_OS_OSX */ +#define TCP_LOG_ENABLE_DEFAULT \ + (TLEF_CONNECTION | TLEF_DST_LOCAL | TLEF_DST_GW | \ + TLEF_DROP_NECP | TLEF_DROP_PCB | TLEF_DROP_PKT | TLEF_THF_SYN) +#endif /* XNU_TARGET_OS_OSX */ +#else /* (DEVELOPMENT || DEBUG) */ +#define TCP_LOG_ENABLE_DEFAULT 0 +#endif /* (DEVELOPMENT || DEBUG) */ + +uint32_t tcp_log_enable_flags = TCP_LOG_ENABLE_DEFAULT; +SYSCTL_UINT(_net_inet_tcp_log, OID_AUTO, enable, + CTLFLAG_RW | CTLFLAG_LOCKED, &tcp_log_enable_flags, 0, ""); + +/* + * The following is a help to describe the values of the flags + */ +#define X(name, value, description, ...) #description ":" #value " " +SYSCTL_STRING(_net_inet_tcp_log, OID_AUTO, enable_usage, CTLFLAG_RD | CTLFLAG_LOCKED, + TCP_ENABLE_FLAG_LIST, 0, ""); +#undef X + +/* + * Values for tcp_log_port when TLEF_RTT is enabled: + * 0: log all TCP connections regardless of the port numbers + * 1 to 65535: log TCP connections with this local or foreign port + * other: do not log (same effect as as tcp_log_rtt == 0) + */ +uint32_t tcp_log_port = 0; +SYSCTL_UINT(_net_inet_tcp_log, OID_AUTO, rtt_port, CTLFLAG_RW | CTLFLAG_LOCKED, + &tcp_log_port, 0, ""); + +/* + * Values for tcp_log_thflags_if_family when TLEF_THF_XXX is enabled: + * 0: all interfaces + * other: only for interfaces with the corresponding interface functional type + */ +#if (DEVELOPMENT || DEBUG) +#define TCP_LOG_THFLAGS_IF_FAMILY_DEFAULT IFNET_FAMILY_IPSEC +#else /* (DEVELOPMENT || DEBUG) */ +#define TCP_LOG_THFLAGS_IF_FAMILY_DEFAULT 0 +#endif /* (DEVELOPMENT || DEBUG) */ + +static uint32_t tcp_log_thflags_if_family = TCP_LOG_THFLAGS_IF_FAMILY_DEFAULT; +SYSCTL_UINT(_net_inet_tcp_log, OID_AUTO, thflags_if_family, + CTLFLAG_RW | CTLFLAG_LOCKED, &tcp_log_thflags_if_family, 0, ""); + +#if (DEVELOPMENT || DEBUG) +#define TCP_LOG_PRIVACY_DEFAULT 0 +#else +#define TCP_LOG_PRIVACY_DEFAULT 1 +#endif /* (DEVELOPMENT || DEBUG) */ + +int tcp_log_privacy = TCP_LOG_PRIVACY_DEFAULT; +SYSCTL_INT(_net_inet_tcp_log, OID_AUTO, privacy, + CTLFLAG_RW | CTLFLAG_LOCKED, &tcp_log_privacy, 0, ""); + +#define TCP_LOG_RATE_LIMIT 600 +static unsigned int tcp_log_rate_limit = TCP_LOG_RATE_LIMIT; +SYSCTL_UINT(_net_inet_tcp_log, OID_AUTO, rate_limit, + CTLFLAG_RW | CTLFLAG_LOCKED, &tcp_log_rate_limit, 0, ""); + +/* 1 minute by default */ +#define TCP_LOG_RATE_DURATION 60 +static unsigned int tcp_log_rate_duration = TCP_LOG_RATE_DURATION; +SYSCTL_UINT(_net_inet_tcp_log, OID_AUTO, rate_duration, + CTLFLAG_RW | CTLFLAG_LOCKED, &tcp_log_rate_duration, 0, ""); + +static unsigned long tcp_log_rate_max = 0; +SYSCTL_ULONG(_net_inet_tcp_log, OID_AUTO, rate_max, + CTLFLAG_RD | CTLFLAG_LOCKED, &tcp_log_rate_max, ""); + +static unsigned long tcp_log_rate_exceeded_total = 0; +SYSCTL_ULONG(_net_inet_tcp_log, OID_AUTO, rate_exceeded_total, + CTLFLAG_RD | CTLFLAG_LOCKED, &tcp_log_rate_exceeded_total, ""); + +static unsigned long tcp_log_rate_current = 0; +SYSCTL_ULONG(_net_inet_tcp_log, OID_AUTO, rate_current, + CTLFLAG_RD | CTLFLAG_LOCKED, &tcp_log_rate_current, ""); + +static bool tcp_log_rate_exceeded_logged = false; + +static uint64_t tcp_log_current_period = 0; + +#define ADDRESS_STR_LEN (MAX_IPv6_STR_LEN + 6) + +#define TCP_LOG_COMMON_FMT \ + "[%s:%u<->%s:%u] " \ + "interface: %s " \ + "(skipped: %lu)\n" + +#define TCP_LOG_COMMON_ARGS \ + laddr_buf, ntohs(local_port), faddr_buf, ntohs(foreign_port), \ + ifp != NULL ? if_name(ifp) : "", \ + tcp_log_rate_exceeded_total + +#define TCP_LOG_COMMON_PCB_FMT \ + TCP_LOG_COMMON_FMT \ + "t_state: %s " \ + "process: %s:%u " + +#define TCP_LOG_COMMON_PCB_ARGS \ + TCP_LOG_COMMON_ARGS, \ + tcpstates[tp->t_state], \ + inp->inp_last_proc_name, so->last_pid + +/* + * Returns true when above the rate limit + */ +static bool +tcp_log_is_rate_limited(void) +{ + uint64_t current_net_period = net_uptime(); + + /* When set to zero it means to reset to default */ + if (tcp_log_rate_duration == 0) { + tcp_log_rate_duration = TCP_LOG_RATE_DURATION; + } + if (tcp_log_rate_limit == 0) { + tcp_log_rate_duration = TCP_LOG_RATE_LIMIT; + } + + if (current_net_period > tcp_log_current_period + tcp_log_rate_duration) { + if (tcp_log_rate_current > tcp_log_rate_max) { + tcp_log_rate_max = tcp_log_rate_current; + } + tcp_log_current_period = current_net_period; + tcp_log_rate_current = 0; + tcp_log_rate_exceeded_logged = false; + } + + tcp_log_rate_current += 1; + + if (tcp_log_rate_current > (unsigned long) tcp_log_rate_limit) { + tcp_log_rate_exceeded_total += 1; + return true; + } + + return false; +} + +static void +tcp_log_inp_addresses(struct inpcb *inp, char *lbuf, size_t lbuflen, char *fbuf, size_t fbuflen) +{ + /* + * Ugly but %{private} does not work in the kernel version of os_log() + */ + if (tcp_log_privacy != 0) { + if (inp->inp_vflag & INP_IPV6) { + strlcpy(lbuf, "", lbuflen); + strlcpy(fbuf, "", fbuflen); + } else { + strlcpy(lbuf, "", lbuflen); + strlcpy(fbuf, "", fbuflen); + } + } else if (inp->inp_vflag & INP_IPV6) { + inet_ntop(AF_INET6, (void *)&inp->in6p_laddr, lbuf, lbuflen); + inet_ntop(AF_INET6, (void *)&inp->in6p_faddr, fbuf, fbuflen); + } else { + inet_ntop(AF_INET, (void *)&inp->inp_laddr.s_addr, lbuf, lbuflen); + inet_ntop(AF_INET, (void *)&inp->inp_faddr.s_addr, fbuf, fbuflen); + } +} + +void +tcp_log_rtt_info(const char *func_name, int line_no, struct tcpcb *tp) +{ + struct inpcb *inp = tp->t_inpcb; + struct socket *so = inp->inp_socket; + struct ifnet *ifp; + char laddr_buf[ADDRESS_STR_LEN]; + char faddr_buf[ADDRESS_STR_LEN]; + in_port_t local_port = inp->inp_lport; + in_port_t foreign_port = inp->inp_fport; + + /* Do not log too much */ + if (tcp_log_is_rate_limited()) { + return; + } + + ifp = inp->inp_last_outifp != NULL ? inp->inp_last_outifp : + inp->inp_boundifp != NULL ? inp->inp_boundifp : NULL; + + tcp_log_inp_addresses(inp, laddr_buf, sizeof(laddr_buf), faddr_buf, sizeof(faddr_buf)); + + os_log(OS_LOG_DEFAULT, + "tcp_rtt_info (%s:%d) " + TCP_LOG_COMMON_PCB_FMT + "rttcur: %u ms srtt: %u ms rttvar: %u ms rttmin: %u ms rxtcur: %u rxtshift: %u", + func_name, line_no, + TCP_LOG_COMMON_PCB_ARGS, + tp->t_rttcur, tp->t_srtt >> TCP_RTT_SHIFT, + tp->t_rttvar >> TCP_RTTVAR_SHIFT, + tp->t_rttmin, tp->t_rxtcur, tp->t_rxtshift); +} + +void +tcp_log_rt_rtt(const char *func_name, int line_no, struct tcpcb *tp, + struct rtentry *rt) +{ + struct inpcb *inp = tp->t_inpcb; + struct socket *so = inp->inp_socket; + struct ifnet *ifp; + char laddr_buf[ADDRESS_STR_LEN]; + char faddr_buf[ADDRESS_STR_LEN]; + in_port_t local_port = inp->inp_lport; + in_port_t foreign_port = inp->inp_fport; + + /* Do not log too much */ + if (tcp_log_is_rate_limited()) { + return; + } + + ifp = inp->inp_last_outifp != NULL ? inp->inp_last_outifp : + inp->inp_boundifp != NULL ? inp->inp_boundifp : NULL; + + tcp_log_inp_addresses(inp, laddr_buf, sizeof(laddr_buf), faddr_buf, sizeof(faddr_buf)); + + /* + * Log RTT values in milliseconds + */ + os_log(OS_LOG_DEFAULT, + "tcp_rt_rtt (%s:%d) " + TCP_LOG_COMMON_PCB_FMT + "rt_rmx: RTV_RTT: %d ms rtt: %u ms rttvar: %u ms", + func_name, line_no, + TCP_LOG_COMMON_PCB_ARGS, + (rt->rt_rmx.rmx_locks & RTV_RTT), + rt->rt_rmx.rmx_rtt / (RTM_RTTUNIT / TCP_RETRANSHZ), + rt->rt_rmx.rmx_rttvar / (RTM_RTTUNIT / TCP_RETRANSHZ)); +} + +void +tcp_log_rtt_change(const char *func_name, int line_no, struct tcpcb *tp, + int old_srtt, int old_rttvar) +{ + int srtt_diff; + int rttvar_diff; + + srtt_diff = ABS(tp->t_srtt - old_srtt) >> TCP_RTT_SHIFT; + rttvar_diff = + ABS((tp->t_rttvar - old_rttvar) >> TCP_RTTVAR_SHIFT); + if (srtt_diff >= 1000 || rttvar_diff >= 500) { + struct inpcb *inp = tp->t_inpcb; + struct socket *so = inp->inp_socket; + struct ifnet *ifp; + char laddr_buf[ADDRESS_STR_LEN]; + char faddr_buf[ADDRESS_STR_LEN]; + in_port_t local_port = inp->inp_lport; + in_port_t foreign_port = inp->inp_fport; + + /* Do not log too much */ + if (tcp_log_is_rate_limited()) { + return; + } + + ifp = inp->inp_last_outifp != NULL ? inp->inp_last_outifp : + inp->inp_boundifp != NULL ? inp->inp_boundifp : NULL; + + tcp_log_inp_addresses(inp, laddr_buf, sizeof(laddr_buf), faddr_buf, sizeof(faddr_buf)); + + os_log(OS_LOG_DEFAULT, + "tcp_rtt_change (%s:%d) " + TCP_LOG_COMMON_PCB_FMT + "srtt: %u ms old_rtt: %u ms " + "rttvar: %u old_rttvar: %u ms ", + func_name, line_no, + TCP_LOG_COMMON_PCB_ARGS, + tp->t_srtt >> TCP_RTT_SHIFT, + old_srtt >> TCP_RTT_SHIFT, + tp->t_rttvar >> TCP_RTTVAR_SHIFT, + old_rttvar >> TCP_RTTVAR_SHIFT); + } +} + +void +tcp_log_keepalive(const char *func_name, int line_no, struct tcpcb *tp, + int32_t idle_time) +{ + struct inpcb *inp = tp->t_inpcb; + struct socket *so = inp->inp_socket; + struct ifnet *ifp; + char laddr_buf[ADDRESS_STR_LEN]; + char faddr_buf[ADDRESS_STR_LEN]; + in_port_t local_port = inp->inp_lport; + in_port_t foreign_port = inp->inp_fport; + + /* Do not log too much */ + if (tcp_log_is_rate_limited()) { + return; + } + + ifp = inp->inp_last_outifp != NULL ? inp->inp_last_outifp : + inp->inp_boundifp != NULL ? inp->inp_boundifp : NULL; + + tcp_log_inp_addresses(inp, laddr_buf, sizeof(laddr_buf), faddr_buf, sizeof(faddr_buf)); + + os_log(OS_LOG_DEFAULT, + "tcp_keepalive (%s:%d) " + TCP_LOG_COMMON_PCB_FMT + "snd_una: %u snd_max: %u " + "SO_KA: %d RSTALL: %d TFOPRB: %d idle_time: %u " + "KIDLE: %d KINTV: %d KCNT: %d", + func_name, line_no, + TCP_LOG_COMMON_PCB_ARGS, + tp->snd_una, tp->snd_max, + tp->t_inpcb->inp_socket->so_options & SO_KEEPALIVE, + tp->t_flagsext & TF_DETECT_READSTALL, + tp->t_tfo_probe_state == TFO_PROBE_PROBING, + idle_time, + TCP_CONN_KEEPIDLE(tp), TCP_CONN_KEEPINTVL(tp), + TCP_CONN_KEEPCNT(tp)); +} + + +void +tcp_log_connection(struct tcpcb *tp, const char *event, int error) +{ + struct inpcb *inp; + struct socket *so; + struct ifnet *ifp; + char laddr_buf[ADDRESS_STR_LEN]; + char faddr_buf[ADDRESS_STR_LEN]; + in_port_t local_port; + in_port_t foreign_port; + + if (tp == NULL || tp->t_inpcb == NULL || tp->t_inpcb->inp_socket == NULL || event == NULL) { + return; + } + + /* Do not log too much */ + if (tcp_log_is_rate_limited()) { + return; + } + inp = tp->t_inpcb; + so = inp->inp_socket; + + local_port = inp->inp_lport; + foreign_port = inp->inp_fport; + + ifp = inp->inp_last_outifp != NULL ? inp->inp_last_outifp : + inp->inp_boundifp != NULL ? inp->inp_boundifp : NULL; + + tcp_log_inp_addresses(inp, laddr_buf, sizeof(laddr_buf), faddr_buf, sizeof(faddr_buf)); + +#define TCP_LOG_CONNECT_FMT \ + "tcp %s: " \ + TCP_LOG_COMMON_PCB_FMT \ + "rtt: %u.%u ms " \ + "rttvar: %u.%u ms " \ + "error: %d " \ + "so_error: %d " \ + "svc/tc: %u" + +#define TCP_LOG_CONNECT_ARGS \ + event, \ + TCP_LOG_COMMON_PCB_ARGS, \ + tp->t_srtt >> TCP_RTT_SHIFT, tp->t_srtt - ((tp->t_srtt >> TCP_RTT_SHIFT) << TCP_RTT_SHIFT), \ + tp->t_rttvar >> TCP_RTTVAR_SHIFT, tp->t_rttvar - ((tp->t_rttvar >> TCP_RTTVAR_SHIFT) << TCP_RTTVAR_SHIFT), \ + error, \ + so->so_error, \ + (so->so_flags1 & SOF1_TC_NET_SERV_TYPE) ? so->so_netsvctype : so->so_traffic_class + + if (so->so_head == NULL) { + if (tcp_log_level_info == 0) { + os_log(OS_LOG_DEFAULT, TCP_LOG_CONNECT_FMT, + TCP_LOG_CONNECT_ARGS); + } else { + os_log_info(OS_LOG_DEFAULT, TCP_LOG_CONNECT_FMT, + TCP_LOG_CONNECT_ARGS); + } + } else { +#define TCP_LOG_CONN_Q_FMT \ + "so_qlimit: %d "\ + "so_qlen: %d "\ + "so_incqlen: %d " + +#define TCP_LOG_CONN_Q_ARGS \ + so->so_head->so_qlimit, \ + so->so_head->so_qlen, \ + so->so_head->so_incqlen + + if (tcp_log_level_info == 0) { + os_log(OS_LOG_DEFAULT, TCP_LOG_CONNECT_FMT "\n" TCP_LOG_CONN_Q_FMT, + TCP_LOG_CONNECT_ARGS, TCP_LOG_CONN_Q_ARGS); + } else { + os_log_info(OS_LOG_DEFAULT, TCP_LOG_CONNECT_FMT "\n" TCP_LOG_CONN_Q_FMT, + TCP_LOG_CONNECT_ARGS, TCP_LOG_CONN_Q_ARGS); + } +#undef TCP_LOG_CONN_Q_FMT +#undef TCP_LOG_CONN_Q_ARGS + } +#undef TCP_LOG_CONNECT_FMT +#undef TCP_LOG_CONNECT_ARGS +} + +void +tcp_log_listen(struct tcpcb *tp, int error) +{ + struct inpcb *inp = tp->t_inpcb; + struct socket *so = inp->inp_socket; + struct ifnet *ifp; + char laddr_buf[ADDRESS_STR_LEN]; + char faddr_buf[ADDRESS_STR_LEN]; + in_port_t local_port; + in_port_t foreign_port; + + if (tp == NULL || tp->t_inpcb == NULL || tp->t_inpcb->inp_socket == NULL) { + return; + } + + /* Do not log too much */ + if (tcp_log_is_rate_limited()) { + return; + } + inp = tp->t_inpcb; + so = inp->inp_socket; + + local_port = inp->inp_lport; + foreign_port = inp->inp_fport; + + ifp = inp->inp_last_outifp != NULL ? inp->inp_last_outifp : + inp->inp_boundifp != NULL ? inp->inp_boundifp : NULL; + + tcp_log_inp_addresses(inp, laddr_buf, sizeof(laddr_buf), faddr_buf, sizeof(faddr_buf)); + +#define TCP_LOG_LISTEN_FMT \ + "tcp listen: " \ + TCP_LOG_COMMON_PCB_FMT \ + "so_qlimit: %d "\ + "error: %d " \ + "so_error: %d " \ + "svc/tc: %u" + +#define TCP_LOG_LISTEN_ARGS \ + TCP_LOG_COMMON_PCB_ARGS, \ + so->so_qlimit, \ + error, \ + so->so_error, \ + (so->so_flags1 & SOF1_TC_NET_SERV_TYPE) ? so->so_netsvctype : so->so_traffic_class + + if (tcp_log_level_info == 0) { + os_log(OS_LOG_DEFAULT, TCP_LOG_LISTEN_FMT, + TCP_LOG_LISTEN_ARGS); + } else { + os_log_info(OS_LOG_DEFAULT, TCP_LOG_LISTEN_FMT, + TCP_LOG_LISTEN_ARGS); + } +#undef TCP_LOG_LISTEN_FMT +#undef TCP_LOG_LISTEN_ARGS +} + +void +tcp_log_connection_summary(struct tcpcb *tp) +{ + struct inpcb *inp; + struct socket *so; + struct ifnet *ifp; + uint32_t conntime = 0; + uint32_t duration = 0; + char laddr_buf[ADDRESS_STR_LEN]; + char faddr_buf[ADDRESS_STR_LEN]; + in_port_t local_port; + in_port_t foreign_port; + + if (tp == NULL || tp->t_inpcb == NULL || tp->t_inpcb->inp_socket == NULL) { + return; + } + + /* Do not log too much */ + if (tcp_log_is_rate_limited()) { + return; + } + inp = tp->t_inpcb; + so = inp->inp_socket; + + local_port = inp->inp_lport; + foreign_port = inp->inp_fport; + + /* Make sure the summary is logged once */ + if (tp->t_flagsext & TF_LOGGED_CONN_SUMMARY) { + return; + } + tp->t_flagsext |= TF_LOGGED_CONN_SUMMARY; + + /* + * t_connect_time is the time when the connection started on + * the first SYN. + * + * t_starttime is when the three way handshake was completed. + */ + if (tp->t_connect_time > 0) { + duration = tcp_now - tp->t_connect_time; + + if (tp->t_starttime > 0) { + conntime = tp->t_starttime - tp->t_connect_time; + } + } + + ifp = inp->inp_last_outifp != NULL ? inp->inp_last_outifp : + inp->inp_boundifp != NULL ? inp->inp_boundifp : NULL; + + tcp_log_inp_addresses(inp, laddr_buf, sizeof(laddr_buf), faddr_buf, sizeof(faddr_buf)); + +#define TCP_LOG_CONNECTION_SUMMARY_FMT \ + "tcp_connection_summary " \ + TCP_LOG_COMMON_PCB_FMT \ + "Duration: %u.%u sec " \ + "Conn_Time: %u.%u sec " \ + "syn rxmit: %u\n" \ + "bytes in/out: %llu/%llu " \ + "pkts in/out: %llu/%llu " \ + "rtt: %u.%u ms " \ + "rttvar: %u.%u ms " \ + "pkt rxmit: %u " \ + "ooo pkts: %u dup bytes in: %u " \ + "so_error: %d " \ + "svc/tc: %u" + +#define TCP_LOG_CONNECTION_SUMMARY_ARGS \ + TCP_LOG_COMMON_PCB_ARGS, \ + duration / TCP_RETRANSHZ, duration % TCP_RETRANSHZ, \ + conntime / TCP_RETRANSHZ, conntime % TCP_RETRANSHZ, \ + tp->t_stat.synrxtshift, \ + inp->inp_stat->rxbytes, inp->inp_stat->txbytes, \ + inp->inp_stat->rxpackets, inp->inp_stat->txpackets, \ + tp->t_srtt >> TCP_RTT_SHIFT, tp->t_srtt - ((tp->t_srtt >> TCP_RTT_SHIFT) << TCP_RTT_SHIFT), \ + tp->t_rttvar >> TCP_RTTVAR_SHIFT, tp->t_rttvar - ((tp->t_rttvar >> TCP_RTTVAR_SHIFT) << TCP_RTTVAR_SHIFT), \ + tp->t_stat.rxmitpkts, \ + tp->t_rcvoopack, tp->t_stat.rxduplicatebytes, \ + so->so_error, \ + (so->so_flags1 & SOF1_TC_NET_SERV_TYPE) ? so->so_netsvctype : so->so_traffic_class + + if (tcp_log_level_info == 0) { + os_log(OS_LOG_DEFAULT, TCP_LOG_CONNECTION_SUMMARY_FMT, + TCP_LOG_CONNECTION_SUMMARY_ARGS); + } else { + os_log_info(OS_LOG_DEFAULT, TCP_LOG_CONNECTION_SUMMARY_FMT, + TCP_LOG_CONNECTION_SUMMARY_ARGS); + } +#undef TCP_LOG_CONNECTION_SUMMARY_FMT +#undef TCP_LOG_CONNECTION_SUMMARY_ARGS +} + +static bool +tcp_log_pkt_addresses(void *hdr, struct tcphdr *th, bool outgoing, + char *lbuf, size_t lbuflen, char *fbuf, size_t fbuflen) +{ + bool isipv6; + uint8_t thflags; + + isipv6 = (((struct ip *)hdr)->ip_v == 6); + thflags = th->th_flags; + + if (isipv6) { + struct ip6_hdr *ip6 = (struct ip6_hdr *)hdr; + + if (memcmp(&ip6->ip6_src, &in6addr_loopback, sizeof(struct in6_addr)) == 0 || + memcmp(&ip6->ip6_dst, &in6addr_loopback, sizeof(struct in6_addr)) == 0) { + if (!(tcp_log_enable_flags & TLEF_DST_LOOPBACK)) { + return false; + } + } + + if (tcp_log_privacy != 0) { + strlcpy(lbuf, "", lbuflen); + strlcpy(fbuf, "", fbuflen); + } else if (outgoing) { + inet_ntop(AF_INET6, &ip6->ip6_src, lbuf, lbuflen); + inet_ntop(AF_INET6, &ip6->ip6_dst, fbuf, fbuflen); + } else { + inet_ntop(AF_INET6, &ip6->ip6_dst, lbuf, lbuflen); + inet_ntop(AF_INET6, &ip6->ip6_src, fbuf, fbuflen); + } + } else { + struct ip *ip = (struct ip *)hdr; + + if (ntohl(ip->ip_src.s_addr) == INADDR_LOOPBACK || + ntohl(ip->ip_dst.s_addr) == INADDR_LOOPBACK) { + if (!(tcp_log_enable_flags & TLEF_DST_LOOPBACK)) { + return false; + } + } + + if (tcp_log_privacy != 0) { + strlcpy(lbuf, "", lbuflen); + strlcpy(fbuf, "", fbuflen); + } else if (outgoing) { + inet_ntop(AF_INET, (void *)&ip->ip_src.s_addr, lbuf, lbuflen); + inet_ntop(AF_INET, (void *)&ip->ip_dst.s_addr, fbuf, fbuflen); + } else { + inet_ntop(AF_INET, (void *)&ip->ip_dst.s_addr, lbuf, lbuflen); + inet_ntop(AF_INET, (void *)&ip->ip_src.s_addr, fbuf, fbuflen); + } + } + return true; +} + +/* + * Note: currently only used in the input path + */ +void +tcp_log_drop_pcb(void *hdr, struct tcphdr *th, struct tcpcb *tp, bool outgoing, const char *reason) +{ + struct inpcb *inp = tp->t_inpcb; + struct socket *so = inp->inp_socket; + struct ifnet *ifp; + char laddr_buf[ADDRESS_STR_LEN]; + char faddr_buf[ADDRESS_STR_LEN]; + in_port_t local_port; + in_port_t foreign_port; + + if (tp == NULL) { + return; + } + + /* Do not log too much */ + if (tcp_log_is_rate_limited()) { + return; + } + + /* Use the packet addresses when in the data path */ + if (hdr != NULL && th != NULL) { + if (outgoing) { + local_port = th->th_sport; + foreign_port = th->th_dport; + } else { + local_port = th->th_dport; + foreign_port = th->th_sport; + } + (void) tcp_log_pkt_addresses(hdr, th, outgoing, laddr_buf, sizeof(laddr_buf), faddr_buf, sizeof(faddr_buf)); + } else { + local_port = inp->inp_lport; + foreign_port = inp->inp_fport; + tcp_log_inp_addresses(inp, laddr_buf, sizeof(laddr_buf), faddr_buf, sizeof(faddr_buf)); + } + + ifp = inp->inp_last_outifp != NULL ? inp->inp_last_outifp : + inp->inp_boundifp != NULL ? inp->inp_boundifp : NULL; + +#define TCP_LOG_DROP_PCB_FMT \ + "tcp drop %s " \ + TCP_LOG_COMMON_PCB_FMT \ + "t_state: %s " \ + "so_error: %d " \ + "reason: %s" + +#define TCP_LOG_DROP_PCB_ARGS \ + outgoing ? "outgoing" : "incoming", \ + TCP_LOG_COMMON_PCB_ARGS, \ + tcpstates[tp->t_state], \ + so->so_error, \ + reason + + if (tcp_log_level_info == 0) { + os_log(OS_LOG_DEFAULT, TCP_LOG_DROP_PCB_FMT, + TCP_LOG_DROP_PCB_ARGS); + } else { + os_log_info(OS_LOG_DEFAULT, TCP_LOG_DROP_PCB_FMT, + TCP_LOG_DROP_PCB_ARGS); + } +#undef TCP_LOG_DROP_PCB_FMT +#undef TCP_LOG_DROP_PCB_ARGS +} + +#define TCP_LOG_TH_FLAGS_COMMON_FMT \ + "tcp control %s " \ + "%s" \ + "%s" \ + "%s" \ + "%s" \ + TCP_LOG_COMMON_FMT + +#define TCP_LOG_TH_FLAGS_COMMON_ARGS \ + outgoing ? "outgoing" : "incoming", \ + thflags & TH_SYN ? "SYN " : "", \ + thflags & TH_FIN ? "FIN " : "", \ + thflags & TH_RST ? "RST " : "", \ + thflags & TH_ACK ? "ACK " : "", \ + TCP_LOG_COMMON_ARGS + +void +tcp_log_th_flags(void *hdr, struct tcphdr *th, struct tcpcb *tp, bool outgoing, struct ifnet *ifp) +{ + struct socket *so = tp->t_inpcb != NULL ? tp->t_inpcb->inp_socket : NULL; + char laddr_buf[ADDRESS_STR_LEN]; + char faddr_buf[ADDRESS_STR_LEN]; + in_port_t local_port; + in_port_t foreign_port; + uint8_t thflags; + + if (hdr == NULL || th == NULL) { + return; + } + + if (outgoing) { + local_port = th->th_sport; + foreign_port = th->th_dport; + } else { + local_port = th->th_dport; + foreign_port = th->th_sport; + } + thflags = th->th_flags; + + if ((((thflags & TH_SYN) && (tcp_log_enable_flags & TLEF_THF_SYN)) || + ((thflags & TH_FIN) && (tcp_log_enable_flags & TLEF_THF_FIN)) || + ((thflags & TH_RST) && (tcp_log_enable_flags & TLEF_THF_RST))) == false) { + return; + } + + if (ifp != NULL && tcp_log_thflags_if_family != 0 && ifp->if_family != tcp_log_thflags_if_family) { + return; + } + + if (!tcp_log_pkt_addresses(hdr, th, outgoing, laddr_buf, sizeof(laddr_buf), faddr_buf, sizeof(faddr_buf))) { + return; + } + + /* Do not log too much */ + if (tcp_log_is_rate_limited()) { + return; + } + + /* + * When no PCB or socket just log the packet + */ + if (tp == NULL || so == NULL) { + if (tcp_log_level_info == 0) { + os_log(OS_LOG_DEFAULT, TCP_LOG_TH_FLAGS_COMMON_FMT " no pcb", + TCP_LOG_TH_FLAGS_COMMON_ARGS); + } else { + os_log_info(OS_LOG_DEFAULT, TCP_LOG_TH_FLAGS_COMMON_FMT, + TCP_LOG_TH_FLAGS_COMMON_ARGS); + } + } else { +#define TCP_LOG_TH_FLAGS_PCB_FMT \ + TCP_LOG_TH_FLAGS_COMMON_FMT \ + "rtt: %u.%u ms " \ + "rttvar: %u.%u ms " \ + "syn rxmit: %u " \ + "pkt rxmit: %u " \ + "so_error: %d " \ + "svc/tc: %u " + +#define TCP_LOG_TH_FLAGS_PCB_ARGS \ + TCP_LOG_TH_FLAGS_COMMON_ARGS, \ + tp->t_srtt >> TCP_RTT_SHIFT, tp->t_srtt - ((tp->t_srtt >> TCP_RTT_SHIFT) << TCP_RTT_SHIFT), \ + tp->t_rttvar >> TCP_RTTVAR_SHIFT, tp->t_rttvar - ((tp->t_rttvar >> TCP_RTTVAR_SHIFT) << TCP_RTTVAR_SHIFT), \ + tp->t_stat.synrxtshift, \ + tp->t_stat.rxmitpkts, \ + so->so_error, \ + (so->so_flags1 & SOF1_TC_NET_SERV_TYPE) ? \ + so->so_netsvctype : so->so_traffic_class + + if (tcp_log_level_info == 0) { + os_log(OS_LOG_DEFAULT, TCP_LOG_TH_FLAGS_PCB_FMT, + TCP_LOG_TH_FLAGS_PCB_ARGS); + } else { + os_log_info(OS_LOG_DEFAULT, TCP_LOG_TH_FLAGS_PCB_FMT, + TCP_LOG_TH_FLAGS_PCB_ARGS); + } +#undef TCP_LOG_TH_FLAGS_PCB_FMT +#undef TCP_LOG_TH_FLAGS_PCB_ARGS + } +} + +void +tcp_log_drop_pkt(void *hdr, struct tcphdr *th, struct ifnet *ifp, const char *reason) +{ + char laddr_buf[ADDRESS_STR_LEN]; + char faddr_buf[ADDRESS_STR_LEN]; + in_port_t local_port; + in_port_t foreign_port; + uint8_t thflags; + bool outgoing = false; /* This is only for incoming packets */ + + if (hdr == NULL || th == NULL) { + return; + } + + local_port = th->th_dport; + foreign_port = th->th_sport; + thflags = th->th_flags; + + if ((((thflags & TH_SYN) && (tcp_log_enable_flags & TLEF_THF_SYN)) || + ((thflags & TH_FIN) && (tcp_log_enable_flags & TLEF_THF_FIN)) || + ((thflags & TH_RST) && (tcp_log_enable_flags & TLEF_THF_RST))) == false) { + return; + } + + if (ifp != NULL && tcp_log_thflags_if_family != 0 && ifp->if_family != tcp_log_thflags_if_family) { + return; + } + + if (!tcp_log_pkt_addresses(hdr, th, outgoing, laddr_buf, sizeof(laddr_buf), faddr_buf, sizeof(faddr_buf))) { + return; + } + + /* Do not log too much */ + if (tcp_log_is_rate_limited()) { + return; + } + +#define TCP_LOG_DROP_PKT_FMT \ + "tcp drop incoming control packet " \ + TCP_LOG_TH_FLAGS_COMMON_FMT \ + "reason: %s" + +#define TCP_LOG_DROP_PKT_ARGS \ + TCP_LOG_TH_FLAGS_COMMON_ARGS, \ + reason != NULL ? reason : "" + + if (tcp_log_level_info == 0) { + os_log(OS_LOG_DEFAULT, TCP_LOG_DROP_PKT_FMT, + TCP_LOG_DROP_PKT_ARGS); + } else { + os_log_info(OS_LOG_DEFAULT, TCP_LOG_DROP_PKT_FMT, + TCP_LOG_DROP_PKT_ARGS); + } +#undef TCP_LOG_DROP_PKT_FMT +#undef TCP_LOG_DROP_PKT_ARGS +} + +void +tcp_log_message(const char *func_name, int line_no, struct tcpcb *tp, const char *format, ...) +{ + struct inpcb *inp; + struct socket *so; + struct ifnet *ifp; + char laddr_buf[ADDRESS_STR_LEN]; + char faddr_buf[ADDRESS_STR_LEN]; + in_port_t local_port; + in_port_t foreign_port; + char message[256]; + + if (tp == NULL || tp->t_inpcb == NULL || tp->t_inpcb->inp_socket == NULL) { + return; + } + + /* Do not log too much */ + if (tcp_log_is_rate_limited()) { + return; + } + inp = tp->t_inpcb; + so = inp->inp_socket; + + local_port = inp->inp_lport; + foreign_port = inp->inp_fport; + + ifp = inp->inp_last_outifp != NULL ? inp->inp_last_outifp : + inp->inp_boundifp != NULL ? inp->inp_boundifp : NULL; + + tcp_log_inp_addresses(inp, laddr_buf, sizeof(laddr_buf), faddr_buf, sizeof(faddr_buf)); + + va_list ap; + va_start(ap, format); + vsnprintf(message, sizeof(message), format, ap); + va_end(ap); + +#define TCP_LOG_MESSAGE_FMT \ + "tcp (%s:%d) " \ + TCP_LOG_COMMON_PCB_FMT \ + "%s" + +#define TCP_LOG_MESSAGE_ARGS \ + func_name, line_no, \ + TCP_LOG_COMMON_PCB_ARGS, \ + message + + if (tcp_log_level_info == 0) { + os_log(OS_LOG_DEFAULT, TCP_LOG_MESSAGE_FMT, + TCP_LOG_MESSAGE_ARGS); + } else { + os_log_info(OS_LOG_DEFAULT, TCP_LOG_MESSAGE_FMT, + TCP_LOG_MESSAGE_ARGS); + } +#undef TCP_LOG_MESSAGE_FMT +#undef TCP_LOG_MESSAGE_ARGS +} diff --git a/bsd/netinet/tcp_log.h b/bsd/netinet/tcp_log.h new file mode 100644 index 000000000..040948f92 --- /dev/null +++ b/bsd/netinet/tcp_log.h @@ -0,0 +1,191 @@ +/* + * Copyright (c) 2018-2019 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + + +#ifndef _NETINET_TCP_LOG_H_ +#define _NETINET_TCP_LOG_H_ + +#ifdef BSD_KERNEL_PRIVATE + +#include +#include + +#include +#include +#include +#if INET6 +#include +#endif + +#include +#include +#if INET6 +#include +#endif + +#include + +#include + +#include + +extern os_log_t tcp_mpkl_log_object; +extern uint32_t tcp_log_enable_flags; +extern uint32_t tcp_log_port; +extern int tcp_log_privacy; + +#define TCP_ENABLE_FLAG_LIST \ + X(TLEF_CONNECTION, 0x1, connection) \ + X(TLEF_RTT, 0x2, rtt) \ + X(TLEF_KEEP_ALIVE, 0x4, ka) \ + X(TLEF_DST_LOOPBACK, 0x10, loop) \ + X(TLEF_DST_LOCAL, 0x20, local) \ + X(TLEF_DST_GW, 0x40, gw) \ + X(TLEF_THF_SYN, 0x100, syn) \ + X(TLEF_THF_FIN, 0x200, fin) \ + X(TLEF_THF_RST, 0x400, rst) \ + X(TLEF_DROP_NECP, 0x1000, dropnecp) \ + X(TLEF_DROP_PCB, 0x2000, droppcb) \ + X(TLEF_DROP_PKT, 0x4000, droppkt) \ + +/* + * Flag values for tcp_log_enabled + */ +enum { +#define X(name, value, ...) name = value, + TCP_ENABLE_FLAG_LIST +#undef X +}; + +#define TLEF_MASK_DST (TLEF_DST_LOOPBACK | TLEF_DST_LOCAL | TLEF_DST_GW) + +#define TLEF_MASK_THF (TLEF_THF_SYN | TLEF_THF_FIN | TLEF_THF_RST) + +extern void tcp_log_connection_summary(struct tcpcb *tp); +extern void tcp_log_th_flags(void *hdr, struct tcphdr *th, struct tcpcb *tp, bool outgoing, struct ifnet *ifp); +extern void tcp_log_connection(struct tcpcb *tp, const char *event, int error); +extern void tcp_log_listen(struct tcpcb *tp, int error); +extern void tcp_log_drop_pcb(void *hdr, struct tcphdr *th, struct tcpcb *tp, bool outgoing, const char *reason); +extern void tcp_log_drop_pkt(void *hdr, struct tcphdr *th, struct ifnet *ifp, const char *reason); +extern void tcp_log_rtt_info(const char *func_name, int line_no, struct tcpcb *tp); +extern void tcp_log_rt_rtt(const char *func_name, int line_no, struct tcpcb *tp, struct rtentry *rt); +extern void tcp_log_rtt_change(const char *func_name, int line_no, struct tcpcb *tp, int old_srtt, int old_rttvar); +extern void tcp_log_keepalive(const char *func_name, int line_no, struct tcpcb *tp, int32_t idle_time); +extern void tcp_log_message(const char *func_name, int line_no, struct tcpcb *tp, const char *format, ...); + + +static inline bool +tcp_is_log_enabled(struct tcpcb *tp, uint32_t req_flags) +{ + if (tp == NULL || tp->t_inpcb == NULL) { + return false; + } + if (tcp_log_port > 0 && tcp_log_port <= IPPORT_HILASTAUTO) { + if (ntohs(tp->t_inpcb->inp_lport) != tcp_log_port && + ntohs(tp->t_inpcb->inp_fport) != tcp_log_port) { + return false; + } + } + /* + * First find out the kind of destination + */ + if (tp->t_log_flags == 0) { + if (tp->t_inpcb->inp_vflag & INP_IPV6) { + if (IN6_IS_ADDR_LOOPBACK(&tp->t_inpcb->in6p_laddr) || + IN6_IS_ADDR_LOOPBACK(&tp->t_inpcb->in6p_faddr)) { + tp->t_log_flags |= TLEF_DST_LOOPBACK; + } + } else { + if (ntohl(tp->t_inpcb->inp_laddr.s_addr) == INADDR_LOOPBACK || + ntohl(tp->t_inpcb->inp_faddr.s_addr) == INADDR_LOOPBACK) { + tp->t_log_flags |= TLEF_DST_LOOPBACK; + } + } + if (tp->t_log_flags == 0) { + if (tp->t_flags & TF_LOCAL) { + tp->t_log_flags |= TLEF_DST_LOCAL; + } else { + tp->t_log_flags |= TLEF_DST_GW; + } + } + } + /* + * Check separately the destination flags that are per TCP connection + * and the other functional flags that are global + */ + return (tp->t_log_flags & tcp_log_enable_flags & TLEF_MASK_DST) && + (tcp_log_enable_flags & (req_flags & ~TLEF_MASK_DST)); +} + +#define TCP_LOG_RTT_INFO(tp) if (tcp_is_log_enabled(tp, TLEF_RTT)) \ + tcp_log_rtt_info(__func__, __LINE__, (tp)) + +#define TCP_LOG_RTM_RTT(tp, rt) if (tcp_is_log_enabled(tp, TLEF_RTT)) \ + tcp_log_rt_rtt(__func__, __LINE__, (tp), (rt)) + +#define TCP_LOG_RTT_CHANGE(tp, old_srtt, old_rttvar) if (tcp_is_log_enabled(tp, TLEF_RTT)) \ + tcp_log_rtt_change(__func__, __LINE__, (tp), (old_srtt), (old_rttvar)) + +#define TCP_LOG_KEEP_ALIVE(tp, idle_time) if (tcp_is_log_enabled(tp, TLEF_KEEP_ALIVE)) \ + tcp_log_keepalive(__func__, __LINE__, (tp), (idle_time)) + +#define TCP_LOG_CONNECT(tp, outgoing, error) if (tcp_is_log_enabled(tp, TLEF_CONNECTION)) \ + tcp_log_connection((tp), (outgoing) ? "connect outgoing" : "connect incoming", (error)) + +#define TCP_LOG_LISTEN(tp, error) if (tcp_is_log_enabled(tp, TLEF_CONNECTION)) \ + tcp_log_listen((tp), (error)) + +#define TCP_LOG_ACCEPT(tp, error) if (tcp_is_log_enabled(tp, TLEF_CONNECTION)) \ + tcp_log_connection((tp), "accept", (error)) + +#define TCP_LOG_CONNECTION_SUMMARY(tp) if (tcp_is_log_enabled(tp, TLEF_CONNECTION)) \ + tcp_log_connection_summary((tp)) + +#define TCP_LOG_DROP_NECP(hdr, th, tp, outgoing) if (tcp_is_log_enabled(tp, TLEF_DROP_NECP)) \ + tcp_log_drop_pcb((hdr), (th), (tp), (outgoing), "NECP") + +#define TCP_LOG_DROP_PCB(hdr, th, tp, outgoing, reason) if (tcp_is_log_enabled(tp, TLEF_DROP_PCB)) \ + tcp_log_drop_pcb((hdr), (th), (tp), (outgoing), reason) + +#define TCP_LOG_TH_FLAGS(hdr, th, tp, outgoing, ifp) \ + if ((th) != NULL && ((th)->th_flags & (TH_SYN|TH_FIN|TH_RST))) \ + tcp_log_th_flags((hdr), (th), (tp), (outgoing), (ifp)) + +#define TCP_LOG_DROP_PKT(hdr, th, ifp, reason) \ + if ((th) != NULL && ((th->th_flags) & (TH_SYN|TH_FIN|TH_RST)) && \ + (tcp_log_enable_flags & TLEF_DROP_PKT)) \ + tcp_log_drop_pkt((hdr), (th), (ifp), (reason)) + +#define TCP_LOG(tp, format, ...) \ + tcp_log_message(__func__, __LINE__, tp, format, ## __VA_ARGS__) + + + +#endif /* BSD_KERNEL_RPIVATE */ + +#endif /* _NETINET_TCP_LOG_H_ */ diff --git a/bsd/netinet/tcp_lro.c b/bsd/netinet/tcp_lro.c index baacc13f3..8aef977e3 100644 --- a/bsd/netinet/tcp_lro.c +++ b/bsd/netinet/tcp_lro.c @@ -586,7 +586,6 @@ tcp_lro_process_pkt(struct mbuf *lro_mb, int drop_hdrlen) default: lck_mtx_unlock(&tcp_lro_lock); panic_plain("%s: unrecognized type %d", __func__, retval); - break; } if (ret_response == TCP_LRO_FLOW_NOTFOUND) { diff --git a/bsd/netinet/tcp_output.c b/bsd/netinet/tcp_output.c index 75e8634c0..6f63e40f7 100644 --- a/bsd/netinet/tcp_output.c +++ b/bsd/netinet/tcp_output.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2018 Apple Inc. All rights reserved. + * Copyright (c) 2000-2019 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -112,6 +112,7 @@ #if TCPDEBUG #include #endif +#include #include #include @@ -166,14 +167,14 @@ sysctl_change_ecn_setting SYSCTL_HANDLER_ARGS err = sysctl_io_number(req, tcp_ecn_outbound, sizeof(int32_t), &i, &changed); if (err != 0 || req->newptr == USER_ADDR_NULL) - return(err); + return err; if (changed) { if ((tcp_ecn_outbound == 0 || tcp_ecn_outbound == 1) && (i == 0 || i == 1)) { tcp_ecn_outbound = i; SYSCTL_SKMEM_UPDATE_FIELD(tcp.ecn_initiate_out, tcp_ecn_outbound); - return(err); + return err; } if (tcp_ecn_outbound == 2 && (i == 0 || i == 1)) { /* @@ -215,7 +216,7 @@ sysctl_change_ecn_setting SYSCTL_HANDLER_ARGS tcp_ecn_inbound = i; SYSCTL_SKMEM_UPDATE_FIELD(tcp.ecn_negotiate_in, tcp_ecn_inbound); } - return (err); + return err; } int tcp_ecn_outbound = 2; @@ -310,18 +311,15 @@ static int32_t tcp_tfo_check(struct tcpcb *tp, int32_t len) if (tp->t_flags & TF_NOOPT) goto fallback; - if ((so->so_flags1 & SOF1_DATA_AUTHENTICATED) && - !(tp->t_flagsext & TF_FASTOPEN_HEUR)) - return (len); - - if (!tcp_heuristic_do_tfo(tp)) { + if (!(tp->t_flagsext & TF_FASTOPEN_FORCE_ENABLE) && + !tcp_heuristic_do_tfo(tp)) { tp->t_tfo_stats |= TFO_S_HEURISTICS_DISABLE; tcpstat.tcps_tfo_heuristics_disable++; goto fallback; } if (so->so_flags1 & SOF1_DATA_AUTHENTICATED) - return (len); + return len; optlen += TCPOLEN_MAXSEG; @@ -330,7 +328,8 @@ static int32_t tcp_tfo_check(struct tcpcb *tp, int32_t len) #if MPTCP if ((so->so_flags & SOF_MP_SUBFLOW) && mptcp_enable && - tp->t_rxtshift <= mptcp_mpcap_retries) + (tp->t_rxtshift <= mptcp_mpcap_retries || + (tptomptp(tp)->mpt_mpte->mpte_flags & MPTE_FORCE_ENABLE))) optlen += sizeof(struct mptcp_mpcapable_opt_common) + sizeof(mptcp_key_t); #endif /* MPTCP */ @@ -349,7 +348,7 @@ static int32_t tcp_tfo_check(struct tcpcb *tp, int32_t len) cookie_len = tcp_cache_get_cookie_len(tp); if (cookie_len == 0) /* No cookie, so we request one */ - return (0); + return 0; /* There is not enough space for the cookie, so we cannot do TFO */ if (MAX_TCPOPTLEN - optlen < cookie_len) @@ -360,11 +359,11 @@ static int32_t tcp_tfo_check(struct tcpcb *tp, int32_t len) goto fallback; /* Ok, everything looks good. We can go on and do TFO */ - return (len); + return len; fallback: - tp->t_flagsext &= ~TF_FASTOPEN; - return (0); + tcp_disable_tfo(tp); + return 0; } /* Returns the number of bytes written to the TCP option-space */ @@ -377,7 +376,7 @@ tcp_tfo_write_cookie_rep(struct tcpcb *tp, unsigned optlen, u_char *opt) if ((MAX_TCPOPTLEN - optlen) < (TCPOLEN_FASTOPEN_REQ + TFO_COOKIE_LEN_DEFAULT)) - return (ret); + return ret; tcp_tfo_gen_cookie(tp->t_inpcb, out, sizeof(out)); @@ -391,7 +390,7 @@ tcp_tfo_write_cookie_rep(struct tcpcb *tp, unsigned optlen, u_char *opt) tp->t_tfo_stats |= TFO_S_COOKIE_SENT; tcpstat.tcps_tfo_cookie_sent++; - return (ret); + return ret; } static unsigned @@ -411,7 +410,7 @@ tcp_tfo_write_cookie(struct tcpcb *tp, unsigned optlen, int32_t len, tcpstat.tcps_tfo_syn_data_sent++; } - return (0); + return 0; } bp = opt + optlen; @@ -446,15 +445,15 @@ tcp_tfo_write_cookie(struct tcpcb *tp, unsigned optlen, int32_t len, } } - return (ret); + return ret; } static inline bool tcp_send_ecn_flags_on_syn(struct tcpcb *tp, struct socket *so) { - return(!((tp->ecn_flags & TE_SETUPSENT) || + return !((tp->ecn_flags & TE_SETUPSENT || (so->so_flags & SOF_MP_SUBFLOW) || - (tp->t_flagsext & TF_FASTOPEN))); + (tfo_enabled(tp)))); } void @@ -642,7 +641,7 @@ tcp_output(struct tcpcb *tp) * the subflow socket stays around until deleted. * No packets such as FINs must be sent after RST. */ - return (0); + return 0; } #endif /* MPTCP */ @@ -708,18 +707,18 @@ again: if (tp->t_state >= TCPS_CLOSE_WAIT) { tcp_drop(tp, EADDRNOTAVAIL); - return(EADDRNOTAVAIL); + return EADDRNOTAVAIL; } - /* Set retransmit timer if it wasn't set, + /* + * Set retransmit timer if it wasn't set, * reset Persist timer and shift register as the * advertised peer window may not be valid anymore */ - - if (!tp->t_timer[TCPT_REXMT]) { + if (tp->t_timer[TCPT_REXMT] == 0) { tp->t_timer[TCPT_REXMT] = OFFSET_FROM_START(tp, tp->t_rxtcur); - if (tp->t_timer[TCPT_PERSIST]) { + if (tp->t_timer[TCPT_PERSIST] != 0) { tp->t_timer[TCPT_PERSIST] = 0; tp->t_persist_stop = 0; TCP_RESET_REXMT_STATE(tp); @@ -733,10 +732,10 @@ again: /* drop connection if source address isn't available */ if (so->so_flags & SOF_NOADDRAVAIL) { tcp_drop(tp, EADDRNOTAVAIL); - return(EADDRNOTAVAIL); + return EADDRNOTAVAIL; } else { tcp_check_timer_state(tp); - return(0); /* silently ignore, keep data in socket: address may be back */ + return 0; /* silently ignore, keep data in socket: address may be back */ } } if (ia != NULL) @@ -766,7 +765,7 @@ again: * has been disabled) */ - if (!path_mtu_discovery || ((rt != NULL) && + if (!path_mtu_discovery || ((rt != NULL) && (!(rt->rt_flags & RTF_UP) || (rt->rt_rmx.rmx_locks & RTV_MTU)))) tp->t_flags &= ~TF_PMTUD; @@ -1011,7 +1010,7 @@ after_sack_rexmit: } KERNEL_DEBUG(DBG_FNC_TCP_OUTPUT | DBG_FUNC_END, 0,0,0,0,0); - return(0); + return 0; } } @@ -1029,6 +1028,12 @@ after_sack_rexmit: flags &= ~TH_FIN; } + /* + * Don't send a RST with data. + */ + if (flags & TH_RST) + len = 0; + if ((flags & TH_SYN) && tp->t_state <= TCPS_SYN_SENT && tfo_enabled(tp)) len = tcp_tfo_check(tp, len); @@ -1142,6 +1147,12 @@ after_sack_rexmit: } #if MPTCP + if (so->so_flags & SOF_MP_SUBFLOW && off < 0) { + os_log_error(mptcp_log_handle, "%s - %lx: offset is negative! len %d off %d\n", + __func__, (unsigned long)VM_KERNEL_ADDRPERM(tp->t_mpsub->mpts_mpte), + len, off); + } + if ((so->so_flags & SOF_MP_SUBFLOW) && !(tp->t_mpflags & TMPF_TCP_FALLBACK)) { int newlen = len; @@ -1170,11 +1181,12 @@ after_sack_rexmit: * option can be sent in one packet, reduce length to match * the contiguous MPTCP level. Set sendalot to send remainder. */ - if (len > 0) + if (len > 0 && off >= 0) { newlen = mptcp_adj_sendlen(so, off); + } + if (newlen < len) { len = newlen; - sendalot = 1; } } #endif /* MPTCP */ @@ -1217,23 +1229,23 @@ after_sack_rexmit: * next expected input). If the difference is at least two * max size segments, or at least 25% of the maximum possible * window, then want to send a window update to peer. - * Skip this if the connection is in T/TCP half-open state. */ recwin = tcp_sbspace(tp); -#if MPTCP - if (so->so_flags & SOF_MP_SUBFLOW) { + + if (!(so->so_flags & SOF_MP_SUBFLOW)) { + if (recwin < (int32_t)(so->so_rcv.sb_hiwat / 4) && + recwin < (int)tp->t_maxseg) { + recwin = 0; + } + } else { struct mptcb *mp_tp = tptomptp(tp); + struct socket *mp_so = mptetoso(mp_tp->mpt_mpte); - if (mp_tp != NULL) { - mpte_lock_assert_held(mp_tp->mpt_mpte); - recwin = imin(recwin, mptcp_sbspace(mp_tp)); + if (recwin < (int32_t)(mp_so->so_rcv.sb_hiwat / 4) && + recwin < (int)tp->t_maxseg) { + recwin = 0; } } -#endif - - if (recwin < (int32_t)(so->so_rcv.sb_hiwat / 4) && - recwin < (int)tp->t_maxseg) - recwin = 0; #if TRAFFIC_MGT if (tcp_recv_bg == 1 || IS_TCP_RECV_BG(so)) { @@ -1253,15 +1265,18 @@ after_sack_rexmit: if (recwin > (int32_t)(TCP_MAXWIN << tp->rcv_scale)) recwin = (int32_t)(TCP_MAXWIN << tp->rcv_scale); - /* - * MPTCP needs to be able to announce a smaller window than previously, - * because the other subflow may have filled up the available window- - * space. So we have to be able to go backwards and announce a smaller - * window. - */ - if (!(so->so_flags & SOF_MP_SUBFLOW) && - recwin < (int32_t)(tp->rcv_adv - tp->rcv_nxt)) - recwin = (int32_t)(tp->rcv_adv - tp->rcv_nxt); + if (!(so->so_flags & SOF_MP_SUBFLOW)) { + if (recwin < (int32_t)(tp->rcv_adv - tp->rcv_nxt)) { + recwin = (int32_t)(tp->rcv_adv - tp->rcv_nxt); + } + } else { + struct mptcb *mp_tp = tptomptp(tp); + + /* Don't remove what we announced at the MPTCP-layer */ + if (recwin < (int32_t)(mp_tp->mpt_rcvadv - (uint32_t)mp_tp->mpt_rcvnxt)) { + recwin = (int32_t)(mp_tp->mpt_rcvadv - (uint32_t)mp_tp->mpt_rcvnxt); + } + } /* * Sender silly window avoidance. We transmit under the following @@ -1283,6 +1298,16 @@ after_sack_rexmit: if (sack_rxmit) goto send; + /* + * If this here is the first segment after SYN/ACK and TFO + * is being used, then we always send it, regardless of Nagle,... + */ + if (tp->t_state == TCPS_SYN_RECEIVED && + tfo_enabled(tp) && + (tp->t_tfo_flags & TFO_F_COOKIE_VALID) && + tp->snd_nxt == tp->iss + 1) + goto send; + /* * Send new data on the connection only if it is * not flow controlled @@ -1449,7 +1474,7 @@ just_return: tcp_check_timer_state(tp); } KERNEL_DEBUG(DBG_FNC_TCP_OUTPUT | DBG_FUNC_END, 0,0,0,0,0); - return (0); + return 0; send: /* @@ -2024,12 +2049,12 @@ send: goto out; } if (MHLEN < (hdrlen + max_linkhdr)) { - MCLGET(m, M_DONTWAIT); - if ((m->m_flags & M_EXT) == 0) { - m_freem(m); - error = ENOBUFS; - goto out; - } + MCLGET(m, M_DONTWAIT); + if ((m->m_flags & M_EXT) == 0) { + m_freem(m); + error = ENOBUFS; + goto out; + } } m->m_data += max_linkhdr; m->m_len = hdrlen; @@ -2134,8 +2159,20 @@ send: } th->th_flags = flags; th->th_win = htons((u_short) (recwin>>tp->rcv_scale)); - if (recwin > 0 && SEQ_LT(tp->rcv_adv, tp->rcv_nxt + recwin)) - tp->rcv_adv = tp->rcv_nxt + recwin; + if (!(so->so_flags & SOF_MP_SUBFLOW)) { + if (recwin > 0 && SEQ_LT(tp->rcv_adv, tp->rcv_nxt + recwin)) { + tp->rcv_adv = tp->rcv_nxt + recwin; + } + } else { + struct mptcb *mp_tp = tptomptp(tp); + if (recwin > 0) { + tp->rcv_adv = tp->rcv_nxt + recwin; + } + + if (recwin > 0 && SEQ_LT(mp_tp->mpt_rcvadv, (uint32_t)mp_tp->mpt_rcvnxt + recwin)) { + mp_tp->mpt_rcvadv = (uint32_t)mp_tp->mpt_rcvnxt + recwin; + } + } /* * Adjust the RXWIN0SENT flag - indicate that we have advertised @@ -2149,6 +2186,7 @@ send: tp->t_flags |= TF_RXWIN0SENT; else tp->t_flags &= ~TF_RXWIN0SENT; + if (SEQ_GT(tp->snd_up, tp->snd_nxt)) { th->th_urp = htons((u_short)(tp->snd_up - tp->snd_nxt)); th->th_flags |= TH_URG; @@ -2402,7 +2440,8 @@ timer: necp_kernel_policy_id policy_id; necp_kernel_policy_id skip_policy_id; u_int32_t route_rule_id; - if (!necp_socket_is_allowed_to_send_recv(inp, &policy_id, &route_rule_id, &skip_policy_id)) { + if (!necp_socket_is_allowed_to_send_recv(inp, NULL, &policy_id, &route_rule_id, &skip_policy_id)) { + TCP_LOG_DROP_NECP(isipv6 ? (void *)ip6 : (void *)ip, th, tp, true); m_freem(m); error = EHOSTUNREACH; goto out; @@ -2470,6 +2509,10 @@ timer: (void) m_set_service_class(m, so_tc2msc(sotc)); } + TCP_LOG_TH_FLAGS(isipv6 ? (void *)ip6 : (void *)ip, th, tp, true, + inp->inp_last_outifp != NULL ? inp->inp_last_outifp : + inp->inp_boundifp); + tp->t_pktlist_sentlen += len; tp->t_lastchain++; @@ -2493,9 +2536,9 @@ timer: tp->t_pktlist_head = tp->t_pktlist_tail = m; } - if ((lro_ackmore) && (!sackoptlen) && (!tp->t_timer[TCPT_PERSIST]) && - ((th->th_flags & TH_ACK) == TH_ACK) && (!len) && - (tp->t_state == TCPS_ESTABLISHED)) { + if (lro_ackmore && !sackoptlen && tp->t_timer[TCPT_PERSIST] == 0 && + (th->th_flags & TH_ACK) == TH_ACK && len == 0 && + tp->t_state == TCPS_ESTABLISHED) { /* For a pure ACK, see if you need to send more of them */ mnext = tcp_send_lroacks(tp, m, th); if (mnext) { @@ -2553,7 +2596,7 @@ timer: (tp->t_flags & TF_CLOSING)) { tp->t_flags &= ~TF_CLOSING; (void) tcp_close(tp); - return (0); + return 0; } } else { error = 0; @@ -2606,8 +2649,8 @@ out: * when we failed to send a segment that can be * retransmitted (i.e. not pure ack or rst) */ - if (!tp->t_timer[TCPT_REXMT] && - !tp->t_timer[TCPT_PERSIST] && + if (tp->t_timer[TCPT_REXMT] == 0 && + tp->t_timer[TCPT_PERSIST] == 0 && (len != 0 || (flags & (TH_SYN | TH_FIN)) != 0 || so->so_snd.sb_cc > 0)) tp->t_timer[TCPT_REXMT] = @@ -2618,7 +2661,7 @@ out: KERNEL_DEBUG(DBG_FNC_TCP_OUTPUT | DBG_FUNC_END, 0,0,0,0,0); tcp_ccdbg_trace(tp, NULL, TCP_CC_OUTPUT_ERROR); - return (0); + return 0; } if (error == EMSGSIZE) { /* @@ -2654,7 +2697,7 @@ out: } tcp_check_timer_state(tp); KERNEL_DEBUG(DBG_FNC_TCP_OUTPUT | DBG_FUNC_END, 0,0,0,0,0); - return (error); + return error; } tcpstat.tcps_sndtotal++; @@ -2664,7 +2707,8 @@ out: goto again; tcp_check_timer_state(tp); - return (0); + + return 0; } static int @@ -2732,6 +2776,14 @@ tcp_ip_output(struct socket *so, struct tcpcb *tp, struct mbuf *pkt, ipoa.ipoa_flags |= IPOAF_NO_EXPENSIVE; } + if (INP_NO_CONSTRAINED(inp)) { +#if INET6 + if (isipv6) + ip6oa.ip6oa_flags |= IP6OAF_NO_CONSTRAINED; + else +#endif /* INET6 */ + ipoa.ipoa_flags |= IPOAF_NO_CONSTRAINED; + } if (INP_AWDL_UNRESTRICTED(inp)) { #if INET6 if (isipv6) @@ -2822,7 +2874,6 @@ tcp_ip_output(struct socket *so, struct tcpcb *tp, struct mbuf *pkt, #endif ; // I'm important, not extraneous - while (pkt != NULL) { struct mbuf *npkt = pkt->m_nextpkt; @@ -2831,7 +2882,7 @@ tcp_ip_output(struct socket *so, struct tcpcb *tp, struct mbuf *pkt, /* * If we are not chaining, make sure to set the packet * list count to 0 so that IP takes the right path; - * this is important for cases such as IPSec where a + * this is important for cases such as IPsec where a * single mbuf might result in multiple mbufs as part * of the encapsulation. If a non-zero count is passed * down to IP, the head of the chain might change and @@ -2901,22 +2952,33 @@ tcp_ip_output(struct socket *so, struct tcpcb *tp, struct mbuf *pkt, error = ENOBUFS; VERIFY(inp->inp_sndinprog_cnt > 0); - if ( --inp->inp_sndinprog_cnt == 0) + if ( --inp->inp_sndinprog_cnt == 0) { inp->inp_flags &= ~(INP_FC_FEEDBACK); + if (inp->inp_sndingprog_waiters > 0) { + wakeup(&inp->inp_sndinprog_cnt); + } + } #if INET6 if (isipv6) { - if (ro6.ro_rt != NULL) + /* + * When an NECP IP tunnel policy forces the outbound interface, + * ip6_output_list() informs the transport layer what is the actual + * outgoing interface + */ + if (ip6oa.ip6oa_flags & IP6OAF_BOUND_IF) { + outif = ifindex2ifnet[ip6oa.ip6oa_boundif]; + } else if (ro6.ro_rt != NULL) { outif = ro6.ro_rt->rt_ifp; + } } else #endif /* INET6 */ if (ro.ro_rt != NULL) outif = ro.ro_rt->rt_ifp; - if (outif != NULL && outif != inp->inp_last_outifp && - so->so_snd.sb_cc > 0) { + if (outif != NULL && outif != inp->inp_last_outifp) { /* Update the send byte count */ - if (so->so_snd.sb_flags & SB_SNDBYTE_CNT) { + if (so->so_snd.sb_cc > 0 && so->so_snd.sb_flags & SB_SNDBYTE_CNT) { inp_decr_sndbytes_total(so, so->so_snd.sb_cc); inp_decr_sndbytes_allunsent(so, tp->snd_una); so->so_snd.sb_flags &= ~SB_SNDBYTE_CNT; @@ -2926,7 +2988,7 @@ tcp_ip_output(struct socket *so, struct tcpcb *tp, struct mbuf *pkt, } if (error != 0 && ifdenied && - (INP_NO_CELLULAR(inp) || INP_NO_EXPENSIVE(inp))) + (INP_NO_CELLULAR(inp) || INP_NO_EXPENSIVE(inp) || INP_NO_CONSTRAINED(inp))) soevent(so, (SO_FILT_HINT_LOCKED|SO_FILT_HINT_IFDENIED)); @@ -2946,7 +3008,7 @@ tcp_ip_output(struct socket *so, struct tcpcb *tp, struct mbuf *pkt, tcp_getrt_rtt(tp, tp->t_inpcb->in6p_route.ro_rt); tp->t_timer[TCPT_REXMT] = OFFSET_FROM_START(tp, tp->t_rxtcur); } - return (error); + return error; } int tcptv_persmin_val = TCPTV_PERSMIN; @@ -2962,9 +3024,9 @@ tcp_setpersist(struct tcpcb *tp) * see rdar://5805356 */ - if ((tp->t_persist_timeout != 0) && - (tp->t_timer[TCPT_PERSIST] == 0) && - (tp->t_persist_stop == 0)) { + if (tp->t_persist_timeout != 0 && + tp->t_timer[TCPT_PERSIST] == 0 && + tp->t_persist_stop == 0) { tp->t_persist_stop = tcp_now + tp->t_persist_timeout; } @@ -3097,7 +3159,7 @@ tcp_recv_throttle (struct tcpcb *tp) * in that state until rtt comes closer to base rtt */ if (tp->t_flagsext & TF_RECV_THROTTLE) - return (1); + return 1; base_rtt = get_base_rtt(tp); @@ -3123,9 +3185,9 @@ tcp_recv_throttle (struct tcpcb *tp) tcp_recv_throttle_minwin); sbrcv->sb_idealsize = newsize; } - return (1); + return 1; } else { - return (0); + return 0; } } } @@ -3135,7 +3197,7 @@ tcp_recv_throttle (struct tcpcb *tp) * measurement. Use IPDV in this case. */ if (tp->acc_iaj > tcp_acc_iaj_react_limit) - return (1); + return 1; - return (0); + return 0; } diff --git a/bsd/netinet/tcp_subr.c b/bsd/netinet/tcp_subr.c index 67381dffe..20c8a7b61 100644 --- a/bsd/netinet/tcp_subr.c +++ b/bsd/netinet/tcp_subr.c @@ -91,6 +91,7 @@ #include #include #include +#include #define tcp_minmssoverload fring #define _IP_VHL @@ -129,6 +130,8 @@ #if TCPDEBUG #include #endif +#include + #include #if IPSEC @@ -153,6 +156,8 @@ #include #include #include +#include +#include #include @@ -257,6 +262,16 @@ SYSCTL_SKMEM_TCP_INT(OID_AUTO, randomize_ports, CTLFLAG_RW | CTLFLAG_LOCKED, SYSCTL_SKMEM_TCP_INT(OID_AUTO, win_scale_factor, CTLFLAG_RW | CTLFLAG_LOCKED, __private_extern__ int, tcp_win_scale, 3, "Window scaling factor"); +#if (DEVELOPMENT || DEBUG) +SYSCTL_SKMEM_TCP_INT(OID_AUTO, init_rtt_from_cache, + CTLFLAG_RW | CTLFLAG_LOCKED, static int, tcp_init_rtt_from_cache, 1, + "Initalize RTT from route cache"); +#else +SYSCTL_SKMEM_TCP_INT(OID_AUTO, init_rtt_from_cache, + CTLFLAG_RD | CTLFLAG_LOCKED, static int, tcp_init_rtt_from_cache, 1, + "Initalize RTT from route cache"); +#endif /* (DEVELOPMENT || DEBUG) */ + static void tcp_cleartaocache(void); static void tcp_notify(struct inpcb *, int); @@ -307,6 +322,8 @@ struct inp_tp { int get_inpcb_str_size(void); int get_tcp_str_size(void); +os_log_t tcp_mpkl_log_object = NULL; + static void tcpcb_to_otcpcb(struct tcpcb *, struct otcpcb *); static lck_attr_t *tcp_uptime_mtx_attr = NULL; @@ -462,6 +479,7 @@ tcp_init(struct protosw *pp, struct domain *dp) static int tcp_initialized = 0; vm_size_t str_size; struct inpcbinfo *pcbinfo; + uint32_t logging_config; VERIFY((pp->pr_flags & (PR_INITIALIZED | PR_ATTACHED)) == PR_ATTACHED); @@ -639,6 +657,18 @@ tcp_init(struct protosw *pp, struct domain *dp) /* Initialize TCP Cache */ tcp_cache_init(); + tcp_mpkl_log_object = MPKL_CREATE_LOGOBJECT("com.apple.xnu.tcp"); + if (tcp_mpkl_log_object == NULL) { + panic("MPKL_CREATE_LOGOBJECT failed"); + } + + logging_config = atm_get_diagnostic_config(); + if (logging_config & 0x80000000) { + tcp_log_privacy = 1; + } + + PE_parse_boot_argn("tcp_log", &tcp_log_enable_flags, sizeof(tcp_log_enable_flags)); + /* * If more than 60 MB of mbuf pool is available, increase the * maximum allowed receive and send socket buffer size. @@ -875,6 +905,9 @@ tcp_respond(struct tcpcb *tp, void *ipgen, struct tcphdr *th, struct mbuf *m, m->m_len = tlen; m->m_pkthdr.len = tlen; m->m_pkthdr.rcvif = 0; + if (tra->keep_alive) { + m->m_pkthdr.pkt_flags |= PKTF_KEEPALIVE; + } #if CONFIG_MACF_NET if (tp != NULL && tp->t_inpcb != NULL) { /* @@ -973,6 +1006,9 @@ tcp_respond(struct tcpcb *tp, void *ipgen, struct tcphdr *th, struct mbuf *m, if (tra->noexpensive) { ip6oa.ip6oa_flags |= IP6OAF_NO_EXPENSIVE; } + if (tra->noconstrained) { + ip6oa.ip6oa_flags |= IP6OAF_NO_CONSTRAINED; + } if (tra->awdl_unrestricted) { ip6oa.ip6oa_flags |= IP6OAF_AWDL_UNRESTRICTED; } @@ -1017,6 +1053,9 @@ tcp_respond(struct tcpcb *tp, void *ipgen, struct tcphdr *th, struct mbuf *m, if (tra->noexpensive) { ipoa.ipoa_flags |= IPOAF_NO_EXPENSIVE; } + if (tra->noconstrained) { + ipoa.ipoa_flags |= IPOAF_NO_CONSTRAINED; + } if (tra->awdl_unrestricted) { ipoa.ipoa_flags |= IPOAF_AWDL_UNRESTRICTED; } @@ -1123,6 +1162,7 @@ tcp_newtcpcb(struct inpcb *inp) tp->snd_ssthresh_prev = TCP_MAXWIN << TCP_MAX_WINSHIFT; tp->t_rcvtime = tcp_now; tp->tentry.timer_start = tcp_now; + tp->rcv_unackwin = tcp_now; tp->t_persist_timeout = tcp_max_persist_timeout; tp->t_persist_stop = 0; tp->t_flagsext |= TF_RCVUNACK_WAITSS; @@ -1177,6 +1217,9 @@ tcp_drop(struct tcpcb *tp, int errno) errno = tp->t_softerror; } so->so_error = errno; + + TCP_LOG_CONNECTION_SUMMARY(tp); + return tcp_close(tp); } @@ -1186,7 +1229,9 @@ tcp_getrt_rtt(struct tcpcb *tp, struct rtentry *rt) u_int32_t rtt = rt->rt_rmx.rmx_rtt; int isnetlocal = (tp->t_flags & TF_LOCAL); - if (rtt != 0) { + TCP_LOG_RTM_RTT(tp, rt); + + if (rtt != 0 && tcp_init_rtt_from_cache != 0) { /* * XXX the lock bit for RTT indicates that the value * is also a minimum value; this is subject to time. @@ -1197,9 +1242,11 @@ tcp_getrt_rtt(struct tcpcb *tp, struct rtentry *rt) tp->t_rttmin = isnetlocal ? tcp_TCPTV_MIN : TCPTV_REXMTMIN; } + tp->t_srtt = rtt / (RTM_RTTUNIT / (TCP_RETRANSHZ * TCP_RTT_SCALE)); tcpstat.tcps_usedrtt++; + if (rt->rt_rmx.rmx_rttvar) { tp->t_rttvar = rt->rt_rmx.rmx_rttvar / (RTM_RTTUNIT / (TCP_RETRANSHZ * TCP_RTTVAR_SCALE)); @@ -1209,11 +1256,19 @@ tcp_getrt_rtt(struct tcpcb *tp, struct rtentry *rt) tp->t_rttvar = tp->t_srtt * TCP_RTTVAR_SCALE / TCP_RTT_SCALE; } + + /* + * The RTO formula in the route metric case is based on: + * 4 * srtt + 8 * rttvar + * modulo the min, max and slop + */ TCPT_RANGESET(tp->t_rxtcur, ((tp->t_srtt >> 2) + tp->t_rttvar) >> 1, tp->t_rttmin, TCPTV_REXMTMAX, TCP_ADD_REXMTSLOP(tp)); } + + TCP_LOG_RTT_INFO(tp); } static inline void @@ -1415,6 +1470,8 @@ tcp_close(struct tcpcb *tp) return NULL; } + TCP_LOG_CONNECTION_SUMMARY(tp); + DTRACE_TCP4(state__change, void, NULL, struct inpcb *, inp, struct tcpcb *, tp, int32_t, TCPS_CLOSED); @@ -1441,6 +1498,7 @@ tcp_close(struct tcpcb *tp) */ if (tp->t_rttupdated >= 16) { u_int32_t i = 0; + bool log_rtt = false; #if INET6 if (isipv6) { @@ -1481,6 +1539,7 @@ tcp_close(struct tcpcb *tp) rt->rt_rmx.rmx_rtt = i; } tcpstat.tcps_cachedrtt++; + log_rtt = true; } if ((rt->rt_rmx.rmx_locks & RTV_RTTVAR) == 0) { i = tp->t_rttvar * @@ -1492,6 +1551,11 @@ tcp_close(struct tcpcb *tp) rt->rt_rmx.rmx_rttvar = i; } tcpstat.tcps_cachedrttvar++; + log_rtt = true; + } + if (log_rtt) { + TCP_LOG_RTM_RTT(tp, rt); + TCP_LOG_RTT_INFO(tp); } /* * The old comment here said: @@ -1597,6 +1661,11 @@ no_valid_rt: inp->inp_lport, inp->inp_fport); tp->t_flagsext &= ~TF_LRO_OFFLOADED; } + /* + * Make sure to clear the TCP Keep Alive Offload as it is + * ref counted on the interface + */ + tcp_clear_keep_alive_offload(so); /* * If this is a socket that does not want to wakeup the device @@ -1742,11 +1811,6 @@ tcp_notify(struct inpcb *inp, int error) } else { tp->t_softerror = error; } -#if 0 - wakeup((caddr_t) &so->so_timeo); - sorwakeup(so); - sowwakeup(so); -#endif } struct bwmeas * @@ -2229,9 +2293,9 @@ tcp_handle_msgsize(struct ip *ip, struct inpcb *inp) u_short ifscope = IFSCOPE_NONE; int mtu; struct sockaddr_in icmpsrc = { - sizeof(struct sockaddr_in), - AF_INET, 0, { 0 }, - { 0, 0, 0, 0, 0, 0, 0, 0 } + .sin_len = sizeof(struct sockaddr_in), + .sin_family = AF_INET, .sin_port = 0, .sin_addr = { .s_addr = 0 }, + .sin_zero = { 0, 0, 0, 0, 0, 0, 0, 0 } }; struct icmp *icp = NULL; @@ -2699,13 +2763,20 @@ tcp_mtudisc( #if INET6 int isipv6 = (tp->t_inpcb->inp_vflag & INP_IPV6) != 0; + /* + * Nothing left to send after the socket is defunct or TCP is in the closed state + */ + if ((so->so_state & SS_DEFUNCT) || (tp != NULL && tp->t_state == TCPS_CLOSED)) { + return; + } + if (isipv6) { protoHdrOverhead = sizeof(struct ip6_hdr) + sizeof(struct tcphdr); } #endif /* INET6 */ - if (tp) { + if (tp != NULL) { #if INET6 if (isipv6) { rt = tcp_rtlookup6(inp, IFSCOPE_NONE); @@ -3103,18 +3174,16 @@ retry: if (so->so_pcb != NULL) { if (so->so_flags & SOF_MP_SUBFLOW) { struct mptcb *mp_tp = tptomptp(sototcpcb(so)); - VERIFY(mp_tp); - - mpte_lock_assert_notheld(mp_tp->mpt_mpte); + struct socket *mp_so = mptetoso(mp_tp->mpt_mpte); - mpte_lock(mp_tp->mpt_mpte); + socket_lock(mp_so, refcount); /* * Check if we became non-MPTCP while waiting for the lock. * If yes, we have to retry to grab the right lock. */ if (!(so->so_flags & SOF_MP_SUBFLOW)) { - mpte_unlock(mp_tp->mpt_mpte); + socket_unlock(mp_so, refcount); goto retry; } } else { @@ -3186,11 +3255,11 @@ tcp_unlock(struct socket *so, int refcount, void *lr) if (so->so_flags & SOF_MP_SUBFLOW) { struct mptcb *mp_tp = tptomptp(sototcpcb(so)); + struct socket *mp_so = mptetoso(mp_tp->mpt_mpte); - VERIFY(mp_tp); - mpte_lock_assert_held(mp_tp->mpt_mpte); + socket_lock_assert_owned(mp_so); - mpte_unlock(mp_tp->mpt_mpte); + socket_unlock(mp_so, refcount); } else { LCK_MTX_ASSERT(&((struct inpcb *)so->so_pcb)->inpcb_mtx, LCK_MTX_ASSERT_OWNED); @@ -3213,8 +3282,9 @@ tcp_getlock(struct socket *so, int flags) if (so->so_flags & SOF_MP_SUBFLOW) { struct mptcb *mp_tp = tptomptp(sototcpcb(so)); + struct socket *mp_so = mptetoso(mp_tp->mpt_mpte); - return mpte_getlock(mp_tp->mpt_mpte, flags); + return mp_so->so_proto->pr_getlock(mp_so, flags); } else { return &inp->inpcb_mtx; } @@ -3272,6 +3342,13 @@ tcp_sbspace(struct tcpcb *tp) int32_t space; int32_t pending = 0; + if (so->so_flags & SOF_MP_SUBFLOW) { + /* We still need to grow TCP's buffer to have a BDP-estimate */ + tcp_sbrcv_grow_rwin(tp, sb); + + return mptcp_sbspace(tptomptp(tp)); + } + tcp_sbrcv_grow_rwin(tp, sb); /* hiwat might have changed */ @@ -3390,7 +3467,7 @@ void calculate_tcp_clock(void) { struct timeval tv = tcp_uptime; - struct timeval interval = {0, TCP_RETRANSHZ_TO_USEC}; + struct timeval interval = {.tv_sec = 0, .tv_usec = TCP_RETRANSHZ_TO_USEC}; struct timeval now, hold_now; uint32_t incr = 0; @@ -3929,6 +4006,10 @@ tcp_fill_keepalive_offload_frames(ifnet_t ifp, tcp_keepidle; frame->keep_cnt = TCP_CONN_KEEPCNT(tp); frame->keep_retry = TCP_CONN_KEEPINTVL(tp); + if (so->so_options & SO_NOWAKEFROMSLEEP) { + frame->flags |= + IFNET_KEEPALIVE_OFFLOAD_FLAG_NOWAKEFROMSLEEP; + } frame->local_port = ntohs(inp->inp_lport); frame->remote_port = ntohs(inp->inp_fport); frame->local_seq = tp->snd_nxt; @@ -3995,6 +4076,110 @@ tcp_fill_keepalive_offload_frames(ifnet_t ifp, *used_frames_count = frame_index; } +static bool +inp_matches_kao_frame(ifnet_t ifp, struct ifnet_keepalive_offload_frame *frame, + struct inpcb *inp) +{ + if (inp->inp_ppcb == NULL) { + return false; + } + /* Release the want count */ + if (in_pcb_checkstate(inp, WNT_RELEASE, 1) == WNT_STOPUSING) { + return false; + } + if (inp->inp_last_outifp == NULL || + inp->inp_last_outifp->if_index != ifp->if_index) { + return false; + } + if (frame->local_port != ntohs(inp->inp_lport) || + frame->remote_port != ntohs(inp->inp_fport)) { + return false; + } + if (inp->inp_vflag & INP_IPV4) { + if (memcmp(&inp->inp_laddr, frame->local_addr, + sizeof(struct in_addr)) != 0 || + memcmp(&inp->inp_faddr, frame->remote_addr, + sizeof(struct in_addr)) != 0) { + return false; + } + } else if (inp->inp_vflag & INP_IPV6) { + if (memcmp(&inp->inp_laddr, frame->local_addr, + sizeof(struct in6_addr)) != 0 || + memcmp(&inp->inp_faddr, frame->remote_addr, + sizeof(struct in6_addr)) != 0) { + return false; + } + } else { + return false; + } + return true; +} + +int +tcp_notify_kao_timeout(ifnet_t ifp, + struct ifnet_keepalive_offload_frame *frame) +{ + struct inpcb *inp = NULL; + struct socket *so = NULL; + bool found = false; + + /* + * Unlock the list before posting event on the matching socket + */ + lck_rw_lock_shared(tcbinfo.ipi_lock); + + LIST_FOREACH(inp, tcbinfo.ipi_listhead, inp_list) { + if ((so = inp->inp_socket) == NULL || + (so->so_state & SS_DEFUNCT)) { + continue; + } + if (!(inp->inp_flags2 & INP2_KEEPALIVE_OFFLOAD)) { + continue; + } + if (!(inp->inp_vflag & (INP_IPV4 | INP_IPV6))) { + continue; + } + if (inp->inp_ppcb == NULL || + in_pcb_checkstate(inp, WNT_ACQUIRE, 0) == WNT_STOPUSING) { + continue; + } + socket_lock(so, 1); + if (inp_matches_kao_frame(ifp, frame, inp)) { + /* + * Keep the matching socket locked + */ + found = true; + break; + } + socket_unlock(so, 1); + } + lck_rw_done(tcbinfo.ipi_lock); + + if (found) { + ASSERT(inp != NULL); + ASSERT(so != NULL); + ASSERT(so == inp->inp_socket); + /* + * Drop the TCP connection like tcptimers() does + */ + struct tcpcb *tp = inp->inp_ppcb; + + tcpstat.tcps_keepdrops++; + postevent(so, 0, EV_TIMEOUT); + soevent(so, + (SO_FILT_HINT_LOCKED | SO_FILT_HINT_TIMEOUT)); + tp = tcp_drop(tp, ETIMEDOUT); + + tcpstat.tcps_ka_offload_drops++; + os_log_info(OS_LOG_DEFAULT, "%s: dropped lport %u fport %u\n", + __func__, frame->local_port, frame->remote_port); + + socket_unlock(so, 1); + } + + return 0; +} + errno_t tcp_notify_ack_id_valid(struct tcpcb *tp, struct socket *so, u_int32_t notify_id) diff --git a/bsd/netinet/tcp_timer.c b/bsd/netinet/tcp_timer.c index fda0f86f6..784c0e879 100644 --- a/bsd/netinet/tcp_timer.c +++ b/bsd/netinet/tcp_timer.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2017 Apple Inc. All rights reserved. + * Copyright (c) 2000-2019 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -84,6 +84,7 @@ #include #include #include +#include #if INET6 #include #endif @@ -102,6 +103,8 @@ #if TCPDEBUG #include #endif +#include + #include #include #include @@ -128,7 +131,10 @@ sysctl_msec_to_ticks SYSCTL_HANDLER_ARGS int error, s, tt; tt = *(int *)arg1; - s = tt * 1000 / TCP_RETRANSHZ;; + if (tt < 0 || tt >= INT_MAX / 1000) { + return EINVAL; + } + s = tt * 1000 / TCP_RETRANSHZ; error = sysctl_handle_int(oidp, &s, 0, req); if (error || !req->newptr) { @@ -266,6 +272,13 @@ SYSCTL_SKMEM_TCP_INT(OID_AUTO, pmtud_blackhole_mss, CTLFLAG_RW | CTLFLAG_LOCKED, int, tcp_pmtud_black_hole_mss, 1200, "Path MTU Discovery Black Hole Detection lowered MSS"); +#if (DEBUG || DEVELOPMENT) +int tcp_probe_if_fix_port = 0; +SYSCTL_INT(_net_inet_tcp, OID_AUTO, probe_if_fix_port, + CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, + &tcp_probe_if_fix_port, 0, ""); +#endif /* (DEBUG || DEVELOPMENT) */ + static u_int32_t tcp_mss_rec_medium = 1200; static u_int32_t tcp_mss_rec_low = 512; @@ -477,7 +490,7 @@ inline int32_t timer_diff(uint32_t t1, uint32_t toff1, uint32_t t2, uint32_t toff2) { return (int32_t)((t1 + toff1) - (t2 + toff2)); -}; +} /* * Add to tcp timewait list, delay is given in milliseconds. @@ -565,7 +578,19 @@ tcp_garbage_collect(struct inpcb *inp, int istimewait) active = TRUE; goto out; } + if (mpsotomppcb(mp_so)->mpp_inside > 0) { + os_log(mptcp_log_handle, "%s - %lx: Still inside %d usecount %d\n", __func__, + (unsigned long)VM_KERNEL_ADDRPERM(mpsotompte(mp_so)), + mpsotomppcb(mp_so)->mpp_inside, + mp_so->so_usecount); + socket_unlock(mp_so, 0); + mp_so = NULL; + active = TRUE; + goto out; + } + /* We call socket_unlock with refcount further below */ mp_so->so_usecount++; + tptomptp(tp)->mpt_mpte->mpte_mppcb->mpp_inside++; } /* @@ -1004,6 +1029,7 @@ retransmit_packet: * is spurious. */ tcp_rexmt_save_state(tp); + tcp_ccdbg_trace(tp, NULL, TCP_CC_FIRST_REXMT); } #if MPTCP if ((tp->t_rxtshift >= mptcp_fail_thresh) && @@ -1012,10 +1038,13 @@ retransmit_packet: mptcp_act_on_txfail(so); } - if (so->so_flags & SOF_MP_SUBFLOW) { + if (TCPS_HAVEESTABLISHED(tp->t_state) && + (so->so_flags & SOF_MP_SUBFLOW)) { struct mptses *mpte = tptomptp(tp)->mpt_mpte; - mptcp_check_subflows_and_add(mpte); + if (mpte->mpte_svctype == MPTCP_SVCTYPE_HANDOVER) { + mptcp_check_subflows_and_add(mpte); + } } #endif /* MPTCP */ @@ -1049,11 +1078,13 @@ retransmit_packet: tp->t_flagsext &= ~(TF_DELAY_RECOVERY); } - if (tp->t_state == TCPS_SYN_RECEIVED) { + if (!(tp->t_flagsext & TF_FASTOPEN_FORCE_ENABLE) && + tp->t_state == TCPS_SYN_RECEIVED) { tcp_disable_tfo(tp); } - if (!(tp->t_tfo_flags & TFO_F_HEURISTIC_DONE) && + if (!(tp->t_flagsext & TF_FASTOPEN_FORCE_ENABLE) && + !(tp->t_tfo_flags & TFO_F_HEURISTIC_DONE) && (tp->t_tfo_stats & TFO_S_SYN_DATA_SENT) && !(tp->t_tfo_flags & TFO_F_NO_SNDPROBING) && ((tp->t_state != TCPS_SYN_SENT && tp->t_rxtshift > 1) || @@ -1070,6 +1101,8 @@ retransmit_packet: tcp_heuristic_tfo_middlebox(tp); so->so_error = ENODATA; + soevent(so, + (SO_FILT_HINT_LOCKED | SO_FILT_HINT_MP_SUB_ERROR)); sorwakeup(so); sowwakeup(so); @@ -1077,13 +1110,16 @@ retransmit_packet: tcpstat.tcps_tfo_sndblackhole++; } - if (!(tp->t_tfo_flags & TFO_F_HEURISTIC_DONE) && + if (!(tp->t_flagsext & TF_FASTOPEN_FORCE_ENABLE) && + !(tp->t_tfo_flags & TFO_F_HEURISTIC_DONE) && (tp->t_tfo_stats & TFO_S_SYN_DATA_ACKED) && tp->t_rxtshift > 3) { if (TSTMP_GT(tp->t_sndtime - 10 * TCP_RETRANSHZ, tp->t_rcvtime)) { tcp_heuristic_tfo_middlebox(tp); so->so_error = ENODATA; + soevent(so, + (SO_FILT_HINT_LOCKED | SO_FILT_HINT_MP_SUB_ERROR)); sorwakeup(so); sowwakeup(so); } @@ -1092,12 +1128,12 @@ retransmit_packet: if (tp->t_state == TCPS_SYN_SENT) { rexmt = TCP_REXMTVAL(tp) * tcp_syn_backoff[tp->t_rxtshift]; tp->t_stat.synrxtshift = tp->t_rxtshift; + tp->t_stat.rxmitsyns++; /* When retransmitting, disable TFO */ if (tfo_enabled(tp) && - (!(so->so_flags1 & SOF1_DATA_AUTHENTICATED) || - (tp->t_flagsext & TF_FASTOPEN_HEUR))) { - tp->t_flagsext &= ~TF_FASTOPEN; + !(tp->t_flagsext & TF_FASTOPEN_FORCE_ENABLE)) { + tcp_disable_tfo(tp); tp->t_tfo_flags |= TFO_F_SYN_LOSS; } } else { @@ -1108,6 +1144,8 @@ retransmit_packet: TCP_ADD_REXMTSLOP(tp)); tp->t_timer[TCPT_REXMT] = OFFSET_FROM_START(tp, tp->t_rxtcur); + TCP_LOG_RTT_INFO(tp); + if (INP_WAIT_FOR_IF_FEEDBACK(tp->t_inpcb)) { goto fc_output; } @@ -1347,8 +1385,10 @@ fc_output: bzero(&tra, sizeof(tra)); tra.nocell = INP_NO_CELLULAR(inp); tra.noexpensive = INP_NO_EXPENSIVE(inp); + tra.noconstrained = INP_NO_CONSTRAINED(inp); tra.awdl_unrestricted = INP_AWDL_UNRESTRICTED(inp); tra.intcoproc_allowed = INP_INTCOPROC_ALLOWED(inp); + tra.keep_alive = 1; if (tp->t_inpcb->inp_flags & INP_BOUND_IF) { tra.ifscope = tp->t_inpcb->inp_boundifp->if_index; } else { @@ -1362,6 +1402,9 @@ fc_output: tp->t_rtimo_probes++; } } + + TCP_LOG_KEEP_ALIVE(tp, idle_time); + tp->t_timer[TCPT_KEEP] = OFFSET_FROM_START(tp, TCP_CONN_KEEPINTVL(tp)); } else { @@ -1418,12 +1461,15 @@ fc_output: tp->t_timer[TCPT_KEEP] = min(OFFSET_FROM_START( tp, tcp_backoff[ind] * TCP_REXMTVAL(tp)), tp->t_timer[TCPT_KEEP]); - } else if (!(tp->t_tfo_flags & TFO_F_HEURISTIC_DONE) && + } else if (!(tp->t_flagsext & TF_FASTOPEN_FORCE_ENABLE) && + !(tp->t_tfo_flags & TFO_F_HEURISTIC_DONE) && tp->t_tfo_probe_state == TFO_PROBE_WAIT_DATA) { /* Still no data! Let's assume a TFO-error and err out... */ tcp_heuristic_tfo_middlebox(tp); so->so_error = ENODATA; + soevent(so, + (SO_FILT_HINT_LOCKED | SO_FILT_HINT_MP_SUB_ERROR)); sorwakeup(so); tp->t_tfo_stats |= TFO_S_RECV_BLACKHOLE; tcpstat.tcps_tfo_blackhole++; @@ -1508,51 +1554,101 @@ fc_output: case TCPT_PTO: { - int32_t snd_len; - tp->t_flagsext &= ~(TF_SENT_TLPROBE); + int32_t ret = 0; + if (!(tp->t_flagsext & TF_IF_PROBING)) { + tp->t_flagsext &= ~(TF_SENT_TLPROBE); + } /* * Check if the connection is in the right state to * send a probe */ - if (tp->t_state != TCPS_ESTABLISHED || - (tp->t_rxtshift > 0 && !(tp->t_flagsext & TF_PROBING)) || + if ((tp->t_state != TCPS_ESTABLISHED || + tp->t_rxtshift > 0 || tp->snd_max == tp->snd_una || !SACK_ENABLED(tp) || !TAILQ_EMPTY(&tp->snd_holes) || - IN_FASTRECOVERY(tp)) { + IN_FASTRECOVERY(tp)) && + !(tp->t_flagsext & TF_IF_PROBING)) { break; } /* - * If there is no new data to send or if the - * connection is limited by receive window then - * retransmit the last segment, otherwise send - * new data. + * When the interface state is changed explicitly reset the retransmission + * timer state for both SYN and data packets because we do not want to + * wait unnecessarily or timeout too quickly if the link characteristics + * have changed drastically */ - snd_len = min(so->so_snd.sb_cc, tp->snd_wnd) - - (tp->snd_max - tp->snd_una); - if (snd_len > 0) { - tp->snd_nxt = tp->snd_max; + if (tp->t_flagsext & TF_IF_PROBING) { + tp->t_rxtshift = 0; + if (tp->t_state == TCPS_SYN_SENT) { + tp->t_stat.synrxtshift = tp->t_rxtshift; + } + /* + * Reset to the the default RTO + */ + tp->t_srtt = TCPTV_SRTTBASE; + tp->t_rttvar = + ((TCPTV_RTOBASE - TCPTV_SRTTBASE) << TCP_RTTVAR_SHIFT) / 4; + tp->t_rttmin = tp->t_flags & TF_LOCAL ? tcp_TCPTV_MIN : + TCPTV_REXMTMIN; + TCPT_RANGESET(tp->t_rxtcur, TCP_REXMTVAL(tp), + tp->t_rttmin, TCPTV_REXMTMAX, TCP_ADD_REXMTSLOP(tp)); + TCP_LOG_RTT_INFO(tp); + } + + if (tp->t_state == TCPS_SYN_SENT) { + /* + * The PTO for SYN_SENT reinitializes TCP as if it was a fresh + * connection attempt + */ + tp->snd_nxt = tp->snd_una; + /* + * Note: We overload snd_recover to function also as the + * snd_last variable described in RFC 2582 + */ + tp->snd_recover = tp->snd_max; + /* + * Force a segment to be sent. + */ + tp->t_flags |= TF_ACKNOW; + + /* If timing a segment in this window, stop the timer */ + tp->t_rtttime = 0; } else { - snd_len = min((tp->snd_max - tp->snd_una), - tp->t_maxseg); - tp->snd_nxt = tp->snd_max - snd_len; + int32_t snd_len; + + /* + * If there is no new data to send or if the + * connection is limited by receive window then + * retransmit the last segment, otherwise send + * new data. + */ + snd_len = min(so->so_snd.sb_cc, tp->snd_wnd) + - (tp->snd_max - tp->snd_una); + if (snd_len > 0) { + tp->snd_nxt = tp->snd_max; + } else { + snd_len = min((tp->snd_max - tp->snd_una), + tp->t_maxseg); + tp->snd_nxt = tp->snd_max - snd_len; + } } tcpstat.tcps_pto++; - if (tp->t_flagsext & TF_PROBING) { + if (tp->t_flagsext & TF_IF_PROBING) { tcpstat.tcps_probe_if++; } /* If timing a segment in this window, stop the timer */ tp->t_rtttime = 0; - /* Note that tail loss probe is being sent */ - tp->t_flagsext |= TF_SENT_TLPROBE; - tp->t_tlpstart = tcp_now; + /* Note that tail loss probe is being sent. Exclude IF probe */ + if (!(tp->t_flagsext & TF_IF_PROBING)) { + tp->t_flagsext |= TF_SENT_TLPROBE; + tp->t_tlpstart = tcp_now; + } tp->snd_cwnd += tp->t_maxseg; - /* * When tail-loss-probe fires, we reset the RTO timer, because * a probe just got sent, so we are good to push out the timer. @@ -1560,11 +1656,57 @@ fc_output: * Set to 0 to ensure that tcp_output() will reschedule it */ tp->t_timer[TCPT_REXMT] = 0; + ret = tcp_output(tp); + +#if (DEBUG || DEVELOPMENT) + if ((tp->t_flagsext & TF_IF_PROBING) && + ((IFNET_IS_COMPANION_LINK(tp->t_inpcb->inp_last_outifp)) || + tp->t_state == TCPS_SYN_SENT)) { + if (ret == 0 && tcp_probe_if_fix_port > 0 && + tcp_probe_if_fix_port <= IPPORT_HILASTAUTO) { + tp->t_timer[TCPT_REXMT] = 0; + tcp_set_lotimer_index(tp); + } - (void)tcp_output(tp); + os_log(OS_LOG_DEFAULT, + "%s: sent %s probe for %u > %u on interface %s" + " (%u) %s(%d)", + __func__, + tp->t_state == TCPS_SYN_SENT ? "SYN" : "data", + ntohs(tp->t_inpcb->inp_lport), + ntohs(tp->t_inpcb->inp_fport), + if_name(tp->t_inpcb->inp_last_outifp), + tp->t_inpcb->inp_last_outifp->if_index, + ret == 0 ? "succeeded" :"failed", ret); + } +#endif /* DEBUG || DEVELOPMENT */ + + /* + * When the connection is not idle, make sure the retransmission timer + * is armed because it was set to zero above + */ + if ((tp->t_timer[TCPT_REXMT] == 0 || tp->t_timer[TCPT_PERSIST] == 0) && + (tp->t_inpcb->inp_socket->so_snd.sb_cc != 0 || tp->t_state == TCPS_SYN_SENT || + tp->t_state == TCPS_SYN_RECEIVED)) { + tp->t_timer[TCPT_REXMT] = + OFFSET_FROM_START(tp, tp->t_rxtcur); + + os_log(OS_LOG_DEFAULT, + "%s: tcp_output() returned %u with retransmission timer disabled " + "for %u > %u in state %d, reset timer to %d", + __func__, ret, + ntohs(tp->t_inpcb->inp_lport), + ntohs(tp->t_inpcb->inp_fport), + tp->t_state, + tp->t_timer[TCPT_REXMT]); + + tcp_check_timer_state(tp); + } tp->snd_cwnd -= tp->t_maxseg; - tp->t_tlphighrxt = tp->snd_nxt; + if (!(tp->t_flagsext & TF_IF_PROBING)) { + tp->t_tlphighrxt = tp->snd_nxt; + } break; } case TCPT_DELAYFR: @@ -1762,12 +1904,11 @@ tcp_run_conn_timer(struct tcpcb *tp, u_int16_t *te_mode, * If this connection is over an interface that needs to * be probed, send probe packets to reinitiate communication. */ - if (probe_if_index > 0 && tp->t_inpcb->inp_last_outifp != NULL && - tp->t_inpcb->inp_last_outifp->if_index == probe_if_index) { - tp->t_flagsext |= TF_PROBING; + if (TCP_IF_STATE_CHANGED(tp, probe_if_index)) { + tp->t_flagsext |= TF_IF_PROBING; tcp_timers(tp, TCPT_PTO); tp->t_timer[TCPT_PTO] = 0; - tp->t_flagsext &= ~TF_PROBING; + tp->t_flagsext &= ~TF_IF_PROBING; } /* @@ -1907,7 +2048,14 @@ tcp_run_timerlist(void * arg1, void * arg2) LIST_FOREACH_SAFE(te, &listp->lhead, le, next_te) { uint32_t offset = 0; uint32_t runtime = te->runtime; - if (te->index < TCPT_NONE && TSTMP_GT(runtime, tcp_now)) { + + tp = TIMERENTRY_TO_TP(te); + + /* + * An interface probe may need to happen before the previously scheduled runtime + */ + if (te->index < TCPT_NONE && TSTMP_GT(runtime, tcp_now) && + !TCP_IF_STATE_CHANGED(tp, listp->probe_if_index)) { offset = timer_diff(runtime, 0, tcp_now, 0); if (next_timer == 0 || offset < next_timer) { next_timer = offset; @@ -1916,8 +2064,6 @@ tcp_run_timerlist(void * arg1, void * arg2) continue; } - tp = TIMERENTRY_TO_TP(te); - /* * Acquire an inp wantcnt on the inpcb so that the socket * won't get detached even if tcp_close is called @@ -2473,13 +2619,19 @@ tcp_interface_send_probe(u_int16_t probe_if_index) calculate_tcp_clock(); lck_mtx_lock(listp->mtx); - if (listp->probe_if_index > 0) { + if (listp->probe_if_index > 0 && listp->probe_if_index != probe_if_index) { tcpstat.tcps_probe_if_conflict++; + os_log(OS_LOG_DEFAULT, + "%s: probe_if_index %u conflicts with %u, tcps_probe_if_conflict %u\n", + __func__, probe_if_index, listp->probe_if_index, + tcpstat.tcps_probe_if_conflict); goto done; } listp->probe_if_index = probe_if_index; if (listp->running) { + os_log(OS_LOG_DEFAULT, "%s: timer list already running for if_index %u\n", + __func__, probe_if_index); goto done; } @@ -2493,6 +2645,9 @@ tcp_interface_send_probe(u_int16_t probe_if_index) diff = timer_diff(listp->runtime, 0, tcp_now, offset); if (diff <= 0) { /* The timer will fire sooner than what's needed */ + os_log(OS_LOG_DEFAULT, + "%s: timer will fire sooner than needed for if_index %u\n", + __func__, probe_if_index); goto done; } } diff --git a/bsd/netinet/tcp_usrreq.c b/bsd/netinet/tcp_usrreq.c index ef53f55b9..92c445448 100644 --- a/bsd/netinet/tcp_usrreq.c +++ b/bsd/netinet/tcp_usrreq.c @@ -73,6 +73,7 @@ #if !CONFIG_EMBEDDED #include #endif +#include #include #include #include @@ -82,6 +83,7 @@ #include #include #include +#include #include #include @@ -104,6 +106,7 @@ #include #include #include +#include #include #if TCPDEBUG #include @@ -125,12 +128,11 @@ errno_t tcp_fill_info_for_info_tuple(struct info_tuple *, struct tcp_info *); int tcp_sysctl_info(struct sysctl_oid *, void *, int, struct sysctl_req *); static void tcp_connection_fill_info(struct tcpcb *tp, struct tcp_connection_info *tci); +static int tcp_get_mpkl_send_info(struct mbuf *, struct so_mpkl_send_info *); /* * TCP protocol interface to socket abstraction. */ -extern char *tcpstates[]; /* XXX ??? */ - static int tcp_attach(struct socket *, struct proc *); static int tcp_connect(struct tcpcb *, struct sockaddr *, struct proc *); #if INET6 @@ -387,6 +389,7 @@ tcp_usr_listen(struct socket *so, struct proc *p) if (error == 0) { tp->t_state = TCPS_LISTEN; } + TCP_LOG_LISTEN(tp, error); COMMON_END(PRU_LISTEN); } @@ -409,6 +412,7 @@ tcp6_usr_listen(struct socket *so, struct proc *p) if (error == 0) { tp->t_state = TCPS_LISTEN; } + TCP_LOG_LISTEN(tp, error); COMMON_END(PRU_LISTEN); } #endif /* INET6 */ @@ -422,7 +426,8 @@ tcp_connect_complete(struct socket *so) /* TFO delays the tcp_output until later, when the app calls write() */ if (so->so_flags1 & SOF1_PRECONNECT_DATA) { - if (!necp_socket_is_allowed_to_send_recv(sotoinpcb(so), NULL, NULL, NULL)) { + if (!necp_socket_is_allowed_to_send_recv(sotoinpcb(so), NULL, NULL, NULL, NULL)) { + TCP_LOG_DROP_NECP(NULL, NULL, tp, true); return EHOSTUNREACH; } @@ -474,8 +479,14 @@ tcp_usr_connect(struct socket *so, struct sockaddr *nam, struct proc *p) } } #if NECP +#if CONTENT_FILTER + error = cfil_sock_attach(so, NULL, nam, CFS_CONNECTION_DIR_OUT); + if (error != 0) { + return error; + } +#endif /* CONTENT_FILTER */ #if FLOW_DIVERT - else if (necp_socket_should_use_flow_divert(inp)) { + if (necp_socket_should_use_flow_divert(inp)) { uint32_t fd_ctl_unit = necp_socket_get_flow_divert_control_unit(inp); if (fd_ctl_unit > 0) { error = flow_divert_pcb_init(so, fd_ctl_unit); @@ -489,12 +500,6 @@ tcp_usr_connect(struct socket *so, struct sockaddr *nam, struct proc *p) return error; } #endif /* FLOW_DIVERT */ -#if CONTENT_FILTER - error = cfil_sock_attach(so); - if (error != 0) { - return error; - } -#endif /* CONTENT_FILTER */ #endif /* NECP */ tp = intotcpcb(inp); TCPDEBUG1(); @@ -516,11 +521,14 @@ tcp_usr_connect(struct socket *so, struct sockaddr *nam, struct proc *p) } if ((error = tcp_connect(tp, nam, p)) != 0) { + TCP_LOG_CONNECT(tp, true, error); goto out; } error = tcp_connect_complete(so); + TCP_LOG_CONNECT(tp, true, error); + COMMON_END(PRU_CONNECT); } @@ -658,8 +666,14 @@ tcp6_usr_connect(struct socket *so, struct sockaddr *nam, struct proc *p) } } #if NECP +#if CONTENT_FILTER + error = cfil_sock_attach(so, NULL, nam, CFS_CONNECTION_DIR_OUT); + if (error != 0) { + return error; + } +#endif /* CONTENT_FILTER */ #if FLOW_DIVERT - else if (necp_socket_should_use_flow_divert(inp)) { + if (necp_socket_should_use_flow_divert(inp)) { uint32_t fd_ctl_unit = necp_socket_get_flow_divert_control_unit(inp); if (fd_ctl_unit > 0) { error = flow_divert_pcb_init(so, fd_ctl_unit); @@ -673,12 +687,6 @@ tcp6_usr_connect(struct socket *so, struct sockaddr *nam, struct proc *p) return error; } #endif /* FLOW_DIVERT */ -#if CONTENT_FILTER - error = cfil_sock_attach(so); - if (error != 0) { - return error; - } -#endif /* CONTENT_FILTER */ #endif /* NECP */ tp = intotcpcb(inp); @@ -712,6 +720,7 @@ tcp6_usr_connect(struct socket *so, struct sockaddr *nam, struct proc *p) inp->inp_vflag |= INP_IPV4; inp->inp_vflag &= ~INP_IPV6; if ((error = tcp_connect(tp, (struct sockaddr *)&sin, p)) != 0) { + TCP_LOG_CONNECT(tp, true, error); goto out; } @@ -721,10 +730,14 @@ tcp6_usr_connect(struct socket *so, struct sockaddr *nam, struct proc *p) inp->inp_vflag &= ~INP_IPV4; inp->inp_vflag |= INP_IPV6; if ((error = tcp6_connect(tp, nam, p)) != 0) { + TCP_LOG_CONNECT(tp, true, error); goto out; } error = tcp_connect_complete(so); + + TCP_LOG_CONNECT(tp, true, error); + COMMON_END(PRU_CONNECT); } @@ -807,17 +820,14 @@ tcp_usr_accept(struct socket *so, struct sockaddr **nam) else if (necp_socket_should_use_flow_divert(inp)) { return EPROTOTYPE; } -#if CONTENT_FILTER - error = cfil_sock_attach(so); - if (error != 0) { - return error; - } -#endif /* CONTENT_FILTER */ + #endif /* NECP */ tp = intotcpcb(inp); TCPDEBUG1(); + TCP_LOG_ACCEPT(tp, 0); + calculate_tcp_clock(); COMMON_END(PRU_ACCEPT); @@ -843,17 +853,14 @@ tcp6_usr_accept(struct socket *so, struct sockaddr **nam) else if (necp_socket_should_use_flow_divert(inp)) { return EPROTOTYPE; } -#if CONTENT_FILTER - error = cfil_sock_attach(so); - if (error != 0) { - return error; - } -#endif /* CONTENT_FILTER */ + #endif /* NECP */ tp = intotcpcb(inp); TCPDEBUG1(); + TCP_LOG_ACCEPT(tp, 0); + calculate_tcp_clock(); in6_mapped_peeraddr(so, nam); @@ -1005,6 +1012,10 @@ tcp_usr_send(struct socket *so, int flags, struct mbuf *m, struct inpcb *inp = sotoinpcb(so); struct tcpcb *tp; uint32_t msgpri = MSG_PRI_DEFAULT; + uint32_t mpkl_len = 0; /* length of mbuf chain */ + uint32_t mpkl_seq; /* sequence number where new data is added */ + struct so_mpkl_send_info mpkl_send_info = {}; + #if INET6 int isipv6; #endif @@ -1045,6 +1056,17 @@ tcp_usr_send(struct socket *so, int flags, struct mbuf *m, calculate_tcp_clock(); + if (net_mpklog_enabled) { + mpkl_seq = tp->snd_una + so->so_snd.sb_cc; + if (m) { + mpkl_len = m_length(m); + } + if (so->so_flags1 & SOF1_MPKL_SEND_INFO) { + uuid_copy(mpkl_send_info.mpkl_uuid, so->so_mpkl_send_uuid); + mpkl_send_info.mpkl_proto = so->so_mpkl_send_proto; + } + } + if (control != NULL) { if (so->so_flags & SOF_ENABLE_MSGS) { /* Get the msg priority from control mbufs */ @@ -1058,22 +1080,30 @@ tcp_usr_send(struct socket *so, int flags, struct mbuf *m, m = NULL; goto out; } - m_freem(control); - control = NULL; - } else if (control->m_len) { + } + if (control->m_len > 0 && net_mpklog_enabled) { + error = tcp_get_mpkl_send_info(control, &mpkl_send_info); /* - * if not unordered, TCP should not have - * control mbufs + * Intepretation of the returned code: + * 0: client wants us to use value passed in SCM_MPKL_SEND_INFO + * 1: SCM_MPKL_SEND_INFO was not present + * other: failure */ - m_freem(control); - if (m != NULL) { - m_freem(m); + if (error != 0 && error != ENOMSG) { + m_freem(control); + if (m != NULL) { + m_freem(m); + } + control = NULL; + m = NULL; + goto out; } - control = NULL; - m = NULL; - error = EINVAL; - goto out; } + /* + * Silently drop unsupported ancillary data messages + */ + m_freem(control); + control = NULL; } if (so->so_flags & SOF_ENABLE_MSGS) { @@ -1107,11 +1137,17 @@ tcp_usr_send(struct socket *so, int flags, struct mbuf *m, #endif /* INET6 */ error = tcp_connect(tp, nam, p); if (error) { + TCP_LOG_CONNECT(tp, true, error); goto out; } tp->snd_wnd = TTCP_CLIENT_SND_WND; tp->max_sndwnd = tp->snd_wnd; tcp_mss(tp, -1, IFSCOPE_NONE); + + TCP_LOG_CONNECT(tp, true, error); + + /* The sequence number of the data is past the SYN */ + mpkl_seq = tp->iss + 1; } if (flags & PRUS_EOF) { @@ -1162,11 +1198,14 @@ tcp_usr_send(struct socket *so, int flags, struct mbuf *m, #endif /* INET6 */ error = tcp_connect(tp, nam, p); if (error) { + TCP_LOG_CONNECT(tp, true, error); goto out; } tp->snd_wnd = TTCP_CLIENT_SND_WND; tp->max_sndwnd = tp->snd_wnd; tcp_mss(tp, -1, IFSCOPE_NONE); + + TCP_LOG_CONNECT(tp, true, error); } tp->snd_up = tp->snd_una + so->so_snd.sb_cc; tp->t_flagsext |= TF_FORCE; @@ -1174,6 +1213,17 @@ tcp_usr_send(struct socket *so, int flags, struct mbuf *m, tp->t_flagsext &= ~TF_FORCE; } + if (net_mpklog_enabled && (inp = tp->t_inpcb) != NULL && + ((inp->inp_last_outifp != NULL && + (inp->inp_last_outifp->if_xflags & IFXF_MPK_LOG)) || + (inp->inp_boundifp != NULL && + (inp->inp_boundifp->if_xflags & IFXF_MPK_LOG)))) { + MPKL_TCP_SEND(tcp_mpkl_log_object, + mpkl_send_info.mpkl_proto, mpkl_send_info.mpkl_uuid, + ntohs(inp->inp_lport), ntohs(inp->inp_fport), + mpkl_seq, mpkl_len, + so->last_pid, so->so_log_seqn++); + } /* * We wait for the socket to successfully connect before returning. @@ -1445,6 +1495,7 @@ skip_oinp: tp->t_timer[TCPT_KEEP] = OFFSET_FROM_START(tp, TCP_CONN_KEEPINIT(tp)); tp->iss = tcp_new_isn(tp); tcp_sendseqinit(tp); + tp->t_connect_time = tcp_now; if (nstat_collect) { nstat_route_connect_attempt(inp->inp_route.ro_rt); } @@ -1546,6 +1597,7 @@ tcp6_connect(struct tcpcb *tp, struct sockaddr *nam, struct proc *p) TCP_CONN_KEEPINIT(tp)); tp->iss = tcp_new_isn(tp); tcp_sendseqinit(tp); + tp->t_connect_time = tcp_now; if (nstat_collect) { nstat_route_connect_attempt(inp->inp_route.ro_rt); } @@ -1639,7 +1691,7 @@ tcp_fill_info(struct tcpcb *tp, struct tcp_info *ti) ti->tcpi_rxoutoforderbytes = tp->t_stat.rxoutoforderbytes; if (tp->t_state > TCPS_LISTEN) { - ti->tcpi_synrexmits = tp->t_stat.synrxtshift; + ti->tcpi_synrexmits = tp->t_stat.rxmitsyns; } ti->tcpi_cell_rxpackets = inp->inp_cstat->rxpackets; ti->tcpi_cell_rxbytes = inp->inp_cstat->rxbytes; @@ -1856,44 +1908,6 @@ tcp_sysctl_info(__unused struct sysctl_oid *oidp, __unused void *arg1, __unused int error; struct tcp_info ti = {}; struct info_tuple itpl; -#if !CONFIG_EMBEDDED - proc_t caller = PROC_NULL; - proc_t caller_parent = PROC_NULL; - char command_name[MAXCOMLEN + 1] = ""; - char parent_name[MAXCOMLEN + 1] = ""; - - if ((caller = proc_self()) != PROC_NULL) { - /* get process name */ - strlcpy(command_name, caller->p_comm, sizeof(command_name)); - - /* get parent process name if possible */ - if ((caller_parent = proc_find(caller->p_ppid)) != PROC_NULL) { - strlcpy(parent_name, caller_parent->p_comm, - sizeof(parent_name)); - proc_rele(caller_parent); - } - - if ((escape_str(command_name, strlen(command_name) + 1, - sizeof(command_name)) == 0) && - (escape_str(parent_name, strlen(parent_name) + 1, - sizeof(parent_name)) == 0)) { - kern_asl_msg(LOG_DEBUG, "messagetracer", - 5, - "com.apple.message.domain", - "com.apple.kernel.tcpstat", /* 1 */ - "com.apple.message.signature", - "tcpinfo", /* 2 */ - "com.apple.message.signature2", command_name, /* 3 */ - "com.apple.message.signature3", parent_name, /* 4 */ - "com.apple.message.summarize", "YES", /* 5 */ - NULL); - } - } - - if (caller != PROC_NULL) { - proc_rele(caller); - } -#endif /* !CONFIG_EMBEDDED */ if (req->newptr == USER_ADDR_NULL) { return EINVAL; @@ -1965,6 +1979,90 @@ tcp_getconninfo(struct socket *so, struct conninfo_tcp *tcp_ci) tcp_fill_info(sototcpcb(so), &tcp_ci->tcpci_tcp_info); } +void +tcp_clear_keep_alive_offload(struct socket *so) +{ + struct inpcb *inp; + struct ifnet *ifp; + + inp = sotoinpcb(so); + if (inp == NULL) { + return; + } + + if ((inp->inp_flags2 & INP2_KEEPALIVE_OFFLOAD) == 0) { + return; + } + + ifp = inp->inp_boundifp != NULL ? inp->inp_boundifp : + inp->inp_last_outifp; + if (ifp == NULL) { + panic("%s: so %p inp %p ifp NULL", + __func__, so, inp); + } + + ifnet_lock_exclusive(ifp); + + if (ifp->if_tcp_kao_cnt == 0) { + panic("%s: so %p inp %p ifp %p if_tcp_kao_cnt == 0", + __func__, so, inp, ifp); + } + ifp->if_tcp_kao_cnt--; + inp->inp_flags2 &= ~INP2_KEEPALIVE_OFFLOAD; + + ifnet_lock_done(ifp); +} + +static int +tcp_set_keep_alive_offload(struct socket *so, struct proc *proc) +{ + int error = 0; + struct inpcb *inp; + struct ifnet *ifp; + + inp = sotoinpcb(so); + if (inp == NULL) { + return ECONNRESET; + } + if ((inp->inp_flags2 & INP2_KEEPALIVE_OFFLOAD) != 0) { + return 0; + } + + ifp = inp->inp_boundifp != NULL ? inp->inp_boundifp : + inp->inp_last_outifp; + if (ifp == NULL) { + error = ENXIO; + os_log_info(OS_LOG_DEFAULT, + "%s: error %d for proc %s[%u] out ifp is not set\n", + __func__, error, + proc != NULL ? proc->p_comm : "kernel", + proc != NULL ? proc->p_pid : 0); + return ENXIO; + } + + error = if_get_tcp_kao_max(ifp); + if (error != 0) { + return error; + } + + ifnet_lock_exclusive(ifp); + if (ifp->if_tcp_kao_cnt < ifp->if_tcp_kao_max) { + ifp->if_tcp_kao_cnt++; + inp->inp_flags2 |= INP2_KEEPALIVE_OFFLOAD; + } else { + error = ETOOMANYREFS; + os_log_info(OS_LOG_DEFAULT, + "%s: error %d for proc %s[%u] if_tcp_kao_max %u\n", + __func__, error, + proc != NULL ? proc->p_comm : "kernel", + proc != NULL ? proc->p_pid : 0, + ifp->if_tcp_kao_max); + } + ifnet_lock_done(ifp); + + return error; +} + /* * The new sockopt interface makes it possible for us to block in the * copyin/out step (if we take a page fault). Taking a page fault at @@ -2203,6 +2301,10 @@ tcp_ctloutput(struct socket *so, struct sockopt *sopt) break; case TCP_KEEPALIVE_OFFLOAD: + if ((error = priv_check_cred(kauth_cred_get(), + PRIV_NETINET_TCP_KA_OFFLOAD, 0)) != 0) { + break; + } error = sooptcopyin(sopt, &optval, sizeof(optval), sizeof(optval)); if (error) { @@ -2213,9 +2315,10 @@ tcp_ctloutput(struct socket *so, struct sockopt *sopt) break; } if (optval != 0) { - inp->inp_flags2 |= INP2_KEEPALIVE_OFFLOAD; + error = tcp_set_keep_alive_offload(so, + sopt->sopt_p); } else { - inp->inp_flags2 &= ~INP2_KEEPALIVE_OFFLOAD; + tcp_clear_keep_alive_offload(so); } break; @@ -2398,6 +2501,9 @@ tcp_ctloutput(struct socket *so, struct sockopt *sopt) } break; case TCP_FASTOPEN_FORCE_HEURISTICS: + + break; + case TCP_FASTOPEN_FORCE_ENABLE: error = sooptcopyin(sopt, &optval, sizeof(optval), sizeof(optval)); @@ -2414,9 +2520,9 @@ tcp_ctloutput(struct socket *so, struct sockopt *sopt) break; } if (optval) { - tp->t_flagsext |= TF_FASTOPEN_HEUR; + tp->t_flagsext |= TF_FASTOPEN_FORCE_ENABLE; } else { - tp->t_flagsext &= ~TF_FASTOPEN_HEUR; + tp->t_flagsext &= ~TF_FASTOPEN_FORCE_ENABLE; } break; @@ -2600,7 +2706,10 @@ tcp_ctloutput(struct socket *so, struct sockopt *sopt) optval = tfo_enabled(tp); break; case TCP_FASTOPEN_FORCE_HEURISTICS: - optval = (tp->t_flagsext & TF_FASTOPEN_HEUR) ? 1 : 0; + optval = 0; + break; + case TCP_FASTOPEN_FORCE_ENABLE: + optval = (tp->t_flagsext & TF_FASTOPEN_FORCE_ENABLE) ? 1 : 0; break; case TCP_MEASURE_SND_BW: optval = tp->t_flagsext & TF_MEASURESNDBW; @@ -2915,6 +3024,7 @@ tcp_usrclosed(struct tcpcb *tp) struct tcpcb *, tp, int32_t, TCPS_FIN_WAIT_1); tp->t_state = TCPS_FIN_WAIT_1; + TCP_LOG_CONNECTION_SUMMARY(tp); break; case TCPS_CLOSE_WAIT: @@ -2923,6 +3033,7 @@ tcp_usrclosed(struct tcpcb *tp) struct tcpcb *, tp, int32_t, TCPS_LAST_ACK); tp->t_state = TCPS_LAST_ACK; + TCP_LOG_CONNECTION_SUMMARY(tp); break; } if (tp && tp->t_state >= TCPS_FIN_WAIT_2) { @@ -2964,6 +3075,7 @@ tcp_out6_cksum_stats(u_int32_t len) tcpstat.tcps_snd6_swcsum++; tcpstat.tcps_snd6_swcsum_bytes += len; } +#endif /* INET6 */ /* * When messages are enabled on a TCP socket, the message priority @@ -2973,6 +3085,7 @@ int tcp_get_msg_priority(struct mbuf *control, uint32_t *msgpri) { struct cmsghdr *cm; + if (control == NULL) { return EINVAL; } @@ -2994,4 +3107,33 @@ tcp_get_msg_priority(struct mbuf *control, uint32_t *msgpri) } return 0; } -#endif /* INET6 */ + +int +tcp_get_mpkl_send_info(struct mbuf *control, + struct so_mpkl_send_info *mpkl_send_info) +{ + struct cmsghdr *cm; + + if (control == NULL || mpkl_send_info == NULL) { + return EINVAL; + } + + for (cm = M_FIRST_CMSGHDR(control); cm; + cm = M_NXT_CMSGHDR(control, cm)) { + if (cm->cmsg_len < sizeof(struct cmsghdr) || + cm->cmsg_len > control->m_len) { + return EINVAL; + } + if (cm->cmsg_level != SOL_SOCKET || + cm->cmsg_type != SCM_MPKL_SEND_INFO) { + continue; + } + if (cm->cmsg_len != CMSG_LEN(sizeof(struct so_mpkl_send_info))) { + return EINVAL; + } + memcpy(mpkl_send_info, CMSG_DATA(cm), + sizeof(struct so_mpkl_send_info)); + return 0; + } + return ENOMSG; +} diff --git a/bsd/netinet/tcp_var.h b/bsd/netinet/tcp_var.h index e9fde2f3a..5358d21a0 100644 --- a/bsd/netinet/tcp_var.h +++ b/bsd/netinet/tcp_var.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2017 Apple Inc. All rights reserved. + * Copyright (c) 2000-2019 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -380,6 +380,7 @@ struct tcpcb { /* Receiver state for stretch-ack algorithm */ u_int32_t rcv_unackwin; /* to measure win for stretching acks */ u_int32_t rcv_by_unackwin; /* bytes seen during the last ack-stretching win */ + u_int32_t rcv_by_unackhalfwin; u_int32_t rcv_nostrack_ts; /* timestamp when stretch ack was disabled automatically */ u_int32_t rcv_nostrack_pkts; /* pkts received since strech ack was disabled */ u_int16_t rcv_waitforss; /* wait for packets during slow-start */ @@ -449,7 +450,7 @@ struct tcpcb { u_int32_t rxoutoforderbytes; u_int32_t txretransmitbytes; u_int8_t synrxtshift; - u_int8_t unused; + u_int8_t rxmitsyns; u_int16_t unused_pad_to_8; u_int32_t rxmitpkts; } t_stat; @@ -479,10 +480,11 @@ struct tcpcb { #define TF_DISABLE_DSACK 0x40000 /* Ignore DSACK due to n/w duplication */ #define TF_RESCUE_RXT 0x80000 /* SACK rescue retransmit */ #define TF_CWND_NONVALIDATED 0x100000 /* cwnd non validated */ -#define TF_PROBING 0x200000 /* Trigger probe timeout */ +#define TF_IF_PROBING 0x200000 /* Trigger interface probe timeout */ #define TF_FASTOPEN 0x400000 /* TCP Fastopen is enabled */ #define TF_REASS_INPROG 0x800000 /* Reassembly is in progress */ -#define TF_FASTOPEN_HEUR 0x1000000 /* Make sure that heuristics get never skipped */ +#define TF_FASTOPEN_FORCE_ENABLE 0x1000000 /* Force-enable TCP Fastopen */ +#define TF_LOGGED_CONN_SUMMARY 0x2000000 /* Connection summary was logged */ #if TRAFFIC_MGT /* Inter-arrival jitter related state */ @@ -621,6 +623,9 @@ struct tcpcb { u_int32_t t_rxt_minimum_timeout; /* minimum retransmit timeout in ms */ uint32_t t_challengeack_last; /* last time challenge ACK was sent per sec */ uint32_t t_challengeack_count; /* # of challenge ACKs already sent per sec */ + + u_int32_t t_log_flags; /* TCP logging flags*/ + u_int32_t t_connect_time; /* time when the connection started */ }; #define IN_FASTRECOVERY(tp) (tp->t_flags & TF_FASTRECOVERY) @@ -722,30 +727,9 @@ extern int tcprexmtthresh; #define TCP_AUTORCVBUF_MAX(_ifp_) (((_ifp_) != NULL && (IFNET_IS_CELLULAR((_ifp_))) && ((_ifp_)->if_eflags & IFEF_3CA)) ? \ (tcp_autorcvbuf_max << 1) : tcp_autorcvbuf_max) -enum tcp_cc_event { - TCP_CC_CWND_INIT, /* 0 */ - TCP_CC_INSEQ_ACK_RCVD, /* 1 */ - TCP_CC_ACK_RCVD, /* 2 */ - TCP_CC_ENTER_FASTRECOVERY, /* 3 */ - TCP_CC_IN_FASTRECOVERY, /* 4 */ - TCP_CC_EXIT_FASTRECOVERY, /* 5 */ - TCP_CC_PARTIAL_ACK, /* 6 */ - TCP_CC_IDLE_TIMEOUT, /* 7 */ - TCP_CC_REXMT_TIMEOUT, /* 8 */ - TCP_CC_ECN_RCVD, /* 9 */ - TCP_CC_BAD_REXMT_RECOVERY, /* 10 */ - TCP_CC_OUTPUT_ERROR, /* 11 */ - TCP_CC_CHANGE_ALGO, /* 12 */ - TCP_CC_FLOW_CONTROL, /* 13 */ - TCP_CC_SUSPEND, /* 14 */ - TCP_CC_LIMITED_TRANSMIT, /* 15 */ - TCP_CC_EARLY_RETRANSMIT, /* 16 */ - TCP_CC_TLP_RECOVERY, /* 17 */ - TCP_CC_TLP_RECOVER_LASTPACKET, /* 18 */ - TCP_CC_DELAY_FASTRECOVERY, /* 19 */ - TCP_CC_TLP_IN_FASTRECOVERY, /* 20 */ - TCP_CC_DSACK_BAD_REXMT /* 21 */ -}; +#define TCP_IF_STATE_CHANGED(tp, probe_if_index) \ + (probe_if_index > 0 && tp->t_inpcb->inp_last_outifp != NULL && \ + probe_if_index == tp->t_inpcb->inp_last_outifp->if_index) /* * Structure to hold TCP options that are only used during segment @@ -1205,6 +1189,10 @@ struct tcpstat { u_int32_t tcps_mptcp_back_to_wifi; /* Total number of connections that succeed to move traffic away from cell (when starting on cell) */ u_int32_t tcps_mptcp_wifi_proxy; /* Total number of new subflows that fell back to regular TCP on cell */ u_int32_t tcps_mptcp_cell_proxy; /* Total number of new subflows that fell back to regular TCP on WiFi */ + + /* TCP offload statistics */ + u_int32_t tcps_ka_offload_drops; /* Keep alive drops for timeout reported by firmware */ + u_int32_t tcps_mptcp_triggered_cell; /* Total number of times an MPTCP-connection triggered cell bringup */ }; @@ -1444,17 +1432,17 @@ struct xtcpprogress_indicators { u_int64_t xp_recentflows_rxooo; /* Total of "recent" flows received out of order bytes */ u_int64_t xp_recentflows_rxdup; /* Total of "recent" flows received duplicate bytes */ u_int64_t xp_recentflows_retx; /* Total of "recent" flows retransmitted bytes */ - u_int64_t xp_reserved1; /* Expansion */ - u_int64_t xp_reserved2; /* Expansion */ - u_int64_t xp_reserved3; /* Expansion */ - u_int64_t xp_reserved4; /* Expansion */ + u_int64_t xp_reserved1; /* Expansion */ + u_int64_t xp_reserved2; /* Expansion */ + u_int64_t xp_reserved3; /* Expansion */ + u_int64_t xp_reserved4; /* Expansion */ }; struct tcpprogressreq { - u_int64_t ifindex; /* Interface index for progress indicators */ + u_int64_t ifindex; /* Interface index for progress indicators */ u_int64_t recentflow_maxduration; /* In mach_absolute_time, max duration for flow to be counted as "recent" */ - u_int64_t xp_reserved1; /* Expansion */ - u_int64_t xp_reserved2; /* Expansion */ + u_int64_t filter_flags; /* Optional additional filtering, values are interface properties per ntstat.h */ + u_int64_t xp_reserved2; /* Expansion */ }; #endif /* PRIVATE */ @@ -1504,6 +1492,8 @@ struct tcpprogressreq { { "v6mssdflt", CTLTYPE_INT }, \ } +extern int tcp_TCPTV_MIN; + #ifdef SYSCTL_DECL SYSCTL_DECL(_net_inet_tcp); #endif /* SYSCTL_DECL */ @@ -1550,7 +1540,9 @@ struct tcp_respond_args { unsigned int nocell:1, noexpensive:1, awdl_unrestricted:1, - intcoproc_allowed:1; + intcoproc_allowed:1, + keep_alive:1, + noconstrained:1; }; void tcp_canceltimers(struct tcpcb *); @@ -1660,8 +1652,11 @@ extern void tcp_probe_connectivity(struct ifnet *ifp, u_int32_t enable); extern void tcp_get_connectivity_status(struct tcpcb *, struct tcp_conn_status *); +extern void tcp_clear_keep_alive_offload(struct socket *so); extern void tcp_fill_keepalive_offload_frames(struct ifnet *, struct ifnet_keepalive_offload_frame *, u_int32_t, size_t, u_int32_t *); +extern int tcp_notify_kao_timeout(ifnet_t ifp, + struct ifnet_keepalive_offload_frame *frame); extern boolean_t tfo_enabled(const struct tcpcb *tp); extern void tcp_disable_tfo(struct tcpcb *tp); @@ -1693,6 +1688,6 @@ extern void mptcp_insert_rmap(struct tcpcb *tp, struct mbuf *m, struct tcphdr *t __private_extern__ void tcp_update_stats_per_flow( struct ifnet_stats_per_flow *, struct ifnet *); -#endif /* BSD_KERNEL_RPIVATE */ +#endif /* BSD_KERNEL_PRIVATE */ #endif /* _NETINET_TCP_VAR_H_ */ diff --git a/bsd/netinet/udp_usrreq.c b/bsd/netinet/udp_usrreq.c index 571afd2ac..247e01802 100644 --- a/bsd/netinet/udp_usrreq.c +++ b/bsd/netinet/udp_usrreq.c @@ -596,7 +596,7 @@ udp_input(struct mbuf *m, int iphlen) goto bad; } - /* free the extra copy of mbuf or skipped by IPSec */ + /* free the extra copy of mbuf or skipped by IPsec */ if (m != NULL) { m_freem(m); } @@ -607,13 +607,14 @@ udp_input(struct mbuf *m, int iphlen) #if IPSEC /* * UDP to port 4500 with a payload where the first four bytes are - * not zero is a UDP encapsulated IPSec packet. Packets where + * not zero is a UDP encapsulated IPsec packet. Packets where * the payload is one byte and that byte is 0xFF are NAT keepalive - * packets. Decapsulate the ESP packet and carry on with IPSec input + * packets. Decapsulate the ESP packet and carry on with IPsec input * or discard the NAT keep-alive. */ if (ipsec_bypass == 0 && (esp_udp_encap_port & 0xFFFF) != 0 && - uh->uh_dport == ntohs((u_short)esp_udp_encap_port)) { + (uh->uh_dport == ntohs((u_short)esp_udp_encap_port) || + uh->uh_sport == ntohs((u_short)esp_udp_encap_port))) { int payload_len = len - sizeof(struct udphdr) > 4 ? 4 : len - sizeof(struct udphdr); @@ -643,7 +644,7 @@ udp_input(struct mbuf *m, int iphlen) return; } else if (payload_len == 4 && *(u_int32_t *)(void *) ((caddr_t)uh + sizeof(struct udphdr)) != 0) { - /* UDP encapsulated IPSec packet to pass through NAT */ + /* UDP encapsulated IPsec packet to pass through NAT */ KERNEL_DEBUG(DBG_FNC_UDP_INPUT | DBG_FUNC_END, 0, 0, 0, 0, 0); /* preserve the udp header */ @@ -1571,6 +1572,9 @@ udp_output(struct inpcb *inp, struct mbuf *m, struct sockaddr *addr, if (INP_NO_EXPENSIVE(inp)) { ipoa.ipoa_flags |= IPOAF_NO_EXPENSIVE; } + if (INP_NO_CONSTRAINED(inp)) { + ipoa.ipoa_flags |= IPOAF_NO_CONSTRAINED; + } if (INP_AWDL_UNRESTRICTED(inp)) { ipoa.ipoa_flags |= IPOAF_AWDL_UNRESTRICTED; } @@ -1948,6 +1952,9 @@ udp_output(struct inpcb *inp, struct mbuf *m, struct sockaddr *addr, VERIFY(inp->inp_sndinprog_cnt > 0); if (--inp->inp_sndinprog_cnt == 0) { inp->inp_flags &= ~(INP_FC_FEEDBACK); + if (inp->inp_sndingprog_waiters > 0) { + wakeup(&inp->inp_sndinprog_cnt); + } } /* Synchronize PCB cached route */ @@ -2008,7 +2015,7 @@ abort: * denied access to it, generate an event. */ if (error != 0 && (ipoa.ipoa_retflags & IPOARF_IFDENIED) && - (INP_NO_CELLULAR(inp) || INP_NO_EXPENSIVE(inp))) { + (INP_NO_CELLULAR(inp) || INP_NO_EXPENSIVE(inp) || INP_NO_CONSTRAINED(inp))) { soevent(so, (SO_FILT_HINT_LOCKED | SO_FILT_HINT_IFDENIED)); } diff --git a/bsd/netinet6/Makefile b/bsd/netinet6/Makefile index 9bccd060b..c1b816806 100644 --- a/bsd/netinet6/Makefile +++ b/bsd/netinet6/Makefile @@ -24,7 +24,7 @@ PRIVATE_DATAFILES = \ PRIVATE_KERNELFILES = \ ah6.h esp6.h esp_rijndael.h esp_chachapoly.h \ in6_gif.h in6_ifattach.h ip6_ecn.h ip6protosw.h \ - ipcomp6.h ipsec6.h tcp6_var.h udp6_var.h + ipsec6.h tcp6_var.h udp6_var.h INSTALL_MI_LIST = ${DATAFILES} diff --git a/bsd/netinet6/ah_core.c b/bsd/netinet6/ah_core.c index cd503bea6..578e06257 100644 --- a/bsd/netinet6/ah_core.c +++ b/bsd/netinet6/ah_core.c @@ -164,37 +164,37 @@ const struct ah_algorithm * ah_algorithm_lookup(int idx) { /* checksum algorithms */ - static struct ah_algorithm hmac_md5 = + static const struct ah_algorithm hmac_md5 = { ah_sumsiz_1216, ah_hmac_md5_mature, 128, 128, "hmac-md5", ah_hmac_md5_init, ah_hmac_md5_loop, ah_hmac_md5_result, }; - static struct ah_algorithm keyed_md5 = + static const struct ah_algorithm keyed_md5 = { ah_sumsiz_1216, ah_keyed_md5_mature, 128, 128, "keyed-md5", ah_keyed_md5_init, ah_keyed_md5_loop, ah_keyed_md5_result, }; - static struct ah_algorithm hmac_sha1 = + static const struct ah_algorithm hmac_sha1 = { ah_sumsiz_1216, ah_hmac_sha1_mature, 160, 160, "hmac-sha1", ah_hmac_sha1_init, ah_hmac_sha1_loop, ah_hmac_sha1_result, }; - static struct ah_algorithm keyed_sha1 = + static const struct ah_algorithm keyed_sha1 = { ah_sumsiz_1216, ah_keyed_sha1_mature, 160, 160, "keyed-sha1", ah_keyed_sha1_init, ah_keyed_sha1_loop, ah_keyed_sha1_result, }; - static struct ah_algorithm ah_none = + static const struct ah_algorithm ah_none = { ah_sumsiz_zero, ah_none_mature, 0, 2048, "none", ah_none_init, ah_none_loop, ah_none_result, }; #if AH_ALL_CRYPTO - static struct ah_algorithm hmac_sha2_256 = + static const struct ah_algorithm hmac_sha2_256 = { ah_sumsiz_sha2_256, ah_hmac_sha2_256_mature, 256, 256, "hmac-sha2-256", ah_hmac_sha2_256_init, ah_hmac_sha2_256_loop, ah_hmac_sha2_256_result, }; - static struct ah_algorithm hmac_sha2_384 = + static const struct ah_algorithm hmac_sha2_384 = { ah_sumsiz_sha2_384, ah_hmac_sha2_384_mature, 384, 384, "hmac-sha2-384", ah_hmac_sha2_384_init, ah_hmac_sha2_384_loop, ah_hmac_sha2_384_result, }; - static struct ah_algorithm hmac_sha2_512 = + static const struct ah_algorithm hmac_sha2_512 = { ah_sumsiz_sha2_512, ah_hmac_sha2_512_mature, 512, 512, "hmac-sha2-512", ah_hmac_sha2_512_init, ah_hmac_sha2_512_loop, diff --git a/bsd/netinet6/ah_input.c b/bsd/netinet6/ah_input.c index 2a67501f6..104f5a9c6 100644 --- a/bsd/netinet6/ah_input.c +++ b/bsd/netinet6/ah_input.c @@ -124,6 +124,8 @@ #define IPLEN_FLIPPED +extern lck_mtx_t *sadb_mutex; + #if INET void ah4_input(struct mbuf *m, int off) @@ -263,8 +265,8 @@ ah4_input(struct mbuf *m, int off) /* * check for sequence number. */ - if ((sav->flags & SADB_X_EXT_OLD) == 0 && sav->replay) { - if (ipsec_chkreplay(ntohl(((struct newah *)ah)->ah_seq), sav)) { + if ((sav->flags & SADB_X_EXT_OLD) == 0 && sav->replay[0] != NULL) { + if (ipsec_chkreplay(ntohl(((struct newah *)ah)->ah_seq), sav, 0)) { ; /*okey*/ } else { IPSEC_STAT_INCREMENT(ipsecstat.in_ahreplay); @@ -386,8 +388,8 @@ ah4_input(struct mbuf *m, int off) /* * update sequence number. */ - if ((sav->flags & SADB_X_EXT_OLD) == 0 && sav->replay) { - if (ipsec_updatereplay(ntohl(((struct newah *)ah)->ah_seq), sav)) { + if ((sav->flags & SADB_X_EXT_OLD) == 0 && sav->replay[0] != NULL) { + if (ipsec_updatereplay(ntohl(((struct newah *)ah)->ah_seq), sav, 0)) { IPSEC_STAT_INCREMENT(ipsecstat.in_ahreplay); goto fail; } @@ -499,9 +501,18 @@ ah4_input(struct mbuf *m, int off) IFA_REMREF(ifa); } - // Input via IPSec interface - if (sav->sah->ipsec_if != NULL) { - if (ipsec_inject_inbound_packet(sav->sah->ipsec_if, m) == 0) { + // Input via IPsec interface + lck_mtx_lock(sadb_mutex); + ifnet_t ipsec_if = sav->sah->ipsec_if; + if (ipsec_if != NULL) { + // If an interface is found, add a reference count before dropping the lock + ifnet_reference(ipsec_if); + } + lck_mtx_unlock(sadb_mutex); + if (ipsec_if != NULL) { + errno_t inject_error = ipsec_inject_inbound_packet(ipsec_if, m); + ifnet_release(ipsec_if); + if (inject_error == 0) { m = NULL; goto done; } else { @@ -555,13 +566,22 @@ ah4_input(struct mbuf *m, int off) struct ip *, ip, struct ip6_hdr *, NULL); if (nxt != IPPROTO_DONE) { - // Input via IPSec interface - if (sav->sah->ipsec_if != NULL) { + // Input via IPsec interface + lck_mtx_lock(sadb_mutex); + ifnet_t ipsec_if = sav->sah->ipsec_if; + if (ipsec_if != NULL) { + // If an interface is found, add a reference count before dropping the lock + ifnet_reference(ipsec_if); + } + lck_mtx_unlock(sadb_mutex); + if (ipsec_if != NULL) { ip->ip_len = htons(ip->ip_len + hlen); ip->ip_off = htons(ip->ip_off); ip->ip_sum = 0; ip->ip_sum = ip_cksum_hdr_in(m, hlen); - if (ipsec_inject_inbound_packet(sav->sah->ipsec_if, m) == 0) { + errno_t inject_error = ipsec_inject_inbound_packet(ipsec_if, m); + ifnet_release(ipsec_if); + if (inject_error == 0) { m = NULL; goto done; } else { @@ -709,8 +729,8 @@ ah6_input(struct mbuf **mp, int *offp, int proto) /* * check for sequence number. */ - if ((sav->flags & SADB_X_EXT_OLD) == 0 && sav->replay) { - if (ipsec_chkreplay(ntohl(((struct newah *)ah)->ah_seq), sav)) { + if ((sav->flags & SADB_X_EXT_OLD) == 0 && sav->replay[0] != NULL) { + if (ipsec_chkreplay(ntohl(((struct newah *)ah)->ah_seq), sav, 0)) { ; /*okey*/ } else { IPSEC_STAT_INCREMENT(ipsec6stat.in_ahreplay); @@ -815,8 +835,8 @@ ah6_input(struct mbuf **mp, int *offp, int proto) /* * update sequence number. */ - if ((sav->flags & SADB_X_EXT_OLD) == 0 && sav->replay) { - if (ipsec_updatereplay(ntohl(((struct newah *)ah)->ah_seq), sav)) { + if ((sav->flags & SADB_X_EXT_OLD) == 0 && sav->replay[0] != NULL) { + if (ipsec_updatereplay(ntohl(((struct newah *)ah)->ah_seq), sav, 0)) { IPSEC_STAT_INCREMENT(ipsec6stat.in_ahreplay); goto fail; } @@ -907,9 +927,18 @@ ah6_input(struct mbuf **mp, int *offp, int proto) IFA_REMREF(ifa); } - // Input via IPSec interface - if (sav->sah->ipsec_if != NULL) { - if (ipsec_inject_inbound_packet(sav->sah->ipsec_if, m) == 0) { + // Input via IPsec interface + lck_mtx_lock(sadb_mutex); + ifnet_t ipsec_if = sav->sah->ipsec_if; + if (ipsec_if != NULL) { + // If an interface is found, add a reference count before dropping the lock + ifnet_reference(ipsec_if); + } + lck_mtx_unlock(sadb_mutex); + if (ipsec_if != NULL) { + errno_t inject_error = ipsec_inject_inbound_packet(ipsec_if, m); + ifnet_release(ipsec_if); + if (inject_error == 0) { m = NULL; nxt = IPPROTO_DONE; goto done; @@ -955,9 +984,18 @@ ah6_input(struct mbuf **mp, int *offp, int proto) goto fail; } - // Input via IPSec interface - if (sav->sah->ipsec_if != NULL) { - if (ipsec_inject_inbound_packet(sav->sah->ipsec_if, m) == 0) { + // Input via IPsec interface + lck_mtx_lock(sadb_mutex); + ifnet_t ipsec_if = sav->sah->ipsec_if; + if (ipsec_if != NULL) { + // If an interface is found, add a reference count before dropping the lock + ifnet_reference(ipsec_if); + } + lck_mtx_unlock(sadb_mutex); + if (ipsec_if != NULL) { + errno_t inject_error = ipsec_inject_inbound_packet(ipsec_if, m); + ifnet_release(ipsec_if); + if (inject_error == 0) { m = NULL; nxt = IPPROTO_DONE; goto done; diff --git a/bsd/netinet6/ah_output.c b/bsd/netinet6/ah_output.c index d41e2f679..10f39472b 100644 --- a/bsd/netinet6/ah_output.c +++ b/bsd/netinet6/ah_output.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2008-2011 Apple Inc. All rights reserved. + * Copyright (c) 2008-2019 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -197,12 +197,12 @@ ah4_output(struct mbuf *m, struct secasvar *sav) size_t plen = 0; /*AH payload size in bytes*/ size_t ahlen = 0; /*plen + sizeof(ah)*/ struct ip *ip; - struct in_addr dst = { 0 }; + struct in_addr dst = { .s_addr = 0 }; struct in_addr *finaldst; int error; /* sanity checks */ - if ((sav->flags & SADB_X_EXT_OLD) == 0 && !sav->replay) { + if ((sav->flags & SADB_X_EXT_OLD) == 0 && sav->replay[0] == NULL) { ip = mtod(m, struct ip *); ipseclog((LOG_DEBUG, "ah4_output: internal error: " "sav->replay is null: %x->%x, SPI=%u\n", @@ -295,7 +295,7 @@ ah4_output(struct mbuf *m, struct secasvar *sav) ahdr->ah_nxt = ip->ip_p; ahdr->ah_reserve = htons(0); ahdr->ah_spi = spi; - if (sav->replay->count == ~0) { + if (sav->replay[0]->count == ~0) { if ((sav->flags & SADB_X_EXT_CYCSEQ) == 0) { /* XXX Is it noisy ? */ ipseclog((LOG_WARNING, @@ -307,13 +307,13 @@ ah4_output(struct mbuf *m, struct secasvar *sav) } } lck_mtx_lock(sadb_mutex); - sav->replay->count++; + sav->replay[0]->count++; lck_mtx_unlock(sadb_mutex); /* * XXX sequence number must not be cycled, if the SA is * installed by IKE daemon. */ - ahdr->ah_seq = htonl(sav->replay->count); + ahdr->ah_seq = htonl(sav->replay[0]->count); bzero(ahdr + 1, plen); } @@ -461,7 +461,7 @@ ah6_output(struct mbuf *m, u_char *nexthdrp, struct mbuf *md, ip6 = mtod(m, struct ip6_hdr *); ip6->ip6_plen = htons(m->m_pkthdr.len - sizeof(struct ip6_hdr)); - if ((sav->flags & SADB_X_EXT_OLD) == 0 && !sav->replay) { + if ((sav->flags & SADB_X_EXT_OLD) == 0 && sav->replay[0] == NULL) { ipseclog((LOG_DEBUG, "ah6_output: internal error: " "sav->replay is null: SPI=%u\n", (u_int32_t)ntohl(sav->spi))); @@ -504,7 +504,7 @@ ah6_output(struct mbuf *m, u_char *nexthdrp, struct mbuf *md, ahdr->ah_len = (plen >> 2) + 1; /* plus one for seq# */ ahdr->ah_reserve = htons(0); ahdr->ah_spi = spi; - if (sav->replay->count == ~0) { + if (sav->replay[0]->count == ~0) { if ((sav->flags & SADB_X_EXT_CYCSEQ) == 0) { /* XXX Is it noisy ? */ ipseclog((LOG_WARNING, @@ -516,13 +516,13 @@ ah6_output(struct mbuf *m, u_char *nexthdrp, struct mbuf *md, } } lck_mtx_lock(sadb_mutex); - sav->replay->count++; + sav->replay[0]->count++; lck_mtx_unlock(sadb_mutex); /* * XXX sequence number must not be cycled, if the SA is * installed by IKE daemon. */ - ahdr->ah_seq = htonl(sav->replay->count); + ahdr->ah_seq = htonl(sav->replay[0]->count); bzero(ahdr + 1, plen); } diff --git a/bsd/netinet6/esp.h b/bsd/netinet6/esp.h index c49d5eff3..c72ac1c72 100644 --- a/bsd/netinet6/esp.h +++ b/bsd/netinet6/esp.h @@ -65,7 +65,7 @@ #ifndef _NETINET6_ESP_H_ #define _NETINET6_ESP_H_ #include - +#include struct esp { u_int32_t esp_spi; /* ESP */ @@ -123,6 +123,8 @@ struct esp_algorithm { int (*finalizeencrypt)(struct secasvar *, u_int8_t *, uint); }; +extern os_log_t esp_mpkl_log_object; + extern const struct esp_algorithm *esp_algorithm_lookup(int); extern int esp_max_ivlen(void); @@ -135,6 +137,8 @@ extern size_t esp_hdrsiz(struct ipsecrequest *); extern int esp_schedule(const struct esp_algorithm *, struct secasvar *); extern int esp_auth(struct mbuf *, size_t, size_t, struct secasvar *, u_char *); + +extern void esp_init(void); #endif /* BSD_KERNEL_PRIVATE */ #endif /* _NETINET6_ESP_H_ */ diff --git a/bsd/netinet6/esp_core.c b/bsd/netinet6/esp_core.c index c5448b8a5..17bd8e242 100644 --- a/bsd/netinet6/esp_core.c +++ b/bsd/netinet6/esp_core.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2008-2016 Apple Inc. All rights reserved. + * Copyright (c) 2008-2019 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -113,6 +113,7 @@ #define MAX_SBUF_LEN 2000 extern lck_mtx_t *sadb_mutex; +os_log_t esp_mpkl_log_object = NULL; static int esp_null_mature(struct secasvar *); static int esp_null_decrypt(struct mbuf *, size_t, @@ -151,47 +152,119 @@ static int esp_gcm_mature(struct secasvar *); #define ESP_AESGCM_KEYLEN192 224 // 24-bytes key + 4 bytes salt #define ESP_AESGCM_KEYLEN256 288 // 32-bytes key + 4 bytes salt -static const struct esp_algorithm des_cbc = -{ 8, -1, esp_descbc_mature, 64, 64, esp_des_schedlen, - "des-cbc", - esp_descbc_ivlen, esp_cbc_decrypt, - esp_cbc_encrypt, esp_des_schedule, - esp_des_blockdecrypt, esp_des_blockencrypt, - 0, 0, 0 }; -static const struct esp_algorithm des3_cbc = -{ 8, 8, esp_cbc_mature, 192, 192, esp_3des_schedlen, - "3des-cbc", - esp_common_ivlen, esp_cbc_decrypt, - esp_cbc_encrypt, esp_3des_schedule, - esp_3des_blockdecrypt, esp_3des_blockencrypt, - 0, 0, 0 }; -static const struct esp_algorithm null_esp = -{ 1, 0, esp_null_mature, 0, 2048, 0, "null", - esp_common_ivlen, esp_null_decrypt, - esp_null_encrypt, NULL, NULL, NULL, - 0, 0, 0 }; -static const struct esp_algorithm aes_cbc = -{ 16, 16, esp_cbc_mature, 128, 256, esp_aes_schedlen, - "aes-cbc", - esp_common_ivlen, esp_cbc_decrypt_aes, - esp_cbc_encrypt_aes, esp_aes_schedule, - 0, 0, - 0, 0, 0 }; -static const struct esp_algorithm aes_gcm = -{ 4, 8, esp_gcm_mature, ESP_AESGCM_KEYLEN128, ESP_AESGCM_KEYLEN256, esp_gcm_schedlen, - "aes-gcm", - esp_common_ivlen, esp_gcm_decrypt_aes, - esp_gcm_encrypt_aes, esp_gcm_schedule, - 0, 0, - 16, esp_gcm_decrypt_finalize, esp_gcm_encrypt_finalize}; -static const struct esp_algorithm chacha_poly = -{ ESP_CHACHAPOLY_PAD_BOUND, ESP_CHACHAPOLY_IV_LEN, - esp_chachapoly_mature, ESP_CHACHAPOLY_KEYBITS_WITH_SALT, - ESP_CHACHAPOLY_KEYBITS_WITH_SALT, esp_chachapoly_schedlen, - "chacha-poly", esp_chachapoly_ivlen, esp_chachapoly_decrypt, - esp_chachapoly_encrypt, esp_chachapoly_schedule, - NULL, NULL, ESP_CHACHAPOLY_ICV_LEN, - esp_chachapoly_decrypt_finalize, esp_chachapoly_encrypt_finalize}; +static const struct esp_algorithm des_cbc = { + .padbound = 8, + .ivlenval = -1, + .mature = esp_descbc_mature, + .keymin = 64, + .keymax = 64, + .schedlen = esp_des_schedlen, + .name = "des-cbc", + .ivlen = esp_descbc_ivlen, + .decrypt = esp_cbc_decrypt, + .encrypt = esp_cbc_encrypt, + .schedule = esp_des_schedule, + .blockdecrypt = esp_des_blockdecrypt, + .blockencrypt = esp_des_blockencrypt, + .icvlen = 0, + .finalizedecrypt = NULL, + .finalizeencrypt = NULL +}; + +static const struct esp_algorithm des3_cbc = { + .padbound = 8, + .ivlenval = 8, + .mature = esp_cbc_mature, + .keymin = 192, + .keymax = 192, + .schedlen = esp_3des_schedlen, + .name = "3des-cbc", + .ivlen = esp_common_ivlen, + .decrypt = esp_cbc_decrypt, + .encrypt = esp_cbc_encrypt, + .schedule = esp_3des_schedule, + .blockdecrypt = esp_3des_blockdecrypt, + .blockencrypt = esp_3des_blockencrypt, + .icvlen = 0, + .finalizedecrypt = NULL, + .finalizeencrypt = NULL +}; + +static const struct esp_algorithm null_esp = { + .padbound = 1, + .ivlenval = 0, + .mature = esp_null_mature, + .keymin = 0, + .keymax = 2048, + .schedlen = NULL, + .name = "null", + .ivlen = esp_common_ivlen, + .decrypt = esp_null_decrypt, + .encrypt = esp_null_encrypt, + .schedule = NULL, + .blockdecrypt = NULL, + .blockencrypt = NULL, + .icvlen = 0, + .finalizedecrypt = NULL, + .finalizeencrypt = NULL +}; + +static const struct esp_algorithm aes_cbc = { + .padbound = 16, + .ivlenval = 16, + .mature = esp_cbc_mature, + .keymin = 128, + .keymax = 256, + .schedlen = esp_aes_schedlen, + .name = "aes-cbc", + .ivlen = esp_common_ivlen, + .decrypt = esp_cbc_decrypt_aes, + .encrypt = esp_cbc_encrypt_aes, + .schedule = esp_aes_schedule, + .blockdecrypt = NULL, + .blockencrypt = NULL, + .icvlen = 0, + .finalizedecrypt = NULL, + .finalizeencrypt = NULL +}; + +static const struct esp_algorithm aes_gcm = { + .padbound = 4, + .ivlenval = 8, + .mature = esp_gcm_mature, + .keymin = ESP_AESGCM_KEYLEN128, + .keymax = ESP_AESGCM_KEYLEN256, + .schedlen = esp_gcm_schedlen, + .name = "aes-gcm", + .ivlen = esp_common_ivlen, + .decrypt = esp_gcm_decrypt_aes, + .encrypt = esp_gcm_encrypt_aes, + .schedule = esp_gcm_schedule, + .blockdecrypt = NULL, + .blockencrypt = NULL, + .icvlen = 16, + .finalizedecrypt = esp_gcm_decrypt_finalize, + .finalizeencrypt = esp_gcm_encrypt_finalize +}; + +static const struct esp_algorithm chacha_poly = { + .padbound = ESP_CHACHAPOLY_PAD_BOUND, + .ivlenval = ESP_CHACHAPOLY_IV_LEN, + .mature = esp_chachapoly_mature, + .keymin = ESP_CHACHAPOLY_KEYBITS_WITH_SALT, + .keymax = ESP_CHACHAPOLY_KEYBITS_WITH_SALT, + .schedlen = esp_chachapoly_schedlen, + .name = "chacha-poly", + .ivlen = esp_chachapoly_ivlen, + .decrypt = esp_chachapoly_decrypt, + .encrypt = esp_chachapoly_encrypt, + .schedule = esp_chachapoly_schedule, + .blockdecrypt = NULL, + .blockencrypt = NULL, + .icvlen = ESP_CHACHAPOLY_ICV_LEN, + .finalizedecrypt = esp_chachapoly_decrypt_finalize, + .finalizeencrypt = esp_chachapoly_encrypt_finalize +}; static const struct esp_algorithm *esp_algorithms[] = { &des_cbc, @@ -425,9 +498,8 @@ esp_des_blockdecrypt( { /* assumption: d has a good alignment */ bcopy(s, d, sizeof(DES_LONG) * 2); - des_ecb_encrypt((des_cblock *)d, (des_cblock *)d, - (des_ecb_key_schedule *)sav->sched, DES_DECRYPT); - return 0; + return des_ecb_encrypt((des_cblock *)d, (des_cblock *)d, + (des_ecb_key_schedule *)sav->sched, DES_DECRYPT); } static int @@ -439,9 +511,8 @@ esp_des_blockencrypt( { /* assumption: d has a good alignment */ bcopy(s, d, sizeof(DES_LONG) * 2); - des_ecb_encrypt((des_cblock *)d, (des_cblock *)d, - (des_ecb_key_schedule *)sav->sched, DES_ENCRYPT); - return 0; + return des_ecb_encrypt((des_cblock *)d, (des_cblock *)d, + (des_ecb_key_schedule *)sav->sched, DES_ENCRYPT); } static int @@ -597,9 +668,8 @@ esp_3des_blockdecrypt( { /* assumption: d has a good alignment */ bcopy(s, d, sizeof(DES_LONG) * 2); - des3_ecb_encrypt((des_cblock *)d, (des_cblock *)d, - (des3_ecb_key_schedule *)sav->sched, DES_DECRYPT); - return 0; + return des3_ecb_encrypt((des_cblock *)d, (des_cblock *)d, + (des3_ecb_key_schedule *)sav->sched, DES_DECRYPT); } static int @@ -611,9 +681,8 @@ esp_3des_blockencrypt( { /* assumption: d has a good alignment */ bcopy(s, d, sizeof(DES_LONG) * 2); - des3_ecb_encrypt((des_cblock *)d, (des_cblock *)d, - (des3_ecb_key_schedule *)sav->sched, DES_ENCRYPT); - return 0; + return des3_ecb_encrypt((des_cblock *)d, (des_cblock *)d, + (des3_ecb_key_schedule *)sav->sched, DES_ENCRYPT); } static int @@ -1206,3 +1275,22 @@ esp_auth( KERNEL_DEBUG(DBG_FNC_ESPAUTH | DBG_FUNC_END, 6, 0, 0, 0, 0); return 0; } + +void +esp_init(void) +{ + static int esp_initialized = 0; + + if (esp_initialized) { + return; + } + + esp_initialized = 1; + + esp_mpkl_log_object = MPKL_CREATE_LOGOBJECT("com.apple.xnu.esp"); + if (esp_mpkl_log_object == NULL) { + panic("MPKL_CREATE_LOGOBJECT for ESP failed"); + } + + return; +} diff --git a/bsd/netinet6/esp_input.c b/bsd/netinet6/esp_input.c index 36311e312..f53236153 100644 --- a/bsd/netinet6/esp_input.c +++ b/bsd/netinet6/esp_input.c @@ -58,6 +58,8 @@ * SUCH DAMAGE. */ +#define _IP_VHL + /* * RFC1827/2406 Encapsulated Security Payload. */ @@ -89,6 +91,8 @@ #include #include #include +#include +#include #if INET6 #include #endif @@ -174,6 +178,40 @@ esp6_input_strip_udp_encap(struct mbuf *m, int ip6hlen) return ip6; } +static void +esp_input_log(struct mbuf *m, struct secasvar *sav, u_int32_t spi, u_int32_t seq) +{ + if (net_mpklog_enabled && + (sav->sah->ipsec_if->if_xflags & IFXF_MPK_LOG) == IFXF_MPK_LOG) { + struct tcphdr th = {}; + size_t iphlen = 0; + u_int32_t proto_len = 0; + u_int8_t proto = 0; + + struct ip *inner_ip = mtod(m, struct ip *); + if (IP_VHL_V(inner_ip->ip_vhl) == 4) { + iphlen = IP_VHL_HL(inner_ip->ip_vhl) << 2; + proto = inner_ip->ip_p; + } else if (IP_VHL_V(inner_ip->ip_vhl) == 6) { + struct ip6_hdr *inner_ip6 = mtod(m, struct ip6_hdr *); + iphlen = sizeof(struct ip6_hdr); + proto = inner_ip6->ip6_nxt; + } + + if (proto == IPPROTO_TCP) { + if ((int)(iphlen + sizeof(th)) <= m->m_pkthdr.len) { + m_copydata(m, iphlen, sizeof(th), (u_int8_t *)&th); + } + + proto_len = m->m_pkthdr.len - iphlen - (th.th_off << 2); + MPKL_ESP_INPUT_TCP(esp_mpkl_log_object, + ntohl(spi), seq, + ntohs(th.th_sport), ntohs(th.th_dport), + ntohl(th.th_seq), proto_len); + } + } +} + void esp4_input(struct mbuf *m, int off) { @@ -200,6 +238,7 @@ esp4_input_extended(struct mbuf *m, int off, ifnet_t interface) size_t esplen; sa_family_t ifamily; struct mbuf *out_m = NULL; + mbuf_traffic_class_t traffic_class = 0; KERNEL_DEBUG(DBG_FNC_ESPIN | DBG_FUNC_START, 0, 0, 0, 0, 0); /* sanity check for alignment. */ @@ -248,8 +287,8 @@ esp4_input_extended(struct mbuf *m, int off, ifnet_t interface) (caddr_t)&ip->ip_src, (caddr_t)&ip->ip_dst, IPPROTO_ESP, spi, interface)) == 0) { ipseclog((LOG_WARNING, - "IPv4 ESP input: no key association found for spi %u\n", - (u_int32_t)ntohl(spi))); + "IPv4 ESP input: no key association found for spi %u (0x%08x)\n", + (u_int32_t)ntohl(spi), (u_int32_t)ntohl(spi))); IPSEC_STAT_INCREMENT(ipsecstat.in_nosa); goto bad; } @@ -259,16 +298,16 @@ esp4_input_extended(struct mbuf *m, int off, ifnet_t interface) if (sav->state != SADB_SASTATE_MATURE && sav->state != SADB_SASTATE_DYING) { ipseclog((LOG_DEBUG, - "IPv4 ESP input: non-mature/dying SA found for spi %u\n", - (u_int32_t)ntohl(spi))); + "IPv4 ESP input: non-mature/dying SA found for spi %u (0x%08x)\n", + (u_int32_t)ntohl(spi), (u_int32_t)ntohl(spi))); IPSEC_STAT_INCREMENT(ipsecstat.in_badspi); goto bad; } algo = esp_algorithm_lookup(sav->alg_enc); if (!algo) { ipseclog((LOG_DEBUG, "IPv4 ESP input: " - "unsupported encryption algorithm for spi %u\n", - (u_int32_t)ntohl(spi))); + "unsupported encryption algorithm for spi %u (0x%08x)\n", + (u_int32_t)ntohl(spi), (u_int32_t)ntohl(spi))); IPSEC_STAT_INCREMENT(ipsecstat.in_badspi); goto bad; } @@ -284,6 +323,12 @@ esp4_input_extended(struct mbuf *m, int off, ifnet_t interface) seq = ntohl(((struct newesp *)esp)->esp_seq); + if ((sav->flags2 & SADB_X_EXT_SA2_SEQ_PER_TRAFFIC_CLASS) == + SADB_X_EXT_SA2_SEQ_PER_TRAFFIC_CLASS) { + u_int8_t dscp = ip->ip_tos >> IPTOS_DSCP_SHIFT; + traffic_class = rfc4594_dscp_to_tc(dscp); + } + /* Save ICV from packet for verification later */ size_t siz = 0; unsigned char saved_icv[AH_MAXSUMSIZE]; @@ -293,8 +338,8 @@ esp4_input_extended(struct mbuf *m, int off, ifnet_t interface) goto delay_icv; } - if (!((sav->flags & SADB_X_EXT_OLD) == 0 && sav->replay - && (sav->alg_auth && sav->key_auth))) { + if (!((sav->flags & SADB_X_EXT_OLD) == 0 && sav->replay[traffic_class] != NULL && + (sav->alg_auth && sav->key_auth))) { goto noreplaycheck; } @@ -306,7 +351,7 @@ esp4_input_extended(struct mbuf *m, int off, ifnet_t interface) /* * check for sequence number. */ - if (ipsec_chkreplay(seq, sav)) { + if (ipsec_chkreplay(seq, sav, traffic_class)) { ; /*okey*/ } else { IPSEC_STAT_INCREMENT(ipsecstat.in_espreplay); @@ -372,8 +417,8 @@ delay_icv: /* * update sequence number. */ - if ((sav->flags & SADB_X_EXT_OLD) == 0 && sav->replay) { - if (ipsec_updatereplay(seq, sav)) { + if ((sav->flags & SADB_X_EXT_OLD) == 0 && sav->replay[traffic_class] != NULL) { + if (ipsec_updatereplay(seq, sav, traffic_class)) { IPSEC_STAT_INCREMENT(ipsecstat.in_espreplay); goto bad; } @@ -442,7 +487,7 @@ noreplaycheck: if (algo->finalizedecrypt) { if ((*algo->finalizedecrypt)(sav, saved_icv, algo->icvlen)) { - ipseclog((LOG_ERR, "packet decryption ICV failure\n")); + ipseclog((LOG_ERR, "esp4 packet decryption ICV failure\n")); IPSEC_STAT_INCREMENT(ipsecstat.in_inval); KERNEL_DEBUG(DBG_FNC_DECRYPT | DBG_FUNC_END, 1, 0, 0, 0, 0); goto bad; @@ -491,8 +536,8 @@ noreplaycheck: // if peer is behind nat and this is the latest esp packet if ((sav->flags & SADB_X_EXT_NATT_DETECTED_PEER) != 0 && (sav->flags & SADB_X_EXT_OLD) == 0 && - seq && sav->replay && - seq >= sav->replay->lastseq) { + seq && sav->replay[traffic_class] && + seq >= sav->replay[traffic_class]->lastseq) { struct udphdr *encap_uh = (__typeof__(encap_uh))(void *)((caddr_t)ip + off); if (encap_uh->uh_sport && ntohs(encap_uh->uh_sport) != sav->remote_ike_port) { @@ -629,16 +674,30 @@ noreplaycheck: /* Clear the csum flags, they can't be valid for the inner headers */ m->m_pkthdr.csum_flags = 0; - // Input via IPSec interface - if (sav->sah->ipsec_if != NULL) { + // Input via IPsec interface + lck_mtx_lock(sadb_mutex); + ifnet_t ipsec_if = sav->sah->ipsec_if; + if (ipsec_if != NULL) { + // If an interface is found, add a reference count before dropping the lock + ifnet_reference(ipsec_if); + } + lck_mtx_unlock(sadb_mutex); + if (ipsec_if != NULL) { + esp_input_log(m, sav, spi, seq); + ipsec_save_wake_packet(m, ntohl(spi), seq); + // Return mbuf if (interface != NULL && - interface == sav->sah->ipsec_if) { + interface == ipsec_if) { out_m = m; + ifnet_release(ipsec_if); goto done; } - if (ipsec_inject_inbound_packet(sav->sah->ipsec_if, m) == 0) { + errno_t inject_error = ipsec_inject_inbound_packet(ipsec_if, m); + ifnet_release(ipsec_if); + + if (inject_error == 0) { m = NULL; goto done; } else { @@ -741,13 +800,21 @@ noreplaycheck: struct ip *, ip, struct ip6_hdr *, NULL); // Input via IPsec interface legacy path - if (sav->sah->ipsec_if != NULL) { + lck_mtx_lock(sadb_mutex); + ifnet_t ipsec_if = sav->sah->ipsec_if; + if (ipsec_if != NULL) { + // If an interface is found, add a reference count before dropping the lock + ifnet_reference(ipsec_if); + } + lck_mtx_unlock(sadb_mutex); + if (ipsec_if != NULL) { int mlen; if ((mlen = m_length2(m, NULL)) < hlen) { ipseclog((LOG_DEBUG, "IPv4 ESP input: decrypted packet too short %d < %d\n", mlen, hlen)); IPSEC_STAT_INCREMENT(ipsecstat.in_inval); + ifnet_release(ipsec_if); goto bad; } ip->ip_len = htons(ip->ip_len + hlen); @@ -755,14 +822,21 @@ noreplaycheck: ip->ip_sum = 0; ip->ip_sum = ip_cksum_hdr_in(m, hlen); + esp_input_log(m, sav, spi, seq); + ipsec_save_wake_packet(m, ntohl(spi), seq); + // Return mbuf if (interface != NULL && - interface == sav->sah->ipsec_if) { + interface == ipsec_if) { out_m = m; + ifnet_release(ipsec_if); goto done; } - if (ipsec_inject_inbound_packet(sav->sah->ipsec_if, m) == 0) { + errno_t inject_error = ipsec_inject_inbound_packet(ipsec_if, m); + ifnet_release(ipsec_if); + + if (inject_error == 0) { m = NULL; goto done; } else { @@ -829,6 +903,7 @@ esp6_input_extended(struct mbuf **mp, int *offp, int proto, ifnet_t interface) int ivlen; size_t esplen; sa_family_t ifamily; + mbuf_traffic_class_t traffic_class = 0; /* sanity check for alignment. */ if (off % 4 != 0 || m->m_pkthdr.len % 4 != 0) { @@ -877,8 +952,19 @@ esp6_input_extended(struct mbuf **mp, int *offp, int proto, ifnet_t interface) (caddr_t)&ip6->ip6_src, (caddr_t)&ip6->ip6_dst, IPPROTO_ESP, spi, interface)) == 0) { ipseclog((LOG_WARNING, - "IPv6 ESP input: no key association found for spi %u\n", - (u_int32_t)ntohl(spi))); + "IPv6 ESP input: no key association found for spi %u (0x%08x) seq %u" + " src %04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x" + " dst %04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x if %s\n", + (u_int32_t)ntohl(spi), (u_int32_t)ntohl(spi), ntohl(((struct newesp *)esp)->esp_seq), + ntohs(ip6->ip6_src.__u6_addr.__u6_addr16[0]), ntohs(ip6->ip6_src.__u6_addr.__u6_addr16[1]), + ntohs(ip6->ip6_src.__u6_addr.__u6_addr16[2]), ntohs(ip6->ip6_src.__u6_addr.__u6_addr16[3]), + ntohs(ip6->ip6_src.__u6_addr.__u6_addr16[4]), ntohs(ip6->ip6_src.__u6_addr.__u6_addr16[5]), + ntohs(ip6->ip6_src.__u6_addr.__u6_addr16[6]), ntohs(ip6->ip6_src.__u6_addr.__u6_addr16[7]), + ntohs(ip6->ip6_dst.__u6_addr.__u6_addr16[0]), ntohs(ip6->ip6_dst.__u6_addr.__u6_addr16[1]), + ntohs(ip6->ip6_dst.__u6_addr.__u6_addr16[2]), ntohs(ip6->ip6_dst.__u6_addr.__u6_addr16[3]), + ntohs(ip6->ip6_dst.__u6_addr.__u6_addr16[4]), ntohs(ip6->ip6_dst.__u6_addr.__u6_addr16[5]), + ntohs(ip6->ip6_dst.__u6_addr.__u6_addr16[6]), ntohs(ip6->ip6_dst.__u6_addr.__u6_addr16[7]), + ((interface != NULL) ? if_name(interface) : "NONE"))); IPSEC_STAT_INCREMENT(ipsec6stat.in_nosa); goto bad; } @@ -888,16 +974,16 @@ esp6_input_extended(struct mbuf **mp, int *offp, int proto, ifnet_t interface) if (sav->state != SADB_SASTATE_MATURE && sav->state != SADB_SASTATE_DYING) { ipseclog((LOG_DEBUG, - "IPv6 ESP input: non-mature/dying SA found for spi %u\n", - (u_int32_t)ntohl(spi))); + "IPv6 ESP input: non-mature/dying SA found for spi %u (0x%08x)\n", + (u_int32_t)ntohl(spi), (u_int32_t)ntohl(spi))); IPSEC_STAT_INCREMENT(ipsec6stat.in_badspi); goto bad; } algo = esp_algorithm_lookup(sav->alg_enc); if (!algo) { ipseclog((LOG_DEBUG, "IPv6 ESP input: " - "unsupported encryption algorithm for spi %u\n", - (u_int32_t)ntohl(spi))); + "unsupported encryption algorithm for spi %u (0x%08x)\n", + (u_int32_t)ntohl(spi), (u_int32_t)ntohl(spi))); IPSEC_STAT_INCREMENT(ipsec6stat.in_badspi); goto bad; } @@ -913,6 +999,12 @@ esp6_input_extended(struct mbuf **mp, int *offp, int proto, ifnet_t interface) seq = ntohl(((struct newesp *)esp)->esp_seq); + if ((sav->flags2 & SADB_X_EXT_SA2_SEQ_PER_TRAFFIC_CLASS) == + SADB_X_EXT_SA2_SEQ_PER_TRAFFIC_CLASS) { + u_int8_t dscp = (ntohl(ip6->ip6_flow) & IP6FLOW_DSCP_MASK) >> IP6FLOW_DSCP_SHIFT; + traffic_class = rfc4594_dscp_to_tc(dscp); + } + /* Save ICV from packet for verification later */ size_t siz = 0; unsigned char saved_icv[AH_MAXSUMSIZE]; @@ -922,8 +1014,9 @@ esp6_input_extended(struct mbuf **mp, int *offp, int proto, ifnet_t interface) goto delay_icv; } - if (!((sav->flags & SADB_X_EXT_OLD) == 0 && sav->replay - && (sav->alg_auth && sav->key_auth))) { + if (!((sav->flags & SADB_X_EXT_OLD) == 0 && + sav->replay[traffic_class] != NULL && + (sav->alg_auth && sav->key_auth))) { goto noreplaycheck; } @@ -935,7 +1028,7 @@ esp6_input_extended(struct mbuf **mp, int *offp, int proto, ifnet_t interface) /* * check for sequence number. */ - if (ipsec_chkreplay(seq, sav)) { + if (ipsec_chkreplay(seq, sav, traffic_class)) { ; /*okey*/ } else { IPSEC_STAT_INCREMENT(ipsec6stat.in_espreplay); @@ -998,8 +1091,8 @@ delay_icv: /* * update sequence number. */ - if ((sav->flags & SADB_X_EXT_OLD) == 0 && sav->replay) { - if (ipsec_updatereplay(seq, sav)) { + if ((sav->flags & SADB_X_EXT_OLD) == 0 && sav->replay[traffic_class] != NULL) { + if (ipsec_updatereplay(seq, sav, traffic_class)) { IPSEC_STAT_INCREMENT(ipsec6stat.in_espreplay); goto bad; } @@ -1067,7 +1160,7 @@ noreplaycheck: if (algo->finalizedecrypt) { if ((*algo->finalizedecrypt)(sav, saved_icv, algo->icvlen)) { - ipseclog((LOG_ERR, "packet decryption ICV failure\n")); + ipseclog((LOG_ERR, "esp6 packet decryption ICV failure\n")); IPSEC_STAT_INCREMENT(ipsecstat.in_inval); KERNEL_DEBUG(DBG_FNC_DECRYPT | DBG_FUNC_END, 1, 0, 0, 0, 0); goto bad; @@ -1113,8 +1206,8 @@ noreplaycheck: // if peer is behind nat and this is the latest esp packet if ((sav->flags & SADB_X_EXT_NATT_DETECTED_PEER) != 0 && (sav->flags & SADB_X_EXT_OLD) == 0 && - seq && sav->replay && - seq >= sav->replay->lastseq) { + seq && sav->replay[traffic_class] && + seq >= sav->replay[traffic_class]->lastseq) { struct udphdr *encap_uh = (__typeof__(encap_uh))(void *)((caddr_t)ip6 + off); if (encap_uh->uh_sport && ntohs(encap_uh->uh_sport) != sav->remote_ike_port) { @@ -1240,15 +1333,29 @@ noreplaycheck: IFA_REMREF(ifa); } - // Input via IPSec interface - if (sav->sah->ipsec_if != NULL) { + // Input via IPsec interface + lck_mtx_lock(sadb_mutex); + ifnet_t ipsec_if = sav->sah->ipsec_if; + if (ipsec_if != NULL) { + // If an interface is found, add a reference count before dropping the lock + ifnet_reference(ipsec_if); + } + lck_mtx_unlock(sadb_mutex); + if (ipsec_if != NULL) { + esp_input_log(m, sav, spi, seq); + ipsec_save_wake_packet(m, ntohl(spi), seq); + // Return mbuf if (interface != NULL && - interface == sav->sah->ipsec_if) { + interface == ipsec_if) { + ifnet_release(ipsec_if); goto done; } - if (ipsec_inject_inbound_packet(sav->sah->ipsec_if, m) == 0) { + errno_t inject_error = ipsec_inject_inbound_packet(ipsec_if, m); + ifnet_release(ipsec_if); + + if (inject_error == 0) { m = NULL; nxt = IPPROTO_DONE; goto done; @@ -1348,7 +1455,6 @@ noreplaycheck: m = n; } #endif - ip6 = mtod(m, struct ip6_hdr *); ip6->ip6_plen = htons(ntohs(ip6->ip6_plen) - stripsiz); @@ -1370,15 +1476,29 @@ noreplaycheck: _CASSERT(offsetof(struct pkthdr, csum_data) == offsetof(struct pkthdr, csum_rx_val)); } - // Input via IPSec interface - if (sav->sah->ipsec_if != NULL) { + // Input via IPsec interface + lck_mtx_lock(sadb_mutex); + ifnet_t ipsec_if = sav->sah->ipsec_if; + if (ipsec_if != NULL) { + // If an interface is found, add a reference count before dropping the lock + ifnet_reference(ipsec_if); + } + lck_mtx_unlock(sadb_mutex); + if (ipsec_if != NULL) { + esp_input_log(m, sav, spi, seq); + ipsec_save_wake_packet(m, ntohl(spi), seq); + // Return mbuf if (interface != NULL && - interface == sav->sah->ipsec_if) { + interface == ipsec_if) { + ifnet_release(ipsec_if); goto done; } - if (ipsec_inject_inbound_packet(sav->sah->ipsec_if, m) == 0) { + errno_t inject_error = ipsec_inject_inbound_packet(ipsec_if, m); + ifnet_release(ipsec_if); + + if (inject_error == 0) { m = NULL; nxt = IPPROTO_DONE; goto done; diff --git a/bsd/netinet6/esp_output.c b/bsd/netinet6/esp_output.c index 36c91b56f..9401200f3 100644 --- a/bsd/netinet6/esp_output.c +++ b/bsd/netinet6/esp_output.c @@ -79,12 +79,15 @@ #include #include +#include #include #include #include #include #include /* for nat traversal */ +#include +#include #if INET6 #include @@ -184,7 +187,7 @@ esp_hdrsiz(__unused struct ipsecrequest *isr) } else { /* RFC 2406 */ aalgo = ah_algorithm_lookup(sav->alg_auth); - if (aalgo && sav->replay && sav->key_auth) { + if (aalgo && sav->replay[0] != NULL && sav->key_auth) { authlen = (aalgo->sumsiz)(sav); } else { authlen = 0; @@ -251,7 +254,11 @@ esp_output( struct esp *esp; struct esptail *esptail; const struct esp_algorithm *algo; + struct tcphdr th = {}; u_int32_t spi; + u_int32_t seq; + u_int32_t inner_payload_len = 0; + u_int8_t inner_protocol = 0; u_int8_t nxt = 0; size_t plen; /*payload length to be encrypted*/ size_t espoff; @@ -263,7 +270,7 @@ esp_output( struct ipsecstat *stat; struct udphdr *udp = NULL; int udp_encapsulate = (sav->flags & SADB_X_EXT_NATT && (af == AF_INET || af == AF_INET6) && - (esp_udp_encap_port & 0xFFFF) != 0); + ((esp_udp_encap_port & 0xFFFF) != 0 || sav->natt_encapsulated_src_port != 0)); KERNEL_DEBUG(DBG_FNC_ESPOUT | DBG_FUNC_START, sav->ivlen, 0, 0, 0, 0); switch (af) { @@ -285,8 +292,35 @@ esp_output( return 0; /* no change at all */ } + mbuf_traffic_class_t traffic_class = 0; + if ((sav->flags2 & SADB_X_EXT_SA2_SEQ_PER_TRAFFIC_CLASS) == + SADB_X_EXT_SA2_SEQ_PER_TRAFFIC_CLASS) { + u_int8_t dscp = 0; + switch (af) { +#if INET + case AF_INET: + { + struct ip *ip = mtod(m, struct ip *); + dscp = ip->ip_tos >> IPTOS_DSCP_SHIFT; + break; + } +#endif /*INET*/ +#if INET6 + case AF_INET6: + { + struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *); + dscp = (ntohl(ip6->ip6_flow) & IP6FLOW_DSCP_MASK) >> IP6FLOW_DSCP_SHIFT; + break; + } +#endif /*INET6*/ + default: + panic("esp_output: should not reach here"); + } + traffic_class = rfc4594_dscp_to_tc(dscp); + } + /* some sanity check */ - if ((sav->flags & SADB_X_EXT_OLD) == 0 && !sav->replay) { + if ((sav->flags & SADB_X_EXT_OLD) == 0 && sav->replay[traffic_class] == NULL) { switch (af) { #if INET case AF_INET: @@ -398,6 +432,58 @@ esp_output( #endif } + /* grab info for packet logging */ + struct secashead *sah = sav->sah; + if (net_mpklog_enabled && + sah != NULL && sah->ipsec_if != NULL) { + ifnet_t ifp = sah->ipsec_if; + + if ((ifp->if_xflags & IFXF_MPK_LOG) == IFXF_MPK_LOG) { + size_t iphlen = 0; + + if (sav->sah->saidx.mode == IPSEC_MODE_TUNNEL) { + struct ip *inner_ip = mtod(md, struct ip *); + if (IP_VHL_V(inner_ip->ip_vhl) == IPVERSION) { +#ifdef _IP_VHL + iphlen = IP_VHL_HL(inner_ip->ip_vhl) << 2; +#else + iphlen = inner_ip->ip_hl << 2; +#endif + inner_protocol = inner_ip->ip_p; + } else if (IP_VHL_V(inner_ip->ip_vhl) == IPV6_VERSION) { + struct ip6_hdr *inner_ip6 = mtod(md, struct ip6_hdr *); + iphlen = sizeof(struct ip6_hdr); + inner_protocol = inner_ip6->ip6_nxt; + } + + if (inner_protocol == IPPROTO_TCP) { + if ((int)(iphlen + sizeof(th)) <= + (m->m_pkthdr.len - m->m_len)) { + m_copydata(md, iphlen, sizeof(th), (u_int8_t *)&th); + } + + inner_payload_len = m->m_pkthdr.len - m->m_len - iphlen - (th.th_off << 2); + } + } else { + iphlen = hlen; + if (af == AF_INET) { + inner_protocol = ip->ip_p; + } else if (af == AF_INET6) { + inner_protocol = ip6->ip6_nxt; + } + + if (inner_protocol == IPPROTO_TCP) { + if ((int)(iphlen + sizeof(th)) <= + m->m_pkthdr.len) { + m_copydata(m, iphlen, sizeof(th), (u_int8_t *)&th); + } + + inner_payload_len = m->m_pkthdr.len - iphlen - (th.th_off << 2); + } + } + } + } + /* make the packet over-writable */ mprev->m_next = NULL; if ((md = ipsec_copypkt(md)) == NULL) { @@ -514,7 +600,7 @@ esp_output( if ((sav->flags & SADB_X_EXT_OLD) == 0) { struct newesp *nesp; nesp = (struct newesp *)esp; - if (sav->replay->count == ~0) { + if (sav->replay[traffic_class]->count == sav->replay[traffic_class]->lastseq) { if ((sav->flags & SADB_X_EXT_CYCSEQ) == 0) { /* XXX Is it noisy ? */ ipseclog((LOG_WARNING, @@ -527,13 +613,14 @@ esp_output( } } lck_mtx_lock(sadb_mutex); - sav->replay->count++; + sav->replay[traffic_class]->count++; lck_mtx_unlock(sadb_mutex); /* * XXX sequence number must not be cycled, if the SA is * installed by IKE daemon. */ - nesp->esp_seq = htonl(sav->replay->count); + nesp->esp_seq = htonl(sav->replay[traffic_class]->count); + seq = sav->replay[traffic_class]->count; } { @@ -665,9 +752,13 @@ esp_output( *nexthdrp = IPPROTO_UDP; /* Fill out the UDP header */ - udp->uh_sport = ntohs((u_short)esp_udp_encap_port); - udp->uh_dport = ntohs(sav->remote_ike_port); -// udp->uh_len set later, after all length tweaks are complete + if (sav->natt_encapsulated_src_port != 0) { + udp->uh_sport = (u_short)sav->natt_encapsulated_src_port; + } else { + udp->uh_sport = htons((u_short)esp_udp_encap_port); + } + udp->uh_dport = htons(sav->remote_ike_port); + // udp->uh_len set later, after all length tweaks are complete udp->uh_sum = 0; /* Update last sent so we know if we need to send keepalive */ @@ -753,7 +844,7 @@ esp_output( goto fill_icv; } - if (!sav->replay) { + if (!sav->replay[traffic_class]) { goto noantireplay; } if (!sav->key_auth) { @@ -863,6 +954,17 @@ fill_icv: } noantireplay: + if (net_mpklog_enabled && sav->sah != NULL && + sav->sah->ipsec_if != NULL && + (sav->sah->ipsec_if->if_xflags & IFXF_MPK_LOG) && + inner_protocol == IPPROTO_TCP) { + MPKL_ESP_OUTPUT_TCP(esp_mpkl_log_object, + ntohl(spi), seq, + ntohs(th.th_sport), ntohs(th.th_dport), + ntohl(th.th_seq), ntohl(th.th_ack), + th.th_flags, inner_payload_len); + } + lck_mtx_lock(sadb_mutex); if (!m) { ipseclog((LOG_ERR, diff --git a/bsd/netinet6/frag6.c b/bsd/netinet6/frag6.c index 1b5925265..9357d5e99 100644 --- a/bsd/netinet6/frag6.c +++ b/bsd/netinet6/frag6.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2017 Apple Inc. All rights reserved. + * Copyright (c) 2000-2019 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -364,20 +364,15 @@ frag6_input(struct mbuf **mp, int *offp, int proto) */ if ((ip6f->ip6f_offlg & ~IP6F_RESERVED_MASK) == 0) { /* + * Mark packet as reassembled. * In ICMPv6 processing, we drop certain * NDP messages that are not expected to * have fragment header based on recommendations * against security vulnerability as described in * RFC 6980. - * We set PKTF_REASSEMBLED flag to let ICMPv6 NDP - * drop such packets. - * However there are already devices running software - * that are creating interface with MTU < IPv6 Min - * MTU. We should not have allowed that but they are - * out, and sending atomic NDP fragments. - * For that reason, we do not set the same flag here - * and relax the check. + * Treat atomic fragments as re-assembled packets as well. */ + m->m_pkthdr.pkt_flags |= PKTF_REASSEMBLED; ip6stat.ip6s_atmfrag_rcvd++; in6_ifstat_inc(dstifp, ifs6_atmfrag_rcvd); *offp = offset; @@ -785,7 +780,6 @@ insert: (m->m_pkthdr.pkt_flags & PKTF_LOOP)) { /* loopback checksums are always OK */ m->m_pkthdr.csum_data = 0xffff; - m->m_pkthdr.csum_flags &= ~CSUM_PARTIAL; m->m_pkthdr.csum_flags = CSUM_DATA_VALID | CSUM_PSEUDO_HDR; } diff --git a/bsd/netinet6/icmp6.c b/bsd/netinet6/icmp6.c index f1d66606e..a7376b686 100644 --- a/bsd/netinet6/icmp6.c +++ b/bsd/netinet6/icmp6.c @@ -410,7 +410,7 @@ icmp6_error_flag(struct mbuf *m, int type, int code, int param, int flags) m = m_pullup(m, preplen); } if (m == NULL) { - nd6log((LOG_DEBUG, "ENOBUFS in icmp6_error %d\n", __LINE__)); + nd6log(debug, "ENOBUFS in icmp6_error %d\n", __LINE__); return; } @@ -551,9 +551,9 @@ icmp6_input(struct mbuf **mp, int *offp, int proto) * calculate the checksum */ if ((sum = in6_cksum(m, IPPROTO_ICMPV6, off, icmp6len)) != 0) { - nd6log((LOG_ERR, + nd6log(error, "ICMP6 checksum error(%d|%x) %s\n", - icmp6->icmp6_type, sum, ip6_sprintf(&ip6->ip6_src))); + icmp6->icmp6_type, sum, ip6_sprintf(&ip6->ip6_src)); icmp6stat.icp6s_checksum++; goto freeit; } @@ -909,11 +909,11 @@ icmp6_input(struct mbuf **mp, int *offp, int proto) break; default: - nd6log((LOG_DEBUG, + nd6log(debug, "icmp6_input: unknown type %d(src=%s, dst=%s, ifid=%d)\n", icmp6->icmp6_type, ip6_sprintf(&ip6->ip6_src), ip6_sprintf(&ip6->ip6_dst), - m->m_pkthdr.rcvif ? m->m_pkthdr.rcvif->if_index : 0)); + m->m_pkthdr.rcvif ? m->m_pkthdr.rcvif->if_index : 0); if (icmp6->icmp6_type < ICMP6_ECHO_REQUEST) { /* ICMPv6 error: MUST deliver it by spec... */ code = PRC_NCMDS; @@ -1213,14 +1213,9 @@ icmp6_mtudisc_update(struct ip6ctlparam *ip6cp, int validated) return; } - /* - * In case the suggested mtu is less than IPV6_MMTU, we - * only need to remember that it was for above mentioned - * "alwaysfrag" case. - * Try to be as close to the spec as possible. - */ + /* Limit the MTU to the minimum IPv6 MTU */ if (mtu < IPV6_MMTU) { - mtu = IPV6_MMTU - 8; + mtu = IPV6_MMTU; } bzero(&sin6, sizeof(sin6)); @@ -1336,9 +1331,8 @@ ni6_input(struct mbuf *m, int off) } if ((ia6_flags & IN6_IFF_TEMPORARY) && !(icmp6_nodeinfo & ICMP6_NODEINFO_TMPADDROK)) { - nd6log((LOG_DEBUG, "ni6_input: ignore node info to " - "a temporary address in %s:%d", - __func__, __LINE__)); + nd6log(debug, "ni6_input: ignore node info to a temporary address in %s:%d", + __func__, __LINE__); goto bad; } } @@ -1438,7 +1432,9 @@ ni6_input(struct mbuf *m, int off) * wildcard match, if gethostname(3) side has * truncated hostname. */ + lck_mtx_lock(&hostname_lock); n = ni6_nametodns(hostname, hostnamelen, 0); + lck_mtx_unlock(&hostname_lock); if (!n || n->m_next || n->m_len == 0) { goto bad; } @@ -1571,7 +1567,9 @@ ni6_input(struct mbuf *m, int off) /* * XXX do we really have FQDN in variable "hostname"? */ + lck_mtx_lock(&hostname_lock); n->m_next = ni6_nametodns(hostname, hostnamelen, oldfqdn); + lck_mtx_unlock(&hostname_lock); if (n->m_next == NULL) { goto bad; } @@ -2259,10 +2257,10 @@ icmp6_reflect(struct mbuf *m, size_t off) /* too short to reflect */ if (off < sizeof(struct ip6_hdr)) { - nd6log((LOG_DEBUG, - "sanity fail: off=%lx, sizeof(ip6)=%lx in %s:%d\n", + nd6log(debug, + "sanity fail: off=%x, sizeof(ip6)=%x in %s:%d\n", (u_int32_t)off, (u_int32_t)sizeof(struct ip6_hdr), - __func__, __LINE__)); + __func__, __LINE__); goto bad; } @@ -2384,10 +2382,10 @@ icmp6_reflect(struct mbuf *m, size_t off) &src_storage, ip6oa.ip6oa_boundif, &e); ROUTE_RELEASE(&ro); if (src == NULL) { - nd6log((LOG_DEBUG, + nd6log(debug, "icmp6_reflect: source can't be determined: " "dst=%s, error=%d\n", - ip6_sprintf(&sa6_src.sin6_addr), e)); + ip6_sprintf(&sa6_src.sin6_addr), e); goto bad; } } @@ -2462,26 +2460,35 @@ icmp6_redirect_diag(struct in6_addr *src6, void icmp6_redirect_input(struct mbuf *m, int off) { - struct ifnet *ifp = m->m_pkthdr.rcvif; - struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *); - struct nd_redirect *nd_rd; - int icmp6len = ntohs(ip6->ip6_plen); + struct ifnet *ifp = NULL; + struct ip6_hdr *ip6 = NULL; + struct nd_redirect *nd_rd = NULL; char *lladdr = NULL; + int icmp6len = 0; int lladdrlen = 0; u_char *redirhdr = NULL; int redirhdrlen = 0; struct rtentry *rt = NULL; int is_router; int is_onlink; - struct in6_addr src6 = ip6->ip6_src; + struct in6_addr src6; struct in6_addr redtgt6; struct in6_addr reddst6; union nd_opts ndopts; - if (!m || !ifp) { + if (m == NULL) { return; } + ifp = m->m_pkthdr.rcvif; + if (ifp == NULL) { + goto freeit; + } + + ip6 = mtod(m, struct ip6_hdr *); + icmp6len = ntohs(ip6->ip6_plen); + src6 = ip6->ip6_src; + /* * If we are an advertising router on this interface, * don't update route by icmp6 redirect. @@ -2500,7 +2507,7 @@ icmp6_redirect_input(struct mbuf *m, int off) IP6_EXTHDR_GET(nd_rd, struct nd_redirect *, m, off, icmp6len); if (nd_rd == NULL) { icmp6stat.icp6s_tooshort++; - return; + goto freeit; } #endif redtgt6 = nd_rd->nd_rd_target; @@ -2513,16 +2520,16 @@ icmp6_redirect_input(struct mbuf *m, int off) /* validation */ if (!IN6_IS_ADDR_LINKLOCAL(&src6)) { - nd6log((LOG_ERR, + nd6log(error, "ICMP6 redirect sent from %s rejected; " - "must be from linklocal\n", ip6_sprintf(&src6))); + "must be from linklocal\n", ip6_sprintf(&src6)); goto bad; } - if (ip6->ip6_hlim != 255) { - nd6log((LOG_ERR, + if (ip6->ip6_hlim != IPV6_MAXHLIM) { + nd6log(error, "ICMP6 redirect sent from %s rejected; " "hlim=%d (must be 255)\n", - ip6_sprintf(&src6), ip6->ip6_hlim)); + ip6_sprintf(&src6), ip6->ip6_hlim); goto bad; } { @@ -2539,10 +2546,10 @@ icmp6_redirect_input(struct mbuf *m, int off) RT_LOCK(rt); if (rt->rt_gateway == NULL || rt->rt_gateway->sa_family != AF_INET6) { - nd6log((LOG_ERR, + nd6log(error, "ICMP6 redirect rejected; no route " "with inet6 gateway found for redirect dst: %s\n", - icmp6_redirect_diag(&src6, &reddst6, &redtgt6))); + icmp6_redirect_diag(&src6, &reddst6, &redtgt6)); RT_UNLOCK(rt); rtfree(rt); goto bad; @@ -2551,21 +2558,21 @@ icmp6_redirect_input(struct mbuf *m, int off) gw6 = &(((struct sockaddr_in6 *)(void *) rt->rt_gateway)->sin6_addr); if (bcmp(&src6, gw6, sizeof(struct in6_addr)) != 0) { - nd6log((LOG_ERR, + nd6log(error, "ICMP6 redirect rejected; " "not equal to gw-for-src=%s (must be same): " "%s\n", ip6_sprintf(gw6), - icmp6_redirect_diag(&src6, &reddst6, &redtgt6))); + icmp6_redirect_diag(&src6, &reddst6, &redtgt6)); RT_UNLOCK(rt); rtfree(rt); goto bad; } } else { - nd6log((LOG_ERR, + nd6log(error, "ICMP6 redirect rejected; " "no route found for redirect dst: %s\n", - icmp6_redirect_diag(&src6, &reddst6, &redtgt6))); + icmp6_redirect_diag(&src6, &reddst6, &redtgt6)); goto bad; } RT_UNLOCK(rt); @@ -2573,10 +2580,10 @@ icmp6_redirect_input(struct mbuf *m, int off) rt = NULL; } if (IN6_IS_ADDR_MULTICAST(&reddst6)) { - nd6log((LOG_ERR, + nd6log(error, "ICMP6 redirect rejected; " "redirect dst must be unicast: %s\n", - icmp6_redirect_diag(&src6, &reddst6, &redtgt6))); + icmp6_redirect_diag(&src6, &reddst6, &redtgt6)); goto bad; } @@ -2588,10 +2595,10 @@ icmp6_redirect_input(struct mbuf *m, int off) is_onlink = 1; /* on-link destination case */ } if (!is_router && !is_onlink) { - nd6log((LOG_ERR, + nd6log(error, "ICMP6 redirect rejected; " "neither router case nor onlink case: %s\n", - icmp6_redirect_diag(&src6, &reddst6, &redtgt6))); + icmp6_redirect_diag(&src6, &reddst6, &redtgt6)); goto bad; } /* validation passed */ @@ -2599,9 +2606,9 @@ icmp6_redirect_input(struct mbuf *m, int off) icmp6len -= sizeof(*nd_rd); nd6_option_init(nd_rd + 1, icmp6len, &ndopts); if (nd6_options(&ndopts) < 0) { - nd6log((LOG_INFO, "icmp6_redirect_input: " + nd6log(info, "icmp6_redirect_input: " "invalid ND option, rejected: %s\n", - icmp6_redirect_diag(&src6, &reddst6, &redtgt6))); + icmp6_redirect_diag(&src6, &reddst6, &redtgt6)); /* nd6_options have incremented stats */ goto freeit; } @@ -2617,11 +2624,11 @@ icmp6_redirect_input(struct mbuf *m, int off) } if (lladdr && ((ifp->if_addrlen + 2 + 7) & ~7) != lladdrlen) { - nd6log((LOG_INFO, + nd6log(info, "icmp6_redirect_input: lladdrlen mismatch for %s " "(if %d, icmp6 packet %d): %s\n", ip6_sprintf(&redtgt6), ifp->if_addrlen, lladdrlen - 2, - icmp6_redirect_diag(&src6, &reddst6, &redtgt6))); + icmp6_redirect_diag(&src6, &reddst6, &redtgt6)); goto bad; } @@ -2799,7 +2806,7 @@ icmp6_redirect_output(struct mbuf *m0, struct rtentry *rt) ip6->ip6_vfc |= IPV6_VERSION; /* ip6->ip6_plen will be set later */ ip6->ip6_nxt = IPPROTO_ICMPV6; - ip6->ip6_hlim = 255; + ip6->ip6_hlim = IPV6_MAXHLIM; /* ip6->ip6_src must be linklocal addr for my outgoing if. */ bcopy(&ifp_ll6, &ip6->ip6_src, sizeof(struct in6_addr)); bcopy(&sip6->ip6_src, &ip6->ip6_dst, sizeof(struct in6_addr)); diff --git a/bsd/netinet6/in6.c b/bsd/netinet6/in6.c index 950911de7..5ed976827 100644 --- a/bsd/netinet6/in6.c +++ b/bsd/netinet6/in6.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2003-2018 Apple Inc. All rights reserved. + * Copyright (c) 2003-2019 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -174,7 +174,12 @@ const struct in6_addr in6mask96 = IN6MASK96; const struct in6_addr in6mask128 = IN6MASK128; const struct sockaddr_in6 sa6_any = { - sizeof(sa6_any), AF_INET6, 0, 0, IN6ADDR_ANY_INIT, 0 + .sin6_len = sizeof(sa6_any), + .sin6_family = AF_INET6, + .sin6_port = 0, + .sin6_flowinfo = 0, + .sin6_addr = IN6ADDR_ANY_INIT, + .sin6_scope_id = 0 }; static int in6ctl_associd(struct socket *, u_long, caddr_t); @@ -816,7 +821,7 @@ in6ctl_llstop(struct ifnet *ifp) pr0.ndpr_plen = 64; pr0.ndpr_ifp = ifp; pr0.ndpr_prefix.sin6_addr.s6_addr16[0] = IPV6_ADDR_INT16_ULL; - in6_setscope(&pr0.ndpr_prefix.sin6_addr, ifp, NULL); + (void)in6_setscope(&pr0.ndpr_prefix.sin6_addr, ifp, NULL); pr = nd6_prefix_lookup(&pr0, ND6_PREFIX_EXPIRY_UNSPEC); if (pr) { lck_mtx_lock(nd6_mutex); @@ -1007,7 +1012,7 @@ in6ctl_alifetime(struct in6_ifaddr *ia, u_long cmd, struct in6_ifreq *ifr, lt.ia6t_preferred = ia6_lt.ia6t_preferred; lt.ia6t_vltime = ia6_lt.ia6t_vltime; lt.ia6t_pltime = ia6_lt.ia6t_pltime; - bcopy(<, &ifr->ifr_ifru.ifru_lifetime, sizeof(lt)); + bcopy(<, &ifr->ifr_ifru.ifru_lifetime, sizeof(ifr->ifr_ifru.ifru_lifetime)); } else { struct in6_addrlifetime_32 lt; @@ -1016,7 +1021,7 @@ in6ctl_alifetime(struct in6_ifaddr *ia, u_long cmd, struct in6_ifreq *ifr, lt.ia6t_preferred = (uint32_t)ia6_lt.ia6t_preferred; lt.ia6t_vltime = (uint32_t)ia6_lt.ia6t_vltime; lt.ia6t_pltime = (uint32_t)ia6_lt.ia6t_pltime; - bcopy(<, &ifr->ifr_ifru.ifru_lifetime, sizeof(lt)); + bcopy(<, &ifr->ifr_ifru.ifru_lifetime, sizeof(ifr->ifr_ifru.ifru_lifetime)); } IFA_UNLOCK(&ia->ia_ifa); break; @@ -1153,8 +1158,8 @@ in6ctl_clat46start(struct ifnet *ifp) if (pr != NULL) { if ((ia6 = in6_pfx_newpersistaddr(pr, FALSE, &error, TRUE)) == NULL) { - nd6log0((LOG_ERR, "Could not configure CLAT46 address on interface " - "%s.\n", ifp->if_xname)); + nd6log0(error, "Could not configure CLAT46 address on interface " + "%s.\n", ifp->if_xname); } else { IFA_LOCK(&ia6->ia_ifa); NDPR_LOCK(pr); @@ -1981,12 +1986,12 @@ in6_ifaupdate_aux(struct in6_ifaddr *ia, struct ifnet *ifp, int ifaupflags) ifa = &ia->ia_ifa; in6m_sol = NULL; - nd6log2((LOG_DEBUG, "%s - %s ifp %s ia6_flags 0x%x ifaupflags 0x%x\n", + nd6log2(debug, "%s - %s ifp %s ia6_flags 0x%x ifaupflags 0x%x\n", __func__, ip6_sprintf(&ia->ia_addr.sin6_addr), if_name(ia->ia_ifp), ia->ia6_flags, - ifaupflags)); + ifaupflags); /* * Just to be safe, always clear certain flags when address @@ -2045,10 +2050,10 @@ in6_ifaupdate_aux(struct in6_ifaddr *ia, struct ifnet *ifp, int ifaupflags) } imm = in6_joingroup(ifp, &llsol, &error, delay); if (imm == NULL) { - nd6log((LOG_WARNING, + nd6log(info, "%s: addmulti failed for %s on %s (errno=%d)\n", __func__, ip6_sprintf(&llsol), if_name(ifp), - error)); + error); VERIFY(error != 0); goto unwind; } @@ -2106,10 +2111,10 @@ in6_ifaupdate_aux(struct in6_ifaddr *ia, struct ifnet *ifp, int ifaupflags) imm = in6_joingroup(ifp, &mltaddr.sin6_addr, &error, 0); if (!imm) { - nd6log((LOG_WARNING, + nd6log(info, "%s: addmulti failed for %s on %s (errno=%d)\n", __func__, ip6_sprintf(&mltaddr.sin6_addr), - if_name(ifp), error)); + if_name(ifp), error); VERIFY(error != 0); goto unwind; } @@ -2129,16 +2134,18 @@ in6_ifaupdate_aux(struct in6_ifaddr *ia, struct ifnet *ifp, int ifaupflags) */ delay = random() % MAX_RTR_SOLICITATION_DELAY; } - if (in6_nigroup(ifp, hostname, hostnamelen, &mltaddr.sin6_addr) - == 0) { + lck_mtx_lock(&hostname_lock); + int n = in6_nigroup(ifp, hostname, hostnamelen, &mltaddr.sin6_addr); + lck_mtx_unlock(&hostname_lock); + if (n == 0) { imm = in6_joingroup(ifp, &mltaddr.sin6_addr, &error, delay); /* XXX jinmei */ if (!imm) { - nd6log((LOG_WARNING, + nd6log(info, "%s: addmulti failed for %s on %s " "(errno=%d)\n", __func__, ip6_sprintf(&mltaddr.sin6_addr), - if_name(ifp), error)); + if_name(ifp), error); /* XXX not very fatal, go on... */ error = 0; } else { @@ -2183,10 +2190,10 @@ in6_ifaupdate_aux(struct in6_ifaddr *ia, struct ifnet *ifp, int ifaupflags) imm = in6_joingroup(ifp, &mltaddr.sin6_addr, &error, 0); if (!imm) { - nd6log((LOG_WARNING, + nd6log(info, "%s: addmulti failed for %s on %s (errno=%d)\n", __func__, ip6_sprintf(&mltaddr.sin6_addr), - if_name(ifp), error)); + if_name(ifp), error); VERIFY(error != 0); goto unwind; } @@ -2200,13 +2207,14 @@ in6_ifaupdate_aux(struct in6_ifaddr *ia, struct ifnet *ifp, int ifaupflags) ++nd6_sched_timeout_want; /* - * Perform DAD, if needed. - * XXX It may be of use, if we can administratively - * disable DAD. + * Perform DAD, if: + * * Interface is marked to perform DAD, AND + * * Address is not marked to skip DAD, AND + * * Address is in a pre-DAD state (Tentative or Optimistic) */ IFA_LOCK_SPIN(ifa); - if (in6if_do_dad(ifp) && ((ifa->ifa_flags & IN6_IFF_NODAD) == 0) && - (ia->ia6_flags & IN6_IFF_DADPROGRESS)) { + if (in6if_do_dad(ifp) && (ia->ia6_flags & IN6_IFF_NODAD) == 0 && + (ia->ia6_flags & IN6_IFF_DADPROGRESS) != 0) { int mindelay, maxdelay; int *delayptr, delayval; @@ -3711,8 +3719,8 @@ in6if_do_dad( return 0; } - if (ifp->if_subfamily == IFNET_SUBFAMILY_IPSEC || - ifp->if_subfamily == IFNET_SUBFAMILY_UTUN) { + if (ifp->if_family == IFNET_FAMILY_IPSEC || + ifp->if_family == IFNET_FAMILY_UTUN) { /* * Ignore DAD for tunneling virtual interfaces, which get * their IPv6 address explicitly assigned. @@ -3832,6 +3840,8 @@ in6_if2idlen(struct ifnet *ifp) return 64; /* Packet Data over Cellular */ case IFT_BRIDGE: return 64; /* Transparent bridge interface */ + case IFT_6LOWPAN: + return 64; /* 6LoWPAN */ default: /* * Unknown link type: @@ -4016,6 +4026,8 @@ in6_ifaddr_alloc(int how) bzero(in6ifa, in6ifa_size); in6ifa->ia_ifa.ifa_free = in6_ifaddr_free; in6ifa->ia_ifa.ifa_debug |= IFD_ALLOC; + in6ifa->ia_ifa.ifa_del_wc = &in6ifa->ia_ifa.ifa_debug; + in6ifa->ia_ifa.ifa_del_waiters = 0; ifa_lock_init(&in6ifa->ia_ifa); if (in6ifa_debug != 0) { struct in6_ifaddr_dbg *in6ifa_dbg = @@ -4804,9 +4816,9 @@ in6_eventhdlr_callback(struct eventhandler_entry_arg arg0 __unused, bzero(&ev_msg, sizeof(ev_msg)); bzero(&nd6_event, sizeof(nd6_event)); - nd6log0((LOG_INFO, "%s Event %s received for %s\n", + nd6log0(info, "%s Event %s received for %s\n", __func__, in6_event2kev_array[in6_ev_code].in6_event_str, - ip6_sprintf(p_addr6))); + ip6_sprintf(p_addr6)); ev_msg.vendor_code = KEV_VENDOR_APPLE; ev_msg.kev_class = KEV_NETWORK_CLASS; diff --git a/bsd/netinet6/in6.h b/bsd/netinet6/in6.h index 59967ec1b..fa67c43a4 100644 --- a/bsd/netinet6/in6.h +++ b/bsd/netinet6/in6.h @@ -1004,7 +1004,6 @@ extern int inet6_rth_add(void *, const struct in6_addr *); extern int inet6_rth_reverse(const void *, void *); extern int inet6_rth_segments(const void *); extern struct in6_addr *inet6_rth_getaddr(const void *, int); -extern void addrsel_policy_init(void); __END_DECLS #endif /* !KERNEL */ diff --git a/bsd/netinet6/in6_ifattach.c b/bsd/netinet6/in6_ifattach.c index 643e3e363..bd4ad95c0 100644 --- a/bsd/netinet6/in6_ifattach.c +++ b/bsd/netinet6/in6_ifattach.c @@ -131,12 +131,15 @@ get_rand_iid( { SHA1_CTX ctxt; u_int8_t digest[SHA1_RESULTLEN]; - int hostnlen = strlen(hostname); + int hostnlen; /* generate 8 bytes of pseudo-random value. */ bzero(&ctxt, sizeof(ctxt)); SHA1Init(&ctxt); + lck_mtx_lock(&hostname_lock); + hostnlen = strlen(hostname); SHA1Update(&ctxt, hostname, hostnlen); + lck_mtx_unlock(&hostname_lock); SHA1Final(digest, &ctxt); /* assumes sizeof (digest) > sizeof (iid) */ @@ -212,8 +215,8 @@ in6_generate_tmp_iid( * use a random non-zero value as the last resort. */ if (bcmp(nullbuf, ret, sizeof(nullbuf)) == 0) { - nd6log((LOG_INFO, - "%s: computed SHA1 value is zero.\n", __func__)); + nd6log(info, + "%s: computed SHA1 value is zero.\n", __func__); getmicrotime(&tv); val32 = random() ^ tv.tv_usec; @@ -289,6 +292,7 @@ in6_iid_from_hw(struct ifnet *ifp, struct in6_addr *in6) case IFT_IEEE80211: #endif case IFT_BRIDGE: + case IFT_6LOWPAN: /* IEEE802/EUI64 cases - what others? */ /* IEEE1394 uses 16byte length address starting with EUI64 */ if (addrlen > 8) { @@ -412,15 +416,15 @@ in6_select_iid_from_all_hw( /* first, try to get it from the interface itself */ if (in6_iid_from_hw(ifp0, in6) == 0) { - nd6log((LOG_DEBUG, "%s: IID derived from HW interface.\n", - if_name(ifp0))); + nd6log(debug, "%s: IID derived from HW interface.\n", + if_name(ifp0)); goto success; } /* try secondary EUI64 source. this basically is for ATM PVC */ if (altifp && in6_iid_from_hw(altifp, in6) == 0) { - nd6log((LOG_DEBUG, "%s: IID from alterate HW interface %s.\n", - if_name(ifp0), if_name(altifp))); + nd6log(debug, "%s: IID from alterate HW interface %s.\n", + if_name(ifp0), if_name(altifp)); goto success; } @@ -439,8 +443,8 @@ in6_select_iid_from_all_hw( * globally unique */ if (ND6_IFID_UNIVERSAL(in6)) { - nd6log((LOG_DEBUG, "%s: borrowed IID from %s\n", - if_name(ifp0), if_name(ifp))); + nd6log(debug, "%s: borrowed IID from %s\n", + if_name(ifp0), if_name(ifp)); ifnet_head_done(); goto success; } @@ -449,7 +453,7 @@ in6_select_iid_from_all_hw( /* last resort: get from random number source */ if (get_rand_iid(ifp, in6) == 0) { - nd6log((LOG_DEBUG, "%s: IID from PRNG.\n", if_name(ifp0))); + nd6log(debug, "%s: IID from PRNG.\n", if_name(ifp0)); goto success; } @@ -457,13 +461,13 @@ in6_select_iid_from_all_hw( return -1; success: - nd6log((LOG_INFO, "%s: IID: " + nd6log(info, "%s: IID: " "%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x\n", if_name(ifp0), in6->s6_addr[8], in6->s6_addr[9], in6->s6_addr[10], in6->s6_addr[11], in6->s6_addr[12], in6->s6_addr[13], - in6->s6_addr[14], in6->s6_addr[15])); + in6->s6_addr[14], in6->s6_addr[15]); return 0; } @@ -487,10 +491,10 @@ in6_ifattach_linklocal(struct ifnet *ifp, struct in6_aliasreq *ifra) * suppress it. (jinmei@kame.net 20010130) */ if (error != EAFNOSUPPORT) { - nd6log((LOG_NOTICE, "%s: failed to " + nd6log(info, "%s: failed to " "configure a link-local address on %s " "(errno=%d)\n", - __func__, if_name(ifp), error)); + __func__, if_name(ifp), error); } return EADDRNOTAVAIL; } @@ -593,9 +597,9 @@ in6_ifattach_loopback( /* add the new interface address */ error = in6_update_ifa(ifp, &ifra, 0, &ia); if (error != 0) { - nd6log((LOG_ERR, + nd6log(error, "%s: failed to configure loopback address %s (error=%d)\n", - __func__, if_name(ifp), error)); + __func__, if_name(ifp), error); VERIFY(ia == NULL); return EADDRNOTAVAIL; } @@ -730,9 +734,8 @@ in6_ifattach_prelim(struct ifnet *ifp) * (previously, this was a silent error.) */ if ((ifp->if_flags & IFF_MULTICAST) == 0) { - nd6log0((LOG_INFO, "in6_ifattach: ", - "%s is not multicast capable, IPv6 not enabled\n", - if_name(ifp))); + nd6log0(info, "in6_ifattach: %s is not multicast capable, IPv6 not enabled\n", + if_name(ifp)); return EINVAL; } @@ -902,8 +905,8 @@ in6_ifattach_aliasreq(struct ifnet *ifp, struct ifnet *altifp, } else { if (in6_select_iid_from_all_hw(ifp, altifp, &ifra.ifra_addr.sin6_addr) != 0) { - nd6log((LOG_ERR, "%s: no IID available\n", - if_name(ifp))); + nd6log(error, "%s: no IID available\n", + if_name(ifp)); return EADDRNOTAVAIL; } } @@ -924,9 +927,9 @@ in6_ifattach_aliasreq(struct ifnet *ifp, struct ifnet *altifp, /* Attach the link-local address */ if (in6_ifattach_linklocal(ifp, &ifra) != 0) { - nd6log((LOG_INFO, + nd6log(info, "%s: %s could not attach link-local address.\n", - __func__, if_name(ifp))); + __func__, if_name(ifp)); /* NB: not an error */ } @@ -1014,9 +1017,9 @@ in6_ifattach_llcgareq(struct ifnet *ifp, struct in6_cgareq *llcgasr) /* Attach the link-local address */ if (in6_ifattach_linklocal(ifp, &ifra) != 0) { /* NB: not an error */ - nd6log((LOG_INFO, + nd6log(info, "%s: %s could not attach link-local address.\n", - __func__, if_name(ifp))); + __func__, if_name(ifp)); } VERIFY(error == 0); @@ -1144,9 +1147,9 @@ in6_ifdetach(struct ifnet *ifp) if (ia->ia_next) { ia->ia_next = oia->ia_next; } else { - nd6log((LOG_ERR, + nd6log(error, "%s: didn't unlink in6ifaddr from " - "list\n", if_name(ifp))); + "list\n", if_name(ifp)); unlinked = 0; } } diff --git a/bsd/netinet6/in6_mcast.c b/bsd/netinet6/in6_mcast.c index 65d4c090a..4635a2b18 100644 --- a/bsd/netinet6/in6_mcast.c +++ b/bsd/netinet6/in6_mcast.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2010-2017 Apple Inc. All rights reserved. + * Copyright (c) 2010-2018 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -3012,7 +3012,7 @@ ip6_setmoptions(struct inpcb *inp, struct sockopt *sopt) if (error) { break; } - if (hlim < -1 || hlim > 255) { + if (hlim < -1 || hlim > IPV6_MAXHLIM) { error = EINVAL; break; } else if (hlim == -1) { diff --git a/bsd/netinet6/in6_pcb.c b/bsd/netinet6/in6_pcb.c index 674f64d68..c1d2ff08e 100644 --- a/bsd/netinet6/in6_pcb.c +++ b/bsd/netinet6/in6_pcb.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2003-2017 Apple Inc. All rights reserved. + * Copyright (c) 2003-2019 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -109,16 +109,19 @@ #include #include #include +#include #include #include #include #include #include + #include #include #include #include + #include #include @@ -295,13 +298,16 @@ in6_pcbbind(struct inpcb *inp, struct sockaddr *nam, struct proc *p) IFA_REMREF(ifa); } } + + if (lport != 0) { struct inpcb *t; uid_t u; #if !CONFIG_EMBEDDED if (ntohs(lport) < IPV6PORT_RESERVED && - !IN6_IS_ADDR_UNSPECIFIED(&sin6.sin6_addr)) { + !IN6_IS_ADDR_UNSPECIFIED(&sin6.sin6_addr) && + !(inp->inp_flags2 & INP2_EXTERNAL_PORT)) { cred = kauth_cred_proc_ref(p); error = priv_check_cred(cred, PRIV_NETINET_RESERVEDPORT, 0); @@ -313,19 +319,30 @@ in6_pcbbind(struct inpcb *inp, struct sockaddr *nam, struct proc *p) } } #endif /* !CONFIG_EMBEDDED */ + /* + * Check wether the process is allowed to bind to a restricted port + */ + if (!current_task_can_use_restricted_in_port(lport, + so->so_proto->pr_protocol, PORT_FLAGS_BSD)) { + lck_rw_done(pcbinfo->ipi_lock); + socket_lock(so, 0); + return EADDRINUSE; + } + if (!IN6_IS_ADDR_MULTICAST(&sin6.sin6_addr) && (u = kauth_cred_getuid(so->so_cred)) != 0) { t = in6_pcblookup_local_and_cleanup(pcbinfo, &sin6.sin6_addr, lport, INPLOOKUP_WILDCARD); - if (t != NULL && (!IN6_IS_ADDR_UNSPECIFIED( - &sin6.sin6_addr) || + if (t != NULL && + (!IN6_IS_ADDR_UNSPECIFIED(&sin6.sin6_addr) || !IN6_IS_ADDR_UNSPECIFIED(&t->in6p_laddr) || - !(t->inp_socket->so_options & - SO_REUSEPORT)) && (u != kauth_cred_getuid( - t->inp_socket->so_cred)) && - !(t->inp_socket->so_flags & - SOF_REUSESHAREUID)) { + !(t->inp_socket->so_options & SO_REUSEPORT)) && + (u != kauth_cred_getuid(t->inp_socket->so_cred)) && + !(t->inp_socket->so_flags & SOF_REUSESHAREUID) && + (!(t->inp_flags2 & INP2_EXTERNAL_PORT) || + !(inp->inp_flags2 & INP2_EXTERNAL_PORT) || + uuid_compare(t->necp_client_uuid, inp->necp_client_uuid) != 0)) { lck_rw_done(pcbinfo->ipi_lock); socket_lock(so, 0); return EADDRINUSE; @@ -339,23 +356,28 @@ in6_pcbbind(struct inpcb *inp, struct sockaddr *nam, struct proc *p) pcbinfo, sin.sin_addr, lport, INPLOOKUP_WILDCARD); if (t != NULL && - !(t->inp_socket->so_options & - SO_REUSEPORT) && + !(t->inp_socket->so_options & SO_REUSEPORT) && (kauth_cred_getuid(so->so_cred) != - kauth_cred_getuid(t->inp_socket-> - so_cred)) && (t->inp_laddr.s_addr != - INADDR_ANY || SOCK_DOM(so) == - SOCK_DOM(t->inp_socket))) { + kauth_cred_getuid(t->inp_socket->so_cred)) && + (t->inp_laddr.s_addr != INADDR_ANY || + SOCK_DOM(so) == SOCK_DOM(t->inp_socket)) && + (!(t->inp_flags2 & INP2_EXTERNAL_PORT) || + !(inp->inp_flags2 & INP2_EXTERNAL_PORT) || + uuid_compare(t->necp_client_uuid, inp->necp_client_uuid) != 0)) { lck_rw_done(pcbinfo->ipi_lock); socket_lock(so, 0); return EADDRINUSE; } + } } t = in6_pcblookup_local_and_cleanup(pcbinfo, &sin6.sin6_addr, lport, wild); if (t != NULL && - (reuseport & t->inp_socket->so_options) == 0) { + (reuseport & t->inp_socket->so_options) == 0 && + (!(t->inp_flags2 & INP2_EXTERNAL_PORT) || + !(inp->inp_flags2 & INP2_EXTERNAL_PORT) || + uuid_compare(t->necp_client_uuid, inp->necp_client_uuid) != 0)) { lck_rw_done(pcbinfo->ipi_lock); socket_lock(so, 0); return EADDRINUSE; @@ -370,7 +392,10 @@ in6_pcbbind(struct inpcb *inp, struct sockaddr *nam, struct proc *p) if (t != NULL && (reuseport & t->inp_socket->so_options) == 0 && (t->inp_laddr.s_addr != INADDR_ANY || - SOCK_DOM(so) == SOCK_DOM(t->inp_socket))) { + SOCK_DOM(so) == SOCK_DOM(t->inp_socket)) && + (!(t->inp_flags2 & INP2_EXTERNAL_PORT) || + !(inp->inp_flags2 & INP2_EXTERNAL_PORT) || + uuid_compare(t->necp_client_uuid, inp->necp_client_uuid) != 0)) { lck_rw_done(pcbinfo->ipi_lock); socket_lock(so, 0); return EADDRINUSE; @@ -546,9 +571,7 @@ in6_pcbconnect(struct inpcb *inp, struct sockaddr *nam, struct proc *p) struct socket *so = inp->inp_socket; #if CONTENT_FILTER - if (so) { - so->so_state_change_cnt++; - } + so->so_state_change_cnt++; #endif if (so->so_proto->pr_protocol == IPPROTO_UDP && @@ -700,6 +723,7 @@ in6_pcbdetach(struct inpcb *inp) inp->in6p_options = NULL; } ip6_freepcbopts(inp->in6p_outputopts); + inp->in6p_outputopts = NULL; ROUTE_RELEASE(&inp->in6p_route); /* free IPv4 related resources in case of mapped addr */ if (inp->inp_options != NULL) { @@ -1180,6 +1204,12 @@ in6_pcblookup_hash_exists(struct inpcbinfo *pcbinfo, struct in6_addr *faddr, continue; } +#if NECP + if (!necp_socket_is_allowed_to_recv_on_interface(inp, ifp)) { + continue; + } +#endif /* NECP */ + if (IN6_ARE_ADDR_EQUAL(&inp->in6p_faddr, faddr) && IN6_ARE_ADDR_EQUAL(&inp->in6p_laddr, laddr) && inp->inp_fport == fport && @@ -1211,6 +1241,12 @@ in6_pcblookup_hash_exists(struct inpcbinfo *pcbinfo, struct in6_addr *faddr, continue; } +#if NECP + if (!necp_socket_is_allowed_to_recv_on_interface(inp, ifp)) { + continue; + } +#endif /* NECP */ + if (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr) && inp->inp_lport == lport) { if (IN6_ARE_ADDR_EQUAL(&inp->in6p_laddr, @@ -1277,6 +1313,12 @@ in6_pcblookup_hash(struct inpcbinfo *pcbinfo, struct in6_addr *faddr, continue; } +#if NECP + if (!necp_socket_is_allowed_to_recv_on_interface(inp, ifp)) { + continue; + } +#endif /* NECP */ + if (IN6_ARE_ADDR_EQUAL(&inp->in6p_faddr, faddr) && IN6_ARE_ADDR_EQUAL(&inp->in6p_laddr, laddr) && inp->inp_fport == fport && @@ -1309,6 +1351,12 @@ in6_pcblookup_hash(struct inpcbinfo *pcbinfo, struct in6_addr *faddr, continue; } +#if NECP + if (!necp_socket_is_allowed_to_recv_on_interface(inp, ifp)) { + continue; + } +#endif /* NECP */ + if (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr) && inp->inp_lport == lport) { if (IN6_ARE_ADDR_EQUAL(&inp->in6p_laddr, diff --git a/bsd/netinet6/in6_proto.c b/bsd/netinet6/in6_proto.c index 78d084ee6..7a1f23c44 100644 --- a/bsd/netinet6/in6_proto.c +++ b/bsd/netinet6/in6_proto.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2008-2018 Apple Inc. All rights reserved. + * Copyright (c) 2008-2019 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -143,10 +143,6 @@ #include #endif #endif -#include -#if INET6 -#include -#endif #endif /*IPSEC*/ #include @@ -190,6 +186,8 @@ struct ip6protosw inet6sw[] = { .pr_lock = udp_lock, .pr_unlock = udp_unlock, .pr_getlock = udp_getlock, + .pr_update_last_owner = inp_update_last_owner, + .pr_copy_last_owner = inp_copy_last_owner, }, { .pr_type = SOCK_STREAM, @@ -208,6 +206,8 @@ struct ip6protosw inet6sw[] = { .pr_lock = tcp_lock, .pr_unlock = tcp_unlock, .pr_getlock = tcp_getlock, + .pr_update_last_owner = inp_update_last_owner, + .pr_copy_last_owner = inp_copy_last_owner, }, { .pr_type = SOCK_RAW, @@ -222,6 +222,8 @@ struct ip6protosw inet6sw[] = { #endif /* !INET */ .pr_usrreqs = &rip6_usrreqs, .pr_unlock = rip_unlock, + .pr_update_last_owner = inp_update_last_owner, + .pr_copy_last_owner = inp_copy_last_owner, }, { .pr_type = SOCK_RAW, @@ -234,6 +236,8 @@ struct ip6protosw inet6sw[] = { .pr_init = icmp6_init, .pr_usrreqs = &rip6_usrreqs, .pr_unlock = rip_unlock, + .pr_update_last_owner = inp_update_last_owner, + .pr_copy_last_owner = inp_copy_last_owner, }, { .pr_type = SOCK_DGRAM, @@ -246,6 +250,8 @@ struct ip6protosw inet6sw[] = { .pr_init = icmp6_init, .pr_usrreqs = &icmp6_dgram_usrreqs, .pr_unlock = rip_unlock, + .pr_update_last_owner = inp_update_last_owner, + .pr_copy_last_owner = inp_copy_last_owner, }, { .pr_type = SOCK_RAW, @@ -286,16 +292,6 @@ struct ip6protosw inet6sw[] = { .pr_usrreqs = &nousrreqs, }, #endif /* IPSEC_ESP */ - { - .pr_type = SOCK_RAW, - .pr_protocol = IPPROTO_IPCOMP, - .pr_flags = PR_ATOMIC | PR_ADDR | PR_PROTOLOCK, - .pr_input = ipcomp6_input, -#if !INET /* don't call initialization and timeout routines twice */ - .pr_init = ipcomp_init, -#endif /* !INET */ - .pr_usrreqs = &nousrreqs, - }, #endif /* IPSEC */ #if INET { @@ -308,6 +304,8 @@ struct ip6protosw inet6sw[] = { .pr_init = encap6_init, .pr_usrreqs = &rip6_usrreqs, .pr_unlock = rip_unlock, + .pr_update_last_owner = inp_update_last_owner, + .pr_copy_last_owner = inp_copy_last_owner, }, #endif /*INET*/ { @@ -320,6 +318,8 @@ struct ip6protosw inet6sw[] = { .pr_init = encap6_init, .pr_usrreqs = &rip6_usrreqs, .pr_unlock = rip_unlock, + .pr_update_last_owner = inp_update_last_owner, + .pr_copy_last_owner = inp_copy_last_owner, }, /* raw wildcard */ { @@ -331,6 +331,8 @@ struct ip6protosw inet6sw[] = { .pr_ctloutput = rip6_ctloutput, .pr_usrreqs = &rip6_usrreqs, .pr_unlock = rip_unlock, + .pr_update_last_owner = inp_update_last_owner, + .pr_copy_last_owner = inp_copy_last_owner, }, }; @@ -398,6 +400,10 @@ in6_dinit(struct domain *dp) offsetof(struct protosw, pr_filter_head)); _CASSERT(offsetof(struct ip6protosw, pr_old) == offsetof(struct protosw, pr_old)); + _CASSERT(offsetof(struct ip6protosw, pr_update_last_owner) == + offsetof(struct protosw, pr_update_last_owner)); + _CASSERT(offsetof(struct ip6protosw, pr_copy_last_owner) == + offsetof(struct protosw, pr_copy_last_owner)); /* * Attach first, then initialize. ip6_init() needs raw IP6 handler. diff --git a/bsd/netinet6/in6_src.c b/bsd/netinet6/in6_src.c index 85e2dc7fe..f018e5111 100644 --- a/bsd/netinet6/in6_src.c +++ b/bsd/netinet6/in6_src.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2018 Apple Inc. All rights reserved. + * Copyright (c) 2000-2019 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -109,12 +109,14 @@ #include #include #include +#include #include #include #include #include #include + #include #include #include @@ -257,10 +259,8 @@ in6_selectsrc_core_ifa(struct sockaddr_in6 *addr, struct ifnet *ifp, int srcsel_ if ((ifa->ifa_debug & IFD_DETACHING) != 0) { err = EHOSTUNREACH; ifnet_lock_done(ifp); - if (ifa != NULL) { - IFA_REMREF(ifa); - ifa = NULL; - } + IFA_REMREF(ifa); + ifa = NULL; goto done; } ifnet_lock_done(ifp); @@ -672,6 +672,9 @@ in6_selectsrc(struct sockaddr_in6 *dstsock, struct ip6_pktopts *opts, if (INP_NO_EXPENSIVE(inp)) { ip6oa.ip6oa_flags |= IP6OAF_NO_EXPENSIVE; } + if (INP_NO_CONSTRAINED(inp)) { + ip6oa.ip6oa_flags |= IP6OAF_NO_CONSTRAINED; + } if (INP_AWDL_UNRESTRICTED(inp)) { ip6oa.ip6oa_flags |= IP6OAF_AWDL_UNRESTRICTED; } @@ -843,6 +846,7 @@ selectroute(struct sockaddr_in6 *srcsock, struct sockaddr_in6 *dstsock, boolean_t select_srcif, proxied_ifa = FALSE, local_dst = FALSE; unsigned int ifscope = ((ip6oa != NULL) ? ip6oa->ip6oa_boundif : IFSCOPE_NONE); + boolean_t is_direct = FALSE; if (retifp != NULL) { *retifp = NULL; @@ -868,15 +872,49 @@ selectroute(struct sockaddr_in6 *srcsock, struct sockaddr_in6 *dstsock, } /* - * Perform source interface selection only if Scoped Routing + * Perform source interface selection if Scoped Routing * is enabled and a source address that isn't unspecified. */ select_srcif = (srcsock != NULL && !IN6_IS_ADDR_UNSPECIFIED(&srcsock->sin6_addr)); + /* + * For scoped routing, if interface scope is 0 or src/dst addr is linklocal + * or dst addr is multicast, source interface selection should be performed even + * if the destination is directly reachable. + */ + if (ifscope != IFSCOPE_NONE && + !(srcsock != NULL && IN6_IS_ADDR_LINKLOCAL(&srcsock->sin6_addr)) && + !IN6_IS_ADDR_MULTICAST(dst) && !IN6_IS_ADDR_LINKLOCAL(dst)) { + struct rtentry *temp_rt = NULL; + + lck_mtx_lock(rnh_lock); + temp_rt = rt_lookup(TRUE, (struct sockaddr *)dstsock, + NULL, rt_tables[AF_INET6], ifscope); + lck_mtx_unlock(rnh_lock); + + /* + * If the destination is directly reachable, relax + * the behavior around select_srcif, i.e. don't force + * the packet to go out from the interface that is hosting + * the source address. + * It happens when we share v6 with NAT66 and want + * the external interface's v6 address to be reachable + * to the clients we are sharing v6 connectivity with + * using NAT. + */ + if (temp_rt != NULL) { + if ((temp_rt->rt_flags & RTF_GATEWAY) == 0) { + select_srcif = FALSE; + is_direct = TRUE; + } + rtfree(temp_rt); + } + } + if (ip6_select_srcif_debug) { - printf("%s src %s dst %s ifscope %d select_srcif %d\n", - __func__, s_src, s_dst, ifscope, select_srcif); + printf("%s src %s dst %s ifscope %d is_direct %d select_srcif %d\n", + __func__, s_src, s_dst, ifscope, is_direct, select_srcif); } /* If the caller specified the outgoing interface explicitly, use it */ @@ -1292,6 +1330,8 @@ done: IFNET_IS_CELLULAR(_ifp)) || \ (((_ip6oa)->ip6oa_flags & IP6OAF_NO_EXPENSIVE) && \ IFNET_IS_EXPENSIVE(_ifp)) || \ + (((_ip6oa)->ip6oa_flags & IP6OAF_NO_CONSTRAINED) && \ + IFNET_IS_CONSTRAINED(_ifp)) || \ (!((_ip6oa)->ip6oa_flags & IP6OAF_INTCOPROC_ALLOWED) && \ IFNET_IS_INTCOPROC(_ifp)) || \ (!((_ip6oa)->ip6oa_flags & IP6OAF_AWDL_UNRESTRICTED) && \ @@ -1594,6 +1634,15 @@ in6_pcbsetport(struct in6_addr *laddr, struct inpcb *inp, struct proc *p, } } lport = htons(*lastport); + + /* + * Skip if this is a restricted port as we do not want to + * restricted ports as ephemeral + */ + if (IS_RESTRICTED_IN_PORT(lport)) { + continue; + } + found = (in6_pcblookup_local(pcbinfo, &inp->in6p_laddr, lport, wild) == NULL); } while (!found); diff --git a/bsd/netinet6/in6_var.h b/bsd/netinet6/in6_var.h index baa9a541b..b7c080352 100644 --- a/bsd/netinet6/in6_var.h +++ b/bsd/netinet6/in6_var.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2016 Apple Inc. All rights reserved. + * Copyright (c) 2000-2018 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -784,23 +784,27 @@ extern u_char inet6ctlerrmap[]; extern u_int32_t in6_maxmtu; /* N.B.: if_inet6data is never freed once set, so we don't need to lock */ -#define in6_ifstat_inc_common(_ifp, _tag, _atomic) do { \ +#define in6_ifstat_add_common(_ifp, _tag, _count, _atomic) do { \ if (_ifp != NULL && IN6_IFEXTRA(_ifp) != NULL) { \ if (_atomic) \ atomic_add_64( \ - &IN6_IFEXTRA(_ifp)->in6_ifstat._tag, 1); \ + &IN6_IFEXTRA(_ifp)->in6_ifstat._tag, _count);\ else \ - IN6_IFEXTRA(_ifp)->in6_ifstat._tag++; \ + IN6_IFEXTRA(_ifp)->in6_ifstat._tag += _count; \ } \ } while (0) /* atomic version */ #define in6_ifstat_inc(_ifp, _tag) \ - in6_ifstat_inc_common(_ifp, _tag, TRUE) + in6_ifstat_add_common(_ifp, _tag, 1, TRUE) /* non-atomic version (for fast paths) */ #define in6_ifstat_inc_na(_ifp, _tag) \ - in6_ifstat_inc_common(_ifp, _tag, FALSE) + in6_ifstat_add_common(_ifp, _tag, 1, FALSE) + +/* atomic add version */ +#define in6_ifstat_add(_ifp, _tag, _count) \ + in6_ifstat_add_common(_ifp, _tag, _count, TRUE) /* * Macro for finding the internet address structure (in6_ifaddr) corresponding diff --git a/bsd/netinet6/ip6_forward.c b/bsd/netinet6/ip6_forward.c index 8498211c7..57c44b85c 100644 --- a/bsd/netinet6/ip6_forward.c +++ b/bsd/netinet6/ip6_forward.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2009-2016 Apple Inc. All rights reserved. + * Copyright (c) 2009-2018 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -132,7 +132,7 @@ ip6_forward(struct mbuf *m, struct route_in6 *ip6forward_rt, boolean_t proxy = FALSE; struct mbuf *mcopy = NULL; struct ifnet *ifp, *rcvifp, *origifp; /* maybe unnecessary */ - u_int32_t inzone, outzone, len; + u_int32_t inzone, outzone, len = 0, pktcnt = 0; struct in6_addr src_in6, dst_in6; uint64_t curtime = net_uptime(); #if IPSEC @@ -141,7 +141,10 @@ ip6_forward(struct mbuf *m, struct route_in6 *ip6forward_rt, unsigned int ifscope = IFSCOPE_NONE; #if PF struct pf_mtag *pf_mtag; + struct pf_fragment_tag *pf_ftagp, pf_ftag; + boolean_t pf_ftag_valid = FALSE; #endif /* PF */ + uint32_t mpktlen = 0; /* * In the prefix proxying case, the route to the proxied node normally @@ -164,10 +167,23 @@ ip6_forward(struct mbuf *m, struct route_in6 *ip6forward_rt, #if PF pf_mtag = pf_find_mtag(m); + /* + * save the PF fragmentation metadata as m_copy() removes the + * mbufs tags from the original mbuf. + */ + pf_ftagp = pf_find_fragment_tag(m); + if (pf_ftagp != NULL) { + ASSERT(pf_mtag->pftag_flags & PF_TAG_REASSEMBLED); + pf_ftag = *pf_ftagp; + pf_ftag_valid = TRUE; + mpktlen = pf_ftag.ft_maxlen; + ASSERT(mpktlen); + } if (pf_mtag != NULL && pf_mtag->pftag_rtableid != IFSCOPE_NONE) { ifscope = pf_mtag->pftag_rtableid; } - + pf_mtag = NULL; + pf_ftagp = NULL; /* * If the caller provides a route which is on a different interface * than the one specified for scoped forwarding, discard the route @@ -543,7 +559,11 @@ skip_ipsec: return NULL; } - if (m->m_pkthdr.len > rt->rt_ifp->if_mtu) { + if (mpktlen == 0) { + mpktlen = m->m_pkthdr.len; + } + + if (mpktlen > rt->rt_ifp->if_mtu) { in6_ifstat_inc(rt->rt_ifp, ifs6_in_toobig); if (mcopy) { uint32_t mtu; @@ -704,6 +724,14 @@ skip_ipsec: #if PF if (PF_IS_ENABLED) { + /* + * PF refragments any packet which it reassembled due to scrub + * rules, in which case it will set the PF_TAG_REFRAGMENTED + * flag in PF mbuf tag. + */ + if (pf_ftag_valid) { + pf_copy_fragment_tag(m, &pf_ftag, M_DONTWAIT); + } #if DUMMYNET struct ip_fw_args args; bzero(&args, sizeof(args)); @@ -729,6 +757,31 @@ skip_ipsec: /* Already freed by callee */ goto senderr; } + + pf_mtag = pf_find_mtag(m); + /* + * refragmented packets from PF. + */ + if ((pf_mtag->pftag_flags & PF_TAG_REFRAGMENTED) != 0) { + struct mbuf *t; + + pf_mtag->pftag_flags &= ~PF_TAG_REFRAGMENTED; + /* for statistics */ + t = m; + while (t != NULL) { + pktcnt++; + len += m_pktlen(t); + t = t->m_nextpkt; + } + + /* + * nd6_output() frees packetchain in both success and + * failure cases. + */ + error = nd6_output(ifp, origifp, m, dst, rt, NULL); + m = NULL; + goto sent; + } /* * We do not use ip6 header again in the code below, * however still adding the bit here so that any new @@ -740,21 +793,23 @@ skip_ipsec: #endif /* PF */ len = m_pktlen(m); + pktcnt = 1; error = nd6_output(ifp, origifp, m, dst, rt, NULL); +sent: if (error) { - in6_ifstat_inc(ifp, ifs6_out_discard); - ip6stat.ip6s_cantforward++; + in6_ifstat_add(ifp, ifs6_out_discard, pktcnt); + ip6stat.ip6s_cantforward += pktcnt; } else { /* * Increment stats on the source interface; the ones * for destination interface has been taken care of * during output above by virtue of PKTF_FORWARDED. */ - rcvifp->if_fpackets++; + rcvifp->if_fpackets += pktcnt; rcvifp->if_fbytes += len; - ip6stat.ip6s_forward++; - in6_ifstat_inc(ifp, ifs6_out_forward); + ip6stat.ip6s_forward += pktcnt; + in6_ifstat_add(ifp, ifs6_out_forward, pktcnt); if (type) { ip6stat.ip6s_redirectsent++; } else { diff --git a/bsd/netinet6/ip6_id.c b/bsd/netinet6/ip6_id.c index b767c8edb..508274bf4 100644 --- a/bsd/netinet6/ip6_id.c +++ b/bsd/netinet6/ip6_id.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2009-2013 Apple Inc. All rights reserved. + * Copyright (c) 2009-2019 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -149,27 +149,43 @@ struct randomtab { }; static struct randomtab randomtab_32 = { - 32, /* resulting bits */ - 180, /* Time after wich will be reseeded */ - 1000000000, /* Uniq cycle, avoid blackjack prediction */ - 2, /* Starting generator */ - 2147483629, /* RU_N-1 = 2^2*3^2*59652323 */ - 7, /* determine ru_a as RU_AGEN^(2*rand) */ - 1836660096, /* RU_M = 2^7*3^15 - don't change */ - { 2, 3, 59652323, 0 }, /* factors of ru_n */ - 0, 0, 0, 0, 0, 0, 0, 0, 0 + .ru_bits = 32, /* resulting bits */ + .ru_out = 180, /* Time after wich will be reseeded */ + .ru_max = 1000000000, /* Uniq cycle, avoid blackjack prediction */ + .ru_gen = 2, /* Starting generator */ + .ru_n = 2147483629, /* RU_N-1 = 2^2*3^2*59652323 */ + .ru_agen = 7, /* determine ru_a as RU_AGEN^(2*rand) */ + .ru_m = 1836660096, /* RU_M = 2^7*3^15 - don't change */ + .pfacts = { 2, 3, 59652323, 0 }, /* factors of ru_n */ + .ru_counter = 0, + .ru_msb = 0, + .ru_x = 0, + .ru_seed = 0, + .ru_seed2 = 0, + .ru_a = 0, + .ru_b = 0, + .ru_g = 0, + .ru_reseed = 0 }; static struct randomtab randomtab_20 = { - 20, /* resulting bits */ - 180, /* Time after wich will be reseeded */ - 200000, /* Uniq cycle, avoid blackjack prediction */ - 2, /* Starting generator */ - 524269, /* RU_N-1 = 2^2*3^2*14563 */ - 7, /* determine ru_a as RU_AGEN^(2*rand) */ - 279936, /* RU_M = 2^7*3^7 - don't change */ - { 2, 3, 14563, 0 }, /* factors of ru_n */ - 0, 0, 0, 0, 0, 0, 0, 0, 0 + .ru_bits = 20, /* resulting bits */ + .ru_out = 180, /* Time after wich will be reseeded */ + .ru_max = 200000, /* Uniq cycle, avoid blackjack prediction */ + .ru_gen = 2, /* Starting generator */ + .ru_n = 524269, /* RU_N-1 = 2^2*3^2*14563 */ + .ru_agen = 7, /* determine ru_a as RU_AGEN^(2*rand) */ + .ru_m = 279936, /* RU_M = 2^7*3^7 - don't change */ + .pfacts = { 2, 3, 14563, 0 }, /* factors of ru_n */ + .ru_counter = 0, + .ru_msb = 0, + .ru_x = 0, + .ru_seed = 0, + .ru_seed2 = 0, + .ru_a = 0, + .ru_b = 0, + .ru_g = 0, + .ru_reseed = 0 }; static u_int32_t pmod(u_int32_t, u_int32_t, u_int32_t); diff --git a/bsd/netinet6/ip6_input.c b/bsd/netinet6/ip6_input.c index b35e7f501..dad053c63 100644 --- a/bsd/netinet6/ip6_input.c +++ b/bsd/netinet6/ip6_input.c @@ -803,9 +803,6 @@ ip6_input(struct mbuf *m) } } - /* for consistency */ - m->m_pkthdr.pkt_proto = ip6->ip6_nxt; - #if DUMMYNET check_with_pf: #endif /* DUMMYNET */ @@ -928,9 +925,9 @@ check_with_pf: lck_rw_done(&in6_ifaddr_rwlock); ia6 = NULL; /* address is not ready, so discard the packet. */ - nd6log((LOG_INFO, "%s: packet to an unready address %s->%s\n", + nd6log(info, "%s: packet to an unready address %s->%s\n", __func__, ip6_sprintf(&ip6->ip6_src), - ip6_sprintf(&ip6->ip6_dst))); + ip6_sprintf(&ip6->ip6_dst)); goto bad; } lck_rw_done(&in6_ifaddr_rwlock); @@ -1000,9 +997,9 @@ check_with_pf: RT_UNLOCK(rin6.ro_rt); ia6 = NULL; /* address is not ready, so discard the packet. */ - nd6log((LOG_INFO, "%s: packet to an unready address %s->%s\n", + nd6log(error, "%s: packet to an unready address %s->%s\n", __func__, ip6_sprintf(&ip6->ip6_src), - ip6_sprintf(&ip6->ip6_dst))); + ip6_sprintf(&ip6->ip6_dst)); goto bad; } @@ -1679,9 +1676,9 @@ ip6_savecontrol_v4(struct inpcb *inp, struct mbuf *m, struct mbuf **mp, // Send ECN flags for v4-mapped addresses if ((inp->inp_flags & IN6P_TCLASS) != 0) { struct ip *ip_header = mtod(m, struct ip *); - u_int8_t tos = (ip_header->ip_tos & IPTOS_ECN_MASK); - mp = sbcreatecontrol_mbuf((caddr_t)&tos, sizeof(tos), + int tclass = (int)(ip_header->ip_tos); + mp = sbcreatecontrol_mbuf((caddr_t)&tclass, sizeof(tclass), IPV6_TCLASS, IPPROTO_IPV6, mp); if (*mp == NULL) { return NULL; diff --git a/bsd/netinet6/ip6_output.c b/bsd/netinet6/ip6_output.c index a468a93f6..b87f6ba7b 100644 --- a/bsd/netinet6/ip6_output.c +++ b/bsd/netinet6/ip6_output.c @@ -170,7 +170,7 @@ static int ip6_insert_jumboopt(struct ip6_exthdrs *, u_int32_t); static int ip6_insertfraghdr(struct mbuf *, struct mbuf *, int, struct ip6_frag **); static int ip6_getpmtu(struct route_in6 *, struct route_in6 *, - struct ifnet *, struct in6_addr *, u_int32_t *, boolean_t *); + struct ifnet *, struct in6_addr *, u_int32_t *); static int ip6_pcbopts(struct ip6_pktopts **, struct mbuf *, struct socket *, struct sockopt *sopt); static int ip6_pcbopt(int, u_char *, int, struct ip6_pktopts **, int); @@ -185,7 +185,7 @@ static void ip6_output_checksum(struct ifnet *, uint32_t, struct mbuf *, extern int udp_ctloutput(struct socket *, struct sockopt *); static int ip6_fragment_packet(struct mbuf **m, struct ip6_pktopts *opt, struct ip6_exthdrs *exthdrsp, struct ifnet *ifp, - uint32_t mtu, boolean_t alwaysfrag, uint32_t unfragpartlen, + uint32_t mtu, uint32_t unfragpartlen, struct route_in6 *ro_pmtu, int nxt0, uint32_t optlen); SYSCTL_DECL(_net_inet6_ip6); @@ -285,7 +285,6 @@ ip6_output_list(struct mbuf *m0, int packetchain, struct ip6_pktopts *opt, int error = 0; struct in6_ifaddr *ia = NULL, *src_ia = NULL; u_int32_t mtu = 0; - boolean_t alwaysfrag = FALSE; u_int32_t optlen = 0, plen = 0, unfragpartlen = 0; struct ip6_rthdr *rh; struct in6_addr finaldst; @@ -295,6 +294,11 @@ ip6_output_list(struct mbuf *m0, int packetchain, struct ip6_pktopts *opt, uint32_t pktcnt = 0; uint32_t packets_processed = 0; struct timeval start_tv; +#if PF + boolean_t skip_pf = (ip6oa != NULL) && + (ip6oa->ip6oa_flags & IP6OAF_SKIP_PF); +#endif + #if DUMMYNET struct m_tag *tag; struct ip6_out_args saved_ip6oa; @@ -399,7 +403,6 @@ ip6_output_list(struct mbuf *m0, int packetchain, struct ip6_pktopts *opt, ifnet_reference(origifp); } mtu = dn_tag->dn_mtu; - alwaysfrag = (dn_tag->dn_alwaysfrag != 0); unfragpartlen = dn_tag->dn_unfragpartlen; bcopy(&dn_tag->dn_exthdrs, &exthdrs, sizeof(exthdrs)); @@ -473,6 +476,9 @@ tags_done: if (ip6oa->ip6oa_flags & IP6OAF_NO_EXPENSIVE) { ipf_pktopts.ippo_flags |= IPPOF_NO_IFF_EXPENSIVE; } + if (ip6oa->ip6oa_flags & IP6OAF_NO_CONSTRAINED) { + ipf_pktopts.ippo_flags |= IPPOF_NO_IFF_CONSTRAINED; + } adv = &ip6oa->ip6oa_flowadv; adv->code = FADV_SUCCESS; ip6oa->ip6oa_retflags = 0; @@ -501,7 +507,7 @@ tags_done: * only needs to happen once per function entry. */ necp_matched_policy_id = necp_ip6_output_find_policy_match(m, flags, - (flags & IPV6_OUTARGS) ? ip6oa : NULL, &necp_result, + (flags & IPV6_OUTARGS) ? ip6oa : NULL, ro ? ro->ro_rt : NULL, &necp_result, &necp_result_parameter); #endif /* NECP */ @@ -576,6 +582,9 @@ loopit: case NECP_KERNEL_POLICY_RESULT_PASS: goto skip_ipsec; case NECP_KERNEL_POLICY_RESULT_DROP: + error = EHOSTUNREACH; + ip6stat.ip6s_necp_policy_drop++; + goto freehdrs; case NECP_KERNEL_POLICY_RESULT_SOCKET_DIVERT: /* * Flow divert packets should be blocked at the IP @@ -824,7 +833,11 @@ skip_ipsec: if (!TAILQ_EMPTY(&ipv6_filters) && !((flags & IPV6_OUTARGS) && - (ip6oa->ip6oa_flags & IP6OAF_INTCOPROC_ALLOWED))) { + (ip6oa->ip6oa_flags & IP6OAF_INTCOPROC_ALLOWED) +#if NECP + && !necp_packet_should_skip_filters(m) +#endif // NECP + )) { struct ipfilter *filter; int seen = (inject_filter_ref == NULL); int fixscope = 0; @@ -1354,8 +1367,7 @@ routefound: } /* Determine path MTU. */ - if ((error = ip6_getpmtu(ro_pmtu, ro, ifp, &finaldst, &mtu, - &alwaysfrag)) != 0) { + if ((error = ip6_getpmtu(ro_pmtu, ro, ifp, &finaldst, &mtu)) != 0) { goto bad; } @@ -1430,7 +1442,7 @@ routefound: check_with_pf: #endif /* DUMMYNET */ #if PF - if (PF_IS_ENABLED) { + if (PF_IS_ENABLED && !skip_pf) { #if DUMMYNET /* @@ -1448,7 +1460,6 @@ check_with_pf: args.fwa_ro6_pmtu = ro_pmtu; args.fwa_origifp = origifp; args.fwa_mtu = mtu; - args.fwa_alwaysfrag = alwaysfrag; args.fwa_unfragpartlen = unfragpartlen; args.fwa_exthdrs = &exthdrs; /* Invoke outbound packet filter */ @@ -1497,7 +1508,7 @@ check_with_pf: * is unchanged. */ error = ip6_fragment_packet(&m, opt, - &exthdrs, ifp, mtu, alwaysfrag, unfragpartlen, ro_pmtu, nxt0, + &exthdrs, ifp, mtu, unfragpartlen, ro_pmtu, nxt0, optlen); if (error) { @@ -1654,25 +1665,19 @@ bad: /* ip6_fragment_packet * * The fragmentation logic is rather complex: - * 1: normal case (dontfrag == 0, alwaysfrag == 0) + * 1: normal case (dontfrag == 0) * 1-a: send as is if tlen <= path mtu * 1-b: fragment if tlen > path mtu * * 2: if user asks us not to fragment (dontfrag == 1) * 2-a: send as is if tlen <= interface mtu * 2-b: error if tlen > interface mtu - * - * 3: if we always need to attach fragment header (alwaysfrag == 1) - * always fragment - * - * 4: if dontfrag == 1 && alwaysfrag == 1 - * error, as we cannot handle this conflicting request */ static int ip6_fragment_packet(struct mbuf **mptr, struct ip6_pktopts *opt, struct ip6_exthdrs *exthdrsp, struct ifnet *ifp, uint32_t mtu, - boolean_t alwaysfrag, uint32_t unfragpartlen, struct route_in6 *ro_pmtu, + uint32_t unfragpartlen, struct route_in6 *ro_pmtu, int nxt0, uint32_t optlen) { VERIFY(NULL != mptr); @@ -1695,11 +1700,6 @@ ip6_fragment_packet(struct mbuf **mptr, struct ip6_pktopts *opt, } } - if (dontfrag && alwaysfrag) { /* case 4 */ - /* conflicting request - can't transmit */ - return EMSGSIZE; - } - /* Access without acquiring nd_ifinfo lock for performance */ if (dontfrag && tlen > IN6_LINKMTU(ifp)) { /* case 2-b */ /* @@ -1723,9 +1723,9 @@ ip6_fragment_packet(struct mbuf **mptr, struct ip6_pktopts *opt, /* * transmit packet without fragmentation */ - if (dontfrag || (!alwaysfrag && /* case 1-a and 2-a */ + if (dontfrag || (tlen <= mtu || TSO_IPV6_OK(ifp, m) || - (ifp->if_hwassist & CSUM_FRAGMENT_IPV6)))) { + (ifp->if_hwassist & CSUM_FRAGMENT_IPV6))) { /* * mppn not updated in this case because no new chain is formed * and inserted @@ -1733,12 +1733,24 @@ ip6_fragment_packet(struct mbuf **mptr, struct ip6_pktopts *opt, ip6_output_checksum(ifp, mtu, m, nxt0, tlen, optlen); } else { /* - * time to fragment - cases 1-b and 3 are handled inside + * time to fragment - cases 1-b is handled inside * ip6_do_fragmentation(). * mppn is passed down to be updated to point at fragment chain. */ + u_int8_t *lexthdrsp; + + if (exthdrsp->ip6e_rthdr != NULL) { + lexthdrsp = mtod(exthdrsp->ip6e_rthdr, uint8_t *); + } else if (exthdrsp->ip6e_dest1 != NULL) { + lexthdrsp = mtod(exthdrsp->ip6e_dest1, uint8_t *); + } else if (exthdrsp->ip6e_hbh != NULL) { + lexthdrsp = mtod(exthdrsp->ip6e_hbh, uint8_t *); + } else { + lexthdrsp = NULL; + } error = ip6_do_fragmentation(mptr, optlen, ifp, - unfragpartlen, mtod(m, struct ip6_hdr *), exthdrsp, mtu, nxt0); + unfragpartlen, mtod(m, struct ip6_hdr *), lexthdrsp, mtu, + nxt0, htonl(ip6_randomid())); } return error; @@ -1749,11 +1761,19 @@ ip6_fragment_packet(struct mbuf **mptr, struct ip6_pktopts *opt, * the packet needs to be fragmented. on success, morig is freed and a chain * of fragments is linked into the packet chain where morig existed. Otherwise, * an errno is returned. + * optlen: total length of all extension headers (excludes the IPv6 header). + * unfragpartlen: length of the per-fragment headers which consist of the IPv6 + * header plus any extension headers that must be processed by nodes + * en route to the destination. + * lexthdrsp: pointer to the last extension header in the unfragmentable part + * or NULL. + * nxt0: upper-layer protocol number. + * id: Identification value to be used in the fragment header. */ int ip6_do_fragmentation(struct mbuf **mptr, uint32_t optlen, struct ifnet *ifp, - uint32_t unfragpartlen, struct ip6_hdr *ip6, struct ip6_exthdrs *exthdrsp, - uint32_t mtu, int nxt0) + uint32_t unfragpartlen, struct ip6_hdr *ip6, uint8_t *lexthdrsp, + uint32_t mtu, int nxt0, uint32_t id) { VERIFY(NULL != mptr); int error = 0; @@ -1764,9 +1784,7 @@ ip6_do_fragmentation(struct mbuf **mptr, uint32_t optlen, struct ifnet *ifp, size_t tlen = morig->m_pkthdr.len; - /* - * try to fragment the packet. case 1-b and 3 - */ + /* try to fragment the packet. case 1-b */ if ((morig->m_pkthdr.csum_flags & CSUM_TSO_IPV6)) { /* TSO and fragment aren't compatible */ in6_ifstat_inc(ifp, ifs6_out_fragfail); @@ -1783,7 +1801,6 @@ ip6_do_fragmentation(struct mbuf **mptr, uint32_t optlen, struct ifnet *ifp, size_t hlen, len, off; struct mbuf **mnext = NULL; struct ip6_frag *ip6f; - u_int32_t id = htonl(ip6_randomid()); u_char nextproto; /* @@ -1806,15 +1823,9 @@ ip6_do_fragmentation(struct mbuf **mptr, uint32_t optlen, struct ifnet *ifp, * Change the next header field of the last header in the * unfragmentable part. */ - if (exthdrsp->ip6e_rthdr != NULL) { - nextproto = *mtod(exthdrsp->ip6e_rthdr, u_char *); - *mtod(exthdrsp->ip6e_rthdr, u_char *) = IPPROTO_FRAGMENT; - } else if (exthdrsp->ip6e_dest1 != NULL) { - nextproto = *mtod(exthdrsp->ip6e_dest1, u_char *); - *mtod(exthdrsp->ip6e_dest1, u_char *) = IPPROTO_FRAGMENT; - } else if (exthdrsp->ip6e_hbh != NULL) { - nextproto = *mtod(exthdrsp->ip6e_hbh, u_char *); - *mtod(exthdrsp->ip6e_hbh, u_char *) = IPPROTO_FRAGMENT; + if (lexthdrsp != NULL) { + nextproto = *lexthdrsp; + *lexthdrsp = IPPROTO_FRAGMENT; } else { nextproto = ip6->ip6_nxt; ip6->ip6_nxt = IPPROTO_FRAGMENT; @@ -2257,17 +2268,11 @@ ip6_insertfraghdr(struct mbuf *m0, struct mbuf *m, int hlen, static int ip6_getpmtu(struct route_in6 *ro_pmtu, struct route_in6 *ro, - struct ifnet *ifp, struct in6_addr *dst, u_int32_t *mtup, - boolean_t *alwaysfragp) + struct ifnet *ifp, struct in6_addr *dst, u_int32_t *mtup) { u_int32_t mtu = 0; - boolean_t alwaysfrag = FALSE; int error = 0; - boolean_t is_local = FALSE; - if (IN6_IS_SCOPE_LINKLOCAL(dst)) { - is_local = TRUE; - } if (ro_pmtu != ro) { /* The first hop and the final destination may differ. */ @@ -2319,17 +2324,6 @@ ip6_getpmtu(struct route_in6 *ro_pmtu, struct route_in6 *ro, if (!(ro_pmtu->ro_rt->rt_rmx.rmx_locks & RTV_MTU)) { ro_pmtu->ro_rt->rt_rmx.rmx_mtu = mtu; /* XXX */ } - } else if (mtu < IPV6_MMTU) { - /* - * RFC2460 section 5, last paragraph: - * if we record ICMPv6 too big message with - * mtu < IPV6_MMTU, transmit packets sized IPV6_MMTU - * or smaller, with framgent header attached. - * (fragment header is needed regardless from the - * packet size, for translators to identify packets) - */ - alwaysfrag = TRUE; - mtu = IPV6_MMTU; } } else { if (ifp) { @@ -2341,9 +2335,6 @@ ip6_getpmtu(struct route_in6 *ro_pmtu, struct route_in6 *ro, } *mtup = mtu; - if ((alwaysfragp != NULL) && !is_local) { - *alwaysfragp = alwaysfrag; - } return error; } @@ -2361,6 +2352,7 @@ ip6_ctloutput(struct socket *so, struct sockopt *sopt) int level, op = -1, optname = 0; int optlen = 0; struct proc *p; + lck_mtx_t *mutex_held = NULL; VERIFY(sopt != NULL); @@ -2377,6 +2369,22 @@ ip6_ctloutput(struct socket *so, struct sockopt *sopt) boolean_t capture_exthdrstat_in = FALSE; switch (op) { case SOPT_SET: + mutex_held = socket_getlock(so, PR_F_WILLUNLOCK); + /* + * Wait if we are in the middle of ip6_output + * as we unlocked the socket there and don't + * want to overwrite the IP options + */ + if (in6p->inp_sndinprog_cnt > 0) { + in6p->inp_sndingprog_waiters++; + + while (in6p->inp_sndinprog_cnt > 0) { + msleep(&in6p->inp_sndinprog_cnt, mutex_held, + PSOCK | PCATCH, "inp_sndinprog_cnt", + NULL); + } + in6p->inp_sndingprog_waiters--; + } switch (optname) { case IPV6_2292PKTOPTIONS: { struct mbuf *m; @@ -2923,7 +2931,7 @@ ip6_ctloutput(struct socket *so, struct sockopt *sopt) * the outgoing interface. */ error = ip6_getpmtu(&sro, NULL, NULL, - &in6p->in6p_faddr, &pmtu, NULL); + &in6p->in6p_faddr, &pmtu); ROUTE_RELEASE(&sro); if (error) { break; @@ -3783,7 +3791,7 @@ ip6_setpktopt(int optname, u_char *buf, int len, struct ip6_pktopts *opt, return EINVAL; } hlimp = (int *)(void *)buf; - if (*hlimp < -1 || *hlimp > 255) { + if (*hlimp < -1 || *hlimp > IPV6_MAXHLIM) { return EINVAL; } diff --git a/bsd/netinet6/ip6_var.h b/bsd/netinet6/ip6_var.h index b0c1b84cb..3dca8411e 100644 --- a/bsd/netinet6/ip6_var.h +++ b/bsd/netinet6/ip6_var.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2018 Apple Inc. All rights reserved. + * Copyright (c) 2000-2019 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -448,6 +448,8 @@ struct ip6_out_args { #define IP6OAF_QOSMARKING_ALLOWED 0x00000080 /* policy allows Fastlane DSCP marking */ #define IP6OAF_INTCOPROC_ALLOWED 0x00000100 /* access to internal coproc interfaces */ #define IP6OAF_NO_LOW_POWER 0x00000200 /* skip low power */ +#define IP6OAF_NO_CONSTRAINED 0x00000400 /* skip IFXF_CONSTRAINED */ +#define IP6OAF_SKIP_PF 0x00000800 /* skip PF */ u_int32_t ip6oa_retflags; /* IP6OARF return flags (see below) */ #define IP6OARF_IFDENIED 0x00000001 /* denied access to interface */ int ip6oa_sotc; /* traffic class for Fastlane DSCP mapping */ @@ -556,7 +558,7 @@ extern struct ip6_pktopts *ip6_copypktopts(struct ip6_pktopts *, int); extern int ip6_optlen(struct inpcb *); extern void ip6_drain(void); extern int ip6_do_fragmentation(struct mbuf **, uint32_t, struct ifnet *, uint32_t, - struct ip6_hdr *, struct ip6_exthdrs *, uint32_t, int); + struct ip6_hdr *, uint8_t *, uint32_t, int, uint32_t); extern int route6_input(struct mbuf **, int *, int); diff --git a/bsd/netinet6/ip6protosw.h b/bsd/netinet6/ip6protosw.h index a9f8cefea..55d1bf799 100644 --- a/bsd/netinet6/ip6protosw.h +++ b/bsd/netinet6/ip6protosw.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2008-2013 Apple Inc. All rights reserved. + * Copyright (c) 2008-2019 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -194,6 +194,12 @@ struct ip6protosw { */ TAILQ_HEAD(, socket_filter) pr_filter_head; struct protosw_old *pr_old; + + void (*pr_update_last_owner) /* update last socket owner) */ + (struct socket *so, struct proc *p, struct proc *ep); + + void (*pr_copy_last_owner) /* copy last socket from listener */ + (struct socket *so, struct socket *head); }; #endif /* BSD_KERNEL_PRIVATE */ #endif /* _NETINET6_IP6PROTOSW_H_ */ diff --git a/bsd/netinet6/ipcomp.h b/bsd/netinet6/ipcomp.h index 991764ea0..ad996afcd 100644 --- a/bsd/netinet6/ipcomp.h +++ b/bsd/netinet6/ipcomp.h @@ -53,18 +53,4 @@ struct ipcomp { #define IPCOMP_CPI_NEGOTIATE_MIN 256 -#ifdef BSD_KERNEL_PRIVATE -struct ipcomp_algorithm { - int (*compress)(struct mbuf *, struct mbuf *, size_t *); - int (*decompress)(struct mbuf *, struct mbuf *, size_t *); - size_t minplen; /* minimum required length for compression */ -}; - -struct ipsecrequest; -extern void ipcomp_init(struct protosw *, struct domain *); -extern const struct ipcomp_algorithm *ipcomp_algorithm_lookup(int); -extern void ipcomp4_input(struct mbuf *, int); -extern int ipcomp4_output(struct mbuf *, struct secasvar *); -#endif /* BSD_KERNEL_PRIVATE */ - #endif /* _NETINET6_IPCOMP_H_ */ diff --git a/bsd/netinet6/ipcomp6.h b/bsd/netinet6/ipcomp6.h deleted file mode 100644 index 670307044..000000000 --- a/bsd/netinet6/ipcomp6.h +++ /dev/null @@ -1,48 +0,0 @@ -/* $FreeBSD: src/sys/netinet6/ipcomp6.h,v 1.1.2.2 2001/07/03 11:01:54 ume Exp $ */ -/* $KAME: ipcomp.h,v 1.8 2000/09/26 07:55:14 itojun Exp $ */ - -/* - * Copyright (C) 1999 WIDE Project. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. Neither the name of the project nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - */ - -/* - * RFC2393 IP payload compression protocol (IPComp). - */ - -#ifndef _NETINET6_IPCOMP6_H_ -#define _NETINET6_IPCOMP6_H_ -#include -#include - -#ifdef BSD_KERNEL_PRIVATE -extern int ipcomp6_input(struct mbuf **, int *, int); -extern int ipcomp6_output(struct mbuf *, u_char *, struct mbuf *, - struct secasvar *); -#endif /* BSD_KERNEL_PRIVATE */ - -#endif /*_NETINET6_IPCOMP6_H_*/ diff --git a/bsd/netinet6/ipcomp_core.c b/bsd/netinet6/ipcomp_core.c deleted file mode 100644 index ef4fc2d20..000000000 --- a/bsd/netinet6/ipcomp_core.c +++ /dev/null @@ -1,480 +0,0 @@ -/* - * Copyright (c) 2016 Apple Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ - -/* $FreeBSD: src/sys/netinet6/ipcomp_core.c,v 1.1.2.2 2001/07/03 11:01:54 ume Exp $ */ -/* $KAME: ipcomp_core.c,v 1.24 2000/10/23 04:24:22 itojun Exp $ */ - -/* - * Copyright (C) 1999 WIDE Project. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. Neither the name of the project nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - */ - -/* - * RFC2393 IP payload compression protocol (IPComp). - */ - - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#if IPCOMP_ZLIB -#include -#endif -#include - -#include -#if INET6 -#include -#endif -#include -#if INET6 -#include -#endif - -#include - -#if IPCOMP_ZLIB -static void *deflate_alloc(void *, u_int, u_int); -static void deflate_free(void *, void *); -static int deflate_common(struct mbuf *, struct mbuf *, size_t *, int); -static int deflate_compress(struct mbuf *, struct mbuf *, size_t *); -static int deflate_decompress(struct mbuf *, struct mbuf *, size_t *); - -/* - * We need to use default window size (2^15 = 32Kbytes as of writing) for - * inbound case. Otherwise we get interop problem. - * Use negative value to avoid Adler32 checksum. This is an undocumented - * feature in zlib (see ipsec wg mailing list archive in January 2000). - */ -static int deflate_policy = Z_DEFAULT_COMPRESSION; -static int deflate_window_out = -12; -static const int deflate_window_in = -1 * MAX_WBITS; /* don't change it */ -static int deflate_memlevel = MAX_MEM_LEVEL; - -static z_stream deflate_stream; -static z_stream inflate_stream; -#endif /* IPCOMP_ZLIB */ - -#if IPCOMP_ZLIB -static const struct ipcomp_algorithm ipcomp_algorithms[] = { - { deflate_compress, deflate_decompress, 90 }, -}; -#else -static const struct ipcomp_algorithm ipcomp_algorithms[] __unused = {}; -#endif - -decl_lck_mtx_data(static, ipcomp_mutex_data); -static lck_mtx_t *ipcomp_mutex = &ipcomp_mutex_data; - -void -ipcomp_init(struct protosw *pp, struct domain *dp) -{ -#pragma unused(dp) - static int ipcomp_initialized = 0; - lck_grp_attr_t *ipcomp_mutex_grp_attr = NULL; - lck_attr_t *ipcomp_mutex_attr = NULL; - lck_grp_t *ipcomp_mutex_grp = NULL; - - VERIFY((pp->pr_flags & (PR_INITIALIZED | PR_ATTACHED)) == PR_ATTACHED); - - if (ipcomp_initialized) { - return; - } - - ipcomp_mutex_grp_attr = lck_grp_attr_alloc_init(); - ipcomp_mutex_grp = lck_grp_alloc_init("ipcomp", ipcomp_mutex_grp_attr); - lck_grp_attr_free(ipcomp_mutex_grp_attr); - - ipcomp_mutex_attr = lck_attr_alloc_init(); - lck_mtx_init(ipcomp_mutex, ipcomp_mutex_grp, ipcomp_mutex_attr); - lck_grp_free(ipcomp_mutex_grp); - lck_attr_free(ipcomp_mutex_attr); - - ipcomp_initialized = 1; -} - -const struct ipcomp_algorithm * -ipcomp_algorithm_lookup( -#if IPCOMP_ZLIB - int idx -#else - __unused int idx -#endif - ) -{ -#if IPCOMP_ZLIB - if (idx == SADB_X_CALG_DEFLATE) { - /* - * Avert your gaze, ugly hack follows! - * We init here so our malloc can allocate using M_WAIT. - * We don't want to allocate if ipcomp isn't used, and we - * don't want to allocate on the input or output path. - * Allocation fails if we use M_NOWAIT because init allocates - * something like 256k (ouch). - */ - if (deflate_stream.zalloc == NULL) { - deflate_stream.zalloc = deflate_alloc; - deflate_stream.zfree = deflate_free; - if (deflateInit2(&deflate_stream, deflate_policy, Z_DEFLATED, - deflate_window_out, deflate_memlevel, Z_DEFAULT_STRATEGY)) { - /* Allocation failed */ - bzero(&deflate_stream, sizeof(deflate_stream)); -#if IPSEC_DEBUG - printf("ipcomp_algorithm_lookup: deflateInit2 failed.\n"); -#endif - } - } - - if (inflate_stream.zalloc == NULL) { - inflate_stream.zalloc = deflate_alloc; - inflate_stream.zfree = deflate_free; - if (inflateInit2(&inflate_stream, deflate_window_in)) { - /* Allocation failed */ - bzero(&inflate_stream, sizeof(inflate_stream)); -#if IPSEC_DEBUG - printf("ipcomp_algorithm_lookup: inflateInit2 failed.\n"); -#endif - } - } - - return &ipcomp_algorithms[0]; - } -#endif /* IPCOMP_ZLIB */ - return NULL; -} - -#if IPCOMP_ZLIB -static void * -deflate_alloc( - __unused void *aux, - u_int items, - u_int siz) -{ - void *ptr; - ptr = _MALLOC(items * siz, M_TEMP, M_NOWAIT); - return ptr; -} - -static void -deflate_free( - __unused void *aux, - void *ptr) -{ - FREE(ptr, M_TEMP); -} - -/* @param mode 0: compress 1: decompress */ -static int -deflate_common(struct mbuf *m, struct mbuf *md, size_t *lenp, int mode) -{ - struct mbuf *mprev; - struct mbuf *p; - struct mbuf *n = NULL, *n0 = NULL, **np; - z_stream *zs; - int error = 0; - int zerror; - size_t offset; - -#define MOREBLOCK() \ -do { \ - /* keep the reply buffer into our chain */ \ - if (n) { \ - n->m_len = zs->total_out - offset; \ - offset = zs->total_out; \ - *np = n; \ - np = &n->m_next; \ - n = NULL; \ - } \ - \ - /* get a fresh reply buffer */ \ - MGET(n, M_DONTWAIT, MT_DATA); \ - if (n) { \ - MCLGET(n, M_DONTWAIT); \ - } \ - if (!n) { \ - error = ENOBUFS; \ - goto fail; \ - } \ - n->m_len = 0; \ - n->m_len = M_TRAILINGSPACE(n); \ - n->m_next = NULL; \ - /* \ - * if this is the first reply buffer, reserve \ - * region for ipcomp header. \ - */ \ - if (*np == NULL) { \ - n->m_len -= sizeof(struct ipcomp); \ - n->m_data += sizeof(struct ipcomp); \ - } \ - \ - zs->next_out = mtod(n, u_int8_t *); \ - zs->avail_out = n->m_len; \ -} while (0) - - for (mprev = m; mprev && mprev->m_next != md; mprev = mprev->m_next) { - ; - } - if (!mprev) { - panic("md is not in m in deflate_common"); - } - - - lck_mtx_lock(ipcomp_mutex); - zs = mode ? &inflate_stream : &deflate_stream; - if (zs->zalloc == NULL) { - /* - * init is called in ipcomp_algorithm_lookup. - * if zs->zalloc is NULL, either init hasn't been called (unlikely) - * or init failed because of no memory. - */ - error = ENOBUFS; - goto fail; - } - - zs->next_in = 0; - zs->avail_in = 0; - zs->next_out = 0; - zs->avail_out = 0; - - n0 = n = NULL; - np = &n0; - offset = 0; - zerror = 0; - p = md; - while (p && p->m_len == 0) { - p = p->m_next; - } - - /* input stream and output stream are available */ - while (p && zs->avail_in == 0) { - /* get input buffer */ - if (p && zs->avail_in == 0) { - zs->next_in = mtod(p, u_int8_t *); - zs->avail_in = p->m_len; - p = p->m_next; - while (p && p->m_len == 0) { - p = p->m_next; - } - } - - /* get output buffer */ - if (zs->next_out == NULL || zs->avail_out == 0) { - MOREBLOCK(); - } - - zerror = mode ? inflate(zs, Z_NO_FLUSH) - : deflate(zs, Z_NO_FLUSH); - - if (zerror == Z_STREAM_END) { - ; /*once more.*/ - } else if (zerror == Z_OK) { - /* inflate: Z_OK can indicate the end of decode */ - if (mode && !p && zs->avail_out != 0) { - goto terminate; - } - - /* else once more.*/ - } else { - if (zs->msg) { - ipseclog((LOG_ERR, "ipcomp_%scompress: " - "%sflate(Z_NO_FLUSH): %s\n", - mode ? "de" : "", mode ? "in" : "de", - zs->msg)); - } else { - ipseclog((LOG_ERR, "ipcomp_%scompress: " - "%sflate(Z_NO_FLUSH): unknown error (%d)\n", - mode ? "de" : "", mode ? "in" : "de", - zerror)); - } - mode ? inflateReset(zs) : deflateReset(zs); -/* mode ? inflateEnd(zs) : deflateEnd(zs);*/ - error = EINVAL; - goto fail; - } - } - - if (zerror == Z_STREAM_END) { - goto terminate; - } - - /* termination */ - while (1) { - /* get output buffer */ - if (zs->next_out == NULL || zs->avail_out == 0) { - MOREBLOCK(); - } - - zerror = mode ? inflate(zs, Z_FINISH) - : deflate(zs, Z_FINISH); - - if (zerror == Z_STREAM_END) { - break; - } else if (zerror == Z_OK) { - ; /*once more.*/ - } else { - if (zs->msg) { - ipseclog((LOG_ERR, "ipcomp_%scompress: " - "%sflate(Z_FINISH): %s\n", - mode ? "de" : "", mode ? "in" : "de", - zs->msg)); - } else { - ipseclog((LOG_ERR, "ipcomp_%scompress: " - "%sflate(Z_FINISH): unknown error (%d)\n", - mode ? "de" : "", mode ? "in" : "de", - zerror)); - } - mode ? inflateReset(zs) : deflateReset(zs); -/* mode ? inflateEnd(zs) : deflateEnd(zs); */ - error = EINVAL; - goto fail; - } - } - -terminate: - /* keep the final reply buffer into our chain */ - if (n) { - n->m_len = zs->total_out - offset; - offset = zs->total_out; - *np = n; - np = &n->m_next; - n = NULL; - } - - /* switch the mbuf to the new one */ - mprev->m_next = n0; - m_freem(md); - *lenp = zs->total_out; - - /* reset the inflate/deflate state */ - zerror = mode ? inflateReset(zs) : deflateReset(zs); - if (zerror != Z_OK) { - /* - * A failure here is uncommon. If this does - * fail, the packet can still be used but - * the z_stream will be messed up so subsequent - * inflates/deflates will probably fail. - */ - if (zs->msg) { - ipseclog((LOG_ERR, "ipcomp_%scompress: " - "%sflateEnd: %s\n", - mode ? "de" : "", mode ? "in" : "de", - zs->msg)); - } else { - ipseclog((LOG_ERR, "ipcomp_%scompress: " - "%sflateEnd: unknown error (%d)\n", - mode ? "de" : "", mode ? "in" : "de", - zerror)); - } - } - - lck_mtx_unlock(ipcomp_mutex); - return 0; - -fail: - lck_mtx_unlock(ipcomp_mutex); - if (m) { - m_freem(m); - } - if (n) { - m_freem(n); - } - if (n0) { - m_freem(n0); - } - return error; -#undef MOREBLOCK -} - -static int -deflate_compress(struct mbuf *m, struct mbuf *md, size_t *lenp) -{ - if (!m) { - panic("m == NULL in deflate_compress"); - } - if (!md) { - panic("md == NULL in deflate_compress"); - } - if (!lenp) { - panic("lenp == NULL in deflate_compress"); - } - - return deflate_common(m, md, lenp, 0); -} - -static int -deflate_decompress(struct mbuf *m, struct mbuf *md, size_t *lenp) -{ - if (!m) { - panic("m == NULL in deflate_decompress"); - } - if (!md) { - panic("md == NULL in deflate_decompress"); - } - if (!lenp) { - panic("lenp == NULL in deflate_decompress"); - } - - return deflate_common(m, md, lenp, 1); -} -#endif /* IPCOMP_ZLIB */ diff --git a/bsd/netinet6/ipcomp_input.c b/bsd/netinet6/ipcomp_input.c deleted file mode 100644 index a50e11d2a..000000000 --- a/bsd/netinet6/ipcomp_input.c +++ /dev/null @@ -1,362 +0,0 @@ -/* $FreeBSD: src/sys/netinet6/ipcomp_input.c,v 1.1.2.2 2001/07/03 11:01:54 ume Exp $ */ -/* $KAME: ipcomp_input.c,v 1.25 2001/03/01 09:12:09 itojun Exp $ */ - -/* - * Copyright (C) 1999 WIDE Project. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. Neither the name of the project nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - */ - -/* - * RFC2393 IP payload compression protocol (IPComp). - */ - - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include - -#if INET6 -#include -#include -#endif -#include -#if INET6 -#include -#endif - -#include -#if INET6 -#include -#endif -#include -#include - -#include -#include - -#define IPLEN_FLIPPED - -void -ipcomp4_input(struct mbuf *m, int off) -{ - struct mbuf *md; - struct ip *ip; - struct ipcomp *ipcomp; - const struct ipcomp_algorithm *algo; - u_int16_t cpi; /* host order */ - u_int16_t nxt; - size_t hlen; - int error; - size_t newlen, olen; - struct secasvar *sav = NULL; - - if (m->m_pkthdr.len < off + sizeof(struct ipcomp)) { - ipseclog((LOG_DEBUG, "IPv4 IPComp input: assumption failed " - "(packet too short)\n")); - IPSEC_STAT_INCREMENT(ipsecstat.in_inval); - goto fail; - } - - md = m_pulldown(m, off, sizeof(*ipcomp), NULL); - if (!md) { - m = NULL; /*already freed*/ - ipseclog((LOG_DEBUG, "IPv4 IPComp input: assumption failed " - "(pulldown failure)\n")); - IPSEC_STAT_INCREMENT(ipsecstat.in_inval); - goto fail; - } - ipcomp = mtod(md, struct ipcomp *); - - /* Expect 32-bit aligned data pointer on strict-align platforms */ - MBUF_STRICT_DATA_ALIGNMENT_CHECK_32(m); - - ip = mtod(m, struct ip *); - nxt = ipcomp->comp_nxt; -#ifdef _IP_VHL - hlen = IP_VHL_HL(ip->ip_vhl) << 2; -#else - hlen = ip->ip_hl << 2; -#endif - - cpi = ntohs(ipcomp->comp_cpi); - - if (cpi >= IPCOMP_CPI_NEGOTIATE_MIN) { - sav = key_allocsa(AF_INET, (caddr_t)&ip->ip_src, - (caddr_t)&ip->ip_dst, IPPROTO_IPCOMP, htonl(cpi)); - if (sav != NULL - && (sav->state == SADB_SASTATE_MATURE - || sav->state == SADB_SASTATE_DYING)) { - cpi = sav->alg_enc; /*XXX*/ - /* other parameters to look at? */ - } - } - algo = ipcomp_algorithm_lookup(cpi); - if (!algo) { - ipseclog((LOG_WARNING, "IPv4 IPComp input: unknown cpi %u\n", - cpi)); - IPSEC_STAT_INCREMENT(ipsecstat.in_nosa); - goto fail; - } - - /* chop ipcomp header */ - ipcomp = NULL; - md->m_data += sizeof(struct ipcomp); - md->m_len -= sizeof(struct ipcomp); - m->m_pkthdr.len -= sizeof(struct ipcomp); -#ifdef IPLEN_FLIPPED - ip->ip_len -= sizeof(struct ipcomp); -#else - ip->ip_len = htons(ntohs(ip->ip_len) - sizeof(struct ipcomp)); -#endif - - olen = m->m_pkthdr.len; - newlen = m->m_pkthdr.len - off; - error = (*algo->decompress)(m, m->m_next, &newlen); - if (error != 0) { - if (error == EINVAL) { - IPSEC_STAT_INCREMENT(ipsecstat.in_inval); - } else if (error == ENOBUFS) { - IPSEC_STAT_INCREMENT(ipsecstat.in_nomem); - } - m = NULL; - goto fail; - } - IPSEC_STAT_INCREMENT(ipsecstat.in_comphist[cpi]); - - /* - * returning decompressed packet onto icmp is meaningless. - * mark it decrypted to prevent icmp from attaching original packet. - */ - m->m_flags |= M_DECRYPTED; - - m->m_pkthdr.len = off + newlen; - ip = mtod(m, struct ip *); - { - size_t len; -#ifdef IPLEN_FLIPPED - len = ip->ip_len; -#else - len = ntohs(ip->ip_len); -#endif - /* - * be careful about underflow. also, do not assign exact value - * as ip_len is manipulated differently on *BSDs. - */ - len += m->m_pkthdr.len; - len -= olen; - if (len & ~0xffff) { - /* packet too big after decompress */ - IPSEC_STAT_INCREMENT(ipsecstat.in_inval); - goto fail; - } -#ifdef IPLEN_FLIPPED - ip->ip_len = len & 0xffff; -#else - ip->ip_len = htons(len & 0xffff); -#endif - ip->ip_p = nxt; - } - - if (sav) { - key_sa_recordxfer(sav, m); - if (ipsec_addhist(m, IPPROTO_IPCOMP, (u_int32_t)cpi) != 0) { - IPSEC_STAT_INCREMENT(ipsecstat.in_nomem); - goto fail; - } - key_freesav(sav, KEY_SADB_UNLOCKED); - sav = NULL; - } - - if (nxt != IPPROTO_DONE) { - if ((ip_protox[nxt]->pr_flags & PR_LASTHDR) != 0 && - ipsec4_in_reject(m, NULL)) { - IPSEC_STAT_INCREMENT(ipsecstat.in_polvio); - goto fail; - } - - DTRACE_IP6(receive, struct mbuf *, m, struct inpcb *, NULL, - struct ip *, ip, struct ifnet *, m->m_pkthdr.rcvif, - struct ip *, ip, struct ip6_hdr *, NULL); - - ip_proto_dispatch_in(m, off, nxt, 0); - } else { - m_freem(m); - } - m = NULL; - - IPSEC_STAT_INCREMENT(ipsecstat.in_success); - return; - -fail: - if (sav) { - key_freesav(sav, KEY_SADB_UNLOCKED); - } - - if (m) { - m_freem(m); - } - return; -} - -#if INET6 -int -ipcomp6_input(struct mbuf **mp, int *offp, int proto) -{ -#pragma unused(proto) - struct mbuf *m, *md; - int off; - struct ip6_hdr *ip6; - struct ipcomp *ipcomp; - const struct ipcomp_algorithm *algo; - u_int16_t cpi; /* host order */ - u_int16_t nxt; - int error; - size_t newlen; - struct secasvar *sav = NULL; - char *prvnxtp; - - m = *mp; - off = *offp; - - md = m_pulldown(m, off, sizeof(*ipcomp), NULL); - if (!md) { - m = NULL; /*already freed*/ - ipseclog((LOG_DEBUG, "IPv6 IPComp input: assumption failed " - "(pulldown failure)\n")); - IPSEC_STAT_INCREMENT(ipsec6stat.in_inval); - goto fail; - } - ipcomp = mtod(md, struct ipcomp *); - - /* Expect 32-bit aligned data pointer on strict-align platforms */ - MBUF_STRICT_DATA_ALIGNMENT_CHECK_32(m); - - ip6 = mtod(m, struct ip6_hdr *); - nxt = ipcomp->comp_nxt; - - cpi = ntohs(ipcomp->comp_cpi); - - if (cpi >= IPCOMP_CPI_NEGOTIATE_MIN) { - sav = key_allocsa(AF_INET6, (caddr_t)&ip6->ip6_src, - (caddr_t)&ip6->ip6_dst, IPPROTO_IPCOMP, htonl(cpi)); - if (sav != NULL - && (sav->state == SADB_SASTATE_MATURE - || sav->state == SADB_SASTATE_DYING)) { - cpi = sav->alg_enc; /*XXX*/ - /* other parameters to look at? */ - } - } - algo = ipcomp_algorithm_lookup(cpi); - if (!algo) { - ipseclog((LOG_WARNING, "IPv6 IPComp input: unknown cpi %u; " - "dropping the packet for simplicity\n", cpi)); - IPSEC_STAT_INCREMENT(ipsec6stat.in_nosa); - goto fail; - } - - /* chop ipcomp header */ - ipcomp = NULL; - md->m_data += sizeof(struct ipcomp); - md->m_len -= sizeof(struct ipcomp); - m->m_pkthdr.len -= sizeof(struct ipcomp); - - newlen = m->m_pkthdr.len - off; - error = (*algo->decompress)(m, md, &newlen); - if (error != 0) { - if (error == EINVAL) { - IPSEC_STAT_INCREMENT(ipsec6stat.in_inval); - } else if (error == ENOBUFS) { - IPSEC_STAT_INCREMENT(ipsec6stat.in_nomem); - } - m = NULL; - goto fail; - } - IPSEC_STAT_INCREMENT(ipsec6stat.in_comphist[cpi]); - m->m_pkthdr.len = off + newlen; - - /* - * returning decompressed packet onto icmp is meaningless. - * mark it decrypted to prevent icmp from attaching original packet. - */ - m->m_flags |= M_DECRYPTED; - - /* update next header field */ - prvnxtp = ip6_get_prevhdr(m, off); - *prvnxtp = nxt; - - /* - * no need to adjust payload length, as all the IPv6 protocols - * look at m->m_pkthdr.len - */ - - if (sav) { - key_sa_recordxfer(sav, m); - if (ipsec_addhist(m, IPPROTO_IPCOMP, (u_int32_t)cpi) != 0) { - IPSEC_STAT_INCREMENT(ipsec6stat.in_nomem); - goto fail; - } - key_freesav(sav, KEY_SADB_UNLOCKED); - sav = NULL; - } - *offp = off; - *mp = m; - IPSEC_STAT_INCREMENT(ipsec6stat.in_success); - return nxt; - -fail: - if (m) { - m_freem(m); - } - if (sav) { - key_freesav(sav, KEY_SADB_UNLOCKED); - } - return IPPROTO_DONE; -} -#endif /* INET6 */ diff --git a/bsd/netinet6/ipcomp_output.c b/bsd/netinet6/ipcomp_output.c deleted file mode 100644 index 6abaedef6..000000000 --- a/bsd/netinet6/ipcomp_output.c +++ /dev/null @@ -1,406 +0,0 @@ -/* - * Copyright (c) 2016 Apple Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ - -/* $FreeBSD: src/sys/netinet6/ipcomp_output.c,v 1.1.2.2 2001/07/03 11:01:54 ume Exp $ */ -/* $KAME: ipcomp_output.c,v 1.23 2001/01/23 08:59:37 itojun Exp $ */ - -/* - * Copyright (C) 1999 WIDE Project. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. Neither the name of the project nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - */ - -/* - * RFC2393 IP payload compression protocol (IPComp). - */ - - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include - -#if INET6 -#include -#include -#endif -#include -#if INET6 -#include -#endif - -#include -#if INET6 -#include -#endif -#include -#include - -#include - - -static int ipcomp_output(struct mbuf *, u_char *, struct mbuf *, - int, struct secasvar *sav); - -/* - * Modify the packet so that the payload is compressed. - * The mbuf (m) must start with IPv4 or IPv6 header. - * On failure, free the given mbuf and return non-zero. - * - * on invocation: - * m nexthdrp md - * v v v - * IP ......... payload - * during the encryption: - * m nexthdrp mprev md - * v v v v - * IP ............... ipcomp payload - * <-----><-----> - * complen plen - * <-> hlen - * <-----------------> compoff - */ -static int -ipcomp_output(struct mbuf *m, u_char *nexthdrp, struct mbuf *md, int af, struct secasvar *sav) -{ - struct mbuf *n; - struct mbuf *md0; - struct mbuf *mcopy; - struct mbuf *mprev; - struct ipcomp *ipcomp; - const struct ipcomp_algorithm *algo; - u_int16_t cpi; /* host order */ - size_t plen0, plen; /*payload length to be compressed*/ - size_t compoff; - int afnumber; - int error = 0; - struct ipsecstat *stat; - - switch (af) { -#if INET - case AF_INET: - afnumber = 4; - stat = &ipsecstat; - break; -#endif -#if INET6 - case AF_INET6: - afnumber = 6; - stat = &ipsec6stat; - break; -#endif - default: - ipseclog((LOG_ERR, "ipcomp_output: unsupported af %d\n", af)); - return 0; /* no change at all */ - } - - /* grab parameters */ - algo = ipcomp_algorithm_lookup(sav->alg_enc); - if ((ntohl(sav->spi) & ~0xffff) != 0 || !algo) { - IPSEC_STAT_INCREMENT(stat->out_inval); - m_freem(m); - return EINVAL; - } - if ((sav->flags & SADB_X_EXT_RAWCPI) == 0) { - cpi = sav->alg_enc; - } else { - cpi = ntohl(sav->spi) & 0xffff; - } - - /* compute original payload length */ - plen = 0; - for (n = md; n; n = n->m_next) { - plen += n->m_len; - } - - /* if the payload is short enough, we don't need to compress */ - if (plen < algo->minplen) { - return 0; - } - - /* - * retain the original packet for two purposes: - * (1) we need to backout our changes when compression is not necessary. - * (2) byte lifetime computation should use the original packet. - * see RFC2401 page 23. - * compromise two m_copym(). we will be going through every byte of - * the payload during compression process anyways. - */ - mcopy = m_copym(m, 0, M_COPYALL, M_NOWAIT); - if (mcopy == NULL) { - error = ENOBUFS; - return 0; - } - md0 = m_copym(md, 0, M_COPYALL, M_NOWAIT); - if (md0 == NULL) { - m_freem(mcopy); - error = ENOBUFS; - return 0; - } - plen0 = plen; - - /* make the packet over-writable */ - for (mprev = m; mprev && mprev->m_next != md; mprev = mprev->m_next) { - ; - } - if (mprev == NULL || mprev->m_next != md) { - ipseclog((LOG_DEBUG, "ipcomp%d_output: md is not in chain\n", - afnumber)); - IPSEC_STAT_INCREMENT(stat->out_inval); - m_freem(m); - m_freem(md0); - m_freem(mcopy); - return EINVAL; - } - mprev->m_next = NULL; - if ((md = ipsec_copypkt(md)) == NULL) { - m_freem(m); - m_freem(md0); - m_freem(mcopy); - error = ENOBUFS; - goto fail; - } - mprev->m_next = md; - - /* compress data part */ - if ((*algo->compress)(m, md, &plen) || mprev->m_next == NULL) { - ipseclog((LOG_ERR, "packet compression failure\n")); - m = NULL; - m_freem(md0); - m_freem(mcopy); - IPSEC_STAT_INCREMENT(stat->out_inval); - error = EINVAL; - goto fail; - } - IPSEC_STAT_INCREMENT(stat->out_comphist[sav->alg_enc]); - md = mprev->m_next; - - /* - * if the packet became bigger, meaningless to use IPComp. - * we've only wasted our cpu time. - */ - if (plen0 < plen) { - m_freem(md); - m_freem(mcopy); - mprev->m_next = md0; - return 0; - } - - /* - * no need to backout change beyond here. - */ - m_freem(md0); - md0 = NULL; - - m->m_pkthdr.len -= plen0; - m->m_pkthdr.len += plen; - - { - /* - * insert IPComp header. - */ -#if INET - struct ip *ip = NULL; -#endif -#if INET6 - struct ip6_hdr *ip6 = NULL; -#endif - size_t hlen = 0; /*ip header len*/ - size_t complen = sizeof(struct ipcomp); - - switch (af) { -#if INET - case AF_INET: - ip = mtod(m, struct ip *); -#ifdef _IP_VHL - hlen = IP_VHL_HL(ip->ip_vhl) << 2; -#else - hlen = ip->ip_hl << 2; -#endif - break; -#endif -#if INET6 - case AF_INET6: - ip6 = mtod(m, struct ip6_hdr *); - hlen = sizeof(*ip6); - break; -#endif - } - - compoff = m->m_pkthdr.len - plen; - - /* - * grow the mbuf to accomodate ipcomp header. - * before: IP ... payload - * after: IP ... ipcomp payload - */ - if (M_LEADINGSPACE(md) < complen) { - MGET(n, M_DONTWAIT, MT_DATA); - if (!n) { - m_freem(m); - error = ENOBUFS; - goto fail; - } - n->m_len = complen; - mprev->m_next = n; - n->m_next = md; - m->m_pkthdr.len += complen; - ipcomp = mtod(n, struct ipcomp *); - } else { - md->m_len += complen; - md->m_data -= complen; - m->m_pkthdr.len += complen; - ipcomp = mtod(md, struct ipcomp *); - } - - bzero(ipcomp, sizeof(*ipcomp)); - ipcomp->comp_nxt = *nexthdrp; - *nexthdrp = IPPROTO_IPCOMP; - ipcomp->comp_cpi = htons(cpi); - switch (af) { -#if INET - case AF_INET: - if (compoff + complen + plen < IP_MAXPACKET) { - ip->ip_len = htons(compoff + complen + plen); - } else { - ipseclog((LOG_ERR, - "IPv4 ESP output: size exceeds limit\n")); - IPSEC_STAT_INCREMENT(ipsecstat.out_inval); - m_freem(m); - error = EMSGSIZE; - goto fail; - } - break; -#endif -#if INET6 - case AF_INET6: - /* total packet length will be computed in ip6_output() */ - break; -#endif - } - } - - if (!m) { - ipseclog((LOG_DEBUG, - "NULL mbuf after compression in ipcomp%d_output", - afnumber)); - IPSEC_STAT_INCREMENT(stat->out_inval); - } - IPSEC_STAT_INCREMENT(stat->out_success); - - /* compute byte lifetime against original packet */ - key_sa_recordxfer(sav, mcopy); - m_freem(mcopy); - - return 0; - -fail: -#if 1 - return error; -#else - panic("something bad in ipcomp_output"); -#endif -} - -#if INET -int -ipcomp4_output(struct mbuf *m, struct secasvar *sav) -{ - struct ip *ip; - if (m->m_len < sizeof(struct ip)) { - ipseclog((LOG_DEBUG, "ipcomp4_output: first mbuf too short\n")); - IPSEC_STAT_INCREMENT(ipsecstat.out_inval); - m_freem(m); - return 0; - } - ip = mtod(m, struct ip *); - /* XXX assumes that m->m_next points to payload */ - return ipcomp_output(m, &ip->ip_p, m->m_next, AF_INET, sav); -} -#endif /*INET*/ - -#if INET6 -int -ipcomp6_output( - struct mbuf *m, - u_char *nexthdrp, - struct mbuf *md, - struct secasvar *sav) -{ - if (m->m_len < sizeof(struct ip6_hdr)) { - ipseclog((LOG_DEBUG, "ipcomp6_output: first mbuf too short\n")); - IPSEC_STAT_INCREMENT(ipsec6stat.out_inval); - m_freem(m); - return 0; - } - return ipcomp_output(m, nexthdrp, md, AF_INET6, sav); -} -#endif /*INET6*/ diff --git a/bsd/netinet6/ipsec.c b/bsd/netinet6/ipsec.c index 32683adc3..671a6a64f 100644 --- a/bsd/netinet6/ipsec.c +++ b/bsd/netinet6/ipsec.c @@ -76,6 +76,7 @@ #include #include #include +#include #include #include #include @@ -121,16 +122,14 @@ #include #endif #endif -#include -#if INET6 -#include -#endif #include #include #include #include +#include + #if IPSEC_DEBUG int ipsec_debug = 1; #else @@ -164,6 +163,9 @@ extern u_int64_t natt_now; struct ipsec_tag; +void *sleep_wake_handle = NULL; +bool ipsec_save_wake_pkt = false; + SYSCTL_DECL(_net_inet_ipsec); #if INET6 SYSCTL_DECL(_net_inet6_ipsec6); @@ -238,6 +240,10 @@ SYSCTL_INT(_net_inet6_ipsec6, IPSECCTL_ESP_RANDPAD, esp_randpad, CTLFLAG_RW | CTLFLAG_LOCKED, &ip6_esp_randpad, 0, ""); #endif /* INET6 */ +SYSCTL_DECL(_net_link_generic_system); + +struct ipsec_wake_pkt_info ipsec_wake_pkt; + static int ipsec_setspidx_interface(struct secpolicyindex *, u_int, struct mbuf *, int, int, int); static int ipsec_setspidx_mbuf(struct secpolicyindex *, u_int, u_int, @@ -271,23 +277,27 @@ static void ipsec_optaux(struct mbuf *, struct ipsec_tag *); int ipsec_send_natt_keepalive(struct secasvar *sav); bool ipsec_fill_offload_frame(ifnet_t ifp, struct secasvar *sav, struct ifnet_keepalive_offload_frame *frame, size_t frame_data_offset); +extern bool IOPMCopySleepWakeUUIDKey(char *, size_t); +extern void *registerSleepWakeInterest(void *, void *, void *); + static int sysctl_def_policy SYSCTL_HANDLER_ARGS { - int old_policy = ip4_def_policy.policy; - int error = sysctl_handle_int(oidp, oidp->oid_arg1, oidp->oid_arg2, req); + int new_policy = ip4_def_policy.policy; + int error = sysctl_handle_int(oidp, &new_policy, 0, req); #pragma unused(arg1, arg2) + if (error == 0) { + if (new_policy != IPSEC_POLICY_NONE && + new_policy != IPSEC_POLICY_DISCARD) { + return EINVAL; + } + ip4_def_policy.policy = new_policy; - if (ip4_def_policy.policy != IPSEC_POLICY_NONE && - ip4_def_policy.policy != IPSEC_POLICY_DISCARD) { - ip4_def_policy.policy = old_policy; - return EINVAL; - } - - /* Turn off the bypass if the default security policy changes */ - if (ipsec_bypass != 0 && ip4_def_policy.policy != IPSEC_POLICY_NONE) { - ipsec_bypass = 0; + /* Turn off the bypass if the default security policy changes */ + if (ipsec_bypass != 0 && ip4_def_policy.policy != IPSEC_POLICY_NONE) { + ipsec_bypass = 0; + } } return error; @@ -627,7 +637,7 @@ ipsec4_getpolicybyinterface(struct mbuf *m, /* Disabled policies go in the clear */ key_freesp(*sp, KEY_SADB_UNLOCKED); *sp = NULL; - *flags |= IP_NOIPSEC; /* Avoid later IPSec check */ + *flags |= IP_NOIPSEC; /* Avoid later IPsec check */ } else { /* If policy is enabled, redirect to ipsec interface */ ipoa->ipoa_boundif = (*sp)->ipsec_if->if_index; @@ -939,7 +949,7 @@ ipsec6_getpolicybyinterface(struct mbuf *m, /* Disabled policies go in the clear */ key_freesp(*sp, KEY_SADB_UNLOCKED); *sp = NULL; - *noipsec = 1; /* Avoid later IPSec check */ + *noipsec = 1; /* Avoid later IPsec check */ } else { /* If policy is enabled, redirect to ipsec interface */ ip6oap->ip6oa_boundif = (*sp)->ipsec_if->if_index; @@ -1894,11 +1904,8 @@ ipsec_get_reqlevel(struct ipsecrequest *isr) } break; case IPPROTO_IPCOMP: - /* - * we don't really care, as IPcomp document says that - * we shouldn't compress small packets - */ - level = IPSEC_LEVEL_USE; + ipseclog((LOG_ERR, "ipsec_get_reqlevel: " + "still got IPCOMP - exiting\n")); break; default: panic("ipsec_get_reqlevel: " @@ -2183,8 +2190,10 @@ ipsec_hdrsiz(struct secpolicy *sp) case IPPROTO_AH: clen = ah_hdrsiz(isr); break; - case IPPROTO_IPCOMP: - clen = sizeof(struct ipcomp); + default: + ipseclog((LOG_ERR, "ipsec_hdrsiz: " + "unknown protocol %u\n", + isr->saidx.proto)); break; } @@ -2679,9 +2688,6 @@ ipsec6_update_routecache_and_output( case IPPROTO_AH: error = ah6_output(state->m, &ip6->ip6_nxt, state->m->m_next, sav); break; - case IPPROTO_IPCOMP: - /* XXX code should be here */ - /*FALLTHROUGH*/ default: ipseclog((LOG_ERR, "%s: unknown ipsec protocol %d\n", __FUNCTION__, sav->sah->saidx.proto)); m_freem(state->m); @@ -2875,7 +2881,7 @@ ipsec46_encapsulate(struct ipsec_output_state *state, struct secasvar *sav) * based on RFC 2401. */ int -ipsec_chkreplay(u_int32_t seq, struct secasvar *sav) +ipsec_chkreplay(u_int32_t seq, struct secasvar *sav, u_int8_t replay_index) { const struct secreplay *replay; u_int32_t diff; @@ -2890,7 +2896,7 @@ ipsec_chkreplay(u_int32_t seq, struct secasvar *sav) } lck_mtx_lock(sadb_mutex); - replay = sav->replay; + replay = sav->replay[replay_index]; if (replay->wsize == 0) { lck_mtx_unlock(sadb_mutex); @@ -2947,7 +2953,7 @@ ipsec_chkreplay(u_int32_t seq, struct secasvar *sav) * 1: NG */ int -ipsec_updatereplay(u_int32_t seq, struct secasvar *sav) +ipsec_updatereplay(u_int32_t seq, struct secasvar *sav, u_int8_t replay_index) { struct secreplay *replay; u_int32_t diff; @@ -2961,7 +2967,7 @@ ipsec_updatereplay(u_int32_t seq, struct secasvar *sav) } lck_mtx_lock(sadb_mutex); - replay = sav->replay; + replay = sav->replay[replay_index]; if (replay->wsize == 0) { goto ok; /* no need to check replay. */ @@ -3351,19 +3357,13 @@ ipsec4_output_internal(struct ipsec_output_state *state, struct secasvar *sav) goto bad; } break; - case IPPROTO_IPCOMP: - if ((error = ipcomp4_output(state->m, sav)) != 0) { - state->m = NULL; - goto bad; - } - break; default: ipseclog((LOG_ERR, "ipsec4_output: unknown ipsec protocol %d\n", sav->sah->saidx.proto)); m_freem(state->m); state->m = NULL; - error = EINVAL; + error = EPROTONOSUPPORT; goto bad; } @@ -3607,15 +3607,12 @@ ipsec6_output_trans_internal( case IPPROTO_AH: error = ah6_output(state->m, nexthdrp, mprev->m_next, sav); break; - case IPPROTO_IPCOMP: - error = ipcomp6_output(state->m, nexthdrp, mprev->m_next, sav); - break; default: ipseclog((LOG_ERR, "ipsec6_output_trans: " "unknown ipsec protocol %d\n", sav->sah->saidx.proto)); m_freem(state->m); IPSEC_STAT_INCREMENT(ipsec6stat.out_inval); - error = EINVAL; + error = EPROTONOSUPPORT; break; } if (error) { @@ -3907,20 +3904,13 @@ ipsec6_output_tunnel_internal(struct ipsec_output_state *state, struct secasvar goto bad; } break; - case IPPROTO_IPCOMP: - if ((error = ipcomp4_output(state->m, sav)) != 0) { - state->m = NULL; - ROUTE_RELEASE(&ro4_copy); - goto bad; - } - break; default: ipseclog((LOG_ERR, "ipsec4_output: unknown ipsec protocol %d\n", sav->sah->saidx.proto)); m_freem(state->m); state->m = NULL; - error = EINVAL; + error = EPROTONOSUPPORT; ROUTE_RELEASE(&ro4_copy); goto bad; } @@ -4027,9 +4017,6 @@ ipsec6_output_tunnel_internal(struct ipsec_output_state *state, struct secasvar case IPPROTO_AH: error = ah6_output(state->m, &ip6->ip6_nxt, state->m->m_next, sav); break; - case IPPROTO_IPCOMP: - /* XXX code should be here */ - /*FALLTHROUGH*/ default: ipseclog((LOG_ERR, "ipsec6_output_tunnel: " "unknown ipsec protocol %d\n", sav->sah->saidx.proto)); @@ -4892,7 +4879,7 @@ ipsec_send_natt_keepalive( LCK_MTX_ASSERT(sadb_mutex, LCK_MTX_ASSERT_NOTOWNED); lck_mtx_lock(sadb_mutex); - if ((esp_udp_encap_port & 0xFFFF) == 0 || sav->remote_ike_port == 0) { + if (((esp_udp_encap_port & 0xFFFF) == 0 && sav->natt_encapsulated_src_port == 0) || sav->remote_ike_port == 0) { lck_mtx_unlock(sadb_mutex); return FALSE; } @@ -4953,6 +4940,11 @@ ipsec_send_natt_keepalive( ip->ip_src = ((struct sockaddr_in*)&sav->sah->saidx.dst)->sin_addr; ip->ip_dst = ((struct sockaddr_in*)&sav->sah->saidx.src)->sin_addr; } + if (sav->natt_encapsulated_src_port != 0) { + uh->uh_sport = (u_short)sav->natt_encapsulated_src_port; + } else { + uh->uh_sport = htons((u_short)esp_udp_encap_port); + } uh->uh_sport = htons((u_short)esp_udp_encap_port); uh->uh_dport = htons(sav->remote_ike_port); uh->uh_ulen = htons(1 + sizeof(*uh)); @@ -5018,7 +5010,11 @@ ipsec_send_natt_keepalive( ip6->ip6_dst.s6_addr16[1] = 0; } - uh->uh_sport = htons((u_short)esp_udp_encap_port); + if (sav->natt_encapsulated_src_port != 0) { + uh->uh_sport = (u_short)sav->natt_encapsulated_src_port; + } else { + uh->uh_sport = htons((u_short)esp_udp_encap_port); + } uh->uh_dport = htons(sav->remote_ike_port); uh->uh_ulen = htons(1 + sizeof(*uh)); *(u_int8_t*)((char*)m_mtod(m) + sizeof(*ip6) + sizeof(*uh)) = 0xFF; @@ -5073,7 +5069,7 @@ ipsec_fill_offload_frame(ifnet_t ifp, !(sav->flags & SADB_X_EXT_NATT_KEEPALIVE) || !(sav->flags & SADB_X_EXT_NATT_KEEPALIVE_OFFLOAD) || sav->flags & SADB_X_EXT_ESP_KEEPALIVE || - (esp_udp_encap_port & 0xFFFF) == 0 || + ((esp_udp_encap_port & 0xFFFF) == 0 && sav->natt_encapsulated_src_port == 0) || sav->remote_ike_port == 0 || (natt_keepalive_interval == 0 && sav->natt_interval == 0 && sav->natt_offload_interval == 0)) { /* SA is not eligible for keepalive offload on this interface */ @@ -5127,7 +5123,12 @@ ipsec_fill_offload_frame(ifnet_t ifp, ip->ip_dst = ((struct sockaddr_in*)&sav->sah->saidx.src)->sin_addr; } ip->ip_sum = in_cksum_hdr_opt(ip); - uh->uh_sport = htons((u_short)esp_udp_encap_port); + /* Fill out the UDP header */ + if (sav->natt_encapsulated_src_port != 0) { + uh->uh_sport = (u_short)sav->natt_encapsulated_src_port; + } else { + uh->uh_sport = htons((u_short)esp_udp_encap_port); + } uh->uh_dport = htons(sav->remote_ike_port); uh->uh_ulen = htons(1 + sizeof(*uh)); uh->uh_sum = 0; @@ -5142,3 +5143,96 @@ ipsec_fill_offload_frame(ifnet_t ifp, } return TRUE; } + +static int +sysctl_ipsec_wake_packet SYSCTL_HANDLER_ARGS +{ + #pragma unused(oidp, arg1, arg2) + if (req->newptr != USER_ADDR_NULL) { + ipseclog((LOG_ERR, "ipsec: invalid parameters")); + return EINVAL; + } + + struct proc *p = current_proc(); + if (p != NULL) { + uid_t uid = kauth_cred_getuid(proc_ucred(p)); + if (uid != 0 && priv_check_cred(kauth_cred_get(), PRIV_NET_PRIVILEGED_IPSEC_WAKE_PACKET, 0) != 0) { + ipseclog((LOG_ERR, "process does not hold necessary entitlement to get ipsec wake packet")); + return EPERM; + } + + int result = sysctl_io_opaque(req, &ipsec_wake_pkt, sizeof(ipsec_wake_pkt), NULL); + return result; + } + + return EINVAL; +} + +SYSCTL_PROC(_net_link_generic_system, OID_AUTO, ipsec_wake_pkt, CTLTYPE_STRUCT | CTLFLAG_RD | + CTLFLAG_LOCKED, 0, 0, &sysctl_ipsec_wake_packet, "S,ipsec wake packet", ""); + +void +ipsec_save_wake_packet(struct mbuf *wake_mbuf, u_int32_t spi, u_int32_t seq) +{ + if (wake_mbuf == NULL) { + ipseclog((LOG_ERR, "ipsec: bad wake packet")); + return; + } + + lck_mtx_lock(sadb_mutex); + if (__probable(!ipsec_save_wake_pkt)) { + goto done; + } + + u_int16_t max_len = (wake_mbuf->m_pkthdr.len > IPSEC_MAX_WAKE_PKT_LEN) ? IPSEC_MAX_WAKE_PKT_LEN : wake_mbuf->m_pkthdr.len; + m_copydata(wake_mbuf, 0, max_len, (void *)ipsec_wake_pkt.wake_pkt); + ipsec_wake_pkt.wake_pkt_len = max_len; + + ipsec_wake_pkt.wake_pkt_spi = spi; + ipsec_wake_pkt.wake_pkt_seq = seq; + + ipsec_save_wake_pkt = false; +done: + lck_mtx_unlock(sadb_mutex); + return; +} + +static IOReturn +ipsec_sleep_wake_handler(void *target, void *refCon, UInt32 messageType, + void *provider, void *messageArgument, vm_size_t argSize) +{ +#pragma unused(target, refCon, provider, messageArgument, argSize) + switch (messageType) { + case kIOMessageSystemWillSleep: + memset(&ipsec_wake_pkt, 0, sizeof(ipsec_wake_pkt)); + IOPMCopySleepWakeUUIDKey(ipsec_wake_pkt.wake_uuid, + sizeof(ipsec_wake_pkt.wake_uuid)); + ipseclog((LOG_INFO, + "ipsec: system will sleep")); + break; + case kIOMessageSystemHasPoweredOn: + ipsec_save_wake_pkt = true; + ipseclog((LOG_INFO, + "ipsec: system has powered on")); + break; + default: + break; + } + + return IOPMAckImplied; +} + +void +ipsec_monitor_sleep_wake(void) +{ + LCK_MTX_ASSERT(sadb_mutex, LCK_MTX_ASSERT_OWNED); + + if (sleep_wake_handle == NULL) { + sleep_wake_handle = registerSleepWakeInterest(ipsec_sleep_wake_handler, + NULL, NULL); + if (sleep_wake_handle != NULL) { + ipseclog((LOG_INFO, + "ipsec: monitoring sleep wake")); + } + } +} diff --git a/bsd/netinet6/ipsec.h b/bsd/netinet6/ipsec.h index 0cfe8a0b3..eb094bfce 100644 --- a/bsd/netinet6/ipsec.h +++ b/bsd/netinet6/ipsec.h @@ -40,11 +40,12 @@ #include #include +#include #ifdef BSD_KERNEL_PRIVATE #include #include -/* lock for IPSec stats */ +/* lock for IPsec stats */ extern lck_grp_t *sadb_stat_mutex_grp; extern lck_grp_attr_t *sadb_stat_mutex_grp_attr; extern lck_attr_t *sadb_stat_mutex_attr; @@ -66,7 +67,7 @@ struct secpolicyaddrrange { * specifies ICMPv6 type, and the port field in "dst" specifies ICMPv6 code. */ struct secpolicyindex { - u_int8_t dir; /* direction of packet flow, see blow */ + u_int8_t dir; /* direction of packet flow, see below */ struct sockaddr_storage src; /* IP src address for SP */ struct sockaddr_storage dst; /* IP dst address for SP */ u_int8_t prefs; /* prefix length in bits for src */ @@ -99,7 +100,7 @@ struct secpolicy { /* pointer to the ipsec request tree, */ /* if policy == IPSEC else this value == NULL.*/ - ifnet_t ipsec_if; /* IPSec interface to use */ + ifnet_t ipsec_if; /* IPsec interface to use */ ifnet_t outgoing_if; /* Outgoing interface for encrypted traffic */ char disabled; /* Set to ignore policy */ @@ -232,6 +233,15 @@ struct ipsecstat { u_quad_t out_comphist[256] __attribute__ ((aligned(8))); }; +#define IPSEC_MAX_WAKE_PKT_LEN 100 +struct ipsec_wake_pkt_info { + u_int8_t wake_pkt[IPSEC_MAX_WAKE_PKT_LEN]; + uuid_string_t wake_uuid; + u_int32_t wake_pkt_spi; + u_int32_t wake_pkt_seq; + u_int16_t wake_pkt_len; +}; + #ifdef BSD_KERNEL_PRIVATE /* * Definitions for IPsec & Key sysctl operations. @@ -325,6 +335,8 @@ extern int ip4_ipsec_dfbit; extern int ip4_ipsec_ecn; extern int ip4_esp_randpad; +extern bool ipsec_save_wake_pkt; + #define ipseclog(x) do { if (ipsec_debug) log x; } while (0) extern struct secpolicy *ipsec4_getpolicybysock(struct mbuf *, u_int, @@ -349,8 +361,8 @@ extern int ipsec4_in_reject(struct mbuf *, struct inpcb *); struct secas; struct tcpcb; -extern int ipsec_chkreplay(u_int32_t, struct secasvar *); -extern int ipsec_updatereplay(u_int32_t, struct secasvar *); +extern int ipsec_chkreplay(u_int32_t, struct secasvar *, u_int8_t); +extern int ipsec_updatereplay(u_int32_t, struct secasvar *, u_int8_t); extern size_t ipsec4_hdrsiz(struct mbuf *, u_int, struct inpcb *); extern size_t ipsec_hdrsiz_tcp(struct tcpcb *); @@ -380,6 +392,8 @@ extern struct socket *ipsec_getsocket(struct mbuf *); extern int ipsec_addhist(struct mbuf *, int, u_int32_t); extern struct ipsec_history *ipsec_gethist(struct mbuf *, int *); extern void ipsec_clearhist(struct mbuf *); +extern void ipsec_monitor_sleep_wake(void); +extern void ipsec_save_wake_packet(struct mbuf *, u_int32_t, u_int32_t); #endif /* BSD_KERNEL_PRIVATE */ #ifndef KERNEL diff --git a/bsd/netinet6/mld6.c b/bsd/netinet6/mld6.c index 60b7777cd..ba2daacdc 100644 --- a/bsd/netinet6/mld6.c +++ b/bsd/netinet6/mld6.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2017 Apple Inc. All rights reserved. + * Copyright (c) 2000-2019 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -220,7 +220,7 @@ static void mld_sched_timeout(void); /* * Normative references: RFC 2710, RFC 3590, RFC 3810. */ -static struct timeval mld_gsrdelay = {10, 0}; +static struct timeval mld_gsrdelay = {.tv_sec = 10, .tv_usec = 0}; static LIST_HEAD(, mld_ifinfo) mli_head; static int querier_present_timers_running6; @@ -304,8 +304,8 @@ struct mld_raopt { * Router Alert hop-by-hop option header. */ static struct mld_raopt mld_ra = { - .hbh = { 0, 0 }, - .pad = { .ip6o_type = IP6OPT_PADN, 0 }, + .hbh = { .ip6h_nxt = 0, .ip6h_len = 0 }, + .pad = { .ip6o_type = IP6OPT_PADN, .ip6o_len = 0 }, .ra = { .ip6or_type = (u_int8_t)IP6OPT_ROUTER_ALERT, .ip6or_len = (u_int8_t)(IP6OPT_RTALERT_LEN - 2), @@ -449,7 +449,7 @@ sysctl_mld_v2enable SYSCTL_HANDLER_ARGS int error; int i; struct mld_ifinfo *mli; - struct mld_tparams mtp = { 0, 0, 0, 0 }; + struct mld_tparams mtp = { .qpt = 0, .it = 0, .cst = 0, .sct = 0 }; MLD_LOCK(); @@ -860,7 +860,7 @@ mld_v1_input_query(struct ifnet *ifp, const struct ip6_hdr *ip6, struct in6_multi *inm; int err = 0, is_general_query; uint16_t timer; - struct mld_tparams mtp = { 0, 0, 0, 0 }; + struct mld_tparams mtp = { .qpt = 0, .it = 0, .cst = 0, .sct = 0 }; MLD_LOCK_ASSERT_NOTHELD(); @@ -907,7 +907,7 @@ mld_v1_input_query(struct ifnet *ifp, const struct ip6_hdr *ip6, * Embed scope ID of receiving interface in MLD query for * lookup whilst we don't hold other locks. */ - in6_setscope(&mld->mld_addr, ifp, NULL); + (void)in6_setscope(&mld->mld_addr, ifp, NULL); } /* @@ -1049,7 +1049,7 @@ mld_v2_input_query(struct ifnet *ifp, const struct ip6_hdr *ip6, int err = 0, is_general_query; uint16_t timer; uint8_t qrv; - struct mld_tparams mtp = { 0, 0, 0, 0 }; + struct mld_tparams mtp = { .qpt = 0, .it = 0, .cst = 0, .sct = 0 }; MLD_LOCK_ASSERT_NOTHELD(); @@ -1132,7 +1132,7 @@ mld_v2_input_query(struct ifnet *ifp, const struct ip6_hdr *ip6, * lookup whilst we don't hold other locks (due to KAME * locking lameness). We own this mbuf chain just now. */ - in6_setscope(&mld->mld_addr, ifp, NULL); + (void)in6_setscope(&mld->mld_addr, ifp, NULL); } mli = MLD_IFINFO(ifp); @@ -1432,7 +1432,7 @@ mld_v1_input_report(struct ifnet *ifp, struct mbuf *m, * whilst we don't hold other locks (due to KAME locking lameness). */ if (!IN6_IS_ADDR_UNSPECIFIED(&mld->mld_addr)) { - in6_setscope(&mld->mld_addr, ifp, NULL); + (void)in6_setscope(&mld->mld_addr, ifp, NULL); } /* @@ -3651,7 +3651,7 @@ mld_dispatch_packet(struct mbuf *m) m0->m_pkthdr.rcvif = lo_ifp; ip6 = mtod(m0, struct ip6_hdr *); - (void) in6_setscope(&ip6->ip6_dst, ifp, NULL); + (void)in6_setscope(&ip6->ip6_dst, ifp, NULL); /* * Retrieve the ICMPv6 type before handoff to ip6_output(), diff --git a/bsd/netinet6/nd6.c b/bsd/netinet6/nd6.c index 5b8a5d477..0af74cc8d 100644 --- a/bsd/netinet6/nd6.c +++ b/bsd/netinet6/nd6.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2018 Apple Inc. All rights reserved. + * Copyright (c) 2000-2019 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -103,6 +103,8 @@ #include #include +#include + #include "loop.h" #define ND6_SLOWTIMER_INTERVAL (60 * 60) /* 1 hour */ @@ -175,7 +177,7 @@ static lck_attr_t *nd_if_lock_attr = NULL; /* Protected by nd6_mutex */ struct nd_drhead nd_defrouter; -struct nd_prhead nd_prefix = { 0 }; +struct nd_prhead nd_prefix = { .lh_first = 0 }; /* * nd6_timeout() is scheduled on a demand basis. nd6_timeout_run is used @@ -536,9 +538,9 @@ nd6_ifattach(struct ifnet *ifp) lck_mtx_unlock(&ndi->lock); nd6_setmtu(ifp); - nd6log0((LOG_INFO, ": ", - "%s Reinit'd ND information for interface %s\n", - if_name(ifp))); + nd6log0(info, + "Reinit'd ND information for interface %s\n", + if_name(ifp)); return; } @@ -712,9 +714,9 @@ nd6_options(union nd_opts *ndopts) case ND_OPT_REDIRECTED_HEADER: case ND_OPT_NONCE: if (ndopts->nd_opt_array[nd_opt->nd_opt_type]) { - nd6log((LOG_INFO, + nd6log(error, "duplicated ND6 option found (type=%d)\n", - nd_opt->nd_opt_type)); + nd_opt->nd_opt_type); /* XXX bark? */ } else { ndopts->nd_opt_array[nd_opt->nd_opt_type] = @@ -738,16 +740,16 @@ nd6_options(union nd_opts *ndopts) * Unknown options must be silently ignored, * to accomodate future extension to the protocol. */ - nd6log((LOG_DEBUG, + nd6log(debug, "nd6_options: unsupported option %d - " - "option ignored\n", nd_opt->nd_opt_type)); + "option ignored\n", nd_opt->nd_opt_type); } skip1: i++; if (i > nd6_maxndopt) { icmp6stat.icp6s_nd_toomanyopt++; - nd6log((LOG_INFO, "too many loop in nd opt\n")); + nd6log(info, "too many loop in nd opt\n"); break; } @@ -792,9 +794,9 @@ nd6_service(void *arg) * to run this entire operation single threaded. */ while (nd6_service_busy) { - nd6log2((LOG_DEBUG, "%s: %s is blocked by %d waiters\n", + nd6log2(debug, "%s: %s is blocked by %d waiters\n", __func__, ap->draining ? "drainer" : "timer", - nd6_service_waiters)); + nd6_service_waiters); nd6_service_waiters++; (void) msleep(nd6_service_wc, rnh_lock, (PZERO - 1), __func__, NULL); @@ -1201,10 +1203,10 @@ again: * learned on cellular interface. Ever. */ dr->expire += dr->rtlifetime; - nd6log2((LOG_DEBUG, + nd6log2(debug, "%s: Refreshing expired default router entry " "%s for interface %s\n", __func__, - ip6_sprintf(&dr->rtaddr), if_name(dr->ifp))); + ip6_sprintf(&dr->rtaddr), if_name(dr->ifp)); } else { ap->killed++; /* @@ -1244,6 +1246,17 @@ again: defrtrlist_del(dr); NDDR_REMREF(dr); /* remove list reference */ } + + /* + * Also check if default router selection needs to be triggered + * for default interface, to avoid an issue with co-existence of + * static un-scoped default route configuration and default router + * discovery/selection. + */ + if (trigger_v6_defrtr_select) { + defrouter_select(NULL); + trigger_v6_defrtr_select = FALSE; + } lck_mtx_unlock(nd6_mutex); /* @@ -1460,7 +1473,7 @@ void nd6_drain(void *arg) { #pragma unused(arg) - nd6log2((LOG_DEBUG, "%s: draining ND6 entries\n", __func__)); + nd6log2(debug, "%s: draining ND6 entries\n", __func__); lck_mtx_lock(rnh_lock); nd6_need_draining = 1; @@ -1487,9 +1500,9 @@ nd6_timeout(void *arg) sarg.draining = 1; } nd6_service(&sarg); - nd6log2((LOG_DEBUG, "%s: found %u, aging_lazy %u, aging %u, " + nd6log2(debug, "%s: found %u, aging_lazy %u, aging %u, " "sticky %u, killed %u\n", __func__, sarg.found, sarg.aging_lazy, - sarg.aging, sarg.sticky, sarg.killed)); + sarg.aging, sarg.sticky, sarg.killed); /* re-arm the timer if there's work to do */ nd6_timeout_run--; VERIFY(nd6_timeout_run >= 0 && nd6_timeout_run < 2); @@ -1515,7 +1528,7 @@ nd6_timeout(void *arg) } nd6_sched_timeout(&atv, leeway); } else if (nd6_debug) { - nd6log2((LOG_DEBUG, "%s: not rescheduling timer\n", __func__)); + nd6log2(debug, "%s: not rescheduling timer\n", __func__); } lck_mtx_unlock(rnh_lock); } @@ -1535,18 +1548,18 @@ nd6_sched_timeout(struct timeval *atv, struct timeval *ltv) /* see comments on top of this file */ if (nd6_timeout_run == 0) { if (ltv == NULL) { - nd6log2((LOG_DEBUG, "%s: timer scheduled in " + nd6log2(debug, "%s: timer scheduled in " "T+%llus.%lluu (demand %d)\n", __func__, (uint64_t)atv->tv_sec, (uint64_t)atv->tv_usec, - nd6_sched_timeout_want)); + nd6_sched_timeout_want); nd6_fast_timer_on = TRUE; timeout(nd6_timeout, &nd6_fast_timer_on, tvtohz(atv)); } else { - nd6log2((LOG_DEBUG, "%s: timer scheduled in " + nd6log2(debug, "%s: timer scheduled in " "T+%llus.%lluu with %llus.%lluu leeway " "(demand %d)\n", __func__, (uint64_t)atv->tv_sec, (uint64_t)atv->tv_usec, (uint64_t)ltv->tv_sec, - (uint64_t)ltv->tv_usec, nd6_sched_timeout_want)); + (uint64_t)ltv->tv_usec, nd6_sched_timeout_want); nd6_fast_timer_on = FALSE; timeout_with_leeway(nd6_timeout, NULL, tvtohz(atv), tvtohz(ltv)); @@ -1555,27 +1568,27 @@ nd6_sched_timeout(struct timeval *atv, struct timeval *ltv) nd6_sched_timeout_want = 0; } else if (nd6_timeout_run == 1 && ltv == NULL && nd6_fast_timer_on == FALSE) { - nd6log2((LOG_DEBUG, "%s: fast timer scheduled in " + nd6log2(debug, "%s: fast timer scheduled in " "T+%llus.%lluu (demand %d)\n", __func__, (uint64_t)atv->tv_sec, (uint64_t)atv->tv_usec, - nd6_sched_timeout_want)); + nd6_sched_timeout_want); nd6_fast_timer_on = TRUE; nd6_sched_timeout_want = 0; nd6_timeout_run++; timeout(nd6_timeout, &nd6_fast_timer_on, tvtohz(atv)); } else { if (ltv == NULL) { - nd6log2((LOG_DEBUG, "%s: not scheduling timer: " + nd6log2(debug, "%s: not scheduling timer: " "timers %d, fast_timer %d, T+%llus.%lluu\n", __func__, nd6_timeout_run, nd6_fast_timer_on, - (uint64_t)atv->tv_sec, (uint64_t)atv->tv_usec)); + (uint64_t)atv->tv_sec, (uint64_t)atv->tv_usec); } else { - nd6log2((LOG_DEBUG, "%s: not scheduling timer: " + nd6log2(debug, "%s: not scheduling timer: " "timers %d, fast_timer %d, T+%llus.%lluu " "with %llus.%lluu leeway\n", __func__, nd6_timeout_run, nd6_fast_timer_on, (uint64_t)atv->tv_sec, (uint64_t)atv->tv_usec, - (uint64_t)ltv->tv_sec, (uint64_t)ltv->tv_usec)); + (uint64_t)ltv->tv_sec, (uint64_t)ltv->tv_usec); } } } @@ -2162,9 +2175,9 @@ nd6_is_addr_neighbor(struct sockaddr_in6 *addr, struct ifnet *ifp, void nd6_free(struct rtentry *rt) { - struct llinfo_nd6 *ln; - struct in6_addr in6; - struct nd_defrouter *dr; + struct llinfo_nd6 *ln = NULL; + struct in6_addr in6 = {}; + struct nd_defrouter *dr = NULL; LCK_MTX_ASSERT(rnh_lock, LCK_MTX_ASSERT_NOTOWNED); RT_LOCK_ASSERT_NOTHELD(rt); @@ -2571,9 +2584,9 @@ nd6_rtrequest(int req, struct rtentry *rt, struct sockaddr *sa) error = in6_mc_join(ifp, &llsol, NULL, &in6m, 0); if (error) { - nd6log((LOG_ERR, "%s: failed to join " + nd6log(error, "%s: failed to join " "%s (errno=%d)\n", if_name(ifp), - ip6_sprintf(&llsol), error)); + ip6_sprintf(&llsol), error); } else { IN6M_REMREF(in6m); } @@ -3168,7 +3181,7 @@ nd6_ioctl(u_long cmd, caddr_t data, struct ifnet *ifp) if (cmd == SIOCGDEFIFACE_IN6_64) { u_int64_t j = nd6_defifindex; - bcopy(&j, &ndif_64->ifindex, sizeof(j)); + __nochk_bcopy(&j, &ndif_64->ifindex, sizeof(j)); } else { bcopy(&nd6_defifindex, &ndif_32->ifindex, sizeof(u_int32_t)); @@ -3186,7 +3199,7 @@ nd6_ioctl(u_long cmd, caddr_t data, struct ifnet *ifp) if (cmd == SIOCSDEFIFACE_IN6_64) { u_int64_t j; - bcopy(&ndif_64->ifindex, &j, sizeof(j)); + __nochk_bcopy(&ndif_64->ifindex, &j, sizeof(j)); idx = (u_int32_t)j; } else { bcopy(&ndif_32->ifindex, &idx, sizeof(idx)); @@ -3287,9 +3300,6 @@ nd6_cache_lladdr(struct ifnet *ifp, struct in6_addr *from, char *lladdr, is_newentry = 0; } - if (rt == NULL) { - return; - } if ((rt->rt_flags & (RTF_GATEWAY | RTF_LLINFO)) != RTF_LLINFO) { fail: RT_UNLOCK(rt); @@ -4132,6 +4142,7 @@ nd6_need_cache(struct ifnet *ifp) #endif case IFT_BRIDGE: case IFT_CELLULAR: + case IFT_6LOWPAN: return 1; default: return 0; @@ -4329,7 +4340,7 @@ sysctl_nd6_lookup_ipv6 SYSCTL_HANDLER_ARGS */ error = proc_suser(current_proc()); if (error != 0) { - printf("%s: proc_suser() error %d\n", + nd6log0(error, "%s: proc_suser() error %d\n", __func__, error); goto done; } @@ -4342,23 +4353,31 @@ sysctl_nd6_lookup_ipv6 SYSCTL_HANDLER_ARGS if (req->oldlen != sizeof(struct nd6_lookup_ipv6_args) || req->newlen != sizeof(struct nd6_lookup_ipv6_args)) { error = EINVAL; - printf("%s: bad req, error %d\n", + nd6log0(error, "%s: bad req, error %d\n", __func__, error); goto done; } error = SYSCTL_IN(req, &nd6_lookup_ipv6_args, sizeof(struct nd6_lookup_ipv6_args)); if (error != 0) { - printf("%s: SYSCTL_IN() error %d\n", + nd6log0(error, "%s: SYSCTL_IN() error %d\n", __func__, error); goto done; } + + if (nd6_lookup_ipv6_args.ll_dest_len > sizeof(nd6_lookup_ipv6_args.ll_dest_)) { + error = EINVAL; + nd6log0(error, "%s: bad ll_dest_len, error %d\n", + __func__, error); + goto done; + } + /* Make sure to terminate the string */ nd6_lookup_ipv6_args.ifname[IFNAMSIZ - 1] = 0; error = ifnet_find_by_name(nd6_lookup_ipv6_args.ifname, &ifp); if (error != 0) { - printf("%s: ifnet_find_by_name() error %d\n", + nd6log0(error, "%s: ifnet_find_by_name() error %d\n", __func__, error); goto done; } @@ -4367,7 +4386,7 @@ sysctl_nd6_lookup_ipv6 SYSCTL_HANDLER_ARGS &nd6_lookup_ipv6_args.ll_dest_._sdl, nd6_lookup_ipv6_args.ll_dest_len, NULL, NULL); if (error != 0) { - printf("%s: nd6_lookup_ipv6() error %d\n", + nd6log0(error, "%s: nd6_lookup_ipv6() error %d\n", __func__, error); goto done; } @@ -4375,7 +4394,7 @@ sysctl_nd6_lookup_ipv6 SYSCTL_HANDLER_ARGS error = SYSCTL_OUT(req, &nd6_lookup_ipv6_args, sizeof(struct nd6_lookup_ipv6_args)); if (error != 0) { - printf("%s: SYSCTL_OUT() error %d\n", + nd6log0(error, "%s: SYSCTL_OUT() error %d\n", __func__, error); goto done; } @@ -4717,9 +4736,9 @@ in6_ifaddr_set_dadprogress(struct in6_ifaddr *ia) ia->ia6_flags &= ~(IN6_IFF_DUPLICATED | IN6_IFF_DADPROGRESS); ia->ia6_flags |= flags; - nd6log2((LOG_DEBUG, "%s - %s ifp %s ia6_flags 0x%x\n", + nd6log2(debug, "%s - %s ifp %s ia6_flags 0x%x\n", __func__, ip6_sprintf(&ia->ia_addr.sin6_addr), if_name(ia->ia_ifp), - ia->ia6_flags)); + ia->ia6_flags); } diff --git a/bsd/netinet6/nd6.h b/bsd/netinet6/nd6.h index d3bc920ee..1ff88945f 100644 --- a/bsd/netinet6/nd6.h +++ b/bsd/netinet6/nd6.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2017 Apple Inc. All rights reserved. + * Copyright (c) 2000-2019 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -118,22 +118,18 @@ struct llinfo_nd6 { #ifdef BSD_KERNEL_PRIVATE #define ND6_CACHE_STATE_TRANSITION(ln, nstate) do {\ - struct rtentry *ln_rt = (ln)->ln_rt; \ if (nd6_debug >= 1) {\ - nd6log((LOG_INFO,\ - "[%s:%d]: NDP cache entry changed from %s -> %s",\ + struct rtentry *ln_rt = ln != NULL ? (ln)->ln_rt : NULL; \ + nd6log(info,\ + "[%s:%d]: NDP cache entry changed from %s -> %s for address %s.\n",\ __func__,\ __LINE__,\ ndcache_state2str((ln)->ln_state),\ - ndcache_state2str(nstate)));\ - if (ln_rt != NULL)\ - nd6log((LOG_INFO,\ - " for address: %s.\n",\ - ip6_sprintf(&SIN6(rt_key(ln_rt))->sin6_addr)));\ - else\ - nd6log((LOG_INFO, "\n"));\ + ndcache_state2str(nstate),\ + ln_rt != NULL ? ip6_sprintf(&SIN6(rt_key(ln_rt))->sin6_addr) : "N/A");\ }\ - (ln)->ln_state = nstate;\ + if (ln != NULL)\ + (ln)->ln_state = nstate;\ } while(0) #define ND6_IS_LLINFO_PROBREACH(n) ((n)->ln_state > ND6_LLINFO_INCOMPLETE) @@ -444,7 +440,7 @@ struct in6_ndifreq_32 { struct in6_ndifreq_64 { char ifname[IFNAMSIZ]; - u_long ifindex __attribute__((aligned(8))); + u_int64_t ifindex __attribute__((aligned(8))); }; #endif /* BSD_KERNEL_PRIVATE */ @@ -758,9 +754,11 @@ extern int nd6_debug; extern int nd6_onlink_ns_rfc4861; extern int nd6_optimistic_dad; -#define nd6log0(x) do { log x; } while (0) -#define nd6log(x) do { if (nd6_debug >= 1) log x; } while (0) -#define nd6log2(x) do { if (nd6_debug >= 2) log x; } while (0) +#include + +#define nd6log0(type, ...) do { os_log_##type(OS_LOG_DEFAULT, __VA_ARGS__); } while (0) +#define nd6log(type, ...) do { if (nd6_debug >= 1) os_log_##type(OS_LOG_DEFAULT, __VA_ARGS__); } while (0) +#define nd6log2(type, ...) do { if (nd6_debug >= 2) os_log_##type(OS_LOG_DEFAULT, __VA_ARGS__); } while (0) #define ND6_OPTIMISTIC_DAD_LINKLOCAL (1 << 0) #define ND6_OPTIMISTIC_DAD_AUTOCONF (1 << 1) @@ -867,9 +865,9 @@ extern void nd6_llreach_set_reachable(struct ifnet *, void *, unsigned int); extern void nd6_llreach_use(struct llinfo_nd6 *); extern void nd6_alt_node_addr_decompose(struct ifnet *, struct sockaddr *, struct sockaddr_dl *, struct sockaddr_in6 *); -extern void nd6_alt_node_present(struct ifnet *, struct sockaddr_in6 *, +extern int nd6_alt_node_present(struct ifnet *, struct sockaddr_in6 *, struct sockaddr_dl *, int32_t, int, int); -extern void nd6_alt_node_absent(struct ifnet *, struct sockaddr_in6 *); +extern void nd6_alt_node_absent(struct ifnet *, struct sockaddr_in6 *, struct sockaddr_dl *); /* nd6_rtr.c */ extern struct in6_ifaddr *in6_pfx_newpersistaddr(struct nd_prefix *, int, diff --git a/bsd/netinet6/nd6_nbr.c b/bsd/netinet6/nd6_nbr.c index 9adb4feae..ae3e33e53 100644 --- a/bsd/netinet6/nd6_nbr.c +++ b/bsd/netinet6/nd6_nbr.c @@ -230,9 +230,9 @@ nd6_llreach_alloc(struct rtentry *rt, struct ifnet *ifp, void *addr, if (nd6_debug && lr != NULL && why != NULL) { char tmp[MAX_IPv6_STR_LEN]; - nd6log((LOG_DEBUG, "%s: %s%s for %s\n", if_name(ifp), + nd6log(debug, "%s: %s%s for %s\n", if_name(ifp), type, why, inet_ntop(AF_INET6, - &SIN6(rt_key(rt))->sin6_addr, tmp, sizeof(tmp)))); + &SIN6(rt_key(rt))->sin6_addr, tmp, sizeof(tmp))); } } } @@ -289,10 +289,10 @@ nd6_ns_input( } if (ip6->ip6_hlim != IPV6_MAXHLIM) { - nd6log((LOG_ERR, + nd6log(error, "nd6_ns_input: invalid hlim (%d) from %s to %s on %s\n", ip6->ip6_hlim, ip6_sprintf(&ip6->ip6_src), - ip6_sprintf(&ip6->ip6_dst), if_name(ifp))); + ip6_sprintf(&ip6->ip6_dst), if_name(ifp)); goto bad; } @@ -306,8 +306,8 @@ nd6_ns_input( daddr6.s6_addr8[12] == 0xff) { ; /* good */ } else { - nd6log((LOG_INFO, "nd6_ns_input: bad DAD packet " - "(wrong ip6 dst)\n")); + nd6log(info, "nd6_ns_input: bad DAD packet " + "(wrong ip6 dst)\n"); goto bad; } } else if (!nd6_onlink_ns_rfc4861) { @@ -324,22 +324,22 @@ nd6_ns_input( src_sa6.sin6_len = sizeof(src_sa6); src_sa6.sin6_addr = saddr6; if (!nd6_is_addr_neighbor(&src_sa6, ifp, 0)) { - nd6log((LOG_INFO, "nd6_ns_input: " - "NS packet from non-neighbor\n")); + nd6log(info, "nd6_ns_input: " + "NS packet from non-neighbor\n"); goto bad; } } if (IN6_IS_ADDR_MULTICAST(&taddr6)) { - nd6log((LOG_INFO, "nd6_ns_input: bad NS target (multicast)\n")); + nd6log(info, "nd6_ns_input: bad NS target (multicast)\n"); goto bad; } icmp6len -= sizeof(*nd_ns); nd6_option_init(nd_ns + 1, icmp6len, &ndopts); if (nd6_options(&ndopts) < 0) { - nd6log((LOG_INFO, - "nd6_ns_input: invalid ND option, ignored\n")); + nd6log(info, + "nd6_ns_input: invalid ND option, ignored\n"); /* nd6_options have incremented stats */ goto freeit; } @@ -350,8 +350,8 @@ nd6_ns_input( } if (is_dad_probe && lladdr) { - nd6log((LOG_INFO, "nd6_ns_input: bad DAD packet " - "(link-layer address option)\n")); + nd6log(info, "nd6_ns_input: bad DAD packet " + "(link-layer address option)\n"); goto bad; } @@ -446,17 +446,17 @@ nd6_ns_input( IFA_UNLOCK(ifa); if (lladdr && ((ifp->if_addrlen + 2 + 7) & ~7) != lladdrlen) { - nd6log((LOG_INFO, + nd6log(info, "nd6_ns_input: lladdrlen mismatch for %s " "(if %d, NS packet %d)\n", - ip6_sprintf(&taddr6), ifp->if_addrlen, lladdrlen - 2)); + ip6_sprintf(&taddr6), ifp->if_addrlen, lladdrlen - 2); goto bad; } if (IN6_ARE_ADDR_EQUAL(&myaddr6, &saddr6)) { - nd6log((LOG_INFO, + nd6log(info, "nd6_ns_input: duplicate IP6 address %s\n", - ip6_sprintf(&saddr6))); + ip6_sprintf(&saddr6)); goto freeit; } @@ -539,9 +539,9 @@ freeit: return; bad: - nd6log((LOG_ERR, "nd6_ns_input: src=%s\n", ip6_sprintf(&saddr6))); - nd6log((LOG_ERR, "nd6_ns_input: dst=%s\n", ip6_sprintf(&daddr6))); - nd6log((LOG_ERR, "nd6_ns_input: tgt=%s\n", ip6_sprintf(&taddr6))); + nd6log(error, "nd6_ns_input: src=%s\n", ip6_sprintf(&saddr6)); + nd6log(error, "nd6_ns_input: dst=%s\n", ip6_sprintf(&daddr6)); + nd6log(error, "nd6_ns_input: tgt=%s\n", ip6_sprintf(&taddr6)); icmp6stat.icp6s_badns++; m_freem(m); if (ifa != NULL) { @@ -722,11 +722,11 @@ nd6_ns_output( NULL, &ro, NULL, &src_storage, ip6oa.ip6oa_boundif, &error); if (src == NULL) { - nd6log((LOG_DEBUG, + nd6log(debug, "nd6_ns_output: source can't be " "determined: dst=%s, error=%d\n", ip6_sprintf(&dst_sa.sin6_addr), - error)); + error); goto bad; } @@ -744,10 +744,10 @@ nd6_ns_output( */ ia = in6ifa_ifpwithaddr(ifp, src); if (!ia || (ia->ia6_flags & IN6_IFF_OPTIMISTIC)) { - nd6log((LOG_DEBUG, + nd6log(debug, "nd6_ns_output: no preferred source " "available: dst=%s\n", - ip6_sprintf(&dst_sa.sin6_addr))); + ip6_sprintf(&dst_sa.sin6_addr)); goto bad; } } @@ -848,6 +848,7 @@ nd6_ns_output( (void) m_set_service_class(m, MBUF_SC_CTL); } + ip6oa.ip6oa_flags |= IP6OAF_SKIP_PF; ip6_output(m, NULL, NULL, flags, im6o, &outif, &ip6oa); if (outif) { icmp6_ifstat_inc(outif, ifs6_out_msg); @@ -906,7 +907,7 @@ nd6_na_input(struct mbuf *m, int off, int icmp6len) bool send_nc_alive_kev = false; if ((ifp->if_eflags & IFEF_IPV6_ND6ALT) != 0) { - nd6log((LOG_INFO, "nd6_na_input: on ND6ALT interface!\n")); + nd6log(info, "nd6_na_input: on ND6ALT interface!\n"); goto freeit; } @@ -914,10 +915,10 @@ nd6_na_input(struct mbuf *m, int off, int icmp6len) MBUF_STRICT_DATA_ALIGNMENT_CHECK_32(m); if (ip6->ip6_hlim != IPV6_MAXHLIM) { - nd6log((LOG_ERR, + nd6log(error, "nd6_na_input: invalid hlim (%d) from %s to %s on %s\n", ip6->ip6_hlim, ip6_sprintf(&ip6->ip6_src), - ip6_sprintf(&ip6->ip6_dst), if_name(ifp))); + ip6_sprintf(&ip6->ip6_dst), if_name(ifp)); goto bad; } @@ -935,15 +936,15 @@ nd6_na_input(struct mbuf *m, int off, int icmp6len) goto bad; /* XXX: impossible */ } if (IN6_IS_ADDR_MULTICAST(&taddr6)) { - nd6log((LOG_ERR, + nd6log(error, "nd6_na_input: invalid target address %s\n", - ip6_sprintf(&taddr6))); + ip6_sprintf(&taddr6)); goto bad; } if (IN6_IS_ADDR_MULTICAST(&daddr6)) { if (is_solicited) { - nd6log((LOG_ERR, - "nd6_na_input: a solicited adv is multicasted\n")); + nd6log(error, + "nd6_na_input: a solicited adv is multicasted\n"); goto bad; } } @@ -951,8 +952,8 @@ nd6_na_input(struct mbuf *m, int off, int icmp6len) icmp6len -= sizeof(*nd_na); nd6_option_init(nd_na + 1, icmp6len, &ndopts); if (nd6_options(&ndopts) < 0) { - nd6log((LOG_INFO, - "nd6_na_input: invalid ND option, ignored\n")); + nd6log(info, + "nd6_na_input: invalid ND option, ignored\n"); /* nd6_options have incremented stats */ goto freeit; } @@ -962,11 +963,11 @@ nd6_na_input(struct mbuf *m, int off, int icmp6len) lladdrlen = ndopts.nd_opts_tgt_lladdr->nd_opt_len << 3; if (((ifp->if_addrlen + 2 + 7) & ~7) != lladdrlen) { - nd6log((LOG_INFO, + nd6log(info, "nd6_na_input: lladdrlen mismatch for %s " "(if %d, NA packet %d)\n", ip6_sprintf(&taddr6), ifp->if_addrlen, - lladdrlen - 2)); + lladdrlen - 2); goto bad; } } @@ -1465,9 +1466,9 @@ nd6_na_output( src = in6_selectsrc(&dst_sa, NULL, NULL, &ro, NULL, &src_storage, ip6oa.ip6oa_boundif, &error); if (src == NULL) { - nd6log((LOG_DEBUG, "nd6_na_output: source can't be " + nd6log(debug, "nd6_na_output: source can't be " "determined: dst=%s, error=%d\n", - ip6_sprintf(&dst_sa.sin6_addr), error)); + ip6_sprintf(&dst_sa.sin6_addr), error); goto bad; } ip6->ip6_src = *src; @@ -1545,6 +1546,7 @@ nd6_na_output( (void) m_set_service_class(m, MBUF_SC_CTL); } + ip6oa.ip6oa_flags |= IP6OAF_SKIP_PF; ip6_output(m, NULL, NULL, IPV6_OUTARGS, im6o, &outif, &ip6oa); if (outif) { icmp6_ifstat_inc(outif, ifs6_out_msg); @@ -1587,6 +1589,7 @@ nd6_ifptomac( #endif case IFT_BRIDGE: case IFT_ISO88025: + case IFT_6LOWPAN: return (caddr_t)IF_LLADDR(ifp); default: return NULL; @@ -1662,10 +1665,10 @@ nd6_dad_find(struct ifaddr *ifa, struct nd_opt_nonce *nonce) nonce->nd_opt_nonce_len == (ND_OPT_NONCE_LEN + 2) / 8 && memcmp(&nonce->nd_opt_nonce[0], &dp->dad_nonce[0], ND_OPT_NONCE_LEN) == 0) { - nd6log((LOG_ERR, "%s: a looped back NS message is " + nd6log(error, "%s: a looped back NS message is " "detected during DAD for %s. Ignoring.\n", if_name(ifa->ifa_ifp), - ip6_sprintf(IFA_IN6(ifa)))); + ip6_sprintf(IFA_IN6(ifa))); dp->dad_ns_lcount++; ++ip6stat.ip6s_dad_loopcount; DAD_UNLOCK(dp); @@ -1698,11 +1701,11 @@ nd6_dad_start( struct in6_ifaddr *ia = (struct in6_ifaddr *)ifa; struct dadq *dp; - nd6log2((LOG_DEBUG, "%s - %s ifp %s ia6_flags 0x%x\n", + nd6log2(debug, "%s - %s ifp %s ia6_flags 0x%x\n", __func__, ip6_sprintf(&ia->ia_addr.sin6_addr), if_name(ia->ia_ifp), - ia->ia6_flags)); + ia->ia6_flags); /* * If we don't need DAD, don't do it. @@ -1712,11 +1715,11 @@ nd6_dad_start( */ IFA_LOCK(&ia->ia_ifa); if (!(ia->ia6_flags & IN6_IFF_DADPROGRESS)) { - nd6log0((LOG_DEBUG, + nd6log0(debug, "nd6_dad_start: not a tentative or optimistic address " "%s(%s)\n", ip6_sprintf(&ia->ia_addr.sin6_addr), - ifa->ifa_ifp ? if_name(ifa->ifa_ifp) : "???")); + ifa->ifa_ifp ? if_name(ifa->ifa_ifp) : "???"); IFA_UNLOCK(&ia->ia_ifa); return; } @@ -1741,10 +1744,9 @@ nd6_dad_start( dp = zalloc(dad_zone); if (dp == NULL) { - nd6log0((LOG_ERR, "nd6_dad_start: memory allocation failed for " - "%s(%s)\n", + nd6log0(error, "nd6_dad_start: memory allocation failed for %s(%s)\n", ip6_sprintf(&ia->ia_addr.sin6_addr), - ifa->ifa_ifp ? if_name(ifa->ifa_ifp) : "???")); + ifa->ifa_ifp ? if_name(ifa->ifa_ifp) : "???"); return; } bzero(dp, dad_size); @@ -1753,11 +1755,11 @@ nd6_dad_start( /* Callee adds one reference for us */ dp = nd6_dad_attach(dp, ifa); - nd6log0((LOG_DEBUG, "%s: starting %sDAD %sfor %s\n", + nd6log0(debug, "%s: starting %sDAD %sfor %s\n", if_name(ifa->ifa_ifp), (ia->ia6_flags & IN6_IFF_OPTIMISTIC) ? "optimistic " : "", (tick_delay == NULL) ? "immediately " : "", - ip6_sprintf(&ia->ia_addr.sin6_addr))); + ip6_sprintf(&ia->ia_addr.sin6_addr)); /* * Send NS packet for DAD, ip6_dad_count times. @@ -1880,8 +1882,8 @@ nd6_unsol_na_output(struct ifaddr *ifa) return; } - nd6log((LOG_INFO, "%s: sending unsolicited NA\n", - if_name(ifa->ifa_ifp))); + nd6log(info, "%s: sending unsolicited NA\n", + if_name(ifa->ifa_ifp)); nd6_na_output(ifp, &saddr6, &taddr6, ND_NA_FLAG_OVERRIDE, 1, NULL); } @@ -1896,35 +1898,35 @@ nd6_dad_timer(struct ifaddr *ifa) /* Sanity check */ if (ia == NULL) { - nd6log0((LOG_ERR, "nd6_dad_timer: called with null parameter\n")); + nd6log0(error, "nd6_dad_timer: called with null parameter\n"); goto done; } - nd6log2((LOG_DEBUG, "%s - %s ifp %s ia6_flags 0x%x\n", + nd6log2(debug, "%s - %s ifp %s ia6_flags 0x%x\n", __func__, ip6_sprintf(&ia->ia_addr.sin6_addr), if_name(ia->ia_ifp), - ia->ia6_flags)); + ia->ia6_flags); dp = nd6_dad_find(ifa, NULL); if (dp == NULL) { - nd6log0((LOG_ERR, "nd6_dad_timer: DAD structure not found\n")); + nd6log0(error, "nd6_dad_timer: DAD structure not found\n"); goto done; } IFA_LOCK(&ia->ia_ifa); if (ia->ia6_flags & IN6_IFF_DUPLICATED) { - nd6log0((LOG_ERR, "nd6_dad_timer: called with duplicated address " + nd6log0(error, "nd6_dad_timer: called with duplicated address " "%s(%s)\n", ip6_sprintf(&ia->ia_addr.sin6_addr), - ifa->ifa_ifp ? if_name(ifa->ifa_ifp) : "???")); + ifa->ifa_ifp ? if_name(ifa->ifa_ifp) : "???"); IFA_UNLOCK(&ia->ia_ifa); goto done; } if ((ia->ia6_flags & IN6_IFF_DADPROGRESS) == 0) { - nd6log0((LOG_ERR, "nd6_dad_timer: not a tentative or optimistic " + nd6log0(error, "nd6_dad_timer: not a tentative or optimistic " "address %s(%s)\n", ip6_sprintf(&ia->ia_addr.sin6_addr), - ifa->ifa_ifp ? if_name(ifa->ifa_ifp) : "???")); + ifa->ifa_ifp ? if_name(ifa->ifa_ifp) : "???"); IFA_UNLOCK(&ia->ia_ifa); goto done; } @@ -1934,8 +1936,8 @@ nd6_dad_timer(struct ifaddr *ifa) DAD_LOCK(dp); if (dp->dad_ns_tcount > dad_maxtry) { DAD_UNLOCK(dp); - nd6log0((LOG_INFO, "%s: could not run DAD, driver problem?\n", - if_name(ifa->ifa_ifp))); + nd6log0(info, "%s: could not run DAD, driver problem?\n", + if_name(ifa->ifa_ifp)); nd6_dad_detach(dp, ifa); goto done; @@ -1962,10 +1964,10 @@ nd6_dad_timer(struct ifaddr *ifa) if (dp->dad_na_icount > 0 || dp->dad_ns_icount) { /* We've seen NS or NA, means DAD has failed. */ DAD_UNLOCK(dp); - nd6log0((LOG_INFO, - "%s: duplicate IPv6 address %s [timer]\n", + nd6log0(info, + "%s: duplicate IPv6 address %s if:%s [timer]\n", __func__, ip6_sprintf(&ia->ia_addr.sin6_addr), - if_name(ia->ia_ifp))); + if_name(ia->ia_ifp)); nd6_dad_duplicated(ifa); /* (*dp) will be freed in nd6_dad_duplicated() */ } else if (dad_enhanced != 0 && @@ -1986,12 +1988,10 @@ nd6_dad_timer(struct ifaddr *ifa) * additional probes until the loopback condition * becomes clear when a looped back probe is detected. */ - nd6log0((LOG_INFO, - "%s: a looped back NS message is " - "detected during DAD for %s. " - "Another DAD probe is being sent on interface.\n", + nd6log0(info, + "%s: a looped back NS message is detected during DAD for %s. Another DAD probe is being sent on interface %s.\n", __func__, ip6_sprintf(&ia->ia_addr.sin6_addr), - if_name(ia->ia_ifp))); + if_name(ia->ia_ifp)); /* * Send an NS immediately and increase dad_count by * nd6_mmaxtries - 1. @@ -2020,20 +2020,20 @@ nd6_dad_timer(struct ifaddr *ifa) nd6_unsol_na_output(ifa); } - nd6log0((LOG_DEBUG, - "%s: DAD complete for %s - no duplicates found%s\n", + nd6log0(debug, + "%s: DAD complete for %s - no duplicates found %s\n", if_name(ifa->ifa_ifp), ip6_sprintf(&ia->ia_addr.sin6_addr), - txunsolna ? ", tx unsolicited NA with O=1" : ".")); + txunsolna ? ", tx unsolicited NA with O=1" : "."); if (dp->dad_ns_lcount > 0) { - nd6log0((LOG_DEBUG, + nd6log0(debug, "%s: DAD completed while " "a looped back NS message is detected " "during DAD for %s om interface %s\n", __func__, ip6_sprintf(&ia->ia_addr.sin6_addr), - if_name(ia->ia_ifp))); + if_name(ia->ia_ifp)); } in6_post_msg(ia->ia_ifp, KEV_INET6_NEW_USER_ADDR, ia, @@ -2063,9 +2063,9 @@ nd6_dad_duplicated(struct ifaddr *ifa) } IFA_LOCK(&ia->ia_ifa); DAD_LOCK(dp); - nd6log((LOG_ERR, "%s: NS in/out/loopback=%d/%d, NA in=%d\n", + nd6log(error, "%s: NS in/out/loopback=%d/%d/%d, NA in=%d\n", __func__, dp->dad_ns_icount, dp->dad_ns_ocount, dp->dad_ns_lcount, - dp->dad_na_icount)); + dp->dad_na_icount); candisable = FALSE; if (IN6_IS_ADDR_LINKLOCAL(&ia->ia_addr.sin6_addr) && @@ -2290,8 +2290,8 @@ nd6_dad_na_input(struct mbuf *m, struct ifnet *ifp, struct in6_addr *taddr, } if (replicated) { - nd6log((LOG_INFO, "%s: ignoring duplicate NA on " - "replicated interface %s\n", __func__, if_name(ifp))); + nd6log(info, "%s: ignoring duplicate NA on " + "replicated interface %s\n", __func__, if_name(ifp)); goto done; } @@ -2301,9 +2301,9 @@ nd6_dad_na_input(struct mbuf *m, struct ifnet *ifp, struct in6_addr *taddr, if (!(ia->ia6_flags & IN6_IFF_DADPROGRESS)) { IFA_UNLOCK(ifa); - nd6log((LOG_INFO, "%s: ignoring duplicate NA on " + nd6log(info, "%s: ignoring duplicate NA on " "%s [DAD not in progress]\n", __func__, - if_name(ifp))); + if_name(ifp)); goto done; } @@ -2317,8 +2317,8 @@ nd6_dad_na_input(struct mbuf *m, struct ifnet *ifp, struct in6_addr *taddr, if (ip6a && (ip6a->ip6a_flags & IP6A_HASEEN) != 0 && bcmp(ip6a->ip6a_ehsrc, lladdr, ETHER_ADDR_LEN) != 0) { IFA_UNLOCK(ifa); - nd6log((LOG_ERR, "%s: ignoring duplicate NA on %s " - "[eh_src != tgtlladdr]\n", __func__, if_name(ifp))); + nd6log(error, "%s: ignoring duplicate NA on %s " + "[eh_src != tgtlladdr]\n", __func__, if_name(ifp)); goto done; } } @@ -2327,8 +2327,8 @@ nd6_dad_na_input(struct mbuf *m, struct ifnet *ifp, struct in6_addr *taddr, dp = nd6_dad_find(ifa, NULL); if (dp == NULL) { - nd6log((LOG_INFO, "%s: no DAD structure for %s on %s.\n", - __func__, ip6_sprintf(taddr), if_name(ifp))); + nd6log(info, "%s: no DAD structure for %s on %s.\n", + __func__, ip6_sprintf(taddr), if_name(ifp)); goto done; } @@ -2342,9 +2342,9 @@ nd6_dad_na_input(struct mbuf *m, struct ifnet *ifp, struct in6_addr *taddr, DAD_REMREF(dp); /* remove the address. */ - nd6log((LOG_INFO, + nd6log(info, "%s: duplicate IPv6 address %s [processing NA on %s]\n", __func__, - ip6_sprintf(taddr), if_name(ifp))); + ip6_sprintf(taddr), if_name(ifp)); done: IFA_LOCK_ASSERT_NOTHELD(ifa); IFA_REMREF(ifa); @@ -2423,11 +2423,11 @@ nd6_alt_node_addr_decompose(struct ifnet *ifp, struct sockaddr *sa, VERIFY(sdl && (void *)sa != (void *)sdl); VERIFY(sin6 && (void *)sa != (void *)sin6); - bzero(sin6, sizeof *sin6); + bzero(sin6, sizeof(*sin6)); sin6->sin6_len = sizeof *sin6; sin6->sin6_family = AF_INET6; - bzero(sdl, sizeof *sdl); + bzero(sdl, sizeof(*sdl)); sdl->sdl_len = sizeof *sdl; sdl->sdl_family = AF_LINK; sdl->sdl_type = ifp->if_type; @@ -2463,7 +2463,7 @@ nd6_alt_node_addr_decompose(struct ifnet *ifp, struct sockaddr *sa, struct in6_addr *in6 = &sin6->sin6_addr; caddr_t lla = LLADDR(sdla); - VERIFY(sa->sa_len <= sizeof *sdl); + VERIFY(sa->sa_len <= sizeof(*sdl)); bcopy(sa, sdl, sa->sa_len); sin6->sin6_scope_id = sdla->sdl_index; @@ -2495,7 +2495,7 @@ nd6_alt_node_addr_decompose(struct ifnet *ifp, struct sockaddr *sa, } } -void +int nd6_alt_node_present(struct ifnet *ifp, struct sockaddr_in6 *sin6, struct sockaddr_dl *sdl, int32_t rssi, int lqm, int npm) { @@ -2550,21 +2550,23 @@ nd6_alt_node_present(struct ifnet *ifp, struct sockaddr_in6 *sin6, if (rt == NULL) { log(LOG_ERR, "%s: failed to add/update host route to %s.\n", __func__, ip6_sprintf(&sin6->sin6_addr)); + return EHOSTUNREACH; } else { - nd6log((LOG_DEBUG, "%s: host route to %s [lr=0x%llx]\n", + nd6log(debug, "%s: host route to %s [lr=0x%llx]\n", __func__, ip6_sprintf(&sin6->sin6_addr), - (uint64_t)VM_KERNEL_ADDRPERM(lr))); + (uint64_t)VM_KERNEL_ADDRPERM(lr)); + return 0; } } void -nd6_alt_node_absent(struct ifnet *ifp, struct sockaddr_in6 *sin6) +nd6_alt_node_absent(struct ifnet *ifp, struct sockaddr_in6 *sin6, struct sockaddr_dl *sdl) { struct rtentry *rt; const uint16_t temp_embedded_id = sin6->sin6_addr.s6_addr16[1]; - nd6log((LOG_DEBUG, "%s: host route to %s\n", __func__, - ip6_sprintf(&sin6->sin6_addr))); + nd6log(debug, "%s: host route to %s\n", __func__, + ip6_sprintf(&sin6->sin6_addr)); if (IN6_IS_SCOPE_LINKLOCAL(&sin6->sin6_addr) && (temp_embedded_id == 0)) { @@ -2588,6 +2590,17 @@ nd6_alt_node_absent(struct ifnet *ifp, struct sockaddr_in6 *sin6) if (!(rt->rt_flags & (RTF_CLONING | RTF_PRCLONING)) && (rt->rt_flags & (RTF_HOST | RTF_LLINFO | RTF_WASCLONED)) == (RTF_HOST | RTF_LLINFO | RTF_WASCLONED)) { + /* + * Copy the link layer information in SDL when present + * as it later gets used to issue the kernel event for + * node absence. + */ + if (sdl != NULL && rt->rt_gateway != NULL && + rt->rt_gateway->sa_family == AF_LINK && + SDL(rt->rt_gateway)->sdl_len <= sizeof(*sdl)) { + bcopy(rt->rt_gateway, sdl, SDL(rt->rt_gateway)->sdl_len); + } + rt->rt_flags |= RTF_CONDEMNED; RT_UNLOCK(rt); diff --git a/bsd/netinet6/nd6_prproxy.c b/bsd/netinet6/nd6_prproxy.c index a3006a929..c3cb5ecd4 100644 --- a/bsd/netinet6/nd6_prproxy.c +++ b/bsd/netinet6/nd6_prproxy.c @@ -619,10 +619,10 @@ nd6_proxy_find_fwdroute(struct ifnet *ifp, struct route_in6 *ro6) if ((rt = ro6->ro_rt) != NULL) { RT_LOCK(rt); if (!(rt->rt_flags & RTF_PROXY) || rt->rt_ifp == ifp) { - nd6log2((LOG_DEBUG, "%s: found incorrect prefix " + nd6log2(debug, "%s: found incorrect prefix " "proxy route for dst %s on %s\n", if_name(ifp), ip6_sprintf(dst6), - if_name(rt->rt_ifp))); + if_name(rt->rt_ifp)); RT_UNLOCK(rt); /* look it up below */ } else { @@ -701,9 +701,9 @@ nd6_proxy_find_fwdroute(struct ifnet *ifp, struct route_in6 *ro6) rtfree_locked(rt); rt = NULL; } else { - nd6log2((LOG_DEBUG, "%s: found prefix proxy route " + nd6log2(debug, "%s: found prefix proxy route " "for dst %s\n", if_name(rt->rt_ifp), - ip6_sprintf(dst6))); + ip6_sprintf(dst6)); RT_UNLOCK(rt); ro6->ro_rt = rt; /* refcnt held by rtalloc1 */ lck_mtx_unlock(rnh_lock); @@ -723,9 +723,9 @@ nd6_proxy_find_fwdroute(struct ifnet *ifp, struct route_in6 *ro6) rtfree_locked(rt); rt = NULL; } else { - nd6log2((LOG_DEBUG, "%s: allocated prefix proxy " + nd6log2(debug, "%s: allocated prefix proxy " "route for dst %s\n", if_name(rt->rt_ifp), - ip6_sprintf(dst6))); + ip6_sprintf(dst6)); RT_UNLOCK(rt); ro6->ro_rt = rt; /* refcnt held by rtalloc1 */ } @@ -733,9 +733,9 @@ nd6_proxy_find_fwdroute(struct ifnet *ifp, struct route_in6 *ro6) VERIFY(rt != NULL || ro6->ro_rt == NULL); if (fwd_ifp == NULL || rt == NULL) { - nd6log2((LOG_ERR, "%s: failed to find forwarding prefix " + nd6log2(error, "%s: failed to find forwarding prefix " "proxy entry for dst %s\n", if_name(ifp), - ip6_sprintf(dst6))); + ip6_sprintf(dst6)); } lck_mtx_unlock(rnh_lock); } @@ -929,12 +929,12 @@ nd6_prproxy_ns_output(struct ifnet *ifp, struct ifnet *exclifp, } if (exclifp == NULL) { - nd6log2((LOG_DEBUG, "%s: sending NS who has %s on ALL\n", - if_name(ifp), ip6_sprintf(taddr))); + nd6log2(debug, "%s: sending NS who has %s on ALL\n", + if_name(ifp), ip6_sprintf(taddr)); } else { - nd6log2((LOG_DEBUG, "%s: sending NS who has %s on ALL " + nd6log2(debug, "%s: sending NS who has %s on ALL " "(except %s)\n", if_name(ifp), - ip6_sprintf(taddr), if_name(exclifp))); + ip6_sprintf(taddr), if_name(exclifp)); } SLIST_INIT(&ndprl_head); @@ -1001,10 +1001,10 @@ nd6_prproxy_ns_output(struct ifnet *ifp, struct ifnet *exclifp, NDPR_LOCK(pr); if (pr->ndpr_stateflags & NDPRF_ONLINK) { NDPR_UNLOCK(pr); - nd6log2((LOG_DEBUG, + nd6log2(debug, "%s: Sending cloned NS who has %s, originally " "on %s\n", if_name(fwd_ifp), - ip6_sprintf(taddr), if_name(ifp))); + ip6_sprintf(taddr), if_name(ifp)); nd6_ns_output(fwd_ifp, daddr, taddr, NULL, NULL); } else { @@ -1133,12 +1133,12 @@ nd6_prproxy_ns_input(struct ifnet *ifp, struct in6_addr *saddr, NDPR_LOCK(pr); if (pr->ndpr_stateflags & NDPRF_ONLINK) { NDPR_UNLOCK(pr); - nd6log2((LOG_DEBUG, + nd6log2(debug, "%s: Forwarding NS (%s) from %s to %s who " "has %s, originally on %s\n", if_name(fwd_ifp), ndprl->ndprl_sol ? "NUD/AR" : "DAD", ip6_sprintf(saddr), ip6_sprintf(daddr), - ip6_sprintf(taddr), if_name(ifp))); + ip6_sprintf(taddr), if_name(ifp)); nd6_ns_output(fwd_ifp, ndprl->ndprl_sol ? taddr : NULL, taddr, NULL, nonce); @@ -1278,20 +1278,20 @@ nd6_prproxy_na_input(struct ifnet *ifp, struct in6_addr *saddr, if (send_na) { if (!ndprl->ndprl_sol) { - nd6log2((LOG_DEBUG, + nd6log2(debug, "%s: Forwarding NA (DAD) from %s to %s " "tgt is %s, originally on %s\n", if_name(fwd_ifp), ip6_sprintf(saddr), ip6_sprintf(&daddr), - ip6_sprintf(taddr), if_name(ifp))); + ip6_sprintf(taddr), if_name(ifp)); } else { - nd6log2((LOG_DEBUG, + nd6log2(debug, "%s: Forwarding NA (NUD/AR) from %s to " "%s (was %s) tgt is %s, originally on " "%s\n", if_name(fwd_ifp), ip6_sprintf(saddr), ip6_sprintf(&daddr), ip6_sprintf(daddr0), - ip6_sprintf(taddr), if_name(ifp))); + ip6_sprintf(taddr), if_name(ifp)); } nd6_na_output(fwd_ifp, &daddr, taddr, flags, 1, NULL); diff --git a/bsd/netinet6/nd6_rtr.c b/bsd/netinet6/nd6_rtr.c index 53259c5ea..e0c8b4864 100644 --- a/bsd/netinet6/nd6_rtr.c +++ b/bsd/netinet6/nd6_rtr.c @@ -285,11 +285,11 @@ nd6_rs_input( } /* Sanity checks */ - if (ip6->ip6_hlim != 255) { - nd6log((LOG_ERR, + if (ip6->ip6_hlim != IPV6_MAXHLIM) { + nd6log(error, "nd6_rs_input: invalid hlim (%d) from %s to %s on %s\n", ip6->ip6_hlim, ip6_sprintf(&ip6->ip6_src), - ip6_sprintf(&ip6->ip6_dst), if_name(ifp))); + ip6_sprintf(&ip6->ip6_dst), if_name(ifp)); goto bad; } @@ -308,8 +308,8 @@ nd6_rs_input( src_sa6.sin6_len = sizeof(src_sa6); src_sa6.sin6_addr = ip6->ip6_src; if (!nd6_is_addr_neighbor(&src_sa6, ifp, 0)) { - nd6log((LOG_INFO, "nd6_rs_input: " - "RS packet from non-neighbor\n")); + nd6log(info, "nd6_rs_input: " + "RS packet from non-neighbor\n"); goto freeit; } } @@ -319,8 +319,8 @@ nd6_rs_input( icmp6len -= sizeof(*nd_rs); nd6_option_init(nd_rs + 1, icmp6len, &ndopts); if (nd6_options(&ndopts) < 0) { - nd6log((LOG_INFO, - "nd6_rs_input: invalid ND option, ignored\n")); + nd6log(info, + "nd6_rs_input: invalid ND option, ignored\n"); /* nd6_options have incremented stats */ goto freeit; } @@ -331,10 +331,10 @@ nd6_rs_input( } if (lladdr && ((ifp->if_addrlen + 2 + 7) & ~7) != lladdrlen) { - nd6log((LOG_INFO, + nd6log(info, "nd6_rs_input: lladdrlen mismatch for %s " "(if %d, RS packet %d)\n", - ip6_sprintf(&saddr6), ifp->if_addrlen, lladdrlen - 2)); + ip6_sprintf(&saddr6), ifp->if_addrlen, lladdrlen - 2); goto bad; } @@ -409,18 +409,18 @@ nd6_ra_input( ia6 = NULL; } - if (ip6->ip6_hlim != 255) { - nd6log((LOG_ERR, + if (ip6->ip6_hlim != IPV6_MAXHLIM) { + nd6log(error, "nd6_ra_input: invalid hlim (%d) from %s to %s on %s\n", ip6->ip6_hlim, ip6_sprintf(&ip6->ip6_src), - ip6_sprintf(&ip6->ip6_dst), if_name(ifp))); + ip6_sprintf(&ip6->ip6_dst), if_name(ifp)); goto bad; } if (!IN6_IS_ADDR_LINKLOCAL(&saddr6)) { - nd6log((LOG_ERR, + nd6log(error, "nd6_ra_input: src %s is not link-local\n", - ip6_sprintf(&saddr6))); + ip6_sprintf(&saddr6)); goto bad; } @@ -430,8 +430,8 @@ nd6_ra_input( icmp6len -= sizeof(*nd_ra); nd6_option_init(nd_ra + 1, icmp6len, &ndopts); if (nd6_options(&ndopts) < 0) { - nd6log((LOG_INFO, - "nd6_ra_input: invalid ND option, ignored\n")); + nd6log(info, + "nd6_ra_input: invalid ND option, ignored\n"); /* nd6_options have incremented stats */ goto freeit; } @@ -469,12 +469,12 @@ nd6_ra_input( if (ndi->chlim < nd_ra->nd_ra_curhoplimit) { ndi->chlim = nd_ra->nd_ra_curhoplimit; } else if (ndi->chlim != nd_ra->nd_ra_curhoplimit) { - nd6log((LOG_ERR, + nd6log(error, "RA with a lower CurHopLimit sent from " "%s on %s (current = %d, received = %d). " "Ignored.\n", ip6_sprintf(&ip6->ip6_src), if_name(ifp), ndi->chlim, - nd_ra->nd_ra_curhoplimit)); + nd_ra->nd_ra_curhoplimit); } } lck_mtx_unlock(&ndi->lock); @@ -503,18 +503,18 @@ nd6_ra_input( pi = (struct nd_opt_prefix_info *)pt; if (pi->nd_opt_pi_len != 4) { - nd6log((LOG_INFO, + nd6log(info, "nd6_ra_input: invalid option " "len %d for prefix information option, " - "ignored\n", pi->nd_opt_pi_len)); + "ignored\n", pi->nd_opt_pi_len); continue; } if (128 < pi->nd_opt_pi_prefix_len) { - nd6log((LOG_INFO, + nd6log(info, "nd6_ra_input: invalid prefix " "len %d for prefix information option, " - "ignored\n", pi->nd_opt_pi_prefix_len)); + "ignored\n", pi->nd_opt_pi_prefix_len); continue; } @@ -531,10 +531,10 @@ nd6_ra_input( if (IN6_IS_ADDR_UNSPECIFIED(&pi->nd_opt_pi_prefix) || IN6_IS_ADDR_MULTICAST(&pi->nd_opt_pi_prefix) || IN6_IS_ADDR_LINKLOCAL(&pi->nd_opt_pi_prefix)) { - nd6log((LOG_INFO, + nd6log(info, "%s: invalid prefix %s, ignored\n", __func__, - ip6_sprintf(&pi->nd_opt_pi_prefix))); + ip6_sprintf(&pi->nd_opt_pi_prefix)); continue; } @@ -563,17 +563,17 @@ nd6_ra_input( */ if (ip6_only_allow_rfc4193_prefix && !IN6_IS_ADDR_UNIQUE_LOCAL(&pi->nd_opt_pi_prefix)) { - nd6log((LOG_INFO, + nd6log(info, "nd6_ra_input: no SLAAC on prefix %s " "[not RFC 4193]\n", - ip6_sprintf(&pi->nd_opt_pi_prefix))); + ip6_sprintf(&pi->nd_opt_pi_prefix)); pr.ndpr_raf_auto = 0; } else if (!nd6_accept_6to4 && IN6_IS_ADDR_6TO4(&pi->nd_opt_pi_prefix)) { - nd6log((LOG_INFO, + nd6log(info, "%s: no SLAAC on prefix %s " "[6to4]\n", __func__, - ip6_sprintf(&pi->nd_opt_pi_prefix))); + ip6_sprintf(&pi->nd_opt_pi_prefix)); pr.ndpr_raf_auto = 0; } @@ -632,9 +632,9 @@ nd6_ra_input( /* lower bound */ if (mtu < IPV6_MMTU) { - nd6log((LOG_INFO, "nd6_ra_input: bogus mtu option " + nd6log(info, "nd6_ra_input: bogus mtu option " "mtu=%d sent from %s, ignoring\n", - mtu, ip6_sprintf(&ip6->ip6_src))); + mtu, ip6_sprintf(&ip6->ip6_src)); goto skip; } @@ -650,19 +650,19 @@ nd6_ra_input( in6_setmaxmtu(); } } else { - nd6log((LOG_INFO, "nd6_ra_input: bogus mtu " + nd6log(info, "nd6_ra_input: bogus mtu " "mtu=%d sent from %s; " "exceeds maxmtu %d, ignoring\n", mtu, ip6_sprintf(&ip6->ip6_src), - ndi->maxmtu)); + ndi->maxmtu); lck_mtx_unlock(&ndi->lock); } } else { lck_mtx_unlock(&ndi->lock); - nd6log((LOG_INFO, "nd6_ra_input: mtu option " + nd6log(info, "nd6_ra_input: mtu option " "mtu=%d sent from %s; maxmtu unknown, " "ignoring\n", - mtu, ip6_sprintf(&ip6->ip6_src))); + mtu, ip6_sprintf(&ip6->ip6_src)); } } @@ -677,10 +677,10 @@ skip: } if (lladdr && ((ifp->if_addrlen + 2 + 7) & ~7) != lladdrlen) { - nd6log((LOG_INFO, + nd6log(info, "nd6_ra_input: lladdrlen mismatch for %s " "(if %d, RA packet %d)\n", - ip6_sprintf(&saddr6), ifp->if_addrlen, lladdrlen - 2)); + ip6_sprintf(&saddr6), ifp->if_addrlen, lladdrlen - 2); goto bad; } @@ -773,16 +773,16 @@ defrouter_addreq(struct nd_defrouter *new, boolean_t scoped) } if (new->ifp->if_eflags & IFEF_IPV6_ROUTER) { - nd6log2((LOG_INFO, "%s: ignoring router %s, scoped=%d, " + nd6log2(info, "%s: ignoring router %s, scoped=%d, " "static=%d on advertising interface\n", if_name(new->ifp), ip6_sprintf(&new->rtaddr), scoped, - (new->stateflags & NDDRF_STATIC) ? 1 : 0)); + (new->stateflags & NDDRF_STATIC) ? 1 : 0); goto out; } - nd6log2((LOG_INFO, "%s: adding default router %s, scoped=%d, " + nd6log2(info, "%s: adding default router %s, scoped=%d, " "static=%d\n", if_name(new->ifp), ip6_sprintf(&new->rtaddr), - scoped, (new->stateflags & NDDRF_STATIC) ? 1 : 0)); + scoped, (new->stateflags & NDDRF_STATIC) ? 1 : 0); Bzero(&def, sizeof(def)); Bzero(&mask, sizeof(mask)); @@ -826,9 +826,9 @@ defrouter_addreq(struct nd_defrouter *new, boolean_t scoped) new->rtaddr_mapped = gate.sin6_addr; new->stateflags |= NDDRF_MAPPED; - nd6log((LOG_INFO, "%s: Default router %s mapped " - "to ", if_name(new->ifp), ip6_sprintf(&new->rtaddr))); - nd6log((LOG_INFO, "%s\n", ip6_sprintf(&new->rtaddr_mapped))); + nd6log(info, "%s: Default router %s mapped " + "to ", if_name(new->ifp), ip6_sprintf(&new->rtaddr)); + nd6log(info, "%s\n", ip6_sprintf(&new->rtaddr_mapped)); } } @@ -847,10 +847,10 @@ defrouter_addreq(struct nd_defrouter *new, boolean_t scoped) new->stateflags |= NDDRF_IFSCOPE; } } else { - nd6log((LOG_ERR, "%s: failed to add default router " + nd6log(error, "%s: failed to add default router " "%s on %s scoped %d (errno = %d)\n", __func__, ip6_sprintf(&gate.sin6_addr), if_name(new->ifp), - (ifscope != IFSCOPE_NONE), err)); + (ifscope != IFSCOPE_NONE), err); NDDR_LOCK(new); } new->err = err; @@ -907,10 +907,10 @@ defrouter_delreq(struct nd_defrouter *dr) goto out; } - nd6log2((LOG_INFO, "%s: removing default router %s, scoped=%d, " + nd6log2(info, "%s: removing default router %s, scoped=%d, " "static=%d\n", dr->ifp != NULL ? if_name(dr->ifp) : "ANY", ip6_sprintf(&dr->rtaddr), (dr->stateflags & NDDRF_IFSCOPE) ? 1 : 0, - (dr->stateflags & NDDRF_STATIC) ? 1 : 0)); + (dr->stateflags & NDDRF_STATIC) ? 1 : 0); Bzero(&def, sizeof(def)); Bzero(&mask, sizeof(mask)); @@ -951,10 +951,10 @@ defrouter_delreq(struct nd_defrouter *dr) RT_UNLOCK(oldrt); rtfree(oldrt); } else if (err != ESRCH) { - nd6log((LOG_ERR, "%s: failed to delete default router " + nd6log(error, "%s: failed to delete default router " "%s on %s scoped %d (errno = %d)\n", __func__, ip6_sprintf(&gate.sin6_addr), dr->ifp != NULL ? - if_name(dr->ifp) : "ANY", (ifscope != IFSCOPE_NONE), err)); + if_name(dr->ifp) : "ANY", (ifscope != IFSCOPE_NONE), err); } NDDR_LOCK(dr); /* ESRCH means it's no longer in the routing table; ignore it */ @@ -1140,8 +1140,8 @@ defrtrlist_del(struct nd_defrouter *dr) lck_mtx_lock(nd6_mutex); NDDR_REMREF(dr); - nd6log2((LOG_INFO, "%s: freeing defrouter %s\n", if_name(dr->ifp), - ip6_sprintf(&dr->rtaddr))); + nd6log2(info, "%s: freeing defrouter %s\n", if_name(dr->ifp), + ip6_sprintf(&dr->rtaddr)); /* * Delete it from the routing table. */ @@ -1319,26 +1319,32 @@ defrouter_select(struct ifnet *ifp) LCK_MTX_ASSERT(nd6_mutex, LCK_MTX_ASSERT_OWNED); if (ifp == NULL) { - nd6log2((LOG_INFO, - "%s:%d: Return early. NULL interface", - __func__, __LINE__)); - return; + ifp = nd6_defifp; + if (ifp == NULL) { + nd6log2(info, + "%s:%d: Return early. NULL interface", + __func__, __LINE__); + return; + } + nd6log2(info, + "%s:%d: NULL interface passed. Setting to default interface %s.\n", + __func__, __LINE__, if_name(ifp)); } if (ifp == lo_ifp) { - nd6log2((LOG_INFO, + nd6log2(info, "%s:%d: Return early. " "Default router select called for loopback.\n", - __func__, __LINE__)); + __func__, __LINE__); return; } if (ifp->if_eflags & IFEF_IPV6_ROUTER) { - nd6log2((LOG_INFO, + nd6log2(info, "%s:%d: Return early. " "Default router select called for interface" " %s with IFEF_IPV6_ROUTER flag set\n", - __func__, __LINE__, if_name(ifp))); + __func__, __LINE__, if_name(ifp)); return; } @@ -1347,9 +1353,9 @@ defrouter_select(struct ifnet *ifp) * If default router list is empty, there's nothing to be done. */ if (!TAILQ_FIRST(&nd_defrouter)) { - nd6log2((LOG_INFO, + nd6log2(info, "%s:%d: Return early. " - "Default router is empty.\n", __func__, __LINE__)); + "Default router is empty.\n", __func__, __LINE__); return; } @@ -1359,18 +1365,18 @@ defrouter_select(struct ifnet *ifp) */ ndi = ND_IFINFO(ifp); if (!ndi || !ndi->initialized) { - nd6log2((LOG_INFO, + nd6log2(info, "%s:%d: Return early. " "Interface %s's nd_ifinfo not initialized.\n", - __func__, __LINE__, if_name(ifp))); + __func__, __LINE__, if_name(ifp)); return; } if (ndi->ndefrouters == 0) { - nd6log2((LOG_INFO, + nd6log2(info, "%s:%d: Return early. " "%s does not have any default routers.\n", - __func__, __LINE__, if_name(ifp))); + __func__, __LINE__, if_name(ifp)); return; } @@ -1418,11 +1424,11 @@ defrouter_select(struct ifnet *ifp) * there's nothing else to choose from. */ if (ndi->ndefrouters == 1) { - nd6log2((LOG_INFO, + nd6log2(info, "%s:%d: Fast forward default router selection " "as interface %s has learned only one default " "router and there's nothing else to choose from.\n", - __func__, __LINE__, if_name(ifp))); + __func__, __LINE__, if_name(ifp)); VERIFY(selected_dr == NULL && installed_dr == NULL); selected_dr = dr; if (dr->stateflags & NDDRF_INSTALLED) { @@ -1511,9 +1517,9 @@ defrouter_select(struct ifnet *ifp) lck_mtx_lock(nd6_mutex); } else { /* this should not happen; warn for diagnosis */ - nd6log((LOG_ERR, "defrouter_select: more than one " + nd6log(error, "defrouter_select: more than one " "default router is installed for interface :%s.\n", - if_name(ifp))); + if_name(ifp)); NDDR_UNLOCK(dr); } } else { @@ -1541,18 +1547,18 @@ defrouter_select(struct ifnet *ifp) } if (ndi->ndefrouters == 0) { - nd6log2((LOG_INFO, + nd6log2(info, "%s:%d: Interface %s no longer " "has any default routers. Abort.\n", - __func__, __LINE__, if_name(ifp))); + __func__, __LINE__, if_name(ifp)); goto out; } - nd6log2((LOG_INFO, + nd6log2(info, "%s:%d: Iterate default router list again " "for interface %s, as the list seems to have " "changed during release-reaquire of global " "nd6_mutex lock.\n", - __func__, __LINE__, if_name(ifp))); + __func__, __LINE__, if_name(ifp)); is_installed_reachable = FALSE; genid = nd6_defrouter_genid; @@ -1595,12 +1601,12 @@ defrouter_select(struct ifnet *ifp) } if ((selected_dr == NULL) && (installed_dr == NULL)) { - nd6log2((LOG_INFO, + nd6log2(info, "%s:%d: Between release and reaquire of global " "nd6_mutex lock, the list seems to have changed " "and it does not have any default routers for " "interface %s.\n", - __func__, __LINE__, if_name(ifp))); + __func__, __LINE__, if_name(ifp)); goto out; } @@ -1639,10 +1645,10 @@ install_route: */ lck_mtx_unlock(nd6_mutex); if (installed_dr != selected_dr) { - nd6log((LOG_INFO, + nd6log(info, "%s:%d: Found a better router for interface " "%s. Installing new default route.\n", - __func__, __LINE__, if_name(ifp))); + __func__, __LINE__, if_name(ifp)); if (installed_dr != NULL) { defrouter_delreq(installed_dr); } @@ -1656,18 +1662,18 @@ install_route: (installed_dr->ifp == nd6_defifp)) || (!(installed_dr->stateflags & NDDRF_IFSCOPE) && (installed_dr->ifp != nd6_defifp))) { - nd6log((LOG_INFO, + nd6log(info, "%s:%d: Need to reinstall default route for interface " "%s as its scope has changed.\n", - __func__, __LINE__, if_name(ifp))); + __func__, __LINE__, if_name(ifp)); defrouter_delreq(installed_dr); defrouter_addreq(installed_dr, (installed_dr->ifp != nd6_defifp)); } else { - nd6log2((LOG_INFO, + nd6log2(info, "%s:%d: No need to change the default " "route for interface %s.\n", - __func__, __LINE__, if_name(ifp))); + __func__, __LINE__, if_name(ifp)); } lck_mtx_lock(nd6_mutex); out: @@ -1795,8 +1801,8 @@ defrtrlist_update_common(struct nd_defrouter *new, boolean_t scoped) VERIFY(ndi->ndefrouters != 0); lck_mtx_unlock(&ndi->lock); - nd6log2((LOG_INFO, "%s: allocating defrouter %s\n", if_name(ifp), - ip6_sprintf(&new->rtaddr))); + nd6log2(info, "%s: allocating defrouter %s\n", if_name(ifp), + ip6_sprintf(&new->rtaddr)); getmicrotime(&caltime); NDDR_LOCK(n); @@ -2022,11 +2028,11 @@ nd6_prelist_add(struct nd_prefix *pr, struct nd_defrouter *dr, if ((e = nd6_prefix_onlink_common(new, force_scoped, new->ndpr_ifp->if_index)) != 0) { - nd6log((LOG_ERR, "nd6_prelist_add: failed to make " + nd6log(error, "nd6_prelist_add: failed to make " "the prefix %s/%d on-link %s on %s (errno=%d)\n", ip6_sprintf(&new->ndpr_prefix.sin6_addr), new->ndpr_plen, force_scoped ? "scoped" : - "non-scoped", if_name(ifp), e)); + "non-scoped", if_name(ifp), e); /* proceed anyway. XXX: is it correct? */ } } @@ -2082,10 +2088,10 @@ prelist_remove(struct nd_prefix *pr) NDPR_UNLOCK(pr); lck_mtx_unlock(nd6_mutex); if ((e = nd6_prefix_offlink(pr)) != 0) { - nd6log((LOG_ERR, "prelist_remove: failed to make " + nd6log(error, "prelist_remove: failed to make " "%s/%d offlink on %s, errno=%d\n", ip6_sprintf(&pr->ndpr_prefix.sin6_addr), - pr->ndpr_plen, if_name(ifp), e)); + pr->ndpr_plen, if_name(ifp), e); /* what should we do? */ } lck_mtx_lock(nd6_mutex); @@ -2198,12 +2204,12 @@ prelist_update( NDPR_UNLOCK(pr); if ((e = nd6_prefix_onlink(pr)) != 0) { - nd6log((LOG_ERR, + nd6log(error, "prelist_update: failed to make " "the prefix %s/%d on-link on %s " "(errno=%d)\n", ip6_sprintf(&pr->ndpr_prefix.sin6_addr), - pr->ndpr_plen, if_name(pr->ndpr_ifp), e)); + pr->ndpr_plen, if_name(pr->ndpr_ifp), e); /* proceed anyway. XXX: is it correct? */ } NDPR_LOCK(pr); @@ -2231,12 +2237,12 @@ prelist_update( error = nd6_prelist_add(new, dr, &pr, FALSE); if (error != 0 || pr == NULL) { - nd6log((LOG_NOTICE, "prelist_update: " + nd6log(info, "prelist_update: " "nd6_prelist_add failed for %s/%d on %s " "errno=%d, returnpr=0x%llx\n", ip6_sprintf(&new->ndpr_prefix.sin6_addr), new->ndpr_plen, if_name(new->ndpr_ifp), - error, (uint64_t)VM_KERNEL_ADDRPERM(pr))); + error, (uint64_t)VM_KERNEL_ADDRPERM(pr)); goto end; /* we should just give up in this case. */ } } @@ -2447,10 +2453,10 @@ prelist_update( if (ip6_use_tempaddr) { int e; if ((e = in6_tmpifadd(ia6, 1)) != 0) { - nd6log((LOG_NOTICE, "prelist_update: " + nd6log(info, "prelist_update: " "failed to create a temporary " "address, errno=%d\n", - e)); + e); } } IFA_REMREF(&ia6->ia_ifa); @@ -2485,8 +2491,7 @@ prelist_update( IN6_CLAT46_EVENT_V6_ADDR_CONFFAIL, 0, tmp_uuid); - nd6log0((LOG_ERR, "Could not configure CLAT46 address on interface " - "%s.\n", ifp->if_xname)); + nd6log0(error, "Could not configure CLAT46 address on interface %s.\n", ifp->if_xname); } /* * Reset the error as we do not want to @@ -3054,11 +3059,11 @@ pfxlist_onlink_check(void) NDPR_UNLOCK(pr); lck_mtx_unlock(nd6_mutex); if ((e = nd6_prefix_offlink(pr)) != 0) { - nd6log((LOG_ERR, + nd6log(error, "pfxlist_onlink_check: failed to " "make %s/%d offlink, errno=%d\n", ip6_sprintf(&pr->ndpr_prefix.sin6_addr), - pr->ndpr_plen, e)); + pr->ndpr_plen, e); } lck_mtx_lock(nd6_mutex); NDPR_REMREF(pr); @@ -3070,11 +3075,11 @@ pfxlist_onlink_check(void) pr->ndpr_raf_onlink) { NDPR_UNLOCK(pr); if ((e = nd6_prefix_onlink(pr)) != 0) { - nd6log((LOG_ERR, + nd6log(error, "pfxlist_onlink_check: failed to " "make %s/%d offlink, errno=%d\n", ip6_sprintf(&pr->ndpr_prefix.sin6_addr), - pr->ndpr_plen, e)); + pr->ndpr_plen, e); } NDPR_REMREF(pr); pr = nd_prefix.lh_first; @@ -3112,8 +3117,8 @@ pfxlist_onlink_check(void) err = ifnet_get_address_list_family_internal(NULL, &ifap, AF_INET6, 0, M_NOWAIT, 0); if (err != 0 || ifap == NULL) { - nd6log((LOG_ERR, "%s: ifnet_get_address_list_family_internal " - "failed", __func__)); + nd6log(error, "%s: ifnet_get_address_list_family_internal " + "failed", __func__); return; } for (i = 0; ifap[i]; i++) { @@ -3286,61 +3291,61 @@ nd6_prefix_sync(struct ifnet *ifp) err = nd6_prefix_offlink(opr); lck_mtx_lock(nd6_mutex); if (err != 0) { - nd6log((LOG_ERR, + nd6log(error, "%s: failed to make %s/%d offlink on %s, " "errno=%d\n", __func__, ip6_sprintf(&opr->ndpr_prefix.sin6_addr), - opr->ndpr_plen, if_name(opr->ndpr_ifp), err)); + opr->ndpr_plen, if_name(opr->ndpr_ifp), err); } } else { - nd6log((LOG_ERR, + nd6log(error, "%s: scoped %s/%d on %s has no matching unscoped prefix\n", __func__, ip6_sprintf(&pr->ndpr_prefix.sin6_addr), - pr->ndpr_plen, if_name(pr->ndpr_ifp))); + pr->ndpr_plen, if_name(pr->ndpr_ifp)); } lck_mtx_unlock(nd6_mutex); err = nd6_prefix_offlink(pr); lck_mtx_lock(nd6_mutex); if (err != 0) { - nd6log((LOG_ERR, + nd6log(error, "%s: failed to make %s/%d offlink on %s, errno=%d\n", __func__, ip6_sprintf(&pr->ndpr_prefix.sin6_addr), - pr->ndpr_plen, if_name(pr->ndpr_ifp), err)); + pr->ndpr_plen, if_name(pr->ndpr_ifp), err); } /* Add the entries back */ if (opr != NULL) { err = nd6_prefix_onlink_scoped(opr, opr->ndpr_ifp->if_index); if (err != 0) { - nd6log((LOG_ERR, + nd6log(error, "%s: failed to make %s/%d scoped onlink on %s, " "errno=%d\n", __func__, ip6_sprintf(&opr->ndpr_prefix.sin6_addr), - opr->ndpr_plen, if_name(opr->ndpr_ifp), err)); + opr->ndpr_plen, if_name(opr->ndpr_ifp), err); } } err = nd6_prefix_onlink_scoped(pr, IFSCOPE_NONE); if (err != 0) { - nd6log((LOG_ERR, + nd6log(error, "%s: failed to make %s/%d onlink on %s, errno=%d\n", __func__, ip6_sprintf(&pr->ndpr_prefix.sin6_addr), - pr->ndpr_plen, if_name(pr->ndpr_ifp), err)); + pr->ndpr_plen, if_name(pr->ndpr_ifp), err); } if (err != 0) { - nd6log((LOG_ERR, + nd6log(error, "%s: error promoting %s/%d to %s from %s\n", __func__, ip6_sprintf(&pr->ndpr_prefix.sin6_addr), pr->ndpr_plen, if_name(pr->ndpr_ifp), - (opr != NULL) ? if_name(opr->ndpr_ifp) : "NONE")); + (opr != NULL) ? if_name(opr->ndpr_ifp) : "NONE"); } else { - nd6log2((LOG_INFO, + nd6log2(info, "%s: %s/%d promoted, previously on %s\n", if_name(pr->ndpr_ifp), ip6_sprintf(&pr->ndpr_prefix.sin6_addr), pr->ndpr_plen, - (opr != NULL) ? if_name(opr->ndpr_ifp) : "NONE")); + (opr != NULL) ? if_name(opr->ndpr_ifp) : "NONE"); } if (opr != NULL) { @@ -3365,13 +3370,13 @@ nd6_prefix_onlink_common(struct nd_prefix *pr, boolean_t force_scoped, /* sanity check */ NDPR_LOCK(pr); if ((pr->ndpr_stateflags & NDPRF_ONLINK) != 0) { - nd6log((LOG_ERR, + nd6log(error, "%s: %s/%d on %s scoped=%d is already on-link\n", __func__, ip6_sprintf(&pr->ndpr_prefix.sin6_addr), pr->ndpr_plen, if_name(pr->ndpr_ifp), (pr->ndpr_stateflags & NDPRF_IFSCOPE) ? 1 : 0); - NDPR_UNLOCK(pr); - return (EEXIST)); + NDPR_UNLOCK(pr); + return EEXIST; } NDPR_UNLOCK(pr); @@ -3424,11 +3429,11 @@ nd6_prefix_onlink_common(struct nd_prefix *pr, boolean_t force_scoped, * after removing all IPv6 addresses on the receiving * interface. This should, of course, be rare though. */ - nd6log((LOG_NOTICE, + nd6log(info, "nd6_prefix_onlink: failed to find any ifaddr" " to add route for a prefix(%s/%d) on %s\n", ip6_sprintf(&pr->ndpr_prefix.sin6_addr), - pr->ndpr_plen, if_name(ifp))); + pr->ndpr_plen, if_name(ifp)); NDPR_UNLOCK(pr); return 0; } @@ -3483,15 +3488,15 @@ nd6_prefix_onlink_common(struct nd_prefix *pr, boolean_t force_scoped, NDPR_LOCK(pr); } else { NDPR_LOCK(pr); - nd6log((LOG_ERR, "nd6_prefix_onlink: failed to add route for a" - " prefix (%s/%d) on %s, gw=%s, mask=%s, flags=%lx," + nd6log(error, "nd6_prefix_onlink: failed to add route for a" + " prefix (%s/%d) on %s, gw=%s, mask=%s, flags=%x," " scoped=%d, errno = %d\n", ip6_sprintf(&pr->ndpr_prefix.sin6_addr), pr->ndpr_plen, if_name(ifp), ip6_sprintf(&((struct sockaddr_in6 *) (void *)ifa->ifa_addr)->sin6_addr), ip6_sprintf(&mask6.sin6_addr), rtflags, - (ifscope != IFSCOPE_NONE), error)); + (ifscope != IFSCOPE_NONE), error); } NDPR_LOCK_ASSERT_HELD(pr); @@ -3583,11 +3588,11 @@ nd6_prefix_offlink(struct nd_prefix *pr) /* sanity check */ NDPR_LOCK(pr); if ((pr->ndpr_stateflags & NDPRF_ONLINK) == 0) { - nd6log((LOG_ERR, + nd6log(error, "nd6_prefix_offlink: %s/%d on %s scoped=%d is already " "off-link\n", ip6_sprintf(&pr->ndpr_prefix.sin6_addr), pr->ndpr_plen, if_name(pr->ndpr_ifp), - (pr->ndpr_stateflags & NDPRF_IFSCOPE) ? 1 : 0)); + (pr->ndpr_stateflags & NDPRF_IFSCOPE) ? 1 : 0); NDPR_UNLOCK(pr); return EEXIST; } @@ -3622,11 +3627,11 @@ nd6_prefix_offlink(struct nd_prefix *pr) RT_UNLOCK(rt); rtfree(rt); } else { - nd6log((LOG_ERR, + nd6log(error, "nd6_prefix_offlink: failed to delete route: " "%s/%d on %s, scoped %d, (errno = %d)\n", ip6_sprintf(&sa6.sin6_addr), plen, if_name(ifp), - (ifscope != IFSCOPE_NONE), error)); + (ifscope != IFSCOPE_NONE), error); } if (ndpr_rt != NULL) { @@ -3700,16 +3705,16 @@ in6_pfx_newpersistaddr(struct nd_prefix *pr, int mcast, int *errorp, boolean_t i * stateless autoconfiguration not yet well-defined for IID * lengths other than 64 octets. Just give up for now. */ - nd6log((LOG_INFO, "%s: IID length not 64 octets (%s)\n", - __func__, if_name(ifp))); + nd6log(info, "%s: IID length not 64 octets (%s)\n", + __func__, if_name(ifp)); goto unlock1; } if (iidlen + pr->ndpr_plen != 128) { error = EADDRNOTAVAIL; - nd6log((LOG_INFO, + nd6log(info, "%s: invalid prefix length %d for %s, ignored\n", - __func__, pr->ndpr_plen, if_name(ifp))); + __func__, pr->ndpr_plen, if_name(ifp)); goto unlock1; } @@ -3741,8 +3746,8 @@ in6_pfx_newpersistaddr(struct nd_prefix *pr, int mcast, int *errorp, boolean_t i ia6 = in6ifa_ifpforlinklocal(ifp, 0); if (ia6 == NULL) { error = EADDRNOTAVAIL; - nd6log((LOG_INFO, "%s: no link-local address (%s)\n", - __func__, if_name(ifp))); + nd6log(info, "%s: no link-local address (%s)\n", + __func__, if_name(ifp)); goto done; } @@ -3796,11 +3801,11 @@ in6_pfx_newpersistaddr(struct nd_prefix *pr, int mcast, int *errorp, boolean_t i } } else { if (!is_clat46) { - nd6log((LOG_ERR, "%s: no CGA available (%s)\n", - __func__, if_name(ifp))); + nd6log(error, "%s: no CGA available (%s)\n", + __func__, if_name(ifp)); } else { - nd6log((LOG_ERR, "%s: no CLAT46 available (%s)\n", - __func__, if_name(ifp))); + nd6log(error, "%s: no CLAT46 available (%s)\n", + __func__, if_name(ifp)); } goto done; } @@ -3850,10 +3855,10 @@ in6_pfx_newpersistaddr(struct nd_prefix *pr, int mcast, int *errorp, boolean_t i } error = in6_update_ifa(ifp, &ifra, ifaupdate, &ia6); if (error != 0) { - nd6log((LOG_ERR, + nd6log(error, "%s: failed to make ifaddr %s on %s (errno=%d)\n", __func__, ip6_sprintf(&ifra.ifra_addr.sin6_addr), - if_name(ifp), error)); + if_name(ifp), error); error = EADDRNOTAVAIL; goto done; } @@ -3918,8 +3923,8 @@ again: if ((ia = in6ifa_ifpwithaddr(ifp, &ifra.ifra_addr.sin6_addr)) != NULL) { IFA_REMREF(&ia->ia_ifa); if (trylimit-- == 0) { - nd6log((LOG_NOTICE, "in6_tmpifadd: failed to find " - "a unique random IFID\n")); + nd6log(info, "in6_tmpifadd: failed to find " + "a unique random IFID\n"); return EEXIST; } forcegen = 1; @@ -3974,7 +3979,7 @@ again: ifaupdate = IN6_IFAUPDATE_NOWAIT | IN6_IFAUPDATE_DADDELAY; error = in6_update_ifa(ifp, &ifra, ifaupdate, &newia); if (error != 0) { - nd6log((LOG_ERR, "in6_tmpifadd: failed to add address.\n")); + nd6log(error, "in6_tmpifadd: failed to add address.\n"); return error; } VERIFY(newia != NULL); @@ -3986,7 +3991,7 @@ again: * We lost the race with another thread that has purged * ia0 address; in this case, purge the tmp addr as well. */ - nd6log((LOG_ERR, "in6_tmpifadd: no public address\n")); + nd6log(error, "in6_tmpifadd: no public address\n"); VERIFY(!(ia0->ia6_flags & IN6_IFF_AUTOCONF)); IFA_UNLOCK(&IA6_NONCONST(ia0)->ia_ifa); in6_purgeaddr(&newia->ia_ifa); @@ -4044,9 +4049,9 @@ in6_init_prefix_ltimes(struct nd_prefix *ndpr) /* check if preferred lifetime > valid lifetime. RFC 4862 5.5.3 (c) */ if (ndpr->ndpr_pltime > ndpr->ndpr_vltime) { - nd6log((LOG_INFO, "in6_init_prefix_ltimes: preferred lifetime" + nd6log(info, "in6_init_prefix_ltimes: preferred lifetime" "(%d) is greater than valid lifetime(%d)\n", - (u_int)ndpr->ndpr_pltime, (u_int)ndpr->ndpr_vltime)); + (u_int)ndpr->ndpr_pltime, (u_int)ndpr->ndpr_vltime); return EINVAL; } if (ndpr->ndpr_pltime == ND6_INFINITE_LIFETIME) { @@ -4188,11 +4193,11 @@ nd6_setdefaultiface( } if (nd6_defifp != NULL) { - nd6log((LOG_INFO, "%s: is now the default " + nd6log(info, "%s: is now the default " "interface (was %s)\n", if_name(nd6_defifp), - odef_ifp != NULL ? if_name(odef_ifp) : "NONE")); + odef_ifp != NULL ? if_name(odef_ifp) : "NONE"); } else { - nd6log((LOG_INFO, "No default interface set\n")); + nd6log(info, "No default interface set\n"); } /* diff --git a/bsd/netinet6/raw_ip6.c b/bsd/netinet6/raw_ip6.c index b1ce62b0c..5b2b17517 100644 --- a/bsd/netinet6/raw_ip6.c +++ b/bsd/netinet6/raw_ip6.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2018 Apple Inc. All rights reserved. + * Copyright (c) 2000-2019 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -397,6 +397,9 @@ rip6_output( if (INP_NO_EXPENSIVE(in6p)) { ip6oa.ip6oa_flags |= IP6OAF_NO_EXPENSIVE; } + if (INP_NO_CONSTRAINED(in6p)) { + ip6oa.ip6oa_flags |= IP6OAF_NO_CONSTRAINED; + } if (INP_AWDL_UNRESTRICTED(in6p)) { ip6oa.ip6oa_flags |= IP6OAF_AWDL_UNRESTRICTED; } @@ -710,9 +713,20 @@ rip6_output( * route is not multicast, update outif with that of * the route interface index used by IP. */ - if (rt != NULL && - (outif = rt->rt_ifp) != in6p->in6p_last_outifp) { - in6p->in6p_last_outifp = outif; + if (rt != NULL) { + /* + * When an NECP IP tunnel policy forces the outbound interface, + * ip6_output_list() informs the transport layer what is the actual + * outgoing interface + */ + if (ip6oa.ip6oa_flags & IP6OAF_BOUND_IF) { + outif = ifindex2ifnet[ip6oa.ip6oa_boundif]; + } else { + outif = rt->rt_ifp; + } + if (outif != NULL) { + in6p->in6p_last_outifp = outif; + } } } else { ROUTE_RELEASE(&in6p->in6p_route); @@ -723,7 +737,7 @@ rip6_output( * denied access to it, generate an event. */ if (error != 0 && (ip6oa.ip6oa_retflags & IP6OARF_IFDENIED) && - (INP_NO_CELLULAR(in6p) || INP_NO_EXPENSIVE(in6p))) { + (INP_NO_CELLULAR(in6p) || INP_NO_EXPENSIVE(in6p) || INP_NO_CONSTRAINED(in6p))) { soevent(in6p->inp_socket, (SO_FILT_HINT_LOCKED | SO_FILT_HINT_IFDENIED)); } diff --git a/bsd/netinet6/udp6_output.c b/bsd/netinet6/udp6_output.c index 7898c179f..66025ca43 100644 --- a/bsd/netinet6/udp6_output.c +++ b/bsd/netinet6/udp6_output.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2018 Apple Inc. All rights reserved. + * Copyright (c) 2000-2019 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -200,6 +200,9 @@ udp6_output(struct in6pcb *in6p, struct mbuf *m, struct sockaddr *addr6, if (INP_NO_EXPENSIVE(in6p)) { ip6oa.ip6oa_flags |= IP6OAF_NO_EXPENSIVE; } + if (INP_NO_CONSTRAINED(in6p)) { + ip6oa.ip6oa_flags |= IP6OAF_NO_CONSTRAINED; + } if (INP_AWDL_UNRESTRICTED(in6p)) { ip6oa.ip6oa_flags |= IP6OAF_AWDL_UNRESTRICTED; } @@ -568,6 +571,9 @@ udp6_output(struct in6pcb *in6p, struct mbuf *m, struct sockaddr *addr6, VERIFY(in6p->inp_sndinprog_cnt > 0); if (--in6p->inp_sndinprog_cnt == 0) { in6p->inp_flags &= ~(INP_FC_FEEDBACK); + if (in6p->inp_sndingprog_waiters > 0) { + wakeup(&in6p->inp_sndinprog_cnt); + } } if (ro.ro_rt != NULL) { @@ -612,16 +618,27 @@ udp6_output(struct in6pcb *in6p, struct mbuf *m, struct sockaddr *addr6, * If the destination route is unicast, update outif * with that of the route interface used by IP. */ - if (rt != NULL && - (outif = rt->rt_ifp) != in6p->in6p_last_outifp) { - in6p->in6p_last_outifp = outif; - - so->so_pktheadroom = P2ROUNDUP( - sizeof(struct udphdr) + - hlen + - ifnet_hdrlen(outif) + - ifnet_mbuf_packetpreamblelen(outif), - sizeof(u_int32_t)); + if (rt != NULL) { + /* + * When an NECP IP tunnel policy forces the outbound interface, + * ip6_output_list() informs the transport layer what is the actual + * outgoing interface + */ + if (ip6oa.ip6oa_flags & IP6OAF_BOUND_IF) { + outif = ifindex2ifnet[ip6oa.ip6oa_boundif]; + } else { + outif = rt->rt_ifp; + } + if (outif != NULL && outif != in6p->in6p_last_outifp) { + in6p->in6p_last_outifp = outif; + + so->so_pktheadroom = P2ROUNDUP( + sizeof(struct udphdr) + + hlen + + ifnet_hdrlen(outif) + + ifnet_mbuf_packetpreamblelen(outif), + sizeof(u_int32_t)); + } } } else { ROUTE_RELEASE(&in6p->in6p_route); @@ -632,7 +649,7 @@ udp6_output(struct in6pcb *in6p, struct mbuf *m, struct sockaddr *addr6, * socket is denied access to it, generate an event. */ if (error != 0 && (ip6oa.ip6oa_retflags & IP6OARF_IFDENIED) && - (INP_NO_CELLULAR(in6p) || INP_NO_EXPENSIVE(in6p))) { + (INP_NO_CELLULAR(in6p) || INP_NO_EXPENSIVE(in6p) || INP_NO_CONSTRAINED(in6p))) { soevent(in6p->inp_socket, (SO_FILT_HINT_LOCKED | SO_FILT_HINT_IFDENIED)); } diff --git a/bsd/netinet6/udp6_usrreq.c b/bsd/netinet6/udp6_usrreq.c index 64b57f861..9b4c3a16e 100644 --- a/bsd/netinet6/udp6_usrreq.c +++ b/bsd/netinet6/udp6_usrreq.c @@ -484,13 +484,14 @@ udp6_input(struct mbuf **mp, int *offp, int proto) #if IPSEC /* * UDP to port 4500 with a payload where the first four bytes are - * not zero is a UDP encapsulated IPSec packet. Packets where + * not zero is a UDP encapsulated IPsec packet. Packets where * the payload is one byte and that byte is 0xFF are NAT keepalive - * packets. Decapsulate the ESP packet and carry on with IPSec input + * packets. Decapsulate the ESP packet and carry on with IPsec input * or discard the NAT keep-alive. */ if (ipsec_bypass == 0 && (esp_udp_encap_port & 0xFFFF) != 0 && - uh->uh_dport == ntohs((u_short)esp_udp_encap_port)) { + (uh->uh_dport == ntohs((u_short)esp_udp_encap_port) || + uh->uh_sport == ntohs((u_short)esp_udp_encap_port))) { int payload_len = ulen - sizeof(struct udphdr) > 4 ? 4 : ulen - sizeof(struct udphdr); @@ -515,7 +516,7 @@ udp6_input(struct mbuf **mp, int *offp, int proto) goto bad; } else if (payload_len == 4 && *(u_int32_t*)(void *) ((caddr_t)uh + sizeof(struct udphdr)) != 0) { - /* UDP encapsulated IPSec packet to pass through NAT */ + /* UDP encapsulated IPsec packet to pass through NAT */ /* preserve the udp header */ *offp = off + sizeof(struct udphdr); return esp6_input(mp, offp, IPPROTO_UDP); diff --git a/bsd/netkey/key.c b/bsd/netkey/key.c index a340f34b7..e1230e472 100644 --- a/bsd/netkey/key.c +++ b/bsd/netkey/key.c @@ -119,10 +119,6 @@ #include #endif #endif -#include -#if INET6 -#include -#endif /* randomness */ @@ -546,7 +542,6 @@ static void key_getcomb_setlifetime(struct sadb_comb *); static struct mbuf *key_getcomb_esp(void); #endif static struct mbuf *key_getcomb_ah(void); -static struct mbuf *key_getcomb_ipcomp(void); static struct mbuf *key_getprop(const struct secasindex *); static int key_acquire(struct secasindex *, struct secpolicy *); @@ -612,6 +607,7 @@ key_init(struct protosw *pp, struct domain *dp) VERIFY((pp->pr_flags & (PR_INITIALIZED | PR_ATTACHED)) == PR_ATTACHED); _CASSERT(PFKEY_ALIGN8(sizeof(struct sadb_msg)) <= _MHLEN); + _CASSERT(MAX_REPLAY_WINDOWS == MBUF_TC_MAX); if (key_initialized) { return; @@ -670,6 +666,7 @@ key_init(struct protosw *pp, struct domain *dp) /* initialize key statistics */ keystat.getspi_count = 1; + esp_init(); #ifndef __APPLE__ printf("IPsec: Initialized Security Association Processing.\n"); #endif @@ -898,7 +895,7 @@ key_alloc_outbound_sav_for_interface(ifnet_t interface, int family, } } - /* This SAH is linked to the IPSec interface, and the right family. We found it! */ + /* This SAH is linked to the IPsec interface, and the right family. We found it! */ if (key_preferred_oldsa) { saorder_state_valid = saorder_state_valid_prefer_old; arraysize = _ARRAYLEN(saorder_state_valid_prefer_old); @@ -1915,7 +1912,6 @@ key_msg2sp( switch (xisr->sadb_x_ipsecrequest_proto) { case IPPROTO_ESP: case IPPROTO_AH: - case IPPROTO_IPCOMP: break; default: ipseclog((LOG_DEBUG, @@ -3943,6 +3939,7 @@ key_newsav( LIST_INSERT_TAIL(&sah->savtree[SADB_SASTATE_LARVAL], newsav, secasvar, chain); ipsec_sav_count++; + ipsec_monitor_sleep_wake(); return newsav; } @@ -4111,9 +4108,12 @@ key_delsav( KFREE(sav->sched); sav->sched = NULL; } - if (sav->replay != NULL) { - keydb_delsecreplay(sav->replay); - sav->replay = NULL; + + for (int i = 0; i < MAX_REPLAY_WINDOWS; i++) { + if (sav->replay[i] != NULL) { + keydb_delsecreplay(sav->replay[i]); + sav->replay[i] = NULL; + } } if (sav->lft_c != NULL) { KFREE(sav->lft_c); @@ -4298,7 +4298,9 @@ key_setsaval( } /* initialization */ - sav->replay = NULL; + for (int i = 0; i < MAX_REPLAY_WINDOWS; i++) { + sav->replay[i] = NULL; + } sav->key_auth = NULL; sav->key_enc = NULL; sav->sched = NULL; @@ -4337,6 +4339,7 @@ key_setsaval( error = EINVAL; goto fail; } + sav->natt_encapsulated_src_port = ((const struct sadb_sa_2*)(sa0))->sadb_sa_natt_src_port; sav->remote_ike_port = ((const struct sadb_sa_2*)(sa0))->sadb_sa_natt_port; sav->natt_interval = ((const struct sadb_sa_2*)(sa0))->sadb_sa_natt_interval; sav->natt_offload_interval = ((const struct sadb_sa_2*)(sa0))->sadb_sa_natt_offload_interval; @@ -4356,11 +4359,28 @@ key_setsaval( /* replay window */ if ((sa0->sadb_sa_flags & SADB_X_EXT_OLD) == 0) { - sav->replay = keydb_newsecreplay(sa0->sadb_sa_replay); - if (sav->replay == NULL) { - ipseclog((LOG_DEBUG, "key_setsaval: No more memory.\n")); - error = ENOBUFS; - goto fail; + if ((sav->flags2 & SADB_X_EXT_SA2_SEQ_PER_TRAFFIC_CLASS) == + SADB_X_EXT_SA2_SEQ_PER_TRAFFIC_CLASS) { + uint32_t range = (1ULL << (sizeof(((struct secreplay *)0)->count) * 8)) / MAX_REPLAY_WINDOWS; + for (int i = 0; i < MAX_REPLAY_WINDOWS; i++) { + sav->replay[i] = keydb_newsecreplay(sa0->sadb_sa_replay); + if (sav->replay[i] == NULL) { + ipseclog((LOG_DEBUG, "key_setsaval: No more memory.\n")); + error = ENOBUFS; + goto fail; + } + /* Allowed range for sequence per traffic class */ + sav->replay[i]->count = i * range; + sav->replay[i]->lastseq = ((i + 1) * range) - 1; + } + } else { + sav->replay[0] = keydb_newsecreplay(sa0->sadb_sa_replay); + if (sav->replay[0] == NULL) { + ipseclog((LOG_DEBUG, "key_setsaval: No more memory.\n")); + error = ENOBUFS; + goto fail; + } + sav->replay[0]->lastseq = ~0; } } } @@ -4387,7 +4407,6 @@ key_setsaval( error = EINVAL; } break; - case SADB_X_SATYPE_IPCOMP: default: error = EINVAL; break; @@ -4434,12 +4453,6 @@ key_setsaval( goto fail; } break; - case SADB_X_SATYPE_IPCOMP: - if (len != PFKEY_ALIGN8(sizeof(struct sadb_key))) { - error = EINVAL; - } - sav->key_enc = NULL; /*just in case*/ - break; case SADB_SATYPE_AH: default: error = EINVAL; @@ -4485,7 +4498,6 @@ key_setsaval( #endif break; case SADB_SATYPE_AH: - case SADB_X_SATYPE_IPCOMP: break; default: ipseclog((LOG_DEBUG, "key_setsaval: invalid SA type.\n")); @@ -4567,9 +4579,11 @@ key_setsaval( fail: /* initialization */ - if (sav->replay != NULL) { - keydb_delsecreplay(sav->replay); - sav->replay = NULL; + for (int i = 0; i < MAX_REPLAY_WINDOWS; i++) { + if (sav->replay[i] != NULL) { + keydb_delsecreplay(sav->replay[i]); + sav->replay[i] = NULL; + } } if (sav->key_auth != NULL) { bzero(_KEYBUF(sav->key_auth), _KEYLEN(sav->key_auth)); @@ -4641,7 +4655,10 @@ key_setsaval2(struct secasvar *sav, LCK_MTX_ASSERT(sadb_mutex, LCK_MTX_ASSERT_OWNED); /* initialization */ - sav->replay = NULL; + for (int i = 0; i < MAX_REPLAY_WINDOWS; i++) { + sav->replay[i] = NULL; + } + sav->key_auth = NULL; sav->key_enc = NULL; sav->sched = NULL; @@ -4688,11 +4705,28 @@ key_setsaval2(struct secasvar *sav, /* replay window */ if ((flags & SADB_X_EXT_OLD) == 0) { - sav->replay = keydb_newsecreplay(replay); - if (sav->replay == NULL) { - ipseclog((LOG_DEBUG, "key_setsaval: No more memory.\n")); - error = ENOBUFS; - goto fail; + if ((sav->flags2 & SADB_X_EXT_SA2_SEQ_PER_TRAFFIC_CLASS) == + SADB_X_EXT_SA2_SEQ_PER_TRAFFIC_CLASS) { + uint32_t range = (1ULL << (sizeof(((struct secreplay *)0)->count) * 8)) / MAX_REPLAY_WINDOWS; + for (int i = 0; i < MAX_REPLAY_WINDOWS; i++) { + sav->replay[i] = keydb_newsecreplay(replay); + if (sav->replay[i] == NULL) { + ipseclog((LOG_DEBUG, "key_setsaval: No more memory.\n")); + error = ENOBUFS; + goto fail; + } + /* Allowed range for sequence per traffic class */ + sav->replay[i]->count = i * range; + sav->replay[i]->lastseq = ((i + 1) * range) - 1; + } + } else { + sav->replay[0] = keydb_newsecreplay(replay); + if (sav->replay[0] == NULL) { + ipseclog((LOG_DEBUG, "key_setsaval: No more memory.\n")); + error = ENOBUFS; + goto fail; + } + sav->replay[0]->lastseq = ~0; } } @@ -4792,9 +4826,11 @@ key_setsaval2(struct secasvar *sav, fail: /* initialization */ - if (sav->replay != NULL) { - keydb_delsecreplay(sav->replay); - sav->replay = NULL; + for (int i = 0; i < MAX_REPLAY_WINDOWS; i++) { + if (sav->replay[i] != NULL) { + keydb_delsecreplay(sav->replay[i]); + sav->replay[i] = NULL; + } } if (sav->key_auth != NULL) { bzero(_KEYBUF(sav->key_auth), _KEYLEN(sav->key_auth)); @@ -4895,20 +4931,6 @@ key_mature( checkmask = 2; mustmask = 2; break; - case IPPROTO_IPCOMP: - if (sav->alg_auth != SADB_AALG_NONE) { - ipseclog((LOG_DEBUG, "key_mature: " - "protocol and algorithm mismated.\n")); - return EINVAL; - } - if ((sav->flags & SADB_X_EXT_RAWCPI) == 0 - && ntohl(sav->spi) >= 0x10000) { - ipseclog((LOG_DEBUG, "key_mature: invalid cpi for IPComp.\n")); - return EINVAL; - } - checkmask = 4; - mustmask = 4; - break; default: ipseclog((LOG_DEBUG, "key_mature: Invalid satype.\n")); return EPROTONOSUPPORT; @@ -5000,18 +5022,6 @@ key_mature( #endif } - /* check compression algorithm */ - if ((checkmask & 4) != 0) { - const struct ipcomp_algorithm *algo; - - /* algorithm-dependent check */ - algo = ipcomp_algorithm_lookup(sav->alg_enc); - if (!algo) { - ipseclog((LOG_DEBUG, "key_mature: unknown compression algorithm.\n")); - return EINVAL; - } - } - key_sa_chgstate(sav, SADB_SASTATE_MATURE); return 0; @@ -5060,7 +5070,7 @@ key_setdumpsa( case SADB_X_EXT_SA2: m = key_setsadbxsa2(sav->sah->saidx.mode, - sav->replay ? sav->replay->count : 0, + sav->replay[0] ? sav->replay[0]->count : 0, sav->sah->saidx.reqid, sav->flags2); if (!m) { @@ -5268,7 +5278,7 @@ key_setsadbsa( p->sadb_sa_len = PFKEY_UNIT64(len); p->sadb_sa_exttype = SADB_EXT_SA; p->sadb_sa_spi = sav->spi; - p->sadb_sa_replay = (sav->replay != NULL ? sav->replay->wsize : 0); + p->sadb_sa_replay = (sav->replay[0] != NULL ? sav->replay[0]->wsize : 0); p->sadb_sa_state = sav->state; p->sadb_sa_auth = sav->alg_auth; p->sadb_sa_encrypt = sav->alg_enc; @@ -6684,8 +6694,6 @@ key_satype2proto( return IPPROTO_AH; case SADB_SATYPE_ESP: return IPPROTO_ESP; - case SADB_X_SATYPE_IPCOMP: - return IPPROTO_IPCOMP; default: return 0; } @@ -6706,8 +6714,6 @@ key_proto2satype( return SADB_SATYPE_AH; case IPPROTO_ESP: return SADB_SATYPE_ESP; - case IPPROTO_IPCOMP: - return SADB_X_SATYPE_IPCOMP; default: return 0; } @@ -7063,20 +7069,6 @@ key_do_getnewspi( keymin = key_spi_minval; keymax = key_spi_maxval; } - /* IPCOMP needs 2-byte SPI */ - if (saidx->proto == IPPROTO_IPCOMP) { - u_int32_t t; - if (keymin >= 0x10000) { - keymin = 0xffff; - } - if (keymax >= 0x10000) { - keymax = 0xffff; - } - if (keymin > keymax) { - t = keymin; keymin = keymax; keymax = t; - } - } - if (keymin == keymax) { if (key_checkspidup(saidx, keymin) != NULL) { ipseclog((LOG_DEBUG, "key_do_getnewspi: SPI %u exists already.\n", keymin)); @@ -7419,6 +7411,7 @@ key_migrate(struct socket *so, /* Reset NAT values */ sav->flags = sa0->sadb_sa_flags; + sav->natt_encapsulated_src_port = ((const struct sadb_sa_2*)(sa0))->sadb_sa_natt_src_port; sav->remote_ike_port = ((const struct sadb_sa_2*)(sa0))->sadb_sa_natt_port; sav->natt_interval = ((const struct sadb_sa_2*)(sa0))->sadb_sa_natt_interval; sav->natt_offload_interval = ((const struct sadb_sa_2*)(sa0))->sadb_sa_natt_offload_interval; @@ -8380,55 +8373,6 @@ key_getcomb_ah(void) return m; } -/* - * not really an official behavior. discussed in pf_key@inner.net in Sep2000. - * XXX reorder combinations by preference - */ -static struct mbuf * -key_getcomb_ipcomp(void) -{ - struct sadb_comb *comb; - const struct ipcomp_algorithm *algo; - struct mbuf *m; - int i; - const int l = PFKEY_ALIGN8(sizeof(struct sadb_comb)); - - m = NULL; - for (i = 1; i <= SADB_X_CALG_MAX; i++) { - algo = ipcomp_algorithm_lookup(i); - if (!algo) { - continue; - } - - if (!m) { -#if DIAGNOSTIC - if (l > MLEN) { - panic("assumption failed in key_getcomb_ipcomp"); - } -#endif - MGET(m, M_WAITOK, MT_DATA); - if (m) { - M_ALIGN(m, l); - m->m_len = l; - m->m_next = NULL; - } - } else { - M_PREPEND(m, l, M_WAITOK, 1); - } - if (!m) { - return NULL; - } - - comb = mtod(m, struct sadb_comb *); - bzero(comb, sizeof(*comb)); - key_getcomb_setlifetime(comb); - comb->sadb_comb_encrypt = i; - /* what should we set into sadb_comb_*_{min,max}bits? */ - } - - return m; -} - /* * XXX no way to pass mode (transport/tunnel) to userland * XXX replay checking? @@ -8452,9 +8396,6 @@ key_getprop( case IPPROTO_AH: m = key_getcomb_ah(); break; - case IPPROTO_IPCOMP: - m = key_getcomb_ipcomp(); - break; default: return NULL; } @@ -8494,8 +8435,6 @@ key_getprop( * * XXX x_policy is outside of RFC2367 (KAME extension). * XXX sensitivity is not supported. - * XXX for ipcomp, RFC2367 does not define how to fill in proposal. - * see comment for key_getcomb_ipcomp(). * * OUT: * 0 : succeed @@ -8644,25 +8583,12 @@ key_acquire( /* create proposal/combination extension */ m = key_getprop(saidx); -#if 0 - /* - * spec conformant: always attach proposal/combination extension, - * the problem is that we have no way to attach it for ipcomp, - * due to the way sadb_comb is declared in RFC2367. - */ - if (!m) { - error = ENOBUFS; - goto fail; - } - m_cat(result, m); -#else /* * outside of spec; make proposal/combination extension optional. */ if (m) { m_cat(result, m); } -#endif if ((result->m_flags & M_PKTHDR) == 0) { error = EINVAL; @@ -9123,7 +9049,7 @@ setmsg: } #endif -#if DIGAGNOSTIC +#if DIAGNOSTIC if (off != len) { panic("length assumption failed in key_register"); } @@ -9248,7 +9174,7 @@ key_expire( /* create SA extension */ m = key_setsadbxsa2(sav->sah->saidx.mode, - sav->replay ? sav->replay->count : 0, + sav->replay[0] ? sav->replay[0]->count : 0, sav->sah->saidx.reqid, sav->flags2); if (!m) { @@ -9825,7 +9751,7 @@ key_parse( target = KEY_SENDUP_ONE; if ((m->m_flags & M_PKTHDR) == 0 || - m->m_pkthdr.len != m->m_pkthdr.len) { + m->m_pkthdr.len != orglen) { ipseclog((LOG_DEBUG, "key_parse: invalid message length.\n")); PFKEY_STAT_INCREMENT(pfkeystat.out_invlen); error = EINVAL; @@ -9913,7 +9839,6 @@ key_parse( break; case SADB_SATYPE_AH: case SADB_SATYPE_ESP: - case SADB_X_SATYPE_IPCOMP: switch (msg->sadb_msg_type) { case SADB_X_SPDADD: case SADB_X_SPDDELETE: @@ -10755,7 +10680,7 @@ key_delsp_for_ipsec_if(ifnet_t ipsec_if) LIST_FOREACH(sah, &sahtree, chain) { if (sah->ipsec_if == ipsec_if) { - /* This SAH is linked to the IPSec interface. It now needs to close. */ + /* This SAH is linked to the IPsec interface. It now needs to close. */ ifnet_release(sah->ipsec_if); sah->ipsec_if = NULL; diff --git a/bsd/netkey/key_debug.c b/bsd/netkey/key_debug.c index f1be18104..ac3a0cb0c 100644 --- a/bsd/netkey/key_debug.c +++ b/bsd/netkey/key_debug.c @@ -619,8 +619,8 @@ kdebug_secasv(sav) printf("\n"); } - if (sav->replay != NULL) - kdebug_secreplay(sav->replay); + if (sav->replay[0] != NULL) + kdebug_secreplay(sav->replay[0]); if (sav->lft_c != NULL) kdebug_sadb_lifetime((struct sadb_ext *)sav->lft_c); if (sav->lft_h != NULL) diff --git a/bsd/netkey/keydb.h b/bsd/netkey/keydb.h index 19450e6bd..db7a04ef3 100644 --- a/bsd/netkey/keydb.h +++ b/bsd/netkey/keydb.h @@ -70,6 +70,8 @@ struct secashead { struct route_in6 sa_route; /* route cache */ }; +#define MAX_REPLAY_WINDOWS 4 + /* Security Association */ struct secasvar { LIST_ENTRY(secasvar) chain; @@ -90,7 +92,8 @@ struct secasvar { void *sched; /* intermediate encryption key */ size_t schedlen; - struct secreplay *replay; /* replay prevention */ + struct secreplay *replay[MAX_REPLAY_WINDOWS]; /* replay prevention */ + long created; /* for lifetime */ struct sadb_lifetime *lft_c; /* CURRENT lifetime, it's constant. */ @@ -119,7 +122,7 @@ struct secreplay { u_int32_t count; u_int wsize; /* window size, i.g. 4 bytes */ u_int32_t seq; /* used by sender */ - u_int32_t lastseq; /* used by receiver */ + u_int32_t lastseq; /* used by sender/receiver */ caddr_t bitmap; /* used by receiver */ int overflow; /* overflow flag */ }; diff --git a/bsd/netkey/keysock.c b/bsd/netkey/keysock.c index 9b7f46424..83dd1d874 100644 --- a/bsd/netkey/keysock.c +++ b/bsd/netkey/keysock.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016 Apple Inc. All rights reserved. + * Copyright (c) 2019 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -87,8 +87,8 @@ extern lck_mtx_t *raw_mtx; extern void key_init(struct protosw *, struct domain *); -struct sockaddr key_dst = { 2, PF_KEY, {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, } }; -struct sockaddr key_src = { 2, PF_KEY, {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, } }; +struct sockaddr key_dst = { .sa_len = 2, .sa_family = PF_KEY, .sa_data = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, } }; +struct sockaddr key_src = { .sa_len = 2, .sa_family = PF_KEY, .sa_data = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, } }; static void key_dinit(struct domain *); static int key_sendup0(struct rawcb *, struct mbuf *, int); diff --git a/bsd/nfs/gss/gss_krb5_mech.c b/bsd/nfs/gss/gss_krb5_mech.c index 70f497101..747b7fb6d 100644 --- a/bsd/nfs/gss/gss_krb5_mech.c +++ b/bsd/nfs/gss/gss_krb5_mech.c @@ -1333,6 +1333,9 @@ gss_krb5_cfx_verify_mic_mbuf(uint32_t *minor, /* minor_status */ header.value = mic->value; *minor = krb5_mic_mbuf(cctx, NULL, mbp, offset, len, &header, digest, &verified, 0, 0); + if (*minor) { + return GSS_S_FAILURE; + } //XXX errors and such? Sequencing and replay? Not Supported RPCSEC_GSS memcpy(&seq, token->SND_SEQ, sizeof(uint64_t)); @@ -2171,7 +2174,7 @@ gss_krb5_3des_unwrap_mbuf(uint32_t *minor, break; } wrap.Seal_Alg[0] = 0xff; - wrap.Seal_Alg[0] = 0xff; + wrap.Seal_Alg[1] = 0xff; } if (*minor) { return GSS_S_FAILURE; @@ -2204,12 +2207,12 @@ gss_krb5_3des_unwrap_mbuf(uint32_t *minor, header.value = &wrap; *minor = krb5_mic_mbuf(cctx, &header, smb, 0, length, NULL, hashval, &verified, 0, 0); - if (!verified) { - return GSS_S_BAD_SIG; - } if (*minor) { return GSS_S_FAILURE; } + if (!verified) { + return GSS_S_BAD_SIG; + } /* Get the pad bytes */ *minor = mbuf_copydata(smb, length - 1, 1, &padlen); diff --git a/bsd/nfs/nfs.h b/bsd/nfs/nfs.h index cc09ff674..dcb9647e8 100644 --- a/bsd/nfs/nfs.h +++ b/bsd/nfs/nfs.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2016 Apple Inc. All rights reserved. + * Copyright (c) 2000-2018 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -117,7 +117,7 @@ extern int nfs_ticks; #endif /* default values for unresponsive mount timeouts */ -#define NFS_TPRINTF_INITIAL_DELAY 12 +#define NFS_TPRINTF_INITIAL_DELAY 5 #define NFS_TPRINTF_DELAY 30 /* @@ -187,6 +187,9 @@ extern int nfs_ticks; #define NFS_MATTR_SVCPRINCIPAL 26 /* GSS principal to authenticate to, the server principal */ #define NFS_MATTR_NFS_VERSION_RANGE 27 /* Packed version range to try */ #define NFS_MATTR_KERB_ETYPE 28 /* Enctype to use for kerberos mounts */ +#define NFS_MATTR_LOCAL_NFS_PORT 29 /* Unix domain socket for NFS protocol */ +#define NFS_MATTR_LOCAL_MOUNT_PORT 30 /* Unix domain socket for MOUNT protocol */ +#define NFS_MATTR_SET_MOUNT_OWNER 31 /* Set owner of mount point */ /* NFS mount flags */ #define NFS_MFLAG_SOFT 0 /* soft mount (requests fail if unresponsive) */ @@ -207,6 +210,8 @@ extern int nfs_ticks; #define NFS_MFLAG_NOQUOTA 15 /* don't support QUOTA requests */ #define NFS_MFLAG_MNTUDP 16 /* MOUNT protocol should use UDP */ #define NFS_MFLAG_MNTQUICK 17 /* use short timeouts while mounting */ +/* 18 reserved */ +#define NFS_MFLAG_NOOPAQUE_AUTH 19 /* don't make the mount AUTH_OPAQUE. Used by V3 */ /* Macros for packing and unpacking packed versions */ #define PVER2MAJOR(M) ((uint32_t)(((M) >> 16) & 0xffff)) @@ -1139,22 +1144,25 @@ extern int nfs_request_timer_on; /* mutex for nfs client globals */ extern lck_mtx_t *nfs_global_mutex; +#if CONFIG_NFS4 /* NFSv4 callback globals */ extern int nfs4_callback_timer_on; extern in_port_t nfs4_cb_port, nfs4_cb_port6; +/* nfs 4 default domain for user mapping */ +extern char nfs4_default_domain[MAXPATHLEN]; +/* nfs 4 timer call structure */ +extern thread_call_t nfs4_callback_timer_call; +#endif + /* nfs timer call structures */ extern thread_call_t nfs_request_timer_call; extern thread_call_t nfs_buf_timer_call; -extern thread_call_t nfs4_callback_timer_call; extern thread_call_t nfsrv_idlesock_timer_call; #if CONFIG_FSE extern thread_call_t nfsrv_fmod_timer_call; #endif -/* nfs 4 default domain for user mapping */ -extern char nfs4_default_domain[MAXPATHLEN]; - __BEGIN_DECLS nfstype vtonfs_type(enum vtype, int); @@ -1167,6 +1175,7 @@ void nfs_nhinit(void); void nfs_nhinit_finish(void); u_long nfs_hash(u_char *, int); +#if CONFIG_NFS4 int nfs4_init_clientid(struct nfsmount *); int nfs4_setclientid(struct nfsmount *); int nfs4_renew(struct nfsmount *, int); @@ -1178,8 +1187,10 @@ void nfs4_cb_rcv(socket_t, void *, int); void nfs4_callback_timer(void *, void *); int nfs4_secinfo_rpc(struct nfsmount *, struct nfsreq_secinfo_args *, kauth_cred_t, uint32_t *, int *); int nfs4_get_fs_locations(struct nfsmount *, nfsnode_t, u_char *, int, const char *, vfs_context_t, struct nfs_fs_locations *); -void nfs_fs_locations_cleanup(struct nfs_fs_locations *); void nfs4_default_attrs_for_referral_trigger(nfsnode_t, char *, int, struct nfs_vattr *, fhandle_t *); +#endif + +void nfs_fs_locations_cleanup(struct nfs_fs_locations *); int nfs_sockaddr_cmp(struct sockaddr *, struct sockaddr *); int nfs_connect(struct nfsmount *, int, int); @@ -1257,6 +1268,7 @@ int nfs_dir_buf_cache_lookup(nfsnode_t, nfsnode_t *, struct componentname *, int nfs_dir_buf_search(struct nfsbuf *, struct componentname *, fhandle_t *, struct nfs_vattr *, uint64_t *, time_t *, daddr64_t *, int); void nfs_name_cache_purge(nfsnode_t, nfsnode_t, struct componentname *, vfs_context_t); +#if CONFIG_NFS4 uint32_t nfs4_ace_nfstype_to_vfstype(uint32_t, int *); uint32_t nfs4_ace_vfstype_to_nfstype(uint32_t, int *); uint32_t nfs4_ace_nfsflags_to_vfsflags(uint32_t); @@ -1266,8 +1278,11 @@ uint32_t nfs4_ace_vfsrights_to_nfsmask(uint32_t); int nfs4_id2guid(char *, guid_t *, int); int nfs4_guid2id(guid_t *, char *, size_t *, int); -int nfs_parsefattr(struct nfsm_chain *, int, struct nfs_vattr *); int nfs4_parsefattr(struct nfsm_chain *, struct nfs_fsattr *, struct nfs_vattr *, fhandle_t *, struct dqblk *, struct nfs_fs_locations *); +#endif + +int nfs_parsefattr(struct nfsmount *nmp, struct nfsm_chain *, int, + struct nfs_vattr *); void nfs_vattr_set_supported(uint32_t *, struct vnode_attr *); void nfs_vattr_set_bitmap(struct nfsmount *, uint32_t *, struct vnode_attr *); void nfs3_pathconf_cache(struct nfsmount *, struct nfs_fsattr *); @@ -1277,7 +1292,6 @@ int nfs_node_access_slot(nfsnode_t, uid_t, int); void nfs_vnode_notify(nfsnode_t, uint32_t); void nfs_avoid_needless_id_setting_on_create(nfsnode_t, struct vnode_attr *, vfs_context_t); -int nfs4_create_rpc(vfs_context_t, nfsnode_t, struct componentname *, struct vnode_attr *, int, char *, nfsnode_t *); int nfs_open_state_set_busy(nfsnode_t, thread_t); void nfs_open_state_clear_busy(nfsnode_t); struct nfs_open_owner *nfs_open_owner_find(struct nfsmount *, kauth_cred_t, int); @@ -1296,24 +1310,9 @@ void nfs_open_file_add_open(struct nfs_open_file *, uint32_t, uint32_t, int); void nfs_open_file_remove_open_find(struct nfs_open_file *, uint32_t, uint32_t, uint32_t *, uint32_t *, int*); void nfs_open_file_remove_open(struct nfs_open_file *, uint32_t, uint32_t); void nfs_get_stateid(nfsnode_t, thread_t, kauth_cred_t, nfs_stateid *); -int nfs4_open(nfsnode_t, struct nfs_open_file *, uint32_t, uint32_t, vfs_context_t); -int nfs4_open_delegated(nfsnode_t, struct nfs_open_file *, uint32_t, uint32_t, vfs_context_t); -int nfs_close(nfsnode_t, struct nfs_open_file *, uint32_t, uint32_t, vfs_context_t); int nfs_check_for_locks(struct nfs_open_owner *, struct nfs_open_file *); -int nfs4_reopen(struct nfs_open_file *, thread_t); -int nfs4_open_rpc(struct nfs_open_file *, vfs_context_t, struct componentname *, struct vnode_attr *, vnode_t, vnode_t *, int, int, int); -int nfs4_open_rpc_internal(struct nfs_open_file *, vfs_context_t, thread_t, kauth_cred_t, struct componentname *, struct vnode_attr *, vnode_t, vnode_t *, int, int, int); -int nfs4_open_confirm_rpc(struct nfsmount *, nfsnode_t, u_char *, int, struct nfs_open_owner *, nfs_stateid *, thread_t, kauth_cred_t, struct nfs_vattr *, uint64_t *); -int nfs4_open_reopen_rpc(struct nfs_open_file *, thread_t, kauth_cred_t, struct componentname *, vnode_t, vnode_t *, int, int); -int nfs4_open_reclaim_rpc(struct nfs_open_file *, int, int); -int nfs4_claim_delegated_open_rpc(struct nfs_open_file *, int, int, int); -int nfs4_claim_delegated_state_for_open_file(struct nfs_open_file *, int); -int nfs4_claim_delegated_state_for_node(nfsnode_t, int); -int nfs4_open_downgrade_rpc(nfsnode_t, struct nfs_open_file *, vfs_context_t); -int nfs4_close_rpc(nfsnode_t, struct nfs_open_file *, thread_t, kauth_cred_t, int); -void nfs4_delegation_return_enqueue(nfsnode_t); -int nfs4_delegation_return(nfsnode_t, int, thread_t, kauth_cred_t); -int nfs4_delegreturn_rpc(struct nfsmount *, u_char *, int, struct nfs_stateid *, int, thread_t, kauth_cred_t); +int nfs_close(nfsnode_t, struct nfs_open_file *, uint32_t, uint32_t, vfs_context_t); + void nfs_release_open_state_for_node(nfsnode_t, int); void nfs_revoke_open_state_for_node(nfsnode_t); struct nfs_lock_owner *nfs_lock_owner_find(nfsnode_t, proc_t, int); @@ -1326,15 +1325,35 @@ void nfs_lock_owner_insert_held_lock(struct nfs_lock_owner *, struct nfs_file struct nfs_file_lock *nfs_file_lock_alloc(struct nfs_lock_owner *); void nfs_file_lock_destroy(struct nfs_file_lock *); int nfs_file_lock_conflict(struct nfs_file_lock *, struct nfs_file_lock *, int *); -int nfs4_lock_rpc(nfsnode_t, struct nfs_open_file *, struct nfs_file_lock *, int, int, thread_t, kauth_cred_t); int nfs_unlock_rpc(nfsnode_t, struct nfs_lock_owner *, int, uint64_t, uint64_t, thread_t, kauth_cred_t, int); int nfs_advlock_getlock(nfsnode_t, struct nfs_lock_owner *, struct flock *, uint64_t, uint64_t, vfs_context_t); int nfs_advlock_setlock(nfsnode_t, struct nfs_open_file *, struct nfs_lock_owner *, int, uint64_t, uint64_t, int, short, vfs_context_t); int nfs_advlock_unlock(nfsnode_t, struct nfs_open_file *, struct nfs_lock_owner *, uint64_t, uint64_t, int, vfs_context_t); +#if CONFIG_NFS4 +int nfs4_create_rpc(vfs_context_t, nfsnode_t, struct componentname *, struct vnode_attr *, int, char *, nfsnode_t *); +int nfs4_open(nfsnode_t, struct nfs_open_file *, uint32_t, uint32_t, vfs_context_t); +int nfs4_open_delegated(nfsnode_t, struct nfs_open_file *, uint32_t, uint32_t, vfs_context_t); +int nfs4_reopen(struct nfs_open_file *, thread_t); +int nfs4_open_rpc(struct nfs_open_file *, vfs_context_t, struct componentname *, struct vnode_attr *, vnode_t, vnode_t *, int, int, int); +int nfs4_open_rpc_internal(struct nfs_open_file *, vfs_context_t, thread_t, kauth_cred_t, struct componentname *, struct vnode_attr *, vnode_t, vnode_t *, int, int, int); +int nfs4_open_confirm_rpc(struct nfsmount *, nfsnode_t, u_char *, int, struct nfs_open_owner *, nfs_stateid *, thread_t, kauth_cred_t, struct nfs_vattr *, uint64_t *); +int nfs4_open_reopen_rpc(struct nfs_open_file *, thread_t, kauth_cred_t, struct componentname *, vnode_t, vnode_t *, int, int); +int nfs4_open_reclaim_rpc(struct nfs_open_file *, int, int); +int nfs4_claim_delegated_open_rpc(struct nfs_open_file *, int, int, int); +int nfs4_claim_delegated_state_for_open_file(struct nfs_open_file *, int); +int nfs4_claim_delegated_state_for_node(nfsnode_t, int); +int nfs4_open_downgrade_rpc(nfsnode_t, struct nfs_open_file *, vfs_context_t); +int nfs4_close_rpc(nfsnode_t, struct nfs_open_file *, thread_t, kauth_cred_t, int); +void nfs4_delegation_return_enqueue(nfsnode_t); +int nfs4_delegation_return(nfsnode_t, int, thread_t, kauth_cred_t); +int nfs4_lock_rpc(nfsnode_t, struct nfs_open_file *, struct nfs_file_lock *, int, int, thread_t, kauth_cred_t); +int nfs4_delegreturn_rpc(struct nfsmount *, u_char *, int, struct nfs_stateid *, int, thread_t, kauth_cred_t); + nfsnode_t nfs4_named_attr_dir_get(nfsnode_t, int, vfs_context_t); int nfs4_named_attr_get(nfsnode_t, struct componentname *, uint32_t, int, vfs_context_t, nfsnode_t *, struct nfs_open_file **); int nfs4_named_attr_remove(nfsnode_t, nfsnode_t, const char *, vfs_context_t); +#endif int nfs_mount_state_in_use_start(struct nfsmount *, thread_t); int nfs_mount_state_in_use_end(struct nfsmount *, int); @@ -1355,6 +1374,7 @@ int nfs_vnop_advlock(struct vnop_advlock_args *); int nfs_vnop_mmap(struct vnop_mmap_args *); int nfs_vnop_mnomap(struct vnop_mnomap_args *); +#if CONFIG_NFS4 int nfs4_vnop_create(struct vnop_create_args *); int nfs4_vnop_mknod(struct vnop_mknod_args *); int nfs4_vnop_close(struct vnop_close_args *); @@ -1373,46 +1393,48 @@ int nfs4_vnop_makenamedstream(struct vnop_makenamedstream_args *); int nfs4_vnop_removenamedstream(struct vnop_removenamedstream_args *); #endif +int nfs4_access_rpc(nfsnode_t, u_int32_t *, int, vfs_context_t); +int nfs4_getattr_rpc(nfsnode_t, mount_t, u_char *, size_t, int, vfs_context_t, struct nfs_vattr *, u_int64_t *); +int nfs4_setattr_rpc(nfsnode_t, struct vnode_attr *, vfs_context_t); +int nfs4_read_rpc_async(nfsnode_t, off_t, size_t, thread_t, kauth_cred_t, struct nfsreq_cbinfo *, struct nfsreq **); +int nfs4_read_rpc_async_finish(nfsnode_t, struct nfsreq *, uio_t, size_t *, int *); +int nfs4_write_rpc_async(nfsnode_t, uio_t, size_t, thread_t, kauth_cred_t, int, struct nfsreq_cbinfo *, struct nfsreq **); +int nfs4_write_rpc_async_finish(nfsnode_t, struct nfsreq *, int *, size_t *, uint64_t *); +int nfs4_readdir_rpc(nfsnode_t, struct nfsbuf *, vfs_context_t); +int nfs4_readlink_rpc(nfsnode_t, char *, uint32_t *, vfs_context_t); +int nfs4_commit_rpc(nfsnode_t, uint64_t, uint64_t, kauth_cred_t, uint64_t); +int nfs4_lookup_rpc_async(nfsnode_t, char *, int, vfs_context_t, struct nfsreq **); +int nfs4_lookup_rpc_async_finish(nfsnode_t, char *, int, vfs_context_t, struct nfsreq *, u_int64_t *, fhandle_t *, struct nfs_vattr *); +int nfs4_remove_rpc(nfsnode_t, char *, int, thread_t, kauth_cred_t); +int nfs4_rename_rpc(nfsnode_t, char *, int, nfsnode_t, char *, int, vfs_context_t); +int nfs4_pathconf_rpc(nfsnode_t, struct nfs_fsattr *, vfs_context_t); +int nfs4_setlock_rpc(nfsnode_t, struct nfs_open_file *, struct nfs_file_lock *, int, int, thread_t, kauth_cred_t); +int nfs4_unlock_rpc(nfsnode_t, struct nfs_lock_owner *, int, uint64_t, uint64_t, int, thread_t, kauth_cred_t); +int nfs4_getlock_rpc(nfsnode_t, struct nfs_lock_owner *, struct flock *, uint64_t, uint64_t, vfs_context_t); +#endif + int nfs_read_rpc(nfsnode_t, uio_t, vfs_context_t); int nfs_write_rpc(nfsnode_t, uio_t, vfs_context_t, int *, uint64_t *); int nfs_write_rpc2(nfsnode_t, uio_t, thread_t, kauth_cred_t, int *, uint64_t *); int nfs3_access_rpc(nfsnode_t, u_int32_t *, int, vfs_context_t); -int nfs4_access_rpc(nfsnode_t, u_int32_t *, int, vfs_context_t); int nfs3_getattr_rpc(nfsnode_t, mount_t, u_char *, size_t, int, vfs_context_t, struct nfs_vattr *, u_int64_t *); -int nfs4_getattr_rpc(nfsnode_t, mount_t, u_char *, size_t, int, vfs_context_t, struct nfs_vattr *, u_int64_t *); int nfs3_setattr_rpc(nfsnode_t, struct vnode_attr *, vfs_context_t); -int nfs4_setattr_rpc(nfsnode_t, struct vnode_attr *, vfs_context_t); int nfs3_read_rpc_async(nfsnode_t, off_t, size_t, thread_t, kauth_cred_t, struct nfsreq_cbinfo *, struct nfsreq **); -int nfs4_read_rpc_async(nfsnode_t, off_t, size_t, thread_t, kauth_cred_t, struct nfsreq_cbinfo *, struct nfsreq **); int nfs3_read_rpc_async_finish(nfsnode_t, struct nfsreq *, uio_t, size_t *, int *); -int nfs4_read_rpc_async_finish(nfsnode_t, struct nfsreq *, uio_t, size_t *, int *); int nfs3_write_rpc_async(nfsnode_t, uio_t, size_t, thread_t, kauth_cred_t, int, struct nfsreq_cbinfo *, struct nfsreq **); -int nfs4_write_rpc_async(nfsnode_t, uio_t, size_t, thread_t, kauth_cred_t, int, struct nfsreq_cbinfo *, struct nfsreq **); int nfs3_write_rpc_async_finish(nfsnode_t, struct nfsreq *, int *, size_t *, uint64_t *); -int nfs4_write_rpc_async_finish(nfsnode_t, struct nfsreq *, int *, size_t *, uint64_t *); int nfs3_readdir_rpc(nfsnode_t, struct nfsbuf *, vfs_context_t); -int nfs4_readdir_rpc(nfsnode_t, struct nfsbuf *, vfs_context_t); int nfs3_readlink_rpc(nfsnode_t, char *, uint32_t *, vfs_context_t); -int nfs4_readlink_rpc(nfsnode_t, char *, uint32_t *, vfs_context_t); int nfs3_commit_rpc(nfsnode_t, uint64_t, uint64_t, kauth_cred_t, uint64_t); -int nfs4_commit_rpc(nfsnode_t, uint64_t, uint64_t, kauth_cred_t, uint64_t); int nfs3_lookup_rpc_async(nfsnode_t, char *, int, vfs_context_t, struct nfsreq **); -int nfs4_lookup_rpc_async(nfsnode_t, char *, int, vfs_context_t, struct nfsreq **); int nfs3_lookup_rpc_async_finish(nfsnode_t, char *, int, vfs_context_t, struct nfsreq *, u_int64_t *, fhandle_t *, struct nfs_vattr *); -int nfs4_lookup_rpc_async_finish(nfsnode_t, char *, int, vfs_context_t, struct nfsreq *, u_int64_t *, fhandle_t *, struct nfs_vattr *); int nfs3_remove_rpc(nfsnode_t, char *, int, thread_t, kauth_cred_t); -int nfs4_remove_rpc(nfsnode_t, char *, int, thread_t, kauth_cred_t); int nfs3_rename_rpc(nfsnode_t, char *, int, nfsnode_t, char *, int, vfs_context_t); -int nfs4_rename_rpc(nfsnode_t, char *, int, nfsnode_t, char *, int, vfs_context_t); int nfs3_pathconf_rpc(nfsnode_t, struct nfs_fsattr *, vfs_context_t); -int nfs4_pathconf_rpc(nfsnode_t, struct nfs_fsattr *, vfs_context_t); int nfs3_setlock_rpc(nfsnode_t, struct nfs_open_file *, struct nfs_file_lock *, int, int, thread_t, kauth_cred_t); -int nfs4_setlock_rpc(nfsnode_t, struct nfs_open_file *, struct nfs_file_lock *, int, int, thread_t, kauth_cred_t); int nfs3_unlock_rpc(nfsnode_t, struct nfs_lock_owner *, int, uint64_t, uint64_t, int, thread_t, kauth_cred_t); -int nfs4_unlock_rpc(nfsnode_t, struct nfs_lock_owner *, int, uint64_t, uint64_t, int, thread_t, kauth_cred_t); int nfs3_getlock_rpc(nfsnode_t, struct nfs_lock_owner *, struct flock *, uint64_t, uint64_t, vfs_context_t); -int nfs4_getlock_rpc(nfsnode_t, struct nfs_lock_owner *, struct flock *, uint64_t, uint64_t, vfs_context_t); void nfsrv_active_user_list_reclaim(void); void nfsrv_cleancache(void); @@ -1503,21 +1525,24 @@ void nfsrv_uc_dequeue(struct nfsrv_sock *); /* Debug support */ #define NFS_DEBUG_LEVEL (nfs_debug_ctl & 0xf) -#define NFS_DEBUG_FACILITY ((nfs_debug_ctl >> 4) & 0xff) -#define NFS_DEBUG_FLAGS ((nfs_debug_ctl >> 12) & 0xff) +#define NFS_DEBUG_FACILITY ((nfs_debug_ctl >> 4) & 0xfff) +#define NFS_DEBUG_FLAGS ((nfs_debug_ctl >> 16) & 0xf) #define NFS_DEBUG_VALUE ((nfs_debug_ctl >> 20) & 0xfff) -#define NFS_FAC_SOCK 0x01 -#define NFS_FAC_STATE 0x02 -#define NFS_FAC_NODE 0x04 -#define NFS_FAC_VNOP 0x08 -#define NFS_FAC_BIO 0x10 -#define NFS_FAC_GSS 0x20 -#define NFS_FAC_VFS 0x40 - -#define NFS_DBG(fac, lev, fmt, ...) \ - if (__builtin_expect(NFS_DEBUG_LEVEL, 0)) nfs_printf(fac, lev, "%s: %d: " fmt, __func__, __LINE__, ## __VA_ARGS__) - -void nfs_printf(int, int, const char *, ...) __printflike(3, 4); +#define NFS_FAC_SOCK 0x001 +#define NFS_FAC_STATE 0x002 +#define NFS_FAC_NODE 0x004 +#define NFS_FAC_VNOP 0x008 +#define NFS_FAC_BIO 0x010 +#define NFS_FAC_GSS 0x020 +#define NFS_FAC_VFS 0x040 +#define NFS_FAC_SRV 0x080 + +#define NFS_IS_DBG(fac, lev) \ + (__builtin_expect((NFS_DEBUG_FACILITY & (fac)) && ((lev) <= NFS_DEBUG_LEVEL), 0)) +#define NFS_DBG(fac, lev, fmt, ...) nfs_printf((fac), (lev), "%s: %d: " fmt, __func__, __LINE__, ## __VA_ARGS__) + +void nfs_printf(unsigned int, unsigned int, const char *, ...) __printflike(3, 4); +void nfs_dump_mbuf(const char *, int, const char *, mbuf_t); int nfs_mountopts(struct nfsmount *, char *, int); __END_DECLS diff --git a/bsd/nfs/nfs4_subs.c b/bsd/nfs/nfs4_subs.c index b906597c5..ffb82cb06 100644 --- a/bsd/nfs/nfs4_subs.c +++ b/bsd/nfs/nfs4_subs.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2006-2016 Apple Inc. All rights reserved. + * Copyright (c) 2006-2019 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -69,6 +69,7 @@ #include #include +#if CONFIG_NFS4 /* * NFS_MAX_WHO is the maximum length of a string representation used * in as an ace who, owner, or group. There is no explicit limit in the @@ -527,6 +528,7 @@ nfs4_secinfo_rpc(struct nfsmount *nmp, struct nfsreq_secinfo_args *siap, kauth_c dnp = nsp->nsr_dnp; dvp = NFSTOV(dnp); if ((error = vnode_get(dvp))) { + dvp = NULLVP; nfs_node_unlock(np); goto nfsmout; } @@ -602,6 +604,7 @@ nfsmout: } return error; } +#endif /* CONFIG_NFS4 */ /* * Parse an NFSv4 SECINFO array to an array of pseudo flavors. @@ -611,8 +614,12 @@ int nfsm_chain_get_secinfo(struct nfsm_chain *nmc, uint32_t *sec, int *seccountp) { int error = 0, secmax, seccount, srvcount; - uint32_t flavor, val; + uint32_t flavor; + +#if CONFIG_NFS_GSS + uint32_t val; u_char oid[12]; +#endif seccount = srvcount = 0; secmax = *seccountp; @@ -625,11 +632,14 @@ nfsm_chain_get_secinfo(struct nfsm_chain *nmc, uint32_t *sec, int *seccountp) switch (flavor) { case RPCAUTH_NONE: case RPCAUTH_SYS: +#if CONFIG_NFS_GSS case RPCAUTH_KRB5: case RPCAUTH_KRB5I: case RPCAUTH_KRB5P: +#endif /* CONFIG_NFS_GSS */ sec[seccount++] = flavor; break; +#if CONFIG_NFS_GSS case RPCSEC_GSS: /* we only recognize KRB5, KRB5I, KRB5P */ nfsm_chain_get_32(error, nmc, val); /* OID length */ @@ -660,6 +670,7 @@ nfsm_chain_get_secinfo(struct nfsm_chain *nmc, uint32_t *sec, int *seccountp) break; } break; +#endif /* CONFIG_NFS_GSS */ } srvcount--; } @@ -670,7 +681,7 @@ nfsmout: return error; } - +#if CONFIG_NFS4 /* * Fetch the FS_LOCATIONS attribute for the node found at directory/name. */ @@ -2634,6 +2645,7 @@ nfsmout: } return error; } +#endif /* CONFIG_NFS4 */ /* * Got the given error and need to start recovery (if not already started). @@ -2655,6 +2667,7 @@ nfs_need_recover(struct nfsmount *nmp, int error) } } +#if CONFIG_NFS4 /* * After recovery due to state expiry, check each node and * drop any lingering delegation we thought we had. @@ -2722,6 +2735,7 @@ nfs4_expired_check_delegation(nfsnode_t np, vfs_context_t ctx) lck_mtx_unlock(&np->n_openlock); } +#endif /* CONFIG_NFS4*/ /* * Recover state for an NFS mount. @@ -2731,14 +2745,16 @@ nfs4_expired_check_delegation(nfsnode_t np, vfs_context_t ctx) void nfs_recover(struct nfsmount *nmp) { - struct timespec ts = { 1, 0 }; + struct timespec ts = { .tv_sec = 1, .tv_nsec = 0 }; int error, lost, reopen; struct nfs_open_owner *noop; struct nfs_open_file *nofp; struct nfs_file_lock *nflp, *nextnflp; struct nfs_lock_owner *nlop; thread_t thd = current_thread(); +#if CONFIG_NFS4 nfsnode_t np, nextnp; +#endif struct timeval now; restart: @@ -2804,6 +2820,7 @@ restart: if (nmp->nm_vers < NFS_VER4) { goto reclaim_locks; } +#if CONFIG_NFS4 if (nofp->nof_rw_drw) { error = nfs4_open_reclaim_rpc(nofp, NFS_OPEN_SHARE_ACCESS_BOTH, NFS_OPEN_SHARE_DENY_BOTH); } @@ -2912,7 +2929,7 @@ restart: nofp->nof_flags &= ~NFS_OPEN_FILE_REOPEN; lck_mtx_unlock(&nofp->nof_lock); } - +#endif /* CONFIG_NFS4 */ /* * Scan this node's lock owner list for entries with this open owner, * then walk the lock owner's held lock list recovering each lock. @@ -2959,7 +2976,7 @@ reclaim_locks: break; } } - +#if CONFIG_NFS4 /* * If we've determined that we need to reopen the file then we probably * didn't receive any delegation we think we hold. We should attempt to @@ -2979,7 +2996,7 @@ reclaim_locks: goto restart; } } - +#endif if (lost) { /* revoke open file state */ NP(nofp->nof_np, "nfs_recover: state lost for %d %p 0x%x", @@ -2992,6 +3009,7 @@ reclaim_locks: if (!error) { /* If state expired, make sure we're not holding onto any stale delegations */ lck_mtx_lock(&nmp->nm_lock); +#if CONFIG_NFS4 if ((nmp->nm_vers >= NFS_VER4) && (nmp->nm_state & NFSSTA_RECOVER_EXPIRED)) { recheckdeleg: TAILQ_FOREACH_SAFE(np, &nmp->nm_delegations, n_dlink, nextnp) { @@ -3003,6 +3021,7 @@ recheckdeleg: } } } +#endif nmp->nm_state &= ~(NFSSTA_RECOVER | NFSSTA_RECOVER_EXPIRED); wakeup(&nmp->nm_state); printf("nfs recovery completed for %s, 0x%x\n", diff --git a/bsd/nfs/nfs4_vnops.c b/bsd/nfs/nfs4_vnops.c index 223ae28f7..261da73e2 100644 --- a/bsd/nfs/nfs4_vnops.c +++ b/bsd/nfs/nfs4_vnops.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2006-2017 Apple Inc. All rights reserved. + * Copyright (c) 2006-2019 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -79,6 +79,7 @@ #include #include +#if CONFIG_NFS4 int nfs4_access_rpc(nfsnode_t np, u_int32_t *access, int rpcflags, vfs_context_t ctx) { @@ -1752,6 +1753,7 @@ nfsmout: } return error; } +#endif /* CONFIG_NFS4 */ /* * Wait for any pending recovery to complete. @@ -1759,7 +1761,7 @@ nfsmout: int nfs_mount_state_wait_for_recovery(struct nfsmount *nmp) { - struct timespec ts = { 1, 0 }; + struct timespec ts = { .tv_sec = 1, .tv_nsec = 0 }; int error = 0, slpflag = NMFLAG(nmp, INTR) ? PCATCH : 0; lck_mtx_lock(&nmp->nm_lock); @@ -1785,7 +1787,7 @@ nfs_mount_state_wait_for_recovery(struct nfsmount *nmp) int nfs_mount_state_in_use_start(struct nfsmount *nmp, thread_t thd) { - struct timespec ts = { 1, 0 }; + struct timespec ts = { .tv_sec = 1, .tv_nsec = 0 }; int error = 0, slpflag = (NMFLAG(nmp, INTR) && thd) ? PCATCH : 0; if (nfs_mount_gone(nmp)) { @@ -1903,7 +1905,7 @@ int nfs_open_state_set_busy(nfsnode_t np, thread_t thd) { struct nfsmount *nmp; - struct timespec ts = {2, 0}; + struct timespec ts = { .tv_sec = 2, .tv_nsec = 0 }; int error = 0, slpflag; nmp = NFSTONMP(np); @@ -2061,7 +2063,7 @@ int nfs_open_owner_set_busy(struct nfs_open_owner *noop, thread_t thd) { struct nfsmount *nmp; - struct timespec ts = {2, 0}; + struct timespec ts = { .tv_sec = 2, .tv_nsec = 0 }; int error = 0, slpflag; nmp = noop->noo_mount; @@ -2256,7 +2258,7 @@ int nfs_open_file_set_busy(struct nfs_open_file *nofp, thread_t thd) { struct nfsmount *nmp; - struct timespec ts = {2, 0}; + struct timespec ts = { .tv_sec = 2, .tv_nsec = 0 }; int error = 0, slpflag; nmp = nofp->nof_owner->noo_mount; @@ -2617,7 +2619,7 @@ nfs_open_file_remove_open(struct nfs_open_file *nofp, uint32_t accessMode, uint3 lck_mtx_unlock(&nofp->nof_lock); } - +#if CONFIG_NFS4 /* * Get the current (delegation, lock, open, default) stateid for this node. * If node has a delegation, use that stateid. @@ -2882,6 +2884,7 @@ out: } return error; } +#endif /* CONFIG_NFS4 */ int nfs_vnop_mmap( @@ -2946,6 +2949,7 @@ restart: NP(np, "nfs_vnop_mmap: no open file for owner, error %d, %d", error, kauth_cred_getuid(noop->noo_cred)); error = EPERM; } +#if CONFIG_NFS4 if (!error && (nofp->nof_flags & NFS_OPEN_FILE_REOPEN)) { nfs_mount_state_in_use_end(nmp, 0); error = nfs4_reopen(nofp, NULL); @@ -2954,6 +2958,7 @@ restart: goto restart; } } +#endif if (!error) { error = nfs_open_file_set_busy(nofp, NULL); } @@ -2996,9 +3001,12 @@ restart: /* NFS v2/v3 opens are always allowed - so just add it. */ nfs_open_file_add_open(nofp, NFS_OPEN_SHARE_ACCESS_READ, NFS_OPEN_SHARE_DENY_NONE, 0); error = 0; - } else { + } +#if CONFIG_NFS4 + else { error = nfs4_open(np, nofp, NFS_OPEN_SHARE_ACCESS_READ, NFS_OPEN_SHARE_DENY_NONE, ctx); } +#endif if (!error) { nofp->nof_flags |= NFS_OPEN_FILE_NEEDCLOSE; } @@ -3201,6 +3209,7 @@ loop: continue; } lck_mtx_unlock(&np->n_openlock); +#if CONFIG_NFS4 if (nofp->nof_flags & NFS_OPEN_FILE_REOPEN) { nfs_mount_state_in_use_end(nmp, 0); error = nfs4_reopen(nofp, NULL); @@ -3208,6 +3217,7 @@ loop: goto loop; } } +#endif if (!error) { error = nfs_open_file_set_busy(nofp, NULL); } @@ -3364,7 +3374,7 @@ int nfs_lock_owner_set_busy(struct nfs_lock_owner *nlop, thread_t thd) { struct nfsmount *nmp; - struct timespec ts = {2, 0}; + struct timespec ts = { .tv_sec = 2, .tv_nsec = 0 }; int error = 0, slpflag; nmp = nlop->nlo_open_owner->noo_mount; @@ -3518,6 +3528,7 @@ nfs_file_lock_conflict(struct nfs_file_lock *nflp1, struct nfs_file_lock *nflp2, return 1; } +#if CONFIG_NFS4 /* * Send an NFSv4 LOCK RPC to the server. */ @@ -3816,7 +3827,7 @@ nfsmout: nfsm_chain_cleanup(&nmrep); return error; } - +#endif /* CONFIG_NFS4 */ /* * Check for any conflicts with the given lock. @@ -3917,7 +3928,7 @@ nfs_advlock_setlock( struct nfs_file_lock *newnflp, *nflp, *nflp2 = NULL, *nextnflp, *flocknflp = NULL; struct nfs_file_lock *coalnflp; int error = 0, error2, willsplit = 0, delay, slpflag, busy = 0, inuse = 0, restart, inqueue = 0; - struct timespec ts = {1, 0}; + struct timespec ts = { .tv_sec = 1, .tv_nsec = 0 }; nmp = NFSTONMP(np); if (nfs_mount_gone(nmp)) { @@ -3973,6 +3984,7 @@ restart: inuse = 0; goto error_out; } +#if CONFIG_NFS4 if (nofp->nof_flags & NFS_OPEN_FILE_REOPEN) { nfs_mount_state_in_use_end(nmp, 0); inuse = 0; @@ -3982,6 +3994,7 @@ restart: } goto restart; } +#endif lck_mtx_lock(&np->n_openlock); if (!inqueue) { @@ -4085,6 +4098,7 @@ restart: busy = 1; delay = 0; do { +#if CONFIG_NFS4 /* do we have a delegation? (that we're not returning?) */ if ((np->n_openflags & N_DELEG_MASK) && !(np->n_openflags & N_DELEG_RETURN)) { if (np->n_openflags & N_DELEG_WRITE) { @@ -4117,6 +4131,7 @@ restart: } } } +#endif if (np->n_flag & NREVOKE) { error = EIO; } @@ -4358,7 +4373,11 @@ error_out: int nfs_advlock_unlock( nfsnode_t np, - struct nfs_open_file *nofp, + struct nfs_open_file *nofp +#if !CONFIG_NFS4 + __unused +#endif + , struct nfs_lock_owner *nlop, uint64_t start, uint64_t end, @@ -4378,6 +4397,7 @@ restart: if ((error = nfs_mount_state_in_use_start(nmp, NULL))) { return error; } +#if CONFIG_NFS4 if (nofp->nof_flags & NFS_OPEN_FILE_REOPEN) { nfs_mount_state_in_use_end(nmp, 0); error = nfs4_reopen(nofp, NULL); @@ -4386,6 +4406,7 @@ restart: } goto restart; } +#endif if ((error = nfs_open_state_set_busy(np, NULL))) { nfs_mount_state_in_use_end(nmp, error); return error; @@ -4752,7 +4773,9 @@ nfs_vnop_advlock( goto out; } /* find the open file */ +#if CONFIG_NFS4 restart: +#endif error = nfs_open_file_find(np, noop, &nofp, 0, 0, 0); if (error) { error = EBADF; @@ -4761,6 +4784,7 @@ restart: NP(np, "nfs_vnop_advlock: LOST %d", kauth_cred_getuid(nofp->nof_owner->noo_cred)); error = EIO; } +#if CONFIG_NFS4 if (!error && (nofp->nof_flags & NFS_OPEN_FILE_REOPEN)) { error = nfs4_reopen(nofp, ((op == F_UNLCK) ? NULL : vfs_context_thread(ctx))); nofp = NULL; @@ -4768,6 +4792,7 @@ restart: goto restart; } } +#endif if (error) { NP(np, "nfs_vnop_advlock: no open file %d, %d", error, kauth_cred_getuid(noop->noo_cred)); goto out; @@ -4814,6 +4839,7 @@ nfs_check_for_locks(struct nfs_open_owner *noop, struct nfs_open_file *nofp) return nlop ? 1 : 0; } +#if CONFIG_NFS4 /* * Reopen simple (no deny, no locks) open state that was lost. */ @@ -4832,7 +4858,7 @@ nfs4_reopen(struct nfs_open_file *nofp, thread_t thd) char smallname[128]; char *filename = NULL; int error = 0, done = 0, slpflag = NMFLAG(nmp, INTR) ? PCATCH : 0; - struct timespec ts = { 1, 0 }; + struct timespec ts = { .tv_sec = 1, .tv_nsec = 0 }; lck_mtx_lock(&nofp->nof_lock); while (nofp->nof_flags & NFS_OPEN_FILE_REOPENING) { @@ -4858,6 +4884,7 @@ nfs4_reopen(struct nfs_open_file *nofp, thread_t thd) struct nfs_sillyrename *nsp = np->n_sillyrename; dvp = NFSTOV(nsp->nsr_dnp); if ((error = vnode_get(dvp))) { + dvp = NULLVP; nfs_node_unlock(np); goto out; } @@ -5473,6 +5500,7 @@ nfs4_claim_delegated_open_rpc( struct nfs_sillyrename *nsp = np->n_sillyrename; dvp = NFSTOV(nsp->nsr_dnp); if ((error = vnode_get(dvp))) { + dvp = NULLVP; nfs_node_unlock(np); goto out; } @@ -6266,6 +6294,7 @@ nfs4_claim_delegated_state_for_open_file(struct nfs_open_file *nofp, int flags) return error; } +#endif /* CONFIG_NFS4*/ /* * Release all open state for the given node. @@ -6318,9 +6347,11 @@ nfs_release_open_state_for_node(nfsnode_t np, int force) nofp->nof_flags |= NFS_OPEN_FILE_LOST; lck_mtx_unlock(&nofp->nof_lock); +#if CONFIG_NFS4 if (!force && nmp && (nmp->nm_vers >= NFS_VER4)) { nfs4_close_rpc(np, nofp, NULL, nofp->nof_owner->noo_cred, R_RECOVER); } +#endif } lck_mtx_unlock(&np->n_openlock); @@ -6358,6 +6389,7 @@ nfs_revoke_open_state_for_node(nfsnode_t np) } } +#if CONFIG_NFS4 /* * Claim the delegated open combinations that each of this node's open files hold. */ @@ -6537,7 +6569,7 @@ nfsmout: nfsm_chain_cleanup(&nmrep); return error; } - +#endif /* CONFIG_NFS4 */ /* * NFS read call. @@ -6587,6 +6619,7 @@ restart: NP(np, "nfs_vnop_read: LOST %d", kauth_cred_getuid(noop->noo_cred)); error = EIO; } +#if CONFIG_NFS4 if (!error && (nofp->nof_flags & NFS_OPEN_FILE_REOPEN)) { error = nfs4_reopen(nofp, vfs_context_thread(ctx)); nofp = NULL; @@ -6594,6 +6627,7 @@ restart: goto restart; } } +#endif if (error) { nfs_open_owner_rele(noop); return error; @@ -6652,9 +6686,12 @@ restart: if (nmp->nm_vers < NFS_VER4) { /* NFS v2/v3 opens are always allowed - so just add it. */ nfs_open_file_add_open(nofp, NFS_OPEN_SHARE_ACCESS_READ, NFS_OPEN_SHARE_DENY_NONE, 0); - } else { + } +#if CONFIG_NFS4 + else { error = nfs4_open(np, nofp, NFS_OPEN_SHARE_ACCESS_READ, NFS_OPEN_SHARE_DENY_NONE, ctx); } +#endif if (!error) { nofp->nof_flags |= NFS_OPEN_FILE_NEEDCLOSE; } @@ -6674,6 +6711,7 @@ do_read: return nfs_bioread(VTONFS(ap->a_vp), ap->a_uio, ap->a_ioflag, ap->a_context); } +#if CONFIG_NFS4 /* * Note: the NFSv4 CREATE RPC is for everything EXCEPT regular files. * Files are created using the NFSv4 OPEN RPC. So we must open the @@ -8913,3 +8951,4 @@ nfs4_vnop_removenamedstream( } #endif +#endif /* CONFIG_NFS4 */ diff --git a/bsd/nfs/nfs_bio.c b/bsd/nfs/nfs_bio.c index cb1f92939..2e2dec099 100644 --- a/bsd/nfs/nfs_bio.c +++ b/bsd/nfs/nfs_bio.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2015 Apple Inc. All rights reserved. + * Copyright (c) 2000-2019 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -96,6 +96,7 @@ #include #include #include +#include #define NFS_BIO_DBG(...) NFS_DBG(NFS_FAC_BIO, 7, ## __VA_ARGS__) @@ -212,7 +213,7 @@ nfs_buf_freeup(int timer) if (!fbp) { break; } - if (fbp->nb_refs) { + if (os_ref_get_count(&fbp->nb_refs) > 1) { break; } if (NBUFSTAMPVALID(fbp) && @@ -239,7 +240,7 @@ nfs_buf_freeup(int timer) if (!fbp) { break; } - if (fbp->nb_refs) { + if (os_ref_get_count(&fbp->nb_refs) > 1) { break; } if (NBUFSTAMPVALID(fbp) && @@ -609,7 +610,7 @@ nfs_buf_delwri_service(void) void nfs_buf_delwri_thread(__unused void *arg, __unused wait_result_t wr) { - struct timespec ts = { 30, 0 }; + struct timespec ts = { .tv_sec = 30, .tv_nsec = 0 }; int error = 0; lck_mtx_lock(nfs_buf_mutex); @@ -907,6 +908,8 @@ loop: NFSBUFCNTCHK(); /* init nfsbuf */ bzero(bp, sizeof(*bp)); + os_ref_init(&bp->nb_refs, NULL); + bp->nb_free.tqe_next = NFSNOLIST; bp->nb_validoff = bp->nb_validend = -1; FSDBG(545, np, blkno, bp, 0); @@ -1387,7 +1390,7 @@ nfs_buf_check_write_verifier(nfsnode_t np, struct nfsbuf *bp) void nfs_buf_refget(struct nfsbuf *bp) { - bp->nb_refs++; + os_ref_retain_locked(&bp->nb_refs); } /* * release a reference on a buffer @@ -1396,7 +1399,7 @@ nfs_buf_refget(struct nfsbuf *bp) void nfs_buf_refrele(struct nfsbuf *bp) { - bp->nb_refs--; + (void) os_ref_release_locked(&bp->nb_refs); } /* @@ -1609,7 +1612,7 @@ nfs_buf_read_finish(struct nfsbuf *bp) ((NBOFF(bp) + bp->nb_validend) > 0x100000000LL)) { bp->nb_validend = 0x100000000LL - NBOFF(bp); } - bp->nb_valid = (1 << (round_page_32(bp->nb_validend) / PAGE_SIZE)) - 1; + bp->nb_valid = (uint32_t)(1LLU << (round_page_32(bp->nb_validend) / PAGE_SIZE)) - 1; if (bp->nb_validend & PAGE_MASK) { /* zero-fill remainder of last page */ bzero(bp->nb_data + bp->nb_validend, PAGE_SIZE - (bp->nb_validend & PAGE_MASK)); @@ -1680,9 +1683,11 @@ nfs_buf_read_rpc(struct nfsbuf *bp, thread_t thd, kauth_cred_t cred) len = (length > nmrsize) ? nmrsize : length; cb.rcb_args[0] = offset; cb.rcb_args[1] = len; +#if CONFIG_NFS4 if (nmp->nm_vers >= NFS_VER4) { cb.rcb_args[2] = nmp->nm_stategenid; } +#endif req = NULL; error = nmp->nm_funcs->nf_read_rpc_async(np, boff + offset, len, thd, cred, &cb, &req); if (error) { @@ -1794,6 +1799,7 @@ finish: } return; } +#if CONFIG_NFS4 if ((nmp->nm_vers >= NFS_VER4) && nfs_mount_state_error_should_restart(error) && !ISSET(bp->nb_flags, NB_ERROR)) { lck_mtx_lock(&nmp->nm_lock); if ((error != NFSERR_OLD_STATEID) && (error != NFSERR_GRACE) && (cb.rcb_args[2] == nmp->nm_stategenid)) { @@ -1840,6 +1846,7 @@ finish: } } } +#endif if (error) { SET(bp->nb_flags, NB_ERROR); bp->nb_error = error; @@ -1867,14 +1874,18 @@ finish: * requested, so we need to issue another read for the rest. * (Don't bother if the buffer already hit an error.) */ +#if CONFIG_NFS4 readagain: +#endif offset += rlen; length -= rlen; cb.rcb_args[0] = offset; cb.rcb_args[1] = length; +#if CONFIG_NFS4 if (nmp->nm_vers >= NFS_VER4) { cb.rcb_args[2] = nmp->nm_stategenid; } +#endif error = nmp->nm_funcs->nf_read_rpc_async(np, NBOFF(bp) + offset, length, thd, cred, &cb, &rreq); if (!error) { if (IS_VALID_CRED(cred)) { @@ -2348,6 +2359,7 @@ buffer_ready: error = uiomove(bp->nb_data + on, n, uio); } + nfs_buf_release(bp, 1); nfs_data_unlock(np); nfs_node_lock_force(np); @@ -2365,7 +2377,7 @@ int nfs_async_write_start(struct nfsmount *nmp) { int error = 0, slpflag = NMFLAG(nmp, INTR) ? PCATCH : 0; - struct timespec ts = {1, 0}; + struct timespec ts = { .tv_sec = 1, .tv_nsec = 0 }; if (nfs_max_async_writes <= 0) { return 0; @@ -2910,9 +2922,11 @@ nfs_buf_write_rpc(struct nfsbuf *bp, int iomode, thread_t thd, kauth_cred_t cred len = (length > nmwsize) ? nmwsize : length; cb.rcb_args[0] = offset; cb.rcb_args[1] = len; +#if CONFIG_NFS4 if (nmp->nm_vers >= NFS_VER4) { cb.rcb_args[2] = nmp->nm_stategenid; } +#endif if (async && ((error = nfs_async_write_start(nmp)))) { break; } @@ -3029,6 +3043,7 @@ finish: } return; } +#if CONFIG_NFS4 if ((nmp->nm_vers >= NFS_VER4) && nfs_mount_state_error_should_restart(error) && !ISSET(bp->nb_flags, NB_ERROR)) { lck_mtx_lock(&nmp->nm_lock); if ((error != NFSERR_OLD_STATEID) && (error != NFSERR_GRACE) && (cb.rcb_args[2] == nmp->nm_stategenid)) { @@ -3075,6 +3090,7 @@ finish: } } } +#endif if (error) { SET(bp->nb_flags, NB_ERROR); bp->nb_error = error; @@ -3111,7 +3127,9 @@ finish: * (Don't bother if the buffer hit an error or stale wverf.) */ if (((int)rlen < length) && !(bp->nb_flags & (NB_STALEWVERF | NB_ERROR))) { +#if CONFIG_NFS4 writeagain: +#endif offset += rlen; length -= rlen; @@ -3121,10 +3139,11 @@ writeagain: cb.rcb_args[0] = offset; cb.rcb_args[1] = length; +#if CONFIG_NFS4 if (nmp->nm_vers >= NFS_VER4) { cb.rcb_args[2] = nmp->nm_stategenid; } - +#endif // XXX iomode should really match the original request error = nmp->nm_funcs->nf_write_rpc_async(np, auio, length, thd, cred, NFS_WRITE_FILESYNC, &cb, &wreq); @@ -3845,7 +3864,7 @@ nfs_vinvalbuf2(vnode_t vp, int flags, thread_t thd, kauth_cred_t cred, int intrf struct nfsmount *nmp = VTONMP(vp); int error, slpflag, slptimeo, nflags, retry = 0; int ubcflags = UBC_PUSHALL | UBC_SYNC | UBC_INVALIDATE; - struct timespec ts = { 2, 0 }; + struct timespec ts = { .tv_sec = 2, .tv_nsec = 0 }; off_t size; FSDBG_TOP(554, np, flags, intrflg, 0); @@ -4085,7 +4104,9 @@ nfs_asyncio_resend(struct nfsreq *req) return; } +#if CONFIG_NFS_GSS nfs_gss_clnt_rpcdone(req); +#endif lck_mtx_lock(&nmp->nm_lock); if (!(req->r_flags & R_RESENDQ)) { TAILQ_INSERT_TAIL(&nmp->nm_resendq, req, r_rchain); @@ -4119,10 +4140,12 @@ nfs_buf_readdir(struct nfsbuf *bp, vfs_context_t ctx) if (nmp->nm_vers < NFS_VER4) { error = nfs3_readdir_rpc(np, bp, ctx); - } else { + } +#if CONFIG_NFS4 + else { error = nfs4_readdir_rpc(np, bp, ctx); } - +#endif if (error && (error != NFSERR_DIRBUFDROPPED)) { SET(bp->nb_flags, NB_ERROR); bp->nb_error = error; diff --git a/bsd/nfs/nfs_boot.c b/bsd/nfs/nfs_boot.c index 67d3d5ef4..9f5ec1030 100644 --- a/bsd/nfs/nfs_boot.c +++ b/bsd/nfs/nfs_boot.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2016 Apple Inc. All rights reserved. + * Copyright (c) 2000-2019 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -245,7 +245,6 @@ nfs_boot_init(struct nfs_diskless *nd) bp_sin.sin_len = sizeof(bp_sin); bp_sin.sin_family = AF_INET; bp_sin.sin_addr.s_addr = INADDR_BROADCAST; - hostnamelen = MAXHOSTNAMELEN; router.s_addr = 0; error = bp_whoami(&bp_sin, &my_ip, &router); if (error) { @@ -254,7 +253,9 @@ nfs_boot_init(struct nfs_diskless *nd) } printf("nfs_boot: BOOTPARAMS server " IP_FORMAT "\n", IP_LIST(&bp_sin.sin_addr)); + lck_mtx_lock(&hostname_lock); printf("nfs_boot: hostname %s\n", hostname); + lck_mtx_unlock(&hostname_lock); } if (do_bpgetfile) { error = bp_getfile(&bp_sin, "root", &nd->nd_root.ndm_saddr, @@ -537,9 +538,10 @@ bp_whoami(struct sockaddr_in *bpsin, if (cn_len >= MAXHOSTNAMELEN) { goto bad; } + lck_mtx_lock(&hostname_lock); bcopy(str->data, hostname, cn_len); hostname[cn_len] = '\0'; - hostnamelen = cn_len; + lck_mtx_unlock(&hostname_lock); p += RPC_STR_SIZE(cn_len); msg_len -= RPC_STR_SIZE(cn_len); @@ -555,9 +557,10 @@ bp_whoami(struct sockaddr_in *bpsin, if (dn_len >= MAXHOSTNAMELEN) { goto bad; } + lck_mtx_lock(&domainname_lock); bcopy(str->data, domainname, dn_len); domainname[dn_len] = '\0'; - domainnamelen = dn_len; + lck_mtx_unlock(&domainname_lock); p += RPC_STR_SIZE(dn_len); msg_len -= RPC_STR_SIZE(dn_len); @@ -611,7 +614,9 @@ bp_getfile(struct sockaddr_in *bpsin, /* * Get message buffer of sufficient size. */ - cn_len = hostnamelen; + lck_mtx_lock(&hostname_lock); + cn_len = strlen(hostname); + lck_mtx_unlock(&hostname_lock); key_len = strlen(key); msg_len = 0; msg_len += RPC_STR_SIZE(cn_len); @@ -629,7 +634,9 @@ bp_getfile(struct sockaddr_in *bpsin, /* client name (hostname) */ str = (struct rpc_string *)p; str->len = htonl(cn_len); + lck_mtx_lock(&hostname_lock); bcopy(hostname, str->data, cn_len); + lck_mtx_unlock(&hostname_lock); p += RPC_STR_SIZE(cn_len); /* key name (root or swap) */ str = (struct rpc_string *)p; diff --git a/bsd/nfs/nfs_gss.c b/bsd/nfs/nfs_gss.c index c1d300d0f..95d21f6c6 100644 --- a/bsd/nfs/nfs_gss.c +++ b/bsd/nfs/nfs_gss.c @@ -157,6 +157,9 @@ static void nfs_gss_svc_ctx_insert(struct nfs_gss_svc_ctx *); static void nfs_gss_svc_ctx_timer(void *, void *); static int nfs_gss_svc_gssd_upcall(struct nfs_gss_svc_ctx *); static int nfs_gss_svc_seqnum_valid(struct nfs_gss_svc_ctx *, uint32_t); + +/* This is only used by server code */ +static void nfs_gss_nfsm_chain(struct nfsm_chain *, mbuf_t); #endif /* NFSSERVER */ static void host_release_special_port(mach_port_t); @@ -166,7 +169,6 @@ static int nfs_gss_mach_vmcopyout(vm_map_copy_t, uint32_t, u_char *); static int nfs_gss_mchain_length(mbuf_t); static int nfs_gss_append_chain(struct nfsm_chain *, mbuf_t); -static void nfs_gss_nfsm_chain(struct nfsm_chain *, mbuf_t); #if NFSSERVER thread_call_t nfs_gss_svc_ctx_timer_call; @@ -3896,6 +3898,12 @@ nfs_gss_mach_alloc_buffer(u_char *buf, uint32_t buflen, vm_map_copy_t *addr) tbuflen = vm_map_round_page(buflen, vm_map_page_mask(ipc_kernel_map)); + + if (tbuflen < buflen) { + printf("nfs_gss_mach_alloc_buffer: vm_map_round_page failed\n"); + return; + } + kr = vm_allocate_kernel(ipc_kernel_map, &kmem_buf, tbuflen, VM_FLAGS_ANYWHERE, VM_KERN_MEMORY_FILE); if (kr != 0) { printf("nfs_gss_mach_alloc_buffer: vm_allocate failed\n"); @@ -4005,6 +4013,7 @@ nfs_gss_append_chain(struct nfsm_chain *nmc, mbuf_t mc) return 0; } +#if NFSSERVER /* Only used by NFSSERVER */ /* * Convert an mbuf chain to an NFS mbuf chain */ @@ -4025,7 +4034,7 @@ nfs_gss_nfsm_chain(struct nfsm_chain *nmc, mbuf_t mc) nmc->nmc_left = mbuf_trailingspace(tail); nmc->nmc_flags = 0; } - +#endif /* NFSSERVER */ #if 0 diff --git a/bsd/nfs/nfs_ioctl.h b/bsd/nfs/nfs_ioctl.h index 6ab20e01a..5f9b1fc2d 100644 --- a/bsd/nfs/nfs_ioctl.h +++ b/bsd/nfs/nfs_ioctl.h @@ -71,6 +71,8 @@ struct user_nfs_gss_principal { #define NFS_IOC_GET_CRED _IOWR('n', 3, struct nfs_gss_principal) +#define NFS_IOC_DISARM_TRIGGER _IO('n', 4) + #ifdef KERNEL #define NFS_IOC_SET_CRED64 _IOW('n', 2, struct user_nfs_gss_principal) diff --git a/bsd/nfs/nfs_node.c b/bsd/nfs/nfs_node.c index c48c14954..8f7da7ea0 100644 --- a/bsd/nfs/nfs_node.c +++ b/bsd/nfs/nfs_node.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2016 Apple Inc. All rights reserved. + * Copyright (c) 2000-2019 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -354,6 +354,9 @@ loop: cmp = nfs_case_insensitive(mp) ? strncasecmp : strncmp; + if (vp->v_name && (size_t)cnp->cn_namelen != strnlen(vp->v_name, MAXPATHLEN)) { + update_flags |= VNODE_UPDATE_NAME; + } if (vp->v_name && cnp->cn_namelen && (*cmp)(cnp->cn_nameptr, vp->v_name, cnp->cn_namelen)) { update_flags |= VNODE_UPDATE_NAME; } @@ -504,6 +507,7 @@ loop: vfsp.vnfs_str = "nfs"; vfsp.vnfs_dvp = dnp ? NFSTOV(dnp) : NULL; vfsp.vnfs_fsnode = np; +#if CONFIG_NFS4 if (nfsvers == NFS_VER4) { #if FIFO if (nvap->nva_type == VFIFO) { @@ -515,7 +519,9 @@ loop: } else { vfsp.vnfs_vops = nfsv4_vnodeop_p; } - } else { + } else +#endif /* CONFIG_NFS4 */ + { #if FIFO if (nvap->nva_type == VFIFO) { vfsp.vnfs_vops = fifo_nfsv2nodeop_p; @@ -538,20 +544,24 @@ loop: } #if CONFIG_TRIGGERS - if ((nfsvers >= NFS_VER4) && (nvap->nva_type == VDIR) && (np->n_vattr.nva_flags & NFS_FFLAG_TRIGGER)) { + if (((nfsvers >= NFS_VER4) + ) + && (nvap->nva_type == VDIR) && (np->n_vattr.nva_flags & NFS_FFLAG_TRIGGER) + && !(flags & NG_MARKROOT)) { struct vnode_trigger_param vtp; bzero(&vtp, sizeof(vtp)); bcopy(&vfsp, &vtp.vnt_params, sizeof(vfsp)); vtp.vnt_resolve_func = nfs_mirror_mount_trigger_resolve; vtp.vnt_unresolve_func = nfs_mirror_mount_trigger_unresolve; vtp.vnt_rearm_func = nfs_mirror_mount_trigger_rearm; - vtp.vnt_flags = VNT_AUTO_REARM; + vtp.vnt_flags = VNT_AUTO_REARM | VNT_KERN_RESOLVE; error = vnode_create(VNCREATE_TRIGGER, VNCREATE_TRIGGER_SIZE, &vtp, &np->n_vnode); } else #endif { error = vnode_create(VNCREATE_FLAVOR, VCREATESIZE, &vfsp, &np->n_vnode); } +notsup: if (error) { FSDBG(266, 0, np, np->n_flag, 0xb1eb1e); nfs_node_unlock(np); @@ -677,6 +687,7 @@ restart: * node has gone inactive without being open, we need to * clean up (close) the open done in the create. */ +#if CONFIG_NFS4 if ((nofp->nof_flags & NFS_OPEN_FILE_CREATE) && nofp->nof_creator && !force) { if (nofp->nof_flags & NFS_OPEN_FILE_REOPEN) { lck_mtx_unlock(&np->n_openlock); @@ -705,6 +716,7 @@ restart: } goto restart; } +#endif if (nofp->nof_flags & NFS_OPEN_FILE_NEEDCLOSE) { /* * If the file is marked as needing reopen, but this was the only @@ -725,9 +737,11 @@ restart: if (inuse) { nfs_mount_state_in_use_end(nmp, 0); } +#if CONFIG_NFS4 if (!nfs4_reopen(nofp, NULL)) { goto restart; } +#endif } error = nfs_close(np, nofp, NFS_OPEN_SHARE_ACCESS_READ, NFS_OPEN_SHARE_DENY_NONE, ctx); if (error) { @@ -910,9 +924,11 @@ nfs_vnop_reclaim( FSDBG_TOP(265, vp, np, np->n_flag, 0); force = (!mp || vfs_isforce(mp) || nfs_mount_gone(nmp)); + /* There shouldn't be any open or lock state at this point */ lck_mtx_lock(&np->n_openlock); +#if CONFIG_NFS4 if (nmp && (nmp->nm_vers >= NFS_VER4)) { /* need to drop a delegation */ if (np->n_dreturn.tqe_next != NFSNOLIST) { @@ -944,6 +960,7 @@ nfs_vnop_reclaim( np->n_attrdirfh = NULL; } } +#endif /* clean up file locks */ TAILQ_FOREACH_SAFE(nflp, &np->n_locks, nfl_link, nextnflp) { @@ -1004,12 +1021,14 @@ nfs_vnop_reclaim( nofp->nof_r_drw, nofp->nof_d_r_drw, nofp->nof_w_drw, nofp->nof_d_w_drw, nofp->nof_rw_drw, nofp->nof_d_rw_drw); +#if CONFIG_NFS4 /* try sending a close RPC if it wasn't delegated */ if (nofp->nof_r || nofp->nof_w || nofp->nof_rw || nofp->nof_r_dw || nofp->nof_w_dw || nofp->nof_rw_dw || nofp->nof_r_drw || nofp->nof_w_drw || nofp->nof_rw_drw) { nfs4_close_rpc(np, nofp, NULL, nofp->nof_owner->noo_cred, R_RECOVER); } +#endif } } TAILQ_REMOVE(&np->n_opens, nofp, nof_link); @@ -1022,7 +1041,7 @@ nfs_vnop_reclaim( /* then remove this node from the monitored node list. */ lck_mtx_lock(&nmp->nm_lock); while (np->n_mflag & NMMONSCANINPROG) { - struct timespec ts = { 1, 0 }; + struct timespec ts = { .tv_sec = 1, .tv_nsec = 0 }; np->n_mflag |= NMMONSCANWANT; msleep(&np->n_mflag, &nmp->nm_lock, PZERO - 1, "nfswaitmonscan", &ts); } @@ -1178,7 +1197,7 @@ nfs_node_unlock2(nfsnode_t np1, nfsnode_t np2) int nfs_node_set_busy(nfsnode_t np, thread_t thd) { - struct timespec ts = { 2, 0 }; + struct timespec ts = { .tv_sec = 2, .tv_nsec = 0 }; int error; if ((error = nfs_node_lock(np))) { diff --git a/bsd/nfs/nfs_serv.c b/bsd/nfs/nfs_serv.c index b5cf7e407..2ebb8994b 100644 --- a/bsd/nfs/nfs_serv.c +++ b/bsd/nfs/nfs_serv.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2016 Apple Inc. All rights reserved. + * Copyright (c) 2000-2019 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -449,7 +449,7 @@ nfsrv_getattr( error = nfsrv_credcheck(nd, ctx, nx, nxo); nfsmerr_if(error); -#if CONFIG_MAC +#if CONFIG_MACF if (mac_vnode_check_open(ctx, vp, FREAD)) { error = ESTALE; } @@ -459,7 +459,7 @@ nfsrv_getattr( nfsm_srv_vattr_init(&vattr, nd->nd_vers); error = vnode_getattr(vp, &vattr, ctx); -#if CONFIG_MAC +#if CONFIG_MACF /* XXXab: Comment in the VFS code makes it sound like * some arguments can be filtered out, but not * what it actually means. Hopefully not like @@ -511,7 +511,7 @@ nfsrv_setattr( struct nfs_export_options *nxo; int error, preattrerr, postattrerr, gcheck; struct nfs_filehandle nfh; - struct timespec guard = { 0, 0 }; + struct timespec guard = { .tv_sec = 0, .tv_nsec = 0 }; kauth_action_t action; uid_t saved_uid; diff --git a/bsd/nfs/nfs_socket.c b/bsd/nfs/nfs_socket.c index 55ba36619..0adab689b 100644 --- a/bsd/nfs/nfs_socket.c +++ b/bsd/nfs/nfs_socket.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2015 Apple Inc. All rights reserved. + * Copyright (c) 2000-2019 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -82,6 +82,7 @@ #include #include #include +#include #include #include #include @@ -108,6 +109,7 @@ #include #define NFS_SOCK_DBG(...) NFS_DBG(NFS_FAC_SOCK, 7, ## __VA_ARGS__) +#define NFS_SOCK_DUMP_MBUF(msg, mb) if (NFS_IS_DBG(NFS_FAC_SOCK, 15)) nfs_dump_mbuf(__func__, __LINE__, (msg), (mb)) /* XXX */ boolean_t current_thread_aborted(void); @@ -203,8 +205,30 @@ int nfs_is_dead(int, struct nfsmount *); * 3 - read * 4 - write */ -static int proct[NFS_NPROCS] = { - 0, 1, 0, 2, 1, 3, 3, 4, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 0, 0, 0, 0, 0 +static const int proct[] = { + [NFSPROC_NULL] = 0, + [NFSPROC_GETATTR] = 1, + [NFSPROC_SETATTR] = 0, + [NFSPROC_LOOKUP] = 2, + [NFSPROC_ACCESS] = 1, + [NFSPROC_READLINK] = 3, + [NFSPROC_READ] = 3, + [NFSPROC_WRITE] = 4, + [NFSPROC_CREATE] = 0, + [NFSPROC_MKDIR] = 0, + [NFSPROC_SYMLINK] = 0, + [NFSPROC_MKNOD] = 0, + [NFSPROC_REMOVE] = 0, + [NFSPROC_RMDIR] = 0, + [NFSPROC_RENAME] = 0, + [NFSPROC_LINK] = 0, + [NFSPROC_READDIR] = 3, + [NFSPROC_READDIRPLUS] = 3, + [NFSPROC_FSSTAT] = 0, + [NFSPROC_FSINFO] = 0, + [NFSPROC_PATHCONF] = 0, + [NFSPROC_COMMIT] = 0, + [NFSPROC_NOOP] = 0, }; /* @@ -296,7 +320,18 @@ nfs_location_mntfromname(struct nfs_fs_locations *locs, struct nfs_location_inde p = s; if (!pathonly) { - cnt = snprintf(p, size, "%s:", fsl->nl_servers[idx.nli_serv]->ns_name); + char *name = fsl->nl_servers[idx.nli_serv]->ns_name; + if (name == NULL) { + name = ""; + } + if (*name == '\0') { + if (*fsl->nl_servers[idx.nli_serv]->ns_addresses[idx.nli_addr]) { + name = fsl->nl_servers[idx.nli_serv]->ns_addresses[idx.nli_addr]; + } + cnt = snprintf(p, size, "<%s>:", name); + } else { + cnt = snprintf(p, size, "%s:", name); + } p += cnt; size -= cnt; } @@ -329,7 +364,7 @@ nfs_connect_upcall(socket_t so, void *arg, __unused int waitflag) int error = 0, recv = 1; if (nso->nso_flags & NSO_CONNECTING) { - NFS_SOCK_DBG("nfs connect - socket %p upcall - connecting\n", nso); + NFS_SOCK_DBG("nfs connect - socket %p upcall - connecting flags = %8.8x\n", nso, nso->nso_flags); wakeup(nso->nso_wake); return; } @@ -340,7 +375,7 @@ nfs_connect_upcall(socket_t so, void *arg, __unused int waitflag) lck_mtx_unlock(&nso->nso_lock); return; } - NFS_SOCK_DBG("nfs connect - socket %p upcall\n", nso); + NFS_SOCK_DBG("nfs connect - socket %p upcall %8.8x\n", nso, nso->nso_flags); nso->nso_flags |= NSO_UPCALL; /* loop while we make error-free progress */ @@ -353,6 +388,7 @@ nfs_connect_upcall(socket_t so, void *arg, __unused int waitflag) m = NULL; if (nso->nso_sotype == SOCK_STREAM) { error = nfs_rpc_record_read(so, &nso->nso_rrs, MSG_DONTWAIT, &recv, &m); + NFS_SOCK_DBG("nfs_rpc_record_read returned %d recv = %d\n", error, recv); } else { rcvlen = 1000000; error = sock_receivembuf(so, NULL, &m, MSG_DONTWAIT, &rcvlen); @@ -365,6 +401,7 @@ nfs_connect_upcall(socket_t so, void *arg, __unused int waitflag) uint32_t reply = 0, rxid = 0, verf_type, verf_len; uint32_t reply_status, rejected_status, accepted_status; + NFS_SOCK_DUMP_MBUF("Got mbuf from ping", m); nfsm_chain_dissect_init(error, &nmrep, m); nfsm_chain_get_32(error, &nmrep, rxid); nfsm_chain_get_32(error, &nmrep, reply); @@ -386,6 +423,7 @@ nfs_connect_upcall(socket_t so, void *arg, __unused int waitflag) } nfsm_chain_get_32(error, &nmrep, accepted_status); nfsmout_if(error); + NFS_SOCK_DBG("Recevied accepted_status of %d nso_version = %d\n", accepted_status, nso->nso_version); if ((accepted_status == RPC_PROGMISMATCH) && !nso->nso_version) { uint32_t minvers, maxvers; nfsm_chain_get_32(error, &nmrep, minvers); @@ -454,6 +492,8 @@ nfs_connect_upcall(socket_t so, void *arg, __unused int waitflag) nfsmout: nso->nso_flags &= ~NSO_PINGING; if (error) { + NFS_SOCK_DBG("nfs upcalled failed for %d program %d vers error = %d\n", + nso->nso_protocol, nso->nso_version, error); nso->nso_error = error; nso->nso_flags |= NSO_DEAD; } else { @@ -469,6 +509,7 @@ nfsmout: nso->nso_flags &= ~NSO_UPCALL; if ((error != EWOULDBLOCK) && (error || !recv)) { /* problems with the socket... */ + NFS_SOCK_DBG("connect upcall failed %d\n", error); nso->nso_error = error ? error : EPIPE; nso->nso_flags |= NSO_DEAD; wakeup(nso->nso_wake); @@ -496,17 +537,29 @@ nfs_socket_create( struct nfs_socket *nso; struct timeval now; int error; +#define NFS_SOCKET_DEBUGGING #ifdef NFS_SOCKET_DEBUGGING - char naddr[MAX_IPv6_STR_LEN]; + char naddr[sizeof((struct sockaddr_un *)0)->sun_path]; void *sinaddr; - if (sa->sa_family == AF_INET) { - sinaddr = &((struct sockaddr_in*)sa)->sin_addr; - } else { - sinaddr = &((struct sockaddr_in6*)sa)->sin6_addr; - } - if (inet_ntop(sa->sa_family, sinaddr, naddr, sizeof(naddr)) != naddr) { - strlcpy(naddr, "", sizeof(naddr)); + switch (sa->sa_family) { + case AF_INET: + case AF_INET6: + if (sa->sa_family == AF_INET) { + sinaddr = &((struct sockaddr_in*)sa)->sin_addr; + } else { + sinaddr = &((struct sockaddr_in6*)sa)->sin6_addr; + } + if (inet_ntop(sa->sa_family, sinaddr, naddr, sizeof(naddr)) != naddr) { + strlcpy(naddr, "", sizeof(naddr)); + } + break; + case AF_LOCAL: + strlcpy(naddr, ((struct sockaddr_un *)sa)->sun_path, sizeof(naddr)); + break; + default: + strlcpy(naddr, "", sizeof(naddr)); + break; } #else char naddr[1] = { 0 }; @@ -533,10 +586,17 @@ nfs_socket_create( microuptime(&now); nso->nso_timestamp = now.tv_sec; bcopy(sa, nso->nso_saddr, sa->sa_len); - if (sa->sa_family == AF_INET) { - ((struct sockaddr_in*)nso->nso_saddr)->sin_port = htons(port); - } else if (sa->sa_family == AF_INET6) { - ((struct sockaddr_in6*)nso->nso_saddr)->sin6_port = htons(port); + switch (sa->sa_family) { + case AF_INET: + case AF_INET6: + if (sa->sa_family == AF_INET) { + ((struct sockaddr_in*)nso->nso_saddr)->sin_port = htons(port); + } else if (sa->sa_family == AF_INET6) { + ((struct sockaddr_in6*)nso->nso_saddr)->sin6_port = htons(port); + } + break; + case AF_LOCAL: + break; } nso->nso_protocol = protocol; nso->nso_version = vers; @@ -577,7 +637,7 @@ nfs_socket_create( resvport ? "r" : "", port, protocol, vers); nfs_socket_destroy(nso); } else { - NFS_SOCK_DBG("nfs connect %s created socket %p %s type %d%s port %d prot %d %d\n", + NFS_SOCK_DBG("nfs connect %s created socket %p <%s> type %d%s port %d prot %d %d\n", vfs_statfs(nmp->nm_mountp)->f_mntfromname, nso, naddr, sotype, resvport ? "r" : "", port, protocol, vers); *nsop = nso; @@ -591,8 +651,9 @@ nfs_socket_create( void nfs_socket_destroy(struct nfs_socket *nso) { - struct timespec ts = { 4, 0 }; + struct timespec ts = { .tv_sec = 4, .tv_nsec = 0 }; + NFS_SOCK_DBG("Destoring socket %p flags = %8.8x error = %d\n", nso, nso->nso_flags, nso->nso_error); lck_mtx_lock(&nso->nso_lock); nso->nso_flags |= NSO_DISCONNECTING; if (nso->nso_flags & NSO_UPCALL) { /* give upcall a chance to complete */ @@ -644,8 +705,8 @@ nfs_socket_options(struct nfsmount *nmp, struct nfs_socket *nso) sock_setsockopt(nso->nso_so, IPPROTO_TCP, TCP_NODELAY, &on, sizeof(on)); } } - if (nso->nso_sotype == SOCK_DGRAM) { /* set socket buffer sizes for UDP */ - int reserve = NFS_UDPSOCKBUF; + if (nso->nso_sotype == SOCK_DGRAM || nso->nso_saddr->sa_family == AF_LOCAL) { /* set socket buffer sizes for UDP */ + int reserve = (nso->nso_sotype == SOCK_DGRAM) ? NFS_UDPSOCKBUF : (2 * 1024 * 1024); sock_setsockopt(nso->nso_so, SOL_SOCKET, SO_SNDBUF, &reserve, sizeof(reserve)); sock_setsockopt(nso->nso_so, SOL_SOCKET, SO_RCVBUF, &reserve, sizeof(reserve)); } @@ -765,7 +826,24 @@ nfs_connect_search_new_socket(struct nfsmount *nmp, struct nfs_socket_search *ns fsl = nmp->nm_locations.nl_locations[nss->nss_nextloc.nli_loc]; fss = fsl->nl_servers[nss->nss_nextloc.nli_serv]; addrstr = fss->ns_addresses[nss->nss_nextloc.nli_addr]; + NFS_SOCK_DBG("Trying address %s for program %d on port %d\n", addrstr, nss->nss_protocol, nss->nss_port); + if (*addrstr == '\0') { + /* + * We have an unspecified local domain address. We use the program to translate to + * a well known local transport address. We only support PMAPROG and NFS for this. + */ + if (nss->nss_protocol == PMAPPROG) { + addrstr = (nss->nss_sotype == SOCK_DGRAM) ? RPCB_TICLTS_PATH : RPCB_TICOTSORD_PATH; + } else if (nss->nss_protocol == NFS_PROG) { + addrstr = nmp->nm_nfs_localport; + if (!addrstr || *addrstr == '\0') { + addrstr = (nss->nss_sotype == SOCK_DGRAM) ? NFS_TICLTS_PATH : NFS_TICOTSORD_PATH; + } + } + NFS_SOCK_DBG("Calling prog %d with <%s>\n", nss->nss_protocol, addrstr); + } if (!nfs_uaddr2sockaddr(addrstr, (struct sockaddr*)&ss)) { + NFS_SOCK_DBG("Could not convert address %s to socket\n", addrstr); nfs_location_next(&nmp->nm_locations, &nss->nss_nextloc); nss->nss_addrcnt -= 1; nss->nss_last = -2; @@ -773,6 +851,7 @@ nfs_connect_search_new_socket(struct nfsmount *nmp, struct nfs_socket_search *ns } /* Check that socket family is acceptable. */ if (nmp->nm_sofamily && (ss.ss_family != nmp->nm_sofamily)) { + NFS_SOCK_DBG("Skipping socket family %d, want mount family %d\n", ss.ss_family, nmp->nm_sofamily); nfs_location_next(&nmp->nm_locations, &nss->nss_nextloc); nss->nss_addrcnt -= 1; nss->nss_last = -2; @@ -791,6 +870,7 @@ nfs_connect_search_new_socket(struct nfsmount *nmp, struct nfs_socket_search *ns nso->nso_wake = nss; error = sock_setupcall(nso->nso_so, nfs_connect_upcall, nso); if (error) { + NFS_SOCK_DBG("sock_setupcall failed for socket %p setting nfs_connect_upcall error = %d\n", nso, error); lck_mtx_lock(&nso->nso_lock); nso->nso_error = error; nso->nso_flags |= NSO_DEAD; @@ -834,9 +914,14 @@ nfs_connect_search_socket_connect(struct nfsmount *nmp, struct nfs_socket *nso, /* initiate the connection */ nso->nso_flags |= NSO_CONNECTING; lck_mtx_unlock(&nso->nso_lock); - NFS_SOCK_DBG("nfs connect %s connecting socket %p\n", - vfs_statfs(nmp->nm_mountp)->f_mntfromname, nso); + NFS_SOCK_DBG("nfs connect %s connecting socket %p %s\n", + vfs_statfs(nmp->nm_mountp)->f_mntfromname, nso, + nso->nso_saddr->sa_family == AF_LOCAL ? ((struct sockaddr_un*)nso->nso_saddr)->sun_path : ""); error = sock_connect(nso->nso_so, nso->nso_saddr, MSG_DONTWAIT); + if (error) { + NFS_SOCK_DBG("nfs connect %s connecting socket %p returned %d\n", + vfs_statfs(nmp->nm_mountp)->f_mntfromname, nso, error); + } lck_mtx_lock(&nso->nso_lock); if (error && (error != EINPROGRESS)) { nso->nso_error = error; @@ -896,6 +981,7 @@ nfs_connect_search_ping(struct nfsmount *nmp, struct nfs_socket *nso, struct tim } } lck_mtx_unlock(&nso->nso_lock); + NFS_SOCK_DBG("Pinging socket %p %d %d %d\n", nso, nso->nso_sotype, nso->nso_protocol, vers); error = nfsm_rpchead2(nmp, nso->nso_sotype, nso->nso_protocol, vers, 0, RPCAUTH_SYS, vfs_context_ucred(vfs_context_kernel()), NULL, NULL, &xid, &mreq); lck_mtx_lock(&nso->nso_lock); @@ -912,6 +998,7 @@ nfs_connect_search_ping(struct nfsmount *nmp, struct nfs_socket *nso, struct tim reqlen += mbuf_len(m); } lck_mtx_unlock(&nso->nso_lock); + NFS_SOCK_DUMP_MBUF("Sending ping packet", mreq); error = sock_sendmbuf(nso->nso_so, &msg, mreq, 0, &sentlen); NFS_SOCK_DBG("nfs connect %s verifying socket %p send rv %d\n", vfs_statfs(nmp->nm_mountp)->f_mntfromname, nso, error); @@ -975,8 +1062,8 @@ nfs_connect_search_socket_reap(struct nfsmount *nmp __unused, struct nfs_socket_ continue; } lck_mtx_unlock(&nso->nso_lock); - NFS_SOCK_DBG("nfs connect %s reaping socket %p %d\n", - vfs_statfs(nmp->nm_mountp)->f_mntfromname, nso, nso->nso_error); + NFS_SOCK_DBG("nfs connect %s reaping socket %p error = %d flags = %8.8x\n", + vfs_statfs(nmp->nm_mountp)->f_mntfromname, nso, nso->nso_error, nso->nso_flags); nfs_socket_search_update_error(nss, nso->nso_error); TAILQ_REMOVE(&nss->nss_socklist, nso, nso_link); nss->nss_sockcnt--; @@ -1113,7 +1200,10 @@ nfs_connect(struct nfsmount *nmp, int verbose, int timeo) struct sockaddr_storage ss; struct sockaddr *saddr, *oldsaddr; sock_upcall upcall; - struct timeval now, start; +#if CONFIG_NFS4 + struct timeval now; +#endif + struct timeval start; int error, savederror, nfsvers; int tryv4 = 1; uint8_t sotype = nmp->nm_sotype ? nmp->nm_sotype : SOCK_STREAM; @@ -1167,26 +1257,34 @@ tryagain: /* First time connecting, we may need to negotiate some things */ if (!(nmp->nm_sockflags & NMSOCK_HASCONNECTED)) { + NFS_SOCK_DBG("so_family = %d\n", nmp->nm_sofamily); + NFS_SOCK_DBG("nfs port = %d local: <%s>\n", nmp->nm_nfsport, nmp->nm_nfs_localport ? nmp->nm_nfs_localport : ""); + NFS_SOCK_DBG("mount port = %d local: <%s>\n", nmp->nm_mountport, nmp->nm_mount_localport ? nmp->nm_mount_localport : ""); if (!nmp->nm_vers) { /* No NFS version specified... */ if (!nmp->nm_nfsport || (!NM_OMATTR_GIVEN(nmp, FH) && !nmp->nm_mountport)) { +#if CONFIG_NFS4 if (PVER2MAJOR(nmp->nm_max_vers) >= NFS_VER4 && tryv4) { nss.nss_port = NFS_PORT; nss.nss_protocol = NFS_PROG; nss.nss_version = 4; nss.nss_flags |= NSS_FALLBACK2PMAP; } else { - /* ...connect to portmapper first if we (may) need any ports. */ - nss.nss_port = PMAPPORT; - nss.nss_protocol = PMAPPROG; - nss.nss_version = 0; - } +#endif + /* ...connect to portmapper first if we (may) need any ports. */ + nss.nss_port = PMAPPORT; + nss.nss_protocol = PMAPPROG; + nss.nss_version = 0; +#if CONFIG_NFS4 + } +#endif } else { /* ...connect to NFS port first. */ nss.nss_port = nmp->nm_nfsport; nss.nss_protocol = NFS_PROG; nss.nss_version = 0; } +#if CONFIG_NFS4 } else if (nmp->nm_vers >= NFS_VER4) { if (tryv4) { /* For NFSv4, we use the given (or default) port. */ @@ -1206,6 +1304,7 @@ tryagain: nss.nss_protocol = PMAPPROG; nss.nss_version = 0; } +#endif } else { /* For NFSv3/v2... */ if (!nmp->nm_nfsport || (!NM_OMATTR_GIVEN(nmp, FH) && !nmp->nm_mountport)) { @@ -1304,9 +1403,14 @@ keepsearching: /* We may be speaking to portmap first... to determine port(s). */ if (nso->nso_saddr->sa_family == AF_INET) { port = ntohs(((struct sockaddr_in*)nso->nso_saddr)->sin_port); - } else { + } else if (nso->nso_saddr->sa_family == AF_INET6) { port = ntohs(((struct sockaddr_in6*)nso->nso_saddr)->sin6_port); + } else if (nso->nso_saddr->sa_family == AF_LOCAL) { + if (nso->nso_protocol == PMAPPROG) { + port = PMAPPORT; + } } + if (port == PMAPPORT) { /* Use this portmapper port to get the port #s we need. */ NFS_SOCK_DBG("nfs connect %s got portmapper socket %p\n", @@ -1325,29 +1429,46 @@ keepsearching: ((struct sockaddr_in*)&ss)->sin_port = htons(0); } else if (ss.ss_family == AF_INET6) { ((struct sockaddr_in6*)&ss)->sin6_port = htons(0); + } else if (ss.ss_family == AF_LOCAL) { + if (((struct sockaddr_un*)&ss)->sun_path[0] == '/') { + NFS_SOCK_DBG("Looking up NFS socket over %s\n", ((struct sockaddr_un*)&ss)->sun_path); + } } for (; nfsvers >= (int)PVER2MAJOR(nmp->nm_min_vers); nfsvers--) { if (nmp->nm_vers && nmp->nm_vers != nfsvers) { continue; /* Wrong version */ } +#if CONFIG_NFS4 if (nfsvers == NFS_VER4 && nso->nso_sotype == SOCK_DGRAM) { continue; /* NFSv4 does not do UDP */ } - error = nfs_portmap_lookup(nmp, vfs_context_current(), (struct sockaddr*)&ss, - nso->nso_so, NFS_PROG, nfsvers, - (nso->nso_sotype == SOCK_DGRAM) ? IPPROTO_UDP : IPPROTO_TCP, timeo); +#endif + if (ss.ss_family == AF_LOCAL && nmp->nm_nfs_localport) { + struct sockaddr_un *sun = (struct sockaddr_un *)&ss; + NFS_SOCK_DBG("Using supplied local address %s for NFS_PROG\n", nmp->nm_nfs_localport); + strlcpy(sun->sun_path, nmp->nm_nfs_localport, sizeof(sun->sun_path)); + error = 0; + } else { + NFS_SOCK_DBG("Calling Portmap/Rpcbind for NFS_PROG"); + error = nfs_portmap_lookup(nmp, vfs_context_current(), (struct sockaddr*)&ss, + nso->nso_so, NFS_PROG, nfsvers, nso->nso_sotype, timeo); + } if (!error) { if (ss.ss_family == AF_INET) { port = ntohs(((struct sockaddr_in*)&ss)->sin_port); } else if (ss.ss_family == AF_INET6) { port = ntohs(((struct sockaddr_in6*)&ss)->sin6_port); + } else if (ss.ss_family == AF_LOCAL) { + port = ((struct sockaddr_un *)&ss)->sun_path[0] ? NFS_PORT : 0; } if (!port) { error = EPROGUNAVAIL; } +#if CONFIG_NFS4 if (port == NFS_PORT && nfsvers == NFS_VER4 && tryv4 == 0) { continue; /* We already tried this */ } +#endif } if (!error) { break; @@ -1359,16 +1480,25 @@ keepsearching: if (error) { nfs_socket_search_update_error(&nss, error); nfs_socket_destroy(nso); + NFS_SOCK_DBG("Could not lookup NFS socket address for version %d error = %d\n", nfsvers, error); goto keepsearching; } + } else if (nmp->nm_nfs_localport) { + strlcpy(((struct sockaddr_un*)&ss)->sun_path, nmp->nm_nfs_localport, sizeof(((struct sockaddr_un*)&ss)->sun_path)); + NFS_SOCK_DBG("Using supplied nfs_local_port %s for NFS_PROG\n", nmp->nm_nfs_localport); } + /* Create NFS protocol socket and add it to the list of sockets. */ /* N.B. If nfsvers is NFS_VER4 at this point then we're on a non standard port */ + if (ss.ss_family == AF_LOCAL) { + NFS_SOCK_DBG("Creating NFS socket for %s port = %d\n", ((struct sockaddr_un*)&ss)->sun_path, port); + } error = nfs_socket_create(nmp, (struct sockaddr*)&ss, nso->nso_sotype, port, NFS_PROG, nfsvers, NMFLAG(nmp, RESVPORT), &nsonfs); if (error) { nfs_socket_search_update_error(&nss, error); nfs_socket_destroy(nso); + NFS_SOCK_DBG("Could not create NFS socket: %d\n", error); goto keepsearching; } nsonfs->nso_location = nso->nso_location; @@ -1378,6 +1508,7 @@ keepsearching: nfs_socket_search_update_error(&nss, error); nfs_socket_destroy(nsonfs); nfs_socket_destroy(nso); + NFS_SOCK_DBG("Could not nfs_connect_upcall: %d", error); goto keepsearching; } TAILQ_INSERT_TAIL(&nss.nss_socklist, nsonfs, nso_link); @@ -1387,24 +1518,31 @@ keepsearching: error = 0; bcopy(nso->nso_saddr, &ss, nso->nso_saddr->sa_len); port = nmp->nm_mountport; + NFS_SOCK_DBG("mount port = %d\n", port); if (ss.ss_family == AF_INET) { ((struct sockaddr_in*)&ss)->sin_port = htons(port); } else if (ss.ss_family == AF_INET6) { ((struct sockaddr_in6*)&ss)->sin6_port = htons(port); + } else if (ss.ss_family == AF_LOCAL && nmp->nm_mount_localport) { + NFS_SOCK_DBG("Setting mount address to %s port = %d\n", nmp->nm_mount_localport, nmp->nm_mountport); + strlcpy(((struct sockaddr_un*)&ss)->sun_path, nmp->nm_mount_localport, sizeof(((struct sockaddr_un*)&ss)->sun_path)); } if (!port) { /* Get port/sockaddr for MOUNT version corresponding to NFS version. */ /* If NFS version is unknown, optimistically choose for NFSv3. */ int mntvers = (nfsvers == NFS_VER2) ? RPCMNT_VER1 : RPCMNT_VER3; int mntproto = (NM_OMFLAG(nmp, MNTUDP) || (nso->nso_sotype == SOCK_DGRAM)) ? IPPROTO_UDP : IPPROTO_TCP; + NFS_SOCK_DBG("Looking up mount port with socket %p\n", nso->nso_so); error = nfs_portmap_lookup(nmp, vfs_context_current(), (struct sockaddr*)&ss, - nso->nso_so, RPCPROG_MNT, mntvers, mntproto, timeo); + nso->nso_so, RPCPROG_MNT, mntvers, mntproto == IPPROTO_UDP ? SOCK_DGRAM : SOCK_STREAM, timeo); } if (!error) { if (ss.ss_family == AF_INET) { port = ntohs(((struct sockaddr_in*)&ss)->sin_port); } else if (ss.ss_family == AF_INET6) { port = ntohs(((struct sockaddr_in6*)&ss)->sin6_port); + } else if (ss.ss_family == AF_LOCAL) { + port = (((struct sockaddr_un*)&ss)->sun_path[0] != '\0'); } if (!port) { error = EPROGUNAVAIL; @@ -1421,12 +1559,14 @@ keepsearching: bcopy(&ss, nsonfs->nso_saddr2, ss.ss_len); } if (error) { + NFS_SOCK_DBG("Could not create mount sockaet address %d", error); lck_mtx_lock(&nsonfs->nso_lock); nsonfs->nso_error = error; nsonfs->nso_flags |= NSO_DEAD; lck_mtx_unlock(&nsonfs->nso_lock); } } + NFS_SOCK_DBG("Destroying socket %p so %p\n", nso, nso->nso_so); nfs_socket_destroy(nso); goto keepsearching; } @@ -1443,19 +1583,23 @@ keepsearching: saddr = nso->nso_saddr2; if (!saddr) { /* Need sockaddr for MOUNT port */ + NFS_SOCK_DBG("Getting mount address mountport = %d, mount_localport = %s\n", nmp->nm_mountport, nmp->nm_mount_localport); bcopy(nso->nso_saddr, &ss, nso->nso_saddr->sa_len); port = nmp->nm_mountport; if (ss.ss_family == AF_INET) { ((struct sockaddr_in*)&ss)->sin_port = htons(port); } else if (ss.ss_family == AF_INET6) { ((struct sockaddr_in6*)&ss)->sin6_port = htons(port); + } else if (ss.ss_family == AF_LOCAL && nmp->nm_mount_localport) { + NFS_SOCK_DBG("Setting mount address to %s port = %d\n", nmp->nm_mount_localport, nmp->nm_mountport); + strlcpy(((struct sockaddr_un*)&ss)->sun_path, nmp->nm_mount_localport, sizeof(((struct sockaddr_un*)&ss)->sun_path)); } if (!port) { /* Get port/sockaddr for MOUNT version corresponding to NFS version. */ int mntvers = (nfsvers == NFS_VER2) ? RPCMNT_VER1 : RPCMNT_VER3; - int mntproto = (NM_OMFLAG(nmp, MNTUDP) || (nso->nso_sotype == SOCK_DGRAM)) ? IPPROTO_UDP : IPPROTO_TCP; + int so_type = NM_OMFLAG(nmp, MNTUDP) ? SOCK_DGRAM : nso->nso_sotype; error = nfs_portmap_lookup(nmp, vfs_context_current(), (struct sockaddr*)&ss, - NULL, RPCPROG_MNT, mntvers, mntproto, timeo); + NULL, RPCPROG_MNT, mntvers, so_type, timeo); if (ss.ss_family == AF_INET) { port = ntohs(((struct sockaddr_in*)&ss)->sin_port); } else if (ss.ss_family == AF_INET6) { @@ -1588,6 +1732,7 @@ keepsearching: } if (!nmp->nm_vers) { nmp->nm_vers = nfsvers; +#if CONFIG_NFS4 /* If we negotiated NFSv4, set nm_nfsport if we ended up on the standard NFS port */ if ((nfsvers >= NFS_VER4) && !NFS_BITMAP_ISSET(nmp->nm_mattrs, NFS_MATTR_NFS_PORT)) { if (nso->nso_saddr->sa_family == AF_INET) { @@ -1601,7 +1746,9 @@ keepsearching: nmp->nm_nfsport = NFS_PORT; } } +#endif } +#if CONFIG_NFS4 /* do some version-specific pre-mount set up */ if (nmp->nm_vers >= NFS_VER4) { microtime(&now); @@ -1610,6 +1757,7 @@ keepsearching: nfs4_mount_callback_setup(nmp); } } +#endif } /* Initialize NFS socket state variables */ @@ -1649,6 +1797,7 @@ keepsearching: nmp->nm_sotype = 0; } if (!NFS_BITMAP_ISSET(nmp->nm_mattrs, NFS_MATTR_NFS_VERSION)) { +#if CONFIG_NFS4 if (nmp->nm_vers >= NFS_VER4) { if (!NFS_BITMAP_ISSET(nmp->nm_mattrs, NFS_MATTR_NFS_PORT)) { nmp->nm_nfsport = 0; @@ -1661,6 +1810,7 @@ keepsearching: } bzero(&nmp->nm_un, sizeof(nmp->nm_un)); } +#endif nmp->nm_vers = 0; } } @@ -1709,10 +1859,14 @@ keepsearching: /* setup & confirm socket connection is functional */ int -nfs_connect_setup(struct nfsmount *nmp) +nfs_connect_setup( +#if !CONFIG_NFS4 + __unused +#endif + struct nfsmount *nmp) { int error = 0; - +#if CONFIG_NFS4 if (nmp->nm_vers >= NFS_VER4) { if (nmp->nm_state & NFSSTA_CLIENTID) { /* first, try to renew our current state */ @@ -1729,6 +1883,7 @@ nfs_connect_setup(struct nfsmount *nmp) } error = nfs4_setclientid(nmp); } +#endif return error; } @@ -1840,7 +1995,7 @@ nfs_disconnect(struct nfsmount *nmp) lck_mtx_lock(&nmp->nm_lock); tryagain: if (nmp->nm_nso) { - struct timespec ts = { 1, 0 }; + struct timespec ts = { .tv_sec = 1, .tv_nsec = 0 }; if (nmp->nm_state & NFSSTA_SENDING) { /* wait for sending to complete */ nmp->nm_state |= NFSSTA_WANTSND; msleep(&nmp->nm_state, &nmp->nm_lock, PZERO - 1, "nfswaitsending", &ts); @@ -1909,7 +2064,7 @@ void nfs_mount_sock_thread(void *arg, __unused wait_result_t wr) { struct nfsmount *nmp = arg; - struct timespec ts = { 30, 0 }; + struct timespec ts = { .tv_sec = 30, .tv_nsec = 0 }; thread_t thd = current_thread(); struct nfsreq *req; struct timeval now; @@ -1949,7 +2104,7 @@ nfs_mount_sock_thread(void *arg, __unused wait_result_t wr) if (error == EIO || error == EINTR) { lvl = (do_reconnect_sleep++ % 600) ? 7 : 0; } - nfs_printf(NFS_FAC_SOCK, lvl, "nfs reconnect %s: returned %d\n", + NFS_DBG(NFS_FAC_SOCK, lvl, "nfs reconnect %s: returned %d\n", vfs_statfs(nmp->nm_mountp)->f_mntfromname, error); } else { nmp->nm_reconnect_start = 0; @@ -1966,6 +2121,7 @@ nfs_mount_sock_thread(void *arg, __unused wait_result_t wr) nfs_recover(nmp); lck_mtx_lock(&nmp->nm_lock); } +#if CONFIG_NFS4 /* handle NFSv4 delegation returns */ while ((nmp->nm_vers >= NFS_VER4) && !(nmp->nm_state & (NFSSTA_FORCE | NFSSTA_DEAD)) && (nmp->nm_sockflags & NMSOCK_READY) && !(nmp->nm_state & NFSSTA_RECOVER) && @@ -1974,6 +2130,7 @@ nfs_mount_sock_thread(void *arg, __unused wait_result_t wr) nfs4_delegation_return(np, R_RECOVER, thd, nmp->nm_mcred); lck_mtx_lock(&nmp->nm_lock); } +#endif /* do resends, if necessary/possible */ while ((((nmp->nm_sockflags & NMSOCK_READY) && !(nmp->nm_state & NFSSTA_RECOVER)) || (nmp->nm_state & (NFSSTA_FORCE | NFSSTA_DEAD))) && @@ -2010,6 +2167,7 @@ nfs_mount_sock_thread(void *arg, __unused wait_result_t wr) lck_mtx_unlock(&req->r_mtx); /* async RPCs on GSS mounts need to be rebuilt and resent. */ nfs_reqdequeue(req); +#if CONFIG_NFS_GSS if (nfs_request_using_gss(req)) { nfs_gss_clnt_rpcdone(req); error = nfs_gss_clnt_args_restore(req); @@ -2017,6 +2175,7 @@ nfs_mount_sock_thread(void *arg, __unused wait_result_t wr) req->r_xid = 0; } } +#endif /* CONFIG_NFS_GSS */ NFS_SOCK_DBG("nfs async%s restart: p %d x 0x%llx f 0x%x rtt %d\n", nfs_request_using_gss(req) ? " gss" : "", req->r_procnum, req->r_xid, req->r_flags, req->r_rtt); @@ -2227,6 +2386,7 @@ struct nfs_callback_socket { #define NCBSOCK_UPCALLWANT 0x0002 #define NCBSOCK_DEAD 0x0004 +#if CONFIG_NFS4 /* * NFS callback channel state * @@ -2415,7 +2575,7 @@ nfs4_mount_callback_shutdown(struct nfsmount *nmp) struct nfs_callback_socket *ncbsp; socket_t so, so6; struct nfs4_cb_sock_list cb_socks; - struct timespec ts = {1, 0}; + struct timespec ts = { .tv_sec = 1, .tv_nsec = 0 }; lck_mtx_lock(nfs_global_mutex); TAILQ_REMOVE(&nfs4_cb_mounts, nmp, nm_cblink); @@ -2592,7 +2752,7 @@ void nfs4_cb_rcv(socket_t so, void *arg, __unused int waitflag) { struct nfs_callback_socket *ncbsp = arg; - struct timespec ts = {1, 0}; + struct timespec ts = { .tv_sec = 1, .tv_nsec = 0 }; struct timeval now; mbuf_t m; int error = 0, recv = 1; @@ -3001,7 +3161,7 @@ out: } return error; } - +#endif /* CONFIG_NFS4 */ /* * Initialize an nfs_rpc_record_state structure. @@ -3155,7 +3315,7 @@ nfs_send(struct nfsreq *req, int wait) struct sockaddr *sendnam; mbuf_t mreqcopy; size_t sentlen = 0; - struct timespec ts = { 2, 0 }; + struct timespec ts = { .tv_sec = 2, .tv_nsec = 0 }; again: error = nfs_sndlock(req); @@ -3345,6 +3505,7 @@ again: msg.msg_name = (caddr_t)sendnam; msg.msg_namelen = sendnam->sa_len; } + NFS_SOCK_DUMP_MBUF("Sending mbuf\n", mreqcopy); error = sock_sendmbuf(nso->nso_so, &msg, mreqcopy, 0, &sentlen); if (error || (sentlen != req->r_mreqlen)) { NFS_SOCK_DBG("nfs_send: 0x%llx sent %d/%d error %d\n", @@ -3731,9 +3892,11 @@ nfs_request_match_reply(struct nfsmount *nmp, mbuf_t mrep) /* signal anyone waiting on this request */ wakeup(req); asyncioq = (req->r_callback.rcb_func != NULL); +#if CONFIG_NFS_GSS if (nfs_request_using_gss(req)) { nfs_gss_clnt_rpcdone(req); } +#endif /* CONFIG_NFS_GSS */ lck_mtx_unlock(&req->r_mtx); lck_mtx_unlock(nfs_request_mutex); /* if it's an async RPC with a callback, queue it up */ @@ -3758,7 +3921,7 @@ nfs_request_match_reply(struct nfsmount *nmp, mbuf_t mrep) int nfs_wait_reply(struct nfsreq *req) { - struct timespec ts = { 2, 0 }; + struct timespec ts = { .tv_sec = 2, .tv_nsec = 0 }; int error = 0, slpflag, first = 1; if (req->r_nmp && NMFLAG(req->r_nmp, INTR) && req->r_thread && !(req->r_flags & R_NOINTR)) { @@ -3949,9 +4112,12 @@ void nfs_request_destroy(struct nfsreq *req) { struct nfsmount *nmp; - struct gss_seq *gsp, *ngsp; int clearjbtimeo = 0; +#if CONFIG_NFS_GSS + struct gss_seq *gsp, *ngsp; +#endif + if (!req || !(req->r_flags & R_INITTED)) { return; } @@ -4025,6 +4191,7 @@ nfs_request_destroy(struct nfsreq *req) if (IS_VALID_CRED(req->r_cred)) { kauth_cred_unref(&req->r_cred); } +#if CONFIG_NFS_GSS if (nfs_request_using_gss(req)) { nfs_gss_clnt_rpcdone(req); } @@ -4033,6 +4200,7 @@ nfs_request_destroy(struct nfsreq *req) if (req->r_gss_ctx) { nfs_gss_clnt_ctx_unref(req); } +#endif /* CONFIG_NFS_GSS */ if (req->r_wrongsec) { FREE(req->r_wrongsec, M_TEMP); } @@ -4233,6 +4401,7 @@ nfs_request_finish( lck_mtx_unlock(&nmp->nm_lock); } +#if CONFIG_NFS_GSS if (nfs_request_using_gss(req)) { /* * If the request used an RPCSEC_GSS credential @@ -4261,6 +4430,7 @@ nfs_request_finish( goto nfsmout; } } +#endif /* CONFIG_NFS_GSS */ /* * If there was a successful reply, make sure to mark the mount as up. @@ -4297,6 +4467,7 @@ nfs_request_finish( nfsm_chain_get_32(error, &nmrep, auth_status); nfsmout_if(error); switch (auth_status) { +#if CONFIG_NFS_GSS case RPCSEC_GSS_CREDPROBLEM: case RPCSEC_GSS_CTXPROBLEM: /* @@ -4321,6 +4492,7 @@ nfs_request_finish( req->r_xid = 0; // get a new XID req->r_flags |= R_RESTART; goto nfsmout; +#endif /* CONFIG_NFS_GSS */ default: error = EACCES; break; @@ -4342,12 +4514,14 @@ nfs_request_finish( } nfsm_chain_get_32(error, &nmrep, accepted_status); break; +#if CONFIG_NFS_GSS case RPCAUTH_KRB5: case RPCAUTH_KRB5I: case RPCAUTH_KRB5P: error = nfs_gss_clnt_verf_get(req, &nmrep, verf_type, verf_len, &accepted_status); break; +#endif /* CONFIG_NFS_GSS */ } nfsmout_if(error); @@ -4432,6 +4606,7 @@ nfs_request_finish( nfs_up(nmp, req->r_thread, clearjbtimeo, "resource available again"); } +#if CONFIG_NFS4 if ((nmp->nm_vers >= NFS_VER4) && (*status == NFSERR_WRONGSEC)) { /* * Hmmm... we need to try a different security flavor. @@ -4524,7 +4699,7 @@ nfs_request_finish( req->r_np->n_auth = req->r_auth; } } - +#endif /* CONFIG_NFS4 */ if (*status == NFS_OK) { /* * Successful NFS request @@ -4676,6 +4851,7 @@ nfs_request2( } +#if CONFIG_NFS_GSS /* * Set up a new null proc request to exchange GSS context tokens with the * server. Associate the context that we are setting up with the request that we @@ -4744,6 +4920,7 @@ nfs_request_gss( return error; } +#endif /* CONFIG_NFS_GSS */ /* * Create and start an asynchronous NFS request. @@ -4790,7 +4967,7 @@ nfs_request_async( if (!error && !(req->r_flags & R_SENT) && req->r_callback.rcb_func) { /* make sure to wait until this async I/O request gets sent */ int slpflag = (req->r_nmp && NMFLAG(req->r_nmp, INTR) && req->r_thread && !(req->r_flags & R_NOINTR)) ? PCATCH : 0; - struct timespec ts = { 2, 0 }; + struct timespec ts = { .tv_sec = 2, .tv_nsec = 0 }; while (!(req->r_flags & R_SENT)) { nmp = req->r_nmp; if ((req->r_flags & R_RESENDQ) && !nfs_mount_gone(nmp)) { @@ -4859,7 +5036,7 @@ nfs_request_async_finish( req->r_flags |= R_ASYNCWAIT; } while (req->r_flags & R_RESENDQ) { /* wait until the request is off the resend queue */ - struct timespec ts = { 2, 0 }; + struct timespec ts = { .tv_sec = 2, .tv_nsec = 0 }; if ((nmp = req->r_nmp)) { lck_mtx_lock(&nmp->nm_lock); @@ -5409,7 +5586,7 @@ nfs_sndlock(struct nfsreq *req) struct nfsmount *nmp = req->r_nmp; int *statep; int error = 0, slpflag = 0; - struct timespec ts = { 0, 0 }; + struct timespec ts = { .tv_sec = 0, .tv_nsec = 0 }; if (nfs_mount_gone(nmp)) { return ENXIO; @@ -5486,7 +5663,7 @@ nfs_aux_request( int error = 0, on = 1, try, sendat = 2, soproto, recv, optlen, restoreto = 0; socket_t newso = NULL; struct sockaddr_storage ss; - struct timeval orig_rcvto, orig_sndto, tv = { 1, 0 }; + struct timeval orig_rcvto, orig_sndto, tv = { .tv_sec = 1, .tv_usec = 0 }; mbuf_t m, mrep = NULL; struct msghdr msg; uint32_t rxid = 0, reply = 0, reply_status, rejected_status; @@ -5496,12 +5673,16 @@ nfs_aux_request( if (!so) { /* create socket and set options */ - soproto = (sotype == SOCK_DGRAM) ? IPPROTO_UDP : IPPROTO_TCP; + if (saddr->sa_family == AF_LOCAL) { + soproto = 0; + } else { + soproto = (sotype == SOCK_DGRAM) ? IPPROTO_UDP : IPPROTO_TCP; + } if ((error = sock_socket(saddr->sa_family, sotype, soproto, NULL, NULL, &newso))) { goto nfsmout; } - if (bindresv) { + if (bindresv && saddr->sa_family != AF_LOCAL) { int level = (saddr->sa_family == AF_INET) ? IPPROTO_IP : IPPROTO_IPV6; int optname = (saddr->sa_family == AF_INET) ? IP_PORTRANGE : IPV6_PORTRANGE; int portrange = IP_PORTRANGE_LOW; @@ -5673,13 +5854,23 @@ nfs_portmap_lookup( socket_t so, uint32_t protocol, uint32_t vers, - uint32_t ipproto, + uint32_t stype, int timeo) { thread_t thd = vfs_context_thread(ctx); kauth_cred_t cred = vfs_context_ucred(ctx); struct sockaddr_storage ss; struct sockaddr *saddr = (struct sockaddr*)&ss; + static struct sockaddr_un rpcbind_cots = { + sizeof(struct sockaddr_un), + AF_LOCAL, + RPCB_TICOTSORD_PATH + }; + static struct sockaddr_un rpcbind_clts = { + sizeof(struct sockaddr_un), + AF_LOCAL, + RPCB_TICLTS_PATH + }; struct nfsm_chain nmreq, nmrep; mbuf_t mreq; int error = 0, ip, pmprog, pmvers, pmproc; @@ -5699,6 +5890,13 @@ nfs_portmap_lookup( pmprog = RPCBPROG; pmvers = RPCBVERS4; pmproc = RPCBPROC_GETVERSADDR; + } else if (saddr->sa_family == AF_LOCAL) { + ip = 0; + pmprog = RPCBPROG; + pmvers = RPCBVERS4; + pmproc = RPCBPROC_GETVERSADDR; + NFS_SOCK_DBG("%s\n", ((struct sockaddr_un*)sa)->sun_path); + saddr = (struct sockaddr*)((stype == SOCK_STREAM) ? &rpcbind_cots : &rpcbind_clts); } else { return EINVAL; } @@ -5709,33 +5907,46 @@ tryagain: /* send portmapper request to get port/uaddr */ if (ip == 4) { ((struct sockaddr_in*)saddr)->sin_port = htons(PMAPPORT); - } else { + } else if (ip == 6) { ((struct sockaddr_in6*)saddr)->sin6_port = htons(PMAPPORT); } nfsm_chain_build_alloc_init(error, &nmreq, 8 * NFSX_UNSIGNED); nfsm_chain_add_32(error, &nmreq, protocol); nfsm_chain_add_32(error, &nmreq, vers); if (ip == 4) { - nfsm_chain_add_32(error, &nmreq, ipproto); + nfsm_chain_add_32(error, &nmreq, stype == SOCK_STREAM ? IPPROTO_TCP : IPPROTO_UDP); nfsm_chain_add_32(error, &nmreq, 0); } else { - if (ipproto == IPPROTO_TCP) { - nfsm_chain_add_string(error, &nmreq, "tcp6", 4); + if (stype == SOCK_STREAM) { + if (ip == 6) { + nfsm_chain_add_string(error, &nmreq, "tcp6", 4); + } else { + nfsm_chain_add_string(error, &nmreq, "ticotsord", 9); + } } else { - nfsm_chain_add_string(error, &nmreq, "udp6", 4); + if (ip == 6) { + nfsm_chain_add_string(error, &nmreq, "udp6", 4); + } else { + nfsm_chain_add_string(error, &nmreq, "ticlts", 6); + } } nfsm_chain_add_string(error, &nmreq, "", 0); /* uaddr */ nfsm_chain_add_string(error, &nmreq, "", 0); /* owner */ } nfsm_chain_build_done(error, &nmreq); nfsmout_if(error); - error = nfsm_rpchead2(nmp, (ipproto == IPPROTO_UDP) ? SOCK_DGRAM : SOCK_STREAM, - pmprog, pmvers, pmproc, RPCAUTH_SYS, cred, NULL, nmreq.nmc_mhead, - &xid, &mreq); + error = nfsm_rpchead2(nmp, stype, pmprog, pmvers, pmproc, + RPCAUTH_SYS, cred, NULL, nmreq.nmc_mhead, &xid, &mreq); nfsmout_if(error); nmreq.nmc_mhead = NULL; - error = nfs_aux_request(nmp, thd, saddr, so, (ipproto == IPPROTO_UDP) ? SOCK_DGRAM : SOCK_STREAM, - mreq, R_XID32(xid), 0, timeo, &nmrep); + + NFS_SOCK_DUMP_MBUF("nfs_portmap_loockup request", mreq); + error = nfs_aux_request(nmp, thd, saddr, so, + stype, mreq, R_XID32(xid), 0, timeo, &nmrep); + NFS_SOCK_DUMP_MBUF("nfs_portmap_lookup reply", nmrep.nmc_mhead); + NFS_SOCK_DBG("rpcbind request returned %d for program %u vers %u: %s\n", error, protocol, vers, + (saddr->sa_family == AF_LOCAL) ? ((struct sockaddr_un *)saddr)->sun_path : + (saddr->sa_family == AF_INET6) ? "INET6 socket" : "INET socket"); /* grab port from portmap response */ if (ip == 4) { @@ -5753,9 +5964,15 @@ tryagain: if (ualen < 1) { /* program is not available, just return a zero port */ bcopy(sa, saddr, min(sizeof(ss), sa->sa_len)); - ((struct sockaddr_in6*)saddr)->sin6_port = htons(0); + if (ip == 6) { + ((struct sockaddr_in6*)saddr)->sin6_port = htons(0); + } else { + ((struct sockaddr_un*)saddr)->sun_path[0] = '\0'; + } + NFS_SOCK_DBG("Program %u version %u unavailable", protocol, vers); } else { nfsm_chain_get_opaque(error, &nmrep, ualen, uaddr); + NFS_SOCK_DBG("Got uaddr %s\n", uaddr); if (!error) { uaddr[ualen] = '\0'; if (!nfs_uaddr2sockaddr(uaddr, saddr)) { @@ -5785,6 +6002,8 @@ tryagain: nfsmout: nfsm_chain_cleanup(&nmreq); nfsm_chain_cleanup(&nmrep); + NFS_SOCK_DBG("Returned %d\n", error); + return error; } @@ -6247,6 +6466,9 @@ nfsrv_send(struct nfsrv_sock *slp, mbuf_t nam, mbuf_t top) msg.msg_namelen = sendnam->sa_len; } } + if (NFS_IS_DBG(NFS_FAC_SRV, 15)) { + nfs_dump_mbuf(__func__, __LINE__, "nfsrv_send\n", top); + } error = sock_sendmbuf(so, &msg, top, 0, NULL); if (!error) { return 0; diff --git a/bsd/nfs/nfs_subs.c b/bsd/nfs/nfs_subs.c index 6b8cf9140..6a6878fc5 100644 --- a/bsd/nfs/nfs_subs.c +++ b/bsd/nfs/nfs_subs.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2016 Apple Inc. All rights reserved. + * Copyright (c) 2000-2018 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -79,6 +79,7 @@ #include #include #include +#include #include #include #include @@ -88,6 +89,7 @@ #include #include #include +#include #include #include @@ -889,7 +891,10 @@ nfsm_chain_add_v2sattr_f(struct nfsm_chain *nmc, struct vnode_attr *vap, uint32_ * Add an NFSv3 "sattr" structure to an mbuf chain */ int -nfsm_chain_add_v3sattr_f(struct nfsm_chain *nmc, struct vnode_attr *vap) +nfsm_chain_add_v3sattr_f( + struct nfsmount *nmp, + struct nfsm_chain *nmc, + struct vnode_attr *vap) { int error = 0; @@ -937,6 +942,7 @@ nfsm_chain_add_v3sattr_f(struct nfsm_chain *nmc, struct vnode_attr *vap) } } + return error; } @@ -948,6 +954,7 @@ nfsm_chain_add_v3sattr_f(struct nfsm_chain *nmc, struct vnode_attr *vap) */ int nfsm_chain_get_fh_attr( + struct nfsmount *nmp, struct nfsm_chain *nmc, nfsnode_t dnp, vfs_context_t ctx, @@ -976,7 +983,7 @@ nfsm_chain_get_fh_attr( if (!gotfh) { /* skip attributes */ nfsm_chain_adv(error, nmc, NFSX_V3FATTR); } else { /* get attributes */ - error = nfs_parsefattr(nmc, nfsvers, nvap); + error = nfs_parsefattr(nmp, nmc, nfsvers, nvap); } } else if (gotfh) { /* we need valid attributes in order to call nfs_nget() */ @@ -1146,6 +1153,7 @@ nfsm_rpchead2(struct nfsmount *nmp, int sotype, int prog, int vers, int proc, in auth_len = ((uint32_t)groupcount + 5) * NFSX_UNSIGNED; break; } +#if CONFIG_NFS_GSS case RPCAUTH_KRB5: case RPCAUTH_KRB5I: case RPCAUTH_KRB5P: @@ -1154,6 +1162,7 @@ nfsm_rpchead2(struct nfsmount *nmp, int sotype, int prog, int vers, int proc, in } auth_len = 5 * NFSX_UNSIGNED + 0; // zero context handle for now break; +#endif /* CONFIG_NFS_GSS */ default: return EINVAL; } @@ -1207,7 +1216,9 @@ nfsm_rpchead2(struct nfsmount *nmp, int sotype, int prog, int vers, int proc, in nfsm_chain_add_32(error, &nmreq, vers); nfsm_chain_add_32(error, &nmreq, proc); +#if CONFIG_NFS_GSS add_cred: +#endif switch (auth_type) { case RPCAUTH_NONE: nfsm_chain_add_32(error, &nmreq, RPCAUTH_NONE); /* auth */ @@ -1223,7 +1234,9 @@ add_cred: case RPCAUTH_SYS: { nfsm_chain_add_32(error, &nmreq, RPCAUTH_SYS); nfsm_chain_add_32(error, &nmreq, authsiz); - nfsm_chain_add_32(error, &nmreq, 0); /* stamp */ + { + nfsm_chain_add_32(error, &nmreq, 0); /* stamp */ + } nfsm_chain_add_32(error, &nmreq, 0); /* zero-length hostname */ nfsm_chain_add_32(error, &nmreq, kauth_cred_getuid(cred)); /* UID */ nfsm_chain_add_32(error, &nmreq, kauth_cred_getgid(cred)); /* GID */ @@ -1243,6 +1256,7 @@ add_cred: } break; } +#if CONFIG_NFS_GSS case RPCAUTH_KRB5: case RPCAUTH_KRB5I: case RPCAUTH_KRB5P: @@ -1264,6 +1278,7 @@ add_cred: goto add_cred; } break; +#endif /* CONFIG_NFS_GSS */ } ; @@ -1304,7 +1319,11 @@ add_cred: * Parse an NFS file attribute structure out of an mbuf chain. */ int -nfs_parsefattr(struct nfsm_chain *nmc, int nfsvers, struct nfs_vattr *nvap) +nfs_parsefattr( + struct nfsmount *nmp, + struct nfsm_chain *nmc, + int nfsvers, + struct nfs_vattr *nvap) { int error = 0; enum vtype vtype; @@ -1407,10 +1426,12 @@ nfs_parsefattr(struct nfsm_chain *nmc, int nfsvers, struct nfs_vattr *nvap) nfsm_chain_get_time(error, nmc, nfsvers, nvap->nva_timesec[NFSTIME_CHANGE], nvap->nva_timensec[NFSTIME_CHANGE]); + nfsmout: return error; } + /* * Load the attribute cache (that lives in the nfsnode entry) with * the value pointed to by nvap, unless the file type in the attribute @@ -1531,6 +1552,7 @@ nfs_loadattrcache( } else if (NFS_BITMAP_ISSET(nvap->nva_bitmap, NFS_FATTR_OWNER_GROUP) && (nvap->nva_gid != npnvap->nva_gid)) { events |= VNODE_EVENT_ATTRIB | VNODE_EVENT_PERMS; +#if CONFIG_NFS4 } else if (nmp->nm_vers >= NFS_VER4) { if (NFS_BITMAP_ISSET(nvap->nva_bitmap, NFS_FATTR_OWNER) && !kauth_guid_equal(&nvap->nva_uuuid, &npnvap->nva_uuuid)) { @@ -1544,11 +1566,15 @@ nfs_loadattrcache( bcmp(nvap->nva_acl, npnvap->nva_acl, KAUTH_ACL_COPYSIZE(nvap->nva_acl))))) { events |= VNODE_EVENT_ATTRIB | VNODE_EVENT_PERMS; } +#endif } - if (((nmp->nm_vers >= NFS_VER4) && (nvap->nva_change != npnvap->nva_change)) || - (NFS_BITMAP_ISSET(npnvap->nva_bitmap, NFS_FATTR_TIME_MODIFY) && - ((nvap->nva_timesec[NFSTIME_MODIFY] != npnvap->nva_timesec[NFSTIME_MODIFY]) || - (nvap->nva_timensec[NFSTIME_MODIFY] != npnvap->nva_timensec[NFSTIME_MODIFY])))) { + if (/* Oh, C... */ +#if CONFIG_NFS4 + ((nmp->nm_vers >= NFS_VER4) && (nvap->nva_change != npnvap->nva_change)) || +#endif + (NFS_BITMAP_ISSET(npnvap->nva_bitmap, NFS_FATTR_TIME_MODIFY) && + ((nvap->nva_timesec[NFSTIME_MODIFY] != npnvap->nva_timesec[NFSTIME_MODIFY]) || + (nvap->nva_timensec[NFSTIME_MODIFY] != npnvap->nva_timensec[NFSTIME_MODIFY])))) { events |= VNODE_EVENT_ATTRIB | VNODE_EVENT_WRITE; } if (!events && NFS_BITMAP_ISSET(npnvap->nva_bitmap, NFS_FATTR_RAWDEV) && @@ -1625,6 +1651,7 @@ nfs_loadattrcache( } #if CONFIG_TRIGGERS +#if CONFIG_NFS4 /* * For NFSv4, if the fsid doesn't match the fsid for the mount, then * this node is for a different file system on the server. So we mark @@ -1635,7 +1662,8 @@ nfs_loadattrcache( (np->n_vattr.nva_fsid.minor != nmp->nm_fsid.minor))) { np->n_vattr.nva_flags |= NFS_FFLAG_TRIGGER; } -#endif +#endif /* CONFIG_NFS4 */ +#endif /* CONFIG_TRIGGERS */ if (!vp || (nvap->nva_type != VREG)) { np->n_size = nvap->nva_size; @@ -1703,11 +1731,13 @@ nfs_attrcachetimeout(nfsnode_t np) } isdir = vnode_isdir(NFSTOV(np)); - +#if CONFIG_NFS4 if ((nmp->nm_vers >= NFS_VER4) && (np->n_openflags & N_DELEG_MASK)) { /* If we have a delegation, we always use the max timeout. */ timeo = isdir ? nmp->nm_acdirmax : nmp->nm_acregmax; - } else if ((np)->n_flag & NMODIFIED) { + } else +#endif + if ((np)->n_flag & NMODIFIED) { /* If we have modifications, we always use the min timeout. */ timeo = isdir ? nmp->nm_acdirmin : nmp->nm_acregmin; } else { @@ -1914,8 +1944,19 @@ nfs_uaddr2sockaddr(const char *uaddr, struct sockaddr *addr) unsigned long val; /* decoded value */ int s; /* index used for sliding array to insert elided zeroes */ + /* AF_LOCAL address are paths that start with '/' or are empty */ + if (*uaddr == '/' || *uaddr == '\0') { /* AF_LOCAL address */ + struct sockaddr_un *sun = (struct sockaddr_un *)addr; + sun->sun_family = AF_LOCAL; + sun->sun_len = sizeof(struct sockaddr_un); + strlcpy(sun->sun_path, uaddr, sizeof(sun->sun_path)); + + return 1; + } + #define HEXVALUE 0 #define DECIMALVALUE 1 + #define GET(TYPE) \ do { \ if ((dcount <= 0) || (dcount > (((TYPE) == DECIMALVALUE) ? 3 : 4))) \ @@ -2104,20 +2145,57 @@ uint32_t nfs_debug_ctl; #include void -nfs_printf(int facility, int level, const char *fmt, ...) +nfs_printf(unsigned int facility, unsigned int level, const char *fmt, ...) { va_list ap; - if ((uint32_t)level > NFS_DEBUG_LEVEL) { - return; + if (NFS_IS_DBG(facility, level)) { + va_start(ap, fmt); + vprintf(fmt, ap); + va_end(ap); } - if (NFS_DEBUG_FACILITY && !((uint32_t)facility & NFS_DEBUG_FACILITY)) { - return; +} + + +#define DISPLAYLEN 16 + +static bool +isprint(int ch) +{ + return ch >= 0x20 && ch <= 0x7e; +} + +static void +hexdump(void *data, size_t len) +{ + size_t i, j; + unsigned char *d = data; + char *p, disbuf[3 * DISPLAYLEN + 1]; + + for (i = 0; i < len; i += DISPLAYLEN) { + for (p = disbuf, j = 0; (j + i) < len && j < DISPLAYLEN; j++, p += 3) { + snprintf(p, 4, "%2.2x ", d[i + j]); + } + for (; j < DISPLAYLEN; j++, p += 3) { + snprintf(p, 4, " "); + } + printf("%s ", disbuf); + for (p = disbuf, j = 0; (j + i) < len && j < DISPLAYLEN; j++, p++) { + snprintf(p, 2, "%c", isprint(d[i + j]) ? d[i + j] : '.'); + } + printf("%s\n", disbuf); } +} - va_start(ap, fmt); - vprintf(fmt, ap); - va_end(ap); +void +nfs_dump_mbuf(const char *func, int lineno, const char *msg, mbuf_t mb) +{ + mbuf_t m; + + printf("%s:%d %s\n", func, lineno, msg); + for (m = mb; m; m = mbuf_next(m)) { + hexdump(mbuf_data(m), mbuf_len(m)); + } } /* Is a mount gone away? */ diff --git a/bsd/nfs/nfs_syscalls.c b/bsd/nfs/nfs_syscalls.c index a27683203..78d83c951 100644 --- a/bsd/nfs/nfs_syscalls.c +++ b/bsd/nfs/nfs_syscalls.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2016 Apple Inc. All rights reserved. + * Copyright (c) 2000-2018 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -170,8 +170,12 @@ SYSCTL_INT(_vfs_generic_nfs_client, OID_AUTO, is_mobile, CTLFLAG_RW | CTLFLAG_LO SYSCTL_INT(_vfs_generic_nfs_client, OID_AUTO, squishy_flags, CTLFLAG_RW | CTLFLAG_LOCKED, &nfs_squishy_flags, 0, ""); SYSCTL_UINT(_vfs_generic_nfs_client, OID_AUTO, debug_ctl, CTLFLAG_RW | CTLFLAG_LOCKED, &nfs_debug_ctl, 0, ""); SYSCTL_INT(_vfs_generic_nfs_client, OID_AUTO, readlink_nocache, CTLFLAG_RW | CTLFLAG_LOCKED, &nfs_readlink_nocache, 0, ""); +#if CONFIG_NFS_GSS SYSCTL_INT(_vfs_generic_nfs_client, OID_AUTO, root_steals_gss_context, CTLFLAG_RW | CTLFLAG_LOCKED, &nfs_root_steals_ctx, 0, ""); +#endif +#if CONFIG_NFS4 SYSCTL_STRING(_vfs_generic_nfs_client, OID_AUTO, default_nfs4domain, CTLFLAG_RW | CTLFLAG_LOCKED, nfs4_default_domain, sizeof(nfs4_default_domain), ""); +#endif #endif /* NFSCLIENT */ #if NFSSERVER @@ -203,11 +207,11 @@ SYSCTL_INT(_vfs_generic_nfs_server, OID_AUTO, upcall_queue_count, CTLFLAG_RD | C #if NFSCLIENT +#if CONFIG_NFS4 static int mapname2id(struct nfs_testmapid *map) { int error; - error = nfs4_id2guid(map->ntm_name, &map->ntm_guid, map->ntm_grpflag); if (error) { return error; @@ -257,6 +261,8 @@ nfsclnt_testidmap(proc_t p, user_addr_t argp) } error = copyin(argp, &mapid, sizeof(mapid)); + mapid.ntm_name[MAXIDNAMELEN - 1] = '\0'; + if (error) { return error; } @@ -281,6 +287,7 @@ nfsclnt_testidmap(proc_t p, user_addr_t argp) return error ? error : coerror; } +#endif int nfsclnt(proc_t p, struct nfsclnt_args *uap, __unused int *retval) @@ -298,9 +305,11 @@ nfsclnt(proc_t p, struct nfsclnt_args *uap, __unused int *retval) case NFSCLNT_LOCKDNOTIFY: error = nfslockdnotify(p, uap->argp); break; +#if CONFIG_NFS4 case NFSCLNT_TESTIDMAP: error = nfsclnt_testidmap(p, uap->argp); break; +#endif default: error = EINVAL; } @@ -907,6 +916,8 @@ nfssvc_addsock(socket_t so, mbuf_t mynam) if (sotype == SOCK_STREAM) { error = nfsrv_check_exports_allow_address(mynam); if (error) { + log(LOG_INFO, "nfsvc_addsock:: nfsrv_check_exports_allow_address(myname) returned %d\n", error); + mbuf_freem(mynam); return error; } sock_setsockopt(so, SOL_SOCKET, SO_KEEPALIVE, &on, sizeof(on)); @@ -914,8 +925,8 @@ nfssvc_addsock(socket_t so, mbuf_t mynam) if ((sodomain == AF_INET) && (soprotocol == IPPROTO_TCP)) { sock_setsockopt(so, IPPROTO_TCP, TCP_NODELAY, &on, sizeof(on)); } - if (sotype == SOCK_DGRAM) { /* set socket buffer sizes for UDP */ - int reserve = NFS_UDPSOCKBUF; + if (sotype == SOCK_DGRAM || sodomain == AF_LOCAL) { /* set socket buffer sizes for UDP */ + int reserve = (sotype == SOCK_DGRAM) ? NFS_UDPSOCKBUF : (2 * 1024 * 1024); error |= sock_setsockopt(so, SOL_SOCKET, SO_SNDBUF, &reserve, sizeof(reserve)); error |= sock_setsockopt(so, SOL_SOCKET, SO_RCVBUF, &reserve, sizeof(reserve)); if (error) { @@ -977,7 +988,7 @@ nfssvc_addsock(socket_t so, mbuf_t mynam) /* add the socket to the list */ first = TAILQ_EMPTY(&nfsrv_socklist); TAILQ_INSERT_TAIL(&nfsrv_socklist, slp, ns_chain); - if (soprotocol == IPPROTO_TCP) { + if (sotype == SOCK_STREAM) { nfsrv_sock_tcp_cnt++; if (nfsrv_sock_idle_timeout < 0) { nfsrv_sock_idle_timeout = 0; diff --git a/bsd/nfs/nfs_upcall.c b/bsd/nfs/nfs_upcall.c index b6dced906..9b83d3fc6 100644 --- a/bsd/nfs/nfs_upcall.c +++ b/bsd/nfs/nfs_upcall.c @@ -333,7 +333,7 @@ nfsrv_uc_proxy(socket_t so, void *arg, int waitflag) TAILQ_INSERT_TAIL(myqueue->ucq_queue, uap, nua_svcq); uap->nua_flags |= NFS_UC_QUEUED; - if (myqueue->ucq_flags | NFS_UC_QUEUE_SLEEPING) { + if (myqueue->ucq_flags & NFS_UC_QUEUE_SLEEPING) { wakeup(myqueue); } diff --git a/bsd/nfs/nfs_vfsops.c b/bsd/nfs/nfs_vfsops.c index 1ac2b3bd5..67b409bae 100644 --- a/bsd/nfs/nfs_vfsops.c +++ b/bsd/nfs/nfs_vfsops.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2017 Apple Inc. All rights reserved. + * Copyright (c) 2000-2019 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -85,6 +85,7 @@ #include #include #include +#include #include #include #include @@ -178,10 +179,12 @@ int nfs_tprintf_delay = NFS_TPRINTF_DELAY; int mountnfs(char *, mount_t, vfs_context_t, vnode_t *); +#if CONFIG_NETBOOT static int nfs_mount_diskless(struct nfs_dlmount *, const char *, int, vnode_t *, mount_t *, vfs_context_t); #if !defined(NO_MOUNT_PRIVATE) static int nfs_mount_diskless_private(struct nfs_dlmount *, const char *, int, vnode_t *, mount_t *, vfs_context_t); #endif /* NO_MOUNT_PRIVATE */ +#endif int nfs_mount_connect(struct nfsmount *); void nfs_mount_drain_and_cleanup(struct nfsmount *); void nfs_mount_cleanup(struct nfsmount *); @@ -238,47 +241,49 @@ int nfs4_getquota(struct nfsmount *, vfs_context_t, uid_t, int, struct dqblk *); #endif const struct nfs_funcs nfs3_funcs = { - nfs3_mount, - nfs3_update_statfs, - nfs3_getquota, - nfs3_access_rpc, - nfs3_getattr_rpc, - nfs3_setattr_rpc, - nfs3_read_rpc_async, - nfs3_read_rpc_async_finish, - nfs3_readlink_rpc, - nfs3_write_rpc_async, - nfs3_write_rpc_async_finish, - nfs3_commit_rpc, - nfs3_lookup_rpc_async, - nfs3_lookup_rpc_async_finish, - nfs3_remove_rpc, - nfs3_rename_rpc, - nfs3_setlock_rpc, - nfs3_unlock_rpc, - nfs3_getlock_rpc + .nf_mount = nfs3_mount, + .nf_update_statfs = nfs3_update_statfs, + .nf_getquota = nfs3_getquota, + .nf_access_rpc = nfs3_access_rpc, + .nf_getattr_rpc = nfs3_getattr_rpc, + .nf_setattr_rpc = nfs3_setattr_rpc, + .nf_read_rpc_async = nfs3_read_rpc_async, + .nf_read_rpc_async_finish = nfs3_read_rpc_async_finish, + .nf_readlink_rpc = nfs3_readlink_rpc, + .nf_write_rpc_async = nfs3_write_rpc_async, + .nf_write_rpc_async_finish = nfs3_write_rpc_async_finish, + .nf_commit_rpc = nfs3_commit_rpc, + .nf_lookup_rpc_async = nfs3_lookup_rpc_async, + .nf_lookup_rpc_async_finish = nfs3_lookup_rpc_async_finish, + .nf_remove_rpc = nfs3_remove_rpc, + .nf_rename_rpc = nfs3_rename_rpc, + .nf_setlock_rpc = nfs3_setlock_rpc, + .nf_unlock_rpc = nfs3_unlock_rpc, + .nf_getlock_rpc = nfs3_getlock_rpc }; +#if CONFIG_NFS4 const struct nfs_funcs nfs4_funcs = { - nfs4_mount, - nfs4_update_statfs, - nfs4_getquota, - nfs4_access_rpc, - nfs4_getattr_rpc, - nfs4_setattr_rpc, - nfs4_read_rpc_async, - nfs4_read_rpc_async_finish, - nfs4_readlink_rpc, - nfs4_write_rpc_async, - nfs4_write_rpc_async_finish, - nfs4_commit_rpc, - nfs4_lookup_rpc_async, - nfs4_lookup_rpc_async_finish, - nfs4_remove_rpc, - nfs4_rename_rpc, - nfs4_setlock_rpc, - nfs4_unlock_rpc, - nfs4_getlock_rpc + .nf_mount = nfs4_mount, + .nf_update_statfs = nfs4_update_statfs, + .nf_getquota = nfs4_getquota, + .nf_access_rpc = nfs4_access_rpc, + .nf_getattr_rpc = nfs4_getattr_rpc, + .nf_setattr_rpc = nfs4_setattr_rpc, + .nf_read_rpc_async = nfs4_read_rpc_async, + .nf_read_rpc_async_finish = nfs4_read_rpc_async_finish, + .nf_readlink_rpc = nfs4_readlink_rpc, + .nf_write_rpc_async = nfs4_write_rpc_async, + .nf_write_rpc_async_finish = nfs4_write_rpc_async_finish, + .nf_commit_rpc = nfs4_commit_rpc, + .nf_lookup_rpc_async = nfs4_lookup_rpc_async, + .nf_lookup_rpc_async_finish = nfs4_lookup_rpc_async_finish, + .nf_remove_rpc = nfs4_remove_rpc, + .nf_rename_rpc = nfs4_rename_rpc, + .nf_setlock_rpc = nfs4_setlock_rpc, + .nf_unlock_rpc = nfs4_unlock_rpc, + .nf_getlock_rpc = nfs4_getlock_rpc }; +#endif /* * Called once to initialize data structures... @@ -286,8 +291,9 @@ const struct nfs_funcs nfs4_funcs = { int nfs_vfs_init(__unused struct vfsconf *vfsp) { +#if CONFIG_NFS4 int i; - +#endif /* * Check to see if major data structures haven't bloated. */ @@ -328,8 +334,11 @@ nfs_vfs_init(__unused struct vfsconf *vfsp) nfs_nbinit(); /* Init the nfsbuf table */ nfs_nhinit(); /* Init the nfsnode table */ nfs_lockinit(); /* Init the nfs lock state */ +#if CONFIG_NFS_GSS nfs_gss_init(); /* Init RPCSEC_GSS security */ +#endif +#if CONFIG_NFS4 /* NFSv4 stuff */ NFS4_PER_FS_ATTRIBUTES(nfs_fs_attr_bitmap); NFS4_PER_OBJECT_ATTRIBUTES(nfs_object_attr_bitmap); @@ -338,15 +347,18 @@ nfs_vfs_init(__unused struct vfsconf *vfsp) nfs_getattr_bitmap[i] &= nfs_object_attr_bitmap[i]; } TAILQ_INIT(&nfsclientids); +#endif /* initialize NFS timer callouts */ nfs_request_timer_call = thread_call_allocate(nfs_request_timer, NULL); nfs_buf_timer_call = thread_call_allocate(nfs_buf_timer, NULL); +#if CONFIG_NFS4 nfs4_callback_timer_call = thread_call_allocate(nfs4_callback_timer, NULL); - +#endif return 0; } + /* * nfs statfs call */ @@ -434,6 +446,7 @@ nfsmout: return error; } +#if CONFIG_NFS4 int nfs4_update_statfs(struct nfsmount *nmp, vfs_context_t ctx) { @@ -506,16 +519,22 @@ nfsmout: vnode_put(NFSTOV(np)); return error; } +#endif /* CONFIG_NFS4 */ + /* * Return an NFS volume name from the mntfrom name. */ static void -nfs_get_volname(struct mount *mp, char *volname, size_t len) +nfs_get_volname(struct mount *mp, char *volname, size_t len, vfs_context_t ctx) { const char *ptr, *cptr; const char *mntfrom = mp->mnt_vfsstat.f_mntfromname; - size_t mflen = strnlen(mntfrom, MAXPATHLEN + 1); + struct nfsmount *nmp = VFSTONFS(mp); + size_t mflen; + + + mflen = strnlen(mntfrom, MAXPATHLEN + 1); if (mflen > MAXPATHLEN || mflen == 0) { strlcpy(volname, "Bad volname", len); @@ -557,6 +576,7 @@ nfs_get_volname(struct mount *mp, char *volname, size_t len) strlcpy(volname, ptr, len); } + /* * The NFS VFS_GETATTR function: "statfs"-type information is retrieved * using the nf_update_statfs() function, and other attributes are cobbled @@ -646,10 +666,11 @@ nfs_vfs_getattr(mount_t mp, struct vfs_attr *fsap, vfs_context_t ctx) if (VFSATTR_IS_ACTIVE(fsap, f_vol_name)) { /*%%% IF fail over support is implemented we may need to take nm_lock */ - nfs_get_volname(mp, fsap->f_vol_name, MAXPATHLEN); + nfs_get_volname(mp, fsap->f_vol_name, MAXPATHLEN, ctx); VFSATTR_SET_SUPPORTED(fsap, f_vol_name); } - if (VFSATTR_IS_ACTIVE(fsap, f_capabilities)) { + if (VFSATTR_IS_ACTIVE(fsap, f_capabilities) + ) { u_int32_t caps, valid; nfsnode_t np = nmp->nm_dnp; @@ -663,10 +684,10 @@ nfs_vfs_getattr(mount_t mp, struct vfs_attr *fsap, vfs_context_t ctx) * The capabilities[] array defines what this volume supports. * * The valid[] array defines which bits this code understands - * the meaning of (whether the volume has that capability or not). - * Any zero bits here means "I don't know what you're asking about" - * and the caller cannot tell whether that capability is - * present or not. + * the meaning of (whether the volume has that capability or + * not). Any zero bits here means "I don't know what you're + * asking about" and the caller cannot tell whether that + * capability is present or not. */ caps = valid = 0; if (NFS_BITMAP_ISSET(nmp->nm_fsattr.nfsa_bitmap, NFS_FATTR_SYMLINK_SUPPORT)) { @@ -706,6 +727,7 @@ nfs_vfs_getattr(mount_t mp, struct vfs_attr *fsap, vfs_context_t ctx) */ caps |= VOL_CAP_FMT_2TB_FILESIZE; } +#if CONFIG_NFS4 if (nfsvers >= NFS_VER4) { caps |= VOL_CAP_FMT_HIDDEN_FILES; valid |= VOL_CAP_FMT_HIDDEN_FILES; @@ -713,6 +735,7 @@ nfs_vfs_getattr(mount_t mp, struct vfs_attr *fsap, vfs_context_t ctx) // caps |= VOL_CAP_FMT_OPENDENYMODES; // valid |= VOL_CAP_FMT_OPENDENYMODES; } +#endif // no version of nfs supports immutable files caps |= VOL_CAP_FMT_NO_IMMUTABLE_FILES; valid |= VOL_CAP_FMT_NO_IMMUTABLE_FILES; @@ -753,16 +776,18 @@ nfs_vfs_getattr(mount_t mp, struct vfs_attr *fsap, vfs_context_t ctx) /* * We don't support most of the interfaces. * - * We MAY support locking, but we don't have any easy way of probing. - * We can tell if there's no lockd running or if locks have been - * disabled for a mount, so we can definitely answer NO in that case. - * Any attempt to send a request to lockd to test for locking support - * may cause the lazily-launched locking daemons to be started - * unnecessarily. So we avoid that. However, we do record if we ever - * successfully perform a lock operation on a mount point, so if it - * looks like lock ops have worked, we do report that we support them. + * We MAY support locking, but we don't have any easy way of + * probing. We can tell if there's no lockd running or if + * locks have been disabled for a mount, so we can definitely + * answer NO in that case. Any attempt to send a request to + * lockd to test for locking support may cause the lazily- + * launched locking daemons to be started unnecessarily. So + * we avoid that. However, we do record if we ever successfully + * perform a lock operation on a mount point, so if it looks + * like lock ops have worked, we do report that we support them. */ caps = valid = 0; +#if CONFIG_NFS4 if (nfsvers >= NFS_VER4) { caps = VOL_CAP_INT_ADVLOCK | VOL_CAP_INT_FLOCK; valid = VOL_CAP_INT_ADVLOCK | VOL_CAP_INT_FLOCK; @@ -780,7 +805,9 @@ nfs_vfs_getattr(mount_t mp, struct vfs_attr *fsap, vfs_context_t ctx) } valid |= VOL_CAP_INT_NAMEDSTREAMS; #endif - } else if (nmp->nm_lockmode == NFS_LOCK_MODE_DISABLED) { + } else +#endif + if (nmp->nm_lockmode == NFS_LOCK_MODE_DISABLED) { /* locks disabled on this mount, so they definitely won't work */ valid = VOL_CAP_INT_ADVLOCK | VOL_CAP_INT_FLOCK; } else if (nmp->nm_state & NFSSTA_LOCKSWORK) { @@ -980,6 +1007,7 @@ nfsmout: * if swdevt[0].sw_dev == NODEV * - build the rootfs mount point and call mountnfs() to do the rest. */ +#if CONFIG_NETBOOT int nfs_mountroot(void) { @@ -1341,7 +1369,7 @@ nfs_mount_diskless_private( uint32_t argslength_offset, attrslength_offset, end_offset; procp = current_proc(); /* XXX */ - xb_init(&xb, 0); + xb_init(&xb, XDRBUF_NONE); { /* @@ -1592,6 +1620,8 @@ out: } #endif /* NO_MOUNT_PRIVATE */ +#endif + /* * Convert old style NFS mount args to XDR. */ @@ -2158,6 +2188,7 @@ out: return error; } +#if CONFIG_NFS4 /* * Update an NFSv4 mount path with the contents of the symlink. * @@ -2763,6 +2794,7 @@ nfsmout: nfsm_chain_cleanup(&nmrep); return error; } +#endif /* CONFIG_NFS4 */ /* * Thread to handle initial NFS mount connection. @@ -2844,7 +2876,7 @@ nfs_mount_connect(struct nfsmount *nmp) { int error = 0, slpflag; thread_t thd; - struct timespec ts = { 2, 0 }; + struct timespec ts = { .tv_sec = 2, .tv_nsec = 0 }; /* * Set up the socket. Perform initial search for a location/server/address to @@ -2923,7 +2955,13 @@ mountnfs( uint32_t *mflags_mask; uint32_t *mflags; uint32_t argslength, attrslength; - struct nfs_location_index firstloc = { NLI_VALID, 0, 0, 0 }; + uid_t set_owner; + struct nfs_location_index firstloc = { + .nli_flags = NLI_VALID, + .nli_loc = 0, + .nli_serv = 0, + .nli_addr = 0 + }; static const struct nfs_etype nfs_default_etypes = { .count = NFS_MAX_ETYPES, .selected = NFS_MAX_ETYPES, @@ -2931,6 +2969,7 @@ mountnfs( NFS_AES128_CTS_HMAC_SHA1_96, NFS_DES3_CBC_SHA1_KD} }; + /* make sure mbuf constants are set up */ if (!nfs_mbuf_mhlen) { nfs_mbuf_init(); @@ -3115,11 +3154,13 @@ mountnfs( switch (val) { case NFS_LOCK_MODE_DISABLED: case NFS_LOCK_MODE_LOCAL: +#if CONFIG_NFS4 if (nmp->nm_vers >= NFS_VER4) { /* disabled/local lock mode only allowed on v2/v3 */ error = EINVAL; break; } +#endif /* FALLTHROUGH */ case NFS_LOCK_MODE_ENABLED: nmp->nm_lockmode = val; @@ -3184,10 +3225,11 @@ mountnfs( xb_get_32(error, &xb, nmp->nm_numgrps); } if (NFS_BITMAP_ISSET(mattrs, NFS_MATTR_SOCKET_TYPE)) { - char sotype[6]; + char sotype[16]; + *sotype = '\0'; xb_get_32(error, &xb, val); - if (!error && ((val < 3) || (val > 5))) { + if (!error && ((val < 3) || (val > sizeof(sotype)))) { error = EINVAL; } nfsmerr_if(error); @@ -3216,13 +3258,24 @@ mountnfs( nmp->nm_sofamily = AF_INET6; } else if (!strcmp(sotype, "inet")) { nmp->nm_sofamily = 0; /* ok */ + } else if (!strcmp(sotype, "ticotsord")) { + nmp->nm_sofamily = AF_LOCAL; + nmp->nm_sotype = SOCK_STREAM; + } else if (!strcmp(sotype, "ticlts")) { + nmp->nm_sofamily = AF_LOCAL; + nmp->nm_sotype = SOCK_DGRAM; } else { error = EINVAL; } +#if CONFIG_NFS4 if (!error && (nmp->nm_vers >= NFS_VER4) && nmp->nm_sotype && (nmp->nm_sotype != SOCK_STREAM)) { error = EINVAL; /* NFSv4 is only allowed over TCP. */ } +#endif + if (error) { + NFS_VFS_DBG("EINVAL sotype = \"%s\"\n", sotype); + } nfsmerr_if(error); } if (NFS_BITMAP_ISSET(mattrs, NFS_MATTR_NFS_PORT)) { @@ -3279,6 +3332,7 @@ mountnfs( xb_get_32(error, &xb, nmp->nm_locations.nl_numlocs); /* fs location count */ /* sanity check location count */ if (!error && ((nmp->nm_locations.nl_numlocs < 1) || (nmp->nm_locations.nl_numlocs > 256))) { + NFS_VFS_DBG("Invalid number of fs_locations: %d", nmp->nm_locations.nl_numlocs); error = EINVAL; } nfsmerr_if(error); @@ -3296,12 +3350,14 @@ mountnfs( xb_get_32(error, &xb, fsl->nl_servcount); /* server count */ /* sanity check server count */ if (!error && ((fsl->nl_servcount < 1) || (fsl->nl_servcount > 256))) { + NFS_VFS_DBG("Invalid server count %d", fsl->nl_servcount); error = EINVAL; } nfsmerr_if(error); MALLOC(fsl->nl_servers, struct nfs_fs_server **, fsl->nl_servcount * sizeof(struct nfs_fs_server*), M_TEMP, M_WAITOK | M_ZERO); if (!fsl->nl_servers) { error = ENOMEM; + NFS_VFS_DBG("Server count = %d, error = %d\n", fsl->nl_servcount, error); } for (serv = 0; serv < fsl->nl_servcount; serv++) { nfsmerr_if(error); @@ -3312,7 +3368,8 @@ mountnfs( fsl->nl_servers[serv] = fss; xb_get_32(error, &xb, val); /* server name length */ /* sanity check server name length */ - if (!error && ((val < 1) || (val > MAXPATHLEN))) { + if (!error && (val > MAXPATHLEN)) { + NFS_VFS_DBG("Invalid server name length %d", val); error = EINVAL; } nfsmerr_if(error); @@ -3325,6 +3382,7 @@ mountnfs( xb_get_32(error, &xb, fss->ns_addrcount); /* address count */ /* sanity check address count (OK to be zero) */ if (!error && (fss->ns_addrcount > 256)) { + NFS_VFS_DBG("Invalid address count %d", fss->ns_addrcount); error = EINVAL; } nfsmerr_if(error); @@ -3336,7 +3394,8 @@ mountnfs( for (addr = 0; addr < fss->ns_addrcount; addr++) { xb_get_32(error, &xb, val); /* address length */ /* sanity check address length */ - if (!error && ((val < 1) || (val > 128))) { + if (!error && val > 128) { + NFS_VFS_DBG("Invalid address length %d", val); error = EINVAL; } nfsmerr_if(error); @@ -3356,6 +3415,7 @@ mountnfs( xb_get_32(error, &xb, fsp->np_compcount); /* component count */ /* sanity check component count */ if (!error && (fsp->np_compcount > MAXPATHLEN)) { + NFS_VFS_DBG("Invalid component count %d", fsp->np_compcount); error = EINVAL; } nfsmerr_if(error); @@ -3383,6 +3443,7 @@ mountnfs( continue; } if (!error && ((val < 1) || (val > MAXPATHLEN))) { + NFS_VFS_DBG("Invalid component path length %d", val); error = EINVAL; } nfsmerr_if(error); @@ -3394,7 +3455,8 @@ mountnfs( error = xb_get_bytes(&xb, fsp->np_components[comp], val, 0); /* component */ } xb_get_32(error, &xb, val); /* fs location info length */ - xb_skip(error, &xb, val); /* skip fs location info */ + NFS_VFS_DBG("Skipping fs location info bytes %d", val); + xb_skip(error, &xb, xdr_rndup(val)); /* skip fs location info */ } } if (NFS_BITMAP_ISSET(mattrs, NFS_MATTR_MNTFLAGS)) { @@ -3466,6 +3528,62 @@ mountnfs( } nfsmerr_if(error); + if (NFS_BITMAP_ISSET(mattrs, NFS_MATTR_LOCAL_NFS_PORT)) { + if (nmp->nm_nfsport) { + error = EINVAL; + NFS_VFS_DBG("Can't have ports specified over incompatible socket families"); + } + nfsmerr_if(error); + xb_get_32(error, &xb, len); + if (!error && ((len < 1) || (len > sizeof(((struct sockaddr_un *)0)->sun_path)))) { + error = EINVAL; + } + nfsmerr_if(error); + MALLOC(nmp->nm_nfs_localport, char *, len + 1, M_TEMP, M_WAITOK | M_ZERO); + if (!nmp->nm_nfs_localport) { + error = ENOMEM; + } + nfsmerr_if(error); + error = xb_get_bytes(&xb, nmp->nm_nfs_localport, len, 0); + nmp->nm_sofamily = AF_LOCAL; + nmp->nm_nfsport = 1; /* We use the now deprecated tpcmux port to indcate that we have an AF_LOCAL port */ + NFS_VFS_DBG("Setting nfs local port %s (%d)\n", nmp->nm_nfs_localport, nmp->nm_nfsport); + } + if (NFS_BITMAP_ISSET(mattrs, NFS_MATTR_LOCAL_MOUNT_PORT)) { + if (nmp->nm_mountport) { + error = EINVAL; + NFS_VFS_DBG("Can't have ports specified over mulitple socket families"); + } + nfsmerr_if(error); + xb_get_32(error, &xb, len); + if (!error && ((len < 1) || (len > sizeof(((struct sockaddr_un *)0)->sun_path)))) { + error = EINVAL; + } + nfsmerr_if(error); + MALLOC(nmp->nm_mount_localport, char *, len + 1, M_TEMP, M_WAITOK | M_ZERO); + if (!nmp->nm_mount_localport) { + error = ENOMEM; + } + nfsmerr_if(error); + error = xb_get_bytes(&xb, nmp->nm_mount_localport, len, 0); + nmp->nm_sofamily = AF_LOCAL; + nmp->nm_mountport = 1; /* We use the now deprecated tpcmux port to indcate that we have an AF_LOCAL port */ + NFS_VFS_DBG("Setting mount local port %s (%d)\n", nmp->nm_mount_localport, nmp->nm_mountport); + } + if (NFS_BITMAP_ISSET(mattrs, NFS_MATTR_SET_MOUNT_OWNER)) { + xb_get_32(error, &xb, set_owner); + nfsmerr_if(error); + error = vfs_context_suser(ctx); + /* + * root can set owner to whatever, user can set owner to self + */ + if ((error) && (set_owner == kauth_cred_getuid(vfs_context_ucred(ctx)))) { + /* ok for non-root can set owner to self */ + error = 0; + } + nfsmerr_if(error); + } + /* * Sanity check/finalize settings. */ @@ -3498,10 +3616,11 @@ mountnfs( } nfsmerr_if(error); - /* init mount's mntfromname to first location */ if (!NM_OMATTR_GIVEN(nmp, MNTFROM)) { + /* init mount's mntfromname to first location */ nfs_location_mntfromname(&nmp->nm_locations, firstloc, - vfs_statfs(mp)->f_mntfromname, sizeof(vfs_statfs(mp)->f_mntfromname), 0); + vfs_statfs(mp)->f_mntfromname, + sizeof(vfs_statfs(mp)->f_mntfromname), 0); } /* Need to save the mounting credential for v4. */ @@ -3520,21 +3639,29 @@ mountnfs( } nfsmerr_if(error); - /* do mount's initial socket connection */ - error = nfs_mount_connect(nmp); - nfsmerr_if(error); - /* set up the version-specific function tables */ if (nmp->nm_vers < NFS_VER4) { nmp->nm_funcs = &nfs3_funcs; } else { +#if CONFIG_NFS4 nmp->nm_funcs = &nfs4_funcs; +#else + /* don't go any further if we don't support NFS4 */ + nmp->nm_funcs = NULL; + error = ENOTSUP; + nfsmerr_if(error); +#endif } + /* do mount's initial socket connection */ + error = nfs_mount_connect(nmp); + nfsmerr_if(error); + /* sanity check settings now that version/connection is set */ if (nmp->nm_vers == NFS_VER2) { /* ignore RDIRPLUS on NFSv2 */ NFS_BITMAP_CLR(nmp->nm_flags, NFS_MFLAG_RDIRPLUS); } +#if CONFIG_NFS4 if (nmp->nm_vers >= NFS_VER4) { if (NFS_BITMAP_ISSET(nmp->nm_flags, NFS_MFLAG_ACLONLY)) { /* aclonly trumps noacl */ NFS_BITMAP_CLR(nmp->nm_flags, NFS_MFLAG_NOACL); @@ -3544,12 +3671,15 @@ mountnfs( error = EINVAL; /* disabled/local lock mode only allowed on v2/v3 */ } } else { - /* ignore these if not v4 */ - NFS_BITMAP_CLR(nmp->nm_flags, NFS_MFLAG_NOCALLBACK); - NFS_BITMAP_CLR(nmp->nm_flags, NFS_MFLAG_NAMEDATTR); - NFS_BITMAP_CLR(nmp->nm_flags, NFS_MFLAG_NOACL); - NFS_BITMAP_CLR(nmp->nm_flags, NFS_MFLAG_ACLONLY); - } +#endif + /* ignore these if not v4 */ + NFS_BITMAP_CLR(nmp->nm_flags, NFS_MFLAG_NOCALLBACK); + NFS_BITMAP_CLR(nmp->nm_flags, NFS_MFLAG_NAMEDATTR); + NFS_BITMAP_CLR(nmp->nm_flags, NFS_MFLAG_NOACL); + NFS_BITMAP_CLR(nmp->nm_flags, NFS_MFLAG_ACLONLY); +#if CONFIG_NFS4 +} +#endif nfsmerr_if(error); if (nmp->nm_sotype == SOCK_DGRAM) { @@ -3597,6 +3727,19 @@ mountnfs( TAILQ_INIT(&nmp->nm_cwndq); } + if (nmp->nm_saddr->sa_family == AF_LOCAL) { + struct sockaddr_un *un = (struct sockaddr_un *)nmp->nm_saddr; + size_t size; + int n = snprintf(vfs_statfs(mp)->f_mntfromname, sizeof(vfs_statfs(mp)->f_mntfromname), "<%s>:", un->sun_path); + + if (n > 0 && (size_t)n < sizeof(vfs_statfs(mp)->f_mntfromname)) { + size = sizeof(vfs_statfs(mp)->f_mntfromname) - n; + nfs_location_mntfromname(&nmp->nm_locations, firstloc, + &vfs_statfs(mp)->f_mntfromname[n], size, 1); + } + } + + /* * Get the root node/attributes from the NFS server and * do any basic, version-specific setup. @@ -3612,6 +3755,8 @@ mountnfs( */ nmp->nm_dnp = np; *vpp = NFSTOV(np); + + /* get usecount and drop iocount */ error = vnode_ref(*vpp); vnode_put(*vpp); @@ -3643,6 +3788,10 @@ mountnfs( sbp->f_ffree = nmp->nm_fsattr.nfsa_files_free; sbp->f_iosize = nfs_iosize; + if (NFS_BITMAP_ISSET(mattrs, NFS_MATTR_SET_MOUNT_OWNER)) { + sbp->f_owner = set_owner; + } + /* * Calculate the size used for I/O buffers. Use the larger * of the two sizes to minimise NFS requests but make sure @@ -3652,18 +3801,15 @@ mountnfs( * buffers into multiple requests if the buffer size is * larger than the I/O size. */ -#ifndef CONFIG_EMBEDDED iosize = max(nmp->nm_rsize, nmp->nm_wsize); if (iosize < PAGE_SIZE) { iosize = PAGE_SIZE; } -#else - iosize = PAGE_SIZE; -#endif nmp->nm_biosize = trunc_page_32(iosize); /* For NFSv3 and greater, there is a (relatively) reliable ACCESS call. */ - if (nmp->nm_vers > NFS_VER2) { + if (nmp->nm_vers > NFS_VER2 && !NMFLAG(nmp, NOOPAQUE_AUTH) + ) { vfs_setauthopaqueaccess(mp); } @@ -3681,6 +3827,7 @@ mountnfs( break; } + /* success! */ lck_mtx_lock(&nmp->nm_lock); nmp->nm_state |= NFSSTA_MOUNTED; @@ -3704,7 +3851,9 @@ int nfs_mirror_mount_domount(vnode_t dvp, vnode_t vp, vfs_context_t ctx) { nfsnode_t np = VTONFS(vp); +#if CONFIG_NFS4 nfsnode_t dnp = VTONFS(dvp); +#endif struct nfsmount *nmp = NFSTONMP(np); char fstype[MFSTYPENAMELEN], *mntfromname = NULL, *path = NULL, *relpath, *p, *cp; int error = 0, pathbuflen = MAXPATHLEN, i, mntflags = 0, referral, skipcopy = 0; @@ -3725,7 +3874,7 @@ nfs_mirror_mount_domount(vnode_t dvp, vnode_t vp, vfs_context_t ctx) bzero(&nfsls, sizeof(nfsls)); } - xb_init(&xbnew, 0); + xb_init(&xbnew, XDRBUF_NONE); if (!nmp || (nmp->nm_state & (NFSSTA_FORCE | NFSSTA_DEAD))) { return ENXIO; @@ -3793,13 +3942,16 @@ nfs_mirror_mount_domount(vnode_t dvp, vnode_t vp, vfs_context_t ctx) const char *vname = vnode_getname(NFSTOV(np)); if (!vname) { error = ENOENT; - } else { + } +#if CONFIG_NFS4 + else { error = nfs4_get_fs_locations(nmp, dnp, NULL, 0, vname, ctx, &nfsls); vnode_putname(vname); if (!error && (nfsls.nl_numlocs < 1)) { error = ENOENT; } } +#endif nfsmerr_if(error); } @@ -3841,12 +3993,13 @@ nfs_mirror_mount_domount(vnode_t dvp, vnode_t vp, vfs_context_t ctx) } if (referral) { NFS_BITMAP_SET(newmattrs, NFS_MATTR_FS_LOCATIONS); + NFS_BITMAP_CLR(newmattrs, NFS_MATTR_MNTFROM); } else { NFS_BITMAP_SET(newmattrs, NFS_MATTR_FH); } NFS_BITMAP_SET(newmattrs, NFS_MATTR_FLAGS); NFS_BITMAP_SET(newmattrs, NFS_MATTR_MNTFLAGS); - NFS_BITMAP_CLR(newmattrs, NFS_MATTR_MNTFROM); + NFS_BITMAP_SET(newmattrs, NFS_MATTR_SET_MOUNT_OWNER); xb_add_bitmap(error, &xbnew, newmattrs, NFS_MATTR_BITMAP_LEN); attrslength_offset = xb_offset(&xbnew); xb_copy_32(error, &xb, &xbnew, val); /* attrs length */ @@ -3980,20 +4133,18 @@ nfs_mirror_mount_domount(vnode_t dvp, vnode_t vp, vfs_context_t ctx) xb_copy_opaque(error, &xb, &xbnew); /* component */ } /* add additional components */ - for (comp = 0; !skipcopy && !error && (comp < relpathcomps); comp++) { - p = relpath; - while (*p && (*p == '/')) { + p = relpath; + while (*p && (*p == '/')) { + p++; + } + while (*p && !error) { + cp = p; + while (*p && (*p != '/')) { p++; } - while (*p && !error) { - cp = p; - while (*p && (*p != '/')) { - p++; - } - xb_add_string(error, &xbnew, cp, (p - cp)); /* component */ - while (*p && (*p == '/')) { - p++; - } + xb_add_string(error, &xbnew, cp, (p - cp)); /* component */ + while (*p && (*p == '/')) { + p++; } } xb_copy_opaque(error, &xb, &xbnew); /* fs location info */ @@ -4070,6 +4221,31 @@ nfs_mirror_mount_domount(vnode_t dvp, vnode_t vp, vfs_context_t ctx) error = xb_add_bytes(&xbnew, buf, count, 1); } } + /* + * The following string copies rely on the fact that we already validated + * these data when creating the initial mount point. + */ + if (NFS_BITMAP_ISSET(mattrs, NFS_MATTR_REALM)) { + xb_add_string(error, &xbnew, nmp->nm_realm, strlen(nmp->nm_realm)); + } + if (NFS_BITMAP_ISSET(mattrs, NFS_MATTR_PRINCIPAL)) { + xb_add_string(error, &xbnew, nmp->nm_principal, strlen(nmp->nm_principal)); + } + if (NFS_BITMAP_ISSET(mattrs, NFS_MATTR_SVCPRINCIPAL)) { + xb_add_string(error, &xbnew, nmp->nm_sprinc, strlen(nmp->nm_sprinc)); + } + if (NFS_BITMAP_ISSET(mattrs, NFS_MATTR_LOCAL_NFS_PORT)) { + xb_add_string(error, &xbnew, nmp->nm_nfs_localport, strlen(nmp->nm_nfs_localport)); + } + if (NFS_BITMAP_ISSET(mattrs, NFS_MATTR_LOCAL_MOUNT_PORT)) { + xb_add_string(error, &xbnew, nmp->nm_mount_localport, strlen(nmp->nm_mount_localport)); + } + if (NFS_BITMAP_ISSET(mattrs, NFS_MATTR_SET_MOUNT_OWNER)) { + /* drop embedded owner value */ + xb_get_32(error, &xb, count); + } + /* New mount always gets same owner as this mount */ + xb_add_32(error, &xbnew, vnode_mount(vp)->mnt_vfsstat.f_owner); xb_build_done(error, &xbnew); /* update opaque counts */ @@ -4088,10 +4264,13 @@ nfs_mirror_mount_domount(vnode_t dvp, vnode_t vp, vfs_context_t ctx) /* * For kernel_mount() call, use the existing mount flags (instead of the * original flags) because flags like MNT_NOSUID and MNT_NODEV may have - * been silently enforced. + * been silently enforced. Also, in terms of MACF, the _kernel_ is + * performing the mount (and enforcing all of the mount options), so we + * use the kernel context for the mount call. */ mntflags = vnode_vfsvisflags(vp); mntflags |= (MNT_AUTOMOUNTED | MNT_DONTBROWSE); + ctx = vfs_context_kernel(); /* do the mount */ error = kernel_mount(fstype, dvp, vp, path, xb_buffer_base(&xbnew), argslength, @@ -4122,6 +4301,7 @@ nfsmerr: /* * trigger vnode functions */ +#define NFS_TRIGGER_DEBUG 1 resolver_result_t nfs_mirror_mount_trigger_resolve( @@ -4132,9 +4312,10 @@ nfs_mirror_mount_trigger_resolve( __unused void *data, vfs_context_t ctx) { - nfsnode_t np = VTONFS(vp); - vnode_t pvp = NULLVP; - int error = 0; + nfsnode_t np = VTONFS(vp); + vnode_t pvp = NULLVP; + int error = 0; + int didBusy = 0; resolver_result_t result; /* @@ -4204,6 +4385,21 @@ nfs_mirror_mount_trigger_resolve( #endif return result; } + didBusy = 1; + + /* Check again, in case the mount happened while we were setting busy */ + if (vnode_mountedhere(vp) != NULL) { + /* Been there. Done that. Let's just say it succeeded. */ + error = 0; + goto skipmount; + } + nfs_node_lock_force(np); + if (np->n_flag & NDISARMTRIGGER) { + error = ECANCELED; + nfs_node_unlock(np); + goto skipmount; + } + nfs_node_unlock(np); pvp = vnode_getparent(vp); if (pvp == NULLVP) { @@ -4226,7 +4422,9 @@ skipmount: if (pvp != NULLVP) { vnode_put(pvp); } - nfs_node_clear_busy(np); + if (didBusy) { + nfs_node_clear_busy(np); + } return result; } @@ -4326,7 +4524,8 @@ nfs_ephemeral_mount_harvester_callback(mount_t mp, void *arg) return VFS_RETURNED; } nmp = VFSTONFS(mp); - if (!nmp || !NMFLAG(nmp, EPHEMERAL)) { + if (!nmp || !NMFLAG(nmp, EPHEMERAL) + ) { return VFS_RETURNED; } hinfo->mountcount++; @@ -4438,6 +4637,7 @@ nfs3_mount_rpc(struct nfsmount *nmp, struct sockaddr *sa, int sotype, int nfsver uint32_t mntvers, mntport, val; struct sockaddr_storage ss; struct sockaddr *saddr = (struct sockaddr*)&ss; + struct sockaddr_un *sun = (struct sockaddr_un*)saddr; nfsm_chain_null(&nmreq); nfsm_chain_null(&nmrep); @@ -4452,20 +4652,26 @@ nfs3_mount_rpc(struct nfsmount *nmp, struct sockaddr *sa, int sotype, int nfsver ((struct sockaddr_in*)saddr)->sin_port = htons(nmp->nm_mountport); } mntport = ntohs(((struct sockaddr_in*)saddr)->sin_port); - } else { + } else if (saddr->sa_family == AF_INET6) { if (nmp->nm_mountport) { ((struct sockaddr_in6*)saddr)->sin6_port = htons(nmp->nm_mountport); } mntport = ntohs(((struct sockaddr_in6*)saddr)->sin6_port); + } else { /* Local domain socket */ + mntport = ((struct sockaddr_un *)saddr)->sun_path[0]; /* Do we have and address? */ + mntproto = IPPROTO_TCP; /* XXX rpcbind only listens on streams sockets for now */ } while (!mntport) { - error = nfs_portmap_lookup(nmp, ctx, saddr, NULL, RPCPROG_MNT, mntvers, mntproto, timeo); + error = nfs_portmap_lookup(nmp, ctx, saddr, NULL, RPCPROG_MNT, mntvers, + mntproto == IPPROTO_UDP ? SOCK_DGRAM : SOCK_STREAM, timeo); nfsmout_if(error); if (saddr->sa_family == AF_INET) { mntport = ntohs(((struct sockaddr_in*)saddr)->sin_port); - } else { + } else if (saddr->sa_family == AF_INET6) { mntport = ntohs(((struct sockaddr_in6*)saddr)->sin6_port); + } else if (saddr->sa_family == AF_LOCAL) { + mntport = ((struct sockaddr_un*)saddr)->sun_path[0]; } if (!mntport) { /* if not found and TCP, then retry with UDP */ @@ -4475,6 +4681,9 @@ nfs3_mount_rpc(struct nfsmount *nmp, struct sockaddr *sa, int sotype, int nfsver } mntproto = IPPROTO_UDP; bcopy(sa, saddr, min(sizeof(ss), sa->sa_len)); + if (saddr->sa_family == AF_LOCAL) { + strlcpy(sun->sun_path, RPCB_TICLTS_PATH, sizeof(sun->sun_path)); + } } } nfsmout_if(error || !mntport); @@ -4541,8 +4750,10 @@ nfs3_umount_rpc(struct nfsmount *nmp, vfs_context_t ctx, int timeo) bcopy(nmp->nm_saddr, saddr, min(sizeof(ss), nmp->nm_saddr->sa_len)); if (saddr->sa_family == AF_INET) { ((struct sockaddr_in*)saddr)->sin_port = htons(mntport); - } else { + } else if (saddr->sa_family == AF_INET6) { ((struct sockaddr_in6*)saddr)->sin6_port = htons(mntport); + } else { /* Local domain socket */ + mntport = ((struct sockaddr_un *)saddr)->sun_path[0]; /* Do we have and address? */ } while (!mntport) { @@ -4550,8 +4761,10 @@ nfs3_umount_rpc(struct nfsmount *nmp, vfs_context_t ctx, int timeo) nfsmout_if(error); if (saddr->sa_family == AF_INET) { mntport = ntohs(((struct sockaddr_in*)saddr)->sin_port); - } else { + } else if (saddr->sa_family == AF_INET6) { mntport = ntohs(((struct sockaddr_in6*)saddr)->sin6_port); + } else { /* Local domain socket */ + mntport = ((struct sockaddr_un *)saddr)->sun_path[0]; /* Do we have and address? */ } /* if not found and mntvers > VER1, then retry with VER1 */ if (!mntport) { @@ -4603,7 +4816,7 @@ nfs_vfs_unmount( struct nfsmount *nmp; vnode_t vp; int error, flags = 0; - struct timespec ts = { 1, 0 }; + struct timespec ts = { .tv_sec = 1, .tv_nsec = 0 }; nmp = VFSTONFS(mp); lck_mtx_lock(&nmp->nm_lock); @@ -4774,7 +4987,7 @@ nfs_mount_zombie(struct nfsmount *nmp, int nm_state_flags) { struct nfsreq *req, *treq; struct nfs_reqqhead iodq, resendq; - struct timespec ts = { 1, 0 }; + struct timespec ts = { .tv_sec = 1, .tv_nsec = 0 }; struct nfs_open_owner *noop, *nextnoop; nfsnode_t np; int docallback; @@ -4783,14 +4996,16 @@ nfs_mount_zombie(struct nfsmount *nmp, int nm_state_flags) nmp->nm_state |= nm_state_flags; nmp->nm_ref++; lck_mtx_unlock(&nmp->nm_lock); - +#if CONFIG_NFS4 /* stop callbacks */ if ((nmp->nm_vers >= NFS_VER4) && !NMFLAG(nmp, NOCALLBACK) && nmp->nm_cbid) { nfs4_mount_callback_shutdown(nmp); } - +#endif +#if CONFIG_NFS_GSS /* Destroy any RPCSEC_GSS contexts */ nfs_gss_clnt_ctx_unmount(nmp); +#endif /* mark the socket for termination */ lck_mtx_lock(&nmp->nm_lock); @@ -4814,6 +5029,7 @@ nfs_mount_zombie(struct nfsmount *nmp, int nm_state_flags) lck_mtx_lock(&nmp->nm_lock); +#if CONFIG_NFS4 if ((nmp->nm_vers >= NFS_VER4) && !NMFLAG(nmp, NOCALLBACK) && nmp->nm_cbid) { /* clear out any pending delegation return requests */ while ((np = TAILQ_FIRST(&nmp->nm_dreturnq))) { @@ -4828,7 +5044,7 @@ nfs_mount_zombie(struct nfsmount *nmp, int nm_state_flags) thread_call_free(nmp->nm_renew_timer); nmp->nm_renew_timer = NULL; } - +#endif lck_mtx_unlock(&nmp->nm_lock); if (nmp->nm_state & NFSSTA_MOUNTED) { @@ -4846,6 +5062,7 @@ nfs_mount_zombie(struct nfsmount *nmp, int nm_state_flags) } } +#if CONFIG_NFS4 if ((nmp->nm_vers >= NFS_VER4) && nmp->nm_longid) { /* remove/deallocate the client ID data */ lck_mtx_lock(nfs_global_mutex); @@ -4857,7 +5074,7 @@ nfs_mount_zombie(struct nfsmount *nmp, int nm_state_flags) nmp->nm_longid = NULL; lck_mtx_unlock(nfs_global_mutex); } - +#endif /* * Be sure all requests for this mount are completed * and removed from the resend queue. @@ -4967,6 +5184,7 @@ nfs_mount_zombie(struct nfsmount *nmp, int nm_state_flags) } lck_mtx_unlock(&nmp->nm_lock); +#if CONFIG_NFS4 /* clean up NFSv4 state */ if (nmp->nm_vers >= NFS_VER4) { lck_mtx_lock(&nmp->nm_lock); @@ -4976,7 +5194,7 @@ nfs_mount_zombie(struct nfsmount *nmp, int nm_state_flags) } lck_mtx_unlock(&nmp->nm_lock); } - +#endif nfs_mount_rele(nmp); } @@ -5042,6 +5260,8 @@ nfs_mount_cleanup(struct nfsmount *nmp) if (nmp->nm_fh) { FREE(nmp->nm_fh, M_TEMP); } + + FREE_ZONE(nmp, sizeof(struct nfsmount), M_NFSMNT); } @@ -5130,13 +5350,13 @@ nfs3_getquota(struct nfsmount *nmp, vfs_context_t ctx, uid_t id, int type, struc uint32_t val = 0, bsize = 0; struct sockaddr *rqsaddr; struct timeval now; - struct timespec ts = { 1, 0 }; + struct timespec ts = { .tv_sec = 1, .tv_nsec = 0 }; if (!nmp->nm_saddr) { return ENXIO; } - if (NMFLAG(nmp, NOQUOTA)) { + if (NMFLAG(nmp, NOQUOTA) || nmp->nm_saddr->sa_family == AF_LOCAL /* XXX for now */) { return ENOTSUP; } @@ -5291,7 +5511,7 @@ nfsmout: nfsm_chain_cleanup(&nmrep); return error; } - +#if CONFIG_NFS4 int nfs4_getquota(struct nfsmount *nmp, vfs_context_t ctx, uid_t id, int type, struct dqblk *dqb) { @@ -5382,7 +5602,7 @@ nfsmout: kauth_cred_unref(&cred); return error; } - +#endif /* CONFIG_NFS4 */ int nfs_vfs_quotactl(mount_t mp, int cmds, uid_t uid, caddr_t datap, vfs_context_t ctx) { @@ -5554,7 +5774,7 @@ int nfs_mountinfo_assemble(struct nfsmount *nmp, struct xdrbuf *xb) { struct xdrbuf xbinfo, xborig; - char sotype[6]; + char sotype[16]; uint32_t origargsvers, origargslength; uint32_t infolength_offset, curargsopaquelength_offset, curargslength_offset, attrslength_offset, curargs_end_offset, end_offset; uint32_t miattrs[NFS_MIATTR_BITMAP_LEN]; @@ -5598,9 +5818,11 @@ nfs_mountinfo_assemble(struct nfsmount *nmp, struct xdrbuf *xb) NFS_BITMAP_ZERO(mattrs, NFS_MATTR_BITMAP_LEN); NFS_BITMAP_SET(mattrs, NFS_MATTR_FLAGS); NFS_BITMAP_SET(mattrs, NFS_MATTR_NFS_VERSION); +#if CONFIG_NFS4 if (nmp->nm_vers >= NFS_VER4) { NFS_BITMAP_SET(mattrs, NFS_MATTR_NFS_MINOR_VERSION); } +#endif NFS_BITMAP_SET(mattrs, NFS_MATTR_READ_SIZE); NFS_BITMAP_SET(mattrs, NFS_MATTR_WRITE_SIZE); NFS_BITMAP_SET(mattrs, NFS_MATTR_READDIR_SIZE); @@ -5616,8 +5838,10 @@ nfs_mountinfo_assemble(struct nfsmount *nmp, struct xdrbuf *xb) } NFS_BITMAP_SET(mattrs, NFS_MATTR_MAX_GROUP_LIST); NFS_BITMAP_SET(mattrs, NFS_MATTR_SOCKET_TYPE); - NFS_BITMAP_SET(mattrs, NFS_MATTR_NFS_PORT); - if ((nmp->nm_vers < NFS_VER4) && nmp->nm_mountport) { + if (nmp->nm_saddr->sa_family != AF_LOCAL) { + NFS_BITMAP_SET(mattrs, NFS_MATTR_NFS_PORT); + } + if ((nmp->nm_vers < NFS_VER4) && nmp->nm_mountport && !nmp->nm_mount_localport) { NFS_BITMAP_SET(mattrs, NFS_MATTR_MOUNT_PORT); } NFS_BITMAP_SET(mattrs, NFS_MATTR_REQUEST_TIMEOUT); @@ -5644,6 +5868,12 @@ nfs_mountinfo_assemble(struct nfsmount *nmp, struct xdrbuf *xb) if (nmp->nm_sprinc) { NFS_BITMAP_SET(mattrs, NFS_MATTR_SVCPRINCIPAL); } + if (nmp->nm_nfs_localport) { + NFS_BITMAP_SET(mattrs, NFS_MATTR_LOCAL_NFS_PORT); + } + if ((nmp->nm_vers < NFS_VER4) && nmp->nm_mount_localport) { + NFS_BITMAP_SET(mattrs, NFS_MATTR_LOCAL_MOUNT_PORT); + } /* set up current mount flags bitmap */ /* first set the flags that we will be setting - either on OR off */ @@ -5663,6 +5893,7 @@ nfs_mountinfo_assemble(struct nfsmount *nmp, struct xdrbuf *xb) } NFS_BITMAP_SET(mflags_mask, NFS_MFLAG_NONEGNAMECACHE); NFS_BITMAP_SET(mflags_mask, NFS_MFLAG_MUTEJUKEBOX); +#if CONFIG_NFS4 if (nmp->nm_vers >= NFS_VER4) { NFS_BITMAP_SET(mflags_mask, NFS_MFLAG_EPHEMERAL); NFS_BITMAP_SET(mflags_mask, NFS_MFLAG_NOCALLBACK); @@ -5670,6 +5901,7 @@ nfs_mountinfo_assemble(struct nfsmount *nmp, struct xdrbuf *xb) NFS_BITMAP_SET(mflags_mask, NFS_MFLAG_NOACL); NFS_BITMAP_SET(mflags_mask, NFS_MFLAG_ACLONLY); } +#endif NFS_BITMAP_SET(mflags_mask, NFS_MFLAG_NFC); NFS_BITMAP_SET(mflags_mask, NFS_MFLAG_NOQUOTA); if (nmp->nm_vers < NFS_VER4) { @@ -5705,6 +5937,7 @@ nfs_mountinfo_assemble(struct nfsmount *nmp, struct xdrbuf *xb) if (NMFLAG(nmp, MUTEJUKEBOX)) { NFS_BITMAP_SET(mflags, NFS_MFLAG_MUTEJUKEBOX); } +#if CONFIG_NFS4 if (nmp->nm_vers >= NFS_VER4) { if (NMFLAG(nmp, EPHEMERAL)) { NFS_BITMAP_SET(mflags, NFS_MFLAG_EPHEMERAL); @@ -5722,6 +5955,7 @@ nfs_mountinfo_assemble(struct nfsmount *nmp, struct xdrbuf *xb) NFS_BITMAP_SET(mflags, NFS_MFLAG_ACLONLY); } } +#endif if (NMFLAG(nmp, NFC)) { NFS_BITMAP_SET(mflags, NFS_MFLAG_NFC); } @@ -5765,9 +5999,11 @@ nfs_mountinfo_assemble(struct nfsmount *nmp, struct xdrbuf *xb) xb_add_bitmap(error, &xbinfo, mflags_mask, NFS_MFLAG_BITMAP_LEN); xb_add_bitmap(error, &xbinfo, mflags, NFS_MFLAG_BITMAP_LEN); xb_add_32(error, &xbinfo, nmp->nm_vers); /* NFS_VERSION */ +#if CONFIG_NFS4 if (nmp->nm_vers >= NFS_VER4) { xb_add_32(error, &xbinfo, nmp->nm_minor_vers); /* NFS_MINOR_VERSION */ } +#endif xb_add_32(error, &xbinfo, nmp->nm_rsize); /* READ_SIZE */ xb_add_32(error, &xbinfo, nmp->nm_wsize); /* WRITE_SIZE */ xb_add_32(error, &xbinfo, nmp->nm_readdirsize); /* READDIR_SIZE */ @@ -5807,13 +6043,29 @@ nfs_mountinfo_assemble(struct nfsmount *nmp, struct xdrbuf *xb) } xb_add_32(error, &xbinfo, nmp->nm_numgrps); /* MAX_GROUP_LIST */ nfsmerr_if(error); - snprintf(sotype, sizeof(sotype), "%s%s", (nmp->nm_sotype == SOCK_DGRAM) ? "udp" : "tcp", - nmp->nm_sofamily ? (nmp->nm_sofamily == AF_INET) ? "4" : "6" : ""); - xb_add_string(error, &xbinfo, sotype, strlen(sotype)); /* SOCKET_TYPE */ - xb_add_32(error, &xbinfo, ntohs(((struct sockaddr_in*)nmp->nm_saddr)->sin_port)); /* NFS_PORT */ - if ((nmp->nm_vers < NFS_VER4) && nmp->nm_mountport) { - xb_add_32(error, &xbinfo, nmp->nm_mountport); /* MOUNT_PORT */ + + switch (nmp->nm_saddr->sa_family) { + case AF_INET: + case AF_INET6: + snprintf(sotype, sizeof(sotype), "%s%s", (nmp->nm_sotype == SOCK_DGRAM) ? "udp" : "tcp", + nmp->nm_sofamily ? (nmp->nm_sofamily == AF_INET) ? "4" : "6" : ""); + xb_add_string(error, &xbinfo, sotype, strlen(sotype)); /* SOCKET_TYPE */ + xb_add_32(error, &xbinfo, ntohs(((struct sockaddr_in*)nmp->nm_saddr)->sin_port)); /* NFS_PORT */ + if (NFS_BITMAP_ISSET(mattrs, NFS_MATTR_MOUNT_PORT)) { + xb_add_32(error, &xbinfo, nmp->nm_mountport); /* MOUNT_PORT */ + } + break; + case AF_LOCAL: + strlcpy(sotype, (nmp->nm_sotype == SOCK_DGRAM) ? "ticlts" : "ticotsord", sizeof(sotype)); + xb_add_string(error, &xbinfo, sotype, strlen(sotype)); + break; + default: + NFS_VFS_DBG("Unsupported address family %d\n", nmp->nm_saddr->sa_family); + printf("Unsupported address family %d\n", nmp->nm_saddr->sa_family); + error = EINVAL; + break; } + timeo = (nmp->nm_timeo * 10) / NFS_HZ; xb_add_32(error, &xbinfo, timeo / 10); /* REQUEST_TIMEOUT */ xb_add_32(error, &xbinfo, (timeo % 10) * 100000000); /* REQUEST_TIMEOUT */ @@ -5861,7 +6113,13 @@ nfs_mountinfo_assemble(struct nfsmount *nmp, struct xdrbuf *xb) if (NFS_BITMAP_ISSET(mattrs, NFS_MATTR_SVCPRINCIPAL)) { xb_add_string(error, &xbinfo, nmp->nm_sprinc, strlen(nmp->nm_sprinc)); } - + if (NFS_BITMAP_ISSET(mattrs, NFS_MATTR_LOCAL_NFS_PORT)) { + struct sockaddr_un *un = (struct sockaddr_un *)nmp->nm_saddr; + xb_add_string(error, &xbinfo, un->sun_path, strlen(un->sun_path)); + } + if (NFS_BITMAP_ISSET(mattrs, NFS_MATTR_LOCAL_MOUNT_PORT)) { + xb_add_string(error, &xbinfo, nmp->nm_mount_localport, strlen(nmp->nm_mount_localport)); + } curargs_end_offset = xb_offset(&xbinfo); /* NFS_MIATTR_CUR_LOC_INDEX */ @@ -5924,8 +6182,9 @@ nfs_vfs_sysctl(int *name, u_int namelen, user_addr_t oldp, size_t *oldlenp, struct xdrbuf xb; struct netfs_status *nsp = NULL; int timeoutmask; - uint pos, totlen, count, numThreads; + uint totlen, count, numThreads; #if NFSSERVER + uint pos; struct nfs_exportfs *nxfs; struct nfs_export *nx; struct nfs_active_user_list *ulist; @@ -6033,7 +6292,7 @@ nfs_vfs_sysctl(int *name, u_int namelen, user_addr_t oldp, size_t *oldlenp, if (((nmp = VFSTONFS(mp))) == NULL) { return ENOENT; } - xb_init(&xb, 0); + xb_init(&xb, XDRBUF_NONE); if ((error = nfs_mountinfo_assemble(nmp, &xb))) { return error; } @@ -6311,9 +6570,11 @@ ustat_skip: if (nmp->nm_lockmode == NFS_LOCK_MODE_LOCAL) { /* can't toggle locks when using local locks */ error = EINVAL; +#if CONFIG_NFS4 } else if ((nmp->nm_vers >= NFS_VER4) && val) { /* can't disable locks for NFSv4 */ error = EINVAL; +#endif } else if (val) { if ((nmp->nm_vers <= NFS_VER3) && (nmp->nm_lockmode == NFS_LOCK_MODE_ENABLED)) { nfs_lockd_mount_unregister(nmp); diff --git a/bsd/nfs/nfs_vnops.c b/bsd/nfs/nfs_vnops.c index 0991a5373..b460a0411 100644 --- a/bsd/nfs/nfs_vnops.c +++ b/bsd/nfs/nfs_vnops.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2017 Apple Inc. All rights reserved. + * Copyright (c) 2000-2019 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -87,6 +87,7 @@ #include #include #include +#include #include @@ -157,53 +158,56 @@ int nfs3_vnop_mkdir(struct vnop_mkdir_args *); int nfs3_vnop_rmdir(struct vnop_rmdir_args *); int nfs3_vnop_symlink(struct vnop_symlink_args *); + vnop_t **nfsv2_vnodeop_p; -static struct vnodeopv_entry_desc nfsv2_vnodeop_entries[] = { - { &vnop_default_desc, (vnop_t *)vn_default_error }, - { &vnop_lookup_desc, (vnop_t *)nfs_vnop_lookup }, /* lookup */ - { &vnop_create_desc, (vnop_t *)nfs3_vnop_create }, /* create */ - { &vnop_mknod_desc, (vnop_t *)nfs3_vnop_mknod }, /* mknod */ - { &vnop_open_desc, (vnop_t *)nfs_vnop_open }, /* open */ - { &vnop_close_desc, (vnop_t *)nfs_vnop_close }, /* close */ - { &vnop_access_desc, (vnop_t *)nfs_vnop_access }, /* access */ - { &vnop_getattr_desc, (vnop_t *)nfs3_vnop_getattr }, /* getattr */ - { &vnop_setattr_desc, (vnop_t *)nfs_vnop_setattr }, /* setattr */ - { &vnop_read_desc, (vnop_t *)nfs_vnop_read }, /* read */ - { &vnop_write_desc, (vnop_t *)nfs_vnop_write }, /* write */ - { &vnop_ioctl_desc, (vnop_t *)nfs_vnop_ioctl }, /* ioctl */ - { &vnop_select_desc, (vnop_t *)nfs_vnop_select }, /* select */ - { &vnop_revoke_desc, (vnop_t *)nfs_vnop_revoke }, /* revoke */ - { &vnop_mmap_desc, (vnop_t *)nfs_vnop_mmap }, /* mmap */ - { &vnop_mnomap_desc, (vnop_t *)nfs_vnop_mnomap }, /* mnomap */ - { &vnop_fsync_desc, (vnop_t *)nfs_vnop_fsync }, /* fsync */ - { &vnop_remove_desc, (vnop_t *)nfs_vnop_remove }, /* remove */ - { &vnop_link_desc, (vnop_t *)nfs3_vnop_link }, /* link */ - { &vnop_rename_desc, (vnop_t *)nfs_vnop_rename }, /* rename */ - { &vnop_mkdir_desc, (vnop_t *)nfs3_vnop_mkdir }, /* mkdir */ - { &vnop_rmdir_desc, (vnop_t *)nfs3_vnop_rmdir }, /* rmdir */ - { &vnop_symlink_desc, (vnop_t *)nfs3_vnop_symlink }, /* symlink */ - { &vnop_readdir_desc, (vnop_t *)nfs_vnop_readdir }, /* readdir */ - { &vnop_readlink_desc, (vnop_t *)nfs_vnop_readlink }, /* readlink */ - { &vnop_inactive_desc, (vnop_t *)nfs_vnop_inactive }, /* inactive */ - { &vnop_reclaim_desc, (vnop_t *)nfs_vnop_reclaim }, /* reclaim */ - { &vnop_strategy_desc, (vnop_t *)err_strategy }, /* strategy */ - { &vnop_pathconf_desc, (vnop_t *)nfs_vnop_pathconf }, /* pathconf */ - { &vnop_advlock_desc, (vnop_t *)nfs_vnop_advlock }, /* advlock */ - { &vnop_bwrite_desc, (vnop_t *)err_bwrite }, /* bwrite */ - { &vnop_pagein_desc, (vnop_t *)nfs_vnop_pagein }, /* Pagein */ - { &vnop_pageout_desc, (vnop_t *)nfs_vnop_pageout }, /* Pageout */ - { &vnop_copyfile_desc, (vnop_t *)err_copyfile }, /* Copyfile */ - { &vnop_blktooff_desc, (vnop_t *)nfs_vnop_blktooff }, /* blktooff */ - { &vnop_offtoblk_desc, (vnop_t *)nfs_vnop_offtoblk }, /* offtoblk */ - { &vnop_blockmap_desc, (vnop_t *)nfs_vnop_blockmap }, /* blockmap */ - { &vnop_monitor_desc, (vnop_t *)nfs_vnop_monitor }, /* monitor */ - { NULL, NULL } +static const struct vnodeopv_entry_desc nfsv2_vnodeop_entries[] = { + { .opve_op = &vnop_default_desc, .opve_impl = (vnop_t *)vn_default_error }, + { .opve_op = &vnop_lookup_desc, .opve_impl = (vnop_t *)nfs_vnop_lookup }, /* lookup */ + { .opve_op = &vnop_create_desc, .opve_impl = (vnop_t *)nfs3_vnop_create }, /* create */ + { .opve_op = &vnop_mknod_desc, .opve_impl = (vnop_t *)nfs3_vnop_mknod }, /* mknod */ + { .opve_op = &vnop_open_desc, .opve_impl = (vnop_t *)nfs_vnop_open }, /* open */ + { .opve_op = &vnop_close_desc, .opve_impl = (vnop_t *)nfs_vnop_close }, /* close */ + { .opve_op = &vnop_access_desc, .opve_impl = (vnop_t *)nfs_vnop_access }, /* access */ + { .opve_op = &vnop_getattr_desc, .opve_impl = (vnop_t *)nfs3_vnop_getattr }, /* getattr */ + { .opve_op = &vnop_setattr_desc, .opve_impl = (vnop_t *)nfs_vnop_setattr }, /* setattr */ + { .opve_op = &vnop_read_desc, .opve_impl = (vnop_t *)nfs_vnop_read }, /* read */ + { .opve_op = &vnop_write_desc, .opve_impl = (vnop_t *)nfs_vnop_write }, /* write */ + { .opve_op = &vnop_ioctl_desc, .opve_impl = (vnop_t *)nfs_vnop_ioctl }, /* ioctl */ + { .opve_op = &vnop_select_desc, .opve_impl = (vnop_t *)nfs_vnop_select }, /* select */ + { .opve_op = &vnop_revoke_desc, .opve_impl = (vnop_t *)nfs_vnop_revoke }, /* revoke */ + { .opve_op = &vnop_mmap_desc, .opve_impl = (vnop_t *)nfs_vnop_mmap }, /* mmap */ + { .opve_op = &vnop_mnomap_desc, .opve_impl = (vnop_t *)nfs_vnop_mnomap }, /* mnomap */ + { .opve_op = &vnop_fsync_desc, .opve_impl = (vnop_t *)nfs_vnop_fsync }, /* fsync */ + { .opve_op = &vnop_remove_desc, .opve_impl = (vnop_t *)nfs_vnop_remove }, /* remove */ + { .opve_op = &vnop_link_desc, .opve_impl = (vnop_t *)nfs3_vnop_link }, /* link */ + { .opve_op = &vnop_rename_desc, .opve_impl = (vnop_t *)nfs_vnop_rename }, /* rename */ + { .opve_op = &vnop_mkdir_desc, .opve_impl = (vnop_t *)nfs3_vnop_mkdir }, /* mkdir */ + { .opve_op = &vnop_rmdir_desc, .opve_impl = (vnop_t *)nfs3_vnop_rmdir }, /* rmdir */ + { .opve_op = &vnop_symlink_desc, .opve_impl = (vnop_t *)nfs3_vnop_symlink }, /* symlink */ + { .opve_op = &vnop_readdir_desc, .opve_impl = (vnop_t *)nfs_vnop_readdir }, /* readdir */ + { .opve_op = &vnop_readlink_desc, .opve_impl = (vnop_t *)nfs_vnop_readlink }, /* readlink */ + { .opve_op = &vnop_inactive_desc, .opve_impl = (vnop_t *)nfs_vnop_inactive }, /* inactive */ + { .opve_op = &vnop_reclaim_desc, .opve_impl = (vnop_t *)nfs_vnop_reclaim }, /* reclaim */ + { .opve_op = &vnop_strategy_desc, .opve_impl = (vnop_t *)err_strategy }, /* strategy */ + { .opve_op = &vnop_pathconf_desc, .opve_impl = (vnop_t *)nfs_vnop_pathconf }, /* pathconf */ + { .opve_op = &vnop_advlock_desc, .opve_impl = (vnop_t *)nfs_vnop_advlock }, /* advlock */ + { .opve_op = &vnop_bwrite_desc, .opve_impl = (vnop_t *)err_bwrite }, /* bwrite */ + { .opve_op = &vnop_pagein_desc, .opve_impl = (vnop_t *)nfs_vnop_pagein }, /* Pagein */ + { .opve_op = &vnop_pageout_desc, .opve_impl = (vnop_t *)nfs_vnop_pageout }, /* Pageout */ + { .opve_op = &vnop_copyfile_desc, .opve_impl = (vnop_t *)err_copyfile }, /* Copyfile */ + { .opve_op = &vnop_blktooff_desc, .opve_impl = (vnop_t *)nfs_vnop_blktooff }, /* blktooff */ + { .opve_op = &vnop_offtoblk_desc, .opve_impl = (vnop_t *)nfs_vnop_offtoblk }, /* offtoblk */ + { .opve_op = &vnop_blockmap_desc, .opve_impl = (vnop_t *)nfs_vnop_blockmap }, /* blockmap */ + { .opve_op = &vnop_monitor_desc, .opve_impl = (vnop_t *)nfs_vnop_monitor }, /* monitor */ + { .opve_op = NULL, .opve_impl = NULL } }; -struct vnodeopv_desc nfsv2_vnodeop_opv_desc = +const struct vnodeopv_desc nfsv2_vnodeop_opv_desc = { &nfsv2_vnodeop_p, nfsv2_vnodeop_entries }; + +#if CONFIG_NFS4 vnop_t **nfsv4_vnodeop_p; -static struct vnodeopv_entry_desc nfsv4_vnodeop_entries[] = { +static const struct vnodeopv_entry_desc nfsv4_vnodeop_entries[] = { { &vnop_default_desc, (vnop_t *)vn_default_error }, { &vnop_lookup_desc, (vnop_t *)nfs_vnop_lookup }, /* lookup */ { &vnop_create_desc, (vnop_t *)nfs4_vnop_create }, /* create */ @@ -253,14 +257,15 @@ static struct vnodeopv_entry_desc nfsv4_vnodeop_entries[] = { { &vnop_monitor_desc, (vnop_t *)nfs_vnop_monitor }, /* monitor */ { NULL, NULL } }; -struct vnodeopv_desc nfsv4_vnodeop_opv_desc = +const struct vnodeopv_desc nfsv4_vnodeop_opv_desc = { &nfsv4_vnodeop_p, nfsv4_vnodeop_entries }; +#endif /* * Special device vnode ops */ vnop_t **spec_nfsv2nodeop_p; -static struct vnodeopv_entry_desc spec_nfsv2nodeop_entries[] = { +static const struct vnodeopv_entry_desc spec_nfsv2nodeop_entries[] = { { &vnop_default_desc, (vnop_t *)vn_default_error }, { &vnop_lookup_desc, (vnop_t *)spec_lookup }, /* lookup */ { &vnop_create_desc, (vnop_t *)spec_create }, /* create */ @@ -298,10 +303,11 @@ static struct vnodeopv_entry_desc spec_nfsv2nodeop_entries[] = { { &vnop_monitor_desc, (vnop_t *)nfs_vnop_monitor }, /* monitor */ { NULL, NULL } }; -struct vnodeopv_desc spec_nfsv2nodeop_opv_desc = +const struct vnodeopv_desc spec_nfsv2nodeop_opv_desc = { &spec_nfsv2nodeop_p, spec_nfsv2nodeop_entries }; +#if CONFIG_NFS4 vnop_t **spec_nfsv4nodeop_p; -static struct vnodeopv_entry_desc spec_nfsv4nodeop_entries[] = { +static const struct vnodeopv_entry_desc spec_nfsv4nodeop_entries[] = { { &vnop_default_desc, (vnop_t *)vn_default_error }, { &vnop_lookup_desc, (vnop_t *)spec_lookup }, /* lookup */ { &vnop_create_desc, (vnop_t *)spec_create }, /* create */ @@ -348,12 +354,13 @@ static struct vnodeopv_entry_desc spec_nfsv4nodeop_entries[] = { { &vnop_monitor_desc, (vnop_t *)nfs_vnop_monitor }, /* monitor */ { NULL, NULL } }; -struct vnodeopv_desc spec_nfsv4nodeop_opv_desc = +const struct vnodeopv_desc spec_nfsv4nodeop_opv_desc = { &spec_nfsv4nodeop_p, spec_nfsv4nodeop_entries }; +#endif /* CONFIG_NFS4 */ #if FIFO vnop_t **fifo_nfsv2nodeop_p; -static struct vnodeopv_entry_desc fifo_nfsv2nodeop_entries[] = { +static const struct vnodeopv_entry_desc fifo_nfsv2nodeop_entries[] = { { &vnop_default_desc, (vnop_t *)vn_default_error }, { &vnop_lookup_desc, (vnop_t *)fifo_lookup }, /* lookup */ { &vnop_create_desc, (vnop_t *)fifo_create }, /* create */ @@ -391,11 +398,14 @@ static struct vnodeopv_entry_desc fifo_nfsv2nodeop_entries[] = { { &vnop_monitor_desc, (vnop_t *)nfs_vnop_monitor }, /* monitor */ { NULL, NULL } }; -struct vnodeopv_desc fifo_nfsv2nodeop_opv_desc = +const struct vnodeopv_desc fifo_nfsv2nodeop_opv_desc = { &fifo_nfsv2nodeop_p, fifo_nfsv2nodeop_entries }; +#endif +#if CONFIG_NFS4 +#if FIFO vnop_t **fifo_nfsv4nodeop_p; -static struct vnodeopv_entry_desc fifo_nfsv4nodeop_entries[] = { +static const struct vnodeopv_entry_desc fifo_nfsv4nodeop_entries[] = { { &vnop_default_desc, (vnop_t *)vn_default_error }, { &vnop_lookup_desc, (vnop_t *)fifo_lookup }, /* lookup */ { &vnop_create_desc, (vnop_t *)fifo_create }, /* create */ @@ -442,14 +452,16 @@ static struct vnodeopv_entry_desc fifo_nfsv4nodeop_entries[] = { { &vnop_monitor_desc, (vnop_t *)nfs_vnop_monitor }, /* monitor */ { NULL, NULL } }; -struct vnodeopv_desc fifo_nfsv4nodeop_opv_desc = +const struct vnodeopv_desc fifo_nfsv4nodeop_opv_desc = { &fifo_nfsv4nodeop_p, fifo_nfsv4nodeop_entries }; #endif /* FIFO */ +#endif /* CONFIG_NFS4 */ int nfs_sillyrename(nfsnode_t, nfsnode_t, struct componentname *, vfs_context_t); int nfs_getattr_internal(nfsnode_t, struct nfs_vattr *, vfs_context_t, int); int nfs_refresh_fh(nfsnode_t, vfs_context_t); + /* * Find the slot in the access cache for this UID. * If adding and no existing slot is found, reuse slots in FIFO order. @@ -514,11 +526,15 @@ nfs3_access_rpc(nfsnode_t np, u_int32_t *access, int rpcflags, vfs_context_t ctx } nfsmout_if(error); +#if CONFIG_NFS_GSS if (auth_is_kerberized(np->n_auth) || auth_is_kerberized(nmp->nm_auth)) { uid = nfs_cred_getasid2uid(vfs_context_ucred(ctx)); } else { uid = kauth_cred_getuid(vfs_context_ucred(ctx)); } +#else + uid = kauth_cred_getuid(vfs_context_ucred(ctx)); +#endif /* CONFIG_NFS_GSS */ slot = nfs_node_access_slot(np, uid, 1); np->n_accessuid[slot] = uid; microuptime(&now); @@ -551,6 +567,7 @@ nfsmout: return error; } + /* * NFS access vnode op. * For NFS version 2, just return ok. File accesses may fail later. @@ -582,7 +599,8 @@ nfs_vnop_access( } nfsvers = nmp->nm_vers; - if (nfsvers == NFS_VER2) { + + if (nfsvers == NFS_VER2 || NMFLAG(nmp, NOOPAQUE_AUTH)) { if ((ap->a_action & KAUTH_VNODE_WRITE_RIGHTS) && vfs_isrdonly(vnode_mount(vp))) { return EROFS; @@ -670,11 +688,15 @@ nfs_vnop_access( * Does our cached result allow us to give a definite yes to * this request? */ +#if CONFIG_NFS_GSS if (auth_is_kerberized(np->n_auth) || auth_is_kerberized(nmp->nm_auth)) { uid = nfs_cred_getasid2uid(vfs_context_ucred(ctx)); } else { uid = kauth_cred_getuid(vfs_context_ucred(ctx)); } +#else + uid = kauth_cred_getuid(vfs_context_ucred(ctx)); +#endif /* CONFIG_NFS_GSS */ slot = nfs_node_access_slot(np, uid, 0); dorpc = 1; if (access == 0) { @@ -851,6 +873,7 @@ restart: NP(np, "nfs_vnop_open: LOST %d", kauth_cred_getuid(nofp->nof_owner->noo_cred)); error = EIO; } +#if CONFIG_NFS4 if (!error && (nofp->nof_flags & NFS_OPEN_FILE_REOPEN)) { nfs_mount_state_in_use_end(nmp, 0); error = nfs4_reopen(nofp, vfs_context_thread(ctx)); @@ -859,6 +882,7 @@ restart: goto restart; } } +#endif if (!error) { error = nfs_open_file_set_busy(nofp, vfs_context_thread(ctx)); } @@ -886,9 +910,11 @@ restart: nofp->nof_flags &= ~NFS_OPEN_FILE_CREATE; nofp->nof_creator = NULL; } else { +#if CONFIG_NFS4 if (!opened) { error = nfs4_open(np, nofp, accessMode, denyMode, ctx); } +#endif if ((error == EACCES) && (nofp->nof_flags & NFS_OPEN_FILE_CREATE) && (nofp->nof_creator == current_thread())) { /* @@ -1154,6 +1180,7 @@ restart: } error = nfs_open_file_find(np, noop, &nofp, 0, 0, 0); +#if CONFIG_NFS4 if (!error && (nofp->nof_flags & NFS_OPEN_FILE_REOPEN)) { nfs_mount_state_in_use_end(nmp, 0); error = nfs4_reopen(nofp, NULL); @@ -1162,6 +1189,7 @@ restart: goto restart; } } +#endif if (error) { NP(np, "nfs_vnop_close: no open file for owner, error %d, %d", error, kauth_cred_getuid(noop->noo_cred)); error = EBADF; @@ -1212,7 +1240,9 @@ nfs_close( uint32_t denyMode, vfs_context_t ctx) { +#if CONFIG_NFS4 struct nfs_lock_owner *nlop; +#endif int error = 0, changed = 0, delegated = 0, closed = 0, downgrade = 0; uint32_t newAccessMode, newDenyMode; @@ -1254,10 +1284,11 @@ nfs_close( changed = 0; } - if (NFSTONMP(np)->nm_vers < NFS_VER4) { /* NFS v2/v3 closes simply need to remove the open. */ + if (NFSTONMP(np)->nm_vers < NFS_VER4) { + /* NFS v2/v3 closes simply need to remove the open. */ goto v3close; } - +#if CONFIG_NFS4 if ((newAccessMode == 0) || (nofp->nof_opencnt == 1)) { /* * No more access after this close, so clean up and close it. @@ -1305,13 +1336,13 @@ nfs_close( } } } - +#endif +v3close: if (error) { NP(np, "nfs_close: error %d, %d", error, kauth_cred_getuid(nofp->nof_owner->noo_cred)); return error; } -v3close: if (!downgrade) { nfs_open_file_remove_open(nofp, accessMode, denyMode); } @@ -1402,7 +1433,7 @@ nfs3_getattr_rpc( error = status; } nfsmout_if(error); - error = nfs_parsefattr(&nmrep, nfsvers, nvap); + error = nfs_parsefattr(nmp, &nmrep, nfsvers, nvap); nfsmout: nfsm_chain_cleanup(&nmreq); nfsm_chain_cleanup(&nmrep); @@ -1426,7 +1457,7 @@ nfs_refresh_fh(nfsnode_t np, vfs_context_t ctx) int namelen, fhsize, refreshed; int error, wanted = 0; uint8_t *fhp; - struct timespec ts = {2, 0}; + struct timespec ts = {.tv_sec = 2, .tv_nsec = 0}; NFS_VNOP_DBG("vnode is %d\n", vnode_vtype(vp)); @@ -1574,7 +1605,7 @@ nfs_getattr_internal(nfsnode_t np, struct nfs_vattr *nvap, vfs_context_t ctx, in struct nfsmount *nmp; int error = 0, nfsvers, inprogset = 0, wanted = 0, avoidfloods; struct nfs_vattr nvattr; - struct timespec ts = { 2, 0 }; + struct timespec ts = { .tv_sec = 2, .tv_nsec = 0 }; u_int64_t xid; FSDBG_TOP(513, np->n_size, np, np->n_vattr.nva_size, np->n_flag); @@ -1796,6 +1827,21 @@ nfsmout: return error; } +static int +nfs_parse_user_access( + mount_t mp, + enum vtype type) +{ + int user_access = R_OK; + if ((vfs_flags(mp) & MNT_RDONLY) == 0) { + user_access |= W_OK; + } + if (type == VDIR) { + user_access |= X_OK; + } + return user_access; +} + /* * NFS getattr call from vfs. */ @@ -1821,6 +1867,7 @@ nfsmout: VNODE_ATTR_va_fileid | \ VNODE_ATTR_va_type) + int nfs3_vnop_getattr( struct vnop_getattr_args /* { @@ -1836,12 +1883,15 @@ nfs3_vnop_getattr( struct nfsmount *nmp; dev_t rdev; + nmp = VTONMP(ap->a_vp); + /* * Lets don't go over the wire if we don't support any of the attributes. * Just fall through at the VFS layer and let it cons up what it needs. */ /* Return the io size no matter what, since we don't go over the wire for this */ VATTR_RETURN(vap, va_iosize, nfs_iosize); + if ((vap->va_active & NFS3_SUPPORTED_VATTRS) == 0) { return 0; } @@ -1857,7 +1907,6 @@ nfs3_vnop_getattr( } /* copy nva to *a_vap */ - nmp = VTONMP(ap->a_vp); VATTR_RETURN(vap, va_type, nva.nva_type); VATTR_RETURN(vap, va_mode, nva.nva_mode); rdev = makedev(nva.nva_rawdev.specdata1, nva.nva_rawdev.specdata2); @@ -1878,6 +1927,7 @@ nfs3_vnop_getattr( vap->va_change_time.tv_nsec = nva.nva_timensec[NFSTIME_CHANGE]; VATTR_SET_SUPPORTED(vap, va_change_time); + // VATTR_RETURN(vap, va_encoding, 0xffff /* kTextEncodingUnknown */); return error; } @@ -1907,9 +1957,10 @@ nfs_vnop_setattr( int dul_in_progress = 0; vnode_t dvp = NULL; const char *vname = NULL; +#if CONFIG_NFS4 struct nfs_open_owner *noop = NULL; struct nfs_open_file *nofp = NULL; - +#endif nmp = VTONMP(vp); if (nfs_mount_gone(nmp)) { return ENXIO; @@ -1966,6 +2017,7 @@ nfs_vnop_setattr( FSDBG_BOT(512, np->n_size, vap->va_data_size, np->n_vattr.nva_size, -1); return error; } +#if CONFIG_NFS4 if (nfsvers >= NFS_VER4) { /* setting file size requires having the file open for write access */ if (np->n_flag & NREVOKE) { @@ -2018,6 +2070,7 @@ restart: } } } +#endif nfs_data_lock(np, NFS_DATA_LOCK_EXCLUSIVE); if (np->n_size > vap->va_data_size) { /* shrinking? */ daddr64_t obn, bn; @@ -2201,6 +2254,7 @@ restart: nfs_node_unlock(np); } nfs_data_unlock(np); +#if CONFIG_NFS4 if (nfsvers >= NFS_VER4) { if (nofp) { /* don't close our setattr open if we'll be restarting... */ @@ -2220,6 +2274,7 @@ restart: } nfs_open_owner_rele(noop); } +#endif } return error; } @@ -2250,7 +2305,9 @@ nfs3_setattr_rpc( VATTR_SET_SUPPORTED(vap, va_access_time); VATTR_SET_SUPPORTED(vap, va_modify_time); - if (VATTR_IS_ACTIVE(vap, va_flags)) { + + if (VATTR_IS_ACTIVE(vap, va_flags) + ) { if (vap->va_flags) { /* we don't support setting flags */ if (vap->va_active & ~VNODE_ATTR_va_flags) { return EINVAL; /* return EINVAL if other attributes also set */ @@ -2348,7 +2405,7 @@ nfs3_setattr_rpc( error = lockerror; } if (nfsvers == NFS_VER3) { - struct timespec premtime = { 0, 0 }; + struct timespec premtime = { .tv_sec = 0, .tv_nsec = 0 }; nfsm_chain_get_wcc_data(error, &nmrep, np, &premtime, &wccpostattr, &xid); nfsmout_if(error); /* if file hadn't changed, update cached mtime */ @@ -2503,11 +2560,13 @@ nfs_vnop_lookup( fh.fh_len = 0; goto found; } +#if CONFIG_NFS4 if ((nfsvers >= NFS_VER4) && (dnp->n_vattr.nva_flags & NFS_FFLAG_TRIGGER)) { /* we should never be looking things up in a trigger directory, return nothing */ error = ENOENT; goto error_return; } +#endif /* do we know this name is too long? */ nmp = VTONMP(dvp); @@ -2788,8 +2847,9 @@ nfs_read_rpc(nfsnode_t np, uio_t uio, vfs_context_t ctx) user_ssize_t tsiz; off_t txoffset; struct nfsreq rq, *req = &rq; +#if CONFIG_NFS4 uint32_t stategenid = 0, restart = 0; - +#endif FSDBG_TOP(536, np, uio_offset(uio), uio_resid(uio), 0); nmp = NFSTONMP(np); if (nfs_mount_gone(nmp)) { @@ -2812,14 +2872,17 @@ nfs_read_rpc(nfsnode_t np, uio_t uio, vfs_context_t ctx) error = EIO; break; } +#if CONFIG_NFS4 if (nmp->nm_vers >= NFS_VER4) { stategenid = nmp->nm_stategenid; } +#endif error = nmp->nm_funcs->nf_read_rpc_async(np, txoffset, len, vfs_context_thread(ctx), vfs_context_ucred(ctx), NULL, &req); if (!error) { error = nmp->nm_funcs->nf_read_rpc_async_finish(np, req, uio, &retlen, &eof); } +#if CONFIG_NFS4 if ((nmp->nm_vers >= NFS_VER4) && nfs_mount_state_error_should_restart(error) && (++restart <= nfs_mount_state_max_restarts(nmp))) { /* guard against no progress */ lck_mtx_lock(&nmp->nm_lock); @@ -2839,6 +2902,7 @@ nfs_read_rpc(nfsnode_t np, uio_t uio, vfs_context_t ctx) } } } +#endif if (error) { break; } @@ -3557,6 +3621,8 @@ skipread: } nfs_buf_write_delayed(bp); } + + if (np->n_needcommitcnt >= NFS_A_LOT_OF_NEEDCOMMITS) { nfs_flushcommits(np, 1); } @@ -3601,7 +3667,10 @@ nfs_write_rpc2( uint64_t wverf = 0, wverf2; size_t nmwsize, totalsize, tsiz, len, rlen; struct nfsreq rq, *req = &rq; - uint32_t stategenid = 0, vrestart = 0, restart = 0; +#if CONFIG_NFS4 + uint32_t stategenid = 0, restart = 0; +#endif + uint32_t vrestart = 0; uio_t uio_save = NULL; #if DIAGNOSTIC @@ -3639,9 +3708,11 @@ nfs_write_rpc2( error = EIO; break; } +#if CONFIG_NFS4 if (nmp->nm_vers >= NFS_VER4) { stategenid = nmp->nm_stategenid; } +#endif error = nmp->nm_funcs->nf_write_rpc_async(np, uio, len, thd, cred, *iomodep, NULL, &req); if (!error) { error = nmp->nm_funcs->nf_write_rpc_async_finish(np, req, &commit, &rlen, &wverf2); @@ -3650,6 +3721,7 @@ nfs_write_rpc2( if (nfs_mount_gone(nmp)) { error = ENXIO; } +#if CONFIG_NFS4 if ((nmp->nm_vers >= NFS_VER4) && nfs_mount_state_error_should_restart(error) && (++restart <= nfs_mount_state_max_restarts(nmp))) { /* guard against no progress */ lck_mtx_lock(&nmp->nm_lock); @@ -3669,6 +3741,7 @@ nfs_write_rpc2( } } } +#endif if (error) { break; } @@ -3811,7 +3884,7 @@ nfs3_write_rpc_async_finish( error = lockerror; } if (nfsvers == NFS_VER3) { - struct timespec premtime = { 0, 0 }; + struct timespec premtime = { .tv_sec = 0, .tv_nsec = 0 }; nfsm_chain_get_wcc_data(error, &nmrep, np, &premtime, &wccpostattr, &xid); if (nfstimespeccmp(&np->n_mtime, &premtime, ==)) { updatemtime = 1; @@ -3891,7 +3964,7 @@ nfs3_vnop_mknod( struct nfs_vattr nvattr; fhandle_t fh; int error = 0, lockerror = ENOENT, busyerror = ENOENT, status, wccpostattr = 0; - struct timespec premtime = { 0, 0 }; + struct timespec premtime = { .tv_sec = 0, .tv_nsec = 0 }; u_int32_t rdev; u_int64_t xid = 0, dxid; int nfsvers, gotuid, gotgid; @@ -3942,7 +4015,7 @@ nfs3_vnop_mknod( nfsm_chain_add_name(error, &nmreq, cnp->cn_nameptr, cnp->cn_namelen, nmp); if (nfsvers == NFS_VER3) { nfsm_chain_add_32(error, &nmreq, vtonfs_type(vap->va_type, nfsvers)); - nfsm_chain_add_v3sattr(error, &nmreq, vap); + nfsm_chain_add_v3sattr(nmp, error, &nmreq, vap); if (vap->va_type == VCHR || vap->va_type == VBLK) { nfsm_chain_add_32(error, &nmreq, major(vap->va_rdev)); nfsm_chain_add_32(error, &nmreq, minor(vap->va_rdev)); @@ -3972,7 +4045,7 @@ nfs3_vnop_mknod( dnp->n_flag &= ~NNEGNCENTRIES; cache_purge_negatives(dvp); } - error = nfsm_chain_get_fh_attr(&nmrep, dnp, ctx, nfsvers, &xid, &fh, &nvattr); + error = nfsm_chain_get_fh_attr(nmp, &nmrep, dnp, ctx, nfsvers, &xid, &fh, &nvattr); } if (nfsvers == NFS_VER3) { nfsm_chain_get_wcc_data(error, &nmrep, dnp, &premtime, &wccpostattr, &dxid); @@ -4054,19 +4127,22 @@ nfs3_vnop_create( nfsnode_t dnp = VTONFS(dvp); vnode_t newvp = NULL; int error = 0, lockerror = ENOENT, busyerror = ENOENT, status, wccpostattr = 0, fmode = 0; - struct timespec premtime = { 0, 0 }; + struct timespec premtime = { .tv_sec = 0, .tv_nsec = 0 }; int nfsvers, gotuid, gotgid; u_int64_t xid, dxid; uint32_t val; struct nfsm_chain nmreq, nmrep; struct nfsreq rq, *req = &rq; struct nfs_dulookup dul; + int dul_in_progress = 0; + int namedattrs; nmp = VTONMP(dvp); if (nfs_mount_gone(nmp)) { return ENXIO; } nfsvers = nmp->nm_vers; + namedattrs = (nmp->nm_fsattr.nfsa_flags & NFS_FSFLAG_NAMED_ATTR); if ((nfsvers == NFS_VER2) && (cnp->cn_namelen > NFS_MAXNAMLEN)) { return ENAMETOOLONG; @@ -4083,7 +4159,8 @@ nfs3_vnop_create( gotuid = VATTR_IS_ACTIVE(vap, va_uid); gotgid = VATTR_IS_ACTIVE(vap, va_gid); - if (vap->va_vaflags & VA_EXCLUSIVE) { + if ((vap->va_vaflags & VA_EXCLUSIVE) + ) { fmode |= O_EXCL; if (!VATTR_IS_ACTIVE(vap, va_access_time) || !VATTR_IS_ACTIVE(vap, va_modify_time)) { vap->va_vaflags |= VA_UTIMES_NULL; @@ -4092,7 +4169,9 @@ nfs3_vnop_create( again: error = busyerror = nfs_node_set_busy(dnp, vfs_context_thread(ctx)); - nfs_dulookup_init(&dul, dnp, cnp->cn_nameptr, cnp->cn_namelen, ctx); + if (!namedattrs) { + nfs_dulookup_init(&dul, dnp, cnp->cn_nameptr, cnp->cn_namelen, ctx); + } nfsm_chain_null(&nmreq); nfsm_chain_null(&nmrep); @@ -4117,7 +4196,7 @@ again: nfsm_chain_add_32(error, &nmreq, create_verf); } else { nfsm_chain_add_32(error, &nmreq, NFS_CREATE_UNCHECKED); - nfsm_chain_add_v3sattr(error, &nmreq, vap); + nfsm_chain_add_v3sattr(nmp, error, &nmreq, vap); } } else { nfsm_chain_add_v2sattr(error, &nmreq, vap, 0); @@ -4128,7 +4207,10 @@ again: error = nfs_request_async(dnp, NULL, &nmreq, NFSPROC_CREATE, vfs_context_thread(ctx), vfs_context_ucred(ctx), NULL, 0, NULL, &req); if (!error) { - nfs_dulookup_start(&dul, dnp, ctx); + if (!namedattrs) { + nfs_dulookup_start(&dul, dnp, ctx); + dul_in_progress = 1; + } error = nfs_request_async_finish(req, &nmrep, &xid, &status); } @@ -4141,7 +4223,7 @@ again: dnp->n_flag &= ~NNEGNCENTRIES; cache_purge_negatives(dvp); } - error = nfsm_chain_get_fh_attr(&nmrep, dnp, ctx, nfsvers, &xid, &fh, &nvattr); + error = nfsm_chain_get_fh_attr(nmp, &nmrep, dnp, ctx, nfsvers, &xid, &fh, &nvattr); } if (nfsvers == NFS_VER3) { nfsm_chain_get_wcc_data(error, &nmrep, dnp, &premtime, &wccpostattr, &dxid); @@ -4174,7 +4256,9 @@ nfsmout: newvp = NFSTOV(np); } - nfs_dulookup_finish(&dul, dnp, ctx); + if (dul_in_progress) { + nfs_dulookup_finish(&dul, dnp, ctx); + } if (!busyerror) { nfs_node_clear_busy(dnp); } @@ -4320,11 +4404,11 @@ again: } goto again_relock; } - +#if CONFIG_NFS4 if ((nmp->nm_vers >= NFS_VER4) && (np->n_openflags & N_DELEG_MASK)) { nfs4_delegation_return(np, 0, vfs_context_thread(ctx), vfs_context_ucred(ctx)); } - +#endif /* * Purge the name cache so that the chance of a lookup for * the name succeeding while the remove is in progress is @@ -4440,7 +4524,7 @@ nfs3_remove_rpc( kauth_cred_t cred) { int error = 0, lockerror = ENOENT, status, wccpostattr = 0; - struct timespec premtime = { 0, 0 }; + struct timespec premtime = { .tv_sec = 0, .tv_nsec = 0 }; struct nfsmount *nmp; int nfsvers; u_int64_t xid; @@ -4581,10 +4665,12 @@ nfs_vnop_rename( /* sillyrename succeeded.*/ tvp = NULL; } - } else if (tvp && (nmp->nm_vers >= NFS_VER4) && (tnp->n_openflags & N_DELEG_MASK)) { + } +#if CONFIG_NFS4 + else if (tvp && (nmp->nm_vers >= NFS_VER4) && (tnp->n_openflags & N_DELEG_MASK)) { nfs4_delegation_return(tnp, 0, vfs_context_thread(ctx), vfs_context_ucred(ctx)); } - +#endif error = nmp->nm_funcs->nf_rename_rpc(fdnp, fcnp->cn_nameptr, fcnp->cn_namelen, tdnp, tcnp->cn_nameptr, tcnp->cn_namelen, ctx); @@ -4685,7 +4771,7 @@ nfs3_rename_rpc( vfs_context_t ctx) { int error = 0, lockerror = ENOENT, status, fwccpostattr = 0, twccpostattr = 0; - struct timespec fpremtime = { 0, 0 }, tpremtime = { 0, 0 }; + struct timespec fpremtime = { .tv_sec = 0, .tv_nsec = 0 }, tpremtime = { .tv_sec = 0, .tv_nsec = 0 }; struct nfsmount *nmp; int nfsvers; u_int64_t xid, txid; @@ -4770,7 +4856,7 @@ nfs3_vnop_link( vnode_t tdvp = ap->a_tdvp; struct componentname *cnp = ap->a_cnp; int error = 0, lockerror = ENOENT, status, wccpostattr = 0, attrflag = 0; - struct timespec premtime = { 0, 0 }; + struct timespec premtime = { .tv_sec = 0, .tv_nsec = 0 }; struct nfsmount *nmp; nfsnode_t np = VTONFS(vp); nfsnode_t tdnp = VTONFS(tdvp); @@ -4880,7 +4966,7 @@ nfs3_vnop_symlink( struct nfs_vattr nvattr; fhandle_t fh; int slen, error = 0, lockerror = ENOENT, busyerror = ENOENT, status, wccpostattr = 0; - struct timespec premtime = { 0, 0 }; + struct timespec premtime = { .tv_sec = 0, .tv_nsec = 0 }; vnode_t newvp = NULL; int nfsvers, gotuid, gotgid; u_int64_t xid = 0, dxid; @@ -4890,12 +4976,15 @@ nfs3_vnop_symlink( struct nfsm_chain nmreq, nmrep; struct nfsreq rq, *req = &rq; struct nfs_dulookup dul; + int namedattrs; + int dul_in_progress = 0; nmp = VTONMP(dvp); if (nfs_mount_gone(nmp)) { return ENXIO; } nfsvers = nmp->nm_vers; + namedattrs = (nmp->nm_fsattr.nfsa_flags & NFS_FSFLAG_NAMED_ATTR); slen = strlen(ap->a_target); if ((nfsvers == NFS_VER2) && @@ -4915,7 +5004,9 @@ nfs3_vnop_symlink( gotgid = VATTR_IS_ACTIVE(vap, va_gid); error = busyerror = nfs_node_set_busy(dnp, vfs_context_thread(ctx)); - nfs_dulookup_init(&dul, dnp, cnp->cn_nameptr, cnp->cn_namelen, ctx); + if (!namedattrs) { + nfs_dulookup_init(&dul, dnp, cnp->cn_nameptr, cnp->cn_namelen, ctx); + } nfsm_chain_null(&nmreq); nfsm_chain_null(&nmrep); @@ -4926,7 +5017,7 @@ nfs3_vnop_symlink( nfsm_chain_add_fh(error, &nmreq, nfsvers, dnp->n_fhp, dnp->n_fhsize); nfsm_chain_add_name(error, &nmreq, cnp->cn_nameptr, cnp->cn_namelen, nmp); if (nfsvers == NFS_VER3) { - nfsm_chain_add_v3sattr(error, &nmreq, vap); + nfsm_chain_add_v3sattr(nmp, error, &nmreq, vap); } nfsm_chain_add_name(error, &nmreq, ap->a_target, slen, nmp); if (nfsvers == NFS_VER2) { @@ -4938,7 +5029,10 @@ nfs3_vnop_symlink( error = nfs_request_async(dnp, NULL, &nmreq, NFSPROC_SYMLINK, vfs_context_thread(ctx), vfs_context_ucred(ctx), NULL, 0, NULL, &req); if (!error) { - nfs_dulookup_start(&dul, dnp, ctx); + if (!namedattrs) { + nfs_dulookup_start(&dul, dnp, ctx); + dul_in_progress = 1; + } error = nfs_request_async_finish(req, &nmrep, &xid, &status); } @@ -4952,7 +5046,7 @@ nfs3_vnop_symlink( cache_purge_negatives(dvp); } if (nfsvers == NFS_VER3) { - error = nfsm_chain_get_fh_attr(&nmrep, dnp, ctx, nfsvers, &xid, &fh, &nvattr); + error = nfsm_chain_get_fh_attr(nmp, &nmrep, dnp, ctx, nfsvers, &xid, &fh, &nvattr); } else { fh.fh_len = 0; } @@ -4985,7 +5079,9 @@ nfsmout: newvp = NFSTOV(np); } - nfs_dulookup_finish(&dul, dnp, ctx); + if (dul_in_progress) { + nfs_dulookup_finish(&dul, dnp, ctx); + } /* * Kludge: Map EEXIST => 0 assuming that you have a reply to a retry @@ -5052,19 +5148,23 @@ nfs3_vnop_mkdir( nfsnode_t dnp = VTONFS(dvp); vnode_t newvp = NULL; int error = 0, lockerror = ENOENT, busyerror = ENOENT, status, wccpostattr = 0; - struct timespec premtime = { 0, 0 }; + struct timespec premtime = { .tv_sec = 0, .tv_nsec = 0 }; int nfsvers, gotuid, gotgid; u_int64_t xid = 0, dxid; fhandle_t fh; struct nfsm_chain nmreq, nmrep; struct nfsreq rq, *req = &rq; struct nfs_dulookup dul; + int namedattrs; + int dul_in_progress = 0; nmp = VTONMP(dvp); if (nfs_mount_gone(nmp)) { return ENXIO; } nfsvers = nmp->nm_vers; + namedattrs = (nmp->nm_fsattr.nfsa_flags & NFS_FSFLAG_NAMED_ATTR); + if ((nfsvers == NFS_VER2) && (cnp->cn_namelen > NFS_MAXNAMLEN)) { return ENAMETOOLONG; } @@ -5081,7 +5181,9 @@ nfs3_vnop_mkdir( gotgid = VATTR_IS_ACTIVE(vap, va_gid); error = busyerror = nfs_node_set_busy(dnp, vfs_context_thread(ctx)); - nfs_dulookup_init(&dul, dnp, cnp->cn_nameptr, cnp->cn_namelen, ctx); + if (!namedattrs) { + nfs_dulookup_init(&dul, dnp, cnp->cn_nameptr, cnp->cn_namelen, ctx); + } nfsm_chain_null(&nmreq); nfsm_chain_null(&nmrep); @@ -5092,7 +5194,7 @@ nfs3_vnop_mkdir( nfsm_chain_add_fh(error, &nmreq, nfsvers, dnp->n_fhp, dnp->n_fhsize); nfsm_chain_add_name(error, &nmreq, cnp->cn_nameptr, cnp->cn_namelen, nmp); if (nfsvers == NFS_VER3) { - nfsm_chain_add_v3sattr(error, &nmreq, vap); + nfsm_chain_add_v3sattr(nmp, error, &nmreq, vap); } else { nfsm_chain_add_v2sattr(error, &nmreq, vap, -1); } @@ -5102,7 +5204,10 @@ nfs3_vnop_mkdir( error = nfs_request_async(dnp, NULL, &nmreq, NFSPROC_MKDIR, vfs_context_thread(ctx), vfs_context_ucred(ctx), NULL, 0, NULL, &req); if (!error) { - nfs_dulookup_start(&dul, dnp, ctx); + if (!namedattrs) { + nfs_dulookup_start(&dul, dnp, ctx); + dul_in_progress = 1; + } error = nfs_request_async_finish(req, &nmrep, &xid, &status); } @@ -5115,7 +5220,7 @@ nfs3_vnop_mkdir( dnp->n_flag &= ~NNEGNCENTRIES; cache_purge_negatives(dvp); } - error = nfsm_chain_get_fh_attr(&nmrep, dnp, ctx, nfsvers, &xid, &fh, &nvattr); + error = nfsm_chain_get_fh_attr(nmp, &nmrep, dnp, ctx, nfsvers, &xid, &fh, &nvattr); } if (nfsvers == NFS_VER3) { nfsm_chain_get_wcc_data(error, &nmrep, dnp, &premtime, &wccpostattr, &dxid); @@ -5145,7 +5250,9 @@ nfsmout: newvp = NFSTOV(np); } - nfs_dulookup_finish(&dul, dnp, ctx); + if (dul_in_progress) { + nfs_dulookup_finish(&dul, dnp, ctx); + } /* * Kludge: Map EEXIST => 0 assuming that you have a reply to a retry @@ -5206,7 +5313,7 @@ nfs3_vnop_rmdir( vnode_t dvp = ap->a_dvp; struct componentname *cnp = ap->a_cnp; int error = 0, lockerror = ENOENT, status, wccpostattr = 0; - struct timespec premtime = { 0, 0 }; + struct timespec premtime = { .tv_sec = 0, .tv_nsec = 0 }; struct nfsmount *nmp; nfsnode_t np = VTONFS(vp); nfsnode_t dnp = VTONFS(dvp); @@ -5215,12 +5322,16 @@ nfs3_vnop_rmdir( struct nfsm_chain nmreq, nmrep; struct nfsreq rq, *req = &rq; struct nfs_dulookup dul; + int namedattrs; + int dul_in_progress = 0; nmp = VTONMP(vp); if (nfs_mount_gone(nmp)) { return ENXIO; } nfsvers = nmp->nm_vers; + namedattrs = (nmp->nm_fsattr.nfsa_flags & NFS_FSFLAG_NAMED_ATTR); + if ((nfsvers == NFS_VER2) && (cnp->cn_namelen > NFS_MAXNAMLEN)) { return ENAMETOOLONG; } @@ -5229,7 +5340,9 @@ nfs3_vnop_rmdir( return error; } - nfs_dulookup_init(&dul, dnp, cnp->cn_nameptr, cnp->cn_namelen, ctx); + if (!namedattrs) { + nfs_dulookup_init(&dul, dnp, cnp->cn_nameptr, cnp->cn_namelen, ctx); + } nfsm_chain_null(&nmreq); nfsm_chain_null(&nmrep); @@ -5244,7 +5357,10 @@ nfs3_vnop_rmdir( error = nfs_request_async(dnp, NULL, &nmreq, NFSPROC_RMDIR, vfs_context_thread(ctx), vfs_context_ucred(ctx), NULL, 0, NULL, &req); if (!error) { - nfs_dulookup_start(&dul, dnp, ctx); + if (!namedattrs) { + nfs_dulookup_start(&dul, dnp, ctx); + dul_in_progress = 1; + } error = nfs_request_async_finish(req, &nmrep, &xid, &status); } @@ -5272,7 +5388,9 @@ nfsmout: /* nfs_getattr() will check changed and purge caches */ nfs_getattr(dnp, NULL, ctx, wccpostattr ? NGA_CACHED : NGA_UNCACHED); } - nfs_dulookup_finish(&dul, dnp, ctx); + if (dul_in_progress) { + nfs_dulookup_finish(&dul, dnp, ctx); + } nfs_node_clear_busy2(dnp, np); /* @@ -5366,12 +5484,12 @@ nfs_vnop_readdir( if (uio_resid(uio) == 0) { return 0; } - +#if CONFIG_NFS4 if ((nfsvers >= NFS_VER4) && (dnp->n_vattr.nva_flags & NFS_FFLAG_TRIGGER)) { /* trigger directories should never be read, return nothing */ return 0; } - +#endif thd = vfs_context_thread(ctx); numdirent = done = 0; nextcookie = uio_offset(uio); @@ -6250,7 +6368,7 @@ nextbuffer: nfsmout_if(error); if (attrflag) { /* grab attributes */ - error = nfs_parsefattr(&nmrep, NFS_VER3, nvattrp); + error = nfs_parsefattr(nmp, &nmrep, NFS_VER3, nvattrp); nfsmout_if(error); dp->d_type = IFTODT(VTTOIF(nvattrp->nva_type)); /* fileid is already in d_fileno, so stash xid in attrs */ @@ -6521,13 +6639,13 @@ nfs3_lookup_rpc_async_finish( /* get the attributes */ if (nfsvers == NFS_VER3) { - nfsm_chain_postop_attr_get(error, &nmrep, attrflag, nvap); + nfsm_chain_postop_attr_get(nmp, error, &nmrep, attrflag, nvap); nfsm_chain_postop_attr_update(error, &nmrep, dnp, &xid); if (!error && !attrflag) { error = nfs3_getattr_rpc(NULL, NFSTOMP(dnp), fhp->fh_data, fhp->fh_len, 0, ctx, nvap, xidp); } } else { - error = nfs_parsefattr(&nmrep, nfsvers, nvap); + error = nfs_parsefattr(nmp, &nmrep, nfsvers, nvap); } nfsmout: if (!lockerror) { @@ -6771,7 +6889,7 @@ nfs3_commit_rpc( { struct nfsmount *nmp; int error = 0, lockerror, status, wccpostattr = 0, nfsvers; - struct timespec premtime = { 0, 0 }; + struct timespec premtime = { .tv_sec = 0, .tv_nsec = 0 }; u_int64_t xid, newwverf; uint32_t count32; struct nfsm_chain nmreq, nmrep; @@ -7039,7 +7157,9 @@ nfs_vnop_pathconf( } else { nfsap = &nmp->nm_fsattr; } - } else if (!(nmp->nm_fsattr.nfsa_flags & NFS_FSFLAG_HOMOGENEOUS)) { + } +#if CONFIG_NFS4 + else if (!(nmp->nm_fsattr.nfsa_flags & NFS_FSFLAG_HOMOGENEOUS)) { /* no pathconf info cached */ lck_mtx_unlock(&nmp->nm_lock); NFS_CLEAR_ATTRIBUTES(nfsa.nfsa_bitmap); @@ -7053,16 +7173,19 @@ nfs_vnop_pathconf( } lck_mtx_lock(&nmp->nm_lock); nfsap = &nfsa; - } else { + } +#endif + else { nfsap = &nmp->nm_fsattr; } - switch (ap->a_name) { case _PC_LINK_MAX: if (NFS_BITMAP_ISSET(nfsap->nfsa_bitmap, NFS_FATTR_MAXLINK)) { *ap->a_retval = nfsap->nfsa_maxlink; +#if CONFIG_NFS4 } else if ((nmp->nm_vers == NFS_VER4) && NFS_BITMAP_ISSET(np->n_vattr.nva_bitmap, NFS_FATTR_MAXLINK)) { *ap->a_retval = np->n_vattr.nva_maxlink; +#endif } else { error = EINVAL; } @@ -7390,14 +7513,15 @@ nfs_vnop_ioctl( vfs_context_t ctx = ap->a_context; vnode_t vp = ap->a_vp; struct nfsmount *mp = VTONMP(vp); + int error = ENOTTY; +#if CONFIG_NFS_GSS struct user_nfs_gss_principal gprinc = {}; uint32_t len; - int error = ENOTTY; +#endif if (mp == NULL) { return ENXIO; } - switch (ap->a_command) { case F_FULLFSYNC: if (vnode_vfsisrdonly(vp)) { @@ -7405,6 +7529,7 @@ nfs_vnop_ioctl( } error = nfs_flush(VTONFS(vp), MNT_WAIT, vfs_context_thread(ctx), 0); break; +#if CONFIG_NFS_GSS case NFS_IOC_DESTROY_CRED: if (!auth_is_kerberized(mp->nm_auth)) { return ENOTSUP; @@ -7499,6 +7624,7 @@ nfs_vnop_ioctl( if (gprinc.principal) { FREE(gprinc.principal, M_TEMP); } +#endif /* CONFIG_NFS_GSS */ } return error; @@ -7561,7 +7687,10 @@ nfs_vnop_pagein( #define MAXPAGINGREQS 16 /* max outstanding RPCs for pagein/pageout */ struct nfsreq *req[MAXPAGINGREQS]; int nextsend, nextwait; - uint32_t stategenid = 0, restart = 0; +#if CONFIG_NFS4 + uint32_t stategenid = 0; +#endif + uint32_t restart = 0; kern_return_t kret; FSDBG(322, np, f_offset, size, flags); @@ -7611,9 +7740,11 @@ nfs_vnop_pagein( ioaddr += pl_offset; tryagain: +#if CONFIG_NFS4 if (nmp->nm_vers >= NFS_VER4) { stategenid = nmp->nm_stategenid; } +#endif txsize = rxsize = size; txoffset = f_offset; rxaddr = ioaddr; @@ -7649,6 +7780,7 @@ tryagain: error = nmp->nm_funcs->nf_read_rpc_async_finish(np, req[nextwait], uio, &retsize, NULL); req[nextwait] = NULL; nextwait = (nextwait + 1) % MAXPAGINGREQS; +#if CONFIG_NFS4 if ((nmp->nm_vers >= NFS_VER4) && nfs_mount_state_error_should_restart(error)) { lck_mtx_lock(&nmp->nm_lock); if ((error != NFSERR_GRACE) && (stategenid == nmp->nm_stategenid)) { @@ -7659,6 +7791,7 @@ tryagain: restart++; goto cancel; } +#endif if (error) { FSDBG(322, uio_offset(uio), uio_resid(uio), error, -1); break; @@ -7681,7 +7814,9 @@ tryagain: restart = 0; if (error) { +#if CONFIG_NFS4 cancel: +#endif /* cancel any outstanding requests */ while (req[nextwait]) { nfs_request_async_cancel(req[nextwait]); @@ -7885,7 +8020,10 @@ nfs_vnop_pageout( struct nfsreq *req[MAXPAGINGREQS]; int nextsend, nextwait, wverfset, commit; uint64_t wverf, wverf2; - uint32_t stategenid = 0, vrestart = 0, restart = 0, vrestarts = 0, restarts = 0; +#if CONFIG_NFS4 + uint32_t stategenid = 0; +#endif + uint32_t vrestart = 0, restart = 0, vrestarts = 0, restarts = 0; kern_return_t kret; FSDBG(323, f_offset, size, pl, pl_offset); @@ -8081,9 +8219,11 @@ nfs_vnop_pageout( &uio_buf, sizeof(uio_buf)); tryagain: +#if CONFIG_NFS4 if (nmp->nm_vers >= NFS_VER4) { stategenid = nmp->nm_stategenid; } +#endif wverf = wverf2 = wverfset = 0; txsize = rxsize = xsize; txoffset = rxoffset = f_offset; @@ -8132,6 +8272,7 @@ tryagain: nfs_node_lock_force(np); np->n_numoutput--; nfs_node_unlock(np); +#if CONFIG_NFS4 if ((nmp->nm_vers >= NFS_VER4) && nfs_mount_state_error_should_restart(error)) { lck_mtx_lock(&nmp->nm_lock); if ((error != NFSERR_GRACE) && (stategenid == nmp->nm_stategenid)) { @@ -8142,6 +8283,7 @@ tryagain: restart = 1; goto cancel; } +#endif if (error) { FSDBG(323, rxoffset, rxsize, error, -1); break; @@ -8169,6 +8311,7 @@ tryagain: uio_addiov(auio, CAST_USER_ADDR_T(rxaddr), remsize); iomode = NFS_WRITE_UNSTABLE; error = nfs_write_rpc2(np, auio, thd, cred, &iomode, &wverf2); +#if CONFIG_NFS4 if ((nmp->nm_vers >= NFS_VER4) && nfs_mount_state_error_should_restart(error)) { NP(np, "nfs_vnop_pageout: restart: error %d", error); lck_mtx_lock(&nmp->nm_lock); @@ -8180,6 +8323,7 @@ tryagain: restart = 1; goto cancel; } +#endif if (error) { FSDBG(323, rxoffset, rxsize, error, -1); break; @@ -8394,7 +8538,7 @@ nfs_vnop_monitor( /* This vnode is no longer being monitored, make sure we're not tracking it. */ /* Wait for any in-progress getattr to complete first. */ while (np->n_mflag & NMMONSCANINPROG) { - struct timespec ts = { 1, 0 }; + struct timespec ts = { .tv_sec = 1, .tv_nsec = 0 }; np->n_mflag |= NMMONSCANWANT; msleep(&np->n_mflag, &nmp->nm_lock, PZERO - 1, "nfswaitmonscan", &ts); } @@ -8443,3 +8587,4 @@ nfs_vnode_notify(nfsnode_t np, uint32_t events) } vnode_notify(NFSTOV(np), events, vap); } + diff --git a/bsd/nfs/nfsm_subs.h b/bsd/nfs/nfsm_subs.h index 6f348ac36..b16669fc5 100644 --- a/bsd/nfs/nfsm_subs.h +++ b/bsd/nfs/nfsm_subs.h @@ -82,7 +82,7 @@ int nfsm_chain_add_opaque_nopad_f(struct nfsm_chain *, const u_char *, uint32_t) int nfsm_chain_add_uio(struct nfsm_chain *, uio_t, uint32_t); int nfsm_chain_add_fattr4_f(struct nfsm_chain *, struct vnode_attr *, struct nfsmount *); int nfsm_chain_add_v2sattr_f(struct nfsm_chain *, struct vnode_attr *, uint32_t); -int nfsm_chain_add_v3sattr_f(struct nfsm_chain *, struct vnode_attr *); +int nfsm_chain_add_v3sattr_f(struct nfsmount *, struct nfsm_chain *, struct vnode_attr *); int nfsm_chain_add_string_nfc(struct nfsm_chain *, const uint8_t *, uint32_t); int nfsm_chain_advance(struct nfsm_chain *, uint32_t); @@ -91,7 +91,7 @@ int nfsm_chain_reverse(struct nfsm_chain *, uint32_t); int nfsm_chain_get_opaque_pointer_f(struct nfsm_chain *, uint32_t, u_char **); int nfsm_chain_get_opaque_f(struct nfsm_chain *, uint32_t, u_char *); int nfsm_chain_get_uio(struct nfsm_chain *, uint32_t, uio_t); -int nfsm_chain_get_fh_attr(struct nfsm_chain *, nfsnode_t, +int nfsm_chain_get_fh_attr(struct nfsmount *, struct nfsm_chain *, nfsnode_t, vfs_context_t, int, uint64_t *, fhandle_t *, struct nfs_vattr *); int nfsm_chain_get_wcc_data_f(struct nfsm_chain *, nfsnode_t, struct timespec *, int *, u_int64_t *); int nfsm_chain_get_secinfo(struct nfsm_chain *, uint32_t *, int *); @@ -415,10 +415,10 @@ int nfsm_chain_trim_data(struct nfsm_chain *, int, int *); } while (0) /* Add an NFSv3 "sattr" structure to an mbuf chain */ -#define nfsm_chain_add_v3sattr(E, NMC, VAP) \ +#define nfsm_chain_add_v3sattr(NMP, E, NMC, VAP) \ do { \ if (E) break; \ - (E) = nfsm_chain_add_v3sattr_f((NMC), (VAP)); \ + (E) = nfsm_chain_add_v3sattr_f((NMP), (NMC), (VAP)); \ } while (0) /* Add an NFSv4 "fattr" structure to an mbuf chain */ @@ -664,13 +664,13 @@ int nfsm_chain_trim_data(struct nfsm_chain *, int, int *); } while (0) /* get postop attributes from an mbuf chain */ -#define nfsm_chain_postop_attr_get(E, NMC, F, VAP) \ +#define nfsm_chain_postop_attr_get(NMP, E, NMC, F, VAP) \ do { \ (F) = 0; \ if ((E) || !(NMC)->nmc_mhead) break; \ nfsm_chain_get_32((E), (NMC), (F)); \ if ((E) || !(F)) break; \ - if (((E) = nfs_parsefattr((NMC), NFS_VER3, (VAP)))) \ + if (((E) = nfs_parsefattr((NMP), (NMC), NFS_VER3, (VAP)))) \ (F) = 0; \ } while (0) @@ -679,7 +679,7 @@ int nfsm_chain_trim_data(struct nfsm_chain *, int, int *); #define nfsm_chain_postop_attr_update_flag(E, NMC, NP, F, X) \ do { \ struct nfs_vattr ttvattr; \ - nfsm_chain_postop_attr_get((E), (NMC), (F), &ttvattr); \ + nfsm_chain_postop_attr_get(NFSTONMP(NP), (E), (NMC), (F), &ttvattr); \ if ((E) || !(F)) break; \ if (((E) = nfs_loadattrcache((NP), &ttvattr, (X), 1))) { \ (F) = 0; \ @@ -703,15 +703,28 @@ int nfsm_chain_trim_data(struct nfsm_chain *, int, int *); (E) = nfsm_chain_get_wcc_data_f((NMC), (NP), (PREMTIME), (NEWPOSTATTR), (X)); \ } while (0) +#if CONFIG_NFS4 +/* separate v4 variant for loading attrs that only runs when NFSv4 is set */ +#define __nfsm_chain_loadattr_v4(E, NMC, VERS, X, VATTR) \ + do { \ + (E) = nfs4_parsefattr((NMC), NULL, (VATTR), NULL, NULL, NULL); \ + } while (0) +#else +#define __nfsm_chain_loadattr_v4(E, NMC, VERS, X, VATTR) \ + do { \ + break; \ + } while (0) +#endif + /* update a node's attribute cache with attributes from an mbuf chain */ #define nfsm_chain_loadattr(E, NMC, NP, VERS, X) \ do { \ struct nfs_vattr ttvattr; \ if (E) break; \ if ((VERS) == NFS_VER4) { \ - (E) = nfs4_parsefattr((NMC), NULL, &ttvattr, NULL, NULL, NULL); \ + __nfsm_chain_loadattr_v4((E), (NMC), (VERS), (X), &ttvattr); \ } else { \ - (E) = nfs_parsefattr((NMC), (VERS), &ttvattr); \ + (E) = nfs_parsefattr(NFSTONMP(NP), (NMC), (VERS), &ttvattr); \ } \ if (!(E) && (NP)) \ (E) = nfs_loadattrcache((NP), &ttvattr, (X), 0); \ diff --git a/bsd/nfs/nfsmount.h b/bsd/nfs/nfsmount.h index c9dc924de..0743b8383 100644 --- a/bsd/nfs/nfsmount.h +++ b/bsd/nfs/nfsmount.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2015 Apple Inc. All rights reserved. + * Copyright (c) 2000-2018 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -177,7 +177,9 @@ struct nfs_socket { int nso_error; /* saved error/status */ struct nfs_rpc_record_state nso_rrs; /* RPC record parsing state (TCP) */ }; + TAILQ_HEAD(nfssocketlist, nfs_socket); + /* nso_flags */ #define NSO_UPCALL 0x0001 /* socket upcall in progress */ #define NSO_DEAD 0x0002 /* socket is dead */ @@ -337,6 +339,8 @@ struct nfsmount { uint8_t nm_sotype; /* (preferred) type of socket */ in_port_t nm_nfsport; /* NFS protocol port */ in_port_t nm_mountport; /* MOUNT protocol port (v2/v3) */ + char *nm_nfs_localport; /* Unix domain address (port) for nfs */ + char *nm_mount_localport; /* Unix domain address (port) for mountd */ struct nfs_socket_search *nm_nss; /* current socket search structure */ struct nfs_socket *nm_nso; /* current socket */ struct sockaddr *nm_saddr; /* Address of server */ diff --git a/bsd/nfs/nfsnode.h b/bsd/nfs/nfsnode.h index 81341cc91..9562d6144 100644 --- a/bsd/nfs/nfsnode.h +++ b/bsd/nfs/nfsnode.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2011 Apple Inc. All rights reserved. + * Copyright (c) 2000-2019 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -98,7 +98,7 @@ struct nfsbuf { TAILQ_ENTRY(nfsbuf) nb_free; /* free list position if not active. */ volatile uint32_t nb_flags; /* NB_* flags. */ volatile uint32_t nb_lflags; /* NBL_* flags. */ - volatile uint32_t nb_refs; /* outstanding references. */ + os_refcnt_t nb_refs; /* outstanding references. */ uint32_t nb_bufsize; /* buffer size */ daddr64_t nb_lblkno; /* logical block number. */ uint64_t nb_verf; /* V3 write verifier */ @@ -300,7 +300,8 @@ struct nfsdmap { #define NFSTIME_CHANGE 2 /* time file changed */ #define NFSTIME_CREATE 3 /* time file created */ #define NFSTIME_BACKUP 4 /* time of last backup */ -#define NFSTIME_COUNT 5 +#define NFSTIME_ADDED 5 /* time added (FPnfs only) */ +#define NFSTIME_COUNT 6 #define NFS_COMPARE_MTIME(TVP, NVAP, CMP) \ (((TVP)->tv_sec == (NVAP)->nva_timesec[NFSTIME_MODIFY]) ? \ @@ -332,6 +333,11 @@ struct nfs_vattr { int64_t nva_timesec[NFSTIME_COUNT]; int32_t nva_timensec[NFSTIME_COUNT]; uint32_t nva_bitmap[NFS_ATTR_BITMAP_LEN]; /* attributes that are valid */ + + /* FPnfs only. */ + uint32_t nva_bsd_flags; /* BSD flags */ + uint64_t nva_parentid; /* parent file id */ + uint64_t nva_allocsize; /* size allocated on disk */ }; /* nva_flags */ @@ -341,6 +347,10 @@ struct nfs_vattr { #define NFS_FFLAG_TRIGGER 0x0008 /* node is a trigger/mirror mount point */ #define NFS_FFLAG_TRIGGER_REFERRAL 0x0010 /* trigger is a referral */ #define NFS_FFLAG_IS_ATTR 0x8000 /* file is a named attribute file/directory */ +/* FPnfs only */ +#define NFS_FFLAG_FPNFS_BSD_FLAGS 0x01000000 +#define NFS_FFLAG_FPNFS_PARENTID 0x02000000 +#define NFS_FFLAG_FPNFS_ADDEDTIME 0x04000000 /* flags for nfs_getattr() */ #define NGA_CACHED 0x0001 /* use cached attributes (if still valid) */ @@ -692,6 +702,7 @@ struct nfsnode { #define NISMAPPED 0x10000 /* node is mmapped */ #define NREFRESH 0x20000 /* node's fh needs to be refreshed */ #define NREFRESHWANT 0x40000 /* Waiting for fh to be refreshed */ +#define NDISARMTRIGGER 0x80000 /* Ignore node's mirror mount trigger */ /* * Flags for n_hflag @@ -793,11 +804,13 @@ extern lck_mtx_t *nfsiod_mutex; typedef int vnop_t(void *); extern vnop_t **fifo_nfsv2nodeop_p; extern vnop_t **nfsv2_vnodeop_p; +extern vnop_t **fpnfs_vnodeop_p; extern vnop_t **spec_nfsv2nodeop_p; +#if CONFIG_NFS4 extern vnop_t **fifo_nfsv4nodeop_p; extern vnop_t **nfsv4_vnodeop_p; extern vnop_t **spec_nfsv4nodeop_p; - +#endif /* * Prototypes for NFS vnode operations */ @@ -875,7 +888,7 @@ int nfs_flushcommits(nfsnode_t, int); int nfs_flush(nfsnode_t, int, thread_t, int); void nfs_buf_delwri_push(int); void nfs_buf_delwri_service(void); -void nfs_buf_delwri_thread(void *, wait_result_t);; +void nfs_buf_delwri_thread(void *, wait_result_t); int nfsiod_start(void); void nfsiod_terminate(struct nfsiod *); diff --git a/bsd/nfs/nfsproto.h b/bsd/nfs/nfsproto.h index b45f35145..1ade820c3 100644 --- a/bsd/nfs/nfsproto.h +++ b/bsd/nfs/nfsproto.h @@ -89,7 +89,13 @@ #define NFS_V2MAXDATA 8192 #define NFS_MAXDGRAMDATA 16384 #define NFS_PREFDGRAMDATA 8192 -#define NFS_MAXDATA (64*1024) // XXX not ready for >64K + +#ifdef XNU_TARGET_OS_IOS +#define NFS_MAXDATA (32 * PAGE_SIZE) /* Same as NFS_MAXBSIZE from nfsnode.h */ +#else /* TARGET_OS_IOS */ +#define NFS_MAXDATA (64*1024) +#endif /* TARGET_OS_IOS */ + #define NFSRV_MAXDATA (64*1024) // XXX not ready for >64K #define NFS_MAXPATHLEN 1024 #define NFS_MAXNAMLEN 255 @@ -348,9 +354,9 @@ typedef enum { NFNON=0, NFREG=1, NFDIR=2, NFBLK=3, NFCHR=4, NFLNK=5, * NFS attribute management stuff */ #define NFS_ATTR_BITMAP_LEN 2 -#define NFS_BITMAP_SET(B, I) (((uint32_t *)(B))[(I)/32] |= 1<<((I)%32)) -#define NFS_BITMAP_CLR(B, I) (((uint32_t *)(B))[(I)/32] &= ~(1<<((I)%32))) -#define NFS_BITMAP_ISSET(B, I) (((uint32_t *)(B))[(I)/32] & (1<<((I)%32))) +#define NFS_BITMAP_SET(B, I) (((uint32_t *)(B))[(I)/32] |= 1U<<((I)%32)) +#define NFS_BITMAP_CLR(B, I) (((uint32_t *)(B))[(I)/32] &= ~(1U<<((I)%32))) +#define NFS_BITMAP_ISSET(B, I) (((uint32_t *)(B))[(I)/32] & (1U<<((I)%32))) #define NFS_BITMAP_ZERO(B, L) \ do { \ int __i; \ diff --git a/bsd/nfs/rpcv2.h b/bsd/nfs/rpcv2.h index 2a5bc9c6f..f23f98572 100644 --- a/bsd/nfs/rpcv2.h +++ b/bsd/nfs/rpcv2.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2010 Apple Inc. All rights reserved. + * Copyright (c) 2000-2018 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -157,5 +157,13 @@ #define RQUOTA_STAT_NOQUOTA 2 #define RQUOTA_STAT_EPERM 3 +/* Local transports for rpcbind */ +#define RPCB_TICOTSORD_PATH "/var/run/rpcb.ticotsord" +#define RPCB_TICLTS_PATH "/var/run/rpcb.ticlst" + +/* Local transport for nfs */ +#define NFS_TICOTSORD_PATH "/var/ran/nfs.ticotsord" +#define NFS_TICLTS_PATH "/var/run/nfs.ticlts" + #endif /* __APPLE_API_PRIVATE */ #endif /* _NFS_RPCV2_H_ */ diff --git a/bsd/nfs/xdr_subs.h b/bsd/nfs/xdr_subs.h index 24295f487..36e4dc929 100644 --- a/bsd/nfs/xdr_subs.h +++ b/bsd/nfs/xdr_subs.h @@ -100,7 +100,10 @@ * * generalized functionality for managing the building/dissecting of XDR data */ -typedef enum xdrbuf_type { XDRBUF_BUFFER=1 } xdrbuf_type; +typedef enum xdrbuf_type { + XDRBUF_NONE = 0, + XDRBUF_BUFFER = 1, +} xdrbuf_type; struct xdrbuf { union { @@ -192,6 +195,8 @@ xb_cleanup(struct xdrbuf *xbp) xb_free(xbp->xb_u.xb_buffer.xbb_base); } break; + default: + break; } xbp->xb_flags &= ~XB_CLEANUP; } @@ -207,6 +212,8 @@ xb_set_cur_buf_len(struct xdrbuf *xbp) case XDRBUF_BUFFER: xbp->xb_u.xb_buffer.xbb_len = xbp->xb_ptr - xbp->xb_u.xb_buffer.xbb_base; break; + default: + break; } } @@ -244,6 +251,8 @@ xb_offset(struct xdrbuf *xbp) case XDRBUF_BUFFER: offset = xbp->xb_ptr - xbp->xb_u.xb_buffer.xbb_base; break; + default: + break; } return offset; @@ -260,6 +269,8 @@ xb_seek(struct xdrbuf *xbp, uint32_t offset) xbp->xb_ptr = xbp->xb_u.xb_buffer.xbb_base + offset; xbp->xb_left = xbp->xb_u.xb_buffer.xbb_len - offset; break; + default: + break; } return 0; @@ -323,6 +334,8 @@ xb_grow(struct xdrbuf *xbp) xbp->xb_ptr = newbuf + oldsize; xbp->xb_left = xbp->xb_growsize; break; + default: + break; } return 0; diff --git a/bsd/pthread/Makefile b/bsd/pthread/Makefile index ef0643f8a..4016262f6 100644 --- a/bsd/pthread/Makefile +++ b/bsd/pthread/Makefile @@ -31,7 +31,7 @@ INSTALL_MI_DIR = pthread # /usr/local/include without PRIVATE stuff # /System/Library/Frameworks/System.framework/PrivateHeaders -INCDIR = /usr/local/include +INCDIR = $(SDKHEADERSROOT)/usr/local/include INSTALL_MI_LIST = ${DATAFILES} INSTALL_MI_LCL_LIST = ${PRIVATE_DATAFILES} diff --git a/bsd/pthread/bsdthread_private.h b/bsd/pthread/bsdthread_private.h index add1853ba..fa5f0fdc1 100644 --- a/bsd/pthread/bsdthread_private.h +++ b/bsd/pthread/bsdthread_private.h @@ -56,6 +56,8 @@ #define BSDTHREAD_CTL_QOS_DISPATCH_ASYNCHRONOUS_OVERRIDE_RESET 0x402 /* bsdthread_ctl(BSDTHREAD_CTL_QOS_MAX_PARALLELISM, priority, flags, 0) */ #define BSDTHREAD_CTL_QOS_MAX_PARALLELISM 0x800 +/* bsdthread_ctl(BSDTHREAD_CTL_WORKQ_ALLOW_KILL, enable, 0, 0) */ +#define BSDTHREAD_CTL_WORKQ_ALLOW_KILL 0x1000 #define _PTHREAD_QOS_PARALLELISM_COUNT_LOGICAL 0x1 #define _PTHREAD_QOS_PARALLELISM_REALTIME 0x2 diff --git a/bsd/pthread/priority_private.h b/bsd/pthread/priority_private.h index b73c0ad95..dbfff7e54 100644 --- a/bsd/pthread/priority_private.h +++ b/bsd/pthread/priority_private.h @@ -179,14 +179,21 @@ _pthread_default_priority(unsigned long flags) return _pthread_priority_make_from_thread_qos(THREAD_QOS_LEGACY, 0, flags); } +__attribute__((always_inline, const)) +static inline thread_qos_t +_pthread_priority_thread_qos_fast(pthread_priority_t pp) +{ + pp &= _PTHREAD_PRIORITY_QOS_CLASS_MASK; + pp >>= _PTHREAD_PRIORITY_QOS_CLASS_SHIFT; + return (thread_qos_t)__builtin_ffs((int)pp); +} + __attribute__((always_inline, const)) static inline thread_qos_t _pthread_priority_thread_qos(pthread_priority_t pp) { if (_pthread_priority_has_qos(pp)) { - pp &= _PTHREAD_PRIORITY_QOS_CLASS_MASK; - pp >>= _PTHREAD_PRIORITY_QOS_CLASS_SHIFT; - return (thread_qos_t)__builtin_ffs((int)pp); + return _pthread_priority_thread_qos_fast(pp); } return THREAD_QOS_UNSPECIFIED; } diff --git a/bsd/pthread/pthread_shims.c b/bsd/pthread/pthread_shims.c index 2f3aadbf3..86e618e7d 100644 --- a/bsd/pthread/pthread_shims.c +++ b/bsd/pthread/pthread_shims.c @@ -57,11 +57,11 @@ /* version number of the in-kernel shims given to pthread.kext */ #define PTHREAD_SHIMS_VERSION 1 -/* on arm, the callbacks function has two #ifdef arm ponters */ +/* on arm, the callbacks function has two #ifdef arm pointers */ #if defined(__arm__) #define PTHREAD_CALLBACK_MEMBER __unused_was_map_is_1gb #else -#define PTHREAD_CALLBACK_MEMBER __unused_was_ml_get_max_cpus +#define PTHREAD_CALLBACK_MEMBER kevent_workq_internal #endif /* compile time asserts to check the length of structures in pthread_shims.h */ @@ -255,7 +255,7 @@ static void psynch_wait_complete(uintptr_t kwq, struct turnstile **tstore) { assert(tstore); - turnstile_complete(kwq, tstore, NULL); + turnstile_complete(kwq, tstore, NULL, TURNSTILE_PTHREAD_MUTEX); } static void @@ -270,7 +270,7 @@ psynch_wait_update_owner(uintptr_t kwq, thread_t owner, turnstile_update_inheritor(ts, owner, (TURNSTILE_IMMEDIATE_UPDATE | TURNSTILE_INHERITOR_THREAD)); turnstile_update_inheritor_complete(ts, TURNSTILE_INTERLOCK_HELD); - turnstile_complete(kwq, tstore, NULL); + turnstile_complete(kwq, tstore, NULL, TURNSTILE_PTHREAD_MUTEX); } static void @@ -300,7 +300,7 @@ psynch_wait_wakeup(uintptr_t kwq, struct ksyn_waitq_element *kwe, uth->uu_thread, THREAD_AWAKENED); turnstile_update_inheritor_complete(ts, TURNSTILE_INTERLOCK_HELD); - turnstile_complete(kwq, tstore, NULL); + turnstile_complete(kwq, tstore, NULL, TURNSTILE_PTHREAD_MUTEX); } else { kr = thread_wakeup_thread((event_t)kwq, uth->uu_thread); } @@ -481,11 +481,8 @@ kdp_pthread_get_thread_kwq(thread_t thread) } void -thread_will_park_or_terminate(thread_t thread) +thread_will_park_or_terminate(__unused thread_t thread) { - if (thread_owned_workloops_count(thread)) { - (void)kevent_exit_on_workloop_ownership_leak(thread); - } } /* @@ -540,6 +537,8 @@ static const struct pthread_callbacks_s pthread_callbacks = { .thread_create = thread_create, .thread_resume = thread_resume, + .kevent_workq_internal = kevent_workq_internal, + .convert_thread_to_port = convert_thread_to_port, .proc_get_stack_addr_hint = proc_get_stack_addr_hint, diff --git a/bsd/pthread/pthread_workqueue.c b/bsd/pthread/pthread_workqueue.c index 0ad001488..c979f80eb 100644 --- a/bsd/pthread/pthread_workqueue.c +++ b/bsd/pthread/pthread_workqueue.c @@ -29,9 +29,6 @@ #include -// panic() should be marked noreturn -extern void panic(const char *string, ...) __printflike(1, 2) __dead2; - #include #include #include @@ -82,10 +79,9 @@ extern void panic(const char *string, ...) __printflike(1, 2) __dead2; #include -extern thread_t port_name_to_thread(mach_port_name_t port_name); /* osfmk/kern/ipc_tt.h */ - static void workq_unpark_continue(void *uth, wait_result_t wr) __dead2; -static void workq_schedule_creator(proc_t p, struct workqueue *wq, int flags); +static void workq_schedule_creator(proc_t p, struct workqueue *wq, + workq_kern_threadreq_flags_t flags); static bool workq_threadreq_admissible(struct workqueue *wq, struct uthread *uth, workq_threadreq_t req); @@ -116,6 +112,7 @@ static lck_attr_t *workq_lck_attr; static lck_grp_attr_t *workq_lck_grp_attr; os_refgrp_decl(static, workq_refgrp, "workq", NULL); +static struct mpsc_daemon_queue workq_deallocate_queue; static zone_t workq_zone_workqueue; static zone_t workq_zone_threadreq; @@ -184,10 +181,10 @@ proc_init_wqptr_or_wait(struct proc *p) struct workqueue *wq; proc_lock(p); - wq = p->p_wqptr; + wq = os_atomic_load(&p->p_wqptr, relaxed); if (wq == NULL) { - p->p_wqptr = WQPTR_IS_INITING_VALUE; + os_atomic_store(&p->p_wqptr, WQPTR_IS_INITING_VALUE, relaxed); proc_unlock(p); return true; } @@ -211,9 +208,7 @@ workq_parked_wait_event(struct uthread *uth) static inline void workq_thread_wakeup(struct uthread *uth) { - if ((uth->uu_workq_flags & UT_WORKQ_IDLE_CLEANUP) == 0) { - thread_wakeup_thread(workq_parked_wait_event(uth), uth->uu_thread); - } + thread_wakeup_thread(workq_parked_wait_event(uth), uth->uu_thread); } #pragma mark wq_thactive @@ -242,7 +237,7 @@ static_assert(sizeof(wq_thactive_t) * CHAR_BIT - WQ_THACTIVE_QOS_SHIFT >= 3, static inline wq_thactive_t _wq_thactive(struct workqueue *wq) { - return os_atomic_load(&wq->wq_thactive, relaxed); + return os_atomic_load_wide(&wq->wq_thactive, relaxed); } static inline int @@ -323,7 +318,7 @@ _wq_thactive_move(struct workqueue *wq, { wq_thactive_t v = _wq_thactive_offset_for_qos(new_qos) - _wq_thactive_offset_for_qos(old_qos); - os_atomic_add_orig(&wq->wq_thactive, v, relaxed); + os_atomic_add(&wq->wq_thactive, v, relaxed); wq->wq_thscheduled_count[_wq_bucket(old_qos)]--; wq->wq_thscheduled_count[_wq_bucket(new_qos)]++; } @@ -388,13 +383,6 @@ workq_is_exiting(struct proc *p) return !wq || _wq_exiting(wq); } -struct turnstile * -workq_turnstile(struct proc *p) -{ - struct workqueue *wq = proc_get_wqptr(p); - return wq ? wq->wq_turnstile : TURNSTILE_NULL; -} - #pragma mark workqueue lock static bool @@ -450,7 +438,7 @@ workq_thread_needs_params_change(workq_threadreq_t req, struct uthread *uth) workq_threadreq_param_t cur_trp, req_trp = { }; cur_trp.trp_value = uth->uu_save.uus_workq_park_data.workloop_params; - if (req->tr_flags & TR_FLAG_WL_PARAMS) { + if (req->tr_flags & WORKQ_TR_FLAG_WL_PARAMS) { req_trp = kqueue_threadreq_workloop_param(req); } @@ -537,7 +525,7 @@ workq_thread_reset_cpupercent(workq_threadreq_t req, struct uthread *uth) assert(uth == current_uthread()); workq_threadreq_param_t trp = { }; - if (req && (req->tr_flags & TR_FLAG_WL_PARAMS)) { + if (req && (req->tr_flags & WORKQ_TR_FLAG_WL_PARAMS)) { trp = kqueue_threadreq_workloop_param(req); } @@ -560,7 +548,7 @@ workq_thread_reset_cpupercent(workq_threadreq_t req, struct uthread *uth) static void workq_thread_reset_pri(struct workqueue *wq, struct uthread *uth, - workq_threadreq_t req) + workq_threadreq_t req, bool unpark) { thread_t th = uth->uu_thread; thread_qos_t qos = req ? req->tr_qos : WORKQ_THREAD_QOS_CLEANUP; @@ -568,16 +556,18 @@ workq_thread_reset_pri(struct workqueue *wq, struct uthread *uth, int priority = 31; int policy = POLICY_TIMESHARE; - if (req && (req->tr_flags & TR_FLAG_WL_PARAMS)) { + if (req && (req->tr_flags & WORKQ_TR_FLAG_WL_PARAMS)) { trp = kqueue_threadreq_workloop_param(req); } uth->uu_workq_pri = WORKQ_POLICY_INIT(qos); uth->uu_workq_flags &= ~UT_WORKQ_OUTSIDE_QOS; - uth->uu_save.uus_workq_park_data.workloop_params = trp.trp_value; - // qos sent out to userspace (may differ from uu_workq_pri on param threads) - uth->uu_save.uus_workq_park_data.qos = qos; + if (unpark) { + uth->uu_save.uus_workq_park_data.workloop_params = trp.trp_value; + // qos sent out to userspace (may differ from uu_workq_pri on param threads) + uth->uu_save.uus_workq_park_data.qos = qos; + } if (qos == WORKQ_THREAD_QOS_MANAGER) { uint32_t mgr_pri = wq->wq_event_manager_priority; @@ -611,12 +601,12 @@ workq_thread_reset_pri(struct workqueue *wq, struct uthread *uth, * every time a servicer is being told about a new max QoS. */ void -workq_thread_set_max_qos(struct proc *p, struct kqrequest *kqr) +workq_thread_set_max_qos(struct proc *p, workq_threadreq_t kqr) { struct uu_workq_policy old_pri, new_pri; - struct uthread *uth = get_bsdthread_info(kqr->kqr_thread); + struct uthread *uth = current_uthread(); struct workqueue *wq = proc_get_wqptr_fast(p); - thread_qos_t qos = kqr->kqr_qos_index; + thread_qos_t qos = kqr->tr_kq_qos_index; if (uth->uu_workq_pri.qos_max == qos) { return; @@ -729,7 +719,9 @@ workq_death_policy_evaluate(struct workqueue *wq, uint16_t decrement) wq, wq->wq_thidlecount, 0, 0, 0); wq->wq_thdying_count++; uth->uu_workq_flags |= UT_WORKQ_DYING; - workq_thread_wakeup(uth); + if ((uth->uu_workq_flags & UT_WORKQ_IDLE_CLEANUP) == 0) { + workq_thread_wakeup(uth); + } return; } @@ -770,14 +762,15 @@ workq_kill_old_threads_call(void *param0, void *param1 __unused) workq_lock_spin(wq); WQ_TRACE_WQ(TRACE_wq_death_call | DBG_FUNC_START, wq, 0, 0, 0, 0); - os_atomic_and(&wq->wq_flags, ~WQ_DEATH_CALL_SCHEDULED, relaxed); + os_atomic_andnot(&wq->wq_flags, WQ_DEATH_CALL_SCHEDULED, relaxed); workq_death_policy_evaluate(wq, 0); WQ_TRACE_WQ(TRACE_wq_death_call | DBG_FUNC_END, wq, 0, 0, 0, 0); workq_unlock(wq); } static struct uthread * -workq_pop_idle_thread(struct workqueue *wq) +workq_pop_idle_thread(struct workqueue *wq, uint8_t uu_flags, + bool *needs_wakeup) { struct uthread *uth; @@ -790,13 +783,21 @@ workq_pop_idle_thread(struct workqueue *wq) TAILQ_INSERT_TAIL(&wq->wq_thrunlist, uth, uu_workq_entry); assert((uth->uu_workq_flags & UT_WORKQ_RUNNING) == 0); - uth->uu_workq_flags |= UT_WORKQ_RUNNING | UT_WORKQ_OVERCOMMIT; + uth->uu_workq_flags |= UT_WORKQ_RUNNING | uu_flags; + if ((uu_flags & UT_WORKQ_OVERCOMMIT) == 0) { + wq->wq_constrained_threads_scheduled++; + } wq->wq_threads_scheduled++; wq->wq_thidlecount--; if (__improbable(uth->uu_workq_flags & UT_WORKQ_DYING)) { uth->uu_workq_flags ^= UT_WORKQ_DYING; workq_death_policy_evaluate(wq, 1); + *needs_wakeup = false; + } else if (uth->uu_workq_flags & UT_WORKQ_IDLE_CLEANUP) { + *needs_wakeup = false; + } else { + *needs_wakeup = true; } return uth; } @@ -814,6 +815,7 @@ workq_thread_init_and_wq_lock(task_t task, thread_t th) uth->uu_workq_pri = WORKQ_POLICY_INIT(THREAD_QOS_LEGACY); uth->uu_workq_thport = MACH_PORT_NULL; uth->uu_workq_stackaddr = 0; + uth->uu_workq_pthread_kill_allowed = 0; thread_set_tag(th, THREAD_TAG_PTHREAD | THREAD_TAG_WORKQUEUE); thread_reset_workq_qos(th, THREAD_QOS_LEGACY); @@ -886,13 +888,13 @@ out: __attribute__((noreturn, noinline)) static void workq_unpark_for_death_and_unlock(proc_t p, struct workqueue *wq, - struct uthread *uth, uint32_t death_flags) + struct uthread *uth, uint32_t death_flags, uint32_t setup_flags) { thread_qos_t qos = workq_pri_override(uth->uu_workq_pri); bool first_use = uth->uu_workq_flags & UT_WORKQ_NEW; if (qos > WORKQ_THREAD_QOS_CLEANUP) { - workq_thread_reset_pri(wq, uth, NULL); + workq_thread_reset_pri(wq, uth, NULL, /*unpark*/ true); qos = WORKQ_THREAD_QOS_CLEANUP; } @@ -910,8 +912,13 @@ workq_unpark_for_death_and_unlock(proc_t p, struct workqueue *wq, workq_unlock(wq); + if (setup_flags & WQ_SETUP_CLEAR_VOUCHER) { + __assert_only kern_return_t kr; + kr = thread_set_voucher_name(MACH_PORT_NULL); + assert(kr == KERN_SUCCESS); + } + uint32_t flags = WQ_FLAG_THREAD_NEWSPI | qos | WQ_FLAG_THREAD_PRIO_QOS; - uint32_t setup_flags = WQ_SETUP_EXIT_THREAD; thread_t th = uth->uu_thread; vm_map_t vmap = get_task_map(p->task); @@ -920,7 +927,7 @@ workq_unpark_for_death_and_unlock(proc_t p, struct workqueue *wq, } pthread_functions->workq_setup_thread(p, th, vmap, uth->uu_workq_stackaddr, - uth->uu_workq_thport, 0, setup_flags, flags); + uth->uu_workq_thport, 0, WQ_SETUP_EXIT_THREAD, flags); __builtin_unreachable(); } @@ -946,6 +953,10 @@ workq_turnstile_update_inheritor(struct workqueue *wq, turnstile_inheritor_t inheritor, turnstile_update_flags_t flags) { + if (wq->wq_inheritor == inheritor) { + return; + } + wq->wq_inheritor = inheritor; workq_perform_turnstile_operation_locked(wq, ^{ turnstile_update_inheritor(wq->wq_turnstile, inheritor, flags | TURNSTILE_IMMEDIATE_UPDATE); @@ -955,35 +966,44 @@ workq_turnstile_update_inheritor(struct workqueue *wq, } static void -workq_push_idle_thread(proc_t p, struct workqueue *wq, struct uthread *uth) +workq_push_idle_thread(proc_t p, struct workqueue *wq, struct uthread *uth, + uint32_t setup_flags) { uint64_t now = mach_absolute_time(); + bool is_creator = (uth == wq->wq_creator); - uth->uu_workq_flags &= ~UT_WORKQ_RUNNING; if ((uth->uu_workq_flags & UT_WORKQ_OVERCOMMIT) == 0) { wq->wq_constrained_threads_scheduled--; } + uth->uu_workq_flags &= ~(UT_WORKQ_RUNNING | UT_WORKQ_OVERCOMMIT); TAILQ_REMOVE(&wq->wq_thrunlist, uth, uu_workq_entry); wq->wq_threads_scheduled--; - if (wq->wq_creator == uth) { + if (is_creator) { + wq->wq_creator = NULL; WQ_TRACE_WQ(TRACE_wq_creator_select, wq, 3, 0, uth->uu_save.uus_workq_park_data.yields, 0); - wq->wq_creator = NULL; + } + + if (wq->wq_inheritor == uth->uu_thread) { + assert(wq->wq_creator == NULL); if (wq->wq_reqcount) { workq_turnstile_update_inheritor(wq, wq, TURNSTILE_INHERITOR_WORKQ); } else { workq_turnstile_update_inheritor(wq, TURNSTILE_INHERITOR_NULL, 0); } - if (uth->uu_workq_flags & UT_WORKQ_NEW) { - TAILQ_INSERT_TAIL(&wq->wq_thnewlist, uth, uu_workq_entry); - wq->wq_thidlecount++; - return; - } - } else { + } + + if (uth->uu_workq_flags & UT_WORKQ_NEW) { + assert(is_creator || (_wq_flags(wq) & WQ_EXITING)); + TAILQ_INSERT_TAIL(&wq->wq_thnewlist, uth, uu_workq_entry); + wq->wq_thidlecount++; + return; + } + + if (!is_creator) { _wq_thactive_dec(wq, uth->uu_workq_pri.qos_bucket); wq->wq_thscheduled_count[_wq_bucket(uth->uu_workq_pri.qos_bucket)]--; - assert(!(uth->uu_workq_flags & UT_WORKQ_NEW)); uth->uu_workq_flags |= UT_WORKQ_IDLE_CLEANUP; } @@ -1014,7 +1034,7 @@ workq_push_idle_thread(proc_t p, struct workqueue *wq, struct uthread *uth) wq->wq_thdying_count++; uth->uu_workq_flags |= UT_WORKQ_DYING; uth->uu_workq_flags &= ~UT_WORKQ_IDLE_CLEANUP; - workq_unpark_for_death_and_unlock(p, wq, uth, 0); + workq_unpark_for_death_and_unlock(p, wq, uth, 0, setup_flags); __builtin_unreachable(); } @@ -1045,7 +1065,7 @@ workq_priority_for_req(workq_threadreq_t req) { thread_qos_t qos = req->tr_qos; - if (req->tr_flags & TR_FLAG_WL_OUTSIDE_QOS) { + if (req->tr_flags & WORKQ_TR_FLAG_WL_OUTSIDE_QOS) { workq_threadreq_param_t trp = kqueue_threadreq_workloop_param(req); assert(trp.trp_flags & TRP_PRIORITY); return trp.trp_pri; @@ -1056,9 +1076,9 @@ workq_priority_for_req(workq_threadreq_t req) static inline struct priority_queue * workq_priority_queue_for_req(struct workqueue *wq, workq_threadreq_t req) { - if (req->tr_flags & TR_FLAG_WL_OUTSIDE_QOS) { + if (req->tr_flags & WORKQ_TR_FLAG_WL_OUTSIDE_QOS) { return &wq->wq_special_queue; - } else if (req->tr_flags & TR_FLAG_OVERCOMMIT) { + } else if (req->tr_flags & WORKQ_TR_FLAG_OVERCOMMIT) { return &wq->wq_overcommit_queue; } else { return &wq->wq_constrained_queue; @@ -1072,14 +1092,14 @@ workq_priority_queue_for_req(struct workqueue *wq, workq_threadreq_t req) static bool workq_threadreq_enqueue(struct workqueue *wq, workq_threadreq_t req) { - assert(req->tr_state == TR_STATE_NEW); + assert(req->tr_state == WORKQ_TR_STATE_NEW); - req->tr_state = TR_STATE_QUEUED; + req->tr_state = WORKQ_TR_STATE_QUEUED; wq->wq_reqcount += req->tr_count; if (req->tr_qos == WORKQ_THREAD_QOS_MANAGER) { assert(wq->wq_event_manager_threadreq == NULL); - assert(req->tr_flags & TR_FLAG_KEVENT); + assert(req->tr_flags & WORKQ_TR_FLAG_KEVENT); assert(req->tr_count == 1); wq->wq_event_manager_threadreq = req; return true; @@ -1087,7 +1107,7 @@ workq_threadreq_enqueue(struct workqueue *wq, workq_threadreq_t req) if (priority_queue_insert(workq_priority_queue_for_req(wq, req), &req->tr_entry, workq_priority_for_req(req), PRIORITY_QUEUE_SCHED_PRI_MAX_HEAP_COMPARE)) { - if ((req->tr_flags & TR_FLAG_OVERCOMMIT) == 0) { + if ((req->tr_flags & WORKQ_TR_FLAG_OVERCOMMIT) == 0) { _wq_thactive_refresh_best_constrained_req_qos(wq); } return true; @@ -1113,7 +1133,7 @@ workq_threadreq_dequeue(struct workqueue *wq, workq_threadreq_t req) } if (priority_queue_remove(workq_priority_queue_for_req(wq, req), &req->tr_entry, PRIORITY_QUEUE_SCHED_PRI_MAX_HEAP_COMPARE)) { - if ((req->tr_flags & TR_FLAG_OVERCOMMIT) == 0) { + if ((req->tr_flags & WORKQ_TR_FLAG_OVERCOMMIT) == 0) { _wq_thactive_refresh_best_constrained_req_qos(wq); } return true; @@ -1125,113 +1145,14 @@ workq_threadreq_dequeue(struct workqueue *wq, workq_threadreq_t req) static void workq_threadreq_destroy(proc_t p, workq_threadreq_t req) { - req->tr_state = TR_STATE_IDLE; - if (req->tr_flags & (TR_FLAG_WORKLOOP | TR_FLAG_KEVENT)) { + req->tr_state = WORKQ_TR_STATE_CANCELED; + if (req->tr_flags & (WORKQ_TR_FLAG_WORKLOOP | WORKQ_TR_FLAG_KEVENT)) { kqueue_threadreq_cancel(p, req); } else { zfree(workq_zone_threadreq, req); } } -/* - * Mark a thread request as complete. At this point, it is treated as owned by - * the submitting subsystem and you should assume it could be freed. - * - * Called with the workqueue lock held. - */ -static void -workq_threadreq_bind_and_unlock(proc_t p, struct workqueue *wq, - workq_threadreq_t req, struct uthread *uth) -{ - uint8_t tr_flags = req->tr_flags; - bool needs_commit = false; - int creator_flags = 0; - - wq->wq_fulfilled++; - - if (req->tr_state == TR_STATE_QUEUED) { - workq_threadreq_dequeue(wq, req); - creator_flags = WORKQ_THREADREQ_CAN_CREATE_THREADS; - } - - if (wq->wq_creator == uth) { - WQ_TRACE_WQ(TRACE_wq_creator_select, wq, 4, 0, - uth->uu_save.uus_workq_park_data.yields, 0); - creator_flags = WORKQ_THREADREQ_CAN_CREATE_THREADS | - WORKQ_THREADREQ_CREATOR_TRANSFER; - wq->wq_creator = NULL; - _wq_thactive_inc(wq, req->tr_qos); - wq->wq_thscheduled_count[_wq_bucket(req->tr_qos)]++; - } else if (uth->uu_workq_pri.qos_bucket != req->tr_qos) { - _wq_thactive_move(wq, uth->uu_workq_pri.qos_bucket, req->tr_qos); - } - workq_thread_reset_pri(wq, uth, req); - - if (tr_flags & TR_FLAG_OVERCOMMIT) { - if ((uth->uu_workq_flags & UT_WORKQ_OVERCOMMIT) == 0) { - uth->uu_workq_flags |= UT_WORKQ_OVERCOMMIT; - wq->wq_constrained_threads_scheduled--; - } - } else { - if ((uth->uu_workq_flags & UT_WORKQ_OVERCOMMIT) != 0) { - uth->uu_workq_flags &= ~UT_WORKQ_OVERCOMMIT; - wq->wq_constrained_threads_scheduled++; - } - } - - if (tr_flags & (TR_FLAG_KEVENT | TR_FLAG_WORKLOOP)) { - if (req->tr_state == TR_STATE_NEW) { - /* - * We're called from workq_kern_threadreq_initiate() - * due to an unbind, with the kq req held. - */ - assert(!creator_flags); - req->tr_state = TR_STATE_IDLE; - kqueue_threadreq_bind(p, req, uth->uu_thread, 0); - } else { - assert(req->tr_count == 0); - workq_perform_turnstile_operation_locked(wq, ^{ - kqueue_threadreq_bind_prepost(p, req, uth->uu_thread); - }); - needs_commit = true; - } - req = NULL; - } else if (req->tr_count > 0) { - req = NULL; - } - - if (creator_flags) { - /* This can drop the workqueue lock, and take it again */ - workq_schedule_creator(p, wq, creator_flags); - } - - workq_unlock(wq); - - if (req) { - zfree(workq_zone_threadreq, req); - } - if (needs_commit) { - kqueue_threadreq_bind_commit(p, uth->uu_thread); - } - - /* - * Run Thread, Run! - */ - uint32_t upcall_flags = WQ_FLAG_THREAD_NEWSPI; - if (uth->uu_workq_pri.qos_bucket == WORKQ_THREAD_QOS_MANAGER) { - upcall_flags |= WQ_FLAG_THREAD_EVENT_MANAGER; - } else if (tr_flags & TR_FLAG_OVERCOMMIT) { - upcall_flags |= WQ_FLAG_THREAD_OVERCOMMIT; - } - if (tr_flags & TR_FLAG_KEVENT) { - upcall_flags |= WQ_FLAG_THREAD_KEVENT; - } - if (tr_flags & TR_FLAG_WORKLOOP) { - upcall_flags |= WQ_FLAG_THREAD_WORKLOOP | WQ_FLAG_THREAD_KEVENT; - } - uth->uu_save.uus_workq_park_data.upcall_flags = upcall_flags; -} - #pragma mark workqueue thread creation thread calls static inline bool @@ -1332,8 +1253,8 @@ workq_proc_resumed(struct proc *p) return; } - wq_flags = os_atomic_and_orig(&wq->wq_flags, ~(WQ_PROC_SUSPENDED | - WQ_DELAYED_CALL_PENDED | WQ_IMMEDIATE_CALL_PENDED), relaxed); + wq_flags = os_atomic_andnot_orig(&wq->wq_flags, WQ_PROC_SUSPENDED | + WQ_DELAYED_CALL_PENDED | WQ_IMMEDIATE_CALL_PENDED, relaxed); if ((wq_flags & WQ_EXITING) == 0) { disable_preemption(); if (wq_flags & WQ_IMMEDIATE_CALL_PENDED) { @@ -1352,7 +1273,7 @@ workq_proc_resumed(struct proc *p) static bool workq_thread_is_busy(uint64_t now, _Atomic uint64_t *lastblocked_tsp) { - uint64_t lastblocked_ts = os_atomic_load(lastblocked_tsp, relaxed); + uint64_t lastblocked_ts = os_atomic_load_wide(lastblocked_tsp, relaxed); if (now <= lastblocked_ts) { /* * Because the update of the timestamp when a thread blocks @@ -1392,7 +1313,7 @@ workq_add_new_threads_call(void *_p, void *flags) workq_lock_spin(wq); wq->wq_thread_call_last_run = mach_absolute_time(); - os_atomic_and(&wq->wq_flags, ~my_flag, release); + os_atomic_andnot(&wq->wq_flags, my_flag, release); /* This can drop the workqueue lock, and take it again */ workq_schedule_creator(p, wq, WORKQ_THREADREQ_CAN_CREATE_THREADS); @@ -1434,7 +1355,7 @@ workq_sched_callback(int type, thread_t thread) * get scheduled and then block after we start down this path), it's * not a problem. Either timestamp is adequate, so no need to retry */ - os_atomic_store(&wq->wq_lastblocked_ts[_wq_bucket(qos)], + os_atomic_store_wide(&wq->wq_lastblocked_ts[_wq_bucket(qos)], thread_last_run_time(thread), relaxed); if (req_qos == THREAD_QOS_UNSPECIFIED) { @@ -1506,12 +1427,17 @@ workq_reference(struct workqueue *wq) os_ref_retain(&wq->wq_refcnt); } -void -workq_destroy(struct workqueue *wq) +static void +workq_deallocate_queue_invoke(mpsc_queue_chain_t e, + __assert_only mpsc_daemon_queue_t dq) { + struct workqueue *wq; struct turnstile *ts; - turnstile_complete((uintptr_t)wq, &wq->wq_turnstile, &ts); + wq = mpsc_queue_element(e, struct workqueue, wq_destroy_link); + assert(dq == &workq_deallocate_queue); + + turnstile_complete((uintptr_t)wq, &wq->wq_turnstile, &ts, TURNSTILE_WORKQS); assert(ts); turnstile_cleanup(); turnstile_deallocate(ts); @@ -1524,7 +1450,8 @@ static void workq_deallocate(struct workqueue *wq) { if (os_ref_release_relaxed(&wq->wq_refcnt) == 0) { - workq_destroy(wq); + workq_deallocate_queue_invoke(&wq->wq_destroy_link, + &workq_deallocate_queue); } } @@ -1532,7 +1459,8 @@ void workq_deallocate_safe(struct workqueue *wq) { if (__improbable(os_ref_release_relaxed(&wq->wq_refcnt) == 0)) { - workq_deallocate_enqueue(wq); + mpsc_daemon_enqueue(&workq_deallocate_queue, &wq->wq_destroy_link, + MPSC_QUEUE_DISABLE_PREEMPTION); } } @@ -1677,7 +1605,8 @@ workq_mark_exiting(struct proc *p) mgr_req = wq->wq_event_manager_threadreq; wq->wq_event_manager_threadreq = NULL; wq->wq_reqcount = 0; /* workq_schedule_creator must not look at queues */ - workq_turnstile_update_inheritor(wq, NULL, 0); + wq->wq_creator = NULL; + workq_turnstile_update_inheritor(wq, TURNSTILE_INHERITOR_NULL, 0); workq_unlock(wq); @@ -1809,18 +1738,18 @@ bsdthread_set_self(proc_t p, thread_t th, pthread_priority_t priority, goto qos; } - struct kqrequest *kqr = uth->uu_kqr_bound; + workq_threadreq_t kqr = uth->uu_kqr_bound; if (kqr == NULL) { unbind_rv = EALREADY; goto qos; } - if (kqr->kqr_state & KQR_WORKLOOP) { + if (kqr->tr_flags & WORKQ_TR_FLAG_WORKLOOP) { unbind_rv = EINVAL; goto qos; } - kqueue_threadreq_unbind(p, uth->uu_kqr_bound); + kqueue_threadreq_unbind(p, kqr); } qos: @@ -1840,9 +1769,10 @@ qos: qos_rv = EPERM; goto voucher; } - } else if (uth->uu_workq_pri.qos_bucket == WORKQ_THREAD_QOS_MANAGER) { + } else if (uth->uu_workq_pri.qos_bucket == WORKQ_THREAD_QOS_MANAGER || + uth->uu_workq_pri.qos_bucket == WORKQ_THREAD_QOS_ABOVEUI) { /* - * Workqueue manager threads can't change QoS + * Workqueue manager threads or threads above UI can't change QoS */ qos_rv = EINVAL; goto voucher; @@ -1960,7 +1890,8 @@ bsdthread_add_explicit_override(proc_t p, mach_port_name_t kport, return EINVAL; } - thread_t th = port_name_to_thread(kport); + thread_t th = port_name_to_thread(kport, + PORT_TO_THREAD_IN_CURRENT_TASK); if (th == THREAD_NULL) { return ESRCH; } @@ -1976,7 +1907,8 @@ static int bsdthread_remove_explicit_override(proc_t p, mach_port_name_t kport, user_addr_t resource) { - thread_t th = port_name_to_thread(kport); + thread_t th = port_name_to_thread(kport, + PORT_TO_THREAD_IN_CURRENT_TASK); if (th == THREAD_NULL) { return ESRCH; } @@ -2000,7 +1932,8 @@ workq_thread_add_dispatch_override(proc_t p, mach_port_name_t kport, return EINVAL; } - thread_t thread = port_name_to_thread(kport); + thread_t thread = port_name_to_thread(kport, + PORT_TO_THREAD_IN_CURRENT_TASK); if (thread == THREAD_NULL) { return ESRCH; } @@ -2017,16 +1950,16 @@ workq_thread_add_dispatch_override(proc_t p, mach_port_name_t kport, thread_mtx_lock(thread); if (ulock_addr) { - uint64_t val; + uint32_t val; int rc; /* * Workaround lack of explicit support for 'no-fault copyin' * , as disabling preemption prevents paging in */ disable_preemption(); - rc = copyin_word(ulock_addr, &val, sizeof(kport)); + rc = copyin_atomic32(ulock_addr, &val); enable_preemption(); - if (rc == 0 && ulock_owner_value_to_port_name((uint32_t)val) != kport) { + if (rc == 0 && ulock_owner_value_to_port_name(val) != kport) { goto out; } } @@ -2076,6 +2009,23 @@ workq_thread_reset_dispatch_override(proc_t p, thread_t thread) return 0; } +static int +workq_thread_allow_kill(__unused proc_t p, thread_t thread, bool enable) +{ + if (!(thread_get_tag(thread) & THREAD_TAG_WORKQUEUE)) { + // If the thread isn't a workqueue thread, don't set the + // kill_allowed bit; however, we still need to return 0 + // instead of an error code since this code is executed + // on the abort path which needs to not depend on the + // pthread_t (returning an error depends on pthread_t via + // cerror_nocancel) + return 0; + } + struct uthread *uth = get_bsdthread_info(thread); + uth->uu_workq_pthread_kill_allowed = enable; + return 0; +} + static int bsdthread_get_max_parallelism(thread_qos_t qos, unsigned long flags, int *retval) @@ -2131,6 +2081,10 @@ bsdthread_ctl(struct proc *p, struct bsdthread_ctl_args *uap, int *retval) ENSURE_UNUSED(uap->arg3); return bsdthread_get_max_parallelism((thread_qos_t)uap->arg1, (unsigned long)uap->arg2, retval); + case BSDTHREAD_CTL_WORKQ_ALLOW_KILL: + ENSURE_UNUSED(uap->arg2); + ENSURE_UNUSED(uap->arg3); + return workq_thread_allow_kill(p, current_thread(), (bool)uap->arg1); case BSDTHREAD_CTL_SET_QOS: case BSDTHREAD_CTL_QOS_DISPATCH_ASYNCHRONOUS_OVERRIDE_ADD: @@ -2145,9 +2099,13 @@ bsdthread_ctl(struct proc *p, struct bsdthread_ctl_args *uap, int *retval) #pragma mark workqueue thread manipulation +static void __dead2 +workq_unpark_select_threadreq_or_park_and_unlock(proc_t p, struct workqueue *wq, + struct uthread *uth, uint32_t setup_flags); + static void __dead2 workq_select_threadreq_or_park_and_unlock(proc_t p, struct workqueue *wq, - struct uthread *uth); + struct uthread *uth, uint32_t setup_flags); static void workq_setup_and_run(proc_t p, struct uthread *uth, int flags) __dead2; @@ -2156,8 +2114,8 @@ static inline uint64_t workq_trace_req_id(workq_threadreq_t req) { struct kqworkloop *kqwl; - if (req->tr_flags & TR_FLAG_WORKLOOP) { - kqwl = __container_of(req, struct kqworkloop, kqwl_request.kqr_req); + if (req->tr_flags & WORKQ_TR_FLAG_WORKLOOP) { + kqwl = __container_of(req, struct kqworkloop, kqwl_request); return kqwl->kqwl_dynamicid; } @@ -2185,12 +2143,12 @@ workq_reqthreads(struct proc *p, uint32_t reqcount, pthread_priority_t pp) workq_threadreq_t req = zalloc(workq_zone_threadreq); priority_queue_entry_init(&req->tr_entry); - req->tr_state = TR_STATE_NEW; + req->tr_state = WORKQ_TR_STATE_NEW; req->tr_flags = 0; req->tr_qos = qos; if (pp & _PTHREAD_PRIORITY_OVERCOMMIT_FLAG) { - req->tr_flags |= TR_FLAG_OVERCOMMIT; + req->tr_flags |= WORKQ_TR_FLAG_OVERCOMMIT; upcall_flags |= WQ_FLAG_THREAD_OVERCOMMIT; } @@ -2213,7 +2171,7 @@ workq_reqthreads(struct proc *p, uint32_t reqcount, pthread_priority_t pp) * If there aren't enough threads, add one, but re-evaluate everything * as conditions may now have changed. */ - if (reqcount > 1 && (req->tr_flags & TR_FLAG_OVERCOMMIT) == 0) { + if (reqcount > 1 && (req->tr_flags & WORKQ_TR_FLAG_OVERCOMMIT) == 0) { unpaced = workq_constrained_allowance(wq, qos, NULL, false); if (unpaced >= reqcount - 1) { unpaced = reqcount - 1; @@ -2226,27 +2184,32 @@ workq_reqthreads(struct proc *p, uint32_t reqcount, pthread_priority_t pp) * This path does not currently handle custom workloop parameters * when creating threads for parallelism. */ - assert(!(req->tr_flags & TR_FLAG_WL_PARAMS)); + assert(!(req->tr_flags & WORKQ_TR_FLAG_WL_PARAMS)); /* * This is a trimmed down version of workq_threadreq_bind_and_unlock() */ while (unpaced > 0 && wq->wq_thidlecount) { - struct uthread *uth = workq_pop_idle_thread(wq); + struct uthread *uth; + bool needs_wakeup; + uint8_t uu_flags = UT_WORKQ_EARLY_BOUND; + + if (req->tr_flags & WORKQ_TR_FLAG_OVERCOMMIT) { + uu_flags |= UT_WORKQ_OVERCOMMIT; + } + + uth = workq_pop_idle_thread(wq, uu_flags, &needs_wakeup); _wq_thactive_inc(wq, qos); wq->wq_thscheduled_count[_wq_bucket(qos)]++; - workq_thread_reset_pri(wq, uth, req); + workq_thread_reset_pri(wq, uth, req, /*unpark*/ true); wq->wq_fulfilled++; - uth->uu_workq_flags |= UT_WORKQ_EARLY_BOUND; - if ((req->tr_flags & TR_FLAG_OVERCOMMIT) == 0) { - uth->uu_workq_flags &= ~UT_WORKQ_OVERCOMMIT; - wq->wq_constrained_threads_scheduled++; - } uth->uu_save.uus_workq_park_data.upcall_flags = upcall_flags; uth->uu_save.uus_workq_park_data.thread_request = req; - workq_thread_wakeup(uth); + if (needs_wakeup) { + workq_thread_wakeup(uth); + } unpaced--; reqcount--; } @@ -2272,41 +2235,27 @@ exiting: } bool -workq_kern_threadreq_initiate(struct proc *p, struct kqrequest *kqr, - struct turnstile *workloop_ts, thread_qos_t qos, int flags) +workq_kern_threadreq_initiate(struct proc *p, workq_threadreq_t req, + struct turnstile *workloop_ts, thread_qos_t qos, + workq_kern_threadreq_flags_t flags) { struct workqueue *wq = proc_get_wqptr_fast(p); - workq_threadreq_t req = &kqr->kqr_req; struct uthread *uth = NULL; - uint8_t tr_flags = 0; - if (kqr->kqr_state & KQR_WORKLOOP) { - tr_flags = TR_FLAG_WORKLOOP; + assert(req->tr_flags & (WORKQ_TR_FLAG_WORKLOOP | WORKQ_TR_FLAG_KEVENT)); + if (req->tr_flags & WORKQ_TR_FLAG_WL_OUTSIDE_QOS) { workq_threadreq_param_t trp = kqueue_threadreq_workloop_param(req); - if (trp.trp_flags & TRP_PRIORITY) { - tr_flags |= TR_FLAG_WL_OUTSIDE_QOS; - qos = thread_workq_qos_for_pri(trp.trp_pri); - if (qos == THREAD_QOS_UNSPECIFIED) { - qos = WORKQ_THREAD_QOS_ABOVEUI; - } - } - if (trp.trp_flags) { - tr_flags |= TR_FLAG_WL_PARAMS; + qos = thread_workq_qos_for_pri(trp.trp_pri); + if (qos == THREAD_QOS_UNSPECIFIED) { + qos = WORKQ_THREAD_QOS_ABOVEUI; } - } else { - tr_flags = TR_FLAG_KEVENT; - } - if (qos != WORKQ_THREAD_QOS_MANAGER && - (kqr->kqr_state & KQR_THOVERCOMMIT)) { - tr_flags |= TR_FLAG_OVERCOMMIT; } - assert(req->tr_state == TR_STATE_IDLE); + assert(req->tr_state == WORKQ_TR_STATE_IDLE); priority_queue_entry_init(&req->tr_entry); req->tr_count = 1; - req->tr_state = TR_STATE_NEW; - req->tr_flags = tr_flags; + req->tr_state = WORKQ_TR_STATE_NEW; req->tr_qos = qos; WQ_TRACE_WQ(TRACE_wq_thread_request_initiate | DBG_FUNC_NONE, wq, @@ -2324,13 +2273,25 @@ workq_kern_threadreq_initiate(struct proc *p, struct kqrequest *kqr, workq_lock_spin(wq); if (_wq_exiting(wq)) { + req->tr_state = WORKQ_TR_STATE_IDLE; workq_unlock(wq); return false; } if (uth && workq_threadreq_admissible(wq, uth, req)) { assert(uth != wq->wq_creator); - workq_threadreq_bind_and_unlock(p, wq, req, uth); + if (uth->uu_workq_pri.qos_bucket != req->tr_qos) { + _wq_thactive_move(wq, uth->uu_workq_pri.qos_bucket, req->tr_qos); + workq_thread_reset_pri(wq, uth, req, /*unpark*/ false); + } + /* + * We're called from workq_kern_threadreq_initiate() + * due to an unbind, with the kq req held. + */ + WQ_TRACE_WQ(TRACE_wq_thread_logical_run | DBG_FUNC_START, wq, + workq_trace_req_id(req), 0, 0, 0); + wq->wq_fulfilled++; + kqueue_threadreq_bind(p, req, uth->uu_thread, 0); } else { if (workloop_ts) { workq_perform_turnstile_operation_locked(wq, ^{ @@ -2343,21 +2304,21 @@ workq_kern_threadreq_initiate(struct proc *p, struct kqrequest *kqr, if (workq_threadreq_enqueue(wq, req)) { workq_schedule_creator(p, wq, flags); } - workq_unlock(wq); } + workq_unlock(wq); + return true; } void -workq_kern_threadreq_modify(struct proc *p, struct kqrequest *kqr, - thread_qos_t qos, int flags) +workq_kern_threadreq_modify(struct proc *p, workq_threadreq_t req, + thread_qos_t qos, workq_kern_threadreq_flags_t flags) { struct workqueue *wq = proc_get_wqptr_fast(p); - workq_threadreq_t req = &kqr->kqr_req; - bool change_overcommit = false; + bool make_overcommit = false; - if (req->tr_flags & TR_FLAG_WL_OUTSIDE_QOS) { + if (req->tr_flags & WORKQ_TR_FLAG_WL_OUTSIDE_QOS) { /* Requests outside-of-QoS shouldn't accept modify operations */ return; } @@ -2365,24 +2326,25 @@ workq_kern_threadreq_modify(struct proc *p, struct kqrequest *kqr, workq_lock_spin(wq); assert(req->tr_qos != WORKQ_THREAD_QOS_MANAGER); - assert(req->tr_flags & (TR_FLAG_KEVENT | TR_FLAG_WORKLOOP)); + assert(req->tr_flags & (WORKQ_TR_FLAG_KEVENT | WORKQ_TR_FLAG_WORKLOOP)); - if (req->tr_state == TR_STATE_BINDING) { - kqueue_threadreq_bind(p, req, req->tr_binding_thread, 0); + if (req->tr_state == WORKQ_TR_STATE_BINDING) { + kqueue_threadreq_bind(p, req, req->tr_thread, 0); workq_unlock(wq); return; } - change_overcommit = (bool)(kqr->kqr_state & KQR_THOVERCOMMIT) != - (bool)(req->tr_flags & TR_FLAG_OVERCOMMIT); + if (flags & WORKQ_THREADREQ_MAKE_OVERCOMMIT) { + make_overcommit = (req->tr_flags & WORKQ_TR_FLAG_OVERCOMMIT) == 0; + } - if (_wq_exiting(wq) || (req->tr_qos == qos && !change_overcommit)) { + if (_wq_exiting(wq) || (req->tr_qos == qos && !make_overcommit)) { workq_unlock(wq); return; } assert(req->tr_count == 1); - if (req->tr_state != TR_STATE_QUEUED) { + if (req->tr_state != WORKQ_TR_STATE_QUEUED) { panic("Invalid thread request (%p) state %d", req, req->tr_state); } @@ -2400,7 +2362,7 @@ workq_kern_threadreq_modify(struct proc *p, struct kqrequest *kqr, */ if (priority_queue_remove(pq, &req->tr_entry, PRIORITY_QUEUE_SCHED_PRI_MAX_HEAP_COMPARE)) { - if ((req->tr_flags & TR_FLAG_OVERCOMMIT) == 0) { + if ((req->tr_flags & WORKQ_TR_FLAG_OVERCOMMIT) == 0) { _wq_thactive_refresh_best_constrained_req_qos(wq); } } @@ -2411,8 +2373,8 @@ workq_kern_threadreq_modify(struct proc *p, struct kqrequest *kqr, * If the item will not become the root of the priority queue it belongs to, * then we need to wait in line, just enqueue and return quickly. */ - if (__improbable(change_overcommit)) { - req->tr_flags ^= TR_FLAG_OVERCOMMIT; + if (__improbable(make_overcommit)) { + req->tr_flags ^= WORKQ_TR_FLAG_OVERCOMMIT; pq = workq_priority_queue_for_req(wq, req); } req->tr_qos = qos; @@ -2430,11 +2392,11 @@ workq_kern_threadreq_modify(struct proc *p, struct kqrequest *kqr, * * Pretend the thread request is new again: * - adjust wq_reqcount to not count it anymore. - * - make its state TR_STATE_NEW (so that workq_threadreq_bind_and_unlock + * - make its state WORKQ_TR_STATE_NEW (so that workq_threadreq_bind_and_unlock * properly attempts a synchronous bind) */ wq->wq_reqcount--; - req->tr_state = TR_STATE_NEW; + req->tr_state = WORKQ_TR_STATE_NEW; if (workq_threadreq_enqueue(wq, req)) { workq_schedule_creator(p, wq, flags); } @@ -2454,20 +2416,19 @@ workq_kern_threadreq_unlock(struct proc *p) } void -workq_kern_threadreq_update_inheritor(struct proc *p, struct kqrequest *kqr, +workq_kern_threadreq_update_inheritor(struct proc *p, workq_threadreq_t req, thread_t owner, struct turnstile *wl_ts, turnstile_update_flags_t flags) { struct workqueue *wq = proc_get_wqptr_fast(p); - workq_threadreq_t req = &kqr->kqr_req; turnstile_inheritor_t inheritor; assert(req->tr_qos != WORKQ_THREAD_QOS_MANAGER); - assert(req->tr_flags & TR_FLAG_WORKLOOP); + assert(req->tr_flags & WORKQ_TR_FLAG_WORKLOOP); workq_lock_held(wq); - if (req->tr_state == TR_STATE_BINDING) { - kqueue_threadreq_bind(p, req, req->tr_binding_thread, + if (req->tr_state == WORKQ_TR_STATE_BINDING) { + kqueue_threadreq_bind(p, req, req->tr_thread, KQUEUE_THREADERQ_BIND_NO_INHERITOR_UPDATE); return; } @@ -2475,7 +2436,7 @@ workq_kern_threadreq_update_inheritor(struct proc *p, struct kqrequest *kqr, if (_wq_exiting(wq)) { inheritor = TURNSTILE_INHERITOR_NULL; } else { - if (req->tr_state != TR_STATE_QUEUED) { + if (req->tr_state != WORKQ_TR_STATE_QUEUED) { panic("Invalid thread request (%p) state %d", req, req->tr_state); } @@ -2494,7 +2455,7 @@ workq_kern_threadreq_update_inheritor(struct proc *p, struct kqrequest *kqr, } void -workq_kern_threadreq_redrive(struct proc *p, int flags) +workq_kern_threadreq_redrive(struct proc *p, workq_kern_threadreq_flags_t flags) { struct workqueue *wq = proc_get_wqptr_fast(p); @@ -2506,12 +2467,10 @@ workq_kern_threadreq_redrive(struct proc *p, int flags) void workq_schedule_creator_turnstile_redrive(struct workqueue *wq, bool locked) { - if (!locked) { - workq_lock_spin(wq); - } - workq_schedule_creator(NULL, wq, WORKQ_THREADREQ_CREATOR_SYNC_UPDATE); - if (!locked) { - workq_unlock(wq); + if (locked) { + workq_schedule_creator(NULL, wq, WORKQ_THREADREQ_NONE); + } else { + workq_schedule_immediate_thread_creation(wq); } } @@ -2521,7 +2480,7 @@ workq_thread_return(struct proc *p, struct workq_kernreturn_args *uap, { thread_t th = current_thread(); struct uthread *uth = get_bsdthread_info(th); - struct kqrequest *kqr = uth->uu_kqr_bound; + workq_threadreq_t kqr = uth->uu_kqr_bound; workq_threadreq_param_t trp = { }; int nevents = uap->affinity, error; user_addr_t eventlist = uap->item; @@ -2542,17 +2501,26 @@ workq_thread_return(struct proc *p, struct workq_kernreturn_args *uap, proc_unlock(p); } - if (kqr && kqr->kqr_req.tr_flags & TR_FLAG_WL_PARAMS) { + if (kqr && kqr->tr_flags & WORKQ_TR_FLAG_WL_PARAMS) { /* * Ensure we store the threadreq param before unbinding * the kqr from this thread. */ - trp = kqueue_threadreq_workloop_param(&kqr->kqr_req); + trp = kqueue_threadreq_workloop_param(kqr); } + /* + * Freeze thee base pri while we decide the fate of this thread. + * + * Either: + * - we return to user and kevent_cleanup will have unfrozen the base pri, + * - or we proceed to workq_select_threadreq_or_park_and_unlock() who will. + */ + thread_freeze_base_pri(th); + if (kqr) { uint32_t upcall_flags = WQ_FLAG_THREAD_NEWSPI | WQ_FLAG_THREAD_REUSE; - if (kqr->kqr_state & KQR_WORKLOOP) { + if (kqr->tr_flags & WORKQ_TR_FLAG_WORKLOOP) { upcall_flags |= WQ_FLAG_THREAD_WORKLOOP | WQ_FLAG_THREAD_KEVENT; } else { upcall_flags |= WQ_FLAG_THREAD_KEVENT; @@ -2575,6 +2543,7 @@ workq_thread_return(struct proc *p, struct workq_kernreturn_args *uap, get_task_map(p->task), uth->uu_workq_stackaddr, uth->uu_workq_thport, eventlist, nevents, upcall_flags); if (error) { + assert(uth->uu_kqr_bound == kqr); return error; } @@ -2597,7 +2566,8 @@ workq_thread_return(struct proc *p, struct workq_kernreturn_args *uap, workq_lock_spin(wq); WQ_TRACE_WQ(TRACE_wq_thread_logical_run | DBG_FUNC_END, wq, 0, 0, 0, 0); uth->uu_save.uus_workq_park_data.workloop_params = trp.trp_value; - workq_select_threadreq_or_park_and_unlock(p, wq, uth); + workq_select_threadreq_or_park_and_unlock(p, wq, uth, + WQ_SETUP_CLEAR_VOUCHER); __builtin_unreachable(); } @@ -2714,6 +2684,35 @@ workq_kernreturn(struct proc *p, struct workq_kernreturn_args *uap, int32_t *ret *retval = should_narrow; break; } + case WQOPS_SETUP_DISPATCH: { + /* + * item = pointer to workq_dispatch_config structure + * arg2 = sizeof(item) + */ + struct workq_dispatch_config cfg; + bzero(&cfg, sizeof(cfg)); + + error = copyin(uap->item, &cfg, MIN(sizeof(cfg), (unsigned long) arg2)); + if (error) { + break; + } + + if (cfg.wdc_flags & ~WORKQ_DISPATCH_SUPPORTED_FLAGS || + cfg.wdc_version < WORKQ_DISPATCH_MIN_SUPPORTED_VERSION) { + error = ENOTSUP; + break; + } + + /* Load fields from version 1 */ + p->p_dispatchqueue_serialno_offset = cfg.wdc_queue_serialno_offs; + + /* Load fields from version 2 */ + if (cfg.wdc_version >= 2) { + p->p_dispatchqueue_label_offset = cfg.wdc_queue_label_offs; + } + + break; + } default: error = EINVAL; break; @@ -2729,15 +2728,17 @@ workq_kernreturn(struct proc *p, struct workq_kernreturn_args *uap, int32_t *ret */ __attribute__((noreturn, noinline)) static void -workq_park_and_unlock(proc_t p, struct workqueue *wq, struct uthread *uth) +workq_park_and_unlock(proc_t p, struct workqueue *wq, struct uthread *uth, + uint32_t setup_flags) { assert(uth == current_uthread()); assert(uth->uu_kqr_bound == NULL); - workq_push_idle_thread(p, wq, uth); // may not return + workq_push_idle_thread(p, wq, uth, setup_flags); // may not return workq_thread_reset_cpupercent(NULL, uth); - if (uth->uu_workq_flags & UT_WORKQ_IDLE_CLEANUP) { + if ((uth->uu_workq_flags & UT_WORKQ_IDLE_CLEANUP) && + !(uth->uu_workq_flags & UT_WORKQ_DYING)) { workq_unlock(wq); /* @@ -2762,6 +2763,7 @@ workq_park_and_unlock(proc_t p, struct workqueue *wq, struct uthread *uth) workq_lock_spin(wq); uth->uu_workq_flags &= ~UT_WORKQ_IDLE_CLEANUP; + setup_flags &= ~WQ_SETUP_CLEAR_VOUCHER; } if (uth->uu_workq_flags & UT_WORKQ_RUNNING) { @@ -2772,13 +2774,13 @@ workq_park_and_unlock(proc_t p, struct workqueue *wq, struct uthread *uth) * we just run the continuation ourselves. */ WQ_TRACE_WQ(TRACE_wq_thread_logical_run | DBG_FUNC_END, wq, 0, 0, 0, 0); - workq_select_threadreq_or_park_and_unlock(p, wq, uth); + workq_unpark_select_threadreq_or_park_and_unlock(p, wq, uth, setup_flags); __builtin_unreachable(); } if (uth->uu_workq_flags & UT_WORKQ_DYING) { workq_unpark_for_death_and_unlock(p, wq, uth, - WORKQ_UNPARK_FOR_DEATH_WAS_IDLE); + WORKQ_UNPARK_FOR_DEATH_WAS_IDLE, setup_flags); __builtin_unreachable(); } @@ -2883,7 +2885,7 @@ workq_threadreq_admissible(struct workqueue *wq, struct uthread *uth, if (req->tr_qos == WORKQ_THREAD_QOS_MANAGER) { return workq_may_start_event_mgr_thread(wq, uth); } - if ((req->tr_flags & TR_FLAG_OVERCOMMIT) == 0) { + if ((req->tr_flags & WORKQ_TR_FLAG_OVERCOMMIT) == 0) { return workq_constrained_allowance(wq, req->tr_qos, uth, true); } return true; @@ -2990,8 +2992,8 @@ workq_threadreq_select(struct workqueue *wq, struct uthread *uth) &proprietor); if (pri) { struct kqworkloop *kqwl = (struct kqworkloop *)proprietor; - req_pri = &kqwl->kqwl_request.kqr_req; - if (req_pri->tr_state != TR_STATE_QUEUED) { + req_pri = &kqwl->kqwl_request; + if (req_pri->tr_state != WORKQ_TR_STATE_QUEUED) { panic("Invalid thread request (%p) state %d", req_pri, req_pri->tr_state); } @@ -3063,10 +3065,12 @@ workq_threadreq_select(struct workqueue *wq, struct uthread *uth) * efficient scheduling and reduced context switches. */ static void -workq_schedule_creator(proc_t p, struct workqueue *wq, int flags) +workq_schedule_creator(proc_t p, struct workqueue *wq, + workq_kern_threadreq_flags_t flags) { workq_threadreq_t req; struct uthread *uth; + bool needs_wakeup; workq_lock_held(wq); assert(p || (flags & WORKQ_THREADREQ_CAN_CREATE_THREADS) == 0); @@ -3075,6 +3079,14 @@ again: uth = wq->wq_creator; if (!wq->wq_reqcount) { + /* + * There is no thread request left. + * + * If there is a creator, leave everything in place, so that it cleans + * up itself in workq_push_idle_thread(). + * + * Else, make sure the turnstile state is reset to no inheritor. + */ if (uth == NULL) { workq_turnstile_update_inheritor(wq, TURNSTILE_INHERITOR_NULL, 0); } @@ -3083,13 +3095,16 @@ again: req = workq_threadreq_select_for_creator(wq); if (req == NULL) { - if (flags & WORKQ_THREADREQ_CREATOR_SYNC_UPDATE) { - assert((flags & WORKQ_THREADREQ_CREATOR_TRANSFER) == 0); - /* - * turnstile propagation code is reaching out to us, - * and we still don't want to do anything, do not recurse. - */ - } else { + /* + * There isn't a thread request that passes the admission check. + * + * If there is a creator, do not touch anything, the creator will sort + * it out when it runs. + * + * Else, set the inheritor to "WORKQ" so that the turnstile propagation + * code calls us if anything changes. + */ + if (uth == NULL) { workq_turnstile_update_inheritor(wq, wq, TURNSTILE_INHERITOR_WORKQ); } return; @@ -3102,15 +3117,17 @@ again: if (workq_thread_needs_priority_change(req, uth)) { WQ_TRACE_WQ(TRACE_wq_creator_select | DBG_FUNC_NONE, wq, 1, thread_tid(uth->uu_thread), req->tr_qos, 0); - workq_thread_reset_pri(wq, uth, req); + workq_thread_reset_pri(wq, uth, req, /*unpark*/ true); } + assert(wq->wq_inheritor == uth->uu_thread); } else if (wq->wq_thidlecount) { /* * We need to unpark a creator thread */ - wq->wq_creator = uth = workq_pop_idle_thread(wq); + wq->wq_creator = uth = workq_pop_idle_thread(wq, UT_WORKQ_OVERCOMMIT, + &needs_wakeup); if (workq_thread_needs_priority_change(req, uth)) { - workq_thread_reset_pri(wq, uth, req); + workq_thread_reset_pri(wq, uth, req, /*unpark*/ true); } workq_turnstile_update_inheritor(wq, uth->uu_thread, TURNSTILE_INHERITOR_THREAD); @@ -3118,13 +3135,16 @@ again: wq, 2, thread_tid(uth->uu_thread), req->tr_qos, 0); uth->uu_save.uus_workq_park_data.fulfilled_snapshot = wq->wq_fulfilled; uth->uu_save.uus_workq_park_data.yields = 0; - workq_thread_wakeup(uth); + if (needs_wakeup) { + workq_thread_wakeup(uth); + } } else { /* * We need to allocate a thread... */ if (__improbable(wq->wq_nthreads >= wq_max_threads)) { /* out of threads, just go away */ + flags = WORKQ_THREADREQ_NONE; } else if (flags & WORKQ_THREADREQ_SET_AST_ON_FAILURE) { act_set_astkevent(current_thread(), AST_KEVENT_REDRIVE_THREADREQ); } else if (!(flags & WORKQ_THREADREQ_CAN_CREATE_THREADS)) { @@ -3136,16 +3156,173 @@ again: workq_schedule_delayed_thread_creation(wq, 0); } - if (flags & WORKQ_THREADREQ_CREATOR_TRANSFER) { - /* - * workq_schedule_creator() failed at creating a thread, - * and the responsibility of redriving is now with a thread-call. - * - * We still need to tell the turnstile the previous creator is gone. - */ - workq_turnstile_update_inheritor(wq, NULL, 0); + /* + * If the current thread is the inheritor: + * + * If we set the AST, then the thread will stay the inheritor until + * either the AST calls workq_kern_threadreq_redrive(), or it parks + * and calls workq_push_idle_thread(). + * + * Else, the responsibility of the thread creation is with a thread-call + * and we need to clear the inheritor. + */ + if ((flags & WORKQ_THREADREQ_SET_AST_ON_FAILURE) == 0 && + wq->wq_inheritor == current_thread()) { + workq_turnstile_update_inheritor(wq, TURNSTILE_INHERITOR_NULL, 0); + } + } +} + +/** + * Same as workq_unpark_select_threadreq_or_park_and_unlock, + * but do not allow early binds. + * + * Called with the base pri frozen, will unfreeze it. + */ +__attribute__((noreturn, noinline)) +static void +workq_select_threadreq_or_park_and_unlock(proc_t p, struct workqueue *wq, + struct uthread *uth, uint32_t setup_flags) +{ + workq_threadreq_t req = NULL; + bool is_creator = (wq->wq_creator == uth); + bool schedule_creator = false; + + if (__improbable(_wq_exiting(wq))) { + WQ_TRACE_WQ(TRACE_wq_select_threadreq | DBG_FUNC_NONE, wq, 0, 0, 0, 0); + goto park; + } + + if (wq->wq_reqcount == 0) { + WQ_TRACE_WQ(TRACE_wq_select_threadreq | DBG_FUNC_NONE, wq, 1, 0, 0, 0); + goto park; + } + + req = workq_threadreq_select(wq, uth); + if (__improbable(req == NULL)) { + WQ_TRACE_WQ(TRACE_wq_select_threadreq | DBG_FUNC_NONE, wq, 2, 0, 0, 0); + goto park; + } + + uint8_t tr_flags = req->tr_flags; + struct turnstile *req_ts = kqueue_threadreq_get_turnstile(req); + + /* + * Attempt to setup ourselves as the new thing to run, moving all priority + * pushes to ourselves. + * + * If the current thread is the creator, then the fact that we are presently + * running is proof that we'll do something useful, so keep going. + * + * For other cases, peek at the AST to know whether the scheduler wants + * to preempt us, if yes, park instead, and move the thread request + * turnstile back to the workqueue. + */ + if (req_ts) { + workq_perform_turnstile_operation_locked(wq, ^{ + turnstile_update_inheritor(req_ts, uth->uu_thread, + TURNSTILE_IMMEDIATE_UPDATE | TURNSTILE_INHERITOR_THREAD); + turnstile_update_inheritor_complete(req_ts, + TURNSTILE_INTERLOCK_HELD); + }); + } + + if (is_creator) { + WQ_TRACE_WQ(TRACE_wq_creator_select, wq, 4, 0, + uth->uu_save.uus_workq_park_data.yields, 0); + wq->wq_creator = NULL; + _wq_thactive_inc(wq, req->tr_qos); + wq->wq_thscheduled_count[_wq_bucket(req->tr_qos)]++; + } else if (uth->uu_workq_pri.qos_bucket != req->tr_qos) { + _wq_thactive_move(wq, uth->uu_workq_pri.qos_bucket, req->tr_qos); + } + + workq_thread_reset_pri(wq, uth, req, /*unpark*/ true); + + if (__improbable(thread_unfreeze_base_pri(uth->uu_thread) && !is_creator)) { + if (req_ts) { + workq_perform_turnstile_operation_locked(wq, ^{ + turnstile_update_inheritor(req_ts, wq->wq_turnstile, + TURNSTILE_IMMEDIATE_UPDATE | TURNSTILE_INHERITOR_TURNSTILE); + turnstile_update_inheritor_complete(req_ts, + TURNSTILE_INTERLOCK_HELD); + }); } + WQ_TRACE_WQ(TRACE_wq_select_threadreq | DBG_FUNC_NONE, wq, 3, 0, 0, 0); + goto park_thawed; + } + + /* + * We passed all checks, dequeue the request, bind to it, and set it up + * to return to user. + */ + WQ_TRACE_WQ(TRACE_wq_thread_logical_run | DBG_FUNC_START, wq, + workq_trace_req_id(req), 0, 0, 0); + wq->wq_fulfilled++; + schedule_creator = workq_threadreq_dequeue(wq, req); + + if (tr_flags & (WORKQ_TR_FLAG_KEVENT | WORKQ_TR_FLAG_WORKLOOP)) { + kqueue_threadreq_bind_prepost(p, req, uth); + req = NULL; + } else if (req->tr_count > 0) { + req = NULL; + } + + workq_thread_reset_cpupercent(req, uth); + if (uth->uu_workq_flags & UT_WORKQ_NEW) { + uth->uu_workq_flags ^= UT_WORKQ_NEW; + setup_flags |= WQ_SETUP_FIRST_USE; + } + if (tr_flags & WORKQ_TR_FLAG_OVERCOMMIT) { + if ((uth->uu_workq_flags & UT_WORKQ_OVERCOMMIT) == 0) { + uth->uu_workq_flags |= UT_WORKQ_OVERCOMMIT; + wq->wq_constrained_threads_scheduled--; + } + } else { + if ((uth->uu_workq_flags & UT_WORKQ_OVERCOMMIT) != 0) { + uth->uu_workq_flags &= ~UT_WORKQ_OVERCOMMIT; + wq->wq_constrained_threads_scheduled++; + } + } + + if (is_creator || schedule_creator) { + /* This can drop the workqueue lock, and take it again */ + workq_schedule_creator(p, wq, WORKQ_THREADREQ_CAN_CREATE_THREADS); + } + + workq_unlock(wq); + + if (req) { + zfree(workq_zone_threadreq, req); + } + + /* + * Run Thread, Run! + */ + uint32_t upcall_flags = WQ_FLAG_THREAD_NEWSPI; + if (uth->uu_workq_pri.qos_bucket == WORKQ_THREAD_QOS_MANAGER) { + upcall_flags |= WQ_FLAG_THREAD_EVENT_MANAGER; + } else if (tr_flags & WORKQ_TR_FLAG_OVERCOMMIT) { + upcall_flags |= WQ_FLAG_THREAD_OVERCOMMIT; } + if (tr_flags & WORKQ_TR_FLAG_KEVENT) { + upcall_flags |= WQ_FLAG_THREAD_KEVENT; + } + if (tr_flags & WORKQ_TR_FLAG_WORKLOOP) { + upcall_flags |= WQ_FLAG_THREAD_WORKLOOP | WQ_FLAG_THREAD_KEVENT; + } + uth->uu_save.uus_workq_park_data.upcall_flags = upcall_flags; + + if (tr_flags & (WORKQ_TR_FLAG_KEVENT | WORKQ_TR_FLAG_WORKLOOP)) { + kqueue_threadreq_bind_commit(p, uth->uu_thread); + } + workq_setup_and_run(p, uth, setup_flags); + __builtin_unreachable(); + +park: + thread_unfreeze_base_pri(uth->uu_thread); +park_thawed: + workq_park_and_unlock(p, wq, uth, setup_flags); } /** @@ -3161,16 +3338,14 @@ again: * Either way, the thread request object serviced will be moved to state * BINDING and attached to the uthread. * - * Should be called with the workqueue lock held. Will drop it. + * Should be called with the workqueue lock held. Will drop it. + * Should be called with the base pri not frozen. */ __attribute__((noreturn, noinline)) static void -workq_select_threadreq_or_park_and_unlock(proc_t p, struct workqueue *wq, - struct uthread *uth) +workq_unpark_select_threadreq_or_park_and_unlock(proc_t p, struct workqueue *wq, + struct uthread *uth, uint32_t setup_flags) { - uint32_t setup_flags = 0; - workq_threadreq_t req; - if (uth->uu_workq_flags & UT_WORKQ_EARLY_BOUND) { if (uth->uu_workq_flags & UT_WORKQ_NEW) { setup_flags |= WQ_SETUP_FIRST_USE; @@ -3179,33 +3354,17 @@ workq_select_threadreq_or_park_and_unlock(proc_t p, struct workqueue *wq, /* * This pointer is possibly freed and only used for tracing purposes. */ - req = uth->uu_save.uus_workq_park_data.thread_request; + workq_threadreq_t req = uth->uu_save.uus_workq_park_data.thread_request; workq_unlock(wq); WQ_TRACE_WQ(TRACE_wq_thread_logical_run | DBG_FUNC_START, wq, VM_KERNEL_ADDRHIDE(req), 0, 0, 0); - goto run; - } else if (_wq_exiting(wq)) { - WQ_TRACE_WQ(TRACE_wq_select_threadreq | DBG_FUNC_NONE, wq, 0, 0, 0, 0); - } else if (wq->wq_reqcount == 0) { - WQ_TRACE_WQ(TRACE_wq_select_threadreq | DBG_FUNC_NONE, wq, 1, 0, 0, 0); - } else if ((req = workq_threadreq_select(wq, uth)) == NULL) { - WQ_TRACE_WQ(TRACE_wq_select_threadreq | DBG_FUNC_NONE, wq, 2, 0, 0, 0); - } else { - WQ_TRACE_WQ(TRACE_wq_thread_logical_run | DBG_FUNC_START, wq, - workq_trace_req_id(req), 0, 0, 0); - if (uth->uu_workq_flags & UT_WORKQ_NEW) { - uth->uu_workq_flags ^= UT_WORKQ_NEW; - setup_flags |= WQ_SETUP_FIRST_USE; - } - workq_thread_reset_cpupercent(req, uth); - workq_threadreq_bind_and_unlock(p, wq, req, uth); -run: + (void)req; workq_setup_and_run(p, uth, setup_flags); __builtin_unreachable(); } - workq_park_and_unlock(p, wq, uth); - __builtin_unreachable(); + thread_freeze_base_pri(uth->uu_thread); + workq_select_threadreq_or_park_and_unlock(p, wq, uth, setup_flags); } static bool @@ -3250,7 +3409,8 @@ __attribute__((noreturn, noinline)) static void workq_unpark_continue(void *parameter __unused, wait_result_t wr __unused) { - struct uthread *uth = current_uthread(); + thread_t th = current_thread(); + struct uthread *uth = get_bsdthread_info(th); proc_t p = current_proc(); struct workqueue *wq = proc_get_wqptr_fast(p); @@ -3270,7 +3430,7 @@ workq_unpark_continue(void *parameter __unused, wait_result_t wr __unused) } if (__probable(uth->uu_workq_flags & UT_WORKQ_RUNNING)) { - workq_select_threadreq_or_park_and_unlock(p, wq, uth); + workq_unpark_select_threadreq_or_park_and_unlock(p, wq, uth, WQ_SETUP_NONE); __builtin_unreachable(); } @@ -3294,7 +3454,7 @@ workq_unpark_continue(void *parameter __unused, wait_result_t wr __unused) } workq_unpark_for_death_and_unlock(p, wq, uth, - WORKQ_UNPARK_FOR_DEATH_WAS_IDLE); + WORKQ_UNPARK_FOR_DEATH_WAS_IDLE, WQ_SETUP_NONE); __builtin_unreachable(); } @@ -3490,4 +3650,7 @@ workq_init(void) NSEC_PER_USEC, &wq_reduce_pool_window.abstime); clock_interval_to_absolutetime_interval(wq_max_timer_interval.usecs, NSEC_PER_USEC, &wq_max_timer_interval.abstime); + + thread_deallocate_daemon_register_queue(&workq_deallocate_queue, + workq_deallocate_queue_invoke); } diff --git a/bsd/pthread/workqueue_internal.h b/bsd/pthread/workqueue_internal.h index c2a67f5e7..f7ed3080c 100644 --- a/bsd/pthread/workqueue_internal.h +++ b/bsd/pthread/workqueue_internal.h @@ -67,6 +67,7 @@ #define WORKQUEUE_CONSTRAINED_FACTOR 5 #if BSD_KERNEL_PRIVATE +#include #include #include #include @@ -96,33 +97,96 @@ typedef union workq_threadreq_param_s { } workq_threadreq_param_t; #define TRP_PRIORITY 0x1 -#define TRP_POLICY 0x2 +#define TRP_POLICY 0x2 #define TRP_CPUPERCENT 0x4 #define TRP_RELEASED 0x8000 +/*! + * @enum workq_tr_state_t + * + * @brief + * This enum represents the state of a workq thread request. + * + * @discussion + * The states are used and set by both kevent and the workq subsystem under very + * precise locking domains. + * + * When for kevent requests, this structure is embedded on the kqueue itself, + * for non kevent related thread requests, it is allocated. + * + * Only the BINDING state isn't set under the kqlock, but then only QUEUED could + * be read by kqueue in its stead. + * + * @const WORKQ_TR_STATE_IDLE + * This thread request is idle. + * The state is only transient for non kevent thread requests. + * Set under the kqlock (kevent) or after allocation (workq). + * + * tr_entry/tr_thread are unused. + * + * @const WORKQ_TR_STATE_NEW + * This thread request is being initialized. This state is transient. + * Set workq lock for all kinds, set under the kqlock to for kevent requests. + * + * tr_entry is initialized, tr_thread is unused. + * + * @const WORKQ_TR_STATE_QUEUED + * This thread request has been pended, waiting for a thread to be bound. + * Set workq lock for all kinds, set under the kqlock to for kevent requests. + * + * tr_entry is used as linkage in a workq priority queue, tr_thread is unused. + * + * @const WORKQ_TR_STATE_CANCELED + * When the process exits, Queued thread requests are marked canceled. + * This happens under the workqueue lock. + * + * @const WORKQ_TR_STATE_BINDING (kevent only) + * A thread was found to bind to the thread request. + * The bind is preposted this way under the workq lock and will be + * acknowledged by the kevent subsystem. + * + * tr_entry is unused, tr_thread is the thread we're binding to. + * + * @const WORKQ_TR_STATE_BOUND (kevent only) + * A thread bind has been acknowledged by the kevent subsystem. + * This is always set under the kqlock, sometimes also under the workq lock. + * + * tr_entry is unused, tr_thread is the thread we're bound to. + */ +__enum_decl(workq_tr_state_t, uint8_t, { + WORKQ_TR_STATE_IDLE = 0, /* request isn't in flight */ + WORKQ_TR_STATE_NEW = 1, /* request is being initiated */ + WORKQ_TR_STATE_QUEUED = 2, /* request is being queued */ + WORKQ_TR_STATE_CANCELED = 3, /* request is canceled */ + WORKQ_TR_STATE_BINDING = 4, /* request is preposted for bind */ + WORKQ_TR_STATE_BOUND = 5, /* request is bound to a thread */ +}); + +__options_decl(workq_tr_flags_t, uint8_t, { + WORKQ_TR_FLAG_KEVENT = 0x01, + WORKQ_TR_FLAG_WORKLOOP = 0x02, + WORKQ_TR_FLAG_OVERCOMMIT = 0x04, + WORKQ_TR_FLAG_WL_PARAMS = 0x08, + WORKQ_TR_FLAG_WL_OUTSIDE_QOS = 0x10, +}); + typedef struct workq_threadreq_s { union { struct priority_queue_entry tr_entry; - thread_t tr_binding_thread; + thread_t tr_thread; }; - uint32_t tr_flags; - uint8_t tr_state; - thread_qos_t tr_qos; - uint16_t tr_count; -} *workq_threadreq_t; - -TAILQ_HEAD(threadreq_head, workq_threadreq_s); + uint16_t tr_count; + workq_tr_flags_t tr_flags; + workq_tr_state_t tr_state; + thread_qos_t tr_qos; /* qos for the thread request */ -#define TR_STATE_IDLE 0 /* request isn't in flight */ -#define TR_STATE_NEW 1 /* request is being initiated */ -#define TR_STATE_QUEUED 2 /* request is being queued */ -#define TR_STATE_BINDING 4 /* request is preposted for bind */ + /* kqueue states, modified under the kqlock */ + kq_index_t tr_kq_override_index; /* highest wakeup override index */ + kq_index_t tr_kq_qos_index; /* QoS for the servicer */ + bool tr_kq_wakeup; /* an event has fired */ +} workq_threadreq_s, *workq_threadreq_t; -#define TR_FLAG_KEVENT 0x01 -#define TR_FLAG_WORKLOOP 0x02 -#define TR_FLAG_OVERCOMMIT 0x04 -#define TR_FLAG_WL_PARAMS 0x08 -#define TR_FLAG_WL_OUTSIDE_QOS 0x10 +TAILQ_HEAD(threadreq_head, workq_threadreq_s); #if defined(__LP64__) typedef unsigned __int128 wq_thactive_t; @@ -130,7 +194,7 @@ typedef unsigned __int128 wq_thactive_t; typedef uint64_t wq_thactive_t; #endif -typedef enum { +__options_decl(workq_state_flags_t, uint32_t, { WQ_EXITING = 0x0001, WQ_PROC_SUSPENDED = 0x0002, WQ_DEATH_CALL_SCHEDULED = 0x0004, @@ -139,7 +203,7 @@ typedef enum { WQ_DELAYED_CALL_PENDED = 0x0020, WQ_IMMEDIATE_CALL_SCHEDULED = 0x0040, WQ_IMMEDIATE_CALL_PENDED = 0x0080, -} workq_state_flags_t; +}); TAILQ_HEAD(workq_uthread_head, uthread); @@ -147,7 +211,11 @@ struct workqueue { thread_call_t wq_delayed_call; thread_call_t wq_immediate_call; thread_call_t wq_death_call; - struct turnstile *wq_turnstile; + + union { + struct turnstile *wq_turnstile; + struct mpsc_queue_chain wq_destroy_link; + }; lck_spin_t wq_lock; @@ -171,6 +239,7 @@ struct workqueue { struct proc *wq_proc; struct uthread *wq_creator; + turnstile_inheritor_t wq_inheritor; thread_t wq_turnstile_updater; // thread doing a turnstile_update_ineritor struct workq_uthread_head wq_thrunlist; struct workq_uthread_head wq_thnewlist; @@ -182,9 +251,6 @@ struct workqueue { workq_threadreq_t wq_event_manager_threadreq; }; -static_assert(offsetof(struct workqueue, wq_lock) >= sizeof(struct queue_entry), - "Make sure workq_deallocate_enqueue can cast the workqueue"); - #define WORKQUEUE_MAXTHREADS 512 #define WQ_STALLED_WINDOW_USECS 200 #define WQ_REDUCE_POOL_WINDOW_USECS 5000000 @@ -192,7 +258,7 @@ static_assert(offsetof(struct workqueue, wq_lock) >= sizeof(struct queue_entry), #pragma mark definitions -struct kqrequest; +struct workq_threadreq_s; uint32_t _get_pwq_state_kdp(proc_t p); void workq_exit(struct proc *p); @@ -200,34 +266,34 @@ void workq_mark_exiting(struct proc *p); bool workq_is_exiting(struct proc *p); -struct turnstile *workq_turnstile(struct proc *p); - -void workq_thread_set_max_qos(struct proc *p, struct kqrequest *kqr); +void workq_thread_set_max_qos(struct proc *p, struct workq_threadreq_s *kqr); void workq_thread_terminate(struct proc *p, struct uthread *uth); -#define WORKQ_THREADREQ_SET_AST_ON_FAILURE 0x01 -#define WORKQ_THREADREQ_ATTEMPT_REBIND 0x02 -#define WORKQ_THREADREQ_CAN_CREATE_THREADS 0x04 -#define WORKQ_THREADREQ_CREATOR_TRANSFER 0x08 -#define WORKQ_THREADREQ_CREATOR_SYNC_UPDATE 0x10 +__options_decl(workq_kern_threadreq_flags_t, uint32_t, { + WORKQ_THREADREQ_NONE = 0x00, + WORKQ_THREADREQ_SET_AST_ON_FAILURE = 0x01, + WORKQ_THREADREQ_ATTEMPT_REBIND = 0x02, + WORKQ_THREADREQ_CAN_CREATE_THREADS = 0x04, + WORKQ_THREADREQ_MAKE_OVERCOMMIT = 0x08, +}); // called with the kq req lock held -bool workq_kern_threadreq_initiate(struct proc *p, struct kqrequest *kqr, - struct turnstile *ts, thread_qos_t qos, int flags); +bool workq_kern_threadreq_initiate(struct proc *p, struct workq_threadreq_s *kqr, + struct turnstile *ts, thread_qos_t qos, workq_kern_threadreq_flags_t flags); // called with the kq req lock held -void workq_kern_threadreq_modify(struct proc *p, struct kqrequest *kqr, - thread_qos_t qos, int flags); +void workq_kern_threadreq_modify(struct proc *p, struct workq_threadreq_s *kqr, + thread_qos_t qos, workq_kern_threadreq_flags_t flags); // called with the kq req lock held -void workq_kern_threadreq_update_inheritor(struct proc *p, struct kqrequest *kqr, +void workq_kern_threadreq_update_inheritor(struct proc *p, struct workq_threadreq_s *kqr, thread_t owner, struct turnstile *ts, turnstile_update_flags_t flags); void workq_kern_threadreq_lock(struct proc *p); void workq_kern_threadreq_unlock(struct proc *p); -void workq_kern_threadreq_redrive(struct proc *p, int flags); +void workq_kern_threadreq_redrive(struct proc *p, workq_kern_threadreq_flags_t flags); enum workq_set_self_flags { WORKQ_SET_SELF_QOS_FLAG = 0x1, diff --git a/bsd/pthread/workqueue_syscalls.h b/bsd/pthread/workqueue_syscalls.h index f12656aac..e4ce73082 100644 --- a/bsd/pthread/workqueue_syscalls.h +++ b/bsd/pthread/workqueue_syscalls.h @@ -35,14 +35,15 @@ #ifdef __PTHREAD_EXPOSE_INTERNALS__ /* workq_kernreturn commands */ -#define WQOPS_THREAD_RETURN 0x04 /* parks the thread back into the kernel */ -#define WQOPS_QUEUE_NEWSPISUPP 0x10 /* this is to check for newer SPI support */ -#define WQOPS_QUEUE_REQTHREADS 0x20 /* request number of threads of a prio */ -#define WQOPS_QUEUE_REQTHREADS2 0x30 /* request a number of threads in a given priority bucket */ -#define WQOPS_THREAD_KEVENT_RETURN 0x40 /* parks the thread after delivering the passed kevent array */ -#define WQOPS_SET_EVENT_MANAGER_PRIORITY 0x80 /* max() in the provided priority in the the priority of the event manager */ -#define WQOPS_THREAD_WORKLOOP_RETURN 0x100 /* parks the thread after delivering the passed kevent array */ -#define WQOPS_SHOULD_NARROW 0x200 /* checks whether we should narrow our concurrency */ +#define WQOPS_THREAD_RETURN 0x004 /* parks the thread back into the kernel */ +#define WQOPS_QUEUE_NEWSPISUPP 0x010 /* this is to check for newer SPI support */ +#define WQOPS_QUEUE_REQTHREADS 0x020 /* request number of threads of a prio */ +#define WQOPS_QUEUE_REQTHREADS2 0x030 /* request a number of threads in a given priority bucket */ +#define WQOPS_THREAD_KEVENT_RETURN 0x040 /* parks the thread after delivering the passed kevent array */ +#define WQOPS_SET_EVENT_MANAGER_PRIORITY 0x080 /* max() in the provided priority in the the priority of the event manager */ +#define WQOPS_THREAD_WORKLOOP_RETURN 0x100 /* parks the thread after delivering the passed kevent array */ +#define WQOPS_SHOULD_NARROW 0x200 /* checks whether we should narrow our concurrency */ +#define WQOPS_SETUP_DISPATCH 0x400 /* setup pthread workqueue-related operations */ /* flag values for upcall flags field, only 8 bits per struct threadlist */ #define WQ_FLAG_THREAD_PRIO_SCHED 0x00008000 @@ -53,7 +54,7 @@ #define WQ_FLAG_THREAD_REUSE 0x00020000 /* thread is being reused */ #define WQ_FLAG_THREAD_NEWSPI 0x00040000 /* the call is with new SPIs */ #define WQ_FLAG_THREAD_KEVENT 0x00080000 /* thread is response to kevent req */ -#define WQ_FLAG_THREAD_EVENT_MANAGER 0x00100000 /* event manager thread */ +#define WQ_FLAG_THREAD_EVENT_MANAGER 0x00100000 /* event manager thread */ #define WQ_FLAG_THREAD_TSD_BASE_SET 0x00200000 /* tsd base has already been set */ #define WQ_FLAG_THREAD_WORKLOOP 0x00400000 /* workloop thread */ #define WQ_FLAG_THREAD_OUTSIDEQOS 0x00800000 /* thread qos changes should not be sent to kernel */ @@ -93,10 +94,22 @@ int __kqueue_workloop_ctl(uintptr_t cmd, uint64_t options, void *addr, size_t sz); /* SPI flags between WQ and workq_setup_thread in pthread.kext */ +#define WQ_SETUP_NONE 0 #define WQ_SETUP_FIRST_USE 1 #define WQ_SETUP_CLEAR_VOUCHER 2 // was WQ_SETUP_SET_SCHED_CALL 4 #define WQ_SETUP_EXIT_THREAD 8 #endif // __PTHREAD_EXPOSE_INTERNALS__ + +#define WORKQ_DISPATCH_CONFIG_VERSION 2 +#define WORKQ_DISPATCH_MIN_SUPPORTED_VERSION 1 +#define WORKQ_DISPATCH_SUPPORTED_FLAGS 0 +struct workq_dispatch_config { + uint32_t wdc_version; + uint32_t wdc_flags; + uint64_t wdc_queue_serialno_offs; + uint64_t wdc_queue_label_offs; +} __attribute__((packed, aligned(4))); + #endif // _PTHREAD_WORKQUEUE_PRIVATE_H_ diff --git a/bsd/security/audit/audit_bsm.c b/bsd/security/audit/audit_bsm.c index 9610b52dd..18e98c0f5 100644 --- a/bsd/security/audit/audit_bsm.c +++ b/bsd/security/audit/audit_bsm.c @@ -1846,6 +1846,7 @@ kaudit_to_bsm(struct kaudit_record *kar, struct au_record **pau) } break; + case AUE_FSGETPATH_EXTENDED: case AUE_FSGETPATH: if (ARG_IS_VALID(kar, ARG_VALUE32)) { tok = au_to_arg32(3, "volfsid", ar->ar_arg_value32); @@ -2068,7 +2069,7 @@ kaudit_to_bsm(struct kaudit_record *kar, struct au_record **pau) * record is good, 0 otherwise. */ int -bsm_rec_verify(void *rec, int length) +bsm_rec_verify(void *rec, int length, boolean_t kern_events_allowed) { /* Used to partially deserialize the buffer */ struct hdr_tok_partial *hdr; @@ -2105,6 +2106,10 @@ bsm_rec_verify(void *rec, int length) return 0; } + if (!kern_events_allowed && AUE_IS_A_KEVENT(ntohs(hdr->e_type))) { + return 0; + } + return 1; } #endif /* CONFIG_AUDIT */ diff --git a/bsd/security/audit/audit_bsm_domain.c b/bsd/security/audit/audit_bsm_domain.c index 88626e09a..e49285e42 100644 --- a/bsd/security/audit/audit_bsm_domain.c +++ b/bsd/security/audit/audit_bsm_domain.c @@ -1,6 +1,5 @@ /*- - * Copyright (c) 2008-2009 Apple Inc. - * All rights reserved. + * Copyright (c) 2008-2019 Apple Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -45,406 +44,405 @@ struct bsm_domain { #define PF_NO_LOCAL_MAPPING -600 static const struct bsm_domain bsm_domains[] = { - { BSM_PF_UNSPEC, PF_UNSPEC }, - { BSM_PF_LOCAL, PF_LOCAL }, - { BSM_PF_INET, PF_INET }, - { BSM_PF_IMPLINK, + { .bd_bsm_domain = BSM_PF_UNSPEC, .bd_local_domain = PF_UNSPEC }, + { .bd_bsm_domain = BSM_PF_LOCAL, .bd_local_domain = PF_LOCAL }, + { .bd_bsm_domain = BSM_PF_INET, .bd_local_domain = PF_INET }, + { .bd_bsm_domain = BSM_PF_IMPLINK, #ifdef PF_IMPLINK - PF_IMPLINK + .bd_local_domain = PF_IMPLINK #else - PF_NO_LOCAL_MAPPING + .bd_local_domain = PF_NO_LOCAL_MAPPING #endif }, - { BSM_PF_PUP, + { .bd_bsm_domain = BSM_PF_PUP, #ifdef PF_PUP - PF_PUP + .bd_local_domain = PF_PUP #else - PF_NO_LOCAL_MAPPING + .bd_local_domain = PF_NO_LOCAL_MAPPING #endif }, - { BSM_PF_CHAOS, + { .bd_bsm_domain = BSM_PF_CHAOS, #ifdef PF_CHAOS - PF_CHAOS + .bd_local_domain = PF_CHAOS #else - PF_NO_LOCAL_MAPPING + .bd_local_domain = PF_NO_LOCAL_MAPPING #endif }, - { BSM_PF_NS, + { .bd_bsm_domain = BSM_PF_NS, #ifdef PF_NS - PF_NS + .bd_local_domain = PF_NS #else - PF_NO_LOCAL_MAPPING + .bd_local_domain = PF_NO_LOCAL_MAPPING #endif }, - { BSM_PF_NBS, + { .bd_bsm_domain = BSM_PF_NBS, #ifdef PF_NBS - PF_NBS + .bd_local_domain = PF_NBS #else - PF_NO_LOCAL_MAPPING + .bd_local_domain = PF_NO_LOCAL_MAPPING #endif }, - { BSM_PF_ECMA, + { .bd_bsm_domain = BSM_PF_ECMA, #ifdef PF_ECMA - PF_ECMA + .bd_local_domain = PF_ECMA #else - PF_NO_LOCAL_MAPPING + .bd_local_domain = PF_NO_LOCAL_MAPPING #endif }, - { BSM_PF_DATAKIT, + { .bd_bsm_domain = BSM_PF_DATAKIT, #ifdef PF_DATAKIT - PF_DATAKIT + .bd_local_domain = PF_DATAKIT #else - PF_NO_LOCAL_MAPPING + .bd_local_domain = PF_NO_LOCAL_MAPPING #endif }, - { BSM_PF_CCITT, + { .bd_bsm_domain = BSM_PF_CCITT, #ifdef PF_CCITT - PF_CCITT + .bd_local_domain = PF_CCITT #else - PF_NO_LOCAL_MAPPING + .bd_local_domain = PF_NO_LOCAL_MAPPING #endif }, - { BSM_PF_SNA, PF_SNA }, - { BSM_PF_DECnet, PF_DECnet }, - { BSM_PF_DLI, + { .bd_bsm_domain = BSM_PF_SNA, .bd_local_domain = PF_SNA }, + { .bd_bsm_domain = BSM_PF_DECnet, .bd_local_domain = PF_DECnet }, + { .bd_bsm_domain = BSM_PF_DLI, #ifdef PF_DLI - PF_DLI + .bd_local_domain = PF_DLI #else - PF_NO_LOCAL_MAPPING + .bd_local_domain = PF_NO_LOCAL_MAPPING #endif }, - { BSM_PF_LAT, + { .bd_bsm_domain = BSM_PF_LAT, #ifdef PF_LAT - PF_LAT + .bd_local_domain = PF_LAT #else - PF_NO_LOCAL_MAPPING + .bd_local_domain = PF_NO_LOCAL_MAPPING #endif }, - { BSM_PF_HYLINK, + { .bd_bsm_domain = BSM_PF_HYLINK, #ifdef PF_HYLINK - PF_HYLINK + .bd_local_domain = PF_HYLINK #else - PF_NO_LOCAL_MAPPING + .bd_local_domain = PF_NO_LOCAL_MAPPING #endif }, - { BSM_PF_APPLETALK, PF_APPLETALK }, - { BSM_PF_NIT, + { .bd_bsm_domain = BSM_PF_APPLETALK, .bd_local_domain = PF_APPLETALK }, + { .bd_bsm_domain = BSM_PF_NIT, #ifdef PF_NIT - PF_NIT + .bd_local_domain = PF_NIT #else - PF_NO_LOCAL_MAPPING + .bd_local_domain = PF_NO_LOCAL_MAPPING #endif }, - { BSM_PF_802, + { .bd_bsm_domain = BSM_PF_802, #ifdef PF_802 - PF_802 + .bd_local_domain = PF_802 #else - PF_NO_LOCAL_MAPPING + .bd_local_domain = PF_NO_LOCAL_MAPPING #endif }, - { BSM_PF_OSI, + { .bd_bsm_domain = BSM_PF_OSI, #ifdef PF_OSI - PF_OSI + .bd_local_domain = PF_OSI #else - PF_NO_LOCAL_MAPPING + .bd_local_domain = PF_NO_LOCAL_MAPPING #endif }, - { BSM_PF_X25, + { .bd_bsm_domain = BSM_PF_X25, #ifdef PF_X25 - PF_X25 + .bd_local_domain = PF_X25 #else - PF_NO_LOCAL_MAPPING + .bd_local_domain = PF_NO_LOCAL_MAPPING #endif }, - { BSM_PF_OSINET, + { .bd_bsm_domain = BSM_PF_OSINET, #ifdef PF_OSINET - PF_OSINET + .bd_local_domain = PF_OSINET #else - PF_NO_LOCAL_MAPPING + .bd_local_domain = PF_NO_LOCAL_MAPPING #endif }, - { BSM_PF_GOSIP, + { .bd_bsm_domain = BSM_PF_GOSIP, #ifdef PF_GOSIP - PF_GOSIP + .bd_local_domain = PF_GOSIP #else - PF_NO_LOCAL_MAPPING + .bd_local_domain = PF_NO_LOCAL_MAPPING #endif }, - { BSM_PF_IPX, PF_IPX }, - { BSM_PF_ROUTE, PF_ROUTE }, - { BSM_PF_LINK, + { .bd_bsm_domain = BSM_PF_IPX, .bd_local_domain = PF_IPX }, + { .bd_bsm_domain = BSM_PF_ROUTE, .bd_local_domain = PF_ROUTE }, + { .bd_bsm_domain = BSM_PF_LINK, #ifdef PF_LINK - PF_LINK + .bd_local_domain = PF_LINK #else - PF_NO_LOCAL_MAPPING + .bd_local_domain = PF_NO_LOCAL_MAPPING #endif }, - { BSM_PF_INET6, PF_INET6 }, - { BSM_PF_KEY, PF_KEY }, - { BSM_PF_NCA, + { .bd_bsm_domain = BSM_PF_KEY, .bd_local_domain = PF_KEY }, + { .bd_bsm_domain = BSM_PF_NCA, #ifdef PF_NCA - PF_NCA + .bd_local_domain = PF_NCA #else - PF_NO_LOCAL_MAPPING + .bd_local_domain = PF_NO_LOCAL_MAPPING #endif }, - { BSM_PF_POLICY, + { .bd_bsm_domain = BSM_PF_POLICY, #ifdef PF_POLICY - PF_POLICY + .bd_local_domain = PF_POLICY #else - PF_NO_LOCAL_MAPPING + .bd_local_domain = PF_NO_LOCAL_MAPPING #endif }, - { BSM_PF_INET_OFFLOAD, + { .bd_bsm_domain = BSM_PF_INET_OFFLOAD, #ifdef PF_INET_OFFLOAD - PF_INET_OFFLOAD + .bd_local_domain = PF_INET_OFFLOAD #else - PF_NO_LOCAL_MAPPING + .bd_local_domain = PF_NO_LOCAL_MAPPING #endif }, - { BSM_PF_NETBIOS, + { .bd_bsm_domain = BSM_PF_NETBIOS, #ifdef PF_NETBIOS - PF_NETBIOS + .bd_local_domain = PF_NETBIOS #else - PF_NO_LOCAL_MAPPING + .bd_local_domain = PF_NO_LOCAL_MAPPING #endif }, - { BSM_PF_ISO, + { .bd_bsm_domain = BSM_PF_ISO, #ifdef PF_ISO - PF_ISO + .bd_local_domain = PF_ISO #else - PF_NO_LOCAL_MAPPING + .bd_local_domain = PF_NO_LOCAL_MAPPING #endif }, - { BSM_PF_XTP, + { .bd_bsm_domain = BSM_PF_XTP, #ifdef PF_XTP - PF_XTP + .bd_local_domain = PF_XTP #else - PF_NO_LOCAL_MAPPING + .bd_local_domain = PF_NO_LOCAL_MAPPING #endif }, - { BSM_PF_COIP, + { .bd_bsm_domain = BSM_PF_COIP, #ifdef PF_COIP - PF_COIP + .bd_local_domain = PF_COIP #else - PF_NO_LOCAL_MAPPING + .bd_local_domain = PF_NO_LOCAL_MAPPING #endif }, - { BSM_PF_CNT, + { .bd_bsm_domain = BSM_PF_CNT, #ifdef PF_CNT - PF_CNT + .bd_local_domain = PF_CNT #else - PF_NO_LOCAL_MAPPING + .bd_local_domain = PF_NO_LOCAL_MAPPING #endif }, - { BSM_PF_RTIP, + { .bd_bsm_domain = BSM_PF_RTIP, #ifdef PF_RTIP - PF_RTIP + .bd_local_domain = PF_RTIP #else - PF_NO_LOCAL_MAPPING + .bd_local_domain = PF_NO_LOCAL_MAPPING #endif }, - { BSM_PF_SIP, + { .bd_bsm_domain = BSM_PF_SIP, #ifdef PF_SIP - PF_SIP + .bd_local_domain = PF_SIP #else - PF_NO_LOCAL_MAPPING + .bd_local_domain = PF_NO_LOCAL_MAPPING #endif }, - { BSM_PF_PIP, + { .bd_bsm_domain = BSM_PF_PIP, #ifdef PF_PIP - PF_PIP + .bd_local_domain = PF_PIP #else - PF_NO_LOCAL_MAPPING + .bd_local_domain = PF_NO_LOCAL_MAPPING #endif }, - { BSM_PF_ISDN, + { .bd_bsm_domain = BSM_PF_ISDN, #ifdef PF_ISDN - PF_ISDN + .bd_local_domain = PF_ISDN #else - PF_NO_LOCAL_MAPPING + .bd_local_domain = PF_NO_LOCAL_MAPPING #endif }, - { BSM_PF_E164, + { .bd_bsm_domain = BSM_PF_E164, #ifdef PF_E164 - PF_E164 + .bd_local_domain = PF_E164 #else - PF_NO_LOCAL_MAPPING + .bd_local_domain = PF_NO_LOCAL_MAPPING #endif }, - { BSM_PF_NATM, + { .bd_bsm_domain = BSM_PF_NATM, #ifdef PF_NATM - PF_NATM + .bd_local_domain = PF_NATM #else - PF_NO_LOCAL_MAPPING + .bd_local_domain = PF_NO_LOCAL_MAPPING #endif }, - { BSM_PF_ATM, + { .bd_bsm_domain = BSM_PF_ATM, #ifdef PF_ATM - PF_ATM + .bd_local_domain = PF_ATM #else - PF_NO_LOCAL_MAPPING + .bd_local_domain = PF_NO_LOCAL_MAPPING #endif }, - { BSM_PF_NETGRAPH, + { .bd_bsm_domain = BSM_PF_NETGRAPH, #ifdef PF_NETGRAPH - PF_NETGRAPH + .bd_local_domain = PF_NETGRAPH #else - PF_NO_LOCAL_MAPPING + .bd_local_domain = PF_NO_LOCAL_MAPPING #endif }, - { BSM_PF_SLOW, + { .bd_bsm_domain = BSM_PF_SLOW, #ifdef PF_SLOW - PF_SLOW + .bd_local_domain = PF_SLOW #else - PF_NO_LOCAL_MAPPING + .bd_local_domain = PF_NO_LOCAL_MAPPING #endif }, - { BSM_PF_SCLUSTER, + { .bd_bsm_domain = BSM_PF_SCLUSTER, #ifdef PF_SCLUSTER - PF_SCLUSTER + .bd_local_domain = PF_SCLUSTER #else - PF_NO_LOCAL_MAPPING + .bd_local_domain = PF_NO_LOCAL_MAPPING #endif }, - { BSM_PF_ARP, + { .bd_bsm_domain = BSM_PF_ARP, #ifdef PF_ARP - PF_ARP + .bd_local_domain = PF_ARP #else - PF_NO_LOCAL_MAPPING + .bd_local_domain = PF_NO_LOCAL_MAPPING #endif }, - { BSM_PF_BLUETOOTH, + { .bd_bsm_domain = BSM_PF_BLUETOOTH, #ifdef PF_BLUETOOTH - PF_BLUETOOTH + .bd_local_domain = PF_BLUETOOTH #else - PF_NO_LOCAL_MAPPING + .bd_local_domain = PF_NO_LOCAL_MAPPING #endif }, - { BSM_PF_IEEE80211, + { .bd_bsm_domain = BSM_PF_IEEE80211, #ifdef PF_IEEE80211 - PF_IEEE80211 + .bd_local_domain = PF_IEEE80211 #else - PF_NO_LOCAL_MAPPING + .bd_local_domain = PF_NO_LOCAL_MAPPING #endif }, - { BSM_PF_AX25, + { .bd_bsm_domain = BSM_PF_AX25, #ifdef PF_AX25 - PF_AX25 + .bd_local_domain = PF_AX25 #else - PF_NO_LOCAL_MAPPING + .bd_local_domain = PF_NO_LOCAL_MAPPING #endif }, - { BSM_PF_ROSE, + { .bd_bsm_domain = BSM_PF_ROSE, #ifdef PF_ROSE - PF_ROSE + .bd_local_domain = PF_ROSE #else - PF_NO_LOCAL_MAPPING + .bd_local_domain = PF_NO_LOCAL_MAPPING #endif }, - { BSM_PF_NETBEUI, + { .bd_bsm_domain = BSM_PF_NETBEUI, #ifdef PF_NETBEUI - PF_NETBEUI + .bd_local_domain = PF_NETBEUI #else - PF_NO_LOCAL_MAPPING + .bd_local_domain = PF_NO_LOCAL_MAPPING #endif }, - { BSM_PF_SECURITY, + { .bd_bsm_domain = BSM_PF_SECURITY, #ifdef PF_SECURITY - PF_SECURITY + .bd_local_domain = PF_SECURITY #else - PF_NO_LOCAL_MAPPING + .bd_local_domain = PF_NO_LOCAL_MAPPING #endif }, - { BSM_PF_PACKET, + { .bd_bsm_domain = BSM_PF_PACKET, #ifdef PF_PACKET - PF_PACKET + .bd_local_domain = PF_PACKET #else - PF_NO_LOCAL_MAPPING + .bd_local_domain = PF_NO_LOCAL_MAPPING #endif }, - { BSM_PF_ASH, + { .bd_bsm_domain = BSM_PF_ASH, #ifdef PF_ASH - PF_ASH + .bd_local_domain = PF_ASH #else - PF_NO_LOCAL_MAPPING + .bd_local_domain = PF_NO_LOCAL_MAPPING #endif }, - { BSM_PF_ECONET, + { .bd_bsm_domain = BSM_PF_ECONET, #ifdef PF_ECONET - PF_ECONET + .bd_local_domain = PF_ECONET #else - PF_NO_LOCAL_MAPPING + .bd_local_domain = PF_NO_LOCAL_MAPPING #endif }, - { BSM_PF_ATMSVC, + { .bd_bsm_domain = BSM_PF_ATMSVC, #ifdef PF_ATMSVC - PF_ATMSVC + .bd_local_domain = PF_ATMSVC #else - PF_NO_LOCAL_MAPPING + .bd_local_domain = PF_NO_LOCAL_MAPPING #endif }, - { BSM_PF_IRDA, + { .bd_bsm_domain = BSM_PF_IRDA, #ifdef PF_IRDA - PF_IRDA + .bd_local_domain = PF_IRDA #else - PF_NO_LOCAL_MAPPING + .bd_local_domain = PF_NO_LOCAL_MAPPING #endif }, - { BSM_PF_PPPOX, + { .bd_bsm_domain = BSM_PF_PPPOX, #ifdef PF_PPPOX - PF_PPPOX + .bd_local_domain = PF_PPPOX #else - PF_NO_LOCAL_MAPPING + .bd_local_domain = PF_NO_LOCAL_MAPPING #endif }, - { BSM_PF_WANPIPE, + { .bd_bsm_domain = BSM_PF_WANPIPE, #ifdef PF_WANPIPE - PF_WANPIPE + .bd_local_domain = PF_WANPIPE #else - PF_NO_LOCAL_MAPPING + .bd_local_domain = PF_NO_LOCAL_MAPPING #endif }, - { BSM_PF_LLC, + { .bd_bsm_domain = BSM_PF_LLC, #ifdef PF_LLC - PF_LLC + .bd_local_domain = PF_LLC #else - PF_NO_LOCAL_MAPPING + .bd_local_domain = PF_NO_LOCAL_MAPPING #endif }, - { BSM_PF_CAN, + { .bd_bsm_domain = BSM_PF_CAN, #ifdef PF_CAN - PF_CAN + .bd_local_domain = PF_CAN #else - PF_NO_LOCAL_MAPPING + .bd_local_domain = PF_NO_LOCAL_MAPPING #endif }, - { BSM_PF_TIPC, + { .bd_bsm_domain = BSM_PF_TIPC, #ifdef PF_TIPC - PF_TIPC + .bd_local_domain = PF_TIPC #else - PF_NO_LOCAL_MAPPING + .bd_local_domain = PF_NO_LOCAL_MAPPING #endif }, - { BSM_PF_IUCV, + { .bd_bsm_domain = BSM_PF_IUCV, #ifdef PF_IUCV - PF_IUCV + .bd_local_domain = PF_IUCV #else - PF_NO_LOCAL_MAPPING + .bd_local_domain = PF_NO_LOCAL_MAPPING #endif }, - { BSM_PF_RXRPC, + { .bd_bsm_domain = BSM_PF_RXRPC, #ifdef PF_RXRPC - PF_RXRPC + .bd_local_domain = PF_RXRPC #else - PF_NO_LOCAL_MAPPING + .bd_local_domain = PF_NO_LOCAL_MAPPING #endif }, - { BSM_PF_PHONET, + { .bd_bsm_domain = BSM_PF_PHONET, #ifdef PF_PHONET - PF_PHONET + .bd_local_domain = PF_PHONET #else - PF_NO_LOCAL_MAPPING + .bd_local_domain = PF_NO_LOCAL_MAPPING #endif }, }; diff --git a/bsd/security/audit/audit_bsm_errno.c b/bsd/security/audit/audit_bsm_errno.c index ecdbc71dc..a1b1b77fb 100644 --- a/bsd/security/audit/audit_bsm_errno.c +++ b/bsd/security/audit/audit_bsm_errno.c @@ -1,6 +1,5 @@ /*- - * Copyright (c) 2008-2011 Apple Inc. - * All rights reserved. + * Copyright (c) 2008-2019 Apple Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -58,7 +57,7 @@ struct bsm_errno { #define ERRNO_NO_LOCAL_MAPPING -600 #if !defined(KERNEL) && !defined(_KERNEL) -#define ES(x) x +#define ES(x) .be_strerror = x #else #define ES(x) #endif @@ -79,511 +78,511 @@ struct bsm_errno { * string using strerror(3). */ static const struct bsm_errno bsm_errnos[] = { - { BSM_ERRNO_ESUCCESS, 0, ES("Success") }, - { BSM_ERRNO_EPERM, EPERM, ES("Operation not permitted") }, - { BSM_ERRNO_ENOENT, ENOENT, ES("No such file or directory") }, - { BSM_ERRNO_ESRCH, ESRCH, ES("No such process") }, - { BSM_ERRNO_EINTR, EINTR, ES("Interrupted system call") }, - { BSM_ERRNO_EIO, EIO, ES("Input/output error") }, - { BSM_ERRNO_ENXIO, ENXIO, ES("Device not configured") }, - { BSM_ERRNO_E2BIG, E2BIG, ES("Argument list too long") }, - { BSM_ERRNO_ENOEXEC, ENOEXEC, ES("Exec format error") }, - { BSM_ERRNO_EBADF, EBADF, ES("Bad file descriptor") }, - { BSM_ERRNO_ECHILD, ECHILD, ES("No child processes") }, - { BSM_ERRNO_EAGAIN, EAGAIN, ES("Resource temporarily unavailable") }, - { BSM_ERRNO_ENOMEM, ENOMEM, ES("Cannot allocate memory") }, - { BSM_ERRNO_EACCES, EACCES, ES("Permission denied") }, - { BSM_ERRNO_EFAULT, EFAULT, ES("Bad address") }, - { BSM_ERRNO_ENOTBLK, ENOTBLK, ES("Block device required") }, - { BSM_ERRNO_EBUSY, EBUSY, ES("Device busy") }, - { BSM_ERRNO_EEXIST, EEXIST, ES("File exists") }, - { BSM_ERRNO_EXDEV, EXDEV, ES("Cross-device link") }, - { BSM_ERRNO_ENODEV, ENODEV, ES("Operation not supported by device") }, - { BSM_ERRNO_ENOTDIR, ENOTDIR, ES("Not a directory") }, - { BSM_ERRNO_EISDIR, EISDIR, ES("Is a directory") }, - { BSM_ERRNO_EINVAL, EINVAL, ES("Invalid argument") }, - { BSM_ERRNO_ENFILE, ENFILE, ES("Too many open files in system") }, - { BSM_ERRNO_EMFILE, EMFILE, ES("Too many open files") }, - { BSM_ERRNO_ENOTTY, ENOTTY, ES("Inappropriate ioctl for device") }, - { BSM_ERRNO_ETXTBSY, ETXTBSY, ES("Text file busy") }, - { BSM_ERRNO_EFBIG, EFBIG, ES("File too large") }, - { BSM_ERRNO_ENOSPC, ENOSPC, ES("No space left on device") }, - { BSM_ERRNO_ESPIPE, ESPIPE, ES("Illegal seek") }, - { BSM_ERRNO_EROFS, EROFS, ES("Read-only file system") }, - { BSM_ERRNO_EMLINK, EMLINK, ES("Too many links") }, - { BSM_ERRNO_EPIPE, EPIPE, ES("Broken pipe") }, - { BSM_ERRNO_EDOM, EDOM, ES("Numerical argument out of domain") }, - { BSM_ERRNO_ERANGE, ERANGE, ES("Result too large") }, - { BSM_ERRNO_ENOMSG, ENOMSG, ES("No message of desired type") }, - { BSM_ERRNO_EIDRM, EIDRM, ES("Identifier removed") }, - { BSM_ERRNO_ECHRNG, + { .be_bsm_errno = BSM_ERRNO_ESUCCESS, .be_local_errno = 0, ES("Success") }, + { .be_bsm_errno = BSM_ERRNO_EPERM, .be_local_errno = EPERM, ES("Operation not permitted") }, + { .be_bsm_errno = BSM_ERRNO_ENOENT, .be_local_errno = ENOENT, ES("No such file or directory") }, + { .be_bsm_errno = BSM_ERRNO_ESRCH, .be_local_errno = ESRCH, ES("No such process") }, + { .be_bsm_errno = BSM_ERRNO_EINTR, .be_local_errno = EINTR, ES("Interrupted system call") }, + { .be_bsm_errno = BSM_ERRNO_EIO, .be_local_errno = EIO, ES("Input/output error") }, + { .be_bsm_errno = BSM_ERRNO_ENXIO, .be_local_errno = ENXIO, ES("Device not configured") }, + { .be_bsm_errno = BSM_ERRNO_E2BIG, .be_local_errno = E2BIG, ES("Argument list too long") }, + { .be_bsm_errno = BSM_ERRNO_ENOEXEC, .be_local_errno = ENOEXEC, ES("Exec format error") }, + { .be_bsm_errno = BSM_ERRNO_EBADF, .be_local_errno = EBADF, ES("Bad file descriptor") }, + { .be_bsm_errno = BSM_ERRNO_ECHILD, .be_local_errno = ECHILD, ES("No child processes") }, + { .be_bsm_errno = BSM_ERRNO_EAGAIN, .be_local_errno = EAGAIN, ES("Resource temporarily unavailable") }, + { .be_bsm_errno = BSM_ERRNO_ENOMEM, .be_local_errno = ENOMEM, ES("Cannot allocate memory") }, + { .be_bsm_errno = BSM_ERRNO_EACCES, .be_local_errno = EACCES, ES("Permission denied") }, + { .be_bsm_errno = BSM_ERRNO_EFAULT, .be_local_errno = EFAULT, ES("Bad address") }, + { .be_bsm_errno = BSM_ERRNO_ENOTBLK, .be_local_errno = ENOTBLK, ES("Block device required") }, + { .be_bsm_errno = BSM_ERRNO_EBUSY, .be_local_errno = EBUSY, ES("Device busy") }, + { .be_bsm_errno = BSM_ERRNO_EEXIST, .be_local_errno = EEXIST, ES("File exists") }, + { .be_bsm_errno = BSM_ERRNO_EXDEV, .be_local_errno = EXDEV, ES("Cross-device link") }, + { .be_bsm_errno = BSM_ERRNO_ENODEV, .be_local_errno = ENODEV, ES("Operation not supported by device") }, + { .be_bsm_errno = BSM_ERRNO_ENOTDIR, .be_local_errno = ENOTDIR, ES("Not a directory") }, + { .be_bsm_errno = BSM_ERRNO_EISDIR, .be_local_errno = EISDIR, ES("Is a directory") }, + { .be_bsm_errno = BSM_ERRNO_EINVAL, .be_local_errno = EINVAL, ES("Invalid argument") }, + { .be_bsm_errno = BSM_ERRNO_ENFILE, .be_local_errno = ENFILE, ES("Too many open files in system") }, + { .be_bsm_errno = BSM_ERRNO_EMFILE, .be_local_errno = EMFILE, ES("Too many open files") }, + { .be_bsm_errno = BSM_ERRNO_ENOTTY, .be_local_errno = ENOTTY, ES("Inappropriate ioctl for device") }, + { .be_bsm_errno = BSM_ERRNO_ETXTBSY, .be_local_errno = ETXTBSY, ES("Text file busy") }, + { .be_bsm_errno = BSM_ERRNO_EFBIG, .be_local_errno = EFBIG, ES("File too large") }, + { .be_bsm_errno = BSM_ERRNO_ENOSPC, .be_local_errno = ENOSPC, ES("No space left on device") }, + { .be_bsm_errno = BSM_ERRNO_ESPIPE, .be_local_errno = ESPIPE, ES("Illegal seek") }, + { .be_bsm_errno = BSM_ERRNO_EROFS, .be_local_errno = EROFS, ES("Read-only file system") }, + { .be_bsm_errno = BSM_ERRNO_EMLINK, .be_local_errno = EMLINK, ES("Too many links") }, + { .be_bsm_errno = BSM_ERRNO_EPIPE, .be_local_errno = EPIPE, ES("Broken pipe") }, + { .be_bsm_errno = BSM_ERRNO_EDOM, .be_local_errno = EDOM, ES("Numerical argument out of domain") }, + { .be_bsm_errno = BSM_ERRNO_ERANGE, .be_local_errno = ERANGE, ES("Result too large") }, + { .be_bsm_errno = BSM_ERRNO_ENOMSG, .be_local_errno = ENOMSG, ES("No message of desired type") }, + { .be_bsm_errno = BSM_ERRNO_EIDRM, .be_local_errno = EIDRM, ES("Identifier removed") }, + { .be_bsm_errno = BSM_ERRNO_ECHRNG, #ifdef ECHRNG - ECHRNG, + .be_local_errno = ECHRNG, #else - ERRNO_NO_LOCAL_MAPPING, + .be_local_errno = ERRNO_NO_LOCAL_MAPPING, #endif ES("Channel number out of range") }, - { BSM_ERRNO_EL2NSYNC, + { .be_bsm_errno = BSM_ERRNO_EL2NSYNC, #ifdef EL2NSYNC - EL2NSYNC, + .be_local_errno = EL2NSYNC, #else - ERRNO_NO_LOCAL_MAPPING, + .be_local_errno = ERRNO_NO_LOCAL_MAPPING, #endif ES("Level 2 not synchronized") }, - { BSM_ERRNO_EL3HLT, + { .be_bsm_errno = BSM_ERRNO_EL3HLT, #ifdef EL3HLT - EL3HLT, + .be_local_errno = EL3HLT, #else - ERRNO_NO_LOCAL_MAPPING, + .be_local_errno = ERRNO_NO_LOCAL_MAPPING, #endif ES("Level 3 halted") }, - { BSM_ERRNO_EL3RST, + { .be_bsm_errno = BSM_ERRNO_EL3RST, #ifdef EL3RST - EL3RST, + .be_local_errno = EL3RST, #else - ERRNO_NO_LOCAL_MAPPING, + .be_local_errno = ERRNO_NO_LOCAL_MAPPING, #endif ES("Level 3 reset") }, - { BSM_ERRNO_ELNRNG, + { .be_bsm_errno = BSM_ERRNO_ELNRNG, #ifdef ELNRNG - ELNRNG, + .be_local_errno = ELNRNG, #else - ERRNO_NO_LOCAL_MAPPING, + .be_local_errno = ERRNO_NO_LOCAL_MAPPING, #endif ES("Link number out of range") }, - { BSM_ERRNO_EUNATCH, + { .be_bsm_errno = BSM_ERRNO_EUNATCH, #ifdef EUNATCH - EUNATCH, + .be_local_errno = EUNATCH, #else - ERRNO_NO_LOCAL_MAPPING, + .be_local_errno = ERRNO_NO_LOCAL_MAPPING, #endif ES("Protocol driver not attached") }, - { BSM_ERRNO_ENOCSI, + { .be_bsm_errno = BSM_ERRNO_ENOCSI, #ifdef ENOCSI - ENOCSI, + .be_local_errno = ENOCSI, #else - ERRNO_NO_LOCAL_MAPPING, + .be_local_errno = ERRNO_NO_LOCAL_MAPPING, #endif ES("No CSI structure available") }, - { BSM_ERRNO_EL2HLT, + { .be_bsm_errno = BSM_ERRNO_EL2HLT, #ifdef EL2HLT - EL2HLT, + .be_local_errno = EL2HLT, #else - ERRNO_NO_LOCAL_MAPPING, + .be_local_errno = ERRNO_NO_LOCAL_MAPPING, #endif ES("Level 2 halted") }, - { BSM_ERRNO_EDEADLK, EDEADLK, ES("Resource deadlock avoided") }, - { BSM_ERRNO_ENOLCK, ENOLCK, ES("No locks available") }, - { BSM_ERRNO_ECANCELED, ECANCELED, ES("Operation canceled") }, - { BSM_ERRNO_ENOTSUP, ENOTSUP, ES("Operation not supported") }, - { BSM_ERRNO_EDQUOT, EDQUOT, ES("Disc quota exceeded") }, - { BSM_ERRNO_EBADE, + { .be_bsm_errno = BSM_ERRNO_EDEADLK, .be_local_errno = EDEADLK, ES("Resource deadlock avoided") }, + { .be_bsm_errno = BSM_ERRNO_ENOLCK, .be_local_errno = ENOLCK, ES("No locks available") }, + { .be_bsm_errno = BSM_ERRNO_ECANCELED, .be_local_errno = ECANCELED, ES("Operation canceled") }, + { .be_bsm_errno = BSM_ERRNO_ENOTSUP, .be_local_errno = ENOTSUP, ES("Operation not supported") }, + { .be_bsm_errno = BSM_ERRNO_EDQUOT, .be_local_errno = EDQUOT, ES("Disc quota exceeded") }, + { .be_bsm_errno = BSM_ERRNO_EBADE, #ifdef EBADE - EBADE, + .be_local_errno = EBADE, #else - ERRNO_NO_LOCAL_MAPPING, + .be_local_errno = ERRNO_NO_LOCAL_MAPPING, #endif ES("Invalid exchange") }, - { BSM_ERRNO_EBADR, + { .be_bsm_errno = BSM_ERRNO_EBADR, #ifdef EBADR - EBADR, + .be_local_errno = EBADR, #else - ERRNO_NO_LOCAL_MAPPING, + .be_local_errno = ERRNO_NO_LOCAL_MAPPING, #endif ES("Invalid request descriptor") }, - { BSM_ERRNO_EXFULL, + { .be_bsm_errno = BSM_ERRNO_EXFULL, #ifdef EXFULL - EXFULL, + .be_local_errno = EXFULL, #else - ERRNO_NO_LOCAL_MAPPING, + .be_local_errno = ERRNO_NO_LOCAL_MAPPING, #endif ES("Exchange full") }, - { BSM_ERRNO_ENOANO, + { .be_bsm_errno = BSM_ERRNO_ENOANO, #ifdef ENOANO - ENOANO, + .be_local_errno = ENOANO, #else - ERRNO_NO_LOCAL_MAPPING, + .be_local_errno = ERRNO_NO_LOCAL_MAPPING, #endif ES("No anode") }, - { BSM_ERRNO_EBADRQC, + { .be_bsm_errno = BSM_ERRNO_EBADRQC, #ifdef EBADRQC - EBADRQC, + .be_local_errno = EBADRQC, #else - ERRNO_NO_LOCAL_MAPPING, + .be_local_errno = ERRNO_NO_LOCAL_MAPPING, #endif ES("Invalid request descriptor") }, - { BSM_ERRNO_EBADSLT, + { .be_bsm_errno = BSM_ERRNO_EBADSLT, #ifdef EBADSLT - EBADSLT, + .be_local_errno = EBADSLT, #else - ERRNO_NO_LOCAL_MAPPING, + .be_local_errno = ERRNO_NO_LOCAL_MAPPING, #endif ES("Invalid slot") }, - { BSM_ERRNO_EDEADLOCK, + { .be_bsm_errno = BSM_ERRNO_EDEADLOCK, #ifdef EDEADLOCK - EDEADLOCK, + .be_local_errno = EDEADLOCK, #else - ERRNO_NO_LOCAL_MAPPING, + .be_local_errno = ERRNO_NO_LOCAL_MAPPING, #endif ES("Resource deadlock avoided") }, - { BSM_ERRNO_EBFONT, + { .be_bsm_errno = BSM_ERRNO_EBFONT, #ifdef EBFONT - EBFONT, + .be_local_errno = EBFONT, #else - ERRNO_NO_LOCAL_MAPPING, + .be_local_errno = ERRNO_NO_LOCAL_MAPPING, #endif ES("Bad font file format") }, - { BSM_ERRNO_EOWNERDEAD, + { .be_bsm_errno = BSM_ERRNO_EOWNERDEAD, #ifdef EOWNERDEAD - EOWNERDEAD, + .be_local_errno = EOWNERDEAD, #else - ERRNO_NO_LOCAL_MAPPING, + .be_local_errno = ERRNO_NO_LOCAL_MAPPING, #endif ES("Process died with the lock") }, - { BSM_ERRNO_ENOTRECOVERABLE, + { .be_bsm_errno = BSM_ERRNO_ENOTRECOVERABLE, #ifdef ENOTRECOVERABLE - ENOTRECOVERABLE, + .be_local_errno = ENOTRECOVERABLE, #else - ERRNO_NO_LOCAL_MAPPING, + .be_local_errno = ERRNO_NO_LOCAL_MAPPING, #endif ES("Lock is not recoverable") }, - { BSM_ERRNO_ENOSTR, + { .be_bsm_errno = BSM_ERRNO_ENOSTR, #ifdef ENOSTR - ENOSTR, + .be_local_errno = ENOSTR, #else - ERRNO_NO_LOCAL_MAPPING, + .be_local_errno = ERRNO_NO_LOCAL_MAPPING, #endif ES("Device not a stream") }, - { BSM_ERRNO_ENONET, + { .be_bsm_errno = BSM_ERRNO_ENONET, #ifdef ENONET - ENONET, + .be_local_errno = ENONET, #else - ERRNO_NO_LOCAL_MAPPING, + .be_local_errno = ERRNO_NO_LOCAL_MAPPING, #endif ES("Machine is not on the network") }, - { BSM_ERRNO_ENOPKG, + { .be_bsm_errno = BSM_ERRNO_ENOPKG, #ifdef ENOPKG - ENOPKG, + .be_local_errno = ENOPKG, #else - ERRNO_NO_LOCAL_MAPPING, + .be_local_errno = ERRNO_NO_LOCAL_MAPPING, #endif ES("Package not installed") }, - { BSM_ERRNO_EREMOTE, EREMOTE, + { .be_bsm_errno = BSM_ERRNO_EREMOTE, .be_local_errno = EREMOTE, ES("Too many levels of remote in path") }, - { BSM_ERRNO_ENOLINK, + { .be_bsm_errno = BSM_ERRNO_ENOLINK, #ifdef ENOLINK - ENOLINK, + .be_local_errno = ENOLINK, #else - ERRNO_NO_LOCAL_MAPPING, + .be_local_errno = ERRNO_NO_LOCAL_MAPPING, #endif ES("Link has been severed") }, - { BSM_ERRNO_EADV, + { .be_bsm_errno = BSM_ERRNO_EADV, #ifdef EADV - EADV, + .be_local_errno = EADV, #else - ERRNO_NO_LOCAL_MAPPING, + .be_local_errno = ERRNO_NO_LOCAL_MAPPING, #endif ES("Advertise error") }, - { BSM_ERRNO_ESRMNT, + { .be_bsm_errno = BSM_ERRNO_ESRMNT, #ifdef ESRMNT - ESRMNT, + .be_local_errno = ESRMNT, #else - ERRNO_NO_LOCAL_MAPPING, + .be_local_errno = ERRNO_NO_LOCAL_MAPPING, #endif ES("srmount error") }, - { BSM_ERRNO_ECOMM, + { .be_bsm_errno = BSM_ERRNO_ECOMM, #ifdef ECOMM - ECOMM, + .be_local_errno = ECOMM, #else - ERRNO_NO_LOCAL_MAPPING, + .be_local_errno = ERRNO_NO_LOCAL_MAPPING, #endif ES("Communication error on send") }, - { BSM_ERRNO_EPROTO, + { .be_bsm_errno = BSM_ERRNO_EPROTO, #ifdef EPROTO - EPROTO, + .be_local_errno = EPROTO, #else - ERRNO_NO_LOCAL_MAPPING, + .be_local_errno = ERRNO_NO_LOCAL_MAPPING, #endif ES("Protocol error") }, - { BSM_ERRNO_ELOCKUNMAPPED, + { .be_bsm_errno = BSM_ERRNO_ELOCKUNMAPPED, #ifdef ELOCKUNMAPPED - ELOCKUNMAPPED, + .be_local_errno = ELOCKUNMAPPED, #else - ERRNO_NO_LOCAL_MAPPING, + .be_local_errno = ERRNO_NO_LOCAL_MAPPING, #endif ES("Locked lock was unmapped") }, - { BSM_ERRNO_ENOTACTIVE, + { .be_bsm_errno = BSM_ERRNO_ENOTACTIVE, #ifdef ENOTACTIVE - ENOTACTIVE, + .be_local_errno = ENOTACTIVE, #else - ERRNO_NO_LOCAL_MAPPING, + .be_local_errno = ERRNO_NO_LOCAL_MAPPING, #endif ES("Facility is not active") }, - { BSM_ERRNO_EMULTIHOP, + { .be_bsm_errno = BSM_ERRNO_EMULTIHOP, #ifdef EMULTIHOP - EMULTIHOP, + .be_local_errno = EMULTIHOP, #else - ERRNO_NO_LOCAL_MAPPING, + .be_local_errno = ERRNO_NO_LOCAL_MAPPING, #endif ES("Multihop attempted") }, - { BSM_ERRNO_EBADMSG, + { .be_bsm_errno = BSM_ERRNO_EBADMSG, #ifdef EBADMSG - EBADMSG, + .be_local_errno = EBADMSG, #else - ERRNO_NO_LOCAL_MAPPING, + .be_local_errno = ERRNO_NO_LOCAL_MAPPING, #endif ES("Bad message") }, - { BSM_ERRNO_ENAMETOOLONG, ENAMETOOLONG, ES("File name too long") }, - { BSM_ERRNO_EOVERFLOW, EOVERFLOW, + { .be_bsm_errno = BSM_ERRNO_ENAMETOOLONG, .be_local_errno = ENAMETOOLONG, ES("File name too long") }, + { .be_bsm_errno = BSM_ERRNO_EOVERFLOW, .be_local_errno = EOVERFLOW, ES("Value too large to be stored in data type") }, - { BSM_ERRNO_ENOTUNIQ, + { .be_bsm_errno = BSM_ERRNO_ENOTUNIQ, #ifdef ENOTUNIQ - ENOTUNIQ, + .be_local_errno = ENOTUNIQ, #else - ERRNO_NO_LOCAL_MAPPING, + .be_local_errno = ERRNO_NO_LOCAL_MAPPING, #endif ES("Given log name not unique") }, - { BSM_ERRNO_EBADFD, + { .be_bsm_errno = BSM_ERRNO_EBADFD, #ifdef EBADFD - EBADFD, + .be_local_errno = EBADFD, #else - ERRNO_NO_LOCAL_MAPPING, + .be_local_errno = ERRNO_NO_LOCAL_MAPPING, #endif ES("Given f.d. invalid for this operation") }, - { BSM_ERRNO_EREMCHG, + { .be_bsm_errno = BSM_ERRNO_EREMCHG, #ifdef EREMCHG - EREMCHG, + .be_local_errno = EREMCHG, #else - ERRNO_NO_LOCAL_MAPPING, + .be_local_errno = ERRNO_NO_LOCAL_MAPPING, #endif ES("Remote address changed") }, - { BSM_ERRNO_ELIBACC, + { .be_bsm_errno = BSM_ERRNO_ELIBACC, #ifdef ELIBACC - ELIBACC, + .be_local_errno = ELIBACC, #else - ERRNO_NO_LOCAL_MAPPING, + .be_local_errno = ERRNO_NO_LOCAL_MAPPING, #endif ES("Can't access a needed shared lib") }, - { BSM_ERRNO_ELIBBAD, + { .be_bsm_errno = BSM_ERRNO_ELIBBAD, #ifdef ELIBBAD - ELIBBAD, + .be_local_errno = ELIBBAD, #else - ERRNO_NO_LOCAL_MAPPING, + .be_local_errno = ERRNO_NO_LOCAL_MAPPING, #endif ES("Accessing a corrupted shared lib") }, - { BSM_ERRNO_ELIBSCN, + { .be_bsm_errno = BSM_ERRNO_ELIBSCN, #ifdef ELIBSCN - ELIBSCN, + .be_local_errno = ELIBSCN, #else - ERRNO_NO_LOCAL_MAPPING, + .be_local_errno = ERRNO_NO_LOCAL_MAPPING, #endif ES(".lib section in a.out corrupted") }, - { BSM_ERRNO_ELIBMAX, + { .be_bsm_errno = BSM_ERRNO_ELIBMAX, #ifdef ELIBMAX - ELIBMAX, + .be_local_errno = ELIBMAX, #else - ERRNO_NO_LOCAL_MAPPING, + .be_local_errno = ERRNO_NO_LOCAL_MAPPING, #endif ES("Attempting to link in too many libs") }, - { BSM_ERRNO_ELIBEXEC, + { .be_bsm_errno = BSM_ERRNO_ELIBEXEC, #ifdef ELIBEXEC - ELIBEXEC, + .be_local_errno = ELIBEXEC, #else - ERRNO_NO_LOCAL_MAPPING, + .be_local_errno = ERRNO_NO_LOCAL_MAPPING, #endif ES("Attempting to exec a shared library") }, - { BSM_ERRNO_EILSEQ, EILSEQ, ES("Illegal byte sequence") }, - { BSM_ERRNO_ENOSYS, ENOSYS, ES("Function not implemented") }, - { BSM_ERRNO_ELOOP, ELOOP, ES("Too many levels of symbolic links") }, - { BSM_ERRNO_ERESTART, + { .be_bsm_errno = BSM_ERRNO_EILSEQ, .be_local_errno = EILSEQ, ES("Illegal byte sequence") }, + { .be_bsm_errno = BSM_ERRNO_ENOSYS, .be_local_errno = ENOSYS, ES("Function not implemented") }, + { .be_bsm_errno = BSM_ERRNO_ELOOP, .be_local_errno = ELOOP, ES("Too many levels of symbolic links") }, + { .be_bsm_errno = BSM_ERRNO_ERESTART, #ifdef ERESTART - ERESTART, + .be_local_errno = ERESTART, #else - ERRNO_NO_LOCAL_MAPPING, + .be_local_errno = ERRNO_NO_LOCAL_MAPPING, #endif ES("Restart syscall") }, - { BSM_ERRNO_ESTRPIPE, + { .be_bsm_errno = BSM_ERRNO_ESTRPIPE, #ifdef ESTRPIPE - ESTRPIPE, + .be_local_errno = ESTRPIPE, #else - ERRNO_NO_LOCAL_MAPPING, + .be_local_errno = ERRNO_NO_LOCAL_MAPPING, #endif ES("If pipe/FIFO, don't sleep in stream head") }, - { BSM_ERRNO_ENOTEMPTY, ENOTEMPTY, ES("Directory not empty") }, - { BSM_ERRNO_EUSERS, EUSERS, ES("Too many users") }, - { BSM_ERRNO_ENOTSOCK, ENOTSOCK, + { .be_bsm_errno = BSM_ERRNO_ENOTEMPTY, .be_local_errno = ENOTEMPTY, ES("Directory not empty") }, + { .be_bsm_errno = BSM_ERRNO_EUSERS, .be_local_errno = EUSERS, ES("Too many users") }, + { .be_bsm_errno = BSM_ERRNO_ENOTSOCK, .be_local_errno = ENOTSOCK, ES("Socket operation on non-socket") }, - { BSM_ERRNO_EDESTADDRREQ, EDESTADDRREQ, + { .be_bsm_errno = BSM_ERRNO_EDESTADDRREQ, .be_local_errno = EDESTADDRREQ, ES("Destination address required") }, - { BSM_ERRNO_EMSGSIZE, EMSGSIZE, ES("Message too long") }, - { BSM_ERRNO_EPROTOTYPE, EPROTOTYPE, + { .be_bsm_errno = BSM_ERRNO_EMSGSIZE, .be_local_errno = EMSGSIZE, ES("Message too long") }, + { .be_bsm_errno = BSM_ERRNO_EPROTOTYPE, .be_local_errno = EPROTOTYPE, ES("Protocol wrong type for socket") }, - { BSM_ERRNO_ENOPROTOOPT, ENOPROTOOPT, ES("Protocol not available") }, - { BSM_ERRNO_EPROTONOSUPPORT, EPROTONOSUPPORT, + { .be_bsm_errno = BSM_ERRNO_ENOPROTOOPT, .be_local_errno = ENOPROTOOPT, ES("Protocol not available") }, + { .be_bsm_errno = BSM_ERRNO_EPROTONOSUPPORT, .be_local_errno = EPROTONOSUPPORT, ES("Protocol not supported") }, - { BSM_ERRNO_ESOCKTNOSUPPORT, ESOCKTNOSUPPORT, + { .be_bsm_errno = BSM_ERRNO_ESOCKTNOSUPPORT, .be_local_errno = ESOCKTNOSUPPORT, ES("Socket type not supported") }, - { BSM_ERRNO_EOPNOTSUPP, EOPNOTSUPP, ES("Operation not supported") }, - { BSM_ERRNO_EPFNOSUPPORT, EPFNOSUPPORT, + { .be_bsm_errno = BSM_ERRNO_EOPNOTSUPP, .be_local_errno = EOPNOTSUPP, ES("Operation not supported") }, + { .be_bsm_errno = BSM_ERRNO_EPFNOSUPPORT, .be_local_errno = EPFNOSUPPORT, ES("Protocol family not supported") }, - { BSM_ERRNO_EAFNOSUPPORT, EAFNOSUPPORT, + { .be_bsm_errno = BSM_ERRNO_EAFNOSUPPORT, .be_local_errno = EAFNOSUPPORT, ES("Address family not supported by protocol family") }, - { BSM_ERRNO_EADDRINUSE, EADDRINUSE, ES("Address already in use") }, - { BSM_ERRNO_EADDRNOTAVAIL, EADDRNOTAVAIL, + { .be_bsm_errno = BSM_ERRNO_EADDRINUSE, .be_local_errno = EADDRINUSE, ES("Address already in use") }, + { .be_bsm_errno = BSM_ERRNO_EADDRNOTAVAIL, .be_local_errno = EADDRNOTAVAIL, ES("Can't assign requested address") }, - { BSM_ERRNO_ENETDOWN, ENETDOWN, ES("Network is down") }, - { BSM_ERRNO_ENETRESET, ENETRESET, + { .be_bsm_errno = BSM_ERRNO_ENETDOWN, .be_local_errno = ENETDOWN, ES("Network is down") }, + { .be_bsm_errno = BSM_ERRNO_ENETRESET, .be_local_errno = ENETRESET, ES("Network dropped connection on reset") }, - { BSM_ERRNO_ECONNABORTED, ECONNABORTED, + { .be_bsm_errno = BSM_ERRNO_ECONNABORTED, .be_local_errno = ECONNABORTED, ES("Software caused connection abort") }, - { BSM_ERRNO_ECONNRESET, ECONNRESET, ES("Connection reset by peer") }, - { BSM_ERRNO_ENOBUFS, ENOBUFS, ES("No buffer space available") }, - { BSM_ERRNO_EISCONN, EISCONN, ES("Socket is already connected") }, - { BSM_ERRNO_ENOTCONN, ENOTCONN, ES("Socket is not connected") }, - { BSM_ERRNO_ESHUTDOWN, ESHUTDOWN, + { .be_bsm_errno = BSM_ERRNO_ECONNRESET, .be_local_errno = ECONNRESET, ES("Connection reset by peer") }, + { .be_bsm_errno = BSM_ERRNO_ENOBUFS, .be_local_errno = ENOBUFS, ES("No buffer space available") }, + { .be_bsm_errno = BSM_ERRNO_EISCONN, .be_local_errno = EISCONN, ES("Socket is already connected") }, + { .be_bsm_errno = BSM_ERRNO_ENOTCONN, .be_local_errno = ENOTCONN, ES("Socket is not connected") }, + { .be_bsm_errno = BSM_ERRNO_ESHUTDOWN, .be_local_errno = ESHUTDOWN, ES("Can't send after socket shutdown") }, - { BSM_ERRNO_ETOOMANYREFS, ETOOMANYREFS, + { .be_bsm_errno = BSM_ERRNO_ETOOMANYREFS, .be_local_errno = ETOOMANYREFS, ES("Too many references: can't splice") }, - { BSM_ERRNO_ETIMEDOUT, ETIMEDOUT, ES("Operation timed out") }, - { BSM_ERRNO_ECONNREFUSED, ECONNREFUSED, ES("Connection refused") }, - { BSM_ERRNO_EHOSTDOWN, EHOSTDOWN, ES("Host is down") }, - { BSM_ERRNO_EHOSTUNREACH, EHOSTUNREACH, ES("No route to host") }, - { BSM_ERRNO_EALREADY, EALREADY, ES("Operation already in progress") }, - { BSM_ERRNO_EINPROGRESS, EINPROGRESS, + { .be_bsm_errno = BSM_ERRNO_ETIMEDOUT, .be_local_errno = ETIMEDOUT, ES("Operation timed out") }, + { .be_bsm_errno = BSM_ERRNO_ECONNREFUSED, .be_local_errno = ECONNREFUSED, ES("Connection refused") }, + { .be_bsm_errno = BSM_ERRNO_EHOSTDOWN, .be_local_errno = EHOSTDOWN, ES("Host is down") }, + { .be_bsm_errno = BSM_ERRNO_EHOSTUNREACH, .be_local_errno = EHOSTUNREACH, ES("No route to host") }, + { .be_bsm_errno = BSM_ERRNO_EALREADY, .be_local_errno = EALREADY, ES("Operation already in progress") }, + { .be_bsm_errno = BSM_ERRNO_EINPROGRESS, .be_local_errno = EINPROGRESS, ES("Operation now in progress") }, - { BSM_ERRNO_ESTALE, ESTALE, ES("Stale NFS file handle") }, - { BSM_ERRNO_EQFULL, EQFULL, ES("Interface output queue is full") }, - { BSM_ERRNO_EPWROFF, + { .be_bsm_errno = BSM_ERRNO_ESTALE, .be_local_errno = ESTALE, ES("Stale NFS file handle") }, + { .be_bsm_errno = BSM_ERRNO_EQFULL, .be_local_errno = EQFULL, ES("Interface output queue is full") }, + { .be_bsm_errno = BSM_ERRNO_EPWROFF, #ifdef EPWROFF - EPWROFF, + .be_local_errno = EPWROFF, #else - ERRNO_NO_LOCAL_MAPPING, + .be_local_errno = ERRNO_NO_LOCAL_MAPPING, #endif ES("Device power is off") }, - { BSM_ERRNO_EDEVERR, + { .be_bsm_errno = BSM_ERRNO_EDEVERR, #ifdef EDEVERR - EDEVERR, + .be_local_errno = EDEVERR, #else - ERRNO_NO_LOCAL_MAPPING, + .be_local_errno = ERRNO_NO_LOCAL_MAPPING, #endif ES("Device error") }, - { BSM_ERRNO_EBADEXEC, + { .be_bsm_errno = BSM_ERRNO_EBADEXEC, #ifdef EBADEXEC - EBADEXEC, + .be_local_errno = EBADEXEC, #else - ERRNO_NO_LOCAL_MAPPING, + .be_local_errno = ERRNO_NO_LOCAL_MAPPING, #endif ES("Bad executable") }, - { BSM_ERRNO_EBADARCH, + { .be_bsm_errno = BSM_ERRNO_EBADARCH, #ifdef EBADARCH - EBADARCH, + .be_local_errno = EBADARCH, #else - ERRNO_NO_LOCAL_MAPPING, + .be_local_errno = ERRNO_NO_LOCAL_MAPPING, #endif ES("Bad CPU type in executable") }, - { BSM_ERRNO_ESHLIBVERS, + { .be_bsm_errno = BSM_ERRNO_ESHLIBVERS, #ifdef ESHLIBVERS - ESHLIBVERS, + .be_local_errno = ESHLIBVERS, #else - ERRNO_NO_LOCAL_MAPPING, + .be_local_errno = ERRNO_NO_LOCAL_MAPPING, #endif ES("Shared library version mismatch") }, - { BSM_ERRNO_EBADMACHO, + { .be_bsm_errno = BSM_ERRNO_EBADMACHO, #ifdef EBADMACHO - EBADMACHO, + .be_local_errno = EBADMACHO, #else - ERRNO_NO_LOCAL_MAPPING, + .be_local_errno = ERRNO_NO_LOCAL_MAPPING, #endif ES("Malformed Macho file") }, - { BSM_ERRNO_EPOLICY, + { .be_bsm_errno = BSM_ERRNO_EPOLICY, #ifdef EPOLICY - EPOLICY, + .be_local_errno = EPOLICY, #else - ERRNO_NO_LOCAL_MAPPING, + .be_local_errno = ERRNO_NO_LOCAL_MAPPING, #endif ES("Operation failed by policy") }, - { BSM_ERRNO_EDOTDOT, + { .be_bsm_errno = BSM_ERRNO_EDOTDOT, #ifdef EDOTDOT - EDOTDOT, + .be_local_errno = EDOTDOT, #else - ERRNO_NO_LOCAL_MAPPING, + .be_local_errno = ERRNO_NO_LOCAL_MAPPING, #endif ES("RFS specific error") }, - { BSM_ERRNO_EUCLEAN, + { .be_bsm_errno = BSM_ERRNO_EUCLEAN, #ifdef EUCLEAN - EUCLEAN, + .be_local_errno = EUCLEAN, #else - ERRNO_NO_LOCAL_MAPPING, + .be_local_errno = ERRNO_NO_LOCAL_MAPPING, #endif ES("Structure needs cleaning") }, - { BSM_ERRNO_ENOTNAM, + { .be_bsm_errno = BSM_ERRNO_ENOTNAM, #ifdef ENOTNAM - ENOTNAM, + .be_local_errno = ENOTNAM, #else - ERRNO_NO_LOCAL_MAPPING, + .be_local_errno = ERRNO_NO_LOCAL_MAPPING, #endif ES("Not a XENIX named type file") }, - { BSM_ERRNO_ENAVAIL, + { .be_bsm_errno = BSM_ERRNO_ENAVAIL, #ifdef ENAVAIL - ENAVAIL, + .be_local_errno = ENAVAIL, #else - ERRNO_NO_LOCAL_MAPPING, + .be_local_errno = ERRNO_NO_LOCAL_MAPPING, #endif ES("No XENIX semaphores available") }, - { BSM_ERRNO_EISNAM, + { .be_bsm_errno = BSM_ERRNO_EISNAM, #ifdef EISNAM - EISNAM, + .be_local_errno = EISNAM, #else - ERRNO_NO_LOCAL_MAPPING, + .be_local_errno = ERRNO_NO_LOCAL_MAPPING, #endif ES("Is a named type file") }, - { BSM_ERRNO_EREMOTEIO, + { .be_bsm_errno = BSM_ERRNO_EREMOTEIO, #ifdef EREMOTEIO - EREMOTEIO, + .be_local_errno = EREMOTEIO, #else - ERRNO_NO_LOCAL_MAPPING, + .be_local_errno = ERRNO_NO_LOCAL_MAPPING, #endif ES("Remote I/O error") }, - { BSM_ERRNO_ENOMEDIUM, + { .be_bsm_errno = BSM_ERRNO_ENOMEDIUM, #ifdef ENOMEDIUM - ENOMEDIUM, + .be_local_errno = ENOMEDIUM, #else - ERRNO_NO_LOCAL_MAPPING, + .be_local_errno = ERRNO_NO_LOCAL_MAPPING, #endif ES("No medium found") }, - { BSM_ERRNO_EMEDIUMTYPE, + { .be_bsm_errno = BSM_ERRNO_EMEDIUMTYPE, #ifdef EMEDIUMTYPE - EMEDIUMTYPE, + .be_local_errno = EMEDIUMTYPE, #else - ERRNO_NO_LOCAL_MAPPING, + .be_local_errno = ERRNO_NO_LOCAL_MAPPING, #endif ES("Wrong medium type") }, - { BSM_ERRNO_ENOKEY, + { .be_bsm_errno = BSM_ERRNO_ENOKEY, #ifdef ENOKEY - ENOKEY, + .be_local_errno = ENOKEY, #else - ERRNO_NO_LOCAL_MAPPING, + .be_local_errno = ERRNO_NO_LOCAL_MAPPING, #endif ES("Required key not available") }, - { BSM_ERRNO_EKEYEXPIRED, + { .be_bsm_errno = BSM_ERRNO_EKEYEXPIRED, #ifdef EKEEXPIRED - EKEYEXPIRED, + .be_local_errno = EKEYEXPIRED, #else - ERRNO_NO_LOCAL_MAPPING, + .be_local_errno = ERRNO_NO_LOCAL_MAPPING, #endif ES("Key has expired") }, - { BSM_ERRNO_EKEYREVOKED, + { .be_bsm_errno = BSM_ERRNO_EKEYREVOKED, #ifdef EKEYREVOKED - EKEYREVOKED, + .be_local_errno = EKEYREVOKED, #else - ERRNO_NO_LOCAL_MAPPING, + .be_local_errno = ERRNO_NO_LOCAL_MAPPING, #endif ES("Key has been revoked") }, - { BSM_ERRNO_EKEYREJECTED, + { .be_bsm_errno = BSM_ERRNO_EKEYREJECTED, #ifdef EKEREJECTED - EKEYREJECTED, + .be_local_errno = EKEYREJECTED, #else - ERRNO_NO_LOCAL_MAPPING, + .be_local_errno = ERRNO_NO_LOCAL_MAPPING, #endif ES("Key was rejected by service") }, }; diff --git a/bsd/security/audit/audit_bsm_fcntl.c b/bsd/security/audit/audit_bsm_fcntl.c index 11454fbf4..3236e918f 100644 --- a/bsd/security/audit/audit_bsm_fcntl.c +++ b/bsd/security/audit/audit_bsm_fcntl.c @@ -226,6 +226,9 @@ static const bsm_fcntl_cmd_t bsm_fcntl_cmdtab[] = { #ifdef F_TRIM_ACTIVE_FILE { BSM_F_TRIM_ACTIVE_FILE, F_TRIM_ACTIVE_FILE }, #endif +#ifdef F_SPECULATIVE_READ + { BSM_F_SPECULATIVE_READ, F_SPECULATIVE_READ }, +#endif #ifdef FCNTL_FS_SPECIFIC_BASE { BSM_F_FS_SPECIFIC_0, FCNTL_FS_SPECIFIC_BASE}, diff --git a/bsd/security/audit/audit_pipe.c b/bsd/security/audit/audit_pipe.c index 8b34ab598..6b096473f 100644 --- a/bsd/security/audit/audit_pipe.c +++ b/bsd/security/audit/audit_pipe.c @@ -1084,45 +1084,6 @@ audit_pipe_poll(dev_t dev, int events, void *wql, struct proc *p) return revents; } -#ifndef __APPLE__ -/* - * Return true if there are records available for reading on the pipe. - */ -static int -audit_pipe_kqread(struct knote *kn, long hint) -{ - struct audit_pipe *ap; - - ap = (struct audit_pipe *)kn->kn_hook; - KASSERT(ap != NULL, ("audit_pipe_kqread: ap == NULL")); - AUDIT_PIPE_LOCK_ASSERT(ap); - - if (ap->ap_qlen != 0) { - kn->kn_data = ap->ap_qbyteslen - ap->ap_qoffset; - return 1; - } else { - kn->kn_data = 0; - return 0; - } -} - -/* - * Detach kqueue state from audit pipe. - */ -static void -audit_pipe_kqdetach(struct knote *kn) -{ - struct audit_pipe *ap; - - ap = (struct audit_pipe *)kn->kn_hook; - KASSERT(ap != NULL, ("audit_pipe_kqdetach: ap == NULL")); - - AUDIT_PIPE_LOCK(ap); - knlist_remove(&ap->ap_selinfo.si_note, kn, 1); - AUDIT_PIPE_UNLOCK(ap); -} -#endif /* !__APPLE__ */ - static void *devnode; int diff --git a/bsd/security/audit/audit_private.h b/bsd/security/audit/audit_private.h index 4da799daa..e9e2e6dec 100644 --- a/bsd/security/audit/audit_private.h +++ b/bsd/security/audit/audit_private.h @@ -348,7 +348,7 @@ struct kaudit_record *audit_new(int event, proc_t p, struct uthread *td); */ struct au_record; int kaudit_to_bsm(struct kaudit_record *kar, struct au_record **pau); -int bsm_rec_verify(void *rec, int length); +int bsm_rec_verify(void *rec, int length, boolean_t kern_events_allowed); /* * Kernel versions of the libbsm audit record functions. @@ -488,6 +488,11 @@ int audit_session_lookup(au_asid_t asid, auditinfo_addr_t *ret_aia); */ #define AU_AUDITCTL_RESERVED_ENTITLEMENT "com.apple.private.protected-audit-control" +/* + * Entitlement required to control auditctl sys call + */ +#define AU_AUDIT_USER_ENTITLEMENT "com.apple.private.audit.user" + /* * Max sizes used by the kernel for signing id and team id values of the * identity tokens. These lengths include space for the null terminator. @@ -498,8 +503,10 @@ int audit_session_lookup(au_asid_t asid, auditinfo_addr_t *ret_aia); struct __attribute__((__packed__)) hdr_tok_partial { u_char type; uint32_t len; + u_char ver; + uint16_t e_type; }; -static_assert(sizeof(struct hdr_tok_partial) == 5); +static_assert(sizeof(struct hdr_tok_partial) == 8); struct __attribute__((__packed__)) trl_tok_partial { u_char type; diff --git a/bsd/security/audit/audit_session.c b/bsd/security/audit/audit_session.c index d99b186fa..fed263471 100644 --- a/bsd/security/audit/audit_session.c +++ b/bsd/security/audit/audit_session.c @@ -34,6 +34,7 @@ #include #include #include +#include #include #include #include @@ -100,8 +101,10 @@ static au_sentry_t audit_default_se = { struct auditinfo_addr *audit_default_aia_p = &audit_default_se.se_auinfo; +/* Copied from */ +#define IPC_KMSG_FLAGS_ALLOW_IMMOVABLE_SEND 0x1 kern_return_t ipc_object_copyin(ipc_space_t, mach_port_name_t, - mach_msg_type_name_t, ipc_port_t *); + mach_msg_type_name_t, ipc_port_t *, mach_port_context_t, mach_msg_guard_flags_t *, uint32_t); void ipc_port_release_send(ipc_port_t); #if CONFIG_AUDIT @@ -556,7 +559,7 @@ audit_sysctl_session_debug(__unused struct sysctl_oid *oidp, AUDIT_SENTRY_RUNLOCK(); /* Reconcile with the process table. */ - (void) proc_iterate(PROC_ALLPROCLIST | PROC_ZOMBPROCLIST, + proc_iterate(PROC_ALLPROCLIST | PROC_ZOMBPROCLIST, audit_session_debug_callout, NULL, audit_session_debug_filterfn, (void *)&sed_tab[0]); @@ -1350,10 +1353,15 @@ audit_session_port(proc_t p, struct audit_session_port_args *uap, */ se = AU_SENTRY_PTR(aia_p); audit_ref_session(se); - } else if (kauth_cred_issuser(cred)) { - /* The superuser may obtain a port for any existing - * session. + } else { + /* + * Only privileged processes may obtain a port for + * any existing session. */ + err = priv_check_cred(cred, PRIV_AUDIT_SESSION_PORT, 0); + if (err != 0) { + goto done; + } AUDIT_SENTRY_RLOCK(); se = audit_session_find(uap->asid); AUDIT_SENTRY_RUNLOCK(); @@ -1362,9 +1370,6 @@ audit_session_port(proc_t p, struct audit_session_port_args *uap, goto done; } aia_p = &se->se_auinfo; - } else { - err = EPERM; - goto done; } /* @@ -1513,7 +1518,7 @@ audit_session_join(proc_t p, struct audit_session_join_args *uap, if (ipc_object_copyin(get_task_ipcspace(p->task), send, - MACH_MSG_TYPE_COPY_SEND, &port) != KERN_SUCCESS) { + MACH_MSG_TYPE_COPY_SEND, &port, 0, NULL, IPC_KMSG_FLAGS_ALLOW_IMMOVABLE_SEND) != KERN_SUCCESS) { *ret_asid = AU_DEFAUDITSID; err = EINVAL; } else { diff --git a/bsd/security/audit/audit_syscalls.c b/bsd/security/audit/audit_syscalls.c index 0df9209ce..4db4b53f7 100644 --- a/bsd/security/audit/audit_syscalls.c +++ b/bsd/security/audit/audit_syscalls.c @@ -1,6 +1,5 @@ /*- - * Copyright (c) 1999-2010, Apple Inc. - * All rights reserved. + * Copyright (c) 1999-2019 Apple Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -159,12 +158,30 @@ audit(proc_t p, struct audit_args *uap, __unused int32_t *retval) int max_record_length = MAX_AUDIT_RECORD_SIZE; void *udata = NULL; u_int ulen = 0; - struct au_identity_info id_info = {0, NULL, 0, NULL, 0, NULL, 0}; + struct au_identity_info id_info = { + .signer_type = 0, + .signing_id = NULL, + .signing_id_trunc = 0, + .team_id = NULL, + .team_id_trunc = 0, + .cdhash = NULL, + .cdhash_len = 0 + }; token_t *id_tok = NULL; + boolean_t kern_events_allowed = FALSE; error = suser(kauth_cred_get(), &p->p_acflag); if (error) { - goto free_out; + /* + * If a process is not running as root but is properly + * entitled, allow it to audit non-kernel events only. + */ + if (!IOTaskHasEntitlement(current_task(), + AU_AUDIT_USER_ENTITLEMENT)) { + goto free_out; + } + } else { + kern_events_allowed = TRUE; } mtx_lock(&audit_mtx); @@ -234,7 +251,7 @@ audit(proc_t p, struct audit_args *uap, __unused int32_t *retval) #endif /* Verify the record. */ - if (bsm_rec_verify(rec, uap->length) == 0) { + if (bsm_rec_verify(rec, uap->length, kern_events_allowed) == 0) { error = EINVAL; goto free_out; } diff --git a/bsd/sys/Makefile b/bsd/sys/Makefile index e2ad05581..b813428ca 100644 --- a/bsd/sys/Makefile +++ b/bsd/sys/Makefile @@ -33,7 +33,12 @@ DATAFILES = \ user.h utfconv.h utsname.h vadvise.h vcmd.h \ vm.h vmmeter.h vmparam.h vnioctl.h vnode.h vnode_if.h vstat.h wait.h xattr.h \ _select.h _structs.h _types.h _endian.h domain.h protosw.h \ - spawn.h timex.h commpage.h + spawn.h timex.h commpage.h log_data.h + +# Installs header file for DriverKit drivers - +# $(DSTROOT)/System/DriverKit/System/usr/include/ +DRIVERKIT_DATAFILES = \ + cdefs.h _types.h # Installs header file for Apple internal use in user level - # $(DSTROOT)/System/Library/Frameworks/System.framework/PrivateHeaders @@ -64,7 +69,10 @@ PRIVATE_DATAFILES = \ kern_control.h \ kern_event.h \ kern_memorystatus.h \ + kern_memorystatus_freeze.h \ + kern_memorystatus_notify.h \ kern_overrides.h \ + kern_sysctl.h \ mbuf.h \ mman.h \ monotonic.h \ @@ -98,6 +106,11 @@ PRIVATE_DATAFILES = \ memory_maintenance.h \ commpage.h +# Installs header file for Apple internal use by DriverKit drivers - +# $(DSTROOT)/System/DriverKit/System/usr/local/include/ +DRIVERKIT_PRIVATE_DATAFILES = \ + appleapiopts.h kdebug.h + # Installs header file for kernel extensions - # $(DSTROOT)/System/Library/Frameworks/Kernel.framework/Headers # $(DSTROOT)/System/Library/Frameworks/Kernel.framework/PrivateHeaders @@ -108,8 +121,8 @@ KERNELFILES = \ errno.h ev.h event.h fcntl.h file.h filio.h \ ioccom.h ioctl.h ipc.h \ ioctl_compat.h kernel.h kernel_types.h kern_event.h lock.h lockf.h \ - kauth.h kdebug.h md5.h kern_control.h imgact.h malloc.h namei.h \ - mman.h mbuf.h mount.h netport.h param.h paths.h \ + kauth.h kdebug.h kdebug_kernel.h md5.h kern_control.h imgact.h malloc.h \ + namei.h mman.h mbuf.h mount.h netport.h param.h paths.h \ proc.h queue.h random.h resource.h \ sbuf.h posix_sem.h posix_shm.h sem.h shm.h \ select.h signal.h socket.h socketvar.h sockio.h stat.h stdio.h \ @@ -140,6 +153,8 @@ PRIVATE_KERNELFILES = \ fslog.h \ kasl.h \ kern_memorystatus.h \ + kern_memorystatus_freeze.h \ + kern_memorystatus_notify.h \ kpi_private.h \ ktrace.h \ mach_swapon.h \ @@ -168,6 +183,7 @@ PRIVATE_KERNELFILES = \ doc_tombstone.h \ fsevents.h \ work_interval.h \ + kern_sysctl.h \ XNU_ONLY_EXPORTS = \ bsdtask_info.h \ @@ -191,8 +207,12 @@ XNU_ONLY_EXPORTS = \ # /usr/include INSTALL_MI_LIST = ${DATAFILES} +INSTALL_DRIVERKIT_MI_LIST = ${DRIVERKIT_DATAFILES} + INSTALL_MI_GEN_LIST = syscall.h _posix_availability.h _symbol_aliasing.h +INSTALL_DRIVERKIT_MI_GEN_LIST = _posix_availability.h _symbol_aliasing.h + INSTALL_MI_DIR = sys EXPORT_MI_LIST = ${KERNELFILES} ${PRIVATE_KERNELFILES} ${XNU_ONLY_EXPORTS} @@ -204,6 +224,8 @@ EXPORT_MI_DIR = sys # /System/Library/Frameworks/System.framework/PrivateHeaders INSTALL_MI_LCL_LIST = ${PRIVATE_DATAFILES} +INSTALL_DRIVERKIT_MI_LCL_LIST = ${DRIVERKIT_PRIVATE_DATAFILES} + # /System/Library/Frameworks/Kernel.framework/PrivateHeaders INSTALL_KF_MI_LCL_LIST = ${KERNELFILES} ${PRIVATE_KERNELFILES} @@ -221,43 +243,48 @@ $(OBJROOT)/cscope.genhdrs: $(_v)mkdir -p $(OBJROOT)/cscope.genhdrs $(OBJROOT)/syscall.codes: $(SRCROOT)/bsd/kern/syscalls.master $(MAKESYSCALLS) - @echo "[$(CMD_MC)] $(ColorH)GENERATING$(Color0) $(ColorLF)$@$(Color0) from $(ColorF)$<$(Color0)"; + $(call makelog,[$(CMD_MC)] $(ColorH)GENERATING$(Color0) $(ColorLF)$@$(Color0) from $(ColorF)$<$(Color0)) $(_v)$(MAKESYSCALLS) $< trace > $@ $(OBJROOT)/trace.codes: $(SRCROOT)/bsd/kern/trace_codes $(OBJROOT)/syscall.codes $(_v)sort -g $(SRCROOT)/bsd/kern/trace_codes $(OBJROOT)/syscall.codes >$@ syscall.h: $(SRCROOT)/bsd/kern/syscalls.master $(MAKESYSCALLS) $(OBJROOT)/cscope.genhdrs - @echo "[$(CMD_MC)] $(ColorH)GENERATING$(Color0) $(ColorLF)bsd/sys/$@$(Color0) from $(ColorF)$<$(Color0)"; + $(call makelog,[$(CMD_MC)] $(ColorH)GENERATING$(Color0) $(ColorLF)bsd/sys/$@$(Color0) from $(ColorF)$<$(Color0)) @echo "$(OBJPATH)/bsd/sys/$@" > $(OBJROOT)/cscope.genhdrs/$@.path $(_v)$(MAKESYSCALLS) $< header > /dev/null sysproto.h: $(SRCROOT)/bsd/kern/syscalls.master $(MAKESYSCALLS) $(OBJROOT)/cscope.genhdrs - @echo "[$(CMD_MC)] $(ColorH)GENERATING$(Color0) $(ColorLF)bsd/sys/$@$(Color0) from $(ColorF)$<$(Color0)"; + $(call makelog,[$(CMD_MC)] $(ColorH)GENERATING$(Color0) $(ColorLF)bsd/sys/$@$(Color0) from $(ColorF)$<$(Color0)) @echo "$(OBJPATH)/bsd/sys/$@" > $(OBJROOT)/cscope.genhdrs/$@.path $(_v)$(MAKESYSCALLS) $< proto > /dev/null kdebugevents.h: $(OBJROOT)/trace.codes $(MAKEKDEBUGEVENTS) $(OBJROOT)/cscope.genhdrs - @echo "[$(CMD_MC)] $(ColorH)GENERATING$(Color0) $(ColorLF)bsd/sys/$@$(Color0) from $(ColorF)$<$(Color0)"; + $(call makelog,[$(CMD_MC)] $(ColorH)GENERATING$(Color0) $(ColorLF)bsd/sys/$@$(Color0) from $(ColorF)$<$(Color0)) @echo "$(OBJPATH)/bsd/sys/$@" > $(OBJROOT)/cscope.genhdrs/$@.path $(_v)$(MAKEKDEBUGEVENTS) $< > "$(OBJPATH)/bsd/sys/$@" MAKE_POSIX_AVAILABILITY = $(SRCROOT)/bsd/sys/make_posix_availability.sh _posix_availability.h: $(MAKE_POSIX_AVAILABILITY) - @echo "[$(CMD_MC)] $(ColorH)GENERATING$(Color0) $(ColorLF)bsd/sys/$@$(Color0)"; + $(call makelog,[$(CMD_MC)] $(ColorH)GENERATING$(Color0) $(ColorLF)bsd/sys/$@$(Color0)) $(_v)$(MAKE_POSIX_AVAILABILITY) "$@" MAKE_SYMBOL_ALIASING = $(SRCROOT)/bsd/sys/make_symbol_aliasing.sh _symbol_aliasing.h: $(MAKE_SYMBOL_ALIASING) - @echo "[$(CMD_MC)] $(ColorH)GENERATING$(Color0) $(ColorLF)bsd/sys/$@$(Color0)"; + $(call makelog,[$(CMD_MC)] $(ColorH)GENERATING$(Color0) $(ColorLF)bsd/sys/$@$(Color0)) $(_v)$(MAKE_SYMBOL_ALIASING) "$(SDKROOT)" "$@" +# generated headers needed early (used by iig during installhdrs of iokit/DriverKit) +SETUP_GEN_LIST = _posix_availability.h _symbol_aliasing.h + +do_build_setup:: $(SETUP_GEN_LIST) + TRACE_CODES_DEST = \ $(DSTROOT)/$(INSTALL_SHARE_MISC_DIR)/trace.codes $(TRACE_CODES_DEST): $(OBJROOT)/trace.codes $(_v)$(MKDIR) $(DSTROOT)/$(INSTALL_SHARE_MISC_DIR) - @echo INSTALL $(@F) + $(call makelog,INSTALL $(@F)) $(_v)$(INSTALL) $(INSTALL_FLAGS) $(OBJROOT)/trace.codes $@ do_textfiles_install:: $(TRACE_CODES_DEST) diff --git a/bsd/sys/_types.h b/bsd/sys/_types.h index 08691552c..1d63f5c85 100644 --- a/bsd/sys/_types.h +++ b/bsd/sys/_types.h @@ -52,6 +52,7 @@ #define __DARWIN_NULL ((void *)0) #endif /* __cplusplus */ +#if !defined(DRIVERKIT) typedef __int64_t __darwin_blkcnt_t; /* total blocks */ typedef __int32_t __darwin_blksize_t; /* preferred block size */ typedef __int32_t __darwin_dev_t; /* dev_t */ @@ -74,12 +75,13 @@ typedef __uint32_t __darwin_sigset_t; /* [???] signal set */ typedef __int32_t __darwin_suseconds_t; /* [???] microseconds */ typedef __uint32_t __darwin_uid_t; /* [???] user IDs */ typedef __uint32_t __darwin_useconds_t; /* [???] microseconds */ +#endif /* !defined(DRIVERKIT) */ typedef unsigned char __darwin_uuid_t[16]; typedef char __darwin_uuid_string_t[37]; -#ifndef KERNEL +#if !defined(KERNEL) && !defined(DRIVERKIT) #include -#endif /* KERNEL */ +#endif /* !defined(KERNEL) && !defined(DRIVERKIT) */ #if defined(__GNUC__) && (__GNUC__ == 3 && __GNUC_MINOR__ >= 5 || __GNUC__ > 3) #define __offsetof(type, field) __builtin_offsetof(type, field) diff --git a/bsd/sys/_types/Makefile b/bsd/sys/_types/Makefile index c64ec4c8a..5b5373961 100644 --- a/bsd/sys/_types/Makefile +++ b/bsd/sys/_types/Makefile @@ -98,6 +98,35 @@ DATAFILES = \ _user32_ntptimeval.h \ _user64_ntptimeval.h \ +# Installs header file for DriverKit drivers - +# $(DSTROOT)/System/DriverKit/System/usr/include/ +DRIVERKIT_DATAFILES = \ + _ct_rune_t.h \ + _errno_t.h \ + _int16_t.h \ + _int32_t.h \ + _int64_t.h \ + _int8_t.h \ + _intptr_t.h \ + _mbstate_t.h \ + _null.h \ + _offsetof.h \ + _os_inline.h \ + _ptrdiff_t.h \ + _rsize_t.h \ + _rune_t.h \ + _size_t.h \ + _ssize_t.h \ + _u_int16_t.h \ + _u_int32_t.h \ + _u_int64_t.h \ + _u_int8_t.h \ + _uintptr_t.h \ + _uuid_t.h \ + _va_list.h \ + _wchar_t.h \ + _wint_t.h \ + # Installs header file for Apple internal use in user level - # $(DSTROOT)/System/Library/Frameworks/System.framework/PrivateHeaders PRIVATE_DATAFILES = \ @@ -122,6 +151,8 @@ PRIVATE_KERNELFILES = \ # /System/Library/Frameworks/System.framework/Headers and /usr/include INSTALL_MI_LIST = ${DATAFILES} +INSTALL_DRIVERKIT_MI_LIST = ${DRIVERKIT_DATAFILES} + INSTALL_MI_GEN_LIST = INSTALL_MI_DIR = sys/_types diff --git a/bsd/sys/_types/_guid_t.h b/bsd/sys/_types/_guid_t.h index ac9cd5c76..df29f9ca3 100644 --- a/bsd/sys/_types/_guid_t.h +++ b/bsd/sys/_types/_guid_t.h @@ -28,9 +28,10 @@ #ifndef _KAUTH_GUID #define _KAUTH_GUID /* Apple-style globally unique identifier */ -typedef struct { +typedef union { #define KAUTH_GUID_SIZE 16 /* 128-bit identifier */ unsigned char g_guid[KAUTH_GUID_SIZE]; + unsigned int g_guid_asint[KAUTH_GUID_SIZE / sizeof(unsigned int)]; } guid_t; #define _GUID_T #endif /* _KAUTH_GUID */ diff --git a/bsd/sys/attr.h b/bsd/sys/attr.h index cdf7e13a7..5b4f4c133 100644 --- a/bsd/sys/attr.h +++ b/bsd/sys/attr.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2016 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2018 Apple Computer, Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -56,6 +56,14 @@ #define FSOPT_ATTR_CMN_EXTENDED 0x00000020 #ifdef PRIVATE #define FSOPT_LIST_SNAPSHOT 0x00000040 +#ifndef FSOPT_NOFIRMLINKPATH /*a copy is in fsgetpath.h */ +#define FSOPT_NOFIRMLINKPATH 0x00000080 +#endif /* FSOPT_NOFIRMLINKPATH */ +#define FSOPT_FOLLOW_FIRMLINK 0x00000100 +#define FSOPT_RETURN_REALDEV 0x00000200 +#ifndef FSOPT_ISREALFSID /*a copy is in fsgetpath.h */ +#define FSOPT_ISREALFSID FSOPT_RETURN_REALDEV +#endif #endif /* PRIVATE */ /* we currently aren't anywhere near this amount for a valid @@ -235,6 +243,16 @@ typedef struct vol_capabilities_attr { * * VOL_CAP_FMT_NO_PERMISSIONS: When set, the volume does not support setting * permissions. + * + * VOL_CAP_FMT_SHARED_SPACE: When set, the volume supports sharing space with + * other filesystems i.e. multiple logical filesystems can exist in the same + * "partition". An implication of this is that the filesystem which sets + * this capability treats waitfor arguments to VFS_SYNC as bit flags. + * + * VOL_CAP_FMT_VOL_GROUPS: When set, this volume is part of a volume-group + * that implies multiple volumes must be mounted in order to boot and root the + * operating system. Typically, this means a read-only system volume and a + * writable data volume. */ #define VOL_CAP_FMT_PERSISTENTOBJECTIDS 0x00000001 #define VOL_CAP_FMT_SYMBOLICLINKS 0x00000002 @@ -259,7 +277,8 @@ typedef struct vol_capabilities_attr { #define VOL_CAP_FMT_WRITE_GENERATION_COUNT 0x00100000 #define VOL_CAP_FMT_NO_IMMUTABLE_FILES 0x00200000 #define VOL_CAP_FMT_NO_PERMISSIONS 0x00400000 - +#define VOL_CAP_FMT_SHARED_SPACE 0x00800000 +#define VOL_CAP_FMT_VOL_GROUPS 0x01000000 /* * VOL_CAP_INT_SEARCHFS: When set, the volume implements the @@ -328,6 +347,8 @@ typedef struct vol_capabilities_attr { * VOL_CAP_INT_RENAME_EXCL: When set, the volume supports an * exclusive rename operation. * + * VOL_CAP_INT_RENAME_OPENFAIL: When set, the volume may fail rename + * operations on files that are open. */ #define VOL_CAP_INT_SEARCHFS 0x00000001 #define VOL_CAP_INT_ATTRLIST 0x00000002 @@ -352,6 +373,7 @@ typedef struct vol_capabilities_attr { #define VOL_CAP_INT_SNAPSHOT 0x00020000 #define VOL_CAP_INT_RENAME_SWAP 0x00040000 #define VOL_CAP_INT_RENAME_EXCL 0x00080000 +#define VOL_CAP_INT_RENAME_OPENFAIL 0x00100000 typedef struct vol_attributes_attr { attribute_set_t validattr; @@ -506,8 +528,11 @@ typedef struct vol_attributes_attr { #define ATTR_CMNEXT_RELPATH 0x00000004 #define ATTR_CMNEXT_PRIVATESIZE 0x00000008 #define ATTR_CMNEXT_LINKID 0x00000010 +#define ATTR_CMNEXT_NOFIRMLINKPATH 0x00000020 +#define ATTR_CMNEXT_REALDEVID 0x00000040 +#define ATTR_CMNEXT_REALFSID 0x00000080 -#define ATTR_CMNEXT_VALIDMASK 0x0000001c +#define ATTR_CMNEXT_VALIDMASK 0x000000fc #define ATTR_CMNEXT_SETMASK 0x00000000 /* Deprecated fork attributes */ diff --git a/bsd/sys/bitstring.h b/bsd/sys/bitstring.h index e69067255..536fab617 100644 --- a/bsd/sys/bitstring.h +++ b/bsd/sys/bitstring.h @@ -94,7 +94,7 @@ typedef uint8_t bitstr_t; /* set bit N of bitstring name (atomic) */ #define bitstr_set_atomic(name, bit) \ - atomic_bitset_8(&((name)[_bitstr_byte(bit)]), _bitstr_mask(bit)) + (void)os_atomic_or(&((name)[_bitstr_byte(bit)]), _bitstr_mask(bit), relaxed) /* clear bit N of bitstring name */ #define bitstr_clear(name, bit) \ @@ -102,7 +102,7 @@ typedef uint8_t bitstr_t; /* clear bit N of bitstring name (atomic) */ #define bitstr_clear_atomic(name, bit) \ - atomic_bitclear_8(&((name)[_bitstr_byte(bit)]), _bitstr_mask(bit)) + (void)os_atomic_andnot(&((name)[_bitstr_byte(bit)]), _bitstr_mask(bit), relaxed) /* clear bits start ... stop in bitstring */ #define bitstr_nclear(name, start, stop) do { \ diff --git a/bsd/sys/bsdtask_info.h b/bsd/sys/bsdtask_info.h index a0f182493..0c9a6901d 100644 --- a/bsd/sys/bsdtask_info.h +++ b/bsd/sys/bsdtask_info.h @@ -100,15 +100,19 @@ struct proc_regioninfo_internal { #define PROC_REGION_SHARED 2 extern uint32_t vnode_vid(void *vp); + #if CONFIG_IOSCHED kern_return_t vnode_pager_get_object_devvp(memory_object_t mem_obj, uintptr_t *devvp); extern struct vnode *vnode_mountdevvp(struct vnode *); #endif +extern boolean_t vnode_isonexternalstorage(void *vp); + #endif /* MACH_KERNEL_PRIVATE */ extern int fill_procregioninfo(task_t t, uint64_t arg, struct proc_regioninfo_internal *pinfo, uintptr_t *vp, uint32_t *vid); extern int fill_procregioninfo_onlymappedvnodes(task_t t, uint64_t arg, struct proc_regioninfo_internal *pinfo, uintptr_t *vp, uint32_t *vid); +extern int find_region_details(task_t task, vm_map_offset_t offset, uintptr_t *vnodeaddr, uint32_t *vid, uint64_t *start, uint64_t *len); void fill_taskprocinfo(task_t task, struct proc_taskinfo_internal * ptinfo); int fill_taskthreadinfo(task_t task, uint64_t thaddr, bool thuniqueid, struct proc_threadinfo_internal * ptinfo, void *, int *); int fill_taskthreadlist(task_t task, void * buffer, int thcount, bool thuniqueid); @@ -118,5 +122,6 @@ void bsd_getthreadname(void *uth, char* buffer); void bsd_setthreadname(void *uth, const char* buffer); void bsd_threadcdir(void * uth, void *vptr, int *vidp); extern void bsd_copythreadname(void *dst_uth, void *src_uth); +int fill_taskipctableinfo(task_t task, uint32_t *table_size, uint32_t *table_free); #endif /*_SYS_BSDTASK_INFO_H */ diff --git a/bsd/sys/buf.h b/bsd/sys/buf.h index fa96b304c..f46094803 100644 --- a/bsd/sys/buf.h +++ b/bsd/sys/buf.h @@ -1020,6 +1020,22 @@ void buf_markstatic(buf_t bp); */ int buf_static(buf_t bp); +/*! + * @function bufattr_markiosched + * @abstract Mark a buffer as belonging to an io scheduled mount point + * @param bap Buffer attributes to mark. + * @discussion Marks the buffer so that spec_strategy() will know that it belongs to an io scheduled mount point + */ +void bufattr_markioscheduled(bufattr_t bap); + +/*! + * @function bufattr_iosched + * @abstract Check if a buffer is marked as io scheduled + * @param bap Buffer attributes to test. + * @return Nonzero if the buffer is marked io scheduled, 0 otherwise. + */ +int bufattr_ioscheduled(bufattr_t bap); + #ifdef KERNEL_PRIVATE void buf_setfilter(buf_t, void (*)(buf_t, void *), void *, void(**)(buf_t, void *), void **); diff --git a/bsd/sys/buf_internal.h b/bsd/sys/buf_internal.h index c7b206823..2bd3511b2 100644 --- a/bsd/sys/buf_internal.h +++ b/bsd/sys/buf_internal.h @@ -271,8 +271,8 @@ extern vm_offset_t buf_kernel_addrperm; #define BA_ISOCHRONOUS 0x00001000 /* device specific isochronous throughput to media */ #define BA_STRATEGY_TRACKED_IO 0x00002000 /* tracked by spec_strategy */ -#define BA_IO_TIER_UPGRADE 0x00004000 /* effective I/O tier is higher than BA_IO_TIER */ - +#define BA_IO_TIER_UPGRADE 0x00004000 /* effective I/O tier is higher than BA_IO_TIER */ +#define BA_IO_SCHEDULED 0x00008000 /* buf is associated with a mount point that is io scheduled */ #define GET_BUFATTR_IO_TIER(bap) ((bap->ba_flags & BA_IO_TIER_MASK) >> BA_IO_TIER_SHIFT) #define SET_BUFATTR_IO_TIER(bap, tier) \ diff --git a/bsd/sys/cdefs.h b/bsd/sys/cdefs.h index bb42543df..066b91859 100644 --- a/bsd/sys/cdefs.h +++ b/bsd/sys/cdefs.h @@ -152,18 +152,27 @@ #endif /* !NO_ANSI_KEYWORDS */ #endif /* !(__STDC__ || __cplusplus) */ -#define __dead2 __attribute__((noreturn)) -#define __pure2 __attribute__((const)) +#define __dead2 __attribute__((__noreturn__)) +#define __pure2 __attribute__((__const__)) /* __unused denotes variables and functions that may not be used, preventing * the compiler from warning about it if not used. */ -#define __unused __attribute__((unused)) +#define __unused __attribute__((__unused__)) /* __used forces variables and functions to be included even if it appears * to the compiler that they are not used (and would thust be discarded). */ -#define __used __attribute__((used)) +#define __used __attribute__((__used__)) + +/* __cold marks code used for debugging or that is rarely taken + * and tells the compiler to optimize for size and outline code. + */ +#if __has_attribute(cold) +#define __cold __attribute__((__cold__)) +#else +#define __cold +#endif /* __deprecated causes the compiler to produce a warning when encountering * code using the deprecated functionality. @@ -172,14 +181,16 @@ * This may require turning on such warning with the -Wdeprecated flag. * __deprecated_enum_msg() should be used on enums, and compilers that support * it will print the deprecation warning. + * __kpi_deprecated() specifically indicates deprecation of kernel programming + * interfaces in Kernel.framework used by KEXTs. */ -#define __deprecated __attribute__((deprecated)) +#define __deprecated __attribute__((__deprecated__)) #if __has_extension(attribute_deprecated_with_message) || \ (defined(__GNUC__) && ((__GNUC__ >= 5) || ((__GNUC__ == 4) && (__GNUC_MINOR__ >= 5)))) - #define __deprecated_msg(_msg) __attribute__((deprecated(_msg))) + #define __deprecated_msg(_msg) __attribute__((__deprecated__(_msg))) #else - #define __deprecated_msg(_msg) __attribute__((deprecated)) + #define __deprecated_msg(_msg) __attribute__((__deprecated__)) #endif #if __has_extension(enumerator_attributes) @@ -188,10 +199,16 @@ #define __deprecated_enum_msg(_msg) #endif +#if defined(KERNEL) && !defined(KERNEL_PRIVATE) +#define __kpi_deprecated(_msg) __deprecated_msg(_msg) +#else /* !defined(KERNEL) || defined(KERNEL_PRIVATE) */ +#define __kpi_deprecated(_msg) +#endif /* !defined(KERNEL) || defined(KERNEL_PRIVATE) */ + /* __unavailable causes the compiler to error out when encountering * code using the tagged function of variable. */ -#define __unavailable __attribute__((unavailable)) +#define __unavailable __attribute__((__unavailable__)) /* Delete pseudo-keywords wherever they are not available or needed. */ #ifndef __dead @@ -277,6 +294,15 @@ #define __swift_unavailable(_msg) #endif +/* + * __abortlike is the attribute to put on functions like abort() that are + * typically used to mark assertions. These optimize the codegen + * for outlining while still maintaining debugability. + */ +#ifndef __abortlike +#define __abortlike __dead2 __cold __not_tail_called +#endif + /* Declaring inline functions within headers is error-prone due to differences * across various versions of the C language and extensions. __header_inline * can be used to declare inline functions within system headers. In cases @@ -525,6 +551,12 @@ #define __DARWIN_ONLY_UNIX_CONFORMANCE 1 #define __DARWIN_ONLY_VERS_1050 1 #endif /* PLATFORM_BridgeOS */ +#ifdef PLATFORM_DriverKit +/* Platform: DriverKit */ +#define __DARWIN_ONLY_64_BIT_INO_T 1 +#define __DARWIN_ONLY_UNIX_CONFORMANCE 1 +#define __DARWIN_ONLY_VERS_1050 1 +#endif /* PLATFORM_DriverKit */ #ifdef PLATFORM_MacOSX /* Platform: MacOSX */ #define __DARWIN_ONLY_64_BIT_INO_T 0 @@ -691,6 +723,9 @@ #define __DARWIN_EXTSN(sym) __asm("_" __STRING(sym) __DARWIN_SUF_EXTSN) #define __DARWIN_EXTSN_C(sym) __asm("_" __STRING(sym) __DARWIN_SUF_EXTSN __DARWIN_SUF_NON_CANCELABLE) +#if XNU_KERNEL_PRIVATE +#define __XNU_INTERNAL(sym) __asm("_" __STRING(sym) "$XNU_INTERNAL") +#endif /* * symbol release macros @@ -855,6 +890,16 @@ #define _DARWIN_FEATURE_UNIX_CONFORMANCE 3 #endif +#if defined(DRIVERKIT) && !defined(KERNEL) +/* + * __DRIVERKIT_LIBC__ indicates to the C++ standard library headers and + * similar components that only the restricted set of standard C library + * functionality and headers for the DriverKit userspace driver environment + * are available. + */ +#define __DRIVERKIT_LIBC__ 1 +#endif /* defined(DRIVERKIT) && !defined(KERNEL) */ + /* * This macro casts away the qualifier from the variable * @@ -905,13 +950,61 @@ #define __improbable(x) __builtin_expect(!!(x), 0) #endif /* !defined(__probable) && !defined(__improbable) */ -#define __container_of(ptr, type, field) ({ \ - const typeof(((type *)0)->field) *__ptr = (ptr); \ +#if defined(__cplusplus) +#define __container_of(ptr, type, field) __extension__({ \ + const typeof(((type *)nullptr)->field) *__ptr = (ptr); \ + (type *)((uintptr_t)__ptr - offsetof(type, field)); \ + }) +#else +#define __container_of(ptr, type, field) __extension__({ \ + const typeof(((type *)NULL)->field) *__ptr = (ptr); \ (type *)((uintptr_t)__ptr - offsetof(type, field)); \ }) +#endif #endif /* KERNEL || PRIVATE */ #define __compiler_barrier() __asm__ __volatile__("" ::: "memory") +#if __has_attribute(enum_extensibility) +#define __enum_open __attribute__((__enum_extensibility__(open))) +#define __enum_closed __attribute__((__enum_extensibility__(closed))) +#else +#define __enum_open +#define __enum_closed +#endif // __has_attribute(enum_extensibility) + +#if __has_attribute(flag_enum) +#define __enum_options __attribute__((__flag_enum__)) +#else +#define __enum_options +#endif + +/* + * Similar to OS_ENUM/OS_CLOSED_ENUM/OS_OPTIONS/OS_CLOSED_OPTIONS + * + * This provides more advanced type checking on compilers supporting + * the proper extensions, even in C. + */ +#if __has_feature(objc_fixed_enum) || __has_extension(cxx_fixed_enum) || \ + __has_extension(cxx_strong_enums) +#define __enum_decl(_name, _type, ...) \ + typedef enum : _type __VA_ARGS__ __enum_open _name +#define __enum_closed_decl(_name, _type, ...) \ + typedef enum : _type __VA_ARGS__ __enum_closed _name +#define __options_decl(_name, _type, ...) \ + typedef enum : _type __VA_ARGS__ __enum_open __enum_options _name +#define __options_closed_decl(_name, _type, ...) \ + typedef enum : _type __VA_ARGS__ __enum_closed __enum_options _name +#else +#define __enum_decl(_name, _type, ...) \ + typedef _type _name; enum __VA_ARGS__ __enum_open +#define __enum_closed_decl(_name, _type, ...) \ + typedef _type _name; enum __VA_ARGS__ __enum_closed +#define __options_decl(_name, _type, ...) \ + typedef _type _name; enum __VA_ARGS__ __enum_open __enum_options +#define __options_closed_decl(_name, _type, ...) \ + typedef _type _name; enum __VA_ARGS__ __enum_closed __enum_options +#endif + #endif /* !_CDEFS_H_ */ diff --git a/bsd/sys/coalition.h b/bsd/sys/coalition.h index 34a532a9d..147959094 100644 --- a/bsd/sys/coalition.h +++ b/bsd/sys/coalition.h @@ -49,6 +49,7 @@ int coalition_reap(uint64_t cid, uint32_t flags); int coalition_info_resource_usage(uint64_t cid, struct coalition_resource_usage *cru, size_t sz); int coalition_info_set_name(uint64_t cid, const char *name, size_t size); int coalition_info_set_efficiency(uint64_t cid, uint64_t flags); +int coalition_ledger_set_logical_writes_limit(uint64_t cid, int64_t limit); #else /* KERNEL */ @@ -86,25 +87,31 @@ extern int coalitions_get_list(int type, struct procinfo_coalinfo *coal_list, in /* - * coalition_is_leader: - * Determine if a task is a coalition leader. + * task_get_coalition: + * Return the coalition of a task. * * Parameters: * task : The task to investigate * coal_type : The COALITION_TYPE of the coalition to investigate. * Valid types can be found in - * coal : If 'task' is a valid task, and is a member of a coalition - * of type 'coal_type', then 'coal' will be filled in with - * the corresponding coalition_t object. - * NOTE: This will be filled in whether or not the 'task' is - * a leader in the coalition. However, if 'task' is - * not a member of a coalition of type 'coal_type' then - * 'coal' will be filled in with COALITION_NULL. - * NOTE: This can be NULL - * - * Returns: TRUE if 'task' is a coalition leader, FALSE otherwise. + * + * Returns: valid coalition_t or COALITION_NULL + */ +extern coalition_t task_get_coalition(task_t task, int coal_type); + + +/* + * coalition_is_leader: + * Determine if a task is a coalition leader. + * + * Parameters: + * task : The task to investigate + * coal : The coalition to test against. + * NOTE: This can be COALITION_NULL, in case FALSE is returned. + * + * Returns: TRUE if 'task' is the coalition's leader, FALSE otherwise. */ -extern boolean_t coalition_is_leader(task_t task, int coal_type, coalition_t *coal); +extern boolean_t coalition_is_leader(task_t task, coalition_t coal); /* * coalition_get_leader: @@ -203,12 +210,17 @@ coalitions_get_list(__unused int type, return 0; } +static inline coalition_t +coalition_get_leader(__unused task_t task, + __unused int coal_type) +{ + return COALITION_NULL; +} + static inline boolean_t coalition_is_leader(__unused task_t task, - __unused int coal_type, - coalition_t *coal) + __unused coalition_t coal) { - *coal = COALITION_NULL; return FALSE; } diff --git a/bsd/sys/codesign.h b/bsd/sys/codesign.h index 26e3e1f64..25e569fcb 100644 --- a/bsd/sys/codesign.h +++ b/bsd/sys/codesign.h @@ -92,6 +92,9 @@ int csproc_forced_lv(struct proc* p); int cs_system_require_lv(void); uint32_t cs_entitlement_flags(struct proc *p); int cs_entitlements_blob_get(struct proc *, void **, size_t *); +#ifdef KERNEL_PRIVATE +int cs_entitlements_dictionary_copy(struct proc *, void **); +#endif int cs_restricted(struct proc *); uint8_t * cs_get_cdhash(struct proc *); diff --git a/bsd/sys/commpage.h b/bsd/sys/commpage.h index 83871f07f..ccdd50949 100644 --- a/bsd/sys/commpage.h +++ b/bsd/sys/commpage.h @@ -29,6 +29,10 @@ #define _COMMPAGE_H #ifdef PRIVATE + +#define _COMM_PAGE32_SIGNATURE_STRING "commpage 32-bit" +#define _COMM_PAGE64_SIGNATURE_STRING "commpage 64-bit" + typedef volatile struct commpage_timeofday_data { uint64_t TimeStamp_tick; uint64_t TimeStamp_sec; @@ -36,6 +40,7 @@ typedef volatile struct commpage_timeofday_data { uint64_t Ticks_scale; uint64_t Ticks_per_sec; } new_commpage_timeofday_data_t; + #endif #endif diff --git a/bsd/sys/decmpfs.h b/bsd/sys/decmpfs.h index e8f6f3a27..51fbdfdd6 100644 --- a/bsd/sys/decmpfs.h +++ b/bsd/sys/decmpfs.h @@ -76,11 +76,47 @@ enum { #define DECMPFS_XATTR_NAME "com.apple.decmpfs" /* extended attribute to use for decmpfs */ +/* + * This single field is to be interpreted differently depending on the + * corresponding item type. + * For regular files: it is a 64bits-encoded logical size + * For directories: it is a 64bits-encoded number of children (ie st_nlink - 2) + * For packages: it is 40bits encoded size and 24bits number of children at root + */ +typedef struct __attribute__((packed)) { + uint64_t value; +} decmpfs_raw_item_size; + +#define DECMPFS_PKG_SIZE_MASK 0x000000ffffffffffULL +#define DECMPFS_PKG_COUNT_MASK 0xffffff +#define DECMPFS_PKG_CHLD_COUNT_SHIFT 40 + +#define DECMPFS_PKG_SIZE(x) ((x).value & DECMPFS_PKG_SIZE_MASK) +#define DECMPFS_PKG_CHLD_COUNT(x) ((uint32_t)(((x).value >> DECMPFS_PKG_CHLD_COUNT_SHIFT) & DECMPFS_PKG_COUNT_MASK)) +#define DECMPFS_PKG_VALUE_FROM_SIZE_COUNT(size, count) \ + (((size) & DECMPFS_PKG_SIZE_MASK) | ((uint64_t)(count) << DECMPFS_PKG_CHLD_COUNT_SHIFT)) + +/* Dataless file or directory */ +#define DATALESS_CMPFS_TYPE 0x80000001 + +/* Dataless package, with number of root children and total size encoded on disk */ +#define DATALESS_PKG_CMPFS_TYPE 0x80000002 + + +static inline bool +decmpfs_type_is_dataless(uint32_t cmp_type) +{ + return cmp_type == DATALESS_CMPFS_TYPE || cmp_type == DATALESS_PKG_CMPFS_TYPE; +} + typedef struct __attribute__((packed)) { /* this structure represents the xattr on disk; the fields below are little-endian */ uint32_t compression_magic; - uint32_t compression_type; /* see the enum below */ - uint64_t uncompressed_size; + uint32_t compression_type; /* see the enum below */ + union { + uint64_t uncompressed_size; /* compatility accessor */ + decmpfs_raw_item_size _size; + }; unsigned char attr_bytes[0]; /* the bytes of the attribute after the header */ } decmpfs_disk_header; @@ -89,10 +125,38 @@ typedef struct __attribute__((packed)) { uint32_t attr_size; uint32_t compression_magic; uint32_t compression_type; - uint64_t uncompressed_size; + union { + /* + * although uncompressed_size remains available for backward-compatibility reasons + * the uncompressed size and nchildren should be accessed using the inline helpers + * below + */ + uint64_t uncompressed_size; + decmpfs_raw_item_size _size; + }; unsigned char attr_bytes[0]; /* the bytes of the attribute after the header */ } decmpfs_header; +static inline uint64_t +decmpfs_get_uncompressed_size(const decmpfs_header *hdr) +{ + if (hdr->compression_magic == DECMPFS_MAGIC && hdr->compression_type == DATALESS_PKG_CMPFS_TYPE) { + return DECMPFS_PKG_SIZE(hdr->_size); + } + + return hdr->uncompressed_size; +} + +static inline uint32_t +decmpfs_get_directory_entries(const decmpfs_header *hdr) +{ + if (hdr->compression_magic == DECMPFS_MAGIC && hdr->compression_type == DATALESS_PKG_CMPFS_TYPE) { + return DECMPFS_PKG_CHLD_COUNT(hdr->_size); + } + + return (uint32_t)hdr->uncompressed_size; +} + /* compression_type values */ enum { CMP_Type1 = 1,/* uncompressed data in xattr */ @@ -120,6 +184,8 @@ struct decmpfs_cnode { uint32_t lockcount; void *lockowner; /* cnode's lock owner (if a thread is currently holding an exclusive lock) */ uint64_t uncompressed_size __attribute__((aligned(8))); + uint64_t nchildren __attribute__((aligned(8))); /* for dataless directories (incl. packages) */ + uint64_t total_size __attribute__((aligned(8)));/* for dataless directories (incl. packages) */ uint64_t decompression_flags; lck_rw_t compressed_data_lock; }; @@ -156,6 +222,11 @@ void decmpfs_unlock_compressed_data(decmpfs_cnode *cp, int exclusive); uint32_t decmpfs_cnode_get_vnode_state(decmpfs_cnode *cp); void decmpfs_cnode_set_vnode_state(decmpfs_cnode *cp, uint32_t state, int skiplock); uint64_t decmpfs_cnode_get_vnode_cached_size(decmpfs_cnode *cp); +uint64_t decmpfs_cnode_get_vnode_cached_nchildren(decmpfs_cnode *cp); +uint64_t decmpfs_cnode_get_vnode_cached_total_size(decmpfs_cnode *cp); +void decmpfs_cnode_set_vnode_cached_size(decmpfs_cnode *cp, uint64_t size); +void decmpfs_cnode_set_vnode_cached_nchildren(decmpfs_cnode *cp, uint64_t nchildren); +void decmpfs_cnode_set_vnode_cached_total_size(decmpfs_cnode *cp, uint64_t total_sz); uint32_t decmpfs_cnode_cmp_type(decmpfs_cnode *cp); int decmpfs_file_is_compressed(vnode_t vp, decmpfs_cnode *cp); diff --git a/bsd/sys/dirent.h b/bsd/sys/dirent.h index c6e1d8868..497077301 100644 --- a/bsd/sys/dirent.h +++ b/bsd/sys/dirent.h @@ -141,4 +141,17 @@ struct direntry __DARWIN_STRUCT_DIRENTRY; #define DTTOIF(dirtype) ((dirtype) << 12) #endif +#if PRIVATE +/* + * If a buffer at least this size is passed to __getdirentries64, + * the the last 4 bytes will be the flags below. + */ +#define GETDIRENTRIES64_EXTENDED_BUFSIZE 1024 + +__options_decl(getdirentries64_flags_t, unsigned, { + /* the __getdirentries64 returned all entries */ + GETDIRENTRIES64_EOF = 1U << 0, +}); +#endif + #endif /* _SYS_DIRENT_H */ diff --git a/bsd/sys/disk.h b/bsd/sys/disk.h index f0a7a15da..66d317902 100644 --- a/bsd/sys/disk.h +++ b/bsd/sys/disk.h @@ -57,6 +57,8 @@ * DKIOCREQUESTIDLE idle media * DKIOCUNMAP delete unused data * + * DKIOCGETLOCATION get device's physical location + * * DKIOCGETMAXBLOCKCOUNTREAD get maximum block count for reads * DKIOCGETMAXBLOCKCOUNTWRITE get maximum block count for writes * DKIOCGETMAXBYTECOUNTREAD get maximum byte count for reads @@ -135,7 +137,6 @@ typedef struct{ #endif /* !__LP64__ */ } dk_unmap_t; - typedef struct{ uint64_t flags; uint64_t hotfile_size; /* in bytes */ @@ -176,6 +177,8 @@ typedef struct{ char * description; } dk_error_description_t; +#define DK_LOCATION_INTERNAL 0x00000000 +#define DK_LOCATION_EXTERNAL 0x00000001 #ifdef KERNEL #ifdef PRIVATE @@ -203,6 +206,8 @@ typedef struct{ #define DKIOCUNMAP _IOW('d', 31, dk_unmap_t) #define DKIOCCORESTORAGE _IOR('d', 32, dk_corestorage_info_t) +#define DKIOCGETLOCATION _IOR('d', 33, uint64_t) + #define DKIOCGETMAXBLOCKCOUNTREAD _IOR('d', 64, uint64_t) #define DKIOCGETMAXBLOCKCOUNTWRITE _IOR('d', 65, uint64_t) #define DKIOCGETMAXBYTECOUNTREAD _IOR('d', 70, uint64_t) @@ -344,9 +349,9 @@ typedef struct dk_apfs_wbc_range { #endif /* KERNEL */ #ifdef PRIVATE -#if TARGET_OS_EMBEDDED +#if (TARGET_OS_IPHONE && !TARGET_OS_SIMULATOR) #define _DKIOCSETSTATIC _IO('d', 84) -#endif /* TARGET_OS_EMBEDDED */ +#endif /* (TARGET_OS_IPHONE && !TARGET_OS_SIMULATOR) */ #endif /* PRIVATE */ #endif /* _SYS_DISK_H_ */ diff --git a/bsd/sys/domain.h b/bsd/sys/domain.h index 10ada89ca..9d6ee92e5 100644 --- a/bsd/sys/domain.h +++ b/bsd/sys/domain.h @@ -186,11 +186,12 @@ typedef const struct domain_unguard *domain_unguard_t; extern domain_unguard_t domain_unguard_deploy(void); extern void domain_unguard_release(domain_unguard_t); extern struct domain_old *pffinddomain_old(int); +extern struct domain *pffinddomain(int) __XNU_INTERNAL(pffinddomain); #else extern void net_add_domain(struct domain *dp); extern int net_del_domain(struct domain *); -#endif /* XNU_KERNEL_PRIVATE */ extern struct domain *pffinddomain(int); +#endif /* XNU_KERNEL_PRIVATE */ __END_DECLS #endif /* KERNEL_PRIVATE */ #endif /* PRIVATE */ diff --git a/bsd/sys/dtrace.h b/bsd/sys/dtrace.h index 29540c2a3..98d3628dd 100644 --- a/bsd/sys/dtrace.h +++ b/bsd/sys/dtrace.h @@ -34,8 +34,6 @@ #ifndef _SYS_DTRACE_H #define _SYS_DTRACE_H -/* #pragma ident "@(#)dtrace.h 1.37 07/06/05 SMI" */ - #ifdef __cplusplus extern "C" { #endif @@ -73,12 +71,6 @@ extern "C" { #endif #endif -#ifdef KERNEL -#ifndef _KERNEL -#define _KERNEL /* Solaris vs. Darwin */ -#endif -#endif - #if defined(__BIG_ENDIAN__) #if !defined(_BIG_ENDIAN) #define _BIG_ENDIAN /* Solaris vs. Darwin */ @@ -91,6 +83,12 @@ extern "C" { #error Unknown endian-ness #endif +#ifdef KERNEL +#ifndef _KERNEL +#define _KERNEL /* Solaris vs. Darwin */ +#endif +#endif + #include #include #include @@ -286,6 +284,7 @@ typedef enum dtrace_probespec { #define DIF_OP_RLDX 77 /* rldx [r1], rd */ #define DIF_OP_XLATE 78 /* xlate xlrindex, rd */ #define DIF_OP_XLARG 79 /* xlarg xlrindex, rd */ +#define DIF_OP_STRIP 80 /* strip r1, key, rd */ #define DIF_INTOFF_MAX 0xffff /* highest integer table offset */ #define DIF_STROFF_MAX 0xffff /* highest string table offset */ @@ -394,7 +393,10 @@ typedef enum dtrace_probespec { #define DIF_SUBR_INET_NTOA6 43 #define DIF_SUBR_TOUPPER 44 #define DIF_SUBR_TOLOWER 45 -#define DIF_SUBR_MAX 46 /* max subroutine value */ +#define DIF_SUBR_JSON 46 +#define DIF_SUBR_STRTOLL 47 +#define DIF_SUBR_STRIP 48 +#define DIF_SUBR_MAX 48 /* max subroutine value */ /* Apple-specific subroutines */ #if defined(__APPLE__) @@ -412,6 +414,7 @@ typedef uint32_t dif_instr_t; #define DIF_INSTR_R2(i) (((i) >> 8) & 0xff) #define DIF_INSTR_RD(i) ((i) & 0xff) #define DIF_INSTR_RS(i) ((i) & 0xff) +#define DIF_INSTR_IMM2(i) (((i) >> 8) & 0xff) #define DIF_INSTR_LABEL(i) ((i) & 0xffffff) #define DIF_INSTR_VAR(i) (((i) >> 8) & 0xffff) #define DIF_INSTR_INTEGER(i) (((i) >> 8) & 0xffff) @@ -2558,25 +2561,6 @@ extern void dtrace_sync(void); extern void dtrace_toxic_ranges(void (*)(uintptr_t, uintptr_t)); extern void dtrace_xcall(processorid_t, dtrace_xcall_t, void *); -extern int dtrace_safe_defer_signal(void); -extern void dtrace_safe_synchronous_signal(void); - -extern int dtrace_mach_aframes(void); - -#if !defined(__APPLE__) -#if defined(__i386) || defined(__amd64) -extern int dtrace_instr_size(uchar_t *instr); -extern int dtrace_instr_size_isa(uchar_t *, model_t, int *); -extern void dtrace_invop_add(int (*)(uintptr_t, uintptr_t *, uintptr_t)); -extern void dtrace_invop_remove(int (*)(uintptr_t, uintptr_t *, uintptr_t)); -extern void dtrace_invop_callsite(void); -#endif - -#ifdef __sparc -extern int dtrace_blksuword32(uintptr_t, uint32_t *, int); -extern void dtrace_getfsr(uint64_t *); -#endif -#else #if defined(__i386__) || defined(__x86_64__) extern int dtrace_instr_size(uchar_t *instr); extern int dtrace_instr_size_isa(uchar_t *, model_t, int *); @@ -2586,16 +2570,17 @@ extern void *dtrace_invop_callsite_pre; extern void *dtrace_invop_callsite_post; #endif -#if defined(__arm__) || defined(__arm64__) +#if defined(__arm__) extern int dtrace_instr_size(uint32_t instr, int thumb_mode); +#endif +#if defined(__arm__) || defined(__arm64__) extern void dtrace_invop_add(int (*)(uintptr_t, uintptr_t *, uintptr_t)); extern void dtrace_invop_remove(int (*)(uintptr_t, uintptr_t *, uintptr_t)); extern void *dtrace_invop_callsite_pre; extern void *dtrace_invop_callsite_post; #endif - + #undef proc_t -#endif /* __APPLE__ */ #define DTRACE_CPUFLAG_ISSET(flag) \ (cpu_core[CPU->cpu_id].cpuc_dtrace_flags & (flag)) @@ -2610,17 +2595,6 @@ extern void *dtrace_invop_callsite_post; #endif /* _ASM */ -#if !defined(__APPLE__) -#if defined(__i386) || defined(__amd64) - -#define DTRACE_INVOP_PUSHL_EBP 1 -#define DTRACE_INVOP_POPL_EBP 2 -#define DTRACE_INVOP_LEAVE 3 -#define DTRACE_INVOP_NOP 4 -#define DTRACE_INVOP_RET 5 - -#endif -#else #if defined(__i386__) || defined(__x86_64__) #define DTRACE_INVOP_PUSHL_EBP 1 @@ -2639,8 +2613,6 @@ extern void *dtrace_invop_callsite_post; #endif -#endif /* __APPLE__ */ - #ifdef __cplusplus } #endif diff --git a/bsd/sys/dtrace_glue.h b/bsd/sys/dtrace_glue.h index 51d9804c7..be144f4d4 100644 --- a/bsd/sys/dtrace_glue.h +++ b/bsd/sys/dtrace_glue.h @@ -36,12 +36,12 @@ #include #include #include +#include #include #include #include #include #include -#include #if defined(__i386__) || defined(__x86_64__) #include @@ -226,8 +226,6 @@ typedef struct modctl { #define MODCTL_SDT_PROBES_PROVIDED 0x10 // sdt probes have been provided #define MODCTL_SDT_INVALID 0x20 // Module is invalid for sdt probes #define MODCTL_HAS_UUID 0x40 // Module has UUID -#define MODCTL_FBT_PRIVATE_PROBES_PROVIDED 0x80 // fbt private probes have been provided -#define MODCTL_FBT_PROVIDE_PRIVATE_PROBES 0x100 // fbt provider must provide private probes #define MODCTL_FBT_PROVIDE_BLACKLISTED_PROBES 0x200 // fbt provider must provide blacklisted probes #define MODCTL_FBT_BLACKLISTED_PROBES_PROVIDED 0x400 // fbt blacklisted probes have been provided #define MODCTL_IS_STATIC_KEXT 0x800 // module is a static kext @@ -241,16 +239,13 @@ typedef struct modctl { #define MOD_SDT_PROBES_PROVIDED(mod) (mod->mod_flags & MODCTL_SDT_PROBES_PROVIDED) #define MOD_SDT_INVALID(mod) (mod->mod_flags & MODCTL_SDT_INVALID) #define MOD_HAS_UUID(mod) (mod->mod_flags & MODCTL_HAS_UUID) -#define MOD_FBT_PRIVATE_PROBES_PROVIDED(mod) (mod->mod_flags & MODCTL_FBT_PRIVATE_PROBES_PROVIDED) -#define MOD_FBT_PROVIDE_PRIVATE_PROBES(mod) (mod->mod_flags & MODCTL_FBT_PROVIDE_PRIVATE_PROBES) #define MOD_FBT_BLACKLISTED_PROBES_PROVIDED(mod) (mod->mod_flags & MODCTL_FBT_BLACKLISTED_PROBES_PROVIDED) #define MOD_FBT_PROVIDE_BLACKLISTED_PROBES(mod) (mod->mod_flags & MODCTL_FBT_PROVIDE_BLACKLISTED_PROBES) #define MOD_IS_STATIC_KEXT(mod) (mod->mod_flags & MODCTL_IS_STATIC_KEXT) /* Compound accessors */ -#define MOD_FBT_PRIVATE_PROBES_DONE(mod) (MOD_FBT_PRIVATE_PROBES_PROVIDED(mod) || !MOD_FBT_PROVIDE_PRIVATE_PROBES(mod)) #define MOD_FBT_BLACKLISTED_PROBES_DONE(mod) (MOD_FBT_BLACKLISTED_PROBES_PROVIDED(mod) || !MOD_FBT_PROVIDE_BLACKLISTED_PROBES(mod)) -#define MOD_FBT_DONE(mod) ((MOD_FBT_PROBES_PROVIDED(mod) && MOD_FBT_PRIVATE_PROBES_DONE(mod) && MOD_FBT_BLACKLISTED_PROBES_DONE(mod)) || MOD_FBT_INVALID(mod)) +#define MOD_FBT_DONE(mod) ((MOD_FBT_PROBES_PROVIDED(mod) && MOD_FBT_BLACKLISTED_PROBES_DONE(mod)) || MOD_FBT_INVALID(mod)) #define MOD_SDT_DONE(mod) (MOD_SDT_PROBES_PROVIDED(mod) || MOD_SDT_INVALID(mod)) #define MOD_SYMBOLS_DONE(mod) (MOD_FBT_DONE(mod) && MOD_SDT_DONE(mod)) @@ -449,60 +444,6 @@ extern vmem_t *vmem_create(const char *, void *, size_t, size_t, void *, extern void vmem_destroy(vmem_t *); extern void vmem_free(vmem_t *vmp, void *vaddr, size_t size); -/* - * Atomic - */ - -static inline uint8_t -atomic_or_8(uint8_t *addr, uint8_t mask) -{ - return OSBitOrAtomic8(mask, addr); -} - -static inline uint32_t -atomic_and_32( uint32_t *addr, int32_t mask) -{ - return OSBitAndAtomic(mask, addr); -} - -static inline uint32_t -atomic_add_32( uint32_t *theAddress, int32_t theAmount ) -{ - return OSAddAtomic( theAmount, theAddress ); -} - -#if defined(__i386__) || defined(__x86_64__) -static inline void -atomic_add_64( uint64_t *theAddress, int64_t theAmount ) -{ - (void)OSAddAtomic64( theAmount, (SInt64 *)theAddress ); -} -#elif defined(__arm__) -static inline void -atomic_add_64( uint64_t *theAddress, int64_t theAmount ) -{ - // FIXME - // atomic_add_64() is at present only called from fasttrap.c to increment - // or decrement a 64bit counter. Narrow to 32bits since arm has - // no convenient 64bit atomic op. - - (void)OSAddAtomic((int32_t)theAmount, &(((SInt32 *)theAddress)[0])); -} -#elif defined (__arm64__) -static inline void -atomic_add_64( uint64_t *theAddress, int64_t theAmount ) -{ - (void)OSAddAtomic64( theAmount, (SInt64 *)theAddress ); -} -#endif - -static inline uint32_t -atomic_or_32(uint32_t *addr, uint32_t mask) -{ - return OSBitOrAtomic(mask, addr); -} - - /* * Miscellaneous */ @@ -514,7 +455,6 @@ typedef uintptr_t greg_t; /* For dtrace_impl.h prototype of dtrace_getfp() */ #endif extern struct regs *find_user_regs( thread_t thread); extern vm_offset_t dtrace_get_cpu_int_stack_top(void); -extern vm_offset_t max_valid_stack_address(void); /* kern/thread.h */ #define panic_quiesce (panic_active()) @@ -542,13 +482,6 @@ int dtrace_buffer_copyout(const void*, user_addr_t, vm_size_t); */ #define LIT_STRNEQL(s1, lit_s2) (0 == strncmp( (s1), (lit_s2), sizeof((lit_s2)) )) -/* - * Safe counted string compare of a literal against the beginning of a string. Here - * the sizeof() is reduced by 1 so that the trailing null of the literal does not - * participate in the comparison. - */ -#define LIT_STRNSTART(s1, lit_s2) (0 == strncmp( (s1), (lit_s2), sizeof((lit_s2)) - 1 )) - #define KERNELBASE VM_MIN_KERNEL_ADDRESS #endif /* KERNEL_BUILD */ #endif /* _DTRACE_GLUE_H */ diff --git a/bsd/sys/dtrace_impl.h b/bsd/sys/dtrace_impl.h index f463b49e3..cfc07b33f 100644 --- a/bsd/sys/dtrace_impl.h +++ b/bsd/sys/dtrace_impl.h @@ -30,8 +30,6 @@ #ifndef _SYS_DTRACE_IMPL_H #define _SYS_DTRACE_IMPL_H -/* #pragma ident "@(#)dtrace_impl.h 1.23 07/02/16 SMI" */ - #ifdef __cplusplus extern "C" { #endif @@ -1232,6 +1230,7 @@ struct dtrace_state { size_t dts_nretained; /* number of retained enabs */ uint64_t dts_arg_error_illval; uint32_t dts_buf_over_limit; /* number of bufs over dtb_limit */ + uint64_t **dts_rstate; /* per-CPU random state */ }; struct dtrace_provider { @@ -1394,6 +1393,9 @@ extern void dtrace_flush_caches(void); extern void dtrace_copy(uintptr_t, uintptr_t, size_t); extern void dtrace_copystr(uintptr_t, uintptr_t, size_t, volatile uint16_t *); +extern void* dtrace_ptrauth_strip(void*, uint64_t); +extern int dtrace_is_valid_ptrauth_key(uint64_t); + /* * DTrace state handling */ diff --git a/bsd/sys/errno.h b/bsd/sys/errno.h index 74e47d3d5..502fabcd0 100644 --- a/bsd/sys/errno.h +++ b/bsd/sys/errno.h @@ -276,6 +276,7 @@ __END_DECLS #ifdef BSD_KERNEL_PRIVATE #define EREDRIVEOPEN (-6) #define EKEEPLOOKING (-7) +#define EDATALESS (-8) /* used for cvwait error returns to Libc */ #define ECVCERORR 256 #define ECVPERORR 512 diff --git a/bsd/sys/event.h b/bsd/sys/event.h index e8c171fd9..5966311eb 100644 --- a/bsd/sys/event.h +++ b/bsd/sys/event.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2003-2017 Apple Inc. All rights reserved. + * Copyright (c) 2003-2019 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -73,16 +73,17 @@ #define EVFILT_MACHPORT (-8) /* Mach portsets */ #define EVFILT_FS (-9) /* Filesystem events */ #define EVFILT_USER (-10) /* User events */ - /* (-11) unused */ +#ifdef PRIVATE +#define EVFILT_UNUSED_11 (-11) /* (-11) unused */ +#endif #define EVFILT_VM (-12) /* Virtual memory events */ - #ifdef PRIVATE #define EVFILT_SOCK (-13) /* Socket events */ #define EVFILT_MEMORYSTATUS (-14) /* Memorystatus events */ #endif /* PRIVATE */ #define EVFILT_EXCEPT (-15) /* Exception events */ #ifdef PRIVATE -#define EVFILT_WORKLOOP (-17) /* Workloop events */ +#define EVFILT_WORKLOOP (-17) /* Workloop events */ #endif /* PRIVATE */ #define EVFILT_SYSCOUNT 17 @@ -91,46 +92,34 @@ #pragma pack(4) struct kevent { - uintptr_t ident; /* identifier for this event */ - int16_t filter; /* filter for event */ - uint16_t flags; /* general flags */ - uint32_t fflags; /* filter-specific flags */ - intptr_t data; /* filter-specific data */ - void *udata; /* opaque user data identifier */ + uintptr_t ident; /* identifier for this event */ + int16_t filter; /* filter for event */ + uint16_t flags; /* general flags */ + uint32_t fflags; /* filter-specific flags */ + intptr_t data; /* filter-specific data */ + void *udata; /* opaque user data identifier */ }; #ifdef KERNEL_PRIVATE struct user64_kevent { - uint64_t ident; /* identifier for this event */ - int16_t filter; /* filter for event */ - uint16_t flags; /* general flags */ - uint32_t fflags; /* filter-specific flags */ - int64_t data; /* filter-specific data */ - user_addr_t udata; /* opaque user data identifier */ + uint64_t ident; /* identifier for this event */ + int16_t filter; /* filter for event */ + uint16_t flags; /* general flags */ + uint32_t fflags; /* filter-specific flags */ + int64_t data; /* filter-specific data */ + user_addr_t udata; /* opaque user data identifier */ }; struct user32_kevent { - uint32_t ident; /* identifier for this event */ - int16_t filter; /* filter for event */ - uint16_t flags; /* general flags */ - uint32_t fflags; /* filter-specific flags */ - int32_t data; /* filter-specific data */ + uint32_t ident; /* identifier for this event */ + int16_t filter; /* filter for event */ + uint16_t flags; /* general flags */ + uint32_t fflags; /* filter-specific flags */ + int32_t data; /* filter-specific data */ user32_addr_t udata; /* opaque user data identifier */ }; -struct kevent_internal_s { - uint64_t ident; /* identifier for this event */ - int16_t filter; /* filter for event */ - uint16_t flags; /* general flags */ - int32_t qos; /* quality of service */ - uint32_t fflags; /* filter-specific flags */ -// uint32_t xflags; /* extra filter-specific flags */ - int64_t data; /* filter-specific data */ - uint64_t udata; /* opaque user data identifier */ - uint64_t ext[4]; /* filter-specific extensions */ -}; - #endif /* KERNEL_PRIVATE */ #pragma pack() @@ -162,7 +151,6 @@ struct kevent_qos_s { * Type definition for names/ids of dynamically allocated kqueues. */ typedef uint64_t kqueue_id_t; - #endif /* PRIVATE */ #define EV_SET(kevp, a, b, c, d, e, f) do { \ @@ -201,19 +189,19 @@ typedef uint64_t kqueue_id_t; * instead. */ -#define KEVENT_FLAG_STACK_EVENTS 0x000004 /* output events treated as stack (grows down) */ +// was KEVENT_FLAG_STACK_EVENTS 0x000004 #define KEVENT_FLAG_STACK_DATA 0x000008 /* output data allocated as stack (grows down) */ -// 0x000010 +// KEVENT_FLAG_POLL 0x000010 #define KEVENT_FLAG_WORKQ 0x000020 /* interact with the default workq kq */ // KEVENT_FLAG_LEGACY32 0x000040 // KEVENT_FLAG_LEGACY64 0x000080 -// 0x000100 +// KEVENT_FLAG_PROC64 0x000100 #define KEVENT_FLAG_WORKQ_MANAGER 0x000200 /* obsolete */ #define KEVENT_FLAG_WORKLOOP 0x000400 /* interact with the specified workloop kq */ #define KEVENT_FLAG_PARKING 0x000800 /* workq thread is parking */ // KEVENT_FLAG_KERNEL 0x001000 // KEVENT_FLAG_DYNAMIC_KQUEUE 0x002000 -// 0x004000 +// KEVENT_FLAG_NEEDS_END_PROCESSING 0x004000 #define KEVENT_FLAG_WORKLOOP_SERVICER_ATTACH 0x008000 /* obsolete */ #define KEVENT_FLAG_WORKLOOP_SERVICER_DETACH 0x010000 /* obsolete */ #define KEVENT_FLAG_DYNAMIC_KQ_MUST_EXIST 0x020000 /* kq lookup by id must exist */ @@ -222,14 +210,19 @@ typedef uint64_t kqueue_id_t; #ifdef XNU_KERNEL_PRIVATE +#define KEVENT_FLAG_POLL 0x0010 /* Call is for poll() */ #define KEVENT_FLAG_LEGACY32 0x0040 /* event data in legacy 32-bit format */ #define KEVENT_FLAG_LEGACY64 0x0080 /* event data in legacy 64-bit format */ +#define KEVENT_FLAG_PROC64 0x0100 /* proc is 64bits */ #define KEVENT_FLAG_KERNEL 0x1000 /* caller is in-kernel */ #define KEVENT_FLAG_DYNAMIC_KQUEUE 0x2000 /* kqueue is dynamically allocated */ +#define KEVENT_FLAG_NEEDS_END_PROCESSING 0x4000 /* end processing required before returning */ + +#define KEVENT_ID_FLAG_USER (KEVENT_FLAG_WORKLOOP | \ + KEVENT_FLAG_DYNAMIC_KQ_MUST_EXIST | KEVENT_FLAG_DYNAMIC_KQ_MUST_NOT_EXIST) #define KEVENT_FLAG_USER (KEVENT_FLAG_IMMEDIATE | KEVENT_FLAG_ERROR_EVENTS | \ - KEVENT_FLAG_STACK_EVENTS | KEVENT_FLAG_STACK_DATA | \ - KEVENT_FLAG_WORKQ | KEVENT_FLAG_WORKLOOP | \ + KEVENT_FLAG_STACK_DATA | KEVENT_FLAG_WORKQ | KEVENT_FLAG_WORKLOOP | \ KEVENT_FLAG_DYNAMIC_KQ_MUST_EXIST | KEVENT_FLAG_DYNAMIC_KQ_MUST_NOT_EXIST) /* @@ -238,22 +231,24 @@ typedef uint64_t kqueue_id_t; * let kn_fops() get the correct fops for all cases. */ #define EVFILTID_KQREAD (EVFILT_SYSCOUNT) -#define EVFILTID_PIPE_R (EVFILT_SYSCOUNT + 1) -#define EVFILTID_PIPE_W (EVFILT_SYSCOUNT + 2) -#define EVFILTID_PTSD (EVFILT_SYSCOUNT + 3) -#define EVFILTID_SOREAD (EVFILT_SYSCOUNT + 4) -#define EVFILTID_SOWRITE (EVFILT_SYSCOUNT + 5) -#define EVFILTID_SCK (EVFILT_SYSCOUNT + 6) -#define EVFILTID_SOEXCEPT (EVFILT_SYSCOUNT + 7) -#define EVFILTID_SPEC (EVFILT_SYSCOUNT + 8) -#define EVFILTID_BPFREAD (EVFILT_SYSCOUNT + 9) -#define EVFILTID_NECP_FD (EVFILT_SYSCOUNT + 10) -#define EVFILTID_FSEVENT (EVFILT_SYSCOUNT + 13) -#define EVFILTID_VN (EVFILT_SYSCOUNT + 14) -#define EVFILTID_TTY (EVFILT_SYSCOUNT + 16) -#define EVFILTID_PTMX (EVFILT_SYSCOUNT + 17) - -#define EVFILTID_MAX (EVFILT_SYSCOUNT + 18) +#define EVFILTID_PIPE_N (EVFILT_SYSCOUNT + 1) +#define EVFILTID_PIPE_R (EVFILT_SYSCOUNT + 2) +#define EVFILTID_PIPE_W (EVFILT_SYSCOUNT + 3) +#define EVFILTID_PTSD (EVFILT_SYSCOUNT + 4) +#define EVFILTID_SOREAD (EVFILT_SYSCOUNT + 5) +#define EVFILTID_SOWRITE (EVFILT_SYSCOUNT + 6) +#define EVFILTID_SCK (EVFILT_SYSCOUNT + 7) +#define EVFILTID_SOEXCEPT (EVFILT_SYSCOUNT + 8) +#define EVFILTID_SPEC (EVFILT_SYSCOUNT + 9) +#define EVFILTID_BPFREAD (EVFILT_SYSCOUNT + 10) +#define EVFILTID_NECP_FD (EVFILT_SYSCOUNT + 11) +#define EVFILTID_FSEVENT (EVFILT_SYSCOUNT + 15) +#define EVFILTID_VN (EVFILT_SYSCOUNT + 16) +#define EVFILTID_TTY (EVFILT_SYSCOUNT + 17) +#define EVFILTID_PTMX (EVFILT_SYSCOUNT + 18) + +#define EVFILTID_DETACHED (EVFILT_SYSCOUNT + 19) +#define EVFILTID_MAX (EVFILT_SYSCOUNT + 20) #endif /* defined(XNU_KERNEL_PRIVATE) */ @@ -371,6 +366,8 @@ typedef uint64_t kqueue_id_t; * Marks the waiter knote as being eligible to become an owner * This bit can only be set once, trying it again will fail with EALREADY. * + * @const NOTE_WL_SYNC_IPC [in/out] + * The knote is a sync IPC redirected turnstile push. * * Flags/Modifiers: * @@ -402,24 +399,27 @@ typedef uint64_t kqueue_id_t; #define NOTE_WL_THREAD_REQUEST 0x00000001 #define NOTE_WL_SYNC_WAIT 0x00000004 #define NOTE_WL_SYNC_WAKE 0x00000008 -#define NOTE_WL_COMMANDS_MASK 0x0000000f /* Mask of all the [in] commands above */ +#define NOTE_WL_SYNC_IPC 0x80000000 +#define NOTE_WL_COMMANDS_MASK 0x8000000f /* Mask of all the [in] commands above */ #define NOTE_WL_UPDATE_QOS 0x00000010 #define NOTE_WL_END_OWNERSHIP 0x00000020 -#define NOTE_WL_UPDATE_OWNER 0 /* ... compatibility define ... */ #define NOTE_WL_DISCOVER_OWNER 0x00000080 #define NOTE_WL_IGNORE_ESTALE 0x00000100 #define NOTE_WL_UPDATES_MASK 0x000001f0 /* Mask of all the [in] updates above */ +#define NOTE_WL_UPDATE_OWNER 0 /* ... compatibility define ... */ + /* * EVFILT_WORKLOOP ext[] array indexes/meanings. */ #define EV_EXTIDX_WL_LANE 0 /* lane identifier [in: sync waiter] - * [out: thread request] */ + * [out: thread request] */ #define EV_EXTIDX_WL_ADDR 1 /* debounce address [in: NULL==no debounce] */ #define EV_EXTIDX_WL_MASK 2 /* debounce mask [in] */ #define EV_EXTIDX_WL_VALUE 3 /* debounce value [in: not current->ESTALE] - * [out: new/debounce value] */ + * [out: new/debounce value] */ + #endif /* PRIVATE */ /* @@ -532,6 +532,7 @@ enum { #define NOTE_MEMORYSTATUS_PROC_LIMIT_WARN_INACTIVE 0x00000080 /* Used to restrict sending a warn event only once, per inactive limit, soft limit only */ #define NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL_ACTIVE 0x00000100 /* Used to restrict sending a critical event only once per active limit, soft limit only */ #define NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL_INACTIVE 0x00000200 /* Used to restrict sending a critical event only once per inactive limit, soft limit only */ +#define NOTE_MEMORYSTATUS_JETSAM_FG_BAND 0x00000400 /* jetsam is approaching foreground band */ /* * Use this mask to protect the kernel private flags. @@ -547,6 +548,7 @@ typedef enum vm_pressure_level { kVMPressureWarning = 1, kVMPressureUrgent = 2, kVMPressureCritical = 3, + kVMPressureJetsam = 4, /* jetsam approaching FG bands */ } vm_pressure_level_t; #endif /* PRIVATE */ @@ -677,35 +679,30 @@ SLIST_HEAD(klist, knote); MALLOC_DECLARE(M_KQUEUE); #endif +LIST_HEAD(knote_list, knote); TAILQ_HEAD(kqtailq, knote); /* a list of "queued" events */ /* index into various kq queues */ typedef uint8_t kq_index_t; -typedef uint16_t kn_status_t; - -#define KN_ACTIVE 0x0001 /* event has been triggered */ -#define KN_QUEUED 0x0002 /* event is on queue */ -#define KN_DISABLED 0x0004 /* event is disabled */ -#define KN_DROPPING 0x0008 /* knote is being dropped */ -#define KN_LOCKED 0x0010 /* knote is locked (kq_knlocks) */ -#define KN_ATTACHING 0x0020 /* event is pending attach */ -#define KN_STAYACTIVE 0x0040 /* force event to stay active */ -#define KN_DEFERDELETE 0x0080 /* defer delete until re-enabled */ -#define KN_ATTACHED 0x0100 /* currently attached to source */ -#define KN_DISPATCH 0x0200 /* disables as part of deliver */ -#define KN_UDATA_SPECIFIC 0x0400 /* udata is part of matching */ -#define KN_SUPPRESSED 0x0800 /* event is suppressed during delivery */ -#define KN_MERGE_QOS 0x1000 /* f_event() / f_* ran concurrently and - * overrides must merge */ -#define KN_REQVANISH 0x2000 /* requested EV_VANISH */ -#define KN_VANISHED 0x4000 /* has vanished */ -// 0x8000 - -/* combination defines deferred-delete mode enabled */ -#define KN_DISPATCH2 (KN_DISPATCH | KN_UDATA_SPECIFIC) + +/* lskq(1) knows about this type */ +__options_decl(kn_status_t, uint16_t /* 12 bits really */, { + KN_ACTIVE = 0x001, /* event has been triggered */ + KN_QUEUED = 0x002, /* event is on queue */ + KN_DISABLED = 0x004, /* event is disabled */ + KN_DROPPING = 0x008, /* knote is being dropped */ + KN_LOCKED = 0x010, /* knote is locked (kq_knlocks) */ + KN_POSTING = 0x020, /* f_event() in flight */ + KN_STAYACTIVE = 0x040, /* force event to stay active */ + KN_DEFERDELETE = 0x080, /* defer delete until re-enabled */ + KN_MERGE_QOS = 0x100, /* f_event() / f_* ran concurrently and overrides must merge */ + KN_REQVANISH = 0x200, /* requested EV_VANISH */ + KN_VANISHED = 0x400, /* has vanished */ + KN_SUPPRESSED = 0x800, /* event is suppressed during delivery */ +}); #define KNOTE_KQ_BITSIZE 42 -_Static_assert(KNOTE_KQ_BITSIZE >= VM_KERNEL_POINTER_SIGNIFICANT_BITS, +_Static_assert(KNOTE_KQ_BITSIZE > VM_KERNEL_POINTER_SIGNIFICANT_BITS, "Make sure sign extending kn_kq_packed is legit"); struct kqueue; @@ -713,43 +710,82 @@ struct knote { TAILQ_ENTRY(knote) kn_tqe; /* linkage for tail queue */ SLIST_ENTRY(knote) kn_link; /* linkage for search list */ SLIST_ENTRY(knote) kn_selnext; /* klist element chain */ - uintptr_t kn_filtid:8, /* filter id to index filter ops */ - kn_req_index:4, /* requested qos index */ + + kn_status_t kn_status : 12; + uintptr_t kn_qos_index:4, /* in-use qos index */ - kn_qos_override:4, /* qos override index */ + kn_qos_override:3, /* qos override index */ + kn_is_fd:1, /* knote is an fd */ kn_vnode_kqok:1, kn_vnode_use_ofst:1; #if __LP64__ - intptr_t kn_kq_packed : KNOTE_KQ_BITSIZE; + intptr_t kn_kq_packed : KNOTE_KQ_BITSIZE; #else - intptr_t kn_kq_packed; + intptr_t kn_kq_packed; #endif + + /* per filter stash of data (pointer, uint32_t or uint64_t) */ union { - void *kn_hook; - uint64_t kn_hook_data; + void *kn_hook; + uint32_t kn_hook32; + uint64_t kn_hook64; }; - int64_t kn_sdata; /* saved data field */ + + /* per filter pointer to the resource being watched */ union { - struct fileproc *p_fp; /* file data pointer */ - struct proc *p_proc; /* proc pointer */ - struct ipc_mqueue *p_mqueue; /* pset pointer */ - } kn_ptr; - struct kevent_internal_s kn_kevent; - int kn_sfflags; /* saved filter flags */ - int kn_hookid; - uint16_t kn_inuse; /* inuse count */ - kn_status_t kn_status; /* status bits */ - -#define kn_id kn_kevent.ident -#define kn_filter kn_kevent.filter -#define kn_flags kn_kevent.flags -#define kn_qos kn_kevent.qos -#define kn_udata kn_kevent.udata -#define kn_fflags kn_kevent.fflags -#define kn_xflags kn_kevent.xflags -#define kn_data kn_kevent.data -#define kn_ext kn_kevent.ext -#define kn_fp kn_ptr.p_fp + struct fileproc *kn_fp; /* file data pointer */ + struct proc *kn_proc; /* proc pointer */ + struct ipc_mqueue *kn_mqueue; /* pset pointer */ + struct thread_call *kn_thcall; + struct thread *kn_thread; + }; + + /* + * Mimic kevent_qos so that knote_fill_kevent code is not horrid, + * but with subtleties: + * + * - kevent_qos_s::filter is 16bits where ours is 8, and we use the top + * bits to store the real specialized filter. + * knote_fill_kevent* will always force the top bits to 0xff. + * + * - kevent_qos_s::xflags is not kept, kn_sfflags takes its place, + * knote_fill_kevent* will set xflags to 0. + * + * - kevent_qos_s::data is saved as kn_sdata and filters are encouraged + * to use knote_fill_kevent, knote_fill_kevent_with_sdata will copy + * kn_sdata as the output value. + * + * knote_fill_kevent_with_sdata() programatically asserts + * these aliasings are respected. + */ + struct kevent_internal_s { + uint64_t kei_ident; /* identifier for this event */ +#ifdef __LITTLE_ENDIAN__ + int8_t kei_filter; /* filter for event */ + uint8_t kei_filtid; /* actual filter for event */ +#else + uint8_t kei_filtid; /* actual filter for event */ + int8_t kei_filter; /* filter for event */ +#endif + uint16_t kei_flags; /* general flags */ + int32_t kei_qos; /* quality of service */ + uint64_t kei_udata; /* opaque user data identifier */ + uint32_t kei_fflags; /* filter-specific flags */ + uint32_t kei_sfflags; /* knote: saved fflags */ + int64_t kei_sdata; /* knote: filter-specific saved data */ + uint64_t kei_ext[4]; /* filter-specific extensions */ + } kn_kevent; + +#define kn_id kn_kevent.kei_ident +#define kn_filtid kn_kevent.kei_filtid +#define kn_filter kn_kevent.kei_filter +#define kn_flags kn_kevent.kei_flags +#define kn_qos kn_kevent.kei_qos +#define kn_udata kn_kevent.kei_udata +#define kn_fflags kn_kevent.kei_fflags +#define kn_sfflags kn_kevent.kei_sfflags +#define kn_sdata kn_kevent.kei_sdata +#define kn_ext kn_kevent.kei_ext }; static inline struct kqueue * @@ -773,21 +809,25 @@ knote_get_seltype(struct knote *kn) } } -static inline void -knote_set_error(struct knote *kn, int error) -{ - kn->kn_flags |= EV_ERROR; - kn->kn_data = error; -} - -struct filt_process_s { - int fp_fd; - unsigned int fp_flags; - user_addr_t fp_data_out; - user_size_t fp_data_size; - user_size_t fp_data_resid; +struct kevent_ctx_s { + uint64_t kec_data_avail; /* address of remaining data size */ + user_addr_t kec_data_out; /* extra data pointer */ + user_size_t kec_data_size; /* total extra data size */ + user_size_t kec_data_resid; /* residual extra data size */ + uint64_t kec_deadline; /* wait deadline unless KEVENT_FLAG_IMMEDIATE */ + struct fileproc *kec_fp; /* fileproc to pass to fp_drop or NULL */ + int kec_fd; /* fd to pass to fp_drop or -1 */ + + /* the fields below are only set during process / scan */ + int kec_process_nevents; /* user-level event count */ + int kec_process_noutputs; /* number of events output */ + unsigned int kec_process_flags; /* kevent flags, only set for process */ + user_addr_t kec_process_eventlist; /* user-level event list address */ }; -typedef struct filt_process_s *filt_process_data_t; +typedef struct kevent_ctx_s *kevent_ctx_t; + +kevent_ctx_t +kevent_get_context(thread_t thread); /* * Filter operators @@ -955,16 +995,16 @@ struct filterops { bool f_adjusts_qos; /* true if the filter can override the knote */ bool f_extended_codes; /* hooks return extended codes */ - int (*f_attach)(struct knote *kn, struct kevent_internal_s *kev); + int (*f_attach)(struct knote *kn, struct kevent_qos_s *kev); void (*f_detach)(struct knote *kn); int (*f_event)(struct knote *kn, long hint); - int (*f_touch)(struct knote *kn, struct kevent_internal_s *kev); - int (*f_process)(struct knote *kn, struct filt_process_s *data, struct kevent_internal_s *kev); + int (*f_touch)(struct knote *kn, struct kevent_qos_s *kev); + int (*f_process)(struct knote *kn, struct kevent_qos_s *kev); int (*f_peek)(struct knote *kn); /* optional & advanced */ - bool (*f_allow_drop)(struct knote *kn, struct kevent_internal_s *kev); - void (*f_post_register_wait)(struct uthread *uth, struct knote_lock_ctx *ctx, + bool (*f_allow_drop)(struct knote *kn, struct kevent_qos_s *kev); + void (*f_post_register_wait)(struct uthread *uth, struct knote *kn, struct _kevent_register *ss_kr); }; @@ -1026,6 +1066,16 @@ struct filterops { * Valid: f_touch, f_attach, f_event, f_process * Implicit: - * Ignored: f_peek + * + * FILTER_THREADREQ_NODEFEER + * The filter has moved a turnstile priority push away from the current + * thread, preemption has been disabled, and thread requests need to be + * commited before preemption is re-enabled. + * + * + * Valid: f_attach, f_touch + * Implicit: - + * Invalid: f_event, f_process, f_peek */ #define FILTER_ACTIVE 0x00000001 #define FILTER_REGISTER_WAIT 0x00000002 @@ -1036,6 +1086,7 @@ struct filterops { #define FILTER_ADJUST_EVENT_QOS(qos) \ (((qos) << FILTER_ADJUST_EVENT_QOS_SHIFT) | FILTER_ADJUST_EVENT_QOS_BIT) #define FILTER_RESET_EVENT_QOS FILTER_ADJUST_EVENT_QOS_BIT +#define FILTER_THREADREQ_NODEFEER 0x00000080 #define filter_call(_ops, call) \ ((_ops)->f_extended_codes ? (_ops)->call : !!((_ops)->call)) @@ -1048,24 +1099,28 @@ extern void klist_init(struct klist *list); #define KNOTE_ATTACH(list, kn) knote_attach(list, kn) #define KNOTE_DETACH(list, kn) knote_detach(list, kn) -extern void knote(struct klist *list, long hint); -extern int knote_attach(struct klist *list, struct knote *kn); -extern int knote_detach(struct klist *list, struct knote *kn); -extern void knote_vanish(struct klist *list, bool make_active); -extern void knote_link_waitqset_lazy_alloc(struct knote *kn); +extern void knote(struct klist *list, long hint); +extern int knote_attach(struct klist *list, struct knote *kn); +extern int knote_detach(struct klist *list, struct knote *kn); +extern void knote_vanish(struct klist *list, bool make_active); + +extern void knote_set_error(struct knote *kn, int error); +extern int64_t knote_low_watermark(const struct knote *kn) __pure2; +extern void knote_fill_kevent_with_sdata(struct knote *kn, struct kevent_qos_s *kev); +extern void knote_fill_kevent(struct knote *kn, struct kevent_qos_s *kev, int64_t data); + +extern void knote_link_waitqset_lazy_alloc(struct knote *kn); extern boolean_t knote_link_waitqset_should_lazy_alloc(struct knote *kn); -extern int knote_link_waitq(struct knote *kn, struct waitq *wq, uint64_t *reserved_link); -extern int knote_unlink_waitq(struct knote *kn, struct waitq *wq); -extern void knote_fdclose(struct proc *p, int fd); -extern void knote_markstayactive(struct knote *kn); -extern void knote_clearstayactive(struct knote *kn); +extern int knote_link_waitq(struct knote *kn, struct waitq *wq, uint64_t *reserved_link); +extern int knote_unlink_waitq(struct knote *kn, struct waitq *wq); +extern void knote_fdclose(struct proc *p, int fd); +extern void knote_markstayactive(struct knote *kn); +extern void knote_clearstayactive(struct knote *kn); extern const struct filterops *knote_fops(struct knote *kn); -extern void knote_set_error(struct knote *kn, int error); extern struct turnstile *kqueue_turnstile(struct kqueue *); extern struct turnstile *kqueue_alloc_turnstile(struct kqueue *); -int kevent_exit_on_workloop_ownership_leak(thread_t thread); int kevent_proc_copy_uptrs(void *proc, uint64_t *buf, int bufsize); int kevent_copyout_proc_dynkqids(void *proc, user_addr_t ubuf, uint32_t ubufsize, int32_t *nkqueues_out); @@ -1074,6 +1129,15 @@ int kevent_copyout_dynkqinfo(void *proc, kqueue_id_t kq_id, user_addr_t ubuf, int kevent_copyout_dynkqextinfo(void *proc, kqueue_id_t kq_id, user_addr_t ubuf, uint32_t ubufsize, int32_t *nknotes_out); +extern int filt_wlattach_sync_ipc(struct knote *kn); +extern void filt_wldetach_sync_ipc(struct knote *kn); + +extern int kevent_workq_internal(struct proc *p, + user_addr_t changelist, int nchanges, + user_addr_t eventlist, int nevents, + user_addr_t data_out, user_size_t *data_available, + unsigned int flags, int32_t *retval); + #elif defined(KERNEL_PRIVATE) /* !XNU_KERNEL_PRIVATE: kexts still need a klist structure definition */ #include @@ -1083,25 +1147,6 @@ SLIST_HEAD(klist, knote); #endif /* !XNU_KERNEL_PRIVATE && KERNEL_PRIVATE */ -#ifdef KERNEL_PRIVATE -#ifdef PRIVATE - -/* make these private functions available to the pthread kext */ -extern int kevent_qos_internal(struct proc *p, int fd, - user_addr_t changelist, int nchanges, - user_addr_t eventlist, int nevents, - user_addr_t data_out, user_size_t *data_available, - unsigned int flags, int32_t *retval); - -extern int kevent_id_internal(struct proc *p, kqueue_id_t *id, - user_addr_t changelist, int nchanges, - user_addr_t eventlist, int nevents, - user_addr_t data_out, user_size_t *data_available, - unsigned int flags, int32_t *retval); - -#endif /* PRIVATE */ -#endif /* KERNEL_PRIVATE */ - #else /* KERNEL */ #include diff --git a/bsd/sys/eventhandler.h b/bsd/sys/eventhandler.h index 82f2c8439..7934d169e 100644 --- a/bsd/sys/eventhandler.h +++ b/bsd/sys/eventhandler.h @@ -74,7 +74,7 @@ struct eventhandler_lists_ctxt { }; struct eventhandler_entry_arg { - uuid_t ee_fmc_uuid; /* Flow manager UUID */ + uuid_t ee_fm_uuid; /* Flow manager UUID */ uuid_t ee_fr_uuid; /* Flow route UUID */ }; diff --git a/bsd/sys/eventvar.h b/bsd/sys/eventvar.h index e15a1a757..04d31067e 100644 --- a/bsd/sys/eventvar.h +++ b/bsd/sys/eventvar.h @@ -63,8 +63,7 @@ #if defined(XNU_KERNEL_PRIVATE) -typedef int (*kevent_callback_t)(struct kqueue *, struct kevent_internal_s *, void *); -typedef void (*kqueue_continue_t)(struct kqueue *, void *, int); +typedef int (*kevent_callback_t)(struct kevent_qos_s *, struct kevent_ctx_s *); #include #include @@ -80,7 +79,7 @@ typedef void (*kqueue_continue_t)(struct kqueue *, void *, int); * proc fd lock -> kq lock -> kq-waitq-set lock -> thread lock * * WorkQ/WorkLoop kqueues (from above): - * proc fd lock -> kq lock -> kq-request lock -> pthread kext locks -> thread lock + * proc fd lock -> kq lock -> workq lock -> thread lock * * Whenever kqueues interact with source locks, it drops all of its own * locks in exchange for a use-reference on the knote used to synchronize @@ -89,26 +88,18 @@ typedef void (*kqueue_continue_t)(struct kqueue *, void *, int); * * Standard file-based kqueues (from below): * XXX lock -> kq lock -> kq-waitq-set lock -> thread lock - * Standard file-based kqueues with non-kq-aware sources (from below): - * XXX lock -> kq-waitq-set lock -> thread lock * * WorkQ/WorkLoop kqueues (from below): - * XXX lock -> kq lock -> kq-request lock -> pthread kext locks -> thread lock - * WorkQ/WorkLoop kqueues with non-kq-aware sources (from below): - * XXX -> kq-waitq-set lock -> kq-request lock -> pthread kext locks -> thread lock + * XXX lock -> kq lock -> workq lock -> thread lock */ #define KQEXTENT 256 /* linear growth by this amount */ struct knote_lock_ctx { - struct knote *knlc_knote; - thread_t knlc_thread; - // TODO: knlc_turnstile - TAILQ_HEAD(, knote_lock_ctx) knlc_head; - union { - LIST_ENTRY(knote_lock_ctx) knlc_le; - TAILQ_ENTRY(knote_lock_ctx) knlc_tqe; - }; + struct knote *knlc_knote; + thread_t knlc_thread; + uintptr_t knlc_waiters; + LIST_ENTRY(knote_lock_ctx) knlc_link; #if DEBUG || DEVELOPMENT #define KNOTE_LOCK_CTX_UNLOCKED 0 #define KNOTE_LOCK_CTX_LOCKED 1 @@ -124,8 +115,12 @@ LIST_HEAD(knote_locks, knote_lock_ctx); * the stack named `name`. In development kernels, it uses tricks to make sure * not locks was still held when exiting the C-scope that contains this context. */ -__attribute__((noinline, not_tail_called)) -void knote_lock_ctx_chk(struct knote_lock_ctx *ctx); +static inline void +knote_lock_ctx_chk(struct knote_lock_ctx *knlc) +{ + /* evil hackery to make sure no one forgets to unlock */ + assert(knlc->knlc_state == KNOTE_LOCK_CTX_UNLOCKED); +} #define KNOTE_LOCK_CTX(n) \ struct knote_lock_ctx n __attribute__((cleanup(knote_lock_ctx_chk))); \ n.knlc_state = KNOTE_LOCK_CTX_UNLOCKED @@ -134,6 +129,24 @@ void knote_lock_ctx_chk(struct knote_lock_ctx *ctx); struct knote_lock_ctx n #endif + +__options_decl(kq_state_t, uint16_t, { + KQ_SEL = 0x0001, /* select was recorded for kq */ + KQ_SLEEP = 0x0002, /* thread is waiting for events */ + KQ_PROCWAIT = 0x0004, /* thread waiting for processing */ + KQ_KEV32 = 0x0008, /* kq is used with 32-bit events */ + KQ_KEV64 = 0x0010, /* kq is used with 64-bit events */ + KQ_KEV_QOS = 0x0020, /* kq events carry QoS info */ + KQ_WORKQ = 0x0040, /* KQ is bound to process workq */ + KQ_WORKLOOP = 0x0080, /* KQ is part of a workloop */ + KQ_PROCESSING = 0x0100, /* KQ is being processed */ + KQ_DRAIN = 0x0200, /* kq is draining */ + KQ_WAKEUP = 0x0400, /* kq awakened while processing */ + KQ_DYNAMIC = 0x0800, /* kqueue is dynamically managed */ + KQ_R2K_ARMED = 0x1000, /* ast notification armed */ + KQ_HAS_TURNSTILE = 0x2000, /* this kqueue has a turnstile */ +}); + /* * kqueue - common core definition of a kqueue * @@ -145,28 +158,18 @@ struct kqueue { struct { struct waitq_set kq_wqs; /* private waitq set */ lck_spin_t kq_lock; /* kqueue lock */ - uint16_t kq_state; /* state of the kq */ - uint16_t kq_level; /* nesting level of the kq */ + kq_state_t kq_state; /* state of the kq */ + union { + uint16_t kq_waitq_hook;/* prepost hook (kqwl/kqwq) */ + uint16_t kq_level; /* nesting level of the kq */ + }; uint32_t kq_count; /* number of queued events */ struct proc *kq_p; /* process containing kqueue */ struct knote_locks kq_knlocks; /* list of knote locks held */ - lck_spin_t kq_reqlock; /* kqueue request lock */ }; /* make sure struct padding is put before kq_queue */ struct kqtailq kq_queue[0]; /* variable array of queues */ }; -#define KQ_SEL 0x001 /* select was recorded for kq */ -#define KQ_SLEEP 0x002 /* thread is waiting for events */ -#define KQ_PROCWAIT 0x004 /* thread waiting for processing */ -#define KQ_KEV32 0x008 /* kq is used with 32-bit events */ -#define KQ_KEV64 0x010 /* kq is used with 64-bit events */ -#define KQ_KEV_QOS 0x020 /* kq events carry QoS info */ -#define KQ_WORKQ 0x040 /* KQ is bound to process workq */ -#define KQ_WORKLOOP 0x080 /* KQ is part of a workloop */ -#define KQ_PROCESSING 0x100 /* KQ is being processed */ -#define KQ_DRAIN 0x200 /* kq is draining */ -#define KQ_WAKEUP 0x400 /* kq awakened while processing */ -#define KQ_DYNAMIC 0x800 /* kqueue is dynamically managed */ /* * kqfile - definition of a typical kqueue opened as a file descriptor * via the kqueue() system call. @@ -179,40 +182,15 @@ struct kqfile { struct kqtailq kqf_queue; /* queue of woken up knotes */ struct kqtailq kqf_suppressed; /* suppression queue */ struct selinfo kqf_sel; /* parent select/kqueue info */ -}; - #define kqf_wqs kqf_kqueue.kq_wqs #define kqf_lock kqf_kqueue.kq_lock #define kqf_state kqf_kqueue.kq_state #define kqf_level kqf_kqueue.kq_level #define kqf_count kqf_kqueue.kq_count #define kqf_p kqf_kqueue.kq_p - -#define QOS_INDEX_KQFILE 0 /* number of qos levels in a file kq */ - -/* - * kqrequest - per-QoS thread request status - */ -struct kqrequest { - struct workq_threadreq_s kqr_req; /* used when request oustanding */ - struct kqtailq kqr_suppressed; /* Per-QoS suppression queues */ - thread_t kqr_thread; /* thread to satisfy request */ - uint8_t kqr_state; /* KQ/workq interaction state */ -#define KQWL_STAYACTIVE_FIRED_BIT (1 << 0) - uint8_t kqr_wakeup_indexes; /* QoS/override levels that woke */ - uint16_t kqr_dsync_waiters; /* number of dispatch sync waiters */ - kq_index_t kqr_stayactive_qos; /* max QoS of statyactive knotes */ - kq_index_t kqr_override_index; /* highest wakeup override index */ - kq_index_t kqr_qos_index; /* QoS for the thread request */ }; - -#define KQR_WORKLOOP 0x01 /* owner is a workloop */ -#define KQR_THREQUESTED 0x02 /* thread has been requested from workq */ -#define KQR_WAKEUP 0x04 /* wakeup called during processing */ -#define KQR_THOVERCOMMIT 0x08 /* overcommit needed for thread requests */ -#define KQR_R2K_NOTIF_ARMED 0x10 /* ast notifications armed */ -#define KQR_ALLOCATED_TURNSTILE 0x20 /* kqwl_turnstile is allocated */ +#define QOS_INDEX_KQFILE 0 /* number of qos levels in a file kq */ /* * WorkQ kqueues need to request threads to service the triggered @@ -240,17 +218,18 @@ struct kqrequest { * values. */ struct kqworkq { - struct kqueue kqwq_kqueue; - struct kqtailq kqwq_queue[KQWQ_NBUCKETS]; /* array of queues */ - struct kqrequest kqwq_request[KQWQ_NBUCKETS]; /* per-QoS request states */ + struct kqueue kqwq_kqueue; + struct kqtailq kqwq_queue[KQWQ_NBUCKETS]; /* array of queues */ + struct kqtailq kqwq_suppressed[KQWQ_NBUCKETS]; /* Per-QoS suppression queues */ + workq_threadreq_s kqwq_request[KQWQ_NBUCKETS]; /* per-QoS request states */ }; -#define kqwq_wqs kqwq_kqueue.kq_wqs -#define kqwq_lock kqwq_kqueue.kq_lock -#define kqwq_state kqwq_kqueue.kq_state -#define kqwq_level kqwq_kqueue.kq_level -#define kqwq_count kqwq_kqueue.kq_count -#define kqwq_p kqwq_kqueue.kq_p +#define kqwq_wqs kqwq_kqueue.kq_wqs +#define kqwq_lock kqwq_kqueue.kq_lock +#define kqwq_state kqwq_kqueue.kq_state +#define kqwq_waitq_hook kqwq_kqueue.kq_waitq_hook +#define kqwq_count kqwq_kqueue.kq_count +#define kqwq_p kqwq_kqueue.kq_p /* * WorkLoop kqueues need to request a thread to service the triggered @@ -292,16 +271,20 @@ struct kqworkq { * NOTE: "lane" support is TBD. */ struct kqworkloop { - struct kqueue kqwl_kqueue; /* queue of events */ - struct kqtailq kqwl_queue[KQWL_NBUCKETS]; /* array of queues */ - struct kqrequest kqwl_request; /* thread request state */ - lck_mtx_t kqwl_statelock; /* state/debounce lock */ - thread_t kqwl_owner; /* current [sync] owner thread */ - uint32_t kqwl_retains; /* retain references */ - kqueue_id_t kqwl_dynamicid; /* dynamic identity */ - uint64_t kqwl_params; /* additional parameters */ - struct turnstile *kqwl_turnstile; /* turnstile for sync IPC/waiters */ - SLIST_ENTRY(kqworkloop) kqwl_hashlink; /* linkage for search list */ + struct kqueue kqwl_kqueue; /* queue of events */ + struct kqtailq kqwl_queue[KQWL_NBUCKETS]; /* array of queues */ + struct kqtailq kqwl_suppressed; /* Per-QoS suppression queues */ + workq_threadreq_s kqwl_request; /* thread request state */ + lck_spin_t kqwl_statelock; /* state/debounce lock */ + thread_t kqwl_owner; /* current [sync] owner thread */ + uint32_t kqwl_retains; /* retain references */ +#define KQWL_STAYACTIVE_FIRED_BIT (1 << 0) + uint8_t kqwl_wakeup_indexes; /* QoS/override levels that woke */ + kq_index_t kqwl_stayactive_qos; /* max QoS of statyactive knotes */ + kqueue_id_t kqwl_dynamicid; /* dynamic identity */ + uint64_t kqwl_params; /* additional parameters */ + struct turnstile *kqwl_turnstile; /* turnstile for sync IPC/waiters */ + LIST_ENTRY(kqworkloop) kqwl_hashlink; /* linkage for search list */ #if CONFIG_WORKLOOP_DEBUG #define KQWL_HISTORY_COUNT 32 #define KQWL_HISTORY_WRITE_ENTRY(kqwl, ...) ({ \ @@ -328,6 +311,7 @@ struct kqworkloop { unsigned int kqwl_index; #endif // CONFIG_WORKLOOP_DEBUG }; +LIST_HEAD(kqwllist, kqworkloop); typedef union { struct kqueue *kq; @@ -336,26 +320,28 @@ typedef union { struct kqworkloop *kqwl; } __attribute__((transparent_union)) kqueue_t; -SLIST_HEAD(kqlist, kqworkloop); -#define kqwl_wqs kqwl_kqueue.kq_wqs -#define kqwl_lock kqwl_kqueue.kq_lock -#define kqwl_state kqwl_kqueue.kq_state -#define kqwl_level kqwl_kqueue.kq_level -#define kqwl_count kqwl_kqueue.kq_count -#define kqwl_p kqwl_kqueue.kq_p +#define kqwl_wqs kqwl_kqueue.kq_wqs +#define kqwl_lock kqwl_kqueue.kq_lock +#define kqwl_state kqwl_kqueue.kq_state +#define kqwl_waitq_hook kqwl_kqueue.kq_waitq_hook +#define kqwl_count kqwl_kqueue.kq_count +#define kqwl_p kqwl_kqueue.kq_p #define KQ_WORKLOOP_RETAINS_MAX UINT32_MAX -extern void kqueue_threadreq_unbind(struct proc *p, struct kqrequest *kqr); +extern void kqueue_threadreq_unbind(struct proc *p, workq_threadreq_t); // called with the kq req held #define KQUEUE_THREADERQ_BIND_NO_INHERITOR_UPDATE 0x1 extern void kqueue_threadreq_bind(struct proc *p, workq_threadreq_t req, thread_t thread, unsigned int flags); +struct turnstile *kqueue_threadreq_get_turnstile(workq_threadreq_t kqr); + // called with the wq lock held -extern void kqueue_threadreq_bind_prepost(struct proc *p, workq_threadreq_t req, thread_t thread); +extern void kqueue_threadreq_bind_prepost(struct proc *p, workq_threadreq_t req, + struct uthread *uth); // called with no lock held extern void kqueue_threadreq_bind_commit(struct proc *p, thread_t thread); @@ -365,16 +351,17 @@ extern void kqueue_threadreq_cancel(struct proc *p, workq_threadreq_t req); // lock not held as kqwl_params is immutable after creation extern workq_threadreq_param_t kqueue_threadreq_workloop_param(workq_threadreq_t req); -extern struct kqueue *kqueue_alloc(struct proc *, unsigned int); +extern struct kqueue *kqueue_alloc(struct proc *); extern void kqueue_dealloc(struct kqueue *); +extern void kqworkq_dealloc(struct kqworkq *kqwq); extern void knotes_dealloc(struct proc *); extern void kqworkloops_dealloc(struct proc *); -extern int kevent_register(struct kqueue *, struct kevent_internal_s *, - struct knote_lock_ctx *); -extern int kqueue_scan(struct kqueue *, kevent_callback_t, kqueue_continue_t, - void *, struct filt_process_s *, struct timeval *, struct proc *); +extern int kevent_register(struct kqueue *, struct kevent_qos_s *, + struct knote **); +extern int kqueue_scan(struct kqueue *, int flags, + struct kevent_ctx_s *, kevent_callback_t); extern int kqueue_stat(struct kqueue *, void *, int, proc_t); #endif /* XNU_KERNEL_PRIVATE */ diff --git a/bsd/sys/fasttrap.h b/bsd/sys/fasttrap.h index ec2ece46c..7fa981545 100644 --- a/bsd/sys/fasttrap.h +++ b/bsd/sys/fasttrap.h @@ -27,8 +27,6 @@ #ifndef _SYS_FASTTRAP_H #define _SYS_FASTTRAP_H -/* #pragma ident "@(#)fasttrap.h 1.5 06/03/30 SMI" */ - #include #include #include diff --git a/bsd/sys/fasttrap_impl.h b/bsd/sys/fasttrap_impl.h index 863e6037e..109118fb1 100644 --- a/bsd/sys/fasttrap_impl.h +++ b/bsd/sys/fasttrap_impl.h @@ -27,10 +27,6 @@ #ifndef _FASTTRAP_IMPL_H #define _FASTTRAP_IMPL_H -/* - * #pragma ident "@(#)fasttrap_impl.h 1.14 08/04/09 SMI" - */ - #include #include #include diff --git a/bsd/sys/fbt.h b/bsd/sys/fbt.h index a6411a57f..88b365d79 100644 --- a/bsd/sys/fbt.h +++ b/bsd/sys/fbt.h @@ -66,8 +66,9 @@ extern int fbt_invop(uintptr_t, uintptr_t *, uintptr_t); extern void fbt_provide_module(void *, struct modctl *); extern int fbt_enable (void *arg, dtrace_id_t id, void *parg); -extern int fbt_module_excluded(struct modctl*); -extern int fbt_excluded(const char *); +extern bool fbt_module_excluded(struct modctl*); +extern bool fbt_excluded(const char *); +extern void fbt_blacklist_init(void); extern void fbt_provide_probe(struct modctl *ctl, const char *modname, const char *name, machine_inst_t *instr, machine_inst_t *limit); #endif /* _FBT_H */ diff --git a/bsd/sys/fcntl.h b/bsd/sys/fcntl.h index de413f34c..f0f301865 100644 --- a/bsd/sys/fcntl.h +++ b/bsd/sys/fcntl.h @@ -146,6 +146,13 @@ #define AT_SYMLINK_NOFOLLOW 0x0020 /* Act on the symlink itself not the target */ #define AT_SYMLINK_FOLLOW 0x0040 /* Act on target of symlink */ #define AT_REMOVEDIR 0x0080 /* Path refers to directory */ +#if __DARWIN_C_LEVEL >= __DARWIN_C_FULL +#ifdef PRIVATE +#define AT_REMOVEDIR_DATALESS 0x0100 /* Remove a dataless directory without materializing first */ +#endif +#define AT_REALDEV 0x0200 /* Return real device inodes resides on for fstatat(2) */ +#define AT_FDONLY 0x0400 /* Use only the fd and Ignore the path for fstatat(2) */ +#endif #endif #if !defined(_POSIX_C_SOURCE) || defined(_DARWIN_C_SOURCE) @@ -366,7 +373,11 @@ #define F_PUNCHHOLE 99 /* Deallocate a range of the file */ -#define F_TRIM_ACTIVE_FILE 100 /* Trim an active file */ +#define F_TRIM_ACTIVE_FILE 100 /* Trim an active file */ + +#define F_SPECULATIVE_READ 101 /* Synchronous advisory read fcntl for regular and compressed file */ + +#define F_GETPATH_NOFIRMLINK 102 /* return the full path without firmlinks of the fd */ // FS-specific fcntl()'s numbers begin at 0x00010000 and go up #define FCNTL_FS_SPECIFIC_BASE 0x00010000 @@ -618,6 +629,14 @@ typedef struct ftrimactivefile { off_t fta_length; /* IN: size of the region */ } ftrimactivefile_t; +/* fspecread_t used by F_SPECULATIVE_READ */ +typedef struct fspecread { + unsigned int fsr_flags; /* IN: flags word */ + unsigned int reserved; /* to maintain 8-byte alignment */ + off_t fsr_offset; /* IN: start of the region */ + off_t fsr_length; /* IN: size of the region */ +} fspecread_t; + /* fbootstraptransfer_t used by F_READBOOTSTRAP and F_WRITEBOOTSTRAP commands */ typedef struct fbootstraptransfer { diff --git a/bsd/sys/file.h b/bsd/sys/file.h index 123407262..d9f6b1a5c 100644 --- a/bsd/sys/file.h +++ b/bsd/sys/file.h @@ -97,9 +97,11 @@ int file_drop(int); #ifdef KERNEL_PRIVATE int fd_rdwr(int fd, enum uio_rw, uint64_t base, int64_t len, enum uio_seg, off_t offset, int io_flg, int64_t *aresid); +struct fileglob; struct fileproc; struct vnode; int fp_getfvp(struct proc *p, int fd, struct fileproc **resultfp, struct vnode **resultvp); +struct vnode *fg_get_vnode(struct fileglob *fg); #endif /* KERNEL_PRIVATE */ __END_DECLS #endif /* !_SYS_FILE_H_ */ diff --git a/bsd/sys/file_internal.h b/bsd/sys/file_internal.h index fbd615cbd..e5fde0760 100644 --- a/bsd/sys/file_internal.h +++ b/bsd/sys/file_internal.h @@ -76,16 +76,21 @@ #include #include #include +#include struct proc; struct uio; struct knote; -struct kevent_internal_s; +struct kevent_qos_s; #ifdef __APPLE_API_UNSTABLE struct file; +__options_decl(fileproc_vflags_t, unsigned int, { + FPV_NONE = 0, + FPV_DRAIN = 0x01, +}); /* * Kernel descriptor table. @@ -93,7 +98,8 @@ struct file; */ struct fileproc { unsigned int f_flags; - int32_t f_iocount; + _Atomic fileproc_vflags_t f_vflags; + os_refcnt_t f_iocount; struct fileglob * f_fglob; void *f_wset; }; @@ -164,36 +170,37 @@ typedef enum { #define FG_CONFINED 0x200 /* fileglob confined to process, immutably */ #define FG_HAS_OFDLOCK 0x400 /* Has or has had an OFD lock */ +struct fileops { + file_type_t fo_type; /* descriptor type */ + int (*fo_read) (struct fileproc *fp, struct uio *uio, + int flags, vfs_context_t ctx); + int (*fo_write) (struct fileproc *fp, struct uio *uio, + int flags, vfs_context_t ctx); +#define FOF_OFFSET 0x00000001 /* offset supplied to vn_write */ +#define FOF_PCRED 0x00000002 /* cred from proc, not current thread */ + int (*fo_ioctl)(struct fileproc *fp, u_long com, + caddr_t data, vfs_context_t ctx); + int (*fo_select) (struct fileproc *fp, int which, + void *wql, vfs_context_t ctx); + int (*fo_close) (struct fileglob *fg, vfs_context_t ctx); + int (*fo_kqfilter) (struct fileproc *fp, struct knote *, struct kevent_qos_s *); + int (*fo_drain) (struct fileproc *fp, vfs_context_t ctx); +}; + struct fileglob { LIST_ENTRY(fileglob) f_msglist;/* list of active files */ - int32_t fg_flag; /* see fcntl.h */ + int32_t fg_flag; /* see fcntl.h */ int32_t fg_count; /* reference count */ int32_t fg_msgcount; /* references from message queue */ int32_t fg_lflags; /* file global flags */ kauth_cred_t fg_cred; /* credentials associated with descriptor */ - const struct fileops { - file_type_t fo_type; /* descriptor type */ - int (*fo_read) (struct fileproc *fp, struct uio *uio, - int flags, vfs_context_t ctx); - int (*fo_write) (struct fileproc *fp, struct uio *uio, - int flags, vfs_context_t ctx); -#define FOF_OFFSET 0x00000001 /* offset supplied to vn_write */ -#define FOF_PCRED 0x00000002 /* cred from proc, not current thread */ - int (*fo_ioctl)(struct fileproc *fp, u_long com, - caddr_t data, vfs_context_t ctx); - int (*fo_select) (struct fileproc *fp, int which, - void *wql, vfs_context_t ctx); - int (*fo_close) (struct fileglob *fg, vfs_context_t ctx); - int (*fo_kqfilter) (struct fileproc *fp, struct knote *kn, - struct kevent_internal_s *kev, vfs_context_t ctx); - int (*fo_drain) (struct fileproc *fp, vfs_context_t ctx); - } *fg_ops; + const struct fileops *fg_ops; off_t fg_offset; - void *fg_data; /* vnode or socket or SHM or semaphore */ - void *fg_vn_data; /* Per fd vnode data, used for directories */ + void *fg_data; /* vnode or socket or SHM or semaphore */ + void *fg_vn_data; /* Per fd vnode data, used for directories */ lck_mtx_t fg_lock; #if CONFIG_MACF - struct label *fg_label; /* JMM - use the one in the cred? */ + struct label *fg_label; /* JMM - use the one in the cred? */ #endif }; @@ -209,20 +216,32 @@ extern int maxfilesperproc; __BEGIN_DECLS + +/* wrappers for fp->f_ops->fo_... */ int fo_read(struct fileproc *fp, struct uio *uio, int flags, vfs_context_t ctx); int fo_write(struct fileproc *fp, struct uio *uio, int flags, vfs_context_t ctx); int fo_ioctl(struct fileproc *fp, u_long com, caddr_t data, vfs_context_t ctx); int fo_select(struct fileproc *fp, int which, void *wql, vfs_context_t ctx); int fo_close(struct fileglob *fg, vfs_context_t ctx); -int fo_kqfilter(struct fileproc *fp, struct knote *kn, - struct kevent_internal_s *kev, vfs_context_t ctx); +int fo_drain(struct fileproc *fp, vfs_context_t ctx); +int fo_kqfilter(struct fileproc *fp, struct knote *kn, struct kevent_qos_s *kev); + +/* Functions to use for unsupported fileops */ +int fo_no_read(struct fileproc *fp, struct uio *uio, int flags, vfs_context_t ctx); +int fo_no_write(struct fileproc *fp, struct uio *uio, int flags, + vfs_context_t ctx); +int fo_no_ioctl(struct fileproc *fp, u_long com, caddr_t data, vfs_context_t ctx); +int fo_no_select(struct fileproc *fp, int which, void *wql, vfs_context_t ctx); +int fo_no_drain(struct fileproc *fp, vfs_context_t ctx); +int fo_no_kqfilter(struct fileproc *, struct knote *, struct kevent_qos_s *kev); + void fileproc_drain(proc_t, struct fileproc *); int fp_tryswap(proc_t, int fd, struct fileproc *nfp); int fp_drop(struct proc *p, int fd, struct fileproc *fp, int locked); int fp_drop_written(proc_t p, int fd, struct fileproc *fp); int fp_drop_event(proc_t p, int fd, struct fileproc *fp); -int fp_free(struct proc * p, int fd, struct fileproc * fp); +void fp_free(struct proc * p, int fd, struct fileproc * fp); struct kqueue; int fp_getfkq(struct proc *p, int fd, struct fileproc **resultfp, struct kqueue **resultkq); struct psemnode; @@ -242,12 +261,14 @@ int fp_isguarded(struct fileproc *fp, u_int attribs); int fp_guard_exception(proc_t p, int fd, struct fileproc *fp, u_int attribs); int closef_locked(struct fileproc *fp, struct fileglob *fg, struct proc *p); int close_internal_locked(proc_t p, int fd, struct fileproc *fp, int flags); +int fileport_makefd_internal(proc_t p, ipc_port_t port, int uf_flags, int *fd); struct nameidata; struct vnode_attr; int open1(vfs_context_t ctx, struct nameidata *ndp, int uflags, struct vnode_attr *vap, fp_allocfn_t fp_zalloc, void *cra, int32_t *retval); -int kqueue_body(struct proc *p, fp_allocfn_t, void *cra, int32_t *retval); +int chdir_internal(proc_t p, vfs_context_t ctx, struct nameidata *ndp, int per_thread); +int kqueue_internal(struct proc *p, fp_allocfn_t, void *cra, int32_t *retval); void fg_insertuipc(struct fileglob * fg); boolean_t fg_insertuipc_mark(struct fileglob * fg); void fg_removeuipc(struct fileglob * fg); @@ -267,6 +288,8 @@ extern void fg_vn_data_free(void *fgvndata); extern int nameiat(struct nameidata *ndp, int dirfd); extern int falloc_guarded(struct proc *p, struct fileproc **fp, int *fd, vfs_context_t ctx, const guardid_t *guard, u_int attrs); +extern void fileproc_modify_vflags(struct fileproc *fp, fileproc_vflags_t vflags, boolean_t clearflags); +fileproc_vflags_t fileproc_get_vflags(struct fileproc *fp); __END_DECLS #endif /* __APPLE_API_UNSTABLE */ diff --git a/bsd/sys/filedesc.h b/bsd/sys/filedesc.h index 80d91f2e4..aebf4b054 100644 --- a/bsd/sys/filedesc.h +++ b/bsd/sys/filedesc.h @@ -88,14 +88,14 @@ #include struct klist; -struct kqlist; +struct kqwllist; struct filedesc { struct fileproc **fd_ofiles; /* file structures for open files */ lck_mtx_t fd_kqhashlock; /* lock for dynamic kqueue hash */ u_long fd_kqhashmask; /* size of dynamic kqueue hash */ - struct kqlist *fd_kqhash; /* hash table for dynamic kqueues */ - struct kqueue *fd_wqkqueue; /* the workq kqueue */ + struct kqwllist *fd_kqhash; /* hash table for dynamic kqueues */ + struct kqworkq *fd_wqkqueue; /* the workq kqueue */ char *fd_ofileflags; /* per-process open file flags */ struct vnode *fd_cdir; /* current directory */ struct vnode *fd_rdir; /* root directory */ diff --git a/bsd/sys/fsctl.h b/bsd/sys/fsctl.h index 8c7ec89b0..3c2c3783c 100644 --- a/bsd/sys/fsctl.h +++ b/bsd/sys/fsctl.h @@ -159,7 +159,6 @@ typedef struct namespace_handler_data { extern int resolve_nspace_item_ext(struct vnode *vp, uint64_t op, void *arg); -extern int get_nspace_item_status(struct vnode *vp, int32_t *status); #else @@ -216,8 +215,6 @@ int nspace_snapshot_event(vnode_t vp, time_t ctime, uint64_t op_type, void *arg) #define NAMESPACE_HANDLER_EVENT_TYPE_MASK (NAMESPACE_HANDLER_NSPACE_EVENT | NAMESPACE_HANDLER_SNAPSHOT_EVENT | NAMESPACE_HANDLER_TRACK_EVENT) -#define DATALESS_CMPFS_TYPE 0x80000001 - typedef int32_t nspace_handler_info[2]; typedef char fstypename_t[MFSTYPENAMELEN]; @@ -260,6 +257,27 @@ typedef struct disk_conditioner_info { uint32_t segwritecnt; } disk_conditioner_info; +/* + * BSD flags manipulation arguments. + * + * This provides a safe way to update the BSD flags field of an inode, + * which has some user components as well as some system components. + * What it provides is a compare-and-swap operation, whereby the caller + * fetches what the expected flags are, computes the new set, and then + * provides the old along with the new. If the old that's provided matches + * what's actually in the inode, the new value is set. The actual inode + * value is returned to the caller, and expected == actual is how the + * caller can determine that the operation succeeded. + * + * Some BSD flags (e.g. UF_COMPRESSED) can only be manipulated via this + * safe mechanism. + */ +struct fsioc_cas_bsdflags { + uint32_t expected_flags; /* [IN] expected flags */ + uint32_t new_flags; /* [IN] new value to set */ + uint32_t actual_flags; /* [OUT] the actual flags in inode */ +}; + #define FSCTL_SYNC_FULLSYNC (1<<0) /* Flush the data fully to disk, if supported by the filesystem */ #define FSCTL_SYNC_WAIT (1<<1) /* Wait for the sync to complete */ @@ -273,35 +291,16 @@ typedef struct disk_conditioner_info { /* Unsupported - previously FSIOC_WAIT_FOR_SYNC */ #define FSIOC_UNSUPPORTED _IOR('A', 3, int32_t) -#define FSIOC_NAMESPACE_HANDLER_GET _IOW('A', 4, struct namespace_handler_info) -#define FSCTL_NAMESPACE_HANDLER_GET IOCBASECMD(FSIOC_NAMESPACE_HANDLER_GET) - -#define FSIOC_NAMESPACE_HANDLER_UPDATE _IOW('A', 5, nspace_handler_info) -#define FSCTL_NAMESPACE_HANDLER_UPDATE IOCBASECMD(FSIOC_NAMESPACE_HANDLER_UPDATE) - -#define FSIOC_NAMESPACE_HANDLER_UNBLOCK _IOW('A', 6, nspace_handler_info) -#define FSCTL_NAMESPACE_HANDLER_UNBLOCK IOCBASECMD(FSIOC_NAMESPACE_HANDLER_UNBLOCK) - -#define FSIOC_NAMESPACE_HANDLER_CANCEL _IOW('A', 7, nspace_handler_info) -#define FSCTL_NAMESPACE_HANDLER_CANCEL IOCBASECMD(FSIOC_NAMESPACE_HANDLER_CANCEL) - -#define FSIOC_NAMESPACE_HANDLER_SET_SNAPSHOT_TIME _IOW('A', 8, int32_t) -#define FSCTL_NAMESPACE_HANDLER_SET_SNAPSHOT_TIME IOCBASECMD(FSIOC_NAMESPACE_HANDLER_SET_SNAPSHOT_TIME) - -#define FSIOC_OLD_SNAPSHOT_HANDLER_GET _IOW('A', 9, struct namespace_handler_info) -#define FSCTL_OLD_SNAPSHOT_HANDLER_GET IOCBASECMD(FSIOC_OLD_SNAPSHOT_HANDLER_GET) +/* 4 - 9 was used for NAMESPACE handler operation to support dataless file faults + * no and no longer user */ #define FSIOC_SET_FSTYPENAME_OVERRIDE _IOW('A', 10, fstypename_t) #define FSCTL_SET_FSTYPENAME_OVERRIDE IOCBASECMD(FSIOC_SET_FSTYPENAME_OVERRIDE) -#define FSIOC_NAMESPACE_ALLOW_DMG_SNAPSHOT_EVENTS _IOW('A', 11, int32_t) -#define FSCTL_NAMESPACE_ALLOW_DMG_SNAPSHOT_EVENTS IOCBASECMD(FSIOC_NAMESPACE_ALLOW_DMG_SNAPSHOT_EVENTS) - /* 12 was used for TRACKED_HANDLER_GET which has now been removed * as it is no longer used. */ -#define FSIOC_SNAPSHOT_HANDLER_GET_EXT _IOW('A', 13, struct namespace_handler_info_ext) -#define FSCTL_SNAPSHOT_HANDLER_GET_EXT IOCBASECMD(FSIOC_SNAPSHOT_HANDLER_GET_EXT) +/* 13 was used for FSIOC_SNAPSHOT_HANDLER_GET_EXT and now been removed */ /* 14 was used for NAMESPACE_HANDLER_GETDATA which has now been * removed as it is no longer used. */ @@ -321,6 +320,9 @@ typedef struct disk_conditioner_info { #define DISK_CONDITIONER_IOC_SET _IOW('A', 19, disk_conditioner_info) #define DISK_CONDITIONER_FSCTL_SET IOCBASECMD(DISK_CONDITIONER_IOC_SET) +/* Set the value of a file's BSD flags in a safe way. */ +#define FSIOC_CAS_BSDFLAGS _IOWR('A', 20, struct fsioc_cas_bsdflags) + /* Check if a file is only open once (pass zero for the extra arg) */ #define FSIOC_FD_ONLY_OPEN_ONCE _IOWR('A', 21, uint32_t) @@ -346,6 +348,14 @@ typedef struct disk_conditioner_info { #define FSIOC_THAW_EXTENTS _IO('h', 21) #define FSCTL_THAW_EXTENTS IOCBASECMD(FSIOC_THAW_EXTENTS) +/* this FSCTL selector is duplicated in XNU with the intent of making the VFS/generic one the only one eventually */ +#define FIRMLINK_STRUCT_LEN 1032 +typedef struct generic_firmlink { + uint8_t array[FIRMLINK_STRUCT_LEN]; +} generic_firmlink_t; + +#define FSIOC_FIRMLINK_CTL _IOWR ('J', 60, generic_firmlink_t) + #ifndef KERNEL #include diff --git a/bsd/sys/fsevents.h b/bsd/sys/fsevents.h index 8779bc362..4ee5a460a 100644 --- a/bsd/sys/fsevents.h +++ b/bsd/sys/fsevents.h @@ -96,11 +96,11 @@ // These are special bits that be set in the 32-bit mode // field that /dev/fsevents provides. // -#define FSE_MODE_HLINK (1 << 31) // notification is for a hard-link -#define FSE_MODE_LAST_HLINK (1 << 30) // link count == 0 on a hard-link delete -#define FSE_REMOTE_DIR_EVENT (1 << 29) // this is a remotely generated directory-level granularity event -#define FSE_TRUNCATED_PATH (1 << 28) // the path for this item had to be truncated -#define FSE_MODE_CLONE (1 << 27) // notification is for a clone +#define FSE_MODE_HLINK (1U << 31) // notification is for a hard-link +#define FSE_MODE_LAST_HLINK (1U << 30) // link count == 0 on a hard-link delete +#define FSE_REMOTE_DIR_EVENT (1U << 29) // this is a remotely generated directory-level granularity event +#define FSE_TRUNCATED_PATH (1U << 28) // the path for this item had to be truncated +#define FSE_MODE_CLONE (1U << 27) // notification is for a clone // ioctl's on /dev/fsevents typedef struct fsevent_clone_args { diff --git a/bsd/sys/fsgetpath.h b/bsd/sys/fsgetpath.h index bde5ce6e8..75a96e3f4 100644 --- a/bsd/sys/fsgetpath.h +++ b/bsd/sys/fsgetpath.h @@ -45,6 +45,7 @@ #include #include #ifdef __APPLE_API_PRIVATE +#include #include #include #endif /* __APPLE_API_PRIVATE */ @@ -60,6 +61,18 @@ ssize_t fsgetpath(char *, size_t, fsid_t *, uint64_t) __OSX_AVAILABLE(10.13) __I #ifdef PRIVATE #include +#ifndef FSOPT_NOFIRMLINKPATH /* also in attr.h */ +#define FSOPT_NOFIRMLINKPATH 0x00000080 +#endif + +#ifndef FSOPT_ISREALFSID /* also in attr.h */ +#ifdef FSOPT_RETURN_REALDEV +#define FSOPT_ISREALFSID FSOPT_RETURN_REALDEV +#else +#define FSOPT_ISREALFSID 0x00000200 +#endif +#endif /* FSOPT_ISREALFSID */ + #ifdef __APPLE_API_PRIVATE @@ -81,6 +94,8 @@ ssize_t fsgetpath(char *, size_t, fsid_t *, uint64_t) __OSX_AVAILABLE(10.13) __I */ int openbyid_np(fsid_t* fsid, fsobj_id_t* objid, int flags); +ssize_t fsgetpath_ext(char *, size_t, fsid_t *, uint64_t, uint32_t) __OSX_AVAILABLE(10.15) __IOS_AVAILABLE(13.0) __TVOS_AVAILABLE(13.0) __WATCHOS_AVAILABLE(6.0); + #endif /* __APPLE_API_PRIVATE */ #endif /* PRIVATE */ diff --git a/bsd/sys/gmon.h b/bsd/sys/gmon.h index c50bf146a..ef7b40d85 100644 --- a/bsd/sys/gmon.h +++ b/bsd/sys/gmon.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000, 2007 Apple Inc. All rights reserved. + * Copyright (c) 2000-2018 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -221,31 +221,6 @@ extern struct gmonparam _gmonparam; #define GMON_PROF_ERROR 2 #define GMON_PROF_OFF 3 -/* - * Sysctl definitions for extracting profiling information from the kernel. - */ -#define GPROF_STATE 0 /* int: profiling enabling variable */ -#define GPROF_COUNT 1 /* struct: profile tick count buffer */ -#define GPROF_FROMS 2 /* struct: from location hash bucket */ -#define GPROF_TOS 3 /* struct: destination/count structure */ -#define GPROF_GMONPARAM 4 /* struct: profiling parameters (see above) */ - - -/* - * Declarations for various profiling related functions from - * bsd/kern/subr_prof.c - */ -#ifdef GPROF -#ifdef XNU_KERNEL_PRIVATE - -void kmstartup(void); -void cfreemem(caddr_t, int); /* Currently only a stub function. */ -void mcount(uintptr_t, uintptr_t); - -#endif /* XNU_KERNEL_PRIVATE */ -#endif /* GPROF */ - - /* * In order to support more information than in the original mon.out and * gmon.out files there is an alternate gmon.out file format. The alternate diff --git a/bsd/sys/guarded.h b/bsd/sys/guarded.h index 6bd3d8e62..8534410a8 100644 --- a/bsd/sys/guarded.h +++ b/bsd/sys/guarded.h @@ -131,8 +131,15 @@ struct vnguard_set { guardid_t vns_guard; }; +struct vnguard_getattr { + int vga_fd; /* in */ + unsigned vga_attrs; /* out */ + guardid_t vga_guard; /* in */ +}; + #define VNG_SYSC_PING 0 #define VNG_SYSC_SET_GUARD 1 +#define VNG_SYSC_GET_ATTR 2 #define VNG_POLICY_NAME "vnguard" diff --git a/bsd/sys/imageboot.h b/bsd/sys/imageboot.h index 7b0f11d9e..e42f1e39d 100644 --- a/bsd/sys/imageboot.h +++ b/bsd/sys/imageboot.h @@ -28,13 +28,23 @@ #ifndef _IMAGEBOOT_H_ #define _IMAGEBOOT_H_ -int imageboot_needed(void); -void imageboot_setup(void); +typedef enum imageboot_type { + IMAGEBOOT_NONE, + IMAGEBOOT_DMG, + IMAGEBOOT_LOCKER, +} imageboot_type_t; + +imageboot_type_t imageboot_needed(void); +void imageboot_setup(imageboot_type_t type); int imageboot_format_is_valid(const char *root_path); -int imageboot_mount_image(const char *root_path, int height); +int imageboot_mount_image(const char *root_path, int height, imageboot_type_t type); #define IMAGEBOOT_CONTAINER_ARG "container-dmg" #define IMAGEBOOT_ROOT_ARG "root-dmg" #define IMAGEBOOT_AUTHROOT_ARG "auth-root-dmg" +#if CONFIG_LOCKERBOOT +#define IMAGEBOOT_LOCKER_ARG "locker" +#define LOCKERFS_NAME "lockerfs" +#endif #endif diff --git a/bsd/sys/imgact.h b/bsd/sys/imgact.h index 8d5da2872..a0138830e 100644 --- a/bsd/sys/imgact.h +++ b/bsd/sys/imgact.h @@ -118,10 +118,12 @@ struct image_params { void *ip_px_spa; void *ip_px_smpx; /* MAC-specific spawn attrs. */ void *ip_px_persona; /* persona args */ + void *ip_px_pcred_info; /* posix cred args */ void *ip_cs_error; /* codesigning error reason */ uint64_t ip_dyld_fsid; uint64_t ip_dyld_fsobjid; + unsigned int ip_simulator_binary; /* simulator binary flags */ }; /* @@ -139,6 +141,14 @@ struct image_params { #define IMGPF_EXEC 0x00000100 /* exec */ #define IMGPF_HIGH_BITS_ASLR 0x00000200 /* randomize high bits of ASLR slide */ #define IMGPF_IS_64BIT_DATA 0x00000400 /* exec to a 64Bit register state */ +#define IMGPF_DRIVER 0x00000800 /* exec of a driver binary (no LC_MAIN) */ +#define IMGPF_NOJOP 0x80000000 +/* + * Simulator binary flags + */ +#define IMGPF_SB_DEFAULT 0 /* Default value, did not check if it is a simulator binary */ +#define IMGPF_SB_TRUE 1 /* Binary is a simulator binary */ +#define IMGPF_SB_FALSE 2 /* Binary is not a simulator binary */ #endif /* !_SYS_IMGACT */ diff --git a/bsd/sys/kasl.h b/bsd/sys/kasl.h index c3b9b415f..1de38642a 100644 --- a/bsd/sys/kasl.h +++ b/bsd/sys/kasl.h @@ -36,10 +36,6 @@ #endif /* BSD_KERNEL_PRIVATE */ -extern int -kern_asl_msg_va(int level, const char *facility, int num_pairs, - va_list vargs, ...); - extern int kern_asl_msg(int level, const char *facility, int num_pairs, ...); diff --git a/bsd/sys/kauth.h b/bsd/sys/kauth.h index 3a72e0b74..701390408 100644 --- a/bsd/sys/kauth.h +++ b/bsd/sys/kauth.h @@ -507,7 +507,9 @@ kauth_filesec_t kauth_filesec_alloc(int size); void kauth_filesec_free(kauth_filesec_t fsp); extern kauth_scope_t kauth_register_scope(const char *_identifier, kauth_scope_callback_t _callback, void *_idata); extern void kauth_deregister_scope(kauth_scope_t _scope); +__kpi_deprecated("Use EndpointSecurity instead") extern kauth_listener_t kauth_listen_scope(const char *_identifier, kauth_scope_callback_t _callback, void *_idata); +__kpi_deprecated("Use EndpointSecurity instead") extern void kauth_unlisten_scope(kauth_listener_t _scope); extern int kauth_authorize_action(kauth_scope_t _scope, kauth_cred_t _credential, kauth_action_t _action, uintptr_t _arg0, uintptr_t _arg1, uintptr_t _arg2, uintptr_t _arg3); @@ -624,29 +626,29 @@ __END_DECLS /* Actions, also rights bits in an ACE */ #if defined(KERNEL) || defined (_SYS_ACL_H) -#define KAUTH_VNODE_READ_DATA (1<<1) +#define KAUTH_VNODE_READ_DATA (1U<<1) #define KAUTH_VNODE_LIST_DIRECTORY KAUTH_VNODE_READ_DATA -#define KAUTH_VNODE_WRITE_DATA (1<<2) +#define KAUTH_VNODE_WRITE_DATA (1U<<2) #define KAUTH_VNODE_ADD_FILE KAUTH_VNODE_WRITE_DATA -#define KAUTH_VNODE_EXECUTE (1<<3) +#define KAUTH_VNODE_EXECUTE (1U<<3) #define KAUTH_VNODE_SEARCH KAUTH_VNODE_EXECUTE -#define KAUTH_VNODE_DELETE (1<<4) -#define KAUTH_VNODE_APPEND_DATA (1<<5) +#define KAUTH_VNODE_DELETE (1U<<4) +#define KAUTH_VNODE_APPEND_DATA (1U<<5) #define KAUTH_VNODE_ADD_SUBDIRECTORY KAUTH_VNODE_APPEND_DATA -#define KAUTH_VNODE_DELETE_CHILD (1<<6) -#define KAUTH_VNODE_READ_ATTRIBUTES (1<<7) -#define KAUTH_VNODE_WRITE_ATTRIBUTES (1<<8) -#define KAUTH_VNODE_READ_EXTATTRIBUTES (1<<9) -#define KAUTH_VNODE_WRITE_EXTATTRIBUTES (1<<10) -#define KAUTH_VNODE_READ_SECURITY (1<<11) -#define KAUTH_VNODE_WRITE_SECURITY (1<<12) -#define KAUTH_VNODE_TAKE_OWNERSHIP (1<<13) +#define KAUTH_VNODE_DELETE_CHILD (1U<<6) +#define KAUTH_VNODE_READ_ATTRIBUTES (1U<<7) +#define KAUTH_VNODE_WRITE_ATTRIBUTES (1U<<8) +#define KAUTH_VNODE_READ_EXTATTRIBUTES (1U<<9) +#define KAUTH_VNODE_WRITE_EXTATTRIBUTES (1U<<10) +#define KAUTH_VNODE_READ_SECURITY (1U<<11) +#define KAUTH_VNODE_WRITE_SECURITY (1U<<12) +#define KAUTH_VNODE_TAKE_OWNERSHIP (1U<<13) /* backwards compatibility only */ #define KAUTH_VNODE_CHANGE_OWNER KAUTH_VNODE_TAKE_OWNERSHIP /* For Windows interoperability only */ -#define KAUTH_VNODE_SYNCHRONIZE (1<<20) +#define KAUTH_VNODE_SYNCHRONIZE (1U<<20) /* (1<<21) - (1<<24) are reserved for generic rights bits */ @@ -654,13 +656,13 @@ __END_DECLS /* * Authorizes the vnode as the target of a hard link. */ -#define KAUTH_VNODE_LINKTARGET (1<<25) +#define KAUTH_VNODE_LINKTARGET (1U<<25) /* * Indicates that other steps have been taken to authorise the action, * but authorisation should be denied for immutable objects. */ -#define KAUTH_VNODE_CHECKIMMUTABLE (1<<26) +#define KAUTH_VNODE_CHECKIMMUTABLE (1U<<26) /* Action modifiers */ /* @@ -671,7 +673,7 @@ __END_DECLS * * This bit will never be present in an ACE. */ -#define KAUTH_VNODE_ACCESS (1<<31) +#define KAUTH_VNODE_ACCESS (1U<<31) /* * The KAUTH_VNODE_NOIMMUTABLE bit is passed to the callback along with the @@ -681,7 +683,7 @@ __END_DECLS * The system immutable flags are only ignored when the system securelevel * is low enough to allow their removal. */ -#define KAUTH_VNODE_NOIMMUTABLE (1<<30) +#define KAUTH_VNODE_NOIMMUTABLE (1U<<30) /* @@ -692,7 +694,7 @@ __END_DECLS * for an exact match on the last credential to lookup * the component being acted on */ -#define KAUTH_VNODE_SEARCHBYANYONE (1<<29) +#define KAUTH_VNODE_SEARCHBYANYONE (1U<<29) /* @@ -758,7 +760,7 @@ void kprintf(const char *fmt, ...); # endif /* !_FN_KPRINTF */ # define KAUTH_DEBUG_ENABLE # define K_UUID_FMT "%08x:%08x:%08x:%08x" -# define K_UUID_ARG(_u) *(int *)&_u.g_guid[0],*(int *)&_u.g_guid[4],*(int *)&_u.g_guid[8],*(int *)&_u.g_guid[12] +# define K_UUID_ARG(_u) &_u.g_guid_asint[0],&_u.g_guid_asint[1],&_u.g_guid_asint[2],&_u.g_guid_asint[3] # define KAUTH_DEBUG(fmt, args...) do { kprintf("%s:%d: " fmt "\n", __PRETTY_FUNCTION__, __LINE__ , ##args); } while (0) # define KAUTH_DEBUG_CTX(_c) KAUTH_DEBUG("p = %p c = %p", _c->vc_proc, _c->vc_ucred) # define VFS_DEBUG(_ctx, _vp, fmt, args...) \ diff --git a/bsd/sys/kdebug.h b/bsd/sys/kdebug.h index 203b8dc57..03c7af88f 100644 --- a/bsd/sys/kdebug.h +++ b/bsd/sys/kdebug.h @@ -26,10 +26,6 @@ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ -/* - * kdebug.h - kernel_debug definitions - */ - #ifndef BSD_SYS_KDEBUG_H #define BSD_SYS_KDEBUG_H @@ -40,17 +36,12 @@ __BEGIN_DECLS #ifdef __APPLE_API_UNSTABLE -#include -#include - -#ifndef KERNEL -#include -#endif - /* - * Kdebug is a facility for tracing events occurring on a system. + * Kdebug is a kernel facility for tracing events occurring on a system. User + * space processes should prefer os_signpost, instead. * - * All events are tagged with a 32-bit debugid: + * This header defines reserved debugids, which are 32-bit values that describe + * each event: * * +----------------+----------------+----------------------------+----+ * | Class (8) | Subclass (8) | Code (14) |Func| @@ -91,9 +82,9 @@ __BEGIN_DECLS /* Generate an eventid corresponding to Class, SubClass, and Code. */ #define KDBG_EVENTID(Class, SubClass, Code) \ - ((((Class) & 0xff) << KDBG_CLASS_OFFSET) | \ - (((SubClass) & 0xff) << KDBG_SUBCLASS_OFFSET) | \ - (((Code) & 0x3fff) << KDBG_CODE_OFFSET)) + (((unsigned)((Class) & 0xff) << KDBG_CLASS_OFFSET) | \ + ((unsigned)((SubClass) & 0xff) << KDBG_SUBCLASS_OFFSET) | \ + ((unsigned)((Code) & 0x3fff) << KDBG_CODE_OFFSET)) /* Deprecated macro using old naming convention. */ #define KDBG_CODE(Class, SubClass, Code) \ KDBG_EVENTID(Class, SubClass, Code) @@ -107,77 +98,16 @@ __BEGIN_DECLS ((uint16_t)(((Debugid) & KDBG_CSC_MASK) >> KDBG_CSC_OFFSET)) #define KDBG_EXTRACT_CODE(Debugid) \ ((uint16_t)(((Debugid) & KDBG_CODE_MASK) >> KDBG_CODE_OFFSET)) +#define KDBG_CLASS_ENCODE(Class, SubClass) KDBG_EVENTID(Class, SubClass, 0) +#define KDBG_CLASS_DECODE(Debugid) (Debugid & KDBG_CSC_MASK) /* function qualifiers */ -#define DBG_FUNC_START 1 -#define DBG_FUNC_END 2 -#define DBG_FUNC_NONE 0 - -/* - * Definitions to support IOP tracing. - */ - -#ifdef KERNEL_PRIVATE - -typedef enum { - /* Trace is now enabled; no arguments. */ - KD_CALLBACK_KDEBUG_ENABLED, - /* Trace is now disabled; no arguments. */ - KD_CALLBACK_KDEBUG_DISABLED, - /* - * Request the latest entries from the IOP and block until complete; no - * arguments. - */ - KD_CALLBACK_SYNC_FLUSH, - /* - * The typefilter is enabled; a read-only pointer to the typefilter is - * provided, valid only while in the callback. - */ - KD_CALLBACK_TYPEFILTER_CHANGED, -} kd_callback_type; -typedef void (*kd_callback_fn) (void* context, kd_callback_type reason, void* arg); - -struct kd_callback { - kd_callback_fn func; - void *context; - /* name of IOP, NUL-terminated */ - char iop_name[8]; -}; - -typedef struct kd_callback kd_callback_t; - -/* - * Registers an IOP for participation in tracing. - * - * The registered callback function will be called with the - * supplied context as the first argument, followed by a - * kd_callback_type and an associated void* argument. - * - * The return value is a nonzero coreid that shall be used in - * kernel_debug_enter() to refer to your IOP. If the allocation - * failed, then 0 will be returned. - * - * Caveats: - * Note that not all callback calls will indicate a change in - * state (e.g. disabling trace twice would send two disable - * notifications). - */ -extern int kernel_debug_register_callback(kd_callback_t callback); - -extern void kernel_debug_enter( - uint32_t coreid, - uint32_t debugid, - uint64_t timestamp, - uintptr_t arg1, - uintptr_t arg2, - uintptr_t arg3, - uintptr_t arg4, - uintptr_t threadid - ); - -#endif /* KERNEL_PRIVATE */ +#define DBG_FUNC_START 1U +#define DBG_FUNC_END 2U +#define DBG_FUNC_NONE 0U /* The Kernel Debug Classes */ + #define DBG_MACH 1 #define DBG_NETWORK 2 #define DBG_FSYSTEM 3 @@ -196,6 +126,7 @@ extern void kernel_debug_enter( #define DBG_QT 32 #define DBG_APPS 33 #define DBG_LAUNCHD 34 +#define DBG_SILICON 35 #define DBG_PERF 37 #define DBG_IMPORTANCE 38 #define DBG_BANK 40 @@ -209,149 +140,8 @@ extern void kernel_debug_enter( #define DBG_UMALLOC 51 #define DBG_TURNSTILE 53 - #define DBG_MIG 255 -#ifdef PRIVATE - -/* - * Private kdebug userspace API - */ -#ifndef KERNEL -#include - -/* - * OS components can use the full precision of the "code" field - * (Class, SubClass, Code) to inject events using kdebug_trace() by - * using: - * - * kdebug_trace(KDBG_CODE(DBG_XPC, 15, 1) | DBG_FUNC_NONE, 1, 2, 3, 4); - * - * These trace points can be included in production code, since they - * use reserved, non-overlapping ranges. The performance impact when - * kernel tracing is not enabled is minimal. Classes can be reserved - * by filing a Radar in xnu|all. - * - * 64-bit arguments may be truncated if the system is using a 32-bit - * kernel. - * - * On error, -1 will be returned and errno will indicate the error. - */ -extern int kdebug_trace( - uint32_t code, - uint64_t arg1, - uint64_t arg2, - uint64_t arg3, - uint64_t arg4) -__OSX_AVAILABLE(10.10.2) __IOS_AVAILABLE(8.2); - -/*! - * @function kdebug_trace_string - * - * @discussion - * This function emits strings to kdebug trace along with an ID and allows - * for previously-traced strings to be overwritten and invalidated. - * - * To start tracing a string and generate an ID to use to refer to it: - * - * string_id = kdebug_trace_string(debugid, 0, "string"); - * - * To replace a string previously traced: - * - * string_id = kdebug_trace_string(debugid, string_id, "new string"); - * - * To invalidate a string ID: - * - * string_id = kdebug_trace_string(debugid, string_id, NULL); - * - * To check for errors: - * - * if ((int64_t)string_id == -1) { perror("string error") } - * - * @param debugid - * The `debugid` to check if its enabled before tracing and include as - * an argument in the event containing the string. - * - * Some classes or subclasses are reserved for specific uses and are not - * allowed to be used with this function. No function qualifiers are - * allowed on `debugid`. - * - * @param str_id - * When 0, a new ID will be generated and returned if tracing is - * enabled. - * - * Otherwise `str_id` must contain an ID that was previously generated - * with this function. Clents should pass NULL in `str` if `str_id` - * is no longer in use. Otherwise, the string previously mapped to - * `str_id` will be overwritten with the contents of `str`. - * - * @param str - * A NUL-terminated 'C' string containing the characters that should be - * traced alongside `str_id`. - * - * If necessary, the string will be truncated at an - * implementation-defined length. The string must not be the empty - * string, but can be NULL if a valid `str_id` is provided. - * - * @return - * 0 if tracing is disabled or `debugid` is being filtered out of trace. - * It can also return (int64_t)-1 if an error occured. Otherwise, - * it returns the ID to use to refer to the string in future - * kdebug_trace(2) calls. - * - * The errors that can occur are: - * - * EINVAL - * There are function qualifiers on `debugid`, `str` is empty, or - * `str_id` was not generated by this function. - * EPERM - * The `debugid`'s class or subclass is reserved for internal use. - * EFAULT - * `str` is an invalid address or NULL when `str_id` is 0. - */ -extern uint64_t kdebug_trace_string(uint32_t debugid, uint64_t str_id, - const char *str) -__OSX_AVAILABLE(10.11) __IOS_AVAILABLE(9.0); - -/* - * Although the performance impact of kdebug_trace() when kernel - * tracing is not enabled is minimal, it may require the caller to - * perform an expensive calculation/summarization. This cost can be - * skipped by checking the kdebug_is_enabled() predicate: - * - * if (kdebug_is_enabled(KDBG_CODE(DBG_XPC, 15, 1))) { - * uint64_t arg1 = ...; - * uint64_t arg2 = ...; - * kdebug_trace(KDBG_CODE(DBG_XPC, 15, 1) | DBG_FUNC_NONE, arg1, arg2, 0, 0); - * } - * - * If tracing is enabled for the code at the time of the check, 1 - * will be returned. Otherwise, 0 will be returned. - */ -extern bool kdebug_is_enabled(uint32_t code) -__OSX_AVAILABLE(10.12) __IOS_AVAILABLE(10.0) -__WATCHOS_AVAILABLE(3.0) __TVOS_AVAILABLE(10.0); - -/* - * Returns a pointer to the userspace typefilter, if one is available. - * May return NULL. - */ -extern void *kdebug_typefilter(void) -__OSX_AVAILABLE(10.12) __IOS_AVAILABLE(10.0) -__WATCHOS_AVAILABLE(3.0) __TVOS_AVAILABLE(10.0); - -#endif /* !KERNEL (Private kdebug userspace API) */ -#endif /* PRIVATE */ - -#ifdef XNU_KERNEL_PRIVATE -/* Used in early boot to log strings spanning only a single tracepoint. */ -extern void kernel_debug_string_early(const char *message); -/* Used to trace strings within kdebug tracepoints on arbitrary eventids. */ -extern void kernel_debug_string_simple(uint32_t eventid, const char *str); -/* Only used by ktrace to reset kdebug. ktrace_lock must be held. */ -extern void kdebug_reset(void); -#endif /* XNU_KERNEL_PRIVATE */ - /* **** The Kernel Debug Sub Classes for Mach (DBG_MACH) **** */ #define DBG_MACH_EXCP_KTRAP_x86 0x02 /* Kernel Traps on x86 */ #define DBG_MACH_EXCP_DFLT 0x03 /* Data Translation Fault */ @@ -387,6 +177,7 @@ extern void kdebug_reset(void); #define DBG_MACH_THREAD_GROUP 0xA6 /* Thread groups */ #define DBG_MACH_COALITION 0xA7 /* Coalitions */ #define DBG_MACH_SHAREDREGION 0xA8 /* Shared region */ +#define DBG_MACH_SCHED_CLUTCH 0xA9 /* Clutch scheduler */ #define DBG_MACH_IO 0xAA /* I/O */ /* Codes for DBG_MACH_IO */ @@ -456,10 +247,20 @@ extern void kdebug_reset(void); #define MACH_AMP_SIGNAL_SPILL 0x32 /* AMP spill signal sent to cpuid */ #define MACH_AMP_STEAL 0x33 /* AMP thread stolen or spilled */ #define MACH_SCHED_LOAD_EFFECTIVE 0x34 /* Effective scheduler load */ -#define MACH_PROMOTED 0x35 /* thread promoted due to mutex priority promotion */ -#define MACH_UNPROMOTED 0x36 /* thread unpromoted due to mutex priority promotion */ -#define MACH_PROMOTED_UPDATE 0x37 /* thread already promoted, but promotion priority changed */ +/* unused MACH_PROMOTED 0x35 was: thread promoted due to mutex priority promotion */ +/* unused MACH_UNPROMOTED 0x36 was: thread unpromoted due to mutex priority promotion */ +/* unused MACH_PROMOTED_UPDATE 0x37 was: thread already promoted, but promotion priority changed */ #define MACH_QUIESCENT_COUNTER 0x38 /* quiescent counter tick */ +#define MACH_TURNSTILE_USER_CHANGE 0x39 /* base priority change because of turnstile */ +#define MACH_AMP_RECOMMENDATION_CHANGE 0x3a /* Thread group recommendation change */ +#define MACH_TURNSTILE_KERNEL_CHANGE 0x40 /* sched priority change because of turnstile */ + +/* Codes for Clutch Scheduler (DBG_MACH_SCHED_CLUTCH) */ +#define MACH_SCHED_CLUTCH_ROOT_BUCKET_STATE 0x0 +#define MACH_SCHED_CLUTCH_TG_BUCKET_STATE 0x1 +#define MACH_SCHED_CLUTCH_THREAD_SELECT 0x2 +#define MACH_SCHED_CLUTCH_THREAD_STATE 0x3 +#define MACH_SCHED_CLUTCH_TG_BUCKET_PRI 0x4 /* Variants for MACH_MULTIQ_DEQUEUE */ #define MACH_MULTIQ_BOUND 1 @@ -491,7 +292,8 @@ extern void kdebug_reset(void); #define MACH_IPC_VOUCHER_DESTROY 0x9 /* Voucher removed from global voucher hashtable */ #define MACH_IPC_KMSG_INFO 0xa /* Send/Receive info for a kmsg */ #define MACH_IPC_KMSG_LINK 0xb /* link a kernel kmsg pointer to user mach_msg_header_t */ -#define MACH_IPC_PORT_ENTRY_MODIFY 0xc /* A port space gained or lost a port right (reference) */ +#define MACH_IPC_PORT_ENTRY_MODIFY 0xc /* A port space gained or lost a port right (reference) */ +#define MACH_IPC_DESTROY_GUARDED_DESC 0xd /* Unable to receive a guarded descriptor */ /* Codes for thread groups (DBG_MACH_THREAD_GROUP) */ #define MACH_THREAD_GROUP_NEW 0x0 @@ -530,6 +332,7 @@ extern void kdebug_reset(void); #define PMAP__SWITCH 0x12 #define PMAP__TTE 0x13 #define PMAP__SWITCH_USER_TTB 0x14 +#define PMAP__UPDATE_CACHING 0x15 /* Codes for clock (DBG_MACH_CLOCK) */ #define MACH_EPOCH_CHANGE 0x0 /* wake epoch change */ @@ -654,6 +457,12 @@ extern void kdebug_reset(void); #define DBG_HIBERNATE 51 /* hibernation related events */ #define DBG_IOTHUNDERBOLT 52 /* Thunderbolt */ #define DBG_BOOTER 53 /* booter related events */ +#define DBG_IOAUDIO2 54 /* Audio (extended) */ + +#define DBG_IOSURFACEPA 64 /* IOSurface page mappings */ +#define DBG_IOMDPA 65 /* IOMemoryDescriptor page mappings */ +#define DBG_IODARTPA 66 /* DART page mappings */ +/* **** 67-79 reserved for physical address mapping information **** */ /* Backwards compatibility */ #define DBG_IOPOINTING DBG_IOHID /* OBSOLETE: Use DBG_IOHID instead */ @@ -686,6 +495,9 @@ extern void kdebug_reset(void); #define DBG_DRVSMC 25 /* System Management Controller */ #define DBG_DRVMACEFIMANAGER 26 /* Mac EFI Manager */ #define DBG_DRVANE 27 /* ANE */ +#define DBG_DRVETHERNET 28 /* Ethernet */ +#define DBG_DRVMCC 29 /* Memory Cache Controller */ +#define DBG_DRVACCESSORY 30 /* Accessories */ /* Backwards compatibility */ #define DBG_DRVPOINTING DBG_DRVHID /* OBSOLETE: Use DBG_DRVHID instead */ @@ -698,12 +510,7 @@ extern void kdebug_reset(void); #define DBG_DLIL_PR_FLT 4 /* DLIL Protocol Filter */ #define DBG_DLIL_IF_FLT 5 /* DLIL Interface FIlter */ - -/* - * The Kernel Debug Sub Classes for File System (DBG_FSYSTEM) - * - * Please NOTE: sub class values 0xC and 0xD are currently unused. - */ +/* The Kernel Debug Sub Classes for File System (DBG_FSYSTEM) */ #define DBG_FSRW 0x1 /* reads and writes to the filesystem */ #define DBG_DKRW 0x2 /* reads and writes to the disk */ #define DBG_FSVN 0x3 /* vnode operations (inc. locking/unlocking) */ @@ -720,6 +527,8 @@ extern void kdebug_reset(void); #define DBG_ACFS 0x10 /* Xsan-specific events; see the XsanFS project */ #define DBG_THROTTLE 0x11 /* I/O Throttling events */ #define DBG_DECMP 0x12 /* Decmpfs-specific events */ +#define DBG_VFS 0x13 /* VFS layer events */ +#define DBG_LIVEFS 0x14 /* LiveFS events; see the UserFS project */ #define DBG_CONTENT_PROT 0xCF /* Content Protection Events: see bsd/sys/cprotect.h */ /* @@ -756,7 +565,7 @@ extern void kdebug_reset(void); #define BSD_MEMSTAT_JETSAM 2 /* LRU jetsam */ #define BSD_MEMSTAT_JETSAM_HIWAT 3 /* highwater jetsam */ #define BSD_MEMSTAT_FREEZE 4 /* freeze process */ -#define BSD_MEMSTAT_LATENCY_COALESCE 5 /* delay imposed to coalesce jetsam reports */ +#define BSD_MEMSTAT_FREEZE_SCAN 5 /* select a process to freeze and freeze it */ #define BSD_MEMSTAT_UPDATE 6 /* priority update */ #define BSD_MEMSTAT_IDLE_DEMOTE 7 /* idle demotion fired */ #define BSD_MEMSTAT_CLEAR_ERRORS 8 /* reset termination error state */ @@ -769,6 +578,9 @@ extern void kdebug_reset(void); #define BSD_MEMSTAT_CHANGE_PRIORITY 14 /* priority changed */ #endif /* PRIVATE */ #define BSD_MEMSTAT_FAST_JETSAM 15 /* Aggressive jetsam ("clear-the-deck") */ +#define BSD_MEMSTAT_COMPACTOR_RUN 16 /* run VM compactor after process kill */ +#define BSD_MEMSTAT_FREEZE_DISABLE 17 /* disable freeze and kill frozen processes */ +#define BSD_MEMSTAT_RELAUNCH_FLAGS 18 /* flags representing jetsam behavior; based on launchd data */ /* Codes for BSD subcode class DBG_BSD_KEVENT */ #define BSD_KEVENT_KQ_PROCESS_BEGIN 1 @@ -833,9 +645,11 @@ extern void kdebug_reset(void); #define DBG_MT_TMPCPU 0xff /* The Kernel Debug Sub Classes for DBG_MISC */ -#define DBG_EVENT 0x10 -#define DBG_MISC_LAYOUT 0x1a -#define DBG_BUFFER 0x20 +#define DBG_EVENT 0x10 +#define DBG_MISC_INSTRUMENTS 0x11 +#define DBG_MISC_INSTRUMENTSBT 0x12 +#define DBG_MISC_LAYOUT 0x1a +#define DBG_BUFFER 0x20 /* The Kernel Debug Sub Classes for DBG_DYLD */ #define DBG_DYLD_UUID (5) @@ -890,21 +704,57 @@ extern void kdebug_reset(void); #define IO_THROTTLE_DISABLE 0x3 #define IO_TIER_UPL_MISMATCH 0x4 - /* Subclasses for MACH Importance Policies (DBG_IMPORTANCE) */ /* TODO: Split up boost and task policy? */ -#define IMP_ASSERTION 0x10 /* Task takes/drops a boost assertion */ -#define IMP_BOOST 0x11 /* Task boost level changed */ -#define IMP_MSG 0x12 /* boosting message sent by donating task on donating port */ -#define IMP_WATCHPORT 0x13 /* port marked as watchport, and boost was transferred to the watched task */ -#define IMP_TASK_SUPPRESSION 0x17 /* Task changed suppression behaviors */ -#define IMP_TASK_APPTYPE 0x18 /* Task launched with apptype */ -#define IMP_UPDATE 0x19 /* Requested -> effective calculation */ -#define IMP_USYNCH_QOS_OVERRIDE 0x1A /* Userspace synchronization applied QoS override to resource owning thread */ -#define IMP_DONOR_CHANGE 0x1B /* The iit_donor bit changed */ -#define IMP_MAIN_THREAD_QOS 0x1C /* The task's main thread QoS was set */ -#define IMP_SYNC_IPC_QOS 0x1D /* Sync IPC QOS override */ -/* DBG_IMPORTANCE subclasses 0x20 - 0x3F reserved for task policy flavors */ +#define IMP_ASSERTION 0x10 /* Task takes/drops a boost assertion */ +#define IMP_BOOST 0x11 /* Task boost level changed */ +#define IMP_MSG 0x12 /* boosting message sent by donating task on donating port */ +#define IMP_WATCHPORT 0x13 /* port marked as watchport, and boost was transferred to the watched task */ +#define IMP_TASK_SUPPRESSION 0x17 /* Task changed suppression behaviors */ +#define IMP_TASK_APPTYPE 0x18 /* Task launched with apptype */ +#define IMP_UPDATE 0x19 /* Requested -> effective calculation */ +#define IMP_USYNCH_QOS_OVERRIDE 0x1A /* Userspace synchronization applied QoS override to resource owning thread */ +#define IMP_DONOR_CHANGE 0x1B /* The iit_donor bit changed */ +#define IMP_MAIN_THREAD_QOS 0x1C /* The task's main thread QoS was set */ +#define IMP_SYNC_IPC_QOS 0x1D /* Sync IPC QOS override */ +/* DBG_IMPORTANCE subclasses 0x20 - 0x3F are reserved for task policy flavors */ + +/* thread and task attributes */ +#define IMP_TASK_POLICY_DARWIN_BG 0x21 +#define IMP_TASK_POLICY_IOPOL 0x22 +#define IMP_TASK_POLICY_IO 0x23 +#define IMP_TASK_POLICY_PASSIVE_IO 0x24 + +/* task only attributes */ +#define IMP_TASK_POLICY_DARWIN_BG_IOPOL 0x27 +#define IMP_TASK_POLICY_TAL 0x28 +#define IMP_TASK_POLICY_BOOST 0x29 +#define IMP_TASK_POLICY_ROLE 0x2A +/* unused 0x2B */ +#define IMP_TASK_POLICY_TERMINATED 0x2C +#define IMP_TASK_POLICY_NEW_SOCKETS_BG 0x2D +#define IMP_TASK_POLICY_SUP_ACTIVE 0x2E +#define IMP_TASK_POLICY_LATENCY_QOS 0x2F +#define IMP_TASK_POLICY_THROUGH_QOS 0x30 +#define IMP_TASK_POLICY_WATCHERS_BG 0x31 + +#define IMP_TASK_POLICY_SFI_MANAGED 0x34 +#define IMP_TASK_POLICY_ALL_SOCKETS_BG 0x37 + +#define IMP_TASK_POLICY_BASE_LATENCY_AND_THROUGHPUT_QOS 0x39 /* latency as value1, throughput as value2 */ +#define IMP_TASK_POLICY_OVERRIDE_LATENCY_AND_THROUGHPUT_QOS 0x3A /* latency as value1, throughput as value2 */ + +/* thread only attributes */ +#define IMP_TASK_POLICY_PIDBIND_BG 0x32 +/* unused 0x33 */ +/* reserved 0x35 */ +#define IMP_TASK_POLICY_QOS_OVERRIDE 0x36 +#define IMP_TASK_POLICY_QOS_AND_RELPRIO 0x38 /* QoS as value1, relative priority as value2 */ +#define IMP_TASK_POLICY_QOS_WORKQ_OVERRIDE 0x3B +#define IMP_TASK_POLICY_QOS_PROMOTE 0x3C +#define IMP_TASK_POLICY_QOS_KEVENT_OVERRIDE 0x3D +#define IMP_TASK_POLICY_QOS_IPC_OVERRIDE IMP_TASK_POLICY_QOS_KEVENT_OVERRIDE /* legacy name */ +#define IMP_TASK_POLICY_QOS_SERVICER_OVERRIDE 0x3E /* Codes for IMP_ASSERTION */ #define IMP_HOLD 0x2 /* Task holds a boost assertion */ @@ -998,8 +848,8 @@ extern void kdebug_reset(void); /**********************************************************************/ -#define KDBG_MIGCODE(msgid) ((DBG_MIG << KDBG_CLASS_OFFSET) | \ - (((msgid) & 0x3fffff) << KDBG_CODE_OFFSET)) +#define KDBG_MIGCODE(msgid) (((unsigned)DBG_MIG << KDBG_CLASS_OFFSET) | \ + ((unsigned)((msgid) & 0x3fffff) << KDBG_CODE_OFFSET)) #define MACHDBG_CODE(SubClass, code) KDBG_CODE(DBG_MACH, SubClass, code) #define NETDBG_CODE(SubClass, code) KDBG_CODE(DBG_NETWORK, SubClass, code) @@ -1008,6 +858,7 @@ extern void kdebug_reset(void); #define IOKDBG_CODE(SubClass, code) KDBG_CODE(DBG_IOKIT, SubClass, code) #define DRVDBG_CODE(SubClass, code) KDBG_CODE(DBG_DRIVERS, SubClass, code) #define TRACEDBG_CODE(SubClass, code) KDBG_CODE(DBG_TRACE, SubClass, code) +#define SILICONDBG_CODE(SubClass, code) KDBG_CODE(DBG_SILICON, SubClass, code) #define MISCDBG_CODE(SubClass, code) KDBG_CODE(DBG_MISC, SubClass, code) #define DLILDBG_CODE(SubClass, code) KDBG_CODE(DBG_DLIL, SubClass, code) #define SECURITYDBG_CODE(SubClass, code) KDBG_CODE(DBG_SECURITY, SubClass, code) @@ -1029,7 +880,6 @@ extern void kdebug_reset(void); #define PMAP_CODE(code) MACHDBG_CODE(DBG_MACH_PMAP, code) - #define IMPORTANCE_CODE(SubClass, code) KDBG_CODE(DBG_IMPORTANCE, (SubClass), (code)) #define BANK_CODE(SubClass, code) KDBG_CODE(DBG_BANK, (SubClass), (code)) #define ATM_CODE(SubClass, code) KDBG_CODE(DBG_ATM, (SubClass), (code)) @@ -1039,824 +889,20 @@ extern void kdebug_reset(void); #define COREDUETDBG_CODE(code) DAEMONDBG_CODE(DBG_DAEMON_COREDUET, code) #define POWERDDBG_CODE(code) DAEMONDBG_CODE(DBG_DAEMON_POWERD, code) -/* - * To use kdebug in the kernel: - * - * #include - * - * #define DBG_NETIPINIT NETDBG_CODE(DBG_NETIP, 1) - * - * void - * ip_init(void) - * { - * KDBG(DBG_NETIPINIT | DBG_FUNC_START, 1, 2, 3, 4); - * ... - * KDBG(DBG_NETIPINIT); - * ... - * KDBG(DBG_NETIPINIT | DBG_FUNC_END); - * } - */ - -#ifdef KERNEL_PRIVATE - -/* - * The KDBG{,_DEBUG,_RELEASE,_FILTERED} macros are the preferred method of - * making tracepoints. - * - * Kernel pointers must be unslid or permuted using VM_KERNEL_UNSLIDE_OR_PERM. - * Do not trace any sensitive data. - */ - -/* - * Traced on debug and development (and release macOS) kernels. - */ -#define KDBG(x, ...) KDBG_(, x, ## __VA_ARGS__, 4, 3, 2, 1, 0) - -/* - * Traced on debug and development (and release macOS) kernels if explicitly - * requested. Omitted from tracing without a typefilter. - */ -#define KDBG_FILTERED(x, ...) KDBG_(_FILTERED, x, ## __VA_ARGS__, 4, 3, 2, 1, 0) - -/* - * Traced on debug and development (and release macOS) kernels, even if the - * process filter would reject it. - */ -#define KDBG_RELEASE_NOPROCFILT(x, ...) \ - KDBG_(_RELEASE_NOPROCFILT, x, ## __VA_ARGS__, 4, 3, 2, 1, 0) - -/* - * Traced on debug, development, and release kernels. - * - * Only use this tracepoint if the events are required for a shipping trace - * tool. - */ -#define KDBG_RELEASE(x, ...) KDBG_(_RELEASE, x, ## __VA_ARGS__, 4, 3, 2, 1, 0) - -/* - * Traced only on debug kernels. - */ -#define KDBG_DEBUG(x, ...) KDBG_(_DEBUG, x, ## __VA_ARGS__, 4, 3, 2, 1, 0) - -#define KDBG_(f, x, a, b, c, d, n, ...) KDBG##n(f, x, a, b, c, d) -#define KDBG0(f, x, a, b, c, d) KERNEL_DEBUG_CONSTANT##f(x, 0, 0, 0, 0, 0) -#define KDBG1(f, x, a, b, c, d) KERNEL_DEBUG_CONSTANT##f(x, a, 0, 0, 0, 0) -#define KDBG2(f, x, a, b, c, d) KERNEL_DEBUG_CONSTANT##f(x, a, b, 0, 0, 0) -#define KDBG3(f, x, a, b, c, d) KERNEL_DEBUG_CONSTANT##f(x, a, b, c, 0, 0) -#define KDBG4(f, x, a, b, c, d) KERNEL_DEBUG_CONSTANT##f(x, a, b, c, d, 0) - -#endif /* defined(KERNEL_PRIVATE) */ - -extern unsigned int kdebug_enable; - -/* - * Bits used by kdebug_enable. These control which events are traced at - * runtime. - */ -#define KDEBUG_ENABLE_TRACE (1U << 0) -#define KDEBUG_ENABLE_ENTROPY (1U << 1) /* obsolete */ -#define KDEBUG_ENABLE_CHUD (1U << 2) /* obsolete */ -#define KDEBUG_ENABLE_PPT (1U << 3) -#define KDEBUG_ENABLE_SERIAL (1U << 4) - -#define KDEBUG_TRACE (KDEBUG_ENABLE_TRACE) - -/* - * Specify KDEBUG_PPT to indicate that the event belongs to the limited PPT set. - * PPT is deprecated -- use a typefilter and the PPTDBG class instead. - */ -#define KDEBUG_PPT (KDEBUG_ENABLE_PPT) -#define KDEBUG_COMMON (KDEBUG_ENABLE_TRACE | KDEBUG_ENABLE_PPT) - -/* - * The kernel debug configuration level. These values control which events are - * compiled in under different build configurations. - * - * Infer the supported kernel debug event level from config option. Use - * (KDEBUG_LEVEL >= KDEBUG_LEVEL_STANDARD) as a guard to protect unaudited debug - * code. - */ -#define KDEBUG_LEVEL_NONE 0 -#define KDEBUG_LEVEL_IST 1 -#define KDEBUG_LEVEL_STANDARD 2 -#define KDEBUG_LEVEL_FULL 3 - -#if NO_KDEBUG -#define KDEBUG_LEVEL KDEBUG_LEVEL_NONE -#elif IST_KDEBUG -#define KDEBUG_LEVEL KDEBUG_LEVEL_IST -// currently configured for the iOS release kernel -#elif KDEBUG -#define KDEBUG_LEVEL KDEBUG_LEVEL_FULL -#else -#define KDEBUG_LEVEL KDEBUG_LEVEL_STANDARD -/* - * Currently, all other kernel configurations (development, etc) build with - * KDEBUG_LEVEL_STANDARD. As a result, KERNEL_DEBUG_CONSTANT*() are on by - * default but KERNEL_DEBUG*() are not. - */ -#endif - -#ifdef XNU_KERNEL_PRIVATE -#define KDBG_IMPROBABLE __improbable -#else -#define KDBG_IMPROBABLE -#endif - -/* - * KERNEL_DEBUG_CONSTANT_FILTERED events are omitted from tracing unless they - * are explicitly requested in the typefilter. They are not emitted when - * tracing without a typefilter. - */ -#if (KDEBUG_LEVEL >= KDEBUG_LEVEL_STANDARD) -#define KERNEL_DEBUG_CONSTANT_FILTERED(x, a, b, c, d, ...) \ - do { \ - if (KDBG_IMPROBABLE(kdebug_enable & ~KDEBUG_ENABLE_PPT)) { \ - kernel_debug_filtered((x), (uintptr_t)(a), (uintptr_t)(b), \ - (uintptr_t)(c), (uintptr_t)(d)); \ - } \ - } while (0) -#else /* (KDEBUG_LEVEL >= KDEBUG_LEVEL_STANDARD) */ -#define KERNEL_DEBUG_CONSTANT_FILTERED(type, x, a, b, c, d, ...) do {} while (0) -#endif /* (KDEBUG_LEVEL >= KDEBUG_LEVEL_STANDARD) */ - -#if (KDEBUG_LEVEL >= KDEBUG_LEVEL_IST) -#define KERNEL_DEBUG_CONSTANT_RELEASE_NOPROCFILT(x, a, b, c, d, ...) \ - do { \ - if (KDBG_IMPROBABLE(kdebug_enable & ~KDEBUG_ENABLE_PPT)) { \ - kernel_debug_flags((x), (uintptr_t)(a), (uintptr_t)(b), \ - (uintptr_t)(c), (uintptr_t)(d), KDBG_FLAG_NOPROCFILT); \ - } \ - } while (0) -#else /* (KDEBUG_LEVEL >= KDEBUG_LEVEL_IST) */ -#define KERNEL_DEBUG_CONSTANT_RELEASE_NOPROCFILT(x, a, b, c, d, ...) \ - do { } while (0) -#endif /* (KDEBUG_LEVEL >= KDEBUG_LEVEL_IST) */ - - -#if (KDEBUG_LEVEL >= KDEBUG_LEVEL_STANDARD) -#define KERNEL_DEBUG_CONSTANT(x, a, b, c, d, e) \ - do { \ - if (KDBG_IMPROBABLE(kdebug_enable & ~KDEBUG_ENABLE_PPT)) { \ - kernel_debug((x), (uintptr_t)(a), (uintptr_t)(b), (uintptr_t)(c), \ - (uintptr_t)(d),(uintptr_t)(e)); \ - } \ - } while (0) - -/* - * DO NOT USE THIS MACRO -- it breaks fundamental assumptions about ktrace and - * is only meant to be used by the pthread kext and other points in the kernel - * where the thread ID must be provided explicitly. - */ -#define KERNEL_DEBUG_CONSTANT1(x, a, b, c, d, e) \ - do { \ - if (KDBG_IMPROBABLE(kdebug_enable & ~KDEBUG_ENABLE_PPT)) { \ - kernel_debug1((x), (uintptr_t)(a), (uintptr_t)(b), (uintptr_t)(c), \ - (uintptr_t)(d), (uintptr_t)(e)); \ - } \ - } while (0) - -#define KERNEL_DEBUG_EARLY(x, a, b, c, d) \ - do { \ - kernel_debug_early((uint32_t)(x), (uintptr_t)(a), (uintptr_t)(b), \ - (uintptr_t)(c), (uintptr_t)(d)); \ - } while (0) -#else /* (KDEBUG_LEVEL >= KDEBUG_LEVEL_STANDARD) */ -#define KERNEL_DEBUG_CONSTANT(x, a, b, c, d, e) do {} while (0) -#define KERNEL_DEBUG_CONSTANT1(x, a, b, c, d, e) do {} while (0) -#define KERNEL_DEBUG_EARLY(x, a, b, c, d) do {} while (0) -#endif /* (KDEBUG_LEVEL >= KDEBUG_LEVEL_STANDARD) */ - -/* - * KERNEL_DEBUG_CONSTANT_IST (in-system trace) events provide an audited subset - * of tracepoints for userland system tracing tools. This tracing level was - * created by 8857227 to protect fairplayd and other PT_DENY_ATTACH processes. - * It has two effects: only KERNEL_DEBUG_CONSTANT_IST() traces are emitted and - * any PT_DENY_ATTACH processes will only emit basic traces as defined by the - * kernel_debug_filter() routine. - */ -#define KERNEL_DEBUG_CONSTANT_RELEASE(x, a, b, c, d, e) \ - KERNEL_DEBUG_CONSTANT_IST(~KDEBUG_ENABLE_PPT, x, a, b, c, d, 0) - -#if (KDEBUG_LEVEL >= KDEBUG_LEVEL_IST) -#define KERNEL_DEBUG_CONSTANT_IST(type, x, a, b, c, d, e) \ - do { \ - if (KDBG_IMPROBABLE(kdebug_enable & (type))) { \ - kernel_debug((x), (uintptr_t)(a), (uintptr_t)(b), (uintptr_t)(c), \ - (uintptr_t)(d), 0); \ - } \ - } while (0) -#define KERNEL_DEBUG_CONSTANT_IST1(x, a, b, c, d, e) \ - do { \ - if (KDBG_IMPROBABLE(kdebug_enable)) { \ - kernel_debug1((x), (uintptr_t)(a), (uintptr_t)(b), (uintptr_t)(c), \ - (uintptr_t)(d), (uintptr_t)(e)); \ - } \ - } while (0) -#else /* (KDEBUG_LEVEL >= KDEBUG_LEVEL_IST) */ -#define KERNEL_DEBUG_CONSTANT_IST(type, x, a, b, c, d, e) do {} while (0) -#define KERNEL_DEBUG_CONSTANT_IST1(x, a, b, c, d, e) do {} while (0) -#endif /* (KDEBUG_LEVEL >= KDEBUG_LEVEL_IST) */ - -#if NO_KDEBUG -#define __kdebug_constant_only __unused -#endif - -/* - * KERNEL_DEBUG events are only traced for DEBUG kernels. - */ -#define KERNEL_DEBUG_CONSTANT_DEBUG(x, a, b, c, d, e) \ - KERNEL_DEBUG(x, a, b, c, d, e) - -#if (KDEBUG_LEVEL >= KDEBUG_LEVEL_FULL) -#define __kdebug_only - -#define KERNEL_DEBUG(x, a, b, c, d, e) \ - do { \ - if (KDBG_IMPROBABLE(kdebug_enable & ~KDEBUG_ENABLE_PPT)) { \ - kernel_debug((uint32_t)(x), (uintptr_t)(a), (uintptr_t)(b), \ - (uintptr_t)(c), (uintptr_t)(d), (uintptr_t)(e)); \ - } \ - } while (0) - -/* - * DO NOT USE THIS MACRO -- see warning above for KERNEL_DEBUG_CONSTANT1. - */ -#define KERNEL_DEBUG1(x, a, b, c, d, e) \ - do { \ - if (KDBG_IMPROBABLE(kdebug_enable & ~KDEBUG_ENABLE_PPT)) { \ - kernel_debug1((uint32_t)(x), (uintptr_t)(a), (uintptr_t)(b), \ - (uintptr_t)(c), (uintptr_t)(d), (uintptr_t)(e)); \ - } \ - } while (0) - -#else /* (KDEBUG_LEVEL >= KDEBUG_LEVEL_FULL) */ -#define __kdebug_only __unused - -#define KERNEL_DEBUG(x, a, b, c, d, e) do {} while (0) -#define KERNEL_DEBUG1(x, a, b, c, d, e) do {} while (0) -#endif /* (KDEBUG_LEVEL >= KDEBUG_LEVEL_FULL) */ - - -extern void kernel_debug( - uint32_t debugid, - uintptr_t arg1, - uintptr_t arg2, - uintptr_t arg3, - uintptr_t arg4, - uintptr_t arg5); - -extern void kernel_debug1( - uint32_t debugid, - uintptr_t arg1, - uintptr_t arg2, - uintptr_t arg3, - uintptr_t arg4, - uintptr_t arg5); - -#define KDBG_FLAG_FILTERED 0x01 -#define KDBG_FLAG_NOPROCFILT 0x02 - -extern void kernel_debug_flags( - uint32_t debugid, - uintptr_t arg1, - uintptr_t arg2, - uintptr_t arg3, - uintptr_t arg4, - uint64_t flags); - -extern void kernel_debug_filtered( - uint32_t debugid, - uintptr_t arg1, - uintptr_t arg2, - uintptr_t arg3, - uintptr_t arg4); - -extern void kernel_debug_early( - uint32_t debugid, - uintptr_t arg1, - uintptr_t arg2, - uintptr_t arg3, - uintptr_t arg4); - -/* - * EnergyTracing macros. - */ - -#if (KDEBUG_LEVEL >= KDEBUG_LEVEL_IST) -// whether to bother calculating EnergyTracing inputs -// could change in future to see if DBG_ENERGYTRACE is active -#define ENTR_SHOULDTRACE kdebug_enable -// encode logical EnergyTracing into 32/64 KDebug trace -#define ENTR_KDTRACE(component, opcode, lifespan, id, quality, value) \ -do { \ - uint32_t kdcode__; \ - uintptr_t highval__, lowval__, mask__ = 0xffffffff; \ - kdcode__ = KDBG_CODE(DBG_ENERGYTRACE,component,opcode)|(lifespan); \ - highval__ = ((value) >> 32) & mask__; \ - lowval__ = (value) & mask__; \ - ENTR_KDTRACEFUNC(kdcode__, id, quality, highval__, lowval__); \ -} while(0) - -/* - * Trace the association of two existing activations. - * - * An association is traced as a modification to the parent activation. - * In order to fit the sub-activation's component, activation code, and - * activation ID into a kdebug tracepoint, the arguments that would hold - * the value are left separate, and one stores the component and opcode - * of the sub-activation, while the other stores the pointer-sized - * activation ID. - * - * arg2 arg3 arg4 - +-----------------+ +~+----+----+--------+ +----------+ - |kEnTrModAssociate| | | | | | | | - +-----------------+ +~+----+----+--------+ +----------+ - * 8-bits unused sub-activation ID - * 8-bit sub-component - * 16-bit sub-opcode - * - */ -#define kEnTrModAssociate (1 << 28) -#define ENTR_KDASSOCIATE(par_comp, par_opcode, par_act_id, \ - sub_comp, sub_opcode, sub_act_id) \ -do { \ - unsigned sub_compcode = ((unsigned)sub_comp << 16) | sub_opcode; \ - ENTR_KDTRACEFUNC(KDBG_CODE(DBG_ENERGYTRACE,par_comp,par_opcode), \ - par_act_id, kEnTrModAssociate, sub_compcode, \ - sub_act_id); \ -} while(0) - -#else /* (KDEBUG_LEVEL >= KDEBUG_LEVEL_IST) */ - -#define ENTR_SHOULDTRACE FALSE -#define ENTR_KDTRACE(component, opcode, lifespan, id, quality, value) \ - do {} while (0) -#define ENTR_KDASSOCIATE(par_comp, par_opcode, par_act_id, \ - sub_comp, sub_opcode, sub_act_id) \ - do {} while (0) - -#endif /* (KDEBUG_LEVEL >= KDEBUG_LEVEL_IST) */ - -#ifdef KERNEL_PRIVATE -/* - * kernel_debug_string provides the same functionality as the - * kdebug_trace_string syscall as a KPI. str_id is an in/out - * parameter that, if it's pointing to a string ID of 0, will - * receive a generated ID. If it provides a value in str_id, - * then that will be used, instead. - * - * Returns an errno indicating the type of failure. - */ -extern int -kernel_debug_string(uint32_t debugid, uint64_t *str_id, const char *str); - -/* - * kernel_debug_disable disables event logging, but leaves any buffers - * intact. - */ -extern void kernel_debug_disable(void); -#endif - -/* - * Bits set in the comm page for kdebug. - */ -#define KDEBUG_COMMPAGE_ENABLE_TRACE 0x1 -#define KDEBUG_COMMPAGE_ENABLE_TYPEFILTER 0x2 /* Forced to false if ENABLE_TRACE is 0 */ - -// for EnergyTracing user space & clients -#define kEnTrCompKernel 2 - -/* - * EnergyTracing opcodes - * - * Activations use DBG_FUNC_START/END. - * Events are DBG_FUNC_NONE. - */ - -/* Socket reads and writes are uniquely identified by the (sanitized) - * pointer to the socket struct in question. To associate this address - * with the user space file descriptor, we have a socket activation with - * the FD as its identifier and the socket struct pointer as its value. - */ -#define kEnTrActKernSocket 1 -#define kEnTrActKernSockRead 2 -#define kEnTrActKernSockWrite 3 - -#define kEnTrActKernPoll 10 -#define kEnTrActKernSelect 11 -#define kEnTrActKernKQWait 12 - -// events -#define kEnTrEvUnblocked 256 - -// EnergyTracing flags (the low-order 16 bits of 'quality') -#define kEnTrFlagNonBlocking 1 << 0 -#define kEnTrFlagNoWork 1 << 1 - -// and now the internal mechanism -#ifdef KERNEL_PRIVATE - -// 20452597 requests that the trace macros not take an argument it throws away -#define KERNEL_DBG_IST_SANE(x, a, b, c, d) \ - KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, x, a, b, c, d, \ - 0 /*__unused in kernel_debug()*/ ) -#define ENTR_KDTRACEFUNC KERNEL_DBG_IST_SANE - -// value is int64_t, quality is uint32_t -#define KERNEL_ENERGYTRACE(opcode, lifespan, id, quality, value) \ - ENTR_KDTRACE(kEnTrCompKernel, opcode, lifespan, id, \ - quality, value) -#define KERNEL_ENTR_ASSOCIATE(par_opcode, par_act_id, sub_opcode, sub_act_id) \ - ENTR_KDASSOCIATE(kEnTrCompKernel, par_opcode, par_act_id, \ - kEnTrCompKernel, sub_opcode, sub_act_id) - -// end EnergyTracing - - -#include - -#define NUMPARMS 23 - -struct proc; - -/* - * Returns false if the debugid is disabled by filters, and true if the - * debugid is allowed to be traced. A debugid may not be traced if the - * typefilter disables its class and subclass, it's outside a range - * check, or if it's not an allowed debugid in a value check. Trace - * system events bypass this check. - */ -boolean_t kdebug_debugid_enabled(uint32_t debugid); - -/* - * Returns true only if the debugid is explicitly enabled by filters. Returns - * false otherwise, including when no filters are active. - */ -boolean_t kdebug_debugid_explicitly_enabled(uint32_t debugid); - -uint32_t kdebug_commpage_state(void); - -#define KDBG_VFS_LOOKUP_FLAG_LOOKUP 0x01 -#define KDBG_VFS_LOOKUP_FLAG_NOPROCFILT 0x02 -void kdebug_vfs_lookup(long *dbg_parms, int dbg_namelen, void *dp, - uint32_t flags); - -void kdebug_lookup_gen_events(long *dbg_parms, int dbg_namelen, void *dp, - boolean_t lookup); - -void kdbg_trace_data(struct proc *proc, long *arg_pid, long *arg_uniqueid); - -void kdbg_trace_string(struct proc *proc, long *arg1, long *arg2, long *arg3, long *arg4); - -void kdbg_dump_trace_to_file(const char *); -void kdebug_init(unsigned int n_events, char *filterdesc, boolean_t wrapping); -void kdebug_trace_start(unsigned int n_events, const char *filterdesc, - boolean_t wrapping, boolean_t at_wake); -void kdebug_free_early_buf(void); -struct task; -void release_storage_unit(int cpu, uint32_t storage_unit); -int allocate_storage_unit(int cpu); - -#define KDBG_CLASS_ENCODE(Class, SubClass) KDBG_EVENTID(Class, SubClass, 0) -#define KDBG_CLASS_DECODE(Debugid) (Debugid & KDBG_CSC_MASK) - -#endif /* KERNEL_PRIVATE */ -#endif /* __APPLE_API_UNSTABLE */ -__END_DECLS - -#ifdef PRIVATE -#ifdef __APPLE_API_PRIVATE -/* - * private kernel_debug definitions - */ - -/* - * Ensure that both LP32 and LP64 variants of arm64 use the same kd_buf - * structure. - */ -#if defined(__arm64__) -typedef uint64_t kd_buf_argtype; -#else -typedef uintptr_t kd_buf_argtype; -#endif - -typedef struct { - uint64_t timestamp; - kd_buf_argtype arg1; - kd_buf_argtype arg2; - kd_buf_argtype arg3; - kd_buf_argtype arg4; - kd_buf_argtype arg5; /* the thread ID */ - uint32_t debugid; -/* - * Ensure that both LP32 and LP64 variants of arm64 use the same kd_buf - * structure. - */ -#if defined(__LP64__) || defined(__arm64__) - uint32_t cpuid; - kd_buf_argtype unused; -#endif -} kd_buf; - -#if defined(__LP64__) || defined(__arm64__) -#define KDBG_TIMESTAMP_MASK 0xffffffffffffffffULL -static inline void -kdbg_set_cpu(kd_buf *kp, int cpu) -{ - kp->cpuid = (unsigned int)cpu; -} -static inline int -kdbg_get_cpu(kd_buf *kp) -{ - return (int)kp->cpuid; -} -static inline void -kdbg_set_timestamp(kd_buf *kp, uint64_t thetime) -{ - kp->timestamp = thetime; -} -static inline uint64_t -kdbg_get_timestamp(kd_buf *kp) -{ - return kp->timestamp; -} -static inline void -kdbg_set_timestamp_and_cpu(kd_buf *kp, uint64_t thetime, int cpu) -{ - kdbg_set_timestamp(kp, thetime); - kdbg_set_cpu(kp, cpu); -} -#else -#define KDBG_TIMESTAMP_MASK 0x00ffffffffffffffULL -#define KDBG_CPU_MASK 0xff00000000000000ULL -#define KDBG_CPU_SHIFT 56 -static inline void -kdbg_set_cpu(kd_buf *kp, int cpu) -{ - kp->timestamp = (kp->timestamp & KDBG_TIMESTAMP_MASK) | - (((uint64_t) cpu) << KDBG_CPU_SHIFT); -} -static inline int -kdbg_get_cpu(kd_buf *kp) -{ - return (int) (((kp)->timestamp & KDBG_CPU_MASK) >> KDBG_CPU_SHIFT); -} -static inline void -kdbg_set_timestamp(kd_buf *kp, uint64_t thetime) -{ - kp->timestamp = thetime & KDBG_TIMESTAMP_MASK; -} -static inline uint64_t -kdbg_get_timestamp(kd_buf *kp) -{ - return kp->timestamp & KDBG_TIMESTAMP_MASK; -} -static inline void -kdbg_set_timestamp_and_cpu(kd_buf *kp, uint64_t thetime, int cpu) -{ - kp->timestamp = (thetime & KDBG_TIMESTAMP_MASK) | - (((uint64_t) cpu) << KDBG_CPU_SHIFT); -} -#endif - -/* - * 2^16 bits (8 kilobytes), one for each possible class/subclass combination - */ -#define KDBG_TYPEFILTER_BITMAP_SIZE ((256 * 256) / 8) - -/* - * Bits for kd_ctrl_page.flags, KERN_KD{D,E}FLAGS. - */ -#define KDBG_INIT (1U << 0) /* obsolete */ -/* disable tracing when buffers are full */ -#define KDBG_NOWRAP (1U << 1) -#define KDBG_FREERUN (1U << 2) /* obsolete */ -/* buffer has wrapped */ -#define KDBG_WRAPPED (1U << 3) -/* flags that are allowed to be set by user space */ -#define KDBG_USERFLAGS (KDBG_FREERUN | KDBG_NOWRAP | KDBG_INIT) -/* only include processes with kdebug bit set in proc */ -#define KDBG_PIDCHECK (1U << 4) -/* thread map is initialized */ -#define KDBG_MAPINIT (1U << 5) -/* exclude processes based on kdebug bit in proc */ -#define KDBG_PIDEXCLUDE (1U << 6) -/* whether the kdebug locks are intialized */ -#define KDBG_LOCKINIT (1U << 7) -/* word size of the kernel */ -#define KDBG_LP64 (1U << 8) - -/* bits for kd_ctrl_page.flags and kbufinfo_t.flags */ - -/* only trace events within a range */ -#define KDBG_RANGECHECK 0x00100000U -/* only trace at most 4 types of events, at the code granularity */ -#define KDBG_VALCHECK 0x00200000U -/* check class and subclass against the typefilter */ -#define KDBG_TYPEFILTER_CHECK 0x00400000U -/* kdebug trace buffers are initialized */ -#define KDBG_BUFINIT 0x80000000U - -/* bits for the type field of kd_regtype */ -#define KDBG_CLASSTYPE 0x10000 -#define KDBG_SUBCLSTYPE 0x20000 -#define KDBG_RANGETYPE 0x40000 -#define KDBG_TYPENONE 0x80000 -#define KDBG_CKTYPES 0xF0000 - -typedef struct { - unsigned int type; - unsigned int value1; - unsigned int value2; - unsigned int value3; - unsigned int value4; -} kd_regtype; - -typedef struct { - /* number of events that can fit in the buffers */ - int nkdbufs; - /* set if trace is disabled */ - int nolog; - /* kd_ctrl_page.flags */ - unsigned int flags; - /* number of threads in thread map */ - int nkdthreads; - /* the owning pid */ - int bufid; -} kbufinfo_t; - -typedef struct { - /* the thread ID */ -#if defined(__arm64__) - uint64_t thread; -#else - uintptr_t thread; -#endif - /* 0 for invalid, otherwise the PID (or 1 for kernel_task) */ - int valid; - /* the name of the process owning the thread */ - char command[20]; -} kd_threadmap; - -typedef struct { - uint32_t version_no; - uint32_t cpu_count; -} kd_cpumap_header; - -/* cpumap flags */ -#define KDBG_CPUMAP_IS_IOP 0x1 - -typedef struct { - uint32_t cpu_id; - uint32_t flags; - char name[8]; -} kd_cpumap; - -/* - * TRACE file formats... - * - * RAW_VERSION0 - * - * uint32_t #threadmaps - * kd_threadmap[] - * kd_buf[] - * - * RAW_VERSION1 - * - * RAW_header, with version_no set to RAW_VERSION1 - * kd_threadmap[] - * Empty space to pad alignment to the nearest page boundary. - * kd_buf[] - * - * RAW_VERSION1+ - * - * RAW_header, with version_no set to RAW_VERSION1 - * kd_threadmap[] - * kd_cpumap_header, with version_no set to RAW_VERSION1 - * kd_cpumap[] - * Empty space to pad alignment to the nearest page boundary. - * kd_buf[] - * - * V1+ implementation details... - * - * It would have been nice to add the cpumap data "correctly", but there were - * several obstacles. Existing code attempts to parse both V1 and V0 files. - * Due to the fact that V0 has no versioning or header, the test looks like - * this: - * - * // Read header - * if (header.version_no != RAW_VERSION1) { // Assume V0 } - * - * If we add a VERSION2 file format, all existing code is going to treat that - * as a VERSION0 file when reading it, and crash terribly when trying to read - * RAW_VERSION2 threadmap entries. - * - * To differentiate between a V1 and V1+ file, read as V1 until you reach - * the padding bytes. Then: - * - * boolean_t is_v1plus = FALSE; - * if (padding_bytes >= sizeof(kd_cpumap_header)) { - * kd_cpumap_header header = // read header; - * if (header.version_no == RAW_VERSION1) { - * is_v1plus = TRUE; - * } - * } - * - */ - -typedef struct { - int version_no; - int thread_count; - uint64_t TOD_secs; - uint32_t TOD_usecs; -} RAW_header; - -// Version 3 header -// The header chunk has the tag 0x00001000 which also serves as a magic word -// that identifies the file as a version 3 trace file. The header payload is -// a set of fixed fields followed by a variable number of sub-chunks: -/* - * ____________________________________________________________________________ - | Offset | Size | Field | - | ---------------------------------------------------------------------------- - | 0 | 4 | Tag (0x00001000) | - | 4 | 4 | Sub-tag. Represents the version of the header. | - | 8 | 8 | Length of header payload (40+8x) | - | 16 | 8 | Time base info. Two 32-bit numbers, numer/denom, | - | | | for converting timestamps to nanoseconds. | - | 24 | 8 | Timestamp of trace start. | - | 32 | 8 | Wall time seconds since Unix epoch. | - | | | As returned by gettimeofday(). | - | 40 | 4 | Wall time microseconds. As returned by gettimeofday(). | - | 44 | 4 | Local time zone offset in minutes. ( " ) | - | 48 | 4 | Type of daylight savings time correction to apply. ( " ) | - | 52 | 4 | Flags. 1 = 64-bit. Remaining bits should be written | - | | | as 0 and ignored when reading. | - | 56 | 8x | Variable number of sub-chunks. None are required. | - | | | Ignore unknown chunks. | - | ---------------------------------------------------------------------------- - */ -// NOTE: The header sub-chunks are considered part of the header chunk, -// so they must be included in the header chunk’s length field. -// The CPU map is an optional sub-chunk of the header chunk. It provides -// information about the CPUs that are referenced from the trace events. -typedef struct { - uint32_t tag; - uint32_t sub_tag; - uint64_t length; - uint32_t timebase_numer; - uint32_t timebase_denom; - uint64_t timestamp; - uint64_t walltime_secs; - uint32_t walltime_usecs; - uint32_t timezone_minuteswest; - uint32_t timezone_dst; - uint32_t flags; -} __attribute__((packed)) kd_header_v3; - -typedef struct { - uint32_t tag; - uint32_t sub_tag; - uint64_t length; -} __attribute__((packed)) kd_chunk_header_v3; - -#define RAW_VERSION0 0x55aa0000 -#define RAW_VERSION1 0x55aa0101 -#define RAW_VERSION2 0x55aa0200 /* Only used by kperf and Instruments */ -#define RAW_VERSION3 0x00001000 - -#define V3_CONFIG 0x00001b00 -#define V3_CPU_MAP 0x00001c00 -#define V3_THREAD_MAP 0x00001d00 -#define V3_RAW_EVENTS 0x00001e00 -#define V3_NULL_CHUNK 0x00002000 - -// The current version of all kernel managed chunks is 1. The -// V3_CURRENT_CHUNK_VERSION is added to ease the simple case -// when most/all the kernel managed chunks have the same version. - -#define V3_CURRENT_CHUNK_VERSION 1 -#define V3_HEADER_VERSION V3_CURRENT_CHUNK_VERSION -#define V3_CPUMAP_VERSION V3_CURRENT_CHUNK_VERSION -#define V3_THRMAP_VERSION V3_CURRENT_CHUNK_VERSION -#define V3_EVENT_DATA_VERSION V3_CURRENT_CHUNK_VERSION - -// Apis to support writing v3 chunks in the kernel -int kdbg_write_v3_chunk_header_to_buffer(void *buffer, uint32_t tag, uint32_t sub_tag, uint64_t length); -int kdbg_write_v3_chunk_to_fd(uint32_t tag, uint32_t sub_tag, uint64_t length, void *payload, uint64_t payload_size, int fd); - /* VFS lookup events for serial traces */ #define VFS_LOOKUP (FSDBG_CODE(DBG_FSRW,36)) #define VFS_LOOKUP_DONE (FSDBG_CODE(DBG_FSRW,39)) -#if !CONFIG_EMBEDDED -#if defined(XNU_KERNEL_PRIVATE) && (DEVELOPMENT || DEBUG) -#define KDEBUG_MOJO_TRACE 1 -#endif -#endif +#endif /* __APPLE_API_UNSTABLE */ -#endif /* __APPLE_API_PRIVATE */ -#endif /* PRIVATE */ +__END_DECLS + +#if defined(__has_include) && __has_include() +#include +#endif /* __has_include() */ + +#ifdef KERNEL +#include +#endif /* defined(KERNEL) */ -#endif /* !BSD_SYS_KDEBUG_H */ +#endif /* !defined(BSD_SYS_KDEBUG_H) */ diff --git a/bsd/sys/kdebug_kernel.h b/bsd/sys/kdebug_kernel.h new file mode 100644 index 000000000..c366472b1 --- /dev/null +++ b/bsd/sys/kdebug_kernel.h @@ -0,0 +1,458 @@ +/* + * Copyright (c) 2000-2018 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#ifndef BSD_SYS_KDEBUG_KERNEL_H +#define BSD_SYS_KDEBUG_KERNEL_H + +#include +#include +#include +#include +#include + +__BEGIN_DECLS + +#ifdef KERNEL + +/* + * To use kdebug in the kernel: + * + * #include + * + * #define DBG_NETIPINIT NETDBG_CODE(DBG_NETIP, 1) + * + * void + * ip_init(void) + * { + * KDBG(DBG_NETIPINIT | DBG_FUNC_START, 1, 2, 3, 4); + * ... + * KDBG(DBG_NETIPINIT); + * ... + * KDBG(DBG_NETIPINIT | DBG_FUNC_END); + * } + */ + +#pragma mark - kernel tracepoints + +/* + * The KDBG{,_DEBUG,_RELEASE,_FILTERED} macros are the preferred method of + * making tracepoints. + * + * Kernel pointers must be unslid or permuted using VM_KERNEL_UNSLIDE_OR_PERM. + * Do not trace any sensitive data. + */ + +/* + * Traced on debug and development (and release macOS) kernels. + */ +#define KDBG(x, ...) KDBG_(, x, ## __VA_ARGS__, 4, 3, 2, 1, 0) + +/* + * Traced on debug and development (and release macOS) kernels if explicitly + * requested. Omitted from tracing without a typefilter. + */ +#define KDBG_FILTERED(x, ...) KDBG_(_FILTERED, x, ## __VA_ARGS__, 4, 3, 2, 1, 0) + +#ifdef KERNEL_PRIVATE + +/* + * Traced on debug and development (and release macOS) kernels, even if the + * process filter would reject it. + */ +#define KDBG_RELEASE_NOPROCFILT(x, ...) \ + KDBG_(_RELEASE_NOPROCFILT, x, ## __VA_ARGS__, 4, 3, 2, 1, 0) + +#endif /* KERNEL_PRIVATE */ + +/* + * Traced on debug, development, and release kernels. + * + * Only use this tracepoint if the events are required for a shipping trace + * tool. + */ +#define KDBG_RELEASE(x, ...) KDBG_(_RELEASE, x, ## __VA_ARGS__, 4, 3, 2, 1, 0) + +/* + * Traced only on debug kernels. + */ +#define KDBG_DEBUG(x, ...) KDBG_(_DEBUG, x, ## __VA_ARGS__, 4, 3, 2, 1, 0) + +#pragma mark - kernel API + +#ifdef KERNEL_PRIVATE + +/* + * kernel_debug_string provides the same functionality as the + * kdebug_trace_string syscall as a KPI. str_id is an in/out + * parameter that, if it's pointing to a string ID of 0, will + * receive a generated ID. If it provides a value in str_id, + * then that will be used, instead. + * + * Returns an errno indicating the type of failure. + */ +int kernel_debug_string(uint32_t debugid, uint64_t *str_id, const char *str); + +/* + * kernel_debug_disable disables event logging, but leaves any buffers + * intact. + */ +void kernel_debug_disable(void); + +#endif /* KERNEL_PRIVATE */ + +/* + * Returns true if kdebug is using continuous time for its events, and false + * otherwise. + */ +bool kdebug_using_continuous_time(void); + +/* + * Returns true if kdebug will log an event with the provided debugid, and + * false otherwise. + */ +bool kdebug_debugid_enabled(uint32_t debugid); + +/* + * Returns true only if the debugid is explicitly enabled by filters. Returns + * false otherwise, including when no filters are active. + */ +bool kdebug_debugid_explicitly_enabled(uint32_t debugid); + +uint32_t kdebug_commpage_state(void); + +#pragma mark - IOP tracing + +/* + * Definitions to support IOP tracing. + */ + +typedef enum { + /* Trace is now enabled; no arguments. */ + KD_CALLBACK_KDEBUG_ENABLED, + /* Trace is now disabled; no arguments. */ + KD_CALLBACK_KDEBUG_DISABLED, + /* + * Request the latest entries from the IOP and block until complete; no + * arguments. + */ + KD_CALLBACK_SYNC_FLUSH, + /* + * The typefilter is enabled; a read-only pointer to the typefilter is + * provided, valid only while in the callback. + */ + KD_CALLBACK_TYPEFILTER_CHANGED, +} kd_callback_type; + +typedef void (*kd_callback_fn) (void *context, kd_callback_type reason, + void *arg); + +struct kd_callback { + kd_callback_fn func; + void *context; + /* name of IOP, NUL-terminated */ + char iop_name[8]; +}; + +typedef struct kd_callback kd_callback_t; + +/* + * Registers an IOP for participation in tracing. + * + * The registered callback function will be called with the + * supplied context as the first argument, followed by a + * kd_callback_type and an associated void* argument. + * + * The return value is a nonzero coreid that shall be used in + * kernel_debug_enter() to refer to your IOP. If the allocation + * failed, then 0 will be returned. + * + * Caveats: + * Note that not all callback calls will indicate a change in + * state (e.g. disabling trace twice would send two disable + * notifications). + */ +int kernel_debug_register_callback(kd_callback_t callback); + +void kernel_debug_enter(uint32_t coreid, uint32_t debugid, uint64_t timestamp, + uintptr_t arg1, uintptr_t arg2, uintptr_t arg3, uintptr_t arg4, + uintptr_t threadid); + +#pragma mark - internals + +#define KDBG_(f, x, a, b, c, d, n, ...) KDBG##n(f, x, a, b, c, d) +#define KDBG0(f, x, a, b, c, d) KERNEL_DEBUG_CONSTANT##f(x, 0, 0, 0, 0, 0) +#define KDBG1(f, x, a, b, c, d) KERNEL_DEBUG_CONSTANT##f(x, a, 0, 0, 0, 0) +#define KDBG2(f, x, a, b, c, d) KERNEL_DEBUG_CONSTANT##f(x, a, b, 0, 0, 0) +#define KDBG3(f, x, a, b, c, d) KERNEL_DEBUG_CONSTANT##f(x, a, b, c, 0, 0) +#define KDBG4(f, x, a, b, c, d) KERNEL_DEBUG_CONSTANT##f(x, a, b, c, d, 0) + +#ifdef XNU_KERNEL_PRIVATE +#define KDBG_IMPROBABLE __improbable +#else +#define KDBG_IMPROBABLE +#endif + +extern unsigned int kdebug_enable; + +/* + * The kernel debug configuration level. These values control which events are + * compiled in under different build configurations. + * + * Infer the supported kernel debug event level from config option. Use + * (KDEBUG_LEVEL >= KDEBUG_LEVEL_STANDARD) as a guard to protect unaudited debug + * code. + */ +#define KDEBUG_LEVEL_NONE 0 +#define KDEBUG_LEVEL_IST 1 +#define KDEBUG_LEVEL_STANDARD 2 +#define KDEBUG_LEVEL_FULL 3 + +#if NO_KDEBUG +#define KDEBUG_LEVEL KDEBUG_LEVEL_NONE +#elif IST_KDEBUG +#define KDEBUG_LEVEL KDEBUG_LEVEL_IST +#elif KDEBUG +#define KDEBUG_LEVEL KDEBUG_LEVEL_FULL +#else +#define KDEBUG_LEVEL KDEBUG_LEVEL_STANDARD +/* + * Currently, all other kernel configurations (development, etc) build with + * KDEBUG_LEVEL_STANDARD. + */ +#endif + +/* + * KERNEL_DEBUG_CONSTANT_FILTERED events are omitted from tracing unless they + * are explicitly requested in the typefilter. They are not emitted when + * tracing without a typefilter. + */ +#if (KDEBUG_LEVEL >= KDEBUG_LEVEL_STANDARD) +#define KERNEL_DEBUG_CONSTANT_FILTERED(x, a, b, c, d, ...) \ + do { \ + if (KDBG_IMPROBABLE(kdebug_enable & ~KDEBUG_ENABLE_PPT)) { \ + kernel_debug_filtered((x), (uintptr_t)(a), (uintptr_t)(b), \ + (uintptr_t)(c), (uintptr_t)(d)); \ + } \ + } while (0) +#else /* (KDEBUG_LEVEL >= KDEBUG_LEVEL_STANDARD) */ +#define KERNEL_DEBUG_CONSTANT_FILTERED(type, x, a, b, c, d, ...) do {} while (0) +#endif /* (KDEBUG_LEVEL >= KDEBUG_LEVEL_STANDARD) */ + +#if (KDEBUG_LEVEL >= KDEBUG_LEVEL_IST) +#define KERNEL_DEBUG_CONSTANT_RELEASE_NOPROCFILT(x, a, b, c, d, ...) \ + do { \ + if (KDBG_IMPROBABLE(kdebug_enable & ~KDEBUG_ENABLE_PPT)) { \ + kernel_debug_flags((x), (uintptr_t)(a), (uintptr_t)(b), \ + (uintptr_t)(c), (uintptr_t)(d), KDBG_FLAG_NOPROCFILT); \ + } \ + } while (0) +#else /* (KDEBUG_LEVEL >= KDEBUG_LEVEL_IST) */ +#define KERNEL_DEBUG_CONSTANT_RELEASE_NOPROCFILT(x, a, b, c, d, ...) \ + do { } while (0) +#endif /* (KDEBUG_LEVEL >= KDEBUG_LEVEL_IST) */ + + +#if (KDEBUG_LEVEL >= KDEBUG_LEVEL_STANDARD) +#define KERNEL_DEBUG_CONSTANT(x, a, b, c, d, e) \ + do { \ + if (KDBG_IMPROBABLE(kdebug_enable & ~KDEBUG_ENABLE_PPT)) { \ + kernel_debug((x), (uintptr_t)(a), (uintptr_t)(b), (uintptr_t)(c), \ + (uintptr_t)(d),(uintptr_t)(e)); \ + } \ + } while (0) + +/* + * DO NOT USE THIS MACRO -- it breaks fundamental assumptions about ktrace and + * is only meant to be used by the pthread kext and other points in the kernel + * where the thread ID must be provided explicitly. + */ +#define KERNEL_DEBUG_CONSTANT1(x, a, b, c, d, e) \ + do { \ + if (KDBG_IMPROBABLE(kdebug_enable & ~KDEBUG_ENABLE_PPT)) { \ + kernel_debug1((x), (uintptr_t)(a), (uintptr_t)(b), (uintptr_t)(c), \ + (uintptr_t)(d), (uintptr_t)(e)); \ + } \ + } while (0) + +#define KERNEL_DEBUG_EARLY(x, a, b, c, d) \ + do { \ + kernel_debug_early((uint32_t)(x), (uintptr_t)(a), (uintptr_t)(b), \ + (uintptr_t)(c), (uintptr_t)(d)); \ + } while (0) +#else /* (KDEBUG_LEVEL >= KDEBUG_LEVEL_STANDARD) */ +#define KERNEL_DEBUG_CONSTANT(x, a, b, c, d, e) do {} while (0) +#define KERNEL_DEBUG_CONSTANT1(x, a, b, c, d, e) do {} while (0) +#define KERNEL_DEBUG_EARLY(x, a, b, c, d) do {} while (0) +#endif /* (KDEBUG_LEVEL >= KDEBUG_LEVEL_STANDARD) */ + +/* + * KERNEL_DEBUG_CONSTANT_IST (in-system trace) events provide an audited subset + * of tracepoints for userland system tracing tools. This tracing level was + * created by 8857227 to protect fairplayd and other PT_DENY_ATTACH processes. + * It has two effects: only KERNEL_DEBUG_CONSTANT_IST() traces are emitted and + * any PT_DENY_ATTACH processes will only emit basic traces as defined by the + * kernel_debug_filter() routine. + */ +#define KERNEL_DEBUG_CONSTANT_RELEASE(x, a, b, c, d, e) \ + KERNEL_DEBUG_CONSTANT_IST(~KDEBUG_ENABLE_PPT, x, a, b, c, d, 0) + +#if (KDEBUG_LEVEL >= KDEBUG_LEVEL_IST) +#define KERNEL_DEBUG_CONSTANT_IST(type, x, a, b, c, d, e) \ + do { \ + if (KDBG_IMPROBABLE(kdebug_enable & (type))) { \ + kernel_debug((x), (uintptr_t)(a), (uintptr_t)(b), (uintptr_t)(c), \ + (uintptr_t)(d), 0); \ + } \ + } while (0) +#define KERNEL_DEBUG_CONSTANT_IST1(x, a, b, c, d, e) \ + do { \ + if (KDBG_IMPROBABLE(kdebug_enable)) { \ + kernel_debug1((x), (uintptr_t)(a), (uintptr_t)(b), (uintptr_t)(c), \ + (uintptr_t)(d), (uintptr_t)(e)); \ + } \ + } while (0) +#else /* (KDEBUG_LEVEL >= KDEBUG_LEVEL_IST) */ +#define KERNEL_DEBUG_CONSTANT_IST(type, x, a, b, c, d, e) do {} while (0) +#define KERNEL_DEBUG_CONSTANT_IST1(x, a, b, c, d, e) do {} while (0) +#endif /* (KDEBUG_LEVEL >= KDEBUG_LEVEL_IST) */ + +#if NO_KDEBUG +#define __kdebug_constant_only __unused +#endif + +/* + * KERNEL_DEBUG events are only traced for DEBUG kernels. + */ +#define KERNEL_DEBUG_CONSTANT_DEBUG(x, a, b, c, d, e) \ + KERNEL_DEBUG(x, a, b, c, d, e) + +#if (KDEBUG_LEVEL >= KDEBUG_LEVEL_FULL) +#define __kdebug_only + +#undef KERNEL_DEBUG +#define KERNEL_DEBUG(x, a, b, c, d, e) \ + do { \ + if (KDBG_IMPROBABLE(kdebug_enable & ~KDEBUG_ENABLE_PPT)) { \ + kernel_debug((uint32_t)(x), (uintptr_t)(a), (uintptr_t)(b), \ + (uintptr_t)(c), (uintptr_t)(d), (uintptr_t)(e)); \ + } \ + } while (0) + +/* + * DO NOT USE THIS MACRO -- see warning above for KERNEL_DEBUG_CONSTANT1. + */ +#define KERNEL_DEBUG1(x, a, b, c, d, e) \ + do { \ + if (KDBG_IMPROBABLE(kdebug_enable & ~KDEBUG_ENABLE_PPT)) { \ + kernel_debug1((uint32_t)(x), (uintptr_t)(a), (uintptr_t)(b), \ + (uintptr_t)(c), (uintptr_t)(d), (uintptr_t)(e)); \ + } \ + } while (0) + +#else /* (KDEBUG_LEVEL >= KDEBUG_LEVEL_FULL) */ +#define __kdebug_only __unused + +#undef KERNEL_DEBUG +#define KERNEL_DEBUG(x, a, b, c, d, e) do {} while (0) +#define KERNEL_DEBUG1(x, a, b, c, d, e) do {} while (0) +#endif /* (KDEBUG_LEVEL >= KDEBUG_LEVEL_FULL) */ + +void kernel_debug(uint32_t debugid, uintptr_t arg1, uintptr_t arg2, + uintptr_t arg3, uintptr_t arg4, uintptr_t arg5); + +void kernel_debug1(uint32_t debugid, uintptr_t arg1, uintptr_t arg2, + uintptr_t arg3, uintptr_t arg4, uintptr_t arg5); + +#define KDBG_FLAG_FILTERED 0x01 +#define KDBG_FLAG_NOPROCFILT 0x02 + +void kernel_debug_flags(uint32_t debugid, uintptr_t arg1, uintptr_t arg2, + uintptr_t arg3, uintptr_t arg4, uint64_t flags); + +void kernel_debug_filtered(uint32_t debugid, uintptr_t arg1, uintptr_t arg2, + uintptr_t arg3, uintptr_t arg4); + +#pragma mark - xnu API + +#ifdef XNU_KERNEL_PRIVATE +/* Used in early boot to log events. */ +void kernel_debug_early(uint32_t debugid, uintptr_t arg1, uintptr_t arg2, + uintptr_t arg3, uintptr_t arg4); +/* Used in early boot to log strings spanning only a single tracepoint. */ +void kernel_debug_string_early(const char *message); +/* Used to trace strings within kdebug tracepoints on arbitrary eventids. */ +void kernel_debug_string_simple(uint32_t eventid, const char *str); +/* Only used by ktrace to reset kdebug. ktrace_lock must be held. */ +extern void kdebug_reset(void); + +void kdbg_dump_trace_to_file(const char *); +void kdebug_init(unsigned int n_events, char *filterdesc, bool wrapping); +void kdebug_trace_start(unsigned int n_events, const char *filterdesc, + bool wrapping, bool at_wake); +void kdebug_free_early_buf(void); +void release_storage_unit(int cpu, uint32_t storage_unit); +bool allocate_storage_unit(int cpu); + +struct proc; +void kdbg_trace_data(struct proc *proc, long *arg_pid, long *arg_uniqueid); +void kdbg_trace_string(struct proc *proc, long *arg1, long *arg2, long *arg3, + long *arg4); + +#define KDBG_VFS_LOOKUP_FLAG_LOOKUP 0x01 +#define KDBG_VFS_LOOKUP_FLAG_NOPROCFILT 0x02 +void kdebug_vfs_lookup(long *dbg_parms, int dbg_namelen, void *dp, + uint32_t flags); + +#endif /* XNU_KERNEL_PRIVATE */ + +#ifdef KERNEL_PRIVATE + +#define NUMPARMS 23 +void kdebug_lookup_gen_events(long *dbg_parms, int dbg_namelen, void *dp, + bool lookup); + +#pragma mark - EnergyTracing + +#define KERNEL_DBG_IST_SANE KDBG_RELEASE +#define ENTR_KDTRACEFUNC KDBG_RELEASE + +// value is int64_t, quality is uint32_t +#define KERNEL_ENERGYTRACE(opcode, lifespan, id, quality, value) \ + ENTR_KDTRACE(kEnTrCompKernel, opcode, lifespan, id, \ + quality, value) +#define KERNEL_ENTR_ASSOCIATE(par_opcode, par_act_id, sub_opcode, sub_act_id) \ + ENTR_KDASSOCIATE(kEnTrCompKernel, par_opcode, par_act_id, \ + kEnTrCompKernel, sub_opcode, sub_act_id) + +#endif /* KERNEL_PRIVATE */ + +#endif /* KERNEL */ + +__END_DECLS + +#endif /* !defined(BSD_SYS_KDEBUG_KERNEL_H) */ diff --git a/bsd/sys/kdebug_signpost.h b/bsd/sys/kdebug_signpost.h index 7db2d075f..250b60abf 100644 --- a/bsd/sys/kdebug_signpost.h +++ b/bsd/sys/kdebug_signpost.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016 Apple Inc. All rights reserved. + * Copyright (c) 2016-2019 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -38,35 +38,24 @@ __BEGIN_DECLS #ifndef KERNEL /* - * In previous versions of the operating system, applications could use: - * - * syscall(SYS_kdebug_trace, APPSDBG_CODE(DBG_MACH_CHUD, ) | DBG_FUNC_, arg1, arg2, arg3, arg4); - * - * to record events that would be displayed by Instruments. - * - * syscall(2) is now deprecated and this interface replaces the above call as follows: - * - * The code argument is . Only the low 14-bits of the code are - * preserved. + * kdebug_signpost(2) is deprecated. Use the os_signpost(3) family of tracing + * functions, instead. */ -/* - * When is NONE, use kdebug_signpost. - */ -int kdebug_signpost(uint32_t code, uintptr_t arg1, uintptr_t arg2, uintptr_t arg3, uintptr_t arg4) -__OSX_AVAILABLE(10.12) __IOS_AVAILABLE(10.0) __WATCHOS_AVAILABLE(3.0) __TVOS_AVAILABLE(10.0); +int kdebug_signpost(uint32_t code, uintptr_t arg1, uintptr_t arg2, + uintptr_t arg3, uintptr_t arg4) +__API_DEPRECATED_WITH_REPLACEMENT("os_signpost_event_emit", + macos(10.12, 10.15), ios(10.0, 13.0), watchos(3.0, 6.0), tvos(10.0, 13.0)); -/* - * When is START, use kdebug_signpost_start. - */ -int kdebug_signpost_start(uint32_t code, uintptr_t arg1, uintptr_t arg2, uintptr_t arg3, uintptr_t arg4) -__OSX_AVAILABLE(10.12) __IOS_AVAILABLE(10.0) __WATCHOS_AVAILABLE(3.0) __TVOS_AVAILABLE(10.0); +int kdebug_signpost_start(uint32_t code, uintptr_t arg1, uintptr_t arg2, + uintptr_t arg3, uintptr_t arg4) +__API_DEPRECATED_WITH_REPLACEMENT("os_signpost_interval_begin", + macos(10.12, 10.15), ios(10.0, 13.0), watchos(3.0, 6.0), tvos(10.0, 13.0)); -/* - * When is END, use kdebug_signpost_end. - */ -int kdebug_signpost_end(uint32_t code, uintptr_t arg1, uintptr_t arg2, uintptr_t arg3, uintptr_t arg4) -__OSX_AVAILABLE(10.12) __IOS_AVAILABLE(10.0) __WATCHOS_AVAILABLE(3.0) __TVOS_AVAILABLE(10.0); +int kdebug_signpost_end(uint32_t code, uintptr_t arg1, uintptr_t arg2, + uintptr_t arg3, uintptr_t arg4) +__API_DEPRECATED_WITH_REPLACEMENT("os_signpost_interval_end", + macos(10.12, 10.15), ios(10.0, 13.0), watchos(3.0, 6.0), tvos(10.0, 13.0)); #endif /* !KERNEL */ diff --git a/bsd/sys/kern_memorystatus.h b/bsd/sys/kern_memorystatus.h index 0ef5a8132..8ea2bce39 100644 --- a/bsd/sys/kern_memorystatus.h +++ b/bsd/sys/kern_memorystatus.h @@ -29,11 +29,9 @@ #ifndef SYS_MEMORYSTATUS_H #define SYS_MEMORYSTATUS_H -#include #include -#include -#include #include +#include #define MEMORYSTATUS_ENTITLEMENT "com.apple.private.memorystatus" @@ -55,6 +53,7 @@ #define JETSAM_PRIORITY_FOREGROUND 10 #define JETSAM_PRIORITY_AUDIO_AND_ACCESSORY 12 #define JETSAM_PRIORITY_CONDUCTOR 13 +#define JETSAM_PRIORITY_DRIVER_APPLE 15 #define JETSAM_PRIORITY_HOME 16 #define JETSAM_PRIORITY_EXECUTIVE 17 #define JETSAM_PRIORITY_IMPORTANT 18 @@ -69,6 +68,21 @@ /* Compatibility */ #define DEFAULT_JETSAM_PRIORITY 18 +/* + * The deferral time used by default for apps and daemons in all aging + * policies except kJetsamAgingPolicySysProcsReclaimedFirst is + * DEFERRED_IDLE_EXIT_TIME_SECS. + * + * For kJetsamAgingPolicySysProcsReclaimedFirst, + * + * Daemons: The actual idle deferred time for the daemon is based on + * the relaunch behavior of the daemon. The relaunch behavior determines + * the scaling factor applied to DEFERRED_IDLE_EXIT_TIME_SECS. See + * kJetsamSysProcsIdleDelayTime* ratios defined in kern_memorystatus.c + * + * Apps: The apps are aged for DEFERRED_IDLE_EXIT_TIME_SECS factored + * by kJetsamAppsIdleDelayTimeRatio. + */ #define DEFERRED_IDLE_EXIT_TIME_SECS 10 #define KEV_MEMORYSTATUS_SUBCLASS 3 @@ -185,15 +199,28 @@ typedef struct jetsam_snapshot { memorystatus_jetsam_snapshot_entry_t entries[]; } memorystatus_jetsam_snapshot_t; -typedef struct memorystatus_freeze_entry { - int32_t pid; - uint32_t flags; - uint32_t pages; -} memorystatus_freeze_entry_t; - /* TODO - deprecate; see */ #define kMaxSnapshotEntries 192 +/* + * default jetsam snapshot support + */ +extern memorystatus_jetsam_snapshot_t *memorystatus_jetsam_snapshot; +extern memorystatus_jetsam_snapshot_t *memorystatus_jetsam_snapshot_copy; +extern unsigned int memorystatus_jetsam_snapshot_count; +extern unsigned int memorystatus_jetsam_snapshot_copy_count; +extern unsigned int memorystatus_jetsam_snapshot_max; +extern unsigned int memorystatus_jetsam_snapshot_size; +extern uint64_t memorystatus_jetsam_snapshot_last_timestamp; +extern uint64_t memorystatus_jetsam_snapshot_timeout; +#define memorystatus_jetsam_snapshot_list memorystatus_jetsam_snapshot->entries +#define JETSAM_SNAPSHOT_TIMEOUT_SECS 30 + +/* General memorystatus stuff */ + +extern uint64_t memorystatus_sysprocs_idle_delay_time; +extern uint64_t memorystatus_apps_idle_delay_time; + /* State */ #define kMemorystatusSuspended 0x01 #define kMemorystatusFrozen 0x02 @@ -201,6 +228,7 @@ typedef struct memorystatus_freeze_entry { #define kMemorystatusTracked 0x08 #define kMemorystatusSupportsIdleExit 0x10 #define kMemorystatusDirty 0x20 +#define kMemorystatusAssertion 0x40 /* * Jetsam exit reason definitions - related to memorystatus @@ -223,8 +251,8 @@ typedef struct memorystatus_freeze_entry { #define JETSAM_REASON_ZONE_MAP_EXHAUSTION 10 #define JETSAM_REASON_MEMORY_VMCOMPRESSOR_THRASHING 11 #define JETSAM_REASON_MEMORY_VMCOMPRESSOR_SPACE_SHORTAGE 12 - -#define JETSAM_REASON_MEMORYSTATUS_MAX JETSAM_REASON_MEMORY_VMCOMPRESSOR_SPACE_SHORTAGE +#define JETSAM_REASON_LOWSWAP 13 +#define JETSAM_REASON_MEMORYSTATUS_MAX JETSAM_REASON_LOWSWAP /* * Jetsam exit reason definitions - not related to memorystatus @@ -246,9 +274,14 @@ enum { kMemorystatusKilledZoneMapExhaustion = JETSAM_REASON_ZONE_MAP_EXHAUSTION, kMemorystatusKilledVMCompressorThrashing = JETSAM_REASON_MEMORY_VMCOMPRESSOR_THRASHING, kMemorystatusKilledVMCompressorSpaceShortage = JETSAM_REASON_MEMORY_VMCOMPRESSOR_SPACE_SHORTAGE, + kMemorystatusKilledLowSwap = JETSAM_REASON_LOWSWAP, }; -/* For backwards compatibility */ +/* + * For backwards compatibility + * Keeping these around for external users (e.g. ReportCrash, Ariadne). + * TODO: Remove once they stop using these. + */ #define kMemorystatusKilledDiagnostic kMemorystatusKilledDiskSpaceShortage #define kMemorystatusKilledVMThrashing kMemorystatusKilledVMCompressorThrashing #define JETSAM_REASON_MEMORY_VMTHRASHING JETSAM_REASON_MEMORY_VMCOMPRESSOR_THRASHING @@ -289,6 +322,10 @@ int memorystatus_control(uint32_t command, int32_t pid, uint32_t flags, void *bu #endif /* DEVELOPMENT || DEBUG */ #endif /* CONFIG_FREEZE */ +#define MEMORYSTATUS_CMD_GET_AGGRESSIVE_JETSAM_LENIENT_MODE 21 /* Query if the lenient mode for aggressive jetsam is enabled. */ + +#define MEMORYSTATUS_CMD_INCREASE_JETSAM_TASK_LIMIT 22 /* Used by DYLD to increase the jetsam active and inactive limits, when using roots */ + /* Commands that act on a group of processes */ #define MEMORYSTATUS_CMD_GRP_SET_PROPERTIES 100 @@ -362,6 +399,11 @@ typedef struct memorystatus_priority_properties { uint64_t user_data; } memorystatus_priority_properties_t; +/* + * Inform the kernel that setting the priority property is driven by assertions. + */ +#define MEMORYSTATUS_SET_PRIORITY_ASSERTION 0x1 + /* * For use with memorystatus_control: * MEMORYSTATUS_CMD_SET_MEMLIMIT_PROPERTIES @@ -374,6 +416,12 @@ typedef struct memorystatus_memlimit_properties { uint32_t memlimit_inactive_attr; } memorystatus_memlimit_properties_t; +typedef struct memorystatus_memlimit_properties2 { + memorystatus_memlimit_properties_t v1; + uint32_t memlimit_increase; /* jetsam memory limit increase (in MB) for active and inactive states */ + uint32_t memlimit_increase_bytes; /* bytes used to determine the jetsam memory limit increase, for active and inactive states */ +} memorystatus_memlimit_properties2_t; + #define MEMORYSTATUS_MEMLIMIT_ATTR_FATAL 0x1 /* if set, exceeding the memlimit is fatal */ #ifdef XNU_KERNEL_PRIVATE @@ -414,7 +462,6 @@ typedef struct memorystatus_memlimit_properties { #define P_MEMSTAT_FREEZE_IGNORE 0x00000040 /* Process was evaluated by freezer and will be ignored till the next time it goes active and does something */ #define P_MEMSTAT_PRIORITYUPDATED 0x00000080 /* Process had its jetsam priority updated */ #define P_MEMSTAT_FOREGROUND 0x00000100 /* Process is in the FG jetsam band...unused??? */ -#define P_MEMSTAT_DIAG_SUSPENDED 0x00000200 /* ...unused??? */ #define P_MEMSTAT_REFREEZE_ELIGIBLE 0x00000400 /* Process was once thawed i.e. its state was brought back from disk. It is now refreeze eligible.*/ #define P_MEMSTAT_MANAGED 0x00000800 /* Process is managed by assertiond i.e. is either application or extension */ #define P_MEMSTAT_INTERNAL 0x00001000 /* Process is a system-critical-not-be-jetsammed process i.e. launchd */ @@ -423,24 +470,73 @@ typedef struct memorystatus_memlimit_properties { #define P_MEMSTAT_MEMLIMIT_INACTIVE_FATAL 0x00008000 /* if set, exceeding limit is fatal when the process is inactive */ #define P_MEMSTAT_USE_ELEVATED_INACTIVE_BAND 0x00010000 /* if set, the process will go into this band & stay there when in the background instead * of the aging bands and/or the IDLE band. */ +#define P_MEMSTAT_PRIORITY_ASSERTION 0x00020000 /* jetsam priority is being driven by an assertion */ + + +/* + * p_memstat_relaunch_flags holds + * - relaunch behavior when jetsammed + */ +#define P_MEMSTAT_RELAUNCH_UNKNOWN 0x0 +#define P_MEMSTAT_RELAUNCH_LOW 0x1 +#define P_MEMSTAT_RELAUNCH_MED 0x2 +#define P_MEMSTAT_RELAUNCH_HIGH 0x4 + +/* + * Checking the p_memstat_state almost always requires the proc_list_lock + * because the jetsam thread could be on the other core changing the state. + * + * App -- almost always managed by a system process. Always have dirty tracking OFF. Can include extensions too. + * System Processes -- not managed by anybody. Always have dirty tracking ON. Can include extensions (here) too. + */ +#define isApp(p) ((p->p_memstat_state & P_MEMSTAT_MANAGED) || ! (p->p_memstat_dirty & P_DIRTY_TRACK)) +#define isSysProc(p) ( ! (p->p_memstat_state & P_MEMSTAT_MANAGED) || (p->p_memstat_dirty & P_DIRTY_TRACK)) + +#define MEMSTAT_BUCKET_COUNT (JETSAM_PRIORITY_MAX + 1) + +typedef struct memstat_bucket { + TAILQ_HEAD(, proc) list; + int count; + int relaunch_high_count; +} memstat_bucket_t; + +extern memstat_bucket_t memstat_bucket[MEMSTAT_BUCKET_COUNT]; + +/* + * Table that expresses the probability of a process + * being used in the next hour. + */ +typedef struct memorystatus_internal_probabilities { + char proc_name[MAXCOMLEN + 1]; + int use_probability; +} memorystatus_internal_probabilities_t; + +extern memorystatus_internal_probabilities_t *memorystatus_global_probabilities_table; +extern size_t memorystatus_global_probabilities_size; + extern void memorystatus_init(void) __attribute__((section("__TEXT, initcode"))); extern void memorystatus_init_at_boot_snapshot(void); extern int memorystatus_add(proc_t p, boolean_t locked); -extern int memorystatus_update(proc_t p, int priority, uint64_t user_data, boolean_t effective, +extern int memorystatus_update(proc_t p, int priority, uint64_t user_data, boolean_t is_assertion, boolean_t effective, boolean_t update_memlimit, int32_t memlimit_active, boolean_t memlimit_active_is_fatal, int32_t memlimit_inactive, boolean_t memlimit_inactive_is_fatal); -extern int memorystatus_remove(proc_t p, boolean_t locked); +/* Remove this process from jetsam bands for killing or freezing. + * The proc_list_lock is held by the caller. + * @param p: The process to remove. + * @return: 0 if successful. EAGAIN if the process can't be removed right now (because it's being frozen) or ESRCH. + */ +extern int memorystatus_remove(proc_t p); int memorystatus_update_inactive_jetsam_priority_band(pid_t pid, uint32_t opflags, int priority, boolean_t effective_now); - +int memorystatus_relaunch_flags_update(proc_t p, int relaunch_flags); extern int memorystatus_dirty_track(proc_t p, uint32_t pcontrol); extern int memorystatus_dirty_set(proc_t p, boolean_t self, uint32_t pcontrol); -extern int memorystatus_dirty_get(proc_t p); +extern int memorystatus_dirty_get(proc_t p, boolean_t locked); extern int memorystatus_dirty_clear(proc_t p, uint32_t pcontrol); extern int memorystatus_on_terminate(proc_t p); @@ -463,6 +559,11 @@ void memorystatus_log_exception(const int max_footprint_mb, boolean_t memlimit_i void memorystatus_on_ledger_footprint_exceeded(int warning, boolean_t memlimit_is_active, boolean_t memlimit_is_fatal); void proc_memstat_terminated(proc_t p, boolean_t set); void memorystatus_proc_flags_unsafe(void * v, boolean_t *is_dirty, boolean_t *is_dirty_tracked, boolean_t *allow_idle_exit); + +#if __arm64__ +void memorystatus_act_on_legacy_footprint_entitlement(proc_t p, boolean_t footprint_increase); +#endif /* __arm64__ */ + #endif /* CONFIG_MEMORYSTATUS */ int memorystatus_get_pressure_status_kdp(void); @@ -472,9 +573,6 @@ int memorystatus_get_pressure_status_kdp(void); typedef enum memorystatus_policy { kPolicyDefault = 0x0, kPolicyMoreFree = 0x1, - kPolicyDiagnoseAll = 0x2, - kPolicyDiagnoseFirst = 0x4, - kPolicyDiagnoseActive = (kPolicyDiagnoseAll | kPolicyDiagnoseFirst), } memorystatus_policy_t; boolean_t memorystatus_kill_on_VM_page_shortage(boolean_t async); @@ -487,48 +585,27 @@ void memorystatus_fast_jetsam_override(boolean_t enable_override); #endif /* CONFIG_JETSAM */ +/* These are very verbose printfs(), enable with + * MEMORYSTATUS_DEBUG_LOG + */ +#if MEMORYSTATUS_DEBUG_LOG +#define MEMORYSTATUS_DEBUG(cond, format, ...) \ +do { \ +if (cond) { printf(format, ##__VA_ARGS__); } \ +} while(0) +#else +#define MEMORYSTATUS_DEBUG(cond, format, ...) +#endif + boolean_t memorystatus_kill_on_zone_map_exhaustion(pid_t pid); boolean_t memorystatus_kill_on_VM_compressor_space_shortage(boolean_t async); void memorystatus_pages_update(unsigned int pages_avail); - boolean_t memorystatus_idle_exit_from_VM(void); - - -#ifdef CONFIG_FREEZE - -#define FREEZE_PAGES_MIN ( 8 * 1024 * 1024 / PAGE_SIZE) -#define FREEZE_PAGES_MAX (32 * 1024 * 1024 / PAGE_SIZE) - -#define FREEZE_SUSPENDED_THRESHOLD_DEFAULT 4 -#define FREEZE_PROCESSES_MAX 20 - -#define FREEZE_DAILY_MB_MAX_DEFAULT 1024 -#define FREEZE_DEGRADATION_BUDGET_THRESHOLD 25 //degraded perf. when the daily budget left falls below this threshold percentage - -#define MAX_FROZEN_SHARED_MB_PERCENT 10 /* max shared MB calculated as percent of system task limit. */ -#define MAX_FROZEN_PROCESS_DEMOTIONS 2 /* max demotions of frozen processes into IDLE band done daily. */ -#define MIN_THAW_DEMOTION_THRESHOLD 5 /* min # of thaws required for a process to be safe from demotion. */ -#define MIN_THAW_REFREEZE_THRESHOLD 3 /* min # of global thaws needed for us to consider refreezing these processes. */ - -typedef struct throttle_interval_t { - uint32_t mins; - uint32_t burst_multiple; - uint32_t pageouts; - uint32_t max_pageouts; - mach_timespec_t ts; -} throttle_interval_t; - -extern boolean_t memorystatus_freeze_enabled; -extern int memorystatus_freeze_wakeup; - -extern void memorystatus_freeze_init(void) __attribute__((section("__TEXT, initcode"))); -extern int memorystatus_freeze_process_sync(proc_t p); - -#if DEVELOPMENT || DEBUG -#define FREEZER_CONTROL_GET_STATUS (1) -#endif /* DEVELOPMENT || DEBUG */ - -#endif /* CONFIG_FREEZE */ +proc_t memorystatus_get_first_proc_locked(unsigned int *bucket_index, boolean_t search); +proc_t memorystatus_get_next_proc_locked(unsigned int *bucket_index, proc_t p, boolean_t search); +void memorystatus_get_task_page_counts(task_t task, uint32_t *footprint, uint32_t *max_footprint_lifetime, uint32_t *purgeable_pages); +void memorystatus_invalidate_idle_demotion_locked(proc_t p, boolean_t clean_state); +void memorystatus_update_priority_locked(proc_t p, int priority, boolean_t head_insert, boolean_t skip_demotion_check); #if VM_PRESSURE_EVENTS diff --git a/bsd/sys/kern_memorystatus_freeze.h b/bsd/sys/kern_memorystatus_freeze.h new file mode 100644 index 000000000..6c5a8b6b6 --- /dev/null +++ b/bsd/sys/kern_memorystatus_freeze.h @@ -0,0 +1,116 @@ +/* + * Copyright (c) 2006-2018 Apple Computer, Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#ifndef SYS_MEMORYSTATUS_FREEZE_H +#define SYS_MEMORYSTATUS_FREEZE_H + +#include +#include +#include +#include +#include + +typedef struct memorystatus_freeze_entry { + int32_t pid; + uint32_t flags; + uint32_t pages; +} memorystatus_freeze_entry_t; + +#ifdef XNU_KERNEL_PRIVATE + +extern unsigned long freeze_threshold_percentage; +extern unsigned int memorystatus_frozen_count; +extern unsigned int memorystatus_frozen_processes_max; +extern unsigned int memorystatus_frozen_shared_mb; +extern unsigned int memorystatus_frozen_shared_mb_max; +extern unsigned int memorystatus_freeze_shared_mb_per_process_max; /* Max. MB allowed per process to be freezer-eligible. */ +extern unsigned int memorystatus_freeze_private_shared_pages_ratio; /* Ratio of private:shared pages for a process to be freezer-eligible. */ +extern unsigned int memorystatus_suspended_count; +extern unsigned int memorystatus_thaw_count; +extern unsigned int memorystatus_refreeze_eligible_count; /* # of processes currently thawed i.e. have state on disk & in-memory */ + +void memorystatus_freeze_init(void); +extern int memorystatus_freeze_process_sync(proc_t p); + +#ifdef CONFIG_FREEZE + +#define FREEZE_PAGES_MIN ( 8 * 1024 * 1024 / PAGE_SIZE) +#define FREEZE_PAGES_MAX (max_task_footprint_mb == 0 ? INT_MAX : (max_task_footprint_mb << (20 - PAGE_SHIFT))) + +#define FREEZE_SUSPENDED_THRESHOLD_DEFAULT 4 +#define FREEZE_PROCESSES_MAX 20 + +#define FREEZE_DAILY_MB_MAX_DEFAULT 1024 +#define FREEZE_DEGRADATION_BUDGET_THRESHOLD 25 //degraded perf. when the daily budget left falls below this threshold percentage + +#define MAX_FROZEN_SHARED_MB_PERCENT 10 +#define MAX_FROZEN_PROCESS_DEMOTIONS 2 +#define MIN_THAW_DEMOTION_THRESHOLD 5 +#define MIN_THAW_REFREEZE_THRESHOLD 3 /* min # of global thaws needed for us to consider refreezing these processes. */ + +typedef struct throttle_interval_t { + uint32_t mins; + uint32_t burst_multiple; + uint32_t pageouts; + uint32_t max_pageouts; + mach_timespec_t ts; +} throttle_interval_t; + +extern boolean_t memorystatus_freeze_enabled; +extern int memorystatus_freeze_wakeup; + +/* Thresholds */ +extern unsigned int memorystatus_freeze_threshold; +extern unsigned int memorystatus_freeze_pages_min; +extern unsigned int memorystatus_freeze_pages_max; +extern unsigned int memorystatus_freeze_suspended_threshold; +extern unsigned int memorystatus_freeze_daily_mb_max; +extern uint64_t memorystatus_freeze_budget_pages_remaining; //remaining # of pages that can be frozen to disk +extern boolean_t memorystatus_freeze_degradation; //protected by the freezer mutex. Signals we are in a degraded freeze mode. + +extern unsigned int memorystatus_max_frozen_demotions_daily; +extern unsigned int memorystatus_thaw_count_demotion_threshold; + +#if DEVELOPMENT || DEBUG +#define FREEZER_CONTROL_GET_STATUS (1) +#endif /* DEVELOPMENT || DEBUG */ + +extern boolean_t memorystatus_freeze_enabled; +extern int memorystatus_freeze_wakeup; +extern int memorystatus_freeze_jetsam_band; /* the jetsam band which will contain P_MEMSTAT_FROZEN processes */ + +boolean_t memorystatus_freeze_thread_should_run(void); +int memorystatus_set_process_is_freezable(pid_t pid, boolean_t is_freezable); +int memorystatus_get_process_is_freezable(pid_t pid, int *is_freezable); +int memorystatus_freezer_control(int32_t flags, user_addr_t buffer, size_t buffer_size, int32_t *retval); + +#endif /* CONFIG_FREEZE */ + +#endif /* XNU_KERNEL_PRIVATE */ + +#endif /* SYS_MEMORYSTATUS_FREEZE_H */ diff --git a/bsd/sys/kern_memorystatus_notify.h b/bsd/sys/kern_memorystatus_notify.h new file mode 100644 index 000000000..ee6c5a014 --- /dev/null +++ b/bsd/sys/kern_memorystatus_notify.h @@ -0,0 +1,68 @@ +/* + * Copyright (c) 2006-2018 Apple Computer, Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#ifndef SYS_MEMORYSTATUS_NOTIFY_H +#define SYS_MEMORYSTATUS_NOTIFY_H + +#include +#include +#include + +#if VM_PRESSURE_EVENTS + +extern vm_pressure_level_t memorystatus_vm_pressure_level; +extern boolean_t memorystatus_hwm_candidates; + +boolean_t memorystatus_warn_process(pid_t pid, __unused boolean_t is_active, __unused boolean_t is_fatal, boolean_t exceeded); +int memorystatus_send_note(int event_code, void *data, size_t data_length); +void memorystatus_send_low_swap_note(void); +void consider_vm_pressure_events(void); + +#if CONFIG_MEMORYSTATUS + +int memorystatus_low_mem_privileged_listener(uint32_t op_flags); +int memorystatus_send_pressure_note(int pid); +boolean_t memorystatus_is_foreground_locked(proc_t p); +boolean_t memorystatus_bg_pressure_eligible(proc_t p); +void memorystatus_proc_flags_unsafe(void * v, boolean_t *is_dirty, boolean_t *is_dirty_tracked, boolean_t *allow_idle_exit); +extern void memorystatus_issue_fg_band_notify(void); + +#endif /* CONFIG_MEMORYSTATUS */ + +#if DEBUG +#define VM_PRESSURE_DEBUG(cond, format, ...) \ +do { \ +if (cond) { printf(format, ##__VA_ARGS__); } \ +} while(0) +#else +#define VM_PRESSURE_DEBUG(cond, format, ...) +#endif + +#endif /* VM_PRESSURE_EVENTS */ + +#endif /* SYS_MEMORYSTATUS_NOTIFY_H */ diff --git a/bsd/sys/kern_sysctl.h b/bsd/sys/kern_sysctl.h new file mode 100644 index 000000000..72d7c82fd --- /dev/null +++ b/bsd/sys/kern_sysctl.h @@ -0,0 +1,58 @@ +/* + * Copyright (c) 2019 Apple Computer, Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#ifndef _KERN_SYSCTL_H_ +#define _KERN_SYSCTL_H_ + +#include + +typedef struct _vm_object_query_data_ vm_object_query_data_t; +typedef struct _vm_object_query_data_ *vm_object_query_t; + +struct _vm_object_query_data_ { + vm_object_id_t object_id; + mach_vm_size_t virtual_size; + mach_vm_size_t resident_size; + mach_vm_size_t wired_size; + mach_vm_size_t reusable_size; + mach_vm_size_t compressed_size; + struct { + uint64_t vo_no_footprint : 1; /* object not included in footprint */ + uint64_t vo_ledger_tag : 3; /* object ledger tag */ + uint64_t purgable : 2; /* object "purgable" state #defines */ + }; +}; + +typedef struct _vmobject_list_output_ vmobject_list_output_data_t; +typedef struct _vmobject_list_output_ *vmobject_list_output_t; + +struct _vmobject_list_output_ { + int64_t entries; /* int64_t for alignment reasons, instead of int32_t */ + vm_object_query_data_t data[0]; +}; +#endif /* _KERN_SYSCTL_H_ */ diff --git a/bsd/sys/kernel.h b/bsd/sys/kernel.h index efb737aa7..b6ce1fc87 100644 --- a/bsd/sys/kernel.h +++ b/bsd/sys/kernel.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2004 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2019 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -76,15 +76,16 @@ #ifdef BSD_KERNEL_PRIVATE #include +#include /* Global variables for the kernel. */ /* 1.1 */ extern long hostid; extern char hostname[MAXHOSTNAMELEN]; -extern int hostnamelen; +extern lck_mtx_t hostname_lock; extern char domainname[MAXHOSTNAMELEN]; -extern int domainnamelen; +extern lck_mtx_t domainname_lock; /* 1.2 */ extern int stathz; /* statistics clock's frequency */ diff --git a/bsd/sys/kernel_types.h b/bsd/sys/kernel_types.h index f43d1f0c2..f5fbe196d 100644 --- a/bsd/sys/kernel_types.h +++ b/bsd/sys/kernel_types.h @@ -55,11 +55,13 @@ struct ucred; typedef struct ucred * ucred_t; #endif +#if defined(KERNEL) || !defined(_SYS_MOUNT_H_) /* also defined in mount.h */ struct mount; typedef struct mount * mount_t; struct vnode; typedef struct vnode * vnode_t; +#endif struct proc; typedef struct proc * proc_t; @@ -104,8 +106,10 @@ typedef struct file * file_t; #ifndef __LP64__ typedef struct ucred * ucred_t; #endif +#if defined(KERNEL) || !defined(_SYS_MOUNT_H_) /* also defined in mount.h */ typedef struct mount * mount_t; typedef struct vnode * vnode_t; +#endif typedef struct proc * proc_t; typedef struct uio * uio_t; typedef struct user_iovec * user_iovec_t; diff --git a/bsd/sys/kpi_mbuf.h b/bsd/sys/kpi_mbuf.h index 76d960422..3e1429761 100644 --- a/bsd/sys/kpi_mbuf.h +++ b/bsd/sys/kpi_mbuf.h @@ -1053,7 +1053,7 @@ extern void mbuf_inbound_modified(mbuf_t mbuf); * There are a number of operations that are performed in hardware, * such as calculating checksums. This function will perform in * software the various opterations that were scheduled to be done - * in hardware. Future operations may include IPSec processing or + * in hardware. Future operations may include IPsec processing or * vlan support. If you are redirecting a packet to a new interface * which may not have the same hardware support or encapsulating * the packet, you should call this function to force the stack to @@ -1911,6 +1911,27 @@ extern errno_t mbuf_get_flowid(mbuf_t mbuf, u_int16_t *flowid); */ extern errno_t mbuf_set_flowid(mbuf_t mbuf, u_int16_t flowid); +/*! + * @function mbuf_get_keepalive_flag + * @discussion Tell if it's a keep alive packet. + * @param mbuf The mbuf representing the packet. + * @param is_keepalive A pointer that returns the truth value. + * @result 0 upon success otherwise the errno error. If the mbuf + * packet header does not have valid data bytes, the error + * code will be EINVAL + */ +extern errno_t mbuf_get_keepalive_flag(mbuf_t mbuf, boolean_t *is_keepalive); + +/*! + * @function mbuf_set_keepalive_flag + * @discussion Set or clear the packet keep alive flag. + * @param mbuf The mbuf representing the packet. + * @param is_keepalive The boolean value. + * @result 0 upon success otherwise the errno error. If the mbuf + * packet header does not have valid data bytes, the error + * code will be EINVAL + */ +extern errno_t mbuf_set_keepalive_flag(mbuf_t mbuf, boolean_t is_keepalive); #endif /* KERNEL_PRIVATE */ diff --git a/bsd/sys/kpi_socket.h b/bsd/sys/kpi_socket.h index aa5a89f26..7eaa3367a 100644 --- a/bsd/sys/kpi_socket.h +++ b/bsd/sys/kpi_socket.h @@ -41,6 +41,13 @@ #include #include +#ifndef PRIVATE +#include +#define __NKE_API_DEPRECATED __API_DEPRECATED("Network Kernel Extension KPI is deprecated", macos(10.4, 10.15)) +#else +#define __NKE_API_DEPRECATED +#endif /* PRIVATE */ + __BEGIN_DECLS struct timeval; @@ -114,7 +121,8 @@ extern errno_t sock_accept_internal(socket_t so, struct sockaddr *from, int from (cookie), (new_so)) #else extern errno_t sock_accept(socket_t so, struct sockaddr *from, int fromlen, - int flags, sock_upcall callback, void *cookie, socket_t *new_so); + int flags, sock_upcall callback, void *cookie, socket_t *new_so) +__NKE_API_DEPRECATED; #endif /* KERNEL_PRIVATE */ /*! @@ -125,7 +133,8 @@ extern errno_t sock_accept(socket_t so, struct sockaddr *from, int fromlen, * @param to The local address the socket should be bound to. * @result 0 on success otherwise the errno error. */ -extern errno_t sock_bind(socket_t so, const struct sockaddr *to); +extern errno_t sock_bind(socket_t so, const struct sockaddr *to) +__NKE_API_DEPRECATED; /*! * @function sock_connect @@ -141,7 +150,8 @@ extern errno_t sock_bind(socket_t so, const struct sockaddr *to); * @result 0 on success, EINPROGRESS for a non-blocking connect that * has not completed, otherwise the errno error. */ -extern errno_t sock_connect(socket_t so, const struct sockaddr *to, int flags); +extern errno_t sock_connect(socket_t so, const struct sockaddr *to, int flags) +__NKE_API_DEPRECATED; #ifdef KERNEL_PRIVATE /* @@ -174,7 +184,8 @@ extern errno_t sock_connectwait(socket_t so, const struct timeval *tv); * @result 0 on success otherwise the errno error. */ extern errno_t sock_getpeername(socket_t so, struct sockaddr *peername, - int peernamelen); + int peernamelen) +__NKE_API_DEPRECATED; /*! * @function sock_getsockname @@ -186,7 +197,8 @@ extern errno_t sock_getpeername(socket_t so, struct sockaddr *peername, * @result 0 on success otherwise the errno error. */ extern errno_t sock_getsockname(socket_t so, struct sockaddr *sockname, - int socknamelen); + int socknamelen) +__NKE_API_DEPRECATED; /*! * @function sock_getsockopt @@ -199,7 +211,8 @@ extern errno_t sock_getsockname(socket_t so, struct sockaddr *sockname, * @result 0 on success otherwise the errno error. */ extern errno_t sock_getsockopt(socket_t so, int level, int optname, - void *optval, int *optlen); + void *optval, int *optlen) +__NKE_API_DEPRECATED; /*! * @function sock_ioctl @@ -209,7 +222,8 @@ extern errno_t sock_getsockopt(socket_t so, int level, int optname, * @param argp The argument. * @result 0 on success otherwise the errno error. */ -extern errno_t sock_ioctl(socket_t so, unsigned long request, void *argp); +extern errno_t sock_ioctl(socket_t so, unsigned long request, void *argp) +__NKE_API_DEPRECATED; /*! * @function sock_setsockopt @@ -222,7 +236,8 @@ extern errno_t sock_ioctl(socket_t so, unsigned long request, void *argp); * @result 0 on success otherwise the errno error. */ extern errno_t sock_setsockopt(socket_t so, int level, int optname, - const void *optval, int optlen); + const void *optval, int optlen) +__NKE_API_DEPRECATED; #ifdef KERNEL_PRIVATE /* @@ -277,7 +292,8 @@ extern errno_t sock_receive_internal(socket_t, struct msghdr *, mbuf_t *, * @param backlog The maximum length of the queue of pending connections. * @result 0 on success otherwise the errno error. */ -extern errno_t sock_listen(socket_t so, int backlog); +extern errno_t sock_listen(socket_t so, int backlog) +__NKE_API_DEPRECATED; /*! * @function sock_receive @@ -292,7 +308,8 @@ extern errno_t sock_listen(socket_t so, int backlog); * would cause the thread to block, otherwise the errno error. */ extern errno_t sock_receive(socket_t so, struct msghdr *msg, int flags, - size_t *recvdlen); + size_t *recvdlen) +__NKE_API_DEPRECATED; /*! * @function sock_receivembuf @@ -313,7 +330,8 @@ extern errno_t sock_receive(socket_t so, struct msghdr *msg, int flags, * would cause the thread to block, otherwise the errno error. */ extern errno_t sock_receivembuf(socket_t so, struct msghdr *msg, mbuf_t *data, - int flags, size_t *recvlen); + int flags, size_t *recvlen) +__NKE_API_DEPRECATED; /*! * @function sock_send @@ -328,7 +346,8 @@ extern errno_t sock_receivembuf(socket_t so, struct msghdr *msg, mbuf_t *data, * would cause the thread to block, otherwise the errno error. */ extern errno_t sock_send(socket_t so, const struct msghdr *msg, int flags, - size_t *sentlen); + size_t *sentlen) +__NKE_API_DEPRECATED; /*! * @function sock_sendmbuf @@ -345,7 +364,8 @@ extern errno_t sock_send(socket_t so, const struct msghdr *msg, int flags, * Regardless of return value, the mbuf chain 'data' will be freed. */ extern errno_t sock_sendmbuf(socket_t so, const struct msghdr *msg, mbuf_t data, - int flags, size_t *sentlen); + int flags, size_t *sentlen) +__NKE_API_DEPRECATED; /*! * @function sock_shutdown @@ -357,7 +377,8 @@ extern errno_t sock_sendmbuf(socket_t so, const struct msghdr *msg, mbuf_t data, * SHUT_RDWR - shutdown both. * @result 0 on success otherwise the errno error. */ -extern errno_t sock_shutdown(socket_t so, int how); +extern errno_t sock_shutdown(socket_t so, int how) +__NKE_API_DEPRECATED; /*! * @function sock_socket @@ -382,7 +403,8 @@ extern errno_t sock_socket_internal(int domain, int type, int protocol, (callback), (cookie), (new_so)) #else extern errno_t sock_socket(int domain, int type, int protocol, - sock_upcall callback, void *cookie, socket_t *new_so); + sock_upcall callback, void *cookie, socket_t *new_so) +__NKE_API_DEPRECATED; #endif /* KERNEL_PRIVATE */ /*! @@ -393,7 +415,8 @@ extern errno_t sock_socket(int domain, int type, int protocol, * using sock_close may leave a file descriptor pointing to the * closed socket, resulting in undefined behavior. */ -extern void sock_close(socket_t so); +extern void sock_close(socket_t so) +__NKE_API_DEPRECATED; #ifdef KERNEL_PRIVATE /* @@ -427,7 +450,8 @@ extern void sock_release(socket_t so); * @param on Indicate whether or not the SS_PRIV flag should be set. * @result 0 on success otherwise the errno error. */ -extern errno_t sock_setpriv(socket_t so, int on); +extern errno_t sock_setpriv(socket_t so, int on) +__NKE_API_DEPRECATED; /*! * @function sock_isconnected @@ -435,7 +459,8 @@ extern errno_t sock_setpriv(socket_t so, int on); * @param so The socket to check. * @result 0 - socket is not connected. 1 - socket is connected. */ -extern int sock_isconnected(socket_t so); +extern int sock_isconnected(socket_t so) +__NKE_API_DEPRECATED; /*! * @function sock_isnonblocking @@ -448,7 +473,8 @@ extern int sock_isconnected(socket_t so); * If the parameter is non-zero, the socket will not block. * @result 0 - socket will block. 1 - socket will not block. */ -extern int sock_isnonblocking(socket_t so); +extern int sock_isnonblocking(socket_t so) +__NKE_API_DEPRECATED; /*! * @function sock_gettype @@ -462,7 +488,8 @@ extern int sock_isnonblocking(socket_t so); * @param protocol The socket protocol. May be NULL. * @result 0 on success otherwise the errno error. */ -extern errno_t sock_gettype(socket_t so, int *domain, int *type, int *protocol); +extern errno_t sock_gettype(socket_t so, int *domain, int *type, int *protocol) +__NKE_API_DEPRECATED; #ifdef KERNEL_PRIVATE /* diff --git a/bsd/sys/kpi_socketfilter.h b/bsd/sys/kpi_socketfilter.h index e82a0f52f..5af14bec0 100644 --- a/bsd/sys/kpi_socketfilter.h +++ b/bsd/sys/kpi_socketfilter.h @@ -55,6 +55,13 @@ #include #include +#ifndef PRIVATE +#include +#define __NKE_API_DEPRECATED __API_DEPRECATED("Network Kernel Extension KPI is deprecated", macos(10.4, 10.15)) +#else +#define __NKE_API_DEPRECATED +#endif /* PRIVATE */ + struct sockaddr; /*! @@ -577,7 +584,8 @@ extern errno_t sflt_register_internal(const struct sflt_filter *filter, sflt_register_internal((filter), (domain), (type), (protocol)) #else extern errno_t sflt_register(const struct sflt_filter *filter, int domain, - int type, int protocol); + int type, int protocol) +__NKE_API_DEPRECATED; #endif /* KERNEL_PRIVATE */ /*! @@ -589,7 +597,8 @@ extern errno_t sflt_register(const struct sflt_filter *filter, int domain, * @param handle The sf_handle of the socket filter to unregister. * @result 0 on success otherwise the errno error. */ -extern errno_t sflt_unregister(sflt_handle handle); +extern errno_t sflt_unregister(sflt_handle handle) +__NKE_API_DEPRECATED; /*! * @function sflt_attach @@ -599,7 +608,8 @@ extern errno_t sflt_unregister(sflt_handle handle); * @param handle The handle of the registered filter to be attached. * @result 0 on success otherwise the errno error. */ -extern errno_t sflt_attach(socket_t socket, sflt_handle handle); +extern errno_t sflt_attach(socket_t socket, sflt_handle handle) +__NKE_API_DEPRECATED; /*! * @function sflt_detach @@ -608,7 +618,8 @@ extern errno_t sflt_attach(socket_t socket, sflt_handle handle); * @param handle The handle of the registered filter to be detached. * @result 0 on success otherwise the errno error. */ -extern errno_t sflt_detach(socket_t socket, sflt_handle handle); +extern errno_t sflt_detach(socket_t socket, sflt_handle handle) +__NKE_API_DEPRECATED; /* Functions for manipulating sockets */ /* @@ -635,7 +646,8 @@ extern errno_t sflt_detach(socket_t socket, sflt_handle handle); * mbuf. */ extern errno_t sock_inject_data_in(socket_t so, const struct sockaddr *from, - mbuf_t data, mbuf_t control, sflt_data_flag_t flags); + mbuf_t data, mbuf_t control, sflt_data_flag_t flags) +__NKE_API_DEPRECATED; /*! * @function sock_inject_data_out @@ -652,7 +664,8 @@ extern errno_t sock_inject_data_in(socket_t so, const struct sockaddr *from, * values are always freed regardless of return value. */ extern errno_t sock_inject_data_out(socket_t so, const struct sockaddr *to, - mbuf_t data, mbuf_t control, sflt_data_flag_t flags); + mbuf_t data, mbuf_t control, sflt_data_flag_t flags) +__NKE_API_DEPRECATED; /* @@ -672,7 +685,8 @@ typedef u_int8_t sockopt_dir; * @param sopt The socket option. * @result sock_opt_get or sock_opt_set. */ -extern sockopt_dir sockopt_direction(sockopt_t sopt); +extern sockopt_dir sockopt_direction(sockopt_t sopt) +__NKE_API_DEPRECATED; /*! * @function sockopt_level @@ -680,7 +694,8 @@ extern sockopt_dir sockopt_direction(sockopt_t sopt); * @param sopt The socket option. * @result The socket option level. See man 2 setsockopt */ -extern int sockopt_level(sockopt_t sopt); +extern int sockopt_level(sockopt_t sopt) +__NKE_API_DEPRECATED; /*! * @function sockopt_name @@ -688,7 +703,8 @@ extern int sockopt_level(sockopt_t sopt); * @param sopt The socket option. * @result The socket option name. See man 2 setsockopt */ -extern int sockopt_name(sockopt_t sopt); +extern int sockopt_name(sockopt_t sopt) +__NKE_API_DEPRECATED; /*! * @function sockopt_valsize @@ -696,7 +712,8 @@ extern int sockopt_name(sockopt_t sopt); * @param sopt The socket option. * @result The length, in bytes, of the data. */ -extern size_t sockopt_valsize(sockopt_t sopt); +extern size_t sockopt_valsize(sockopt_t sopt) +__NKE_API_DEPRECATED; /*! * @function sockopt_copyin @@ -706,7 +723,8 @@ extern size_t sockopt_valsize(sockopt_t sopt); * @param length The number of bytes to copy. * @result An errno error or zero upon success. */ -extern errno_t sockopt_copyin(sockopt_t sopt, void *data, size_t length); +extern errno_t sockopt_copyin(sockopt_t sopt, void *data, size_t length) +__NKE_API_DEPRECATED; /*! * @function sockopt_copyout @@ -716,7 +734,8 @@ extern errno_t sockopt_copyin(sockopt_t sopt, void *data, size_t length); * @param length The number of bytes to copy. * @result An errno error or zero upon success. */ -extern errno_t sockopt_copyout(sockopt_t sopt, void *data, size_t length); +extern errno_t sockopt_copyout(sockopt_t sopt, void *data, size_t length) +__NKE_API_DEPRECATED; __END_DECLS #endif /* __KPI_SOCKETFILTER__ */ diff --git a/bsd/sys/lockf.h b/bsd/sys/lockf.h index da52967a0..ab81a003e 100644 --- a/bsd/sys/lockf.h +++ b/bsd/sys/lockf.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2004-2006 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2004-2018 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -87,30 +87,23 @@ MALLOC_DECLARE(M_LOCKF); */ TAILQ_HEAD(locklist, lockf); -#pragma pack(4) - struct lockf { short lf_flags; /* Semantics: F_POSIX, F_FLOCK, F_WAIT */ - short lf_type; /* Lock type: F_RDLCK, F_WRLCK */ + short lf_type; /* Lock type: F_RDLCK, F_WRLCK */ +#if IMPORTANCE_INHERITANCE + int lf_boosted; /* Is the owner of the lock boosted */ +#endif off_t lf_start; /* Byte # of the start of the lock */ off_t lf_end; /* Byte # of the end of the lock (-1=EOF) */ caddr_t lf_id; /* Id of the resource holding the lock */ struct lockf **lf_head; /* Back pointer to the head of the locf list */ - struct vnode *lf_vnode; /* Back pointer to the inode */ + struct vnode *lf_vnode; /* Back pointer to the inode */ struct lockf *lf_next; /* Pointer to the next lock on this inode */ struct locklist lf_blkhd; /* List of requests blocked on this lock */ TAILQ_ENTRY(lockf) lf_block;/* A request waiting for a lock */ -#if IMPORTANCE_INHERITANCE - int lf_boosted; /* Is the owner of the lock boosted */ -#endif - struct proc *lf_owner; /* The proc that did the SETLK, if known */ + struct proc *lf_owner; /* The proc that did the SETLK, if known */ }; -#pragma pack() - -/* Maximum length of sleep chains to traverse to try and detect deadlock. */ -#define MAXDEPTH 50 - __BEGIN_DECLS #ifdef KERNEL_PRIVATE diff --git a/bsd/sys/lockstat.h b/bsd/sys/lockstat.h index 327b03304..35c8e30b0 100644 --- a/bsd/sys/lockstat.h +++ b/bsd/sys/lockstat.h @@ -27,8 +27,6 @@ #ifndef _SYS_LOCKSTAT_H #define _SYS_LOCKSTAT_H -/* #pragma ident "@(#)lockstat.h 1.6 05/06/08 SMI" */ - #ifdef __cplusplus extern "C" { #endif diff --git a/bsd/sys/log_data.h b/bsd/sys/log_data.h new file mode 100644 index 000000000..dde6185b3 --- /dev/null +++ b/bsd/sys/log_data.h @@ -0,0 +1,43 @@ +/* + * Copyright (c) 2019 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ +/* + * @OSF_COPYRIGHT@ + */ +/* + */ + +#ifndef _DATA_LOG_H_ +#define _DATA_LOG_H_ + +/* + * rdar://problem/48252465 + * This header should be exported only to dexts. + */ +int log_data_as_kernel(unsigned int tag, unsigned int flags, void *buffer, unsigned int size); + +#endif /* _DATA_LOG_H_ */ diff --git a/bsd/sys/make_symbol_aliasing.sh b/bsd/sys/make_symbol_aliasing.sh index ef47e37c8..8e98dee4b 100755 --- a/bsd/sys/make_symbol_aliasing.sh +++ b/bsd/sys/make_symbol_aliasing.sh @@ -34,8 +34,10 @@ fi SDKROOT="$1" OUTPUT="$2" -if [ ! -x "${SDKROOT}/usr/local/libexec/availability.pl" ] ; then - echo "Unable to locate ${SDKROOT}/usr/local/libexec/availability.pl (or not executable)" >&2 +AVAILABILITY_PL="${SDKROOT}/${DRIVERKITROOT}/usr/local/libexec/availability.pl" + +if [ ! -x "${AVAILABILITY_PL}" ] ; then + echo "Unable to locate ${AVAILABILITY_PL} (or not executable)" >&2 exit 1 fi @@ -74,7 +76,7 @@ cat <if_hwassist & IFNET_TSO_IPV4) && \ @@ -1017,7 +1047,7 @@ struct name { \ #define MBUFQ_LAST(head) \ (((head)->mq_last == &MBUFQ_FIRST(head)) ? NULL : \ ((struct mbuf *)(void *)((char *)(head)->mq_last - \ - (size_t)(&MBUFQ_NEXT((struct mbuf *)0))))) + __builtin_offsetof(struct mbuf, m_nextpkt)))) #define max_linkhdr P2ROUNDUP(_max_linkhdr, sizeof (u_int32_t)) #define max_protohdr P2ROUNDUP(_max_protohdr, sizeof (u_int32_t)) @@ -1228,7 +1258,7 @@ struct mbuf; #define M_COPYM_MUST_COPY_HDR 3 /* MUST copy pkthdr from old to new */ #define M_COPYM_MUST_MOVE_HDR 4 /* MUST move pkthdr from old to new */ -extern void m_freem(struct mbuf *); +extern void m_freem(struct mbuf *) __XNU_INTERNAL(m_freem); extern u_int64_t mcl_to_paddr(char *); extern void m_adj(struct mbuf *, int); extern void m_cat(struct mbuf *, struct mbuf *); @@ -1379,7 +1409,7 @@ __private_extern__ caddr_t m_16kalloc(int); __private_extern__ void m_16kfree(caddr_t, u_int, caddr_t); __private_extern__ struct mbuf *m_m16kget(struct mbuf *, int); __private_extern__ int m_reinit(struct mbuf *, int); -__private_extern__ struct mbuf *m_free(struct mbuf *); +__private_extern__ struct mbuf *m_free(struct mbuf *) __XNU_INTERNAL(m_free); __private_extern__ struct mbuf *m_getclr(int, int); __private_extern__ struct mbuf *m_getptr(struct mbuf *, int, int *); __private_extern__ unsigned int m_length(struct mbuf *); @@ -1478,6 +1508,7 @@ enum { KERNEL_TAG_TYPE_IPSEC = 10, KERNEL_TAG_TYPE_DRVAUX = 11, KERNEL_TAG_TYPE_CFIL_UDP = 13, + KERNEL_TAG_TYPE_PF_REASS = 14, }; /* Packet tag routines */ diff --git a/bsd/sys/mcache.h b/bsd/sys/mcache.h index 0c5aa5255..db058126b 100644 --- a/bsd/sys/mcache.h +++ b/bsd/sys/mcache.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2006-2017 Apple Inc. All rights reserved. + * Copyright (c) 2006-2019 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -246,6 +246,7 @@ typedef struct mcache_obj { typedef struct mcache_bkt { void *bkt_next; /* next bucket in list */ + struct mcache_bkttype *bkt_type; /* bucket type */ void *bkt_obj[1]; /* one or more objects */ } mcache_bkt_t; @@ -373,7 +374,7 @@ typedef struct mcache_audit { } mca_trns[MCA_TRN_MAX]; } mcache_audit_t; -__private_extern__ int assfail(const char *, const char *, int); +__private_extern__ int assfail(const char *, const char *, int) __abortlike; __private_extern__ void mcache_init(void); __private_extern__ unsigned int mcache_getflags(void); __private_extern__ unsigned int mcache_cache_line_size(void); @@ -407,7 +408,7 @@ __private_extern__ void mcache_audit_free_verify_set(mcache_audit_t *, void *, size_t, size_t); __private_extern__ char *mcache_dump_mca(mcache_audit_t *); __private_extern__ void mcache_audit_panic(mcache_audit_t *, void *, size_t, - int64_t, int64_t); + int64_t, int64_t) __abortlike; extern int32_t total_sbmb_cnt; extern int32_t total_sbmb_cnt_floor; diff --git a/bsd/sys/mman.h b/bsd/sys/mman.h index bd0b0618f..abe0b93c0 100644 --- a/bsd/sys/mman.h +++ b/bsd/sys/mman.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2002 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2019 Apple Computer, Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -145,6 +145,10 @@ #define MAP_RESILIENT_CODESIGN 0x2000 /* no code-signing failures */ #define MAP_RESILIENT_MEDIA 0x4000 /* no backing-store failures */ +#if !defined(CONFIG_EMBEDDED) +#define MAP_32BIT 0x8000 /* Return virtual addresses <4G only: Requires entitlement */ +#endif /* !defined(CONFIG_EMBEDDED) */ + #endif /* (!_POSIX_C_SOURCE || _DARWIN_C_SOURCE) */ /* diff --git a/bsd/sys/monotonic.h b/bsd/sys/monotonic.h index cfca2afee..6ec648972 100644 --- a/bsd/sys/monotonic.h +++ b/bsd/sys/monotonic.h @@ -1,9 +1,49 @@ +/* + * Copyright (c) 2017-2019 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + #ifndef SYS_MONOTONIC_H #define SYS_MONOTONIC_H #include #include #include + +__BEGIN_DECLS + +struct mt_cpu_inscyc { + uint64_t mtci_instructions; + uint64_t mtci_cycles; +}; + +__END_DECLS + +#if !MACH_KERNEL_PRIVATE + #include __BEGIN_DECLS @@ -13,10 +53,17 @@ __BEGIN_DECLS */ #define MT_IOC(x) _IO('m', (x)) - #define MT_IOC_RESET MT_IOC(0) - #define MT_IOC_ADD MT_IOC(1) +#define MT_IOC_ENABLE MT_IOC(2) +#define MT_IOC_COUNTS MT_IOC(3) +#define MT_IOC_GET_INFO MT_IOC(4) + +__END_DECLS + +#endif /* !MACH_KERNEL_PRIVATE */ + +__BEGIN_DECLS struct monotonic_config { uint64_t event; @@ -34,19 +81,12 @@ union monotonic_ctl_add { } out; }; -/* - * - Consider a separate IOC for disable -- to avoid the copyin to determine - * which way to set it. - */ -#define MT_IOC_ENABLE MT_IOC(2) - union monotonic_ctl_enable { struct { bool enable; } in; }; -#define MT_IOC_COUNTS MT_IOC(3) union monotonic_ctl_counts { struct { @@ -58,7 +98,6 @@ union monotonic_ctl_counts { } out; }; -#define MT_IOC_GET_INFO MT_IOC(4) union monotonic_ctl_info { struct { @@ -67,13 +106,19 @@ union monotonic_ctl_info { } out; }; +__END_DECLS + #if XNU_KERNEL_PRIVATE +#if MONOTONIC + #include #include #include #include +__BEGIN_DECLS + #ifdef MT_CORE_INSTRS #define COUNTS_INSTRS __counts[MT_CORE_INSTRS] #else /* defined(MT_CORE_INSTRS) */ @@ -131,6 +176,10 @@ union monotonic_ctl_info { #define MT_KDBG_TMPTH_START(CODE) MT_KDBG_TMPTH_(CODE, DBG_FUNC_START) #define MT_KDBG_TMPTH_END(CODE) MT_KDBG_TMPTH_(CODE, DBG_FUNC_END) +extern lck_grp_t * mt_lock_grp; + +int mt_dev_init(void); + struct mt_device { const char *mtd_name; int(*const mtd_init)(struct mt_device *dev); @@ -148,12 +197,10 @@ typedef struct mt_device *mt_device_t; extern struct mt_device mt_devices[]; -extern lck_grp_t *mt_lock_grp; +__END_DECLS -int mt_dev_init(void); +#endif /* MONOTONIC */ #endif /* XNU_KERNEL_PRIVATE */ -__END_DECLS - #endif /* !defined(SYS_MONOTONIC_H) */ diff --git a/bsd/sys/mount.h b/bsd/sys/mount.h index bff53904b..c3e884bad 100644 --- a/bsd/sys/mount.h +++ b/bsd/sys/mount.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2017 Apple Inc. All rights reserved. + * Copyright (c) 2000-2018 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -74,6 +74,7 @@ #include #include #include /* needed for vol_capabilities_attr_t */ +#include #ifndef KERNEL #include @@ -100,6 +101,8 @@ #define MNAMELEN 90 /* length of buffer for returned name */ #endif /* __DARWIN_64_BIT_INO_T */ +#define MNT_EXT_ROOT_DATA_VOL 0x00000001 /* Data volume of root volume group */ + #define __DARWIN_STRUCT_STATFS64 { \ uint32_t f_bsize; /* fundamental file system block size */ \ int32_t f_iosize; /* optimal transfer block size */ \ @@ -116,7 +119,8 @@ char f_fstypename[MFSTYPENAMELEN]; /* fs type name */ \ char f_mntonname[MAXPATHLEN]; /* directory on which mounted */ \ char f_mntfromname[MAXPATHLEN]; /* mounted filesystem */ \ - uint32_t f_reserved[8]; /* For future use */ \ + uint32_t f_flags_ext; /* extended flags */ \ + uint32_t f_reserved[7]; /* For future use */ \ } #if !__DARWIN_ONLY_64_BIT_INO_T @@ -299,6 +303,12 @@ struct vfs_attr { */ #define MNT_EXPORTED 0x00000100 /* file system is exported */ +/* + * Denotes storage which can be removed from the system by the user. + */ + +#define MNT_REMOVABLE 0x00000200 + /* * MAC labeled / "quarantined" flag */ @@ -322,6 +332,7 @@ struct vfs_attr { #define MNT_MULTILABEL 0x04000000 /* MAC support for individual labels */ #define MNT_NOATIME 0x10000000 /* disable update of file access time */ #define MNT_SNAPSHOT 0x40000000 /* The mount is a snapshot */ +#define MNT_STRICTATIME 0x80000000 /* enable strict update of file access time */ #ifdef BSD_KERNEL_PRIVATE /* #define MNT_IMGSRC_BY_INDEX 0x20000000 see sys/imgsrc.h */ #endif /* BSD_KERNEL_PRIVATE */ @@ -337,11 +348,11 @@ struct vfs_attr { #define MNT_VISFLAGMASK (MNT_RDONLY | MNT_SYNCHRONOUS | MNT_NOEXEC | \ MNT_NOSUID | MNT_NODEV | MNT_UNION | \ MNT_ASYNC | MNT_EXPORTED | MNT_QUARANTINE | \ - MNT_LOCAL | MNT_QUOTA | \ + MNT_LOCAL | MNT_QUOTA | MNT_REMOVABLE | \ MNT_ROOTFS | MNT_DOVOLFS | MNT_DONTBROWSE | \ MNT_IGNORE_OWNERSHIP | MNT_AUTOMOUNTED | MNT_JOURNALED | \ MNT_NOUSERXATTR | MNT_DEFWRITE | MNT_MULTILABEL | \ - MNT_NOATIME | MNT_SNAPSHOT | MNT_CPROTECT) + MNT_NOATIME | MNT_STRICTATIME | MNT_SNAPSHOT | MNT_CPROTECT) /* * External filesystem command modifier flags. * Unmount can use the MNT_FORCE flag. @@ -381,9 +392,13 @@ struct vfs_attr { #define MNT_WAIT 1 /* synchronized I/O file integrity completion */ #define MNT_NOWAIT 2 /* start all I/O, but do not wait for it */ #define MNT_DWAIT 4 /* synchronized I/O data integrity completion */ +#ifdef KERNEL +/* only for VFS_SYNC */ +#define MNT_VOLUME 8 /* sync on a single mounted filesystem */ +#endif -#ifndef KERNEL +#if !defined(KERNEL) && !defined(_KERN_SYS_KERNELTYPES_H_) /* also defined in kernel_types.h */ struct mount; typedef struct mount * mount_t; struct vnode; @@ -489,7 +504,8 @@ struct netfs_status { #define VQ_QUOTA 0x1000 /* a user quota has been hit */ #define VQ_NEARLOWDISK 0x2000 /* Above lowdisk and below desired disk space */ #define VQ_DESIRED_DISK 0x4000 /* the desired disk space */ -#define VQ_FLAG8000 0x8000 /* placeholder */ +#define VQ_FREE_SPACE_CHANGE 0x8000 /* free disk space has significantly changed */ +#define VQ_FLAG10000 0x10000 /* placeholder */ #ifdef KERNEL @@ -772,6 +788,18 @@ struct fs_snapshot_root_args { }; #define VFSIOC_ROOT_SNAPSHOT _IOW('V', 3, struct fs_snapshot_root_args) +typedef struct fs_role_mount_args { + mount_t root_mp; + uint32_t mount_role; +} fs_role_mount_args_t; + +OS_ENUM(vfs_roles, uint32_t, + VFS_SYSTEM_ROLE = 1, + VFS_VM_ROLE = 8, + VFS_DATA_ROLE = 64); + +#define VFSIOC_MOUNT_BYROLE _IOW('V', 4, fs_role_mount_args_t) + #endif /* KERNEL */ /* @@ -780,6 +808,9 @@ struct fs_snapshot_root_args { #ifdef PRIVATE #define VFS_ITERATE_TAIL_FIRST (1 << 0) #define VFS_ITERATE_CB_DROPREF (1 << 1) // Callback will drop the iterref +#define VFS_ITERATE_NOSKIP_UNMOUNT (1 << 2) /* Callback will be made on FS in unmount. + * The callback cannot make any calls + * into the Filesystem when this is set. */ #endif /* PRIVATE */ /* @@ -1280,6 +1311,7 @@ void * vfs_mntlabel(mount_t mp); /* Safe to cast to "struct label*"; returns "v void vfs_setcompoundopen(mount_t mp); uint64_t vfs_throttle_mask(mount_t mp); int vfs_isswapmount(mount_t mp); +boolean_t vfs_context_is_dataless_manipulator(vfs_context_t); struct vnode_trigger_info; @@ -1353,6 +1385,13 @@ int vfs_settriggercallback(fsid_t *fsid, vfs_trigger_callback_t vtc, void *d /* tags a volume as not supporting extended readdir for NFS exports */ void mount_set_noreaddirext(mount_t); +/*! + * @function vfs_get_statfs64 + * @abstract Get the same information as vfs_statfs(), but in a format suitable + * for copying to userland. + */ +void vfs_get_statfs64(struct mount *mp, struct statfs64 *sfs); + #endif /* KERNEL_PRIVATE */ __END_DECLS diff --git a/bsd/sys/mount_internal.h b/bsd/sys/mount_internal.h index 2bb9d8d11..05e522c99 100644 --- a/bsd/sys/mount_internal.h +++ b/bsd/sys/mount_internal.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2016 Apple Inc. All rights reserved. + * Copyright (c) 2000-2018 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -105,60 +105,60 @@ typedef uint32_t pending_io_t; TAILQ_HEAD(vnodelst, vnode); struct mount { - TAILQ_ENTRY(mount) mnt_list; /* mount list */ - int32_t mnt_count; /* reference on the mount */ - lck_mtx_t mnt_mlock; /* mutex that protects mount point */ - struct vfsops *mnt_op; /* operations on fs */ - struct vfstable *mnt_vtable; /* configuration info */ - struct vnode *mnt_vnodecovered; /* vnode we mounted on */ - struct vnodelst mnt_vnodelist; /* list of vnodes this mount */ - struct vnodelst mnt_workerqueue; /* list of vnodes this mount */ - struct vnodelst mnt_newvnodes; /* list of vnodes this mount */ - uint32_t mnt_flag; /* flags */ - uint32_t mnt_kern_flag; /* kernel only flags */ - uint32_t mnt_compound_ops; /* Available compound operations */ - uint32_t mnt_lflag; /* mount life cycle flags */ - uint32_t mnt_maxsymlinklen; /* max size of short symlink */ - struct vfsstatfs mnt_vfsstat; /* cache of filesystem stats */ - qaddr_t mnt_data; /* private data */ + TAILQ_ENTRY(mount) mnt_list; /* mount list */ + int32_t mnt_count; /* reference on the mount */ + lck_mtx_t mnt_mlock; /* mutex that protects mount point */ + const struct vfsops *mnt_op; /* operations on fs */ + struct vfstable *mnt_vtable; /* configuration info */ + struct vnode *mnt_vnodecovered; /* vnode we mounted on */ + struct vnodelst mnt_vnodelist; /* list of vnodes this mount */ + struct vnodelst mnt_workerqueue; /* list of vnodes this mount */ + struct vnodelst mnt_newvnodes; /* list of vnodes this mount */ + uint32_t mnt_flag; /* flags */ + uint32_t mnt_kern_flag; /* kernel only flags */ + uint32_t mnt_compound_ops; /* Available compound operations */ + uint32_t mnt_lflag; /* mount life cycle flags */ + uint32_t mnt_maxsymlinklen; /* max size of short symlink */ + struct vfsstatfs mnt_vfsstat; /* cache of filesystem stats */ + qaddr_t mnt_data; /* private data */ /* Cached values of the IO constraints for the device */ - uint32_t mnt_maxreadcnt; /* Max. byte count for read */ - uint32_t mnt_maxwritecnt; /* Max. byte count for write */ - uint32_t mnt_segreadcnt; /* Max. segment count for read */ - uint32_t mnt_segwritecnt; /* Max. segment count for write */ - uint32_t mnt_maxsegreadsize; /* Max. segment read size */ - uint32_t mnt_maxsegwritesize; /* Max. segment write size */ - uint32_t mnt_alignmentmask; /* Mask of bits that aren't addressable via DMA */ - uint32_t mnt_devblocksize; /* the underlying device block size */ - uint32_t mnt_ioqueue_depth; /* the maxiumum number of commands a device can accept */ - uint32_t mnt_ioscale; /* scale the various throttles/limits imposed on the amount of I/O in flight */ - uint32_t mnt_ioflags; /* flags for underlying device */ - uint32_t mnt_minsaturationbytecount; /* if non-zero, mininum amount of writes (in bytes) needed to max out throughput */ - pending_io_t mnt_pending_write_size __attribute__((aligned(sizeof(pending_io_t)))); /* byte count of pending writes */ - pending_io_t mnt_pending_read_size __attribute__((aligned(sizeof(pending_io_t)))); /* byte count of pending reads */ - struct timeval mnt_last_write_issued_timestamp; - struct timeval mnt_last_write_completed_timestamp; - int64_t mnt_max_swappin_available; - - lck_rw_t mnt_rwlock; /* mutex readwrite lock */ - lck_mtx_t mnt_renamelock; /* mutex that serializes renames that change shape of tree */ - vnode_t mnt_devvp; /* the device mounted on for local file systems */ - uint32_t mnt_devbsdunit; /* the BSD unit number of the device */ - uint64_t mnt_throttle_mask; /* the throttle mask of what devices will be affected by I/O from this mnt */ - void *mnt_throttle_info; /* used by the throttle code */ - int32_t mnt_crossref; /* refernces to cover lookups crossing into mp */ - int32_t mnt_iterref; /* refernces to cover iterations; drained makes it -ve */ + uint32_t mnt_maxreadcnt; /* Max. byte count for read */ + uint32_t mnt_maxwritecnt; /* Max. byte count for write */ + uint32_t mnt_segreadcnt; /* Max. segment count for read */ + uint32_t mnt_segwritecnt; /* Max. segment count for write */ + uint32_t mnt_maxsegreadsize; /* Max. segment read size */ + uint32_t mnt_maxsegwritesize; /* Max. segment write size */ + uint32_t mnt_alignmentmask; /* Mask of bits that aren't addressable via DMA */ + uint32_t mnt_devblocksize; /* the underlying device block size */ + uint32_t mnt_ioqueue_depth; /* the maxiumum number of commands a device can accept */ + uint32_t mnt_ioscale; /* scale the various throttles/limits imposed on the amount of I/O in flight */ + uint32_t mnt_ioflags; /* flags for underlying device */ + uint32_t mnt_minsaturationbytecount; /* if non-zero, mininum amount of writes (in bytes) needed to max out throughput */ + pending_io_t mnt_pending_write_size __attribute__((aligned(sizeof(pending_io_t)))); /* byte count of pending writes */ + pending_io_t mnt_pending_read_size __attribute__((aligned(sizeof(pending_io_t)))); /* byte count of pending reads */ + struct timeval mnt_last_write_issued_timestamp; + struct timeval mnt_last_write_completed_timestamp; + int64_t mnt_max_swappin_available; + + lck_rw_t mnt_rwlock; /* mutex readwrite lock */ + lck_mtx_t mnt_renamelock; /* mutex that serializes renames that change shape of tree */ + vnode_t mnt_devvp; /* the device mounted on for local file systems */ + uint32_t mnt_devbsdunit; /* the BSD unit number of the device */ + uint64_t mnt_throttle_mask; /* the throttle mask of what devices will be affected by I/O from this mnt */ + void *mnt_throttle_info; /* used by the throttle code */ + int32_t mnt_crossref; /* refernces to cover lookups crossing into mp */ + int32_t mnt_iterref; /* refernces to cover iterations; drained makes it -ve */ #if CONFIG_TRIGGERS - int32_t mnt_numtriggers; /* num of trigger vnodes for this mount */ - vfs_trigger_callback_t *mnt_triggercallback; - void *mnt_triggerdata; + int32_t mnt_numtriggers; /* num of trigger vnodes for this mount */ + vfs_trigger_callback_t *mnt_triggercallback; + void *mnt_triggerdata; #endif /* XXX 3762912 hack to support HFS filesystem 'owner' */ - uid_t mnt_fsowner; - gid_t mnt_fsgroup; + uid_t mnt_fsowner; + gid_t mnt_fsgroup; - struct label *mnt_mntlabel; /* MAC mount label */ - struct label *mnt_fslabel; /* MAC default fs label */ + struct label *mnt_mntlabel; /* MAC mount label */ + struct label *mnt_fslabel; /* MAC default fs label */ /* * cache the rootvp of the last mount point @@ -174,14 +174,14 @@ struct mount { * we don't take an explicit long term reference * on it when we mount it */ - vnode_t mnt_realrootvp; - uint32_t mnt_realrootvp_vid; + vnode_t mnt_realrootvp; + uint32_t mnt_realrootvp_vid; /* * bumped each time a mount or unmount * occurs... its used to invalidate * 'mnt_realrootvp' from the cache */ - uint32_t mnt_generation; + uint32_t mnt_generation; /* * if 'MNTK_AUTH_CACHE_TIMEOUT' is * set, then 'mnt_authcache_ttl' is @@ -191,14 +191,14 @@ struct mount { * time-to-live for the cached lookup right for * volumes marked 'MNTK_AUTH_OPAQUE'. */ - int mnt_authcache_ttl; - char fstypename_override[MFSTYPENAMELEN]; + int mnt_authcache_ttl; + char fstypename_override[MFSTYPENAMELEN]; - uint32_t mnt_iobufinuse; + uint32_t mnt_iobufinuse; - void *mnt_disk_conditioner_info; + void *mnt_disk_conditioner_info; - lck_mtx_t mnt_iter_lock; /* mutex that protects iteration of vnodes */ + lck_mtx_t mnt_iter_lock; /* mutex that protects iteration of vnodes */ }; /* @@ -216,6 +216,7 @@ struct mount { #define MNT_IOFLAGS_CSUNMAP_SUPPORTED 0x00000008 #define MNT_IOFLAGS_SWAPPIN_SUPPORTED 0x00000010 #define MNT_IOFLAGS_FUSION_DRIVE 0x00000020 +#define MNT_IOFLAGS_PERIPHERAL_DRIVE 0x00000040 /* External: Attached directly to the system (USB,TBT,FW,etc.) */ /* * ioqueue depth for devices that don't report one @@ -241,6 +242,7 @@ extern struct mount * dead_mountp; * because the bits here were broken out from the high bits * of the mount flags. */ +#define MNTK_SYSTEM 0x00000040 /* Volume associated with system volume (do not allow unmount) */ #define MNTK_NOSWAP 0x00000080 /* swap files cannot be used on this mount */ #define MNTK_SWAP_MOUNT 0x00000100 /* we are swapping to this mount */ #define MNTK_DENY_READDIREXT 0x00000200 /* Deny Extended-style readdir's for this volume */ @@ -306,7 +308,7 @@ typedef struct fhandle fhandle_t; * mount time to identify the requested filesystem. */ struct vfstable { - struct vfsops *vfc_vfsops; /* filesystem operations vector */ + const struct vfsops *vfc_vfsops;/* filesystem operations vector */ char vfc_name[MFSNAMELEN]; /* filesystem type name */ int vfc_typenum; /* historic filesystem type number */ int vfc_refcount; /* number mounted of this type */ @@ -375,10 +377,10 @@ struct user64_statfs { user64_long_t f_ffree; /* free file nodes in fs */ fsid_t f_fsid; /* file system id */ uid_t f_owner; /* user that mounted the filesystem */ - short f_reserved1; /* spare for later */ + short f_reserved1; /* spare for later */ short f_type; /* type of filesystem */ - user64_long_t f_flags; /* copy of mount exported flags */ - user64_long_t f_reserved2[2]; /* reserved for future use */ + user64_long_t f_flags; /* copy of mount exported flags */ + user64_long_t f_reserved2[2]; /* reserved for future use */ char f_fstypename[MFSNAMELEN]; /* fs type name */ char f_mntonname[MNAMELEN]; /* directory on which mounted */ char f_mntfromname[MNAMELEN];/* mounted filesystem */ @@ -442,6 +444,9 @@ int mount_refdrain(mount_t); /* vfs_rootmountalloc should be kept as a private api */ errno_t vfs_rootmountalloc(const char *, const char *, mount_t *mpp); +int vfs_mount_rosv_data(void); +int vfs_mount_vm(void); + int vfs_mountroot(void); void vfs_unmountall(void); int safedounmount(struct mount *, int, vfs_context_t); @@ -460,11 +465,16 @@ void mount_iterdrop(mount_t); void mount_iterdrain(mount_t); void mount_iterreset(mount_t); +/* These flags are used as flag bits in the `internal_flags` argument to mount_common */ /* Private NFS spi */ #define KERNEL_MOUNT_NOAUTH 0x01 /* Don't check the UID of the directory we are mounting on */ #define KERNEL_MOUNT_PERMIT_UNMOUNT 0x02 /* Allow (non-forced) unmounts by users other the one who mounted the volume */ /* used by snapshot mounting SPI */ #define KERNEL_MOUNT_SNAPSHOT 0x04 /* Mounting a snapshot */ +#define KERNEL_MOUNT_DATAVOL 0x08 /* mount the data volume */ +#define KERNEL_MOUNT_VMVOL 0x10 /* mount the VM volume */ + + #if NFSCLIENT || DEVFS || ROUTEFS /* * NOTE: kernel_mount() does not force MNT_NOSUID, MNT_NOEXEC, or MNT_NODEC for non-privileged diff --git a/bsd/sys/namei.h b/bsd/sys/namei.h index ea81e399c..816e849c4 100644 --- a/bsd/sys/namei.h +++ b/bsd/sys/namei.h @@ -179,6 +179,7 @@ struct nameidata { #define AUDITVNPATH2 0x00200000 /* audit the path/vnode info */ #define USEDVP 0x00400000 /* start the lookup at ndp.ni_dvp */ #define CN_VOLFSPATH 0x00800000 /* user path was a volfs style path */ +#define CN_FIRMLINK_NOFOLLOW 0x01000000 /* Do not follow firm links */ #define UNIONCREATED 0x02000000 /* union fs creation of vnode */ #if NAMEDRSRCFORK #define CN_WANTSRSRCFORK 0x04000000 diff --git a/bsd/sys/persona.h b/bsd/sys/persona.h index c01074897..87907d172 100644 --- a/bsd/sys/persona.h +++ b/bsd/sys/persona.h @@ -32,13 +32,17 @@ #include enum { - PERSONA_INVALID = 0, - PERSONA_GUEST = 1, - PERSONA_MANAGED = 2, - PERSONA_PRIV = 3, - PERSONA_SYSTEM = 4, - - PERSONA_TYPE_MAX = PERSONA_SYSTEM, + PERSONA_INVALID = 0, + PERSONA_GUEST = 1, + PERSONA_MANAGED = 2, + PERSONA_PRIV = 3, + PERSONA_SYSTEM = 4, + PERSONA_DEFAULT = 5, + PERSONA_SYSTEM_PROXY = 6, + PERSONA_SYS_EXT = 7, + PERSONA_ENTERPRISE = 8, + + PERSONA_TYPE_MAX = PERSONA_ENTERPRISE, }; #define PERSONA_ID_NONE ((uid_t)-1) @@ -62,11 +66,16 @@ struct kpersona_info { #define PERSONA_OP_ALLOC 1 -#define PERSONA_OP_DEALLOC 2 -#define PERSONA_OP_GET 3 -#define PERSONA_OP_INFO 4 -#define PERSONA_OP_PIDINFO 5 -#define PERSONA_OP_FIND 6 +#define PERSONA_OP_PALLOC 2 +#define PERSONA_OP_DEALLOC 3 +#define PERSONA_OP_GET 4 +#define PERSONA_OP_INFO 5 +#define PERSONA_OP_PIDINFO 6 +#define PERSONA_OP_FIND 7 +#define PERSONA_OP_GETPATH 8 +#define PERSONA_OP_FIND_BY_TYPE 9 + +#define PERSONA_MGMT_ENTITLEMENT "com.apple.private.persona-mgmt" #ifndef KERNEL /* @@ -91,6 +100,29 @@ struct kpersona_info { */ int kpersona_alloc(struct kpersona_info *info, uid_t *id); +/* + * kpersona_palloc: Allocate a new in-kernel persona with a directory + * pathname + * + * Parameters: + * info: Pointer to persona info structure describing the + * attributes of the persona to create / allocate. + * + * path: Pointer to directory name that stores persona specific + * data. Assumes path buffer length = MAXPATHLEN and is a + * null-terminated string. + * + * id: output: set to the ID of the created persona + * + * Note: + * The 'persona_id' field of the 'info' parameter is ignored. + * + * Return: + * != 0: ERROR + * == 0: Success + */ +int kpersona_palloc(struct kpersona_info *info, uid_t *id, char path[MAXPATHLEN]); + /* * kpersona_dealloc: delete / destroy an in-kernel persona * @@ -103,13 +135,15 @@ int kpersona_alloc(struct kpersona_info *info, uid_t *id); */ int kpersona_dealloc(uid_t id); - /* * kpersona_get: retrieve the persona with which the current thread is running * + * To find the proc's persona id use kpersona_pidinfo + * * Parameters: - * id: output: will be filled with current thread's persona - * (or current processes persona) on success. + * id: output: will be filled with the persona id from the voucher adopted + * on the current thread. If that voucher contains no persona information + * or there is no such voucher, then it defaults to the proc's persona id. * * Return: * < 0: Thread is not running under any persona @@ -117,12 +151,29 @@ int kpersona_dealloc(uid_t id); */ int kpersona_get(uid_t *id); +/* + * kpersona_get_path: retrieve the given persona's path + * + * Parameters: + * id: ID of the persona + * + * path: output: filled in with path on success. + * Assumes path buffer length = MAXPATHLEN + * + * Return: + * < 0: Error + * 0: Success + */ +int kpersona_getpath(uid_t id, char path[MAXPATHLEN]); /* * kpersona_info: gather info about the given persona * * Parameters: * id: ID of the persona to investigate + * If set to 0, it uses persona id from the voucher adopted on the current + * thread. If that voucher contains no persona information or there is no + * such voucher, then it defaults to the proc's persona id. * * info: output: filled in with persona attributes on success. * @@ -132,7 +183,6 @@ int kpersona_get(uid_t *id); */ int kpersona_info(uid_t id, struct kpersona_info *info); - /* * kpersona_pidinfo: gather persona info about the given PID * @@ -147,7 +197,6 @@ int kpersona_info(uid_t id, struct kpersona_info *info); */ int kpersona_pidinfo(pid_t pid, struct kpersona_info *info); - /* * kpersona_find: lookup the kernel's UUID of a persona * @@ -159,6 +208,8 @@ int kpersona_pidinfo(pid_t pid, struct kpersona_info *info); * Set this to -1 to find personas by 'name' * * id: output: the ID(s) matching the input parameters + * This can be NULL + * * idlen: input - size of 'id' buffer (in number of IDs) * output - the total required size of the 'id' buffer * (in number of IDs) - may be larger than input size @@ -170,6 +221,24 @@ int kpersona_pidinfo(pid_t pid, struct kpersona_info *info); * >= 0: The number of IDs found to match the input parameters */ int kpersona_find(const char *name, uid_t uid, uid_t *id, size_t *idlen); + +/* + * kpersona_find_by_type: lookup the persona ids by type + * + * Parameters: + * persona_type: Type of persona id (see enum) + * + * id: output: the ID(s) matching the input parameters + * This can be NULL + * + * idlen: input - size of 'id' buffer (in number of IDs) + * output - the total required size of the 'id' buffer + * (in number of IDs) - may be larger than input size + * Return: + * < 0: ERROR + * >= 0: The number of IDs found to match the input parameters + */ +int kpersona_find_by_type(int persona_type, uid_t *id, size_t *idlen); #endif /* !KERNEL */ #ifdef KERNEL_PRIVATE @@ -201,6 +270,7 @@ struct persona { uid_t pna_id; int pna_type; char pna_login[MAXLOGNAME + 1]; + char *pna_path; kauth_cred_t pna_cred; uid_t pna_pgid; @@ -302,13 +372,32 @@ kauth_cred_t persona_get_cred(struct persona *persona); struct persona *persona_lookup(uid_t id); /* - * returns non-zero on error, on success returns 0 and updates 'plen' to - * total found (could be more than original value of 'plen') + * Search for personas based on name or uid + * + * Parameters: + * name: Local login name of the persona. + * Set this to NULL to find personas by 'uid'. + * + * uid: UID of the persona. + * Set this to -1 to find personas by 'name' + * + * persona: output - array of persona pointers. Each non-NULL value + * must* be released with persona_put. This can be NULL. + * + * plen: input - size of 'persona' buffer (in number of pointers) + * output - the total required size of the 'persona' buffer (could be larger than input value) + * + * Return: + * 0: Success + * != 0: failure (BSD errno value ESRCH or EINVAL) */ int persona_find(const char *login, uid_t uid, struct persona **persona, size_t *plen); -/* returns a reference to the persona tied to the current thread */ +/* returns a reference that must be released with persona_put() */ +struct persona *persona_proc_get(pid_t pid); + +/* returns a reference to the persona tied to the current thread (also uses adopted voucher) */ struct persona *current_persona_get(void); /* get a reference to a persona structure */ @@ -317,6 +406,25 @@ struct persona *persona_get(struct persona *persona); /* release a reference to a persona structure */ void persona_put(struct persona *persona); +/* + * Search for personas of a given type, 'persona_type'. + * + * Parameters: + * persona_type: Type of persona (see enum) + * + * persona: output - array of persona pointers. Each non-NULL value + * must* be released with persona_put. This can be NULL. + * + * plen: input - size of 'persona' buffer (in number of pointers) + * output - the total required size of the 'persona' buffer (could be larger than input value) + * + * Return: + * 0: Success + * != 0: failure (BSD errno value ESRCH or EINVAL) + */ +int persona_find_by_type(int persona_type, struct persona **persona, + size_t *plen); + #ifdef XNU_KERNEL_PRIVATE #if CONFIG_PERSONAS @@ -326,17 +434,18 @@ void persona_put(struct persona *persona); * In-kernel persona API */ extern uint32_t g_max_personas; -extern struct persona *g_system_persona; void personas_bootstrap(void); struct persona *persona_alloc(uid_t id, const char *login, - int type, int *error); + int type, char *path, int *error); int persona_init_begin(struct persona *persona); void persona_init_end(struct persona *persona, int error); struct persona *persona_lookup_and_invalidate(uid_t id); +int persona_verify_and_set_uniqueness(struct persona *persona); +boolean_t persona_is_unique(struct persona *persona); static inline int proc_has_persona(proc_t p) @@ -382,6 +491,9 @@ int persona_get_login(struct persona *persona, char login[MAXLOGNAME + 1]); /* returns a reference that must be released with persona_put() */ struct persona *persona_proc_get(pid_t pid); +int persona_find_all(const char *login, uid_t uid, int persona_type, + struct persona **persona, size_t *plen); + #else /* !CONFIG_PERSONAS */ static inline int diff --git a/bsd/sys/pipe.h b/bsd/sys/pipe.h index 294be47be..e53f05080 100644 --- a/bsd/sys/pipe.h +++ b/bsd/sys/pipe.h @@ -133,7 +133,7 @@ struct pipemapping { #define PIPE_LWANT 0x200 /* Process wants exclusive access to pointers/data. */ #define PIPE_DIRECTW 0x400 /* Pipe direct write active. */ #define PIPE_DIRECTOK 0x800 /* Direct mode ok. */ -#define PIPE_KNOTE 0x1000 /* Pipe has kernel events activated */ +// was PIPE_KNOTE 0x1000 #define PIPE_DRAIN 0x2000 /* Waiting for I/O to drop for a close. Treated like EOF; * only separate for easier debugging. */ #define PIPE_WSELECT 0x4000 /* Some thread has done an FWRITE select on the pipe */ diff --git a/bsd/sys/priv.h b/bsd/sys/priv.h index cdeb994a4..940debadf 100644 --- a/bsd/sys/priv.h +++ b/bsd/sys/priv.h @@ -93,6 +93,8 @@ #define PRIV_PACKAGE_EXTENSIONS 1013 /* Push package extension list used by vn_path_package_check() */ #define PRIV_TRIM_ACTIVE_FILE 1014 /* Allow freeing space out from under an active file */ #define PRIV_PROC_CPUMON_OVERRIDE 1015 /* Allow CPU usage monitor parameters less restrictive than default */ +#define PRIV_ENDPOINTSECURITY_CLIENT 1016 /* Allow EndpointSecurity clients to connect */ +#define PRIV_AUDIT_SESSION_PORT 1017 /* Obtain send-right for arbitrary audit session's port. */ /* * Virtual memory privileges. @@ -118,10 +120,20 @@ #define PRIV_NET_RESTRICTED_MULTIPATH_EXTENDED 10010 /* Extended multipath (more aggressive on cell) */ #define PRIV_NET_RESTRICTED_ROUTE_NC_READ 10011 /* Enable route neighbhor cache read operations */ +#define PRIV_NET_PRIVILEGED_CLIENT_ACCESS 10012 /* Allow client networking access on restricted platforms */ +#define PRIV_NET_PRIVILEGED_SERVER_ACCESS 10013 /* Allow server networking access on restricted platforms */ + +#define PRIV_NET_VALIDATED_RESOLVER 10014 /* Privilege to sign DNS resolver results for validation */ + +#define PRIV_NET_CUSTOM_PROTOCOL 10015 /* Privilege to use custom protocol APIs */ +#define PRIV_NET_PRIVILEGED_NECP_DROP_ALL_BYPASS 10016 /* Privilege to bypass NECP drop-all */ +#define PRIV_NET_PRIVILEGED_IPSEC_WAKE_PACKET 10017 /* Privilege to get IPsec wake packet */ + /* * IPv4 and IPv6 privileges. */ #define PRIV_NETINET_RESERVEDPORT 11000 /* Bind low port number. */ +#define PRIV_NETINET_TCP_KA_OFFLOAD 11001 /* Can set TCP keep alive offload option */ /* @@ -131,10 +143,15 @@ #define PRIV_VFS_MOVE_DATA_EXTENTS 14001 /* Allow F_MOVEDATAEXTENTS fcntl */ #define PRIV_VFS_SNAPSHOT 14002 /* Allow create/rename/delete of snapshots */ #define PRIV_VFS_SNAPSHOT_REVERT 14003 /* Allow reverting filesystem to a previous snapshot */ +#define PRIV_VFS_DATALESS_RESOLVER 14004 /* Allow registration as dataless file resolver */ +#define PRIV_VFS_DATALESS_MANIPULATION 14005 /* Allow process to inspect dataless directories / manipulate dataless objects */ #define PRIV_APFS_EMBED_DRIVER 14100 /* Allow embedding an EFI driver into the APFS container */ #define PRIV_APFS_FUSION_DEBUG 14101 /* Allow getting internal statistics and controlling the APFS Fusion container */ #define PRIV_APFS_FUSION_ALLOW_PIN_FASTPROMOTE 14102 /* Allow changing pinned/fastPromote inode flags in APFS Fusion container */ +// #define PRIV_APFS_UNUSED 14103 +#define PRIV_APFS_SET_FREE_SPACE_CHANGE_THRESHOLD 14104 /* Allow setting the free space change notification threshold */ +#define PRIV_APFS_SET_FIRMLINK 14105 /* Allow setting the SF_FIRM_LINK bsd flag */ #ifdef KERNEL /* diff --git a/bsd/sys/proc.h b/bsd/sys/proc.h index ef8015554..87a39398b 100644 --- a/bsd/sys/proc.h +++ b/bsd/sys/proc.h @@ -272,6 +272,12 @@ extern int proc_rele(proc_t p); extern int proc_pid(proc_t); /* returns the pid of the parent of a given process */ extern int proc_ppid(proc_t); +/* returns the original pid of the parent of a given process */ +extern int proc_original_ppid(proc_t); +/* returns the platform (macos, ios, watchos, tvos, ...) of the given process */ +extern uint32_t proc_platform(proc_t); +/* returns the sdk version used by the current process */ +extern uint32_t proc_sdk(proc_t); /* returns 1 if the process is marked for no remote hangs */ extern int proc_noremotehang(proc_t); /* returns 1 if the process is marked for force quota */ @@ -313,6 +319,14 @@ pid_t proc_selfpgrpid(void); */ pid_t proc_pgrpid(proc_t p); +/*! + * @function proc_sessionid + * @abstract Get the process session id for the passed-in process. + * @param p Process whose session id to grab. + * @return session id for "p", or -1 on failure + */ +pid_t proc_sessionid(proc_t p); + #ifdef KERNEL_PRIVATE // mark a process as being allowed to call vfs_markdependency() void bsd_set_dependency_capable(task_t task); @@ -357,18 +371,29 @@ extern int proc_pidbackgrounded(pid_t pid, uint32_t* state); */ extern uint64_t proc_uniqueid(proc_t); +/* unique 64bit id for process's original parent */ +extern uint64_t proc_puniqueid(proc_t); + extern void proc_set_responsible_pid(proc_t target_proc, pid_t responsible_pid); /* return 1 if process is forcing case-sensitive HFS+ access, 0 for default */ extern int proc_is_forcing_hfs_case_sensitivity(proc_t); +/*! + * @function proc_exitstatus + * @abstract KPI to determine a process's exit status. + * @discussion This function is not safe to call if the process could be + * concurrently stopped or started, but it can be called from a + * mpo_proc_notify_exit callback. + * @param p The process to be queried. + * @return Value in the same format as wait()'s output parameter. + */ +extern int proc_exitstatus(proc_t p); + #endif /* KERNEL_PRIVATE */ #ifdef XNU_KERNEL_PRIVATE -/* unique 64bit id for process's original parent */ -extern uint64_t proc_puniqueid(proc_t); - extern void proc_getexecutableuuid(proc_t, unsigned char *, unsigned long); extern int proc_get_originatorbgstate(uint32_t *is_backgrounded); @@ -387,9 +412,16 @@ extern uint64_t get_current_unique_pid(void); #endif /* XNU_KERNEL_PRIVATE*/ #ifdef KERNEL_PRIVATE +/* If buf argument is NULL, the necessary length to allocate will be set in buflen */ +extern int proc_selfexecutableargs(uint8_t *buf, size_t *buflen); +extern off_t proc_getexecutableoffset(proc_t p); extern vnode_t proc_getexecutablevnode(proc_t); /* Returned with iocount, use vnode_put() to drop */ extern int networking_memstatus_callout(proc_t p, uint32_t); -#endif + +#define SYSCALL_MASK_UNIX 0 +extern size_t proc_get_syscall_filter_mask_size(int which); +extern int proc_set_syscall_filter_mask(proc_t p, int which, unsigned char *maskptr, size_t masklen); +#endif /* KERNEL_PRIVATE */ __END_DECLS @@ -422,6 +454,10 @@ int pid_shutdown_networking(int pid, int level); __END_DECLS #endif /* !KERNEL */ + +/* Entitlement to allow non-root processes to suspend/resume any task */ +#define PROCESS_RESUME_SUSPEND_ENTITLEMENT "com.apple.private.process.suspend-resume.any" + #endif /* PRIVATE */ #endif /* !_SYS_PROC_H_ */ diff --git a/bsd/sys/proc_info.h b/bsd/sys/proc_info.h index 15dc50f70..086ad7842 100644 --- a/bsd/sys/proc_info.h +++ b/bsd/sys/proc_info.h @@ -139,6 +139,11 @@ struct proc_originatorinfo { uint64_t p_reserve4; }; +struct proc_ipctableinfo { + uint32_t table_size; + uint32_t table_free; +}; + #endif @@ -377,6 +382,12 @@ struct proc_regionwithpathinfo { struct vnode_info_path prp_vip; }; +struct proc_regionpath { + uint64_t prpo_addr; + uint64_t prpo_regionlength; + char prpo_path[MAXPATHLEN]; +}; + struct proc_vnodepathinfo { struct vnode_info_path pvi_cdir; struct vnode_info_path pvi_rdir; @@ -800,6 +811,16 @@ struct proc_fileportinfo { #define PROC_PIDVMRTFAULTINFO 29 #define PROC_PIDVMRTFAULTINFO_SIZE (7 * sizeof(uint64_t)) + +#define PROC_PIDPLATFORMINFO 30 +#define PROC_PIDPLATFORMINFO_SIZE (sizeof(uint32_t)) + +#define PROC_PIDREGIONPATH 31 +#define PROC_PIDREGIONPATH_SIZE (sizeof(struct proc_regionpath)) + +#define PROC_PIDIPCTABLEINFO 32 +#define PROC_PIDIPCTABLEINFO_SIZE (sizeof(struct proc_ipctableinfo)) + #endif /* PRIVATE */ /* Flavors for proc_pidfdinfo */ diff --git a/bsd/sys/proc_internal.h b/bsd/sys/proc_internal.h index adaef95ff..763515e8f 100644 --- a/bsd/sys/proc_internal.h +++ b/bsd/sys/proc_internal.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2010 Apple Inc. All rights reserved. + * Copyright (c) 2000-2018 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -79,7 +79,6 @@ #include #include // command/proc_name_t - __BEGIN_DECLS #include #if PSYNCH @@ -121,7 +120,7 @@ struct session { int s_listflags; }; -#define SESSION_NULL (struct session *)0 +#define SESSION_NULL (struct session *)NULL /* * accessor for s_ttyp which treats it as invalid if s_ttyvp is not valid; @@ -167,10 +166,10 @@ struct pgrp { #define PGRP_FLAG_ITERABEGIN 8 #define PGRP_FLAG_ITERWAIT 0x10 -#define PGRP_NULL (struct pgrp *)0 +#define PGRP_NULL (struct pgrp *)NULL struct proc; -#define PROC_NULL (struct proc *)0 +#define PROC_NULL (struct proc *)NULL #define PROC_UPDATE_CREDS_ONPROC(p) { \ p->p_uid = kauth_cred_getuid(p->p_ucred); \ @@ -197,6 +196,7 @@ struct proc { void * task; /* corresponding task (static)*/ struct proc * p_pptr; /* Pointer to parent process.(LL) */ pid_t p_ppid; /* process's parent pid number */ + pid_t p_original_ppid; /* process's original parent pid number, doesn't change if reparented */ pid_t p_pgrpid; /* process group id of the process (LL)*/ uid_t p_uid; gid_t p_gid; @@ -331,6 +331,10 @@ struct proc { cpu_type_t p_cputype; cpu_subtype_t p_cpusubtype; + uint8_t *syscall_filter_mask; /* syscall filter bitmask (length: nsysent bits) */ + uint32_t p_platform; + uint32_t p_sdk; + /* End area that is copied on creation. */ /* XXXXXXXXXXXXX End of BCOPY'ed on fork (AIOLOCK)XXXXXXXXXXXXXXXX */ #define p_endcopy p_aio_total_count @@ -374,6 +378,7 @@ struct proc { #endif /* DIAGNOSTIC */ uint64_t p_dispatchqueue_offset; uint64_t p_dispatchqueue_serialno_offset; + uint64_t p_dispatchqueue_label_offset; uint64_t p_return_to_kernel_offset; uint64_t p_mach_thread_self_offset; #if VM_PRESSURE_EVENTS @@ -383,9 +388,10 @@ struct proc { #if CONFIG_MEMORYSTATUS /* Fields protected by proc list lock */ TAILQ_ENTRY(proc) p_memstat_list; /* priority bucket link */ - uint32_t p_memstat_state; /* state */ + uint32_t p_memstat_state; /* state. Also used as a wakeup channel when the memstat's LOCKED bit changes */ int32_t p_memstat_effectivepriority; /* priority after transaction state accounted for */ int32_t p_memstat_requestedpriority; /* active priority */ + int32_t p_memstat_assertionpriority; /* assertion driven priority */ uint32_t p_memstat_dirty; /* dirty state */ uint64_t p_memstat_userdata; /* user state */ uint64_t p_memstat_idledeadline; /* time at which process became clean */ @@ -394,6 +400,7 @@ struct proc { int32_t p_memstat_memlimit; /* cached memory limit, toggles between active and inactive limits */ int32_t p_memstat_memlimit_active; /* memory limit enforced when process is in active jetsam state */ int32_t p_memstat_memlimit_inactive; /* memory limit enforced when process is in inactive jetsam state */ + int32_t p_memstat_relaunch_flags; /* flags indicating relaunch behavior for the process */ #if CONFIG_FREEZE uint32_t p_memstat_freeze_sharedanon_pages; /* shared pages left behind after freeze */ uint32_t p_memstat_frozen_count; @@ -405,6 +412,8 @@ struct proc { pid_t p_responsible_pid; /* pid resonsible for this process */ _Atomic uint32_t p_user_faults; /* count the number of user faults generated */ + uint32_t p_memlimit_increase; /* byte increase for memory limit for dyld SPI rdar://problem/49950264, structure packing 32-bit and 64-bit */ + struct os_reason *p_exit_reason; #if !CONFIG_EMBEDDED @@ -465,6 +474,7 @@ struct proc { #define P_LVMRSRCOWNER 0x01000000 /* can handle the resource ownership of */ #define P_LTERM_DECRYPTFAIL 0x04000000 /* process terminating due to key failure to decrypt */ #define P_LTERM_JETSAM 0x08000000 /* process is being jetsam'd */ + #define P_JETSAM_VMPAGESHORTAGE 0x00000000 /* jetsam: lowest jetsam priority proc, killed due to vm page shortage */ #define P_JETSAM_VMTHRASHING 0x10000000 /* jetsam: lowest jetsam priority proc, killed due to vm thrashing */ #define P_JETSAM_HIWAT 0x20000000 /* jetsam: high water mark */ @@ -473,6 +483,7 @@ struct proc { #define P_JETSAM_VNODE 0x50000000 /* jetsam: vnode kill */ #define P_JETSAM_FCTHRASHING 0x60000000 /* jetsam: lowest jetsam priority proc, killed due to filecache thrashing */ #define P_JETSAM_MASK 0x70000000 /* jetsam type mask */ +#define P_LNSPACE_RESOLVER 0x80000000 /* process is the namespace resolver */ /* Process control state for resource starvation */ #define P_PCTHROTTLE 1 @@ -498,7 +509,9 @@ struct proc { /* p_vfs_iopolicy flags */ #define P_VFS_IOPOLICY_FORCE_HFS_CASE_SENSITIVITY 0x0001 #define P_VFS_IOPOLICY_ATIME_UPDATES 0x0002 -#define P_VFS_IOPOLICY_VALID_MASK (P_VFS_IOPOLICY_ATIME_UPDATES | P_VFS_IOPOLICY_FORCE_HFS_CASE_SENSITIVITY) +#define P_VFS_IOPOLICY_MATERIALIZE_DATALESS_FILES 0x0004 +#define P_VFS_IOPOLICY_STATFS_NO_DATA_VOLUME 0x0008 +#define P_VFS_IOPOLICY_VALID_MASK (P_VFS_IOPOLICY_ATIME_UPDATES | P_VFS_IOPOLICY_FORCE_HFS_CASE_SENSITIVITY | P_VFS_IOPOLICY_MATERIALIZE_DATALESS_FILES | P_VFS_IOPOLICY_STATFS_NO_DATA_VOLUME) /* process creation arguments */ #define PROC_CREATE_FORK 0 /* independent child (running) */ @@ -701,6 +714,11 @@ __private_extern__ void proc_drop_zombref(struct proc * p); /* Find zombie b extern int chgproccnt(uid_t uid, int diff); extern void pinsertchild(struct proc *parent, struct proc *child); +extern int setsid_internal(struct proc *p); +#ifndef __cplusplus +extern void setlogin_internal(proc_t p, const char login[static MAXLOGNAME]); +#endif // __cplusplus +extern int setgroups_internal(proc_t p, u_int gidsetsize, gid_t *gidset, uid_t gmuid); extern int enterpgrp(struct proc *p, pid_t pgid, int mksess); extern void fixjobc(struct proc *p, struct pgrp *pgrp, int entering); extern int inferior(struct proc *p); @@ -819,7 +837,7 @@ typedef int (*proc_iterate_fn_t)(proc_t, void *); */ #define PGRP_DROPREF (1) -extern int pgrp_iterate(struct pgrp *pgrp, unsigned int flags, proc_iterate_fn_t callout, void *arg, proc_iterate_fn_t filterfn, void *filterarg); +extern void pgrp_iterate(struct pgrp *pgrp, unsigned int flags, proc_iterate_fn_t callout, void *arg, proc_iterate_fn_t filterfn, void *filterarg); /* * proc_iterate walks the `allproc` and/or `zombproc` lists, calling `filterfn` @@ -834,7 +852,7 @@ extern int pgrp_iterate(struct pgrp *pgrp, unsigned int flags, proc_iterate_fn_t #define PROC_ZOMBPROCLIST (1U << 1) /* walk the zombie list */ #define PROC_NOWAITTRANS (1U << 2) /* do not wait for transitions (checkdirs only) */ -extern int proc_iterate(unsigned int flags, proc_iterate_fn_t callout, void *arg, proc_iterate_fn_t filterfn, void *filterarg); +extern void proc_iterate(unsigned int flags, proc_iterate_fn_t callout, void *arg, proc_iterate_fn_t filterfn, void *filterarg); /* * proc_childrenwalk walks the children of process `p`, calling `callout` for @@ -843,7 +861,7 @@ extern int proc_iterate(unsigned int flags, proc_iterate_fn_t callout, void *arg * `PCHILDREN_FOREACH` might also be used under the `proc_list_lock` to achieve * a similar effect. */ -extern int proc_childrenwalk(proc_t p, proc_iterate_fn_t callout, void *arg); +extern void proc_childrenwalk(proc_t p, proc_iterate_fn_t callout, void *arg); /* * proc_rebootscan should only be used by kern_shutdown.c diff --git a/bsd/sys/process_policy.h b/bsd/sys/process_policy.h index 4c2e3ce6b..8ba38fb19 100644 --- a/bsd/sys/process_policy.h +++ b/bsd/sys/process_policy.h @@ -65,7 +65,7 @@ __BEGIN_DECLS #define PROC_POLICY_HARDWARE_ACCESS 2 /* access to various hardware */ #define PROC_POLICY_RESOURCE_STARVATION 3 /* behavior on resource starvation */ #define PROC_POLICY_RESOURCE_USAGE 4 /* behavior on resource consumption */ -#if CONFIG_EMBEDDED || TARGET_OS_EMBEDDED +#if CONFIG_EMBEDDED || (TARGET_OS_IPHONE && !TARGET_OS_SIMULATOR) #define PROC_POLICY_APP_LIFECYCLE 5 /* app life cycle management */ #else /* CONFIG_EMBEDDED */ #define PROC_POLICY_RESERVED 5 /* behavior on resource consumption */ @@ -79,7 +79,7 @@ __BEGIN_DECLS #define PROC_POLICY_BG_DISKTHROTTLE 2 /* disk accesses throttled */ #define PROC_POLICY_BG_NETTHROTTLE 4 /* network accesses throttled */ #define PROC_POLICY_BG_GPUDENY 8 /* no access to GPU */ -#if CONFIG_EMBEDDED || TARGET_OS_EMBEDDED +#if CONFIG_EMBEDDED || (TARGET_OS_IPHONE && !TARGET_OS_SIMULATOR) #define PROC_POLICY_BG_ALL 0x0F #else /* CONFIG_EMBEDDED */ #define PROC_POLICY_BG_ALL 0x07 @@ -169,7 +169,7 @@ typedef struct proc_policy_cpuusage_attr { uint64_t ppattr_cpu_attr_deadline; /* 64bit deadline in nsecs */ } proc_policy_cpuusage_attr_t; -#if CONFIG_EMBEDDED || TARGET_OS_EMBEDDED +#if CONFIG_EMBEDDED || (TARGET_OS_IPHONE && !TARGET_OS_SIMULATOR) /* sub policies for app lifecycle management */ #define PROC_POLICY_APPLIFE_NONE 0 /* does nothing.. */ #define PROC_POLICY_APPLIFE_STATE 1 /* sets the app to various lifecycle states */ @@ -180,7 +180,7 @@ typedef struct proc_policy_cpuusage_attr { /* sub policies for PROC_POLICY_APPTYPE */ #define PROC_POLICY_APPTYPE_NONE 0 /* does nothing.. */ #define PROC_POLICY_APPTYPE_MODIFY 1 /* sets the app to various lifecycle states */ -#if CONFIG_EMBEDDED || TARGET_OS_EMBEDDED +#if CONFIG_EMBEDDED || (TARGET_OS_IPHONE && !TARGET_OS_SIMULATOR) #define PROC_POLICY_APPTYPE_THREADTHR 2 /* notes the device in inactive or short/long term */ #endif /* CONFIG_EMBEDDED */ diff --git a/bsd/sys/protosw.h b/bsd/sys/protosw.h index 0beec9bc5..a9f80ff06 100644 --- a/bsd/sys/protosw.h +++ b/bsd/sys/protosw.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2015 Apple Inc. All rights reserved. + * Copyright (c) 2000-2019 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -127,6 +127,7 @@ struct uio; struct ifnet; #ifdef XNU_KERNEL_PRIVATE struct domain_old; +struct proc; #endif /* XNU_KERNEL_PRIVATE */ #pragma pack(4) @@ -269,6 +270,12 @@ struct protosw { */ TAILQ_HEAD(, socket_filter) pr_filter_head; struct protosw_old *pr_old; + + void (*pr_update_last_owner) /* update last socket owner */ + (struct socket *so, struct proc *p, struct proc *ep); + + void (*pr_copy_last_owner) /* copy last socket from listener */ + (struct socket *so, struct socket *head); }; /* @@ -562,20 +569,25 @@ extern struct protosw *pffindproto_locked(int, int, int); extern struct protosw *pffindprotonotype(int, int); extern struct protosw *pffindtype(int, int); extern struct protosw_old *pffindproto_old(int, int, int); -extern int net_add_proto(struct protosw *, struct domain *, int); +extern int net_add_proto(struct protosw *, struct domain *, int) +__XNU_INTERNAL(net_add_proto); extern void net_init_proto(struct protosw *, struct domain *); -extern int net_del_proto(int, int, struct domain *); +extern int net_del_proto(int, int, struct domain *) +__XNU_INTERNAL(net_del_proto); extern int net_add_proto_old(struct protosw_old *, struct domain_old *); extern int net_del_proto_old(int, int, struct domain_old *); extern void net_update_uptime(void); extern void net_update_uptime_with_time(const struct timeval *); extern u_int64_t net_uptime(void); +extern u_int64_t net_uptime_ms(void); extern void net_uptime2timeval(struct timeval *); +extern struct protosw *pffindproto(int family, int protocol, int type) +__XNU_INTERNAL(pffindproto); #else extern int net_add_proto(struct protosw *, struct domain *); extern int net_del_proto(int, int, struct domain *); -#endif /* XNU_KERNEL_PRIVATE */ extern struct protosw *pffindproto(int family, int protocol, int type); +#endif /* XNU_KERNEL_PRIVATE */ __END_DECLS #endif /* KERNEL_PRIVATE */ #endif /* !_SYS_PROTOSW_H_ */ diff --git a/bsd/sys/pthread_shims.h b/bsd/sys/pthread_shims.h index 7a2d607dd..e956225b2 100644 --- a/bsd/sys/pthread_shims.h +++ b/bsd/sys/pthread_shims.h @@ -234,7 +234,7 @@ typedef const struct pthread_callbacks_s { void *__unused_was_zfree; void *__unused_was_zinit; - /* bsd/kerb/kern_sig.c */ + /* bsd/kern/kern_sig.c */ void (*__pthread_testcancel)(int); /* calls without portfolio */ @@ -251,7 +251,13 @@ typedef const struct pthread_callbacks_s { /* mach/thread_act.h */ kern_return_t (*thread_resume)(thread_act_t target_act); - void *__unused_was_ml_get_max_cpus; + /* bsd/sys/event.h */ + int (*kevent_workq_internal)(struct proc *p, + user_addr_t changelist, int nchanges, + user_addr_t eventlist, int nevents, + user_addr_t data_out, user_size_t *data_available, + unsigned int flags, int32_t *retval); + #if defined(__arm__) void *__unused_was_map_is_1gb; #endif diff --git a/bsd/sys/queue.h b/bsd/sys/queue.h index 8791385d7..23dc242c6 100644 --- a/bsd/sys/queue.h +++ b/bsd/sys/queue.h @@ -207,14 +207,44 @@ struct qm_trace { #define __MISMATCH_TAGS_POP #endif +/*! + * Ensures that these macros can safely be used in structs when compiling with + * clang. The macros do not allow for nullability attributes to be specified due + * to how they are expanded. For example: + * + * SLIST_HEAD(, foo _Nullable) bar; + * + * expands to + * + * struct { + * struct foo _Nullable *slh_first; + * } + * + * which is not valid because the nullability specifier has to apply to the + * pointer. So just ignore nullability completeness in all the places where this + * is an issue. + */ +#if defined(__clang__) +#define __NULLABILITY_COMPLETENESS_PUSH \ + _Pragma("clang diagnostic push") \ + _Pragma("clang diagnostic ignored \"-Wnullability-completeness\"") +#define __NULLABILITY_COMPLETENESS_POP \ + _Pragma("clang diagnostic pop") +#else +#define __NULLABILITY_COMPLETENESS_PUSH +#define __NULLABILITY_COMPLETENESS_POP +#endif + /* * Singly-linked List declarations. */ #define SLIST_HEAD(name, type) \ __MISMATCH_TAGS_PUSH \ +__NULLABILITY_COMPLETENESS_PUSH \ struct name { \ struct type *slh_first; /* first element */ \ } \ +__NULLABILITY_COMPLETENESS_POP \ __MISMATCH_TAGS_POP #define SLIST_HEAD_INITIALIZER(head) \ @@ -222,9 +252,11 @@ __MISMATCH_TAGS_POP #define SLIST_ENTRY(type) \ __MISMATCH_TAGS_PUSH \ +__NULLABILITY_COMPLETENESS_PUSH \ struct { \ struct type *sle_next; /* next element */ \ } \ +__NULLABILITY_COMPLETENESS_POP \ __MISMATCH_TAGS_POP /* @@ -267,6 +299,7 @@ __MISMATCH_TAGS_POP #define SLIST_REMOVE(head, elm, type, field) \ __MISMATCH_TAGS_PUSH \ +__NULLABILITY_COMPLETENESS_PUSH \ do { \ if (SLIST_FIRST((head)) == (elm)) { \ SLIST_REMOVE_HEAD((head), field); \ @@ -279,6 +312,7 @@ do { \ } \ TRASHIT((elm)->field.sle_next); \ } while (0) \ +__NULLABILITY_COMPLETENESS_POP \ __MISMATCH_TAGS_POP #define SLIST_REMOVE_AFTER(elm, field) do { \ @@ -295,10 +329,12 @@ __MISMATCH_TAGS_POP */ #define STAILQ_HEAD(name, type) \ __MISMATCH_TAGS_PUSH \ +__NULLABILITY_COMPLETENESS_PUSH \ struct name { \ struct type *stqh_first;/* first element */ \ struct type **stqh_last;/* addr of last next element */ \ } \ +__NULLABILITY_COMPLETENESS_POP \ __MISMATCH_TAGS_POP #define STAILQ_HEAD_INITIALIZER(head) \ @@ -306,9 +342,11 @@ __MISMATCH_TAGS_POP #define STAILQ_ENTRY(type) \ __MISMATCH_TAGS_PUSH \ +__NULLABILITY_COMPLETENESS_PUSH \ struct { \ struct type *stqe_next; /* next element */ \ } \ +__NULLABILITY_COMPLETENESS_POP \ __MISMATCH_TAGS_POP /* @@ -362,16 +400,19 @@ __MISMATCH_TAGS_POP #define STAILQ_LAST(head, type, field) \ __MISMATCH_TAGS_PUSH \ +__NULLABILITY_COMPLETENESS_PUSH \ (STAILQ_EMPTY((head)) ? \ NULL : \ ((struct type *)(void *) \ ((char *)((head)->stqh_last) - __offsetof(struct type, field))))\ +__NULLABILITY_COMPLETENESS_POP \ __MISMATCH_TAGS_POP #define STAILQ_NEXT(elm, field) ((elm)->field.stqe_next) #define STAILQ_REMOVE(head, elm, type, field) \ __MISMATCH_TAGS_PUSH \ +__NULLABILITY_COMPLETENESS_PUSH \ do { \ if (STAILQ_FIRST((head)) == (elm)) { \ STAILQ_REMOVE_HEAD((head), field); \ @@ -384,6 +425,7 @@ do { \ } \ TRASHIT((elm)->field.stqe_next); \ } while (0) \ +__NULLABILITY_COMPLETENESS_POP \ __MISMATCH_TAGS_POP #define STAILQ_REMOVE_HEAD(head, field) do { \ @@ -405,6 +447,7 @@ __MISMATCH_TAGS_POP #define STAILQ_SWAP(head1, head2, type) \ __MISMATCH_TAGS_PUSH \ +__NULLABILITY_COMPLETENESS_PUSH \ do { \ struct type *swap_first = STAILQ_FIRST(head1); \ struct type **swap_last = (head1)->stqh_last; \ @@ -417,6 +460,7 @@ do { \ if (STAILQ_EMPTY(head2)) \ (head2)->stqh_last = &STAILQ_FIRST(head2); \ } while (0) \ +__NULLABILITY_COMPLETENESS_POP \ __MISMATCH_TAGS_POP @@ -425,9 +469,11 @@ __MISMATCH_TAGS_POP */ #define LIST_HEAD(name, type) \ __MISMATCH_TAGS_PUSH \ +__NULLABILITY_COMPLETENESS_PUSH \ struct name { \ struct type *lh_first; /* first element */ \ } \ +__NULLABILITY_COMPLETENESS_POP \ __MISMATCH_TAGS_POP #define LIST_HEAD_INITIALIZER(head) \ @@ -435,10 +481,12 @@ __MISMATCH_TAGS_POP #define LIST_ENTRY(type) \ __MISMATCH_TAGS_PUSH \ +__NULLABILITY_COMPLETENESS_PUSH \ struct { \ struct type *le_next; /* next element */ \ struct type **le_prev; /* address of previous next element */ \ } \ +__NULLABILITY_COMPLETENESS_POP \ __MISMATCH_TAGS_POP /* @@ -530,6 +578,7 @@ __MISMATCH_TAGS_POP #define LIST_SWAP(head1, head2, type, field) \ __MISMATCH_TAGS_PUSH \ +__NULLABILITY_COMPLETENESS_PUSH \ do { \ struct type *swap_tmp = LIST_FIRST((head1)); \ LIST_FIRST((head1)) = LIST_FIRST((head2)); \ @@ -539,6 +588,7 @@ do { \ if ((swap_tmp = LIST_FIRST((head2))) != NULL) \ swap_tmp->field.le_prev = &LIST_FIRST((head2)); \ } while (0) \ +__NULLABILITY_COMPLETENESS_POP \ __MISMATCH_TAGS_POP /* @@ -546,11 +596,13 @@ __MISMATCH_TAGS_POP */ #define TAILQ_HEAD(name, type) \ __MISMATCH_TAGS_PUSH \ +__NULLABILITY_COMPLETENESS_PUSH \ struct name { \ struct type *tqh_first; /* first element */ \ struct type **tqh_last; /* addr of last next element */ \ TRACEBUF \ } \ +__NULLABILITY_COMPLETENESS_POP \ __MISMATCH_TAGS_POP #define TAILQ_HEAD_INITIALIZER(head) \ @@ -558,11 +610,13 @@ __MISMATCH_TAGS_POP #define TAILQ_ENTRY(type) \ __MISMATCH_TAGS_PUSH \ +__NULLABILITY_COMPLETENESS_PUSH \ struct { \ struct type *tqe_next; /* next element */ \ struct type **tqe_prev; /* address of previous next element */ \ TRACEBUF \ } \ +__NULLABILITY_COMPLETENESS_POP \ __MISMATCH_TAGS_POP /* @@ -630,6 +684,17 @@ __MISMATCH_TAGS_POP (var) && ((tvar) = TAILQ_PREV((var), headname, field), 1); \ (var) = (tvar)) +#if XNU_KERNEL_PRIVATE +/* + * Can be used when the initialized HEAD was just bzeroed + * Works around deficiencies in clang analysis of initialization patterns. + * See: + */ +#define TAILQ_INIT_AFTER_BZERO(head) do { \ + (head)->tqh_last = &TAILQ_FIRST((head)); \ +} while (0) +#endif /* XNU_KERNEL_PRIVATE */ + #define TAILQ_INIT(head) do { \ TAILQ_FIRST((head)) = NULL; \ (head)->tqh_last = &TAILQ_FIRST((head)); \ @@ -686,14 +751,18 @@ __MISMATCH_TAGS_POP #define TAILQ_LAST(head, headname) \ __MISMATCH_TAGS_PUSH \ +__NULLABILITY_COMPLETENESS_PUSH \ (*(((struct headname *)((head)->tqh_last))->tqh_last)) \ +__NULLABILITY_COMPLETENESS_POP \ __MISMATCH_TAGS_POP #define TAILQ_NEXT(elm, field) ((elm)->field.tqe_next) #define TAILQ_PREV(elm, headname, field) \ __MISMATCH_TAGS_PUSH \ +__NULLABILITY_COMPLETENESS_PUSH \ (*(((struct headname *)((elm)->field.tqe_prev))->tqh_last)) \ +__NULLABILITY_COMPLETENESS_POP \ __MISMATCH_TAGS_POP #define TAILQ_REMOVE(head, elm, field) do { \ @@ -717,6 +786,7 @@ __MISMATCH_TAGS_POP */ #define TAILQ_SWAP(head1, head2, type, field) \ __MISMATCH_TAGS_PUSH \ +__NULLABILITY_COMPLETENESS_PUSH \ do { \ struct type *swap_first = (head1)->tqh_first; \ struct type **swap_last = (head1)->tqh_last; \ @@ -733,6 +803,7 @@ do { \ else \ (head2)->tqh_last = &(head2)->tqh_first; \ } while (0) \ +__NULLABILITY_COMPLETENESS_POP \ __MISMATCH_TAGS_POP /* @@ -740,18 +811,22 @@ __MISMATCH_TAGS_POP */ #define CIRCLEQ_HEAD(name, type) \ __MISMATCH_TAGS_PUSH \ +__NULLABILITY_COMPLETENESS_PUSH \ struct name { \ struct type *cqh_first; /* first element */ \ struct type *cqh_last; /* last element */ \ } \ +__NULLABILITY_COMPLETENESS_POP \ __MISMATCH_TAGS_POP #define CIRCLEQ_ENTRY(type) \ __MISMATCH_TAGS_PUSH \ +__NULLABILITY_COMPLETENESS_PUSH \ struct { \ struct type *cqe_next; /* next element */ \ struct type *cqe_prev; /* previous element */ \ } \ +__NULLABILITY_COMPLETENESS_POP \ __MISMATCH_TAGS_POP /* diff --git a/bsd/sys/quota.h b/bsd/sys/quota.h index bc9da9345..08fcfd7ac 100644 --- a/bsd/sys/quota.h +++ b/bsd/sys/quota.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2006 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2019 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -196,7 +196,7 @@ struct user_dqblk { #define INITQMAGICS { \ 0xff31ff35, /* USRQUOTA */ \ 0xff31ff27, /* GRPQUOTA */ \ -}; +} #define QF_VERSION 1 #define QF_STRING_TAG "QUOTA HASH FILE" diff --git a/bsd/sys/reason.h b/bsd/sys/reason.h index c69534091..355b59ae9 100644 --- a/bsd/sys/reason.h +++ b/bsd/sys/reason.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016 Apple Inc. All rights reserved. + * Copyright (c) 2019 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -36,13 +36,14 @@ __BEGIN_DECLS #ifdef KERNEL_PRIVATE #include +#include #ifdef XNU_KERNEL_PRIVATE #include typedef struct os_reason { - decl_lck_mtx_data(, osr_lock) - unsigned int osr_refcount; + decl_lck_mtx_data(, osr_lock); + os_refcnt_t osr_refcount; uint32_t osr_namespace; uint64_t osr_code; uint64_t osr_flags; @@ -76,7 +77,8 @@ int os_reason_alloc_buffer_noblock(os_reason_t cur_reason, uint32_t osr_bufsize) struct kcdata_descriptor * os_reason_get_kcdata_descriptor(os_reason_t cur_reason); void os_reason_ref(os_reason_t cur_reason); void os_reason_free(os_reason_t cur_reason); - +void os_reason_set_flags(os_reason_t cur_reason, uint64_t flags); +void os_reason_set_description_data(os_reason_t cur_reason, uint32_t type, void *reason_data, uint32_t reason_data_len); #endif /* KERNEL_PRIVATE */ /* @@ -97,7 +99,8 @@ void os_reason_free(os_reason_t cur_reason); #define OS_REASON_REPORTCRASH 12 #define OS_REASON_COREANIMATION 13 #define OS_REASON_AGGREGATED 14 -#define OS_REASON_ASSERTIOND 15 +#define OS_REASON_RUNNINGBOARD 15 +#define OS_REASON_ASSERTIOND OS_REASON_RUNNINGBOARD /* old name */ #define OS_REASON_SKYWALK 16 #define OS_REASON_SETTINGS 17 #define OS_REASON_LIBSYSTEM 18 @@ -107,11 +110,14 @@ void os_reason_free(os_reason_t cur_reason); #define OS_REASON_WATCHKIT 22 #define OS_REASON_GUARD 23 #define OS_REASON_ANALYTICS 24 +#define OS_REASON_SANDBOX 25 +#define OS_REASON_SECURITY 26 +#define OS_REASON_ENDPOINTSECURITY 27 /* * Update whenever new OS_REASON namespaces are added. */ -#define OS_REASON_MAX_VALID_NAMESPACE OS_REASON_ANALYTICS +#define OS_REASON_MAX_VALID_NAMESPACE OS_REASON_ENDPOINTSECURITY #define OS_REASON_BUFFER_MAX_SIZE 5120 @@ -153,7 +159,8 @@ void os_reason_free(os_reason_t cur_reason); * * Outputs: Does not return. */ -void abort_with_reason(uint32_t reason_namespace, uint64_t reason_code, const char *reason_string, uint64_t reason_flags) __attribute__((noreturn)); +void abort_with_reason(uint32_t reason_namespace, uint64_t reason_code, const char *reason_string, uint64_t reason_flags) +__attribute__((noreturn, cold)); /* * abort_with_payload: Used to exit the current process and pass along @@ -171,7 +178,7 @@ void abort_with_reason(uint32_t reason_namespace, uint64_t reason_code, const ch * Outputs: Does not return. */ void abort_with_payload(uint32_t reason_namespace, uint64_t reason_code, void *payload, uint32_t payload_size, const char *reason_string, - uint64_t reason_flags) __attribute__((noreturn)); + uint64_t reason_flags) __attribute__((noreturn, cold)); /* * terminate_with_reason: Used to terminate a specific process and pass along diff --git a/bsd/sys/reboot.h b/bsd/sys/reboot.h index 38088aa9b..e0750b8db 100644 --- a/bsd/sys/reboot.h +++ b/bsd/sys/reboot.h @@ -142,7 +142,6 @@ __END_DECLS #endif /* __APPLE_API_OBSOLETE */ #ifdef BSD_KERNEL_PRIVATE -#include __BEGIN_DECLS int reboot_kernel(int, char *); diff --git a/bsd/sys/resource.h b/bsd/sys/resource.h index 357768313..0cc5a3983 100644 --- a/bsd/sys/resource.h +++ b/bsd/sys/resource.h @@ -339,8 +339,7 @@ struct rusage_info_v4 { uint64_t ri_billed_energy; uint64_t ri_serviced_energy; uint64_t ri_interval_max_phys_footprint; - // 1 reserve counter(s) remaining for future extension - uint64_t ri_unused[1]; + uint64_t ri_runnable_time; }; typedef struct rusage_info_v4 rusage_info_current; @@ -498,6 +497,8 @@ struct proc_rlimit_control_wakeupmon { #define IOPOL_TYPE_VFS_HFS_CASE_SENSITIVITY 1 #endif #define IOPOL_TYPE_VFS_ATIME_UPDATES 2 +#define IOPOL_TYPE_VFS_MATERIALIZE_DATALESS_FILES 3 +#define IOPOL_TYPE_VFS_STATFS_NO_DATA_VOLUME 4 /* scope */ #define IOPOL_SCOPE_PROCESS 0 @@ -524,6 +525,13 @@ struct proc_rlimit_control_wakeupmon { #define IOPOL_ATIME_UPDATES_DEFAULT 0 #define IOPOL_ATIME_UPDATES_OFF 1 +#define IOPOL_MATERIALIZE_DATALESS_FILES_DEFAULT 0 +#define IOPOL_MATERIALIZE_DATALESS_FILES_OFF 1 +#define IOPOL_MATERIALIZE_DATALESS_FILES_ON 2 + +#define IOPOL_VFS_STATFS_NO_DATA_VOLUME_DEFAULT 0 +#define IOPOL_VFS_STATFS_FORCE_NO_DATA_VOLUME 1 + #ifdef PRIVATE /* * Structures for use in communicating via iopolicysys() between Libc and the diff --git a/bsd/sys/resourcevar.h b/bsd/sys/resourcevar.h index 9637ead44..6d9244314 100644 --- a/bsd/sys/resourcevar.h +++ b/bsd/sys/resourcevar.h @@ -119,16 +119,6 @@ struct plimit { }; #ifdef KERNEL -/* add user profiling from AST */ -#define ADDUPROF(p) \ - addupc_task(p, \ - (proc_is64bit((p)) ? (p)->p_stats->user_p_prof.pr_addr \ - : CAST_USER_ADDR_T((p)->p_stats->p_prof.pr_addr)), \ - (proc_is64bit((p)) ? (p)->p_stats->user_p_prof.pr_ticks \ - : (p)->p_stats->p_prof.pr_ticks)) - -void addupc_intr(struct proc *p, uint32_t pc, u_int ticks); -void addupc_task(struct proc *p, user_addr_t pc, u_int ticks); void calcru(struct proc *p, struct timeval *up, struct timeval *sp, struct timeval *ip); void ruadd(struct rusage *ru, struct rusage *ru2); diff --git a/bsd/sys/sdt_impl.h b/bsd/sys/sdt_impl.h index f0d840c38..837e38c74 100644 --- a/bsd/sys/sdt_impl.h +++ b/bsd/sys/sdt_impl.h @@ -27,13 +27,6 @@ #ifndef _SDT_IMPL_H #define _SDT_IMPL_H -/* - * This file has been created by splitting up the original DTrace sdt.h - * header. Keep the pragma notice here to allow version tracking. - */ - -/* #pragma ident "@(#)sdt.h 1.7 05/06/08 SMI" */ - #ifdef __cplusplus extern "C" { #endif @@ -51,8 +44,6 @@ typedef struct sdt_probedesc { } #endif -/* #pragma ident "@(#)sdt_impl.h 1.3 05/06/08 SMI" */ - #ifdef __cplusplus extern "C" { #endif diff --git a/bsd/sys/select.h b/bsd/sys/select.h index 1ffa4c0b0..2fb516833 100644 --- a/bsd/sys/select.h +++ b/bsd/sys/select.h @@ -127,6 +127,7 @@ struct selinfo { #define SI_RECORDED 0x0004 /* select has been recorded */ #define SI_INITED 0x0008 /* selinfo has been inited */ #define SI_CLEAR 0x0010 /* selinfo has been cleared */ +#define SI_KNPOSTING 0x0020 /* posting to knotes */ #else struct selinfo; diff --git a/bsd/sys/signalvar.h b/bsd/sys/signalvar.h index e96bfab71..a209acb0f 100644 --- a/bsd/sys/signalvar.h +++ b/bsd/sys/signalvar.h @@ -224,7 +224,7 @@ struct os_reason; * Machine-dependent functions: */ void sendsig(struct proc *, /*sig_t*/ user_addr_t action, int sig, - int returnmask, uint32_t code); + int returnmask, uint32_t code, sigset_t siginfo); void psignal(struct proc *p, int sig); void psignal_with_reason(struct proc *p, int sig, struct os_reason *signal_reason); @@ -250,6 +250,13 @@ int sig_try_locked(struct proc *p); #endif /* BSD_KERNEL_PRIVATE */ +#if defined(KERNEL_PRIVATE) +/* Forward-declare these for consumers of the SDK that don't know about BSD types */ +struct proc; +typedef struct proc * proc_t; +struct os_reason; +void psignal_sigkill_with_reason(proc_t p, struct os_reason *signal_reason); +#endif /* defined(KERNEL_PRIVATE) */ #ifdef XNU_KERNEL_PRIVATE diff --git a/bsd/sys/socket.h b/bsd/sys/socket.h index 9f68d473f..e851212be 100644 --- a/bsd/sys/socket.h +++ b/bsd/sys/socket.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2018 Apple Inc. All rights reserved. + * Copyright (c) 2000-2019 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -191,6 +191,7 @@ #define SO_RESTRICT_DENY_OUT 0x2 /* deny outbound (trapdoor) */ #define SO_RESTRICT_DENY_CELLULAR 0x4 /* deny use of cellular (trapdoor) */ #define SO_RESTRICT_DENY_EXPENSIVE 0x8 /* deny use of expensive if (trapdoor) */ +#define SO_RESTRICT_DENY_CONSTRAINED 0x10 /* deny use of expensive if (trapdoor) */ #endif /* PRIVATE */ #define SO_RANDOMPORT 0x1082 /* APPLE: request local port randomization */ #define SO_NP_EXTENSIONS 0x1083 /* To turn off some POSIX behavior */ @@ -334,7 +335,20 @@ #define SO_EXTENDED_BK_IDLE 0x1114 /* extended time to keep socket idle after app is suspended (int) */ #define SO_MARK_CELLFALLBACK 0x1115 /* Mark as initiated by cell fallback */ #endif /* PRIVATE */ +#define SO_NET_SERVICE_TYPE 0x1116 /* Network service type */ +#ifdef PRIVATE +#define SO_QOSMARKING_POLICY_OVERRIDE 0x1117 /* int */ +#define SO_INTCOPROC_ALLOW 0x1118 /* Try to use internal co-processor interfaces. */ +#endif /* PRIVATE */ + +#define SO_NETSVC_MARKING_LEVEL 0x1119 /* Get QoS marking in effect for socket */ + +#ifdef PRIVATE +#define SO_NECP_LISTENUUID 0x1120 /* NECP client UUID for listener */ +#define SO_MPKL_SEND_INFO 0x1122 /* (struct so_mpkl_send_info) */ +#define SO_STATISTICS_EVENT 0x1123 /* int64 argument, an event in statistics collection */ +#endif /* PRIVATE */ /* * Network Service Type for option SO_NET_SERVICE_TYPE * @@ -417,7 +431,6 @@ * inelastic flow, constant packet rate, somewhat fixed size. * E.g. VoIP. */ -#define SO_NET_SERVICE_TYPE 0x1116 /* Network service type */ #define NET_SERVICE_TYPE_BE 0 /* Best effort */ #define NET_SERVICE_TYPE_BK 1 /* Background system initiated */ @@ -430,9 +443,6 @@ #define NET_SERVICE_TYPE_RD 8 /* Responsive Data */ #if PRIVATE -#define SO_QOSMARKING_POLICY_OVERRIDE 0x1117 /* int */ -#define SO_INTCOPROC_ALLOW 0x1118 /* Try to use internal co-processor interfaces. */ - #define _NET_SERVICE_TYPE_COUNT 9 #define _NET_SERVICE_TYPE_UNSPEC ((int)-1) @@ -450,14 +460,14 @@ extern const int sotc_by_netservicetype[_NET_SERVICE_TYPE_COUNT]; #define SO_TC_NETSVC_SIG (SO_TC_NET_SERVICE_OFFSET + NET_SERVICE_TYPE_SIG) #endif /* PRIVATE */ -#define SO_NETSVC_MARKING_LEVEL 0x1119 /* Get QoS marking in effect for socket */ - +/* These are supported values for SO_NETSVC_MARKING_LEVEL */ #define NETSVC_MRKNG_UNKNOWN 0 /* The outgoing network interface is not known */ #define NETSVC_MRKNG_LVL_L2 1 /* Default marking at layer 2 (for example Wi-Fi WMM) */ #define NETSVC_MRKNG_LVL_L3L2_ALL 2 /* Layer 3 DSCP marking and layer 2 marking for all Network Service Types */ #define NETSVC_MRKNG_LVL_L3L2_BK 3 /* The system policy limits layer 3 DSCP marking and layer 2 marking * to background Network Service Types */ + typedef __uint32_t sae_associd_t; #define SAE_ASSOCID_ANY 0 #define SAE_ASSOCID_ALL ((sae_associd_t)(-1ULL)) @@ -686,6 +696,7 @@ struct sockaddr_storage { #define PF_BOND ((uint32_t)0x626f6e64) /* 'bond' */ #ifdef KERNEL_PRIVATE #define PF_BRIDGE ((uint32_t)0x62726467) /* 'brdg' */ +#define PF_802154 ((uint32_t)0x38313534) /* '8154' */ #endif /* KERNEL_PRIVATE */ /* @@ -769,6 +780,15 @@ struct sockaddr_storage { #define NET_RT_MAXID 11 #endif /* (_POSIX_C_SOURCE && !_DARWIN_C_SOURCE) */ +#ifdef PRIVATE +/* These are supported values for SO_STATISTICS_EVENT */ +#define SO_STATISTICS_EVENT_ENTER_CELLFALLBACK (1 << 0) +#define SO_STATISTICS_EVENT_EXIT_CELLFALLBACK (1 << 1) +#define SO_STATISTICS_EVENT_RESERVED_1 (1 << 2) +#define SO_STATISTICS_EVENT_RESERVED_2 (1 << 3) +#endif /* PRIVATE */ + + #ifdef KERNEL_PRIVATE #define CTL_NET_RT_NAMES { \ { 0, 0 }, \ @@ -982,9 +1002,9 @@ struct user32_sa_endpoints { #else #define MSG_WAITSTREAM 0x200 /* wait up to full request.. may return partial */ #endif -#define MSG_FLUSH 0x400 /* Start of 'hold' seq; dump so_temp */ -#define MSG_HOLD 0x800 /* Hold frag in so_temp */ -#define MSG_SEND 0x1000 /* Send the packet in so_temp */ +#define MSG_FLUSH 0x400 /* Start of 'hold' seq; dump so_temp, deprecated */ +#define MSG_HOLD 0x800 /* Hold frag in so_temp, deprecated */ +#define MSG_SEND 0x1000 /* Send the packet in so_temp, deprecated */ #define MSG_HAVEMORE 0x2000 /* Data ready to be read */ #define MSG_RCVMORE 0x4000 /* Data remains in current pkt */ #endif @@ -1090,7 +1110,9 @@ struct cmsgcred { #ifdef PRIVATE #define SCM_SEQNUM 0x05 /* TCP unordered recv seq no */ #define SCM_MSG_PRIORITY 0x06 /* TCP unordered snd priority */ -#define SCM_TIMESTAMP_CONTINUOUS 0x07 /* timestamp (uint64_t) */ +#define SCM_TIMESTAMP_CONTINUOUS 0x07 /* timestamp (uint64_t) */ +#define SCM_MPKL_SEND_INFO 0x08 /* send info for multi-layer packet logging (struct so_mpkl_send_info) */ +#define SCM_MPKL_RECV_INFO 0x09 /* receive info for multi-layer packet logging (struct so_mpkl_recv_info */ #endif /* PRIVATE */ #ifdef KERNEL_PRIVATE @@ -1290,10 +1312,7 @@ struct so_cordreq { */ struct netpolicy_event_data { __uint64_t eupid; /* effective unique PID */ - pid_t epid; /* effective PID */ -#if !defined(__LP64__) - __uint32_t pad; -#endif /* __LP64__ */ + __uint64_t epid; /* effective PID */ uuid_t euuid; /* effective UUID */ }; @@ -1305,18 +1324,6 @@ struct kev_netpolicy_ifdenied { __uint32_t ev_if_functional_type; }; -/* - * Common structure for KEV_SOCKET_SUBCLASS - */ -struct kev_socket_event_data { - struct sockaddr_storage kev_sockname; - struct sockaddr_storage kev_peername; -}; - -struct kev_socket_closed { - struct kev_socket_event_data ev_data; -}; - /* * Network Service Type to DiffServ Code Point mapping */ @@ -1325,6 +1332,19 @@ struct netsvctype_dscp_map { u_int8_t dscp; /* 6 bits diffserv code point */ }; +/* + * Multi-layer packet logging require SO_MPK_LOG to be set + */ +struct so_mpkl_send_info { + uuid_t mpkl_uuid; + __uint8_t mpkl_proto; /* see net/multi_layer_pkt_log.h */ +}; + +struct so_mpkl_recv_info { + __uint32_t mpkl_seq; + __uint8_t mpkl_proto; /* see net/multi_layer_pkt_log.h */ +}; + #ifndef KERNEL __BEGIN_DECLS diff --git a/bsd/sys/socketvar.h b/bsd/sys/socketvar.h index 250f8724f..f7e1e82ff 100644 --- a/bsd/sys/socketvar.h +++ b/bsd/sys/socketvar.h @@ -243,9 +243,6 @@ struct socket { pid_t last_pid; /* pid of most recent accessor */ u_int64_t last_upid; /* upid of most recent accessor */ - struct mbuf *so_temp; /* Holding area for outbound frags */ - /* Plug-in support - make the socket interface overridable */ - struct mbuf *so_tail; struct socket_filter_entry *so_filt; /* NKE hook */ u_int32_t so_flags; /* Flags */ #define SOF_NOSIGPIPE 0x00000001 @@ -284,7 +281,7 @@ struct socket { #define SOF_CONTENT_FILTER 0x20000000 /* Content filter enabled */ uint32_t so_upcallusecount; /* number of upcalls in progress */ - int so_usecount; /* refcounting of socket use */; + int so_usecount; /* refcounting of socket use */ int so_retaincnt; u_int32_t so_filteruse; /* usecount for the socket filters */ u_int16_t so_traffic_class; @@ -355,8 +352,14 @@ struct socket { #define SOF1_IN_KERNEL_SOCKET 0x00100000 /* Socket created in kernel via KPI */ #define SOF1_CONNECT_COUNTED 0x00200000 /* connect() call was counted */ #define SOF1_DNS_COUNTED 0x00400000 /* socket counted to send DNS queries */ +#define SOF1_MPKL_SEND_INFO 0x00800000 /* SO_MPKL_SEND_INFO option is set */ +#define SOF1_INBOUND 0x01000000 /* Created via a passive listener */ u_int64_t so_extended_bk_start; + + u_int8_t so_log_seqn; /* Multi-layer Packet Logging rolling sequence number */ + uuid_t so_mpkl_send_uuid; + uint8_t so_mpkl_send_proto; }; /* Control message accessor in mbufs */ @@ -617,12 +620,14 @@ struct kextcb { #define SO_FILT_HINT_MUSTRST 0x00020000 /* must send RST and close */ #define SO_FILT_HINT_MPCANTRCVMORE 0x00040000 /* MPTCP DFIN Received */ #define SO_FILT_HINT_NOTIFY_ACK 0x00080000 /* Notify Acknowledgement */ +#define SO_FILT_HINT_MP_SUB_ERROR 0x00100000 /* Error happend on subflow */ #define SO_FILT_HINT_BITS \ "\020\1LOCKED\2CONNRESET\3CANTRCVMORE\4CANTSENDMORE\5TIMEOUT" \ "\6NOSRCADDR\7IFDENIED\10SUSPEND\11RESUME\12KEEPALIVE\13AWTIMO" \ "\14ARTIMO\15CONNECTED\16DISCONNECTED\17CONNINFO_UPDATED" \ - "\20MPFAILOVER\21MPSTATUS\22MUSTRST\23MPCANTRCVMORE\24NOTIFYACK" + "\20MPFAILOVER\21MPSTATUS\22MUSTRST\23MPCANTRCVMORE\24NOTIFYACK"\ + "\25MPSUBERROR" /* Mask for hints that have corresponding kqueue events */ #define SO_FILT_HINT_EV \ @@ -703,6 +708,8 @@ struct so_procinfo { uuid_t spi_uuid; uuid_t spi_euuid; int spi_delegated; + char spi_proc_name[MAXCOMLEN + 1]; + char spi_e_proc_name[MAXCOMLEN + 1]; }; extern u_int32_t sb_max; @@ -727,11 +734,6 @@ extern u_int32_t net_io_policy_uuid; extern struct soextbkidlestat soextbkidlestat; -struct net_qos_dscp_map { - u_int8_t sotc_to_dscp[SO_TC_MAX]; - u_int8_t netsvctype_to_dscp[_NET_SERVICE_TYPE_COUNT]; -}; - #endif /* BSD_KERNEL_PRIVATE */ struct mbuf; @@ -777,9 +779,12 @@ extern struct socket *sonewconn(struct socket *head, int connstatus, const struct sockaddr *from); extern int sopoll(struct socket *so, int events, struct ucred *cred, void *wql); extern int sooptcopyin(struct sockopt *sopt, void *data, size_t len, - size_t minlen); -extern int sooptcopyout(struct sockopt *sopt, void *data, size_t len); -extern int soopt_cred_check(struct socket *so, int priv, boolean_t allow_root); + size_t minlen) +__attribute__ ((warn_unused_result)); +extern int sooptcopyout(struct sockopt *sopt, void *data, size_t len) +__attribute__ ((warn_unused_result)); +extern int soopt_cred_check(struct socket *so, int priv, boolean_t allow_root, + boolean_t ignore_delegate); extern int soreceive(struct socket *so, struct sockaddr **paddr, struct uio *uio, struct mbuf **mp0, struct mbuf **controlp, int *flagsp); extern int soreserve(struct socket *so, u_int32_t sndcc, u_int32_t rcvcc); @@ -838,7 +843,6 @@ extern struct mbuf **sbcreatecontrol_mbuf(caddr_t p, int size, int type, int level, struct mbuf **m); extern void sbdrop(struct sockbuf *sb, int len); extern void sbdroprecord(struct sockbuf *sb); -extern int sbinsertoob(struct sockbuf *sb, struct mbuf *m0); extern void sbrelease(struct sockbuf *sb); extern int sbreserve(struct sockbuf *sb, u_int32_t cc); extern void sbtoxsockbuf(struct sockbuf *sb, struct xsockbuf *xsb); @@ -878,7 +882,7 @@ extern int sodisconnectx(struct socket *so, sae_associd_t, sae_connid_t); extern int sodisconnectxlocked(struct socket *so, sae_associd_t, sae_connid_t); extern void soevupcall(struct socket *, u_int32_t); /* flags for socreate_internal */ -#define SOCF_ASYNC 0x1 /* non-blocking socket */ +#define SOCF_MPTCP 0x1 /* MPTCP-subflow */ extern int socreate_internal(int dom, struct socket **aso, int type, int proto, struct proc *, uint32_t, struct proc *); extern int socreate(int dom, struct socket **aso, int type, int proto); @@ -906,6 +910,7 @@ extern int soissrcbesteffort(struct socket *so); extern void soclearfastopen(struct socket *so); extern int solisten(struct socket *so, int backlog); extern struct socket *sodropablereq(struct socket *head); +extern lck_mtx_t *socket_getlock(struct socket *so, int flags); extern void socket_lock(struct socket *so, int refcount); extern void socket_lock_assert_owned(struct socket *so); extern int socket_try_lock(struct socket *so); @@ -915,7 +920,7 @@ extern const char *solockhistory_nr(struct socket *); extern void soevent(struct socket *so, long hint); extern void sorflush(struct socket *so); extern void sowflush(struct socket *so); -extern void sowakeup(struct socket *so, struct sockbuf *sb); +extern void sowakeup(struct socket *so, struct sockbuf *sb, struct socket *so2); extern int soioctl(struct socket *so, u_long cmd, caddr_t data, struct proc *p); extern int sogetoptlock(struct socket *so, struct sockopt *sopt, int); extern int sosetoptlock(struct socket *so, struct sockopt *sopt, int); @@ -936,8 +941,7 @@ extern int sosendcheck(struct socket *, struct sockaddr *, user_ssize_t, extern int soo_ioctl(struct fileproc *, u_long, caddr_t, vfs_context_t); extern int soo_stat(struct socket *, void *, int); extern int soo_select(struct fileproc *, int, void *, vfs_context_t); -extern int soo_kqfilter(struct fileproc *, struct knote *, - struct kevent_internal_s *kev, vfs_context_t); +extern int soo_kqfilter(struct fileproc *, struct knote *, struct kevent_qos_s *); /* Service class flags used for setting service class on a packet */ #define PKT_SCF_IPV6 0x00000001 /* IPv6 packet */ @@ -971,8 +975,8 @@ extern int so_set_opportunistic(struct socket *, int); extern int so_get_opportunistic(struct socket *); extern int so_set_recv_anyif(struct socket *, int); extern int so_get_recv_anyif(struct socket *); -extern int so_set_effective_pid(struct socket *, int, struct proc *); -extern int so_set_effective_uuid(struct socket *, uuid_t, struct proc *); +extern int so_set_effective_pid(struct socket *so, int epid, struct proc *p, boolean_t check_cred); +extern int so_set_effective_uuid(struct socket *so, uuid_t euuid, struct proc *p, boolean_t check_cred); extern int so_set_restrictions(struct socket *, uint32_t); extern uint32_t so_get_restrictions(struct socket *); extern void socket_tclass_init(void); @@ -996,14 +1000,9 @@ extern void mptcp_fallback_sbdrop(struct socket *so, struct mbuf *m, int len); extern void mptcp_preproc_sbdrop(struct socket *, struct mbuf *, unsigned int); extern void mptcp_postproc_sbdrop(struct mbuf *, u_int64_t, u_int32_t, u_int32_t); -extern int mptcp_adj_rmap(struct socket *so, struct mbuf *m, int off, - uint64_t dsn, uint32_t rseq, uint16_t dlen); extern void netpolicy_post_msg(uint32_t, struct netpolicy_event_data *, uint32_t); -extern void socket_post_kev_msg(uint32_t, struct kev_socket_event_data *, - uint32_t); -extern void socket_post_kev_msg_closed(struct socket *); /* * Socket operation routines. * These routines are called by the routines in diff --git a/bsd/sys/sockio.h b/bsd/sys/sockio.h index a973c4896..e0a96050c 100644 --- a/bsd/sys/sockio.h +++ b/bsd/sys/sockio.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2018 Apple Inc. All rights reserved. + * Copyright (c) 2000-2019 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -312,6 +312,8 @@ #define SIOCSIFDISABLEOUTPUT _IOWR('i', 187, struct ifreq) +#define SIOCSIFSUBFAMILY _IOWR('i', 188, struct ifreq) + #define SIOCGIFAGENTLIST _IOWR('i', 190, struct netagentlist_req) /* Get netagent dump */ #ifdef BSD_KERNEL_PRIVATE @@ -334,13 +336,28 @@ #endif /* BSD_KERNEL_PRIVATE */ #endif /* PRIVATE */ +#define SIOCSIF6LOWPAN _IOW('i', 196, struct ifreq) /* set 6LOWPAN config */ +#define SIOCGIF6LOWPAN _IOWR('i', 197, struct ifreq) /* get 6LOWPAN config */ + #ifdef PRIVATE +#define SIOCGIFTCPKAOMAX _IOWR('i', 198, struct ifreq) /* Max TCP keep alive offload slots */ #define SIOCGIFLOWPOWER _IOWR('i', 199, struct ifreq) /* Low Power Mode */ #define SIOCSIFLOWPOWER _IOWR('i', 200, struct ifreq) /* Low Power Mode */ #if INET6 #define SIOCGIFCLAT46ADDR _IOWR('i', 201, struct if_clat46req) #endif /* INET6 */ + +#define SIOCGIFMPKLOG _IOWR('i', 202, struct ifreq) /* Multi-layer Packet Logging */ +#define SIOCSIFMPKLOG _IOWR('i', 203, struct ifreq) /* Multi-layer Packet Logging */ + +#define SIOCGIFCONSTRAINED _IOWR('i', 204, struct ifreq) /* get interface constrained flag */ +#define SIOCSIFCONSTRAINED _IOWR('i', 205, struct ifreq) /* mark interface constrained */ + +#define SIOCGIFXFLAGS _IOWR('i', 206, struct ifreq) /* get extended ifnet flags */ + +#define SIOCGIFNOACKPRIO _IOWR('i', 207, struct ifreq) /* get interface no ack prioritization flag */ +#define SIOCSIFNOACKPRIO _IOWR('i', 208, struct ifreq) /* mark interface no ack prioritization flagd */ #endif /* PRIVATE */ #endif /* !_SYS_SOCKIO_H_ */ diff --git a/bsd/sys/spawn.h b/bsd/sys/spawn.h index 790d9c47a..4bafc11c2 100644 --- a/bsd/sys/spawn.h +++ b/bsd/sys/spawn.h @@ -61,7 +61,9 @@ #ifdef PRIVATE #define _POSIX_SPAWN_DISABLE_ASLR 0x0100 #define _POSIX_SPAWN_NANO_ALLOCATOR 0x0200 -/* unused 0x0400 */ +#endif /* PRIVATE */ +#define POSIX_SPAWN_SETSID 0x0400 +#ifdef PRIVATE /* unused 0x0800 */ /* unused 0x1000 */ #define _POSIX_SPAWN_ALLOW_DATA_EXEC 0x2000 diff --git a/bsd/sys/spawn_internal.h b/bsd/sys/spawn_internal.h index 64877ea3d..d963cfdb9 100644 --- a/bsd/sys/spawn_internal.h +++ b/bsd/sys/spawn_internal.h @@ -76,6 +76,7 @@ typedef enum { PSPA_EXCEPTION = 1, PSPA_AU_SESSION = 2, PSPA_IMP_WATCHPORTS = 3, + PSPA_REGISTERED_PORTS = 4, } pspa_t; /* @@ -150,6 +151,24 @@ struct _posix_spawn_coalition_info { } psci_info[COALITION_NUM_TYPES]; }; +/* + * UID/GID attributes + */ +struct _posix_spawn_posix_cred_info { + uint32_t pspci_flags; /* spawn persona flags */ + uid_t pspci_uid; /* alternate posix/unix UID */ + gid_t pspci_gid; /* alternate posix/unix GID */ + uint32_t pspci_ngroups; /* alternate advisory groups */ + gid_t pspci_groups[NGROUPS]; + uid_t pspci_gmuid; /* group membership UID */ + char pspci_login[MAXLOGNAME + 1]; +}; + +#define POSIX_SPAWN_POSIX_CRED_UID 0x00010000 +#define POSIX_SPAWN_POSIX_CRED_GID 0x00020000 +#define POSIX_SPAWN_POSIX_CRED_GROUPS 0x00040000 +#define POSIX_SPAWN_POSIX_CRED_LOGIN 0x00080000 + /* * Persona attributes */ @@ -163,18 +182,18 @@ struct _posix_spawn_persona_info { uid_t pspi_gmuid; /* group membership UID */ }; -#define POSIX_SPAWN_PERSONA_FLAGS_NONE 0x0 -#define POSIX_SPAWN_PERSONA_FLAGS_OVERRIDE 0x1 -#define POSIX_SPAWN_PERSONA_FLAGS_VERIFY 0x2 +#define POSIX_SPAWN_PERSONA_FLAGS_NONE 0x0 +#define POSIX_SPAWN_PERSONA_FLAGS_OVERRIDE 0x1 +#define POSIX_SPAWN_PERSONA_FLAGS_VERIFY 0x2 #define POSIX_SPAWN_PERSONA_ALL_FLAGS \ (POSIX_SPAWN_PERSONA_FLAGS_OVERRIDE \ | POSIX_SPAWN_PERSONA_FLAGS_VERIFY \ ) -#define POSIX_SPAWN_PERSONA_UID 0x00010000 -#define POSIX_SPAWN_PERSONA_GID 0x00020000 -#define POSIX_SPAWN_PERSONA_GROUPS 0x00040000 +#define POSIX_SPAWN_PERSONA_UID POSIX_SPAWN_POSIX_CRED_UID +#define POSIX_SPAWN_PERSONA_GID POSIX_SPAWN_POSIX_CRED_GID +#define POSIX_SPAWN_PERSONA_GROUPS POSIX_SPAWN_POSIX_CRED_GROUPS /* @@ -221,6 +240,7 @@ typedef struct _posix_spawnattr { _posix_spawn_mac_policy_extensions_t psa_mac_extensions; /* MAC policy-specific extensions. */ struct _posix_spawn_coalition_info *psa_coalition_info; /* coalition info */ struct _posix_spawn_persona_info *psa_persona_info; /* spawn new process into given persona */ + struct _posix_spawn_posix_cred_info *psa_posix_cred_info; /* posix creds: uid/gid/groups */ } *_posix_spawnattr_t; /* @@ -239,6 +259,20 @@ typedef struct _posix_spawnattr { #define POSIX_SPAWN_JETSAM_MEMLIMIT_ACTIVE_FATAL 0x04 /* if set, limit is fatal when the process is active */ #define POSIX_SPAWN_JETSAM_MEMLIMIT_INACTIVE_FATAL 0x08 /* if set, limit is fatal when the process is inactive */ + +/* + * Flags set based on posix_spawnattr_set_jetsam_ttr_np(). + * Indicate relaunch behavior of process when jetsammed + */ +/* Mask and bucket counts for relaunch behavior */ +#define POSIX_SPAWN_JETSAM_RELAUNCH_BEHAVIOR_BUCKETS (0x3) +#define POSIX_SPAWN_JETSAM_RELAUNCH_BEHAVIOR_MASK (0x30) + +/* Actual buckets based on behavior data */ +#define POSIX_SPAWN_JETSAM_RELAUNCH_BEHAVIOR_HIGH (0x30) +#define POSIX_SPAWN_JETSAM_RELAUNCH_BEHAVIOR_MED (0x20) +#define POSIX_SPAWN_JETSAM_RELAUNCH_BEHAVIOR_LOW (0x10) + /* * Deprecated posix_spawn psa_flags values * @@ -271,7 +305,6 @@ typedef struct _posix_spawnattr { * posix_spawn psa_apptype process type settings. * when POSIX_SPAWN_PROC_TYPE is set, old psa_apptype bits are ignored */ - #define POSIX_SPAWN_PROCESS_TYPE_NORMAL 0x00000000 #define POSIX_SPAWN_PROCESS_TYPE_DEFAULT POSIX_SPAWN_PROCESS_TYPE_NORMAL @@ -285,12 +318,15 @@ typedef struct _posix_spawnattr { #define POSIX_SPAWN_PROC_TYPE_DAEMON_BACKGROUND 0x00000500 #define POSIX_SPAWN_PROC_TYPE_DAEMON_ADAPTIVE 0x00000600 +#define POSIX_SPAWN_PROC_TYPE_DRIVER 0x00000700 + #define POSIX_SPAWN_PROC_CLAMP_NONE 0x00000000 #define POSIX_SPAWN_PROC_CLAMP_UTILITY 0x00000001 #define POSIX_SPAWN_PROC_CLAMP_BACKGROUND 0x00000002 #define POSIX_SPAWN_PROC_CLAMP_MAINTENANCE 0x00000003 #define POSIX_SPAWN_PROC_CLAMP_LAST 0x00000004 +#define POSIX_SPAWN_ENTITLEMENT_DRIVER "com.apple.private.spawn-driver" /* Setting to indicate no change to darwin role */ #define POSIX_SPAWN_DARWIN_ROLE_NONE 0x00000000 /* Other possible values are specified by PRIO_DARWIN_ROLE in sys/resource.h */ @@ -302,7 +338,10 @@ typedef enum { PSFA_OPEN = 0, PSFA_CLOSE = 1, PSFA_DUP2 = 2, - PSFA_INHERIT = 3 + PSFA_INHERIT = 3, + PSFA_FILEPORT_DUP2 = 4, + PSFA_CHDIR = 5, + PSFA_FCHDIR = 6 } psfa_t; @@ -317,17 +356,26 @@ typedef enum { * a variable sized vector list to save space (i.e. a separate * string area, allocation of least amount of path buffer per * open action, etc.). - * - * XXX: Currently overloading psfao_oflag for PSFA_DUP2 */ typedef struct _psfa_action { - psfa_t psfaa_type; /* file action type */ - int psfaa_filedes; /* fd to operate on */ - struct _psfaa_open { - int psfao_oflag; /* open flags to use */ - mode_t psfao_mode; /* mode for open */ - char psfao_path[PATH_MAX]; /* path to open */ - } psfaa_openargs; + psfa_t psfaa_type; /* file action type */ + union { + int psfaa_filedes; /* fd to operate on */ + mach_port_name_t psfaa_fileport; /* fileport to operate on */ + }; + union { + struct _psfaa_open { + int psfao_oflag; /* open flags to use */ + mode_t psfao_mode; /* mode for open */ + char psfao_path[PATH_MAX]; /* path to open */ + } psfaa_openargs; + struct { + int psfad_newfiledes; /* new file descriptor to use */ + } psfaa_dup2args; + struct { + char psfac_path[PATH_MAX]; /* path to chdir */ + } psfaa_chdirargs; + }; } _psfa_action_t; @@ -393,6 +441,9 @@ struct _posix_spawn_args_desc { __darwin_size_t persona_info_size; struct _posix_spawn_persona_info *persona_info; + + __darwin_size_t posix_cred_info_size; + struct _posix_spawn_posix_cred_info *posix_cred_info; }; #ifdef KERNEL @@ -404,33 +455,37 @@ struct _posix_spawn_args_desc { #endif struct user32__posix_spawn_args_desc { - uint32_t attr_size; /* size of attributes block */ - uint32_t attrp; /* pointer to block */ + uint32_t attr_size; /* size of attributes block */ + uint32_t attrp; /* pointer to block */ uint32_t file_actions_size; /* size of file actions block */ - uint32_t file_actions; /* pointer to block */ + uint32_t file_actions; /* pointer to block */ uint32_t port_actions_size; /* size of port actions block */ - uint32_t port_actions; /* pointer to block */ + uint32_t port_actions; /* pointer to block */ uint32_t mac_extensions_size; uint32_t mac_extensions; uint32_t coal_info_size; uint32_t coal_info; uint32_t persona_info_size; uint32_t persona_info; + uint32_t posix_cred_info_size; + uint32_t posix_cred_info; }; struct user__posix_spawn_args_desc { - user_size_t attr_size; /* size of attributes block */ - user_addr_t attrp; /* pointer to block */ + user_size_t attr_size; /* size of attributes block */ + user_addr_t attrp; /* pointer to block */ user_size_t file_actions_size; /* size of file actions block */ - user_addr_t file_actions; /* pointer to block */ + user_addr_t file_actions; /* pointer to block */ user_size_t port_actions_size; /* size of port actions block */ - user_addr_t port_actions; /* pointer to block */ + user_addr_t port_actions; /* pointer to block */ user_size_t mac_extensions_size; /* size of MAC-specific attrs. */ user_addr_t mac_extensions; /* pointer to block */ user_size_t coal_info_size; user_addr_t coal_info; user_size_t persona_info_size; user_addr_t persona_info; + user_size_t posix_cred_info_size; + user_addr_t posix_cred_info; }; diff --git a/bsd/sys/stat.h b/bsd/sys/stat.h index b5f73326a..18c9ad950 100644 --- a/bsd/sys/stat.h +++ b/bsd/sys/stat.h @@ -493,8 +493,9 @@ extern void munge_user32_stat64(struct stat64 *sbp, struct user32_stat64 *usbp); /* * Super-user changeable flags. */ -#define SF_SUPPORTED 0x001f0000 /* mask of superuser supported flags */ -#define SF_SETTABLE 0xffff0000 /* mask of superuser changeable flags */ +#define SF_SUPPORTED 0x009f0000 /* mask of superuser supported flags */ +#define SF_SETTABLE 0x3fff0000 /* mask of superuser changeable flags */ +#define SF_SYNTHETIC 0xc0000000 /* mask of system read-only synthetic flags */ #define SF_ARCHIVED 0x00010000 /* file is archived */ #define SF_IMMUTABLE 0x00020000 /* file may not be changed */ #define SF_APPEND 0x00040000 /* writes to file may only append */ @@ -508,6 +509,27 @@ extern void munge_user32_stat64(struct stat64 *sbp, struct user32_stat64 *usbp); /* #define SF_SNAPSHOT 0x00200000 */ /* snapshot inode */ /* NOTE: There is no SF_HIDDEN bit. */ +#define SF_FIRMLINK 0x00800000 /* file is a firmlink */ + +/* + * Synthetic flags. + * + * These are read-only. We keep them out of SF_SUPPORTED so that + * attempts to set them will fail. + */ +#define SF_DATALESS 0x40000000 /* file is dataless object */ + +#ifdef PRIVATE +/* + * Protected flags. + * + * These flags are read-write, but can only be changed using the safe + * mechanism (FSIOC_CAS_BSDFLAGS). The standard chflags(2) mechanism + * will simply preserve these bits as they are in the inode. + */ +#define UF_SF_PROTECTED (UF_COMPRESSED) +#endif + #ifdef KERNEL /* * Shorthand abbreviations of above. diff --git a/bsd/sys/sysctl.h b/bsd/sys/sysctl.h index aea56f700..f37e9a07d 100644 --- a/bsd/sys/sysctl.h +++ b/bsd/sys/sysctl.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2006 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2019 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -282,6 +282,8 @@ int sysctl_io_opaque(struct sysctl_req *req, void *pValue, size_t valueSize, int void sysctl_register_oid(struct sysctl_oid *oidp); void sysctl_unregister_oid(struct sysctl_oid *oidp); +void sysctl_load_devicetree_entries(void); + /* Deprecated */ void sysctl_register_fixed(void) __deprecated; @@ -327,7 +329,7 @@ __END_DECLS /* This constructs a "raw" MIB oid. */ #define SYSCTL_STRUCT_INIT(parent, nbr, name, kind, a1, a2, handler, fmt, descr) \ { \ - &sysctl_##parent##_children, { 0 }, \ + &sysctl_##parent##_children, { NULL }, \ nbr, (int)(kind|CTLFLAG_OID2), a1, (int)(a2), #name, handler, fmt, descr, SYSCTL_OID_VERSION, 0 \ } @@ -340,7 +342,7 @@ __END_DECLS struct sysctl_oid_list sysctl_##parent##_##name##_children; \ SYSCTL_OID(parent, nbr, name, CTLTYPE_NODE|access, \ (void*)&sysctl_##parent##_##name##_children, 0, handler, \ - "N", descr); + "N", descr) /* Oid for a string. len can be 0 to indicate '\0' termination. */ #define SYSCTL_STRING(parent, nbr, name, access, arg, len, descr) \ @@ -359,31 +361,31 @@ __END_DECLS #define SYSCTL_INT(parent, nbr, name, access, ptr, val, descr) \ SYSCTL_OID(parent, nbr, name, CTLTYPE_INT|access, \ ptr, val, sysctl_handle_int, "I", descr); \ - typedef char _sysctl_##parent##_##name##_size_check[(__builtin_constant_p(ptr) || sizeof(*(ptr)) == sizeof(int)) ? 0 : -1]; + typedef char _sysctl_##parent##_##name##_size_check[(__builtin_constant_p(ptr) || sizeof(*(ptr)) == sizeof(int)) ? 0 : -1] /* Oid for an unsigned int. If ptr is NULL, val is returned. */ #define SYSCTL_UINT(parent, nbr, name, access, ptr, val, descr) \ SYSCTL_OID(parent, nbr, name, CTLTYPE_INT|access, \ ptr, val, sysctl_handle_int, "IU", descr); \ - typedef char _sysctl_##parent##_##name##_size_check[(__builtin_constant_p(ptr) || sizeof(*(ptr)) == sizeof(unsigned int)) ? 0 : -1]; + typedef char _sysctl_##parent##_##name##_size_check[(__builtin_constant_p(ptr) || sizeof(*(ptr)) == sizeof(unsigned int)) ? 0 : -1] /* Oid for a long. The pointer must be non NULL. */ #define SYSCTL_LONG(parent, nbr, name, access, ptr, descr) \ SYSCTL_OID(parent, nbr, name, CTLTYPE_INT|access, \ ptr, 0, sysctl_handle_long, "L", descr); \ - typedef char _sysctl_##parent##_##name##_size_check[(__builtin_constant_p(ptr) || sizeof(*(ptr)) == sizeof(long)) ? 0 : -1]; + typedef char _sysctl_##parent##_##name##_size_check[(__builtin_constant_p(ptr) || sizeof(*(ptr)) == sizeof(long)) ? 0 : -1] /* Oid for a unsigned long. The pointer must be non NULL. */ #define SYSCTL_ULONG(parent, nbr, name, access, ptr, descr) \ SYSCTL_OID(parent, nbr, name, CTLTYPE_INT|access, \ ptr, 0, sysctl_handle_long, "LU", descr); \ - typedef char _sysctl_##parent##_##name##_size_check[(__builtin_constant_p(ptr) || sizeof(*(ptr)) == sizeof(unsigned long)) ? 0 : -1]; + typedef char _sysctl_##parent##_##name##_size_check[(__builtin_constant_p(ptr) || sizeof(*(ptr)) == sizeof(unsigned long)) ? 0 : -1] /* Oid for a quad. The pointer must be non NULL. */ #define SYSCTL_QUAD(parent, nbr, name, access, ptr, descr) \ SYSCTL_OID(parent, nbr, name, CTLTYPE_QUAD|access, \ ptr, 0, sysctl_handle_quad, "Q", descr); \ - typedef char _sysctl_##parent##_##name##_size_check[(__builtin_constant_p(ptr) || sizeof(*(ptr)) == sizeof(long long)) ? 0 : -1]; + typedef char _sysctl_##parent##_##name##_size_check[(__builtin_constant_p(ptr) || sizeof(*(ptr)) == sizeof(long long)) ? 0 : -1] /* Oid for an opaque object. Specified by a pointer and a length. */ #define SYSCTL_OPAQUE(parent, nbr, name, access, ptr, len, fmt, descr) \ @@ -522,7 +524,7 @@ SYSCTL_DECL(_hw_features); #define KERN_LOGSIGEXIT 36 /* int: do we log sigexit procs? */ #define KERN_SYMFILE 37 /* string: kernel symbol filename */ #define KERN_PROCARGS 38 -/* 39 was KERN_PCSAMPLES... now deprecated */ +/* 39 was KERN_PCSAMPLES... now obsolete */ #define KERN_NETBOOT 40 /* int: are we netbooted? 1=yes,0=no */ /* 41 was KERN_PANICINFO : panic UI information (deprecated) */ #define KERN_SYSV 42 /* node: System V IPC information */ @@ -717,6 +719,12 @@ SYSCTL_DECL(_hw_features); #define KERN_PROC_RUID 6 /* by real uid */ #define KERN_PROC_LCID 7 /* by login context id */ +/* + * KERN_VFSNSPACE subtypes + */ +#define KERN_VFSNSPACE_HANDLE_PROC 1 +#define KERN_VFSNSPACE_UNHANDLE_PROC 2 + #if defined(XNU_KERNEL_PRIVATE) || !defined(KERNEL) /* * KERN_PROC subtype ops return arrays of augmented proc structures: diff --git a/bsd/sys/sysent.h b/bsd/sys/sysent.h index a309bb90b..b68b3cded 100644 --- a/bsd/sys/sysent.h +++ b/bsd/sys/sysent.h @@ -58,7 +58,7 @@ struct sysent { /* system call table */ extern struct sysent sysent[]; #endif /* __INIT_SYSENT_C__ */ -extern unsigned int nsysent; +extern const unsigned int nsysent; /* * Valid values for sy_cancel diff --git a/bsd/sys/systm.h b/bsd/sys/systm.h index a06576c40..dfcd2e731 100644 --- a/bsd/sys/systm.h +++ b/bsd/sys/systm.h @@ -228,6 +228,8 @@ void throttle_info_mount_rel(mount_t mp); void throttle_info_release(void *throttle_info); void throttle_info_update(void *throttle_info, int flags); uint32_t throttle_lowpri_io(int sleep_amount); +/* returns TRUE if the throttle_lowpri_io called with the same sleep_amount would've slept */ +int throttle_lowpri_io_will_be_throttled(int sleep_amount); void throttle_set_thread_io_policy(int policy); int throttle_get_thread_effective_io_policy(void); diff --git a/bsd/sys/tty.h b/bsd/sys/tty.h index 45708bf47..5c3609ccb 100644 --- a/bsd/sys/tty.h +++ b/bsd/sys/tty.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2002 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2018 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -146,7 +146,7 @@ struct tty { int t_refcnt; /* reference count */ }; -#define TTY_NULL (struct tty *)0 +#define TTY_NULL (struct tty *)NULL #define t_cc t_termios.c_cc #define t_cflag t_termios.c_cflag diff --git a/bsd/sys/ttycom.h b/bsd/sys/ttycom.h index c8eed3145..3fdc94b85 100644 --- a/bsd/sys/ttycom.h +++ b/bsd/sys/ttycom.h @@ -180,6 +180,9 @@ struct winsize { #define TIOCPTYGRANT _IO('t', 84) /* grantpt(3) */ #define TIOCPTYGNAME _IOC(IOC_OUT, 't', 83, 128) /* ptsname(3) */ #define TIOCPTYUNLK _IO('t', 82) /* unlockpt(3) */ +#ifdef KERNEL +#define TIOCREVOKE _IO('t', 81) +#endif #define TTYDISC 0 /* termios tty line discipline */ #define TABLDISC 3 /* tablet discipline */ diff --git a/bsd/sys/ubc.h b/bsd/sys/ubc.h index bc91fde31..e0a5cca0f 100644 --- a/bsd/sys/ubc.h +++ b/bsd/sys/ubc.h @@ -157,7 +157,7 @@ int ubc_create_upl_external(vnode_t, off_t, int, upl_t *, upl_page_info_t ** int ubc_create_upl_kernel(vnode_t, off_t, int, upl_t *, upl_page_info_t **, int, vm_tag_t); #endif /* XNU_KERNEL_PRIVATE */ -__attribute__((pure)) boolean_t ubc_is_mapped(const struct vnode *, boolean_t *writable); +boolean_t ubc_is_mapped(const struct vnode *, boolean_t *writable); __attribute__((pure)) boolean_t ubc_is_mapped_writable(const struct vnode *); uint32_t cluster_max_io_size(mount_t, int); diff --git a/bsd/sys/ucred.h b/bsd/sys/ucred.h index febbf1aea..b013af853 100644 --- a/bsd/sys/ucred.h +++ b/bsd/sys/ucred.h @@ -78,6 +78,7 @@ struct label; #ifdef __APPLE_API_UNSTABLE +#ifdef KERNEL #include /* @@ -119,6 +120,11 @@ struct ucred { */ struct au_session cr_audit; /* user auditing data */ }; +#else /* KERNEL */ +struct ucred; +struct posix_cred; +#endif /* KERNEL */ + #ifndef _KAUTH_CRED_T #define _KAUTH_CRED_T typedef struct ucred *kauth_cred_t; diff --git a/bsd/sys/uio_internal.h b/bsd/sys/uio_internal.h index 91f00abb2..86b3eb221 100644 --- a/bsd/sys/uio_internal.h +++ b/bsd/sys/uio_internal.h @@ -76,21 +76,21 @@ * WARNING - make sure to check when adding flags! Be sure new flags * don't overlap the definitions in uio.h */ -// UIO_USERSPACE 0 defined in uio.h -#define UIO_USERISPACE 1 -// UIO_SYSSPACE 2 defined in uio.h -#define UIO_PHYS_USERSPACE 3 -#define UIO_PHYS_SYSSPACE 4 -// UIO_USERSPACE32 5 defined in uio.h -#define UIO_USERISPACE32 6 -#define UIO_PHYS_USERSPACE32 7 -// UIO_USERSPACE64 8 defined in uio.h -#define UIO_USERISPACE64 9 -#define UIO_PHYS_USERSPACE64 10 -// UIO_SYSSPACE32 11 defined in uio.h -// UIO_PHYS_SYSSPACE32 12 reserved, never used. Use UIO_PHYS_SYSSPACE -// UIO_SYSSPACE64 13 reserved, never used. Use UIO_SYSSPACE -// UIO_PHYS_SYSSPACE64 14 reserved, never used. Use UIO_PHYS_SYSSPACE +// UIO_USERSPACE 0 defined in uio.h +#define UIO_USERISPACE ((enum uio_seg)1) +// UIO_SYSSPACE 2 defined in uio.h +#define UIO_PHYS_USERSPACE ((enum uio_seg)3) +#define UIO_PHYS_SYSSPACE ((enum uio_seg)4) +// UIO_USERSPACE32 5 defined in uio.h +#define UIO_USERISPACE32 ((enum uio_seg)6) +#define UIO_PHYS_USERSPACE32 ((enum uio_seg)7) +// UIO_USERSPACE64 8 defined in uio.h +#define UIO_USERISPACE64 ((enum uio_seg)9) +#define UIO_PHYS_USERSPACE64 ((enum uio_seg)10) +// UIO_SYSSPACE32 11 defined in uio.h +// UIO_PHYS_SYSSPACE32 12 reserved, never used. Use UIO_PHYS_SYSSPACE +// UIO_SYSSPACE64 13 reserved, never used. Use UIO_SYSSPACE +// UIO_PHYS_SYSSPACE64 14 reserved, never used. Use UIO_PHYS_SYSSPACE __BEGIN_DECLS struct user_iovec; diff --git a/bsd/sys/ulock.h b/bsd/sys/ulock.h index bb48d3a72..b86d10eef 100644 --- a/bsd/sys/ulock.h +++ b/bsd/sys/ulock.h @@ -29,6 +29,10 @@ #ifndef _SYS_ULOCK_H #define _SYS_ULOCK_H +#include +#include +#include + __BEGIN_DECLS #if PRIVATE @@ -64,23 +68,30 @@ extern int __ulock_wake(uint32_t operation, void *addr, uint64_t wake_value); #endif /* !KERNEL */ /* - * operation bits [7, 0] contain the operation code + * operation bits [7, 0] contain the operation code. + * + * NOTE: make sure to add logic for handling any new + * types to kdp_ulock_find_owner() */ -#define UL_COMPARE_AND_WAIT 1 -#define UL_UNFAIR_LOCK 2 +#define UL_COMPARE_AND_WAIT 1 +#define UL_UNFAIR_LOCK 2 +#define UL_COMPARE_AND_WAIT_SHARED 3 +#define UL_UNFAIR_LOCK64_SHARED 4 +#define UL_COMPARE_AND_WAIT64 5 +#define UL_COMPARE_AND_WAIT64_SHARED 6 /* obsolete names */ -#define UL_OSSPINLOCK UL_COMPARE_AND_WAIT -#define UL_HANDOFFLOCK UL_UNFAIR_LOCK +#define UL_OSSPINLOCK UL_COMPARE_AND_WAIT +#define UL_HANDOFFLOCK UL_UNFAIR_LOCK /* These operation code are only implemented in (DEVELOPMENT || DEBUG) kernels */ #define UL_DEBUG_SIMULATE_COPYIN_FAULT 253 -#define UL_DEBUG_HASH_DUMP_ALL 254 -#define UL_DEBUG_HASH_DUMP_PID 255 +#define UL_DEBUG_HASH_DUMP_ALL 254 +#define UL_DEBUG_HASH_DUMP_PID 255 /* * operation bits [15, 8] contain the flags for __ulock_wake */ -#define ULF_WAKE_ALL 0x00000100 -#define ULF_WAKE_THREAD 0x00000200 +#define ULF_WAKE_ALL 0x00000100 +#define ULF_WAKE_THREAD 0x00000200 /* * operation bits [23, 16] contain the flags for __ulock_wait @@ -92,14 +103,19 @@ extern int __ulock_wake(uint32_t operation, void *addr, uint64_t wake_value); * * @const ULF_WAIT_CANCEL_POINT * This wait is a cancelation point + * + * @const ULF_WAIT_ADAPTIVE_SPIN + * Use adaptive spinning when the thread that currently holds the unfair lock + * is on core. */ #define ULF_WAIT_WORKQ_DATA_CONTENTION 0x00010000 #define ULF_WAIT_CANCEL_POINT 0x00020000 +#define ULF_WAIT_ADAPTIVE_SPIN 0x00040000 /* * operation bits [31, 24] contain the generic flags */ -#define ULF_NO_ERRNO 0x01000000 +#define ULF_NO_ERRNO 0x01000000 /* * masks @@ -109,12 +125,12 @@ extern int __ulock_wake(uint32_t operation, void *addr, uint64_t wake_value); #define ULF_GENERIC_MASK 0xFFFF0000 #define ULF_WAIT_MASK (ULF_NO_ERRNO | \ - ULF_WAIT_WORKQ_DATA_CONTENTION | \ - ULF_WAIT_CANCEL_POINT) + ULF_WAIT_WORKQ_DATA_CONTENTION | \ + ULF_WAIT_CANCEL_POINT | ULF_WAIT_ADAPTIVE_SPIN) -#define ULF_WAKE_MASK (ULF_WAKE_ALL | \ - ULF_WAKE_THREAD | \ - ULF_NO_ERRNO) +#define ULF_WAKE_MASK (ULF_NO_ERRNO | \ + ULF_WAKE_ALL | \ + ULF_WAKE_THREAD) #endif /* PRIVATE */ diff --git a/bsd/sys/user.h b/bsd/sys/user.h index 4c79d0d8f..42734a4de 100644 --- a/bsd/sys/user.h +++ b/bsd/sys/user.h @@ -118,6 +118,11 @@ struct uthread { u_int64_t uu_arg[8]; /* arguments to current system call */ int uu_rval[2]; char uu_cursig; /* p_cursig for exc. */ + /* + * uu_workq_pthread_kill_allowed is not modified under a lock and thus + * relies on single copy atomicity and cannot be changed to a bitfield. + */ + bool uu_workq_pthread_kill_allowed; unsigned int syscall_code; /* current syscall code */ /* thread exception handling */ @@ -135,37 +140,13 @@ struct uthread { int32_t *retval; /* place to store return val */ } uus_select_data; - struct _kqueue_scan { - kevent_callback_t call; /* per-event callback */ - kqueue_continue_t cont; /* whole call continuation */ - filt_process_data_t process_data; /* needed for filter processing */ - uint64_t deadline; /* computed deadline for operation */ - void *data; /* caller's private data */ - } uus_kqueue_scan; /* saved state for kevent_scan() */ - - struct _kevent { - struct _kqueue_scan scan; /* space for the generic data */ - struct fileproc *fp; /* fileproc we hold iocount on */ - int fd; /* fd for fileproc (if held) */ - int eventcount; /* user-level event count */ - int eventout; /* number of events output */ - struct filt_process_s process_data; /* space for process data fed thru */ - int32_t *retval; /* place to store return val */ - user_addr_t eventlist; /* user-level event list address */ - uint64_t data_available; /* [user/kernel] addr of in/out size */ - } uus_kevent; /* saved state for kevent() */ + struct kevent_ctx_s uus_kevent; struct _kevent_register { - struct kevent_internal_s kev; /* the kevent to maybe copy out */ - struct knote *knote; /* the knote used for the wait */ - struct fileproc *fp; /* fileproc we hold iocount on */ + struct kevent_qos_s kev; /* the kevent to maybe copy out */ thread_t handoff_thread; /* thread we handed off to, has +1 */ - struct kqueue *kq; - int fd; /* fd for fileproc (if held) */ - int eventcount; /* user-level event count */ + struct kqworkloop *kqwl; int eventout; /* number of events output */ - unsigned int flags; /* flags for kevent_copyout() */ - int32_t *retval; /* place to store return val */ user_addr_t ueventlist; /* the user-address to copyout to */ } uus_kevent_register; /* saved for EVFILT_WORKLOOP wait */ @@ -234,7 +215,10 @@ struct uthread { struct kaudit_record *uu_ar; /* audit record */ struct task* uu_aio_task; /* target task for async io */ - lck_mtx_t *uu_mtx; + union { + lck_mtx_t *uu_mtx; + struct knote_lock_ctx *uu_knlock; + }; lck_spin_t uu_rethrottle_lock; /* locks was_rethrottled and is_throttled */ TAILQ_ENTRY(uthread) uu_throttlelist; /* List of uthreads currently throttled */ @@ -258,7 +242,7 @@ struct uthread { * Bound kqueue request. This field is only cleared by the current thread, * hence can be dereferenced safely by the current thread without locks. */ - struct kqrequest *uu_kqr_bound; + struct workq_threadreq_s *uu_kqr_bound; TAILQ_ENTRY(uthread) uu_workq_entry; mach_vm_offset_t uu_workq_stackaddr; mach_port_name_t uu_workq_thport; @@ -364,9 +348,10 @@ typedef struct uthread * uthread_t; #define UT_PASSIVE_IO 0x00000100 /* this thread issues passive I/O */ #define UT_PROCEXIT 0x00000200 /* this thread completed the proc exit */ #define UT_RAGE_VNODES 0x00000400 /* rapid age any vnodes created by this thread */ -#define UT_KERN_RAGE_VNODES 0x00000800 /* rapid age any vnodes created by this thread (kernel set) */ -/* 0x00001000 unused, used to be UT_BACKGROUND_TRAFFIC_MGT */ +#define UT_KERN_RAGE_VNODES 0x00000800 /* rapid age any vnodes created by this thread (kernel set) */ +#define UT_NSPACE_NODATALESSFAULTS 0x00001000 /* thread does not materialize dataless files */ #define UT_ATIME_UPDATE 0x00002000 /* don't update atime for files accessed by this thread */ +#define UT_NSPACE_FORCEDATALESSFAULTS 0x00004000 /* thread always materializes dataless files */ #define UT_VFORK 0x02000000 /* thread has vfork children */ #define UT_SETUID 0x04000000 /* thread is settugid() */ #define UT_WASSETUID 0x08000000 /* thread was settugid() (in vfork) */ diff --git a/bsd/sys/vnode.h b/bsd/sys/vnode.h index a21365bfe..5ec22ac93 100644 --- a/bsd/sys/vnode.h +++ b/bsd/sys/vnode.h @@ -109,9 +109,13 @@ enum vtagtype { /* 16 - 20 */ VT_HFS, VT_ZFS, VT_DEVFS, VT_WEBDAV, VT_UDF, /* 21 - 25 */ - VT_AFP, VT_CDDA, VT_CIFS, VT_OTHER, VT_APFS + VT_AFP, VT_CDDA, VT_CIFS, VT_OTHER, VT_APFS, + /* 26 */ + VT_LOCKERFS, }; +#define HAVE_VT_LOCKERFS 1 + /* * flags for VNOP_BLOCKMAP */ @@ -467,10 +471,16 @@ struct vnode_trigger_param { * VNT_NO_DIRECT_MOUNT: * A trigger vnode instance that doesn't directly trigger a mount, * instead it triggers the mounting of sub-trigger nodes. + * + * VNT_KERN_RESOLVE: + * A trigger vnode where all parameters have been set by the kernel, + * such as NFS mirror mounts. */ #define VNT_AUTO_REARM (1 << 0) #define VNT_NO_DIRECT_MOUNT (1 << 1) -#define VNT_VALID_MASK (VNT_AUTO_REARM | VNT_NO_DIRECT_MOUNT) +#define VNT_KERN_RESOLVE (1 << 2) +#define VNT_VALID_MASK (VNT_AUTO_REARM | VNT_NO_DIRECT_MOUNT | \ + VNT_KERN_RESOLVE) #endif /* KERNEL_PRIVATE */ @@ -753,6 +763,8 @@ struct vnode_attr { #define VA_NOINHERIT 0x040000 /* Don't inherit ACLs from parent */ #define VA_NOAUTH 0x080000 #define VA_64BITOBJIDS 0x100000 /* fileid/linkid/parentid are 64 bit */ +#define VA_REALFSID 0x200000 /* Return real fsid */ +#define VA_USEFSID 0x400000 /* Use fsid from filesystem */ /* * Modes. Some values same as Ixxx entries from inode.h for now. @@ -794,6 +806,7 @@ extern int vttoif_tab[]; #define VNODE_REMOVE_NODELETEBUSY 0x0001 /* Don't delete busy files (Carbon) */ #define VNODE_REMOVE_SKIP_NAMESPACE_EVENT 0x0002 /* Do not upcall to userland handlers */ #define VNODE_REMOVE_NO_AUDIT_PATH 0x0004 /* Do not audit the path */ +#define VNODE_REMOVE_DATALESS_DIR 0x0008 /* Special handling for removing a dataless directory without materialization */ /* VNOP_READDIR flags: */ #define VNODE_READDIR_EXTENDED 0x0001 /* use extended directory entries */ @@ -825,7 +838,7 @@ struct vnodeopv_entry_desc { struct vnodeopv_desc { /* ptr to the ptr to the vector where op should go */ int(***opv_desc_vector_p)(void *); - struct vnodeopv_entry_desc *opv_desc_ops; /* null terminated list */ + const struct vnodeopv_entry_desc *opv_desc_ops; /* null terminated list */ }; /*! @@ -973,6 +986,14 @@ enum vtype vnode_vtype(vnode_t vp); */ uint32_t vnode_vid(vnode_t vp); +/*! + * @function vnode_isonexternalstorage + * @abstract Return whether or not the storage device backing a vnode is external or not. + * @param vp The vnode whose physical location is to be determined. + * @return TRUE if storage device is external, FALSE if otherwise. + */ +boolean_t vnode_isonexternalstorage(vnode_t vp); + /*! * @function vnode_mountedhere * @abstract Returns a pointer to a mount placed on top of a vnode, should it exist. @@ -1111,7 +1132,26 @@ int vnode_isnamedstream(vnode_t vp); * @return 0 if the operation is successful, an error otherwise. */ errno_t vnode_setasnamedstream(vnode_t vp, vnode_t svp); -#endif + +/*! + * @function vnode_setasfirmlink + * @abstract Set a vnode to act as a firmlink i.e. point to a target vnode. + * @param vp The vnode which is to be acted on as a firmlink. + * @param target_vp The vnode which will be the target of the firmlink. + * @return 0 if the operation is successful, an error otherwise. + */ +errno_t vnode_setasfirmlink(vnode_t vp, vnode_t target_vp); + +/*! + * @function vnode_getfirmlink + * @abstract If a vnode is a firmlink, get its target vnode. + * @param vp The firmlink vnode. + * @param target_vp The firmlink traget vnode. This vnode is returned with an iocount. + * @return 0 if the operation is successful, an error otherwise. + */ +errno_t vnode_getfirmlink(vnode_t vp, vnode_t *target_vp); + +#endif /* KERNEL_PRIVATE */ /*! * @function vnode_ismountedon @@ -1637,6 +1677,18 @@ int vnode_ismonitored(vnode_t vp); int vnode_isdyldsharedcache(vnode_t vp); +/*! + * @function vn_authorize_unlink + * @abstract Authorize an unlink operation given the vfs_context_t + * @discussion Check if the context assocated with vfs_context_t is allowed to unlink the vnode vp in directory dvp. + * @param dvp Parent vnode of the file to be unlinked + * @param vp The vnode to be unlinked + * @param cnp A componentname containing the name of the file to be unlinked. May be NULL. + * @param reserved Pass NULL + * @return returns zero if the operation is allowed, non-zero indicates the unlink is not authorized. + */ +int vn_authorize_unlink(vnode_t dvp, vnode_t vp, struct componentname *cnp, vfs_context_t ctx, void *reserved); + /*! * @function vn_getpath_fsenter * @abstract Attempt to get a vnode's path, willing to enter the filesystem. @@ -1651,6 +1703,19 @@ int vnode_isdyldsharedcache(vnode_t vp); */ int vn_getpath_fsenter(struct vnode *vp, char *pathbuf, int *len); +/*! + * @function vn_getpath_no_firmlink + * @abstract Attempt to get a vnode's path without a firm-link translation. + * @discussion Paths to vnodes are not always straightforward: a file with multiple hard-links will have multiple pathnames, + * and it is sometimes impossible to determine a vnode's full path. Like vn_getpath, it will not reenter the filesystem. + * @param vp Vnode whose path to get + * @param pathbuf Buffer in which to store path. + * @param len Destination for length of resulting path string. Result will include NULL-terminator in count--that is, "len" + * will be strlen(pathbuf) + 1. + * @return 0 for success or an error. + */ +int vn_getpath_no_firmlink(struct vnode *vp, char *pathbuf, int *len); + /*! * @function vn_getpath_fsenter_with_parent * @abstract Attempt to get a vnode's path by entering the file system if needed given a vnode and it's directory vnode. @@ -1666,6 +1731,27 @@ int vn_getpath_fsenter(struct vnode *vp, char *pathbuf, int *len); */ int vn_getpath_fsenter_with_parent(struct vnode *dvp, struct vnode *vp, char *pathbuf, int *len); +/*! + * @function vn_getpath_ext + * @abstract Attempt to get a vnode's path without rentering filesystem (unless passed an option to allow) + * @discussion Paths to vnodes are not always straightforward: a file with multiple hard-links will have multiple pathnames, + * and it is sometimes impossible to determine a vnode's full path. vn_getpath_fsenter() may enter the filesystem + * to try to construct a path, so filesystems should be wary of calling it. + * @param vp Vnode whose path to get + * @param dvp parent vnode of vnode whose path to get, can be NULL if not available. + * @param pathbuf Buffer in which to store path. + * @param len Destination for length of resulting path string. Result will include NULL-terminator in count--that is, "len" + * will be strlen(pathbuf) + 1. + * @param flags flags for controlling behavior. + * @return 0 for success or an error. + */ +int vn_getpath_ext(struct vnode *vp, struct vnode *dvp, char *pathbuf, int *len, int flags); + +/* supported flags for vn_getpath_ext */ +#define VN_GETPATH_FSENTER 0x0001 /* Can re-enter filesystem */ +#define VN_GETPATH_NO_FIRMLINK 0x0002 +#define VN_GETPATH_VOLUME_RELATIVE 0x0004 /* also implies VN_GETPATH_NO_FIRMLINK */ + #endif /* KERNEL_PRIVATE */ #define VNODE_UPDATE_PARENT 0x01 @@ -1673,6 +1759,9 @@ int vn_getpath_fsenter_with_parent(struct vnode *dvp, struct vnode *vp, char #define VNODE_UPDATE_NAME 0x02 #define VNODE_UPDATE_CACHE 0x04 #define VNODE_UPDATE_PURGE 0x08 +#ifdef BSD_KERNEL_PRIVATE +#define VNODE_UPDATE_PURGEFIRMLINK 0x10 +#endif /*! * @function vnode_update_identity * @abstract Update vnode data associated with the vfs cache. @@ -1833,12 +1922,26 @@ int vfs_get_notify_attributes(struct vnode_attr *vap); */ errno_t vnode_lookup(const char *path, int flags, vnode_t *vpp, vfs_context_t ctx); +#ifdef KERNEL_PRIVATE +/*! + * @function vnode_lookup starting from a directory vnode (only if path is relative) + * @abstract Convert a path into a vnode. + * @discussion This routine is a thin wrapper around xnu-internal lookup routines; if successful, + * it returns with an iocount held on the resulting vnode which must be dropped with vnode_put(). + * @param path Path to look up. + * @param flags VNODE_LOOKUP_NOFOLLOW: do not follow symbolic links. VNODE_LOOKUP_NOCROSSMOUNT: do not cross mount points. + * @param start_dvp vnode of directory to start lookup from. This parameter is ignored if path is absolute. start_dvp should + * have an iocount on it. + * @return Results 0 for success or an error code. + */ +errno_t vnode_lookupat(const char *path, int flags, vnode_t *vpp, vfs_context_t ctx, vnode_t start_dvp); +#endif + /*! * @function vnode_open * @abstract Open a file identified by a path--roughly speaking an in-kernel open(2). - * @discussion If vnode_open() succeeds, it returns with both an iocount and a usecount on the returned vnode. These must - * be released eventually; the iocount should be released with vnode_put() as soon as any initial operations - * on the vnode are over, whereas the usecount should be released via vnode_close(). + * @discussion If vnode_open() succeeds, it returns with both an iocount and a usecount on the + * returned vnode. Both will be release once vnode_close is called. * @param path Path to look up. * @param fmode e.g. O_NONBLOCK, O_APPEND; see bsd/sys/fcntl.h. * @param cmode Permissions with which to create file if it does not exist. @@ -2132,9 +2235,9 @@ int vnode_getfromfd(vfs_context_t ctx, int fd, vnode_t *vpp); #ifdef BSD_KERNEL_PRIVATE /* Not in export list so can be private */ struct stat; -int vn_stat(struct vnode *vp, void * sb, kauth_filesec_t *xsec, int isstat64, +int vn_stat(struct vnode *vp, void * sb, kauth_filesec_t *xsec, int isstat64, int needsrealdev, vfs_context_t ctx); -int vn_stat_noauth(struct vnode *vp, void * sb, kauth_filesec_t *xsec, int isstat64, +int vn_stat_noauth(struct vnode *vp, void * sb, kauth_filesec_t *xsec, int isstat64, int needsrealdev, vfs_context_t ctx, struct ucred *file_cred); int vaccess(mode_t file_mode, uid_t uid, gid_t gid, mode_t acc_mode, kauth_cred_t cred); @@ -2231,6 +2334,22 @@ errno_t vfs_setup_vattr_from_attrlist(struct attrlist *alp, struct vnode_attr *v */ errno_t vfs_attr_pack(vnode_t vp, uio_t uio, struct attrlist *alp, uint64_t options, struct vnode_attr *vap, void *fndesc, vfs_context_t ctx); +/*! + * @function vfs_attr_pack_ex + * @abstract Pack a vnode_attr structure into a buffer in the same format as getattrlist(2). + * @Used by a VNOP_GETATTRLISTBULK implementation to pack data provided into a vnode_attr structure into a buffer the way getattrlist(2) does. + * @param mp the mount structure for the filesystem the packing operation is happening on. + * @param vp If available, the vnode for which the attributes are being given, NULL if vnode is not available (which will usually be the case for a VNOP_GETATTRLISTBULK implementation. + * @param uio - a uio_t initialised with one iovec.. + * @param alp - Pointer to an attrlist structure. + * @param options - options for call (same as options for getattrlistbulk(2)). + * @param vap Pointer to a filled in vnode_attr structure. Data from the vnode_attr structure will be used to copy and lay out the data in the required format for getatrlistbulk(2) by this function. + * @param fndesc Currently unused + * @param ctx vfs context of caller. + * @return error. + */ +errno_t vfs_attr_pack_ext(mount_t mp, vnode_t vp, uio_t uio, struct attrlist *alp, uint64_t options, struct vnode_attr *vap, void *fndesc, vfs_context_t ctx); + #ifdef KERNEL_PRIVATE // Returns a value suitable, safe and consistent for tracing and logging @@ -2258,6 +2377,7 @@ void vnode_clearnoflush(vnode_t); #define BUILDPATH_CHECKACCESS 0x2 /* Check if parents have search rights */ #define BUILDPATH_CHECK_MOVED 0x4 /* Return EAGAIN if the parent hierarchy is modified */ #define BUILDPATH_VOLUME_RELATIVE 0x8 /* Return path relative to the nearest mount point */ +#define BUILDPATH_NO_FIRMLINK 0x10 /* Return non-firmlinked path */ int build_path(vnode_t first_vp, char *buff, int buflen, int *outlen, int flags, vfs_context_t ctx); diff --git a/bsd/sys/vnode_if.h b/bsd/sys/vnode_if.h index 1b8cc8af3..7959ff764 100644 --- a/bsd/sys/vnode_if.h +++ b/bsd/sys/vnode_if.h @@ -112,6 +112,7 @@ extern struct vnodeop_desc vnop_ioctl_desc; extern struct vnodeop_desc vnop_select_desc; extern struct vnodeop_desc vnop_exchange_desc; extern struct vnodeop_desc vnop_revoke_desc; +extern struct vnodeop_desc vnop_mmap_check_desc; extern struct vnodeop_desc vnop_mmap_desc; extern struct vnodeop_desc vnop_mnomap_desc; extern struct vnodeop_desc vnop_fsync_desc; @@ -593,6 +594,30 @@ struct vnop_revoke_args { extern errno_t VNOP_REVOKE(vnode_t, int, vfs_context_t); #endif /* XNU_KERNEL_PRIVATE */ +struct vnop_mmap_check_args { + struct vnodeop_desc *a_desc; + vnode_t a_vp; + int a_flags; + vfs_context_t a_context; +}; + +/*! + * @function VNOP_MMAP_CHECK + * @abstract Check with a filesystem if a file can be mmap-ed. + * @discussion VNOP_MMAP_CHECK is used to check with the file system if a + * file can be mmap-ed. It will be called before any call to VNOP_MMAP(). + * @param vp The vnode being mmapped. + * @param flags Memory protection: PROT_READ, PROT_WRITE, PROT_EXEC. + * @param ctx Context to authenticate for mmap request. + * @return 0 for success; EPERM if the operation is not permitted; other + * errors (except ENOTSUP) may be returned at the discretion of the file + * system. ENOTSUP will never be returned by VNOP_MMAP_CHECK(). + */ +#ifdef XNU_KERNEL_PRIVATE +extern errno_t VNOP_MMAP_CHECK(vnode_t, int, vfs_context_t); +#endif /* XNU_KERNEL_PRIVATE */ + + struct vnop_mmap_args { struct vnodeop_desc *a_desc; vnode_t a_vp; @@ -753,6 +778,12 @@ enum { VFS_RENAME_SWAP = 0x00000002, VFS_RENAME_EXCL = 0x00000004, + /* + * VFS_RENAME_DATALESS is kernel-only and is intentionally + * not included in VFS_RENAME_FLAGS_MASK. + */ + VFS_RENAME_DATALESS = 0x00000008, + VFS_RENAME_FLAGS_MASK = (VFS_RENAME_SECLUDE | VFS_RENAME_SWAP | VFS_RENAME_EXCL), }; diff --git a/bsd/sys/vnode_internal.h b/bsd/sys/vnode_internal.h index bde95e48c..4e271502f 100644 --- a/bsd/sys/vnode_internal.h +++ b/bsd/sys/vnode_internal.h @@ -179,6 +179,11 @@ struct vnode { #if CONFIG_TRIGGERS vnode_resolve_t v_resolve; /* trigger vnode resolve info (VDIR only) */ #endif /* CONFIG_TRIGGERS */ +#if CONFIG_FIRMLINKS + vnode_t v_fmlink; /* firmlink if set (VDIR only), Points to source + * if VFLINKTARGET is set, if VFLINKTARGET is not + * set, points to target */ +#endif /* CONFIG_FIRMLINKS */ }; #define v_mountedhere v_un.vu_mountedhere @@ -260,8 +265,8 @@ struct vnode { #define VISDIRTY 0x4000000 /* vnode will need IO if reclaimed */ #define VFASTDEVCANDIDATE 0x8000000 /* vnode is a candidate to store on a fast device */ #define VAUTOCANDIDATE 0x10000000 /* vnode was automatically marked as a fast-dev candidate */ +#define VFMLINKTARGET 0x20000000 /* vnode is firmlink target */ /* - * 0x20000000 not used * 0x40000000 not used * 0x80000000 not used. */ @@ -435,7 +440,6 @@ int vn_authorize_open_existing(vnode_t vp, struct componentname *cnp, int fm int vn_authorize_create(vnode_t, struct componentname *, struct vnode_attr *, vfs_context_t, void*); int vn_attribute_prepare(vnode_t dvp, struct vnode_attr *vap, uint32_t *defaulted_fieldsp, vfs_context_t ctx); void vn_attribute_cleanup(struct vnode_attr *vap, uint32_t defaulted_fields); -int vn_authorize_unlink(vnode_t dvp, vnode_t vp, struct componentname *cnp, vfs_context_t ctx, void *reserved); int vn_authorize_rename(struct vnode *fdvp, struct vnode *fvp, struct componentname *fcnp, struct vnode *tdvp, struct vnode *tvp, struct componentname *tcnp, vfs_context_t ctx, void *reserved); @@ -585,8 +589,6 @@ int vfs_sysctl_node SYSCTL_HANDLER_ARGS; void vnode_setneedinactive(vnode_t); int vnode_hasnamedstreams(vnode_t); /* Does this vnode have associated named streams? */ -void nspace_proc_exit(struct proc *p); - errno_t vnode_readdir64(struct vnode *vp, struct uio *uio, int flags, int *eofflag, int *numdirent, vfs_context_t ctxp); @@ -605,6 +607,11 @@ void vfs_nested_trigger_unmounts(mount_t, int, vfs_context_t); int build_path_with_parent(vnode_t, vnode_t /* parent */, char *, int, int *, int, vfs_context_t); +void nspace_resolver_init(void); +void nspace_resolver_exited(struct proc *); + +int vnode_materialize_dataless_file(vnode_t, uint64_t); + #endif /* BSD_KERNEL_PRIVATE */ #endif /* !_SYS_VNODE_INTERNAL_H_ */ diff --git a/bsd/sys/work_interval.h b/bsd/sys/work_interval.h index ab5d80fb2..695a28ea1 100644 --- a/bsd/sys/work_interval.h +++ b/bsd/sys/work_interval.h @@ -118,6 +118,7 @@ __BEGIN_DECLS #define WORK_INTERVAL_TYPE_CA_RENDER_SERVER (0x2 << 28) #define WORK_INTERVAL_TYPE_CA_CLIENT (0x3 << 28) #define WORK_INTERVAL_TYPE_HID_DELIVERY (0x4 << 28) +#define WORK_INTERVAL_TYPE_COREMEDIA (0x5 << 28) #define WORK_INTERVAL_TYPE_LAST (0xF << 28) #ifndef KERNEL diff --git a/bsd/sys_private/Makefile b/bsd/sys_private/Makefile new file mode 100644 index 000000000..1789c0c20 --- /dev/null +++ b/bsd/sys_private/Makefile @@ -0,0 +1,37 @@ +# This private directory is necessary for BSD headers bound for +# `/usr/local/include/sys/` and the System framework. + +export MakeInc_cmd=${SRCROOT}/makedefs/MakeInc.cmd +export MakeInc_def=${SRCROOT}/makedefs/MakeInc.def +export MakeInc_rule=${SRCROOT}/makedefs/MakeInc.rule +export MakeInc_dir=${SRCROOT}/makedefs/MakeInc.dir + +include $(MakeInc_cmd) +include $(MakeInc_def) + +INSTALL_MI_DIR = sys +INCDIR = $(SDKHEADERSROOT)/usr/local/include +DRIVERKITINCDIR = $(DRIVERKITSDKHEADERSROOT)/usr/local/include + +# Installs header files for Apple internal use in +# $(DSTROOT)/usr/local/include/sys + +INSTALL_MI_LIST = \ + kdebug_private.h + +INSTALL_DRIVERKIT_MI_LIST = \ + kdebug_private.h + +# Installs header files for Apple internal use in +# $(DSTROOT)/System/Library/Frameworks/System.framework/PrivateHeaders/sys + +INSTALL_MI_LCL_LIST = \ + kdebug_private.h + +EXPORT_MI_DIR = sys + +EXPORT_MI_LIST = \ + kdebug_private.h + +include $(MakeInc_rule) +include $(MakeInc_dir) diff --git a/bsd/sys_private/kdebug_private.h b/bsd/sys_private/kdebug_private.h new file mode 100644 index 000000000..6444ea6f4 --- /dev/null +++ b/bsd/sys_private/kdebug_private.h @@ -0,0 +1,571 @@ +/* + * Copyright (c) 2000-2018 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#ifndef BSD_KDEBUG_PRIVATE_H +#define BSD_KDEBUG_PRIVATE_H + +#include +#include +#include +#include + +__BEGIN_DECLS + +#if !KERNEL + +#include + +#pragma mark - user space SPI + +/* + * OS components can use the full precision of the "code" field + * (Class, SubClass, Code) to inject events using kdebug_trace() by + * using: + * + * kdebug_trace(KDBG_CODE(DBG_XPC, 15, 1) | DBG_FUNC_NONE, 1, 2, 3, 4); + * + * These trace points can be included in production code, since they + * use reserved, non-overlapping ranges. The performance impact when + * kernel tracing is not enabled is minimal. However, when tracing is enabled, + * each tracepoint becomes a syscall. For this reason, os_signpost(3) is + * recommended instead of kdebug_trace(2). + * + * Classes can be reserved by filing a Radar in xnu | ktrace. + * + * 64-bit arguments may be truncated if the system is using a 32-bit + * kernel. + * + * On error, -1 will be returned and errno will indicate the error. + */ +int kdebug_trace(uint32_t code, uint64_t arg1, uint64_t arg2, uint64_t arg3, + uint64_t arg4) +__OSX_AVAILABLE(10.10) __IOS_AVAILABLE(8.2); + +/*! + * @function kdebug_trace_string + * + * @discussion + * This function emits strings to kdebug trace along with an ID and allows + * for previously-traced strings to be overwritten and invalidated. + * + * To start tracing a string and generate an ID to use to refer to it: + * + * string_id = kdebug_trace_string(debugid, 0, "string"); + * + * To replace a string previously traced: + * + * string_id = kdebug_trace_string(debugid, string_id, "new string"); + * + * To invalidate a string ID: + * + * string_id = kdebug_trace_string(debugid, string_id, NULL); + * + * To check for errors: + * + * if ((int64_t)string_id == -1) { perror("string error") } + * + * @param debugid + * The `debugid` to check if its enabled before tracing and include as + * an argument in the event containing the string. + * + * Some classes or subclasses are reserved for specific uses and are not + * allowed to be used with this function. No function qualifiers are + * allowed on `debugid`. + * + * @param str_id + * When 0, a new ID will be generated and returned if tracing is + * enabled. + * + * Otherwise `str_id` must contain an ID that was previously generated + * with this function. Clents should pass NULL in `str` if `str_id` + * is no longer in use. Otherwise, the string previously mapped to + * `str_id` will be overwritten with the contents of `str`. + * + * @param str + * A NUL-terminated 'C' string containing the characters that should be + * traced alongside `str_id`. + * + * If necessary, the string will be truncated at an + * implementation-defined length. The string must not be the empty + * string, but can be NULL if a valid `str_id` is provided. + * + * @return + * 0 if tracing is disabled or `debugid` is being filtered out of trace. + * It can also return (int64_t)-1 if an error occured. Otherwise, + * it returns the ID to use to refer to the string in future + * kdebug_trace(2) calls. + * + * The errors that can occur are: + * + * EINVAL + * There are function qualifiers on `debugid`, `str` is empty, or + * `str_id` was not generated by this function. + * EPERM + * The `debugid`'s class or subclass is reserved for internal use. + * EFAULT + * `str` is an invalid address or NULL when `str_id` is 0. + */ +extern uint64_t kdebug_trace_string(uint32_t debugid, uint64_t str_id, + const char *str) +__OSX_AVAILABLE(10.11) __IOS_AVAILABLE(9.0); + +/* + * Although the performance impact of kdebug_trace() when kernel + * tracing is not enabled is minimal, it may require the caller to + * perform an expensive calculation/summarization. This cost can be + * skipped by checking the kdebug_is_enabled() predicate: + * + * if (kdebug_is_enabled(KDBG_CODE(DBG_XPC, 15, 1))) { + * uint64_t arg1 = ...; + * uint64_t arg2 = ...; + * kdebug_trace(KDBG_CODE(DBG_XPC, 15, 1) | DBG_FUNC_NONE, arg1, arg2, 0, 0); + * } + * + * If tracing is enabled for the code at the time of the check, 1 + * will be returned. Otherwise, 0 will be returned. + */ +extern bool kdebug_is_enabled(uint32_t code) +__OSX_AVAILABLE(10.12) __IOS_AVAILABLE(10.0) +__WATCHOS_AVAILABLE(3.0) __TVOS_AVAILABLE(10.0); + +/* + * Returns a pointer to the userspace typefilter, if one is available. + * May return NULL. + */ +extern void *kdebug_typefilter(void) +__OSX_AVAILABLE(10.12) __IOS_AVAILABLE(10.0) +__WATCHOS_AVAILABLE(3.0) __TVOS_AVAILABLE(10.0); + +/* + * Returns true if kdebug is using continuous time for its events, and false + * otherwise. + */ +extern bool kdebug_using_continuous_time(void) +__API_AVAILABLE(macos(10.15), ios(13), tvos(13), watchos(6)); + +#endif /* !KERNEL */ + +#pragma mark - private debugids + +#define DBG_PPT 36 +#define DBG_PERFCTRL 39 +#define DBG_CLPC 50 +#define DBG_MUSE 52 + +/* **** 128 to 139 are reserved for IOP tracing **** */ +#define DBG_ANS 128 +#define DBG_SIO 129 +#define DBG_SEP 130 +#define DBG_ISP 131 +#define DBG_OSCAR 132 +#define DBG_EMBEDDEDGFX 133 +#define DBG_PMP 134 +#define DBG_RTKIT 135 + +#define MACH_BRIDGE_RCV_TS 0x1 /* receive timestamp pair from interrupt handler */ +#define MACH_BRIDGE_REMOTE_TIME 0x2 /* calculate remote timestamp */ +#define MACH_BRIDGE_RESET_TS 0x3 /* reset timestamp conversion parameters */ +#define MACH_BRIDGE_TS_PARAMS 0x4 /* recompute timestamp conversion parameters */ +#define MACH_BRIDGE_SKIP_TS 0x5 /* skip timestamp */ +#define MACH_BRIDGE_TS_MISMATCH 0x6 /* mismatch between predicted and received remote timestamp */ +#define MACH_BRIDGE_OBSV_RATE 0x7 /* out of range observed rates */ + +/* DBG_SKYWALK has same toplevel code as DBG_DLIL, so don't reuse subcodes */ +#define DBG_SKYWALK_FLOWSWITCH 0x11 +#define DBG_SKYWALK_NETIF 0x12 +#define DBG_SKYWALK_CHANNEL 0x13 + +#define PPT_TEST 0x01 +#define PPT_JETSAM_HIWAT 0x02 +#define PPT_JETSAM_TOPPROC 0x03 + +#define SKYWALKDBG_CODE(SubClass, code) KDBG_CODE(DBG_DLIL, SubClass, code) +#define PPTDBG_CODE(SubClass, code) KDBG_CODE(DBG_PPT, SubClass, code) +#define PERFCTRL_CODE(SubClass, code) KDBG_CODE(DBG_PERFCTRL, SubClass, code) + +#if !defined(DRIVERKIT) + +extern unsigned int kdebug_enable; + +/* + * Bits used by kdebug_enable. These control which events are traced at + * runtime. + */ +#define KDEBUG_ENABLE_TRACE (1U << 0) +#define KDEBUG_ENABLE_ENTROPY (1U << 1) /* obsolete */ +#define KDEBUG_ENABLE_CHUD (1U << 2) /* obsolete */ +#define KDEBUG_ENABLE_PPT (1U << 3) /* obsolete */ +#define KDEBUG_ENABLE_SERIAL (1U << 4) /* obsolete */ + +/* + * If set, the timestamps in events are expected to be continuous times. + * Otherwise, the timestamps are absolute times. IOPs should observe this bit + * in order to log events that can be merged cleanly with other event streams. + */ +#define KDEBUG_ENABLE_CONT_TIME 0x20 + +#define KDEBUG_TRACE (KDEBUG_ENABLE_TRACE) + +/* + * Specify KDEBUG_PPT to indicate that the event belongs to the limited PPT set. + * PPT is deprecated -- use a typefilter and the PPTDBG class instead. + */ +#define KDEBUG_PPT (KDEBUG_ENABLE_PPT) +#define KDEBUG_COMMON (KDEBUG_ENABLE_TRACE | KDEBUG_ENABLE_PPT) + +/* + * The kernel debug configuration level. These values control which events are + * compiled in under different build configurations. + * + * Infer the supported kernel debug event level from config option. Use + * (KDEBUG_LEVEL >= KDEBUG_LEVEL_STANDARD) as a guard to protect unaudited debug + * code. + */ +#define KDEBUG_LEVEL_NONE 0 +#define KDEBUG_LEVEL_IST 1 +#define KDEBUG_LEVEL_STANDARD 2 +#define KDEBUG_LEVEL_FULL 3 + +#if NO_KDEBUG +#define KDEBUG_LEVEL KDEBUG_LEVEL_NONE +#elif IST_KDEBUG +#define KDEBUG_LEVEL KDEBUG_LEVEL_IST +#elif KDEBUG +#define KDEBUG_LEVEL KDEBUG_LEVEL_FULL +#else +#define KDEBUG_LEVEL KDEBUG_LEVEL_STANDARD +/* + * Currently, all other kernel configurations (development, etc) build with + * KDEBUG_LEVEL_STANDARD. + */ +#endif + +/* + * Some Apple internal clients try to use the kernel macros in user space. + */ +#ifndef KERNEL_DEBUG +#define KERNEL_DEBUG(...) do { } while (0) +#endif /* !defined(KERNEL_DEBUG) */ + +#pragma mark - private definitions + +/* + * Ensure that both LP32 and LP64 variants of arm64 use the same kd_buf + * structure. + */ +#if defined(__arm64__) +typedef uint64_t kd_buf_argtype; +#else +typedef uintptr_t kd_buf_argtype; +#endif + +typedef struct { + uint64_t timestamp; + kd_buf_argtype arg1; + kd_buf_argtype arg2; + kd_buf_argtype arg3; + kd_buf_argtype arg4; + kd_buf_argtype arg5; /* the thread ID */ + uint32_t debugid; +/* + * Ensure that both LP32 and LP64 variants of arm64 use the same kd_buf + * structure. + */ +#if defined(__LP64__) || defined(__arm64__) + uint32_t cpuid; + kd_buf_argtype unused; +#endif +} kd_buf; + +#if defined(__LP64__) || defined(__arm64__) +#define KDBG_TIMESTAMP_MASK 0xffffffffffffffffULL +static inline void +kdbg_set_cpu(kd_buf *kp, int cpu) +{ + kp->cpuid = (unsigned int)cpu; +} +static inline int +kdbg_get_cpu(kd_buf *kp) +{ + return (int)kp->cpuid; +} +static inline void +kdbg_set_timestamp(kd_buf *kp, uint64_t thetime) +{ + kp->timestamp = thetime; +} +static inline uint64_t +kdbg_get_timestamp(kd_buf *kp) +{ + return kp->timestamp; +} +static inline void +kdbg_set_timestamp_and_cpu(kd_buf *kp, uint64_t thetime, int cpu) +{ + kdbg_set_timestamp(kp, thetime); + kdbg_set_cpu(kp, cpu); +} +#else +#define KDBG_TIMESTAMP_MASK 0x00ffffffffffffffULL +#define KDBG_CPU_MASK 0xff00000000000000ULL +#define KDBG_CPU_SHIFT 56 +static inline void +kdbg_set_cpu(kd_buf *kp, int cpu) +{ + kp->timestamp = (kp->timestamp & KDBG_TIMESTAMP_MASK) | + (((uint64_t) cpu) << KDBG_CPU_SHIFT); +} +static inline int +kdbg_get_cpu(kd_buf *kp) +{ + return (int) (((kp)->timestamp & KDBG_CPU_MASK) >> KDBG_CPU_SHIFT); +} +static inline void +kdbg_set_timestamp(kd_buf *kp, uint64_t thetime) +{ + kp->timestamp = thetime & KDBG_TIMESTAMP_MASK; +} +static inline uint64_t +kdbg_get_timestamp(kd_buf *kp) +{ + return kp->timestamp & KDBG_TIMESTAMP_MASK; +} +static inline void +kdbg_set_timestamp_and_cpu(kd_buf *kp, uint64_t thetime, int cpu) +{ + kp->timestamp = (thetime & KDBG_TIMESTAMP_MASK) | + (((uint64_t) cpu) << KDBG_CPU_SHIFT); +} +#endif + +/* + * 2^16 bits (8 kilobytes), one for each possible class/subclass combination + */ +#define KDBG_TYPEFILTER_BITMAP_SIZE ((256 * 256) / 8) + +/* + * Bits for kd_ctrl_page.flags, KERN_KD{D,E}FLAGS. + */ +#define KDBG_INIT (1U << 0) /* obsolete */ +/* disable tracing when buffers are full */ +#define KDBG_NOWRAP (1U << 1) +#define KDBG_FREERUN (1U << 2) /* obsolete */ +/* buffer has wrapped */ +#define KDBG_WRAPPED (1U << 3) +/* flags that are allowed to be set by user space */ +#define KDBG_USERFLAGS (KDBG_FREERUN | KDBG_NOWRAP | KDBG_INIT) +/* only include processes with kdebug bit set in proc */ +#define KDBG_PIDCHECK (1U << 4) +/* thread map is initialized */ +#define KDBG_MAPINIT (1U << 5) +/* exclude processes based on kdebug bit in proc */ +#define KDBG_PIDEXCLUDE (1U << 6) +/* whether the kdebug locks are intialized */ +#define KDBG_LOCKINIT (1U << 7) +/* word size of the kernel */ +#define KDBG_LP64 (1U << 8) + +/* bits for kd_ctrl_page.flags and kbufinfo_t.flags */ + +/* only trace events within a range */ +#define KDBG_RANGECHECK 0x00100000U +/* only trace at most 4 types of events, at the code granularity */ +#define KDBG_VALCHECK 0x00200000U +/* check class and subclass against the typefilter */ +#define KDBG_TYPEFILTER_CHECK 0x00400000U +/* kdebug trace buffers are initialized */ +#define KDBG_BUFINIT 0x80000000U + +/* bits for the type field of kd_regtype */ +#define KDBG_CLASSTYPE 0x10000 +#define KDBG_SUBCLSTYPE 0x20000 +#define KDBG_RANGETYPE 0x40000 +#define KDBG_TYPENONE 0x80000 +#define KDBG_CKTYPES 0xF0000 + +typedef struct { + unsigned int type; + unsigned int value1; + unsigned int value2; + unsigned int value3; + unsigned int value4; +} kd_regtype; + +typedef struct { + /* number of events that can fit in the buffers */ + int nkdbufs; + /* set if trace is disabled */ + int nolog; + /* kd_ctrl_page.flags */ + unsigned int flags; + /* number of threads in thread map */ + int nkdthreads; + /* the owning pid */ + int bufid; +} kbufinfo_t; + +typedef struct { + /* the thread ID */ +#if defined(__arm64__) + uint64_t thread; +#else + uintptr_t thread; +#endif + /* 0 for invalid, otherwise the PID (or 1 for kernel_task) */ + int valid; + /* the name of the process owning the thread */ + char command[20]; +} kd_threadmap; + +typedef struct { + uint32_t version_no; + uint32_t cpu_count; +} kd_cpumap_header; + +/* cpumap flags */ +#define KDBG_CPUMAP_IS_IOP 0x1 + +typedef struct { + uint32_t cpu_id; + uint32_t flags; + char name[8]; +} kd_cpumap; + +typedef struct { + int version_no; + int thread_count; + uint64_t TOD_secs; + uint32_t TOD_usecs; +} RAW_header; + +#define RAW_VERSION0 0x55aa0000 +#define RAW_VERSION1 0x55aa0101 +#define RAW_VERSION2 0x55aa0200 /* Only used by kperf and Instruments */ + +/* + * Bits set in the comm page for kdebug. + */ +#define KDEBUG_COMMPAGE_ENABLE_TRACE 0x1 +#define KDEBUG_COMMPAGE_ENABLE_TYPEFILTER 0x2 /* Forced to false if ENABLE_TRACE is 0 */ + +#pragma mark - EnergyTracing + +/* for EnergyTracing user space & clients */ +#define kEnTrCompKernel 2 + +/* + * EnergyTracing opcodes + * + * Activations use DBG_FUNC_START/END. + * Events are DBG_FUNC_NONE. + */ + +/* Socket reads and writes are uniquely identified by the (sanitized) + * pointer to the socket struct in question. To associate this address + * with the user space file descriptor, we have a socket activation with + * the FD as its identifier and the socket struct pointer as its value. + */ +#define kEnTrActKernSocket 1 +#define kEnTrActKernSockRead 2 +#define kEnTrActKernSockWrite 3 + +#define kEnTrActKernPoll 10 +#define kEnTrActKernSelect 11 +#define kEnTrActKernKQWait 12 + +// events +#define kEnTrEvUnblocked 256 + +// EnergyTracing flags (the low-order 16 bits of 'quality') +#define kEnTrFlagNonBlocking 1 << 0 +#define kEnTrFlagNoWork 1 << 1 + +/* + * EnergyTracing macros. + */ + +#if (KDEBUG_LEVEL >= KDEBUG_LEVEL_IST) +// whether to bother calculating EnergyTracing inputs +// could change in future to see if DBG_ENERGYTRACE is active +#define ENTR_SHOULDTRACE kdebug_enable +// encode logical EnergyTracing into 32/64 KDebug trace +#define ENTR_KDTRACE(component, opcode, lifespan, id, quality, value) \ +do { \ + uint32_t kdcode__; \ + uintptr_t highval__, lowval__, mask__ = 0xffffffff; \ + kdcode__ = KDBG_CODE(DBG_ENERGYTRACE,component,opcode)|(lifespan); \ + highval__ = ((value) >> 32) & mask__; \ + lowval__ = (value) & mask__; \ + ENTR_KDTRACEFUNC(kdcode__, id, quality, highval__, lowval__); \ +} while(0) + +/* + * Trace the association of two existing activations. + * + * An association is traced as a modification to the parent activation. + * In order to fit the sub-activation's component, activation code, and + * activation ID into a kdebug tracepoint, the arguments that would hold + * the value are left separate, and one stores the component and opcode + * of the sub-activation, while the other stores the pointer-sized + * activation ID. + * + * arg2 arg3 arg4 + +-----------------+ +~+----+----+--------+ +----------+ + |kEnTrModAssociate| | | | | | | | + +-----------------+ +~+----+----+--------+ +----------+ + * 8-bits unused sub-activation ID + * 8-bit sub-component + * 16-bit sub-opcode + * + */ +#define kEnTrModAssociate (1 << 28) +#define ENTR_KDASSOCIATE(par_comp, par_opcode, par_act_id, \ + sub_comp, sub_opcode, sub_act_id) \ +do { \ + unsigned sub_compcode = ((unsigned)sub_comp << 16) | sub_opcode; \ + ENTR_KDTRACEFUNC(KDBG_CODE(DBG_ENERGYTRACE,par_comp,par_opcode), \ + par_act_id, kEnTrModAssociate, sub_compcode, \ + sub_act_id); \ +} while(0) + +#else /* (KDEBUG_LEVEL >= KDEBUG_LEVEL_IST) */ + +#define ENTR_SHOULDTRACE FALSE +#define ENTR_KDTRACE(component, opcode, lifespan, id, quality, value) \ + do {} while (0) +#define ENTR_KDASSOCIATE(par_comp, par_opcode, par_act_id, \ + sub_comp, sub_opcode, sub_act_id) \ + do {} while (0) + +#endif /* (KDEBUG_LEVEL >= KDEBUG_LEVEL_IST) */ + +#endif /* !defined(DRIVERKIT) */ + +__END_DECLS + +#endif /* !defined(BSD_KDEBUG_PRIVATE_H) */ diff --git a/bsd/tests/bsd_tests.c b/bsd/tests/bsd_tests.c index a53bbdc19..3a3622fc7 100644 --- a/bsd/tests/bsd_tests.c +++ b/bsd/tests/bsd_tests.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015 Apple Inc. All rights reserved. + * Copyright (c) 2019 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -51,13 +51,27 @@ extern kern_return_t arm64_lock_test(void); #endif kern_return_t kalloc_test(void); kern_return_t ipi_test(void); +#if __ARM_PAN_AVAILABLE__ +extern kern_return_t arm64_late_pan_test(void); +#endif +#if HAS_TWO_STAGE_SPR_LOCK +extern kern_return_t arm64_spr_lock_test(void); +#endif +extern kern_return_t copyio_test(void); struct xnupost_test bsd_post_tests[] = { #ifdef __arm64__ XNUPOST_TEST_CONFIG_BASIC(arm64_lock_test), +#endif +#if __ARM_PAN_AVAILABLE__ + XNUPOST_TEST_CONFIG_BASIC(arm64_late_pan_test), #endif XNUPOST_TEST_CONFIG_BASIC(kalloc_test), - XNUPOST_TEST_CONFIG_BASIC(ipi_test) + XNUPOST_TEST_CONFIG_BASIC(ipi_test), +#if HAS_TWO_STAGE_SPR_LOCK + XNUPOST_TEST_CONFIG_BASIC(arm64_spr_lock_test), +#endif + XNUPOST_TEST_CONFIG_BASIC(copyio_test), }; uint32_t bsd_post_tests_count = sizeof(bsd_post_tests) / sizeof(xnupost_test_data_t); @@ -130,17 +144,17 @@ kalloc_test() #define XNUPOST_TNAME_MAXLEN 132 struct kcdata_subtype_descriptor kc_xnupost_test_def[] = { - {KCS_SUBTYPE_FLAGS_NONE, KC_ST_UINT16, 0, sizeof(uint16_t), "config"}, - {KCS_SUBTYPE_FLAGS_NONE, KC_ST_UINT16, 1 * sizeof(uint16_t), sizeof(uint16_t), "test_num"}, - {KCS_SUBTYPE_FLAGS_NONE, KC_ST_INT32, 2 * sizeof(uint16_t), sizeof(int32_t), "retval"}, - {KCS_SUBTYPE_FLAGS_NONE, KC_ST_INT32, 2 * sizeof(uint16_t) + sizeof(int32_t), sizeof(int32_t), "expected_retval"}, - {KCS_SUBTYPE_FLAGS_NONE, KC_ST_UINT64, 2 * (sizeof(uint16_t) + sizeof(int32_t)), sizeof(uint64_t), "begin_time"}, - {KCS_SUBTYPE_FLAGS_NONE, KC_ST_UINT64, 2 * (sizeof(uint16_t) + sizeof(int32_t)) + sizeof(uint64_t), sizeof(uint64_t), "end_time"}, - {KCS_SUBTYPE_FLAGS_ARRAY, - KC_ST_CHAR, - 2 * (sizeof(uint16_t) + sizeof(int32_t) + sizeof(uint64_t)), - KCS_SUBTYPE_PACK_SIZE(XNUPOST_TNAME_MAXLEN * sizeof(char), sizeof(char)), - "test_name"} + {.kcs_flags = KCS_SUBTYPE_FLAGS_NONE, .kcs_elem_type = KC_ST_UINT16, .kcs_elem_offset = 0, .kcs_elem_size = sizeof(uint16_t), .kcs_name = "config"}, + {.kcs_flags = KCS_SUBTYPE_FLAGS_NONE, .kcs_elem_type = KC_ST_UINT16, .kcs_elem_offset = 1 * sizeof(uint16_t), .kcs_elem_size = sizeof(uint16_t), .kcs_name = "test_num"}, + {.kcs_flags = KCS_SUBTYPE_FLAGS_NONE, .kcs_elem_type = KC_ST_INT32, .kcs_elem_offset = 2 * sizeof(uint16_t), .kcs_elem_size = sizeof(int32_t), .kcs_name = "retval"}, + {.kcs_flags = KCS_SUBTYPE_FLAGS_NONE, .kcs_elem_type = KC_ST_INT32, .kcs_elem_offset = 2 * sizeof(uint16_t) + sizeof(int32_t), .kcs_elem_size = sizeof(int32_t), .kcs_name = "expected_retval"}, + {.kcs_flags = KCS_SUBTYPE_FLAGS_NONE, .kcs_elem_type = KC_ST_UINT64, .kcs_elem_offset = 2 * (sizeof(uint16_t) + sizeof(int32_t)), .kcs_elem_size = sizeof(uint64_t), .kcs_name = "begin_time"}, + {.kcs_flags = KCS_SUBTYPE_FLAGS_NONE, .kcs_elem_type = KC_ST_UINT64, .kcs_elem_offset = 2 * (sizeof(uint16_t) + sizeof(int32_t)) + sizeof(uint64_t), .kcs_elem_size = sizeof(uint64_t), .kcs_name = "end_time"}, + {.kcs_flags = KCS_SUBTYPE_FLAGS_ARRAY, + .kcs_elem_type = KC_ST_CHAR, + .kcs_elem_offset = 2 * (sizeof(uint16_t) + sizeof(int32_t) + sizeof(uint64_t)), + .kcs_elem_size = KCS_SUBTYPE_PACK_SIZE(XNUPOST_TNAME_MAXLEN * sizeof(char), sizeof(char)), + .kcs_name = "test_name"} }; const uint32_t kc_xnupost_test_def_count = sizeof(kc_xnupost_test_def) / sizeof(struct kcdata_subtype_descriptor); diff --git a/bsd/tests/copyio_tests.c b/bsd/tests/copyio_tests.c new file mode 100644 index 000000000..f3594be79 --- /dev/null +++ b/bsd/tests/copyio_tests.c @@ -0,0 +1,561 @@ +/* + * Copyright (c) 2019 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +kern_return_t copyio_test(void); + +#define copyio_test_buf_size (PAGE_SIZE * 16) +static const char copyio_test_string[] = {'T', 'e', 's', 't', ' ', 'S', 't', 'r', 'i', 'n', 'g', '!', '\0', 'A', 'B', 'C'}; + +struct copyio_test_data { + /* VM map of the current userspace process. */ + vm_map_t user_map; + /* The start of a `copyio_test_buf_size'-sized region mapped into userspace. */ + mach_vm_offset_t user_addr; + /* The start of a page-sized region that guaranteed to be unmapped in userspace. */ + mach_vm_offset_t unmapped_addr; + /* The start of a page-sized region mapped at the largest possible userspace address. */ + mach_vm_offset_t user_lastpage_addr; + /* Kernel mapping of the physical pages mapped at `user_addr'. */ + void *kern_addr; + + /* Scratch buffers of size `copyio_test_buf_size'. */ + char *buf1, *buf2; + /* Scratch data to pass to helper threads */ + union { + void *thread_ptr; + uint64_t thread_data; + }; +}; + +typedef int (*copyio_thread_fn_t)(struct copyio_test_data *); + +struct copyio_test_thread_data { + copyio_thread_fn_t fn; + struct copyio_test_data *data; + int ret; + semaphore_t done; +}; + +static void +copyio_thread_call_fn(void *arg, wait_result_t __unused res) +{ + struct copyio_test_thread_data *tdata = arg; + tdata->ret = tdata->fn(tdata->data); + semaphore_signal(tdata->done); +} + +static int +copyio_test_run_in_thread(copyio_thread_fn_t fn, struct copyio_test_data *data) +{ + struct copyio_test_thread_data tdata = { + .fn = fn, + .data = data, + }; + thread_t thread; + + semaphore_create(current_task(), &tdata.done, SYNC_POLICY_FIFO, 0); + kernel_thread_start(copyio_thread_call_fn, &tdata, &thread); + + semaphore_wait(tdata.done); + + thread_deallocate(thread); + semaphore_destroy(current_task(), tdata.done); + + return tdata.ret; +} + +static void +copyio_test_protect(struct copyio_test_data *data, vm_prot_t prot) +{ + kern_return_t ret = mach_vm_protect(data->user_map, data->user_addr, copyio_test_buf_size, false, prot); + assert(ret == KERN_SUCCESS); +} + +static int +copyin_from_kernel(struct copyio_test_data *data) +{ + char *in_buf = data->buf2; + return copyin((uintptr_t)data->kern_addr, in_buf, copyio_test_buf_size); +} + +static void +copyin_test(struct copyio_test_data *data) +{ + char *out_buf = data->buf1; + char *in_buf = data->buf2; + + for (size_t i = 0; i < copyio_test_buf_size; i++) { + out_buf[i] = (char)i; + } + memcpy(data->kern_addr, out_buf, copyio_test_buf_size); + + int err = copyin(data->user_addr, in_buf, copyio_test_buf_size); + T_EXPECT_EQ_INT(err, 0, "copyin() with valid parameters should succeed"); + int cmp = memcmp(out_buf, in_buf, copyio_test_buf_size); + T_EXPECT_EQ_INT(cmp, 0, "copyin() should correctly copy in data"); + + err = copyin(data->unmapped_addr, NULL, 0); + T_EXPECT_EQ_INT(err, 0, "copyin() with 0 size should always succeed"); + + err = copyin(data->unmapped_addr, in_buf, copyio_test_buf_size); + T_EXPECT_EQ_INT(err, EFAULT, "copyin() from unmapped userspace address should return EFAULT"); + err = copyin(data->unmapped_addr - PAGE_SIZE, in_buf, PAGE_SIZE * 2); + T_EXPECT_EQ_INT(err, EFAULT, "copyin() from partially valid userspace range should return EFAULT"); + err = copyin(data->user_lastpage_addr, in_buf, PAGE_SIZE * 2); + T_EXPECT_EQ_INT(err, EFAULT, "copyin() past end of userspace address space should return EFAULT"); + + bzero(in_buf, copyio_test_buf_size); + err = copyio_test_run_in_thread(copyin_from_kernel, data); + T_EXPECT_EQ_INT(err, 0, "copyin() from kernel address in kernel_task thread should succeed"); + cmp = memcmp(data->kern_addr, in_buf, copyio_test_buf_size); + T_EXPECT_EQ_INT(cmp, 0, "copyin() from kernel address should correctly copy in data"); + err = copyin_from_kernel(data); + T_EXPECT_EQ_INT(err, EFAULT, "copyin() from kernel address in other threads should return EFAULT"); + + copyio_test_protect(data, VM_PROT_WRITE); + err = copyin(data->user_addr, in_buf, copyio_test_buf_size); + T_EXPECT_EQ_INT(err, EFAULT, "copyin() from write-only address should return EFAULT"); + copyio_test_protect(data, VM_PROT_READ | VM_PROT_WRITE); +} + +static int +copyout_to_kernel(struct copyio_test_data *data) +{ + char *out_buf = data->buf1; + return copyout(out_buf, (uintptr_t)data->kern_addr, copyio_test_buf_size); +} + +static void +copyout_test(struct copyio_test_data *data) +{ + char *out_buf = data->buf1; + + bzero(data->kern_addr, copyio_test_buf_size); + + for (size_t i = 0; i < copyio_test_buf_size; i++) { + out_buf[i] = ~(char)i; + } + int err = copyout(out_buf, data->user_addr, copyio_test_buf_size); + T_EXPECT_EQ_INT(err, 0, "copyout() with valid parameters should succeed"); + + int cmp = memcmp(data->kern_addr, out_buf, copyio_test_buf_size); + T_EXPECT_EQ_INT(cmp, 0, "copyout() should correctly copy out data"); + + err = copyout(NULL, data->unmapped_addr, 0); + T_EXPECT_EQ_INT(err, 0, "copyout() with 0 size should always succeed"); + + err = copyout(out_buf, data->unmapped_addr, copyio_test_buf_size); + T_EXPECT_EQ_INT(err, EFAULT, "copyout() to unmapped userspace address should return EFAULT"); + err = copyout(out_buf, data->unmapped_addr - PAGE_SIZE, PAGE_SIZE * 2); + T_EXPECT_EQ_INT(err, EFAULT, "copyout() to partially valid userspace range should return EFAULT"); + err = copyout(out_buf, data->user_lastpage_addr, PAGE_SIZE * 2); + T_EXPECT_EQ_INT(err, EFAULT, "copyout() past end of userspace address space should return EFAULT"); + + bzero(data->kern_addr, copyio_test_buf_size); + + err = copyio_test_run_in_thread(copyout_to_kernel, data); + T_EXPECT_EQ_INT(err, 0, "copyout() to kernel address in kernel_task thread should succeed"); + cmp = memcmp(out_buf, data->kern_addr, copyio_test_buf_size); + T_EXPECT_EQ_INT(cmp, 0, "copyout() to kernel address should correctly copy out data"); + err = copyout_to_kernel(data); + T_EXPECT_EQ_INT(err, EFAULT, "copyout() to kernel address in other threads should return EFAULT"); + + copyio_test_protect(data, VM_PROT_READ); + err = copyout(out_buf, data->user_addr, copyio_test_buf_size); + T_EXPECT_EQ_INT(err, EFAULT, "copyout() to read-only address should return EFAULT"); + copyio_test_protect(data, VM_PROT_READ | VM_PROT_WRITE); +} + +static int +copyinstr_from_kernel(struct copyio_test_data *data) +{ + char *in_buf = data->buf1; + size_t *lencopied = data->thread_ptr; + return copyinstr((user_addr_t)data->kern_addr, in_buf, copyio_test_buf_size, lencopied); +} + +static void +copyinstr_test(struct copyio_test_data *data) +{ + char *in_buf = data->buf1; + + memcpy(data->kern_addr, copyio_test_string, sizeof(copyio_test_string)); + + bzero(in_buf, copyio_test_buf_size); + size_t lencopied; + int err = copyinstr(data->user_addr, in_buf, copyio_test_buf_size, &lencopied); + T_EXPECT_EQ_INT(err, 0, "copyinstr() with valid parameters should succeed"); + T_EXPECT_EQ_ULONG(lencopied, strlen(copyio_test_string) + 1, "copyinstr() with a large enough buffer should read entire string"); + + int cmp = strncmp(in_buf, copyio_test_string, lencopied); + T_EXPECT_EQ_INT(cmp, 0, "copyinstr() should correctly copy string up to NULL terminator"); + cmp = memcmp(in_buf, copyio_test_string, sizeof(copyio_test_string)); + T_EXPECT_NE_INT(cmp, 0, "copyinstr() should not read past NULL terminator"); + + bzero(in_buf, copyio_test_buf_size); + const vm_size_t trunc_size = strlen(copyio_test_string) - 4; + err = copyinstr(data->user_addr, in_buf, trunc_size, &lencopied); + T_EXPECT_EQ_INT(err, ENAMETOOLONG, "truncated copyinstr() should return ENAMETOOLONG"); + T_EXPECT_EQ_ULONG(lencopied, trunc_size, "truncated copyinstr() should copy exactly `maxlen' bytes"); + cmp = memcmp(in_buf, copyio_test_string, trunc_size); + T_EXPECT_EQ_INT(cmp, 0, "copyinstr() should correctly copy in truncated string"); + cmp = memcmp(in_buf, copyio_test_string, strlen(copyio_test_string)); + T_EXPECT_NE_INT(cmp, 0, "copyinstr() should stop copying at `maxlen' bytes"); + + err = copyinstr(data->unmapped_addr, in_buf, copyio_test_buf_size, &lencopied); + T_EXPECT_EQ_INT(err, EFAULT, "copyinstr() from unmapped userspace address should return EFAULT"); + err = copyinstr(data->user_lastpage_addr, in_buf, PAGE_SIZE * 2, &lencopied); + T_EXPECT_EQ_INT(err, EFAULT, "copyinstr() past end of userspace address space should return EFAULT"); + + bzero(in_buf, copyio_test_buf_size); + data->thread_ptr = &lencopied; + + err = copyio_test_run_in_thread(copyinstr_from_kernel, data); +#if defined(CONFIG_EMBEDDED) + T_EXPECT_EQ_INT(err, EFAULT, "copyinstr() from kernel address in kernel_task thread should return EFAULT"); +#else + T_EXPECT_EQ_INT(err, 0, "copyinstr() from kernel address in kernel_task thread should succeed"); + T_EXPECT_EQ_ULONG(lencopied, strlen(copyio_test_string) + 1, "copyinstr() from kernel address should read entire string"); + cmp = strncmp(in_buf, copyio_test_string, lencopied); + T_EXPECT_EQ_INT(cmp, 0, "copyinstr() from kernel address should correctly copy string up to NULL terminator"); + cmp = memcmp(in_buf, copyio_test_string, sizeof(copyio_test_string)); + T_EXPECT_NE_INT(cmp, 0, "copyinstr() from kernel address should not read past NULL terminator"); +#endif + err = copyinstr_from_kernel(data); + T_EXPECT_EQ_INT(err, EFAULT, "copyinstr() from kernel address in other threads should return EFAULT"); + + copyio_test_protect(data, VM_PROT_WRITE); + err = copyinstr(data->user_addr, in_buf, copyio_test_buf_size, &lencopied); + T_EXPECT_EQ_INT(err, EFAULT, "copyinstr() from write-only address should return EFAULT"); + copyio_test_protect(data, VM_PROT_READ | VM_PROT_WRITE); + + /* Place an unterminated string at the end of the mapped region */ + const size_t unterminated_size = 16; + char *kern_unterminated_addr = (char *)data->kern_addr + copyio_test_buf_size - unterminated_size; + memset(kern_unterminated_addr, 'A', unterminated_size); + + mach_vm_offset_t user_unterminated_addr = data->user_addr + copyio_test_buf_size - unterminated_size; + err = copyinstr(user_unterminated_addr, in_buf, copyio_test_buf_size, &lencopied); + T_EXPECT_EQ_INT(err, EFAULT, "copyinstr() from userspace region without NULL terminator should return EFAULT"); +} + +static int +copyoutstr_to_kernel(struct copyio_test_data *data) +{ + size_t *lencopied = data->thread_ptr; + return copyoutstr(copyio_test_string, (user_addr_t)data->kern_addr, sizeof(copyio_test_string), lencopied); +} + +static void +copyoutstr_test(struct copyio_test_data *data) +{ + bzero(data->kern_addr, sizeof(copyio_test_string)); + + size_t lencopied; + int err = copyoutstr(copyio_test_string, data->user_addr, sizeof(copyio_test_string), &lencopied); + T_EXPECT_EQ_INT(err, 0, "copyoutstr() with valid parameters should succeed"); + T_EXPECT_EQ_ULONG(lencopied, strlen(copyio_test_string) + 1, "copyoutstr() should copy string up to NULL terminator"); + + int cmp = strncmp(data->kern_addr, copyio_test_string, sizeof(copyio_test_string)); + T_EXPECT_EQ_INT(cmp, 0, "copyoutstr() should correctly copy out string"); + cmp = memcmp(data->kern_addr, copyio_test_string, sizeof(copyio_test_string)); + T_EXPECT_NE_INT(cmp, 0, "copyoutstr() should stop copying at NULL terminator"); + + bzero(data->kern_addr, sizeof(copyio_test_string)); + + const vm_size_t trunc_size = strlen(copyio_test_string) - 4; + err = copyoutstr(copyio_test_string, data->user_addr, trunc_size, &lencopied); + T_EXPECT_EQ_INT(err, ENAMETOOLONG, "truncated copyoutstr() should return ENAMETOOLONG"); + T_EXPECT_EQ_ULONG(lencopied, trunc_size, "truncated copyoutstr() should copy exactly `maxlen' bytes"); + cmp = strncmp(data->kern_addr, copyio_test_string, trunc_size); + T_EXPECT_EQ_INT(cmp, 0, "copyoutstr() should correctly copy out truncated string"); + cmp = memcmp(data->kern_addr, copyio_test_string, sizeof(copyio_test_string)); + T_EXPECT_NE_INT(cmp, 0, "copyoutstr() should stop copying at `maxlen' bytes"); + + err = copyoutstr(copyio_test_string, data->unmapped_addr, strlen(copyio_test_string), &lencopied); + T_EXPECT_EQ_INT(err, EFAULT, "copyoutstr() to unmapped userspace address should return EFAULT"); + err = copyoutstr(copyio_test_string, data->unmapped_addr - 1, strlen(copyio_test_string), &lencopied); + T_EXPECT_EQ_INT(err, EFAULT, "copyoutstr() to partially valid userspace range should return EFAULT"); + err = copyoutstr(copyio_test_string, data->user_lastpage_addr + PAGE_SIZE - 1, strlen(copyio_test_string), &lencopied); + T_EXPECT_EQ_INT(err, EFAULT, "copyoutstr() past end of userspace address space should return EFAULT"); + + bzero(data->kern_addr, sizeof(copyio_test_string)); + data->thread_ptr = &lencopied; + + err = copyio_test_run_in_thread(copyoutstr_to_kernel, data); +#if defined(CONFIG_EMBEDDED) + T_EXPECT_EQ_INT(err, EFAULT, "copyoutstr() to kernel address in kernel_task thread should return EFAULT"); +#else + T_EXPECT_EQ_INT(err, 0, "copyoutstr() to kernel address in kernel_task thread should succeed"); + T_EXPECT_EQ_ULONG(lencopied, strlen(copyio_test_string) + 1, "copyoutstr() to kernel address should copy string up to NULL terminator"); + cmp = strncmp(data->kern_addr, copyio_test_string, sizeof(copyio_test_string)); + T_EXPECT_EQ_INT(cmp, 0, "copyoutstr() to kernel address should correctly copy out data"); +#endif + err = copyoutstr_to_kernel(data); + T_EXPECT_EQ_INT(err, EFAULT, "copyoutstr() to kernel address in other threads should return EFAULT"); + + copyio_test_protect(data, VM_PROT_READ); + err = copyoutstr(copyio_test_string, data->user_addr, strlen(copyio_test_string), &lencopied); + T_EXPECT_EQ_INT(err, EFAULT, "copyoutstr() to read-only address should return EFAULT"); + copyio_test_protect(data, VM_PROT_READ | VM_PROT_WRITE); +} + +static int +copyin_atomic32_from_kernel(struct copyio_test_data *data) +{ + return copyin_atomic32((uintptr_t)data->kern_addr, data->thread_ptr); +} + +static int +copyin_atomic64_from_kernel(struct copyio_test_data *data) +{ + return copyin_atomic64((uintptr_t)data->kern_addr, data->thread_ptr); +} + +static int +copyout_atomic32_to_kernel(struct copyio_test_data *data) +{ + return copyout_atomic32(data->thread_data, (user_addr_t)data->kern_addr); +} + +static int +copyout_atomic64_to_kernel(struct copyio_test_data *data) +{ + return copyout_atomic64(data->thread_data, (user_addr_t)data->kern_addr); +} + +/** + * Note: we can't test atomic copyio calls which go past the end of the + * userspace address space, since there's no way to provide a range + * that straddles the userspace address boundary while being suitably + * aligned for the copy. + */ +#define copyin_atomic_test(data, word_t, copyin_fn, copyin_from_kernel_fn) \ + do { \ + const word_t word_out = (word_t)0x123456789ABCDEF0UL; \ + word_t word_in = 0; \ + memcpy(data->kern_addr, &word_out, sizeof(word_out)); \ + \ + int err = copyin_fn(data->user_addr, &word_in); \ + T_EXPECT_EQ_INT(err, 0, #copyin_fn "() with valid parameters should succeed"); \ + \ + int cmp = memcmp(&word_in, &word_out, sizeof(word_t)); \ + T_EXPECT_EQ_INT(cmp, 0, #copyin_fn "() should correctly copy word"); \ + \ + for (unsigned int offset = 1; offset < sizeof(word_t); offset++) { \ + err = copyin_fn(data->user_addr + offset, &word_in); \ + T_EXPECT_EQ_INT(err, EINVAL, \ + #copyin_fn "() from unaligned userspace address should return EINVAL (offset = %u)", \ + offset); \ + }; \ + err = copyin_fn(data->unmapped_addr, &word_in); \ + T_EXPECT_EQ_INT(err, EFAULT, #copyin_fn "() from unmapped userspace address should return EFAULT"); \ + \ + data->thread_ptr = &word_in; \ + \ + err = copyio_test_run_in_thread(copyin_from_kernel_fn, data); \ + T_EXPECT_EQ_INT(err, EFAULT, \ + #copyin_fn "() from kernel address in kernel_task threads should return EFAULT"); \ + err = copyin_from_kernel_fn(data); \ + T_EXPECT_EQ_INT(err, EFAULT, \ + #copyin_fn "() from kernel address in other threads should return EFAULT"); \ + \ + copyio_test_protect(data, VM_PROT_WRITE); \ + err = copyin_fn(data->user_addr, &word_in); \ + T_EXPECT_EQ_INT(err, EFAULT, #copyin_fn "() from write-only address should return EFAULT"); \ + copyio_test_protect(data, VM_PROT_READ | VM_PROT_WRITE); \ + } while (0) + +#define copyout_atomic_test(data, word_t, copyout_fn, copyout_to_kernel_fn) \ + do { \ + const word_t word_out = (word_t)0x123456789ABCDEF0UL; \ + bzero(data->kern_addr, sizeof(word_t)); \ + \ + int err = copyout_fn(word_out, data->user_addr); \ + T_EXPECT_EQ_INT(err, 0, #copyout_fn "() with valid parameters should succeed"); \ + \ + int cmp = memcmp(data->kern_addr, &word_out, sizeof(word_t)); \ + T_EXPECT_EQ_INT(cmp, 0, #copyout_fn "() should correctly copy word"); \ + \ + for (unsigned int offset = 1; offset < sizeof(word_t); offset++) { \ + err = copyout_fn(word_out, data->user_addr + offset); \ + T_EXPECT_EQ_INT(err, EINVAL, \ + #copyout_fn "() to unaligned userspace address should return EINVAL (offset = %u)", \ + offset); \ + }; \ + err = copyout_fn(word_out, data->unmapped_addr); \ + T_EXPECT_EQ_INT(err, EFAULT, #copyout_fn "() to unmapped userspace address should return EFAULT"); \ + err = copyout_fn(word_out, (uintptr_t)data->kern_addr); \ + T_EXPECT_EQ_INT(err, EFAULT, #copyout_fn "() to kernel address should return EFAULT"); \ + \ + data->thread_data = word_out; \ + \ + err = copyio_test_run_in_thread(copyout_to_kernel_fn, data); \ + T_EXPECT_EQ_INT(err, EFAULT, \ + #copyout_fn "() to kernel address in kernel_task thread should return EFAULT"); \ + err = copyout_to_kernel_fn(data); \ + T_EXPECT_EQ_INT(err, EFAULT, #copyout_fn "() to kernel address in other threads should return EFAULT"); \ + \ + copyio_test_protect(data, VM_PROT_READ); \ + err = copyout_fn(word_out, data->user_addr); \ + T_EXPECT_EQ_INT(err, EFAULT, #copyout_fn "() to read-only address should return EFAULT"); \ + copyio_test_protect(data, VM_PROT_READ | VM_PROT_WRITE); \ + } while (0) + +#define copyio_atomic_test(data, size) \ + do { \ + copyin_atomic_test((data), uint ## size ## _t, copyin_atomic ## size, \ + copyin_atomic ## size ## _from_kernel); \ + copyout_atomic_test((data), uint ## size ## _t, copyout_atomic ## size, \ + copyout_atomic ## size ## _to_kernel); \ + } while (0) + +static int +copyin_atomic32_wait_if_equals_from_kernel(struct copyio_test_data *data) +{ + return copyin_atomic32_wait_if_equals((uintptr_t)data->kern_addr, data->thread_data); +} + +static void +copyin_atomic32_wait_if_equals_test(struct copyio_test_data *data) +{ + bzero(data->kern_addr, sizeof(uint32_t)); + int err = copyin_atomic32_wait_if_equals(data->user_addr, 0); + T_EXPECT_EQ_INT(err, 0, "copyin_atomic32_wait_if_equals() should return 0 when equals"); + err = copyin_atomic32_wait_if_equals(data->user_addr, ~0U); + T_EXPECT_EQ_INT(err, ESTALE, "copyin_atomic32_wait_if_equals() should return ESTALE when not equals"); + + for (unsigned int offset = 1; offset < sizeof(uint32_t); offset++) { + err = copyin_atomic32_wait_if_equals(data->user_addr + offset, 0); + T_EXPECT_EQ_INT(err, EINVAL, + "copyin_atomic32_wait_if_equals() on unaligned userspace address should return EINVAL (offset = %u)", + offset); + } + err = copyin_atomic32_wait_if_equals(data->unmapped_addr, 0); + T_EXPECT_EQ_INT(err, EFAULT, "copyin_atomic32_wait_if_equals() on unmapped userspace address should return EFAULT"); + + data->thread_data = 0; + + err = copyio_test_run_in_thread(copyin_atomic32_wait_if_equals_from_kernel, data); + T_EXPECT_EQ_INT(err, EFAULT, "copyin_atomic32_wait_if_equals() from kernel address in kernel_task thread should return EFAULT"); + err = copyin_atomic32_wait_if_equals_from_kernel(data); + T_EXPECT_EQ_INT(err, EFAULT, "copyin_atomic32_wait_if_equals() from kernel address in other threads should return EFAULT"); + + copyio_test_protect(data, VM_PROT_WRITE); + err = copyin_atomic32_wait_if_equals(data->user_addr, 0); + T_EXPECT_EQ_INT(err, EFAULT, "copyin_atomic32_wait_if_equals() on write-only address should return EFAULT"); + copyio_test_protect(data, VM_PROT_READ | VM_PROT_WRITE); +} + +kern_return_t +copyio_test(void) +{ + struct copyio_test_data data = {}; + kern_return_t ret = KERN_SUCCESS; + + data.buf1 = kalloc(copyio_test_buf_size); + data.buf2 = kalloc(copyio_test_buf_size); + if (!data.buf1 || !data.buf2) { + T_FAIL("failed to allocate scratch buffers"); + ret = KERN_NO_SPACE; + goto err_kalloc; + } + + /** + * This test needs to manipulate the current userspace process's + * address space. This is okay to do at the specific point in time + * when bsd_do_post() runs: current_proc() points to the init process, + * which has been set up to the point of having a valid vm_map, but + * not to the point of actually execing yet. + */ + proc_t proc = current_proc(); + assert(proc->p_pid == 1); + data.user_map = get_task_map_reference(proc->task); + + ret = mach_vm_allocate_kernel(data.user_map, &data.user_addr, copyio_test_buf_size + PAGE_SIZE, VM_FLAGS_ANYWHERE, VM_KERN_MEMORY_NONE); + if (ret) { + T_FAIL("mach_vm_allocate_kernel(user_addr) failed: %d", ret); + goto err_user_alloc; + } + + data.user_lastpage_addr = get_map_max(data.user_map) - PAGE_SIZE; + ret = mach_vm_allocate_kernel(data.user_map, &data.user_lastpage_addr, PAGE_SIZE, VM_FLAGS_FIXED, VM_KERN_MEMORY_NONE); + if (ret) { + T_FAIL("mach_vm_allocate_kernel(user_lastpage_addr) failed: %d", ret); + goto err_user_lastpage_alloc; + } + + data.unmapped_addr = data.user_addr + copyio_test_buf_size; + mach_vm_deallocate(data.user_map, data.unmapped_addr, PAGE_SIZE); + + vm_prot_t cur_protection, max_protection; + mach_vm_offset_t kern_addr = 0; + ret = mach_vm_remap_kernel(kernel_map, &kern_addr, copyio_test_buf_size, VM_PROT_READ | VM_PROT_WRITE, VM_FLAGS_ANYWHERE, VM_KERN_MEMORY_NONE, + data.user_map, data.user_addr, false, &cur_protection, &max_protection, VM_INHERIT_NONE); + if (ret) { + T_FAIL("mach_vm_remap_kernel() failed: %d", ret); + goto err_kern_remap; + } + data.kern_addr = (void *)kern_addr; + + copyin_test(&data); + copyout_test(&data); + copyinstr_test(&data); + copyoutstr_test(&data); + copyio_atomic_test(&data, 32); + copyio_atomic_test(&data, 64); + copyin_atomic32_wait_if_equals_test(&data); + + mach_vm_deallocate(kernel_map, kern_addr, copyio_test_buf_size); +err_kern_remap: + mach_vm_deallocate(data.user_map, data.user_lastpage_addr, PAGE_SIZE); +err_user_lastpage_alloc: + mach_vm_deallocate(data.user_map, data.user_addr, copyio_test_buf_size); +err_user_alloc: + vm_map_deallocate(data.user_map); +err_kalloc: + kfree(data.buf2, copyio_test_buf_size); + kfree(data.buf1, copyio_test_buf_size); + return ret; +} diff --git a/bsd/tests/pmap_test_sysctl.c b/bsd/tests/pmap_test_sysctl.c index f94028df8..ad27ee5ed 100644 --- a/bsd/tests/pmap_test_sysctl.c +++ b/bsd/tests/pmap_test_sysctl.c @@ -30,6 +30,7 @@ extern kern_return_t test_pmap_enter_disconnect(unsigned int); extern kern_return_t test_pmap_iommu_disconnect(void); +extern kern_return_t test_pmap_extended(void); static int sysctl_test_pmap_enter_disconnect(__unused struct sysctl_oid *oidp, __unused void *arg1, __unused int arg2, struct sysctl_req *req) @@ -62,3 +63,19 @@ sysctl_test_pmap_iommu_disconnect(__unused struct sysctl_oid *oidp, __unused voi SYSCTL_PROC(_kern, OID_AUTO, pmap_iommu_disconnect_test, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, 0, 0, sysctl_test_pmap_iommu_disconnect, "I", ""); + +static int +sysctl_test_pmap_extended(__unused struct sysctl_oid *oidp, __unused void *arg1, __unused int arg2, struct sysctl_req *req) +{ + unsigned int run = 0; + int error, changed; + error = sysctl_io_number(req, 0, sizeof(run), &run, &changed); + if (error || !changed) { + return error; + } + return test_pmap_extended(); +} + +SYSCTL_PROC(_kern, OID_AUTO, pmap_extended_test, + CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, + 0, 0, sysctl_test_pmap_extended, "I", ""); diff --git a/bsd/vfs/kpi_vfs.c b/bsd/vfs/kpi_vfs.c index a72dd4259..79a40a817 100644 --- a/bsd/vfs/kpi_vfs.c +++ b/bsd/vfs/kpi_vfs.c @@ -719,7 +719,13 @@ vfs_setattr(mount_t mp, struct vfs_attr *vfa, vfs_context_t ctx) { int error; - if (vfs_isrdonly(mp)) { + /* + * with a read-only system volume, we need to allow rename of the root volume + * even if it's read-only. Don't return EROFS here if setattr changes only + * the volume name + */ + if (vfs_isrdonly(mp) && + !((mp->mnt_flag & MNT_ROOTFS) && (vfa->f_active == VFSATTR_f_vol_name))) { return EROFS; } @@ -868,7 +874,7 @@ vfs_fsadd(struct vfs_fsentry *vfe, vfstable_t *handle) int i, j; int(***opv_desc_vector_p)(void *); int(**opv_desc_vector)(void *); - struct vnodeopv_entry_desc *opve_descp; + const struct vnodeopv_entry_desc *opve_descp; int desccount; int descsize; PFI *descptr; @@ -1541,6 +1547,19 @@ vnode_mountdevvp(vnode_t vp) } #endif +boolean_t +vnode_isonexternalstorage(vnode_t vp) +{ + if (vp) { + if (vp->v_mount) { + if (vp->v_mount->mnt_ioflags & MNT_IOFLAGS_PERIPHERAL_DRIVE) { + return TRUE; + } + } + } + return FALSE; +} + mount_t vnode_mountedhere(vnode_t vp) { @@ -2436,6 +2455,8 @@ vnode_getattr(vnode_t vp, struct vnode_attr *vap, vfs_context_t ctx) VATTR_SET_ACTIVE(vap, va_total_alloc); } + vap->va_vaflags &= ~VA_USEFSID; + error = VNOP_GETATTR(vp, vap, ctx); if (error) { KAUTH_DEBUG("ERROR - returning %d", error); @@ -2476,7 +2497,7 @@ vnode_getattr(vnode_t vp, struct vnode_attr *vap, vfs_context_t ctx) error = ENOMEM; goto out; } - bcopy(&fsec->fsec_acl, facl, KAUTH_ACL_COPYSIZE(&fsec->fsec_acl)); + __nochk_bcopy(&fsec->fsec_acl, facl, KAUTH_ACL_COPYSIZE(&fsec->fsec_acl)); VATTR_RETURN(vap, va_acl, facl); } } @@ -2627,9 +2648,14 @@ vnode_getattr(vnode_t vp, struct vnode_attr *vap, vfs_context_t ctx) /* * The fsid can be obtained from the mountpoint directly. */ - VATTR_RETURN(vap, va_fsid, vp->v_mount->mnt_vfsstat.f_fsid.val[0]); + if (VATTR_IS_ACTIVE(vap, va_fsid) && + (!VATTR_IS_SUPPORTED(vap, va_fsid) || + vap->va_vaflags & VA_REALFSID || !(vap->va_vaflags & VA_USEFSID))) { + VATTR_RETURN(vap, va_fsid, vp->v_mount->mnt_vfsstat.f_fsid.val[0]); + } out: + vap->va_vaflags &= ~VA_USEFSID; return error; } @@ -3812,6 +3838,39 @@ VNOP_REVOKE(vnode_t vp, int flags, vfs_context_t ctx) } +#if 0 +/* +*# +*# mmap_check - vp U U U +*# +*/ +struct vnop_mmap_check_args { + struct vnodeop_desc *a_desc; + vnode_t a_vp; + int a_flags; + vfs_context_t a_context; +}; +#endif /* 0 */ +errno_t +VNOP_MMAP_CHECK(vnode_t vp, int flags, vfs_context_t ctx) +{ + int _err; + struct vnop_mmap_check_args a; + + a.a_desc = &vnop_mmap_check_desc; + a.a_vp = vp; + a.a_flags = flags; + a.a_context = ctx; + + _err = (*vp->v_op[vnop_mmap_check_desc.vdesc_offset])(&a); + if (_err == ENOTSUP) { + _err = 0; + } + DTRACE_FSINFO(mmap_check, vnode_t, vp); + + return _err; +} + #if 0 /* *# @@ -4109,9 +4168,8 @@ vn_rename(struct vnode *fdvp, struct vnode **fvpp, struct componentname *fcnp, s } else { xfromname = &smallname1[0]; } - strlcpy(xfromname, "._", min(sizeof smallname1, len)); - strncat(xfromname, fcnp->cn_nameptr, fcnp->cn_namelen); - xfromname[len - 1] = '\0'; + strlcpy(xfromname, "._", len); + strlcat(xfromname, fcnp->cn_nameptr, len); /* Get destination attribute file name. */ len = tcnp->cn_namelen + 3; @@ -4120,9 +4178,8 @@ vn_rename(struct vnode *fdvp, struct vnode **fvpp, struct componentname *fcnp, s } else { xtoname = &smallname2[0]; } - strlcpy(xtoname, "._", min(sizeof smallname2, len)); - strncat(xtoname, tcnp->cn_nameptr, tcnp->cn_namelen); - xtoname[len - 1] = '\0'; + strlcpy(xtoname, "._", len); + strlcat(xtoname, tcnp->cn_nameptr, len); /* * Look up source attribute file, keep reference on it if exists. @@ -4207,6 +4264,9 @@ vn_rename(struct vnode *fdvp, struct vnode **fvpp, struct componentname *fcnp, s #if CONFIG_MACF if (_err == 0) { mac_vnode_notify_rename(ctx, *fvpp, tdvp, tcnp); + if (flags & VFS_RENAME_SWAP) { + mac_vnode_notify_rename(ctx, *tvpp, fdvp, fcnp); + } } #endif diff --git a/bsd/vfs/vfs_attrlist.c b/bsd/vfs/vfs_attrlist.c index 5453d20c7..c344bef00 100644 --- a/bsd/vfs/vfs_attrlist.c +++ b/bsd/vfs/vfs_attrlist.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 1995-2018 Apple Inc. All rights reserved. + * Copyright (c) 1995-2019 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -285,10 +285,10 @@ attrlist_pack_string(struct _attrlist_buf *ab, const char *source, ssize_t count #define ATTR_PACK_TIME(b, v, is64) \ do { \ if (is64) { \ - struct user64_timespec us = {v.tv_sec, v.tv_nsec}; \ + struct user64_timespec us = {.tv_sec = v.tv_sec, .tv_nsec = v.tv_nsec}; \ ATTR_PACK(&b, us); \ } else { \ - struct user32_timespec us = {v.tv_sec, v.tv_nsec}; \ + struct user32_timespec us = {.tv_sec = v.tv_sec, .tv_nsec = v.tv_nsec}; \ ATTR_PACK(&b, us); \ } \ } while(0) @@ -304,64 +304,64 @@ struct getvolattrlist_attrtab { ssize_t size; }; static struct getvolattrlist_attrtab getvolattrlist_common_tab[] = { - {ATTR_CMN_NAME, 0, sizeof(struct attrreference)}, - {ATTR_CMN_DEVID, 0, sizeof(dev_t)}, - {ATTR_CMN_FSID, 0, sizeof(fsid_t)}, - {ATTR_CMN_OBJTYPE, 0, sizeof(fsobj_type_t)}, - {ATTR_CMN_OBJTAG, 0, sizeof(fsobj_tag_t)}, - {ATTR_CMN_OBJID, 0, sizeof(fsobj_id_t)}, - {ATTR_CMN_OBJPERMANENTID, 0, sizeof(fsobj_id_t)}, - {ATTR_CMN_PAROBJID, 0, sizeof(fsobj_id_t)}, - {ATTR_CMN_SCRIPT, 0, sizeof(text_encoding_t)}, - {ATTR_CMN_CRTIME, VFSATTR_BIT(f_create_time), ATTR_TIME_SIZE}, - {ATTR_CMN_MODTIME, VFSATTR_BIT(f_modify_time), ATTR_TIME_SIZE}, - {ATTR_CMN_CHGTIME, VFSATTR_BIT(f_modify_time), ATTR_TIME_SIZE}, - {ATTR_CMN_ACCTIME, VFSATTR_BIT(f_access_time), ATTR_TIME_SIZE}, - {ATTR_CMN_BKUPTIME, VFSATTR_BIT(f_backup_time), ATTR_TIME_SIZE}, - {ATTR_CMN_FNDRINFO, 0, 32}, - {ATTR_CMN_OWNERID, 0, sizeof(uid_t)}, - {ATTR_CMN_GRPID, 0, sizeof(gid_t)}, - {ATTR_CMN_ACCESSMASK, 0, sizeof(uint32_t)}, - {ATTR_CMN_FLAGS, 0, sizeof(uint32_t)}, - {ATTR_CMN_USERACCESS, 0, sizeof(uint32_t)}, - {ATTR_CMN_EXTENDED_SECURITY, 0, sizeof(struct attrreference)}, - {ATTR_CMN_UUID, 0, sizeof(guid_t)}, - {ATTR_CMN_GRPUUID, 0, sizeof(guid_t)}, - {ATTR_CMN_FILEID, 0, sizeof(uint64_t)}, - {ATTR_CMN_PARENTID, 0, sizeof(uint64_t)}, - {ATTR_CMN_RETURNED_ATTRS, 0, sizeof(attribute_set_t)}, - {ATTR_CMN_ERROR, 0, sizeof(uint32_t)}, - {0, 0, 0} + {.attr = ATTR_CMN_NAME, .bits = 0, .size = sizeof(struct attrreference)}, + {.attr = ATTR_CMN_DEVID, .bits = 0, .size = sizeof(dev_t)}, + {.attr = ATTR_CMN_FSID, .bits = 0, .size = sizeof(fsid_t)}, + {.attr = ATTR_CMN_OBJTYPE, .bits = 0, .size = sizeof(fsobj_type_t)}, + {.attr = ATTR_CMN_OBJTAG, .bits = 0, .size = sizeof(fsobj_tag_t)}, + {.attr = ATTR_CMN_OBJID, .bits = 0, .size = sizeof(fsobj_id_t)}, + {.attr = ATTR_CMN_OBJPERMANENTID, .bits = 0, .size = sizeof(fsobj_id_t)}, + {.attr = ATTR_CMN_PAROBJID, .bits = 0, .size = sizeof(fsobj_id_t)}, + {.attr = ATTR_CMN_SCRIPT, .bits = 0, .size = sizeof(text_encoding_t)}, + {.attr = ATTR_CMN_CRTIME, .bits = VFSATTR_BIT(f_create_time), .size = ATTR_TIME_SIZE}, + {.attr = ATTR_CMN_MODTIME, .bits = VFSATTR_BIT(f_modify_time), .size = ATTR_TIME_SIZE}, + {.attr = ATTR_CMN_CHGTIME, .bits = VFSATTR_BIT(f_modify_time), .size = ATTR_TIME_SIZE}, + {.attr = ATTR_CMN_ACCTIME, .bits = VFSATTR_BIT(f_access_time), .size = ATTR_TIME_SIZE}, + {.attr = ATTR_CMN_BKUPTIME, .bits = VFSATTR_BIT(f_backup_time), .size = ATTR_TIME_SIZE}, + {.attr = ATTR_CMN_FNDRINFO, .bits = 0, .size = 32}, + {.attr = ATTR_CMN_OWNERID, .bits = 0, .size = sizeof(uid_t)}, + {.attr = ATTR_CMN_GRPID, .bits = 0, .size = sizeof(gid_t)}, + {.attr = ATTR_CMN_ACCESSMASK, .bits = 0, .size = sizeof(uint32_t)}, + {.attr = ATTR_CMN_FLAGS, .bits = 0, .size = sizeof(uint32_t)}, + {.attr = ATTR_CMN_USERACCESS, .bits = 0, .size = sizeof(uint32_t)}, + {.attr = ATTR_CMN_EXTENDED_SECURITY, .bits = 0, .size = sizeof(struct attrreference)}, + {.attr = ATTR_CMN_UUID, .bits = 0, .size = sizeof(guid_t)}, + {.attr = ATTR_CMN_GRPUUID, .bits = 0, .size = sizeof(guid_t)}, + {.attr = ATTR_CMN_FILEID, .bits = 0, .size = sizeof(uint64_t)}, + {.attr = ATTR_CMN_PARENTID, .bits = 0, .size = sizeof(uint64_t)}, + {.attr = ATTR_CMN_RETURNED_ATTRS, .bits = 0, .size = sizeof(attribute_set_t)}, + {.attr = ATTR_CMN_ERROR, .bits = 0, .size = sizeof(uint32_t)}, + {.attr = 0, .bits = 0, .size = 0} }; #define ATTR_CMN_VOL_INVALID \ (ATTR_CMN_EXTENDED_SECURITY | ATTR_CMN_UUID | ATTR_CMN_GRPUUID | \ ATTR_CMN_FILEID | ATTR_CMN_PARENTID) static struct getvolattrlist_attrtab getvolattrlist_vol_tab[] = { - {ATTR_VOL_FSTYPE, 0, sizeof(uint32_t)}, - {ATTR_VOL_SIGNATURE, VFSATTR_BIT(f_signature), sizeof(uint32_t)}, - {ATTR_VOL_SIZE, VFSATTR_BIT(f_blocks) | VFSATTR_BIT(f_bsize), sizeof(off_t)}, - {ATTR_VOL_SPACEFREE, VFSATTR_BIT(f_bfree) | VFSATTR_BIT(f_bsize), sizeof(off_t)}, - {ATTR_VOL_SPACEAVAIL, VFSATTR_BIT(f_bavail) | VFSATTR_BIT(f_bsize), sizeof(off_t)}, - {ATTR_VOL_MINALLOCATION, VFSATTR_BIT(f_bsize), sizeof(off_t)}, - {ATTR_VOL_ALLOCATIONCLUMP, VFSATTR_BIT(f_bsize), sizeof(off_t)}, - {ATTR_VOL_IOBLOCKSIZE, VFSATTR_BIT(f_iosize), sizeof(uint32_t)}, - {ATTR_VOL_OBJCOUNT, VFSATTR_BIT(f_objcount), sizeof(uint32_t)}, - {ATTR_VOL_FILECOUNT, VFSATTR_BIT(f_filecount), sizeof(uint32_t)}, - {ATTR_VOL_DIRCOUNT, VFSATTR_BIT(f_dircount), sizeof(uint32_t)}, - {ATTR_VOL_MAXOBJCOUNT, VFSATTR_BIT(f_maxobjcount), sizeof(uint32_t)}, - {ATTR_VOL_MOUNTPOINT, 0, sizeof(struct attrreference)}, - {ATTR_VOL_NAME, VFSATTR_BIT(f_vol_name), sizeof(struct attrreference)}, - {ATTR_VOL_MOUNTFLAGS, 0, sizeof(uint32_t)}, - {ATTR_VOL_MOUNTEDDEVICE, 0, sizeof(struct attrreference)}, - {ATTR_VOL_ENCODINGSUSED, 0, sizeof(uint64_t)}, - {ATTR_VOL_CAPABILITIES, VFSATTR_BIT(f_capabilities), sizeof(vol_capabilities_attr_t)}, - {ATTR_VOL_UUID, VFSATTR_BIT(f_uuid), sizeof(uuid_t)}, - {ATTR_VOL_QUOTA_SIZE, VFSATTR_BIT(f_quota) | VFSATTR_BIT(f_bsize), sizeof(off_t)}, - {ATTR_VOL_RESERVED_SIZE, VFSATTR_BIT(f_reserved) | VFSATTR_BIT(f_bsize), sizeof(off_t)}, - {ATTR_VOL_ATTRIBUTES, VFSATTR_BIT(f_attributes), sizeof(vol_attributes_attr_t)}, - {ATTR_VOL_INFO, 0, 0}, - {0, 0, 0} + {.attr = ATTR_VOL_FSTYPE, .bits = 0, .size = sizeof(uint32_t)}, + {.attr = ATTR_VOL_SIGNATURE, .bits = VFSATTR_BIT(f_signature), .size = sizeof(uint32_t)}, + {.attr = ATTR_VOL_SIZE, .bits = VFSATTR_BIT(f_blocks) | VFSATTR_BIT(f_bsize), .size = sizeof(off_t)}, + {.attr = ATTR_VOL_SPACEFREE, .bits = VFSATTR_BIT(f_bfree) | VFSATTR_BIT(f_bsize), .size = sizeof(off_t)}, + {.attr = ATTR_VOL_SPACEAVAIL, .bits = VFSATTR_BIT(f_bavail) | VFSATTR_BIT(f_bsize), .size = sizeof(off_t)}, + {.attr = ATTR_VOL_MINALLOCATION, .bits = VFSATTR_BIT(f_bsize), .size = sizeof(off_t)}, + {.attr = ATTR_VOL_ALLOCATIONCLUMP, .bits = VFSATTR_BIT(f_bsize), .size = sizeof(off_t)}, + {.attr = ATTR_VOL_IOBLOCKSIZE, .bits = VFSATTR_BIT(f_iosize), .size = sizeof(uint32_t)}, + {.attr = ATTR_VOL_OBJCOUNT, .bits = VFSATTR_BIT(f_objcount), .size = sizeof(uint32_t)}, + {.attr = ATTR_VOL_FILECOUNT, .bits = VFSATTR_BIT(f_filecount), .size = sizeof(uint32_t)}, + {.attr = ATTR_VOL_DIRCOUNT, .bits = VFSATTR_BIT(f_dircount), .size = sizeof(uint32_t)}, + {.attr = ATTR_VOL_MAXOBJCOUNT, .bits = VFSATTR_BIT(f_maxobjcount), .size = sizeof(uint32_t)}, + {.attr = ATTR_VOL_MOUNTPOINT, .bits = 0, .size = sizeof(struct attrreference)}, + {.attr = ATTR_VOL_NAME, .bits = VFSATTR_BIT(f_vol_name), .size = sizeof(struct attrreference)}, + {.attr = ATTR_VOL_MOUNTFLAGS, .bits = 0, .size = sizeof(uint32_t)}, + {.attr = ATTR_VOL_MOUNTEDDEVICE, .bits = 0, .size = sizeof(struct attrreference)}, + {.attr = ATTR_VOL_ENCODINGSUSED, .bits = 0, .size = sizeof(uint64_t)}, + {.attr = ATTR_VOL_CAPABILITIES, .bits = VFSATTR_BIT(f_capabilities), .size = sizeof(vol_capabilities_attr_t)}, + {.attr = ATTR_VOL_UUID, .bits = VFSATTR_BIT(f_uuid), .size = sizeof(uuid_t)}, + {.attr = ATTR_VOL_QUOTA_SIZE, .bits = VFSATTR_BIT(f_quota) | VFSATTR_BIT(f_bsize), .size = sizeof(off_t)}, + {.attr = ATTR_VOL_RESERVED_SIZE, .bits = VFSATTR_BIT(f_reserved) | VFSATTR_BIT(f_bsize), .size = sizeof(off_t)}, + {.attr = ATTR_VOL_ATTRIBUTES, .bits = VFSATTR_BIT(f_attributes), .size = sizeof(vol_attributes_attr_t)}, + {.attr = ATTR_VOL_INFO, .bits = 0, .size = 0}, + {.attr = 0, .bits = 0, .size = 0} }; static int @@ -479,69 +479,73 @@ struct getattrlist_attrtab { * information, and we will synthesize it at the VFS level. */ static struct getattrlist_attrtab getattrlist_common_tab[] = { - {ATTR_CMN_NAME, VATTR_BIT(va_name), sizeof(struct attrreference), KAUTH_VNODE_READ_ATTRIBUTES}, - {ATTR_CMN_DEVID, 0, sizeof(dev_t), KAUTH_VNODE_READ_ATTRIBUTES}, - {ATTR_CMN_FSID, 0, sizeof(fsid_t), KAUTH_VNODE_READ_ATTRIBUTES}, - {ATTR_CMN_OBJTYPE, 0, sizeof(fsobj_type_t), KAUTH_VNODE_READ_ATTRIBUTES}, - {ATTR_CMN_OBJTAG, 0, sizeof(fsobj_tag_t), KAUTH_VNODE_READ_ATTRIBUTES}, - {ATTR_CMN_OBJID, VATTR_BIT(va_fileid) | VATTR_BIT(va_linkid), sizeof(fsobj_id_t), KAUTH_VNODE_READ_ATTRIBUTES}, - {ATTR_CMN_OBJPERMANENTID, VATTR_BIT(va_fileid) | VATTR_BIT(va_linkid), sizeof(fsobj_id_t), KAUTH_VNODE_READ_ATTRIBUTES}, - {ATTR_CMN_PAROBJID, VATTR_BIT(va_parentid), sizeof(fsobj_id_t), KAUTH_VNODE_READ_ATTRIBUTES}, - {ATTR_CMN_SCRIPT, VATTR_BIT(va_encoding), sizeof(text_encoding_t), KAUTH_VNODE_READ_ATTRIBUTES}, - {ATTR_CMN_CRTIME, VATTR_BIT(va_create_time), ATTR_TIME_SIZE, KAUTH_VNODE_READ_ATTRIBUTES}, - {ATTR_CMN_MODTIME, VATTR_BIT(va_modify_time), ATTR_TIME_SIZE, KAUTH_VNODE_READ_ATTRIBUTES}, - {ATTR_CMN_CHGTIME, VATTR_BIT(va_change_time), ATTR_TIME_SIZE, KAUTH_VNODE_READ_ATTRIBUTES}, - {ATTR_CMN_ACCTIME, VATTR_BIT(va_access_time), ATTR_TIME_SIZE, KAUTH_VNODE_READ_ATTRIBUTES}, - {ATTR_CMN_BKUPTIME, VATTR_BIT(va_backup_time), ATTR_TIME_SIZE, KAUTH_VNODE_READ_ATTRIBUTES}, - {ATTR_CMN_FNDRINFO, 0, 32, KAUTH_VNODE_READ_ATTRIBUTES}, - {ATTR_CMN_OWNERID, VATTR_BIT(va_uid), sizeof(uid_t), KAUTH_VNODE_READ_ATTRIBUTES}, - {ATTR_CMN_GRPID, VATTR_BIT(va_gid), sizeof(gid_t), KAUTH_VNODE_READ_ATTRIBUTES}, - {ATTR_CMN_ACCESSMASK, VATTR_BIT(va_mode), sizeof(uint32_t), KAUTH_VNODE_READ_ATTRIBUTES}, - {ATTR_CMN_FLAGS, VATTR_BIT(va_flags), sizeof(uint32_t), KAUTH_VNODE_READ_ATTRIBUTES}, - {ATTR_CMN_GEN_COUNT, VATTR_BIT(va_write_gencount), sizeof(uint32_t), KAUTH_VNODE_READ_ATTRIBUTES}, - {ATTR_CMN_DOCUMENT_ID, VATTR_BIT(va_document_id), sizeof(uint32_t), KAUTH_VNODE_READ_ATTRIBUTES}, - {ATTR_CMN_USERACCESS, 0, sizeof(uint32_t), KAUTH_VNODE_READ_ATTRIBUTES}, - {ATTR_CMN_EXTENDED_SECURITY, VATTR_BIT(va_acl), sizeof(struct attrreference), KAUTH_VNODE_READ_SECURITY}, - {ATTR_CMN_UUID, VATTR_BIT(va_uuuid), sizeof(guid_t), KAUTH_VNODE_READ_ATTRIBUTES}, - {ATTR_CMN_GRPUUID, VATTR_BIT(va_guuid), sizeof(guid_t), KAUTH_VNODE_READ_ATTRIBUTES}, - {ATTR_CMN_FILEID, VATTR_BIT(va_fileid), sizeof(uint64_t), KAUTH_VNODE_READ_ATTRIBUTES}, - {ATTR_CMN_PARENTID, VATTR_BIT(va_parentid), sizeof(uint64_t), KAUTH_VNODE_READ_ATTRIBUTES}, - {ATTR_CMN_FULLPATH, 0, sizeof(struct attrreference), KAUTH_VNODE_READ_ATTRIBUTES}, - {ATTR_CMN_ADDEDTIME, VATTR_BIT(va_addedtime), ATTR_TIME_SIZE, KAUTH_VNODE_READ_ATTRIBUTES}, - {ATTR_CMN_RETURNED_ATTRS, 0, sizeof(attribute_set_t), 0}, - {ATTR_CMN_ERROR, 0, sizeof(uint32_t), 0}, - {ATTR_CMN_DATA_PROTECT_FLAGS, VATTR_BIT(va_dataprotect_class), sizeof(uint32_t), KAUTH_VNODE_READ_ATTRIBUTES}, - {0, 0, 0, 0} + {.attr = ATTR_CMN_NAME, .bits = VATTR_BIT(va_name), .size = sizeof(struct attrreference), .action = KAUTH_VNODE_READ_ATTRIBUTES}, + {.attr = ATTR_CMN_DEVID, .bits = VATTR_BIT(va_fsid), .size = sizeof(dev_t), .action = KAUTH_VNODE_READ_ATTRIBUTES}, + {.attr = ATTR_CMN_FSID, .bits = VATTR_BIT(va_fsid64), .size = sizeof(fsid_t), .action = KAUTH_VNODE_READ_ATTRIBUTES}, + {.attr = ATTR_CMN_OBJTYPE, .bits = 0, .size = sizeof(fsobj_type_t), .action = KAUTH_VNODE_READ_ATTRIBUTES}, + {.attr = ATTR_CMN_OBJTAG, .bits = 0, .size = sizeof(fsobj_tag_t), .action = KAUTH_VNODE_READ_ATTRIBUTES}, + {.attr = ATTR_CMN_OBJID, .bits = VATTR_BIT(va_fileid) | VATTR_BIT(va_linkid), .size = sizeof(fsobj_id_t), .action = KAUTH_VNODE_READ_ATTRIBUTES}, + {.attr = ATTR_CMN_OBJPERMANENTID, .bits = VATTR_BIT(va_fileid) | VATTR_BIT(va_linkid), .size = sizeof(fsobj_id_t), .action = KAUTH_VNODE_READ_ATTRIBUTES}, + {.attr = ATTR_CMN_PAROBJID, .bits = VATTR_BIT(va_parentid), .size = sizeof(fsobj_id_t), .action = KAUTH_VNODE_READ_ATTRIBUTES}, + {.attr = ATTR_CMN_SCRIPT, .bits = VATTR_BIT(va_encoding), .size = sizeof(text_encoding_t), .action = KAUTH_VNODE_READ_ATTRIBUTES}, + {.attr = ATTR_CMN_CRTIME, .bits = VATTR_BIT(va_create_time), .size = ATTR_TIME_SIZE, .action = KAUTH_VNODE_READ_ATTRIBUTES}, + {.attr = ATTR_CMN_MODTIME, .bits = VATTR_BIT(va_modify_time), .size = ATTR_TIME_SIZE, .action = KAUTH_VNODE_READ_ATTRIBUTES}, + {.attr = ATTR_CMN_CHGTIME, .bits = VATTR_BIT(va_change_time), .size = ATTR_TIME_SIZE, .action = KAUTH_VNODE_READ_ATTRIBUTES}, + {.attr = ATTR_CMN_ACCTIME, .bits = VATTR_BIT(va_access_time), .size = ATTR_TIME_SIZE, .action = KAUTH_VNODE_READ_ATTRIBUTES}, + {.attr = ATTR_CMN_BKUPTIME, .bits = VATTR_BIT(va_backup_time), .size = ATTR_TIME_SIZE, .action = KAUTH_VNODE_READ_ATTRIBUTES}, + {.attr = ATTR_CMN_FNDRINFO, .bits = 0, .size = 32, .action = KAUTH_VNODE_READ_ATTRIBUTES}, + {.attr = ATTR_CMN_OWNERID, .bits = VATTR_BIT(va_uid), .size = sizeof(uid_t), .action = KAUTH_VNODE_READ_ATTRIBUTES}, + {.attr = ATTR_CMN_GRPID, .bits = VATTR_BIT(va_gid), .size = sizeof(gid_t), .action = KAUTH_VNODE_READ_ATTRIBUTES}, + {.attr = ATTR_CMN_ACCESSMASK, .bits = VATTR_BIT(va_mode), .size = sizeof(uint32_t), .action = KAUTH_VNODE_READ_ATTRIBUTES}, + {.attr = ATTR_CMN_FLAGS, .bits = VATTR_BIT(va_flags), .size = sizeof(uint32_t), .action = KAUTH_VNODE_READ_ATTRIBUTES}, + {.attr = ATTR_CMN_GEN_COUNT, .bits = VATTR_BIT(va_write_gencount), .size = sizeof(uint32_t), .action = KAUTH_VNODE_READ_ATTRIBUTES}, + {.attr = ATTR_CMN_DOCUMENT_ID, .bits = VATTR_BIT(va_document_id), .size = sizeof(uint32_t), .action = KAUTH_VNODE_READ_ATTRIBUTES}, + {.attr = ATTR_CMN_USERACCESS, .bits = 0, .size = sizeof(uint32_t), .action = 0}, + {.attr = ATTR_CMN_EXTENDED_SECURITY, .bits = VATTR_BIT(va_acl), .size = sizeof(struct attrreference), .action = KAUTH_VNODE_READ_SECURITY}, + {.attr = ATTR_CMN_UUID, .bits = VATTR_BIT(va_uuuid), .size = sizeof(guid_t), .action = KAUTH_VNODE_READ_ATTRIBUTES}, + {.attr = ATTR_CMN_GRPUUID, .bits = VATTR_BIT(va_guuid), .size = sizeof(guid_t), .action = KAUTH_VNODE_READ_ATTRIBUTES}, + {.attr = ATTR_CMN_FILEID, .bits = VATTR_BIT(va_fileid), .size = sizeof(uint64_t), .action = KAUTH_VNODE_READ_ATTRIBUTES}, + {.attr = ATTR_CMN_PARENTID, .bits = VATTR_BIT(va_parentid), .size = sizeof(uint64_t), .action = KAUTH_VNODE_READ_ATTRIBUTES}, + {.attr = ATTR_CMN_FULLPATH, .bits = 0, .size = sizeof(struct attrreference), .action = KAUTH_VNODE_READ_ATTRIBUTES}, + {.attr = ATTR_CMN_ADDEDTIME, .bits = VATTR_BIT(va_addedtime), .size = ATTR_TIME_SIZE, .action = KAUTH_VNODE_READ_ATTRIBUTES}, + {.attr = ATTR_CMN_RETURNED_ATTRS, .bits = 0, .size = sizeof(attribute_set_t), .action = 0}, + {.attr = ATTR_CMN_ERROR, .bits = 0, .size = sizeof(uint32_t), .action = 0}, + {.attr = ATTR_CMN_DATA_PROTECT_FLAGS, .bits = VATTR_BIT(va_dataprotect_class), .size = sizeof(uint32_t), .action = KAUTH_VNODE_READ_ATTRIBUTES}, + {.attr = 0, .bits = 0, .size = 0, .action = 0} }; static struct getattrlist_attrtab getattrlist_dir_tab[] = { - {ATTR_DIR_LINKCOUNT, VATTR_BIT(va_dirlinkcount), sizeof(uint32_t), KAUTH_VNODE_READ_ATTRIBUTES}, - {ATTR_DIR_ENTRYCOUNT, VATTR_BIT(va_nchildren), sizeof(uint32_t), KAUTH_VNODE_READ_ATTRIBUTES}, - {ATTR_DIR_MOUNTSTATUS, 0, sizeof(uint32_t), KAUTH_VNODE_READ_ATTRIBUTES}, - {ATTR_DIR_ALLOCSIZE, VATTR_BIT(va_total_alloc) | VATTR_BIT(va_total_size), sizeof(off_t), KAUTH_VNODE_READ_ATTRIBUTES}, - {ATTR_DIR_IOBLOCKSIZE, VATTR_BIT(va_iosize), sizeof(uint32_t), KAUTH_VNODE_READ_ATTRIBUTES}, - {ATTR_DIR_DATALENGTH, VATTR_BIT(va_total_size) | VATTR_BIT(va_data_size), sizeof(off_t), KAUTH_VNODE_READ_ATTRIBUTES}, - {0, 0, 0, 0} + {.attr = ATTR_DIR_LINKCOUNT, .bits = VATTR_BIT(va_dirlinkcount), .size = sizeof(uint32_t), .action = KAUTH_VNODE_READ_ATTRIBUTES}, + {.attr = ATTR_DIR_ENTRYCOUNT, .bits = VATTR_BIT(va_nchildren), .size = sizeof(uint32_t), .action = KAUTH_VNODE_READ_ATTRIBUTES}, + {.attr = ATTR_DIR_MOUNTSTATUS, .bits = 0, .size = sizeof(uint32_t), .action = KAUTH_VNODE_READ_ATTRIBUTES}, + {.attr = ATTR_DIR_ALLOCSIZE, .bits = VATTR_BIT(va_total_alloc) | VATTR_BIT(va_total_size), .size = sizeof(off_t), .action = KAUTH_VNODE_READ_ATTRIBUTES}, + {.attr = ATTR_DIR_IOBLOCKSIZE, .bits = VATTR_BIT(va_iosize), .size = sizeof(uint32_t), .action = KAUTH_VNODE_READ_ATTRIBUTES}, + {.attr = ATTR_DIR_DATALENGTH, .bits = VATTR_BIT(va_total_size) | VATTR_BIT(va_data_size), .size = sizeof(off_t), .action = KAUTH_VNODE_READ_ATTRIBUTES}, + {.attr = 0, .bits = 0, .size = 0, .action = 0} }; static struct getattrlist_attrtab getattrlist_file_tab[] = { - {ATTR_FILE_LINKCOUNT, VATTR_BIT(va_nlink), sizeof(uint32_t), KAUTH_VNODE_READ_ATTRIBUTES}, - {ATTR_FILE_TOTALSIZE, VATTR_BIT(va_total_size), sizeof(off_t), KAUTH_VNODE_READ_ATTRIBUTES}, - {ATTR_FILE_ALLOCSIZE, VATTR_BIT(va_total_alloc) | VATTR_BIT(va_total_size), sizeof(off_t), KAUTH_VNODE_READ_ATTRIBUTES}, - {ATTR_FILE_IOBLOCKSIZE, VATTR_BIT(va_iosize), sizeof(uint32_t), KAUTH_VNODE_READ_ATTRIBUTES}, - {ATTR_FILE_DEVTYPE, VATTR_BIT(va_rdev), sizeof(dev_t), KAUTH_VNODE_READ_ATTRIBUTES}, - {ATTR_FILE_DATALENGTH, VATTR_BIT(va_total_size) | VATTR_BIT(va_data_size), sizeof(off_t), KAUTH_VNODE_READ_ATTRIBUTES}, - {ATTR_FILE_DATAALLOCSIZE, VATTR_BIT(va_total_alloc) | VATTR_BIT(va_data_alloc), sizeof(off_t), KAUTH_VNODE_READ_ATTRIBUTES}, - {ATTR_FILE_RSRCLENGTH, 0, sizeof(off_t), KAUTH_VNODE_READ_ATTRIBUTES}, - {ATTR_FILE_RSRCALLOCSIZE, 0, sizeof(off_t), KAUTH_VNODE_READ_ATTRIBUTES}, - {0, 0, 0, 0} + {.attr = ATTR_FILE_LINKCOUNT, .bits = VATTR_BIT(va_nlink), .size = sizeof(uint32_t), .action = KAUTH_VNODE_READ_ATTRIBUTES}, + {.attr = ATTR_FILE_TOTALSIZE, .bits = VATTR_BIT(va_total_size), .size = sizeof(off_t), .action = KAUTH_VNODE_READ_ATTRIBUTES}, + {.attr = ATTR_FILE_ALLOCSIZE, .bits = VATTR_BIT(va_total_alloc) | VATTR_BIT(va_total_size), .size = sizeof(off_t), .action = KAUTH_VNODE_READ_ATTRIBUTES}, + {.attr = ATTR_FILE_IOBLOCKSIZE, .bits = VATTR_BIT(va_iosize), .size = sizeof(uint32_t), .action = KAUTH_VNODE_READ_ATTRIBUTES}, + {.attr = ATTR_FILE_CLUMPSIZE, .bits = 0, .size = sizeof(uint32_t), .action = KAUTH_VNODE_READ_ATTRIBUTES}, + {.attr = ATTR_FILE_DEVTYPE, .bits = VATTR_BIT(va_rdev), .size = sizeof(dev_t), .action = KAUTH_VNODE_READ_ATTRIBUTES}, + {.attr = ATTR_FILE_DATALENGTH, .bits = VATTR_BIT(va_total_size) | VATTR_BIT(va_data_size), .size = sizeof(off_t), .action = KAUTH_VNODE_READ_ATTRIBUTES}, + {.attr = ATTR_FILE_DATAALLOCSIZE, .bits = VATTR_BIT(va_total_alloc) | VATTR_BIT(va_data_alloc), .size = sizeof(off_t), .action = KAUTH_VNODE_READ_ATTRIBUTES}, + {.attr = ATTR_FILE_RSRCLENGTH, .bits = 0, .size = sizeof(off_t), .action = KAUTH_VNODE_READ_ATTRIBUTES}, + {.attr = ATTR_FILE_RSRCALLOCSIZE, .bits = 0, .size = sizeof(off_t), .action = KAUTH_VNODE_READ_ATTRIBUTES}, + {.attr = 0, .bits = 0, .size = 0, .action = 0} }; //for forkattr bits repurposed as new common attributes static struct getattrlist_attrtab getattrlist_common_extended_tab[] = { - {ATTR_CMNEXT_RELPATH, 0, sizeof(struct attrreference), KAUTH_VNODE_READ_ATTRIBUTES}, - {ATTR_CMNEXT_PRIVATESIZE, VATTR_BIT(va_private_size), sizeof(off_t), KAUTH_VNODE_READ_ATTRIBUTES}, - {ATTR_CMNEXT_LINKID, VATTR_BIT(va_fileid) | VATTR_BIT(va_linkid), sizeof(uint64_t), KAUTH_VNODE_READ_ATTRIBUTES}, - {0, 0, 0, 0} + {.attr = ATTR_CMNEXT_RELPATH, .bits = 0, .size = sizeof(struct attrreference), .action = KAUTH_VNODE_READ_ATTRIBUTES}, + {.attr = ATTR_CMNEXT_PRIVATESIZE, .bits = VATTR_BIT(va_private_size), .size = sizeof(off_t), .action = KAUTH_VNODE_READ_ATTRIBUTES}, + {.attr = ATTR_CMNEXT_LINKID, .bits = VATTR_BIT(va_fileid) | VATTR_BIT(va_linkid), .size = sizeof(uint64_t), .action = KAUTH_VNODE_READ_ATTRIBUTES}, + {.attr = ATTR_CMNEXT_NOFIRMLINKPATH, .bits = 0, .size = sizeof(struct attrreference), .action = KAUTH_VNODE_READ_ATTRIBUTES}, + {.attr = ATTR_CMNEXT_REALDEVID, .bits = VATTR_BIT(va_devid), .size = sizeof(uint32_t), .action = KAUTH_VNODE_READ_ATTRIBUTES}, + {.attr = ATTR_CMNEXT_REALFSID, .bits = VATTR_BIT(va_fsid64), .size = sizeof(fsid_t), .action = KAUTH_VNODE_READ_ATTRIBUTES}, + {.attr = 0, .bits = 0, .size = 0, .action = 0} }; /* @@ -554,25 +558,25 @@ static struct getattrlist_attrtab getattrlist_common_extended_tab[] = { * accounted from the common, file and directory tables. */ static struct getattrlist_attrtab getattrlistbulk_common_tab[] = { - {ATTR_CMN_DEVID, VATTR_BIT(va_devid), 0, KAUTH_VNODE_READ_ATTRIBUTES}, - {ATTR_CMN_FSID, VATTR_BIT(va_fsid64), 0, KAUTH_VNODE_READ_ATTRIBUTES}, - {ATTR_CMN_OBJTYPE, VATTR_BIT(va_objtype), 0, KAUTH_VNODE_READ_ATTRIBUTES}, - {ATTR_CMN_OBJTAG, VATTR_BIT(va_objtag), 0, KAUTH_VNODE_READ_ATTRIBUTES}, - {ATTR_CMN_USERACCESS, VATTR_BIT(va_user_access), 0, KAUTH_VNODE_READ_ATTRIBUTES}, - {ATTR_CMN_FNDRINFO, VATTR_BIT(va_finderinfo), 0, KAUTH_VNODE_READ_ATTRIBUTES}, - {0, 0, 0, 0} + {.attr = ATTR_CMN_DEVID, .bits = VATTR_BIT(va_devid), .size = 0, .action = KAUTH_VNODE_READ_ATTRIBUTES}, + {.attr = ATTR_CMN_FSID, .bits = VATTR_BIT(va_fsid64), .size = 0, .action = KAUTH_VNODE_READ_ATTRIBUTES}, + {.attr = ATTR_CMN_OBJTYPE, .bits = VATTR_BIT(va_objtype), .size = 0, .action = KAUTH_VNODE_READ_ATTRIBUTES}, + {.attr = ATTR_CMN_OBJTAG, .bits = VATTR_BIT(va_objtag), .size = 0, .action = KAUTH_VNODE_READ_ATTRIBUTES}, + {.attr = ATTR_CMN_USERACCESS, .bits = VATTR_BIT(va_user_access), .size = 0, .action = 0}, + {.attr = ATTR_CMN_FNDRINFO, .bits = VATTR_BIT(va_finderinfo), .size = 0, .action = KAUTH_VNODE_READ_ATTRIBUTES}, + {.attr = 0, .bits = 0, .size = 0, .action = 0} }; static struct getattrlist_attrtab getattrlistbulk_file_tab[] = { - {ATTR_FILE_RSRCLENGTH, VATTR_BIT(va_rsrc_length), 0, KAUTH_VNODE_READ_ATTRIBUTES}, - {ATTR_FILE_RSRCALLOCSIZE, VATTR_BIT(va_rsrc_alloc), 0, KAUTH_VNODE_READ_ATTRIBUTES}, - {0, 0, 0, 0} + {.attr = ATTR_FILE_RSRCLENGTH, .bits = VATTR_BIT(va_rsrc_length), .size = 0, .action = KAUTH_VNODE_READ_ATTRIBUTES}, + {.attr = ATTR_FILE_RSRCALLOCSIZE, .bits = VATTR_BIT(va_rsrc_alloc), .size = 0, .action = KAUTH_VNODE_READ_ATTRIBUTES}, + {.attr = 0, .bits = 0, .size = 0, .action = 0} }; static struct getattrlist_attrtab getattrlistbulk_common_extended_tab[] = { /* getattrlist_parsetab() expects > 1 entries */ - {0, 0, 0, 0}, - {0, 0, 0, 0} + {.attr = 0, .bits = 0, .size = 0, .action = 0}, + {.attr = 0, .bits = 0, .size = 0, .action = 0} }; /* @@ -601,7 +605,9 @@ static struct getattrlist_attrtab getattrlistbulk_common_extended_tab[] = { ATTR_CMN_DOCUMENT_ID | ATTR_CMN_GEN_COUNT | \ ATTR_CMN_DATA_PROTECT_FLAGS) -#define VFS_DFLT_ATTR_CMN_EXT (ATTR_CMNEXT_PRIVATESIZE | ATTR_CMNEXT_LINKID) +#define VFS_DFLT_ATTR_CMN_EXT (ATTR_CMNEXT_PRIVATESIZE | ATTR_CMNEXT_LINKID | \ + ATTR_CMNEXT_NOFIRMLINKPATH | ATTR_CMNEXT_REALDEVID | \ + ATTR_CMNEXT_REALFSID) #define VFS_DFLT_ATTR_DIR (ATTR_DIR_LINKCOUNT | ATTR_DIR_MOUNTSTATUS) @@ -718,11 +724,6 @@ getattrlist_setupvattr_all(struct attrlist *alp, struct vnode_attr *vap, (void)getattrlist_parsetab(getattrlistbulk_common_tab, alp->commonattr, vap, fixedsize, NULL, is_64bit, sizeof(getattrlistbulk_common_tab) / sizeof(getattrlistbulk_common_tab[0])); - /* - * turn off va_fsid since we will be using only - * va_fsid64 for ATTR_CMN_FSID. - */ - VATTR_CLEAR_ACTIVE(vap, va_fsid); } } @@ -765,6 +766,8 @@ int vfs_setup_vattr_from_attrlist(struct attrlist *alp, struct vnode_attr *vap, enum vtype obj_vtype, ssize_t *attrs_fixed_sizep, vfs_context_t ctx) { + VATTR_INIT(vap); + // the caller passes us no options, we assume the caller wants the new fork // attr behavior, hence the hardcoded 1 return getattrlist_setupvattr_all(alp, vap, obj_vtype, @@ -925,6 +928,7 @@ getvolattrlist(vfs_context_t ctx, vnode_t vp, struct attrlist *alp, mount_t mnt; int return_valid; int pack_invalid; + vnode_t root_vp = NULL; ab.base = NULL; VATTR_INIT(&va); @@ -948,15 +952,20 @@ getvolattrlist(vfs_context_t ctx, vnode_t vp, struct attrlist *alp, bcopy(&alp->commonattr, &ab.valid, sizeof(attribute_set_t)); } - /* - * For now, the vnode must be the root of its filesystem. - * To relax this, we need to be able to find the root vnode of a filesystem - * from any vnode in the filesystem. - */ + /* If we do not have root vnode, look it up and substitute it in */ if (!vnode_isvroot(vp)) { - error = EINVAL; - VFS_DEBUG(ctx, vp, "ATTRLIST - ERROR: volume attributes requested but not the root of a filesystem"); - goto out; + if (mnt != NULL) { + error = VFS_ROOT(mnt, &root_vp, ctx); + if (error) { + VFS_DEBUG(ctx, vp, "ATTRLIST - ERROR: volume attributes requested on non-root vnode, but got an error getting root."); + goto out; + } + vp = root_vp; + } else { + error = EINVAL; + VFS_DEBUG(ctx, vp, "ATTRLIST - ERROR: volume attributes requested on non-root vnode, but no backpointer to mount."); + goto out; + } } /* @@ -1552,6 +1561,10 @@ out: FREE(ab.base, M_TEMP); } VFS_DEBUG(ctx, vp, "ATTRLIST - returning %d", error); + + if (root_vp != NULL) { + vnode_put(root_vp); + } return error; } @@ -1563,7 +1576,7 @@ out: * are in ad. */ static errno_t -attr_pack_common(vfs_context_t ctx, struct vnode *vp, struct attrlist *alp, +attr_pack_common(vfs_context_t ctx, mount_t mp, vnode_t vp, struct attrlist *alp, struct _attrlist_buf *abp, struct vnode_attr *vap, int proc_is64, const char *cnp, ssize_t cnl, const char *fullpathptr, ssize_t fullpathlen, int return_valid, int pack_invalid, int vtype, @@ -1582,7 +1595,14 @@ attr_pack_common(vfs_context_t ctx, struct vnode *vp, struct attrlist *alp, abp->actual.commonattr |= ATTR_CMN_NAME; } if (alp->commonattr & ATTR_CMN_DEVID) { - if (vp) { + if (mp) { /* caller needs real devid */ + ATTR_PACK4((*abp), + mp->mnt_vfsstat.f_fsid.val[0]); + abp->actual.commonattr |= ATTR_CMN_DEVID; + } else if (VATTR_IS_ACTIVE(vap, va_fsid) && VATTR_IS_SUPPORTED(vap, va_fsid)) { + ATTR_PACK4((*abp), vap->va_fsid); + abp->actual.commonattr |= ATTR_CMN_DEVID; + } else if (vp) { ATTR_PACK4((*abp), vp->v_mount->mnt_vfsstat.f_fsid.val[0]); abp->actual.commonattr |= ATTR_CMN_DEVID; @@ -1594,16 +1614,19 @@ attr_pack_common(vfs_context_t ctx, struct vnode *vp, struct attrlist *alp, } } if (alp->commonattr & ATTR_CMN_FSID) { - if (vp) { + if (mp) { /* caller needs real fsid */ ATTR_PACK8((*abp), - vp->v_mount->mnt_vfsstat.f_fsid); + mp->mnt_vfsstat.f_fsid); abp->actual.commonattr |= ATTR_CMN_FSID; } else if (VATTR_IS_SUPPORTED(vap, va_fsid64)) { ATTR_PACK8((*abp), vap->va_fsid64); abp->actual.commonattr |= ATTR_CMN_FSID; + } else if (vp) { + ATTR_PACK8((*abp), + vp->v_mount->mnt_vfsstat.f_fsid); + abp->actual.commonattr |= ATTR_CMN_FSID; } else if (!return_valid || pack_invalid) { fsid_t fsid = {{0}}; - ATTR_PACK8((*abp), fsid); } } @@ -1938,7 +1961,7 @@ attr_pack_common(vfs_context_t ctx, struct vnode *vp, struct attrlist *alp, ATTR_PACK_TIME((*abp), vap->va_addedtime, proc_is64); abp->actual.commonattr |= ATTR_CMN_ADDEDTIME; } else if (!return_valid || pack_invalid) { - struct timespec zerotime = {0, 0}; + struct timespec zerotime = {.tv_sec = 0, .tv_nsec = 0}; ATTR_PACK_TIME((*abp), zerotime, proc_is64); } @@ -2260,8 +2283,9 @@ out: * are in ad. */ static errno_t -attr_pack_common_extended(struct vnode *vp, struct attrlist *alp, +attr_pack_common_extended(mount_t mp, struct vnode *vp, struct attrlist *alp, struct _attrlist_buf *abp, const char *relpathptr, ssize_t relpathlen, + const char *REALpathptr, ssize_t REALpathlen, struct vnode_attr *vap, int return_valid, int pack_invalid) { if (vp && (alp->forkattr & ATTR_CMNEXT_RELPATH)) { @@ -2292,12 +2316,57 @@ attr_pack_common_extended(struct vnode *vp, struct attrlist *alp, abp->actual.forkattr |= ATTR_CMNEXT_LINKID; } + if (vp && (alp->forkattr & ATTR_CMNEXT_NOFIRMLINKPATH)) { + attrlist_pack_string(abp, REALpathptr, REALpathlen); + abp->actual.forkattr |= ATTR_CMNEXT_NOFIRMLINKPATH; + } + + if (alp->forkattr & ATTR_CMNEXT_REALDEVID) { + if (mp) { + ATTR_PACK4((*abp), + mp->mnt_vfsstat.f_fsid.val[0]); + abp->actual.forkattr |= ATTR_CMNEXT_REALDEVID; + } else if (vp) { + ATTR_PACK4((*abp), + vp->v_mount->mnt_vfsstat.f_fsid.val[0]); + abp->actual.forkattr |= ATTR_CMNEXT_REALDEVID; + } else if (VATTR_IS_SUPPORTED(vap, va_fsid)) { + ATTR_PACK4((*abp), vap->va_fsid); + abp->actual.forkattr |= ATTR_CMN_DEVID; + } else if (!return_valid || pack_invalid) { + ATTR_PACK4((*abp), 0); + } + } + + if (alp->forkattr & ATTR_CMNEXT_REALFSID) { + if (mp) { + ATTR_PACK8((*abp), + mp->mnt_vfsstat.f_fsid); + abp->actual.forkattr |= ATTR_CMNEXT_REALFSID; + } else if (vp) { + ATTR_PACK8((*abp), + vp->v_mount->mnt_vfsstat.f_fsid); + abp->actual.forkattr |= ATTR_CMNEXT_REALFSID; + } else if (VATTR_IS_SUPPORTED(vap, va_fsid64)) { + ATTR_PACK8((*abp), vap->va_fsid64); + abp->actual.forkattr |= ATTR_CMN_FSID; + } else if (!return_valid || pack_invalid) { + fsid_t fsid = {{0}}; + + ATTR_PACK8((*abp), fsid); + } + } + return 0; } static void vattr_get_alt_data(vnode_t vp, struct attrlist *alp, struct vnode_attr *vap, - int return_valid, int is_bulk, vfs_context_t ctx) + int return_valid, int is_bulk, +#if !CONFIG_FIRMLINKS + __unused +#endif + int is_realdev, vfs_context_t ctx) { /* * There are a couple of special cases. @@ -2310,27 +2379,66 @@ vattr_get_alt_data(vnode_t vp, struct attrlist *alp, struct vnode_attr *vap, VATTR_CLEAR_ACTIVE(vap, va_linkid); } + /* + * A filesystem may not support va_fsid64. If it is not available, then we'll + * synthesize it from the mount. + */ + if ((alp->commonattr & ATTR_CMN_FSID) && !VATTR_IS_SUPPORTED(vap, va_fsid64)) { + VATTR_CLEAR_ACTIVE(vap, va_fsid64); + } + + /* Same for fsid */ + if ((alp->commonattr & ATTR_CMN_FSID) && !VATTR_IS_SUPPORTED(vap, va_fsid)) { + VATTR_CLEAR_ACTIVE(vap, va_fsid); + } + + /* We request the fsid64 for the devid */ + if ((alp->commonattr & ATTR_CMN_DEVID) && !VATTR_IS_SUPPORTED(vap, va_fsid)) { + VATTR_CLEAR_ACTIVE(vap, va_fsid); + } + + /* * Many filesystems don't know their parent object id. * If necessary, attempt to derive it from the vnode. */ - if ((alp->commonattr & (ATTR_CMN_PAROBJID | ATTR_CMN_PARENTID)) && - !VATTR_IS_SUPPORTED(vap, va_parentid) && vp && !is_bulk) { + if ((alp->commonattr & (ATTR_CMN_PAROBJID | ATTR_CMN_PARENTID)) && vp) { vnode_t dvp; - if ((dvp = vnode_getparent(vp)) != NULLVP) { +#if CONFIG_FIRMLINKS + /* If this is a firmlink target, we get the fileid of the firmlink parent. */ + if (!is_realdev && (vp->v_flag & VFMLINKTARGET) && ((dvp = vp->v_fmlink) != NULL) && (vnode_get(dvp) == 0)) { struct vnode_attr lva; VATTR_INIT(&lva); - VATTR_WANTED(&lva, va_fileid); + VATTR_WANTED(&lva, va_parentid); + VATTR_WANTED(&lva, va_fsid); if (vnode_getattr(dvp, &lva, ctx) == 0 && - VATTR_IS_SUPPORTED(vap, va_fileid)) { - vap->va_parentid = lva.va_fileid; + VATTR_IS_SUPPORTED(&lva, va_parentid) && + VATTR_IS_SUPPORTED(&lva, va_fsid) && + (lva.va_fsid == (uint32_t)vp->v_mount->mnt_vfsstat.f_fsid.val[0])) { + vap->va_parentid = lva.va_parentid; VATTR_SET_SUPPORTED(vap, va_parentid); } vnode_put(dvp); + } else +#endif /* CONFIG_FIRMLINKS */ + if (!VATTR_IS_SUPPORTED(vap, va_parentid) && !is_bulk) { + if ((dvp = vnode_getparent(vp)) != NULLVP) { + struct vnode_attr lva; + + VATTR_INIT(&lva); + VATTR_WANTED(&lva, va_fileid); + if (vnode_getattr(dvp, &lva, ctx) == 0 && + VATTR_IS_SUPPORTED(vap, va_fileid)) { + vap->va_parentid = lva.va_fileid; + VATTR_SET_SUPPORTED(vap, va_parentid); + } + vnode_put(dvp); + } } } + /* * And we can report datasize/alloc from total. */ @@ -2369,10 +2477,18 @@ vattr_get_alt_data(vnode_t vp, struct attrlist *alp, struct vnode_attr *vap, } } +struct _attrlist_paths { + char *fullpathptr; + ssize_t *fullpathlenp; + char *relpathptr; + ssize_t *relpathlenp; + char *REALpathptr; + ssize_t *REALpathlenp; +}; + static errno_t calc_varsize(vnode_t vp, struct attrlist *alp, struct vnode_attr *vap, - ssize_t *varsizep, char *fullpathptr, ssize_t *fullpathlenp, - char *relpathptr, ssize_t *relpathlenp, const char **vnamep, + ssize_t *varsizep, struct _attrlist_paths *pathsp, const char **vnamep, const char **cnpp, ssize_t *cnlp) { int error = 0; @@ -2426,16 +2542,17 @@ calc_varsize(vnode_t vp, struct attrlist *alp, struct vnode_attr *vap, int err; /* call build_path making sure NOT to use the cache-only behavior */ - err = build_path(vp, fullpathptr, len, &len, 0, vfs_context_current()); + err = build_path(vp, pathsp->fullpathptr, len, &len, 0, vfs_context_current()); if (err) { error = err; goto out; } - *fullpathlenp = 0; - if (fullpathptr) { - *fullpathlenp = strlen(fullpathptr); + if (pathsp->fullpathptr) { + *(pathsp->fullpathlenp) = strlen(pathsp->fullpathptr); + } else { + *(pathsp->fullpathlenp) = 0; } - *varsizep += roundup(((*fullpathlenp) + 1), 4); + *varsizep += roundup(((*(pathsp->fullpathlenp)) + 1), 4); } /* @@ -2446,14 +2563,33 @@ calc_varsize(vnode_t vp, struct attrlist *alp, struct vnode_attr *vap, int err; /* call build_path making sure NOT to use the cache-only behavior */ - err = build_path(vp, relpathptr, MAXPATHLEN, &len, BUILDPATH_VOLUME_RELATIVE, vfs_context_current()); + err = build_path(vp, pathsp->relpathptr, MAXPATHLEN, &len, BUILDPATH_VOLUME_RELATIVE, vfs_context_current()); + if (err) { + error = err; + goto out; + } + + //`len' includes trailing null + *(pathsp->relpathlenp) = len - 1; + *varsizep += roundup(len, 4); + } + + /* + * Compute this vnode's real (firmlink free) path. + */ + if (vp && (alp->forkattr & ATTR_CMNEXT_NOFIRMLINKPATH)) { + int len; + int err; + + /* call build_path making sure NOT to use the cache-only behavior */ + err = build_path(vp, pathsp->REALpathptr, MAXPATHLEN, &len, BUILDPATH_NO_FIRMLINK, vfs_context_current()); if (err) { error = err; goto out; } //`len' includes trailing null - *relpathlenp = len - 1; + *(pathsp->REALpathlenp) = len - 1; *varsizep += roundup(len, 4); } @@ -2482,11 +2618,14 @@ out: } static errno_t -vfs_attr_pack_internal(vnode_t vp, uio_t auio, struct attrlist *alp, +vfs_attr_pack_internal(mount_t mp, vnode_t vp, uio_t auio, struct attrlist *alp, uint64_t options, struct vnode_attr *vap, __unused void *fndesc, vfs_context_t ctx, int is_bulk, enum vtype vtype, ssize_t fixedsize) { struct _attrlist_buf ab; + struct _attrlist_paths apaths = {.fullpathptr = NULL, .fullpathlenp = NULL, + .relpathptr = NULL, .relpathlenp = NULL, + .REALpathptr = NULL, .REALpathlenp = NULL}; ssize_t buf_size; size_t copy_size; ssize_t varsize; @@ -2497,10 +2636,13 @@ vfs_attr_pack_internal(vnode_t vp, uio_t auio, struct attrlist *alp, ssize_t fullpathlen; char *relpathptr; ssize_t relpathlen; + char *REALpathptr; + ssize_t REALpathlen; int error; int proc_is64; int return_valid; int pack_invalid; + int is_realdev; int alloc_local_buf; const int use_fork = options & FSOPT_ATTR_CMN_EXTENDED; @@ -2512,6 +2654,8 @@ vfs_attr_pack_internal(vnode_t vp, uio_t auio, struct attrlist *alp, fullpathlen = 0; relpathptr = NULL; relpathlen = 0; + REALpathptr = NULL; + REALpathlen = 0; error = 0; alloc_local_buf = 0; @@ -2524,6 +2668,7 @@ vfs_attr_pack_internal(vnode_t vp, uio_t auio, struct attrlist *alp, /* Check for special packing semantics */ return_valid = (alp->commonattr & ATTR_CMN_RETURNED_ATTRS) ? 1 : 0; pack_invalid = (options & FSOPT_PACK_INVAL_ATTRS) ? 1 : 0; + is_realdev = options & FSOPT_RETURN_REALDEV ? 1 : 0; if (pack_invalid) { /* Generate a valid mask for post processing */ @@ -2531,8 +2676,17 @@ vfs_attr_pack_internal(vnode_t vp, uio_t auio, struct attrlist *alp, } /* did we ask for something the filesystem doesn't support? */ - if (vap->va_active && !VATTR_ALL_SUPPORTED(vap)) { - vattr_get_alt_data(vp, alp, vap, return_valid, is_bulk, + if (vap->va_active && + (!VATTR_ALL_SUPPORTED(vap) +#if CONFIG_FIRMLINKS + /* For firmlink targets we have to overide what the FS returned for parentid */ + || + (!is_realdev && vp && (vp->v_flag & VFMLINKTARGET) && vp->v_fmlink && + (alp->commonattr & (ATTR_CMN_PAROBJID | ATTR_CMN_PARENTID))) +#endif + )) { + // this disables the selectors that were not supported by the filesystem + vattr_get_alt_data(vp, alp, vap, return_valid, is_bulk, is_realdev, ctx); /* check again */ @@ -2566,24 +2720,41 @@ vfs_attr_pack_internal(vnode_t vp, uio_t auio, struct attrlist *alp, goto out; } bzero(fullpathptr, MAXPATHLEN); + apaths.fullpathptr = fullpathptr; + apaths.fullpathlenp = &fullpathlen; } // only interpret fork attributes if they're used as new common attributes - if (vp && use_fork && (alp->forkattr & (ATTR_CMNEXT_RELPATH))) { - relpathptr = (char*) kalloc(MAXPATHLEN); - if (relpathptr == NULL) { - error = ENOMEM; - VFS_DEBUG(ctx, vp, "ATTRLIST - ERROR: cannot allocate relpath buffer"); - goto out; + if (vp && use_fork) { + if (alp->forkattr & (ATTR_CMNEXT_RELPATH)) { + relpathptr = (char*) kalloc(MAXPATHLEN); + if (relpathptr == NULL) { + error = ENOMEM; + VFS_DEBUG(ctx, vp, "ATTRLIST - ERROR: cannot allocate relpath buffer"); + goto out; + } + bzero(relpathptr, MAXPATHLEN); + apaths.relpathptr = relpathptr; + apaths.relpathlenp = &relpathlen; + } + + if (alp->forkattr & (ATTR_CMNEXT_NOFIRMLINKPATH)) { + REALpathptr = (char*) kalloc(MAXPATHLEN); + if (REALpathptr == NULL) { + error = ENOMEM; + VFS_DEBUG(ctx, vp, "ATTRLIST - ERROR: cannot allocate canonpath buffer"); + goto out; + } + bzero(REALpathptr, MAXPATHLEN); + apaths.REALpathptr = REALpathptr; + apaths.REALpathlenp = &REALpathlen; } - bzero(relpathptr, MAXPATHLEN); } /* * Compute variable-space requirements. */ - error = calc_varsize(vp, alp, vap, &varsize, fullpathptr, &fullpathlen, - relpathptr, &relpathlen, &vname, &cnp, &cnl); + error = calc_varsize(vp, alp, vap, &varsize, &apaths, &vname, &cnp, &cnl); if (error) { goto out; } @@ -2593,7 +2764,7 @@ vfs_attr_pack_internal(vnode_t vp, uio_t auio, struct attrlist *alp, * * Note that we won't ever copy out more than the caller requested, even though * we might have to allocate more than they offer so that the diagnostic checks - * don't result in a panic if the caller's buffer is too small.. + * don't result in a panic if the caller's buffer is too small. */ ab.allocated = fixedsize + varsize; /* Cast 'allocated' to an unsigned to verify allocation size */ @@ -2702,8 +2873,9 @@ vfs_attr_pack_internal(vnode_t vp, uio_t auio, struct attrlist *alp, ab.needed = ab.allocated; /* common attributes ************************************************/ - error = attr_pack_common(ctx, vp, alp, &ab, vap, proc_is64, cnp, cnl, - fullpathptr, fullpathlen, return_valid, pack_invalid, vtype, is_bulk); + error = attr_pack_common(ctx, (options & FSOPT_RETURN_REALDEV ? mp : NULL), + vp, alp, &ab, vap, proc_is64, cnp, cnl, fullpathptr, fullpathlen, + return_valid, pack_invalid, vtype, is_bulk); /* directory attributes *********************************************/ if (!error && alp->dirattr && (vtype == VDIR)) { @@ -2718,8 +2890,8 @@ vfs_attr_pack_internal(vnode_t vp, uio_t auio, struct attrlist *alp, /* common extended attributes *****************************************/ if (!error && use_fork) { - error = attr_pack_common_extended(vp, alp, &ab, relpathptr, relpathlen, - vap, return_valid, pack_invalid); + error = attr_pack_common_extended(mp, vp, alp, &ab, relpathptr, relpathlen, + REALpathptr, REALpathlen, vap, return_valid, pack_invalid); } if (error) { @@ -2789,6 +2961,9 @@ out: if (relpathptr) { kfree(relpathptr, MAXPATHLEN); } + if (REALpathptr) { + kfree(REALpathptr, MAXPATHLEN); + } if (ab.base != NULL && alloc_local_buf) { FREE(ab.base, M_TEMP); } @@ -2796,7 +2971,7 @@ out: } errno_t -vfs_attr_pack(vnode_t vp, uio_t uio, struct attrlist *alp, uint64_t options, +vfs_attr_pack_ext(mount_t mp, vnode_t vp, uio_t uio, struct attrlist *alp, uint64_t options, struct vnode_attr *vap, __unused void *fndesc, vfs_context_t ctx) { int error; @@ -2824,7 +2999,7 @@ vfs_attr_pack(vnode_t vp, uio_t uio, struct attrlist *alp, uint64_t options, goto out; } - error = vfs_attr_pack_internal(vp, uio, alp, + error = vfs_attr_pack_internal(mp, vp, uio, alp, options | FSOPT_REPORT_FULLSIZE, vap, NULL, ctx, 1, v_type, fixedsize); @@ -2835,6 +3010,13 @@ out: return error; } +errno_t +vfs_attr_pack(vnode_t vp, uio_t uio, struct attrlist *alp, uint64_t options, + struct vnode_attr *vap, __unused void *fndesc, vfs_context_t ctx) +{ + return vfs_attr_pack_ext(NULL, vp, uio, alp, options, vap, fndesc, ctx); +} + /* * Obtain attribute information about a filesystem object. * @@ -2889,7 +3071,7 @@ getattrlist_internal(vfs_context_t ctx, vnode_t vp, struct attrlist *alp, } VFS_DEBUG(ctx, vp, "%p ATTRLIST - %s request common %08x vol %08x file %08x dir %08x fork %08x %sfollow on '%s'", - vp, p->p_comm, alp->commonattr, alp->volattr, alp->fileattr, alp->dirattr, alp->forkattr, + vp, vfs_context_proc(ctx)->p_comm, alp->commonattr, alp->volattr, alp->fileattr, alp->dirattr, alp->forkattr, (options & FSOPT_NOFOLLOW) ? "no":"", vp->v_name); #if CONFIG_MACF @@ -3002,6 +3184,10 @@ getattrlist_internal(vfs_context_t ctx, vnode_t vp, struct attrlist *alp, va.va_name = authoritative_name ? NULL : va_name; + if (options & FSOPT_RETURN_REALDEV) { + va.va_vaflags |= VA_REALFSID; + } + /* * Call the filesystem. */ @@ -3047,7 +3233,7 @@ getattrlist_internal(vfs_context_t ctx, vnode_t vp, struct attrlist *alp, va.va_name = va_name; } - error = vfs_attr_pack_internal(vp, auio, alp, options, &va, NULL, ctx, + error = vfs_attr_pack_internal(vp->v_mount, vp, auio, alp, options, &va, NULL, ctx, 0, vtype, fixedsize); out: diff --git a/bsd/vfs/vfs_bio.c b/bsd/vfs/vfs_bio.c index 8ba4e78d2..5ce788691 100644 --- a/bsd/vfs/vfs_bio.c +++ b/bsd/vfs/vfs_bio.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2016 Apple Inc. All rights reserved. + * Copyright (c) 2000-2019 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -607,6 +607,22 @@ bufattr_quickcomplete(bufattr_t bap) return 0; } +void +bufattr_markioscheduled(bufattr_t bap) +{ + SET(bap->ba_flags, BA_IO_SCHEDULED); +} + + +int +bufattr_ioscheduled(bufattr_t bap) +{ + if ((bap->ba_flags & BA_IO_SCHEDULED)) { + return 1; + } + return 0; +} + errno_t buf_error(buf_t bp) { @@ -2171,13 +2187,13 @@ struct meta_zone_entry { }; struct meta_zone_entry meta_zones[] = { - {NULL, (MINMETA * 1), 128 * (MINMETA * 1), "buf.512" }, - {NULL, (MINMETA * 2), 64 * (MINMETA * 2), "buf.1024" }, - {NULL, (MINMETA * 4), 16 * (MINMETA * 4), "buf.2048" }, - {NULL, (MINMETA * 8), 512 * (MINMETA * 8), "buf.4096" }, - {NULL, (MINMETA * 16), 512 * (MINMETA * 16), "buf.8192" }, - {NULL, (MINMETA * 32), 512 * (MINMETA * 32), "buf.16384" }, - {NULL, 0, 0, "" } /* End */ + {.mz_zone = NULL, .mz_size = (MINMETA * 1), .mz_max = 128 * (MINMETA * 1), .mz_name = "buf.512" }, + {.mz_zone = NULL, .mz_size = (MINMETA * 2), .mz_max = 64 * (MINMETA * 2), .mz_name = "buf.1024" }, + {.mz_zone = NULL, .mz_size = (MINMETA * 4), .mz_max = 16 * (MINMETA * 4), .mz_name = "buf.2048" }, + {.mz_zone = NULL, .mz_size = (MINMETA * 8), .mz_max = 512 * (MINMETA * 8), .mz_name = "buf.4096" }, + {.mz_zone = NULL, .mz_size = (MINMETA * 16), .mz_max = 512 * (MINMETA * 16), .mz_name = "buf.8192" }, + {.mz_zone = NULL, .mz_size = (MINMETA * 32), .mz_max = 512 * (MINMETA * 32), .mz_name = "buf.16384" }, + {.mz_zone = NULL, .mz_size = 0, .mz_max = 0, .mz_name = "" } /* End */ }; /* diff --git a/bsd/vfs/vfs_cache.c b/bsd/vfs/vfs_cache.c index 9f3aaa548..18a0906b8 100644 --- a/bsd/vfs/vfs_cache.c +++ b/bsd/vfs/vfs_cache.c @@ -164,6 +164,7 @@ static const char *add_name_internal(const char *, uint32_t, u_int, boolean_t, u static void init_string_table(void); static void cache_delete(struct namecache *, int); static void cache_enter_locked(vnode_t dvp, vnode_t vp, struct componentname *cnp, const char *strname); +static void cache_purge_locked(vnode_t vp, kauth_cred_t *credp); #ifdef DUMP_STRING_TABLE /* @@ -479,6 +480,13 @@ again: */ NAME_CACHE_LOCK_SHARED(); +#if CONFIG_FIRMLINKS + if (!(flags & BUILDPATH_NO_FIRMLINK) && + (vp->v_flag & VFMLINKTARGET) && vp->v_fmlink) { + vp = vp->v_fmlink; + } +#endif + /* * Check if this is the root of a file system. */ @@ -501,6 +509,12 @@ again: * want to cross mount points. Therefore just return * '/' as the relative path. */ +#if CONFIG_FIRMLINKS + if (!(flags & BUILDPATH_NO_FIRMLINK) && + (vp->v_flag & VFMLINKTARGET) && vp->v_fmlink) { + vp = vp->v_fmlink; + } else +#endif if (flags & BUILDPATH_VOLUME_RELATIVE) { *--end = '/'; goto out_unlock; @@ -730,6 +744,15 @@ bad_news: if (tvp == proc_root_dir_vp) { goto out_unlock; /* encountered the root */ } + +#if CONFIG_FIRMLINKS + if (!(flags & BUILDPATH_NO_FIRMLINK) && + (tvp->v_flag & VFMLINKTARGET) && tvp->v_fmlink) { + tvp = tvp->v_fmlink; + break; + } +#endif + if (!(tvp->v_flag & VROOT) || !tvp->v_mount) { break; /* not the root of a mounted FS */ } @@ -790,6 +813,9 @@ vnode_getparent(vnode_t vp) int pvid; NAME_CACHE_LOCK_SHARED(); + + pvp = vp->v_parent; + /* * v_parent is stable behind the name_cache lock * however, the only thing we can really guarantee @@ -797,7 +823,7 @@ vnode_getparent(vnode_t vp) * parent of 'vp' at the time we took the name_cache lock... * once we drop the lock, vp could get re-parented */ - if ((pvp = vp->v_parent) != NULLVP) { + if (pvp != NULLVP) { pvid = pvp->v_id; NAME_CACHE_UNLOCK(); @@ -930,9 +956,34 @@ vnode_update_identity(vnode_t vp, vnode_t dvp, const char *name, int name_len, u flags &= ~VNODE_UPDATE_NAME; } } - if ((flags & (VNODE_UPDATE_PURGE | VNODE_UPDATE_PARENT | VNODE_UPDATE_CACHE | VNODE_UPDATE_NAME))) { + if ((flags & (VNODE_UPDATE_PURGE | VNODE_UPDATE_PARENT | VNODE_UPDATE_CACHE | VNODE_UPDATE_NAME | VNODE_UPDATE_PURGEFIRMLINK))) { NAME_CACHE_LOCK(); +#if CONFIG_FIRMLINKS + if (flags & VNODE_UPDATE_PURGEFIRMLINK) { + vnode_t old_fvp = vp->v_fmlink; + if (old_fvp) { + vnode_lock_spin(vp); + vp->v_flag &= ~VFMLINKTARGET; + vp->v_fmlink = NULLVP; + vnode_unlock(vp); + NAME_CACHE_UNLOCK(); + + /* + * vnode_rele can result in cascading series of + * usecount releases. The combination of calling + * vnode_recycle and dont_reenter (3rd arg to + * vnode_rele_internal) ensures we don't have + * that issue. + */ + vnode_recycle(old_fvp); + vnode_rele_internal(old_fvp, O_EVTONLY, 1, 0); + + NAME_CACHE_LOCK(); + } + } +#endif + if ((flags & VNODE_UPDATE_PURGE)) { if (vp->v_parent) { vp->v_parent->v_nc_generation++; @@ -1081,6 +1132,139 @@ vnode_update_identity(vnode_t vp, vnode_t dvp, const char *name, int name_len, u } } +#if CONFIG_FIRMLINKS +errno_t +vnode_setasfirmlink(vnode_t vp, vnode_t target_vp) +{ + int error = 0; + vnode_t old_target_vp = NULLVP; + vnode_t old_target_vp_v_fmlink = NULLVP; + kauth_cred_t target_vp_cred = NULL; + kauth_cred_t old_target_vp_cred = NULL; + + if (!vp) { + return EINVAL; + } + + if (target_vp) { + if (vp->v_fmlink == target_vp) { /* Will be checked again under the name cache lock */ + return 0; + } + + /* + * Firmlink source and target will take both a usecount + * and kusecount on each other. + */ + if ((error = vnode_ref_ext(target_vp, O_EVTONLY, VNODE_REF_FORCE))) { + return error; + } + + if ((error = vnode_ref_ext(vp, O_EVTONLY, VNODE_REF_FORCE))) { + vnode_rele_ext(target_vp, O_EVTONLY, 1); + return error; + } + } + + NAME_CACHE_LOCK(); + + old_target_vp = vp->v_fmlink; + if (target_vp && (target_vp == old_target_vp)) { + NAME_CACHE_UNLOCK(); + return 0; + } + vp->v_fmlink = target_vp; + + vnode_lock_spin(vp); + vp->v_flag &= ~VFMLINKTARGET; + vnode_unlock(vp); + + if (target_vp) { + target_vp->v_fmlink = vp; + vnode_lock_spin(target_vp); + target_vp->v_flag |= VFMLINKTARGET; + vnode_unlock(target_vp); + cache_purge_locked(vp, &target_vp_cred); + } + + if (old_target_vp) { + old_target_vp_v_fmlink = old_target_vp->v_fmlink; + old_target_vp->v_fmlink = NULLVP; + vnode_lock_spin(old_target_vp); + old_target_vp->v_flag &= ~VFMLINKTARGET; + vnode_unlock(old_target_vp); + cache_purge_locked(vp, &old_target_vp_cred); + } + + NAME_CACHE_UNLOCK(); + + if (target_vp_cred && IS_VALID_CRED(target_vp_cred)) { + kauth_cred_unref(&target_vp_cred); + } + + if (old_target_vp) { + if (old_target_vp_cred && IS_VALID_CRED(old_target_vp_cred)) { + kauth_cred_unref(&old_target_vp_cred); + } + + vnode_rele_ext(old_target_vp, O_EVTONLY, 1); + if (old_target_vp_v_fmlink) { + vnode_rele_ext(old_target_vp_v_fmlink, O_EVTONLY, 1); + } + } + + return 0; +} + +errno_t +vnode_getfirmlink(vnode_t vp, vnode_t *target_vp) +{ + int error; + + if (!vp->v_fmlink) { + return ENODEV; + } + + NAME_CACHE_LOCK_SHARED(); + if (vp->v_fmlink && !(vp->v_flag & VFMLINKTARGET) && + (vnode_get(vp->v_fmlink) == 0)) { + vnode_t tvp = vp->v_fmlink; + + vnode_lock_spin(tvp); + if (tvp->v_lflag & (VL_TERMINATE | VL_DEAD)) { + vnode_unlock(tvp); + NAME_CACHE_UNLOCK(); + vnode_put(tvp); + return ENOENT; + } + if (!(tvp->v_flag & VFMLINKTARGET)) { + panic("firmlink target for vnode %p does not have flag set", vp); + } + vnode_unlock(tvp); + *target_vp = tvp; + error = 0; + } else { + *target_vp = NULLVP; + error = ENODEV; + } + NAME_CACHE_UNLOCK(); + return error; +} + +#else /* CONFIG_FIRMLINKS */ + +errno_t +vnode_setasfirmlink(__unused vnode_t vp, __unused vnode_t src_vp) +{ + return ENOTSUP; +} + +errno_t +vnode_getfirmlink(__unused vnode_t vp, __unused vnode_t *target_vp) +{ + return ENOTSUP; +} + +#endif /* * Mark a vnode as having multiple hard links. HFS makes use of this @@ -1476,6 +1660,12 @@ skiprsrcfork: break; } if (cnp->cn_flags & ISDOTDOT) { +#if CONFIG_FIRMLINKS + if (dp->v_fmlink && (dp->v_flag & VFMLINKTARGET)) { + dp = dp->v_fmlink; + } +#endif + /* * Force directory hardlinks to go to * file system for ".." requests. @@ -2336,12 +2526,12 @@ cache_delete(struct namecache *ncp, int free_entry) * purge the entry associated with the * specified vnode from the name cache */ -void -cache_purge(vnode_t vp) +static void +cache_purge_locked(vnode_t vp, kauth_cred_t *credp) { struct namecache *ncp; - kauth_cred_t tcred = NULL; + *credp = NULL; if ((LIST_FIRST(&vp->v_nclinks) == NULL) && (TAILQ_FIRST(&vp->v_ncchildren) == NULL) && (vp->v_cred == NOCRED) && @@ -2349,8 +2539,6 @@ cache_purge(vnode_t vp) return; } - NAME_CACHE_LOCK(); - if (vp->v_parent) { vp->v_parent->v_nc_generation++; } @@ -2366,13 +2554,30 @@ cache_purge(vnode_t vp) /* * Use a temp variable to avoid kauth_cred_unref() while NAME_CACHE_LOCK is held */ - tcred = vp->v_cred; + *credp = vp->v_cred; vp->v_cred = NOCRED; vp->v_authorized_actions = 0; +} + +void +cache_purge(vnode_t vp) +{ + kauth_cred_t tcred = NULL; + + if ((LIST_FIRST(&vp->v_nclinks) == NULL) && + (TAILQ_FIRST(&vp->v_ncchildren) == NULL) && + (vp->v_cred == NOCRED) && + (vp->v_parent == NULLVP)) { + return; + } + + NAME_CACHE_LOCK(); + + cache_purge_locked(vp, &tcred); NAME_CACHE_UNLOCK(); - if (IS_VALID_CRED(tcred)) { + if (tcred && IS_VALID_CRED(tcred)) { kauth_cred_unref(&tcred); } } diff --git a/bsd/vfs/vfs_cluster.c b/bsd/vfs/vfs_cluster.c index 56d369787..181614fcb 100644 --- a/bsd/vfs/vfs_cluster.c +++ b/bsd/vfs/vfs_cluster.c @@ -5067,7 +5067,7 @@ wait_for_dreads: * vm_pre_fault() will call vm_fault() to enter the page into * the pmap if there isn't _a_ physical page for that VA already. */ - vm_pre_fault(vm_map_trunc_page(next_iov_base, PAGE_MASK)); + vm_pre_fault(vm_map_trunc_page(next_iov_base, PAGE_MASK), VM_PROT_READ); } if (io_req_size && retval == 0) { @@ -6897,8 +6897,13 @@ vfs_drt_alloc_map(struct vfs_drt_clustermap **cmapp) modulus_size = DRT_HASH_XLARGE_MODULUS; map_size = DRT_XLARGE_ALLOCATION; } else { - modulus_size = DRT_HASH_LARGE_MODULUS; - map_size = DRT_LARGE_ALLOCATION; + /* + * If the ring is completely full and we can't + * expand, there's nothing useful for us to do. + * Behave as though we had compacted into the new + * array and return. + */ + return KERN_SUCCESS; } } else { /* already using the xlarge modulus */ diff --git a/bsd/vfs/vfs_conf.c b/bsd/vfs/vfs_conf.c index d6595dc96..1d61ed284 100644 --- a/bsd/vfs/vfs_conf.c +++ b/bsd/vfs/vfs_conf.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2014 Apple Inc. All rights reserved. + * Copyright (c) 2000-2019 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -94,7 +94,7 @@ extern int nfs_mountroot(void); extern struct vfsops afs_vfsops; extern struct vfsops null_vfsops; extern struct vfsops devfs_vfsops; -extern struct vfsops routefs_vfsops; +extern const struct vfsops routefs_vfsops; extern struct vfsops nullfs_vfsops; #if MOCKFS @@ -123,36 +123,149 @@ enum fs_type_num { static struct vfstable vfstbllist[] = { /* Sun-compatible Network Filesystem */ #if NFSCLIENT - { &nfs_vfsops, "nfs", FT_NFS, 0, 0, NULL, NULL, 0, 0, VFC_VFSGENERICARGS | VFC_VFSPREFLIGHT | VFC_VFS64BITREADY | VFC_VFSREADDIR_EXTENDED, NULL, 0, NULL}, -#endif + { + .vfc_vfsops = &nfs_vfsops, + .vfc_name = "nfs", + .vfc_typenum = FT_NFS, + .vfc_refcount = 0, + .vfc_flags = 0, + .vfc_mountroot = NULL, + .vfc_next = NULL, + .vfc_reserved1 = 0, + .vfc_reserved2 = 0, + .vfc_vfsflags = VFC_VFSGENERICARGS | VFC_VFSPREFLIGHT | VFC_VFS64BITREADY | VFC_VFSREADDIR_EXTENDED, + .vfc_descptr = NULL, + .vfc_descsize = 0, + .vfc_sysctl = NULL + }, +#endif /* NFSCLIENT */ /* Device Filesystem */ #if DEVFS #if CONFIG_MACF - { &devfs_vfsops, "devfs", FT_DEVFS, 0, MNT_MULTILABEL, NULL, NULL, 0, 0, VFC_VFSGENERICARGS | VFC_VFS64BITREADY, NULL, 0, NULL}, -#else - { &devfs_vfsops, "devfs", FT_DEVFS, 0, 0, NULL, NULL, 0, 0, VFC_VFSGENERICARGS | VFC_VFS64BITREADY, NULL, 0, NULL}, -#endif /* MAC */ -#endif + { + .vfc_vfsops = &devfs_vfsops, + .vfc_name = "devfs", + .vfc_typenum = FT_DEVFS, + .vfc_refcount = 0, + .vfc_flags = MNT_MULTILABEL, + .vfc_mountroot = NULL, + .vfc_next = NULL, + .vfc_reserved1 = 0, + .vfc_reserved2 = 0, + .vfc_vfsflags = VFC_VFSGENERICARGS | VFC_VFS64BITREADY, + .vfc_descptr = NULL, + .vfc_descsize = 0, + .vfc_sysctl = NULL + }, +#else /* !CONFIG_MAC */ + { + .vfc_vfsops = &devfs_vfsops, + .vfc_name = "devfs", + .vfc_typenum = FT_DEVFS, + .vfc_refcount = 0, + .vfc_flags = 0, + .vfc_mountroot = NULL, + .vfc_next = NULL, + .vfc_reserved1 = 0, + .vfc_reserved2 = 0, + .vfc_vfsflags = VFC_VFSGENERICARGS | VFC_VFS64BITREADY, + .vfc_descptr = NULL, + .vfc_descsize = 0, + .vfc_sysctl = NULL + }, +#endif /* CONFIG_MAC */ +#endif /* DEVFS */ #ifndef __LP64__ #endif /* __LP64__ */ #if NULLFS - { &nullfs_vfsops, "nullfs", FT_NULLFS, 0, (MNT_DONTBROWSE | MNT_RDONLY), NULL, NULL, 0, 0, VFC_VFS64BITREADY, NULL, 0, NULL}, + { + .vfc_vfsops = &nullfs_vfsops, + .vfc_name = "nullfs", + .vfc_typenum = FT_NULLFS, + .vfc_refcount = 0, + .vfc_flags = MNT_DONTBROWSE | MNT_RDONLY, + .vfc_mountroot = NULL, + .vfc_next = NULL, + .vfc_reserved1 = 0, + .vfc_reserved2 = 0, + .vfc_vfsflags = VFC_VFS64BITREADY, + .vfc_descptr = NULL, + .vfc_descsize = 0, + .vfc_sysctl = NULL + }, #endif /* NULLFS */ #if MOCKFS /* If we are configured for it, mockfs should always be the last standard entry (and thus the last FS we attempt mountroot with) */ - { &mockfs_vfsops, "mockfs", FT_MOCKFS, 0, MNT_LOCAL, mockfs_mountroot, NULL, 0, 0, VFC_VFSGENERICARGS, NULL, 0, NULL}, + { + .vfc_vfsops = &mockfs_vfsops, + .vfc_name = "mockfs", + .vfc_typenum = FT_MOCKFS, + .vfc_refcount = 0, + .vfc_flags = MNT_LOCAL, + .vfc_mountroot = mockfs_mountroot, + .vfc_next = NULL, + .vfc_reserved1 = 0, + .vfc_reserved2 = 0, + .vfc_vfsflags = VFC_VFSGENERICARGS, + .vfc_descptr = NULL, + .vfc_descsize = 0, + .vfc_sysctl = NULL + }, #endif /* MOCKFS */ #if ROUTEFS /* If we are configured for it, mockfs should always be the last standard entry (and thus the last FS we attempt mountroot with) */ - { &routefs_vfsops, "routefs", FT_ROUTEFS, 0, MNT_LOCAL, NULL, NULL, 0, 0, VFC_VFSGENERICARGS | VFC_VFS64BITREADY, NULL, 0, NULL}, + { + .vfc_vfsops = &routefs_vfsops, + .vfc_name = "routefs", + .vfc_typenum = FT_ROUTEFS, + .vfc_refcount = 0, + .vfc_flags = MNT_LOCAL, + .vfc_mountroot = NULL, + .vfc_next = NULL, + .vfc_reserved1 = 0, + .vfc_reserved2 = 0, + .vfc_vfsflags = VFC_VFSGENERICARGS | VFC_VFS64BITREADY, + .vfc_descptr = NULL, + .vfc_descsize = 0, + .vfc_sysctl = NULL + }, #endif /* ROUTEFS */ - {NULL, "", 0, 0, 0, NULL, NULL, 0, 0, 0, NULL, 0, NULL}, - {NULL, "", 0, 0, 0, NULL, NULL, 0, 0, 0, NULL, 0, NULL}, + + { + .vfc_vfsops = NULL, + .vfc_name = "", + .vfc_typenum = 0, + .vfc_refcount = 0, + .vfc_flags = 0, + .vfc_mountroot = NULL, + .vfc_next = NULL, + .vfc_reserved1 = 0, + .vfc_reserved2 = 0, + .vfc_vfsflags = 0, + .vfc_descptr = NULL, + .vfc_descsize = 0, + .vfc_sysctl = NULL + }, + { + .vfc_vfsops = NULL, + .vfc_name = "", + .vfc_typenum = 0, + .vfc_refcount = 0, + .vfc_flags = 0, + .vfc_mountroot = NULL, + .vfc_next = NULL, + .vfc_reserved1 = 0, + .vfc_reserved2 = 0, + .vfc_vfsflags = 0, + .vfc_descptr = NULL, + .vfc_descsize = 0, + .vfc_sysctl = NULL + }, }; /* @@ -172,32 +285,34 @@ struct vfstable *vfsconf = vfstbllist; * */ extern struct vnodeopv_desc mfs_vnodeop_opv_desc; -extern struct vnodeopv_desc dead_vnodeop_opv_desc; +extern const struct vnodeopv_desc dead_vnodeop_opv_desc; #if FIFO && SOCKETS -extern struct vnodeopv_desc fifo_vnodeop_opv_desc; +extern const struct vnodeopv_desc fifo_vnodeop_opv_desc; #endif /* SOCKETS */ -extern struct vnodeopv_desc spec_vnodeop_opv_desc; -extern struct vnodeopv_desc nfsv2_vnodeop_opv_desc; -extern struct vnodeopv_desc spec_nfsv2nodeop_opv_desc; -extern struct vnodeopv_desc fifo_nfsv2nodeop_opv_desc; -extern struct vnodeopv_desc nfsv4_vnodeop_opv_desc; -extern struct vnodeopv_desc spec_nfsv4nodeop_opv_desc; -extern struct vnodeopv_desc fifo_nfsv4nodeop_opv_desc; +extern const struct vnodeopv_desc spec_vnodeop_opv_desc; +extern const struct vnodeopv_desc nfsv2_vnodeop_opv_desc; +extern const struct vnodeopv_desc spec_nfsv2nodeop_opv_desc; +extern const struct vnodeopv_desc fifo_nfsv2nodeop_opv_desc; +#if CONFIG_NFS4 +extern const struct vnodeopv_desc nfsv4_vnodeop_opv_desc; +extern const struct vnodeopv_desc spec_nfsv4nodeop_opv_desc; +extern const struct vnodeopv_desc fifo_nfsv4nodeop_opv_desc; +#endif extern struct vnodeopv_desc null_vnodeop_opv_desc; extern struct vnodeopv_desc devfs_vnodeop_opv_desc; extern struct vnodeopv_desc devfs_spec_vnodeop_opv_desc; #if FDESC extern struct vnodeopv_desc devfs_devfd_vnodeop_opv_desc; -extern struct vnodeopv_desc devfs_fdesc_vnodeop_opv_desc; +extern const struct vnodeopv_desc devfs_fdesc_vnodeop_opv_desc; #endif /* FDESC */ #if MOCKFS -extern struct vnodeopv_desc mockfs_vnodeop_opv_desc; +extern const struct vnodeopv_desc mockfs_vnodeop_opv_desc; #endif /* MOCKFS */ -extern struct vnodeopv_desc nullfs_vnodeop_opv_desc; +extern const struct vnodeopv_desc nullfs_vnodeop_opv_desc; -struct vnodeopv_desc *vfs_opv_descs[] = { +const struct vnodeopv_desc *vfs_opv_descs[] = { &dead_vnodeop_opv_desc, #if FIFO && SOCKETS &fifo_vnodeop_opv_desc, @@ -209,13 +324,17 @@ struct vnodeopv_desc *vfs_opv_descs[] = { #if NFSCLIENT &nfsv2_vnodeop_opv_desc, &spec_nfsv2nodeop_opv_desc, +#if CONFIG_NFS4 &nfsv4_vnodeop_opv_desc, &spec_nfsv4nodeop_opv_desc, +#endif #if FIFO &fifo_nfsv2nodeop_opv_desc, +#if CONFIG_NFS4 &fifo_nfsv4nodeop_opv_desc, -#endif -#endif +#endif /* CONFIG_NFS4 */ +#endif /* FIFO */ +#endif /* NFSCLIENT */ #if DEVFS &devfs_vnodeop_opv_desc, &devfs_spec_vnodeop_opv_desc, diff --git a/bsd/vfs/vfs_disk_conditioner.c b/bsd/vfs/vfs_disk_conditioner.c index 9bfbeab55..7df2f287b 100644 --- a/bsd/vfs/vfs_disk_conditioner.c +++ b/bsd/vfs/vfs_disk_conditioner.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2016-2018 Apple Computer, Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -219,8 +219,9 @@ disk_conditioner_set_info(mount_t mp, disk_conditioner_info *uinfo) internal_info = mp->mnt_disk_conditioner_info; if (!internal_info) { - internal_info = mp->mnt_disk_conditioner_info = kalloc(sizeof(struct _disk_conditioner_info_t)); + internal_info = kalloc(sizeof(struct _disk_conditioner_info_t)); bzero(internal_info, sizeof(struct _disk_conditioner_info_t)); + mp->mnt_disk_conditioner_info = internal_info; mnt_fields = &(internal_info->mnt_fields); /* save mount_t fields for restoration later */ @@ -300,7 +301,10 @@ disk_conditioner_mount_is_ssd(mount_t mp) struct _disk_conditioner_info_t *internal_info = mp->mnt_disk_conditioner_info; if (!internal_info || !internal_info->dcinfo.enabled) { - return !!(mp->mnt_kern_flag & MNTK_SSD); + if (mp->mnt_kern_flag & MNTK_SSD) { + return TRUE; + } + return FALSE; } return internal_info->dcinfo.is_ssd; diff --git a/bsd/vfs/vfs_fsevents.c b/bsd/vfs/vfs_fsevents.c index 1a6fa3844..f7916db48 100644 --- a/bsd/vfs/vfs_fsevents.c +++ b/bsd/vfs/vfs_fsevents.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2004-2014 Apple Inc. All rights reserved. + * Copyright (c) 2004-2019 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -605,7 +605,7 @@ add_fsevent(int type, vfs_context_t ctx, ...) val = 0xbadc0de2; } // overlay the dest inode number on the str/dest pointer fields - memcpy(&cur->str, &val, sizeof(ino64_t)); + __nochk_memcpy(&cur->str, &val, sizeof(ino64_t)); // and last the document-id @@ -619,7 +619,10 @@ add_fsevent(int type, vfs_context_t ctx, ...) } // the docid is 64-bit and overlays the uid/gid fields - memcpy(&cur->uid, &val, sizeof(uint64_t)); + static_assert(sizeof(cur->uid) + sizeof(cur->gid) == sizeof(val), "gid/uid size mismatch"); + static_assert(offsetof(struct kfs_event, gid) - offsetof(struct kfs_event, uid) == sizeof(cur->uid), "unexpected struct kfs_event layout"); + memcpy(&cur->uid, &val, sizeof(cur->uid)); + memcpy(&cur->gid, (u_int8_t *)&val + sizeof(cur->uid), sizeof(cur->gid)); goto done_with_args; } @@ -685,7 +688,7 @@ add_fsevent(int type, vfs_context_t ctx, ...) pathbuff_len = MAXPATHLEN; pathbuff[0] = '\0'; - if ((ret = vn_getpath(vp, pathbuff, &pathbuff_len)) != 0 || pathbuff[0] == '\0') { + if ((ret = vn_getpath_no_firmlink(vp, pathbuff, &pathbuff_len)) != 0 || pathbuff[0] == '\0') { cur->flags |= KFSE_CONTAINS_DROPPED_EVENTS; do { @@ -703,7 +706,7 @@ add_fsevent(int type, vfs_context_t ctx, ...) } pathbuff_len = MAXPATHLEN; - ret = vn_getpath(vp, pathbuff, &pathbuff_len); + ret = vn_getpath_no_firmlink(vp, pathbuff, &pathbuff_len); } while (ret == ENOSPC); if (ret != 0 || vp == NULL) { @@ -1621,7 +1624,7 @@ fsevent_unmount(__unused struct mount *mp, __unused vfs_context_t ctx) #if CONFIG_EMBEDDED dev_t dev = mp->mnt_vfsstat.f_fsid.val[0]; int error, waitcount = 0; - struct timespec ts = {1, 0}; + struct timespec ts = {.tv_sec = 1, .tv_nsec = 0}; // wait for any other pending unmounts to complete lock_watch_table(); @@ -1708,13 +1711,6 @@ fseventsf_read(struct fileproc *fp, struct uio *uio, } -static int -fseventsf_write(__unused struct fileproc *fp, __unused struct uio *uio, - __unused int flags, __unused vfs_context_t ctx) -{ - return EIO; -} - #pragma pack(push, 4) typedef struct fsevent_dev_filter_args32 { uint32_t num_devices; @@ -1939,11 +1935,12 @@ filt_fsevent_detach(struct knote *kn) * --If hint is revoke, set special flags and activate */ static int -filt_fsevent(struct knote *kn, long hint) +filt_fsevent_common(struct knote *kn, struct kevent_qos_s *kev, long hint) { fsevent_handle *fseh = (struct fsevent_handle *)kn->kn_hook; int activate = 0; int32_t rd, wr, amt; + int64_t data = 0; if (NOTE_REVOKE == hint) { kn->kn_flags |= (EV_EOF | EV_ONESHOT); @@ -1960,11 +1957,8 @@ filt_fsevent(struct knote *kn, long hint) switch (kn->kn_filter) { case EVFILT_READ: - kn->kn_data = amt; - - if (kn->kn_data != 0) { - activate = 1; - } + data = amt; + activate = (data != 0); break; case EVFILT_VNODE: /* Check events this note matches against the hint */ @@ -1975,18 +1969,25 @@ filt_fsevent(struct knote *kn, long hint) activate = 1; } break; - default: { + default: // nothing to do... break; } - } + if (activate && kev) { + knote_fill_kevent(kn, kev, data); + } return activate; } +static int +filt_fsevent(struct knote *kn, long hint) +{ + return filt_fsevent_common(kn, NULL, hint); +} static int -filt_fsevent_touch(struct knote *kn, struct kevent_internal_s *kev) +filt_fsevent_touch(struct knote *kn, struct kevent_qos_s *kev) { int res; @@ -2004,7 +2005,7 @@ filt_fsevent_touch(struct knote *kn, struct kevent_internal_s *kev) //kn->kn_fflags &= kev->fflags; /* determine if the filter is now fired */ - res = filt_fsevent(kn, 0); + res = filt_fsevent_common(kn, NULL, 0); unlock_watch_table(); @@ -2012,23 +2013,16 @@ filt_fsevent_touch(struct knote *kn, struct kevent_internal_s *kev) } static int -filt_fsevent_process(struct knote *kn, struct filt_process_s *data, struct kevent_internal_s *kev) +filt_fsevent_process(struct knote *kn, struct kevent_qos_s *kev) { -#pragma unused(data) int res; lock_watch_table(); - res = filt_fsevent(kn, 0); - if (res) { - *kev = kn->kn_kevent; - if (kev->flags & EV_CLEAR) { - kn->kn_data = 0; - kn->kn_fflags = 0; - } - } + res = filt_fsevent_common(kn, kev, 0); unlock_watch_table(); + return res; } @@ -2042,14 +2036,13 @@ SECURITY_READ_ONLY_EARLY(struct filterops) fsevent_filtops = { }; static int -fseventsf_kqfilter(__unused struct fileproc *fp, __unused struct knote *kn, - __unused struct kevent_internal_s *kev, __unused vfs_context_t ctx) +fseventsf_kqfilter(struct fileproc *fp, struct knote *kn, + __unused struct kevent_qos_s *kev) { fsevent_handle *fseh = (struct fsevent_handle *)fp->f_fglob->fg_data; int res; kn->kn_hook = (void*)fseh; - kn->kn_hookid = 1; kn->kn_filtid = EVFILTID_FSEVENT; lock_watch_table(); @@ -2057,7 +2050,7 @@ fseventsf_kqfilter(__unused struct fileproc *fp, __unused struct knote *kn, KNOTE_ATTACH(&fseh->knotes, kn); /* check to see if it is fired already */ - res = filt_fsevent(kn, 0); + res = filt_fsevent_common(kn, NULL, 0); unlock_watch_table(); @@ -2289,14 +2282,14 @@ fseventswrite(__unused dev_t dev, struct uio *uio, __unused int ioflag) static const struct fileops fsevents_fops = { - .fo_type = DTYPE_FSEVENTS, - .fo_read = fseventsf_read, - .fo_write = fseventsf_write, - .fo_ioctl = fseventsf_ioctl, - .fo_select = fseventsf_select, - .fo_close = fseventsf_close, + .fo_type = DTYPE_FSEVENTS, + .fo_read = fseventsf_read, + .fo_write = fo_no_write, + .fo_ioctl = fseventsf_ioctl, + .fo_select = fseventsf_select, + .fo_close = fseventsf_close, .fo_kqfilter = fseventsf_kqfilter, - .fo_drain = fseventsf_drain, + .fo_drain = fseventsf_drain, }; typedef struct fsevent_clone_args32 { @@ -2380,12 +2373,26 @@ handle_clone: return error; } + /* + * Lock down the user's "fd" result buffer so it's safe + * to hold locks while we copy it out. + */ + error = vslock((user_addr_t)fse_clone_args->fd, + sizeof(int32_t)); + if (error) { + FREE(event_list, M_TEMP); + FREE(fseh, M_TEMP); + return error; + } + error = add_watcher(event_list, fse_clone_args->num_events, fse_clone_args->event_queue_depth, &fseh->watcher, fseh); if (error) { + vsunlock((user_addr_t)fse_clone_args->fd, + sizeof(int32_t), 0); FREE(event_list, M_TEMP); FREE(fseh, M_TEMP); return error; @@ -2396,6 +2403,8 @@ handle_clone: error = falloc(p, &f, &fd, vfs_context_current()); if (error) { remove_watcher(fseh->watcher); + vsunlock((user_addr_t)fse_clone_args->fd, + sizeof(int32_t), 0); FREE(event_list, M_TEMP); FREE(fseh, M_TEMP); return error; @@ -2404,16 +2413,21 @@ handle_clone: f->f_fglob->fg_flag = FREAD | FWRITE; f->f_fglob->fg_ops = &fsevents_fops; f->f_fglob->fg_data = (caddr_t) fseh; - proc_fdunlock(p); + /* + * We can safely hold the proc_fdlock across this copyout() + * because of the vslock() call above. The vslock() call + * also ensures that we will never get an error, so assert + * this. + */ error = copyout((void *)&fd, fse_clone_args->fd, sizeof(int32_t)); - if (error != 0) { - fp_free(p, fd, f); - } else { - proc_fdlock(p); - procfdtbl_releasefd(p, fd, NULL); - fp_drop(p, fd, f, 1); - proc_fdunlock(p); - } + assert(error == 0); + + procfdtbl_releasefd(p, fd, NULL); + fp_drop(p, fd, f, 1); + proc_fdunlock(p); + + vsunlock((user_addr_t)fse_clone_args->fd, + sizeof(int32_t), 1); break; default: @@ -2510,6 +2524,7 @@ get_fse_info(struct vnode *vp, fse_info *fse, __unused vfs_context_t ctx) VATTR_INIT(&va); VATTR_WANTED(&va, va_fsid); + va.va_vaflags |= VA_REALFSID; VATTR_WANTED(&va, va_fileid); VATTR_WANTED(&va, va_mode); VATTR_WANTED(&va, va_uid); @@ -2595,7 +2610,7 @@ create_fsevent_from_kevent(vnode_t vp, uint32_t kevents, struct vnode_attr *vap) fse.gid = vap->va_gid; len = sizeof(pathbuf); - if (vn_getpath(vp, pathbuf, &len) == 0) { + if (vn_getpath_no_firmlink(vp, pathbuf, &len) == 0) { add_fsevent(fsevent_type, vfs_context_current(), FSE_ARG_STRING, len, pathbuf, FSE_ARG_FINFO, &fse, FSE_ARG_DONE); } return; diff --git a/bsd/vfs/vfs_init.c b/bsd/vfs/vfs_init.c index cc70d2c24..d17cb02bd 100644 --- a/bsd/vfs/vfs_init.c +++ b/bsd/vfs/vfs_init.c @@ -104,7 +104,7 @@ __private_extern__ void vntblinit(void); -extern struct vnodeopv_desc *vfs_opv_descs[]; +extern const struct vnodeopv_desc *vfs_opv_descs[]; /* a list of lists of vnodeops defns */ extern struct vnodeop_desc *vfs_op_descs[]; /* and the operations they perform */ @@ -150,7 +150,7 @@ vfs_opv_init(void) int i, j, k; int(***opv_desc_vector_p)(void *); int(**opv_desc_vector)(void *); - struct vnodeopv_entry_desc *opve_descp; + const struct vnodeopv_entry_desc *opve_descp; /* * Allocate the dynamic vectors and fill them in. @@ -319,8 +319,6 @@ lck_mtx_t *pkg_extensions_lck; struct mount * dead_mountp; -extern void nspace_handler_init(void); - /* * Initialize the vnode structures and initialize each file system type. */ @@ -415,8 +413,6 @@ vfsinit(void) */ nchinit(); - nspace_handler_init(); - /* * Build vnode operation vectors. */ @@ -516,6 +512,8 @@ vfsinit(void) #if FS_COMPRESSION decmpfs_init(); #endif + + nspace_resolver_init(); } void diff --git a/bsd/vfs/vfs_lookup.c b/bsd/vfs/vfs_lookup.c index 2764ecc73..aaaf2fbb1 100644 --- a/bsd/vfs/vfs_lookup.c +++ b/bsd/vfs/vfs_lookup.c @@ -95,6 +95,8 @@ #include #endif +#include + #if NAMEDRSRCFORK #include #endif @@ -631,7 +633,21 @@ lookup_handle_rsrc_fork(vnode_t dp, struct nameidata *ndp, struct componentname /* Restore the truncated pathname buffer (for audits). */ if (ndp->ni_pathlen == 1 && ndp->ni_next[0] == '\0') { - ndp->ni_next[0] = '/'; + /* + * While we replaced only '/' with '\0' and would ordinarily + * need to just switch that back, the buffer in which we did + * this may not be what the pathname buffer is now when symlinks + * are involved. If we just restore the "/" we will make the + * string not terminated anymore, so be safe and restore the + * entire suffix. + */ + strncpy(ndp->ni_next, _PATH_RSRCFORKSPEC, sizeof(_PATH_RSRCFORKSPEC)); + cnp->cn_nameptr = ndp->ni_next + 1; + cnp->cn_namelen = sizeof(_PATH_RSRCFORKSPEC) - 1; + ndp->ni_next += cnp->cn_namelen; + if (ndp->ni_next[0] != '\0') { + panic("Incorrect termination of path in %s", __FUNCTION__); + } } cnp->cn_flags &= ~MAKEENTRY; @@ -1535,6 +1551,7 @@ lookup_handle_symlink(struct nameidata *ndp, vnode_t *new_dp, vfs_context_t ctx) struct componentname *cnp = &ndp->ni_cnd; vnode_t dp; char *tmppn; + u_int rsrclen = (cnp->cn_flags & CN_WANTSRSRCFORK) ? sizeof(_PATH_RSRCFORKSPEC) : 0; if (ndp->ni_loopcnt++ >= MAXSYMLINKS) { return ELOOP; @@ -1577,7 +1594,7 @@ lookup_handle_symlink(struct nameidata *ndp, vnode_t *new_dp, vfs_context_t ctx) * is only 1024. */ linklen = MAXPATHLEN - (u_int)uio_resid(auio); - if (linklen + ndp->ni_pathlen > MAXPATHLEN) { + if (linklen + ndp->ni_pathlen + rsrclen > MAXPATHLEN) { if (need_newpathbuf) { FREE_ZONE(cp, MAXPATHLEN, M_NAMEI); } @@ -1848,7 +1865,7 @@ kdebug_vfs_lookup(long *dbg_parms, int dbg_namelen, void *dp, uint32_t flags) void kdebug_lookup_gen_events(long *dbg_parms, int dbg_namelen, void *dp, - boolean_t lookup) + bool lookup) { kdebug_vfs_lookup(dbg_parms, dbg_namelen, dp, lookup ? KDBG_VFS_LOOKUP_FLAG_LOOKUP : 0); @@ -1972,7 +1989,24 @@ vfs_getrealpath(const char * path, char * realpath, size_t bufsize, vfs_context_ /* Get the target vnode. */ if (ino == 2) { - error = VFS_ROOT(mp, &vp, ctx); + struct vfs_attr vfsattr; + int use_vfs_root = TRUE; + + VFSATTR_INIT(&vfsattr); + VFSATTR_WANTED(&vfsattr, f_capabilities); + if (vfs_getattr(mp, &vfsattr, vfs_context_kernel()) == 0 && + VFSATTR_IS_SUPPORTED(&vfsattr, f_capabilities)) { + if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_VOL_GROUPS) && + (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_VOL_GROUPS)) { + use_vfs_root = FALSE; + } + } + + if (use_vfs_root) { + error = VFS_ROOT(mp, &vp, ctx); + } else { + error = VFS_VGET(mp, ino, &vp, ctx); + } } else { error = VFS_VGET(mp, ino, &vp, ctx); } diff --git a/bsd/vfs/vfs_subr.c b/bsd/vfs/vfs_subr.c index e32310c8e..8a3cdcc47 100644 --- a/bsd/vfs/vfs_subr.c +++ b/bsd/vfs/vfs_subr.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2018 Apple Inc. All rights reserved. + * Copyright (c) 2000-2019 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -254,9 +254,20 @@ int ragevnodes = 0; #define RAGE_LIMIT_MIN 100 #define RAGE_TIME_LIMIT 5 +/* + * ROSV definitions + * NOTE: These are shadowed from PlatformSupport definitions, but XNU + * builds standalone. + */ +#define PLATFORM_DATA_VOLUME_MOUNT_POINT "/System/Volumes/Data" +#define PLATFORM_VM_VOLUME_MOUNT_POINT "/private/var/vm" + + struct mntlist mountlist; /* mounted filesystem list */ static int nummounts = 0; +static int print_busy_vnodes = 0; /* print out busy vnodes */ + #if DIAGNOSTIC #define VLISTCHECK(fun, vp, list) \ if ((vp)->v_freelist.tqe_prev == (struct vnode **)0xdeadb) \ @@ -477,6 +488,7 @@ int vnode_umount_preflight(mount_t mp, vnode_t skipvp, int flags) { vnode_t vp; + int ret = 0; TAILQ_FOREACH(vp, &mp->mnt_vnodelist, v_mntvnodes) { if (vp->v_type == VDIR) { @@ -497,18 +509,28 @@ vnode_umount_preflight(mount_t mp, vnode_t skipvp, int flags) /* Look for busy vnode */ if ((vp->v_usecount != 0) && ((vp->v_usecount - vp->v_kusecount) != 0)) { - return 1; + ret = 1; + if (print_busy_vnodes && ((flags & FORCECLOSE) == 0)) { + vprint("vnode_umount_preflight - busy vnode", vp); + } else { + return ret; + } } else if (vp->v_iocount > 0) { /* Busy if iocount is > 0 for more than 3 seconds */ tsleep(&vp->v_iocount, PVFS, "vnode_drain_network", 3 * hz); if (vp->v_iocount > 0) { - return 1; + ret = 1; + if (print_busy_vnodes && ((flags & FORCECLOSE) == 0)) { + vprint("vnode_umount_preflight - busy vnode", vp); + } else { + return ret; + } } continue; } } - return 0; + return ret; } /* @@ -1259,6 +1281,98 @@ fail: return ENODEV; } +/* + * Mount the data volume of an ROSV volume group + */ +int +vfs_mount_rosv_data(void) +{ +#if CONFIG_ROSV_STARTUP + int error = 0; + int do_rosv_mounts = 0; + + error = vnode_get(rootvnode); + if (error) { + /* root must be mounted first */ + printf("vnode_get(rootvnode) failed with error %d\n", error); + return error; + } + + printf("NOTE: Attempting ROSV mount\n"); + struct vfs_attr vfsattr; + VFSATTR_INIT(&vfsattr); + VFSATTR_WANTED(&vfsattr, f_capabilities); + if (vfs_getattr(rootvnode->v_mount, &vfsattr, vfs_context_kernel()) == 0 && + VFSATTR_IS_SUPPORTED(&vfsattr, f_capabilities)) { + if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_VOL_GROUPS) && + (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_VOL_GROUPS)) { + printf("NOTE: DETECTED ROSV CONFIG\n"); + do_rosv_mounts = 1; + } + } + + if (!do_rosv_mounts) { + vnode_put(rootvnode); + //bail out if config not supported + return 0; + } + + char datapath[] = PLATFORM_DATA_VOLUME_MOUNT_POINT; /* !const because of internal casting */ + + /* Mount the data volume */ + printf("attempting kernel mount for data volume... \n"); + error = kernel_mount(rootvnode->v_mount->mnt_vfsstat.f_fstypename, NULLVP, NULLVP, + datapath, (rootvnode->v_mount), 0, 0, (KERNEL_MOUNT_DATAVOL), vfs_context_kernel()); + + if (error) { + printf("Failed to mount data volume (%d)\n", error); + } + + vnode_put(rootvnode); + + return error; + +#else + return 0; +#endif +} + +/* + * Mount the VM volume of a container + */ +int +vfs_mount_vm(void) +{ +#if CONFIG_MOUNT_VM + int error = 0; + + error = vnode_get(rootvnode); + if (error) { + /* root must be mounted first */ + printf("vnode_get(rootvnode) failed with error %d\n", error); + return error; + } + + char vmpath[] = PLATFORM_VM_VOLUME_MOUNT_POINT; /* !const because of internal casting */ + + /* Mount the VM volume */ + printf("attempting kernel mount for vm volume... \n"); + error = kernel_mount(rootvnode->v_mount->mnt_vfsstat.f_fstypename, NULLVP, NULLVP, + vmpath, (rootvnode->v_mount), 0, 0, (KERNEL_MOUNT_VMVOL), vfs_context_kernel()); + + if (error) { + printf("Failed to mount vm volume (%d)\n", error); + } else { + printf("mounted VM volume\n"); + } + + vnode_put(rootvnode); + return error; +#else + return 0; +#endif +} + /* * Lookup a mount point by filesystem identifier. */ @@ -2035,9 +2149,6 @@ done: * system error). If MNT_FORCE is specified, detach any active vnodes * that are found. */ -#if DIAGNOSTIC -int busyprt = 0; /* print out busy vnodes */ -#endif int vflush(struct mount *mp, struct vnode *skipvp, int flags) @@ -2047,6 +2158,7 @@ vflush(struct mount *mp, struct vnode *skipvp, int flags) int reclaimed = 0; int retval; unsigned int vid; + bool first_try = true; /* * See comments in vnode_iterate() for the rationale for this lock @@ -2191,11 +2303,12 @@ loop: mount_lock(mp); continue; } -#if DIAGNOSTIC - if (busyprt) { - vprint("vflush: busy vnode", vp); + + /* log vnodes blocking unforced unmounts */ + if (print_busy_vnodes && first_try && ((flags & FORCECLOSE) == 0)) { + vprint("vflush - busy vnode", vp); } -#endif + vnode_unlock(vp); mount_lock(mp); busy++; @@ -2206,6 +2319,7 @@ loop: busy = 0; reclaimed = 0; (void)vnode_iterate_reloadq(mp); + first_try = false; /* returned with mount lock held */ goto loop; } @@ -2213,6 +2327,7 @@ loop: /* if new vnodes were created in between retry the reclaim */ if (vnode_iterate_reloadq(mp) != 0) { if (!(busy && ((flags & FORCECLOSE) == 0))) { + first_try = false; goto loop; } } @@ -2367,7 +2482,7 @@ vclean(vnode_t vp, int flags) } // make sure the name & parent ptrs get cleaned out! - vnode_update_identity(vp, NULLVP, NULL, 0, 0, VNODE_UPDATE_PARENT | VNODE_UPDATE_NAME | VNODE_UPDATE_PURGE); + vnode_update_identity(vp, NULLVP, NULL, 0, 0, VNODE_UPDATE_PARENT | VNODE_UPDATE_NAME | VNODE_UPDATE_PURGE | VNODE_UPDATE_PURGEFIRMLINK); vnode_lock(vp); @@ -2697,8 +2812,9 @@ vprint(const char *label, struct vnode *vp) if (label != NULL) { printf("%s: ", label); } - printf("type %s, usecount %d, writecount %d", - typename[vp->v_type], vp->v_usecount, vp->v_writecount); + printf("name %s type %s, usecount %d, writecount %d\n", + vp->v_name, typename[vp->v_type], + vp->v_usecount, vp->v_writecount); sbuf[0] = '\0'; if (vp->v_flag & VROOT) { strlcat(sbuf, "|VROOT", sizeof(sbuf)); @@ -2719,7 +2835,7 @@ vprint(const char *label, struct vnode *vp) strlcat(sbuf, "|VALIASED", sizeof(sbuf)); } if (sbuf[0] != '\0') { - printf(" flags (%s)", &sbuf[1]); + printf("vnode flags (%s\n", &sbuf[1]); } } @@ -2772,6 +2888,29 @@ vn_getpath_fsenter_with_parent(struct vnode *dvp, struct vnode *vp, char *pathbu return build_path_with_parent(vp, dvp, pathbuf, *len, len, 0, vfs_context_current()); } +int +vn_getpath_ext(struct vnode *vp, struct vnode *dvp, char *pathbuf, int *len, int flags) +{ + int bpflags = (flags & VN_GETPATH_FSENTER) ? 0 : BUILDPATH_NO_FS_ENTER; + + if (flags && (flags != VN_GETPATH_FSENTER)) { + if (flags & VN_GETPATH_NO_FIRMLINK) { + bpflags |= BUILDPATH_NO_FIRMLINK;; + } + if (flags & VN_GETPATH_VOLUME_RELATIVE) { + bpflags |= (BUILDPATH_VOLUME_RELATIVE | BUILDPATH_NO_FIRMLINK); + } + } + + return build_path_with_parent(vp, dvp, pathbuf, *len, len, bpflags, vfs_context_current()); +} + +int +vn_getpath_no_firmlink(struct vnode *vp, char *pathbuf, int *len) +{ + return vn_getpath_ext(vp, NULLVP, pathbuf, len, VN_GETPATH_NO_FIRMLINK); +} + int vn_getcdhash(struct vnode *vp, off_t offset, unsigned char *cdhash) { @@ -3260,6 +3399,7 @@ vfs_init_io_attributes(vnode_t devvp, mount_t mp) u_int32_t blksize; u_int64_t temp; u_int32_t features; + u_int64_t location = 0; vfs_context_t ctx = vfs_context_current(); dk_corestorage_info_t cs_info; boolean_t cs_present = FALSE;; @@ -3497,6 +3637,16 @@ vfs_init_io_attributes(vnode_t devvp, mount_t mp) } } + if (VNOP_IOCTL(devvp, DKIOCGETLOCATION, (caddr_t)&location, 0, ctx) == 0) { + if (location & DK_LOCATION_EXTERNAL) { + mp->mnt_ioflags |= MNT_IOFLAGS_PERIPHERAL_DRIVE; + /* This must be called after MNTK_VIRTUALDEV has been determined via DKIOCISVIRTUAL */ + if ((MNTK_VIRTUALDEV & mp->mnt_kern_flag)) { + mp->mnt_flag |= MNT_REMOVABLE; + } + } + } + #if CONFIG_IOSCHED if (iosched_enabled && (features & DK_FEATURE_PRIORITY)) { mp->mnt_ioflags |= MNT_IOFLAGS_IOSCHED_SUPPORTED; @@ -3859,11 +4009,11 @@ out: return error; } -static int filt_fsattach(struct knote *kn, struct kevent_internal_s *kev); +static int filt_fsattach(struct knote *kn, struct kevent_qos_s *kev); static void filt_fsdetach(struct knote *kn); static int filt_fsevent(struct knote *kn, long hint); -static int filt_fstouch(struct knote *kn, struct kevent_internal_s *kev); -static int filt_fsprocess(struct knote *kn, struct filt_process_s *data, struct kevent_internal_s *kev); +static int filt_fstouch(struct knote *kn, struct kevent_qos_s *kev); +static int filt_fsprocess(struct knote *kn, struct kevent_qos_s *kev); SECURITY_READ_ONLY_EARLY(struct filterops) fs_filtops = { .f_attach = filt_fsattach, .f_detach = filt_fsdetach, @@ -3873,8 +4023,11 @@ SECURITY_READ_ONLY_EARLY(struct filterops) fs_filtops = { }; static int -filt_fsattach(struct knote *kn, __unused struct kevent_internal_s *kev) +filt_fsattach(struct knote *kn, __unused struct kevent_qos_s *kev) { + kn->kn_flags |= EV_CLEAR; /* automatic */ + kn->kn_sdata = 0; /* incoming data is ignored */ + lck_mtx_lock(fs_klist_lock); KNOTE_ATTACH(&fs_klist, kn); lck_mtx_unlock(fs_klist_lock); @@ -3910,7 +4063,7 @@ filt_fsevent(struct knote *kn, long hint) } static int -filt_fstouch(struct knote *kn, struct kevent_internal_s *kev) +filt_fstouch(struct knote *kn, struct kevent_qos_s *kev) { int res; @@ -3936,18 +4089,14 @@ filt_fstouch(struct knote *kn, struct kevent_internal_s *kev) } static int -filt_fsprocess(struct knote *kn, struct filt_process_s *data, struct kevent_internal_s *kev) +filt_fsprocess(struct knote *kn, struct kevent_qos_s *kev) { -#pragma unused(data) - int res; + int res = 0; lck_mtx_lock(fs_klist_lock); - res = (kn->kn_fflags != 0); - if (res) { - *kev = kn->kn_kevent; - kn->kn_flags |= EV_CLEAR; /* automatic */ - kn->kn_fflags = 0; - kn->kn_data = 0; + if (kn->kn_fflags) { + knote_fill_kevent(kn, kev, 0); + res = 1; } lck_mtx_unlock(fs_klist_lock); return res; @@ -4062,6 +4211,12 @@ SYSCTL_INT(_vfs_generic, OID_AUTO, sync_timeout, CTLFLAG_RW | CTLFLAG_LOCKED, &s SYSCTL_NODE(_vfs_generic, VFS_CONF, conf, CTLFLAG_RD | CTLFLAG_LOCKED, sysctl_vfs_generic_conf, ""); +#if DEVELOPMENT || DEBUG +SYSCTL_INT(_vfs_generic, OID_AUTO, print_busy_vnodes, + CTLTYPE_INT | CTLFLAG_RW, + &print_busy_vnodes, 0, + "VFS log busy vnodes blocking unmount"); +#endif /* Indicate that the root file system unmounted cleanly */ static int vfs_root_unmounted_cleanly = 0; @@ -4518,7 +4673,7 @@ steal_this_vp: */ assert((vp->v_lflag & VL_LABELWAIT) != VL_LABELWAIT); assert((vp->v_lflag & VL_LABEL) != VL_LABEL); - if (vp->v_lflag & VL_LABELED) { + if (vp->v_lflag & VL_LABELED || vp->v_label != NULL) { vnode_lock_convert(vp); mac_vnode_label_recycle(vp); } else if (mac_vnode_label_init_needed(vp)) { @@ -4987,6 +5142,13 @@ vnode_reclaim_internal(struct vnode * vp, int locked, int reuse, int flags) vn_clearunionwait(vp, 1); + if (vnode_istty(vp) && (flags & REVOKEALL) && vp->v_usecount && + (vp->v_iocount > 1)) { + vnode_unlock(vp); + VNOP_IOCTL(vp, TIOCREVOKE, (caddr_t)NULL, 0, vfs_context_kernel()); + vnode_lock(vp); + } + vnode_drain(vp); isfifo = (vp->v_type == VFIFO); @@ -5179,6 +5341,11 @@ vnode_create_internal(uint32_t flavor, uint32_t size, void *data, vnode_t *vpp, record_vp(vp, 1); #endif +#if CONFIG_FIRMLINKS + vp->v_fmlink = NULLVP; +#endif + vp->v_flag &= ~VFMLINKTARGET; + #if CONFIG_TRIGGERS /* * For trigger vnodes, attach trigger info to vnode @@ -5462,6 +5629,7 @@ vfs_iterate(int flags, int (*callout)(mount_t, void *), void *arg) void * allocmem; int indx_start, indx_stop, indx_incr; int cb_dropref = (flags & VFS_ITERATE_CB_DROPREF); + int noskip_unmount = (flags & VFS_ITERATE_NOSKIP_UNMOUNT); count = mount_getvfscnt(); count += 10; @@ -5493,7 +5661,8 @@ vfs_iterate(int flags, int (*callout)(mount_t, void *), void *arg) continue; } mount_lock(mp); - if (mp->mnt_lflag & (MNT_LDEAD | MNT_LUNMOUNT)) { + if ((mp->mnt_lflag & MNT_LDEAD) || + (!noskip_unmount && (mp->mnt_lflag & MNT_LUNMOUNT))) { mount_unlock(mp); mount_iterdrop(mp); continue; @@ -5721,7 +5890,8 @@ out: } errno_t -vnode_lookup(const char *path, int flags, vnode_t *vpp, vfs_context_t ctx) +vnode_lookupat(const char *path, int flags, vnode_t *vpp, vfs_context_t ctx, + vnode_t start_dvp) { struct nameidata nd; int error; @@ -5749,15 +5919,29 @@ vnode_lookup(const char *path, int flags, vnode_t *vpp, vfs_context_t ctx) NDINIT(&nd, LOOKUP, OP_LOOKUP, ndflags, UIO_SYSSPACE, CAST_USER_ADDR_T(path), ctx); + if (start_dvp && (path[0] != '/')) { + nd.ni_dvp = start_dvp; + nd.ni_cnd.cn_flags |= USEDVP; + } + if ((error = namei(&nd))) { return error; } + + nd.ni_cnd.cn_flags &= ~USEDVP; + *vpp = nd.ni_vp; nameidone(&nd); return 0; } +errno_t +vnode_lookup(const char *path, int flags, vnode_t *vpp, vfs_context_t ctx) +{ + return vnode_lookupat(path, flags, vpp, ctx, NULLVP); +} + errno_t vnode_open(const char *path, int fmode, int cmode, int flags, vnode_t *vpp, vfs_context_t ctx) { @@ -7673,7 +7857,7 @@ vnode_authorize_checkimmutable(mount_t mp, struct vnode_attr *vap, int rights, i /* check for no-EA filesystems */ if ((rights & KAUTH_VNODE_WRITE_EXTATTRIBUTES) && (vfs_flags(mp) & MNT_NOUSERXATTR)) { - KAUTH_DEBUG("%p DENIED - filesystem disallowed extended attributes", vp); + KAUTH_DEBUG("%p DENIED - filesystem disallowed extended attributes", vap); error = EACCES; /* User attributes disabled */ goto out; } @@ -7694,7 +7878,7 @@ vnode_authorize_checkimmutable(mount_t mp, struct vnode_attr *vap, int rights, i } } if ((error = vnode_immutable(vap, append, ignore)) != 0) { - KAUTH_DEBUG("%p DENIED - file is immutable", vp); + KAUTH_DEBUG("%p DENIED - file is immutable", vap); goto out; } } @@ -7954,14 +8138,14 @@ vnode_attr_authorize_internal(vauth_ctx vcp, mount_t mp, VATTR_IS_SUPPORTED(vcp->vap, va_mode) && !(vcp->vap->va_mode & (S_IXUSR | S_IXGRP | S_IXOTH))) { result = EPERM; - KAUTH_DEBUG("%p DENIED - root execute requires at least one x bit in 0x%x", vp, va.va_mode); + KAUTH_DEBUG("%p DENIED - root execute requires at least one x bit in 0x%x", vcp, vcp->vap->va_mode); goto out; } /* Assume that there were DENYs so we don't wrongly cache KAUTH_VNODE_SEARCHBYANYONE */ *found_deny = TRUE; - KAUTH_DEBUG("%p ALLOWED - caller is superuser", vp); + KAUTH_DEBUG("%p ALLOWED - caller is superuser", vcp); } out: return result; @@ -8454,6 +8638,7 @@ vnode_authattr_new_internal(vnode_t dvp, struct vnode_attr *vap, int noauth, uin if (VATTR_IS_ACTIVE(vap, va_flags)) { + vap->va_flags &= ~SF_SYNTHETIC; if (has_priv_suser) { if ((vap->va_flags & (UF_SETTABLE | SF_SETTABLE)) != vap->va_flags) { error = EPERM; @@ -8814,6 +8999,8 @@ vnode_authattr(vnode_t vp, struct vnode_attr *vap, kauth_action_t *actionp, vfs_ */ if (VATTR_IS_ACTIVE(vap, va_flags)) { /* compute changing flags bits */ + vap->va_flags &= ~SF_SYNTHETIC; + ova.va_flags &= ~SF_SYNTHETIC; if (VATTR_IS_SUPPORTED(&ova, va_flags)) { fdelta = vap->va_flags ^ ova.va_flags; } else { @@ -9040,12 +9227,12 @@ no_guuid_change: } /* chown always clears setuid/gid bits. An exception is made for - * setattrlist executed by a root process to set on a file: + * setattrlist which can set both at the same time: on a file: * setattrlist is allowed to set the new mode on the file and change (chown) * uid/gid. */ if (newmode & (S_ISUID | S_ISGID)) { - if (!VATTR_IS_ACTIVE(vap, va_mode) || !has_priv_suser) { + if (!VATTR_IS_ACTIVE(vap, va_mode)) { KAUTH_DEBUG("CHOWN - masking setugid bits from mode %o to %o", newmode, newmode & ~(S_ISUID | S_ISGID)); newmode &= ~(S_ISUID | S_ISGID); @@ -9195,6 +9382,59 @@ vn_clearunionwait(vnode_t vp, int locked) } } +int +vnode_materialize_dataless_file(vnode_t vp, uint64_t op_type) +{ + int error; + + /* Swap files are special; ignore them */ + if (vnode_isswap(vp)) { + return 0; + } + + error = resolve_nspace_item(vp, + op_type | NAMESPACE_HANDLER_NSPACE_EVENT); + + /* + * The file resolver owns the logic about what error to return + * to the caller. We only need to handle a couple of special + * cases here: + */ + if (error == EJUSTRETURN) { + /* + * The requesting process is allowed to interact with + * dataless objects. Make a couple of sanity-checks + * here to ensure the action makes sense. + */ + switch (op_type) { + case NAMESPACE_HANDLER_WRITE_OP: + case NAMESPACE_HANDLER_TRUNCATE_OP: + case NAMESPACE_HANDLER_RENAME_OP: + /* + * This handles the case of the resolver itself + * writing data to the file (or throwing it + * away). + */ + error = 0; + break; + case NAMESPACE_HANDLER_READ_OP: + /* + * This handles the case of the resolver needing + * to look up inside of a dataless directory while + * it's in the process of materializing it (for + * example, creating files or directories). + */ + error = (vnode_vtype(vp) == VDIR) ? 0 : EBADF; + break; + default: + error = EBADF; + break; + } + } + + return error; +} + /* * Removes orphaned apple double files during a rmdir * Works by: @@ -9233,6 +9473,15 @@ rmdir_remove_orphaned_appleDouble(vnode_t vp, vfs_context_t ctx, int * restart_f return error; } + /* + * Prevent dataless fault materialization while we have + * a suspended vnode. + */ + uthread_t ut = get_bsdthread_info(current_thread()); + bool saved_nodatalessfaults = + (ut->uu_flag & UT_NSPACE_NODATALESSFAULTS) ? true : false; + ut->uu_flag |= UT_NSPACE_NODATALESSFAULTS; + /* * set up UIO */ @@ -9411,8 +9660,11 @@ outsc: } FREE(rbuf, M_TEMP); - vnode_resume(vp); + if (saved_nodatalessfaults == false) { + ut->uu_flag &= ~UT_NSPACE_NODATALESSFAULTS; + } + vnode_resume(vp); return error; } @@ -9883,9 +10135,16 @@ vnode_trigger_resolve(vnode_t vp, struct nameidata *ndp, vfs_context_t ctx) lck_mtx_unlock(&rp->vr_lock); #if CONFIG_MACF - int rv = mac_vnode_check_trigger_resolve(ctx, vp, &ndp->ni_cnd); - if (rv != 0) { - return rv; + if ((rp->vr_flags & VNT_KERN_RESOLVE) == 0) { + /* + * VNT_KERN_RESOLVE indicates this trigger has no parameters + * at the discression of the accessing process other than + * the act of access. All other triggers must be checked + */ + int rv = mac_vnode_check_trigger_resolve(ctx, vp, &ndp->ni_cnd); + if (rv != 0) { + return rv; + } } #endif diff --git a/bsd/vfs/vfs_syscalls.c b/bsd/vfs/vfs_syscalls.c index c9dc444b9..838ad8c12 100644 --- a/bsd/vfs/vfs_syscalls.c +++ b/bsd/vfs/vfs_syscalls.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 1995-2017 Apple Inc. All rights reserved. + * Copyright (c) 1995-2019 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -94,6 +94,7 @@ #include #include #include +#include #include #include #include @@ -103,6 +104,7 @@ #include #include #include +#include #include #include #include @@ -124,6 +126,13 @@ #include #include +// deps for MIG call +#include +#include +#include +#include +#include + #if ROUTEFS #include #endif /* ROUTEFS */ @@ -177,8 +186,6 @@ static int sync_callback(mount_t, void *); static int munge_statfs(struct mount *mp, struct vfsstatfs *sfsp, user_addr_t bufp, int *sizep, boolean_t is_64_bit, boolean_t partial_copy); -static int statfs64_common(struct mount *mp, struct vfsstatfs *sfsp, - user_addr_t bufp); static int fsync_common(proc_t p, struct fsync_args *uap, int flags); static int mount_common(char *fstypename, vnode_t pvp, vnode_t vp, struct componentname *cnp, user_addr_t fsmountargs, @@ -202,9 +209,10 @@ struct fd_vn_data * fg_vn_data_alloc(void); */ #define MAX_AUTHORIZE_ENOENT_RETRIES 1024 -static int rmdirat_internal(vfs_context_t, int, user_addr_t, enum uio_seg); +static int rmdirat_internal(vfs_context_t, int, user_addr_t, enum uio_seg, + int unlink_flags); -static int fsgetpath_internal(vfs_context_t, int, uint64_t, vm_size_t, caddr_t, int *); +static int fsgetpath_internal(vfs_context_t, int, uint64_t, vm_size_t, caddr_t, uint32_t options, int *); #ifdef CONFIG_IMGSRC_ACCESS static int authorize_devpath_and_update_mntfromname(mount_t mp, user_addr_t devpath, vnode_t *devvpp, vfs_context_t ctx); @@ -215,6 +223,11 @@ static void mount_end_update(mount_t mp); static int relocate_imageboot_source(vnode_t pvp, vnode_t vp, struct componentname *cnp, const char *fsname, vfs_context_t ctx, boolean_t is64bit, user_addr_t fsmountargs, boolean_t by_index); #endif /* CONFIG_IMGSRC_ACCESS */ +#if CONFIG_LOCKERBOOT +int mount_locker_protoboot(const char *fsname, const char *mntpoint, + const char *pbdevpath); +#endif + //snapshot functions #if CONFIG_MNT_ROOTSNAP static int snapshot_root(int dirfd, user_addr_t name, uint32_t flags, vfs_context_t ctx); @@ -267,7 +280,7 @@ vfs_iskernelmount(mount_t mp) __private_extern__ int kernel_mount(char *fstype, vnode_t pvp, vnode_t vp, const char *path, - void *data, __unused size_t datalen, int syscall_flags, __unused uint32_t kern_flags, vfs_context_t ctx) + void *data, __unused size_t datalen, int syscall_flags, uint32_t kern_flags, vfs_context_t ctx) { struct nameidata nd; boolean_t did_namei; @@ -282,6 +295,9 @@ kernel_mount(char *fstype, vnode_t pvp, vnode_t vp, const char *path, if (vp == NULLVP) { error = namei(&nd); if (error) { + if (kern_flags & (KERNEL_MOUNT_SNAPSHOT | KERNEL_MOUNT_VMVOL | KERNEL_MOUNT_DATAVOL)) { + printf("failed to locate mount-on path: %s ", path); + } return error; } vp = nd.ni_vp; @@ -615,6 +631,22 @@ mount_common(char *fstypename, vnode_t pvp, vnode_t vp, boolean_t did_rele = FALSE; boolean_t have_usecount = FALSE; +#if CONFIG_ROSV_STARTUP || CONFIG_MOUNT_VM + /* Check for mutually-exclusive flag bits */ + uint32_t checkflags = (internal_flags & (KERNEL_MOUNT_DATAVOL | KERNEL_MOUNT_VMVOL)); + int bitcount = 0; + while (checkflags != 0) { + checkflags &= (checkflags - 1); + bitcount++; + } + + if (bitcount > 1) { + //not allowed to request multiple mount-by-role flags + error = EINVAL; + goto out1; + } +#endif + /* * Process an update for an existing mount */ @@ -655,6 +687,16 @@ mount_common(char *fstypename, vnode_t pvp, vnode_t vp, goto out1; } + /* + * can't turn off MNT_REMOVABLE either but it may be an unexpected + * failure to return an error for this so we'll just silently + * add it if it is not passed in. + */ + if ((mp->mnt_flag & MNT_REMOVABLE) && + ((flags & MNT_REMOVABLE) == 0)) { + flags |= MNT_REMOVABLE; + } + #ifdef CONFIG_IMGSRC_ACCESS /* Can't downgrade the backer of the root FS */ if ((mp->mnt_kern_flag & MNTK_BACKS_ROOT) && @@ -696,7 +738,7 @@ mount_common(char *fstypename, vnode_t pvp, vnode_t vp, vfsp = mp->mnt_vtable; goto update; - } + } // MNT_UPDATE /* * For non-root users, silently enforce MNT_NOSUID and MNT_NODEV, and @@ -726,9 +768,11 @@ mount_common(char *fstypename, vnode_t pvp, vnode_t vp, } /* - * VFC_VFSLOCALARGS is not currently supported for kernel mounts + * VFC_VFSLOCALARGS is not currently supported for kernel mounts, + * except in ROSV configs. */ - if (kernelmount && (vfsp->vfc_vfsflags & VFC_VFSLOCALARGS)) { + if (kernelmount && (vfsp->vfc_vfsflags & VFC_VFSLOCALARGS) && + ((internal_flags & (KERNEL_MOUNT_DATAVOL | KERNEL_MOUNT_VMVOL)) == 0)) { error = EINVAL; /* unsupported request */ goto out1; } @@ -770,7 +814,13 @@ mount_common(char *fstypename, vnode_t pvp, vnode_t vp, //mp->mnt_stat.f_type = vfsp->vfc_typenum; mp->mnt_flag |= vfsp->vfc_flags & MNT_VISFLAGMASK; strlcpy(mp->mnt_vfsstat.f_fstypename, vfsp->vfc_name, MFSTYPENAMELEN); - strlcpy(mp->mnt_vfsstat.f_mntonname, cnp->cn_pnbuf, MAXPATHLEN); + do { + int pathlen = MAXPATHLEN; + + if (vn_getpath_ext(vp, pvp, mp->mnt_vfsstat.f_mntonname, &pathlen, VN_GETPATH_FSENTER)) { + strlcpy(mp->mnt_vfsstat.f_mntonname, cnp->cn_pnbuf, MAXPATHLEN); + } + } while (0); mp->mnt_vnodecovered = vp; mp->mnt_vfsstat.f_owner = kauth_cred_getuid(vfs_context_ucred(ctx)); mp->mnt_throttle_mask = LOWPRI_MAX_NUM_DEV - 1; @@ -807,7 +857,7 @@ update: mp->mnt_flag &= ~(MNT_NOSUID | MNT_NOEXEC | MNT_NODEV | MNT_SYNCHRONOUS | MNT_UNION | MNT_ASYNC | MNT_UNKNOWNPERMISSIONS | MNT_DONTBROWSE | - MNT_AUTOMOUNTED | MNT_DEFWRITE | MNT_NOATIME | + MNT_AUTOMOUNTED | MNT_DEFWRITE | MNT_NOATIME | MNT_STRICTATIME | MNT_QUARANTINE | MNT_CPROTECT); #if SECURE_KERNEL @@ -824,7 +874,7 @@ update: mp->mnt_flag |= flags & (MNT_NOSUID | MNT_NOEXEC | MNT_NODEV | MNT_SYNCHRONOUS | MNT_UNION | MNT_ASYNC | MNT_UNKNOWNPERMISSIONS | MNT_DONTBROWSE | - MNT_AUTOMOUNTED | MNT_DEFWRITE | MNT_NOATIME | + MNT_AUTOMOUNTED | MNT_DEFWRITE | MNT_NOATIME | MNT_STRICTATIME | MNT_QUARANTINE | MNT_CPROTECT); #if CONFIG_MACF @@ -840,7 +890,8 @@ update: * Process device path for local file systems if requested */ if (vfsp->vfc_vfsflags & VFC_VFSLOCALARGS && - !(internal_flags & KERNEL_MOUNT_SNAPSHOT)) { + !(internal_flags & (KERNEL_MOUNT_SNAPSHOT | KERNEL_MOUNT_DATAVOL | KERNEL_MOUNT_VMVOL))) { + //snapshot, vm, datavolume mounts are special if (vfs_context_is64bit(ctx)) { if ((error = copyin(fsmountargs, (caddr_t)&devpath, sizeof(devpath)))) { goto out1; @@ -969,7 +1020,8 @@ update: goto out2; } } - } + } // localargs && !(snapshot | data | vm) + #if CONFIG_MACF if ((flags & MNT_UPDATE) == 0) { mac_mount_label_init(mp); @@ -985,11 +1037,73 @@ update: } #endif /* - * Mount the filesystem. + * Mount the filesystem. We already asserted that internal_flags + * cannot have more than one mount-by-role bit set. */ if (internal_flags & KERNEL_MOUNT_SNAPSHOT) { error = VFS_IOCTL(mp, VFSIOC_MOUNT_SNAPSHOT, (caddr_t)fsmountargs, 0, ctx); + } else if (internal_flags & KERNEL_MOUNT_DATAVOL) { +#if CONFIG_ROSV_STARTUP + struct mount *origin_mp = (struct mount*)fsmountargs; + fs_role_mount_args_t frma = {origin_mp, VFS_DATA_ROLE}; + error = VFS_IOCTL(mp, VFSIOC_MOUNT_BYROLE, (caddr_t)&frma, 0, ctx); + if (error) { + printf("MOUNT-BY-ROLE (%d) failed! (%d)", VFS_DATA_ROLE, error); + } else { + /* Mark volume associated with system volume */ + mp->mnt_kern_flag |= MNTK_SYSTEM; + + /* Attempt to acquire the mnt_devvp and set it up */ + struct vnode *mp_devvp = NULL; + if (mp->mnt_vfsstat.f_mntfromname[0] != 0) { + errno_t lerr = vnode_lookup(mp->mnt_vfsstat.f_mntfromname, + 0, &mp_devvp, vfs_context_kernel()); + if (!lerr) { + mp->mnt_devvp = mp_devvp; + //vnode_lookup took an iocount, need to drop it. + vnode_put(mp_devvp); + // now set `device_vnode` to the devvp that was acquired. + // this is needed in order to ensure vfs_init_io_attributes is invoked. + // note that though the iocount above was dropped, the mount acquires + // an implicit reference against the device. + device_vnode = mp_devvp; + } + } + } +#else + error = EINVAL; +#endif + } else if (internal_flags & KERNEL_MOUNT_VMVOL) { +#if CONFIG_MOUNT_VM + struct mount *origin_mp = (struct mount*)fsmountargs; + fs_role_mount_args_t frma = {origin_mp, VFS_VM_ROLE}; + error = VFS_IOCTL(mp, VFSIOC_MOUNT_BYROLE, (caddr_t)&frma, 0, ctx); + if (error) { + printf("MOUNT-BY-ROLE (%d) failed! (%d)", VFS_VM_ROLE, error); + } else { + /* Mark volume associated with system volume and a swap mount */ + mp->mnt_kern_flag |= (MNTK_SYSTEM | MNTK_SWAP_MOUNT); + /* Attempt to acquire the mnt_devvp and set it up */ + struct vnode *mp_devvp = NULL; + if (mp->mnt_vfsstat.f_mntfromname[0] != 0) { + errno_t lerr = vnode_lookup(mp->mnt_vfsstat.f_mntfromname, + 0, &mp_devvp, vfs_context_kernel()); + if (!lerr) { + mp->mnt_devvp = mp_devvp; + //vnode_lookup took an iocount, need to drop it. + vnode_put(mp_devvp); + + // now set `device_vnode` to the devvp that was acquired. + // note that though the iocount above was dropped, the mount acquires + // an implicit reference against the device. + device_vnode = mp_devvp; + } + } + } +#else + error = EINVAL; +#endif } else { error = VFS_MOUNT(mp, device_vnode, fsmountargs, ctx); } @@ -1019,6 +1133,11 @@ update: if (error == 0) { struct vfs_attr vfsattr; #if CONFIG_MACF + error = mac_mount_check_mount_late(ctx, mp); + if (error != 0) { + goto out3; + } + if (vfs_flags(mp) & MNT_MULTILABEL) { error = VFS_ROOT(mp, &rvp, ctx); if (error) { @@ -1310,8 +1429,10 @@ out: #if CONFIG_IMGSRC_ACCESS -#if DEBUG -#define IMGSRC_DEBUG(args...) printf(args) +#define DEBUG_IMGSRC 0 + +#if DEBUG_IMGSRC +#define IMGSRC_DEBUG(args...) printf("imgsrc: " args) #else #define IMGSRC_DEBUG(args...) do { } while(0) #endif @@ -1323,8 +1444,13 @@ authorize_devpath_and_update_mntfromname(mount_t mp, user_addr_t devpath, vnode_ vnode_t vp, realdevvp; mode_t accessmode; int error; + enum uio_seg uio = UIO_USERSPACE; + + if (ctx == vfs_context_kernel()) { + uio = UIO_SYSSPACE; + } - NDINIT(&nd, LOOKUP, OP_LOOKUP, FOLLOW, UIO_USERSPACE, devpath, ctx); + NDINIT(&nd, LOOKUP, OP_LOOKUP, FOLLOW, uio, devpath, ctx); if ((error = namei(&nd))) { IMGSRC_DEBUG("namei() failed with %d\n", error); return error; @@ -1378,8 +1504,10 @@ authorize_devpath_and_update_mntfromname(mount_t mp, user_addr_t devpath, vnode_ out1: vnode_put(realdevvp); + out: nameidone(&nd); + if (error) { vnode_put(vp); } @@ -1398,6 +1526,9 @@ place_mount_and_checkdirs(mount_t mp, vnode_t vp, vfs_context_t ctx) mp->mnt_vnodecovered = vp; /* XXX This is normally only set at init-time ... */ + IMGSRC_DEBUG("placing: fsname = %s, vp = %s\n", + mp->mnt_vtable->vfc_name, vnode_getname(vp)); + vnode_lock_spin(vp); CLR(vp->v_flag, VMOUNT); vp->v_mountedhere = mp; @@ -1518,18 +1649,18 @@ get_imgsrc_rootvnode(uint32_t height, vnode_t *rvpp) } static int -relocate_imageboot_source(vnode_t pvp, vnode_t vp, struct componentname *cnp, - const char *fsname, vfs_context_t ctx, +relocate_imageboot_source(vnode_t pvp, vnode_t vp, + struct componentname *cnp, const char *fsname, vfs_context_t ctx, boolean_t is64bit, user_addr_t fsmountargs, boolean_t by_index) { int error; mount_t mp; boolean_t placed = FALSE; - vnode_t devvp = NULLVP; struct vfstable *vfsp; user_addr_t devpath; char *old_mntonname; vnode_t rvp; + vnode_t devvp; uint32_t height; uint32_t flags; @@ -1601,11 +1732,11 @@ relocate_imageboot_source(vnode_t pvp, vnode_t vp, struct componentname *cnp, error = get_imgsrc_rootvnode(height, &rvp); if (error != 0) { - IMGSRC_DEBUG("getting root vnode failed with %d\n", error); + IMGSRC_DEBUG("getting old root vnode failed with %d\n", error); return error; } - IMGSRC_DEBUG("got root vnode.\n"); + IMGSRC_DEBUG("got old root vnode\n"); MALLOC(old_mntonname, char*, MAXPATHLEN, M_TEMP, M_WAITOK); @@ -1617,6 +1748,7 @@ relocate_imageboot_source(vnode_t pvp, vnode_t vp, struct componentname *cnp, goto out0; } + IMGSRC_DEBUG("moving rvp: fsname = %s\n", mp->mnt_vtable->vfc_name); IMGSRC_DEBUG("Starting updated.\n"); /* Get exclusive rwlock on mount, authorize update on mp */ @@ -1635,7 +1767,6 @@ relocate_imageboot_source(vnode_t pvp, vnode_t vp, struct componentname *cnp, goto out1; } - IMGSRC_DEBUG("Preparing coveredvp.\n"); /* Mark covered vnode as mount in progress, authorize placing mount on top */ @@ -1650,7 +1781,8 @@ relocate_imageboot_source(vnode_t pvp, vnode_t vp, struct componentname *cnp, /* Sanity check the name caller has provided */ vfsp = mp->mnt_vtable; if (strncmp(vfsp->vfc_name, fsname, MFSNAMELEN) != 0) { - IMGSRC_DEBUG("Wrong fs name.\n"); + IMGSRC_DEBUG("Wrong fs name: actual = %s, expected = %s\n", + vfsp->vfc_name, fsname); error = EINVAL; goto out2; } @@ -1737,6 +1869,59 @@ out0: return error; } +#if CONFIG_LOCKERBOOT +__private_extern__ +int +mount_locker_protoboot(const char *fsname, const char *mntpoint, + const char *pbdevpath) +{ + int error = -1; + struct nameidata nd; + boolean_t cleanup_nd = FALSE; + vfs_context_t ctx = vfs_context_kernel(); + boolean_t is64 = TRUE; + boolean_t by_index = TRUE; + struct user64_mnt_imgsrc_args mia64 = { + .mi_height = 0, + .mi_flags = 0, + .mi_devpath = CAST_USER_ADDR_T(pbdevpath), + }; + user_addr_t mia64addr = CAST_USER_ADDR_T(&mia64); + + NDINIT(&nd, LOOKUP, OP_MOUNT, FOLLOW | AUDITVNPATH1 | WANTPARENT, + UIO_SYSSPACE, CAST_USER_ADDR_T(mntpoint), ctx); + error = namei(&nd); + if (error) { + IMGSRC_DEBUG("namei: %d\n", error); + goto out; + } + + cleanup_nd = TRUE; + error = relocate_imageboot_source(nd.ni_dvp, nd.ni_vp, + &nd.ni_cnd, fsname, ctx, is64, mia64addr, by_index); + +out: + if (cleanup_nd) { + int stashed = error; + + error = vnode_put(nd.ni_vp); + if (error) { + panic("vnode_put() returned non-zero: %d", error); + } + + if (nd.ni_dvp) { + error = vnode_put(nd.ni_dvp); + if (error) { + panic("vnode_put() returned non-zero: %d", error); + } + } + nameidone(&nd); + + error = stashed; + } + return error; +} +#endif /* CONFIG_LOCKERBOOT */ #endif /* CONFIG_IMGSRC_ACCESS */ void @@ -1966,10 +2151,10 @@ safedounmount(struct mount *mp, int flags, vfs_context_t ctx) } } /* - * Don't allow unmounting the root file system. + * Don't allow unmounting the root file system (or the associated VM or DATA mounts) . */ - if (mp->mnt_flag & MNT_ROOTFS) { - error = EBUSY; /* the root is always busy */ + if ((mp->mnt_flag & MNT_ROOTFS) || (mp->mnt_kern_flag & MNTK_SYSTEM)) { + error = EBUSY; /* the root (or associated volumes) is always busy */ goto out; } @@ -2089,9 +2274,6 @@ dounmount(struct mount *mp, int flags, int withref, vfs_context_t ctx) } } - /* free disk_conditioner_info structure for this mount */ - disk_conditioner_unmount(mp); - IOBSDMountChange(mp, kIOMountChangeUnmount); #if CONFIG_TRIGGERS @@ -2183,6 +2365,10 @@ dounmount(struct mount *mp, int flags, int withref, vfs_context_t ctx) wakeup((caddr_t)mp); } mount_refdrain(mp); + + /* free disk_conditioner_info structure for this mount */ + disk_conditioner_unmount(mp); + out: if (mp->mnt_lflag & MNT_LWAIT) { mp->mnt_lflag &= ~MNT_LWAIT; @@ -2376,14 +2562,44 @@ int syncprt = 0; int print_vmpage_stat = 0; +/* + * sync_callback: simple wrapper that calls VFS_SYNC() on volumes + * mounted read-write with the passed waitfor value. + * + * Parameters: mp mount-point descriptor per mounted file-system instance. + * arg user argument (please see below) + * + * User argument is a pointer to 32 bit unsigned integer which describes the + * type of waitfor value to set for calling VFS_SYNC(). If user argument is + * passed as NULL, VFS_SYNC() is called with MNT_NOWAIT set as the default + * waitfor value. + * + * Returns: VFS_RETURNED + */ static int -sync_callback(mount_t mp, __unused void *arg) +sync_callback(mount_t mp, void *arg) { if ((mp->mnt_flag & MNT_RDONLY) == 0) { int asyncflag = mp->mnt_flag & MNT_ASYNC; + unsigned waitfor = MNT_NOWAIT; + + if (arg) { + waitfor = *(uint32_t*)arg; + } + + /* Sanity check for flags - these are the only valid combinations for the flag bits*/ + if (waitfor != MNT_WAIT && + waitfor != (MNT_WAIT | MNT_VOLUME) && + waitfor != MNT_NOWAIT && + waitfor != (MNT_NOWAIT | MNT_VOLUME) && + waitfor != MNT_DWAIT && + waitfor != (MNT_DWAIT | MNT_VOLUME)) { + panic("Passed inappropriate waitfor %u to " + "sync_callback()", waitfor); + } mp->mnt_flag &= ~MNT_ASYNC; - VFS_SYNC(mp, arg ? MNT_WAIT : MNT_NOWAIT, vfs_context_kernel()); + (void)VFS_SYNC(mp, waitfor, vfs_context_kernel()); if (asyncflag) { mp->mnt_flag |= MNT_ASYNC; } @@ -2426,7 +2642,7 @@ sync_internal_callback(mount_t mp, void *arg) if ((sync_type == SYNC_ONLY_RELIABLE_MEDIA) && !is_reliable) { return VFS_RETURNED; - } else if ((sync_type = SYNC_ONLY_UNRELIABLE_MEDIA) && is_reliable) { + } else if ((sync_type == SYNC_ONLY_UNRELIABLE_MEDIA) && is_reliable) { return VFS_RETURNED; } } @@ -2480,7 +2696,7 @@ sync_thread(__unused void *arg, __unused wait_result_t wr) #endif /* DIAGNOSTIC */ } -struct timeval sync_timeout_last_print = {0, 0}; +struct timeval sync_timeout_last_print = {.tv_sec = 0, .tv_usec = 0}; /* * An in-kernel sync for power management to call. @@ -2492,7 +2708,7 @@ sync_internal(void) thread_t thd; int error; int thread_created = FALSE; - struct timespec ts = {sync_timeout_seconds, 0}; + struct timespec ts = {.tv_sec = sync_timeout_seconds, .tv_nsec = 0}; lck_mtx_lock(sync_mtx_lck); sync_thread_state |= SYNC_THREAD_RUN; @@ -2670,6 +2886,7 @@ statfs(__unused proc_t p, struct statfs_args *uap, __unused int32_t *retval) #if CONFIG_MACF error = mac_mount_check_stat(ctx, mp); if (error != 0) { + vnode_put(vp); return error; } #endif @@ -2738,40 +2955,33 @@ out: return error; } -/* - * Common routine to handle copying of statfs64 data to user space - */ -static int -statfs64_common(struct mount *mp, struct vfsstatfs *sfsp, user_addr_t bufp) -{ - int error; - struct statfs64 sfs; - - bzero(&sfs, sizeof(sfs)); - - sfs.f_bsize = sfsp->f_bsize; - sfs.f_iosize = (int32_t)sfsp->f_iosize; - sfs.f_blocks = sfsp->f_blocks; - sfs.f_bfree = sfsp->f_bfree; - sfs.f_bavail = sfsp->f_bavail; - sfs.f_files = sfsp->f_files; - sfs.f_ffree = sfsp->f_ffree; - sfs.f_fsid = sfsp->f_fsid; - sfs.f_owner = sfsp->f_owner; - sfs.f_type = mp->mnt_vtable->vfc_typenum; - sfs.f_flags = mp->mnt_flag & MNT_VISFLAGMASK; - sfs.f_fssubtype = sfsp->f_fssubtype; +void +vfs_get_statfs64(struct mount *mp, struct statfs64 *sfs) +{ + struct vfsstatfs *vsfs = &mp->mnt_vfsstat; + + bzero(sfs, sizeof(*sfs)); + + sfs->f_bsize = vsfs->f_bsize; + sfs->f_iosize = (int32_t)vsfs->f_iosize; + sfs->f_blocks = vsfs->f_blocks; + sfs->f_bfree = vsfs->f_bfree; + sfs->f_bavail = vsfs->f_bavail; + sfs->f_files = vsfs->f_files; + sfs->f_ffree = vsfs->f_ffree; + sfs->f_fsid = vsfs->f_fsid; + sfs->f_owner = vsfs->f_owner; + sfs->f_type = mp->mnt_vtable->vfc_typenum; + sfs->f_flags = mp->mnt_flag & MNT_VISFLAGMASK; + sfs->f_fssubtype = vsfs->f_fssubtype; + sfs->f_flags_ext = ((mp->mnt_kern_flag & MNTK_SYSTEM) && !(mp->mnt_kern_flag & MNTK_SWAP_MOUNT) && !(mp->mnt_flag & MNT_ROOTFS)) ? MNT_EXT_ROOT_DATA_VOL : 0; if (mp->mnt_kern_flag & MNTK_TYPENAME_OVERRIDE) { - strlcpy(&sfs.f_fstypename[0], &mp->fstypename_override[0], MFSTYPENAMELEN); + strlcpy(&sfs->f_fstypename[0], &mp->fstypename_override[0], MFSTYPENAMELEN); } else { - strlcpy(&sfs.f_fstypename[0], &sfsp->f_fstypename[0], MFSTYPENAMELEN); + strlcpy(&sfs->f_fstypename[0], &vsfs->f_fstypename[0], MFSTYPENAMELEN); } - strlcpy(&sfs.f_mntonname[0], &sfsp->f_mntonname[0], MAXPATHLEN); - strlcpy(&sfs.f_mntfromname[0], &sfsp->f_mntfromname[0], MAXPATHLEN); - - error = copyout((caddr_t)&sfs, bufp, sizeof(sfs)); - - return error; + strlcpy(&sfs->f_mntonname[0], &vsfs->f_mntonname[0], MAXPATHLEN); + strlcpy(&sfs->f_mntfromname[0], &vsfs->f_mntfromname[0], MAXPATHLEN); } /* @@ -2781,9 +2991,9 @@ int statfs64(__unused struct proc *p, struct statfs64_args *uap, __unused int32_t *retval) { struct mount *mp; - struct vfsstatfs *sp; int error; struct nameidata nd; + struct statfs64 sfs; vfs_context_t ctxp = vfs_context_current(); vnode_t vp; @@ -2795,12 +3005,12 @@ statfs64(__unused struct proc *p, struct statfs64_args *uap, __unused int32_t *r } vp = nd.ni_vp; mp = vp->v_mount; - sp = &mp->mnt_vfsstat; nameidone(&nd); #if CONFIG_MACF error = mac_mount_check_stat(ctxp, mp); if (error != 0) { + vnode_put(vp); return error; } #endif @@ -2811,7 +3021,13 @@ statfs64(__unused struct proc *p, struct statfs64_args *uap, __unused int32_t *r return error; } - error = statfs64_common(mp, sp, uap->buf); + vfs_get_statfs64(mp, &sfs); + if ((mp->mnt_kern_flag & MNTK_SYSTEM) && !(mp->mnt_kern_flag & MNTK_SWAP_MOUNT) && !(mp->mnt_flag & MNT_ROOTFS) && + (p->p_vfs_iopolicy & P_VFS_IOPOLICY_STATFS_NO_DATA_VOLUME)) { + /* This process does not want to see a seperate data volume mountpoint */ + strlcpy(&sfs.f_mntonname[0], "/", sizeof("/")); + } + error = copyout(&sfs, uap->buf, sizeof(sfs)); vnode_put(vp); return error; @@ -2825,7 +3041,7 @@ fstatfs64(__unused struct proc *p, struct fstatfs64_args *uap, __unused int32_t { struct vnode *vp; struct mount *mp; - struct vfsstatfs *sp; + struct statfs64 sfs; int error; AUDIT_ARG(fd, uap->fd); @@ -2855,12 +3071,17 @@ fstatfs64(__unused struct proc *p, struct fstatfs64_args *uap, __unused int32_t } #endif - sp = &mp->mnt_vfsstat; if ((error = vfs_update_vfsstat(mp, vfs_context_current(), VFS_USER_EVENT)) != 0) { goto out; } - error = statfs64_common(mp, sp, uap->buf); + vfs_get_statfs64(mp, &sfs); + if ((mp->mnt_kern_flag & MNTK_SYSTEM) && !(mp->mnt_kern_flag & MNTK_SWAP_MOUNT) && !(mp->mnt_flag & MNT_ROOTFS) && + (p->p_vfs_iopolicy & P_VFS_IOPOLICY_STATFS_NO_DATA_VOLUME)) { + /* This process does not want to see a seperate data volume mountpoint */ + strlcpy(&sfs.f_mntonname[0], "/", sizeof("/")); + } + error = copyout(&sfs, uap->buf, sizeof(sfs)); out: file_drop(uap->fd); @@ -2900,9 +3121,10 @@ getfsstat_callback(mount_t mp, void * arg) * If MNT_NOWAIT is specified, do not refresh the * fsstat cache. MNT_WAIT/MNT_DWAIT overrides MNT_NOWAIT. */ - if (((fstp->flags & MNT_NOWAIT) == 0 || (fstp->flags & (MNT_WAIT | MNT_DWAIT))) && - (error = vfs_update_vfsstat(mp, ctx, - VFS_USER_EVENT))) { + if ((mp->mnt_lflag & MNT_LDEAD) || + (((fstp->flags & MNT_NOWAIT) == 0 || (fstp->flags & (MNT_WAIT | MNT_DWAIT))) && + (!(mp->mnt_lflag & MNT_LUNMOUNT)) && + (error = vfs_update_vfsstat(mp, ctx, VFS_USER_EVENT)))) { KAUTH_DEBUG("vfs_update_vfsstat returned %d", error); return VFS_RETURNED; } @@ -2975,6 +3197,10 @@ __mac_getfsstat(__unused proc_t p, struct __mac_getfsstat_args *uap, int *retval size_t count, maxcount, bufsize, macsize; struct getfsstat_struct fst; + if ((unsigned)uap->bufsize > INT_MAX || (unsigned)uap->macsize > INT_MAX) { + return EINVAL; + } + bufsize = (size_t) uap->bufsize; macsize = (size_t) uap->macsize; @@ -3038,7 +3264,7 @@ __mac_getfsstat(__unused proc_t p, struct __mac_getfsstat_args *uap, int *retval fst.maxcount = maxcount; - vfs_iterate(0, getfsstat_callback, &fst); + vfs_iterate(VFS_ITERATE_NOSKIP_UNMOUNT, getfsstat_callback, &fst); if (mp) { FREE(mp, M_MACTEMP); @@ -3062,6 +3288,7 @@ getfsstat64_callback(mount_t mp, void * arg) { struct getfsstat_struct *fstp = (struct getfsstat_struct *)arg; struct vfsstatfs *sp; + struct statfs64 sfs; int error; if (fstp->sfsp && fstp->count < fstp->maxcount) { @@ -3081,19 +3308,21 @@ getfsstat64_callback(mount_t mp, void * arg) * getfsstat, since the constants are out of the same * namespace. */ - if (((fstp->flags & MNT_NOWAIT) == 0 || - (fstp->flags & (MNT_WAIT | MNT_DWAIT))) && - (error = vfs_update_vfsstat(mp, vfs_context_current(), VFS_USER_EVENT))) { + if ((mp->mnt_lflag & MNT_LDEAD) || + ((((fstp->flags & MNT_NOWAIT) == 0) || (fstp->flags & (MNT_WAIT | MNT_DWAIT))) && + (!(mp->mnt_lflag & MNT_LUNMOUNT)) && + (error = vfs_update_vfsstat(mp, vfs_context_current(), VFS_USER_EVENT)))) { KAUTH_DEBUG("vfs_update_vfsstat returned %d", error); return VFS_RETURNED; } - error = statfs64_common(mp, sp, fstp->sfsp); + vfs_get_statfs64(mp, &sfs); + error = copyout(&sfs, fstp->sfsp, sizeof(sfs)); if (error) { fstp->error = error; return VFS_RETURNED_DONE; } - fstp->sfsp += sizeof(struct statfs64); + fstp->sfsp += sizeof(sfs); } fstp->count++; return VFS_RETURNED; @@ -3120,7 +3349,7 @@ getfsstat64(__unused proc_t p, struct getfsstat64_args *uap, int *retval) fst.error = 0; fst.maxcount = maxcount; - vfs_iterate(0, getfsstat64_callback, &fst); + vfs_iterate(VFS_ITERATE_NOSKIP_UNMOUNT, getfsstat64_callback, &fst); if (fst.error) { KAUTH_DEBUG("ERROR - %s gets %d", p->p_comm, fst.error); @@ -3353,6 +3582,7 @@ __pthread_fchdir(proc_t p, struct __pthread_fchdir_args *uap, __unused int32_t * return common_fchdir(p, (void *)uap, 1); } + /* * Change current working directory ("."). * @@ -3362,45 +3592,41 @@ __pthread_fchdir(proc_t p, struct __pthread_fchdir_args *uap, __unused int32_t * * vnode_ref:ENOENT No such file or directory */ /* ARGSUSED */ -static int -common_chdir(proc_t p, struct chdir_args *uap, int per_thread) +int +chdir_internal(proc_t p, vfs_context_t ctx, struct nameidata *ndp, int per_thread) { struct filedesc *fdp = p->p_fd; int error; - struct nameidata nd; vnode_t tvp; - vfs_context_t ctx = vfs_context_current(); - NDINIT(&nd, LOOKUP, OP_CHDIR, FOLLOW | AUDITVNPATH1, - UIO_USERSPACE, uap->path, ctx); - error = change_dir(&nd, ctx); + error = change_dir(ndp, ctx); if (error) { return error; } - if ((error = vnode_ref(nd.ni_vp))) { - vnode_put(nd.ni_vp); + if ((error = vnode_ref(ndp->ni_vp))) { + vnode_put(ndp->ni_vp); return error; } /* * drop the iocount we picked up in change_dir */ - vnode_put(nd.ni_vp); + vnode_put(ndp->ni_vp); if (per_thread) { thread_t th = vfs_context_thread(ctx); if (th) { uthread_t uth = get_bsdthread_info(th); tvp = uth->uu_cdir; - uth->uu_cdir = nd.ni_vp; + uth->uu_cdir = ndp->ni_vp; OSBitOrAtomic(P_THCWD, &p->p_flag); } else { - vnode_rele(nd.ni_vp); + vnode_rele(ndp->ni_vp); return ENOENT; } } else { proc_fdlock(p); tvp = fdp->fd_cdir; - fdp->fd_cdir = nd.ni_vp; + fdp->fd_cdir = ndp->ni_vp; proc_fdunlock(p); } @@ -3412,6 +3638,28 @@ common_chdir(proc_t p, struct chdir_args *uap, int per_thread) } +/* + * Change current working directory ("."). + * + * Returns: 0 Success + * chdir_internal:ENOTDIR + * chdir_internal:ENOENT No such file or directory + * chdir_internal:??? + */ +/* ARGSUSED */ +static int +common_chdir(proc_t p, struct chdir_args *uap, int per_thread) +{ + struct nameidata nd; + vfs_context_t ctx = vfs_context_current(); + + NDINIT(&nd, LOOKUP, OP_CHDIR, FOLLOW | AUDITVNPATH1, + UIO_USERSPACE, uap->path, ctx); + + return chdir_internal(p, ctx, &nd, per_thread); +} + + /* * chdir * @@ -3680,20 +3928,6 @@ open1(vfs_context_t ctx, struct nameidata *ndp, int uflags, fp->f_fglob->fg_flag |= FHASLOCK; } -#if DEVELOPMENT || DEBUG - /* - * XXX VSWAP: Check for entitlements or special flag here - * so we can restrict access appropriately. - */ -#else /* DEVELOPMENT || DEBUG */ - - if (vnode_isswap(vp) && (flags & (FWRITE | O_TRUNC)) && (ctx != vfs_context_kernel())) { - /* block attempt to write/truncate swapfile */ - error = EPERM; - goto bad; - } -#endif /* DEVELOPMENT || DEBUG */ - /* try to truncate by setting the size attribute */ if ((flags & O_TRUNC) && ((error = vnode_setsize(vp, (off_t)0, 0, ctx)) != 0)) { goto bad; @@ -3750,7 +3984,7 @@ open1(vfs_context_t ctx, struct nameidata *ndp, int uflags, size_t copied; /* XXX FBDP: better way to detect /Applications/ ? */ if (UIO_SEG_IS_USER_SPACE(ndp->ni_segflg)) { - copyinstr(ndp->ni_dirp, + (void)copyinstr(ndp->ni_dirp, pathname, sizeof(pathname), &copied); @@ -3784,27 +4018,14 @@ open1(vfs_context_t ctx, struct nameidata *ndp, int uflags, #else /* not implemented... */ #endif - if (!strncmp(vp->v_name, - DYLD_SHARED_CACHE_NAME, - strlen(DYLD_SHARED_CACHE_NAME)) || - !strncmp(vp->v_name, - "dyld", - strlen(vp->v_name)) || - !strncmp(vp->v_name, - "launchd", - strlen(vp->v_name)) || - !strncmp(vp->v_name, - "Camera", - strlen(vp->v_name)) || - !strncmp(vp->v_name, - "mediaserverd", - strlen(vp->v_name)) || - !strncmp(vp->v_name, - "SpringBoard", - strlen(vp->v_name)) || - !strncmp(vp->v_name, - "backboardd", - strlen(vp->v_name))) { + size_t len = strlen(vp->v_name); + if (!strncmp(vp->v_name, DYLD_SHARED_CACHE_NAME, len) || + !strncmp(vp->v_name, "dyld", len) || + !strncmp(vp->v_name, "launchd", len) || + !strncmp(vp->v_name, "Camera", len) || + !strncmp(vp->v_name, "mediaserverd", len) || + !strncmp(vp->v_name, "SpringBoard", len) || + !strncmp(vp->v_name, "backboardd", len)) { /* * This file matters when launching Camera: * do not store its contents in the secluded @@ -4136,9 +4357,8 @@ openbyid_np(__unused proc_t p, struct openbyid_np_args *uap, int *retval) return ENOMEM; } - error = fsgetpath_internal( - ctx, fsid.val[0], objid, - buflen, buf, &pathlen); + error = fsgetpath_internal( ctx, fsid.val[0], objid, buflen, + buf, FSOPT_ISREALFSID, &pathlen); if (error) { FREE(buf, M_TEMP); @@ -4408,15 +4628,22 @@ my_strrchr(char *p, int ch) /* NOTREACHED */ } +extern int safe_getpath_new(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path, int firmlink); extern int safe_getpath(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path); +extern int safe_getpath_no_firmlink(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path); int -safe_getpath(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path) +safe_getpath_new(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path, int firmlink) { int ret, len = _len; *truncated_path = 0; - ret = vn_getpath(dvp, path, &len); + + if (firmlink) { + ret = vn_getpath(dvp, path, &len); + } else { + ret = vn_getpath_no_firmlink(dvp, path, &len); + } if (ret == 0 && len < (MAXPATHLEN - 1)) { if (leafname) { path[len - 1] = '/'; @@ -4462,13 +4689,28 @@ safe_getpath(struct vnode *dvp, char *leafname, char *path, int _len, int *trunc } len = _len; - ret = vn_getpath(mydvp, path, &len); + if (firmlink) { + ret = vn_getpath(mydvp, path, &len); + } else { + ret = vn_getpath_no_firmlink(mydvp, path, &len); + } } while (ret == ENOSPC); } return len; } +int +safe_getpath(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path) +{ + return safe_getpath_new(dvp, leafname, path, _len, truncated_path, 1); +} + +int +safe_getpath_no_firmlink(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path) +{ + return safe_getpath_new(dvp, leafname, path, _len, truncated_path, 0); +} /* * Make a hard file link. @@ -4486,7 +4728,7 @@ static int linkat_internal(vfs_context_t ctx, int fd1, user_addr_t path, int fd2, user_addr_t link, int flag, enum uio_seg segflg) { - vnode_t vp, dvp, lvp; + vnode_t vp, pvp, dvp, lvp; struct nameidata nd; int follow; int error; @@ -4653,10 +4895,22 @@ linkat_internal(vfs_context_t ctx, int fd1, user_addr_t path, int fd2, FSE_ARG_FINFO, &finfo, FSE_ARG_DONE); } - if (vp->v_parent) { + + pvp = vp->v_parent; + // need an iocount on pvp in this case + if (pvp && pvp != dvp) { + error = vnode_get(pvp); + if (error) { + pvp = NULLVP; + error = 0; + } + } + if (pvp) { add_fsevent(FSE_STAT_CHANGED, ctx, - FSE_ARG_VNODE, vp->v_parent, - FSE_ARG_DONE); + FSE_ARG_VNODE, pvp, FSE_ARG_DONE); + } + if (pvp && pvp != dvp) { + vnode_put(pvp); } } #endif @@ -4899,7 +5153,9 @@ unlinkat_internal(vfs_context_t ctx, int fd, vnode_t start_dvp, int error; struct componentname *cnp; char *path = NULL; - int len = 0; + char *no_firmlink_path = NULL; + int len_path = 0; + int len_no_firmlink_path = 0; #if CONFIG_FSE fse_info finfo; struct vnode_attr va; @@ -4908,6 +5164,7 @@ unlinkat_internal(vfs_context_t ctx, int fd, vnode_t start_dvp, int need_event; int has_listeners; int truncated_path; + int truncated_no_firmlink_path; int batched; struct vnode_attr *vap; int do_retry; @@ -4934,6 +5191,7 @@ retry: need_event = 0; has_listeners = 0; truncated_path = 0; + truncated_no_firmlink_path = 0; vap = NULL; NDINIT(&nd, DELETE, OP_UNLINK, cn_flags, segflg, path_arg, ctx); @@ -4967,8 +5225,9 @@ continue_lookup: /* * The root of a mounted filesystem cannot be deleted. */ - if (vp->v_flag & VROOT) { + if ((vp->v_flag & VROOT) || (dvp->v_mount != vp->v_mount)) { error = EBUSY; + goto out; } #if DEVELOPMENT || DEBUG @@ -4988,7 +5247,6 @@ continue_lookup: error = vn_authorize_unlink(dvp, vp, cnp, ctx, NULL); if (error) { if (error == ENOENT) { - assert(retry_count < MAX_AUTHORIZE_ENOENT_RETRIES); if (retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) { do_retry = 1; retry_count++; @@ -5032,7 +5290,15 @@ continue_lookup: goto out; } } - len = safe_getpath(dvp, nd.ni_cnd.cn_nameptr, path, MAXPATHLEN, &truncated_path); + len_path = safe_getpath(dvp, nd.ni_cnd.cn_nameptr, path, MAXPATHLEN, &truncated_path); + if (no_firmlink_path == NULL) { + GET_PATH(no_firmlink_path); + if (no_firmlink_path == NULL) { + error = ENOMEM; + goto out; + } + } + len_no_firmlink_path = safe_getpath_no_firmlink(dvp, nd.ni_cnd.cn_nameptr, no_firmlink_path, MAXPATHLEN, &truncated_no_firmlink_path); } #if NAMEDRSRCFORK @@ -5058,7 +5324,6 @@ continue_lookup: } goto continue_lookup; } else if (error == ENOENT && batched) { - assert(retry_count < MAX_AUTHORIZE_ENOENT_RETRIES); if (retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) { /* * For compound VNOPs, the authorization callback may @@ -5106,7 +5371,7 @@ continue_lookup: finfo.mode |= FSE_TRUNCATED_PATH; } add_fsevent(FSE_DELETE, ctx, - FSE_ARG_STRING, len, path, + FSE_ARG_STRING, len_no_firmlink_path, no_firmlink_path, FSE_ARG_FINFO, &finfo, FSE_ARG_DONE); } @@ -5116,8 +5381,13 @@ continue_lookup: out: if (path != NULL) { RELEASE_PATH(path); + path = NULL; } + if (no_firmlink_path != NULL) { + RELEASE_PATH(no_firmlink_path); + no_firmlink_path = NULL; + } #if NAMEDRSRCFORK /* recycle the deleted rsrc fork vnode to force a reclaim, which * will cause its shadow file to go away if necessary. @@ -5176,13 +5446,18 @@ unlink(__unused proc_t p, struct unlink_args *uap, __unused int32_t *retval) int unlinkat(__unused proc_t p, struct unlinkat_args *uap, __unused int32_t *retval) { - if (uap->flag & ~AT_REMOVEDIR) { + if (uap->flag & ~(AT_REMOVEDIR | AT_REMOVEDIR_DATALESS)) { return EINVAL; } - if (uap->flag & AT_REMOVEDIR) { + if (uap->flag & (AT_REMOVEDIR | AT_REMOVEDIR_DATALESS)) { + int unlink_flags = 0; + + if (uap->flag & AT_REMOVEDIR_DATALESS) { + unlink_flags |= VNODE_REMOVE_DATALESS_DIR; + } return rmdirat_internal(vfs_context_current(), uap->fd, - uap->path, UIO_USERSPACE); + uap->path, UIO_USERSPACE, unlink_flags); } else { return unlinkat_internal(vfs_context_current(), uap->fd, NULLVP, uap->path, UIO_USERSPACE, 0); @@ -5674,7 +5949,7 @@ faccessat_internal(vfs_context_t ctx, int fd, user_addr_t path, int amode, context.vc_thread = ctx->vc_thread; - niopts = FOLLOW | AUDITVNPATH1; + niopts = (flag & AT_SYMLINK_NOFOLLOW ? NOFOLLOW : FOLLOW) | AUDITVNPATH1; /* need parent for vnode_authorize for deletion test */ if (amode & _DELETE_OK) { niopts |= WANTPARENT; @@ -5738,7 +6013,7 @@ int faccessat(__unused proc_t p, struct faccessat_args *uap, __unused int32_t *retval) { - if (uap->flag & ~AT_EACCESS) { + if (uap->flag & ~(AT_EACCESS | AT_SYMLINK_NOFOLLOW)) { return EINVAL; } @@ -5775,6 +6050,8 @@ fstatat_internal(vfs_context_t ctx, user_addr_t path, user_addr_t ub, kauth_filesec_t fsec; size_t xsecurity_bufsize; void * statptr; + struct fileproc *fp = NULL; + int needsrealdev = 0; follow = (flag & AT_SYMLINK_NOFOLLOW) ? NOFOLLOW : FOLLOW; NDINIT(&nd, LOOKUP, OP_GETATTR, follow | AUDITVNPATH1, @@ -5785,9 +6062,24 @@ fstatat_internal(vfs_context_t ctx, user_addr_t path, user_addr_t ub, /* stat calls are allowed for resource forks. */ nd.ni_cnd.cn_flags |= CN_ALLOWRSRCFORK; #endif - error = nameiat(&nd, fd); - if (error) { - return error; + + if (flag & AT_FDONLY) { + vnode_t fvp; + + error = fp_getfvp(vfs_context_proc(ctx), fd, &fp, &fvp); + if (error) { + return error; + } + if ((error = vnode_getwithref(fvp))) { + file_drop(fd); + return error; + } + nd.ni_vp = fvp; + } else { + error = nameiat(&nd, fd); + if (error) { + return error; + } } fsec = KAUTH_FILESEC_NONE; @@ -5806,7 +6098,19 @@ fstatat_internal(vfs_context_t ctx, user_addr_t path, user_addr_t ub, } #endif - error = vn_stat(nd.ni_vp, statptr, (xsecurity != USER_ADDR_NULL ? &fsec : NULL), isstat64, ctx); + needsrealdev = flag & AT_REALDEV ? 1 : 0; + if (fp && (xsecurity == USER_ADDR_NULL)) { + /* + * If the caller has the file open, and is not + * requesting extended security information, we are + * going to let them get the basic stat information. + */ + error = vn_stat_noauth(nd.ni_vp, statptr, NULL, isstat64, needsrealdev, ctx, + fp->f_fglob->fg_cred); + } else { + error = vn_stat(nd.ni_vp, statptr, (xsecurity != USER_ADDR_NULL ? &fsec : NULL), + isstat64, needsrealdev, ctx); + } #if NAMEDRSRCFORK if (is_namedstream) { @@ -5815,6 +6119,10 @@ fstatat_internal(vfs_context_t ctx, user_addr_t path, user_addr_t ub, #endif vnode_put(nd.ni_vp); nameidone(&nd); + if (fp) { + file_drop(fd); + fp = NULL; + } if (error) { return error; @@ -6031,7 +6339,7 @@ lstat64_extended(__unused proc_t p, struct lstat64_extended_args *uap, __unused int fstatat(__unused proc_t p, struct fstatat_args *uap, __unused int32_t *retval) { - if (uap->flag & ~AT_SYMLINK_NOFOLLOW) { + if (uap->flag & ~(AT_SYMLINK_NOFOLLOW | AT_REALDEV | AT_FDONLY)) { return EINVAL; } @@ -6043,7 +6351,7 @@ int fstatat64(__unused proc_t p, struct fstatat64_args *uap, __unused int32_t *retval) { - if (uap->flag & ~AT_SYMLINK_NOFOLLOW) { + if (uap->flag & ~(AT_SYMLINK_NOFOLLOW | AT_REALDEV | AT_FDONLY)) { return EINVAL; } @@ -6159,29 +6467,25 @@ readlinkat(proc_t p, struct readlinkat_args *uap, int32_t *retval) } /* - * Change file flags. - * - * NOTE: this will vnode_put() `vp' + * Change file flags, the deep inner layer. */ static int -chflags1(vnode_t vp, int flags, vfs_context_t ctx) +chflags0(vnode_t vp, struct vnode_attr *va, + int (*setattr)(vnode_t, void *, vfs_context_t), + void *arg, vfs_context_t ctx) { - struct vnode_attr va; - kauth_action_t action; + kauth_action_t action = 0; int error; - VATTR_INIT(&va); - VATTR_SET(&va, va_flags, flags); - #if CONFIG_MACF - error = mac_vnode_check_setflags(ctx, vp, flags); + error = mac_vnode_check_setflags(ctx, vp, va->va_flags); if (error) { goto out; } #endif /* request authorisation, disregard immutability */ - if ((error = vnode_authattr(vp, &va, &action, ctx)) != 0) { + if ((error = vnode_authattr(vp, va, &action, ctx)) != 0) { goto out; } /* @@ -6192,19 +6496,39 @@ chflags1(vnode_t vp, int flags, vfs_context_t ctx) if (action && ((error = vnode_authorize(vp, NULL, action | KAUTH_VNODE_NOIMMUTABLE, ctx)) != 0)) { goto out; } - error = vnode_setattr(vp, &va, ctx); + error = (*setattr)(vp, arg, ctx); #if CONFIG_MACF if (error == 0) { - mac_vnode_notify_setflags(ctx, vp, flags); + mac_vnode_notify_setflags(ctx, vp, va->va_flags); } #endif +out: + return error; +} + +/* + * Change file flags. + * + * NOTE: this will vnode_put() `vp' + */ +static int +chflags1(vnode_t vp, int flags, vfs_context_t ctx) +{ + struct vnode_attr va; + int error; + + VATTR_INIT(&va); + VATTR_SET(&va, va_flags, flags); + + error = chflags0(vp, &va, (void *)vnode_setattr, &va, ctx); + vnode_put(vp); + if ((error == 0) && !VATTR_IS_SUPPORTED(&va, va_flags)) { error = ENOTSUP; } -out: - vnode_put(vp); + return error; } @@ -7578,18 +7902,48 @@ out: return error; } -/* - * Rename files. Source and destination must either both be directories, - * or both not be directories. If target is a directory, it must be empty. - */ -/* ARGSUSED */ static int -renameat_internal(vfs_context_t ctx, int fromfd, user_addr_t from, - int tofd, user_addr_t to, int segflg, vfs_rename_flags_t flags) +rename_submounts_callback(mount_t mp, void *arg) { - if (flags & ~VFS_RENAME_FLAGS_MASK) { - return EINVAL; - } + int error = 0; + mount_t pmp = (mount_t)arg; + int prefix_len = strlen(pmp->mnt_vfsstat.f_mntonname); + + if (strncmp(mp->mnt_vfsstat.f_mntonname, pmp->mnt_vfsstat.f_mntonname, prefix_len) != 0) { + return 0; + } + + if (mp->mnt_vfsstat.f_mntonname[prefix_len] != '/') { + return 0; + } + + if ((error = vfs_busy(mp, LK_NOWAIT))) { + printf("vfs_busy failed with %d for %s\n", error, mp->mnt_vfsstat.f_mntonname); + return -1; + } + + int pathlen = MAXPATHLEN; + if ((error = vn_getpath_ext(mp->mnt_vnodecovered, NULL, mp->mnt_vfsstat.f_mntonname, &pathlen, VN_GETPATH_FSENTER))) { + printf("vn_getpath_ext failed with %d for mnt_vnodecovered of %s\n", error, mp->mnt_vfsstat.f_mntonname); + } + + vfs_unbusy(mp); + + return error; +} + +/* + * Rename files. Source and destination must either both be directories, + * or both not be directories. If target is a directory, it must be empty. + */ +/* ARGSUSED */ +static int +renameat_internal(vfs_context_t ctx, int fromfd, user_addr_t from, + int tofd, user_addr_t to, int segflg, vfs_rename_flags_t flags) +{ + if (flags & ~VFS_RENAME_FLAGS_MASK) { + return EINVAL; + } if (ISSET(flags, VFS_RENAME_SWAP) && ISSET(flags, VFS_RENAME_EXCL)) { return EINVAL; @@ -7607,14 +7961,17 @@ renameat_internal(vfs_context_t ctx, int fromfd, user_addr_t from, int has_listeners; const char *oname = NULL; char *from_name = NULL, *to_name = NULL; + char *from_name_no_firmlink = NULL, *to_name_no_firmlink = NULL; int from_len = 0, to_len = 0; + int from_len_no_firmlink = 0, to_len_no_firmlink = 0; int holding_mntlock; mount_t locked_mp = NULL; vnode_t oparent = NULLVP; #if CONFIG_FSE fse_info from_finfo, to_finfo; #endif - int from_truncated = 0, to_truncated; + int from_truncated = 0, to_truncated = 0; + int from_truncated_no_firmlink = 0, to_truncated_no_firmlink = 0; int batched = 0; struct vnode_attr *fvap, *tvap; int continuing = 0; @@ -7749,6 +8106,16 @@ continue_lookup: } from_len = safe_getpath(fdvp, fromnd->ni_cnd.cn_nameptr, from_name, MAXPATHLEN, &from_truncated); + + if (from_name_no_firmlink == NULL) { + GET_PATH(from_name_no_firmlink); + if (from_name_no_firmlink == NULL) { + error = ENOMEM; + goto out1; + } + } + + from_len_no_firmlink = safe_getpath_no_firmlink(fdvp, fromnd->ni_cnd.cn_nameptr, from_name_no_firmlink, MAXPATHLEN, &from_truncated_no_firmlink); } if (need_event || need_kpath2 || has_listeners) { @@ -7761,6 +8128,16 @@ continue_lookup: } to_len = safe_getpath(tdvp, tond->ni_cnd.cn_nameptr, to_name, MAXPATHLEN, &to_truncated); + + if (to_name_no_firmlink == NULL) { + GET_PATH(to_name_no_firmlink); + if (to_name_no_firmlink == NULL) { + error = ENOMEM; + goto out1; + } + } + + to_len_no_firmlink = safe_getpath_no_firmlink(tdvp, tond->ni_cnd.cn_nameptr, to_name_no_firmlink, MAXPATHLEN, &to_truncated_no_firmlink); if (to_name && need_kpath2) { AUDIT_ARG(kpath, to_name, ARG_KPATH2); } @@ -7787,7 +8164,6 @@ continue_lookup: error = vn_authorize_renamex_with_paths(fdvp, fvp, &fromnd->ni_cnd, from_name, tdvp, tvp, &tond->ni_cnd, to_name, ctx, flags, NULL); if (error) { if (error == ENOENT) { - assert(retry_count < MAX_AUTHORIZE_ENOENT_RETRIES); if (retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) { /* * We encountered a race where after doing the namei, tvp stops @@ -7844,6 +8220,7 @@ continue_lookup: (fvp->v_mountedhere == NULL) && (fdvp == tdvp) && ((fvp->v_mount->mnt_flag & (MNT_UNION | MNT_ROOTFS)) == 0) && + ((fvp->v_mount->mnt_kern_flag & MNTK_SYSTEM) == 0) && (fvp->v_mount->mnt_vnodecovered != NULLVP)) { vnode_t coveredvp; @@ -7990,6 +8367,41 @@ skipped_lookup: holding_mntlock = 0; } if (error) { + if (error == EDATALESS) { + /* + * If we've been here before, something has gone + * horribly wrong and we should just get out lest + * we spiral around the drain forever. + */ + if (flags & VFS_RENAME_DATALESS) { + error = EIO; + goto out1; + } + + /* + * The object we're renaming is dataless (or has a + * dataless descendent) and requires materialization + * before the rename occurs. But we're holding the + * mount point's rename lock, so it's not safe to + * make the upcall. + * + * In this case, we release the lock, perform the + * materialization, and start the whole thing over. + */ + error = vnode_materialize_dataless_file(fvp, + NAMESPACE_HANDLER_RENAME_OP); + + if (error == 0) { + /* + * The next time around we need to tell the + * file system that the materializtaion has + * been performed. + */ + flags |= VFS_RENAME_DATALESS; + do_retry = 1; + } + goto out1; + } if (error == EKEEPLOOKING) { if ((fromnd->ni_flag & NAMEI_CONTLOOKUP) == 0) { if ((tond->ni_flag & NAMEI_CONTLOOKUP) == 0) { @@ -8020,7 +8432,6 @@ skipped_lookup: * cache, redrive the lookup. */ if (batched && error == ENOENT) { - assert(retry_count < MAX_AUTHORIZE_ENOENT_RETRIES); if (retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) { do_retry = 1; retry_count += 1; @@ -8058,9 +8469,9 @@ skipped_lookup: if (tvp) { add_fsevent(FSE_RENAME, ctx, - FSE_ARG_STRING, from_len, from_name, + FSE_ARG_STRING, from_len_no_firmlink, from_name_no_firmlink, FSE_ARG_FINFO, &from_finfo, - FSE_ARG_STRING, to_len, to_name, + FSE_ARG_STRING, to_len_no_firmlink, to_name_no_firmlink, FSE_ARG_FINFO, &to_finfo, FSE_ARG_DONE); if (flags & VFS_RENAME_SWAP) { @@ -8071,17 +8482,17 @@ skipped_lookup: * two. */ add_fsevent(FSE_RENAME, ctx, - FSE_ARG_STRING, to_len, to_name, + FSE_ARG_STRING, to_len_no_firmlink, to_name_no_firmlink, FSE_ARG_FINFO, &to_finfo, - FSE_ARG_STRING, from_len, from_name, + FSE_ARG_STRING, from_len_no_firmlink, from_name_no_firmlink, FSE_ARG_FINFO, &from_finfo, FSE_ARG_DONE); } } else { add_fsevent(FSE_RENAME, ctx, - FSE_ARG_STRING, from_len, from_name, + FSE_ARG_STRING, from_len_no_firmlink, from_name_no_firmlink, FSE_ARG_FINFO, &from_finfo, - FSE_ARG_STRING, to_len, to_name, + FSE_ARG_STRING, to_len_no_firmlink, to_name_no_firmlink, FSE_ARG_DONE); } } @@ -8124,14 +8535,21 @@ skipped_lookup: mpname = cp + 1; } } + + /* Update f_mntonname of sub mounts */ + vfs_iterate(0, rename_submounts_callback, (void *)mp); + /* append name to prefix */ maxlen = MAXPATHLEN - (pathend - mp->mnt_vfsstat.f_mntonname); bzero(pathend, maxlen); + strlcpy(pathend, mpname, maxlen); } FREE_ZONE(tobuf, MAXPATHLEN, M_NAMEI); vfs_unbusy(mp); + + vfs_event_signal(NULL, VQ_UPDATE, (intptr_t)NULL); } /* * fix up name & parent pointers. note that we first @@ -8157,10 +8575,18 @@ out1: RELEASE_PATH(to_name); to_name = NULL; } + if (to_name_no_firmlink != NULL) { + RELEASE_PATH(to_name_no_firmlink); + to_name_no_firmlink = NULL; + } if (from_name != NULL) { RELEASE_PATH(from_name); from_name = NULL; } + if (from_name_no_firmlink != NULL) { + RELEASE_PATH(from_name_no_firmlink); + from_name_no_firmlink = NULL; + } if (holding_mntlock) { mount_unlock_renames(locked_mp); mount_drop(locked_mp, 0); @@ -8420,16 +8846,19 @@ mkdirat(proc_t p, struct mkdirat_args *uap, __unused int32_t *retval) static int rmdirat_internal(vfs_context_t ctx, int fd, user_addr_t dirpath, - enum uio_seg segflg) + enum uio_seg segflg, int unlink_flags) { vnode_t vp, dvp; int error; struct nameidata nd; char *path = NULL; - int len = 0; + char *no_firmlink_path = NULL; + int len_path = 0; + int len_no_firmlink_path = 0; int has_listeners = 0; int need_event = 0; - int truncated = 0; + int truncated_path = 0; + int truncated_no_firmlink_path = 0; #if CONFIG_FSE struct vnode_attr va; #endif /* CONFIG_FSE */ @@ -8499,7 +8928,6 @@ continue_lookup: error = vn_authorize_rmdir(dvp, vp, &nd.ni_cnd, ctx, NULL); if (error) { if (error == ENOENT) { - assert(restart_count < MAX_AUTHORIZE_ENOENT_RETRIES); if (restart_count < MAX_AUTHORIZE_ENOENT_RETRIES) { restart_flag = 1; restart_count += 1; @@ -8543,9 +8971,19 @@ continue_lookup: } } - len = safe_getpath(dvp, nd.ni_cnd.cn_nameptr, path, MAXPATHLEN, &truncated); + len_path = safe_getpath(dvp, nd.ni_cnd.cn_nameptr, path, MAXPATHLEN, &truncated_path); + + if (no_firmlink_path == NULL) { + GET_PATH(no_firmlink_path); + if (no_firmlink_path == NULL) { + error = ENOMEM; + goto out; + } + } + + len_no_firmlink_path = safe_getpath_no_firmlink(dvp, nd.ni_cnd.cn_nameptr, no_firmlink_path, MAXPATHLEN, &truncated_no_firmlink_path); #if CONFIG_FSE - if (truncated) { + if (truncated_no_firmlink_path) { finfo.mode |= FSE_TRUNCATED_PATH; } #endif @@ -8561,7 +8999,6 @@ continue_lookup: if (error == EKEEPLOOKING) { goto continue_lookup; } else if (batched && error == ENOENT) { - assert(restart_count < MAX_AUTHORIZE_ENOENT_RETRIES); if (restart_count < MAX_AUTHORIZE_ENOENT_RETRIES) { /* * For compound VNOPs, the authorization callback @@ -8573,6 +9010,27 @@ continue_lookup: goto out; } } + + /* + * XXX There's no provision for passing flags + * to VNOP_RMDIR(). So, if vn_rmdir() fails + * because it's not empty, then we try again + * with VNOP_REMOVE(), passing in a special + * flag that clever file systems will know + * how to handle. + */ + if (error == ENOTEMPTY && + (unlink_flags & VNODE_REMOVE_DATALESS_DIR) != 0) { + /* + * If this fails, we want to keep the original + * error. + */ + if (vn_remove(dvp, &vp, &nd, + VNODE_REMOVE_DATALESS_DIR, vap, ctx) == 0) { + error = 0; + } + } + #if CONFIG_APPLEDOUBLE /* * Special case to remove orphaned AppleDouble @@ -8581,8 +9039,9 @@ continue_lookup: * so here we are. */ if (error == ENOTEMPTY) { - error = rmdir_remove_orphaned_appleDouble(vp, ctx, &restart_flag); - if (error == EBUSY) { + int ad_error = rmdir_remove_orphaned_appleDouble(vp, ctx, &restart_flag); + if (ad_error == EBUSY) { + error = ad_error; goto out; } @@ -8590,7 +9049,7 @@ continue_lookup: /* * Assuming everything went well, we will try the RMDIR again */ - if (!error) { + if (!ad_error) { error = vn_rmdir(dvp, &vp, &nd, vap, ctx); } } @@ -8619,7 +9078,7 @@ continue_lookup: vnode_get_fse_info_from_vap(vp, &finfo, vap); } add_fsevent(FSE_DELETE, ctx, - FSE_ARG_STRING, len, path, + FSE_ARG_STRING, len_no_firmlink_path, no_firmlink_path, FSE_ARG_FINFO, &finfo, FSE_ARG_DONE); } @@ -8631,6 +9090,12 @@ out: RELEASE_PATH(path); path = NULL; } + + if (no_firmlink_path != NULL) { + RELEASE_PATH(no_firmlink_path); + no_firmlink_path = NULL; + } + /* * nameidone has to happen before we vnode_put(dvp) * since it may need to release the fs_nodelock on the dvp @@ -8660,7 +9125,7 @@ int rmdir(__unused proc_t p, struct rmdir_args *uap, __unused int32_t *retval) { return rmdirat_internal(vfs_context_current(), AT_FDCWD, - CAST_USER_ADDR_T(uap->path), UIO_USERSPACE); + CAST_USER_ADDR_T(uap->path), UIO_USERSPACE, 0); } /* Get direntry length padded to 8 byte alignment */ @@ -8775,7 +9240,7 @@ vnode_readdir64(struct vnode *vp, struct uio *uio, int flags, int *eofflag, */ static int getdirentries_common(int fd, user_addr_t bufp, user_size_t bufsize, ssize_t *bytesread, - off_t *offset, int flags) + off_t *offset, int *eofflag, int flags) { vnode_t vp; struct vfs_context context = *vfs_context_current(); /* local copy */ @@ -8783,7 +9248,7 @@ getdirentries_common(int fd, user_addr_t bufp, user_size_t bufsize, ssize_t *byt uio_t auio; int spacetype = proc_is64bit(vfs_context_proc(&context)) ? UIO_USERSPACE64 : UIO_USERSPACE32; off_t loff; - int error, eofflag, numdirent; + int error, numdirent; char uio_buf[UIO_SIZEOF(1)]; error = fp_getfvp(vfs_context_proc(&context), fd, &fp, &vp); @@ -8831,10 +9296,10 @@ unionread: uio_addiov(auio, bufp, bufsize); if (flags & VNODE_READDIR_EXTENDED) { - error = vnode_readdir64(vp, auio, flags, &eofflag, &numdirent, &context); + error = vnode_readdir64(vp, auio, flags, eofflag, &numdirent, &context); fp->f_fglob->fg_offset = uio_offset(auio); } else { - error = VNOP_READDIR(vp, auio, 0, &eofflag, &numdirent, &context); + error = VNOP_READDIR(vp, auio, 0, eofflag, &numdirent, &context); fp->f_fglob->fg_offset = uio_offset(auio); } if (error) { @@ -8885,10 +9350,11 @@ getdirentries(__unused struct proc *p, struct getdirentries_args *uap, int32_t * { off_t offset; ssize_t bytesread; - int error; + int error, eofflag; AUDIT_ARG(fd, uap->fd); - error = getdirentries_common(uap->fd, uap->buf, uap->count, &bytesread, &offset, 0); + error = getdirentries_common(uap->fd, uap->buf, uap->count, + &bytesread, &offset, &eofflag, 0); if (error == 0) { if (proc_is64bit(p)) { @@ -8908,14 +9374,37 @@ getdirentries64(__unused struct proc *p, struct getdirentries64_args *uap, user_ { off_t offset; ssize_t bytesread; - int error; + int error, eofflag; + user_size_t bufsize; AUDIT_ARG(fd, uap->fd); - error = getdirentries_common(uap->fd, uap->buf, uap->bufsize, &bytesread, &offset, VNODE_READDIR_EXTENDED); + + /* + * If the buffer is at least GETDIRENTRIES64_EXTENDED_BUFSIZE large, + * then the kernel carves out the last 4 bytes to return extended + * information to userspace (namely whether we reached EOF with this call). + */ + if (uap->bufsize >= GETDIRENTRIES64_EXTENDED_BUFSIZE) { + bufsize = uap->bufsize - sizeof(getdirentries64_flags_t); + } else { + bufsize = uap->bufsize; + } + + error = getdirentries_common(uap->fd, uap->buf, bufsize, + &bytesread, &offset, &eofflag, VNODE_READDIR_EXTENDED); if (error == 0) { *retval = bytesread; error = copyout((caddr_t)&offset, uap->position, sizeof(off_t)); + + if (error == 0 && uap->bufsize >= GETDIRENTRIES64_EXTENDED_BUFSIZE) { + getdirentries64_flags_t flags = 0; + if (eofflag) { + flags |= GETDIRENTRIES64_EOF; + } + error = copyout(&flags, (user_addr_t)uap->buf + bufsize, + sizeof(flags)); + } } return error; } @@ -9666,822 +10155,744 @@ searchfs(__unused proc_t p, __unused struct searchfs_args *uap, __unused int32_t #endif /* CONFIG_SEARCHFS */ -lck_grp_attr_t * nspace_group_attr; -lck_attr_t * nspace_lock_attr; -lck_grp_t * nspace_mutex_group; +#if CONFIG_DATALESS_FILES -lck_mtx_t nspace_handler_lock; -lck_mtx_t nspace_handler_exclusion_lock; +/* + * === Namespace Resolver Up-call Mechanism === + * + * When I/O is performed to a dataless file or directory (read, write, + * lookup-in, etc.), the file system performs an upcall to the namespace + * resolver (filecoordinationd) to materialize the object. + * + * We need multiple up-calls to be in flight at once, and we need these + * up-calls to be interruptible, thus the following implementation: + * + * => The nspace_resolver_request represents the in-kernel request state. + * It contains a request ID, storage space for the errno code returned + * by filecoordinationd, and flags. + * + * => The request ID is simply a global monotonically incrementing 32-bit + * number. Outstanding requests are stored in a hash table, and the + * hash function is extremely simple. + * + * => When an upcall is to be made to filecoordinationd, a request structure + * is allocated on the stack (it is small, and needs to live only during + * the duration of the call to resolve_nspace_item_ext()). It is + * initialized and inserted into the table. Some backpressure from + * filecoordinationd is applied by limiting the numnber of entries that + * can be inserted into the table (and thus limiting the number of + * outstanding requests issued to filecoordinationd); waiting for an + * available slot is interruptible. + * + * => Once the request has been inserted into the table, the up-call is made + * to filecoordinationd via a MiG-generated stub. The up-call returns + * immediately and filecoordinationd processes the request asynchronously. + * + * => The caller now waits for the request to complete. Tnis is achieved by + * sleeping on the address of the request structure and waiting for + * filecoordinationd to mark the request structure as complete. This + * is an interruptible sleep call; if interrupted, the request structure + * is removed from the table and EINTR is returned to the caller. If + * this occurs, an advisory up-call is made to filecoordinationd with + * the request ID to indicate that the request can be aborted or + * de-prioritized at the discretion of filecoordinationd. + * + * => When filecoordinationd has completed the request, it signals completion + * by writing to the vfs.nspace.complete sysctl node. Only a process + * decorated as a namespace resolver can write to this sysctl node. The + * value is a request ID / errno tuple passed as an array of 2 uint32_t's. + * The request ID is looked up in the table, and if the request is found, + * the error code is stored in the request structure and a wakeup() + * issued on the address of the request structure. If the request is not + * found, we simply drop the completion notification, assuming that the + * caller was interrupted. + * + * => When the waiting thread wakes up, it extracts the error code from the + * request structure, removes the request from the table, and returns the + * error code to the calling function. Fini! + */ -time_t snapshot_timestamp = 0; -int nspace_allow_virtual_devs = 0; +struct nspace_resolver_request { + LIST_ENTRY(nspace_resolver_request) r_hashlink; + uint32_t r_req_id; + int r_resolver_error; + int r_flags; +}; -void nspace_handler_init(void); +#define RRF_COMPLETE 0x0001 -typedef struct nspace_item_info { - struct vnode *vp; - void *arg; - uint64_t op; - uint32_t vid; - uint32_t flags; - uint32_t token; - uint32_t refcount; -} nspace_item_info; - -#define MAX_NSPACE_ITEMS 128 -nspace_item_info nspace_items[MAX_NSPACE_ITEMS]; -uint32_t nspace_item_idx = 0; // also used as the sleep/wakeup rendezvous address -uint32_t nspace_token_id = 0; -uint32_t nspace_handler_timeout = 15; // seconds - -#define NSPACE_ITEM_NEW 0x0001 -#define NSPACE_ITEM_PROCESSING 0x0002 -#define NSPACE_ITEM_DEAD 0x0004 -#define NSPACE_ITEM_CANCELLED 0x0008 -#define NSPACE_ITEM_DONE 0x0010 -#define NSPACE_ITEM_RESET_TIMER 0x0020 - -#define NSPACE_ITEM_NSPACE_EVENT 0x0040 -#define NSPACE_ITEM_SNAPSHOT_EVENT 0x0080 - -#define NSPACE_ITEM_ALL_EVENT_TYPES (NSPACE_ITEM_NSPACE_EVENT | NSPACE_ITEM_SNAPSHOT_EVENT) - -//#pragma optimization_level 0 +static uint32_t +next_nspace_req_id(void) +{ + static uint32_t next_req_id; -typedef enum { - NSPACE_HANDLER_NSPACE = 0, - NSPACE_HANDLER_SNAPSHOT = 1, - - NSPACE_HANDLER_COUNT, -} nspace_type_t; - -typedef struct { - uint64_t handler_tid; - struct proc *handler_proc; - int handler_busy; -} nspace_handler_t; - -nspace_handler_t nspace_handlers[NSPACE_HANDLER_COUNT]; - -/* namespace fsctl functions */ -static int nspace_flags_matches_handler(uint32_t event_flags, nspace_type_t nspace_type); -static int nspace_item_flags_for_type(nspace_type_t nspace_type); -static int nspace_open_flags_for_type(nspace_type_t nspace_type); -static nspace_type_t nspace_type_for_op(uint64_t op); -static int nspace_is_special_process(struct proc *proc); -static int vn_open_with_vp(vnode_t vp, int fmode, vfs_context_t ctx); -static int wait_for_namespace_event(namespace_handler_data *nhd, nspace_type_t nspace_type); -static int validate_namespace_args(int is64bit, int size); -static int process_namespace_fsctl(nspace_type_t nspace_type, int is64bit, u_int size, caddr_t data); - - -static inline int -nspace_flags_matches_handler(uint32_t event_flags, nspace_type_t nspace_type) -{ - switch (nspace_type) { - case NSPACE_HANDLER_NSPACE: - return (event_flags & NSPACE_ITEM_ALL_EVENT_TYPES) == NSPACE_ITEM_NSPACE_EVENT; - case NSPACE_HANDLER_SNAPSHOT: - return (event_flags & NSPACE_ITEM_ALL_EVENT_TYPES) == NSPACE_ITEM_SNAPSHOT_EVENT; - default: - printf("nspace_flags_matches_handler: invalid type %u\n", (int)nspace_type); - return 0; - } + return OSAddAtomic(1, &next_req_id); } -static inline int -nspace_item_flags_for_type(nspace_type_t nspace_type) -{ - switch (nspace_type) { - case NSPACE_HANDLER_NSPACE: - return NSPACE_ITEM_NSPACE_EVENT; - case NSPACE_HANDLER_SNAPSHOT: - return NSPACE_ITEM_SNAPSHOT_EVENT; - default: - printf("nspace_item_flags_for_type: invalid type %u\n", (int)nspace_type); - return 0; - } -} +#define NSPACE_RESOLVER_REQ_HASHSIZE 32 /* XXX tune */ +#define NSPACE_RESOLVER_MAX_OUTSTANDING 256 /* XXX tune */ -static inline int -nspace_open_flags_for_type(nspace_type_t nspace_type) -{ - switch (nspace_type) { - case NSPACE_HANDLER_NSPACE: - return FREAD | FWRITE | O_EVTONLY; - case NSPACE_HANDLER_SNAPSHOT: - return FREAD | O_EVTONLY; - default: - printf("nspace_open_flags_for_type: invalid type %u\n", (int)nspace_type); - return 0; - } -} +static LIST_HEAD(nspace_resolver_requesthead, + nspace_resolver_request) * nspace_resolver_request_hashtbl; +static u_long nspace_resolver_request_hashmask; +static u_int nspace_resolver_request_count; +static bool nspace_resolver_request_wait_slot; +static lck_grp_t *nspace_resolver_request_lck_grp; +static lck_mtx_t nspace_resolver_request_hash_mutex; -static inline nspace_type_t -nspace_type_for_op(uint64_t op) -{ - switch (op & NAMESPACE_HANDLER_EVENT_TYPE_MASK) { - case NAMESPACE_HANDLER_NSPACE_EVENT: - return NSPACE_HANDLER_NSPACE; - case NAMESPACE_HANDLER_SNAPSHOT_EVENT: - return NSPACE_HANDLER_SNAPSHOT; - default: - printf("nspace_type_for_op: invalid op mask %llx\n", op & NAMESPACE_HANDLER_EVENT_TYPE_MASK); - return NSPACE_HANDLER_NSPACE; - } -} +#define NSPACE_REQ_LOCK() \ + lck_mtx_lock(&nspace_resolver_request_hash_mutex) +#define NSPACE_REQ_UNLOCK() \ + lck_mtx_unlock(&nspace_resolver_request_hash_mutex) + +#define NSPACE_RESOLVER_HASH(req_id) \ + (&nspace_resolver_request_hashtbl[(req_id) & \ + nspace_resolver_request_hashmask]) -static inline int -nspace_is_special_process(struct proc *proc) +static struct nspace_resolver_request * +nspace_resolver_req_lookup(uint32_t req_id) { - int i; - for (i = 0; i < NSPACE_HANDLER_COUNT; i++) { - if (proc == nspace_handlers[i].handler_proc) { - return 1; + struct nspace_resolver_requesthead *bucket; + struct nspace_resolver_request *req; + + bucket = NSPACE_RESOLVER_HASH(req_id); + LIST_FOREACH(req, bucket, r_hashlink) { + if (req->r_req_id == req_id) { + return req; } } - return 0; -} -void -nspace_handler_init(void) -{ - nspace_lock_attr = lck_attr_alloc_init(); - nspace_group_attr = lck_grp_attr_alloc_init(); - nspace_mutex_group = lck_grp_alloc_init("nspace-mutex", nspace_group_attr); - lck_mtx_init(&nspace_handler_lock, nspace_mutex_group, nspace_lock_attr); - lck_mtx_init(&nspace_handler_exclusion_lock, nspace_mutex_group, nspace_lock_attr); - memset(&nspace_items[0], 0, sizeof(nspace_items)); + return NULL; } -void -nspace_proc_exit(struct proc *p) +static int +nspace_resolver_req_add(struct nspace_resolver_request *req) { - int i, event_mask = 0; + struct nspace_resolver_requesthead *bucket; + int error; - for (i = 0; i < NSPACE_HANDLER_COUNT; i++) { - if (p == nspace_handlers[i].handler_proc) { - event_mask |= nspace_item_flags_for_type(i); - nspace_handlers[i].handler_tid = 0; - nspace_handlers[i].handler_proc = NULL; + while (nspace_resolver_request_count >= + NSPACE_RESOLVER_MAX_OUTSTANDING) { + nspace_resolver_request_wait_slot = true; + error = msleep(&nspace_resolver_request_count, + &nspace_resolver_request_hash_mutex, + PVFS | PCATCH, "nspacerq", NULL); + if (error) { + return error; } } - if (event_mask == 0) { - return; - } + bucket = NSPACE_RESOLVER_HASH(req->r_req_id); +#if DIAGNOSTIC + assert(nspace_resolver_req_lookup(req->r_req_id) == NULL); +#endif /* DIAGNOSTIC */ + LIST_INSERT_HEAD(bucket, req, r_hashlink); + nspace_resolver_request_count++; - lck_mtx_lock(&nspace_handler_lock); - if (event_mask & NSPACE_ITEM_SNAPSHOT_EVENT) { - // if this process was the snapshot handler, zero snapshot_timeout - snapshot_timestamp = 0; - } + return 0; +} - // - // unblock anyone that's waiting for the handler that died - // - for (i = 0; i < MAX_NSPACE_ITEMS; i++) { - if (nspace_items[i].flags & (NSPACE_ITEM_NEW | NSPACE_ITEM_PROCESSING)) { - if (nspace_items[i].flags & event_mask) { - if (nspace_items[i].vp && (nspace_items[i].vp->v_flag & VNEEDSSNAPSHOT)) { - vnode_lock_spin(nspace_items[i].vp); - nspace_items[i].vp->v_flag &= ~VNEEDSSNAPSHOT; - vnode_unlock(nspace_items[i].vp); - } - nspace_items[i].vp = NULL; - nspace_items[i].vid = 0; - nspace_items[i].flags = NSPACE_ITEM_DONE; - nspace_items[i].token = 0; +static void +nspace_resolver_req_remove(struct nspace_resolver_request *req) +{ + struct nspace_resolver_requesthead *bucket; - wakeup((caddr_t)&(nspace_items[i].vp)); - } - } - } + bucket = NSPACE_RESOLVER_HASH(req->r_req_id); +#if DIAGNOSTIC + assert(nspace_resolver_req_lookup(req->r_req_id) != NULL); +#endif /* DIAGNOSTIC */ + LIST_REMOVE(req, r_hashlink); + nspace_resolver_request_count--; - wakeup((caddr_t)&nspace_item_idx); - lck_mtx_unlock(&nspace_handler_lock); + if (nspace_resolver_request_wait_slot) { + nspace_resolver_request_wait_slot = false; + wakeup(&nspace_resolver_request_count); + } } - -int -resolve_nspace_item(struct vnode *vp, uint64_t op) +static void +nspace_resolver_req_cancel(uint32_t req_id) { - return resolve_nspace_item_ext(vp, op, NULL); -} + kern_return_t kr; + mach_port_t mp; -int -resolve_nspace_item_ext(struct vnode *vp, uint64_t op, void *arg) -{ - int i, error, keep_waiting; - struct timespec ts; - nspace_type_t nspace_type = nspace_type_for_op(op); + // Failures here aren't fatal -- the cancellation message + // sent to the resolver is merely advisory. - // only allow namespace events on regular files, directories and symlinks. - if (vp->v_type != VREG && vp->v_type != VDIR && vp->v_type != VLNK) { - return 0; + kr = host_get_filecoordinationd_port(host_priv_self(), &mp); + if (kr != KERN_SUCCESS || !IPC_PORT_VALID(mp)) { + return; } - // - // if this is a snapshot event and the vnode is on a - // disk image just pretend nothing happened since any - // change to the disk image will cause the disk image - // itself to get backed up and this avoids multi-way - // deadlocks between the snapshot handler and the ever - // popular diskimages-helper process. the variable - // nspace_allow_virtual_devs allows this behavior to - // be overridden (for use by the Mobile TimeMachine - // testing infrastructure which uses disk images) - // - if ((op & NAMESPACE_HANDLER_SNAPSHOT_EVENT) - && (vp->v_mount != NULL) - && (vp->v_mount->mnt_kern_flag & MNTK_VIRTUALDEV) - && !nspace_allow_virtual_devs) { - return 0; + kr = send_nspace_resolve_cancel(mp, req_id); + if (kr != KERN_SUCCESS) { + os_log_error(OS_LOG_DEFAULT, + "NSPACE send_nspace_resolve_cancel failure: %d", kr); } - // if (thread_tid(current_thread()) == namespace_handler_tid) { - if (nspace_handlers[nspace_type].handler_proc == NULL) { - return 0; - } + ipc_port_release_send(mp); +} - if (nspace_is_special_process(current_proc())) { - return EDEADLK; - } +static int +nspace_resolver_req_wait(struct nspace_resolver_request *req) +{ + bool send_cancel_message = false; + int error; - lck_mtx_lock(&nspace_handler_lock); + NSPACE_REQ_LOCK(); -retry: - for (i = 0; i < MAX_NSPACE_ITEMS; i++) { - if (vp == nspace_items[i].vp && op == nspace_items[i].op) { + while ((req->r_flags & RRF_COMPLETE) == 0) { + error = msleep(req, &nspace_resolver_request_hash_mutex, + PVFS | PCATCH, "nspace", NULL); + if (error && error != ERESTART) { + req->r_resolver_error = (error == EINTR) ? EINTR : + ETIMEDOUT; + send_cancel_message = true; break; } } - if (i >= MAX_NSPACE_ITEMS) { - for (i = 0; i < MAX_NSPACE_ITEMS; i++) { - if (nspace_items[i].flags == 0) { - break; - } - } - } else { - nspace_items[i].refcount++; - } + nspace_resolver_req_remove(req); - if (i >= MAX_NSPACE_ITEMS) { - ts.tv_sec = nspace_handler_timeout; - ts.tv_nsec = 0; + NSPACE_REQ_UNLOCK(); - error = msleep((caddr_t)&nspace_token_id, &nspace_handler_lock, PVFS | PCATCH, "nspace-no-space", &ts); - if (error == 0) { - // an entry got free'd up, go see if we can get a slot - goto retry; - } else { - lck_mtx_unlock(&nspace_handler_lock); - return error; - } + if (send_cancel_message) { + nspace_resolver_req_cancel(req->r_req_id); } - // - // if it didn't already exist, add it. if it did exist - // we'll get woken up when someone does a wakeup() on - // the slot in the nspace_items table. - // - if (vp != nspace_items[i].vp) { - nspace_items[i].vp = vp; - nspace_items[i].arg = (arg == NSPACE_REARM_NO_ARG) ? NULL : arg; // arg is {NULL, true, uio *} - only pass uio thru to the user - nspace_items[i].op = op; - nspace_items[i].vid = vnode_vid(vp); - nspace_items[i].flags = NSPACE_ITEM_NEW; - nspace_items[i].flags |= nspace_item_flags_for_type(nspace_type); - if (nspace_items[i].flags & NSPACE_ITEM_SNAPSHOT_EVENT) { - if (arg) { - vnode_lock_spin(vp); - vp->v_flag |= VNEEDSSNAPSHOT; - vnode_unlock(vp); - } - } + return req->r_resolver_error; +} - nspace_items[i].token = 0; - nspace_items[i].refcount = 1; +static void +nspace_resolver_req_mark_complete( + struct nspace_resolver_request *req, + int resolver_error) +{ + req->r_resolver_error = resolver_error; + req->r_flags |= RRF_COMPLETE; + wakeup(req); +} - wakeup((caddr_t)&nspace_item_idx); - } +static void +nspace_resolver_req_completed(uint32_t req_id, int resolver_error) +{ + struct nspace_resolver_request *req; - // - // Now go to sleep until the handler does a wakeup on this - // slot in the nspace_items table (or we timeout). - // - keep_waiting = 1; - while (keep_waiting) { - ts.tv_sec = nspace_handler_timeout; - ts.tv_nsec = 0; - error = msleep((caddr_t)&(nspace_items[i].vp), &nspace_handler_lock, PVFS | PCATCH, "namespace-done", &ts); + NSPACE_REQ_LOCK(); - if (nspace_items[i].flags & NSPACE_ITEM_DONE) { - error = 0; - } else if (nspace_items[i].flags & NSPACE_ITEM_CANCELLED) { - error = nspace_items[i].token; - } else if (error == EWOULDBLOCK || error == ETIMEDOUT) { - if (nspace_items[i].flags & NSPACE_ITEM_RESET_TIMER) { - nspace_items[i].flags &= ~NSPACE_ITEM_RESET_TIMER; - continue; - } else { - error = ETIMEDOUT; - } - } else if (error == 0) { - // hmmm, why did we get woken up? - printf("woken up for token %d but it's not done, cancelled or timedout and error == 0.\n", - nspace_items[i].token); - } + // If we don't find the request corresponding to our req_id, + // just drop the completion signal on the floor; it's likely + // that the requester interrupted with a signal. - if (--nspace_items[i].refcount == 0) { - nspace_items[i].vp = NULL; // clear this so that no one will match on it again - nspace_items[i].arg = NULL; - nspace_items[i].token = 0; // clear this so that the handler will not find it anymore - nspace_items[i].flags = 0; // this clears it for re-use - } - wakeup(&nspace_token_id); - keep_waiting = 0; + req = nspace_resolver_req_lookup(req_id); + if (req) { + nspace_resolver_req_mark_complete(req, resolver_error); } - lck_mtx_unlock(&nspace_handler_lock); + NSPACE_REQ_UNLOCK(); +} + +static struct proc *nspace_resolver_proc; - return error; +static int +nspace_resolver_get_proc_state(struct proc *p, int *is_resolver) +{ + *is_resolver = ((p->p_lflag & P_LNSPACE_RESOLVER) && + p == nspace_resolver_proc) ? 1 : 0; + return 0; } -int -nspace_snapshot_event(vnode_t vp, time_t ctime, uint64_t op_type, void *arg) +static int +nspace_resolver_set_proc_state(struct proc *p, int is_resolver) { - int snapshot_error = 0; + vfs_context_t ctx = vfs_context_current(); + int error = 0; - if (vp == NULL) { - return 0; + // + // The system filecoordinationd runs as uid == 0. This also + // has the nice side-effect of filtering out filecoordinationd + // running in the simulator. + // + if (!vfs_context_issuser(ctx)) { + return EPERM; } - /* Swap files are special; skip them */ - if (vnode_isswap(vp)) { - return 0; + error = priv_check_cred(vfs_context_ucred(ctx), + PRIV_VFS_DATALESS_RESOLVER, 0); + if (error) { + return error; } - if (ctime != 0 && snapshot_timestamp != 0 && (ctime <= snapshot_timestamp || vnode_needssnapshots(vp))) { - // the change time is within this epoch - int error; + if (is_resolver) { + NSPACE_REQ_LOCK(); - error = resolve_nspace_item_ext(vp, op_type | NAMESPACE_HANDLER_SNAPSHOT_EVENT, arg); - if (error == EDEADLK) { - snapshot_error = 0; - } else if (error) { - if (error == EAGAIN) { - printf("nspace_snapshot_event: timed out waiting for namespace handler...\n"); - } else if (error == EINTR) { - // printf("nspace_snapshot_event: got a signal while waiting for namespace handler...\n"); - snapshot_error = EINTR; - } + if (nspace_resolver_proc == NULL) { + proc_lock(p); + p->p_lflag |= P_LNSPACE_RESOLVER; + proc_unlock(p); + nspace_resolver_proc = p; + } else { + error = EBUSY; } + + NSPACE_REQ_UNLOCK(); + } else { + // This is basically just like the exit case. + // nspace_resolver_exited() will verify that the + // process is the resolver, and will clear the + // global. + nspace_resolver_exited(p); } - return snapshot_error; + return error; } -int -get_nspace_item_status(struct vnode *vp, int32_t *status) +static int +nspace_materialization_get_proc_state(struct proc *p, int *is_prevented) { - int i; - - lck_mtx_lock(&nspace_handler_lock); - for (i = 0; i < MAX_NSPACE_ITEMS; i++) { - if (nspace_items[i].vp == vp) { - break; - } + if ((p->p_lflag & P_LNSPACE_RESOLVER) != 0 || + (p->p_vfs_iopolicy & + P_VFS_IOPOLICY_MATERIALIZE_DATALESS_FILES) == 0) { + *is_prevented = 1; + } else { + *is_prevented = 0; } + return 0; +} - if (i >= MAX_NSPACE_ITEMS) { - lck_mtx_unlock(&nspace_handler_lock); - return ENOENT; +static int +nspace_materialization_set_proc_state(struct proc *p, int is_prevented) +{ + if (p->p_lflag & P_LNSPACE_RESOLVER) { + return is_prevented ? 0 : EBUSY; } - *status = nspace_items[i].flags; - lck_mtx_unlock(&nspace_handler_lock); + if (is_prevented) { + OSBitAndAtomic16(~((uint16_t)P_VFS_IOPOLICY_MATERIALIZE_DATALESS_FILES), &p->p_vfs_iopolicy); + } else { + OSBitOrAtomic16((uint16_t)P_VFS_IOPOLICY_MATERIALIZE_DATALESS_FILES, &p->p_vfs_iopolicy); + } return 0; } - -#if 0 static int -build_volfs_path(struct vnode *vp, char *path, int *len) +nspace_materialization_get_thread_state(int *is_prevented) { - struct vnode_attr va; - int ret; + uthread_t ut = get_bsdthread_info(current_thread()); - VATTR_INIT(&va); - VATTR_WANTED(&va, va_fsid); - VATTR_WANTED(&va, va_fileid); + *is_prevented = (ut->uu_flag & UT_NSPACE_NODATALESSFAULTS) ? 1 : 0; + return 0; +} - if (vnode_getattr(vp, &va, vfs_context_kernel()) != 0) { - *len = snprintf(path, *len, "/non/existent/path/because/vnode_getattr/failed") + 1; - ret = -1; +static int +nspace_materialization_set_thread_state(int is_prevented) +{ + uthread_t ut = get_bsdthread_info(current_thread()); + + if (is_prevented) { + ut->uu_flag |= UT_NSPACE_NODATALESSFAULTS; } else { - *len = snprintf(path, *len, "/.vol/%d/%lld", (dev_t)va.va_fsid, va.va_fileid) + 1; - ret = 0; + ut->uu_flag &= ~UT_NSPACE_NODATALESSFAULTS; } - - return ret; + return 0; } -#endif -// -// Note: this function does NOT check permissions on all of the -// parent directories leading to this vnode. It should only be -// called on behalf of a root process. Otherwise a process may -// get access to a file because the file itself is readable even -// though its parent directories would prevent access. -// static int -vn_open_with_vp(vnode_t vp, int fmode, vfs_context_t ctx) +nspace_materialization_is_prevented(void) { - int error, action; + proc_t p = current_proc(); + uthread_t ut = (uthread_t)get_bsdthread_info(current_thread()); + vfs_context_t ctx = vfs_context_current(); - if ((error = suser(kauth_cred_get(), &(current_proc()->p_acflag)))) { - return error; + /* + * Kernel context ==> return EDEADLK, as we would with any random + * process decorated as no-materialize. + */ + if (ctx == vfs_context_kernel()) { + return EDEADLK; } -#if CONFIG_MACF - error = mac_vnode_check_open(ctx, vp, fmode); - if (error) { - return error; + /* + * If the process has the dataless-manipulation entitlement, + * materialization is prevented, and depending on the kind + * of file system operation, things get to proceed as if the + * object is not dataless. + */ + if (vfs_context_is_dataless_manipulator(ctx)) { + return EJUSTRETURN; } -#endif - /* compute action to be authorized */ - action = 0; - if (fmode & FREAD) { - action |= KAUTH_VNODE_READ_DATA; - } - if (fmode & (FWRITE | O_TRUNC)) { - /* - * If we are writing, appending, and not truncating, - * indicate that we are appending so that if the - * UF_APPEND or SF_APPEND bits are set, we do not deny - * the open. - */ - if ((fmode & O_APPEND) && !(fmode & O_TRUNC)) { - action |= KAUTH_VNODE_APPEND_DATA; - } else { - action |= KAUTH_VNODE_WRITE_DATA; + /* + * Per-thread decorations override any process-wide decorations. + * (Foundation uses this, and this overrides even the dataless- + * manipulation entitlement so as to make API contracts consistent.) + */ + if (ut != NULL) { + if (ut->uu_flag & UT_NSPACE_NODATALESSFAULTS) { + return EDEADLK; + } + if (ut->uu_flag & UT_NSPACE_FORCEDATALESSFAULTS) { + return 0; } } - if ((error = vnode_authorize(vp, NULL, action, ctx)) != 0) { - return error; + /* + * If the process's iopolicy specifies that dataless files + * can be materialized, then we let it go ahead. + */ + if (p->p_vfs_iopolicy & P_VFS_IOPOLICY_MATERIALIZE_DATALESS_FILES) { + return 0; } + /* + * The default behavior is to not materialize dataless files; + * return to the caller that deadlock was detected. + */ + return EDEADLK; +} - // - // if the vnode is tagged VOPENEVT and the current process - // has the P_CHECKOPENEVT flag set, then we or in the O_EVTONLY - // flag to the open mode so that this open won't count against - // the vnode when carbon delete() does a vnode_isinuse() to see - // if a file is currently in use. this allows spotlight - // importers to not interfere with carbon apps that depend on - // the no-delete-if-busy semantics of carbon delete(). - // - if ((vp->v_flag & VOPENEVT) && (current_proc()->p_flag & P_CHECKOPENEVT)) { - fmode |= O_EVTONLY; - } +/* the vfs.nspace branch */ +SYSCTL_NODE(_vfs, OID_AUTO, nspace, CTLFLAG_RW | CTLFLAG_LOCKED, NULL, "vfs nspace hinge"); - if ((error = VNOP_OPEN(vp, fmode, ctx))) { +static int +sysctl_nspace_resolver(__unused struct sysctl_oid *oidp, + __unused void *arg1, __unused int arg2, struct sysctl_req *req) +{ + struct proc *p = req->p; + int new_value, old_value, changed = 0; + int error; + + error = nspace_resolver_get_proc_state(p, &old_value); + if (error) { return error; } - if ((error = vnode_ref_ext(vp, fmode, 0))) { - VNOP_CLOSE(vp, fmode, ctx); - return error; + + error = sysctl_io_number(req, old_value, sizeof(int), &new_value, + &changed); + if (error == 0 && changed) { + error = nspace_resolver_set_proc_state(p, new_value); } + return error; +} - /* Call out to allow 3rd party notification of open. - * Ignore result of kauth_authorize_fileop call. - */ -#if CONFIG_MACF - mac_vnode_notify_open(ctx, vp, fmode); -#endif - kauth_authorize_fileop(vfs_context_ucred(ctx), KAUTH_FILEOP_OPEN, - (uintptr_t)vp, 0); +/* decorate this process as the dataless file resolver */ +SYSCTL_PROC(_vfs_nspace, OID_AUTO, resolver, + CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED, + 0, 0, sysctl_nspace_resolver, "I", ""); +static int +sysctl_nspace_prevent_materialization(__unused struct sysctl_oid *oidp, + __unused void *arg1, __unused int arg2, struct sysctl_req *req) +{ + struct proc *p = req->p; + int new_value, old_value, changed = 0; + int error; - return 0; + error = nspace_materialization_get_proc_state(p, &old_value); + if (error) { + return error; + } + + error = sysctl_io_number(req, old_value, sizeof(int), &new_value, + &changed); + if (error == 0 && changed) { + error = nspace_materialization_set_proc_state(p, new_value); + } + return error; } +/* decorate this process as not wanting to materialize dataless files */ +SYSCTL_PROC(_vfs_nspace, OID_AUTO, prevent_materialization, + CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED, + 0, 0, sysctl_nspace_prevent_materialization, "I", ""); + static int -wait_for_namespace_event(namespace_handler_data *nhd, nspace_type_t nspace_type) +sysctl_nspace_thread_prevent_materialization(__unused struct sysctl_oid *oidp, + __unused void *arg1, __unused int arg2, struct sysctl_req *req) { - int i; - int error = 0; - int unblock = 0; - task_t curtask; + int new_value, old_value, changed = 0; + int error; - lck_mtx_lock(&nspace_handler_exclusion_lock); - if (nspace_handlers[nspace_type].handler_busy) { - lck_mtx_unlock(&nspace_handler_exclusion_lock); - return EBUSY; + error = nspace_materialization_get_thread_state(&old_value); + if (error) { + return error; + } + + error = sysctl_io_number(req, old_value, sizeof(int), &new_value, + &changed); + if (error == 0 && changed) { + error = nspace_materialization_set_thread_state(new_value); } + return error; +} - nspace_handlers[nspace_type].handler_busy = 1; - lck_mtx_unlock(&nspace_handler_exclusion_lock); +/* decorate this thread as not wanting to materialize dataless files */ +SYSCTL_PROC(_vfs_nspace, OID_AUTO, thread_prevent_materialization, + CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED, + 0, 0, sysctl_nspace_thread_prevent_materialization, "I", ""); - /* - * Any process that gets here will be one of the namespace handlers. - * As such, they should be prevented from acquiring DMG vnodes during vnode reclamation - * as we can cause deadlocks to occur, because the namespace handler may prevent - * VNOP_INACTIVE from proceeding. Mark the current task as a P_DEPENDENCY_CAPABLE - * process. - */ - curtask = current_task(); - bsd_set_dependency_capable(curtask); +static int +sysctl_nspace_complete(__unused struct sysctl_oid *oidp, __unused void *arg1, + __unused int arg2, struct sysctl_req *req) +{ + struct proc *p = req->p; + uint32_t req_status[2] = { 0, 0 }; + int error, is_resolver, changed = 0; - lck_mtx_lock(&nspace_handler_lock); - if (nspace_handlers[nspace_type].handler_proc == NULL) { - nspace_handlers[nspace_type].handler_tid = thread_tid(current_thread()); - nspace_handlers[nspace_type].handler_proc = current_proc(); + error = nspace_resolver_get_proc_state(p, &is_resolver); + if (error) { + return error; } - if (nspace_type == NSPACE_HANDLER_SNAPSHOT && - (snapshot_timestamp == 0 || snapshot_timestamp == ~0)) { - error = EINVAL; + if (!is_resolver) { + return EPERM; } - while (error == 0) { - /* Try to find matching namespace item */ - for (i = 0; i < MAX_NSPACE_ITEMS; i++) { - if (nspace_items[i].flags & NSPACE_ITEM_NEW) { - if (nspace_flags_matches_handler(nspace_items[i].flags, nspace_type)) { - break; - } - } - } - - if (i >= MAX_NSPACE_ITEMS) { - /* Nothing is there yet. Wait for wake up and retry */ - error = msleep((caddr_t)&nspace_item_idx, &nspace_handler_lock, PVFS | PCATCH, "namespace-items", 0); - if ((nspace_type == NSPACE_HANDLER_SNAPSHOT) && (snapshot_timestamp == 0 || snapshot_timestamp == ~0)) { - /* Prevent infinite loop if snapshot handler exited */ - error = EINVAL; - break; - } - continue; - } + error = sysctl_io_opaque(req, req_status, sizeof(req_status), + &changed); + if (error) { + return error; + } - nspace_items[i].flags &= ~NSPACE_ITEM_NEW; - nspace_items[i].flags |= NSPACE_ITEM_PROCESSING; - nspace_items[i].token = ++nspace_token_id; + /* + * req_status[0] is the req_id + * + * req_status[1] is the errno + */ + if (error == 0 && changed) { + nspace_resolver_req_completed(req_status[0], + (int)req_status[1]); + } + return error; +} - assert(nspace_items[i].vp); - struct fileproc *fp; - int32_t indx; - int32_t fmode; - struct proc *p = current_proc(); - vfs_context_t ctx = vfs_context_current(); - struct vnode_attr va; - bool vn_get_succsessful = false; - bool vn_open_successful = false; - bool fp_alloc_successful = false; +/* Resolver reports completed reqs here. */ +SYSCTL_PROC(_vfs_nspace, OID_AUTO, complete, + CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED, + 0, 0, sysctl_nspace_complete, "-", ""); - /* - * Use vnode pointer to acquire a file descriptor for - * hand-off to userland - */ - fmode = nspace_open_flags_for_type(nspace_type); - error = vnode_getwithvid(nspace_items[i].vp, nspace_items[i].vid); - if (error) { - goto cleanup; - } - vn_get_succsessful = true; +#endif /* CONFIG_DATALESS_FILES */ - error = vn_open_with_vp(nspace_items[i].vp, fmode, ctx); - if (error) { - goto cleanup; - } - vn_open_successful = true; +#if CONFIG_DATALESS_FILES +#define __no_dataless_unused /* nothing */ +#else +#define __no_dataless_unused __unused +#endif - error = falloc(p, &fp, &indx, ctx); - if (error) { - goto cleanup; - } - fp_alloc_successful = true; +void +nspace_resolver_init(void) +{ +#if CONFIG_DATALESS_FILES + nspace_resolver_request_lck_grp = + lck_grp_alloc_init("file namespace resolver", NULL); - fp->f_fglob->fg_flag = fmode; - fp->f_fglob->fg_ops = &vnops; - fp->f_fglob->fg_data = (caddr_t)nspace_items[i].vp; + lck_mtx_init(&nspace_resolver_request_hash_mutex, + nspace_resolver_request_lck_grp, NULL); - proc_fdlock(p); - procfdtbl_releasefd(p, indx, NULL); - fp_drop(p, indx, fp, 1); - proc_fdunlock(p); + nspace_resolver_request_hashtbl = + hashinit(NSPACE_RESOLVER_REQ_HASHSIZE, + M_VNODE /* XXX */, &nspace_resolver_request_hashmask); +#endif /* CONFIG_DATALESS_FILES */ +} - /* - * All variants of the namespace handler struct support these three fields: - * token, flags, and the FD pointer - */ - error = copyout(&nspace_items[i].token, nhd->token, sizeof(uint32_t)); - if (error) { - goto cleanup; - } - error = copyout(&nspace_items[i].op, nhd->flags, sizeof(uint64_t)); - if (error) { - goto cleanup; - } - error = copyout(&indx, nhd->fdptr, sizeof(uint32_t)); - if (error) { - goto cleanup; - } +void +nspace_resolver_exited(struct proc *p __no_dataless_unused) +{ +#if CONFIG_DATALESS_FILES + struct nspace_resolver_requesthead *bucket; + struct nspace_resolver_request *req; + u_long idx; - /* - * Handle optional fields: - * extended version support an info ptr (offset, length), and the - * - * namedata version supports a unique per-link object ID - * - */ - if (nhd->infoptr) { - uio_t uio = (uio_t)nspace_items[i].arg; - uint64_t u_offset, u_length; + NSPACE_REQ_LOCK(); - if (uio) { - u_offset = uio_offset(uio); - u_length = uio_resid(uio); - } else { - u_offset = 0; - u_length = 0; - } - error = copyout(&u_offset, nhd->infoptr, sizeof(uint64_t)); - if (error) { - goto cleanup; - } - error = copyout(&u_length, nhd->infoptr + sizeof(uint64_t), sizeof(uint64_t)); - if (error) { - goto cleanup; + if ((p->p_lflag & P_LNSPACE_RESOLVER) && + p == nspace_resolver_proc) { + for (idx = 0; idx <= nspace_resolver_request_hashmask; idx++) { + bucket = &nspace_resolver_request_hashtbl[idx]; + LIST_FOREACH(req, bucket, r_hashlink) { + nspace_resolver_req_mark_complete(req, + ETIMEDOUT); } } + nspace_resolver_proc = NULL; + } - if (nhd->objid) { - VATTR_INIT(&va); - VATTR_WANTED(&va, va_linkid); - error = vnode_getattr(nspace_items[i].vp, &va, ctx); - if (error) { - goto cleanup; - } + NSPACE_REQ_UNLOCK(); +#endif /* CONFIG_DATALESS_FILES */ +} - uint64_t linkid = 0; - if (VATTR_IS_SUPPORTED(&va, va_linkid)) { - linkid = (uint64_t)va.va_linkid; - } - error = copyout(&linkid, nhd->objid, sizeof(uint64_t)); - } -cleanup: - if (error) { - if (fp_alloc_successful) { - fp_free(p, indx, fp); - } - if (vn_open_successful) { - vn_close(nspace_items[i].vp, fmode, ctx); - } - unblock = 1; - } +int +resolve_nspace_item(struct vnode *vp, uint64_t op) +{ + return resolve_nspace_item_ext(vp, op, NULL); +} - if (vn_get_succsessful) { - vnode_put(nspace_items[i].vp); - } +#define DATALESS_RESOLVER_ENTITLEMENT \ + "com.apple.private.vfs.dataless-resolver" +#define DATALESS_MANIPULATION_ENTITLEMENT \ + "com.apple.private.vfs.dataless-manipulation" - break; - } +/* + * Return TRUE if the vfs context is associated with a process entitled + * for dataless manipulation. + * + * XXX Arguably belongs in vfs_subr.c, but is here because of the + * complication around CONFIG_DATALESS_FILES. + */ +boolean_t +vfs_context_is_dataless_manipulator(vfs_context_t ctx __unused) +{ +#if CONFIG_DATALESS_FILES + assert(ctx->vc_thread == current_thread()); + task_t const task = current_task(); + return IOTaskHasEntitlement(task, DATALESS_MANIPULATION_ENTITLEMENT) || + IOTaskHasEntitlement(task, DATALESS_RESOLVER_ENTITLEMENT); +#else + return false; +#endif /* CONFIG_DATALESS_FILES */ +} - if (unblock) { - if (nspace_items[i].vp && (nspace_items[i].vp->v_flag & VNEEDSSNAPSHOT)) { - vnode_lock_spin(nspace_items[i].vp); - nspace_items[i].vp->v_flag &= ~VNEEDSSNAPSHOT; - vnode_unlock(nspace_items[i].vp); - } - nspace_items[i].vp = NULL; - nspace_items[i].vid = 0; - nspace_items[i].flags = NSPACE_ITEM_DONE; - nspace_items[i].token = 0; +int +resolve_nspace_item_ext( + struct vnode *vp __no_dataless_unused, + uint64_t op __no_dataless_unused, + void *arg __unused) +{ +#if CONFIG_DATALESS_FILES + int error; + mach_port_t mp; + char *path = NULL; + int path_len; + kern_return_t kr; + struct nspace_resolver_request req; - wakeup((caddr_t)&(nspace_items[i].vp)); + // only allow namespace events on regular files, directories and symlinks. + if (vp->v_type != VREG && vp->v_type != VDIR && vp->v_type != VLNK) { + return EFTYPE; } - if (nspace_type == NSPACE_HANDLER_SNAPSHOT) { - // just go through every snapshot event and unblock it immediately. - if (error && (snapshot_timestamp == 0 || snapshot_timestamp == ~0)) { - for (i = 0; i < MAX_NSPACE_ITEMS; i++) { - if (nspace_items[i].flags & NSPACE_ITEM_NEW) { - if (nspace_flags_matches_handler(nspace_items[i].flags, nspace_type)) { - nspace_items[i].vp = NULL; - nspace_items[i].vid = 0; - nspace_items[i].flags = NSPACE_ITEM_DONE; - nspace_items[i].token = 0; - - wakeup((caddr_t)&(nspace_items[i].vp)); - } - } - } - } + // + // if this is a snapshot event and the vnode is on a + // disk image just pretend nothing happened since any + // change to the disk image will cause the disk image + // itself to get backed up and this avoids multi-way + // deadlocks between the snapshot handler and the ever + // popular diskimages-helper process. the variable + // nspace_allow_virtual_devs allows this behavior to + // be overridden (for use by the Mobile TimeMachine + // testing infrastructure which uses disk images) + // + if (op & NAMESPACE_HANDLER_SNAPSHOT_EVENT) { + os_log_debug(OS_LOG_DEFAULT, "NSPACE SNAPSHOT not handled"); + return ENOTSUP; } - lck_mtx_unlock(&nspace_handler_lock); + error = nspace_materialization_is_prevented(); + if (error) { + os_log_debug(OS_LOG_DEFAULT, + "NSPACE process/thread is decorated as no-materialization"); + return error; + } - lck_mtx_lock(&nspace_handler_exclusion_lock); - nspace_handlers[nspace_type].handler_busy = 0; - lck_mtx_unlock(&nspace_handler_exclusion_lock); + kr = host_get_filecoordinationd_port(host_priv_self(), &mp); + if (kr != KERN_SUCCESS || !IPC_PORT_VALID(mp)) { + os_log_error(OS_LOG_DEFAULT, "NSPACE no port"); + // Treat this like being unable to access the backing + // store server. + return ETIMEDOUT; + } - return error; -} + MALLOC_ZONE(path, char *, MAXPATHLEN, M_NAMEI, M_WAITOK); + if (path == NULL) { + error = ENOMEM; + goto out_release_port; + } + path_len = MAXPATHLEN; -static inline int -validate_namespace_args(int is64bit, int size) -{ - if (is64bit) { - /* Must be one of these */ - if (size == sizeof(user64_namespace_handler_info)) { - goto sizeok; - } - if (size == sizeof(user64_namespace_handler_info_ext)) { - goto sizeok; - } - if (size == sizeof(user64_namespace_handler_data)) { - goto sizeok; - } - return EINVAL; - } else { - /* 32 bit -- must be one of these */ - if (size == sizeof(user32_namespace_handler_info)) { - goto sizeok; - } - if (size == sizeof(user32_namespace_handler_info_ext)) { - goto sizeok; + error = vn_getpath(vp, path, &path_len); + if (error == 0) { + int xxx_rdar44371223; /* XXX Mig bug */ + req.r_req_id = next_nspace_req_id(); + req.r_resolver_error = 0; + req.r_flags = 0; + + NSPACE_REQ_LOCK(); + error = nspace_resolver_req_add(&req); + NSPACE_REQ_UNLOCK(); + if (error) { + goto out_release_port; } - if (size == sizeof(user32_namespace_handler_data)) { - goto sizeok; + + os_log_debug(OS_LOG_DEFAULT, "NSPACE resolve_path call"); + kr = send_nspace_resolve_path(mp, req.r_req_id, + current_proc()->p_pid, (uint32_t)(op & 0xffffffff), + path, &xxx_rdar44371223); + if (kr != KERN_SUCCESS) { + // Also treat this like being unable to access + // the backing store server. + os_log_error(OS_LOG_DEFAULT, + "NSPACE resolve_path failure: %d", kr); + error = ETIMEDOUT; + + NSPACE_REQ_LOCK(); + nspace_resolver_req_remove(&req); + NSPACE_REQ_UNLOCK(); + goto out_release_port; } - return EINVAL; + + // Give back the memory we allocated earlier while + // we wait; we no longer need it. + FREE_ZONE(path, MAXPATHLEN, M_NAMEI); + path = NULL; + + // Request has been submitted to the resolver. + // Now (interruptibly) wait for completion. + // Upon requrn, the request will have been removed + // from the lookup table. + error = nspace_resolver_req_wait(&req); } -sizeok: +out_release_port: + if (path != NULL) { + FREE_ZONE(path, MAXPATHLEN, M_NAMEI); + } + ipc_port_release_send(mp); + + return error; +#else + return ENOTSUP; +#endif /* CONFIG_DATALESS_FILES */ +} +int +nspace_snapshot_event(__unused vnode_t vp, __unused time_t ctime, + __unused uint64_t op_type, __unused void *arg) +{ return 0; } +#if 0 static int -process_namespace_fsctl(nspace_type_t nspace_type, int is64bit, u_int size, caddr_t data) +build_volfs_path(struct vnode *vp, char *path, int *len) { - int error = 0; - namespace_handler_data nhd; - - bzero(&nhd, sizeof(namespace_handler_data)); - - if ((error = suser(kauth_cred_get(), &(current_proc()->p_acflag)))) { - return error; - } - - error = validate_namespace_args(is64bit, size); - if (error) { - return error; - } - - /* Copy in the userland pointers into our kernel-only struct */ + struct vnode_attr va; + int ret; - if (is64bit) { - /* 64 bit userland structures */ - nhd.token = (user_addr_t)((user64_namespace_handler_info *)data)->token; - nhd.flags = (user_addr_t)((user64_namespace_handler_info *)data)->flags; - nhd.fdptr = (user_addr_t)((user64_namespace_handler_info *)data)->fdptr; + VATTR_INIT(&va); + VATTR_WANTED(&va, va_fsid); + VATTR_WANTED(&va, va_fileid); - /* If the size is greater than the standard info struct, add in extra fields */ - if (size > (sizeof(user64_namespace_handler_info))) { - if (size >= (sizeof(user64_namespace_handler_info_ext))) { - nhd.infoptr = (user_addr_t)((user64_namespace_handler_info_ext *)data)->infoptr; - } - if (size == (sizeof(user64_namespace_handler_data))) { - nhd.objid = (user_addr_t)((user64_namespace_handler_data*)data)->objid; - } - /* Otherwise the fields were pre-zeroed when we did the bzero above. */ - } + if (vnode_getattr(vp, &va, vfs_context_kernel()) != 0) { + *len = snprintf(path, *len, "/non/existent/path/because/vnode_getattr/failed") + 1; + ret = -1; } else { - /* 32 bit userland structures */ - nhd.token = CAST_USER_ADDR_T(((user32_namespace_handler_info *)data)->token); - nhd.flags = CAST_USER_ADDR_T(((user32_namespace_handler_info *)data)->flags); - nhd.fdptr = CAST_USER_ADDR_T(((user32_namespace_handler_info *)data)->fdptr); - - if (size > (sizeof(user32_namespace_handler_info))) { - if (size >= (sizeof(user32_namespace_handler_info_ext))) { - nhd.infoptr = CAST_USER_ADDR_T(((user32_namespace_handler_info_ext *)data)->infoptr); - } - if (size == (sizeof(user32_namespace_handler_data))) { - nhd.objid = (user_addr_t)((user32_namespace_handler_data*)data)->objid; - } - /* Otherwise the fields were pre-zeroed when we did the bzero above. */ - } + *len = snprintf(path, *len, "/.vol/%d/%lld", (dev_t)va.va_fsid, va.va_fileid) + 1; + ret = 0; } - return wait_for_namespace_event(&nhd, nspace_type); + return ret; } +#endif static unsigned long fsctl_bogus_command_compat(unsigned long cmd) @@ -10493,22 +10904,6 @@ fsctl_bogus_command_compat(unsigned long cmd) return FSIOC_ROUTEFS_SETROUTEID; case IOCBASECMD(FSIOC_SET_PACKAGE_EXTS): return FSIOC_SET_PACKAGE_EXTS; - case IOCBASECMD(FSIOC_NAMESPACE_HANDLER_GET): - return FSIOC_NAMESPACE_HANDLER_GET; - case IOCBASECMD(FSIOC_OLD_SNAPSHOT_HANDLER_GET): - return FSIOC_OLD_SNAPSHOT_HANDLER_GET; - case IOCBASECMD(FSIOC_SNAPSHOT_HANDLER_GET_EXT): - return FSIOC_SNAPSHOT_HANDLER_GET_EXT; - case IOCBASECMD(FSIOC_NAMESPACE_HANDLER_UPDATE): - return FSIOC_NAMESPACE_HANDLER_UPDATE; - case IOCBASECMD(FSIOC_NAMESPACE_HANDLER_UNBLOCK): - return FSIOC_NAMESPACE_HANDLER_UNBLOCK; - case IOCBASECMD(FSIOC_NAMESPACE_HANDLER_CANCEL): - return FSIOC_NAMESPACE_HANDLER_CANCEL; - case IOCBASECMD(FSIOC_NAMESPACE_HANDLER_SET_SNAPSHOT_TIME): - return FSIOC_NAMESPACE_HANDLER_SET_SNAPSHOT_TIME; - case IOCBASECMD(FSIOC_NAMESPACE_ALLOW_DMG_SNAPSHOT_EVENTS): - return FSIOC_NAMESPACE_ALLOW_DMG_SNAPSHOT_EVENTS; case IOCBASECMD(FSIOC_SET_FSTYPENAME_OVERRIDE): return FSIOC_SET_FSTYPENAME_OVERRIDE; case IOCBASECMD(DISK_CONDITIONER_IOC_GET): @@ -10528,6 +10923,12 @@ fsctl_bogus_command_compat(unsigned long cmd) return cmd; } +static int +cas_bsdflags_setattr(vnode_t vp, void *arg, vfs_context_t ctx) +{ + return VNOP_IOCTL(vp, FSIOC_CAS_BSDFLAGS, arg, FWRITE, ctx); +} + /* * Make a filesystem-specific control call: */ @@ -10543,6 +10944,10 @@ fsctl_internal(proc_t p, vnode_t *arg_vp, u_long cmd, user_addr_t udata, u_long caddr_t data, memp; vnode_t vp = *arg_vp; + if (vp->v_type == VCHR || vp->v_type == VBLK) { + return ENOTTY; + } + cmd = fsctl_bogus_command_compat(cmd); size = IOCPARM_LEN(cmd); @@ -10596,8 +11001,10 @@ fsctl_internal(proc_t p, vnode_t *arg_vp, u_long cmd, user_addr_t udata, u_long /* Check to see if it's a generic command */ switch (cmd) { case FSIOC_SYNC_VOLUME: { + struct vfs_attr vfa; mount_t mp = vp->v_mount; - int arg = *(uint32_t*)data; + unsigned arg; + /* record vid of vp so we can drop it below. */ uint32_t vvid = vp->v_id; @@ -10613,8 +11020,27 @@ fsctl_internal(proc_t p, vnode_t *arg_vp, u_long cmd, user_addr_t udata, u_long } vnode_put(vp); + arg = MNT_NOWAIT; + if (*(uint32_t*)data & FSCTL_SYNC_WAIT) { + arg = MNT_WAIT; + } + + /* + * If the filessytem supports multiple filesytems in a + * partition (For eg APFS volumes in a container, it knows + * that the waitfor argument to VFS_SYNC are flags. + */ + VFSATTR_INIT(&vfa); + VFSATTR_WANTED(&vfa, f_capabilities); + if ((vfs_getattr(mp, &vfa, vfs_context_current()) == 0) && + VFSATTR_IS_SUPPORTED(&vfa, f_capabilities) && + ((vfa.f_capabilities.valid[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_SHARED_SPACE)) && + ((vfa.f_capabilities.capabilities[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_SHARED_SPACE))) { + arg |= MNT_VOLUME; + } + /* issue the sync for this volume */ - (void)sync_callback(mp, (arg & FSCTL_SYNC_WAIT) ? &arg : NULL); + (void)sync_callback(mp, &arg); /* * Then release the mount_iterref once we're done syncing; it's not @@ -10687,191 +11113,6 @@ fsctl_internal(proc_t p, vnode_t *arg_vp, u_long cmd, user_addr_t udata, u_long } break; - /* namespace handlers */ - case FSIOC_NAMESPACE_HANDLER_GET: { - error = process_namespace_fsctl(NSPACE_HANDLER_NSPACE, is64bit, size, data); - } - break; - - /* Snapshot handlers */ - case FSIOC_OLD_SNAPSHOT_HANDLER_GET: { - error = process_namespace_fsctl(NSPACE_HANDLER_SNAPSHOT, is64bit, size, data); - } - break; - - case FSIOC_SNAPSHOT_HANDLER_GET_EXT: { - error = process_namespace_fsctl(NSPACE_HANDLER_SNAPSHOT, is64bit, size, data); - } - break; - - case FSIOC_NAMESPACE_HANDLER_UPDATE: { - uint32_t token, val; - int i; - - if ((error = suser(kauth_cred_get(), &(p->p_acflag)))) { - break; - } - - if (!nspace_is_special_process(p)) { - error = EINVAL; - break; - } - - token = ((uint32_t *)data)[0]; - val = ((uint32_t *)data)[1]; - - lck_mtx_lock(&nspace_handler_lock); - - for (i = 0; i < MAX_NSPACE_ITEMS; i++) { - if (nspace_items[i].token == token) { - break; /* exit for loop, not case stmt */ - } - } - - if (i >= MAX_NSPACE_ITEMS) { - error = ENOENT; - } else { - // - // if this bit is set, when resolve_nspace_item() times out - // it will loop and go back to sleep. - // - nspace_items[i].flags |= NSPACE_ITEM_RESET_TIMER; - } - - lck_mtx_unlock(&nspace_handler_lock); - - if (error) { - printf("nspace-handler-update: did not find token %u\n", token); - } - } - break; - - case FSIOC_NAMESPACE_HANDLER_UNBLOCK: { - uint32_t token, val; - int i; - - if ((error = suser(kauth_cred_get(), &(p->p_acflag)))) { - break; - } - - if (!nspace_is_special_process(p)) { - error = EINVAL; - break; - } - - token = ((uint32_t *)data)[0]; - val = ((uint32_t *)data)[1]; - - lck_mtx_lock(&nspace_handler_lock); - - for (i = 0; i < MAX_NSPACE_ITEMS; i++) { - if (nspace_items[i].token == token) { - break; /* exit for loop, not case statement */ - } - } - - if (i >= MAX_NSPACE_ITEMS) { - printf("nspace-handler-unblock: did not find token %u\n", token); - error = ENOENT; - } else { - if (val == 0 && nspace_items[i].vp) { - vnode_lock_spin(nspace_items[i].vp); - nspace_items[i].vp->v_flag &= ~VNEEDSSNAPSHOT; - vnode_unlock(nspace_items[i].vp); - } - - nspace_items[i].vp = NULL; - nspace_items[i].arg = NULL; - nspace_items[i].op = 0; - nspace_items[i].vid = 0; - nspace_items[i].flags = NSPACE_ITEM_DONE; - nspace_items[i].token = 0; - - wakeup((caddr_t)&(nspace_items[i].vp)); - } - - lck_mtx_unlock(&nspace_handler_lock); - } - break; - - case FSIOC_NAMESPACE_HANDLER_CANCEL: { - uint32_t token, val; - int i; - - if ((error = suser(kauth_cred_get(), &(p->p_acflag)))) { - break; - } - - if (!nspace_is_special_process(p)) { - error = EINVAL; - break; - } - - token = ((uint32_t *)data)[0]; - val = ((uint32_t *)data)[1]; - - lck_mtx_lock(&nspace_handler_lock); - - for (i = 0; i < MAX_NSPACE_ITEMS; i++) { - if (nspace_items[i].token == token) { - break; /* exit for loop, not case stmt */ - } - } - - if (i >= MAX_NSPACE_ITEMS) { - printf("nspace-handler-cancel: did not find token %u\n", token); - error = ENOENT; - } else { - if (nspace_items[i].vp) { - vnode_lock_spin(nspace_items[i].vp); - nspace_items[i].vp->v_flag &= ~VNEEDSSNAPSHOT; - vnode_unlock(nspace_items[i].vp); - } - - nspace_items[i].vp = NULL; - nspace_items[i].arg = NULL; - nspace_items[i].vid = 0; - nspace_items[i].token = val; - nspace_items[i].flags &= ~NSPACE_ITEM_PROCESSING; - nspace_items[i].flags |= NSPACE_ITEM_CANCELLED; - - wakeup((caddr_t)&(nspace_items[i].vp)); - } - - lck_mtx_unlock(&nspace_handler_lock); - } - break; - - case FSIOC_NAMESPACE_HANDLER_SET_SNAPSHOT_TIME: { - if ((error = suser(kauth_cred_get(), &(current_proc()->p_acflag)))) { - break; - } - - // we explicitly do not do the namespace_handler_proc check here - - lck_mtx_lock(&nspace_handler_lock); - snapshot_timestamp = ((uint32_t *)data)[0]; - wakeup(&nspace_item_idx); - lck_mtx_unlock(&nspace_handler_lock); - printf("nspace-handler-set-snapshot-time: %d\n", (int)snapshot_timestamp); - } - break; - - case FSIOC_NAMESPACE_ALLOW_DMG_SNAPSHOT_EVENTS: - { - if ((error = suser(kauth_cred_get(), &(current_proc()->p_acflag)))) { - break; - } - - lck_mtx_lock(&nspace_handler_lock); - nspace_allow_virtual_devs = ((uint32_t *)data)[0]; - lck_mtx_unlock(&nspace_handler_lock); - printf("nspace-snapshot-handler will%s allow events on disk-images\n", - nspace_allow_virtual_devs ? "" : " NOT"); - error = 0; - } - break; - case FSIOC_SET_FSTYPENAME_OVERRIDE: { if ((error = suser(kauth_cred_get(), &(current_proc()->p_acflag)))) { @@ -10908,6 +11149,17 @@ fsctl_internal(proc_t p, vnode_t *arg_vp, u_long cmd, user_addr_t udata, u_long } break; + case FSIOC_CAS_BSDFLAGS: { + struct fsioc_cas_bsdflags *cas = (struct fsioc_cas_bsdflags *)data; + struct vnode_attr va; + + VATTR_INIT(&va); + VATTR_SET(&va, va_flags, cas->new_flags); + + error = chflags0(vp, &va, cas_bsdflags_setattr, cas, ctx); + } + break; + case FSIOC_FD_ONLY_OPEN_ONCE: { if (vnode_usecount(vp) > 1) { error = EBUSY; @@ -10993,6 +11245,9 @@ fsctl(proc_t p, struct fsctl_args *uap, __unused int32_t *retval) if ((uap->options & FSOPT_NOFOLLOW) == 0) { nameiflags |= FOLLOW; } + if (uap->cmd == FSIOC_FIRMLINK_CTL) { + nameiflags |= (CN_FIRMLINK_NOFOLLOW | NOCACHE); + } NDINIT(&nd, LOOKUP, OP_FSCTL, nameiflags | AUDITVNPATH1, UIO_USERSPACE, uap->path, ctx); if ((error = namei(&nd))) { @@ -11509,9 +11764,8 @@ flistxattr(proc_t p, struct flistxattr_args *uap, user_ssize_t *retval) } static int -fsgetpath_internal( - vfs_context_t ctx, int volfs_id, uint64_t objid, - vm_size_t bufsize, caddr_t buf, int *pathlen) +fsgetpath_internal(vfs_context_t ctx, int volfs_id, uint64_t objid, + vm_size_t bufsize, caddr_t buf, uint32_t options, int *pathlen) { int error; struct mount *mp = NULL; @@ -11537,7 +11791,25 @@ retry: unionget: if (objid == 2) { - error = VFS_ROOT(mp, &vp, ctx); + struct vfs_attr vfsattr; + int use_vfs_root = TRUE; + + VFSATTR_INIT(&vfsattr); + VFSATTR_WANTED(&vfsattr, f_capabilities); + if (!(options & FSOPT_ISREALFSID) && + vfs_getattr(mp, &vfsattr, vfs_context_kernel()) == 0 && + VFSATTR_IS_SUPPORTED(&vfsattr, f_capabilities)) { + if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_VOL_GROUPS) && + (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_VOL_GROUPS)) { + use_vfs_root = FALSE; + } + } + + if (use_vfs_root) { + error = VFS_ROOT(mp, &vp, ctx); + } else { + error = VFS_VGET(mp, objid, &vp, ctx); + } } else { error = VFS_VGET(mp, (ino64_t)objid, &vp, ctx); } @@ -11572,6 +11844,9 @@ unionget: /* Obtain the absolute path to this vnode. */ bpflags = vfs_context_suser(ctx) ? BUILDPATH_CHECKACCESS : 0; + if (options & FSOPT_NOFIRMLINKPATH) { + bpflags |= BUILDPATH_NO_FIRMLINK; + } bpflags |= BUILDPATH_CHECK_MOVED; error = build_path(vp, buf, bufsize, &length, bpflags, ctx); vnode_put(vp); @@ -11619,8 +11894,9 @@ out: /* * Obtain the full pathname of a file system object by id. */ -int -fsgetpath(__unused proc_t p, struct fsgetpath_args *uap, user_ssize_t *retval) +static int +fsgetpath_extended(user_addr_t buf, int bufsize, user_addr_t user_fsid, uint64_t objid, + uint32_t options, user_ssize_t *retval) { vfs_context_t ctx = vfs_context_current(); fsid_t fsid; @@ -11628,30 +11904,33 @@ fsgetpath(__unused proc_t p, struct fsgetpath_args *uap, user_ssize_t *retval) int length; int error; - if ((error = copyin(uap->fsid, (caddr_t)&fsid, sizeof(fsid)))) { + if (options & ~(FSOPT_NOFIRMLINKPATH | FSOPT_ISREALFSID)) { + return EINVAL; + } + + if ((error = copyin(user_fsid, (caddr_t)&fsid, sizeof(fsid)))) { return error; } AUDIT_ARG(value32, fsid.val[0]); - AUDIT_ARG(value64, uap->objid); + AUDIT_ARG(value64, objid); /* Restrict output buffer size for now. */ - if (uap->bufsize > PAGE_SIZE) { + if (bufsize > PAGE_SIZE || bufsize <= 0) { return EINVAL; } - MALLOC(realpath, char *, uap->bufsize, M_TEMP, M_WAITOK | M_ZERO); + MALLOC(realpath, char *, bufsize, M_TEMP, M_WAITOK | M_ZERO); if (realpath == NULL) { return ENOMEM; } - error = fsgetpath_internal( - ctx, fsid.val[0], uap->objid, - uap->bufsize, realpath, &length); + error = fsgetpath_internal(ctx, fsid.val[0], objid, bufsize, realpath, + options, &length); if (error) { goto out; } - error = copyout((caddr_t)realpath, uap->buf, length); + error = copyout((caddr_t)realpath, buf, length); *retval = (user_ssize_t)length; /* may be superseded by error */ out: @@ -11661,6 +11940,20 @@ out: return error; } +int +fsgetpath(__unused proc_t p, struct fsgetpath_args *uap, user_ssize_t *retval) +{ + return fsgetpath_extended(uap->buf, uap->bufsize, uap->fsid, uap->objid, + 0, retval); +} + +int +fsgetpath_ext(__unused proc_t p, struct fsgetpath_ext_args *uap, user_ssize_t *retval) +{ + return fsgetpath_extended(uap->buf, uap->bufsize, uap->fsid, uap->objid, + uap->options, retval); +} + /* * Common routine to handle various flavors of statfs data heading out * to user space. diff --git a/bsd/vfs/vfs_vnops.c b/bsd/vfs/vfs_vnops.c index e6ffc2c72..cadc0d367 100644 --- a/bsd/vfs/vfs_vnops.c +++ b/bsd/vfs/vfs_vnops.c @@ -123,30 +123,27 @@ static int vn_write(struct fileproc *fp, struct uio *uio, int flags, vfs_context_t ctx); static int vn_select( struct fileproc *fp, int which, void * wql, vfs_context_t ctx); -static int vn_kqfilt_add(struct fileproc *fp, struct knote *kn, - struct kevent_internal_s *kev, vfs_context_t ctx); +static int vn_kqfilter(struct fileproc *fp, struct knote *kn, + struct kevent_qos_s *kev); static void filt_vndetach(struct knote *kn); static int filt_vnode(struct knote *kn, long hint); -static int filt_vnode_common(struct knote *kn, vnode_t vp, long hint); +static int filt_vnode_common(struct knote *kn, struct kevent_qos_s *kev, + vnode_t vp, long hint); static int vn_open_auth_finish(vnode_t vp, int fmode, vfs_context_t ctx); -#if 0 -static int vn_kqfilt_remove(struct vnode *vp, uintptr_t ident, - vfs_context_t ctx); -#endif const struct fileops vnops = { - .fo_type = DTYPE_VNODE, - .fo_read = vn_read, - .fo_write = vn_write, - .fo_ioctl = vn_ioctl, - .fo_select = vn_select, - .fo_close = vn_closefile, - .fo_kqfilter = vn_kqfilt_add, - .fo_drain = NULL, + .fo_type = DTYPE_VNODE, + .fo_read = vn_read, + .fo_write = vn_write, + .fo_ioctl = vn_ioctl, + .fo_select = vn_select, + .fo_close = vn_closefile, + .fo_drain = fo_no_drain, + .fo_kqfilter = vn_kqfilter, }; -static int filt_vntouch(struct knote *kn, struct kevent_internal_s *kev); -static int filt_vnprocess(struct knote *kn, struct filt_process_s *data, struct kevent_internal_s *kev); +static int filt_vntouch(struct knote *kn, struct kevent_qos_s *kev); +static int filt_vnprocess(struct knote *kn, struct kevent_qos_s*kev); SECURITY_READ_ONLY_EARLY(struct filterops) vnode_filtops = { .f_isfd = 1, @@ -578,19 +575,6 @@ continue_create_lookup: panic("Haven't cleaned up adequately in vn_open_auth()"); } -#if DEVELOPMENT || DEBUG - /* - * XXX VSWAP: Check for entitlements or special flag here - * so we can restrict access appropriately. - */ -#else /* DEVELOPMENT || DEBUG */ - - if (vnode_isswap(vp) && (fmode & (FWRITE | O_TRUNC)) && (ctx != vfs_context_kernel())) { - error = EPERM; - goto bad; - } -#endif /* DEVELOPMENT || DEBUG */ - /* * Expect to use this code for filesystems without compound VNOPs, for the root * of a filesystem, which can't be "looked up" in the sense of VNOP_LOOKUP(), @@ -761,8 +745,15 @@ vn_close(struct vnode *vp, int flags, vfs_context_t ctx) } } #endif - - /* work around for foxhound */ + /* + * If vnode @vp belongs to a chardev or a blkdev then it is handled + * specially. We first drop its user reference count @vp->v_usecount + * before calling VNOP_CLOSE(). This was done historically to ensure + * that the last close of a special device vnode performed some + * conditional cleanups. Now we still need to drop this reference here + * to ensure that devfsspec_close() can check if the vnode is still in + * use. + */ if (vnode_isspec(vp)) { (void)vnode_rele_ext(vp, flags, 0); } @@ -953,20 +944,7 @@ vn_rdwr_64( error = VNOP_READ(vp, auio, ioflg, &context); } } else { -#if DEVELOPMENT || DEBUG - /* - * XXX VSWAP: Check for entitlements or special flag here - * so we can restrict access appropriately. - */ error = VNOP_WRITE(vp, auio, ioflg, &context); -#else /* DEVELOPMENT || DEBUG */ - - if (vnode_isswap(vp) && ((ioflg & (IO_SWAP_DISPATCH | IO_SKIP_ENCRYPTION)) == 0)) { - error = EPERM; - } else { - error = VNOP_WRITE(vp, auio, ioflg, &context); - } -#endif /* DEVELOPMENT || DEBUG */ } } @@ -1104,21 +1082,6 @@ vn_write(struct fileproc *fp, struct uio *uio, int flags, vfs_context_t ctx) return error; } -#if DEVELOPMENT || DEBUG - /* - * XXX VSWAP: Check for entitlements or special flag here - * so we can restrict access appropriately. - */ -#else /* DEVELOPMENT || DEBUG */ - - if (vnode_isswap(vp)) { - (void)vnode_put(vp); - error = EPERM; - return error; - } -#endif /* DEVELOPMENT || DEBUG */ - - #if CONFIG_MACF error = mac_vnode_check_write(ctx, vfs_context_ucred(ctx), vp); if (error) { @@ -1274,7 +1237,7 @@ error_out: */ int vn_stat_noauth(struct vnode *vp, void *sbptr, kauth_filesec_t *xsec, int isstat64, - vfs_context_t ctx, struct ucred *file_cred) + int needsrealdev, vfs_context_t ctx, struct ucred *file_cred) { struct vnode_attr va; int error; @@ -1313,6 +1276,9 @@ vn_stat_noauth(struct vnode *vp, void *sbptr, kauth_filesec_t *xsec, int isstat6 VATTR_WANTED(&va, va_guuid); VATTR_WANTED(&va, va_acl); } + if (needsrealdev) { + va.va_vaflags = VA_REALFSID; + } error = vnode_getattr(vp, &va, ctx); if (error) { goto out; @@ -1430,7 +1396,7 @@ vn_stat_noauth(struct vnode *vp, void *sbptr, kauth_filesec_t *xsec, int isstat6 fsec->fsec_group = kauth_null_guid; } if (VATTR_IS_SUPPORTED(&va, va_acl) && (va.va_acl != NULL)) { - bcopy(va.va_acl, &(fsec->fsec_acl), KAUTH_ACL_COPYSIZE(va.va_acl)); + __nochk_bcopy(va.va_acl, &(fsec->fsec_acl), KAUTH_ACL_COPYSIZE(va.va_acl)); } else { fsec->fsec_acl.acl_entrycount = KAUTH_FILESEC_NOACL; } @@ -1462,7 +1428,7 @@ out: } int -vn_stat(struct vnode *vp, void *sb, kauth_filesec_t *xsec, int isstat64, vfs_context_t ctx) +vn_stat(struct vnode *vp, void *sb, kauth_filesec_t *xsec, int isstat64, int needsrealdev, vfs_context_t ctx) { int error; @@ -1479,7 +1445,7 @@ vn_stat(struct vnode *vp, void *sb, kauth_filesec_t *xsec, int isstat64, vfs_con } /* actual stat */ - return vn_stat_noauth(vp, sb, xsec, isstat64, ctx, NOCRED); + return vn_stat_noauth(vp, sb, xsec, isstat64, needsrealdev, ctx, NOCRED); } @@ -1529,6 +1495,11 @@ vn_ioctl(struct fileproc *fp, u_long com, caddr_t data, vfs_context_t ctx) case VCHR: case VBLK: + if (com == TIOCREVOKE) { + error = ENOTTY; + goto out; + } + /* Should not be able to set block size from user space */ if (com == DKIOCSETBLOCKSIZE) { error = EPERM; @@ -1721,9 +1692,9 @@ vn_pathconf(vnode_t vp, int name, int32_t *retval, vfs_context_t ctx) } static int -vn_kqfilt_add(struct fileproc *fp, struct knote *kn, - struct kevent_internal_s *kev, vfs_context_t ctx) +vn_kqfilter(struct fileproc *fp, struct knote *kn, struct kevent_qos_s *kev) { + vfs_context_t ctx = vfs_context_current(); struct vnode *vp; int error = 0; int result = 0; @@ -1770,12 +1741,11 @@ vn_kqfilt_add(struct fileproc *fp, struct knote *kn, #endif kn->kn_hook = (void*)vp; - kn->kn_hookid = vnode_vid(vp); kn->kn_filtid = EVFILTID_VN; vnode_lock(vp); KNOTE_ATTACH(&vp->v_knotes, kn); - result = filt_vnode_common(kn, vp, 0); + result = filt_vnode_common(kn, NULL, vp, 0); vnode_unlock(vp); /* @@ -1790,8 +1760,7 @@ vn_kqfilt_add(struct fileproc *fp, struct knote *kn, out: if (error) { - kn->kn_flags = EV_ERROR; - kn->kn_data = error; + knote_set_error(kn, error); } return result; @@ -1801,9 +1770,9 @@ static void filt_vndetach(struct knote *kn) { vfs_context_t ctx = vfs_context_current(); - struct vnode *vp; - vp = (struct vnode *)kn->kn_hook; - if (vnode_getwithvid(vp, kn->kn_hookid)) { + struct vnode *vp = (struct vnode *)kn->kn_hook; + uint32_t vid = vnode_vid(vp); + if (vnode_getwithvid(vp, vid)) { return; } @@ -1900,9 +1869,10 @@ vnode_writable_space_count(vnode_t vp) * --If hint is revoke, set special flags and activate */ static int -filt_vnode_common(struct knote *kn, vnode_t vp, long hint) +filt_vnode_common(struct knote *kn, struct kevent_qos_s *kev, vnode_t vp, long hint) { int activate = 0; + int64_t data = 0; lck_mtx_assert(&vp->v_lock, LCK_MTX_ASSERT_OWNED); @@ -1917,32 +1887,29 @@ filt_vnode_common(struct knote *kn, vnode_t vp, long hint) } else { switch (kn->kn_filter) { case EVFILT_READ: - kn->kn_data = vnode_readable_data_count(vp, kn->kn_fp->f_fglob->fg_offset, (kn->kn_flags & EV_POLL)); - - if (kn->kn_data != 0) { - activate = 1; - } + data = vnode_readable_data_count(vp, kn->kn_fp->f_fglob->fg_offset, (kn->kn_flags & EV_POLL)); + activate = (data != 0); break; case EVFILT_WRITE: - kn->kn_data = vnode_writable_space_count(vp); - - if (kn->kn_data != 0) { - activate = 1; - } + data = vnode_writable_space_count(vp); + activate = (data != 0); break; case EVFILT_VNODE: /* Check events this note matches against the hint */ if (kn->kn_sfflags & hint) { kn->kn_fflags |= hint; /* Set which event occurred */ } - if (kn->kn_fflags != 0) { - activate = 1; - } + activate = (kn->kn_fflags != 0); break; default: panic("Invalid knote filter on a vnode!\n"); } } + + if (kev && activate) { + knote_fill_kevent(kn, kev, data); + } + return activate; } @@ -1951,18 +1918,19 @@ filt_vnode(struct knote *kn, long hint) { vnode_t vp = (struct vnode *)kn->kn_hook; - return filt_vnode_common(kn, vp, hint); + return filt_vnode_common(kn, NULL, vp, hint); } static int -filt_vntouch(struct knote *kn, struct kevent_internal_s *kev) +filt_vntouch(struct knote *kn, struct kevent_qos_s *kev) { vnode_t vp = (struct vnode *)kn->kn_hook; + uint32_t vid = vnode_vid(vp); int activate; int hint = 0; vnode_lock(vp); - if (vnode_getiocount(vp, kn->kn_hookid, VNODE_NODEAD | VNODE_WITHID) != 0) { + if (vnode_getiocount(vp, vid, VNODE_NODEAD | VNODE_WITHID) != 0) { /* is recycled */ hint = NOTE_REVOKE; } @@ -1970,7 +1938,7 @@ filt_vntouch(struct knote *kn, struct kevent_internal_s *kev) /* accept new input fflags mask */ kn->kn_sfflags = kev->fflags; - activate = filt_vnode_common(kn, vp, hint); + activate = filt_vnode_common(kn, NULL, vp, hint); if (hint == 0) { vnode_put_locked(vp); @@ -1981,26 +1949,19 @@ filt_vntouch(struct knote *kn, struct kevent_internal_s *kev) } static int -filt_vnprocess(struct knote *kn, struct filt_process_s *data, struct kevent_internal_s *kev) +filt_vnprocess(struct knote *kn, struct kevent_qos_s *kev) { -#pragma unused(data) vnode_t vp = (struct vnode *)kn->kn_hook; + uint32_t vid = vnode_vid(vp); int activate; int hint = 0; vnode_lock(vp); - if (vnode_getiocount(vp, kn->kn_hookid, VNODE_NODEAD | VNODE_WITHID) != 0) { + if (vnode_getiocount(vp, vid, VNODE_NODEAD | VNODE_WITHID) != 0) { /* Is recycled */ hint = NOTE_REVOKE; } - activate = filt_vnode_common(kn, vp, hint); - if (activate) { - *kev = kn->kn_kevent; - if (kn->kn_flags & EV_CLEAR) { - kn->kn_data = 0; - kn->kn_fflags = 0; - } - } + activate = filt_vnode_common(kn, kev, vp, hint); /* Definitely need to unlock, may need to put */ if (hint == 0) { diff --git a/bsd/vfs/vnode_if.c b/bsd/vfs/vnode_if.c index ffd01323c..a29e14f24 100644 --- a/bsd/vfs/vnode_if.c +++ b/bsd/vfs/vnode_if.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2016 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2019 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -430,6 +430,22 @@ struct vnodeop_desc vnop_revoke_desc = { NULL }; +int vnop_mmap_check_vp_offsets[] = { + VOPARG_OFFSETOF(struct vnop_mmap_check_args, a_vp), + VDESC_NO_OFFSET +}; +struct vnodeop_desc vnop_mmap_check_desc = { + 0, + "vnop_mmap_check", + 0, + vnop_mmap_check_vp_offsets, + VDESC_NO_OFFSET, + VDESC_NO_OFFSET, + VDESC_NO_OFFSET, + VDESC_NO_OFFSET, + VDESC_NO_OFFSET, + NULL +}; int vnop_mmap_vp_offsets[] = { VOPARG_OFFSETOF(struct vnop_mmap_args, a_vp), @@ -448,7 +464,6 @@ struct vnodeop_desc vnop_mmap_desc = { NULL }; - int vnop_mnomap_vp_offsets[] = { VOPARG_OFFSETOF(struct vnop_mnomap_args, a_vp), VDESC_NO_OFFSET @@ -466,7 +481,6 @@ struct vnodeop_desc vnop_mnomap_desc = { NULL }; - int vnop_fsync_vp_offsets[] = { VOPARG_OFFSETOF(struct vnop_fsync_args, a_vp), VDESC_NO_OFFSET @@ -895,16 +909,16 @@ int vnop_copyfile_vp_offsets[] = { VDESC_NO_OFFSET }; struct vnodeop_desc vnop_copyfile_desc = { - 0, - "vnop_copyfile", - 0 | VDESC_VP0_WILLRELE | VDESC_VP1_WILLRELE | VDESC_VP2_WILLRELE, - vnop_copyfile_vp_offsets, - VDESC_NO_OFFSET, - VDESC_NO_OFFSET, - VDESC_NO_OFFSET, - VOPARG_OFFSETOF(struct vnop_copyfile_args, a_tcnp), - VDESC_NO_OFFSET, - NULL + .vdesc_offset = 0, + .vdesc_name = "vnop_copyfile", + .vdesc_flags = 0 | VDESC_VP0_WILLRELE | VDESC_VP1_WILLRELE | VDESC_VP2_WILLRELE, + .vdesc_vp_offsets = vnop_copyfile_vp_offsets, + .vdesc_vpp_offset = VDESC_NO_OFFSET, + .vdesc_cred_offset = VDESC_NO_OFFSET, + .vdesc_proc_offset = VDESC_NO_OFFSET, + .vdesc_componentname_offset = VOPARG_OFFSETOF(struct vnop_copyfile_args, a_tcnp), + .vdesc_context_offset = VDESC_NO_OFFSET, + .vdesc_transports = NULL }; int vnop_clonefile_vp_offsets[] = { @@ -913,16 +927,16 @@ int vnop_clonefile_vp_offsets[] = { VDESC_NO_OFFSET }; struct vnodeop_desc vnop_clonefile_desc = { - 0, - "vnop_clonefile", - 0 | VDESC_VP0_WILLRELE | VDESC_VP1_WILLRELE | VDESC_VPP_WILLRELE, - vnop_clonefile_vp_offsets, - VOPARG_OFFSETOF(struct vnop_clonefile_args, a_vpp), - VDESC_NO_OFFSET, - VDESC_NO_OFFSET, - VOPARG_OFFSETOF(struct vnop_clonefile_args, a_cnp), - VOPARG_OFFSETOF(struct vnop_clonefile_args, a_context), - NULL + .vdesc_offset = 0, + .vdesc_name = "vnop_clonefile", + .vdesc_flags = 0 | VDESC_VP0_WILLRELE | VDESC_VP1_WILLRELE | VDESC_VPP_WILLRELE, + .vdesc_vp_offsets = vnop_clonefile_vp_offsets, + .vdesc_vpp_offset = VOPARG_OFFSETOF(struct vnop_clonefile_args, a_vpp), + .vdesc_cred_offset = VDESC_NO_OFFSET, + .vdesc_proc_offset = VDESC_NO_OFFSET, + .vdesc_componentname_offset = VOPARG_OFFSETOF(struct vnop_clonefile_args, a_cnp), + .vdesc_context_offset = VOPARG_OFFSETOF(struct vnop_clonefile_args, a_context), + .vdesc_transports = NULL }; int vop_getxattr_vp_offsets[] = { @@ -1205,6 +1219,7 @@ struct vnodeop_desc *vfs_op_descs[] = { &vnop_kqfilt_remove_desc, &vnop_setlabel_desc, &vnop_revoke_desc, + &vnop_mmap_check_desc, &vnop_mmap_desc, &vnop_mnomap_desc, &vnop_fsync_desc, diff --git a/bsd/vfs/vnode_if.sh b/bsd/vfs/vnode_if.sh index ff699a78e..c88ebe1dd 100755 --- a/bsd/vfs/vnode_if.sh +++ b/bsd/vfs/vnode_if.sh @@ -1,7 +1,7 @@ #!/bin/sh - copyright=' /* - * Copyright (c) 2000-2007 Apple Inc. All rights reserved. + * Copyright (c) 2000-2019 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -266,15 +266,16 @@ echo ' #include struct vnodeop_desc vop_default_desc = { - 0, - "default", - 0, - NULL, - VDESC_NO_OFFSET, - VDESC_NO_OFFSET, - VDESC_NO_OFFSET, - VDESC_NO_OFFSET, - NULL, + .vdesc_offset = 0, + .vdesc_name = "default", + .vdesc_flags = 0, + .vdesc_vp_offsets = NULL, + .vdesc_vpp_offset = VDESC_NO_OFFSET, + .vdesc_cred_offset = VDESC_NO_OFFSET, + .vdesc_proc_offset = VDESC_NO_OFFSET, + .vdesc_componentname_offset = VDESC_NO_OFFSET, + .vdesc_context_offset = VDESC_NO_OFFSET, + .vdesc_transports = NULL, }; ' diff --git a/bsd/vm/vm_unix.c b/bsd/vm/vm_unix.c index 596835593..b9626cc5e 100644 --- a/bsd/vm/vm_unix.c +++ b/bsd/vm/vm_unix.c @@ -98,6 +98,7 @@ #include #include +#include #if CONFIG_MACF #include @@ -106,6 +107,7 @@ #if CONFIG_CSR #include #endif /* CONFIG_CSR */ +#include int _shared_region_map_and_slide(struct proc*, int, unsigned int, struct shared_file_mapping_np*, uint32_t, user_addr_t, user_addr_t); int shared_region_copyin_mappings(struct proc*, user_addr_t, unsigned int, struct shared_file_mapping_np *); @@ -230,6 +232,10 @@ SYSCTL_UINT(_vm, OID_AUTO, user_tte_pages, CTLFLAG_RD | CTLFLAG_LOCKED, &inuse_u SYSCTL_UINT(_vm, OID_AUTO, kernel_tte_pages, CTLFLAG_RD | CTLFLAG_LOCKED, &inuse_kernel_ttepages_count, 0, ""); SYSCTL_UINT(_vm, OID_AUTO, user_pte_pages, CTLFLAG_RD | CTLFLAG_LOCKED, &inuse_user_ptepages_count, 0, ""); SYSCTL_UINT(_vm, OID_AUTO, kernel_pte_pages, CTLFLAG_RD | CTLFLAG_LOCKED, &inuse_kernel_ptepages_count, 0, ""); +#if DEVELOPMENT || DEBUG +extern unsigned long pmap_asid_flushes; +SYSCTL_ULONG(_vm, OID_AUTO, pmap_asid_flushes, CTLFLAG_RD | CTLFLAG_LOCKED, &pmap_asid_flushes, ""); +#endif #endif /* __arm__ || __arm64__ */ #if __arm64__ @@ -1042,6 +1048,7 @@ pid_suspend(struct proc *p __unused, struct pid_suspend_args *args, int *ret) proc_t targetproc = PROC_NULL; int pid = args->pid; int error = 0; + mach_port_t tfpport = MACH_PORT_NULL; #if CONFIG_MACF error = mac_proc_check_suspend_resume(p, MAC_PROC_CHECK_SUSPEND); @@ -1062,7 +1069,8 @@ pid_suspend(struct proc *p __unused, struct pid_suspend_args *args, int *ret) goto out; } - if (!task_for_pid_posix_check(targetproc)) { + if (!task_for_pid_posix_check(targetproc) && + !IOTaskHasEntitlement(current_task(), PROCESS_RESUME_SUSPEND_ENTITLEMENT)) { error = EPERM; goto out; } @@ -1070,8 +1078,6 @@ pid_suspend(struct proc *p __unused, struct pid_suspend_args *args, int *ret) target = targetproc->task; #ifndef CONFIG_EMBEDDED if (target != TASK_NULL) { - mach_port_t tfpport; - /* If we aren't root and target's task access port is set... */ if (!kauth_cred_issuser(kauth_cred_get()) && targetproc != current_proc() && @@ -1115,6 +1121,10 @@ pid_suspend(struct proc *p __unused, struct pid_suspend_args *args, int *ret) task_deallocate(target); out: + if (tfpport != IPC_PORT_NULL) { + ipc_port_release_send(tfpport); + } + if (targetproc != PROC_NULL) { proc_rele(targetproc); } @@ -1122,6 +1132,141 @@ out: return error; } +kern_return_t +debug_control_port_for_pid(struct debug_control_port_for_pid_args *args) +{ + mach_port_name_t target_tport = args->target_tport; + int pid = args->pid; + user_addr_t task_addr = args->t; + proc_t p = PROC_NULL; + task_t t1 = TASK_NULL; + task_t task = TASK_NULL; + mach_port_name_t tret = MACH_PORT_NULL; + ipc_port_t tfpport = MACH_PORT_NULL; + ipc_port_t sright = NULL; + int error = 0; + + + AUDIT_MACH_SYSCALL_ENTER(AUE_DBGPORTFORPID); + AUDIT_ARG(pid, pid); + AUDIT_ARG(mach_port1, target_tport); + + /* Always check if pid == 0 */ + if (pid == 0) { + (void) copyout((char *)&t1, task_addr, sizeof(mach_port_name_t)); + AUDIT_MACH_SYSCALL_EXIT(KERN_FAILURE); + return KERN_FAILURE; + } + + t1 = port_name_to_task(target_tport); + if (t1 == TASK_NULL) { + (void) copyout((char *)&t1, task_addr, sizeof(mach_port_name_t)); + AUDIT_MACH_SYSCALL_EXIT(KERN_FAILURE); + return KERN_FAILURE; + } + + + p = proc_find(pid); + if (p == PROC_NULL) { + error = KERN_FAILURE; + goto tfpout; + } + +#if CONFIG_AUDIT + AUDIT_ARG(process, p); +#endif + + if (!(task_for_pid_posix_check(p))) { + error = KERN_FAILURE; + goto tfpout; + } + + if (p->task == TASK_NULL) { + error = KERN_SUCCESS; + goto tfpout; + } + + /* Grab a task reference since the proc ref might be dropped if an upcall to task access server is made */ + task = p->task; + task_reference(task); + + + if (!IOTaskHasEntitlement(current_task(), DEBUG_PORT_ENTITLEMENT)) { +#if CONFIG_MACF + error = mac_proc_check_get_task(kauth_cred_get(), p); + if (error) { + error = KERN_FAILURE; + goto tfpout; + } +#endif + + /* If we aren't root and target's task access port is set... */ + if (!kauth_cred_issuser(kauth_cred_get()) && + p != current_proc() && + (task_get_task_access_port(task, &tfpport) == 0) && + (tfpport != IPC_PORT_NULL)) { + if (tfpport == IPC_PORT_DEAD) { + error = KERN_PROTECTION_FAILURE; + goto tfpout; + } + + /* + * Drop the proc_find proc ref before making an upcall + * to taskgated, since holding a proc_find + * ref while making an upcall can cause deadlock. + */ + proc_rele(p); + p = PROC_NULL; + + /* Call up to the task access server */ + error = __KERNEL_WAITING_ON_TASKGATED_CHECK_ACCESS_UPCALL__(tfpport, proc_selfpid(), kauth_getgid(), pid); + + if (error != MACH_MSG_SUCCESS) { + if (error == MACH_RCV_INTERRUPTED) { + error = KERN_ABORTED; + } else { + error = KERN_FAILURE; + } + goto tfpout; + } + } + } + + /* Check if the task has been corpsified */ + if (is_corpsetask(task)) { + error = KERN_FAILURE; + goto tfpout; + } + + error = task_get_debug_control_port(task, &sright); + if (error != KERN_SUCCESS) { + goto tfpout; + } + + tret = ipc_port_copyout_send( + sright, + get_task_ipcspace(current_task())); + + error = KERN_SUCCESS; + +tfpout: + task_deallocate(t1); + AUDIT_ARG(mach_port2, tret); + (void) copyout((char *) &tret, task_addr, sizeof(mach_port_name_t)); + + if (tfpport != IPC_PORT_NULL) { + ipc_port_release_send(tfpport); + } + if (task != TASK_NULL) { + task_deallocate(task); + } + if (p != PROC_NULL) { + proc_rele(p); + } + AUDIT_MACH_SYSCALL_EXIT(error); + return error; +} + kern_return_t pid_resume(struct proc *p __unused, struct pid_resume_args *args, int *ret) { @@ -1129,6 +1274,7 @@ pid_resume(struct proc *p __unused, struct pid_resume_args *args, int *ret) proc_t targetproc = PROC_NULL; int pid = args->pid; int error = 0; + mach_port_t tfpport = MACH_PORT_NULL; #if CONFIG_MACF error = mac_proc_check_suspend_resume(p, MAC_PROC_CHECK_RESUME); @@ -1149,7 +1295,8 @@ pid_resume(struct proc *p __unused, struct pid_resume_args *args, int *ret) goto out; } - if (!task_for_pid_posix_check(targetproc)) { + if (!task_for_pid_posix_check(targetproc) && + !IOTaskHasEntitlement(current_task(), PROCESS_RESUME_SUSPEND_ENTITLEMENT)) { error = EPERM; goto out; } @@ -1157,8 +1304,6 @@ pid_resume(struct proc *p __unused, struct pid_resume_args *args, int *ret) target = targetproc->task; #ifndef CONFIG_EMBEDDED if (target != TASK_NULL) { - mach_port_t tfpport; - /* If we aren't root and target's task access port is set... */ if (!kauth_cred_issuser(kauth_cred_get()) && targetproc != current_proc() && @@ -1213,6 +1358,10 @@ pid_resume(struct proc *p __unused, struct pid_resume_args *args, int *ret) task_deallocate(target); out: + if (tfpport != IPC_PORT_NULL) { + ipc_port_release_send(tfpport); + } + if (targetproc != PROC_NULL) { proc_rele(targetproc); } @@ -1402,7 +1551,8 @@ pid_shutdown_sockets(struct proc *p __unused, struct pid_shutdown_sockets_args * goto out; } - if (!task_for_pid_posix_check(targetproc)) { + if (!task_for_pid_posix_check(targetproc) && + !IOTaskHasEntitlement(current_task(), PROCESS_RESUME_SUSPEND_ENTITLEMENT)) { error = EPERM; goto out; } @@ -1689,32 +1839,22 @@ _shared_region_map_and_slide( } #endif /* MAC */ - /* make sure vnode is on the process's root volume */ + /* The calling process cannot be chroot-ed. */ root_vp = p->p_fd->fd_rdir; if (root_vp == NULL) { root_vp = rootvnode; } else { - /* - * Chroot-ed processes can't use the shared_region. - */ - error = EINVAL; - goto done; - } - - if (vp->v_mount != root_vp->v_mount) { SHARED_REGION_TRACE_ERROR( - ("shared_region: %p [%d(%s)] map(%p:'%s'): " - "not on process's root volume\n", - (void *)VM_KERNEL_ADDRPERM(current_thread()), - p->p_pid, p->p_comm, - (void *)VM_KERNEL_ADDRPERM(vp), vp->v_name)); + ("calling process [%d(%s)] is chroot-ed, permission denied\n", + p->p_pid, p->p_comm)); error = EPERM; goto done; } - /* make sure vnode is owned by "root" */ + /* The shared cache file must be owned by root */ VATTR_INIT(&va); VATTR_WANTED(&va, va_uid); + VATTR_WANTED(&va, va_flags); error = vnode_getattr(vp, &va, vfs_context_current()); if (error) { SHARED_REGION_TRACE_ERROR( @@ -1738,6 +1878,37 @@ _shared_region_map_and_slide( goto done; } +#if CONFIG_CSR + if (csr_check(CSR_ALLOW_UNRESTRICTED_FS) != 0 && + !(va.va_flags & SF_RESTRICTED)) { + /* + * CSR is not configured in CSR_ALLOW_UNRESTRICTED_FS mode, and + * the shared cache file is NOT SIP-protected, so reject the + * mapping request + */ + SHARED_REGION_TRACE_ERROR( + ("shared_region: %p [%d(%s)] map(%p:'%s'), " + "vnode is not SIP-protected. \n", + (void *)VM_KERNEL_ADDRPERM(current_thread()), + p->p_pid, p->p_comm, (void *)VM_KERNEL_ADDRPERM(vp), + vp->v_name)); + error = EPERM; + goto done; + } +#else + /* Devices without SIP/ROSP need to make sure that the shared cache is on the root volume. */ + if (vp->v_mount != root_vp->v_mount) { + SHARED_REGION_TRACE_ERROR( + ("shared_region: %p [%d(%s)] map(%p:'%s'): " + "not on process's root volume\n", + (void *)VM_KERNEL_ADDRPERM(current_thread()), + p->p_pid, p->p_comm, + (void *)VM_KERNEL_ADDRPERM(vp), vp->v_name)); + error = EPERM; + goto done; + } +#endif /* CONFIG_CSR */ + if (scdir_enforce) { /* get vnode for scdir_path */ error = vnode_lookup(scdir_path, 0, &scdir_vp, vfs_context_current()); @@ -2032,6 +2203,10 @@ extern unsigned int vm_page_purgeable_wired_count; SYSCTL_INT(_vm, OID_AUTO, page_purgeable_wired_count, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_purgeable_wired_count, 0, "Wired purgeable page count"); +extern unsigned int vm_page_kern_lpage_count; +SYSCTL_INT(_vm, OID_AUTO, kern_lpage_count, CTLFLAG_RD | CTLFLAG_LOCKED, + &vm_page_kern_lpage_count, 0, "kernel used large pages"); + #if DEVELOPMENT || DEBUG extern uint64_t get_pages_grabbed_count(void); @@ -2171,10 +2346,12 @@ extern unsigned int vm_page_secluded_target; extern unsigned int vm_page_secluded_count; extern unsigned int vm_page_secluded_count_free; extern unsigned int vm_page_secluded_count_inuse; +extern unsigned int vm_page_secluded_count_over_target; SYSCTL_UINT(_vm, OID_AUTO, page_secluded_target, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_secluded_target, 0, ""); SYSCTL_UINT(_vm, OID_AUTO, page_secluded_count, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_secluded_count, 0, ""); SYSCTL_UINT(_vm, OID_AUTO, page_secluded_count_free, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_secluded_count_free, 0, ""); SYSCTL_UINT(_vm, OID_AUTO, page_secluded_count_inuse, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_secluded_count_inuse, 0, ""); +SYSCTL_UINT(_vm, OID_AUTO, page_secluded_count_over_target, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_secluded_count_over_target, 0, ""); extern struct vm_page_secluded_data vm_page_secluded; SYSCTL_UINT(_vm, OID_AUTO, page_secluded_eligible, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_secluded.eligible_for_secluded, 0, ""); @@ -2344,6 +2521,12 @@ SYSCTL_UINT(_vm, OID_AUTO, pages, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_pages, 0 extern uint32_t vm_page_busy_absent_skipped; SYSCTL_UINT(_vm, OID_AUTO, page_busy_absent_skipped, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_busy_absent_skipped, 0, ""); +extern uint32_t vm_page_upl_tainted; +SYSCTL_UINT(_vm, OID_AUTO, upl_pages_tainted, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_upl_tainted, 0, ""); + +extern uint32_t vm_page_iopl_tainted; +SYSCTL_UINT(_vm, OID_AUTO, iopl_pages_tainted, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_iopl_tainted, 0, ""); + #if (__arm__ || __arm64__) && (DEVELOPMENT || DEBUG) extern int vm_footprint_suspend_allowed; SYSCTL_INT(_vm, OID_AUTO, footprint_suspend_allowed, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_footprint_suspend_allowed, 0, ""); @@ -2425,3 +2608,10 @@ SYSCTL_QUAD(_vm, OID_AUTO, shared_region_pager_reclaimed, extern int pmap_ledgers_panic_leeway; SYSCTL_INT(_vm, OID_AUTO, pmap_ledgers_panic_leeway, CTLFLAG_RW | CTLFLAG_LOCKED, &pmap_ledgers_panic_leeway, 0, ""); #endif /* MACH_ASSERT */ + +extern int vm_protect_privileged_from_untrusted; +SYSCTL_INT(_vm, OID_AUTO, protect_privileged_from_untrusted, + CTLFLAG_RW | CTLFLAG_LOCKED, &vm_protect_privileged_from_untrusted, 0, ""); +extern uint64_t vm_copied_on_read; +SYSCTL_QUAD(_vm, OID_AUTO, copied_on_read, + CTLFLAG_RD | CTLFLAG_LOCKED, &vm_copied_on_read, ""); diff --git a/bsd/vm/vnode_pager.c b/bsd/vm/vnode_pager.c index 436268db2..2fae8525c 100644 --- a/bsd/vm/vnode_pager.c +++ b/bsd/vm/vnode_pager.c @@ -618,6 +618,7 @@ vnode_pagein( */ if ((error = VNOP_PAGEIN(vp, NULL, upl_offset, (off_t)f_offset, size, flags, vfs_context_current()))) { + set_thread_pagein_error(current_thread(), error); result = PAGER_ERROR; error = PAGER_ERROR; } @@ -761,6 +762,7 @@ vnode_pagein( ubc_upl_abort_range(upl, (upl_offset_t) xoff, xsize, UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR); } } + set_thread_pagein_error(current_thread(), error); result = PAGER_ERROR; error = PAGER_ERROR; } diff --git a/config/BSDKernel.exports b/config/BSDKernel.exports index 934486bb8..4e8d858f5 100644 --- a/config/BSDKernel.exports +++ b/config/BSDKernel.exports @@ -93,6 +93,8 @@ _buf_uploffset _buf_valid _buf_vnode _buf_wcred +_bufattr_markioscheduled +_bufattr_ioscheduled _cache_enter _cache_lookup _cache_purge @@ -343,6 +345,9 @@ _kauth_null_guid _kauth_register_scope _kauth_unlisten_scope _kdebug_enable +_kdebug_debugid_enabled +_kdebug_debugid_explicitly_enabled +_kdebug_using_continuous_time _kernel_debug _kernel_debug1 _kernel_debug_filtered @@ -483,12 +488,14 @@ _proc_noremotehang _proc_pgrpid _proc_pid _proc_ppid +_proc_original_ppid _proc_rele _proc_self _proc_selfname _proc_selfpid _proc_selfppid _proc_selfpgrpid +_proc_sessionid _proc_signal _proc_suser _proto_inject @@ -628,6 +635,7 @@ _vflush _vfs_64bitready _vfs_addname _vfs_attr_pack +_vfs_attr_pack_ext _vfs_authcache_ttl _vfs_authopaque _vfs_authopaqueaccess @@ -688,6 +696,7 @@ _vfs_sysctl _vfs_typenum _vfs_unbusy _vfs_unmountbyfsid +_vn_authorize_unlink _vn_bwrite _vn_default_error _vn_getpath @@ -758,6 +767,7 @@ _vnode_vfsmaxsymlen _vnode_vfsname _vnode_vfstypenum _vnode_vid +_vnode_isonexternalstorage _vnode_vtype _vnode_waitforwrites _vnode_writedone @@ -783,6 +793,7 @@ _vnop_listxattr_desc _vnop_lookup_desc _vnop_mkdir_desc _vnop_mknod_desc +_vnop_mmap_check_desc _vnop_mmap_desc _vnop_mnomap_desc _vnop_offtoblk_desc diff --git a/config/IOKit.arm.exports b/config/IOKit.arm.exports index ad89576ca..5d3ed37cc 100644 --- a/config/IOKit.arm.exports +++ b/config/IOKit.arm.exports @@ -109,6 +109,7 @@ __ZN18IOMemoryDescriptor11makeMappingEPS_P4taskjmmm __ZN18IOMemoryDescriptor11withAddressEPvm11IODirection __ZN18IOMemoryDescriptor11withAddressEjm11IODirectionP4task __ZN18IOMemoryDescriptor11withOptionsEPvmmP4taskmP8IOMapper +__ZN18IOMemoryDescriptor12setOwnershipEP4taskim __ZN18IOMemoryDescriptor12setPurgeableEmPm __ZN18IOMemoryDescriptor12withSubRangeEPS_mm11IODirection __ZN18IOMemoryDescriptor13getPageCountsEPmS0_ @@ -169,11 +170,13 @@ __ZN21IONaturalMemoryCursor17withSpecificationEmmm __ZN21IONaturalMemoryCursor21initWithSpecificationEmmm __ZN21IOSubMemoryDescriptor11makeMappingEP18IOMemoryDescriptorP4taskjmmm __ZN21IOSubMemoryDescriptor12initSubRangeEP18IOMemoryDescriptormm11IODirection +__ZN21IOSubMemoryDescriptor12setOwnershipEP4taskim __ZN21IOSubMemoryDescriptor12setPurgeableEmPm __ZN21IOSubMemoryDescriptor12withSubRangeEP18IOMemoryDescriptormmm __ZN21IOSubMemoryDescriptor18getPhysicalSegmentEmPmm __ZN21IOSubMemoryDescriptor7prepareE11IODirection __ZN21IOSubMemoryDescriptor8completeE11IODirection +__ZN23IOMultiMemoryDescriptor12setOwnershipEP4taskim __ZN23IOMultiMemoryDescriptor15withDescriptorsEPP18IOMemoryDescriptorm11IODirectionb __ZN23IOMultiMemoryDescriptor18getPhysicalSegmentEmPmm __ZN23IOMultiMemoryDescriptor19initWithDescriptorsEPP18IOMemoryDescriptorm11IODirectionb @@ -195,6 +198,7 @@ __ZN24IOBufferMemoryDescriptor9setLengthEj __ZN24IOBufferMemoryDescriptor9withBytesEPKvj11IODirectionb __ZN25IOGeneralMemoryDescriptor11setPositionEm __ZN25IOGeneralMemoryDescriptor11wireVirtualE11IODirection +__ZN25IOGeneralMemoryDescriptor12setOwnershipEP4taskim __ZN25IOGeneralMemoryDescriptor12setPurgeableEmPm __ZN25IOGeneralMemoryDescriptor13mapIntoKernelEj __ZN25IOGeneralMemoryDescriptor14initWithRangesEP14IOVirtualRangem11IODirectionP4taskb diff --git a/config/IOKit.arm64.exports b/config/IOKit.arm64.exports index 065a36f0f..85e40f711 100644 --- a/config/IOKit.arm64.exports +++ b/config/IOKit.arm64.exports @@ -102,6 +102,7 @@ __ZN18IOMemoryDescriptor10writeBytesEyPKvy __ZN18IOMemoryDescriptor11makeMappingEPS_P4taskyjyy __ZN18IOMemoryDescriptor11withAddressEPvyj __ZN18IOMemoryDescriptor11withOptionsEPvjjP4taskjP8IOMapper +__ZN18IOMemoryDescriptor12setOwnershipEP4taskij __ZN18IOMemoryDescriptor12setPurgeableEjPj __ZN18IOMemoryDescriptor13getPageCountsEPyS0_ __ZN18IOMemoryDescriptor15initWithOptionsEPvjjP4taskjP8IOMapper @@ -147,11 +148,13 @@ __ZN21IONaturalMemoryCursor17withSpecificationEyyy __ZN21IONaturalMemoryCursor21initWithSpecificationEyyy __ZN21IOSubMemoryDescriptor11makeMappingEP18IOMemoryDescriptorP4taskyjyy __ZN21IOSubMemoryDescriptor12initSubRangeEP18IOMemoryDescriptoryyj +__ZN21IOSubMemoryDescriptor12setOwnershipEP4taskij __ZN21IOSubMemoryDescriptor12setPurgeableEjPj __ZN21IOSubMemoryDescriptor12withSubRangeEP18IOMemoryDescriptoryyj __ZN21IOSubMemoryDescriptor18getPhysicalSegmentEyPyj __ZN21IOSubMemoryDescriptor7prepareEj __ZN21IOSubMemoryDescriptor8completeEj +__ZN23IOMultiMemoryDescriptor12setOwnershipEP4taskij __ZN23IOMultiMemoryDescriptor15withDescriptorsEPP18IOMemoryDescriptorjjb __ZN23IOMultiMemoryDescriptor19initWithDescriptorsEPP18IOMemoryDescriptorjjb __ZN23IOMultiMemoryDescriptor7prepareEj @@ -167,6 +170,7 @@ __ZN24IOBufferMemoryDescriptor22inTaskWithPhysicalMaskEP4taskjyy __ZN24IOBufferMemoryDescriptor9setLengthEm __ZN24IOBufferMemoryDescriptor9withBytesEPKvmjb __ZN25IOGeneralMemoryDescriptor11wireVirtualEj +__ZN25IOGeneralMemoryDescriptor12setOwnershipEP4taskij __ZN25IOGeneralMemoryDescriptor12setPurgeableEjPj __ZN25IOGeneralMemoryDescriptor15initWithOptionsEPvjjP4taskjP8IOMapper __ZN25IOGeneralMemoryDescriptor18getPhysicalSegmentEyPyj @@ -230,3 +234,4 @@ __ZNK18IOMemoryDescriptor19dmaCommandOperationEjPvj __ZNK25IOGeneralMemoryDescriptor19dmaCommandOperationEjPvj __ZN9IOService23addMatchingNotificationEPK8OSSymbolP12OSDictionaryiU13block_pointerFbPS_P10IONotifierE + diff --git a/config/IOKit.exports b/config/IOKit.exports index c55892377..0010db9f1 100644 --- a/config/IOKit.exports +++ b/config/IOKit.exports @@ -1,3 +1,13 @@ +_IORPCMessageFromMach +__ZN12IOUserClient8DispatchE5IORPC + +__ZN16IODispatchSource23SetEnableWithCompletionEbU13block_pointerFvvEPFiP15OSMetaClassBase5IORPCE +__ZN16IODispatchSource9SetEnableEbPFiP15OSMetaClassBase5IORPCE + +__ZN22IOInterruptEventSource27getPimaryInterruptTimestampEv +__ZN22IOInterruptEventSource31enablePrimaryInterruptTimestampEb + +__ZN14IOPMrootDomain11setWakeTimeEy _IOAlignmentToSize _IOBSDNameMatching _IOBSDRegistryEntryForDeviceTree @@ -36,6 +46,7 @@ _IOMalloc _IOMallocAligned _IOMallocContiguous _IOMallocPageable +_IOMallocZero _IOMappedRead16 _IOMappedRead32 _IOMappedRead64 @@ -71,6 +82,7 @@ _IOSimpleLockAlloc _IOSimpleLockFree _IOSimpleLockGetMachLock _IOSimpleLockInit +_IOSimpleLockDestroy _IOSimpleLockLock:_lck_spin_lock _IOSimpleLockTryLock:_lck_spin_try_lock _IOSimpleLockUnlock:_lck_spin_unlock @@ -97,7 +109,6 @@ _PE_cpu_start _PE_enter_debugger _PE_halt_restart _PE_parse_boot_argn -_PE_poll_input _StartIOKit __Z17IODTMapInterruptsP15IORegistryEntry __Z17IODeviceTreeAllocPv @@ -107,24 +118,6 @@ __Z19printDictionaryKeysP12OSDictionaryPc __Z20IODTMatchNubWithKeysP15IORegistryEntryPKc __Z21IODTResolveAddressingP15IORegistryEntryPKcP14IODeviceMemory __Z27IODTInterruptControllerNameP15IORegistryEntry -__ZN10IOMachPort10gMetaClassE -__ZN10IOMachPort10superClassE -__ZN10IOMachPort11dictForTypeEj -__ZN10IOMachPort13portForObjectEP8OSObjectj -__ZN10IOMachPort14setHoldDestroyEP8OSObjectj -__ZN10IOMachPort20makeSendRightForTaskEP4taskP8OSObjectj -__ZN10IOMachPort20releasePortForObjectEP8OSObjectj -__ZN10IOMachPort22noMoreSendersForObjectEP8OSObjectjPj -__ZN10IOMachPort4freeEv -__ZN10IOMachPort9MetaClassC1Ev -__ZN10IOMachPort9MetaClassC2Ev -__ZN10IOMachPort9metaClassE -__ZN10IOMachPortC1EPK11OSMetaClass -__ZN10IOMachPortC1Ev -__ZN10IOMachPortC2EPK11OSMetaClass -__ZN10IOMachPortC2Ev -__ZN10IOMachPortD0Ev -__ZN10IOMachPortD2Ev __ZN10IONotifier10gMetaClassE __ZN10IONotifier10superClassE __ZN10IONotifier9MetaClassC1Ev @@ -158,29 +151,6 @@ __ZN10IOWorkLoopC2EPK11OSMetaClass __ZN10IOWorkLoopC2Ev __ZN10IOWorkLoopD0Ev __ZN10IOWorkLoopD2Ev -__ZN11IOCatalogue10addDriversEP7OSArrayb -__ZN11IOCatalogue10gMetaClassE -__ZN11IOCatalogue10initializeEv -__ZN11IOCatalogue10superClassE -__ZN11IOCatalogue13removeDriversEP12OSDictionaryb -__ZN11IOCatalogue13startMatchingEP12OSDictionary -__ZN11IOCatalogue15moduleHasLoadedEP8OSString -__ZN11IOCatalogue15moduleHasLoadedEPKc -__ZN11IOCatalogue16terminateDriversEP12OSDictionary -__ZN11IOCatalogue25terminateDriversForModuleEP8OSStringb -__ZN11IOCatalogue25terminateDriversForModuleEPKcb -__ZN11IOCatalogue4freeEv -__ZN11IOCatalogue4initEP7OSArray -__ZN11IOCatalogue5resetEv -__ZN11IOCatalogue9MetaClassC1Ev -__ZN11IOCatalogue9MetaClassC2Ev -__ZN11IOCatalogue9metaClassE -__ZN11IOCatalogueC1EPK11OSMetaClass -__ZN11IOCatalogueC1Ev -__ZN11IOCatalogueC2EPK11OSMetaClass -__ZN11IOCatalogueC2Ev -__ZN11IOCatalogueD0Ev -__ZN11IOCatalogueD2Ev __ZN11IODataQueue10gMetaClassE __ZN11IODataQueue10superClassE __ZN11IODataQueue19getMemoryDescriptorEv @@ -701,6 +671,8 @@ __ZN18IOMemoryDescriptor9MetaClassC2Ev __ZN18IOMemoryDescriptor9metaClassE __ZN18IOMemoryDescriptorC2EPK11OSMetaClass __ZN18IOMemoryDescriptorD2Ev +__ZN18IOMemoryDescriptor8getVMTagEP7_vm_map +__ZN18IOMemoryDescriptor9setVMTagsEjj __ZN18IORegistryIterator10enterEntryEPK15IORegistryPlane __ZN18IORegistryIterator10enterEntryEv __ZN18IORegistryIterator10gMetaClassE @@ -1134,8 +1106,6 @@ __ZN9IOServiceC2EPK11OSMetaClass __ZN9IOServiceC2Ev __ZN9IOServiceD0Ev __ZN9IOServiceD2Ev -__ZNK10IOMachPort12getMetaClassEv -__ZNK10IOMachPort9MetaClass5allocEv __ZNK10IONotifier12getMetaClassEv __ZNK10IONotifier9MetaClass5allocEv __ZNK10IOWorkLoop12getMetaClassEv @@ -1149,9 +1119,6 @@ __ZNK10IOWorkLoop9MetaClass5allocEv __ZNK10IOWorkLoop9getThreadEv __ZNK11IOCatalogue12getMetaClassEv __ZNK11IOCatalogue12unloadModuleEP8OSString -__ZNK11IOCatalogue14isModuleLoadedEP12OSDictionary -__ZNK11IOCatalogue14isModuleLoadedEP8OSString -__ZNK11IOCatalogue14isModuleLoadedEPKc __ZNK11IOCatalogue18getGenerationCountEv __ZNK11IOCatalogue9MetaClass5allocEv __ZNK11IOCatalogue9serializeEP11OSSerialize @@ -1314,7 +1281,6 @@ __ZNK9IOService6isOpenEPKS_ __ZNK9IOService8getStateEv __ZNK9IOService9MetaClass5allocEv __ZNK9IOService9getClientEv -__ZTV10IOMachPort __ZTV10IONotifier __ZTV10IOWorkLoop __ZTV11IOCatalogue @@ -1366,7 +1332,6 @@ __ZTV29IOInterleavedMemoryDescriptor __ZTV8IOMapper __ZTV9IOCommand __ZTV9IOService -__ZTVN10IOMachPort9MetaClassE __ZTVN10IONotifier9MetaClassE __ZTVN10IOWorkLoop9MetaClassE __ZTVN11IOCatalogue9MetaClassE @@ -1667,3 +1632,94 @@ __ZN9IOService22registerInterruptBlockEiP8OSObjectU13block_pointerFvPS_iE __ZNK13IOEventSource14getActionBlockEU13block_pointerFivE __ZN13IOEventSource9setRefconEPv __ZNK13IOEventSource9getRefconEv + +__ZN8OSAction17SetAbortedHandlerEU13block_pointerFvvE + +__ZN15IODispatchQueue9metaClassE +__ZN16IODispatchSource9metaClassE +__ZN25IOInterruptDispatchSource9metaClassE +__ZN9IOService5StartEPS_PFiP15OSMetaClassBase5IORPCE + +__ZN25IODataQueueDispatchSource10CopyMemoryEPP18IOMemoryDescriptorPFiP15OSMetaClassBase5IORPCE +__ZN25IODataQueueDispatchSource10gMetaClassE +__ZN25IODataQueueDispatchSource10superClassE +__ZN25IODataQueueDispatchSource12DataServicedEP8OSActionPFiP15OSMetaClassBase5IORPCE +__ZN25IODataQueueDispatchSource13DataAvailableEP8OSActionPFiP15OSMetaClassBase5IORPCE +__ZN25IODataQueueDispatchSource15IsDataAvailableEv +__ZN25IODataQueueDispatchSource16SendDataServicedEv +__ZN25IODataQueueDispatchSource17SendDataAvailableEv +__ZN25IODataQueueDispatchSource22SetDataServicedHandlerEP8OSActionPFiP15OSMetaClassBase5IORPCE +__ZN25IODataQueueDispatchSource23CopyDataServicedHandlerEPP8OSActionPFiP15OSMetaClassBase5IORPCE +__ZN25IODataQueueDispatchSource23SetDataAvailableHandlerEP8OSActionPFiP15OSMetaClassBase5IORPCE +__ZN25IODataQueueDispatchSource24CopyDataAvailableHandlerEPP8OSActionPFiP15OSMetaClassBase5IORPCE +__ZN25IODataQueueDispatchSource4PeekEU13block_pointerFvPKvmE +__ZN25IODataQueueDispatchSource4freeEv +__ZN25IODataQueueDispatchSource4initEv +__ZN25IODataQueueDispatchSource6CreateEyP15IODispatchQueuePPS_ +__ZN25IODataQueueDispatchSource7DequeueEU13block_pointerFvPKvmE +__ZN25IODataQueueDispatchSource7EnqueueEjU13block_pointerFvPvmE +__ZN25IODataQueueDispatchSource8DispatchE5IORPC +__ZN25IODataQueueDispatchSource9MetaClass8DispatchE5IORPC +__ZN25IODataQueueDispatchSource9MetaClassC1Ev +__ZN25IODataQueueDispatchSource9MetaClassC2Ev +__ZN25IODataQueueDispatchSource9_DispatchEPS_5IORPC +__ZN25IODataQueueDispatchSource9metaClassE +__ZN25IODataQueueDispatchSourceC1EPK11OSMetaClass +__ZN25IODataQueueDispatchSourceC1Ev +__ZN25IODataQueueDispatchSourceC2EPK11OSMetaClass +__ZN25IODataQueueDispatchSourceC2Ev +__ZN25IODataQueueDispatchSourceD0Ev +__ZN25IODataQueueDispatchSourceD1Ev +__ZN25IODataQueueDispatchSourceD2Ev +__ZNK25IODataQueueDispatchSource12getMetaClassEv +__ZNK25IODataQueueDispatchSource9MetaClass5allocEv +__ZTV25IODataQueueDispatchSource +__ZTVN25IODataQueueDispatchSource9MetaClassE +__ZN25IODataQueueDispatchSource19DequeueWithCoalesceEPbU13block_pointerFvPKvmE +__ZN25IODataQueueDispatchSource19EnqueueWithCoalesceEjPbU13block_pointerFvPvmE + +__ZN11IOMemoryMap17_CopyState_InvokeE5IORPCP15OSMetaClassBasePFiS2_P23IOMemoryMapPrivateStateE +__ZN12IOUserClient22AsyncCompletion_InvokeE5IORPCP15OSMetaClassBasePFvS2_P8OSActioniPKyjE +__ZN12IOUserClient22_ExternalMethod_InvokeE5IORPCP15OSMetaClassBasePFiS2_yPKyjP6OSDataP18IOMemoryDescriptorPyPjyPS6_S8_P8OSActionE +__ZN12IOUserClient30CopyClientMemoryForType_InvokeE5IORPCP15OSMetaClassBasePFiS2_yPyPP18IOMemoryDescriptorE +__ZN12IOUserServer11Exit_InvokeE5IORPCP15OSMetaClassBasePFiS2_PKcE +__ZN12IOUserServer13Create_InvokeE5IORPCPFiPKcyyPPS_E +__ZN12IOUserServer17LoadModule_InvokeE5IORPCP15OSMetaClassBasePFiS2_PKcE +__ZN15IODispatchQueue13Create_InvokeE5IORPCPFiPKcyyPPS_E +__ZN15IODispatchQueue14SetPort_InvokeE5IORPCP15OSMetaClassBasePFiS2_P8ipc_portE +__ZN16IODispatchSource13Cancel_InvokeE5IORPCP15OSMetaClassBasePFiS2_U13block_pointerFvvEE +__ZN16IODispatchSource16SetEnable_InvokeE5IORPCP15OSMetaClassBasePFiS2_bE +__ZN16IODispatchSource19CheckForWork_InvokeE5IORPCP15OSMetaClassBasePFiS2_S0_bE +__ZN16IODispatchSource30SetEnableWithCompletion_InvokeE5IORPCP15OSMetaClassBasePFiS2_bU13block_pointerFvvEE +__ZN18IOMemoryDescriptor17_CopyState_InvokeE5IORPCP15OSMetaClassBasePFiS2_P16IOMDPrivateStateE +__ZN18IOMemoryDescriptor20PrepareForDMA_InvokeE5IORPCP15OSMetaClassBasePFiS2_yP9IOServiceyyPyS5_PjP16IOAddressSegmentE +__ZN24IOBufferMemoryDescriptor13Create_InvokeE5IORPCPFiyyyPPS_E +__ZN24IOBufferMemoryDescriptor16SetLength_InvokeE5IORPCP15OSMetaClassBasePFiS2_yE +__ZN25IODataQueueDispatchSource13Create_InvokeE5IORPCPFiyP15IODispatchQueuePPS_E +__ZN25IODataQueueDispatchSource17CopyMemory_InvokeE5IORPCP15OSMetaClassBasePFiS2_PP18IOMemoryDescriptorE +__ZN25IODataQueueDispatchSource19DataServiced_InvokeE5IORPCP15OSMetaClassBasePFvS2_P8OSActionE +__ZN25IODataQueueDispatchSource20DataAvailable_InvokeE5IORPCP15OSMetaClassBasePFvS2_P8OSActionE +__ZN25IODataQueueDispatchSource29SetDataServicedHandler_InvokeE5IORPCP15OSMetaClassBasePFiS2_P8OSActionE +__ZN25IODataQueueDispatchSource30CopyDataServicedHandler_InvokeE5IORPCP15OSMetaClassBasePFiS2_PP8OSActionE +__ZN25IODataQueueDispatchSource30SetDataAvailableHandler_InvokeE5IORPCP15OSMetaClassBasePFiS2_P8OSActionE +__ZN25IODataQueueDispatchSource31CopyDataAvailableHandler_InvokeE5IORPCP15OSMetaClassBasePFiS2_PP8OSActionE +__ZN25IOInterruptDispatchSource13Create_InvokeE5IORPCPFiP9IOServicejP15IODispatchQueuePPS_E +__ZN25IOInterruptDispatchSource17SetHandler_InvokeE5IORPCP15OSMetaClassBasePFiS2_P8OSActionE +__ZN25IOInterruptDispatchSource24InterruptOccurred_InvokeE5IORPCP15OSMetaClassBasePFvS2_P8OSActionyyE +__ZN8OSAction13Create_InvokeE5IORPCPFiP8OSObjectyymPPS_E +__ZN8OSObject23SetDispatchQueue_InvokeE5IORPCP15OSMetaClassBasePFiS2_PKcP15IODispatchQueueE +__ZN8OSObject24CopyDispatchQueue_InvokeE5IORPCP15OSMetaClassBasePFiS2_PKcPP15IODispatchQueueE +__ZN9IOService11Stop_InvokeE5IORPCP15OSMetaClassBasePFiS2_PS_E +__ZN9IOService12Start_InvokeE5IORPCP15OSMetaClassBasePFiS2_PS_E +__ZN9IOService13Create_InvokeE5IORPCP15OSMetaClassBasePFiS2_PS_PKcPS3_E +__ZN9IOService14SetName_InvokeE5IORPCP15OSMetaClassBasePFiS2_PKcE +__ZN9IOService20NewUserClient_InvokeE5IORPCP15OSMetaClassBasePFiS2_jPP12IOUserClientE +__ZN9IOService20SetPowerState_InvokeE5IORPCP15OSMetaClassBasePFiS2_jE +__ZN9IOService20SetProperties_InvokeE5IORPCP15OSMetaClassBasePFiS2_P12OSDictionaryE +__ZN9IOService21CopyProperties_InvokeE5IORPCP15OSMetaClassBasePFiS2_PP12OSDictionaryE +__ZN9IOService21SearchProperty_InvokeE5IORPCP15OSMetaClassBasePFiS2_PKcS4_yPP8OSObjectE +__ZN9IOService22RegisterService_InvokeE5IORPCP15OSMetaClassBasePFiS2_E +__ZN9IOService23ChangePowerState_InvokeE5IORPCP15OSMetaClassBasePFiS2_jE +__ZN9IOService25GetRegistryEntryID_InvokeE5IORPCP15OSMetaClassBasePFiS2_PyE + +__ZN18IOMemoryDescriptor20CreateMapping_InvokeE5IORPCP15OSMetaClassBasePFiS2_yyyyyPP11IOMemoryMapE diff --git a/config/IOKit.x86_64.exports b/config/IOKit.x86_64.exports index d53a169a5..721f17eb7 100644 --- a/config/IOKit.x86_64.exports +++ b/config/IOKit.x86_64.exports @@ -1,3 +1,7 @@ +__ZN11IOCatalogue10addDriversEP7OSArrayb +__ZN11IOCatalogue13removeDriversEP12OSDictionaryb +__ZN11IOCatalogue13startMatchingEP12OSDictionary + _IOLockSleep_darwin14 _IOLockSleepDeadline_darwin14 _IOLockWakeup_darwin14 @@ -225,6 +229,7 @@ __ZN18IOMemoryDescriptor10writeBytesEyPKvy __ZN18IOMemoryDescriptor11makeMappingEPS_P4taskyjyy __ZN18IOMemoryDescriptor11withAddressEPvyj __ZN18IOMemoryDescriptor11withOptionsEPvjjP4taskjP8IOMapper +__ZN18IOMemoryDescriptor12setOwnershipEP4taskij __ZN18IOMemoryDescriptor12setPurgeableEjPj __ZN18IOMemoryDescriptor13getPageCountsEPyS0_ __ZN18IOMemoryDescriptor15initWithOptionsEPvjjP4taskjP8IOMapper @@ -296,6 +301,7 @@ __ZN21IONaturalMemoryCursor17withSpecificationEyyy __ZN21IONaturalMemoryCursor21initWithSpecificationEyyy __ZN21IOSubMemoryDescriptor11makeMappingEP18IOMemoryDescriptorP4taskyjyy __ZN21IOSubMemoryDescriptor12initSubRangeEP18IOMemoryDescriptoryyj +__ZN21IOSubMemoryDescriptor12setOwnershipEP4taskij __ZN21IOSubMemoryDescriptor12setPurgeableEjPj __ZN21IOSubMemoryDescriptor12withSubRangeEP18IOMemoryDescriptoryyj __ZN21IOSubMemoryDescriptor18getPhysicalSegmentEyPyj @@ -309,6 +315,7 @@ __ZN22IOInterruptEventSource32_RESERVEDIOInterruptEventSource4Ev __ZN22IOInterruptEventSource32_RESERVEDIOInterruptEventSource5Ev __ZN22IOInterruptEventSource32_RESERVEDIOInterruptEventSource6Ev __ZN22IOInterruptEventSource32_RESERVEDIOInterruptEventSource7Ev +__ZN23IOMultiMemoryDescriptor12setOwnershipEP4taskij __ZN23IOMultiMemoryDescriptor15withDescriptorsEPP18IOMemoryDescriptorjjb __ZN23IOMultiMemoryDescriptor19initWithDescriptorsEPP18IOMemoryDescriptorjjb __ZN23IOMultiMemoryDescriptor7prepareEj @@ -340,6 +347,7 @@ __ZN24IOBufferMemoryDescriptor35_RESERVEDIOBufferMemoryDescriptor15Ev __ZN24IOBufferMemoryDescriptor9setLengthEm __ZN24IOBufferMemoryDescriptor9withBytesEPKvmjb __ZN25IOGeneralMemoryDescriptor11wireVirtualEj +__ZN25IOGeneralMemoryDescriptor12setOwnershipEP4taskij __ZN25IOGeneralMemoryDescriptor12setPurgeableEjPj __ZN25IOGeneralMemoryDescriptor15initWithOptionsEPvjjP4taskjP8IOMapper __ZN25IOGeneralMemoryDescriptor18getPhysicalSegmentEyPyj diff --git a/config/Libkern.arm.exports b/config/Libkern.arm.exports index ab47a9396..0be20457d 100644 --- a/config/Libkern.arm.exports +++ b/config/Libkern.arm.exports @@ -1,5 +1,5 @@ _OSAddAtomic64 _OSCompareAndSwap64 -__ZN15OSMetaClassBase9_ptmf2ptfEPKS_MS_FvvE +__ZN15OSMetaClassBase9_ptmf2ptfEPKS_MS_FvvEm __ZN12OSOrderedSet12withCapacityEjPFlPK15OSMetaClassBaseS2_PvES3_ __ZN12OSOrderedSet16initWithCapacityEjPFlPK15OSMetaClassBaseS2_PvES3_ diff --git a/config/Libkern.arm64.exports b/config/Libkern.arm64.exports index 40f33219b..d2575ff18 100644 --- a/config/Libkern.arm64.exports +++ b/config/Libkern.arm64.exports @@ -1,6 +1,6 @@ _OSAddAtomic64 _OSCompareAndSwap64 _PAGE_SHIFT_CONST -__ZN15OSMetaClassBase9_ptmf2ptfEPKS_MS_FvvE +__ZN15OSMetaClassBase9_ptmf2ptfEPKS_MS_FvvEm __ZN12OSOrderedSet12withCapacityEjPFiPK15OSMetaClassBaseS2_PvES3_ __ZN12OSOrderedSet16initWithCapacityEjPFiPK15OSMetaClassBaseS2_PvES3_ diff --git a/config/Libkern.exports b/config/Libkern.exports index e5f047928..735ea69c3 100644 --- a/config/Libkern.exports +++ b/config/Libkern.exports @@ -56,6 +56,8 @@ _SHA512_Init _SHA512_Update _STRDUP __Z13OSUnserializePKcPP8OSString +__Z13OSUnserializePKcPN2os9smart_ptrI8OSString15osobject_policyEE +__Z16OSUnserializeXMLPKcPN2os9smart_ptrI8OSString15osobject_policyEE __Z16OSUnserializeXMLPKcPP8OSString __Z16OSUnserializeXMLPKcmPP8OSString __ZN10OSIterator10gMetaClassE @@ -88,6 +90,7 @@ __ZN11OSMetaClassD2Ev __ZN11OSMetaClassdlEPvm __ZN11OSMetaClassnwEm __ZN11OSSerialize10gMetaClassE +__ZN11OSSerialize10setIndexedEb __ZN11OSSerialize10superClassE __ZN11OSSerialize12addXMLEndTagEPKc __ZN11OSSerialize12withCapacityEj @@ -697,12 +700,14 @@ _os_log_debug_enabled _os_log_info_enabled _os_release _os_retain -_os_ref_init_count -_os_ref_retain -_os_ref_release_explicit -_os_ref_retain_try -_os_ref_retain_locked -_os_ref_release_locked +_os_ref_init_count_external:_os_ref_init_count_internal +_os_ref_release_barrier_external:_os_ref_release_barrier_internal +_os_ref_release_external:_os_ref_release_internal +_os_ref_release_locked_external:_os_ref_release_locked_internal +_os_ref_release_relaxed_external:_os_ref_release_relaxed_internal +_os_ref_retain_external:_os_ref_retain_internal +_os_ref_retain_locked_external:_os_ref_retain_locked_internal +_os_ref_retain_try_external:_os_ref_retain_try_internal _osrelease _ostype _page_mask @@ -765,6 +770,34 @@ __NSConcreteGlobalBlock __NSConcreteMallocBlock __NSConcreteStackBlock __NSConcreteWeakBlockVariable +__ZN12OSCollection14iterateObjectsEPvPFbS0_P8OSObjectE __ZN12OSCollection14iterateObjectsEU13block_pointerFbP8OSObjectE +__ZN12OSDictionary14iterateObjectsEPvPFbS0_PK8OSSymbolP8OSObjectE __ZN12OSDictionary14iterateObjectsEU13block_pointerFbPK8OSSymbolP8OSObjectE __ZN12OSSerializer9withBlockEU13block_pointerFbP11OSSerializeE + +__ZN15IODispatchQueue8DispatchE5IORPC +__ZN15IODispatchQueue9MetaClass8DispatchE5IORPC +__ZN15OSMetaClassBase8DispatchE5IORPC +__ZN15OSUserMetaClass8DispatchE5IORPC +__ZN16IODispatchSource8DispatchE5IORPC +__ZN16IODispatchSource9MetaClass8DispatchE5IORPC +__ZN18IOMemoryDescriptor8DispatchE5IORPC +__ZN18IOMemoryDescriptor9MetaClass8DispatchE5IORPC +__ZN24IOBufferMemoryDescriptor8DispatchE5IORPC +__ZN24IOBufferMemoryDescriptor9MetaClass8DispatchE5IORPC +__ZN25IOInterruptDispatchSource8DispatchE5IORPC +__ZN25IOInterruptDispatchSource9MetaClass8DispatchE5IORPC +__ZN8OSAction8DispatchE5IORPC +__ZN8OSAction9MetaClass8DispatchE5IORPC +__ZN8OSObject8DispatchE5IORPC +__ZN8OSObject9MetaClass8DispatchE5IORPC +__ZN9IOService8DispatchE5IORPC +__ZN9IOService9MetaClass8DispatchE5IORPC +__ZN8OSAction9metaClassE +__ZN15OSMetaClassBase6InvokeE5IORPC +__ZN8OSObject9_DispatchEPS_5IORPC +__ZN9IOService9_DispatchEPS_5IORPC + +__ZN8OSAction12GetReferenceEv +__ZN8OSAction6CreateEP8OSObjectyymPPS_ diff --git a/config/Libkern.x86_64.exports b/config/Libkern.x86_64.exports index 9ea8e005a..48690ba94 100644 --- a/config/Libkern.x86_64.exports +++ b/config/Libkern.x86_64.exports @@ -44,7 +44,6 @@ __ZN12OSOrderedSet22_RESERVEDOSOrderedSet4Ev __ZN12OSOrderedSet22_RESERVEDOSOrderedSet5Ev __ZN12OSOrderedSet22_RESERVEDOSOrderedSet6Ev __ZN12OSOrderedSet22_RESERVEDOSOrderedSet7Ev -__ZN15OSMetaClassBase25_RESERVEDOSMetaClassBase3Ev __ZN15OSMetaClassBase25_RESERVEDOSMetaClassBase4Ev __ZN15OSMetaClassBase25_RESERVEDOSMetaClassBase5Ev __ZN15OSMetaClassBase25_RESERVEDOSMetaClassBase6Ev diff --git a/config/MACFramework.exports b/config/MACFramework.exports index e594b265f..e274ed3e1 100644 --- a/config/MACFramework.exports +++ b/config/MACFramework.exports @@ -18,10 +18,16 @@ _sbuf_cat _sbuf_data _sbuf_delete _sbuf_finish +_sbuf_len _sbuf_new +_sbuf_overflowed _sbuf_printf _sbuf_putc +_sbuf_vprintf _strsep _sysctl__security_mac_children _VNOP_SETXATTR _VNOP_GETXATTR +_mac_vnode_label_allocate +_mac_vnode_label_get +_mac_vnode_label_set diff --git a/config/MASTER b/config/MASTER index a4b109d11..f0900b345 100644 --- a/config/MASTER +++ b/config/MASTER @@ -5,7 +5,7 @@ # # All rights reserved. The CMU software License Agreement # specifies the terms and conditions for use and redistribution. -# +# ####################################################################### # # Master machine independent configuration file. @@ -48,7 +48,7 @@ # medium = medium scale system configuration # small = small scale system configuration # xsmall = extra small scale system configuration -# bsmall = special extra small scale system configuration +# bsmall = special extra small scale system configuration # ####################################################################### # @@ -68,13 +68,14 @@ options MACH_NP # Mach IPC support # options MACH_NBC # No buffer cache # options MACH_NET # Fast network access # options MACH_XP # external pager support # -options NO_DIRECT_RPC # for untyped mig servers # +options NO_DIRECT_RPC # for untyped mig servers # options LOOP # loopback support # options VLAN # # +options SIXLOWPAN # 6LoWPAN support # options BOND # # options IF_FAKE # # +options IF_HEADLESS # # options AH_ALL_CRYPTO # AH all crypto algs # -options IPCOMP_ZLIB # IP compression using zlib # options PF # Packet Filter # options PF_ECN # PF use ECN marking # options PFLOG # PF log interface # @@ -96,30 +97,32 @@ options FLOW_DIVERT # options NECP # options CONTENT_FILTER # # options PACKET_MANGLER # # - +options SIXLOWPAN # # # secure_kernel - secure kernel from user programs -options SECURE_KERNEL # +options SECURE_KERNEL # options OLD_SEMWAIT_SIGNAL # old semwait_signal handler # -# 4.4 general kernel +# 4.4 general kernel # options SOCKETS # socket support # options DIAGNOSTIC # diagnostics # -options GPROF # build profiling # options PROFILE # kernel profiling # options SENDFILE # sendfile # options NETWORKING # networking layer # options CONFIG_FSE # file system events # options CONFIG_IMAGEBOOT # local image boot # +options CONFIG_LOCKERBOOT # locker boot # options CONFIG_MBUF_JUMBO # jumbo cluster pool # +options CONFIG_IMAGEBOOT_IMG4 # authenticate image with AppleImage4 # +options CONFIG_IMAGEBOOT_CHUNKLIST # authenticate image with a chunk list # options CONFIG_WORKQUEUE # options CONFIG_WORKLOOP_DEBUG # # -# 4.4 filesystems +# 4.4 filesystems # options MOCKFS # Boot from an executable # options FIFO # fifo support # @@ -143,21 +146,28 @@ options CONFIG_EXT_RESOLVER # e.g. memberd # options CONFIG_SEARCHFS # searchfs syscall support # options CONFIG_MNT_SUID # allow suid binaries # options CONFIG_MNT_ROOTSNAP # allow rooting from snapshot # +options CONFIG_ROSV_STARTUP # allow read-only system volume startup # +options CONFIG_FIRMLINKS # support "firmlinks" # +options CONFIG_MOUNT_VM # mount VM volume on startup # +options CONFIG_DATALESS_FILES # support dataless file materialization # # # NFS support # options NFSCLIENT # Be an NFS client # options NFSSERVER # Be an NFS server # +options CONFIG_NFS_GSS # Support NFS GSSAPI # +options CONFIG_NFS4 # Use NFSv4 # +options CONFIG_NETBOOT # network booting (requires NFSCLIENT) # # # Machine Independent Apple Features # profile # build a profiling kernel # -# +# # IPv6 Support -# +# options "INET6" # kernel IPv6 Support # options IPV6SEND # Secure Neighbor Discovery # options IPSEC # IP security # @@ -177,26 +187,27 @@ options ENCRYPTED_SWAP # options CONFIG_IMG4 # options ZLIB # inflate/deflate support # +options ZLIBC # inflate/deflate support # options IF_BRIDGE # # -# configurable kernel event related resources +# configurable kernel event related resources # options CONFIG_KN_HASHSIZE=64 # options CONFIG_KN_HASHSIZE=48 # options CONFIG_KN_HASHSIZE=20 # # -# configurable vfs related resources -# CONFIG_VNODES - used to pre allocate vnode related resources +# configurable vfs related resources +# CONFIG_VNODES - used to pre allocate vnode related resources # CONFIG_NC_HASH - name cache hash table allocation # CONFIG_VFS_NAMES - name strings # -# 263168 magic number for medium CONFIG_VNODES is based on memory -# Number vnodes is (memsize/64k) + 1024 +# 263168 magic number for medium CONFIG_VNODES is based on memory +# Number vnodes is (memsize/64k) + 1024 # This is the calculation that is used by launchd in tiger -# we are clipping the max based on 16G +# we are clipping the max based on 16G # ie ((16*1024*1024*1024)/(64 *1024)) + 1024 = 263168; options CONFIG_VNODES=263168 # @@ -218,7 +229,7 @@ options CONFIG_MAX_CLUSTERS=8 # options CONFIG_MAX_CLUSTERS=4 # # -# configurable options for minumum number of buffers for kernel memory +# configurable options for minumum number of buffers for kernel memory # options CONFIG_MIN_NBUF=256 # options CONFIG_MIN_NBUF=128 # @@ -249,7 +260,7 @@ options CONFIG_ICMP_BANDLIM=250 # options CONFIG_ICMP_BANDLIM=50 # # -# configurable async IO options +# configurable async IO options # CONFIG_AIO_MAX - system wide limit of async IO requests. # CONFIG_AIO_PROCESS_MAX - process limit of async IO requests. # CONFIG_AIO_THREAD_COUNT - number of async IO worker threads created. @@ -317,7 +328,7 @@ options CONFIG_EMBEDDED # # options CONFIG_DYNAMIC_CODE_SIGNING # -# enforce library validation on all processes. +# enforce library validation on all processes. # options CONFIG_ENFORCE_LIBRARY_VALIDATION # @@ -334,6 +345,11 @@ options CONFIG_PROTECT # #allow write-protection of key page options CONFIG_KEYPAGE_WP # +# +# allow vm_pageout_scan to dynamically adjust its priority based on priorities of waiters +# +options CONFIG_VPS_DYNAMIC_PRIO # + # # enable per-process memory priority tracking # @@ -408,7 +424,7 @@ options CONFIG_PROC_UUID_POLICY # # # ECC data logging -# +# options CONFIG_ECC_LOGGING # # @@ -526,6 +542,7 @@ options CONFIG_MACF_SOCKET_SUBSET # MAC socket subest (no labels) # +options CONFIG_ARCADE # Arcade validation support # # # MACH configuration options. @@ -582,20 +599,12 @@ options CONFIG_TASK_ZONE_INFO # # available when the kernel is being debugged. # options CONFIG_DEBUGGER_FOR_ZONE_INFO # -# -# XPR_DEBUG enables the gathering of data through the XPR macros inserted -# into various subsystems. This option is normally only enabled for -# specific performance or behavior studies, as the overhead in both -# code and data space is large. The data is normally retrieved through -# the kernel debugger (kdb) or by reading /dev/kmem. -# -options XPR_DEBUG # # -# +# # MACH_LDEBUG controls the internal consistency checks and # data gathering in the locking package. This also enables a debug-only # version of simple-locks on uniprocessor machines. The code size and # performance impact of this option is significant. -# +# options MACH_LDEBUG # # # @@ -621,7 +630,7 @@ options KPC # options PGO # # MACH_COUNTERS enables code that handles various counters in the system. -# +# options MACH_COUNTERS # # # DEVELOPMENT define for development builds @@ -629,6 +638,7 @@ options DEVELOPMENT # dev kernel # # DEBUG kernel options DEBUG # general debugging code # +options CONFIG_NONFATAL_ASSERTS # non fatal asserts # ########################################################## # @@ -653,7 +663,7 @@ options MACH_BSD # BSD subsystem on top of Mach # options IOKIT # # # -# configurable kernel related resources (CONFIG_THREAD_MAX needs to stay in +# configurable kernel related resources (CONFIG_THREAD_MAX needs to stay in # sync with bsd/conf/MASTER until we fix the config system... todo XXX # options CONFIG_THREAD_MAX=2560 # @@ -669,8 +679,8 @@ options CONFIG_TASK_MAX=512 # # options CONFIG_ZONE_MAP_MIN=120586240 # -# Sizes must be a power of two for the zhash to -# be able to just mask off bits instead of mod +# Sizes must be a power of two for the zhash to +# be able to just mask off bits instead of mod options CONFIG_ZLEAK_ALLOCATION_MAP_NUM=16384 # options CONFIG_ZLEAK_ALLOCATION_MAP_NUM=8192 # options CONFIG_ZLEAK_TRACE_MAP_NUM=8192 # @@ -688,6 +698,7 @@ options CONFIG_SCHED_GRRR # options CONFIG_SCHED_GRRR_CORE # options CONFIG_SCHED_MULTIQ # options CONFIG_SCHED_TIMESHARE_CORE # +options CONFIG_CLUTCH # options CONFIG_SCHED_IDLE_IN_PLACE # options CONFIG_SCHED_SFI # @@ -754,7 +765,7 @@ options CONFIG_REQUIRES_U32_MUNGING # incoming U32 argument structures must be options COPYOUT_SHIM # Shim for copyout memory analysis via kext # # -# Enable hardware correlation of mach absolute time +# Enable hardware correlation of mach absolute time # across intel/arm boundary options CONFIG_MACH_BRIDGE_SEND_TIME # # options CONFIG_MACH_BRIDGE_RECV_TIME # # @@ -766,3 +777,11 @@ options CONFIG_32BIT_TELEMETRY # # options CONFIG_QUIESCE_COUNTER # Support for _COMM_PAGE_CPU_QUIESCENT_COUNTER # +# +# Sanitizers +# +options CONFIG_KASAN # +options CONFIG_UBSAN # +options CONFIG_KSANCOV # + +pseudo-device ksancov 1 init ksancov_init_dev # diff --git a/config/MASTER.arm b/config/MASTER.arm index d463ad189..6cc8a1b52 100644 --- a/config/MASTER.arm +++ b/config/MASTER.arm @@ -5,14 +5,14 @@ # # All rights reserved. The CMU software License Agreement # specifies the terms and conditions for use and redistribution. -# +# ###################################################################### # # Master Apple configuration file (see the master machine independent # configuration file for a description of the file format). # ###################################################################### -# +# # Standard Apple OS Configurations: # -------- ----- -- --------------- # @@ -20,18 +20,23 @@ # KERNEL_RELEASE = [ KERNEL_BASE ] # KERNEL_DEV = [ KERNEL_BASE development mach_assert config_xnupost proc_ref_debug os_reason_debug ] # KERNEL_DEBUG = [ KERNEL_BASE debug mach_assert config_xnupost config_ltable_stats config_ltable_debug config_waitq_stats config_workloop_debug config_waitq_debug ] -# BSD_BASE = [ mach_bsd config_workqueue psynch config_proc_uuid_policy ] +# BSD_BASE = [ mach_bsd config_workqueue psynch config_proc_uuid_policy config_imageboot config_imageboot_img4 ] # BSD_RELEASE = [ BSD_BASE no_printf_str no_kprintf_str secure_kernel ] -# BSD_DEV = [ BSD_BASE config_imageboot config_coredump pgo config_vnguard ] -# BSD_DEBUG = [ BSD_BASE config_imageboot config_coredump pgo config_vnguard ] -# FILESYS_BASE = [ devfs fifo fs_compression config_mnt_rootsnap config_protect config_fse routefs quota namedstreams ] +# BSD_DEV = [ BSD_BASE config_imgsrc_access config_lockerboot config_coredump pgo config_vnguard ] +# BSD_DEBUG = [ BSD_BASE config_imgsrc_access config_lockerboot config_coredump pgo config_vnguard ] +# FILESYS_BASE = [ devfs fifo fs_compression config_mnt_rootsnap config_protect config_fse routefs namedstreams ] # FILESYS_RELEASE= [ FILESYS_BASE ] # FILESYS_DEV = [ FILESYS_BASE fdesc ] # FILESYS_DEBUG = [ FILESYS_BASE fdesc ] -# NFS = [ nfsclient nfsserver ] -# NETWORKING = [ inet tcpdrop_synfin bpfilter inet6 ipv6send if_bridge traffic_mgt dummynet ah_all_crypto if_fake ] +# NFS_DEV = [ nfsclient nfsserver config_nfs_gss ] +# NETWORKING = [ inet tcpdrop_synfin bpfilter inet6 ipv6send if_bridge traffic_mgt dummynet ah_all_crypto if_fake sixlowpan ] +# NETWORKING_RELEASE = [ NETWORKING ] +# NETWORKING_DEV = [ NETWORKING_RELEASE if_headless ] +# NETWORKING_DEBUG = [ NETWORKING_DEV ] # VPN = [ ipsec flow_divert necp content_filter ] -# PF = [ pf ] +# PF_RELEASE = [ pf ] +# PF_DEV = [ PF_RELEASE pflog ] +# PF_DEBUG = [ PF_DEV ] # MULTIPATH = [ multipath mptcp ] # IOKIT_BASE = [ iokit iokitcpp no_kextd no_kernel_hid config_sleep ] # IOKIT_RELEASE = [ IOKIT_BASE ] @@ -49,7 +54,7 @@ # MACH_RELEASE = [ MACH_BASE config_skip_precise_user_kernel_time debugger_for_zone_info ] # MACH_DEV = [ MACH_BASE task_zone_info config_io_accounting importance_trace config_ledger_interval_max ] # MACH_DEBUG = [ MACH_BASE task_zone_info config_io_accounting importance_trace config_ledger_interval_max importance_debug ] -# SCHED_BASE = [ config_sched_traditional config_sched_multiq ] +# SCHED_BASE = [ config_sched_traditional config_sched_multiq config_clutch ] # SCHED_RELEASE = [ SCHED_BASE ] # SCHED_DEV = [ SCHED_BASE ] # SCHED_DEBUG = [ SCHED_BASE config_sched_grrr config_sched_proto ] @@ -58,9 +63,9 @@ # VM_DEV = [ VM_BASE dynamic_codesigning ] # VM_DEBUG = [ VM_BASE dynamic_codesigning ] # SECURITY = [ config_macf ] -# RELEASE = [ KERNEL_RELEASE BSD_RELEASE FILESYS_RELEASE SKYWALK_RELEASE NETWORKING PF MULTIPATH VPN IOKIT_RELEASE LIBKERN_RELEASE PERF_DBG_RELEASE MACH_RELEASE SCHED_RELEASE VM_RELEASE SECURITY ] -# DEVELOPMENT = [ KERNEL_DEV BSD_DEV FILESYS_DEV NFS SKYWALK_DEV NETWORKING PF MULTIPATH VPN IOKIT_DEV LIBKERN_DEV PERF_DBG_DEV MACH_DEV SCHED_DEV VM_DEV SECURITY ] -# DEBUG = [ KERNEL_DEBUG BSD_DEBUG FILESYS_DEBUG SKYWALK_DEBUG NETWORKING PF MULTIPATH VPN IOKIT_DEBUG LIBKERN_DEBUG PERF_DBG_DEBUG MACH_DEBUG SCHED_DEBUG VM_DEBUG SECURITY ] +# RELEASE = [ KERNEL_RELEASE BSD_RELEASE FILESYS_RELEASE SKYWALK_RELEASE NETWORKING_RELEASE PF_RELEASE MULTIPATH VPN IOKIT_RELEASE LIBKERN_RELEASE PERF_DBG_RELEASE MACH_RELEASE SCHED_RELEASE VM_RELEASE SECURITY ] +# DEVELOPMENT = [ KERNEL_DEV BSD_DEV FILESYS_DEV NFS_DEV SKYWALK_DEV NETWORKING_DEV PF_DEV MULTIPATH VPN IOKIT_DEV LIBKERN_DEV PERF_DBG_DEV MACH_DEV SCHED_DEV VM_DEV SECURITY ] +# DEBUG = [ KERNEL_DEBUG BSD_DEBUG FILESYS_DEBUG SKYWALK_DEBUG NETWORKING_DEBUG PF_DEBUG MULTIPATH VPN IOKIT_DEBUG LIBKERN_DEBUG PERF_DBG_DEBUG MACH_DEBUG SCHED_DEBUG VM_DEBUG SECURITY ] # ###################################################################### # diff --git a/config/MASTER.arm64 b/config/MASTER.arm64 index eadc388d6..110f6a6d6 100644 --- a/config/MASTER.arm64 +++ b/config/MASTER.arm64 @@ -5,14 +5,14 @@ # # All rights reserved. The CMU software License Agreement # specifies the terms and conditions for use and redistribution. -# +# ###################################################################### # # Master Apple configuration file (see the master machine independent # configuration file for a description of the file format). # ###################################################################### -# +# # Standard Apple OS Configurations: # -------- ----- -- --------------- # @@ -20,21 +20,25 @@ # KERNEL_RELEASE = [ KERNEL_BASE ] # KERNEL_DEV = [ KERNEL_BASE development mach_assert config_xnupost proc_ref_debug os_reason_debug pgtrace ] # KERNEL_DEBUG = [ KERNEL_BASE debug mach_assert config_xnupost config_ltable_stats config_ltable_debug config_waitq_stats config_workloop_debug config_waitq_debug pgtrace ] -# BSD_BASE = [ mach_bsd config_workqueue psynch config_proc_uuid_policy config_personas ] +# BSD_BASE = [ mach_bsd config_workqueue psynch config_proc_uuid_policy config_personas config_imageboot config_imageboot_img4 ] # BSD_RELEASE = [ BSD_BASE no_printf_str no_kprintf_str secure_kernel ] -# BSD_DEV = [ BSD_BASE config_imageboot config_coredump pgo config_vnguard ] -# BSD_DEBUG = [ BSD_BASE config_imageboot config_coredump pgo config_vnguard ] -# FILESYS_BASE = [ devfs fifo fs_compression config_mnt_rootsnap config_protect config_fse routefs quota namedstreams ] +# BSD_DEV = [ BSD_BASE config_netboot config_imgsrc_access config_lockerboot config_coredump pgo config_vnguard ] +# BSD_DEBUG = [ BSD_BASE config_netboot config_imgsrc_access config_lockerboot config_coredump pgo config_vnguard ] +# FILESYS_BASE = [ devfs fifo fs_compression config_protect config_mnt_rootsnap config_triggers config_fse routefs namedstreams config_dataless_files ] # FILESYS_RELEASE= [ FILESYS_BASE ] # FILESYS_DEV = [ FILESYS_BASE fdesc ] # FILESYS_DEBUG = [ FILESYS_BASE fdesc ] -# NFS = [ nfsclient nfsserver ] -# NETWORKING = [ inet tcpdrop_synfin bpfilter inet6 ipv6send if_bridge traffic_mgt dummynet ah_all_crypto if_fake ] +# NFS_DEV = [ nfsclient nfsserver config_nfs_gss ] +# NFS_RELEASE = [ nfsclient ] +# NFS_DEBUG = [ nfsclient config_nfs_gss ] +# NETWORKING = [ inet tcpdrop_synfin bpfilter inet6 ipv6send if_bridge traffic_mgt dummynet ah_all_crypto if_fake sixlowpan ] # NETWORKING_RELEASE = [ NETWORKING ] -# NETWORKING_DEV = [ NETWORKING_RELEASE packet_mangler ] +# NETWORKING_DEV = [ NETWORKING_RELEASE packet_mangler if_headless ] # NETWORKING_DEBUG = [ NETWORKING_DEV ] # VPN = [ ipsec flow_divert necp content_filter ] -# PF = [ pf ] +# PF_RELEASE = [ pf ] +# PF_DEV = [ PF_RELEASE pflog ] +# PF_DEBUG = [ PF_DEV ] # MULTIPATH = [ multipath mptcp ] # IOKIT_BASE = [ iokit iokitcpp no_kextd no_kernel_hid config_sleep ] # IOKIT_RELEASE = [ IOKIT_BASE ] @@ -52,19 +56,19 @@ # MACH_RELEASE = [ MACH_BASE config_skip_precise_user_kernel_time debugger_for_zone_info ] # MACH_DEV = [ MACH_BASE task_zone_info config_io_accounting importance_trace config_ledger_interval_max ] # MACH_DEBUG = [ MACH_BASE task_zone_info config_io_accounting importance_trace config_ledger_interval_max importance_debug ] -# SCHED_BASE = [ config_sched_traditional config_sched_multiq config_sched_deferred_ast ] +# SCHED_BASE = [ config_sched_traditional config_sched_multiq config_sched_deferred_ast config_clutch ] # SCHED_RELEASE = [ SCHED_BASE ] # SCHED_DEV = [ SCHED_BASE ] # SCHED_DEBUG = [ SCHED_BASE config_sched_grrr config_sched_proto ] -# VM_BASE = [ vm_pressure_events jetsam freeze memorystatus config_code_decryption phantom_cache config_secluded_memory config_background_queue config_cs_validation_bitmap] +# VM_BASE = [ vps_dynamic_prio vm_pressure_events jetsam freeze memorystatus config_code_decryption phantom_cache config_secluded_memory config_background_queue config_cs_validation_bitmap] # VM_RELEASE = [ VM_BASE ] # VM_DEV = [ VM_BASE dynamic_codesigning ] # VM_DEBUG = [ VM_BASE dynamic_codesigning ] # SECURITY = [ config_macf kernel_integrity ] -# RELEASE = [ KERNEL_RELEASE BSD_RELEASE FILESYS_RELEASE SKYWALK_RELEASE NETWORKING_RELEASE PF MULTIPATH VPN IOKIT_RELEASE LIBKERN_RELEASE PERF_DBG_RELEASE MACH_RELEASE SCHED_RELEASE VM_RELEASE SECURITY ] -# DEVELOPMENT = [ KERNEL_DEV BSD_DEV FILESYS_DEV NFS SKYWALK_DEV NETWORKING_DEV PF MULTIPATH VPN IOKIT_DEV LIBKERN_DEV PERF_DBG_DEV MACH_DEV SCHED_DEV VM_DEV SECURITY ] -# DEBUG = [ KERNEL_DEBUG BSD_DEBUG FILESYS_DEBUG SKYWALK_DEBUG NETWORKING_DEBUG PF MULTIPATH VPN IOKIT_DEBUG LIBKERN_DEBUG PERF_DBG_DEBUG MACH_DEBUG SCHED_DEBUG VM_DEBUG SECURITY ] -# KASAN = [ DEVELOPMENT ] +# RELEASE = [ KERNEL_RELEASE BSD_RELEASE FILESYS_RELEASE NFS_RELEASE SKYWALK_RELEASE NETWORKING_RELEASE PF_RELEASE MULTIPATH VPN IOKIT_RELEASE LIBKERN_RELEASE PERF_DBG_RELEASE MACH_RELEASE SCHED_RELEASE VM_RELEASE SECURITY ] +# DEVELOPMENT = [ KERNEL_DEV BSD_DEV FILESYS_DEV NFS_DEV SKYWALK_DEV NETWORKING_DEV PF_DEV MULTIPATH VPN IOKIT_DEV LIBKERN_DEV PERF_DBG_DEV MACH_DEV SCHED_DEV VM_DEV SECURITY ] +# DEBUG = [ KERNEL_DEBUG BSD_DEBUG FILESYS_DEBUG NFS_DEBUG SKYWALK_DEBUG NETWORKING_DEBUG PF_DEBUG MULTIPATH VPN IOKIT_DEBUG LIBKERN_DEBUG PERF_DBG_DEBUG MACH_DEBUG SCHED_DEBUG VM_DEBUG SECURITY ] +# KASAN = [ DEVELOPMENT config_kasan config_ubsan config_ksancov ] # ###################################################################### # diff --git a/config/MASTER.arm64.bcm2837 b/config/MASTER.arm64.bcm2837 index f6c35b27b..73670d3d3 100644 --- a/config/MASTER.arm64.bcm2837 +++ b/config/MASTER.arm64.bcm2837 @@ -1,18 +1,18 @@ # # Mach Operating System # Copyright (c) 1986 Carnegie-Mellon University -# Copyright 2001-2016 Apple Inc. +# Copyright 2001-2018 Apple Inc. # # All rights reserved. The CMU software License Agreement # specifies the terms and conditions for use and redistribution. -# +# ###################################################################### # # Master Apple configuration file (see the master machine independent # configuration file for a description of the file format). # ###################################################################### -# +# # Standard Apple OS Configurations: # -------- ----- -- --------------- # @@ -22,16 +22,18 @@ # KERNEL_DEBUG = [ KERNEL_BASE debug mach_assert config_xnupost config_ltable_stats config_ltable_debug config_waitq_stats config_workloop_debug config_waitq_debug pgtrace ] # BSD_BASE = [ mach_bsd config_workqueue psynch config_proc_uuid_policy config_personas ] # BSD_RELEASE = [ BSD_BASE no_printf_str no_kprintf_str secure_kernel ] -# BSD_DEV = [ BSD_BASE config_imageboot config_coredump pgo config_vnguard ] -# BSD_DEBUG = [ BSD_BASE config_imageboot config_coredump pgo config_vnguard ] -# FILESYS_BASE = [ devfs fifo fs_compression config_mnt_rootsnap config_protect config_fse routefs quota namedstreams ] +# BSD_DEV = [ BSD_BASE config_netboot config_imageboot config_coredump pgo config_vnguard ] +# BSD_DEBUG = [ BSD_BASE config_netboot config_imageboot config_coredump pgo config_vnguard ] +# FILESYS_BASE = [ devfs fifo fs_compression config_protect config_mnt_rootsnap config_fse routefs namedstreams ] # FILESYS_RELEASE= [ FILESYS_BASE ] # FILESYS_DEV = [ FILESYS_BASE fdesc ] # FILESYS_DEBUG = [ FILESYS_BASE fdesc ] -# NFS = [ nfsclient nfsserver ] +# NFS_DEV = [ nfsclient nfsserver config_nfs_gss ] +# NFS_RELEASE = [ nfsclient ] +# NFS_DEBUG = [ nfsclient config_nfs_gss ] # NETWORKING = [ inet tcpdrop_synfin bpfilter inet6 ipv6send if_bridge traffic_mgt dummynet ah_all_crypto if_fake ] # NETWORKING_RELEASE = [ NETWORKING ] -# NETWORKING_DEV = [ NETWORKING_RELEASE packet_mangler ] +# NETWORKING_DEV = [ NETWORKING_RELEASE packet_mangler if_headless ] # NETWORKING_DEBUG = [ NETWORKING_DEV ] # VPN = [ ipsec flow_divert necp content_filter ] # PF = [ pf ] @@ -40,7 +42,7 @@ # IOKIT_RELEASE = [ IOKIT_BASE ] # IOKIT_DEV = [ IOKIT_BASE iokitstats iotracking ] # IOKIT_DEBUG = [ IOKIT_BASE iokitstats iotracking] -# LIBKERN_BASE = [ libkerncpp config_kec_fips zlib crypto_sha2 ] +# LIBKERN_BASE = [ libkerncpp config_blocks config_kec_fips zlib crypto_sha2 ] # LIBKERN_RELEASE =[ LIBKERN_BASE ] # LIBKERN_DEV = [ LIBKERN_BASE iotracking ] # LIBKERN_DEBUG = [ LIBKERN_BASE iotracking ] @@ -61,10 +63,10 @@ # VM_DEV = [ VM_BASE dynamic_codesigning ] # VM_DEBUG = [ VM_BASE dynamic_codesigning ] # SECURITY = [ config_macf kernel_integrity ] -# RELEASE = [ KERNEL_RELEASE BSD_RELEASE FILESYS_RELEASE SKYWALK_RELEASE NETWORKING_RELEASE PF MULTIPATH VPN IOKIT_RELEASE LIBKERN_RELEASE PERF_DBG_RELEASE MACH_RELEASE SCHED_RELEASE VM_RELEASE SECURITY ] -# DEVELOPMENT = [ KERNEL_DEV BSD_DEV FILESYS_DEV NFS SKYWALK_DEV NETWORKING_DEV PF MULTIPATH VPN IOKIT_DEV LIBKERN_DEV PERF_DBG_DEV MACH_DEV SCHED_DEV VM_DEV SECURITY ] -# DEBUG = [ KERNEL_DEBUG BSD_DEBUG FILESYS_DEBUG SKYWALK_DEBUG NETWORKING_DEBUG PF MULTIPATH VPN IOKIT_DEBUG LIBKERN_DEBUG PERF_DBG_DEBUG MACH_DEBUG SCHED_DEBUG VM_DEBUG SECURITY ] -# KASAN = [ DEVELOPMENT ] +# RELEASE = [ KERNEL_RELEASE BSD_RELEASE FILESYS_RELEASE NFS_RELEASE SKYWALK_RELEASE NETWORKING_RELEASE PF MULTIPATH VPN IOKIT_RELEASE LIBKERN_RELEASE PERF_DBG_RELEASE MACH_RELEASE SCHED_RELEASE VM_RELEASE SECURITY ] +# DEVELOPMENT = [ KERNEL_DEV BSD_DEV FILESYS_DEV NFS_DEV SKYWALK_DEV NETWORKING_DEV PF MULTIPATH VPN IOKIT_DEV LIBKERN_DEV PERF_DBG_DEV MACH_DEV SCHED_DEV VM_DEV SECURITY ] +# DEBUG = [ KERNEL_DEBUG BSD_DEBUG FILESYS_DEBUG NFS_DEBUG SKYWALK_DEBUG NETWORKING_DEBUG PF MULTIPATH VPN IOKIT_DEBUG LIBKERN_DEBUG PERF_DBG_DEBUG MACH_DEBUG SCHED_DEBUG VM_DEBUG SECURITY ] +# KASAN = [ DEVELOPMENT config_kasan config_ubsan config_ksancov ] # ###################################################################### # diff --git a/config/MASTER.x86_64 b/config/MASTER.x86_64 index 66e7f98de..2e72d1d45 100644 --- a/config/MASTER.x86_64 +++ b/config/MASTER.x86_64 @@ -5,14 +5,14 @@ # # All rights reserved. The CMU software License Agreement # specifies the terms and conditions for use and redistribution. -# +# ###################################################################### # # Master Apple configuration file (see the master machine independent # configuration file for a description of the file format). # ###################################################################### -# +# # Standard Apple OS Configurations: # -------- ----- -- --------------- # @@ -20,18 +20,18 @@ # KERNEL_RELEASE = [ KERNEL_BASE ] # KERNEL_DEV = [ KERNEL_BASE development mach_assert config_xnupost proc_ref_debug os_reason_debug ] # KERNEL_DEBUG = [ KERNEL_BASE debug mach_assert config_xnupost config_ltable_stats config_ltable_debug config_waitq_stats config_workloop_debug config_waitq_debug ] -# BSD_BASE = [ mach_bsd sysv_sem sysv_msg sysv_shm config_imageboot config_workqueue psynch config_proc_uuid_policy config_coredump pgo config_32bit_telemetry ] +# BSD_BASE = [ mach_bsd sysv_sem sysv_msg sysv_shm config_netboot config_imageboot config_imageboot_chunklist config_workqueue psynch config_proc_uuid_policy config_coredump pgo config_32bit_telemetry config_personas ] # BSD_RELEASE = [ BSD_BASE ] # BSD_DEV = [ BSD_BASE config_vnguard ] # BSD_DEBUG = [ BSD_BASE config_vnguard ] -# FILESYS_BASE = [ devfs fdesc config_dev_kmem config_fse quota namedstreams config_mnt_rootsnap config_keypage_wp config_protect fifo config_volfs fs_compression config_imgsrc_access config_triggers config_ext_resolver config_searchfs config_appledouble nullfs config_mnt_suid ] +# FILESYS_BASE = [ devfs fdesc config_dev_kmem config_fse quota namedstreams config_mnt_rootsnap config_rosv_startup config_mount_vm config_keypage_wp config_protect fifo config_volfs fs_compression config_imgsrc_access config_triggers config_ext_resolver config_searchfs config_appledouble nullfs config_mnt_suid config_firmlinks config_dataless_files ] # FILESYS_RELEASE= [ FILESYS_BASE ] # FILESYS_DEV = [ FILESYS_BASE ] # FILESYS_DEBUG = [ FILESYS_BASE ] -# NFS = [ nfsclient nfsserver ] -# NETWORKING = [ inet inet6 ipv6send tcpdrop_synfin bpfilter dummynet traffic_mgt sendfile ah_all_crypto bond vlan gif stf ifnet_input_chk config_mbuf_jumbo if_bridge ipcomp_zlib MULTIPATH if_fake ] +# NFS = [ nfsclient nfsserver config_nfs4 config_nfs_gss ] +# NETWORKING = [ inet inet6 ipv6send tcpdrop_synfin bpfilter dummynet traffic_mgt sendfile ah_all_crypto bond vlan gif stf ifnet_input_chk config_mbuf_jumbo if_bridge MULTIPATH if_fake sixlowpan ] # NETWORKING_RELEASE = [ NETWORKING ] -# NETWORKING_DEV = [ NETWORKING_RELEASE packet_mangler ] +# NETWORKING_DEV = [ NETWORKING_RELEASE packet_mangler if_headless ] # NETWORKING_DEBUG = [ NETWORKING_DEV ] # VPN = [ ipsec flow_divert necp content_filter ] # PF = [ pf pflog ] @@ -40,10 +40,10 @@ # IOKIT_RELEASE = [ IOKIT_BASE ] # IOKIT_DEV = [ IOKIT_BASE iotracking ] # IOKIT_DEBUG = [ IOKIT_BASE iotracking ] -# LIBKERN_BASE = [ libkerncpp config_blocks config_kxld config_kec_fips zlib crypto_sha2 config_img4 ] -# LIBKERN_RELEASE =[ LIBKERN_BASE ] -# LIBKERN_DEV = [ LIBKERN_BASE iotracking ] -# LIBKERN_DEBUG = [ LIBKERN_BASE iotracking ] +# LIBKERN_BASE = [ libkerncpp config_blocks config_kxld config_kec_fips crypto_sha2 config_img4 ] +# LIBKERN_RELEASE =[ LIBKERN_BASE zlib ] +# LIBKERN_DEV = [ LIBKERN_BASE zlib iotracking ] +# LIBKERN_DEBUG = [ LIBKERN_BASE zlib iotracking ] # PERF_DBG_BASE = [ config_dtrace mach_kdp config_serial_kdp kdp_interactive_debugging kperf kpc zleaks config_gzalloc MONOTONIC_BASE ] # PERF_DBG_RELEASE=[ PERF_DBG_BASE ] # PERF_DBG_DEV =[ PERF_DBG_BASE lock_stats ] @@ -57,11 +57,11 @@ # SCHED_DEV = [ SCHED_BASE ] # SCHED_DEBUG = [ SCHED_BASE config_sched_grrr config_sched_proto ] # VM = [ vm_pressure_events memorystatus dynamic_codesigning config_code_decryption encrypted_swap config_background_queue] -# SECURITY = [ config_macf config_audit config_csr ] +# SECURITY = [ config_macf config_audit config_csr config_arcade] # RELEASE = [ KERNEL_RELEASE BSD_RELEASE FILESYS_RELEASE NFS SKYWALK_RELEASE NETWORKING_RELEASE PF VPN IOKIT_RELEASE LIBKERN_RELEASE PERF_DBG_RELEASE MACH_RELEASE SCHED_RELEASE VM SECURITY ] # DEVELOPMENT = [ KERNEL_DEV BSD_DEV FILESYS_DEV NFS SKYWALK_DEV NETWORKING_DEV PF VPN IOKIT_DEV LIBKERN_DEV PERF_DBG_DEV MACH_DEV SCHED_DEV VM SECURITY ] # DEBUG = [ KERNEL_DEBUG BSD_DEBUG FILESYS_DEBUG NFS SKYWALK_DEBUG NETWORKING_DEBUG PF VPN IOKIT_DEBUG LIBKERN_DEBUG PERF_DBG_DEBUG MACH_DEBUG SCHED_DEBUG VM SECURITY ] -# KASAN = [ DEVELOPMENT ] +# KASAN = [ DEVELOPMENT config_kasan config_ubsan config_ksancov ] # ###################################################################### # @@ -74,6 +74,7 @@ options PAL_I386 options CONFIG_MCA # Machine Check Architecture # options CONFIG_VMX # Virtual Machine Extensions # options CONFIG_MTRR # Memory Type Range Registers # +options CONFIG_MACF_LAZY_VNODE_LABELS # Turn on labels, don't preallocate options NO_NESTED_PMAP # diff --git a/config/Mach.exports b/config/Mach.exports index 439e238c6..025f57973 100644 --- a/config/Mach.exports +++ b/config/Mach.exports @@ -56,8 +56,10 @@ _thread_call_enter1_delayed _thread_call_enter_delayed _thread_call_free _thread_deallocate +_thread_has_thread_name _thread_policy_set _thread_reference +_thread_set_thread_name _thread_terminate _thread_tid _thread_wakeup_prim diff --git a/config/Makefile b/config/Makefile index da46458ff..0f5f3ab63 100644 --- a/config/Makefile +++ b/config/Makefile @@ -57,14 +57,19 @@ endif $(OBJPATH)/allsymbols: $(OBJPATH)/$(KERNEL_FILE_NAME) $(_v)$(NM) -gj $< > $@ -$(SYMBOL_SET_BUILD): $(OBJPATH)/%.symbolset : %.exports %.$(EXPORT_SOURCE_ARCH_CONFIG_LC).exports $(OBJPATH)/allsymbols $(KEXT_CREATE_SYMBOL_SET) - @echo "$(ColorH)SYMBOLSET$(Color0) $(ColorF)$*$(Color0) \"($(ColorLF)$(CURRENT_ARCH_CONFIG_LC)$(Color0))\"" - $(_v)$(KEXT_CREATE_SYMBOL_SET) \ - $(ARCH_FLAGS_$(CURRENT_ARCH_CONFIG)) \ - -import $(OBJPATH)/allsymbols \ - -export $(SOURCE)/$*.exports \ - -export $(SOURCE)/$*.$(EXPORT_SOURCE_ARCH_CONFIG_LC).exports \ - -output $@ $(_vstdout) +define symbol_set_rule +$(OBJPATH)/$(1).symbolset: MY_EXPORTS := $(filter $(1)%,$(EXPORTS_FILES)) +$(OBJPATH)/$(1).symbolset: MY_EXPORTS_ARGS := $$(foreach file,$$(MY_EXPORTS),-export $(SOURCE)/$$(file)) +$(OBJPATH)/$(1).symbolset: $$(MY_EXPORTS) $(OBJPATH)/allsymbols $(KEXT_CREATE_SYMBOL_SET) + $$(call makelog,$(ColorH)SYMBOLSET$(Color0) $(ColorF)$(1)$(Color0) "($(ColorLF)$(CURRENT_ARCH_CONFIG_LC)$(Color0))") + $(_v)$(KEXT_CREATE_SYMBOL_SET) \ + $(ARCH_FLAGS_$(CURRENT_ARCH_CONFIG)) \ + -import $(OBJPATH)/allsymbols \ + $$(MY_EXPORTS_ARGS) \ + -output $$@ $(_vstdout) +endef + +$(foreach symbolset,$(SYMBOL_COMPONENT_LIST),$(eval $(call symbol_set_rule,$(symbolset)))) .PHONY: check_all_exports @@ -85,11 +90,11 @@ check_all_exports: $(OBJPATH)/allsymbols $(KEXT_CREATE_SYMBOL_SET) -output /dev/null $(_vstdout) $(OBJPATH)/$(MD_SUPPORTED_KPI_FILENAME): $(EXPORTS_FILES) - @echo "$(ColorH)SUPPORTED_KPI$(Color0) \"($(ColorLF)$(CURRENT_ARCH_CONFIG_LC)$(Color0))\"" + $(call makelog,$(ColorH)SUPPORTED_KPI$(Color0) "($(ColorLF)$(CURRENT_ARCH_CONFIG_LC)$(Color0))") $(_v)$(SRCROOT)/config/list_supported.sh $(SOURCE) $(EXPORT_SOURCE_ARCH_CONFIG_LC) $@ $(OBJPATH)/$(MI_SUPPORTED_KPI_FILENAME): $(EXPORTS_FILES) - @echo "$(ColorH)SUPPORTED_KPI$(Color0) \"($(ColorLF)all$(Color0))\"" + $(call makelog,$(ColorH)SUPPORTED_KPI$(Color0) "($(ColorLF)all$(Color0))") $(_v)$(SRCROOT)/config/list_supported.sh $(SOURCE) all $@ build_symbol_sets: check_all_exports $(SYMBOL_SET_BUILD) $(OBJPATH)/allsymbols \ @@ -101,12 +106,11 @@ do_config_all:: build_symbol_sets # There's no simple static pattern rule for these paths, so hardcode dependencies in the command list $(SYMROOT_INSTALL_KEXT_MACHO_FILES): ALWAYS $(_v)$(MKDIR) $(dir $@) + $(call makelog,$(ColorH)INSTALLSYM$(Color0) $(ColorF)symbolset $(notdir $@)$(Color0) "($(ColorLF)$(CURRENT_ARCH_CONFIG_LC)$(Color0))") $(_v)if [ $(OBJROOT)/.symbolset.timestamp -nt $@ ]; then \ - echo INSTALLSYM symbolset $(notdir $@) "($(CURRENT_ARCH_CONFIG_LC))"; \ $(INSTALL) $(EXEC_INSTALL_FLAGS) $(OBJPATH)/$(@F).symbolset $@; \ cmdstatus=$$?; \ else \ - echo INSTALLSYM symbolset $(notdir $@) "($(CURRENT_ARCH_CONFIG_LC))"; \ $(LIPO) -create $@ $(OBJPATH)/$(@F).symbolset -output $@ 2>/dev/null || true; \ cmdstatus=$$?; \ fi; \ @@ -114,23 +118,23 @@ $(SYMROOT_INSTALL_KEXT_MACHO_FILES): ALWAYS $(SYMROOT_INSTALL_KEXT_PLISTS): $(SYMROOT)/% : $(SOURCE)/% $(_v)$(MKDIR) $(dir $@) - @echo "$(ColorH)INSTALLSYM$(ColorH) $(ColorLF)kextplist$(Color0) $(ColorF)$*$(Color0)" + $(call makelog,$(ColorH)INSTALLSYM$(Coloro) $(ColorLF)kextplist$(Color0) $(ColorF)$*$(Color0)) $(_v)$(INSTALL) $(DATA_INSTALL_FLAGS) $< $@ $(_v)$(NEWVERS) $@ $(_vstdout) $(DSTROOT_INSTALL_KEXT_PLISTS): $(INSTALL_KEXT_DIR)/% : $(SYMROOT)/% $(_v)$(MKDIR) $(dir $@) - @echo "$(ColorH)INSTALL$(ColorH) $(ColorLF)kextplist$(Color0) $(ColorF)$*$(Color0)" + $(call makelog,$(ColorH)INSTALL$(Color0) $(ColorLF)kextplist$(Color0) $(ColorF)$*$(Color0)) $(_v)$(INSTALL) $(DATA_INSTALL_FLAGS) $< $@ $(DSTROOT_INSTALL_KEXT_MACHO_FILES): $(INSTALL_KEXT_DIR)/% : $(SYMROOT)/% ALWAYS $(_v)$(MKDIR) $(dir $@) - @echo "$(ColorF)INSTALL$(Color0) $(ColorF)$(notdir $@)$(Color0) \"($(ColorLF)$(CURRENT_ARCH_CONFIG_LC)$(Color0))\"" + $(call makelog,$(ColorF)INSTALL$(Color0) $(ColorF)$(notdir $@)$(Color0) "($(ColorLF)$(CURRENT_ARCH_CONFIG_LC)$(Color0))") $(_v)$(INSTALL) $(EXEC_INSTALL_FLAGS) $< $@ $(DSTROOT)/$(KRESDIR)/$(MD_SUPPORTED_KPI_FILENAME) $(DSTROOT)/$(KRESDIR)/$(MI_SUPPORTED_KPI_FILENAME): $(DSTROOT)/$(KRESDIR)/% : $(OBJPATH)/% $(_v)$(MKDIR) $(dir $@) - @echo "$(ColorH)INSTALL$(Color0) $(ColorF)$*$(Color0)" + $(call makelog,$(ColorH)INSTALL$(Color0) $(ColorF)$*$(Color0)) $(_v)$(INSTALL) $(INSTALL_FLAGS) $< $@ ifneq ($(INSTALL_KASAN_ONLY),1) diff --git a/config/MasterVersion b/config/MasterVersion index c9bd63daf..f72373b1c 100644 --- a/config/MasterVersion +++ b/config/MasterVersion @@ -1,4 +1,4 @@ -18.7.0 +19.0.0 # The first line of this file contains the master version number for the kernel. # All other instances of the kernel version in xnu are derived from this file. diff --git a/config/Private.arm.exports b/config/Private.arm.exports index 683c7e68f..8091f7218 100644 --- a/config/Private.arm.exports +++ b/config/Private.arm.exports @@ -5,13 +5,18 @@ _IOCPURunPlatformQuiesceActions _PE_get_default _PE_reboot_on_panic _PE_mark_hwaccess +_mach_vm_map:_mach_vm_map_external +_mach_vm_remap:_mach_vm_remap_external _ml_arm_sleep _ml_get_abstime_offset _ml_get_conttime_offset _ml_get_wake_timebase +_ml_set_reset_time _proc_getcdhash _cpu_broadcast_xcall _cpu_xcall +_cpu_broadcast_immediate_xcall +_cpu_immediate_xcall _cpu_number _enable_kernel_vfp_context _get_preemption_level diff --git a/config/Private.arm64.exports b/config/Private.arm64.exports index 4b43941bc..e3bc84a7d 100644 --- a/config/Private.arm64.exports +++ b/config/Private.arm64.exports @@ -14,15 +14,20 @@ __ZN17IONVRAMController* __ZTV17IONVRAMController _cpu_broadcast_xcall _cpu_xcall +_cpu_broadcast_immediate_xcall +_cpu_immediate_xcall _cpu_cluster_id _cpu_number _cpu_qos_update_register _ecc_log_record_event _get_preemption_level +_mach_vm_map:_mach_vm_map_external +_mach_vm_remap:_mach_vm_remap_external _ml_arm_sleep _ml_get_abstime_offset _ml_get_conttime_offset _ml_get_wake_timebase +_ml_set_reset_time _ml_thread_is64bit _pe_shmcon_set_child _proc_getcdhash @@ -42,5 +47,6 @@ _pgtrace_add_probe _pgtrace_clear_probe _mach_bridge_recv_timestamps _mach_bridge_init_timestamp +_mach_bridge_set_params _PE_panic_debugging_enabled _register_additional_panic_data_buffer diff --git a/config/Private.exports b/config/Private.exports index 9cf4a78f3..5447e64e6 100644 --- a/config/Private.exports +++ b/config/Private.exports @@ -186,6 +186,7 @@ _ifnet_link_status_report _ifnet_notice_master_elected _ifnet_notice_node_absence _ifnet_notice_node_presence +_ifnet_notice_node_presence_v2 _ifnet_poll_params _ifnet_purge _ifnet_report_issues @@ -206,6 +207,8 @@ _ifnet_get_unsent_bytes _ifnet_get_buffer_status _ifnet_normalise_unsent_data _ifnet_set_low_power_mode +_ifnet_notify_tcp_keepalive_offload_timeout +_ifnet_interface_advisory_report _in6_localaddr _in6addr_local _in_localaddr @@ -239,14 +242,11 @@ _kern_allocation_name_allocate _kern_allocation_name_release _thread_set_allocation_name _kern_asl_msg -_kern_asl_msg_va _kern_coredump_log _kern_register_coredump_helper _kern_config_is_development _kern_stack_snapshot_with_reason _kernel_debug_string -_kevent_id_internal -_kevent_qos_internal _kmem_alloc_kobject:_kmem_alloc_kobject_external _kmem_alloc_pageable:_kmem_alloc_pageable_external _kx_qsort @@ -267,9 +267,7 @@ _m_trailingspace:_mbuf_trailingspace _mach_vm_allocate:_mach_vm_allocate_external _mach_vm_behavior_set _mach_vm_deallocate -_mach_vm_map:_mach_vm_map_external _mach_vm_protect -_mach_vm_remap:_mach_vm_remap_external _mbuf_add_drvaux _mbuf_del_drvaux _mbuf_find_drvaux @@ -298,6 +296,8 @@ _mbuf_get_flowid _mbuf_set_flowid _mbuf_pkt_new_flow _mbuf_last_pkt +_mbuf_get_keepalive_flag +_mbuf_set_keepalive_flag _mcl_to_paddr _ml_io_read _ml_io_read16 @@ -323,19 +323,24 @@ _net_add_domain:_net_add_domain_old _net_add_proto:_net_add_proto_old _net_del_domain:_net_del_domain_old _net_del_proto:_net_del_proto_old +_net_domain_contains_hostname _netboot_root _os_reason_create _os_reason_alloc_buffer_noblock _os_reason_get_kcdata_descriptor _os_reason_ref _os_reason_free +_os_reason_set_flags +_os_reason_set_description_data _panic_with_options _persona_find +_persona_find_by_type _persona_get _persona_get_id _persona_get_type _persona_get_cred _persona_lookup +_persona_proc_get _current_persona_get _persona_put _pffinddomain:_pffinddomain_old @@ -343,13 +348,19 @@ _pffindproto:_pffindproto_old _port_name_to_task _port_name_to_thread _post_sys_powersource +_proc_get_syscall_filter_mask_size +_proc_getexecutableoffset _proc_getexecutablevnode +_proc_selfexecutableargs _proc_issetugid _proc_pidbackgrounded _proc_pidversion _proc_set_responsible_pid +_proc_set_syscall_filter_mask _proc_task _proc_uniqueid +_proc_puniqueid +_proc_exitstatus _priv_check_cred _pru_abort_notsupp _pru_accept_notsupp @@ -366,6 +377,7 @@ _pru_sense_null _pru_shutdown_notsupp _pru_sockaddr_notsupp _pru_sopoll_notsupp +_psignal_sigkill_with_reason _pthread_kext_register _q_to_b _register_and_init_prng @@ -409,6 +421,7 @@ _sorwakeup _sosend _strnstr _sysdiagnose_notify_user +_task_is_driver _termioschars _thread_call_allocate_with_priority _thread_call_allocate_with_qos @@ -431,6 +444,7 @@ _throttle_info_reset_window _throttle_info_update _throttle_info_update_by_mask _throttle_lowpri_io +_throttle_lowpri_io_will_be_throttled _throttle_lowpri_window _throttle_set_thread_io_policy _throttle_get_thread_effective_io_policy @@ -464,6 +478,7 @@ _utun_pkt_dtls_input _vfs_context_bind _vfs_context_get_special_port _vfs_context_set_special_port +_vfs_context_is_dataless_manipulator _vfs_devvp _vfs_getattr _vfs_getbyid @@ -487,27 +502,33 @@ _vm_map_round_page_mask _vm_map_trunc_page_mask _vm_map_wire_and_extract:_vm_map_wire_and_extract_external _vm_page_wire_count +_vn_getpath_ext _vn_getpath_fsenter _vn_getpath_fsenter_with_parent +_vn_getpath_no_firmlink +_vnode_getfirmlink _vn_searchfs_inappropriate_name _vnode_create_empty _vnode_initialize _vnode_isdyldsharedcache _vnode_ismonitored _vnode_istty +_vnode_lookupat _vnode_lookup_continue_needed _vnode_clearnoflush _vnode_isnoflush _vnode_getbackingvnode _vnode_setasnamedstream +_vnode_setasfirmlink _vnop_compound_mkdir_desc _vnop_compound_open_desc _vnop_compound_remove_desc _vnop_compound_rename_desc _vnop_compound_rmdir_desc _vnop_monitor_desc +_write_random -# HFS Kext Requirements +# HFS/APFS Kext Requirements _IOBSDMountChange _OSKextUnloadKextWithLoadTag _bdwrite_internal @@ -515,10 +536,15 @@ _buf_markstatic _count_lock_queue _decmpfs_cnode_destroy _decmpfs_cnode_get_vnode_cached_size +_decmpfs_cnode_get_vnode_cached_nchildren +_decmpfs_cnode_get_vnode_cached_total_size _decmpfs_cnode_get_vnode_state _decmpfs_cnode_init _decmpfs_cnode_alloc _decmpfs_cnode_free +_decmpfs_cnode_set_vnode_cached_size +_decmpfs_cnode_set_vnode_cached_nchildren +_decmpfs_cnode_set_vnode_cached_total_size _decmpfs_cnode_set_vnode_state _decmpfs_cnode_cmp_type _decmpfs_ctx @@ -533,6 +559,7 @@ _decmpfs_read_compressed _decmpfs_unlock_compressed_data _decmpfs_update_attributes _decmpfs_validate_compressed_file +_fg_get_vnode _fp_getfvp _kauth_cred_issuser _kdebug_lookup_gen_events @@ -578,6 +605,7 @@ _vnode_should_flush_after_write _vfs_setowner _vfs_idle_time _mount_set_noreaddirext +_vfs_get_statfs64 _cluster_max_io_size _vfs_context_cwd _resolve_nspace_item @@ -592,21 +620,6 @@ _proc_is_forcing_hfs_case_sensitivity _is_package_name _sysctl__hw_features_children _task_update_logical_writes -_dqfileclose -_dqfileopen -_dqflush -_dqget -_dqhashinit -_dqisinitialized -_dqlock -_dqrele -_dqsync -_dqsync_orphans -_dqunlock -_qf_get -_qf_put -_dqfileinit -_dqreclaim _zalloc _zalloc_noblock _zdestroy @@ -617,7 +630,11 @@ _fs_buffer_cache_gc_register _fs_buffer_cache_gc_unregister _cp_key_store_action_for_volume _mach_bridge_remote_time - +_lck_mtx_sleep_with_inheritor +_lck_rw_sleep_with_inheritor +_wakeup_one_with_inheritor +_wakeup_all_with_inheritor +_change_sleep_inheritor _Block_size __Block_extended_layout __Block_has_signature @@ -629,3 +646,4 @@ __Block_signature __Block_tryRetain __Block_use_RR2 __Block_use_stret +_IOPMRootDomainGetWillShutdown diff --git a/config/Private.x86_64.exports b/config/Private.x86_64.exports index 92da71aa1..a24003941 100644 --- a/config/Private.x86_64.exports +++ b/config/Private.x86_64.exports @@ -17,6 +17,7 @@ _cpuid_leaf7_features _cpuid_info _csr_check _csr_get_active_config +_hv_ast_pending _hv_ept_pmap_create _hv_get* _hv_release* @@ -55,6 +56,21 @@ _xts_encrypt _xts_start _aes_decrypt _PE_reboot_on_panic +_dqfileclose +_dqfileopen +_dqflush +_dqget +_dqhashinit +_dqisinitialized +_dqlock +_dqrele +_dqsync +_dqsync_orphans +_dqunlock +_qf_get +_qf_put +_dqfileinit +_dqreclaim # HFS Kext Requirements _file_vnode @@ -77,3 +93,7 @@ _csproc_mark_invalid_allowed _csproc_check_invalid_allowed _csproc_hardened_runtime _csproc_forced_lv + +#exports for vmware/, virtualbox, ... +_mach_vm_map +_mach_vm_remap diff --git a/config/Unsupported.exports b/config/Unsupported.exports index 8853251ca..07f1387d6 100644 --- a/config/Unsupported.exports +++ b/config/Unsupported.exports @@ -50,7 +50,6 @@ __ZTV9IODTNVRAM __ZTVN15IOWatchDogTimer9MetaClassE __doprnt __doprnt_log -__dtrace_register_anon_DOF _aes_decrypt_cbc _aes_decrypt_key _aes_decrypt_key128 @@ -165,6 +164,7 @@ _sock_accept_internal _sock_socket_internal _stack_privilege _task_get_special_port +_task_is_app_suspended _task_resume _task_resume2 _task_suspend diff --git a/config/Unused.arm.exports b/config/Unused.arm.exports new file mode 100644 index 000000000..58abc1249 --- /dev/null +++ b/config/Unused.arm.exports @@ -0,0 +1,4 @@ +# Symbols that are unused as KPI, but must be globally exported +_arm64_root_pgtable_level +_arm64_root_pgtable_num_ttes +_arm_hardware_page_size diff --git a/config/Unused.arm64.exports b/config/Unused.arm64.exports new file mode 100644 index 000000000..58abc1249 --- /dev/null +++ b/config/Unused.arm64.exports @@ -0,0 +1,4 @@ +# Symbols that are unused as KPI, but must be globally exported +_arm64_root_pgtable_level +_arm64_root_pgtable_num_ttes +_arm_hardware_page_size diff --git a/config/Unused.exports b/config/Unused.exports index 976fb68de..c877ff291 100644 --- a/config/Unused.exports +++ b/config/Unused.exports @@ -1,7 +1,4 @@ # Symbols that are unused as KPI, but must be globally exported -_arm64_root_pgtable_level -_arm64_root_pgtable_num_ttes -_arm_hardware_page_size _atm_mana* _bank_mana* _dtrace_zero* diff --git a/config/generate_linker_exports.sh b/config/generate_linker_exports.sh index 4af69e9b0..3308705e4 100755 --- a/config/generate_linker_exports.sh +++ b/config/generate_linker_exports.sh @@ -10,6 +10,9 @@ fi OUTPUT="$1" shift -( grep -h -v ":" "$@"; grep -h ":" "$@" | awk -F: '{print $2}' ) | sort -u > "$OUTPUT" +# Note: we used to export both sides of the alias since forever +# for now keep doing this + +( grep -h -v ":" "$@"; grep -h ":" "$@" | awk -F: '{print $1; print $2}' ) | sort -u > "$OUTPUT" exit 0 diff --git a/doc/atomics.md b/doc/atomics.md new file mode 100644 index 000000000..eda4cc2d3 --- /dev/null +++ b/doc/atomics.md @@ -0,0 +1,423 @@ +XNU use of Atomics and Memory Barriers +====================================== + +Goal +---- + +This document discusses the use of atomics and memory barriers in XNU. It is +meant as a guide to best practices, and warns against a variety of possible +pitfalls in the handling of atomics in C. + +It is assumed that the reader has a decent understanding of +the [C11 memory model](https://en.cppreference.com/w/c/atomic/memory_order) +as this document builds on it, and explains the liberties XNU takes with said +model. + +All the interfaces discussed in this document are available through +the `` header. + +Note: Linux has thorough documentation around memory barriers +(Documentation/memory-barriers.txt), some of which is Linux specific, +but most is not and is a valuable read. + + +Vocabulary +---------- + +In the rest of this document we'll refer to the various memory ordering defined +by C11 as relaxed, consume, acquire, release, acq\_rel and seq\_cst. + +`os_atomic` also tries to make the distinction between compiler **barriers** +(which limit how much the compiler can reorder code), and memory **fences**. + + +The dangers and pitfalls of C11's `` +------------------------------------------------- + +While the C11 memory model has likely been one of the most important additions +to modern C, in the purest C tradition, it is a sharp tool. + +By default, C11 comes with two variants of each atomic "operation": + +- an *explicit* variant where memory orderings can be specified, +- a regular variant which is equivalent to the former with the *seq_cst* + memory ordering. + +When an `_Atomic` qualified variable is accessed directly without using +any `atomic_*_explicit()` operation, then the compiler will generate the +matching *seq_cst* atomic operations on your behalf. + +The sequentially consistent world is extremely safe from a lot of compiler +and hardware reorderings and optimizations, which is great, but comes with +a huge cost in terms of memory barriers. It is also completely wasted when +building for a non SMP configuration. + + +It seems very tempting to use `atomic_*_explicit()` functions with explicit +memory orderings, however, the compiler is entitled to perform a number of +optimizations with relaxed atomics, that most developers will not expect. +Indeed, the compiler is perfectly allowed to perform various optimizations it +does with other plain memory accesess such as coalescing, reordering, hoisting +out of loops, ... + +For example, when the compiler can know what `doit` is doing (which due to LTO +is almost always the case for XNU), is allowed to transform this code: + +```c + void + perform_with_progress(int steps, long _Atomic *progress) + { + for (int i = 0; i < steps; i++) { + doit(i); + atomic_store_explicit(progress, i, memory_order_relaxed); + } + } +``` + +Into this, which obviously defeats the entire purpose of `progress`: + +```c + void + perform_with_progress(int steps, long _Atomic *progress) + { + for (int i = 0; i < steps; i++) { + doit(i); + } + atomic_store_explicit(progress, steps, memory_order_relaxed); + } +``` + + +How `os_atomic_*` tries to address `` pitfalls +----------------------------------------------------------- + +1. the memory locations passed to the various `os_atomic_*` + functions do not need to be marked `_Atomic` or `volatile` + (or `_Atomic volatile`), which allow for use of atomic + operations in code written before C11 was even a thing. + + It is however recommended in new code to use the `_Atomic` + specifier. + +2. `os_atomic_*` cannot be coalesced by the compiler: + all accesses are performed on the specified locations + as if their type was `_Atomic volatile` qualified. + +3. `os_atomic_*` only comes with the explicit variants: + orderings must be provided and can express either memory orders + where the name is the same as in C11 without the `memory_order_` prefix, + or a compiler barrier ordering `compiler_acquire`, `compiler_release`, + `compiler_acq_rel`. + +4. `os_atomic_*` elides barriers for non SMP configurations + by default, however, it emits the proper compiler barriers + that correspond to the requested memory ordering (using + `atomic_signal_fence()`), even on UP configuration, so that + the compiler cannot possibly reorder code on UP systems. + + +Best practices for the use of atomics in XNU +-------------------------------------------- + +For most generic code, the `os_atomic_*` functions from +`` are the perferred interfaces. + +`__sync_*`, `__c11_*` and `__atomic_*` compiler builtins should not be used. + +`` functions may be used if: + +- compiler coalescing / reordering is desired (refcounting + implementations may desire this for example). + +- defaulting to relaxed atomics for non SMP platforms doesn't make sense + (such as device access which may require memory fences even on UP systems). + + +Qualifying atomic variables with `_Atomic` or even +`_Atomic volatile` is encouraged, however authors must +be aware that a direct access to this variable will +result in quite heavy memory barriers. + +The *consume* memory ordering should not be used +(See *dependency* memory order later in this documentation). + +**Note**: `` provides a bunch of legacy +atomic interfaces, but this header is considered obsolete +and these functions should not be used in new code. + + +High level overview of `os_atomic_*` interfaces +----------------------------------------------- + +### Compiler barriers and memory fences + +`os_compiler_barrier(mem_order?)` provides a compiler barrier, +with an optional barrier ordering. It is implemented with C11's +`atomic_signal_fence()`. The barrier ordering argument is optional +and defaults to the `acq_rel` compiler barrier (which prevents the +compiler to reorder code in any direction around this barrier). + +`os_atomic_thread_fence(mem_order)` provides a memory barrier +according to the semantics of `atomic_thread_fence()`. It always +implies the equivalent `os_compiler_barrier()` even on UP systems. + +### Init, load and store + +`os_atomic_init`, `os_atomic_load` and `os_atomic_store` provide +facilities equivalent to `atomic_init`, `atomic_load_explicit` +and `atomic_store_explicit` respectively. + +Note that `os_atomic_load` and `os_atomic_store` promise that they will +compile to a plain load or store. `os_atomic_load_wide` and +`os_atomic_store_wide` can be used to have access to atomic loads and store +that involve more costly codegen (such as compare exchange loops). + +### Basic RMW (read/modify/write) atomic operations + +The following basic atomic RMW operations exist: + +- `inc`: atomic increment (equivalent to an atomic add of `1`), +- `dec`: atomic decrement (equivalent to an atomic sub of `1`), +- `add`: atomic add, +- `sub`: atomic sub, +- `or`: atomic bitwise or, +- `xor`: atomic bitwise xor, +- `and`: atomic bitwise and, +- `andnot`: atomic bitwise andnot (equivalent to atomic and of ~value), +- `min`: atomic min, +- `max`: atomic max. + +For any such operation, two variants exist: + +- `os_atomic_${op}_orig` (for example `os_atomic_add_orig`) + which returns the value stored at the specified location + *before* the atomic operation took place +- `os_atomic_${op}` (for example `os_atomic_add`) which + returns the value stored at the specified location + *after* the atomic operation took place + +This convention is picked for two reasons: + +1. `os_atomic_add(p, value, ...)` is essentially equivalent to the C + in place addition `(*p += value)` which returns the result of the + operation and not the original value of `*p`. + +2. Most subtle atomic algorithms do actually require the original value + stored at the location, especially for bit manipulations: + `(os_atomic_or_orig(p, bit, relaxed) & bit)` will atomically perform + `*p |= bit` but also tell you whether `bit` was set in the original value. + + Making it more explicit that the original value is used is hence + important for readers and worth the extra five keystrokes. + +Typically: + +```c + static int _Atomic i = 0; + + printf("%d\n", os_atomic_inc_orig(&i)); // prints 0 + printf("%d\n", os_atomic_inc(&i)); // prints 2 +``` + +### Atomic swap / compare and swap + +`os_atomic_xchg` is a simple wrapper around `atomic_exchange_explicit`. + +There are two variants of `os_atomic_cmpxchg` which are wrappers around +`atomic_compare_exchange_strong_explicit`. Both of these variants will +return false/0 if the compare exchange failed, and true/1 if the expected +value was found at the specified location and the new value was stored. + +1. `os_atomic_cmpxchg(address, expected, new_value, mem_order)` which + will atomically store `new_value` at `address` if the current value + is equal to `expected`. + +2. `os_atomic_cmpxchgv(address, expected, new_value, orig_value, mem_order)` + which has an extra `orig_value` argument which must be a pointer to a local + variable and will be filled with the current value at `address` whether the + compare exchange was successful or not. In case of success, the loaded value + will always be `expected`, however in case of failure it will be filled with + the current value, which is helpful to redrive compare exchange loops. + +Unlike `atomic_compare_exchange_strong_explicit`, a single ordering is +specified, which only takes effect in case of a successful compare exchange. +In C11 speak, `os_atomic_cmpxchg*` always specifies `memory_order_relaxed` +for the failure case ordering, as it is what is used most of the time. + +There is no wrapper around `atomic_compare_exchange_weak_explicit`, +as `os_atomic_rmw_loop` offers a much better alternative for CAS-loops. + +### `os_atomic_rmw_loop` + +This expressive and versatile construct allows for really terse and +way more readable compare exchange loops. It also uses LL/SC constructs more +efficiently than a compare exchange loop would allow. + +Instead of a typical CAS-loop in C11: + +```c + int _Atomic *address; + int old_value, new_value; + bool success = false; + + old_value = atomic_load_explicit(address, memory_order_relaxed); + do { + if (!validate(old_value)) { + break; + } + new_value = compute_new_value(old_value); + success = atomic_compare_exchange_weak_explicit(address, &old_value, + new_value, memory_order_acquire, memory_order_relaxed); + } while (__improbable(!success)); +``` + +`os_atomic_rmw_loop` allows this form: + +```c + int _Atomic *address; + int old_value, new_value; + bool success; + + success = os_atomic_rmw_loop(address, old_value, new_value, acquire, { + if (!validate(old_value)) { + os_atomic_rmw_loop_give_up(break); + } + new_value = compute_new_value(old_value); + }); +``` + +Unlike the C11 variant, it lets the reader know in program order that this will +be a CAS loop, and exposes the ordering upfront, while for traditional CAS loops +one has to jump to the end of the code to understand what it does. + +Any control flow that attempts to exit its scope of the loop needs to be +wrapped with `os_atomic_rmw_loop_give_up` (so that LL/SC architectures can +abort their opened LL/SC transaction). + +Because these loops are LL/SC transactions, it is undefined to perform +any store to memory (register operations are fine) within these loops, +as these may cause the store-conditional to always fail. +In particular nesting of `os_atomic_rmw_loop` is invalid. + +Use of `continue` within an `os_atomic_rmw_loop` is also invalid, instead an +`os_atomic_rmw_loop_give_up(goto again)` jumping to an `again:` label placed +before the loop should be used in this way: + +```c + int _Atomic *address; + int old_value, new_value; + bool success; + +again: + success = os_atomic_rmw_loop(address, old_value, new_value, acquire, { + if (needs_some_store_that_can_thwart_the_transaction(old_value)) { + os_atomic_rmw_loop_give_up({ + // Do whatever you need to do/store to central memory + // that would cause the loop to always fail + do_my_rmw_loop_breaking_store(); + + // And only then redrive. + goto again; + }); + } + if (!validate(old_value)) { + os_atomic_rmw_loop_give_up(break); + } + new_value = compute_new_value(old_value); + }); +``` + +### the *dependency* memory order + +Because the C11 *consume* memory order is broken in various ways, +most compilers, clang included, implement it as an equivalent +for `memory_order_acquire`. However, its concept is useful +for certain algorithms. + +As an attempt to provide a replacement for this, `` +implements an entirely new *dependency* memory ordering. + +The purpose of this ordering is to provide a relaxed load followed by an +implicit compiler barrier, that can be used as a root for a chain of hardware +dependencies that would otherwise pair with store-releases done at this address, +very much like the *consume* memory order is intended to provide. + +However, unlike the *consume* memory ordering where the compiler had to follow +the dependencies, the *dependency* memory ordering relies on explicit +annotations of when the dependencies are expected: + +- loads through a pointer loaded with a *dependency* memory ordering + will provide a hardware dependency, + +- dependencies may be injected into other loads not performed through this + particular pointer with the `os_atomic_load_with_dependency_on` and + `os_atomic_inject_dependency` interfaces. + +Here is an example of how it is meant to be used: + +```c + struct foo { + long value; + long _Atomic flag; + }; + + void + publish(struct foo *p, long value) + { + p->value = value; + os_atomic_store(&p->flag, 1, release); + } + + + bool + broken_read(struct foo *p, long *value) + { + /* + * This isn't safe, as there's absolutely no hardware dependency involved. + * Using an acquire barrier would of course fix it but is quite expensive... + */ + if (os_atomic_load(&p->flag, relaxed)) { + *value = p->value; + return true; + } + return false; + } + + bool + valid_read(struct foo *p, long *value) + { + long flag = os_atomic_load(&p->flag, dependency); + if (flag) { + /* + * Further the chain of dependency to any loads through `p` + * which properly pair with the release barrier in `publish`. + */ + *value = os_atomic_load_with_dependency_on(&p->value, flag); + return true; + } + return false; + } +``` + +There are 4 interfaces involved with hardware dependencies: + +1. `os_atomic_load(..., dependency)` to initiate roots of hardware dependencies, + that should pair with a store or rmw with release semantics or stronger + (release, acq\_rel or seq\_cst), + +2. `os_atomic_inject_dependency` can be used to inject the dependency provided + by a *dependency* load, or any other value that has had a dependency + injected, + +3. `os_atomic_load_with_dependency_on` to do an otherwise related relaxed load + that still prolongs a dependency chain, + +4. `os_atomic_make_dependency` to create an opaque token out of a given + dependency root to inject into multiple loads. + + +**Note**: this technique is NOT safe when the compiler can reason about the +pointers that you are manipulating, for example if the compiler can know that +the pointer can only take a couple of values and ditch all these manually +crafted dependency chains. Hopefully there will be a future C2Y standard that +provides a similar construct as a language feature instead. diff --git a/iokit/DriverKit/IOBufferMemoryDescriptor.iig b/iokit/DriverKit/IOBufferMemoryDescriptor.iig new file mode 100644 index 000000000..449d66ea5 --- /dev/null +++ b/iokit/DriverKit/IOBufferMemoryDescriptor.iig @@ -0,0 +1,100 @@ +/* + * Copyright (c) 2019-2019 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#if !__IIG +#if KERNEL +#include +#endif +#endif + + +#ifndef _IOKIT_UIOBUFFERMEMORYDESCRIPTOR_H +#define _IOKIT_UIOBUFFERMEMORYDESCRIPTOR_H + +#include + +/*! + * @class IOBufferMemoryDescriptor + * + * @abstract + * IOBufferMemoryDescriptor describes a memory buffer allocated in the callers address space. + * + * @discussion + * To allocate memory for I/O or sharing, use IOBufferMemoryDescriptor::Create() + * Methods in this class are used for memory that was supplied as a parameter. + * IOBufferMemoryDescriptor can be handed to any API that expects an IOMemoryDescriptor. + */ + +class KERNEL IOBufferMemoryDescriptor : public IOMemoryDescriptor +{ +public: + + /*! + * @brief Create an IOBufferMemoryDescriptor. + * @param options Pass the flags kIOMemoryDirectionIn, kIOMemoryDirectionOut or kIOMemoryDirectionOutIn + * to set the direction of the i/o + * @param capacity Maximum length of the memory buffer. The descriptor has no valid data + * and zero length until set with SetLength(). + * @param memory Created descriptor with +1 retain count to be released by the caller. + * @param alignment For small less-than-page-size buffers, control the alignment of the memory buffer. + * Pass zero for no guaranteed alignment. + * @return kIOReturnSuccess on success. See IOReturn.h for error codes. + */ + static kern_return_t + Create( + uint64_t options, + uint64_t capacity, + uint64_t alignment, + IOBufferMemoryDescriptor ** memory); + + virtual bool + init() override; + + virtual void + free() override; + + /*! + * @brief Obtain the address and length of the memory buffer. + * @param range An IOAddressSegment structure filled out with the address and length of the memory buffer. + * @return kIOReturnSuccess on success. See IOReturn.h for error codes. + */ + kern_return_t + GetAddressRange(IOAddressSegment * range) LOCALONLY; + + /*! + * @brief Set the valid length of the memory buffer. + * @discussion IOBufferMemoryDescriptor have capacity allocated at Create() but no valid data until set + * with this method. + * @param length New valid length of the memory described. + * @return kIOReturnSuccess on success. See IOReturn.h for error codes. + */ + virtual kern_return_t + SetLength(uint64_t length); +}; + +#endif /* ! _IOKIT_UIOBUFFERMEMORYDESCRIPTOR_H */ diff --git a/iokit/DriverKit/IODataQueueDispatchSource.iig b/iokit/DriverKit/IODataQueueDispatchSource.iig new file mode 100644 index 000000000..1b4a0df26 --- /dev/null +++ b/iokit/DriverKit/IODataQueueDispatchSource.iig @@ -0,0 +1,210 @@ +/* + * Copyright (c) 2019-2019 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#ifndef _IOKIT_UIODATAQUEUEDISPATCHSOURCE_H +#define _IOKIT_UIODATAQUEUEDISPATCHSOURCE_H + +#include +#include + +typedef void (^IODataQueueClientEnqueueEntryBlock)(void *data, size_t dataSize); +typedef void (^IODataQueueClientDequeueEntryBlock)(const void *data, size_t dataSize); + +class NATIVE KERNEL IODataQueueDispatchSource : public IODispatchSource +{ +public: + + /*! + * @brief Create an IODataQueueDispatchSource for a shared memory data queue. + * @param queueByteCount The size of the queue in bytes. + * @param queue IODispatchQueue the source is attached to. Note that the DataAvailable + * and DataServiced handlers are invoked on the queue set for the target method + * of the OSAction, not this queue. + * @param source Created source with +1 retain count to be released by the caller. + * @return kIOReturnSuccess on success. See IOReturn.h for error codes. + */ + static kern_return_t + Create( + uint64_t queueByteCount, + IODispatchQueue * queue, + IODataQueueDispatchSource ** source); + + virtual bool + init() override; + + virtual void + free() override; + + /*! + * @brief As a consumer, set the handler block to run when the queue becomes non-empty. + * @param action OSAction instance specifying the callback method. The OSAction object will be retained + * until SetHandler is called again or the event source is cancelled. + * The DataAvailable handler is invoked on the queue set for the target method of the OSAction. + * @return kIOReturnSuccess on success. See IOReturn.h for error codes. + */ + virtual kern_return_t + SetDataAvailableHandler( + OSAction * action TYPE(DataAvailable)); + + /*! + * @brief As a producer, set the handler block to run when the queue becomes non-full, after an attempt + * to enqueue data failed. + * @param action OSAction instance specifying the callback method. The OSAction object will be retained + * until SetHandler is called again or the event source is cancelled. + * The DataServiced handler is invoked on the queue set for the target method of the OSAction. + * @return kIOReturnSuccess on success. See IOReturn.h for error codes. + */ + virtual kern_return_t + SetDataServicedHandler( + OSAction * action TYPE(DataServiced)); + + /*! + * @brief Control the enable state of the interrupt source. + * @param enable Pass true to enable the source or false to disable. + * @param handler Optional block to be executed after the interrupt has been disabled and any pending + * interrupt handlers completed. + * @return kIOReturnSuccess on success. See IOReturn.h for error codes. + */ + virtual kern_return_t + SetEnableWithCompletion( + bool enable, + IODispatchSourceCancelHandler handler) override LOCAL; + + /*! + * @brief Cancel all callbacks from the event source. + * @discussion After cancellation, the source can only be freed. It cannot be reactivated. + * @param handler Handler block to be invoked after any callbacks have completed. + * @return kIOReturnSuccess on success. See IOReturn.h for error codes. + */ + virtual kern_return_t + Cancel(IODispatchSourceCancelHandler handler) override LOCAL; + + + /*! + * @brief As a consumer, check if the data queue is non-empty. + * @return True if the queue is non-empty. + */ + bool + IsDataAvailable(void) LOCALONLY; + + /*! + * @brief As a consumer, get access to the next queue entry without dequeuing it. + * @param callback to invoked if the queue is non-empty with the next entry to be dequeued. + * @return kIOReturnSuccess if the callback was invoked. + * kIOReturnUnderrun if the queue was empty. + * kIOReturnError if the queue was corrupt. + */ + kern_return_t + Peek(IODataQueueClientDequeueEntryBlock callback) LOCALONLY; + + /*! + * @brief As a consumer, dequeue the next queue entry. + * @param callback invoked if the queue was non-empty with the entry that was dequeued. + * @return kIOReturnSuccess if the callback was invoked. + * kIOReturnUnderrun if the queue was empty. + * kIOReturnError if the queue was corrupt. + */ + kern_return_t + Dequeue(IODataQueueClientDequeueEntryBlock callback) LOCALONLY; + + /*! + * @brief As a producer, enqueue a queue entry. + * @param dataSize size of the data to enqueue. + * @param callback invoked if the queue has enough space to enqueue the data. + * @return kIOReturnSuccess if the callback was invoked. + * kIOReturnOverrun if the queue was full. + * kIOReturnError if the queue was corrupt. + */ + kern_return_t + Enqueue(uint32_t dataSize, IODataQueueClientEnqueueEntryBlock callback) LOCALONLY; + + /*! + * @brief As a consumer, dequeue the next queue entry, but don't send any DataServiced notification. + * @param sendDataServiced Flag that indicates a DataServiced notification would have sent. + * It should be initialized to false before a series of calls to this method, + * and if true after those calls, the notification sent with SendDataServiced(). + * @param callback invoked if the queue was non-empty with the entry that was dequeued. + * @return kIOReturnSuccess if the callback was invoked. + * kIOReturnUnderrun if the queue was empty. + * kIOReturnError if the queue was corrupt. + */ + kern_return_t + DequeueWithCoalesce(bool * sendDataServiced, IODataQueueClientDequeueEntryBlock callback) LOCALONLY; + + /*! + * @brief As a producer, enqueue a queue entry, but don't send any DataAvailable notification. + * @param dataSize size of the data to enqueue + * @param sendDataAvailable Flag that indicates a DataAvailable notification would have been sent. + * It should be initialized to false before a series of calls to this method, + * and if true after those calls, the notification sent with SendDataAvailable(). + * @param callback invoked if the queue has enough space to enqueue the data. + * @return kIOReturnSuccess if the callback was invoked. + * kIOReturnOverrun if the queue was full. + * kIOReturnError if the queue was corrupt. + */ + kern_return_t + EnqueueWithCoalesce(uint32_t dataSize, bool * sendDataAvailable, IODataQueueClientEnqueueEntryBlock callback) LOCALONLY; + + /*! + * @brief As a consumer, send the DataServiced notification indicated by DequeueWithCoalesce. + */ + void + SendDataServiced(void) LOCALONLY; + + /*! + * @brief As a producer, send the DataAvailable notification indicated by EnqueueWithCoalesce. + */ + void + SendDataAvailable(void) LOCALONLY; + +private: + virtual kern_return_t + CopyMemory( + IOMemoryDescriptor ** memory); + + virtual kern_return_t + CopyDataAvailableHandler( + OSAction ** action); + + virtual kern_return_t + CopyDataServicedHandler( + OSAction ** action); + + virtual kern_return_t + CheckForWork(bool synchronous) override LOCAL; + + virtual void + DataAvailable( + OSAction * action TARGET) LOCAL = 0; + + virtual void + DataServiced( + OSAction * action TARGET) LOCAL = 0; +}; + +#endif /* ! _IOKIT_UIODATAQUEUEDISPATCHSOURCE_H */ diff --git a/iokit/DriverKit/IODispatchQueue.iig b/iokit/DriverKit/IODispatchQueue.iig new file mode 100644 index 000000000..b9b501f8f --- /dev/null +++ b/iokit/DriverKit/IODispatchQueue.iig @@ -0,0 +1,146 @@ +/* + * Copyright (c) 2019-2019 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#ifndef _IOKIT_UIODISPATCHQUEUE_H +#define _IOKIT_UIODISPATCHQUEUE_H + +#include +#include +#include + +typedef int (*IODispatchLogFunction)(const char *format, ...); +typedef void (^IODispatchBlock)(void); +typedef void (*IODispatchFunction)(void * context); +typedef void (^IODispatchQueueCancelHandler)(void); + + +/*! + * @class IODispatchQueue + * + * @abstract + * IODispatchQueue provides a queue for ordered execution of blocks. + * + * @discussion + * All blocks submitted to dispatch queues are dequeued in FIFO order. + * By default the queue is serial and will execute one block at a time. + */ + +class NATIVE KERNEL IODispatchQueue : public OSObject +{ +public: + /*! + * @brief Creates a new dispatch queue object. + * @discussion Creates a new dispatch queue object. All queues are currently serial, executing one block at time + * FIFO order. The new object has retain count 1 and should be released by the caller. + * @param options No options are currently defined, pass zero. + * @param priority No priorities are currently defined, pass zero. + * @return kIOReturnSuccess on success. See IOReturn.h for error codes. + */ + static kern_return_t + Create( + const IODispatchQueueName name, + uint64_t options, + uint64_t priority, + IODispatchQueue ** queue) LOCAL; + + virtual bool + init() override; + + virtual void + free() override; + + /*! + * @brief Determines if the current thread is running on the queue. + * @discussion Determines if the current thread is running on the queue, including if the queue invoked a + * second queue (ie. OnQueue can return true for more than one queue in a given context.) + * @return bool true if current thread is running on this queue. + */ + bool + OnQueue() LOCALONLY; + + /*! + * @brief Return the name the queue was created with. + * @discussion Returns a pointer to the queues name. Only valid while the queue is retained. + * @return C-string pointer in the queues internal storage. + */ + const char * + GetName() LOCALONLY; + + /*! + * @brief Stop the queue from executing futher work. + * @discussion Stops the queue from dequeuing work, and on completion of any block currently being executed, + * invokes a callback block. Canceling is asynchronous. + * @param handler Block that will executed when the queue has completed any inflight work + * and will not execute further work. + * @return C-string pointer in the queues internal storage. + */ + kern_return_t + Cancel(IODispatchQueueCancelHandler handler) LOCALONLY; + + /*! + * @brief Schedule a block to be executed on the queue asynchronously. + * @discussion Schedules work to be done on the queue without waiting for it to complete. The queue will be + * retained until the block completes. + * @param block Block that will executed on the queue, not in the context of the caller. + */ + void + DispatchAsync(IODispatchBlock block) LOCALONLY; + + /*! + * @brief C-function callback version of DispatchAsync. + */ + void + DispatchAsync_f(void * context, IODispatchFunction function) LOCALONLY; + + void + DispatchSync(IODispatchBlock block) LOCALONLY; + + /*! + * @brief C-function callback version of DispatchSync. + */ + void + DispatchSync_f(void * context, IODispatchFunction function) LOCALONLY; + + /*! + * @brief Log the current execution context with respect to any queues the current thread holds. + * @param output printf like output function. The address of IOLog is suitable to be used. + */ + static void + Log(const char * message, IODispatchLogFunction output) LOCALONLY; +}; + +#if DRIVERKIT_PRIVATE +class EXTENDS (IODispatchQueue) IODispatchQueuePrivate +{ + virtual kern_return_t + SetPort( + mach_port_t port PORTMAKESEND); +}; +#endif + +#endif /* ! _IOKIT_UIODISPATCH_H */ diff --git a/iokit/DriverKit/IODispatchSource.iig b/iokit/DriverKit/IODispatchSource.iig new file mode 100644 index 000000000..46cea7446 --- /dev/null +++ b/iokit/DriverKit/IODispatchSource.iig @@ -0,0 +1,69 @@ +/* + * Copyright (c) 2019-2019 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#ifndef _IOKIT_UIODISPATCHSOURCE_H +#define _IOKIT_UIODISPATCHSOURCE_H + +#include + + +typedef void (^IODispatchSourceCancelHandler)(void); + +/*! + * @class IODispatchSource + + * @abstract + * IODispatchSource common base class for dispatch sources. + */ + +class NATIVE KERNEL IODispatchSource : public OSObject +{ +public: + + virtual bool + init() override; + + virtual void + free() override; + + virtual kern_return_t + Cancel(IODispatchSourceCancelHandler handler) = 0; + + virtual kern_return_t + SetEnableWithCompletion( + bool enable, + IODispatchSourceCancelHandler handler) = 0; + + virtual kern_return_t + CheckForWork(bool synchronous) INVOKEREPLY = 0; + + virtual kern_return_t + SetEnable(bool enable) LOCAL; +}; + +#endif /* ! _IOKIT_UIODISPATCHSOURCE_H */ diff --git a/iokit/DriverKit/IOInterruptDispatchSource.iig b/iokit/DriverKit/IOInterruptDispatchSource.iig new file mode 100644 index 000000000..ecd4b5c80 --- /dev/null +++ b/iokit/DriverKit/IOInterruptDispatchSource.iig @@ -0,0 +1,118 @@ +/* + * Copyright (c) 2019-2019 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#ifndef _IOKIT_UIOINTERRUPTDISPATCHSOURCE_H +#define _IOKIT_UIOINTERRUPTDISPATCHSOURCE_H + +#include +#include + +struct IOInterruptDispatchSourcePayload { + uint64_t time; + uint64_t count; +}; + +/*! + * @class IOInterruptDispatchSource + * + * @abstract + * IOInterruptDispatchSource delivers interrupts to a handler block on a dispatch queue. + * + * @discussion + * A driver can run code in response to an interrupt from a device, specified as an IOService + * and index. The code runs at normal thread level, but is notified with the mach_absolute_time + * the primary interrupt fired. For IOPCIDevices, only MSI interrupt sources are supported. + */ + +class NATIVE KERNEL IOInterruptDispatchSource : public IODispatchSource +{ +public: + + /*! + * @brief Create an IOInterruptDispatchSource for an interrupt by index from an IOService provider. + * @param provider The IOService object representing the HW device producing the interrupt. + * @param index Index for the interrupt. + * @param queue Target queue to run the handler block. + * @param source Created source with +1 retain count to be released by the caller. + * @return kIOReturnSuccess on success. See IOReturn.h for error codes. + */ + static kern_return_t + Create(IOService * provider, + uint32_t index, + IODispatchQueue * queue, + IOInterruptDispatchSource ** source) LOCAL; + + virtual bool + init() override; + + virtual void + free() override; + + /*! + * @brief Set the handler block to run when the interupt fires. + * @param action OSAction instance specifying the callback method. The OSAction object will be retained + * until SetHandler is called again or the event source is cancelled. + * @return kIOReturnSuccess on success. See IOReturn.h for error codes. + */ + virtual kern_return_t + SetHandler( + OSAction * action TYPE(InterruptOccurred)) LOCAL; + + /*! + * @brief Control the enable state of the interrupt source. + * @param enable Pass true to enable the source or false to disable. + * @param handler Optional block to be executed after the interrupt has been disabled and any pending + * interrupt handlers completed. + * @return kIOReturnSuccess on success. See IOReturn.h for error codes. + */ + virtual kern_return_t + SetEnableWithCompletion( + bool enable, + IODispatchSourceCancelHandler handler) override LOCAL; + + /*! + * @brief Cancel all callbacks from the event source. + * @discussion After cancellation, the source can only be freed. It cannot be reactivated. + * @param handler Handler block to be invoked after any callbacks have completed. + * @return kIOReturnSuccess on success. See IOReturn.h for error codes. + */ + virtual kern_return_t + Cancel(IODispatchSourceCancelHandler handler) override LOCAL; + +private: + virtual kern_return_t + CheckForWork(bool synchronous) override LOCAL; + + virtual void + InterruptOccurred( + OSAction * action TARGET, + uint64_t count, + uint64_t time) REPLY LOCAL; +}; + +#endif /* ! _IOKIT_UIOINTERRUPTDISPATCHSOURCE_H */ diff --git a/iokit/DriverKit/IOMemoryDescriptor.iig b/iokit/DriverKit/IOMemoryDescriptor.iig new file mode 100644 index 000000000..760d48cb1 --- /dev/null +++ b/iokit/DriverKit/IOMemoryDescriptor.iig @@ -0,0 +1,167 @@ +/* + * Copyright (c) 2019-2019 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#if !__IIG +#if KERNEL +#include +#endif +#endif + +#ifndef _IOKIT_UIOMEMORYDESCRIPTOR_H +#define _IOKIT_UIOMEMORYDESCRIPTOR_H + +#include + +class IOService; +class IOMemoryMap; + + +// IOMemoryDescriptor Create options +enum { + kIOMemoryDirectionIn = 0x00000001, + kIOMemoryDirectionOut = 0x00000002, + kIOMemoryDirectionOutIn = kIOMemoryDirectionIn | kIOMemoryDirectionOut, + kIOMemoryDirectionInOut = kIOMemoryDirectionOutIn, +}; + +// IOMemoryDescriptor CreateMapping options +enum { + kIOMemoryMapFixedAddress = 0x00000001, + kIOMemoryMapReadOnly = 0x00000002, + kIOMemoryMapCacheModeDefault = 0x00000000, + kIOMemoryMapCacheModeInhibit = 0x00000100, + kIOMemoryMapCacheModeCopyback = 0x00000200, + kIOMemoryMapCacheModeWriteThrough = 0x00000400, +}; + +struct IOAddressSegment { + uint64_t address; + uint64_t length; +}; + +struct IOMDPrivateState { + uint64_t length; + uint64_t options; +}; + +/*! + * @class IOMemoryDescriptor + * + * @abstract + * IOMemoryDescriptor describes a memory buffer. + * + * @discussion + * To allocate memory for I/O or sharing, use IOBufferMemoryDescriptor::Create() + * Methods in this class are used for memory that was supplied as a parameter. + * + +@iig implementation +#include +@iig end +*/ + +class KERNEL IOMemoryDescriptor : public OSObject +{ +public: + + + virtual bool + init() override; + + virtual void + free() override; + + /*! + * @brief Obtain the length of the memory described. + * @param returnLength Returned length. + * @return kIOReturnSuccess on success. See IOReturn.h for error codes. + */ + kern_return_t + GetLength( + uint64_t * returnLength) LOCALONLY; + + /*! + * @brief Create a mapping of the memory in the callers address space. + * @param options + * kIOMemoryMapFixedAddress map at the address requested + * kIOMemoryMapReadOnly create a read only mapping + * kIOMemoryMapCacheModeDefault default cache mode + * kIOMemoryMapCacheModeInhibit inhibited cache mode + * kIOMemoryMapCacheModeCopyback copyback cache mode + * kIOMemoryMapCacheModeWriteThrough write through cache mode + * @param address Requested address if kIOMemoryMapFixedAddress was passed + * @param offset Start offset of the mapping in the descriptor. + * @param length Pass zero to map the entire memory, or a value <= the length of the descriptor. + * @param alignment of the memory virtual mapping. Only zero for no alignment is supported. + * @param map Returned IOMemoryMap object with +1 retain count. + * It should be retained until the map is no longer required. + * @return kIOReturnSuccess on success. See IOReturn.h for error codes. + */ + virtual kern_return_t + CreateMapping( + uint64_t options, + uint64_t address, + uint64_t offset, + uint64_t length, + uint64_t alignment, + IOMemoryMap ** map); + +private: + virtual kern_return_t + PrepareForDMA( + uint64_t options, + IOService * device, + uint64_t offset, + uint64_t length, + + uint64_t * flags, + uint64_t * returnLength, + uint32_t * segmentsCount, + IOAddressSegment segments[32]); + + kern_return_t + Map( + uint64_t options, + uint64_t address, + uint64_t length, + uint64_t alignment, + + uint64_t * returnAddress, + uint64_t * returnLength) LOCALONLY; +}; + +class EXTENDS (IOMemoryDescriptor) IOMemoryDescriptorPrivate +{ + virtual kern_return_t + _CopyState( + IOMDPrivateState * state); +}; + + + +#endif /* ! _IOKIT_UIOMEMORYDESCRIPTOR_H */ diff --git a/iokit/DriverKit/IOMemoryMap.iig b/iokit/DriverKit/IOMemoryMap.iig new file mode 100644 index 000000000..716c87f09 --- /dev/null +++ b/iokit/DriverKit/IOMemoryMap.iig @@ -0,0 +1,97 @@ +/* + * Copyright (c) 2019-2019 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#if !__IIG +#if KERNEL +#include +#endif +#endif + +#ifndef _IOKIT_UIOMEMORYMAP_H +#define _IOKIT_UIOMEMORYMAP_H + +#include + +struct IOMemoryMapPrivateState { + uint64_t length; + uint64_t offset; + uint64_t options; + uint64_t address; +}; + +/*! + * @class IOMemoryMap + * + * @abstract + * IOMemoryMap describes a memory mapping created with IOMemoryDescriptor::CreateMapping() + * + * @discussion + * To allocate memory for I/O or sharing, use IOBufferMemoryDescriptor::Create() + * Methods in this class are used for memory that was supplied as a parameter. + */ + +class KERNEL IOMemoryMap : public OSObject +{ +public: + + virtual bool + init() override; + + virtual void + free() override; + + /*! + * @brief Obtain the address of the memory mapping. + * @return Address. + */ + uint64_t + GetAddress() LOCALONLY; + + /*! + * @brief Obtain the length of the memory mapping. + * @return Length. + */ + uint64_t + GetLength() LOCALONLY; + + /*! + * @brief Obtain the offset of the memory mapping. + * @return Offset. + */ + uint64_t + GetOffset() LOCALONLY; +}; + +class EXTENDS (IOMemoryMap) IOMemoryMapPrivate +{ + virtual kern_return_t + _CopyState( + IOMemoryMapPrivateState * state); +}; + +#endif /* ! _IOKIT_UIOMEMORYMAP_H */ diff --git a/iokit/DriverKit/IORPC.h b/iokit/DriverKit/IORPC.h new file mode 100644 index 000000000..0ae141595 --- /dev/null +++ b/iokit/DriverKit/IORPC.h @@ -0,0 +1,253 @@ +/* + * Copyright (c) 2018 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + + +#ifndef _IORPC_H +#define _IORPC_H + +#include + +#ifndef PLATFORM_DriverKit + +#include + +#else /* !PLATFORM_DriverKit */ + +#ifndef _MACH_MESSAGE_H_ +#define _MACH_MESSAGE_H_ + +#define MACH_MSG_TYPE_MOVE_RECEIVE 16 /* Must hold receive right */ +#define MACH_MSG_TYPE_MOVE_SEND 17 /* Must hold send right(s) */ +#define MACH_MSG_TYPE_MOVE_SEND_ONCE 18 /* Must hold sendonce right */ +#define MACH_MSG_TYPE_COPY_SEND 19 /* Must hold send right(s) */ +#define MACH_MSG_TYPE_MAKE_SEND 20 /* Must hold receive right */ +#define MACH_MSG_TYPE_MAKE_SEND_ONCE 21 /* Must hold receive right */ +#define MACH_MSG_TYPE_COPY_RECEIVE 22 /* NOT VALID */ +#define MACH_MSG_TYPE_DISPOSE_RECEIVE 24 /* must hold receive right */ +#define MACH_MSG_TYPE_DISPOSE_SEND 25 /* must hold send right(s) */ +#define MACH_MSG_TYPE_DISPOSE_SEND_ONCE 26 /* must hold sendonce right */ + +#define MACH_MSG_TYPE_PORT_NONE 0 + +#define MACH_MSG_PORT_DESCRIPTOR 0 +#define MACH_MSG_OOL_DESCRIPTOR 1 + +typedef unsigned int mach_msg_copy_options_t; + +#define MACH_MSG_PHYSICAL_COPY 0 +#define MACH_MSG_VIRTUAL_COPY 1 +#define MACH_MSG_ALLOCATE 2 + +typedef uint32_t natural_t; +typedef int32_t integer_t; + +typedef unsigned int mach_msg_type_name_t; +typedef unsigned int mach_msg_descriptor_type_t; + +#if KERNEL +typedef void * mach_port_t; +#define MACH_PORT_NULL NULL +#else /* !KERNEL */ +typedef natural_t mach_port_t; +#define MACH_PORT_NULL 0 +#endif /* !KERNEL */ + +typedef natural_t mach_port_name_t; + +typedef unsigned int mach_msg_bits_t; +typedef natural_t mach_msg_size_t; +typedef integer_t mach_msg_id_t; + +#pragma pack(push, 4) + +typedef struct{ + mach_msg_bits_t msgh_bits; + mach_msg_size_t msgh_size; + mach_port_t msgh_remote_port; + mach_port_t msgh_local_port; + mach_port_name_t msgh_voucher_port; + mach_msg_id_t msgh_id; +} mach_msg_header_t; + +typedef struct{ + mach_msg_size_t msgh_descriptor_count; +} mach_msg_body_t; + +typedef struct{ + mach_port_t name; +#if !(defined(KERNEL) && defined(__LP64__)) +// Pad to 8 bytes everywhere except the K64 kernel where mach_port_t is 8 bytes + mach_msg_size_t pad1; +#endif + unsigned int pad2 : 16; + mach_msg_type_name_t disposition : 8; + mach_msg_descriptor_type_t type : 8; +#if defined(KERNEL) + uint32_t pad_end; +#endif +} mach_msg_port_descriptor_t; + +typedef struct{ + void * address; +#if !defined(__LP64__) + mach_msg_size_t size; +#endif + int deallocate: 8; + mach_msg_copy_options_t copy: 8; + unsigned int pad1: 8; + mach_msg_descriptor_type_t type: 8; +#if defined(__LP64__) + mach_msg_size_t size; +#endif +#if defined(KERNEL) && !defined(__LP64__) + uint32_t pad_end; +#endif +} mach_msg_ool_descriptor_t; + +typedef struct{ + unsigned int val[80 / sizeof(int)]; +} mach_msg_max_trailer_t; + +#pragma pack(pop) + +#endif /* _MACH_MESSAGE_H_ */ + +#endif /* PLATFORM_DriverKit */ + +#if KERNEL +class IOUserServer; +#endif /* KERNEL */ + +typedef uint64_t OSObjectRef; + +enum { + kIORPCVersion190615 = (mach_msg_id_t) 0x4da2b68c, + kIORPCVersion190615Reply = (mach_msg_id_t) 0x4da2b68d, + +#if DRIVERKIT_PRIVATE + kIORPCVersion190501 = (mach_msg_id_t) 0xfe316a7a, + kIORPCVersion190501Reply = (mach_msg_id_t) 0xfe316a7b, + + kIORPCVersionCurrent = kIORPCVersion190615, + kIORPCVersionCurrentReply = kIORPCVersion190615Reply +#endif /* DRIVERKIT_PRIVATE */ +}; + +enum{ + kIORPCMessageRemote = 0x00000001, + kIORPCMessageLocalHost = 0x00000002, + kIORPCMessageKernel = 0x00000004, + kIORPCMessageOneway = 0x00000008, + kIORPCMessageObjectRefs = 0x00000010, + kIORPCMessageOnqueue = 0x00000020, + kIORPCMessageError = 0x00000040, + kIORPCMessageSimpleReply = 0x00000080, +}; + +enum{ + kIORPCMessageIDKernel = (1ULL << 63), +}; + +struct IORPCMessageMach { + mach_msg_header_t msgh; + mach_msg_body_t msgh_body; + mach_msg_port_descriptor_t objects[0]; +}; +typedef struct IORPCMessageMach IORPCMessageMach; + +struct IORPCMessage { + uint64_t msgid; + uint64_t flags; + uint64_t objectRefs; + OSObjectRef objects[0]; +}; +typedef struct IORPCMessage IORPCMessage; + +extern "C" IORPCMessage * +IORPCMessageFromMach(IORPCMessageMach * msg, bool reply); + +struct IORPCMessageErrorReturnContent { + IORPCMessage hdr; + kern_return_t result; + uint32_t pad; +}; + +#pragma pack(4) +struct IORPCMessageErrorReturn { + IORPCMessageMach mach; + IORPCMessageErrorReturnContent content; +}; +#pragma pack() + + +class OSMetaClassBase; +struct IORPC; +typedef kern_return_t (*OSDispatchMethod)(OSMetaClassBase * self, const IORPC rpc); + +struct IORPC { + IORPCMessageMach * message; + IORPCMessageMach * reply; + uint32_t sendSize; + uint32_t replySize; +}; +typedef struct IORPC IORPC; + +enum { + kOSClassCanRemote = 0x00000001, +}; + +struct OSClassDescription { + uint32_t descriptionSize; + + char name[96]; + char superName[96]; + + uint32_t methodOptionsSize; + uint32_t methodOptionsOffset; + uint32_t metaMethodOptionsSize; + uint32_t metaMethodOptionsOffset; + uint32_t queueNamesSize; + uint32_t queueNamesOffset; + uint32_t methodNamesSize; + uint32_t methodNamesOffset; + uint32_t metaMethodNamesSize; + uint32_t metaMethodNamesOffset; + + uint64_t flags; + + uint64_t resv1[8]; + + uint64_t methodOptions[0]; + uint64_t metaMethodOptions[0]; + + char dispatchNames[0]; + char methodNames[0]; + char metaMethodNames[0]; +}; + +#endif /* _IORPC_H */ diff --git a/iokit/DriverKit/IOReturn.h b/iokit/DriverKit/IOReturn.h new file mode 100644 index 000000000..5175ee7d0 --- /dev/null +++ b/iokit/DriverKit/IOReturn.h @@ -0,0 +1,197 @@ +/* + * Copyright (c) 1998-2002 Apple Computer, Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ +/* + * HISTORY + */ + +/* + * Core IOReturn values. Others may be family defined. + */ + +#ifndef __IOKIT_IORETURN_H +#define __IOKIT_IORETURN_H + +#ifdef __cplusplus +extern "C" { +#endif + +#ifndef PLATFORM_DriverKit + +#include + +#else /* PLATFORM_DriverKit */ + +#ifndef _MACH_ERROR_H_ +#define _MACH_ERROR_H_ + +typedef int kern_return_t; + +#define KERN_SUCCESS 0 + +/* + * error number layout as follows: + * + * hi lo + * | system(6) | subsystem(12) | code(14) | + */ + +#define err_none (kern_return_t)0 +#define ERR_SUCCESS (kern_return_t)0 + +#define err_system(x) ((signed)((((unsigned)(x))&0x3f)<<26)) +#define err_sub(x) (((x)&0xfff)<<14) + +#define err_get_system(err) (((err)>>26)&0x3f) +#define err_get_sub(err) (((err)>>14)&0xfff) +#define err_get_code(err) ((err)&0x3fff) + +#define err_max_system 0x3f + +#define system_emask (err_system(err_max_system)) +#define sub_emask (err_sub(0xfff)) +#define code_emask (0x3fff) + +#endif /* _MACH_ERROR_H_ */ + +#endif /* PLATFORM_DriverKit */ + +typedef kern_return_t IOReturn; + +#ifndef sys_iokit +#define sys_iokit err_system(0x38) +#endif /* sys_iokit */ +#define sub_iokit_common err_sub(0) +#define sub_iokit_usb err_sub(1) +#define sub_iokit_firewire err_sub(2) +#define sub_iokit_block_storage err_sub(4) +#define sub_iokit_graphics err_sub(5) +#define sub_iokit_networking err_sub(6) +#define sub_iokit_bluetooth err_sub(8) +#define sub_iokit_pmu err_sub(9) +#define sub_iokit_acpi err_sub(10) +#define sub_iokit_smbus err_sub(11) +#define sub_iokit_ahci err_sub(12) +#define sub_iokit_powermanagement err_sub(13) +#define sub_iokit_hidsystem err_sub(14) +#define sub_iokit_scsi err_sub(16) +#define sub_iokit_usbaudio err_sub(17) +#define sub_iokit_wirelesscharging err_sub(18) +//#define sub_iokit_pccard err_sub(21) +#ifdef PRIVATE +#define sub_iokit_nvme err_sub(28) +#endif +#define sub_iokit_thunderbolt err_sub(29) +#define sub_iokit_graphics_acceleration err_sub(30) +#define sub_iokit_keystore err_sub(31) +#ifdef PRIVATE +#define sub_iokit_smc err_sub(32) +#endif +#define sub_iokit_apfs err_sub(33) +#define sub_iokit_platform err_sub(0x2A) +#define sub_iokit_audio_video err_sub(0x45) +#define sub_iokit_cec err_sub(0x46) +#define sub_iokit_baseband err_sub(0x80) +#define sub_iokit_HDA err_sub(0xFE) +#define sub_iokit_hsic err_sub(0x147) +#define sub_iokit_sdio err_sub(0x174) +#define sub_iokit_wlan err_sub(0x208) +#define sub_iokit_appleembeddedsleepwakehandler err_sub(0x209) +#define sub_iokit_appleppm err_sub(0x20A) + +#define sub_iokit_vendor_specific err_sub(-2) +#define sub_iokit_reserved err_sub(-1) + +#define iokit_common_err(return ) (sys_iokit|sub_iokit_common|return) +#define iokit_family_err(sub, return ) (sys_iokit|sub|return) +#define iokit_vendor_specific_err(return ) (sys_iokit|sub_iokit_vendor_specific|return) + +#define kIOReturnSuccess KERN_SUCCESS // OK +#define kIOReturnError iokit_common_err(0x2bc) // general error +#define kIOReturnNoMemory iokit_common_err(0x2bd) // can't allocate memory +#define kIOReturnNoResources iokit_common_err(0x2be) // resource shortage +#define kIOReturnIPCError iokit_common_err(0x2bf) // error during IPC +#define kIOReturnNoDevice iokit_common_err(0x2c0) // no such device +#define kIOReturnNotPrivileged iokit_common_err(0x2c1) // privilege violation +#define kIOReturnBadArgument iokit_common_err(0x2c2) // invalid argument +#define kIOReturnLockedRead iokit_common_err(0x2c3) // device read locked +#define kIOReturnLockedWrite iokit_common_err(0x2c4) // device write locked +#define kIOReturnExclusiveAccess iokit_common_err(0x2c5) // exclusive access and + // device already open +#define kIOReturnBadMessageID iokit_common_err(0x2c6) // sent/received messages + // had different msg_id +#define kIOReturnUnsupported iokit_common_err(0x2c7) // unsupported function +#define kIOReturnVMError iokit_common_err(0x2c8) // misc. VM failure +#define kIOReturnInternalError iokit_common_err(0x2c9) // internal error +#define kIOReturnIOError iokit_common_err(0x2ca) // General I/O error +//#define kIOReturn???Error iokit_common_err(0x2cb) // ??? +#define kIOReturnCannotLock iokit_common_err(0x2cc) // can't acquire lock +#define kIOReturnNotOpen iokit_common_err(0x2cd) // device not open +#define kIOReturnNotReadable iokit_common_err(0x2ce) // read not supported +#define kIOReturnNotWritable iokit_common_err(0x2cf) // write not supported +#define kIOReturnNotAligned iokit_common_err(0x2d0) // alignment error +#define kIOReturnBadMedia iokit_common_err(0x2d1) // Media Error +#define kIOReturnStillOpen iokit_common_err(0x2d2) // device(s) still open +#define kIOReturnRLDError iokit_common_err(0x2d3) // rld failure +#define kIOReturnDMAError iokit_common_err(0x2d4) // DMA failure +#define kIOReturnBusy iokit_common_err(0x2d5) // Device Busy +#define kIOReturnTimeout iokit_common_err(0x2d6) // I/O Timeout +#define kIOReturnOffline iokit_common_err(0x2d7) // device offline +#define kIOReturnNotReady iokit_common_err(0x2d8) // not ready +#define kIOReturnNotAttached iokit_common_err(0x2d9) // device not attached +#define kIOReturnNoChannels iokit_common_err(0x2da) // no DMA channels left +#define kIOReturnNoSpace iokit_common_err(0x2db) // no space for data +//#define kIOReturn???Error iokit_common_err(0x2dc) // ??? +#define kIOReturnPortExists iokit_common_err(0x2dd) // port already exists +#define kIOReturnCannotWire iokit_common_err(0x2de) // can't wire down + // physical memory +#define kIOReturnNoInterrupt iokit_common_err(0x2df) // no interrupt attached +#define kIOReturnNoFrames iokit_common_err(0x2e0) // no DMA frames enqueued +#define kIOReturnMessageTooLarge iokit_common_err(0x2e1) // oversized msg received + // on interrupt port +#define kIOReturnNotPermitted iokit_common_err(0x2e2) // not permitted +#define kIOReturnNoPower iokit_common_err(0x2e3) // no power to device +#define kIOReturnNoMedia iokit_common_err(0x2e4) // media not present +#define kIOReturnUnformattedMedia iokit_common_err(0x2e5)// media not formatted +#define kIOReturnUnsupportedMode iokit_common_err(0x2e6) // no such mode +#define kIOReturnUnderrun iokit_common_err(0x2e7) // data underrun +#define kIOReturnOverrun iokit_common_err(0x2e8) // data overrun +#define kIOReturnDeviceError iokit_common_err(0x2e9) // the device is not working properly! +#define kIOReturnNoCompletion iokit_common_err(0x2ea) // a completion routine is required +#define kIOReturnAborted iokit_common_err(0x2eb) // operation aborted +#define kIOReturnNoBandwidth iokit_common_err(0x2ec) // bus bandwidth would be exceeded +#define kIOReturnNotResponding iokit_common_err(0x2ed) // device not responding +#define kIOReturnIsoTooOld iokit_common_err(0x2ee) // isochronous I/O request for distant past! +#define kIOReturnIsoTooNew iokit_common_err(0x2ef) // isochronous I/O request for distant future +#define kIOReturnNotFound iokit_common_err(0x2f0) // data was not found +#define kIOReturnInvalid iokit_common_err(0x1) // should never be seen + +#ifdef __cplusplus +} +#endif + +#endif /* ! __IOKIT_IORETURN_H */ diff --git a/iokit/DriverKit/IOService.iig b/iokit/DriverKit/IOService.iig new file mode 100644 index 000000000..5885850dc --- /dev/null +++ b/iokit/DriverKit/IOService.iig @@ -0,0 +1,251 @@ +/* + * Copyright (c) 2019-2019 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#if !__IIG +#if KERNEL +#include +#endif +#endif + +#ifndef _IOKIT_UIOSERVICE_H +#define _IOKIT_UIOSERVICE_H + +#include + +class IOMemoryDescriptor; +class IOBufferMemoryDescriptor; +class IOUserClient; + +typedef char IOServiceName[128]; +typedef char IOPropertyName[128]; +typedef char IORegistryPlaneName[128]; + +enum { + kIOServiceSearchPropertyParents = 0x00000001, +}; + +#define kIOServiceDefaultQueueName "Default" + +enum { + kIOServicePowerCapabilityOff = 0x00000000, + kIOServicePowerCapabilityOn = 0x00000002, + kIOServicePowerCapabilityLow = 0x00010000, +}; + +/*! + * @class IOService + * + * @abstract + * IOService represents an device or OS service in IOKit and DriverKit. + * + * @discussion + * IOKit provides driver lifecycle management through the IOService APIs. + * Drivers and devices are represented as subclasses of IOService. + * + +@iig implementation +#include +@iig end +*/ + +class KERNEL IOService : public OSObject +{ +public: + virtual bool + init() override; + + virtual void + free() override; + + /*! + * @brief First call made to a matched IOService. + * @discussion During matching IOKit will create an IOService object for successful matches. + * Start is the first call made to the new object. + * @param provider The IOService provider for the match. This should be OSRequiredCast to the expected class. + * The provider is retained by DriverKit for the duration of Start() and on successful Start() until + * IOService::Stop() is called. + * @return kIOReturnSuccess on success. See IOReturn.h for error codes. + */ + virtual kern_return_t + Start(IOService * provider) LOCAL; + + /*! + * @brief Terminate access to provider. + * @discussion During termination IOKit will teardown any IOService objects attached to a terminated provider. + * Stop should quiesce all activity and when complete, pass the call to super. After calling super, the + * provider is no longer valid and this object will likely be freed. + * @param provider The IOService provider for being terminated, one previously passed to Start + * @return kIOReturnSuccess on success. See IOReturn.h for error codes. + */ + virtual kern_return_t + Stop(IOService * provider) LOCAL; + + /*! + * @brief Obtain IOKit IORegistryEntryID. + * @param registryEntryID IORegistryEntryID for the IOKit object. + * @return kIOReturnSuccess on success. See IOReturn.h for error codes. + */ + virtual kern_return_t + GetRegistryEntryID(uint64_t * registryEntryID) LOCAL; + + /*! + * @brief Set the IORegistryEntry name. + * @param name Name for the IOKit object. The c-string will be copied. + * @return kIOReturnSuccess on success. See IOReturn.h for error codes. + */ + virtual kern_return_t + SetName( + const IOServiceName name); + + /*! + * @brief Start the matching process on the IOService object. + * @return kIOReturnSuccess on success. See IOReturn.h for error codes. + */ + virtual kern_return_t + RegisterService(); + + /*! + * @brief Set the IODispatchQueue for a given name on the IOService. + * @param name Name for the queue. The name may be referenced by methods in the .iig class definition + * with the QUEUENAME() attribute to indicate the method must be invoked on that queue. If a method + * is invoked before the queue is set for the name, the default queue is used. A default queue is + * created by DriverKit for every new IOService object with the name kIOServiceDefaultQueueName. + * @param queue Queue to be associated with the name on this IOService. + * @return kIOReturnSuccess on success. See IOReturn.h for error codes. + */ + virtual kern_return_t + SetDispatchQueue( + const IODispatchQueueName name, + IODispatchQueue * queue) override LOCAL; + + /*! + * @brief Obtain the IODispatchQueue for a given name on the IOService. + * @param name Name for the queue. + * @param queue Returned, retained queue or NULL. The caller should release this queue. + * @return kIOReturnSuccess on success. See IOReturn.h for error codes. + */ + virtual kern_return_t + CopyDispatchQueue( + const IODispatchQueueName name, + IODispatchQueue ** queue) override; + + /*! + * @brief Obtain the IOKit registry properties for the IOService. + * @param properties Returned, retained dictionary of properties or NULL. The caller should release this dictionary. + * @return kIOReturnSuccess on success. See IOReturn.h for error codes. + */ + virtual kern_return_t + CopyProperties( + OSDictionary ** properties); + + /*! + * @brief Obtain the an IOKit registry properties from the service or one of its parents. + * @param name Name of the property as a c-string. + * @param plane Name of the registry plane to be searched, if the option kIOServiceSearchPropertyParents + * is used. + * @param options Pass kIOServiceSearchPropertyParents to search for the property in the IOService and all + * its parents in the IOKit registry. + * @param property Returned, retained property object or NULL. The caller should release this property. + * @return kIOReturnSuccess on success. See IOReturn.h for error codes. + */ + virtual kern_return_t + SearchProperty( + const IOPropertyName name, + const IORegistryPlaneName plane, + uint64_t options, + OSContainer ** property); + + /*! + * @brief Send a dictionary of properties to an IOService. + * @discussion By default the method will fail. A DriverKit subclass or kernel class may implement this method. + * @param properties Dictionary of properties. + * @return kIOReturnSuccess on success. See IOReturn.h for error codes. + */ + virtual kern_return_t + SetProperties( + OSDictionary * properties); + + /*! + * @brief Notification of change in power state of a provider. + * @discussion DriverKit notifies of changes in power of a provider. The driver should make itself safe for + * the new state before passing the call to super. + * @param powerFlags The power capabilities of the new state. The values possible are: + * kIOServicePowerCapabilityOff the system will be entering sleep state + * kIOServicePowerCapabilityOn the device and system are fully powered + * kIOServicePowerCapabilityLow the device is in a reduced power state while the system is running + * @return kIOReturnSuccess on success. See IOReturn.h for error codes. + */ + virtual kern_return_t + SetPowerState( + uint32_t powerFlags) LOCAL; + + /*! + * @brief Allow provider to enter a low power state. + * @discussion A driver may allow a device to enter a lower power state. + * @param powerFlags The power capabilities of the new state. The values possible are: + * kIOServicePowerCapabilityLow the device is in a reduced power state while the system is running + * @return kIOReturnSuccess on success. See IOReturn.h for error codes. + */ + virtual kern_return_t + ChangePowerState( + uint32_t powerFlags); + + /*! + * @brief Request create a new user client for a client process. + * @discussion An application may request an IOUserClient be opened with the IOKit framework + * IOServiceOpen() call. The type parameter of that call is passed here. The driver should respond to + * the call by calling IOService::Create() with a plist entry describing the new user client object. + * @param type The type passed to IOServiceOpen(). + * @param userClient The object created by IOService::Create() + * @return kIOReturnSuccess on success. See IOReturn.h for error codes. + */ + virtual kern_return_t + NewUserClient( + uint32_t type, + IOUserClient ** userClient); + + /*! + * @brief Request to create an IOService object from a plist property. + * @discussion An IOService interface or IOUserClient subclass may be created from a plist property of the driver. + * The plist should contain the following IOKit matching keys: + * IOClass - kernel class of IOUserUserClient + * IOUserClass - DriverKit class to be instantiated + * IOServiceDEXTEntitlements - Array of entitlements to be checked against a user client owning task + * @param provider The provider of the new object. + * @param propertiesKey The name of the properties dictionary in this IOService + * @param result The created object retained, to be released by the caller. + * @return kIOReturnSuccess on success. See IOReturn.h for error codes. + */ + virtual kern_return_t + Create( + IOService * provider, + const IOPropertyName propertiesKey, + IOService ** result); +}; + +#endif /* ! _IOKIT_UIOSERVICE_H */ diff --git a/iokit/DriverKit/IOTypes.h b/iokit/DriverKit/IOTypes.h new file mode 100644 index 000000000..de2d357f6 --- /dev/null +++ b/iokit/DriverKit/IOTypes.h @@ -0,0 +1,297 @@ +/* + * Copyright (c) 1998-2012 Apple Computer, Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ +#ifndef __IOKIT_IOTYPES_H +#define __IOKIT_IOTYPES_H + +#ifndef PLATFORM_DriverKit + +#ifndef IOKIT +#define IOKIT 1 +#endif /* !IOKIT */ + +#if KERNEL +#include +#else +#include +#include +#endif + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +#ifndef NULL +#if defined (__cplusplus) +#ifdef XNU_KERNEL_PRIVATE +#define NULL nullptr +#else +#if __cplusplus >= 201103L && (defined(__arm__) || defined(__arm64__)) +#define NULL nullptr +#else +#define NULL 0 +#endif +#endif +#else +#define NULL ((void *)0) +#endif +#endif + +/* + * Simple data types. + */ +#include +#include + +#if KERNEL +#include +#endif + +typedef UInt32 IOOptionBits; +typedef SInt32 IOFixed; +typedef UInt32 IOVersion; +typedef UInt32 IOItemCount; +typedef UInt32 IOCacheMode; + +typedef UInt32 IOByteCount32; +typedef UInt64 IOByteCount64; + +typedef UInt32 IOPhysicalAddress32; +typedef UInt64 IOPhysicalAddress64; +typedef UInt32 IOPhysicalLength32; +typedef UInt64 IOPhysicalLength64; + +#if !defined(__arm__) && !defined(__i386__) +typedef mach_vm_address_t IOVirtualAddress; +#else +typedef vm_address_t IOVirtualAddress; +#endif + +#if !defined(__arm__) && !defined(__i386__) && !(defined(__x86_64__) && !defined(KERNEL)) && !(defined(__arm64__) && !defined(__LP64__)) +typedef IOByteCount64 IOByteCount; +#else +typedef IOByteCount32 IOByteCount; +#endif + +typedef IOVirtualAddress IOLogicalAddress; + +#if !defined(__arm__) && !defined(__i386__) && !(defined(__x86_64__) && !defined(KERNEL)) + +typedef IOPhysicalAddress64 IOPhysicalAddress; +typedef IOPhysicalLength64 IOPhysicalLength; +#define IOPhysical32( hi, lo ) ((UInt64) lo + ((UInt64)(hi) << 32)) +#define IOPhysSize 64 + +#else + +typedef IOPhysicalAddress32 IOPhysicalAddress; +typedef IOPhysicalLength32 IOPhysicalLength; +#define IOPhysical32( hi, lo ) (lo) +#define IOPhysSize 32 + +#endif + + +typedef struct{ + IOPhysicalAddress address; + IOByteCount length; +} IOPhysicalRange; + +typedef struct{ + IOVirtualAddress address; + IOByteCount length; +} IOVirtualRange; + +#if !defined(__arm__) && !defined(__i386__) +typedef IOVirtualRange IOAddressRange; +#else +typedef struct{ + mach_vm_address_t address; + mach_vm_size_t length; +} IOAddressRange; +#endif + +/* + * Map between #defined or enum'd constants and text description. + */ +typedef struct { + int value; + const char *name; +} IONamedValue; + + +/* + * Memory alignment -- specified as a power of two. + */ +typedef unsigned int IOAlignment; + +#define IO_NULL_VM_TASK ((vm_task_t)0) + + +/* + * Pull in machine specific stuff. + */ + +//#include + +#ifndef MACH_KERNEL + +#ifndef __IOKIT_PORTS_DEFINED__ +#define __IOKIT_PORTS_DEFINED__ +#ifdef KERNEL +#ifdef __cplusplus +class OSObject; +typedef OSObject * io_object_t; +#else +typedef struct OSObject * io_object_t; +#endif +#else /* KERNEL */ +typedef mach_port_t io_object_t; +#endif /* KERNEL */ +#endif /* __IOKIT_PORTS_DEFINED__ */ + +#include + +typedef io_object_t io_connect_t; +typedef io_object_t io_enumerator_t; +typedef io_object_t io_iterator_t; +typedef io_object_t io_registry_entry_t; +typedef io_object_t io_service_t; +typedef io_object_t uext_object_t; + +#define IO_OBJECT_NULL ((io_object_t) 0) + +#endif /* MACH_KERNEL */ + +// IOConnectMapMemory memoryTypes +enum { + kIODefaultMemoryType = 0 +}; + +enum { + kIODefaultCache = 0, + kIOInhibitCache = 1, + kIOWriteThruCache = 2, + kIOCopybackCache = 3, + kIOWriteCombineCache = 4, + kIOCopybackInnerCache = 5, + kIOPostedWrite = 6, + kIORealTimeCache = 7, + kIOPostedReordered = 8, +}; + +// IOMemory mapping options +enum { + kIOMapAnywhere = 0x00000001, + + kIOMapCacheMask = 0x00000f00, + kIOMapCacheShift = 8, + kIOMapDefaultCache = kIODefaultCache << kIOMapCacheShift, + kIOMapInhibitCache = kIOInhibitCache << kIOMapCacheShift, + kIOMapWriteThruCache = kIOWriteThruCache << kIOMapCacheShift, + kIOMapCopybackCache = kIOCopybackCache << kIOMapCacheShift, + kIOMapWriteCombineCache = kIOWriteCombineCache << kIOMapCacheShift, + kIOMapCopybackInnerCache = kIOCopybackInnerCache << kIOMapCacheShift, + kIOMapPostedWrite = kIOPostedWrite << kIOMapCacheShift, + kIOMapRealTimeCache = kIORealTimeCache << kIOMapCacheShift, + kIOMapPostedReordered = kIOPostedReordered << kIOMapCacheShift, + + kIOMapUserOptionsMask = 0x00000fff, + + kIOMapReadOnly = 0x00001000, + + kIOMapStatic = 0x01000000, + kIOMapReference = 0x02000000, + kIOMapUnique = 0x04000000, +#ifdef XNU_KERNEL_PRIVATE + kIOMap64Bit = 0x08000000, +#endif + kIOMapPrefault = 0x10000000, + kIOMapOverwrite = 0x20000000 +}; + +/*! @enum Scale Factors + * @discussion Used when a scale_factor parameter is required to define a unit of time. + * @constant kNanosecondScale Scale factor for nanosecond based times. + * @constant kMicrosecondScale Scale factor for microsecond based times. + * @constant kMillisecondScale Scale factor for millisecond based times. + * @constant kTickScale Scale factor for the standard (100Hz) tick. + * @constant kSecondScale Scale factor for second based times. */ + +enum { + kNanosecondScale = 1, + kMicrosecondScale = 1000, + kMillisecondScale = 1000 * 1000, + kSecondScale = 1000 * 1000 * 1000, + kTickScale = (kSecondScale / 100) +}; + +enum { + kIOConnectMethodVarOutputSize = -3 +}; + +/* compatibility types */ + +#ifndef KERNEL + +typedef unsigned int IODeviceNumber; + +#endif + +#ifdef __cplusplus +} +#endif + +#else /* !PLATFORM_DriverKit */ + +#include + +typedef uint32_t IOOptionBits; +typedef int32_t IOFixed; +typedef uint32_t IOVersion; +typedef uint32_t IOItemCount; +typedef uint32_t IOCacheMode; + +typedef uint32_t IOByteCount32; +typedef uint64_t IOByteCount64; +typedef IOByteCount64 IOByteCount; + +typedef uint32_t IOPhysicalAddress32; +typedef uint64_t IOPhysicalAddress64; +typedef uint32_t IOPhysicalLength32; +typedef uint64_t IOPhysicalLength64; + +typedef IOPhysicalAddress64 IOPhysicalAddress; +typedef IOPhysicalLength64 IOPhysicalLength; + +typedef uint64_t IOVirtualAddress; + +#endif /* PLATFORM_DriverKit */ + +#endif /* ! __IOKIT_IOTYPES_H */ diff --git a/iokit/DriverKit/IOUserClient.iig b/iokit/DriverKit/IOUserClient.iig new file mode 100644 index 000000000..5523bb271 --- /dev/null +++ b/iokit/DriverKit/IOUserClient.iig @@ -0,0 +1,271 @@ +/* + * Copyright (c) 2019-2019 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#if !__IIG +#if KERNEL +#include +#endif +#endif + +#ifndef _IOKIT_UIOUSERCLIENT_H +#define _IOKIT_UIOUSERCLIENT_H + +#include +#include + +class IOMemoryDescriptor; +class IOBufferMemoryDescriptor; + +enum { + kIOUserClientScalarArrayCountMax = 16, +}; +typedef uint64_t IOUserClientScalarArray[kIOUserClientScalarArrayCountMax]; + +enum { + kIOUserClientAsyncReferenceCountMax = 16, +}; +typedef uint64_t IOUserClientAsyncReferenceArray[kIOUserClientAsyncReferenceCountMax]; + +enum { + kIOUserClientAsyncArgumentsCountMax = 16, +}; +typedef uint64_t IOUserClientAsyncArgumentsArray[kIOUserClientAsyncArgumentsCountMax]; + +// CopyClientMemoryForType options +enum { + kIOUserClientMemoryReadOnly = 0x00000001, +}; + + +/*! @enum + * @abstract Constant to denote a variable length structure argument to IOUserClient. + * @constant kIOUserClientVariableStructureSize Use in the structures IOUserClientMethodDispatch to specify the size of the structure is variable. + */ +enum { + kIOUserClientVariableStructureSize = 0xffffffff +}; + + +enum { +#define IO_USER_CLIENT_METHOD_ARGUMENTS_CURRENT_VERSION 2 + kIOUserClientMethodArgumentsCurrentVersion = IO_USER_CLIENT_METHOD_ARGUMENTS_CURRENT_VERSION +}; + +/*! + * @struct IOUserClientMethodArguments + * @brief Holds arguments from IOKit.framework IOConnectMethod calls. + * @discussion Any argument may be passed as NULL if not passed by the caller. + * @field selector Selector argument to IOConnectMethod. + * @field scalarInput Array of scalars from caller. + * @field scalarInputCount Count of valid scalars in scalarInput. + * @field structureInput OSData object containing structure input from IOConnectMethod. + * @field structureInputDescriptor IOMemoryDescriptor containing structure input from IOConnectMethod. + * This parameter is only set for large structures, and if set structureInput will be NULL. + * @field scalarOutput Array of scalars to return to the caller. + * @field scalarOutputCount Count of scalars to return to the caller in scalarOutput. + * @field structureOutput An OSData to be returned to the caller as structure output. + * A reference will be consumed by the caller. It is an error to set this field if + * structureOutputDescriptor was passed in + * @field structureOutputDescriptor A IOMemoryDescriptor specified by the caller for structure output. + * @field structureOutputMaximumSize Maximum size of structure output specified by caller + * or kIOUserClientVariableStructureSize. + * @field completion For IOConnectAsyncMethod, an OSAction used to deliver async data to the caller. + * It is only retained during the invocation of ExternalMethod and should be retained if + * used beyond then. + */ + +struct IOUserClientMethodArguments { + uint64_t version; + uint64_t selector; + OSAction * completion; + const uint64_t * scalarInput; + uint32_t scalarInputCount; + OSData * structureInput; + IOMemoryDescriptor * structureInputDescriptor; + uint64_t * scalarOutput; + uint32_t scalarOutputCount; + OSData * structureOutput; + IOMemoryDescriptor * structureOutputDescriptor; + uint64_t structureOutputMaximumSize; + uint64_t __reserved[30]; +}; + +typedef kern_return_t (*IOUserClientMethodFunction)( + OSObject * target, + void * reference, + IOUserClientMethodArguments * arguments); + +/*! + * @struct IOUserClientMethodDispatch + * @brief Used to check fields in IOUserClientMethodArguments + * @field function to invoke after making the checks specified below. If NULL and all checks pass, + * kIOReturnNoCompletion will be returned for the caller to implement the method. + * @field checkCompletionExists + * if true completion field must be set, + * if false must be zero, + * if -1U don't care + * @field checkScalarInputCount + * if has value kIOUserClientVariableStructureSize don't care, + * otherwise must equal args->scalarInputCount + * @field checkStructureInputSize + * if has value kIOUserClientVariableStructureSize don't care, + * otherwise must equal length of structureInput or structureInputDescriptor + * @field checkScalarOutputCount + * if has value kIOUserClientVariableStructureSize don't care, + * otherwise must equal args->scalarOutputCount + * @field checkStructureOutputSize + * if has value kIOUserClientVariableStructureSize don't care, + * otherwise must equal length of structureOutputMaximumSize + */ + +struct IOUserClientMethodDispatch { + IOUserClientMethodFunction function; + uint32_t checkCompletionExists; + uint32_t checkScalarInputCount; + uint32_t checkStructureInputSize; + uint32_t checkScalarOutputCount; + uint32_t checkStructureOutputSize; +}; + +/*! + * @class IOUserClient + * + * @abstract + * IOUserClient represents a connection opened by IOServiceOpen in the IOKit.framework. + * + * @discussion + * An application may open an IOUserClient by calling IOServiceOpen(). This results in a call + * to the IOService::NewUserClient API to create an instance representing the connection. + * and to receive untyped data via IOConnectMethod/IOConnectAsyncMethod. + * As an IOService subclass, IOUserClient receives the normal Start()/Stop() lifecyle calls. + * + +@iig implementation +#include +@iig end +*/ + +class KERNEL IOUserClient : public IOService +{ +public: + virtual bool + init() override; + + virtual void + free() override; + + /*! + * @brief Receive arguments from IOKit.framework IOConnectMethod calls. + * @discussion IOConnectMethod calls from the owner of the connection come here. + * Any argument may be passed as NULL if not passed by the caller. + * @param selector Selector argument to IOConnectMethod. + * @param scalarInput Array of scalars from caller. + * @param scalarInputCount Count of valid scalars in scalarInput. + * @param structureInput OSData object containing structure input from IOConnectMethod. + * @param structureInputDescriptor IOMemoryDescriptor containing structure input from IOConnectMethod. + * This parameter is only set for large structures, and if set structureInput will be NULL. + * @param scalarOutput Array of scalars to return to the caller. + * @param scalarOutputCount Count of scalars to return to the caller in scalarOutput. + * @param structureOutput An OSData to be returned to the caller as structureOutput. + * A reference will be consumed by the caller. + * @param structureOutputDescriptor An IOMemoryDescriptor to be returned to the caller as structureOutput. + * A reference will be consumed by the caller. + * Only one of structureOutput and structureOutputDescriptor may set. + * @param completion For IOConnectAsyncMethod, an OSAction used to deliver async data to the caller. + * It should be passed to the AsyncCompletion() method and released. + * @return kIOReturnSuccess on success. See IOReturn.h for error codes. + */ + + virtual kern_return_t + ExternalMethod( + uint64_t selector, + IOUserClientMethodArguments * arguments, + const IOUserClientMethodDispatch * dispatch, + OSObject * target, + void * reference) LOCALONLY; + + + /*! + * @brief Send asynchronous arguments to a completion supplied by ExternalMethod(). + * @discussion IOConnectAsyncMethod calls from the owner of the connection come will pass an OSAction instance. + * To deliver the asynchronous results the driver calls AsyncCompletion(). + * @param action OSAction passed to IOExternalMethod(). + * @param status An IOReturn status value to be sent. + * @param asyncData An array of scalar data to be sent. + * @param asyncDataCount Count of valid data in asyncData. + */ + virtual void + AsyncCompletion( + OSAction * action TARGET, + IOReturn status, + const IOUserClientAsyncArgumentsArray asyncData, + uint32_t asyncDataCount) = 0; + + /*! + * @brief Return an IOMemoryDescriptor to be mapped into the client task. + * @discussion IOConnectMapMemory()/UnmapMemory() will result in a call to this method to obtain + * an IOMemoryDescriptor instance for shared memory. For a given IOUserClient instance, calling + * CopyClientMemoryForType() with a given type, should return the same IOMemoryDescriptor instance. + * @param type Type parameter IOConnectMapMemory()/UnmapMemory(). + * @param options Set kIOUserClientMemoryReadOnly for memory to be mapped read only in the client. + * @param memory An instance of IOMemoryDescriptor on success. One reference will be consumed by the caller + * of this method. + * @return kIOReturnSuccess on success. See IOReturn.h for error codes. + */ + virtual kern_return_t + CopyClientMemoryForType( + uint64_t type, + uint64_t * options, + IOMemoryDescriptor ** memory) = 0; + +private: + virtual kern_return_t + _ExternalMethod( + uint64_t selector, + const IOUserClientScalarArray scalarInput, + uint32_t scalarInputCount, + OSData * structureInput, + IOMemoryDescriptor * structureInputDescriptor, + IOUserClientScalarArray scalarOutput, + uint32_t * scalarOutputCount, + uint64_t structureOutputMaximumSize, + OSData ** structureOutput, + IOMemoryDescriptor * structureOutputDescriptor, + OSAction * completion TYPE(IOUserClient::AsyncCompletion)) LOCAL; + + virtual void + KernelCompletion( + OSAction * action TARGET, + IOReturn status, + const IOUserClientAsyncArgumentsArray asyncData, + uint32_t asyncDataCount) + KERNEL + TYPE(IOUserClient::AsyncCompletion); +}; + +#endif /* ! _IOKIT_UIOUSERCLIENT_H */ diff --git a/iokit/DriverKit/IOUserServer.iig b/iokit/DriverKit/IOUserServer.iig new file mode 100644 index 000000000..ca946c446 --- /dev/null +++ b/iokit/DriverKit/IOUserServer.iig @@ -0,0 +1,66 @@ +/* + * Copyright (c) 2019-2019 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#ifndef _IOKIT_UIOUSERSERVER_H +#define _IOKIT_UIOUSERSERVER_H + +#include +#include +#include + + +/*! +@iig implementation +#include +@iig end +*/ + +class KERNEL IOUserServer : public IOService +{ +public: + static kern_return_t + Create( + const char name[64], + uint64_t tag, + uint64_t options, + IOUserServer ** server); + + virtual bool + init() override; + + virtual void + free() override; + + virtual kern_return_t + Exit(const char reason[1024]) LOCAL; + + virtual kern_return_t + LoadModule(const char path[1024]) LOCAL; +}; + +#endif /* ! _IOKIT_UIOUSERSERVER_H */ diff --git a/iokit/DriverKit/Makefile b/iokit/DriverKit/Makefile new file mode 100644 index 000000000..62ab74bc2 --- /dev/null +++ b/iokit/DriverKit/Makefile @@ -0,0 +1,50 @@ +export MakeInc_cmd=${SRCROOT}/makedefs/MakeInc.cmd +export MakeInc_def=${SRCROOT}/makedefs/MakeInc.def +export MakeInc_rule=${SRCROOT}/makedefs/MakeInc.rule +export MakeInc_dir=${SRCROOT}/makedefs/MakeInc.dir + +IIG_INCFLAGS = -I$(SRCROOT)/iokit -I$(SRCROOT)/osfmk -I$(SRCROOT)/bsd -I$(OBJROOT)/bsd $(INCFLAGS_EXTERN) +OTHER_IIG_CFLAGS = $(IIG_INCFLAGS) -isysroot $(SDKROOT) -x c++ -std=gnu++1z -D__IIG=1 -DDRIVERKIT_PRIVATE=1 $(DEPLOYMENT_TARGET_DEFINES) $($(addsuffix $(CURRENT_ARCH_CONFIG),ARCH_FLAGS_)) + +INCDIR = $(FRAMEDIR)/$(DKIT_INCDIR) +DRIVERKITINCDIR = $(DRIVERKITFRAMEDIR)/$(DRIVERKIT_DKIT_INCDIR) +LCLDIR = $(FRAMEDIR)/$(DKIT_PINCDIR) + +include $(MakeInc_cmd) +include $(MakeInc_def) + +ALL_DEFS = $(notdir $(wildcard $(SOURCE)*.iig)) +ALL_HDRS = $(notdir $(wildcard $(SOURCE)*.h)) + +EXPINC_SUBDIRS = ${INSTINC_SUBDIRS} + +INSTALL_MI_DIR = . + +EXPORT_MI_DIR = DriverKit + +GENERATED_HEADERS = $(patsubst %.iig,%.h,$(ALL_DEFS)) + +GENERATED_IMPL = $(patsubst %.iig,%.iig.cpp,$(ALL_DEFS)) + +INSTALL_MI_LIST = $(ALL_DEFS) +INSTALL_DRIVERKIT_MI_LIST = $(ALL_DEFS) + +OTHER_HEADERS = IOTypes.h IOReturn.h IORPC.h + +EXPORT_MI_GEN_LIST = $(GENERATED_HEADERS) $(OTHER_HEADERS) +INSTALL_MI_GEN_LIST = $(GENERATED_HEADERS) $(OTHER_HEADERS) +INSTALL_DRIVERKIT_MI_GEN_LIST = $(GENERATED_HEADERS) $(OTHER_HEADERS) +INSTALL_KF_MI_GEN_LIST = $(GENERATED_HEADERS) $(OTHER_HEADERS) + +COMP_FILES = ${GENERATED_HEADERS} $(GENERATED_IMPL) + +$(GENERATED_HEADERS) : \ + %.h : %.iig + $(IIG) --def $< --header $@ --impl $(patsubst %.h,%.iig.cpp,$@) --framework-name DriverKit ${OTHER_IIG_FLAGS} -- ${OTHER_IIG_CFLAGS} + +$(GENERATED_IMPL) : $(GENERATED_HEADERS) + +do_build_all:: $(COMP_FILES) + +include $(MakeInc_rule) +include $(MakeInc_dir) diff --git a/iokit/DriverKit/OSAction.iig b/iokit/DriverKit/OSAction.iig new file mode 100644 index 000000000..999205c4e --- /dev/null +++ b/iokit/DriverKit/OSAction.iig @@ -0,0 +1,112 @@ +/* + * Copyright (c) 2019-2019 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#ifndef _IOKIT_OSACTION_H +#define _IOKIT_OSACTION_H + +#include + +typedef void (^OSActionCancelHandler)(void); +typedef void (^OSActionAbortedHandler)(void); + +/*! + * @class OSAction + * + * @abstract + * OSAction is an object that represents a callback to be be invoked. + * + * @discussion + * The callback is specified as a method and object pair. + * State associated with the callback may be allocated and stored for the creator of the object. + * Methods to allocate an OSAction instance are generated for each method defined in a class with + * a TYPE attribute, so there should not be any need to directly call OSAction::Create(). + */ + +class NATIVE KERNEL OSAction : public OSObject +{ +public: + + /*! + * @brief Create an instance of OSAction. + * @discussion Methods to allocate an OSAction instance are generated for each method defined in a class with + * a TYPE attribute, so there should not be any need to directly call OSAction::Create(). + * @param target OSObject to receive the callback. This object will be retained until the OSAction is + * canceled or freed. + * @param targetmsgid Generated message ID for the target method. + * @param msgid Generated message ID for the method invoked by the receiver of the OSAction + * to generate the callback. + * @param referenceSize Size of additional state structure available to the creator of the OSAction + * with GetReference. + * @param action Created OSAction with +1 retain count to be released by the caller. + * @return kIOReturnSuccess on success. See IOReturn.h for error codes. + */ + static kern_return_t + Create( + OSObject * target, + uint64_t targetmsgid, + uint64_t msgid, + size_t referenceSize, + OSAction ** action) LOCAL; + + virtual void + free() override; + + /*! + * @brief Return a pointer to any state allocated by the OSAction creator. + * @discussion Reference data is allocated with zero initialized content. It may be set and retrieved later + * with this method. + * @return A pointer to storage for the owner. It will be NULL if referenceSize was zero, and NULL + * when called in a process other than the owner that is receiving the OSAction as a parameter. + */ + void * + GetReference() LOCALONLY; + + /*! + * @brief Cancel all callbacks from the action. + * @discussion After cancellation, the action can only be freed. It cannot be reactivated. + * @param handler Handler block to be invoked after any callbacks have completed. + * @return kIOReturnSuccess on success. See IOReturn.h for error codes. + */ + kern_return_t + Cancel(OSActionCancelHandler handler) LOCALONLY; + + /*! + * @brief Install a handler to be invoked when no other processes reference the action. + * @discussion When all tasks other than the creator release their references to the action, + * invoke the handler in the owner. A task exiting will always remove its references. + * @param handler Handler block to be invoked on no more references. + * @return kIOReturnSuccess on success. See IOReturn.h for error codes. + */ + kern_return_t + SetAbortedHandler(OSActionAbortedHandler handler) LOCALONLY; + + virtual void + Aborted(void) LOCAL; +}; + +#endif /* ! _IOKIT_OSACTION_H */ diff --git a/iokit/DriverKit/OSObject.iig b/iokit/DriverKit/OSObject.iig new file mode 100644 index 000000000..38b55fab3 --- /dev/null +++ b/iokit/DriverKit/OSObject.iig @@ -0,0 +1,171 @@ +/* + * Copyright (c) 2019-2019 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#if !__IIG +#if KERNEL +#include +#endif +#endif + +#ifndef _IOKIT_UOSOBJECT_H +#define _IOKIT_UOSOBJECT_H + +#if !KERNEL +#include +#include +#include +#if DRIVERKIT_PRIVATE +#include +#endif +#if !__IIG +#include +#include +#endif +class OSObject; +typedef OSObject * OSObjectPtr; +#endif + +#if __IIG && !__IIG_ATTRIBUTES_DEFINED__ + +#define __IIG_ATTRIBUTES_DEFINED__ 1 + +#define KERNEL __attribute__((annotate("kernel"))) +#define NATIVE __attribute__((annotate("native"))) +#define LOCAL __attribute__((annotate("local"))) +#define LOCALONLY __attribute__((annotate("localonly"))) +#define REMOTE __attribute__((annotate("remote"))) + +#define LOCALHOST __attribute__((annotate("localhost"))) + +#define INVOKEREPLY __attribute__((annotate("invokereply"))) +#define REPLY __attribute__((annotate("reply"))) + +#define PORTMAKESEND __attribute__((annotate("MACH_MSG_TYPE_MAKE_SEND"))) +#define PORTCOPYSEND __attribute__((annotate("MACH_MSG_TYPE_COPY_SEND"))) + +#define TARGET __attribute__((annotate("target"))) +#define TYPE(p) __attribute__((annotate("type=" # p))) + +//#define ARRAY(maxcount) __attribute__((annotate(# maxcount), annotate("array"))) +#define EXTENDS(cls) __attribute__((annotate("extends=" # cls))) + +//#define INTERFACE __attribute__((annotate("interface"))) +//#define IMPLEMENTS(i) void implements(i *); + +#define QUEUENAME(name) __attribute__((annotate("queuename=" # name))) + +#define IIG_SERIALIZABLE __attribute__((annotate("serializable"))) + +#else + +#define IIG_SERIALIZABLE + +#endif /* __IIG */ + + +#if !__IIG +#if KERNEL +typedef OSObject OSContainer; +#else +class IIG_SERIALIZABLE OSContainer; +#endif +#else +class IIG_SERIALIZABLE OSContainer; +#endif + +class IIG_SERIALIZABLE OSData; +class IIG_SERIALIZABLE OSNumber; +class IIG_SERIALIZABLE OSString; +class IIG_SERIALIZABLE OSBoolean; +class IIG_SERIALIZABLE OSDictionary; +class IIG_SERIALIZABLE OSArray; + +class OSMetaClass; +class IODispatchQueue; +typedef char IODispatchQueueName[256]; + +#if __IIG +class OSMetaClassBase +{ + virtual const OSMetaClass * + getMetaClass() const LOCALONLY; + + virtual void + retain() const LOCALONLY; + + virtual void + release() const LOCALONLY; + + virtual bool + isEqualTo(const OSMetaClassBase * anObject) const LOCALONLY; +}; +#endif /* __IIG */ + + +/*! +@iig implementation +#include +@iig end +*/ + +class OSObject : public OSMetaClassBase +{ +public: + + virtual bool + init() LOCALONLY; + + virtual void + free() LOCALONLY; + + virtual void + retain() const override; + + virtual void + release() const override; + + virtual kern_return_t + SetDispatchQueue( + const IODispatchQueueName name, + IODispatchQueue * queue) KERNEL = 0; + + virtual kern_return_t + CopyDispatchQueue( + const IODispatchQueueName name, + IODispatchQueue ** queue) KERNEL = 0; +}; + +#define DEFN(classname, name) \ +name ## _Impl(classname ## _ ## name ## _Args) + +#define IMPL(classname, name) \ +classname :: DEFN(classname, name) + +/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ + +#endif /* ! _IOKIT_UOSOBJECT_H */ diff --git a/iokit/Families/IOSystemManagement/IOWatchDogTimer.cpp b/iokit/Families/IOSystemManagement/IOWatchDogTimer.cpp index 5fd5f6a8c..a900642c5 100644 --- a/iokit/Families/IOSystemManagement/IOWatchDogTimer.cpp +++ b/iokit/Families/IOSystemManagement/IOWatchDogTimer.cpp @@ -59,7 +59,7 @@ IOWatchDogTimer::start(IOService *provider) } notifier = registerSleepWakeInterest(IOWatchDogTimerSleepHandler, this); - if (notifier == 0) { + if (notifier == NULL) { return false; } @@ -92,7 +92,7 @@ IOWatchDogTimer::setProperties(OSObject *properties) } theNumber = OSDynamicCast(OSNumber, properties); - if (theNumber == 0) { + if (theNumber == NULL) { return kIOReturnBadArgument; } diff --git a/iokit/IOKit/IOBSD.h b/iokit/IOKit/IOBSD.h index 5135d166a..0df8690ef 100644 --- a/iokit/IOKit/IOBSD.h +++ b/iokit/IOKit/IOBSD.h @@ -61,8 +61,17 @@ enum{ extern void IOBSDMountChange(struct mount * mp, uint32_t op); extern boolean_t IOTaskHasEntitlement(task_t task, const char * entitlement); +typedef enum { + kIOPolledCoreFileModeNotInitialized, + kIOPolledCoreFileModeDisabled, + kIOPolledCoreFileModeClosed, + kIOPolledCoreFileModeStackshot, + kIOPolledCoreFileModeCoredump, +} IOPolledCoreFileMode_t; + extern struct IOPolledFileIOVars * gIOPolledCoreFileVars; extern kern_return_t gIOPolledCoreFileOpenRet; +extern IOPolledCoreFileMode_t gIOPolledCoreFileMode; #ifdef __cplusplus } diff --git a/iokit/IOKit/IOBufferMemoryDescriptor.h b/iokit/IOKit/IOBufferMemoryDescriptor.h index e025e01a3..112bf79fd 100644 --- a/iokit/IOKit/IOBufferMemoryDescriptor.h +++ b/iokit/IOKit/IOBufferMemoryDescriptor.h @@ -29,6 +29,7 @@ #define _IOBUFFERMEMORYDESCRIPTOR_H #include +#include enum { kIOMemoryPhysicallyContiguous = 0x00000010, @@ -58,7 +59,7 @@ enum { class IOBufferMemoryDescriptor : public IOGeneralMemoryDescriptor { - OSDeclareDefaultStructors(IOBufferMemoryDescriptor); + OSDeclareDefaultStructorsWithDispatch(IOBufferMemoryDescriptor); private: /*! @struct ExpansionData @@ -140,6 +141,13 @@ public: vm_offset_t alignment) APPLE_KEXT_DEPRECATED; /* use withOptions() instead */ #endif /* !__LP64__ */ + static IOBufferMemoryDescriptor * withCopy( + task_t inTask, + IOOptionBits options, + vm_map_t sourceMap, + mach_vm_address_t source, + mach_vm_size_t size); + static IOBufferMemoryDescriptor * withOptions( IOOptionBits options, vm_size_t capacity, vm_offset_t alignment = 1); diff --git a/iokit/IOKit/IOCPU.h b/iokit/IOKit/IOCPU.h index b1e6ca63b..a52591e4f 100644 --- a/iokit/IOKit/IOCPU.h +++ b/iokit/IOKit/IOCPU.h @@ -73,6 +73,8 @@ protected: public: virtual bool start(IOService *provider) APPLE_KEXT_OVERRIDE; + virtual void detach(IOService *provider) APPLE_KEXT_OVERRIDE; + virtual OSObject *getProperty(const OSSymbol *aKey) const APPLE_KEXT_OVERRIDE; virtual bool setProperty(const OSSymbol *aKey, OSObject *anObject) APPLE_KEXT_OVERRIDE; virtual bool serializeProperties(OSSerialize *serialize) const APPLE_KEXT_OVERRIDE; diff --git a/iokit/IOKit/IOCatalogue.h b/iokit/IOKit/IOCatalogue.h index a9f7e3b44..682625f43 100644 --- a/iokit/IOKit/IOCatalogue.h +++ b/iokit/IOKit/IOCatalogue.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 1998-2016 Apple Inc. All rights reserved. + * Copyright (c) 1998-2019 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -47,6 +47,10 @@ class IOService; +extern const OSSymbol * gIOModuleIdentifierKey; +extern const OSSymbol * gIOModuleIdentifierKernelKey; + + /*! * @class IOCatalogue * @abstract In-kernel database for IOKit driver personalities. @@ -54,7 +58,7 @@ class IOService; */ class IOCatalogue : public OSObject { - OSDeclareDefaultStructors(IOCatalogue) + OSDeclareDefaultStructors(IOCatalogue); private: IORWLock * lock; @@ -125,36 +129,21 @@ public: */ SInt32 getGenerationCount( void ) const; -/*! - * @function isModuleLoaded - * @abstract Reports if a kernel module has been loaded. - * @param moduleName Name of the module. - * @result Returns true if the associated kernel module has been loaded into the kernel. - */ - bool isModuleLoaded( OSString * moduleName ) const; - -/*! - * @function isModuleLoaded - * @abstract Reports if a kernel module has been loaded. - * @param moduleName Name of the module. - * @result Returns true if the associated kernel module has been loaded into the kernel. - */ - bool isModuleLoaded( const char * moduleName ) const; - /*! * @function isModuleLoaded * @abstract Reports if a kernel module has been loaded for a particular personality. * @param driver A driver personality's property list. + * @param kextRef A reference to the kext getting loaded. * @result Returns true if the associated kernel module has been loaded into the kernel for a particular driver personality on which it depends. */ - bool isModuleLoaded( OSDictionary * driver ) const; + bool isModuleLoaded( OSDictionary * driver, OSObject ** kextRef ) const; /*! * @function moduleHasLoaded * @abstract Callback function called after a IOKit dependent kernel module is loaded. * @param name Name of the kernel module. */ - void moduleHasLoaded( OSString * name ); + void moduleHasLoaded( const OSSymbol * name ); /*! * @function moduleHasLoaded @@ -188,10 +177,15 @@ public: /*! * @function startMatching - * @abstract Starts an IOService matching thread where matching keys and values are provided by the matching dictionary. - * @param matching A dictionary whose keys and values are used for matching personalities in the database. For example, a matching dictionary containing a 'IOProviderClass' key with the value 'IOPCIDevice' will start matching for all personalities which have the key 'IOProviderClass' equal to 'IOPCIDevice'. + * @abstract Restarts IOService matching. + * @param identifier All IOService objects with this bundle indentifier are rematched. */ + bool startMatching( const OSSymbol * identifier ); + + // deprecated, for bin compat +#if defined(__i386__) || defined(__x86_64__) bool startMatching( OSDictionary * matching ); +#endif /*! * @function reset diff --git a/iokit/IOKit/IOCommand.h b/iokit/IOKit/IOCommand.h index 9c3e6c06b..6cfd848c9 100644 --- a/iokit/IOKit/IOCommand.h +++ b/iokit/IOKit/IOCommand.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2019 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -68,7 +68,7 @@ class IOCommand : public OSObject { - OSDeclareDefaultStructors(IOCommand) + OSDeclareDefaultStructors(IOCommand); public: virtual bool init(void) APPLE_KEXT_OVERRIDE; diff --git a/iokit/IOKit/IOCommandGate.h b/iokit/IOKit/IOCommandGate.h index 2fa36e137..2a1d2f287 100644 --- a/iokit/IOKit/IOCommandGate.h +++ b/iokit/IOKit/IOCommandGate.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 1998-2016 Apple Inc. All rights reserved. + * Copyright (c) 1998-2019 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -59,7 +59,7 @@ */ class IOCommandGate : public IOEventSource { - OSDeclareDefaultStructors(IOCommandGate) + OSDeclareDefaultStructors(IOCommandGate); public: /*! @@ -99,7 +99,7 @@ public: /*! @function commandGate * @abstract Factory method to create and initialise an IOCommandGate, See $link init. * @result Returns a pointer to the new command gate if sucessful, 0 otherwise. */ - static IOCommandGate *commandGate(OSObject *owner, Action action = 0); + static IOCommandGate *commandGate(OSObject *owner, Action action = NULL); /*! @function init * @abstract Class initialiser. @@ -112,7 +112,7 @@ public: * must cast the member function to $link IOCommandGate::Action and they will get a * compiler warning. Defaults to zero, see $link IOEventSource::setAction. * @result True if inherited classes initialise successfully. */ - virtual bool init(OSObject *owner, Action action = 0); + virtual bool init(OSObject *owner, Action action = NULL); // Superclass overrides virtual void free() APPLE_KEXT_OVERRIDE; @@ -132,8 +132,8 @@ public: * @param arg3 Parameter for action of command gate, defaults to 0. * @result kIOReturnSuccess if successful. kIOReturnAborted if a disabled command gate is free()ed before being reenabled, kIOReturnNoResources if no action available. */ - virtual IOReturn runCommand(void *arg0 = 0, void *arg1 = 0, - void *arg2 = 0, void *arg3 = 0); + virtual IOReturn runCommand(void *arg0 = NULL, void *arg1 = NULL, + void *arg2 = NULL, void *arg3 = NULL); /*! @function runAction * @abstract Single thread a call to an action with the target work loop. @@ -151,8 +151,8 @@ public: * @result The return value of action if it was called, kIOReturnBadArgument if action is not defined, kIOReturnAborted if a disabled command gate is free()ed before being reenabled. */ virtual IOReturn runAction(Action action, - void *arg0 = 0, void *arg1 = 0, - void *arg2 = 0, void *arg3 = 0); + void *arg0 = NULL, void *arg1 = NULL, + void *arg2 = NULL, void *arg3 = NULL); #ifdef __BLOCKS__ /*! @function runActionBlock @@ -179,8 +179,8 @@ public: * @param arg3 Parameter for action of command gate, defaults to 0. * @result kIOReturnSuccess if successful. kIOReturnNotPermitted if this event source is currently disabled, kIOReturnNoResources if no action available, kIOReturnCannotLock if lock attempt fails. */ - virtual IOReturn attemptCommand(void *arg0 = 0, void *arg1 = 0, - void *arg2 = 0, void *arg3 = 0); + virtual IOReturn attemptCommand(void *arg0 = NULL, void *arg1 = NULL, + void *arg2 = NULL, void *arg3 = NULL); /*! @function attemptAction * @abstract Single thread a call to an action with the target work loop. @@ -197,8 +197,8 @@ public: * */ virtual IOReturn attemptAction(Action action, - void *arg0 = 0, void *arg1 = 0, - void *arg2 = 0, void *arg3 = 0); + void *arg0 = NULL, void *arg1 = NULL, + void *arg2 = NULL, void *arg3 = NULL); /*! @function commandSleep * @abstract Put a thread that is currently holding the command gate to sleep. diff --git a/iokit/IOKit/IOCommandPool.h b/iokit/IOKit/IOCommandPool.h index 356c04ace..ee30bb44e 100644 --- a/iokit/IOKit/IOCommandPool.h +++ b/iokit/IOKit/IOCommandPool.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2016 Apple Inc. All rights reserved. + * Copyright (c) 2000-2019 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -74,7 +74,7 @@ class IOCommandPool : public OSObject { - OSDeclareDefaultStructors(IOCommandPool) + OSDeclareDefaultStructors(IOCommandPool); protected: @@ -199,7 +199,8 @@ protected: * doesn't wish to block until one does become available. * kIOReturnSuccess if the vCommand argument is valid. */ - virtual IOReturn gatedGetCommand(IOCommand **command, bool blockForCommand); + virtual IOReturn gatedGetCommand( + LIBKERN_RETURNS_NOT_RETAINED IOCommand **command, bool blockForCommand); /*! * @function gatedReturnCommand diff --git a/iokit/IOKit/IOCommandQueue.h b/iokit/IOKit/IOCommandQueue.h index 2193062b2..5ad86ec5b 100644 --- a/iokit/IOKit/IOCommandQueue.h +++ b/iokit/IOKit/IOCommandQueue.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 1998-2000 Apple Computer, Inc. All rights reserved. + * Copyright (c) 1998-2019 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -48,7 +48,7 @@ typedef void (*IOCommandQueueAction) class IOCommandQueue : public IOEventSource { - OSDeclareDefaultStructors(IOCommandQueue) + OSDeclareDefaultStructors(IOCommandQueue); protected: static const int kIOCQDefaultSize = 128; @@ -65,17 +65,17 @@ protected: public: static IOCommandQueue *commandQueue(OSObject *inOwner, - IOCommandQueueAction inAction = 0, + IOCommandQueueAction inAction = NULL, int inSize = kIOCQDefaultSize) APPLE_KEXT_DEPRECATED; virtual bool init(OSObject *inOwner, - IOCommandQueueAction inAction = 0, + IOCommandQueueAction inAction = NULL, int inSize = kIOCQDefaultSize) APPLE_KEXT_DEPRECATED; virtual kern_return_t enqueueCommand(bool gotoSleep = true, - void *field0 = 0, void *field1 = 0, - void *field2 = 0, void *field3 = 0) + void *field0 = NULL, void *field1 = NULL, + void *field2 = NULL, void *field3 = NULL) APPLE_KEXT_DEPRECATED; // WARNING: This function can only be safely called from the appropriate @@ -84,8 +84,8 @@ public: // For each entry in the commandQueue call the target/action. // Lockout all new entries to the queue while iterating. // If the input fields are zero then the queue's owner/action will be used. - virtual int performAndFlush(OSObject *target = 0, - IOCommandQueueAction inAction = 0) + virtual int performAndFlush(OSObject *target = NULL, + IOCommandQueueAction inAction = NULL) APPLE_KEXT_DEPRECATED; }; diff --git a/iokit/IOKit/IOConditionLock.h b/iokit/IOKit/IOConditionLock.h index 408a78515..52dd502d0 100644 --- a/iokit/IOKit/IOConditionLock.h +++ b/iokit/IOKit/IOConditionLock.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 1998-2000 Apple Computer, Inc. All rights reserved. + * Copyright (c) 1998-2019 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -40,7 +40,7 @@ class IOConditionLock : public OSObject { - OSDeclareDefaultStructors(IOConditionLock) + OSDeclareDefaultStructors(IOConditionLock); private: IOLock * cond_interlock; // condition var Simple lock diff --git a/iokit/IOKit/IODMACommand.h b/iokit/IOKit/IODMACommand.h index 5628986e2..adaaf4cd0 100644 --- a/iokit/IOKit/IODMACommand.h +++ b/iokit/IOKit/IODMACommand.h @@ -221,8 +221,8 @@ public: MappingOptions mappingOptions = kMapped, UInt64 maxTransferSize = 0, UInt32 alignment = 1, - IOMapper *mapper = 0, - void *refCon = 0); + IOMapper *mapper = NULL, + void *refCon = NULL); /*! @function weakWithSpecification * @abstract Creates and initialises an IODMACommand in one operation if this version of the operating system supports it. @@ -251,8 +251,8 @@ public: MappingOptions mapType = kMapped, UInt64 maxTransferSize = 0, UInt32 alignment = 1, - IOMapper *mapper = 0, - void *refCon = 0) __attribute__((always_inline)); + IOMapper *mapper = NULL, + void *refCon = NULL) __attribute__((always_inline)); static IODMACommand * withSpecification(SegmentFunction outSegFunc, @@ -276,7 +276,7 @@ public: * @discussion Factory function to create and initialise an IODMACommand in one operation. The current command's specification will be duplicated in the new object, but however none of its state will be duplicated. This means that it is safe to clone a command even if it is currently active and running, however you must be certain that the command to be duplicated does have a valid reference for the duration. * @result Returns a new IODMACommand if successfully created and initialised, 0 otherwise. */ - virtual IODMACommand *cloneCommand(void *refCon = 0); + virtual IODMACommand *cloneCommand(void *refCon = NULL); /*! @function initWithSpecification * @abstract Primary initializer for the IODMACommand class. @@ -296,8 +296,8 @@ public: MappingOptions mappingOptions = kMapped, UInt64 maxTransferSize = 0, UInt32 alignment = 1, - IOMapper *mapper = 0, - void *refCon = 0); + IOMapper *mapper = NULL, + void *refCon = NULL); /*! @function setMemoryDescriptor * @abstract Sets and resets the DMACommand's current memory descriptor @@ -481,7 +481,7 @@ public: MappingOptions mappingOptions = kMapped, UInt64 maxTransferSize = 0, UInt32 alignment = 1, - IOMapper *mapper = 0, + IOMapper *mapper = NULL, UInt64 offset = 0, UInt64 length = 0, bool flushCache = true, @@ -515,7 +515,7 @@ public: */ virtual - bool initWithRefCon(void * refCon = 0); + bool initWithRefCon(void * refCon = NULL); virtual bool initWithSpecification(SegmentFunction outSegFunc, @@ -638,7 +638,7 @@ weakWithSpecification(IODMACommand **newCommand, ret = kIOReturnSuccess; } else { self->release(); - self = 0; + self = NULL; ret = kIOReturnError; } diff --git a/iokit/IOKit/IODMAEventSource.h b/iokit/IOKit/IODMAEventSource.h index 88ffeed97..5b26e08b7 100644 --- a/iokit/IOKit/IODMAEventSource.h +++ b/iokit/IOKit/IODMAEventSource.h @@ -53,8 +53,8 @@ protected: public: static IODMAEventSource *dmaEventSource(OSObject *owner, IOService *provider, - Action completion = 0, - Action notification = 0, + Action completion = NULL, + Action notification = NULL, UInt32 dmaIndex = 0); virtual IOReturn startDMACommand(IODMACommand *dmaCommand, IODirection direction, IOByteCount byteCount = 0, IOByteCount byteOffset = 0); @@ -83,8 +83,8 @@ private: virtual bool init(OSObject *owner, IOService *provider, - Action completion = 0, - Action notification = 0, + Action completion = NULL, + Action notification = NULL, UInt32 dmaIndex = 0); virtual bool checkForWork(void) APPLE_KEXT_OVERRIDE; virtual void free(void) APPLE_KEXT_OVERRIDE; diff --git a/iokit/IOKit/IODataQueue.h b/iokit/IOKit/IODataQueue.h index c7de3c5fe..c16d03fa2 100644 --- a/iokit/IOKit/IODataQueue.h +++ b/iokit/IOKit/IODataQueue.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 1998-2000 Apple Computer, Inc. All rights reserved. + * Copyright (c) 1998-2019 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -46,6 +46,14 @@ struct _notifyMsg { mach_msg_header_t h; }; + +#ifdef dequeue +#undef dequeue +#endif +#ifdef enqueue +#undef enqueue +#endif + /*! * @class IODataQueue : public OSObject * @abstract A generic queue designed to pass data from the kernel to a user process. @@ -65,7 +73,7 @@ class __attribute__((deprecated)) IODataQueue: public OSObject class IODataQueue : public OSObject #endif { - OSDeclareDefaultStructors(IODataQueue) + OSDeclareDefaultStructors(IODataQueue); protected: IODataQueueMemory * dataQueue; diff --git a/iokit/IOKit/IODeviceMemory.h b/iokit/IOKit/IODeviceMemory.h index 49d0324b8..dadc043b0 100644 --- a/iokit/IOKit/IODeviceMemory.h +++ b/iokit/IOKit/IODeviceMemory.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 1998-2016 Apple Inc. All rights reserved. + * Copyright (c) 1998-2019 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -44,7 +44,7 @@ class IODeviceMemory : public IOMemoryDescriptor { - OSDeclareDefaultStructors(IODeviceMemory) + OSDeclareDefaultStructors(IODeviceMemory); public: diff --git a/iokit/IOKit/IODeviceTreeSupport.h b/iokit/IOKit/IODeviceTreeSupport.h index 24c79221a..51d146aab 100644 --- a/iokit/IOKit/IODeviceTreeSupport.h +++ b/iokit/IOKit/IODeviceTreeSupport.h @@ -61,7 +61,8 @@ bool IODTMatchNubWithKeys( IORegistryEntry * nub, const char * keys ); bool IODTCompareNubName( const IORegistryEntry * regEntry, - OSString * name, OSString ** matchingName ); + OSString * name, + LIBKERN_RETURNS_RETAINED_ON_NONZERO OSString ** matchingName ); enum { kIODTRecursive = 0x00000001, diff --git a/iokit/IOKit/IOEventSource.h b/iokit/IOKit/IOEventSource.h index 4b48cb475..db000cf38 100644 --- a/iokit/IOKit/IOEventSource.h +++ b/iokit/IOKit/IOEventSource.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 1998-2000, 2009 Apple Inc. All rights reserved. + * Copyright (c) 1998-2019 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -83,7 +83,7 @@ __END_DECLS */ class IOEventSource : public OSObject { - OSDeclareAbstractStructors(IOEventSource) + OSDeclareAbstractStructors(IOEventSource); friend class IOWorkLoop; #if IOKITSTATS friend class IOStatistics; @@ -181,7 +181,7 @@ protected: * @result true if the inherited classes and this instance initialise * successfully. */ - virtual bool init(OSObject *owner, IOEventSource::Action action = 0); + virtual bool init(OSObject *owner, IOEventSource::Action action = NULL); virtual void free( void ) APPLE_KEXT_OVERRIDE; diff --git a/iokit/IOKit/IOFilterInterruptEventSource.h b/iokit/IOKit/IOFilterInterruptEventSource.h index db887e746..263f8ac44 100644 --- a/iokit/IOKit/IOFilterInterruptEventSource.h +++ b/iokit/IOKit/IOFilterInterruptEventSource.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 1998-2016 Apple Inc. All rights reserved. + * Copyright (c) 1998-2019 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -51,7 +51,7 @@ class IOService; */ class IOFilterInterruptEventSource : public IOInterruptEventSource { - OSDeclareDefaultStructors(IOFilterInterruptEventSource) + OSDeclareDefaultStructors(IOFilterInterruptEventSource); public: /*! @@ -73,14 +73,14 @@ public: private: // Hide the superclass initializers virtual bool init(OSObject *inOwner, - IOInterruptEventSource::Action inAction = 0, - IOService *inProvider = 0, + IOInterruptEventSource::Action inAction = NULL, + IOService *inProvider = NULL, int inIntIndex = 0) APPLE_KEXT_OVERRIDE; static IOInterruptEventSource * interruptEventSource(OSObject *inOwner, - IOInterruptEventSource::Action inAction = 0, - IOService *inProvider = 0, + IOInterruptEventSource::Action inAction = NULL, + IOService *inProvider = NULL, int inIntIndex = 0); protected: diff --git a/iokit/IOKit/IOInterruptAccountingPrivate.h b/iokit/IOKit/IOInterruptAccountingPrivate.h index b1dd7e369..ca42aa7f2 100644 --- a/iokit/IOKit/IOInterruptAccountingPrivate.h +++ b/iokit/IOKit/IOInterruptAccountingPrivate.h @@ -134,13 +134,13 @@ static const char * const kInterruptAccountingStatisticNameArray[IA_NUM_INTERRUP * two processors at once (and the interrupt should serve to force out stores), and the second level * handler should be synchonized by the work loop it runs on. */ -#if __x86_64__ || __arm64 +#if __x86_64__ || __arm64__ #define IA_ADD_VALUE(target, value) \ (*(target) += (value)) -#else +#else /* !(__x86_64__ || __arm64__) */ #define IA_ADD_VALUE(target, value) \ (OSAddAtomic64((value), (target))) -#endif +#endif /* !(__x86_64__ || __arm64__) */ /* * TODO: Should this be an OSObject? Or properly pull in its methods as member functions? @@ -160,6 +160,9 @@ struct IOInterruptAccountingData { */ int interruptIndex; + bool enablePrimaryTimestamp; + volatile uint64_t primaryTimestamp __attribute__((aligned(8))); + /* * As long as we are based on the simple reporter, all our channels will be 64 bits. Align the data * to allow for safe atomic updates (we don't want to cross a cache line on any platform, but for some diff --git a/iokit/IOKit/IOInterruptEventSource.h b/iokit/IOKit/IOInterruptEventSource.h index 40e5bc1dc..6313a87b4 100644 --- a/iokit/IOKit/IOInterruptEventSource.h +++ b/iokit/IOKit/IOInterruptEventSource.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 1998-2016 Apple Inc. All rights reserved. + * Copyright (c) 1998-2019 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -60,7 +60,7 @@ struct IOInterruptAccountingData; */ class IOInterruptEventSource : public IOEventSource { - OSDeclareDefaultStructors(IOInterruptEventSource) + OSDeclareDefaultStructors(IOInterruptEventSource); public: /*! @typedef Action @@ -138,7 +138,7 @@ public: static IOInterruptEventSource * interruptEventSource(OSObject *owner, Action action, - IOService *provider = 0, + IOService *provider = NULL, int intIndex = 0); @@ -171,7 +171,7 @@ public: * successfully. */ virtual bool init(OSObject *owner, Action action, - IOService *provider = 0, + IOService *provider = NULL, int intIndex = 0); /*! @function enable @@ -231,6 +231,20 @@ public: * @param abstime Time at which interrupt is expected. */ IOReturn warmCPU(uint64_t abstime); +/*! @function enablePrimaryInterruptTimestamp + * @abstract Enables collection of mach_absolute_time at primary interrupt. + * @discussion Enables collection of mach_absolute_time at primary interrupt. + * @param enable True to enable timestamp. */ + + void enablePrimaryInterruptTimestamp(bool enable); + +/*! @function getPimaryInterruptTimestamp + * @abstract Returns mach_absolute_time timestamp of primary interrupt. + * @discussion Returns mach_absolute_time timestamp of primary interrupt. + * @result Value of the timestamp. Zero if never interrupted, or -1ULL if timestamp collection has not been enabled. */ + + uint64_t getPimaryInterruptTimestamp(); + private: IOReturn registerInterruptHandler(IOService *inProvider, int inIntIndex); void unregisterInterruptHandler(IOService *inProvider, int inIntIndex); diff --git a/iokit/IOKit/IOKitDebug.h b/iokit/IOKit/IOKitDebug.h index 6f114a347..50a811610 100644 --- a/iokit/IOKit/IOKitDebug.h +++ b/iokit/IOKit/IOKitDebug.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 1998-2010 Apple Inc. All rights reserved. + * Copyright (c) 1998-2019 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -40,7 +40,7 @@ class IOKitDiagnostics : public OSObject { - OSDeclareDefaultStructors(IOKitDiagnostics) + OSDeclareDefaultStructors(IOKitDiagnostics); public: static OSObject * diagnostics( void ); @@ -128,8 +128,31 @@ enum { kIOTraceCompatBootArgs = kIOTraceIOService | kIOTracePowerMgmt }; +enum { + kIODKEnable = 0x00000001ULL, + kIODKLogSetup = 0x00000002ULL, + kIODKLogIPC = 0x00000004ULL, + kIODKLogPM = 0x00000008ULL, + kIODKLogMessages = 0x00000010ULL, + + kIODKDisablePM = 0x000000100ULL, + kIODKDisableDextLaunch = 0x00001000ULL, + kIODKDisableDextTag = 0x00002000ULL, + kIODKDisableCDHashChecking = 0x00004000ULL, + kIODKDisableEntitlementChecking = 0x00008000ULL, +}; + +#if XNU_KERNEL_PRIVATE + +#define DKLOG(fmt, args...) { IOLog("DK: " fmt, ## args); } +#define DKS "%s-0x%qx" +#define DKN(s) s->getName(), s->getRegistryEntryID() + +#endif /* XNU_KERNEL_PRIVATE */ + extern SInt64 gIOKitDebug; extern SInt64 gIOKitTrace; +extern SInt64 gIODKDebug; #ifdef __cplusplus extern "C" { diff --git a/iokit/IOKit/IOKitDiagnosticsUserClient.h b/iokit/IOKit/IOKitDiagnosticsUserClient.h index 12976053b..f78c45cc1 100644 --- a/iokit/IOKit/IOKitDiagnosticsUserClient.h +++ b/iokit/IOKit/IOKitDiagnosticsUserClient.h @@ -1,10 +1,12 @@ +/* * Copyright (c) 2019 Apple Inc. All rights reserved. */ + #include #include class IOKitDiagnosticsClient : public IOUserClient { - OSDeclareDefaultStructors(IOKitDiagnosticsClient) + OSDeclareDefaultStructors(IOKitDiagnosticsClient); public: static IOUserClient * withTask(task_t owningTask); diff --git a/iokit/IOKit/IOKitKeys.h b/iokit/IOKit/IOKitKeys.h index 84e5a0afd..e8db76ced 100644 --- a/iokit/IOKit/IOKitKeys.h +++ b/iokit/IOKit/IOKitKeys.h @@ -86,6 +86,35 @@ #define kIOMatchCategoryKey "IOMatchCategory" #define kIODefaultMatchCategoryKey "IODefaultMatchCategory" +#define kIOMatchedPersonalityKey "IOMatchedPersonality" +#define kIORematchPersonalityKey "IORematchPersonality" +#define kIORematchCountKey "IORematchCount" +#define kIODEXTMatchCountKey "IODEXTMatchCount" + +// Entitlements to check against dext process +// Property is an array, one or more of which may match, of: +// an array of entitlement strings, all must be present +// Any array can be a single string. +#define kIOServiceDEXTEntitlementsKey "IOServiceDEXTEntitlements" + +// Entitlement required to open dext connection +#define kIODriverKitEntitlementKey "com.apple.developer.driverkit" + +// Entitlements required to open dext IOUserClient +// Property is an array of strings containing CFBundleIdentifiers of service being opened +#define kIODriverKitUserClientEntitlementsKey "com.apple.developer.driverkit.userclient-access" + +// Other DriverKit entitlements +#define kIODriverKitUSBTransportEntitlementKey "com.apple.developer.driverkit.transport.usb" +#define kIODriverKitHIDTransportEntitlementKey "com.apple.developer.driverkit.transport.hid" +#define kIODriverKitHIDFamilyDeviceEntitlementKey "com.apple.developer.driverkit.family.hid.device" +#define kIODriverKitHIDFamilyEventServiceEntitlementKey "com.apple.developer.driverkit.family.hid.eventservice" +#define kIODriverKitTransportBuiltinEntitlementKey "com.apple.developer.driverkit.builtin" + + +// When possible, defer matching of this driver until kextd has started. +#define kIOMatchDeferKey "IOMatchDefer" + // IOService default user client class, for loadable user clients #define kIOUserClientClassKey "IOUserClientClass" @@ -95,8 +124,16 @@ #define kIOUserClientCrossEndianKey "IOUserClientCrossEndian" #define kIOUserClientCrossEndianCompatibleKey "IOUserClientCrossEndianCompatible" #define kIOUserClientSharedInstanceKey "IOUserClientSharedInstance" +#if KERNEL_PRIVATE +#define kIOUserClientMessageAppSuspendedKey "IOUserClientMessageAppSuspended" +#endif // diagnostic string describing the creating task #define kIOUserClientCreatorKey "IOUserClientCreator" +// the expected cdhash value of the userspace driver executable +#define kIOUserServerCDHashKey "IOUserServerCDHash" + +#define kIOUserUserClientKey "IOUserUserClient" + // IOService notification types #define kIOPublishNotification "IOServicePublish" @@ -160,6 +197,7 @@ #define kIOPlatformUUIDKey "IOPlatformUUID" // (OSString) // IODTNVRAM property keys +#define kIONVRAMBootArgsKey "boot-args" #define kIONVRAMDeletePropertyKey "IONVRAM-DELETE-PROPERTY" #define kIONVRAMSyncNowPropertyKey "IONVRAM-SYNCNOW-PROPERTY" #define kIONVRAMActivateCSRConfigPropertyKey "IONVRAM-ARMCSR-PROPERTY" diff --git a/iokit/IOKit/IOKitServer.h b/iokit/IOKit/IOKitServer.h index 099970fbc..f7cc9eae6 100644 --- a/iokit/IOKit/IOKitServer.h +++ b/iokit/IOKit/IOKitServer.h @@ -71,8 +71,8 @@ enum { kIOCatalogAddDriversNoMatch, kIOCatalogRemoveDrivers, kIOCatalogRemoveDriversNoMatch, - kIOCatalogStartMatching, - kIOCatalogRemoveKernelLinker, + kIOCatalogStartMatching__Removed, + kIOCatalogRemoveKernelLinker__Removed, kIOCatalogKextdActive, kIOCatalogKextdFinishedLaunching, kIOCatalogResetDrivers, @@ -154,11 +154,17 @@ extern kern_return_t iokit_destroy_object_port( ipc_port_t port ); extern mach_port_name_t iokit_make_send_right( task_t task, io_object_t obj, ipc_kobject_type_t type ); +extern mach_port_t ipc_port_make_send(mach_port_t); +extern void ipc_port_release_send(ipc_port_t port); + +extern io_object_t iokit_lookup_io_object(ipc_port_t port, ipc_kobject_type_t type); + extern kern_return_t iokit_mod_send_right( task_t task, mach_port_name_t name, mach_port_delta_t delta ); extern io_object_t iokit_lookup_object_with_port_name(mach_port_name_t name, ipc_kobject_type_t type, task_t task); extern io_object_t iokit_lookup_connect_ref_current_task(mach_port_name_t name); +extern io_object_t iokit_lookup_uext_ref_current_task(mach_port_name_t name); extern void iokit_retain_port( ipc_port_t port ); extern void iokit_release_port( ipc_port_t port ); @@ -169,6 +175,17 @@ extern void iokit_unlock_port(ipc_port_t port); extern kern_return_t iokit_switch_object_port( ipc_port_t port, io_object_t obj, ipc_kobject_type_t type ); +#ifndef MACH_KERNEL_PRIVATE +typedef struct ipc_kmsg * ipc_kmsg_t; +extern ipc_kmsg_t ipc_kmsg_alloc(size_t); +extern void ipc_kmsg_destroy(ipc_kmsg_t); +extern mach_msg_header_t * ipc_kmsg_msg_header(ipc_kmsg_t); +#endif /* MACH_KERNEL_PRIVATE */ + +extern kern_return_t +uext_server(ipc_kmsg_t request, ipc_kmsg_t * preply); + + /* * Functions imported by iokit:IOMemoryDescriptor.cpp */ diff --git a/iokit/IOKit/IOLib.h b/iokit/IOKit/IOLib.h index 989dc1de1..df5036123 100644 --- a/iokit/IOKit/IOLib.h +++ b/iokit/IOKit/IOLib.h @@ -41,6 +41,7 @@ #include #include +#include #include @@ -80,7 +81,8 @@ typedef void (*IOThreadFunc)(void *argument); * @param size Size of the memory requested. * @result Pointer to the allocated memory, or zero on failure. */ -void * IOMalloc(vm_size_t size) __attribute__((alloc_size(1))); +void * IOMalloc(vm_size_t size) __attribute__((alloc_size(1))); +void * IOMallocZero(vm_size_t size) __attribute__((alloc_size(1))); /*! @function IOFree * @abstract Frees memory allocated with IOMalloc. @@ -147,14 +149,40 @@ void * IOMallocPageable(vm_size_t size, vm_size_t alignment) __attribute__((allo void IOFreePageable(void * address, vm_size_t size); /* - * Typed memory allocation macros. Both may block. + * Typed memory allocation macros. All may block. */ -#define IONew(type, number) \ -( ((number) != 0 && ((vm_size_t) ((sizeof(type) * (number) / (number))) != sizeof(type)) /* overflow check 20847256 */ \ - ? 0 \ - : ((type*)IOMalloc(sizeof(type) * (number)))) ) -#define IODelete(ptr, type, number) IOFree( (ptr) , sizeof(type) * (number) ) +#define IONew(type, count) \ +({ \ + size_t __size; \ + (os_mul_overflow(sizeof(type), (count), &__size) \ + ? ((type *) NULL) \ + : ((type *) IOMalloc(__size))); \ +}) + +#define IONewZero(type, count) \ +({ \ + size_t __size; \ + (os_mul_overflow(sizeof(type), (count), &__size) \ + ? ((type *) NULL) \ + : ((type *) IOMallocZero(__size))); \ +}) + +#define IODelete(ptr, type, count) \ +({ \ + size_t __size; \ + if (!os_mul_overflow(sizeof(type), (count), &__size)) { \ + IOFree(ptr, __size); \ + } \ +}) + +#define IOSafeDeleteNULL(ptr, type, count) \ + do { \ + if (NULL != (ptr)) { \ + IODelete((ptr), type, count); \ + (ptr) = NULL; \ + } \ + } while (0) \ ///////////////////////////////////////////////////////////////////////////// // @@ -344,7 +372,7 @@ void Debugger(const char * reason); #if __LP64__ #define IOPanic(reason) panic("%s", reason) #else -void IOPanic(const char *reason) __attribute__((deprecated)); +void IOPanic(const char *reason) __attribute__((deprecated)) __abortlike; #endif #ifdef __cplusplus diff --git a/iokit/IOKit/IOLocks.h b/iokit/IOKit/IOLocks.h index 2c89a177a..88242c940 100644 --- a/iokit/IOKit/IOLocks.h +++ b/iokit/IOKit/IOLocks.h @@ -401,11 +401,18 @@ lck_spin_t * IOSimpleLockGetMachLock( IOSimpleLock * lock); /*! @function IOSimpleLockInit * @abstract Initialize a spin lock. - * @discussion Initialize an embedded spin lock, to the unlocked state. + * @discussion Initialize a non heap allocated spin lock to the unlocked state. Use this function when your lock is, for example, a member variable. You will need to call IOSimpleLockDestroy when you are finished with the lock to avoid lock group refcount leaks. * @param lock Pointer to the lock. */ void IOSimpleLockInit( IOSimpleLock * lock ); +/*! @function IOSimpleLockDestroy + * @abstract De-initializes (destroys) a spin lock initialized with IOSimpleLockInit + * @discussion Destroy / De-initialize a non heap allocated spin lock, releasing any system resources such as lock group refcounts. + * @param lock Pointer to the lock. */ + +void IOSimpleLockDestroy( IOSimpleLock * lock ); + /*! @function IOSimpleLockLock * @abstract Lock a spin lock. * @discussion Lock the spin lock. If the lock is held, spin waiting for its unlock. Spin locks disable preemption, cannot be held across any blocking operation, and should be held for very short periods. When used to synchronize between interrupt context and thread context they should be locked with interrupts disabled - IOSimpleLockLockDisableInterrupt() will do both. Locking the lock recursively from one thread will result in deadlock. diff --git a/iokit/IOKit/IOMemoryCursor.h b/iokit/IOKit/IOMemoryCursor.h index 99f9dc814..3b81e6d3d 100644 --- a/iokit/IOKit/IOMemoryCursor.h +++ b/iokit/IOKit/IOMemoryCursor.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 1998-2000 Apple Computer, Inc. All rights reserved. + * Copyright (c) 1998-2019 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -54,7 +54,7 @@ class IOMemoryDescriptor; */ class IOMemoryCursor : public OSObject { - OSDeclareDefaultStructors(IOMemoryCursor) + OSDeclareDefaultStructors(IOMemoryCursor); public: /*! @@ -148,7 +148,7 @@ public: void * segments, UInt32 maxSegments, UInt32 maxTransferSize = 0, - IOByteCount *transferSize = 0); + IOByteCount *transferSize = NULL); }; /************************ class IONaturalMemoryCursor ************************/ @@ -161,7 +161,7 @@ public: */ class IONaturalMemoryCursor : public IOMemoryCursor { - OSDeclareDefaultStructors(IONaturalMemoryCursor) + OSDeclareDefaultStructors(IONaturalMemoryCursor); public: /*! @function outputSegment @@ -221,7 +221,7 @@ public: PhysicalSegment *segments, UInt32 maxSegments, UInt32 inMaxTransferSize = 0, - IOByteCount *transferSize = 0) + IOByteCount *transferSize = NULL) { return genPhysicalSegments(descriptor, fromPosition, segments, maxSegments, inMaxTransferSize, transferSize); @@ -237,7 +237,7 @@ public: */ class IOBigMemoryCursor : public IOMemoryCursor { - OSDeclareDefaultStructors(IOBigMemoryCursor) + OSDeclareDefaultStructors(IOBigMemoryCursor); public: /*! @function outputSegment @@ -298,7 +298,7 @@ public: PhysicalSegment * segments, UInt32 maxSegments, UInt32 inMaxTransferSize = 0, - IOByteCount * transferSize = 0) + IOByteCount * transferSize = NULL) { return genPhysicalSegments(descriptor, fromPosition, segments, maxSegments, inMaxTransferSize, transferSize); @@ -314,7 +314,7 @@ public: */ class IOLittleMemoryCursor : public IOMemoryCursor { - OSDeclareDefaultStructors(IOLittleMemoryCursor) + OSDeclareDefaultStructors(IOLittleMemoryCursor); public: /*! @function outputSegment @@ -373,7 +373,7 @@ public: PhysicalSegment * segments, UInt32 maxSegments, UInt32 inMaxTransferSize = 0, - IOByteCount * transferSize = 0) + IOByteCount * transferSize = NULL) { return genPhysicalSegments(descriptor, fromPosition, segments, maxSegments, inMaxTransferSize, transferSize); diff --git a/iokit/IOKit/IOMemoryDescriptor.h b/iokit/IOKit/IOMemoryDescriptor.h index b9deeaa4a..0c19f4964 100644 --- a/iokit/IOKit/IOMemoryDescriptor.h +++ b/iokit/IOKit/IOMemoryDescriptor.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 1998-2016 Apple Inc. All rights reserved. + * Copyright (c) 1998-2019 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -33,6 +33,8 @@ #include #include #include +#include +#include #ifdef XNU_KERNEL_PRIVATE #include #endif @@ -123,7 +125,18 @@ enum { #endif }; -#define kIOMapperSystem ((IOMapper *) 0) +#define kIOMapperSystem ((IOMapper *) NULL) + +enum{ + kIOMemoryLedgerTagDefault = VM_LEDGER_TAG_DEFAULT, + kIOmemoryLedgerTagNetwork = VM_LEDGER_TAG_NETWORK, + kIOMemoryLedgerTagMedia = VM_LEDGER_TAG_MEDIA, + kIOMemoryLedgerTagGraphics = VM_LEDGER_TAG_GRAPHICS, + kIOMemoryLedgerTagNeural = VM_LEDGER_TAG_NEURAL, +}; +enum{ + kIOMemoryLedgerFlagNoFootprint = VM_LEDGER_FLAG_NO_FOOTPRINT, +}; enum{ kIOMemoryPurgeableKeepCurrent = 1, @@ -257,7 +270,7 @@ class IOMemoryDescriptor : public OSObject friend class IOMemoryMap; friend class IOMultiMemoryDescriptor; - OSDeclareDefaultStructors(IOMemoryDescriptor); + OSDeclareDefaultStructorsWithDispatch(IOMemoryDescriptor); protected: @@ -334,6 +347,17 @@ public: virtual IOReturn setPurgeable( IOOptionBits newState, IOOptionBits * oldState ); +/*! @function setOwnership + * @abstract Control the ownership of a memory descriptors memory. + * @discussion IOBufferMemoryDescriptor are owned by a specific task. The ownership of such a buffer may be controlled with setOwnership(). + * @param newOwner - the task to be the new owner of the memory. + * @param newLedgerTag - the ledger this memory should be accounted in. + * @param newLedgerOptions - accounting options + * @result An IOReturn code. */ + + IOReturn setOwnership( task_t newOwner, + int newLedgerTag, + IOOptionBits newLedgerOptions ); /*! @function getPageCounts * @abstract Retrieve the number of resident and/or dirty pages encompassed by an IOMemoryDescriptor. @@ -381,8 +405,12 @@ public: virtual uint64_t getPreparationID( void ); void setPreparationID( void ); + void setVMTags(uint32_t kernelTag, uint32_t userTag); + uint32_t getVMTag(vm_map_t map); + #ifdef XNU_KERNEL_PRIVATE IOMemoryDescriptorReserved * getKernelReserved( void ); + void cleanKernelReserved(IOMemoryDescriptorReserved * reserved); IOReturn dmaMap( IOMapper * mapper, IODMACommand * command, @@ -401,9 +429,6 @@ public: IOMapper * mapper, IODMACommand * command, uint64_t mapLength); - - void setVMTags(vm_tag_t kernelTag, vm_tag_t userTag); - vm_tag_t getVMTag(vm_map_t map); #endif private: @@ -435,7 +460,7 @@ private: OSMetaClassDeclareReservedUnused(IOMemoryDescriptor, 15); protected: - virtual void free() APPLE_KEXT_OVERRIDE; + virtual void free(void) APPLE_KEXT_OVERRIDE; public: static void initialize( void ); @@ -796,7 +821,7 @@ protected: class IOMemoryMap : public OSObject { - OSDeclareDefaultStructors(IOMemoryMap) + OSDeclareDefaultStructorsWithDispatch(IOMemoryMap); #ifdef XNU_KERNEL_PRIVATE public: IOMemoryDescriptor * fMemory; @@ -817,8 +842,8 @@ public: #endif /* XNU_KERNEL_PRIVATE */ protected: - virtual void taggedRelease(const void *tag = 0) const APPLE_KEXT_OVERRIDE; - virtual void free() APPLE_KEXT_OVERRIDE; + virtual void taggedRelease(const void *tag = NULL) const APPLE_KEXT_OVERRIDE; + virtual void free(void) APPLE_KEXT_OVERRIDE; public: /*! @function getVirtualAddress @@ -826,7 +851,7 @@ public: * @discussion This method returns the virtual address of the first byte in the mapping. Since the IOVirtualAddress is only 32bit in 32bit kernels, the getAddress() method should be used for compatibility with 64bit task mappings. * @result A virtual address. */ - virtual IOVirtualAddress getVirtualAddress(); + virtual IOVirtualAddress getVirtualAddress(void); /*! @function getPhysicalSegment * @abstract Break a mapping into its physically contiguous segments. @@ -849,14 +874,14 @@ public: * @discussion This method returns the physical address of the first byte in the mapping. It is most useful on mappings known to be physically contiguous. * @result A physical address. */ - IOPhysicalAddress getPhysicalAddress(); + IOPhysicalAddress getPhysicalAddress(void); /*! @function getLength * @abstract Accessor to the length of the mapping. * @discussion This method returns the length of the mapping. * @result A byte count. */ - virtual IOByteCount getLength(); + virtual IOByteCount getLength(void); /*! @function getAddressTask * @abstract Accessor to the task of the mapping. @@ -1055,6 +1080,11 @@ public: IOMemoryReference * ref, IOOptionBits newState, IOOptionBits * oldState); + static IOReturn memoryReferenceSetOwnership( + IOMemoryReference * ref, + task_t newOwner, + int newLedgerTag, + IOOptionBits newLedgerOptions); static IOReturn memoryReferenceGetPageCounts( IOMemoryReference * ref, IOByteCount * residentPageCount, @@ -1134,6 +1164,10 @@ public: virtual IOReturn setPurgeable( IOOptionBits newState, IOOptionBits * oldState ) APPLE_KEXT_OVERRIDE; + IOReturn setOwnership( task_t newOwner, + int newLedgerTag, + IOOptionBits newLedgerOptions ); + virtual addr64_t getPhysicalSegment( IOByteCount offset, IOByteCount * length, #ifdef __LP64__ @@ -1185,4 +1219,6 @@ IOMemoryMap::getSize() /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ +extern boolean_t iokit_iomd_setownership_enabled; + #endif /* !_IOMEMORYDESCRIPTOR_H */ diff --git a/iokit/IOKit/IOMessage.h b/iokit/IOKit/IOMessage.h index cdd53930a..1560b3305 100644 --- a/iokit/IOKit/IOMessage.h +++ b/iokit/IOKit/IOMessage.h @@ -71,6 +71,12 @@ typedef UInt32 IOMessage; #define kIOMessageSystemCapabilityChange iokit_common_msg(0x340) #define kIOMessageDeviceSignaledWakeup iokit_common_msg(0x350) +#ifdef KERNEL_PRIVATE +// sent to IOUserClients with the property kIOUserClientMessageAppSuspendedKey +// when their task's app suspend state changes; +// use task_is_app_suspended() to retrieve the owning task's current state +#define kIOMessageTaskAppSuspendedChange iokit_common_msg(0x800) +#endif /*! * @defined kIOMessageDeviceWillPowerOff diff --git a/iokit/IOKit/IOMultiMemoryDescriptor.h b/iokit/IOKit/IOMultiMemoryDescriptor.h index 995207421..f7ede4a1b 100644 --- a/iokit/IOKit/IOMultiMemoryDescriptor.h +++ b/iokit/IOKit/IOMultiMemoryDescriptor.h @@ -108,6 +108,8 @@ public: virtual IOReturn setPurgeable(IOOptionBits newState, IOOptionBits * oldState) APPLE_KEXT_OVERRIDE; + IOReturn setOwnership(task_t newOwner, int newLedgerTag, IOOptionBits newOptions); + /*! @function getPageCounts * @abstract Retrieve the number of resident and/or dirty pages encompassed by an IOMemoryDescriptor. * @discussion This method returns the number of resident and/or dirty pages encompassed by an IOMemoryDescriptor. diff --git a/iokit/IOKit/IONVRAM.h b/iokit/IOKit/IONVRAM.h index a4da3d4fa..ef2e533e8 100644 --- a/iokit/IOKit/IONVRAM.h +++ b/iokit/IOKit/IONVRAM.h @@ -107,8 +107,8 @@ private: UInt32 *propType, UInt32 *propOffset); virtual bool convertPropToObject(UInt8 *propName, UInt32 propNameLength, UInt8 *propData, UInt32 propDataLength, - const OSSymbol **propSymbol, - OSObject **propObject); + LIBKERN_RETURNS_RETAINED const OSSymbol **propSymbol, + LIBKERN_RETURNS_RETAINED OSObject **propObject); virtual bool convertObjectToProp(UInt8 *buffer, UInt32 *length, const OSSymbol *propSymbol, OSObject *propObject); virtual UInt16 generateOWChecksum(UInt8 *buffer); @@ -137,6 +137,8 @@ private: void initNVRAMImage(void); void initProxyData(void); IOReturn syncVariables(void); + IOReturn setPropertyInternal(const OSSymbol *aKey, OSObject *anObject); + public: virtual bool init(IORegistryEntry *old, const IORegistryPlane *plane) APPLE_KEXT_OVERRIDE; diff --git a/iokit/IOKit/IONotifier.h b/iokit/IOKit/IONotifier.h index 0ee138617..3324fc7f5 100644 --- a/iokit/IOKit/IONotifier.h +++ b/iokit/IOKit/IONotifier.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 1998-2000 Apple Computer, Inc. All rights reserved. + * Copyright (c) 1998-2019 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -43,7 +43,7 @@ class IONotifier : public OSObject { - OSDeclareAbstractStructors(IONotifier) + OSDeclareAbstractStructors(IONotifier); public: diff --git a/iokit/IOKit/IOPlatformExpert.h b/iokit/IOKit/IOPlatformExpert.h index 7f47bc62c..8e4d78a94 100644 --- a/iokit/IOKit/IOPlatformExpert.h +++ b/iokit/IOKit/IOPlatformExpert.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 1998-2000 Apple Computer, Inc. All rights reserved. + * Copyright (c) 1998-2019 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -68,7 +68,9 @@ enum { kPEPagingOff, kPEPanicBegin, kPEPanicEnd, - kPEPanicDiskShutdown + kPEPanicDiskShutdown, + kPEPanicRestartCPUNoPanicEndCallouts, + kPEPanicRestartCPUNoCallouts }; extern int (*PE_halt_restart)(unsigned int type); extern int PEHaltRestart(unsigned int type); @@ -79,8 +81,14 @@ enum { kIOSystemShutdownNotificationStageRootUnmount = 1, }; extern void IOSystemShutdownNotification(int stage); + +extern uint32_t gEnforceQuiesceSafety; #endif /* XNU_KERNEL_PRIVATE */ +#ifdef KERNEL_PRIVATE +extern boolean_t IOPMRootDomainGetWillShutdown(void); +#endif /* KERNEL_PRIVATE */ + // Save the Panic Info. Returns the number of bytes saved. extern UInt32 PESavePanicInfo(UInt8 *buffer, UInt32 length); extern void PESavePanicInfoAction(void *buffer, UInt32 offset, UInt32 length); @@ -118,12 +126,70 @@ extern coprocessor_type_t PEGetCoprocessorVersion( void ); extern OSSymbol * gPlatformInterruptControllerName; -extern const OSSymbol * gIOPlatformSleepActionKey; -extern const OSSymbol * gIOPlatformWakeActionKey; -extern const OSSymbol * gIOPlatformQuiesceActionKey; -extern const OSSymbol * gIOPlatformActiveActionKey; -extern const OSSymbol * gIOPlatformHaltRestartActionKey; -extern const OSSymbol * gIOPlatformPanicActionKey; +/* + * IOPlatformSleepAction + * + * Sleep is called after power management has finished all of the power plane + * driver power management notifications and state transitions and has + * committed to sleep, but before the other CPUs are powered off. + * The scheduler is still active. + */ +extern const OSSymbol *gIOPlatformSleepActionKey; + +/* + * IOPlatformWakeAction + * + * Wake is called with the scheduler enabled, but before + * powering on other CPUs, so try to minimize work done in this path to speed + * up wake time. + */ +extern const OSSymbol *gIOPlatformWakeActionKey; + +/* + * IOPlatformQuiesceAction + * + * Quiesce is called after all CPUs are off, scheduling is disabled, + * and the boot CPU is about to pull the plug. + * Mutexes and blocking are disallowed in this context and will panic. + * Do not pass this action to super() (incl. IOService, IOPlatformExpert) + */ +extern const OSSymbol *gIOPlatformQuiesceActionKey; + +/* + * IOPlatformActiveAction + * + * Active is called very early in the wake path before enabling the scheduler + * on the boot CPU. + * Mutexes and blocking are disallowed in this context and will panic. + * Do not pass this action to super() (incl. IOService, IOPlatformExpert) + */ +extern const OSSymbol *gIOPlatformActiveActionKey; + +/* + * IOPlatformHaltRestartAction + * + * Halt/Restart is called after the kernel finishes shutting down the + * system and is ready to power off or reboot. + * + * It is not guaranteed to be called in non-graceful shutdown scenarios. + */ +extern const OSSymbol *gIOPlatformHaltRestartActionKey; + +/* + * IOPlatformPanicAction + * + * Panic is called when the system is panicking before it records a core file + * (if it is configured to do so) + * + * It can be called at any time, in any context, in any state. Don't depend + * on anything being powered on in a useful state. + * + * Mutexes and blocking are disallowed in this context and will fail. + * + * If you hang or panic again in this callout, the panic log may not be recorded, + * leading to the loss of field reports about customer issues. + */ +extern const OSSymbol *gIOPlatformPanicActionKey; class IORangeAllocator; class IONVRAMController; @@ -168,7 +234,7 @@ public: virtual IOService * createNub( OSDictionary * from ); virtual bool compareNubName( const IOService * nub, OSString * name, - OSString ** matched = 0 ) const; + OSString ** matched = NULL ) const; virtual IOReturn getNubResources( IOService * nub ); virtual long getBootROMType(void); @@ -261,7 +327,7 @@ public: virtual bool createNubs( IOService * parent, LIBKERN_CONSUMED OSIterator * iter ); virtual bool compareNubName( const IOService * nub, OSString * name, - OSString ** matched = 0 ) const APPLE_KEXT_OVERRIDE; + OSString ** matched = NULL ) const APPLE_KEXT_OVERRIDE; virtual IOReturn getNubResources( IOService * nub ) APPLE_KEXT_OVERRIDE; @@ -318,7 +384,7 @@ public: class IOPlatformExpertDevice : public IOService { - OSDeclareDefaultStructors(IOPlatformExpertDevice) + OSDeclareDefaultStructors(IOPlatformExpertDevice); private: IOWorkLoop *workLoop; @@ -329,7 +395,7 @@ private: public: virtual bool initWithArgs( void * p1, void * p2, void * p3, void *p4 ); - virtual bool compareName( OSString * name, OSString ** matched = 0 ) const APPLE_KEXT_OVERRIDE; + virtual bool compareName( OSString * name, OSString ** matched = NULL ) const APPLE_KEXT_OVERRIDE; virtual IOWorkLoop *getWorkLoop() const APPLE_KEXT_OVERRIDE; virtual IOReturn setProperties( OSObject * properties ) APPLE_KEXT_OVERRIDE; @@ -353,13 +419,13 @@ public: class IOPlatformDevice : public IOService { - OSDeclareDefaultStructors(IOPlatformDevice) + OSDeclareDefaultStructors(IOPlatformDevice); struct ExpansionData { }; ExpansionData *iopd_reserved; public: - virtual bool compareName( OSString * name, OSString ** matched = 0 ) const APPLE_KEXT_OVERRIDE; + virtual bool compareName( OSString * name, OSString ** matched = NULL ) const APPLE_KEXT_OVERRIDE; virtual IOService * matchLocation( IOService * client ) APPLE_KEXT_OVERRIDE; virtual IOReturn getResources( void ) APPLE_KEXT_OVERRIDE; diff --git a/iokit/IOKit/IOPolledInterface.h b/iokit/IOKit/IOPolledInterface.h index 498814c82..94e41c2c6 100644 --- a/iokit/IOKit/IOPolledInterface.h +++ b/iokit/IOKit/IOPolledInterface.h @@ -180,7 +180,7 @@ IOReturn IOPolledFileOpen(const char * filename, uint64_t setFileSize, uint64_t fsFreeSize, void * write_file_addr, size_t write_file_len, IOPolledFileIOVars ** fileVars, - OSData ** imagePath, + LIBKERN_RETURNS_RETAINED OSData ** imagePath, uint8_t * volumeCryptKey, size_t * keySize); IOReturn IOPolledFileClose(IOPolledFileIOVars ** pVars, diff --git a/iokit/IOKit/IORPC.h b/iokit/IOKit/IORPC.h new file mode 100644 index 000000000..0ae141595 --- /dev/null +++ b/iokit/IOKit/IORPC.h @@ -0,0 +1,253 @@ +/* + * Copyright (c) 2018 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + + +#ifndef _IORPC_H +#define _IORPC_H + +#include + +#ifndef PLATFORM_DriverKit + +#include + +#else /* !PLATFORM_DriverKit */ + +#ifndef _MACH_MESSAGE_H_ +#define _MACH_MESSAGE_H_ + +#define MACH_MSG_TYPE_MOVE_RECEIVE 16 /* Must hold receive right */ +#define MACH_MSG_TYPE_MOVE_SEND 17 /* Must hold send right(s) */ +#define MACH_MSG_TYPE_MOVE_SEND_ONCE 18 /* Must hold sendonce right */ +#define MACH_MSG_TYPE_COPY_SEND 19 /* Must hold send right(s) */ +#define MACH_MSG_TYPE_MAKE_SEND 20 /* Must hold receive right */ +#define MACH_MSG_TYPE_MAKE_SEND_ONCE 21 /* Must hold receive right */ +#define MACH_MSG_TYPE_COPY_RECEIVE 22 /* NOT VALID */ +#define MACH_MSG_TYPE_DISPOSE_RECEIVE 24 /* must hold receive right */ +#define MACH_MSG_TYPE_DISPOSE_SEND 25 /* must hold send right(s) */ +#define MACH_MSG_TYPE_DISPOSE_SEND_ONCE 26 /* must hold sendonce right */ + +#define MACH_MSG_TYPE_PORT_NONE 0 + +#define MACH_MSG_PORT_DESCRIPTOR 0 +#define MACH_MSG_OOL_DESCRIPTOR 1 + +typedef unsigned int mach_msg_copy_options_t; + +#define MACH_MSG_PHYSICAL_COPY 0 +#define MACH_MSG_VIRTUAL_COPY 1 +#define MACH_MSG_ALLOCATE 2 + +typedef uint32_t natural_t; +typedef int32_t integer_t; + +typedef unsigned int mach_msg_type_name_t; +typedef unsigned int mach_msg_descriptor_type_t; + +#if KERNEL +typedef void * mach_port_t; +#define MACH_PORT_NULL NULL +#else /* !KERNEL */ +typedef natural_t mach_port_t; +#define MACH_PORT_NULL 0 +#endif /* !KERNEL */ + +typedef natural_t mach_port_name_t; + +typedef unsigned int mach_msg_bits_t; +typedef natural_t mach_msg_size_t; +typedef integer_t mach_msg_id_t; + +#pragma pack(push, 4) + +typedef struct{ + mach_msg_bits_t msgh_bits; + mach_msg_size_t msgh_size; + mach_port_t msgh_remote_port; + mach_port_t msgh_local_port; + mach_port_name_t msgh_voucher_port; + mach_msg_id_t msgh_id; +} mach_msg_header_t; + +typedef struct{ + mach_msg_size_t msgh_descriptor_count; +} mach_msg_body_t; + +typedef struct{ + mach_port_t name; +#if !(defined(KERNEL) && defined(__LP64__)) +// Pad to 8 bytes everywhere except the K64 kernel where mach_port_t is 8 bytes + mach_msg_size_t pad1; +#endif + unsigned int pad2 : 16; + mach_msg_type_name_t disposition : 8; + mach_msg_descriptor_type_t type : 8; +#if defined(KERNEL) + uint32_t pad_end; +#endif +} mach_msg_port_descriptor_t; + +typedef struct{ + void * address; +#if !defined(__LP64__) + mach_msg_size_t size; +#endif + int deallocate: 8; + mach_msg_copy_options_t copy: 8; + unsigned int pad1: 8; + mach_msg_descriptor_type_t type: 8; +#if defined(__LP64__) + mach_msg_size_t size; +#endif +#if defined(KERNEL) && !defined(__LP64__) + uint32_t pad_end; +#endif +} mach_msg_ool_descriptor_t; + +typedef struct{ + unsigned int val[80 / sizeof(int)]; +} mach_msg_max_trailer_t; + +#pragma pack(pop) + +#endif /* _MACH_MESSAGE_H_ */ + +#endif /* PLATFORM_DriverKit */ + +#if KERNEL +class IOUserServer; +#endif /* KERNEL */ + +typedef uint64_t OSObjectRef; + +enum { + kIORPCVersion190615 = (mach_msg_id_t) 0x4da2b68c, + kIORPCVersion190615Reply = (mach_msg_id_t) 0x4da2b68d, + +#if DRIVERKIT_PRIVATE + kIORPCVersion190501 = (mach_msg_id_t) 0xfe316a7a, + kIORPCVersion190501Reply = (mach_msg_id_t) 0xfe316a7b, + + kIORPCVersionCurrent = kIORPCVersion190615, + kIORPCVersionCurrentReply = kIORPCVersion190615Reply +#endif /* DRIVERKIT_PRIVATE */ +}; + +enum{ + kIORPCMessageRemote = 0x00000001, + kIORPCMessageLocalHost = 0x00000002, + kIORPCMessageKernel = 0x00000004, + kIORPCMessageOneway = 0x00000008, + kIORPCMessageObjectRefs = 0x00000010, + kIORPCMessageOnqueue = 0x00000020, + kIORPCMessageError = 0x00000040, + kIORPCMessageSimpleReply = 0x00000080, +}; + +enum{ + kIORPCMessageIDKernel = (1ULL << 63), +}; + +struct IORPCMessageMach { + mach_msg_header_t msgh; + mach_msg_body_t msgh_body; + mach_msg_port_descriptor_t objects[0]; +}; +typedef struct IORPCMessageMach IORPCMessageMach; + +struct IORPCMessage { + uint64_t msgid; + uint64_t flags; + uint64_t objectRefs; + OSObjectRef objects[0]; +}; +typedef struct IORPCMessage IORPCMessage; + +extern "C" IORPCMessage * +IORPCMessageFromMach(IORPCMessageMach * msg, bool reply); + +struct IORPCMessageErrorReturnContent { + IORPCMessage hdr; + kern_return_t result; + uint32_t pad; +}; + +#pragma pack(4) +struct IORPCMessageErrorReturn { + IORPCMessageMach mach; + IORPCMessageErrorReturnContent content; +}; +#pragma pack() + + +class OSMetaClassBase; +struct IORPC; +typedef kern_return_t (*OSDispatchMethod)(OSMetaClassBase * self, const IORPC rpc); + +struct IORPC { + IORPCMessageMach * message; + IORPCMessageMach * reply; + uint32_t sendSize; + uint32_t replySize; +}; +typedef struct IORPC IORPC; + +enum { + kOSClassCanRemote = 0x00000001, +}; + +struct OSClassDescription { + uint32_t descriptionSize; + + char name[96]; + char superName[96]; + + uint32_t methodOptionsSize; + uint32_t methodOptionsOffset; + uint32_t metaMethodOptionsSize; + uint32_t metaMethodOptionsOffset; + uint32_t queueNamesSize; + uint32_t queueNamesOffset; + uint32_t methodNamesSize; + uint32_t methodNamesOffset; + uint32_t metaMethodNamesSize; + uint32_t metaMethodNamesOffset; + + uint64_t flags; + + uint64_t resv1[8]; + + uint64_t methodOptions[0]; + uint64_t metaMethodOptions[0]; + + char dispatchNames[0]; + char methodNames[0]; + char metaMethodNames[0]; +}; + +#endif /* _IORPC_H */ diff --git a/iokit/IOKit/IORangeAllocator.h b/iokit/IOKit/IORangeAllocator.h index 2520c5bd8..e7b0472dc 100644 --- a/iokit/IOKit/IORangeAllocator.h +++ b/iokit/IOKit/IORangeAllocator.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 1998-2000 Apple Computer, Inc. All rights reserved. + * Copyright (c) 1998-2019 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -48,7 +48,7 @@ typedef IOByteCount IORangeScalar; */ class IORangeAllocator : public OSObject { - OSDeclareDefaultStructors(IORangeAllocator) + OSDeclareDefaultStructors(IORangeAllocator); protected: UInt32 numElements; diff --git a/iokit/IOKit/IORegistryEntry.h b/iokit/IOKit/IORegistryEntry.h index 0812c9579..c9e059654 100644 --- a/iokit/IOKit/IORegistryEntry.h +++ b/iokit/IOKit/IORegistryEntry.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 1998-2016 Apple Inc. All rights reserved. + * Copyright (c) 1998-2019 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -73,7 +73,7 @@ class IORegistryEntry : public OSObject { friend class IORegistryIterator; - OSDeclareDefaultStructors(IORegistryEntry) + OSDeclareDefaultStructors(IORegistryEntry); protected: /*! @struct ExpansionData @@ -181,8 +181,8 @@ public: * @result Returns the value of the Action callout. */ virtual IOReturn runPropertyAction(Action action, OSObject *target, - void *arg0 = 0, void *arg1 = 0, - void *arg2 = 0, void *arg3 = 0); + void *arg0 = NULL, void *arg1 = NULL, + void *arg2 = NULL, void *arg3 = NULL); private: #if __LP64__ @@ -260,7 +260,7 @@ public: * @param dictionary A dictionary that will become the registry entry's property table (retaining it), or zero which will cause an empty property table to be created. * @result true on success, or false on a resource failure. */ - virtual bool init( OSDictionary * dictionary = 0 ); + virtual bool init( OSDictionary * dictionary = NULL ); /*! @function free * @abstract Standard free method for all IORegistryEntry subclasses. @@ -576,7 +576,7 @@ public: * @param plane The plane object, 0 indicates any plane. * @result If the entry has a parent in the given plane or if plane = 0 then if entry has any parent; return true, otherwise false. */ - virtual bool inPlane( const IORegistryPlane * plane = 0) const; + virtual bool inPlane( const IORegistryPlane * plane = NULL) const; /*! @function getDepth * @abstract Counts the maximum number of entries between an entry and the registry root, in a plane. @@ -648,7 +648,7 @@ public: * @param plane The plane object, or zero for the global name. * @result A C-string name, valid while the entry is retained. */ - virtual const char * getName( const IORegistryPlane * plane = 0 ) const; + virtual const char * getName( const IORegistryPlane * plane = NULL ) const; /*! @function copyName * @abstract Returns the name assigned to the registry entry as an OSSymbol. @@ -657,7 +657,7 @@ public: * @result A reference to an OSSymbol for the name, which should be released by the caller. */ virtual const OSSymbol * copyName( - const IORegistryPlane * plane = 0 ) const; + const IORegistryPlane * plane = NULL ) const; /*! @function compareNames * @abstract Compares the name of the entry with one or more names, and optionally returns the matching name. @@ -666,7 +666,7 @@ public: * @param matched If the caller wants the successfully matched name returned, pass a non-zero pointer for the matched parameter and an OSString will be returned here. It should be released by the caller. * @result True if one of the names compared true with the entry's global name. */ - virtual bool compareNames( OSObject * name, OSString ** matched = 0 ) const; + virtual bool compareNames( OSObject * name, OSString ** matched = NULL ) const; /*! @function compareName * @abstract Compares the name of the entry with one name, and optionally returns the matching name. @@ -675,7 +675,7 @@ public: * @param matched If the caller wants the successfully matched name returned, pass a non-zero pointer for the matched parameter and an OSString will be returned here. It should be released by the caller. Generally, this will be the same as the name parameter, but may not be if wildcards are used. * @result True if the name compared true with the entry's global name. */ - virtual bool compareName( OSString * name, OSString ** matched = 0 ) const; + virtual bool compareName( OSString * name, OSString ** matched = NULL ) const; /*! @function setName * @abstract Sets a name for the registry entry, in a particular plane, or globally. @@ -684,7 +684,7 @@ public: * @param plane The plane object, or zero to set the global name. */ virtual void setName( const OSSymbol * name, - const IORegistryPlane * plane = 0 ); + const IORegistryPlane * plane = NULL ); /*! @function setName * @abstract Sets a name for the registry entry, in a particular plane, or globally. @@ -693,7 +693,7 @@ public: * @param plane The plane object, or zero to set the global name. */ virtual void setName( const char * name, - const IORegistryPlane * plane = 0 ); + const IORegistryPlane * plane = NULL ); /*! @function getLocation * @abstract Returns the location string assigned to the registry entry as a C-string. @@ -701,7 +701,7 @@ public: * @param plane The plane object, or zero for the global name. * @result A C-string location string, valid while the entry is retained, or zero. */ - virtual const char * getLocation( const IORegistryPlane * plane = 0 ) const; + virtual const char * getLocation( const IORegistryPlane * plane = NULL ) const; /*! @function copyLocation * @abstract Returns the location string assigned to the registry entry as an OSSymbol. @@ -710,7 +710,7 @@ public: * @result A reference to an OSSymbol for the location if one exists, which should be released by the caller, or zero. */ virtual const OSSymbol * copyLocation( - const IORegistryPlane * plane = 0 ) const; + const IORegistryPlane * plane = NULL ) const; /*! @function setLocation * @abstract Sets a location string for the registry entry, in a particular plane, or globally. @@ -719,9 +719,9 @@ public: * @param plane The plane object, or zero to set the global location string. */ virtual void setLocation( const OSSymbol * location, - const IORegistryPlane * plane = 0 ); + const IORegistryPlane * plane = NULL ); virtual void setLocation( const char * location, - const IORegistryPlane * plane = 0 ); + const IORegistryPlane * plane = NULL ); /*! @function getPath * @abstract Create a path for a registry entry. @@ -756,10 +756,10 @@ public: * @result A retained registry entry is returned on success, or zero on failure. The caller should release the entry. */ static IORegistryEntry * fromPath( const char * path, - const IORegistryPlane * plane = 0, - char * residualPath = 0, - int * residualLength = 0, - IORegistryEntry * fromEntry = 0 ); + const IORegistryPlane * plane = NULL, + char * residualPath = NULL, + int * residualLength = NULL, + IORegistryEntry * fromEntry = NULL ); /*! @function fromPath * @abstract Looks up a registry entry by relative path. @@ -771,9 +771,9 @@ public: * @result See IORegistryEntry::fromPath. */ virtual IORegistryEntry * childFromPath( const char * path, - const IORegistryPlane * plane = 0, - char * residualPath = 0, - int * residualLength = 0 ); + const IORegistryPlane * plane = NULL, + char * residualPath = NULL, + int * residualLength = NULL ); /*! @function dealiasPath * @abstract Strips any aliases from the head of path and returns the full path. @@ -815,12 +815,14 @@ private: #ifdef XNU_KERNEL_PRIVATE SInt32 getRegistryEntryGenerationCount( void ) const; + void setName(const OSString * name, + const IORegistryPlane * plane = NULL); #endif private: inline bool arrayMember( OSArray * set, const IORegistryEntry * member, - unsigned int * index = 0 ) const; + unsigned int * index = NULL ) const; bool makeLink( IORegistryEntry * to, unsigned int relation, @@ -842,9 +844,9 @@ private: const IORegistryPlane * plane ); APPLE_KEXT_COMPATIBILITY_VIRTUAL - LIBKERN_RETURNS_NOT_RETAINED - const OSSymbol * hasAlias( const IORegistryPlane * plane, - char * opath = 0, int * length = 0 ) const; + LIBKERN_RETURNS_NOT_RETAINED const OSSymbol * hasAlias( + const IORegistryPlane * plane, + char * opath = NULL, int * length = NULL ) const; APPLE_KEXT_COMPATIBILITY_VIRTUAL const char * matchPathLocation( const char * cmp, @@ -859,7 +861,7 @@ private: class IORegistryIterator : public OSIterator { - OSDeclareAbstractStructors(IORegistryIterator) + OSDeclareAbstractStructors(IORegistryIterator); private: struct IORegCursor { diff --git a/iokit/IOKit/IOReturn.h b/iokit/IOKit/IOReturn.h index 94347a72d..d93203222 100644 --- a/iokit/IOKit/IOReturn.h +++ b/iokit/IOKit/IOReturn.h @@ -40,8 +40,41 @@ extern "C" { #endif +#ifndef PLATFORM_DriverKit + #include +#else /* PLATFORM_DriverKit */ + +typedef int kern_return_t; + +#define KERN_SUCCESS 0 + +/* + * error number layout as follows: + * + * hi lo + * | system(6) | subsystem(12) | code(14) | + */ + +#define err_none (kern_return_t)0 +#define ERR_SUCCESS (kern_return_t)0 + +#define err_system(x) ((signed)((((unsigned)(x))&0x3f)<<26)) +#define err_sub(x) (((x)&0xfff)<<14) + +#define err_get_system(err) (((err)>>26)&0x3f) +#define err_get_sub(err) (((err)>>14)&0xfff) +#define err_get_code(err) ((err)&0x3fff) + +#define err_max_system 0x3f + +#define system_emask (err_system(err_max_system)) +#define sub_emask (err_sub(0xfff)) +#define code_emask (0x3fff) + +#endif /* PLATFORM_DriverKit */ + typedef kern_return_t IOReturn; #ifndef sys_iokit @@ -73,6 +106,10 @@ typedef kern_return_t IOReturn; #ifdef PRIVATE #define sub_iokit_smc err_sub(32) #endif +#define sub_iokit_apfs err_sub(33) +#define sub_iokit_acpiec err_sub(34) +#define sub_iokit_timesync_avb err_sub(35) + #define sub_iokit_platform err_sub(0x2A) #define sub_iokit_audio_video err_sub(0x45) #define sub_iokit_cec err_sub(0x46) diff --git a/iokit/IOKit/IOService.h b/iokit/IOKit/IOService.h index 4a6da1574..fdfded661 100644 --- a/iokit/IOKit/IOService.h +++ b/iokit/IOKit/IOService.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 1998-2016 Apple Inc. All rights reserved. + * Copyright (c) 1998-2019 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -53,11 +53,13 @@ #include #include #include +#include extern "C" { #include } + #ifndef UINT64_MAX #define UINT64_MAX 18446744073709551615ULL #endif @@ -83,6 +85,7 @@ enum { // options for terminate() kIOServiceRequired = 0x00000001, kIOServiceTerminate = 0x00000004, + kIOServiceTerminateWithRematch = 0x00000010, // options for registerService() & terminate() kIOServiceSynchronous = 0x00000002, @@ -109,18 +112,33 @@ extern const IORegistryPlane * gIOPowerPlane; extern const OSSymbol * gIOResourcesKey; extern const OSSymbol * gIOResourceMatchKey; extern const OSSymbol * gIOResourceMatchedKey; +extern const OSSymbol * gIOResourceIOKitKey; + extern const OSSymbol * gIOProviderClassKey; extern const OSSymbol * gIONameMatchKey; extern const OSSymbol * gIONameMatchedKey; extern const OSSymbol * gIOPropertyMatchKey; +extern const OSSymbol * gIOPropertyExistsMatchKey; extern const OSSymbol * gIOLocationMatchKey; extern const OSSymbol * gIOParentMatchKey; extern const OSSymbol * gIOPathMatchKey; extern const OSSymbol * gIOMatchCategoryKey; extern const OSSymbol * gIODefaultMatchCategoryKey; extern const OSSymbol * gIOMatchedServiceCountKey; +extern const OSSymbol * gIOMatchedPersonalityKey; +extern const OSSymbol * gIORematchPersonalityKey; +extern const OSSymbol * gIORematchCountKey; +extern const OSSymbol * gIODEXTMatchCountKey; extern const OSSymbol * gIOUserClientClassKey; + +extern const OSSymbol * gIOUserClassKey; +extern const OSSymbol * gIOUserServerClassKey; +extern const OSSymbol * gIOUserServerNameKey; +extern const OSSymbol * gIOUserServerTagKey; +extern const OSSymbol * gIOUserServerCDHashKey; +extern const OSSymbol * gIOUserUserClientKey; + extern const OSSymbol * gIOKitDebugKey; extern const OSSymbol * gIOServiceKey; @@ -150,6 +168,11 @@ extern const OSSymbol * gIOBSDMajorKey; extern const OSSymbol * gIOBSDMinorKey; extern const OSSymbol * gIOBSDUnitKey; +extern const OSSymbol * gIODriverKitEntitlementKey; +extern const OSSymbol * gIOServiceDEXTEntitlementsKey; +extern const OSSymbol * gIODriverKitUserClientEntitlementsKey; +extern const OSSymbol * gIOMatchDeferKey; + extern SInt32 IOServiceOrdering( const OSMetaClassBase * inObj1, const OSMetaClassBase * inObj2, void * ref ); typedef void (*IOInterruptAction)( OSObject * target, void * refCon, @@ -309,12 +332,17 @@ class IOPlatformExpert; struct IOInterruptAccountingData; struct IOInterruptAccountingReporter; +struct OSObjectUserVars; class IOService : public IORegistryEntry { - OSDeclareDefaultStructors(IOService) + OSDeclareDefaultStructorsWithDispatch(IOService); +#if XNU_KERNEL_PRIVATE +public: +#else protected: +#endif /* XNU_KERNEL_PRIVATE */ /*! @struct ExpansionData * @discussion This structure will be used to expand the capablilties of this class in the future. */ @@ -330,6 +358,8 @@ protected: IOLock * interruptStatisticsLock; IOInterruptAccountingReporter * interruptStatisticsArray; int interruptStatisticsArrayCount; + + OSObjectUserVars * uvars; }; /*! @var reserved @@ -566,7 +596,7 @@ public: virtual bool open( IOService * forClient, IOOptionBits options = 0, - void * arg = 0 ); + void * arg = NULL ); /*! @function close * @abstract Releases active access to a provider. @@ -583,7 +613,7 @@ public: * @param forClient If non-zero, isOpen returns the open state for that client. If zero is passed, isOpen returns the open state for all clients. * @result true if the specific, or any, client has the IOService object open. */ - virtual bool isOpen( const IOService * forClient = 0 ) const; + virtual bool isOpen( const IOService * forClient = NULL ) const; /*! @function handleOpen * @abstract Controls the open / close behavior of an IOService object (overrideable by subclasses). @@ -632,7 +662,7 @@ public: /*! @function init * @abstract Initializes generic IOService data structures (expansion data, etc). */ - virtual bool init( OSDictionary * dictionary = 0 ) APPLE_KEXT_OVERRIDE; + virtual bool init( OSDictionary * dictionary = NULL ) APPLE_KEXT_OVERRIDE; /*! @function init * @abstract Initializes generic IOService data structures (expansion data, etc). */ @@ -729,7 +759,8 @@ public: * @param key An OSSymbol key that globally identifies the object. * @param value The object to be published. */ - static void publishResource( const OSSymbol * key, OSObject * value = 0 ); + static void publishResource( const OSSymbol * key, OSObject * value = NULL ); + static void publishUserResource( const OSSymbol * key, OSObject * value = NULL ); /*! @function publishResource * @abstract Uses the resource service to publish a property. @@ -737,7 +768,7 @@ public: * @param key A C string key that globally identifies the object. * @param value The object to be published. */ - static void publishResource( const char * key, OSObject * value = 0 ); + static void publishResource( const char * key, OSObject * value = NULL ); virtual bool addNeededResource( const char * key ); /* Notifications */ @@ -762,7 +793,7 @@ public: static IONotifier * addNotification( const OSSymbol * type, OSDictionary * matching, IOServiceNotificationHandler handler, - void * target, void * ref = 0, + void * target, void * ref = NULL, SInt32 priority = 0 ) APPLE_KEXT_DEPRECATED; @@ -786,7 +817,7 @@ public: static IONotifier * addMatchingNotification( const OSSymbol * type, OSDictionary * matching, IOServiceMatchingNotificationHandler handler, - void * target, void * ref = 0, + void * target, void * ref = NULL, SInt32 priority = 0 ); @@ -804,10 +835,9 @@ public: * @param timeout The maximum time to wait. * @result A published IOService object matching the supplied dictionary. */ - LIBKERN_RETURNS_NOT_RETAINED - static IOService * waitForService( + static LIBKERN_RETURNS_NOT_RETAINED IOService * waitForService( LIBKERN_CONSUMED OSDictionary * matching, - mach_timespec_t * timeout = 0); + mach_timespec_t * timeout = NULL); /*! @function waitForMatchingService * @abstract Waits for a matching to service to be published. @@ -847,7 +877,7 @@ public: * @result The matching dictionary created, or passed in, is returned on success, or zero on failure. */ static OSDictionary * serviceMatching( const char * className, - OSDictionary * table = 0 ); + OSDictionary * table = NULL ); /*! @function serviceMatching * @abstract Creates a matching dictionary, or adds matching properties to an existing dictionary, that specify an IOService class match. @@ -857,7 +887,7 @@ public: * @result The matching dictionary created, or passed in, is returned on success, or zero on failure. */ static OSDictionary * serviceMatching( const OSString * className, - OSDictionary * table = 0 ); + OSDictionary * table = NULL ); /*! @function nameMatching * @abstract Creates a matching dictionary, or adds matching properties to an existing dictionary, that specify an IOService name match. @@ -867,7 +897,7 @@ public: * @result The matching dictionary created, or passed in, is returned on success, or zero on failure. */ static OSDictionary * nameMatching( const char * name, - OSDictionary * table = 0 ); + OSDictionary * table = NULL ); /*! @function nameMatching * @abstract Creates a matching dictionary, or adds matching properties to an existing dictionary, that specify an IOService name match. @@ -877,7 +907,7 @@ public: * @result The matching dictionary created, or passed in, is returned on success, or zero on failure. */ static OSDictionary * nameMatching( const OSString* name, - OSDictionary * table = 0 ); + OSDictionary * table = NULL ); /*! @function resourceMatching * @abstract Creates a matching dictionary, or adds matching properties to an existing dictionary, that specify a resource service match. @@ -887,7 +917,7 @@ public: * @result The matching dictionary created, or passed in, is returned on success, or zero on failure. */ static OSDictionary * resourceMatching( const char * name, - OSDictionary * table = 0 ); + OSDictionary * table = NULL ); /*! @function resourceMatching * @abstract Creates a matching dictionary, or adds matching properties to an existing dictionary, that specify a resource service match. @@ -897,7 +927,7 @@ public: * @result The matching dictionary created, or passed in, is returned on success, or zero on failure. */ static OSDictionary * resourceMatching( const OSString * name, - OSDictionary * table = 0 ); + OSDictionary * table = NULL ); /*! @function propertyMatching @@ -909,7 +939,7 @@ public: * @result The matching dictionary created, or passed in, is returned on success, or zero on failure. */ static OSDictionary * propertyMatching( const OSSymbol * key, const OSObject * value, - OSDictionary * table = 0 ); + OSDictionary * table = NULL ); /*! @function registryEntryIDMatching * @abstract Creates a matching dictionary, or adds matching properties to an existing dictionary, that specify a IORegistryEntryID match. @@ -919,7 +949,7 @@ public: * @result The matching dictionary created, or passed in, is returned on success, or zero on failure. */ static OSDictionary * registryEntryIDMatching( uint64_t entryID, - OSDictionary * table = 0 ); + OSDictionary * table = NULL ); /*! @function addLocation @@ -1137,7 +1167,7 @@ public: virtual IOReturn registerInterrupt(int source, OSObject *target, IOInterruptAction handler, - void *refCon = 0); + void *refCon = NULL); #ifdef __BLOCKS__ /*! @function registerInterrupt @@ -1224,7 +1254,7 @@ public: * @result An IOReturn code defined by the message type. */ virtual IOReturn message( UInt32 type, IOService * provider, - void * argument = 0 ); + void * argument = NULL ); /*! @function messageClient * @abstract Sends a generic message to an attached client. @@ -1236,7 +1266,7 @@ public: * @result The return code from the client message call. */ virtual IOReturn messageClient( UInt32 messageType, OSObject * client, - void * messageArgument = 0, vm_size_t argSize = 0 ); + void * messageArgument = NULL, vm_size_t argSize = 0 ); /*! @function messageClients * @abstract Sends a generic message to all attached clients. @@ -1247,11 +1277,11 @@ public: * @result Any non-kIOReturnSuccess return codes returned by the clients, or kIOReturnSuccess if all return kIOReturnSuccess. */ virtual IOReturn messageClients( UInt32 type, - void * argument = 0, vm_size_t argSize = 0 ); + void * argument = NULL, vm_size_t argSize = 0 ); virtual IONotifier * registerInterest( const OSSymbol * typeOfInterest, IOServiceInterestHandler handler, - void * target, void * ref = 0 ); + void * target, void * ref = NULL ); #ifdef __BLOCKS__ IONotifier * registerInterest(const OSSymbol * typeOfInterest, @@ -1285,10 +1315,11 @@ public: virtual IOReturn newUserClient( task_t owningTask, void * securityID, UInt32 type, OSDictionary * properties, - IOUserClient ** handler ); + LIBKERN_RETURNS_RETAINED IOUserClient ** handler ); virtual IOReturn newUserClient( task_t owningTask, void * securityID, - UInt32 type, IOUserClient ** handler ); + UInt32 type, + LIBKERN_RETURNS_RETAINED IOUserClient ** handler ); /* Return code utilities */ @@ -1347,6 +1378,9 @@ public: IOReturn setAuthorizationID( uint64_t authorizationID ); void cpusRunning(void); void scheduleFinalize(bool now); + static void willShutdown(); + static void startDeferredMatches(); + static void kextdLaunched(); private: static IOReturn waitMatchIdle( UInt32 ms ); @@ -1354,13 +1388,15 @@ private: const OSSymbol * type, OSDictionary * matching, IOServiceMatchingNotificationHandler handler, void * target, void * ref, - SInt32 priority, OSIterator ** existing ); + SInt32 priority, + LIBKERN_RETURNS_RETAINED OSIterator ** existing ); #if !defined(__LP64__) static IONotifier * installNotification( const OSSymbol * type, OSDictionary * matching, IOServiceNotificationHandler handler, void * target, void * ref, - SInt32 priority, OSIterator ** existing); + SInt32 priority, + LIBKERN_RETURNS_RETAINED OSIterator ** existing); #endif /* !defined(__LP64__) */ #endif @@ -1427,7 +1463,7 @@ private: OSArray * copyNotifiers(const OSSymbol * type, IOOptionBits orNewState, IOOptionBits andNewState); - bool invokeNotifiers(OSArray ** willSend); + bool invokeNotifiers(OSArray * willSend[]); bool invokeNotifier( class _IOServiceNotifier * notify ); APPLE_KEXT_COMPATIBILITY_VIRTUAL @@ -1435,7 +1471,7 @@ private: APPLE_KEXT_COMPATIBILITY_VIRTUAL IOReturn waitForState( UInt32 mask, UInt32 value, - mach_timespec_t * timeout = 0 ); + mach_timespec_t * timeout = NULL ); IOReturn waitForState( UInt32 mask, UInt32 value, uint64_t timeout ); @@ -1449,7 +1485,7 @@ private: static void __attribute__((__noreturn__)) terminateThread( void * arg, wait_result_t unused ); static void terminateWorker( IOOptionBits options ); static void actionWillTerminate( IOService * victim, IOOptionBits options, - OSArray * doPhase2List, void*, void * ); + OSArray * doPhase2List, bool, void * ); static void actionDidTerminate( IOService * victim, IOOptionBits options, void *, void *, void *); @@ -1466,7 +1502,10 @@ private: APPLE_KEXT_COMPATIBILITY_VIRTUAL IOReturn resolveInterrupt(IOService *nub, int source); APPLE_KEXT_COMPATIBILITY_VIRTUAL - IOReturn lookupInterrupt(int source, bool resolve, IOInterruptController **interruptController); + IOReturn lookupInterrupt( + int source, bool resolve, + LIBKERN_RETURNS_NOT_RETAINED IOInterruptController * + *interruptController); #ifdef XNU_KERNEL_PRIVATE /* end xnu internals */ @@ -1846,7 +1885,7 @@ protected: * Drivers may eliminate the influence of the changePowerStateTo method on power state one of two ways. See @link powerOverrideOnPriv powerOverrideOnPriv@/link to ignore the method's influence, or call changePowerStateTo(0) in the driver's start routine to remove the changePowerStateTo method's power request. * @param ordinal The number of the desired power state in the power state array. * @result A return code that can be ignored by the caller. */ - +public: IOReturn changePowerStateToPriv( unsigned long ordinal ); /*! @function powerOverrideOnPriv @@ -1874,8 +1913,8 @@ protected: public: void idleTimerExpired( void ); void settleTimerExpired( void ); - IOReturn synchronizePowerTree( IOOptionBits options = 0, IOService * notifyRoot = 0 ); - bool assertPMDriverCall( IOPMDriverCallEntry * callEntry, IOOptionBits options = 0, IOPMinformee * inform = 0 ); + IOReturn synchronizePowerTree( IOOptionBits options = 0, IOService * notifyRoot = NULL ); + bool assertPMDriverCall( IOPMDriverCallEntry * callEntry, IOOptionBits method, const IOPMinformee * inform = NULL, IOOptionBits options = 0 ); void deassertPMDriverCall( IOPMDriverCallEntry * callEntry ); IOReturn changePowerStateWithOverrideTo( IOPMPowerStateIndex ordinal, IOPMRequestTag tag ); IOReturn changePowerStateForRootDomain( IOPMPowerStateIndex ordinal ); @@ -1893,6 +1932,7 @@ public: static IOWorkLoop * getIOPMWorkloop( void ); bool getBlockingDriverCall(thread_t *thread, const void **callMethod); + void cancelIdlePowerDown(IOService * service); protected: bool tellClientsWithResponse( int messageType ); @@ -1963,7 +2003,7 @@ private: static IOReturn actionSpinDumpTimerExpired(OSObject *, void *, void *, void *, void * ); static IOReturn actionDriverCalloutDone(OSObject *, void *, void *, void *, void * ); - static IOPMRequest * acquirePMRequest( IOService * target, IOOptionBits type, IOPMRequest * active = 0 ); + static IOPMRequest * acquirePMRequest( IOService * target, IOOptionBits type, IOPMRequest * active = NULL ); static void releasePMRequest( IOPMRequest * request ); static void pmDriverCallout( IOService * from ); static void pmTellAppWithResponse( OSObject * object, void * context ); @@ -1971,7 +2011,7 @@ private: static void pmTellCapabilityAppWithResponse( OSObject * object, void * arg ); static void pmTellCapabilityClientWithResponse( OSObject * object, void * arg ); static void submitPMRequest(LIBKERN_CONSUMED IOPMRequest * request ); - static void submitPMRequests( IOPMRequest ** request, IOItemCount count ); + static void submitPMRequests( IOPMRequest * requests[], IOItemCount count ); bool ackTimerTick( void ); void addPowerChild1( IOPMRequest * request ); void addPowerChild2( IOPMRequest * request ); @@ -2014,7 +2054,7 @@ private: IOReturn updatePowerStatesReport( IOReportConfigureAction action, void *result, void *destination ); IOReturn configureSimplePowerReport(IOReportConfigureAction action, void *result ); IOReturn updateSimplePowerReport( IOReportConfigureAction action, void *result, void *destination ); - void waitForPMDriverCall( IOService * target = 0 ); + void waitForPMDriverCall( IOService * target = NULL ); #endif /* XNU_KERNEL_PRIVATE */ }; diff --git a/iokit/IOKit/IOServicePM.h b/iokit/IOKit/IOServicePM.h index e20f0f086..d226255bc 100644 --- a/iokit/IOKit/IOServicePM.h +++ b/iokit/IOKit/IOServicePM.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 1998-2000 Apple Computer, Inc. All rights reserved. + * Copyright (c) 1998-2019 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -95,7 +95,7 @@ class IOPMprot : public OSObject { friend class IOService; - OSDeclareDefaultStructors(IOPMprot) + OSDeclareDefaultStructors(IOPMprot); public: const char * ourName; diff --git a/iokit/IOKit/IOSharedDataQueue.h b/iokit/IOKit/IOSharedDataQueue.h index f0347c8c6..c956ce774 100644 --- a/iokit/IOKit/IOSharedDataQueue.h +++ b/iokit/IOKit/IOSharedDataQueue.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 1998-2000 Apple Computer, Inc. All rights reserved. + * Copyright (c) 1998-2019 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -29,13 +29,6 @@ #ifndef _IOKIT_IOSHAREDDATAQUEUE_H #define _IOKIT_IOSHAREDDATAQUEUE_H -#ifdef dequeue -#undef dequeue -#endif -#ifdef enqueue -#undef enqueue -#endif - #define DISABLE_DATAQUEUE_WARNING /* IODataQueue is deprecated, please use IOSharedDataQueue instead */ #include @@ -57,7 +50,7 @@ typedef struct _IODataQueueEntry IODataQueueEntry; */ class IOSharedDataQueue : public IODataQueue { - OSDeclareDefaultStructors(IOSharedDataQueue) + OSDeclareDefaultStructors(IOSharedDataQueue); struct ExpansionData { UInt32 queueSize; diff --git a/iokit/IOKit/IOSubMemoryDescriptor.h b/iokit/IOKit/IOSubMemoryDescriptor.h index a228cb4ff..42b10d913 100644 --- a/iokit/IOKit/IOSubMemoryDescriptor.h +++ b/iokit/IOKit/IOSubMemoryDescriptor.h @@ -94,6 +94,10 @@ public: virtual IOReturn setPurgeable( IOOptionBits newState, IOOptionBits * oldState ) APPLE_KEXT_OVERRIDE; + IOReturn setOwnership( task_t newOwner, + int newLedgerTag, + IOOptionBits newLedgerOptions ); + // support map() on kIOMemoryTypeVirtual without prepare() virtual IOMemoryMap * makeMapping( IOMemoryDescriptor * owner, diff --git a/iokit/IOKit/IOSyncer.h b/iokit/IOKit/IOSyncer.h index f72dcc37f..299bb5e93 100644 --- a/iokit/IOKit/IOSyncer.h +++ b/iokit/IOKit/IOSyncer.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 1998-2000 Apple Computer, Inc. All rights reserved. + * Copyright (c) 1998-2019 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -34,7 +34,7 @@ class IOSyncer : public OSObject { - OSDeclareDefaultStructors(IOSyncer) + OSDeclareDefaultStructors(IOSyncer); private: // The spin lock that is used to guard the 'threadMustStop' variable. diff --git a/iokit/IOKit/IOTimerEventSource.h b/iokit/IOKit/IOTimerEventSource.h index ed54a6a60..f4987b69b 100644 --- a/iokit/IOKit/IOTimerEventSource.h +++ b/iokit/IOKit/IOTimerEventSource.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 1998-2016 Apple Inc. All rights reserved. + * Copyright (c) 1998-2019 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -107,7 +107,7 @@ enum{ class IOTimerEventSource : public IOEventSource { - OSDeclareDefaultStructors(IOTimerEventSource) + OSDeclareDefaultStructors(IOTimerEventSource); protected: /*! @var calloutEntry thread_call entry for preregistered thread callouts */ @@ -161,7 +161,7 @@ public: #endif /* __BLOCKS__ */ static IOTimerEventSource * - timerEventSource(OSObject *owner, Action action = 0); + timerEventSource(OSObject *owner, Action action = NULL); /*! @function timerEventSource * @abstract Allocates and returns an initialized timer instance. @@ -170,7 +170,7 @@ public: * @param action 'C' Function pointer for the callout routine of this event source. */ static IOTimerEventSource * - timerEventSource(uint32_t options, OSObject *owner, Action action = 0); + timerEventSource(uint32_t options, OSObject *owner, Action action = NULL); #ifdef __BLOCKS__ /*! @function timerEventSource @@ -191,7 +191,7 @@ public: /*! @function init * @abstract Initializes the timer with an owner, and a handler to call when the timeout expires. */ - virtual bool init(OSObject *owner, Action action = 0); + virtual bool init(OSObject *owner, Action action = NULL); /*! @function enable * @abstract Enables a call to the action. diff --git a/iokit/IOKit/IOTypes.h b/iokit/IOKit/IOTypes.h index be2137e18..d07f14f1b 100644 --- a/iokit/IOKit/IOTypes.h +++ b/iokit/IOKit/IOTypes.h @@ -28,6 +28,8 @@ #ifndef __IOKIT_IOTYPES_H #define __IOKIT_IOTYPES_H +#ifndef PLATFORM_DriverKit + #ifndef IOKIT #define IOKIT 1 #endif /* !IOKIT */ @@ -47,7 +49,11 @@ extern "C" { #ifndef NULL #if defined (__cplusplus) +#if __cplusplus >= 201103L +#define NULL nullptr +#else #define NULL 0 +#endif #else #define NULL ((void *)0) #endif @@ -173,6 +179,7 @@ typedef io_object_t io_enumerator_t; typedef io_object_t io_iterator_t; typedef io_object_t io_registry_entry_t; typedef io_object_t io_service_t; +typedef io_object_t uext_object_t; #define IO_OBJECT_NULL ((io_object_t) 0) @@ -190,35 +197,41 @@ enum { kIOCopybackCache = 3, kIOWriteCombineCache = 4, kIOCopybackInnerCache = 5, - kIOPostedWrite = 6 + kIOPostedWrite = 6, + kIORealTimeCache = 7, + kIOPostedReordered = 8, + kIOPostedCombinedReordered = 9, }; // IOMemory mapping options enum { - kIOMapAnywhere = 0x00000001, - - kIOMapCacheMask = 0x00000700, - kIOMapCacheShift = 8, - kIOMapDefaultCache = kIODefaultCache << kIOMapCacheShift, - kIOMapInhibitCache = kIOInhibitCache << kIOMapCacheShift, - kIOMapWriteThruCache = kIOWriteThruCache << kIOMapCacheShift, - kIOMapCopybackCache = kIOCopybackCache << kIOMapCacheShift, - kIOMapWriteCombineCache = kIOWriteCombineCache << kIOMapCacheShift, - kIOMapCopybackInnerCache = kIOCopybackInnerCache << kIOMapCacheShift, - kIOMapPostedWrite = kIOPostedWrite << kIOMapCacheShift, - - kIOMapUserOptionsMask = 0x00000fff, - - kIOMapReadOnly = 0x00001000, - - kIOMapStatic = 0x01000000, - kIOMapReference = 0x02000000, - kIOMapUnique = 0x04000000, + kIOMapAnywhere = 0x00000001, + + kIOMapCacheMask = 0x00000f00, + kIOMapCacheShift = 8, + kIOMapDefaultCache = kIODefaultCache << kIOMapCacheShift, + kIOMapInhibitCache = kIOInhibitCache << kIOMapCacheShift, + kIOMapWriteThruCache = kIOWriteThruCache << kIOMapCacheShift, + kIOMapCopybackCache = kIOCopybackCache << kIOMapCacheShift, + kIOMapWriteCombineCache = kIOWriteCombineCache << kIOMapCacheShift, + kIOMapCopybackInnerCache = kIOCopybackInnerCache << kIOMapCacheShift, + kIOMapPostedWrite = kIOPostedWrite << kIOMapCacheShift, + kIOMapRealTimeCache = kIORealTimeCache << kIOMapCacheShift, + kIOMapPostedReordered = kIOPostedReordered << kIOMapCacheShift, + kIOMapPostedCombinedReordered = kIOPostedCombinedReordered << kIOMapCacheShift, + + kIOMapUserOptionsMask = 0x00000fff, + + kIOMapReadOnly = 0x00001000, + + kIOMapStatic = 0x01000000, + kIOMapReference = 0x02000000, + kIOMapUnique = 0x04000000, #ifdef XNU_KERNEL_PRIVATE - kIOMap64Bit = 0x08000000, + kIOMap64Bit = 0x08000000, #endif - kIOMapPrefault = 0x10000000, - kIOMapOverwrite = 0x20000000 + kIOMapPrefault = 0x10000000, + kIOMapOverwrite = 0x20000000 }; /*! @enum Scale Factors @@ -253,4 +266,30 @@ typedef unsigned int IODeviceNumber; } #endif +#else /* !PLATFORM_DriverKit */ + +#include + +typedef uint32_t IOOptionBits; +typedef int32_t IOFixed; +typedef uint32_t IOVersion; +typedef uint32_t IOItemCount; +typedef uint32_t IOCacheMode; + +typedef uint32_t IOByteCount32; +typedef uint64_t IOByteCount64; +typedef IOByteCount64 IOByteCount; + +typedef uint32_t IOPhysicalAddress32; +typedef uint64_t IOPhysicalAddress64; +typedef uint32_t IOPhysicalLength32; +typedef uint64_t IOPhysicalLength64; + +typedef IOPhysicalAddress64 IOPhysicalAddress; +typedef IOPhysicalLength64 IOPhysicalLength; + +typedef uint64_t IOVirtualAddress; + +#endif /* PLATFORM_DriverKit */ + #endif /* ! __IOKIT_IOTYPES_H */ diff --git a/iokit/IOKit/IOUserClient.h b/iokit/IOKit/IOUserClient.h index 1c17dda61..eba181d07 100644 --- a/iokit/IOKit/IOUserClient.h +++ b/iokit/IOKit/IOUserClient.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 1998-2016 Apple Inc. All rights reserved. + * Copyright (c) 1998-2019 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -36,6 +36,7 @@ #include #include #include +#include #if IOKITSTATS #include @@ -173,7 +174,7 @@ enum { class IOUserClient : public IOService { - OSDeclareAbstractStructors(IOUserClient) + OSDeclareAbstractStructorsWithDispatch(IOUserClient); #if IOKITSTATS friend class IOStatistics; #endif @@ -206,7 +207,7 @@ public: UInt8 sharedInstance; UInt8 closed; UInt8 __ipcFinal; - UInt8 __reservedA[1]; + UInt8 messageAppSuspended; volatile SInt32 __ipc; queue_head_t owners; IOLock * lock; @@ -222,10 +223,12 @@ private: #endif /* XNU_KERNEL_PRIVATE */ public: - virtual IOReturn externalMethod( uint32_t selector, IOExternalMethodArguments * arguments, - IOExternalMethodDispatch * dispatch = 0, OSObject * target = 0, void * reference = 0 ); + MIG_SERVER_ROUTINE virtual IOReturn + externalMethod(uint32_t selector, IOExternalMethodArguments *arguments, + IOExternalMethodDispatch *dispatch = NULL, + OSObject *target = NULL, void *reference = NULL); - virtual IOReturn registerNotificationPort( + MIG_SERVER_ROUTINE virtual IOReturn registerNotificationPort( mach_port_t port, UInt32 type, io_user_reference_t refCon); private: @@ -308,6 +311,8 @@ public: static OSObject * copyClientEntitlement( task_t task, const char * entitlement ); + static OSDictionary * copyClientEntitlements(task_t task); + /*! * @function releaseAsyncReference64 * @abstract Release the mach_port_t reference held within the OSAsyncReference64 structure. @@ -342,10 +347,10 @@ public: virtual IOService * getService( void ); - virtual IOReturn registerNotificationPort( + MIG_SERVER_ROUTINE virtual IOReturn registerNotificationPort( mach_port_t port, UInt32 type, UInt32 refCon ); - virtual IOReturn getNotificationSemaphore( UInt32 notification_type, + MIG_SERVER_ROUTINE virtual IOReturn getNotificationSemaphore( UInt32 notification_type, semaphore_t * semaphore ); virtual IOReturn connectClient( IOUserClient * client ); @@ -436,9 +441,11 @@ public: // Methods for accessing method vector. virtual IOExternalMethod * - getTargetAndMethodForIndex( IOService ** targetP, UInt32 index ); + getTargetAndMethodForIndex( + LIBKERN_RETURNS_NOT_RETAINED IOService ** targetP, UInt32 index ); virtual IOExternalAsyncMethod * - getAsyncTargetAndMethodForIndex( IOService ** targetP, UInt32 index ); + getAsyncTargetAndMethodForIndex( + LIBKERN_RETURNS_NOT_RETAINED IOService ** targetP, UInt32 index ); // Methods for accessing trap vector - old and new style virtual IOExternalTrap * @@ -446,7 +453,12 @@ public: APPLE_KEXT_DEPRECATED; virtual IOExternalTrap * - getTargetAndTrapForIndex( IOService **targetP, UInt32 index ); + getTargetAndTrapForIndex( + LIBKERN_RETURNS_NOT_RETAINED IOService **targetP, UInt32 index ); }; +#ifdef XNU_KERNEL_PRIVATE +extern "C" void IOMachPortDestroyUserReferences(OSObject * obj, natural_t type); +#endif /* XNU_KERNEL_PRIVATE */ + #endif /* ! _IOKIT_IOUSERCLIENT_H */ diff --git a/iokit/IOKit/IOUserServer.h b/iokit/IOKit/IOUserServer.h new file mode 100644 index 000000000..0741ed4cb --- /dev/null +++ b/iokit/IOKit/IOUserServer.h @@ -0,0 +1,232 @@ +/* + * Copyright (c) 2019 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + + +#ifndef _IOUSERSERVER_H +#define _IOUSERSERVER_H + +#include + +#define kIOUserClassKey "IOUserClass" +#define kIOUserServerClassKey "IOUserServer" +#define kIOUserServerNameKey "IOUserServerName" +#define kIOUserServerTagKey "IOUserServerTag" +// the expected cdhash value of the userspace driver executable +#define kIOUserServerCDHashKey "IOUserServerCDHash" + +#if DRIVERKIT_PRIVATE + +enum{ + kIOKitUserServerClientType = 0x99000003, +}; + +enum{ + kIOUserServerMethodRegisterClass = 0x0001000, + kIOUserServerMethodStart = 0x0001001, + kIOUserServerMethodRegister = 0x0001002, +}; + + +/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ + +class OSObject; + +#define OSObject_Instantiate_ID 0x0000000100000001ULL + +enum { + kOSObjectRPCRemote = 0x00000001, + kOSObjectRPCKernel = 0x00000002, +}; + +struct OSObject_Instantiate_Msg_Content { + IORPCMessage __hdr; + OSObjectRef __object; +}; + +struct OSObject_Instantiate_Rpl_Content { + IORPCMessage __hdr; + kern_return_t __result; + uint32_t __pad; + uint64_t flags; + char classname[64]; + uint64_t methods[0]; +}; + +#pragma pack(4) +struct OSObject_Instantiate_Msg { + IORPCMessageMach mach; + mach_msg_port_descriptor_t __object__descriptor; + OSObject_Instantiate_Msg_Content content; +}; +struct OSObject_Instantiate_Rpl { + IORPCMessageMach mach; + OSObject_Instantiate_Rpl_Content content; +}; +#pragma pack() + +typedef uint64_t IOTrapMessageBuffer[256]; + +#endif /* DRIVERKIT_PRIVATE */ + +/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ +/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ + +#ifdef XNU_KERNEL_PRIVATE + +#include +#include +#include +#include + +class IOUserServer; +class OSUserMetaClass; +class IODispatchQueue; +class IODispatchSource; +class IOInterruptDispatchSource; +class IOTimerDispatchSource; +struct IOPStrings; + +struct OSObjectUserVars { + IOUserServer * userServer; + IODispatchQueue ** queueArray; + OSUserMetaClass * userMeta; + OSArray * openProviders; + bool willTerminate; + bool didTerminate; + bool serverDied; + bool started; + bool stopped; + bool userServerPM; + bool willPower; + uint32_t powerOverride; +}; + +extern IOLock * gIOUserServerLock; + +typedef struct ipc_kmsg * ipc_kmsg_t; + +/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ + +namespace IOServicePH +{ +void serverAdd(IOUserServer * server); +void serverRemove(IOUserServer * server); +void serverAck(IOUserServer * server); +}; + +/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ + +class IOUserServer : public IOUserClient +{ + OSDeclareDefaultStructorsWithDispatch(IOUserServer); + + IOLock * fLock; + IOSimpleLock * fInterruptLock; + task_t fOwningTask; + OSDictionary * fEntitlements; + OSDictionary * fClasses; + IODispatchQueue * fRootQueue; + OSArray * fServices; + + uint64_t fPowerStates; + uint8_t fRootNotifier; + uint8_t fSystemPowerAck; + uint8_t fSystemOff; + +public: + + static IOUserClient * withTask(task_t owningTask); + virtual IOReturn clientClose(void) APPLE_KEXT_OVERRIDE; + virtual bool finalize(IOOptionBits options) APPLE_KEXT_OVERRIDE; + virtual void stop(IOService * provider) APPLE_KEXT_OVERRIDE; + virtual void free() APPLE_KEXT_OVERRIDE; + + virtual IOReturn setProperties(OSObject * properties) APPLE_KEXT_OVERRIDE; + virtual IOReturn externalMethod(uint32_t selector, IOExternalMethodArguments * args, + IOExternalMethodDispatch * dispatch, + OSObject * target, void * reference) APPLE_KEXT_OVERRIDE; + + virtual IOExternalTrap * getTargetAndTrapForIndex(IOService ** targetP, UInt32 index) APPLE_KEXT_OVERRIDE; + + IOReturn serviceAttach(IOService * service, IOService * provider); + IOReturn serviceStop(IOService * service, IOService * provider); + void serviceFree(IOService * service); + IOReturn serviceStarted(IOService * service, IOService * provider, bool result); + static void serviceWillTerminate(IOService * client, IOService * provider, IOOptionBits options); + static void serviceDidTerminate(IOService * client, IOService * provider, IOOptionBits options, bool * defer); + static void serviceDidStop(IOService * client, IOService * provider); + IOReturn serviceOpen(IOService * provider, IOService * client); + IOReturn serviceClose(IOService * provider, IOService * client); + IOReturn serviceNewUserClient(IOService * service, task_t owningTask, void * securityID, + uint32_t type, OSDictionary * properties, IOUserClient ** handler); + IOReturn exit(const char * reason); + + bool serviceMatchesCDHash(IOService *service); + bool checkEntitlements(IOService * provider, IOService * dext); + bool checkEntitlements(OSDictionary * entitlements, OSObject * prop, + IOService * provider, IOService * dext); + + void setTaskLoadTag(OSKext *kext); + void setDriverKitUUID(OSKext *kext); + void systemPower(bool powerOff); + IOReturn setPowerState(unsigned long state, IOService * service) APPLE_KEXT_OVERRIDE; + IOReturn powerStateWillChangeTo(IOPMPowerFlags flags, unsigned long state, IOService * service) APPLE_KEXT_OVERRIDE; + IOReturn powerStateDidChangeTo(IOPMPowerFlags flags, unsigned long state, IOService * service) APPLE_KEXT_OVERRIDE; + + IOPStrings * copyInStringArray(const char * string, uint32_t userSize); + uint32_t stringArrayIndex(IOPStrings * array, const char * look); + IOReturn registerClass(OSClassDescription * desc, uint32_t size, OSUserMetaClass ** cls); + IOReturn setRootQueue(IODispatchQueue * queue); + + OSObjectUserVars * varsForObject(OSObject * obj); + LIBKERN_RETURNS_NOT_RETAINED IODispatchQueue * queueForObject(OSObject * obj, uint64_t msgid); + + static ipc_port_t copySendRightForObject(OSObject * object, natural_t /* ipc_kobject_type_t */ type); + static OSObject * copyObjectForSendRight(ipc_port_t port, natural_t /* ipc_kobject_type_t */ type); + + IOReturn copyOutObjects(IORPCMessageMach * mach, IORPCMessage * message, + size_t size, bool consume); + IOReturn copyInObjects(IORPCMessageMach * mach, IORPCMessage * message, + size_t size, bool copyObjects, bool consumePorts); + + IOReturn consumeObjects(IORPCMessage * message, size_t messageSize); + + IOReturn objectInstantiate(OSObject * obj, IORPC rpc, IORPCMessage * message); + IOReturn kernelDispatch(OSObject * obj, IORPC rpc); + static OSObject * target(OSAction * action, IORPCMessage * message); + + IOReturn rpc(IORPC rpc); + IOReturn server(ipc_kmsg_t requestkmsg, ipc_kmsg_t * preply); + kern_return_t waitInterruptTrap(void * p1, void * p2, void * p3, void * p4, void * p5, void * p6); +}; + +extern "C" kern_return_t +IOUserServerUEXTTrap(OSObject * object, void * p1, void * p2, void * p3, void * p4, void * p5, void * p6); + +#endif /* XNU_KERNEL_PRIVATE */ +#endif /* _IOUSERSERVER_H */ diff --git a/iokit/IOKit/IOWorkLoop.h b/iokit/IOKit/IOWorkLoop.h index 2c1fd64f5..1d5fa916b 100644 --- a/iokit/IOKit/IOWorkLoop.h +++ b/iokit/IOKit/IOWorkLoop.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 1998-2009 Apple Inc. All rights reserved. + * Copyright (c) 1998-2019 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -53,7 +53,7 @@ class IOCommandGate; */ class IOWorkLoop : public OSObject { - OSDeclareDefaultStructors(IOWorkLoop) + OSDeclareDefaultStructors(IOWorkLoop); public: /*! @@ -294,8 +294,8 @@ public: * @result Returns the value of the Action callout. */ virtual IOReturn runAction(Action action, OSObject *target, - void *arg0 = 0, void *arg1 = 0, - void *arg2 = 0, void *arg3 = 0); + void *arg0 = NULL, void *arg1 = NULL, + void *arg2 = NULL, void *arg3 = NULL); #ifdef __BLOCKS__ /*! @function runAction diff --git a/iokit/IOKit/Makefile b/iokit/IOKit/Makefile index 327365f21..e898d7e4b 100644 --- a/iokit/IOKit/Makefile +++ b/iokit/IOKit/Makefile @@ -29,7 +29,7 @@ EXPINC_SUBDIRS = ${INSTINC_SUBDIRS} # Kernel.framework/Headers/IOKit AND Kernel.framework/PrivateHeaders/IOKit. # This is so the files with #ifdef ...PRIVATE portions can be processed. # xnu/README documents the INSTALL* and EXPORT_MI_DIR lists. -ALL_HEADERS = $(shell (cd $(SOURCE); echo *.h)) +ALL_HEADERS = $(notdir $(wildcard $(SOURCE)*.h)) # Do not install these headers (anywhere). NOT_EXPORT_HEADERS = IOInterruptAccountingPrivate.h @@ -44,10 +44,11 @@ NOT_KF_MI_HEADERS = $(NOT_EXPORT_HEADERS) \ IOKernelReporters.h \ IOInterruptAccounting.h + # These should be additionally installed in IOKit.framework's public Headers INSTALL_MI_LIST = IOBSD.h IOKitKeys.h IOKitServer.h IOReturn.h \ IOSharedLock.h IOTypes.h OSMessageNotification.h \ - IODataQueueShared.h IOMessage.h + IODataQueueShared.h IOMessage.h IORPC.h IOUserServer.h # These should be additionally installed in IOKit.framework's PrivateHeaders INSTALL_MI_LCL_LIST = IOKitKeysPrivate.h IOHibernatePrivate.h \ diff --git a/iokit/IOKit/machine/Makefile b/iokit/IOKit/machine/Makefile index d7a224db7..11cf1ace2 100644 --- a/iokit/IOKit/machine/Makefile +++ b/iokit/IOKit/machine/Makefile @@ -13,7 +13,7 @@ include $(MakeInc_def) MI_DIR = machine EXCLUDE_HEADERS = -ALL_HEADERS = $(shell (cd $(SOURCE); echo *.h)) +ALL_HEADERS = $(notdir $(wildcard $(SOURCE)*.h)) HEADER_LIST = $(filter-out $(EXCLUDE_HEADERS), $(ALL_HEADERS)) INSTALL_MI_LIST = ${HEADER_LIST} diff --git a/iokit/IOKit/nvram/Makefile b/iokit/IOKit/nvram/Makefile index 393486e2a..b3946e7ff 100644 --- a/iokit/IOKit/nvram/Makefile +++ b/iokit/IOKit/nvram/Makefile @@ -13,7 +13,7 @@ include $(MakeInc_def) MI_DIR = nvram NOT_EXPORT_HEADERS = -ALL_HEADERS = $(shell (cd $(SOURCE); echo *.h)) +ALL_HEADERS = $(notdir $(wildcard $(SOURCE)*.h)) INSTALL_MI_LIST = INSTALL_MI_LCL_LIST = diff --git a/iokit/IOKit/perfcontrol/IOPerfControl.h b/iokit/IOKit/perfcontrol/IOPerfControl.h index b1501cd52..27e776ee3 100644 --- a/iokit/IOKit/perfcontrol/IOPerfControl.h +++ b/iokit/IOKit/perfcontrol/IOPerfControl.h @@ -8,6 +8,8 @@ #ifdef __cplusplus #include +#include +#include struct thread_group; @@ -175,6 +177,13 @@ public: WorkEndFunction workEnd; }; + struct IOPerfControlClientShared { + atomic_uint_fast8_t maxDriverIndex; + PerfControllerInterface interface; + IOLock *interfaceLock; + OSSet *deviceRegistrationList; + }; + /*! * @function registerPerformanceController * @abstract Register a performance controller to receive callbacks. Not for general driver use. @@ -190,20 +199,22 @@ private: uint8_t perfcontrol_data[32]; }; -// TODO: size of table should match sum(maxWorkCapacity) of all users - static constexpr size_t kWorkTableNumEntries = 1024; + static constexpr size_t kMaxWorkTableNumEntries = 1024; + static constexpr size_t kWorkTableIndexBits = 24; + static constexpr size_t kWorkTableMaxSize = (1 << kWorkTableIndexBits) - 1; // - 1 since + // kIOPerfControlClientWorkUntracked takes number 0 + static constexpr size_t kWorkTableIndexMask = mask(kWorkTableIndexBits); uint64_t allocateToken(thread_group *thread_group); void deallocateToken(uint64_t token); bool getEntryForToken(uint64_t token, WorkTableEntry &entry); void markEntryStarted(uint64_t token, bool started); + inline uint64_t tokenToGlobalUniqueToken(uint64_t token); - PerfControllerInterface interface; - IOLock *interfaceLock; - OSSet *deviceRegistrationList; - -// TODO: replace with ltable or pool of objects - WorkTableEntry workTable[kWorkTableNumEntries]; + uint8_t driverIndex; + IOPerfControlClientShared *shared; + WorkTableEntry *workTable; + size_t workTableLength; size_t workTableNextIndex; IOSimpleLock *workTableLock; }; diff --git a/iokit/IOKit/perfcontrol/Makefile b/iokit/IOKit/perfcontrol/Makefile index 3f8cad1d5..017c31610 100644 --- a/iokit/IOKit/perfcontrol/Makefile +++ b/iokit/IOKit/perfcontrol/Makefile @@ -13,7 +13,7 @@ include $(MakeInc_def) MI_DIR = perfcontrol NOT_EXPORT_HEADERS = -ALL_HEADERS = $(shell (cd $(SOURCE); echo *.h)) +ALL_HEADERS = $(notdir $(wildcard $(SOURCE)*.h)) # Install these only in Kernel.framework's PrivateHeaders (not Headers). NOT_KF_MI_HEADERS = $(NOT_EXPORT_HEADERS) \ diff --git a/iokit/IOKit/platform/Makefile b/iokit/IOKit/platform/Makefile index ebb1f416e..35ec20f4c 100644 --- a/iokit/IOKit/platform/Makefile +++ b/iokit/IOKit/platform/Makefile @@ -14,7 +14,7 @@ MI_DIR = platform NOT_EXPORT_HEADERS = NOT_KF_MI_HEADERS = -ALL_HEADERS = $(shell (cd $(SOURCE); echo *.h)) +ALL_HEADERS = $(notdir $(wildcard $(SOURCE)*.h)) INSTALL_MI_LIST = INSTALL_MI_LCL_LIST = diff --git a/iokit/IOKit/power/IOPwrController.h b/iokit/IOKit/power/IOPwrController.h index 6fcd0d8ba..f38707f8d 100644 --- a/iokit/IOKit/power/IOPwrController.h +++ b/iokit/IOKit/power/IOPwrController.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 1998-2000 Apple Computer, Inc. All rights reserved. + * Copyright (c) 1998-2019 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -33,7 +33,7 @@ class IOPwrController : public IOService { - OSDeclareAbstractStructors(IOPwrController) + OSDeclareAbstractStructors(IOPwrController); public: }; diff --git a/iokit/IOKit/power/Makefile b/iokit/IOKit/power/Makefile index 01cc4bd09..8407f3006 100644 --- a/iokit/IOKit/power/Makefile +++ b/iokit/IOKit/power/Makefile @@ -13,7 +13,7 @@ include $(MakeInc_def) MI_DIR = power NOT_EXPORT_HEADERS = -ALL_HEADERS = $(shell (cd $(SOURCE); echo *.h)) +ALL_HEADERS = $(notdir $(wildcard $(SOURCE)*.h)) INSTALL_MI_LIST = INSTALL_MI_LCL_LIST = diff --git a/iokit/IOKit/pwr_mgt/IOPM.h b/iokit/IOKit/pwr_mgt/IOPM.h index bc1b88397..a7b0417b2 100644 --- a/iokit/IOKit/pwr_mgt/IOPM.h +++ b/iokit/IOKit/pwr_mgt/IOPM.h @@ -102,6 +102,12 @@ enum { kIOPMPowerOn = 0x00000002, kIOPMDeviceUsable = 0x00008000, kIOPMLowPower = 0x00010000, +#if PRIVATE +#if !(defined(RC_HIDE_N144) || defined(RC_HIDE_N146)) + kIOPMAOTPower = 0x00020000, + kIOPMAOTCapability = kIOPMAOTPower, +#endif /* !(defined(RC_HIDE_N144) || defined(RC_HIDE_N146)) */ +#endif /* PRIVATE */ kIOPMPreventIdleSleep = 0x00000040, kIOPMSleepCapability = 0x00000004, kIOPMRestartCapability = 0x00000080, @@ -321,6 +327,12 @@ enum { */ kIOPMDriverAssertionCPUBit = 0x01, + /*! kIOPMDriverAssertionPreventSystemIdleSleepBit + * When set, the system should not idle sleep. This does not prevent + * demand sleep. + */ + kIOPMDriverAssertionPreventSystemIdleSleepBit = 0x02, + /*! kIOPMDriverAssertionUSBExternalDeviceBit * When set, driver is informing PM that an external USB device is attached. */ @@ -473,7 +485,7 @@ enum { * Argument accompanying the kIOPMMessageSleepWakeUUIDChange notification when * the current UUID has been removed. */ -#define kIOPMMessageSleepWakeUUIDCleared ((void *)0) +#define kIOPMMessageSleepWakeUUIDCleared ((void *)NULL) /*! kIOPMMessageDriverAssertionsChanged * Sent when kernel PM driver assertions have changed. @@ -510,7 +522,8 @@ enum { kIOPMProcessorSpeedChange = (1 << 8),// change the processor speed kIOPMOverTemp = (1 << 9),// system dangerously hot kIOPMClamshellOpened = (1 << 10),// clamshell was opened - kIOPMDWOverTemp = (1 << 11)// DarkWake thermal limits exceeded. + kIOPMDWOverTemp = (1 << 11),// DarkWake thermal limits exceeded. + kIOPMPowerButtonUp = (1 << 12) // Power button up }; @@ -589,7 +602,7 @@ enum { #define kIOPMPSLegacyBatteryInfoKey "LegacyBatteryInfo" #define kIOPMPSBatteryHealthKey "BatteryHealth" #define kIOPMPSHealthConfidenceKey "HealthConfidence" -#define kIOPMPSCapacityEstimatedKey "CapacityEstimated" +#define kIOPMPSCapacityEstimatedKey "CapacityEstimated" #define kIOPMPSBatteryChargeStatusKey "ChargeStatus" #define kIOPMPSBatteryTemperatureKey "Temperature" #define kIOPMPSAdapterDetailsKey "AdapterDetails" @@ -627,13 +640,13 @@ enum { #define kIOPMPSAdapterDetailsRevisionKey "AdapterRevision" #define kIOPMPSAdapterDetailsSerialNumberKey "SerialNumber" #define kIOPMPSAdapterDetailsFamilyKey "FamilyCode" -#define kIOPMPSAdapterDetailsAmperageKey "Amperage" +#define kIOPMPSAdapterDetailsAmperageKey "Current" #define kIOPMPSAdapterDetailsDescriptionKey "Description" #define kIOPMPSAdapterDetailsPMUConfigurationKey "PMUConfiguration" -#define kIOPMPSAdapterDetailsVoltage "AdapterVoltage" -#define kIOPMPSAdapterDetailsSourceIDKey "SourceID" -#define kIOPMPSAdapterDetailsErrorFlagsKey "ErrorFlags" -#define kIOPMPSAdapterDetailsSharedSourceKey "SharedSource" +#define kIOPMPSAdapterDetailsVoltage "Voltage" +#define kIOPMPSAdapterDetailsSourceIDKey "Source" +#define kIOPMPSAdapterDetailsErrorFlagsKey "ErrorFlags" +#define kIOPMPSAdapterDetailsSharedSourceKey "SharedSource" #define kIOPMPSAdapterDetailsCloakedKey "CloakedSource" // values for kIOPSPowerAdapterFamilyKey diff --git a/iokit/IOKit/pwr_mgt/IOPMPowerSource.h b/iokit/IOKit/pwr_mgt/IOPMPowerSource.h index 7f199e6b8..c248941a5 100644 --- a/iokit/IOKit/pwr_mgt/IOPMPowerSource.h +++ b/iokit/IOKit/pwr_mgt/IOPMPowerSource.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 1998-2005 Apple Computer, Inc. All rights reserved. + * Copyright (c) 1998-2019 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -170,7 +170,7 @@ enum { class IOPMPowerSource : public IOService { - OSDeclareDefaultStructors(IOPMPowerSource) + OSDeclareDefaultStructors(IOPMPowerSource); friend class IOPMPowerSourceList; diff --git a/iokit/IOKit/pwr_mgt/IOPMPowerSourceList.h b/iokit/IOKit/pwr_mgt/IOPMPowerSourceList.h index f78ca2d54..e69663a3e 100644 --- a/iokit/IOKit/pwr_mgt/IOPMPowerSourceList.h +++ b/iokit/IOKit/pwr_mgt/IOPMPowerSourceList.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 1998-2005 Apple Computer, Inc. All rights reserved. + * Copyright (c) 1998-2019 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -32,7 +32,7 @@ class IOPMPowerSource; class IOPMPowerSourceList : public OSObject { - OSDeclareDefaultStructors(IOPMPowerSourceList) + OSDeclareDefaultStructors(IOPMPowerSourceList); private: // pointer to first power source in list IOPMPowerSource *firstItem; diff --git a/iokit/IOKit/pwr_mgt/IOPMPrivate.h b/iokit/IOKit/pwr_mgt/IOPMPrivate.h index 798be5d88..1a549c0c4 100644 --- a/iokit/IOKit/pwr_mgt/IOPMPrivate.h +++ b/iokit/IOKit/pwr_mgt/IOPMPrivate.h @@ -183,6 +183,7 @@ enum { * These are valid values for IOPM.h:IOPMCalendarStruct->selector */ enum { + kPMCalendarTypeInvalid = 0, kPMCalendarTypeMaintenance = 1, kPMCalendarTypeSleepService = 2 }; @@ -269,6 +270,8 @@ enum { #define kIOPMSleepStatisticsAppsKey "AppStatistics" #define kIOPMIdleSleepPreventersKey "IdleSleepPreventers" #define kIOPMSystemSleepPreventersKey "SystemSleepPreventers" +#define kIOPMIdleSleepPreventersWithIDKey "IdleSleepPreventersWithID" +#define kIOPMSystemSleepPreventersWithIDKey "SystemSleepPreventersWithID" // Application response statistics #define kIOPMStatsNameKey "Name" @@ -682,6 +685,84 @@ enum { #define kIOPMWakeEventSource 0x00000001 +#if !(defined(RC_HIDE_N144) || defined(RC_HIDE_N146)) +/***************************************************************************** + * + * AOT defs + * + *****************************************************************************/ + +// signals the device should wake up to user space running +#define kIOPMWakeEventAOTExit 0x00000002 + +// will start a 400 ms timer before sleeping +#define kIOPMWakeEventAOTPossibleExit 0x00000004 + +// signals the device should wake up to user space running +#define kIOPMWakeEventAOTConfirmedPossibleExit 0x00000008 + +// signals the device should go back to AOT +#define kIOPMWakeEventAOTRejectedPossibleExit 0x00000010 + +// signals the device should go back to AOT +#define kIOPMWakeEventAOTExpiredPossibleExit 0x00000020 + +#define kIOPMWakeEventAOTFlags \ + (kIOPMWakeEventAOTExit \ + | kIOPMWakeEventAOTPossibleExit \ + | kIOPMWakeEventAOTConfirmedPossibleExit \ + | kIOPMWakeEventAOTRejectedPossibleExit \ + | kIOPMWakeEventAOTExpiredPossibleExit) + +#define kIOPMWakeEventAOTPossibleFlags \ + (kIOPMWakeEventAOTPossibleExit \ + | kIOPMWakeEventAOTConfirmedPossibleExit \ + | kIOPMWakeEventAOTRejectedPossibleExit \ + | kIOPMWakeEventAOTExpiredPossibleExit) + +#define kIOPMWakeEventAOTPerCycleFlags \ + (kIOPMWakeEventAOTPossibleExit \ + | kIOPMWakeEventAOTRejectedPossibleExit \ + | kIOPMWakeEventAOTExpiredPossibleExit) + +#define kIOPMWakeEventAOTExitFlags \ + (kIOPMWakeEventAOTExit \ + | kIOPMWakeEventAOTConfirmedPossibleExit) + +enum { + kIOPMAOTModeEnable = 0x00000001, + kIOPMAOTModeCycle = 0x00000002, + kIOPMAOTModeAddEventFlags = 0x00000004, + kIOPMAOTModeRespectTimers = 0x00000008, + kIOPMAOTModeDefault = (kIOPMAOTModeEnable | kIOPMAOTModeAddEventFlags | kIOPMAOTModeRespectTimers) +}; + +enum { + kIOPMAOTMetricsKernelWakeCountMax = 24 +}; + +struct IOPMAOTMetrics +{ + uint32_t sleepCount; + uint32_t possibleCount; + uint32_t confirmedPossibleCount; + uint32_t rejectedPossibleCount; + uint32_t expiredPossibleCount; + uint32_t noTimeSetCount; + uint32_t rtcAlarmsCount; + uint32_t softwareRequestCount; + uint64_t totalTime; + + char kernelWakeReason[kIOPMAOTMetricsKernelWakeCountMax][64]; + // 54:10 secs:ms calendar time + uint64_t kernelSleepTime[kIOPMAOTMetricsKernelWakeCountMax]; + uint64_t kernelWakeTime[kIOPMAOTMetricsKernelWakeCountMax]; +}; + +#define kIOPMAOTPowerKey "aot-power" + +#endif /* !(defined(RC_HIDE_N144) || defined(RC_HIDE_N146)) */ + /***************************************************************************** * * System Sleep Policy @@ -873,6 +954,7 @@ typedef struct { #define SWD_VALID_LOGS 0x08 #define SWD_LOGS_IN_FILE 0x10 #define SWD_LOGS_IN_MEM 0x20 +#define SWD_PWR_BTN_STACKSHOT 0x30 #define SWD_DATA_CRC_ERROR 0x010000 #define SWD_BUF_SIZE_ERROR 0x020000 diff --git a/iokit/IOKit/pwr_mgt/IOPMinformee.h b/iokit/IOKit/pwr_mgt/IOPMinformee.h index 6280e2ba4..40111c0cc 100644 --- a/iokit/IOKit/pwr_mgt/IOPMinformee.h +++ b/iokit/IOKit/pwr_mgt/IOPMinformee.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 1998-2007 Apple Inc. All rights reserved. + * Copyright (c) 1998-2019 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -33,7 +33,7 @@ class IOPMinformee : public OSObject { - OSDeclareDefaultStructors(IOPMinformee) + OSDeclareDefaultStructors(IOPMinformee); friend class IOPMinformeeList; public: diff --git a/iokit/IOKit/pwr_mgt/IOPMinformeeList.h b/iokit/IOKit/pwr_mgt/IOPMinformeeList.h index f941a0ad1..ae4b12110 100644 --- a/iokit/IOKit/pwr_mgt/IOPMinformeeList.h +++ b/iokit/IOKit/pwr_mgt/IOPMinformeeList.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 1998-2000 Apple Computer, Inc. All rights reserved. + * Copyright (c) 1998-2019 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -35,7 +35,7 @@ extern uint32_t gCanSleepTimeout; class IOPMinformeeList : public OSObject { - OSDeclareDefaultStructors(IOPMinformeeList) + OSDeclareDefaultStructors(IOPMinformeeList); friend class IOPMinformee; private: diff --git a/iokit/IOKit/pwr_mgt/IOPowerConnection.h b/iokit/IOKit/pwr_mgt/IOPowerConnection.h index 98ebe50b5..f6dfb48d0 100644 --- a/iokit/IOKit/pwr_mgt/IOPowerConnection.h +++ b/iokit/IOKit/pwr_mgt/IOPowerConnection.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 1998-2000 Apple Computer, Inc. All rights reserved. + * Copyright (c) 1998-2019 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -48,7 +48,7 @@ class IOPowerConnection : public IOService { - OSDeclareDefaultStructors(IOPowerConnection) + OSDeclareDefaultStructors(IOPowerConnection); protected: /*! @field parentKnowsState true: parent knows state of its domain diff --git a/iokit/IOKit/pwr_mgt/Makefile b/iokit/IOKit/pwr_mgt/Makefile index ad7dcbbdf..c4cf72b31 100644 --- a/iokit/IOKit/pwr_mgt/Makefile +++ b/iokit/IOKit/pwr_mgt/Makefile @@ -17,7 +17,7 @@ NOT_EXPORT_HEADERS = \ IOPMlog.h \ IOPMPrivate.h -ALL_HEADERS = $(shell (cd $(SOURCE); echo *.h)) +ALL_HEADERS = $(notdir $(wildcard $(SOURCE)*.h)) INSTALL_MI_LIST = IOPMLibDefs.h IOPM.h INSTALL_MI_LCL_LIST = IOPMPrivate.h diff --git a/iokit/IOKit/pwr_mgt/RootDomain.h b/iokit/IOKit/pwr_mgt/RootDomain.h index 504d8d0f2..61334b1cd 100644 --- a/iokit/IOKit/pwr_mgt/RootDomain.h +++ b/iokit/IOKit/pwr_mgt/RootDomain.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 1998-2016 Apple Inc. All rights reserved. + * Copyright (c) 1998-2019 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -34,6 +34,9 @@ #include #ifdef XNU_KERNEL_PRIVATE + +#include + struct AggressivesRecord; struct IOPMMessageFilterContext; struct IOPMActions; @@ -43,6 +46,7 @@ class PMTraceWorker; class IOPMPowerStateQueue; class RootDomainUserClient; class PMAssertionsTracker; +class IOTimerEventSource; #define OBFUSCATE(x) (void *)VM_KERNEL_UNSLIDE_OR_PERM(x) @@ -130,11 +134,11 @@ typedef IOReturn (*IOPMSettingControllerCallback) __BEGIN_DECLS IONotifier * registerSleepWakeInterest( - IOServiceInterestHandler, void *, void * = 0); + IOServiceInterestHandler, void *, void * = NULL); IONotifier * registerPrioritySleepWakeInterest( IOServiceInterestHandler handler, - void * self, void * ref = 0); + void * self, void * ref = NULL); IOReturn acknowledgeSleepWakeNotification(void * ); @@ -145,7 +149,7 @@ __END_DECLS class IOPMrootDomain : public IOService { - OSDeclareFinalStructors(IOPMrootDomain) + OSDeclareFinalStructors(IOPMrootDomain); public: static IOPMrootDomain * construct( void ); @@ -243,7 +247,7 @@ public: void claimSystemWakeEvent( IOService *device, IOOptionBits flags, const char *reason, - OSObject *details = 0 ); + OSObject *details = NULL ); virtual IOReturn receivePowerNotification( UInt32 msg ); @@ -324,7 +328,7 @@ public: virtual IONotifier * registerInterest( const OSSymbol * typeOfInterest, IOServiceInterestHandler handler, - void * target, void * ref = 0 ) APPLE_KEXT_OVERRIDE; + void * target, void * ref = NULL ) APPLE_KEXT_OVERRIDE; virtual IOReturn callPlatformFunction( const OSSymbol *functionName, @@ -386,7 +390,11 @@ public: */ IOReturn restartWithStackshot(); + IOReturn setWakeTime(uint64_t wakeContinuousTime); + private: + unsigned long getRUN_STATE(void); + virtual IOReturn changePowerStateTo( unsigned long ordinal ) APPLE_KEXT_COMPATIBILITY_OVERRIDE; virtual IOReturn changePowerStateToPriv( unsigned long ordinal ); virtual IOReturn requestPowerDomainState( IOPMPowerFlags, IOPowerConnection *, unsigned long ) APPLE_KEXT_OVERRIDE; @@ -524,6 +532,10 @@ public: void updatePreventSystemSleepList( IOService * service, bool addNotRemove ); + bool updatePreventIdleSleepListInternal( + IOService * service, bool addNotRemove, unsigned int oldCount); + unsigned int idleSleepPreventersCount(); + void publishPMSetting( const OSSymbol * feature, uint32_t where, uint32_t * featureID ); @@ -549,14 +561,15 @@ public: uint32_t * hibernateFreeTime ); bool mustHibernate( void ); #endif - void takeStackshot(bool restart, bool isOSXWatchdog, bool isSpinDump); + void takeStackshot(bool restart); void sleepWakeDebugTrig(bool restart); void sleepWakeDebugEnableWdog(); bool sleepWakeDebugIsWdogEnabled(); void sleepWakeDebugSaveSpinDumpFile(); bool checkShutdownTimeout(); - void panicWithShutdownLog(uint32_t timeoutInMs); + void panicWithShutdownLog(uint32_t timeoutInMs) __abortlike; uint32_t getWatchdogTimeout(); + void deleteStackshot(); private: friend class PMSettingObject; @@ -646,6 +659,8 @@ private: // Used to wait between say display idle and system idle thread_call_t extraSleepTimer; + thread_call_t powerButtonDown; + thread_call_t powerButtonUp; thread_call_t diskSyncCalloutEntry; thread_call_t fullWakeThreadCall; thread_call_t updateConsoleUsersEntry; @@ -693,6 +708,7 @@ private: unsigned int wranglerTickled :1; unsigned int _preventUserActive :1; unsigned int graphicsSuppressed :1; + unsigned int isRTCAlarmWake :1; unsigned int capabilityLoss :1; unsigned int pciCantSleepFlag :1; @@ -719,6 +735,7 @@ private: unsigned int displayPowerOnRequested:1; uint8_t tasksSuspended; + uint8_t tasksSuspendState; uint32_t hibernateMode; AbsoluteTime userActivityTime; AbsoluteTime userActivityTime_prev; @@ -772,6 +789,7 @@ private: UInt32 _scheduledAlarms; UInt32 _userScheduledAlarm; + clock_sec_t _scheduledAlarmUTC; #if HIBERNATION clock_sec_t _standbyTimerResetSeconds; @@ -790,6 +808,39 @@ private: OSArray * _systemWakeEventsArray; bool _acceptSystemWakeEvents; +#if !(defined(RC_HIDE_N144) || defined(RC_HIDE_N146)) + // AOT -- + IOPMCalendarStruct _aotWakeTimeCalendar; + IOTimerEventSource * _aotTimerES; + clock_sec_t _aotWakeTimeUTC; + uint64_t _aotTestTime; + uint64_t _aotTestInterval; + uint32_t _aotPendingFlags; +public: + IOPMAOTMetrics * _aotMetrics; + uint8_t _aotMode; +private: + uint8_t _aotNow; + uint8_t _aotTasksSuspended; + uint8_t _aotExit; + uint8_t _aotTimerScheduled; + uint8_t _aotReadyToFullWake; + uint64_t _aotLastWakeTime; + uint64_t _aotWakeTimeContinuous; + uint64_t _aotWakePreWindow; + uint64_t _aotWakePostWindow; + uint64_t _aotLingerTime; + + bool aotShouldExit(bool checkTimeSet, bool software); + void aotExit(bool cps); + void aotEvaluate(IOTimerEventSource * timer); +public: + bool isAOTMode(void); +private: + // -- AOT +#endif /* !(defined(RC_HIDE_N144) || defined(RC_HIDE_N146)) */ + + void updateTasksSuspend(void); int findSuspendedPID(uint32_t pid, uint32_t *outRefCount); // IOPMrootDomain internal sleep call @@ -807,6 +858,7 @@ private: void restoreUserSpinDownTimeout( void ); bool shouldSleepOnClamshellClosed(void ); + bool shouldSleepOnRTCAlarmWake(void ); void sendClientClamshellNotification( void ); // Inform PMCPU of changes to state like lid, AC vs. battery @@ -874,13 +926,14 @@ private: void preventTransitionToUserActive( bool prevent ); void setThermalState(OSObject *value); void copySleepPreventersList(OSArray **idleSleepList, OSArray **systemSleepList); + void copySleepPreventersListWithID(OSArray **idleSleepList, OSArray **systemSleepList); #endif /* XNU_KERNEL_PRIVATE */ }; #ifdef XNU_KERNEL_PRIVATE class IORootParent : public IOService { - OSDeclareFinalStructors(IORootParent) + OSDeclareFinalStructors(IORootParent); public: static void initialize( void ); diff --git a/iokit/IOKit/rtc/IORTCController.h b/iokit/IOKit/rtc/IORTCController.h index 5a73917ec..876210ee1 100644 --- a/iokit/IOKit/rtc/IORTCController.h +++ b/iokit/IOKit/rtc/IORTCController.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 1998-2017 Apple Computer, Inc. All rights reserved. + * Copyright (c) 1998-2019 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -38,7 +38,7 @@ typedef void (*RTC_tick_handler)( IOService * ); class IORTCController : public IOService { - OSDeclareAbstractStructors(IORTCController) + OSDeclareAbstractStructors(IORTCController); public: diff --git a/iokit/IOKit/rtc/Makefile b/iokit/IOKit/rtc/Makefile index 587476354..19d48faea 100644 --- a/iokit/IOKit/rtc/Makefile +++ b/iokit/IOKit/rtc/Makefile @@ -13,7 +13,7 @@ include $(MakeInc_def) MI_DIR = rtc NOT_EXPORT_HEADERS = -ALL_HEADERS = $(shell (cd $(SOURCE); echo *.h)) +ALL_HEADERS = $(notdir $(wildcard $(SOURCE)*.h)) INSTALL_MI_LIST = INSTALL_MI_LCL_LIST = diff --git a/iokit/IOKit/system_management/Makefile b/iokit/IOKit/system_management/Makefile index 715cf2d98..fc81916b4 100644 --- a/iokit/IOKit/system_management/Makefile +++ b/iokit/IOKit/system_management/Makefile @@ -13,7 +13,7 @@ include $(MakeInc_def) MI_DIR = system_management NOT_EXPORT_HEADERS = -ALL_HEADERS = $(shell (cd $(SOURCE); echo *.h)) +ALL_HEADERS = $(notdir $(wildcard $(SOURCE)*.h)) INSTALL_MI_LIST = INSTALL_MI_LCL_LIST = diff --git a/iokit/IOKitUser/IOBlockStorageDevice.h b/iokit/IOKitUser/IOBlockStorageDevice.h new file mode 100644 index 000000000..0be0d68b8 --- /dev/null +++ b/iokit/IOKitUser/IOBlockStorageDevice.h @@ -0,0 +1 @@ +#include diff --git a/iokit/IOKitUser/IOBufferMemoryDescriptor.h b/iokit/IOKitUser/IOBufferMemoryDescriptor.h new file mode 100644 index 000000000..357335695 --- /dev/null +++ b/iokit/IOKitUser/IOBufferMemoryDescriptor.h @@ -0,0 +1 @@ +#include diff --git a/iokit/IOKitUser/IODataQueueDispatchSource.h b/iokit/IOKitUser/IODataQueueDispatchSource.h new file mode 100644 index 000000000..4c7d36f1f --- /dev/null +++ b/iokit/IOKitUser/IODataQueueDispatchSource.h @@ -0,0 +1 @@ +#include diff --git a/iokit/IOKitUser/IODispatchQueue.h b/iokit/IOKitUser/IODispatchQueue.h new file mode 100644 index 000000000..05d4ff641 --- /dev/null +++ b/iokit/IOKitUser/IODispatchQueue.h @@ -0,0 +1 @@ +#include diff --git a/iokit/IOKitUser/IODispatchSource.h b/iokit/IOKitUser/IODispatchSource.h new file mode 100644 index 000000000..9a9d06cae --- /dev/null +++ b/iokit/IOKitUser/IODispatchSource.h @@ -0,0 +1 @@ +#include diff --git a/iokit/IOKitUser/IOInterruptDispatchSource.h b/iokit/IOKitUser/IOInterruptDispatchSource.h new file mode 100644 index 000000000..b27409e27 --- /dev/null +++ b/iokit/IOKitUser/IOInterruptDispatchSource.h @@ -0,0 +1 @@ +#include diff --git a/iokit/IOKitUser/IOMemoryDescriptor.h b/iokit/IOKitUser/IOMemoryDescriptor.h new file mode 100644 index 000000000..62cddfb57 --- /dev/null +++ b/iokit/IOKitUser/IOMemoryDescriptor.h @@ -0,0 +1 @@ +#include diff --git a/iokit/IOKitUser/IOMemoryMap.h b/iokit/IOKitUser/IOMemoryMap.h new file mode 100644 index 000000000..56fe092e5 --- /dev/null +++ b/iokit/IOKitUser/IOMemoryMap.h @@ -0,0 +1 @@ +#include diff --git a/iokit/IOKitUser/IOService.h b/iokit/IOKitUser/IOService.h new file mode 100644 index 000000000..ec9ad1b34 --- /dev/null +++ b/iokit/IOKitUser/IOService.h @@ -0,0 +1 @@ +#include diff --git a/iokit/IOKitUser/IOTimerDispatchSource.h b/iokit/IOKitUser/IOTimerDispatchSource.h new file mode 100644 index 000000000..7b0634dc8 --- /dev/null +++ b/iokit/IOKitUser/IOTimerDispatchSource.h @@ -0,0 +1 @@ +#include diff --git a/iokit/IOKitUser/IOUserServer.h b/iokit/IOKitUser/IOUserServer.h new file mode 100644 index 000000000..7c184bb4e --- /dev/null +++ b/iokit/IOKitUser/IOUserServer.h @@ -0,0 +1 @@ +#include diff --git a/iokit/IOKitUser/Makefile b/iokit/IOKitUser/Makefile new file mode 100644 index 000000000..afafc2ec1 --- /dev/null +++ b/iokit/IOKitUser/Makefile @@ -0,0 +1,17 @@ +export MakeInc_cmd=${SRCROOT}/makedefs/MakeInc.cmd +export MakeInc_def=${SRCROOT}/makedefs/MakeInc.def +export MakeInc_rule=${SRCROOT}/makedefs/MakeInc.rule +export MakeInc_dir=${SRCROOT}/makedefs/MakeInc.dir + +include $(MakeInc_cmd) +include $(MakeInc_def) + +ALL_HEADERS = $(notdir $(wildcard $(SOURCE)*.h)) + +EXPORT_MI_DIR = IOKitUser +INSTALL_MI_DIR = IOKitUser + +INSTALL_KF_MI_LIST = $(ALL_HEADERS) + +include $(MakeInc_rule) +include $(MakeInc_dir) diff --git a/iokit/IOKitUser/OSAction.h b/iokit/IOKitUser/OSAction.h new file mode 100644 index 000000000..9568834b0 --- /dev/null +++ b/iokit/IOKitUser/OSAction.h @@ -0,0 +1 @@ +#include diff --git a/iokit/IOKitUser/OSArray.h b/iokit/IOKitUser/OSArray.h new file mode 100644 index 000000000..9bcf3f9d0 --- /dev/null +++ b/iokit/IOKitUser/OSArray.h @@ -0,0 +1 @@ +#include diff --git a/iokit/IOKitUser/OSBoolean.h b/iokit/IOKitUser/OSBoolean.h new file mode 100644 index 000000000..885b393fe --- /dev/null +++ b/iokit/IOKitUser/OSBoolean.h @@ -0,0 +1 @@ +#include diff --git a/iokit/IOKitUser/OSCollection.h b/iokit/IOKitUser/OSCollection.h new file mode 100644 index 000000000..9e842c658 --- /dev/null +++ b/iokit/IOKitUser/OSCollection.h @@ -0,0 +1 @@ +#include diff --git a/iokit/IOKitUser/OSContainer.h b/iokit/IOKitUser/OSContainer.h new file mode 100644 index 000000000..b36a0ce9f --- /dev/null +++ b/iokit/IOKitUser/OSContainer.h @@ -0,0 +1 @@ +#include diff --git a/iokit/IOKitUser/OSData.h b/iokit/IOKitUser/OSData.h new file mode 100644 index 000000000..bebb52d3d --- /dev/null +++ b/iokit/IOKitUser/OSData.h @@ -0,0 +1 @@ +#include diff --git a/iokit/IOKitUser/OSDictionary.h b/iokit/IOKitUser/OSDictionary.h new file mode 100644 index 000000000..53cc653b3 --- /dev/null +++ b/iokit/IOKitUser/OSDictionary.h @@ -0,0 +1 @@ +#include diff --git a/iokit/IOKitUser/OSNumber.h b/iokit/IOKitUser/OSNumber.h new file mode 100644 index 000000000..377405c87 --- /dev/null +++ b/iokit/IOKitUser/OSNumber.h @@ -0,0 +1 @@ +#include diff --git a/iokit/IOKitUser/OSObject.h b/iokit/IOKitUser/OSObject.h new file mode 100644 index 000000000..4e5815435 --- /dev/null +++ b/iokit/IOKitUser/OSObject.h @@ -0,0 +1 @@ +#include diff --git a/iokit/IOKitUser/OSSerialization.h b/iokit/IOKitUser/OSSerialization.h new file mode 100644 index 000000000..6635145e5 --- /dev/null +++ b/iokit/IOKitUser/OSSerialization.h @@ -0,0 +1 @@ +#include diff --git a/iokit/IOKitUser/OSString.h b/iokit/IOKitUser/OSString.h new file mode 100644 index 000000000..094407001 --- /dev/null +++ b/iokit/IOKitUser/OSString.h @@ -0,0 +1 @@ +#include diff --git a/iokit/Kernel/IOBufferMemoryDescriptor.cpp b/iokit/Kernel/IOBufferMemoryDescriptor.cpp index 91aeca2a8..dfe4b08ba 100644 --- a/iokit/Kernel/IOBufferMemoryDescriptor.cpp +++ b/iokit/Kernel/IOBufferMemoryDescriptor.cpp @@ -111,6 +111,63 @@ IOBufferMemoryDescriptor::initWithOptions( } #endif /* !__LP64__ */ +IOBufferMemoryDescriptor * +IOBufferMemoryDescriptor::withCopy( + task_t inTask, + IOOptionBits options, + vm_map_t sourceMap, + mach_vm_address_t source, + mach_vm_size_t size) +{ + IOBufferMemoryDescriptor * inst; + kern_return_t err; + vm_map_copy_t copy; + vm_map_address_t address; + + copy = NULL; + do { + err = kIOReturnNoMemory; + inst = new IOBufferMemoryDescriptor; + if (!inst) { + break; + } + inst->_ranges.v64 = IONew(IOAddressRange, 1); + if (!inst->_ranges.v64) { + break; + } + + err = vm_map_copyin(sourceMap, source, size, + false /* src_destroy */, ©); + if (KERN_SUCCESS != err) { + break; + } + + err = vm_map_copyout(get_task_map(inTask), &address, copy); + if (KERN_SUCCESS != err) { + break; + } + copy = NULL; + + inst->_ranges.v64->address = address; + inst->_ranges.v64->length = size; + + if (!inst->initWithPhysicalMask(inTask, options, size, page_size, 0)) { + err = kIOReturnError; + } + } while (false); + + if (KERN_SUCCESS == err) { + return inst; + } + + if (copy) { + vm_map_copy_discard(copy); + } + OSSafeReleaseNULL(inst); + return NULL; +} + + bool IOBufferMemoryDescriptor::initWithPhysicalMask( task_t inTask, @@ -125,6 +182,7 @@ IOBufferMemoryDescriptor::initWithPhysicalMask( IOOptionBits iomdOptions = kIOMemoryTypeVirtual64 | kIOMemoryAsReference; IODMAMapSpecification mapSpec; bool mapped = false; + bool withCopy = false; bool needZero; if (!capacity) { @@ -135,14 +193,28 @@ IOBufferMemoryDescriptor::initWithPhysicalMask( _capacity = capacity; _internalFlags = 0; _internalReserved = 0; - _buffer = 0; + _buffer = NULL; - _ranges.v64 = IONew(IOAddressRange, 1); if (!_ranges.v64) { - return false; + _ranges.v64 = IONew(IOAddressRange, 1); + if (!_ranges.v64) { + return false; + } + _ranges.v64->address = 0; + _ranges.v64->length = 0; + } else { + if (!_ranges.v64->address) { + return false; + } + if (!(kIOMemoryPageable & options)) { + return false; + } + if (!inTask) { + return false; + } + _buffer = (void *) _ranges.v64->address; + withCopy = true; } - _ranges.v64->address = 0; - _ranges.v64->length = 0; // make sure super::free doesn't dealloc _ranges before super::init _flags = kIOMemoryAsReference; @@ -151,7 +223,7 @@ IOBufferMemoryDescriptor::initWithPhysicalMask( if (!(kIOMemoryMapperNone & options)) { IOMapper::checkForSystemMapper(); - mapped = (0 != IOMapper::gSystem); + mapped = (NULL != IOMapper::gSystem); } needZero = (mapped || (0 != (kIOMemorySharingTypeMask & options))); @@ -261,13 +333,17 @@ IOBufferMemoryDescriptor::initWithPhysicalMask( vm_size_t size = round_page(capacity); // initWithOptions will create memory entry - iomdOptions |= kIOMemoryPersistent; + if (!withCopy) { + iomdOptions |= kIOMemoryPersistent; + } if (options & kIOMemoryPageable) { #if IOALLOCDEBUG OSAddAtomicLong(size, &debug_iomallocpageable_size); #endif - mapTask = inTask; + if (!withCopy) { + mapTask = inTask; + } if (NULL == inTask) { inTask = kernel_task; } @@ -284,11 +360,11 @@ IOBufferMemoryDescriptor::initWithPhysicalMask( } } - _ranges.v64->address = (mach_vm_address_t) _buffer;; + _ranges.v64->address = (mach_vm_address_t) _buffer; _ranges.v64->length = _capacity; if (!super::initWithOptions(_ranges.v64, 1, 0, - inTask, iomdOptions, /* System mapper */ 0)) { + inTask, iomdOptions, /* System mapper */ NULL)) { return false; } @@ -315,7 +391,7 @@ IOBufferMemoryDescriptor::initWithPhysicalMask( reserved->map = createMappingInTask(mapTask, 0, kIOMapAnywhere | (options & kIOMapPrefault) | (options & kIOMapCacheMask), 0, 0); if (!reserved->map) { - _buffer = 0; + _buffer = NULL; return false; } release(); // map took a retain on this @@ -344,7 +420,7 @@ IOBufferMemoryDescriptor::inTaskWithOptions( if (me && !me->initWithPhysicalMask(inTask, options, capacity, alignment, 0)) { me->release(); - me = 0; + me = NULL; } return me; } @@ -360,7 +436,7 @@ IOBufferMemoryDescriptor::inTaskWithPhysicalMask( if (me && !me->initWithPhysicalMask(inTask, options, capacity, 1, physicalMask)) { me->release(); - me = 0; + me = NULL; } return me; } @@ -386,7 +462,7 @@ IOBufferMemoryDescriptor::withOptions( if (me && !me->initWithPhysicalMask(kernel_task, options, capacity, alignment, 0)) { me->release(); - me = 0; + me = NULL; } return me; } @@ -458,7 +534,7 @@ IOBufferMemoryDescriptor::withBytes(const void * inBytes, | (inContiguous ? kIOMemoryPhysicallyContiguous : 0), inLength, inLength, 0 )) { me->release(); - me = 0; + me = NULL; } if (me) { @@ -467,7 +543,7 @@ IOBufferMemoryDescriptor::withBytes(const void * inBytes, if (!me->appendBytes(inBytes, inLength)) { me->release(); - me = 0; + me = NULL; } } return me; @@ -488,7 +564,7 @@ IOBufferMemoryDescriptor::free() IOOptionBits options = _options; vm_size_t size = _capacity; void * buffer = _buffer; - IOMemoryMap * map = 0; + IOMemoryMap * map = NULL; IOAddressRange * range = _ranges.v64; vm_offset_t alignment = _alignment; @@ -653,7 +729,7 @@ IOBufferMemoryDescriptor::getBytesNoCopy(vm_size_t start, vm_size_t withLength) IOVirtualAddress address; if ((start + withLength) < start) { - return 0; + return NULL; } if (kIOMemoryTypePhysical64 == (_flags & kIOMemoryTypeMask)) { @@ -665,7 +741,7 @@ IOBufferMemoryDescriptor::getBytesNoCopy(vm_size_t start, vm_size_t withLength) if (start < _length && (start + withLength) <= _length) { return (void *)(address + start); } - return 0; + return NULL; } #ifndef __LP64__ diff --git a/iokit/Kernel/IOCPU.cpp b/iokit/Kernel/IOCPU.cpp index 88ac5d1ff..84b9cedec 100644 --- a/iokit/Kernel/IOCPU.cpp +++ b/iokit/Kernel/IOCPU.cpp @@ -44,6 +44,7 @@ extern void kperf_kernel_configure(char *); /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ #include +#include extern "C" void console_suspend(); extern "C" void console_resume(); @@ -149,6 +150,7 @@ iocpu_run_platform_actions(queue_head_t * queue, uint32_t first_priority, uint32 extern "C" kern_return_t IOCPURunPlatformQuiesceActions(void) { + assert(preemption_enabled() == false); return iocpu_run_platform_actions(&gActionQueues[kQueueQuiesce], 0, 0U - 1, NULL, NULL, NULL, TRUE); } @@ -156,6 +158,7 @@ IOCPURunPlatformQuiesceActions(void) extern "C" kern_return_t IOCPURunPlatformActiveActions(void) { + assert(preemption_enabled() == false); return iocpu_run_platform_actions(&gActionQueues[kQueueActive], 0, 0U - 1, NULL, NULL, NULL, TRUE); } @@ -426,7 +429,7 @@ PE_cpu_machine_quiesce(cpu_id_t target) } #if defined(__arm__) || defined(__arm64__) -static perfmon_interrupt_handler_func pmi_handler = 0; +static perfmon_interrupt_handler_func pmi_handler = NULL; kern_return_t PE_cpu_perfmon_interrupt_install_handler(perfmon_interrupt_handler_func handler) @@ -446,7 +449,7 @@ PE_cpu_perfmon_interrupt_enable(cpu_id_t target, boolean_t enable) } if (enable) { - targetCPU->getProvider()->registerInterrupt(1, targetCPU, (IOInterruptAction)pmi_handler, 0); + targetCPU->getProvider()->registerInterrupt(1, targetCPU, (IOInterruptAction)pmi_handler, NULL); targetCPU->getProvider()->enableInterrupt(1); } else { targetCPU->getProvider()->disableInterrupt(1); @@ -495,7 +498,7 @@ IOCPUSleepKernel(void) iter = IORegistryIterator::iterateOver( gIOServicePlane, kIORegistryIterateRecursively ); if (iter) { - all = 0; + all = NULL; do{ if (all) { all->release(); @@ -525,6 +528,18 @@ IOCPUSleepKernel(void) currentShutdownTarget = NULL; #endif + integer_t old_pri; + thread_t self = current_thread(); + + /* + * We need to boost this thread's priority to the maximum kernel priority to + * ensure we can urgently preempt ANY thread currently executing on the + * target CPU. Note that realtime threads have their own mechanism to eventually + * demote their priority below MAXPRI_KERNEL if they hog the CPU for too long. + */ + old_pri = thread_kern_get_pri(self); + thread_kern_set_pri(self, thread_kern_get_kernel_maxpri()); + // Sleep the CPUs. cnt = numCPUs; while (cnt--) { @@ -551,9 +566,18 @@ IOCPUSleepKernel(void) rootDomain->tracePoint( kIOPMTracePointSleepPlatformDriver ); rootDomain->stop_watchdog_timer(); - // Now sleep the boot CPU. + /* + * Now sleep the boot CPU, including calling the kQueueQuiesce actions. + * The system sleeps here. + */ + bootCPU->haltCPU(); + /* + * The system is now coming back from sleep on the boot CPU. + * The kQueueActive actions have already been called. + */ + rootDomain->start_watchdog_timer(); rootDomain->tracePoint( kIOPMTracePointWakePlatformActions ); @@ -592,6 +616,8 @@ IOCPUSleepKernel(void) #if defined(__arm64__) sched_restore_recommended_cores_after_sleep(); #endif + + thread_kern_set_pri(self, old_pri); } bool @@ -639,6 +665,18 @@ IOCPU::start(IOService *provider) return true; } +void +IOCPU::detach(IOService *provider) +{ + super::detach(provider); + IOLockLock(gIOCPUsLock); + unsigned int index = gIOCPUs->getNextIndexOfObject(this, 0); + if (index != (unsigned int)-1) { + gIOCPUs->removeObject(index); + } + IOLockUnlock(gIOCPUsLock); +} + OSObject * IOCPU::getProperty(const OSSymbol *aKey) const { @@ -680,12 +718,12 @@ IOCPU::setProperties(OSObject *properties) OSString *stateStr; IOReturn result; - if (dict == 0) { + if (dict == NULL) { return kIOReturnUnsupported; } stateStr = OSDynamicCast(OSString, dict->getObject(gIOCPUStateKey)); - if (stateStr != 0) { + if (stateStr != NULL) { result = IOUserClient::clientHasPrivilege(current_task(), kIOClientPrivilegeAdministrator); if (result != kIOReturnSuccess) { return result; @@ -809,7 +847,7 @@ IOCPUInterruptController::initCPUInterruptController(int sources, int cpus) numCPUs = cpus; vectors = (IOInterruptVector *)IOMalloc(numSources * sizeof(IOInterruptVector)); - if (vectors == 0) { + if (vectors == NULL) { return kIOReturnNoMemory; } bzero(vectors, numSources * sizeof(IOInterruptVector)); @@ -863,8 +901,8 @@ IOCPUInterruptController::setCPUInterruptProperties(IOService *service) OSData *tmpData; long tmpLong; - if ((service->getProperty(gIOInterruptControllersKey) != 0) && - (service->getProperty(gIOInterruptSpecifiersKey) != 0)) { + if ((service->getProperty(gIOInterruptControllersKey) != NULL) && + (service->getProperty(gIOInterruptSpecifiersKey) != NULL)) { return; } @@ -899,7 +937,7 @@ IOCPUInterruptController::enableCPUInterrupt(IOCPU *cpu) assert(numCPUs > 0); - ml_install_interrupt_handler(cpu, cpu->getCPUNumber(), this, handler, 0); + ml_install_interrupt_handler(cpu, cpu->getCPUNumber(), this, handler, NULL); IOTakeLock(vectors[0].interruptLock); ++enabledCPUs; @@ -920,6 +958,9 @@ IOCPUInterruptController::registerInterrupt(IOService *nub, { IOInterruptVector *vector; + // Interrupts must be enabled, as this can allocate memory. + assert(ml_get_interrupts_enabled() == TRUE); + if (source >= numSources) { return kIOReturnNoResources; } @@ -966,7 +1007,7 @@ IOCPUInterruptController::getInterruptType(IOService */*nub*/, int /*source*/, int *interruptType) { - if (interruptType == 0) { + if (interruptType == NULL) { return kIOReturnBadArgument; } diff --git a/iokit/Kernel/IOCatalogue.cpp b/iokit/Kernel/IOCatalogue.cpp index 814494af2..7c0201e4d 100644 --- a/iokit/Kernel/IOCatalogue.cpp +++ b/iokit/Kernel/IOCatalogue.cpp @@ -68,7 +68,8 @@ IOCatalogue * gIOCatalogue; const OSSymbol * gIOClassKey; const OSSymbol * gIOProbeScoreKey; const OSSymbol * gIOModuleIdentifierKey; -IORWLock * gIOCatalogLock; +const OSSymbol * gIOModuleIdentifierKernelKey; +IORWLock * gIOCatalogLock; #if PRAGMA_MARK #pragma mark Utility functions @@ -105,9 +106,11 @@ IOCatalogue::initialize(void) errorString->release(); } - gIOClassKey = OSSymbol::withCStringNoCopy( kIOClassKey ); - gIOProbeScoreKey = OSSymbol::withCStringNoCopy( kIOProbeScoreKey ); - gIOModuleIdentifierKey = OSSymbol::withCStringNoCopy( kCFBundleIdentifierKey ); + gIOClassKey = OSSymbol::withCStringNoCopy( kIOClassKey ); + gIOProbeScoreKey = OSSymbol::withCStringNoCopy( kIOProbeScoreKey ); + gIOModuleIdentifierKey = OSSymbol::withCStringNoCopy( kCFBundleIdentifierKey ); + gIOModuleIdentifierKernelKey = OSSymbol::withCStringNoCopy( kCFBundleIdentifierKernelKey ); + assert( array && gIOClassKey && gIOProbeScoreKey && gIOModuleIdentifierKey); @@ -129,7 +132,7 @@ IOCatalogue::arrayForPersonality(OSDictionary * dict) sym = OSDynamicCast(OSSymbol, dict->getObject(gIOProviderClassKey)); if (!sym) { - return 0; + return NULL; } return (OSArray *) personalities->getObject(sym); @@ -178,7 +181,7 @@ IOCatalogue::init(OSArray * initArray) continue; } OSKext::uniquePersonalityProperties(dict); - if (0 == dict->getObject( gIOClassKey )) { + if (NULL == dict->getObject( gIOClassKey )) { IOLog("Missing or bad \"%s\" key\n", gIOClassKey->getCStringNoCopy()); continue; @@ -219,7 +222,7 @@ IOCatalogue::findDrivers( set = OSOrderedSet::withCapacity( 1, IOServiceOrdering, (void *)gIOProbeScoreKey ); if (!set) { - return 0; + return NULL; } IORWLockRead(lock); @@ -265,12 +268,12 @@ IOCatalogue::findDrivers( set = OSOrderedSet::withCapacity( 1, IOServiceOrdering, (void *)gIOProbeScoreKey ); if (!set) { - return 0; + return NULL; } iter = OSCollectionIterator::withCollection(personalities); if (!iter) { set->release(); - return 0; + return NULL; } IORWLockRead(lock); @@ -474,33 +477,17 @@ IOCatalogue::getGenerationCount(void) const return generation; } +// Check to see if kernel module has been loaded already, and request its load. bool -IOCatalogue::isModuleLoaded(OSString * moduleName) const +IOCatalogue::isModuleLoaded(OSDictionary * driver, OSObject ** kextRef) const { - return isModuleLoaded(moduleName->getCStringNoCopy()); -} + OSString * moduleName = NULL; + OSString * publisherName = NULL; + OSReturn ret; -bool -IOCatalogue::isModuleLoaded(const char * moduleName) const -{ - OSReturn ret; - ret = OSKext::loadKextWithIdentifier(moduleName); - if (kOSKextReturnDeferred == ret) { - // a request has been queued but the module isn't necessarily - // loaded yet, so stall. - return false; + if (kextRef) { + *kextRef = NULL; } - // module is present or never will be - return true; -} - -// Check to see if module has been loaded already. -bool -IOCatalogue::isModuleLoaded(OSDictionary * driver) const -{ - OSString * moduleName = NULL; - OSString * publisherName = NULL; - if (!driver) { return false; } @@ -515,12 +502,25 @@ IOCatalogue::isModuleLoaded(OSDictionary * driver) const driver->getObject(kIOPersonalityPublisherKey)); OSKext::recordIdentifierRequest(publisherName); - moduleName = OSDynamicCast(OSString, driver->getObject(gIOModuleIdentifierKey)); + moduleName = OSDynamicCast(OSString, driver->getObject(gIOModuleIdentifierKernelKey)); if (moduleName) { - return isModuleLoaded(moduleName); + ret = OSKext::loadKextWithIdentifier(moduleName, kextRef); + if (kOSKextReturnDeferred == ret) { + // a request has been queued but the module isn't necessarily + // loaded yet, so stall. + return false; + } + OSString *moduleDextName = OSDynamicCast(OSString, driver->getObject(gIOModuleIdentifierKey)); + if (moduleDextName && !(moduleName->isEqualTo(moduleDextName))) { + OSObject *dextRef = NULL; + ret = OSKext::loadKextWithIdentifier(moduleDextName, &dextRef); + OSSafeReleaseNULL(dextRef); + } + // module is present or never will be + return true; } - /* If a personality doesn't hold the "CFBundleIdentifier" key + /* If a personality doesn't hold the "CFBundleIdentifier" or "CFBundleIdentifierKernel" key * it is assumed to be an "in-kernel" driver. */ return true; @@ -531,14 +531,9 @@ IOCatalogue::isModuleLoaded(OSDictionary * driver) const * IOCatalogueModuleLoaded(). Sent from kextd. */ void -IOCatalogue::moduleHasLoaded(OSString * moduleName) +IOCatalogue::moduleHasLoaded(const OSSymbol * moduleName) { - OSDictionary * dict; - - dict = OSDictionary::withCapacity(2); - dict->setObject(gIOModuleIdentifierKey, moduleName); - startMatching(dict); - dict->release(); + startMatching(moduleName); (void) OSKext::setDeferredLoadSucceeded(); (void) OSKext::considerRebuildOfPrelinkedKernel(); @@ -547,9 +542,9 @@ IOCatalogue::moduleHasLoaded(OSString * moduleName) void IOCatalogue::moduleHasLoaded(const char * moduleName) { - OSString * name; + const OSSymbol * name; - name = OSString::withCString(moduleName); + name = OSSymbol::withCString(moduleName); moduleHasLoaded(name); name->release(); } @@ -574,7 +569,7 @@ IOCatalogue::_terminateDrivers(OSDictionary * matching) } ret = kIOReturnSuccess; - dict = 0; + dict = NULL; iter = IORegistryIterator::iterateOver(gIOServicePlane, kIORegistryIterateRecursively); if (!iter) { @@ -741,15 +736,11 @@ IOCatalogue::terminateDriversForModule( return ret; } +#if defined(__i386__) || defined(__x86_64__) bool IOCatalogue::startMatching( OSDictionary * matching ) { - OSCollectionIterator * iter; - OSDictionary * dict; OSOrderedSet * set; - OSArray * array; - const OSSymbol * key; - unsigned int idx; if (!matching) { return false; @@ -761,27 +752,71 @@ IOCatalogue::startMatching( OSDictionary * matching ) return false; } - iter = OSCollectionIterator::withCollection(personalities); - if (!iter) { - set->release(); + IORWLockRead(lock); + + personalities->iterateObjects(^bool (const OSSymbol * key, OSObject * value) { + OSArray * array; + OSDictionary * dict; + unsigned int idx; + + array = (OSArray *) value; + for (idx = 0; (dict = (OSDictionary *) array->getObject(idx)); idx++) { + /* This comparison must be done with only the keys in the + * "matching" dict to enable general matching. + */ + if (dict->isEqualTo(matching, matching)) { + set->setObject(dict); + } + } + return false; + }); + + // Start device matching. + if (set->getCount() > 0) { + IOService::catalogNewDrivers(set); + generation++; + } + + IORWLockUnlock(lock); + + set->release(); + + return true; +} +#endif /* defined(__i386__) || defined(__x86_64__) */ + +bool +IOCatalogue::startMatching( const OSSymbol * moduleName ) +{ + OSOrderedSet * set; + + if (!moduleName) { + return false; + } + + set = OSOrderedSet::withCapacity(10, IOServiceOrdering, + (void *)gIOProbeScoreKey); + if (!set) { return false; } IORWLockRead(lock); - while ((key = (const OSSymbol *) iter->getNextObject())) { - array = (OSArray *) personalities->getObject(key); - if (array) { - for (idx = 0; (dict = (OSDictionary *) array->getObject(idx)); idx++) { - /* This comparison must be done with only the keys in the - * "matching" dict to enable general matching. - */ - if (dict->isEqualTo(matching, matching)) { - set->setObject(dict); - } + personalities->iterateObjects(^bool (const OSSymbol * key, OSObject * value) { + OSArray * array; + OSDictionary * dict; + OSObject * obj; + unsigned int idx; + + array = (OSArray *) value; + for (idx = 0; (dict = (OSDictionary *) array->getObject(idx)); idx++) { + obj = dict->getObject(gIOModuleIdentifierKernelKey); + if (obj && moduleName->isEqualTo(obj)) { + set->setObject(dict); } } - } + return false; + }); // Start device matching. if (set->getCount() > 0) { @@ -792,7 +827,6 @@ IOCatalogue::startMatching( OSDictionary * matching ) IORWLockUnlock(lock); set->release(); - iter->release(); return true; } diff --git a/iokit/Kernel/IOCommandGate.cpp b/iokit/Kernel/IOCommandGate.cpp index 265b2f786..1ae9bcf08 100644 --- a/iokit/Kernel/IOCommandGate.cpp +++ b/iokit/Kernel/IOCommandGate.cpp @@ -86,7 +86,7 @@ IOCommandGate::commandGate(OSObject *inOwner, Action inAction) if (me && !me->init(inOwner, inAction)) { me->release(); - return 0; + return NULL; } return me; @@ -117,7 +117,7 @@ IOCommandGate::enable() IOCommandGate::free() { if (workLoop) { - setWorkLoop(0); + setWorkLoop(NULL); } super::free(); } @@ -146,7 +146,7 @@ IOCommandGate::setWorkLoop(IOWorkLoop *inWorkLoop) *sleepersP &= ~kSleepersWaitEnabled; defer = (0 != (kSleepersActionsMask & *sleepersP)); if (!defer) { - super::setWorkLoop(0); + super::setWorkLoop(NULL); *sleepersP &= ~kSleepersRemoved; } wl->openGate(); @@ -180,9 +180,9 @@ IOCommandGateActionToBlock(OSObject *owner, } IOReturn -IOCommandGate::runActionBlock(ActionBlock action) +IOCommandGate::runActionBlock(ActionBlock _action) { - return runAction(&IOCommandGateActionToBlock, action); + return runAction(&IOCommandGateActionToBlock, _action); } IOReturn @@ -250,7 +250,7 @@ IOCommandGate::runAction(Action inAction, if (kSleepersRemoved == ((kSleepersActionsMask | kSleepersRemoved) & *sleepersP)) { // no actions outstanding *sleepersP &= ~kSleepersRemoved; - super::setWorkLoop(0); + super::setWorkLoop(NULL); } wl->openGate(); diff --git a/iokit/Kernel/IOCommandPool.cpp b/iokit/Kernel/IOCommandPool.cpp index d61f37fc3..550d9aac5 100644 --- a/iokit/Kernel/IOCommandPool.cpp +++ b/iokit/Kernel/IOCommandPool.cpp @@ -62,7 +62,7 @@ withWorkLoop(IOWorkLoop *inWorkLoop) if (me && !me->initWithWorkLoop(inWorkLoop)) { me->release(); - return 0; + return NULL; } return me; @@ -106,7 +106,7 @@ commandPool(IOService * inOwner, IOWorkLoop *inWorkLoop, UInt32 inSize) if (me && !me->init(inOwner, inWorkLoop, inSize)) { me->release(); - return 0; + return NULL; } return me; @@ -135,7 +135,7 @@ IOCommandPool::free(void) } fSerializer->release(); - fSerializer = 0; + fSerializer = NULL; } // Tell our superclass to cleanup too @@ -153,7 +153,7 @@ IOCommand * IOCommandPool::getCommand(bool blockForCommand) { IOReturn result = kIOReturnSuccess; - IOCommand *command = 0; + IOCommand *command = NULL; IOCommandGate::Action func = OSMemberFunctionCast( IOCommandGate::Action, this, &IOCommandPool::gatedGetCommand); @@ -162,7 +162,7 @@ IOCommandPool::getCommand(bool blockForCommand) if (kIOReturnSuccess == result) { return command; } else { - return 0; + return NULL; } } diff --git a/iokit/Kernel/IOCommandQueue.cpp b/iokit/Kernel/IOCommandQueue.cpp index 08ba843c9..2623d063d 100644 --- a/iokit/Kernel/IOCommandQueue.cpp +++ b/iokit/Kernel/IOCommandQueue.cpp @@ -120,7 +120,7 @@ IOCommandQueue::commandQueue(OSObject *inOwner, if (me && !me->init(inOwner, inAction, inSize)) { me->free(); - return 0; + return NULL; } return me; diff --git a/iokit/Kernel/IOConditionLock.cpp b/iokit/Kernel/IOConditionLock.cpp index 655a150a4..c24d8ce35 100644 --- a/iokit/Kernel/IOConditionLock.cpp +++ b/iokit/Kernel/IOConditionLock.cpp @@ -67,7 +67,7 @@ IOConditionLock::withCondition(int condition, bool intr) if (me && !me->initWithCondition(condition, intr)) { me->release(); - return 0; + return NULL; } return me; diff --git a/iokit/Kernel/IODMACommand.cpp b/iokit/Kernel/IODMACommand.cpp index b5383d481..24047b542 100644 --- a/iokit/Kernel/IODMACommand.cpp +++ b/iokit/Kernel/IODMACommand.cpp @@ -109,7 +109,7 @@ IODMACommand::withRefCon(void * refCon) if (me && !me->initWithRefCon(refCon)) { me->release(); - return 0; + return NULL; } return me; @@ -127,7 +127,7 @@ IODMACommand::withSpecification(SegmentFunction outSegFunc, if (me && !me->initWithSpecification(outSegFunc, segmentOptions, mappingOptions, mapper, refCon)) { me->release(); - return 0; + return NULL; } return me; @@ -150,7 +150,7 @@ IODMACommand::withSpecification(SegmentFunction outSegFunc, mappingOptions, maxTransferSize, alignment, mapper, refCon)) { me->release(); - return 0; + return NULL; } return me; @@ -244,7 +244,7 @@ IODMACommand::setSpecification(SegmentFunction outSegFunc, uint32_t mappingOptions, IOMapper * mapper) { - IOService * device = 0; + IOService * device = NULL; UInt8 numAddressBits; UInt64 maxSegmentSize; UInt64 maxTransferSize; @@ -284,7 +284,7 @@ IODMACommand::setSpecification(SegmentFunction outSegFunc, } if (mapper && !OSDynamicCast(IOMapper, mapper)) { device = mapper; - mapper = 0; + mapper = NULL; } if (!mapper && (kUnmapped != MAPTYPE(mappingOptions))) { IOMapper::checkForSystemMapper(); @@ -434,7 +434,7 @@ IODMACommand::clearMemoryDescriptor(bool autoComplete) fMemory->dmaCommandOperation(kIOMDSetDMAInactive, this, 0); } fMemory->release(); - fMemory = 0; + fMemory = NULL; } return kIOReturnSuccess; @@ -603,10 +603,10 @@ IODMACommand::walkAll(UInt8 op) state->fDoubleBuffer = false; state->fPrepared = false; state->fCopyNext = NULL; - state->fCopyPageAlloc = 0; + state->fCopyPageAlloc = NULL; state->fCopyPageCount = 0; state->fNextRemapPage = NULL; - state->fCopyMD = 0; + state->fCopyMD = NULL; if (!(kWalkDoubleBuffer & op)) { offset = 0; @@ -703,12 +703,12 @@ IODMACommand::walkAll(UInt8 op) if (kWalkComplete & op) { if (state->fCopyPageAlloc) { vm_page_free_list(state->fCopyPageAlloc, FALSE); - state->fCopyPageAlloc = 0; + state->fCopyPageAlloc = NULL; state->fCopyPageCount = 0; } if (state->fCopyMD) { state->fCopyMD->release(); - state->fCopyMD = 0; + state->fCopyMD = NULL; } state->fPrepared = false; @@ -833,10 +833,10 @@ IODMACommand::prepare(UInt64 offset, UInt64 length, bool flushCache, bool synchr state->fDoubleBuffer = false; state->fPrepared = false; state->fCopyNext = NULL; - state->fCopyPageAlloc = 0; + state->fCopyPageAlloc = NULL; state->fCopyPageCount = 0; state->fNextRemapPage = NULL; - state->fCopyMD = 0; + state->fCopyMD = NULL; state->fLocalMapperAlloc = 0; state->fLocalMapperAllocValid = false; state->fLocalMapperAllocLength = 0; diff --git a/iokit/Kernel/IODMAController.cpp b/iokit/Kernel/IODMAController.cpp index 4ce1edea3..8650d6189 100644 --- a/iokit/Kernel/IODMAController.cpp +++ b/iokit/Kernel/IODMAController.cpp @@ -52,7 +52,7 @@ IODMAController::getController(IOService *provider, UInt32 dmaIndex) // Find the name of the parent dma controller dmaParentData = OSDynamicCast(OSData, provider->getProperty("dma-parent")); - if (dmaParentData == 0) { + if (dmaParentData == NULL) { return NULL; } @@ -64,7 +64,7 @@ IODMAController::getController(IOService *provider, UInt32 dmaIndex) } dmaParentName = createControllerName(*(UInt32 *)dmaParentData->getBytesNoCopy(dmaIndex * sizeof(UInt32), sizeof(UInt32))); } - if (dmaParentName == 0) { + if (dmaParentName == NULL) { return NULL; } diff --git a/iokit/Kernel/IODMAEventSource.cpp b/iokit/Kernel/IODMAEventSource.cpp index af624aeaa..dd4d186f0 100644 --- a/iokit/Kernel/IODMAEventSource.cpp +++ b/iokit/Kernel/IODMAEventSource.cpp @@ -48,7 +48,7 @@ IODMAEventSource::init(OSObject *inOwner, return false; } - if (inProvider == 0) { + if (inProvider == NULL) { return false; } @@ -58,7 +58,7 @@ IODMAEventSource::init(OSObject *inOwner, dmaNotificationAction = inNotification; dmaController = IODMAController::getController(dmaProvider, inDMAIndex); - if (dmaController == 0) { + if (dmaController == NULL) { return false; } dmaController->retain(); @@ -94,7 +94,7 @@ IODMAEventSource::dmaEventSource(OSObject *inOwner, if (dmaES && !dmaES->init(inOwner, inProvider, inCompletion, inNotification, inDMAIndex)) { dmaES->release(); - return 0; + return NULL; } return dmaES; @@ -105,7 +105,7 @@ IODMAEventSource::startDMACommand(IODMACommand *dmaCommand, IODirection directio { IOReturn result; - if ((dmaController == 0) || (dmaIndex == 0xFFFFFFFF)) { + if ((dmaController == NULL) || (dmaIndex == 0xFFFFFFFF)) { return kIOReturnError; } @@ -113,7 +113,7 @@ IODMAEventSource::startDMACommand(IODMACommand *dmaCommand, IODirection directio return kIOReturnBusy; } - if (dmaCompletionAction == 0) { + if (dmaCompletionAction == NULL) { dmaSynchBusy = true; } @@ -134,7 +134,7 @@ IODMAEventSource::startDMACommand(IODMACommand *dmaCommand, IODirection directio IOReturn IODMAEventSource::stopDMACommand(bool flush, uint64_t timeout) { - if ((dmaController == 0) || (dmaIndex == 0xFFFFFFFF)) { + if ((dmaController == NULL) || (dmaIndex == 0xFFFFFFFF)) { return kIOReturnError; } @@ -145,7 +145,7 @@ IODMAEventSource::stopDMACommand(bool flush, uint64_t timeout) IOReturn IODMAEventSource::queryDMACommand(IODMACommand **dmaCommand, IOByteCount *transferCount, bool waitForIdle) { - if ((dmaController == 0) || (dmaIndex == 0xFFFFFFFF)) { + if ((dmaController == NULL) || (dmaIndex == 0xFFFFFFFF)) { return kIOReturnError; } @@ -156,7 +156,7 @@ IODMAEventSource::queryDMACommand(IODMACommand **dmaCommand, IOByteCount *transf IOByteCount IODMAEventSource::getFIFODepth(IODirection direction) { - if ((dmaController == 0) || (dmaIndex == 0xFFFFFFFF)) { + if ((dmaController == NULL) || (dmaIndex == 0xFFFFFFFF)) { return 0; } @@ -167,7 +167,7 @@ IODMAEventSource::getFIFODepth(IODirection direction) IOReturn IODMAEventSource::setFIFODepth(IOByteCount depth) { - if ((dmaController == 0) || (dmaIndex == 0xFFFFFFFF)) { + if ((dmaController == NULL) || (dmaIndex == 0xFFFFFFFF)) { return kIOReturnError; } @@ -178,7 +178,7 @@ IODMAEventSource::setFIFODepth(IOByteCount depth) IOByteCount IODMAEventSource::validFIFODepth(IOByteCount depth, IODirection direction) { - if ((dmaController == 0) || (dmaIndex == 0xFFFFFFFF)) { + if ((dmaController == NULL) || (dmaIndex == 0xFFFFFFFF)) { return kIOReturnError; } @@ -189,7 +189,7 @@ IODMAEventSource::validFIFODepth(IOByteCount depth, IODirection direction) IOReturn IODMAEventSource::setFrameSize(UInt8 byteCount) { - if ((dmaController == 0) || (dmaIndex == 0xFFFFFFFF)) { + if ((dmaController == NULL) || (dmaIndex == 0xFFFFFFFF)) { return kIOReturnError; } @@ -224,7 +224,7 @@ IODMAEventSource::checkForWork(void) void IODMAEventSource::completeDMACommand(IODMACommand *dmaCommand) { - if (dmaCompletionAction != 0) { + if (dmaCompletionAction != NULL) { IOSimpleLockLock(dmaCommandsCompletedLock); queue_enter(&dmaCommandsCompleted, dmaCommand, IODMACommand *, fCommandChain); IOSimpleLockUnlock(dmaCommandsCompletedLock); @@ -243,7 +243,7 @@ IODMAEventSource::notifyDMACommand(IODMACommand *dmaCommand, IOReturn status, IO dmaCommand->reserved->fActualByteCount = actualByteCount; dmaCommand->reserved->fTimeStamp = timeStamp; - if (dmaNotificationAction != 0) { + if (dmaNotificationAction != NULL) { (*dmaNotificationAction)(owner, this, dmaCommand, status, actualByteCount, timeStamp); } } diff --git a/iokit/Kernel/IODataQueue.cpp b/iokit/Kernel/IODataQueue.cpp index 93bd0c268..dde414b6a 100644 --- a/iokit/Kernel/IODataQueue.cpp +++ b/iokit/Kernel/IODataQueue.cpp @@ -61,7 +61,7 @@ IODataQueue *IODataQueue::withCapacity(UInt32 size) if (dataQueue) { if (!dataQueue->initWithCapacity(size)) { dataQueue->release(); - dataQueue = 0; + dataQueue = NULL; } } @@ -76,7 +76,7 @@ IODataQueue::withEntries(UInt32 numEntries, UInt32 entrySize) if (dataQueue) { if (!dataQueue->initWithEntries(numEntries, entrySize)) { dataQueue->release(); - dataQueue = 0; + dataQueue = NULL; } } @@ -111,7 +111,7 @@ IODataQueue::initWithCapacity(UInt32 size) ((IODataQueueInternal *)notifyMsg)->queueSize = size; dataQueue = (IODataQueueMemory *)IOMallocAligned(allocSize, PAGE_SIZE); - if (dataQueue == 0) { + if (dataQueue == NULL) { return false; } bzero(dataQueue, allocSize); @@ -190,7 +190,7 @@ IODataQueue::enqueue(void * data, UInt32 dataSize) entry = (IODataQueueEntry *)((UInt8 *)dataQueue->queue + tail); entry->size = dataSize; - memcpy(&entry->data, data, dataSize); + __nochk_memcpy(&entry->data, data, dataSize); // The tail can be out of bound when the size of the new entry // exactly matches the available space at the end of the queue. @@ -211,7 +211,7 @@ IODataQueue::enqueue(void * data, UInt32 dataSize) ((IODataQueueEntry *)((UInt8 *)dataQueue->queue + tail))->size = dataSize; } - memcpy(&dataQueue->queue->data, data, dataSize); + __nochk_memcpy(&dataQueue->queue->data, data, dataSize); newTail = entrySize; } else { return false; // queue is full @@ -224,7 +224,7 @@ IODataQueue::enqueue(void * data, UInt32 dataSize) entry = (IODataQueueEntry *)((UInt8 *)dataQueue->queue + tail); entry->size = dataSize; - memcpy(&entry->data, data, dataSize); + __nochk_memcpy(&entry->data, data, dataSize); newTail = tail + entrySize; } else { return false; // queue is full @@ -291,11 +291,11 @@ IODataQueue::sendDataAvailableNotification() IOMemoryDescriptor * IODataQueue::getMemoryDescriptor() { - IOMemoryDescriptor *descriptor = 0; + IOMemoryDescriptor *descriptor = NULL; UInt32 queueSize; queueSize = ((IODataQueueInternal *) notifyMsg)->queueSize; - if (dataQueue != 0) { + if (dataQueue != NULL) { descriptor = IOMemoryDescriptor::withAddress(dataQueue, queueSize + DATA_QUEUE_MEMORY_HEADER_SIZE, kIODirectionOutIn); } diff --git a/iokit/Kernel/IODeviceMemory.cpp b/iokit/Kernel/IODeviceMemory.cpp index ee178c845..ecc175f08 100644 --- a/iokit/Kernel/IODeviceMemory.cpp +++ b/iokit/Kernel/IODeviceMemory.cpp @@ -62,8 +62,8 @@ IODeviceMemory::arrayFromList( IOItemCount i; array = OSArray::withCapacity( count ); - if (0 == array) { - return 0; + if (NULL == array) { + return NULL; } for (i = 0; i < count; i++) { @@ -74,7 +74,7 @@ IODeviceMemory::arrayFromList( range->release(); } else { array->release(); - array = 0; + array = NULL; break; } } diff --git a/iokit/Kernel/IODeviceTreeSupport.cpp b/iokit/Kernel/IODeviceTreeSupport.cpp index 6d49c4fee..f219e2f1d 100644 --- a/iokit/Kernel/IODeviceTreeSupport.cpp +++ b/iokit/Kernel/IODeviceTreeSupport.cpp @@ -154,7 +154,7 @@ IODeviceTreeAlloc( void * dtTop ) && gIODTInterruptCellKey ); - freeDT = (kSuccess == DTLookupEntry( 0, "/chosen/memory-map", &mapEntry )) + freeDT = (kSuccess == DTLookupEntry( NULL, "/chosen/memory-map", &mapEntry )) && (kSuccess == DTGetProperty( mapEntry, "DeviceTree", (void **) &dtMap, &propSize )) && ((2 * sizeof(uint32_t)) == propSize); @@ -202,7 +202,7 @@ IODeviceTreeAlloc( void * dtTop ) if (freeDT) { // free original device tree - DTInit(0); + DTInit(NULL); IODTFreeLoaderInfo( "DeviceTree", (void *)dtMap[0], (int) round_page(dtMap[1])); } @@ -221,6 +221,14 @@ IODeviceTreeAlloc( void * dtTop ) if (!intMap && child->getProperty( gIODTInterruptParentKey)) { intMap = true; } +#if !(defined(RC_HIDE_N144) || defined(RC_HIDE_N146)) + if (!strcmp("sep", child->getName()) + || !strcmp("aop", child->getName()) + || !strcmp("disp0", child->getName())) { + uint32_t aotFlags = 1; + child->setProperty("aot-power", &aotFlags, sizeof(aotFlags)); + } +#endif /* !(defined(RC_HIDE_N144) || defined(RC_HIDE_N146)) */ } regIter->release(); } @@ -268,12 +276,12 @@ IODTGetLoaderInfo( const char *key, void **infoAddr, int *infoSize ) int ret = -1; chosen = IORegistryEntry::fromPath( "/chosen/memory-map", gIODTPlane ); - if (chosen == 0) { + if (chosen == NULL) { return -1; } propObj = OSDynamicCast( OSData, chosen->getProperty(key)); - if (propObj == 0) { + if (propObj == NULL) { goto cleanup; } @@ -283,7 +291,7 @@ IODTGetLoaderInfo( const char *key, void **infoAddr, int *infoSize ) } propPtr = (dtptr_t *)propObj->getBytesNoCopy(); - if (propPtr == 0) { + if (propPtr == NULL) { goto cleanup; } @@ -308,9 +316,9 @@ IODTFreeLoaderInfo( const char *key, void *infoAddr, int infoSize ) range[1] = (vm_offset_t)infoSize; FreePhysicalMemory( range ); - if (key != 0) { + if (key != NULL) { chosen = IORegistryEntry::fromPath( "/chosen/memory-map", gIODTPlane ); - if (chosen != 0) { + if (chosen != NULL) { chosen->removeProperty(key); chosen->release(); } @@ -325,12 +333,12 @@ IODTGetDefault(const char *key, void *infoAddr, unsigned int infoSize ) unsigned int defaultSize; defaults = IORegistryEntry::fromPath( "/defaults", gIODTPlane ); - if (defaults == 0) { + if (defaults == NULL) { return -1; } defaultObj = OSDynamicCast( OSData, defaults->getProperty(key)); - if (defaultObj == 0) { + if (defaultObj == NULL) { return -1; } @@ -375,7 +383,7 @@ MakeReferenceTable( DTEntry dtEntry, bool copy ) if (regEntry && (false == regEntry->init())) { regEntry->release(); - regEntry = 0; + regEntry = NULL; } if (regEntry && @@ -451,7 +459,7 @@ static IORegistryEntry * FindPHandle( UInt32 phandle ) { OSData *data; - IORegistryEntry *regEntry = 0; + IORegistryEntry *regEntry = NULL; int i; for (i = 0; (data = (OSData *)gIODTPHandles->getObject( i )); i++) { @@ -501,10 +509,10 @@ IODTFindInterruptParent( IORegistryEntry * regEntry, IOItemCount index ) } phandle = ((UInt32 *) data->getBytesNoCopy())[index]; parent = FindPHandle( phandle ); - } else if (0 == regEntry->getProperty( "interrupt-controller")) { + } else if (NULL == regEntry->getProperty( "interrupt-controller")) { parent = regEntry->getParentEntry( gIODTPlane); } else { - parent = 0; + parent = NULL; } return parent; @@ -525,7 +533,7 @@ IODTInterruptControllerName( IORegistryEntry * regEntry ) snprintf(buf, sizeof(buf), "IOInterruptController%08X", (uint32_t)phandle); sym = OSSymbol::withCString( buf ); } else { - sym = 0; + sym = NULL; } return sym; @@ -547,9 +555,10 @@ IODTGetICellCounts( IORegistryEntry * regEntry, static UInt32 IODTMapOneInterrupt( IORegistryEntry * regEntry, UInt32 * intSpec, UInt32 index, - OSData ** spec, const OSSymbol ** controller ) + LIBKERN_RETURNS_RETAINED OSData ** spec, + LIBKERN_RETURNS_RETAINED const OSSymbol ** controller ) { - IORegistryEntry *parent = 0; + IORegistryEntry *parent = NULL; OSData *data; UInt32 *addrCmp; UInt32 *maskCmp; @@ -561,7 +570,7 @@ IODTMapOneInterrupt( IORegistryEntry * regEntry, UInt32 * intSpec, UInt32 index, parent = IODTFindInterruptParent( regEntry, index ); IODTGetICellCounts( parent, &icells, &acells ); - addrCmp = 0; + addrCmp = NULL; if (acells) { data = OSDynamicCast( OSData, regEntry->getProperty( "reg" )); if (data && (data->getLength() >= (acells * sizeof(UInt32)))) { @@ -588,7 +597,7 @@ IODTMapOneInterrupt( IORegistryEntry * regEntry, UInt32 * intSpec, UInt32 index, if (parent && (data = OSDynamicCast( OSData, regEntry->getProperty( "interrupt-controller")))) { // found a controller - don't want to follow cascaded controllers - parent = 0; + parent = NULL; *spec = OSData::withBytesNoCopy((void *) intSpec, icells * sizeof(UInt32)); *controller = IODTInterruptControllerName( regEntry ); @@ -602,7 +611,7 @@ IODTMapOneInterrupt( IORegistryEntry * regEntry, UInt32 * intSpec, UInt32 index, if (data && (data->getLength() >= ((acells + icells) * sizeof(UInt32)))) { maskCmp = (UInt32 *) data->getBytesNoCopy(); } else { - maskCmp = 0; + maskCmp = NULL; } #if IODTSUPPORTDEBUG @@ -647,7 +656,7 @@ IODTMapOneInterrupt( IORegistryEntry * regEntry, UInt32 * intSpec, UInt32 index, } map += acells + icells; - if (0 == (parent = FindPHandle( *(map++)))) { + if (NULL == (parent = FindPHandle( *(map++)))) { unexpected(break); } @@ -661,7 +670,7 @@ IODTMapOneInterrupt( IORegistryEntry * regEntry, UInt32 * intSpec, UInt32 index, } } while (!cmp && (map < endMap)); if (!cmp) { - parent = 0; + parent = NULL; } } @@ -729,14 +738,14 @@ IODTMapInterruptsSharing( IORegistryEntry * regEntry, OSDictionary * allInts ) OSObject * oneMap; OSArray * mapped; OSArray * controllerInts; - const OSSymbol * controller = 0; + const OSSymbol * controller = NULL; OSArray * controllers; UInt32 skip = 1; bool ok, nw; - nw = (0 == (local = OSDynamicCast( OSData, + nw = (NULL == (local = OSDynamicCast( OSData, regEntry->getProperty( gIODTAAPLInterruptsKey)))); - if (nw && (0 == (local = OSDynamicCast( OSData, + if (nw && (NULL == (local = OSDynamicCast( OSData, regEntry->getProperty( "interrupts"))))) { return true; // nothing to see here } @@ -834,7 +843,7 @@ IODTMapInterruptsSharing( IORegistryEntry * regEntry, OSDictionary * allInts ) bool IODTMapInterrupts( IORegistryEntry * regEntry ) { - return IODTMapInterruptsSharing( regEntry, 0 ); + return IODTMapInterruptsSharing( regEntry, NULL ); } /* @@ -843,7 +852,7 @@ IODTMapInterrupts( IORegistryEntry * regEntry ) static bool CompareKey( OSString * key, const IORegistryEntry * table, const OSSymbol * propName, - OSString ** matchingName ) + LIBKERN_RETURNS_RETAINED OSString ** matchingName ) { OSObject *prop; OSData *data; @@ -855,9 +864,9 @@ CompareKey( OSString * key, const char *lastName; bool wild; bool matched; - const char *result = 0; + const char *result = NULL; - if (0 == (prop = table->copyProperty( propName ))) { + if (NULL == (prop = table->copyProperty( propName ))) { return 0; } @@ -868,7 +877,7 @@ CompareKey( OSString * key, names = string->getCStringNoCopy(); lastName = names + string->getLength() + 1; } else { - names = 0; + names = NULL; } if (names) { @@ -901,7 +910,7 @@ CompareKey( OSString * key, prop->release(); } - return result != 0; + return result != NULL; } @@ -926,7 +935,7 @@ IODTMatchNubWithKeys( IORegistryEntry * regEntry, OSObject *obj; bool result = false; - obj = OSUnserialize( keys, 0 ); + obj = OSUnserialize( keys, NULL ); if (obj) { result = regEntry->compareNames( obj ); @@ -945,7 +954,7 @@ OSCollectionIterator * IODTFindMatchingEntries( IORegistryEntry * from, IOOptionBits options, const char * keys ) { - OSSet *result = 0; + OSSet *result = NULL; IORegistryEntry *next; IORegistryIterator *iter; OSCollectionIterator *cIter; @@ -1160,7 +1169,7 @@ IODTResolveAddressCell( IORegistryEntry * startEntry, do{ prop = OSDynamicCast( OSData, regEntry->getProperty( gIODTRangeKey )); - if (0 == prop) { + if (NULL == prop) { /* end of the road */ *phys = CellsValue( childAddressCells, cell ); *phys += offset; @@ -1291,11 +1300,11 @@ IODTResolveAddressing( IORegistryEntry * regEntry, OSArray *array; IODeviceMemory *range; - array = 0; + array = NULL; do{ parentEntry = regEntry->copyParentEntry( gIODTPlane ); addressProperty = (OSData *) regEntry->getProperty( addressPropertyName ); - if ((0 == addressProperty) || (0 == parentEntry)) { + if ((NULL == addressProperty) || (NULL == parentEntry)) { break; } @@ -1309,18 +1318,18 @@ IODTResolveAddressing( IORegistryEntry * regEntry, num = addressProperty->getLength() / (4 * cells); array = OSArray::withCapacity( 1 ); - if (0 == array) { + if (NULL == array) { break; } for (i = 0; i < num; i++) { if (IODTResolveAddressCell( parentEntry, reg, &phys, &len )) { - range = 0; + range = NULL; if (parent) { range = IODeviceMemory::withSubRange( parent, - phys - parent->getPhysicalSegment(0, 0, kIOMemoryMapperNone), len ); + phys - parent->getPhysicalSegment(0, NULL, kIOMemoryMapperNone), len ); } - if (0 == range) { + if (NULL == range) { range = IODeviceMemory::withRange( phys, len ); } if (range) { @@ -1344,7 +1353,7 @@ IODTFindSlotName( IORegistryEntry * regEntry, UInt32 deviceNumber ) { IORegistryEntry *parent; OSData *data; - OSData *ret = 0; + OSData *ret = NULL; UInt32 *bits; UInt32 i; size_t nlen; diff --git a/iokit/Kernel/IOEventSource.cpp b/iokit/Kernel/IOEventSource.cpp index 33306ae6c..3415fd34a 100644 --- a/iokit/Kernel/IOEventSource.cpp +++ b/iokit/Kernel/IOEventSource.cpp @@ -293,5 +293,5 @@ IOEventSource::getWorkLoop() const bool IOEventSource::onThread() const { - return (workLoop != 0) && workLoop->onThread(); + return (workLoop != NULL) && workLoop->onThread(); } diff --git a/iokit/Kernel/IOFilterInterruptEventSource.cpp b/iokit/Kernel/IOFilterInterruptEventSource.cpp index e3b9803cf..5e3371a10 100644 --- a/iokit/Kernel/IOFilterInterruptEventSource.cpp +++ b/iokit/Kernel/IOFilterInterruptEventSource.cpp @@ -84,7 +84,7 @@ IOFilterInterruptEventSource::interruptEventSource(OSObject *inOwner, IOService *inProvider, int inIntIndex) { - return 0; + return NULL; } bool @@ -122,7 +122,7 @@ IOFilterInterruptEventSource if (me && !me->init(inOwner, inAction, inFilterAction, inProvider, inIntIndex)) { me->release(); - return 0; + return NULL; } return me; @@ -141,14 +141,15 @@ IOFilterInterruptEventSource FilterBlock filter = Block_copy(inFilterAction); if (!filter) { - return 0; + OSSafeReleaseNULL(me); + return NULL; } if (me && !me->init(inOwner, (Action) NULL, (Filter) filter, inProvider, inIntIndex)) { me->release(); Block_release(filter); - return 0; + return NULL; } me->flags |= kFilterBlock; me->setActionBlock((IOEventSource::ActionBlock) inAction); @@ -220,9 +221,13 @@ IOFilterInterruptEventSource::normalInterruptOccurred } if (IOInterruptEventSource::reserved->statistics) { - if (IA_GET_STATISTIC_ENABLED(kInterruptAccountingFirstLevelTimeIndex)) { + if (IA_GET_STATISTIC_ENABLED(kInterruptAccountingFirstLevelTimeIndex) + || IOInterruptEventSource::reserved->statistics->enablePrimaryTimestamp) { startTime = mach_absolute_time(); } + if (IOInterruptEventSource::reserved->statistics->enablePrimaryTimestamp) { + IOInterruptEventSource::reserved->statistics->primaryTimestamp = startTime; + } } // Call the filter. @@ -269,9 +274,13 @@ IOFilterInterruptEventSource::disableInterruptOccurred } if (IOInterruptEventSource::reserved->statistics) { - if (IA_GET_STATISTIC_ENABLED(kInterruptAccountingFirstLevelTimeIndex)) { + if (IA_GET_STATISTIC_ENABLED(kInterruptAccountingFirstLevelTimeIndex) + || IOInterruptEventSource::reserved->statistics->enablePrimaryTimestamp) { startTime = mach_absolute_time(); } + if (IOInterruptEventSource::reserved->statistics->enablePrimaryTimestamp) { + IOInterruptEventSource::reserved->statistics->primaryTimestamp = startTime; + } } // Call the filter. diff --git a/iokit/Kernel/IOHibernateIO.cpp b/iokit/Kernel/IOHibernateIO.cpp index be2483dbf..3b696609a 100644 --- a/iokit/Kernel/IOHibernateIO.cpp +++ b/iokit/Kernel/IOHibernateIO.cpp @@ -494,11 +494,13 @@ IOHibernateSystemSleep(void) &vars->page_list_wired, &vars->page_list_pal); if (KERN_SUCCESS != err) { + HIBLOG("%s err, hibernate_alloc_page_lists return 0x%x\n", __FUNCTION__, err); break; } err = hibernate_pin_swap(TRUE); if (KERN_SUCCESS != err) { + HIBLOG("%s error, hibernate_pin_swap return 0x%x\n", __FUNCTION__, err); break; } swapPinned = true; @@ -702,7 +704,7 @@ IOHibernateSystemSleep(void) } // set BootNext if (!gIOHibernateBoot0082Data) { - OSData * fileData = 0; + OSData * fileData = NULL; data = OSDynamicCast(OSData, gIOChosenEntry->getProperty("boot-device-path")); if (data && data->getLength() >= 4) { fileData = OSDynamicCast(OSData, gIOChosenEntry->getProperty("boot-file-path")); @@ -1018,7 +1020,7 @@ IOHibernateSystemHasSlept(void) { IOReturn ret = kIOReturnSuccess; IOHibernateVars * vars = &gIOHibernateVars; - OSObject * obj = 0; + OSObject * obj = NULL; OSData * data; IOLockLock(gFSLock); @@ -1041,7 +1043,7 @@ IOHibernateSystemHasSlept(void) vars->consoleMapping = NULL; if (vars->previewBuffer && (kIOReturnSuccess != vars->previewBuffer->prepare())) { vars->previewBuffer->release(); - vars->previewBuffer = 0; + vars->previewBuffer = NULL; } if ((kIOHibernateOptionProgress & gIOHibernateCurrentHeader->options) @@ -1148,7 +1150,7 @@ IOHibernateDone(IOHibernateVars * vars) if (vars->previewBuffer) { vars->previewBuffer->release(); - vars->previewBuffer = 0; + vars->previewBuffer = NULL; } if (kIOHibernateStateWakingFromHibernate == gIOHibernateState) { @@ -1300,7 +1302,7 @@ IOReturn IOHibernateSystemPostWake(bool now) { gIOHibernateCurrentHeader->signature = kIOHibernateHeaderInvalidSignature; - IOSetBootImageNVRAM(0); + IOSetBootImageNVRAM(NULL); IOLockLock(gFSLock); if (kFSTrimDelay == gFSState) { @@ -1486,7 +1488,7 @@ hibernate_write_image(void) uint32_t zvPageCount; IOPolledFileCryptVars _cryptvars; - IOPolledFileCryptVars * cryptvars = 0; + IOPolledFileCryptVars * cryptvars = NULL; wiredPagesEncrypted = 0; dirtyPagesEncrypted = 0; @@ -1658,7 +1660,7 @@ hibernate_write_image(void) } } err = IOHibernatePolledFileWrite(vars->fileVars, - (uint8_t *) 0, + (uint8_t *) NULL, &gIOHibernateRestoreStackEnd[0] - &gIOHibernateRestoreStack[0], cryptvars); if (kIOReturnSuccess != err) { @@ -1967,7 +1969,7 @@ hibernate_write_image(void) if (kWiredEncrypt != pageType) { // end of image1/2 - fill to next block - err = IOHibernatePolledFileWrite(vars->fileVars, 0, 0, cryptvars); + err = IOHibernatePolledFileWrite(vars->fileVars, NULL, 0, cryptvars); if (kIOReturnSuccess != err) { break; } @@ -2029,7 +2031,7 @@ hibernate_write_image(void) if (kIOReturnSuccess != err) { break; } - err = IOHibernatePolledFileWrite(vars->fileVars, 0, 0, cryptvars); + err = IOHibernatePolledFileWrite(vars->fileVars, NULL, 0, cryptvars); }while (false); clock_get_uptime(&endTime); @@ -2112,7 +2114,7 @@ hibernate_machine_init(void) uint64_t compBytes; uint32_t lastProgressStamp = 0; uint32_t progressStamp; - IOPolledFileCryptVars * cryptvars = 0; + IOPolledFileCryptVars * cryptvars = NULL; IOHibernateVars * vars = &gIOHibernateVars; bzero(gIOHibernateStats, sizeof(hibernate_statistics_t)); @@ -2174,7 +2176,7 @@ hibernate_machine_init(void) hibernate_page_list_discard(vars->page_list); } - cryptvars = (kIOHibernateModeEncrypt & gIOHibernateMode) ? &gIOHibernateCryptWakeContext : 0; + cryptvars = (kIOHibernateModeEncrypt & gIOHibernateMode) ? &gIOHibernateCryptWakeContext : NULL; if (gIOHibernateCurrentHeader->handoffPageCount > gIOHibernateHandoffPageCount) { panic("handoff overflow"); @@ -2300,7 +2302,7 @@ hibernate_machine_init(void) if (kIOReturnSuccess != err) { panic("IOPolledFilePollersSetEncryptionKey(0x%x)", err); } - cryptvars = 0; + cryptvars = NULL; } IOPolledFileSeek(vars->fileVars, gIOHibernateCurrentHeader->image1Size); @@ -2314,7 +2316,7 @@ hibernate_machine_init(void) vars->fileVars->cryptBytes = 0; AbsoluteTime_to_scalar(&vars->fileVars->cryptTime) = 0; - err = IOPolledFileRead(vars->fileVars, 0, 0, cryptvars); + err = IOPolledFileRead(vars->fileVars, NULL, 0, cryptvars); vars->fileVars->bufferOffset = vars->fileVars->bufferLimit; // -- diff --git a/iokit/Kernel/IOHistogramReporter.cpp b/iokit/Kernel/IOHistogramReporter.cpp index efbd6de16..b5e9176b4 100644 --- a/iokit/Kernel/IOHistogramReporter.cpp +++ b/iokit/Kernel/IOHistogramReporter.cpp @@ -65,7 +65,7 @@ IOHistogramReporter::with(IOService *reportingService, OSSafeReleaseNULL(reporter); OSSafeReleaseNULL(tmpChannelName); - return 0; + return NULL; } diff --git a/iokit/Kernel/IOInterleavedMemoryDescriptor.cpp b/iokit/Kernel/IOInterleavedMemoryDescriptor.cpp index d4ad771ff..7a9f15093 100644 --- a/iokit/Kernel/IOInterleavedMemoryDescriptor.cpp +++ b/iokit/Kernel/IOInterleavedMemoryDescriptor.cpp @@ -48,7 +48,7 @@ IOInterleavedMemoryDescriptor * IOInterleavedMemoryDescriptor::withCapacity( /* capacity */ capacity, /* direction */ direction )) { me->release(); - me = 0; + me = NULL; } return me; @@ -79,14 +79,14 @@ IOInterleavedMemoryDescriptor::initWithCapacity( _direction = (IODirection) (_flags & kIOMemoryDirectionMask); #endif /* !__LP64__ */ _length = 0; - _mappings = 0; + _mappings = NULL; _tag = 0; _descriptorCount = 0; _descriptors = IONew(IOMemoryDescriptor *, capacity); _descriptorOffsets = IONew(IOByteCount, capacity); _descriptorLengths = IONew(IOByteCount, capacity); - if ((_descriptors == 0) || (_descriptorOffsets == 0) || (_descriptorLengths == 0)) { + if ((_descriptors == NULL) || (_descriptorOffsets == NULL) || (_descriptorLengths == NULL)) { return false; } @@ -106,7 +106,7 @@ IOInterleavedMemoryDescriptor::clearMemoryDescriptors( IODirection direction ) } _descriptors[index]->release(); - _descriptors[index] = 0; + _descriptors[index] = NULL; _descriptorOffsets[index] = 0; _descriptorLengths[index] = 0; @@ -121,7 +121,7 @@ IOInterleavedMemoryDescriptor::clearMemoryDescriptors( IODirection direction ) _descriptorCount = 0; _length = 0; - _mappings = 0; + _mappings = NULL; _tag = 0; }; @@ -166,15 +166,15 @@ IOInterleavedMemoryDescriptor::free() _descriptors[index]->release(); } - if (_descriptors != 0) { + if (_descriptors != NULL) { IODelete(_descriptors, IOMemoryDescriptor *, _descriptorCapacity); } - if (_descriptorOffsets != 0) { + if (_descriptorOffsets != NULL) { IODelete(_descriptorOffsets, IOMemoryDescriptor *, _descriptorCapacity); } - if (_descriptorLengths != 0) { + if (_descriptorLengths != NULL) { IODelete(_descriptorLengths, IOMemoryDescriptor *, _descriptorCapacity); } } diff --git a/iokit/Kernel/IOInterruptController.cpp b/iokit/Kernel/IOInterruptController.cpp index f84357e38..18441e5ce 100644 --- a/iokit/Kernel/IOInterruptController.cpp +++ b/iokit/Kernel/IOInterruptController.cpp @@ -104,10 +104,10 @@ IOInterruptController::registerInterrupt(IOService *nub, int source, // register as a shared interrupt. if (wasAlreadyRegisterd || shouldBeShared) { // If this vector is not already shared, break it out. - if (vector->sharedController == 0) { + if (vector->sharedController == NULL) { // Make the IOShareInterruptController instance vector->sharedController = new IOSharedInterruptController; - if (vector->sharedController == 0) { + if (vector->sharedController == NULL) { IOLockUnlock(vector->interruptLock); return kIOReturnNoMemory; } @@ -133,7 +133,7 @@ IOInterruptController::registerInterrupt(IOService *nub, int source, enableInterrupt(originalNub, originalSource); } vector->sharedController->release(); - vector->sharedController = 0; + vector->sharedController = NULL; IOLockUnlock(vector->interruptLock); return error; } @@ -163,7 +163,7 @@ IOInterruptController::registerInterrupt(IOService *nub, int source, enableInterrupt(originalNub, originalSource); vector->sharedController->release(); - vector->sharedController = 0; + vector->sharedController = NULL; IOLockUnlock(vector->interruptLock); return error; } @@ -174,7 +174,7 @@ IOInterruptController::registerInterrupt(IOService *nub, int source, vector->nub = vector->sharedController; vector->source = 0; vector->target = vector->sharedController; - vector->refCon = 0; + vector->refCon = NULL; // If the interrupt was already registered, // save the driver's interrupt enablement state. @@ -259,11 +259,11 @@ IOInterruptController::unregisterInterrupt(IOService *nub, int source) vector->interruptDisabledSoft = 0; vector->interruptDisabledHard = 0; vector->interruptRegistered = 0; - vector->nub = 0; + vector->nub = NULL; vector->source = 0; - vector->handler = 0; - vector->target = 0; - vector->refCon = 0; + vector->handler = NULL; + vector->target = NULL; + vector->refCon = NULL; IOLockUnlock(vector->interruptLock); return kIOReturnSuccess; @@ -278,7 +278,7 @@ IOInterruptController::getInterruptType(IOService *nub, int source, IOInterruptVector *vector; OSData *vectorData; - if (interruptType == 0) { + if (interruptType == NULL) { return kIOReturnBadArgument; } @@ -372,7 +372,7 @@ IOInterruptController::causeInterrupt(IOService *nub, int source) IOInterruptAction IOInterruptController::getInterruptHandlerAddress(void) { - return 0; + return NULL; } IOReturn @@ -507,7 +507,7 @@ IOSharedInterruptController::initInterruptController(IOInterruptController *pare // Allocate the IOInterruptSource so this can act like a nub. _interruptSources = (IOInterruptSource *)IOMalloc(sizeof(IOInterruptSource)); - if (_interruptSources == 0) { + if (_interruptSources == NULL) { return kIOReturnNoMemory; } _numInterruptSources = 1; @@ -537,7 +537,7 @@ IOSharedInterruptController::initInterruptController(IOInterruptController *pare // Allocate the lock for the controller. controllerLock = IOSimpleLockAlloc(); - if (controllerLock == 0) { + if (controllerLock == NULL) { return kIOReturnNoResources; } @@ -571,7 +571,7 @@ IOSharedInterruptController::registerInterrupt(IOService *nub, { IOInterruptSource *interruptSources; IOInterruptVectorNumber vectorNumber; - IOInterruptVector *vector = 0; + IOInterruptVector *vector = NULL; OSData *vectorData; IOInterruptState interruptState; @@ -607,7 +607,7 @@ IOSharedInterruptController::registerInterrupt(IOService *nub, // Create the vectorData for the IOInterruptSource. vectorData = OSData::withBytes(&vectorNumber, sizeof(vectorNumber)); - if (vectorData == 0) { + if (vectorData == NULL) { IOLockUnlock(vector->interruptLock); return kIOReturnNoMemory; } @@ -667,11 +667,11 @@ IOSharedInterruptController::unregisterInterrupt(IOService *nub, vector->interruptDisabledSoft = 0; vector->interruptDisabledHard = 0; vector->interruptRegistered = 0; - vector->nub = 0; + vector->nub = NULL; vector->source = 0; - vector->handler = 0; - vector->target = 0; - vector->refCon = 0; + vector->handler = NULL; + vector->target = NULL; + vector->refCon = NULL; interruptState = IOSimpleLockLockDisableInterrupt(controllerLock); vectorsRegistered--; diff --git a/iokit/Kernel/IOInterruptEventSource.cpp b/iokit/Kernel/IOInterruptEventSource.cpp index 19d5d597d..5decae5c6 100644 --- a/iokit/Kernel/IOInterruptEventSource.cpp +++ b/iokit/Kernel/IOInterruptEventSource.cpp @@ -218,7 +218,7 @@ IOInterruptEventSource::interruptEventSource(OSObject *inOwner, if (me && !me->init(inOwner, inAction, inProvider, inIntIndex)) { me->release(); - return 0; + return NULL; } return me; @@ -456,6 +456,9 @@ IOInterruptEventSource::normalInterruptOccurred } if (reserved->statistics) { + if (reserved->statistics->enablePrimaryTimestamp) { + reserved->statistics->primaryTimestamp = mach_absolute_time(); + } if (IA_GET_STATISTIC_ENABLED(kInterruptAccountingFirstLevelCountIndex)) { IA_ADD_VALUE(&reserved->statistics->interruptStatistics[kInterruptAccountingFirstLevelCountIndex], 1); } @@ -484,6 +487,9 @@ IOInterruptEventSource::disableInterruptOccurred } if (reserved->statistics) { + if (reserved->statistics->enablePrimaryTimestamp) { + reserved->statistics->primaryTimestamp = mach_absolute_time(); + } if (IA_GET_STATISTIC_ENABLED(kInterruptAccountingFirstLevelCountIndex)) { IA_ADD_VALUE(&reserved->statistics->interruptStatistics[kInterruptAccountingFirstLevelCountIndex], 1); } @@ -498,12 +504,12 @@ IOInterruptEventSource::disableInterruptOccurred void IOInterruptEventSource::interruptOccurred -(void *refcon, IOService *prov, int source) +(void *_refcon, IOService *prov, int source) { if (autoDisable && prov) { - disableInterruptOccurred(refcon, prov, source); + disableInterruptOccurred(_refcon, prov, source); } else { - normalInterruptOccurred(refcon, prov, source); + normalInterruptOccurred(_refcon, prov, source); } } @@ -513,3 +519,20 @@ IOInterruptEventSource::warmCPU { return ml_interrupt_prewarm(abstime); } + +void +IOInterruptEventSource::enablePrimaryInterruptTimestamp(bool enable) +{ + if (reserved->statistics) { + reserved->statistics->enablePrimaryTimestamp = enable; + } +} + +uint64_t +IOInterruptEventSource::getPimaryInterruptTimestamp() +{ + if (reserved->statistics && reserved->statistics->enablePrimaryTimestamp) { + return reserved->statistics->primaryTimestamp; + } + return -1ULL; +} diff --git a/iokit/Kernel/IOKitDebug.cpp b/iokit/Kernel/IOKitDebug.cpp index bfbac5edf..69e82fbec 100644 --- a/iokit/Kernel/IOKitDebug.cpp +++ b/iokit/Kernel/IOKitDebug.cpp @@ -174,7 +174,7 @@ OSObject * IOKitDiagnostics::diagnostics( void ) diags = new IOKitDiagnostics; if (diags && !diags->init()) { diags->release(); - diags = 0; + diags = NULL; } return diags; @@ -304,7 +304,7 @@ IOTRecursiveLockLock(IOTRecursiveLock * lock) lock->count++; } else { lck_mtx_lock(lock->mutex); - assert(lock->thread == 0); + assert(lock->thread == NULL); assert(lock->count == 0); lock->thread = current_thread(); lock->count = 1; @@ -316,7 +316,7 @@ IOTRecursiveLockUnlock(IOTRecursiveLock * lock) { assert(lock->thread == current_thread()); if (0 == (--lock->count)) { - lock->thread = 0; + lock->thread = NULL; lck_mtx_unlock(lock->mutex); } } @@ -488,13 +488,13 @@ IOTrackingAddUser(IOTrackingQueue * queue, IOTrackingUser * mem, vm_size_t size) assert(!mem->link.next); - num = backtrace(&mem->bt[0], kIOTrackingCallSiteBTs); + num = backtrace(&mem->bt[0], kIOTrackingCallSiteBTs, NULL); num = 0; if ((kernel_task != current_task()) && (self = proc_self())) { - bool user_64; + bool user_64 = false; mem->btPID = proc_pid(self); (void)backtrace_user(&mem->btUser[0], kIOTrackingCallSiteBTs - 1, &num, - &user_64); + &user_64, NULL); mem->user32 = !user_64; proc_rele(self); } @@ -545,7 +545,7 @@ IOTrackingAdd(IOTrackingQueue * queue, IOTracking * mem, size_t size, bool addre assert(!mem->link.next); - num = backtrace(&bt[0], kIOTrackingCallSiteBTs + 1); + num = backtrace(&bt[0], kIOTrackingCallSiteBTs + 1, NULL); if (!num) { return; } @@ -1083,9 +1083,9 @@ IOTrackingDebug(uint32_t selector, uint32_t options, uint64_t value, OSData * data; if (result) { - *result = 0; + *result = NULL; } - data = 0; + data = NULL; ret = kIOReturnNotReady; #if IOTRACKING @@ -1426,7 +1426,7 @@ IOUserClient * IOKitDiagnosticsClient::withTask(task_t owningTask) inst = new IOKitDiagnosticsClient; if (inst && !inst->init()) { inst->release(); - inst = 0; + inst = NULL; } return inst; @@ -1464,7 +1464,7 @@ IOKitDiagnosticsClient::externalMethod(uint32_t selector, IOExternalMethodArgume return kIOReturnBadArgument; } - names = 0; + names = NULL; namesLen = args->structureInputSize - sizeof(IOKitDiagnosticsParameters); if (namesLen) { names = (typeof(names))(params + 1); diff --git a/iokit/Kernel/IOKitKernelInternal.h b/iokit/Kernel/IOKitKernelInternal.h index 436b19793..f1a0d882d 100644 --- a/iokit/Kernel/IOKitKernelInternal.h +++ b/iokit/Kernel/IOKitKernelInternal.h @@ -167,6 +167,7 @@ struct IOMemoryDescriptorReserved { uint64_t kernReserved[4]; vm_tag_t kernelTag; vm_tag_t userTag; + task_t creator; }; struct iopa_t { @@ -206,6 +207,7 @@ extern bool gCPUsRunning; extern OSSet * gIORemoveOnReadProperties; extern "C" void IOKitInitializeTime( void ); +extern void IOMachPortInitialize(void); extern "C" OSString * IOCopyLogNameForPID(int pid); diff --git a/iokit/Kernel/IOLib.cpp b/iokit/Kernel/IOLib.cpp index 0dedff70f..16459d585 100644 --- a/iokit/Kernel/IOLib.cpp +++ b/iokit/Kernel/IOLib.cpp @@ -256,6 +256,19 @@ IOExitThread(void) /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ +void * +IOMallocZero(vm_size_t size) +{ + void * result; + result = IOMalloc(size); + if (result) { + bzero(result, size); + } + return result; +} + +/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ + #if IOTRACKING struct IOLibMallocHeader { IOTrackingAddress tracking; @@ -390,7 +403,7 @@ IOMallocAligned(vm_size_t size, vm_size_t alignment) IOLibPageMallocHeader * hdr; if (size == 0) { - return 0; + return NULL; } alignment = (1UL << log2up(alignment)); @@ -672,7 +685,7 @@ IOMallocContiguous(vm_size_t size, vm_size_t alignment, mach_vm_address_t address = 0; if (size == 0) { - return 0; + return NULL; } if (alignment == 0) { alignment = 1; @@ -852,10 +865,10 @@ IOMallocPageablePages(vm_size_t size, vm_size_t alignment, vm_tag_t tag) struct IOMallocPageableRef ref; if (alignment > page_size) { - return 0; + return NULL; } if (size > kIOPageableMaxMapSize) { - return 0; + return NULL; } ref.size = size; @@ -871,7 +884,7 @@ IOMallocPageablePages(vm_size_t size, vm_size_t alignment, vm_tag_t tag) vm_map_t IOPageableMapForAddress( uintptr_t address ) { - vm_map_t map = 0; + vm_map_t map = NULL; UInt32 index; for (index = 0; index < gIOKitPageableSpace.count; index++) { @@ -974,7 +987,7 @@ iopa_allocinpage(iopa_page_t * pa, uint32_t count, uint64_t align) pa->avail &= ~((-1ULL << (64 - count)) >> n); if (!pa->avail && pa->link.next) { remque(&pa->link); - pa->link.next = 0; + pa->link.next = NULL; } return n * gIOPageAllocChunkBytes + trunc_page((uintptr_t) pa); } @@ -1068,10 +1081,10 @@ iopa_free(iopa_t * a, uintptr_t addr, vm_size_t bytes) } pa->avail |= ((-1ULL << (64 - count)) >> chunk); if (pa->avail != -2ULL) { - pa = 0; + pa = NULL; } else { remque(&pa->link); - pa->link.next = 0; + pa->link.next = NULL; pa->signature = 0; a->pagecount--; // page to free @@ -1239,38 +1252,28 @@ void IOKitKernelLogBuffer(const char * title, const void * buffer, size_t size, void (*output)(const char *format, ...)) { + size_t idx, linestart; + enum { bytelen = (sizeof("0xZZ, ") - 1) }; + char hex[(bytelen * 16) + 1]; uint8_t c, chars[17]; - size_t idx; - output("%s(0x%x):\n", title, size); + output("%s(0x%lx):\n", title, size); + output(" 0 1 2 3 4 5 6 7 8 9 A B C D E F\n"); if (size > 4096) { size = 4096; } - chars[16] = idx = 0; - while (true) { - if (!(idx & 15)) { - if (idx) { - output(" |%s|\n", chars); - } - if (idx >= size) { - break; - } - output("%04x: ", idx); - } else if (!(idx & 7)) { - output(" "); - } - - c = ((char *)buffer)[idx]; - output("%02x ", c); + chars[16] = 0; + for (idx = 0, linestart = 0; idx < size;) { + c = ((char *)buffer)[idx]; + snprintf(&hex[bytelen * (idx & 15)], bytelen + 1, "0x%02x, ", c); chars[idx & 15] = ((c >= 0x20) && (c <= 0x7f)) ? c : ' '; - idx++; - if ((idx == size) && (idx & 15)) { - chars[idx & 15] = 0; - while (idx & 15) { - idx++; - output(" "); + if ((idx == size) || !(idx & 15)) { + if (idx & 15) { + chars[idx & 15] = 0; } + output("/* %04lx: */ %-96s /* |%-16s| */\n", linestart, hex, chars); + linestart += 16; } } } diff --git a/iokit/Kernel/IOLocks.cpp b/iokit/Kernel/IOLocks.cpp index 8871fb684..3b8a95359 100644 --- a/iokit/Kernel/IOLocks.cpp +++ b/iokit/Kernel/IOLocks.cpp @@ -144,18 +144,18 @@ IORecursiveLockAllocWithLockGroup( lck_grp_t * lockGroup ) { _IORecursiveLock * lock; - if (lockGroup == 0) { - return 0; + if (lockGroup == NULL) { + return NULL; } lock = IONew( _IORecursiveLock, 1 ); if (!lock) { - return 0; + return NULL; } lck_mtx_init( &lock->mutex, lockGroup, LCK_ATTR_NULL ); lock->group = lockGroup; - lock->thread = 0; + lock->thread = NULL; lock->count = 0; return (IORecursiveLock *) lock; @@ -192,7 +192,7 @@ IORecursiveLockLock( IORecursiveLock * _lock) lock->count++; } else { lck_mtx_lock( &lock->mutex ); - assert( lock->thread == 0 ); + assert( lock->thread == NULL ); assert( lock->count == 0 ); lock->thread = IOThreadSelf(); lock->count = 1; @@ -209,7 +209,7 @@ IORecursiveLockTryLock( IORecursiveLock * _lock) return true; } else { if (lck_mtx_try_lock( &lock->mutex )) { - assert( lock->thread == 0 ); + assert( lock->thread == NULL ); assert( lock->count == 0 ); lock->thread = IOThreadSelf(); lock->count = 1; @@ -227,7 +227,7 @@ IORecursiveLockUnlock( IORecursiveLock * _lock) assert( lock->thread == IOThreadSelf()); if (0 == (--lock->count)) { - lock->thread = 0; + lock->thread = NULL; lck_mtx_unlock( &lock->mutex ); } } @@ -250,12 +250,12 @@ IORecursiveLockSleep(IORecursiveLock *_lock, void *event, UInt32 interType) assert(lock->thread == IOThreadSelf()); lock->count = 0; - lock->thread = 0; + lock->thread = NULL; res = lck_mtx_sleep(&lock->mutex, LCK_SLEEP_PROMOTED_PRI, (event_t) event, (wait_interrupt_t) interType); // Must re-establish the recursive lock no matter why we woke up // otherwise we would potentially leave the return path corrupted. - assert(lock->thread == 0); + assert(lock->thread == NULL); assert(lock->count == 0); lock->thread = IOThreadSelf(); lock->count = count; @@ -273,13 +273,13 @@ IORecursiveLockSleepDeadline( IORecursiveLock * _lock, void *event, assert(lock->thread == IOThreadSelf()); lock->count = 0; - lock->thread = 0; + lock->thread = NULL; res = lck_mtx_sleep_deadline(&lock->mutex, LCK_SLEEP_PROMOTED_PRI, (event_t) event, (wait_interrupt_t) interType, __OSAbsoluteTime(deadline)); // Must re-establish the recursive lock no matter why we woke up // otherwise we would potentially leave the return path corrupted. - assert(lock->thread == 0); + assert(lock->thread == NULL); assert(lock->count == 0); lock->thread = IOThreadSelf(); lock->count = count; @@ -331,6 +331,12 @@ IOSimpleLockInit( IOSimpleLock * lock) lck_spin_init( lock, IOLockGroup, LCK_ATTR_NULL); } +void +IOSimpleLockDestroy( IOSimpleLock * lock ) +{ + lck_spin_destroy(lock, IOLockGroup); +} + void IOSimpleLockFree( IOSimpleLock * lock ) { diff --git a/iokit/Kernel/IOMemoryCursor.cpp b/iokit/Kernel/IOMemoryCursor.cpp index 1e4e94840..82b139267 100644 --- a/iokit/Kernel/IOMemoryCursor.cpp +++ b/iokit/Kernel/IOMemoryCursor.cpp @@ -52,7 +52,7 @@ IOMemoryCursor::withSpecification(SegmentFunction inSegFunc, inMaxTransferSize, inAlignment)) { me->release(); - return 0; + return NULL; } return me; @@ -215,7 +215,7 @@ IONaturalMemoryCursor::withSpecification(IOPhysicalLength inMaxSegmentSize, inMaxTransferSize, inAlignment)) { me->release(); - return 0; + return NULL; } return me; @@ -266,7 +266,7 @@ IOBigMemoryCursor::withSpecification(IOPhysicalLength inMaxSegmentSize, inMaxTransferSize, inAlignment)) { me->release(); - return 0; + return NULL; } return me; @@ -317,7 +317,7 @@ IOLittleMemoryCursor::withSpecification(IOPhysicalLength inMaxSegmentSize, inMaxTransferSize, inAlignment)) { me->release(); - return 0; + return NULL; } return me; diff --git a/iokit/Kernel/IOMemoryDescriptor.cpp b/iokit/Kernel/IOMemoryDescriptor.cpp index 3ff1f79ca..d73a4343b 100644 --- a/iokit/Kernel/IOMemoryDescriptor.cpp +++ b/iokit/Kernel/IOMemoryDescriptor.cpp @@ -63,6 +63,7 @@ __BEGIN_DECLS #include #include +#include #include #include @@ -151,7 +152,7 @@ struct ioGMDData { __attribute__((aligned(sizeof(upl_t)))) #endif ; - ioPLBlock fBlocks[1]; + //ioPLBlock fBlocks[1]; }; #define getDataP(osd) ((ioGMDData *) (osd)->getBytesNoCopy()) @@ -301,80 +302,55 @@ purgeableStateBits(int * state) return err; } +typedef struct { + unsigned int wimg; + unsigned int object_type; +} iokit_memtype_entry; + +static const iokit_memtype_entry iomd_mem_types[] = { + [kIODefaultCache] = {VM_WIMG_DEFAULT, MAP_MEM_NOOP}, + [kIOInhibitCache] = {VM_WIMG_IO, MAP_MEM_IO}, + [kIOWriteThruCache] = {VM_WIMG_WTHRU, MAP_MEM_WTHRU}, + [kIOWriteCombineCache] = {VM_WIMG_WCOMB, MAP_MEM_WCOMB}, + [kIOCopybackCache] = {VM_WIMG_COPYBACK, MAP_MEM_COPYBACK}, + [kIOCopybackInnerCache] = {VM_WIMG_INNERWBACK, MAP_MEM_INNERWBACK}, + [kIOPostedWrite] = {VM_WIMG_POSTED, MAP_MEM_POSTED}, + [kIORealTimeCache] = {VM_WIMG_RT, MAP_MEM_RT}, + [kIOPostedReordered] = {VM_WIMG_POSTED_REORDERED, MAP_MEM_POSTED_REORDERED}, + [kIOPostedCombinedReordered] = {VM_WIMG_POSTED_COMBINED_REORDERED, MAP_MEM_POSTED_COMBINED_REORDERED}, +}; static vm_prot_t vmProtForCacheMode(IOOptionBits cacheMode) { + assert(cacheMode < (sizeof(iomd_mem_types) / sizeof(iomd_mem_types[0]))); vm_prot_t prot = 0; - switch (cacheMode) { - case kIOInhibitCache: - SET_MAP_MEM(MAP_MEM_IO, prot); - break; - - case kIOWriteThruCache: - SET_MAP_MEM(MAP_MEM_WTHRU, prot); - break; - - case kIOWriteCombineCache: - SET_MAP_MEM(MAP_MEM_WCOMB, prot); - break; - - case kIOCopybackCache: - SET_MAP_MEM(MAP_MEM_COPYBACK, prot); - break; - - case kIOCopybackInnerCache: - SET_MAP_MEM(MAP_MEM_INNERWBACK, prot); - break; - - case kIOPostedWrite: - SET_MAP_MEM(MAP_MEM_POSTED, prot); - break; - - case kIODefaultCache: - default: - SET_MAP_MEM(MAP_MEM_NOOP, prot); - break; - } - + SET_MAP_MEM(iomd_mem_types[cacheMode].object_type, prot); return prot; } static unsigned int pagerFlagsForCacheMode(IOOptionBits cacheMode) { - unsigned int pagerFlags = 0; - switch (cacheMode) { - case kIOInhibitCache: - pagerFlags = DEVICE_PAGER_CACHE_INHIB | DEVICE_PAGER_COHERENT | DEVICE_PAGER_GUARDED; - break; - - case kIOWriteThruCache: - pagerFlags = DEVICE_PAGER_WRITE_THROUGH | DEVICE_PAGER_COHERENT | DEVICE_PAGER_GUARDED; - break; - - case kIOWriteCombineCache: - pagerFlags = DEVICE_PAGER_CACHE_INHIB | DEVICE_PAGER_COHERENT; - break; - - case kIOCopybackCache: - pagerFlags = DEVICE_PAGER_COHERENT; - break; - - case kIOCopybackInnerCache: - pagerFlags = DEVICE_PAGER_COHERENT; - break; - - case kIOPostedWrite: - pagerFlags = DEVICE_PAGER_CACHE_INHIB | DEVICE_PAGER_COHERENT | DEVICE_PAGER_GUARDED | DEVICE_PAGER_EARLY_ACK; - break; + assert(cacheMode < (sizeof(iomd_mem_types) / sizeof(iomd_mem_types[0]))); + if (cacheMode == kIODefaultCache) { + return -1U; + } + return iomd_mem_types[cacheMode].wimg; +} - case kIODefaultCache: - default: - pagerFlags = -1U; - break; +static IOOptionBits +cacheModeForPagerFlags(unsigned int pagerFlags) +{ + pagerFlags &= VM_WIMG_MASK; + IOOptionBits cacheMode = kIODefaultCache; + for (IOOptionBits i = 0; i < (sizeof(iomd_mem_types) / sizeof(iomd_mem_types[0])); ++i) { + if (iomd_mem_types[i].wimg == pagerFlags) { + cacheMode = i; + break; + } } - return pagerFlags; + return (cacheMode == kIODefaultCache) ? kIOCopybackCache : cacheMode; } /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ @@ -431,7 +407,7 @@ IOGeneralMemoryDescriptor::memoryReferenceAlloc(uint32_t capacity, IOMemoryRefer OSIncrementAtomic(&gIOMemoryReferenceCount); } if (!ref) { - return 0; + return NULL; } ref->capacity = capacity; return ref; @@ -445,7 +421,7 @@ IOGeneralMemoryDescriptor::memoryReferenceFree(IOMemoryReference * ref) if (ref->mapRef) { memoryReferenceFree(ref->mapRef); - ref->mapRef = 0; + ref->mapRef = NULL; } entries = ref->entries + ref->count; @@ -496,6 +472,7 @@ IOGeneralMemoryDescriptor::memoryReferenceCreate( IOOptionBits cacheMode; unsigned int pagerFlags; vm_tag_t tag; + vm_named_entry_kernel_flags_t vmne_kflags; ref = memoryReferenceAlloc(kCapacity, NULL); if (!ref) { @@ -503,6 +480,7 @@ IOGeneralMemoryDescriptor::memoryReferenceCreate( } tag = getVMTag(kernel_map); + vmne_kflags = VM_NAMED_ENTRY_KERNEL_FLAGS_NONE; entries = &ref->entries[0]; count = 0; err = KERN_SUCCESS; @@ -517,21 +495,7 @@ IOGeneralMemoryDescriptor::memoryReferenceCreate( // default cache mode for physical if (kIODefaultCache == ((_flags & kIOMemoryBufferCacheMask) >> kIOMemoryBufferCacheShift)) { - IOOptionBits mode; - pagerFlags = IODefaultCacheBits(nextAddr); - if (DEVICE_PAGER_CACHE_INHIB & pagerFlags) { - if (DEVICE_PAGER_EARLY_ACK & pagerFlags) { - mode = kIOPostedWrite; - } else if (DEVICE_PAGER_GUARDED & pagerFlags) { - mode = kIOInhibitCache; - } else { - mode = kIOWriteCombineCache; - } - } else if (DEVICE_PAGER_WRITE_THROUGH & pagerFlags) { - mode = kIOWriteThruCache; - } else { - mode = kIOCopybackCache; - } + IOOptionBits mode = cacheModeForPagerFlags(IODefaultCacheBits(nextAddr)); _flags |= (mode << kIOMemoryBufferCacheShift); } } @@ -554,6 +518,10 @@ IOGeneralMemoryDescriptor::memoryReferenceCreate( prot |= MAP_MEM_VM_COPY; } + if (kIOMemoryUseReserve & _flags) { + prot |= MAP_MEM_GRAB_SECLUDED; + } + if ((kIOMemoryReferenceReuse & options) && _memRef) { cloneEntries = &_memRef->entries[0]; prot |= MAP_MEM_NAMED_REUSE; @@ -563,14 +531,36 @@ IOGeneralMemoryDescriptor::memoryReferenceCreate( // virtual ranges if (kIOMemoryBufferPageable & _flags) { + int ledger_tag, ledger_no_footprint; + // IOBufferMemoryDescriptor alloc - set flags for entry + object create prot |= MAP_MEM_NAMED_CREATE; + + // default accounting settings: + // + "none" ledger tag + // + include in footprint + // can be changed later with ::setOwnership() + ledger_tag = VM_LEDGER_TAG_NONE; + ledger_no_footprint = 0; + if (kIOMemoryBufferPurgeable & _flags) { prot |= (MAP_MEM_PURGABLE | MAP_MEM_PURGABLE_KERNEL_ONLY); if (VM_KERN_MEMORY_SKYWALK == tag) { - prot |= MAP_MEM_LEDGER_TAG_NETWORK; + // Skywalk purgeable memory accounting: + // + "network" ledger tag + // + not included in footprint + ledger_tag = VM_LEDGER_TAG_NETWORK; + ledger_no_footprint = 1; + } else { + // regular purgeable memory accounting: + // + no ledger tag + // + included in footprint + ledger_tag = VM_LEDGER_TAG_NONE; + ledger_no_footprint = 0; } } + vmne_kflags.vmnekf_ledger_tag = ledger_tag; + vmne_kflags.vmnekf_ledger_no_footprint = ledger_no_footprint; if (kIOMemoryUseReserve & _flags) { prot |= MAP_MEM_GRAB_SECLUDED; } @@ -614,7 +604,7 @@ IOGeneralMemoryDescriptor::memoryReferenceCreate( } err = mach_make_memory_entry_internal(map, - &actualSize, entryAddr, prot, &entry, cloneEntry); + &actualSize, entryAddr, prot, vmne_kflags, &entry, cloneEntry); if (KERN_SUCCESS != err) { break; @@ -649,7 +639,7 @@ IOGeneralMemoryDescriptor::memoryReferenceCreate( } else { // _task == 0, physical or kIOMemoryTypeUPL memory_object_t pager; - vm_size_t size = ptoa_32(_pages); + vm_size_t size = ptoa_64(_pages); if (!getKernelReserved()) { panic("getKernelReserved"); @@ -666,7 +656,7 @@ IOGeneralMemoryDescriptor::memoryReferenceCreate( pagerFlags |= DEVICE_PAGER_CONTIGUOUS; } - pager = device_pager_setup((memory_object_t) 0, (uintptr_t) reserved, + pager = device_pager_setup((memory_object_t) NULL, (uintptr_t) reserved, size, pagerFlags); assert(pager); if (!pager) { @@ -1115,6 +1105,30 @@ IOGeneralMemoryDescriptor::memoryReferenceSetPurgeable( return err; } +IOReturn +IOGeneralMemoryDescriptor::memoryReferenceSetOwnership( + IOMemoryReference * ref, + task_t newOwner, + int newLedgerTag, + IOOptionBits newLedgerOptions) +{ + IOReturn err, totalErr; + IOMemoryEntry * entries; + + totalErr = kIOReturnSuccess; + entries = ref->entries + ref->count; + while (entries > &ref->entries[0]) { + entries--; + + err = mach_memory_entry_ownership(entries->entry, newOwner, newLedgerTag, newLedgerOptions); + if (KERN_SUCCESS != err) { + totalErr = err; + } + } + + return totalErr; +} + /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ IOMemoryDescriptor * @@ -1141,7 +1155,7 @@ IOMemoryDescriptor::withAddress(IOVirtualAddress address, that->release(); } - return 0; + return NULL; } #endif /* !__LP64__ */ @@ -1170,7 +1184,7 @@ IOMemoryDescriptor::withRanges( IOVirtualRange * ranges, that->release(); } - return 0; + return NULL; } #endif /* !__LP64__ */ @@ -1198,14 +1212,14 @@ IOMemoryDescriptor::withAddressRanges(IOAddressRange * ranges, options |= kIOMemoryTypePhysical64; } - if (that->initWithOptions(ranges, rangeCount, 0, task, options, /* mapper */ 0)) { + if (that->initWithOptions(ranges, rangeCount, 0, task, options, /* mapper */ NULL)) { return that; } that->release(); } - return 0; + return NULL; } @@ -1230,7 +1244,7 @@ IOMemoryDescriptor::withOptions(void * buffers, if (self && !self->initWithOptions(buffers, count, offset, task, opts, mapper)) { self->release(); - return 0; + return NULL; } return self; @@ -1262,7 +1276,7 @@ IOMemoryDescriptor::withPhysicalRanges( IOPhysicalRange * ranges, that->release(); } - return 0; + return NULL; } IOMemoryDescriptor * @@ -1285,7 +1299,7 @@ IOMemoryDescriptor::withPersistentMemoryDescriptor(IOMemoryDescriptor *originalM return IOGeneralMemoryDescriptor:: withPersistentMemoryDescriptor(origGenMD); } else { - return 0; + return NULL; } } @@ -1295,7 +1309,7 @@ IOGeneralMemoryDescriptor::withPersistentMemoryDescriptor(IOGeneralMemoryDescrip IOMemoryReference * memRef; if (kIOReturnSuccess != originalMD->memoryReferenceCreate(kIOMemoryReferenceReuse, &memRef)) { - return 0; + return NULL; } if (memRef == originalMD->_memRef) { @@ -1308,9 +1322,9 @@ IOGeneralMemoryDescriptor::withPersistentMemoryDescriptor(IOGeneralMemoryDescrip IOMDPersistentInitData initData = { originalMD, memRef }; if (self - && !self->initWithOptions(&initData, 1, 0, 0, kIOMemoryTypePersistentMD, 0)) { + && !self->initWithOptions(&initData, 1, 0, NULL, kIOMemoryTypePersistentMD, NULL)) { self->release(); - self = 0; + self = NULL; } return self; } @@ -1364,7 +1378,7 @@ IOGeneralMemoryDescriptor::initWithPhysicalRanges( mdOpts |= kIOMemoryAsReference; } - return initWithOptions(ranges, count, 0, 0, mdOpts, /* mapper */ 0); + return initWithOptions(ranges, count, 0, NULL, mdOpts, /* mapper */ NULL); } bool @@ -1394,7 +1408,7 @@ IOGeneralMemoryDescriptor::initWithRanges( mdOpts |= kIOMemoryTypePhysical; } - return initWithOptions(ranges, count, 0, task, mdOpts, /* mapper */ 0); + return initWithOptions(ranges, count, 0, task, mdOpts, /* mapper */ NULL); } #endif /* !__LP64__ */ @@ -1519,7 +1533,7 @@ IOGeneralMemoryDescriptor::initWithOptions(void * buffers, if (!(kIOMemoryRedirected & options)) { if (_memRef) { memoryReferenceRelease(_memRef); - _memRef = 0; + _memRef = NULL; } if (_mappings) { _mappings->flushCollection(); @@ -1537,7 +1551,7 @@ IOGeneralMemoryDescriptor::initWithOptions(void * buffers, options |= kIOMemoryMapperNone; } if (kIOMemoryMapperNone & options) { - mapper = 0; // No Mapper + mapper = NULL; // No Mapper } else if (mapper == kIOMapperSystem) { IOMapper::checkForSystemMapper(); gIOSystemMapper = mapper = IOMapper::gSystem; @@ -1780,7 +1794,7 @@ IOGeneralMemoryDescriptor::free() if (reserved) { LOCK; - reserved->dp.memory = 0; + reserved->dp.memory = NULL; UNLOCK; } if ((kIOMemoryTypePhysical == type) || (kIOMemoryTypePhysical64 == type)) { @@ -1816,6 +1830,7 @@ IOGeneralMemoryDescriptor::free() } if (reserved) { + cleanKernelReserved(reserved); if (reserved->dp.devicePager) { // memEntry holds a ref on the device pager which owns reserved // (IOMemoryDescriptorReserved) so no reserved access after this point @@ -2075,19 +2090,26 @@ IOGeneralMemoryDescriptor::getPreparationID( void ) } if (kIOPreparationIDUnprepared == dataP->fPreparationID) { - dataP->fPreparationID = OSIncrementAtomic64(&gIOMDPreparationID); + SInt64 newID = OSIncrementAtomic64(&gIOMDPreparationID); + OSCompareAndSwap64(kIOPreparationIDUnprepared, newID, &dataP->fPreparationID); } return dataP->fPreparationID; } +void +IOMemoryDescriptor::cleanKernelReserved( IOMemoryDescriptorReserved * reserved ) +{ + if (reserved->creator) { + task_deallocate(reserved->creator); + reserved->creator = NULL; + } +} + IOMemoryDescriptorReserved * IOMemoryDescriptor::getKernelReserved( void ) { if (!reserved) { - reserved = IONew(IOMemoryDescriptorReserved, 1); - if (reserved) { - bzero(reserved, sizeof(IOMemoryDescriptorReserved)); - } + reserved = IONewZero(IOMemoryDescriptorReserved, 1); } return reserved; } @@ -2096,7 +2118,8 @@ void IOMemoryDescriptor::setPreparationID( void ) { if (getKernelReserved() && (kIOPreparationIDUnprepared == reserved->preparationID)) { - reserved->preparationID = OSIncrementAtomic64(&gIOMDPreparationID); + SInt64 newID = OSIncrementAtomic64(&gIOMDPreparationID); + OSCompareAndSwap64(kIOPreparationIDUnprepared, newID, &reserved->preparationID); } } @@ -2111,22 +2134,22 @@ IOMemoryDescriptor::getPreparationID( void ) } void -IOMemoryDescriptor::setVMTags(vm_tag_t kernelTag, vm_tag_t userTag) +IOMemoryDescriptor::setVMTags(uint32_t kernelTag, uint32_t userTag) { - _kernelTag = kernelTag; - _userTag = userTag; + _kernelTag = (vm_tag_t) kernelTag; + _userTag = (vm_tag_t) userTag; } -vm_tag_t +uint32_t IOMemoryDescriptor::getVMTag(vm_map_t map) { if (vm_kernel_map_is_kernel(map)) { if (VM_KERN_MEMORY_NONE != _kernelTag) { - return _kernelTag; + return (uint32_t) _kernelTag; } } else { if (VM_KERN_MEMORY_NONE != _userTag) { - return _userTag; + return (uint32_t) _userTag; } } return IOMemoryTag(map); @@ -2282,9 +2305,9 @@ IOGeneralMemoryDescriptor::dmaCommandOperation(DMACommandOps op, void *vData, UI // Get the next segment struct InternalState { IOMDDMAWalkSegmentArgs fIO; - UInt fOffset2Index; + mach_vm_size_t fOffset2Index; + mach_vm_size_t fNextOffset; UInt fIndex; - UInt fNextOffset; } *isP; // Find the next segment @@ -2293,7 +2316,7 @@ IOGeneralMemoryDescriptor::dmaCommandOperation(DMACommandOps op, void *vData, UI } isP = (InternalState *) vData; - UInt offset = isP->fIO.fOffset; + mach_vm_size_t offset = isP->fIO.fOffset; uint8_t mapped = isP->fIO.fMapped; uint64_t mappedBase; @@ -2343,7 +2366,8 @@ IOGeneralMemoryDescriptor::dmaCommandOperation(DMACommandOps op, void *vData, UI } // Validate the previous offset - UInt ind, off2Ind = isP->fOffset2Index; + UInt ind; + mach_vm_size_t off2Ind = isP->fOffset2Index; if (!params && offset && (offset == isP->fNextOffset || off2Ind <= offset)) { @@ -2351,7 +2375,7 @@ IOGeneralMemoryDescriptor::dmaCommandOperation(DMACommandOps op, void *vData, UI } else { ind = off2Ind = 0; // Start from beginning } - UInt length; + mach_vm_size_t length; UInt64 address; if ((_flags & kIOMemoryTypeMask) == kIOMemoryTypePhysical) { @@ -2678,7 +2702,7 @@ IOMemoryDescriptor::getPhysicalSegment64(IOByteCount offset, IOByteCount *length IOPhysicalAddress phys32; IOByteCount length; addr64_t phys64; - IOMapper * mapper = 0; + IOMapper * mapper = NULL; phys32 = getPhysicalSegment(offset, lengthOfSegment); if (!phys32) { @@ -2736,7 +2760,7 @@ IOGeneralMemoryDescriptor::getVirtualSegment(IOByteCount offset, panic("IOGMD::getVirtualSegment deprecated"); } - return 0; + return NULL; } #pragma clang diagnostic pop #endif /* !__LP64__ */ @@ -2891,6 +2915,68 @@ IOMemoryDescriptor::setPurgeable( IOOptionBits newState, return err; } +IOReturn +IOGeneralMemoryDescriptor::setOwnership( task_t newOwner, + int newLedgerTag, + IOOptionBits newLedgerOptions ) +{ + IOReturn err = kIOReturnSuccess; + + assert(!(kIOMemoryRemote & _flags)); + if (kIOMemoryRemote & _flags) { + return kIOReturnNotAttached; + } + + if (iokit_iomd_setownership_enabled == FALSE) { + return kIOReturnUnsupported; + } + + if (_memRef) { + err = super::setOwnership(newOwner, newLedgerTag, newLedgerOptions); + } else { + err = kIOReturnUnsupported; + } + + return err; +} + +IOReturn +IOMemoryDescriptor::setOwnership( task_t newOwner, + int newLedgerTag, + IOOptionBits newLedgerOptions ) +{ + IOReturn err = kIOReturnNotReady; + + assert(!(kIOMemoryRemote & _flags)); + if (kIOMemoryRemote & _flags) { + return kIOReturnNotAttached; + } + + if (iokit_iomd_setownership_enabled == FALSE) { + return kIOReturnUnsupported; + } + + if (kIOMemoryThreadSafe & _flags) { + LOCK; + } + if (_memRef) { + err = IOGeneralMemoryDescriptor::memoryReferenceSetOwnership(_memRef, newOwner, newLedgerTag, newLedgerOptions); + } else { + IOMultiMemoryDescriptor * mmd; + IOSubMemoryDescriptor * smd; + if ((smd = OSDynamicCast(IOSubMemoryDescriptor, this))) { + err = smd->setOwnership(newOwner, newLedgerTag, newLedgerOptions); + } else if ((mmd = OSDynamicCast(IOMultiMemoryDescriptor, this))) { + err = mmd->setOwnership(newOwner, newLedgerTag, newLedgerOptions); + } + } + if (kIOMemoryThreadSafe & _flags) { + UNLOCK; + } + + return err; +} + IOReturn IOMemoryDescriptor::getPageCounts( IOByteCount * residentPageCount, IOByteCount * dirtyPageCount ) @@ -2962,9 +3048,9 @@ IOMemoryDescriptor::performOperation( IOOptionBits options, { IOByteCount remaining; unsigned int res; - void (*func)(addr64_t pa, unsigned int count) = 0; + void (*func)(addr64_t pa, unsigned int count) = NULL; #if defined(__arm__) || defined(__arm64__) - void (*func_ext)(addr64_t pa, unsigned int count, unsigned int remaining, unsigned int *result) = 0; + void (*func_ext)(addr64_t pa, unsigned int count, unsigned int remaining, unsigned int *result) = NULL; #endif assert(!(kIOMemoryRemote & _flags)); @@ -3009,7 +3095,7 @@ IOMemoryDescriptor::performOperation( IOOptionBits options, } #if defined(__arm__) || defined(__arm64__) - if ((func == 0) && (func_ext == 0)) { + if ((func == NULL) && (func_ext == NULL)) { return kIOReturnUnsupported; } #else /* defined(__arm__) || defined(__arm64__) */ @@ -3211,15 +3297,15 @@ IOGeneralMemoryDescriptor::wireVirtual(IODirection forDirection) if (uplPageSize > ((unsigned int)uplPageSize)) { return kIOReturnNoMemory; } - if (!_memoryEntries->appendBytes(0, uplPageSize)) { + if (!_memoryEntries->appendBytes(NULL, uplPageSize)) { return kIOReturnNoMemory; } - dataP = 0; + dataP = NULL; // Find the appropriate vm_map for the given task vm_map_t curMap; - if (_task == kernel_task && (kIOMemoryBufferPageable & _flags)) { - curMap = 0; + if ((NULL != _memRef) || ((_task == kernel_task && (kIOMemoryBufferPageable & _flags)))) { + curMap = NULL; } else { curMap = get_task_map(_task); } @@ -3230,7 +3316,7 @@ IOGeneralMemoryDescriptor::wireVirtual(IODirection forDirection) IOByteCount mdOffset = 0; ppnum_t highestPage = 0; - IOMemoryEntry * memRefEntry = 0; + IOMemoryEntry * memRefEntry = NULL; if (_memRef) { memRefEntry = &_memRef->entries[0]; } @@ -3356,7 +3442,7 @@ IOGeneralMemoryDescriptor::wireVirtual(IODirection forDirection) } goto abortExit; } - dataP = 0; + dataP = NULL; // Check for a multiple iopl's in one virtual range pageIndex += numPageInfo; @@ -3433,7 +3519,7 @@ IOGeneralMemoryDescriptor::initMemoryEntries(size_t size, IOMapper * mapper) return false; } - _memoryEntries->appendBytes(0, computeDataSize(0, 0)); + _memoryEntries->appendBytes(NULL, computeDataSize(0, 0)); dataP = getDataP(_memoryEntries); if (mapper == kIOMapperWaitSystem) { @@ -3523,7 +3609,7 @@ IOMemoryDescriptor::dmaUnmap( kern_allocation_name_t mapName; int16_t prior; - mapName = 0; + mapName = NULL; prior = 0; if (command) { mapName = _mapName; @@ -3754,7 +3840,7 @@ IOGeneralMemoryDescriptor::complete(IODirection forDirection) if (dataP->fCompletionError) { upl_abort(ioplList[ind].fIOPL, 0 /*!UPL_ABORT_DUMP_PAGES*/); } else { - upl_commit(ioplList[ind].fIOPL, 0, 0); + upl_commit(ioplList[ind].fIOPL, NULL, 0); } upl_deallocate(ioplList[ind].fIOPL); } @@ -3838,7 +3924,8 @@ IOGeneralMemoryDescriptor::doMap( if (!(kIOMapReadOnly & options)) { createOptions |= kIOMemoryReferenceWrite; #if DEVELOPMENT || DEBUG - if (kIODirectionOut == (kIODirectionOutIn & _flags)) { + if ((kIODirectionOut == (kIODirectionOutIn & _flags)) + && (!reserved || (reserved->creator != mapping->fAddressTask))) { OSReportWithBacktrace("warning: creating writable mapping from IOMemoryDescriptor(kIODirectionOut) - use kIOMapReadOnly or change direction"); } #endif @@ -3850,7 +3937,7 @@ IOGeneralMemoryDescriptor::doMap( } memory_object_t pager; - pager = (memory_object_t) (reserved ? reserved->dp.devicePager : 0); + pager = (memory_object_t) (reserved ? reserved->dp.devicePager : NULL); // field)) +#define iomap_offsetof(type, field) ((size_t)(&((type *)NULL)->field)) IOMemoryMap * map = (typeof(map))(((uintptr_t) tracking) - iomap_offsetof(IOMemoryMap, fTracking)); @@ -3981,7 +4068,7 @@ OSMetaClassDefineReservedUnused(IOMemoryMap, 7); IOPhysicalAddress IOMemoryMap::getPhysicalAddress() { - return getPhysicalSegment( 0, 0 ); + return getPhysicalSegment( 0, NULL ); } /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ @@ -4112,7 +4199,7 @@ IOMemoryDescriptor::populateDevicePager( // in the middle of the loop only map whole pages if (segLen >= bytes) { segLen = bytes; - } else if (segLen != trunc_page(segLen)) { + } else if (segLen != trunc_page_64(segLen)) { err = kIOReturnVMError; } if (physAddr != trunc_page_64(physAddr)) { @@ -4219,7 +4306,7 @@ IOReturn IOMemoryDescriptor::redirect( task_t safeTask, bool doRedirect ) { IOReturn err = kIOReturnSuccess; - IOMemoryMap * mapping = 0; + IOMemoryMap * mapping = NULL; OSIterator * iter; LOCK; @@ -4321,7 +4408,7 @@ IOMemoryMap::unmap( void ) LOCK; - if (fAddress && fAddressMap && (0 == fSuperMap) && fMemory + if (fAddress && fAddressMap && (NULL == fSuperMap) && fMemory && (0 == (kIOMapStatic & fOptions))) { err = fMemory->doUnmap(fAddressMap, (IOVirtualAddress) this, 0); } else { @@ -4330,7 +4417,7 @@ IOMemoryMap::unmap( void ) if (fAddressMap) { vm_map_deallocate(fAddressMap); - fAddressMap = 0; + fAddressMap = NULL; } fAddress = 0; @@ -4355,9 +4442,9 @@ IOMemoryMap::taskDied( void ) if (fAddressMap) { vm_map_deallocate(fAddressMap); - fAddressMap = 0; + fAddressMap = NULL; } - fAddressTask = 0; + fAddressTask = NULL; fAddress = 0; UNLOCK; } @@ -4481,28 +4568,28 @@ IOMemoryMap::copyCompatible( mach_vm_size_t _length = newMapping->fLength; if ((!task) || (!fAddressMap) || (fAddressMap != get_task_map(task))) { - return 0; + return NULL; } if ((fOptions ^ _options) & kIOMapReadOnly) { - return 0; + return NULL; } if ((kIOMapDefaultCache != (_options & kIOMapCacheMask)) && ((fOptions ^ _options) & kIOMapCacheMask)) { - return 0; + return NULL; } if ((0 == (_options & kIOMapAnywhere)) && (fAddress != toAddress)) { - return 0; + return NULL; } if (_offset < fOffset) { - return 0; + return NULL; } _offset -= fOffset; if ((_offset + _length) > fLength) { - return 0; + return NULL; } retain(); @@ -4569,7 +4656,7 @@ IOMemoryMap::getPhysicalSegment( IOByteCount _offset, IOPhysicalLength * _length void IOMemoryDescriptor::initialize( void ) { - if (0 == gIOMemoryLock) { + if (NULL == gIOMemoryLock) { gIOMemoryLock = IORecursiveLockAlloc(); } @@ -4584,6 +4671,7 @@ IOMemoryDescriptor::free( void ) } if (reserved) { + cleanKernelReserved(reserved); IODelete(reserved, IOMemoryDescriptorReserved, 1); reserved = NULL; } @@ -4621,7 +4709,7 @@ IOMemoryDescriptor::map( { if ((!(kIOMapAnywhere & options)) && vm_map_is_64bit(get_task_map(intoTask))) { OSReportWithBacktrace("IOMemoryDescriptor::map() in 64b task, use ::createMappingInTask()"); - return 0; + return NULL; } return createMappingInTask(intoTask, atAddress, @@ -4650,13 +4738,13 @@ IOMemoryDescriptor::createMappingInTask( && !mapping->init( intoTask, atAddress, options, offset, length )) { mapping->release(); - mapping = 0; + mapping = NULL; } if (mapping) { result = makeMapping(this, intoTask, (IOVirtualAddress) mapping, options | kIOMap64Bit, 0, 0); } else { - result = 0; + result = NULL; } #if DEBUG @@ -4685,7 +4773,7 @@ IOMemoryMap::redirect(IOMemoryDescriptor * newBackingMemory, mach_vm_size_t offset) { IOReturn err = kIOReturnSuccess; - IOMemoryDescriptor * physMem = 0; + IOMemoryDescriptor * physMem = NULL; LOCK; @@ -4704,13 +4792,13 @@ IOMemoryMap::redirect(IOMemoryDescriptor * newBackingMemory, if (KERN_SUCCESS != memory_object_iopl_request(fMemory->_memRef->entries[0].entry, 0, &size, &fRedirUPL, NULL, NULL, &flags, fMemory->getVMTag(kernel_map))) { - fRedirUPL = 0; + fRedirUPL = NULL; } if (physMem) { IOUnmapPages( fAddressMap, fAddress, fLength ); if ((false)) { - physMem->redirect(0, true); + physMem->redirect(NULL, true); } } } @@ -4727,10 +4815,10 @@ IOMemoryMap::redirect(IOMemoryDescriptor * newBackingMemory, if (fRedirUPL) { upl_commit(fRedirUPL, NULL, 0); upl_deallocate(fRedirUPL); - fRedirUPL = 0; + fRedirUPL = NULL; } if ((false) && physMem) { - physMem->redirect(0, false); + physMem->redirect(NULL, false); } } }while (false); @@ -4760,8 +4848,8 @@ IOMemoryDescriptor::makeMapping( } #endif /* !__LP64__ */ - IOMemoryDescriptor * mapDesc = 0; - __block IOMemoryMap * result = 0; + IOMemoryDescriptor * mapDesc = NULL; + __block IOMemoryMap * result = NULL; IOMemoryMap * mapping = (IOMemoryMap *) __address; mach_vm_size_t offset = mapping->fOffset + __offset; @@ -4828,7 +4916,7 @@ IOMemoryDescriptor::makeMapping( mapDesc->retain(); } IOReturn - kr = mapDesc->doMap( 0, (IOVirtualAddress *) &mapping, options, 0, 0 ); + kr = mapDesc->doMap( NULL, (IOVirtualAddress *) &mapping, options, 0, 0 ); if (kIOReturnSuccess == kr) { result = mapping; mapDesc->addMapping(result); @@ -4853,7 +4941,7 @@ IOMemoryDescriptor::addMapping( IOMemoryMap * mapping ) { if (mapping) { - if (0 == _mappings) { + if (NULL == _mappings) { _mappings = OSSet::withCapacity(1); } if (_mappings) { @@ -4924,7 +5012,7 @@ void * IOMemoryDescriptor::getVirtualSegment(IOByteCount offset, IOByteCount * lengthOfSegment) { - return 0; + return NULL; } #endif /* !__LP64__ */ @@ -4933,8 +5021,8 @@ IOMemoryDescriptor::getVirtualSegment(IOByteCount offset, bool IOGeneralMemoryDescriptor::serialize(OSSerialize * s) const { - OSSymbol const *keys[2] = {0}; - OSObject *values[2] = {0}; + OSSymbol const *keys[2] = {NULL}; + OSObject *values[2] = {NULL}; OSArray * array; vm_size_t vcopy_size; @@ -4962,7 +5050,7 @@ IOGeneralMemoryDescriptor::serialize(OSSerialize * s) const goto bail; } vcopy = (SerData *) IOMalloc(vcopy_size); - if (vcopy == 0) { + if (vcopy == NULL) { result = false; goto bail; } @@ -4993,17 +5081,17 @@ IOGeneralMemoryDescriptor::serialize(OSSerialize * s) const user_addr_t addr = vcopy[index].address; IOByteCount len = (IOByteCount) vcopy[index].length; values[0] = OSNumber::withNumber(addr, sizeof(addr) * 8); - if (values[0] == 0) { + if (values[0] == NULL) { result = false; goto bail; } values[1] = OSNumber::withNumber(len, sizeof(len) * 8); - if (values[1] == 0) { + if (values[1] == NULL) { result = false; goto bail; } OSDictionary *dict = OSDictionary::withObjects((const OSObject **)values, (const OSSymbol **)keys, 2); - if (dict == 0) { + if (dict == NULL) { result = false; goto bail; } @@ -5011,7 +5099,7 @@ IOGeneralMemoryDescriptor::serialize(OSSerialize * s) const dict->release(); values[0]->release(); values[1]->release(); - values[0] = values[1] = 0; + values[0] = values[1] = NULL; } result = array->serialize(s); @@ -5072,5 +5160,5 @@ OSMetaClassDefineReservedUnused(IOMemoryDescriptor, 15); IOPhysicalAddress IOMemoryDescriptor::getPhysicalAddress() { - return getPhysicalSegment( 0, 0 ); + return getPhysicalSegment( 0, NULL ); } diff --git a/iokit/Kernel/IOMultiMemoryDescriptor.cpp b/iokit/Kernel/IOMultiMemoryDescriptor.cpp index 3418e41a8..d70531265 100644 --- a/iokit/Kernel/IOMultiMemoryDescriptor.cpp +++ b/iokit/Kernel/IOMultiMemoryDescriptor.cpp @@ -54,7 +54,7 @@ IOMultiMemoryDescriptor * IOMultiMemoryDescriptor::withDescriptors( /* withDirection */ withDirection, /* asReference */ asReference ) == false) { me->release(); - me = 0; + me = NULL; } return me; @@ -97,7 +97,7 @@ IOMultiMemoryDescriptor::initWithDescriptors( // Initialize our minimal state. - _descriptors = 0; + _descriptors = NULL; _descriptorsCount = withCount; _descriptorsIsAllocated = asReference ? false : true; _flags = withDirection; @@ -105,14 +105,14 @@ IOMultiMemoryDescriptor::initWithDescriptors( _direction = (IODirection) (_flags & kIOMemoryDirectionMask); #endif /* !__LP64__ */ _length = 0; - _mappings = 0; + _mappings = NULL; _tag = 0; if (asReference) { _descriptors = descriptors; } else { _descriptors = IONew(IOMemoryDescriptor *, withCount); - if (_descriptors == 0) { + if (_descriptors == NULL) { return false; } @@ -396,6 +396,28 @@ IOMultiMemoryDescriptor::setPurgeable( IOOptionBits newState, return err; } +IOReturn +IOMultiMemoryDescriptor::setOwnership( task_t newOwner, + int newLedgerTag, + IOOptionBits newLedgerOptions ) +{ + IOReturn err; + + if (iokit_iomd_setownership_enabled == FALSE) { + return kIOReturnUnsupported; + } + + err = kIOReturnSuccess; + for (unsigned index = 0; index < _descriptorsCount; index++) { + err = _descriptors[index]->setOwnership(newOwner, newLedgerTag, newLedgerOptions); + if (kIOReturnSuccess != err) { + break; + } + } + + return err; +} + IOReturn IOMultiMemoryDescriptor::getPageCounts(IOByteCount * pResidentPageCount, IOByteCount * pDirtyPageCount) diff --git a/iokit/Kernel/IONVRAM.cpp b/iokit/Kernel/IONVRAM.cpp index 88d58595f..4b0c315a9 100644 --- a/iokit/Kernel/IONVRAM.cpp +++ b/iokit/Kernel/IONVRAM.cpp @@ -34,8 +34,10 @@ #include #include #include +#include #include + #define super IOService #define kIONVRAMPrivilege kIOClientPrivilegeAdministrator @@ -53,28 +55,29 @@ IODTNVRAM::init(IORegistryEntry *old, const IORegistryPlane *plane) } dict = OSDictionary::withCapacity(1); - if (dict == 0) { + if (dict == NULL) { return false; } setPropertyTable(dict); + dict->release(); _nvramImage = IONew(UInt8, kIODTNVRAMImageSize); - if (_nvramImage == 0) { + if (_nvramImage == NULL) { return false; } _nvramPartitionOffsets = OSDictionary::withCapacity(1); - if (_nvramPartitionOffsets == 0) { + if (_nvramPartitionOffsets == NULL) { return false; } _nvramPartitionLengths = OSDictionary::withCapacity(1); - if (_nvramPartitionLengths == 0) { + if (_nvramPartitionLengths == NULL) { return false; } _registryPropertiesKey = OSSymbol::withCStringNoCopy("aapl,pci"); - if (_registryPropertiesKey == 0) { + if (_registryPropertiesKey == NULL) { return false; } @@ -95,13 +98,13 @@ IODTNVRAM::initProxyData(void) const void *bytes; entry = IORegistryEntry::fromPath("/chosen", gIODTPlane); - if (entry != 0) { + if (entry != NULL) { prop = entry->getProperty(key); - if (prop != 0) { + if (prop != NULL) { data = OSDynamicCast(OSData, prop); - if (data != 0) { + if (data != NULL) { bytes = data->getBytesNoCopy(); - if ((bytes != 0) && (data->getLength() <= kIODTNVRAMImageSize)) { + if ((bytes != NULL) && (data->getLength() <= kIODTNVRAMImageSize)) { bcopy(bytes, _nvramImage, data->getLength()); initNVRAMImage(); _isProxied = true; @@ -116,7 +119,7 @@ IODTNVRAM::initProxyData(void) void IODTNVRAM::registerNVRAMController(IONVRAMController *nvram) { - if (_nvramController != 0) { + if (_nvramController != NULL) { return; } @@ -127,7 +130,7 @@ IODTNVRAM::registerNVRAMController(IONVRAMController *nvram) if (!_isProxied) { _nvramController->read(0, _nvramImage, kIODTNVRAMImageSize); initNVRAMImage(); - } else { + } else if (_ofLock) { IOLockLock(_ofLock); (void) syncVariables(); IOLockUnlock(_ofLock); @@ -249,7 +252,7 @@ IODTNVRAM::initNVRAMImage(void) _nvramImage[freePartitionOffset + 1] = calculatePartitionChecksum(_nvramImage + freePartitionOffset); - if (_nvramController != 0) { + if (_nvramController != NULL) { _nvramController->write(0, _nvramImage, kIODTNVRAMImageSize); } } @@ -267,7 +270,7 @@ void IODTNVRAM::syncInternal(bool rateLimit) { // Don't try to perform controller operations if none has been registered. - if (_nvramController == 0) { + if (_nvramController == NULL) { return; } @@ -293,34 +296,34 @@ IODTNVRAM::serializeProperties(OSSerialize *s) const UInt32 variablePerm; const OSSymbol *key; OSDictionary *dict; - OSCollectionIterator *iter = 0; + OSCollectionIterator *iter = NULL; // Verify permissions. hasPrivilege = (kIOReturnSuccess == IOUserClient::clientHasPrivilege(current_task(), kIONVRAMPrivilege)); - if (_ofDict == 0) { + if (_ofDict == NULL) { /* No nvram. Return an empty dictionary. */ dict = OSDictionary::withCapacity(1); - if (dict == 0) { + if (dict == NULL) { return false; } } else { IOLockLock(_ofLock); dict = OSDictionary::withDictionary(_ofDict); IOLockUnlock(_ofLock); - if (dict == 0) { + if (dict == NULL) { return false; } /* Copy properties with client privilege. */ iter = OSCollectionIterator::withCollection(dict); - if (iter == 0) { + if (iter == NULL) { dict->release(); return false; } while (1) { key = OSDynamicCast(OSSymbol, iter->getNextObject()); - if (key == 0) { + if (key == NULL) { break; } @@ -337,7 +340,7 @@ IODTNVRAM::serializeProperties(OSSerialize *s) const result = dict->serialize(s); dict->release(); - if (iter != 0) { + if (iter != NULL) { iter->release(); } @@ -351,8 +354,8 @@ IODTNVRAM::copyProperty(const OSSymbol *aKey) const UInt32 variablePerm; OSObject *theObject; - if (_ofDict == 0) { - return 0; + if (_ofDict == NULL) { + return NULL; } // Verify permissions. @@ -360,11 +363,11 @@ IODTNVRAM::copyProperty(const OSSymbol *aKey) const result = IOUserClient::clientHasPrivilege(current_task(), kIONVRAMPrivilege); if (result != kIOReturnSuccess) { if (variablePerm == kOFVariablePermRootOnly) { - return 0; + return NULL; } } if (variablePerm == kOFVariablePermKernelOnly && current_task() != kernel_task) { - return 0; + return NULL; } IOLockLock(_ofLock); @@ -381,10 +384,10 @@ OSObject * IODTNVRAM::copyProperty(const char *aKey) const { const OSSymbol *keySymbol; - OSObject *theObject = 0; + OSObject *theObject = NULL; keySymbol = OSSymbol::withCString(aKey); - if (keySymbol != 0) { + if (keySymbol != NULL) { theObject = copyProperty(keySymbol); keySymbol->release(); } @@ -418,15 +421,15 @@ IODTNVRAM::getProperty(const char *aKey) const return theObject; } -bool -IODTNVRAM::setProperty(const OSSymbol *aKey, OSObject *anObject) +IOReturn +IODTNVRAM::setPropertyInternal(const OSSymbol *aKey, OSObject *anObject) { - bool result; + IOReturn result = kIOReturnSuccess; UInt32 propType, propPerm; - OSString *tmpString = 0; - OSObject *propObject = 0, *oldObject; + OSString *tmpString = NULL; + OSObject *propObject = NULL, *oldObject; - if (_ofDict == 0) { + if (_ofDict == NULL) { return false; } @@ -434,16 +437,16 @@ IODTNVRAM::setProperty(const OSSymbol *aKey, OSObject *anObject) propPerm = getOFVariablePerm(aKey); if (IOUserClient::clientHasPrivilege(current_task(), kIONVRAMPrivilege) != kIOReturnSuccess) { if (propPerm != kOFVariablePermUserWrite) { - return false; + return kIOReturnNotPrivileged; } } if (propPerm == kOFVariablePermKernelOnly && current_task() != kernel_task) { - return 0; + return kIOReturnNotPrivileged; } // Don't allow change of 'aapl,panic-info'. if (aKey->isEqualTo(kIODTNVRAMPanicInfoKey)) { - return false; + return kIOReturnUnsupported; } // Make sure the object is of the correct type. @@ -459,13 +462,16 @@ IODTNVRAM::setProperty(const OSSymbol *aKey, OSObject *anObject) case kOFVariableTypeString: propObject = OSDynamicCast(OSString, anObject); + if (propObject != NULL && aKey->isEqualTo(kIONVRAMBootArgsKey) && ((OSString*)propObject)->getLength() >= BOOT_LINE_LENGTH) { + return kIOReturnNoSpace; + } break; case kOFVariableTypeData: propObject = OSDynamicCast(OSData, anObject); - if (propObject == 0) { + if (propObject == NULL) { tmpString = OSDynamicCast(OSString, anObject); - if (tmpString != 0) { + if (tmpString != NULL) { propObject = OSData::withBytes(tmpString->getCStringNoCopy(), tmpString->getLength()); } @@ -473,8 +479,8 @@ IODTNVRAM::setProperty(const OSSymbol *aKey, OSObject *anObject) break; } - if (propObject == 0) { - return false; + if (propObject == NULL) { + return kIOReturnBadArgument; } IOLockLock(_ofLock); @@ -483,9 +489,11 @@ IODTNVRAM::setProperty(const OSSymbol *aKey, OSObject *anObject) if (oldObject) { oldObject->retain(); } - result = _ofDict->setObject(aKey, propObject); + if (!_ofDict->setObject(aKey, propObject)) { + result = kIOReturnBadArgument; + } - if (result) { + if (result == kIOReturnSuccess) { if (syncVariables() != kIOReturnSuccess) { if (oldObject) { _ofDict->setObject(aKey, oldObject); @@ -493,7 +501,7 @@ IODTNVRAM::setProperty(const OSSymbol *aKey, OSObject *anObject) _ofDict->removeObject(aKey); } (void) syncVariables(); - result = false; + result = kIOReturnNoMemory; } } @@ -509,13 +517,19 @@ IODTNVRAM::setProperty(const OSSymbol *aKey, OSObject *anObject) return result; } +bool +IODTNVRAM::setProperty(const OSSymbol *aKey, OSObject *anObject) +{ + return setPropertyInternal(aKey, anObject) == kIOReturnSuccess; +} + void IODTNVRAM::removeProperty(const OSSymbol *aKey) { bool result; UInt32 propPerm; - if (_ofDict == 0) { + if (_ofDict == NULL) { return; } @@ -539,7 +553,7 @@ IODTNVRAM::removeProperty(const OSSymbol *aKey) // If the object exists, remove it from the dictionary. IOLockLock(_ofLock); - result = _ofDict->getObject(aKey) != 0; + result = _ofDict->getObject(aKey) != NULL; if (result) { _ofDict->removeObject(aKey); } @@ -554,7 +568,7 @@ IODTNVRAM::removeProperty(const OSSymbol *aKey) IOReturn IODTNVRAM::setProperties(OSObject *properties) { - bool result = true; + IOReturn res = kIOReturnSuccess; OSObject *object; const OSSymbol *key; const OSString *tmpStr; @@ -562,59 +576,53 @@ IODTNVRAM::setProperties(OSObject *properties) OSCollectionIterator *iter; dict = OSDynamicCast(OSDictionary, properties); - if (dict == 0) { + if (dict == NULL) { return kIOReturnBadArgument; } iter = OSCollectionIterator::withCollection(dict); - if (iter == 0) { + if (iter == NULL) { return kIOReturnBadArgument; } - while (result) { + while (res == kIOReturnSuccess) { key = OSDynamicCast(OSSymbol, iter->getNextObject()); - if (key == 0) { + if (key == NULL) { break; } object = dict->getObject(key); - if (object == 0) { + if (object == NULL) { continue; } if (key->isEqualTo(kIONVRAMDeletePropertyKey)) { tmpStr = OSDynamicCast(OSString, object); - if (tmpStr != 0) { + if (tmpStr != NULL) { key = OSSymbol::withString(tmpStr); removeProperty(key); key->release(); - result = true; } else { - result = false; + res = kIOReturnError; } } else if (key->isEqualTo(kIONVRAMSyncNowPropertyKey) || key->isEqualTo(kIONVRAMForceSyncNowPropertyKey)) { tmpStr = OSDynamicCast(OSString, object); - if (tmpStr != 0) { - result = true; - + if (tmpStr != NULL) { // We still want to throttle NVRAM commit rate for SyncNow. ForceSyncNow is provided as a really big hammer. - syncInternal(key->isEqualTo(kIONVRAMSyncNowPropertyKey)); } else { - result = false; + res = kIOReturnError; } } else { - result = setProperty(key, object); + if (!setProperty(key, object)) { + res = kIOReturnNoSpace; + } } } iter->release(); - if (result) { - return kIOReturnSuccess; - } else { - return kIOReturnError; - } + return res; } IOReturn @@ -674,7 +682,7 @@ IODTNVRAM::readNVRAMPartition(const OSSymbol *partitionID, partitionLengthNumber = (OSNumber *)_nvramPartitionLengths->getObject(partitionID); - if ((partitionOffsetNumber == 0) || (partitionLengthNumber == 0)) { + if ((partitionOffsetNumber == NULL) || (partitionLengthNumber == NULL)) { return kIOReturnNotFound; } @@ -684,7 +692,7 @@ IODTNVRAM::readNVRAMPartition(const OSSymbol *partitionID, if (os_add_overflow(offset, length, &end)) { return kIOReturnBadArgument; } - if ((buffer == 0) || (length == 0) || (end > partitionLength)) { + if ((buffer == NULL) || (length == 0) || (end > partitionLength)) { return kIOReturnBadArgument; } @@ -706,7 +714,7 @@ IODTNVRAM::writeNVRAMPartition(const OSSymbol *partitionID, partitionLengthNumber = (OSNumber *)_nvramPartitionLengths->getObject(partitionID); - if ((partitionOffsetNumber == 0) || (partitionLengthNumber == 0)) { + if ((partitionOffsetNumber == NULL) || (partitionLengthNumber == NULL)) { return kIOReturnNotFound; } @@ -716,13 +724,13 @@ IODTNVRAM::writeNVRAMPartition(const OSSymbol *partitionID, if (os_add_overflow(offset, length, &end)) { return kIOReturnBadArgument; } - if ((buffer == 0) || (length == 0) || (end > partitionLength)) { + if ((buffer == NULL) || (length == 0) || (end > partitionLength)) { return kIOReturnBadArgument; } bcopy(buffer, _nvramImage + partitionOffset + offset, length); - if (_nvramController != 0) { + if (_nvramController != NULL) { _nvramController->write(0, _nvramImage, kIODTNVRAMImageSize); } @@ -732,7 +740,7 @@ IODTNVRAM::writeNVRAMPartition(const OSSymbol *partitionID, IOByteCount IODTNVRAM::savePanicInfo(UInt8 *buffer, IOByteCount length) { - if ((_piImage == 0) || (length <= 0)) { + if ((_piImage == NULL) || (length <= 0)) { return 0; } @@ -746,7 +754,7 @@ IODTNVRAM::savePanicInfo(UInt8 *buffer, IOByteCount length) // Save the Panic Info length. *(UInt32 *)_piImage = length; - if (_nvramController != 0) { + if (_nvramController != NULL) { _nvramController->write(0, _nvramImage, kIODTNVRAMImageSize); } /* @@ -788,7 +796,7 @@ IODTNVRAM::initOFVariables(void) const OSSymbol *propSymbol; OSObject *propObject; - if (_ofImage == 0) { + if (_ofImage == NULL) { return kIOReturnNotReady; } @@ -844,15 +852,15 @@ IODTNVRAM::initOFVariables(void) } // Create the boot-args property if it is not in the dictionary. - if (_ofDict->getObject("boot-args") == 0) { + if (_ofDict->getObject(kIONVRAMBootArgsKey) == NULL) { propObject = OSString::withCStringNoCopy(""); - if (propObject != 0) { - _ofDict->setObject("boot-args", propObject); + if (propObject != NULL) { + _ofDict->setObject(kIONVRAMBootArgsKey, propObject); propObject->release(); } } - if (_piImage != 0) { + if (_piImage != NULL) { propDataLength = *(UInt32 *)_piImage; if ((propDataLength != 0) && (propDataLength <= (_piPartitionSize - 4))) { propObject = OSData::withBytes(_piImage + 4, propDataLength); @@ -861,7 +869,7 @@ IODTNVRAM::initOFVariables(void) // Clear the length from _piImage and mark dirty. *(UInt32 *)_piImage = 0; - if (_nvramController != 0) { + if (_nvramController != NULL) { _nvramController->write(0, _nvramImage, kIODTNVRAMImageSize); } } @@ -888,12 +896,12 @@ IODTNVRAM::syncVariables(void) IOLockAssert(_ofLock, kIOLockAssertOwned); - if ((_ofImage == 0) || (_ofDict == 0) || _systemPaniced) { + if ((_ofImage == NULL) || (_ofDict == NULL) || _systemPaniced) { return kIOReturnNotReady; } buffer = tmpBuffer = IONew(UInt8, _ofPartitionSize); - if (buffer == 0) { + if (buffer == NULL) { return kIOReturnNoMemory; } bzero(buffer, _ofPartitionSize); @@ -902,13 +910,13 @@ IODTNVRAM::syncVariables(void) maxLength = _ofPartitionSize; iter = OSCollectionIterator::withCollection(_ofDict); - if (iter == 0) { + if (iter == NULL) { ok = false; } while (ok) { tmpSymbol = OSDynamicCast(OSSymbol, iter->getNextObject()); - if (tmpSymbol == 0) { + if (tmpSymbol == NULL) { break; } @@ -938,7 +946,7 @@ IODTNVRAM::syncVariables(void) return kIOReturnBadArgument; } - if (_nvramController != 0) { + if (_nvramController != NULL) { return _nvramController->write(0, _nvramImage, kIODTNVRAMImageSize); } @@ -1018,7 +1026,7 @@ OFVariable gOFVariables[] = { {"enter-tdm-mode", kOFVariableTypeBoolean, kOFVariablePermUserWrite, -1}, {"nonce-seeds", kOFVariableTypeData, kOFVariablePermKernelOnly, -1}, #endif - {0, kOFVariableTypeData, kOFVariablePermUserRead, -1} + {NULL, kOFVariableTypeData, kOFVariablePermUserRead, -1} }; UInt32 @@ -1028,7 +1036,7 @@ IODTNVRAM::getOFVariableType(const OSSymbol *propSymbol) const ofVar = gOFVariables; while (1) { - if ((ofVar->variableName == 0) || + if ((ofVar->variableName == NULL) || propSymbol->isEqualTo(ofVar->variableName)) { break; } @@ -1045,7 +1053,7 @@ IODTNVRAM::getOFVariablePerm(const OSSymbol *propSymbol) const ofVar = gOFVariables; while (1) { - if ((ofVar->variableName == 0) || + if ((ofVar->variableName == NULL) || propSymbol->isEqualTo(ofVar->variableName)) { break; } @@ -1059,39 +1067,8 @@ bool IODTNVRAM::getOWVariableInfo(UInt32 variableNumber, const OSSymbol **propSymbol, UInt32 *propType, UInt32 *propOffset) { - const OFVariable *ofVar; - - ofVar = gOFVariables; - while (1) { - if (ofVar->variableName == 0) { - return false; - } - - if (ofVar->variableOffset == (SInt32) variableNumber) { - break; - } - - ofVar++; - } - - *propSymbol = OSSymbol::withCStringNoCopy(ofVar->variableName); - *propType = ofVar->variableType; - - switch (*propType) { - case kOFVariableTypeBoolean: - *propOffset = 1 << (31 - variableNumber); - break; - - case kOFVariableTypeNumber: - *propOffset = variableNumber - kOWVariableOffsetNumber; - break; - - case kOFVariableTypeString: - *propOffset = variableNumber - kOWVariableOffsetString; - break; - } - - return true; + /* UNSUPPORTED */ + return false; } bool @@ -1110,14 +1087,14 @@ IODTNVRAM::convertPropToObject(UInt8 *propName, UInt32 propNameLength, propName[propNameLength] = '\0'; tmpSymbol = OSSymbol::withCString((const char *)propName); propName[propNameLength] = '='; - if (tmpSymbol == 0) { + if (tmpSymbol == NULL) { return false; } propType = getOFVariableType(tmpSymbol); // Create the object. - tmpObject = 0; + tmpObject = NULL; switch (propType) { case kOFVariableTypeBoolean: if (!strncmp("true", (const char *)propData, propDataLength)) { @@ -1128,15 +1105,15 @@ IODTNVRAM::convertPropToObject(UInt8 *propName, UInt32 propNameLength, break; case kOFVariableTypeNumber: - tmpNumber = OSNumber::withNumber(strtol((const char *)propData, 0, 0), 32); - if (tmpNumber != 0) { + tmpNumber = OSNumber::withNumber(strtol((const char *)propData, NULL, 0), 32); + if (tmpNumber != NULL) { tmpObject = tmpNumber; } break; case kOFVariableTypeString: tmpString = OSString::withCString((const char *)propData); - if (tmpString != 0) { + if (tmpString != NULL) { tmpObject = tmpString; } break; @@ -1146,7 +1123,7 @@ IODTNVRAM::convertPropToObject(UInt8 *propName, UInt32 propNameLength, break; } - if (tmpObject == 0) { + if (tmpObject == NULL) { tmpSymbol->release(); return false; } @@ -1164,10 +1141,10 @@ IODTNVRAM::convertObjectToProp(UInt8 *buffer, UInt32 *length, const UInt8 *propName; UInt32 propNameLength, propDataLength, remaining; UInt32 propType, tmpValue; - OSBoolean *tmpBoolean = 0; - OSNumber *tmpNumber = 0; - OSString *tmpString = 0; - OSData *tmpData = 0; + OSBoolean *tmpBoolean = NULL; + OSNumber *tmpNumber = NULL; + OSString *tmpString = NULL; + OSData *tmpData = NULL; propName = (const UInt8 *)propSymbol->getCStringNoCopy(); propNameLength = propSymbol->getLength(); @@ -1178,28 +1155,28 @@ IODTNVRAM::convertObjectToProp(UInt8 *buffer, UInt32 *length, switch (propType) { case kOFVariableTypeBoolean: tmpBoolean = OSDynamicCast(OSBoolean, propObject); - if (tmpBoolean != 0) { + if (tmpBoolean != NULL) { propDataLength = 5; } break; case kOFVariableTypeNumber: tmpNumber = OSDynamicCast(OSNumber, propObject); - if (tmpNumber != 0) { + if (tmpNumber != NULL) { propDataLength = 10; } break; case kOFVariableTypeString: tmpString = OSDynamicCast(OSString, propObject); - if (tmpString != 0) { + if (tmpString != NULL) { propDataLength = tmpString->getLength(); } break; case kOFVariableTypeData: tmpData = OSDynamicCast(OSData, propObject); - if (tmpData != 0) { + if (tmpData != NULL) { tmpData = escapeDataToData(tmpData); propDataLength = tmpData->getLength(); } @@ -1291,85 +1268,7 @@ IODTNVRAM::validateOWChecksum(UInt8 *buffer) void IODTNVRAM::updateOWBootArgs(const OSSymbol *key, OSObject *value) { - bool wasBootArgs, bootr = false; - UInt32 cnt; - OSString *tmpString, *bootCommand, *bootArgs = 0; - const UInt8 *bootCommandData, *bootArgsData; - UInt8 *tmpData; - UInt32 bootCommandDataLength, bootArgsDataLength, tmpDataLength; - - tmpString = OSDynamicCast(OSString, value); - if (tmpString == 0) { - return; - } - - if (key->isEqualTo("boot-command")) { - wasBootArgs = false; - bootCommand = tmpString; - } else if (key->isEqualTo("boot-args")) { - wasBootArgs = true; - bootArgs = tmpString; - bootCommand = OSDynamicCast(OSString, _ofDict->getObject("boot-command")); - if (bootCommand == 0) { - return; - } - } else { - return; - } - - bootCommandData = (const UInt8 *)bootCommand->getCStringNoCopy(); - bootCommandDataLength = bootCommand->getLength(); - - if (bootCommandData == 0) { - return; - } - - for (cnt = 0; cnt < bootCommandDataLength; cnt++) { - if ((bootCommandData[cnt] == 'b') && - !strncmp("bootr", (const char *)bootCommandData + cnt, 5)) { - cnt += 5; - while (bootCommandData[cnt] == ' ') { - cnt++; - } - bootr = true; - break; - } - } - if (!bootr) { - _ofDict->removeObject("boot-args"); - return; - } - - if (wasBootArgs) { - bootArgsData = (const UInt8 *)bootArgs->getCStringNoCopy(); - bootArgsDataLength = bootArgs->getLength(); - if (bootArgsData == 0) { - return; - } - - tmpDataLength = cnt + bootArgsDataLength; - tmpData = IONew(UInt8, tmpDataLength + 1); - if (tmpData == 0) { - return; - } - - cnt -= strlcpy((char *)tmpData, (const char *)bootCommandData, cnt); - strlcat((char *)tmpData, (const char *)bootArgsData, cnt); - - bootCommand = OSString::withCString((const char *)tmpData); - if (bootCommand != 0) { - _ofDict->setObject("boot-command", bootCommand); - bootCommand->release(); - } - - IODelete(tmpData, UInt8, tmpDataLength + 1); - } else { - bootArgs = OSString::withCString((const char *)(bootCommandData + cnt)); - if (bootArgs != 0) { - _ofDict->setObject("boot-args", bootArgs); - bootArgs->release(); - } - } + /* UNSUPPORTED */ } bool @@ -1399,7 +1298,7 @@ IODTNVRAM::writeNVRAMPropertyType0(IORegistryEntry *entry, OSData * IODTNVRAM::unescapeBytesToData(const UInt8 *bytes, UInt32 length) { - OSData *data = 0; + OSData *data = NULL; UInt32 totalLength = 0; UInt32 cnt, cnt2; UInt8 byte; @@ -1426,7 +1325,7 @@ IODTNVRAM::unescapeBytesToData(const UInt8 *bytes, UInt32 length) if (ok) { // Create an empty OSData of the correct size. data = OSData::withCapacity(totalLength); - if (data != 0) { + if (data != NULL) { for (cnt = 0; cnt < length;) { byte = bytes[cnt++]; if (byte == 0xFF) { @@ -1479,7 +1378,7 @@ IODTNVRAM::escapeDataToData(OSData * value) if (!ok) { result->release(); - result = 0; + result = NULL; } return result; @@ -1508,14 +1407,14 @@ IODTNVRAM::readNVRAMPropertyType1(IORegistryEntry *entry, const UInt8 *startPtr; const UInt8 *endPtr; const UInt8 *wherePtr; - const UInt8 *nvPath = 0; - const char *nvName = 0; - const char *resultName = 0; - const UInt8 *resultValue = 0; + const UInt8 *nvPath = NULL; + const char *nvName = NULL; + const char *resultName = NULL; + const UInt8 *resultValue = NULL; UInt32 resultValueLen = 0; UInt8 byte; - if (_ofDict == 0) { + if (_ofDict == NULL) { return err; } @@ -1523,7 +1422,7 @@ IODTNVRAM::readNVRAMPropertyType1(IORegistryEntry *entry, data = OSDynamicCast(OSData, _ofDict->getObject(_registryPropertiesKey)); IOLockUnlock(_ofLock); - if (data == 0) { + if (data == NULL) { return err; } @@ -1537,9 +1436,9 @@ IODTNVRAM::readNVRAMPropertyType1(IORegistryEntry *entry, continue; } - if (nvPath == 0) { + if (nvPath == NULL) { nvPath = startPtr; - } else if (nvName == 0) { + } else if (nvName == NULL) { nvName = (const char *) startPtr; } else { IORegistryEntry * compareEntry = IORegistryEntry::fromPath((const char *) nvPath, gIODTPlane); @@ -1557,15 +1456,15 @@ IODTNVRAM::readNVRAMPropertyType1(IORegistryEntry *entry, break; } } - nvPath = 0; - nvName = 0; + nvPath = NULL; + nvName = NULL; } startPtr = wherePtr; } if (resultName) { *name = OSSymbol::withCString(resultName); *value = unescapeBytesToData(resultValue, resultValueLen); - if ((*name != 0) && (*value != 0)) { + if ((*name != NULL) && (*value != NULL)) { err = kIOReturnSuccess; } else { err = kIOReturnNoMemory; @@ -1580,20 +1479,20 @@ IODTNVRAM::writeNVRAMPropertyType1(IORegistryEntry *entry, OSData *value) { OSData *oldData, *escapedData; - OSData *data = 0; + OSData *data = NULL; const UInt8 *startPtr; const UInt8 *propStart; const UInt8 *endPtr; const UInt8 *wherePtr; - const UInt8 *nvPath = 0; - const char *nvName = 0; + const UInt8 *nvPath = NULL; + const char *nvName = NULL; const char * comp; const char * name; UInt8 byte; bool ok = true; bool settingAppleProp; - if (_ofDict == 0) { + if (_ofDict == NULL) { return kIOReturnNoResources; } @@ -1615,9 +1514,9 @@ IODTNVRAM::writeNVRAMPropertyType1(IORegistryEntry *entry, if (byte) { continue; } - if (nvPath == 0) { + if (nvPath == NULL) { nvPath = startPtr; - } else if (nvName == 0) { + } else if (nvName == NULL) { nvName = (const char *) startPtr; } else { IORegistryEntry * compareEntry = IORegistryEntry::fromPath((const char *) nvPath, gIODTPlane); @@ -1635,8 +1534,8 @@ IODTNVRAM::writeNVRAMPropertyType1(IORegistryEntry *entry, break; } } - nvPath = 0; - nvName = 0; + nvPath = NULL; + nvName = NULL; } startPtr = wherePtr; @@ -1693,7 +1592,7 @@ IODTNVRAM::writeNVRAMPropertyType1(IORegistryEntry *entry, // append escaped data escapedData = escapeDataToData(value); - ok &= (escapedData != 0); + ok &= (escapedData != NULL); if (ok) { ok &= data->appendBytes(escapedData); } diff --git a/iokit/Kernel/IOPMPowerSource.cpp b/iokit/Kernel/IOPMPowerSource.cpp index e89680c08..f211a94de 100644 --- a/iokit/Kernel/IOPMPowerSource.cpp +++ b/iokit/Kernel/IOPMPowerSource.cpp @@ -162,6 +162,8 @@ IOPMPowerSource::free(void) if (batteryInfoKey) { batteryInfoKey->release(); } + + super::free(); } // ***************************************************************************** diff --git a/iokit/Kernel/IOPMPowerStateQueue.h b/iokit/Kernel/IOPMPowerStateQueue.h index db2e57412..1e737ed17 100644 --- a/iokit/Kernel/IOPMPowerStateQueue.h +++ b/iokit/Kernel/IOPMPowerStateQueue.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2001-2002 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2001-2019 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -37,7 +37,7 @@ typedef void (*IOPMPowerStateQueueAction)(OSObject *, uint32_t event, void *, ui class IOPMPowerStateQueue : public IOEventSource { - OSDeclareDefaultStructors(IOPMPowerStateQueue) + OSDeclareDefaultStructors(IOPMPowerStateQueue); private: struct PowerEventEntry { @@ -57,7 +57,7 @@ protected: public: static IOPMPowerStateQueue * PMPowerStateQueue( OSObject * owner, Action action ); - bool submitPowerEvent( uint32_t eventType, void * arg0 = 0, uint64_t arg1 = 0 ); + bool submitPowerEvent( uint32_t eventType, void * arg0 = NULL, uint64_t arg1 = 0 ); }; #endif /* _IOPMPOWERSTATEQUEUE_H_ */ diff --git a/iokit/Kernel/IOPMrootDomain.cpp b/iokit/Kernel/IOPMrootDomain.cpp index 320e8f3f2..0920486da 100644 --- a/iokit/Kernel/IOPMrootDomain.cpp +++ b/iokit/Kernel/IOPMrootDomain.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 1998-2017 Apple Inc. All rights reserved. + * Copyright (c) 1998-2019 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -31,6 +31,7 @@ #include #include #include +#include #include #include #include @@ -47,6 +48,8 @@ #include "IOPMPowerStateQueue.h" #include #include +#include +#include #include "IOKitKernelInternal.h" #if HIBERNATION #include @@ -89,19 +92,22 @@ __END_DECLS #define LOG(x...) \ do { kprintf(LOG_PREFIX x); } while (false) -#if DEVELOPMENT -#define DLOG(x...) do { \ +#if DEVELOPMENT || DEBUG +#define DEBUG_LOG(x...) do { \ if (kIOLogPMRootDomain & gIOKitDebug) \ - kprintf(LOG_PREFIX x); \ - else \ - os_log(OS_LOG_DEFAULT, LOG_PREFIX x); \ + kprintf(LOG_PREFIX x); \ + os_log_debug(OS_LOG_DEFAULT, LOG_PREFIX x); \ } while (false) #else +#define DEBUG_LOG(x...) +#endif + #define DLOG(x...) do { \ if (kIOLogPMRootDomain & gIOKitDebug) \ kprintf(LOG_PREFIX x); \ + else \ + os_log(OS_LOG_DEFAULT, LOG_PREFIX x); \ } while (false) -#endif #define DMSG(x...) do { \ if (kIOLogPMRootDomain & gIOKitDebug) { \ @@ -114,7 +120,7 @@ __END_DECLS #define CHECK_THREAD_CONTEXT #ifdef CHECK_THREAD_CONTEXT -static IOWorkLoop * gIOPMWorkLoop = 0; +static IOWorkLoop * gIOPMWorkLoop = NULL; #define ASSERT_GATED() \ do { \ if (gIOPMWorkLoop && gIOPMWorkLoop->inGate() != true) { \ @@ -192,6 +198,13 @@ static void idleSleepTimerExpired( thread_call_param_t, thread_call_param_t ); static void notifySystemShutdown( IOService * root, uint32_t messageType ); static void handleAggressivesFunction( thread_call_param_t, thread_call_param_t ); static void pmEventTimeStamp(uint64_t *recordTS); +static void powerButtonUpCallout( thread_call_param_t, thread_call_param_t ); +static void powerButtonDownCallout( thread_call_param_t, thread_call_param_t ); + +static int IOPMConvertSecondsToCalendar(long secs, IOPMCalendarStruct * dt); +static long IOPMConvertCalendarToSeconds(const IOPMCalendarStruct * dt); +#define YMDTF "%04d/%02d/%d %02d:%02d:%02d" +#define YMDT(cal) ((int)(cal)->year), (cal)->month, (cal)->day, (cal)->hour, (cal)->minute, (cal)->second // "IOPMSetSleepSupported" callPlatformFunction name static const OSSymbol *sleepSupportedPEFunction = NULL; @@ -249,8 +262,8 @@ static const OSSymbol * gIOPMPSPostDishargeWaitSecondsKey; #define kDefaultWranglerIdlePeriod 1000 // in milliseconds #define kIOSleepWakeFailureString "SleepWakeFailureString" -#define kIOOSWatchdogFailureString "OSWatchdogFailureString" #define kIOEFIBootRomFailureKey "wake-failure" +#define kIOSleepWakeFailurePanic "SleepWakeFailurePanic" #define kRD_AllPowerSources (kIOPMSupportedOnAC \ | kIOPMSupportedOnBatt \ @@ -270,20 +283,60 @@ enum { OFF_STATE = 0, RESTART_STATE = 1, SLEEP_STATE = 2, - ON_STATE = 3, + AOT_STATE = 3, + ON_STATE = 4, NUM_POWER_STATES }; +const char * +getPowerStateString( uint32_t state ) +{ +#define POWER_STATE(x) {(uint32_t) x, #x} + + static const IONamedValue powerStates[] = { + POWER_STATE( OFF_STATE ), + POWER_STATE( RESTART_STATE ), + POWER_STATE( SLEEP_STATE ), + POWER_STATE( AOT_STATE ), + POWER_STATE( ON_STATE ), + { 0, NULL } + }; + return IOFindNameForValue(state, powerStates); +} + #define ON_POWER kIOPMPowerOn #define RESTART_POWER kIOPMRestart #define SLEEP_POWER kIOPMAuxPowerOn -static IOPMPowerState ourPowerStates[NUM_POWER_STATES] = -{ - {1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, - {1, kIOPMRestartCapability, kIOPMRestart, RESTART_POWER, 0, 0, 0, 0, 0, 0, 0, 0}, - {1, kIOPMSleepCapability, kIOPMSleep, SLEEP_POWER, 0, 0, 0, 0, 0, 0, 0, 0}, - {1, kIOPMPowerOn, kIOPMPowerOn, ON_POWER, 0, 0, 0, 0, 0, 0, 0, 0} +static IOPMPowerState + ourPowerStates[NUM_POWER_STATES] = +{ + { .version = 1, + .capabilityFlags = 0, + .outputPowerCharacter = 0, + .inputPowerRequirement = 0 }, + { .version = 1, + .capabilityFlags = kIOPMRestartCapability, + .outputPowerCharacter = kIOPMRestart, + .inputPowerRequirement = RESTART_POWER }, + { .version = 1, + .capabilityFlags = kIOPMSleepCapability, + .outputPowerCharacter = kIOPMSleep, + .inputPowerRequirement = SLEEP_POWER }, + { .version = 1, +#if !(defined(RC_HIDE_N144) || defined(RC_HIDE_N146)) + .capabilityFlags = kIOPMAOTCapability, + .outputPowerCharacter = kIOPMAOTPower, + .inputPowerRequirement = ON_POWER }, +#else /* !(defined(RC_HIDE_N144) || defined(RC_HIDE_N146)) */ + .capabilityFlags = 0, + .outputPowerCharacter = 0, + .inputPowerRequirement = 0xFFFFFFFF }, +#endif /* (defined(RC_HIDE_N144) || defined(RC_HIDE_N146)) */ + { .version = 1, + .capabilityFlags = kIOPMPowerOn, + .outputPowerCharacter = kIOPMPowerOn, + .inputPowerRequirement = ON_POWER }, }; #define kIOPMRootDomainWakeTypeSleepService "SleepService" @@ -302,6 +355,9 @@ static IOPMPowerState ourPowerStates[NUM_POWER_STATES] = // #define kIOPMSystemCapabilityInterest "IOPMSystemCapabilityInterest" +// Entitlement required for root domain clients +#define kRootDomainEntitlementSetProperty "com.apple.private.iokit.rootdomain-set-property" + #define WAKEEVENT_LOCK() IOLockLock(wakeEventLock) #define WAKEEVENT_UNLOCK() IOLockUnlock(wakeEventLock) @@ -313,6 +369,26 @@ static IOPMPowerState ourPowerStates[NUM_POWER_STATES] = #define kAggressivesMinValue 1 +const char * +getAggressivenessTypeString( uint32_t type ) +{ +#define AGGRESSIVENESS_TYPE(x) {(uint32_t) x, #x} + + static const IONamedValue aggressivenessTypes[] = { + AGGRESSIVENESS_TYPE( kPMGeneralAggressiveness ), + AGGRESSIVENESS_TYPE( kPMMinutesToDim ), + AGGRESSIVENESS_TYPE( kPMMinutesToSpinDown ), + AGGRESSIVENESS_TYPE( kPMMinutesToSleep ), + AGGRESSIVENESS_TYPE( kPMEthernetWakeOnLANSettings ), + AGGRESSIVENESS_TYPE( kPMSetProcessorSpeed ), + AGGRESSIVENESS_TYPE( kPMPowerSource), + AGGRESSIVENESS_TYPE( kPMMotionSensor ), + AGGRESSIVENESS_TYPE( kPMLastAggressivenessType ), + { 0, NULL } + }; + return IOFindNameForValue(type, aggressivenessTypes); +} + enum { kAggressivesStateBusy = 0x01, kAggressivesStateQuickSpindown = 0x02 @@ -351,6 +427,33 @@ enum { kAggressivesRecordFlagMinValue = 0x00000002 }; +// System Sleep Preventers + +enum { + kPMUserDisabledAllSleep = 1, + kPMSystemRestartBootingInProgress, + kPMConfigPreventSystemSleep, + kPMChildPreventSystemSleep, + kPMCPUAssertion, + kPMPCIUnsupported, +}; + +const char * +getSystemSleepPreventerString( uint32_t preventer ) +{ +#define SYSTEM_SLEEP_PREVENTER(x) {(int) x, #x} + static const IONamedValue systemSleepPreventers[] = { + SYSTEM_SLEEP_PREVENTER( kPMUserDisabledAllSleep ), + SYSTEM_SLEEP_PREVENTER( kPMSystemRestartBootingInProgress ), + SYSTEM_SLEEP_PREVENTER( kPMConfigPreventSystemSleep ), + SYSTEM_SLEEP_PREVENTER( kPMChildPreventSystemSleep ), + SYSTEM_SLEEP_PREVENTER( kPMCPUAssertion ), + SYSTEM_SLEEP_PREVENTER( kPMPCIUnsupported ), + { 0, NULL } + }; + return IOFindNameForValue(preventer, systemSleepPreventers); +} + // gDarkWakeFlags enum { kDarkWakeFlagHIDTickleEarly = 0x01,// hid tickle before gfx suppression @@ -363,7 +466,7 @@ enum { }; static IOPMrootDomain * gRootDomain; -static IONotifier * gSysPowerDownNotifier = 0; +static IONotifier * gSysPowerDownNotifier = NULL; static UInt32 gSleepOrShutdownPending = 0; static UInt32 gWillShutdown = 0; static UInt32 gPagingOff = 0; @@ -382,22 +485,27 @@ uuid_string_t bootsessionuuid_string; static uint32_t gDarkWakeFlags = kDarkWakeFlagHIDTickleNone; static uint32_t gNoIdleFlag = 0; -static uint32_t gSwdPanic = 0; +static uint32_t gSwdPanic = 1; static uint32_t gSwdSleepTimeout = 0; static uint32_t gSwdWakeTimeout = 0; static uint32_t gSwdSleepWakeTimeout = 0; static PMStatsStruct gPMStats; +#if DEVELOPMENT || DEBUG +static uint32_t swd_panic_phase; +#endif #if HIBERNATION -static IOPMSystemSleepPolicyHandler gSleepPolicyHandler = 0; -static IOPMSystemSleepPolicyVariables * gSleepPolicyVars = 0; +static IOPMSystemSleepPolicyHandler gSleepPolicyHandler = NULL; +static IOPMSystemSleepPolicyVariables * gSleepPolicyVars = NULL; static void * gSleepPolicyTarget; #endif struct timeval gIOLastSleepTime; struct timeval gIOLastWakeTime; +struct timeval gIOLastUserSleepTime; + static char gWakeReasonString[128]; static bool gWakeReasonSysctlRegistered = false; static AbsoluteTime gIOLastWakeAbsTime; @@ -421,9 +529,9 @@ static unsigned int gPMHaltBusyCount; static unsigned int gPMHaltIdleCount; static int gPMHaltDepth; static uint32_t gPMHaltMessageType; -static IOLock * gPMHaltLock = 0; -static OSArray * gPMHaltArray = 0; -static const OSSymbol * gPMHaltClientAcknowledgeKey = 0; +static IOLock * gPMHaltLock = NULL; +static OSArray * gPMHaltArray = NULL; +static const OSSymbol * gPMHaltClientAcknowledgeKey = NULL; static bool gPMQuiesced; // Constants used as arguments to IOPMrootDomain::informCPUStateChange @@ -448,7 +556,7 @@ const OSSymbol *gIOPMStatsDriverPSChangeSlow; */ class PMSettingHandle : public OSObject { - OSDeclareFinalStructors( PMSettingHandle ) + OSDeclareFinalStructors( PMSettingHandle ); friend class PMSettingObject; private: @@ -462,7 +570,7 @@ private: */ class PMSettingObject : public OSObject { - OSDeclareFinalStructors( PMSettingObject ) + OSDeclareFinalStructors( PMSettingObject ); friend class IOPMrootDomain; private: @@ -515,7 +623,7 @@ typedef void (*IOPMTracePointHandler)( class PMTraceWorker : public OSObject { - OSDeclareDefaultStructors(PMTraceWorker) + OSDeclareDefaultStructors(PMTraceWorker); public: typedef enum { kPowerChangeStart, kPowerChangeCompleted } change_t; @@ -552,7 +660,7 @@ private: */ class PMAssertionsTracker : public OSObject { - OSDeclareFinalStructors(PMAssertionsTracker) + OSDeclareFinalStructors(PMAssertionsTracker); public: static PMAssertionsTracker *pmAssertionsTracker( IOPMrootDomain * ); @@ -609,7 +717,7 @@ OSDefineMetaClassAndFinalStructors(PMAssertionsTracker, OSObject); class PMHaltWorker : public OSObject { - OSDeclareFinalStructors( PMHaltWorker ) + OSDeclareFinalStructors( PMHaltWorker ); public: IOService * service;// service being worked on @@ -632,11 +740,17 @@ OSDefineMetaClassAndFinalStructors( PMHaltWorker, OSObject ) #define super IOService OSDefineMetaClassAndFinalStructors(IOPMrootDomain, IOService) +boolean_t +IOPMRootDomainGetWillShutdown(void) +{ + return gWillShutdown != 0; +} + static void IOPMRootDomainWillShutdown(void) { if (OSCompareAndSwap(0, 1, &gWillShutdown)) { - OSKext::willShutdown(); + IOService::willShutdown(); for (int i = 0; i < 100; i++) { if (OSCompareAndSwap(0, 1, &gSleepOrShutdownPending)) { break; @@ -771,16 +885,14 @@ IOSystemShutdownNotification(int stage) startTime = mach_absolute_time(); IOPMRootDomainWillShutdown(); - halt_log_enter("IOPMRootDomainWillShutdown", 0, mach_absolute_time() - startTime); + halt_log_enter("IOPMRootDomainWillShutdown", NULL, mach_absolute_time() - startTime); #if HIBERNATION startTime = mach_absolute_time(); IOHibernateSystemPostWake(true); - halt_log_enter("IOHibernateSystemPostWake", 0, mach_absolute_time() - startTime); + halt_log_enter("IOHibernateSystemPostWake", NULL, mach_absolute_time() - startTime); #endif if (OSCompareAndSwap(0, 1, &gPagingOff)) { -#if !CONFIG_EMBEDDED gRootDomain->handlePlatformHaltRestart(kPEPagingOff); -#endif } } @@ -862,10 +974,27 @@ IOPMrootDomain::updateConsoleUsers(void) IOService::updateConsoleUsers(NULL, kIOMessageSystemHasPoweredOn); if (tasksSuspended) { tasksSuspended = FALSE; - tasks_system_suspend(tasksSuspended); + updateTasksSuspend(); } } +void +IOPMrootDomain::updateTasksSuspend(void) +{ + bool newSuspend; + +#if !(defined(RC_HIDE_N144) || defined(RC_HIDE_N146)) + newSuspend = (tasksSuspended || _aotTasksSuspended); +#else /* !(defined(RC_HIDE_N144) || defined(RC_HIDE_N146)) */ + newSuspend = tasksSuspended; +#endif /* (defined(RC_HIDE_N144) || defined(RC_HIDE_N146)) */ + if (newSuspend == tasksSuspendState) { + return; + } + tasksSuspendState = newSuspend; + tasks_system_suspend(newSuspend); +} + //****************************************************************************** static void @@ -943,7 +1072,7 @@ sysctl_sleepwaketime SYSCTL_HANDLER_ARGS static SYSCTL_PROC(_kern, OID_AUTO, sleeptime, CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_NOAUTO | CTLFLAG_KERN | CTLFLAG_LOCKED, - &gIOLastSleepTime, 0, sysctl_sleepwaketime, "S,timeval", ""); + &gIOLastUserSleepTime, 0, sysctl_sleepwaketime, "S,timeval", ""); static SYSCTL_PROC(_kern, OID_AUTO, waketime, CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_NOAUTO | CTLFLAG_KERN | CTLFLAG_LOCKED, @@ -972,7 +1101,7 @@ sysctl_willshutdown static SYSCTL_PROC(_kern, OID_AUTO, willshutdown, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NOAUTO | CTLFLAG_KERN | CTLFLAG_LOCKED, - 0, 0, sysctl_willshutdown, "I", ""); + NULL, 0, sysctl_willshutdown, "I", ""); extern struct sysctl_oid sysctl__kern_iokittest; extern struct sysctl_oid sysctl__debug_iokit; @@ -1013,11 +1142,11 @@ sysctl_progressmeter static SYSCTL_PROC(_kern, OID_AUTO, progressmeterenable, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NOAUTO | CTLFLAG_KERN | CTLFLAG_LOCKED, - 0, 0, sysctl_progressmeterenable, "I", ""); + NULL, 0, sysctl_progressmeterenable, "I", ""); static SYSCTL_PROC(_kern, OID_AUTO, progressmeter, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NOAUTO | CTLFLAG_KERN | CTLFLAG_LOCKED, - 0, 0, sysctl_progressmeter, "I", ""); + NULL, 0, sysctl_progressmeter, "I", ""); #endif /* !CONFIG_EMBEDDED */ @@ -1041,7 +1170,7 @@ sysctl_consoleoptions static SYSCTL_PROC(_kern, OID_AUTO, consoleoptions, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NOAUTO | CTLFLAG_KERN | CTLFLAG_LOCKED, - 0, 0, sysctl_consoleoptions, "I", ""); + NULL, 0, sysctl_consoleoptions, "I", ""); static int @@ -1101,11 +1230,114 @@ static SYSCTL_INT(_debug, OID_AUTO, swd_sleep_timeout, CTLFLAG_RW, &gSwdSleepTim static SYSCTL_INT(_debug, OID_AUTO, swd_wake_timeout, CTLFLAG_RW, &gSwdWakeTimeout, 0, ""); static SYSCTL_INT(_debug, OID_AUTO, swd_timeout, CTLFLAG_RW, &gSwdSleepWakeTimeout, 0, ""); static SYSCTL_INT(_debug, OID_AUTO, swd_panic, CTLFLAG_RW, &gSwdPanic, 0, ""); +#if DEVELOPMENT || DEBUG +static SYSCTL_INT(_debug, OID_AUTO, swd_panic_phase, CTLFLAG_RW, &swd_panic_phase, 0, ""); +#endif + +#if !(defined(RC_HIDE_N144) || defined(RC_HIDE_N146)) +//****************************************************************************** +// AOT + +static int +sysctl_aotmetrics SYSCTL_HANDLER_ARGS +{ + if (NULL == gRootDomain) { + return ENOENT; + } + if (NULL == gRootDomain->_aotMetrics) { + return ENOENT; + } + return sysctl_io_opaque(req, gRootDomain->_aotMetrics, sizeof(IOPMAOTMetrics), NULL); +} + +static SYSCTL_PROC(_kern, OID_AUTO, aotmetrics, + CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_NOAUTO | CTLFLAG_KERN | CTLFLAG_LOCKED | CTLFLAG_ANYBODY, + NULL, 0, sysctl_aotmetrics, "S,IOPMAOTMetrics", ""); + + +static int +update_aotmode(uint32_t mode) +{ + int result; + + if (!gIOPMWorkLoop) { + return ENOENT; + } + result = gIOPMWorkLoop->runActionBlock(^IOReturn (void) { + unsigned int oldCount; + + if (mode && !gRootDomain->_aotMetrics) { + gRootDomain->_aotMetrics = IONewZero(IOPMAOTMetrics, 1); + if (!gRootDomain->_aotMetrics) { + return ENOMEM; + } + } + + oldCount = gRootDomain->idleSleepPreventersCount(); + gRootDomain->_aotMode = mode; + gRootDomain->updatePreventIdleSleepListInternal(NULL, false, oldCount); + return 0; + }); + return result; +} + +static int +sysctl_aotmodebits +(__unused struct sysctl_oid *oidp, __unused void *arg1, __unused int arg2, struct sysctl_req *req) +{ + int error, changed; + uint32_t new_value; + + if (NULL == gRootDomain) { + return ENOENT; + } + error = sysctl_io_number(req, gRootDomain->_aotMode, sizeof(uint32_t), &new_value, &changed); + if (changed && gIOPMWorkLoop) { + error = update_aotmode(new_value); + } + + return error; +} + +static SYSCTL_PROC(_kern, OID_AUTO, aotmodebits, + CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NOAUTO | CTLFLAG_KERN | CTLFLAG_LOCKED, + NULL, 0, sysctl_aotmodebits, "I", ""); + +static int +sysctl_aotmode +(__unused struct sysctl_oid *oidp, __unused void *arg1, __unused int arg2, struct sysctl_req *req) +{ + int error, changed; + uint32_t new_value; + + if (NULL == gRootDomain) { + return ENOENT; + } + error = sysctl_io_number(req, gRootDomain->_aotMode, sizeof(uint32_t), &new_value, &changed); + if (changed && gIOPMWorkLoop) { + if (new_value) { + new_value = kIOPMAOTModeDefault; // & ~kIOPMAOTModeRespectTimers; + } + error = update_aotmode(new_value); + } + + return error; +} + +static SYSCTL_PROC(_kern, OID_AUTO, aotmode, + CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NOAUTO | CTLFLAG_KERN | CTLFLAG_LOCKED | CTLFLAG_ANYBODY, + NULL, 0, sysctl_aotmode, "I", ""); + +//****************************************************************************** +#endif /* !(defined(RC_HIDE_N144) || defined(RC_HIDE_N146)) */ static const OSSymbol * gIOPMSettingAutoWakeCalendarKey; static const OSSymbol * gIOPMSettingAutoWakeSecondsKey; +static const OSSymbol * gIOPMSettingAutoPowerCalendarKey; +static const OSSymbol * gIOPMSettingAutoPowerSecondsKey; static const OSSymbol * gIOPMSettingDebugWakeRelativeKey; +static const OSSymbol * gIOPMSettingDebugPowerRelativeKey; static const OSSymbol * gIOPMSettingMaintenanceWakeCalendarKey; static const OSSymbol * gIOPMSettingSleepServiceWakeCalendarKey; static const OSSymbol * gIOPMSettingSilentRunningKey; @@ -1131,7 +1363,10 @@ IOPMrootDomain::start( IOService * nub ) gRootDomain = this; gIOPMSettingAutoWakeCalendarKey = OSSymbol::withCString(kIOPMSettingAutoWakeCalendarKey); gIOPMSettingAutoWakeSecondsKey = OSSymbol::withCString(kIOPMSettingAutoWakeSecondsKey); + gIOPMSettingAutoPowerCalendarKey = OSSymbol::withCString(kIOPMSettingAutoPowerCalendarKey); + gIOPMSettingAutoPowerSecondsKey = OSSymbol::withCString(kIOPMSettingAutoPowerSecondsKey); gIOPMSettingDebugWakeRelativeKey = OSSymbol::withCString(kIOPMSettingDebugWakeRelativeKey); + gIOPMSettingDebugPowerRelativeKey = OSSymbol::withCString(kIOPMSettingDebugPowerRelativeKey); gIOPMSettingMaintenanceWakeCalendarKey = OSSymbol::withCString(kIOPMSettingMaintenanceWakeCalendarKey); gIOPMSettingSleepServiceWakeCalendarKey = OSSymbol::withCString(kIOPMSettingSleepServiceWakeCalendarKey); gIOPMSettingSilentRunningKey = OSSymbol::withCStringNoCopy(kIOPMSettingSilentRunningKey); @@ -1151,11 +1386,11 @@ IOPMrootDomain::start( IOService * nub ) { OSSymbol::withCString(kIOPMSettingSleepOnPowerButtonKey), gIOPMSettingAutoWakeSecondsKey, - OSSymbol::withCString(kIOPMSettingAutoPowerSecondsKey), + gIOPMSettingAutoPowerSecondsKey, gIOPMSettingAutoWakeCalendarKey, - OSSymbol::withCString(kIOPMSettingAutoPowerCalendarKey), + gIOPMSettingAutoPowerCalendarKey, gIOPMSettingDebugWakeRelativeKey, - OSSymbol::withCString(kIOPMSettingDebugPowerRelativeKey), + gIOPMSettingDebugPowerRelativeKey, OSSymbol::withCString(kIOPMSettingWakeOnRingKey), OSSymbol::withCString(kIOPMSettingRestartOnPowerLossKey), OSSymbol::withCString(kIOPMSettingWakeOnClamshellKey), @@ -1191,6 +1426,14 @@ IOPMrootDomain::start( IOService * nub ) idleSleepTimerExpired, (thread_call_param_t) this); + powerButtonDown = thread_call_allocate( + powerButtonDownCallout, + (thread_call_param_t) this); + + powerButtonUp = thread_call_allocate( + powerButtonUpCallout, + (thread_call_param_t) this); + diskSyncCalloutEntry = thread_call_allocate( &disk_sync_callout, (thread_call_param_t) this); @@ -1291,6 +1534,14 @@ IOPMrootDomain::start( IOService * nub ) &IOPMrootDomain::dispatchPowerEvent)); gIOPMWorkLoop->addEventSource(pmPowerStateQueue); +#if !(defined(RC_HIDE_N144) || defined(RC_HIDE_N146)) + _aotMode = 0; + _aotTimerES = IOTimerEventSource::timerEventSource(this, + OSMemberFunctionCast(IOTimerEventSource::Action, + this, &IOPMrootDomain::aotEvaluate)); + gIOPMWorkLoop->addEventSource(_aotTimerES); +#endif /* !(defined(RC_HIDE_N144) || defined(RC_HIDE_N146)) */ + // create our power parent patriarch = new IORootParent; patriarch->init(); @@ -1302,7 +1553,7 @@ IOPMrootDomain::start( IOService * nub ) changePowerStateToPriv(ON_STATE); // install power change handler - gSysPowerDownNotifier = registerPrioritySleepWakeInterest( &sysPowerDownHandler, this, 0); + gSysPowerDownNotifier = registerPrioritySleepWakeInterest( &sysPowerDownHandler, this, NULL); #if !NO_KERNEL_HID // Register for a notification when IODisplayWrangler is published @@ -1310,7 +1561,7 @@ IOPMrootDomain::start( IOService * nub ) _displayWranglerNotifier = addMatchingNotification( gIOPublishNotification, tmpDict, (IOServiceMatchingNotificationHandler) & displayWranglerMatchPublished, - this, 0); + this, NULL); tmpDict->release(); } #endif @@ -1354,6 +1605,8 @@ IOPMrootDomain::start( IOService * nub ) psIterator->release(); } + // read swd_panic boot-arg + PE_parse_boot_argn("swd_panic", &gSwdPanic, sizeof(gSwdPanic)); sysctl_register_oid(&sysctl__kern_sleeptime); sysctl_register_oid(&sysctl__kern_waketime); sysctl_register_oid(&sysctl__kern_willshutdown); @@ -1369,6 +1622,12 @@ IOPMrootDomain::start( IOService * nub ) sysctl_register_oid(&sysctl__kern_consoleoptions); sysctl_register_oid(&sysctl__kern_progressoptions); +#if !(defined(RC_HIDE_N144) || defined(RC_HIDE_N146)) + sysctl_register_oid(&sysctl__kern_aotmode); + sysctl_register_oid(&sysctl__kern_aotmodebits); + sysctl_register_oid(&sysctl__kern_aotmetrics); +#endif /* !(defined(RC_HIDE_N144) || defined(RC_HIDE_N146)) */ + #if HIBERNATION IOHibernateSystemInit(this); #endif @@ -1394,7 +1653,46 @@ IOPMrootDomain::setProperties( OSObject * props_obj ) OSNumber *n; const OSSymbol *key; OSObject *obj; - OSCollectionIterator * iter = 0; + OSCollectionIterator * iter = NULL; + + if (!dict) { + return kIOReturnBadArgument; + } + + bool clientEntitled = false; + obj = IOUserClient::copyClientEntitlement(current_task(), kRootDomainEntitlementSetProperty); + clientEntitled = (obj == kOSBooleanTrue); + OSSafeReleaseNULL(obj); + + if (!clientEntitled) { + const char * errorSuffix = NULL; + + // IOPMSchedulePowerEvent() clients may not be entitled, but must be root. + // That API can set 6 possible keys that are checked below. + if ((dict->getCount() == 1) && + (dict->getObject(gIOPMSettingAutoWakeSecondsKey) || + dict->getObject(gIOPMSettingAutoPowerSecondsKey) || + dict->getObject(gIOPMSettingAutoWakeCalendarKey) || + dict->getObject(gIOPMSettingAutoPowerCalendarKey) || + dict->getObject(gIOPMSettingDebugWakeRelativeKey) || + dict->getObject(gIOPMSettingDebugPowerRelativeKey))) { + return_value = IOUserClient::clientHasPrivilege(current_task(), kIOClientPrivilegeAdministrator); + if (return_value != kIOReturnSuccess) { + errorSuffix = "privileged"; + } + } else { + return_value = kIOReturnNotPermitted; + errorSuffix = "entitled"; + } + + if (return_value != kIOReturnSuccess) { + OSString * procName = IOCopyLogNameForPID(proc_selfpid()); + DLOG("%s failed, process %s is not %s\n", __func__, + procName ? procName->getCStringNoCopy() : "", errorSuffix); + OSSafeReleaseNULL(procName); + return return_value; + } + } const OSSymbol *publish_simulated_battery_string = OSSymbol::withCString("SoftwareSimulatedBatteries"); const OSSymbol *boot_complete_string = OSSymbol::withCString("System Boot Complete"); @@ -1416,11 +1714,6 @@ IOPMrootDomain::setProperties( OSObject * props_obj ) const OSSymbol *hibernatefreetime_string = OSSymbol::withCString(kIOHibernateFreeTimeKey); #endif - if (!dict) { - return_value = kIOReturnBadArgument; - goto exit; - } - iter = OSCollectionIterator::withCollection(dict); if (!iter) { return_value = kIOReturnNoMemory; @@ -1533,10 +1826,12 @@ IOPMrootDomain::setProperties( OSObject * props_obj ) (data->getLength() == sizeof(IOPMCalendarStruct))) { const IOPMCalendarStruct * cs = (const IOPMCalendarStruct *) data->getBytesNoCopy(); - + IOLog("gIOPMSettingAutoWakeCalendarKey " YMDTF "\n", YMDT(cs)); if (cs->year) { + _scheduledAlarmUTC = IOPMConvertCalendarToSeconds(cs); OSBitOrAtomic(kIOPMAlarmBitCalendarWake, &_scheduledAlarms); } else { + _scheduledAlarmUTC = 0; OSBitAndAtomic(~kIOPMAlarmBitCalendarWake, &_scheduledAlarms); } DLOG("_scheduledAlarms = 0x%x\n", (uint32_t) _scheduledAlarms); @@ -1631,8 +1926,13 @@ IOPMrootDomain::setAggressiveness( AggressivesRequest * request; bool found = false; - DLOG("setAggressiveness(%x) 0x%x = %u\n", - (uint32_t) options, (uint32_t) type, (uint32_t) value); + if (type == kPMMinutesToDim || type == kPMMinutesToSleep) { + DLOG("setAggressiveness(%x) %s = %u\n", + (uint32_t) options, getAggressivenessTypeString((uint32_t) type), (uint32_t) value); + } else { + DEBUG_LOG("setAggressiveness(%x) %s = %u\n", + (uint32_t) options, getAggressivenessTypeString((uint32_t) type), (uint32_t) value); + } request = IONew(AggressivesRequest, 1); if (!request) { @@ -1756,8 +2056,6 @@ IOPMrootDomain::getAggressiveness( AGGRESSIVES_UNLOCK(); if (source) { - DLOG("getAggressiveness(%d) 0x%x = %u\n", - source, (uint32_t) type, value); *outLevel = (unsigned long) value; return kIOReturnSuccess; } else { @@ -1783,7 +2081,7 @@ IOPMrootDomain::joinAggressiveness( return kIOReturnBadArgument; } - DLOG("joinAggressiveness %s %p\n", service->getName(), OBFUSCATE(service)); + DEBUG_LOG("joinAggressiveness %s %p\n", service->getName(), OBFUSCATE(service)); request = IONew(AggressivesRequest, 1); if (!request) { @@ -1988,14 +2286,14 @@ IOPMrootDomain::synchronizeAggressives( if (request->dataType == kAggressivesRequestTypeService) { service = request->data.service; } else { - service = 0; + service = NULL; } IODelete(request, AggressivesRequest, 1); - request = 0; + request = NULL; if (service) { - if (service->assertPMDriverCall(&callEntry)) { + if (service->assertPMDriverCall(&callEntry, kIOPMDriverCallMethodSetAggressive)) { for (i = 0, record = array; i < count; i++, record++) { value = record->value; if (record->flags & kAggressivesRecordFlagMinValue) { @@ -2045,7 +2343,7 @@ IOPMrootDomain::broadcastAggressives( } if ((service = OSDynamicCast(IOService, connect->copyChildEntry(gIOPowerPlane)))) { - if (service->assertPMDriverCall(&callEntry)) { + if (service->assertPMDriverCall(&callEntry, kIOPMDriverCallMethodSetAggressive)) { for (i = 0, record = array; i < count; i++, record++) { if (record->flags & kAggressivesRecordFlagModified) { value = record->value; @@ -2067,6 +2365,31 @@ IOPMrootDomain::broadcastAggressives( } } +//***************************************** +// stackshot on power button press +// *************************************** +static void +powerButtonDownCallout(thread_call_param_t us, thread_call_param_t ) +{ + /* Power button pressed during wake + * Take a stackshot + */ + DEBUG_LOG("Powerbutton: down. Taking stackshot\n"); + ((IOPMrootDomain *)us)->takeStackshot(false); +} + +static void +powerButtonUpCallout(thread_call_param_t us, thread_call_param_t) +{ + /* Power button released. + * Delete any stackshot data + */ + DEBUG_LOG("PowerButton: up callout. Delete stackshot\n"); + ((IOPMrootDomain *)us)->deleteStackshot(); +} +//************************************************************************* +// + // MARK: - // MARK: System Sleep @@ -2302,7 +2625,6 @@ IOPMrootDomain::privateSleepSystem( uint32_t sleepReason ) // // This overrides powerChangeDone in IOService. //****************************************************************************** - void IOPMrootDomain::powerChangeDone( unsigned long previousPowerState ) { @@ -2310,34 +2632,85 @@ IOPMrootDomain::powerChangeDone( unsigned long previousPowerState ) uint64_t timeSinceReset = 0; #endif uint64_t now; + unsigned long newState; + clock_sec_t secs; + clock_usec_t microsecs; +#if !(defined(RC_HIDE_N144) || defined(RC_HIDE_N146)) + clock_sec_t adjWakeTime; + IOPMCalendarStruct nowCalendar; +#endif /* !(defined(RC_HIDE_N144) || defined(RC_HIDE_N146)) */ + ASSERT_GATED(); - DLOG("PowerChangeDone: %u->%u\n", - (uint32_t) previousPowerState, (uint32_t) getPowerState()); + newState = getPowerState(); + DLOG("PowerChangeDone: %s->%s\n", + getPowerStateString((uint32_t) previousPowerState), getPowerStateString((uint32_t) getPowerState())); + + if (previousPowerState == newState) { + return; + } notifierThread = current_thread(); switch (getPowerState()) { case SLEEP_STATE: { - if (previousPowerState != ON_STATE) { - break; - } +#if !(defined(RC_HIDE_N144) || defined(RC_HIDE_N146)) + if (kPMCalendarTypeInvalid != _aotWakeTimeCalendar.selector) { + secs = 0; + microsecs = 0; + PEGetUTCTimeOfDay(&secs, µsecs); + + adjWakeTime = 0; + if ((kIOPMAOTModeRespectTimers & _aotMode) && (_scheduledAlarmUTC < _aotWakeTimeUTC)) { + IOLog("use _scheduledAlarmUTC\n"); + adjWakeTime = _scheduledAlarmUTC; + } else if (_aotExit || (kIOPMWakeEventAOTExitFlags & _aotPendingFlags)) { + IOLog("accelerate _aotWakeTime for exit\n"); + adjWakeTime = secs; + } else if (kIOPMDriverAssertionLevelOn == getPMAssertionLevel(kIOPMDriverAssertionCPUBit)) { + IOLog("accelerate _aotWakeTime for assertion\n"); + adjWakeTime = secs; + } + if (adjWakeTime) { + IOPMConvertSecondsToCalendar(adjWakeTime, &_aotWakeTimeCalendar); + } + + IOPMConvertSecondsToCalendar(secs, &nowCalendar); + IOLog("aotSleep at " YMDTF " sched: " YMDTF "\n", YMDT(&nowCalendar), YMDT(&_aotWakeTimeCalendar)); + IOReturn __unused ret = setMaintenanceWakeCalendar(&_aotWakeTimeCalendar); + assert(kIOReturnSuccess == ret); + } + if (_aotLastWakeTime) { + _aotMetrics->totalTime += mach_absolute_time() - _aotLastWakeTime; + if (_aotMetrics->sleepCount && (_aotMetrics->sleepCount <= kIOPMAOTMetricsKernelWakeCountMax)) { + strlcpy(&_aotMetrics->kernelWakeReason[_aotMetrics->sleepCount - 1][0], + gWakeReasonString, + sizeof(_aotMetrics->kernelWakeReason[_aotMetrics->sleepCount])); + } + } + _aotPendingFlags &= ~kIOPMWakeEventAOTPerCycleFlags; +#endif /* !(defined(RC_HIDE_N144) || defined(RC_HIDE_N146)) */ acceptSystemWakeEvents(true); // re-enable this timer for next sleep cancelIdleSleepTimer(); - clock_sec_t secs; - clock_usec_t microsecs; clock_get_calendar_absolute_and_microtime(&secs, µsecs, &now); logtime(secs); gIOLastSleepTime.tv_sec = secs; gIOLastSleepTime.tv_usec = microsecs; +#if !(defined(RC_HIDE_N144) || defined(RC_HIDE_N146)) + if (!_aotLastWakeTime) { + gIOLastUserSleepTime = gIOLastSleepTime; + } +#else + gIOLastUserSleepTime = gIOLastSleepTime; +#endif /* !(defined(RC_HIDE_N144) || defined(RC_HIDE_N146)) */ + gIOLastWakeTime.tv_sec = 0; gIOLastWakeTime.tv_usec = 0; gIOLastSleepAbsTime = now; if (wake2DarkwakeDelay && sleepDelaysReport) { - clock_usec_t microsecs; clock_sec_t wake2DarkwakeSecs, darkwake2SleepSecs; // Update 'wake2DarkwakeDelay' histogram if this is a fullwake->sleep transition @@ -2376,6 +2749,8 @@ IOPMrootDomain::powerChangeDone( unsigned long previousPowerState ) clock_usec_t microsecs = 0; uint64_t now_b = mach_absolute_time(); + secs = 0; + microsecs = 0; PEGetUTCTimeOfDay(&secs, µsecs); uint64_t now_a = mach_absolute_time(); @@ -2401,16 +2776,37 @@ IOPMrootDomain::powerChangeDone( unsigned long previousPowerState ) gSleepOrShutdownPending = 0; // trip the reset of the calendar clock - { - clock_sec_t wakeSecs; - clock_usec_t wakeMicrosecs; - - clock_wakeup_calendar(); - - clock_get_calendar_microtime(&wakeSecs, &wakeMicrosecs); - gIOLastWakeTime.tv_sec = wakeSecs; - gIOLastWakeTime.tv_usec = wakeMicrosecs; + clock_wakeup_calendar(); + clock_get_calendar_microtime(&secs, µsecs); + gIOLastWakeTime.tv_sec = secs; + gIOLastWakeTime.tv_usec = microsecs; + +#if !(defined(RC_HIDE_N144) || defined(RC_HIDE_N146)) + // aot + if (_aotWakeTimeCalendar.selector != kPMCalendarTypeInvalid) { + _aotWakeTimeCalendar.selector = kPMCalendarTypeInvalid; + secs = 0; + microsecs = 0; + PEGetUTCTimeOfDay(&secs, µsecs); + IOPMConvertSecondsToCalendar(secs, &nowCalendar); + IOLog("aotWake at " YMDTF " sched: " YMDTF "\n", YMDT(&nowCalendar), YMDT(&_aotWakeTimeCalendar)); + _aotMetrics->sleepCount++; + _aotLastWakeTime = gIOLastWakeAbsTime; + if (_aotMetrics->sleepCount <= kIOPMAOTMetricsKernelWakeCountMax) { + _aotMetrics->kernelSleepTime[_aotMetrics->sleepCount - 1] + = (((uint64_t) gIOLastSleepTime.tv_sec) << 10) + (gIOLastSleepTime.tv_usec / 1000); + _aotMetrics->kernelWakeTime[_aotMetrics->sleepCount - 1] + = (((uint64_t) gIOLastWakeTime.tv_sec) << 10) + (gIOLastWakeTime.tv_usec / 1000); + } + + if (_aotTestTime) { + if (_aotWakeTimeUTC <= secs) { + _aotTestTime = _aotTestTime + _aotTestInterval; + } + setWakeTime(_aotTestTime); + } } +#endif /* !(defined(RC_HIDE_N144) || defined(RC_HIDE_N146)) */ #if HIBERNATION LOG("System %sWake\n", gIOHibernateState ? "SafeSleep " : ""); @@ -2434,6 +2830,7 @@ IOPMrootDomain::powerChangeDone( unsigned long previousPowerState ) sleepToStandby = false; wranglerTickleLatched = false; userWasActive = false; + isRTCAlarmWake = false; fullWakeReason = kFullWakeReasonNone; OSString * wakeType = OSDynamicCast( @@ -2459,9 +2856,11 @@ IOPMrootDomain::powerChangeDone( unsigned long previousPowerState ) !(hibOptions->unsigned32BitValue() & kIOHibernateOptionDarkWake)))) { // Hibernate aborted, or EFI brought up graphics wranglerTickled = true; - DLOG("hibernation aborted %d, options 0x%x\n", - hibernateAborted, - hibOptions ? hibOptions->unsigned32BitValue() : 0); + if (hibernateAborted) { + DLOG("Hibernation aborted\n"); + } else { + DLOG("EFI brought up graphics. Going to full wake. HibOptions: 0x%x\n", hibOptions->unsigned32BitValue()); + } } else #endif if (wakeType && ( @@ -2469,6 +2868,9 @@ IOPMrootDomain::powerChangeDone( unsigned long previousPowerState ) wakeType->isEqualTo(kIOPMRootDomainWakeTypeAlarm))) { // User wake or RTC alarm wranglerTickled = true; + if (wakeType->isEqualTo(kIOPMRootDomainWakeTypeAlarm)) { + isRTCAlarmWake = true; + } } else if (wakeType && wakeType->isEqualTo(kIOPMRootDomainWakeTypeSleepTimer)) { // SMC standby timer trumps SleepX @@ -2546,21 +2948,30 @@ IOPMrootDomain::powerChangeDone( unsigned long previousPowerState ) thread_call_enter(updateConsoleUsersEntry); - changePowerStateToPriv(ON_STATE); - } break; + changePowerStateToPriv(getRUN_STATE()); + break; + } #if !__i386__ && !__x86_64__ - case ON_STATE: { - if (previousPowerState != ON_STATE) { - DLOG("Force re-evaluating aggressiveness\n"); - /* Force re-evaluate the aggressiveness values to set appropriate idle sleep timer */ - pmPowerStateQueue->submitPowerEvent( - kPowerEventPolicyStimulus, - (void *) kStimulusNoIdleSleepPreventers ); + case ON_STATE: + case AOT_STATE: + { + DLOG("Force re-evaluating aggressiveness\n"); + /* Force re-evaluate the aggressiveness values to set appropriate idle sleep timer */ + pmPowerStateQueue->submitPowerEvent( + kPowerEventPolicyStimulus, + (void *) kStimulusNoIdleSleepPreventers ); + + // After changing to ON_STATE, invalidate any previously queued + // request to change to a state less than ON_STATE. This isn't + // necessary for AOT_STATE or if the device has only one running + // state since the changePowerStateToPriv() issued at the tail + // end of SLEEP_STATE case should take care of that. + if (getPowerState() == ON_STATE) { + changePowerStateToPriv(ON_STATE); } break; } - -#endif +#endif /* !__i386__ && !__x86_64__ */ } notifierThread = NULL; } @@ -2594,38 +3005,49 @@ IOPMrootDomain::requestPowerDomainState( bool IOPMrootDomain::updatePreventIdleSleepList( - IOService * service, bool addNotRemove ) + IOService * service, bool addNotRemove) { - unsigned int oldCount, newCount; + unsigned int oldCount; + + oldCount = idleSleepPreventersCount(); + return updatePreventIdleSleepListInternal(service, addNotRemove, oldCount); +} + +bool +IOPMrootDomain::updatePreventIdleSleepListInternal( + IOService * service, bool addNotRemove, unsigned int oldCount) +{ + unsigned int newCount; ASSERT_GATED(); #if defined(__i386__) || defined(__x86_64__) // Disregard disk I/O (besides the display wrangler) as a factor preventing // idle sleep, except in the case of legacy disk I/O - if ((service != wrangler) && (service != this)) { + if (service && (service != wrangler) && (service != this)) { return false; } #endif - oldCount = preventIdleSleepList->getCount(); - if (addNotRemove) { - preventIdleSleepList->setObject(service); - DLOG("prevent idle sleep list: %s+ (%u)\n", - service->getName(), preventIdleSleepList->getCount()); - } else if (preventIdleSleepList->member(service)) { - preventIdleSleepList->removeObject(service); - DLOG("prevent idle sleep list: %s- (%u)\n", - service->getName(), preventIdleSleepList->getCount()); + if (service) { + if (addNotRemove) { + preventIdleSleepList->setObject(service); + DLOG("prevent idle sleep list: %s+ (%u)\n", + service->getName(), preventIdleSleepList->getCount()); + } else if (preventIdleSleepList->member(service)) { + preventIdleSleepList->removeObject(service); + DLOG("prevent idle sleep list: %s- (%u)\n", + service->getName(), preventIdleSleepList->getCount()); + } } - newCount = preventIdleSleepList->getCount(); + newCount = idleSleepPreventersCount(); if ((oldCount == 0) && (newCount != 0)) { // Driver added to empty prevent list. // Update the driver desire to prevent idle sleep. // Driver desire does not prevent demand sleep. - changePowerStateTo(ON_STATE); + changePowerStateTo(getRUN_STATE()); } else if ((oldCount != 0) && (newCount == 0)) { // Last driver removed from prevent list. // Drop the driver clamp to allow idle sleep. @@ -2751,6 +3173,68 @@ IOPMrootDomain::copySleepPreventersList(OSArray **idleSleepList, OSArray **syste } } +void +IOPMrootDomain::copySleepPreventersListWithID(OSArray **idleSleepList, OSArray **systemSleepList) +{ + OSCollectionIterator *iterator = NULL; + OSObject *object = NULL; + OSArray *array = NULL; + + if (!gIOPMWorkLoop->inGate()) { + gIOPMWorkLoop->runAction( + OSMemberFunctionCast(IOWorkLoop::Action, this, + &IOPMrootDomain::IOPMrootDomain::copySleepPreventersListWithID), + this, (void *)idleSleepList, (void *)systemSleepList); + return; + } + + if (idleSleepList && preventIdleSleepList && (preventIdleSleepList->getCount() != 0)) { + iterator = OSCollectionIterator::withCollection(preventIdleSleepList); + array = OSArray::withCapacity(5); + + while ((object = iterator->getNextObject())) { + IOService *service = OSDynamicCast(IOService, object); + if (object) { + OSDictionary *dict = OSDictionary::withCapacity(2); + if (dict) { + OSNumber *id = OSNumber::withNumber(service->getRegistryEntryID(), 64); + dict->setObject(kIOPMDriverAssertionRegistryEntryIDKey, id); + dict->setObject(kIOPMDriverAssertionOwnerStringKey, OSSymbol::withCString(service->getName())); + array->setObject(dict); + id->release(); + dict->release(); + } + } + } + + iterator->release(); + *idleSleepList = array; + } + + if (systemSleepList && preventSystemSleepList && (preventSystemSleepList->getCount() != 0)) { + iterator = OSCollectionIterator::withCollection(preventSystemSleepList); + array = OSArray::withCapacity(5); + + while ((object = iterator->getNextObject())) { + IOService *service = OSDynamicCast(IOService, object); + if (object) { + OSDictionary *dict = OSDictionary::withCapacity(2); + if (dict) { + OSNumber *id = OSNumber::withNumber(service->getRegistryEntryID(), 64); + dict->setObject(kIOPMDriverAssertionRegistryEntryIDKey, id); + dict->setObject(kIOPMDriverAssertionOwnerStringKey, OSSymbol::withCString(service->getName())); + array->setObject(dict); + id->release(); + dict->release(); + } + } + } + + iterator->release(); + *systemSleepList = array; + } +} + //****************************************************************************** // tellChangeDown // @@ -2760,8 +3244,8 @@ IOPMrootDomain::copySleepPreventersList(OSArray **idleSleepList, OSArray **syste bool IOPMrootDomain::tellChangeDown( unsigned long stateNum ) { - DLOG("tellChangeDown %u->%u\n", - (uint32_t) getPowerState(), (uint32_t) stateNum); + DLOG("tellChangeDown %s->%s\n", + getPowerStateString((uint32_t) getPowerState()), getPowerStateString((uint32_t) stateNum)); if (SLEEP_STATE == stateNum) { // Legacy apps were already told in the full->dark transition @@ -2804,8 +3288,8 @@ IOPMrootDomain::tellChangeDown( unsigned long stateNum ) bool IOPMrootDomain::askChangeDown( unsigned long stateNum ) { - DLOG("askChangeDown %u->%u\n", - (uint32_t) getPowerState(), (uint32_t) stateNum); + DLOG("askChangeDown %s->%s\n", + getPowerStateString((uint32_t) getPowerState()), getPowerStateString((uint32_t) stateNum)); // Don't log for dark wake entry if (kSystemTransitionSleep == _systemTransitionType) { @@ -2860,6 +3344,16 @@ IOPMrootDomain::askChangeDownDone( *cancel = true; DLOG("cancel dark->sleep\n"); } +#if !(defined(RC_HIDE_N144) || defined(RC_HIDE_N146)) + if (_aotMode && (kPMCalendarTypeInvalid != _aotWakeTimeCalendar.selector)) { + uint64_t now = mach_continuous_time(); + if (((now + _aotWakePreWindow) >= _aotWakeTimeContinuous) + && (now < (_aotWakeTimeContinuous + _aotWakePostWindow))) { + *cancel = true; + IOLog("AOT wake window cancel: %qd, %qd\n", now, _aotWakeTimeContinuous); + } + } +#endif /* !(defined(RC_HIDE_N144) || defined(RC_HIDE_N146)) */ } } @@ -2930,8 +3424,8 @@ IOPMrootDomain::systemDidNotSleep( void ) void IOPMrootDomain::tellNoChangeDown( unsigned long stateNum ) { - DLOG("tellNoChangeDown %u->%u\n", - (uint32_t) getPowerState(), (uint32_t) stateNum); + DLOG("tellNoChangeDown %s->%s\n", + getPowerStateString((uint32_t) getPowerState()), getPowerStateString((uint32_t) stateNum)); // Sleep canceled, clear the sleep trace point. tracePoint(kIOPMTracePointSystemUp); @@ -2952,8 +3446,8 @@ IOPMrootDomain::tellNoChangeDown( unsigned long stateNum ) void IOPMrootDomain::tellChangeUp( unsigned long stateNum ) { - DLOG("tellChangeUp %u->%u\n", - (uint32_t) getPowerState(), (uint32_t) stateNum); + DLOG("tellChangeUp %s->%s\n", + getPowerStateString((uint32_t) getPowerState()), getPowerStateString((uint32_t) stateNum)); ignoreTellChangeDown = false; @@ -2969,7 +3463,21 @@ IOPMrootDomain::tellChangeUp( unsigned long stateNum ) NULL, NULL, NULL); if (getPowerState() == ON_STATE) { - // this is a quick wake from aborted sleep + // Sleep was cancelled by idle cancel or revert + if (!CAP_CURRENT(kIOPMSystemCapabilityGraphics)) { + // rdar://problem/50363791 + // If system is in dark wake and sleep is cancelled, do not + // send SystemWillPowerOn/HasPoweredOn messages to kernel + // priority clients. They haven't yet seen a SystemWillSleep + // message before the cancellation. So make sure the kernel + // client bit is cleared in _systemMessageClientMask before + // invoking the tellClients() below. This bit may have been + // set by handleOurPowerChangeStart() anticipating a successful + // sleep and setting the filter mask ahead of time allows the + // SystemWillSleep message to go through. + _systemMessageClientMask &= ~kSystemMessageClientKernel; + } + systemDidNotSleep(); tellClients( kIOMessageSystemWillPowerOn ); } @@ -3011,10 +3519,29 @@ IOPMrootDomain::sysPowerDownHandler( UInt32 messageType, IOService * service, void * messageArgs, vm_size_t argSize ) { + static UInt32 lastSystemMessageType = 0; IOReturn ret = 0; DLOG("sysPowerDownHandler message %s\n", getIOMessageString(messageType)); + // rdar://problem/50363791 + // Sanity check to make sure the SystemWill/Has message types are + // received in the expected order for all kernel priority clients. + if (messageType == kIOMessageSystemWillSleep || + messageType == kIOMessageSystemWillPowerOn || + messageType == kIOMessageSystemHasPoweredOn) { + switch (messageType) { + case kIOMessageSystemWillPowerOn: + assert(lastSystemMessageType == kIOMessageSystemWillSleep); + break; + case kIOMessageSystemHasPoweredOn: + assert(lastSystemMessageType == kIOMessageSystemWillPowerOn); + break; + } + + lastSystemMessageType = messageType; + } + if (!gRootDomain) { return kIOReturnUnsupported; } @@ -3235,9 +3762,9 @@ IOPMrootDomain::initializeBootSessionUUID(void) IOReturn IOPMrootDomain::changePowerStateTo( unsigned long ordinal ) { - DLOG("changePowerStateTo(%lu)\n", ordinal); + DLOG("changePowerStateTo(%u)\n", (uint32_t) ordinal); - if ((ordinal != ON_STATE) && (ordinal != SLEEP_STATE)) { + if ((ordinal != ON_STATE) && (ordinal != AOT_STATE) && (ordinal != SLEEP_STATE)) { return kIOReturnUnsupported; } @@ -3247,9 +3774,9 @@ IOPMrootDomain::changePowerStateTo( unsigned long ordinal ) IOReturn IOPMrootDomain::changePowerStateToPriv( unsigned long ordinal ) { - DLOG("changePowerStateToPriv(%lu)\n", ordinal); + DLOG("changePowerStateToPriv(%u)\n", (uint32_t) ordinal); - if ((ordinal != ON_STATE) && (ordinal != SLEEP_STATE)) { + if ((ordinal != ON_STATE) && (ordinal != AOT_STATE) && (ordinal != SLEEP_STATE)) { return kIOReturnUnsupported; } @@ -3309,7 +3836,7 @@ IOPMrootDomain::willNotifyPowerChildren( IOPMPowerStateIndex newPowerState ) if (!tasksSuspended) { AbsoluteTime deadline; tasksSuspended = TRUE; - tasks_system_suspend(tasksSuspended); + updateTasksSuspend(); clock_interval_to_deadline(10, kSecondScale, &deadline); #if !CONFIG_EMBEDDED @@ -3317,6 +3844,44 @@ IOPMrootDomain::willNotifyPowerChildren( IOPMPowerStateIndex newPowerState ) #endif /* !CONFIG_EMBEDDED */ } +#if !(defined(RC_HIDE_N144) || defined(RC_HIDE_N146)) + _aotReadyToFullWake = false; +#if 0 + if (_aotLingerTime) { + uint64_t deadline; + IOLog("aot linger no return\n"); + clock_absolutetime_interval_to_deadline(_aotLingerTime, &deadline); + clock_delay_until(deadline); + } +#endif + if (!_aotMode) { + _aotTestTime = 0; + _aotWakeTimeCalendar.selector = kPMCalendarTypeInvalid; + if (_aotMetrics) { + bzero(_aotMetrics, sizeof(IOPMAOTMetrics)); + } + } else if (!_aotNow && !_debugWakeSeconds) { + _aotNow = true; + _aotExit = false; + _aotPendingFlags = 0; + _aotTasksSuspended = true; + _aotLastWakeTime = 0; + bzero(_aotMetrics, sizeof(IOPMAOTMetrics)); + if (kIOPMAOTModeCycle & _aotMode) { + clock_interval_to_absolutetime_interval(60, kSecondScale, &_aotTestInterval); + _aotTestTime = mach_continuous_time() + _aotTestInterval; + setWakeTime(_aotTestTime); + } + uint32_t lingerSecs; + if (!PE_parse_boot_argn("aotlinger", &lingerSecs, sizeof(lingerSecs))) { + lingerSecs = 0; + } + clock_interval_to_absolutetime_interval(lingerSecs, kSecondScale, &_aotLingerTime); + clock_interval_to_absolutetime_interval(2000, kMillisecondScale, &_aotWakePreWindow); + clock_interval_to_absolutetime_interval(1100, kMillisecondScale, &_aotWakePostWindow); + } +#endif /* !(defined(RC_HIDE_N144) || defined(RC_HIDE_N146)) */ + #if HIBERNATION IOHibernateSystemSleep(); IOHibernateIOKitSleep(); @@ -3362,6 +3927,21 @@ IOPMrootDomain::shouldSleepOnClamshellClosed( void ) return !clamshellDisabled && !(desktopMode && acAdaptorConnected) && !clamshellSleepDisabled; } +bool +IOPMrootDomain::shouldSleepOnRTCAlarmWake( void ) +{ + // Called once every RTC/Alarm wake. Device should go to sleep if on clamshell + // closed && battery + if (!clamshellExists) { + return false; + } + + DLOG("shouldSleepOnRTCAlarmWake: clamshell closed %d, disabled %d, desktopMode %d, ac %d sleepDisabled %d\n", + clamshellClosed, clamshellDisabled, desktopMode, acAdaptorConnected, clamshellSleepDisabled); + + return !acAdaptorConnected && !clamshellSleepDisabled; +} + void IOPMrootDomain::sendClientClamshellNotification( void ) { @@ -3544,7 +4124,7 @@ IOPMrootDomain::publishFeature( existing_feature_arr->setObject(new_feature_data); features->setObject(feature, existing_feature_arr); existing_feature_arr->release(); - existing_feature_arr = 0; + existing_feature_arr = NULL; } } else { // The easy case: no previously existing features listed. We simply @@ -3733,8 +4313,8 @@ IOPMrootDomain::setPMSetting( const OSSymbol *type, OSObject *object ) { - PMSettingCallEntry *entries = 0; - OSArray *chosen = 0; + PMSettingCallEntry *entries = NULL; + OSArray *chosen = NULL; const OSArray *array; PMSettingObject *pmso; thread_t thisThread; @@ -3961,10 +4541,10 @@ IOPMrootDomain::deregisterPMSettingObject( PMSettingObject * pmso ) } } if (wait) { - assert(0 == pmso->waitThread); + assert(NULL == pmso->waitThread); pmso->waitThread = thisThread; PMSETTING_WAIT(pmso); - pmso->waitThread = 0; + pmso->waitThread = NULL; } } while (wait); @@ -4113,9 +4693,10 @@ IOPMrootDomain::evaluateSystemSleepPolicy( IOPMSystemSleepParameters * params, int sleepPhase, uint32_t * hibMode ) { const IOPMSystemSleepPolicyTable * pt; - OSObject * prop = 0; + OSObject * prop = NULL; OSData * policyData; uint64_t currentFactors = 0; + char currentFactorsBuf[512]; uint32_t standbyDelay = 0; uint32_t powerOffDelay = 0; uint32_t powerOffTimer = 0; @@ -4149,6 +4730,7 @@ IOPMrootDomain::evaluateSystemSleepPolicy( sleepPhase, standbyEnabled, standbyDelay, standbyTimer, powerOffEnabled, powerOffDelay, powerOffTimer, *hibMode); + currentFactorsBuf[0] = 0; // pmset level overrides if ((*hibMode & kIOHibernateModeOn) == 0) { if (!gSleepPolicyHandler) { @@ -4161,86 +4743,109 @@ IOPMrootDomain::evaluateSystemSleepPolicy( // If poweroff is enabled, force poweroff. if (standbyEnabled) { currentFactors |= kIOPMSleepFactorStandbyForced; + snprintf(currentFactorsBuf, sizeof(currentFactorsBuf), "%s, %s", currentFactorsBuf, "StandbyForced"); } else if (powerOffEnabled) { currentFactors |= kIOPMSleepFactorAutoPowerOffForced; + snprintf(currentFactorsBuf, sizeof(currentFactorsBuf), "%s, %s", currentFactorsBuf, "AutoPowerOffForced"); } else { currentFactors |= kIOPMSleepFactorHibernateForced; + snprintf(currentFactorsBuf, sizeof(currentFactorsBuf), "%s, %s", currentFactorsBuf, "HibernateForced"); } } // Current factors based on environment and assertions if (sleepTimerMaintenance) { currentFactors |= kIOPMSleepFactorSleepTimerWake; + snprintf(currentFactorsBuf, sizeof(currentFactorsBuf), "%s, %s", currentFactorsBuf, "SleepTimerWake"); } if (standbyEnabled && sleepToStandby && !gSleepPolicyHandler) { currentFactors |= kIOPMSleepFactorSleepTimerWake; + snprintf(currentFactorsBuf, sizeof(currentFactorsBuf), "%s, %s", currentFactorsBuf, "SleepTimerWake"); } if (!clamshellClosed) { currentFactors |= kIOPMSleepFactorLidOpen; + snprintf(currentFactorsBuf, sizeof(currentFactorsBuf), "%s, %s", currentFactorsBuf, "LidOpen"); } if (acAdaptorConnected) { currentFactors |= kIOPMSleepFactorACPower; + snprintf(currentFactorsBuf, sizeof(currentFactorsBuf), "%s, %s", currentFactorsBuf, "ACPower"); } if (lowBatteryCondition) { currentFactors |= kIOPMSleepFactorBatteryLow; + snprintf(currentFactorsBuf, sizeof(currentFactorsBuf), "%s, %s", currentFactorsBuf, "BatteryLow"); } if (!standbyDelay || !standbyTimer) { currentFactors |= kIOPMSleepFactorStandbyNoDelay; + snprintf(currentFactorsBuf, sizeof(currentFactorsBuf), "%s, %s", currentFactorsBuf, "StandbyNoDelay"); } if (standbyNixed || !standbyEnabled) { currentFactors |= kIOPMSleepFactorStandbyDisabled; + snprintf(currentFactorsBuf, sizeof(currentFactorsBuf), "%s, %s", currentFactorsBuf, "StandbyDisabled"); } if (resetTimers) { currentFactors |= kIOPMSleepFactorLocalUserActivity; currentFactors &= ~kIOPMSleepFactorSleepTimerWake; + snprintf(currentFactorsBuf, sizeof(currentFactorsBuf), "%s, %s", currentFactorsBuf, "LocalUserActivity, !SleepTimerWake"); } if (getPMAssertionLevel(kIOPMDriverAssertionUSBExternalDeviceBit) != kIOPMDriverAssertionLevelOff) { currentFactors |= kIOPMSleepFactorUSBExternalDevice; + snprintf(currentFactorsBuf, sizeof(currentFactorsBuf), "%s, %s", currentFactorsBuf, "USBExternalDevice"); } if (getPMAssertionLevel(kIOPMDriverAssertionBluetoothHIDDevicePairedBit) != kIOPMDriverAssertionLevelOff) { currentFactors |= kIOPMSleepFactorBluetoothHIDDevice; + snprintf(currentFactorsBuf, sizeof(currentFactorsBuf), "%s, %s", currentFactorsBuf, "BluetoothHIDDevice"); } if (getPMAssertionLevel(kIOPMDriverAssertionExternalMediaMountedBit) != kIOPMDriverAssertionLevelOff) { currentFactors |= kIOPMSleepFactorExternalMediaMounted; + snprintf(currentFactorsBuf, sizeof(currentFactorsBuf), "%s, %s", currentFactorsBuf, "ExternalMediaMounted"); } if (getPMAssertionLevel(kIOPMDriverAssertionReservedBit5) != kIOPMDriverAssertionLevelOff) { currentFactors |= kIOPMSleepFactorThunderboltDevice; + snprintf(currentFactorsBuf, sizeof(currentFactorsBuf), "%s, %s", currentFactorsBuf, "ThunderboltDevice"); } if (_scheduledAlarms != 0) { currentFactors |= kIOPMSleepFactorRTCAlarmScheduled; + snprintf(currentFactorsBuf, sizeof(currentFactorsBuf), "%s, %s", currentFactorsBuf, "RTCAlaramScheduled"); } if (getPMAssertionLevel(kIOPMDriverAssertionMagicPacketWakeEnabledBit) != kIOPMDriverAssertionLevelOff) { currentFactors |= kIOPMSleepFactorMagicPacketWakeEnabled; + snprintf(currentFactorsBuf, sizeof(currentFactorsBuf), "%s, %s", currentFactorsBuf, "MagicPacketWakeEnabled"); } #define TCPKEEPALIVE 1 #if TCPKEEPALIVE if (getPMAssertionLevel(kIOPMDriverAssertionNetworkKeepAliveActiveBit) != kIOPMDriverAssertionLevelOff) { currentFactors |= kIOPMSleepFactorNetworkKeepAliveActive; + snprintf(currentFactorsBuf, sizeof(currentFactorsBuf), "%s, %s", currentFactorsBuf, "NetworkKeepAliveActive"); } #endif if (!powerOffEnabled) { currentFactors |= kIOPMSleepFactorAutoPowerOffDisabled; + snprintf(currentFactorsBuf, sizeof(currentFactorsBuf), "%s, %s", currentFactorsBuf, "AutoPowerOffDisabled"); } if (desktopMode) { currentFactors |= kIOPMSleepFactorExternalDisplay; + snprintf(currentFactorsBuf, sizeof(currentFactorsBuf), "%s, %s", currentFactorsBuf, "ExternalDisplay"); } if (userWasActive) { currentFactors |= kIOPMSleepFactorLocalUserActivity; + snprintf(currentFactorsBuf, sizeof(currentFactorsBuf), "%s, %s", currentFactorsBuf, "LocalUserActivity"); } if (darkWakeHibernateError && !CAP_HIGHEST(kIOPMSystemCapabilityGraphics)) { currentFactors |= kIOPMSleepFactorHibernateFailed; + snprintf(currentFactorsBuf, sizeof(currentFactorsBuf), "%s, %s", currentFactorsBuf, "HibernateFailed"); } if (thermalWarningState) { currentFactors |= kIOPMSleepFactorThermalWarning; + snprintf(currentFactorsBuf, sizeof(currentFactorsBuf), "%s, %s", currentFactorsBuf, "ThermalWarning"); } - DLOG("sleep factors 0x%llx\n", currentFactors); + DLOG("sleep factors 0x%llx %s\n", currentFactors, currentFactorsBuf); if (gSleepPolicyHandler) { uint32_t savedHibernateMode; @@ -4524,7 +5129,7 @@ IOPMrootDomain::getSleepOption( const char * key, uint32_t * option ) { OSObject * optionsProp; OSDictionary * optionsDict; - OSObject * obj = 0; + OSObject * obj = NULL; OSNumber * num; bool ok = false; @@ -4677,7 +5282,7 @@ platformHaltRestartApplier( OSObject * object, void * context ) halt_log_enter("PowerOff/Restart message to priority client", (const void *) notifier->handler, elapsedTime); } - ctx->handler = 0; + ctx->handler = NULL; ctx->Counter++; } @@ -4751,7 +5356,13 @@ IOPMrootDomain::handlePlatformHaltRestart( UInt32 pe_type ) } gHaltRestartCtx.phase = kNotifyHaltRestartAction; +#if !CONFIG_EMBEDDED IOCPURunPlatformHaltRestartActions(pe_type); +#else + if (kPEPagingOff != pe_type) { + IOCPURunPlatformHaltRestartActions(pe_type); + } +#endif // Wait for PM to quiesce if ((kPEPagingOff != pe_type) && gPMHaltLock) { @@ -4876,7 +5487,7 @@ IOPMrootDomain::tagPowerPlaneService( } #if !NO_KERNEL_HID - isDisplayWrangler = (0 != service->metaCast("IODisplayWrangler")); + isDisplayWrangler = (NULL != service->metaCast("IODisplayWrangler")); if (isDisplayWrangler) { wrangler = service; // found the display wrangler, check for any display assertions already created @@ -5006,6 +5617,14 @@ IOPMrootDomain::overrideOurPowerChange( uint32_t changeFlags = *inOutChangeFlags; uint32_t currentPowerState = (uint32_t) getPowerState(); +#if !(defined(RC_HIDE_N144) || defined(RC_HIDE_N146)) + if ((AOT_STATE == powerState) && (ON_STATE == currentPowerState)) { + // Assertion may have been taken in AOT leading to changePowerStateTo(AOT) + *inOutChangeFlags |= kIOPMNotDone; + return; + } +#endif /* !(defined(RC_HIDE_N144) || defined(RC_HIDE_N146)) */ + if (changeFlags & kIOPMParentInitiated) { // Root parent is permanently pegged at max power, // a parent initiated power change is unexpected. @@ -5028,11 +5647,11 @@ IOPMrootDomain::overrideOurPowerChange( kIOPMSystemCapabilityAudio); // Convert to capability change (ON->ON) - *inOutPowerState = ON_STATE; + *inOutPowerState = getRUN_STATE(); *inOutChangeFlags |= kIOPMSynchronize; // Revert device desire from SLEEP to ON - changePowerStateToPriv(ON_STATE); + changePowerStateToPriv(getRUN_STATE()); } else { // System is in dark wake, ok to drop power state. // Broadcast root powering down to entire tree. @@ -5193,6 +5812,9 @@ IOPMrootDomain::handleOurPowerChangeStart( _systemMessageClientMask &= ~kSystemMessageClientLegacyApp; } if ((_highestCapability & kIOPMSystemCapabilityGraphics) == 0) { + // Kernel priority clients are only notified on the initial + // transition to full wake, so don't notify them unless system + // has gained graphics capability since the last system wake. _systemMessageClientMask &= ~kSystemMessageClientKernel; } #if HIBERNATION @@ -5214,6 +5836,10 @@ IOPMrootDomain::handleOurPowerChangeStart( tracePoint( kIOPMTracePointWakeWillPowerOnClients ); // Clear stats about sleep + if (AOT_STATE == powerState) { + _pendingCapability = 0; + } + if (_pendingCapability & kIOPMSystemCapabilityGraphics) { willEnterFullWake(); } else { @@ -5255,13 +5881,23 @@ IOPMrootDomain::handleOurPowerChangeStart( _systemStateGeneration++; systemDarkWake = false; - DLOG("=== START (%u->%u, 0x%x) type %u, gen %u, msg %x, " + DLOG("=== START (%s->%s, 0x%x) type %u, gen %u, msg %x, " "dcp %x:%x:%x\n", - currentPowerState, (uint32_t) powerState, *inOutChangeFlags, + getPowerStateString(currentPowerState), getPowerStateString((uint32_t) powerState), *inOutChangeFlags, _systemTransitionType, _systemStateGeneration, _systemMessageClientMask, _desiredCapability, _currentCapability, _pendingCapability); } + +#if !(defined(RC_HIDE_N144) || defined(RC_HIDE_N146)) + if ((AOT_STATE == powerState) && (SLEEP_STATE != currentPowerState)) { + panic("illegal AOT entry from %s", getPowerStateString(currentPowerState)); + } + if (_aotNow && (ON_STATE == powerState)) { + aotShouldExit(false, true); + aotExit(false); + } +#endif /* !(defined(RC_HIDE_N144) || defined(RC_HIDE_N146)) */ } void @@ -5304,7 +5940,7 @@ IOPMrootDomain::handleOurPowerChangeDone( } // Revert device desire to max. - changePowerStateToPriv(ON_STATE); + changePowerStateToPriv(getRUN_STATE()); } else { // Send message on dark wake to full wake promotion. // tellChangeUp() handles the normal SLEEP->ON case. @@ -5363,9 +5999,9 @@ IOPMrootDomain::handleOurPowerChangeDone( } } - DLOG("=== FINISH (%u->%u, 0x%x) type %u, gen %u, msg %x, " + DLOG("=== FINISH (%s->%s, 0x%x) type %u, gen %u, msg %x, " "dcp %x:%x:%x, dbgtimer %u\n", - currentPowerState, (uint32_t) powerState, changeFlags, + getPowerStateString(currentPowerState), getPowerStateString((uint32_t) powerState), changeFlags, _systemTransitionType, _systemStateGeneration, _systemMessageClientMask, _desiredCapability, _currentCapability, _pendingCapability, @@ -5432,6 +6068,11 @@ IOPMrootDomain::handleOurPowerChangeDone( DLOG("DisplayOn fullwake request is removed\n"); handleDisplayPowerOn(); } + + if (isRTCAlarmWake) { + pmPowerStateQueue->submitPowerEvent( + kPowerEventReceivedPowerNotification, (void *)(uintptr_t) kLocalEvalClamshellCommand ); + } } } @@ -5723,7 +6364,7 @@ IOPMrootDomain::handlePowerChangeDoneForPCIDevice( class IOPMServiceInterestNotifier : public _IOServiceInterestNotifier { friend class IOPMrootDomain; - OSDeclareDefaultStructors(IOPMServiceInterestNotifier) + OSDeclareDefaultStructors(IOPMServiceInterestNotifier); protected: uint32_t ackTimeoutCnt; @@ -5741,7 +6382,7 @@ IONotifier * IOPMrootDomain::registerInterest( IOServiceInterestHandler handler, void * target, void * ref ) { - IOPMServiceInterestNotifier *notifier = 0; + IOPMServiceInterestNotifier *notifier = NULL; bool isSystemCapabilityClient; bool isKernelCapabilityClient; IOReturn rc = kIOReturnError;; @@ -5768,7 +6409,7 @@ IONotifier * IOPMrootDomain::registerInterest( } if (rc != kIOReturnSuccess) { notifier->release(); - notifier = 0; + notifier = NULL; return NULL; } @@ -5835,6 +6476,7 @@ IOPMrootDomain::systemMessageFilter( IOPMServiceInterestNotifier *notifier; notifier = OSDynamicCast(IOPMServiceInterestNotifier, (OSObject *)object); + do { if ((kSystemTransitionNewCapClient == _systemTransitionType) && (!isCapMsg || !_joinedCapabilityClients || @@ -5956,7 +6598,7 @@ IOPMrootDomain::systemMessageFilter( DLOG("destroyed capability client set %p\n", OBFUSCATE(_joinedCapabilityClients)); _joinedCapabilityClients->release(); - _joinedCapabilityClients = 0; + _joinedCapabilityClients = NULL; } } if (notifier) { @@ -6090,7 +6732,7 @@ IOPMrootDomain::displayWranglerMatchPublished( #if !NO_KERNEL_HID // install a handler if (!newService->registerInterest( gIOGeneralInterest, - &displayWranglerNotification, target, 0)) { + &displayWranglerNotification, target, NULL)) { return false; } #endif @@ -6170,7 +6812,7 @@ void IOPMrootDomain::setDisplayPowerOn( uint32_t options ) { pmPowerStateQueue->submitPowerEvent( kPowerEventSetDisplayPowerOn, - (void *) 0, options ); + (void *) NULL, options ); } // MARK: - @@ -6211,18 +6853,18 @@ bool IOPMrootDomain::checkSystemSleepAllowed( IOOptionBits options, uint32_t sleepReason ) { - int err = 0; + uint32_t err = 0; // Conditions that prevent idle and demand system sleep. do { if (userDisabledAllSleep) { - err = 1; // 1. user-space sleep kill switch + err = kPMUserDisabledAllSleep; // 1. user-space sleep kill switch break; } if (systemBooting || systemShutdown || gWillShutdown) { - err = 2; // 2. restart or shutdown in progress + err = kPMSystemRestartBootingInProgress; // 2. restart or shutdown in progress break; } @@ -6235,7 +6877,7 @@ IOPMrootDomain::checkSystemSleepAllowed( IOOptionBits options, // dark wake, and must be called from gated context. #if !CONFIG_SLEEP - err = 3; // 3. config does not support sleep + err = kPMConfigPreventSystemSleep; // 3. config does not support sleep break; #endif @@ -6248,19 +6890,19 @@ IOPMrootDomain::checkSystemSleepAllowed( IOOptionBits options, } if (preventSystemSleepList->getCount() != 0) { - err = 4; // 4. child prevent system sleep clamp + err = kPMChildPreventSystemSleep; // 4. child prevent system sleep clamp break; } if (getPMAssertionLevel( kIOPMDriverAssertionCPUBit ) == kIOPMDriverAssertionLevelOn) { - err = 5; // 5. CPU assertion + err = kPMCPUAssertion; // 5. CPU assertion break; } if (pciCantSleepValid) { if (pciCantSleepFlag) { - err = 6; // 6. PCI card does not support PM (cached) + err = kPMPCIUnsupported; // 6. PCI card does not support PM (cached) } break; } else if (sleepSupportedPEFunction && @@ -6282,7 +6924,7 @@ IOPMrootDomain::checkSystemSleepAllowed( IOOptionBits options, }while (false); if (err) { - DLOG("System sleep prevented by %d\n", err); + DLOG("System sleep prevented by %s\n", getSystemSleepPreventerString(err)); return false; } return true; @@ -6344,6 +6986,295 @@ IOPMrootDomain::mustHibernate( void ) #endif /* HIBERNATION */ +//****************************************************************************** +// AOT +//****************************************************************************** + +// Tables for accumulated days in year by month, latter used for leap years + +static const int daysbymonth[] = +{ 0, 31, 59, 90, 120, 151, 181, 212, 243, 273, 304, 334, 365 }; + +static const int lydaysbymonth[] = +{ 0, 31, 60, 91, 121, 152, 182, 213, 244, 274, 305, 335, 366 }; + +static int __unused +IOPMConvertSecondsToCalendar(long secs, IOPMCalendarStruct * dt) +{ + const int * dbm = daysbymonth; + long n, x, y, z; + + if (secs < 0) { + return 0; + } + + // Calculate seconds, minutes and hours + + n = secs % (24 * 3600); + dt->second = n % 60; + n /= 60; + dt->minute = n % 60; + dt->hour = n / 60; + + // Calculate day of week + + n = secs / (24 * 3600); +// dt->dayWeek = (n + 4) % 7; + + // Calculate year + // Rebase from days since Unix epoch (1/1/1970) store in 'n', + // to days since 1/1/1968 to start on 4 year cycle, beginning + // on a leap year. + + n += (366 + 365); + + // Every 4 year cycle will be exactly (366 + 365 * 3) = 1461 days. + // Valid before 2100, since 2100 is not a leap year. + + x = n / 1461; // number of 4 year cycles + y = n % 1461; // days into current 4 year cycle + z = 1968 + (4 * x); + + // Add in years in the current 4 year cycle + + if (y >= 366) { + y -= 366; // days after the leap year + n = y % 365; // days into the current year + z += (1 + y / 365); // years after the past 4-yr cycle + } else { + n = y; + dbm = lydaysbymonth; + } + if (z > 2099) { + return 0; + } + + dt->year = z; + + // Adjust remaining days value to start at 1 + + n += 1; + + // Calculate month + + for (x = 1; n > dbm[x]; x++) { + continue; + } + dt->month = x; + + // Calculate day of month + + dt->day = n - dbm[x - 1]; + + return 1; +} + +static long +IOPMConvertCalendarToSeconds(const IOPMCalendarStruct * dt) +{ + const int * dbm = daysbymonth; + long y, secs, days; + + if (dt->year < 1970) { + return 0; + } + + // Seconds elapsed in the current day + + secs = dt->second + 60 * dt->minute + 3600 * dt->hour; + + // Number of days from 1/1/70 to beginning of current year + // Account for extra day every 4 years starting at 1973 + + y = dt->year - 1970; + days = (y * 365) + ((y + 1) / 4); + + // Change table if current year is a leap year + + if ((dt->year % 4) == 0) { + dbm = lydaysbymonth; + } + + // Add in days elapsed in the current year + + days += (dt->day - 1) + dbm[dt->month - 1]; + + // Add accumulated days to accumulated seconds + + secs += 24 * 3600 * days; + + return secs; +} + +#if !(defined(RC_HIDE_N144) || defined(RC_HIDE_N146)) + +unsigned long +IOPMrootDomain::getRUN_STATE(void) +{ + return _aotNow ? AOT_STATE : ON_STATE; +} + +bool +IOPMrootDomain::isAOTMode() +{ + return _aotNow; +} + +IOReturn +IOPMrootDomain::setWakeTime(uint64_t wakeContinuousTime) +{ + clock_sec_t nowsecs, wakesecs; + clock_usec_t nowmicrosecs, wakemicrosecs; + uint64_t nowAbs, wakeAbs; + + clock_gettimeofday_and_absolute_time(&nowsecs, &nowmicrosecs, &nowAbs); + wakeAbs = continuoustime_to_absolutetime(wakeContinuousTime); + if (wakeAbs < nowAbs) { + printf(LOG_PREFIX "wakeAbs %qd < nowAbs %qd\n", wakeAbs, nowAbs); + wakeAbs = nowAbs; + } + wakeAbs -= nowAbs; + absolutetime_to_microtime(wakeAbs, &wakesecs, &wakemicrosecs); + + wakesecs += nowsecs; + wakemicrosecs += nowmicrosecs; + if (wakemicrosecs >= USEC_PER_SEC) { + wakesecs++; + wakemicrosecs -= USEC_PER_SEC; + } + if (wakemicrosecs >= (USEC_PER_SEC / 10)) { + wakesecs++; + } + + IOPMConvertSecondsToCalendar(wakesecs, &_aotWakeTimeCalendar); + + if (_aotWakeTimeContinuous != wakeContinuousTime) { + _aotWakeTimeContinuous = wakeContinuousTime; + IOLog(LOG_PREFIX "setWakeTime: " YMDTF "\n", YMDT(&_aotWakeTimeCalendar)); + } + _aotWakeTimeCalendar.selector = kPMCalendarTypeMaintenance; + _aotWakeTimeUTC = wakesecs; + + return kIOReturnSuccess; +} + +// assumes WAKEEVENT_LOCK +bool +IOPMrootDomain::aotShouldExit(bool checkTimeSet, bool software) +{ + bool exitNow; + const char * reason = ""; + + if (software) { + _aotExit = true; + _aotMetrics->softwareRequestCount++; + reason = "software request"; + } else if (kIOPMWakeEventAOTExitFlags & _aotPendingFlags) { + _aotExit = true; + reason = gWakeReasonString; + } else if (checkTimeSet && (kPMCalendarTypeInvalid == _aotWakeTimeCalendar.selector)) { + _aotExit = true; + _aotMetrics->noTimeSetCount++; + reason = "flipbook expired"; + } else if ((kIOPMAOTModeRespectTimers & _aotMode) && _scheduledAlarmUTC) { + clock_sec_t sec; + clock_usec_t usec; + clock_get_calendar_microtime(&sec, &usec); + if (_scheduledAlarmUTC <= sec) { + _aotExit = true; + _aotMetrics->rtcAlarmsCount++; + reason = "user alarm"; + } + } + exitNow = (_aotNow && _aotExit); + if (exitNow) { + _aotNow = false; + IOLog(LOG_PREFIX "AOT exit for %s, sc %d po %d, cp %d, rj %d, ex %d, nt %d, rt %d\n", + reason, + _aotMetrics->sleepCount, + _aotMetrics->possibleCount, + _aotMetrics->confirmedPossibleCount, + _aotMetrics->rejectedPossibleCount, + _aotMetrics->expiredPossibleCount, + _aotMetrics->noTimeSetCount, + _aotMetrics->rtcAlarmsCount); + } + return exitNow; +} + +void +IOPMrootDomain::aotExit(bool cps) +{ + _aotTasksSuspended = false; + _aotReadyToFullWake = false; + if (_aotTimerScheduled) { + _aotTimerES->cancelTimeout(); + _aotTimerScheduled = false; + } + updateTasksSuspend(); + + _aotMetrics->totalTime += mach_absolute_time() - _aotLastWakeTime; + _aotLastWakeTime = 0; + if (_aotMetrics->sleepCount && (_aotMetrics->sleepCount <= kIOPMAOTMetricsKernelWakeCountMax)) { + strlcpy(&_aotMetrics->kernelWakeReason[_aotMetrics->sleepCount - 1][0], + gWakeReasonString, + sizeof(_aotMetrics->kernelWakeReason[_aotMetrics->sleepCount])); + } + + _aotWakeTimeCalendar.selector = kPMCalendarTypeInvalid; + + _systemMessageClientMask = kSystemMessageClientLegacyApp; + tellClients(kIOMessageSystemWillPowerOn); + + if (cps) { + changePowerStateToPriv(getRUN_STATE()); + } +} + +void +IOPMrootDomain::aotEvaluate(IOTimerEventSource * timer) +{ + bool exitNow; + + IOLog("aotEvaluate(%d) 0x%x\n", (timer != NULL), _aotPendingFlags); + + WAKEEVENT_LOCK(); + exitNow = aotShouldExit(false, false); + if (timer != NULL) { + _aotTimerScheduled = false; + } + WAKEEVENT_UNLOCK(); + if (exitNow) { + aotExit(true); + } else { +#if 0 + if (_aotLingerTime) { + uint64_t deadline; + IOLog("aot linger before sleep\n"); + clock_absolutetime_interval_to_deadline(_aotLingerTime, &deadline); + clock_delay_until(deadline); + } +#endif + privateSleepSystem(kIOPMSleepReasonSoftware); + } +} + +#else /* !(defined(RC_HIDE_N144) || defined(RC_HIDE_N146)) */ + +unsigned long +IOPMrootDomain::getRUN_STATE(void) +{ + return ON_STATE; +} + +IOReturn +IOPMrootDomain::setWakeTime(uint64_t wakeContinuousTime) +{ + return kIOReturnUnsupported; +} + +#endif /* (defined(RC_HIDE_N144) || defined(RC_HIDE_N146)) */ + //****************************************************************************** // adjustPowerState // @@ -6357,13 +7288,45 @@ IOPMrootDomain::mustHibernate( void ) void IOPMrootDomain::adjustPowerState( bool sleepASAP ) { - DLOG("adjustPowerState ps %u, asap %d, idleSleepEnabled %d\n", - (uint32_t) getPowerState(), sleepASAP, idleSleepEnabled); + DEBUG_LOG("adjustPowerState ps %s, asap %d, idleSleepEnabled %d\n", + getPowerStateString((uint32_t) getPowerState()), sleepASAP, idleSleepEnabled); ASSERT_GATED(); +#if !(defined(RC_HIDE_N144) || defined(RC_HIDE_N146)) + if (_aotNow) { + bool exitNow; + + if (AOT_STATE != getPowerState()) { + return; + } + WAKEEVENT_LOCK(); + exitNow = aotShouldExit(true, false); + if (!exitNow + && !_aotTimerScheduled + && (kIOPMWakeEventAOTPossibleExit == (kIOPMWakeEventAOTPossibleFlags & _aotPendingFlags))) { + _aotTimerScheduled = true; + if (_aotLingerTime) { + _aotTimerES->setTimeout(_aotLingerTime); + } else { + _aotTimerES->setTimeout(800, kMillisecondScale); + } + } + WAKEEVENT_UNLOCK(); + if (exitNow) { + aotExit(true); + } else { + _aotReadyToFullWake = true; + if (!_aotTimerScheduled) { + privateSleepSystem(kIOPMSleepReasonSoftware); + } + } + return; + } +#endif /* (defined(RC_HIDE_N144) || defined(RC_HIDE_N146)) */ + if ((!idleSleepEnabled) || !checkSystemSleepEnabled()) { - changePowerStateToPriv(ON_STATE); + changePowerStateToPriv(getRUN_STATE()); } else if (sleepASAP) { changePowerStateToPriv(SLEEP_STATE); } @@ -6493,7 +7456,7 @@ IOPMrootDomain::dispatchPowerEvent( DLOG("power event %u args %p 0x%llx\n", event, OBFUSCATE(arg0), arg1); if (systemCapabilityNotifier) { systemCapabilityNotifier->release(); - systemCapabilityNotifier = 0; + systemCapabilityNotifier = NULL; } if (arg0) { systemCapabilityNotifier = (IONotifier *) arg0; @@ -6692,8 +7655,24 @@ exit: IOReturn IOPMrootDomain::receivePowerNotification( UInt32 msg ) { - pmPowerStateQueue->submitPowerEvent( - kPowerEventReceivedPowerNotification, (void *)(uintptr_t) msg ); + if (msg & kIOPMPowerButton) { + uint32_t currentPhase = pmTracer->getTracePhase(); + if (currentPhase != kIOPMTracePointSystemUp && currentPhase > kIOPMTracePointSystemSleep) { + DEBUG_LOG("power button pressed during wake. phase = %u\n", currentPhase); + swd_flags |= SWD_PWR_BTN_STACKSHOT; + thread_call_enter(powerButtonDown); + } else { + DEBUG_LOG("power button pressed when system is up\n"); + } + } else if (msg & kIOPMPowerButtonUp) { + if (swd_flags & SWD_PWR_BTN_STACKSHOT) { + swd_flags &= ~SWD_PWR_BTN_STACKSHOT; + thread_call_enter(powerButtonUp); + } + } else { + pmPowerStateQueue->submitPowerEvent( + kPowerEventReceivedPowerNotification, (void *)(uintptr_t) msg ); + } return kIOReturnSuccess; } @@ -6701,6 +7680,7 @@ void IOPMrootDomain::handlePowerNotification( UInt32 msg ) { bool eval_clamshell = false; + bool eval_clamshell_alarm = false; ASSERT_GATED(); @@ -6708,7 +7688,16 @@ IOPMrootDomain::handlePowerNotification( UInt32 msg ) * Local (IOPMrootDomain only) eval clamshell command */ if (msg & kLocalEvalClamshellCommand) { - eval_clamshell = true; + if (isRTCAlarmWake) { + eval_clamshell_alarm = true; + + // reset isRTCAlarmWake. This evaluation should happen only once + // on RTC/Alarm wake. Any clamshell events after wake should follow + // the regular evaluation + isRTCAlarmWake = false; + } else { + eval_clamshell = true; + } } /* @@ -6883,31 +7872,17 @@ IOPMrootDomain::handlePowerNotification( UInt32 msg ) /* * Evaluate clamshell and SLEEP if appropiate */ - if (eval_clamshell && clamshellClosed) { + if (eval_clamshell_alarm && clamshellClosed) { + if (shouldSleepOnRTCAlarmWake()) { + privateSleepSystem(kIOPMSleepReasonClamshell); + } + } else if (eval_clamshell && clamshellClosed) { if (shouldSleepOnClamshellClosed()) { privateSleepSystem(kIOPMSleepReasonClamshell); } else { evaluatePolicy( kStimulusDarkWakeEvaluate ); } } - - /* - * Power Button - */ - if (msg & kIOPMPowerButton) { - DLOG("Powerbutton press\n"); - if (!wranglerAsleep) { - OSString *pbs = OSString::withCString("DisablePowerButtonSleep"); - // Check that power button sleep is enabled - if (pbs) { - if (kOSBooleanTrue != getProperty(pbs)) { - privateSleepSystem(kIOPMSleepReasonPowerButton); - } - } - } else { - reportUserInput(); - } - } } //****************************************************************************** @@ -6966,7 +7941,7 @@ IOPMrootDomain::evaluatePolicy( int stimulus, uint32_t arg ) if (kFullWakeReasonDisplayOn == fullWakeReason) { fullWakeReason = fFullWakeReasonDisplayOnAndLocalUser; DLOG("User activity while in notification wake\n"); - changePowerStateWithOverrideTo( ON_STATE, 0); + changePowerStateWithOverrideTo( getRUN_STATE(), 0); } kdebugTrace(kPMLogUserActiveState, 0, 1, 0); @@ -7166,7 +8141,7 @@ IOPMrootDomain::evaluatePolicy( int stimulus, uint32_t arg ) // Release power clamp, and wait for children idle. adjustPowerState(true); } else { - changePowerStateToPriv(ON_STATE); + changePowerStateToPriv(getRUN_STATE()); } } } @@ -7210,7 +8185,7 @@ IOPMrootDomain::evaluatePolicy( int stimulus, uint32_t arg ) if (flags.bit.idleSleepEnabled) { DLOG("idle sleep timer enabled\n"); if (!wrangler) { - changePowerStateToPriv(ON_STATE); + changePowerStateToPriv(getRUN_STATE()); startIdleSleepTimer( idleSeconds ); } else { // Start idle timer if prefs now allow system sleep @@ -7233,9 +8208,9 @@ IOPMrootDomain::evaluatePolicy( int stimulus, uint32_t arg ) if (flags.bit.adjustPowerState) { bool sleepASAP = false; - if (!systemBooting && (preventIdleSleepList->getCount() == 0)) { + if (!systemBooting && (0 == idleSleepPreventersCount())) { if (!wrangler) { - changePowerStateToPriv(ON_STATE); + changePowerStateToPriv(getRUN_STATE()); if (idleSleepEnabled) { // stay awake for at least idleSeconds startIdleSleepTimer(idleSeconds); @@ -7249,6 +8224,28 @@ IOPMrootDomain::evaluatePolicy( int stimulus, uint32_t arg ) } } +//****************************************************************************** + +unsigned int +IOPMrootDomain::idleSleepPreventersCount() +{ +#if !(defined(RC_HIDE_N144) || defined(RC_HIDE_N146)) + if (_aotMode) { + unsigned int count __block; + count = 0; + preventIdleSleepList->iterateObjects(^bool (OSObject * obj) + { + count += (NULL == obj->metaCast("AppleARMBacklight")); + return false; + }); + return count; + } +#endif /* !(defined(RC_HIDE_N144) || defined(RC_HIDE_N146)) */ + + return preventIdleSleepList->getCount(); +} + + //****************************************************************************** // requestFullWake // @@ -7259,7 +8256,7 @@ void IOPMrootDomain::requestFullWake( FullWakeReason reason ) { uint32_t options = 0; - IOService * pciRoot = 0; + IOService * pciRoot = NULL; bool promotion = false; // System must be in dark wake and a valid reason for entering full wake @@ -7393,6 +8390,13 @@ IOPMrootDomain::fullWakeDelayedWork( void ) // evaluateAssertions // //****************************************************************************** + +// Bitmask of all kernel assertions that prevent system idle sleep. +// kIOPMDriverAssertionReservedBit7 is reserved for IOMediaBSDClient. +#define NO_IDLE_SLEEP_ASSERTIONS_MASK \ + (kIOPMDriverAssertionReservedBit7 | \ + kIOPMDriverAssertionPreventSystemIdleSleepBit) + void IOPMrootDomain::evaluateAssertions(IOPMDriverAssertionType newAssertions, IOPMDriverAssertionType oldAssertions) { @@ -7410,7 +8414,14 @@ IOPMrootDomain::evaluateAssertions(IOPMDriverAssertionType newAssertions, IOPMDr } if (changedBits & kIOPMDriverAssertionCPUBit) { +#if !(defined(RC_HIDE_N144) || defined(RC_HIDE_N146)) + if (_aotNow) { + IOLog("CPU assertions %d\n", (0 != (kIOPMDriverAssertionCPUBit & newAssertions))); + } + evaluatePolicy(_aotNow ? kStimulusNoIdleSleepPreventers : kStimulusDarkWakeEvaluate); +#else evaluatePolicy(kStimulusDarkWakeEvaluate); +#endif if (!assertOnWakeSecs && gIOLastWakeAbsTime) { AbsoluteTime now; clock_usec_t microsecs; @@ -7424,13 +8435,18 @@ IOPMrootDomain::evaluateAssertions(IOPMDriverAssertionType newAssertions, IOPMDr } } - if (changedBits & kIOPMDriverAssertionReservedBit7) { - bool value = (newAssertions & kIOPMDriverAssertionReservedBit7) ? true : false; - if (value) { - DLOG("Driver assertion ReservedBit7 raised. Legacy IO preventing sleep\n"); - updatePreventIdleSleepList(this, true); + if (changedBits & NO_IDLE_SLEEP_ASSERTIONS_MASK) { + if ((newAssertions & NO_IDLE_SLEEP_ASSERTIONS_MASK) != 0) { + if ((oldAssertions & NO_IDLE_SLEEP_ASSERTIONS_MASK) == 0) { + DLOG("PreventIdleSleep driver assertion raised\n"); + bool ok = updatePreventIdleSleepList(this, true); + if (ok && (changedBits & kIOPMDriverAssertionPreventSystemIdleSleepBit)) { + // Cancel idle sleep if there is one in progress + cancelIdlePowerDown(this); + } + } } else { - DLOG("Driver assertion ReservedBit7 dropped\n"); + DLOG("PreventIdleSleep driver assertion dropped\n"); updatePreventIdleSleepList(this, false); } } @@ -7513,7 +8529,7 @@ IOPMrootDomain::pmStatsRecordApplicationResponse( OSNumber *msgNum = NULL; const OSSymbol *appname; const OSSymbol *sleep = NULL, *wake = NULL; - IOPMServiceInterestNotifier *notify = 0; + IOPMServiceInterestNotifier *notify = NULL; if (object && (notify = OSDynamicCast(IOPMServiceInterestNotifier, object))) { if (response->isEqualTo(gIOPMStatsResponseTimedOut)) { @@ -8032,6 +9048,14 @@ PMTraceWorker::RTC_TRACE(void) tracePointHandler( tracePointTarget, traceData32, wordA ); _LOG("RTC_TRACE wrote 0x%08x 0x%08x\n", traceData32, wordA); } +#if DEVELOPMENT || DEBUG + if ((swd_panic_phase != 0) && (swd_panic_phase == tracePhase)) { + DEBUG_LOG("Causing sleep wake failure in phase 0x%08x\n", tracePhase); + IOLock *l = IOLockAlloc(); + IOLockLock(l); + IOLockLock(l); + } +#endif } int @@ -8229,7 +9253,7 @@ PMHaltWorker::worker( void ) if (me) { me->release(); } - return 0; + return NULL; } void @@ -8238,7 +9262,7 @@ PMHaltWorker::free( void ) DLOG("PMHaltWorker free %p\n", OBFUSCATE(this)); if (lock) { IOLockFree(lock); - lock = 0; + lock = NULL; } return OSObject::free(); } @@ -8291,7 +9315,7 @@ PMHaltWorker::work( PMHaltWorker * me ) bool timeout; while (true) { - service = 0; + service = NULL; timeout = false; // Claim an unit of work from the shared pool @@ -8325,7 +9349,7 @@ PMHaltWorker::work( PMHaltWorker * me ) while (service->getProperty(gPMHaltClientAcknowledgeKey)) { IOLockSleep(me->lock, me, THREAD_UNINT); } - me->service = 0; + me->service = NULL; timeout = me->timeout; IOLockUnlock(me->lock); } @@ -8754,7 +9778,7 @@ IOPMrootDomain::copyProperty( const char * aKey) const } if (!strcmp(aKey, kIOPMDriverWakeEventsKey)) { - OSArray * array = 0; + OSArray * array = NULL; WAKEEVENT_LOCK(); if (_systemWakeEventsArray && _systemWakeEventsArray->getCount()) { OSCollection *collection = _systemWakeEventsArray->copyCollection(); @@ -8767,7 +9791,7 @@ IOPMrootDomain::copyProperty( const char * aKey) const } if (!strcmp(aKey, kIOPMSleepStatisticsAppsKey)) { - OSArray * array = 0; + OSArray * array = NULL; IOLockLock(pmStatsLock); if (pmStatsAppResponses && pmStatsAppResponses->getCount()) { OSCollection *collection = pmStatsAppResponses->copyCollection(); @@ -8792,6 +9816,17 @@ IOPMrootDomain::copyProperty( const char * aKey) const return systemSleepList; } + if (!strcmp(aKey, kIOPMIdleSleepPreventersWithIDKey)) { + OSArray *idleSleepList = NULL; + gRootDomain->copySleepPreventersListWithID(&idleSleepList, NULL); + return idleSleepList; + } + + if (!strcmp(aKey, kIOPMSystemSleepPreventersWithIDKey)) { + OSArray *systemSleepList = NULL; + gRootDomain->copySleepPreventersListWithID(NULL, &systemSleepList); + return systemSleepList; + } return NULL; } @@ -8819,12 +9854,18 @@ IOPMrootDomain::acceptSystemWakeEvents( bool accept ) WAKEEVENT_LOCK(); if (accept) { - gWakeReasonString[0] = '\0'; if (!_systemWakeEventsArray) { _systemWakeEventsArray = OSArray::withCapacity(4); } - if ((_acceptSystemWakeEvents = (_systemWakeEventsArray != 0))) { - _systemWakeEventsArray->flushCollection(); + _acceptSystemWakeEvents = (_systemWakeEventsArray != NULL); +#if !(defined(RC_HIDE_N144) || defined(RC_HIDE_N146)) + if (!(kIOPMWakeEventAOTExitFlags & _aotPendingFlags)) +#endif /* !(defined(RC_HIDE_N144) || defined(RC_HIDE_N146)) */ + { + gWakeReasonString[0] = '\0'; + if (_systemWakeEventsArray) { + _systemWakeEventsArray->flushCollection(); + } } } else { _acceptSystemWakeEvents = false; @@ -8835,7 +9876,7 @@ IOPMrootDomain::acceptSystemWakeEvents( bool accept ) if ((panic_allowed == -1) && (PE_parse_boot_argn("swd_wakereason_panic", &panic_allowed, sizeof(panic_allowed)) == false)) { - panic_allowed = 1; + panic_allowed = 0; } if (panic_allowed) { @@ -8873,14 +9914,15 @@ IOPMrootDomain::claimSystemWakeEvent( const char * reason, OSObject * details ) { - const OSSymbol * deviceName = 0; - OSNumber * deviceRegId = 0; - OSNumber * claimTime = 0; - OSData * flagsData = 0; - OSString * reasonString = 0; - OSDictionary * d = 0; + const OSSymbol * deviceName = NULL; + OSNumber * deviceRegId = NULL; + OSNumber * claimTime = NULL; + OSData * flagsData = NULL; + OSString * reasonString = NULL; + OSDictionary * d = NULL; uint64_t timestamp; bool ok = false; + bool addWakeReason; pmEventTimeStamp(×tamp); @@ -8888,6 +9930,32 @@ IOPMrootDomain::claimSystemWakeEvent( return; } +#if !(defined(RC_HIDE_N144) || defined(RC_HIDE_N146)) + IOOptionBits aotFlags = 0; + bool needAOTEvaluate = FALSE; + + if (kIOPMAOTModeAddEventFlags & _aotMode) { + if (!strcmp("hold", reason) + || !strcmp("help", reason) + || !strcmp("menu", reason) + || !strcmp("stockholm", reason) + || !strcmp("ringer", reason) + || !strcmp("ringerab", reason) + || !strcmp("smc0", reason) + || !strcmp("AOP.RTPWakeupAP", reason) + || !strcmp("BT.OutboxNotEmpty", reason) + || !strcmp("WL.OutboxNotEmpty", reason)) { + flags |= kIOPMWakeEventAOTExit; + } + } + +#if DEVELOPMENT || DEBUG + if (_aotLingerTime && !strcmp("rtc", reason)) { + flags |= kIOPMWakeEventAOTPossibleExit; + } +#endif /* DEVELOPMENT || DEBUG */ +#endif /* !(defined(RC_HIDE_N144) || defined(RC_HIDE_N146)) */ + deviceName = device->copyName(gIOServicePlane); deviceRegId = OSNumber::withNumber(device->getRegistryEntryID(), 64); claimTime = OSNumber::withNumber(timestamp, 64); @@ -8908,6 +9976,34 @@ IOPMrootDomain::claimSystemWakeEvent( } WAKEEVENT_LOCK(); + addWakeReason = _acceptSystemWakeEvents; +#if !(defined(RC_HIDE_N144) || defined(RC_HIDE_N146)) + if (_aotMode) { + IOLog("claimSystemWakeEvent(%s, %s, 0x%x) 0x%x %d\n", reason, deviceName->getCStringNoCopy(), (int)flags, _aotPendingFlags, _aotReadyToFullWake); + } + aotFlags = (kIOPMWakeEventAOTFlags & flags); + aotFlags = (aotFlags & ~_aotPendingFlags); + needAOTEvaluate = false; + if (_aotNow && aotFlags) { + if (kIOPMWakeEventAOTPossibleExit & flags) { + _aotMetrics->possibleCount++; + } + if (kIOPMWakeEventAOTConfirmedPossibleExit & flags) { + _aotMetrics->confirmedPossibleCount++; + } + if (kIOPMWakeEventAOTRejectedPossibleExit & flags) { + _aotMetrics->rejectedPossibleCount++; + } + if (kIOPMWakeEventAOTExpiredPossibleExit & flags) { + _aotMetrics->expiredPossibleCount++; + } + + _aotPendingFlags |= aotFlags; + addWakeReason = _aotNow && _systemWakeEventsArray && ((kIOPMWakeEventAOTExitFlags & aotFlags)); + needAOTEvaluate = _aotReadyToFullWake; + } +#endif /* !(defined(RC_HIDE_N144) || defined(RC_HIDE_N146)) */ + if (!gWakeReasonSysctlRegistered) { // Lazy registration until the platform driver stops registering // the same name. @@ -8916,14 +10012,20 @@ IOPMrootDomain::claimSystemWakeEvent( sysctl_register_oid(&sysctl__kern_wakereason); #endif } - if (_acceptSystemWakeEvents) { + if (addWakeReason) { ok = _systemWakeEventsArray->setObject(d); if (gWakeReasonString[0] != '\0') { strlcat(gWakeReasonString, " ", sizeof(gWakeReasonString)); } strlcat(gWakeReasonString, reason, sizeof(gWakeReasonString)); } + WAKEEVENT_UNLOCK(); +#if !(defined(RC_HIDE_N144) || defined(RC_HIDE_N146)) + if (needAOTEvaluate) { + aotEvaluate(NULL); + } +#endif /* !(defined(RC_HIDE_N144) || defined(RC_HIDE_N146)) */ done: if (deviceName) { @@ -8959,7 +10061,7 @@ PMSettingHandle::free( void ) if (pmso) { pmso->clientHandleFreed(); pmso->release(); - pmso = 0; + pmso = NULL; } OSObject::free(); @@ -8985,8 +10087,8 @@ PMSettingObject *PMSettingObject::pmSettingObject( OSObject * *handle_obj) { uint32_t settingCount = 0; - PMSettingObject *pmso = 0; - PMSettingHandle *pmsh = 0; + PMSettingObject *pmso = NULL; + PMSettingHandle *pmsh = NULL; if (!parent_arg || !handler_arg || !settings || !handle_obj) { return NULL; @@ -9253,7 +10355,7 @@ PMAssertionsTracker::createAssertion( track.id = OSIncrementAtomic64((SInt64*) &issuingUniqueID); track.level = level; track.assertionBits = which; - track.ownerString = whoItIs ? OSSymbol::withCString(whoItIs):0; + track.ownerString = whoItIs ? OSSymbol::withCString(whoItIs):NULL; track.ownerService = serviceID; track.registryEntryID = serviceID ? serviceID->getRegistryEntryID():0; track.modifiedTime = 0; @@ -9313,7 +10415,7 @@ PMAssertionsTracker::releaseAssertion( IOPMDriverAssertionID _id) { if (owner && owner->pmPowerStateQueue) { - owner->pmPowerStateQueue->submitPowerEvent(kPowerEventAssertionRelease, 0, _id); + owner->pmPowerStateQueue->submitPowerEvent(kPowerEventAssertionRelease, NULL, _id); } return kIOReturnSuccess; } @@ -9385,7 +10487,7 @@ PMAssertionsTracker::setUserAssertionLevels( this, &PMAssertionsTracker::handleSetUserAssertionLevels), this, - (void *) &new_user_levels, 0, 0, 0); + (void *) &new_user_levels, NULL, NULL, NULL); } return kIOReturnSuccess; @@ -9645,7 +10747,7 @@ IOPMrootDomain::getWatchdogTimeout() IOReturn IOPMrootDomain::restartWithStackshot() { - takeStackshot(true, true, false); + takeStackshot(true); return kIOReturnSuccess; } @@ -9653,7 +10755,7 @@ IOPMrootDomain::restartWithStackshot() void IOPMrootDomain::sleepWakeDebugTrig(bool wdogTrigger) { - takeStackshot(wdogTrigger, false, false); + takeStackshot(wdogTrigger); } void @@ -9810,39 +10912,72 @@ IOPMrootDomain::saveFailureData2File() char failureStr[512]; errno_t error; char *outbuf; - bool oswatchdog = false; + OSNumber *statusCode; + uint64_t pmStatusCode = 0; + uint32_t phaseData = 0; + uint32_t phaseDetail = 0; + bool efiFailure = false; + + statusCode = OSDynamicCast(OSNumber, getProperty(kIOPMSleepWakeFailureCodeKey)); + if (statusCode) { + pmStatusCode = statusCode->unsigned64BitValue(); + phaseData = pmStatusCode & 0xFFFFFFFF; + phaseDetail = (pmStatusCode >> 32) & 0xFFFFFFFF; + if ((phaseData & 0xFF) == kIOPMTracePointSystemSleep) { + LOG("Sleep Wake failure in EFI\n"); + efiFailure = true; + failureStr[0] = 0; + snprintf(failureStr, sizeof(failureStr), "Sleep Wake failure in EFI\n\nFailure code:: 0x%08x 0x%08x\n\nPlease IGNORE the below stackshot\n", phaseDetail, phaseData); + len = strlen(failureStr); + } + } + + if (!efiFailure) { + if (PEReadNVRAMProperty(kIOSleepWakeFailurePanic, NULL, &len)) { + swd_flags |= SWD_BOOT_BY_SW_WDOG; + PERemoveNVRAMProperty(kIOSleepWakeFailurePanic); + // dump panic will handle saving nvram data + return; + } - if (!PEReadNVRAMProperty(kIOSleepWakeFailureString, NULL, &len) && - !PEReadNVRAMProperty(kIOOSWatchdogFailureString, NULL, &len)) { - DLOG("No SleepWake failure or OSWatchdog failure string to read\n"); - return; - } + /* Keeping this around for capturing data during power + * button press */ - if (len == 0) { - DLOG("Ignoring zero byte SleepWake failure string\n"); - goto exit; - } + if (!PEReadNVRAMProperty(kIOSleepWakeFailureString, NULL, &len)) { + DLOG("No sleep wake failure string\n"); + return; + } + if (len == 0) { + DLOG("Ignoring zero byte SleepWake failure string\n"); + goto exit; + } - if (len > sizeof(failureStr)) { - len = sizeof(failureStr); - } - failureStr[0] = 0; - if (PEReadNVRAMProperty(kIOSleepWakeFailureString, failureStr, &len) == false) { - if (PEReadNVRAMProperty(kIOOSWatchdogFailureString, failureStr, &len)) { - oswatchdog = true; + // if PMStatus code is zero, delete stackshot and return + if (statusCode) { + if (((pmStatusCode & 0xFFFFFFFF) & 0xFF) == 0) { + // there was no sleep wake failure + // this can happen if delete stackshot was called + // before take stackshot completed. Let us delete any + // sleep wake failure data in nvram + DLOG("Deleting stackshot on successful wake\n"); + deleteStackshot(); + return; + } + } + + if (len > sizeof(failureStr)) { + len = sizeof(failureStr); } + failureStr[0] = 0; + PEReadNVRAMProperty(kIOSleepWakeFailureString, failureStr, &len); } if (failureStr[0] != 0) { - error = sleepWakeDebugSaveFile(oswatchdog ? kOSWatchdogFailureStringFile : kSleepWakeFailureStringFile, - failureStr, len); + error = sleepWakeDebugSaveFile(kSleepWakeFailureStringFile, failureStr, len); if (error) { DLOG("Failed to save SleepWake failure string to file. error:%d\n", error); } else { DLOG("Saved SleepWake failure string to file.\n"); } - if (!oswatchdog) { - swd_flags |= SWD_BOOT_BY_SW_WDOG; - } } if (!OSCompareAndSwap(0, 1, &gRootDomain->swd_lock)) { @@ -9877,8 +11012,17 @@ IOPMrootDomain::saveFailureData2File() LOG("Concatenated length for the SWD blob %d\n", concat_len); if (concat_len) { - error = sleepWakeDebugSaveFile(oswatchdog ? kOSWatchdogStacksFilename : kSleepWakeStacksFilename, - outbuf, concat_len); + error = sleepWakeDebugSaveFile(kSleepWakeStacksFilename, outbuf, concat_len); + if (error) { + LOG("Failed to save SleepWake zipped data to file. error:%d\n", error); + } else { + LOG("Saved SleepWake zipped data to file.\n"); + } + } else { + // There is a sleep wake failure string but no stackshot + // Write a placeholder stacks file so that swd runs + snprintf(outbuf, 20, "%s", "No stackshot data\n"); + error = sleepWakeDebugSaveFile(kSleepWakeStacksFilename, outbuf, 20); if (error) { LOG("Failed to save SleepWake zipped data to file. error:%d\n", error); } else { @@ -9892,7 +11036,7 @@ IOPMrootDomain::saveFailureData2File() gRootDomain->swd_lock = 0; exit: - PERemoveNVRAMProperty(oswatchdog ? kIOOSWatchdogFailureString : kIOSleepWakeFailureString); + PERemoveNVRAMProperty(kIOSleepWakeFailureString); return; } @@ -9952,6 +11096,7 @@ IOPMrootDomain::getFailureData(thread_t *thread, char *failureStr, size_t strLen OSKext *kext = OSKext::lookupKextWithAddress((vm_address_t)callMethod); if (kext) { objectName = kext->getIdentifierCString(); + kext->release(); } } } @@ -10136,238 +11281,222 @@ swd_compress(char *inPtr, char *outPtr, size_t numBytes) } void -IOPMrootDomain::takeStackshot(bool wdogTrigger, bool isOSXWatchdog, bool isSpinDump) +IOPMrootDomain::deleteStackshot() { - swd_hdr * hdr = NULL; - int wdog_panic = -1; - int cnt = 0; - pid_t pid = 0; - kern_return_t kr = KERN_SUCCESS; - uint32_t flags; + if (!OSCompareAndSwap(0, 1, &gRootDomain->swd_lock)) { + // takeStackshot hasn't completed + return; + } + LOG("Deleting any sleepwake failure data in nvram\n"); - char * dstAddr; - uint32_t size; - uint32_t bytesRemaining; - unsigned bytesWritten = 0; - unsigned totalBytes = 0; - OSString * UUIDstring = NULL; + PERemoveNVRAMProperty(kIOSleepWakeFailureString); + char nvram_var_name_buf[20]; + for (int i = 0; i < 8; i++) { + snprintf(nvram_var_name_buf, 20, "%s%02d", SWD_STACKSHOT_VAR_PREFIX, i + 1); + if (PERemoveNVRAMProperty(nvram_var_name_buf) == false) { + LOG("Removing %s returned false\n", nvram_var_name_buf); + } + } + // force NVRAM sync + if (PEWriteNVRAMProperty(kIONVRAMSyncNowPropertyKey, kIONVRAMSyncNowPropertyKey, strlen(kIONVRAMSyncNowPropertyKey)) == false) { + DLOG("Failed to force nvram sync\n"); + } + gRootDomain->swd_lock = 0; +} +void +IOPMrootDomain::takeStackshot(bool wdogTrigger) +{ + swd_hdr * hdr = NULL; + int cnt = 0; + int max_cnt = 2; + pid_t pid = 0; + kern_return_t kr = KERN_SUCCESS; + uint32_t flags; - char failureStr[512]; - thread_t thread = NULL; - const char * uuid; + char * dstAddr; + uint32_t size; + uint32_t bytesRemaining; + unsigned bytesWritten = 0; + char failureStr[512]; + thread_t thread = NULL; + const char * swfPanic = "swfPanic"; - uint32_t bufSize; - uint32_t initialStackSize; + uint32_t bufSize; + int success = 0; + if (!OSCompareAndSwap(0, 1, &gRootDomain->swd_lock)) { + return; + } failureStr[0] = 0; - if (isSpinDump) { - if (_systemTransitionType != kSystemTransitionSleep && - _systemTransitionType != kSystemTransitionWake) { - return; - } - - if (gSpinDumpBufferFull) { - return; - } - if (swd_spindump_buffer == NULL) { - sleepWakeDebugSpinDumpMemAlloc(); - if (swd_spindump_buffer == NULL) { - return; - } - } - - bufSize = SWD_SPINDUMP_SIZE; - initialStackSize = SWD_INITIAL_SPINDUMP_SIZE; - hdr = (swd_hdr *)swd_spindump_buffer; - } else { - if ((kIOSleepWakeWdogOff & gIOKitDebug) || systemBooting || systemShutdown || gWillShutdown) { - return; - } + if ((kIOSleepWakeWdogOff & gIOKitDebug) || systemBooting || systemShutdown || gWillShutdown) { + return; + } - if (isOSXWatchdog) { - snprintf(failureStr, sizeof(failureStr), "Stackshot Reason: "); - snprintf(failureStr, sizeof(failureStr), "%smacOS watchdog triggered failure\n", failureStr); - } else if (wdogTrigger) { - if ((UUIDstring = OSDynamicCast(OSString, getProperty(kIOPMSleepWakeUUIDKey))) != NULL) { - uuid = UUIDstring->getCStringNoCopy(); - snprintf(failureStr, sizeof(failureStr), "UUID: %s\n", uuid); - } + if (wdogTrigger) { + getFailureData(&thread, failureStr, sizeof(failureStr)); - snprintf(failureStr, sizeof(failureStr), "%sStackshot Reason: ", failureStr); - getFailureData(&thread, failureStr, sizeof(failureStr)); - if (PEGetCoprocessorVersion() >= kCoprocessorVersion2) { - goto skip_stackshot; - } - } else { - snprintf(failureStr, sizeof(failureStr), "%sStackshot triggered for debugging stackshot collection.\n", failureStr); + if (PEGetCoprocessorVersion() >= kCoprocessorVersion2) { + goto skip_stackshot; } - // Take only one stackshot in this case. - cnt = SWD_MAX_STACKSHOTS - 1; + } else { + AbsoluteTime now; + uint64_t nsec; + clock_get_uptime(&now); + SUB_ABSOLUTETIME(&now, &gIOLastWakeAbsTime); + absolutetime_to_nanoseconds(now, &nsec); + snprintf(failureStr, sizeof(failureStr), "%sPower button pressed during wake transition after %u ms.\n", failureStr, ((int)((nsec) / NSEC_PER_MSEC))); + } + if (swd_buffer == NULL) { + sleepWakeDebugMemAlloc(); if (swd_buffer == NULL) { - sleepWakeDebugMemAlloc(); - if (swd_buffer == NULL) { - return; - } + return; } - hdr = (swd_hdr *)swd_buffer; - - bufSize = hdr->alloc_size;; - initialStackSize = bufSize; } + hdr = (swd_hdr *)swd_buffer; + bufSize = hdr->alloc_size;; - if (!OSCompareAndSwap(0, 1, &gRootDomain->swd_lock)) { - return; - } dstAddr = (char*)hdr + hdr->spindump_offset; - bytesRemaining = bufSize - hdr->spindump_offset; - - DLOG("Taking snapshot. bytesRemaining: %d\n", bytesRemaining); - flags = STACKSHOT_KCDATA_FORMAT | STACKSHOT_NO_IO_STATS | STACKSHOT_SAVE_KEXT_LOADINFO | STACKSHOT_ACTIVE_KERNEL_THREADS_ONLY | STACKSHOT_THREAD_WAITINFO; - while (kr == KERN_SUCCESS) { - if (cnt == 0) { - /* - * Take stackshot of all process on first sample. Size is restricted - * to SWD_INITIAL_STACK_SIZE - */ - pid = -1; - size = (bytesRemaining > initialStackSize) ? initialStackSize : bytesRemaining; - } else { - /* Take sample of kernel threads only */ - pid = 0; - size = bytesRemaining; - } + /* If not wdogTrigger only take kernel tasks stackshot + */ + if (wdogTrigger) { + pid = -1; + } else { + pid = 0; + } + + /* Attempt to take stackshot with all ACTIVE_KERNEL_THREADS + * If we run out of space, take stackshot with only kernel task + */ + while (success == 0 && cnt < max_cnt) { + bytesRemaining = bufSize - hdr->spindump_offset; + cnt++; + DLOG("Taking snapshot. bytesRemaining: %d\n", bytesRemaining); + size = bytesRemaining; kr = stack_snapshot_from_kernel(pid, dstAddr, size, flags, 0, &bytesWritten); DLOG("stack_snapshot_from_kernel returned 0x%x. pid: %d bufsize:0x%x flags:0x%x bytesWritten: %d\n", kr, pid, size, flags, bytesWritten); if (kr == KERN_INSUFFICIENT_BUFFER_SIZE) { if (pid == -1) { - // Insufficient buffer when trying to take stackshot of user & kernel space threads. - // Continue to take stackshot of just kernel threads - ++cnt; - kr = KERN_SUCCESS; - continue; - } else if (totalBytes == 0) { - MSG("Failed to get stackshot(0x%x) bufsize:0x%x flags:0x%x\n", kr, size, flags); + pid = 0; + } else { + LOG("Insufficient buffer size for only kernel task\n"); + break; } } + if (kr == KERN_SUCCESS) { + if (bytesWritten == 0) { + MSG("Failed to get stackshot(0x%x) bufsize:0x%x flags:0x%x\n", kr, size, flags); + continue; + } + bytesRemaining -= bytesWritten; + hdr->spindump_size = (bufSize - bytesRemaining - hdr->spindump_offset); - dstAddr += bytesWritten; - totalBytes += bytesWritten; - bytesRemaining -= bytesWritten; + memset(hdr->reason, 0x20, sizeof(hdr->reason)); - if (++cnt == SWD_MAX_STACKSHOTS) { - break; - } - IOSleep(10); // 10 ms - } + // Compress stackshot and save to NVRAM + { + char *outbuf = (char *)swd_compressed_buffer; + int outlen = 0; + int num_chunks = 0; + int max_chunks = 0; + int leftover = 0; + char nvram_var_name_buffer[20]; - hdr->spindump_size = (bufSize - bytesRemaining - hdr->spindump_offset); + outlen = swd_compress((char*)hdr + hdr->spindump_offset, outbuf, bytesWritten); - memset(hdr->reason, 0x20, sizeof(hdr->reason)); - if (isSpinDump) { - snprintf(hdr->reason, sizeof(hdr->reason), "\nStackshot reason: Power State Change Delay\n\n"); - gRootDomain->swd_lock = 0; - gSpinDumpBufferFull = true; - return; - } + if (outlen) { + max_chunks = outlen / (2096 - 200); + leftover = outlen % (2096 - 200); - // Compress stackshot and save to NVRAM - { - char *outbuf = (char *)swd_compressed_buffer; - int outlen = 0; - int num_chunks = 0; - int max_chunks = 0; - int leftover = 0; - char nvram_var_name_buffer[20]; - - outlen = swd_compress((char*)hdr + hdr->spindump_offset, outbuf, bytesWritten); - - if (outlen) { - max_chunks = outlen / (2096 - 200); - leftover = outlen % (2096 - 200); - - if (max_chunks < 8) { - for (num_chunks = 0; num_chunks < max_chunks; num_chunks++) { - snprintf(nvram_var_name_buffer, 20, "%s%02d", SWD_STACKSHOT_VAR_PREFIX, num_chunks + 1); - if (PEWriteNVRAMPropertyWithCopy(nvram_var_name_buffer, (outbuf + (num_chunks * (2096 - 200))), (2096 - 200)) == FALSE) { - LOG("Failed to update NVRAM %d\n", num_chunks); - break; - } - } - if (leftover) { - snprintf(nvram_var_name_buffer, 20, "%s%02d", SWD_STACKSHOT_VAR_PREFIX, num_chunks + 1); - if (PEWriteNVRAMPropertyWithCopy(nvram_var_name_buffer, (outbuf + (num_chunks * (2096 - 200))), leftover) == FALSE) { - LOG("Failed to update NVRAM with leftovers\n"); + if (max_chunks < 8) { + for (num_chunks = 0; num_chunks < max_chunks; num_chunks++) { + snprintf(nvram_var_name_buffer, 20, "%s%02d", SWD_STACKSHOT_VAR_PREFIX, num_chunks + 1); + if (PEWriteNVRAMPropertyWithCopy(nvram_var_name_buffer, (outbuf + (num_chunks * (2096 - 200))), (2096 - 200)) == FALSE) { + LOG("Failed to update NVRAM %d\n", num_chunks); + break; + } + } + if (leftover) { + snprintf(nvram_var_name_buffer, 20, "%s%02d", SWD_STACKSHOT_VAR_PREFIX, num_chunks + 1); + if (PEWriteNVRAMPropertyWithCopy(nvram_var_name_buffer, (outbuf + (num_chunks * (2096 - 200))), leftover) == FALSE) { + LOG("Failed to update NVRAM with leftovers\n"); + } + } + success = 1; + LOG("Successfully saved stackshot to NVRAM\n"); + } else { + LOG("Compressed failure stackshot is too large. size=%d bytes\n", outlen); + if (pid == -1) { + pid = 0; + } else { + LOG("Compressed failure stackshot of only kernel is too large size=%d bytes\n", outlen); + break; + } } } - } else { - LOG("Compressed failure stackshot is too large. size=%d bytes\n", outlen); } } } if (failureStr[0]) { - if (!isOSXWatchdog) { - // append sleep-wake failure code - snprintf(failureStr, sizeof(failureStr), "%s\nFailure code:: 0x%08x %08x\n", - failureStr, pmTracer->getTraceData(), pmTracer->getTracePhase()); - if (PEWriteNVRAMProperty(kIOSleepWakeFailureString, failureStr, strlen(failureStr)) == false) { - DLOG("Failed to write SleepWake failure string\n"); - } - } else { - if (PEWriteNVRAMProperty(kIOOSWatchdogFailureString, failureStr, strlen(failureStr)) == false) { - DLOG("Failed to write OSWatchdog failure string\n"); - } + // append sleep-wake failure code + snprintf(failureStr, sizeof(failureStr), "%s\nFailure code:: 0x%08x %08x\n", + failureStr, pmTracer->getTraceData(), pmTracer->getTracePhase()); + if (PEWriteNVRAMProperty(kIOSleepWakeFailureString, failureStr, strlen(failureStr)) == false) { + DLOG("Failed to write SleepWake failure string\n"); } } - gRootDomain->swd_lock = 0; + + // force NVRAM sync + if (PEWriteNVRAMProperty(kIONVRAMSyncNowPropertyKey, kIONVRAMSyncNowPropertyKey, strlen(kIONVRAMSyncNowPropertyKey)) == false) { + DLOG("Failed to force nvram sync\n"); + } skip_stackshot: if (wdogTrigger) { - PE_parse_boot_argn("swd_panic", &wdog_panic, sizeof(wdog_panic)); - - if ((wdog_panic == 1) || (PEGetCoprocessorVersion() >= kCoprocessorVersion2)) { - if (thread) { - panic_with_thread_context(0, NULL, DEBUGGER_OPTION_ATTEMPTCOREDUMPANDREBOOT, thread, "%s", failureStr); - } else { - panic_with_options(0, NULL, DEBUGGER_OPTION_ATTEMPTCOREDUMPANDREBOOT, "%s", failureStr); + if (PEGetCoprocessorVersion() < kCoprocessorVersion2) { + if (swd_flags & SWD_BOOT_BY_SW_WDOG) { + // If current boot is due to this watch dog trigger restart in previous boot, + // then don't trigger again until at least 1 successful sleep & wake. + if (!(sleepCnt && (displayWakeCnt || darkWakeCnt))) { + LOG("Shutting down due to repeated Sleep/Wake failures\n"); + if (!tasksSuspended) { + tasksSuspended = TRUE; + updateTasksSuspend(); + } + PEHaltRestart(kPEHaltCPU); + return; + } } - return; - } else if (swd_flags & SWD_BOOT_BY_SW_WDOG) { - // If current boot is due to this watch dog trigger restart in previous boot, - // then don't trigger again until at least 1 successful sleep & wake. - if (!(sleepCnt && (displayWakeCnt || darkWakeCnt))) { - LOG("Shutting down due to repeated Sleep/Wake failures\n"); + if (gSwdPanic == 0) { + LOG("Calling panic prevented by swd_panic boot-args. Calling restart"); if (!tasksSuspended) { tasksSuspended = TRUE; - tasks_system_suspend(true); + updateTasksSuspend(); } - PEHaltRestart(kPEHaltCPU); - return; + PEHaltRestart(kPERestartCPU); } } - } - - - if (wdogTrigger) { - LOG("Restarting to collect Sleep wake debug logs\n"); - if (!tasksSuspended) { - tasksSuspended = TRUE; - tasks_system_suspend(true); + if (PEWriteNVRAMProperty(kIOSleepWakeFailurePanic, swfPanic, strlen(swfPanic)) == false) { + DLOG("Failed to write SleepWake failure panic key\n"); + } + if (thread) { + panic_with_thread_context(0, NULL, DEBUGGER_OPTION_ATTEMPTCOREDUMPANDREBOOT, thread, "%s", failureStr); + } else { + panic_with_options(0, NULL, DEBUGGER_OPTION_ATTEMPTCOREDUMPANDREBOOT, "%s", failureStr); } - - PEHaltRestart(kPERestartCPU); } else { - saveFailureData2File(); + gRootDomain->swd_lock = 0; + return; } } @@ -10386,10 +11515,6 @@ IOPMrootDomain::sleepWakeDebugMemAlloc() return; } - if (PEGetCoprocessorVersion() >= kCoprocessorVersion2) { - return; - } - if (!OSCompareAndSwap(0, 1, &gRootDomain->swd_lock)) { return; } @@ -10556,11 +11681,8 @@ exit: void IOPMrootDomain::sleepWakeDebugTrig(bool restart) { - uint32_t wdog_panic = 1; - if (restart) { - if (PE_parse_boot_argn("swd_panic", &wdog_panic, sizeof(wdog_panic)) && - (wdog_panic == 0)) { + if (gSwdPanic == 0) { return; } panic("Sleep/Wake hang detected"); @@ -10569,12 +11691,14 @@ IOPMrootDomain::sleepWakeDebugTrig(bool restart) } void -IOPMrootDomain::takeStackshot(bool restart, bool isOSXWatchdog, bool isSpinDump) +IOPMrootDomain::takeStackshot(bool restart) { #pragma unused(restart) -#pragma unused(isOSXWatchdog) } - +void +IOPMrootDomain::deleteStackshot() +{ +} void IOPMrootDomain::sleepWakeDebugMemAlloc() { diff --git a/iokit/Kernel/IOPerfControl.cpp b/iokit/Kernel/IOPerfControl.cpp index f90699c34..b3d1a5aac 100644 --- a/iokit/Kernel/IOPerfControl.cpp +++ b/iokit/Kernel/IOPerfControl.cpp @@ -12,97 +12,148 @@ #define super OSObject OSDefineMetaClassAndStructors(IOPerfControlClient, OSObject); +static IOPerfControlClient::IOPerfControlClientShared *_Atomic gIOPerfControlClientShared; + bool IOPerfControlClient::init(IOService *driver, uint64_t maxWorkCapacity) { + // TODO: Remove this limit and implement dynamic table growth if workloads are found that exceed this + if (maxWorkCapacity > kMaxWorkTableNumEntries) { + maxWorkCapacity = kMaxWorkTableNumEntries; + } + if (!super::init()) { return false; } - interface = PerfControllerInterface{ - .version = 0, - .registerDevice = - [](IOService *device) { - return kIOReturnSuccess; - }, - .unregisterDevice = + shared = atomic_load_explicit(&gIOPerfControlClientShared, memory_order_acquire); + if (shared == nullptr) { + IOPerfControlClient::IOPerfControlClientShared *expected = shared; + shared = reinterpret_cast(kalloc(sizeof(IOPerfControlClientShared))); + if (!shared) { + return false; + } + + atomic_init(&shared->maxDriverIndex, 0); + + shared->interface = PerfControllerInterface{ + .version = 0, + .registerDevice = [](IOService *device) { return kIOReturnSuccess; }, - .workCanSubmit = - [](IOService *device, PerfControllerInterface::WorkState *state, WorkSubmitArgs *args) { - return false; - }, - .workSubmit = - [](IOService *device, uint64_t token, PerfControllerInterface::WorkState *state, WorkSubmitArgs *args) { - }, - .workBegin = - [](IOService *device, uint64_t token, PerfControllerInterface::WorkState *state, WorkBeginArgs *args) { - }, - .workEnd = - [](IOService *device, uint64_t token, PerfControllerInterface::WorkState *state, WorkEndArgs *args, bool done) { - }, - }; + .unregisterDevice = + [](IOService *device) { + return kIOReturnSuccess; + }, + .workCanSubmit = + [](IOService *device, PerfControllerInterface::WorkState *state, WorkSubmitArgs *args) { + return false; + }, + .workSubmit = + [](IOService *device, uint64_t token, PerfControllerInterface::WorkState *state, WorkSubmitArgs *args) { + }, + .workBegin = + [](IOService *device, uint64_t token, PerfControllerInterface::WorkState *state, WorkBeginArgs *args) { + }, + .workEnd = + [](IOService *device, uint64_t token, PerfControllerInterface::WorkState *state, WorkEndArgs *args, bool done) { + }, + }; + + shared->interfaceLock = IOLockAlloc(); + if (!shared->interfaceLock) { + goto shared_init_error; + } - interfaceLock = IOLockAlloc(); - if (!interfaceLock) { - goto error; - } + shared->deviceRegistrationList = OSSet::withCapacity(4); + if (!shared->deviceRegistrationList) { + goto shared_init_error; + } - deviceRegistrationList = OSSet::withCapacity(4); - if (!deviceRegistrationList) { - goto error; + if (!atomic_compare_exchange_strong_explicit(&gIOPerfControlClientShared, &expected, shared, memory_order_acq_rel, + memory_order_acquire)) { + IOLockFree(shared->interfaceLock); + shared->deviceRegistrationList->release(); + kfree(shared, sizeof(*shared)); + shared = expected; + } } - bzero(workTable, sizeof(workTable)); - memset(&workTable[kIOPerfControlClientWorkUntracked], ~0, sizeof(WorkTableEntry)); - workTableNextIndex = kIOPerfControlClientWorkUntracked + 1; + driverIndex = atomic_fetch_add_explicit(&shared->maxDriverIndex, 1, memory_order_relaxed) + 1; + assertf(driverIndex != 0, "Overflow in driverIndex. Too many IOPerfControlClients created.\n"); - workTableLock = IOSimpleLockAlloc(); - if (!workTableLock) { - goto error; - } + // + 1 since index 0 is unused for kIOPerfControlClientWorkUntracked + workTableLength = maxWorkCapacity + 1; + assertf(workTableLength <= kWorkTableMaxSize, "%zu exceeds max allowed capacity of %zu", workTableLength, kWorkTableMaxSize); + if (maxWorkCapacity > 0) { + workTable = reinterpret_cast(kalloc(workTableLength * sizeof(WorkTableEntry))); + if (!workTable) { + goto error; + } + bzero(workTable, workTableLength * sizeof(WorkTableEntry)); + workTableNextIndex = 1; - // TODO: check sum(maxWorkCapacities) < table size + workTableLock = IOSimpleLockAlloc(); + if (!workTableLock) { + goto error; + } + } return true; error: - if (interfaceLock) { - IOLockFree(interfaceLock); - } - if (deviceRegistrationList) { - deviceRegistrationList->release(); + if (workTable) { + kfree(workTable, maxWorkCapacity * sizeof(WorkTableEntry)); } if (workTableLock) { IOSimpleLockFree(workTableLock); } return false; +shared_init_error: + if (shared) { + if (shared->interfaceLock) { + IOLockFree(shared->interfaceLock); + } + if (shared->deviceRegistrationList) { + shared->deviceRegistrationList->release(); + } + kfree(shared, sizeof(*shared)); + shared = nullptr; + } + return false; } -IOPerfControlClient *_Atomic gSharedClient = nullptr; - IOPerfControlClient * IOPerfControlClient::copyClient(IOService *driver, uint64_t maxWorkCapacity) { - IOPerfControlClient *client = atomic_load_explicit(&gSharedClient, memory_order_acquire); - if (client == nullptr) { - IOPerfControlClient *expected = client; - client = new IOPerfControlClient; - if (!client || !client->init(driver, maxWorkCapacity)) { - panic("could not create IOPerfControlClient"); - } - if (!atomic_compare_exchange_strong_explicit(&gSharedClient, &expected, client, memory_order_acq_rel, - memory_order_acquire)) { - client->release(); - client = expected; - } + IOPerfControlClient *client = new IOPerfControlClient; + if (!client || !client->init(driver, maxWorkCapacity)) { + panic("could not create IOPerfControlClient"); } - // TODO: add maxWorkCapacity to existing client - client->retain(); return client; } +/* Convert the per driver token into a globally unique token for the performance + * controller's consumption. This is achieved by setting the driver's unique + * index onto the high order bits. The performance controller is shared between + * all drivers and must track all instances separately, while each driver has + * its own token table, so this step is needed to avoid token collisions between + * drivers. + */ +inline uint64_t +IOPerfControlClient::tokenToGlobalUniqueToken(uint64_t token) +{ + return token | (static_cast(driverIndex) << kWorkTableIndexBits); +} + +/* With this implementation, tokens returned to the driver differ from tokens + * passed to the performance controller. This implementation has the nice + * property that tokens returns to the driver will aways be between 1 and + * the value of maxWorkCapacity passed by the driver to copyClient. The tokens + * the performance controller sees will match on the lower order bits and have + * the driver index set on the high order bits. + */ uint64_t IOPerfControlClient::allocateToken(thread_group *thread_group) { @@ -124,7 +175,7 @@ IOPerfControlClient::getEntryForToken(uint64_t token, IOPerfControlClient::WorkT return false; } - if (token >= kWorkTableNumEntries) { + if (token >= workTableLength) { panic("Invalid work token (%llu): index out of bounds.", token); } @@ -141,7 +192,7 @@ IOPerfControlClient::markEntryStarted(uint64_t token, bool started) return; } - if (token >= kWorkTableNumEntries) { + if (token >= workTableLength) { panic("Invalid work token (%llu): index out of bounds.", token); } @@ -153,15 +204,15 @@ IOPerfControlClient::registerDevice(__unused IOService *driver, IOService *devic { IOReturn ret = kIOReturnSuccess; - IOLockLock(interfaceLock); + IOLockLock(shared->interfaceLock); - if (interface.version > 0) { - ret = interface.registerDevice(device); + if (shared->interface.version > 0) { + ret = shared->interface.registerDevice(device); } else { - deviceRegistrationList->setObject(device); + shared->deviceRegistrationList->setObject(device); } - IOLockUnlock(interfaceLock); + IOLockUnlock(shared->interfaceLock); return ret; } @@ -169,15 +220,15 @@ IOPerfControlClient::registerDevice(__unused IOService *driver, IOService *devic void IOPerfControlClient::unregisterDevice(__unused IOService *driver, IOService *device) { - IOLockLock(interfaceLock); + IOLockLock(shared->interfaceLock); - if (interface.version > 0) { - interface.unregisterDevice(device); + if (shared->interface.version > 0) { + shared->interface.unregisterDevice(device); } else { - deviceRegistrationList->removeObject(device); + shared->deviceRegistrationList->removeObject(device); } - IOLockUnlock(interfaceLock); + IOLockUnlock(shared->interfaceLock); } uint64_t @@ -207,25 +258,25 @@ IOPerfControlClient::registerPerformanceController(PerfControllerInterface pci) { IOReturn result = kIOReturnError; - IOLockLock(interfaceLock); + IOLockLock(shared->interfaceLock); - if (interface.version == 0 && pci.version > 0) { + if (shared->interface.version == 0 && pci.version > 0) { assert(pci.registerDevice && pci.unregisterDevice && pci.workCanSubmit && pci.workSubmit && pci.workBegin && pci.workEnd); result = kIOReturnSuccess; OSObject *obj; - while ((obj = deviceRegistrationList->getAnyObject())) { + while ((obj = shared->deviceRegistrationList->getAnyObject())) { IOService *device = OSDynamicCast(IOService, obj); if (device) { pci.registerDevice(device); } - deviceRegistrationList->removeObject(obj); + shared->deviceRegistrationList->removeObject(obj); } - interface = pci; + shared->interface = pci; } - IOLockUnlock(interfaceLock); + IOLockUnlock(shared->interfaceLock); return result; } diff --git a/iokit/Kernel/IOPlatformExpert.cpp b/iokit/Kernel/IOPlatformExpert.cpp index d7087fbb2..1fb74c642 100644 --- a/iokit/Kernel/IOPlatformExpert.cpp +++ b/iokit/Kernel/IOPlatformExpert.cpp @@ -40,6 +40,7 @@ #include #include #include +#include #include #include @@ -56,15 +57,26 @@ extern "C" { #define kShutdownTimeout 30 //in secs -#if !CONFIG_EMBEDDED +#if defined(XNU_TARGET_OS_OSX) boolean_t coprocessor_cross_panic_enabled = TRUE; -#define APPLE_SECURE_BOOT_VARIABLE_GUID "94b73556-2197-4702-82a8-3e1337dafbfb" -#endif /* !CONFIG_EMBEDDED */ +#define APPLE_VENDOR_VARIABLE_GUID "4d1ede05-38c7-4a6a-9cc6-4bcca8b38c14" +#endif /* defined(XNU_TARGET_OS_OSX) */ void printDictionaryKeys(OSDictionary * inDictionary, char * inMsg); static void getCStringForObject(OSObject *inObj, char *outStr, size_t outStrLen); +/* + * There are drivers which take mutexes in the quiesce callout or pass + * the quiesce/active action to super. Even though it sometimes panics, + * because it doesn't *always* panic, they get away with it. + * We need a chicken bit to diagnose and fix them all before this + * can be enabled by default. + * + * tracks turning this on by default. + */ +uint32_t gEnforceQuiesceSafety = 0; + /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ #define super IOService @@ -133,7 +145,7 @@ IOPlatformExpert::start( IOService * provider ) // Register the presence or lack thereof a system // PCI address mapper with the IOMapper class - IOMapper::setMapperRequired(0 != getProperty(kIOPlatformMapperPresentKey)); + IOMapper::setMapperRequired(NULL != getProperty(kIOPlatformMapperPresentKey)); gIOInterruptControllers = OSDictionary::withCapacity(1); gIOInterruptControllersLock = IOLockAlloc(); @@ -172,6 +184,9 @@ IOPlatformExpert::start( IOService * provider ) } #endif + PE_parse_boot_argn("enforce_quiesce_safety", &gEnforceQuiesceSafety, + sizeof(gEnforceQuiesceSafety)); + return configure(provider); } @@ -190,7 +205,7 @@ IOPlatformExpert::configure( IOService * provider ) dict->retain(); topLevel->removeObject( dict ); nub = createNub( dict ); - if (0 == nub) { + if (NULL == nub) { continue; } dict->release(); @@ -211,7 +226,7 @@ IOPlatformExpert::createNub( OSDictionary * from ) if (nub) { if (!nub->init( from )) { nub->release(); - nub = 0; + nub = NULL; } } return nub; @@ -291,7 +306,7 @@ IOPlatformExpert::getPhysicalRangeAllocator(void) getProperty("Platform Memory Ranges")); } -int (*PE_halt_restart)(unsigned int type) = 0; +int (*PE_halt_restart)(unsigned int type) = NULL; int IOPlatformExpert::haltRestart(unsigned int type) @@ -408,7 +423,7 @@ IOPlatformExpert::lookUpInterruptController(OSSymbol *name) while (1) { object = gIOInterruptControllers->getObject(name); - if (object != 0) { + if (object != NULL) { break; } @@ -825,6 +840,9 @@ getCStringForObject(OSObject *inObj, char *outStr, size_t outStrLen) /* IOShutdownNotificationsTimedOut * - Called from a timer installed by PEHaltRestart */ +#ifdef CONFIG_EMBEDDED +__abortlike +#endif static void IOShutdownNotificationsTimedOut( thread_call_param_t p0, @@ -900,6 +918,13 @@ PEHaltRestart(unsigned int type) static boolean_t panic_begin_called = FALSE; if (type == kPEHaltCPU || type == kPERestartCPU || type == kPEUPSDelayHaltCPU) { + /* If we're in the panic path, the locks and memory allocations required below + * could fail. So just try to reboot instead of risking a nested panic. + */ + if (panic_begin_called) { + goto skip_to_haltRestart; + } + pmRootDomain = IOService::getPMRootDomain(); /* Notify IOKit PM clients of shutdown/restart * Clients subscribe to this message with a call to @@ -924,10 +949,20 @@ PEHaltRestart(unsigned int type) } } - shutdown_hang = thread_call_allocate( &IOShutdownNotificationsTimedOut, - (thread_call_param_t)(uintptr_t) type); - clock_interval_to_deadline( timeout, kSecondScale, &deadline ); - thread_call_enter1_delayed( shutdown_hang, (thread_call_param_t)(uintptr_t)timeout, deadline ); +#if (DEVELOPMENT || DEBUG) + /* Override the default timeout via a boot-arg */ + uint32_t boot_arg_val; + if (PE_parse_boot_argn("halt_restart_timeout", &boot_arg_val, sizeof(boot_arg_val))) { + timeout = boot_arg_val; + } +#endif + + if (timeout) { + shutdown_hang = thread_call_allocate( &IOShutdownNotificationsTimedOut, + (thread_call_param_t)(uintptr_t) type); + clock_interval_to_deadline( timeout, kSecondScale, &deadline ); + thread_call_enter1_delayed( shutdown_hang, (thread_call_param_t)(uintptr_t)timeout, deadline ); + } pmRootDomain->handlePlatformHaltRestart(type); /* This notification should have few clients who all do @@ -938,7 +973,8 @@ PEHaltRestart(unsigned int type) * later. PM internals make it very hard to wait for asynchronous * replies. */ - } else if (type == kPEPanicRestartCPU || type == kPEPanicSync) { + } else if (type == kPEPanicRestartCPU || type == kPEPanicSync || type == kPEPanicRestartCPUNoPanicEndCallouts || + type == kPEPanicRestartCPUNoCallouts) { if (type == kPEPanicRestartCPU) { // Notify any listeners that we're done collecting // panic data before we call through to do the restart @@ -946,13 +982,20 @@ PEHaltRestart(unsigned int type) if (coprocessor_cross_panic_enabled) #endif IOCPURunPlatformPanicActions(kPEPanicEnd); + } + if ((type == kPEPanicRestartCPU) || (type == kPEPanicRestartCPUNoPanicEndCallouts)) { // Callout to shutdown the disk driver once we've returned from the - // kPEPanicEnd callback (and we know all core dumps on this system - // are complete). + // kPEPanicEnd callbacks (if appropriate) and we know all coredumps + // on this system are complete). IOCPURunPlatformPanicActions(kPEPanicDiskShutdown); } + if (type == kPEPanicRestartCPUNoPanicEndCallouts || type == kPEPanicRestartCPUNoCallouts) { + // Replace the wrapper type with the type drivers handle + type = kPEPanicRestartCPU; + } + // Do an initial sync to flush as much panic data as possible, // in case we have a problem in one of the platorm panic handlers. // After running the platform handlers, do a final sync w/ @@ -978,6 +1021,7 @@ PEHaltRestart(unsigned int type) } } +skip_to_haltRestart: if (gIOPlatform) { return gIOPlatform->haltRestart(type); } else { @@ -988,7 +1032,7 @@ PEHaltRestart(unsigned int type) UInt32 PESavePanicInfo(UInt8 *buffer, UInt32 length) { - if (gIOPlatform != 0) { + if (gIOPlatform != NULL) { return gIOPlatform->savePanicInfo(buffer, length); } else { return 0; @@ -1268,7 +1312,7 @@ IOPlatformExpert::registerNVRAMController(IONVRAMController * caller) { OSData * data; IORegistryEntry * entry; - OSString * string = 0; + OSString * string = NULL; uuid_string_t uuid; #if CONFIG_EMBEDDED @@ -1302,20 +1346,22 @@ IOPlatformExpert::registerNVRAMController(IONVRAMController * caller) entry->release(); } -#else /* !CONFIG_EMBEDDED */ +#endif /* CONFIG_EMBEDDED */ + +#if defined(XNU_TARGET_OS_OSX) /* - * If we have panic debugging enabled and a prod-fused coprocessor, + * If we have panic debugging enabled and the bridgeOS panic SoC watchdog is enabled, * disable cross panics so that the co-processor doesn't cause the system * to reset when we enter the debugger or hit a panic on the x86 side. */ if (panicDebugging) { entry = IORegistryEntry::fromPath( "/options", gIODTPlane ); if (entry) { - data = OSDynamicCast( OSData, entry->getProperty( APPLE_SECURE_BOOT_VARIABLE_GUID":EffectiveProductionStatus" )); + data = OSDynamicCast( OSData, entry->getProperty( APPLE_VENDOR_VARIABLE_GUID":BridgeOSPanicWatchdogEnabled" )); if (data && (data->getLength() == sizeof(UInt8))) { - UInt8 *isProdFused = (UInt8 *) data->getBytesNoCopy(); + UInt8 *panicWatchdogEnabled = (UInt8 *) data->getBytesNoCopy(); UInt32 debug_flags = 0; - if (*isProdFused || (PE_i_can_has_debugger(&debug_flags) && + if (*panicWatchdogEnabled || (PE_i_can_has_debugger(&debug_flags) && (debug_flags & DB_DISABLE_CROSS_PANIC))) { coprocessor_cross_panic_enabled = FALSE; } @@ -1346,9 +1392,9 @@ IOPlatformExpert::registerNVRAMController(IONVRAMController * caller) entry->release(); } -#endif /* !CONFIG_EMBEDDED */ +#endif /* defined(XNU_TARGET_OS_OSX) */ - if (string == 0) { + if (string == NULL) { entry = IORegistryEntry::fromPath( "/options", gIODTPlane ); if (entry) { data = OSDynamicCast( OSData, entry->getProperty( "platform-uuid" )); @@ -1379,17 +1425,29 @@ IOPlatformExpert::callPlatformFunction(const OSSymbol *functionName, { IOService *service, *_resources; + if (functionName == gIOPlatformQuiesceActionKey || + functionName == gIOPlatformActiveActionKey) { + /* + * Services which register for IOPlatformQuiesceAction / IOPlatformActiveAction + * must consume that event themselves, without passing it up to super/IOPlatformExpert. + */ + if (gEnforceQuiesceSafety) { + panic("Class %s passed the quiesce/active action to IOPlatformExpert", + getMetaClass()->getClassName()); + } + } + if (waitForFunction) { _resources = waitForService(resourceMatching(functionName)); } else { _resources = getResourceService(); } - if (_resources == 0) { + if (_resources == NULL) { return kIOReturnUnsupported; } service = OSDynamicCast(IOService, _resources->getProperty(functionName)); - if (service == 0) { + if (service == NULL) { return kIOReturnUnsupported; } @@ -1426,12 +1484,12 @@ IODTPlatformExpert::probe( IOService * provider, SInt32 * score ) { if (!super::probe( provider, score)) { - return 0; + return NULL; } // check machine types if (!provider->compareNames( getProperty( gIONameMatchKey ))) { - return 0; + return NULL; } return this; @@ -1458,7 +1516,7 @@ IODTPlatformExpert::createNub( IORegistryEntry * from ) if (nub) { if (!nub->init( from, gIODTPlane )) { nub->free(); - nub = 0; + nub = NULL; } } return nub; @@ -1473,7 +1531,7 @@ IODTPlatformExpert::createNubs( IOService * parent, OSIterator * iter ) if (iter) { while ((next = (IORegistryEntry *) iter->getNextObject())) { - if (0 == (nub = createNub( next ))) { + if (NULL == (nub = createNub( next ))) { continue; } @@ -1510,7 +1568,7 @@ IODTPlatformExpert::processTopLevel( IORegistryEntry * rootEntry ) if (dtNVRAM) { if (!dtNVRAM->init(options, gIODTPlane)) { dtNVRAM->release(); - dtNVRAM = 0; + dtNVRAM = NULL; } else { dtNVRAM->attach(this); dtNVRAM->registerService(); @@ -1522,7 +1580,7 @@ IODTPlatformExpert::processTopLevel( IORegistryEntry * rootEntry ) // Publish the cpus. cpus = rootEntry->childFromPath( "cpus", gIODTPlane); if (cpus) { - createNubs( this, IODTFindMatchingEntries( cpus, kIODTExclusive, 0)); + createNubs( this, IODTFindMatchingEntries( cpus, kIODTExclusive, NULL)); cpus->release(); } @@ -1537,7 +1595,7 @@ IODTPlatformExpert::getNubResources( IOService * nub ) return kIOReturnSuccess; } - IODTResolveAddressing( nub, "reg", 0); + IODTResolveAddressing( nub, "reg", NULL); return kIOReturnSuccess; } @@ -1595,7 +1653,7 @@ IODTPlatformExpert::getMachineName( char * name, int maxLength ) maxLength--; prop = (OSData *) getProvider()->getProperty( gIODTModelKey ); - ok = (0 != prop); + ok = (NULL != prop); if (ok) { strlcpy( name, (const char *) prop->getBytesNoCopy(), maxLength ); @@ -1678,7 +1736,7 @@ IODTPlatformExpert::getNVRAMPartitions(void) if (dtNVRAM) { return dtNVRAM->getNVRAMPartitions(); } else { - return 0; + return NULL; } } @@ -1786,7 +1844,7 @@ bool IOPlatformExpertDevice::initWithArgs( void * dtTop, void * p2, void * p3, void * p4 ) { - IORegistryEntry * dt = 0; + IORegistryEntry * dt = NULL; bool ok; // dtTop may be zero on non- device tree systems @@ -1826,8 +1884,8 @@ IOPlatformExpertDevice::newUserClient( task_t owningTask, void * securityID, IOUserClient ** handler ) { IOReturn err = kIOReturnSuccess; - IOUserClient * newConnect = 0; - IOUserClient * theConnect = 0; + IOUserClient * newConnect = NULL; + IOUserClient * theConnect = NULL; switch (type) { case kIOKitDiagnosticsClientType: @@ -1836,6 +1894,12 @@ IOPlatformExpertDevice::newUserClient( task_t owningTask, void * securityID, err = kIOReturnNotPermitted; } break; + case kIOKitUserServerClientType: + newConnect = IOUserServer::withTask(owningTask); + if (!newConnect) { + err = kIOReturnNotPermitted; + } + break; default: err = kIOReturnBadArgument; } diff --git a/iokit/Kernel/IOPolledInterface.cpp b/iokit/Kernel/IOPolledInterface.cpp index cb982b0d6..d36c0c6db 100644 --- a/iokit/Kernel/IOPolledInterface.cpp +++ b/iokit/Kernel/IOPolledInterface.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2006-2009 Apple Inc. All rights reserved. + * Copyright (c) 2006-2019 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -73,7 +73,7 @@ enum { kDefaultIOSize = 128 * 1024 }; class IOPolledFilePollers : public OSObject { - OSDeclareDefaultStructors(IOPolledFilePollers) + OSDeclareDefaultStructors(IOPolledFilePollers); public: IOService * media; @@ -188,7 +188,7 @@ IOPolledFilePollersOpen(IOPolledFileIOVars * filevars, uint32_t state, bool abor int32_t idx; vars->abortable = abortable; - ioBuffer = 0; + ioBuffer = NULL; if (kIOPolledAfterSleepState == state) { vars->ioStatus = 0; @@ -270,7 +270,7 @@ IOPolledFilePollersClose(IOPolledFileIOVars * filevars, uint32_t state) } if (vars->ioBuffer) { vars->ioBuffer->release(); - vars->ioBuffer = 0; + vars->ioBuffer = NULL; } }while (false); } @@ -348,7 +348,7 @@ IOStartPolledIO(IOPolledFilePollers * vars, return err; } - completion.target = 0; + completion.target = NULL; completion.action = &IOPolledIOComplete; completion.parameter = vars; @@ -452,11 +452,11 @@ IOCopyMediaForDev(dev_t device) OSDictionary * matching; OSNumber * num; OSIterator * iter; - IOService * result = 0; + IOService * result = NULL; matching = IOService::serviceMatching("IOMedia"); if (!matching) { - return 0; + return NULL; } do{ num = OSNumber::withNumber(major(device), 32); @@ -489,13 +489,15 @@ IOCopyMediaForDev(dev_t device) #define APFSMEDIA_GETHIBERKEY "getHiberKey" static IOReturn -IOGetVolumeCryptKey(dev_t block_dev, OSString ** pKeyUUID, - uint8_t * volumeCryptKey, size_t * keySize) +IOGetVolumeCryptKey(dev_t block_dev, + LIBKERN_RETURNS_RETAINED OSString ** pKeyUUID, + uint8_t * volumeCryptKey, + size_t * keySize) { IOReturn err; IOService * part; - OSString * keyUUID = 0; - OSString * keyStoreUUID = 0; + OSString * keyUUID = NULL; + OSString * keyStoreUUID = NULL; uuid_t volumeKeyUUID; aks_volume_key_t vek; size_t callerKeySize; @@ -585,7 +587,7 @@ IOPolledFileOpen(const char * filename, _OpenFileContext ctx; OSData * extentsData = NULL; OSNumber * num; - IOService * part = 0; + IOService * part = NULL; dev_t block_dev; dev_t image_dev; AbsoluteTime startTime, endTime; @@ -694,7 +696,7 @@ IOPolledFileOpen(const char * filename, (void *) part, (void *) str2, (void *) (uintptr_t) true, (void *) &data); #else - data = 0; + data = NULL; err = kIOReturnSuccess; #endif if (kIOReturnSuccess != err) { @@ -713,7 +715,7 @@ IOPolledFileOpen(const char * filename, if (kIOReturnSuccess != err) { HIBLOG("error 0x%x opening polled file\n", err); - IOPolledFileClose(&vars, 0, 0, 0, 0, 0); + IOPolledFileClose(&vars, 0, NULL, 0, 0, 0); if (extentsData) { extentsData->release(); } @@ -747,11 +749,11 @@ IOPolledFileClose(IOPolledFileIOVars ** pVars, } if (vars->fileExtents) { vars->fileExtents->release(); - vars->fileExtents = 0; + vars->fileExtents = NULL; } if (vars->pollers) { vars->pollers->release(); - vars->pollers = 0; + vars->pollers = NULL; } if (vars->allocated) { @@ -1032,7 +1034,7 @@ IOPolledFileRead(IOPolledFileIOVars * vars, if ((vars->bufferOffset == vars->bufferLimit) && (vars->position < vars->readEnd)) { if (!vars->pollers->io) { - cryptvars = 0; + cryptvars = NULL; } err = IOPolledFilePollersIODone(vars->pollers, true); if (kIOReturnSuccess != err) { diff --git a/iokit/Kernel/IORangeAllocator.cpp b/iokit/Kernel/IORangeAllocator.cpp index aca4f6790..6e1148697 100644 --- a/iokit/Kernel/IORangeAllocator.cpp +++ b/iokit/Kernel/IORangeAllocator.cpp @@ -82,7 +82,7 @@ IORangeAllocator::init( IORangeScalar endOfRange, capacity = 0; capacityIncrement = _capacity; numElements = 0; - elements = 0; + elements = NULL; defaultAlignmentMask = _defaultAlignment - 1; options = _options; @@ -110,7 +110,7 @@ IORangeAllocator::withRange( if (thingy && !thingy->init( endOfRange, defaultAlignment, capacity, options )) { thingy->release(); - thingy = 0; + thingy = NULL; } return thingy; diff --git a/iokit/Kernel/IORegistryEntry.cpp b/iokit/Kernel/IORegistryEntry.cpp index 31e8ca1a9..45b3f42e6 100644 --- a/iokit/Kernel/IORegistryEntry.cpp +++ b/iokit/Kernel/IORegistryEntry.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 1998-2006 Apple Computer, Inc. All rights reserved. + * Copyright (c) 1998-2019 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -90,7 +90,7 @@ static uint64_t gIORegistryLastID = kIORegistryIDReserved; class IORegistryPlane : public OSObject { friend class IORegistryEntry; - OSDeclareAbstractStructors(IORegistryPlane) + OSDeclareAbstractStructors(IORegistryPlane); const OSSymbol * nameKey; const OSSymbol * keys[kNumSetIndex]; @@ -255,7 +255,7 @@ IORegistryEntry::makePlane( const char * name ) if (nameKey) { nameKey->release(); } - plane = 0; + plane = NULL; } return plane; @@ -450,6 +450,7 @@ IORegistryEntry::free( void ) void IORegistryEntry::setPropertyTable( OSDictionary * dict ) { + PLOCK; if (dict) { dict->retain(); } @@ -458,6 +459,7 @@ IORegistryEntry::setPropertyTable( OSDictionary * dict ) } fPropertyTable = dict; + PUNLOCK; } /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ @@ -487,13 +489,13 @@ IORegistryEntry::getProperty( type * aKey, \ { \ OSObject * obj = getProperty( aKey ); \ \ - if ( (0 == obj) && plane && (options & kIORegistryIterateRecursively) ) { \ + if ( (NULL == obj) && plane && (options & kIORegistryIterateRecursively) ) { \ IORegistryEntry * entry = (IORegistryEntry *) this; \ IORegistryIterator * iter; \ iter = IORegistryIterator::iterateOver( entry, plane, options ); \ \ if(iter) { \ - while ( (0 == obj) && (entry = iter->getNextObject()) ) { \ + while ( (NULL == obj) && (entry = iter->getNextObject()) ) { \ obj = entry->getProperty( aKey ); \ } \ iter->release(); \ @@ -511,13 +513,13 @@ IORegistryEntry::copyProperty( type * aKey, \ { \ OSObject * obj = copyProperty( aKey ); \ \ - if ( (0 == obj) && plane && (options & kIORegistryIterateRecursively) ) { \ + if ( (NULL == obj) && plane && (options & kIORegistryIterateRecursively) ) { \ IORegistryEntry * entry = (IORegistryEntry *) this; \ IORegistryIterator * iter; \ iter = IORegistryIterator::iterateOver( entry, plane, options ); \ \ if(iter) { \ - while ( (0 == obj) && (entry = iter->getNextObject()) ) { \ + while ( (NULL == obj) && (entry = iter->getNextObject()) ) { \ obj = entry->copyProperty( aKey ); \ } \ iter->release(); \ @@ -796,14 +798,14 @@ IORegistryEntry::setIndexedProperty(uint32_t index, OSObject * anObject) OSObject * prior; if (index >= kIORegistryEntryIndexedPropertyCount) { - return 0; + return NULL; } array = atomic_load_explicit(&reserved->fIndexedProperties, memory_order_acquire); if (!array) { array = IONew(OSObject *, kIORegistryEntryIndexedPropertyCount); if (!array) { - return 0; + return NULL; } bzero(array, kIORegistryEntryIndexedPropertyCount * sizeof(array[0])); if (!OSCompareAndSwapPtr(NULL, array, &reserved->fIndexedProperties)) { @@ -811,7 +813,7 @@ IORegistryEntry::setIndexedProperty(uint32_t index, OSObject * anObject) } } if (!reserved->fIndexedProperties) { - return 0; + return NULL; } prior = reserved->fIndexedProperties[index]; @@ -827,10 +829,10 @@ OSObject * IORegistryEntry::getIndexedProperty(uint32_t index) const { if (index >= kIORegistryEntryIndexedPropertyCount) { - return 0; + return NULL; } if (!reserved->fIndexedProperties) { - return 0; + return NULL; } return reserved->fIndexedProperties[index]; @@ -843,7 +845,7 @@ IORegistryEntry::getIndexedProperty(uint32_t index) const const char * IORegistryEntry::getName( const IORegistryPlane * plane ) const { - OSSymbol * sym = 0; + OSSymbol * sym = NULL; RLOCK; if (plane) { @@ -865,7 +867,7 @@ const OSSymbol * IORegistryEntry::copyName( const IORegistryPlane * plane ) const { - OSSymbol * sym = 0; + OSSymbol * sym = NULL; RLOCK; if (plane) { @@ -890,7 +892,7 @@ const OSSymbol * IORegistryEntry::copyLocation( const IORegistryPlane * plane ) const { - OSSymbol * sym = 0; + OSSymbol * sym = NULL; RLOCK; if (plane) { @@ -911,7 +913,7 @@ const char * IORegistryEntry::getLocation( const IORegistryPlane * plane ) const { const OSSymbol * sym = copyLocation( plane ); - const char * result = 0; + const char * result = NULL; if (sym) { result = sym->getCStringNoCopy(); @@ -963,6 +965,17 @@ IORegistryEntry::setName( const char * name, } } +void +IORegistryEntry::setName( const OSString * name, + const IORegistryPlane * plane ) +{ + const OSSymbol * sym = OSSymbol::withString( name ); + if (sym) { + setName( sym, plane ); + sym->release(); + } +} + void IORegistryEntry::setLocation( const OSSymbol * location, const IORegistryPlane * plane ) @@ -1018,12 +1031,12 @@ IORegistryEntry::compareNames( OSObject * names, OSString ** matched ) const { OSString * string; OSCollection * collection; - OSIterator * iter = 0; + OSIterator * iter = NULL; bool result = false; if ((collection = OSDynamicCast( OSCollection, names))) { iter = OSCollectionIterator::withCollection( collection ); - string = 0; + string = NULL; } else { string = OSDynamicCast( OSString, names); } @@ -1100,7 +1113,7 @@ IORegistryEntry::getPath( char * path, int * length, stack->setObject((OSObject *) entry ); } - ok = (0 != parent); + ok = (NULL != parent); if (ok) { index = stack->getCount(); if (0 == index) { @@ -1184,7 +1197,7 @@ IORegistryEntry::matchPathLocation( const char * cmp, const IORegistryPlane * plane ) { const char * str; - const char * result = 0; + const char * result = NULL; u_quad_t num1, num2; char lastPathChar, lastLocationChar; @@ -1233,11 +1246,11 @@ IORegistryEntry * IORegistryEntry::getChildFromComponent( const char ** opath, const IORegistryPlane * plane ) { - IORegistryEntry * entry = 0; + IORegistryEntry * entry = NULL; OSArray * set; unsigned int index; const char * path; - const char * cmp = 0; + const char * cmp = NULL; char c; size_t len; const char * str; @@ -1287,7 +1300,7 @@ IORegistryEntry::hasAlias( const IORegistryPlane * plane, IORegistryEntry * entry; IORegistryEntry * entry2; const OSSymbol * key; - const OSSymbol * bestKey = 0; + const OSSymbol * bestKey = NULL; OSIterator * iter; OSData * data; const char * path = "/aliases"; @@ -1328,7 +1341,7 @@ IORegistryEntry::dealiasPath( IORegistryEntry * entry; OSData * data; const char * path = *opath; - const char * rpath = 0; + const char * rpath = NULL; const char * end; char c; char temp[kIOMaxPlaneName + 1]; @@ -1371,8 +1384,8 @@ IORegistryEntry::fromPath( int * length, IORegistryEntry * fromEntry ) { - IORegistryEntry * where = 0; - IORegistryEntry * aliasEntry = 0; + IORegistryEntry * where = NULL; + IORegistryEntry * aliasEntry = NULL; IORegistryEntry * next; const char * alias; const char * end; @@ -1381,11 +1394,11 @@ IORegistryEntry::fromPath( char c; char temp[kIOMaxPlaneName + 1]; - if (0 == path) { - return 0; + if (NULL == path) { + return NULL; } - if (0 == plane) { + if (NULL == plane) { // get plane name end = strchr( path, ':' ); if (end && ((end - path) < kIOMaxPlaneName)) { @@ -1394,8 +1407,8 @@ IORegistryEntry::fromPath( path = end + 1; } } - if (0 == plane) { - return 0; + if (NULL == plane) { + return NULL; } // check for alias @@ -1417,19 +1430,19 @@ IORegistryEntry::fromPath( RLOCK; do { - if (0 == where) { - if ((0 == fromEntry) && (*path++ == '/')) { + if (NULL == where) { + if ((NULL == fromEntry) && (*path++ == '/')) { fromEntry = gRegistryRoot->getChildEntry( plane ); } where = fromEntry; - if (0 == where) { + if (NULL == where) { break; } } else { c = *path++; if (c != '/') { if (c && (c != ':')) { // check valid terminator - where = 0; + where = NULL; } break; } @@ -1455,7 +1468,7 @@ IORegistryEntry::fromPath( *length = (len + len2); } else if (path[0]) { // no residual path => must be no tail for success - where = 0; + where = NULL; } } @@ -1523,7 +1536,7 @@ IORegistryEntry::makeLink( IORegistryEntry * to, } } else { links = OSArray::withObjects((const OSObject **) &to, 1, 1 ); - result = (links != 0); + result = (links != NULL); if (result) { result = registryTable()->setObject( plane->keys[relation], links ); @@ -1564,7 +1577,7 @@ IORegistryEntry::getParentSetReference( return (OSArray *) registryTable()->getObject( plane->keys[kParentSetIndex]); } else { - return 0; + return NULL; } } @@ -1576,12 +1589,12 @@ IORegistryEntry::getParentIterator( OSIterator * iter; if (!plane) { - return 0; + return NULL; } RLOCK; links = getParentSetReference( plane ); - if (0 == links) { + if (NULL == links) { links = OSArray::withCapacity( 1 ); } else { links = OSArray::withArray( links, links->getCount()); @@ -1600,7 +1613,7 @@ IORegistryEntry::getParentIterator( IORegistryEntry * IORegistryEntry::copyParentEntry( const IORegistryPlane * plane ) const { - IORegistryEntry * entry = 0; + IORegistryEntry * entry = NULL; OSArray * links; RLOCK; @@ -1635,7 +1648,7 @@ IORegistryEntry::getChildSetReference( const IORegistryPlane * plane ) const return (OSArray *) registryTable()->getObject( plane->keys[kChildSetIndex]); } else { - return 0; + return NULL; } } @@ -1646,12 +1659,12 @@ IORegistryEntry::getChildIterator( const IORegistryPlane * plane ) const OSIterator * iter; if (!plane) { - return 0; + return NULL; } RLOCK; links = getChildSetReference( plane ); - if (0 == links) { + if (NULL == links) { links = OSArray::withCapacity( 1 ); } else { links = OSArray::withArray( links, links->getCount()); @@ -1687,7 +1700,7 @@ IORegistryEntry * IORegistryEntry::copyChildEntry( const IORegistryPlane * plane ) const { - IORegistryEntry * entry = 0; + IORegistryEntry * entry = NULL; OSArray * links; RLOCK; @@ -1824,7 +1837,7 @@ IORegistryEntry::inPlane( const IORegistryPlane * plane ) const RLOCK; if (plane) { - ret = (0 != getParentSetReference( plane )); + ret = (NULL != getParentSetReference( plane )); } else { // Check to see if this is in any plane. If it is in a plane // then the registryTable will contain a key with the ParentLinks @@ -2055,7 +2068,7 @@ IORegistryEntry::detachAll( const IORegistryPlane * plane ) IORegistryIterator * regIter; regIter = IORegistryIterator::iterateOver( this, plane, true ); - if (0 == regIter) { + if (NULL == regIter) { return; } all = regIter->iterateAll(); @@ -2134,11 +2147,11 @@ IORegistryIterator::iterateOver( IORegistryEntry * root, { IORegistryIterator * create; - if (0 == root) { - return 0; + if (NULL == root) { + return NULL; } - if (0 == plane) { - return 0; + if (NULL == plane) { + return NULL; } create = new IORegistryIterator; @@ -2152,7 +2165,7 @@ IORegistryIterator::iterateOver( IORegistryEntry * root, create->options = options & ~kIORegistryIteratorInvalidFlag; } else { create->release(); - create = 0; + create = NULL; } } return create; @@ -2198,7 +2211,7 @@ IORegistryIterator::enterEntry( const IORegistryPlane * enterPlane ) assert( where); if (where) { - where->iter = 0; + where->iter = NULL; where->next = prev; where->current = prev->current; plane = enterPlane; @@ -2218,7 +2231,7 @@ IORegistryIterator::exitEntry( void ) if (where->iter) { where->iter->release(); - where->iter = 0; + where->iter = NULL; if (where->current) {// && (where != &start)) where->current->release(); } @@ -2242,7 +2255,7 @@ IORegistryIterator::reset( void ) if (done) { done->release(); - done = 0; + done = NULL; } where->current = root; @@ -2265,12 +2278,12 @@ IORegistryIterator::free( void ) IORegistryEntry * IORegistryIterator::getNextObjectFlat( void ) { - IORegistryEntry * next = 0; - OSArray * links = 0; + IORegistryEntry * next = NULL; + OSArray * links = NULL; RLOCK; - if ((0 == where->iter)) { + if ((NULL == where->iter)) { // just entered - create new iter if (isValid() && where->current @@ -2309,10 +2322,10 @@ IORegistryIterator::getNextObjectRecursive( void ) do{ next = getNextObjectFlat(); - } while ((0 == next) && exitEntry()); + } while ((NULL == next) && exitEntry()); if (next) { - if (0 == done) { + if (NULL == done) { done = OSOrderedSet::withCapacity( 10 ); } if (done->setObject((OSObject *) next)) { @@ -2339,7 +2352,7 @@ IORegistryIterator::getCurrentEntry( void ) if (isValid()) { return where->current; } else { - return 0; + return NULL; } } diff --git a/iokit/Kernel/IOService.cpp b/iokit/Kernel/IOService.cpp index 5ef9f4325..7e5abfb41 100644 --- a/iokit/Kernel/IOService.cpp +++ b/iokit/Kernel/IOService.cpp @@ -27,12 +27,12 @@ */ #include - #include #include #include #include #include +#include #include #include #include @@ -46,6 +46,7 @@ #include #include #include +#include #include #include #include @@ -55,6 +56,7 @@ #include #include #include +#include #include #include #include @@ -70,6 +72,10 @@ // disabled since lockForArbitration() can be held externally #define DEBUG_NOTIFIER_LOCKED 0 +enum{ + kIOUserServerCheckInTimeoutSecs = 120ULL +}; + #include "IOServicePrivate.h" #include "IOKitKernelInternal.h" @@ -91,6 +97,7 @@ OSDefineMetaClassAndStructors(_IOConfigThread, OSObject) OSDefineMetaClassAndStructors(_IOServiceJob, OSObject) OSDefineMetaClassAndStructors(IOResources, IOService) +OSDefineMetaClassAndStructors(IOUserResources, IOService) OSDefineMetaClassAndStructors(_IOOpenServiceIterator, OSIterator) @@ -107,8 +114,11 @@ const OSSymbol * gIOInterruptControllersKey; const OSSymbol * gIOInterruptSpecifiersKey; const OSSymbol * gIOResourcesKey; +const OSSymbol * gIOUserResourcesKey; const OSSymbol * gIOResourceMatchKey; const OSSymbol * gIOResourceMatchedKey; +const OSSymbol * gIOResourceIOKitKey; + const OSSymbol * gIOProviderClassKey; const OSSymbol * gIONameMatchKey; const OSSymbol * gIONameMatchedKey; @@ -120,12 +130,24 @@ const OSSymbol * gIOPathMatchKey; const OSSymbol * gIOMatchCategoryKey; const OSSymbol * gIODefaultMatchCategoryKey; const OSSymbol * gIOMatchedServiceCountKey; +const OSSymbol * gIOMatchedPersonalityKey; +const OSSymbol * gIORematchPersonalityKey; +const OSSymbol * gIORematchCountKey; +const OSSymbol * gIODEXTMatchCountKey; #if !CONFIG_EMBEDDED const OSSymbol * gIOServiceLegacyMatchingRegistryIDKey; #endif const OSSymbol * gIOMapperIDKey; const OSSymbol * gIOUserClientClassKey; + +const OSSymbol * gIOUserClassKey; +const OSSymbol * gIOUserServerClassKey; +const OSSymbol * gIOUserServerNameKey; +const OSSymbol * gIOUserServerTagKey; +const OSSymbol * gIOUserServerCDHashKey; +const OSSymbol * gIOUserUserClientKey; + const OSSymbol * gIOKitDebugKey; const OSSymbol * gIOCommandPoolSizeKey; @@ -158,6 +180,11 @@ const OSSymbol * gIOFirstMatchNotification; const OSSymbol * gIOTerminatedNotification; const OSSymbol * gIOWillTerminateNotification; +const OSSymbol * gIOServiceDEXTEntitlementsKey; +const OSSymbol * gIODriverKitEntitlementKey; +const OSSymbol * gIODriverKitUserClientEntitlementsKey; +const OSSymbol * gIOMatchDeferKey; + const OSSymbol * gIOGeneralInterest; const OSSymbol * gIOBusyInterest; const OSSymbol * gIOAppPowerStateInterest; @@ -179,6 +206,7 @@ static OSDictionary * gNotifications; static IORecursiveLock * gNotificationLock; static IOService * gIOResources; +static IOService * gIOUserResources; static IOService * gIOServiceRoot; static OSOrderedSet * gJobs; @@ -189,6 +217,7 @@ static int gNumConfigThreads; static int gNumWaitingThreads; static IOLock * gIOServiceBusyLock; bool gCPUsRunning; +bool gKextdWillTerminate; static thread_t gIOTerminateThread; static thread_t gIOTerminateWorkerThread; @@ -198,6 +227,10 @@ static OSArray * gIOStopList; static OSArray * gIOStopProviderList; static OSArray * gIOFinalizeList; +#if !NO_KEXTD +static OSArray * gIOMatchDeferList; +#endif + static SInt32 gIOConsoleUsersSeed; static OSData * gIOConsoleUsersSeedValue; @@ -205,10 +238,13 @@ extern const OSSymbol * gIODTPHandleKey; const OSSymbol * gIOPlatformFunctionHandlerSet; + static IOLock * gIOConsoleUsersLock; static thread_call_t gIOConsoleLockCallout; static IONotifier * gIOServiceNullNotifier; +static uint32_t gIODextRelaunchMax = 1000; + /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ #define LOCKREADNOTIFY() \ @@ -305,6 +341,35 @@ setLatencyHandler(UInt32 delayType, IOService * target, bool enable); /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ +namespace IOServicePH +{ +IONotifier * fRootNotifier; +OSArray * fUserServers; +OSArray * fUserServersWait; +OSArray * fMatchingWork; +OSArray * fMatchingDelayed; +IOService * fSystemPowerAckTo; +uint32_t fSystemPowerAckRef; +uint8_t fSystemOff; +uint8_t fUserServerOff; + +void lock(); +void unlock(); + +void init(IOPMrootDomain * root); + +IOReturn systemPowerChange( + void * target, + void * refCon, + UInt32 messageType, IOService * service, + void * messageArgument, vm_size_t argSize); + +bool matchingStart(IOService * service); +void matchingEnd(IOService * service); +}; + +/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ + void IOService::initialize( void ) { @@ -327,16 +392,36 @@ IOService::initialize( void ) kIODefaultMatchCategoryKey ); gIOMatchedServiceCountKey = OSSymbol::withCStringNoCopy( kIOMatchedServiceCountKey ); + gIOMatchedPersonalityKey = OSSymbol::withCStringNoCopy( + kIOMatchedPersonalityKey ); + gIORematchPersonalityKey = OSSymbol::withCStringNoCopy( + kIORematchPersonalityKey ); + gIORematchCountKey = OSSymbol::withCStringNoCopy( + kIORematchCountKey ); + gIODEXTMatchCountKey = OSSymbol::withCStringNoCopy( + kIODEXTMatchCountKey ); + #if !CONFIG_EMBEDDED gIOServiceLegacyMatchingRegistryIDKey = OSSymbol::withCStringNoCopy( kIOServiceLegacyMatchingRegistryIDKey ); #endif + PE_parse_boot_argn("dextrelaunch", &gIODextRelaunchMax, sizeof(gIODextRelaunchMax)); + gIOUserClientClassKey = OSSymbol::withCStringNoCopy( kIOUserClientClassKey ); + gIOUserClassKey = OSSymbol::withCStringNoCopy(kIOUserClassKey); + + gIOUserServerClassKey = OSSymbol::withCStringNoCopy(kIOUserServerClassKey); + gIOUserServerNameKey = OSSymbol::withCStringNoCopy(kIOUserServerNameKey); + gIOUserServerTagKey = OSSymbol::withCStringNoCopy(kIOUserServerTagKey); + gIOUserServerCDHashKey = OSSymbol::withCStringNoCopy(kIOUserServerCDHashKey); + gIOUserUserClientKey = OSSymbol::withCStringNoCopy(kIOUserUserClientKey); + gIOResourcesKey = OSSymbol::withCStringNoCopy( kIOResourcesClass ); gIOResourceMatchKey = OSSymbol::withCStringNoCopy( kIOResourceMatchKey ); gIOResourceMatchedKey = OSSymbol::withCStringNoCopy( kIOResourceMatchedKey ); + gIOResourceIOKitKey = OSSymbol::withCStringNoCopy("IOKit"); gIODeviceMemoryKey = OSSymbol::withCStringNoCopy( "IODeviceMemory" ); gIOInterruptControllersKey @@ -377,6 +462,7 @@ IOService::initialize( void ) kIOWillTerminateNotification ); gIOServiceKey = OSSymbol::withCStringNoCopy( kIOServiceClass); + gIOConsoleLockedKey = OSSymbol::withCStringNoCopy( kIOConsoleLockedKey); gIOConsoleUsersKey = OSSymbol::withCStringNoCopy( kIOConsoleUsersKey); gIOConsoleSessionUIDKey = OSSymbol::withCStringNoCopy( kIOConsoleSessionUIDKey); @@ -391,6 +477,11 @@ IOService::initialize( void ) gIOConsoleUsersSeedValue = OSData::withBytesNoCopy(&gIOConsoleUsersSeed, sizeof(gIOConsoleUsersSeed)); + gIOServiceDEXTEntitlementsKey = OSSymbol::withCStringNoCopy( kIOServiceDEXTEntitlementsKey ); + gIODriverKitEntitlementKey = OSSymbol::withCStringNoCopy( kIODriverKitEntitlementKey ); + gIODriverKitUserClientEntitlementsKey = OSSymbol::withCStringNoCopy( kIODriverKitUserClientEntitlementsKey ); + gIOMatchDeferKey = OSSymbol::withCStringNoCopy( kIOMatchDeferKey ); + gIOPlatformFunctionHandlerSet = OSSymbol::withCStringNoCopy(kIOPlatformFunctionHandlerSet); #if defined(__i386__) || defined(__x86_64__) sCPULatencyFunctionName[kCpuDelayBusStall] = OSSymbol::withCStringNoCopy(kIOPlatformFunctionHandlerMaxBusDelay); @@ -435,7 +526,8 @@ IOService::initialize( void ) && gIOConsoleLockCallout && (err == KERN_SUCCESS)); gIOResources = IOResources::resources(); - assert( gIOResources ); + gIOUserResources = IOUserResources::resources(); + assert( gIOResources && gIOUserResources ); gIOServiceNullNotifier = OSTypeAlloc(_IOServiceNullNotifier); assert(gIOServiceNullNotifier); @@ -451,6 +543,9 @@ IOService::initialize( void ) gIOStopList = OSArray::withCapacity( 16 ); gIOStopProviderList = OSArray::withCapacity( 16 ); gIOFinalizeList = OSArray::withCapacity( 16 ); +#if !NO_KEXTD + gIOMatchDeferList = OSArray::withCapacity( 16 ); +#endif assert( gIOTerminatePhase2List && gIOStopList && gIOStopProviderList && gIOFinalizeList ); // worker thread that is responsible for terminating / cleaning up threads @@ -540,6 +635,9 @@ IOService::start( IOService * provider ) void IOService::stop( IOService * provider ) { + if (reserved->uvars && reserved->uvars->started && reserved->uvars->userServer) { + reserved->uvars->userServer->serviceStop(this, provider); + } } bool @@ -642,6 +740,9 @@ IOService::free( void ) if (reserved->interruptStatisticsLock) { IOLockFree(reserved->interruptStatisticsLock); } + if (reserved->uvars && reserved->uvars->userServer) { + reserved->uvars->userServer->serviceFree(this); + } IODelete(reserved, ExpansionData, 1); } @@ -654,7 +755,7 @@ IOService::free( void ) } IOFree(_interruptSources, _numInterruptSources * sizeofAllIOInterruptSource); - _interruptSources = 0; + _interruptSources = NULL; } super::free(); @@ -728,7 +829,7 @@ IOService::getServiceRoot( void ) void IOService::detach( IOService * provider ) { - IOService * newProvider = 0; + IOService * newProvider = NULL; SInt32 busy; bool adjParent; @@ -736,6 +837,23 @@ IOService::detach( IOService * provider ) LOG("%s::detach(%s)\n", getName(), provider->getName()); } +#if !NO_KEXTD + IOLockLock(gJobsLock); + if (gIOMatchDeferList) { + auto idx = gIOMatchDeferList->getNextIndexOfObject(this, 0); + if (-1U != idx) { + gIOMatchDeferList->removeObject(idx); + } + } + if (IOServicePH::fMatchingDelayed) { + auto idx = IOServicePH::fMatchingDelayed->getNextIndexOfObject(this, 0); + if (-1U != idx) { + IOServicePH::fMatchingDelayed->removeObject(idx); + } + } + IOLockUnlock(gJobsLock); +#endif /* NO_KEXTD */ + lockForArbitration(); uint64_t regID1 = provider->getRegistryEntryID(); @@ -754,7 +872,7 @@ IOService::detach( IOService * provider ) if (busy) { newProvider = getProvider(); - if (busy && (__state[1] & kIOServiceTermPhase3State) && (0 == newProvider)) { + if (busy && (__state[1] & kIOServiceTermPhase3State) && (NULL == newProvider)) { _adjustBusy( -busy ); } } @@ -781,7 +899,7 @@ IOService::detach( IOService * provider ) provider->_adjustBusy( -1 ); } if ((provider->__state[1] & kIOServiceTermPhase3State) - && (0 == provider->getClient())) { + && (NULL == provider->getClient())) { provider->scheduleFinalize(false); } @@ -912,7 +1030,7 @@ IOService::startMatching( IOOptionBits options ) thread_wakeup((event_t) this /*&__state[1]*/ ); IOLockUnlock( gIOServiceBusyLock ); } else if (!sync || (kIOServiceAsynchronous & options)) { - ok = (0 != _IOServiceJob::startJob( this, kMatchNubJob, options )); + ok = (NULL != _IOServiceJob::startJob( this, kMatchNubJob, options )); } else { do { if ((__state[1] & kIOServiceNeedConfigState)) { @@ -946,12 +1064,47 @@ IOService::startMatching( IOOptionBits options ) } } + +void +IOService::startDeferredMatches(void) +{ +#if !NO_KEXTD + OSArray * array; + + IOLockLock(gJobsLock); + array = gIOMatchDeferList; + gIOMatchDeferList = NULL; + IOLockUnlock(gJobsLock); + + if (array) { + IOLog("deferred rematching count %d\n", array->getCount()); + array->iterateObjects(^bool (OSObject * obj) + { + ((IOService *)obj)->startMatching(kIOServiceAsynchronous); + return false; + }); + array->release(); + } +#endif /* !NO_KEXTD */ +} + +void +IOService::kextdLaunched(void) +{ +#if !NO_KEXTD + IOServiceTrace(IOSERVICE_KEXTD_READY, 0, 0, 0, 0); + startDeferredMatches(); + getServiceRoot()->adjustBusy(-1); + IOService::publishUserResource(gIOResourceIOKitKey); +#endif /* !NO_KEXTD */ +} + IOReturn IOService::catalogNewDrivers( OSOrderedSet * newTables ) { OSDictionary * table; OSSet * set; - OSSet * allSet = 0; + OSSet * allSet = NULL; IOService * service; #if IOMATCHDEBUG SInt32 count = 0; @@ -1007,7 +1160,7 @@ _IOServiceJob::startJob( IOService * nub, int type, job = new _IOServiceJob; if (job && !job->init()) { job->release(); - job = 0; + job = NULL; } if (job) { @@ -1068,7 +1221,7 @@ IOService::getProvider( void ) const parent = (IOService *) getParentEntry( gIOServicePlane); if (parent == IORegistryEntry::getRegistryRoot()) { /* root is not an IOService */ - parent = 0; + parent = NULL; } self->__provider = parent; @@ -1087,7 +1240,7 @@ IOService::getWorkLoop() const if (provider) { return provider->getWorkLoop(); } else { - return 0; + return NULL; } } @@ -1117,14 +1270,14 @@ _IOOpenServiceIterator::iterator( OSIterator * _iter, _IOOpenServiceIterator * inst; if (!_iter) { - return 0; + return NULL; } inst = new _IOOpenServiceIterator; if (inst && !inst->init()) { inst->release(); - inst = 0; + inst = NULL; } if (inst) { inst->iter = _iter; @@ -1179,7 +1332,7 @@ _IOOpenServiceIterator::reset() { if (last) { last->unlockForArbitration(); - last = 0; + last = NULL; } iter->reset(); } @@ -1187,13 +1340,13 @@ _IOOpenServiceIterator::reset() OSIterator * IOService::getOpenProviderIterator( void ) const { - return _IOOpenServiceIterator::iterator( getProviderIterator(), this, 0 ); + return _IOOpenServiceIterator::iterator( getProviderIterator(), this, NULL ); } OSIterator * IOService::getOpenClientIterator( void ) const { - return _IOOpenServiceIterator::iterator( getClientIterator(), 0, this ); + return _IOOpenServiceIterator::iterator( getClientIterator(), NULL, this ); } @@ -1206,11 +1359,23 @@ IOService::callPlatformFunction( const OSSymbol * functionName, IOReturn result = kIOReturnUnsupported; IOService *provider; + if (functionName == gIOPlatformQuiesceActionKey || + functionName == gIOPlatformActiveActionKey) { + /* + * Services which register for IOPlatformQuiesceAction / IOPlatformActiveAction + * must consume that event themselves, without passing it up to super/IOService. + */ + if (gEnforceQuiesceSafety) { + panic("Class %s passed the quiesce/active action to IOService", + getMetaClass()->getClassName()); + } + } + if (gIOPlatformFunctionHandlerSet == functionName) { #if defined(__i386__) || defined(__x86_64__) const OSSymbol * functionHandlerName = (const OSSymbol *) param1; IOService * target = (IOService *) param2; - bool enable = (param3 != 0); + bool enable = (param3 != NULL); if (sCPULatencyFunctionName[kCpuDelayBusStall] == functionHandlerName) { result = setLatencyHandler(kCpuDelayBusStall, target, enable); @@ -1237,7 +1402,7 @@ IOService::callPlatformFunction( const char * functionName, IOReturn result = kIOReturnNoMemory; const OSSymbol *functionSymbol = OSSymbol::withCString(functionName); - if (functionSymbol != 0) { + if (functionSymbol != NULL) { result = callPlatformFunction(functionSymbol, waitForFunction, param1, param2, param3, param4); functionSymbol->release(); @@ -1274,6 +1439,7 @@ IOService::setPlatform( IOPlatformExpert * platform) { gIOPlatform = platform; gIOResources->attachToParent( gIOServiceRoot, gIOServicePlane ); + gIOUserResources->attachToParent( gIOServiceRoot, gIOServicePlane ); #if defined(__i386__) || defined(__x86_64__) @@ -1300,7 +1466,8 @@ void IOService::setPMRootDomain( class IOPMrootDomain * rootDomain) { gIOPMRootDomain = rootDomain; - publishResource("IOKit"); + publishResource(gIOResourceIOKitKey); + IOServicePH::init(rootDomain); } /* @@ -1355,7 +1522,7 @@ IOService::lockForArbitration( bool isSuccessRequired ) if (found) { // this object is already locked // determine whether it is the same or a different thread trying to lock if (active->thread != element->thread) { // it is a different thread - ArbitrationLockQueueElement * victim = 0; + ArbitrationLockQueueElement * victim = NULL; // before placing this new thread on the waiting queue, we look for // a deadlock cycle... @@ -1766,7 +1933,7 @@ applyToInterestNotifiers(const IORegistryEntry *target, OSObjectApplierFunction applier, void * context ) { - OSArray * copyArray = 0; + OSArray * copyArray = NULL; OSObject * prop; LOCKREADNOTIFY(); @@ -1862,7 +2029,7 @@ IONotifier * IOService::registerInterest( const OSSymbol * typeOfInterest, IOServiceInterestHandler handler, void * target, void * ref ) { - _IOServiceInterestNotifier * notify = 0; + _IOServiceInterestNotifier * notify = NULL; IOReturn rc = kIOReturnError; notify = new _IOServiceInterestNotifier; @@ -1877,7 +2044,7 @@ IOService::registerInterest( const OSSymbol * typeOfInterest, if (rc != kIOReturnSuccess) { notify->release(); - notify = 0; + notify = NULL; } return notify; @@ -1919,7 +2086,7 @@ IOService::registerInterestForNotifier( IONotifier *svcNotify, const OSSymbol * IOServiceInterestHandler handler, void * target, void * ref ) { IOReturn rc = kIOReturnSuccess; - _IOServiceInterestNotifier *notify = 0; + _IOServiceInterestNotifier *notify = NULL; if (!svcNotify || !(notify = OSDynamicCast(_IOServiceInterestNotifier, svcNotify))) { return kIOReturnBadArgument; @@ -1955,7 +2122,7 @@ IOService::registerInterestForNotifier( IONotifier *svcNotify, const OSSymbol * bool ok = setProperty( typeOfInterest, notifyList); notifyList->release(); if (!ok) { - notifyList = 0; + notifyList = NULL; } } } @@ -1987,7 +2154,7 @@ cleanInterestList( OSObject * head ) LOCKWRITENOTIFY(); while (queue_entry_t entry = dequeue(¬ifyHead->fCommandChain)) { - queue_next(entry) = queue_prev(entry) = 0; + queue_next(entry) = queue_prev(entry) = NULL; _IOServiceInterestNotifier * notify; @@ -2071,7 +2238,7 @@ _IOServiceInterestNotifier::remove() if (queue_next( &chain )) { remqueue(&chain); - queue_next( &chain) = queue_prev( &chain) = 0; + queue_next( &chain) = queue_prev( &chain) = NULL; release(); } @@ -2133,8 +2300,8 @@ _IOServiceInterestNotifier::init() static void _workLoopAction( IOWorkLoop::Action action, IOService * service, - void * p0 = 0, void * p1 = 0, - void * p2 = 0, void * p3 = 0 ) + void * p0 = NULL, void * p1 = NULL, + void * p2 = NULL, void * p3 = NULL ) { IOWorkLoop * wl; @@ -2170,9 +2337,11 @@ IOService::terminatePhase1( IOOptionBits options ) { IOService * victim; IOService * client; + IOService * rematchProvider; OSIterator * iter; OSArray * makeInactive; OSArray * waitingInactive; + IOOptionBits callerOptions; int waitResult = THREAD_AWAKENED; bool wait; bool ok; @@ -2181,6 +2350,8 @@ IOService::terminatePhase1( IOOptionBits options ) TLOG("%s[0x%qx]::terminatePhase1(%08llx)\n", getName(), getRegistryEntryID(), (long long)options); + callerOptions = options; + rematchProvider = NULL; uint64_t regID = getRegistryEntryID(); IOServiceTrace( IOSERVICE_TERMINATE_PHASE1, @@ -2260,10 +2431,36 @@ IOService::terminatePhase1( IOOptionBits options ) } } victim->_adjustBusy( 1 ); + + if ((options & kIOServiceTerminateWithRematch) && (victim == this)) { + OSObject * obj; + OSObject * rematchProps; + OSNumber * num; + uint32_t count; + + rematchProvider = getProvider(); + if (rematchProvider) { + obj = rematchProvider->copyProperty(gIORematchCountKey); + num = OSDynamicCast(OSNumber, obj); + count = 0; + if (num) { + count = num->unsigned32BitValue(); + count++; + } + num = OSNumber::withNumber(count, 32); + rematchProvider->setProperty(gIORematchCountKey, num); + rematchProps = copyProperty(gIOMatchedPersonalityKey); + rematchProvider->setProperty(gIORematchPersonalityKey, rematchProps); + OSSafeReleaseNULL(num); + OSSafeReleaseNULL(rematchProps); + OSSafeReleaseNULL(obj); + } + } } victim->unlockForArbitration(); } if (victim == this) { + options &= ~kIOServiceTerminateWithRematch; startPhase2 = didInactive; } if (didInactive) { @@ -2336,6 +2533,11 @@ IOService::terminatePhase1( IOOptionBits options ) release(); } + if (rematchProvider) { + DKLOG(DKS " rematching after dext crash\n", DKN(rematchProvider)); + rematchProvider->registerService(); + } + return true; } @@ -2420,18 +2622,19 @@ IOService::scheduleTerminatePhase2( IOOptionBits options ) haveDeadline = true; } /* let others do work while we wait */ - gIOTerminateThread = 0; + gIOTerminateThread = NULL; IOLockWakeup( gJobsLock, (event_t) &gIOTerminateThread, /* one-thread */ false); waitResult = IOLockSleepDeadline( gJobsLock, &gIOTerminateWork, deadline, THREAD_UNINT ); if (__improbable(waitResult == THREAD_TIMED_OUT)) { - panic("%s[0x%qx]::terminate(kIOServiceSynchronous) timeout\n", getName(), getRegistryEntryID()); + IOLog("%s[0x%qx]::terminate(kIOServiceSynchronous): THREAD_TIMED_OUT. " + "Attempting to auto-resolve your deadlock. PLEASE FIX!\n", getName(), getRegistryEntryID()); } waitToBecomeTerminateThread(); } } while (gIOTerminateWork || (wait && (waitResult != THREAD_TIMED_OUT))); - gIOTerminateThread = 0; + gIOTerminateThread = NULL; IOLockWakeup( gJobsLock, (event_t) &gIOTerminateThread, /* one-thread */ false); } else { // ! kIOServiceSynchronous @@ -2463,7 +2666,7 @@ IOService::terminateThread( void * arg, wait_result_t waitResult ) terminateWorker((uintptr_t)arg ); } - gIOTerminateThread = 0; + gIOTerminateThread = NULL; IOLockWakeup( gJobsLock, (event_t) &gIOTerminateThread, /* one-thread */ false); IOLockSleep(gJobsLock, &gIOTerminateWork, THREAD_UNINT); } @@ -2521,12 +2724,19 @@ IOService::scheduleFinalize(bool now) bool IOService::willTerminate( IOService * provider, IOOptionBits options ) { + if (reserved->uvars) { + IOUserServer::serviceWillTerminate(this, provider, options); + } return true; } bool IOService::didTerminate( IOService * provider, IOOptionBits options, bool * defer ) { + if (reserved->uvars) { + IOUserServer::serviceDidTerminate(this, provider, options, defer); + } + if (false == *defer) { if (lockForArbitration( true )) { if (false == provider->handleIsOpen( this )) { @@ -2550,8 +2760,8 @@ IOService::didTerminate( IOService * provider, IOOptionBits options, bool * defe void IOService::actionWillTerminate( IOService * victim, IOOptionBits options, OSArray * doPhase2List, - void *unused2 __unused, - void *unused3 __unused ) + bool user, + void *unused3 __unused) { OSIterator * iter; IOService * client; @@ -2561,6 +2771,9 @@ IOService::actionWillTerminate( IOService * victim, IOOptionBits options, iter = victim->getClientIterator(); if (iter) { while ((client = (IOService *) iter->getNextObject())) { + if (user != (NULL != client->reserved->uvars)) { + continue; + } regID1 = client->getRegistryEntryID(); TLOG("%s[0x%qx]::willTerminate(%s[0x%qx], %08llx)\n", client->getName(), regID1, @@ -2746,7 +2959,7 @@ IOService::terminateWorker( IOOptionBits options ) doPhase2List = OSArray::withCapacity( 16 ); didPhase2List = OSArray::withCapacity( 16 ); freeList = OSSet::withCapacity( 16 ); - if ((0 == doPhase2List) || (0 == didPhase2List) || (0 == freeList)) { + if ((NULL == doPhase2List) || (NULL == didPhase2List) || (NULL == freeList)) { return; } @@ -2819,7 +3032,13 @@ IOService::terminateWorker( IOOptionBits options ) victim->invokeNotifiers(¬ifiers); _workLoopAction((IOWorkLoop::Action) &actionWillTerminate, - victim, (void *)(uintptr_t) options, (void *)(uintptr_t) doPhase2List ); + victim, + (void *)(uintptr_t) options, + (void *)(uintptr_t) doPhase2List, + (void *)(uintptr_t) false); + + actionWillTerminate( + victim, options, doPhase2List, true, NULL); didPhase2List->headQ( victim ); } @@ -2835,7 +3054,7 @@ IOService::terminateWorker( IOOptionBits options ) bool scheduleFinalize = false; if (victim->lockForArbitration( true )) { victim->__state[1] |= kIOServiceTermPhase3State; - scheduleFinalize = (0 == victim->getClient()); + scheduleFinalize = (NULL == victim->getClient()); victim->unlockForArbitration(); } _workLoopAction((IOWorkLoop::Action) &actionDidTerminate, @@ -3084,6 +3303,10 @@ IOService::open( IOService * forClient, ok = handleOpen( forClient, options, arg ); } + if (ok && forClient && forClient->reserved->uvars && forClient->reserved->uvars->userServer) { + forClient->reserved->uvars->userServer->serviceOpen(this, forClient); + } + unlockForArbitration(); return ok; @@ -3102,6 +3325,10 @@ IOService::close( IOService * forClient, if (wasClosed) { handleClose( forClient, options ); last = (__state[1] & kIOServiceTermPhase3State); + + if (forClient && forClient->reserved->uvars && forClient->reserved->uvars->userServer) { + forClient->reserved->uvars->userServer->serviceClose(this, forClient); + } } unlockForArbitration(); @@ -3143,13 +3370,13 @@ IOService::handleOpen( IOService * forClient, { bool ok; - ok = (0 == __owner); + ok = (NULL == __owner); if (ok) { __owner = forClient; } else if (options & kIOServiceSeize) { ok = (kIOReturnSuccess == messageClient( kIOMessageServiceIsRequestingClose, __owner, (void *)(uintptr_t) options )); - if (ok && (0 == __owner)) { + if (ok && (NULL == __owner)) { __owner = forClient; } else { ok = false; @@ -3163,7 +3390,7 @@ IOService::handleClose( IOService * forClient, IOOptionBits options ) { if (__owner == forClient) { - __owner = 0; + __owner = NULL; } } @@ -3213,7 +3440,7 @@ IOServiceObjectOrder( const OSObject * entry, void * ref) OSObject * prop; SInt32 result; - prop = 0; + prop = NULL; result = kIODefaultProbeScore; if ((dict = OSDynamicCast( OSDictionary, entry))) { offset = OSDynamicCast(OSNumber, dict->getObject( key )); @@ -3224,7 +3451,7 @@ IOServiceObjectOrder( const OSObject * entry, void * ref) offset = OSDynamicCast(OSNumber, prop); } else { assert( false ); - offset = 0; + offset = NULL; } if (offset) { @@ -3261,7 +3488,7 @@ IOServiceOrdering( const OSMetaClassBase * inObj1, const OSMetaClassBase * inObj IOService * IOService::copyClientWithCategory( const OSSymbol * category ) { - IOService * service = 0; + IOService * service = NULL; OSIterator * iter; const OSSymbol * nextCat; @@ -3336,7 +3563,7 @@ IOService::invokeNotifier( _IOServiceNotifier * notify ) } bool -IOService::invokeNotifiers(OSArray ** willSend) +IOService::invokeNotifiers(OSArray * willSend[]) { OSArray * array; _IOServiceNotifier * notify; @@ -3346,7 +3573,7 @@ IOService::invokeNotifiers(OSArray ** willSend) if (!array) { return true; } - *willSend = 0; + *willSend = NULL; for (unsigned int idx = 0; (notify = (_IOServiceNotifier *) array->getObject(idx)); @@ -3367,30 +3594,56 @@ IOService::invokeNotifiers(OSArray ** willSend) void IOService::probeCandidates( OSOrderedSet * matches ) { - OSDictionary * match = 0; + OSDictionary * match = NULL; OSSymbol * symbol; IOService * inst; IOService * newInst; OSDictionary * props; SInt32 score; OSNumber * newPri; - OSOrderedSet * familyMatches = 0; + OSOrderedSet * familyMatches = NULL; OSOrderedSet * startList; - OSDictionary * startDict = 0; + OSSet * kexts = NULL; + OSObject * kextRef; + + OSDictionary * startDict = NULL; const OSSymbol * category; OSIterator * iter; - _IOServiceNotifier * notify; - OSObject * nextMatch = 0; + _IOServiceNotifier * notify; + OSObject * nextMatch = NULL; bool started; bool needReloc = false; + bool matchDeferred = false; #if IOMATCHDEBUG SInt64 debugFlags; #endif - IOService * client = NULL; - + IOService * client = NULL; + OSObject * prop1; + OSObject * prop2; + OSDictionary * rematchPersonality; + OSNumber * num; + uint32_t count; + uint32_t dextCount; + bool isDext; + bool categoryConsumed; + + prop2 = NULL; + count = 0; + prop1 = copyProperty(gIORematchPersonalityKey); + rematchPersonality = OSDynamicCast(OSDictionary, prop1); + if (rematchPersonality) { + prop2 = copyProperty(gIORematchCountKey); + num = OSDynamicCast(OSNumber, prop2); + if (num) { + count = num->unsigned32BitValue(); + } + OSSafeReleaseNULL(prop2); + } + dextCount = 0; assert( matches ); - while (!needReloc && (nextMatch = matches->getFirstObject())) { + while (!needReloc + && (nextMatch = matches->getFirstObject())) { nextMatch->retain(); matches->removeObject(nextMatch); @@ -3399,56 +3652,76 @@ IOService::probeCandidates( OSOrderedSet * matches ) invokeNotifier( notify ); } nextMatch->release(); - nextMatch = 0; + nextMatch = NULL; continue; } else if (!(match = OSDynamicCast( OSDictionary, nextMatch ))) { nextMatch->release(); - nextMatch = 0; + nextMatch = NULL; continue; } - props = 0; + props = NULL; #if IOMATCHDEBUG debugFlags = getDebugFlags( match ); #endif do { + isDext = (NULL != match->getObject(gIOUserServerNameKey)); + if (isDext && !(kIODKEnable & gIODKDebug)) { + continue; + } + category = OSDynamicCast( OSSymbol, match->getObject( gIOMatchCategoryKey )); - if (0 == category) { + if (NULL == category) { category = gIODefaultMatchCategoryKey; } + client = copyClientWithCategory(category); - if ((client = copyClientWithCategory(category))) { + categoryConsumed = (client != NULL); + if (categoryConsumed) { #if IOMATCHDEBUG if ((debugFlags & kIOLogMatch) && (this != gIOResources)) { LOG("%s: match category %s exists\n", getName(), category->getCStringNoCopy()); } #endif - nextMatch->release(); - nextMatch = 0; - - client->release(); - client = NULL; - - continue; + OSSafeReleaseNULL(client); + if (!isDext) { + break; + } } // create a copy now in case its modified during matching - props = OSDictionary::withDictionary( match, match->getCount()); - if (0 == props) { - continue; + props = OSDictionary::withDictionary(match, match->getCount()); + if (NULL == props) { + break; } props->setCapacityIncrement(1); // check the nub matches if (false == matchPassive(props, kIOServiceChangesOK | kIOServiceClassDone)) { - continue; + break; + } + if (isDext) { + dextCount++; + if (categoryConsumed) { + break; + } + } + + if (rematchPersonality) { + bool personalityMatch = match->isEqualTo(rematchPersonality); + if (count > gIODextRelaunchMax) { + personalityMatch = !personalityMatch; + } + if (!personalityMatch) { + break; + } } // Check to see if driver reloc has been loaded. - needReloc = (false == gIOCatalogue->isModuleLoaded( match )); + needReloc = (false == gIOCatalogue->isModuleLoaded( match, &kextRef )); if (needReloc) { #if IOMATCHDEBUG if (debugFlags & kIOLogCatalogue) { @@ -3457,11 +3730,23 @@ IOService::probeCandidates( OSOrderedSet * matches ) #endif // If reloc hasn't been loaded, exit; // reprobing will occur after reloc has been loaded. - continue; + break; + } + if (kextRef) { + if (NULL == kexts) { + kexts = OSSet::withCapacity(1); + } + if (kexts) { + kexts->setObject(kextRef); + kextRef->release(); + } + } + if (isDext) { + // copy saved for rematchng + props->setObject(gIOMatchedPersonalityKey, match); } - // reorder on family matchPropertyTable score. - if (0 == familyMatches) { + if (NULL == familyMatches) { familyMatches = OSOrderedSet::withCapacity( 1, IOServiceOrdering, (void *) gIOProbeScoreKey ); } @@ -3470,16 +3755,11 @@ IOService::probeCandidates( OSOrderedSet * matches ) } } while (false); - if (nextMatch) { - nextMatch->release(); - nextMatch = 0; - } - if (props) { - props->release(); - } + OSSafeReleaseNULL(nextMatch); + OSSafeReleaseNULL(props); } matches->release(); - matches = 0; + matches = NULL; if (familyMatches) { while (!needReloc @@ -3487,8 +3767,8 @@ IOService::probeCandidates( OSOrderedSet * matches ) props->retain(); familyMatches->removeObject( props ); - inst = 0; - newInst = 0; + inst = NULL; + newInst = NULL; #if IOMATCHDEBUG debugFlags = getDebugFlags( props ); #endif @@ -3526,7 +3806,7 @@ IOService::probeCandidates( OSOrderedSet * matches ) // give the driver the default match category if not specified category = OSDynamicCast( OSSymbol, props->getObject( gIOMatchCategoryKey )); - if (0 == category) { + if (NULL == category) { category = gIODefaultMatchCategoryKey; } inst->setProperty( gIOMatchCategoryKey, (OSObject *) category ); @@ -3548,7 +3828,7 @@ IOService::probeCandidates( OSOrderedSet * matches ) newInst = inst->probe( this, &score ); inst->detach( this ); - if (0 == newInst) { + if (NULL == newInst) { #if IOMATCHDEBUG if (debugFlags & kIOLogProbe) { IOLog("%s::probe fails\n", symbol->getCStringNoCopy()); @@ -3565,13 +3845,13 @@ IOService::probeCandidates( OSOrderedSet * matches ) } // add to start list for the match category - if (0 == startDict) { + if (NULL == startDict) { startDict = OSDictionary::withCapacity( 1 ); } assert( startDict ); startList = (OSOrderedSet *) startDict->getObject( category ); - if (0 == startList) { + if (NULL == startList) { startList = OSOrderedSet::withCapacity( 1, IOServiceOrdering, (void *) gIOProbeScoreKey ); if (startDict && startList) { @@ -3591,7 +3871,7 @@ IOService::probeCandidates( OSOrderedSet * matches ) } } familyMatches->release(); - familyMatches = 0; + familyMatches = NULL; } // start the best (until success) of each category @@ -3607,6 +3887,7 @@ IOService::probeCandidates( OSOrderedSet * matches ) started = false; while (true // (!started) + && !matchDeferred && (inst = (IOService *)startList->getFirstObject())) { inst->retain(); startList->removeObject(inst); @@ -3623,20 +3904,53 @@ IOService::probeCandidates( OSOrderedSet * matches ) } #endif if (false == started) { - started = startCandidate( inst ); - } +#if !NO_KEXTD + IOLockLock(gJobsLock); + matchDeferred = (gIOMatchDeferList + && (kOSBooleanTrue == inst->getProperty(gIOMatchDeferKey))); + if (matchDeferred && (-1U == gIOMatchDeferList->getNextIndexOfObject(this, 0))) { + gIOMatchDeferList->setObject(this); + } + IOLockUnlock(gJobsLock); + if (matchDeferred) { + symbol = OSDynamicCast(OSSymbol, inst->getProperty(gIOClassKey)); + IOLog("%s(0x%qx): matching deferred by %s\n", + getName(), getRegistryEntryID(), + symbol ? symbol->getCStringNoCopy() : ""); + // rematching will occur after kextd loads all plists + } +#endif + if (!matchDeferred) { + started = startCandidate( inst ); #if IOMATCHDEBUG - if ((debugFlags & kIOLogStart) && (false == started)) { - LOG( "%s::start(%s) <%d> failed\n", inst->getName(), getName(), - inst->getRetainCount()); - } + if ((debugFlags & kIOLogStart) && (false == started)) { + LOG( "%s::start(%s) <%d> failed\n", inst->getName(), getName(), + inst->getRetainCount()); + } #endif + } + } inst->release(); } } iter->release(); } + OSSafeReleaseNULL(prop1); + + if (dextCount) { + num = OSNumber::withNumber(dextCount, 32); + setProperty(gIODEXTMatchCountKey, num); + OSSafeReleaseNULL(num); + } else if (rematchPersonality) { + removeProperty(gIODEXTMatchCountKey); + } + + // now that instances are created, drop the refs on any kexts allowing unload + if (kexts) { + OSKext::dropMatchingReferences(kexts); + OSSafeReleaseNULL(kexts); + } // adjust the busy count by +1 if matching is stalled for a module, // or -1 if a previously stalled matching is complete. @@ -3677,6 +3991,272 @@ IOService::probeCandidates( OSOrderedSet * matches ) } } +/* + * Wait for a IOUserServer to check in + */ + +static +__attribute__((noinline, not_tail_called)) +IOService * +__WAITING_FOR_USER_SERVER__(OSDictionary * matching) +{ + IOService * server; + server = IOService::waitForMatchingService(matching, kIOUserServerCheckInTimeoutSecs * NSEC_PER_SEC); + return server; +} + +void +IOService::willShutdown() +{ + gKextdWillTerminate = true; +#if !NO_KEXTD + getPlatform()->waitQuiet(30 * NSEC_PER_SEC); +#endif + OSKext::willShutdown(); +} + +/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ + +void +IOServicePH::init(IOPMrootDomain * root) +{ + fUserServers = OSArray::withCapacity(4); + fMatchingWork = OSArray::withCapacity(4); + + assert(fUserServers && fMatchingWork); + + fRootNotifier = root->registerInterest( + gIOPriorityPowerStateInterest, &IOServicePH::systemPowerChange, NULL, NULL); + + assert(fRootNotifier); +} + +void +IOServicePH::lock() +{ + IOLockLock(gJobsLock); +} + +void +IOServicePH::unlock() +{ + IOLockUnlock(gJobsLock); +} + +void +IOServicePH::serverAdd(IOUserServer * server) +{ + uint32_t idx; + + lock(); + idx = fUserServers->getNextIndexOfObject(server, 0); + if (idx == -1U) { + fUserServers->setObject(server); + } + unlock(); +} + +void +IOServicePH::serverRemove(IOUserServer * server) +{ + uint32_t idx; + + lock(); + idx = fUserServers->getNextIndexOfObject(server, 0); + if (idx != -1U) { + fUserServers->removeObject(idx); + } + unlock(); +} + +void +IOServicePH::serverAck(IOUserServer * server) +{ + uint32_t idx; + IOService * ackTo; + uint32_t ackToRef; + + ackTo = NULL; + lock(); + if (server && fUserServersWait) { + idx = fUserServersWait->getNextIndexOfObject(server, 0); + if (idx != -1U) { + fUserServersWait->removeObject(idx); + if (0 == fUserServersWait->getCount()) { + OSSafeReleaseNULL(fUserServersWait); + } + } + } + if (!fUserServersWait && !fMatchingWork->getCount()) { + ackTo = fSystemPowerAckTo; + ackToRef = fSystemPowerAckRef; + fSystemPowerAckTo = NULL; + } + unlock(); + + if (ackTo) { + DKLOG("allowPowerChange\n"); + ackTo->allowPowerChange((uintptr_t) ackToRef); + } +} + +bool +IOServicePH::matchingStart(IOService * service) +{ + uint32_t idx; + bool ok; + + lock(); + ok = !fSystemOff; + if (ok) { + idx = fMatchingWork->getNextIndexOfObject(service, 0); + if (idx == -1U) { + fMatchingWork->setObject(service); + } + } else { + if (!fMatchingDelayed) { + fMatchingDelayed = OSArray::withObjects((const OSObject **) &service, 1, 1); + } else { + idx = fMatchingDelayed->getNextIndexOfObject(service, 0); + if (idx == -1U) { + fMatchingDelayed->setObject(service); + } + } + } + unlock(); + + return ok; +} + +void +IOServicePH::matchingEnd(IOService * service) +{ + uint32_t idx; + OSArray * notifyServers; + OSArray * deferredMatches; + + notifyServers = NULL; + deferredMatches = NULL; + + lock(); + + if (service) { + idx = fMatchingWork->getNextIndexOfObject(service, 0); + if (idx != -1U) { + fMatchingWork->removeObject(idx); + } + } + + + if ((fUserServerOff != fSystemOff) && fUserServers->getCount()) { + if (fSystemOff) { + if (0 == fMatchingWork->getCount()) { + fUserServersWait = OSArray::withArray(fUserServers); + notifyServers = OSArray::withArray(fUserServers); + fUserServerOff = fSystemOff; + } + } else { + notifyServers = OSArray::withArray(fUserServers); + fUserServerOff = fSystemOff; + } + } + + if (!fSystemOff && fMatchingDelayed) { + deferredMatches = fMatchingDelayed; + fMatchingDelayed = NULL; + } + + unlock(); + + if (notifyServers) { + notifyServers->iterateObjects(^bool (OSObject * obj) { + IOUserServer * us; + us = (typeof(us))obj; + us->systemPower(fSystemOff); + return false; + }); + OSSafeReleaseNULL(notifyServers); + } + + if (deferredMatches) { + DKLOG("sleep deferred rematching count %d\n", deferredMatches->getCount()); + deferredMatches->iterateObjects(^bool (OSObject * obj) + { + ((IOService *)obj)->startMatching(kIOServiceAsynchronous); + return false; + }); + deferredMatches->release(); + } + + serverAck(NULL); +} + +IOReturn +IOServicePH::systemPowerChange( + void * target, + void * refCon, + UInt32 messageType, IOService * service, + void * messageArgument, vm_size_t argSize) +{ + IOReturn ret; + IOUserServer * us; + IOPMSystemCapabilityChangeParameters * params; + + us = NULL; + + switch (messageType) { + case kIOMessageSystemCapabilityChange: + + params = (typeof params)messageArgument; + + if (kIODKLogPM & gIODKDebug) { + IOLog("IOServicePH::kIOMessageSystemCapabilityChange: %s%s 0x%x->0x%x\n", + params->changeFlags & kIOPMSystemCapabilityWillChange ? "will" : "", + params->changeFlags & kIOPMSystemCapabilityDidChange ? "did" : "", + params->fromCapabilities, + params->toCapabilities); + } + + if ((params->changeFlags & kIOPMSystemCapabilityWillChange) && + (params->fromCapabilities & kIOPMSystemCapabilityCPU) && + ((params->toCapabilities & kIOPMSystemCapabilityCPU) == 0)) { + lock(); + fSystemOff = true; + fSystemPowerAckRef = params->notifyRef; + fSystemPowerAckTo = service; + unlock(); + + matchingEnd(NULL); + + params->maxWaitForReply = 60 * 1000 * 1000; + ret = kIOReturnSuccess; + } else if ((params->changeFlags & kIOPMSystemCapabilityWillChange) && + ((params->fromCapabilities & kIOPMSystemCapabilityCPU) == 0) && + (params->toCapabilities & kIOPMSystemCapabilityCPU)) { + lock(); + fSystemOff = false; + unlock(); + + matchingEnd(NULL); + + params->maxWaitForReply = 0; + ret = kIOReturnSuccess; + } else { + params->maxWaitForReply = 0; + ret = kIOReturnSuccess; + } + break; + + default: + ret = kIOReturnUnsupported; + break; + } + + return ret; +} + +/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ + /* * Start a previously attached & probed instance, * called on exporting object instance @@ -3686,42 +4266,164 @@ bool IOService::startCandidate( IOService * service ) { bool ok; + OSObject * obj; + OSObject * prop; + IOUserServer * userServer; + bool ph; - ok = service->attach( this ); + userServer = NULL; + obj = service->copyProperty(gIOUserServerNameKey); - if (ok) { - if (this != gIOResources) { - // stall for any nub resources - checkResources(); - // stall for any driver resources - service->checkResources(); - } + if (obj && (this == gIOResources)) { + ok = false; + } else { + ok = service->attach( this ); + } + if (!ok) { + return false; + } - AbsoluteTime startTime; - AbsoluteTime endTime; - UInt64 nano; + if ((this != gIOResources) && (this != gIOUserResources)) { + // stall for any nub resources + checkResources(); + // stall for any driver resources + service->checkResources(); + } + ph = false; + { + OSString * bundleID; + OSString * serverName; + OSString * str; + const OSSymbol * sym; + OSDictionary * matching; + IOService * server; + OSNumber * serverTag; + uint64_t entryID; + + if ((serverName = OSDynamicCast(OSString, obj))) { + obj = service->copyProperty(gIOModuleIdentifierKey); + bundleID = OSDynamicCast(OSString, obj); + entryID = service->getRegistryEntryID(); + serverTag = OSNumber::withNumber(entryID, 64); + + if (gKextdWillTerminate) { + DKLOG("%s disabled in shutdown\n", serverName->getCStringNoCopy()); + service->detach(this); + OSSafeReleaseNULL(obj); + return false; + } - if (kIOLogStart & gIOKitDebug) { - clock_get_uptime(&startTime); - } + ph = IOServicePH::matchingStart(this); + if (!ph) { + DKLOG("%s deferred in sleep\n", serverName->getCStringNoCopy()); + service->detach(this); + OSSafeReleaseNULL(obj); + return false; + } + + prop = service->copyProperty(gIOUserClassKey); + str = OSDynamicCast(OSString, prop); + if (str) { + service->setName(str); + } + OSSafeReleaseNULL(prop); - ok = service->start(this); + if (!(kIODKDisableDextLaunch & gIODKDebug)) { + OSKext::requestDaemonLaunch(bundleID, serverName, serverTag); + } + sym = OSSymbol::withString(serverName); + matching = serviceMatching(gIOUserServerClassKey); + propertyMatching(gIOUserServerNameKey, sym, matching); + if (!(kIODKDisableDextTag & gIODKDebug)) { + propertyMatching(gIOUserServerTagKey, serverTag, matching); + } - if (kIOLogStart & gIOKitDebug) { - clock_get_uptime(&endTime); + server = __WAITING_FOR_USER_SERVER__(matching); + matching->release(); + OSSafeReleaseNULL(serverTag); + OSSafeReleaseNULL(serverName); + + userServer = OSDynamicCast(IOUserServer, server); + if (!userServer) { + service->detach(this); + IOServicePH::matchingEnd(this); + DKLOG(DKS " user server timeout\n", DKN(service)); + return false; + } - if (CMP_ABSOLUTETIME(&endTime, &startTime) > 0) { - SUB_ABSOLUTETIME(&endTime, &startTime); - absolutetime_to_nanoseconds(endTime, &nano); - if (nano > 500000000ULL) { - IOLog("%s::start took %ld ms\n", service->getName(), (long)(UInt32)(nano / 1000000ULL)); + if (!(kIODKDisableCDHashChecking & gIODKDebug)) { + if (!userServer->serviceMatchesCDHash(service)) { + service->detach(this); + IOServicePH::matchingEnd(this); + userServer->exit("CDHash check failed"); + userServer->release(); + return false; } } + OSKext *kext = OSKext::lookupKextWithIdentifier(bundleID); + if (!kext) { + const char *name = bundleID->getCStringNoCopy(); + IOLog("%s Could not find OSKext for %s\n", __func__, name); + goto skip_log; + } + + /* + * Used for logging + */ + userServer->setTaskLoadTag(kext); + userServer->setDriverKitUUID(kext); + OSKext::OSKextLogDriverKitInfoLoad(kext); +skip_log: + OSSafeReleaseNULL(bundleID); + OSSafeReleaseNULL(kext); + + if (!(kIODKDisableEntitlementChecking & gIODKDebug)) { + if (!userServer->checkEntitlements(this, service)) { + service->detach(this); + IOServicePH::matchingEnd(this); + userServer->exit("Entitlements check failed"); + userServer->release(); + return false; + } + } + + userServer->serviceAttach(service, this); } - if (!ok) { - service->detach( this ); + } + + AbsoluteTime startTime; + AbsoluteTime endTime; + UInt64 nano; + + if (kIOLogStart & gIOKitDebug) { + clock_get_uptime(&startTime); + } + + ok = service->start(this); + + if (kIOLogStart & gIOKitDebug) { + clock_get_uptime(&endTime); + + if (CMP_ABSOLUTETIME(&endTime, &startTime) > 0) { + SUB_ABSOLUTETIME(&endTime, &startTime); + absolutetime_to_nanoseconds(endTime, &nano); + if (nano > 500000000ULL) { + IOLog("%s::start took %ld ms\n", service->getName(), (long)(UInt32)(nano / 1000000ULL)); + } } } + if (userServer) { + userServer->serviceStarted(service, this, ok); + userServer->release(); + } + if (!ok) { + service->detach( this ); + } + + if (ph) { + IOServicePH::matchingEnd(this); + } + return ok; } @@ -3739,7 +4441,7 @@ IOService::publishResource( const char * key, OSObject * value ) void IOService::publishResource( const OSSymbol * key, OSObject * value ) { - if (0 == value) { + if (NULL == value) { value = (OSObject *) gIOServiceKey; } @@ -3753,6 +4455,23 @@ IOService::publishResource( const OSSymbol * key, OSObject * value ) gIOResources->registerService(); } +void +IOService::publishUserResource( const OSSymbol * key, OSObject * value ) +{ + if (NULL == value) { + value = (OSObject *) gIOServiceKey; + } + + gIOUserResources->setProperty( key, value); + + if (IORecursiveLockHaveLock( gNotificationLock)) { + return; + } + + gIOResourceGenerationCount++; + gIOUserResources->registerService(); +} + bool IOService::addNeededResource( const char * key ) { @@ -3836,13 +4555,13 @@ IOService::checkResources( void ) bool ok; resourcesProp = copyProperty( gIOResourceMatchKey ); - if (0 == resourcesProp) { + if (NULL == resourcesProp) { return true; } if ((set = OSDynamicCast( OSSet, resourcesProp ))) { iter = OSCollectionIterator::withCollection( set ); - ok = (0 != iter); + ok = (NULL != iter); while (ok && (resourcesProp = iter->getNextObject())) { ok = checkResource( resourcesProp ); } @@ -3860,7 +4579,7 @@ IOService::checkResources( void ) void -_IOConfigThread::configThread( void ) +_IOConfigThread::configThread( int configThreadId ) { _IOConfigThread * inst; @@ -3871,11 +4590,16 @@ _IOConfigThread::configThread( void ) if (!inst->init()) { continue; } - thread_t unused; - if (KERN_SUCCESS != kernel_thread_start(&_IOConfigThread::main, inst, &unused)) { + thread_t thread; + if (KERN_SUCCESS != kernel_thread_start(&_IOConfigThread::main, inst, &thread)) { continue; } + char threadName[MAXTHREADNAMESIZE]; + snprintf(threadName, sizeof(threadName), "IOConfigThread_%d", configThreadId); + thread_set_thread_name(thread, threadName); + thread_deallocate(thread); + return; } while (false); @@ -3886,25 +4610,18 @@ _IOConfigThread::configThread( void ) return; } -void -_IOConfigThread::free( void ) -{ - thread_deallocate(current_thread()); - OSObject::free(); -} - void IOService::doServiceMatch( IOOptionBits options ) { _IOServiceNotifier * notify; OSIterator * iter; OSOrderedSet * matches; - OSArray * resourceKeys = 0; + OSArray * resourceKeys = NULL; SInt32 catalogGeneration; bool keepGuessing = true; bool reRegistered = true; bool didRegister; - OSArray * notifiers[2] = {0}; + OSArray * notifiers[2] = {NULL}; // job->nub->deliverNotification( gIOPublishNotification, // kIOServiceRegisteredState, 0xffffffff ); @@ -3946,7 +4663,7 @@ IOService::doServiceMatch( IOOptionBits options ) invokeNotifiers(¬ifiers[0]); if (keepGuessing && matches->getCount() && (kIOReturnSuccess == getResources())) { - if (this == gIOResources) { + if ((this == gIOResources) || (this == gIOUserResources)) { if (resourceKeys) { resourceKeys->release(); } @@ -4179,7 +4896,17 @@ IOService::waitQuiet( uint64_t timeout ) bool kextdWait; bool dopanic; +#if KASAN + /* + * On kasan kernels, everything takes longer, so double the number of + * timeout extensions. This should help with issues like 41259215 + * where WindowServer was timing out waiting for kextd to get all the + * kasan kexts loaded and started. + */ + enum { kTimeoutExtensions = 8 }; +#else enum { kTimeoutExtensions = 4 }; +#endif time = mach_absolute_time(); kextdWait = false; @@ -4194,7 +4921,7 @@ IOService::waitQuiet( uint64_t timeout ) break; } else if (kIOReturnTimeout != ret) { break; - } else if (timeout < 41000000000) { + } else if (timeout < (4100ull * NSEC_PER_SEC)) { break; } @@ -4459,7 +5186,7 @@ _IOServiceJob::pingConfig( _IOServiceJob * job ) if (gIOKitDebug & kIOLogConfig) { LOG("config(%d): creating\n", gNumConfigThreads - 1); } - _IOConfigThread::configThread(); + _IOConfigThread::configThread(gNumConfigThreads - 1); } semaphore_signal( gJobsSemaphore ); @@ -4517,14 +5244,14 @@ OSObject * IOService::copyExistingServices( OSDictionary * matching, IOOptionBits inState, IOOptionBits options ) { - OSObject * current = 0; + OSObject * current = NULL; OSIterator * iter; IOService * service; OSObject * obj; OSString * str; if (!matching) { - return 0; + return NULL; } #if MATCH_DEBUG @@ -4553,7 +5280,7 @@ IOService::copyExistingServices( OSDictionary * matching, ctx.count = 0; ctx.done = 0; ctx.options = options; - ctx.result = 0; + ctx.result = NULL; if ((str = OSDynamicCast(OSString, obj))) { const OSSymbol * sym = OSSymbol::withString(str); @@ -4570,7 +5297,7 @@ IOService::copyExistingServices( OSDictionary * matching, if (current && (ctx.done != ctx.count)) { OSSet * source = OSDynamicCast(OSSet, current); - current = 0; + current = NULL; while ((service = (IOService *) source->getAnyObject())) { if (service->matchPassive(matching, options)) { if (options & kIONotifyOnce) { @@ -4621,7 +5348,6 @@ IOService::copyExistingServices( OSDictionary * matching, iter->release(); } - if (((current != 0) != (_current != 0)) || (current && _current && !current->isEqualTo(_current))) { OSSerialize * s1 = OSSerialize::withCapacity(128); @@ -4704,17 +5430,17 @@ IOService::setNotification( IOServiceMatchingNotificationHandler handler, void * target, void * ref, SInt32 priority ) { - _IOServiceNotifier * notify = 0; + _IOServiceNotifier * notify = NULL; OSOrderedSet * set; if (!matching) { - return 0; + return NULL; } notify = new _IOServiceNotifier; if (notify && !notify->init()) { notify->release(); - notify = 0; + notify = NULL; } if (notify) { @@ -4735,9 +5461,9 @@ IOService::setNotification( ////// queue - if (0 == (set = (OSOrderedSet *) gNotifications->getObject( type ))) { + if (NULL == (set = (OSOrderedSet *) gNotifications->getObject( type ))) { set = OSOrderedSet::withCapacity( 1, - IONotifyOrdering, 0 ); + IONotifyOrdering, NULL ); if (set) { gNotifications->setObject( type, set ); set->release(); @@ -4765,7 +5491,7 @@ IOService::doInstallNotification( IOOptionBits inState; if (!matching) { - return 0; + return NULL; } if (type == gIOPublishNotification) { @@ -4779,7 +5505,7 @@ IOService::doInstallNotification( } else if ((type == gIOTerminatedNotification) || (type == gIOWillTerminateNotification)) { inState = 0; } else { - return 0; + return NULL; } notify = setNotification( type, matching, handler, target, ref, priority ); @@ -4788,7 +5514,7 @@ IOService::doInstallNotification( // get the current set exist = (OSIterator *) copyExistingServices( matching, inState ); } else { - exist = 0; + exist = NULL; } *existing = exist; @@ -4882,7 +5608,7 @@ IOService::addMatchingNotification( ret = notify = (_IOServiceNotifier *) installNotification( type, matching, handler, target, ref, priority, &existing ); if (!ret) { - return 0; + return NULL; } // send notifications for existing set @@ -4896,7 +5622,7 @@ IOService::addMatchingNotification( } LOCKWRITENOTIFY(); - bool removed = (0 == notify->whence); + bool removed = (NULL == notify->whence); notify->release(); if (removed) { ret = gIOServiceNullNotifier; @@ -4960,13 +5686,13 @@ IOService * IOService::waitForMatchingService( OSDictionary * matching, uint64_t timeout) { - IONotifier * notify = 0; + IONotifier * notify = NULL; // priority doesn't help us much since we need a thread wakeup SInt32 priority = 0; IOService * result; if (!matching) { - return 0; + return NULL; } result = NULL; @@ -4979,7 +5705,7 @@ IOService::waitForMatchingService( OSDictionary * matching, break; } notify = IOService::setNotification( gIOMatchedNotification, matching, - &IOService::syncNotificationHandler, (void *) 0, + &IOService::syncNotificationHandler, (void *) NULL, &result, priority ); if (!notify) { break; @@ -5027,6 +5753,7 @@ IOService::waitForService( OSDictionary * matching, return result; } +__dead2 void IOService::deliverNotification( const OSSymbol * type, IOOptionBits orNewState, IOOptionBits andNewState ) @@ -5040,7 +5767,7 @@ IOService::copyNotifiers(const OSSymbol * type, { _IOServiceNotifier * notify; OSIterator * iter; - OSArray * willSend = 0; + OSArray * willSend = NULL; lockForArbitration(); @@ -5056,7 +5783,7 @@ IOService::copyNotifiers(const OSSymbol * type, while ((notify = (_IOServiceNotifier *) iter->getNextObject())) { if (matchPassive(notify->matching, 0) && (kIOServiceNotifyEnable & notify->state)) { - if (0 == willSend) { + if (NULL == willSend) { willSend = OSArray::withCapacity(8); } if (willSend) { @@ -5093,7 +5820,7 @@ IOService::serviceMatching( const OSString * name, str = OSSymbol::withString(name); if (!str) { - return 0; + return NULL; } if (!table) { @@ -5115,7 +5842,7 @@ IOService::serviceMatching( const char * name, str = OSSymbol::withCString( name ); if (!str) { - return 0; + return NULL; } table = serviceMatching( str, table ); @@ -5145,7 +5872,7 @@ IOService::nameMatching( const char * name, str = OSSymbol::withCString( name ); if (!str) { - return 0; + return NULL; } table = nameMatching( str, table ); @@ -5173,7 +5900,7 @@ IOService::resourceMatching( const char * name, str = OSSymbol::withCString( name ); if (!str) { - return 0; + return NULL; } table = resourceMatching( str, table ); @@ -5190,7 +5917,7 @@ IOService::propertyMatching( const OSSymbol * key, const OSObject * value, properties = OSDictionary::withCapacity( 2 ); if (!properties) { - return 0; + return NULL; } properties->setObject( key, value ); @@ -5214,7 +5941,7 @@ IOService::registryEntryIDMatching( uint64_t entryID, num = OSNumber::withNumber( entryID, 64 ); if (!num) { - return 0; + return NULL; } if (!table) { @@ -5280,11 +6007,11 @@ _IOServiceNotifier::remove() if (whence) { whence->removeObject((OSObject *) this ); - whence = 0; + whence = NULL; } if (matching) { matching->release(); - matching = 0; + matching = NULL; } state &= ~kIOServiceNotifyEnable; @@ -5373,7 +6100,7 @@ IOResources::resources( void ) inst = new IOResources; if (inst && !inst->init()) { inst->release(); - inst = 0; + inst = NULL; } return inst; @@ -5397,8 +6124,8 @@ IOResources::init( OSDictionary * dictionary ) OSNumber *num; const OSSymbol * sym; - if ((num = OSNumber::withNumber(property_value, 32)) != 0) { - if ((sym = OSSymbol::withCString( property_name)) != 0) { + if ((num = OSNumber::withNumber(property_value, 32)) != NULL) { + if ((sym = OSSymbol::withCString( property_name)) != NULL) { this->setProperty( sym, num ); sym->release(); } @@ -5429,8 +6156,8 @@ IOResources::getWorkLoop() const } } -bool -IOResources::matchPropertyTable( OSDictionary * table ) +static bool +IOResourcesMatchPropertyTable(IOService * resources, OSDictionary * table) { OSObject * prop; OSString * str; @@ -5443,19 +6170,19 @@ IOResources::matchPropertyTable( OSDictionary * table ) prop = table->getObject( gIOResourceMatchKey ); str = OSDynamicCast( OSString, prop ); if (str) { - ok = (0 != getProperty( str )); + ok = (NULL != resources->getProperty( str )); } else if ((set = OSDynamicCast( OSSet, prop))) { iter = OSCollectionIterator::withCollection( set ); - ok = (iter != 0); + ok = (iter != NULL); while (ok && (str = OSDynamicCast( OSString, iter->getNextObject()))) { - ok = (0 != getProperty( str )); + ok = (NULL != resources->getProperty( str )); } if (iter) { iter->release(); } } else if ((prop = table->getObject(gIOResourceMatchedKey))) { - obj = copyProperty(gIOResourceMatchedKey); + obj = resources->copyProperty(gIOResourceMatchedKey); keys = OSDynamicCast(OSArray, obj); ok = false; if (keys) { @@ -5468,6 +6195,62 @@ IOResources::matchPropertyTable( OSDictionary * table ) return ok; } +bool +IOResources::matchPropertyTable( OSDictionary * table ) +{ + return IOResourcesMatchPropertyTable(this, table); +} + +/* + * IOUserResources + */ + +IOService * +IOUserResources::resources( void ) +{ + IOUserResources * inst; + + inst = OSTypeAlloc(IOUserResources); + if (inst && !inst->init()) { + inst->release(); + inst = NULL; + } + + return inst; +} + +bool +IOUserResources::init( OSDictionary * dictionary ) +{ + // Do super init first + if (!IOService::init()) { + return false; + } + return true; +} + +IOReturn +IOUserResources::newUserClient(task_t owningTask, void * securityID, + UInt32 type, OSDictionary * properties, + IOUserClient ** handler) +{ + return kIOReturnUnsupported; +} + +IOWorkLoop * +IOUserResources::getWorkLoop() const +{ + return getPlatform()->getWorkLoop(); +} + +bool +IOUserResources::matchPropertyTable( OSDictionary * table ) +{ + return IOResourcesMatchPropertyTable(this, table); +} + +// -- + void IOService::consoleLockTimer(thread_call_param_t p0, thread_call_param_t p1) { @@ -5507,7 +6290,7 @@ IOService::updateConsoleUsers(OSArray * consoleUsers, IOMessage systemMessage) gIOConsoleBooterLockState = kOSBooleanTrue; break; case kIOScreenLockNoLock: - gIOConsoleBooterLockState = 0; + gIOConsoleBooterLockState = NULL; break; case kIOScreenLockUnlocked: default: @@ -5519,7 +6302,7 @@ IOService::updateConsoleUsers(OSArray * consoleUsers, IOMessage systemMessage) } if (consoleUsers) { - OSNumber * num = 0; + OSNumber * num = NULL; bool loginLocked = true; gIOConsoleLoggedIn = false; @@ -5536,10 +6319,10 @@ IOService::updateConsoleUsers(OSArray * consoleUsers, IOMessage systemMessage) } #if HIBERNATION if (!loginLocked) { - gIOConsoleBooterLockState = 0; + gIOConsoleBooterLockState = NULL; } IOLog("IOConsoleUsers: time(%d) %ld->%d, lin %d, llk %d, \n", - (num != 0), gIOConsoleLockTime, (num ? num->unsigned32BitValue() : 0), + (num != NULL), gIOConsoleLockTime, (num ? num->unsigned32BitValue() : 0), gIOConsoleLoggedIn, loginLocked); #endif /* HIBERNATION */ gIOConsoleLockTime = num ? num->unsigned32BitValue() : 0; @@ -5587,7 +6370,7 @@ IOService::updateConsoleUsers(OSArray * consoleUsers, IOMessage systemMessage) gIOChosenEntry->setProperty(kIOScreenLockStateKey, &gIOScreenLockState, sizeof(gIOScreenLockState)); IOLog("IOConsoleUsers: gIOScreenLockState %d, hs %d, bs %d, now %ld, sm 0x%x\n", - gIOScreenLockState, gIOHibernateState, (gIOConsoleBooterLockState != 0), now, systemMessage); + gIOScreenLockState, gIOHibernateState, (gIOConsoleBooterLockState != NULL), now, systemMessage); } #endif /* HIBERNATION */ @@ -5622,12 +6405,12 @@ IOResources::setProperties( OSObject * properties ) } dict = OSDynamicCast(OSDictionary, properties); - if (0 == dict) { + if (NULL == dict) { return kIOReturnBadArgument; } iter = OSCollectionIterator::withCollection( dict); - if (0 == iter) { + if (NULL == iter) { return kIOReturnBadArgument; } @@ -5736,13 +6519,16 @@ IOService::addLocation( OSDictionary * table ) OSDictionary * dict; if (!table) { - return 0; + return NULL; } dict = OSDictionary::withCapacity( 1 ); if (dict) { - table->setObject( gIOLocationMatchKey, dict ); + bool ok = table->setObject( gIOLocationMatchKey, dict ); dict->release(); + if (!ok) { + dict = NULL; + } } return dict; @@ -5786,7 +6572,7 @@ IOService::matchInternal(OSDictionary * table, uint32_t options, uint32_t * did) str = OSDynamicCast(OSString, table->getObject(gIOProviderClassKey)); if (str) { done++; - match = ((kIOServiceClassDone & options) || (0 != metaCast(str))); + match = ((kIOServiceClassDone & options) || (NULL != metaCast(str))); #if MATCH_DEBUG match = (0 != metaCast( str )); if ((kIOServiceClassDone & options) && !match) { @@ -5801,7 +6587,7 @@ IOService::matchInternal(OSDictionary * table, uint32_t options, uint32_t * did) obj = table->getObject( gIONameMatchKey ); if (obj) { done++; - match = compareNames( obj, changesOK ? &matched : 0 ); + match = compareNames( obj, changesOK ? &matched : NULL ); if (!match) { break; } @@ -5841,20 +6627,20 @@ IOService::matchInternal(OSDictionary * table, uint32_t options, uint32_t * did) if (dict) { nextDict = OSDynamicCast( OSDictionary, obj); if (nextDict) { - iter = 0; + iter = NULL; } else { iter = OSCollectionIterator::withCollection( OSDynamicCast(OSCollection, obj)); } while (nextDict - || (iter && (0 != (nextDict = OSDynamicCast(OSDictionary, + || (iter && (NULL != (nextDict = OSDynamicCast(OSDictionary, iter->getNextObject()))))) { match = dict->isEqualTo( nextDict, nextDict); if (match) { break; } - nextDict = 0; + nextDict = NULL; } dict->release(); if (iter) { @@ -5877,20 +6663,20 @@ IOService::matchInternal(OSDictionary * table, uint32_t options, uint32_t * did) if (dict) { nextKey = OSDynamicCast( OSString, obj); if (nextKey) { - iter = 0; + iter = NULL; } else { iter = OSCollectionIterator::withCollection( OSDynamicCast(OSCollection, obj)); } while (nextKey - || (iter && (0 != (nextKey = OSDynamicCast(OSString, + || (iter && (NULL != (nextKey = OSDynamicCast(OSString, iter->getNextObject()))))) { - match = (0 != dict->getObject(nextKey)); + match = (NULL != dict->getObject(nextKey)); if (match) { break; } - nextKey = 0; + nextKey = NULL; } dict->release(); if (iter) { @@ -5927,7 +6713,7 @@ IOService::matchInternal(OSDictionary * table, uint32_t options, uint32_t * did) num = OSDynamicCast( OSNumber, table->getObject( gIOMatchedServiceCountKey )); if (num) { OSIterator * iter; - IOService * service = 0; + IOService * service = NULL; UInt32 serviceCount = 0; done++; @@ -5937,7 +6723,7 @@ IOService::matchInternal(OSDictionary * table, uint32_t options, uint32_t * did) if (kIOServiceInactiveState & service->__state[0]) { continue; } - if (0 == service->getProperty( gIOMatchCategoryKey )) { + if (NULL == service->getProperty( gIOMatchCategoryKey )) { continue; } ++serviceCount; @@ -6135,11 +6921,15 @@ IOService::newUserClient( task_t owningTask, void * securityID, UInt32 type, OSDictionary * properties, IOUserClient ** handler ) { - const OSSymbol *userClientClass = 0; + const OSSymbol *userClientClass = NULL; IOUserClient *client; OSObject *prop; OSObject *temp; + if (reserved && reserved->uvars && reserved->uvars->userServer) { + return reserved->uvars->userServer->serviceNewUserClient(this, owningTask, securityID, type, properties, handler); + } + if (kIOReturnSuccess == newUserClient( owningTask, securityID, type, handler )) { return kIOReturnSuccess; } @@ -6417,7 +7207,7 @@ IOService::getDeviceMemoryWithIndex( unsigned int index ) if (array) { range = (IODeviceMemory *) array->getObject( index ); } else { - range = 0; + range = NULL; } return range; @@ -6434,7 +7224,7 @@ IOService::mapDeviceMemoryWithIndex( unsigned int index, if (range) { map = range->map( options ); } else { - map = 0; + map = NULL; } return map; @@ -6539,7 +7329,7 @@ requireMaxCpuDelay(IOService * service, UInt32 ns, UInt32 delayType) // Check if entry found if (kNoReplace != replace) { - entries[replace].fService = 0; // Null the entry + entries[replace].fService = NULL; // Null the entry setCpuDelay = true; } } @@ -6665,16 +7455,16 @@ IOService::resolveInterrupt(IOService *nub, int source) // Get the parents list from the nub. array = OSDynamicCast(OSArray, nub->getProperty(gIOInterruptControllersKey)); - if (array == 0) { + if (array == NULL) { return kIOReturnNoResources; } // Allocate space for the IOInterruptSources if needed... then return early. - if (nub->_interruptSources == 0) { + if (nub->_interruptSources == NULL) { numSources = array->getCount(); interruptSources = (IOInterruptSource *)IOMalloc( numSources * sizeofAllIOInterruptSource); - if (interruptSources == 0) { + if (interruptSources == NULL) { return kIOReturnNoMemory; } @@ -6686,22 +7476,22 @@ IOService::resolveInterrupt(IOService *nub, int source) } interruptControllerName = OSDynamicCast(OSSymbol, array->getObject(source)); - if (interruptControllerName == 0) { + if (interruptControllerName == NULL) { return kIOReturnNoResources; } interruptController = getPlatform()->lookUpInterruptController(interruptControllerName); - if (interruptController == 0) { + if (interruptController == NULL) { return kIOReturnNoResources; } // Get the interrupt numbers from the nub. array = OSDynamicCast(OSArray, nub->getProperty(gIOInterruptSpecifiersKey)); - if (array == 0) { + if (array == NULL) { return kIOReturnNoResources; } data = OSDynamicCast(OSData, array->getObject(source)); - if (data == 0) { + if (data == NULL) { return kIOReturnNoResources; } @@ -6719,7 +7509,7 @@ IOService::lookupInterrupt(int source, bool resolve, IOInterruptController **int IOReturn ret; /* Make sure the _interruptSources are set */ - if (_interruptSources == 0) { + if (_interruptSources == NULL) { ret = resolveInterrupt(this, source); if (ret != kIOReturnSuccess) { return ret; @@ -7209,6 +7999,9 @@ IOService::setAuthorizationID( uint64_t authorizationID ) return status; } +/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ + + #if __LP64__ OSMetaClassDefineReservedUsed(IOService, 0); OSMetaClassDefineReservedUsed(IOService, 1); diff --git a/iokit/Kernel/IOServicePM.cpp b/iokit/Kernel/IOServicePM.cpp index c4d1c96fd..5cdc56a78 100644 --- a/iokit/Kernel/IOServicePM.cpp +++ b/iokit/Kernel/IOServicePM.cpp @@ -37,6 +37,7 @@ #include #include #include +#include #include #include @@ -86,36 +87,23 @@ static bool gIOPMInitialized = false; static uint32_t gIOPMBusyRequestCount = 0; static uint32_t gIOPMWorkInvokeCount = 0; static uint32_t gIOPMTickleGeneration = 0; -static IOWorkLoop * gIOPMWorkLoop = 0; -static IOPMRequestQueue * gIOPMRequestQueue = 0; -static IOPMRequestQueue * gIOPMReplyQueue = 0; -static IOPMWorkQueue * gIOPMWorkQueue = 0; -static IOPMCompletionQueue * gIOPMCompletionQueue = 0; -static IOPMRequest * gIOPMRequest = 0; -static IOService * gIOPMRootNode = 0; -static IOPlatformExpert * gPlatform = 0; - -static char gIOSpinDumpKextName[128]; -static char gIOSpinDumpDelayType[16]; -static uint32_t gIOSpinDumpDelayDuration = 0; - -static SYSCTL_STRING(_debug, OID_AUTO, swd_kext_name, - CTLFLAG_RW | CTLFLAG_KERN | CTLFLAG_LOCKED, - &gIOSpinDumpKextName, sizeof(gIOSpinDumpKextName), ""); -static SYSCTL_STRING(_debug, OID_AUTO, swd_delay_type, - CTLFLAG_RW | CTLFLAG_KERN | CTLFLAG_LOCKED, - &gIOSpinDumpDelayType, sizeof(gIOSpinDumpDelayType), ""); -static SYSCTL_INT(_debug, OID_AUTO, swd_delay_duration, - CTLFLAG_RW | CTLFLAG_KERN | CTLFLAG_LOCKED, - &gIOSpinDumpDelayDuration, 0, ""); - -const OSSymbol * gIOPMPowerClientDevice = 0; -const OSSymbol * gIOPMPowerClientDriver = 0; -const OSSymbol * gIOPMPowerClientChildProxy = 0; -const OSSymbol * gIOPMPowerClientChildren = 0; -const OSSymbol * gIOPMPowerClientRootDomain = 0; - -static const OSSymbol * gIOPMPowerClientAdvisoryTickle = 0; +static IOWorkLoop * gIOPMWorkLoop = NULL; +static IOPMRequestQueue * gIOPMRequestQueue = NULL; +static IOPMRequestQueue * gIOPMReplyQueue = NULL; +static IOPMWorkQueue * gIOPMWorkQueue = NULL; +static IOPMCompletionQueue * gIOPMCompletionQueue = NULL; +static IOPMRequest * gIOPMRequest = NULL; +static IOService * gIOPMRootNode = NULL; +static IOPlatformExpert * gPlatform = NULL; + + +const OSSymbol * gIOPMPowerClientDevice = NULL; +const OSSymbol * gIOPMPowerClientDriver = NULL; +const OSSymbol * gIOPMPowerClientChildProxy = NULL; +const OSSymbol * gIOPMPowerClientChildren = NULL; +const OSSymbol * gIOPMPowerClientRootDomain = NULL; + +static const OSSymbol * gIOPMPowerClientAdvisoryTickle = NULL; static bool gIOPMAdvisoryTickleEnabled = true; static thread_t gIOPMWatchDogThread = NULL; uint32_t gCanSleepTimeout = 0; @@ -367,19 +355,19 @@ IOService::PMinit( void ) if (gIOPMWorkLoop->addEventSource(gIOPMRequestQueue) != kIOReturnSuccess) { gIOPMRequestQueue->release(); - gIOPMRequestQueue = 0; + gIOPMRequestQueue = NULL; } if (gIOPMWorkLoop->addEventSource(gIOPMReplyQueue) != kIOReturnSuccess) { gIOPMReplyQueue->release(); - gIOPMReplyQueue = 0; + gIOPMReplyQueue = NULL; } if (gIOPMWorkLoop->addEventSource(gIOPMWorkQueue) != kIOReturnSuccess) { gIOPMWorkQueue->release(); - gIOPMWorkQueue = 0; + gIOPMWorkQueue = NULL; } // Must be added after the work queue, which pushes request @@ -387,7 +375,7 @@ IOService::PMinit( void ) if (gIOPMWorkLoop->addEventSource(gIOPMCompletionQueue) != kIOReturnSuccess) { gIOPMCompletionQueue->release(); - gIOPMCompletionQueue = 0; + gIOPMCompletionQueue = NULL; } gIOPMPowerClientDevice = @@ -407,9 +395,6 @@ IOService::PMinit( void ) gIOPMPowerClientRootDomain = OSSymbol::withCStringNoCopy( "RootDomainPower" ); - - gIOSpinDumpKextName[0] = '\0'; - gIOSpinDumpDelayType[0] = '\0'; } if (gIOPMRequestQueue && gIOPMReplyQueue && gIOPMCompletionQueue) { @@ -483,10 +468,6 @@ IOService::PMinit( void ) fDriverCallEntry = thread_call_allocate( (thread_call_func_t) &IOService::pmDriverCallout, this); assert(fDriverCallEntry); - if (kIOKextSpinDump & gIOKitDebug) { - fSpinDumpTimer = thread_call_allocate( - &IOService::spindump_timer_expired, (thread_call_param_t)this); - } // Check for powerChangeDone override. if (OSMemberFunctionCast(void (*)(void), @@ -523,7 +504,7 @@ void IOService::PMfree( void ) { initialized = false; - pm_vars = 0; + pm_vars = NULL; if (pwrMgt) { assert(fMachineState == kIOPM_Finished); @@ -568,11 +549,6 @@ IOService::PMfree( void ) thread_call_free(fDriverCallEntry); fDriverCallEntry = NULL; } - if (fSpinDumpTimer) { - thread_call_cancel(fSpinDumpTimer); - thread_call_free(fSpinDumpTimer); - fSpinDumpTimer = NULL; - } if (fPMLock) { IOLockFree(fPMLock); fPMLock = NULL; @@ -587,7 +563,7 @@ IOService::PMfree( void ) } if (fDriverCallParamSlots && fDriverCallParamPtr) { IODelete(fDriverCallParamPtr, DriverCallParam, fDriverCallParamSlots); - fDriverCallParamPtr = 0; + fDriverCallParamPtr = NULL; fDriverCallParamSlots = 0; } if (fResponseArray) { @@ -605,18 +581,18 @@ IOService::PMfree( void ) } if (fPowerClients) { fPowerClients->release(); - fPowerClients = 0; + fPowerClients = NULL; } #if PM_VARS_SUPPORT if (fPMVars) { fPMVars->release(); - fPMVars = 0; + fPMVars = NULL; } #endif pwrMgt->release(); - pwrMgt = 0; + pwrMgt = NULL; } } @@ -645,7 +621,7 @@ IOService::joinPMtree( IOService * driver ) IOPlatformExpert * platform; platform = getPlatform(); - assert(platform != 0); + assert(platform != NULL); platform->PMRegisterDevice(this, driver); } @@ -806,8 +782,8 @@ IOService::handlePMstop( IOPMRequest * request ) IOReturn IOService::addPowerChild( IOService * child ) { - IOPowerConnection * connection = 0; - IOPMRequest * requests[3] = {0, 0, 0}; + IOPowerConnection * connection = NULL; + IOPMRequest * requests[3] = {NULL, NULL, NULL}; OSIterator * iter; bool ok = true; @@ -839,7 +815,7 @@ IOService::addPowerChild( IOService * child ) iter->release(); } if (!ok) { - PM_LOG("%s: %s (%p) is already a child\n", + PM_LOG2("%s: %s (%p) is already a child\n", getName(), child->getName(), OBFUSCATE(child)); break; } @@ -1125,7 +1101,7 @@ IOService::registerPowerDriver( unsigned long numberOfStates ) { IOPMRequest * request; - IOPMPSEntry * powerStatesCopy = 0; + IOPMPSEntry * powerStatesCopy = NULL; IOPMPowerStateIndex stateOrder; IOReturn error = kIOReturnSuccess; @@ -1246,6 +1222,40 @@ IOService::handleRegisterPowerDriver( IOPMRequest * request ) lowestPowerState = fPowerStates[0].stateOrderToIndex; fHighestPowerState = fPowerStates[numberOfStates - 1].stateOrderToIndex; +#if !(defined(RC_HIDE_N144) || defined(RC_HIDE_N146)) + { + uint32_t aotFlags; + IOService * service; + OSObject * object; + OSData * data; + + // Disallow kIOPMAOTPower states unless device tree enabled + + aotFlags = 0; + service = this; + while (service && !service->inPlane(gIODTPlane)) { + service = service->getProvider(); + } + if (service) { + object = service->copyProperty(kIOPMAOTPowerKey, gIODTPlane); + data = OSDynamicCast(OSData, object); + if (data && (data->getLength() >= sizeof(uint32_t))) { + aotFlags = ((uint32_t *)data->getBytesNoCopy())[0]; + } + OSSafeReleaseNULL(object); + } + if (!aotFlags) { + for (i = 0; i < numberOfStates; i++) { + if (kIOPMAOTPower & fPowerStates[i].inputPowerFlags) { + fPowerStates[i].inputPowerFlags = 0xFFFFFFFF; + fPowerStates[i].capabilityFlags = 0; + fPowerStates[i].outputPowerFlags = 0; + } + } + } + } +#endif /* !(defined(RC_HIDE_N144) || defined(RC_HIDE_N146)) */ + // OR'in all the output power flags fMergedOutputPowerFlags = 0; fDeviceUsablePowerState = lowestPowerState; @@ -1434,7 +1444,7 @@ IOService::handleInterestChanged( IOPMRequest * request ) fInsertInterestSet->removeObject(driver); } fInsertInterestSet->release(); - fInsertInterestSet = 0; + fInsertInterestSet = NULL; } if (fRemoveInterestSet) { @@ -1451,7 +1461,7 @@ IOService::handleInterestChanged( IOPMRequest * request ) fRemoveInterestSet->removeObject(driver); } fRemoveInterestSet->release(); - fRemoveInterestSet = 0; + fRemoveInterestSet = NULL; } PM_UNLOCK(); @@ -1642,7 +1652,7 @@ IOService::adjustPowerState( uint32_t clamp ) /* flags */ changeFlags, /* power state */ fDesiredPowerState, /* domain flags */ 0, - /* connection */ 0, + /* connection */ NULL, /* parent flags */ 0); } } @@ -1656,7 +1666,7 @@ IOService::synchronizePowerTree( IOOptionBits options, IOService * notifyRoot ) { - IOPMRequest * request_c = 0; + IOPMRequest * request_c = NULL; IOPMRequest * request_s; if (this != getPMRootDomain()) { @@ -1666,7 +1676,7 @@ IOService::synchronizePowerTree( return kIOPMNotYetInitialized; } - OUR_PMLog(kPMLogCSynchronizePowerTree, options, (notifyRoot != 0)); + OUR_PMLog(kPMLogCSynchronizePowerTree, options, (notifyRoot != NULL)); if (notifyRoot) { IOPMRequest * nr; @@ -1727,7 +1737,7 @@ IOService::handleSynchronizePowerTree( IOPMRequest * request ) (options & kIOPMSyncNoChildNotify), /* power state */ fCurrentPowerState, /* domain flags */ 0, - /* connection */ 0, + /* connection */ NULL, /* parent flags */ 0); } } @@ -1965,6 +1975,12 @@ IOService::handlePowerDomainDidChangeTo( IOPMRequest * request ) myChangeFlags = kIOPMParentInitiated | kIOPMDomainDidChange | (parentChangeFlags & kIOPMRootBroadcastFlags); +#if !(defined(RC_HIDE_N144) || defined(RC_HIDE_N146)) + if (kIOPMAOTPower & fPowerStates[maxPowerState].inputPowerFlags) { + IOLog("aotPS %s0x%qx[%ld]\n", getName(), getRegistryEntryID(), maxPowerState); + } +#endif /* !(defined(RC_HIDE_N144) || defined(RC_HIDE_N146)) */ + result = startPowerChange( /* flags */ myChangeFlags, /* power state */ maxPowerState, @@ -2339,7 +2355,7 @@ IOService::changePowerStateWithOverrideTo( IOPMPowerStateIndex ordinal, request->fRequestTag = tag; request->fArg0 = (void *) ordinal; request->fArg1 = (void *) gIOPMPowerClientDevice; - request->fArg2 = 0; + request->fArg2 = NULL; #if NOT_READY if (action) { request->installCompletionAction( action, target, param ); @@ -2441,7 +2457,7 @@ IOService::requestPowerState( client->retain(); request->fArg0 = (void *)(uintptr_t) state; request->fArg1 = (void *) client; - request->fArg2 = 0; + request->fArg2 = NULL; #if NOT_READY if (action) { request->installCompletionAction( action, target, param ); @@ -2731,6 +2747,15 @@ IOService::computeDesiredState( unsigned long localClamp, bool computeOnly ) newPowerState = fHighestPowerState; } +#if !(defined(RC_HIDE_N144) || defined(RC_HIDE_N146)) + if (getPMRootDomain()->isAOTMode()) { + if ((kIOPMPreventIdleSleep & fPowerStates[newPowerState].capabilityFlags) + && !(kIOPMPreventIdleSleep & fPowerStates[fDesiredPowerState].capabilityFlags)) { + getPMRootDomain()->claimSystemWakeEvent(this, kIOPMWakeEventAOTExit, getName(), NULL); + } + } +#endif /* !(defined(RC_HIDE_N144) || defined(RC_HIDE_N146)) */ + fDesiredPowerState = newPowerState; PM_LOG1(" temp %u, clamp %u, current %u, new %u\n", @@ -3432,6 +3457,8 @@ IOService::startPowerChange( IOPowerConnection * parentConnection, IOPMPowerFlags parentFlags ) { + uint32_t savedPMActionsParam; + PM_ASSERT_IN_GATE(); assert( fMachineState == kIOPM_Finished ); assert( powerState < fNumberOfPowerStates ); @@ -3441,8 +3468,25 @@ IOService::startPowerChange( } fIsPreChange = true; + savedPMActionsParam = fPMActions.parameter; PM_ACTION_2(actionPowerChangeOverride, &powerState, &changeFlags); + // rdar://problem/55040032 + // Schedule a power adjustment after removing the power clamp + // to inform our power parent(s) about our latest desired domain + // power state. For a self-initiated change, let OurChangeStart() + // automatically request parent power when necessary. + if (!fAdjustPowerScheduled && + ((changeFlags & kIOPMSelfInitiated) == 0) && + ((fPMActions.parameter & kPMActionsFlagLimitPower) == 0) && + ((savedPMActionsParam & kPMActionsFlagLimitPower) != 0)) { + IOPMRequest * request = acquirePMRequest(this, kIOPMRequestTypeAdjustPowerState); + if (request) { + submitPMRequest(request); + fAdjustPowerScheduled = true; + } + } + if (changeFlags & kIOPMExpireIdleTimer) { // Root domain requested removal of tickle influence if (StateOrder(fDeviceDesire) > StateOrder(powerState)) { @@ -3523,7 +3567,7 @@ IOService::notifyInterestedDrivers( void ) if (fDriverCallParamSlots) { assert(fDriverCallParamPtr); IODelete(fDriverCallParamPtr, DriverCallParam, fDriverCallParamSlots); - fDriverCallParamPtr = 0; + fDriverCallParamPtr = NULL; fDriverCallParamSlots = 0; } @@ -3703,7 +3747,7 @@ IOService::notifyChildren( void ) OSIterator * iter; OSObject * next; IOPowerConnection * connection; - OSArray * children = 0; + OSArray * children = NULL; IOPMrootDomain * rootDomain; bool delayNotify = false; @@ -3763,10 +3807,10 @@ IOService::notifyChildren( void ) if (children && (children->getCount() == 0)) { children->release(); - children = 0; + children = NULL; } if (children) { - assert(fNotifyChildArray == 0); + assert(fNotifyChildArray == NULL); fNotifyChildArray = children; MS_PUSH(fMachineState); @@ -3818,7 +3862,7 @@ IOService::notifyChildrenOrdered( void ) fNotifyChildArray->removeObject(0); } else { fNotifyChildArray->release(); - fNotifyChildArray = 0; + fNotifyChildArray = NULL; MS_POP(); // pushed by notifyChildren() } @@ -3853,7 +3897,7 @@ IOService::notifyChildrenDelayed( void ) PM_LOG2("%s: notified delayed children\n", getName()); fNotifyChildArray->release(); - fNotifyChildArray = 0; + fNotifyChildArray = NULL; MS_POP(); // pushed by notifyChildren() } @@ -3954,14 +3998,11 @@ IOService::driverSetPowerState( void ) param = (DriverCallParam *) fDriverCallParamPtr; powerState = fHeadNotePowerState; - callEntry.callMethod = OSMemberFunctionCast(const void *, fControllingDriver, &IOService::setPowerState); - if (assertPMDriverCall(&callEntry)) { + if (assertPMDriverCall(&callEntry, kIOPMDriverCallMethodSetPowerState)) { OUR_PMLogFuncStart(kPMLogProgramHardware, (uintptr_t) this, powerState); - start_spindump_timer("SetState"); clock_get_uptime(&fDriverCallStartTime); result = fControllingDriver->setPowerState( powerState, this ); clock_get_uptime(&end); - stop_spindump_timer(); OUR_PMLogFuncEnd(kPMLogProgramHardware, (uintptr_t) this, (UInt32) result); deassertPMDriverCall(&callEntry); @@ -4017,6 +4058,8 @@ IOService::driverInformPowerChange( void ) AbsoluteTime end; IOReturn result; IOItemCount count; + IOOptionBits callMethod = (fDriverCallReason == kDriverCallInformPreChange) ? + kIOPMDriverCallMethodWillChange : kIOPMDriverCallMethodDidChange; assert( fDriverCallBusy ); assert( fDriverCallParamPtr ); @@ -4032,27 +4075,18 @@ IOService::driverInformPowerChange( void ) informee = (IOPMinformee *) param->Target; driver = informee->whatObject; - if (fDriverCallReason == kDriverCallInformPreChange) { - callEntry.callMethod = OSMemberFunctionCast(const void *, driver, &IOService::powerStateWillChangeTo); - } else { - callEntry.callMethod = OSMemberFunctionCast(const void *, driver, &IOService::powerStateDidChangeTo); - } - if (assertPMDriverCall(&callEntry, 0, informee)) { + if (assertPMDriverCall(&callEntry, callMethod, informee)) { if (fDriverCallReason == kDriverCallInformPreChange) { OUR_PMLogFuncStart(kPMLogInformDriverPreChange, (uintptr_t) this, powerState); - start_spindump_timer("WillChange"); clock_get_uptime(&informee->startTime); result = driver->powerStateWillChangeTo(powerFlags, powerState, this); clock_get_uptime(&end); - stop_spindump_timer(); OUR_PMLogFuncEnd(kPMLogInformDriverPreChange, (uintptr_t) this, result); } else { OUR_PMLogFuncStart(kPMLogInformDriverPostChange, (uintptr_t) this, powerState); - start_spindump_timer("DidChange"); clock_get_uptime(&informee->startTime); result = driver->powerStateDidChangeTo(powerFlags, powerState, this); clock_get_uptime(&end); - stop_spindump_timer(); OUR_PMLogFuncEnd(kPMLogInformDriverPostChange, (uintptr_t) this, result); } @@ -4340,7 +4374,7 @@ IOService::all_done( void ) // inform subclass policy-maker if (fPCDFunctionOverride && fParentsKnowState && - assertPMDriverCall(&callEntry, kIOPMADC_NoInactiveCheck)) { + assertPMDriverCall(&callEntry, kIOPMDriverCallMethodChangeDone, NULL, kIOPMDriverCallNoInactiveCheck)) { powerChangeDone(prevPowerState); deassertPMDriverCall(&callEntry); } @@ -4392,7 +4426,7 @@ IOService::all_done( void ) // inform subclass policy-maker if (fPCDFunctionOverride && fParentsKnowState && - assertPMDriverCall(&callEntry, kIOPMADC_NoInactiveCheck)) { + assertPMDriverCall(&callEntry, kIOPMDriverCallMethodChangeDone, NULL, kIOPMDriverCallNoInactiveCheck)) { powerChangeDone(prevPowerState); deassertPMDriverCall(&callEntry); } @@ -4509,7 +4543,7 @@ requestDomainPowerApplier( IOService * parent; IOPMRequestDomainPowerContext * context; - if ((connection = OSDynamicCast(IOPowerConnection, entry)) == 0) { + if ((connection = OSDynamicCast(IOPowerConnection, entry)) == NULL) { return; } parent = (IOService *) connection->copyParentEntry(gIOPowerPlane); @@ -5203,6 +5237,9 @@ IOService::startSettleTimer( void ) //********************************************************************************* #ifndef __LP64__ +#if MACH_ASSERT +__dead2 +#endif void IOService::ack_timer_ticked( void ) { @@ -5232,14 +5269,40 @@ IOService::ackTimerTick( void ) PM_ERROR("%s::setPowerState(%p, %lu -> %lu) timed out after %d ms\n", fName, OBFUSCATE(this), fCurrentPowerState, fHeadNotePowerState, NS_TO_MS(nsec)); -#if DEBUG && CONFIG_EMBEDDED - panic("%s::setPowerState(%p, %lu -> %lu) timed out after %d ms", - fName, this, fCurrentPowerState, fHeadNotePowerState, NS_TO_MS(nsec)); +#if DEBUG || DEVELOPMENT || CONFIG_EMBEDDED + uint32_t panic_allowed = -1; + PE_parse_boot_argn("setpowerstate_panic", &panic_allowed, sizeof(panic_allowed)); + if (panic_allowed != 0) { + // rdar://problem/48743340 - excluding AppleSEPManager from panic + const char *whitelist = "AppleSEPManager"; + if (strncmp(fName, whitelist, strlen(whitelist))) { + panic("%s::setPowerState(%p, %lu -> %lu) timed out after %d ms", + fName, this, fCurrentPowerState, fHeadNotePowerState, NS_TO_MS(nsec)); + } + } else { + PM_ERROR("setPowerState panic disabled by setpowerstate_panic boot-arg\n"); + } #else if (gIOKitDebug & kIOLogDebugPower) { panic("%s::setPowerState(%p, %lu -> %lu) timed out after %d ms", fName, this, fCurrentPowerState, fHeadNotePowerState, NS_TO_MS(nsec)); } else { + // panic for first party kexts + const void *function_addr = NULL; + OSKext *kext = NULL; + function_addr = OSMemberFunctionCast(const void *, fControllingDriver, &IOService::setPowerState); + kext = OSKext::lookupKextWithAddress((vm_address_t)function_addr); + if (kext) { + const char *bundleID = kext->getIdentifierCString(); + const char *apple_prefix = "com.apple"; + const char *kernel_prefix = "__kernel__"; + if (strncmp(bundleID, apple_prefix, strlen(apple_prefix)) == 0 || strncmp(bundleID, kernel_prefix, strlen(kernel_prefix)) == 0) { + // first party client + panic("%s::setPowerState(%p : %p, %lu -> %lu) timed out after %d ms", + fName, this, function_addr, fCurrentPowerState, fHeadNotePowerState, NS_TO_MS(nsec)); + } + kext->release(); + } // Unblock state machine and pretend driver has acked. done = true; } @@ -5283,6 +5346,7 @@ IOService::ackTimerTick( void ) if (fHeadNotePendingAcks == 0) { // yes, we can continue done = true; + getPMRootDomain()->reset_watchdog_timer(this, 0); } else { // no, set timer again start_ack_timer(); @@ -5414,7 +5478,6 @@ IOService::reset_watchdog_timer(IOService *blockedObject, int pendingResponseTim goto exit; } - for (i = 0; i < fBlockedArray->getCount(); i++) { obj = OSDynamicCast(IOService, fBlockedArray->getObject(i)); if (obj && (obj->fPendingResponseDeadline < deadline)) { @@ -5459,9 +5522,9 @@ IOService::watchdog_timer_expired( thread_call_param_t arg0, thread_call_param_t gIOPMWatchDogThread = current_thread(); getPMRootDomain()->sleepWakeDebugTrig(true); - gIOPMWatchDogThread = 0; + gIOPMWatchDogThread = NULL; thread_call_free(me->fWatchdogTimer); - me->fWatchdogTimer = 0; + me->fWatchdogTimer = NULL; return; } @@ -5558,108 +5621,6 @@ IOService::ack_timer_expired( thread_call_param_t arg0, thread_call_param_t arg1 me->release(); } -//********************************************************************************* -// [private] start_spindump_timer -//********************************************************************************* - -void -IOService::start_spindump_timer( const char * delay_type ) -{ - AbsoluteTime deadline; - boolean_t pending; - - if (!fSpinDumpTimer || !(kIOKextSpinDump & gIOKitDebug)) { - return; - } - - if (gIOSpinDumpKextName[0] == '\0' && - !(PE_parse_boot_argn("swd_kext_name", &gIOSpinDumpKextName, - sizeof(gIOSpinDumpKextName)))) { - return; - } - - if (strncmp(gIOSpinDumpKextName, fName, sizeof(gIOSpinDumpKextName)) != 0) { - return; - } - - if (gIOSpinDumpDelayType[0] == '\0' && - !(PE_parse_boot_argn("swd_delay_type", &gIOSpinDumpDelayType, - sizeof(gIOSpinDumpDelayType)))) { - strncpy(gIOSpinDumpDelayType, "SetState", sizeof(gIOSpinDumpDelayType)); - } - - if (strncmp(delay_type, gIOSpinDumpDelayType, sizeof(gIOSpinDumpDelayType)) != 0) { - return; - } - - if (gIOSpinDumpDelayDuration == 0 && - !(PE_parse_boot_argn("swd_delay_duration", &gIOSpinDumpDelayDuration, - sizeof(gIOSpinDumpDelayDuration)))) { - gIOSpinDumpDelayDuration = 300; - } - - clock_interval_to_deadline(gIOSpinDumpDelayDuration, kMillisecondScale, &deadline); - - retain(); - pending = thread_call_enter_delayed(fSpinDumpTimer, deadline); - if (pending) { - release(); - } -} - -//********************************************************************************* -// [private] stop_spindump_timer -//********************************************************************************* - -void -IOService::stop_spindump_timer( void ) -{ - boolean_t pending; - - if (!fSpinDumpTimer || !(kIOKextSpinDump & gIOKitDebug)) { - return; - } - - pending = thread_call_cancel(fSpinDumpTimer); - if (pending) { - release(); - } -} - - -//********************************************************************************* -// [static] actionSpinDumpTimerExpired -// -// Inside PM work loop's gate. -//********************************************************************************* - -IOReturn -IOService::actionSpinDumpTimerExpired( - OSObject * target, - void * arg0, void * arg1, - void * arg2, void * arg3 ) -{ - getPMRootDomain()->takeStackshot(false, false, true); - - return kIOReturnSuccess; -} - -//********************************************************************************* -// spindump_timer_expired -// -// Thread call function. Holds a retain while the callout is in flight. -//********************************************************************************* - -void -IOService::spindump_timer_expired( thread_call_param_t arg0, thread_call_param_t arg1 ) -{ - IOService * me = (IOService *) arg0; - - if (gIOPMWorkLoop) { - gIOPMWorkLoop->runAction(&actionSpinDumpTimerExpired, me); - } - me->release(); -} // MARK: - // MARK: Client Messaging @@ -5887,10 +5848,12 @@ IOService::tellClientsWithResponse( int messageType ) } context.responseArray = fResponseArray; - context.notifyClients = 0; + context.notifyClients = NULL; context.serialNumber = fSerialNumber; context.messageType = messageType; context.notifyType = fOutOfBandParameter; + context.skippedInDark = 0; + context.notSkippedInDark = 0; context.isPreChange = fIsPreChange; context.enableTracing = false; context.us = this; @@ -5902,7 +5865,7 @@ IOService::tellClientsWithResponse( int messageType ) OSMemberFunctionCast( IOPMMessageFilter, this, - &IOPMrootDomain::systemMessageFilter) : 0; + &IOPMrootDomain::systemMessageFilter) : NULL; switch (fOutOfBandParameter) { case kNotifyApps: @@ -5969,6 +5932,12 @@ IOService::tellClientsWithResponse( int messageType ) } fNotifyClientArray = context.notifyClients; + if (context.skippedInDark) { + IOLog("tellClientsWithResponse(%s, %d) %d of %d skipped in dark\n", + getIOMessageString(messageType), fOutOfBandParameter, + context.skippedInDark, context.skippedInDark + context.notSkippedInDark); + } + // do we have to wait for somebody? if (!checkForDone()) { OUR_PMLog(kPMLogStartAckTimer, context.maxTimeRequested, 0); @@ -6027,10 +5996,16 @@ IOService::pmTellAppWithResponse( OSObject * object, void * arg ) if (proc) { proc_suspended = get_task_pidsuspended((task_t) proc->task); - proc_rele(proc); - if (proc_suspended) { logClientIDForNotification(object, context, "PMTellAppWithResponse - Suspended"); +#if !(defined(RC_HIDE_N144) || defined(RC_HIDE_N146)) + } else if (getPMRootDomain()->isAOTMode() && get_task_suspended((task_t) proc->task)) { + proc_suspended = true; + context->skippedInDark++; +#endif /* !(defined(RC_HIDE_N144) || defined(RC_HIDE_N146)) */ + } + proc_rele(proc); + if (proc_suspended) { return; } } @@ -6038,16 +6013,17 @@ IOService::pmTellAppWithResponse( OSObject * object, void * arg ) } if (context->messageFilter && - !context->messageFilter(context->us, object, context, 0, &waitForReply)) { + !context->messageFilter(context->us, object, context, NULL, &waitForReply)) { if (kIOLogDebugPower & gIOKitDebug) { logClientIDForNotification(object, context, "DROP App"); } return; } + context->notSkippedInDark++; // Create client array (for tracking purposes) only if the service // has app clients. Usually only root domain does. - if (0 == context->notifyClients) { + if (NULL == context->notifyClients) { context->notifyClients = OSArray::withCapacity( 32 ); } @@ -6104,7 +6080,7 @@ IOService::pmTellClientWithResponse( OSObject * object, void * arg ) uint64_t nsec; if (context->messageFilter && - !context->messageFilter(context->us, object, context, 0, 0)) { + !context->messageFilter(context->us, object, context, NULL, NULL)) { if ((kIOLogDebugPower & gIOKitDebug) && (OSDynamicCast(_IOServiceInterestNotifier, object))) { _IOServiceInterestNotifier *n = (_IOServiceInterestNotifier *) object; @@ -6138,7 +6114,7 @@ IOService::pmTellClientWithResponse( OSObject * object, void * arg ) OBFUSCATE(object), OBFUSCATE(notifier->handler)); } - if (0 == context->notifyClients) { + if (NULL == context->notifyClients) { context->notifyClients = OSArray::withCapacity( 32 ); } @@ -6147,7 +6123,7 @@ IOService::pmTellClientWithResponse( OSObject * object, void * arg ) notify.stateNumber = context->stateNumber; notify.stateFlags = context->stateFlags; - if (context->enableTracing && (notifier != 0)) { + if (context->enableTracing && (notifier != NULL)) { getPMRootDomain()->traceDetail(notifier, true); } @@ -6236,9 +6212,40 @@ IOService::pmTellCapabilityAppWithResponse( OSObject * object, void * arg ) return; } + if (context->us == getPMRootDomain() && +#if !(defined(RC_HIDE_N144) || defined(RC_HIDE_N146)) + getPMRootDomain()->isAOTMode() +#else /* !(defined(RC_HIDE_N144) || defined(RC_HIDE_N146)) */ + false +#endif /* (defined(RC_HIDE_N144) || defined(RC_HIDE_N146)) */ + ) { + OSNumber *clientID = NULL; + boolean_t proc_suspended = FALSE; + proc_t proc = NULL; + if ((clientID = copyClientIDForNotification(object, context))) { + uint32_t clientPID = clientID->unsigned32BitValue(); + clientID->release(); + proc = proc_find(clientPID); + if (proc) { + proc_suspended = get_task_pidsuspended((task_t) proc->task); + if (proc_suspended) { + logClientIDForNotification(object, context, "PMTellCapablityAppWithResponse - Suspended"); + } else if (get_task_suspended((task_t) proc->task)) { + proc_suspended = true; + context->skippedInDark++; + } + proc_rele(proc); + if (proc_suspended) { + return; + } + } + } + } + context->notSkippedInDark++; + // Create client array (for tracking purposes) only if the service // has app clients. Usually only root domain does. - if (0 == context->notifyClients) { + if (NULL == context->notifyClients) { context->notifyClients = OSArray::withCapacity( 32 ); } @@ -6316,7 +6323,7 @@ IOService::pmTellCapabilityClientWithResponse( memset(&msgArg, 0, sizeof(msgArg)); if (context->messageFilter && - !context->messageFilter(context->us, object, context, &msgArg, 0)) { + !context->messageFilter(context->us, object, context, &msgArg, NULL)) { if ((kIOLogDebugPower & gIOKitDebug) && (OSDynamicCast(_IOServiceInterestNotifier, object))) { _IOServiceInterestNotifier *n = (_IOServiceInterestNotifier *) object; @@ -6328,7 +6335,7 @@ IOService::pmTellCapabilityClientWithResponse( return; } - if (0 == context->notifyClients) { + if (NULL == context->notifyClients) { context->notifyClients = OSArray::withCapacity( 32 ); } notifier = OSDynamicCast(_IOServiceInterestNotifier, object); @@ -6356,7 +6363,7 @@ IOService::pmTellCapabilityClientWithResponse( msgArg.notifyRef = msgRef; msgArg.maxWaitForReply = 0; - if (context->enableTracing && (notifier != 0)) { + if (context->enableTracing && (notifier != NULL)) { getPMRootDomain()->traceDetail(notifier, true); } @@ -6475,7 +6482,7 @@ IOService::tellClients( int messageType ) OSMemberFunctionCast( IOPMMessageFilter, this, - &IOPMrootDomain::systemMessageFilter) : 0; + &IOPMrootDomain::systemMessageFilter) : NULL; context.notifyType = kNotifyPriority; applyToInterested( gIOPriorityPowerStateInterest, @@ -6502,7 +6509,7 @@ tellKernelClientApplier( OSObject * object, void * arg ) IOPMInterestContext * context = (IOPMInterestContext *) arg; if (context->messageFilter && - !context->messageFilter(context->us, object, context, 0, 0)) { + !context->messageFilter(context->us, object, context, NULL, NULL)) { if ((kIOLogDebugPower & gIOKitDebug) && (OSDynamicCast(_IOServiceInterestNotifier, object))) { _IOServiceInterestNotifier *n = (_IOServiceInterestNotifier *) object; @@ -6514,7 +6521,7 @@ tellKernelClientApplier( OSObject * object, void * arg ) return; } - notify.powerRef = (void *) 0; + notify.powerRef = (void *) NULL; notify.returnValue = 0; notify.stateNumber = context->stateNumber; notify.stateFlags = context->stateFlags; @@ -6596,10 +6603,16 @@ tellAppClientApplier( OSObject * object, void * arg ) if (proc) { proc_suspended = get_task_pidsuspended((task_t) proc->task); - proc_rele(proc); - if (proc_suspended) { logClientIDForNotification(object, context, "tellAppClientApplier - Suspended"); +#if !(defined(RC_HIDE_N144) || defined(RC_HIDE_N146)) + } else if (IOService::getPMRootDomain()->isAOTMode() && get_task_suspended((task_t) proc->task)) { + proc_suspended = true; + context->skippedInDark++; +#endif /* !(defined(RC_HIDE_N144) || defined(RC_HIDE_N146)) */ + } + proc_rele(proc); + if (proc_suspended) { return; } } @@ -6607,18 +6620,19 @@ tellAppClientApplier( OSObject * object, void * arg ) } if (context->messageFilter && - !context->messageFilter(context->us, object, context, 0, 0)) { + !context->messageFilter(context->us, object, context, NULL, NULL)) { if (kIOLogDebugPower & gIOKitDebug) { logClientIDForNotification(object, context, "DROP App"); } return; } + context->notSkippedInDark++; if (kIOLogDebugPower & gIOKitDebug) { logClientIDForNotification(object, context, "MESG App"); } - context->us->messageClient(context->messageType, object, 0); + context->us->messageClient(context->messageType, object, NULL); } //********************************************************************************* @@ -6659,7 +6673,7 @@ IOService::responseValid( uint32_t refcon, int pid ) UInt16 serialComponent; UInt16 ordinalComponent; OSObject * theFlag; - OSObject *object = 0; + OSObject *object = NULL; serialComponent = (refcon >> 16) & 0xFFFF; ordinalComponent = (refcon & 0xFFFF); @@ -6674,7 +6688,7 @@ IOService::responseValid( uint32_t refcon, int pid ) theFlag = fResponseArray->getObject(ordinalComponent); - if (theFlag == 0) { + if (theFlag == NULL) { return false; } @@ -6727,7 +6741,7 @@ IOService::responseValid( uint32_t refcon, int pid ) } else if (object) { getPMRootDomain()->pmStatsRecordApplicationResponse( gIOPMStatsResponsePrompt, - 0, 0, 0, pid, object); + NULL, 0, 0, pid, object); } if (kOSBooleanFalse == theFlag) { @@ -6762,7 +6776,7 @@ IOService::allowPowerChange( unsigned long refcon ) request->fArg0 = (void *) refcon; request->fArg1 = (void *)(uintptr_t) proc_selfpid(); - request->fArg2 = (void *) 0; + request->fArg2 = (void *) NULL; submitPMRequest( request ); return kIOReturnSuccess; @@ -6814,6 +6828,23 @@ IOService::cancelPowerChange( unsigned long refcon ) return kIOReturnSuccess; } +//********************************************************************************* +// cancelIdlePowerDown +// +// Internal method to trigger an idle cancel or revert +//********************************************************************************* + +void +IOService::cancelIdlePowerDown( IOService * service ) +{ + IOPMRequest * request; + + request = acquirePMRequest(service, kIOPMRequestTypeIdleCancel); + if (request) { + submitPMRequest(request); + } +} + #ifndef __LP64__ IOReturn IOService::serializedCancelPowerChange2( unsigned long refcon ) @@ -7375,7 +7406,7 @@ IOService::actionPMCompletionQueue( IOPMRequest * request, IOPMCompletionQueue * queue ) { - bool more = (request->getNextRequest() != 0); + bool more = (request->getNextRequest() != NULL); IOPMRequest * root = request->getRootRequest(); if (root && (root != request)) { @@ -7767,7 +7798,7 @@ IOService::actionPMWorkQueueInvoke( IOPMRequest * request, IOPMWorkQueue * queue fMachineState); } - gIOPMRequest = 0; + gIOPMRequest = NULL; if (fMachineState == kIOPM_Finished) { stop_watchdog_timer(); @@ -7904,7 +7935,7 @@ IOService::actionPMReplyQueue( IOPMRequest * request, IOPMRequestQueue * queue ) getPMRootDomain()->pmStatsRecordApplicationResponse( gIOPMStatsResponseCancel, name ? name->getCStringNoCopy() : "", 0, - 0, (int)(uintptr_t) request->fArg1, 0); + 0, (int)(uintptr_t) request->fArg1, NULL); } } @@ -8009,10 +8040,11 @@ IOService::actionPMReplyQueue( IOPMRequest * request, IOPMRequestQueue * queue ) bool IOService::assertPMDriverCall( IOPMDriverCallEntry * entry, - IOOptionBits options, - IOPMinformee * inform ) + IOOptionBits method, + const IOPMinformee * inform, + IOOptionBits options ) { - IOService * target = 0; + IOService * target = NULL; bool ok = false; if (!initialized) { @@ -8025,7 +8057,7 @@ IOService::assertPMDriverCall( goto fail; } - if (((options & kIOPMADC_NoInactiveCheck) == 0) && isInactive()) { + if (((options & kIOPMDriverCallNoInactiveCheck) == 0) && isInactive()) { goto fail; } @@ -8039,6 +8071,24 @@ IOService::assertPMDriverCall( } } + // Record calling address for sleep failure diagnostics + switch (method) { + case kIOPMDriverCallMethodSetPowerState: + entry->callMethod = OSMemberFunctionCast(const void *, fControllingDriver, &IOService::setPowerState); + break; + case kIOPMDriverCallMethodWillChange: + entry->callMethod = OSMemberFunctionCast(const void *, target, &IOService::powerStateWillChangeTo); + break; + case kIOPMDriverCallMethodDidChange: + entry->callMethod = OSMemberFunctionCast(const void *, target, &IOService::powerStateDidChangeTo); + break; + case kIOPMDriverCallMethodUnknown: + case kIOPMDriverCallMethodSetAggressive: + default: + entry->callMethod = NULL; + break; + } + entry->thread = current_thread(); entry->target = target; queue_enter(&fPMDriverCallQueue, entry, IOPMDriverCallEntry *, link); @@ -8193,9 +8243,9 @@ IOPMRequest * IOPMRequest::create( void ) { IOPMRequest * me = OSTypeAlloc(IOPMRequest); - if (me && !me->init(0, kIOPMRequestTypeInvalid)) { + if (me && !me->init(NULL, kIOPMRequestTypeInvalid)) { me->release(); - me = 0; + me = NULL; } return me; } @@ -8235,14 +8285,14 @@ IOPMRequest::reset( void ) if (fCompletionAction && (fRequestType == kIOPMRequestTypeQuiescePowerTree)) { // Call the completion on PM work loop context fCompletionAction(fCompletionTarget, fCompletionParam); - fCompletionAction = 0; + fCompletionAction = NULL; } fRequestType = kIOPMRequestTypeInvalid; if (fTarget) { fTarget->release(); - fTarget = 0; + fTarget = NULL; } } @@ -8285,7 +8335,7 @@ IOPMRequest::detachNextRequest( void ) (uint32_t) fRequestNext->fWorkWaitCount, fTarget->getName()); #endif - fRequestNext = 0; + fRequestNext = NULL; ok = true; } return ok; @@ -8330,7 +8380,7 @@ IOPMRequest::detachRootRequest( void ) (uint32_t) fRequestRoot->fFreeWaitCount, fTarget->getName()); #endif - fRequestRoot = 0; + fRequestRoot = NULL; ok = true; } return ok; @@ -8353,7 +8403,7 @@ IOPMRequestQueue::create( IOService * inOwner, Action inAction ) IOPMRequestQueue * me = OSTypeAlloc(IOPMRequestQueue); if (me && !me->init(inOwner, inAction)) { me->release(); - me = 0; + me = NULL; } return me; } @@ -8367,7 +8417,7 @@ IOPMRequestQueue::init( IOService * inOwner, Action inAction ) queue_init(&fQueue); fLock = IOLockAlloc(); - return fLock != 0; + return fLock != NULL; } void @@ -8375,7 +8425,7 @@ IOPMRequestQueue::free( void ) { if (fLock) { IOLockFree(fLock); - fLock = 0; + fLock = NULL; } return IOEventSource::free(); } @@ -8458,7 +8508,7 @@ IOPMWorkQueue::create( IOService * inOwner, Action invoke, Action retire ) IOPMWorkQueue * me = OSTypeAlloc(IOPMWorkQueue); if (me && !me->init(inOwner, invoke, retire)) { me->release(); - me = 0; + me = NULL; } return me; } @@ -8467,7 +8517,7 @@ bool IOPMWorkQueue::init( IOService * inOwner, Action invoke, Action retire ) { if (!invoke || !retire || - !IOEventSource::init(inOwner, (IOEventSourceAction)0)) { + !IOEventSource::init(inOwner, (IOEventSourceAction)NULL)) { return false; } @@ -8562,7 +8612,7 @@ IOPMWorkQueue::checkRequestQueue( queue_head_t * requestQueue, bool * empty ) } if (request == fQuiesceRequest) { - fQuiesceRequest = 0; + fQuiesceRequest = NULL; } queue_remove_first(requestQueue, request, typeof(request), fCommandChain); @@ -8685,7 +8735,7 @@ IOPMWorkQueue::finishQuiesceRequest( IOPMRequest * quiesceRequest ) { if (fQuiesceRequest && (quiesceRequest == fQuiesceRequest) && (fQuiesceStartTime != 0)) { - fInvokeAction = 0; + fInvokeAction = NULL; fQuiesceFinishTime = mach_absolute_time(); } } @@ -8705,7 +8755,7 @@ IOPMCompletionQueue::create( IOService * inOwner, Action inAction ) IOPMCompletionQueue * me = OSTypeAlloc(IOPMCompletionQueue); if (me && !me->init(inOwner, inAction)) { me->release(); - me = 0; + me = NULL; } return me; } diff --git a/iokit/Kernel/IOServicePMPrivate.h b/iokit/Kernel/IOServicePMPrivate.h index 9c4f3bb7f..7d5a2bc54 100644 --- a/iokit/Kernel/IOServicePMPrivate.h +++ b/iokit/Kernel/IOServicePMPrivate.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2007 Apple Inc. All rights reserved. + * Copyright (c) 2019 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -158,7 +158,7 @@ class IOServicePM : public OSObject friend class IOService; friend class IOPMWorkQueue; - OSDeclareDefaultStructors( IOServicePM ) + OSDeclareDefaultStructors( IOServicePM ); private: // Link IOServicePM objects on IOPMWorkQueue. @@ -368,7 +368,6 @@ private: #define fWatchdogLock pwrMgt->WatchdogLock #define fBlockedArray pwrMgt->BlockedArray #define fPendingResponseDeadline pwrMgt->PendingResponseDeadline -#define fSpinDumpTimer pwrMgt->SpinDumpTimer #define fSettleTimeUS pwrMgt->SettleTimeUS #define fIdleTimerGeneration pwrMgt->IdleTimerGeneration #define fHeadNoteChangeFlags pwrMgt->HeadNoteChangeFlags @@ -542,6 +541,8 @@ struct IOPMInterestContext { uint32_t maxTimeRequested; uint32_t messageType; uint32_t notifyType; + uint32_t skippedInDark; + uint32_t notSkippedInDark; IOService * us; IOPMPowerStateIndex stateNumber; IOPMPowerFlags stateFlags; @@ -552,7 +553,17 @@ struct IOPMInterestContext { // assertPMDriverCall() options enum { - kIOPMADC_NoInactiveCheck = 1 + kIOPMDriverCallNoInactiveCheck = 1 +}; + +// assertPMDriverCall() method +enum { + kIOPMDriverCallMethodUnknown = 0, + kIOPMDriverCallMethodSetPowerState = 1, + kIOPMDriverCallMethodWillChange = 2, + kIOPMDriverCallMethodDidChange = 3, + kIOPMDriverCallMethodChangeDone = 4, + kIOPMDriverCallMethodSetAggressive = 5 }; //****************************************************************************** @@ -571,7 +582,7 @@ extern const OSSymbol *gIOPMStatsDriverPSChangeSlow; class IOPMRequest : public IOCommand { - OSDeclareDefaultStructors( IOPMRequest ) + OSDeclareDefaultStructors( IOPMRequest ); protected: IOService * fTarget; // request target @@ -621,7 +632,7 @@ public: return (IOPMRequest *) this; } #endif - return 0; + return NULL; } inline uint32_t @@ -652,7 +663,7 @@ public: isQuiesceType( void ) const { return (kIOPMRequestTypeQuiescePowerTree == fRequestType) && - (fCompletionAction != 0) && (fCompletionTarget != 0); + (fCompletionAction != NULL) && (fCompletionTarget != NULL); } inline void @@ -681,7 +692,7 @@ public: class IOPMRequestQueue : public IOEventSource { - OSDeclareDefaultStructors( IOPMRequestQueue ) + OSDeclareDefaultStructors( IOPMRequestQueue ); public: typedef bool (*Action)( IOService *, IOPMRequest *, IOPMRequestQueue * ); @@ -710,7 +721,7 @@ public: class IOPMWorkQueue : public IOEventSource { - OSDeclareDefaultStructors( IOPMWorkQueue ) + OSDeclareDefaultStructors( IOPMWorkQueue ); public: typedef bool (*Action)( IOService *, IOPMRequest *, IOPMWorkQueue * ); @@ -752,7 +763,7 @@ public: class IOPMCompletionQueue : public IOEventSource { - OSDeclareDefaultStructors( IOPMCompletionQueue ) + OSDeclareDefaultStructors( IOPMCompletionQueue ); public: typedef bool (*Action)( IOService *, IOPMRequest *, IOPMCompletionQueue * ); diff --git a/iokit/Kernel/IOServicePrivate.h b/iokit/Kernel/IOServicePrivate.h index 91bc2a1d1..4ae23be1b 100644 --- a/iokit/Kernel/IOServicePrivate.h +++ b/iokit/Kernel/IOServicePrivate.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 1998-2000 Apple Computer, Inc. All rights reserved. + * Copyright (c) 1998-2019 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -84,7 +84,7 @@ class _IOServiceNotifier : public IONotifier { friend class IOService; - OSDeclareDefaultStructors(_IOServiceNotifier) + OSDeclareDefaultStructors(_IOServiceNotifier); public: OSOrderedSet * whence; @@ -110,7 +110,7 @@ class _IOServiceInterestNotifier : public IONotifier { friend class IOService; - OSDeclareDefaultStructors(_IOServiceInterestNotifier) + OSDeclareDefaultStructors(_IOServiceInterestNotifier); public: queue_chain_t chain; @@ -131,7 +131,7 @@ public: class _IOServiceNullNotifier : public IONotifier { - OSDeclareDefaultStructors(_IOServiceNullNotifier) + OSDeclareDefaultStructors(_IOServiceNullNotifier); public: virtual void taggedRetain(const void *tag) const APPLE_KEXT_OVERRIDE; @@ -147,12 +147,10 @@ class _IOConfigThread : public OSObject { friend class IOService; - OSDeclareDefaultStructors(_IOConfigThread) + OSDeclareDefaultStructors(_IOConfigThread); public: - virtual void free() APPLE_KEXT_OVERRIDE; - - static void configThread( void ); + static void configThread( int configThreadId ); static void main( void * arg, wait_result_t result ); }; @@ -168,7 +166,7 @@ class _IOServiceJob : public OSObject { friend class IOService; - OSDeclareDefaultStructors(_IOServiceJob) + OSDeclareDefaultStructors(_IOServiceJob); public: int type; @@ -184,11 +182,11 @@ class IOResources : public IOService { friend class IOService; - OSDeclareDefaultStructors(IOResources) + OSDeclareDefaultStructors(IOResources); public: static IOService * resources( void ); - virtual bool init( OSDictionary * dictionary = 0 ) APPLE_KEXT_OVERRIDE; + virtual bool init( OSDictionary * dictionary = NULL ) APPLE_KEXT_OVERRIDE; virtual IOReturn newUserClient(task_t owningTask, void * securityID, UInt32 type, OSDictionary * properties, IOUserClient ** handler) APPLE_KEXT_OVERRIDE; @@ -197,11 +195,27 @@ public: virtual IOReturn setProperties( OSObject * properties ) APPLE_KEXT_OVERRIDE; }; +class IOUserResources : public IOService +{ + friend class IOService; + + OSDeclareDefaultStructors(IOUserResources); + +public: + static IOService * resources( void ); + virtual bool init( OSDictionary * dictionary = NULL ) APPLE_KEXT_OVERRIDE; + virtual IOReturn newUserClient(task_t owningTask, void * securityID, + UInt32 type, OSDictionary * properties, + IOUserClient ** handler) APPLE_KEXT_OVERRIDE; + virtual IOWorkLoop * getWorkLoop() const APPLE_KEXT_OVERRIDE; + virtual bool matchPropertyTable( OSDictionary * table ) APPLE_KEXT_OVERRIDE; +}; + class _IOOpenServiceIterator : public OSIterator { friend class IOService; - OSDeclareDefaultStructors(_IOOpenServiceIterator) + OSDeclareDefaultStructors(_IOOpenServiceIterator); OSIterator * iter; const IOService * client; diff --git a/iokit/Kernel/IOSharedDataQueue.cpp b/iokit/Kernel/IOSharedDataQueue.cpp index 797583bf8..71d3c6817 100644 --- a/iokit/Kernel/IOSharedDataQueue.cpp +++ b/iokit/Kernel/IOSharedDataQueue.cpp @@ -50,7 +50,7 @@ IOSharedDataQueue *IOSharedDataQueue::withCapacity(UInt32 size) if (dataQueue) { if (!dataQueue->initWithCapacity(size)) { dataQueue->release(); - dataQueue = 0; + dataQueue = NULL; } } @@ -65,7 +65,7 @@ IOSharedDataQueue::withEntries(UInt32 numEntries, UInt32 entrySize) if (dataQueue) { if (!dataQueue->initWithEntries(numEntries, entrySize)) { dataQueue->release(); - dataQueue = 0; + dataQueue = NULL; } } @@ -98,7 +98,7 @@ IOSharedDataQueue::initWithCapacity(UInt32 size) } dataQueue = (IODataQueueMemory *)IOMallocAligned(allocSize, PAGE_SIZE); - if (dataQueue == 0) { + if (dataQueue == NULL) { return false; } bzero(dataQueue, allocSize); @@ -150,9 +150,9 @@ IOSharedDataQueue::free() IOMemoryDescriptor * IOSharedDataQueue::getMemoryDescriptor() { - IOMemoryDescriptor *descriptor = 0; + IOMemoryDescriptor *descriptor = NULL; - if (dataQueue != 0) { + if (dataQueue != NULL) { descriptor = IOMemoryDescriptor::withAddress(dataQueue, getQueueSize() + DATA_QUEUE_MEMORY_HEADER_SIZE + DATA_QUEUE_MEMORY_APPENDIX_SIZE, kIODirectionOutIn); } @@ -163,7 +163,7 @@ IOSharedDataQueue::getMemoryDescriptor() IODataQueueEntry * IOSharedDataQueue::peek() { - IODataQueueEntry *entry = 0; + IODataQueueEntry *entry = NULL; UInt32 headOffset; UInt32 tailOffset; @@ -177,7 +177,7 @@ IOSharedDataQueue::peek() tailOffset = __c11_atomic_load((_Atomic UInt32 *)&dataQueue->tail, __ATOMIC_ACQUIRE); if (headOffset != tailOffset) { - volatile IODataQueueEntry * head = 0; + volatile IODataQueueEntry * head = NULL; UInt32 headSize = 0; UInt32 headOffset = dataQueue->head; UInt32 queueSize = getQueueSize(); @@ -239,7 +239,7 @@ IOSharedDataQueue::enqueue(void * data, UInt32 dataSize) entry = (IODataQueueEntry *)((UInt8 *)dataQueue->queue + tail); entry->size = dataSize; - memcpy(&entry->data, data, dataSize); + __nochk_memcpy(&entry->data, data, dataSize); // The tail can be out of bound when the size of the new entry // exactly matches the available space at the end of the queue. @@ -260,7 +260,7 @@ IOSharedDataQueue::enqueue(void * data, UInt32 dataSize) ((IODataQueueEntry *)((UInt8 *)dataQueue->queue + tail))->size = dataSize; } - memcpy(&dataQueue->queue->data, data, dataSize); + __nochk_memcpy(&dataQueue->queue->data, data, dataSize); newTail = entrySize; } else { return false; // queue is full @@ -273,7 +273,7 @@ IOSharedDataQueue::enqueue(void * data, UInt32 dataSize) entry = (IODataQueueEntry *)((UInt8 *)dataQueue->queue + tail); entry->size = dataSize; - memcpy(&entry->data, data, dataSize); + __nochk_memcpy(&entry->data, data, dataSize); newTail = tail + entrySize; } else { return false; // queue is full @@ -308,7 +308,7 @@ Boolean IOSharedDataQueue::dequeue(void *data, UInt32 *dataSize) { Boolean retVal = TRUE; - volatile IODataQueueEntry * entry = 0; + volatile IODataQueueEntry * entry = NULL; UInt32 entrySize = 0; UInt32 headOffset = 0; UInt32 tailOffset = 0; @@ -324,7 +324,7 @@ IOSharedDataQueue::dequeue(void *data, UInt32 *dataSize) tailOffset = __c11_atomic_load((_Atomic UInt32 *)&dataQueue->tail, __ATOMIC_ACQUIRE); if (headOffset != tailOffset) { - volatile IODataQueueEntry * head = 0; + volatile IODataQueueEntry * head = NULL; UInt32 headSize = 0; UInt32 queueSize = getQueueSize(); @@ -372,7 +372,7 @@ IOSharedDataQueue::dequeue(void *data, UInt32 *dataSize) // not enough space return false; } - memcpy(data, (void *)entry->data, entrySize); + __nochk_memcpy(data, (void *)entry->data, entrySize); *dataSize = entrySize; } diff --git a/iokit/Kernel/IOStartIOKit.cpp b/iokit/Kernel/IOStartIOKit.cpp index 196e91086..e64a84a45 100644 --- a/iokit/Kernel/IOStartIOKit.cpp +++ b/iokit/Kernel/IOStartIOKit.cpp @@ -70,8 +70,12 @@ IOKitInitializeTime( void ) t.tv_sec = 30; t.tv_nsec = 0; + +// RTC is not present on this target +#ifndef BCM2837 IOService::waitForService( IOService::resourceMatching("IORTC"), &t ); +#endif #if defined(__i386__) || defined(__x86_64__) IOService::waitForService( IOService::resourceMatching("IONVRAM"), &t ); @@ -116,7 +120,7 @@ iokit_post_constructor_init(void) /***** * Pointer into bootstrap KLD segment for functions never used past startup. */ -void (*record_startup_extensions_function)(void) = 0; +void (*record_startup_extensions_function)(void) = NULL; void StartIOKit( void * p1, void * p2, void * p3, void * p4 ) @@ -143,6 +147,12 @@ StartIOKit( void * p1, void * p2, void * p3, void * p4 ) if (PE_parse_boot_argn( "pmtimeout", &debugFlags, sizeof(debugFlags))) { gCanSleepTimeout = debugFlags; } + + if (PE_parse_boot_argn( "dk", &debugFlags, sizeof(debugFlags))) { + gIODKDebug = debugFlags; + } + + // // Have to start IOKit environment before we attempt to start // the C++ runtime environment. At some stage we have to clean up @@ -152,6 +162,7 @@ StartIOKit( void * p1, void * p2, void * p3, void * p4 ) // IOLibInit(); OSlibkernInit(); + IOMachPortInitialize(); devsw_init(); gIOProgressBackbufferKey = OSSymbol::withCStringNoCopy(kIOProgressBackbufferKey); @@ -162,7 +173,7 @@ StartIOKit( void * p1, void * p2, void * p3, void * p4 ) rootNub = new IOPlatformExpertDevice; if (rootNub && rootNub->initWithArgs( p1, p2, p3, p4)) { - rootNub->attach( 0 ); + rootNub->attach( NULL ); /* If the bootstrap segment set up a function to record startup * extensions, call it now. diff --git a/iokit/Kernel/IOStatistics.cpp b/iokit/Kernel/IOStatistics.cpp index 54338fb77..6bac5ad69 100644 --- a/iokit/Kernel/IOStatistics.cpp +++ b/iokit/Kernel/IOStatistics.cpp @@ -168,19 +168,19 @@ oid_sysctl(__unused struct sysctl_oid *oidp, __unused void *arg1, int arg2, stru return error; } -SYSCTL_NODE(_debug, OID_AUTO, iokit_statistics, CTLFLAG_RW | CTLFLAG_LOCKED, 0, "IOStatistics"); +SYSCTL_NODE(_debug, OID_AUTO, iokit_statistics, CTLFLAG_RW | CTLFLAG_LOCKED, NULL, "IOStatistics"); static SYSCTL_PROC(_debug_iokit_statistics, OID_AUTO, general, CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_NOAUTO | CTLFLAG_KERN | CTLFLAG_LOCKED, - 0, kIOStatisticsGeneral, oid_sysctl, "S", ""); + NULL, kIOStatisticsGeneral, oid_sysctl, "S", ""); static SYSCTL_PROC(_debug_iokit_statistics, OID_AUTO, workloop, CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_NOAUTO | CTLFLAG_KERN | CTLFLAG_LOCKED, - 0, kIOStatisticsWorkLoop, oid_sysctl, "S", ""); + NULL, kIOStatisticsWorkLoop, oid_sysctl, "S", ""); static SYSCTL_PROC(_debug_iokit_statistics, OID_AUTO, userclient, CTLTYPE_STRUCT | CTLFLAG_RW | CTLFLAG_NOAUTO | CTLFLAG_KERN | CTLFLAG_LOCKED, - 0, kIOStatisticsUserClient, oid_sysctl, "S", ""); + NULL, kIOStatisticsUserClient, oid_sysctl, "S", ""); void IOStatistics::initialize() @@ -1260,7 +1260,7 @@ IOStatistics::getKextNodeFromBacktrace(boolean_t write) * overhead. OSBacktrace does many safety checks that * are not needed in this situation. */ - btCount = backtrace((uintptr_t*)bt, btCount); + btCount = backtrace((uintptr_t*)bt, btCount, NULL); if (write) { IORWLockWrite(lock); diff --git a/iokit/Kernel/IOSubMemoryDescriptor.cpp b/iokit/Kernel/IOSubMemoryDescriptor.cpp index 8d0a472b1..c65c7c486 100644 --- a/iokit/Kernel/IOSubMemoryDescriptor.cpp +++ b/iokit/Kernel/IOSubMemoryDescriptor.cpp @@ -53,7 +53,7 @@ IOSubMemoryDescriptor::withSubRange(IOMemoryDescriptor * of, if (self && !self->initSubRange(of, offset, length, (IODirection) options)) { self->release(); - self = 0; + self = NULL; } return self; } @@ -151,6 +151,22 @@ IOSubMemoryDescriptor::setPurgeable( IOOptionBits newState, return err; } +IOReturn +IOSubMemoryDescriptor::setOwnership( task_t newOwner, + int newLedgerTag, + IOOptionBits newLedgerOptions ) +{ + IOReturn err; + + if (iokit_iomd_setownership_enabled == FALSE) { + return kIOReturnUnsupported; + } + + err = _parent->setOwnership( newOwner, newLedgerTag, newLedgerOptions ); + + return err; +} + IOReturn IOSubMemoryDescriptor::prepare( IODirection forDirection) @@ -182,7 +198,7 @@ IOSubMemoryDescriptor::makeMapping( IOByteCount offset, IOByteCount length ) { - IOMemoryMap * mapping = 0; + IOMemoryMap * mapping = NULL; #ifndef __LP64__ if (!(kIOMap64Bit & options)) { diff --git a/iokit/Kernel/IOSyncer.cpp b/iokit/Kernel/IOSyncer.cpp index 32449d6f6..b4df67d2f 100644 --- a/iokit/Kernel/IOSyncer.cpp +++ b/iokit/Kernel/IOSyncer.cpp @@ -38,7 +38,7 @@ IOSyncer * IOSyncer::create(bool twoRetains) if (me && !me->init(twoRetains)) { me->release(); - return 0; + return NULL; } return me; diff --git a/iokit/Kernel/IOTimerEventSource.cpp b/iokit/Kernel/IOTimerEventSource.cpp index ad6b75455..59eb6a0a2 100644 --- a/iokit/Kernel/IOTimerEventSource.cpp +++ b/iokit/Kernel/IOTimerEventSource.cpp @@ -100,29 +100,29 @@ do { \ // __inline__ void -IOTimerEventSource::invokeAction(IOTimerEventSource::Action action, IOTimerEventSource * ts, - OSObject * owner, IOWorkLoop * workLoop) +IOTimerEventSource::invokeAction(IOTimerEventSource::Action _action, IOTimerEventSource * ts, + OSObject * _owner, IOWorkLoop * _workLoop) { bool trace = (gIOKitTrace & kIOTraceTimers) ? true : false; if (trace) { IOTimeStampStartConstant(IODBG_TIMES(IOTIMES_ACTION), - VM_KERNEL_ADDRHIDE(action), VM_KERNEL_ADDRHIDE(owner)); + VM_KERNEL_ADDRHIDE(_action), VM_KERNEL_ADDRHIDE(_owner)); } if (kActionBlock & flags) { ((IOTimerEventSource::ActionBlock) actionBlock)(ts); } else { - (*action)(owner, ts); + (*_action)(_owner, ts); } #if CONFIG_DTRACE - DTRACE_TMR3(iotescallout__expire, Action, action, OSObject, owner, void, workLoop); + DTRACE_TMR3(iotescallout__expire, Action, _action, OSObject, _owner, void, _workLoop); #endif if (trace) { IOTimeStampEndConstant(IODBG_TIMES(IOTIMES_ACTION), - VM_KERNEL_UNSLIDE(action), VM_KERNEL_ADDRHIDE(owner)); + VM_KERNEL_UNSLIDE(_action), VM_KERNEL_ADDRHIDE(_owner)); } } @@ -319,19 +319,19 @@ IOTimerEventSource::timerEventSource(uint32_t inOptions, OSObject *inOwner, Acti if (me && !me->init(inOptions, inOwner, inAction)) { me->release(); - return 0; + return NULL; } return me; } IOTimerEventSource * -IOTimerEventSource::timerEventSource(uint32_t options, OSObject *inOwner, ActionBlock action) +IOTimerEventSource::timerEventSource(uint32_t options, OSObject *inOwner, ActionBlock _action) { IOTimerEventSource * tes; tes = IOTimerEventSource::timerEventSource(options, inOwner, (Action) NULL); if (tes) { - tes->setActionBlock((IOEventSource::ActionBlock) action); + tes->setActionBlock((IOEventSource::ActionBlock) _action); } return tes; diff --git a/iokit/Kernel/IOUserClient.cpp b/iokit/Kernel/IOUserClient.cpp index 12ae32416..bbe9448fd 100644 --- a/iokit/Kernel/IOUserClient.cpp +++ b/iokit/Kernel/IOUserClient.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 1998-2014 Apple Inc. All rights reserved. + * Copyright (c) 1998-2019 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -41,13 +41,16 @@ #include #include #include +#include #include #include +#include #include #include #include #include +#include #if CONFIG_MACF @@ -132,29 +135,37 @@ extern "C" { #include } /* extern "C" */ +struct IOMachPortHashList; + +static_assert(IKOT_MAX_TYPE <= 255); + /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ // IOMachPort maps OSObjects to ports, avoiding adding an ivar to OSObject. - class IOMachPort : public OSObject { - OSDeclareDefaultStructors(IOMachPort) + OSDeclareDefaultStructors(IOMachPort); public: - OSObject * object; + SLIST_ENTRY(IOMachPort) link; ipc_port_t port; + OSObject* object; UInt32 mscount; UInt8 holdDestroy; + UInt8 type; + + static IOMachPort* withObjectAndType(OSObject *obj, ipc_kobject_type_t type); + + static IOMachPortHashList* bucketForObject(OSObject *obj, + ipc_kobject_type_t type); + + static IOMachPort* portForObjectInBucket(IOMachPortHashList *bucket, OSObject *obj, ipc_kobject_type_t type); - static IOMachPort * portForObject( OSObject * obj, - ipc_kobject_type_t type ); static bool noMoreSendersForObject( OSObject * obj, ipc_kobject_type_t type, mach_port_mscount_t * mscount ); static void releasePortForObject( OSObject * obj, ipc_kobject_type_t type ); static void setHoldDestroy( OSObject * obj, ipc_kobject_type_t type ); - static OSDictionary * dictForType( ipc_kobject_type_t type ); - static mach_port_name_t makeSendRightForTask( task_t task, io_object_t obj, ipc_kobject_type_t type ); @@ -165,116 +176,109 @@ public: OSDefineMetaClassAndStructors(IOMachPort, OSObject) static IOLock * gIOObjectPortLock; +IOLock * gIOUserServerLock; /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ -// not in dictForType() for debugging ease -static OSDictionary * gIOObjectPorts; -static OSDictionary * gIOConnectPorts; -static OSDictionary * gIOIdentifierPorts; +SLIST_HEAD(IOMachPortHashList, IOMachPort); -OSDictionary * -IOMachPort::dictForType( ipc_kobject_type_t type ) -{ - OSDictionary ** dict; +#if CONFIG_EMBEDDED +#define PORT_HASH_SIZE 256 +#else +#define PORT_HASH_SIZE 4096 +#endif /* CONFIG_EMBEDDED */ - switch (type) { - case IKOT_IOKIT_OBJECT: - dict = &gIOObjectPorts; - break; - case IKOT_IOKIT_CONNECT: - dict = &gIOConnectPorts; - break; - case IKOT_IOKIT_IDENT: - dict = &gIOIdentifierPorts; - break; - default: - panic("dictForType %d", type); - dict = NULL; - break; - } +IOMachPortHashList ports[PORT_HASH_SIZE]; - if (0 == *dict) { - *dict = OSDictionary::withCapacity( 1 ); +void +IOMachPortInitialize(void) +{ + for (size_t i = 0; i < PORT_HASH_SIZE; i++) { + SLIST_INIT(&ports[i]); } - - return *dict; } -IOMachPort * -IOMachPort::portForObject( OSObject * obj, - ipc_kobject_type_t type ) +IOMachPortHashList* +IOMachPort::bucketForObject(OSObject *obj, ipc_kobject_type_t type ) { - IOMachPort * inst = 0; - OSDictionary * dict; + return &ports[os_hash_kernel_pointer(obj) % PORT_HASH_SIZE]; +} - IOTakeLock( gIOObjectPortLock); +IOMachPort* +IOMachPort::portForObjectInBucket(IOMachPortHashList *bucket, OSObject *obj, ipc_kobject_type_t type) +{ + IOMachPort *machPort; - do { - dict = dictForType( type ); - if (!dict) { - continue; + SLIST_FOREACH(machPort, bucket, link) { + if (machPort->object == obj && machPort->type == type) { + return machPort; } + } + return NULL; +} - if ((inst = (IOMachPort *) - dict->getObject((const OSSymbol *) obj ))) { - inst->mscount++; - inst->retain(); - continue; - } +IOMachPort* +IOMachPort::withObjectAndType(OSObject *obj, ipc_kobject_type_t type) +{ + IOMachPort *machPort = NULL; - inst = new IOMachPort; - if (inst && !inst->init()) { - inst = 0; - continue; - } + machPort = new IOMachPort; + if (__improbable(machPort && !machPort->init())) { + return NULL; + } - inst->port = iokit_alloc_object_port( obj, type ); - if (inst->port) { - // retains obj - dict->setObject((const OSSymbol *) obj, inst ); - inst->mscount++; - } else { - inst->release(); - inst = 0; - } - } while (false); + machPort->object = obj; + machPort->type = (typeof(machPort->type))type; + machPort->port = iokit_alloc_object_port(obj, type); - IOUnlock( gIOObjectPortLock); + obj->taggedRetain(OSTypeID(OSCollection)); + machPort->mscount++; - return inst; + return machPort; } bool IOMachPort::noMoreSendersForObject( OSObject * obj, ipc_kobject_type_t type, mach_port_mscount_t * mscount ) { - OSDictionary * dict; - IOMachPort * machPort; - IOUserClient * uc; - bool destroyed = true; + IOMachPort *machPort = NULL; + IOUserClient *uc; + OSAction *action; + bool destroyed = true; - IOTakeLock( gIOObjectPortLock); + IOMachPortHashList *bucket = IOMachPort::bucketForObject(obj, type); - if ((dict = dictForType( type ))) { - obj->retain(); + obj->retain(); - machPort = (IOMachPort *) dict->getObject((const OSSymbol *) obj ); - if (machPort) { - destroyed = (machPort->mscount <= *mscount); - if (!destroyed) { - *mscount = machPort->mscount; - } else { - if ((IKOT_IOKIT_CONNECT == type) && (uc = OSDynamicCast(IOUserClient, obj))) { - uc->noMoreSenders(); - } - dict->removeObject((const OSSymbol *) obj ); + lck_mtx_lock(gIOObjectPortLock); + + machPort = IOMachPort::portForObjectInBucket(bucket, obj, type); + + if (machPort) { + destroyed = (machPort->mscount <= *mscount); + if (!destroyed) { + *mscount = machPort->mscount; + lck_mtx_unlock(gIOObjectPortLock); + } else { + if ((IKOT_IOKIT_CONNECT == type) && (uc = OSDynamicCast(IOUserClient, obj))) { + uc->noMoreSenders(); } + SLIST_REMOVE(bucket, machPort, IOMachPort, link); + + lck_mtx_unlock(gIOObjectPortLock); + + machPort->release(); + obj->taggedRelease(OSTypeID(OSCollection)); } - obj->release(); + } else { + lck_mtx_unlock(gIOObjectPortLock); + } + + if ((IKOT_UEXT_OBJECT == type) && (action = OSDynamicCast(OSAction, obj))) { + action->Aborted(); } - IOUnlock( gIOObjectPortLock); + obj->release(); return destroyed; } @@ -283,76 +287,108 @@ void IOMachPort::releasePortForObject( OSObject * obj, ipc_kobject_type_t type ) { - OSDictionary * dict; - IOMachPort * machPort; + IOMachPort *machPort; + IOMachPortHashList *bucket = IOMachPort::bucketForObject(obj, type); assert(IKOT_IOKIT_CONNECT != type); - IOTakeLock( gIOObjectPortLock); + lck_mtx_lock(gIOObjectPortLock); + + machPort = IOMachPort::portForObjectInBucket(bucket, obj, type); - if ((dict = dictForType( type ))) { + if (machPort && !machPort->holdDestroy) { obj->retain(); - machPort = (IOMachPort *) dict->getObject((const OSSymbol *) obj ); - if (machPort && !machPort->holdDestroy) { - dict->removeObject((const OSSymbol *) obj ); - } + SLIST_REMOVE(bucket, machPort, IOMachPort, link); + + lck_mtx_unlock(gIOObjectPortLock); + + machPort->release(); + obj->taggedRelease(OSTypeID(OSCollection)); obj->release(); + } else { + lck_mtx_unlock(gIOObjectPortLock); } - - IOUnlock( gIOObjectPortLock); } void IOMachPort::setHoldDestroy( OSObject * obj, ipc_kobject_type_t type ) { - OSDictionary * dict; IOMachPort * machPort; - IOLockLock( gIOObjectPortLock ); + IOMachPortHashList *bucket = IOMachPort::bucketForObject(obj, type); + lck_mtx_lock(gIOObjectPortLock); - if ((dict = dictForType( type ))) { - machPort = (IOMachPort *) dict->getObject((const OSSymbol *) obj ); - if (machPort) { - machPort->holdDestroy = true; - } + machPort = IOMachPort::portForObjectInBucket(bucket, obj, type); + + if (machPort) { + machPort->holdDestroy = true; } - IOLockUnlock( gIOObjectPortLock ); + lck_mtx_unlock(gIOObjectPortLock); +} + +void +IOMachPortDestroyUserReferences(OSObject * obj, natural_t type) +{ + IOMachPort::releasePortForObject(obj, type); } void IOUserClient::destroyUserReferences( OSObject * obj ) { + IOMachPort *machPort; + IOMachPort::releasePortForObject( obj, IKOT_IOKIT_OBJECT ); // panther, 3160200 // IOMachPort::releasePortForObject( obj, IKOT_IOKIT_CONNECT ); - OSDictionary * dict; - - IOTakeLock( gIOObjectPortLock); obj->retain(); + IOMachPortHashList *bucket = IOMachPort::bucketForObject(obj, IKOT_IOKIT_CONNECT); + IOMachPortHashList *mappingBucket = NULL; - if ((dict = IOMachPort::dictForType( IKOT_IOKIT_CONNECT ))) { - IOMachPort * port; - port = (IOMachPort *) dict->getObject((const OSSymbol *) obj ); - if (port) { - IOUserClient * uc; - if ((uc = OSDynamicCast(IOUserClient, obj))) { - uc->noMoreSenders(); - if (uc->mappings) { - dict->setObject((const OSSymbol *) uc->mappings, port); - iokit_switch_object_port(port->port, uc->mappings, IKOT_IOKIT_CONNECT); + lck_mtx_lock(gIOObjectPortLock); - uc->mappings->release(); - uc->mappings = 0; - } - } - dict->removeObject((const OSSymbol *) obj ); + IOUserClient * uc = OSDynamicCast(IOUserClient, obj); + if (uc && uc->mappings) { + mappingBucket = IOMachPort::bucketForObject(uc->mappings, IKOT_IOKIT_CONNECT); + } + + machPort = IOMachPort::portForObjectInBucket(bucket, obj, IKOT_IOKIT_CONNECT); + + if (machPort == NULL) { + lck_mtx_unlock(gIOObjectPortLock); + goto end; + } + + SLIST_REMOVE(bucket, machPort, IOMachPort, link); + obj->taggedRelease(OSTypeID(OSCollection)); + + if (uc) { + uc->noMoreSenders(); + if (uc->mappings) { + uc->mappings->taggedRetain(OSTypeID(OSCollection)); + machPort->object = uc->mappings; + SLIST_INSERT_HEAD(mappingBucket, machPort, link); + iokit_switch_object_port(machPort->port, uc->mappings, IKOT_IOKIT_CONNECT); + + lck_mtx_unlock(gIOObjectPortLock); + + uc->mappings->release(); + uc->mappings = NULL; + } else { + lck_mtx_unlock(gIOObjectPortLock); + machPort->release(); } + } else { + lck_mtx_unlock(gIOObjectPortLock); + machPort->release(); } + + +end: + obj->release(); - IOUnlock( gIOObjectPortLock); } mach_port_name_t @@ -375,7 +411,7 @@ IOMachPort::free( void ) class IOUserIterator : public OSIterator { - OSDeclareDefaultStructors(IOUserIterator) + OSDeclareDefaultStructors(IOUserIterator); public: OSObject * userIteratorObject; IOLock * lock; @@ -394,7 +430,7 @@ public: class IOUserNotification : public IOUserIterator { - OSDeclareDefaultStructors(IOUserNotification) + OSDeclareDefaultStructors(IOUserNotification); #define holdNotify userIteratorObject @@ -418,13 +454,13 @@ IOUserIterator::withIterator(OSIterator * iter) IOUserIterator * me; if (!iter) { - return 0; + return NULL; } me = new IOUserIterator; if (me && !me->init()) { me->release(); - me = 0; + me = NULL; } if (!me) { return me; @@ -581,20 +617,31 @@ IOUserClient::finalizeUserReferences(OSObject * obj) ipc_port_t iokit_port_for_object( io_object_t obj, ipc_kobject_type_t type ) { - IOMachPort * machPort; - ipc_port_t port; + IOMachPort *machPort = NULL; + ipc_port_t port = NULL; - if ((machPort = IOMachPort::portForObject( obj, type ))) { - port = machPort->port; - if (port) { - iokit_retain_port( port ); - } + IOMachPortHashList *bucket = IOMachPort::bucketForObject(obj, type); - machPort->release(); + lck_mtx_lock(gIOObjectPortLock); + + machPort = IOMachPort::portForObjectInBucket(bucket, obj, type); + + if (__improbable(machPort == NULL)) { + machPort = IOMachPort::withObjectAndType(obj, type); + if (__improbable(machPort == NULL)) { + goto end; + } + SLIST_INSERT_HEAD(bucket, machPort, link); } else { - port = NULL; + machPort->mscount++; } + iokit_retain_port(machPort->port); + port = machPort->port; + +end: + lck_mtx_unlock(gIOObjectPortLock); + return port; } @@ -621,7 +668,7 @@ iokit_client_died( io_object_t obj, ipc_port_t /* port */, if ((map = OSDynamicCast( IOMemoryMap, obj ))) { map->taskDied(); } else if ((notify = OSDynamicCast( IOUserNotification, obj ))) { - notify->setNotification( 0 ); + notify->setNotification( NULL ); } } @@ -633,7 +680,7 @@ iokit_client_died( io_object_t obj, ipc_port_t /* port */, class IOServiceUserNotification : public IOUserNotification { - OSDeclareDefaultStructors(IOServiceUserNotification) + OSDeclareDefaultStructors(IOServiceUserNotification); struct PingMsg { mach_msg_header_t msgHdr; @@ -666,7 +713,7 @@ public: class IOServiceMessageUserNotification : public IOUserNotification { - OSDeclareDefaultStructors(IOServiceMessageUserNotification) + OSDeclareDefaultStructors(IOServiceMessageUserNotification); struct PingMsg { mach_msg_header_t msgHdr; @@ -706,8 +753,8 @@ public: #undef super #define super IOUserIterator -OSDefineMetaClass( IOUserNotification, IOUserIterator ) -OSDefineAbstractStructors( IOUserNotification, IOUserIterator ) +OSDefineMetaClass( IOUserNotification, IOUserIterator ); +OSDefineAbstractStructors( IOUserNotification, IOUserIterator ); /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ @@ -717,7 +764,7 @@ IOUserNotification::free( void ) if (holdNotify) { assert(OSDynamicCast(IONotifier, holdNotify)); ((IONotifier *)holdNotify)->remove(); - holdNotify = 0; + holdNotify = NULL; } // can't be in handler now @@ -912,7 +959,7 @@ IOServiceUserNotification::copyNextObject() result->retain(); newSet->removeObject( count - 1); } else { - result = 0; + result = NULL; armed = true; } @@ -963,7 +1010,7 @@ IOServiceMessageUserNotification::init( mach_port_t port, natural_t type, pingMsg->msgBody.msgh_descriptor_count = 1; - pingMsg->ports[0].name = 0; + pingMsg->ports[0].name = NULL; pingMsg->ports[0].disposition = MACH_MSG_TYPE_MAKE_SEND; pingMsg->ports[0].type = MACH_MSG_PORT_DESCRIPTOR; @@ -1062,7 +1109,7 @@ IOServiceMessageUserNotification::handler( void * ref, } thisMsg = (typeof(thisMsg))allocMsg; } else { - allocMsg = 0; + allocMsg = NULL; thisMsg = (typeof(thisMsg))stackMsg; } @@ -1116,7 +1163,7 @@ IOServiceMessageUserNotification::handler( void * ref, OSObject * IOServiceMessageUserNotification::getNextObject() { - return 0; + return NULL; } OSObject * @@ -1138,6 +1185,7 @@ IOUserClient::initialize( void ) { gIOObjectPortLock = IOLockAlloc(); gIOUserClientOwnersLock = IOLockAlloc(); + gIOUserServerLock = IOLockAlloc(); assert(gIOObjectPortLock && gIOUserClientOwnersLock); } @@ -1178,7 +1226,7 @@ static OSDictionary * CopyConsoleUser(UInt32 uid) { OSArray * array; - OSDictionary * user = 0; + OSDictionary * user = NULL; if ((array = OSDynamicCast(OSArray, IORegistryEntry::getRegistryRoot()->copyProperty(gIOConsoleUsersKey)))) { @@ -1202,7 +1250,7 @@ static OSDictionary * CopyUserOnConsole(void) { OSArray * array; - OSDictionary * user = 0; + OSDictionary * user = NULL; if ((array = OSDynamicCast(OSArray, IORegistryEntry::getRegistryRoot()->copyProperty(gIOConsoleUsersKey)))) { @@ -1335,29 +1383,31 @@ IOUserClient::clientHasPrivilege( void * securityToken, return kr; } -OSObject * -IOUserClient::copyClientEntitlement( task_t task, - const char * entitlement ) +OSDictionary * +IOUserClient::copyClientEntitlements(task_t task) { #define MAX_ENTITLEMENTS_LEN (128 * 1024) proc_t p = NULL; pid_t pid = 0; - char procname[MAXCOMLEN + 1] = ""; size_t len = 0; void *entitlements_blob = NULL; char *entitlements_data = NULL; OSObject *entitlements_obj = NULL; OSDictionary *entitlements = NULL; OSString *errorString = NULL; - OSObject *value = NULL; p = (proc_t)get_bsdtask_info(task); if (p == NULL) { goto fail; } pid = proc_pid(p); - proc_name(pid, procname, (int)sizeof(procname)); + + if (cs_entitlements_dictionary_copy(p, (void **)&entitlements) == 0) { + if (entitlements) { + return entitlements; + } + } if (cs_entitlements_blob_get(p, &entitlements_blob, &len) != 0) { goto fail; @@ -1373,7 +1423,8 @@ IOUserClient::copyClientEntitlement( task_t task, */ len -= offsetof(CS_GenericBlob, data); if (len > MAX_ENTITLEMENTS_LEN) { - IOLog("failed to parse entitlements for %s[%u]: %lu bytes of entitlements exceeds maximum of %u\n", procname, pid, len, MAX_ENTITLEMENTS_LEN); + IOLog("failed to parse entitlements for %s[%u]: %lu bytes of entitlements exceeds maximum of %u\n", + proc_best_name(p), pid, len, MAX_ENTITLEMENTS_LEN); goto fail; } @@ -1391,7 +1442,8 @@ IOUserClient::copyClientEntitlement( task_t task, entitlements_obj = OSUnserializeXML(entitlements_data, len + 1, &errorString); if (errorString != NULL) { - IOLog("failed to parse entitlements for %s[%u]: %s\n", procname, pid, errorString->getCStringNoCopy()); + IOLog("failed to parse entitlements for %s[%u]: %s\n", + proc_best_name(p), pid, errorString->getCStringNoCopy()); goto fail; } if (entitlements_obj == NULL) { @@ -1402,12 +1454,7 @@ IOUserClient::copyClientEntitlement( task_t task, if (entitlements == NULL) { goto fail; } - - /* Fetch the entitlement value from the dictionary. */ - value = entitlements->getObject(entitlement); - if (value != NULL) { - value->retain(); - } + entitlements_obj = NULL; fail: if (entitlements_data != NULL) { @@ -1419,6 +1466,28 @@ fail: if (errorString != NULL) { errorString->release(); } + return entitlements; +} + +OSObject * +IOUserClient::copyClientEntitlement( task_t task, + const char * entitlement ) +{ + OSDictionary *entitlements; + OSObject *value; + + entitlements = copyClientEntitlements(task); + if (entitlements == NULL) { + return NULL; + } + + /* Fetch the entitlement value from the dictionary. */ + value = entitlements->getObject(entitlement); + if (value != NULL) { + value->retain(); + } + + entitlements->release(); return value; } @@ -1523,6 +1592,9 @@ IOUserClient::registerOwner(task_t task) owner->uc = this; queue_enter_first(&owners, owner, IOUserClientOwner *, ucLink); queue_enter_first(task_io_user_clients(task), owner, IOUserClientOwner *, taskLink); + if (messageAppSuspended) { + task_set_message_app_suspended(task, true); + } } } @@ -1535,13 +1607,25 @@ void IOUserClient::noMoreSenders(void) { IOUserClientOwner * owner; + IOUserClientOwner * iter; + queue_head_t * taskque; + bool hasMessageAppSuspended; IOLockLock(gIOUserClientOwnersLock); if (owners.next) { while (!queue_empty(&owners)) { owner = (IOUserClientOwner *)(void *) queue_first(&owners); - queue_remove(task_io_user_clients(owner->task), owner, IOUserClientOwner *, taskLink); + taskque = task_io_user_clients(owner->task); + queue_remove(taskque, owner, IOUserClientOwner *, taskLink); + hasMessageAppSuspended = false; + queue_iterate(taskque, iter, IOUserClientOwner *, taskLink) { + hasMessageAppSuspended = iter->uc->messageAppSuspended; + if (hasMessageAppSuspended) { + break; + } + } + task_set_message_app_suspended(owner->task, hasMessageAppSuspended); queue_remove(&owners, owner, IOUserClientOwner *, ucLink); IODelete(owner, IOUserClientOwner, 1); } @@ -1551,6 +1635,55 @@ IOUserClient::noMoreSenders(void) IOLockUnlock(gIOUserClientOwnersLock); } + +extern "C" void +iokit_task_app_suspended_changed(task_t task) +{ + queue_head_t * taskque; + IOUserClientOwner * owner; + OSSet * set; + + IOLockLock(gIOUserClientOwnersLock); + + taskque = task_io_user_clients(task); + set = NULL; + queue_iterate(taskque, owner, IOUserClientOwner *, taskLink) { + if (!owner->uc->messageAppSuspended) { + continue; + } + if (!set) { + set = OSSet::withCapacity(4); + if (!set) { + break; + } + } + set->setObject(owner->uc); + } + + IOLockUnlock(gIOUserClientOwnersLock); + + if (set) { + set->iterateObjects(^bool (OSObject * obj) { + IOUserClient * uc; + + uc = (typeof(uc))obj; +#if 0 + { + OSString * str; + str = IOCopyLogNameForPID(task_pid(task)); + IOLog("iokit_task_app_suspended_changed(%s) %s %d\n", str ? str->getCStringNoCopy() : "", + uc->getName(), task_is_app_suspended(task)); + OSSafeReleaseNULL(str); + } +#endif + uc->message(kIOMessageTaskAppSuspendedChange, NULL); + + return false; + }); + set->release(); + } +} + extern "C" kern_return_t iokit_task_terminate(task_t task) { @@ -1638,7 +1771,7 @@ IOUserClient::clientClose( void ) IOService * IOUserClient::getService( void ) { - return 0; + return NULL; } IOReturn @@ -1701,8 +1834,8 @@ IOUserClient::mapClientMemory64( { IOReturn err; IOOptionBits options = 0; - IOMemoryDescriptor * memory = 0; - IOMemoryMap * map = 0; + IOMemoryDescriptor * memory = NULL; + IOMemoryMap * map = NULL; err = clientMemoryForType((UInt32) type, &options, &memory ); @@ -1772,13 +1905,13 @@ IOUserClient::adjustPortNameReferencesInTask(task_t task, mach_port_name_t port_ IOExternalMethod * IOUserClient::getExternalMethodForIndex( UInt32 /* index */) { - return 0; + return NULL; } IOExternalAsyncMethod * IOUserClient::getExternalAsyncMethodForIndex( UInt32 /* index */) { - return 0; + return NULL; } IOExternalTrap * @@ -1930,7 +2063,7 @@ IOUserClient::_sendAsyncResult64(OSAsyncReference64 reference, replyMsg.msgHdr.msgh_bits = MACH_MSGH_BITS(MACH_MSG_TYPE_COPY_SEND /*remote*/, 0 /*local*/); replyMsg.msgHdr.msgh_remote_port = replyPort; - replyMsg.msgHdr.msgh_local_port = 0; + replyMsg.msgHdr.msgh_local_port = NULL; replyMsg.msgHdr.msgh_id = kOSNotificationMessageID; if (kIOUCAsync64Flag & reference[0]) { replyMsg.msgHdr.msgh_size = @@ -2077,7 +2210,7 @@ is_io_object_get_superclass( } ret = kIOReturnNotFound; - meta = 0; + meta = NULL; do{ name = OSSymbol::withCString(obj_name); if (!name) { @@ -2128,7 +2261,7 @@ is_io_object_get_bundle_identifier( } ret = kIOReturnNotFound; - meta = 0; + meta = NULL; do{ name = OSSymbol::withCString(obj_name); if (!name) { @@ -2169,7 +2302,7 @@ is_io_object_conforms_to( return kIOReturnBadArgument; } - *conforms = (0 != object->metaCast( className )); + *conforms = (NULL != object->metaCast( className )); return kIOReturnSuccess; } @@ -2385,7 +2518,7 @@ is_io_service_get_matching_services_ool( if (KERN_SUCCESS == kr) { // must return success after vm_map_copyout() succeeds // and mig will copy out objects on success - *existing = 0; + *existing = NULL; *result = internal_io_service_get_matching_services(master_port, (const char *) data, matchingCnt, existing); vm_deallocate( kernel_map, data, matchingCnt ); @@ -2467,7 +2600,7 @@ is_io_service_get_matching_service_ool( if (KERN_SUCCESS == kr) { // must return success after vm_map_copyout() succeeds // and mig will copy out objects on success - *service = 0; + *service = NULL; *result = internal_io_service_get_matching_service(master_port, (const char *) data, matchingCnt, service ); vm_deallocate( kernel_map, data, matchingCnt ); @@ -2499,8 +2632,8 @@ internal_io_service_add_notification( bool client64, io_object_t * notification ) { - IOServiceUserNotification * userNotify = 0; - IONotifier * notify = 0; + IOServiceUserNotification * userNotify = NULL; + IONotifier * notify = NULL; const OSSymbol * sym; OSDictionary * dict; IOReturn err; @@ -2546,7 +2679,7 @@ internal_io_service_add_notification( if (userNotify && !userNotify->init( port, userMsgType, reference, referenceSize, client64)) { userNotify->release(); - userNotify = 0; + userNotify = NULL; } if (!userNotify) { continue; @@ -2566,7 +2699,7 @@ internal_io_service_add_notification( if ((kIOReturnSuccess != err) && userNotify) { userNotify->invalidatePort(); userNotify->release(); - userNotify = 0; + userNotify = NULL; } if (sym) { @@ -2683,7 +2816,7 @@ internal_io_service_add_notification_ool( if (KERN_SUCCESS == kr) { // must return success after vm_map_copyout() succeeds // and mig will copy out objects on success - *notification = 0; + *notification = NULL; *result = internal_io_service_add_notification( master_port, notification_type, (char *) data, matchingCnt, wake_port, reference, referenceSize, client64, notification ); vm_deallocate( kernel_map, data, matchingCnt ); @@ -2770,8 +2903,8 @@ internal_io_service_add_interest_notification( bool client64, io_object_t * notification ) { - IOServiceMessageUserNotification * userNotify = 0; - IONotifier * notify = 0; + IOServiceMessageUserNotification * userNotify = NULL; + IONotifier * notify = NULL; const OSSymbol * sym; IOReturn err; @@ -2787,7 +2920,7 @@ internal_io_service_add_interest_notification( kIOUserNotifyMaxMessageSize, client64 )) { userNotify->release(); - userNotify = 0; + userNotify = NULL; } if (!userNotify) { continue; @@ -2810,7 +2943,7 @@ internal_io_service_add_interest_notification( if ((kIOReturnSuccess != err) && userNotify) { userNotify->invalidatePort(); userNotify->release(); - userNotify = 0; + userNotify = NULL; } return err; @@ -3017,7 +3150,7 @@ is_io_registry_entry_from_path_ool( } map_data = 0; - entry = 0; + entry = NULL; res = err = KERN_SUCCESS; if (path[0]) { cpath = path; @@ -3158,7 +3291,7 @@ is_io_registry_entry_get_name_in_plane( if (planeName[0]) { plane = IORegistryEntry::getPlane( planeName ); } else { - plane = 0; + plane = NULL; } strncpy( name, entry->getName( plane), sizeof(io_name_t)); @@ -3179,7 +3312,7 @@ is_io_registry_entry_get_location_in_plane( if (planeName[0]) { plane = IORegistryEntry::getPlane( planeName ); } else { - plane = 0; + plane = NULL; } const char * cstr = entry->getLocation( plane ); @@ -3220,7 +3353,7 @@ is_io_registry_entry_get_property_bytes( OSNumber * off; UInt64 offsetBytes; unsigned int len = 0; - const void * bytes = 0; + const void * bytes = NULL; IOReturn ret = kIOReturnSuccess; CHECK( IORegistryEntry, registry_entry, entry ); @@ -3404,7 +3537,7 @@ GetPropertiesEditor(void * reference, } if (ref->root == container) { if (0 != mac_iokit_check_get_property(ref->cred, ref->entry, name->getCStringNoCopy())) { - value = 0; + value = NULL; } } if (value) { @@ -3425,8 +3558,8 @@ is_io_registry_entry_get_properties_bin( kern_return_t err = kIOReturnSuccess; vm_size_t len; OSSerialize * s; - OSSerialize::Editor editor = 0; - void * editRef = 0; + OSSerialize::Editor editor = NULL; + void * editRef = NULL; CHECK(IORegistryEntry, registry_entry, entry); @@ -3437,7 +3570,7 @@ is_io_registry_entry_get_properties_bin( editRef = &ref; ref.cred = kauth_cred_get(); ref.entry = entry; - ref.root = 0; + ref.root = NULL; } #endif @@ -3717,10 +3850,10 @@ is_io_service_open_extended( kern_return_t * result, io_object_t *connection ) { - IOUserClient * client = 0; + IOUserClient * client = NULL; kern_return_t err = KERN_SUCCESS; IOReturn res = kIOReturnSuccess; - OSDictionary * propertiesDict = 0; + OSDictionary * propertiesDict = NULL; bool crossEndian; bool disallowAccess; @@ -3792,7 +3925,8 @@ is_io_service_open_extended( if (res == kIOReturnSuccess) { assert( OSDynamicCast(IOUserClient, client)); - client->sharedInstance = (0 != client->getProperty(kIOUserClientSharedInstanceKey)); + client->sharedInstance = (NULL != client->getProperty(kIOUserClientSharedInstanceKey)); + client->messageAppSuspended = (NULL != client->getProperty(kIOUserClientMessageAppSuspendedKey)); client->closed = false; client->lock = IOLockAlloc(); @@ -3816,7 +3950,7 @@ is_io_service_open_extended( IOStatisticsClientCall(); client->clientClose(); client->release(); - client = 0; + client = NULL; break; } OSString * creatorName = IOCopyLogNameForPID(proc_selfpid()); @@ -3959,7 +4093,7 @@ is_io_connect_map_memory_into_task } else { // keep it with the user client IOLockLock( gIOObjectPortLock); - if (0 == client->mappings) { + if (NULL == client->mappings) { client->mappings = OSSet::withCapacity(2); } if (client->mappings) { @@ -4006,7 +4140,7 @@ IOMemoryMap * IOUserClient::removeMappingForDescriptor(IOMemoryDescriptor * mem) { OSIterator * iter; - IOMemoryMap * map = 0; + IOMemoryMap * map = NULL; IOLockLock(gIOObjectPortLock); @@ -4039,7 +4173,7 @@ is_io_connect_unmap_memory_from_task { IOReturn err; IOOptionBits options = 0; - IOMemoryDescriptor * memory = 0; + IOMemoryDescriptor * memory = NULL; IOMemoryMap * map; CHECK( IOUserClient, connection, client ); @@ -4155,8 +4289,8 @@ is_io_connect_method_var_output IOExternalMethodArguments args; IOReturn ret; - IOMemoryDescriptor * inputMD = 0; - OSObject * structureVariableOutputData = 0; + IOMemoryDescriptor * inputMD = NULL; + OSObject * structureVariableOutputData = NULL; bzero(&args.__reserved[0], sizeof(args.__reserved)); args.__reservedA = 0; @@ -4165,7 +4299,7 @@ is_io_connect_method_var_output args.selector = selector; args.asyncWakePort = MACH_PORT_NULL; - args.asyncReference = 0; + args.asyncReference = NULL; args.asyncReferenceCount = 0; args.structureVariableOutputData = &structureVariableOutputData; @@ -4252,8 +4386,8 @@ is_io_connect_method IOExternalMethodArguments args; IOReturn ret; - IOMemoryDescriptor * inputMD = 0; - IOMemoryDescriptor * outputMD = 0; + IOMemoryDescriptor * inputMD = NULL; + IOMemoryDescriptor * outputMD = NULL; bzero(&args.__reserved[0], sizeof(args.__reserved)); args.__reservedA = 0; @@ -4262,9 +4396,9 @@ is_io_connect_method args.selector = selector; args.asyncWakePort = MACH_PORT_NULL; - args.asyncReference = 0; + args.asyncReference = NULL; args.asyncReferenceCount = 0; - args.structureVariableOutputData = 0; + args.structureVariableOutputData = NULL; args.scalarInput = scalar_input; args.scalarInputCount = scalar_inputCnt; @@ -4344,8 +4478,8 @@ is_io_connect_async_method IOExternalMethodArguments args; IOReturn ret; - IOMemoryDescriptor * inputMD = 0; - IOMemoryDescriptor * outputMD = 0; + IOMemoryDescriptor * inputMD = NULL; + IOMemoryDescriptor * outputMD = NULL; bzero(&args.__reserved[0], sizeof(args.__reserved)); args.__reservedA = 0; @@ -4362,7 +4496,7 @@ is_io_connect_async_method args.asyncReference = reference; args.asyncReferenceCount = referenceCnt; - args.structureVariableOutputData = 0; + args.structureVariableOutputData = NULL; args.scalarInput = scalar_input; args.scalarInputCount = scalar_inputCnt; @@ -4401,6 +4535,7 @@ is_io_connect_async_method IOStatisticsClientCall(); ret = client->externalMethod( selector, &args ); + *scalar_outputCnt = args.scalarOutputCount; *inband_outputCnt = args.structureOutputSize; *ool_output_size = args.structureOutputDescriptorSize; @@ -4547,10 +4682,14 @@ is_io_async_method_scalarI_scalarO( io_scalar_inband64_t _output; io_async_ref64_t _reference; + if (referenceCnt > ASYNC_REF64_COUNT) { + return kIOReturnBadArgument; + } bzero(&_output[0], sizeof(_output)); for (i = 0; i < referenceCnt; i++) { _reference[i] = REF64(reference[i]); } + bzero(&_reference[referenceCnt], (ASYNC_REF64_COUNT - referenceCnt) * sizeof(_reference[0])); mach_msg_type_number_t struct_outputCnt = 0; mach_vm_size_t ool_output_size = 0; @@ -4592,9 +4731,13 @@ is_io_async_method_scalarI_structureO( io_scalar_inband64_t _input; io_async_ref64_t _reference; + if (referenceCnt > ASYNC_REF64_COUNT) { + return kIOReturnBadArgument; + } for (i = 0; i < referenceCnt; i++) { _reference[i] = REF64(reference[i]); } + bzero(&_reference[referenceCnt], (ASYNC_REF64_COUNT - referenceCnt) * sizeof(_reference[0])); mach_msg_type_number_t scalar_outputCnt = 0; mach_vm_size_t ool_output_size = 0; @@ -4631,9 +4774,13 @@ is_io_async_method_scalarI_structureI( io_scalar_inband64_t _input; io_async_ref64_t _reference; + if (referenceCnt > ASYNC_REF64_COUNT) { + return kIOReturnBadArgument; + } for (i = 0; i < referenceCnt; i++) { _reference[i] = REF64(reference[i]); } + bzero(&_reference[referenceCnt], (ASYNC_REF64_COUNT - referenceCnt) * sizeof(_reference[0])); mach_msg_type_number_t scalar_outputCnt = 0; mach_msg_type_number_t inband_outputCnt = 0; @@ -4672,9 +4819,13 @@ is_io_async_method_structureI_structureO( mach_vm_size_t ool_output_size = 0; io_async_ref64_t _reference; + if (referenceCnt > ASYNC_REF64_COUNT) { + return kIOReturnBadArgument; + } for (i = 0; i < referenceCnt; i++) { _reference[i] = REF64(reference[i]); } + bzero(&_reference[referenceCnt], (ASYNC_REF64_COUNT - referenceCnt) * sizeof(_reference[0])); return is_io_connect_async_method(connect, wake_port, _reference, referenceCnt, @@ -4853,18 +5004,18 @@ shim_io_connect_method_scalarI_structureO( break; case 3: err = (object->*func)( ARG32(input[0]), ARG32(input[1]), ARG32(input[2]), - output, (void *)outputCount, 0 ); + output, (void *)outputCount, NULL ); break; case 2: err = (object->*func)( ARG32(input[0]), ARG32(input[1]), - output, (void *)outputCount, 0, 0 ); + output, (void *)outputCount, NULL, NULL ); break; case 1: err = (object->*func)( ARG32(input[0]), - output, (void *)outputCount, 0, 0, 0 ); + output, (void *)outputCount, NULL, NULL, NULL ); break; case 0: - err = (object->*func)( output, (void *)outputCount, 0, 0, 0, 0 ); + err = (object->*func)( output, (void *)outputCount, NULL, NULL, NULL, NULL ); break; default: @@ -4929,21 +5080,21 @@ shim_io_async_method_scalarI_structureO( case 3: err = (object->*func)( reference, ARG32(input[0]), ARG32(input[1]), ARG32(input[2]), - output, (void *)outputCount, 0 ); + output, (void *)outputCount, NULL ); break; case 2: err = (object->*func)( reference, ARG32(input[0]), ARG32(input[1]), - output, (void *)outputCount, 0, 0 ); + output, (void *)outputCount, NULL, NULL ); break; case 1: err = (object->*func)( reference, ARG32(input[0]), - output, (void *)outputCount, 0, 0, 0 ); + output, (void *)outputCount, NULL, NULL, NULL ); break; case 0: err = (object->*func)( reference, - output, (void *)outputCount, 0, 0, 0, 0 ); + output, (void *)outputCount, NULL, NULL, NULL, NULL ); break; default: @@ -5025,21 +5176,21 @@ shim_io_connect_method_scalarI_structureI( case 3: err = (object->*func)( ARG32(input[0]), ARG32(input[1]), ARG32(input[2]), inputStruct, (void *)(uintptr_t)inputStructCount, - 0 ); + NULL ); break; case 2: err = (object->*func)( ARG32(input[0]), ARG32(input[1]), inputStruct, (void *)(uintptr_t)inputStructCount, - 0, 0 ); + NULL, NULL ); break; case 1: err = (object->*func)( ARG32(input[0]), inputStruct, (void *)(uintptr_t)inputStructCount, - 0, 0, 0 ); + NULL, NULL, NULL ); break; case 0: err = (object->*func)( inputStruct, (void *)(uintptr_t)inputStructCount, - 0, 0, 0, 0 ); + NULL, NULL, NULL, NULL ); break; default: @@ -5103,24 +5254,24 @@ shim_io_async_method_scalarI_structureI( err = (object->*func)( reference, ARG32(input[0]), ARG32(input[1]), ARG32(input[2]), inputStruct, (void *)(uintptr_t)inputStructCount, - 0 ); + NULL ); break; case 2: err = (object->*func)( reference, ARG32(input[0]), ARG32(input[1]), inputStruct, (void *)(uintptr_t)inputStructCount, - 0, 0 ); + NULL, NULL ); break; case 1: err = (object->*func)( reference, ARG32(input[0]), inputStruct, (void *)(uintptr_t)inputStructCount, - 0, 0, 0 ); + NULL, NULL, NULL ); break; case 0: err = (object->*func)( reference, inputStruct, (void *)(uintptr_t)inputStructCount, - 0, 0, 0, 0 ); + NULL, NULL, NULL, NULL ); break; default: @@ -5184,12 +5335,12 @@ shim_io_connect_method_structureI_structureO( if (method->count1) { if (method->count0) { err = (object->*func)( input, output, - (void *)(uintptr_t)inputCount, outputCount, 0, 0 ); + (void *)(uintptr_t)inputCount, outputCount, NULL, NULL ); } else { - err = (object->*func)( output, outputCount, 0, 0, 0, 0 ); + err = (object->*func)( output, outputCount, NULL, NULL, NULL, NULL ); } } else { - err = (object->*func)( input, (void *)(uintptr_t)inputCount, 0, 0, 0, 0 ); + err = (object->*func)( input, (void *)(uintptr_t)inputCount, NULL, NULL, NULL, NULL ); } }while (false); @@ -5239,14 +5390,14 @@ shim_io_async_method_structureI_structureO( if (method->count0) { err = (object->*func)( reference, input, output, - (void *)(uintptr_t)inputCount, outputCount, 0, 0 ); + (void *)(uintptr_t)inputCount, outputCount, NULL, NULL ); } else { err = (object->*func)( reference, - output, outputCount, 0, 0, 0, 0 ); + output, outputCount, NULL, NULL, NULL, NULL ); } } else { err = (object->*func)( reference, - input, (void *)(uintptr_t)inputCount, 0, 0, 0, 0 ); + input, (void *)(uintptr_t)inputCount, NULL, NULL, NULL, NULL ); } }while (false); @@ -5269,7 +5420,7 @@ is_io_catalog_send_data( #if NO_KEXTD return kIOReturnNotPrivileged; #else /* NO_KEXTD */ - OSObject * obj = 0; + OSObject * obj = NULL; vm_offset_t data; kern_return_t kr = kIOReturnError; @@ -5279,14 +5430,14 @@ is_io_catalog_send_data( return kIOReturnNotPrivileged; } - if ((flag != kIOCatalogRemoveKernelLinker && + if ((flag != kIOCatalogRemoveKernelLinker__Removed && flag != kIOCatalogKextdActive && flag != kIOCatalogKextdFinishedLaunching) && (!inData || !inDataCount)) { return kIOReturnBadArgument; } - if (!IOTaskHasEntitlement(current_task(), "com.apple.rootless.kext-secure-management")) { + if (!IOTaskHasEntitlement(current_task(), kOSKextManagementEntitlement)) { OSString * taskName = IOCopyLogNameForPID(proc_selfpid()); IOLog("IOCatalogueSendData(%s): Not entitled\n", taskName ? taskName->getCStringNoCopy() : ""); OSSafeReleaseNULL(taskName); @@ -5370,21 +5521,8 @@ is_io_catalog_send_data( } break; - case kIOCatalogStartMatching: { - OSDictionary * dict; - - dict = OSDynamicCast(OSDictionary, obj); - if (dict) { - if (!gIOCatalogue->startMatching( dict )) { - kr = kIOReturnError; - } - } else { - kr = kIOReturnBadArgument; - } - } - break; - - case kIOCatalogRemoveKernelLinker: + case kIOCatalogStartMatching__Removed: + case kIOCatalogRemoveKernelLinker__Removed: kr = KERN_NOT_SUPPORTED; break; @@ -5404,12 +5542,8 @@ is_io_catalog_send_data( case kIOCatalogKextdFinishedLaunching: { #if !NO_KEXTD if (!gIOKextdClearedBusy) { - IOService * serviceRoot = IOService::getServiceRoot(); - if (serviceRoot) { - IOServiceTrace(IOSERVICE_KEXTD_READY, 0, 0, 0, 0); - serviceRoot->adjustBusy(-1); - gIOKextdClearedBusy = true; - } + IOService::kextdLaunched(); + gIOKextdClearedBusy = true; } #endif kr = kIOReturnSuccess; @@ -5604,11 +5738,19 @@ is_io_catalog_reset( kern_return_t iokit_user_client_trap(struct iokit_user_client_trap_args *args) { - kern_return_t result = kIOReturnBadArgument; - IOUserClient *userClient; + kern_return_t result = kIOReturnBadArgument; + IOUserClient * userClient; + OSObject * object; + uintptr_t ref; - if ((userClient = OSDynamicCast(IOUserClient, - iokit_lookup_connect_ref_current_task((mach_port_name_t)(uintptr_t)args->userClientRef)))) { + ref = (uintptr_t) args->userClientRef; + if ((1ULL << 32) & ref) { + object = iokit_lookup_uext_ref_current_task((mach_port_name_t) ref); + if (object) { + result = IOUserServerUEXTTrap(object, args->p1, args->p2, args->p3, args->p4, args->p5, args->p6); + } + OSSafeReleaseNULL(object); + } else if ((userClient = OSDynamicCast(IOUserClient, iokit_lookup_connect_ref_current_task((mach_port_name_t) ref)))) { IOExternalTrap *trap; IOService *target = NULL; @@ -5704,7 +5846,7 @@ IOUserClient::externalMethod( uint32_t selector, IOExternalMethodArguments * arg if (args->asyncWakePort) { IOExternalAsyncMethod * method; - object = 0; + object = NULL; if (!(method = getAsyncTargetAndMethodForIndex(&object, selector)) || !object) { return kIOReturnUnsupported; } @@ -5751,7 +5893,7 @@ IOUserClient::externalMethod( uint32_t selector, IOExternalMethodArguments * arg } } else { IOExternalMethod * method; - object = 0; + object = NULL; if (!(method = getTargetAndMethodForIndex(&object, selector)) || !object) { return kIOReturnUnsupported; } diff --git a/iokit/Kernel/IOUserServer.cpp b/iokit/Kernel/IOUserServer.cpp new file mode 100644 index 000000000..4ad8eb5bd --- /dev/null +++ b/iokit/Kernel/IOUserServer.cpp @@ -0,0 +1,3462 @@ +/* + * Copyright (c) 1998-2014 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "IOKitKernelInternal.h" + +/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ + +#include + +/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ + +SInt64 gIODKDebug = kIODKEnable; + +/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ + +struct IOPStrings; + +class OSUserMetaClass : public OSObject +{ + OSDeclareDefaultStructors(OSUserMetaClass); +public: + const OSSymbol * name; + const OSMetaClass * meta; + OSUserMetaClass * superMeta; + + queue_chain_t link; + + OSClassDescription * description; + IOPStrings * queueNames; + uint32_t methodCount; + uint64_t * methods; + + virtual void free() override; + virtual kern_return_t Dispatch(const IORPC rpc) APPLE_KEXT_OVERRIDE; +}; +OSDefineMetaClassAndStructors(OSUserMetaClass, OSObject); + +/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ + +class IOUserService : public IOService +{ + friend class IOService; + + OSDeclareDefaultStructors(IOUserService) + + virtual bool + start(IOService * provider) APPLE_KEXT_OVERRIDE; + virtual IOReturn + setProperties(OSObject * props) APPLE_KEXT_OVERRIDE; +}; + +OSDefineMetaClassAndStructors(IOUserService, IOService) + +/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ + +class IOUserUserClient : public IOUserClient +{ + OSDeclareDefaultStructors(IOUserUserClient); +public: + task_t fTask; + + IOReturn setTask(task_t task); + virtual void stop(IOService * provider) APPLE_KEXT_OVERRIDE; + virtual IOReturn clientClose(void) APPLE_KEXT_OVERRIDE; + virtual IOReturn setProperties(OSObject * properties) APPLE_KEXT_OVERRIDE; + virtual IOReturn externalMethod(uint32_t selector, IOExternalMethodArguments * args, + IOExternalMethodDispatch * dispatch, OSObject * target, void * reference) APPLE_KEXT_OVERRIDE; + virtual IOReturn clientMemoryForType(UInt32 type, + IOOptionBits * options, + IOMemoryDescriptor ** memory) APPLE_KEXT_OVERRIDE; +}; + + +/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ + + +bool +IOUserService::start(IOService * provider) +{ + bool ok = true; + IOReturn ret; + + ret = Start(provider); + if (kIOReturnSuccess != ret) { + return false; + } + + return ok; +} + +IOReturn +IOUserService::setProperties(OSObject * properties) +{ + setProperty("USER", properties); + return kIOReturnSuccess; +} + +/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ + +#undef super + +/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ + +struct IODispatchQueue_IVars { + IOUserServer * userServer; + IODispatchQueue * queue; + queue_chain_t link; + uint64_t tid; + + mach_port_t serverPort; +}; + +struct OSAction_IVars { + OSObject * target; + uint64_t targetmsgid; + uint64_t msgid; + OSActionAbortedHandler abortedHandler; + size_t referenceSize; + void * reference[0]; +}; + +/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ + +kern_return_t +IMPL(IOService, GetRegistryEntryID) +{ + IOReturn ret = kIOReturnSuccess; + + *registryEntryID = getRegistryEntryID(); + + return ret; +} + +kern_return_t +IMPL(IOService, SetName) +{ + IOReturn ret = kIOReturnSuccess; + + setName(name); + + return ret; +} + +kern_return_t +IMPL(IOService, Start) +{ + IOReturn ret = kIOReturnSuccess; + return ret; +} + +kern_return_t +IMPL(IOService, RegisterService) +{ + IOReturn ret = kIOReturnSuccess; + + registerService(); + + return ret; +} + +kern_return_t +IMPL(IOService, CopyDispatchQueue) +{ + IODispatchQueue * result; + IOService * service; + IOReturn ret; + uint32_t index; + + ret = kIOReturnNotFound; + index = -1U; + if (!strcmp("Default", name)) { + index = 0; + } else if (reserved->uvars->userMeta + && reserved->uvars->userMeta->queueNames) { + index = reserved->uvars->userServer->stringArrayIndex(reserved->uvars->userMeta->queueNames, name); + if (index != -1U) { + index++; + } + } + if (index == -1U) { + if ((service = getProvider())) { + ret = service->CopyDispatchQueue(name, queue); + } + } else { + result = reserved->uvars->queueArray[index]; + if (result) { + result->retain(); + *queue = result; + ret = kIOReturnSuccess; + } + } + + return ret; +} + +kern_return_t +IMPL(IOService, SetDispatchQueue) +{ + IOReturn ret = kIOReturnSuccess; + uint32_t index; + + if (kIODKLogSetup & gIODKDebug) { + DKLOG(DKS "::SetDispatchQueue(%s)\n", DKN(this), name); + } + queue->ivars->userServer = reserved->uvars->userServer; + index = -1U; + if (!strcmp("Default", name)) { + index = 0; + } else if (reserved->uvars->userMeta + && reserved->uvars->userMeta->queueNames) { + index = reserved->uvars->userServer->stringArrayIndex(reserved->uvars->userMeta->queueNames, name); + if (index != -1U) { + index++; + } + } + if (index == -1U) { + ret = kIOReturnBadArgument; + } else { + reserved->uvars->queueArray[index] = queue; + queue->retain(); + } + + return ret; +} + +kern_return_t +IMPL(IOService, SetProperties) +{ + IOReturn ret = kIOReturnUnsupported; + + ret = setProperties(properties); + + return ret; +} + +kern_return_t +IMPL(IOService, CopyProperties) +{ + IOReturn ret = kIOReturnSuccess; + *properties = dictionaryWithProperties(); + return ret; +} + +/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ + +kern_return_t +IMPL(IOMemoryDescriptor, _CopyState) +{ + IOReturn ret; + + state->length = _length; + state->options = _flags; + + ret = kIOReturnSuccess; + + return ret; +} + +kern_return_t +IOMemoryDescriptor::GetLength(uint64_t * returnLength) +{ + *returnLength = getLength(); + + return kIOReturnSuccess; +} + +kern_return_t +IMPL(IOMemoryDescriptor, CreateMapping) +{ + IOReturn ret; + IOMemoryMap * resultMap; + IOOptionBits koptions; + mach_vm_address_t atAddress; + + ret = kIOReturnSuccess; + koptions = 0; + resultMap = NULL; + + if (kIOMemoryMapFixedAddress & options) { + atAddress = address; + koptions = 0; + } else { + atAddress = 0; + koptions |= kIOMapAnywhere; + } + + if (kIOMemoryMapReadOnly & options || (kIODirectionOut == getDirection())) { + if (!reserved || (current_task() != reserved->creator)) { + koptions |= kIOMapReadOnly; + } + } + + switch (0xFF00 & options) { + case kIOMemoryMapCacheModeDefault: + koptions |= kIOMapDefaultCache; + break; + case kIOMemoryMapCacheModeInhibit: + koptions |= kIOMapInhibitCache; + break; + case kIOMemoryMapCacheModeCopyback: + koptions |= kIOMapCopybackCache; + break; + case kIOMemoryMapCacheModeWriteThrough: + koptions |= kIOMapWriteThruCache; + break; + default: + ret = kIOReturnBadArgument; + } + + if (kIOReturnSuccess == ret) { + resultMap = createMappingInTask(current_task(), atAddress, koptions, offset, length); + if (!resultMap) { + ret = kIOReturnError; + } + } + + *map = resultMap; + + return ret; +} + +kern_return_t +IMPL(IOMemoryDescriptor, PrepareForDMA) +{ + IOReturn ret; + uint32_t idx, count; + uint64_t sumLength; + uint64_t lflags; + + if (!device) { + return kIOReturnBadArgument; + } + + count = *segmentsCount; + sumLength = 0; + for (idx = 0; idx < count; idx++) { +#ifdef __LP64__ + segments[idx].address = getPhysicalSegment(offset, &segments[idx].length); +#else + segments[idx].address = 0; +#endif + if (!segments[idx].address) { + break; + } + sumLength += segments[idx].length; + offset += segments[idx].length; + } + *returnLength = sumLength; + *segmentsCount = idx; + + // !!translate flags + lflags = 0; + if (kIODirectionOut & _flags) { + lflags |= kIOMemoryDirectionOut; + } + if (kIODirectionIn & _flags) { + lflags |= kIOMemoryDirectionIn; + } + + *flags = lflags; + ret = kIOReturnSuccess; + + return ret; +} + +/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ + +kern_return_t +IMPL(IOMemoryMap, _CopyState) +{ + IOReturn ret; + + state->offset = fOffset; + state->length = getLength(); + state->address = getAddress(); + state->options = getMapOptions(); + + ret = kIOReturnSuccess; + + return ret; +} + +/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ + +kern_return_t +IMPL(IOBufferMemoryDescriptor, Create) +{ + IOReturn ret; + IOBufferMemoryDescriptor * bmd; + IOMemoryDescriptorReserved * reserved; + + if (options & ~((uint64_t) kIOMemoryDirectionOutIn)) { + // no other options currently defined + return kIOReturnBadArgument; + } + options &= kIOMemoryDirectionOutIn; + bmd = IOBufferMemoryDescriptor::inTaskWithOptions( + kernel_task, options, capacity, alignment); + + *memory = bmd; + + if (!bmd) { + return kIOReturnNoMemory; + } + + reserved = bmd->getKernelReserved(); + reserved->creator = current_task(); + task_reference(reserved->creator); + + ret = kIOReturnSuccess; + + return ret; +} + +kern_return_t +IMPL(IOBufferMemoryDescriptor, SetLength) +{ + setLength(length); + return kIOReturnSuccess; +} + +/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ + +kern_return_t +OSAction::Create(OSAction_Create_Args) +{ + kern_return_t ret; + ret = OSAction::Create_Call(target, targetmsgid, msgid, referenceSize, action); + return ret; +} + +kern_return_t +IMPL(OSAction, Create) +{ + OSAction * inst; + vm_size_t allocsize; + + if (os_add_overflow(referenceSize, sizeof(OSAction_IVars), &allocsize)) { + return kIOReturnBadArgument; + } + inst = OSTypeAlloc(OSAction); + if (!inst) { + return kIOReturnNoMemory; + } + inst->ivars = (typeof(inst->ivars))(uintptr_t) IONewZero(uint8_t, allocsize); + if (!inst->ivars) { + inst->release(); + return kIOReturnNoMemory; + } + target->retain(); + inst->ivars->target = target; + inst->ivars->targetmsgid = targetmsgid; + inst->ivars->msgid = msgid; + inst->ivars->referenceSize = referenceSize; + + *action = inst; + + return kIOReturnSuccess; +} + +void +OSAction::free() +{ + if (ivars) { + if (ivars->abortedHandler) { + Block_release(ivars->abortedHandler); + ivars->abortedHandler = NULL; + } + OSSafeReleaseNULL(ivars->target); + IOSafeDeleteNULL(ivars, uint8_t, ivars->referenceSize + sizeof(OSAction_IVars)); + } + return super::free(); +} + +void * +OSAction::GetReference() +{ + assert(ivars && ivars->referenceSize); + return &ivars->reference[0]; +} + +kern_return_t +OSAction::SetAbortedHandler(OSActionAbortedHandler handler) +{ + ivars->abortedHandler = Block_copy(handler); + return kIOReturnSuccess; +} + +void +OSAction::Aborted_Impl(void) +{ + if (ivars->abortedHandler) { + ivars->abortedHandler(); + } +} + +/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ + +struct IODispatchSource_IVars { + queue_chain_t link; + IODispatchSource * source; + IOUserServer * server; + IODispatchQueue_IVars * queue; + bool enabled; +}; + +bool +IODispatchSource::init() +{ + if (!super::init()) { + return false; + } + + ivars = IONewZero(IODispatchSource_IVars, 1); + + ivars->source = this; + + return true; +} + +void +IODispatchSource::free() +{ + IOSafeDeleteNULL(ivars, IODispatchSource_IVars, 1); + super::free(); +} + +/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ + +struct IOInterruptDispatchSource_IVars { + IOService * provider; + uint32_t intIndex; + IOSimpleLock * lock; + thread_t waiter; + uint64_t count; + uint64_t time; + OSAction * action; + bool enable; +}; + +static void +IOInterruptDispatchSourceInterrupt(OSObject * target, void * refCon, + IOService * nub, int source ) +{ + IOInterruptDispatchSource_IVars * ivars = (typeof(ivars))refCon; + IOInterruptState is; + + is = IOSimpleLockLockDisableInterrupt(ivars->lock); + ivars->count++; + if (ivars->waiter) { + ivars->time = mach_absolute_time(); + thread_wakeup_thread((event_t) ivars, ivars->waiter); + ivars->waiter = NULL; + } + IOSimpleLockUnlockEnableInterrupt(ivars->lock, is); +} + +kern_return_t +IMPL(IOInterruptDispatchSource, Create) +{ + IOReturn ret; + IOInterruptDispatchSource * inst; + + inst = OSTypeAlloc(IOInterruptDispatchSource); + if (!inst->init()) { + inst->free(); + return kIOReturnNoMemory; + } + + inst->ivars->lock = IOSimpleLockAlloc(); + + ret = provider->registerInterrupt(index, inst, IOInterruptDispatchSourceInterrupt, inst->ivars); + if (kIOReturnSuccess == ret) { + inst->ivars->intIndex = index; + inst->ivars->provider = provider; + *source = inst; + } + return ret; +} + +bool +IOInterruptDispatchSource::init() +{ + if (!super::init()) { + return false; + } + ivars = IONewZero(IOInterruptDispatchSource_IVars, 1); + if (!ivars) { + return false; + } + + return true; +} + +void +IOInterruptDispatchSource::free() +{ + IOReturn ret; + + if (ivars && ivars->provider) { + ret = ivars->provider->unregisterInterrupt(ivars->intIndex); + assert(kIOReturnSuccess == ret); + } + + IOSafeDeleteNULL(ivars, IOInterruptDispatchSource_IVars, 1); + + super::free(); +} + +kern_return_t +IMPL(IOInterruptDispatchSource, SetHandler) +{ + IOReturn ret; + OSAction * oldAction; + + oldAction = (typeof(oldAction))ivars->action; + if (oldAction && OSCompareAndSwapPtr(oldAction, NULL, &ivars->action)) { + oldAction->release(); + } + action->retain(); + ivars->action = action; + + ret = kIOReturnSuccess; + + return ret; +} + +kern_return_t +IMPL(IOInterruptDispatchSource, SetEnableWithCompletion) +{ + IOReturn ret; + IOInterruptState is; + + if (enable == ivars->enable) { + return kIOReturnSuccess; + } + + if (enable) { + is = IOSimpleLockLockDisableInterrupt(ivars->lock); + ivars->enable = enable; + IOSimpleLockUnlockEnableInterrupt(ivars->lock, is); + ret = ivars->provider->enableInterrupt(ivars->intIndex); + } else { + ret = ivars->provider->disableInterrupt(ivars->intIndex); + is = IOSimpleLockLockDisableInterrupt(ivars->lock); + ivars->enable = enable; + IOSimpleLockUnlockEnableInterrupt(ivars->lock, is); + } + + return ret; +} + +kern_return_t +IMPL(IODispatchSource, SetEnable) +{ + return SetEnableWithCompletion(enable, NULL); +} + +kern_return_t +IMPL(IOInterruptDispatchSource, CheckForWork) +{ + IOReturn ret = kIOReturnNotReady; + IOInterruptState is; + wait_result_t waitResult; + uint64_t icount; + uint64_t itime; + thread_t self; + + self = current_thread(); + icount = 0; + do { + is = IOSimpleLockLockDisableInterrupt(ivars->lock); + if ((icount = ivars->count)) { + itime = ivars->time; + ivars->count = 0; + waitResult = THREAD_AWAKENED; + } else if (synchronous) { + assert(NULL == ivars->waiter); + ivars->waiter = self; + waitResult = assert_wait((event_t) ivars, THREAD_INTERRUPTIBLE); + } + IOSimpleLockUnlockEnableInterrupt(ivars->lock, is); + if (synchronous && (waitResult == THREAD_WAITING)) { + waitResult = thread_block(THREAD_CONTINUE_NULL); + if (THREAD_INTERRUPTED == waitResult) { + break; + } + } + } while (synchronous && !icount); + + if (icount && ivars->action) { + ret = InterruptOccurred(rpc, ivars->action, icount, itime); + } + + return ret; +} + +void +IMPL(IOInterruptDispatchSource, InterruptOccurred) +{ +} + +/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ + +kern_return_t +IOUserServer::waitInterruptTrap(void * p1, void * p2, void * p3, void * p4, void * p5, void * p6) +{ + IOReturn ret = kIOReturnBadArgument; + IOInterruptState is; + IOInterruptDispatchSource * interrupt; + IOInterruptDispatchSource_IVars * ivars; + IOInterruptDispatchSourcePayload payload; + + wait_result_t waitResult; + thread_t self; + + OSObject * object; + + object = iokit_lookup_object_with_port_name((mach_port_name_t)(uintptr_t)p1, IKOT_UEXT_OBJECT, current_task()); + + if (!object) { + return kIOReturnBadArgument; + } + if (!(interrupt = OSDynamicCast(IOInterruptDispatchSource, object))) { + ret = kIOReturnBadArgument; + } else { + self = current_thread(); + ivars = interrupt->ivars; + payload.count = 0; + do { + is = IOSimpleLockLockDisableInterrupt(ivars->lock); + if ((payload.count = ivars->count)) { + payload.time = ivars->time; + ivars->count = 0; + waitResult = THREAD_AWAKENED; + } else { + assert(NULL == ivars->waiter); + ivars->waiter = self; + waitResult = assert_wait((event_t) ivars, THREAD_INTERRUPTIBLE); + } + IOSimpleLockUnlockEnableInterrupt(ivars->lock, is); + if (waitResult == THREAD_WAITING) { + waitResult = thread_block(THREAD_CONTINUE_NULL); + if (THREAD_INTERRUPTED == waitResult) { + break; + } + } + } while (!payload.count); + ret = (payload.count ? kIOReturnSuccess : kIOReturnAborted); + } + + if (kIOReturnSuccess == ret) { + int copyerr = copyout(&payload, (user_addr_t) p2, sizeof(payload)); + if (copyerr) { + ret = kIOReturnVMError; + } + } + + object->release(); + + return ret; +} + +/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ + +kern_return_t +IMPL(IOUserServer, Create) +{ + IOReturn ret; + IOUserServer * us; + const OSSymbol * sym; + OSNumber * serverTag; + io_name_t rname; + + us = (typeof(us))thread_iokit_tls_get(0); + assert(OSDynamicCast(IOUserServer, us)); + if (kIODKLogSetup & gIODKDebug) { + DKLOG(DKS "::Create(" DKS ") %p\n", DKN(us), name, tag, us); + } + if (!us) { + return kIOReturnError; + } + + sym = OSSymbol::withCString(name); + serverTag = OSNumber::withNumber(tag, 64); + + us->setProperty(gIOUserServerNameKey, (OSObject *) sym); + us->setProperty(gIOUserServerTagKey, serverTag); + + serverTag->release(); + OSSafeReleaseNULL(sym); + + snprintf(rname, sizeof(rname), "IOUserServer(%s-0x%qx)", name, tag); + us->setName(rname); + + us->retain(); + *server = us; + ret = kIOReturnSuccess; + + return ret; +} + +kern_return_t +IMPL(IOUserServer, Exit) +{ + return kIOReturnUnsupported; +} + +kern_return_t +IMPL(IOUserServer, LoadModule) +{ + return kIOReturnUnsupported; +} + + +/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ + +kern_return_t +IMPL(IODispatchQueue, Create) +{ + IODispatchQueue * result; + IOUserServer * us; + + result = OSTypeAlloc(IODispatchQueue); + if (!result) { + return kIOReturnNoMemory; + } + if (!result->init()) { + return kIOReturnNoMemory; + } + + *queue = result; + + if (!strcmp("Root", name)) { + us = (typeof(us))thread_iokit_tls_get(0); + assert(OSDynamicCast(IOUserServer, us)); + us->setRootQueue(result); + } + + if (kIODKLogSetup & gIODKDebug) { + DKLOG("IODispatchQueue::Create %s %p\n", name, result); + } + + return kIOReturnSuccess; +} + +kern_return_t +IMPL(IODispatchQueue, SetPort) +{ + ivars->serverPort = port; + return kIOReturnSuccess; +} + +bool +IODispatchQueue::init() +{ + ivars = IONewZero(IODispatchQueue_IVars, 1); + if (!ivars) { + return false; + } + ivars->queue = this; + + return true; +} + +void +IODispatchQueue::free() +{ + IOSafeDeleteNULL(ivars, IODispatchQueue_IVars, 1); + super::free(); +} + +bool +IODispatchQueue::OnQueue() +{ + return false; +} + +/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ + + +kern_return_t +OSMetaClassBase::Dispatch(IORPC rpc) +{ + return kIOReturnUnsupported; +} + +kern_return_t +OSMetaClassBase::Invoke(IORPC rpc) +{ + IOReturn ret = kIOReturnUnsupported; + OSMetaClassBase * object; + OSAction * action; + IOService * service; + IOUserServer * us; + IORPCMessage * message; + + assert(rpc.sendSize >= (sizeof(IORPCMessageMach) + sizeof(IORPCMessage))); + message = IORPCMessageFromMach(rpc.message, false); + if (!message) { + return kIOReturnIPCError; + } + message->flags |= kIORPCMessageKernel; + + us = NULL; + if (!(kIORPCMessageLocalHost & message->flags)) { + us = OSDynamicCast(IOUserServer, this); + if (!us) { + if ((action = OSDynamicCast(OSAction, this))) { + object = IOUserServer::target(action, message); + } else { + object = this; + } + if ((service = OSDynamicCast(IOService, object)) + && service->reserved->uvars) { + // xxx other classes + us = service->reserved->uvars->userServer; + } + } + } + if (us) { + message->flags |= kIORPCMessageRemote; + ret = us->rpc(rpc); + if (kIOReturnSuccess != ret) { + if (kIODKLogIPC & gIODKDebug) { + DKLOG("OSMetaClassBase::Invoke user 0x%x\n", ret); + } + } + } else { + if (kIODKLogIPC & gIODKDebug) { + DKLOG("OSMetaClassBase::Invoke kernel %s 0x%qx\n", getMetaClass()->getClassName(), message->msgid); + } + ret = Dispatch(rpc); + } + + return ret; +} + +/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ + +struct IOPStrings { + uint32_t dataSize; + uint32_t count; + const char strings[0]; +}; + +kern_return_t +OSUserMetaClass::Dispatch(IORPC rpc) +{ + return const_cast(meta)->Dispatch(rpc); +} + +void +OSUserMetaClass::free() +{ + if (queueNames) { + IOFree(queueNames, sizeof(IOPStrings) + queueNames->dataSize * sizeof(char)); + queueNames = NULL; + } + if (description) { + IOFree(description, description->descriptionSize); + description = NULL; + } + IOSafeDeleteNULL(methods, uint64_t, 2 * methodCount); + if (meta) { + meta->releaseMetaClass(); + } + if (name) { + name->release(); + } + OSObject::free(); +} + +/* + * Sets the loadTag of the associated OSKext + * in the dext task. + * NOTE: different instances of the same OSKext + * (so same BounleID but different tasks) + * will have the same loadTag. + */ +void +IOUserServer::setTaskLoadTag(OSKext *kext) +{ + task_t owningTask; + uint32_t loadTag, prev_taskloadTag; + + owningTask = this->fOwningTask; + if (!owningTask) { + printf("%s: fOwningTask not found\n", __FUNCTION__); + return; + } + + loadTag = kext->getLoadTag(); + prev_taskloadTag = set_task_loadTag(owningTask, loadTag); + if (prev_taskloadTag) { + printf("%s: found the task loadTag already set to %u (set to %u)\n", + __FUNCTION__, prev_taskloadTag, loadTag); + } +} + +/* + * Sets the OSKext uuid as the uuid of the userspace + * dext executable. + */ +void +IOUserServer::setDriverKitUUID(OSKext *kext) +{ + task_t task; + proc_t p; + uuid_t p_uuid, k_uuid; + OSData *k_data_uuid; + OSData *new_uuid; + uuid_string_t uuid_string = ""; + + task = this->fOwningTask; + if (!task) { + printf("%s: fOwningTask not found\n", __FUNCTION__); + return; + } + + p = (proc_t)(get_bsdtask_info(task)); + if (!p) { + printf("%s: proc not found\n", __FUNCTION__); + return; + } + proc_getexecutableuuid(p, p_uuid, sizeof(p_uuid)); + + k_data_uuid = kext->copyUUID(); + if (k_data_uuid) { + memcpy(&k_uuid, k_data_uuid->getBytesNoCopy(), sizeof(k_uuid)); + OSSafeReleaseNULL(k_data_uuid); + if (uuid_compare(k_uuid, p_uuid) != 0) { + printf("%s: uuid not matching\n", __FUNCTION__); + } + return; + } + + uuid_unparse(p_uuid, uuid_string); + new_uuid = OSData::withBytes(p_uuid, sizeof(p_uuid)); + kext->setDriverKitUUID(new_uuid); +} + +bool +IOUserServer::serviceMatchesCDHash(IOService *service) +{ + OSObject *obj = NULL; + bool result = false; + OSString *requiredCDHashStr = NULL; + const char *requiredCDHash = NULL; + char taskCDHash[CS_CDHASH_LEN]; + + task_t owningTask = this->fOwningTask; + if (!owningTask) { + printf("%s: fOwningTask not found\n", __FUNCTION__); + goto out; + } + + obj = service->copyProperty(gIOUserServerCDHashKey); + requiredCDHashStr = OSDynamicCast(OSString, obj); + if (!requiredCDHashStr) { + printf("%s: required cdhash not found as property of personality\n", __FUNCTION__); + goto out; + } + + requiredCDHash = requiredCDHashStr->getCStringNoCopy(); + if (!requiredCDHash) { + printf("%s: required cdhash unable to be read as string\n", __FUNCTION__); + goto out; + } + + if (strlen(requiredCDHash) != CS_CDHASH_LEN * 2) { + printf("%s: required cdhash string has incorrect length\n", __FUNCTION__); + goto out; + } + + get_task_cdhash(owningTask, taskCDHash); + for (int i = 0; i < (int)CS_CDHASH_LEN * 2; i++) { + uint8_t which = (i + 1) & 0x1; /* 1 for upper nibble, 0 for lower */ + uint8_t nibble = requiredCDHash[i]; + uint8_t byte = taskCDHash[i / 2]; + if ('0' <= nibble && nibble <= '9') { + nibble -= '0'; + } else if ('a' <= nibble && nibble <= 'f') { + nibble -= 'a' - 10; + } else if ('A' <= nibble && nibble <= 'F') { + nibble -= 'A' - 10; + } else { + printf("%s: required cdhash contains invalid token '%c'\n", __FUNCTION__, nibble); + goto out; + } + + /* + * Decide which half of the byte to compare + */ + if (nibble != (which ? (byte >> 4) : (byte & 0x0f))) { + printf("%s: required cdhash %s in personality does not match service\n", __FUNCTION__, requiredCDHash); + goto out; + } + } + + result = true; +out: + OSSafeReleaseNULL(obj); + return result; +} + +bool +IOUserServer::checkEntitlements( + OSDictionary * entitlements, OSObject * prop, + IOService * provider, IOService * dext) +{ + OSDictionary * matching; + + if (!prop) { + return true; + } + if (!entitlements) { + return false; + } + + matching = NULL; + if (dext) { + matching = dext->dictionaryWithProperties(); + if (!matching) { + return false; + } + } + + bool allPresent __block; + prop->iterateObjects(^bool (OSObject * object) { + allPresent = false; + object->iterateObjects(^bool (OSObject * object) { + OSString * string; + OSObject * value; + string = OSDynamicCast(OSString, object); + value = entitlements->getObject(string); + if (matching && value) { + matching->setObject(string, value); + } + allPresent = (NULL != value); + return !allPresent; + }); + return allPresent; + }); + + if (allPresent && matching && provider) { + allPresent = provider->matchPropertyTable(matching); + } + + OSSafeReleaseNULL(matching); + OSSafeReleaseNULL(prop); + + return allPresent; +} + +bool +IOUserServer::checkEntitlements(IOService * provider, IOService * dext) +{ + OSObject * prop; + bool ok; + + if (!fOwningTask) { + return false; + } + + prop = provider->copyProperty(gIOServiceDEXTEntitlementsKey); + ok = checkEntitlements(fEntitlements, prop, provider, dext); + if (!ok) { + DKLOG(DKS ": provider entitlements check failed\n", DKN(dext)); + } + if (ok) { + prop = dext->copyProperty(gIOServiceDEXTEntitlementsKey); + ok = checkEntitlements(fEntitlements, prop, NULL, NULL); + if (!ok) { + DKLOG(DKS ": family entitlements check failed\n", DKN(dext)); + } + } + + return ok; +} + +IOReturn +IOUserServer::exit(const char * reason) +{ + DKLOG("%s::exit(%s)\n", getName(), reason); + Exit(reason); + return kIOReturnSuccess; +} + +OSObjectUserVars * +IOUserServer::varsForObject(OSObject * obj) +{ + IOService * service; + + if ((service = OSDynamicCast(IOService, obj))) { + return service->reserved->uvars; + } + + return NULL; +} + +IOPStrings * +IOUserServer::copyInStringArray(const char * string, uint32_t userSize) +{ + IOPStrings * array; + vm_size_t alloc; + size_t len; + const char * cstr; + const char * end; + + if (userSize <= 1) { + return NULL; + } + + if (os_add_overflow(sizeof(IOPStrings), userSize, &alloc)) { + assert(false); + return NULL; + } + if (alloc > 16384) { + assert(false); + return NULL; + } + array = (typeof(array))IOMalloc(alloc); + if (!array) { + return NULL; + } + array->dataSize = userSize; + bcopy(string, (void *) &array->strings[0], userSize); + + array->count = 0; + cstr = &array->strings[0]; + end = &array->strings[array->dataSize]; + while ((len = cstr[0])) { + cstr++; + if ((cstr + len) >= end) { + break; + } + cstr += len; + array->count++; + } + if (len) { + IOFree(array, alloc); + array = NULL; + } + + return array; +} + +uint32_t +IOUserServer::stringArrayIndex(IOPStrings * array, const char * look) +{ + uint32_t idx; + size_t len, llen; + const char * cstr; + const char * end; + + idx = 0; + cstr = &array->strings[0]; + end = &array->strings[array->dataSize]; + llen = strlen(look); + while ((len = cstr[0])) { + cstr++; + if ((cstr + len) >= end) { + break; + } + if ((len == llen) && !strncmp(cstr, look, len)) { + return idx; + } + cstr += len; + idx++; + } + + return -1U; +} +#define kIODispatchQueueStopped ((IODispatchQueue *) -1L) + +IODispatchQueue * +IOUserServer::queueForObject(OSObject * obj, uint64_t msgid) +{ + IODispatchQueue * queue; + OSObjectUserVars * uvars; + uint64_t option; + + uvars = varsForObject(obj); + if (!uvars) { + return NULL; + } + if (!uvars->queueArray) { + if (uvars->stopped) { + return kIODispatchQueueStopped; + } + return NULL; + } + queue = uvars->queueArray[0]; + + if (uvars->userMeta + && uvars->userMeta->methods) { + uint32_t idx, baseIdx; + uint32_t lim; + // bsearch + for (baseIdx = 0, lim = uvars->userMeta->methodCount; lim; lim >>= 1) { + idx = baseIdx + (lim >> 1); + if (msgid == uvars->userMeta->methods[idx]) { + option = uvars->userMeta->methods[uvars->userMeta->methodCount + idx]; + option &= 0xFF; + if (option < uvars->userMeta->queueNames->count) { + queue = uvars->queueArray[option + 1]; + } + break; + } else if (msgid > uvars->userMeta->methods[idx]) { + // move right + baseIdx += (lim >> 1) + 1; + lim--; + } + // else move left + } + } + return queue; +} + +IOReturn +IOUserServer::objectInstantiate(OSObject * obj, IORPC rpc, IORPCMessage * message) +{ + IOReturn ret; + OSString * str; + OSObject * prop; + IOService * service; + + OSAction * action; + OSObject * target; + uint32_t queueCount, queueAlloc; + const char * resultClassName; + uint64_t resultFlags; + + size_t replySize; + uint32_t methodCount; + const uint64_t * methods; + IODispatchQueue * queue; + OSUserMetaClass * userMeta; + OSObjectUserVars * uvars; + uint32_t idx; + ipc_port_t sendPort; + + OSObject_Instantiate_Rpl_Content * reply; + + queueCount = 0; + methodCount = 0; + methods = NULL; + str = NULL; + prop = NULL; + userMeta = NULL; + resultClassName = NULL; + resultFlags = 0; + ret = kIOReturnUnsupportedMode; + + service = OSDynamicCast(IOService, obj); + if (!service) { + // xxx other classes hosted + resultFlags |= kOSObjectRPCKernel; + resultFlags |= kOSObjectRPCRemote; + } else { + if (service->isInactive()) { + DKLOG(DKS "::instantiate inactive\n", DKN(service)); + return kIOReturnOffline; + } + prop = service->copyProperty(gIOUserClassKey); + str = OSDynamicCast(OSString, prop); + if (!service->reserved->uvars) { + resultFlags |= kOSObjectRPCRemote; + resultFlags |= kOSObjectRPCKernel; + } else if (this != service->reserved->uvars->userServer) { + // remote, use base class + resultFlags |= kOSObjectRPCRemote; + } + if (service->reserved->uvars && service->reserved->uvars->userServer) { + userMeta = (typeof(userMeta))service->reserved->uvars->userServer->fClasses->getObject(str); + } + } + if (!str && !userMeta) { + const OSMetaClass * meta; + meta = obj->getMetaClass(); + while (meta && !userMeta) { + str = (OSString *) meta->getClassNameSymbol(); + userMeta = (typeof(userMeta))fClasses->getObject(str); + if (!userMeta) { + meta = meta->getSuperClass(); + } + } + } + if (str) { + if (!userMeta) { + userMeta = (typeof(userMeta))fClasses->getObject(str); + } + if (kIODKLogSetup & gIODKDebug) { + DKLOG("userMeta %s %p\n", str->getCStringNoCopy(), userMeta); + } + if (userMeta) { + if (kOSObjectRPCRemote & resultFlags) { + while (userMeta && !(kOSClassCanRemote & userMeta->description->flags)) { + userMeta = userMeta->superMeta; + } + if (userMeta) { + resultClassName = userMeta->description->name; + ret = kIOReturnSuccess; + } + } else { + service->reserved->uvars->userMeta = userMeta; + queueAlloc = 1; + if (userMeta->queueNames) { + queueAlloc += userMeta->queueNames->count; + } + service->reserved->uvars->queueArray = + IONewZero(IODispatchQueue *, queueAlloc); + resultClassName = str->getCStringNoCopy(); + ret = kIOReturnSuccess; + } + } + } + OSSafeReleaseNULL(prop); + + IORPCMessageMach * machReply = rpc.reply; + replySize = sizeof(OSObject_Instantiate_Rpl); + + if ((kIOReturnSuccess == ret) && (kOSObjectRPCRemote & resultFlags)) { + target = obj; + if ((action = OSDynamicCast(OSAction, obj))) { + if (action->ivars->referenceSize) { + resultFlags |= kOSObjectRPCKernel; + } else { + resultFlags &= ~kOSObjectRPCKernel; + target = action->ivars->target; + + queueCount = 1; + queue = queueForObject(target, action->ivars->targetmsgid); + idx = 0; + sendPort = NULL; + if (queue && (kIODispatchQueueStopped != queue)) { + sendPort = ipc_port_make_send(queue->ivars->serverPort); + } + replySize = sizeof(OSObject_Instantiate_Rpl) + + queueCount * sizeof(machReply->objects[0]) + + 2 * methodCount * sizeof(reply->methods[0]); + if (replySize > rpc.replySize) { + assert(false); + return kIOReturnIPCError; + } + machReply->objects[idx].type = MACH_MSG_PORT_DESCRIPTOR; + machReply->objects[idx].disposition = MACH_MSG_TYPE_MOVE_SEND; + machReply->objects[idx].name = sendPort; + machReply->objects[idx].pad2 = 0; + machReply->objects[idx].pad_end = 0; + } + } else { + uvars = varsForObject(target); + if (uvars && uvars->userMeta) { + queueCount = 1; + if (uvars->userMeta->queueNames) { + queueCount += uvars->userMeta->queueNames->count; + } + methods = &uvars->userMeta->methods[0]; + methodCount = uvars->userMeta->methodCount; + replySize = sizeof(OSObject_Instantiate_Rpl) + + queueCount * sizeof(machReply->objects[0]) + + 2 * methodCount * sizeof(reply->methods[0]); + if (replySize > rpc.replySize) { + assert(false); + return kIOReturnIPCError; + } + for (idx = 0; idx < queueCount; idx++) { + queue = uvars->queueArray[idx]; + sendPort = NULL; + if (queue) { + sendPort = ipc_port_make_send(queue->ivars->serverPort); + } + machReply->objects[idx].type = MACH_MSG_PORT_DESCRIPTOR; + machReply->objects[idx].disposition = MACH_MSG_TYPE_MOVE_SEND; + machReply->objects[idx].name = sendPort; + machReply->objects[idx].pad2 = 0; + machReply->objects[idx].pad_end = 0; + } + } + } + } + + if (kIODKLogIPC & gIODKDebug) { + DKLOG("instantiate %s\n", obj->getMetaClass()->getClassName()); + } + + if (kIOReturnSuccess != ret) { + DKLOG("%s: no user class found\n", str ? str->getCStringNoCopy() : obj->getMetaClass()->getClassName()); + resultClassName = "unknown"; + } + + machReply->msgh.msgh_id = kIORPCVersionCurrentReply; + machReply->msgh.msgh_size = replySize; + machReply->msgh_body.msgh_descriptor_count = queueCount; + + reply = (typeof(reply))IORPCMessageFromMach(machReply, true); + if (!reply) { + return kIOReturnIPCError; + } + if (methodCount) { + bcopy(methods, &reply->methods[0], methodCount * 2 * sizeof(reply->methods[0])); + } + reply->__hdr.msgid = OSObject_Instantiate_ID; + reply->__hdr.flags = kIORPCMessageOneway; + reply->__hdr.objectRefs = 0; + reply->__pad = 0; + reply->flags = resultFlags; + strlcpy(reply->classname, resultClassName, sizeof(reply->classname)); + reply->__result = ret; + + ret = kIOReturnSuccess; + + return ret; +} + +/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ + +IOReturn +IOUserServer::kernelDispatch(OSObject * obj, IORPC rpc) +{ + IOReturn ret; + IORPCMessage * message; + + message = IORPCMessageFromMach(rpc.message, false); + if (!message) { + return kIOReturnIPCError; + } + + if (OSObject_Instantiate_ID == message->msgid) { + ret = objectInstantiate(obj, rpc, message); + if (kIOReturnSuccess != ret) { + DKLOG("%s: instantiate failed 0x%x\n", obj->getMetaClass()->getClassName(), ret); + } + } else { + if (kIODKLogIPC & gIODKDebug) { + DKLOG("%s::Dispatch kernel 0x%qx\n", obj->getMetaClass()->getClassName(), message->msgid); + } + ret = obj->Dispatch(rpc); + if (kIODKLogIPC & gIODKDebug) { + DKLOG("%s::Dispatch kernel 0x%qx result 0x%x\n", obj->getMetaClass()->getClassName(), message->msgid, ret); + } + } + + return ret; +} + + +/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ + +OSObject * +IOUserServer::target(OSAction * action, IORPCMessage * message) +{ + OSObject * object; + + if (message->msgid != action->ivars->msgid) { + return action; + } + object = action->ivars->target; + message->msgid = action->ivars->targetmsgid; + message->objects[0] = (OSObjectRef) object; + if (kIORPCMessageRemote & message->flags) { + object->retain(); + action->release(); + } + if (kIODKLogIPC & gIODKDebug) { + DKLOG("TARGET %s msg 0x%qx from 0x%qx\n", object->getMetaClass()->getClassName(), message->msgid, action->ivars->msgid); + } + + return object; +} + +/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ + +kern_return_t +uext_server(ipc_kmsg_t requestkmsg, ipc_kmsg_t * pReply) +{ + kern_return_t ret; + IORPCMessageMach * msgin; + OSObject * object; + IOUserServer * server; + + msgin = (typeof(msgin))ipc_kmsg_msg_header(requestkmsg); + + object = IOUserServer::copyObjectForSendRight(msgin->msgh.msgh_remote_port, IKOT_UEXT_OBJECT); + server = OSDynamicCast(IOUserServer, object); + if (!server) { + OSSafeReleaseNULL(object); + return KERN_INVALID_NAME; + } + ret = server->server(requestkmsg, pReply); + object->release(); + + return ret; +} + +#define MAX_UEXT_REPLY_SIZE 0x17c0 + +kern_return_t +IOUserServer::server(ipc_kmsg_t requestkmsg, ipc_kmsg_t * pReply) +{ + kern_return_t ret; + mach_msg_size_t replyAlloc; + ipc_kmsg_t replykmsg; + IORPCMessageMach * msgin; + IORPCMessage * message; + IORPCMessageMach * msgout; + IORPCMessage * reply; + uint32_t replySize; + OSObject * object; + OSAction * action; + bool oneway; + uint64_t msgid; + + msgin = (typeof(msgin))ipc_kmsg_msg_header(requestkmsg); + replyAlloc = 0; + msgout = NULL; + replykmsg = NULL; + + if (msgin->msgh.msgh_size < (sizeof(IORPCMessageMach) + sizeof(IORPCMessage))) { + if (kIODKLogIPC & gIODKDebug) { + DKLOG("UEXT notify %o\n", msgin->msgh.msgh_id); + } + return KERN_NOT_SUPPORTED; + } + + if (!(MACH_MSGH_BITS_COMPLEX & msgin->msgh.msgh_bits)) { + msgin->msgh_body.msgh_descriptor_count = 0; + } + message = IORPCMessageFromMach(msgin, false); + if (!message) { + return kIOReturnIPCError; + } + ret = copyInObjects(msgin, message, msgin->msgh.msgh_size, true, false); + if (kIOReturnSuccess != ret) { + if (kIODKLogIPC & gIODKDebug) { + DKLOG("UEXT copyin(0x%x) %x\n", ret, msgin->msgh.msgh_id); + } + return KERN_NOT_SUPPORTED; + } + + if (msgin->msgh_body.msgh_descriptor_count < 1) { + return KERN_NOT_SUPPORTED; + } + object = (OSObject *) message->objects[0]; + msgid = message->msgid; + message->flags &= ~kIORPCMessageKernel; + message->flags |= kIORPCMessageRemote; + + if ((action = OSDynamicCast(OSAction, object))) { + object = target(action, message); + msgid = message->msgid; + } + + oneway = (0 != (kIORPCMessageOneway & message->flags)); + assert(oneway || (MACH_PORT_NULL != msgin->msgh.msgh_local_port)); + + // includes trailer size + replyAlloc = oneway ? 0 : MAX_UEXT_REPLY_SIZE; + if (replyAlloc) { + replykmsg = ipc_kmsg_alloc(replyAlloc); + if (replykmsg == NULL) { +// printf("uext_server: dropping request\n"); + // ipc_kmsg_trace_send(request, option); + consumeObjects(message, msgin->msgh.msgh_size); + ipc_kmsg_destroy(requestkmsg); + return KERN_MEMORY_FAILURE; + } + + msgout = (typeof(msgout))ipc_kmsg_msg_header(replykmsg); + /* + * MIG should really assure no data leakage - + * but until it does, pessimistically zero the + * whole reply buffer. + */ + bzero((void *)msgout, replyAlloc); + } + + IORPC rpc = { .message = msgin, .sendSize = msgin->msgh.msgh_size, .reply = msgout, .replySize = replyAlloc }; + + if (object) { + thread_iokit_tls_set(0, this); + ret = kernelDispatch(object, rpc); + thread_iokit_tls_set(0, NULL); + } else { + ret = kIOReturnBadArgument; + } + + // release objects + consumeObjects(message, msgin->msgh.msgh_size); + + // release ports + copyInObjects(msgin, message, msgin->msgh.msgh_size, false, true); + + if (!oneway) { + if (kIOReturnSuccess == ret) { + replySize = msgout->msgh.msgh_size; + reply = IORPCMessageFromMach(msgout, true); + if (!reply) { + ret = kIOReturnIPCError; + } else { + ret = copyOutObjects(msgout, reply, replySize, (kIORPCVersionCurrentReply == msgout->msgh.msgh_id) /* =>!InvokeReply */); + } + } + if (kIOReturnSuccess != ret) { + IORPCMessageErrorReturnContent * errorMsg; + + msgout->msgh_body.msgh_descriptor_count = 0; + msgout->msgh.msgh_id = kIORPCVersionCurrentReply; + errorMsg = (typeof(errorMsg))IORPCMessageFromMach(msgout, true); + errorMsg->hdr.msgid = message->msgid; + errorMsg->hdr.flags = kIORPCMessageOneway | kIORPCMessageError; + errorMsg->hdr.objectRefs = 0; + errorMsg->result = ret; + errorMsg->pad = 0; + replySize = sizeof(IORPCMessageErrorReturn); + } + + msgout->msgh.msgh_bits = MACH_MSGH_BITS_COMPLEX | + MACH_MSGH_BITS_SET(MACH_MSGH_BITS_LOCAL(msgin->msgh.msgh_bits) /*remote*/, 0 /*local*/, 0, 0); + + msgout->msgh.msgh_remote_port = msgin->msgh.msgh_local_port; + msgout->msgh.msgh_local_port = MACH_PORT_NULL; + msgout->msgh.msgh_voucher_port = (mach_port_name_t) 0; + msgout->msgh.msgh_reserved = 0; + msgout->msgh.msgh_size = replySize; + } + + *pReply = replykmsg; + + return oneway ? MIG_NO_REPLY : KERN_SUCCESS; +} + +/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ + +#define MAX_OBJECT_COUNT(mach, size, message) \ + ((((size) + ((uintptr_t) (mach))) - ((uintptr_t) (&message->objects[0]))) / sizeof(OSObjectRef)) + +kern_return_t +IOUserServerUEXTTrap(OSObject * object, void * p1, void * p2, void * p3, void * p4, void * p5, void * p6) +{ + const user_addr_t msg = (uintptr_t) p1; + size_t inSize = (uintptr_t) p2; + user_addr_t out = (uintptr_t) p3; + size_t outSize = (uintptr_t) p4; + mach_port_name_t objectName1 = (uintptr_t) p5; + size_t totalSize; + OSObject * objectArg1; + + IORPCMessageMach * mach; + mach_msg_port_descriptor_t * descs; + +#pragma pack(4) + struct { + uint32_t pad; + IORPCMessageMach mach; + mach_msg_port_descriptor_t objects[2]; + IOTrapMessageBuffer buffer; + } buffer; +#pragma pack() + + IOReturn ret; + OSAction * action; + int copyerr; + IORPCMessage * message; + IORPCMessage * reply; + IORPC rpc; + uint64_t refs; + uint32_t maxObjectCount; + size_t copySize; + uint64_t * replyHdr; + uintptr_t p; + + bzero(&buffer, sizeof(buffer)); + + p = (typeof(p)) & buffer.buffer[0]; + if (os_add_overflow(inSize, outSize, &totalSize)) { + return kIOReturnMessageTooLarge; + } + if (totalSize > sizeof(buffer.buffer)) { + return kIOReturnMessageTooLarge; + } + if (inSize < sizeof(IORPCMessage)) { + return kIOReturnIPCError; + } + copyerr = copyin(msg, &buffer.buffer[0], inSize); + if (copyerr) { + return kIOReturnVMError; + } + + message = (typeof(message))p; + refs = message->objectRefs; + if ((refs > 2) || !refs) { + return kIOReturnUnsupported; + } + if (!(kIORPCMessageSimpleReply & message->flags)) { + return kIOReturnUnsupported; + } + + descs = (typeof(descs))(p - refs * sizeof(*descs)); + mach = (typeof(mach))(p - refs * sizeof(*descs) - sizeof(*mach)); + + mach->msgh.msgh_id = kIORPCVersionCurrent; + mach->msgh.msgh_size = sizeof(IORPCMessageMach) + refs * sizeof(*descs) + inSize; + mach->msgh_body.msgh_descriptor_count = refs; + + rpc.message = mach; + rpc.sendSize = mach->msgh.msgh_size; + rpc.reply = (IORPCMessageMach *) (p + inSize); + rpc.replySize = sizeof(buffer.buffer) - inSize; + + message->objects[0] = 0; + if ((action = OSDynamicCast(OSAction, object))) { + maxObjectCount = MAX_OBJECT_COUNT(rpc.message, rpc.sendSize, message); + if (refs > maxObjectCount) { + return kIOReturnBadArgument; + } + object = IOUserServer::target(action, message); + message->objects[1] = (OSObjectRef) action; + if (kIODKLogIPC & gIODKDebug) { + DKLOG("%s::Dispatch(trap) kernel 0x%qx\n", object->getMetaClass()->getClassName(), message->msgid); + } + ret = object->Dispatch(rpc); + } else { + objectArg1 = NULL; + if (refs > 1) { + objectArg1 = iokit_lookup_uext_ref_current_task(objectName1); + if (!objectArg1) { + return kIOReturnIPCError; + } + message->objects[1] = (OSObjectRef) objectArg1; + } + if (kIODKLogIPC & gIODKDebug) { + DKLOG("%s::Dispatch(trap) kernel 0x%qx\n", object->getMetaClass()->getClassName(), message->msgid); + } + ret = object->Dispatch(rpc); + if (kIODKLogIPC & gIODKDebug) { + DKLOG("%s::Dispatch(trap) kernel 0x%qx 0x%x\n", object->getMetaClass()->getClassName(), message->msgid, ret); + } + OSSafeReleaseNULL(objectArg1); + + if (kIOReturnSuccess == ret) { + if (rpc.reply->msgh_body.msgh_descriptor_count) { + return kIOReturnIPCError; + } + reply = IORPCMessageFromMach(rpc.reply, rpc.reply->msgh.msgh_size); + if (!reply) { + return kIOReturnIPCError; + } + copySize = rpc.reply->msgh.msgh_size - (((uintptr_t) reply) - ((uintptr_t) rpc.reply)) + sizeof(uint64_t); + if (copySize > outSize) { + return kIOReturnIPCError; + } + replyHdr = (uint64_t *) reply; + replyHdr--; + replyHdr[0] = copySize; + copyerr = copyout(replyHdr, out, copySize); + if (copyerr) { + return kIOReturnVMError; + } + } + } + + return ret; +} + +/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ + +IOReturn +IOUserServer::rpc(IORPC rpc) +{ + if (isInactive() && !fRootQueue) { + return kIOReturnOffline; + } + + IOReturn ret; + IORPCMessage * message; + IORPCMessageMach * mach; + mach_msg_id_t machid; + uint32_t sendSize, replySize; + bool oneway; + uint64_t msgid; + IODispatchQueue * queue; + IOService * service; + ipc_port_t port; + ipc_port_t sendPort; + + queue = NULL; + port = NULL; + sendPort = NULL; + + mach = rpc.message; + sendSize = rpc.sendSize; + replySize = rpc.replySize; + + assert(sendSize >= (sizeof(IORPCMessageMach) + sizeof(IORPCMessage))); + + message = IORPCMessageFromMach(mach, false); + if (!message) { + ret = kIOReturnIPCError; + } + msgid = message->msgid; + machid = (msgid >> 32); + + if (mach->msgh_body.msgh_descriptor_count < 1) { + return kIOReturnNoMedia; + } + + IOLockLock(gIOUserServerLock); + if ((service = OSDynamicCast(IOService, (OSObject *) message->objects[0]))) { + queue = queueForObject(service, msgid); + } + if (!queue) { + queue = fRootQueue; + } + if (queue && (kIODispatchQueueStopped != queue)) { + port = queue->ivars->serverPort; + } + if (port) { + sendPort = ipc_port_make_send(port); + } + IOLockUnlock(gIOUserServerLock); + if (!sendPort) { + return kIOReturnNotReady; + } + + oneway = (0 != (kIORPCMessageOneway & message->flags)); + + ret = copyOutObjects(mach, message, sendSize, false); + + mach->msgh.msgh_bits = MACH_MSGH_BITS_COMPLEX | + MACH_MSGH_BITS(MACH_MSG_TYPE_MOVE_SEND, (oneway ? 0 : MACH_MSG_TYPE_MAKE_SEND_ONCE)); + mach->msgh.msgh_remote_port = sendPort; + mach->msgh.msgh_local_port = (oneway ? MACH_PORT_NULL : mig_get_reply_port()); + mach->msgh.msgh_id = kIORPCVersionCurrent; + mach->msgh.msgh_reserved = 0; + + if (oneway) { + ret = mach_msg_send_from_kernel(&mach->msgh, sendSize); + } else { + assert(replySize >= (sizeof(IORPCMessageMach) + sizeof(IORPCMessage))); + ret = mach_msg_rpc_from_kernel(&mach->msgh, sendSize, replySize); + if (KERN_SUCCESS == ret) { + if (kIORPCVersionCurrentReply != mach->msgh.msgh_id) { + ret = (MACH_NOTIFY_SEND_ONCE == mach->msgh.msgh_id) ? MIG_SERVER_DIED : MIG_REPLY_MISMATCH; + } else if ((replySize = mach->msgh.msgh_size) < (sizeof(IORPCMessageMach) + sizeof(IORPCMessage))) { +// printf("BAD REPLY SIZE\n"); + ret = MIG_BAD_ARGUMENTS; + } else { + if (!(MACH_MSGH_BITS_COMPLEX & mach->msgh.msgh_bits)) { + mach->msgh_body.msgh_descriptor_count = 0; + } + message = IORPCMessageFromMach(mach, true); + if (!message) { + ret = kIOReturnIPCError; + } else if (message->msgid != msgid) { +// printf("BAD REPLY ID\n"); + ret = MIG_BAD_ARGUMENTS; + } else { + bool isError = (0 != (kIORPCMessageError & message->flags)); + ret = copyInObjects(mach, message, replySize, !isError, true); + if (kIOReturnSuccess != ret) { + if (kIODKLogIPC & gIODKDebug) { + DKLOG("rpc copyin(0x%x) %x\n", ret, mach->msgh.msgh_id); + } + return KERN_NOT_SUPPORTED; + } + if (isError) { + IORPCMessageErrorReturnContent * errorMsg = (typeof(errorMsg))message; + ret = errorMsg->result; + } + } + } + } + } + + return ret; +} + +/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ + +IORPCMessage * +IORPCMessageFromMach(IORPCMessageMach * msg, bool reply) +{ + mach_msg_size_t idx, count; + mach_msg_port_descriptor_t * desc; + mach_msg_port_descriptor_t * maxDesc; + size_t size, msgsize; + bool upgrade; + + msgsize = msg->msgh.msgh_size; + count = msg->msgh_body.msgh_descriptor_count; + desc = &msg->objects[0]; + maxDesc = (typeof(maxDesc))(((uintptr_t) msg) + msgsize); + upgrade = (msg->msgh.msgh_id != (reply ? kIORPCVersionCurrentReply : kIORPCVersionCurrent)); + + if (upgrade) { + OSReportWithBacktrace("obsolete message"); + return NULL; + } + + for (idx = 0; idx < count; idx++) { + if (desc >= maxDesc) { + return NULL; + } + switch (desc->type) { + case MACH_MSG_PORT_DESCRIPTOR: + size = sizeof(mach_msg_port_descriptor_t); + break; + case MACH_MSG_OOL_DESCRIPTOR: + size = sizeof(mach_msg_ool_descriptor_t); + break; + default: + return NULL; + } + desc = (typeof(desc))(((uintptr_t) desc) + size); + } + return (IORPCMessage *)(uintptr_t) desc; +} + +ipc_port_t +IOUserServer::copySendRightForObject(OSObject * object, ipc_kobject_type_t type) +{ + ipc_port_t port; + ipc_port_t sendPort = NULL; + + port = iokit_port_for_object(object, type); + if (port) { + sendPort = ipc_port_make_send(port); + iokit_release_port(port); + } + + return sendPort; +} + +OSObject * +IOUserServer::copyObjectForSendRight(ipc_port_t port, ipc_kobject_type_t type) +{ + OSObject * object; + object = iokit_lookup_io_object(port, type); + return object; +} + +/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ + +// Create a vm_map_copy_t or kalloc'ed data for memory +// to be copied out. ipc will free after the copyout. + +static kern_return_t +copyoutkdata(const void * data, vm_size_t len, void ** buf) +{ + kern_return_t err; + vm_map_copy_t copy; + + err = vm_map_copyin( kernel_map, CAST_USER_ADDR_T(data), len, + false /* src_destroy */, ©); + + assert( err == KERN_SUCCESS ); + if (err == KERN_SUCCESS) { + *buf = (char *) copy; + } + + return err; +} + +/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ + +IOReturn +IOUserServer::copyOutObjects(IORPCMessageMach * mach, IORPCMessage * message, + size_t size, bool consume) +{ + uint64_t refs; + uint32_t idx, maxObjectCount; + ipc_port_t port; + OSObject * object; + size_t descsize; + mach_msg_port_descriptor_t * desc; + mach_msg_ool_descriptor_t * ool; + vm_map_copy_t copy; + void * address; + mach_msg_size_t length; + kern_return_t kr; + OSSerialize * s; + + refs = message->objectRefs; + maxObjectCount = MAX_OBJECT_COUNT(mach, size, message); +// assert(refs <= mach->msgh_body.msgh_descriptor_count); +// assert(refs <= maxObjectCount); + if (refs > mach->msgh_body.msgh_descriptor_count) { + return kIOReturnBadArgument; + } + if (refs > maxObjectCount) { + return kIOReturnBadArgument; + } + + desc = &mach->objects[0]; + for (idx = 0; idx < refs; idx++) { + object = (OSObject *) message->objects[idx]; + + switch (desc->type) { + case MACH_MSG_PORT_DESCRIPTOR: + descsize = sizeof(mach_msg_port_descriptor_t); + port = NULL; + if (object) { + port = copySendRightForObject(object, IKOT_UEXT_OBJECT); + if (!port) { + break; + } + if (consume) { + object->release(); + } + message->objects[idx] = 0; + } +// desc->type = MACH_MSG_PORT_DESCRIPTOR; + desc->disposition = MACH_MSG_TYPE_MOVE_SEND; + desc->name = port; + desc->pad2 = 0; + desc->pad_end = 0; + break; + + case MACH_MSG_OOL_DESCRIPTOR: + descsize = sizeof(mach_msg_ool_descriptor_t); + + length = 0; + address = NULL; + if (object) { + s = OSSerialize::binaryWithCapacity(4096); + assert(s); + if (!s) { + break; + } + s->setIndexed(true); + if (!object->serialize(s)) { + assert(false); + descsize = -1UL; + s->release(); + break; + } + length = s->getLength(); + kr = copyoutkdata(s->text(), length, &address); + s->release(); + if (KERN_SUCCESS != kr) { + descsize = -1UL; + address = NULL; + length = 0; + } + if (consume) { + object->release(); + } + message->objects[idx] = 0; + } + ool = (typeof(ool))desc; +// ool->type = MACH_MSG_OOL_DESCRIPTOR; + ool->deallocate = false; + ool->copy = MACH_MSG_PHYSICAL_COPY; + ool->size = length; + ool->address = address; + break; + + default: + descsize = -1UL; + break; + } + if (-1UL == descsize) { + break; + } + desc = (typeof(desc))(((uintptr_t) desc) + descsize); + } + + if (idx >= refs) { + return kIOReturnSuccess; + } + + desc = &mach->objects[0]; + while (idx--) { + switch (desc->type) { + case MACH_MSG_PORT_DESCRIPTOR: + descsize = sizeof(mach_msg_port_descriptor_t); + port = desc->name; + if (port) { + ipc_port_release_send(port); + } + break; + + case MACH_MSG_OOL_DESCRIPTOR: + descsize = sizeof(mach_msg_ool_descriptor_t); + ool = (typeof(ool))desc; + copy = (vm_map_copy_t) ool->address; + if (copy) { + vm_map_copy_discard(copy); + } + break; + + default: + descsize = -1UL; + break; + } + if (-1UL == descsize) { + break; + } + desc = (typeof(desc))(((uintptr_t) desc) + descsize); + } + + return kIOReturnBadArgument; +} + +IOReturn +IOUserServer::copyInObjects(IORPCMessageMach * mach, IORPCMessage * message, + size_t size, bool copyObjects, bool consumePorts) +{ + uint64_t refs; + uint32_t idx, maxObjectCount; + ipc_port_t port; + OSObject * object; + size_t descsize; + mach_msg_port_descriptor_t * desc; + mach_msg_ool_descriptor_t * ool; + vm_map_address_t copyoutdata; + kern_return_t kr; + + refs = message->objectRefs; + maxObjectCount = MAX_OBJECT_COUNT(mach, size, message); +// assert(refs <= mach->msgh_body.msgh_descriptor_count); +// assert(refs <= maxObjectCount); + if (refs > mach->msgh_body.msgh_descriptor_count) { + return kIOReturnBadArgument; + } + if (refs > maxObjectCount) { + return kIOReturnBadArgument; + } + + desc = &mach->objects[0]; + for (idx = 0; idx < refs; idx++) { + switch (desc->type) { + case MACH_MSG_PORT_DESCRIPTOR: + descsize = sizeof(mach_msg_port_descriptor_t); + + object = NULL; + port = desc->name; + if (port) { + if (copyObjects) { + object = copyObjectForSendRight(port, IKOT_UEXT_OBJECT); + if (!object) { + descsize = -1UL; + break; + } + } + if (consumePorts) { + ipc_port_release_send(port); + } + } + break; + + case MACH_MSG_OOL_DESCRIPTOR: + descsize = sizeof(mach_msg_ool_descriptor_t); + ool = (typeof(ool))desc; + + object = NULL; + if (copyObjects && ool->size && ool->address) { + kr = vm_map_copyout(kernel_map, ©outdata, (vm_map_copy_t) ool->address); + if (KERN_SUCCESS == kr) { + object = OSUnserializeXML((const char *) copyoutdata, ool->size); + // vm_map_copyout() has consumed the vm_map_copy_t in the message + ool->size = 0; + ool->address = NULL; + kr = vm_deallocate(kernel_map, copyoutdata, ool->size); + assert(KERN_SUCCESS == kr); + } + if (!object) { + descsize = -1UL; + break; + } + } + break; + + default: + descsize = -1UL; + break; + } + if (-1UL == descsize) { + break; + } + if (copyObjects) { + message->objects[idx] = (OSObjectRef) object; + } + desc = (typeof(desc))(((uintptr_t) desc) + descsize); + } + + if (idx >= refs) { + return kIOReturnSuccess; + } + + while (idx--) { + object = (OSObject *) message->objects[idx]; + object->release(); + message->objects[idx] = 0; + } + + return kIOReturnBadArgument; +} + +IOReturn +IOUserServer::consumeObjects(IORPCMessage * message, size_t messageSize) +{ + uint64_t refs, idx; + OSObject * object; + + refs = message->objectRefs; + for (idx = 0; idx < refs; idx++) { + object = (OSObject *) message->objects[idx]; + if (object) { + object->release(); + message->objects[idx] = 0; + } + } + + return kIOReturnSuccess; +} + +/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ + +bool +IOUserServer::finalize(IOOptionBits options) +{ + OSArray * services; + + if (kIODKLogSetup & gIODKDebug) { + DKLOG("%s::finalize(%p)\n", getName(), this); + } + + IOLockLock(gIOUserServerLock); + OSSafeReleaseNULL(fRootQueue); + IOLockUnlock(gIOUserServerLock); + + services = NULL; + IOLockLock(fLock); + if (fServices) { + services = OSArray::withArray(fServices); + } + IOLockUnlock(fLock); + + if (services) { + services->iterateObjects(^bool (OSObject * obj) { + IOService * service; + IOService * provider; + bool started = false; + + service = (IOService *) obj; + if (kIODKLogSetup & gIODKDebug) { + DKLOG("%s::terminate(" DKS ")\n", getName(), DKN(service)); + } + if (service->reserved->uvars) { + started = service->reserved->uvars->started; + service->reserved->uvars->serverDied = true; + if (started) { + provider = service->getProvider(); + serviceDidStop(service, provider); + service->terminate(kIOServiceTerminateNeedWillTerminate | kIOServiceTerminateWithRematch); + } + } + if (!started) { + DKLOG("%s::terminate(" DKS ") server exit before start()\n", getName(), DKN(service)); + serviceStop(service, NULL); + } + return false; + }); + services->release(); + } + + return IOUserClient::finalize(options); +} + +/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ + +#undef super +#define super IOUserClient + +OSDefineMetaClassAndStructors(IOUserServer, IOUserClient) + +/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ + +IOUserClient * IOUserServer::withTask(task_t owningTask) +{ + IOUserServer * inst; + + inst = new IOUserServer; + if (inst && !inst->init()) { + inst->release(); + inst = NULL; + return inst; + } + inst->PMinit(); + + inst->fOwningTask = current_task(); + inst->fEntitlements = IOUserClient::copyClientEntitlements(inst->fOwningTask); + + if (!(kIODKDisableEntitlementChecking & gIODKDebug)) { + if (!inst->fEntitlements || !inst->fEntitlements->getObject(gIODriverKitEntitlementKey)) { + proc_t p; + pid_t pid; + + p = (proc_t)get_bsdtask_info(inst->fOwningTask); + if (p) { + pid = proc_pid(p); + IOLog(kIODriverKitEntitlementKey " entitlement check failed for %s[%d]\n", proc_best_name(p), pid); + } + inst->release(); + inst = NULL; + return inst; + } + } + + inst->fLock = IOLockAlloc(); + inst->fServices = OSArray::withCapacity(4); + inst->fClasses = OSDictionary::withCapacity(16); + inst->fClasses->setOptions(OSCollection::kSort, OSCollection::kSort); + + return inst; +} + +IOReturn +IOUserServer::clientClose(void) +{ + terminate(); + return kIOReturnSuccess; +} + +IOReturn +IOUserServer::setProperties(OSObject * properties) +{ + IOReturn kr = kIOReturnUnsupported; + return kr; +} + +void +IOUserServer::stop(IOService * provider) +{ + fOwningTask = TASK_NULL; + + PMstop(); + + IOServicePH::serverRemove(this); + + OSSafeReleaseNULL(fRootQueue); + + if (fInterruptLock) { + IOSimpleLockFree(fInterruptLock); + } +} + +void +IOUserServer::free() +{ + OSSafeReleaseNULL(fEntitlements); + OSSafeReleaseNULL(fClasses); + if (fLock) { + IOLockFree(fLock); + } + OSSafeReleaseNULL(fServices); + IOUserClient::free(); +} + +IOReturn +IOUserServer::registerClass(OSClassDescription * desc, uint32_t size, OSUserMetaClass ** pCls) +{ + OSUserMetaClass * cls; + const OSSymbol * sym; + uint64_t * methodOptions; + const char * queueNames; + uint32_t methodOptionsEnd, queueNamesEnd; + IOReturn ret = kIOReturnSuccess; + + if (size < sizeof(OSClassDescription)) { + assert(false); + return kIOReturnBadArgument; + } + + if (kIODKLogSetup & gIODKDebug) { + DKLOG("%s::registerClass %s, %d, %d\n", getName(), desc->name, desc->queueNamesSize, desc->methodNamesSize); + } + + if (desc->descriptionSize != size) { + assert(false); + return kIOReturnBadArgument; + } + if (os_add_overflow(desc->queueNamesOffset, desc->queueNamesSize, &queueNamesEnd)) { + assert(false); + return kIOReturnBadArgument; + } + if (queueNamesEnd > size) { + assert(false); + return kIOReturnBadArgument; + } + if (os_add_overflow(desc->methodOptionsOffset, desc->methodOptionsSize, &methodOptionsEnd)) { + assert(false); + return kIOReturnBadArgument; + } + if (methodOptionsEnd > size) { + assert(false); + return kIOReturnBadArgument; + } + // overlaps? + if ((desc->queueNamesOffset >= desc->methodOptionsOffset) && (desc->queueNamesOffset < methodOptionsEnd)) { + assert(false); + return kIOReturnBadArgument; + } + if ((queueNamesEnd >= desc->methodOptionsOffset) && (queueNamesEnd < methodOptionsEnd)) { + assert(false); + return kIOReturnBadArgument; + } + + if (desc->methodOptionsSize & ((2 * sizeof(uint64_t)) - 1)) { + assert(false); + return kIOReturnBadArgument; + } + if (sizeof(desc->name) == strnlen(desc->name, sizeof(desc->name))) { + assert(false); + return kIOReturnBadArgument; + } + if (sizeof(desc->superName) == strnlen(desc->superName, sizeof(desc->superName))) { + assert(false); + return kIOReturnBadArgument; + } + + cls = OSTypeAlloc(OSUserMetaClass); + assert(cls); + if (!cls) { + return kIOReturnNoMemory; + } + + cls->description = (typeof(cls->description))IOMalloc(size); + assert(cls->description); + if (!cls->description) { + assert(false); + cls->release(); + return kIOReturnNoMemory; + } + bcopy(desc, cls->description, size); + + cls->methodCount = desc->methodOptionsSize / (2 * sizeof(uint64_t)); + cls->methods = IONew(uint64_t, 2 * cls->methodCount); + if (!cls->methods) { + assert(false); + cls->release(); + return kIOReturnNoMemory; + } + + methodOptions = (typeof(methodOptions))(((uintptr_t) desc) + desc->methodOptionsOffset); + bcopy(methodOptions, cls->methods, 2 * cls->methodCount * sizeof(uint64_t)); + + queueNames = (typeof(queueNames))(((uintptr_t) desc) + desc->queueNamesOffset); + cls->queueNames = copyInStringArray(queueNames, desc->queueNamesSize); + + sym = OSSymbol::withCString(desc->name); + assert(sym); + if (!sym) { + assert(false); + cls->release(); + return kIOReturnNoMemory; + } + + cls->name = sym; + cls->meta = OSMetaClass::copyMetaClassWithName(sym); + cls->superMeta = OSDynamicCast(OSUserMetaClass, fClasses->getObject(desc->superName)); + fClasses->setObject(sym, cls); + cls->release(); + + *pCls = cls; + + return ret; +} + +IOReturn +IOUserServer::setRootQueue(IODispatchQueue * queue) +{ + assert(!fRootQueue); + if (fRootQueue) { + return kIOReturnStillOpen; + } + queue->retain(); + fRootQueue = queue; + + return kIOReturnSuccess; +} + +IOReturn +IOUserServer::externalMethod(uint32_t selector, IOExternalMethodArguments * args, + IOExternalMethodDispatch * dispatch, OSObject * target, void * reference) +{ + IOReturn ret = kIOReturnBadArgument; + mach_port_name_t portname; + + switch (selector) { + case kIOUserServerMethodRegisterClass: + { + OSUserMetaClass * cls; + if (!args->structureInputSize) { + return kIOReturnBadArgument; + } + if (args->scalarOutputCount != 2) { + return kIOReturnBadArgument; + } + ret = registerClass((OSClassDescription *) args->structureInput, args->structureInputSize, &cls); + if (kIOReturnSuccess == ret) { + portname = iokit_make_send_right(fOwningTask, cls, IKOT_UEXT_OBJECT); + assert(portname); + args->scalarOutput[0] = portname; + args->scalarOutput[1] = kOSObjectRPCRemote; + } + break; + } + case kIOUserServerMethodStart: + { + if (args->scalarOutputCount != 1) { + return kIOReturnBadArgument; + } + portname = iokit_make_send_right(fOwningTask, this, IKOT_UEXT_OBJECT); + assert(portname); + args->scalarOutput[0] = portname; + ret = kIOReturnSuccess; + break; + } + default: + break; + } + + return ret; +} + +IOExternalTrap * +IOUserServer::getTargetAndTrapForIndex( IOService **targetP, UInt32 index ) +{ + static const IOExternalTrap trapTemplate[] = { + { NULL, (IOTrap) & IOUserServer::waitInterruptTrap}, + }; + if (index >= (sizeof(trapTemplate) / sizeof(IOExternalTrap))) { + return NULL; + } + *targetP = this; + return (IOExternalTrap *)&trapTemplate[index]; +} + +/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ + +IOReturn +IOUserServer::serviceAttach(IOService * service, IOService * provider) +{ + IOReturn ret; + OSObjectUserVars * vars; + OSObject * prop; + OSString * str; + OSSymbolConstPtr bundleID; + char execPath[1024]; + + vars = IONewZero(OSObjectUserVars, 1); + service->reserved->uvars = vars; + + vars->userServer = this; + vars->userServer->retain(); + IOLockLock(fLock); + if (-1U == fServices->getNextIndexOfObject(service, 0)) { + fServices->setObject(service); + } + IOLockUnlock(fLock); + + prop = service->copyProperty(gIOUserClassKey); + str = OSDynamicCast(OSString, prop); + if (str) { + service->setName(str); + } + OSSafeReleaseNULL(prop); + + prop = service->copyProperty(gIOModuleIdentifierKey); + bundleID = OSDynamicCast(OSSymbol, prop); + if (bundleID) { + execPath[0] = 0; + bool ok = OSKext::copyUserExecutablePath(bundleID, execPath, sizeof(execPath)); + if (ok) { + ret = LoadModule(execPath); + if (kIODKLogSetup & gIODKDebug) { + DKLOG("%s::LoadModule 0x%x %s\n", getName(), ret, execPath); + } + } + } + OSSafeReleaseNULL(prop); + + ret = kIOReturnSuccess; + + return ret; +} + +/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ + +#define kDriverKitUCPrefix "com.apple.developer.driverkit.userclient-access." + +IOReturn +IOUserServer::serviceNewUserClient(IOService * service, task_t owningTask, void * securityID, + uint32_t type, OSDictionary * properties, IOUserClient ** handler) +{ + IOReturn ret; + IOUserClient * uc; + IOUserUserClient * userUC; + OSDictionary * entitlements; + OSObject * prop; + OSObject * bundleID; + bool ok; + + *handler = NULL; + ret = service->NewUserClient(type, &uc); + if (kIOReturnSuccess != ret) { + return ret; + } + userUC = OSDynamicCast(IOUserUserClient, uc); + if (!userUC) { + uc->terminate(); + OSSafeReleaseNULL(uc); + return kIOReturnUnsupported; + } + userUC->setTask(owningTask); + + if (!(kIODKDisableEntitlementChecking & gIODKDebug)) { + entitlements = IOUserClient::copyClientEntitlements(owningTask); + bundleID = service->copyProperty(gIOModuleIdentifierKey); + ok = (entitlements + && bundleID + && (prop = entitlements->getObject(gIODriverKitUserClientEntitlementsKey))); + if (ok) { + bool found __block = false; + ok = prop->iterateObjects(^bool (OSObject * object) { + found = object->isEqualTo(bundleID); + return found; + }); + ok = found; + } + if (ok) { + prop = userUC->copyProperty(gIOServiceDEXTEntitlementsKey); + ok = checkEntitlements(entitlements, prop, NULL, NULL); + } + OSSafeReleaseNULL(bundleID); + OSSafeReleaseNULL(entitlements); + if (!ok) { + DKLOG(DKS ":UC entitlements check failed\n", DKN(userUC)); + uc->terminate(); + OSSafeReleaseNULL(uc); + return kIOReturnNotPermitted; + } + } + + ret = userUC->Start(service); + if (kIOReturnSuccess != ret) { + userUC->detach(this); + userUC->release(); + return ret; + } + + *handler = userUC; + + return ret; +} + +/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ + +static IOPMPowerState + sPowerStates[] = { + { .version = kIOPMPowerStateVersion1, + .capabilityFlags = 0, + .outputPowerCharacter = 0, + .inputPowerRequirement = 0}, + { .version = kIOPMPowerStateVersion1, + .capabilityFlags = kIOPMLowPower, + .outputPowerCharacter = kIOPMLowPower, + .inputPowerRequirement = kIOPMLowPower}, + { .version = kIOPMPowerStateVersion1, + .capabilityFlags = kIOPMPowerOn, + .outputPowerCharacter = kIOPMPowerOn, + .inputPowerRequirement = kIOPMPowerOn}, +}; + +IOReturn +IOUserServer::setPowerState(unsigned long state, IOService * service) +{ + if (kIODKLogPM & gIODKDebug) { + DKLOG(DKS "::setPowerState(%ld) %d\n", DKN(service), state, fSystemPowerAck); + } + return kIOPMAckImplied; +} + +IOReturn +IOUserServer::powerStateWillChangeTo(IOPMPowerFlags flags, unsigned long state, IOService * service) +{ + IOReturn ret; + + if (service->reserved->uvars) { + if (!fSystemOff && !(kIODKDisablePM & gIODKDebug)) { + service->reserved->uvars->willPower = true; + if (kIODKLogPM & gIODKDebug) { + DKLOG(DKS "::powerStateWillChangeTo(%ld) 0x%qx, %d\n", DKN(service), state, fPowerStates, fSystemPowerAck); + } + ret = service->SetPowerState(flags); + if (kIOReturnSuccess == ret) { + return 20 * 1000 * 1000; + } + } + service->reserved->uvars->willPower = false; + } + + return kIOPMAckImplied; +} + +IOReturn +IOUserServer::powerStateDidChangeTo(IOPMPowerFlags flags, unsigned long state, IOService * service) +{ + unsigned int idx; + bool pmAck; + + pmAck = false; + IOLockLock(fLock); + idx = fServices->getNextIndexOfObject(service, 0); + if (-1U == idx) { + IOLockUnlock(fLock); + return kIOPMAckImplied; + } + assert(idx <= 63); + + if (state) { + fPowerStates |= (1ULL << idx); + } else { + fPowerStates &= ~(1ULL << idx); + } + if (kIODKLogPM & gIODKDebug) { + DKLOG(DKS "::powerStateDidChangeTo(%ld) 0x%qx, %d\n", DKN(service), state, fPowerStates, fSystemPowerAck); + } + if (!fPowerStates && (pmAck = fSystemPowerAck)) { + fSystemPowerAck = false; + fSystemOff = true; + } + IOLockUnlock(fLock); + + if (pmAck) { + IOServicePH::serverAck(this); + } + + return kIOPMAckImplied; +} + +kern_return_t +IMPL(IOService, SetPowerState) +{ + if (kIODKLogPM & gIODKDebug) { + DKLOG(DKS "::SetPowerState(%d), %d\n", DKN(this), powerFlags, reserved->uvars->willPower); + } + if (reserved->uvars + && reserved->uvars->userServer + && reserved->uvars->willPower) { + reserved->uvars->willPower = false; + acknowledgePowerChange(reserved->uvars->userServer); + return kIOReturnSuccess; + } + return kIOReturnNotReady; +} + +kern_return_t +IMPL(IOService, ChangePowerState) +{ + switch (powerFlags) { + case kIOServicePowerCapabilityOff: + changePowerStateToPriv(0); + break; + case kIOServicePowerCapabilityLow: + changePowerStateToPriv(1); + break; + case kIOServicePowerCapabilityOn: + changePowerStateToPriv(2); + break; + default: + return kIOReturnBadArgument; + } + + return kIOReturnSuccess; +} + +kern_return_t +IMPL(IOService, Create) +{ + OSObject * inst; + IOService * service; + OSString * str; + const OSSymbol * sym; + OSObject * prop; + OSDictionary * properties; + kern_return_t ret; + + if (provider != this) { + return kIOReturnUnsupported; + } + + ret = kIOReturnUnsupported; + inst = NULL; + service = NULL; + + prop = copyProperty(propertiesKey); + properties = OSDynamicCast(OSDictionary, prop); + assert(properties); + if (properties) { + str = OSDynamicCast(OSString, properties->getObject(gIOClassKey)); + assert(str); + sym = OSSymbol::withString(str); + if (sym) { + inst = OSMetaClass::allocClassWithName(sym); + service = OSDynamicCast(IOService, inst); + if (service && service->init(properties) && service->attach(this)) { + reserved->uvars->userServer->serviceAttach(service, this); + ret = kIOReturnSuccess; + *result = service; + } + OSSafeReleaseNULL(sym); + } + } + + OSSafeReleaseNULL(prop); + if (kIOReturnSuccess != ret) { + OSSafeReleaseNULL(inst); + } + + return ret; +} + +kern_return_t +IMPL(IOService, NewUserClient) +{ + return kIOReturnError; +} + +kern_return_t +IMPL(IOService, SearchProperty) +{ + OSObject * object; + + if (kIOServiceSearchPropertyParents & options) { + options = kIORegistryIterateParents | kIORegistryIterateRecursively; + } else { + options = 0; + } + + object = copyProperty(name, IORegistryEntry::getPlane(plane), options); + *property = object; + + return object ? kIOReturnSuccess : kIOReturnNotFound; +} + +void +IOUserServer::systemPower(bool powerOff) +{ + OSArray * services; + + if (kIODKLogPM & gIODKDebug) { + DKLOG("%s::powerOff(%d) 0x%qx\n", getName(), powerOff, fPowerStates); + } + + IOLockLock(fLock); + services = OSArray::withArray(fServices); + + if (powerOff) { + fSystemPowerAck = (0 != fPowerStates); + if (!fSystemPowerAck) { + fSystemOff = true; + } + IOLockUnlock(fLock); + + if (!fSystemPowerAck) { + IOServicePH::serverAck(this); + } else { + if (services) { + services->iterateObjects(^bool (OSObject * obj) { + IOService * service; + service = (IOService *) obj; + if (kIODKLogPM & gIODKDebug) { + DKLOG("changePowerStateWithOverrideTo(" DKS ", %d)\n", DKN(service), 0); + } + service->reserved->uvars->powerOverride = service->getPowerState(); + service->changePowerStateWithOverrideTo(0, 0); + return false; + }); + } + } + } else { + fSystemOff = false; + IOLockUnlock(fLock); + if (services) { + services->iterateObjects(^bool (OSObject * obj) { + IOService * service; + service = (IOService *) obj; + if (-1U != service->reserved->uvars->powerOverride) { + if (kIODKLogPM & gIODKDebug) { + DKLOG("changePowerStateWithOverrideTo(" DKS ", %d)\n", DKN(service), service->reserved->uvars->powerOverride); + } + service->changePowerStateWithOverrideTo(service->reserved->uvars->powerOverride, 0); + service->reserved->uvars->powerOverride = -1U; + } + return false; + }); + } + } + OSSafeReleaseNULL(services); +} + + + +IOReturn +IOUserServer::serviceStarted(IOService * service, IOService * provider, bool result) +{ + IOReturn ret; + IOService * pmProvider; + + DKLOG(DKS "::start(" DKS ") %s\n", DKN(service), DKN(provider), result ? "ok" : "fail"); + + if (!result) { + ret = kIOReturnSuccess; + return ret; + } + + if (!fRootNotifier) { + ret = registerPowerDriver(this, sPowerStates, sizeof(sPowerStates) / sizeof(sPowerStates[0])); + assert(kIOReturnSuccess == ret); + IOServicePH::serverAdd(this); + fRootNotifier = true; + } + + if (!(kIODKDisablePM & gIODKDebug) && !service->pm_vars) { + service->PMinit(); + ret = service->registerPowerDriver(this, sPowerStates, sizeof(sPowerStates) / sizeof(sPowerStates[0])); + assert(kIOReturnSuccess == ret); + + pmProvider = service; + while (pmProvider && !pmProvider->inPlane(gIOPowerPlane)) { + pmProvider = pmProvider->getProvider(); + } + if (pmProvider) { + OSObject * prop; + OSString * str; + prop = pmProvider->copyProperty("non-removable"); + if (prop) { + str = OSDynamicCast(OSString, prop); + if (str && str->isEqualTo("yes")) { + pmProvider = NULL; + } + prop->release(); + } + } + if (pmProvider) { + IOLockLock(fLock); + unsigned int idx = fServices->getNextIndexOfObject(service, 0); + assert(idx <= 63); + fPowerStates |= (1ULL << idx); + IOLockUnlock(fLock); + + pmProvider->joinPMtree(service); + service->reserved->uvars->userServerPM = true; + } + } + + service->registerInterestedDriver(this); + service->reserved->uvars->started = true; + + return kIOReturnSuccess; +} + + +IOReturn +IOUserServer::serviceOpen(IOService * provider, IOService * client) +{ + OSObjectUserVars * uvars; + + uvars = client->reserved->uvars; + if (!uvars->openProviders) { + uvars->openProviders = OSArray::withObjects((const OSObject **) &provider, 1); + } else if (-1U == uvars->openProviders->getNextIndexOfObject(client, 0)) { + uvars->openProviders->setObject(provider); + } + + return kIOReturnSuccess; +} + +IOReturn +IOUserServer::serviceClose(IOService * provider, IOService * client) +{ + OSObjectUserVars * uvars; + unsigned int idx; + + uvars = client->reserved->uvars; + if (!uvars->openProviders) { + return kIOReturnNotOpen; + } + idx = uvars->openProviders->getNextIndexOfObject(client, 0); + if (-1U == idx) { + return kIOReturnNotOpen; + } + uvars->openProviders->removeObject(idx); + + return kIOReturnSuccess; +} + + +IOReturn +IOUserServer::serviceStop(IOService * service, IOService *) +{ + IOReturn ret; + uint32_t idx, queueAlloc; + OSObjectUserVars * uvars; + + IOLockLock(fLock); + idx = fServices->getNextIndexOfObject(service, 0); + if (-1U != idx) { + fServices->removeObject(idx); + uvars = service->reserved->uvars; + uvars->stopped = true; + } + IOLockUnlock(fLock); + + if (-1U == idx) { + return kIOReturnSuccess; + } + + IOMachPortDestroyUserReferences(service, IKOT_UEXT_OBJECT); + + if (uvars->queueArray && uvars->userMeta) { + queueAlloc = 1; + if (uvars->userMeta->queueNames) { + queueAlloc += uvars->userMeta->queueNames->count; + } + for (idx = 0; idx < queueAlloc; idx++) { + OSSafeReleaseNULL(uvars->queueArray[idx]); + } + IOSafeDeleteNULL(uvars->queueArray, IODispatchQueue *, queueAlloc); + } + + (void) service->deRegisterInterestedDriver(this); + if (uvars->userServerPM) { + service->PMstop(); + } + + ret = kIOReturnSuccess; + return ret; +} + +void +IOUserServer::serviceFree(IOService * service) +{ + OSObjectUserVars * uvars; + + uvars = service->reserved->uvars; + if (!uvars) { + return; + } + OSSafeReleaseNULL(uvars->userServer); + IOSafeDeleteNULL(service->reserved->uvars, OSObjectUserVars, 1); +} + +void +IOUserServer::serviceWillTerminate(IOService * client, IOService * provider, IOOptionBits options) +{ + IOReturn ret; + bool willTerminate; + + willTerminate = false; + if (client->lockForArbitration(true)) { + if (!client->reserved->uvars->serverDied + && !client->reserved->uvars->willTerminate) { + client->reserved->uvars->willTerminate = true; + willTerminate = true; + } + client->unlockForArbitration(); + } + + if (willTerminate) { + ret = client->Stop(provider); + if (kIOReturnSuccess != ret) { + ret = client->IOService::Stop(provider); + } + } +} + +void +IOUserServer::serviceDidTerminate(IOService * client, IOService * provider, IOOptionBits options, bool * defer) +{ + if (client->lockForArbitration(true)) { + client->reserved->uvars->didTerminate = true; + if (!client->reserved->uvars->serverDied + && !client->reserved->uvars->stopped) { + *defer = true; + } + client->unlockForArbitration(); + } +} + +void +IOUserServer::serviceDidStop(IOService * client, IOService * provider) +{ + bool complete; + OSArray * closeArray; + + complete = false; + closeArray = NULL; + + if (client->lockForArbitration(true)) { + if (client->reserved->uvars + && client->reserved->uvars->willTerminate + && !client->reserved->uvars->stopped) { + client->reserved->uvars->stopped = true; + complete = client->reserved->uvars->didTerminate; + } + + if (client->reserved->uvars) { + closeArray = client->reserved->uvars->openProviders; + client->reserved->uvars->openProviders = NULL; + } + client->unlockForArbitration(); + if (closeArray) { + closeArray->iterateObjects(^bool (OSObject * obj) { + IOService * toClose; + toClose = OSDynamicCast(IOService, obj); + if (toClose) { + DKLOG(DKS ":force close (" DKS ")\n", DKN(client), DKN(toClose)); + toClose->close(client); + } + return false; + }); + closeArray->release(); + } + } + if (complete) { + bool defer = false; + client->didTerminate(provider, 0, &defer); + } +} + +kern_return_t +IMPL(IOService, Stop) +{ + IOUserServer::serviceDidStop(this, provider); + + return kIOReturnSuccess; +} + +kern_return_t +IMPL(IOInterruptDispatchSource, Cancel) +{ + return kIOReturnUnsupported; +} + +/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ + +#undef super +#define super IOUserClient + +OSDefineMetaClassAndStructors(IOUserUserClient, IOUserClient) + +/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ + +IOReturn +IOUserUserClient::setTask(task_t task) +{ + task_reference(task); + fTask = task; + + return kIOReturnSuccess; +} + +void +IOUserUserClient::stop(IOService * provider) +{ + if (fTask) { + task_deallocate(fTask); + fTask = NULL; + } + super::stop(provider); +} + +IOReturn +IOUserUserClient::clientClose(void) +{ + terminate(); + return kIOReturnSuccess; +} + +IOReturn +IOUserUserClient::setProperties(OSObject * properties) +{ + IOReturn ret = kIOReturnUnsupported; + return ret; +} + +struct IOUserUserClientActionRef { + OSAsyncReference64 asyncRef; +}; + +void +IMPL(IOUserClient, KernelCompletion) +{ + IOUserUserClientActionRef * ref; + + ref = (typeof(ref))action->GetReference(); + + IOUserClient::sendAsyncResult64(ref->asyncRef, status, (io_user_reference_t *) asyncData, asyncDataCount); +} + +kern_return_t +IMPL(IOUserClient, _ExternalMethod) +{ + return kIOReturnUnsupported; +} + +IOReturn +IOUserUserClient::clientMemoryForType(UInt32 type, + IOOptionBits * koptions, + IOMemoryDescriptor ** kmemory) +{ + IOReturn kr; + uint64_t options; + IOMemoryDescriptor * memory; + + kr = CopyClientMemoryForType(type, &options, &memory); + + *koptions = 0; + *kmemory = NULL; + if (kIOReturnSuccess != kr) { + return kr; + } + + if (kIOUserClientMemoryReadOnly & options) { + *koptions |= kIOMapReadOnly; + } + *kmemory = memory; + + return kr; +} + +IOReturn +IOUserUserClient::externalMethod(uint32_t selector, IOExternalMethodArguments * args, + IOExternalMethodDispatch * dispatch, OSObject * target, void * reference) +{ + IOReturn kr; + OSData * structureInput; + OSData * structureOutput; + size_t copylen; + uint64_t structureOutputSize; + OSAction * action; + IOUserUserClientActionRef * ref; + + kr = kIOReturnUnsupported; + structureInput = NULL; + action = NULL; + + if (args->structureInputSize) { + structureInput = OSData::withBytesNoCopy((void *) args->structureInput, args->structureInputSize); + } + + if (MACH_PORT_NULL != args->asyncWakePort) { + kr = CreateActionKernelCompletion(sizeof(IOUserUserClientActionRef), &action); + assert(KERN_SUCCESS == kr); + ref = (typeof(ref))action->GetReference(); + bcopy(args->asyncReference, &ref->asyncRef[0], args->asyncReferenceCount * sizeof(ref->asyncRef[0])); + } + + if (args->structureVariableOutputData) { + structureOutputSize = kIOUserClientVariableStructureSize; + } else if (args->structureOutputDescriptor) { + structureOutputSize = args->structureOutputDescriptor->getLength(); + } else { + structureOutputSize = args->structureOutputSize; + } + + kr = _ExternalMethod(selector, &args->scalarInput[0], args->scalarInputCount, + structureInput, args->structureInputDescriptor, + args->scalarOutput, &args->scalarOutputCount, + structureOutputSize, &structureOutput, args->structureOutputDescriptor, + action); + + OSSafeReleaseNULL(structureInput); + OSSafeReleaseNULL(action); + + if (kIOReturnSuccess != kr) { + return kr; + } + if (structureOutput) { + if (args->structureVariableOutputData) { + *args->structureVariableOutputData = structureOutput; + } else { + copylen = structureOutput->getLength(); + if (copylen > args->structureOutputSize) { + kr = kIOReturnBadArgument; + } else { + bcopy((const void *) structureOutput->getBytesNoCopy(), args->structureOutput, copylen); + } + OSSafeReleaseNULL(structureOutput); + } + } + + return kr; +} + +/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ diff --git a/iokit/Kernel/IOWorkLoop.cpp b/iokit/Kernel/IOWorkLoop.cpp index a9aff9f93..d046938d2 100644 --- a/iokit/Kernel/IOWorkLoop.cpp +++ b/iokit/Kernel/IOWorkLoop.cpp @@ -206,7 +206,7 @@ IOWorkLoop::workLoopWithOptions(IOOptionBits options) me->reserved = IONew(ExpansionData, 1); if (!me->reserved) { me->release(); - return 0; + return NULL; } bzero(me->reserved, sizeof(ExpansionData)); me->reserved->options = options; @@ -214,7 +214,7 @@ IOWorkLoop::workLoopWithOptions(IOOptionBits options) if (me && !me->init()) { me->release(); - return 0; + return NULL; } return me; @@ -250,45 +250,45 @@ IOWorkLoop::free() for (event = eventChain; event; event = next) { next = event->getNext(); - event->setWorkLoop(0); - event->setNext(0); + event->setWorkLoop(NULL); + event->setNext(NULL); event->release(); } - eventChain = 0; + eventChain = NULL; for (event = passiveEventChain; event; event = next) { next = event->getNext(); - event->setWorkLoop(0); - event->setNext(0); + event->setWorkLoop(NULL); + event->setNext(NULL); event->release(); } - passiveEventChain = 0; + passiveEventChain = NULL; // Either we have a partial initialization to clean up // or the workThread itself is performing hari-kari. // Either way clean up all of our resources and return. if (controlG) { - controlG->workLoop = 0; + controlG->workLoop = NULL; controlG->release(); - controlG = 0; + controlG = NULL; } if (workToDoLock) { IOSimpleLockFree(workToDoLock); - workToDoLock = 0; + workToDoLock = NULL; } if (gateLock) { IORecursiveLockFree(gateLock); - gateLock = 0; + gateLock = NULL; } IOStatisticsUnregisterCounter(); if (reserved) { IODelete(reserved, ExpansionData, 1); - reserved = 0; + reserved = NULL; } super::free(); @@ -457,7 +457,7 @@ restartThread: exitThread: closeGate(); thread_t thread = workThread; - workThread = 0; // Say we don't have a loop and free ourselves + workThread = NULL; // Say we don't have a loop and free ourselves openGate(); free(); @@ -589,7 +589,7 @@ IOWorkLoop::_maintRequest(void *inC, void *inD, void *, void *) inEvent->retain(); inEvent->setWorkLoop(this); - inEvent->setNext(0); + inEvent->setNext(NULL); /* Check if this is a passive or active event source being added */ if (eventSourcePerformsWork(inEvent)) { @@ -627,7 +627,7 @@ IOWorkLoop::_maintRequest(void *inC, void *inD, void *, void *) if (eventChain == inEvent) { eventChain = inEvent->getNext(); } else { - IOEventSource *event, *next = 0; + IOEventSource *event, *next = NULL; event = eventChain; if (event) { @@ -646,7 +646,7 @@ IOWorkLoop::_maintRequest(void *inC, void *inD, void *, void *) if (passiveEventChain == inEvent) { passiveEventChain = inEvent->getNext(); } else { - IOEventSource *event, *next = 0; + IOEventSource *event, *next = NULL; event = passiveEventChain; if (event) { @@ -663,8 +663,8 @@ IOWorkLoop::_maintRequest(void *inC, void *inD, void *, void *) } } - inEvent->setWorkLoop(0); - inEvent->setNext(0); + inEvent->setWorkLoop(NULL); + inEvent->setNext(NULL); inEvent->release(); SETP(&fFlags, kLoopRestart); } diff --git a/iokit/Kernel/RootDomainUserClient.cpp b/iokit/Kernel/RootDomainUserClient.cpp index 646ccec4e..632b97ae5 100644 --- a/iokit/Kernel/RootDomainUserClient.cpp +++ b/iokit/Kernel/RootDomainUserClient.cpp @@ -93,8 +93,6 @@ RootDomainUserClient::secureSleepSystemOptions( int local_priv = 0; int admin_priv = 0; IOReturn ret = kIOReturnNotPrivileged; - OSDictionary *unserializedOptions = NULL; - OSString *unserializeErrorString = NULL; ret = clientHasPrivilege(fOwningTask, kIOClientPrivilegeLocalUser); local_priv = (kIOReturnSuccess == ret); @@ -102,38 +100,38 @@ RootDomainUserClient::secureSleepSystemOptions( ret = clientHasPrivilege(fOwningTask, kIOClientPrivilegeAdministrator); admin_priv = (kIOReturnSuccess == ret); - - if (inOptions) { - unserializedOptions = OSDynamicCast( OSDictionary, - OSUnserializeXML((const char *)inOptions, inOptionsSize, &unserializeErrorString)); - - if (!unserializedOptions) { - IOLog("IOPMRootDomain SleepSystem unserialization failure: %s\n", - unserializeErrorString ? unserializeErrorString->getCStringNoCopy() : "Unknown"); - } - } - if ((local_priv || admin_priv) && fOwner) { + OSString *unserializeErrorString = NULL; + OSObject *unserializedObject = NULL; + OSDictionary *sleepOptionsDict = NULL; // do not release + proc_t p; p = (proc_t)get_bsdtask_info(fOwningTask); if (p) { fOwner->setProperty("SleepRequestedByPID", proc_pid(p), 32); } - if (unserializedOptions) { - // Publish Sleep Options in registry under root_domain - fOwner->setProperty( kRootDomainSleepOptionsKey, unserializedOptions); - - *returnCode = fOwner->sleepSystemOptions( unserializedOptions ); + if (inOptions) { + unserializedObject = OSUnserializeXML((const char *)inOptions, inOptionsSize, &unserializeErrorString); + sleepOptionsDict = OSDynamicCast( OSDictionary, unserializedObject); + if (!sleepOptionsDict) { + IOLog("IOPMRootDomain SleepSystem unserialization failure: %s\n", + unserializeErrorString ? unserializeErrorString->getCStringNoCopy() : "Unknown"); + } + } - unserializedOptions->release(); + if (sleepOptionsDict) { + // Publish Sleep Options in registry under root_domain + fOwner->setProperty( kRootDomainSleepOptionsKey, sleepOptionsDict); } else { // No options // Clear any pre-existing options fOwner->removeProperty( kRootDomainSleepOptionsKey ); - - *returnCode = fOwner->sleepSystemOptions( NULL ); } + + *returnCode = fOwner->sleepSystemOptions( sleepOptionsDict ); + OSSafeReleaseNULL(unserializedObject); + OSSafeReleaseNULL(unserializeErrorString); } else { *returnCode = kIOReturnNotPrivileged; } @@ -233,7 +231,7 @@ RootDomainUserClient::stop( IOService *provider) { if (fOwningTask) { task_deallocate(fOwningTask); - fOwningTask = 0; + fOwningTask = NULL; } super::stop(provider); diff --git a/iokit/Kernel/RootDomainUserClient.h b/iokit/Kernel/RootDomainUserClient.h index aea9ca375..84276c5b2 100644 --- a/iokit/Kernel/RootDomainUserClient.h +++ b/iokit/Kernel/RootDomainUserClient.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 1998-2000 Apple Computer, Inc. All rights reserved. + * Copyright (c) 1998-2019 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -44,7 +44,7 @@ class RootDomainUserClient : public IOUserClient { - OSDeclareDefaultStructors(RootDomainUserClient) + OSDeclareDefaultStructors(RootDomainUserClient); friend class IOPMrootDomain; private: diff --git a/iokit/KernelConfigTables.cpp b/iokit/KernelConfigTables.cpp index 9dda5e80d..2b14eda8f 100644 --- a/iokit/KernelConfigTables.cpp +++ b/iokit/KernelConfigTables.cpp @@ -40,9 +40,3 @@ const char * gIOKernelConfigTables = " 'IOProbeScore' = 0:32;" " }" ")"; - -/* This stuff is no longer used at all but was exported in prior - * releases, so we'll keep them around for PPC/i386 only. - * See libkern's OSKext.cpp for other symbols, which have been moved - * there for sanity. - */ diff --git a/iokit/Makefile b/iokit/Makefile index c9c3d03b6..e82361d0b 100644 --- a/iokit/Makefile +++ b/iokit/Makefile @@ -6,9 +6,9 @@ export MakeInc_dir=${SRCROOT}/makedefs/MakeInc.dir include $(MakeInc_cmd) include $(MakeInc_def) -INSTINC_SUBDIRS = IOKit +INSTINC_SUBDIRS = IOKit DriverKit System IOKitUser -EXPINC_SUBDIRS = IOKit +EXPINC_SUBDIRS = IOKit DriverKit COMP_SUBDIRS = conf diff --git a/iokit/System/IODataQueueDispatchSourceShared.h b/iokit/System/IODataQueueDispatchSourceShared.h new file mode 100644 index 000000000..bee716add --- /dev/null +++ b/iokit/System/IODataQueueDispatchSourceShared.h @@ -0,0 +1,594 @@ +typedef struct _IODataQueueEntry { + uint32_t size; + uint8_t data[0]; +} IODataQueueEntry; + +#define DATA_QUEUE_ENTRY_HEADER_SIZE sizeof(IODataQueueEntry) + +typedef struct _IODataQueueMemory { + volatile uint32_t head; + volatile uint32_t tail; + volatile uint8_t needServicedCallback; + volatile uint8_t _resv[31]; + IODataQueueEntry queue[0]; +} IODataQueueMemory; + +struct IODataQueueDispatchSource_IVars { + IODataQueueMemory * dataQueue; + IODataQueueDispatchSource * source; +// IODispatchQueue * queue; + IOMemoryDescriptor * memory; + OSAction * dataAvailableAction; + OSAction * dataServicedAction; + uint64_t options; + uint32_t queueByteCount; + +#if !KERNEL + bool enable; + bool canceled; +#endif +}; + +bool +IODataQueueDispatchSource::init() +{ + if (!super::init()) { + return false; + } + + ivars = IONewZero(IODataQueueDispatchSource_IVars, 1); + ivars->source = this; + +#if !KERNEL + kern_return_t ret; + + ret = CopyMemory(&ivars->memory); + assert(kIOReturnSuccess == ret); + + uint64_t address; + uint64_t length; + + ret = ivars->memory->Map(0, 0, 0, 0, &address, &length); + assert(kIOReturnSuccess == ret); + ivars->dataQueue = (typeof(ivars->dataQueue))(uintptr_t) address; + ivars->queueByteCount = length; +#endif + + return true; +} + +kern_return_t +IMPL(IODataQueueDispatchSource, CheckForWork) +{ + IOReturn ret = kIOReturnNotReady; + + return ret; +} + +#if KERNEL + +kern_return_t +IMPL(IODataQueueDispatchSource, Create) +{ + IODataQueueDispatchSource * inst; + IOBufferMemoryDescriptor * bmd; + + if (3 & queueByteCount) { + return kIOReturnBadArgument; + } + inst = OSTypeAlloc(IODataQueueDispatchSource); + if (!inst) { + return kIOReturnNoMemory; + } + if (!inst->init()) { + inst->release(); + return kIOReturnError; + } + + bmd = IOBufferMemoryDescriptor::withOptions( + kIODirectionOutIn | kIOMemoryKernelUserShared, + queueByteCount, page_size); + if (!bmd) { + inst->release(); + return kIOReturnNoMemory; + } + inst->ivars->memory = bmd; + inst->ivars->queueByteCount = queueByteCount; + inst->ivars->options = 0; + inst->ivars->dataQueue = (typeof(inst->ivars->dataQueue))bmd->getBytesNoCopy(); + + *source = inst; + + return kIOReturnSuccess; +} + +kern_return_t +IMPL(IODataQueueDispatchSource, CopyMemory) +{ + kern_return_t ret; + IOMemoryDescriptor * result; + + result = ivars->memory; + if (result) { + result->retain(); + ret = kIOReturnSuccess; + } else { + ret = kIOReturnNotReady; + } + *memory = result; + + return ret; +} + +kern_return_t +IMPL(IODataQueueDispatchSource, CopyDataAvailableHandler) +{ + kern_return_t ret; + OSAction * result; + + result = ivars->dataAvailableAction; + if (result) { + result->retain(); + ret = kIOReturnSuccess; + } else { + ret = kIOReturnNotReady; + } + *action = result; + + return ret; +} + +kern_return_t +IMPL(IODataQueueDispatchSource, CopyDataServicedHandler) +{ + kern_return_t ret; + OSAction * result; + + result = ivars->dataServicedAction; + if (result) { + result->retain(); + ret = kIOReturnSuccess; + } else { + ret = kIOReturnNotReady; + } + *action = result; + return ret; +} + +kern_return_t +IMPL(IODataQueueDispatchSource, SetDataAvailableHandler) +{ + IOReturn ret; + OSAction * oldAction; + + oldAction = ivars->dataAvailableAction; + if (oldAction && OSCompareAndSwapPtr(oldAction, NULL, &ivars->dataAvailableAction)) { + oldAction->release(); + } + if (action) { + action->retain(); + ivars->dataAvailableAction = action; + if (IsDataAvailable()) { + DataAvailable(ivars->dataAvailableAction); + } + } + ret = kIOReturnSuccess; + + return ret; +} + +kern_return_t +IMPL(IODataQueueDispatchSource, SetDataServicedHandler) +{ + IOReturn ret; + OSAction * oldAction; + + oldAction = ivars->dataServicedAction; + if (oldAction && OSCompareAndSwapPtr(oldAction, NULL, &ivars->dataServicedAction)) { + oldAction->release(); + } + if (action) { + action->retain(); + ivars->dataServicedAction = action; + } + ret = kIOReturnSuccess; + + return ret; +} + +#endif /* KERNEL */ + +void +IODataQueueDispatchSource::SendDataAvailable(void) +{ + IOReturn ret; + + if (!ivars->dataAvailableAction) { + ret = CopyDataAvailableHandler(&ivars->dataAvailableAction); + if (kIOReturnSuccess != ret) { + ivars->dataAvailableAction = NULL; + } + } + if (ivars->dataAvailableAction) { + DataAvailable(ivars->dataAvailableAction); + } +} + +void +IODataQueueDispatchSource::SendDataServiced(void) +{ + IOReturn ret; + + if (!ivars->dataServicedAction) { + ret = CopyDataServicedHandler(&ivars->dataServicedAction); + if (kIOReturnSuccess != ret) { + ivars->dataServicedAction = NULL; + } + } + if (ivars->dataServicedAction) { + ivars->dataQueue->needServicedCallback = false; + DataServiced(ivars->dataServicedAction); + } +} + +kern_return_t +IMPL(IODataQueueDispatchSource, SetEnableWithCompletion) +{ + IOReturn ret; + +#if !KERNEL + ivars->enable = enable; +#endif + + ret = kIOReturnSuccess; + return ret; +} + +void +IODataQueueDispatchSource::free() +{ + OSSafeReleaseNULL(ivars->memory); + OSSafeReleaseNULL(ivars->dataAvailableAction); + OSSafeReleaseNULL(ivars->dataServicedAction); + IOSafeDeleteNULL(ivars, IODataQueueDispatchSource_IVars, 1); + super::free(); +} + +kern_return_t +IMPL(IODataQueueDispatchSource, Cancel) +{ + return kIOReturnSuccess; +} + +bool +IODataQueueDispatchSource::IsDataAvailable(void) +{ + IODataQueueMemory *dataQueue = ivars->dataQueue; + + return dataQueue && (dataQueue->head != dataQueue->tail); +} + +kern_return_t +IODataQueueDispatchSource::Peek(IODataQueueClientDequeueEntryBlock callback) +{ + IODataQueueEntry * entry = NULL; + IODataQueueMemory * dataQueue; + uint32_t callerDataSize; + uint32_t dataSize; + uint32_t headOffset; + uint32_t tailOffset; + + dataQueue = ivars->dataQueue; + if (!dataQueue) { + return kIOReturnNoMemory; + } + + // Read head and tail with acquire barrier + headOffset = __c11_atomic_load((_Atomic uint32_t *)&dataQueue->head, __ATOMIC_RELAXED); + tailOffset = __c11_atomic_load((_Atomic uint32_t *)&dataQueue->tail, __ATOMIC_ACQUIRE); + + if (headOffset != tailOffset) { + IODataQueueEntry * head = NULL; + uint32_t headSize = 0; + uint32_t queueSize = ivars->queueByteCount; + + if (headOffset > queueSize) { + return kIOReturnError; + } + + head = (IODataQueueEntry *)((uintptr_t)dataQueue->queue + headOffset); + callerDataSize = head->size; + if (os_add_overflow(3, callerDataSize, &headSize)) { + return kIOReturnError; + } + headSize &= ~3U; + + // Check if there's enough room before the end of the queue for a header. + // If there is room, check if there's enough room to hold the header and + // the data. + + if ((headOffset > UINT32_MAX - DATA_QUEUE_ENTRY_HEADER_SIZE) || + (headOffset + DATA_QUEUE_ENTRY_HEADER_SIZE > queueSize) || + (headOffset + DATA_QUEUE_ENTRY_HEADER_SIZE > UINT32_MAX - headSize) || + (headOffset + headSize + DATA_QUEUE_ENTRY_HEADER_SIZE > queueSize)) { + // No room for the header or the data, wrap to the beginning of the queue. + // Note: wrapping even with the UINT32_MAX checks, as we have to support + // queueSize of UINT32_MAX + entry = dataQueue->queue; + callerDataSize = entry->size; + dataSize = entry->size; + if (os_add_overflow(3, callerDataSize, &dataSize)) { + return kIOReturnError; + } + dataSize &= ~3U; + + if ((dataSize > UINT32_MAX - DATA_QUEUE_ENTRY_HEADER_SIZE) || + (dataSize + DATA_QUEUE_ENTRY_HEADER_SIZE > queueSize)) { + return kIOReturnError; + } + + callback(&entry->data, callerDataSize); + return kIOReturnSuccess; + } else { + callback(&head->data, callerDataSize); + return kIOReturnSuccess; + } + } + + return kIOReturnUnderrun; +} + +kern_return_t +IODataQueueDispatchSource::Dequeue(IODataQueueClientDequeueEntryBlock callback) +{ + kern_return_t ret; + bool sendDataServiced; + + sendDataServiced = false; + ret = DequeueWithCoalesce(&sendDataServiced, callback); + if (sendDataServiced) { + SendDataServiced(); + } + return ret; +} + +kern_return_t +IODataQueueDispatchSource::DequeueWithCoalesce(bool * sendDataServiced, + IODataQueueClientDequeueEntryBlock callback) +{ + IOReturn retVal = kIOReturnSuccess; + IODataQueueEntry * entry = NULL; + IODataQueueMemory * dataQueue; + uint32_t callerDataSize; + uint32_t dataSize = 0; + uint32_t headOffset = 0; + uint32_t tailOffset = 0; + uint32_t newHeadOffset = 0; + + dataQueue = ivars->dataQueue; + if (!dataQueue) { + return kIOReturnNoMemory; + } + + // Read head and tail with acquire barrier + headOffset = __c11_atomic_load((_Atomic uint32_t *)&dataQueue->head, __ATOMIC_RELAXED); + tailOffset = __c11_atomic_load((_Atomic uint32_t *)&dataQueue->tail, __ATOMIC_ACQUIRE); + + if (headOffset != tailOffset) { + IODataQueueEntry * head = NULL; + uint32_t headSize = 0; + uint32_t queueSize = ivars->queueByteCount; + + if (headOffset > queueSize) { + return kIOReturnError; + } + + head = (IODataQueueEntry *)((uintptr_t)dataQueue->queue + headOffset); + callerDataSize = head->size; + if (os_add_overflow(3, callerDataSize, &headSize)) { + return kIOReturnError; + } + headSize &= ~3U; + + // we wrapped around to beginning, so read from there + // either there was not even room for the header + if ((headOffset > UINT32_MAX - DATA_QUEUE_ENTRY_HEADER_SIZE) || + (headOffset + DATA_QUEUE_ENTRY_HEADER_SIZE > queueSize) || + // or there was room for the header, but not for the data + (headOffset + DATA_QUEUE_ENTRY_HEADER_SIZE > UINT32_MAX - headSize) || + (headOffset + headSize + DATA_QUEUE_ENTRY_HEADER_SIZE > queueSize)) { + // Note: we have to wrap to the beginning even with the UINT32_MAX checks + // because we have to support a queueSize of UINT32_MAX. + entry = dataQueue->queue; + callerDataSize = entry->size; + + if (os_add_overflow(callerDataSize, 3, &dataSize)) { + return kIOReturnError; + } + dataSize &= ~3U; + if ((dataSize > UINT32_MAX - DATA_QUEUE_ENTRY_HEADER_SIZE) || + (dataSize + DATA_QUEUE_ENTRY_HEADER_SIZE > queueSize)) { + return kIOReturnError; + } + newHeadOffset = dataSize + DATA_QUEUE_ENTRY_HEADER_SIZE; + // else it is at the end + } else { + entry = head; + + if ((headSize > UINT32_MAX - DATA_QUEUE_ENTRY_HEADER_SIZE) || + (headSize + DATA_QUEUE_ENTRY_HEADER_SIZE > UINT32_MAX - headOffset) || + (headSize + DATA_QUEUE_ENTRY_HEADER_SIZE + headOffset > queueSize)) { + return kIOReturnError; + } + newHeadOffset = headOffset + headSize + DATA_QUEUE_ENTRY_HEADER_SIZE; + } + } else { + // empty queue + if (dataQueue->needServicedCallback) { + *sendDataServiced = true; + } + return kIOReturnUnderrun; + } + + callback(&entry->data, callerDataSize); + if (dataQueue->needServicedCallback) { + *sendDataServiced = true; + } + + __c11_atomic_store((_Atomic uint32_t *)&dataQueue->head, newHeadOffset, __ATOMIC_RELEASE); + + if (newHeadOffset == tailOffset) { + // + // If we are making the queue empty, then we need to make sure + // that either the enqueuer notices, or we notice the enqueue + // that raced with our making of the queue empty. + // + __c11_atomic_thread_fence(__ATOMIC_SEQ_CST); + } + + return retVal; +} + +kern_return_t +IODataQueueDispatchSource::Enqueue(uint32_t callerDataSize, + IODataQueueClientEnqueueEntryBlock callback) +{ + kern_return_t ret; + bool sendDataAvailable; + + sendDataAvailable = false; + ret = EnqueueWithCoalesce(callerDataSize, &sendDataAvailable, callback); + if (sendDataAvailable) { + SendDataAvailable(); + } + return ret; +} + +kern_return_t +IODataQueueDispatchSource::EnqueueWithCoalesce(uint32_t callerDataSize, + bool * sendDataAvailable, + IODataQueueClientEnqueueEntryBlock callback) +{ + IODataQueueMemory * dataQueue; + IODataQueueEntry * entry; + uint32_t head; + uint32_t tail; + uint32_t newTail; + uint32_t dataSize; + uint32_t queueSize; + uint32_t entrySize; + IOReturn retVal = kIOReturnSuccess; + + dataQueue = ivars->dataQueue; + if (!dataQueue) { + return kIOReturnNoMemory; + } + queueSize = ivars->queueByteCount; + + // Force a single read of head and tail + tail = __c11_atomic_load((_Atomic uint32_t *)&dataQueue->tail, __ATOMIC_RELAXED); + head = __c11_atomic_load((_Atomic uint32_t *)&dataQueue->head, __ATOMIC_ACQUIRE); + + if (os_add_overflow(callerDataSize, 3, &dataSize)) { + return kIOReturnOverrun; + } + dataSize &= ~3U; + + // Check for overflow of entrySize + if (os_add_overflow(DATA_QUEUE_ENTRY_HEADER_SIZE, dataSize, &entrySize)) { + return kIOReturnOverrun; + } + + // Check for underflow of (getQueueSize() - tail) + if (queueSize < tail || queueSize < head) { + return kIOReturnUnderrun; + } + + newTail = tail; + if (tail >= head) { + // Is there enough room at the end for the entry? + if ((entrySize <= (UINT32_MAX - tail)) && + ((tail + entrySize) <= queueSize)) { + entry = (IODataQueueEntry *)((uintptr_t)dataQueue->queue + tail); + + callback(&entry->data, callerDataSize); + + entry->size = callerDataSize; + + // The tail can be out of bound when the size of the new entry + // exactly matches the available space at the end of the queue. + // The tail can range from 0 to queueSize inclusive. + + newTail = tail + entrySize; + } else if (head > entrySize) { // Is there enough room at the beginning? + entry = (IODataQueueEntry *)((uintptr_t)dataQueue->queue); + + callback(&entry->data, callerDataSize); + + // Wrap around to the beginning, but do not allow the tail to catch + // up to the head. + + entry->size = callerDataSize; + + // We need to make sure that there is enough room to set the size before + // doing this. The user client checks for this and will look for the size + // at the beginning if there isn't room for it at the end. + + if ((queueSize - tail) >= DATA_QUEUE_ENTRY_HEADER_SIZE) { + ((IODataQueueEntry *)((uintptr_t)dataQueue->queue + tail))->size = dataSize; + } + + newTail = entrySize; + } else { + retVal = kIOReturnOverrun; // queue is full + } + } else { + // Do not allow the tail to catch up to the head when the queue is full. + // That's why the comparison uses a '>' rather than '>='. + + if ((head - tail) > entrySize) { + entry = (IODataQueueEntry *)((uintptr_t)dataQueue->queue + tail); + + callback(&entry->data, callerDataSize); + + entry->size = callerDataSize; + + newTail = tail + entrySize; + } else { + retVal = kIOReturnOverrun; // queue is full + } + } + + // Send notification (via mach message) that data is available. + + if (retVal == kIOReturnSuccess) { + // Publish the data we just enqueued + __c11_atomic_store((_Atomic uint32_t *)&dataQueue->tail, newTail, __ATOMIC_RELEASE); + + if (tail != head) { + // + // The memory barrier below pairs with the one in dequeue + // so that either our store to the tail cannot be missed by + // the next dequeue attempt, or we will observe the dequeuer + // making the queue empty. + // + // Of course, if we already think the queue is empty, + // there's no point paying this extra cost. + // + __c11_atomic_thread_fence(__ATOMIC_SEQ_CST); + head = __c11_atomic_load((_Atomic uint32_t *)&dataQueue->head, __ATOMIC_RELAXED); + } + + if (tail == head) { + // Send notification that data is now available. + *sendDataAvailable = true; + retVal = kIOReturnSuccess; + } + } else if (retVal == kIOReturnOverrun) { + // ask to be notified of Dequeue() + dataQueue->needServicedCallback = true; + *sendDataAvailable = true; + } + + return retVal; +} diff --git a/iokit/System/Makefile b/iokit/System/Makefile new file mode 100644 index 000000000..2e9bc0d78 --- /dev/null +++ b/iokit/System/Makefile @@ -0,0 +1,24 @@ +export MakeInc_cmd=${SRCROOT}/makedefs/MakeInc.cmd +export MakeInc_def=${SRCROOT}/makedefs/MakeInc.def +export MakeInc_rule=${SRCROOT}/makedefs/MakeInc.rule +export MakeInc_dir=${SRCROOT}/makedefs/MakeInc.dir + +# These are System.framework headers + +include $(MakeInc_cmd) +include $(MakeInc_def) + +ALL_HDRS = $(shell (cd $(SOURCE); echo *.h)) + +# INSTINC_SUBDIRS = Headers + +EXPINC_SUBDIRS = ${INSTINC_SUBDIRS} + +# INSTALL_MI_DIR = . + +INSTALL_MI_LCL_LIST = $(ALL_HDRS) + +INSTALL_MI_GEN_LIST = $(GENERATED_HEADERS) + +include $(MakeInc_rule) +include $(MakeInc_dir) diff --git a/iokit/Tests/Tests.cpp b/iokit/Tests/Tests.cpp index 7e7fe8f1d..47be23b23 100644 --- a/iokit/Tests/Tests.cpp +++ b/iokit/Tests/Tests.cpp @@ -426,6 +426,27 @@ sysctl_iokittest(__unused struct sysctl_oid *oidp, __unused void *arg1, __unused data->release(); } + if (changed && (newValue >= 6666) && (newValue <= 6669)) { + OSIterator * iter; + IOService * service; + + service = NULL; + iter = IOService::getMatchingServices(IOService::nameMatching("XHC1")); + if (iter && (service = (IOService *) iter->getNextObject())) { + if (newValue == 6666) { + IOLog("terminating 0x%qx\n", service->getRegistryEntryID()); + service->terminate(); + } else if (newValue == 6667) { + IOLog("register 0x%qx\n", service->getRegistryEntryID()); + service->registerService(); + } + } + OSSafeReleaseNULL(iter); + if (service) { + return 0; + } + } + if (changed && newValue) { error = IOWorkLoopTest(newValue); @@ -444,4 +465,4 @@ sysctl_iokittest(__unused struct sysctl_oid *oidp, __unused void *arg1, __unused SYSCTL_PROC(_kern, OID_AUTO, iokittest, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NOAUTO | CTLFLAG_KERN | CTLFLAG_LOCKED, - 0, 0, sysctl_iokittest, "I", ""); + NULL, 0, sysctl_iokittest, "I", ""); diff --git a/iokit/bsddev/DINetBootHook.cpp b/iokit/bsddev/DINetBootHook.cpp index 17ebe5a23..6f4eb396d 100644 --- a/iokit/bsddev/DINetBootHook.cpp +++ b/iokit/bsddev/DINetBootHook.cpp @@ -101,9 +101,9 @@ static IOService * di_load_controller( void ) { - OSIterator * controllerIterator = 0; - OSDictionary * matchDictionary = 0; - IOService * controller = 0; + OSIterator * controllerIterator = NULL; + OSDictionary * matchDictionary = NULL; + IOService * controller = NULL; do { IOService::getResourceService()->publishResource("com.apple.AppleDiskImageController.load", kOSBooleanTrue); @@ -151,11 +151,11 @@ int di_root_image(const char *path, char *devname, size_t devsz, dev_t *dev_p) { IOReturn res = 0; - IOService * controller = 0; - OSString * pathString = 0; - OSNumber * myResult = 0; - OSString * myDevName = 0; - OSNumber * myDevT = 0; + IOService * controller = NULL; + OSString * pathString = NULL; + OSNumber * myResult = NULL; + OSString * myDevName = NULL; + OSNumber * myDevT = NULL; // sanity check arguments please if (devname) { @@ -243,11 +243,11 @@ int di_root_ramfile_buf(void *buf, size_t bufsz, char *devname, size_t devsz, dev_t *dev_p) { IOReturn res = 0; - IOService *controller = 0; - OSNumber *myResult = 0; - OSString *myDevName = 0; - OSNumber *myDevT = 0; - IOMemoryDescriptor *mem = 0; + IOService *controller = NULL; + OSNumber *myResult = NULL; + OSString *myDevName = NULL; + OSNumber *myDevT = NULL; + IOMemoryDescriptor *mem = NULL; mem = IOMemoryDescriptor::withAddress(buf, bufsz, kIODirectionInOut); assert(mem); @@ -306,7 +306,7 @@ di_root_ramfile( IORegistryEntry * entry ) IOMemoryDescriptor * mem; uint64_t dmgSize; uint64_t remain, length; - OSData * extentData = 0; + OSData * extentData = NULL; IOAddressRange * extentList; uint64_t extentSize; uint32_t extentCount; diff --git a/iokit/bsddev/IOKitBSDInit.cpp b/iokit/bsddev/IOKitBSDInit.cpp index 6ce81657a..11514d895 100644 --- a/iokit/bsddev/IOKitBSDInit.cpp +++ b/iokit/bsddev/IOKitBSDInit.cpp @@ -49,49 +49,21 @@ extern "C" { #define ROOTDEVICETIMEOUT 60 #endif -int panic_on_exception_triage = 0; - extern dev_t mdevadd(int devid, uint64_t base, unsigned int size, int phys); extern dev_t mdevlookup(int devid); extern void mdevremoveall(void); extern int mdevgetrange(int devid, uint64_t *base, uint64_t *size); extern void di_root_ramfile(IORegistryEntry * entry); -#if CONFIG_EMBEDDED +#define ROUNDUP(a, b) (((a) + ((b) - 1)) & (~((b) - 1))) + #define IOPOLLED_COREFILE (CONFIG_KDP_INTERACTIVE_DEBUGGING) #if defined(XNU_TARGET_OS_BRIDGE) - -#define kIOCoreDumpSize 150ULL*1024ULL*1024ULL -// leave free space on volume: -#define kIOCoreDumpFreeSize 150ULL*1024ULL*1024ULL #define kIOCoreDumpPath "/private/var/internal/kernelcore" - -#else /* defined(XNU_TARGET_OS_BRIDGE) */ -#define kIOCoreDumpMinSize 350ULL*1024ULL*1024ULL -#define kIOCoreDumpLargeSize 500ULL*1024ULL*1024ULL -// leave free space on volume: -#define kIOCoreDumpFreeSize 350ULL*1024ULL*1024ULL +#else #define kIOCoreDumpPath "/private/var/vm/kernelcore" - -#endif /* defined(XNU_TARGET_OS_BRIDGE) */ - -#elif DEVELOPMENT /* CONFIG_EMBEDDED */ -#define IOPOLLED_COREFILE 1 -// no sizing -#define kIOCoreDumpSize 0ULL -#define kIOCoreDumpFreeSize 0ULL -#else /* CONFIG_EMBEDDED */ -#define IOPOLLED_COREFILE 0 -#endif /* CONFIG_EMBEDDED */ - - -#if IOPOLLED_COREFILE -static bool -NewKernelCoreMedia(void * target, void * refCon, - IOService * newService, - IONotifier * notifier); -#endif /* IOPOLLED_COREFILE */ +#endif #if CONFIG_KDP_INTERACTIVE_DEBUGGING /* @@ -101,11 +73,21 @@ extern uint64_t kdp_core_ramdisk_addr; extern uint64_t kdp_core_ramdisk_size; #endif +#if IOPOLLED_COREFILE +static void IOOpenPolledCoreFile(thread_call_param_t __unused, thread_call_param_t corefilename); + +thread_call_t corefile_open_call = NULL; +#endif + kern_return_t IOKitBSDInit( void ) { IOService::publishResource("IOBSD"); +#if IOPOLLED_COREFILE + corefile_open_call = thread_call_allocate_with_options(IOOpenPolledCoreFile, NULL, THREAD_CALL_PRIORITY_KERNEL, THREAD_CALL_OPTIONS_ONCE); +#endif + return kIOReturnSuccess; } @@ -122,8 +104,8 @@ IOServicePublishResource( const char * property, boolean_t value ) boolean_t IOServiceWaitForMatchingResource( const char * property, uint64_t timeout ) { - OSDictionary * dict = 0; - IOService * match = 0; + OSDictionary * dict = NULL; + IOService * match = NULL; boolean_t found = false; do { @@ -150,8 +132,8 @@ IOServiceWaitForMatchingResource( const char * property, uint64_t timeout ) boolean_t IOCatalogueMatchingDriversPresent( const char * property ) { - OSDictionary * dict = 0; - OSOrderedSet * set = 0; + OSDictionary * dict = NULL; + OSOrderedSet * set = NULL; SInt32 generationCount = 0; boolean_t found = false; @@ -181,7 +163,7 @@ OSDictionary * IOBSDNameMatching( const char * name ) { OSDictionary * dict; - const OSSymbol * str = 0; + const OSSymbol * str = NULL; do { dict = IOService::serviceMatching( gIOServiceKey ); @@ -205,7 +187,7 @@ IOBSDNameMatching( const char * name ) str->release(); } - return 0; + return NULL; } OSDictionary * @@ -218,29 +200,29 @@ OSDictionary * IONetworkNamePrefixMatching( const char * prefix ) { OSDictionary * matching; - OSDictionary * propDict = 0; - const OSSymbol * str = 0; + OSDictionary * propDict = NULL; + const OSSymbol * str = NULL; char networkType[128]; do { matching = IOService::serviceMatching( "IONetworkInterface" ); - if (matching == 0) { + if (matching == NULL) { continue; } propDict = OSDictionary::withCapacity(1); - if (propDict == 0) { + if (propDict == NULL) { continue; } str = OSSymbol::withCString( prefix ); - if (str == 0) { + if (str == NULL) { continue; } propDict->setObject( "IOInterfaceNamePrefix", (OSObject *) str ); str->release(); - str = 0; + str = NULL; // see if we're contrained to netroot off of specific network type if (PE_parse_boot_argn( "network-type", networkType, 128 )) { @@ -248,7 +230,7 @@ IONetworkNamePrefixMatching( const char * prefix ) if (str) { propDict->setObject( "IONetworkRootType", str); str->release(); - str = 0; + str = NULL; } } @@ -258,7 +240,7 @@ IONetworkNamePrefixMatching( const char * prefix ) } propDict->release(); - propDict = 0; + propDict = NULL; return matching; } while (false); @@ -273,7 +255,7 @@ IONetworkNamePrefixMatching( const char * prefix ) str->release(); } - return 0; + return NULL; } static bool @@ -287,32 +269,32 @@ IORegisterNetworkInterface( IOService * netif ) // device is handed to BSD. IOService * stack; - OSNumber * zero = 0; - OSString * path = 0; - OSDictionary * dict = 0; - char * pathBuf = 0; + OSNumber * zero = NULL; + OSString * path = NULL; + OSDictionary * dict = NULL; + char * pathBuf = NULL; int len; enum { kMaxPathLen = 512 }; do { stack = IOService::waitForService( IOService::serviceMatching("IONetworkStack")); - if (stack == 0) { + if (stack == NULL) { break; } dict = OSDictionary::withCapacity(3); - if (dict == 0) { + if (dict == NULL) { break; } zero = OSNumber::withNumber((UInt64) 0, 32); - if (zero == 0) { + if (zero == NULL) { break; } pathBuf = (char *) IOMalloc( kMaxPathLen ); - if (pathBuf == 0) { + if (pathBuf == NULL) { break; } @@ -323,7 +305,7 @@ IORegisterNetworkInterface( IOService * netif ) } path = OSString::withCStringNoCopy( pathBuf ); - if (path == 0) { + if (path == NULL) { break; } @@ -346,7 +328,7 @@ IORegisterNetworkInterface( IOService * netif ) IOFree(pathBuf, kMaxPathLen); } - return netif->getProperty( kIOBSDNameKey ) != 0; + return netif->getProperty( kIOBSDNameKey ) != NULL; } OSDictionary * @@ -393,7 +375,7 @@ IOOFPathMatching( const char * path, char * buf, int maxLen ) matching->release(); } - return 0; + return NULL; } static int didRam = 0; @@ -406,19 +388,20 @@ IOFindBSDRoot( char * rootName, unsigned int rootNameSize, mach_timespec_t t; IOService * service; IORegistryEntry * regEntry; - OSDictionary * matching = 0; + OSDictionary * matching = NULL; OSString * iostr; OSNumber * off; - OSData * data = 0; + OSData * data = NULL; UInt32 flags = 0; int mnr, mjr; - const char * mediaProperty = 0; + const char * mediaProperty = NULL; char * rdBootVar; char * str; - const char * look = 0; + const char * look = NULL; int len; bool debugInfoPrintedOnce = false; + bool needNetworkKexts = false; const char * uuidStr = NULL; static int mountAttempts = 0; @@ -556,6 +539,7 @@ IOFindBSDRoot( char * rootName, unsigned int rootNameSize, if (strncmp( look, "en", strlen( "en" )) == 0) { matching = IONetworkNamePrefixMatching( "en" ); + needNetworkKexts = true; } else if (strncmp( look, "uuid", strlen( "uuid" )) == 0) { char *uuid; OSString *uuidString; @@ -607,6 +591,12 @@ IOFindBSDRoot( char * rootName, unsigned int rootNameSize, } } + char namep[8]; + if (needNetworkKexts + || PE_parse_boot_argn("-s", namep, sizeof(namep))) { + IOService::startDeferredMatches(); + } + do { t.tv_sec = ROOTDEVICETIMEOUT; t.tv_nsec = 0; @@ -648,7 +638,7 @@ IOFindBSDRoot( char * rootName, unsigned int rootNameSize, if (service && service->metaCast( "IONetworkInterface" ) && !IORegisterNetworkInterface( service )) { - service = 0; + service = NULL; } if (service) { @@ -726,7 +716,6 @@ void IOSecureBSDRoot(const char * rootName) { #if CONFIG_EMBEDDED - int tmpInt; IOReturn result; IOPlatformExpert *pe; OSDictionary *matching; @@ -739,20 +728,12 @@ IOSecureBSDRoot(const char * rootName) assert(pe); // Returns kIOReturnNotPrivileged is the root device is not secure. // Returns kIOReturnUnsupported if "SecureRootName" is not implemented. - result = pe->callPlatformFunction(functionName, false, (void *)rootName, (void *)0, (void *)0, (void *)0); + result = pe->callPlatformFunction(functionName, false, (void *)rootName, (void *)NULL, (void *)NULL, (void *)NULL); functionName->release(); OSSafeReleaseNULL(pe); if (result == kIOReturnNotPrivileged) { mdevremoveall(); - } else if (result == kIOReturnSuccess) { - // If we are booting with a secure root, and we have the right - // boot-arg, we will want to panic on exception triage. This - // behavior is intended as a debug aid (we can look at why an - // exception occured in the kernel debugger). - if (PE_parse_boot_argn("-panic_on_exception_triage", &tmpInt, sizeof(tmpInt))) { - panic_on_exception_triage = 1; - } } #endif // CONFIG_EMBEDDED @@ -796,13 +777,13 @@ IOBSDGetPlatformUUID( uuid_t uuid, mach_timespec_t timeout ) IOService * resources; OSString * string; - resources = IOService::waitForService( IOService::resourceMatching( kIOPlatformUUIDKey ), (timeout.tv_sec || timeout.tv_nsec) ? &timeout : 0 ); - if (resources == 0) { + resources = IOService::waitForService( IOService::resourceMatching( kIOPlatformUUIDKey ), (timeout.tv_sec || timeout.tv_nsec) ? &timeout : NULL ); + if (resources == NULL) { return KERN_OPERATION_TIMED_OUT; } string = (OSString *) IOService::getPlatform()->getProvider()->getProperty( kIOPlatformUUIDKey ); - if (string == 0) { + if (string == NULL) { return KERN_NOT_SUPPORTED; } @@ -823,179 +804,167 @@ IOBSDGetPlatformUUID( uuid_t uuid, mach_timespec_t timeout ) IOPolledFileIOVars * gIOPolledCoreFileVars; kern_return_t gIOPolledCoreFileOpenRet = kIOReturnNotReady; +IOPolledCoreFileMode_t gIOPolledCoreFileMode = kIOPolledCoreFileModeNotInitialized; + #if IOPOLLED_COREFILE -static IOReturn -IOOpenPolledCoreFile(const char * filename) +#if defined(XNU_TARGET_OS_BRIDGE) +// On bridgeOS allocate a 150MB corefile and leave 150MB free +#define kIOCoreDumpSize 150ULL*1024ULL*1024ULL +#define kIOCoreDumpFreeSize 150ULL*1024ULL*1024ULL + +#elif CONFIG_EMBEDDED /* defined(XNU_TARGET_OS_BRIDGE) */ +// On embedded devices with >3GB DRAM we allocate a 500MB corefile +// otherwise allocate a 350MB corefile. Leave 350 MB free + +#define kIOCoreDumpMinSize 350ULL*1024ULL*1024ULL +#define kIOCoreDumpLargeSize 500ULL*1024ULL*1024ULL + +#define kIOCoreDumpFreeSize 350ULL*1024ULL*1024ULL + +#else /* defined(XNU_TARGET_OS_BRIDGE) */ +// on macOS devices allocate a corefile sized at 1GB / 32GB of DRAM, +// fallback to a 1GB corefile and leave at least 1GB free +#define kIOCoreDumpMinSize 1024ULL*1024ULL*1024ULL +#define kIOCoreDumpIncrementalSize 1024ULL*1024ULL*1024ULL + +#define kIOCoreDumpFreeSize 1024ULL*1024ULL*1024ULL + +// on older macOS devices we allocate a 1MB file at boot +// to store a panic time stackshot +#define kIOStackshotFileSize 1024ULL*1024ULL + +#endif /* defined(XNU_TARGET_OS_BRIDGE) */ + +static IOPolledCoreFileMode_t +GetCoreFileMode() +{ + if (on_device_corefile_enabled()) { + return kIOPolledCoreFileModeCoredump; + } else if (panic_stackshot_to_disk_enabled()) { + return kIOPolledCoreFileModeStackshot; + } else { + return kIOPolledCoreFileModeDisabled; + } +} + +static void +IOCoreFileGetSize(uint64_t *ideal_size, uint64_t *fallback_size, uint64_t *free_space_to_leave, IOPolledCoreFileMode_t mode) +{ + unsigned int requested_corefile_size = 0; + + *ideal_size = *fallback_size = *free_space_to_leave = 0; + +#if defined(XNU_TARGET_OS_BRIDGE) +#pragma unused(mode) + *ideal_size = *fallback_size = kIOCoreDumpSize; + *free_space_to_leave = kIOCoreDumpFreeSize; +#elif CONFIG_EMBEDDED /* defined(XNU_TARGET_OS_BRIDGE) */ +#pragma unused(mode) + *ideal_size = *fallback_size = kIOCoreDumpMinSize; + + if (max_mem > (3 * 1024ULL * 1024ULL * 1024ULL)) { + *ideal_size = kIOCoreDumpLargeSize; + } + + *free_space_to_leave = kIOCoreDumpFreeSize; +#else /* defined(XNU_TARGET_OS_BRIDGE) */ + if (mode == kIOPolledCoreFileModeCoredump) { + *ideal_size = *fallback_size = kIOCoreDumpMinSize; + if (kIOCoreDumpIncrementalSize != 0 && max_mem > (32 * 1024ULL * 1024ULL * 1024ULL)) { + *ideal_size = ((ROUNDUP(max_mem, (32 * 1024ULL * 1024ULL * 1024ULL)) / (32 * 1024ULL * 1024ULL * 1024ULL)) * kIOCoreDumpIncrementalSize); + } + *free_space_to_leave = kIOCoreDumpFreeSize; + } else if (mode == kIOPolledCoreFileModeStackshot) { + *ideal_size = *fallback_size = *free_space_to_leave = kIOStackshotFileSize; + } +#endif /* defined(XNU_TARGET_OS_BRIDGE) */ + // If a custom size was requested, override the ideal and requested sizes + if (PE_parse_boot_argn("corefile_size_mb", &requested_corefile_size, sizeof(requested_corefile_size))) { + IOLog("Boot-args specify %d MB kernel corefile\n", requested_corefile_size); + + *ideal_size = *fallback_size = (requested_corefile_size * 1024ULL * 1024ULL); + } + + return; +} + +static void +IOOpenPolledCoreFile(thread_call_param_t __unused, thread_call_param_t corefilename) { + assert(corefilename != NULL); + IOReturn err; - unsigned int debug; - uint64_t corefile_size_bytes = 0; + char *filename = (char *) corefilename; + uint64_t corefile_size_bytes = 0, corefile_fallback_size_bytes = 0, free_space_to_leave_bytes = 0; + IOPolledCoreFileMode_t mode_to_init = GetCoreFileMode(); if (gIOPolledCoreFileVars) { - return kIOReturnBusy; + return; } if (!IOPolledInterface::gMetaClass.getInstanceCount()) { - return kIOReturnUnsupported; + return; } - debug = 0; - PE_parse_boot_argn("debug", &debug, sizeof(debug)); - if (DB_DISABLE_LOCAL_CORE & debug) { - return kIOReturnUnsupported; + if (mode_to_init == kIOPolledCoreFileModeDisabled) { + gIOPolledCoreFileMode = kIOPolledCoreFileModeDisabled; + return; } -#if CONFIG_EMBEDDED - unsigned int requested_corefile_size = 0; - if (PE_parse_boot_argn("corefile_size_mb", &requested_corefile_size, sizeof(requested_corefile_size))) { - IOLog("Boot-args specify %d MB kernel corefile\n", requested_corefile_size); - - corefile_size_bytes = (requested_corefile_size * 1024ULL * 1024ULL); - } -#endif + // We'll overwrite this once we open the file, we update this to mark that we have made + // it past initialization + gIOPolledCoreFileMode = kIOPolledCoreFileModeClosed; + IOCoreFileGetSize(&corefile_size_bytes, &corefile_fallback_size_bytes, &free_space_to_leave_bytes, mode_to_init); do { -#if defined(kIOCoreDumpLargeSize) - if (0 == corefile_size_bytes) { - // If no custom size was requested and we're on a device with >3GB of DRAM, attempt - // to allocate a large corefile otherwise use a small file. - if (max_mem > (3 * 1024ULL * 1024ULL * 1024ULL)) { - corefile_size_bytes = kIOCoreDumpLargeSize; - err = IOPolledFileOpen(filename, - kIOPolledFileCreate, - corefile_size_bytes, kIOCoreDumpFreeSize, - NULL, 0, - &gIOPolledCoreFileVars, NULL, NULL, 0); - if (kIOReturnSuccess == err) { - break; - } else if (kIOReturnNoSpace == err) { - IOLog("Failed to open corefile of size %llu MB (low disk space)", - (corefile_size_bytes / (1024ULL * 1024ULL))); - if (corefile_size_bytes == kIOCoreDumpMinSize) { - gIOPolledCoreFileOpenRet = err; - return err; - } - // Try to open a smaller corefile (set size and fall-through) - corefile_size_bytes = kIOCoreDumpMinSize; - } else { - IOLog("Failed to open corefile of size %llu MB (returned error 0x%x)\n", - (corefile_size_bytes / (1024ULL * 1024ULL)), err); - gIOPolledCoreFileOpenRet = err; - return err; - } - } else { - corefile_size_bytes = kIOCoreDumpMinSize; + err = IOPolledFileOpen(filename, kIOPolledFileCreate, corefile_size_bytes, free_space_to_leave_bytes, + NULL, 0, &gIOPolledCoreFileVars, NULL, NULL, NULL); + if (kIOReturnSuccess == err) { + break; + } else if (kIOReturnNoSpace == err) { + IOLog("Failed to open corefile of size %llu MB (low disk space)", + (corefile_size_bytes / (1024ULL * 1024ULL))); + if (corefile_size_bytes == corefile_fallback_size_bytes) { + gIOPolledCoreFileOpenRet = err; + return; } + } else { + IOLog("Failed to open corefile of size %llu MB (returned error 0x%x)\n", + (corefile_size_bytes / (1024ULL * 1024ULL)), err); + gIOPolledCoreFileOpenRet = err; + return; } -#else /* defined(kIOCoreDumpLargeSize) */ - if (0 == corefile_size_bytes) { - corefile_size_bytes = kIOCoreDumpSize; - } -#endif /* defined(kIOCoreDumpLargeSize) */ - err = IOPolledFileOpen(filename, - kIOPolledFileCreate, - corefile_size_bytes, kIOCoreDumpFreeSize, - NULL, 0, - &gIOPolledCoreFileVars, NULL, NULL, 0); + + err = IOPolledFileOpen(filename, kIOPolledFileCreate, corefile_fallback_size_bytes, free_space_to_leave_bytes, + NULL, 0, &gIOPolledCoreFileVars, NULL, NULL, NULL); if (kIOReturnSuccess != err) { IOLog("Failed to open corefile of size %llu MB (returned error 0x%x)\n", - (corefile_size_bytes / (1024ULL * 1024ULL)), err); + (corefile_fallback_size_bytes / (1024ULL * 1024ULL)), err); gIOPolledCoreFileOpenRet = err; - return err; + return; } } while (false); - err = IOPolledFilePollersSetup(gIOPolledCoreFileVars, kIOPolledPreflightCoreDumpState); - if (kIOReturnSuccess != err) { - IOPolledFileClose(&gIOPolledCoreFileVars, NULL, NULL, 0, 0, 0); + gIOPolledCoreFileOpenRet = IOPolledFilePollersSetup(gIOPolledCoreFileVars, kIOPolledPreflightCoreDumpState); + if (kIOReturnSuccess != gIOPolledCoreFileOpenRet) { + IOPolledFileClose(&gIOPolledCoreFileVars, 0, NULL, 0, 0, 0); IOLog("IOPolledFilePollersSetup for corefile failed with error: 0x%x\n", err); - gIOPolledCoreFileOpenRet = err; } else { IOLog("Opened corefile of size %llu MB\n", (corefile_size_bytes / (1024ULL * 1024ULL))); + gIOPolledCoreFileMode = mode_to_init; } - return err; + return; } static void IOClosePolledCoreFile(void) { gIOPolledCoreFileOpenRet = kIOReturnNotOpen; + gIOPolledCoreFileMode = kIOPolledCoreFileModeClosed; IOPolledFilePollersClose(gIOPolledCoreFileVars, kIOPolledPostflightCoreDumpState); - IOPolledFileClose(&gIOPolledCoreFileVars, NULL, NULL, 0, 0, 0); -} - -static thread_call_t gIOOpenPolledCoreFileTC; -static IONotifier * gIOPolledCoreFileNotifier; -static IONotifier * gIOPolledCoreFileInterestNotifier; - -static IOReturn -KernelCoreMediaInterest(void * target, void * refCon, - UInt32 messageType, IOService * provider, - void * messageArgument, vm_size_t argSize ) -{ - if (kIOMessageServiceIsTerminated == messageType) { - gIOPolledCoreFileInterestNotifier->remove(); - gIOPolledCoreFileInterestNotifier = 0; - IOClosePolledCoreFile(); - } - - return kIOReturnSuccess; -} - -static void -OpenKernelCoreMedia(thread_call_param_t p0, thread_call_param_t p1) -{ - IOService * newService; - OSString * string; - char filename[16]; - - newService = (IOService *) p1; - do{ - if (gIOPolledCoreFileVars) { - break; - } - string = OSDynamicCast(OSString, newService->getProperty(kIOBSDNameKey)); - if (!string) { - break; - } - snprintf(filename, sizeof(filename), "/dev/%s", string->getCStringNoCopy()); - if (kIOReturnSuccess != IOOpenPolledCoreFile(filename)) { - break; - } - gIOPolledCoreFileInterestNotifier = newService->registerInterest( - gIOGeneralInterest, &KernelCoreMediaInterest, NULL, 0); - }while (false); - - newService->release(); -} - -static bool -NewKernelCoreMedia(void * target, void * refCon, - IOService * newService, - IONotifier * notifier) -{ - static volatile UInt32 onlyOneCorePartition = 0; - do{ - if (!OSCompareAndSwap(0, 1, &onlyOneCorePartition)) { - break; - } - if (gIOPolledCoreFileVars) { - break; - } - if (!gIOOpenPolledCoreFileTC) { - break; - } - newService = newService->getProvider(); - if (!newService) { - break; - } - newService->retain(); - thread_call_enter1(gIOOpenPolledCoreFileTC, newService); - }while (false); - - return false; + IOPolledFileClose(&gIOPolledCoreFileVars, 0, NULL, 0, 0, 0); } #endif /* IOPOLLED_COREFILE */ @@ -1004,37 +973,6 @@ extern "C" void IOBSDMountChange(struct mount * mp, uint32_t op) { #if IOPOLLED_COREFILE - - OSDictionary * bsdMatching; - OSDictionary * mediaMatching; - OSString * string; - - if (!gIOPolledCoreFileNotifier) { - do{ - if (!gIOOpenPolledCoreFileTC) { - gIOOpenPolledCoreFileTC = thread_call_allocate(&OpenKernelCoreMedia, NULL); - } - bsdMatching = IOService::serviceMatching("IOMediaBSDClient"); - if (!bsdMatching) { - break; - } - mediaMatching = IOService::serviceMatching("IOMedia"); - string = OSString::withCStringNoCopy("5361644D-6163-11AA-AA11-00306543ECAC"); - if (!string || !mediaMatching) { - break; - } - mediaMatching->setObject("Content", string); - string->release(); - bsdMatching->setObject(gIOParentMatchKey, mediaMatching); - mediaMatching->release(); - - gIOPolledCoreFileNotifier = IOService::addMatchingNotification( - gIOFirstMatchNotification, bsdMatching, - &NewKernelCoreMedia, NULL, NULL, -1000); - }while (false); - } - -#if CONFIG_EMBEDDED uint64_t flags; char path[128]; int pathLen; @@ -1080,17 +1018,18 @@ IOBSDMountChange(struct mount * mp, uint32_t op) if (0 != strncmp(path, kIOCoreDumpPath, pathLen - 1)) { break; } - IOOpenPolledCoreFile(kIOCoreDumpPath); + + thread_call_enter1(corefile_open_call, (void *) kIOCoreDumpPath); break; case kIOMountChangeUnmount: case kIOMountChangeWillResize: if (gIOPolledCoreFileVars && (mp == kern_file_mount(gIOPolledCoreFileVars->fileRef))) { + thread_call_cancel_wait(corefile_open_call); IOClosePolledCoreFile(); } break; } -#endif /* CONFIG_EMBEDDED */ #endif /* IOPOLLED_COREFILE */ } diff --git a/iokit/bsddev/skywalk/IOSkywalkSupport.cpp b/iokit/bsddev/skywalk/IOSkywalkSupport.cpp index b38d804d8..5d030959c 100644 --- a/iokit/bsddev/skywalk/IOSkywalkSupport.cpp +++ b/iokit/bsddev/skywalk/IOSkywalkSupport.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2016 Apple Inc. All rights reserved. + * Copyright (c) 2015-2019 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -37,7 +37,7 @@ IOSKCopyKextIdentifierWithAddress( vm_address_t address ); const OSSymbol * IOSKCopyKextIdentifierWithAddress( vm_address_t address ) { - const OSSymbol * id = 0; + const OSSymbol * id = NULL; OSKext * kext = OSKext::lookupKextWithAddress(address); if (kext) { diff --git a/iokit/conf/Makefile b/iokit/conf/Makefile index 7bd79d9ae..05c4b79cf 100644 --- a/iokit/conf/Makefile +++ b/iokit/conf/Makefile @@ -23,7 +23,7 @@ endif $(TARGET)/$(CURRENT_KERNEL_CONFIG)/Makefile: $(SRCROOT)/SETUP/config/doconf $(OBJROOT)/SETUP/config $(DOCONFDEPS) $(_v)$(MKDIR) $(TARGET)/$(CURRENT_KERNEL_CONFIG) - $(_v)$(SRCROOT)/SETUP/config/doconf -c -cpu $(DOCONF_ARCH_CONFIG_LC) -soc $(CURRENT_MACHINE_CONFIG_LC) -d $(TARGET)/$(CURRENT_KERNEL_CONFIG) -s $(SOURCE) -m $(MASTERCONFDIR) $(CURRENT_KERNEL_CONFIG); + $(_v)$(SRCROOT)/SETUP/config/doconf -c -cpu $(DOCONF_ARCH_CONFIG_LC) -soc $(CURRENT_MACHINE_CONFIG_LC) -d $(TARGET)/$(CURRENT_KERNEL_CONFIG) -s $(SOURCE) -m $(MASTERCONFDIR) $(CURRENT_KERNEL_CONFIG) do_all: $(TARGET)/$(CURRENT_KERNEL_CONFIG)/Makefile $(_v)${MAKE} \ @@ -35,7 +35,7 @@ do_all: $(TARGET)/$(CURRENT_KERNEL_CONFIG)/Makefile SOURCE=$(subst conf/,,$(SOURCE)) \ TARGET=${TARGET} \ OBJPATH=${OBJPATH} \ - build_all; + build_all do_build_all:: do_all diff --git a/iokit/conf/Makefile.template b/iokit/conf/Makefile.template index c29d59de6..b58cd7ee8 100644 --- a/iokit/conf/Makefile.template +++ b/iokit/conf/Makefile.template @@ -17,7 +17,7 @@ include $(MakeInc_def) # XXX: CFLAGS # CFLAGS+= -include meta_features.h -DDRIVER_PRIVATE \ - -DIOKIT_KERNEL_PRIVATE -DIOMATCHDEBUG=1 -DIOALLOCDEBUG=1 + -DIOKIT_KERNEL_PRIVATE -DDRIVERKIT_PRIVATE=1 -DIOMATCHDEBUG=1 -DIOALLOCDEBUG=1 SFLAGS+= -include meta_features.h #-DIOKITDEBUG=-1 @@ -48,9 +48,10 @@ IOKitBSDInit.cpo_CXXWARNFLAGS_ADD += -Wno-missing-prototypes -Wno-documentation IOPMrootDomain.cpo_CXXWARNFLAGS_ADD += -Wno-missing-prototypes # -# Directories for mig generated files +# Directories for generated files # -COMP_SUBDIRS = +COMP_SUBDIRS = \ + DriverKit # # Make sure we don't remove this by accident if interrupted at the wrong @@ -96,9 +97,9 @@ $(COMPONENT).filelist: $(OBJS) $(SEG_HACK) -n __HIB -o $${hib_file}__ $${hib_file} || exit 1; \ mv $${hib_file}__ $${hib_file} || exit 1; \ done - @echo "$(ColorL)LDFILELIST$(Color0) $(ColorLF)$(COMPONENT)$(Color0)" + $(call makelog,$(ColorL)LDFILELIST$(Color0) $(ColorLF)$(COMPONENT)$(Color0)) $(_v)for obj in ${OBJS}; do \ - echo $(TARGET)/$(CURRENT_KERNEL_CONFIG)/$${obj}; \ + $(ECHO) $(TARGET)/$(CURRENT_KERNEL_CONFIG)/$${obj}; \ done > $(COMPONENT).filelist do_all: $(COMPONENT).filelist diff --git a/iokit/conf/files b/iokit/conf/files index 5af89b2b0..9b2578710 100644 --- a/iokit/conf/files +++ b/iokit/conf/files @@ -8,6 +8,21 @@ OPTIONS/crypto optional crypto OPTIONS/config_dtrace optional config_dtrace OPTIONS/mach_assert optional mach_assert +# iig generated implementation files + +./DriverKit/OSObject.iig.cpp optional iokitcpp +./DriverKit/OSAction.iig.cpp optional iokitcpp +./DriverKit/IOService.iig.cpp optional iokitcpp +./DriverKit/IOUserClient.iig.cpp optional iokitcpp +./DriverKit/IOMemoryDescriptor.iig.cpp optional iokitcpp +./DriverKit/IOBufferMemoryDescriptor.iig.cpp optional iokitcpp +./DriverKit/IOMemoryMap.iig.cpp optional iokitcpp +./DriverKit/IODispatchSource.iig.cpp optional iokitcpp +./DriverKit/IODispatchQueue.iig.cpp optional iokitcpp +./DriverKit/IOInterruptDispatchSource.iig.cpp optional iokitcpp +./DriverKit/IODataQueueDispatchSource.iig.cpp optional iokitcpp +./DriverKit/IOUserServer.iig.cpp optional iokitcpp + # libIOKit iokit/Kernel/IOHibernateIO.cpp optional hibernation @@ -89,6 +104,8 @@ iokit/Kernel/IORTC.cpp optional iokitcpp iokit/Kernel/IOStringFuncs.c standard +iokit/Kernel/IOUserServer.cpp optional iokitcpp + # Property tables for kernel-linked objects iokit/KernelConfigTables.cpp optional iokitcpp diff --git a/libkdd/kcdata.h b/libkdd/kcdata.h index 85cf4998b..f00a3be8f 100644 --- a/libkdd/kcdata.h +++ b/libkdd/kcdata.h @@ -436,45 +436,47 @@ struct kcdata_type_definition { * NOTE: Please update kcdata/libkdd/kcdtypes.c if you make any changes * in STACKSHOT_KCTYPE_* types. */ -#define STACKSHOT_KCTYPE_IOSTATS 0x901u /* io_stats_snapshot */ -#define STACKSHOT_KCTYPE_GLOBAL_MEM_STATS 0x902u /* struct mem_and_io_snapshot */ -#define STACKSHOT_KCCONTAINER_TASK 0x903u -#define STACKSHOT_KCCONTAINER_THREAD 0x904u -#define STACKSHOT_KCTYPE_TASK_SNAPSHOT 0x905u /* task_snapshot_v2 */ -#define STACKSHOT_KCTYPE_THREAD_SNAPSHOT 0x906u /* thread_snapshot_v2, thread_snapshot_v3 */ -#define STACKSHOT_KCTYPE_DONATING_PIDS 0x907u /* int[] */ -#define STACKSHOT_KCTYPE_SHAREDCACHE_LOADINFO 0x908u /* same as KCDATA_TYPE_LIBRARY_LOADINFO64 */ -#define STACKSHOT_KCTYPE_THREAD_NAME 0x909u /* char[] */ -#define STACKSHOT_KCTYPE_KERN_STACKFRAME 0x90Au /* struct stack_snapshot_frame32 */ -#define STACKSHOT_KCTYPE_KERN_STACKFRAME64 0x90Bu /* struct stack_snapshot_frame64 */ -#define STACKSHOT_KCTYPE_USER_STACKFRAME 0x90Cu /* struct stack_snapshot_frame32 */ -#define STACKSHOT_KCTYPE_USER_STACKFRAME64 0x90Du /* struct stack_snapshot_frame64 */ -#define STACKSHOT_KCTYPE_BOOTARGS 0x90Eu /* boot args string */ -#define STACKSHOT_KCTYPE_OSVERSION 0x90Fu /* os version string */ -#define STACKSHOT_KCTYPE_KERN_PAGE_SIZE 0x910u /* kernel page size in uint32_t */ -#define STACKSHOT_KCTYPE_JETSAM_LEVEL 0x911u /* jetsam level in uint32_t */ -#define STACKSHOT_KCTYPE_DELTA_SINCE_TIMESTAMP 0x912u /* timestamp used for the delta stackshot */ -#define STACKSHOT_KCTYPE_KERN_STACKLR 0x913u /* uint32_t */ -#define STACKSHOT_KCTYPE_KERN_STACKLR64 0x914u /* uint64_t */ -#define STACKSHOT_KCTYPE_USER_STACKLR 0x915u /* uint32_t */ -#define STACKSHOT_KCTYPE_USER_STACKLR64 0x916u /* uint64_t */ -#define STACKSHOT_KCTYPE_NONRUNNABLE_TIDS 0x917u /* uint64_t */ -#define STACKSHOT_KCTYPE_NONRUNNABLE_TASKS 0x918u /* uint64_t */ -#define STACKSHOT_KCTYPE_CPU_TIMES 0x919u /* struct stackshot_cpu_times or stackshot_cpu_times_v2 */ -#define STACKSHOT_KCTYPE_STACKSHOT_DURATION 0x91au /* struct stackshot_duration */ -#define STACKSHOT_KCTYPE_STACKSHOT_FAULT_STATS 0x91bu /* struct stackshot_fault_stats */ -#define STACKSHOT_KCTYPE_KERNELCACHE_LOADINFO 0x91cu /* kernelcache UUID -- same as KCDATA_TYPE_LIBRARY_LOADINFO64 */ -#define STACKSHOT_KCTYPE_THREAD_WAITINFO 0x91du /* struct stackshot_thread_waitinfo */ -#define STACKSHOT_KCTYPE_THREAD_GROUP_SNAPSHOT 0x91eu /* struct thread_group_snapshot or thread_group_snapshot_v2 */ -#define STACKSHOT_KCTYPE_THREAD_GROUP 0x91fu /* uint64_t */ -#define STACKSHOT_KCTYPE_JETSAM_COALITION_SNAPSHOT 0x920u /* struct jetsam_coalition_snapshot */ -#define STACKSHOT_KCTYPE_JETSAM_COALITION 0x921u /* uint64_t */ -#define STACKSHOT_KCTYPE_THREAD_POLICY_VERSION 0x922u /* THREAD_POLICY_INTERNAL_STRUCT_VERSION in uint32 */ -#define STACKSHOT_KCTYPE_INSTRS_CYCLES 0x923u /* struct instrs_cycles_snapshot */ -#define STACKSHOT_KCTYPE_USER_STACKTOP 0x924u /* struct stack_snapshot_stacktop */ -#define STACKSHOT_KCTYPE_ASID 0x925u /* uint32_t */ -#define STACKSHOT_KCTYPE_PAGE_TABLES 0x926u /* uint64_t */ -#define STACKSHOT_KCTYPE_SYS_SHAREDCACHE_LAYOUT 0x927u /* same as KCDATA_TYPE_LIBRARY_LOADINFO64 */ +#define STACKSHOT_KCTYPE_IOSTATS 0x901u /* io_stats_snapshot */ +#define STACKSHOT_KCTYPE_GLOBAL_MEM_STATS 0x902u /* struct mem_and_io_snapshot */ +#define STACKSHOT_KCCONTAINER_TASK 0x903u +#define STACKSHOT_KCCONTAINER_THREAD 0x904u +#define STACKSHOT_KCTYPE_TASK_SNAPSHOT 0x905u /* task_snapshot_v2 */ +#define STACKSHOT_KCTYPE_THREAD_SNAPSHOT 0x906u /* thread_snapshot_v2, thread_snapshot_v3 */ +#define STACKSHOT_KCTYPE_DONATING_PIDS 0x907u /* int[] */ +#define STACKSHOT_KCTYPE_SHAREDCACHE_LOADINFO 0x908u /* same as KCDATA_TYPE_LIBRARY_LOADINFO64 */ +#define STACKSHOT_KCTYPE_THREAD_NAME 0x909u /* char[] */ +#define STACKSHOT_KCTYPE_KERN_STACKFRAME 0x90Au /* struct stack_snapshot_frame32 */ +#define STACKSHOT_KCTYPE_KERN_STACKFRAME64 0x90Bu /* struct stack_snapshot_frame64 */ +#define STACKSHOT_KCTYPE_USER_STACKFRAME 0x90Cu /* struct stack_snapshot_frame32 */ +#define STACKSHOT_KCTYPE_USER_STACKFRAME64 0x90Du /* struct stack_snapshot_frame64 */ +#define STACKSHOT_KCTYPE_BOOTARGS 0x90Eu /* boot args string */ +#define STACKSHOT_KCTYPE_OSVERSION 0x90Fu /* os version string */ +#define STACKSHOT_KCTYPE_KERN_PAGE_SIZE 0x910u /* kernel page size in uint32_t */ +#define STACKSHOT_KCTYPE_JETSAM_LEVEL 0x911u /* jetsam level in uint32_t */ +#define STACKSHOT_KCTYPE_DELTA_SINCE_TIMESTAMP 0x912u /* timestamp used for the delta stackshot */ +#define STACKSHOT_KCTYPE_KERN_STACKLR 0x913u /* uint32_t */ +#define STACKSHOT_KCTYPE_KERN_STACKLR64 0x914u /* uint64_t */ +#define STACKSHOT_KCTYPE_USER_STACKLR 0x915u /* uint32_t */ +#define STACKSHOT_KCTYPE_USER_STACKLR64 0x916u /* uint64_t */ +#define STACKSHOT_KCTYPE_NONRUNNABLE_TIDS 0x917u /* uint64_t */ +#define STACKSHOT_KCTYPE_NONRUNNABLE_TASKS 0x918u /* uint64_t */ +#define STACKSHOT_KCTYPE_CPU_TIMES 0x919u /* struct stackshot_cpu_times or stackshot_cpu_times_v2 */ +#define STACKSHOT_KCTYPE_STACKSHOT_DURATION 0x91au /* struct stackshot_duration */ +#define STACKSHOT_KCTYPE_STACKSHOT_FAULT_STATS 0x91bu /* struct stackshot_fault_stats */ +#define STACKSHOT_KCTYPE_KERNELCACHE_LOADINFO 0x91cu /* kernelcache UUID -- same as KCDATA_TYPE_LIBRARY_LOADINFO64 */ +#define STACKSHOT_KCTYPE_THREAD_WAITINFO 0x91du /* struct stackshot_thread_waitinfo */ +#define STACKSHOT_KCTYPE_THREAD_GROUP_SNAPSHOT 0x91eu /* struct thread_group_snapshot or thread_group_snapshot_v2 */ +#define STACKSHOT_KCTYPE_THREAD_GROUP 0x91fu /* uint64_t */ +#define STACKSHOT_KCTYPE_JETSAM_COALITION_SNAPSHOT 0x920u /* struct jetsam_coalition_snapshot */ +#define STACKSHOT_KCTYPE_JETSAM_COALITION 0x921u /* uint64_t */ +#define STACKSHOT_KCTYPE_THREAD_POLICY_VERSION 0x922u /* THREAD_POLICY_INTERNAL_STRUCT_VERSION in uint32 */ +#define STACKSHOT_KCTYPE_INSTRS_CYCLES 0x923u /* struct instrs_cycles_snapshot */ +#define STACKSHOT_KCTYPE_USER_STACKTOP 0x924u /* struct stack_snapshot_stacktop */ +#define STACKSHOT_KCTYPE_ASID 0x925u /* uint32_t */ +#define STACKSHOT_KCTYPE_PAGE_TABLES 0x926u /* uint64_t */ +#define STACKSHOT_KCTYPE_SYS_SHAREDCACHE_LAYOUT 0x927u /* same as KCDATA_TYPE_LIBRARY_LOADINFO64 */ +#define STACKSHOT_KCTYPE_THREAD_DISPATCH_QUEUE_LABEL 0x928u /* dispatch queue label */ +#define STACKSHOT_KCTYPE_THREAD_TURNSTILEINFO 0x929u /* struct stackshot_thread_turnstileinfo */ #define STACKSHOT_KCTYPE_TASK_DELTA_SNAPSHOT 0x940u /* task_delta_snapshot_v2 */ #define STACKSHOT_KCTYPE_THREAD_DELTA_SNAPSHOT 0x941u /* thread_delta_snapshot_v* */ @@ -517,6 +519,7 @@ struct user64_dyld_uuid_info { }; enum task_snapshot_flags { + /* k{User,Kernel}64_p (values 0x1 and 0x2) are defined in generic_snapshot_flags */ kTaskRsrcFlagged = 0x4, // In the EXC_RESOURCE danger zone? kTerminatedSnapshot = 0x8, kPidSuspended = 0x10, // true for suspended task @@ -546,6 +549,7 @@ enum task_snapshot_flags { }; enum thread_snapshot_flags { + /* k{User,Kernel}64_p (values 0x1 and 0x2) are defined in generic_snapshot_flags */ kHasDispatchSerial = 0x4, kStacksPCOnly = 0x8, /* Stack traces have no frame pointers. */ kThreadDarwinBG = 0x10, /* Thread is darwinbg */ @@ -814,6 +818,18 @@ typedef struct stackshot_thread_waitinfo { uint8_t wait_type; /* The type of object that the thread is waiting on */ } __attribute__((packed)) thread_waitinfo_t; +typedef struct stackshot_thread_turnstileinfo { + uint64_t waiter; /* The thread that's waiting on the object */ + uint64_t turnstile_context; /* Associated data (either thread id, or workq addr) */ + uint8_t turnstile_priority; + uint8_t number_of_hops; +#define STACKSHOT_TURNSTILE_STATUS_UNKNOWN (1 << 0) /* The final inheritor is unknown (bug?) */ +#define STACKSHOT_TURNSTILE_STATUS_LOCKED_WAITQ (1 << 1) /* A waitq was found to be locked */ +#define STACKSHOT_TURNSTILE_STATUS_WORKQUEUE (1 << 2) /* The final inheritor is a workqueue */ +#define STACKSHOT_TURNSTILE_STATUS_THREAD (1 << 3) /* The final inheritor is a thread */ + uint64_t turnstile_flags; +} __attribute__((packed)) thread_turnstileinfo_t; + #define STACKSHOT_WAITOWNER_KERNEL (UINT64_MAX - 1) #define STACKSHOT_WAITOWNER_PORT_LOCKED (UINT64_MAX - 2) #define STACKSHOT_WAITOWNER_PSET_LOCKED (UINT64_MAX - 3) @@ -895,6 +911,8 @@ struct crashinfo_proc_uniqidentifierinfo { #define TASK_CRASHINFO_LEDGER_NETWORK_NONVOLATILE 0x828 /* uint64_t */ #define TASK_CRASHINFO_LEDGER_NETWORK_NONVOLATILE_COMPRESSED 0x829 /* uint64_t */ #define TASK_CRASHINFO_LEDGER_WIRED_MEM 0x82A /* uint64_t */ +#define TASK_CRASHINFO_PROC_PERSONA_ID 0x82B /* uid_t */ +#define TASK_CRASHINFO_MEMORY_LIMIT_INCREASE 0x82C /* uint32_t */ @@ -971,7 +989,7 @@ kcdata_iter_unsafe(void *buffer) return iter; } -static const kcdata_iter_t kcdata_invalid_iter = { .item = 0, .end = 0 }; +static const kcdata_iter_t kcdata_invalid_iter = { .item = NULL, .end = NULL }; static inline int diff --git a/libkdd/kcdtypes.c b/libkdd/kcdtypes.c index e3ef22e33..cafecb641 100644 --- a/libkdd/kcdtypes.c +++ b/libkdd/kcdtypes.c @@ -832,6 +832,24 @@ kcdata_get_typedescription(unsigned type_id, uint8_t * buffer, uint32_t buffer_s break; } + case STACKSHOT_KCTYPE_THREAD_DISPATCH_QUEUE_LABEL: { + i = 0; + _STRINGTYPE("dispatch_queue_label"); + setup_type_definition(retval, type_id, i, "dispatch_queue_label"); + break; + } + + case STACKSHOT_KCTYPE_THREAD_TURNSTILEINFO: { + i = 0; + _SUBTYPE(KC_ST_UINT64, struct stackshot_thread_turnstileinfo, waiter); + _SUBTYPE(KC_ST_UINT64, struct stackshot_thread_turnstileinfo, turnstile_context); + _SUBTYPE(KC_ST_UINT8, struct stackshot_thread_turnstileinfo, turnstile_priority); + _SUBTYPE(KC_ST_UINT8, struct stackshot_thread_turnstileinfo, number_of_hops); + _SUBTYPE(KC_ST_UINT64, struct stackshot_thread_turnstileinfo, turnstile_flags); + setup_type_definition(retval, type_id, i, "thread_turnstileinfo"); + break; + } + default: retval = NULL; break; diff --git a/libkdd/kdd.xcodeproj/project.pbxproj b/libkdd/kdd.xcodeproj/project.pbxproj index a16b8bdca..33575c38d 100644 --- a/libkdd/kdd.xcodeproj/project.pbxproj +++ b/libkdd/kdd.xcodeproj/project.pbxproj @@ -69,6 +69,7 @@ 08B4808C1BF9474A00B4AAE0 /* corpse-sample.plist.gz in Resources */ = {isa = PBXBuildFile; fileRef = 08B4808A1BF9473800B4AAE0 /* corpse-sample.plist.gz */; }; 08B9297E1C1CCE8D003B1703 /* stackshot-sample-ths-thread-t in Resources */ = {isa = PBXBuildFile; fileRef = 08B9297C1C1CCE7E003B1703 /* stackshot-sample-ths-thread-t */; }; 08B9297F1C1CCE8D003B1703 /* stackshot-sample-ths-thread-t.plist.gz in Resources */ = {isa = PBXBuildFile; fileRef = 08B9297D1C1CCE7E003B1703 /* stackshot-sample-ths-thread-t.plist.gz */; }; + 08C3972F204E0A7C00BDDB3F /* xnu.libkdd.plist in CopyFiles */ = {isa = PBXBuildFile; fileRef = 08C3972E204E0A7500BDDB3F /* xnu.libkdd.plist */; }; 08C9D83D1BFFF8E100DF6C05 /* exitreason-sample in Resources */ = {isa = PBXBuildFile; fileRef = 08C9D83B1BFFF8D500DF6C05 /* exitreason-sample */; }; 08C9D83E1BFFF8E100DF6C05 /* exitreason-sample.plist.gz in Resources */ = {isa = PBXBuildFile; fileRef = 08C9D83C1BFFF8D500DF6C05 /* exitreason-sample.plist.gz */; }; 08CF18FF1BF9B7B100D05813 /* stackshot-sample-tailspin in Resources */ = {isa = PBXBuildFile; fileRef = 08CF18FD1BF9B79E00D05813 /* stackshot-sample-tailspin */; }; @@ -112,6 +113,12 @@ C9DCEF011F01C3810000BD02 /* stackshot-sample-instrs-cycles in Resources */ = {isa = PBXBuildFile; fileRef = C9DCEF001F01C3790000BD02 /* stackshot-sample-instrs-cycles */; }; C9DCEF021F01C3810000BD02 /* stackshot-sample-instrs-cycles.plist.gz in Resources */ = {isa = PBXBuildFile; fileRef = C9DCEEFF1F01C3790000BD02 /* stackshot-sample-instrs-cycles.plist.gz */; }; C9DE39141ACB5A540020F4A3 /* kcdata_core.m in Sources */ = {isa = PBXBuildFile; fileRef = C9DE39131ACB5A540020F4A3 /* kcdata_core.m */; }; + F702EC6422AE50DD00FDCF74 /* stackshot-sample-dispatch-queue-label in Resources */ = {isa = PBXBuildFile; fileRef = F702EC6322AE500E00FDCF74 /* stackshot-sample-dispatch-queue-label */; }; + F702EC6522AE50DD00FDCF74 /* stackshot-sample-dispatch-queue-label.plist.gz in Resources */ = {isa = PBXBuildFile; fileRef = F702EC6222AE500E00FDCF74 /* stackshot-sample-dispatch-queue-label.plist.gz */; }; + F7C20D3322A168620002AD06 /* stackshot-sample-asid-pagetable in Resources */ = {isa = PBXBuildFile; fileRef = F7C20D3122A168610002AD06 /* stackshot-sample-asid-pagetable */; }; + F7C20D3422A168620002AD06 /* stackshot-sample-asid-pagetable.plist.gz in Resources */ = {isa = PBXBuildFile; fileRef = F7C20D3222A168610002AD06 /* stackshot-sample-asid-pagetable.plist.gz */; }; + F7F2F28422A159F700542597 /* stackshot-sample-turnstileinfo in Resources */ = {isa = PBXBuildFile; fileRef = F7F2F28222A159F700542597 /* stackshot-sample-turnstileinfo */; }; + F7F2F28522A159F700542597 /* stackshot-sample-turnstileinfo.plist.gz in Resources */ = {isa = PBXBuildFile; fileRef = F7F2F28322A159F700542597 /* stackshot-sample-turnstileinfo.plist.gz */; }; /* End PBXBuildFile section */ /* Begin PBXContainerItemProxy section */ @@ -148,6 +155,16 @@ ); runOnlyForDeploymentPostprocessing = 1; }; + 08C3972D204E0A5300BDDB3F /* CopyFiles */ = { + isa = PBXCopyFilesBuildPhase; + buildActionMask = 8; + dstPath = /AppleInternal/CoreOS/BATS/unit_tests; + dstSubfolderSpec = 0; + files = ( + 08C3972F204E0A7C00BDDB3F /* xnu.libkdd.plist in CopyFiles */, + ); + runOnlyForDeploymentPostprocessing = 1; + }; /* End PBXCopyFilesBuildPhase section */ /* Begin PBXFileReference section */ @@ -197,6 +214,7 @@ 08B4808A1BF9473800B4AAE0 /* corpse-sample.plist.gz */ = {isa = PBXFileReference; lastKnownFileType = archive.gzip; name = "corpse-sample.plist.gz"; path = "tests/corpse-sample.plist.gz"; sourceTree = SOURCE_ROOT; }; 08B9297C1C1CCE7E003B1703 /* stackshot-sample-ths-thread-t */ = {isa = PBXFileReference; lastKnownFileType = file; name = "stackshot-sample-ths-thread-t"; path = "tests/stackshot-sample-ths-thread-t"; sourceTree = SOURCE_ROOT; }; 08B9297D1C1CCE7E003B1703 /* stackshot-sample-ths-thread-t.plist.gz */ = {isa = PBXFileReference; lastKnownFileType = archive.gzip; name = "stackshot-sample-ths-thread-t.plist.gz"; path = "tests/stackshot-sample-ths-thread-t.plist.gz"; sourceTree = SOURCE_ROOT; }; + 08C3972E204E0A7500BDDB3F /* xnu.libkdd.plist */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = text.plist.xml; path = xnu.libkdd.plist; sourceTree = ""; }; 08C9D83B1BFFF8D500DF6C05 /* exitreason-sample */ = {isa = PBXFileReference; lastKnownFileType = file; name = "exitreason-sample"; path = "tests/exitreason-sample"; sourceTree = SOURCE_ROOT; }; 08C9D83C1BFFF8D500DF6C05 /* exitreason-sample.plist.gz */ = {isa = PBXFileReference; lastKnownFileType = archive.gzip; name = "exitreason-sample.plist.gz"; path = "tests/exitreason-sample.plist.gz"; sourceTree = SOURCE_ROOT; }; 08CF18FD1BF9B79E00D05813 /* stackshot-sample-tailspin */ = {isa = PBXFileReference; lastKnownFileType = file; name = "stackshot-sample-tailspin"; path = "tests/stackshot-sample-tailspin"; sourceTree = SOURCE_ROOT; }; @@ -241,6 +259,12 @@ C9DCEEFF1F01C3790000BD02 /* stackshot-sample-instrs-cycles.plist.gz */ = {isa = PBXFileReference; lastKnownFileType = archive.gzip; path = "stackshot-sample-instrs-cycles.plist.gz"; sourceTree = ""; }; C9DCEF001F01C3790000BD02 /* stackshot-sample-instrs-cycles */ = {isa = PBXFileReference; lastKnownFileType = file; path = "stackshot-sample-instrs-cycles"; sourceTree = ""; }; C9DE39131ACB5A540020F4A3 /* kcdata_core.m */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.objc; path = kcdata_core.m; sourceTree = ""; }; + F702EC6222AE500E00FDCF74 /* stackshot-sample-dispatch-queue-label.plist.gz */ = {isa = PBXFileReference; lastKnownFileType = archive.gzip; path = "stackshot-sample-dispatch-queue-label.plist.gz"; sourceTree = ""; }; + F702EC6322AE500E00FDCF74 /* stackshot-sample-dispatch-queue-label */ = {isa = PBXFileReference; lastKnownFileType = file; path = "stackshot-sample-dispatch-queue-label"; sourceTree = ""; }; + F7C20D3122A168610002AD06 /* stackshot-sample-asid-pagetable */ = {isa = PBXFileReference; lastKnownFileType = file; path = "stackshot-sample-asid-pagetable"; sourceTree = ""; }; + F7C20D3222A168610002AD06 /* stackshot-sample-asid-pagetable.plist.gz */ = {isa = PBXFileReference; lastKnownFileType = archive.gzip; path = "stackshot-sample-asid-pagetable.plist.gz"; sourceTree = ""; }; + F7F2F28222A159F700542597 /* stackshot-sample-turnstileinfo */ = {isa = PBXFileReference; lastKnownFileType = file; path = "stackshot-sample-turnstileinfo"; sourceTree = ""; }; + F7F2F28322A159F700542597 /* stackshot-sample-turnstileinfo.plist.gz */ = {isa = PBXFileReference; lastKnownFileType = archive.gzip; path = "stackshot-sample-turnstileinfo.plist.gz"; sourceTree = ""; }; /* End PBXFileReference section */ /* Begin PBXFrameworksBuildPhase section */ @@ -283,6 +307,10 @@ 08603F351BF69EDE007D3784 /* tests */ = { isa = PBXGroup; children = ( + F7F2F28222A159F700542597 /* stackshot-sample-turnstileinfo */, + F7F2F28322A159F700542597 /* stackshot-sample-turnstileinfo.plist.gz */, + F702EC6322AE500E00FDCF74 /* stackshot-sample-dispatch-queue-label */, + F702EC6222AE500E00FDCF74 /* stackshot-sample-dispatch-queue-label.plist.gz */, 13739E8420DB18B600D8D9B9 /* stackshot-with-shared-cache-layout */, 13739E8320DB18B500D8D9B9 /* stackshot-with-shared-cache-layout.plist.gz */, C95E4D19204F42C500FD2229 /* stackshot-sample-cpu-times */, @@ -324,6 +352,8 @@ 08603F3F1BF69F44007D3784 /* kdd_bridge.h */, 0843EE911BF6AFB700CD4150 /* stackshot-sample */, 08603F361BF69EDE007D3784 /* Tests.swift */, + F7C20D3122A168610002AD06 /* stackshot-sample-asid-pagetable */, + F7C20D3222A168610002AD06 /* stackshot-sample-asid-pagetable.plist.gz */, 0843EE931BF6BAB400CD4150 /* stackshot-sample.plist.gz */, 08B480741BF8294E00B4AAE0 /* stackshot-sample-new-arrays */, 08B480751BF8294E00B4AAE0 /* stackshot-sample-new-arrays.plist.gz */, @@ -408,6 +438,7 @@ C91C93BE1ACB58B700119B60 = { isa = PBXGroup; children = ( + 08C3972E204E0A7500BDDB3F /* xnu.libkdd.plist */, 08DE68361BFFB71D00BC682F /* kdd */, 08DE68351BFFB70900BC682F /* libkdd */, 08F1501D1BFEA7AC00F2C89C /* libz.dylib */, @@ -465,6 +496,7 @@ 08603F301BF69EDE007D3784 /* Sources */, 08603F311BF69EDE007D3784 /* Frameworks */, 08603F321BF69EDE007D3784 /* Resources */, + 08C3972D204E0A5300BDDB3F /* CopyFiles */, ); buildRules = ( ); @@ -535,6 +567,7 @@ C91C93BF1ACB58B700119B60 /* Project object */ = { isa = PBXProject; attributes = { + DefaultBuildSystemTypeForWorkspace = Latest; LastSwiftUpdateCheck = 0730; LastUpgradeCheck = 0830; ORGANIZATIONNAME = "Vishal Patel"; @@ -564,6 +597,7 @@ developmentRegion = English; hasScannedForEncodings = 0; knownRegions = ( + English, en, ); mainGroup = C91C93BE1ACB58B700119B60; @@ -585,6 +619,7 @@ isa = PBXResourcesBuildPhase; buildActionMask = 2147483647; files = ( + F7F2F28422A159F700542597 /* stackshot-sample-turnstileinfo in Resources */, 13739E8620DB18B600D8D9B9 /* stackshot-with-shared-cache-layout in Resources */, 084422F82048BABB008A085B /* stackshot-sample-asid in Resources */, 084422F92048BABB008A085B /* stackshot-sample-asid.plist.gz in Resources */, @@ -593,6 +628,7 @@ 18C577C31F96DB5200C67EB3 /* stackshot-sample-thread-groups-flags in Resources */, C9DCEF011F01C3810000BD02 /* stackshot-sample-instrs-cycles in Resources */, C9DCEF021F01C3810000BD02 /* stackshot-sample-instrs-cycles.plist.gz in Resources */, + F7F2F28522A159F700542597 /* stackshot-sample-turnstileinfo.plist.gz in Resources */, 088C36E01EF323C300ABB2E0 /* stackshot-sample-thread-policy in Resources */, 088C36E11EF323C300ABB2E0 /* stackshot-sample-thread-policy.plist.gz in Resources */, 045F7F131D2ADE8000B4808B /* stackshot-with-waitinfo.plist.gz in Resources */, @@ -605,6 +641,7 @@ 13D6C5D01C4DDDB6005E617C /* corpse-twr-sample in Resources */, C95E4D1B204F42C500FD2229 /* stackshot-sample-cpu-times in Resources */, C9D7B5401D1B41D700F1019D /* xnupost_testconfig-sample in Resources */, + F7C20D3322A168620002AD06 /* stackshot-sample-asid-pagetable in Resources */, 081725D51C3F476500371A54 /* stackshot-sample-duration in Resources */, 08F2AC0B1FA136EB00271A11 /* stackshot-sample-delta-thread-policy.plist.gz in Resources */, 081725D61C3F476500371A54 /* stackshot-sample-duration.plist.gz in Resources */, @@ -625,8 +662,10 @@ 13DBA2681CAB1AD600227EB2 /* stackshot-sample-sharedcachev2.plist.gz in Resources */, 08CF19001BF9B7B100D05813 /* stackshot-sample-tailspin.plist.gz in Resources */, 13CC08441CB97F8D00EA6069 /* stackshot-fault-stats in Resources */, + F702EC6522AE50DD00FDCF74 /* stackshot-sample-dispatch-queue-label.plist.gz in Resources */, 13F3DA9C1C7C1BEE00ACFFCC /* corpse-twr-sample-v2 in Resources */, 13D6C5D31C4DDE0D005E617C /* test-twr-sample.plist.gz in Resources */, + F7C20D3422A168620002AD06 /* stackshot-sample-asid-pagetable.plist.gz in Resources */, 1862B0351E7A083F0005ADF4 /* stackshot-sample-thread-groups.plist.gz in Resources */, 1368F0851C87E06A00940FC6 /* exitreason-codesigning.plist.gz in Resources */, 08C9D83D1BFFF8E100DF6C05 /* exitreason-sample in Resources */, @@ -647,6 +686,7 @@ 08B4807A1BF8297500B4AAE0 /* stackshot-sample-old-arrays in Resources */, 08B4807B1BF8297500B4AAE0 /* stackshot-sample-old-arrays.plist.gz in Resources */, 0843EE941BF6BAC100CD4150 /* stackshot-sample.plist.gz in Resources */, + F702EC6422AE50DD00FDCF74 /* stackshot-sample-dispatch-queue-label in Resources */, 0843EE921BF6AFC600CD4150 /* stackshot-sample in Resources */, 08AD0BF11FBE370000CB41B2 /* stackshot-sample-stacktop.plist.gz in Resources */, 18E592991E9451A20018612A /* stackshot-sample-coalitions.plist.gz in Resources */, diff --git a/libkdd/tests/Tests.swift b/libkdd/tests/Tests.swift index e073f85a4..34f8d9e8e 100644 --- a/libkdd/tests/Tests.swift +++ b/libkdd/tests/Tests.swift @@ -734,6 +734,66 @@ class Tests: XCTestCase { XCTAssert(dict.value(forKeyPath: "kcdata_crashinfo.task_snapshots.0.crashed_threadid") as? Int == 42) } + func testDispatchQueueLabel() { + let buffer = NSMutableData(capacity:1000)! + + var item = kcdata_item() + let dql = "houston.we.had.a.problem" + var payload32 : UInt32 + + item.type = KCDATA_BUFFER_BEGIN_STACKSHOT + item.flags = 0 + item.size = 0 + buffer.append(&item, length: MemoryLayout.size) + + item.type = UInt32(KCDATA_TYPE_CONTAINER_BEGIN) + item.flags = 0 + item.size = UInt32(MemoryLayout.size) + buffer.append(&item, length: MemoryLayout.size) + payload32 = UInt32(STACKSHOT_KCCONTAINER_TASK) + buffer.append(&payload32, length:MemoryLayout.size) + + item.type = UInt32(KCDATA_TYPE_CONTAINER_BEGIN) + item.flags = 0 + item.size = UInt32(MemoryLayout.size) + buffer.append(&item, length: MemoryLayout.size) + payload32 = UInt32(STACKSHOT_KCCONTAINER_THREAD) + buffer.append(&payload32, length:MemoryLayout.size) + + item.type = UInt32(STACKSHOT_KCTYPE_THREAD_DISPATCH_QUEUE_LABEL) + item.flags = 0 + item.size = UInt32(dql.utf8.count + 1) + buffer.append(&item, length: MemoryLayout.size) + dql.utf8CString.withUnsafeBufferPointer({ + buffer.append($0.baseAddress!, length:dql.utf8.count + 1) + }) + + item.type = UInt32(KCDATA_TYPE_CONTAINER_END) + item.flags = 0 + item.size = UInt32(MemoryLayout.size) + buffer.append(&item, length: MemoryLayout.size) + payload32 = UInt32(STACKSHOT_KCCONTAINER_THREAD) + buffer.append(&payload32, length:MemoryLayout.size) + + item.type = UInt32(KCDATA_TYPE_CONTAINER_END) + item.flags = 0 + item.size = UInt32(MemoryLayout.size) + buffer.append(&item, length: MemoryLayout.size) + payload32 = UInt32(STACKSHOT_KCCONTAINER_TASK) + buffer.append(&payload32, length:MemoryLayout.size) + + + item.type = KCDATA_TYPE_BUFFER_END + item.flags = 0 + item.size = 0 + buffer.append(&item, length: MemoryLayout.size) + + guard let dict = try? self.parseBuffer(buffer) + else { XCTFail(); return; } + + XCTAssert(dict.value(forKeyPath: "kcdata_stackshot.task_snapshots.0.thread_snapshots.0.dispatch_queue_label") as? String == dql) + } + func testRepeatedContainer() { //repeated container of same name and key shoudl fail @@ -1348,6 +1408,10 @@ class Tests: XCTestCase { self.testSampleStackshot("stackshot-sample-coalitions") } + func testSampleTurnstileInfo() { + self.testSampleStackshot("stackshot-sample-turnstileinfo") + } + func testStackshotSharedcacheV2() { self.testSampleStackshot("stackshot-sample-sharedcachev2") } @@ -1400,6 +1464,10 @@ class Tests: XCTestCase { self.testSampleStackshot("stackshot-with-shared-cache-layout") } + func testStackshotDispatchQueueLabel() { + self.testSampleStackshot("stackshot-sample-dispatch-queue-label") + } + func testTrivial() { } } diff --git a/libkdd/tests/kdd_bridge.h b/libkdd/tests/kdd_bridge.h index d6691bafb..fb2f48487 100644 --- a/libkdd/tests/kdd_bridge.h +++ b/libkdd/tests/kdd_bridge.h @@ -9,8 +9,8 @@ #ifndef kdd_bridge_h #define kdd_bridge_h -#import "kdd.h" -#include "kcdata.h" +#include +#include #include #endif /* kdd_bridge_h */ diff --git a/libkdd/tests/stackshot-sample-dispatch-queue-label b/libkdd/tests/stackshot-sample-dispatch-queue-label new file mode 100644 index 000000000..b57d3fb41 Binary files /dev/null and b/libkdd/tests/stackshot-sample-dispatch-queue-label differ diff --git a/libkdd/tests/stackshot-sample-dispatch-queue-label.plist.gz b/libkdd/tests/stackshot-sample-dispatch-queue-label.plist.gz new file mode 100644 index 000000000..2d9157762 Binary files /dev/null and b/libkdd/tests/stackshot-sample-dispatch-queue-label.plist.gz differ diff --git a/libkdd/tests/stackshot-sample-turnstileinfo b/libkdd/tests/stackshot-sample-turnstileinfo new file mode 100644 index 000000000..4af0879cb Binary files /dev/null and b/libkdd/tests/stackshot-sample-turnstileinfo differ diff --git a/libkdd/tests/stackshot-sample-turnstileinfo.plist.gz b/libkdd/tests/stackshot-sample-turnstileinfo.plist.gz new file mode 100644 index 000000000..76c66172d Binary files /dev/null and b/libkdd/tests/stackshot-sample-turnstileinfo.plist.gz differ diff --git a/libkdd/xnu.libkdd.plist b/libkdd/xnu.libkdd.plist new file mode 100644 index 000000000..d6c614926 --- /dev/null +++ b/libkdd/xnu.libkdd.plist @@ -0,0 +1,35 @@ + + + + + Project + libkdd + + RadarComponents + + Name + xnu + Version + debugging + + + + Tests + + + TestName + libkdd_tests + AsRoot + + Arch + platform-native + Command + + BATS_XCTEST_CMD + /AppleInternal/XCTests/com.apple.libkdd/tests.xctest + + + + + + diff --git a/libkern/OSKextLib.cpp b/libkern/OSKextLib.cpp index 4fff0ba81..5a887bc79 100644 --- a/libkern/OSKextLib.cpp +++ b/libkern/OSKextLib.cpp @@ -190,9 +190,12 @@ OSKextCancelRequest( #pragma mark MIG Functions & Wrappers #endif /********************************************************************* -* IMPORTANT: Once we have done the vm_map_copyout(), we *must* return -* KERN_SUCCESS or the kernel map gets messed up (reason as yet -* unknown). We use op_result to return the real result of our work. +* IMPORTANT: vm_map_copyout_size() consumes the requestIn copy +* object on success. Therefore once it has been invoked successfully, +* this routine *must* return KERN_SUCCESS, regardless of our actual +* result. Our contract with the caller is that requestIn must be +* caller-deallocated if we return an error. We use op_result to return +* the real result of our work. *********************************************************************/ kern_return_t kext_request( @@ -222,9 +225,9 @@ kext_request( * just in case, or MIG will try to copy out bogus data. */ *op_result = KERN_FAILURE; - *responseOut = NULL; + *responseOut = 0; *responseLengthOut = 0; - *logDataOut = NULL; + *logDataOut = 0; *logDataLengthOut = 0; /* Check for input. Don't discard what isn't there, though. @@ -238,17 +241,17 @@ kext_request( goto finish; } - /* Once we have done the vm_map_copyout(), we *must* return KERN_SUCCESS - * or the kernel map gets messed up (reason as yet unknown). We will use - * op_result to return the real result of our work. - */ - result = vm_map_copyout(kernel_map, &map_addr, (vm_map_copy_t)requestIn); + result = vm_map_copyout_size(kernel_map, &map_addr, (vm_map_copy_t)requestIn, requestLengthIn); if (result != KERN_SUCCESS) { OSKextLog(/* kext */ NULL, kOSKextLogErrorLevel | kOSKextLogIPCFlag, "vm_map_copyout() failed for request from user space."); - vm_map_copy_discard((vm_map_copy_t)requestIn); + /* + * If we return an error it is our caller's responsibility to + * deallocate the requestIn copy object, so do not deallocate it + * here. See comment above. + */ goto finish; } request = CAST_DOWN(char *, map_addr); @@ -314,7 +317,7 @@ kext_request( kOSKextLogIPCFlag, "Failed to copy response to request from user space."); *op_result = copyin_result; // xxx - should we map to our own code? - *responseOut = NULL; + *responseOut = 0; *responseLengthOut = 0; goto finish; } @@ -334,7 +337,7 @@ kext_request( kOSKextLogIPCFlag, "Failed to copy log data for request from user space."); *op_result = copyin_result; // xxx - should we map to our own code? - *logDataOut = NULL; + *logDataOut = 0; *logDataLengthOut = 0; goto finish; } @@ -392,7 +395,7 @@ kext_weak_symbol_referenced(void) panic("A kext referenced an unresolved weak symbol\n"); } -const void *gOSKextUnresolved = (const void *)&kext_weak_symbol_referenced; +const void * const gOSKextUnresolved = (const void *)&kext_weak_symbol_referenced; #if PRAGMA_MARK #pragma mark Kernel-Internal C Functions diff --git a/libkern/c++/OSArray.cpp b/libkern/c++/OSArray.cpp index 92558ac4c..cd04323f3 100644 --- a/libkern/c++/OSArray.cpp +++ b/libkern/c++/OSArray.cpp @@ -133,7 +133,7 @@ OSArray::withCapacity(unsigned int capacity) if (me && !me->initWithCapacity(capacity)) { me->release(); - return 0; + return NULL; } return me; @@ -148,7 +148,7 @@ OSArray::withObjects(const OSObject *objects[], if (me && !me->initWithObjects(objects, count, capacity)) { me->release(); - return 0; + return NULL; } return me; @@ -162,7 +162,7 @@ OSArray::withArray(const OSArray *array, if (me && !me->initWithArray(array, capacity)) { me->release(); - return 0; + return NULL; } return me; @@ -403,7 +403,7 @@ OSObject * OSArray::getObject(unsigned int index) const { if (index >= count) { - return 0; + return NULL; } else { return (OSObject *) (const_cast(array[index])); } @@ -413,7 +413,7 @@ OSObject * OSArray::getLastObject() const { if (count == 0) { - return 0; + return NULL; } else { return (OSObject *) (const_cast(array[count - 1])); } @@ -457,7 +457,7 @@ OSArray::getNextObjectForIterator(void *inIterator, OSObject **ret) const *ret = (OSObject *)(const_cast (array[index])); return true; } else { - *ret = 0; + *ret = NULL; return false; } } @@ -503,13 +503,13 @@ OSCollection * OSArray::copyCollection(OSDictionary *cycleDict) { bool allocDict = !cycleDict; - OSCollection *ret = 0; - OSArray *newArray = 0; + OSCollection *ret = NULL; + OSArray *newArray = NULL; if (allocDict) { cycleDict = OSDictionary::withCapacity(16); if (!cycleDict) { - return 0; + return NULL; } } @@ -546,7 +546,7 @@ OSArray::copyCollection(OSDictionary *cycleDict) ; ret = newArray; - newArray = 0; + newArray = NULL; } while (false); abortCopy: diff --git a/libkern/c++/OSBoolean.cpp b/libkern/c++/OSBoolean.cpp index cabe30ab7..6918e6954 100644 --- a/libkern/c++/OSBoolean.cpp +++ b/libkern/c++/OSBoolean.cpp @@ -44,8 +44,8 @@ OSMetaClassDefineReservedUnused(OSBoolean, 5); OSMetaClassDefineReservedUnused(OSBoolean, 6); OSMetaClassDefineReservedUnused(OSBoolean, 7); -static OSBoolean * gOSBooleanTrue = 0; -static OSBoolean * gOSBooleanFalse = 0; +static OSBoolean * gOSBooleanTrue = NULL; +static OSBoolean * gOSBooleanFalse = NULL; OSBoolean * const & kOSBooleanTrue = gOSBooleanTrue; OSBoolean * const & kOSBooleanFalse = gOSBooleanFalse; diff --git a/libkern/c++/OSCollectionIterator.cpp b/libkern/c++/OSCollectionIterator.cpp index 93a2433e6..cc60901c8 100644 --- a/libkern/c++/OSCollectionIterator.cpp +++ b/libkern/c++/OSCollectionIterator.cpp @@ -45,7 +45,7 @@ OSCollectionIterator::initWithCollection(const OSCollection *inColl) inColl->retain(); collection = inColl; - collIterator = 0; + collIterator = NULL; initialUpdateStamp = 0; valid = false; @@ -59,7 +59,7 @@ OSCollectionIterator::withCollection(const OSCollection *inColl) if (me && !me->initWithCollection(inColl)) { me->release(); - return 0; + return NULL; } return me; @@ -71,12 +71,12 @@ OSCollectionIterator::free() if (collIterator) { kfree(collIterator, collection->iteratorSize()); OSCONTAINER_ACCUMSIZE(-((size_t) collection->iteratorSize())); - collIterator = 0; + collIterator = NULL; } if (collection) { collection->release(); - collection = 0; + collection = NULL; } super::free(); @@ -128,9 +128,9 @@ OSCollectionIterator::getNextObject() bool retVal; if (!isValid()) { - return 0; + return NULL; } retVal = collection->getNextObjectForIterator(collIterator, &retObj); - return (retVal)? retObj : 0; + return (retVal)? retObj : NULL; } diff --git a/libkern/c++/OSCompat.cpp b/libkern/c++/OSCompat.cpp new file mode 100644 index 000000000..b0fd9156f --- /dev/null +++ b/libkern/c++/OSCompat.cpp @@ -0,0 +1,54 @@ +/* + * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +/* + * Compatibility definitions for I/O Kit smart pointers + */ + +#define LIBKERN_SMART_POINTERS + +#include +#include + +extern OSObjectPtr +OSUnserialize(const char *buffer, LIBKERN_RETURNS_RETAINED_ON_ZERO OSString **errorString); + +OSObjectPtr +OSUnserialize(const char *buffer, OSStringPtr *errorString) +{ + return OSUnserialize(buffer, OSOutPtr(errorString)); +} + +extern OSObjectPtr +OSUnserializeXML(const char *buffer, LIBKERN_RETURNS_RETAINED_ON_ZERO OSString **errorString); + +OSObjectPtr +OSUnserializeXML(const char *buffer, OSStringPtr *errorString) +{ + return OSUnserializeXML(buffer, OSOutPtr(errorString)); +} diff --git a/libkern/c++/OSData.cpp b/libkern/c++/OSData.cpp index 92daa9c1e..082711312 100644 --- a/libkern/c++/OSData.cpp +++ b/libkern/c++/OSData.cpp @@ -33,6 +33,8 @@ __BEGIN_DECLS #include __END_DECLS +#define LIBKERN_SMART_POINTERS + #include #include #include @@ -65,7 +67,7 @@ OSData::initWithCapacity(unsigned int inCapacity) } else { kmem_free(kernel_map, (vm_offset_t)data, capacity); } - data = 0; + data = NULL; capacity = 0; } } @@ -153,66 +155,61 @@ OSData::initWithData(const OSData *inData, } } -OSData * +OSDataPtr OSData::withCapacity(unsigned int inCapacity) { - OSData *me = new OSData; + OSDataPtr me = OSDataPtr::alloc(); if (me && !me->initWithCapacity(inCapacity)) { - me->release(); - return 0; + return nullptr; } return me; } -OSData * +OSDataPtr OSData::withBytes(const void *bytes, unsigned int inLength) { - OSData *me = new OSData; + OSDataPtr me = OSDataPtr::alloc(); if (me && !me->initWithBytes(bytes, inLength)) { - me->release(); - return 0; + return nullptr; } return me; } -OSData * +OSDataPtr OSData::withBytesNoCopy(void *bytes, unsigned int inLength) { - OSData *me = new OSData; + OSDataPtr me = OSDataPtr::alloc(); if (me && !me->initWithBytesNoCopy(bytes, inLength)) { - me->release(); - return 0; + return nullptr; } return me; } -OSData * +OSDataPtr OSData::withData(const OSData *inData) { - OSData *me = new OSData; + OSDataPtr me = OSDataPtr::alloc(); if (me && !me->initWithData(inData)) { - me->release(); - return 0; + return nullptr; } return me; } -OSData * +OSDataPtr OSData::withData(const OSData *inData, unsigned int start, unsigned int inLength) { - OSData *me = new OSData; + OSDataPtr me = OSDataPtr::alloc(); if (me && !me->initWithData(inData, start, inLength)) { - me->release(); - return 0; + return nullptr; } return me; @@ -401,7 +398,7 @@ const void * OSData::getBytesNoCopy() const { if (!length) { - return 0; + return NULL; } else { return data; } @@ -411,7 +408,7 @@ const void * OSData::getBytesNoCopy(unsigned int start, unsigned int inLength) const { - const void *outData = 0; + const void *outData = NULL; if (length && start < length diff --git a/libkern/c++/OSDictionary.cpp b/libkern/c++/OSDictionary.cpp index d9c756798..c928cdd15 100644 --- a/libkern/c++/OSDictionary.cpp +++ b/libkern/c++/OSDictionary.cpp @@ -228,7 +228,7 @@ OSDictionary::withCapacity(unsigned int capacity) if (me && !me->initWithCapacity(capacity)) { me->release(); - return 0; + return NULL; } return me; @@ -244,7 +244,7 @@ OSDictionary::withObjects(const OSObject *objects[], if (me && !me->initWithObjects(objects, keys, count, capacity)) { me->release(); - return 0; + return NULL; } return me; @@ -260,7 +260,7 @@ OSDictionary::withObjects(const OSObject *objects[], if (me && !me->initWithObjects(objects, keys, count, capacity)) { me->release(); - return 0; + return NULL; } return me; @@ -274,7 +274,7 @@ OSDictionary::withDictionary(const OSDictionary *dict, if (me && !me->initWithDictionary(dict, capacity)) { me->release(); - return 0; + return NULL; } return me; @@ -511,7 +511,7 @@ OSDictionary::getObject(const OSSymbol *aKey) const unsigned int i, l = 0, r = count; if (!aKey) { - return 0; + return NULL; } // if the key exists, return the object @@ -703,10 +703,10 @@ OSDictionary::getNextObjectForIterator(void *inIterator, OSObject **ret) const if (index < count) { *ret = (OSObject *) dictionary[index].key; } else { - *ret = 0; + *ret = NULL; } - return *ret != 0; + return *ret != NULL; } bool @@ -788,13 +788,13 @@ OSCollection * OSDictionary::copyCollection(OSDictionary *cycleDict) { bool allocDict = !cycleDict; - OSCollection *ret = 0; - OSDictionary *newDict = 0; + OSCollection *ret = NULL; + OSDictionary *newDict = NULL; if (allocDict) { cycleDict = OSDictionary::withCapacity(16); if (!cycleDict) { - return 0; + return NULL; } } @@ -833,7 +833,7 @@ OSDictionary::copyCollection(OSDictionary *cycleDict) } ret = newDict; - newDict = 0; + newDict = NULL; } while (false); abortCopy: @@ -855,13 +855,13 @@ OSDictionary::copyKeys(void) array = OSArray::withCapacity(count); if (!array) { - return 0; + return NULL; } for (unsigned int i = 0; i < count; i++) { if (!array->setObject(i, dictionary[i].key)) { array->release(); - array = 0; + array = NULL; break; } } diff --git a/libkern/c++/OSKext.cpp b/libkern/c++/OSKext.cpp index d79b4b64e..6de614778 100644 --- a/libkern/c++/OSKext.cpp +++ b/libkern/c++/OSKext.cpp @@ -113,6 +113,7 @@ static bool _OSKextInUnloadedPrelinkedKexts(const OSSymbol * theBundleID); // We really should add containsObject() & containsCString to OSCollection & subclasses. // So few pad slots, though.... static bool _OSArrayContainsCString(OSArray * array, const char * cString); +static void OSKextLogKextInfo(OSKext *aKext, uint64_t address, uint64_t size, firehose_tracepoint_code_t code); /* Prelinked arm kexts do not have VM entries because the method we use to * fake an entry (see libsa/bootstrap.cpp:readPrelinkedExtensions()) does @@ -269,6 +270,7 @@ static OSDictionary * sExcludeListByID = NULL; static OSKextVersion sExcludeListVersion = 0; static OSArray * sLoadedKexts = NULL; static OSArray * sUnloadedPrelinkedKexts = NULL; +static OSArray * sLoadedDriverKitKexts = NULL; // Requests to kextd waiting to be picked up. static OSArray * sKernelRequests = NULL; @@ -313,35 +315,35 @@ static OSKext * sKernelKext = NULL; * binary compability. */ kmod_info_t g_kernel_kmod_info = { - /* next */ 0, - /* info_version */ KMOD_INFO_VERSION, - /* id */ 0, // loadTag: kernel is always 0 - /* name */ kOSKextKernelIdentifier, // bundle identifier - /* version */ "0", // filled in in OSKext::initialize() - /* reference_count */ -1, // never adjusted; kernel never unloads - /* reference_list */ NULL, - /* address */ 0, - /* size */ 0, // filled in in OSKext::initialize() - /* hdr_size */ 0, - /* start */ 0, - /* stop */ 0 + .next = NULL, + .info_version = KMOD_INFO_VERSION, + .id = 0, // loadTag: kernel is always 0 + .name = kOSKextKernelIdentifier,// bundle identifier + .version = "0", // filled in in OSKext::initialize() + .reference_count = -1, // never adjusted; kernel never unloads + .reference_list = NULL, + .address = 0, + .size = 0, // filled in in OSKext::initialize() + .hdr_size = 0, + .start = NULL, + .stop = NULL }; /* Set up a fake kmod_info struct for statically linked kexts that don't have one. */ kmod_info_t invalid_kmod_info = { - /* next */ 0, - /* info_version */ KMOD_INFO_VERSION, - /* id */ UINT32_MAX, - /* name */ "invalid", - /* version */ "0", - /* reference_count */ -1, - /* reference_list */ NULL, - /* address */ 0, - /* size */ 0, - /* hdr_size */ 0, - /* start */ 0, - /* stop */ 0 + .next = NULL, + .info_version = KMOD_INFO_VERSION, + .id = UINT32_MAX, + .name = "invalid", + .version = "0", + .reference_count = -1, + .reference_list = NULL, + .address = 0, + .size = 0, + .hdr_size = 0, + .start = NULL, + .stop = NULL }; extern "C" { @@ -407,8 +409,8 @@ static bool sConsiderUnloadsCalled = false; static bool sConsiderUnloadsPending = false; static unsigned int sConsiderUnloadDelay = 60; // seconds -static thread_call_t sUnloadCallout = 0; -static thread_call_t sDestroyLinkContextThread = 0; // one-shot, one-at-a-time thread +static thread_call_t sUnloadCallout = NULL; +static thread_call_t sDestroyLinkContextThread = NULL; // one-shot, one-at-a-time thread static bool sSystemSleep = false; // true when system going to sleep static AbsoluteTime sLastWakeTime; // last time we woke up @@ -429,7 +431,7 @@ static IOLock * sKextSummariesLock = NULL; extern "C" lck_spin_t vm_allocation_sites_lock; static IOSimpleLock * sKextAccountsLock = &vm_allocation_sites_lock; -void (*sLoadedKextSummariesUpdated)(void) = OSKextLoadedKextSummariesUpdated; +void(*const sLoadedKextSummariesUpdated)(void) = OSKextLoadedKextSummariesUpdated; OSKextLoadedKextSummaryHeader * gLoadedKextSummaries __attribute__((used)) = NULL; uint64_t gLoadedKextSummariesTimestamp __attribute__((used)) = 0; static size_t sLoadedKextSummariesAllocSize = 0; @@ -678,7 +680,7 @@ OSKext::initialize(void) OSNumber * kernelCPUSubtype = NULL;// must release OSKextLogSpec bootLogFilter = kOSKextLogSilentFilter; bool setResult = false; - uint64_t * timestamp = 0; + uint64_t * timestamp = NULL; char bootArgBuffer[16];// for PE_parse_boot_argn w/strings /* This must be the first thing allocated. Everything else grabs this lock. @@ -694,12 +696,13 @@ OSKext::initialize(void) sKextsByID = OSDictionary::withCapacity(kOSKextTypicalLoadCount); sLoadedKexts = OSArray::withCapacity(kOSKextTypicalLoadCount); + sLoadedDriverKitKexts = OSArray::withCapacity(kOSKextTypicalLoadCount); sUnloadedPrelinkedKexts = OSArray::withCapacity(kOSKextTypicalLoadCount / 10); sKernelRequests = OSArray::withCapacity(0); sPostedKextLoadIdentifiers = OSSet::withCapacity(0); sAllKextLoadIdentifiers = OSSet::withCapacity(kOSKextTypicalLoadCount); sRequestCallbackRecords = OSArray::withCapacity(0); - assert(sKextsByID && sLoadedKexts && sKernelRequests && + assert(sKextsByID && sLoadedKexts && sLoadedDriverKitKexts && sKernelRequests && sPostedKextLoadIdentifiers && sAllKextLoadIdentifiers && sRequestCallbackRecords && sUnloadedPrelinkedKexts); @@ -750,11 +753,12 @@ OSKext::initialize(void) assert(kernelExecutable); #if KASLR_KEXT_DEBUG - IOLog("kaslr: kernel start 0x%lx end 0x%lx length %lu vm_kernel_slide %llu (0x%016lx) \n", + IOLog("kaslr: kernel start 0x%lx end 0x%lx length %lu vm_kernel_slide %lu (0x%016lx) \n", (unsigned long)kernelStart, (unsigned long)getlastaddr(), kernelLength, - vm_kernel_slide, vm_kernel_slide); + (unsigned long)vm_kernel_slide, + (unsigned long)vm_kernel_slide); #endif sKernelKext->loadTag = sNextLoadTag++; // the kernel is load tag 0 @@ -1081,12 +1085,7 @@ void OSKext::flushNonloadedKexts( Boolean flushPrelinkedKexts) { - OSSet * prelinkedKexts = NULL;// must release - OSCollectionIterator * kextIterator = NULL;// must release - OSCollectionIterator * prelinkIterator = NULL; // must release - const OSSymbol * thisID = NULL;// do not release - OSKext * thisKext = NULL;// do not release - uint32_t count, i; + OSSet * keepKexts = NULL;// must release IORecursiveLockLock(sKextLock); @@ -1100,33 +1099,36 @@ OSKext::flushNonloadedKexts( /* If we aren't flushing unused prelinked kexts, we have to put them * aside while we flush everything else so make a container for them. */ - if (!flushPrelinkedKexts) { - prelinkedKexts = OSSet::withCapacity(0); - if (!prelinkedKexts) { - goto finish; - } + keepKexts = OSSet::withCapacity(16); + if (!keepKexts) { + goto finish; } /* Set aside prelinked kexts (in-use or not) and break * any lingering inter-kext references for nonloaded kexts * so they have min. retain counts. */ - kextIterator = OSCollectionIterator::withCollection(sKextsByID); - if (!kextIterator) { - goto finish; - } - - while ((thisID = OSDynamicCast(OSSymbol, - kextIterator->getNextObject()))) { - thisKext = OSDynamicCast(OSKext, sKextsByID->getObject(thisID)); - - if (thisKext) { - if (prelinkedKexts && thisKext->isPrelinked()) { - prelinkedKexts->setObject(thisKext); - } - thisKext->flushDependencies(/* forceIfLoaded */ false); + sKextsByID->iterateObjects(^bool (const OSSymbol * thisID __unused, OSObject * obj) { + OSKext * thisKext = OSDynamicCast(OSKext, obj); + if (!thisKext) { + return false; } - } + if (!flushPrelinkedKexts && thisKext->isPrelinked()) { + keepKexts->setObject(thisKext); + } + if (!thisKext->declaresExecutable()) { + /* + * Don't unload codeless kexts, because they never appear in the loadedKexts array. + * Requesting one from kextd will load it and then immediately remove it by calling + * flushNonloadedKexts(). + * And adding one to loadedKexts breaks code assuming they have kmod_info etc. + */ + keepKexts->setObject(thisKext); + } + + thisKext->flushDependencies(/* forceIfLoaded */ false); + return false; + }); /* Dump all the kexts in the ID dictionary; we'll repopulate it shortly. */ @@ -1134,33 +1136,30 @@ OSKext::flushNonloadedKexts( /* Now put the loaded kexts back into the ID dictionary. */ - count = sLoadedKexts->getCount(); - for (i = 0; i < count; i++) { - thisKext = OSDynamicCast(OSKext, sLoadedKexts->getObject(i)); + sLoadedKexts->iterateObjects(^bool (OSObject * obj) { + OSKext * thisKext = OSDynamicCast(OSKext, obj); + if (!thisKext) { + return false; + } sKextsByID->setObject(thisKext->getIdentifierCString(), thisKext); - } + return false; + }); - /* Finally, put back the prelinked kexts if we saved any. + /* Finally, put back the kept kexts if we saved any. */ - if (prelinkedKexts) { - prelinkIterator = OSCollectionIterator::withCollection(prelinkedKexts); - if (!prelinkIterator) { - goto finish; - } - - while ((thisKext = OSDynamicCast(OSKext, - prelinkIterator->getNextObject()))) { - sKextsByID->setObject(thisKext->getIdentifierCString(), - thisKext); + keepKexts->iterateObjects(^bool (OSObject * obj) { + OSKext * thisKext = OSDynamicCast(OSKext, obj); + if (!thisKext) { + return false; } - } + sKextsByID->setObject(thisKext->getIdentifierCString(), thisKext); + return false; + }); finish: IORecursiveLockUnlock(sKextLock); - OSSafeReleaseNULL(prelinkedKexts); - OSSafeReleaseNULL(kextIterator); - OSSafeReleaseNULL(prelinkIterator); + OSSafeReleaseNULL(keepKexts); return; } @@ -1525,6 +1524,12 @@ OSKext::initWithPrelinkedInfoDict( executableRelPath->retain(); } + userExecutableRelPath = OSDynamicCast(OSString, + anInfoDict->getObject("CFBundleUEXTExecutable")); + if (userExecutableRelPath) { + userExecutableRelPath->retain(); + } + /* Don't need the paths to be in the info dictionary any more. */ anInfoDict->removeObject(kPrelinkBundlePathKey); @@ -1551,7 +1556,7 @@ OSKext::initWithPrelinkedInfoDict( #if KASLR_KEXT_DEBUG IOLog("kaslr: unslid 0x%lx slid 0x%lx length %u - prelink executable \n", - (unsigned long)ml_static_unslide(data), + (unsigned long)ml_static_unslide((vm_offset_t)data), (unsigned long)data, length); #endif @@ -1568,7 +1573,7 @@ OSKext::initWithPrelinkedInfoDict( #if KASLR_KEXT_DEBUG IOLog("kaslr: unslid 0x%lx slid 0x%lx - prelink executable source \n", - (unsigned long)ml_static_unslide(srcData), + (unsigned long)ml_static_unslide((vm_offset_t)srcData), (unsigned long)srcData); #endif @@ -1630,7 +1635,7 @@ OSKext::initWithPrelinkedInfoDict( kmod_info->address = ml_static_slide(kmod_info->address); #if KASLR_KEXT_DEBUG IOLog("kaslr: unslid 0x%lx slid 0x%lx - kmod_info \n", - (unsigned long)ml_static_unslide(kmod_info), + (unsigned long)ml_static_unslide((vm_offset_t)kmod_info), (unsigned long)kmod_info); IOLog("kaslr: unslid 0x%lx slid 0x%lx - kmod_info->address \n", (unsigned long)ml_static_unslide(kmod_info->address), @@ -2406,6 +2411,11 @@ OSKext::uniquePersonalityProperties(OSDictionary * personalityDict) uniqueStringPlistProperty(personalityDict, kCFBundleIdentifierKey); uniqueStringPlistProperty(personalityDict, kIOProviderClassKey); uniqueStringPlistProperty(personalityDict, gIOClassKey); + if (personalityDict->getObject(kCFBundleIdentifierKernelKey)) { + uniqueStringPlistProperty(personalityDict, kCFBundleIdentifierKernelKey); + } else { + personalityDict->setObject(kCFBundleIdentifierKernelKey, personalityDict->getObject(kCFBundleIdentifierKey)); + } /* Other commonly used properties. */ @@ -2443,10 +2453,12 @@ OSKext::free(void) OSSafeReleaseNULL(bundleID); OSSafeReleaseNULL(path); OSSafeReleaseNULL(executableRelPath); + OSSafeReleaseNULL(userExecutableRelPath); OSSafeReleaseNULL(dependencies); OSSafeReleaseNULL(linkedExecutable); OSSafeReleaseNULL(metaClasses); OSSafeReleaseNULL(interfaceUUID); + OSSafeReleaseNULL(driverKitUUID); if (isInterface() && kmod_info) { kfree(kmod_info, sizeof(kmod_info_t)); @@ -2467,7 +2479,7 @@ OSKext::readMkextArchive(OSData * mkextData, { OSReturn result = kOSKextReturnBadData; uint32_t mkextLength = 0; - mkext_header * mkextHeader = 0;// do not free + mkext_header * mkextHeader = NULL;// do not free uint32_t mkextVersion = 0; /* Note default return of kOSKextReturnBadData above. @@ -2874,7 +2886,7 @@ OSKext::extractMkext2FileData( OSData * uncompressedData = NULL;// release on error - uint8_t * uncompressedDataBuffer = 0;// do not free + uint8_t * uncompressedDataBuffer = NULL;// do not free unsigned long uncompressedSize; z_stream zstream; bool zstream_inited = false; @@ -3153,6 +3165,7 @@ OSKext::loadFromMkext( kextIdentifier = OSDynamicCast(OSString, requestArgs->getObject(kKextRequestArgumentBundleIdentifierKey)); + if (!kextIdentifier) { OSKextLog(/* kext */ NULL, kOSKextLogErrorLevel | @@ -3194,6 +3207,7 @@ OSKext::loadFromMkext( */ result = OSKext::loadKextWithIdentifier( kextIdentifier, + /* kextRef */ NULL, /* allowDefer */ false, delayAutounload, startKextExcludeLevel, @@ -3351,17 +3365,20 @@ OSKext * OSKext::lookupKextWithLoadTag(uint32_t aTag) { OSKext * foundKext = NULL; // returned - uint32_t count, i; + uint32_t i, j; + OSArray *list[2] = {sLoadedKexts, sLoadedDriverKitKexts}; + uint32_t count[2] = {sLoadedKexts->getCount(), sLoadedDriverKitKexts->getCount()}; IORecursiveLockLock(sKextLock); - count = sLoadedKexts->getCount(); - for (i = 0; i < count; i++) { - OSKext * thisKext = OSDynamicCast(OSKext, sLoadedKexts->getObject(i)); - if (thisKext->getLoadTag() == aTag) { - foundKext = thisKext; - foundKext->retain(); - goto finish; + for (j = 0; j < (sizeof(list) / sizeof(list[0])); j++) { + for (i = 0; i < count[j]; i++) { + OSKext * thisKext = OSDynamicCast(OSKext, list[j]->getObject(i)); + if (thisKext->getLoadTag() == aTag) { + foundKext = thisKext; + foundKext->retain(); + goto finish; + } } } @@ -3397,6 +3414,19 @@ OSKext::lookupKextWithAddress(vm_address_t address) } } + count = sLoadedDriverKitKexts->getCount(); + for (i = 0; i < count; i++) { + OSKext * thisKext = OSDynamicCast(OSKext, sLoadedDriverKitKexts->getObject(i)); + /* + * DriverKitKexts do not have a linkedExecutable, + * so we "fake" their address with the LoadTag + */ + if (thisKext->getLoadTag() == address) { + foundKext = thisKext; + foundKext->retain(); + } + } + finish: IORecursiveLockUnlock(sKextLock); @@ -3411,6 +3441,7 @@ OSKext::copyKextUUIDForAddress(OSNumber *address) OSKext * kext = NULL; uint32_t baseIdx; uint32_t lim; + uint32_t count, i; if (!address) { return NULL; @@ -3457,6 +3488,36 @@ OSKext::copyKextUUIDForAddress(OSNumber *address) } IOSimpleLockUnlock(sKextAccountsLock); + if (!kext) { + /* + * Maybe it is a Dext. + * DriverKit userspace executables do not have a kernel linkedExecutable, + * so we "fake" their address range with the LoadTag. + * + * This is supposed to be used for logging reasons only. When logd + * calls this function it ors the address with FIREHOSE_TRACEPOINT_PC_KERNEL_MASK, so we + * remove it here before checking it against the LoadTag. + * Also we need to remove FIREHOSE_TRACEPOINT_PC_DYNAMIC_BIT set when emitting the log line. + */ + addr = (uintptr_t)address->unsigned64BitValue() & ~(FIREHOSE_TRACEPOINT_PC_KERNEL_MASK | FIREHOSE_TRACEPOINT_PC_DYNAMIC_BIT); + IORecursiveLockLock(sKextLock); + count = sLoadedDriverKitKexts->getCount(); + for (i = 0; i < count; i++) { + OSKext * thisKext = NULL; + + thisKext = OSDynamicCast(OSKext, sLoadedDriverKitKexts->getObject(i)); + if (!thisKext) { + continue; + } + if (thisKext->getLoadTag() == addr) { + kext = thisKext; + kext->retain(); + break; + } + } + IORecursiveLockUnlock(sKextLock); + } + if (kext) { uuid = kext->copyTextUUID(); kext->release(); @@ -3473,36 +3534,38 @@ OSKext * OSKext::lookupKextWithUUID(uuid_t wanted) { OSKext * foundKext = NULL; // returned - uint32_t count, i; + uint32_t j, i; + OSArray *list[2] = {sLoadedKexts, sLoadedDriverKitKexts}; + uint32_t count[2] = {sLoadedKexts->getCount(), sLoadedDriverKitKexts->getCount()}; + IORecursiveLockLock(sKextLock); - count = sLoadedKexts->getCount(); + for (j = 0; j < (sizeof(list) / sizeof(list[0])); j++) { + for (i = 0; i < count[j]; i++) { + OSKext * thisKext = NULL; - for (i = 0; i < count; i++) { - OSKext * thisKext = NULL; - - thisKext = OSDynamicCast(OSKext, sLoadedKexts->getObject(i)); - if (!thisKext) { - continue; - } + thisKext = OSDynamicCast(OSKext, list[j]->getObject(i)); + if (!thisKext) { + continue; + } - OSData *uuid_data = thisKext->copyUUID(); - if (!uuid_data) { - continue; - } + OSData *uuid_data = thisKext->copyUUID(); + if (!uuid_data) { + continue; + } - uuid_t uuid; - memcpy(&uuid, uuid_data->getBytesNoCopy(), sizeof(uuid)); - uuid_data->release(); + uuid_t uuid; + memcpy(&uuid, uuid_data->getBytesNoCopy(), sizeof(uuid)); + uuid_data->release(); - if (0 == uuid_compare(wanted, uuid)) { - foundKext = thisKext; - foundKext->retain(); - goto finish; + if (0 == uuid_compare(wanted, uuid)) { + foundKext = thisKext; + foundKext->retain(); + goto finish; + } } } - finish: IORecursiveLockUnlock(sKextLock); @@ -3696,16 +3759,20 @@ OSKext::removeKextWithLoadTag( { OSReturn result = kOSReturnError; OSKext * foundKext = NULL; - uint32_t count, i; + uint32_t i, j; + OSArray *list[2] = {sLoadedKexts, sLoadedDriverKitKexts}; + uint32_t count[2] = {sLoadedKexts->getCount(), sLoadedDriverKitKexts->getCount()}; + IORecursiveLockLock(sKextLock); - count = sLoadedKexts->getCount(); - for (i = 0; i < count; i++) { - OSKext * thisKext = OSDynamicCast(OSKext, sLoadedKexts->getObject(i)); - if (thisKext->loadTag == loadTag) { - foundKext = thisKext; - break; + for (j = 0; j < (sizeof(list) / sizeof(list[0])); j++) { + for (i = 0; i < count[j]; i++) { + OSKext * thisKext = OSDynamicCast(OSKext, list[j]->getObject(i)); + if (thisKext->loadTag == loadTag) { + foundKext = thisKext; + break; + } } } @@ -3985,6 +4052,9 @@ OSKext::isCompatibleWithVersion(OSKextVersion aVersion) bool OSKext::declaresExecutable(void) { + if (isDriverKit()) { + return false; + } return getPropertyForHostArch(kCFBundleExecutableKey) != NULL; } @@ -4216,6 +4286,15 @@ OSKext::copyUUID(void) return sKernelKext->copyUUID(); } + if (isDriverKit() && infoDict) { + if (driverKitUUID) { + driverKitUUID->retain(); + return driverKitUUID; + } else { + return NULL; + } + } + /* For real kexts, try to get the UUID from the linked executable, * or if is hasn't been linked yet, the unrelocated executable. */ @@ -4223,6 +4302,7 @@ OSKext::copyUUID(void) if (!theExecutable) { theExecutable = getExecutable(); } + if (!theExecutable) { goto finish; } @@ -4279,6 +4359,14 @@ finish: return result; } +void +OSKext::setDriverKitUUID(OSData *uuid) +{ + if (!OSCompareAndSwapPtr(nullptr, uuid, &driverKitUUID)) { + OSSafeReleaseNULL(uuid); + } +} + /********************************************************************* *********************************************************************/ #if defined (__arm__) @@ -4511,6 +4599,7 @@ OSKext::loadKextWithIdentifier( goto finish; } result = OSKext::loadKextWithIdentifier(kextIdentifier, + NULL /* kextRef */, allowDeferFlag, delayAutounloadFlag, startOpt, startMatchingOpt, personalityNames); @@ -4524,6 +4613,7 @@ finish: OSReturn OSKext::loadKextWithIdentifier( OSString * kextIdentifier, + OSObject ** kextRef, Boolean allowDeferFlag, Boolean delayAutounloadFlag, OSKextExcludeLevel startOpt, @@ -4536,6 +4626,10 @@ OSKext::loadKextWithIdentifier( OSDictionary * loadRequest = NULL;// must release const OSSymbol * kextIdentifierSymbol = NULL;// must release + if (kextRef) { + *kextRef = NULL; + } + IORecursiveLockLock(sKextLock); if (!kextIdentifier) { @@ -4638,10 +4732,34 @@ finish: OSSafeReleaseNULL(loadRequest); OSSafeReleaseNULL(kextIdentifierSymbol); + if ((kOSReturnSuccess == result) && kextRef) { + theKext->retain(); + theKext->matchingRefCount++; + *kextRef = theKext; + } + IORecursiveLockUnlock(sKextLock); return result; } +/********************************************************************* +*********************************************************************/ +/* static */ +void +OSKext::dropMatchingReferences( + OSSet * kexts) +{ + IORecursiveLockLock(sKextLock); + kexts->iterateObjects(^bool (OSObject * obj) { + OSKext * thisKext = OSDynamicCast(OSKext, obj); + if (!thisKext) { + return false; + } + thisKext->matchingRefCount--; + return false; + }); + IORecursiveLockUnlock(sKextLock); +} /********************************************************************* *********************************************************************/ @@ -4791,6 +4909,13 @@ OSKext::load( "KextExcludeList was updated to version: %lld", sExcludeListVersion); } } + + if (isDriverKit()) { + if (loadTag == 0) { + sLoadedDriverKitKexts->setObject(this); + loadTag = sNextLoadTag++; + } + } result = kOSReturnSuccess; goto loaded; } @@ -4998,24 +5123,6 @@ loaded: finish: - /* More hack! If the kext doesn't declare an executable, even if we - * "loaded" it, we have to remove any personalities naming it, or we'll - * never see the registry go quiet. Errors here do not count for the - * load operation itself. - * - * Note that in every other regard it's perfectly ok for a kext to - * not declare an executable and serve only as a package for personalities - * naming another kext, so we do have to allow such kexts to be "loaded" - * so that those other personalities get added & matched. - */ - if (!declaresExecutable()) { - OSKextLog(this, - kOSKextLogStepLevel | kOSKextLogLoadFlag, - "Kext %s has no executable; removing any personalities naming it.", - getIdentifierCString()); - removePersonalitiesFromCatalog(); - } - if (result != kOSReturnSuccess) { OSKextLog(this, kOSKextLogErrorLevel | @@ -5079,12 +5186,12 @@ OSKext::lookupSection(const char *segname, const char *secname) mh = (kernel_mach_header_t *)linkedExecutable->getBytesNoCopy(); for (seg = firstsegfromheader(mh); seg != NULL; seg = nextsegfromheader(mh, seg)) { - if (0 != strcmp(seg->segname, segname)) { + if (0 != strncmp(seg->segname, segname, sizeof(seg->segname))) { continue; } for (sec = firstsect(seg); sec != NULL; sec = nextsect(seg, sec)) { - if (0 == strcmp(sec->sectname, secname)) { + if (0 == strncmp(sec->sectname, secname, sizeof(sec->sectname))) { found_section = sec; goto out; } @@ -5383,7 +5490,7 @@ OSKext::loadExecutable() } /* all callers must be entitled */ - if (FALSE == IOTaskHasEntitlement(current_task(), "com.apple.rootless.kext-secure-management")) { + if (FALSE == IOTaskHasEntitlement(current_task(), kOSKextManagementEntitlement)) { OSKextLog(this, kOSKextLogErrorLevel | kOSKextLogLoadFlag, "Not entitled to link kext '%s'", @@ -6291,12 +6398,21 @@ OSKextLogKextInfo(OSKext *aKext, uint64_t address, uint64_t size, firehose_trace } uuid_info->ftui_size = size; - uuid_info->ftui_address = ml_static_unslide(address); - + if (aKext->isDriverKit()) { + uuid_info->ftui_address = address; + } else { + uuid_info->ftui_address = ml_static_unslide(address); + } firehose_trace_metadata(firehose_stream_metadata, trace_id, stamp, uuid_info, uuid_info_len); return; } +void +OSKext::OSKextLogDriverKitInfoLoad(OSKext *kext) +{ + OSKextLogKextInfo(kext, kext->getLoadTag(), 1, firehose_tracepoint_code_load); +} + /********************************************************************* *********************************************************************/ OSReturn @@ -6588,6 +6704,15 @@ OSKext::unload(void) goto finish; } + if (isDriverKit()) { + index = sLoadedKexts->getNextIndexOfObject(this, 0); + if (index != (unsigned int)-1) { + sLoadedDriverKitKexts->removeObject(index); + OSKextLogKextInfo(this, loadTag, 1, firehose_tracepoint_code_unload); + loadTag = 0; + } + } + if (!isLoaded()) { result = kOSReturnSuccess; goto finish; @@ -6904,7 +7029,7 @@ _OSKextConsiderDestroyingLinkContext( kOSKextLogGeneralFlag, "thread_call_free() failed for kext link context."); } - sDestroyLinkContextThread = 0; + sDestroyLinkContextThread = NULL; } IORecursiveLockUnlock(sKextInnerLock); @@ -6939,7 +7064,7 @@ OSKext::considerDestroyingLinkContext(void) * this thread_call, so don't share it around. */ sDestroyLinkContextThread = thread_call_allocate( - &_OSKextConsiderDestroyingLinkContext, 0); + &_OSKextConsiderDestroyingLinkContext, NULL); if (!sDestroyLinkContextThread) { OSKextLog(/* kext */ NULL, kOSKextLogErrorLevel | kOSKextLogGeneralFlag | kOSKextLogLinkFlag, @@ -7097,7 +7222,7 @@ OSKext::considerUnloads(Boolean rescheduleOnlyFlag) IORecursiveLockLock(sKextInnerLock); if (!sUnloadCallout) { - sUnloadCallout = thread_call_allocate(&_OSKextConsiderUnloads, 0); + sUnloadCallout = thread_call_allocate(&_OSKextConsiderUnloads, NULL); } /* we only reset delay value for unloading if we already have something @@ -8497,7 +8622,12 @@ OSKextGrabPgoDataLocked(OSKext *kext, size_t metadata_size = 0; sect_prf_data = kext->lookupSection("__DATA", "__llvm_prf_data"); - sect_prf_name = kext->lookupSection("__DATA", "__llvm_prf_name"); + sect_prf_name = kext->lookupSection("__DATA", "__llvm_prf_names"); + if (!sect_prf_name) { + // kextcache sometimes truncates the section name to 15 chars + // 16 character section name is truncated to 15 characters by kextcache + sect_prf_name = kext->lookupSection("__DATA", "__llvm_prf_name"); + } sect_prf_cnts = kext->lookupSection("__DATA", "__llvm_prf_cnts"); if (!sect_prf_data || !sect_prf_name || !sect_prf_cnts) { @@ -8664,11 +8794,12 @@ OSKext::copyLoadedKextInfoByUUID( { OSDictionary * result = NULL; OSDictionary * kextInfo = NULL; // must release - uint32_t count, i; + uint32_t max_count, i, j; uint32_t idCount = 0; uint32_t idIndex = 0; - IORecursiveLockLock(sKextLock); + OSArray *list[2] = {sLoadedKexts, sLoadedDriverKitKexts}; + uint32_t count[2] = {sLoadedKexts->getCount(), sLoadedDriverKitKexts->getCount()}; #if CONFIG_MACF /* Is the calling process allowed to query kext info? */ @@ -8704,81 +8835,83 @@ OSKext::copyLoadedKextInfoByUUID( infoKeys = NULL; } - count = sLoadedKexts->getCount(); - result = OSDictionary::withCapacity(count); + max_count = count[0] + count[1]; + result = OSDictionary::withCapacity(max_count); if (!result) { goto finish; } - for (i = 0; i < count; i++) { - OSKext *thisKext = NULL;// do not release - Boolean includeThis = true; - uuid_t thisKextUUID; - uuid_t thisKextTextUUID; - OSData *uuid_data; - uuid_string_t uuid_key; - - thisKext = OSDynamicCast(OSKext, sLoadedKexts->getObject(i)); - if (!thisKext) { - continue; - } - - uuid_data = thisKext->copyUUID(); - if (!uuid_data) { - continue; - } - - memcpy(&thisKextUUID, uuid_data->getBytesNoCopy(), sizeof(thisKextUUID)); - OSSafeReleaseNULL(uuid_data); + for (j = 0; j < (sizeof(list) / sizeof(list[0])); j++) { + for (i = 0; i < count[j]; i++) { + OSKext *thisKext = NULL;// do not release + Boolean includeThis = true; + uuid_t thisKextUUID; + uuid_t thisKextTextUUID; + OSData *uuid_data; + uuid_string_t uuid_key; - uuid_unparse(thisKextUUID, uuid_key); + thisKext = OSDynamicCast(OSKext, list[j]->getObject(i)); + if (!thisKext) { + continue; + } - uuid_data = thisKext->copyTextUUID(); - if (!uuid_data) { - continue; - } - memcpy(&thisKextTextUUID, uuid_data->getBytesNoCopy(), sizeof(thisKextTextUUID)); - OSSafeReleaseNULL(uuid_data); + uuid_data = thisKext->copyUUID(); + if (!uuid_data) { + continue; + } - /* Skip current kext if we have a list of UUIDs and - * it isn't in the list. - */ - if (kextIdentifiers) { - includeThis = false; + memcpy(&thisKextUUID, uuid_data->getBytesNoCopy(), sizeof(thisKextUUID)); + OSSafeReleaseNULL(uuid_data); - for (idIndex = 0; idIndex < idCount; idIndex++) { - const OSString* wantedUUID = OSDynamicCast(OSString, - kextIdentifiers->getObject(idIndex)); + uuid_unparse(thisKextUUID, uuid_key); - uuid_t uuid; - uuid_parse(wantedUUID->getCStringNoCopy(), uuid); + uuid_data = thisKext->copyTextUUID(); + if (!uuid_data) { + continue; + } + memcpy(&thisKextTextUUID, uuid_data->getBytesNoCopy(), sizeof(thisKextTextUUID)); + OSSafeReleaseNULL(uuid_data); - if ((0 == uuid_compare(uuid, thisKextUUID)) - || (0 == uuid_compare(uuid, thisKextTextUUID))) { - includeThis = true; - /* Only need to find the first kext if multiple match, - * ie. asking for the kernel uuid does not need to find - * interface kexts or builtin static kexts. - */ - kextIdentifiers->removeObject(idIndex); - uuid_unparse(uuid, uuid_key); - break; + /* Skip current kext if we have a list of UUIDs and + * it isn't in the list. + */ + if (kextIdentifiers) { + includeThis = false; + + for (idIndex = 0; idIndex < idCount; idIndex++) { + const OSString* wantedUUID = OSDynamicCast(OSString, + kextIdentifiers->getObject(idIndex)); + + uuid_t uuid; + uuid_parse(wantedUUID->getCStringNoCopy(), uuid); + + if ((0 == uuid_compare(uuid, thisKextUUID)) + || (0 == uuid_compare(uuid, thisKextTextUUID))) { + includeThis = true; + /* Only need to find the first kext if multiple match, + * ie. asking for the kernel uuid does not need to find + * interface kexts or builtin static kexts. + */ + kextIdentifiers->removeObject(idIndex); + uuid_unparse(uuid, uuid_key); + break; + } } } - } - if (!includeThis) { - continue; - } + if (!includeThis) { + continue; + } - kextInfo = thisKext->copyInfo(infoKeys); - if (kextInfo) { - result->setObject(uuid_key, kextInfo); - kextInfo->release(); - } + kextInfo = thisKext->copyInfo(infoKeys); + if (kextInfo) { + result->setObject(uuid_key, kextInfo); + kextInfo->release(); + } - if (kextIdentifiers && !kextIdentifiers->getCount()) { - break; + if (kextIdentifiers && !kextIdentifiers->getCount()) { + goto finish; + } } } @@ -9121,6 +9254,30 @@ OSKext::copyInfo(OSArray * infoKeys) } result->setObject(kOSBundleCPUSubtypeKey, cpuSubtypeNumber); } + } else { + if (isDriverKit() && _OSArrayContainsCString(infoKeys, kOSBundleLogStringsKey)) { + osLogDataHeaderRef *header; + char headerBytes[offsetof(osLogDataHeaderRef, sections) + NUM_OS_LOG_SECTIONS * sizeof(header->sections[0])]; + bool res; + + header = (osLogDataHeaderRef *) headerBytes; + header->version = OS_LOG_HDR_VERSION; + header->sect_count = NUM_OS_LOG_SECTIONS; + header->sections[OS_LOG_SECT_IDX].sect_offset = 0; + header->sections[OS_LOG_SECT_IDX].sect_size = (uint32_t) 0; + header->sections[CSTRING_SECT_IDX].sect_offset = 0; + header->sections[CSTRING_SECT_IDX].sect_size = (uint32_t) 0; + + logData = OSData::withBytes(header, (u_int) (sizeof(osLogDataHeaderRef))); + if (!logData) { + goto finish; + } + res = logData->appendBytes(&(header->sections[0]), (u_int)(header->sect_count * sizeof(header->sections[0]))); + if (!res) { + goto finish; + } + result->setObject(kOSBundleLogStringsKey, logData); + } } } @@ -9187,6 +9344,29 @@ OSKext::copyInfo(OSArray * infoKeys) result->setObject(kOSBundleExecutablePathKey, executablePathString); } else if (flags.builtin) { result->setObject(kOSBundleExecutablePathKey, bundleID); + } else if (isDriverKit()) { + if (path) { + // +1 for slash, +1 for \0 + uint32_t pathLength = path->getLength(); + executablePathCStringSize = pathLength + 2; + + executablePathCString = (char *)kalloc_tag((executablePathCStringSize) * + sizeof(char), VM_KERN_MEMORY_OSKEXT); + if (!executablePathCString) { + goto finish; + } + strlcpy(executablePathCString, path->getCStringNoCopy(), executablePathCStringSize); + executablePathCString[pathLength++] = '/'; + executablePathCString[pathLength++] = '\0'; + + executablePathString = OSString::withCString(executablePathCString); + + if (!executablePathString) { + goto finish; + } + + result->setObject(kOSBundleExecutablePathKey, executablePathString); + } } } @@ -9249,7 +9429,8 @@ OSKext::copyInfo(OSArray * infoKeys) _OSArrayContainsCString(infoKeys, kOSBundleExecLoadAddressKey) || _OSArrayContainsCString(infoKeys, kOSBundleExecLoadSizeKey) || _OSArrayContainsCString(infoKeys, kOSBundleWiredSizeKey)) { - if (isInterface() || flags.builtin || linkedExecutable) { + bool is_dext = isDriverKit(); + if (isInterface() || flags.builtin || linkedExecutable || is_dext) { /* These go to userspace via serialization, so we don't want any doubts * about their size. */ @@ -9299,6 +9480,15 @@ OSKext::copyInfo(OSArray * infoKeys) } else { wiredSize = loadSize; } + } else if (is_dext) { + /* + * DriverKit userspace executables do not have a kernel linkedExecutable, + * so we "fake" their address range with the LoadTag. + */ + if (loadTag) { + loadAddress = execLoadAddress = loadTag; + loadSize = execLoadSize = 1; + } } if (!infoKeys || _OSArrayContainsCString(infoKeys, kOSBundleLoadAddressKey)) { @@ -9492,6 +9682,35 @@ finish: return result; } +/********************************************************************* +*********************************************************************/ +/* static */ +bool +OSKext::copyUserExecutablePath(const OSSymbol * bundleID, char * pathResult, size_t pathSize) +{ + bool ok; + OSKext * kext; + + IORecursiveLockLock(sKextLock); + kext = OSDynamicCast(OSKext, sKextsByID->getObject(bundleID)); + if (kext) { + kext->retain(); + } + IORecursiveLockUnlock(sKextLock); + + if (!kext || !kext->path || !kext->userExecutableRelPath) { + OSSafeReleaseNULL(kext); + return false; + } + snprintf(pathResult, pathSize, "%s/Contents/MacOS/%s", + kext->path->getCStringNoCopy(), + kext->userExecutableRelPath->getCStringNoCopy()); + ok = true; + kext->release(); + + return ok; +} + /********************************************************************* *********************************************************************/ /* static */ @@ -9690,6 +9909,64 @@ finish: return result; } +OSReturn +OSKext::requestDaemonLaunch( + OSString *kextIdentifier, + OSString *serverName, + OSNumber *serverTag) +{ + OSReturn result = kOSReturnError; + OSDictionary * requestDict = NULL; // must release + + if (!kextIdentifier || !serverName || !serverTag) { + result = kOSKextReturnInvalidArgument; + goto finish; + } + + IORecursiveLockLock(sKextLock); + + OSKextLog(/* kext */ NULL, + kOSKextLogDebugLevel | + kOSKextLogGeneralFlag, + "Requesting daemon launch for %s with serverName %s and tag %llu", + kextIdentifier->getCStringNoCopy(), + serverName->getCStringNoCopy(), + serverTag->unsigned64BitValue() + ); + + result = _OSKextCreateRequest(kKextRequestPredicateRequestDaemonLaunch, &requestDict); + if (result != kOSReturnSuccess) { + goto finish; + } + + if (!_OSKextSetRequestArgument(requestDict, + kKextRequestArgumentBundleIdentifierKey, kextIdentifier) || + !_OSKextSetRequestArgument(requestDict, + kKextRequestArgumentDriverExtensionServerName, serverName) || + !_OSKextSetRequestArgument(requestDict, + kKextRequestArgumentDriverExtensionServerTag, serverTag)) { + result = kOSKextReturnNoMemory; + goto finish; + } + + /* Only post the requests after all the other potential failure points + * have been passed. + */ + if (!sKernelRequests->setObject(requestDict)) { + result = kOSKextReturnNoMemory; + goto finish; + } + OSKext::pingKextd(); + + result = kOSReturnSuccess; +finish: + IORecursiveLockUnlock(sKextLock); + if (requestDict) { + requestDict->release(); + } + return result; +} + /********************************************************************* * Assumes sKextLock is held. *********************************************************************/ @@ -11955,6 +12232,20 @@ OSKext::updateActiveAccount(OSKextActiveAccount *accountp) accountp->account = this->account; } +bool +OSKext::isDriverKit(void) +{ + OSString *bundleType; + + if (infoDict) { + bundleType = OSDynamicCast(OSString, infoDict->getObject(kCFBundlePackageTypeKey)); + if (bundleType && bundleType->isEqualTo(kOSKextBundlePackageTypeDriverKit)) { + return TRUE; + } + } + return FALSE; +} + extern "C" const vm_allocation_site_t * OSKextGetAllocationSiteForCaller(uintptr_t address) { diff --git a/libkern/c++/OSMetaClass.cpp b/libkern/c++/OSMetaClass.cpp index 7db8d37aa..0d564de95 100644 --- a/libkern/c++/OSMetaClass.cpp +++ b/libkern/c++/OSMetaClass.cpp @@ -60,6 +60,7 @@ __BEGIN_DECLS #include #include #include +#include #if PRAGMA_MARK #pragma mark Macros @@ -144,14 +145,14 @@ OSMetaClassBase::_RESERVEDOSMetaClassBase2() { panic("OSMetaClassBase::_RESERVEDOSMetaClassBase%d called.", 2); } -#endif /* SLOT_USED */ - -// As these slots are used move them up inside the #if above void OSMetaClassBase::_RESERVEDOSMetaClassBase3() { panic("OSMetaClassBase::_RESERVEDOSMetaClassBase%d called.", 3); } +#endif /* SLOT_USED */ + +// As these slots are used move them up inside the #if above void OSMetaClassBase::_RESERVEDOSMetaClassBase4() { @@ -169,13 +170,14 @@ OSMetaClassBase::_RESERVEDOSMetaClassBase6() } #endif - /********************************************************************* *********************************************************************/ #if defined(__arm__) || defined(__arm64__) - +#if defined(HAS_APPLE_PAC) +#include +#endif /* defined(HAS_APPLE_PAC) */ /* * IHI0059A "C++ Application Binary Interface Standard for the ARM 64 - bit Architecture": @@ -194,9 +196,16 @@ OSMetaClassBase::_RESERVEDOSMetaClassBase6() */ OSMetaClassBase::_ptf_t -OSMetaClassBase::_ptmf2ptf(const OSMetaClassBase *self, void (OSMetaClassBase::*func)(void)) +#if defined(HAS_APPLE_PAC) && __has_feature(ptrauth_type_discriminator) +OSMetaClassBase::_ptmf2ptf(const OSMetaClassBase *self __attribute__((unused)), + void (OSMetaClassBase::*func)(void), uintptr_t typeDisc) +#else +OSMetaClassBase::_ptmf2ptf(const OSMetaClassBase *self, + void (OSMetaClassBase::*func)(void), + uintptr_t typeDisc + __attribute__((unused))) +#endif { - typedef long int ptrdiff_t; struct ptmf_t { _ptf_t fPFN; ptrdiff_t delta; @@ -210,6 +219,13 @@ OSMetaClassBase::_ptmf2ptf(const OSMetaClassBase *self, void (OSMetaClassBase::* map.fIn = func; pfn = map.pTMF.fPFN; +#if defined(HAS_APPLE_PAC) && __has_feature(ptrauth_type_discriminator) + // Authenticate 'pfn' using the member function pointer type discriminator + // and resign it as a C function pointer. 'pfn' can point to either a + // non-virtual function or a virtual member function thunk. + pfn = ptrauth_auth_function(pfn, ptrauth_key_function_pointer, typeDisc); + return pfn; +#else if (map.pTMF.delta & 1) { // virtual union { @@ -219,12 +235,33 @@ OSMetaClassBase::_ptmf2ptf(const OSMetaClassBase *self, void (OSMetaClassBase::* u.fObj = self; // Virtual member function so dereference table +#if defined(HAS_APPLE_PAC) + // The entity hash is stored in the top 32-bits of the vtable offset of a + // member function pointer. + uint32_t entity_hash = ((uintptr_t)pfn) >> 32; + pfn = (_ptf_t)(((uintptr_t) pfn) & 0xFFFFFFFF); + + // Authenticate the vtable pointer. + _ptf_t *vtablep = ptrauth_auth_data(*u.vtablep, + ptrauth_key_cxx_vtable_pointer, 0); + // Calculate the address of the vtable entry. + _ptf_t *vtentryp = (_ptf_t *)(((uintptr_t)vtablep) + (uintptr_t)pfn); + // Load the pointer from the vtable entry. + pfn = *vtentryp; + + // Finally, resign the vtable entry as a function pointer. + uintptr_t auth_data = ptrauth_blend_discriminator(vtentryp, entity_hash); + pfn = ptrauth_auth_and_resign(pfn, ptrauth_key_function_pointer, + auth_data, ptrauth_key_function_pointer, 0); +#else /* defined(HAS_APPLE_PAC) */ pfn = *(_ptf_t *)(((uintptr_t)*u.vtablep) + (uintptr_t)pfn); +#endif /* !defined(HAS_APPLE_PAC) */ return pfn; } else { // Not virtual, i.e. plain member func return pfn; } +#endif } #endif /* defined(__arm__) || defined(__arm64__) */ @@ -243,7 +280,32 @@ OSMetaClassBase::safeMetaCast( const OSMetaClassBase * me, const OSMetaClass * toType) { - return (me)? me->metaCast(toType) : 0; + return (me)? me->metaCast(toType) : NULL; +} + +/// A helper function to crash with a kernel panic. +__attribute__((cold, not_tail_called, noreturn)) +static inline void +panic_crash_fail_cast(const OSMetaClassBase *me, + const OSMetaClass *toType) +{ + panic("Unexpected cast fail: from %p to %p", me, toType); + __builtin_unreachable(); +} + +OSMetaClassBase * +OSMetaClassBase::requiredMetaCast( + const OSMetaClassBase * me, + const OSMetaClass * toType) +{ + if (!me) { + return NULL; + } + OSMetaClassBase *tmp = safeMetaCast(me, toType); + if (!tmp) { + panic_crash_fail_cast(me, toType); + } + return tmp; } /********************************************************************* @@ -254,7 +316,7 @@ OSMetaClassBase::checkTypeInst( const OSMetaClassBase * typeinst) { const OSMetaClass * toType = OSTypeIDInst(typeinst); - return typeinst && inst && (0 != inst->metaCast(toType)); + return typeinst && inst && (NULL != inst->metaCast(toType)); } /********************************************************************* @@ -327,7 +389,7 @@ OSMetaClassBase * OSMetaClassBase::metaCast(const OSString * toMetaStr) const { const OSSymbol * tempSymb = OSSymbol::withString(toMetaStr); - OSMetaClassBase * ret = 0; + OSMetaClassBase * ret = NULL; if (tempSymb) { ret = metaCast(tempSymb); tempSymb->release(); @@ -341,7 +403,7 @@ OSMetaClassBase * OSMetaClassBase::metaCast(const char * toMetaCStr) const { const OSSymbol * tempSymb = OSSymbol::withCString(toMetaCStr); - OSMetaClassBase * ret = 0; + OSMetaClassBase * ret = NULL; if (tempSymb) { ret = metaCast(tempSymb); tempSymb->release(); @@ -362,13 +424,13 @@ public: OSObject * alloc() const; }; OSMetaClassMeta::OSMetaClassMeta() - : OSMetaClass("OSMetaClass", 0, sizeof(OSMetaClass)) + : OSMetaClass("OSMetaClass", NULL, sizeof(OSMetaClass)) { } OSObject * OSMetaClassMeta::alloc() const { - return 0; + return NULL; } static OSMetaClassMeta sOSMetaClassMeta; @@ -496,6 +558,7 @@ OSMetaClass::logError(OSReturn error) * registration, and OSMetaClass::postModLoad(), which actually * records all the class/kext relationships of the new MetaClasses. *********************************************************************/ + OSMetaClass::OSMetaClass( const char * inClassName, const OSMetaClass * inSuperClass, @@ -568,7 +631,7 @@ OSMetaClass::OSMetaClass( *********************************************************************/ OSMetaClass::~OSMetaClass() { - OSKext * myKext = reserved ? reserved->kext : 0; // do not release + OSKext * myKext = reserved ? reserved->kext : NULL; // do not release /* Hack alert: 'className' is a C string during early C++ init, and * is converted to a real OSSymbol only when we record the OSKext in @@ -698,7 +761,7 @@ OSMetaClass::preModLoad(const char * kextIdentifier) kalloc_tag(kKModCapacityIncrement * sizeof(OSMetaClass *), VM_KERN_MEMORY_OSKEXT); if (!sStalled->classes) { kfree(sStalled, sizeof(*sStalled)); - return 0; + return NULL; } OSMETA_ACCUMSIZE((kKModCapacityIncrement * sizeof(OSMetaClass *)) + sizeof(*sStalled)); @@ -730,8 +793,8 @@ OSReturn OSMetaClass::postModLoad(void * loadHandle) { OSReturn result = kOSReturnSuccess; - OSSymbol * myKextName = 0;// must release - OSKext * myKext = 0;// must release + OSSymbol * myKextName = NULL;// must release + OSKext * myKext = NULL;// must release if (!sStalled || loadHandle != sStalled) { result = kOSMetaClassInternal; @@ -882,7 +945,7 @@ finish: sizeof(*sStalled))); kfree(sStalled->classes, sStalled->capacity * sizeof(OSMetaClass *)); kfree(sStalled, sizeof(*sStalled)); - sStalled = 0; + sStalled = NULL; } IOLockUnlock(sStalledClassesLock); @@ -988,7 +1051,7 @@ OSMetaClass::removeInstance(const OSObject * instance, bool super) const } IOLockLock(sAllClassesLock); reserved->instances->release(); - reserved->instances = 0; + reserved->instances = NULL; IOLockUnlock(sAllClassesLock); } } @@ -1072,7 +1135,7 @@ OSMetaClass::applyToInstancesOfClassName( void * context) { OSMetaClass * meta; - OSOrderedSet * set = 0; + OSOrderedSet * set = NULL; IOLockLock(sAllClassesLock); if (sAllClassesDict @@ -1144,10 +1207,10 @@ OSMetaClass::removeClasses(OSCollection * metaClasses) const OSMetaClass * OSMetaClass::getMetaClassWithName(const OSSymbol * name) { - OSMetaClass * retMeta = 0; + OSMetaClass * retMeta = NULL; if (!name) { - return 0; + return NULL; } IOLockLock(sAllClassesLock); @@ -1167,10 +1230,10 @@ OSMetaClass::copyMetaClassWithName(const OSSymbol * name) const OSMetaClass * meta; if (!name) { - return 0; + return NULL; } - meta = 0; + meta = NULL; IOLockLock(sAllClassesLock); if (sAllClassesDict) { meta = (OSMetaClass *) sAllClassesDict->getObject(name); @@ -1199,7 +1262,7 @@ OSMetaClass::allocClassWithName(const OSSymbol * name) const OSMetaClass * meta; OSObject * result; - result = 0; + result = NULL; meta = copyMetaClassWithName(name); if (meta) { result = meta->alloc(); @@ -1239,7 +1302,7 @@ OSMetaClass::checkMetaCastWithName( const OSSymbol * name, const OSMetaClassBase * in) { - OSMetaClassBase * result = 0; + OSMetaClassBase * result = NULL; const OSMetaClass * const meta = getMetaClassWithName(name); @@ -1305,11 +1368,12 @@ OSMetaClass::checkMetaCast( } } - return 0; + return NULL; } /********************************************************************* *********************************************************************/ +__dead2 void OSMetaClass::reservedCalled(int ind) const { @@ -1332,7 +1396,7 @@ OSMetaClass::getSuperClass() const const OSSymbol * OSMetaClass::getKmodName() const { - OSKext * myKext = reserved ? reserved->kext : 0; + OSKext * myKext = reserved ? reserved->kext : NULL; if (myKext) { return myKext->getIdentifier(); } @@ -1383,7 +1447,7 @@ OSDictionary * OSMetaClass::getClassDictionary() { panic("OSMetaClass::getClassDictionary() is obsoleted.\n"); - return 0; + return NULL; } /********************************************************************* diff --git a/libkern/c++/OSNumber.cpp b/libkern/c++/OSNumber.cpp index ffbc9e793..6b6a6caae 100644 --- a/libkern/c++/OSNumber.cpp +++ b/libkern/c++/OSNumber.cpp @@ -85,7 +85,7 @@ OSNumber::withNumber(unsigned long long value, if (me && !me->init(value, newNumberOfBits)) { me->release(); - return 0; + return NULL; } return me; @@ -98,7 +98,7 @@ OSNumber::withNumber(const char *value, unsigned int newNumberOfBits) if (me && !me->init(value, newNumberOfBits)) { me->release(); - return 0; + return NULL; } return me; diff --git a/libkern/c++/OSObject.cpp b/libkern/c++/OSObject.cpp index de9cc00fa..61168bf48 100644 --- a/libkern/c++/OSObject.cpp +++ b/libkern/c++/OSObject.cpp @@ -50,7 +50,7 @@ __END_DECLS /* Class global data */ OSObject::MetaClass OSObject::gMetaClass; const OSMetaClass * const OSObject::metaClass = &OSObject::gMetaClass; -const OSMetaClass * const OSObject::superClass = 0; +const OSMetaClass * const OSObject::superClass = NULL; /* Class member functions - Can't use defaults */ OSObject::~OSObject() @@ -64,7 +64,7 @@ OSObject::getMetaClass() const OSObject * OSObject::MetaClass::alloc() const { - return 0; + return NULL; } /* The OSObject::MetaClass constructor */ @@ -233,13 +233,13 @@ OSObject::taggedRelease(const void *tag, const int when) const void OSObject::release() const { - taggedRelease(0); + taggedRelease(NULL); } void OSObject::retain() const { - taggedRetain(0); + taggedRetain(NULL); } extern "C" void @@ -257,7 +257,7 @@ osobject_release(void * object) void OSObject::release(int when) const { - taggedRelease(0, when); + taggedRelease(NULL, when); } bool @@ -365,3 +365,24 @@ OSObject::OSObject(const OSMetaClass *) retainCount = 1; // if (kIOTracking & gIOKitDebug) getMetaClass()->trackedInstance(this); } + + +bool +OSObject::iterateObjects(void * refcon, bool (*callback)(void * refcon, OSObject * object)) +{ + OSCollection * col; + if ((col = OSDynamicCast(OSCollection, this))) { + return col->iterateObjects(refcon, callback); + } + return callback(refcon, this); +} + +bool +OSObject::iterateObjects(bool (^block)(OSObject * object)) +{ + OSCollection * col; + if ((col = OSDynamicCast(OSCollection, this))) { + return col->iterateObjects(block); + } + return block(this); +} diff --git a/libkern/c++/OSOrderedSet.cpp b/libkern/c++/OSOrderedSet.cpp index 2b7cd44c8..88230f659 100644 --- a/libkern/c++/OSOrderedSet.cpp +++ b/libkern/c++/OSOrderedSet.cpp @@ -93,7 +93,7 @@ withCapacity(unsigned int capacity, if (me && !me->initWithCapacity(capacity, ordering, orderingRef)) { me->release(); - me = 0; + me = NULL; } return me; @@ -298,7 +298,7 @@ OSObject * OSOrderedSet::getObject( unsigned int index ) const { if (index >= count) { - return 0; + return NULL; } // if( pri) @@ -313,7 +313,7 @@ OSOrderedSet::getFirstObject() const if (count) { return const_cast((const OSObject *) array[0].obj); } else { - return 0; + return NULL; } } @@ -323,14 +323,14 @@ OSOrderedSet::getLastObject() const if (count) { return const_cast((const OSObject *) array[count - 1].obj); } else { - return 0; + return NULL; } } SInt32 OSOrderedSet::orderObject( const OSMetaClassBase * anObject ) { - return ORDER( anObject, 0 ); + return ORDER( anObject, NULL ); } void * @@ -399,10 +399,10 @@ getNextObjectForIterator(void *inIterator, OSObject **ret) const if (index < count) { *ret = const_cast((const OSObject *) array[index].obj); } else { - *ret = 0; + *ret = NULL; } - return *ret != 0; + return *ret != NULL; } @@ -427,13 +427,13 @@ OSCollection * OSOrderedSet::copyCollection(OSDictionary *cycleDict) { bool allocDict = !cycleDict; - OSCollection *ret = 0; - OSOrderedSet *newSet = 0; + OSCollection *ret = NULL; + OSOrderedSet *newSet = NULL; if (allocDict) { cycleDict = OSDictionary::withCapacity(16); if (!cycleDict) { - return 0; + return NULL; } } @@ -474,7 +474,7 @@ OSOrderedSet::copyCollection(OSDictionary *cycleDict) ; ret = newSet; - newSet = 0; + newSet = NULL; } while (false); abortCopy: diff --git a/libkern/c++/OSRuntime.cpp b/libkern/c++/OSRuntime.cpp index 122acda60..ba1dd30b1 100644 --- a/libkern/c++/OSRuntime.cpp +++ b/libkern/c++/OSRuntime.cpp @@ -45,6 +45,10 @@ __BEGIN_DECLS #include #include +#if KASAN +#include +#endif + #if PRAGMA_MARK #pragma mark Constants &c. #endif /* PRAGMA_MARK */ @@ -95,12 +99,12 @@ kern_os_malloc(size_t size) { void *mem; if (size == 0) { - return 0; + return NULL; } mem = kallocp_tag_bt((vm_size_t *)&size, VM_KERN_MEMORY_LIBKERN); if (!mem) { - return 0; + return NULL; } #if OSALLOCDEBUG @@ -147,13 +151,13 @@ kern_os_realloc( if (nsize == 0) { kfree_addr(addr); - return 0; + return NULL; } nmem = kallocp_tag_bt((vm_size_t *)&nsize, VM_KERN_MEMORY_LIBKERN); if (!nmem) { kfree_addr(addr); - return 0; + return NULL; } #if OSALLOCDEBUG @@ -177,13 +181,13 @@ kern_os_realloc( *********************************************************************/ #if __GNUC__ >= 3 -void +void __dead2 __cxa_pure_virtual( void ) { panic("%s", __FUNCTION__); } #else -void +void __dead2 __pure_virtual( void ) { panic("%s", __FUNCTION__); @@ -236,6 +240,9 @@ __END_DECLS * kern_os C++ Runtime Load/Unload *********************************************************************/ +#if defined(HAS_APPLE_PAC) +#include +#endif /* defined(HAS_APPLE_PAC) */ typedef void (*structor_t)(void); @@ -310,6 +317,10 @@ OSRuntimeCallStructorsInSection( break; } +#if !defined(XXX) && defined(HAS_APPLE_PAC) + structor = __builtin_ptrauth_strip(structor, ptrauth_key_function_pointer); + structor = __builtin_ptrauth_sign_unauthenticated(structor, ptrauth_key_function_pointer, 0); +#endif (*structor)(); } else if (!hit_null_structor) { hit_null_structor = 1; @@ -393,7 +404,7 @@ OSRuntimeFinalizeCPP( segment = firstsegfromheader(header); for (segment = firstsegfromheader(header); - segment != 0; + segment != NULL; segment = nextsegfromheader(header, segment)) { OSRuntimeCallStructorsInSection(theKext, kmodInfo, NULL, segment, sectionNames[kOSSectionNameFinalizer], textStart, textEnd); @@ -487,7 +498,7 @@ OSRuntimeInitializeCPP( * segment, and invoke the constructors within those sections. */ for (segment = firstsegfromheader(header); - segment != failure_segment && segment != 0; + segment != failure_segment && segment != NULL; segment = nextsegfromheader(header, segment)) { OSRuntimeCallStructorsInSection(theKext, kmodInfo, NULL, segment, sectionNames[kOSSectionNameFinalizer], textStart, textEnd); @@ -572,11 +583,42 @@ noexcept #endif { if (ptr) { +#if KASAN + /* + * Unpoison the C++ array cookie inserted (but not removed) by the + * compiler on new[]. + */ + kasan_unpoison_cxx_array_cookie(ptr); +#endif kern_os_free(ptr); } return; } +#if __cplusplus >= 201103L + +void +operator delete(void * addr, size_t sz) noexcept +{ +#if OSALLOCDEBUG + OSAddAtomic(-sz, &debug_iomalloc_size); +#endif /* OSALLOCDEBUG */ + kfree(addr, sz); +} + +void +operator delete[](void * addr, size_t sz) noexcept +{ + if (addr) { +#if OSALLOCDEBUG + OSAddAtomic(-sz, &debug_iomalloc_size); +#endif /* OSALLOCDEBUG */ + kfree(addr, sz); + } +} + +#endif /* __cplusplus >= 201103L */ + /* PR-6481964 - The compiler is going to check for size overflows in calls to * new[], and if there is an overflow, it will call __throw_length_error. * This is an unrecoverable error by the C++ standard, so we must panic here. @@ -585,7 +627,7 @@ noexcept * compiler expects the name to be mangled. */ namespace std { -void +void __dead2 __throw_length_error(const char *msg __unused) { panic("Size of array created by new[] has overflowed"); diff --git a/libkern/c++/OSSerialize.cpp b/libkern/c++/OSSerialize.cpp index a0366f02d..d015efe9e 100644 --- a/libkern/c++/OSSerialize.cpp +++ b/libkern/c++/OSSerialize.cpp @@ -205,7 +205,7 @@ OSSerialize::initWithCapacity(unsigned int inCapacity) } if (round_page_overflow(inCapacity, &capacity)) { tags->release(); - tags = 0; + tags = NULL; return false; } @@ -217,7 +217,7 @@ OSSerialize::initWithCapacity(unsigned int inCapacity) kern_return_t rc = kmem_alloc(kernel_map, (vm_offset_t *)&data, capacity, IOMemoryTag(kernel_map)); if (rc) { tags->release(); - tags = 0; + tags = NULL; return false; } bzero((void *)data, capacity); @@ -235,7 +235,7 @@ OSSerialize::withCapacity(unsigned int inCapacity) if (me && !me->initWithCapacity(inCapacity)) { me->release(); - return 0; + return NULL; } return me; @@ -303,9 +303,8 @@ OSSerialize::ensureCapacity(unsigned int newCapacity) void OSSerialize::free() { - if (tags) { - tags->release(); - } + OSSafeReleaseNULL(tags); + OSSafeReleaseNULL(indexData); if (data) { kmem_free(kernel_map, (vm_offset_t)data, capacity); @@ -325,7 +324,7 @@ OSSerializer * OSSerializer::forTarget( void * target, thing = new OSSerializer; if (thing && !thing->init()) { thing->release(); - thing = 0; + thing = NULL; } if (thing) { @@ -352,7 +351,7 @@ OSSerializer::withBlock( block = Block_copy(callback); if (!block) { - return 0; + return NULL; } serializer = (OSSerializer::forTarget(NULL, &OSSerializer::callbackToBlock, block)); diff --git a/libkern/c++/OSSerializeBinary.cpp b/libkern/c++/OSSerializeBinary.cpp index b408296c4..a9d9ed61c 100644 --- a/libkern/c++/OSSerializeBinary.cpp +++ b/libkern/c++/OSSerializeBinary.cpp @@ -51,11 +51,11 @@ OSSerialize::binaryWithCapacity(unsigned int inCapacity, OSSerialize *me; if (inCapacity < sizeof(uint32_t)) { - return 0; + return NULL; } me = OSSerialize::withCapacity(inCapacity); if (!me) { - return 0; + return NULL; } me->binary = true; @@ -98,17 +98,38 @@ OSSerialize::addBinary(const void * bits, size_t size) return true; } +void +OSSerialize::setIndexed(bool index __unused) +{ + assert(index && !indexData); + indexData = OSData::withCapacity(256); + assert(indexData); +} + bool OSSerialize::addBinaryObject(const OSMetaClassBase * o, uint32_t key, - const void * bits, size_t size) + const void * bits, size_t size, + uint32_t * startCollection) { unsigned int newCapacity; size_t alignSize; + size_t headerSize; // add to tag array tags->setObject(o); - if (os_add3_overflow(size, sizeof(key), 3, &alignSize)) { + headerSize = sizeof(key); + if (indexData) { + uint32_t offset = length; + if (startCollection) { + *startCollection = offset; + headerSize += sizeof(uint32_t); + } + offset /= sizeof(uint32_t); + indexData->appendBytes(&offset, sizeof(offset)); + } + + if (os_add3_overflow(size, headerSize, 3, &alignSize)) { return false; } alignSize &= ~3L; @@ -131,14 +152,58 @@ OSSerialize::addBinaryObject(const OSMetaClassBase * o, uint32_t key, } bcopy(&key, &data[length], sizeof(key)); - bcopy(bits, &data[length + sizeof(key)], size); + bcopy(bits, &data[length + headerSize], size); length += alignSize; return true; } +void +OSSerialize::endBinaryCollection(uint32_t startCollection) +{ + uint32_t clength; + + if (!indexData) { + return; + } + + assert(length > startCollection); + if (length <= startCollection) { + return; + } + + clength = length - startCollection; + assert(!(clength & 3)); + clength /= sizeof(uint32_t); + + memcpy(&data[startCollection + sizeof(uint32_t)], &clength, sizeof(clength)); +} + bool OSSerialize::binarySerialize(const OSMetaClassBase *o) +{ + bool ok; + uint32_t header; + + ok = binarySerializeInternal(o); + if (!ok) { + return ok; + } + + if (indexData) { + header = indexData->getLength() / sizeof(uint32_t); + assert(header <= kOSSerializeDataMask); + header <<= 8; + header |= kOSSerializeIndexedBinarySignature; + + memcpy(&data[0], &header, sizeof(header)); + } + + return ok; +} + +bool +OSSerialize::binarySerializeInternal(const OSMetaClassBase *o) { OSDictionary * dict; OSArray * array; @@ -150,13 +215,18 @@ OSSerialize::binarySerialize(const OSMetaClassBase *o) OSBoolean * boo; unsigned int tagIdx; - uint32_t i, key; + uint32_t i, key, startCollection; size_t len; bool ok; tagIdx = tags->getNextIndexOfObject(o, 0); // does it exist? if (-1U != tagIdx) { + if (indexData) { + assert(indexData->getLength() > (tagIdx * sizeof(uint32_t))); + tagIdx = ((const uint32_t *)indexData->getBytesNoCopy())[tagIdx]; + assert(tagIdx <= kOSSerializeDataMask); + } key = (kOSSerializeObject | tagIdx); if (endCollection) { endCollection = false; @@ -168,11 +238,11 @@ OSSerialize::binarySerialize(const OSMetaClassBase *o) if ((dict = OSDynamicCast(OSDictionary, o))) { key = (kOSSerializeDictionary | dict->count); - ok = addBinaryObject(o, key, NULL, 0); + ok = addBinaryObject(o, key, NULL, 0, &startCollection); for (i = 0; ok && (i < dict->count);) { const OSSymbol * dictKey; const OSMetaClassBase * dictValue; - const OSMetaClassBase * nvalue = 0; + const OSMetaClassBase * nvalue = NULL; dictKey = dict->dictionary[i].key; dictValue = dict->dictionary[i].value; @@ -197,9 +267,10 @@ OSSerialize::binarySerialize(const OSMetaClassBase *o) } // if (!ok) ok = binarySerialize(kOSBooleanFalse); } + endBinaryCollection(startCollection); } else if ((array = OSDynamicCast(OSArray, o))) { key = (kOSSerializeArray | array->count); - ok = addBinaryObject(o, key, NULL, 0); + ok = addBinaryObject(o, key, NULL, 0, &startCollection); for (i = 0; ok && (i < array->count);) { i++; endCollection = (i == array->count); @@ -209,9 +280,10 @@ OSSerialize::binarySerialize(const OSMetaClassBase *o) } // if (!ok) ok = binarySerialize(kOSBooleanFalse); } + endBinaryCollection(startCollection); } else if ((set = OSDynamicCast(OSSet, o))) { key = (kOSSerializeSet | set->members->count); - ok = addBinaryObject(o, key, NULL, 0); + ok = addBinaryObject(o, key, NULL, 0, &startCollection); for (i = 0; ok && (i < set->members->count);) { i++; endCollection = (i == set->members->count); @@ -221,27 +293,28 @@ OSSerialize::binarySerialize(const OSMetaClassBase *o) } // if (!ok) ok = binarySerialize(kOSBooleanFalse); } + endBinaryCollection(startCollection); } else if ((num = OSDynamicCast(OSNumber, o))) { key = (kOSSerializeNumber | num->size); - ok = addBinaryObject(o, key, &num->value, sizeof(num->value)); + ok = addBinaryObject(o, key, &num->value, sizeof(num->value), NULL); } else if ((boo = OSDynamicCast(OSBoolean, o))) { key = (kOSSerializeBoolean | (kOSBooleanTrue == boo)); - ok = addBinaryObject(o, key, NULL, 0); + ok = addBinaryObject(o, key, NULL, 0, NULL); } else if ((sym = OSDynamicCast(OSSymbol, o))) { len = (sym->getLength() + 1); key = (kOSSerializeSymbol | len); - ok = addBinaryObject(o, key, sym->getCStringNoCopy(), len); + ok = addBinaryObject(o, key, sym->getCStringNoCopy(), len, NULL); } else if ((str = OSDynamicCast(OSString, o))) { - len = (str->getLength() + 0); + len = (str->getLength() + ((indexData != NULL) ? 1 : 0)); key = (kOSSerializeString | len); - ok = addBinaryObject(o, key, str->getCStringNoCopy(), len); + ok = addBinaryObject(o, key, str->getCStringNoCopy(), len, NULL); } else if ((ldata = OSDynamicCast(OSData, o))) { len = ldata->getLength(); if (ldata->reserved && ldata->reserved->disableSerialization) { len = 0; } key = (kOSSerializeData | len); - ok = addBinaryObject(o, key, ldata->getBytesNoCopy(), len); + ok = addBinaryObject(o, key, ldata->getBytesNoCopy(), len, NULL); } else { return false; } @@ -303,23 +376,28 @@ OSUnserializeBinary(const char *buffer, size_t bufferSize, OSString **errorStrin size_t bufferPos; const uint32_t * next; - uint32_t key, len, wordLen; + uint32_t key, len, wordLen, length; bool end, newCollect, isRef; unsigned long long value; - bool ok; + bool ok, indexed, hasLength; + indexed = false; if (errorString) { - *errorString = 0; + *errorString = NULL; } + if (bufferSize < sizeof(kOSSerializeBinarySignature)) { return NULL; } - if (0 != strcmp(kOSSerializeBinarySignature, buffer)) { + if (kOSSerializeIndexedBinarySignature == (((const uint8_t *) buffer)[0])) { + indexed = true; + } else if (0 != strcmp(kOSSerializeBinarySignature, buffer)) { return NULL; } if (3 & ((uintptr_t) buffer)) { return NULL; } + bufferPos = sizeof(kOSSerializeBinarySignature); next = (typeof(next))(((uintptr_t) buffer) + bufferPos); @@ -329,12 +407,12 @@ OSUnserializeBinary(const char *buffer, size_t bufferSize, OSString **errorStrin objsIdx = objsCapacity = 0; stackIdx = stackCapacity = 0; - result = 0; - parent = 0; - dict = 0; - array = 0; - set = 0; - sym = 0; + result = NULL; + parent = NULL; + dict = NULL; + array = NULL; + set = NULL; + sym = NULL; ok = true; while (ok) { @@ -343,27 +421,31 @@ OSUnserializeBinary(const char *buffer, size_t bufferSize, OSString **errorStrin break; } key = *next++; + length = 0; len = (key & kOSSerializeDataMask); wordLen = (len + 3) >> 2; end = (0 != (kOSSerializeEndCollecton & key)); DEBG("key 0x%08x: 0x%04x, %d\n", key, len, end); - newCollect = isRef = false; - o = 0; newDict = 0; newArray = 0; newSet = 0; + newCollect = isRef = hasLength = false; + o = NULL; newDict = NULL; newArray = NULL; newSet = NULL; switch (kOSSerializeTypeMask & key) { case kOSSerializeDictionary: o = newDict = OSDictionary::withCapacity(len); newCollect = (len != 0); + hasLength = indexed; break; case kOSSerializeArray: o = newArray = OSArray::withCapacity(len); newCollect = (len != 0); + hasLength = indexed; break; case kOSSerializeSet: o = newSet = OSSet::withCapacity(len); newCollect = (len != 0); + hasLength = indexed; break; case kOSSerializeObject: @@ -430,10 +512,18 @@ OSUnserializeBinary(const char *buffer, size_t bufferSize, OSString **errorStrin break; } - if (!(ok = (o != 0))) { + if (!(ok = (o != NULL))) { break; } + if (hasLength) { + bufferPos += sizeof(*next); + if (!(ok = (bufferPos <= bufferSize))) { + break; + } + length = *next++; + } + if (!isRef) { setAtIndex(objs, objsIdx, o); if (!ok) { @@ -451,7 +541,7 @@ OSUnserializeBinary(const char *buffer, size_t bufferSize, OSString **errorStrin sym = OSDynamicCast(OSSymbol, sym); if (!sym && (str = OSDynamicCast(OSString, str))) { sym = const_cast(OSSymbol::withString(str)); - ok = (sym != 0); + ok = (sym != NULL); if (!ok) { break; } @@ -463,7 +553,7 @@ OSUnserializeBinary(const char *buffer, size_t bufferSize, OSString **errorStrin if (sym && (sym != str)) { sym->release(); } - sym = 0; + sym = NULL; } } else if (array) { ok = array->setObject(o); @@ -481,7 +571,7 @@ OSUnserializeBinary(const char *buffer, size_t bufferSize, OSString **errorStrin } if (end) { - parent = 0; + parent = NULL; } if (newCollect) { stackIdx++; @@ -509,12 +599,12 @@ OSUnserializeBinary(const char *buffer, size_t bufferSize, OSString **errorStrin if (!parent) { break; } - set = 0; - dict = 0; - array = 0; + set = NULL; + dict = NULL; + array = NULL; if (!(dict = OSDynamicCast(OSDictionary, parent))) { if (!(array = OSDynamicCast(OSArray, parent))) { - ok = (0 != (set = OSDynamicCast(OSSet, parent))); + ok = (NULL != (set = OSDynamicCast(OSSet, parent))); } } } @@ -522,11 +612,11 @@ OSUnserializeBinary(const char *buffer, size_t bufferSize, OSString **errorStrin DEBG("ret %p\n", result); if (!ok) { - result = 0; + result = NULL; } if (objsCapacity) { - for (len = (result != 0); len < objsIdx; len++) { + for (len = (result != NULL); len < objsIdx; len++) { objsArray[len]->release(); } kfree(objsArray, objsCapacity * sizeof(*objsArray)); diff --git a/libkern/c++/OSSet.cpp b/libkern/c++/OSSet.cpp index 3c7701dcf..ed8b2762c 100644 --- a/libkern/c++/OSSet.cpp +++ b/libkern/c++/OSSet.cpp @@ -119,7 +119,7 @@ OSSet::withCapacity(unsigned int capacity) if (me && !me->initWithCapacity(capacity)) { me->release(); - return 0; + return NULL; } return me; @@ -134,7 +134,7 @@ OSSet::withObjects(const OSObject *objects[], if (me && !me->initWithObjects(objects, count, capacity)) { me->release(); - return 0; + return NULL; } return me; @@ -148,7 +148,7 @@ OSSet::withArray(const OSArray *array, if (me && !me->initWithArray(array, capacity)) { me->release(); - return 0; + return NULL; } return me; @@ -162,7 +162,7 @@ OSSet::withSet(const OSSet *set, if (me && !me->initWithSet(set, capacity)) { me->release(); - return 0; + return NULL; } return me; @@ -230,7 +230,7 @@ OSSet::setObject(const OSMetaClassBase *anObject) bool OSSet::merge(const OSArray * array) { - const OSMetaClassBase * anObject = 0; + const OSMetaClassBase * anObject = NULL; bool result = true; for (int i = 0; (anObject = array->getObject(i)); i++) { @@ -367,10 +367,10 @@ OSSet::getNextObjectForIterator(void *inIterator, OSObject **ret) const if (index < members->count) { *ret = members->getObject(index); } else { - *ret = 0; + *ret = NULL; } - return *ret != 0; + return *ret != NULL; } bool @@ -410,13 +410,13 @@ OSCollection * OSSet::copyCollection(OSDictionary *cycleDict) { bool allocDict = !cycleDict; - OSCollection *ret = 0; - OSSet *newSet = 0; + OSCollection *ret = NULL; + OSSet *newSet = NULL; if (allocDict) { cycleDict = OSDictionary::withCapacity(16); if (!cycleDict) { - return 0; + return NULL; } } @@ -455,7 +455,7 @@ OSSet::copyCollection(OSDictionary *cycleDict) ; ret = newSet; - newSet = 0; + newSet = NULL; } while (false); abortCopy: diff --git a/libkern/c++/OSString.cpp b/libkern/c++/OSString.cpp index 91fc3cba2..c5196917c 100644 --- a/libkern/c++/OSString.cpp +++ b/libkern/c++/OSString.cpp @@ -165,7 +165,7 @@ OSString::withString(const OSString *aString) if (me && !me->initWithString(aString)) { me->release(); - return 0; + return NULL; } return me; @@ -178,7 +178,7 @@ OSString::withCString(const char *cString) if (me && !me->initWithCString(cString)) { me->release(); - return 0; + return NULL; } return me; @@ -191,7 +191,7 @@ OSString::withCStringNoCopy(const char *cString) if (me && !me->initWithCStringNoCopy(cString)) { me->release(); - return 0; + return NULL; } return me; @@ -204,7 +204,7 @@ OSString::withStringOfLength(const char *cString, size_t length) if (me && !me->initWithStringOfLength(cString, length)) { me->release(); - return 0; + return NULL; } return me; diff --git a/libkern/c++/OSSymbol.cpp b/libkern/c++/OSSymbol.cpp index 455ea10d6..695fda33a 100644 --- a/libkern/c++/OSSymbol.cpp +++ b/libkern/c++/OSSymbol.cpp @@ -188,7 +188,7 @@ OSSymbolPool::init() poolGate = lck_rw_alloc_init(IOLockGroup, LCK_ATTR_NULL); - return poolGate != 0; + return poolGate != NULL; } OSSymbolPool::OSSymbolPool(const OSSymbolPool *old) @@ -197,7 +197,7 @@ OSSymbolPool::OSSymbolPool(const OSSymbolPool *old) nBuckets = old->nBuckets; buckets = old->buckets; - poolGate = 0; // Do not duplicate the poolGate + poolGate = NULL; // Do not duplicate the poolGate } OSSymbolPool::~OSSymbolPool() @@ -250,7 +250,7 @@ OSSymbolPool::nextHashState(OSSymbolPoolState *stateP) while (!stateP->j) { if (!stateP->i) { - return 0; + return NULL; } stateP->i--; thisBucket--; @@ -319,7 +319,7 @@ OSSymbolPool::findSymbol(const char *cString) const j = thisBucket->count; if (!j) { - return 0; + return NULL; } if (j == 1) { @@ -330,7 +330,7 @@ OSSymbolPool::findSymbol(const char *cString) const && probeSymbol->taggedTryRetain(nullptr)) { return probeSymbol; } - return 0; + return NULL; } for (list = thisBucket->symbolP; j--; list++) { @@ -342,7 +342,7 @@ OSSymbolPool::findSymbol(const char *cString) const } } - return 0; + return NULL; } OSSymbol * @@ -432,7 +432,7 @@ OSSymbolPool::removeSymbol(OSSymbol *sym) probeSymbol = (OSSymbol *) list; if (probeSymbol == sym) { - thisBucket->symbolP = 0; + thisBucket->symbolP = NULL; count--; thisBucket->count--; SHRINK_POOL(); diff --git a/libkern/c++/OSUnserialize.y b/libkern/c++/OSUnserialize.y index 86f396784..d3189324e 100644 --- a/libkern/c++/OSUnserialize.y +++ b/libkern/c++/OSUnserialize.y @@ -292,7 +292,7 @@ yylex() /* copy to null terminated buffer */ tempString = (char *)malloc(length + 1); - if (tempString == 0) { + if (tempString == NULL) { printf("OSUnserialize: can't alloc temp memory\n"); return 0; } @@ -320,7 +320,7 @@ yylex() (void)nextChar(); /* copy to null terminated buffer */ tempString = (char *)malloc(length + 1); - if (tempString == 0) { + if (tempString == NULL) { printf("OSUnserialize: can't alloc temp memory\n"); return 0; } @@ -626,9 +626,9 @@ OSUnserialize(const char *buffer, OSString **errorString) tags = OSDictionary::withCapacity(128); if (yyparse() == 0) { object = parsedObject; - if (errorString) *errorString = 0; + if (errorString) *errorString = NULL; } else { - object = 0; + object = NULL; if (errorString) *errorString = OSString::withCString(yyerror_message); } diff --git a/libkern/c++/OSUnserializeXML.cpp b/libkern/c++/OSUnserializeXML.cpp index 2c53ef4cc..338246042 100644 --- a/libkern/c++/OSUnserializeXML.cpp +++ b/libkern/c++/OSUnserializeXML.cpp @@ -258,7 +258,7 @@ typedef int YYSTYPE; /* Line 216 of yacc.c. */ -#line 215 "OSUnserializeXML.tab.c" +#line 212 "OSUnserializeXML.tab.c" #ifdef short # undef short @@ -549,10 +549,10 @@ static const yytype_int8 yyrhs[] = /* YYRLINE[YYN] -- source line where rule number YYN was defined. */ static const yytype_uint16 yyrline[] = { - 0, 149, 149, 152, 157, 162, 174, 186, 198, 210, - 222, 234, 246, 265, 268, 271, 274, 275, 290, 299, - 311, 314, 317, 320, 323, 326, 329, 332, 339, 342, - 345, 348, 351 + 0, 146, 146, 149, 154, 159, 171, 183, 195, 207, + 219, 231, 243, 267, 270, 273, 276, 277, 292, 301, + 313, 316, 319, 322, 325, 328, 331, 334, 341, 344, + 347, 350, 353 }; #endif @@ -933,7 +933,7 @@ int yydebug; /* YYINITDEPTH -- initial size of the parser's stacks. */ #ifndef YYINITDEPTH -# define YYINITDEPTH 64 +# define YYINITDEPTH 200 #endif /* YYMAXDEPTH -- maximum size the stacks can grow to (effective only @@ -1495,14 +1495,14 @@ yyreduce: YY_REDUCE_PRINT(yyn); switch (yyn) { case 2: -#line 149 "OSUnserializeXML.y" +#line 146 "OSUnserializeXML.y" { yyerror("unexpected end of buffer"); YYERROR; ;} break; case 3: -#line 152 "OSUnserializeXML.y" +#line 149 "OSUnserializeXML.y" { STATE->parsedObject = (yyvsp[(1) - (1)])->object; (yyvsp[(1) - (1)])->object = 0; freeObject(STATE, (yyvsp[(1) - (1)])); @@ -1511,14 +1511,14 @@ yyreduce: break; case 4: -#line 157 "OSUnserializeXML.y" +#line 154 "OSUnserializeXML.y" { yyerror("syntax error"); YYERROR; ;} break; case 5: -#line 162 "OSUnserializeXML.y" +#line 159 "OSUnserializeXML.y" { (yyval) = buildDictionary(STATE, (yyvsp[(1) - (1)])); if (!yyval->object) { @@ -1534,7 +1534,7 @@ yyreduce: break; case 6: -#line 174 "OSUnserializeXML.y" +#line 171 "OSUnserializeXML.y" { (yyval) = buildArray(STATE, (yyvsp[(1) - (1)])); if (!yyval->object) { @@ -1550,7 +1550,7 @@ yyreduce: break; case 7: -#line 186 "OSUnserializeXML.y" +#line 183 "OSUnserializeXML.y" { (yyval) = buildSet(STATE, (yyvsp[(1) - (1)])); if (!yyval->object) { @@ -1566,7 +1566,7 @@ yyreduce: break; case 8: -#line 198 "OSUnserializeXML.y" +#line 195 "OSUnserializeXML.y" { (yyval) = buildString(STATE, (yyvsp[(1) - (1)])); if (!yyval->object) { @@ -1582,7 +1582,7 @@ yyreduce: break; case 9: -#line 210 "OSUnserializeXML.y" +#line 207 "OSUnserializeXML.y" { (yyval) = buildData(STATE, (yyvsp[(1) - (1)])); if (!yyval->object) { @@ -1598,7 +1598,7 @@ yyreduce: break; case 10: -#line 222 "OSUnserializeXML.y" +#line 219 "OSUnserializeXML.y" { (yyval) = buildNumber(STATE, (yyvsp[(1) - (1)])); if (!yyval->object) { @@ -1614,7 +1614,7 @@ yyreduce: break; case 11: -#line 234 "OSUnserializeXML.y" +#line 231 "OSUnserializeXML.y" { (yyval) = buildBoolean(STATE, (yyvsp[(1) - (1)])); if (!yyval->object) { @@ -1630,7 +1630,7 @@ yyreduce: break; case 12: -#line 246 "OSUnserializeXML.y" +#line 243 "OSUnserializeXML.y" { (yyval) = retrieveObject(STATE, (yyvsp[(1) - (1)])->idref); if ((yyval)) { STATE->retrievedObjectCount++; @@ -1654,21 +1654,21 @@ yyreduce: break; case 13: -#line 265 "OSUnserializeXML.y" +#line 267 "OSUnserializeXML.y" { (yyval) = (yyvsp[(1) - (2)]); (yyval)->elements = NULL; ;} break; case 14: -#line 268 "OSUnserializeXML.y" +#line 270 "OSUnserializeXML.y" { (yyval) = (yyvsp[(1) - (3)]); (yyval)->elements = (yyvsp[(2) - (3)]); ;} break; case 17: -#line 275 "OSUnserializeXML.y" +#line 277 "OSUnserializeXML.y" { (yyval) = (yyvsp[(2) - (2)]); (yyval)->next = (yyvsp[(1) - (2)]); @@ -1685,7 +1685,7 @@ yyreduce: break; case 18: -#line 290 "OSUnserializeXML.y" +#line 292 "OSUnserializeXML.y" { (yyval) = (yyvsp[(1) - (2)]); (yyval)->key = (OSSymbol *)(yyval)->object; (yyval)->object = (yyvsp[(2) - (2)])->object; @@ -1696,7 +1696,7 @@ yyreduce: break; case 19: -#line 299 "OSUnserializeXML.y" +#line 301 "OSUnserializeXML.y" { (yyval) = buildSymbol(STATE, (yyvsp[(1) - (1)])); // STATE->parsedObjectCount++; @@ -1708,42 +1708,42 @@ yyreduce: break; case 20: -#line 311 "OSUnserializeXML.y" +#line 313 "OSUnserializeXML.y" { (yyval) = (yyvsp[(1) - (2)]); (yyval)->elements = NULL; ;} break; case 21: -#line 314 "OSUnserializeXML.y" +#line 316 "OSUnserializeXML.y" { (yyval) = (yyvsp[(1) - (3)]); (yyval)->elements = (yyvsp[(2) - (3)]); ;} break; case 23: -#line 320 "OSUnserializeXML.y" +#line 322 "OSUnserializeXML.y" { (yyval) = (yyvsp[(1) - (2)]); (yyval)->elements = NULL; ;} break; case 24: -#line 323 "OSUnserializeXML.y" +#line 325 "OSUnserializeXML.y" { (yyval) = (yyvsp[(1) - (3)]); (yyval)->elements = (yyvsp[(2) - (3)]); ;} break; case 26: -#line 329 "OSUnserializeXML.y" +#line 331 "OSUnserializeXML.y" { (yyval) = (yyvsp[(1) - (1)]); (yyval)->next = NULL; ;} break; case 27: -#line 332 "OSUnserializeXML.y" +#line 334 "OSUnserializeXML.y" { (yyval) = (yyvsp[(2) - (2)]); (yyval)->next = (yyvsp[(1) - (2)]); ;} @@ -1751,7 +1751,7 @@ yyreduce: /* Line 1267 of yacc.c. */ -#line 1699 "OSUnserializeXML.tab.c" +#line 1701 "OSUnserializeXML.tab.c" default: break; } YY_SYMBOL_PRINT("-> $$ =", yyr1[yyn], &yyval, &yyloc); @@ -1963,7 +1963,7 @@ yyreturn: } -#line 354 "OSUnserializeXML.y" +#line 356 "OSUnserializeXML.y" int @@ -2187,7 +2187,7 @@ getString(parser_state_t *state) /* copy to null terminated buffer */ tempString = (char *)malloc(length + 1); - if (tempString == 0) { + if (tempString == NULL) { printf("OSUnserializeXML: can't alloc temp memory\n"); goto error; } @@ -2324,7 +2324,8 @@ static const signed char __CFPLDataDecodeTable[128] = { static void * getCFEncodedData(parser_state_t *state, unsigned int *size) { - int numeq = 0, acc = 0, cntr = 0; + int numeq = 0, cntr = 0; + unsigned int acc = 0; int tmpbufpos = 0, tmpbuflen = 0; unsigned char *tmpbuf = (unsigned char *)malloc(DATA_ALLOC_SIZE); @@ -2865,7 +2866,7 @@ buildSymbol(parser_state_t *state, object_t *o) { OSSymbol *symbol; - symbol = const_cast(OSSymbol::withCString(o->string)); + symbol = const_cast < OSSymbol * > (OSSymbol::withCString(o->string)); if (o->idref >= 0) { rememberObject(state, o->idref, symbol); } @@ -2972,7 +2973,8 @@ OSUnserializeXML(const char *buffer, size_t bufferSize, OSString **errorString) return 0; } - if (!strcmp(kOSSerializeBinarySignature, buffer)) { + if (!strcmp(kOSSerializeBinarySignature, buffer) + || (kOSSerializeIndexedBinarySignature == (uint8_t)buffer[0])) { return OSUnserializeBinary(buffer, bufferSize, errorString); } diff --git a/libkern/c++/OSUnserializeXML.y b/libkern/c++/OSUnserializeXML.y index 4f1c3cc97..1769fb631 100644 --- a/libkern/c++/OSUnserializeXML.y +++ b/libkern/c++/OSUnserializeXML.y @@ -1,8 +1,8 @@ /* - * Copyright (c) 1999-2013 Apple Inc. All rights reserved. + * Copyright (c) 1999-2019 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * + * * This file contains Original Code and/or Modifications of Original Code * as defined in and that are subject to the Apple Public Source License * Version 2.0 (the 'License'). You may not use this file except in @@ -11,10 +11,10 @@ * unlawful or unlicensed copies of an Apple operating system, or to * circumvent, violate, or enable the circumvention or violation of, any * terms of an Apple operating system software license agreement. - * + * * Please obtain a copy of the License at * http://www.opensource.apple.com/apsl/ and read it before using this file. - * + * * The Original Code and all software distributed under the License are * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, @@ -22,7 +22,7 @@ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. * Please see the License for the specific language governing rights and * limitations under the License. - * + * * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ @@ -55,7 +55,7 @@ // // - + %pure_parser %{ @@ -64,63 +64,65 @@ #include #include -#define MAX_OBJECTS 65535 +#define MAX_OBJECTS 131071 +#define MAX_REFED_OBJECTS 65535 #define YYSTYPE object_t * -#define YYPARSE_PARAM state -#define YYLEX_PARAM (parser_state_t *)state +#define YYPARSE_PARAM state +#define YYLEX_PARAM (parser_state_t *)state // this is the internal struct used to hold objects on parser stack // it represents objects both before and after they have been created -typedef struct object { - struct object *next; - struct object *free; - struct object *elements; - OSObject *object; - OSSymbol *key; // for dictionary - int size; - void *data; // for data - char *string; // for string & symbol - long long number; // for number - int idref; +typedef struct object { + struct object *next; + struct object *free; + struct object *elements; + OSObject *object; + OSSymbol *key; // for dictionary + int size; + void *data; // for data + char *string; // for string & symbol + long long number; // for number + int idref; } object_t; // this code is reentrant, this structure contains all // state information for the parsing of a single buffer typedef struct parser_state { - const char *parseBuffer; // start of text to be parsed - int parseBufferIndex; // current index into text - int lineNumber; // current line number - object_t *objects; // internal objects in use - object_t *freeObjects; // internal objects that are free - OSDictionary *tags; // used to remember "ID" tags - OSString **errorString; // parse error with line - OSObject *parsedObject; // resultant object of parsed text - int parsedObjectCount; + const char *parseBuffer; // start of text to be parsed + int parseBufferIndex; // current index into text + int lineNumber; // current line number + object_t *objects; // internal objects in use + object_t *freeObjects; // internal objects that are free + OSDictionary *tags; // used to remember "ID" tags + OSString **errorString; // parse error with line + OSObject *parsedObject; // resultant object of parsed text + int parsedObjectCount; + int retrievedObjectCount; } parser_state_t; -#define STATE ((parser_state_t *)state) +#define STATE ((parser_state_t *)state) -#undef yyerror -#define yyerror(s) OSUnserializeerror(STATE, (s)) -static int OSUnserializeerror(parser_state_t *state, const char *s); +#undef yyerror +#define yyerror(s) OSUnserializeerror(STATE, (s)) +static int OSUnserializeerror(parser_state_t *state, const char *s); -static int yylex(YYSTYPE *lvalp, parser_state_t *state); +static int yylex(YYSTYPE *lvalp, parser_state_t *state); -static object_t *newObject(parser_state_t *state); -static void freeObject(parser_state_t *state, object_t *o); -static void rememberObject(parser_state_t *state, int tag, OSObject *o); -static object_t *retrieveObject(parser_state_t *state, int tag); -static void cleanupObjects(parser_state_t *state); +static object_t *newObject(parser_state_t *state); +static void freeObject(parser_state_t *state, object_t *o); +static void rememberObject(parser_state_t *state, int tag, OSObject *o); +static object_t *retrieveObject(parser_state_t *state, int tag); +static void cleanupObjects(parser_state_t *state); -static object_t *buildDictionary(parser_state_t *state, object_t *o); -static object_t *buildArray(parser_state_t *state, object_t *o); -static object_t *buildSet(parser_state_t *state, object_t *o); -static object_t *buildString(parser_state_t *state, object_t *o); -static object_t *buildSymbol(parser_state_t *state, object_t *o); -static object_t *buildData(parser_state_t *state, object_t *o); -static object_t *buildNumber(parser_state_t *state, object_t *o); -static object_t *buildBoolean(parser_state_t *state, object_t *o); +static object_t *buildDictionary(parser_state_t *state, object_t *o); +static object_t *buildArray(parser_state_t *state, object_t *o); +static object_t *buildSet(parser_state_t *state, object_t *o); +static object_t *buildString(parser_state_t *state, object_t *o); +static object_t *buildSymbol(parser_state_t *state, object_t *o); +static object_t *buildData(parser_state_t *state, object_t *o); +static object_t *buildNumber(parser_state_t *state, object_t *o); +static object_t *buildBoolean(parser_state_t *state, object_t *o); #include @@ -138,7 +140,7 @@ static object_t *buildBoolean(parser_state_t *state, object_t *o); %token NUMBER %token SET %token STRING -%token SYNTAX_ERROR +%token SYNTAX_ERROR %% /* Grammar rules and actions follow */ input: /* empty */ { yyerror("unexpected end of buffer"); @@ -240,8 +242,13 @@ object: dict { $$ = buildDictionary(STATE, $1); } | idref { $$ = retrieveObject(STATE, $1->idref); if ($$) { + STATE->retrievedObjectCount++; $$->object->retain(); - } else { + if (STATE->retrievedObjectCount > MAX_REFED_OBJECTS) { + yyerror("maximum object reference count"); + YYERROR; + } + } else { yyerror("forward reference detected"); YYERROR; } @@ -285,7 +292,7 @@ pairs: pair pair: key object { $$ = $1; $$->key = (OSSymbol *)$$->object; $$->object = $2->object; - $$->next = NULL; + $$->next = NULL; $2->object = 0; freeObject(STATE, $2); } @@ -321,8 +328,8 @@ set: '[' ']' { $$ = $1; | SET ; -elements: object { $$ = $1; - $$->next = NULL; +elements: object { $$ = $1; + $$->next = NULL; } | elements object { $$ = $2; $$->next = $1; @@ -351,40 +358,40 @@ string: STRING int OSUnserializeerror(parser_state_t * state, const char *s) /* Called by yyparse on errors */ { - if (state->errorString) { - char tempString[128]; - snprintf(tempString, 128, "OSUnserializeXML: %s near line %d\n", s, state->lineNumber); - *(state->errorString) = OSString::withCString(tempString); - } - - return 0; + if (state->errorString) { + char tempString[128]; + snprintf(tempString, 128, "OSUnserializeXML: %s near line %d\n", s, state->lineNumber); + *(state->errorString) = OSString::withCString(tempString); + } + + return 0; } -#define TAG_MAX_LENGTH 32 -#define TAG_MAX_ATTRIBUTES 32 -#define TAG_BAD 0 -#define TAG_START 1 -#define TAG_END 2 -#define TAG_EMPTY 3 -#define TAG_IGNORE 4 - -#define currentChar() (state->parseBuffer[state->parseBufferIndex]) -#define nextChar() (state->parseBuffer[++state->parseBufferIndex]) -#define prevChar() (state->parseBuffer[state->parseBufferIndex - 1]) - -#define isSpace(c) ((c) == ' ' || (c) == '\t') -#define isAlpha(c) (((c) >= 'A' && (c) <= 'Z') || ((c) >= 'a' && (c) <= 'z')) -#define isDigit(c) ((c) >= '0' && (c) <= '9') -#define isAlphaDigit(c) ((c) >= 'a' && (c) <= 'f') -#define isHexDigit(c) (isDigit(c) || isAlphaDigit(c)) -#define isAlphaNumeric(c) (isAlpha(c) || isDigit(c) || ((c) == '-')) +#define TAG_MAX_LENGTH 32 +#define TAG_MAX_ATTRIBUTES 32 +#define TAG_BAD 0 +#define TAG_START 1 +#define TAG_END 2 +#define TAG_EMPTY 3 +#define TAG_IGNORE 4 + +#define currentChar() (state->parseBuffer[state->parseBufferIndex]) +#define nextChar() (state->parseBuffer[++state->parseBufferIndex]) +#define prevChar() (state->parseBuffer[state->parseBufferIndex - 1]) + +#define isSpace(c) ((c) == ' ' || (c) == '\t') +#define isAlpha(c) (((c) >= 'A' && (c) <= 'Z') || ((c) >= 'a' && (c) <= 'z')) +#define isDigit(c) ((c) >= '0' && (c) <= '9') +#define isAlphaDigit(c) ((c) >= 'a' && (c) <= 'f') +#define isHexDigit(c) (isDigit(c) || isAlphaDigit(c)) +#define isAlphaNumeric(c) (isAlpha(c) || isDigit(c) || ((c) == '-')) static int getTag(parser_state_t *state, - char tag[TAG_MAX_LENGTH], - int *attributeCount, - char attributes[TAG_MAX_ATTRIBUTES][TAG_MAX_LENGTH], - char values[TAG_MAX_ATTRIBUTES][TAG_MAX_LENGTH] ) + char tag[TAG_MAX_LENGTH], + int *attributeCount, + char attributes[TAG_MAX_ATTRIBUTES][TAG_MAX_LENGTH], + char values[TAG_MAX_ATTRIBUTES][TAG_MAX_LENGTH] ) { int length = 0; int c = currentChar(); @@ -392,116 +399,151 @@ getTag(parser_state_t *state, *attributeCount = 0; - if (c != '<') return TAG_BAD; - c = nextChar(); // skip '<' + if (c != '<') { + return TAG_BAD; + } + c = nextChar(); // skip '<' // // - if (c == '!') { - c = nextChar(); - bool isComment = (c == '-') && ((c = nextChar()) != 0) && (c == '-'); - if (!isComment && !isAlpha(c)) return TAG_BAD; // lineNumber++; - if (isComment) { - if (c != '-') continue; - c = nextChar(); - if (c != '-') continue; - c = nextChar(); + if (c == '!') { + c = nextChar(); + bool isComment = (c == '-') && ((c = nextChar()) != 0) && (c == '-'); + if (!isComment && !isAlpha(c)) { + return TAG_BAD; // ') { - (void)nextChar(); - return TAG_IGNORE; + while (c && (c = nextChar()) != 0) { + if (c == '\n') { + state->lineNumber++; + } + if (isComment) { + if (c != '-') { + continue; + } + c = nextChar(); + if (c != '-') { + continue; + } + c = nextChar(); + } + if (c == '>') { + (void)nextChar(); + return TAG_IGNORE; + } + if (isComment) { + break; + } } - if (isComment) break; - } - return TAG_BAD; - } - - else - + return TAG_BAD; + } else // - if (c == '?') { - while ((c = nextChar()) != 0) { - if (c == '\n') state->lineNumber++; - if (c != '?') continue; - c = nextChar(); - if (!c) return TAG_IGNORE; - if (c == '>') { - (void)nextChar(); - return TAG_IGNORE; + if (c == '?') { + while ((c = nextChar()) != 0) { + if (c == '\n') { + state->lineNumber++; + } + if (c != '?') { + continue; + } + c = nextChar(); + if (!c) { + return TAG_IGNORE; + } + if (c == '>') { + (void)nextChar(); + return TAG_IGNORE; + } } - } - return TAG_BAD; - } - - else - - // + return TAG_BAD; + } else + // if (c == '/') { - c = nextChar(); // skip '/' + c = nextChar(); // skip '/' tagType = TAG_END; } - if (!isAlpha(c)) return TAG_BAD; + if (!isAlpha(c)) { + return TAG_BAD; + } /* find end of tag while copying it */ while (isAlphaNumeric(c)) { tag[length++] = c; c = nextChar(); - if (length >= (TAG_MAX_LENGTH - 1)) return TAG_BAD; + if (length >= (TAG_MAX_LENGTH - 1)) { + return TAG_BAD; + } } tag[length] = 0; // printf("tag %s, type %d\n", tag, tagType); - + // look for attributes of the form attribute = "value" ... while ((c != '>') && (c != '/')) { - while (isSpace(c)) c = nextChar(); + while (isSpace(c)) { + c = nextChar(); + } length = 0; while (isAlphaNumeric(c)) { attributes[*attributeCount][length++] = c; - if (length >= (TAG_MAX_LENGTH - 1)) return TAG_BAD; + if (length >= (TAG_MAX_LENGTH - 1)) { + return TAG_BAD; + } c = nextChar(); } attributes[*attributeCount][length] = 0; - while (isSpace(c)) c = nextChar(); - - if (c != '=') return TAG_BAD; + while (isSpace(c)) { + c = nextChar(); + } + + if (c != '=') { + return TAG_BAD; + } c = nextChar(); - - while (isSpace(c)) c = nextChar(); - if (c != '"') return TAG_BAD; + while (isSpace(c)) { + c = nextChar(); + } + + if (c != '"') { + return TAG_BAD; + } c = nextChar(); length = 0; while (c != '"') { values[*attributeCount][length++] = c; - if (length >= (TAG_MAX_LENGTH - 1)) return TAG_BAD; + if (length >= (TAG_MAX_LENGTH - 1)) { + return TAG_BAD; + } c = nextChar(); - if (!c) return TAG_BAD; + if (!c) { + return TAG_BAD; + } } values[*attributeCount][length] = 0; c = nextChar(); // skip closing quote -// printf(" attribute '%s' = '%s', nextchar = '%c'\n", +// printf(" attribute '%s' = '%s', nextchar = '%c'\n", // attributes[*attributeCount], values[*attributeCount], c); (*attributeCount)++; - if (*attributeCount >= TAG_MAX_ATTRIBUTES) return TAG_BAD; + if (*attributeCount >= TAG_MAX_ATTRIBUTES) { + return TAG_BAD; + } } if (c == '/') { - c = nextChar(); // skip '/' + c = nextChar(); // skip '/' tagType = TAG_EMPTY; } - if (c != '>') return TAG_BAD; - c = nextChar(); // skip '>' + if (c != '>') { + return TAG_BAD; + } + c = nextChar(); // skip '>' return tagType; } @@ -517,20 +559,24 @@ getString(parser_state_t *state) /* find end of string */ while (c != 0) { - if (c == '\n') state->lineNumber++; + if (c == '\n') { + state->lineNumber++; + } if (c == '<') { break; } c = nextChar(); } - if (c != '<') return 0; + if (c != '<') { + return 0; + } length = state->parseBufferIndex - start; /* copy to null terminated buffer */ tempString = (char *)malloc(length + 1); - if (tempString == 0) { + if (tempString == NULL) { printf("OSUnserializeXML: can't alloc temp memory\n"); goto error; } @@ -544,30 +590,48 @@ getString(parser_state_t *state) if (c != '&') { tempString[j++] = c; } else { - if ((i+3) > length) goto error; + if ((i + 3) > length) { + goto error; + } c = state->parseBuffer[start + i++]; if (c == 'l') { - if (state->parseBuffer[start + i++] != 't') goto error; - if (state->parseBuffer[start + i++] != ';') goto error; + if (state->parseBuffer[start + i++] != 't') { + goto error; + } + if (state->parseBuffer[start + i++] != ';') { + goto error; + } tempString[j++] = '<'; continue; - } + } if (c == 'g') { - if (state->parseBuffer[start + i++] != 't') goto error; - if (state->parseBuffer[start + i++] != ';') goto error; + if (state->parseBuffer[start + i++] != 't') { + goto error; + } + if (state->parseBuffer[start + i++] != ';') { + goto error; + } tempString[j++] = '>'; continue; - } - if ((i+3) > length) goto error; + } + if ((i + 3) > length) { + goto error; + } if (c == 'a') { - if (state->parseBuffer[start + i++] != 'm') goto error; - if (state->parseBuffer[start + i++] != 'p') goto error; - if (state->parseBuffer[start + i++] != ';') goto error; + if (state->parseBuffer[start + i++] != 'm') { + goto error; + } + if (state->parseBuffer[start + i++] != 'p') { + goto error; + } + if (state->parseBuffer[start + i++] != ';') { + goto error; + } tempString[j++] = '&'; continue; } goto error; - } + } } tempString[j] = 0; @@ -576,7 +640,9 @@ getString(parser_state_t *state) return tempString; error: - if (tempString) free(tempString); + if (tempString) { + free(tempString); + } return 0; } @@ -600,7 +666,7 @@ getNumber(parser_state_t *state) negate = true; c = nextChar(); } - while(isDigit(c)) { + while (isDigit(c)) { n = (n * base + c - '0'); c = nextChar(); } @@ -608,7 +674,7 @@ getNumber(parser_state_t *state) n = (unsigned long long)((long long)n * (long long)-1); } } else { - while(isHexDigit(c)) { + while (isHexDigit(c)) { if (isDigit(c)) { n = (n * base + c - '0'); } else { @@ -624,22 +690,22 @@ getNumber(parser_state_t *state) // taken from CFXMLParsing/CFPropertyList.c static const signed char __CFPLDataDecodeTable[128] = { - /* 000 */ -1, -1, -1, -1, -1, -1, -1, -1, - /* 010 */ -1, -1, -1, -1, -1, -1, -1, -1, - /* 020 */ -1, -1, -1, -1, -1, -1, -1, -1, - /* 030 */ -1, -1, -1, -1, -1, -1, -1, -1, - /* ' ' */ -1, -1, -1, -1, -1, -1, -1, -1, - /* '(' */ -1, -1, -1, 62, -1, -1, -1, 63, - /* '0' */ 52, 53, 54, 55, 56, 57, 58, 59, - /* '8' */ 60, 61, -1, -1, -1, 0, -1, -1, - /* '@' */ -1, 0, 1, 2, 3, 4, 5, 6, - /* 'H' */ 7, 8, 9, 10, 11, 12, 13, 14, - /* 'P' */ 15, 16, 17, 18, 19, 20, 21, 22, - /* 'X' */ 23, 24, 25, -1, -1, -1, -1, -1, - /* '`' */ -1, 26, 27, 28, 29, 30, 31, 32, - /* 'h' */ 33, 34, 35, 36, 37, 38, 39, 40, - /* 'p' */ 41, 42, 43, 44, 45, 46, 47, 48, - /* 'x' */ 49, 50, 51, -1, -1, -1, -1, -1 + /* 000 */ -1, -1, -1, -1, -1, -1, -1, -1, + /* 010 */ -1, -1, -1, -1, -1, -1, -1, -1, + /* 020 */ -1, -1, -1, -1, -1, -1, -1, -1, + /* 030 */ -1, -1, -1, -1, -1, -1, -1, -1, + /* ' ' */ -1, -1, -1, -1, -1, -1, -1, -1, + /* '(' */ -1, -1, -1, 62, -1, -1, -1, 63, + /* '0' */ 52, 53, 54, 55, 56, 57, 58, 59, + /* '8' */ 60, 61, -1, -1, -1, 0, -1, -1, + /* '@' */ -1, 0, 1, 2, 3, 4, 5, 6, + /* 'H' */ 7, 8, 9, 10, 11, 12, 13, 14, + /* 'P' */ 15, 16, 17, 18, 19, 20, 21, 22, + /* 'X' */ 23, 24, 25, -1, -1, -1, -1, -1, + /* '`' */ -1, 26, 27, 28, 29, 30, 31, 32, + /* 'h' */ 33, 34, 35, 36, 37, 38, 39, 40, + /* 'p' */ 41, 42, 43, 44, 45, 46, 47, 48, + /* 'x' */ 49, 50, 51, -1, -1, -1, -1, -1 }; #define DATA_ALLOC_SIZE 4096 @@ -647,103 +713,115 @@ static const signed char __CFPLDataDecodeTable[128] = { static void * getCFEncodedData(parser_state_t *state, unsigned int *size) { - int numeq = 0, acc = 0, cntr = 0; - int tmpbufpos = 0, tmpbuflen = 0; - unsigned char *tmpbuf = (unsigned char *)malloc(DATA_ALLOC_SIZE); - - int c = currentChar(); - *size = 0; - - while (c != '<') { - c &= 0x7f; - if (c == 0) { + int numeq = 0, cntr = 0; + unsigned int acc = 0; + int tmpbufpos = 0, tmpbuflen = 0; + unsigned char *tmpbuf = (unsigned char *)malloc(DATA_ALLOC_SIZE); + + int c = currentChar(); + *size = 0; + + while (c != '<') { + c &= 0x7f; + if (c == 0) { + free(tmpbuf); + return 0; + } + if (c == '=') { + numeq++; + } else { + numeq = 0; + } + if (c == '\n') { + state->lineNumber++; + } + if (__CFPLDataDecodeTable[c] < 0) { + c = nextChar(); + continue; + } + cntr++; + acc <<= 6; + acc += __CFPLDataDecodeTable[c]; + if (0 == (cntr & 0x3)) { + if (tmpbuflen <= tmpbufpos + 2) { + tmpbuflen += DATA_ALLOC_SIZE; + tmpbuf = (unsigned char *)realloc(tmpbuf, tmpbuflen); + } + tmpbuf[tmpbufpos++] = (acc >> 16) & 0xff; + if (numeq < 2) { + tmpbuf[tmpbufpos++] = (acc >> 8) & 0xff; + } + if (numeq < 1) { + tmpbuf[tmpbufpos++] = acc & 0xff; + } + } + c = nextChar(); + } + *size = tmpbufpos; + if (*size == 0) { free(tmpbuf); return 0; } - if (c == '=') numeq++; else numeq = 0; - if (c == '\n') state->lineNumber++; - if (__CFPLDataDecodeTable[c] < 0) { - c = nextChar(); - continue; - } - cntr++; - acc <<= 6; - acc += __CFPLDataDecodeTable[c]; - if (0 == (cntr & 0x3)) { - if (tmpbuflen <= tmpbufpos + 2) { - tmpbuflen += DATA_ALLOC_SIZE; - tmpbuf = (unsigned char *)realloc(tmpbuf, tmpbuflen); - } - tmpbuf[tmpbufpos++] = (acc >> 16) & 0xff; - if (numeq < 2) - tmpbuf[tmpbufpos++] = (acc >> 8) & 0xff; - if (numeq < 1) - tmpbuf[tmpbufpos++] = acc & 0xff; - } - c = nextChar(); - } - *size = tmpbufpos; - if (*size == 0) { - free(tmpbuf); - return 0; - } - return tmpbuf; + return tmpbuf; } static void * getHexData(parser_state_t *state, unsigned int *size) { - int c; - unsigned char *d, *start, *lastStart; + int c; + unsigned char *d, *start, *lastStart; - start = lastStart = d = (unsigned char *)malloc(DATA_ALLOC_SIZE); - c = currentChar(); + start = lastStart = d = (unsigned char *)malloc(DATA_ALLOC_SIZE); + c = currentChar(); - while (c != '<') { + while (c != '<') { + if (isSpace(c)) { + while ((c = nextChar()) != 0 && isSpace(c)) { + } + } + ; + if (c == '\n') { + state->lineNumber++; + c = nextChar(); + continue; + } - if (isSpace(c)) while ((c = nextChar()) != 0 && isSpace(c)) {}; - if (c == '\n') { - state->lineNumber++; - c = nextChar(); - continue; - } + // get high nibble + if (isDigit(c)) { + *d = (c - '0') << 4; + } else if (isAlphaDigit(c)) { + *d = (0xa + (c - 'a')) << 4; + } else { + goto error; + } - // get high nibble - if (isDigit(c)) { - *d = (c - '0') << 4; - } else if (isAlphaDigit(c)) { - *d = (0xa + (c - 'a')) << 4; - } else { - goto error; - } + // get low nibble + c = nextChar(); + if (isDigit(c)) { + *d |= c - '0'; + } else if (isAlphaDigit(c)) { + *d |= 0xa + (c - 'a'); + } else { + goto error; + } - // get low nibble - c = nextChar(); - if (isDigit(c)) { - *d |= c - '0'; - } else if (isAlphaDigit(c)) { - *d |= 0xa + (c - 'a'); - } else { - goto error; - } - - d++; - if ((d - lastStart) >= DATA_ALLOC_SIZE) { - int oldsize = d - start; - start = (unsigned char *)realloc(start, oldsize + DATA_ALLOC_SIZE); - d = lastStart = start + oldsize; + d++; + if ((d - lastStart) >= DATA_ALLOC_SIZE) { + int oldsize = d - start; + start = (unsigned char *)realloc(start, oldsize + DATA_ALLOC_SIZE); + d = lastStart = start + oldsize; + } + c = nextChar(); } - c = nextChar(); - } - *size = d - start; - return start; + *size = d - start; + return start; - error: +error: - *size = 0; - free(start); - return 0; + *size = 0; + free(start); + return 0; } static int @@ -757,11 +835,15 @@ yylex(YYSTYPE *lvalp, parser_state_t *state) char values[TAG_MAX_ATTRIBUTES][TAG_MAX_LENGTH]; object_t *object; - top: +top: c = currentChar(); /* skip white space */ - if (isSpace(c)) while ((c = nextChar()) != 0 && isSpace(c)) {}; + if (isSpace(c)) { + while ((c = nextChar()) != 0 && isSpace(c)) { + } + } + ; /* keep track of line number, don't return \n's */ if (c == '\n') { @@ -771,33 +853,41 @@ yylex(YYSTYPE *lvalp, parser_state_t *state) } // end of the buffer? - if (!c) return 0; + if (!c) { + return 0; + } tagType = getTag(STATE, tag, &attributeCount, attributes, values); - if (tagType == TAG_BAD) return SYNTAX_ERROR; - if (tagType == TAG_IGNORE) goto top; + if (tagType == TAG_BAD) { + return SYNTAX_ERROR; + } + if (tagType == TAG_IGNORE) { + goto top; + } // handle allocation and check for "ID" and "IDREF" tags up front *lvalp = object = newObject(STATE); object->idref = -1; - for (i=0; i < attributeCount; i++) { - if (attributes[i][0] == 'I' && attributes[i][1] == 'D') { - // check for idref's, note: we ignore the tag, for - // this to work correctly, all idrefs must be unique - // across the whole serialization - if (attributes[i][2] == 'R' && attributes[i][3] == 'E' && - attributes[i][4] == 'F' && !attributes[i][5]) { - if (tagType != TAG_EMPTY) return SYNTAX_ERROR; - object->idref = strtol(values[i], NULL, 0); - return IDREF; - } - // check for id's - if (!attributes[i][2]) { - object->idref = strtol(values[i], NULL, 0); - } else { - return SYNTAX_ERROR; + for (i = 0; i < attributeCount; i++) { + if (attributes[i][0] == 'I' && attributes[i][1] == 'D') { + // check for idref's, note: we ignore the tag, for + // this to work correctly, all idrefs must be unique + // across the whole serialization + if (attributes[i][2] == 'R' && attributes[i][3] == 'E' && + attributes[i][4] == 'F' && !attributes[i][5]) { + if (tagType != TAG_EMPTY) { + return SYNTAX_ERROR; + } + object->idref = strtol(values[i], NULL, 0); + return IDREF; + } + // check for id's + if (!attributes[i][2]) { + object->idref = strtol(values[i], NULL, 0); + } else { + return SYNTAX_ERROR; + } } - } } switch (*tag) { @@ -827,7 +917,7 @@ yylex(YYSTYPE *lvalp, parser_state_t *state) } bool isHexFormat = false; - for (i=0; i < attributeCount; i++) { + for (i = 0; i < attributeCount; i++) { if (!strcmp(attributes[i], "format") && !strcmp(values[i], "hex")) { isHexFormat = true; break; @@ -835,9 +925,9 @@ yylex(YYSTYPE *lvalp, parser_state_t *state) } // CF encoded is the default form if (isHexFormat) { - object->data = getHexData(STATE, &size); + object->data = getHexData(STATE, &size); } else { - object->data = getCFEncodedData(STATE, &size); + object->data = getCFEncodedData(STATE, &size); } object->size = size; if ((getTag(STATE, tag, &attributeCount, attributes, values) != TAG_END) || strcmp(tag, "data")) { @@ -856,8 +946,8 @@ yylex(YYSTYPE *lvalp, parser_state_t *state) break; case 'i': if (!strcmp(tag, "integer")) { - object->size = 64; // default - for (i=0; i < attributeCount; i++) { + object->size = 64; // default + for (i = 0; i < attributeCount; i++) { if (!strcmp(attributes[i], "size")) { object->size = strtoul(values[i], NULL, 0); } @@ -875,13 +965,15 @@ yylex(YYSTYPE *lvalp, parser_state_t *state) break; case 'k': if (!strcmp(tag, "key")) { - if (tagType == TAG_EMPTY) return SYNTAX_ERROR; + if (tagType == TAG_EMPTY) { + return SYNTAX_ERROR; + } object->string = getString(STATE); if (!object->string) { return SYNTAX_ERROR; } if ((getTag(STATE, tag, &attributeCount, attributes, values) != TAG_END) - || strcmp(tag, "key")) { + || strcmp(tag, "key")) { return SYNTAX_ERROR; } return KEY; @@ -896,8 +988,8 @@ yylex(YYSTYPE *lvalp, parser_state_t *state) case 's': if (!strcmp(tag, "string")) { if (tagType == TAG_EMPTY) { - object->string = (char *)malloc(1); - object->string[0] = 0; + object->string = (char *)malloc(1); + object->string[0] = 0; return STRING; } object->string = getString(STATE); @@ -905,7 +997,7 @@ yylex(YYSTYPE *lvalp, parser_state_t *state) return SYNTAX_ERROR; } if ((getTag(STATE, tag, &attributeCount, attributes, values) != TAG_END) - || strcmp(tag, "string")) { + || strcmp(tag, "string")) { return SYNTAX_ERROR; } return STRING; @@ -960,7 +1052,7 @@ newObject(parser_state_t *state) o->free = state->objects; state->objects = o; } - + return o; } @@ -968,7 +1060,7 @@ void freeObject(parser_state_t * state, object_t *o) { o->next = state->freeObjects; - state->freeObjects = o; + state->freeObjects = o; } void @@ -1006,7 +1098,7 @@ cleanupObjects(parser_state_t *state) // !@$&)(^Q$&*^!$(*!@$_(^%_(*Q#$(_*&!$_(*&!$_(*&!#$(*!@&^!@#%!_!# // !@$&)(^Q$&*^!$(*!@$_(^%_(*Q#$(_*&!$_(*&!$_(*&!#$(*!@&^!@#%!_!# -static void +static void rememberObject(parser_state_t *state, int tag, OSObject *o) { char key[16]; @@ -1028,7 +1120,9 @@ retrieveObject(parser_state_t *state, int tag) // printf("retrieve key '%s'\n", key); ref = state->tags->getObject(key); - if (!ref) return 0; + if (!ref) { + return 0; + } o = newObject(state); o->object = ref; @@ -1059,7 +1153,9 @@ buildDictionary(parser_state_t *state, object_t * header) } dict = OSDictionary::withCapacity(count); - if (header->idref >= 0) rememberObject(state, header->idref, dict); + if (header->idref >= 0) { + rememberObject(state, header->idref, dict); + } o = header->elements; while (o) { @@ -1099,7 +1195,9 @@ buildArray(parser_state_t *state, object_t * header) } array = OSArray::withCapacity(count); - if (header->idref >= 0) rememberObject(state, header->idref, array); + if (header->idref >= 0) { + rememberObject(state, header->idref, array); + } o = header->elements; while (o) { @@ -1126,7 +1224,9 @@ buildSet(parser_state_t *state, object_t *header) OSSet *set = OSSet::withArray(array, array->getCapacity()); // write over the reference created in buildArray - if (header->idref >= 0) rememberObject(state, header->idref, set); + if (header->idref >= 0) { + rememberObject(state, header->idref, set); + } array->release(); o->object = set; @@ -1139,7 +1239,9 @@ buildString(parser_state_t *state, object_t *o) OSString *string; string = OSString::withCString(o->string); - if (o->idref >= 0) rememberObject(state, o->idref, string); + if (o->idref >= 0) { + rememberObject(state, o->idref, string); + } free(o->string); o->string = 0; @@ -1153,8 +1255,10 @@ buildSymbol(parser_state_t *state, object_t *o) { OSSymbol *symbol; - symbol = const_cast(OSSymbol::withCString(o->string)); - if (o->idref >= 0) rememberObject(state, o->idref, symbol); + symbol = const_cast < OSSymbol * > (OSSymbol::withCString(o->string)); + if (o->idref >= 0) { + rememberObject(state, o->idref, symbol); + } free(o->string); o->string = 0; @@ -1173,9 +1277,13 @@ buildData(parser_state_t *state, object_t *o) } else { data = OSData::withCapacity(0); } - if (o->idref >= 0) rememberObject(state, o->idref, data); + if (o->idref >= 0) { + rememberObject(state, o->idref, data); + } - if (o->size) free(o->data); + if (o->size) { + free(o->data); + } o->data = 0; o->object = data; return o; @@ -1186,7 +1294,9 @@ buildNumber(parser_state_t *state, object_t *o) { OSNumber *number = OSNumber::withNumber(o->number, o->size); - if (o->idref >= 0) rememberObject(state, o->idref, number); + if (o->idref >= 0) { + rememberObject(state, o->idref, number); + } o->object = number; return o; @@ -1205,12 +1315,18 @@ OSUnserializeXML(const char *buffer, OSString **errorString) { OSObject *object; - if (!buffer) return 0; + if (!buffer) { + return 0; + } parser_state_t *state = (parser_state_t *)malloc(sizeof(parser_state_t)); - if (!state) return 0; + if (!state) { + return 0; + } // just in case - if (errorString) *errorString = NULL; + if (errorString) { + *errorString = NULL; + } state->parseBuffer = buffer; state->parseBufferIndex = 0; @@ -1221,6 +1337,7 @@ OSUnserializeXML(const char *buffer, OSString **errorString) state->errorString = errorString; state->parsedObject = 0; state->parsedObjectCount = 0; + state->retrievedObjectCount = 0; (void)yyparse((void *)state); @@ -1238,13 +1355,22 @@ OSUnserializeXML(const char *buffer, OSString **errorString) OSObject* OSUnserializeXML(const char *buffer, size_t bufferSize, OSString **errorString) { - if (!buffer) return (0); - if (bufferSize < sizeof(kOSSerializeBinarySignature)) return (0); + if (!buffer) { + return 0; + } + if (bufferSize < sizeof(kOSSerializeBinarySignature)) { + return 0; + } - if (!strcmp(kOSSerializeBinarySignature, buffer)) return OSUnserializeBinary(buffer, bufferSize, errorString); + if (!strcmp(kOSSerializeBinarySignature, buffer) + || (kOSSerializeIndexedBinarySignature == (uint8_t)buffer[0])) { + return OSUnserializeBinary(buffer, bufferSize, errorString); + } // XML must be null terminated - if (buffer[bufferSize - 1]) return 0; + if (buffer[bufferSize - 1]) { + return 0; + } return OSUnserializeXML(buffer, errorString); } diff --git a/libkern/conf/Makefile b/libkern/conf/Makefile index 7bd79d9ae..05c4b79cf 100644 --- a/libkern/conf/Makefile +++ b/libkern/conf/Makefile @@ -23,7 +23,7 @@ endif $(TARGET)/$(CURRENT_KERNEL_CONFIG)/Makefile: $(SRCROOT)/SETUP/config/doconf $(OBJROOT)/SETUP/config $(DOCONFDEPS) $(_v)$(MKDIR) $(TARGET)/$(CURRENT_KERNEL_CONFIG) - $(_v)$(SRCROOT)/SETUP/config/doconf -c -cpu $(DOCONF_ARCH_CONFIG_LC) -soc $(CURRENT_MACHINE_CONFIG_LC) -d $(TARGET)/$(CURRENT_KERNEL_CONFIG) -s $(SOURCE) -m $(MASTERCONFDIR) $(CURRENT_KERNEL_CONFIG); + $(_v)$(SRCROOT)/SETUP/config/doconf -c -cpu $(DOCONF_ARCH_CONFIG_LC) -soc $(CURRENT_MACHINE_CONFIG_LC) -d $(TARGET)/$(CURRENT_KERNEL_CONFIG) -s $(SOURCE) -m $(MASTERCONFDIR) $(CURRENT_KERNEL_CONFIG) do_all: $(TARGET)/$(CURRENT_KERNEL_CONFIG)/Makefile $(_v)${MAKE} \ @@ -35,7 +35,7 @@ do_all: $(TARGET)/$(CURRENT_KERNEL_CONFIG)/Makefile SOURCE=$(subst conf/,,$(SOURCE)) \ TARGET=${TARGET} \ OBJPATH=${OBJPATH} \ - build_all; + build_all do_build_all:: do_all diff --git a/libkern/conf/Makefile.template b/libkern/conf/Makefile.template index b38b73fe1..fa45a7f1f 100644 --- a/libkern/conf/Makefile.template +++ b/libkern/conf/Makefile.template @@ -42,8 +42,8 @@ runtime.cpo_CXXWARNFLAGS_ADD = -Wno-cast-qual # warnings in bison-generated code -OSUnserializeXML.cpo_CXXWARNFLAGS_ADD += -Wno-uninitialized -Wno-unreachable-code -Wno-unreachable-code-break -OSUnserialize.cpo_CXXWARNFLAGS_ADD += -Wno-unreachable-code +OSUnserializeXML.cpo_CXXWARNFLAGS_ADD += -Wno-uninitialized -Wno-unreachable-code -Wno-unreachable-code-break -Wno-zero-as-null-pointer-constant +OSUnserialize.cpo_CXXWARNFLAGS_ADD += -Wno-unreachable-code -Wno-zero-as-null-pointer-constant # Runtime support functions don't interact well with LTO (9294679) stack_protector.o_CFLAGS_ADD += $(CFLAGS_NOLTO_FLAG) @@ -97,9 +97,9 @@ $(COMPONENT).filelist: $(OBJS) $(SEG_HACK) -n __HIB -o $${hib_file}__ $${hib_file} || exit 1; \ mv $${hib_file}__ $${hib_file} || exit 1; \ done - @echo "$(ColorL)LDFILELIST$(Color0) $(ColorLF)$(COMPONENT)$(Color0)" + $(call makelog,$(ColorL)LDFILELIST$(Color0) $(ColorLF)$(COMPONENT)$(Color0)) $(_v)for obj in ${OBJS}; do \ - echo $(TARGET)/$(CURRENT_KERNEL_CONFIG)/$${obj}; \ + $(ECHO) $(TARGET)/$(CURRENT_KERNEL_CONFIG)/$${obj}; \ done > $(COMPONENT).filelist do_all: $(COMPONENT).filelist diff --git a/libkern/conf/files b/libkern/conf/files index 5181c0143..95674f7f4 100644 --- a/libkern/conf/files +++ b/libkern/conf/files @@ -2,13 +2,13 @@ OPTIONS/libkerncpp optional libkerncpp OPTIONS/kdebug optional kdebug -OPTIONS/gprof optional gprof OPTIONS/config_dtrace optional config_dtrace OPTIONS/hibernation optional hibernation OPTIONS/iotracking optional iotracking OPTIONS/networking optional networking OPTIONS/crypto optional crypto OPTIONS/zlib optional zlib +OPTIONS/zlibc optional zlibc # libkern @@ -37,6 +37,7 @@ libkern/c++/OSSymbol.cpp optional libkerncpp libkern/c++/OSUnserialize.cpp optional libkerncpp libkern/c++/OSUnserializeXML.cpp optional libkerncpp libkern/c++/OSSerializeBinary.cpp optional libkerncpp +libkern/c++/OSCompat.cpp optional libkerncpp libkern/OSKextLib.cpp optional libkerncpp libkern/mkext.c standard @@ -70,6 +71,18 @@ libkern/zlib/trees.c optional zlib libkern/zlib/uncompr.c optional zlib libkern/zlib/zutil.c optional zlib +libkern/zlib/adler32.c optional zlibc +libkern/zlib/compress.c optional zlibc +libkern/zlib/z_crc32.c optional zlibc +libkern/zlib/deflate.c optional zlibc +libkern/zlib/infback.c optional zlibc +libkern/zlib/inffast.c optional zlibc +libkern/zlib/inflate.c optional zlibc +libkern/zlib/inftrees.c optional zlibc +libkern/zlib/trees.c optional zlibc +libkern/zlib/uncompr.c optional zlibc +libkern/zlib/zutil.c optional zlibc + libkern/crypto/register_crypto.c optional crypto libkern/crypto/corecrypto_sha2.c standard libkern/crypto/corecrypto_sha1.c optional crypto diff --git a/libkern/crypto/corecrypto_aes.c b/libkern/crypto/corecrypto_aes.c index 9aa590e14..0105da4e7 100644 --- a/libkern/crypto/corecrypto_aes.c +++ b/libkern/crypto/corecrypto_aes.c @@ -42,9 +42,7 @@ aes_encrypt_key(const unsigned char *key, int key_len, aes_encrypt_ctx cx[1]) panic("%s: inconsistent size for AES encrypt context", __FUNCTION__); } - cccbc_init(cbc, cx[0].ctx, key_len, key); - - return aes_good; + return cccbc_init(cbc, cx[0].ctx, key_len, key); } aes_rval @@ -54,10 +52,12 @@ aes_encrypt_cbc(const unsigned char *in_blk, const unsigned char *in_iv, unsigne const struct ccmode_cbc *cbc = g_crypto_funcs->ccaes_cbc_encrypt; cccbc_iv_decl(cbc->block_size, ctx_iv); - cccbc_set_iv(cbc, ctx_iv, in_iv); - cccbc_update(cbc, cx[0].ctx, ctx_iv, num_blk, in_blk, out_blk); //Actually cbc encrypt. + int rc = cccbc_set_iv(cbc, ctx_iv, in_iv); + if (rc) { + return rc; + } - return aes_good; + return cccbc_update(cbc, cx[0].ctx, ctx_iv, num_blk, in_blk, out_blk); //Actually cbc encrypt. } #if defined (__i386__) || defined (__x86_64__) || defined (__arm64__) @@ -79,9 +79,7 @@ aes_decrypt_key(const unsigned char *key, int key_len, aes_decrypt_ctx cx[1]) panic("%s: inconsistent size for AES decrypt context", __FUNCTION__); } - cccbc_init(cbc, cx[0].ctx, key_len, key); - - return aes_good; + return cccbc_init(cbc, cx[0].ctx, key_len, key); } aes_rval @@ -91,10 +89,12 @@ aes_decrypt_cbc(const unsigned char *in_blk, const unsigned char *in_iv, unsigne const struct ccmode_cbc *cbc = g_crypto_funcs->ccaes_cbc_decrypt; cccbc_iv_decl(cbc->block_size, ctx_iv); - cccbc_set_iv(cbc, ctx_iv, in_iv); - cccbc_update(cbc, cx[0].ctx, ctx_iv, num_blk, in_blk, out_blk); //Actually cbc decrypt. + int rc = cccbc_set_iv(cbc, ctx_iv, in_iv); + if (rc) { + return rc; + } - return aes_good; + return cccbc_update(cbc, cx[0].ctx, ctx_iv, num_blk, in_blk, out_blk); //Actually cbc decrypt. } #if defined (__i386__) || defined (__x86_64__) || defined (__arm64__) @@ -194,7 +194,7 @@ aes_encrypt_aad_gcm(const unsigned char *aad, unsigned int aad_bytes, ccgcm_ctx return aes_error; } - return ccgcm_gmac(gcm, ctx, aad_bytes, aad); + return ccgcm_aad(gcm, ctx, aad_bytes, aad); } aes_rval @@ -212,15 +212,17 @@ aes_encrypt_gcm(const unsigned char *in_blk, unsigned int num_bytes, aes_rval aes_encrypt_finalize_gcm(unsigned char *tag, unsigned int tag_bytes, ccgcm_ctx *ctx) { - int rc; const struct ccmode_gcm *gcm = g_crypto_funcs->ccaes_gcm_encrypt; if (!gcm) { return aes_error; } - rc = ccgcm_finalize(gcm, ctx, tag_bytes, tag); - rc |= ccgcm_reset(gcm, ctx); - return rc; + int rc = ccgcm_finalize(gcm, ctx, tag_bytes, tag); + if (rc) { + return rc; + } + + return ccgcm_reset(gcm, ctx); } aes_rval @@ -248,16 +250,17 @@ aes_decrypt_key_with_iv_gcm(const unsigned char *key, int key_len, const unsigne aes_rval aes_decrypt_set_iv_gcm(const unsigned char *in_iv, unsigned int len, ccgcm_ctx *ctx) { - int rc; - const struct ccmode_gcm *gcm = g_crypto_funcs->ccaes_gcm_decrypt; if (!gcm) { return aes_error; } - rc = ccgcm_reset(gcm, ctx); - rc |= ccgcm_set_iv(gcm, ctx, len, in_iv); - return rc; + int rc = ccgcm_reset(gcm, ctx); + if (rc) { + return rc; + } + + return ccgcm_set_iv(gcm, ctx, len, in_iv); } aes_rval @@ -290,7 +293,7 @@ aes_decrypt_aad_gcm(const unsigned char *aad, unsigned int aad_bytes, ccgcm_ctx return aes_error; } - return ccgcm_gmac(gcm, ctx, aad_bytes, aad); + return ccgcm_aad(gcm, ctx, aad_bytes, aad); } aes_rval @@ -308,15 +311,17 @@ aes_decrypt_gcm(const unsigned char *in_blk, unsigned int num_bytes, aes_rval aes_decrypt_finalize_gcm(unsigned char *tag, unsigned int tag_bytes, ccgcm_ctx *ctx) { - int rc; const struct ccmode_gcm *gcm = g_crypto_funcs->ccaes_gcm_decrypt; if (!gcm) { return aes_error; } - rc = ccgcm_finalize(gcm, ctx, tag_bytes, tag); - rc |= ccgcm_reset(gcm, ctx); - return rc; + int rc = ccgcm_finalize(gcm, ctx, tag_bytes, tag); + if (rc) { + return rc; + } + + return ccgcm_reset(gcm, ctx); } unsigned diff --git a/libkern/crypto/corecrypto_aesxts.c b/libkern/crypto/corecrypto_aesxts.c index 80cd614fd..61c2d0d63 100644 --- a/libkern/crypto/corecrypto_aesxts.c +++ b/libkern/crypto/corecrypto_aesxts.c @@ -64,10 +64,10 @@ xts_start(uint32_t cipher __unused, // ignored - we're doing this for xts-aes on panic("%s: inconsistent size for AES-XTS context", __FUNCTION__); } - enc->init(enc, xts->enc, keylen, key1, key2); - dec->init(dec, xts->dec, keylen, key1, key2); + int rc = enc->init(enc, xts->enc, keylen, key1, key2); + rc |= dec->init(dec, xts->dec, keylen, key1, key2); - return 0; //never fails + return rc; } int @@ -83,10 +83,13 @@ xts_encrypt(const uint8_t *pt, unsigned long ptlen, panic("xts encrypt not a multiple of block size\n"); } - xtsenc->set_tweak(xts->enc, tweak, iv); - xtsenc->xts(xts->enc, tweak, ptlen / 16, pt, ct); + int rc = xtsenc->set_tweak(xts->enc, tweak, iv); + if (rc) { + return rc; + } - return 0; //never fails + xtsenc->xts(xts->enc, tweak, ptlen / 16, pt, ct); + return 0; } int @@ -102,10 +105,13 @@ xts_decrypt(const uint8_t *ct, unsigned long ptlen, panic("xts decrypt not a multiple of block size\n"); } - xtsdec->set_tweak(xts->dec, tweak, iv); - xtsdec->xts(xts->dec, tweak, ptlen / 16, ct, pt); + int rc = xtsdec->set_tweak(xts->dec, tweak, iv); + if (rc) { + return rc; + } - return 0; //never fails + xtsdec->xts(xts->dec, tweak, ptlen / 16, ct, pt); + return 0; } void diff --git a/libkern/crypto/corecrypto_des.c b/libkern/crypto/corecrypto_des.c index 80406bb02..b77967c49 100644 --- a/libkern/crypto/corecrypto_des.c +++ b/libkern/crypto/corecrypto_des.c @@ -45,22 +45,22 @@ des_ecb_key_sched(des_cblock *key, des_ecb_key_schedule *ks) panic("%s: inconsistent size for DES-ECB context", __FUNCTION__); } - enc->init(enc, ks->enc, CCDES_KEY_SIZE, key); - dec->init(dec, ks->dec, CCDES_KEY_SIZE, key); + int rc = enc->init(enc, ks->enc, CCDES_KEY_SIZE, key); + if (rc) { + return rc; + } - /* The old DES interface could return -1 or -2 for weak keys and wrong parity, - * but this was disabled all the time, so we never fail here */ - return 0; + return dec->init(dec, ks->dec, CCDES_KEY_SIZE, key); } /* Simple des - 1 block */ -void +int des_ecb_encrypt(des_cblock *in, des_cblock *out, des_ecb_key_schedule *ks, int enc) { const struct ccmode_ecb *ecb = enc ? g_crypto_funcs->ccdes_ecb_encrypt : g_crypto_funcs->ccdes_ecb_decrypt; ccecb_ctx *ctx = enc ? ks->enc : ks->dec; - ecb->ecb(ctx, 1, in, out); + return ecb->ecb(ctx, 1, in, out); } @@ -68,7 +68,6 @@ des_ecb_encrypt(des_cblock *in, des_cblock *out, des_ecb_key_schedule *ks, int e int des3_ecb_key_sched(des_cblock *key, des3_ecb_key_schedule *ks) { - int rc; const struct ccmode_ecb *enc = g_crypto_funcs->cctdes_ecb_encrypt; const struct ccmode_ecb *dec = g_crypto_funcs->cctdes_ecb_decrypt; @@ -77,20 +76,22 @@ des3_ecb_key_sched(des_cblock *key, des3_ecb_key_schedule *ks) panic("%s: inconsistent size for 3DES-ECB context", __FUNCTION__); } - rc = enc->init(enc, ks->enc, CCDES_KEY_SIZE * 3, key); - rc |= dec->init(dec, ks->dec, CCDES_KEY_SIZE * 3, key); + int rc = enc->init(enc, ks->enc, CCDES_KEY_SIZE * 3, key); + if (rc) { + return rc; + } - return rc; + return dec->init(dec, ks->dec, CCDES_KEY_SIZE * 3, key); } /* Simple des - 1 block */ -void +int des3_ecb_encrypt(des_cblock *in, des_cblock *out, des3_ecb_key_schedule *ks, int enc) { const struct ccmode_ecb *ecb = enc ? g_crypto_funcs->cctdes_ecb_encrypt : g_crypto_funcs->cctdes_ecb_decrypt; ccecb_ctx *ctx = enc ? ks->enc : ks->dec; - ecb->ecb(ctx, 1, in, out); + return ecb->ecb(ctx, 1, in, out); } /* Raw key helper functions */ diff --git a/libkern/firehose/Makefile b/libkern/firehose/Makefile index d6f2503f9..41be8924a 100644 --- a/libkern/firehose/Makefile +++ b/libkern/firehose/Makefile @@ -6,7 +6,9 @@ export MakeInc_dir=${SRCROOT}/makedefs/MakeInc.dir include $(MakeInc_cmd) include $(MakeInc_def) -LCLDIR = /usr/local/include +INSTALLHDRS_SKIP_HOST = NO + +LCLDIR = $(SDKHEADERSROOT)/usr/local/include KERNELFILES = diff --git a/libkern/firehose/chunk_private.h b/libkern/firehose/chunk_private.h index 80d5969fa..dece91a37 100644 --- a/libkern/firehose/chunk_private.h +++ b/libkern/firehose/chunk_private.h @@ -146,7 +146,7 @@ firehose_chunk_tracepoint_try_reserve(firehose_chunk_t fc, uint64_t stamp, return FIREHOSE_CHUNK_TRY_RESERVE_FAIL_ENQUEUE; } if (privptr) { - *privptr = fc->fc_start + pos.fcp_private_offs; + *privptr = (uint8_t *)((uintptr_t)fc->fc_start + pos.fcp_private_offs); } return orig.fcp_next_entry_offs; } @@ -157,7 +157,7 @@ firehose_chunk_tracepoint_begin(firehose_chunk_t fc, uint64_t stamp, uint16_t pubsize, uint64_t thread_id, long offset) { firehose_tracepoint_t ft = (firehose_tracepoint_t) - __builtin_assume_aligned(fc->fc_start + offset, 8); + __builtin_assume_aligned((void *)((uintptr_t)fc->fc_start + (uintptr_t)offset), 8); stamp -= fc->fc_timestamp; stamp |= (uint64_t)pubsize << 48; // The compiler barrier is needed for userland process death handling, see diff --git a/libkern/firehose/firehose_types_private.h b/libkern/firehose/firehose_types_private.h index c47635c88..22e3def40 100644 --- a/libkern/firehose/firehose_types_private.h +++ b/libkern/firehose/firehose_types_private.h @@ -36,14 +36,14 @@ __BEGIN_DECLS * The lower 8 bits are or-ed in the upper 8 bits of Activity ID and propagated * to children activities */ - OS_ENUM(firehose_activity_flags, unsigned long, - firehose_activity_flags_default = 0x0000, + OS_OPTIONS(firehose_activity_flags, unsigned long, + firehose_activity_flags_default = 0x0000, - firehose_activity_flags_info_mode = 0x0001, - firehose_activity_flags_debug_mode = 0x0002, - firehose_activity_flags_stream_live_mode = 0x0004, + firehose_activity_flags_info_mode = 0x0001, + firehose_activity_flags_debug_mode = 0x0002, + firehose_activity_flags_stream_live_mode = 0x0004, - firehose_activity_flags_precise_timestamp = 0x0080, + firehose_activity_flags_precise_timestamp = 0x0080, ); /*! @@ -69,13 +69,13 @@ typedef uint64_t firehose_activity_id_t; * @enum firehose_stream_t */ OS_ENUM(firehose_stream, uint8_t, - firehose_stream_persist = 0, - firehose_stream_special = 1, - firehose_stream_memory = 2, - firehose_stream_metadata = 3, - firehose_stream_signpost = 4, - firehose_stream_memory_wifi = 5, - firehose_stream_memory_baseband = 6, + firehose_stream_persist = 0, + firehose_stream_special = 1, + firehose_stream_memory = 2, + firehose_stream_metadata = 3, + firehose_stream_signpost = 4, + firehose_stream_memory_wifi = 5, + firehose_stream_memory_baseband = 6, _firehose_stream_max, ); @@ -87,12 +87,12 @@ OS_ENUM(firehose_stream, uint8_t, * Namespaces of tracepoints. */ OS_ENUM(firehose_tracepoint_namespace, uint8_t, - firehose_tracepoint_namespace_activity = 0x02, - firehose_tracepoint_namespace_trace = 0x03, - firehose_tracepoint_namespace_log = 0x04, - firehose_tracepoint_namespace_metadata = 0x05, - firehose_tracepoint_namespace_signpost = 0x06, - firehose_tracepoint_namespace_loss = 0x07, + firehose_tracepoint_namespace_activity = 0x02, + firehose_tracepoint_namespace_trace = 0x03, + firehose_tracepoint_namespace_log = 0x04, + firehose_tracepoint_namespace_metadata = 0x05, + firehose_tracepoint_namespace_signpost = 0x06, + firehose_tracepoint_namespace_loss = 0x07, ); /*! @@ -102,8 +102,8 @@ OS_ENUM(firehose_tracepoint_namespace, uint8_t, * Codes of tracepoints. */ OS_ENUM(firehose_tracepoint_code, uint32_t, - firehose_tracepoint_code_load = 0x01, - firehose_tracepoint_code_unload = 0x02, + firehose_tracepoint_code_load = 0x01, + firehose_tracepoint_code_unload = 0x02, ); /*! @@ -120,10 +120,10 @@ typedef uint8_t firehose_tracepoint_type_t; * @abstract * Flags for tracepoints. */ -OS_ENUM(firehose_tracepoint_flags, uint16_t, +OS_OPTIONS(firehose_tracepoint_flags, uint16_t, _firehose_tracepoint_flags_base_has_current_aid = 0x0001, #define _firehose_tracepoint_flags_pc_style_mask (0x0007 << 1) - _firehose_tracepoint_flags_pc_style_none = 0x0000 << 1, + _firehose_tracepoint_flags_pc_style_none = 0x0000 << 1, _firehose_tracepoint_flags_pc_style_main_exe = 0x0001 << 1, _firehose_tracepoint_flags_pc_style_shared_cache = 0x0002 << 1, _firehose_tracepoint_flags_pc_style_main_plugin = 0x0003 << 1, @@ -134,6 +134,25 @@ OS_ENUM(firehose_tracepoint_flags, uint16_t, _firehose_tracepoint_flags_base_has_unique_pid = 0x0010, ); +/* + * Same as _OS_TRACE_DYNAMIC_BIT defined in libtrace/tracepoint_internal.h. + * This bit is used by logd to know how to evaluate the format + * string. + * If it is set, logd assumes that the format is "%s" and the content of the + * whole string is passed with the firehose_tracepoint. + * Otherwise it tries to find the unformatted string within the text + * section of the executable and expects only the content of the variables + * on the firehose_tracepoint. + */ +#define FIREHOSE_TRACEPOINT_PC_DYNAMIC_BIT 0x80000000 + +/* + * Same as KERNEL_MASK defined in logd/logd_main.c + * It is used by logd to mask the pc before calling + * OSKextCopyUUIDForAddress. + */ +#define FIREHOSE_TRACEPOINT_PC_KERNEL_MASK 0xffff000000000000 + /*! * @typedef firehose_tracepoint_id_t * @@ -149,8 +168,8 @@ typedef uint64_t firehose_tracepoint_id_t; * Types of Activity tracepoints (namespace activity). */ OS_ENUM(_firehose_tracepoint_type_activity, firehose_tracepoint_type_t, - _firehose_tracepoint_type_activity_create = 0x01, - _firehose_tracepoint_type_activity_swap = 0x02, + _firehose_tracepoint_type_activity_create = 0x01, + _firehose_tracepoint_type_activity_swap = 0x02, _firehose_tracepoint_type_activity_useraction = 0x03, ); @@ -160,7 +179,7 @@ OS_ENUM(_firehose_tracepoint_type_activity, firehose_tracepoint_type_t, * @abstract * Flags for Activity tracepoints (namespace activity). */ -OS_ENUM(_firehose_tracepoint_flags_activity, uint16_t, +OS_OPTIONS(_firehose_tracepoint_flags_activity, uint16_t, _firehose_tracepoint_flags_activity_user_interface = 0x0100, _firehose_tracepoint_flags_activity_has_other_aid = 0x0200, ); @@ -172,11 +191,11 @@ OS_ENUM(_firehose_tracepoint_flags_activity, uint16_t, * Types of trace tracepoints (namespace trace). */ OS_ENUM(_firehose_tracepoint_type_trace, firehose_tracepoint_type_t, - _firehose_tracepoint_type_trace_default = 0x00, - _firehose_tracepoint_type_trace_info = 0x01, - _firehose_tracepoint_type_trace_debug = 0x02, - _firehose_tracepoint_type_trace_error = 0x10, - _firehose_tracepoint_type_trace_fault = 0x11, + _firehose_tracepoint_type_trace_default = 0x00, + _firehose_tracepoint_type_trace_info = 0x01, + _firehose_tracepoint_type_trace_debug = 0x02, + _firehose_tracepoint_type_trace_error = 0x10, + _firehose_tracepoint_type_trace_fault = 0x11, ); /*! @@ -186,11 +205,11 @@ OS_ENUM(_firehose_tracepoint_type_trace, firehose_tracepoint_type_t, * Types of Log tracepoints (namespace log). */ OS_ENUM(_firehose_tracepoint_type_log, firehose_tracepoint_type_t, - _firehose_tracepoint_type_log_default = 0x00, - _firehose_tracepoint_type_log_info = 0x01, - _firehose_tracepoint_type_log_debug = 0x02, - _firehose_tracepoint_type_log_error = 0x10, - _firehose_tracepoint_type_log_fault = 0x11, + _firehose_tracepoint_type_log_default = 0x00, + _firehose_tracepoint_type_log_info = 0x01, + _firehose_tracepoint_type_log_debug = 0x02, + _firehose_tracepoint_type_log_error = 0x10, + _firehose_tracepoint_type_log_fault = 0x11, ); /*! @@ -199,11 +218,11 @@ OS_ENUM(_firehose_tracepoint_type_log, firehose_tracepoint_type_t, * @abstract * Flags for Log tracepoints (namespace log). */ -OS_ENUM(_firehose_tracepoint_flags_log, uint16_t, +OS_OPTIONS(_firehose_tracepoint_flags_log, uint16_t, _firehose_tracepoint_flags_log_has_private_data = 0x0100, _firehose_tracepoint_flags_log_has_subsystem = 0x0200, - _firehose_tracepoint_flags_log_has_rules = 0x0400, - _firehose_tracepoint_flags_log_has_oversize = 0x0800, + _firehose_tracepoint_flags_log_has_rules = 0x0400, + _firehose_tracepoint_flags_log_has_oversize = 0x0800, _firehose_tracepoint_flags_log_has_context_data = 0x1000, ); @@ -214,9 +233,9 @@ OS_ENUM(_firehose_tracepoint_flags_log, uint16_t, * Types for metadata tracepoints (namespace metadata). */ OS_ENUM(_firehose_tracepoint_type_metadata, firehose_tracepoint_type_t, - _firehose_tracepoint_type_metadata_dyld = 0x01, + _firehose_tracepoint_type_metadata_dyld = 0x01, _firehose_tracepoint_type_metadata_subsystem = 0x02, - _firehose_tracepoint_type_metadata_kext = 0x03, + _firehose_tracepoint_type_metadata_kext = 0x03, ); /*! @@ -226,7 +245,7 @@ OS_ENUM(_firehose_tracepoint_type_metadata, firehose_tracepoint_type_t, * Types of Log tracepoints (namespace signpost). */ OS_ENUM(_firehose_tracepoint_type_signpost, firehose_tracepoint_type_t, - _firehose_tracepoint_type_signpost_event = 0x00, + _firehose_tracepoint_type_signpost_event = 0x00, _firehose_tracepoint_type_signpost_interval_begin = 0x01, _firehose_tracepoint_type_signpost_interval_end = 0x02, @@ -242,13 +261,13 @@ OS_ENUM(_firehose_tracepoint_type_signpost, firehose_tracepoint_type_t, * @abstract * Flags for Log tracepoints (namespace signpost). * - * When flags are shared with the log type, they should have the same values. + * When flags are shared with the log type, they should havethe same values. */ -OS_ENUM(_firehose_tracepoint_flags_signpost, uint16_t, +OS_OPTIONS(_firehose_tracepoint_flags_signpost, uint16_t, _firehose_tracepoint_flags_signpost_has_private_data = 0x0100, - _firehose_tracepoint_flags_signpost_has_subsystem = 0x0200, - _firehose_tracepoint_flags_signpost_has_rules = 0x0400, - _firehose_tracepoint_flags_signpost_has_oversize = 0x0800, + _firehose_tracepoint_flags_signpost_has_subsystem = 0x0200, + _firehose_tracepoint_flags_signpost_has_rules = 0x0400, + _firehose_tracepoint_flags_signpost_has_oversize = 0x0800, _firehose_tracepoint_flags_signpost_has_context_data = 0x1000, ); diff --git a/libkern/gen/OSAtomicOperations.c b/libkern/gen/OSAtomicOperations.c index 7866c302e..8408c83a4 100644 --- a/libkern/gen/OSAtomicOperations.c +++ b/libkern/gen/OSAtomicOperations.c @@ -47,10 +47,6 @@ enum { #define ALIGN_TEST(p, t) do{}while(0) #endif -// 19831745 - start of big hammer! -#pragma clang diagnostic push -#pragma clang diagnostic ignored "-Wcast-qual" - /* * atomic operations * These are _the_ atomic operations, now implemented via compiler built-ins. @@ -63,16 +59,14 @@ enum { Boolean OSCompareAndSwap8(UInt8 oldValue, UInt8 newValue, volatile UInt8 *address) { - return __c11_atomic_compare_exchange_strong((_Atomic UInt8 *)address, &oldValue, newValue, - memory_order_acq_rel_smp, memory_order_relaxed); + return os_atomic_cmpxchg(address, oldValue, newValue, acq_rel); } #undef OSCompareAndSwap16 Boolean OSCompareAndSwap16(UInt16 oldValue, UInt16 newValue, volatile UInt16 *address) { - return __c11_atomic_compare_exchange_strong((_Atomic UInt16 *)address, &oldValue, newValue, - memory_order_acq_rel_smp, memory_order_relaxed); + return os_atomic_cmpxchg(address, oldValue, newValue, acq_rel); } #undef OSCompareAndSwap @@ -80,8 +74,7 @@ Boolean OSCompareAndSwap(UInt32 oldValue, UInt32 newValue, volatile UInt32 *address) { ALIGN_TEST(address, UInt32); - return __c11_atomic_compare_exchange_strong((_Atomic UInt32 *)address, &oldValue, newValue, - memory_order_acq_rel_smp, memory_order_relaxed); + return os_atomic_cmpxchg(address, oldValue, newValue, acq_rel); } #undef OSCompareAndSwap64 @@ -96,31 +89,26 @@ OSCompareAndSwap64(UInt64 oldValue, UInt64 newValue, volatile UInt64 *address) _Atomic UInt64 *aligned_addr = (_Atomic UInt64 *)(uintptr_t)address; ALIGN_TEST(address, UInt64); - return __c11_atomic_compare_exchange_strong(aligned_addr, &oldValue, newValue, - memory_order_acq_rel_smp, memory_order_relaxed); + return os_atomic_cmpxchg(aligned_addr, oldValue, newValue, acq_rel); } #undef OSCompareAndSwapPtr Boolean OSCompareAndSwapPtr(void *oldValue, void *newValue, void * volatile *address) { -#if __LP64__ - return OSCompareAndSwap64((UInt64)oldValue, (UInt64)newValue, (volatile UInt64 *)address); -#else - return OSCompareAndSwap((UInt32)oldValue, (UInt32)newValue, (volatile UInt32 *)address); -#endif + return os_atomic_cmpxchg(address, oldValue, newValue, acq_rel); } SInt8 OSAddAtomic8(SInt32 amount, volatile SInt8 *address) { - return __c11_atomic_fetch_add((_Atomic SInt8*)address, amount, memory_order_relaxed); + return os_atomic_add_orig(address, amount, relaxed); } SInt16 OSAddAtomic16(SInt32 amount, volatile SInt16 *address) { - return __c11_atomic_fetch_add((_Atomic SInt16*)address, amount, memory_order_relaxed); + return os_atomic_add_orig(address, amount, relaxed); } #undef OSAddAtomic @@ -128,7 +116,7 @@ SInt32 OSAddAtomic(SInt32 amount, volatile SInt32 *address) { ALIGN_TEST(address, UInt32); - return __c11_atomic_fetch_add((_Atomic SInt32*)address, amount, memory_order_relaxed); + return os_atomic_add_orig(address, amount, relaxed); } #undef OSAddAtomic64 @@ -138,75 +126,69 @@ OSAddAtomic64(SInt64 amount, volatile SInt64 *address) _Atomic SInt64* aligned_address = (_Atomic SInt64*)(uintptr_t)address; ALIGN_TEST(address, SInt64); - return __c11_atomic_fetch_add(aligned_address, amount, memory_order_relaxed); + return os_atomic_add_orig(aligned_address, amount, relaxed); } #undef OSAddAtomicLong long OSAddAtomicLong(long theAmount, volatile long *address) { -#ifdef __LP64__ - return (long)OSAddAtomic64((SInt64)theAmount, (SInt64*)address); -#else - return (long)OSAddAtomic((SInt32)theAmount, address); -#endif + return os_atomic_add_orig(address, theAmount, relaxed); } #undef OSIncrementAtomic SInt32 OSIncrementAtomic(volatile SInt32 * value) { - return OSAddAtomic(1, value); + return os_atomic_inc_orig(value, relaxed); } #undef OSDecrementAtomic SInt32 OSDecrementAtomic(volatile SInt32 * value) { - return OSAddAtomic(-1, value); + return os_atomic_dec_orig(value, relaxed); } #undef OSBitAndAtomic UInt32 OSBitAndAtomic(UInt32 mask, volatile UInt32 * value) { - return __c11_atomic_fetch_and((_Atomic UInt32*)value, mask, memory_order_relaxed); + return os_atomic_and_orig(value, mask, relaxed); } #undef OSBitOrAtomic UInt32 OSBitOrAtomic(UInt32 mask, volatile UInt32 * value) { - return __c11_atomic_fetch_or((_Atomic UInt32*)value, mask, memory_order_relaxed); + return os_atomic_or_orig(value, mask, relaxed); } #undef OSBitXorAtomic UInt32 OSBitXorAtomic(UInt32 mask, volatile UInt32 * value) { - return __c11_atomic_fetch_xor((_Atomic UInt32*)value, mask, memory_order_relaxed); + return os_atomic_xor_orig(value, mask, relaxed); } static Boolean OSTestAndSetClear(UInt32 bit, Boolean wantSet, volatile UInt8 * startAddress) { UInt8 mask = 1; - UInt8 oldValue; + UInt8 oldValue, newValue; UInt8 wantValue; + UInt8 *address; - startAddress += (bit / 8); + address = (UInt8 *)(uintptr_t)(startAddress + (bit / 8)); mask <<= (7 - (bit % 8)); wantValue = wantSet ? mask : 0; - do { - oldValue = *startAddress; + return !os_atomic_rmw_loop(address, oldValue, newValue, relaxed, { if ((oldValue & mask) == wantValue) { - break; + os_atomic_rmw_loop_give_up(break); } - } while (!__c11_atomic_compare_exchange_strong((_Atomic UInt8 *)startAddress, - &oldValue, (oldValue & ~mask) | wantValue, memory_order_relaxed, memory_order_relaxed)); - - return (oldValue & mask) == wantValue; + newValue = (oldValue & ~mask) | wantValue; + }); } Boolean @@ -228,31 +210,31 @@ OSTestAndClear(UInt32 bit, volatile UInt8 * startAddress) SInt8 OSIncrementAtomic8(volatile SInt8 * value) { - return OSAddAtomic8(1, value); + return os_atomic_inc_orig(value, relaxed); } SInt8 OSDecrementAtomic8(volatile SInt8 * value) { - return OSAddAtomic8(-1, value); + return os_atomic_dec_orig(value, relaxed); } UInt8 OSBitAndAtomic8(UInt32 mask, volatile UInt8 * value) { - return __c11_atomic_fetch_and((_Atomic UInt8 *)value, mask, memory_order_relaxed); + return os_atomic_and_orig(value, mask, relaxed); } UInt8 OSBitOrAtomic8(UInt32 mask, volatile UInt8 * value) { - return __c11_atomic_fetch_or((_Atomic UInt8 *)value, mask, memory_order_relaxed); + return os_atomic_or_orig(value, mask, relaxed); } UInt8 OSBitXorAtomic8(UInt32 mask, volatile UInt8 * value) { - return __c11_atomic_fetch_xor((_Atomic UInt8 *)value, mask, memory_order_relaxed); + return os_atomic_xor_orig(value, mask, relaxed); } SInt16 @@ -270,20 +252,17 @@ OSDecrementAtomic16(volatile SInt16 * value) UInt16 OSBitAndAtomic16(UInt32 mask, volatile UInt16 * value) { - return __c11_atomic_fetch_and((_Atomic UInt16 *)value, mask, memory_order_relaxed); + return os_atomic_and_orig(value, mask, relaxed); } UInt16 OSBitOrAtomic16(UInt32 mask, volatile UInt16 * value) { - return __c11_atomic_fetch_or((_Atomic UInt16 *)value, mask, memory_order_relaxed); + return os_atomic_or_orig(value, mask, relaxed); } UInt16 OSBitXorAtomic16(UInt32 mask, volatile UInt16 * value) { - return __c11_atomic_fetch_xor((_Atomic UInt16 *)value, mask, memory_order_relaxed); + return os_atomic_xor_orig(value, mask, relaxed); } - -// 19831745 - end of big hammer! -#pragma clang diagnostic pop diff --git a/libkern/gen/OSDebug.cpp b/libkern/gen/OSDebug.cpp index b806dce3c..402b6a345 100644 --- a/libkern/gen/OSDebug.cpp +++ b/libkern/gen/OSDebug.cpp @@ -43,6 +43,9 @@ #include #include +#if defined(HAS_APPLE_PAC) +#include +#endif extern int etext; __BEGIN_DECLS @@ -95,7 +98,7 @@ trace_backtrace(uint32_t debugid, uint32_t debugid2, uintptr_t size, uintptr_t d i = 2; } -#define safe_bt(a) (uintptr_t)(aflags = flags; context->allocate_callback = allocate_callback; @@ -154,9 +153,8 @@ kxld_create_context(KXLDContext **_context, kxld_set_logging_callback(logging_callback); - context->kext = kxld_alloc(kxld_kext_sizeof()); + context->kext = kxld_calloc(kxld_kext_sizeof()); require_action(context->kext, finish, rval = KERN_RESOURCE_SHORTAGE); - bzero(context->kext, kxld_kext_sizeof()); /* Check if we already have an order array for this arch */ @@ -166,9 +164,8 @@ kxld_create_context(KXLDContext **_context, #else /* In userspace, create the dictionary if it doesn't already exist */ if (!s_order_dict) { - s_order_dict = kxld_alloc(sizeof(*s_order_dict)); + s_order_dict = kxld_calloc(sizeof(*s_order_dict)); require_action(s_order_dict, finish, rval = KERN_RESOURCE_SHORTAGE); - bzero(s_order_dict, sizeof(*s_order_dict)); rval = kxld_dict_init(s_order_dict, kxld_dict_uint32_hash, kxld_dict_uint32_cmp, 0); @@ -181,9 +178,8 @@ kxld_create_context(KXLDContext **_context, /* Create an order array for this arch if needed */ if (!context->section_order) { - section_order = kxld_alloc(sizeof(*section_order)); + section_order = kxld_calloc(sizeof(*section_order)); require_action(section_order, finish, rval = KERN_RESOURCE_SHORTAGE); - bzero(section_order, sizeof(*section_order)); #if KERNEL s_section_order = section_order; @@ -620,8 +616,6 @@ allocate_split_kext(KXLDContext *context, splitKextLinkInfo * link_info) linked_object = kxld_page_alloc_untracked(link_info->linkedKextSize); require(linked_object, finish); link_info->linkedKext = linked_object; - - bzero(linked_object, vmsize); rval = KERN_SUCCESS; finish: @@ -653,8 +647,14 @@ allocate_kext(KXLDContext *context, "Load address %p is not page-aligned.", (void *) (uintptr_t) vmaddr)); + /* Zero out the memory before we fill it. We fill this buffer in a + * sparse fashion, and it's simpler to clear it now rather than + * track and zero any pieces we didn't touch after we've written + * all of the sections to memory. + */ if (flags & kKxldAllocateWritable) { linked_object = (u_char *) (u_long) vmaddr; + bzero(linked_object, vmsize); } else { linked_object = kxld_page_alloc_untracked(vmsize); require(linked_object, finish); @@ -664,12 +664,6 @@ allocate_kext(KXLDContext *context, kxld_kext_set_linked_object_size(context->kext, vmsize); - /* Zero out the memory before we fill it. We fill this buffer in a - * sparse fashion, and it's simpler to clear it now rather than - * track and zero any pieces we didn't touch after we've written - * all of the sections to memory. - */ - bzero(linked_object, vmsize); *vmaddr_out = vmaddr; *vmsize_out = vmsize; diff --git a/libkern/kxld/kxld_array.c b/libkern/kxld/kxld_array.c index 51c6df6ad..1cec07c85 100644 --- a/libkern/kxld/kxld_array.c +++ b/libkern/kxld/kxld_array.c @@ -177,12 +177,11 @@ pool_create(size_t capacity) { KXLDArrayPool *pool = NULL, *rval = NULL; - pool = kxld_alloc(sizeof(*pool)); + pool = kxld_calloc(sizeof(*pool)); require(pool, finish); pool->buffer = kxld_page_alloc(capacity); require(pool->buffer, finish); - bzero(pool->buffer, capacity); rval = pool; pool = NULL; diff --git a/libkern/kxld/kxld_demangle.h b/libkern/kxld/kxld_demangle.h index 116b0ffbb..998ab0242 100644 --- a/libkern/kxld/kxld_demangle.h +++ b/libkern/kxld/kxld_demangle.h @@ -47,6 +47,6 @@ * */ const char * kxld_demangle(const char *str, char **buffer, size_t *length) -__attribute__((pure, nonnull(1), visibility("hidden"))); +__attribute__((nonnull(1), visibility("hidden"))); #endif /* !_KXLD_DEMANGLE_H_ */ diff --git a/libkern/kxld/kxld_object.c b/libkern/kxld/kxld_object.c index 51cbb170b..f83f0f321 100644 --- a/libkern/kxld/kxld_object.c +++ b/libkern/kxld/kxld_object.c @@ -248,9 +248,8 @@ kxld_object_init_from_macho(KXLDObject *object, u_char *file, u_long size, /* Allocate the symbol table */ if (!object->symtab) { - object->symtab = kxld_alloc(kxld_symtab_sizeof()); + object->symtab = kxld_calloc(kxld_symtab_sizeof()); require_action(object->symtab, finish, rval = KERN_RESOURCE_SHORTAGE); - bzero(object->symtab, kxld_symtab_sizeof()); } /* Build the relocator */ diff --git a/libkern/kxld/kxld_reloc.c b/libkern/kxld/kxld_reloc.c index 7a289f744..ac4a8b319 100644 --- a/libkern/kxld/kxld_reloc.c +++ b/libkern/kxld/kxld_reloc.c @@ -619,8 +619,7 @@ kxld_reloc_export_macho(const KXLDRelocator *relocator, if (kaslr_offsets == NULL) { kaslr_offsets_index = 0; kaslr_offsets_count = locrelocs->nitems + extrelocs->nitems; - kaslr_offsets = (uint32_t *)malloc(kaslr_offsets_count * sizeof(*kaslr_offsets)); - bzero(kaslr_offsets, kaslr_offsets_count * sizeof(*kaslr_offsets)); + kaslr_offsets = (uint32_t *)calloc(kaslr_offsets_count, sizeof(*kaslr_offsets)); } // copies the reloc data into the __LINKEDIT segment diff --git a/libkern/kxld/kxld_sym.c b/libkern/kxld/kxld_sym.c index bc2ace856..0072c4197 100644 --- a/libkern/kxld/kxld_sym.c +++ b/libkern/kxld/kxld_sym.c @@ -114,6 +114,10 @@ kxld_sym_init_from_macho64(KXLDSym *sym, char *strtab, const struct nlist_64 *sr sym->base_addr = src->n_value; sym->link_addr = sym->base_addr; + if (!strcmp("__ZN15OSMetaClassBase25_RESERVEDOSMetaClassBase3Ev", sym->name)) { + sym->name = (char *)(uintptr_t) "__ZN15OSMetaClassBase8DispatchE5IORPC"; + } + rval = init_predicates(sym, src->n_type, src->n_desc); require_noerr(rval, finish); diff --git a/libkern/kxld/kxld_util.c b/libkern/kxld/kxld_util.c index f40a3d173..47df25866 100644 --- a/libkern/kxld/kxld_util.c +++ b/libkern/kxld/kxld_util.c @@ -144,6 +144,30 @@ kxld_log(KXLDLogSubsystem subsystem, KXLDLogLevel level, /******************************************************************************* *******************************************************************************/ +void * +kxld_calloc(size_t size) +{ + void * ptr = NULL; + +#if KERNEL + ptr = kalloc(size); + if (ptr) { + bzero(ptr, size); + } +#else + ptr = calloc(1, size); +#endif + +#if DEBUG + if (ptr) { + ++num_allocations; + bytes_allocated += size; + } +#endif + + return ptr; +} + void * kxld_alloc(size_t size) { @@ -187,8 +211,11 @@ kxld_page_alloc_untracked(size_t size) ptr = (void *) addr; } } + if (ptr) { + bzero(ptr, size); + } #else /* !KERNEL */ - ptr = malloc(size); + ptr = calloc(1, size); #endif /* KERNEL */ return ptr; diff --git a/libkern/kxld/kxld_util.h b/libkern/kxld/kxld_util.h index ec9bb482b..d11d2dc8b 100644 --- a/libkern/kxld/kxld_util.h +++ b/libkern/kxld/kxld_util.h @@ -139,6 +139,9 @@ __attribute__((visibility("hidden"), format(printf, 3, 4))); * Allocators *******************************************************************************/ +void * kxld_calloc(size_t size) +__attribute__((malloc, visibility("hidden"))); + void * kxld_alloc(size_t size) __attribute__((malloc, visibility("hidden"))); diff --git a/libkern/libclosure/runtime.cpp b/libkern/libclosure/runtime.cpp index 95bf5a48c..4ae5cd977 100644 --- a/libkern/libclosure/runtime.cpp +++ b/libkern/libclosure/runtime.cpp @@ -26,6 +26,7 @@ #endif /* KERNEL */ +#include #include #include #ifndef os_assumes @@ -54,8 +55,8 @@ OSAtomicCompareAndSwapInt(int oldi, int newi, int volatile *dst) return original == oldi; } #else -#define OSAtomicCompareAndSwapLong(_Old, _New, _Ptr) __sync_bool_compare_and_swap(_Ptr, _Old, _New) -#define OSAtomicCompareAndSwapInt(_Old, _New, _Ptr) __sync_bool_compare_and_swap(_Ptr, _Old, _New) +#define OSAtomicCompareAndSwapLong(_Old, _New, _Ptr) os_atomic_cmpxchg(_Ptr, _Old, _New, relaxed) +#define OSAtomicCompareAndSwapInt(_Old, _New, _Ptr) os_atomic_cmpxchg(_Ptr, _Old, _New, relaxed) #endif diff --git a/libkern/libkern/Makefile b/libkern/libkern/Makefile index a7b66fe8a..f40ad2170 100644 --- a/libkern/libkern/Makefile +++ b/libkern/libkern/Makefile @@ -34,6 +34,10 @@ DATAFILES = \ OSReturn.h \ OSTypes.h +DRIVERKIT_DATAFILES = \ + OSByteOrder.h \ + _OSByteOrder.h + KERNELFILES = \ ${DATAFILES} \ OSAtomic.h \ @@ -62,6 +66,10 @@ PRIVATE_DATAFILES = \ INSTALL_MI_LIST = ${DATAFILES} +INSTALL_DRIVERKIT_MI_LIST = ${DRIVERKIT_DATAFILES} + +DRIVERKITINCDIR = $(DRIVERKITSDKHEADERSROOT)/usr/local/include + INSTALL_MI_DIR = libkern INSTALL_MI_LCL_LIST = \ @@ -85,9 +93,9 @@ EXPORT_MI_GEN_LIST = version.h EXPORT_MI_DIR = libkern version.h: version.h.template $(SRCROOT)/config/MasterVersion - @echo "[$(CMD_MC)] $(ColorH)GENERATING$(Color0) $(ColorLF)libkern/$@$(Color0) from $(ColorF)$<$(Color0)"; + $(call makelog,[$(CMD_MC)] $(ColorH)GENERATING$(Color0) $(ColorLF)libkern/$@$(Color0) from $(ColorF)$<$(Color0)) $(_v)install $(DATA_INSTALL_FLAGS) $< $@ - $(_v)$(NEWVERS) $@ > /dev/null; + $(_v)$(NEWVERS) $@ > /dev/null include $(MakeInc_rule) include $(MakeInc_dir) diff --git a/libkern/libkern/OSAtomic.h b/libkern/libkern/OSAtomic.h index 375151929..e1290895c 100644 --- a/libkern/libkern/OSAtomic.h +++ b/libkern/libkern/OSAtomic.h @@ -37,6 +37,7 @@ #define _OS_OSATOMIC_H #include +#include #if defined(__cplusplus) extern "C" { @@ -53,7 +54,7 @@ extern "C" { * -- var is used, but sizeof does not evaluate the * argument, i.e. we're safe against "++" etc. in var -- */ -#define __SAFE_CAST_PTR(type, var) (((type)(var))+(0/(sizeof(*var) == sizeof(*(type)0) ? 1 : 0))) +#define __SAFE_CAST_PTR(type, var) (((type)(var))+(0/(sizeof(*var) == sizeof(*(type)NULL) ? 1 : 0))) #else #define __SAFE_CAST_PTR(type, var) ((type)(var)) #endif diff --git a/libkern/libkern/OSKextLib.h b/libkern/libkern/OSKextLib.h index 2729d3c83..b9011d072 100644 --- a/libkern/libkern/OSKextLib.h +++ b/libkern/libkern/OSKextLib.h @@ -261,9 +261,12 @@ __BEGIN_DECLS /* Define C-string versions of the CFBundle keys for use in the kernel. */ #define kCFBundleIdentifierKey "CFBundleIdentifier" +#define kCFBundleIdentifierKernelKey "CFBundleIdentifierKernel" #define kCFBundleVersionKey "CFBundleVersion" #define kCFBundleNameKey "CFBundleName" #define kCFBundleExecutableKey "CFBundleExecutable" +#define kCFBundlePackageTypeKey "CFBundlePackageType" +#define kCFBundleDriverKitUUIDKey "CFBundleDriverKitUUID" #endif /* KERNEL */ /*! @@ -339,6 +342,13 @@ __BEGIN_DECLS */ #define kOSKernelResourceKey "OSKernelResource" +/*! + * @define kOSKextVariantOverrideKey + * @abstract A dictionary with target names as key and a target-specific variant + * name as value. + */ +#define kOSKextVariantOverrideKey "OSKextVariantOverride" + /*! * @define kIOKitPersonalitiesKey * @abstract A dictionary of dictionaries used in matching for I/O Kit drivers. @@ -408,6 +418,20 @@ __BEGIN_DECLS */ #define kOSKextKernelIdentifier "__kernel__" +/*! + * @define kOSKextBundlePackageTypeKext + * @abstract + * The bundle type value for Kernel Extensions. + */ +#define kOSKextBundlePackageTypeKext "KEXT" + +/*! + * @define kOSKextBundlePackageTypeDriverKit + * @abstract + * The bundle type value for Driver Extensions. + */ +#define kOSKextBundlePackageTypeDriverKit "DEXT" + /*! * @define kOSBundleRequiredRoot * @abstract @@ -457,6 +481,19 @@ __BEGIN_DECLS */ #define kOSBundleRequiredConsole "Console" +/*! + * @define kOSBundleRequiredDriverKit + * @abstract + * This @link kOSBundleRequiredKey OSBundleRequired@/link + * value indicates that the driver extension's (DriverKit driver's) + * personalities must be present in the kernel at early boot (specifically + * before @link //apple_ref/doc/man/8/kextd kextd(8)@/link starts) + * in order to compete with kexts built into the prelinkedkernel. Note that + * kextd is still required to launch the user space driver binary. The IOKit + * matching will happen during early boot, and the actual driver launch + * will happen after kextd starts. + */ +#define kOSBundleRequiredDriverKit "DriverKit" #if PRAGMA_MARK #pragma mark - @@ -973,7 +1010,7 @@ OSKextResetPgoCounters(void); * @/textblock * */ -extern const void * gOSKextUnresolved; +extern const void * const gOSKextUnresolved; /*! * @define OSKextSymbolIsResolved diff --git a/libkern/libkern/OSKextLibPrivate.h b/libkern/libkern/OSKextLibPrivate.h index 2167f212b..a3971be07 100644 --- a/libkern/libkern/OSKextLibPrivate.h +++ b/libkern/libkern/OSKextLibPrivate.h @@ -60,6 +60,8 @@ typedef uint8_t OSKextExcludeLevel; #define kOSKextExcludeKext (1) #define kOSKextExcludeAll (2) +#define kOSKextManagementEntitlement "com.apple.private.security.kext-management" + #if PRAGMA_MARK #pragma mark - /********************************************************************/ @@ -698,7 +700,7 @@ Boolean OSKextVersionGetString( /********************************************************************/ #endif #ifdef XNU_KERNEL_PRIVATE -void kext_weak_symbol_referenced(void); +void kext_weak_symbol_referenced(void) __abortlike; #endif /* XNU_KERNEL_PRIVATE */ #if PRAGMA_MARK diff --git a/libkern/libkern/OSSerializeBinary.h b/libkern/libkern/OSSerializeBinary.h index bab81dba9..0396821f6 100644 --- a/libkern/libkern/OSSerializeBinary.h +++ b/libkern/libkern/OSSerializeBinary.h @@ -44,7 +44,7 @@ enum{ kOSSerializeEndCollecton = 0x80000000U, }; -#define kOSSerializeBinarySignature "\323\0\0" - +#define kOSSerializeBinarySignature "\323\0\0" +#define kOSSerializeIndexedBinarySignature 0x000000D4 #endif /* _OS_OSSERIALIZEBINARY_H */ diff --git a/libkern/libkern/c++/Makefile b/libkern/libkern/c++/Makefile index 69b376774..f9e09b101 100644 --- a/libkern/libkern/c++/Makefile +++ b/libkern/libkern/c++/Makefile @@ -23,6 +23,7 @@ DATAFILES = \ OSNumber.h \ OSObject.h \ OSOrderedSet.h \ + OSPtr.h \ OSSerialize.h \ OSSet.h \ OSString.h \ diff --git a/libkern/libkern/c++/OSArray.h b/libkern/libkern/c++/OSArray.h index d094b0e08..73edeb601 100644 --- a/libkern/libkern/c++/OSArray.h +++ b/libkern/libkern/c++/OSArray.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2019 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -32,8 +32,12 @@ #define _OS_OSARRAY_H #include +#include class OSSerialize; +class OSArray; + +typedef OSPtr OSArrayPtr; /*! * @header @@ -90,7 +94,7 @@ class OSArray : public OSCollection friend class OSSet; friend class OSSerialize; - OSDeclareDefaultStructors(OSArray) + OSDeclareDefaultStructors(OSArray); #if APPLE_KEXT_ALIGN_CONTAINERS @@ -98,12 +102,12 @@ protected: unsigned int count; unsigned int capacity; unsigned int capacityIncrement; - const OSMetaClassBase ** array; + OSCollectionTaggedPtr *array; #else /* APPLE_KEXT_ALIGN_CONTAINERS */ protected: - const OSMetaClassBase ** array; + OSCollectionTaggedPtr *array; unsigned int count; unsigned int capacity; unsigned int capacityIncrement; @@ -140,7 +144,7 @@ public: * (unlike @link //apple_ref/doc/uid/20001502 CFMutableArray@/link, * for which the initial capacity is a hard limit). */ - static OSArray * withCapacity(unsigned int capacity); + static OSArrayPtr withCapacity(unsigned int capacity); /*! @@ -168,7 +172,7 @@ public: * (unlike @link //apple_ref/doc/uid/20001502 CFMutableArray@/link, * for which the initial capacity is a hard limit). */ - static OSArray * withObjects( + static OSArrayPtr withObjects( const OSObject * objects[], unsigned int count, unsigned int capacity = 0); @@ -206,7 +210,7 @@ public: * for storage in the new OSArray, * not copied. */ - static OSArray * withArray( + static OSArrayPtr withArray( const OSArray * array, unsigned int capacity = 0); @@ -698,7 +702,7 @@ public: virtual unsigned setOptions( unsigned options, unsigned mask, - void * context = 0) APPLE_KEXT_OVERRIDE; + void * context = NULL) APPLE_KEXT_OVERRIDE; /*! @@ -723,7 +727,7 @@ public: * Objects that are not derived from OSCollection are retained * rather than copied. */ - OSCollection * copyCollection(OSDictionary * cycleDict = 0) APPLE_KEXT_OVERRIDE; + OSCollectionPtr copyCollection(OSDictionary * cycleDict = NULL) APPLE_KEXT_OVERRIDE; OSMetaClassDeclareReservedUnused(OSArray, 0); OSMetaClassDeclareReservedUnused(OSArray, 1); diff --git a/libkern/libkern/c++/OSBoolean.h b/libkern/libkern/c++/OSBoolean.h index 207bb4da8..67e3b840b 100644 --- a/libkern/libkern/c++/OSBoolean.h +++ b/libkern/libkern/c++/OSBoolean.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2019 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -31,8 +31,12 @@ #define _OS_OSBOOLEAN_H #include +#include class OSString; +class OSBoolean; + +typedef OSPtr OSBooleanPtr; /*! * @header @@ -62,7 +66,7 @@ class OSString; */ class OSBoolean : public OSObject { - OSDeclareDefaultStructors(OSBoolean) + OSDeclareDefaultStructors(OSBoolean); friend class OSSerialize; protected: @@ -102,7 +106,7 @@ public: * @link kOSBooleanFalse kOSBooleanFalse@/link, * so that you can always use pointer comparison with OSBoolean objects. */ - static OSBoolean * withBoolean(bool value); + static OSBooleanPtr withBoolean(bool value); /*! * @function free diff --git a/libkern/libkern/c++/OSCollection.h b/libkern/libkern/c++/OSCollection.h index f6c7e01b7..67ec1f771 100644 --- a/libkern/libkern/c++/OSCollection.h +++ b/libkern/libkern/c++/OSCollection.h @@ -31,8 +31,15 @@ #define _OS_OSCOLLECTION_H #include +#include class OSDictionary; +class OSCollection; + +typedef OSPtr OSCollectionPtr; + +template +using OSCollectionTaggedPtr = OSTaggedPtr; /*! @@ -418,7 +425,7 @@ public: virtual unsigned setOptions( unsigned options, unsigned mask, - void * context = 0); + void * context = NULL); OSMetaClassDeclareReservedUsed(OSCollection, 0); /*! @@ -445,7 +452,7 @@ public: * Subclasses of OSCollection must override this function * to properly support deep copies. */ - virtual OSCollection *copyCollection(OSDictionary * cycleDict = 0); + virtual OSCollectionPtr copyCollection(OSDictionary * cycleDict = NULL); OSMetaClassDeclareReservedUsed(OSCollection, 1); /*! diff --git a/libkern/libkern/c++/OSCollectionIterator.h b/libkern/libkern/c++/OSCollectionIterator.h index d82cff509..eb57231d9 100644 --- a/libkern/libkern/c++/OSCollectionIterator.h +++ b/libkern/libkern/c++/OSCollectionIterator.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2019 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -31,8 +31,12 @@ #define _OS_OSCOLLECTIONITERATOR_H #include +#include +#include -class OSCollection; +class OSCollectionIterator; + +typedef OSPtr OSCollectionIteratorPtr; /*! * @header @@ -90,11 +94,11 @@ class OSCollection; */ class OSCollectionIterator : public OSIterator { - OSDeclareDefaultStructors(OSCollectionIterator) + OSDeclareDefaultStructors(OSCollectionIterator); protected: // xx-review: Do we want to document these? - const OSCollection * collection; + OSPtr collection; void * collIterator; unsigned int initialUpdateStamp; bool valid; @@ -112,7 +116,7 @@ public: * @result * A new instance of OSCollectionIterator, or NULL on failure. */ - static OSCollectionIterator * withCollection(const OSCollection * inColl); + static OSCollectionIteratorPtr withCollection(const OSCollection * inColl); /*! diff --git a/libkern/libkern/c++/OSData.h b/libkern/libkern/c++/OSData.h index b1547ae8e..ed473487a 100644 --- a/libkern/libkern/c++/OSData.h +++ b/libkern/libkern/c++/OSData.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2016 Apple Inc. All rights reserved. + * Copyright (c) 2000-2019 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -32,9 +32,14 @@ #define _OS_OSDATA_H #include +#include +class OSData; class OSString; +typedef OSPtr OSDataPtr; +typedef OSPtr OSDataConstPtr; + /*! * @header * @@ -75,7 +80,7 @@ class OSData : public OSObject { friend class OSSerialize; - OSDeclareDefaultStructors(OSData) + OSDeclareDefaultStructors(OSData); #if APPLE_KEXT_ALIGN_CONTAINERS @@ -135,7 +140,7 @@ public: * (unlike @link //apple_ref/doc/uid/20001498 CFMutableData@/link, * for which a nonzero initial capacity is a hard limit). */ - static OSData * withCapacity(unsigned int capacity); + static OSDataPtr withCapacity(unsigned int capacity); /*! @@ -158,7 +163,7 @@ public: * (unlike @link //apple_ref/doc/uid/20001498 CFMutableData@/link, * for which a nonzero initial capacity is a hard limit). */ - static OSData * withBytes( + static OSDataPtr withBytes( const void * bytes, unsigned int numBytes); @@ -191,7 +196,7 @@ public: * but you can get the byte pointer and * modify bytes within the shared buffer. */ - static OSData * withBytesNoCopy( + static OSDataPtr withBytesNoCopy( void * bytes, unsigned int numBytes); @@ -215,7 +220,7 @@ public: * (unlike @link //apple_ref/doc/uid/20001498 CFMutableData@/link, * for which a nonzero initial capacity is a hard limit). */ - static OSData * withData(const OSData * inData); + static OSDataPtr withData(const OSData * inData); /*! @@ -240,7 +245,7 @@ public: * (unlike @link //apple_ref/doc/uid/20001498 CFMutableData@/link, * for which a nonzero initial capacity is a hard limit). */ - static OSData * withData( + static OSDataPtr withData( const OSData * inData, unsigned int start, unsigned int numBytes); diff --git a/libkern/libkern/c++/OSDictionary.h b/libkern/libkern/c++/OSDictionary.h index a7dcfcdb0..98d258153 100644 --- a/libkern/libkern/c++/OSDictionary.h +++ b/libkern/libkern/c++/OSDictionary.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2019 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -38,10 +38,15 @@ #define _IOKIT_IODICTIONARY_H #include +#include +#include class OSArray; class OSSymbol; class OSString; +class OSDictionary; + +typedef OSPtr OSDictionaryPtr; /*! * @header @@ -114,7 +119,7 @@ class OSDictionary : public OSCollection { friend class OSSerialize; - OSDeclareDefaultStructors(OSDictionary) + OSDeclareDefaultStructors(OSDictionary); #if APPLE_KEXT_ALIGN_CONTAINERS @@ -123,8 +128,8 @@ protected: unsigned int capacity; unsigned int capacityIncrement; struct dictEntry { - const OSSymbol * key; - const OSMetaClassBase * value; + OSCollectionTaggedPtr key; + OSCollectionTaggedPtr value; #if XNU_KERNEL_PRIVATE static int compare(const void *, const void *); #endif @@ -135,8 +140,8 @@ protected: protected: struct dictEntry { - const OSSymbol * key; - const OSMetaClassBase * value; + OSCollectionTaggedPtr key; + OSCollectionTaggedPtr value; #if XNU_KERNEL_PRIVATE static int compare(const void *, const void *); #endif @@ -179,7 +184,7 @@ public: * (unlike @link //apple_ref/doc/uid/20001497 CFMutableDictionary@/link, * for which the initial capacity is a hard limit). */ - static OSDictionary * withCapacity(unsigned int capacity); + static OSDictionaryPtr withCapacity(unsigned int capacity); /*! @@ -214,7 +219,7 @@ public: * @link //apple_ref/doc/uid/20001497 CFMutableDictionary@/link, * for which the initial capacity is a hard limit). */ - static OSDictionary * withObjects( + static OSDictionaryPtr withObjects( const OSObject * objects[], const OSSymbol * keys[], unsigned int count, @@ -252,7 +257,7 @@ public: * @link //apple_ref/doc/uid/20001497 CFMutableDictionary@/link, * for which the initial capacity is a hard limit). */ - static OSDictionary * withObjects( + static OSDictionaryPtr withObjects( const OSObject * objects[], const OSString * keys[], unsigned int count, @@ -293,7 +298,7 @@ public: * in the new OSDictionary, * not copied. */ - static OSDictionary * withDictionary( + static OSDictionaryPtr withDictionary( const OSDictionary * dict, unsigned int capacity = 0); @@ -898,7 +903,7 @@ public: virtual unsigned setOptions( unsigned options, unsigned mask, - void * context = 0) APPLE_KEXT_OVERRIDE; + void * context = NULL) APPLE_KEXT_OVERRIDE; /*! @@ -924,12 +929,12 @@ public: * Objects that are not derived from OSCollection are retained * rather than copied. */ - OSCollection * copyCollection(OSDictionary * cycleDict = 0) APPLE_KEXT_OVERRIDE; + OSCollectionPtr copyCollection(OSDictionary * cycleDict = NULL) APPLE_KEXT_OVERRIDE; #if XNU_KERNEL_PRIVATE bool setObject(const OSSymbol *aKey, const OSMetaClassBase *anObject, bool onlyAdd); - OSArray * copyKeys(void); void sortBySymbol(void); + OSArrayPtr copyKeys(void); #endif /* XNU_KERNEL_PRIVATE */ diff --git a/libkern/libkern/c++/OSIterator.h b/libkern/libkern/c++/OSIterator.h index f23dd782c..a9c049b87 100644 --- a/libkern/libkern/c++/OSIterator.h +++ b/libkern/libkern/c++/OSIterator.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2019 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -67,7 +67,7 @@ */ class OSIterator : public OSObject { - OSDeclareAbstractStructors(OSIterator) + OSDeclareAbstractStructors(OSIterator); public: /*! diff --git a/libkern/libkern/c++/OSKext.h b/libkern/libkern/c++/OSKext.h index 898217069..942788f0c 100644 --- a/libkern/libkern/c++/OSKext.h +++ b/libkern/libkern/c++/OSKext.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2008-2016 Apple Inc. All rights reserved. + * Copyright (c) 2008-2019 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -45,6 +45,7 @@ extern "C" { #include #include #include +#include #include /********************************************************************* @@ -70,6 +71,8 @@ void kxld_log_callback( *********************************************************************/ class OSKext; +typedef OSPtr OSKextPtr; + extern "C" { void OSKextLog( OSKext * aKext, @@ -153,7 +156,7 @@ typedef struct OSKextActiveAccount OSKextActiveAccount; /********************************************************************/ class OSKext : public OSObject { - OSDeclareDefaultStructors(OSKext) + OSDeclareDefaultStructors(OSKext); #if PRAGMA_MARK /**************************************/ @@ -234,11 +237,12 @@ private: /************************* * Instance variables *************************/ - OSDictionary * infoDict; + OSDictionaryPtr infoDict; - const OSSymbol * bundleID; - OSString * path; // not necessarily correct :-/ - OSString * executableRelPath;// relative to bundle + OSSymbolConstPtr bundleID; + OSStringPtr path; // not necessarily correct :-/ + OSStringPtr executableRelPath;// relative to bundle + OSStringPtr userExecutableRelPath;// relative to bundle OSKextVersion version; // parsed OSKextVersion compatibleVersion;// parsed @@ -250,18 +254,19 @@ private: // kOSKextInvalidLoadTag invalid kmod_info_t * kmod_info; // address into linkedExec./alloced for interface - OSArray * dependencies; // kernel resource does not have any; - // links directly to kernel + OSArrayPtr dependencies; // kernel resource does not have any; + // links directly to kernel /* Only real kexts have these; interface kexts do not. */ - OSData * linkedExecutable; - OSSet * metaClasses; // for C++/OSMetaClass kexts + OSDataPtr linkedExecutable; + OSSetPtr metaClasses; // for C++/OSMetaClass kexts /* Only interface kexts have these; non-interface kexts can get at them * in the linked Executable. */ - OSData * interfaceUUID; + OSDataPtr interfaceUUID; + OSDataPtr driverKitUUID; struct { unsigned int loggingEnabled:1; @@ -287,6 +292,8 @@ private: unsigned int jettisonLinkeditSeg:1; } flags; + uint32_t matchingRefCount; + struct list_head pendingPgoHead; uuid_t instance_uuid; OSKextAccount * account; @@ -303,13 +310,13 @@ private: */ public: static void initialize(void); - static OSDictionary * copyKexts(void); + static OSDictionaryPtr copyKexts(void); static OSReturn removeKextBootstrap(void); static void willShutdown(void);// called by IOPMrootDomain on shutdown static void reportOSMetaClassInstances( const char * kextIdentifier, OSKextLogSpec msgLogSpec); - + static void OSKextLogDriverKitInfoLoad(OSKext *kext); #endif /* XNU_KERNEL_PRIVATE */ private: @@ -328,14 +335,14 @@ private: /* Instance life cycle. */ - static OSKext * withBooterData( + static OSKextPtr withBooterData( OSString * deviceTreeName, OSData * booterData); virtual bool initWithBooterData( OSString * deviceTreeName, OSData * booterData); - static OSKext * withPrelinkedInfoDict( + static OSKextPtr withPrelinkedInfoDict( OSDictionary * infoDict, bool doCoalesedSlides); virtual bool initWithPrelinkedInfoDict( @@ -344,7 +351,7 @@ private: static void setAllVMAttributes(void); - static OSKext * withMkext2Info( + static OSKextPtr withMkext2Info( OSDictionary * anInfoDict, OSData * mkextData); virtual bool initWithMkext2Info( @@ -381,7 +388,7 @@ private: OSData * mkextData, OSNumber * offsetNum, const char * entryName); - virtual OSData * extractMkext2FileData( + virtual OSDataPtr extractMkext2FileData( UInt8 * data, const char * name, uint32_t compressedSize, @@ -454,7 +461,7 @@ private: virtual OSReturn validateKextMapping(bool startFlag); virtual boolean_t verifySegmentMapping(kernel_segment_command_t *seg); - static OSArray * copyAllKextPersonalities( + static OSArrayPtr copyAllKextPersonalities( bool filterSafeBootFlag = false); static void setPrelinkedPersonalities(OSArray * personalitiesArray); @@ -477,21 +484,21 @@ private: /* Getting info about loaded kexts (kextstat). */ - static OSDictionary * copyLoadedKextInfo( + static OSDictionaryPtr copyLoadedKextInfo( OSArray * kextIdentifiers = NULL, OSArray * keys = NULL); - static OSDictionary * copyLoadedKextInfoByUUID( + static OSDictionaryPtr copyLoadedKextInfoByUUID( OSArray * kextIdentifiers = NULL, OSArray * keys = NULL); - static OSData * copyKextUUIDForAddress(OSNumber *address = NULL); - virtual OSDictionary * copyInfo(OSArray * keys = NULL); + static OSDataPtr copyKextUUIDForAddress(OSNumber *address = NULL); + virtual OSDictionaryPtr copyInfo(OSArray * keys = NULL); /* Logging to user space. */ static OSKextLogSpec setUserSpaceLogFilter( OSKextLogSpec userLogSpec, bool captureFlag = false); - static OSArray * clearUserSpaceLogFilter(void); + static OSArrayPtr clearUserSpaceLogFilter(void); static OSKextLogSpec getUserSpaceLogFilter(void); /* OSMetaClasses defined by kext. @@ -513,10 +520,10 @@ private: static OSReturn dequeueCallbackForRequestTag( OSKextRequestTag requestTag, - OSDictionary ** callbackRecordOut); + LIBKERN_RETURNS_RETAINED OSDictionary ** callbackRecordOut); static OSReturn dequeueCallbackForRequestTag( OSNumber * requestTagNum, - OSDictionary ** callbackRecordOut); + LIBKERN_RETURNS_RETAINED OSDictionary ** callbackRecordOut); static void invokeRequestCallback( OSDictionary * callbackRecord, OSReturn requestResult); @@ -538,6 +545,7 @@ public: unsigned int cnt, int (* printf_func)(const char *fmt, ...), uint32_t flags); + bool isDriverKit(void); private: static OSKextLoadedKextSummary *summaryForAddress(const uintptr_t addr); static void *kextForAddress(const void *addr); @@ -580,12 +588,12 @@ public: /**************************************/ #endif public: -// caller must release - static OSKext * lookupKextWithIdentifier(const char * kextIdentifier); - static OSKext * lookupKextWithIdentifier(OSString * kextIdentifier); - static OSKext * lookupKextWithLoadTag(OSKextLoadTag aTag); - static OSKext * lookupKextWithAddress(vm_address_t address); - static OSKext * lookupKextWithUUID(uuid_t uuid); + // caller must release + static OSKextPtr lookupKextWithIdentifier(const char * kextIdentifier); + static OSKextPtr lookupKextWithIdentifier(OSString * kextIdentifier); + static OSKextPtr lookupKextWithLoadTag(OSKextLoadTag aTag); + static OSKextPtr lookupKextWithAddress(vm_address_t address); + static OSKextPtr lookupKextWithUUID(uuid_t uuid); kernel_section_t *lookupSection(const char *segname, const char*secname); @@ -598,20 +606,29 @@ public: OSKextExcludeLevel startOpt = kOSKextExcludeNone, OSKextExcludeLevel startMatchingOpt = kOSKextExcludeAll, OSArray * personalityNames = NULL); + static OSReturn loadKextWithIdentifier( OSString * kextIdentifier, + LIBKERN_RETURNS_RETAINED_ON_ZERO OSObject ** kextRef, Boolean allowDeferFlag = true, Boolean delayAutounloadFlag = false, OSKextExcludeLevel startOpt = kOSKextExcludeNone, OSKextExcludeLevel startMatchingOpt = kOSKextExcludeAll, OSArray * personalityNames = NULL); + + static void dropMatchingReferences( + OSSet * kexts); + static OSReturn removeKextWithIdentifier( const char * kextIdentifier, bool terminateServicesAndRemovePersonalitiesFlag = false); static OSReturn removeKextWithLoadTag( OSKextLoadTag loadTag, bool terminateServicesAndRemovePersonalitiesFlag = false); - + static OSReturn requestDaemonLaunch( + OSString * kextIdentifier, + OSString * serverName, + OSNumber * serverTag); static OSReturn requestResource( const char * kextIdentifier, const char * resourceName, @@ -647,11 +664,12 @@ public: virtual OSKextLoadTag getLoadTag(void); virtual void getSizeInfo(uint32_t *loadSize, uint32_t *wiredSize); - virtual OSData * copyUUID(void); - OSData * copyTextUUID(void); - OSData * copyMachoUUID(const kernel_mach_header_t * header); - virtual OSArray * copyPersonalitiesArray(void); - + virtual OSDataPtr copyUUID(void); + OSDataPtr copyTextUUID(void); + OSDataPtr copyMachoUUID(const kernel_mach_header_t * header); + virtual OSArrayPtr copyPersonalitiesArray(void); + static bool copyUserExecutablePath(const OSSymbol * bundleID, char * pathResult, size_t pathSize); + virtual void setDriverKitUUID(OSData *uuid); /* This removes personalities naming the kext (by CFBundleIdentifier), * not all personalities defined by the kext (IOPersonalityPublisher or CFBundleIdentifier). */ diff --git a/libkern/libkern/c++/OSLib.h b/libkern/libkern/c++/OSLib.h index 93c2548e1..358b2ad53 100644 --- a/libkern/libkern/c++/OSLib.h +++ b/libkern/libkern/c++/OSLib.h @@ -78,7 +78,11 @@ extern "C" int debug_ivars_size; #ifndef NULL #if defined (__cplusplus) +#if __cplusplus >= 201103L +#define NULL nullptr +#else #define NULL 0 +#endif #else #define NULL ((void *)0) #endif diff --git a/libkern/libkern/c++/OSMetaClass.h b/libkern/libkern/c++/OSMetaClass.h index 21b4f40e6..03da0e6c2 100644 --- a/libkern/libkern/c++/OSMetaClass.h +++ b/libkern/libkern/c++/OSMetaClass.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2016 Apple Inc. All rights reserved. + * Copyright (c) 2000-2019 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -32,6 +32,7 @@ #include #include +#include /* * LIBKERN_ macros below can be used to describe the ownership semantics @@ -57,6 +58,17 @@ * attribute applied to a function. * In the former case, it stipulates that the function is returning at "+1", * and in the latter case "+0". + * + * LIBKERN_RETURNS_RETAINED and LIBKERN_RETURNS_NOT_RETAINED attributes + * can be also applied to out parameters, in which case they specify + * that an out parameter is written into at +1 or +0 respectively. + * For out parameters of non-void functions an assumption is + * that an out parameter is written into iff the return value is non-zero + * unless the function returns a typedef to kern_return_t, + * in which case it is assumed to be written into on zero value + * (kIOReturnSuccess). + * This can be customized using the attributes + * LIBKERN_RETURNS_RETAINED_ON_ZERO and LIBKERN_RETURNS_RETAINED_ON_NONZERO. */ #if __has_attribute(os_returns_retained) #define LIBKERN_RETURNS_RETAINED __attribute__((os_returns_retained)) @@ -91,6 +103,30 @@ #define LIBKERN_CONSUMES_THIS #endif +/* + * LIBKERN_RETURNS_RETAINED_ON_ZERO is an attribute applicable to out + * parameters. + * It specifies that an out parameter at +1 is written into an argument iff + * the function returns a zero return value. + */ +#if __has_attribute(os_returns_retained_on_zero) +#define LIBKERN_RETURNS_RETAINED_ON_ZERO __attribute__((os_returns_retained_on_zero)) +#else +#define LIBKERN_RETURNS_RETAINED_ON_ZERO +#endif + +/* + * LIBKERN_RETURNS_RETAINED_ON_NON_ZERO is an attribute applicable to out + * parameters. + * It specifies that an out parameter at +1 is written into an argument iff + * the function returns a non-zero return value. + */ +#if __has_attribute(os_returns_retained_on_non_zero) +#define LIBKERN_RETURNS_RETAINED_ON_NONZERO __attribute__((os_returns_retained_on_non_zero)) +#else +#define LIBKERN_RETURNS_RETAINED_ON_NONZERO +#endif + class OSMetaClass; class OSObject; class OSString; @@ -101,7 +137,10 @@ class OSSerialize; class OSOrderedSet; class OSCollection; #endif /* XNU_KERNEL_PRIVATE */ - +struct IORPC; +class OSInterface +{ +}; /*! * @header @@ -128,12 +167,12 @@ class OSCollection; #else /* XNU_KERNEL_PRIVATE */ #include -#if TARGET_OS_EMBEDDED +#if (TARGET_OS_IPHONE && !TARGET_OS_SIMULATOR) #define APPLE_KEXT_VTABLE_PADDING 0 -#else /* TARGET_OS_EMBEDDED */ +#else /* (TARGET_OS_IPHONE && !TARGET_OS_SIMULATOR) */ /*! @parseOnly */ #define APPLE_KEXT_VTABLE_PADDING 1 -#endif /* TARGET_OS_EMBEDDED */ +#endif /* (TARGET_OS_IPHONE && !TARGET_OS_SIMULATOR) */ #endif /* XNU_KERNEL_PRIVATE */ @@ -160,7 +199,10 @@ class OSCollection; #define APPLE_KEXT_DEPRECATED __attribute__((deprecated)) -#if __cplusplus >= 201103L +/* + * AppleUSBAudio builds xnu's libkern headers in user space + */ +#if !defined(BUILD_FOR_USERSPACE) && (__cplusplus >= 201103L) #define APPLE_KEXT_OVERRIDE override #if defined(__LP64__) #define APPLE_KEXT_COMPATIBILITY_OVERRIDE @@ -172,7 +214,7 @@ class OSCollection; #define APPLE_KEXT_COMPATIBILITY_OVERRIDE #endif -#define APPLE_KEXT_WSHADOW_PUSH _Pragma("clang diagnostic push"); \ +#define APPLE_KEXT_WSHADOW_PUSH _Pragma("clang diagnostic push") \ _Pragma("clang diagnostic ignored \"-Wunknown-warning-option\"") \ _Pragma("clang diagnostic ignored \"-Wshadow-field\"") @@ -232,6 +274,7 @@ class OSCollection; * * The run-time type macros and functions of OSMetaClassBase are thread-safe. */ + class OSMetaClassBase { public: @@ -288,6 +331,7 @@ public: * @link //apple_ref/cpp/macro/OSCheckTypeInst OSCheckTypeInst@/link. */ #define OSTypeID(type) (type::metaClass) +#define OSMTypeID(type) ((OSMetaClass *) type::metaClass) /*! @@ -348,6 +392,27 @@ public: #define OSDynamicCast(type, inst) \ ((type *) OSMetaClassBase::safeMetaCast((inst), OSTypeID(type))) +/*! + * @define OSRequiredCast + * @hidecontents + * + * @abstract + * Safe type-casting for Libkern C++ objects; panics on failure. + * The input parameters are the same as for the {@code OSDynamicCast} macro. + * + * @result {@code inst} if it is NULL or derived from {@code type}; + * otherwise triggers a kernel panic. + * + * @discussion + * This macro should be used in place of C-style casts or + * @link OSDynamicCast OSDynamicCast@/link. + * when the caller is absolutely sure that the passed + * argument is a subclass of a required type. + * It is equivalent to using {@code OSDynamicCast} and crashing with a kernel + * panic on cast failure. + */ +#define OSRequiredCast(type, inst) \ + (type *) OSMetaClassBase::requiredMetaCast((inst), OSTypeID(type)) /*! * @define OSCheckTypeInst @@ -382,7 +447,7 @@ public: #if defined(__arm__) || defined(__arm64__) - static _ptf_t _ptmf2ptf(const OSMetaClassBase * self, void (OSMetaClassBase::*func)(void)); + static _ptf_t _ptmf2ptf(const OSMetaClassBase * self, void (OSMetaClassBase::*func)(void), uintptr_t typeDisc); #elif defined(__i386__) || defined(__x86_64__) @@ -391,7 +456,8 @@ public: // ABI static inline _ptf_t - _ptmf2ptf(const OSMetaClassBase *self, void (OSMetaClassBase::*func)(void)) + _ptmf2ptf(const OSMetaClassBase *self, void (OSMetaClassBase::*func)(void), + uintptr_t typeDisc __attribute__((unused))) { union { void (OSMetaClassBase::*fIn)(void); @@ -451,7 +517,8 @@ public: */ #define OSMemberFunctionCast(cptrtype, self, func) \ (cptrtype) OSMetaClassBase:: \ - _ptmf2ptf(self, (void (OSMetaClassBase::*)(void)) func) + _ptmf2ptf(self, (void (OSMetaClassBase::*)(void)) func, \ + ptrauth_type_discriminator(__typeof__(func))) protected: OSMetaClassBase(); @@ -717,6 +784,31 @@ public: const OSMetaClassBase * anObject, const OSMetaClass * toMeta); +/*! + * @function requiredMetaCast + * + * @abstract + * Casts an object to the class managed by the given OSMetaClass or + * fails with a kernel panic if the cast does not succeed. + * + * @param anObject A pointer to the object to be cast. + * @param toMeta A pointer to a constant OSMetaClass + * for the desired target type. + * + * @result + * anObject if the object is derived + * from the class managed by toMeta, + * NULL if anObject was NULL, + * kernel panic otherwise. + * + * @discussion + * It is far more convenient to use + * @link OSRequiredCast OSRequiredCast@/link. + */ + static OSMetaClassBase *requiredMetaCast( + const OSMetaClassBase * anObject, + const OSMetaClass * toMeta); + /*! * @function checkTypeInst * @@ -761,7 +853,7 @@ public: * OSObject::taggedRetain(const void *)@/link. */ // WAS: virtual void _RESERVEDOSMetaClassBase0(); - virtual void taggedRetain(const void * tag = 0) const = 0; + virtual void taggedRetain(const void * tag = NULL) const = 0; /*! @@ -780,7 +872,7 @@ public: * OSObject::taggedRelease(const void *)@/link. */ // WAS: virtual void _RESERVEDOSMetaClassBase1(); - virtual void taggedRelease(const void * tag = 0) const = 0; + virtual void taggedRelease(const void * tag = NULL) const = 0; protected: /*! @@ -803,10 +895,16 @@ protected: const void * tag, const int freeWhen) const = 0; +public: + virtual kern_return_t + Dispatch(const IORPC rpc); + + kern_return_t + Invoke(const IORPC rpc); + private: #if APPLE_KEXT_VTABLE_PADDING // Virtual Padding - virtual void _RESERVEDOSMetaClassBase3(); virtual void _RESERVEDOSMetaClassBase4(); virtual void _RESERVEDOSMetaClassBase5(); virtual void _RESERVEDOSMetaClassBase6(); @@ -901,7 +999,7 @@ typedef bool (*OSMetaClassInstanceApplierFunction)(const OSObject * instance, * by the run-time type information system, * which handles concurrency and locking internally. */ -class OSMetaClass : private OSMetaClassBase +class OSMetaClass : public OSMetaClassBase { friend class OSKext; #if IOKITSTATS @@ -1061,7 +1159,7 @@ protected: * for as long as its kernel extension is loaded, * OSMetaClass does not use reference-counting. */ - virtual void taggedRetain(const void * tag = 0) const; + virtual void taggedRetain(const void * tag = NULL) const; /*! @@ -1078,7 +1176,7 @@ protected: * for as long as its kernel extension is loaded, * OSMetaClass does not use reference-counting. */ - virtual void taggedRelease(const void * tag = 0) const; + virtual void taggedRelease(const void * tag = NULL) const; /*! @@ -1658,7 +1756,21 @@ public: * @param className The name of the C++ class, as a raw token, * not a string or macro. */ -#define OSDeclareCommonStructors(className) \ + +#define _OS_ADD_METAMETHODS(b) _OS_ADD_METAMETHODS_ ## b +#define _OS_ADD_METAMETHODS_ +#define _OS_ADD_METAMETHODS_dispatch \ + virtual kern_return_t Dispatch(const IORPC rpc) APPLE_KEXT_OVERRIDE; + +#define _OS_ADD_METHODS(className, b) _OS_ADD_METHODS_ ## b(className) +#define _OS_ADD_METHODS_(className) +#define _OS_ADD_METHODS_dispatch(className) \ + className ## _Methods \ + className ## _KernelMethods + +#define SUPERDISPATCH ((OSDispatchMethod)&super::_Dispatch) + +#define OSDeclareCommonStructors(className, dispatch) \ private: \ static const OSMetaClass * const superClass; \ public: \ @@ -1666,13 +1778,15 @@ public: static class MetaClass : public OSMetaClass { \ public: \ MetaClass(); \ - virtual OSObject *alloc() const; \ + virtual OSObject *alloc() const APPLE_KEXT_OVERRIDE;\ + _OS_ADD_METAMETHODS(dispatch); \ } gMetaClass; \ friend class className ::MetaClass; \ virtual const OSMetaClass * getMetaClass() const APPLE_KEXT_OVERRIDE; \ protected: \ className (const OSMetaClass *); \ - virtual ~ className () APPLE_KEXT_OVERRIDE + virtual ~ className () APPLE_KEXT_OVERRIDE; \ + _OS_ADD_METHODS(className, dispatch) /*! @@ -1681,7 +1795,7 @@ public: * * @abstract * Declares run-time type information and functions - * for a concrete Libkern C++ class. + * for a final (non-subclassable) Libkern C++ class. * * @param className The name of the C++ class, as a raw token, * not a string or macro. @@ -1691,13 +1805,20 @@ public: * immediately after the opening brace in a class declaration. * It leaves the current privacy state as protected:. */ -#define OSDeclareDefaultStructors(className) \ - OSDeclareCommonStructors(className); \ +#define _OSDeclareDefaultStructors(className, dispatch) \ + OSDeclareCommonStructors(className, dispatch); \ public: \ - className (); \ + className (void); \ protected: +#define OSDeclareDefaultStructors(className) \ +_OSDeclareDefaultStructors(className, ) + +#define OSDeclareDefaultStructorsWithDispatch(className) \ +_OSDeclareDefaultStructors(className, dispatch) + + /*! * @define OSDeclareAbstractStructors * @hidecontents @@ -1715,19 +1836,25 @@ public: * immediately after the opening brace in a class declaration. * It leaves the current privacy state as protected:. */ -#define OSDeclareAbstractStructors(className) \ - OSDeclareCommonStructors(className); \ - private: \ - className (); /* Make primary constructor private in abstract */ \ +#define _OSDeclareAbstractStructors(className, dispatch) \ + OSDeclareCommonStructors(className, dispatch); \ + private: \ + className (void); /* Make primary constructor private in abstract */ \ protected: +#define OSDeclareAbstractStructors(className) \ +_OSDeclareAbstractStructors(className, ) + +#define OSDeclareAbstractStructorsWithDispatch(className) \ +_OSDeclareAbstractStructors(className, dispatch) + /*! * @define OSDeclareFinalStructors * @hidecontents * * @abstract * Declares run-time type information and functions - * for a final (non-subclassable) Libkern C++ class. + * for a concrete Libkern C++ class. * * @param className The name of the C++ class, as a raw token, * not a string or macro. @@ -1746,13 +1873,20 @@ public: * Warning: Changing a class from "Default" to "Final" will break * binary compatibility. */ -#define OSDeclareFinalStructors(className) \ - OSDeclareDefaultStructors(className) \ - private: \ - void __OSFinalClass(void); \ +#define _OSDeclareFinalStructors(className, dispatch) \ + _OSDeclareDefaultStructors(className, dispatch) \ + private: \ + void __OSFinalClass(void); \ protected: +#define OSDeclareFinalStructors(className) \ +_OSDeclareFinalStructors(className, ) + +#define OSDeclareFinalStructorsWithDispatch(className) \ +_OSDeclareFinalStructors(className, dispatch) + + /* Not to be included in headerdoc. * * @define OSDefineMetaClassWithInit @@ -1805,7 +1939,7 @@ public: * not a string or macro. */ #define OSDefineAbstractStructors(className, superclassName) \ - OSObject * className ::MetaClass::alloc() const { return 0; } + OSObject * className ::MetaClass::alloc() const { return NULL; } /* Not to be included in headerdoc. @@ -1991,7 +2125,7 @@ public: * * @abstract * Defines an OSMetaClass and associated routines - * for a final (non-subclassable) Libkern C++ class. + * for concrete Libkern C++ class. * * @param className The name of the C++ class, as a raw token, * not a string or macro. diff --git a/libkern/libkern/c++/OSNumber.h b/libkern/libkern/c++/OSNumber.h index ebb81a616..34a9472d5 100644 --- a/libkern/libkern/c++/OSNumber.h +++ b/libkern/libkern/c++/OSNumber.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2019 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -32,6 +32,7 @@ #define _OS_OSNUMBER_H #include +#include /*! * @header @@ -40,6 +41,9 @@ * This header declares the OSNumber container class. */ +class OSNumber; + +typedef OSPtr OSNumberPtr; /*! * @class OSNumber @@ -72,7 +76,7 @@ class OSNumber : public OSObject { friend class OSSerialize; - OSDeclareDefaultStructors(OSNumber) + OSDeclareDefaultStructors(OSNumber); #if APPLE_KEXT_ALIGN_CONTAINERS @@ -118,7 +122,7 @@ public: * and @link addValue addValue@/link, * but you can't change the bit size. */ - static OSNumber * withNumber( + static OSNumberPtr withNumber( unsigned long long value, unsigned int numberOfBits); @@ -153,7 +157,7 @@ public: * and @link addValue addValue@/link, * but you can't change the bit size. */ - static OSNumber * withNumber( + static OSNumberPtr withNumber( const char * valueString, unsigned int numberOfBits); diff --git a/libkern/libkern/c++/OSObject.h b/libkern/libkern/c++/OSObject.h index 036730372..d75fad273 100644 --- a/libkern/libkern/c++/OSObject.h +++ b/libkern/libkern/c++/OSObject.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2016 Apple Inc. All rights reserved. + * Copyright (c) 2000-2019 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -35,6 +35,9 @@ #define _LIBKERN_OSOBJECT_H #include +#include +#include +#include #if defined(__clang__) #pragma clang diagnostic ignored "-Woverloaded-virtual" @@ -42,6 +45,9 @@ class OSSymbol; class OSString; +class OSObject; + +typedef OSPtr OSObjectPtr; /*! @@ -167,11 +173,17 @@ class OSString; */ class OSObject : public OSMetaClassBase { - OSDeclareAbstractStructors(OSObject) + OSDeclareAbstractStructorsWithDispatch(OSObject); + #if IOKITSTATS friend class IOStatistics; #endif +#ifdef LIBKERN_SMART_POINTERS + template + friend class os::smart_ptr; +#endif + private: /* Not to be included in headerdoc. * @@ -192,10 +204,10 @@ protected: * drops below the specified threshold. * * @param freeWhen If decrementing the reference count makes it - * >= freeWhen, the object is immediately freed. + * < freeWhen, the object is immediately freed. * * @discussion - * If the receiver has freeWhen or fewer references + * If the receiver has fewer than freeWhen references * after its reference count is decremented, * it is immediately freed. * @@ -215,14 +227,14 @@ protected: * * @param tag Used for tracking collection references. * @param freeWhen If decrementing the reference count makes it - * >= freeWhen, the object is immediately freed. + * < freeWhen, the object is immediately freed. * * @discussion * Kernel extensions should not use this function. * It is for use by OSCollection and subclasses to track * inclusion in collections. * - * If the receiver has freeWhen or fewer references + * If the receiver has fewer than freeWhen references * after its reference count is decremented, * it is immediately freed. * @@ -299,7 +311,13 @@ protected: */ static void operator delete(void * mem, size_t size); +// XXX: eventually we can flip this switch +//#ifdef LIBKERN_SMART_POINTERS +#if 0 +private: +#else public: +#endif /*! * @function operator new @@ -314,6 +332,7 @@ public: */ static void * operator new(size_t size); +public: /*! * @function getRetainCount @@ -378,7 +397,7 @@ public: * outside the context in which you received it, * you should always retain it immediately. */ - virtual void taggedRetain(const void * tag = 0) const APPLE_KEXT_OVERRIDE; + virtual void taggedRetain(const void * tag = NULL) const APPLE_KEXT_OVERRIDE; /*! @@ -396,7 +415,7 @@ public: * It is for use by OSCollection and subclasses to track * inclusion in collections. */ - virtual void taggedRelease(const void * tag = 0) const APPLE_KEXT_OVERRIDE; + virtual void taggedRelease(const void * tag = NULL) const APPLE_KEXT_OVERRIDE; // xx-review: used to say, "Remove a reference on this object with this tag, if an attempt is made to remove a reference that isn't associated with this tag the kernel will panic immediately", but I don't see that in the implementation @@ -430,7 +449,13 @@ public: #endif bool taggedTryRetain(const void *tag) const; -#endif + + bool iterateObjects(void * refcon, bool (*callback)(void * refcon, OSObject * object)); +#ifdef __BLOCKS__ + bool iterateObjects(bool (^block)(OSObject * object)); +#endif /* __BLOCKS__ */ + +#endif /* XNU_KERNEL_PRIVATE */ // Unused Padding OSMetaClassDeclareReservedUnused(OSObject, 0); diff --git a/libkern/libkern/c++/OSOrderedSet.h b/libkern/libkern/c++/OSOrderedSet.h index 2a24e321f..dc1a61d20 100644 --- a/libkern/libkern/c++/OSOrderedSet.h +++ b/libkern/libkern/c++/OSOrderedSet.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2016 Apple Inc. All rights reserved. + * Copyright (c) 2000-2019 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -30,9 +30,13 @@ #define _OS_OSORDEREDSET_H #include +#include #include class OSOffset; +class OSOrderedSet; + +typedef OSPtr OSOrderedSetPtr; /*! * @header @@ -94,7 +98,7 @@ class OSOffset; */ class OSOrderedSet : public OSCollection { - OSDeclareDefaultStructors(OSOrderedSet) + OSDeclareDefaultStructors(OSOrderedSet); public: /*! @@ -180,10 +184,10 @@ public: * See * @link getOrderingRef getOrderingRef@/link. */ - static OSOrderedSet * withCapacity( + static OSOrderedSetPtr withCapacity( unsigned int capacity, - OSOrderFunction orderFunc = 0, - void * orderingContext = 0); + OSOrderFunction orderFunc = NULL, + void * orderingContext = NULL); /*! @@ -231,8 +235,8 @@ public: */ virtual bool initWithCapacity( unsigned int capacity, - OSOrderFunction orderFunc = 0, - void * orderingContext = 0); + OSOrderFunction orderFunc = NULL, + void * orderingContext = NULL); /*! @@ -728,7 +732,7 @@ public: virtual unsigned setOptions( unsigned options, unsigned mask, - void * context = 0) APPLE_KEXT_OVERRIDE; + void * context = NULL) APPLE_KEXT_OVERRIDE; /*! @@ -753,7 +757,7 @@ public: * Objects that are not derived from OSCollection are retained * rather than copied. */ - OSCollection *copyCollection(OSDictionary * cycleDict = 0) APPLE_KEXT_OVERRIDE; + OSCollectionPtr copyCollection(OSDictionary * cycleDict = NULL) APPLE_KEXT_OVERRIDE; OSMetaClassDeclareReservedUnused(OSOrderedSet, 0); OSMetaClassDeclareReservedUnused(OSOrderedSet, 1); diff --git a/libkern/libkern/c++/OSPtr.h b/libkern/libkern/c++/OSPtr.h new file mode 100644 index 000000000..fb2dc9704 --- /dev/null +++ b/libkern/libkern/c++/OSPtr.h @@ -0,0 +1,145 @@ +#ifndef _OS_OBJECT_PTR_H +#define _OS_OBJECT_PTR_H + +#include +#include + +#if KERNEL +# include +#endif + +#ifdef LIBKERN_SMART_POINTERS + +/* + * OSObject pointers (OSPtr) + */ + +struct osobject_policy { + static void + retain(const OSMetaClassBase *obj) + { + obj->retain(); + } + static void + release(const OSMetaClassBase *obj) + { + obj->release(); + } + template static T * + alloc() + { + return OSTypeAlloc(T); + } + template static To * + dyn_cast(From *p) + { + return OSDynamicCast(To, p); + } +}; + +template +using OSPtr = os::smart_ptr; + +/* + * Tagged OSObject pointers (OSTaggedPtr) + */ + +template +struct osobject_tagged_policy { + static void + retain(const OSMetaClassBase *obj) + { + obj->taggedRetain(OSTypeID(Tag)); + } + static void + release(const OSMetaClassBase *obj) + { + obj->taggedRelease(OSTypeID(Tag)); + } + template static T * + alloc() + { + return OSTypeAlloc(T); + } + template static To * + dyn_cast(From *p) + { + return OSDynamicCast(To, p); + } +}; + +template +using OSTaggedPtr = os::smart_ptr >; + +/* + * Dynamic cast + */ + +template +os::smart_ptr +OSDynamicCastPtr(os::smart_ptr const &from) +{ + return from.template dynamic_pointer_cast(); +} + +template +os::smart_ptr +OSDynamicCastPtr(os::smart_ptr &&from) +{ + return os::move(from).template dynamic_pointer_cast(); +} + +/* + * Creation helpers + */ + +template +os::smart_ptr +OSNewObject() +{ + return os::smart_ptr::alloc(); +} + +template +os::smart_ptr +OSMakePtr(T *&p) +{ + return os::smart_ptr(p); +} + +template +os::smart_ptr +OSMakePtr(T *&&p) +{ + return os::smart_ptr(os::move(p)); +} + +template +os::smart_ptr +OSMakePtr(T *&&p, bool retain) +{ + return os::smart_ptr(os::move(p), retain); +} + +template +static inline T ** +OSOutPtr(os::smart_ptr *p) +{ + if (p == nullptr) { + return nullptr; + } else { + return p->get_for_out_param(); + } +} + +#else /* LIBKERN_SMART_POINTERS */ + +/* Fall back to the smart pointer types just being a simple pointer */ +template +using OSPtr = T *; + +template +using OSTaggedPtr = T *; + +#endif /* LIBKERN_SMART_POINTERS */ +#endif /* _OS_OBJECT_PTR_H */ diff --git a/libkern/libkern/c++/OSSerialize.h b/libkern/libkern/c++/OSSerialize.h index 061830af2..758162584 100644 --- a/libkern/libkern/c++/OSSerialize.h +++ b/libkern/libkern/c++/OSSerialize.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2016 Apple Inc. All rights reserved. + * Copyright (c) 2000-2019 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -36,6 +36,13 @@ class OSCollection; class OSSet; class OSDictionary; class OSArray; +class OSData; + +class OSSerializer; +typedef OSPtr OSSerializerPtr; + +class OSSerialize; +typedef OSPtr OSSerializePtr; /*! * @header @@ -44,7 +51,7 @@ class OSArray; * This header declares the OSSerialize class. */ -OSObject * +OSObjectPtr OSUnserializeBinary(const void *buffer, size_t bufferSize); /*! @@ -83,7 +90,7 @@ OSUnserializeBinary(const void *buffer, size_t bufferSize); class OSSerialize : public OSObject { - OSDeclareDefaultStructors(OSSerialize) + OSDeclareDefaultStructors(OSSerialize); friend class OSBoolean; private: @@ -105,14 +112,18 @@ public: typedef void * Editor; #endif - bool binary; - bool endCollection; - Editor editor; - void * editRef; + bool binary; + bool endCollection; + Editor editor; + void * editRef; + OSData * indexData; bool binarySerialize(const OSMetaClassBase *o); + bool binarySerializeInternal(const OSMetaClassBase *o); bool addBinary(const void * data, size_t size); - bool addBinaryObject(const OSMetaClassBase * o, uint32_t key, const void * _bits, size_t size); + bool addBinaryObject(const OSMetaClassBase * o, uint32_t key, const void * _bits, size_t size, + uint32_t * startCollection); + void endBinaryCollection(uint32_t startCollection); public: @@ -132,9 +143,10 @@ public: * @discussion * The serializer will grow as needed to accommodate more data. */ - static OSSerialize * withCapacity(unsigned int capacity); + static OSSerializePtr withCapacity(unsigned int capacity); - static OSSerialize * binaryWithCapacity(unsigned int inCapacity, Editor editor = 0, void * reference = 0); + static OSSerializePtr binaryWithCapacity(unsigned int inCapacity, Editor editor = NULL, void * reference = NULL); + void setIndexed(bool index); /*! * @function text @@ -321,7 +333,7 @@ typedef bool (^OSSerializerBlock)(OSSerialize * serializer); class OSSerializer : public OSObject { - OSDeclareDefaultStructors(OSSerializer) + OSDeclareDefaultStructors(OSSerializer); void * target; void * ref; @@ -329,13 +341,13 @@ class OSSerializer : public OSObject public: - static OSSerializer * forTarget( + static OSSerializerPtr forTarget( void * target, OSSerializerCallback callback, - void * ref = 0); + void * ref = NULL); #ifdef __BLOCKS__ - static OSSerializer * withBlock( + static OSSerializerPtr withBlock( OSSerializerBlock callback); #endif diff --git a/libkern/libkern/c++/OSSet.h b/libkern/libkern/c++/OSSet.h index bec190e9f..9c7718807 100644 --- a/libkern/libkern/c++/OSSet.h +++ b/libkern/libkern/c++/OSSet.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2019 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -32,8 +32,13 @@ #define _OS_OSSET_H #include +#include class OSArray; +class OSSet; + +typedef OSPtr OSSetPtr; +typedef OSPtr OSArrayPtr; /*! * @header @@ -86,17 +91,17 @@ class OSSet : public OSCollection { friend class OSSerialize; - OSDeclareDefaultStructors(OSSet) + OSDeclareDefaultStructors(OSSet); #if APPLE_KEXT_ALIGN_CONTAINERS private: - OSArray * members; + OSArrayPtr members; #else /* APPLE_KEXT_ALIGN_CONTAINERS */ private: - OSArray * members; + OSArrayPtr members; protected: struct ExpansionData { }; @@ -135,7 +140,7 @@ public: * (unlike @link //apple_ref/doc/uid/20001503 CFMutableSet@/link, * for which the initial capacity is a hard limit). */ - static OSSet * withCapacity(unsigned int capacity); + static OSSetPtr withCapacity(unsigned int capacity); /*! @@ -169,7 +174,7 @@ public: * The objects in objects are retained for storage in the new set, * not copied. */ - static OSSet * withObjects( + static OSSetPtr withObjects( const OSObject * objects[], unsigned int count, unsigned int capacity = 0); @@ -207,7 +212,7 @@ public: * The objects in array are retained for storage in the new set, * not copied. */ - static OSSet * withArray( + static OSSetPtr withArray( const OSArray * array, unsigned int capacity = 0); @@ -243,7 +248,7 @@ public: * The objects in set are retained for storage in the new set, * not copied. */ - static OSSet * withSet(const OSSet * set, + static OSSetPtr withSet(const OSSet * set, unsigned int capacity = 0); @@ -749,7 +754,7 @@ public: * Child collections' options are changed only if the receiving set's * options actually change. */ - virtual unsigned setOptions(unsigned options, unsigned mask, void * context = 0) APPLE_KEXT_OVERRIDE; + virtual unsigned setOptions(unsigned options, unsigned mask, void * context = NULL) APPLE_KEXT_OVERRIDE; /*! @@ -774,7 +779,7 @@ public: * Objects that are not derived from OSCollection are retained * rather than copied. */ - OSCollection *copyCollection(OSDictionary *cycleDict = 0) APPLE_KEXT_OVERRIDE; + OSCollectionPtr copyCollection(OSDictionary *cycleDict = NULL) APPLE_KEXT_OVERRIDE; OSMetaClassDeclareReservedUnused(OSSet, 0); OSMetaClassDeclareReservedUnused(OSSet, 1); diff --git a/libkern/libkern/c++/OSString.h b/libkern/libkern/c++/OSString.h index c761c9d28..925d5a3a4 100644 --- a/libkern/libkern/c++/OSString.h +++ b/libkern/libkern/c++/OSString.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2019 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -32,8 +32,13 @@ #define _OS_OSSTRING_H #include +#include class OSData; +class OSString; + +typedef OSPtr OSStringPtr; +typedef OSPtr OSStringConstPtr; /*! @@ -102,7 +107,7 @@ enum { kOSStringNoCopy = 0x00000001 }; */ class OSString : public OSObject { - OSDeclareDefaultStructors(OSString) + OSDeclareDefaultStructors(OSString); enum { kMaxStringLength = 262142 }; @@ -145,7 +150,7 @@ public: * with the reference count incremented. * Changes to one will not be reflected in the other. */ - static OSString * withString(const OSString * aString); + static OSStringPtr withString(const OSString * aString); /*! @@ -162,7 +167,7 @@ public: * and with a reference count of 1; * NULL on failure. */ - static OSString * withCString(const char * cString); + static OSStringPtr withCString(const char * cString); /*! @@ -191,10 +196,10 @@ public: * An OSString object created with this function does not * allow changing the string via @link setChar setChar@/link. */ - static OSString * withCStringNoCopy(const char * cString); + static OSStringPtr withCStringNoCopy(const char * cString); #if XNU_KERNEL_PRIVATE - static OSString * withStringOfLength(const char *cString, size_t length); + static OSStringPtr withStringOfLength(const char *cString, size_t length); #endif /* XNU_KERNEL_PRIVATE */ /*! diff --git a/libkern/libkern/c++/OSSymbol.h b/libkern/libkern/c++/OSSymbol.h index 03490a026..1ee9792b6 100644 --- a/libkern/libkern/c++/OSSymbol.h +++ b/libkern/libkern/c++/OSSymbol.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2019 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -32,6 +32,12 @@ #define _OS_OSSYMBOL_H #include +#include + +class OSSymbol; + +typedef OSPtr OSSymbolPtr; +typedef OSPtr OSSymbolConstPtr; /*! * @header @@ -82,7 +88,7 @@ class OSSymbol : public OSString { friend class OSSymbolPool; - OSDeclareAbstractStructors(OSSymbol) + OSDeclareAbstractStructors(OSSymbol); private: @@ -245,7 +251,7 @@ public: * new OSSymbol with a retain count of 1, * or increments the retain count of the existing instance. */ - static const OSSymbol * withString(const OSString * aString); + static OSSymbolConstPtr withString(const OSString * aString); /*! @@ -272,7 +278,7 @@ public: * new OSSymbol with a retain count of 1, * or increments the retain count of the existing instance. */ - static const OSSymbol * withCString(const char * cString); + static OSSymbolConstPtr withCString(const char * cString); /*! @@ -302,7 +308,7 @@ public: * new OSSymbol with a retain count of 1, * or increments the retain count of the existing instance. */ - static const OSSymbol * withCStringNoCopy(const char * cString); + static OSSymbolConstPtr withCStringNoCopy(const char * cString); /*! * @function existingSymbolForString @@ -321,7 +327,7 @@ public: * The returned OSSymbol object is returned with an incremented refcount * that needs to be released. */ - static const OSSymbol* existingSymbolForString(const OSString *aString); + static OSSymbolConstPtr existingSymbolForString(const OSString *aString); /*! * @function existingSymbolForCString @@ -340,7 +346,7 @@ public: * The returned OSSymbol object is returned with an incremented refcount * that needs to be released. */ - static const OSSymbol* existingSymbolForCString(const char *aCString); + static OSSymbolConstPtr existingSymbolForCString(const char *aCString); /*! * @function isEqualTo diff --git a/libkern/libkern/c++/OSUnserialize.h b/libkern/libkern/c++/OSUnserialize.h index a3f8fc378..678e48828 100644 --- a/libkern/libkern/c++/OSUnserialize.h +++ b/libkern/libkern/c++/OSUnserialize.h @@ -30,6 +30,9 @@ #ifndef _OS_OSUNSERIALIZE_H #define _OS_OSUNSERIALIZE_H +#include +#include + #include #include @@ -64,9 +67,9 @@ class OSString; * @discussion * Not safe to call in a primary interrupt handler. */ -extern "C++" OSObject * OSUnserializeXML( +extern "C++" OSObjectPtr OSUnserializeXML( const char * buffer, - OSString ** errorString = 0); + OSStringPtr * errorString = NULL); /*! * @function OSUnserializeXML @@ -90,16 +93,16 @@ extern "C++" OSObject * OSUnserializeXML( * @discussion * Not safe to call in a primary interrupt handler. */ -extern "C++" OSObject * OSUnserializeXML( +extern "C++" OSObjectPtr OSUnserializeXML( const char * buffer, size_t bufferSize, - OSString ** errorString = 0); + OSStringPtr *errorString = NULL); -extern "C++" OSObject * -OSUnserializeBinary(const char *buffer, size_t bufferSize, OSString **errorString); +extern "C++" OSObjectPtr +OSUnserializeBinary(const char *buffer, size_t bufferSize, OSStringPtr *errorString); #ifdef __APPLE_API_OBSOLETE -extern OSObject* OSUnserialize(const char *buffer, OSString **errorString = 0); +extern OSObjectPtr OSUnserialize(const char *buffer, OSStringPtr *errorString = NULL); #endif /* __APPLE_API_OBSOLETE */ #endif /* _OS_OSUNSERIALIZE_H */ diff --git a/libkern/libkern/crypto/des.h b/libkern/libkern/crypto/des.h index e2df46dbe..a3efc08de 100644 --- a/libkern/libkern/crypto/des.h +++ b/libkern/libkern/crypto/des.h @@ -67,11 +67,11 @@ typedef des_ecb_key_schedule des_key_schedule[1]; /* Single DES ECB - 1 block */ int des_ecb_key_sched(des_cblock *key, des_ecb_key_schedule *ks); -void des_ecb_encrypt(des_cblock * in, des_cblock *out, des_ecb_key_schedule *ks, int encrypt); +int des_ecb_encrypt(des_cblock * in, des_cblock *out, des_ecb_key_schedule *ks, int encrypt); /* Triple DES ECB - 1 block */ int des3_ecb_key_sched(des_cblock *key, des3_ecb_key_schedule *ks); -void des3_ecb_encrypt(des_cblock *block, des_cblock *, des3_ecb_key_schedule *ks, int encrypt); +int des3_ecb_encrypt(des_cblock *block, des_cblock *, des3_ecb_key_schedule *ks, int encrypt); int des_is_weak_key(des_cblock *key); diff --git a/libkern/libkern/i386/Makefile b/libkern/libkern/i386/Makefile index 78bbfa507..19e08c823 100644 --- a/libkern/libkern/i386/Makefile +++ b/libkern/libkern/i386/Makefile @@ -10,8 +10,16 @@ DATAFILES = \ OSByteOrder.h \ _OSByteOrder.h +DRIVERKIT_DATAFILES = \ + OSByteOrder.h \ + _OSByteOrder.h + INSTALL_MD_LIST = ${DATAFILES} +INSTALL_DRIVERKIT_MD_LIST = ${DRIVERKIT_DATAFILES} + +DRIVERKITINCDIR = $(DRIVERKITSDKHEADERSROOT)/usr/local/include + INSTALL_MD_DIR = libkern/i386 EXPORT_MD_LIST = ${DATAFILES} diff --git a/libkern/libkern/img4/interface.h b/libkern/libkern/img4/interface.h index 01d749e4f..7bf58353d 100644 --- a/libkern/libkern/img4/interface.h +++ b/libkern/libkern/img4/interface.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018 Apple Inc. All rights reserved. + * Copyright (c) 2019 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -37,15 +37,6 @@ #include #include -#if MACH_KERNEL_PRIVATE -#define _SYS_TYPES_H_ 1 -#include -#include -#else -#include -#include -#endif - /* * We rely on img4.h's logic for either including sys/types.h or declaring * errno_t ourselves. So when building the kernel, include img4.h from our @@ -63,7 +54,7 @@ * it can be tested at build-time and not require rev-locked submissions of xnu * and AppleImage4. */ -#define IMG4_INTERFACE_VERSION (2u) +#define IMG4_INTERFACE_VERSION (3u) /*! * @typedef img4_init_t @@ -240,9 +231,6 @@ typedef errno_t (*img4_payload_init_with_vnode_4xnu_t)( * * @field i4if_v3.nonce_domain_cryptex * The {@link IMG4_NONCE_DOMAIN_CRYPTEX} global. - * - * @field i4if_v4.environment_init_identity - * A pointer to the {@link img4_environment_init_identity} function. */ typedef struct _img4_interface { @@ -266,10 +254,14 @@ typedef struct _img4_interface { struct { img4_payload_init_with_vnode_4xnu_t payload_init_with_vnode_4xnu; } i4if_v2; - void *__reserved[17]; + struct { + const img4_nonce_domain_t *nonce_domain_pdi; + const img4_nonce_domain_t *nonce_domain_cryptex; + } i4if_v3; + void *__reserved[15]; } img4_interface_t; -__BEGIN_DECLS; +__BEGIN_DECLS /*! * @const img4if @@ -292,6 +284,6 @@ OS_EXPORT OS_NONNULL1 void img4_interface_register(const img4_interface_t *i4); -__END_DECLS; +__END_DECLS #endif // __IMG4_INTERFACE_H diff --git a/libkern/libkern/kext_panic_report.h b/libkern/libkern/kext_panic_report.h index 73f2985fe..4c241e400 100644 --- a/libkern/libkern/kext_panic_report.h +++ b/libkern/libkern/kext_panic_report.h @@ -53,7 +53,7 @@ subs_entry_t kext_identifier_prefix_subs[] = { { "com.apple.security.", '$' }, { "com.apple.", '@' }, - { (char *)0, '\0' } + { (char *)NULL, '\0' } }; /* Substring substitution list. Substrings are replaced with a '!' followed @@ -71,7 +71,7 @@ subs_entry_t kext_identifier_substring_subs[] = { { "Bluetooth", 'B' }, { "Intel", 'I' }, - { (char *)0, '\0' } + { (char *)NULL, '\0' } }; __END_DECLS diff --git a/libkern/libkern/kext_request_keys.h b/libkern/libkern/kext_request_keys.h index 0cd79f5ff..fa1697ac0 100644 --- a/libkern/libkern/kext_request_keys.h +++ b/libkern/libkern/kext_request_keys.h @@ -269,6 +269,16 @@ extern "C" { */ #define kKextRequestPredicateRequestKextdExit "Kextd Exit" +/* Predicate: Dext Daemon Launch + * Argument: kKextRequestArgumentBundleIdentifierKey + * Argument: IOUserServerName + * Response: Asynchronous via a DriverKit daemon checking in + * Op result: OSReturn indicating result (see OSKextLib.h) + * + * Requests kextd to launch a driver extension userspace daemon. + */ +#define kKextRequestPredicateRequestDaemonLaunch "Dext Daemon Launch" + #if PRAGMA_MARK /********************************************************************/ #pragma mark - @@ -436,6 +446,30 @@ extern "C" { */ #define kKextRequestArgumentTerminateIOServicesKey "Terminate IOServices" +#if PRAGMA_MARK +#pragma mark Daemon Launch Request Arguments +#endif + +/* Argument: Server tag + * Type: Integer + * Default: N/A + * + * A DriverKit daemon launch request must include a "server tag" that + * is unique to every launch request. Userspace daemons include this + * tag in their messages when attempting to rendez-vous with IOKit. + */ +#define kKextRequestArgumentDriverExtensionServerTag "Driver Extension Server Tag" + +/* Argument: Server name + * Type: String + * Default: N/A + * + * A DriverKit daemon launch request must include a "server name" that + * can be used to identify what personality the driver is matching on. + * This name is also used for the launchd service name of the daemon. + */ +#define kKextRequestArgumentDriverExtensionServerName "Driver Extension Server Name" + #if PRAGMA_MARK #pragma mark Internal Tracking Properties #endif diff --git a/libkern/libkern/machine/Makefile b/libkern/libkern/machine/Makefile index 3e9849371..deaf2e376 100644 --- a/libkern/libkern/machine/Makefile +++ b/libkern/libkern/machine/Makefile @@ -9,8 +9,15 @@ include $(MakeInc_def) DATAFILES = \ OSByteOrder.h +DRIVERKIT_DATAFILES = \ + OSByteOrder.h + INSTALL_MI_LIST = ${DATAFILES} +INSTALL_DRIVERKIT_MI_LIST = ${DRIVERKIT_DATAFILES} + +DRIVERKITINCDIR = $(DRIVERKITSDKHEADERSROOT)/usr/local/include + INSTALL_MI_DIR = libkern/machine EXPORT_MI_LIST = ${DATAFILES} diff --git a/libkern/libkern/stack_protector.h b/libkern/libkern/stack_protector.h index 2ffcbabca..443d20a3d 100644 --- a/libkern/libkern/stack_protector.h +++ b/libkern/libkern/stack_protector.h @@ -33,6 +33,7 @@ extern unsigned long __stack_chk_guard; /* Called as a consequence on stack corruption */ +__attribute__((noreturn)) extern void __stack_chk_fail(void); #endif // _STACK_PROTECTOR_H_ diff --git a/libkern/libkern/tree.h b/libkern/libkern/tree.h index 15b663639..5cf38cbc0 100644 --- a/libkern/libkern/tree.h +++ b/libkern/libkern/tree.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2009-2010 Apple Inc. All rights reserved. + * Copyright (c) 2009-2019 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -408,6 +408,7 @@ void name##_RB_REMOVE_COLOR(struct name *, struct type *, struct type *);\ struct type *name##_RB_REMOVE(struct name *, struct type *); \ struct type *name##_RB_INSERT(struct name *, struct type *); \ struct type *name##_RB_FIND(struct name *, struct type *); \ +struct type *name##_RB_NFIND(struct name *, struct type *); \ struct type *name##_RB_NEXT(struct type *); \ struct type *name##_RB_MINMAX(struct name *, int); \ struct type *name##_RB_GETPARENT(struct type*); \ @@ -422,12 +423,13 @@ _sc_ void name##_RB_REMOVE_COLOR(struct name *, struct type *, struct type *); \ _sc_ struct type *name##_RB_REMOVE(struct name *, struct type *); \ _sc_ struct type *name##_RB_INSERT(struct name *, struct type *); \ _sc_ struct type *name##_RB_FIND(struct name *, struct type *); \ +_sc_ struct type *name##_RB_NFIND(struct name *, struct type *); \ _sc_ struct type *name##_RB_NEXT(struct type *); \ _sc_ struct type *name##_RB_MINMAX(struct name *, int); \ _sc_ struct type *name##_RB_GETPARENT(struct type*); \ _sc_ struct type *name##_RB_SETPARENT(struct type*, struct type*); \ _sc_ int name##_RB_GETCOLOR(struct type*); \ -_sc_ void name##_RB_SETCOLOR(struct type*,int); +_sc_ void name##_RB_SETCOLOR(struct type*,int) /* Main rb operation. @@ -698,6 +700,28 @@ name##_RB_FIND(struct name *head, struct type *elm) \ return (NULL); \ } \ \ +/* Finds the first node greater than or equal to the search key */ \ +__attribute__((unused)) \ +struct type * \ +name##_RB_NFIND(struct name *head, struct type *elm) \ +{ \ + struct type *tmp = RB_ROOT(head); \ + struct type *res = NULL; \ + int comp; \ + while (tmp) { \ + comp = cmp(elm, tmp); \ + if (comp < 0) { \ + res = tmp; \ + tmp = RB_LEFT(tmp, field); \ + } \ + else if (comp > 0) \ + tmp = RB_RIGHT(tmp, field); \ + else \ + return (tmp); \ + } \ + return (res); \ +} \ + \ /* ARGSUSED */ \ struct type * \ name##_RB_NEXT(struct type *elm) \ @@ -742,11 +766,11 @@ struct type *name##_RB_PREV(struct type *); #define RB_PROTOTYPE_SC_PREV(_sc_, name, type, field, cmp) \ - RB_PROTOTYPE_SC(_sc_, name, type, field, cmp) \ -_sc_ struct type *name##_RB_PREV(struct type *); + RB_PROTOTYPE_SC(_sc_, name, type, field, cmp); \ +_sc_ struct type *name##_RB_PREV(struct type *) #define RB_GENERATE_PREV(name, type, field, cmp) \ - RB_GENERATE(name, type, field, cmp) \ + RB_GENERATE(name, type, field, cmp); \ struct type * \ name##_RB_PREV(struct type *elm) \ { \ @@ -774,6 +798,7 @@ name##_RB_PREV(struct type *elm) \ #define RB_INSERT(name, x, y) name##_RB_INSERT(x, y) #define RB_REMOVE(name, x, y) name##_RB_REMOVE(x, y) #define RB_FIND(name, x, y) name##_RB_FIND(x, y) +#define RB_NFIND(name, x, y) name##_RB_NFIND(x, y) #define RB_NEXT(name, x, y) name##_RB_NEXT(y) #define RB_PREV(name, x, y) name##_RB_PREV(y) #define RB_MIN(name, x) name##_RB_MINMAX(x, RB_NEGINF) diff --git a/libkern/os/Makefile b/libkern/os/Makefile index 5db93b07d..26c29df2c 100644 --- a/libkern/os/Makefile +++ b/libkern/os/Makefile @@ -6,26 +6,36 @@ export MakeInc_dir=${SRCROOT}/makedefs/MakeInc.dir include $(MakeInc_cmd) include $(MakeInc_def) -LCLDIR = /usr/local/include - KERNELFILES = \ base.h \ object.h \ log.h \ trace.h \ - overflow.h + overflow.h \ + smart_ptr.h \ + cpp_util.h PRIVATE_KERNELFILES = \ hash.h \ object_private.h \ + ptrtools.h \ reason_private.h \ - refcnt.h + refcnt.h \ + refcnt_internal.h + +DATAFILES = \ + overflow.h + +DRIVERKIT_DATAFILES = \ + base.h \ + overflow.h PRIVATE_DATAFILES = \ reason_private.h -INSTALL_MI_LIST = \ - overflow.h +INSTALL_MI_LIST = ${DATAFILES} + +INSTALL_DRIVERKIT_MI_LIST = ${DRIVERKIT_DATAFILES} INSTALL_MI_DIR = os diff --git a/libkern/os/base.h b/libkern/os/base.h index 62b98b453..bea2772a4 100644 --- a/libkern/os/base.h +++ b/libkern/os/base.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2008-2016 Apple Inc. All rights reserved. + * Copyright (c) 2008-2013 Apple Inc. All rights reserved. * * @APPLE_APACHE_LICENSE_HEADER_START@ * @@ -122,6 +122,12 @@ #define OS_NOESCAPE #endif +#if defined(__cplusplus) && defined(__clang__) +#define OS_FALLTHROUGH [[clang::fallthrough]] +#else +#define OS_FALLTHROUGH +#endif + #if __has_feature(assume_nonnull) #define OS_ASSUME_NONNULL_BEGIN _Pragma("clang assume_nonnull begin") #define OS_ASSUME_NONNULL_END _Pragma("clang assume_nonnull end") @@ -142,13 +148,86 @@ #define OS_OVERLOADABLE #endif -#if __has_feature(objc_fixed_enum) || __has_extension(cxx_strong_enums) +#if __has_attribute(enum_extensibility) +#define __OS_ENUM_ATTR __attribute__((enum_extensibility(open))) +#define __OS_ENUM_ATTR_CLOSED __attribute__((enum_extensibility(closed))) +#else +#define __OS_ENUM_ATTR +#define __OS_ENUM_ATTR_CLOSED +#endif // __has_attribute(enum_extensibility) + +#if __has_attribute(flag_enum) +/*! + * Compile with -Wflag-enum and -Wassign-enum to enforce at definition and + * assignment, respectively, i.e. -Wflag-enum prevents you from creating new + * enumeration values from illegal values within the enum definition, and + * -Wassign-enum prevents you from assigning illegal values to a variable of the + * enum type. + */ +#define __OS_OPTIONS_ATTR __attribute__((flag_enum)) +#else +#define __OS_OPTIONS_ATTR +#endif // __has_attribute(flag_enum) + +#if __has_feature(objc_fixed_enum) || __has_extension(cxx_fixed_enum) || \ + __has_extension(cxx_strong_enums) #define OS_ENUM(_name, _type, ...) \ typedef enum : _type { __VA_ARGS__ } _name##_t +#define OS_CLOSED_ENUM(_name, _type, ...) \ + typedef enum : _type { __VA_ARGS__ } \ + __OS_ENUM_ATTR_CLOSED _name##_t +#define OS_OPTIONS(_name, _type, ...) \ + typedef enum : _type { __VA_ARGS__ } \ + __OS_ENUM_ATTR __OS_OPTIONS_ATTR _name##_t +#define OS_CLOSED_OPTIONS(_name, _type, ...) \ + typedef enum : _type { __VA_ARGS__ } \ + __OS_ENUM_ATTR_CLOSED __OS_OPTIONS_ATTR _name##_t #else +/*! + * There is unfortunately no good way in plain C to have both fixed-type enums + * and enforcement for clang's enum_extensibility extensions. The primary goal + * of these macros is to allow you to define an enum and specify its width in a + * single statement, and for plain C that is accomplished by defining an + * anonymous enum and then separately typedef'ing the requested type name to the + * requested underlying integer type. So the type emitted actually has no + * relationship at all to the enum, and therefore while the compiler could + * enforce enum extensibility if you used the enum type, it cannot do so if you + * use the "_t" type resulting from this expression. + * + * But we still define a named enum type and decorate it appropriately for you, + * so if you really want the enum extensibility enforcement, you can use the + * enum type yourself, i.e. when compiling with a C compiler: + * + * OS_CLOSED_ENUM(my_type, uint64_t, + * FOO, + * BAR, + * BAZ, + * ); + * + * my_type_t mt = 98; // legal + * enum my_type emt = 98; // illegal + * + * But be aware that the underlying enum type's width is subject only to the C + * language's guarantees -- namely that it will be compatible with int, char, + * and unsigned char. It is not safe to rely on the size of this type. + * + * When compiling in ObjC or C++, both of the above assignments are illegal. + */ +#define __OS_ENUM_C_FALLBACK(_name, _type, ...) \ + typedef _type _name##_t; enum _name { __VA_ARGS__ } + #define OS_ENUM(_name, _type, ...) \ - enum { __VA_ARGS__ }; typedef _type _name##_t -#endif + typedef _type _name##_t; enum { __VA_ARGS__ } +#define OS_CLOSED_ENUM(_name, _type, ...) \ + __OS_ENUM_C_FALLBACK(_name, _type, ## __VA_ARGS__) \ + __OS_ENUM_ATTR_CLOSED +#define OS_OPTIONS(_name, _type, ...) \ + __OS_ENUM_C_FALLBACK(_name, _type, ## __VA_ARGS__) \ + __OS_ENUM_ATTR __OS_OPTIONS_ATTR +#define OS_CLOSED_OPTIONS(_name, _type, ...) \ + __OS_ENUM_C_FALLBACK(_name, _type, ## __VA_ARGS__) \ + __OS_ENUM_ATTR_CLOSED __OS_OPTIONS_ATTR +#endif // __has_feature(objc_fixed_enum) || __has_extension(cxx_strong_enums) #if __has_feature(attribute_availability_swift) // equivalent to __SWIFT_UNAVAILABLE from Availability.h @@ -178,12 +257,16 @@ #ifdef __GNUC__ #define os_prevent_tail_call_optimization() __asm__("") #define os_is_compile_time_constant(expr) __builtin_constant_p(expr) +#ifndef KERNEL #define os_compiler_barrier() __asm__ __volatile__("" ::: "memory") +#endif #else #define os_prevent_tail_call_optimization() do { } while (0) #define os_is_compile_time_constant(expr) 0 +#ifndef KERNEL #define os_compiler_barrier() do { } while (0) #endif +#endif #if __has_attribute(not_tail_called) #define OS_NOT_TAIL_CALLED __attribute__((__not_tail_called__)) @@ -191,6 +274,18 @@ #define OS_NOT_TAIL_CALLED #endif +/* + * LIBKERN_ALWAYS_DESTROY attribute can be applied to global variables with + * destructors. It specifies that and object should have its exit-time + * destructor run. This attribute is the default unless clang was invoked with + * -fno-c++-static-destructors. + */ +#if __has_attribute(always_destroy) +#define LIBKERN_ALWAYS_DESTROY __attribute__((always_destroy)) +#else +#define LIBKERN_ALWAYS_DESTROY +#endif + typedef void (*os_function_t)(void *_Nullable); #ifdef __BLOCKS__ diff --git a/libkern/os/cpp_util.h b/libkern/os/cpp_util.h new file mode 100644 index 000000000..dc7236bff --- /dev/null +++ b/libkern/os/cpp_util.h @@ -0,0 +1,49 @@ +#ifndef _OS_CPP_UTIL_H +#define _OS_CPP_UTIL_H + +#include + +#if __has_feature(cxx_nullptr) && __has_feature(cxx_decltype) +# define OS_HAS_NULLPTR 1 +#endif + +#if __has_feature(cxx_rvalue_references) || __has_extension(cxx_rvalue_references) +# define OS_HAS_RVALUE_REFERENCES 1 +#endif + +namespace os { +#if OS_HAS_NULLPTR +typedef decltype(nullptr) nullptr_t; +#endif + +/* + * Reference removal + */ + +template struct remove_reference {typedef _T type;}; +template struct remove_reference<_T&> {typedef _T type;}; +template struct remove_reference<_T &&> {typedef _T type;}; +template using remove_reference_t = typename remove_reference<_T>::type; + +/* + * Const removal + */ + +template struct remove_const {typedef _T type;}; +template struct remove_const {typedef _T type;}; +template using remove_const_t = typename remove_const<_T>::type; + +/* + * Move + */ + +template +inline typename remove_reference<_T>::type && +move(_T && _t) +{ + typedef typename os::remove_reference<_T>::type _U; + return static_cast<_U &&>(_t); +} +} + +#endif /* _OS_CPP_UTIL_H */ diff --git a/libkern/os/log.c b/libkern/os/log.c index 8a64d5e32..b01f58322 100644 --- a/libkern/os/log.c +++ b/libkern/os/log.c @@ -1,3 +1,5 @@ +/* * Copyright (c) 2019 Apple Inc. All rights reserved. */ + #include #undef offset @@ -23,6 +25,7 @@ #include #include #include +#include #include #include @@ -56,7 +59,7 @@ extern void bsd_log_lock(void); extern void bsd_log_unlock(void); extern void logwakeup(struct msgbuf *); -decl_lck_spin_data(extern, oslog_stream_lock) +decl_lck_spin_data(extern, oslog_stream_lock); #define stream_lock() lck_spin_lock(&oslog_stream_lock) #define stream_unlock() lck_spin_unlock(&oslog_stream_lock) @@ -106,19 +109,19 @@ oslog_stream_create_buf_entry(oslog_stream_link_type_t type, firehose_tracepoint static void _os_log_with_args_internal(os_log_t oslog __unused, os_log_type_t type __unused, - const char *format, va_list args, void *addr, void *dso); + const char *format, va_list args, void *addr, void *dso, bool driverKit, bool addcr); static void -_os_log_to_msgbuf_internal(const char *format, va_list args, bool safe, bool logging); +_os_log_to_msgbuf_internal(const char *format, va_list args, bool safe, bool logging, bool addcr); static void _os_log_to_log_internal(os_log_t oslog, os_log_type_t type, - const char *format, va_list args, void *addr, void *dso); + const char *format, va_list args, void *addr, void *dso, bool driverKit); static void _os_log_actual(os_log_t oslog, os_log_type_t type, const char *format, void - *dso, void *addr, os_log_buffer_context_t context); + *dso, void *addr, os_log_buffer_context_t context, bool driverKit); bool os_log_info_enabled(os_log_t log __unused) @@ -152,13 +155,63 @@ _os_log_internal(void *dso, os_log_t log, uint8_t type, const char *message, ... va_start(args, message); - _os_log_with_args_internal(log, type, message, args, addr, dso); + _os_log_with_args_internal(log, type, message, args, addr, dso, FALSE, FALSE); va_end(args); return; } +__attribute__((noinline, not_tail_called)) int +_os_log_internal_driverKit(void *dso, os_log_t log, uint8_t type, const char *message, ...) +{ + va_list args; + void *addr = __builtin_return_address(0); + bool driverKitLog = FALSE; + + /* + * We want to be able to identify dexts from the logs. + * + * Usually the addr is used to understand if the log line + * was generated by a kext or the kernel main executable. + * Logd uses copyKextUUIDForAddress with the addr specified + * in the log line to retrieve the kext UUID of the sender. + * + * Dext however are not loaded in kernel space so they do not + * have a kernel range of addresses. + * + * To make the same mechanism work, OSKext fakes a kernel + * address range for dexts using the loadTag, + * so we just need to use the loadTag as addr here + * to allow logd to retrieve the correct UUID. + * + * NOTE: loadTag is populated in the task when the dext is matching, + * so if log lines are generated before the matching they will be + * identified as kernel main executable. + */ + task_t self_task = current_task(); + + /* + * Only dextis are supposed to use this log path. + */ + if (!task_is_driver(self_task)) { + return EPERM; + } + + uint64_t loadTag = get_task_loadTag(self_task); + if (loadTag != 0) { + driverKitLog = TRUE; + addr = (void*) loadTag; + } + va_start(args, message); + + _os_log_with_args_internal(log, type, message, args, addr, dso, driverKitLog, true); + + va_end(args); + + return 0; +} + #pragma mark - shim functions __attribute__((noinline, not_tail_called)) void @@ -169,12 +222,12 @@ os_log_with_args(os_log_t oslog, os_log_type_t type, const char *format, va_list addr = __builtin_return_address(0); } - _os_log_with_args_internal(oslog, type, format, args, addr, NULL); + _os_log_with_args_internal(oslog, type, format, args, addr, NULL, FALSE, FALSE); } static void _os_log_with_args_internal(os_log_t oslog, os_log_type_t type, - const char *format, va_list args, void *addr, void *dso) + const char *format, va_list args, void *addr, void *dso, bool driverKit, bool addcr) { uint32_t logging_config = atm_get_diagnostic_config(); boolean_t safe; @@ -194,16 +247,16 @@ _os_log_with_args_internal(os_log_t oslog, os_log_type_t type, } if (oslog != &_os_log_replay) { - _os_log_to_msgbuf_internal(format, args, safe, logging); + _os_log_to_msgbuf_internal(format, args, safe, logging, addcr); } if (safe && logging) { - _os_log_to_log_internal(oslog, type, format, args, addr, dso); + _os_log_to_log_internal(oslog, type, format, args, addr, dso, driverKit); } } static void -_os_log_to_msgbuf_internal(const char *format, va_list args, bool safe, bool logging) +_os_log_to_msgbuf_internal(const char *format, va_list args, bool safe, bool logging, bool addcr) { static int msgbufreplay = -1; va_list args_copy; @@ -279,7 +332,7 @@ _os_log_to_msgbuf_internal(const char *format, va_list args, bool safe, bool log } va_copy(args_copy, args); - vprintf_log_locked(format, args_copy); + vprintf_log_locked(format, args_copy, addcr); va_end(args_copy); #if DEVELOPMENT || DEBUG @@ -297,7 +350,7 @@ _os_log_to_msgbuf_internal(const char *format, va_list args, bool safe, bool log static void _os_log_to_log_internal(os_log_t oslog, os_log_type_t type, - const char *format, va_list args, void *addr, void *dso) + const char *format, va_list args, void *addr, void *dso, bool driverKit) { struct os_log_buffer_context_s context; unsigned char buffer_data[OS_LOG_BUFFER_MAX_SIZE] __attribute__((aligned(8))); @@ -322,10 +375,11 @@ _os_log_to_log_internal(os_log_t oslog, os_log_type_t type, if (!_os_trace_addr_in_text_segment(dso, format)) { return; } - - void *dso_addr = (void *) OSKextKextForAddress(addr); - if (dso != dso_addr) { - return; + if (!driverKit) { + void *dso_addr = (void *) OSKextKextForAddress(addr); + if (dso != dso_addr) { + return; + } } #endif /* FIREHOSE_USES_SHARED_CACHE */ @@ -340,11 +394,11 @@ _os_log_to_log_internal(os_log_t oslog, os_log_type_t type, va_copy(args_copy, args); - (void)hw_atomic_add(&oslog_p_total_msgcount, 1); + os_atomic_inc(&oslog_p_total_msgcount, relaxed); if (_os_log_encode(format, args_copy, 0, &context)) { - _os_log_actual(oslog, type, format, dso, addr, &context); + _os_log_actual(oslog, type, format, dso, addr, &context, driverKit); } else { - (void)hw_atomic_add(&oslog_p_error_count, 1); + os_atomic_inc(&oslog_p_error_count, relaxed); } va_end(args_copy); @@ -352,26 +406,37 @@ _os_log_to_log_internal(os_log_t oslog, os_log_type_t type, static inline size_t _os_trace_write_location_for_address(uint8_t buf[static sizeof(uint64_t)], - void *dso, const void *address, firehose_tracepoint_flags_t *flags) + void *dso, const void *address, firehose_tracepoint_flags_t *flags, __unused bool driverKit) { + uintptr_t shift_addr = (uintptr_t)address - (uintptr_t)dso; #if FIREHOSE_USES_SHARED_CACHE + *flags = _firehose_tracepoint_flags_pc_style_shared_cache; - memcpy(buf, (uint32_t[]){ (uintptr_t)address - (uintptr_t)dso }, + memcpy(buf, (uint32_t[]){ shift_addr }, sizeof(uint32_t)); return sizeof(uint32_t); #else /* FIREHOSE_USES_SHARED_CACHE */ kernel_mach_header_t *mh = dso; - if (mh->filetype == MH_EXECUTE) { + /* + * driverKit will have the dso set as MH_EXECUTE + * (it is logging from a syscall in the kernel) + * but needs logd to parse the address as an + * absolute pc. + */ + if (mh->filetype == MH_EXECUTE && !driverKit) { *flags = _firehose_tracepoint_flags_pc_style_main_exe; - - memcpy(buf, (uint32_t[]){ (uintptr_t)address - (uintptr_t)dso }, - sizeof(uint32_t)); + memcpy(buf, (uint32_t[]){ shift_addr}, sizeof(uint32_t)); return sizeof(uint32_t); } else { *flags = _firehose_tracepoint_flags_pc_style_absolute; - memcpy(buf, (uintptr_t[]){ VM_KERNEL_UNSLIDE(address) }, sizeof(uintptr_t)); + if (!driverKit) { + shift_addr = VM_KERNEL_UNSLIDE(address); + } else { + shift_addr = (uintptr_t) address; + } + memcpy(buf, (uintptr_t[]){ shift_addr }, sizeof(uintptr_t)); #if __LP64__ return 6; // 48 bits are enough #else @@ -402,7 +467,7 @@ _os_log_buffer_pack(uint8_t *buffdata, size_t buffdata_sz, static void _os_log_actual(os_log_t oslog __unused, os_log_type_t type, const char *format, - void *dso, void *addr, os_log_buffer_context_t context) + void *dso, void *addr, os_log_buffer_context_t context, bool driverKit) { firehose_stream_t stream; firehose_tracepoint_flags_t flags = 0; @@ -413,7 +478,7 @@ _os_log_actual(os_log_t oslog __unused, os_log_type_t type, const char *format, uint64_t thread_id; // dso == the start of the binary that was loaded - addr_len = _os_trace_write_location_for_address(buffdata, dso, addr, &flags); + addr_len = _os_trace_write_location_for_address(buffdata, dso, addr, &flags, driverKit); buffdata_sz = _os_log_buffer_pack(buffdata + addr_len, sizeof(buffdata) - addr_len, context); if (buffdata_sz == 0) { @@ -424,9 +489,16 @@ _os_log_actual(os_log_t oslog __unused, os_log_type_t type, const char *format, timestamp = firehose_tracepoint_time(firehose_activity_flags_default); thread_id = thread_tid(current_thread()); - // create trace_id after we've set additional flags - trace_id.ftid_value = FIREHOSE_TRACE_ID_MAKE(firehose_tracepoint_namespace_log, - type, flags, _os_trace_offset(dso, format, flags)); + if (driverKit) { + // set FIREHOSE_TRACEPOINT_PC_DYNAMIC_BIT so logd will not try to find the format string in + // the executable text + trace_id.ftid_value = FIREHOSE_TRACE_ID_MAKE(firehose_tracepoint_namespace_log, + type, flags, (uintptr_t) addr | FIREHOSE_TRACEPOINT_PC_DYNAMIC_BIT); + } else { + // create trace_id after we've set additional flags + trace_id.ftid_value = FIREHOSE_TRACE_ID_MAKE(firehose_tracepoint_namespace_log, + type, flags, _os_trace_offset(dso, format, (_firehose_tracepoint_flags_activity_t)flags)); + } if (FALSE) { firehose_debug_trace(stream, trace_id.ftid_value, timestamp, @@ -452,7 +524,7 @@ _firehose_trace(firehose_stream_t stream, firehose_tracepoint_id_u ftid, if (slowpath(ft_size + publen > _firehose_chunk_payload_size)) { // We'll need to have some handling here. For now - return 0 - (void)hw_atomic_add(&oslog_p_error_count, 1); + os_atomic_inc(&oslog_p_error_count, relaxed); return 0; } @@ -474,11 +546,11 @@ out: if (!fastpath(ft)) { if (oslog_boot_done) { if (stream == firehose_stream_metadata) { - (void)hw_atomic_add(&oslog_p_metadata_dropped_msgcount, 1); + os_atomic_inc(&oslog_p_metadata_dropped_msgcount, relaxed); } else { // If we run out of space in the persistence buffer we're // dropping the message. - (void)hw_atomic_add(&oslog_p_dropped_msgcount, 1); + os_atomic_inc(&oslog_p_dropped_msgcount, relaxed); } return 0; } @@ -489,7 +561,7 @@ out: offset = firehose_chunk_tracepoint_try_reserve(fbc, stamp, firehose_stream_persist, 0, publen, 0, NULL); if (offset <= 0) { - (void)hw_atomic_add(&oslog_p_boot_dropped_msgcount, 1); + os_atomic_inc(&oslog_p_boot_dropped_msgcount, relaxed); return 0; } @@ -497,7 +569,7 @@ out: thread_tid(current_thread()), offset); memcpy(ft->ft_data, pubdata, publen); firehose_chunk_tracepoint_end(fbc, ft, ftid); - (void)hw_atomic_add(&oslog_p_saved_msgcount, 1); + os_atomic_inc(&oslog_p_saved_msgcount, relaxed); return ftid.ftid_value; } if (!oslog_boot_done) { @@ -507,9 +579,9 @@ out: __firehose_buffer_tracepoint_flush(ft, ftid); if (stream == firehose_stream_metadata) { - (void)hw_atomic_add(&oslog_p_metadata_saved_msgcount, 1); + os_atomic_inc(&oslog_p_metadata_saved_msgcount, relaxed); } else { - (void)hw_atomic_add(&oslog_p_saved_msgcount, 1); + os_atomic_inc(&oslog_p_saved_msgcount, relaxed); } return ftid.ftid_value; } @@ -567,7 +639,7 @@ firehose_trace_metadata(firehose_stream_t stream, firehose_tracepoint_id_u ftid, m_entry = oslog_stream_create_buf_entry(oslog_stream_link_type_metadata, ftid, stamp, pubdata, publen); if (!m_entry) { - (void)hw_atomic_add(&oslog_s_error_count, 1); + os_atomic_inc(&oslog_s_error_count, relaxed); goto finish; } @@ -796,7 +868,7 @@ _test_log_loop(void * arg __unused, wait_result_t wres __unused) { uint32_t uniqid = RandomULong(); test_oslog_debug_helper(uniqid, 100); - (void)hw_atomic_add(&_test_log_loop_count, 100); + os_atomic_add(&_test_log_loop_count, 100, relaxed); } kern_return_t diff --git a/libkern/os/log.h b/libkern/os/log.h index 8b58e484e..2972daca7 100644 --- a/libkern/os/log.h +++ b/libkern/os/log.h @@ -53,6 +53,16 @@ extern bool startup_serial_logging_active; extern uint64_t startup_serial_num_procs; #endif /* XNU_KERNEL_PRIVATE */ +#ifdef KERNEL +#define OS_LOG_BUFFER_MAX_SIZE 256 +#else +#define OS_LOG_BUFFER_MAX_SIZE 1024 +#endif + +// The OS_LOG_BUFFER_MAX_SIZE limit includes the metadata that +// must be included in the os_log firehose buffer +#define OS_LOG_DATA_MAX_SIZE (OS_LOG_BUFFER_MAX_SIZE - 16) + OS_ALWAYS_INLINE static inline void _os_log_verify_format_str(__unused const char *msg, ...) __attribute__((format(os_log, 1, 2))); OS_ALWAYS_INLINE static inline void _os_log_verify_format_str(__unused const char *msg, ...) /* placeholder */ @@ -452,6 +462,38 @@ os_log_debug_enabled(os_log_t log); __asm__(""); /* avoid tailcall */ \ }) +/*! + * @function os_log_driverKit + * + * @abstract + * Log a message using a specific type. This variant should be called only from dexts. + * + * @discussion + * Will log a message with the provided os_log_type_t. + * + * @param log + * Pass OS_LOG_DEFAULT or a log object previously created with os_log_create. + * + * @param type + * Pass a valid type from os_log_type_t. + * + * @param format + * A format string to generate a human-readable log message when the log + * line is decoded. This string must be a constant string, not dynamically + * generated. Supports all standard printf types and %@ (objects). + * + * @result + * Returns EPERM if the caller is not a driverKit process, 0 in case of success. + */ +#define os_log_driverKit(out, log, type, format, ...) __extension__({ \ + _Static_assert(__builtin_constant_p(format), "format string must be constant"); \ + __attribute__((section("__TEXT,__os_log"))) static const char _os_log_fmt[] = format; \ + _os_log_verify_format_str(format, ##__VA_ARGS__); \ + (*(out)) = _os_log_internal_driverKit(&__dso_handle, log, type, _os_log_fmt, ##__VA_ARGS__); \ + __asm__(""); /* avoid tailcall */ \ +}) + + /*! * @function os_log_sensitive_debug * @@ -511,6 +553,16 @@ OS_EXPORT OS_NOTHROW void _os_log_internal(void *dso, os_log_t log, os_log_type_t type, const char *message, ...); +/*! + * @function _os_log_internal_driverKit + * + * @abstract + * Internal function used by macros. + */ +__WATCHOS_AVAILABLE(6.0) __OSX_AVAILABLE(10.15) __IOS_AVAILABLE(13.0) __TVOS_AVAILABLE(13.0) +OS_EXPORT OS_NOTHROW +int +_os_log_internal_driverKit(void *dso, os_log_t log, os_log_type_t type, const char *message, ...); __END_DECLS #endif /* __os_log_h */ diff --git a/libkern/os/log_encode_types.h b/libkern/os/log_encode_types.h index e07364752..ac4b44bdb 100644 --- a/libkern/os/log_encode_types.h +++ b/libkern/os/log_encode_types.h @@ -118,11 +118,6 @@ typedef struct os_log_buffer_value_s { typedef struct os_log_buffer_s { #define OS_LOG_BUFFER_HAS_PRIVATE 0x1 #define OS_LOG_BUFFER_HAS_NON_SCALAR 0x2 -#ifdef KERNEL -#define OS_LOG_BUFFER_MAX_SIZE 256 -#else -#define OS_LOG_BUFFER_MAX_SIZE 1024 -#endif uint8_t flags; uint8_t arg_cnt; uint8_t content[]; diff --git a/libkern/os/overflow.h b/libkern/os/overflow.h index abf04917a..f00a6024f 100644 --- a/libkern/os/overflow.h +++ b/libkern/os/overflow.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015 Apple Inc. All rights reserved. + * Copyright (c) 2015-2018 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -60,66 +60,7 @@ __os_warn_unused(__const bool x) #define os_mul_overflow(a, b, res) __os_warn_unused(__builtin_mul_overflow((a), (b), (res))) #else - -/* compile-time assertion that 'x' and 'y' are equivalent types */ -#ifdef __cplusplus -#define __OS_TYPE_CHECK(x, y) do { \ - __typeof__(x) _x; \ - __typeof__(y) _y; \ - (void)(&_x == &_y, "overflow arithmetic: incompatible types"); \ -} while (0) -#else -#define __OS_TYPE_CHECK(x, y) do { \ - _Static_assert(__builtin_types_compatible_p(__typeof(x),__typeof(y)), \ - "overflow arithmetic: incompatible types"); \ -} while (0) -#endif - -#define __os_add_overflow_func(T, U, V) _Generic((T),\ - unsigned: __builtin_uadd_overflow, \ - unsigned long: __builtin_uaddl_overflow, \ - unsigned long long: __builtin_uaddll_overflow, \ - int: __builtin_sadd_overflow, \ - long: __builtin_saddl_overflow, \ - long long: __builtin_saddll_overflow \ - )(T,U,V) - -#define __os_sub_overflow_func(T, U, V) _Generic((T),\ - unsigned: __builtin_usub_overflow, \ - unsigned long: __builtin_usubl_overflow, \ - unsigned long long: __builtin_usubll_overflow, \ - int: __builtin_ssub_overflow, \ - long: __builtin_ssubl_overflow, \ - long long: __builtin_ssubll_overflow \ - )(T,U,V) - -#define __os_mul_overflow_func(T, U, V) _Generic((T),\ - unsigned: __builtin_umul_overflow, \ - unsigned long: __builtin_umull_overflow, \ - unsigned long long: __builtin_umulll_overflow, \ - int: __builtin_smul_overflow, \ - long: __builtin_smull_overflow, \ - long long: __builtin_smulll_overflow \ - )(T,U,V) - -#define os_add_overflow(a, b, res) __os_warn_unused(__extension__({ \ - __OS_TYPE_CHECK((a), (b)); \ - __OS_TYPE_CHECK((b), *(res)); \ - __os_add_overflow_func((a), (b), (res)); \ -})) - -#define os_sub_overflow(a, b, res) __os_warn_unused(__extension__({ \ - __OS_TYPE_CHECK((a), (b)); \ - __OS_TYPE_CHECK((b), *(res)); \ - __os_sub_overflow_func((a), (b), (res)); \ -})) - -#define os_mul_overflow(a, b, res) __os_warn_unused(__extension__({ \ - __OS_TYPE_CHECK((a), (b)); \ - __OS_TYPE_CHECK((b), *(res)); \ - __os_mul_overflow_func((a), (b), (res)); \ -})) - +# error os_overflow expects type-generic builtins #endif /* __has_builtin(...) */ /* os_add3_overflow(a, b, c) -> (a + b + c) */ @@ -158,6 +99,20 @@ __os_warn_unused(__const bool x) _s | _t; \ })) +/* os_convert_overflow(a) -> a [converted to the result type] */ #define os_convert_overflow(a, res) os_add_overflow((a), 0, (res)) +/* os_inc_overflow(res) -> *res += 1 */ +#define os_inc_overflow(res) __os_warn_unused(__extension__({ \ + __typeof((res)) _tmp = (res); \ + os_add_overflow(*_tmp, 1, _tmp); \ +})) + +/* os_dec_overflow(res) -> *res -= 1 */ +#define os_dec_overflow(res) __os_warn_unused(__extension__({ \ + __typeof((res)) _tmp = (res); \ + os_sub_overflow(*_tmp, 1, _tmp); \ +})) + + #endif /* _OS_OVERFLOW_H */ diff --git a/libkern/os/ptrtools.h b/libkern/os/ptrtools.h new file mode 100644 index 000000000..9aaf43633 --- /dev/null +++ b/libkern/os/ptrtools.h @@ -0,0 +1,39 @@ +/* * Copyright (c) 2018 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#ifndef _OS_PTRTOOLS_H +#define _OS_PTRTOOLS_H + +/* dereference unaligned pointer 'p' */ +#define os_unaligned_deref(p) ((__os_unaligned_type(p))(p))->val + +/* ensure the compiler emits at most one access to 'val' */ +#define os_access_once(val) (*((volatile __typeof__((val)) *)&(val))) + +#define __os_unaligned_type(p) struct { __typeof__(*(p)) val; } __attribute__((packed)) * + +#endif /* _OS_PTRTOOLS_H */ diff --git a/libkern/os/reason_private.h b/libkern/os/reason_private.h index a83940b07..a4b9b1c3f 100644 --- a/libkern/os/reason_private.h +++ b/libkern/os/reason_private.h @@ -53,7 +53,7 @@ OS_ENUM(os_reason_libsystem_code, uint64_t, int os_fault_with_payload(uint32_t reason_namespace, uint64_t reason_code, void *payload, uint32_t payload_size, const char *reason_string, - uint64_t reason_flags); + uint64_t reason_flags) __attribute__((cold)); #endif // !KERNEL diff --git a/libkern/os/refcnt.c b/libkern/os/refcnt.c index 67deb068f..0cbcdf745 100644 --- a/libkern/os/refcnt.c +++ b/libkern/os/refcnt.c @@ -1,16 +1,22 @@ +#if KERNEL #include #include #include #include #include #include +#endif + #include "refcnt.h" #define OS_REFCNT_MAX_COUNT ((os_ref_count_t)0x0FFFFFFFUL) #if OS_REFCNT_DEBUG -os_refgrp_decl(static, global_ref_group, "all", NULL); -static bool ref_debug_enable = false; +extern struct os_refgrp global_ref_group; +os_refgrp_decl(, global_ref_group, "all", NULL); + +extern bool ref_debug_enable; +bool ref_debug_enable = false; static const size_t ref_log_nrecords = 1000000; #define REFLOG_BTDEPTH 10 @@ -22,77 +28,75 @@ static const size_t ref_log_nrecords = 1000000; # define __debug_only __unused #endif /* OS_REFCNT_DEBUG */ -static const char * -ref_grp_name(struct os_refcnt __debug_only *rc) +void +os_ref_panic_live(void *rc) { -#if OS_REFCNT_DEBUG - if (rc && rc->ref_group && rc->ref_group->grp_name) { - return rc->ref_group->grp_name; - } -#endif - return ""; + panic("os_refcnt: unexpected release of final reference (rc=%p)\n", rc); + __builtin_unreachable(); } -__attribute__((cold, noinline, not_tail_called, noreturn)) +__abortlike static void -os_ref_panic_underflow(struct os_refcnt *rc) +os_ref_panic_underflow(void *rc) { - panic("os_refcnt: underflow (rc=%p, grp=%s)\n", rc, ref_grp_name(rc)); + panic("os_refcnt: underflow (rc=%p)\n", rc); __builtin_unreachable(); } -static inline void -os_ref_check_underflow(struct os_refcnt *rc, os_ref_count_t count) +__abortlike +static void +os_ref_panic_resurrection(void *rc) { - if (__improbable(count == 0)) { - os_ref_panic_underflow(rc); - } + panic("os_refcnt: attempted resurrection (rc=%p)\n", rc); + __builtin_unreachable(); } -__attribute__((cold, noinline, not_tail_called, noreturn)) +__abortlike static void -os_ref_panic_resurrection(struct os_refcnt *rc) +os_ref_panic_overflow(void *rc) { - panic("os_refcnt: used unsafely when zero (rc=%p, grp=%s)\n", rc, ref_grp_name(rc)); + panic("os_refcnt: overflow (rc=%p)\n", rc); __builtin_unreachable(); } static inline void -os_ref_assert_referenced(struct os_refcnt *rc, os_ref_count_t count) +os_ref_check_underflow(void *rc, os_ref_count_t count) { if (__improbable(count == 0)) { - os_ref_panic_resurrection(rc); + os_ref_panic_underflow(rc); } } -__attribute__((cold, noinline, not_tail_called, noreturn)) -static void -os_ref_panic_overflow(struct os_refcnt *rc) +static inline void +os_ref_check_overflow(os_ref_atomic_t *rc, os_ref_count_t count) { - panic("os_refcnt: overflow (rc=%p, grp=%s)\n", rc, ref_grp_name(rc)); - __builtin_unreachable(); + if (__improbable(count >= OS_REFCNT_MAX_COUNT)) { + os_ref_panic_overflow(rc); + } } static inline void -os_ref_check_overflow(struct os_refcnt *rc, os_ref_count_t count) +os_ref_assert_referenced(void *rc, os_ref_count_t count) { - if (__improbable(count >= OS_REFCNT_MAX_COUNT)) { - os_ref_panic_overflow(rc); + if (__improbable(count == 0)) { + os_ref_panic_resurrection(rc); } } -static void -os_ref_check_retain(struct os_refcnt *rc, os_ref_count_t count) +static inline void +os_ref_check_retain(os_ref_atomic_t *rc, os_ref_count_t count) { os_ref_assert_referenced(rc, count); os_ref_check_overflow(rc, count); } #if OS_REFCNT_DEBUG +#if KERNEL +__attribute__((cold, noinline)) static void ref_log_op(struct os_refgrp *grp, void *elem, int op) { - if (!ref_debug_enable || grp == NULL) { + if (grp == NULL) { return; } @@ -102,10 +106,11 @@ ref_log_op(struct os_refgrp *grp, void *elem, int op) } uintptr_t bt[REFLOG_BTDEPTH]; - uint32_t nframes = backtrace(bt, REFLOG_BTDEPTH); + uint32_t nframes = backtrace(bt, REFLOG_BTDEPTH, NULL); btlog_add_entry((btlog_t *)grp->grp_log, elem, op, (void **)bt, nframes); } +__attribute__((cold, noinline)) static void ref_log_drop(struct os_refgrp *grp, void *elem) { @@ -121,6 +126,7 @@ ref_log_drop(struct os_refgrp *grp, void *elem) btlog_remove_entries_for_element(grp->grp_log, elem); } +__attribute__((cold, noinline)) static void ref_log_init(struct os_refgrp *grp) { @@ -145,17 +151,30 @@ ref_log_init(struct os_refgrp *grp) if (strcmp(g, grp->grp_name) == 0) { /* enable logging on this refgrp */ grp->grp_log = btlog_create(ref_log_nrecords, REFLOG_BTDEPTH, true); - assert(grp->grp_log); return; } } } +#else + +#ifndef ref_log_init +# define ref_log_init(...) do {} while (0) +#endif +#ifndef ref_log_op +# define ref_log_op(...) do {} while (0) +#endif +#ifndef ref_log_drop +# define ref_log_drop(...) do {} while (0) +#endif + +#endif /* KERNEL */ /* * attach a new refcnt to a group */ +__attribute__((cold, noinline)) static void -ref_attach_to_group(struct os_refcnt *rc, struct os_refgrp *grp, os_ref_count_t init_count) +ref_attach_to_group(os_ref_atomic_t *rc, struct os_refgrp *grp, os_ref_count_t init_count) { if (grp == NULL) { return; @@ -181,7 +200,7 @@ ref_attach_to_group(struct os_refcnt *rc, struct os_refgrp *grp, os_ref_count_t ref_attach_to_group(rc, grp->grp_parent, init_count); } -static inline void +static void ref_retain_group(struct os_refgrp *grp) { if (grp) { @@ -191,7 +210,8 @@ ref_retain_group(struct os_refgrp *grp) } } -static inline void +__attribute__((cold, noinline)) +static void ref_release_group(struct os_refgrp *grp, bool final) { if (grp) { @@ -204,48 +224,57 @@ ref_release_group(struct os_refgrp *grp, bool final) ref_release_group(grp->grp_parent, final); } } -#endif -#undef os_ref_init_count -void -os_ref_init_count(struct os_refcnt *rc, struct os_refgrp __debug_only *grp, os_ref_count_t count) +__attribute__((cold, noinline)) +static void +ref_init_debug(os_ref_atomic_t *rc, struct os_refgrp * __debug_only grp, os_ref_count_t count) { - atomic_init(&rc->ref_count, count); + ref_attach_to_group(rc, grp, count); -#if OS_REFCNT_DEBUG - assert(count > 0); - if (grp) { - rc->ref_group = grp; - } else { - rc->ref_group = &global_ref_group; + for (os_ref_count_t i = 0; i < count; i++) { + ref_log_op(grp, (void *)rc, REFLOG_RETAIN); } +} - ref_attach_to_group(rc, rc->ref_group, count); +__attribute__((cold, noinline)) +static void +ref_retain_debug(os_ref_atomic_t *rc, struct os_refgrp * __debug_only grp) +{ + ref_retain_group(grp); + ref_log_op(grp, (void *)rc, REFLOG_RETAIN); +} +#endif - for (os_ref_count_t i = 0; i < count; i++) { - ref_log_op(rc->ref_group, (void *)rc, REFLOG_RETAIN); +void +os_ref_init_count_internal(os_ref_atomic_t *rc, struct os_refgrp * __debug_only grp, os_ref_count_t count) +{ + os_ref_check_underflow(rc, count); + atomic_init(rc, count); + +#if OS_REFCNT_DEBUG + if (__improbable(ref_debug_enable && grp)) { + ref_init_debug(rc, grp, count); } #endif } void -os_ref_retain(struct os_refcnt *rc) +os_ref_retain_internal(os_ref_atomic_t *rc, struct os_refgrp * __debug_only grp) { - os_ref_count_t old = atomic_fetch_add_explicit(&rc->ref_count, 1, memory_order_relaxed); + os_ref_count_t old = atomic_fetch_add_explicit(rc, 1, memory_order_relaxed); os_ref_check_retain(rc, old); #if OS_REFCNT_DEBUG - if (__improbable(ref_debug_enable)) { - ref_retain_group(rc->ref_group); - ref_log_op(rc->ref_group, (void *)rc, REFLOG_RETAIN); + if (__improbable(grp && ref_debug_enable)) { + ref_retain_debug(rc, grp); } #endif } bool -os_ref_retain_try(struct os_refcnt *rc) +os_ref_retain_try_internal(os_ref_atomic_t *rc, struct os_refgrp * __debug_only grp) { - os_ref_count_t cur = os_ref_get_count(rc); + os_ref_count_t cur = os_ref_get_count_internal(rc); while (1) { if (__improbable(cur == 0)) { @@ -254,83 +283,275 @@ os_ref_retain_try(struct os_refcnt *rc) os_ref_check_retain(rc, cur); - if (atomic_compare_exchange_weak_explicit(&rc->ref_count, &cur, cur + 1, + if (atomic_compare_exchange_weak_explicit(rc, &cur, cur + 1, memory_order_relaxed, memory_order_relaxed)) { -#if OS_REFCNT_DEBUG - if (__improbable(ref_debug_enable)) { - ref_retain_group(rc->ref_group); - ref_log_op(rc->ref_group, (void *)rc, REFLOG_RETAIN); - } -#endif - return true; + break; } } + +#if OS_REFCNT_DEBUG + if (__improbable(grp && ref_debug_enable)) { + ref_retain_debug(rc, grp); + } +#endif + + return true; } -os_ref_count_t -os_ref_release_explicit(struct os_refcnt *rc, memory_order release_order, memory_order dealloc_order) +__attribute__((always_inline)) +static inline os_ref_count_t +_os_ref_release_inline(os_ref_atomic_t *rc, struct os_refgrp * __debug_only grp, + memory_order release_order, memory_order dealloc_order) { + os_ref_count_t val; + #if OS_REFCNT_DEBUG - /* - * Care not to use 'rc' after the decrement because it might be deallocated - * under us. - */ - struct os_refgrp *grp = rc->ref_group; - if (__improbable(ref_debug_enable)) { + if (__improbable(grp && ref_debug_enable)) { + /* + * Care not to use 'rc' after the decrement because it might be deallocated + * under us. + */ ref_log_op(grp, (void *)rc, REFLOG_RELEASE); } #endif - os_ref_count_t val = atomic_fetch_sub_explicit(&rc->ref_count, 1, release_order); + val = atomic_fetch_sub_explicit(rc, 1, release_order); os_ref_check_underflow(rc, val); if (__improbable(--val == 0)) { - atomic_load_explicit(&rc->ref_count, dealloc_order); + atomic_load_explicit(rc, dealloc_order); + } + #if OS_REFCNT_DEBUG - if (__improbable(ref_debug_enable)) { + if (__improbable(grp && ref_debug_enable)) { + if (val == 0) { ref_log_drop(grp, (void *)rc); /* rc is only used as an identifier */ } + ref_release_group(grp, !val); + } #endif + + return val; +} + +__attribute__((noinline)) +os_ref_count_t +os_ref_release_internal(os_ref_atomic_t *rc, struct os_refgrp * __debug_only grp, + memory_order release_order, memory_order dealloc_order) +{ + // Legacy exported interface with bad codegen due to the barriers + // not being immediate + // + // Also serves as the debug function + return _os_ref_release_inline(rc, grp, release_order, dealloc_order); +} + +os_ref_count_t +os_ref_release_barrier_internal(os_ref_atomic_t *rc, + struct os_refgrp * __debug_only grp) +{ +#if OS_REFCNT_DEBUG + if (__improbable(grp && ref_debug_enable)) { + return os_ref_release_internal(rc, grp, + memory_order_release, memory_order_acquire); } +#endif + return _os_ref_release_inline(rc, NULL, + memory_order_release, memory_order_acquire); +} +os_ref_count_t +os_ref_release_relaxed_internal(os_ref_atomic_t *rc, + struct os_refgrp * __debug_only grp) +{ #if OS_REFCNT_DEBUG - if (__improbable(ref_debug_enable)) { + if (__improbable(grp && ref_debug_enable)) { + return os_ref_release_internal(rc, grp, + memory_order_relaxed, memory_order_relaxed); + } +#endif + return _os_ref_release_inline(rc, NULL, + memory_order_relaxed, memory_order_relaxed); +} + +void +os_ref_retain_locked_internal(os_ref_atomic_t *rc, struct os_refgrp * __debug_only grp) +{ + os_ref_count_t val = os_ref_get_count_internal(rc); + os_ref_check_retain(rc, val); + atomic_store_explicit(rc, ++val, memory_order_relaxed); + +#if OS_REFCNT_DEBUG + if (__improbable(grp && ref_debug_enable)) { + ref_retain_debug(rc, grp); + } +#endif +} + +os_ref_count_t +os_ref_release_locked_internal(os_ref_atomic_t *rc, struct os_refgrp * __debug_only grp) +{ + os_ref_count_t val = os_ref_get_count_internal(rc); + os_ref_check_underflow(rc, val); + atomic_store_explicit(rc, --val, memory_order_relaxed); + +#if OS_REFCNT_DEBUG + if (__improbable(grp && ref_debug_enable)) { ref_release_group(grp, !val); + ref_log_op(grp, (void *)rc, REFLOG_RELEASE); + if (val == 0) { + ref_log_drop(grp, (void *)rc); + } } #endif return val; } +/* + * Bitwise API + */ + +os_ref_count_t +os_ref_get_count_mask(os_ref_atomic_t *rc, os_ref_count_t bits) +{ + os_ref_count_t ret; + ret = os_ref_get_count_raw(rc); + return ret >> bits; +} + +#undef os_ref_init_count_mask void -os_ref_retain_locked(struct os_refcnt *rc) +os_ref_init_count_mask(os_ref_atomic_t *rc, struct os_refgrp * __debug_only grp, + os_ref_count_t init_count, os_ref_count_t init_bits, os_ref_count_t b) { - os_ref_count_t val = atomic_load_explicit(&rc->ref_count, memory_order_relaxed); - os_ref_check_retain(rc, val); - atomic_store_explicit(&rc->ref_count, ++val, memory_order_relaxed); + assert(init_bits < (1U << b)); + os_ref_check_underflow(rc, init_count); + atomic_init(rc, (init_count << b) | init_bits); + +#if OS_REFCNT_DEBUG + if (__improbable(ref_debug_enable && grp)) { + ref_init_debug(rc, grp, init_count); + } +#endif +} + +#undef os_ref_retain_mask +void +os_ref_retain_mask(os_ref_atomic_t *rc, struct os_refgrp * __debug_only grp, os_ref_count_t bits) +{ + os_ref_count_t old = atomic_fetch_add_explicit(rc, 1U << bits, memory_order_relaxed); + os_ref_check_overflow(rc, old); + os_ref_assert_referenced(rc, old >> bits); #if OS_REFCNT_DEBUG - if (__improbable(ref_debug_enable)) { - ref_retain_group(rc->ref_group); - ref_log_op(rc->ref_group, (void *)rc, REFLOG_RETAIN); + if (__improbable(grp && ref_debug_enable)) { + ref_retain_debug(rc, grp); } #endif } +#undef os_ref_release_mask_internal os_ref_count_t -os_ref_release_locked(struct os_refcnt *rc) +os_ref_release_mask_internal(os_ref_atomic_t *rc, struct os_refgrp * __debug_only grp, os_ref_count_t bits, + memory_order release_order, memory_order dealloc_order) { - os_ref_count_t val = atomic_load_explicit(&rc->ref_count, memory_order_relaxed); +#if OS_REFCNT_DEBUG + if (__improbable(grp && ref_debug_enable)) { + /* + * Care not to use 'rc' after the decrement because it might be deallocated + * under us. + */ + ref_log_op(grp, (void *)rc, REFLOG_RELEASE); + } +#endif + + os_ref_count_t val = atomic_fetch_sub_explicit(rc, 1U << bits, release_order); + val >>= bits; os_ref_check_underflow(rc, val); - atomic_store_explicit(&rc->ref_count, --val, memory_order_relaxed); + if (__improbable(--val == 0)) { + atomic_load_explicit(rc, dealloc_order); + } + +#if OS_REFCNT_DEBUG + if (__improbable(grp && ref_debug_enable)) { + if (val == 0) { + ref_log_drop(grp, (void *)rc); /* rc is only used as an identifier */ + } + ref_release_group(grp, !val); + } +#endif + + return val; +} + +#undef os_ref_retain_try_mask +bool +os_ref_retain_try_mask(os_ref_atomic_t *rc, struct os_refgrp * __debug_only grp, os_ref_count_t bits) +{ + os_ref_count_t cur = os_ref_get_count_internal(rc); + + while (1) { + if (__improbable((cur >> bits) == 0)) { + return false; + } + + os_ref_check_overflow(rc, cur); + + os_ref_count_t next = cur + (1U << bits); + if (atomic_compare_exchange_weak_explicit(rc, &cur, next, + memory_order_relaxed, memory_order_relaxed)) { + break; + } + } + +#if OS_REFCNT_DEBUG + if (__improbable(grp && ref_debug_enable)) { + ref_retain_debug(rc, grp); + } +#endif + + return true; +} + +#undef os_ref_retain_locked_mask +void +os_ref_retain_locked_mask(os_ref_atomic_t *rc, struct os_refgrp * __debug_only grp, os_ref_count_t bits) +{ + os_ref_count_t val = os_ref_get_count_internal(rc); + + os_ref_check_overflow(rc, val); + os_ref_assert_referenced(rc, val >> bits); + + val += (1U << bits); + atomic_store_explicit(rc, val, memory_order_relaxed); + +#if OS_REFCNT_DEBUG + if (__improbable(grp && ref_debug_enable)) { + ref_retain_debug(rc, grp); + } +#endif +} + +#undef os_ref_release_locked_mask +os_ref_count_t +os_ref_release_locked_mask(os_ref_atomic_t *rc, struct os_refgrp * __debug_only grp, os_ref_count_t bits) +{ + os_ref_count_t val = os_ref_get_count_internal(rc); + os_ref_check_underflow(rc, val >> bits); + val -= (1U << bits); + atomic_store_explicit(rc, val, memory_order_relaxed); + + val >>= bits; #if OS_REFCNT_DEBUG - if (__improbable(ref_debug_enable)) { - ref_release_group(rc->ref_group, !val); - ref_log_op(rc->ref_group, (void *)rc, REFLOG_RELEASE); + if (__improbable(grp && ref_debug_enable)) { + ref_release_group(grp, !val); + ref_log_op(grp, (void *)rc, REFLOG_RELEASE); if (val == 0) { - ref_log_drop(rc->ref_group, (void *)rc); + ref_log_drop(grp, (void *)rc); } } #endif + return val; } diff --git a/libkern/os/refcnt.h b/libkern/os/refcnt.h index e306d3552..bca8fcdf8 100644 --- a/libkern/os/refcnt.h +++ b/libkern/os/refcnt.h @@ -36,6 +36,15 @@ * operations and requires no external synchronization, whereas the locked flavor * assumes the refcnt object is locked by the caller. It is NOT safe to * mix-and-match locked and atomic calls. + * + * 'refgrp's are a way to (hierarchically) group like refcount objects for + * debugging purposes. The group keeps track of the total number and aggregate + * reference count of member refcounts, and the "rlog=" boot-arg is used to enable + * refcount logging by group name. Named groups can be created explicitly with + * os_refgrp_decl(), or implicitly by passing NULL for the refgrp when + * initializing a refcnt object. In the latter case, the group name is the same as + * the function enclosing the init call. Groups are only available on DEV or DEBUG + * builds, and are otherwise compiled out. */ #include @@ -48,40 +57,33 @@ typedef struct os_refcnt os_refcnt_t; /* type of the internal counter */ typedef uint32_t os_ref_count_t; - -#if DEVELOPMENT || DEBUG -# define OS_REFCNT_DEBUG 1 -#else -# define OS_REFCNT_DEBUG 0 -#endif +typedef _Atomic(os_ref_count_t) os_ref_atomic_t; /* - * Debugging is keyed off ref_group, so leave that field for kexts so that the - * combination of dev/debug kernel and release kext works. + * OS_REF_INITIALIZER + * OS_REF_ATOMIC_INITIALIZER + * + * Static initializers that create refcnt objects with safe initial values for use + * between declaration and initialization (os_ref*_init()). Equivalent to zeroing. */ -#if XNU_KERNEL_PRIVATE -# define OS_REFCNT_HAS_GROUP OS_REFCNT_DEBUG -#else -# define OS_REFCNT_HAS_GROUP 1 -#endif -struct os_refcnt { - _Atomic os_ref_count_t ref_count; -#if OS_REFCNT_HAS_GROUP - struct os_refgrp *ref_group; +#ifndef KERNEL +# include +# include +# ifndef __improbable +# define __improbable(x) x +# endif +# ifndef panic +# define panic(x, ...) do { fprintf(stderr, x, __VA_ARGS__); abort(); } while (0) +# endif #endif -}; - -#if OS_REFCNT_DEBUG -struct os_refgrp { - const char *const grp_name; - _Atomic os_ref_count_t grp_children; /* number of refcount objects in group */ - _Atomic os_ref_count_t grp_count; /* current reference count of group */ - _Atomic uint64_t grp_retain_total; - _Atomic uint64_t grp_release_total; - struct os_refgrp *grp_parent; - void *grp_log; /* refcount logging context */ -}; + +#ifndef OS_REFCNT_DEBUG +# if DEVELOPMENT || DEBUG +# define OS_REFCNT_DEBUG 1 +# else +# define OS_REFCNT_DEBUG 0 +# endif #endif #if __has_attribute(diagnose_if) @@ -97,33 +99,16 @@ __BEGIN_DECLS * os_ref_init_count: initialize an os_refcnt with a specific count >= 1 */ #define os_ref_init(rc, grp) os_ref_init_count((rc), (grp), 1) -void os_ref_init_count(struct os_refcnt *, struct os_refgrp *, os_ref_count_t count) +static void os_ref_init_count(struct os_refcnt *, struct os_refgrp *, os_ref_count_t count) os_error_if(count == 0, "Reference count must be non-zero initialized"); -#if OS_REFCNT_DEBUG -# define os_refgrp_decl(qual, var, name, parent) \ - qual struct os_refgrp __attribute__((section("__DATA,__refgrps"))) var = { \ - .grp_name = (name), \ - .grp_children = ATOMIC_VAR_INIT(0), \ - .grp_count = ATOMIC_VAR_INIT(0), \ - .grp_retain_total = ATOMIC_VAR_INIT(0), \ - .grp_release_total = ATOMIC_VAR_INIT(0), \ - .grp_parent = (parent), \ - .grp_log = NULL, \ - } - -/* Create a default group based on the init() callsite if no explicit group - * is provided. */ -# define os_ref_init_count(rc, grp, count) ({ \ - os_refgrp_decl(static, __grp, __func__, NULL); \ - (os_ref_init_count)((rc), (grp) ? (grp) : &__grp, (count)); \ - }) -#else -# define os_refgrp_decl(...) -# define os_ref_init_count(rc, grp, count) (os_ref_init_count)((rc), NULL, (count)) -#endif /* OS_REFCNT_DEBUG */ +/* + * os_refgrp_decl(qual, var, name, parent): declare a refgroup object 'var' with + * given name string and parent group. + */ /* + * * os_ref_retain: acquire a reference (increment reference count by 1) atomically. * * os_ref_release: release a reference (decrement reference count) atomically and @@ -138,33 +123,10 @@ os_error_if(count == 0, "Reference count must be non-zero initialized"); * * os_ref_release_live: release a reference that is guaranteed not to be the last one. */ -void os_ref_retain(struct os_refcnt *); - -os_ref_count_t os_ref_release_explicit(struct os_refcnt *rc, - memory_order release_order, memory_order dealloc_order) OS_WARN_RESULT; - -static inline os_ref_count_t OS_WARN_RESULT -os_ref_release(struct os_refcnt *rc) -{ - return os_ref_release_explicit(rc, memory_order_release, memory_order_acquire); -} - -static inline os_ref_count_t OS_WARN_RESULT -os_ref_release_relaxed(struct os_refcnt *rc) -{ - return os_ref_release_explicit(rc, memory_order_relaxed, memory_order_relaxed); -} - -static inline void -os_ref_release_live(struct os_refcnt *rc) -{ - if (__improbable(os_ref_release_explicit(rc, - memory_order_release, memory_order_relaxed) == 0)) { - panic("os_refcnt: unexpected release of final reference (rc=%p)\n", rc); - __builtin_unreachable(); - } -} - +static void os_ref_retain(struct os_refcnt *); +static os_ref_count_t os_ref_release(struct os_refcnt *) OS_WARN_RESULT; +static os_ref_count_t os_ref_release_relaxed(struct os_refcnt *) OS_WARN_RESULT; +static void os_ref_release_live(struct os_refcnt *); /* * os_ref_retain_try: a variant of atomic retain that fails for objects with a @@ -174,8 +136,7 @@ os_ref_release_live(struct os_refcnt *rc) * for objects stored in a collection, because no lock is required on the * release() side until the object is deallocated. */ -bool os_ref_retain_try(struct os_refcnt *) OS_WARN_RESULT; - +static bool os_ref_retain_try(struct os_refcnt *) OS_WARN_RESULT; /* * os_ref_retain_locked: acquire a reference on an object protected by a held @@ -185,20 +146,71 @@ bool os_ref_retain_try(struct os_refcnt *) OS_WARN_RESULT; * os_ref_release_locked: release a reference on an object protected by a held * lock. */ -void os_ref_retain_locked(struct os_refcnt *); -os_ref_count_t os_ref_release_locked(struct os_refcnt *) OS_WARN_RESULT; - +static void os_ref_retain_locked(struct os_refcnt *); +static os_ref_count_t os_ref_release_locked(struct os_refcnt *) OS_WARN_RESULT; /* * os_ref_get_count: return the current reference count. This is unsafe for * synchronization. */ -static inline os_ref_count_t -os_ref_get_count(struct os_refcnt *rc) -{ - return atomic_load_explicit(&rc->ref_count, memory_order_relaxed); -} +static os_ref_count_t os_ref_get_count(struct os_refcnt *rc); + + +#if XNU_KERNEL_PRIVATE +/* + * Raw API that uses a plain atomic counter (os_ref_atomic_t) and a separate + * refgroup. This can be used in situations where the refcount object must be + * fixed size, for example for embedding in structures with ABI stability + * requirements. + */ + +#define os_ref_init_raw(rc, grp) os_ref_init_count_raw((rc), (grp), 1) +static void os_ref_init_count_raw(os_ref_atomic_t *, struct os_refgrp *, os_ref_count_t count) +os_error_if(count == 0, "Reference count must be non-zero initialized"); +static void os_ref_retain_raw(os_ref_atomic_t *, struct os_refgrp *); +static os_ref_count_t os_ref_release_raw(os_ref_atomic_t *, struct os_refgrp *) OS_WARN_RESULT; +static os_ref_count_t os_ref_release_relaxed_raw(os_ref_atomic_t *, struct os_refgrp *) OS_WARN_RESULT; +static void os_ref_release_live_raw(os_ref_atomic_t *, struct os_refgrp *); +static bool os_ref_retain_try_raw(os_ref_atomic_t *, struct os_refgrp *) OS_WARN_RESULT; +static void os_ref_retain_locked_raw(os_ref_atomic_t *, struct os_refgrp *); +static os_ref_count_t os_ref_release_locked_raw(os_ref_atomic_t *, struct os_refgrp *) OS_WARN_RESULT; +static os_ref_count_t os_ref_get_count_raw(os_ref_atomic_t *rc); + + +/* + * Bitwise API: like the raw API, but allows some bits in the refcount value to be + * reserved for other purposes. 'b' defines the number of trailing (LSB) reserved + * bits, which the refcnt_raw API will never modify (except at init()). + * + * It is assumed that users of this API always use atomic ops on the + * os_ref_atomic_t (or hold a lock for the locked variants), and never modify the + * top (32 - 'b') bits. + * + * Due to guard bits, the maximum reference count is 2^(28 - 'b') - 1, and the + * maximum 'b' is 26 bits. This API can also be used just to limit the max + * refcount. + */ + +/* Initialize the reference count and reserved bits */ +#define os_ref_init_mask(rc, grp, b) os_ref_init_count_mask((rc), (grp), 1, 0, (b)) +void os_ref_init_count_mask(os_ref_atomic_t *rc, struct os_refgrp *grp, os_ref_count_t init_count, + os_ref_count_t init_bits, os_ref_count_t b) +os_error_if(init_count == 0, "Reference count must be non-zero initialized") +os_error_if(b > 26, "Bitwise reference count limited to 26 bits") +os_error_if(init_bits >= (1U << b), "Bits out of range"); + +void os_ref_retain_mask(os_ref_atomic_t *rc, struct os_refgrp *grp, os_ref_count_t b); +static os_ref_count_t os_ref_release_mask(os_ref_atomic_t *rc, struct os_refgrp *grp, os_ref_count_t b) OS_WARN_RESULT; +static os_ref_count_t os_ref_release_relaxed_mask(os_ref_atomic_t *rc, struct os_refgrp *grp, os_ref_count_t b) OS_WARN_RESULT; +static void os_ref_release_live_mask(os_ref_atomic_t *rc, struct os_refgrp *grp, os_ref_count_t b); +bool os_ref_retain_try_mask(os_ref_atomic_t *, struct os_refgrp *grp, os_ref_count_t b) OS_WARN_RESULT; +void os_ref_retain_locked_mask(os_ref_atomic_t *rc, struct os_refgrp *grp, os_ref_count_t b); +os_ref_count_t os_ref_release_locked_mask(os_ref_atomic_t *rc, struct os_refgrp *grp, os_ref_count_t b) OS_WARN_RESULT; +os_ref_count_t os_ref_get_count_mask(os_ref_atomic_t *rc, os_ref_count_t b); + +#endif /* XNU_KERNEL_PRIVATE */ __END_DECLS +#include #endif diff --git a/libkern/os/refcnt_internal.h b/libkern/os/refcnt_internal.h new file mode 100644 index 000000000..fdc26ecd2 --- /dev/null +++ b/libkern/os/refcnt_internal.h @@ -0,0 +1,301 @@ +#ifndef _OS_REFCNT_INTERNAL_H +#define _OS_REFCNT_INTERNAL_H + +struct os_refcnt { + os_ref_atomic_t ref_count; +#if OS_REFCNT_DEBUG + struct os_refgrp *ref_group; +#endif +}; + +#if OS_REFCNT_DEBUG +struct os_refgrp { + const char *const grp_name; + os_ref_atomic_t grp_children; /* number of refcount objects in group */ + os_ref_atomic_t grp_count; /* current reference count of group */ + _Atomic uint64_t grp_retain_total; + _Atomic uint64_t grp_release_total; + struct os_refgrp *grp_parent; + void *grp_log; /* refcount logging context */ +}; +#endif + +# define OS_REF_ATOMIC_INITIALIZER ATOMIC_VAR_INIT(0) +#if OS_REFCNT_DEBUG +# define OS_REF_INITIALIZER { .ref_count = OS_REF_ATOMIC_INITIALIZER, .ref_group = NULL } +#else +# define OS_REF_INITIALIZER { .ref_count = OS_REF_ATOMIC_INITIALIZER } +#endif + +__BEGIN_DECLS + +#if OS_REFCNT_DEBUG +# define os_ref_if_debug(x, y) x +#else +# define os_ref_if_debug(x, y) y +#endif + +void os_ref_init_count_external(os_ref_atomic_t *, struct os_refgrp *, os_ref_count_t); +void os_ref_retain_external(os_ref_atomic_t *, struct os_refgrp *); +void os_ref_retain_locked_external(os_ref_atomic_t *, struct os_refgrp *); +os_ref_count_t os_ref_release_external(os_ref_atomic_t *, struct os_refgrp *, + memory_order release_order, memory_order dealloc_order); +os_ref_count_t os_ref_release_relaxed_external(os_ref_atomic_t *, struct os_refgrp *); +os_ref_count_t os_ref_release_barrier_external(os_ref_atomic_t *, struct os_refgrp *); +os_ref_count_t os_ref_release_locked_external(os_ref_atomic_t *, struct os_refgrp *); +bool os_ref_retain_try_external(os_ref_atomic_t *, struct os_refgrp *); + +#if XNU_KERNEL_PRIVATE +void os_ref_init_count_internal(os_ref_atomic_t *, struct os_refgrp *, os_ref_count_t); +void os_ref_retain_internal(os_ref_atomic_t *, struct os_refgrp *); +os_ref_count_t os_ref_release_relaxed_internal(os_ref_atomic_t *, struct os_refgrp *); +os_ref_count_t os_ref_release_barrier_internal(os_ref_atomic_t *, struct os_refgrp *); +os_ref_count_t os_ref_release_internal(os_ref_atomic_t *, struct os_refgrp *, + memory_order release_order, memory_order dealloc_order); +bool os_ref_retain_try_internal(os_ref_atomic_t *, struct os_refgrp *); +void os_ref_retain_locked_internal(os_ref_atomic_t *, struct os_refgrp *); +os_ref_count_t os_ref_release_locked_internal(os_ref_atomic_t *, struct os_refgrp *); +#else +/* For now, the internal and external variants are identical */ +#define os_ref_init_count_internal os_ref_init_count_external +#define os_ref_retain_internal os_ref_retain_external +#define os_ref_retain_locked_internal os_ref_retain_locked_external +#define os_ref_release_internal os_ref_release_external +#define os_ref_release_barrier_internal os_ref_release_barrier_external +#define os_ref_release_relaxed_internal os_ref_release_relaxed_external +#define os_ref_release_locked_internal os_ref_release_locked_external +#define os_ref_retain_try_internal os_ref_retain_try_external +#endif + +static inline void +os_ref_init_count(struct os_refcnt *rc, struct os_refgrp * __unused grp, os_ref_count_t count) +{ +#if OS_REFCNT_DEBUG + rc->ref_group = grp; +#endif + os_ref_init_count_internal(&rc->ref_count, os_ref_if_debug(rc->ref_group, NULL), count); +} + +static inline void +os_ref_retain(struct os_refcnt *rc) +{ + os_ref_retain_internal(&rc->ref_count, os_ref_if_debug(rc->ref_group, NULL)); +} + +static inline os_ref_count_t +os_ref_release_locked(struct os_refcnt *rc) +{ + return os_ref_release_locked_internal(&rc->ref_count, os_ref_if_debug(rc->ref_group, NULL)); +} + +static inline void +os_ref_retain_locked(struct os_refcnt *rc) +{ + os_ref_retain_internal(&rc->ref_count, os_ref_if_debug(rc->ref_group, NULL)); +} + +static inline bool +os_ref_retain_try(struct os_refcnt *rc) +{ + return os_ref_retain_try_internal(&rc->ref_count, os_ref_if_debug(rc->ref_group, NULL)); +} + +__deprecated_msg("inefficient codegen, prefer os_ref_release / os_ref_release_relaxed") +static inline os_ref_count_t OS_WARN_RESULT +os_ref_release_explicit(struct os_refcnt *rc, memory_order release_order, memory_order dealloc_order) +{ + return os_ref_release_internal(&rc->ref_count, os_ref_if_debug(rc->ref_group, NULL), + release_order, dealloc_order); +} + +#if OS_REFCNT_DEBUG +# define os_refgrp_decl(qual, var, name, parent) \ + qual struct os_refgrp __attribute__((section("__DATA,__refgrps"))) var = { \ + .grp_name = (name), \ + .grp_children = ATOMIC_VAR_INIT(0u), \ + .grp_count = ATOMIC_VAR_INIT(0u), \ + .grp_retain_total = ATOMIC_VAR_INIT(0u), \ + .grp_release_total = ATOMIC_VAR_INIT(0u), \ + .grp_parent = (parent), \ + .grp_log = NULL, \ + } +# define os_refgrp_decl_extern(var) \ + extern struct os_refgrp var + +/* Create a default group based on the init() callsite if no explicit group + * is provided. */ +# define os_ref_init_count(rc, grp, count) ({ \ + os_refgrp_decl(static, __grp, __func__, NULL); \ + (os_ref_init_count)((rc), (grp) ? (grp) : &__grp, (count)); \ + }) + +#else /* OS_REFCNT_DEBUG */ + +# define os_refgrp_decl(...) extern struct os_refgrp var __attribute__((unused)) +# define os_refgrp_decl_extern(var) os_refgrp_decl(var) +# define os_ref_init_count(rc, grp, count) (os_ref_init_count)((rc), NULL, (count)) + +#endif /* OS_REFCNT_DEBUG */ + +#if XNU_KERNEL_PRIVATE +void os_ref_panic_live(void *rc) __abortlike; +#else +__abortlike +static inline void +os_ref_panic_live(void *rc) +{ + panic("os_refcnt: unexpected release of final reference (rc=%p)\n", rc); + __builtin_unreachable(); +} +#endif + +static inline os_ref_count_t OS_WARN_RESULT +os_ref_release(struct os_refcnt *rc) +{ + return os_ref_release_barrier_internal(&rc->ref_count, + os_ref_if_debug(rc->ref_group, NULL)); +} + +static inline os_ref_count_t OS_WARN_RESULT +os_ref_release_relaxed(struct os_refcnt *rc) +{ + return os_ref_release_relaxed_internal(&rc->ref_count, + os_ref_if_debug(rc->ref_group, NULL)); +} + +static inline void +os_ref_release_live(struct os_refcnt *rc) +{ + if (__improbable(os_ref_release(rc) == 0)) { + os_ref_panic_live(rc); + } +} + +static inline os_ref_count_t +os_ref_get_count_internal(os_ref_atomic_t *rc) +{ + return atomic_load_explicit(rc, memory_order_relaxed); +} + +static inline os_ref_count_t +os_ref_get_count(struct os_refcnt *rc) +{ + return os_ref_get_count_internal(&rc->ref_count); +} + + + +/* + * Raw API + */ + +static inline void +os_ref_init_count_raw(os_ref_atomic_t *rc, struct os_refgrp *grp, os_ref_count_t count) +{ + os_ref_init_count_internal(rc, grp, count); +} + +static inline void +os_ref_retain_raw(os_ref_atomic_t *rc, struct os_refgrp *grp) +{ + os_ref_retain_internal(rc, grp); +} + +static inline os_ref_count_t +os_ref_release_raw(os_ref_atomic_t *rc, struct os_refgrp *grp) +{ + return os_ref_release_barrier_internal(rc, grp); +} + +static inline os_ref_count_t +os_ref_release_relaxed_raw(os_ref_atomic_t *rc, struct os_refgrp *grp) +{ + return os_ref_release_relaxed_internal(rc, grp); +} + +static inline void +os_ref_release_live_raw(os_ref_atomic_t *rc, struct os_refgrp *grp) +{ + if (__improbable(os_ref_release_barrier_internal(rc, grp) == 0)) { + os_ref_panic_live(rc); + } +} + +static inline bool +os_ref_retain_try_raw(os_ref_atomic_t *rc, struct os_refgrp *grp) +{ + return os_ref_retain_try_internal(rc, grp); +} + +static inline void +os_ref_retain_locked_raw(os_ref_atomic_t *rc, struct os_refgrp *grp) +{ + os_ref_retain_locked_internal(rc, grp); +} + +static inline os_ref_count_t +os_ref_release_locked_raw(os_ref_atomic_t *rc, struct os_refgrp *grp) +{ + return os_ref_release_locked_internal(rc, grp); +} + +static inline os_ref_count_t +os_ref_get_count_raw(os_ref_atomic_t *rc) +{ + return os_ref_get_count_internal(rc); +} + +#if !OS_REFCNT_DEBUG +/* remove the group argument for non-debug */ +#define os_ref_init_count_raw(rc, grp, count) (os_ref_init_count_raw)((rc), NULL, (count)) +#define os_ref_retain_raw(rc, grp) (os_ref_retain_raw)((rc), NULL) +#define os_ref_release_raw(rc, grp) (os_ref_release_raw)((rc), NULL) +#define os_ref_release_relaxed_raw(rc, grp) (os_ref_release_relaxed_raw)((rc), NULL) +#define os_ref_release_live_raw(rc, grp) (os_ref_release_live_raw)((rc), NULL) +#define os_ref_retain_try_raw(rc, grp) (os_ref_retain_try_raw)((rc), NULL) +#define os_ref_retain_locked_raw(rc, grp) (os_ref_retain_locked_raw)((rc), NULL) +#define os_ref_release_locked_raw(rc, grp) (os_ref_release_locked_raw)((rc), NULL) +#endif + +#if XNU_KERNEL_PRIVATE +os_ref_count_t os_ref_release_mask_internal(os_ref_atomic_t *rc, struct os_refgrp *grp, + os_ref_count_t b, memory_order release_order, memory_order dealloc_order); + +static inline os_ref_count_t +os_ref_release_mask(os_ref_atomic_t *rc, struct os_refgrp *grp, os_ref_count_t b) +{ + return os_ref_release_mask_internal(rc, grp, b, memory_order_release, memory_order_acquire); +} + +static inline os_ref_count_t +os_ref_release_relaxed_mask(os_ref_atomic_t *rc, struct os_refgrp *grp, os_ref_count_t b) +{ + return os_ref_release_mask_internal(rc, grp, b, memory_order_relaxed, memory_order_relaxed); +} + +static inline void +os_ref_release_live_mask(os_ref_atomic_t *rc, struct os_refgrp *grp, os_ref_count_t b) +{ + if (__improbable(os_ref_release_mask_internal(rc, grp, b, + memory_order_release, memory_order_relaxed) == 0)) { + os_ref_panic_live(rc); + } +} + +#if !OS_REFCNT_DEBUG +/* remove the group argument for non-debug */ +#define os_ref_init_count_mask(rc, grp, init_c, init_b, b) (os_ref_init_count_mask)(rc, NULL, init_c, init_b, b) +#define os_ref_retain_mask(rc, grp, b) (os_ref_retain_mask)((rc), NULL, (b)) +#define os_ref_release_mask(rc, grp, b) (os_ref_release_mask)((rc), NULL, (b)) +#define os_ref_release_relaxed_mask(rc, grp, b) (os_ref_relaxed_mask)((rc), NULL, (b)) +#define os_ref_release_live_mask(rc, grp, b) (os_ref_release_live_mask)((rc), NULL, (b)) +#define os_ref_retain_try_mask(rc, grp, b) (os_ref_retain_try_mask)((rc), NULL, (b)) +#define os_ref_release_locked_mask(rc, grp, b) (os_ref_release_locked_mask)((rc), NULL, (b)) +#define os_ref_retain_locked_mask(rc, grp, b) (os_ref_retain_locked_mask)((rc), NULL, (b)) +#endif + +#endif + +__END_DECLS + +#endif /* _OS_REFCNT_INTERNAL_H */ diff --git a/libkern/os/smart_ptr.h b/libkern/os/smart_ptr.h new file mode 100644 index 000000000..5f89c7fec --- /dev/null +++ b/libkern/os/smart_ptr.h @@ -0,0 +1,523 @@ +#ifndef _OS_SMART_POINTER_H +#define _OS_SMART_POINTER_H + +#include +#include + +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wc++11-extensions" + +#if __has_attribute(trivial_abi) +# define OS_TRIVIAL_ABI __attribute__((trivial_abi)) +#else +# error Smart pointers depend on trivial_abi attribute +#endif + +#if !OS_HAS_RVALUE_REFERENCES +# error Smart pointers depend on rvalue references +#endif + +/* C++98 compatibility */ +#if !OS_HAS_NULLPTR && !defined(nullptr) +# define nullptr NULL +#endif + +#ifndef OSPTR_LOG +# define OSPTR_LOG(x, ...) do {} while(0) +#endif + +namespace os { +static struct no_retain_t {} no_retain; + +template +class OS_TRIVIAL_ABI smart_ptr +{ + template friend class smart_ptr; + +public: + +/* + * Default constructor, creates a null pointer + */ + smart_ptr() : pointer(nullptr) + { + OSPTR_LOG("Default construct smart_ptr\n"); + } + +#if OS_HAS_NULLPTR +/* + * Construction from a nullptr + */ + smart_ptr(os::nullptr_t) : pointer(nullptr) + { + OSPTR_LOG("Construct smart_ptr from null\n"); + } +#endif + +/* + * Construct from a raw pointer, taking a reference to the object + */ + explicit smart_ptr(T *&p) : pointer(p) + { + OSPTR_LOG("Construct smart_ptr from raw %p\n", pointer); + if (pointer != nullptr) { + _retain(pointer); + } + } + +/* + * Construct from a raw pointer, without bumping the refcount + */ + explicit smart_ptr(T *&p, no_retain_t) : pointer(p) + { + OSPTR_LOG("Construct smart_ptr from raw %p no retain\n", pointer); + } + +/* + * Copy constructor from the same smart_ptr type + */ + smart_ptr(smart_ptr const &rhs) : pointer(rhs.pointer) + { + OSPTR_LOG("Copy construct smart_ptr with %p\n", rhs.pointer); + if (pointer != nullptr) { + _retain(pointer); + } + } + +#if !LIBKERN_NO_MEMBER_TEMPLATES +/* + * Allows copy of a smart_ptr from a smart_ptr + * if U is convertible to T. For example, if T is a base class of U + */ + template + smart_ptr(smart_ptr const &rhs) : pointer(rhs.get()) + { + OSPTR_LOG("Copy construct smart_ptr with compatible %p\n", rhs.pointer); + if (pointer != nullptr) { + _retain(pointer); + } + } +#endif + +/* + * Assign to an OSPointer from a raw pointer + */ + smart_ptr & + operator=(T *&rhs) + { + OSPTR_LOG("Assign smart_ptr with replacing %p with raw %p\n", pointer, rhs); + smart_ptr(rhs).swap(*this); + return *this; + } + +#if OS_HAS_NULLPTR +/* + * Assign to an OSPointer from a null pointer + */ + smart_ptr & + operator=(os::nullptr_t) + { + OSPTR_LOG("Assign smart_ptr to null replacing %p\n", pointer); + smart_ptr().swap(*this); + return *this; + } +#endif + +/* + * Assign to a smart_ptr from a smart_ptr of the same type + */ + smart_ptr & + operator=(smart_ptr &rhs) + { + OSPTR_LOG("Assign smart_ptr replacing %p with %p\n", pointer, rhs.pointer); + smart_ptr(rhs).swap(*this); + return *this; + } + +#if !LIBKERN_NO_MEMBER_TEMPLATES +/* + * Allows assignment of a smart_ptr from a smart_ptr + * if U is convertible to T. For example, if T is a base class of U. + */ + template + smart_ptr & + operator=(smart_ptr const &rhs) + { + OSPTR_LOG("Assign smart_ptr to compatible replacing %p with %p\n", pointer, rhs.pointer); + smart_ptr(rhs.get()).swap(*this); + return *this; + } +#endif + +/* + * Move support + */ + +#if OS_HAS_RVALUE_REFERENCES +/* + * Move-construct from a different smart_ptr of the same pointer type + */ + smart_ptr(smart_ptr &&rhs) : pointer(rhs.pointer) + { + OSPTR_LOG("Move construct smart_ptr with %p\n", rhs.pointer); + rhs.pointer = nullptr; + } + +/* + * Move-construct from a raw pointer + */ + smart_ptr(T *&&p) : pointer(p) + { + OSPTR_LOG("Move construct smart_ptr with %p\n", pointer); + if (pointer != nullptr) { + _retain(pointer); + } + p = nullptr; + } + +/* + * Move-construct from a raw pointer without bumping the refcount + */ + smart_ptr(T *&&p, no_retain_t) : pointer(p) + { + OSPTR_LOG("Move construct smart_ptr with %p no retain\n", pointer); + p = nullptr; + } + +/* + * Move-assign to a smart_ptr from a raw pointer + */ + smart_ptr & + operator=(T *&&rhs) + { + OSPTR_LOG("Move assign smart_ptr replacing %p with raw %p\n", pointer, rhs); + smart_ptr(os::move(rhs)).swap(*this); + rhs = nullptr; + return *this; + } + +/* + * Move-assign from a different smart_ptr of the same type + */ + smart_ptr & + operator=(smart_ptr &&rhs) + { + OSPTR_LOG("Move assign smart_ptr replacing %p with %p\n", pointer, rhs.pointer); + smart_ptr(os::move(rhs)).swap(*this); + return *this; + } + +/* + * Move from a different smart_ptr with a compatible pointer type + */ + template + smart_ptr(smart_ptr &&rhs) : pointer(rhs.pointer) + { + OSPTR_LOG("Move construct smart_ptr with compatible %p\n", rhs.pointer); + rhs.pointer = nullptr; + } + + template + smart_ptr & + operator=(smart_ptr &&rhs) + { + OSPTR_LOG("Move assign smart_ptr replacing %p with compatible %p\n", pointer, rhs.pointer); + smart_ptr(os::move(rhs)).swap(*this); + return *this; + } +#endif + +/* + * Destructor - decreases the object's reference count + */ + ~smart_ptr() + { + OSPTR_LOG("Destroy smart_ptr with %p\n", pointer); + if (pointer) { + _release(pointer); + } + } + +/* + * Create a new object of type T and wrap it in a smart_ptr. The object will have + * a reference count of 1, so destruction of the smart_ptr will result in the + * object being freed if the smart_ptr wasn't copied first. + */ + static inline smart_ptr + alloc() + { + return smart_ptr(_alloc(), no_retain); + } + + void + reset() + { + smart_ptr().swap(*this); + } + + T * + get() const + { + return pointer; + } + + T ** + get_for_out_param() + { + reset(); + return &pointer; + } + +/* + * Take ownership of object from raw pointer + */ + void + attach(T *&p) + { + OSPTR_LOG("Attach smart_ptr with %p\n", p); + smart_ptr(p, no_retain).swap(*this); + } + + void + attach(T *&&p) + { + OSPTR_LOG("Move attach smart_ptr with %p\n", p); + smart_ptr(os::move(p), no_retain).swap(*this); + } + +/* Return and drop ownership of pointer with NO release() */ + T * + detach() + { + OSPTR_LOG("Detach smart_ptr with %p\n", pointer); + T *ret = pointer; + pointer = nullptr; + return ret; + } + + T * + operator->() const + { + OSPTR_LOG("Dereference smart_ptr with %p\n", pointer); + return pointer; + } + + explicit + operator bool() const + { + return pointer != nullptr; + } + + inline void + swap(smart_ptr &p) + { + T *temp = pointer; + pointer = p.pointer; + p.pointer = temp; + } + +/* swap pointers to the same type but with different policies */ + template + void + swap(smart_ptr &p) + { + if (p.pointer) { + _retain(p.pointer); + } + if (pointer) { + smart_ptr::_retain(pointer); + } + + T *temp = pointer; + pointer = p.pointer; + p.pointer = temp; + + if (p.pointer) { + _release(p.pointer); + } + if (pointer) { + smart_ptr::_release(pointer); + } + } + + template + smart_ptr + const_pointer_cast() const & + { + OSPTR_LOG("const_pointer_cast smart_ptr with %p\n", pointer); + return smart_ptr(const_cast(pointer)); + } + + template + smart_ptr + const_pointer_cast() && + { + OSPTR_LOG("const_pointer_cast move smart_ptr with %p\n", pointer); + U *newPointer = const_cast(detach()); + return smart_ptr(os::move(newPointer), no_retain); + } + + template + smart_ptr + static_pointer_cast() const & + { + OSPTR_LOG("static_pointer_cast smart_ptr with %p\n", pointer); + return smart_ptr(static_cast(pointer)); + } + + template + smart_ptr + static_pointer_cast() && + { + OSPTR_LOG("static_pointer_cast move smart_ptr with %p\n", pointer); + return smart_ptr(static_cast(detach()), no_retain); + } + + template + smart_ptr + dynamic_pointer_cast() const & + { + OSPTR_LOG("dynamic_pointer_cast smart_ptr with %p\n", pointer); + return smart_ptr(Policy::template dyn_cast(pointer)); + } + + template + smart_ptr + dynamic_pointer_cast() && + { + OSPTR_LOG("dynamic_pointer_cast move smart_ptr with %p\n", pointer); + U *newPointer = Policy::template dyn_cast(pointer); + + if (newPointer != nullptr) { + detach(); + } else { + reset(); + } + return smart_ptr(os::move(newPointer), no_retain); + } + +private: + static inline void + _retain(T *obj) + { + OSPTR_LOG(" %s with %p\n", __FUNCTION__, obj); + Policy::retain(obj); + } + + static inline void + _release(T *obj) + { + OSPTR_LOG(" %s with %p\n", __FUNCTION__, obj); + Policy::release(obj); + } + + static inline T * + _alloc() + { + OSPTR_LOG(" %s\n", __FUNCTION__); + return Policy::template alloc(); + } + + T *pointer; +}; + +/* + * Comparison + */ + +template +inline bool +operator==(smart_ptr const &a, smart_ptr const &b) +{ + return a.get() == b.get(); +} + +template +inline bool +operator!=(smart_ptr const &a, smart_ptr const &b) +{ + return a.get() != b.get(); +} + +template +inline bool +operator==(smart_ptr const &a, smart_ptr const &b) +{ + return a.get() == b.get(); +} + +template +inline bool +operator!=(smart_ptr const &a, smart_ptr const &b) +{ + return a.get() != b.get(); +} + +/* + * Comparison with nullptr + */ + +#if OS_HAS_NULLPTR +template +inline bool +operator==(smart_ptr const &p, os::nullptr_t) +{ + return p.get() == nullptr; +} + +template inline bool +operator==(os::nullptr_t, smart_ptr const &p) +{ + return p.get() == nullptr; +} + +template +inline bool +operator!=(smart_ptr const &p, os::nullptr_t) +{ + return p.get() != nullptr; +} + +template +inline bool +operator!=(os::nullptr_t, smart_ptr const &p) +{ + return p.get() != nullptr; +} +#endif + +/* + * Comparison with raw pointer + */ + +template +inline bool +operator==(smart_ptr const &p, const os::remove_const_t *other) +{ + return p.get() == other; +} + +template +inline bool +operator==(const os::remove_const_t *other, smart_ptr const &p) +{ + return other == p.get(); +} + +template +inline bool +operator!=(smart_ptr const &p, const os::remove_const_t *other) +{ + return p.get() != other; +} + +template +inline bool +operator!=(const os::remove_const_t *other, smart_ptr const &p) +{ + return other != p.get(); +} +}; + +#pragma clang diagnostic pop +#endif /* _OS_SMART_POINTER_H */ diff --git a/libsa/bootstrap.cpp b/libsa/bootstrap.cpp index 8f33073a2..f46485a1d 100644 --- a/libsa/bootstrap.cpp +++ b/libsa/bootstrap.cpp @@ -164,7 +164,7 @@ public: ~KLDBootstrap(void); }; -static KLDBootstrap sBootstrapObject; +LIBKERN_ALWAYS_DESTROY static KLDBootstrap sBootstrapObject; /********************************************************************* * Set the function pointers for the entry points into the bootstrap @@ -190,8 +190,8 @@ KLDBootstrap::~KLDBootstrap(void) } - record_startup_extensions_function = 0; - load_security_extensions_function = 0; + record_startup_extensions_function = NULL; + load_security_extensions_function = NULL; } /********************************************************************* @@ -725,7 +725,6 @@ KLDBootstrap::loadSecurityExtensions(void) OSCollectionIterator * keyIterator = NULL;// must release OSString * bundleID = NULL;// don't release OSKext * theKext = NULL;// don't release - OSBoolean * isSecurityKext = NULL;// don't release OSKextLog(/* kext */ NULL, kOSKextLogStepLevel | @@ -761,9 +760,7 @@ KLDBootstrap::loadSecurityExtensions(void) continue; } - isSecurityKext = OSDynamicCast(OSBoolean, - theKext->getPropertyForHostArch(kAppleSecurityExtensionKey)); - if (isSecurityKext && isSecurityKext->isTrue()) { + if (kOSBooleanTrue == theKext->getPropertyForHostArch(kAppleSecurityExtensionKey)) { OSKextLog(/* kext */ NULL, kOSKextLogStepLevel | kOSKextLogLoadFlag, diff --git a/libsa/conf/Makefile b/libsa/conf/Makefile index 7bd79d9ae..05c4b79cf 100644 --- a/libsa/conf/Makefile +++ b/libsa/conf/Makefile @@ -23,7 +23,7 @@ endif $(TARGET)/$(CURRENT_KERNEL_CONFIG)/Makefile: $(SRCROOT)/SETUP/config/doconf $(OBJROOT)/SETUP/config $(DOCONFDEPS) $(_v)$(MKDIR) $(TARGET)/$(CURRENT_KERNEL_CONFIG) - $(_v)$(SRCROOT)/SETUP/config/doconf -c -cpu $(DOCONF_ARCH_CONFIG_LC) -soc $(CURRENT_MACHINE_CONFIG_LC) -d $(TARGET)/$(CURRENT_KERNEL_CONFIG) -s $(SOURCE) -m $(MASTERCONFDIR) $(CURRENT_KERNEL_CONFIG); + $(_v)$(SRCROOT)/SETUP/config/doconf -c -cpu $(DOCONF_ARCH_CONFIG_LC) -soc $(CURRENT_MACHINE_CONFIG_LC) -d $(TARGET)/$(CURRENT_KERNEL_CONFIG) -s $(SOURCE) -m $(MASTERCONFDIR) $(CURRENT_KERNEL_CONFIG) do_all: $(TARGET)/$(CURRENT_KERNEL_CONFIG)/Makefile $(_v)${MAKE} \ @@ -35,7 +35,7 @@ do_all: $(TARGET)/$(CURRENT_KERNEL_CONFIG)/Makefile SOURCE=$(subst conf/,,$(SOURCE)) \ TARGET=${TARGET} \ OBJPATH=${OBJPATH} \ - build_all; + build_all do_build_all:: do_all diff --git a/libsa/conf/Makefile.template b/libsa/conf/Makefile.template index bc570dde5..628823632 100644 --- a/libsa/conf/Makefile.template +++ b/libsa/conf/Makefile.template @@ -69,9 +69,9 @@ $(COMPONENT).filelist: $(OBJS) $(SEG_HACK) -n __KLD -o $${kld_file}__ $${kld_file} || exit 1; \ mv $${kld_file}__ $${kld_file} || exit 1; \ done - @echo "$(ColorL)LDFILELIST$(Color0) $(ColorLF)$(COMPONENT)$(Color0)" + $(call makelog,$(ColorL)LDFILELIST$(Color0) $(ColorLF)$(COMPONENT)$(Color0)) $(_v)for obj in ${OBJS}; do \ - echo $(TARGET)/$(CURRENT_KERNEL_CONFIG)/$${obj}; \ + $(ECHO) $(TARGET)/$(CURRENT_KERNEL_CONFIG)/$${obj}; \ done > $(COMPONENT).filelist diff --git a/libsa/nonlto.c b/libsa/nonlto.c new file mode 100644 index 000000000..68adb0767 --- /dev/null +++ b/libsa/nonlto.c @@ -0,0 +1,33 @@ +/* + * Copyright (c) 2018 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +/* + * used to make a tiny non-LTO object file to force the linker to turn bitcode to macho + */ + +int __attribute__((__unused__)) __not_used_at_all__; diff --git a/libsyscall/Libsyscall.xcconfig b/libsyscall/Libsyscall.xcconfig index 7153bda3e..15531c0fe 100644 --- a/libsyscall/Libsyscall.xcconfig +++ b/libsyscall/Libsyscall.xcconfig @@ -5,27 +5,61 @@ SUPPORTED_PLATFORMS = macosx iphoneos iphoneosnano tvos appletvos watchos bridge ONLY_ACTIVE_ARCH = NO DEAD_CODE_STRIPPING = YES DEBUG_INFORMATION_FORMAT = dwarf-with-dsym -INSTALL_PATH = /usr/lib/system -PUBLIC_HEADERS_FOLDER_PATH = /usr/include -PRIVATE_HEADERS_FOLDER_PATH = /usr/local/include -OS_PRIVATE_HEADERS_FOLDER_PATH = /usr/local/include/os + +SDK_INSTALL_VARIANT = $(SDK_INSTALL_VARIANT_$(DRIVERKIT)) +SDK_INSTALL_VARIANT_1 = driverkit +SDK_INSTALL_VARIANT_ = default +SDK_INSTALL_ROOT = $(SDK_INSTALL_ROOT_$(SDK_INSTALL_VARIANT)) +SDK_INSTALL_ROOT_driverkit = $(DRIVERKITROOT) +SDK_INSTALL_HEADERS_ROOT = $(SDK_INSTALL_HEADERS_ROOT_$(SDK_INSTALL_VARIANT)) +SDK_INSTALL_HEADERS_ROOT_driverkit = $(SDK_INSTALL_ROOT)/$(SDK_RUNTIME_HEADERS_PREFIX) +SDK_RUNTIME_HEADERS_PREFIX = Runtime + +INSTALL_PATH = $(SDK_INSTALL_ROOT)/usr/lib/system +PUBLIC_HEADERS_FOLDER_PATH = $(SDK_INSTALL_HEADERS_ROOT)/usr/include +PRIVATE_HEADERS_FOLDER_PATH = $(SDK_INSTALL_HEADERS_ROOT)/usr/local/include +OS_PRIVATE_HEADERS_FOLDER_PATH = $(SDK_INSTALL_HEADERS_ROOT)/usr/local/include/os +OS_PUBLIC_HEADERS_FOLDER_PATH = $(SDK_INSTALL_HEADERS_ROOT)/usr/include/os EXECUTABLE_PREFIX = libsystem_ PRODUCT_NAME = kernel ALWAYS_SEARCH_USER_PATHS = NO ORDER_FILE[sdk=iphoneos*] = $(SDKROOT)/$(APPLE_INTERNAL_DIR)/OrderFiles/libsystem_kernel.order -OTHER_CFLAGS = -fdollars-in-identifiers -no-cpp-precomp -fno-common -fno-stack-protector -fno-stack-check -momit-leaf-frame-pointer -DLIBSYSCALL_INTERFACE -D__DARWIN_VERS_1050=1 -OTHER_CFLAGS[sdk=macosx*] = $(inherited) -DSYSCALL_PRE1050 -OTHER_CFLAGS[sdk=macosx*][arch=x86_64*] = $(inherited) -DNO_SYSCALL_LEGACY -OTHER_CFLAGS[sdk=iphoneos*] = $(inherited) -DNO_SYSCALL_LEGACY -OTHER_CFLAGS[sdk=watchos*] = $(inherited) -DNO_SYSCALL_LEGACY -OTHER_CFLAGS[sdk=tvos*] = $(inherited) -DNO_SYSCALL_LEGACY -OTHER_CFLAGS[sdk=appletvos*] = $(inherited) -DNO_SYSCALL_LEGACY -OTHER_CFLAGS[sdk=bridgeos*] = $(inherited) -DNO_SYSCALL_LEGACY +OTHER_CFLAGS = -fdollars-in-identifiers -no-cpp-precomp -fno-common -fno-stack-protector -fno-stack-check -momit-leaf-frame-pointer -DLIBSYSCALL_INTERFACE -D__DARWIN_VERS_1050=1 -DNO_SYSCALL_LEGACY +OTHER_CFLAGS[sdk=macosx*][arch=i386] = $(inherited) -UNO_SYSCALL_LEGACY -DSYSCALL_PRE1050 +OTHER_CFLAGS[sdk=macosx*][arch=x86_64*] = $(inherited) -DSYSCALL_PRE1050 GCC_PREPROCESSOR_DEFINITIONS = CF_OPEN_SOURCE CF_EXCLUDE_CSTD_HEADERS DEBUG _FORTIFY_SOURCE=0 -HEADER_SEARCH_PATHS = $(PROJECT_DIR)/mach $(PROJECT_DIR)/os $(PROJECT_DIR)/wrappers $(PROJECT_DIR)/wrappers/string $(PROJECT_DIR)/wrappers/libproc $(PROJECT_DIR)/wrappers/libproc/spawn $(BUILT_PRODUCTS_DIR)/internal_hdr/include $(BUILT_PRODUCTS_DIR)/mig_hdr/local/include $(BUILT_PRODUCTS_DIR)/mig_hdr/include $(SDKROOT)/System/Library/Frameworks/System.framework/PrivateHeaders + +HEADER_SEARCH_PATHS = $(PROJECT_DIR)/mach $(PROJECT_DIR)/os $(PROJECT_DIR)/wrappers $(PROJECT_DIR)/wrappers/string $(PROJECT_DIR)/wrappers/libproc $(PROJECT_DIR)/wrappers/libproc/spawn $(BUILT_PRODUCTS_DIR)/internal_hdr/include $(BUILT_PRODUCTS_DIR)/mig_hdr/local/include $(BUILT_PRODUCTS_DIR)/mig_hdr/include +SYSTEM_HEADER_SEARCH_PATHS = $(SDKROOT)/$(SDK_INSTALL_HEADERS_ROOT)/System/Library/Frameworks/System.framework/PrivateHeaders $(SDKROOT)/$(SDK_INSTALL_HEADERS_ROOT)/usr/local/include $(SDKROOT)/$(SDK_INSTALL_HEADERS_ROOT)/usr/include +SYSTEM_FRAMEWORK_SEARCH_PATHS = $(SDKROOT)/$(SDK_INSTALL_HEADERS_ROOT)/System/Library/Frameworks + +OTHER_MIGFLAGS = -novouchers -I$(SDKROOT)/$(SDK_INSTALL_HEADERS_ROOT)/System/Library/Frameworks/System.framework/PrivateHeaders -I$(SDKROOT)/$(SDK_INSTALL_HEADERS_ROOT)/usr/local/include -I$(SDKROOT)/$(SDK_INSTALL_HEADERS_ROOT)/usr/include + WARNING_CFLAGS = -Wmost GCC_TREAT_WARNINGS_AS_ERRORS = YES GCC_WARN_ABOUT_MISSING_NEWLINE = YES +GCC_NO_COMMON_BLOCKS = YES +GCC_C_LANGUAGE_STANDARD = gnu11 +CLANG_WARN_BLOCK_CAPTURE_AUTORELEASING = YES +CLANG_WARN_EMPTY_BODY = YES +CLANG_WARN_BOOL_CONVERSION = YES +CLANG_WARN_CONSTANT_CONVERSION = YES +GCC_WARN_64_TO_32_BIT_CONVERSION = NO +CLANG_WARN_ENUM_CONVERSION = YES +CLANG_WARN_INT_CONVERSION = NO +CLANG_WARN_NON_LITERAL_NULL_CONVERSION = YES +CLANG_WARN_INFINITE_RECURSION = YES +GCC_WARN_ABOUT_RETURN_TYPE = YES +CLANG_WARN_STRICT_PROTOTYPES = YES +CLANG_WARN_COMMA = YES +GCC_WARN_UNINITIALIZED_AUTOS = YES +CLANG_WARN_UNREACHABLE_CODE = YES +GCC_WARN_UNUSED_FUNCTION = YES +GCC_WARN_UNUSED_PARAMETER = YES +GCC_WARN_UNUSED_VARIABLE = YES +CLANG_WARN_RANGE_LOOP_ANALYSIS = YES +CLANG_WARN_SUSPICIOUS_MOVE = YES + CODE_SIGN_IDENTITY = - DYLIB_CURRENT_VERSION = $(RC_ProjectSourceVersion) DYLIB_LDFLAGS = -umbrella System -all_load @@ -34,9 +68,12 @@ DYLIB_LDFLAGS[sdk=watchos*] = $(inherited) -Wl,-sectalign,__DATA,__data,1000 DYLIB_LDFLAGS[sdk=tvos*] = $(inherited) -Wl,-sectalign,__DATA,__data,1000 DYLIB_LDFLAGS[sdk=appletvos*] = $(inherited) -Wl,-sectalign,__DATA,__data,1000 DYLIB_LDFLAGS[sdk=bridgeos*] = $(inherited) -Wl,-sectalign,__DATA,__data,1000 -OTHER_LDFLAGS = +OTHER_LDFLAGS = $(SIMULATOR_LDFLAGS) +SIMULATOR_LDFLAGS = +SIMULATOR_LDFLAGS[sdk=macosx*] = -Wl,-simulator_support INSTALLHDRS_SCRIPT_PHASE = YES INSTALLHDRS_COPY_PHASE = YES USE_HEADERMAP = NO LINK_WITH_STANDARD_LIBRARIES = NO ALWAYS_SEARCH_USER_PATHS = YES +IS_ZIPPERED = YES diff --git a/libsyscall/Libsyscall.xcodeproj/project.pbxproj b/libsyscall/Libsyscall.xcodeproj/project.pbxproj index c5f769437..0fba2db30 100644 --- a/libsyscall/Libsyscall.xcodeproj/project.pbxproj +++ b/libsyscall/Libsyscall.xcodeproj/project.pbxproj @@ -44,6 +44,17 @@ name = "MIG headers"; productName = "MIG headers"; }; + E46CB80621FBAC32005D1E53 /* Libsyscall_driverkit */ = { + isa = PBXAggregateTarget; + buildConfigurationList = E46CB80F21FBAC32005D1E53 /* Build configuration list for PBXAggregateTarget "Libsyscall_driverkit" */; + buildPhases = ( + ); + dependencies = ( + E46CB80721FBAC32005D1E53 /* PBXTargetDependency */, + ); + name = Libsyscall_driverkit; + productName = Build; + }; /* End PBXAggregateTarget section */ /* Begin PBXBuildFile section */ @@ -107,6 +118,7 @@ 29A59AE2183B0DE000E8B896 /* renameat.c in Sources */ = {isa = PBXBuildFile; fileRef = 29A59AE1183B0DE000E8B896 /* renameat.c */; }; 29A59AE6183B110C00E8B896 /* unlinkat.c in Sources */ = {isa = PBXBuildFile; fileRef = 29A59AE5183B110C00E8B896 /* unlinkat.c */; }; 2BA88DCC1810A3CE00EB63F6 /* coalition.c in Sources */ = {isa = PBXBuildFile; fileRef = 2BA88DCB1810A3CE00EB63F6 /* coalition.c */; }; + 2C4853EC221C82160008D1F5 /* os_channel_event.c in Sources */ = {isa = PBXBuildFile; fileRef = 2C4853EB221C82160008D1F5 /* os_channel_event.c */; settings = {COMPILER_FLAGS = "-fno-builtin"; }; }; 3695DCC91F3D2C5F0072C0B3 /* reboot.c in Sources */ = {isa = PBXBuildFile; fileRef = 3695DCC81F3D2C5A0072C0B3 /* reboot.c */; }; 374A36E314748F1300AAF39D /* varargs_wrappers.s in Sources */ = {isa = PBXBuildFile; fileRef = 374A36E214748EE400AAF39D /* varargs_wrappers.s */; }; 3F538F891A659C5600B37EFD /* persona.c in Sources */ = {isa = PBXBuildFile; fileRef = 3F538F881A659C5600B37EFD /* persona.c */; }; @@ -124,9 +136,15 @@ 4BCDD8B220741C2F00FA37A3 /* mach_right.c in Sources */ = {isa = PBXBuildFile; fileRef = 4BCDD8B120741C2F00FA37A3 /* mach_right.c */; }; 4BDD5F1D1891AB2F004BF300 /* mach_approximate_time.c in Sources */ = {isa = PBXBuildFile; fileRef = 4BDD5F1B1891AB2F004BF300 /* mach_approximate_time.c */; }; 4BDD5F1E1891AB2F004BF300 /* mach_approximate_time.s in Sources */ = {isa = PBXBuildFile; fileRef = 4BDD5F1C1891AB2F004BF300 /* mach_approximate_time.s */; }; + 6E6EAA8121EA5AC6001C5D04 /* restartable.defs in Sources */ = {isa = PBXBuildFile; fileRef = 6E6EAA8021EA5AAE001C5D04 /* restartable.defs */; }; 726D915520ACD7FC0039A2FE /* mach_bridge_remote_time.c in Sources */ = {isa = PBXBuildFile; fileRef = 726D915420ACD7FC0039A2FE /* mach_bridge_remote_time.c */; }; + 72950DF822418FAC00EFD5E0 /* proc.h in CopyFiles */ = {isa = PBXBuildFile; fileRef = 72AAD8692241878C001511C3 /* proc.h */; }; 729B7D0A15C8938C000E2501 /* carbon_delete.c in Sources */ = {isa = PBXBuildFile; fileRef = FB50F1B315AB7DE700F814BA /* carbon_delete.c */; }; + 72AAD86A22418795001511C3 /* proc.h in Headers */ = {isa = PBXBuildFile; fileRef = 72AAD8692241878C001511C3 /* proc.h */; }; 72B1E6ED190723DB00FB3FA2 /* guarded_open_dprotected_np.c in Sources */ = {isa = PBXBuildFile; fileRef = 72B1E6EC190723DB00FB3FA2 /* guarded_open_dprotected_np.c */; }; + 72D6AFCF22421725004CD782 /* proc.h in Headers */ = {isa = PBXBuildFile; fileRef = 72AAD8692241878C001511C3 /* proc.h */; }; + 72D6AFD122421753004CD782 /* proc.h in CopyFiles */ = {isa = PBXBuildFile; fileRef = 72AAD8692241878C001511C3 /* proc.h */; }; + 72DE4B6A224174D0007844CB /* proc.c in Sources */ = {isa = PBXBuildFile; fileRef = 72DE4B69224174D0007844CB /* proc.c */; }; 72E09E941B444B19006F11A4 /* mach_continuous_time.c in Sources */ = {isa = PBXBuildFile; fileRef = 72FB18801B437F7A00181A5B /* mach_continuous_time.c */; }; 74119F46188F3B6A00C6F48F /* vm_page_size.h in Headers */ = {isa = PBXBuildFile; fileRef = 7466C923170CB99B004557CC /* vm_page_size.h */; }; 7466C924170CBA53004557CC /* vm_page_size.h in Headers */ = {isa = PBXBuildFile; fileRef = 7466C923170CB99B004557CC /* vm_page_size.h */; }; @@ -141,6 +159,7 @@ 929FD46F1C5711DB0087B9C8 /* mach_timebase_info.c in Sources */ = {isa = PBXBuildFile; fileRef = 929FD46E1C5711CF0087B9C8 /* mach_timebase_info.c */; }; 978228281B8678DC008385AC /* pselect-darwinext.c in Sources */ = {isa = PBXBuildFile; fileRef = 978228271B8678CB008385AC /* pselect-darwinext.c */; }; 978228291B8678DF008385AC /* pselect-darwinext-cancel.c in Sources */ = {isa = PBXBuildFile; fileRef = 978228261B8678C2008385AC /* pselect-darwinext-cancel.c */; }; + 9C4B507422273E0F00F068C1 /* log_data.c in Sources */ = {isa = PBXBuildFile; fileRef = 9C4B507322273E0F00F068C1 /* log_data.c */; }; 9C6DA3D220A3D09F0090330B /* mach_sync_ipc.h in Headers */ = {isa = PBXBuildFile; fileRef = 9C6DA3D120A3D09F0090330B /* mach_sync_ipc.h */; }; 9C6DA3D320A3D09F0090330B /* mach_sync_ipc.h in Headers */ = {isa = PBXBuildFile; fileRef = 9C6DA3D120A3D09F0090330B /* mach_sync_ipc.h */; }; 9CCF28271E68E993002EE6CD /* pid_shutdown_networking.c in Sources */ = {isa = PBXBuildFile; fileRef = 9CCF28261E68E993002EE6CD /* pid_shutdown_networking.c */; }; @@ -150,6 +169,7 @@ A50BD5301DDA5500006622C8 /* thread_self_restrict.h in Headers */ = {isa = PBXBuildFile; fileRef = A50BD52E1DDA548F006622C8 /* thread_self_restrict.h */; }; A59CB95616669EFB00B064B3 /* stack_logging_internal.h in Headers */ = {isa = PBXBuildFile; fileRef = A59CB95516669DB700B064B3 /* stack_logging_internal.h */; }; A59CB9581666A1A200B064B3 /* munmap.c in Sources */ = {isa = PBXBuildFile; fileRef = A59CB9571666A1A200B064B3 /* munmap.c */; }; + AE69785A22405C21001445CE /* memory_entry.defs in Sources */ = {isa = PBXBuildFile; fileRef = AE69785922405C21001445CE /* memory_entry.defs */; }; BA0D9FB1199031AD007E8A73 /* kdebug_trace.c in Sources */ = {isa = PBXBuildFile; fileRef = BA0D9FB0199031AD007E8A73 /* kdebug_trace.c */; }; BA4414AA18336A5F00AAE813 /* mach in CopyFiles */ = {isa = PBXBuildFile; fileRef = BA4414A51833697C00AAE813 /* mach */; }; BA4414AB18336A6400AAE813 /* servers in CopyFiles */ = {isa = PBXBuildFile; fileRef = BA4414A6183369A100AAE813 /* servers */; }; @@ -315,9 +335,36 @@ remoteGlobalIDString = BA4414A1183366E600AAE813; remoteInfo = "MIG headers"; }; + E46CB80821FBAC32005D1E53 /* PBXContainerItemProxy */ = { + isa = PBXContainerItemProxy; + containerPortal = 08FB7793FE84155DC02AAC07 /* Project object */; + proxyType = 1; + remoteGlobalIDString = 249C60FE1194747600ED73F3; + remoteInfo = Libmach; + }; /* End PBXContainerItemProxy section */ /* Begin PBXCopyFilesBuildPhase section */ + 72950DF722418F7F00EFD5E0 /* CopyFiles */ = { + isa = PBXCopyFilesBuildPhase; + buildActionMask = 8; + dstPath = "${OS_PUBLIC_HEADERS_FOLDER_PATH}"; + dstSubfolderSpec = 0; + files = ( + 72950DF822418FAC00EFD5E0 /* proc.h in CopyFiles */, + ); + runOnlyForDeploymentPostprocessing = 1; + }; + 72D6AFD02242173F004CD782 /* CopyFiles */ = { + isa = PBXCopyFilesBuildPhase; + buildActionMask = 8; + dstPath = "${OS_PUBLIC_HEADERS_FOLDER_PATH}"; + dstSubfolderSpec = 0; + files = ( + 72D6AFD122421753004CD782 /* proc.h in CopyFiles */, + ); + runOnlyForDeploymentPostprocessing = 1; + }; BA4414A818336A1300AAE813 /* CopyFiles */ = { isa = PBXCopyFilesBuildPhase; buildActionMask = 8; @@ -502,6 +549,7 @@ 29A59AE1183B0DE000E8B896 /* renameat.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; path = renameat.c; sourceTree = ""; }; 29A59AE5183B110C00E8B896 /* unlinkat.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; path = unlinkat.c; sourceTree = ""; }; 2BA88DCB1810A3CE00EB63F6 /* coalition.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; path = coalition.c; sourceTree = ""; }; + 2C4853EB221C82160008D1F5 /* os_channel_event.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; name = os_channel_event.c; path = skywalk/os_channel_event.c; sourceTree = ""; }; 3695DCC81F3D2C5A0072C0B3 /* reboot.c */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.c; path = reboot.c; sourceTree = ""; }; 374A36E214748EE400AAF39D /* varargs_wrappers.s */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = varargs_wrappers.s; sourceTree = ""; }; 37DDFB7614748713009D3355 /* syscall.map */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = text; path = syscall.map; sourceTree = ""; }; @@ -519,8 +567,11 @@ 4BCDD8B120741C2F00FA37A3 /* mach_right.c */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.c; path = mach_right.c; sourceTree = ""; }; 4BDD5F1B1891AB2F004BF300 /* mach_approximate_time.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; path = mach_approximate_time.c; sourceTree = ""; }; 4BDD5F1C1891AB2F004BF300 /* mach_approximate_time.s */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = mach_approximate_time.s; sourceTree = ""; }; - 726D915420ACD7FC0039A2FE /* mach_bridge_remote_time.c */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.c; name = mach_bridge_remote_time.c; path = wrappers/mach_bridge_remote_time.c; sourceTree = ""; }; + 6E6EAA8021EA5AAE001C5D04 /* restartable.defs */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.mig; path = restartable.defs; sourceTree = ""; }; + 726D915420ACD7FC0039A2FE /* mach_bridge_remote_time.c */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.c; path = mach_bridge_remote_time.c; sourceTree = ""; }; + 72AAD8692241878C001511C3 /* proc.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = proc.h; sourceTree = ""; }; 72B1E6EC190723DB00FB3FA2 /* guarded_open_dprotected_np.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; path = guarded_open_dprotected_np.c; sourceTree = ""; }; + 72DE4B69224174D0007844CB /* proc.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; path = proc.c; sourceTree = ""; }; 72FB18801B437F7A00181A5B /* mach_continuous_time.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; path = mach_continuous_time.c; sourceTree = ""; }; 7466C923170CB99B004557CC /* vm_page_size.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = vm_page_size.h; sourceTree = ""; }; 7AE28FDE18AC41B1006A5626 /* csr.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; path = csr.c; sourceTree = ""; }; @@ -532,11 +583,13 @@ 929FD46E1C5711CF0087B9C8 /* mach_timebase_info.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; path = mach_timebase_info.c; sourceTree = ""; }; 978228261B8678C2008385AC /* pselect-darwinext-cancel.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; path = "pselect-darwinext-cancel.c"; sourceTree = ""; }; 978228271B8678CB008385AC /* pselect-darwinext.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; path = "pselect-darwinext.c"; sourceTree = ""; }; + 9C4B507322273E0F00F068C1 /* log_data.c */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.c; path = log_data.c; sourceTree = ""; }; 9C6DA3D120A3D09F0090330B /* mach_sync_ipc.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = mach_sync_ipc.h; sourceTree = ""; }; 9CCF28261E68E993002EE6CD /* pid_shutdown_networking.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; path = pid_shutdown_networking.c; sourceTree = ""; }; A50BD52E1DDA548F006622C8 /* thread_self_restrict.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = thread_self_restrict.h; sourceTree = ""; }; A59CB95516669DB700B064B3 /* stack_logging_internal.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = stack_logging_internal.h; sourceTree = ""; }; A59CB9571666A1A200B064B3 /* munmap.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; path = munmap.c; sourceTree = ""; }; + AE69785922405C21001445CE /* memory_entry.defs */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.mig; path = memory_entry.defs; sourceTree = ""; }; BA0D9FB0199031AD007E8A73 /* kdebug_trace.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; path = kdebug_trace.c; sourceTree = ""; }; BA4414A51833697C00AAE813 /* mach */ = {isa = PBXFileReference; lastKnownFileType = text; name = mach; path = mig_hdr/include/mach; sourceTree = BUILT_PRODUCTS_DIR; }; BA4414A6183369A100AAE813 /* servers */ = {isa = PBXFileReference; lastKnownFileType = text; name = servers; path = mig_hdr/include/servers; sourceTree = BUILT_PRODUCTS_DIR; }; @@ -669,7 +722,6 @@ 08FB7794FE84155DC02AAC07 /* mach */ = { isa = PBXGroup; children = ( - 726D915420ACD7FC0039A2FE /* mach_bridge_remote_time.c */, C9D9BE0F114FFADC0000D8B9 /* Libsyscall.xcconfig */, 24D1158911E672270063D54D /* Platforms */, 24D1156511E671B20063D54D /* custom */, @@ -687,7 +739,6 @@ 08FB7795FE84155DC02AAC07 /* mach */ = { isa = PBXGroup; children = ( - C9001751206B00850070D674 /* port_descriptions.c */, 247A08FF11F8E18000E4693F /* abort.h */, C9D9BCC5114B00600000D8B9 /* clock_priv.defs */, C9D9BCC6114B00600000D8B9 /* clock_reply.defs */, @@ -702,18 +753,18 @@ C9D9BCCF114B00600000D8B9 /* err_us.sub */, C9D9BCD0114B00600000D8B9 /* error_codes.c */, C9D9BCD1114B00600000D8B9 /* errorlib.h */, - 247A091611F8E7A800E4693F /* exc_catcher.h */, C9D9BCD2114B00600000D8B9 /* exc_catcher_state_identity.c */, C9D9BCD3114B00600000D8B9 /* exc_catcher_state.c */, C9D9BCD4114B00600000D8B9 /* exc_catcher.c */, + 247A091611F8E7A800E4693F /* exc_catcher.h */, C9D9BCD5114B00600000D8B9 /* exc.defs */, C9D9BCD6114B00600000D8B9 /* externs.h */, C9D9BCD7114B00600000D8B9 /* fprintf_stderr.c */, - C9D9BCD8114B00600000D8B9 /* mach */, C9D9BCE4114B00600000D8B9 /* host_priv.defs */, - BABA36CA1A856C4700BBBCF7 /* host.c */, C9D9BCE5114B00600000D8B9 /* host_security.defs */, + BABA36CA1A856C4700BBBCF7 /* host.c */, C9D9BCEA114B00600000D8B9 /* lock_set.defs */, + C9D9BCD8114B00600000D8B9 /* mach */, C9D9BCEB114B00600000D8B9 /* mach_error_string.c */, C9D9BCEC114B00600000D8B9 /* mach_error.c */, C9D9BCED114B00600000D8B9 /* mach_host.defs */, @@ -725,25 +776,28 @@ 4BCDD8B120741C2F00FA37A3 /* mach_right.c */, C9D9BCF3114B00600000D8B9 /* mach_traps.s */, 291D3C271354FDD100D46061 /* mach_vm.c */, - E4216C301822D404006F2632 /* mach_voucher.defs */, C9D9BCF4114B00600000D8B9 /* mach_vm.defs */, + AE69785922405C21001445CE /* memory_entry.defs */, + E4216C301822D404006F2632 /* mach_voucher.defs */, C9D9BCF6114B00600000D8B9 /* mig_allocate.c */, C9D9BCF7114B00600000D8B9 /* mig_deallocate.c */, + 24484A9311F61D1900E10CD2 /* mig_reply_port.c */, C9D9BCF8114B00600000D8B9 /* mig_reply_setup.c */, C9D9BCF9114B00600000D8B9 /* mig_strncpy.c */, - 24484A9311F61D1900E10CD2 /* mig_reply_port.c */, C9D9BCFA114B00600000D8B9 /* ms_thread_switch.c */, C9D9BCFB114B00600000D8B9 /* notify.defs */, C9D9BCFC114B00600000D8B9 /* panic.c */, + C9001751206B00850070D674 /* port_descriptions.c */, C9D9BCFD114B00600000D8B9 /* port_obj.c */, C9D9BD03114B00600000D8B9 /* processor_set.defs */, C9D9BD04114B00600000D8B9 /* processor.defs */, + 6E6EAA8021EA5AAE001C5D04 /* restartable.defs */, C9D9BD06114B00600000D8B9 /* semaphore.c */, C9D9BD07114B00600000D8B9 /* servers */, C9D9BD0E114B00600000D8B9 /* slot_name.c */, A59CB95516669DB700B064B3 /* stack_logging_internal.h */, - 24484A7311F51E9800E10CD2 /* string.h */, 24484A7411F51E9800E10CD2 /* string.c */, + 24484A7311F51E9800E10CD2 /* string.h */, C9D9BD0F114B00600000D8B9 /* task.defs */, C962B16D18DBB43F0031244A /* thread_act.c */, C9D9BD10114B00600000D8B9 /* thread_act.defs */, @@ -816,6 +870,7 @@ E4D45C2316F856900002AF25 /* mach_absolute_time.s */, 4BDD5F1B1891AB2F004BF300 /* mach_approximate_time.c */, 4BDD5F1C1891AB2F004BF300 /* mach_approximate_time.s */, + 726D915420ACD7FC0039A2FE /* mach_bridge_remote_time.c */, 925559911CBBBBB300E527CE /* mach_boottime.c */, 72FB18801B437F7A00181A5B /* mach_continuous_time.c */, 14FE60EB1B7D3BED00ACB44C /* mach_get_times.c */, @@ -824,6 +879,7 @@ 3F538F881A659C5600B37EFD /* persona.c */, 9CCF28261E68E993002EE6CD /* pid_shutdown_networking.c */, C6BEE9171806840200D25AAB /* posix_sem_obsolete.c */, + 72DE4B69224174D0007844CB /* proc.c */, BA9973461C3B4C8A00B14D8C /* quota_obsolete.c */, 24B8C2611237F53900D36CC3 /* remove-counter.c */, 248AA966122C7CDA0085F5B1 /* rename.c */, @@ -984,6 +1040,7 @@ 401BB7141BCAE523005080D3 /* skywalk */ = { isa = PBXGroup; children = ( + 2C4853EB221C82160008D1F5 /* os_channel_event.c */, 405FA3381E0C669D007D66EA /* os_packet.c */, 40DD162F1E4ACCAA003297CC /* cpu_copy_in_cksum.s */, 409A78301E4EB3D900E0699B /* cpu_in_cksum.s */, @@ -1009,8 +1066,10 @@ isa = PBXGroup; children = ( C9C1824F15338C0B00933F23 /* alloc_once.c */, - C9EE57F51669673D00337E4B /* tsd.h */, + 9C4B507322273E0F00F068C1 /* log_data.c */, + 72AAD8692241878C001511C3 /* proc.h */, A50BD52E1DDA548F006622C8 /* thread_self_restrict.h */, + C9EE57F51669673D00337E4B /* tsd.h */, ); path = os; sourceTree = ""; @@ -1108,6 +1167,7 @@ A50BD52F1DDA548F006622C8 /* thread_self_restrict.h in Headers */, C6D3EFC116542C510052CF30 /* vm_task.h in Headers */, C6D3EFC216542C510052CF30 /* key_defs.h in Headers */, + 72D6AFCF22421725004CD782 /* proc.h in Headers */, C6D3EFC316542C510052CF30 /* ls_defs.h in Headers */, C6D3EFC416542C510052CF30 /* netname_defs.h in Headers */, C6D3EFC516542C510052CF30 /* nm_defs.h in Headers */, @@ -1157,6 +1217,7 @@ A59CB95616669EFB00B064B3 /* stack_logging_internal.h in Headers */, E4D45C3F16FB20D30002AF25 /* spawn.h in Headers */, E4512B8C21AFDF1600673F73 /* mach_right_private.h in Headers */, + 72AAD86A22418795001511C3 /* proc.h in Headers */, 9C6DA3D220A3D09F0090330B /* mach_sync_ipc.h in Headers */, E4D45C4016FB20DC0002AF25 /* spawn_private.h in Headers */, E4D45C2F16F868ED0002AF25 /* libproc.h in Headers */, @@ -1193,6 +1254,7 @@ C6D3EFCA16542C510052CF30 /* CopyFiles */, BA4414B418336E1A00AAE813 /* Copy Files */, BA4414B718336E5600AAE813 /* CopyFiles */, + 72D6AFD02242173F004CD782 /* CopyFiles */, C6D3EFCD16542C510052CF30 /* Sources */, ); buildRules = ( @@ -1213,6 +1275,7 @@ C63F480B1654203800A1F78F /* CopyFiles */, BA4414A818336A1300AAE813 /* CopyFiles */, BA4414AC18336A7700AAE813 /* CopyFiles */, + 72950DF722418F7F00EFD5E0 /* CopyFiles */, D2AAC0610554660B00DB518D /* Sources */, D289988505E68E00004EDB86 /* Frameworks */, ); @@ -1252,6 +1315,7 @@ 249C61101194755D00ED73F3 /* Build */, 24614EF311E7C98600E78584 /* Syscalls */, BA4414A1183366E600AAE813 /* MIG headers */, + E46CB80621FBAC32005D1E53 /* Libsyscall_driverkit */, D2AAC0620554660B00DB518D /* Libsyscall_static */, 249C60FE1194747600ED73F3 /* Libsyscall_dynamic */, C6D3EFB216542C510052CF30 /* Libsyscall_headers_Sim */, @@ -1272,7 +1336,7 @@ ); runOnlyForDeploymentPostprocessing = 0; shellPath = /bin/sh; - shellScript = "set -x\n[[ $ACTION == \"installhdrs\" ]] && exit 0\n\nmkdir -p $OBJROOT/sys\n\n$SRCROOT/xcodescripts/create-syscalls.pl \\\n\t$SRCROOT/../bsd/kern/syscalls.master \\\n\t$SRCROOT/custom \\\n\t$SRCROOT/Platforms \\\n\t$MAP_PLATFORM \\\n\t$OBJROOT/sys\n"; + shellScript = "set -x\n[[ $ACTION == \"installhdrs\" ]] && exit 0\n# workaround 48125283\n[ -n \"$DRIVERKIT_DEPLOYMENT_TARGET\" ] && unset MACOSX_DEPLOYMENT_TARGET\n\nmkdir -p $OBJROOT/sys\n\n$SRCROOT/xcodescripts/create-syscalls.pl \\\n\t$SRCROOT/../bsd/kern/syscalls.master \\\n\t$SRCROOT/custom \\\n\t$SRCROOT/Platforms \\\n\t$MAP_PLATFORM \\\n\t$OBJROOT/sys\n"; }; 24614EF611E7C9A000E78584 /* Compile Syscalls */ = { isa = PBXShellScriptBuildPhase; @@ -1287,7 +1351,7 @@ ); runOnlyForDeploymentPostprocessing = 0; shellPath = /bin/sh; - shellScript = "set -x\n[[ $ACTION == \"installhdrs\" ]] && exit 0\n\nmkdir -p $OBJROOT/UninstalledProducts\n\n$SRCROOT/xcodescripts/compile-syscalls.pl \\\n\t$OBJROOT/sys/stubs.list \\\n\t$BUILD_ROOT/syscalls.a"; + shellScript = "set -x\n[[ $ACTION == \"installhdrs\" ]] && exit 0\n# workaround 48125283\n[ -n \"$DRIVERKIT_DEPLOYMENT_TARGET\" ] && unset MACOSX_DEPLOYMENT_TARGET\n\nmkdir -p $OBJROOT/UninstalledProducts\n\n$SRCROOT/xcodescripts/compile-syscalls.pl \\\n\t$OBJROOT/sys/stubs.list \\\n\t$BUILD_ROOT/syscalls.a\n"; }; BA4414A41833672200AAE813 /* Generate MIG Headers */ = { isa = PBXShellScriptBuildPhase; @@ -1301,7 +1365,7 @@ ); runOnlyForDeploymentPostprocessing = 0; shellPath = /bin/sh; - shellScript = "\"$PROJECT_DIR\"/xcodescripts/mach_install_mig.sh"; + shellScript = "# workaround 48125283\n[ -n \"$DRIVERKIT_DEPLOYMENT_TARGET\" ] && unset MACOSX_DEPLOYMENT_TARGET\n\n\"$PROJECT_DIR\"/xcodescripts/mach_install_mig.sh\n"; }; /* End PBXShellScriptBuildPhase section */ @@ -1381,6 +1445,7 @@ E4D45C2416F856900002AF25 /* __commpage_gettimeofday.c in Sources */, C9D9BD43114B00600000D8B9 /* mig_reply_setup.c in Sources */, 24484A9411F61D2B00E10CD2 /* mig_reply_port.c in Sources */, + 6E6EAA8121EA5AC6001C5D04 /* restartable.defs in Sources */, C9D9BD44114B00600000D8B9 /* mig_strncpy.c in Sources */, C9D9BD45114B00600000D8B9 /* ms_thread_switch.c in Sources */, C9D9BD47114B00600000D8B9 /* panic.c in Sources */, @@ -1393,6 +1458,7 @@ 2485235511582D8F0051B413 /* mach_legacy.c in Sources */, 242AB66611EBDC1200107336 /* errno.c in Sources */, 4BCDD8B220741C2F00FA37A3 /* mach_right.c in Sources */, + 72DE4B6A224174D0007844CB /* proc.c in Sources */, E4D45C2E16F868ED0002AF25 /* libproc.c in Sources */, 24A7C5BC11FF8DA6007669EB /* accept.c in Sources */, 24A7C5BD11FF8DA6007669EB /* bind.c in Sources */, @@ -1409,6 +1475,7 @@ C962B16E18DBB43F0031244A /* thread_act.c in Sources */, 24A7C5C511FF8DA6007669EB /* recvmsg.c in Sources */, 24A7C5C611FF8DA6007669EB /* sendmsg.c in Sources */, + AE69785A22405C21001445CE /* memory_entry.defs in Sources */, 24A7C5C711FF8DA6007669EB /* sendto.c in Sources */, 24A7C5C811FF8DA6007669EB /* setattrlist.c in Sources */, 24A7C5C911FF8DA6007669EB /* socketpair.c in Sources */, @@ -1416,6 +1483,7 @@ 9002401118FC9A7F00D73BFA /* renamex.c in Sources */, 2419382B12135FF6003CDE41 /* chmod.c in Sources */, 248BA01D121C56BF008C073F /* connect.c in Sources */, + 9C4B507422273E0F00F068C1 /* log_data.c in Sources */, 248BA01F121C607E008C073F /* fchmod.c in Sources */, E4D45C3616F86BD80002AF25 /* posix_spawn.c in Sources */, 13B598941A142F6400DB2D5A /* stackshot.c in Sources */, @@ -1466,6 +1534,7 @@ 402AF43F1E5CD88600F1A4B9 /* cpu_in_cksum_gen.c in Sources */, 030B179B135377B400DAD1F0 /* open_dprotected_np.c in Sources */, E4D45C3116F868ED0002AF25 /* proc_listpidspath.c in Sources */, + 2C4853EC221C82160008D1F5 /* os_channel_event.c in Sources */, 374A36E314748F1300AAF39D /* varargs_wrappers.s in Sources */, 291D3C281354FDD100D46061 /* mach_port.c in Sources */, 291D3C291354FDD100D46061 /* mach_vm.c in Sources */, @@ -1506,6 +1575,11 @@ target = BA4414A1183366E600AAE813 /* MIG headers */; targetProxy = BA4414B218336D8D00AAE813 /* PBXContainerItemProxy */; }; + E46CB80721FBAC32005D1E53 /* PBXTargetDependency */ = { + isa = PBXTargetDependency; + target = 249C60FE1194747600ED73F3 /* Libsyscall_dynamic */; + targetProxy = E46CB80821FBAC32005D1E53 /* PBXContainerItemProxy */; + }; /* End PBXTargetDependency section */ /* Begin XCBuildConfiguration section */ @@ -1513,9 +1587,10 @@ isa = XCBuildConfiguration; baseConfigurationReference = C9D9BE0F114FFADC0000D8B9 /* Libsyscall.xcconfig */; buildSettings = { - CLANG_ENABLE_OBJC_WEAK = YES; COPY_PHASE_STRIP = NO; INSTALL_PATH = /usr/local/lib/dyld; + SKIP_INSTALL = "$(SKIP_INSTALL_$(SDK_INSTALL_VARIANT))"; + SKIP_INSTALL_driverkit = YES; STRIP_INSTALLED_PRODUCT = NO; }; name = Release; @@ -1524,35 +1599,6 @@ isa = XCBuildConfiguration; baseConfigurationReference = C9D9BE0F114FFADC0000D8B9 /* Libsyscall.xcconfig */; buildSettings = { - CLANG_ANALYZER_LOCALIZABILITY_NONLOCALIZED = YES; - CLANG_WARN_BLOCK_CAPTURE_AUTORELEASING = YES; - CLANG_WARN_BOOL_CONVERSION = YES; - CLANG_WARN_COMMA = YES; - CLANG_WARN_CONSTANT_CONVERSION = YES; - CLANG_WARN_DEPRECATED_OBJC_IMPLEMENTATIONS = YES; - CLANG_WARN_EMPTY_BODY = YES; - CLANG_WARN_ENUM_CONVERSION = YES; - CLANG_WARN_INFINITE_RECURSION = YES; - CLANG_WARN_INT_CONVERSION = NO; - CLANG_WARN_NON_LITERAL_NULL_CONVERSION = YES; - CLANG_WARN_OBJC_IMPLICIT_RETAIN_SELF = YES; - CLANG_WARN_OBJC_LITERAL_CONVERSION = YES; - CLANG_WARN_RANGE_LOOP_ANALYSIS = YES; - CLANG_WARN_STRICT_PROTOTYPES = YES; - CLANG_WARN_SUSPICIOUS_MOVE = YES; - CLANG_WARN_UNREACHABLE_CODE = YES; - CLANG_WARN__DUPLICATE_METHOD_MATCH = YES; - ENABLE_STRICT_OBJC_MSGSEND = YES; - GCC_C_LANGUAGE_STANDARD = gnu99; - GCC_NO_COMMON_BLOCKS = YES; - GCC_WARN_64_TO_32_BIT_CONVERSION = NO; - GCC_WARN_ABOUT_RETURN_TYPE = YES; - GCC_WARN_UNDECLARED_SELECTOR = YES; - GCC_WARN_UNINITIALIZED_AUTOS = YES; - GCC_WARN_UNUSED_FUNCTION = YES; - GCC_WARN_UNUSED_PARAMETER = YES; - GCC_WARN_UNUSED_VARIABLE = YES; - OTHER_MIGFLAGS = "-novouchers"; }; name = Release; }; @@ -1560,15 +1606,15 @@ isa = XCBuildConfiguration; baseConfigurationReference = C9D9BE0F114FFADC0000D8B9 /* Libsyscall.xcconfig */; buildSettings = { - CLANG_ENABLE_OBJC_WEAK = YES; COPY_PHASE_STRIP = YES; - DEBUG_INFORMATION_FORMAT = "dwarf-with-dsym"; MAP_PLATFORM = "$(MAP_PLATFORM_$(PLATFORM_NAME))"; MAP_PLATFORM_appletvos = iPhoneOS; MAP_PLATFORM_bridgeos = iPhoneOS; MAP_PLATFORM_iphoneos = iPhoneOS; MAP_PLATFORM_iphoneosnano = iPhoneOS; - MAP_PLATFORM_macosx = MacOSX; + MAP_PLATFORM_macosx = "$(MAP_PLATFORM_macosx_$(SDK_INSTALL_VARIANT))"; + MAP_PLATFORM_macosx_default = MacOSX; + MAP_PLATFORM_macosx_driverkit = DriverKit; MAP_PLATFORM_tvos = iPhoneOS; MAP_PLATFORM_watchos = iPhoneOS; PRODUCT_NAME = Syscalls; @@ -1580,8 +1626,10 @@ isa = XCBuildConfiguration; baseConfigurationReference = C9D9BE0F114FFADC0000D8B9 /* Libsyscall.xcconfig */; buildSettings = { - CLANG_ENABLE_OBJC_WEAK = YES; - OTHER_LDFLAGS = "$(DYLIB_LDFLAGS)"; + OTHER_LDFLAGS = ( + "$(SIMULATOR_LDFLAGS)", + "$(DYLIB_LDFLAGS)", + ); VERSION_INFO_PREFIX = "___"; }; name = Release; @@ -1590,7 +1638,6 @@ isa = XCBuildConfiguration; baseConfigurationReference = C9D9BE0F114FFADC0000D8B9 /* Libsyscall.xcconfig */; buildSettings = { - CLANG_ENABLE_OBJC_WEAK = YES; COPY_PHASE_STRIP = NO; INSTALLHDRS_COPY_PHASE = NO; PRODUCT_NAME = Build; @@ -1602,7 +1649,6 @@ isa = XCBuildConfiguration; baseConfigurationReference = C9D9BE0F114FFADC0000D8B9 /* Libsyscall.xcconfig */; buildSettings = { - CLANG_ENABLE_OBJC_WEAK = YES; PRODUCT_NAME = "$(TARGET_NAME)"; }; name = Release; @@ -1611,13 +1657,22 @@ isa = XCBuildConfiguration; baseConfigurationReference = C9D9BE0F114FFADC0000D8B9 /* Libsyscall.xcconfig */; buildSettings = { - CLANG_ENABLE_OBJC_WEAK = YES; COPY_PHASE_STRIP = NO; PRODUCT_NAME = Libsyscall_headers_Sim; SKIP_INSTALL = YES; }; name = Release; }; + E46CB81021FBAC32005D1E53 /* Release */ = { + isa = XCBuildConfiguration; + baseConfigurationReference = C9D9BE0F114FFADC0000D8B9 /* Libsyscall.xcconfig */; + buildSettings = { + COPY_PHASE_STRIP = NO; + PRODUCT_NAME = "$(TARGET_NAME)"; + STRIP_STYLE = debugging; + }; + name = Release; + }; /* End XCBuildConfiguration section */ /* Begin XCConfigurationList section */ @@ -1677,6 +1732,14 @@ defaultConfigurationIsVisible = 0; defaultConfigurationName = Release; }; + E46CB80F21FBAC32005D1E53 /* Build configuration list for PBXAggregateTarget "Libsyscall_driverkit" */ = { + isa = XCConfigurationList; + buildConfigurations = ( + E46CB81021FBAC32005D1E53 /* Release */, + ); + defaultConfigurationIsVisible = 0; + defaultConfigurationName = Release; + }; /* End XCConfigurationList section */ }; rootObject = 08FB7793FE84155DC02AAC07 /* Project object */; diff --git a/libsyscall/Platforms/DriverKit/x86_64/syscall.map b/libsyscall/Platforms/DriverKit/x86_64/syscall.map new file mode 100644 index 000000000..9aa2064cb --- /dev/null +++ b/libsyscall/Platforms/DriverKit/x86_64/syscall.map @@ -0,0 +1,61 @@ +_accept$NOCANCEL ___accept_nocancel +_aio_suspend$NOCANCEL ___aio_suspend_nocancel +_close$NOCANCEL ___close_nocancel +_connect$NOCANCEL ___connect_nocancel +_fstat ___fstat64 +_fstatat ___fstatat64 +_fstatfs ___fstatfs64 +_fsync$NOCANCEL ___fsync_nocancel +_getfsstat ___getfsstat64 +_lstat ___lstat64 +_msgrcv$NOCANCEL ___msgrcv_nocancel +_msgsnd$NOCANCEL ___msgsnd_nocancel +_msgsys ___msgsys +_msync$NOCANCEL ___msync_nocancel +_open$NOCANCEL ___open_nocancel +_openat$NOCANCEL ___openat_nocancel +_poll$NOCANCEL ___poll_nocancel +_pread$NOCANCEL ___pread_nocancel +_pwrite$NOCANCEL ___pwrite_nocancel +_read$NOCANCEL ___read_nocancel +_readv$NOCANCEL ___readv_nocancel +_recvfrom$NOCANCEL ___recvfrom_nocancel +_recvmsg$NOCANCEL ___recvmsg_nocancel +_select$DARWIN_EXTSN ___select +_select$DARWIN_EXTSN$NOCANCEL ___select_nocancel +_sem_wait$NOCANCEL ___sem_wait_nocancel +_semsys ___semsys +_sendmsg$NOCANCEL ___sendmsg_nocancel +_sendto$NOCANCEL ___sendto_nocancel +_stat ___stat64 +_statfs ___statfs64 +_waitid$NOCANCEL ___waitid_nocancel +_write$NOCANCEL ___write_nocancel +_writev$NOCANCEL ___writev_nocancel + +_accept ___accept +_bind ___bind +_connect ___connect +_getattrlist ___getattrlist +_getpeername ___getpeername +_getsockname ___getsockname +_lchown ___lchown +_listen ___listen +_mprotect ___mprotect +_msgctl ___msgctl +_msync ___msync +_open ___open +_openat ___openat +_recvfrom ___recvfrom +_recvmsg ___recvmsg +_semctl ___semctl +_sem_open ___sem_open +_sendmsg ___sendmsg +_sendto ___sendto +_setattrlist ___setattrlist +_setregid ___setregid +_setreuid ___setreuid +_shmctl ___shmctl +_shmsys ___shmsys +_shm_open ___shm_open +_socketpair ___socketpair diff --git a/libsyscall/custom/SYS.h b/libsyscall/custom/SYS.h index a62cc1f99..ffc6a8f2e 100644 --- a/libsyscall/custom/SYS.h +++ b/libsyscall/custom/SYS.h @@ -450,6 +450,7 @@ pseudo: ;\ PUSH_FRAME %%\ bl _##cerror %%\ POP_FRAME %%\ + ret %%\ 2: #define MI_GET_ADDRESS(reg,var) \ diff --git a/libsyscall/mach/abort.h b/libsyscall/mach/abort.h index d80b8d6c0..f9f4212b8 100644 --- a/libsyscall/mach/abort.h +++ b/libsyscall/mach/abort.h @@ -34,7 +34,7 @@ extern int __getpid(void); extern int __kill(int pid, int signum, int posix); extern int __exit(int) __attribute__((noreturn)); -static inline void __attribute__((noreturn)) +static inline void __attribute__((noreturn, cold)) abort(void) { (void)__kill(__getpid(), __SIGABRT, 0); diff --git a/libsyscall/mach/err_iokit.sub b/libsyscall/mach/err_iokit.sub index b5361b8be..7a105843b 100644 --- a/libsyscall/mach/err_iokit.sub +++ b/libsyscall/mach/err_iokit.sub @@ -29,7 +29,7 @@ #include #include -#if !TARGET_OS_EMBEDDED +#if !(TARGET_OS_IPHONE && !TARGET_OS_SIMULATOR) #include #include #endif @@ -97,7 +97,7 @@ static const char * const err_codes_iokit_common[] = { "(iokit/common) data was not found", // 0x2f0 }; -#if !TARGET_OS_EMBEDDED +#if !(TARGET_OS_IPHONE && !TARGET_OS_SIMULATOR) static const struct error_sparse_map err_codes_iokit_usb_map[] = { err_code_map_entry(kIOUSBCRCErr, kIOUSBDataToggleErr), err_code_map_entry(kIOUSBPIDCheckErr, kIOUSBWrongPIDErr), @@ -203,7 +203,7 @@ static const char * const err_codes_iokit_bluetooth[] = { "(iokit/bluetooth) no HCI controller", // 003 "(iokit/bluetooth) changing power states is unsupported", // 004 }; -#endif /* !TARGET_OS_EMBEDDED */ +#endif /* !(TARGET_OS_IPHONE && !TARGET_OS_SIMULATOR) */ static const struct error_sparse_map err_iokit_sub_map[] = { err_sub_map_entry(sub_iokit_common, sub_iokit_pmu), @@ -220,7 +220,7 @@ static const struct error_subsystem err_iokit_sub[] = err_codes_iokit_common_map, errlib_count(err_codes_iokit_common_map), }, -#if !TARGET_OS_EMBEDDED +#if !(TARGET_OS_IPHONE && !TARGET_OS_SIMULATOR) /* 1 */ { "(iokit/usb)", // 0xe0004000 errlib_count(err_codes_iokit_usb), @@ -235,20 +235,20 @@ static const struct error_subsystem err_iokit_sub[] = err_codes_iokit_fw_map, errlib_count(err_codes_iokit_fw_map), }, -#endif /* !TARGET_OS_EMBEDDED */ +#endif /* !(TARGET_OS_IPHONE && !TARGET_OS_SIMULATOR) */ /* 3 */ err_iokit_null_sub, // 0xe000c000 /* 4 */ { "(iokit/blkstorage)", 0 }, // 0xe0010000 /* 5 */ { "(iokit/graphics)", 0 }, // 0xe0014000 /* 6 */ err_iokit_null_sub, // 0xe0018000 /* 7 */ err_iokit_null_sub, // 0xe001c000 -#if !TARGET_OS_EMBEDDED +#if !(TARGET_OS_IPHONE && !TARGET_OS_SIMULATOR) /* 8 */ { "(iokit/bluetooth)", // 0xe0020000 errlib_count(err_codes_iokit_bluetooth), err_codes_iokit_bluetooth, NULL, 0, }, -#endif /* !TARGET_OS_EMBEDDED */ +#endif /* !(TARGET_OS_IPHONE && !TARGET_OS_SIMULATOR) */ /* 9 */ { "(iokit/pmu)", 0 }, // 0xe0024000 /* -2 */ { "(iokit/vendor)", 0 }, // 0xe0028000 /* -1 */ { "(iokit/reserved)", 0 }, // 0xe002c000 diff --git a/libsyscall/mach/err_mach_ipc.sub b/libsyscall/mach/err_mach_ipc.sub index b0e82564c..97aa377c7 100644 --- a/libsyscall/mach/err_mach_ipc.sub +++ b/libsyscall/mach/err_mach_ipc.sub @@ -55,10 +55,11 @@ static const char * const err_codes_mach_send[] = { /* 15 */ "(ipc/send) invalid msg-type", /* 16 */ "(ipc/send) invalid msg-header", /* 17 */ "(ipc/send) invalid msg-trailer", - /* 18 */ "(ipc/send) unused error", + /* 18 */ "(ipc/send) invalid context for reply", /* 19 */ "(ipc/send) unused error", /* 20 */ "(ipc/send) unused error", /* 21 */ "(ipc/send) out-of-line buffer too large", + /* 22 */ "(ipc/send) destination does not accept OOL ports", }; static const char * const err_codes_mach_rcv[] = { diff --git a/libsyscall/mach/error_codes.c b/libsyscall/mach/error_codes.c index 4ec633dfa..4aae84fc2 100644 --- a/libsyscall/mach/error_codes.c +++ b/libsyscall/mach/error_codes.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2003-2010 Apple Inc. All rights reserved. + * Copyright (c) 2003-2019 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -62,10 +62,13 @@ * Generic error code interface */ +#include #include #include "errorlib.h" +#if !TARGET_OS_DRIVERKIT #include "err_libkern.sub" #include "err_iokit.sub" +#endif // !TARGET_OS_DRIVERKIT #include "err_ipc.sub" #include "err_kern.sub" #include "err_mach_ipc.sub" @@ -75,33 +78,33 @@ const struct error_system _mach_errors[err_max_system + 1] = { /* 0; err_kern */ { - errlib_count(err_os_sub), - "(operating system/?) unknown subsystem error", - err_os_sub, + .max_sub = errlib_count(err_os_sub), + .bad_sub = "(operating system/?) unknown subsystem error", + .subsystem = err_os_sub, }, /* 1; err_us */ { - errlib_count(err_us_sub), - "(user space/?) unknown subsystem error", - err_us_sub, + .max_sub = errlib_count(err_us_sub), + .bad_sub = "(user space/?) unknown subsystem error", + .subsystem = err_us_sub, }, /* 2; err_server */ { - errlib_count(err_server_sub), - "(server/?) unknown subsystem error", - err_server_sub, + .max_sub = errlib_count(err_server_sub), + .bad_sub = "(server/?) unknown subsystem error", + .subsystem = err_server_sub, }, /* 3 (& 3f); err_ipc */ { - errlib_count(err_ipc_sub), - "(ipc/?) unknown subsystem error", - err_ipc_sub, + .max_sub = errlib_count(err_ipc_sub), + .bad_sub = "(ipc/?) unknown subsystem error", + .subsystem = err_ipc_sub, }, /* 4; err_mach_ipc */ { - errlib_count(err_mach_ipc_sub), - "(ipc/?) unknown subsystem error", - err_mach_ipc_sub, + .max_sub = errlib_count(err_mach_ipc_sub), + .bad_sub = "(ipc/?) unknown subsystem error", + .subsystem = err_mach_ipc_sub, }, /* 0x05 */ errorlib_system_null, @@ -134,21 +137,25 @@ const struct error_system _mach_errors[err_max_system + 1] = { /* 0x34 */ errorlib_system_null, /* 0x35 */ errorlib_system_null, /* 0x36 */ errorlib_system_null, +#if !TARGET_OS_DRIVERKIT /* 0x37; err_libkern */ { - errlib_count(err_libkern_sub), - "(libkern/?) unknown subsystem error", - err_libkern_sub, + .max_sub = errlib_count(err_libkern_sub), + .bad_sub = "(libkern/?) unknown subsystem error", + .subsystem = err_libkern_sub, }, /* 0x38; err_iokit */ { - errlib_count(err_iokit_sub), - "(iokit/?) unknown subsystem error", - err_iokit_sub, - err_iokit_sub_map, - errlib_count(err_iokit_sub_map) + .max_sub = errlib_count(err_iokit_sub), + .bad_sub = "(iokit/?) unknown subsystem error", + .subsystem = err_iokit_sub, + .map_table = err_iokit_sub_map, + .map_count = errlib_count(err_iokit_sub_map) }, +#else + /* 0x37 */ errorlib_system_null, /* 0x38 */ errorlib_system_null, +#endif // TARGET_OS_DRIVERKIT /* 0x39 */ errorlib_system_null, /* 0x3a */ errorlib_system_null, /* 0x3b */ errorlib_system_null, diff --git a/libsyscall/mach/errorlib.h b/libsyscall/mach/errorlib.h index f78f3d7a1..f7ac4e5e3 100644 --- a/libsyscall/mach/errorlib.h +++ b/libsyscall/mach/errorlib.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2003 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2019 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -97,7 +97,7 @@ struct error_subsystem { const struct error_sparse_map *map_table; int map_count; }; -#define errorlib_system_null { 0, NULL, NULL, NULL, 0 } +#define errorlib_system_null { NULL, 0, NULL, NULL, 0 } struct error_system { int max_sub; @@ -106,7 +106,7 @@ struct error_system { const struct error_sparse_map *map_table; int map_count; }; -#define errorlib_sub_null { NULL, 0, NULL, NULL, 0 } +#define errorlib_sub_null { 0, NULL, NULL, NULL, 0 } extern const struct error_system _mach_errors[err_max_system + 1]; char *mach_error_string_int(mach_error_t, boolean_t *); diff --git a/libsyscall/mach/exc_catcher.c b/libsyscall/mach/exc_catcher.c index a0d9f7165..d24b486b6 100644 --- a/libsyscall/mach/exc_catcher.c +++ b/libsyscall/mach/exc_catcher.c @@ -51,7 +51,7 @@ internal_catch_exception_raise( #if defined(__DYNAMIC__) static _libkernel_exc_raise_func_t exc_raise_func = (void*)-1; - if (exc_raise_func == ((void*)-1)) { + if (exc_raise_func == ((void*)-1) && _dlsym) { exc_raise_func = _dlsym(RTLD_DEFAULT, "catch_exception_raise"); } if (exc_raise_func == 0) { diff --git a/libsyscall/mach/exc_catcher_state.c b/libsyscall/mach/exc_catcher_state.c index deedf57d1..47ac3d7fb 100644 --- a/libsyscall/mach/exc_catcher_state.c +++ b/libsyscall/mach/exc_catcher_state.c @@ -35,7 +35,6 @@ #include #include #include -#include #include "abort.h" #include "exc_catcher.h" @@ -55,7 +54,7 @@ internal_catch_exception_raise_state( #if defined(__DYNAMIC__) static _libkernel_exc_raise_state_func_t exc_raise_state_func = (void*)-1; - if (exc_raise_state_func == ((void*)-1)) { + if (exc_raise_state_func == ((void*)-1) && _dlsym) { exc_raise_state_func = _dlsym(RTLD_DEFAULT, "catch_exception_raise_state"); } if (exc_raise_state_func == 0) { diff --git a/libsyscall/mach/exc_catcher_state_identity.c b/libsyscall/mach/exc_catcher_state_identity.c index 1eac28e6c..1ddaf8c65 100644 --- a/libsyscall/mach/exc_catcher_state_identity.c +++ b/libsyscall/mach/exc_catcher_state_identity.c @@ -35,7 +35,6 @@ #include #include #include -#include #include "abort.h" #include "exc_catcher.h" @@ -57,7 +56,7 @@ internal_catch_exception_raise_state_identity( #if defined(__DYNAMIC__) static _libkernel_exec_raise_state_identity_t exc_raise_state_identity_func = (void*)-1; - if (exc_raise_state_identity_func == ((void*)-1)) { + if (exc_raise_state_identity_func == ((void*)-1) && _dlsym) { exc_raise_state_identity_func = _dlsym(RTLD_DEFAULT, "catch_exception_raise_state_identity"); } if (exc_raise_state_identity_func == 0) { diff --git a/libsyscall/mach/host.c b/libsyscall/mach/host.c index 2aa1c8423..6a7ec639e 100644 --- a/libsyscall/mach/host.c +++ b/libsyscall/mach/host.c @@ -46,7 +46,7 @@ kern_return_t host_get_multiuser_config_flags(host_t host __unused, uint32_t *multiuser_flags) { -#if TARGET_OS_EMBEDDED +#if (TARGET_OS_IPHONE && !TARGET_OS_SIMULATOR) volatile uint32_t *multiuser_flag_address = (volatile uint32_t *)(uintptr_t)(_COMM_PAGE_MULTIUSER_CONFIG); *multiuser_flags = *multiuser_flag_address; return KERN_SUCCESS; @@ -60,7 +60,7 @@ kern_return_t host_check_multiuser_mode(host_t host __unused, uint32_t *multiuser_mode) { -#if TARGET_OS_EMBEDDED +#if (TARGET_OS_IPHONE && !TARGET_OS_SIMULATOR) uint32_t multiuser_flags; kern_return_t kr; diff --git a/libsyscall/mach/mach_msg.c b/libsyscall/mach/mach_msg.c index 88c2583b7..1c1b7af74 100644 --- a/libsyscall/mach/mach_msg.c +++ b/libsyscall/mach/mach_msg.c @@ -363,11 +363,78 @@ mach_msg_destroy(mach_msg_header_t *msg) daddr = (mach_msg_descriptor_t *)(dsc + 1); break; } + + case MACH_MSG_GUARDED_PORT_DESCRIPTOR: { + mach_msg_guarded_port_descriptor_t *dsc; + mach_msg_guard_flags_t flags; + /* + * Destroy port right carried in the message + */ + dsc = &daddr->guarded_port; + flags = dsc->flags; + if ((flags & MACH_MSG_GUARD_FLAGS_UNGUARDED_ON_SEND) == 0) { + /* Need to unguard before destroying the port */ + mach_port_unguard(mach_task_self_, dsc->name, (uint64_t)dsc->context); + } + mach_msg_destroy_port(dsc->name, dsc->disposition); + daddr = (mach_msg_descriptor_t *)(dsc + 1); + break; + } } } } } +static inline boolean_t +mach_msg_server_is_recoverable_send_error(kern_return_t kr) +{ + switch (kr) { + case MACH_SEND_INVALID_DEST: + case MACH_SEND_TIMED_OUT: + case MACH_SEND_INTERRUPTED: + return TRUE; + default: + /* + * Other errors mean that the message may have been partially destroyed + * by the kernel, and these can't be recovered and may leak resources. + */ + return FALSE; + } +} + +static kern_return_t +mach_msg_server_mig_return_code(mig_reply_error_t *reply) +{ + /* + * If the message is complex, it is assumed that the reply was successful, + * as the RetCode is where the count of out of line descriptors is. + * + * If not, we read RetCode. + */ + if (reply->Head.msgh_bits & MACH_MSGH_BITS_COMPLEX) { + return KERN_SUCCESS; + } + return reply->RetCode; +} + +static void +mach_msg_server_consume_unsent_message(mach_msg_header_t *hdr) +{ + /* mach_msg_destroy doesn't handle the local port */ + mach_port_t port = hdr->msgh_local_port; + if (MACH_PORT_VALID(port)) { + switch (MACH_MSGH_BITS_LOCAL(hdr->msgh_bits)) { + case MACH_MSG_TYPE_MOVE_SEND: + case MACH_MSG_TYPE_MOVE_SEND_ONCE: + /* destroy the send/send-once right */ + (void) mach_port_deallocate(mach_task_self_, port); + hdr->msgh_local_port = MACH_PORT_NULL; + break; + } + } + mach_msg_destroy(hdr); +} + /* * Routine: mach_msg_server_once * Purpose: @@ -453,15 +520,19 @@ mach_msg_server_once( (void) (*demux)(&bufRequest->Head, &bufReply->Head); - if (!(bufReply->Head.msgh_bits & MACH_MSGH_BITS_COMPLEX)) { - if (bufReply->RetCode == MIG_NO_REPLY) { - bufReply->Head.msgh_remote_port = MACH_PORT_NULL; - } else if ((bufReply->RetCode != KERN_SUCCESS) && - (bufRequest->Head.msgh_bits & MACH_MSGH_BITS_COMPLEX)) { - /* destroy the request - but not the reply port */ - bufRequest->Head.msgh_remote_port = MACH_PORT_NULL; - mach_msg_destroy(&bufRequest->Head); - } + switch (mach_msg_server_mig_return_code(bufReply)) { + case KERN_SUCCESS: + break; + case MIG_NO_REPLY: + bufReply->Head.msgh_remote_port = MACH_PORT_NULL; + break; + default: + /* + * destroy the request - but not the reply port + * (MIG moved it into the bufReply). + */ + bufRequest->Head.msgh_remote_port = MACH_PORT_NULL; + mach_msg_destroy(&bufRequest->Head); } /* @@ -482,18 +553,13 @@ mach_msg_server_once( bufReply->Head.msgh_size, 0, MACH_PORT_NULL, MACH_MSG_TIMEOUT_NONE, MACH_PORT_NULL); - if ((mr != MACH_SEND_INVALID_DEST) && - (mr != MACH_SEND_TIMED_OUT)) { - goto done_once; + if (mach_msg_server_is_recoverable_send_error(mr)) { + mach_msg_server_consume_unsent_message(&bufReply->Head); + mr = MACH_MSG_SUCCESS; } - mr = MACH_MSG_SUCCESS; - } - if (bufReply->Head.msgh_bits & MACH_MSGH_BITS_COMPLEX) { - mach_msg_destroy(&bufReply->Head); } } -done_once: voucher_mach_msg_revert(old_state); (void)vm_deallocate(self, @@ -530,7 +596,7 @@ mach_msg_server( voucher_mach_msg_state_t old_state = VOUCHER_MACH_MSG_STATE_UNCHANGED; boolean_t buffers_swapped = FALSE; - options &= ~(MACH_SEND_MSG | MACH_RCV_MSG | MACH_RCV_VOUCHER | MACH_RCV_OVERWRITE); + options &= ~(MACH_SEND_MSG | MACH_RCV_MSG | MACH_RCV_VOUCHER); reply_alloc = (mach_msg_size_t)round_page((options & MACH_SEND_TRAILER) ? (max_size + MAX_TRAILER_SIZE) : max_size); @@ -578,15 +644,19 @@ mach_msg_server( (void) (*demux)(&bufRequest->Head, &bufReply->Head); - if (!(bufReply->Head.msgh_bits & MACH_MSGH_BITS_COMPLEX)) { - if (bufReply->RetCode == MIG_NO_REPLY) { - bufReply->Head.msgh_remote_port = MACH_PORT_NULL; - } else if ((bufReply->RetCode != KERN_SUCCESS) && - (bufRequest->Head.msgh_bits & MACH_MSGH_BITS_COMPLEX)) { - /* destroy the request - but not the reply port */ - bufRequest->Head.msgh_remote_port = MACH_PORT_NULL; - mach_msg_destroy(&bufRequest->Head); - } + switch (mach_msg_server_mig_return_code(bufReply)) { + case KERN_SUCCESS: + break; + case MIG_NO_REPLY: + bufReply->Head.msgh_remote_port = MACH_PORT_NULL; + break; + default: + /* + * destroy the request - but not the reply port + * (MIG moved it into the bufReply). + */ + bufRequest->Head.msgh_remote_port = MACH_PORT_NULL; + mach_msg_destroy(&bufRequest->Head); } /* @@ -628,32 +698,25 @@ mach_msg_server( &bufRequest->Head, 0); } - if ((mr != MACH_SEND_INVALID_DEST) && - (mr != MACH_SEND_TIMED_OUT) && - (mr != MACH_RCV_TIMED_OUT)) { + /* + * Need to destroy the reply msg in case if there was a send timeout or + * invalid destination. The reply msg would be swapped with request msg + * if buffers_swapped is true, thus destroy request msg instead of + * reply msg in such cases. + */ + if (mach_msg_server_is_recoverable_send_error(mr)) { + if (buffers_swapped) { + mach_msg_server_consume_unsent_message(&bufRequest->Head); + } else { + mach_msg_server_consume_unsent_message(&bufReply->Head); + } + } else if (mr != MACH_RCV_TIMED_OUT) { voucher_mach_msg_revert(old_state); old_state = VOUCHER_MACH_MSG_STATE_UNCHANGED; continue; } } - /* - * Need to destroy the reply msg in case if there was a send timeout or - * invalid destination. The reply msg would be swapped with request msg - * if buffers_swapped is true, thus destroy request msg instead of - * reply msg in such cases. - */ - if (mr != MACH_RCV_TIMED_OUT) { - if (buffers_swapped) { - if (bufRequest->Head.msgh_bits & MACH_MSGH_BITS_COMPLEX) { - mach_msg_destroy(&bufRequest->Head); - } - } else { - if (bufReply->Head.msgh_bits & MACH_MSGH_BITS_COMPLEX) { - mach_msg_destroy(&bufReply->Head); - } - } - } voucher_mach_msg_revert(old_state); old_state = VOUCHER_MACH_MSG_STATE_UNCHANGED; diff --git a/libsyscall/mach/mach_port.c b/libsyscall/mach/mach_port.c index 52f731b99..57d675094 100644 --- a/libsyscall/mach/mach_port.c +++ b/libsyscall/mach/mach_port.c @@ -33,6 +33,7 @@ #include #include "tsd.h" + kern_return_t mach_port_names( ipc_space_t task, @@ -57,7 +58,11 @@ mach_port_type( { kern_return_t rv; - rv = _kernelrpc_mach_port_type(task, name, ptype); + rv = _kernelrpc_mach_port_type_trap(task, name, ptype); + + if (rv == MACH_SEND_INVALID_DEST) { + rv = _kernelrpc_mach_port_type(task, name, ptype); + } return rv; } @@ -246,9 +251,14 @@ mach_port_request_notification( { kern_return_t rv; - rv = _kernelrpc_mach_port_request_notification(task, name, msgid, + rv = _kernelrpc_mach_port_request_notification_trap(task, name, msgid, sync, notify, notifyPoly, previous); + if (rv == MACH_SEND_INVALID_DEST) { + rv = _kernelrpc_mach_port_request_notification(task, name, msgid, + sync, notify, notifyPoly, previous); + } + return rv; } @@ -744,3 +754,31 @@ thread_destruct_special_reply_port( return KERN_INVALID_ARGUMENT; } } + +kern_return_t +mach_port_guard_with_flags( + ipc_space_t task, + mach_port_name_t name, + mach_port_context_t guard, + uint64_t flags) +{ + kern_return_t rv; + + rv = _kernelrpc_mach_port_guard_with_flags(task, name, (uint64_t) guard, flags); + + return rv; +} + +kern_return_t +mach_port_swap_guard( + ipc_space_t task, + mach_port_name_t name, + mach_port_context_t old_guard, + mach_port_context_t new_guard) +{ + kern_return_t rv; + + rv = _kernelrpc_mach_port_swap_guard(task, name, (uint64_t)old_guard, (uint64_t)new_guard); + + return rv; +} diff --git a/libsyscall/mach/mach_vm.c b/libsyscall/mach/mach_vm.c index fe89c6513..f8fbf921d 100644 --- a/libsyscall/mach/mach_vm.c +++ b/libsyscall/mach/mach_vm.c @@ -58,7 +58,7 @@ mach_vm_allocate( rv = _kernelrpc_mach_vm_allocate(target, address, size, flags); } - if (__syscall_logger) { + if (__syscall_logger && rv == KERN_SUCCESS && !(flags & VM_MAKE_TAG(VM_MEMORY_STACK))) { int userTagFlags = flags & VM_FLAGS_ALIAS_MASK; __syscall_logger(stack_logging_type_vm_allocate | userTagFlags, (uintptr_t)target, (uintptr_t)size, 0, (uintptr_t)*address, 0); } @@ -184,7 +184,7 @@ mach_vm_map( offset, copy, cur_protection, max_protection, inheritance); } - if (__syscall_logger) { + if (__syscall_logger && rv == KERN_SUCCESS && !(flags & VM_MAKE_TAG(VM_MEMORY_STACK))) { int eventTypeFlags = stack_logging_type_vm_allocate | stack_logging_type_mapped_file_or_shared_mem; int userTagFlags = flags & VM_FLAGS_ALIAS_MASK; __syscall_logger(eventTypeFlags | userTagFlags, (uintptr_t)target, (uintptr_t)size, 0, (uintptr_t)*address, 0); @@ -213,7 +213,7 @@ mach_vm_remap( src_task, src_address, copy, cur_protection, max_protection, inheritance); - if (__syscall_logger) { + if (__syscall_logger && rv == KERN_SUCCESS) { int eventTypeFlags = stack_logging_type_vm_allocate | stack_logging_type_mapped_file_or_shared_mem; int userTagFlags = flags & VM_FLAGS_ALIAS_MASK; __syscall_logger(eventTypeFlags | userTagFlags, (uintptr_t)target, (uintptr_t)size, 0, (uintptr_t)*address, 0); @@ -234,7 +234,7 @@ mach_vm_read( rv = _kernelrpc_mach_vm_read(target, address, size, data, dataCnt); - if (__syscall_logger) { + if (__syscall_logger && rv == KERN_SUCCESS) { int eventTypeFlags = stack_logging_type_vm_allocate | stack_logging_type_mapped_file_or_shared_mem; // The target argument is the remote task from which data is being read, // so pass mach_task_self() as the destination task receiving the allocation. @@ -263,7 +263,7 @@ vm_map( rv = _kernelrpc_vm_map(target, address, size, mask, flags, object, offset, copy, cur_protection, max_protection, inheritance); - if (__syscall_logger) { + if (__syscall_logger && rv == KERN_SUCCESS) { int eventTypeFlags = stack_logging_type_vm_allocate | stack_logging_type_mapped_file_or_shared_mem; int userTagFlags = flags & VM_FLAGS_ALIAS_MASK; __syscall_logger(eventTypeFlags | userTagFlags, (uintptr_t)target, (uintptr_t)size, 0, (uintptr_t)*address, 0); @@ -313,7 +313,7 @@ vm_read( rv = _kernelrpc_vm_read(target, address, size, data, dataCnt); - if (__syscall_logger) { + if (__syscall_logger && rv == KERN_SUCCESS) { int eventTypeFlags = stack_logging_type_vm_allocate | stack_logging_type_mapped_file_or_shared_mem; // The target argument is the remote task from which data is being read, // so pass mach_task_self() as the destination task receiving the allocation. diff --git a/libsyscall/mach/memory_entry.defs b/libsyscall/mach/memory_entry.defs new file mode 100644 index 000000000..1cfbe3d26 --- /dev/null +++ b/libsyscall/mach/memory_entry.defs @@ -0,0 +1,28 @@ +/* + * Copyright (c) 2018 Apple Computer, Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ +#include diff --git a/libsyscall/mach/port_descriptions.c b/libsyscall/mach/port_descriptions.c index 035cf2237..2e086c80d 100644 --- a/libsyscall/mach/port_descriptions.c +++ b/libsyscall/mach/port_descriptions.c @@ -69,8 +69,10 @@ mach_host_special_port_description(int port) [HOST_RESOURCE_NOTIFY_PORT] = "resource notify", [HOST_CLOSURED_PORT] = "closured", [HOST_SYSPOLICYD_PORT] = "syspolicyd", + [HOST_FILECOORDINATIOND_PORT] = "filecoordinationd", + [HOST_FAIRPLAYD_PORT] = "fairplayd", }; - _Static_assert(HOST_SYSPOLICYD_PORT == HOST_MAX_SPECIAL_PORT, + _Static_assert(HOST_FAIRPLAYD_PORT == HOST_MAX_SPECIAL_PORT, "all host special ports must have descriptions"); return hsp_descs[port_index]; @@ -149,6 +151,7 @@ mach_host_special_port_for_id(const char *id) SP_ENTRY(HOST_RESOURCE_NOTIFY_PORT), SP_ENTRY(HOST_CLOSURED_PORT), SP_ENTRY(HOST_SYSPOLICYD_PORT), + SP_ENTRY(HOST_FILECOORDINATIOND_PORT), }; return port_for_id_internal(id, hsp_ids, diff --git a/libsyscall/mach/restartable.defs b/libsyscall/mach/restartable.defs new file mode 100644 index 000000000..d43bda011 --- /dev/null +++ b/libsyscall/mach/restartable.defs @@ -0,0 +1,28 @@ +/* + * Copyright (c) 2019 Apple Computer, Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ +#include diff --git a/libsyscall/mach/slot_name.c b/libsyscall/mach/slot_name.c index fefeb9d50..8ed90a0e5 100644 --- a/libsyscall/mach/slot_name.c +++ b/libsyscall/mach/slot_name.c @@ -43,7 +43,6 @@ */ #include -#include #include kern_return_t diff --git a/libsyscall/os/log_data.c b/libsyscall/os/log_data.c new file mode 100644 index 000000000..baaafcfb5 --- /dev/null +++ b/libsyscall/os/log_data.c @@ -0,0 +1,37 @@ +/* + * Copyright (c) 2019 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#include + +extern int __log_data(unsigned int tag, unsigned int flags, void *buffer, unsigned int size); + +int +log_data_as_kernel(unsigned int tag, unsigned int flags, void *buffer, unsigned int size) +{ + return __log_data(tag, flags, buffer, size); +} diff --git a/libsyscall/os/proc.h b/libsyscall/os/proc.h new file mode 100644 index 000000000..84a3396b8 --- /dev/null +++ b/libsyscall/os/proc.h @@ -0,0 +1,91 @@ +/* + * Copyright (c) 2019 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#ifndef __OS_PROC__ +#define __OS_PROC__ + +#include +#include +#include + +/*! + * @header + * + * @preprocinfo + * This is for functions that operate on the calling process alone. + */ + +__BEGIN_DECLS + +/*! + * @function os_proc_available_memory + * + * @abstract + * Return the number of bytes remaining, at the time of the call, before the + * current process will hit its current dirty memory limit. + * + * @discussion + * Developers can query this value efficiently whenever it is needed. The return + * value is only a snapshot at the time of the call. Caching the result is not + * advised. The result may be instantaneously invalidated by actions taken in + * another thread or another framework. + * + * Memory limits can change during the app life cycle. Make sure to check accordingly. + * + * The size returned is not representative of the total memory of the device, it + * is the current dirty memory limit minus the dirty memory footprint used at the + * time of the query. + * + * This interface allows an app to efficiently consume all available memory resources. + * Significant memory use, even under the current memory limit, may still cause + * system-wide performance including the termination of other apps and system + * processes. Take care to use the minimum amount of memory needed to satisfy the + * user’s need. + * + * If you need more information than just the available memory, you can use task_info(). + * The information returned is equivalent to the task_vm_info.limit_bytes_remaining + * field. task_info() is a more expensive call, and will return information such + * as your phys_footprint, which is used to calculate the return of this function. + * + * Dirty memory contains data that must be kept in RAM (or the equivalent) even + * when unused. It is memory that has been modified. + * + * @param none + * + * @result + * The remaining bytes. 0 is returned if the calling process is not an app, or + * the calling process exceeds its memory limit. + */ + + API_UNAVAILABLE(macos) API_AVAILABLE(ios(13.0), tvos(13.0), watchos(6.0), bridgeos(4.0)) +extern +size_t os_proc_available_memory(void); + +__END_DECLS + +#endif diff --git a/libsyscall/os/tsd.h b/libsyscall/os/tsd.h index 474c97aec..e4ab6d678 100644 --- a/libsyscall/os/tsd.h +++ b/libsyscall/os/tsd.h @@ -58,7 +58,7 @@ __attribute__((always_inline)) static __inline__ unsigned int _os_cpu_number(void) { -#if defined(__arm__) && defined(_ARM_ARCH_6) +#if defined(__arm__) uintptr_t p; __asm__("mrc p15, 0, %[p], c13, c0, 3" : [p] "=&r" (p)); return (unsigned int)(p & 0x3ul); @@ -116,16 +116,16 @@ __attribute__((always_inline, pure)) static __inline__ void** _os_tsd_get_base(void) { -#if defined(__arm__) && defined(_ARM_ARCH_6) +#if defined(__arm__) uintptr_t tsd; - __asm__("mrc p15, 0, %0, c13, c0, 3" : "=r" (tsd)); - tsd &= ~0x3ul; /* lower 2-bits contain CPU number */ -#elif defined(__arm__) && defined(_ARM_ARCH_5) - register uintptr_t tsd asm ("r9"); + __asm__("mrc p15, 0, %0, c13, c0, 3\n" + "bic %0, %0, #0x3\n" : "=r" (tsd)); + /* lower 2-bits contain CPU number */ #elif defined(__arm64__) uint64_t tsd; - __asm__("mrs %0, TPIDRRO_EL0" : "=r" (tsd)); - tsd &= ~0x7ull; + __asm__("mrs %0, TPIDRRO_EL0\n" + "bic %0, %0, #0x7\n" : "=r" (tsd)); + /* lower 3-bits contain CPU number */ #endif return (void**)(uintptr_t)tsd; diff --git a/libsyscall/wrappers/cancelable/fcntl-base.c b/libsyscall/wrappers/cancelable/fcntl-base.c index 0b295ba10..2f84dba81 100644 --- a/libsyscall/wrappers/cancelable/fcntl-base.c +++ b/libsyscall/wrappers/cancelable/fcntl-base.c @@ -54,6 +54,7 @@ fcntl(int fd, int cmd, ...) case F_LOG2PHYS: case F_LOG2PHYS_EXT: case F_GETPATH: + case F_GETPATH_NOFIRMLINK: case F_GETPATH_MTMINFO: case F_GETCODEDIR: case F_PATHPKG_CHECK: @@ -66,6 +67,7 @@ fcntl(int fd, int cmd, ...) case F_FINDSIGS: case F_TRANSCODEKEY: case F_TRIM_ACTIVE_FILE: + case F_SPECULATIVE_READ: case F_CHECK_LV: arg = va_arg(ap, void *); break; diff --git a/libsyscall/wrappers/coalition.c b/libsyscall/wrappers/coalition.c index 33da1103d..6c183a9f7 100644 --- a/libsyscall/wrappers/coalition.c +++ b/libsyscall/wrappers/coalition.c @@ -30,6 +30,7 @@ /* Syscall entry points */ int __coalition(uint32_t operation, uint64_t *cid, uint32_t flags); int __coalition_info(uint32_t operation, uint64_t *cid, void *buffer, size_t *bufsize); +int __coalition_ledger(uint32_t operation, uint64_t *cid, void *buffer, size_t *bufsize); int coalition_create(uint64_t *cid_out, uint32_t flags) @@ -67,3 +68,10 @@ coalition_info_set_efficiency(uint64_t cid, uint64_t flags) size_t size = sizeof(flags); return __coalition_info(COALITION_INFO_SET_EFFICIENCY, &cid, (void *)&flags, &size); } + +int +coalition_ledger_set_logical_writes_limit(uint64_t cid, int64_t limit) +{ + size_t size = sizeof(limit); + return __coalition_ledger(COALITION_LEDGER_SET_LOGICAL_WRITES_LIMIT, &cid, (void *)&limit, &size); +} diff --git a/libsyscall/wrappers/gethostuuid.c b/libsyscall/wrappers/gethostuuid.c index daa0ea30a..60b0e26de 100644 --- a/libsyscall/wrappers/gethostuuid.c +++ b/libsyscall/wrappers/gethostuuid.c @@ -29,7 +29,7 @@ #include "gethostuuid_private.h" -extern int __gethostuuid(uuid_t, const struct timespec *, int); +extern int __gethostuuid(uuid_t, const struct timespec *); static volatile int (*_gethostuuid_callback)(uuid_t) = (void *)0; @@ -38,7 +38,7 @@ gethostuuid(uuid_t uuid, const struct timespec *timeout) { int result; - result = __gethostuuid(uuid, timeout, 0); + result = __gethostuuid(uuid, timeout); if ((result == -1) && (errno == EPERM)) { if (_gethostuuid_callback) { result = _gethostuuid_callback(uuid); @@ -51,11 +51,11 @@ gethostuuid(uuid_t uuid, const struct timespec *timeout) return result; } -/* SPI to call gethostuuid syscall directly, without fallback */ +/* SPI to call gethostuuid syscall directly, without fallback, need an entitlement */ int _getprivatesystemidentifier(uuid_t uuid, const struct timespec *timeout) { - return __gethostuuid(uuid, timeout, 1); + return __gethostuuid(uuid, timeout); } int diff --git a/libsyscall/wrappers/getiopolicy_np.c b/libsyscall/wrappers/getiopolicy_np.c index db097ad44..e09f849cc 100644 --- a/libsyscall/wrappers/getiopolicy_np.c +++ b/libsyscall/wrappers/getiopolicy_np.c @@ -33,7 +33,7 @@ getiopolicy_np(int iotype, int scope) int policy, error; struct _iopol_param_t iop_param; - if ((iotype != IOPOL_TYPE_DISK && iotype != IOPOL_TYPE_VFS_ATIME_UPDATES) || + if ((iotype != IOPOL_TYPE_DISK && iotype != IOPOL_TYPE_VFS_ATIME_UPDATES && iotype != IOPOL_TYPE_VFS_MATERIALIZE_DATALESS_FILES) || (scope != IOPOL_SCOPE_PROCESS && scope != IOPOL_SCOPE_THREAD)) { errno = EINVAL; policy = -1; diff --git a/libsyscall/wrappers/kdebug_trace.c b/libsyscall/wrappers/kdebug_trace.c index e42794a49..d7409d541 100644 --- a/libsyscall/wrappers/kdebug_trace.c +++ b/libsyscall/wrappers/kdebug_trace.c @@ -116,6 +116,13 @@ kdebug_is_enabled(uint32_t debugid) return TRUE; } +bool +kdebug_using_continuous_time(void) +{ + uint32_t state = *((volatile uint32_t *)(uintptr_t)(_COMM_PAGE_KDEBUG_ENABLE)); + return state & KDEBUG_ENABLE_CONT_TIME; +} + int kdebug_trace(uint32_t debugid, uint64_t arg1, uint64_t arg2, uint64_t arg3, uint64_t arg4) diff --git a/libsyscall/wrappers/libproc/libproc.c b/libsyscall/wrappers/libproc/libproc.c index 4c6fc2356..8cf27b6ac 100644 --- a/libsyscall/wrappers/libproc/libproc.c +++ b/libsyscall/wrappers/libproc/libproc.c @@ -226,16 +226,16 @@ int proc_regionfilename(int pid, uint64_t address, void * buffer, uint32_t buffersize) { int retval; - struct proc_regionwithpathinfo reginfo; + struct proc_regionpath path; if (buffersize < MAXPATHLEN) { errno = ENOMEM; return 0; } - retval = proc_pidinfo(pid, PROC_PIDREGIONPATHINFO2, (uint64_t)address, ®info, sizeof(struct proc_regionwithpathinfo)); + retval = proc_pidinfo(pid, PROC_PIDREGIONPATH, (uint64_t)address, &path, sizeof(struct proc_regionpath)); if (retval != -1) { - return (int)(strlcpy(buffer, reginfo.prp_vip.vip_path, MAXPATHLEN)); + return (int)(strlcpy(buffer, path.prpo_path, buffersize)); } return 0; } @@ -622,7 +622,7 @@ proc_clear_cpulimits(pid_t pid) } } -#if TARGET_OS_EMBEDDED +#if (TARGET_OS_IPHONE && !TARGET_OS_SIMULATOR) int proc_setcpu_deadline(pid_t pid, int action, uint64_t deadline) @@ -739,7 +739,7 @@ proc_can_use_foreground_hw(int pid, uint32_t *reason) { return __proc_info(PROC_INFO_CALL_CANUSEFGHW, pid, 0, 0, reason, sizeof(*reason)); } -#endif /* TARGET_OS_EMBEDDED */ +#endif /* (TARGET_OS_IPHONE && !TARGET_OS_SIMULATOR) */ /* Donate importance to adaptive processes from this process */ @@ -748,19 +748,19 @@ proc_donate_importance_boost() { int rval; -#if TARGET_OS_EMBEDDED +#if (TARGET_OS_IPHONE && !TARGET_OS_SIMULATOR) rval = __process_policy(PROC_POLICY_SCOPE_PROCESS, PROC_POLICY_ACTION_ENABLE, PROC_POLICY_APPTYPE, PROC_POLICY_IOS_DONATEIMP, NULL, getpid(), (uint64_t)0); -#else /* TARGET_OS_EMBEDDED */ +#else /* (TARGET_OS_IPHONE && !TARGET_OS_SIMULATOR) */ rval = __process_policy(PROC_POLICY_SCOPE_PROCESS, PROC_POLICY_ACTION_SET, PROC_POLICY_BOOST, PROC_POLICY_IMP_DONATION, NULL, getpid(), 0); -#endif /* TARGET_OS_EMBEDDED */ +#endif /* (TARGET_OS_IPHONE && !TARGET_OS_SIMULATOR) */ if (rval == 0) { return 0; @@ -903,7 +903,7 @@ proc_denap_assertion_complete(uint64_t assertion_token) return proc_importance_assertion_complete(assertion_token); } -#if !TARGET_OS_EMBEDDED +#if !(TARGET_OS_IPHONE && !TARGET_OS_SIMULATOR) int proc_clear_vmpressure(pid_t pid) @@ -992,7 +992,7 @@ proc_enable_apptype(pid_t pid, int apptype) } } -#if !TARGET_IPHONE_SIMULATOR +#if !TARGET_OS_SIMULATOR int proc_suppress(__unused pid_t pid, __unused uint64_t *generation) @@ -1000,6 +1000,6 @@ proc_suppress(__unused pid_t pid, __unused uint64_t *generation) return 0; } -#endif /* !TARGET_IPHONE_SIMULATOR */ +#endif /* !TARGET_OS_SIMULATOR */ -#endif /* !TARGET_OS_EMBEDDED */ +#endif /* !(TARGET_OS_IPHONE && !TARGET_OS_SIMULATOR) */ diff --git a/libsyscall/wrappers/libproc/libproc_internal.h b/libsyscall/wrappers/libproc/libproc_internal.h index 1f4bc60da..c154510f0 100644 --- a/libsyscall/wrappers/libproc/libproc_internal.h +++ b/libsyscall/wrappers/libproc/libproc_internal.h @@ -41,7 +41,7 @@ int proc_clear_cpulimits(pid_t pid) __OSX_AVAILABLE_STARTING(__MAC_10_12_2, __IP /* CPU limits, applies to current thread only. 0% unsets limit */ int proc_setthread_cpupercent(uint8_t percentage, uint32_t ms_refill) __OSX_AVAILABLE_STARTING(__MAC_10_10, __IPHONE_5_0); -#if TARGET_OS_EMBEDDED +#if (TARGET_OS_IPHONE && !TARGET_OS_SIMULATOR) /* CPU monitor action, continued */ #define PROC_SETCPU_ACTION_SUSPEND 2 @@ -86,7 +86,7 @@ int proc_pidbind(int pid, uint64_t threadid, int bind); */ int proc_can_use_foreground_hw(int pid, uint32_t *reason); -#else /* TARGET_OS_EMBEDDED */ +#else /* (TARGET_OS_IPHONE && !TARGET_OS_SIMULATOR) */ /* resume the process suspend due to low VM resource */ int proc_clear_vmpressure(pid_t pid); @@ -113,7 +113,7 @@ int proc_clear_delayidlesleep(void); int proc_disable_apptype(pid_t pid, int apptype); int proc_enable_apptype(pid_t pid, int apptype); -#endif /* TARGET_OS_EMBEDDED */ +#endif /* (TARGET_OS_IPHONE && !TARGET_OS_SIMULATOR) */ /* mark process as importance donating */ int proc_donate_importance_boost(void); @@ -160,7 +160,7 @@ int proc_pidoriginatorinfo(int flavor, void *buffer, int buffersize) __OSX_AVAIL int proc_listcoalitions(int flavor, int coaltype, void *buffer, int buffersize) __OSX_AVAILABLE_STARTING(__MAC_10_11, __IPHONE_8_3); -#if !TARGET_IPHONE_SIMULATOR +#if !TARGET_OS_SIMULATOR #define PROC_SUPPRESS_SUCCESS (0) #define PROC_SUPPRESS_BAD_ARGUMENTS (-1) @@ -168,7 +168,7 @@ int proc_listcoalitions(int flavor, int coaltype, void *buffer, int buffersize) #define PROC_SUPPRESS_ALREADY_SUPPRESSED (-3) int proc_suppress(pid_t pid, uint64_t *generation); -#endif /* !TARGET_IPHONE_SIMULATOR */ +#endif /* !TARGET_OS_SIMULATOR */ __END_DECLS diff --git a/libsyscall/wrappers/mach_absolute_time.s b/libsyscall/wrappers/mach_absolute_time.s index 2c637bfc9..5b3b36541 100644 --- a/libsyscall/wrappers/mach_absolute_time.s +++ b/libsyscall/wrappers/mach_absolute_time.s @@ -178,7 +178,7 @@ _mach_absolute_time: movw ip, #((_COMM_PAGE_TIMEBASE_OFFSET) & 0x0000FFFF) movt ip, #(((_COMM_PAGE_TIMEBASE_OFFSET) >> 16) & 0x0000FFFF) ldrb r0, [ip, #((_COMM_PAGE_USER_TIMEBASE) - (_COMM_PAGE_TIMEBASE_OFFSET))] - cmp r0, #0 // Are userspace reads supported? + cmp r0, #USER_TIMEBASE_NONE // Are userspace reads supported? beq _mach_absolute_time_kernel // If not, go to the kernel isb // Prevent speculation on CNTPCT across calls // (see ARMV7C.b section B8.1.2, ARMv8 section D6.1.2) @@ -242,7 +242,7 @@ _mach_absolute_time: movk x3, #(((_COMM_PAGE_TIMEBASE_OFFSET) >> 16) & 0x000000000000FFFF), lsl #16 movk x3, #((_COMM_PAGE_TIMEBASE_OFFSET) & 0x000000000000FFFF) ldrb w2, [x3, #((_COMM_PAGE_USER_TIMEBASE) - (_COMM_PAGE_TIMEBASE_OFFSET))] - cmp x2, #0 // Are userspace reads supported? + cmp x2, #USER_TIMEBASE_NONE // Are userspace reads supported? b.eq _mach_absolute_time_kernel // If not, go to the kernel isb // Prevent speculation on CNTPCT across calls // (see ARMV7C.b section B8.1.2, ARMv8 section D6.1.2) @@ -253,7 +253,9 @@ L_mach_absolute_time_user: cmp x1, x2 // Compare our offset values... b.ne L_mach_absolute_time_user // If they changed, try again add x0, x0, x1 // Construct mach_absolute_time - ret + ret + + .text .align 2 diff --git a/libsyscall/wrappers/mach_continuous_time.c b/libsyscall/wrappers/mach_continuous_time.c index 353ef0d87..c128ac1b7 100644 --- a/libsyscall/wrappers/mach_continuous_time.c +++ b/libsyscall/wrappers/mach_continuous_time.c @@ -59,11 +59,11 @@ kern_return_t _mach_continuous_hwclock(uint64_t *cont_time __unused) { #if defined(__arm64__) +#define ISB_SY 0xf uint8_t cont_hwclock = *((uint8_t*)_COMM_PAGE_CONT_HWCLOCK); - uint64_t timebase; if (cont_hwclock) { - __asm__ volatile ("isb\n" "mrs %0, CNTPCT_EL0" : "=r"(timebase)); - *cont_time = timebase; + __builtin_arm_isb(ISB_SY); + *cont_time = __builtin_arm_rsr64("CNTPCT_EL0"); return KERN_SUCCESS; } #endif diff --git a/libsyscall/wrappers/mach_get_times.c b/libsyscall/wrappers/mach_get_times.c index f44b4b1eb..4a8a3f19e 100644 --- a/libsyscall/wrappers/mach_get_times.c +++ b/libsyscall/wrappers/mach_get_times.c @@ -64,7 +64,7 @@ mach_get_times(uint64_t* absolute_time, uint64_t* cont_time, struct timespec *tp if (__gettimeofday_with_mach(&tv, NULL, &tbr) < 0) { return KERN_FAILURE; } else if (tbr == 0) { -#if !TARGET_OS_EMBEDDED +#if !(TARGET_OS_IPHONE && !TARGET_OS_SIMULATOR) // On an old kernel, likely chroot'ed. (remove next year) tbr = mach_absolute_time(); #else diff --git a/libsyscall/wrappers/persona.c b/libsyscall/wrappers/persona.c index 67bee1546..513543952 100644 --- a/libsyscall/wrappers/persona.c +++ b/libsyscall/wrappers/persona.c @@ -29,20 +29,27 @@ #include "strings.h" /* syscall entry point */ -int __persona(uint32_t operation, uint32_t flags, struct kpersona_info *info, uid_t *id, size_t *idlen); +int __persona(uint32_t operation, uint32_t flags, struct kpersona_info *info, uid_t *id, size_t *idlen, char *path); int kpersona_alloc(struct kpersona_info *info, uid_t *id) { size_t idlen = 1; - return __persona(PERSONA_OP_ALLOC, 0, info, id, &idlen); + return __persona(PERSONA_OP_ALLOC, 0, info, id, &idlen, NULL); +} + +int +kpersona_palloc(struct kpersona_info *info, uid_t *id, char path[MAXPATHLEN]) +{ + size_t idlen = 1; + return __persona(PERSONA_OP_PALLOC, 0, info, id, &idlen, path); } int kpersona_dealloc(uid_t id) { size_t idlen = 1; - return __persona(PERSONA_OP_DEALLOC, 0, NULL, &id, &idlen); + return __persona(PERSONA_OP_DEALLOC, 0, NULL, &id, &idlen, NULL); } int @@ -53,7 +60,7 @@ kpersona_get(uid_t *id) if (p_id == PERSONA_ID_NONE) { int ret = 0; size_t idlen = 1; - ret = __persona(PERSONA_OP_GET, 0, NULL, &p_id, &idlen); + ret = __persona(PERSONA_OP_GET, 0, NULL, &p_id, &idlen, NULL); if (ret != 0) { return ret; } @@ -62,11 +69,18 @@ kpersona_get(uid_t *id) return 0; } +int +kpersona_getpath(uid_t id, char path[MAXPATHLEN]) +{ + size_t idlen = 1; + return __persona(PERSONA_OP_GETPATH, 0, NULL, &id, &idlen, path); +} + int kpersona_info(uid_t id, struct kpersona_info *info) { size_t idlen = 1; - return __persona(PERSONA_OP_INFO, 0, info, &id, &idlen); + return __persona(PERSONA_OP_INFO, 0, info, &id, &idlen, NULL); } int @@ -74,7 +88,7 @@ kpersona_pidinfo(pid_t pid, struct kpersona_info *info) { size_t idlen = 1; uid_t id = (uid_t)pid; - return __persona(PERSONA_OP_PIDINFO, 0, info, &id, &idlen); + return __persona(PERSONA_OP_PIDINFO, 0, info, &id, &idlen, NULL); } int @@ -92,7 +106,26 @@ kpersona_find(const char *name, uid_t uid, uid_t *id, size_t *idlen) if (name) { strlcpy(kinfo.persona_name, name, sizeof(kinfo.persona_name)); } - ret = __persona(PERSONA_OP_FIND, 0, &kinfo, id, idlen); + ret = __persona(PERSONA_OP_FIND, 0, &kinfo, id, idlen, NULL); + if (ret < 0) { + return ret; + } + return (int)(*idlen); +} + +int +kpersona_find_by_type(int persona_type, uid_t *id, size_t *idlen) +{ + int ret; + struct kpersona_info kinfo; + kinfo.persona_info_version = PERSONA_INFO_V1; + kinfo.persona_type = persona_type; + kinfo.persona_id = -1; + kinfo.persona_gid = 0; + kinfo.persona_ngroups = 0; + kinfo.persona_groups[0] = 0; + kinfo.persona_name[0] = 0; + ret = __persona(PERSONA_OP_FIND_BY_TYPE, 0, &kinfo, id, idlen, NULL); if (ret < 0) { return ret; } diff --git a/libsyscall/wrappers/proc.c b/libsyscall/wrappers/proc.c new file mode 100644 index 000000000..ce95bce97 --- /dev/null +++ b/libsyscall/wrappers/proc.c @@ -0,0 +1,38 @@ +/* + * Copyright (c) 2019 Apple Inc. All rights reserved. + * + * @APPLE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this + * file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_LICENSE_HEADER_END@ + */ + +#include +#include +#include +#include +#include + +#if !TARGET_OS_OSX +extern uint64_t __memorystatus_available_memory(void); + +size_t +os_proc_available_memory(void) +{ + return (size_t)__memorystatus_available_memory(); +} +#endif diff --git a/libsyscall/wrappers/quota_obsolete.c b/libsyscall/wrappers/quota_obsolete.c index 1aff8f182..ac235cbe4 100644 --- a/libsyscall/wrappers/quota_obsolete.c +++ b/libsyscall/wrappers/quota_obsolete.c @@ -25,7 +25,7 @@ #include #include -#if !TARGET_OS_EMBEDDED +#if !(TARGET_OS_IPHONE && !TARGET_OS_SIMULATOR) /* * system call stubs are no longer generated for these from * syscalls.master. Instead, provide simple stubs here. @@ -45,4 +45,4 @@ setquota(void) { return kill(getpid(), SIGSYS); } -#endif /* !TARGET_OS_EMBEDDED */ +#endif /* !(TARGET_OS_IPHONE && !TARGET_OS_SIMULATOR) */ diff --git a/libsyscall/wrappers/skywalk/os_channel.c b/libsyscall/wrappers/skywalk/os_channel.c index 4aee6e02d..7d7762110 100644 --- a/libsyscall/wrappers/skywalk/os_channel.c +++ b/libsyscall/wrappers/skywalk/os_channel.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2018 Apple Inc. All rights reserved. + * Copyright (c) 2015-2019 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * diff --git a/libsyscall/wrappers/skywalk/os_channel_event.c b/libsyscall/wrappers/skywalk/os_channel_event.c new file mode 100644 index 000000000..2d6fde59d --- /dev/null +++ b/libsyscall/wrappers/skywalk/os_channel_event.c @@ -0,0 +1,28 @@ +/* + * Copyright (c) 2019 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + diff --git a/libsyscall/wrappers/spawn/posix_spawn.c b/libsyscall/wrappers/spawn/posix_spawn.c index 819863128..ae5585b04 100644 --- a/libsyscall/wrappers/spawn/posix_spawn.c +++ b/libsyscall/wrappers/spawn/posix_spawn.c @@ -37,6 +37,7 @@ #include #include #include +#include /* for TASK_PORT_REGISTER_MAX */ #include #include /* for COALITION_TYPE_MAX */ #include @@ -137,6 +138,8 @@ posix_spawnattr_init(posix_spawnattr_t *attr) (*psattrp)->psa_persona_info = NULL; + (*psattrp)->psa_posix_cred_info = NULL; + /* * old coalition field * For backwards compatibility reasons, we set this to 1 @@ -183,6 +186,8 @@ posix_spawnattr_init(posix_spawnattr_t *attr) static int posix_spawn_destroyportactions_np(posix_spawnattr_t *); static int posix_spawn_destroycoalition_info_np(posix_spawnattr_t *); static int posix_spawn_destroypersona_info_np(posix_spawnattr_t *); +static int posix_spawn_destroyposix_cred_info_np(posix_spawnattr_t *); +static int posix_spawn_destroymacpolicy_info_np(posix_spawnattr_t *); int posix_spawnattr_destroy(posix_spawnattr_t *attr) @@ -197,6 +202,8 @@ posix_spawnattr_destroy(posix_spawnattr_t *attr) posix_spawn_destroyportactions_np(attr); posix_spawn_destroycoalition_info_np(attr); posix_spawn_destroypersona_info_np(attr); + posix_spawn_destroyposix_cred_info_np(attr); + posix_spawn_destroymacpolicy_info_np(attr); free(psattr); *attr = NULL; @@ -841,6 +848,31 @@ posix_spawn_destroypersona_info_np(posix_spawnattr_t *attr) return 0; } +/* + * posix_spawn_destroyposix_cred_info_np + * Description: clean up posix_cred_info struct in posix_spawnattr_t attr + */ +static int +posix_spawn_destroyposix_cred_info_np(posix_spawnattr_t *attr) +{ + _posix_spawnattr_t psattr; + struct _posix_spawn_posix_cred_info *pspci; + + if (attr == NULL || *attr == NULL) { + return EINVAL; + } + + psattr = *(_posix_spawnattr_t *)attr; + pspci = psattr->psa_posix_cred_info; + if (pspci == NULL) { + return EINVAL; + } + + psattr->psa_posix_cred_info = NULL; + free(pspci); + return 0; +} + /* * posix_spawn_appendportaction_np * Description: append a port action, grow the array if necessary @@ -1276,7 +1308,69 @@ posix_spawn_file_actions_adddup2(posix_spawn_file_actions_t *file_actions, psfileact->psfaa_type = PSFA_DUP2; psfileact->psfaa_filedes = filedes; - psfileact->psfaa_openargs.psfao_oflag = newfiledes; + psfileact->psfaa_dup2args.psfad_newfiledes = newfiledes; + + return 0; +} + +/* + * posix_spawn_file_actions_add_fileportdup2_np + * + * Description: Add a dup2 action to the object referenced by 'file_actions' + * that will cause the file referenced by 'fileport' to be + * attempted to be dup2'ed to the descriptor 'newfiledes' in the + * spawned process. + * + * Parameters: file_actions File action object to augment + * filedes fileport to dup2 + * newfiledes fd to dup2 it to + * + * Returns: 0 Success + * EBADF fileport isn't a valid port, or the + * value specified by newfiledes is + * negative or greater than or equal to + * {OPEN_MAX}. + * ENOMEM Insufficient memory exists to add to + * the spawn file actions object. + * + * NOTIMP: Allowed failures (checking NOT required): + * EINVAL The value specified by file_actions is invalid. + */ +int +posix_spawn_file_actions_add_fileportdup2_np( + posix_spawn_file_actions_t *file_actions, + mach_port_t fileport, int newfiledes) +{ + _posix_spawn_file_actions_t *psactsp; + _psfa_action_t *psfileact; + + if (file_actions == NULL || *file_actions == NULL) { + return EINVAL; + } + + psactsp = (_posix_spawn_file_actions_t *)file_actions; + /* Range check; required by POSIX */ + if (!MACH_PORT_VALID(fileport) || + newfiledes < 0 || newfiledes >= OPEN_MAX) { + return EBADF; + } + + /* If we do not have enough slots, grow the structure */ + if ((*psactsp)->psfa_act_count == (*psactsp)->psfa_act_alloc) { + /* need to grow file actions structure */ + if (_posix_spawn_file_actions_grow(psactsp)) { + return ENOMEM; + } + } + + /* + * Allocate next available slot and fill it out + */ + psfileact = &(*psactsp)->psfa_act_acts[(*psactsp)->psfa_act_count++]; + + psfileact->psfaa_type = PSFA_FILEPORT_DUP2; + psfileact->psfaa_fileport = fileport; + psfileact->psfaa_dup2args.psfad_newfiledes = newfiledes; return 0; } @@ -1351,6 +1445,117 @@ posix_spawn_file_actions_addinherit_np(posix_spawn_file_actions_t *file_actions, return 0; } + +/* + * posix_spawn_file_actions_addchdir_np + * + * Description: Add a chdir action to the object referenced by 'file_actions' + * that will cause the current working directory to attempt to be changed + * to that referenced by 'path' in the spawned process. + * + * Parameters: file_actions File action object to augment + * path path of the desired working directory + * + * Returns: 0 Success + * ENOMEM Insufficient memory exists to add to + * the spawn file actions object. + * ENAMETOOLONG The supplied path exceeded PATH_MAX. + * + * NOTIMP: Allowed failures (checking NOT required): + * EINVAL The value specified by file_actions is invalid. + */ +int +posix_spawn_file_actions_addchdir_np( + posix_spawn_file_actions_t * __restrict file_actions, + const char * __restrict path) +{ + _posix_spawn_file_actions_t *psactsp; + _psfa_action_t *psfileact; + + if (file_actions == NULL || *file_actions == NULL) { + return EINVAL; + } + + psactsp = (_posix_spawn_file_actions_t *)file_actions; + + /* If we do not have enough slots, grow the structure */ + if ((*psactsp)->psfa_act_count == (*psactsp)->psfa_act_alloc) { + /* need to grow file actions structure */ + if (_posix_spawn_file_actions_grow(psactsp)) { + return ENOMEM; + } + } + + /* + * Allocate next available slot and fill it out + */ + psfileact = &(*psactsp)->psfa_act_acts[(*psactsp)->psfa_act_count++]; + + psfileact->psfaa_type = PSFA_CHDIR; + if (strlcpy(psfileact->psfaa_chdirargs.psfac_path, path, PATH_MAX) >= PATH_MAX) { + (*psactsp)->psfa_act_count--; + return ENAMETOOLONG; + } + + return 0; +} + + +/* + * posix_spawn_file_actions_fchdir_np + * + * Description: Add a fchdir action to the object referenced by 'file_actions' + * that will cause the current working directory to attempt to be changed + * to that referenced by the descriptor 'filedes' in the spawned process. + * + * Parameters: file_actions File action object to augment + * filedes fd to chdir to + * + * Returns: 0 Success + * EBADF The value specified by either fildes is negative or + * greater than or equal to {OPEN_MAX}. + * ENOMEM Insufficient memory exists to add to + * the spawn file actions object. + * + * NOTIMP: Allowed failures (checking NOT required): + * EINVAL The value specified by file_actions is invalid. + */ +int +posix_spawn_file_actions_addfchdir_np(posix_spawn_file_actions_t *file_actions, + int filedes) +{ + _posix_spawn_file_actions_t *psactsp; + _psfa_action_t *psfileact; + + if (file_actions == NULL || *file_actions == NULL) { + return EINVAL; + } + + psactsp = (_posix_spawn_file_actions_t *)file_actions; + /* Range check; in spirit of POSIX */ + if (filedes < 0 || filedes >= OPEN_MAX) { + return EBADF; + } + + /* If we do not have enough slots, grow the structure */ + if ((*psactsp)->psfa_act_count == (*psactsp)->psfa_act_alloc) { + /* need to grow file actions structure */ + if (_posix_spawn_file_actions_grow(psactsp)) { + return ENOMEM; + } + } + + /* + * Allocate next available slot and fill it out + */ + psfileact = &(*psactsp)->psfa_act_acts[(*psactsp)->psfa_act_count++]; + + psfileact->psfaa_type = PSFA_FCHDIR; + psfileact->psfaa_filedes = filedes; + + return 0; +} + int posix_spawnattr_setcpumonitor_default(posix_spawnattr_t * __restrict attr) { @@ -1393,7 +1598,7 @@ posix_spawnattr_getcpumonitor(posix_spawnattr_t * __restrict attr, return 0; } -#if TARGET_OS_EMBEDDED +#if (TARGET_OS_IPHONE && !TARGET_OS_SIMULATOR) /* * posix_spawnattr_setjetsam * @@ -1427,7 +1632,7 @@ posix_spawnattr_setjetsam(posix_spawnattr_t * __restrict attr, return posix_spawnattr_setjetsam_ext(attr, flags_ext, priority, memlimit, memlimit); } -#endif /* TARGET_OS_EMBEDDED */ +#endif /* (TARGET_OS_IPHONE && !TARGET_OS_SIMULATOR) */ /* * posix_spawnattr_setjetsam_ext @@ -1520,7 +1725,7 @@ posix_spawnattr_set_importancewatch_port_np(posix_spawnattr_t * __restrict attr, .port_type = PSPA_IMP_WATCHPORTS, .new_port = portarray[i], }; - int err = posix_spawn_appendportaction_np(attr, &action); + err = posix_spawn_appendportaction_np(attr, &action); if (err) { break; } @@ -1528,6 +1733,28 @@ posix_spawnattr_set_importancewatch_port_np(posix_spawnattr_t * __restrict attr, return err; } +int +posix_spawnattr_set_registered_ports_np(posix_spawnattr_t * __restrict attr, + mach_port_t portarray[], uint32_t count) +{ + int err = 0; + + if (count > TASK_PORT_REGISTER_MAX) { + return EINVAL; + } + + for (uint32_t i = 0; i < count; i++) { + _ps_port_action_t action = { + .port_type = PSPA_REGISTERED_PORTS, + .new_port = portarray[i], + }; + err = posix_spawn_appendportaction_np(attr, &action); + if (err) { + break; + } + } + return err; +} static @@ -1621,6 +1848,31 @@ posix_spawnattr_setmacpolicyinfo_np(posix_spawnattr_t * __restrict attr, return 0; } +/* + * posix_spawn_destroymacpolicy_info_np + * Description: cleanup the macpolicy struct in posix_spawnattr_t attr + */ +static int +posix_spawn_destroymacpolicy_info_np(posix_spawnattr_t *attr) +{ + _posix_spawnattr_t psattr; + _posix_spawn_mac_policy_extensions_t psmx; + + if (attr == NULL || *attr == NULL) { + return EINVAL; + } + + psattr = *(_posix_spawnattr_t *)attr; + psmx = psattr->psa_mac_extensions; + if (psmx == NULL) { + return EINVAL; + } + + psattr->psa_mac_extensions = NULL; + free(psmx); + return 0; +} + int posix_spawnattr_setcoalition_np(const posix_spawnattr_t * __restrict attr, uint64_t coalitionid, int type, int role) @@ -1745,6 +1997,7 @@ posix_spawnattr_set_persona_np(const posix_spawnattr_t * __restrict attr, uid_t persona->pspi_gid = 0; persona->pspi_ngroups = 0; persona->pspi_groups[0] = 0; + persona->pspi_gmuid = 0; psattr->psa_persona_info = persona; } @@ -1864,6 +2117,199 @@ posix_spawnattr_set_max_addr_np(const posix_spawnattr_t * __restrict attr, uint6 return 0; } +static struct _posix_spawn_posix_cred_info * +_posix_spawnattr_get_posix_creds_info(_posix_spawnattr_t psattr) +{ + struct _posix_spawn_posix_cred_info *pspci = psattr->psa_posix_cred_info; + + if (pspci == NULL) { + pspci = malloc(sizeof(struct _posix_spawn_posix_cred_info)); + if (pspci != NULL) { + pspci->pspci_flags = 0; + pspci->pspci_uid = 0; + pspci->pspci_gid = 0; + pspci->pspci_ngroups = 0; + pspci->pspci_groups[0] = 0; + pspci->pspci_gmuid = 0; + pspci->pspci_login[0] = '\0'; + psattr->psa_posix_cred_info = pspci; + } + } + return pspci; +} + +int +posix_spawnattr_set_uid_np(const posix_spawnattr_t *attr, uid_t uid) +{ + struct _posix_spawn_posix_cred_info *pspci; + + if (attr == NULL || *attr == NULL) { + return EINVAL; + } + + pspci = _posix_spawnattr_get_posix_creds_info(*(_posix_spawnattr_t *)attr); + if (pspci == NULL) { + return ENOMEM; + } + + pspci->pspci_uid = uid; + + pspci->pspci_flags |= POSIX_SPAWN_POSIX_CRED_UID; + + return 0; +} + +int +posix_spawnattr_set_gid_np(const posix_spawnattr_t *attr, gid_t gid) +{ + struct _posix_spawn_posix_cred_info *pspci; + + if (attr == NULL || *attr == NULL) { + return EINVAL; + } + + pspci = _posix_spawnattr_get_posix_creds_info(*(_posix_spawnattr_t *)attr); + if (pspci == NULL) { + return ENOMEM; + } + + pspci->pspci_gid = gid; + + pspci->pspci_flags |= POSIX_SPAWN_POSIX_CRED_GID; + + return 0; +} + +int +posix_spawnattr_set_groups_np(const posix_spawnattr_t *attr, + int ngroups, gid_t *gidarray, uid_t gmuid) +{ + struct _posix_spawn_posix_cred_info *pspci; + + if (attr == NULL || *attr == NULL) { + return EINVAL; + } + + if (gidarray == NULL) { + return EINVAL; + } + + if (ngroups > NGROUPS || ngroups < 0) { + return EINVAL; + } + + pspci = _posix_spawnattr_get_posix_creds_info(*(_posix_spawnattr_t *)attr); + if (pspci == NULL) { + return ENOMEM; + } + + pspci->pspci_ngroups = ngroups; + for (int i = 0; i < ngroups; i++) { + pspci->pspci_groups[i] = gidarray[i]; + } + + pspci->pspci_gmuid = gmuid; + + pspci->pspci_flags |= POSIX_SPAWN_POSIX_CRED_GROUPS; + + return 0; +} + +int +posix_spawnattr_set_login_np(const posix_spawnattr_t *attr, const char *login) +{ + struct _posix_spawn_posix_cred_info *pspci; + + if (attr == NULL || *attr == NULL) { + return EINVAL; + } + + if (strlen(login) > MAXLOGNAME) { + return ERANGE; + } + + pspci = _posix_spawnattr_get_posix_creds_info(*(_posix_spawnattr_t *)attr); + if (pspci == NULL) { + return ENOMEM; + } + + strlcpy(pspci->pspci_login, login, sizeof(pspci->pspci_login)); + + pspci->pspci_flags |= POSIX_SPAWN_POSIX_CRED_LOGIN; + + return 0; +} + +/* + * posix_spawnattr_set_jetsam_ttr_np + * + * Description: Pass data regarding recent relaunch behavior when jetsammed for the process. + * The recent history is effectively converted into a histogram and the highest + * frequency bucket defines the "type" of the process. The type is passed along + * to the jetsam code as part of psa_jetsam_flags. + * + * Parameters: count Number of entries in the ttrs_millis array + * ttrs_millis Array of raw data for relaunch behavior + * + * Returns: 0 Success + * EINVAL Bad attr pointer or empty data array + */ +int +posix_spawnattr_set_jetsam_ttr_np(const posix_spawnattr_t * __restrict attr, uint32_t count, uint32_t *ttrs_millis) +{ + _posix_spawnattr_t psattr; + + /* + * Define the bucketizing policy which would be used to generate the histogram. These + * values are based on looking at data from various Avg. Joanna runs. + */ + static const uint32_t relaunch_buckets_msecs[POSIX_SPAWN_JETSAM_RELAUNCH_BEHAVIOR_BUCKETS] = { + 5000, + 10000, + UINT32_MAX + }; + static const uint32_t relaunch_jetsam_flags[POSIX_SPAWN_JETSAM_RELAUNCH_BEHAVIOR_BUCKETS] = { + POSIX_SPAWN_JETSAM_RELAUNCH_BEHAVIOR_HIGH, + POSIX_SPAWN_JETSAM_RELAUNCH_BEHAVIOR_MED, + POSIX_SPAWN_JETSAM_RELAUNCH_BEHAVIOR_LOW + }; + + /* Make sure the attr pointer is valid */ + if (attr == NULL || *attr == NULL) { + return EINVAL; + } + + /* Make sure the count of entries is non-zero */ + if (count == 0) { + return EINVAL; + } + + psattr = *(_posix_spawnattr_t *)attr; + + /* Generate a histogram based on the relaunch data while maintaining highest frequency bucket info */ + int relaunch_histogram[POSIX_SPAWN_JETSAM_RELAUNCH_BEHAVIOR_BUCKETS] = {0}; + int max_frequency = -1; + int highest_frequency_bucket = -1; + + for (uint32_t i = 0; i < count; i++) { + /* For each data point passed in via launchd, find the bucket it lands in */ + for (uint32_t bucket = 0; bucket < POSIX_SPAWN_JETSAM_RELAUNCH_BEHAVIOR_BUCKETS; bucket++) { + if (ttrs_millis[i] <= relaunch_buckets_msecs[bucket]) { + relaunch_histogram[bucket]++; + + /* Check if the bucket is the highest frequency bucket now */ + if (relaunch_histogram[bucket] > max_frequency) { + max_frequency = relaunch_histogram[bucket]; + highest_frequency_bucket = bucket; + } + break; + } + } + } + psattr->psa_jetsam_flags |= relaunch_jetsam_flags[highest_frequency_bucket]; + return 0; +} + /* * posix_spawn * @@ -1955,6 +2401,10 @@ posix_spawn(pid_t * __restrict pid, const char * __restrict path, ad.persona_info_size = sizeof(struct _posix_spawn_persona_info); ad.persona_info = psattr->psa_persona_info; } + if (psattr->psa_posix_cred_info != NULL) { + ad.posix_cred_info_size = sizeof(struct _posix_spawn_posix_cred_info); + ad.posix_cred_info = psattr->psa_posix_cred_info; + } } if (file_actions != NULL && *file_actions != NULL) { _posix_spawn_file_actions_t psactsp = diff --git a/libsyscall/wrappers/spawn/spawn.h b/libsyscall/wrappers/spawn/spawn.h index 7fa018967..1b83c9d96 100644 --- a/libsyscall/wrappers/spawn/spawn.h +++ b/libsyscall/wrappers/spawn/spawn.h @@ -57,73 +57,56 @@ __BEGIN_DECLS * a dummy argument name is added. */ -__WATCHOS_PROHIBITED __TVOS_PROHIBITED int posix_spawn(pid_t * __restrict, const char * __restrict, const posix_spawn_file_actions_t *, const posix_spawnattr_t * __restrict, char *const __argv[__restrict], - char *const __envp[__restrict]) __OSX_AVAILABLE_STARTING(__MAC_10_5, __IPHONE_2_0); + char *const __envp[__restrict]) __API_AVAILABLE(macos(10.5), ios(2.0)) __SPI_AVAILABLE(watchos(2.0), tvos(9.0), bridgeos(1.0)); -__WATCHOS_PROHIBITED __TVOS_PROHIBITED int posix_spawnp(pid_t * __restrict, const char * __restrict, const posix_spawn_file_actions_t *, const posix_spawnattr_t * __restrict, char *const __argv[__restrict], - char *const __envp[__restrict]) __OSX_AVAILABLE_STARTING(__MAC_10_5, __IPHONE_2_0); + char *const __envp[__restrict]) __API_AVAILABLE(macos(10.5), ios(2.0)); -__WATCHOS_PROHIBITED __TVOS_PROHIBITED -int posix_spawn_file_actions_addclose(posix_spawn_file_actions_t *, int) __OSX_AVAILABLE_STARTING(__MAC_10_5, __IPHONE_2_0); +int posix_spawn_file_actions_addclose(posix_spawn_file_actions_t *, int) __API_AVAILABLE(macos(10.5), ios(2.0)) __SPI_AVAILABLE(watchos(2.0), tvos(9.0), bridgeos(1.0)); -__WATCHOS_PROHIBITED __TVOS_PROHIBITED int posix_spawn_file_actions_adddup2(posix_spawn_file_actions_t *, int, - int) __OSX_AVAILABLE_STARTING(__MAC_10_5, __IPHONE_2_0); + int) __API_AVAILABLE(macos(10.5), ios(2.0)) __SPI_AVAILABLE(watchos(2.0), tvos(9.0), bridgeos(1.0)); -__WATCHOS_PROHIBITED __TVOS_PROHIBITED int posix_spawn_file_actions_addopen( posix_spawn_file_actions_t * __restrict, int, - const char * __restrict, int, mode_t) __OSX_AVAILABLE_STARTING(__MAC_10_5, __IPHONE_2_0); + const char * __restrict, int, mode_t) __API_AVAILABLE(macos(10.5), ios(2.0)) __SPI_AVAILABLE(watchos(2.0), tvos(9.0), bridgeos(1.0)); -__WATCHOS_PROHIBITED __TVOS_PROHIBITED -int posix_spawn_file_actions_destroy(posix_spawn_file_actions_t *) __OSX_AVAILABLE_STARTING(__MAC_10_5, __IPHONE_2_0); +int posix_spawn_file_actions_destroy(posix_spawn_file_actions_t *) __API_AVAILABLE(macos(10.5), ios(2.0)) __SPI_AVAILABLE(watchos(2.0), tvos(9.0), bridgeos(1.0)); -__WATCHOS_PROHIBITED __TVOS_PROHIBITED -int posix_spawn_file_actions_init(posix_spawn_file_actions_t *) __OSX_AVAILABLE_STARTING(__MAC_10_5, __IPHONE_2_0); +int posix_spawn_file_actions_init(posix_spawn_file_actions_t *) __API_AVAILABLE(macos(10.5), ios(2.0)) __SPI_AVAILABLE(watchos(2.0), tvos(9.0), bridgeos(1.0)); -__WATCHOS_PROHIBITED __TVOS_PROHIBITED -int posix_spawnattr_destroy(posix_spawnattr_t *) __OSX_AVAILABLE_STARTING(__MAC_10_5, __IPHONE_2_0); +int posix_spawnattr_destroy(posix_spawnattr_t *) __API_AVAILABLE(macos(10.5), ios(2.0)) __SPI_AVAILABLE(watchos(2.0), tvos(9.0), bridgeos(1.0)); -__WATCHOS_PROHIBITED __TVOS_PROHIBITED int posix_spawnattr_getsigdefault(const posix_spawnattr_t * __restrict, - sigset_t * __restrict) __OSX_AVAILABLE_STARTING(__MAC_10_5, __IPHONE_2_0); + sigset_t * __restrict) __API_AVAILABLE(macos(10.5), ios(2.0)) __SPI_AVAILABLE(watchos(2.0), tvos(9.0), bridgeos(1.0)); -__WATCHOS_PROHIBITED __TVOS_PROHIBITED int posix_spawnattr_getflags(const posix_spawnattr_t * __restrict, - short * __restrict) __OSX_AVAILABLE_STARTING(__MAC_10_5, __IPHONE_2_0); + short * __restrict) __API_AVAILABLE(macos(10.5), ios(2.0)) __SPI_AVAILABLE(watchos(2.0), tvos(9.0), bridgeos(1.0)); -__WATCHOS_PROHIBITED __TVOS_PROHIBITED int posix_spawnattr_getpgroup(const posix_spawnattr_t * __restrict, - pid_t * __restrict) __OSX_AVAILABLE_STARTING(__MAC_10_5, __IPHONE_2_0); + pid_t * __restrict) __API_AVAILABLE(macos(10.5), ios(2.0)) __SPI_AVAILABLE(watchos(2.0), tvos(9.0), bridgeos(1.0)); -__WATCHOS_PROHIBITED __TVOS_PROHIBITED int posix_spawnattr_getsigmask(const posix_spawnattr_t * __restrict, - sigset_t * __restrict) __OSX_AVAILABLE_STARTING(__MAC_10_5, __IPHONE_2_0); + sigset_t * __restrict) __API_AVAILABLE(macos(10.5), ios(2.0)) __SPI_AVAILABLE(watchos(2.0), tvos(9.0), bridgeos(1.0)); -__WATCHOS_PROHIBITED __TVOS_PROHIBITED -int posix_spawnattr_init(posix_spawnattr_t *) __OSX_AVAILABLE_STARTING(__MAC_10_5, __IPHONE_2_0); +int posix_spawnattr_init(posix_spawnattr_t *) __API_AVAILABLE(macos(10.5), ios(2.0)) __SPI_AVAILABLE(watchos(2.0), tvos(9.0), bridgeos(1.0)); -__WATCHOS_PROHIBITED __TVOS_PROHIBITED int posix_spawnattr_setsigdefault(posix_spawnattr_t * __restrict, - const sigset_t * __restrict) __OSX_AVAILABLE_STARTING(__MAC_10_5, __IPHONE_2_0); + const sigset_t * __restrict) __API_AVAILABLE(macos(10.5), ios(2.0)) __SPI_AVAILABLE(watchos(2.0), tvos(9.0), bridgeos(1.0)); -__WATCHOS_PROHIBITED __TVOS_PROHIBITED -int posix_spawnattr_setflags(posix_spawnattr_t *, short) __OSX_AVAILABLE_STARTING(__MAC_10_5, __IPHONE_2_0); +int posix_spawnattr_setflags(posix_spawnattr_t *, short) __API_AVAILABLE(macos(10.5), ios(2.0)) __SPI_AVAILABLE(watchos(2.0), tvos(9.0), bridgeos(1.0)); -__WATCHOS_PROHIBITED __TVOS_PROHIBITED -int posix_spawnattr_setpgroup(posix_spawnattr_t *, pid_t) __OSX_AVAILABLE_STARTING(__MAC_10_5, __IPHONE_2_0); +int posix_spawnattr_setpgroup(posix_spawnattr_t *, pid_t) __API_AVAILABLE(macos(10.5), ios(2.0)) __SPI_AVAILABLE(watchos(2.0), tvos(9.0), bridgeos(1.0)); -__WATCHOS_PROHIBITED __TVOS_PROHIBITED int posix_spawnattr_setsigmask(posix_spawnattr_t * __restrict, - const sigset_t * __restrict) __OSX_AVAILABLE_STARTING(__MAC_10_5, __IPHONE_2_0); + const sigset_t * __restrict) __API_AVAILABLE(macos(10.5), ios(2.0)) __SPI_AVAILABLE(watchos(2.0), tvos(9.0), bridgeos(1.0)); #if 0 /* _POSIX_PRIORITY_SCHEDULING [PS] : not supported */ int posix_spawnattr_setschedparam(posix_spawnattr_t * __restrict, @@ -149,30 +132,30 @@ __END_DECLS __BEGIN_DECLS -__WATCHOS_PROHIBITED __TVOS_PROHIBITED int posix_spawnattr_getbinpref_np(const posix_spawnattr_t * __restrict, - size_t, cpu_type_t *__restrict, size_t *__restrict) __OSX_AVAILABLE_STARTING(__MAC_10_5, __IPHONE_2_0); + size_t, cpu_type_t *__restrict, size_t *__restrict) __API_AVAILABLE(macos(10.5), ios(2.0)) __SPI_AVAILABLE(watchos(2.0), tvos(9.0), bridgeos(1.0)); -__WATCHOS_PROHIBITED __TVOS_PROHIBITED int posix_spawnattr_setauditsessionport_np(posix_spawnattr_t * __restrict, - mach_port_t) __OSX_AVAILABLE_STARTING(__MAC_10_6, __IPHONE_3_2); + mach_port_t) __API_AVAILABLE(macos(10.6), ios(3.2)); -__WATCHOS_PROHIBITED __TVOS_PROHIBITED int posix_spawnattr_setbinpref_np(posix_spawnattr_t * __restrict, - size_t, cpu_type_t *__restrict, size_t *__restrict) __OSX_AVAILABLE_STARTING(__MAC_10_5, __IPHONE_2_0); + size_t, cpu_type_t *__restrict, size_t *__restrict) __API_AVAILABLE(macos(10.5), ios(2.0)) __SPI_AVAILABLE(watchos(2.0), tvos(9.0), bridgeos(1.0)); -__WATCHOS_PROHIBITED __TVOS_PROHIBITED int posix_spawnattr_setexceptionports_np(posix_spawnattr_t * __restrict, exception_mask_t, mach_port_t, - exception_behavior_t, thread_state_flavor_t) __OSX_AVAILABLE_STARTING(__MAC_10_5, __IPHONE_2_0); + exception_behavior_t, thread_state_flavor_t) __API_AVAILABLE(macos(10.5), ios(2.0)) __SPI_AVAILABLE(watchos(2.0), tvos(9.0), bridgeos(1.0)); -__WATCHOS_PROHIBITED __TVOS_PROHIBITED int posix_spawnattr_setspecialport_np(posix_spawnattr_t * __restrict, - mach_port_t, int) __OSX_AVAILABLE_STARTING(__MAC_10_5, __IPHONE_2_0); + mach_port_t, int) __API_AVAILABLE(macos(10.5), ios(2.0)) __SPI_AVAILABLE(watchos(2.0), tvos(9.0), bridgeos(1.0)); -__WATCHOS_PROHIBITED __TVOS_PROHIBITED int posix_spawn_file_actions_addinherit_np(posix_spawn_file_actions_t *, - int) __OSX_AVAILABLE_STARTING(__MAC_10_7, __IPHONE_4_3); + int) __API_AVAILABLE(macos(10.7), ios(4.3)) __SPI_AVAILABLE(watchos(2.0), tvos(9.0), bridgeos(1.0)); + +int posix_spawn_file_actions_addchdir_np(posix_spawn_file_actions_t *, + const char * __restrict) __API_AVAILABLE(macos(10.15)) __SPI_AVAILABLE(ios(13.0), tvos(13.0), watchos(6.0), bridgeos(4.0)); + +int posix_spawn_file_actions_addfchdir_np(posix_spawn_file_actions_t *, + int) __API_AVAILABLE(macos(10.15)) __SPI_AVAILABLE(ios(13.0), tvos(13.0), watchos(6.0), bridgeos(4.0)); __END_DECLS diff --git a/libsyscall/wrappers/spawn/spawn_private.h b/libsyscall/wrappers/spawn/spawn_private.h index 41878e746..aa2897d33 100644 --- a/libsyscall/wrappers/spawn/spawn_private.h +++ b/libsyscall/wrappers/spawn/spawn_private.h @@ -30,47 +30,60 @@ #include #include -int posix_spawnattr_getpcontrol_np(const posix_spawnattr_t * __restrict, int * __restrict) __OSX_AVAILABLE_STARTING(__MAC_10_6, __IPHONE_3_2); -int posix_spawnattr_setpcontrol_np(posix_spawnattr_t *, const int) __OSX_AVAILABLE_STARTING(__MAC_10_6, __IPHONE_3_2); +int posix_spawnattr_getpcontrol_np(const posix_spawnattr_t * __restrict, int * __restrict) __API_AVAILABLE(macos(10.6), ios(3.2)); +int posix_spawnattr_setpcontrol_np(posix_spawnattr_t *, const int) __API_AVAILABLE(macos(10.6), ios(3.2)); -int posix_spawnattr_getprocesstype_np(const posix_spawnattr_t * __restrict, int * __restrict) __OSX_AVAILABLE_STARTING(__MAC_10_8, __IPHONE_6_0); -int posix_spawnattr_setprocesstype_np(posix_spawnattr_t *, const int) __OSX_AVAILABLE_STARTING(__MAC_10_8, __IPHONE_6_0); +int posix_spawnattr_getprocesstype_np(const posix_spawnattr_t * __restrict, int * __restrict) __API_AVAILABLE(macos(10.8), ios(6.0)); +int posix_spawnattr_setprocesstype_np(posix_spawnattr_t *, const int) __API_AVAILABLE(macos(10.8), ios(6.0)); -int posix_spawnattr_setcpumonitor(posix_spawnattr_t * __restrict, uint64_t, uint64_t) __OSX_AVAILABLE_STARTING(__MAC_10_8, __IPHONE_6_0); -int posix_spawnattr_getcpumonitor(posix_spawnattr_t * __restrict, uint64_t *, uint64_t *) __OSX_AVAILABLE_STARTING(__MAC_10_8, __IPHONE_6_0); -int posix_spawnattr_setcpumonitor_default(posix_spawnattr_t * __restrict) __OSX_AVAILABLE_STARTING(__MAC_10_9, __IPHONE_6_0); +int posix_spawnattr_setcpumonitor(posix_spawnattr_t * __restrict, uint64_t, uint64_t) __API_AVAILABLE(macos(10.8), ios(6.0)); +int posix_spawnattr_getcpumonitor(posix_spawnattr_t * __restrict, uint64_t *, uint64_t *) __API_AVAILABLE(macos(10.8), ios(6.0)); +int posix_spawnattr_setcpumonitor_default(posix_spawnattr_t * __restrict) __API_AVAILABLE(macos(10.9), ios(6.0)); -#if TARGET_OS_EMBEDDED +#if (TARGET_OS_IPHONE && !TARGET_OS_SIMULATOR) int posix_spawnattr_setjetsam(posix_spawnattr_t * __restrict attr, - short flags, int priority, int memlimit) __OSX_AVAILABLE_STARTING(__MAC_NA, __IPHONE_5_0); -#endif /* TARGET_OS_EMBEDDED */ + short flags, int priority, int memlimit) __API_UNAVAILABLE(macos) __API_AVAILABLE(ios(5.0)); +#endif /* (TARGET_OS_IPHONE && !TARGET_OS_SIMULATOR) */ int posix_spawnattr_setjetsam_ext(posix_spawnattr_t * __restrict attr, - short flags, int priority, int memlimit_active, int memlimit_inactive) __OSX_AVAILABLE_STARTING(__MAC_10_11, __IPHONE_9_0); + short flags, int priority, int memlimit_active, int memlimit_inactive) __API_AVAILABLE(macos(10.11), ios(9.0)); + +// time-to-relaunch after jetsam, set by launchd +int posix_spawnattr_set_jetsam_ttr_np(const posix_spawnattr_t * __restrict attr, uint32_t count, uint32_t *ttrs_millis) __OSX_AVAILABLE_STARTING(__MAC_10_15, __IPHONE_13_0); int posix_spawnattr_set_threadlimit_ext(posix_spawnattr_t * __restrict attr, - int thread_limit); + int thread_limit) __API_AVAILABLE(macos(10.14), ios(12.0), tvos(12.0), watchos(5.0)); #define POSIX_SPAWN_IMPORTANCE_PORT_COUNT 32 int posix_spawnattr_set_importancewatch_port_np(posix_spawnattr_t * __restrict attr, - int count, mach_port_t portarray[]) __OSX_AVAILABLE_STARTING(__MAC_10_9, __IPHONE_6_0); + int count, mach_port_t portarray[]) __API_AVAILABLE(macos(10.9), ios(6.0)); + +int posix_spawnattr_set_registered_ports_np(posix_spawnattr_t * __restrict attr, mach_port_t portarray[], uint32_t count) __API_AVAILABLE(macos(10.15), ios(13.0), tvos(13.0), watchos(6.0)); #define POSIX_SPAWN_MACPOLICYINFO_WITHSIZE 1 -int posix_spawnattr_getmacpolicyinfo_np(const posix_spawnattr_t * __restrict, const char *, void **, size_t *) __OSX_AVAILABLE_STARTING(__MAC_10_9, __IPHONE_7_0); -int posix_spawnattr_setmacpolicyinfo_np(posix_spawnattr_t * __restrict, const char *, void *, size_t) __OSX_AVAILABLE_STARTING(__MAC_10_9, __IPHONE_7_0); +int posix_spawnattr_getmacpolicyinfo_np(const posix_spawnattr_t * __restrict, const char *, void **, size_t *) __API_AVAILABLE(macos(10.9), ios(7.0)); +int posix_spawnattr_setmacpolicyinfo_np(posix_spawnattr_t * __restrict, const char *, void *, size_t) __API_AVAILABLE(macos(10.9), ios(7.0)); + +int posix_spawnattr_setcoalition_np(const posix_spawnattr_t * __restrict, uint64_t, int, int) __API_AVAILABLE(macos(10.10), ios(8.0)); + +int posix_spawnattr_set_qos_clamp_np(const posix_spawnattr_t * __restrict, uint64_t) __API_AVAILABLE(macos(10.10), ios(8.0)); +int posix_spawnattr_get_qos_clamp_np(const posix_spawnattr_t * __restrict, uint64_t * __restrict) __API_AVAILABLE(macos(10.10), ios(8.0)); + +int posix_spawnattr_set_darwin_role_np(const posix_spawnattr_t * __restrict, uint64_t) __API_AVAILABLE(macos(10.11), ios(9.0)); +int posix_spawnattr_get_darwin_role_np(const posix_spawnattr_t * __restrict, uint64_t * __restrict) __API_AVAILABLE(macos(10.11), ios(9.0)); -int posix_spawnattr_setcoalition_np(const posix_spawnattr_t * __restrict, uint64_t, int, int) __OSX_AVAILABLE_STARTING(__MAC_10_10, __IPHONE_8_0); +int posix_spawnattr_set_persona_np(const posix_spawnattr_t * __restrict, uid_t, uint32_t) __API_AVAILABLE(macos(10.11), ios(9.0)); +int posix_spawnattr_set_persona_uid_np(const posix_spawnattr_t * __restrict, uid_t) __API_AVAILABLE(macos(10.11), ios(9.0)); +int posix_spawnattr_set_persona_gid_np(const posix_spawnattr_t * __restrict, gid_t) __API_AVAILABLE(macos(10.11), ios(9.0)); +int posix_spawnattr_set_persona_groups_np(const posix_spawnattr_t * __restrict, int, gid_t * __restrict, uid_t) __API_AVAILABLE(macos(10.11), ios(9.0)); -int posix_spawnattr_set_qos_clamp_np(const posix_spawnattr_t * __restrict, uint64_t) __OSX_AVAILABLE_STARTING(__MAC_10_10, __IPHONE_8_0); -int posix_spawnattr_get_qos_clamp_np(const posix_spawnattr_t * __restrict, uint64_t * __restrict) __OSX_AVAILABLE_STARTING(__MAC_10_10, __IPHONE_8_0); +int posix_spawnattr_set_max_addr_np(const posix_spawnattr_t * __restrict attr, uint64_t max_addr) __API_AVAILABLE(macos(10.14), ios(12.0), tvos(12.0), watchos(5.0)); -int posix_spawnattr_set_darwin_role_np(const posix_spawnattr_t * __restrict, uint64_t) __OSX_AVAILABLE_STARTING(__MAC_10_11, __IPHONE_9_0); -int posix_spawnattr_get_darwin_role_np(const posix_spawnattr_t * __restrict, uint64_t * __restrict) __OSX_AVAILABLE_STARTING(__MAC_10_11, __IPHONE_9_0); +int posix_spawnattr_set_uid_np(const posix_spawnattr_t * __restrict, uid_t) __API_AVAILABLE(macos(10.15), ios(13.0), tvos(13.0), watchos(6.0)); +int posix_spawnattr_set_gid_np(const posix_spawnattr_t * __restrict, gid_t) __API_AVAILABLE(macos(10.15), ios(13.0), tvos(13.0), watchos(6.0)); +int posix_spawnattr_set_groups_np(const posix_spawnattr_t * __restrict, int, gid_t * __restrict, uid_t) __API_AVAILABLE(macos(10.15), ios(13.0), tvos(13.0), watchos(6.0)); +int posix_spawnattr_set_login_np(const posix_spawnattr_t * __restrict, const char * __restrict) __API_AVAILABLE(macos(10.15), ios(13.0), tvos(13.0), watchos(6.0)); -int posix_spawnattr_set_persona_np(const posix_spawnattr_t * __restrict, uid_t, uint32_t) __OSX_AVAILABLE_STARTING(__MAC_10_11, __IPHONE_9_0); -int posix_spawnattr_set_persona_uid_np(const posix_spawnattr_t * __restrict, uid_t) __OSX_AVAILABLE_STARTING(__MAC_10_11, __IPHONE_9_0); -int posix_spawnattr_set_persona_gid_np(const posix_spawnattr_t * __restrict, gid_t) __OSX_AVAILABLE_STARTING(__MAC_10_11, __IPHONE_9_0); -int posix_spawnattr_set_persona_groups_np(const posix_spawnattr_t * __restrict, int, gid_t *, uid_t) __OSX_AVAILABLE_STARTING(__MAC_10_11, __IPHONE_9_0); -int posix_spawnattr_set_max_addr_np(const posix_spawnattr_t * __restrict attr, uint64_t max_addr) __OSX_AVAILABLE_STARTING(__MAC_10_14, __IPHONE_12_0); +int posix_spawn_file_actions_add_fileportdup2_np(posix_spawn_file_actions_t * __restrict, mach_port_t, int) __API_AVAILABLE(macos(10.15), ios(13.0), tvos(13.0), watchos(6.0)); #endif /* !defined _SPAWN_PRIVATE_H_*/ diff --git a/libsyscall/wrappers/terminate_with_reason.c b/libsyscall/wrappers/terminate_with_reason.c index 52082bb68..dd7719b47 100644 --- a/libsyscall/wrappers/terminate_with_reason.c +++ b/libsyscall/wrappers/terminate_with_reason.c @@ -37,7 +37,7 @@ void __abort_with_payload(uint32_t reason_namespace, uint64_t reason_code, static void abort_with_payload_wrapper_internal(uint32_t reason_namespace, uint64_t reason_code, void *payload, uint32_t payload_size, const char *reason_string, - uint64_t reason_flags) __attribute__((noreturn)); + uint64_t reason_flags) __attribute__((noreturn, cold)); /* System call wrappers */ int diff --git a/libsyscall/xcodescripts/compile-syscalls.pl b/libsyscall/xcodescripts/compile-syscalls.pl index f0c269132..f278f1dba 100755 --- a/libsyscall/xcodescripts/compile-syscalls.pl +++ b/libsyscall/xcodescripts/compile-syscalls.pl @@ -63,7 +63,9 @@ my @CFLAGS = ( "-x assembler-with-cpp", "-c", "-isysroot", $ENV{'SDKROOT'} || "/", - "-I".$ENV{"SDKROOT"}."/System/Library/Frameworks/System.framework/PrivateHeaders", + "-I".$ENV{"SDKROOT"}."/".$ENV{"SDK_INSTALL_HEADERS_ROOT"}."/usr/include", + "-I".$ENV{"SDKROOT"}."/".$ENV{"SDK_INSTALL_HEADERS_ROOT"}."/usr/local/include", + "-I".$ENV{"SDKROOT"}."/".$ENV{"SDK_INSTALL_HEADERS_ROOT"}."/System/Library/Frameworks/System.framework/PrivateHeaders", ); chomp(my $LIBTOOL = `xcrun -sdk "$ENV{'SDKROOT'}" -find libtool`); diff --git a/libsyscall/xcodescripts/create-syscalls.pl b/libsyscall/xcodescripts/create-syscalls.pl index a4e17d689..6bf15db90 100755 --- a/libsyscall/xcodescripts/create-syscalls.pl +++ b/libsyscall/xcodescripts/create-syscalls.pl @@ -150,8 +150,10 @@ sub usage { # Read the syscall.master file and collect the system call names and number # of arguments. It looks for the NO_SYSCALL_STUB quailifier following the # prototype to determine if no automatic stub should be created by Libsystem. -# System call name that are already prefixed with double-underbar are set as -# if the NO_SYSCALL_STUB qualifier were specified (whether it is or not). +# +# The `sys_` prefix is stripped from syscall names, and is only kept for +# the kernel symbol in order to avoid namespace clashes and identify +# syscalls more easily. # # For the #if lines in syscall.master, all macros are assumed to be defined, # except COMPAT_GETFSSTAT (assumed undefined). @@ -186,6 +188,7 @@ sub readMaster { my $no_syscall_stub = /\)\s*NO_SYSCALL_STUB\s*;/; my($name, $args) = /\s(\S+)\s*\(([^)]*)\)/; next if $name =~ /e?nosys/; + $name =~ s/^sys_//; $args =~ s/^\s+//; $args =~ s/\s+$//; my $argbytes = 0; @@ -330,13 +333,13 @@ sub writeStubForSymbol { $arch =~ s/arm64(.*)/arm64/; push(@conditions, "defined(__${arch}__)") unless grep { $_ eq $arch } @{$$symbol{except}}; - if($arch == 'arm64') { + if($arch eq "arm64") { $has_arm64 = 1 unless grep { $_ eq $arch } @{$$symbol{except}}; } } - my %is_cancel; - for (@Cancelable) { $is_cancel{$_} = 1 }; + my %is_cancel; + for (@Cancelable) { $is_cancel{$_} = 1 }; print $f "#define __SYSCALL_32BIT_ARG_BYTES $$symbol{bytes}\n"; print $f "#include \"SYS.h\"\n\n"; diff --git a/libsyscall/xcodescripts/mach_install_mig.sh b/libsyscall/xcodescripts/mach_install_mig.sh index 40bbcbf74..0761c11fe 100755 --- a/libsyscall/xcodescripts/mach_install_mig.sh +++ b/libsyscall/xcodescripts/mach_install_mig.sh @@ -40,6 +40,11 @@ MIG_PRIVATE_HEADER_DST="$BUILT_PRODUCTS_DIR/mig_hdr/local/include/mach" SERVER_HEADER_DST="$BUILT_PRODUCTS_DIR/mig_hdr/include/servers" MACH_HEADER_DST="$BUILT_PRODUCTS_DIR/mig_hdr/include/mach" MACH_PRIVATE_HEADER_DST="$BUILT_PRODUCTS_DIR/mig_hdr/local/include/mach" +MIG_INTERNAL_HEADER_DST="$BUILT_PRODUCTS_DIR/internal_hdr/include/mach" +MIG_INCFLAGS="-I${SDKROOT}/${SDK_INSTALL_HEADERS_ROOT}/usr/include -I${SDKROOT}/${SDK_INSTALL_HEADERS_ROOT}/usr/local/include" +MIG_PRIVATE_DEFS_INCFLAGS="-I${SDKROOT}/${SDK_INSTALL_HEADERS_ROOT}/System/Library/Frameworks/System.framework/PrivateHeaders" +SRC="$SRCROOT/mach" +FILTER_MIG="$SRCROOT/xcodescripts/filter_mig.awk" # from old Libsystem makefiles MACHINE_ARCH=`echo $ARCHS | cut -d' ' -f 1` @@ -63,11 +68,6 @@ then MACHINE_ARCH="i386" fi -SRC="$SRCROOT/mach" -MIG_INTERNAL_HEADER_DST="$BUILT_PRODUCTS_DIR/internal_hdr/include/mach" -MIG_PRIVATE_DEFS_INCFLAGS="-I${SDKROOT}/System/Library/Frameworks/System.framework/PrivateHeaders" -FILTER_MIG="$SRCROOT/xcodescripts/filter_mig.awk" - ASROOT="" if [ `whoami` = "root" ]; then ASROOT="-o 0" @@ -83,6 +83,7 @@ MIGS="clock.defs mach_host.defs mach_port.defs mach_voucher.defs + memory_entry.defs processor.defs processor_set.defs task.defs @@ -146,7 +147,7 @@ for hdr in $MACH_PRIVATE_HDRS; do done # special case because we only have one to do here -$MIG -novouchers -arch $MACHINE_ARCH -header "$SERVER_HEADER_DST/netname.h" $SRC/servers/netname.defs +$MIG -novouchers -arch $MACHINE_ARCH -cc $MIGCC -header "$SERVER_HEADER_DST/netname.h" $MIG_INCFLAGS $SRC/servers/netname.defs # install /usr/include/mach mig headers @@ -155,7 +156,7 @@ mkdir -p $MIG_HEADER_OBJ for mig in $MIGS $MIGS_DUAL_PUBLIC_PRIVATE; do MIG_NAME=`basename $mig .defs` - $MIG -novouchers -arch $MACHINE_ARCH -cc $MIGCC -header "$MIG_HEADER_OBJ/$MIG_NAME.h" $MIG_DEFINES $SRC/$mig + $MIG -novouchers -arch $MACHINE_ARCH -cc $MIGCC -header "$MIG_HEADER_OBJ/$MIG_NAME.h" $MIG_DEFINES $MIG_INCFLAGS $SRC/$mig for filter in $MIG_FILTERS; do $FILTER_MIG $SRC/$filter $MIG_HEADER_OBJ/$MIG_NAME.h > $MIG_HEADER_OBJ/$MIG_NAME.tmp.h mv $MIG_HEADER_OBJ/$MIG_NAME.tmp.h $MIG_HEADER_OBJ/$MIG_NAME.h @@ -167,7 +168,7 @@ mkdir -p $MIG_PRIVATE_HEADER_DST for mig in $MIGS_PRIVATE $MIGS_DUAL_PUBLIC_PRIVATE; do MIG_NAME=`basename $mig .defs` - $MIG -novouchers -arch $MACHINE_ARCH -cc $MIGCC -header "$MIG_PRIVATE_HEADER_DST/$MIG_NAME.h" $MIG_DEFINES $MIG_PRIVATE_DEFS_INCFLAGS $SRC/$mig + $MIG -novouchers -arch $MACHINE_ARCH -cc $MIGCC -header "$MIG_PRIVATE_HEADER_DST/$MIG_NAME.h" $MIG_DEFINES $MIG_INCFLAGS $MIG_PRIVATE_DEFS_INCFLAGS $SRC/$mig if [ ! -e "$MIG_HEADER_DST/$MIG_NAME.h" ]; then echo "#error $MIG_NAME.h unsupported." > "$MIG_HEADER_DST/$MIG_NAME.h" fi @@ -182,6 +183,6 @@ mkdir -p $MIG_INTERNAL_HEADER_DST for mig in $MIGS_INTERNAL; do MIG_NAME=`basename $mig .defs` - $MIG -novouchers -arch $MACHINE_ARCH -cc $MIGCC -header "$MIG_INTERNAL_HEADER_DST/${MIG_NAME}_internal.h" $SRC/$mig + $MIG -novouchers -arch $MACHINE_ARCH -cc $MIGCC -header "$MIG_INTERNAL_HEADER_DST/${MIG_NAME}_internal.h" $MIG_INCFLAGS $SRC/$mig done diff --git a/makedefs/MakeInc.cmd b/makedefs/MakeInc.cmd index 3f6f71317..0c8420d13 100644 --- a/makedefs/MakeInc.cmd +++ b/makedefs/MakeInc.cmd @@ -10,30 +10,44 @@ # # Commands for the build environment # + ## # Verbosity ## + ifeq ($(RC_XBS),YES) -VERBOSE = YES -else -VERBOSE = NO -endif -ifeq ($(VERBOSE),YES) -_v = -_vstdout = + VERBOSE = YES else -_v = @ -_vstdout = > /dev/null + VERBOSE = NO endif -VERBOSE_GENERATED_MAKE_FRAGMENTS = NO +ECHO = echo + +LOG = echo +makelog = $(info $1) +ERR = $(ECHO) > /dev/stderr + +QUIET ?= 0 +ifneq ($(QUIET),0) + LOG = : + makelog = + ifeq ($(VERBOSE),YES) + override VERBOSE = NO + endif +endif ifeq ($(VERBOSE),YES) + _v = + _vstdout = XCRUN = /usr/bin/xcrun -verbose else + _v = @ + _vstdout = > /dev/null XCRUN = /usr/bin/xcrun endif +VERBOSE_GENERATED_MAKE_FRAGMENTS = NO + SDKROOT ?= macosx HOST_SDKROOT ?= macosx @@ -66,6 +80,15 @@ ifeq ($(PLATFORM),) endif endif +ifeq ($(PLATFORM),MacOSX) + ifeq (DriverKit,$(shell echo $(SDKROOT_RESOLVED) | sed 's,^.*/\([^/1-9]*\)[1-9][^/]*\.sdk$$,\1,')) + export PLATFORM := DriverKit + export DRIVERKIT ?= 1 + export DRIVERKITROOT ?= /System/DriverKit + export DRIVERKITRUNTIMEROOT = $(DRIVERKITROOT)/Runtime + endif +endif + ifeq ($(SDKVERSION),) export SDKVERSION := $(shell $(XCRUN) -sdk $(SDKROOT) -show-sdk-version) endif @@ -87,6 +110,9 @@ endif ifeq ($(MIGCC),) export MIGCC := $(CC) endif +ifeq ($(IIG),) + export IIG := $(shell $(XCRUN) -sdk $(SDKROOT) -find iig) +endif ifeq ($(STRIP),) export STRIP := $(shell $(XCRUN) -sdk $(SDKROOT) -find strip) endif @@ -123,7 +149,7 @@ endif # SUPPORTED_EMBEDDED_PLATFORMS := iPhoneOS iPhoneOSNano tvOS AppleTVOS WatchOS BridgeOS SUPPORTED_SIMULATOR_PLATFORMS := iPhoneSimulator iPhoneNanoSimulator tvSimulator AppleTVSimulator WatchSimulator -SUPPORTED_PLATFORMS := MacOSX $(SUPPORTED_SIMULATOR_PLATFORMS) $(SUPPORTED_EMBEDDED_PLATFORMS) +SUPPORTED_PLATFORMS := MacOSX DriverKit $(SUPPORTED_SIMULATOR_PLATFORMS) $(SUPPORTED_EMBEDDED_PLATFORMS) # Platform-specific tools ifneq ($(filter $(SUPPORTED_EMBEDDED_PLATFORMS),$(PLATFORM)),) @@ -170,7 +196,6 @@ TOUCH = /usr/bin/touch SLEEP = /bin/sleep AWK = /usr/bin/awk SED = /usr/bin/sed -ECHO = /bin/echo PLUTIL = /usr/bin/plutil # diff --git a/makedefs/MakeInc.def b/makedefs/MakeInc.def index de10f2053..a1030c34a 100644 --- a/makedefs/MakeInc.def +++ b/makedefs/MakeInc.def @@ -1,6 +1,6 @@ # -*- mode: makefile;-*- # -# Copyright (C) 1999-2017 Apple Inc. All rights reserved. +# Copyright (C) 1999-2019 Apple Inc. All rights reserved. # # MakeInc.def contains global definitions for building, # linking, and installing files. @@ -16,6 +16,7 @@ SUPPORTED_ARCH_CONFIGS := X86_64 X86_64H ARM ARM64 # SUPPORTED_KERNEL_CONFIGS = RELEASE DEVELOPMENT DEBUG PROFILE KASAN + # # Machine Configuration options # @@ -24,7 +25,7 @@ SUPPORTED_X86_64_MACHINE_CONFIGS = NONE SUPPORTED_X86_64H_MACHINE_CONFIGS = NONE SUPPORTED_ARM_MACHINE_CONFIGS = S7002 T8002 T8004 -SUPPORTED_ARM64_MACHINE_CONFIGS = S5L8960X T7000 T7001 S8000 S8001 T8010 T8011 BCM2837 +SUPPORTED_ARM64_MACHINE_CONFIGS = T7000 T7001 S8000 S8001 T8010 T8011 BCM2837 # @@ -50,7 +51,6 @@ COMPONENT_LIST = osfmk bsd libkern iokit pexpert libsa security san COMPONENT = $(if $(word 2,$(subst /, ,$(RELATIVE_SOURCE_PATH))),$(word 2,$(subst /, ,$(RELATIVE_SOURCE_PATH))),$(firstword $(subst /, ,$(RELATIVE_SOURCE_PATH)))) COMPONENT_IMPORT_LIST = $(filter-out $(COMPONENT),$(COMPONENT_LIST)) -MACHINE_FLAGS_ARM64_S5L8960X = -DARM64_BOARD_CONFIG_S5L8960X MACHINE_FLAGS_ARM64_T7000 = -DARM64_BOARD_CONFIG_T7000 MACHINE_FLAGS_ARM64_T7001 = -DARM64_BOARD_CONFIG_T7001 MACHINE_FLAGS_ARM_S7002 = -DARM_BOARD_CONFIG_S7002 @@ -67,21 +67,24 @@ MACHINE_FLAGS_ARM64_BCM2837 = -DARM64_BOARD_CONFIG_BCM2837 # Deployment target flag # ifeq ($(PLATFORM),MacOSX) - DEPLOYMENT_TARGET_FLAGS = -mmacosx-version-min=$(SDKVERSION) + DEPLOYMENT_TARGET_FLAGS = -mmacosx-version-min=$(SDKVERSION) -DXNU_TARGET_OS_OSX DEPLOYMENT_LINKER_FLAGS = -Wl,-macosx_version_min,$(SDKVERSION) +else ifeq ($(PLATFORM),DriverKit) + DEPLOYMENT_TARGET_FLAGS = -target x86_64-apple-driverkit$(SDKVERSION) -DXNU_TARGET_OS_OSX + DEPLOYMENT_LINKER_FLAGS = -Wl,-target,x86_64-apple-driverkit$(SDKVERSION) else ifeq ($(PLATFORM),WatchOS) DEPLOYMENT_TARGET_FLAGS = -mwatchos-version-min=$(SDKVERSION) -DXNU_TARGET_OS_WATCH DEPLOYMENT_LINKER_FLAGS = else ifeq ($(PLATFORM),tvOS) - DEPLOYMENT_TARGET_FLAGS = -mtvos-version-min=$(SDKVERSION) + DEPLOYMENT_TARGET_FLAGS = -mtvos-version-min=$(SDKVERSION) -DXNU_TARGET_OS_TV DEPLOYMENT_LINKER_FLAGS = else ifeq ($(PLATFORM),AppleTVOS) - DEPLOYMENT_TARGET_FLAGS = -mtvos-version-min=$(SDKVERSION) + DEPLOYMENT_TARGET_FLAGS = -mtvos-version-min=$(SDKVERSION) -DXNU_TARGET_OS_TV else ifeq ($(PLATFORM),BridgeOS) DEPLOYMENT_TARGET_FLAGS = -mbridgeos-version-min=$(SDKVERSION) -DXNU_TARGET_OS_BRIDGE DEPLOYMENT_LINKER_FLAGS = else ifneq ($(filter $(SUPPORTED_EMBEDDED_PLATFORMS),$(PLATFORM)),) - DEPLOYMENT_TARGET_FLAGS = -miphoneos-version-min=$(SDKVERSION) + DEPLOYMENT_TARGET_FLAGS = -miphoneos-version-min=$(SDKVERSION) -DXNU_TARGET_OS_IOS DEPLOYMENT_LINKER_FLAGS = -Wl,-ios_version_min,$(SDKVERSION) else ifneq ($(filter $(SUPPORTED_SIMULATOR_PLATFORMS),$(PLATFORM)),) DEPLOYMENT_TARGET_FLAGS = @@ -123,36 +126,30 @@ WERROR := -Werror endif # Shared C/C++ warning flags +# NOTE: order matters here. -Wno-xxx goes before opt-in of ones we want WARNFLAGS_STD := \ -Weverything \ -Wno-pedantic \ $(WERROR) \ - -Wno-assign-enum \ + -Wno-implicit-int-conversion \ + -Wno-sign-conversion \ + -Wno-shorten-64-to-32 \ -Wno-bad-function-cast \ - -Wno-c++98-compat \ -Wno-c++-compat \ + -Wno-c++98-compat \ -Wno-conditional-uninitialized \ - -Wno-conversion \ - -Wnull-conversion \ - -Wstring-conversion \ - -Wliteral-conversion \ - -Wnon-literal-null-conversion \ - -Wint-conversion \ - -Wenum-conversion \ - -Wfloat-conversion \ - -Wconstant-conversion \ - -Wpointer-bool-conversion \ -Wno-covered-switch-default \ -Wno-disabled-macro-expansion \ -Wno-documentation-unknown-command \ + -Wno-extra-semi-stmt \ -Wno-format-non-iso \ -Wno-format-nonliteral \ - -Wno-reserved-id-macro \ -Wno-language-extension-token \ -Wno-missing-variable-declarations \ -Wno-packed \ -Wno-padded \ -Wno-partial-availability \ + -Wno-reserved-id-macro \ -Wno-shift-sign-overflow \ -Wno-switch-enum \ -Wno-undef \ @@ -169,6 +166,8 @@ WARNFLAGS_STD := $(WARNFLAGS_STD) \ CWARNFLAGS_STD = \ $(WARNFLAGS_STD) + + # Can be overridden in Makefile.template or Makefile.$arch export CWARNFLAGS ?= $(CWARNFLAGS_STD) @@ -176,13 +175,16 @@ define add_perfile_cflags $(1)_CWARNFLAGS_ADD += $2 endef +define rm_perfile_cflags +$(1)_CFLAGS_RM += $2 +endef + CXXWARNFLAGS_STD = \ $(WARNFLAGS_STD) \ -Wno-c++98-compat-pedantic \ -Wno-exit-time-destructors \ -Wno-global-constructors \ - -Wno-old-style-cast \ - -Wno-zero-as-null-pointer-constant + -Wno-old-style-cast # Can be overridden in Makefile.template or Makefile.$arch export CXXWARNFLAGS ?= $(CXXWARNFLAGS_STD) @@ -203,6 +205,15 @@ ifndef ARCH_STRING_FOR_CURRENT_MACHINE_CONFIG export ARCH_STRING_FOR_CURRENT_MACHINE_CONFIG := $(shell $(EMBEDDED_DEVICE_MAP) -db $(EDM_DBPATH) -query SELECT DISTINCT KernelMachOArchitecture FROM Targets WHERE KernelPlatform IS \"$(CURRENT_MACHINE_CONFIG_LC)\" LIMIT 1 || echo UNKNOWN ) endif +# +# This can have false negatives, and is used to avoid calling CTF when we'll build a static KC +# +ifndef WILL_BUILD_STATIC_KC +export WILL_BUILD_STATIC_KC := $(shell $(EMBEDDED_DEVICE_MAP) -db $(EDM_DBPATH) \ + -query 'SELECT COUNT(*) != 0 FROM Targets WHERE KernelPlatform IS "$(CURRENT_MACHINE_CONFIG_LC)" \ + AND (KernelMachOArchitecture LIKE "arm64e" OR ProductType LIKE "iphone10,%")') +endif + BUILD_STATIC_LINK := 1 endif @@ -249,8 +260,15 @@ BUILD_DSYM := 1 # probes from the kernel. # CFLAGS_GEN = $(DEBUG_CFLAGS) -nostdinc \ - -fno-builtin -fno-common \ - -fsigned-bitfields $(OTHER_CFLAGS) + -ferror-limit=10000 \ + -fno-builtin \ + -fno-common \ + -ftrivial-auto-var-init=zero \ + -enable-trivial-auto-var-init-zero-knowing-it-will-be-removed-from-clang \ + -fsigned-bitfields \ + -fmerge-all-constants \ + -fno-c++-static-destructors \ + $(OTHER_CFLAGS) CFLAGS_RELEASE = CFLAGS_DEVELOPMENT = @@ -264,10 +282,10 @@ CFLAGS_X86_64 = -Dx86_64 -DX86_64 -D__X86_64__ -DLP64 \ CFLAGS_X86_64H = $(CFLAGS_X86_64) CFLAGS_ARM = -Darm -DARM -D__ARM__ -DPAGE_SIZE_FIXED \ - -fno-strict-aliasing -D__API__=v4 + -momit-leaf-frame-pointer -fno-strict-aliasing -D__API__=v4 CFLAGS_ARM64 = -Darm64 -DARM64 -D__ARM64__ -DLP64 -DPAGE_SIZE_FIXED \ - -fno-strict-aliasing -D__API__=v4 -mkernel + -momit-leaf-frame-pointer -fno-strict-aliasing -D__API__=v4 -mkernel CFLAGS_RELEASEX86_64 = -O2 CFLAGS_DEVELOPMENTX86_64 = -O2 @@ -302,13 +320,15 @@ CFLAGS_PROFILEARM64 = -O2 SAN=0 ifeq ($(CURRENT_KERNEL_CONFIG),KASAN) +# KASan kernel config implicitly enables the KASan instrumentation. +# Instrumentation for other sanitizers is enabled explicitly at build time. KASAN = 1 endif ifeq ($(KASAN),1) SAN=1 BUILD_LTO = 0 -KASAN_SHIFT_ARM64=0xdffffff800000000 +KASAN_SHIFT_ARM64=0xe000000000000000 # # To calculate the kasan shift, subtract the lowest KVA to sanitize, shifted right by 3 bits, # from the base address of the kasan shadow area, (e.g. solve the following equation: @@ -329,26 +349,38 @@ endif ifeq ($(UBSAN),1) SAN=1 -UBSAN_CHECKS = signed-integer-overflow shift pointer-overflow # non-fatal (calls runtime, can return) +UBSAN_CHECKS = signed-integer-overflow shift pointer-overflow bounds object-size # non-fatal (calls runtime, can return) +# UBSAN_CHECKS = undefined nullability unsigned-integer-overflow # everything UBSAN_CHECKS_FATAL = # fatal (calls runtime, must not return) UBSAN_CHECKS_TRAP = vla-bound builtin # emit a trap instruction (no runtime support) -UBSAN_DISABLED = bounds object-size +UBSAN_DISABLED = -ifneq ($(KASAN),1) -UBSAN_CHECKS += alignment # UBSan alignment + KASan code size is too large -UBSAN_CHECKS_FATAL += unreachable # UBSan unreachable doesn't play nice with ASan (40723397) +UBSAN_DISABLED += vptr function # requires unsupported C++ runtime +ifeq ($(KASAN),1) +# UBSan alignment + KASan code size is too large +# UBSan unreachable doesn't play nice with ASan (40723397) +UBSAN_DISABLED += alignment unreachable endif CFLAGS_GEN += -DUBSAN=1 CFLAGS_GEN += $(foreach x,$(UBSAN_CHECKS) $(UBSAN_CHECKS_FATAL) $(UBSAN_CHECKS_TRAP),-fsanitize=$(x)) CFLAGS_GEN += $(foreach x,$(UBSAN_CHECKS_FATAL),-fno-sanitize-recover=$(x)) CFLAGS_GEN += $(foreach x,$(UBSAN_CHECKS_TRAP),-fsanitize-trap=$(x)) +CFLAGS_GEN += $(foreach x,$(UBSAN_DISABLED),-fno-sanitize=$(x)) +endif + +ifeq ($(KSANCOV),1) +# Enable SanitizerCoverage instrumentation in xnu +SAN = 1 +KSANCOV_CFLAGS := -fsanitize-coverage=trace-pc-guard +CFLAGS_GEN += $(KSANCOV_CFLAGS) -DKSANCOV=1 endif ifeq ($(SAN),1) CFLAGS_GEN += -fsanitize-blacklist=$(OBJROOT)/san/kasan-blacklist-$(CURRENT_ARCH_CONFIG_LC) endif + CFLAGS = $(CFLAGS_GEN) \ $($(addsuffix $(CURRENT_MACHINE_CONFIG),MACHINE_FLAGS_$(CURRENT_ARCH_CONFIG)_)) \ $($(addsuffix $(CURRENT_ARCH_CONFIG),ARCH_FLAGS_)) \ @@ -365,7 +397,7 @@ CFLAGS = $(CFLAGS_GEN) \ OTHER_CXXFLAGS = -CXXFLAGS_GEN = -std=gnu++1z -fapple-kext $(OTHER_CXXFLAGS) +CXXFLAGS_GEN = -std=gnu++1z -fsized-deallocation -fapple-kext $(OTHER_CXXFLAGS) CXXFLAGS = $(CXXFLAGS_GEN) \ $($(addsuffix $(CURRENT_ARCH_CONFIG),CXXFLAGS_)) \ @@ -469,7 +501,7 @@ LDFLAGS_KERNEL_RELEASEX86_64 = \ -Wl,-sectalign,__HIB,__llvm_prf_names,0x1000 \ -Wl,-sectalign,__HIB,__llvm_prf_data,0x1000 \ -Wl,-sectalign,__HIB,__textcoal_nt,0x1000 \ - -Wl,-rename_section,__DATA,__const,__CONST,__constdata \ + -Wl,-rename_section,__DATA,__const,__DATA_CONST,__const \ -Wl,-no_zero_fill_sections \ $(LDFLAGS_NOSTRIP_FLAG) @@ -478,13 +510,10 @@ LDFLAGS_KERNEL_RELEASEX86_64 += \ -Wl,-sectalign,__HIB,__cstring,0x1000 endif -ifeq ($(KASAN),1) +ifeq ($(KSANCOV),1) LDFLAGS_KERNEL_RELEASEX86_64 += \ - -Wl,-sectalign,__HIB,__asan_globals,0x1000 \ - -Wl,-sectalign,__HIB,__asan_liveness,0x1000 \ - -Wl,-sectalign,__HIB,__mod_term_func,0x1000 \ - -Wl,-rename_section,__HIB,__mod_init_func,__NULL,__mod_init_func \ - -Wl,-rename_section,__HIB,__eh_frame,__NULL,__eh_frame + -Wl,-sectalign,__HIB,__sancov_guards,0x1000 \ + -Wl,-sectalign,__HIB,__sancov_pcs,0x1000 endif # Define KERNEL_BASE_OFFSET so known at compile time: @@ -493,13 +522,18 @@ CFLAGS_X86_64H += -DKERNEL_BASE_OFFSET=$(KERNEL_BASE_OFFSET) LDFLAGS_KERNEL_DEBUGX86_64 = $(LDFLAGS_KERNEL_RELEASEX86_64) LDFLAGS_KERNEL_DEVELOPMENTX86_64 = $(LDFLAGS_KERNEL_RELEASEX86_64) -LDFLAGS_KERNEL_KASANX86_64 = $(LDFLAGS_KERNEL_RELEASEX86_64) +LDFLAGS_KERNEL_KASANX86_64 = $(LDFLAGS_KERNEL_DEVELOPMENTX86_64) \ + -Wl,-sectalign,__HIB,__asan_globals,0x1000 \ + -Wl,-sectalign,__HIB,__asan_liveness,0x1000 \ + -Wl,-sectalign,__HIB,__mod_term_func,0x1000 \ + -Wl,-rename_section,__HIB,__mod_init_func,__NULL,__mod_init_func \ + -Wl,-rename_section,__HIB,__eh_frame,__NULL,__eh_frame LDFLAGS_KERNEL_PROFILEX86_64 = $(LDFLAGS_KERNEL_RELEASEX86_64) LDFLAGS_KERNEL_RELEASEX86_64H = $(LDFLAGS_KERNEL_RELEASEX86_64) LDFLAGS_KERNEL_DEBUGX86_64H = $(LDFLAGS_KERNEL_RELEASEX86_64H) LDFLAGS_KERNEL_DEVELOPMENTX86_64H = $(LDFLAGS_KERNEL_RELEASEX86_64H) -LDFLAGS_KERNEL_KASANX86_64H = $(LDFLAGS_KERNEL_RELEASEX86_64H) +LDFLAGS_KERNEL_KASANX86_64H = $(LDFLAGS_KERNEL_KASANX86_64) LDFLAGS_KERNEL_PROFILEX86_64H = $(LDFLAGS_KERNEL_RELEASEX86_64H) # We preload ___udivmoddi4 in order to work around an issue with building @@ -512,7 +546,8 @@ LDFLAGS_KERNEL_GENARM = \ -Wl,-u,___udivmoddi4 LDFLAGS_KERNEL_RELEASEARM = \ - $(LDFLAGS_KERNEL_GENARM) + $(LDFLAGS_KERNEL_GENARM) \ + $(LDFLAGS_KERNEL_STRIP_LTO) LDFLAGS_KERNEL_EXPORTS_RELEASEARM = \ -Wl,-exported_symbols_list,$(TARGET)/all-kpi.exp @@ -580,7 +615,8 @@ LDFLAGS_KERNEL_SEGARM64 ?= \ LDFLAGS_KERNEL_RELEASEARM64 = \ $(LDFLAGS_KERNEL_GENARM64) \ - $(LDFLAGS_KERNEL_SEGARM64) + $(LDFLAGS_KERNEL_SEGARM64) \ + $(LDFLAGS_KERNEL_STRIP_LTO) LDFLAGS_KERNEL_EXPORTS_RELEASEARM64 = \ -Wl,-exported_symbols_list,$(TARGET)/all-kpi.exp @@ -608,7 +644,9 @@ LDFLAGS_KERNEL = $(LDFLAGS_KERNEL_GEN) \ LDFLAGS_KERNEL_EXPORTS = \ - $($(addsuffix $(CURRENT_ARCH_CONFIG), $(addsuffix $(CURRENT_KERNEL_CONFIG),LDFLAGS_KERNEL_EXPORTS_))) + $($(addsuffix $(CURRENT_ARCH_CONFIG), $(addsuffix $(CURRENT_KERNEL_CONFIG),LDFLAGS_KERNEL_EXPORTS_))) \ + -Wl,-alias_list,$(TARGET)/all-alias.exp + # # Default runtime libraries to be linked with the kernel @@ -619,21 +657,20 @@ LD_KERNEL_ARCHIVES = $(LDFLAGS_KERNEL_SDK) -lfirehose_kernel # # DTrace support # +ifndef DO_CTFMERGE +DO_CTFMERGE := 1 ifeq ($(CURRENT_KERNEL_CONFIG),RELEASE) ifneq ($(filter ARM%,$(CURRENT_ARCH_CONFIG)),) -DO_CTFCONVERT = 0 -DO_CTFMERGE = 0 -DO_CTFMACHO = 0 -else -DO_CTFCONVERT = $(SUPPORTS_CTFCONVERT) -DO_CTFMERGE = 1 -DO_CTFMACHO = $(NEEDS_CTF_MACHOS) +DO_CTFMERGE := 0 endif -else -DO_CTFCONVERT = $(SUPPORTS_CTFCONVERT) -DO_CTFMERGE = 1 -DO_CTFMACHO = $(NEEDS_CTF_MACHOS) endif +ifneq ($(CURRENT_KERNEL_CONFIG),KASAN) +ifeq ($(WILL_BUILD_STATIC_KC),1) +DO_CTFMERGE := 0 +endif +endif +endif # DO_CTFMERGE + # # Default INCFLAGS @@ -693,28 +730,25 @@ else USE_LTO = $(LTO_ENABLED_$(CURRENT_KERNEL_CONFIG)) endif -SUPPORTS_CTFCONVERT = 0 ifeq ($(USE_LTO),1) CFLAGS_GEN += -flto CXXFLAGS_GEN += -flto -LDFLAGS_KERNEL_GEN += -Wl,-mllvm,-inline-threshold=100 -Wl,-object_path_lto,$(TARGET)/lto.o +LDFLAGS_KERNEL_LTO = -Wl,-mllvm,-inline-threshold=100 +LDFLAGS_KERNEL_GEN += $(LDFLAGS_KERNEL_LTO) -Wl,-object_path_lto,$(TARGET)/lto.o LDFLAGS_NOSTRIP_FLAG = -rdynamic +LDFLAGS_KERNEL_STRIP_LTO = -Wl,-dead_strip,-no_dead_strip_inits_and_terms + CFLAGS_NOLTO_FLAG = -fno-lto -NEEDS_CTF_MACHOS = 1 else +LDFLAGS_KERNEL_LTO = LDFLAGS_NOSTRIP_FLAG = +LDFLAGS_KERNEL_STRIP_LTO = CFLAGS_NOLTO_FLAG = -ifneq ($(CTFCONVERT),) -SUPPORTS_CTFCONVERT = 1 -endif -NEEDS_CTF_MACHOS = 0 endif ifeq ($(BUILD_JSON_COMPILATION_DATABASE),1) BUILD_DSYM := 0 -DO_CTFCONVERT := 0 -DO_CTFMERGE := 0 -DO_CTFMACHO := 0 +DO_CTFMERGE := 0 KCC = $(JSONCOMPILATIONDB) $(OBJPATH)/compile_commands.json $(PWD) $< $(CC) KC++ = $(JSONCOMPILATIONDB) $(OBJPATH)/compile_commands.json $(PWD) $< $(CXX) S_KCC = $(JSONCOMPILATIONDB) $(OBJPATH)/compile_commands.json $(PWD) $< $(CC) @@ -740,7 +774,15 @@ EXEC_INSTALL_FLAGS = -c -S -m 0755 # # Header file destinations # -FRAMEDIR = /System/Library/Frameworks + +ifeq ($(DRIVERKIT),1) + SDKHEADERSROOT=$(DRIVERKITRUNTIMEROOT) + # only whitelisted headers install outside of the DriverKit Runtime hierarchy + DRIVERKITSDKHEADERSROOT=$(DRIVERKITROOT) + DRIVERKITFRAMEDIR = $(DRIVERKITROOT)/System/Library/Frameworks +endif + +FRAMEDIR = $(SDKHEADERSROOT)/System/Library/Frameworks SINCVERS = B SINCFRAME = $(FRAMEDIR)/System.framework @@ -749,11 +791,17 @@ SPINCDIR = $(SINCFRAME)/Versions/$(SINCVERS)/PrivateHeaders SRESDIR = $(SINCFRAME)/Versions/$(SINCVERS)/Resources ifndef INCDIR - INCDIR = /usr/include + INCDIR = $(SDKHEADERSROOT)/usr/include +endif +ifndef DRIVERKITINCDIR + DRIVERKITINCDIR = $(DRIVERKITSDKHEADERSROOT)/usr/include endif ifndef LCLDIR LCLDIR = $(SPINCDIR) endif +ifndef DRIVERKITLCLDIR + DRIVERKITLCLDIR = $(DRIVERKITSDKHEADERSROOT)/usr/local/include +endif KINCVERS = A KINCFRAME = $(FRAMEDIR)/Kernel.framework @@ -761,17 +809,27 @@ KINCDIR = $(KINCFRAME)/Versions/$(KINCVERS)/Headers KPINCDIR = $(KINCFRAME)/Versions/$(KINCVERS)/PrivateHeaders KRESDIR = $(KINCFRAME)/Versions/$(KINCVERS)/Resources +DKIT_INCVERS = A +DKIT_INCFRAME = DriverKit.framework +DKIT_INCDIR = $(DKIT_INCFRAME)/Versions/$(DKIT_INCVERS)/Headers +DKIT_PINCDIR = $(DKIT_INCFRAME)/Versions/$(DKIT_INCVERS)/PrivateHeaders +# DriverKit SDK frameworks use shallow bundle structure +DRIVERKIT_DKIT_INCDIR = $(DKIT_INCFRAME)/Headers +DRIVERKIT_DKIT_PINCDIR = $(DKIT_INCFRAME)/PrivateHeaders + XNU_PRIVATE_UNIFDEF = -UMACH_KERNEL_PRIVATE -UBSD_KERNEL_PRIVATE -UIOKIT_KERNEL_PRIVATE -ULIBKERN_KERNEL_PRIVATE -ULIBSA_KERNEL_PRIVATE -UPEXPERT_KERNEL_PRIVATE -UXNU_KERNEL_PRIVATE PLATFORM_UNIFDEF = $(foreach x,$(SUPPORTED_PLATFORMS),$(if $(filter $(PLATFORM),$(x)),-DPLATFORM_$(x) $(foreach token,$(PLATFORM_UNIFDEF_BLACKLIST_TOKENS_$(x)),-U$(token)),-UPLATFORM_$(x))) -SPINCFRAME_UNIFDEF = $(PLATFORM_UNIFDEF) $(XNU_PRIVATE_UNIFDEF) $(SEED_DEFINES) -UKERNEL_PRIVATE -UKERNEL -DPRIVATE -U_OPEN_SOURCE_ -U__OPEN_SOURCE__ -SINCFRAME_UNIFDEF = $(PLATFORM_UNIFDEF) $(XNU_PRIVATE_UNIFDEF) $(SEED_DEFINES) -UKERNEL_PRIVATE -UKERNEL -UPRIVATE -D_OPEN_SOURCE_ -D__OPEN_SOURCE__ -KPINCFRAME_UNIFDEF = $(PLATFORM_UNIFDEF) $(XNU_PRIVATE_UNIFDEF) $(SEED_DEFINES) -DKERNEL_PRIVATE -DPRIVATE -DKERNEL -U_OPEN_SOURCE_ -U__OPEN_SOURCE__ -KINCFRAME_UNIFDEF = $(PLATFORM_UNIFDEF) $(XNU_PRIVATE_UNIFDEF) $(SEED_DEFINES) -UKERNEL_PRIVATE -UPRIVATE -DKERNEL -D_OPEN_SOURCE_ -D__OPEN_SOURCE__ -DATA_UNIFDEF = $(PLATFORM_UNIFDEF) $(XNU_PRIVATE_UNIFDEF) $(SEED_DEFINES) -D_OPEN_SOURCE_ -D__OPEN_SOURCE__ +SPINCFRAME_UNIFDEF = $(PLATFORM_UNIFDEF) $(XNU_PRIVATE_UNIFDEF) $(SEED_DEFINES) -UKERNEL_PRIVATE -UKERNEL -DPRIVATE -UDRIVERKIT -U_OPEN_SOURCE_ -U__OPEN_SOURCE__ +SINCFRAME_UNIFDEF = $(PLATFORM_UNIFDEF) $(XNU_PRIVATE_UNIFDEF) $(SEED_DEFINES) -UKERNEL_PRIVATE -UKERNEL -UPRIVATE -UDRIVERKIT -D_OPEN_SOURCE_ -D__OPEN_SOURCE__ +DKPINCFRAME_UNIFDEF = $(PLATFORM_UNIFDEF) $(XNU_PRIVATE_UNIFDEF) $(SEED_DEFINES) -UKERNEL_PRIVATE -UKERNEL -DPRIVATE -DDRIVERKIT -U_OPEN_SOURCE_ -U__OPEN_SOURCE__ +DKINCFRAME_UNIFDEF = $(PLATFORM_UNIFDEF) $(XNU_PRIVATE_UNIFDEF) $(SEED_DEFINES) -UKERNEL_PRIVATE -UKERNEL -UPRIVATE -DDRIVERKIT -D_OPEN_SOURCE_ -D__OPEN_SOURCE__ +KPINCFRAME_UNIFDEF = $(PLATFORM_UNIFDEF) $(XNU_PRIVATE_UNIFDEF) $(SEED_DEFINES) -DKERNEL_PRIVATE -DKERNEL -DPRIVATE -UDRIVERKIT -U_OPEN_SOURCE_ -U__OPEN_SOURCE__ +KINCFRAME_UNIFDEF = $(PLATFORM_UNIFDEF) $(XNU_PRIVATE_UNIFDEF) $(SEED_DEFINES) -UKERNEL_PRIVATE -DKERNEL -UPRIVATE -UDRIVERKIT -D_OPEN_SOURCE_ -D__OPEN_SOURCE__ +DATA_UNIFDEF = $(PLATFORM_UNIFDEF) $(XNU_PRIVATE_UNIFDEF) $(SEED_DEFINES) -D_OPEN_SOURCE_ -D__OPEN_SOURCE__ # # Compononent Header file destinations diff --git a/makedefs/MakeInc.kernel b/makedefs/MakeInc.kernel index 55de6d307..363709142 100644 --- a/makedefs/MakeInc.kernel +++ b/makedefs/MakeInc.kernel @@ -31,11 +31,22 @@ do_build_setup:: $(_v)$(CAT) > $(OBJPATH)/compile_commands.json < /dev/null endif +ifeq ($(BUILD_STATIC_LINK),1) +ifeq ($(USE_LTO),1) +# +# To run LTO in the xnu project while linking the final result in KCB, without losing debugging info, +# run ld -r on only the LTO bitcode object files to produce one mach-o for KCB to use, which is added +# to the static link archive, along with the non-LTO objects (not linked, since ld -r on mach-o objects +# does not preserve DWARF.) +PRE_LTO=1 +endif +endif + # # Rules for the highly parallel "build" phase, where each build configuration # writes into their own $(TARGET) independent of other build configs # -# There are 3 primary build outputs: +# There are 4 primary build outputs: # 1) $(KERNEL_FILE_NAME).unstripped (raw linked kernel, unstripped) # 2) $(KERNEL_FILE_NAME) (stripped kernel, with optional CTF data) # 3) $(KERNEL_FILE_NAME).dSYM (dSYM) @@ -44,7 +55,7 @@ endif ifeq ($(BUILD_STATIC_LINK),1) KERNEL_STATIC_LINK_TARGETS = \ - $(TARGET)/$(KERNEL_FILE_NAME).link/$(KERNEL_FILE_NAME).a + $(TARGET)/$(KERNEL_FILE_NAME).link/$(KERNEL_FILE_NAME).a KERNEL_STATIC_LINK_DST = \ $(DSTROOT)/$(INSTALL_KERNEL_DIR)/$(KERNEL_FILE_NAME).link/$(KERNEL_FILE_NAME).a \ @@ -78,45 +89,82 @@ do_build_kernel_dSYM: $(TARGET)/$(KERNEL_FILE_NAME).dSYM .CFLAGS: ALWAYS $(_v)$(REPLACECONTENTS) $@ $(KCC) $(CFLAGS) $(INCFLAGS) -$(TARGET)/$(KERNEL_FILE_NAME): $(TARGET)/$(KERNEL_FILE_NAME).unstripped - @echo "$(ColorH)STRIP$(Color0) $(ColorLF)$(@F)$(Color0)" +$(TARGET)/$(KERNEL_FILE_NAME): $(TARGET)/$(KERNEL_FILE_NAME).unstripped $(TARGET)/$(KERNEL_FILE_NAME).dSYM + $(call makelog,$(ColorH)STRIP$(Color0) $(ColorLF)$(@F)$(Color0)) $(_v)$(STRIP) $(STRIP_FLAGS) $< -o $@ $(_v)$(RM) $@.ctfdata ifeq ($(DO_CTFMERGE),1) - @echo "$(ColorH)CTFMERGE$(Color0) $(ColorLF)$(@F)$(Color0)" - $(_v)$(FIND) $(TARGET)/ -name \*.ctf -size +0 | \ - $(XARGS) $(CTFMERGE) -l xnu -o $@ -Z $@.ctfdata || true -endif + $(call makelog,$(ColorH)CTFCONVERT$(Color0) $(ColorLF)$(@F)$(Color0)) + $(_v)$(CTFCONVERT) -c -l xnu -u /xnu -o $@.ctf $(TARGET)/$(KERNEL_FILE_NAME).dSYM/Contents/Resources/DWARF/$(KERNEL_FILE_NAME) + $(call makelog,$(ColorH)CTFMERGE$(Color0) $(ColorLF)$(@F)$(Color0)) + $(_v)$(CTFMERGE) -l xnu -o $@ -Z $@.ctfdata $@.ctf $(_v)if [ -s $@.ctfdata ]; then \ - echo "$(ColorH)CTFINSERT$(Color0) $(ColorLF)$(@F)$(Color0)"; \ + $(LOG) "$(ColorH)CTFINSERT$(Color0) $(ColorLF)$(@F)$(Color0)"; \ $(CTFINSERT) $@ $(ARCH_FLAGS_$(CURRENT_ARCH_CONFIG)) \ $@.ctfdata -o $@; \ fi; +endif $(_v)$(LN) $(call function_convert_build_config_to_objdir,$(CURRENT_BUILD_CONFIG))/$(KERNEL_FILE_NAME) $(OBJROOT)/$(KERNEL_FILE_NAME) $(TARGET)/$(KERNEL_FILE_NAME).dSYM: $(TARGET)/$(KERNEL_FILE_NAME).unstripped - $(_v)echo "$(ColorH)DSYMUTIL$(Color0) $(ColorLF)$(@F)$(Color0)" + $(call makelog,$(ColorH)DSYMUTIL$(Color0) $(ColorLF)$(@F)$(Color0)) $(_v)$(DSYMUTIL) $(DSYMUTIL_FLAGS) $< -o $@ $(_v)$(MV) $@/$(DSYMDWARFDIR)/$(KERNEL_FILE_NAME).unstripped $@/$(DSYMDWARFDIR)/$(KERNEL_FILE_NAME) $(_v)$(TOUCH) $@ -$(TARGET)/$(KERNEL_FILE_NAME).unstripped: $(addprefix $(TARGET)/,$(foreach component,$(COMPONENT_LIST),$(component)/$(CURRENT_KERNEL_CONFIG)/$(component).filelist)) lastkerneldataconst.o lastkernelconstructor.o $(SRCROOT)/config/version.c $(SRCROOT)/config/MasterVersion .LDFLAGS $(filter %/MakeInc.kernel,$(MAKEFILE_LIST)) +$(TARGET)/$(KERNEL_FILE_NAME).unstripped: $(addprefix $(TARGET)/,$(foreach component,$(COMPONENT_LIST),$(component)/$(CURRENT_KERNEL_CONFIG)/$(component).filelist)) lastkerneldataconst.o lastkernelconstructor.o nonlto.o $(SRCROOT)/config/version.c $(SRCROOT)/config/MasterVersion .LDFLAGS $(filter %/MakeInc.kernel,$(MAKEFILE_LIST)) $(_v)${MAKE} -f $(firstword $(MAKEFILE_LIST)) version.o - @echo "$(ColorL)LD$(Color0) $(ColorLF)$(@F)$(Color0)" +ifeq ($(PRE_LTO),1) + $(call makelog,$(ColorL)LTO$(Color0) $(ColorLF)$(@F)$(Color0)) + $(_v)rm -f ltolink.filelist + $(_v)rm -f nonltolink.filelist + $(_v)files="$$($(CAT) $(filter %.filelist,$+)) version.o $(filter %.o,$+)"; \ + for ofile in $$files; \ + do \ + hdr=$$(od -An -N 4 -t x4 $$ofile); \ + if [ $$hdr == "0b17c0de" ]; \ + then \ + lto="$$lto$$ofile"$$'\n'; \ + else \ + nonlto="$$nonlto$$ofile"$$'\n'; \ + fi; \ + done; \ + printf "$$lto" >ltolink.filelist; \ + printf "$$nonlto" >nonltolink.filelist + $(_v)if [ -s ltolink.filelist ]; \ + then \ + $(LD) $($(addsuffix $(CURRENT_ARCH_CONFIG),ARCH_FLAGS_)) -r nonlto.o -filelist ltolink.filelist $(LDFLAGS_KERNEL_LTO) -Wl,-object_path_lto,$(TARGET)/justlto.o -o $(TARGET)/justlto.tmp.o && \ + $(LD) $(LDFLAGS_KERNEL) $(LDFLAGS_KERNEL_EXPORTS) -filelist nonltolink.filelist $(TARGET)/justlto.o $(LDFLAGS_KERNEL_STRIP_LTO) -o $@ $(LD_KERNEL_LIBS) $(LD_KERNEL_ARCHIVES); \ + else \ + $(LD) $(LDFLAGS_KERNEL) $(LDFLAGS_KERNEL_EXPORTS) -filelist nonltolink.filelist -o $@ $(LD_KERNEL_LIBS) $(LD_KERNEL_ARCHIVES); \ + fi +else + $(call makelog,$(ColorL)LD$(Color0) $(ColorLF)$(@F)$(Color0)) $(_v)$(CAT) $(filter %.filelist,$+) < /dev/null > link.filelist $(_v)$(LD) $(LDFLAGS_KERNEL) $(LDFLAGS_KERNEL_EXPORTS) -filelist link.filelist version.o $(filter %.o,$+) -o $@ $(LD_KERNEL_LIBS) $(LD_KERNEL_ARCHIVES) +endif + -$(TARGET)/$(KERNEL_FILE_NAME).link/$(KERNEL_FILE_NAME).a: $(addprefix $(TARGET)/,$(foreach component,$(COMPONENT_LIST),$(component)/$(CURRENT_KERNEL_CONFIG)/$(component).filelist)) lastkerneldataconst.o lastkernelconstructor.o $(TARGET)/$(KERNEL_FILE_NAME).unstripped .LDFLAGS $(filter %/MakeInc.kernel,$(MAKEFILE_LIST)) - @echo "$(ColorL)LIBTOOL$(Color0) $(ColorLF)$(@F)$(Color0)" +$(TARGET)/$(KERNEL_FILE_NAME).link/$(KERNEL_FILE_NAME).a: $(TARGET)/$(KERNEL_FILE_NAME).unstripped .LDFLAGS $(filter %/MakeInc.kernel,$(MAKEFILE_LIST)) + $(call makelog,$(ColorL)LIBTOOL$(Color0) $(ColorLF)$(@F)$(Color0)) $(_v)$(MKDIR) $(dir $@) - $(_v)$(CAT) $(filter %.filelist,$+) < /dev/null > libtool.filelist - $(_v)$(LIBTOOL) -ca -filelist libtool.filelist $(filter %.o,$+) version.o -o $@ +ifeq ($(PRE_LTO),1) + $(_v)$(LIBTOOL) -ca $(TARGET)/justlto.o -filelist nonltolink.filelist -o $@ +else + $(_v)$(LIBTOOL) -ca -filelist link.filelist version.o lastkerneldataconst.o lastkernelconstructor.o -o $@ +endif $(_v)cp $(TARGET)/all-kpi.exp $(TARGET)/$(KERNEL_FILE_NAME).link/$(KERNEL_FILE_NAME).exp $(_v)cp $(TARGET)/all-alias.exp $(TARGET)/$(KERNEL_FILE_NAME).link/$(KERNEL_FILE_NAME).alias.exp $(_v)echo "$(LD_KERNEL_ARCHIVES)" >$(TARGET)/$(KERNEL_FILE_NAME).link/$(KERNEL_FILE_NAME).linkarchives $(_v)echo "$(LDFLAGS_KERNEL) $(LD_KERNEL_LIBS)" >$(TARGET)/$(KERNEL_FILE_NAME).link/$(KERNEL_FILE_NAME).linkarguments $(_v)$(LN) $(call function_convert_build_config_to_objdir,$(CURRENT_BUILD_CONFIG))/$(KERNEL_FILE_NAME).link $(OBJROOT)/$(KERNEL_FILE_NAME).link +nonlto.o: .CFLAGS $(filter %/MakeInc.kernel,$(MAKEFILE_LIST)) +nonlto.o: $(SRCROOT)/libsa/nonlto.c + ${C_RULE_0} + ${C_RULE_1A}$< $(CFLAGS_NOLTO_FLAG) + ${C_RULE_2} + -include version.d version.o: .CFLAGS $(filter %/MakeInc.kernel,$(MAKEFILE_LIST)) version.o: $(OBJPATH)/version.c @@ -155,10 +203,10 @@ lastkernelconstructor.o: $(SRCROOT)/libsa/lastkernelconstructor.c ${C_RULE_3} ${C_RULE_4} $(_v)for last_file in ${LAST_FILES}; \ - do \ + do \ $(SEG_HACK) -s __DATA -n __LAST -o $${last_file}__ $${last_file} || exit 1; \ - mv $${last_file}__ $${last_file} || exit 1; \ - done + mv $${last_file}__ $${last_file} || exit 1; \ + done # # Install rules. Each build config is classified as "primary" (the first @@ -209,12 +257,11 @@ do_install_xnu_debug_files: $(DSTROOT)/$(DEVELOPER_EXTRAS_DIR)/README.DEBUG-kern $(DSTROOT)/$(INSTALL_KERNEL_DIR)/$(KERNEL_FILE_NAME): $(TARGET)/$(KERNEL_FILE_NAME) ALWAYS $(_v)$(MKDIR) $(dir $@) + $(call makelog,$(ColorH)INSTALL$(Color0) $(ColorF)$(@F)$(Color0) "($(ColorLF)$(CURRENT_ARCH_CONFIG_LC)$(Color0) $(ColorLF)$(CURRENT_MACHINE_CONFIG_LC)$(Color0))") $(_v)if [ $(OBJROOT)/.mach_kernel.timestamp -nt $@ ]; then \ - echo "$(ColorH)INSTALL$(Color0) $(ColorF)$(@F)$(Color0) \"($(ColorLF)$(CURRENT_ARCH_CONFIG_LC)$(Color0) $(ColorLF)$(CURRENT_MACHINE_CONFIG_LC)$(Color0))\""; \ $(INSTALL) $(EXEC_INSTALL_FLAGS) $< $@; \ cmdstatus=$$?; \ else \ - echo "$(ColorH)INSTALL$(Color0) $(ColorF)$(@F)$(Color0) \"($(ColorLF)$(CURRENT_ARCH_CONFIG_LC)$(Color0) $(ColorLF)$(CURRENT_MACHINE_CONFIG_LC)$(Color0))\""; \ $(LIPO) -create $@ $< -output $@; \ cmdstatus=$$?; \ fi; \ @@ -224,27 +271,27 @@ ifeq ($(BUILD_STATIC_LINK),1) $(DSTROOT)/$(INSTALL_KERNEL_DIR)/$(KERNEL_FILE_NAME).link/$(KERNEL_FILE_NAME).a: $(TARGET)/$(KERNEL_FILE_NAME).link/$(KERNEL_FILE_NAME).a ALWAYS $(_v)$(MKDIR) $(dir $@) - @echo "$(ColorH)INSTALL$(Color0) $(ColorF)$(@F)$(Color0)" + $(call makelog,$(ColorH)INSTALL$(Color0) $(ColorF)$(@F)$(Color0)) $(_v)$(INSTALL) $(INSTALL_FLAGS) $< $@ $(DSTROOT)/$(INSTALL_KERNEL_DIR)/$(KERNEL_FILE_NAME).link/$(KERNEL_FILE_NAME).linkarguments: $(TARGET)/$(KERNEL_FILE_NAME).link/$(KERNEL_FILE_NAME).linkarguments ALWAYS $(_v)$(MKDIR) $(dir $@) - @echo "$(ColorH)INSTALL$(Color0) $(ColorF)$(@F)$(Color0)" + $(call makelog,$(ColorH)INSTALL$(Color0) $(ColorF)$(@F)$(Color0)) $(_v)$(INSTALL) $(INSTALL_FLAGS) $< $@ $(DSTROOT)/$(INSTALL_KERNEL_DIR)/$(KERNEL_FILE_NAME).link/$(KERNEL_FILE_NAME).linkarchives: $(TARGET)/$(KERNEL_FILE_NAME).link/$(KERNEL_FILE_NAME).linkarchives ALWAYS $(_v)$(MKDIR) $(dir $@) - @echo "$(ColorH)INSTALL$(Color0) $(ColorF)$(@F)$(Color0)" + $(call makelog,$(ColorH)INSTALL$(Color0) $(ColorF)$(@F)$(Color0)) $(_v)$(INSTALL) $(INSTALL_FLAGS) $< $@ $(DSTROOT)/$(INSTALL_KERNEL_DIR)/$(KERNEL_FILE_NAME).link/$(KERNEL_FILE_NAME).exp: $(TARGET)/$(KERNEL_FILE_NAME).link/$(KERNEL_FILE_NAME).exp ALWAYS $(_v)$(MKDIR) $(dir $@) - @echo "$(ColorH)INSTALL$(Color0) $(ColorF)$(@F)$(Color0)" + $(call makelog,$(ColorH)INSTALL$(Color0) $(ColorF)$(@F)$(Color0)) $(_v)$(INSTALL) $(INSTALL_FLAGS) $< $@ $(DSTROOT)/$(INSTALL_KERNEL_DIR)/$(KERNEL_FILE_NAME).link/$(KERNEL_FILE_NAME).alias.exp: $(TARGET)/$(KERNEL_FILE_NAME).link/$(KERNEL_FILE_NAME).alias.exp ALWAYS $(_v)$(MKDIR) $(dir $@) - @echo "$(ColorH)INSTALL$(Color0) $(ColorF)$(@F)$(Color0)" + $(call makelog,$(ColorH)INSTALL$(Color0) $(ColorF)$(@F)$(Color0)) $(_v)$(INSTALL) $(INSTALL_FLAGS) $< $@ # BUILD_STATIC_LINK @@ -252,12 +299,11 @@ endif $(SYMROOT)/$(KERNEL_FILE_NAME): $(TARGET)/$(KERNEL_FILE_NAME).unstripped ALWAYS $(_v)$(MKDIR) $(dir $@) + $(call makelog,$(ColorH)INSTALLSYM$(Color0) $(ColorF)$(@F)$(Color0) "($(ColorLF)$(CURRENT_ARCH_CONFIG_LC)$(Color0))") $(_v)if [ $(OBJROOT)/.mach_kernel.timestamp -nt $@ ]; then \ - echo "$(ColorH)INSTALLSYM$(Color0) $(ColorF)$(@F)$(Color0) \"($(ColorLF)$(CURRENT_ARCH_CONFIG_LC)$(Color0))\""; \ $(INSTALL) $(EXEC_INSTALL_FLAGS) $< $@; \ cmdstatus=$$?; \ else \ - echo "$(ColorH)INSTALLSYM$(Color0) $(ColorF)$(@F)$(Color0) \"($(ColorLF)$(CURRENT_ARCH_CONFIG_LC)$(Color0))\""; \ $(LIPO) -create $@ $< -output $@; \ cmdstatus=$$?; \ fi; \ @@ -269,7 +315,7 @@ $(DSTROOT)/$(INSTALL_KERNEL_DIR)/$(KERNEL_FILE_NAME).link/$(KERNEL_FILE_NAME).dS $(DSTROOT)/$(INSTALL_KERNEL_SYM_DIR)/$(KERNEL_FILE_NAME).dSYM/$(DSYMLLDBMACROSDIR)/lldbmacros: \ $(TARGET)/$(KERNEL_FILE_NAME).dSYM/$(DSYMLLDBMACROSDIR)/lldbmacros $(_v)$(MKDIR) $(dir $@) - @echo "$(ColorH)INSTALLMACROS$(Color0) $(ColorF)$(@F)$(Color0) \"($(ColorLF)$(CURRENT_ARCH_CONFIG_LC)$(Color0))\"" + $(call makelog,$(ColorH)INSTALLMACROS$(Color0) $(ColorF)$(@F)$(Color0) "($(ColorLF)$(CURRENT_ARCH_CONFIG_LC)$(Color0))") $(_v)$(CP) -r $< $(dir $@) $(_v)$(TOUCH) $@ @@ -278,27 +324,26 @@ $(DSTROOT)/$(INSTALL_KERNEL_DIR)/$(KERNEL_FILE_NAME).link/$(KERNEL_FILE_NAME).dS $(DSTROOT)/$(INSTALL_KERNEL_SYM_DIR)/$(KERNEL_FILE_NAME).dSYM/$(DSYMLLDBMACROSDIR)/$(KERNEL_LLDBBOOTSTRAP_NAME): \ $(TARGET)/$(KERNEL_FILE_NAME).dSYM/$(DSYMLLDBMACROSDIR)/$(KERNEL_LLDBBOOTSTRAP_NAME) $(_v)$(MKDIR) $(dir $@) - @echo "$(ColorH)INSTALLMACROS$(Color0) $(ColorF)$(@F)$(Color0) \"($(ColorLF)$(CURRENT_ARCH_CONFIG_LC)$(Color0))\"" + $(call makelog,$(ColorH)INSTALLMACROS$(Color0) $(ColorF)$(@F)$(Color0) "($(ColorLF)$(CURRENT_ARCH_CONFIG_LC)$(Color0))") $(_v)$(INSTALL) $(INSTALL_FLAGS) $< $@ $(DSTROOT)/$(DEVELOPER_EXTRAS_DIR)/README.DEBUG-kernel.txt: $(SRCROOT)/config/README.DEBUG-kernel.txt $(_v)$(MKDIR) $(dir $@) - @echo "$(ColorH)INSTALL$(Color0) $(ColorF)$(@F)$(Color0)" + $(call makelog,$(ColorH)INSTALL$(Color0) $(ColorF)$(@F)$(Color0)) $(_v)$(INSTALL) $(INSTALL_FLAGS) $< $@ $(SYMROOT)/$(KERNEL_FILE_NAME).dSYM/$(DSYMINFODIR)/Info.plist $(DSTROOT)/$(INSTALL_KERNEL_SYM_DIR)/$(KERNEL_FILE_NAME).dSYM/$(DSYMINFODIR)/Info.plist: $(TARGET)/$(KERNEL_FILE_NAME).dSYM/$(DSYMINFODIR)/Info.plist $(_v)$(MKDIR) $(dir $@) - @echo "$(ColorH)INSTALLSYM$(Color0) $(ColorL)dSYM$(Color0) $(ColorF)$(@F)$(Color0) \"($(ColorLF)$(CURRENT_ARCH_CONFIG_LC)$(Color0))\"" + $(call makelog,$(ColorH)INSTALLSYM$(Color0) $(ColorL)dSYM$(Color0) $(ColorF)$(@F)$(Color0) "($(ColorLF)$(CURRENT_ARCH_CONFIG_LC)$(Color0))") $(_v)$(INSTALL) $(INSTALL_FLAGS) $< $@ $(SYMROOT)/$(KERNEL_FILE_NAME).dSYM/$(DSYMDWARFDIR)/$(KERNEL_FILE_NAME) $(DSTROOT)/$(INSTALL_KERNEL_SYM_DIR)/$(KERNEL_FILE_NAME).dSYM/$(DSYMDWARFDIR)/$(KERNEL_FILE_NAME): $(TARGET)/$(KERNEL_FILE_NAME).dSYM/$(DSYMDWARFDIR)/$(KERNEL_FILE_NAME) ALWAYS $(_v)$(MKDIR) $(dir $@) + $(call makelog,$(ColorH)INSTALLSYM$(Color0) $(ColorL)dSYM$(Color0) $(ColorF)$(@F).dSYM$(ColorF) "($(ColorLF)$(CURRENT_ARCH_CONFIG_LC)$(Color0))") $(_v)if [ $(OBJROOT)/.mach_kernel.timestamp -nt $@ ]; then \ - echo "$(ColorH)INSTALLSYM$(Color0) $(ColorL)dSYM$(Color0) $(ColorF)$(@F).dSYM$(ColorF) \"($(ColorLF)$(CURRENT_ARCH_CONFIG_LC)$(Color0))\""; \ $(INSTALL) $(EXEC_INSTALL_FLAGS) $< $@; \ cmdstatus=$$?; \ else \ - echo "$(ColorH)INSTALLSYM$(Color0) $(ColorL)dSYM$(Color0) $(ColorF)$(@F).dSYM$(ColorF) \"($(ColorLF)$(CURRENT_ARCH_CONFIG_LC)$(Color0))\""; \ $(LIPO) -create $@ $< -output $@; \ cmdstatus=$$?; \ fi; \ @@ -331,6 +376,7 @@ do_install_machine_specific_KDK_dSYM: \ # symlink during incremental builds and create a new symlink inside # the target of the existing symlink do_installhdrs_mi:: $(DSTROOT)/$(KRESDIR)/Info.plist +ifneq ($(INSTALLHDRS_SKIP_HOST),YES) $(_v)$(MKDIR) $(DSTROOT)/$(KINCFRAME) $(_v)$(MKDIR) $(DSTROOT)/$(KPINCDIR) $(_v)$(MKDIR) $(DSTROOT)/$(KRESDIR) @@ -345,17 +391,20 @@ do_installhdrs_mi:: $(DSTROOT)/$(KRESDIR)/Info.plist $(_v)$(RM) $(DSTROOT)/$(KINCFRAME)/Resources $(_v)$(LN) Versions/Current/Resources \ $(DSTROOT)/$(KINCFRAME)/Resources +endif $(DSTROOT)/$(KRESDIR)/Info.plist: $(SOURCE)/EXTERNAL_HEADERS/Info.plist +ifneq ($(INSTALLHDRS_SKIP_HOST),YES) $(_v)$(MKDIR) $(DSTROOT)/$(KRESDIR) $(_v)$(INSTALL) $(DATA_INSTALL_FLAGS) $< $@ $(_v)$(NEWVERS) $@ $(_vstdout) ifeq ($(USE_BINARY_PLIST),1) $(_v)$(PLUTIL) -convert binary1 -o $@ $@ endif +endif $(DSTROOT)/$(INSTALL_KERNEL_DIR)/$(ALIAS_FILE_NAME): ALWAYS - $(_v)echo "$(ColorH)ALIAS$(Color0) $(ColorF)$(@F)$(Color0) \"($(ColorLF)$(CURRENT_ARCH_CONFIG_LC)$(Color0) $(ColorLF)$(CURRENT_MACHINE_CONFIG_LC)$(Color0) $(ColorLF)$(CURRENT_ALIAS_MACHINE_CONFIG_LC)$(Color0))\"" + $(call makelog,$(ColorH)ALIAS$(Color0) $(ColorF)$(@F)$(Color0) "($(ColorLF)$(CURRENT_ARCH_CONFIG_LC)$(Color0) $(ColorLF)$(CURRENT_MACHINE_CONFIG_LC)$(Color0) $(ColorLF)$(CURRENT_ALIAS_MACHINE_CONFIG_LC)$(Color0))") $(_v)$(INSTALL) $(EXEC_INSTALL_FLAGS) $(DSTROOT)/$(INSTALL_KERNEL_DIR)/$(KERNEL_FILE_NAME) $@ install_alias: $(DSTROOT)/$(INSTALL_KERNEL_DIR)/$(ALIAS_FILE_NAME) diff --git a/makedefs/MakeInc.rule b/makedefs/MakeInc.rule index d2e05a89f..3961872d5 100644 --- a/makedefs/MakeInc.rule +++ b/makedefs/MakeInc.rule @@ -61,29 +61,29 @@ ifeq ($(LOGCOLORS),y) ifeq ($(CURRENT_MACHINE_CONFIG),NONE) export _MACHINE := $(subst Mac,,$(PLATFORM)) endif - export CMD_MC := \\033[1m$(shell __A="$(CURRENT_ARCH_CONFIG_LC)"; \ + export CMD_MC := $(shell __A="$(CURRENT_ARCH_CONFIG_LC)"; \ __As=$$((6-$${\#__A})); \ - printf "%-.6s%*.*s %9.9s" \ + printf "\\033[1m%-.6s%*.*s %9.9s\\033[m" \ "$${__A}" \ $${__As} $${__As} " " \ - "$(_MACHINE)")\\033[m + "$(_MACHINE)") endif # Turn off colored output - Color0=\\033[m + Color0:=$(shell printf "\\033[m") # Start a host command: bold, underlined pink text - ColorH=\\033[1;4;35m + ColorH:=$(shell printf "\\033[1;4;35m") # Start a compilation-related command: bold, underlined blue text - ColorC=[$(CMD_MC)] \\033[1;4;34m + ColorC:=$(shell printf "[$(CMD_MC)] \\033[1;4;34m") # Start a MIG command: bold, green text on light grey background - ColorM=[$(CMD_MC)] \\033[1;32;40m + ColorM:=$(shell printf "[$(CMD_MC)] \\033[1;32;40m") # Start a linking command: bold, white text on blue background - ColorL=[$(CMD_MC)] \\033[1;37;44m + ColorL:=$(shell printf "[$(CMD_MC)] \\033[1;37;44m") # Start a filename: bold, white text - ColorF=\\033[1;37m + ColorF:=$(shell printf "\\033[1;37m") # Start a linked file name: yellow text on light grey background - ColorLF=\\033[1;33;40m + ColorLF:=$(shell printf "\\033[1;33;40m") # Error strings: underlined bold white text on red background - ColorErr=\033[1;4;37;41m + ColorErr:=$(shell printf "\033[1;4;37;41m") endif .PHONY: ALWAYS @@ -112,10 +112,10 @@ $(3)/.UNIFDEF_FLAGS: ALWAYS | $(3)_MKDIR $$(_v)$$(REPLACECONTENTS) $$@ $$(UNIFDEF) $(4) $(1): $(dir $(firstword $(1)))% : $(if $(2),%,$$(SOURCE)/%) | $(3)_MKDIR - @echo "$$(ColorH)INSTALLHDR$$(Color0) $$(ColorF)$$*$$(Color0)" + $$(call makelog,$$(ColorH)INSTALLHDR$$(Color0) $$(ColorF)$$*$$(Color0)) $$(_v)$$(UNIFDEF) $(4) $$< > ./$(3)/$$*.unifdef.$$$$$$$$; \ if [ $$$$? -eq 2 ]; then \ - echo Parse failure for $$<; \ + $(ERR) Parse failure for $$<; \ exit 1; \ fi; \ $$(DECOMMENT) ./$(3)/$$*.unifdef.$$$$$$$$ r > \ @@ -148,11 +148,11 @@ $(3)/.UNIFDEF_FLAGS: ALWAYS | $(3)_MKDIR $$(_v)$$(REPLACECONTENTS) $$@ $$(UNIFDEF) -t $(4) $(1): $(5)% : $(2) | $(3)_MKDIR - @echo "$$(ColorH)INSTALLPY$$(Color0) $$(ColorF)$$*$$(Color0)" + $$(call makelog,$$(ColorH)INSTALLPY$$(Color0) $$(ColorF)$$*$$(Color0)) $$(_v)$$(MKDIR) $$(dir $$@) $$(dir ./$(3)/$$*) $$(_v)$$(UNIFDEF) -t $(4) $$< > ./$(3)/$$*.unifdef.$$$$$$$$$$(suffix $$*); \ if [ $$$$? -eq 2 ]; then \ - echo Parse failure for $$<; \ + $(ERR) Parse failure for $$<; \ exit 1; \ fi; \ $$(INSTALL) $$(DATA_INSTALL_FLAGS) \ @@ -163,6 +163,37 @@ $(1): $(5)% : $(2) | $(3)_MKDIR $$(_v)if [ -n "$(5)" ]; then $$(TOUCH) "$(5)"; fi endef +# +# Empty the install lists of non-host headers if building the host headers alias +# + +ifeq ($(INSTALLHDRS_SKIP_HOST),YES) +INSTALL_MI_LIST = +INSTALL_MI_GEN_LIST = +INSTALL_DRIVERKIT_MI_LIST = +INSTALL_DRIVERKIT_MI_GEN_LIST = +INSTALL_KF_MI_LIST = +INSTALL_KF_MI_GEN_LIST = +INSTALL_MI_LCL_LIST = +INSTALL_MI_LCL_GEN_LIST = +INSTALL_DRIVERKIT_MI_LCL_LIST = +INSTALL_DRIVERKIT_MI_LCL_GEN_LIST = +INSTALL_KF_MI_LCL_LIST = +INSTALL_KF_MI_LCL_GEN_LIST = +INSTALL_MD_LIST = +INSTALL_MD_GEN_LIST = +INSTALL_DRIVERKIT_MD_LIST = +INSTALL_DRIVERKIT_MD_GEN_LIST = +INSTALL_KF_MD_LIST = +INSTALL_KF_MD_GEN_LIST = +INSTALL_MD_LCL_LIST = +INSTALL_MD_LCL_GEN_LIST = +INSTALL_DRIVERKIT_MD_LCL_LIST = +INSTALL_DRIVERKIT_MD_LCL_GEN_LIST = +INSTALL_KF_MD_LCL_LIST = +INSTALL_KF_MD_LCL_GEN_LIST = +endif + # # Machine-independent (public) files # @@ -173,6 +204,14 @@ INSTALL_MI_INC_GEN_FILES = $(addprefix $(DSTROOT)/$(INCDIR)/$(INSTALL_MI_DIR)/, $(eval $(call INSTALLHDRS_RULE_template,$(INSTALL_MI_INC_FILES),,incmidir,$(SINCFRAME_UNIFDEF))) $(eval $(call INSTALLHDRS_RULE_template,$(INSTALL_MI_INC_GEN_FILES),1,incmigendir,$(SINCFRAME_UNIFDEF))) +ifeq ($(DRIVERKIT),1) +INSTALL_DRIVERKIT_MI_INC_FILES = $(addprefix $(DSTROOT)/$(DRIVERKITINCDIR)/$(INSTALL_MI_DIR)/, $(INSTALL_DRIVERKIT_MI_LIST)) +INSTALL_DRIVERKIT_MI_INC_GEN_FILES = $(addprefix $(DSTROOT)/$(DRIVERKITINCDIR)/$(INSTALL_MI_DIR)/, $(INSTALL_DRIVERKIT_MI_GEN_LIST)) + +$(eval $(call INSTALLHDRS_RULE_template,$(INSTALL_DRIVERKIT_MI_INC_FILES),,dkincmidir,$(DKINCFRAME_UNIFDEF))) +$(eval $(call INSTALLHDRS_RULE_template,$(INSTALL_DRIVERKIT_MI_INC_GEN_FILES),1,dkincmigendir,$(DKINCFRAME_UNIFDEF))) +endif + INSTALL_KF_MI_FILES = $(addprefix $(DSTROOT)/$(KINCDIR)/$(EXPORT_MI_DIR)/, $(INSTALL_KF_MI_LIST)) INSTALL_KF_MI_GEN_FILES = $(addprefix $(DSTROOT)/$(KINCDIR)/$(EXPORT_MI_DIR)/, $(INSTALL_KF_MI_GEN_LIST)) @@ -189,6 +228,14 @@ INSTALL_MI_LCL_GEN_FILES = $(addprefix $(DSTROOT)/$(LCLDIR)/$(INSTALL_MI_DIR)/, $(eval $(call INSTALLHDRS_RULE_template,$(INSTALL_MI_LCL_FILES),,pincmidir,$(SPINCFRAME_UNIFDEF))) $(eval $(call INSTALLHDRS_RULE_template,$(INSTALL_MI_LCL_GEN_FILES),1,pincmigendir,$(SPINCFRAME_UNIFDEF))) +ifeq ($(DRIVERKIT),1) +INSTALL_DRIVERKIT_MI_LCL_FILES = $(addprefix $(DSTROOT)/$(DRIVERKITLCLDIR)/$(INSTALL_MI_DIR)/, $(INSTALL_DRIVERKIT_MI_LCL_LIST)) +INSTALL_DRIVERKIT_MI_LCL_GEN_FILES = $(addprefix $(DSTROOT)/$(DRIVERKITLCLDIR)/$(INSTALL_MI_DIR)/, $(INSTALL_DRIVERKIT_MI_LCL_GEN_LIST)) + +$(eval $(call INSTALLHDRS_RULE_template,$(INSTALL_DRIVERKIT_MI_LCL_FILES),,dkpincmidir,$(DKPINCFRAME_UNIFDEF))) +$(eval $(call INSTALLHDRS_RULE_template,$(INSTALL_DRIVERKIT_MI_LCL_GEN_FILES),1,dkpincmigendir,$(DKPINCFRAME_UNIFDEF))) +endif + INSTALL_KF_MI_LCL_FILES = $(addprefix $(DSTROOT)/$(KPINCDIR)/$(EXPORT_MI_DIR)/, $(INSTALL_KF_MI_LCL_LIST)) INSTALL_KF_MI_LCL_GEN_FILES = $(addprefix $(DSTROOT)/$(KPINCDIR)/$(EXPORT_MI_DIR)/, $(INSTALL_KF_MI_LCL_GEN_LIST)) @@ -205,6 +252,14 @@ INSTALL_MD_INC_GEN_FILES = $(addprefix $(DSTROOT)/$(INCDIR)/$(INSTALL_MD_DIR)/, $(eval $(call INSTALLHDRS_RULE_template,$(INSTALL_MD_INC_FILES),,incdir,$(SINCFRAME_UNIFDEF))) $(eval $(call INSTALLHDRS_RULE_template,$(INSTALL_MD_INC_GEN_FILES),1,incgendir,$(SINCFRAME_UNIFDEF))) +ifeq ($(DRIVERKIT),1) +INSTALL_DRIVERKIT_MD_INC_FILES = $(addprefix $(DSTROOT)/$(DRIVERKITINCDIR)/$(INSTALL_MD_DIR)/, $(INSTALL_DRIVERKIT_MD_LIST)) +INSTALL_DRIVERKIT_MD_INC_GEN_FILES = $(addprefix $(DSTROOT)/$(DRIVERKITINCDIR)/$(INSTALL_MD_DIR)/, $(INSTALL_DRIVERKIT_MD_GEN_LIST)) + +$(eval $(call INSTALLHDRS_RULE_template,$(INSTALL_DRIVERKIT_MD_INC_FILES),,dkincdir,$(DKINCFRAME_UNIFDEF))) +$(eval $(call INSTALLHDRS_RULE_template,$(INSTALL_DRIVERKIT_MD_INC_GEN_FILES),1,dkincgendir,$(DKINCFRAME_UNIFDEF))) +endif + INSTALL_KF_MD_FILES = $(addprefix $(DSTROOT)/$(KINCDIR)/$(EXPORT_MD_DIR)/, $(INSTALL_KF_MD_LIST)) INSTALL_KF_MD_GEN_FILES = $(addprefix $(DSTROOT)/$(KINCDIR)/$(EXPORT_MD_DIR)/, $(INSTALL_KF_MD_GEN_LIST)) @@ -221,6 +276,14 @@ INSTALL_MD_LCL_GEN_FILES = $(addprefix $(DSTROOT)/$(LCLDIR)/$(INSTALL_MD_DIR)/, $(eval $(call INSTALLHDRS_RULE_template,$(INSTALL_MD_LCL_FILES),,pincdir,$(SPINCFRAME_UNIFDEF))) $(eval $(call INSTALLHDRS_RULE_template,$(INSTALL_MD_LCL_GEN_FILES),1,pincgendir,$(SPINCFRAME_UNIFDEF))) +ifeq ($(DRIVERKIT),1) +INSTALL_DRIVERKIT_MD_LCL_FILES = $(addprefix $(DSTROOT)/$(DRIVERKITLCLDIR)/$(INSTALL_MD_DIR)/, $(INSTALL_DRIVERKIT_MD_LCL_LIST)) +INSTALL_DRIVERKIT_MD_LCL_GEN_FILES = $(addprefix $(DSTROOT)/$(DRIVERKITLCLDIR)/$(INSTALL_MD_DIR)/, $(INSTALL_DRIVERKIT_MD_LCL_GEN_LIST)) + +$(eval $(call INSTALLHDRS_RULE_template,$(INSTALL_DRIVERKIT_MD_LCL_FILES),,dkpincdir,$(DKPINCFRAME_UNIFDEF))) +$(eval $(call INSTALLHDRS_RULE_template,$(INSTALL_DRIVERKIT_MD_LCL_GEN_FILES),1,dkpincgendir,$(DKPINCFRAME_UNIFDEF))) +endif + INSTALL_KF_MD_LCL_FILES = $(addprefix $(DSTROOT)/$(KPINCDIR)/$(EXPORT_MD_DIR)/, $(INSTALL_KF_MD_LCL_LIST)) INSTALL_KF_MD_LCL_GEN_FILES = $(addprefix $(DSTROOT)/$(KPINCDIR)/$(EXPORT_MD_DIR)/, $(INSTALL_KF_MD_LCL_GEN_LIST)) @@ -230,14 +293,22 @@ $(eval $(call INSTALLHDRS_RULE_template,$(INSTALL_KF_MD_LCL_GEN_FILES),1,kpincge .PHONY: do_installhdrs_mi # Double-colon rule so that MakeInc.kernel can add custom behaviors -do_installhdrs_mi:: $(INSTALL_MI_INC_FILES) $(INSTALL_MI_INC_GEN_FILES) $(INSTALL_KF_MI_FILES) $(INSTALL_KF_MI_GEN_FILES) \ - $(INSTALL_MI_LCL_FILES) $(INSTALL_MI_LCL_GEN_FILES) $(INSTALL_KF_MI_LCL_FILES) $(INSTALL_KF_MI_LCL_GEN_FILES) +do_installhdrs_mi:: $(INSTALL_MI_INC_FILES) $(INSTALL_MI_INC_GEN_FILES) \ + $(INSTALL_DRIVERKIT_MI_INC_FILES) $(INSTALL_DRIVERKIT_MI_INC_GEN_FILES) \ + $(INSTALL_KF_MI_FILES) $(INSTALL_KF_MI_GEN_FILES) \ + $(INSTALL_MI_LCL_FILES) $(INSTALL_MI_LCL_GEN_FILES) \ + $(INSTALL_DRIVERKIT_MI_LCL_FILES) $(INSTALL_DRIVERKIT_MI_LCL_GEN_FILES) \ + $(INSTALL_KF_MI_LCL_FILES) $(INSTALL_KF_MI_LCL_GEN_FILES) @: .PHONY: do_installhdrs_md -do_installhdrs_md: $(INSTALL_MD_INC_FILES) $(INSTALL_MD_INC_GEN_FILES) $(INSTALL_KF_MD_FILES) $(INSTALL_KF_MD_GEN_FILES) \ - $(INSTALL_MD_LCL_FILES) $(INSTALL_MD_LCL_GEN_FILES) $(INSTALL_KF_MD_LCL_FILES) $(INSTALL_KF_MD_LCL_GEN_FILES) +do_installhdrs_md: $(INSTALL_MD_INC_FILES) $(INSTALL_MD_INC_GEN_FILES) \ + $(INSTALL_DRIVERKIT_MD_INC_FILES) $(INSTALL_DRIVERKIT_MD_INC_GEN_FILES) \ + $(INSTALL_KF_MD_FILES) $(INSTALL_KF_MD_GEN_FILES) \ + $(INSTALL_MD_LCL_FILES) $(INSTALL_MD_LCL_GEN_FILES) \ + $(INSTALL_DRIVERKIT_MD_LCL_FILES) $(INSTALL_DRIVERKIT_MD_LCL_GEN_FILES) \ + $(INSTALL_KF_MD_LCL_FILES) $(INSTALL_KF_MD_LCL_GEN_FILES) @: # @@ -289,7 +360,7 @@ do_exporthdrs_md: $(EXPORT_MD_GEN_INC_FILES) $(EXPORT_MD_INC_FILES) # Compilation rules to generate .o from .s # -S_RULE_0=@echo "$(ColorC)AS$(Color0) $(ColorF)$@$(Color0)" +S_RULE_0=$(call makelog,$(ColorC)AS$(Color0) $(ColorF)$@$(Color0)) S_RULE_1A=$(_v)${S_KCC} -c ${SFLAGS} -MD -MF $(@:o=d) -MP ${$@_SFLAGS_ADD} ${INCFLAGS} ${$@_INCFLAGS} S_RULE_1B=$(&1 > /dev/null || true`; \ - if [ ! -z "$${ctferr}" ]; then \ - echo "[$(CMD_MC)] $(ColorErr)$@$(Color0) $(ColorErr)$${ctferr}$(Color0)"; \ - fi; \ - fi -else C_RULE_2= -endif -ifeq ($(DO_CTFMACHO), 1) -C_CTFRULE_1A=$(_v)${KCC} -o $@.non_lto -c ${filter-out ${$@_CFLAGS_RM}, ${CFLAGS} ${CWARNFLAGS}} ${$@_CFLAGS_ADD} ${$@_CWARNFLAGS_ADD} ${INCFLAGS} $(CFLAGS_NOLTO_FLAG) ${$@_INCFLAGS} -C_CTFRULE_1B=$(&1 > /dev/null || true`; \ - if [ ! -z "$${ctferr}" ]; then \ - echo "[$(CMD_MC)] $(ColorErr)$@$(Color0) $(ColorErr)$${ctferr}$(Color0)"; \ - fi; \ - fi -else -C_CTFRULE_1A=@true -C_CTFRULE_1B= -C_CTFRULE_2=@true -endif -C_RULE_3=@true -C_RULE_4A=@true +C_RULE_3= +C_RULE_4A= C_RULE_4B= # @@ -341,25 +387,18 @@ C_RULE_2_D=${C_RULE_2} C_RULE_3_D=${C_RULE_3} C_RULE_4A_D=${C_RULE_4A} C_RULE_4B_D=${C_RULE_4B} -C_CTFRULE_1A_D=${C_CTFRULE_1A} -C_CTFRULE_1B_D=${C_CTFRULE_1B} -C_CTFRULE_2_D=${C_CTFRULE_2} -C_CTFRULE_3_D=${C_CTFRULE_3} # # Compilation rules to generate .co from .cp or .cpo from .cpp # The config tool slickly changes the last source filename char to 'o' # for the object filename. -P_RULE_0=@echo "$(ColorC)C++$(Color0) $(ColorF)$@$(Color0)" +P_RULE_0=$(call makelog,$(ColorC)C++$(Color0) $(ColorF)$@$(Color0)) P_RULE_1A=$(_v)${KC++} -o $@ -c ${CXXFLAGS} ${filter-out ${$@_CFLAGS_RM}, ${CFLAGS} ${CXXWARNFLAGS}} -MD -MF $(@:o=d) -MP ${$@_CFLAGS_ADD} ${$@_CXXWARNFLAGS_ADD} ${INCFLAGS} ${$@_INCFLAGS} P_RULE_1B=$(self_port = ipc_port_alloc_kernel(); - if (reply->self_port == IP_NULL) { - kfree(reply, sizeof(struct UNDReply)); - reply = UND_REPLY_NULL; - } else { - lck_mtx_init(&reply->lock, &LockCompatGroup, LCK_ATTR_NULL); - reply->userLandNotificationKey = -1; - reply->inprogress = FALSE; - ipc_kobject_set(reply->self_port, - (ipc_kobject_t)reply, - IKOT_UND_REPLY); - } + reply->self_port = ipc_kobject_alloc_port((ipc_kobject_t)reply, + IKOT_UND_REPLY, IPC_KOBJECT_ALLOC_NONE); + lck_mtx_init(&reply->lock, &LockCompatGroup, LCK_ATTR_NULL); + reply->userLandNotificationKey = -1; + reply->inprogress = FALSE; } return (KUNCUserNotificationID) reply; } diff --git a/osfmk/UserNotification/Makefile b/osfmk/UserNotification/Makefile index 2de33166a..be0723a8b 100644 --- a/osfmk/UserNotification/Makefile +++ b/osfmk/UserNotification/Makefile @@ -63,7 +63,7 @@ ${COMP_FILES} : ${MIG_TYPES} ${MIG_KUSRC} : \ %.c : %.defs - @echo "$(ColorM)MIG$(Color0) $(ColorF)$@$(Color0)" + $(call makelog,$(ColorM)MIG$(Color0) $(ColorF)$@$(Color0)) $(_v)${MIG} ${MIGFLAGS} ${MIGKUFLAGS} \ -user $*.c \ -header $*.h \ @@ -73,7 +73,7 @@ ${MIG_KUSRC} : \ ${MIG_KSSRC}: \ %Server.c : %.defs - @echo "$(ColorM)MIG$(Color0) $(ColorF)$@$(Color0)" + $(call makelog,$(ColorM)MIG$(Color0) $(ColorF)$@$(Color0)) $(_v)${MIG} ${MIGFLAGS} ${MIGKSFLAGS} \ -user /dev/null \ -header /dev/null \ diff --git a/osfmk/arm/Makefile b/osfmk/arm/Makefile index 38c19f380..ab9975c6e 100644 --- a/osfmk/arm/Makefile +++ b/osfmk/arm/Makefile @@ -20,6 +20,7 @@ ARM_HEADER_FILES = \ machine_cpu.h \ machine_cpuid.h \ machine_routines.h \ + memory_types.h \ pal_routines.h \ pmap_public.h \ proc_reg.h \ diff --git a/osfmk/arm/arm_init.c b/osfmk/arm/arm_init.c index cf4906915..9f2b60169 100644 --- a/osfmk/arm/arm_init.c +++ b/osfmk/arm/arm_init.c @@ -85,6 +85,7 @@ extern void sleep_token_buffer_init(void); extern vm_offset_t intstack_top; #if __arm64__ extern vm_offset_t excepstack_top; +extern uint64_t events_per_sec; #else extern vm_offset_t fiqstack_top; #endif @@ -101,6 +102,7 @@ boolean_t up_style_idle_exit = 0; + #if INTERRUPT_MASKED_DEBUG boolean_t interrupt_masked_debug = 1; uint64_t interrupt_masked_timeout = 0xd0000; @@ -135,6 +137,9 @@ unsigned int page_shift_user32; /* for page_size as seen by a 32-bit task */ * JOP rebasing */ +#if defined(HAS_APPLE_PAC) +#include +#endif /* defined(HAS_APPLE_PAC) */ // Note, the following should come from a header from dyld static void @@ -145,6 +150,11 @@ rebase_chain(uintptr_t chainStartAddress, uint64_t stepMultiplier, uintptr_t bas do { uint64_t value = *(uint64_t*)address; +#if HAS_APPLE_PAC + uint16_t diversity = (uint16_t)(value >> 32); + bool hasAddressDiversity = (value & (1ULL << 48)) != 0; + ptrauth_key key = (ptrauth_key)((value >> 49) & 0x3); +#endif bool isAuthenticated = (value & (1ULL << 63)) != 0; bool isRebase = (value & (1ULL << 62)) == 0; if (isRebase) { @@ -153,6 +163,33 @@ rebase_chain(uintptr_t chainStartAddress, uint64_t stepMultiplier, uintptr_t bas uint64_t newValue = (value & 0xFFFFFFFF) + slide; // Add in the offset from the mach_header newValue += baseAddress; +#if HAS_APPLE_PAC + // We have bits to merge in to the discriminator + uintptr_t discriminator = diversity; + if (hasAddressDiversity) { + // First calculate a new discriminator using the address of where we are trying to store the value + // Only blend if we have a discriminator + if (discriminator) { + discriminator = __builtin_ptrauth_blend_discriminator((void*)address, discriminator); + } else { + discriminator = address; + } + } + switch (key) { + case ptrauth_key_asia: + newValue = (uintptr_t)__builtin_ptrauth_sign_unauthenticated((void*)newValue, ptrauth_key_asia, discriminator); + break; + case ptrauth_key_asib: + newValue = (uintptr_t)__builtin_ptrauth_sign_unauthenticated((void*)newValue, ptrauth_key_asib, discriminator); + break; + case ptrauth_key_asda: + newValue = (uintptr_t)__builtin_ptrauth_sign_unauthenticated((void*)newValue, ptrauth_key_asda, discriminator); + break; + case ptrauth_key_asdb: + newValue = (uintptr_t)__builtin_ptrauth_sign_unauthenticated((void*)newValue, ptrauth_key_asdb, discriminator); + break; + } +#endif *(uint64_t*)address = newValue; } else { // Regular pointer which needs to fit in 51-bits of value. @@ -190,6 +227,7 @@ rebase_threaded_starts(uint32_t *threadArrayStart, uint32_t *threadArrayEnd, return true; } + /* * Routine: arm_init * Function: @@ -222,12 +260,29 @@ arm_init( BootArgs = args = &const_boot_args; cpu_data_init(&BootCpuData); +#if defined(HAS_APPLE_PAC) + /* bootstrap cpu process dependent key for kernel has been loaded by start.s */ + BootCpuData.rop_key = KERNEL_ROP_ID; +#endif /* defined(HAS_APPLE_PAC) */ PE_init_platform(FALSE, args); /* Get platform expert set up */ #if __arm64__ +#if defined(HAS_APPLE_PAC) + boolean_t user_jop = TRUE; + PE_parse_boot_argn("user_jop", &user_jop, sizeof(user_jop)); + if (!user_jop) { + args->bootFlags |= kBootFlagsDisableUserJOP; + } + boolean_t user_ts_jop = TRUE; + PE_parse_boot_argn("user_ts_jop", &user_ts_jop, sizeof(user_ts_jop)); + if (!user_ts_jop) { + args->bootFlags |= kBootFlagsDisableUserThreadStateJOP; + } +#endif /* defined(HAS_APPLE_PAC) */ + { unsigned int tmp_16k = 0; @@ -339,12 +394,17 @@ arm_init( rtclock_early_init(); + lck_mod_init(); + + /* + * Initialize the timer callout world + */ + timer_call_init(); + kernel_early_bootstrap(); cpu_init(); - EntropyData.index_ptr = EntropyData.buffer; - processor_bootstrap(); my_master_proc = master_processor; @@ -366,7 +426,7 @@ arm_init( /* Disable if WDT is disabled or no_interrupt_mask_debug in boot-args */ if (PE_parse_boot_argn("no_interrupt_masked_debug", &interrupt_masked_debug, sizeof(interrupt_masked_debug)) || (PE_parse_boot_argn("wdt", &wdt_boot_arg, - sizeof(wdt_boot_arg)) && (wdt_boot_arg == -1))) { + sizeof(wdt_boot_arg)) && (wdt_boot_arg == -1)) || kern_feature_override(KF_INTERRUPT_MASKED_DEBUG_OVRD)) { interrupt_masked_debug = 0; } @@ -450,7 +510,26 @@ arm_init( #endif PE_init_platform(TRUE, &BootCpuData); + +#if __arm64__ + if (PE_parse_boot_argn("wfe_events_sec", &events_per_sec, sizeof(events_per_sec))) { + if (events_per_sec <= 0) { + events_per_sec = 1; + } else if (events_per_sec > USEC_PER_SEC) { + events_per_sec = USEC_PER_SEC; + } + } else { +#if defined(ARM_BOARD_WFE_TIMEOUT_NS) + events_per_sec = NSEC_PER_SEC / ARM_BOARD_WFE_TIMEOUT_NS; +#else /* !defined(ARM_BOARD_WFE_TIMEOUT_NS) */ + /* Default to 1usec (or as close as we can get) */ + events_per_sec = USEC_PER_SEC; +#endif /* !defined(ARM_BOARD_WFE_TIMEOUT_NS) */ + } +#endif + cpu_timebase_init(TRUE); + PE_init_cpu(); fiq_context_bootstrap(TRUE); @@ -483,6 +562,7 @@ arm_init_cpu( __builtin_arm_wsr("pan", 1); #endif + cpu_data_ptr->cpu_flags &= ~SleepState; #if __ARM_SMP__ && defined(ARMA7) cpu_data_ptr->cpu_CLW_active = 1; @@ -528,6 +608,7 @@ arm_init_cpu( PE_init_platform(TRUE, NULL); commpage_update_timebase(); } + PE_init_cpu(); fiq_context_init(TRUE); cpu_data_ptr->rtcPop = EndOfAllTime; diff --git a/osfmk/arm/arm_timer.c b/osfmk/arm/arm_timer.c index da4a0c3b5..3b9c4f310 100644 --- a/osfmk/arm/arm_timer.c +++ b/osfmk/arm/arm_timer.c @@ -199,6 +199,20 @@ timer_resync_deadlines(void) splx(s); } +void +timer_queue_expire_local( + __unused void *arg) +{ + rtclock_timer_t *mytimer = &getCpuDatap()->rtclock_timer; + uint64_t abstime; + + abstime = mach_absolute_time(); + mytimer->has_expired = TRUE; + mytimer->deadline = timer_queue_expire(&mytimer->queue, abstime); + mytimer->has_expired = FALSE; + + timer_resync_deadlines(); +} boolean_t timer_resort_threshold(__unused uint64_t skew) diff --git a/osfmk/arm/arm_vm_init.c b/osfmk/arm/arm_vm_init.c index 5a2488458..08788e136 100644 --- a/osfmk/arm/arm_vm_init.c +++ b/osfmk/arm/arm_vm_init.c @@ -184,11 +184,12 @@ arm_vm_page_granular_helper(vm_offset_t start, vm_offset_t _end, vm_offset_t va, } else { /* TTE must be reincarnated COARSE. */ ppte = (pt_entry_t *)phystokv(avail_start); + pmap_paddr_t l2table = avail_start; avail_start += ARM_PGBYTES; bzero(ppte, ARM_PGBYTES); for (i = 0; i < 4; ++i) { - tte[i] = pa_to_tte(kvtophys((vm_offset_t)ppte) + (i * 0x400)) | ARM_TTE_TYPE_TABLE; + tte[i] = pa_to_tte(l2table + (i * 0x400)) | ARM_TTE_TYPE_TABLE; } } @@ -343,8 +344,9 @@ arm_vm_prot_init(boot_args * args) */ pmap_paddr_t p = (pmap_paddr_t)(args->topOfKernelData) + (ARM_PGBYTES * 9); pt_entry_t *ppte = (pt_entry_t *)phystokv(p); + pmap_init_pte_page(kernel_pmap, ppte, HIGH_EXC_VECTORS & ~ARM_TT_L1_PT_OFFMASK, 2, TRUE, FALSE); - int idx = (HIGH_EXC_VECTORS & ARM_TT_L2_INDEX_MASK) >> ARM_TT_L2_SHIFT; + int idx = (HIGH_EXC_VECTORS & ARM_TT_L1_PT_OFFMASK) >> ARM_TT_L2_SHIFT; pt_entry_t ptmp = ppte[idx]; ptmp = (ptmp & ~ARM_PTE_APMASK) | ARM_PTE_AP(AP_RONA); @@ -367,9 +369,6 @@ arm_vm_prot_finalize(boot_args * args) arm_vm_page_granular_RWNX(phystokv(args->topOfKernelData) + ARM_PGBYTES * 9, ARM_PGBYTES, FALSE); /* commpage, EVB */ -#ifndef __ARM_L1_PTW__ - FlushPoC_Dcache(); -#endif flush_mmu_tlb(); } @@ -497,11 +496,6 @@ arm_vm_init(uint64_t memory_size, boot_args * args) sectCONSTB = sectDCONST->addr; sectSizeCONST = sectDCONST->size; -#if !SECURE_KERNEL - /* doconstro is true by default, but we allow a boot-arg to disable it */ - (void) PE_parse_boot_argn("dataconstro", &doconstro, sizeof(doconstro)); -#endif - if (doconstro) { extern vm_offset_t _lastkerneldataconst; extern vm_size_t _lastkerneldataconst_padsize; @@ -534,25 +528,6 @@ arm_vm_init(uint64_t memory_size, boot_args * args) vm_set_page_size(); -#ifndef __ARM_L1_PTW__ - FlushPoC_Dcache(); -#endif - set_mmu_ttb(cpu_ttep); - set_mmu_ttb_alternate(cpu_ttep); - flush_mmu_tlb(); -#if __arm__ && __ARM_USER_PROTECT__ - { - unsigned int ttbr0_val, ttbr1_val, ttbcr_val; - thread_t thread = current_thread(); - - __asm__ volatile ("mrc p15,0,%0,c2,c0,0\n" : "=r"(ttbr0_val)); - __asm__ volatile ("mrc p15,0,%0,c2,c0,1\n" : "=r"(ttbr1_val)); - __asm__ volatile ("mrc p15,0,%0,c2,c0,2\n" : "=r"(ttbcr_val)); - thread->machine.uptw_ttb = ttbr0_val; - thread->machine.kptw_ttb = ttbr1_val; - thread->machine.uptw_ttc = ttbcr_val; - } -#endif vm_prelink_stext = segPRELINKTEXTB; vm_prelink_etext = segPRELINKTEXTB + segSizePRELINKTEXT; vm_prelink_sinfo = segPRELINKINFOB; @@ -591,14 +566,30 @@ arm_vm_init(uint64_t memory_size, boot_args * args) ptp = (pt_entry_t *) phystokv(avail_start); ptp_phys = (pmap_paddr_t)avail_start; avail_start += ARM_PGBYTES; - pmap_init_pte_page(kernel_pmap, ptp, va + off, 2, TRUE); + pmap_init_pte_page(kernel_pmap, ptp, va + off, 2, TRUE, TRUE); tte = &cpu_tte[ttenum(va + off)]; - *tte = pa_to_tte((ptp_phys)) | ARM_TTE_TYPE_TABLE;; - *(tte + 1) = pa_to_tte((ptp_phys + 0x400)) | ARM_TTE_TYPE_TABLE;; - *(tte + 2) = pa_to_tte((ptp_phys + 0x800)) | ARM_TTE_TYPE_TABLE;; - *(tte + 3) = pa_to_tte((ptp_phys + 0xC00)) | ARM_TTE_TYPE_TABLE;; + *tte = pa_to_tte((ptp_phys)) | ARM_TTE_TYPE_TABLE; + *(tte + 1) = pa_to_tte((ptp_phys + 0x400)) | ARM_TTE_TYPE_TABLE; + *(tte + 2) = pa_to_tte((ptp_phys + 0x800)) | ARM_TTE_TYPE_TABLE; + *(tte + 3) = pa_to_tte((ptp_phys + 0xC00)) | ARM_TTE_TYPE_TABLE; } + set_mmu_ttb(cpu_ttep); + set_mmu_ttb_alternate(cpu_ttep); + flush_mmu_tlb(); +#if __arm__ && __ARM_USER_PROTECT__ + { + unsigned int ttbr0_val, ttbr1_val, ttbcr_val; + thread_t thread = current_thread(); + + __asm__ volatile ("mrc p15,0,%0,c2,c0,0\n" : "=r"(ttbr0_val)); + __asm__ volatile ("mrc p15,0,%0,c2,c0,1\n" : "=r"(ttbr1_val)); + __asm__ volatile ("mrc p15,0,%0,c2,c0,2\n" : "=r"(ttbcr_val)); + thread->machine.uptw_ttb = ttbr0_val; + thread->machine.kptw_ttb = ttbr1_val; + thread->machine.uptw_ttc = ttbcr_val; + } +#endif avail_start = (avail_start + PAGE_MASK) & ~PAGE_MASK; first_avail = avail_start; diff --git a/osfmk/arm/atomic.h b/osfmk/arm/atomic.h index 380286cde..a6b4c2b8c 100644 --- a/osfmk/arm/atomic.h +++ b/osfmk/arm/atomic.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015 Apple Inc. All rights reserved. + * Copyright (c) 2015-2018 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -26,281 +26,200 @@ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ +#ifndef _MACHINE_ATOMIC_H +#error "Do not include directly, use " +#endif + #ifndef _ARM_ATOMIC_H_ #define _ARM_ATOMIC_H_ #include -#include // Parameter for __builtin_arm_dmb +#define DMB_OSHLD 0x1 +#define DMB_OSHST 0x2 +#define DMB_OSH 0x3 +#define DMB_NSHLD 0x5 +#define DMB_NSHST 0x6 #define DMB_NSH 0x7 #define DMB_ISHLD 0x9 #define DMB_ISHST 0xa #define DMB_ISH 0xb +#define DMB_LD 0xd +#define DMB_ST 0xe #define DMB_SY 0xf // Parameter for __builtin_arm_dsb +#define DSB_OSHLD 0x1 +#define DSB_OSHST 0x2 +#define DSB_OSH 0x3 +#define DSB_NSHLD 0x5 +#define DSB_NSHST 0x6 #define DSB_NSH 0x7 #define DSB_ISHLD 0x9 #define DSB_ISHST 0xa #define DSB_ISH 0xb +#define DSB_LD 0xd +#define DSB_ST 0xe #define DSB_SY 0xf // Parameter for __builtin_arm_isb #define ISB_SY 0xf -#if __SMP__ - -#define memory_order_consume_smp memory_order_consume -#define memory_order_acquire_smp memory_order_acquire -#define memory_order_release_smp memory_order_release -#define memory_order_acq_rel_smp memory_order_acq_rel -#define memory_order_seq_cst_smp memory_order_seq_cst - -#else - -#define memory_order_consume_smp memory_order_relaxed -#define memory_order_acquire_smp memory_order_relaxed -#define memory_order_release_smp memory_order_relaxed -#define memory_order_acq_rel_smp memory_order_relaxed -#define memory_order_seq_cst_smp memory_order_relaxed +#undef OS_ATOMIC_HAS_LLSC +#define OS_ATOMIC_HAS_LLSC 1 +#if defined(__ARM_ARCH_8_2__) && defined(__arm64__) +#undef OS_ATOMIC_USE_LLSC +#define OS_ATOMIC_USE_LLSC 0 #endif + /* - * Atomic operations functions - * - * These static functions are designed for inlining - * It is expected that the memory_order arguments are - * known at compile time. This collapses these - * functions into a simple atomic operation + * On armv7 & arm64, we do provide fine grained dependency injection, so + * memory_order_dependency maps to relaxed as far as thread fences are concerned */ +#undef memory_order_dependency_smp +#define memory_order_dependency_smp memory_order_relaxed -static inline boolean_t -memory_order_has_acquire(enum memory_order ord) -{ - switch (ord) { - case memory_order_consume: - case memory_order_acquire: - case memory_order_acq_rel: - case memory_order_seq_cst: - return TRUE; - default: - return FALSE; - } -} - -static inline boolean_t -memory_order_has_release(enum memory_order ord) -{ - switch (ord) { - case memory_order_release: - case memory_order_acq_rel: - case memory_order_seq_cst: - return TRUE; - default: - return FALSE; - } -} - -#ifdef ATOMIC_PRIVATE - -#define clear_exclusive() __builtin_arm_clrex() - -__unused static uint32_t -load_exclusive32(uint32_t *target, enum memory_order ord) -{ - uint32_t value; +#define os_atomic_clear_exclusive() __builtin_arm_clrex() #if __arm__ - if (memory_order_has_release(ord)) { - // Pre-load release barrier - atomic_thread_fence(memory_order_release); - } - value = __builtin_arm_ldrex(target); -#else - if (memory_order_has_acquire(ord)) { - value = __builtin_arm_ldaex(target); // ldaxr - } else { - value = __builtin_arm_ldrex(target); // ldxr - } -#endif // __arm__ - return value; -} - -__unused static boolean_t -store_exclusive32(uint32_t *target, uint32_t value, enum memory_order ord) -{ - boolean_t err; -#if __arm__ - err = __builtin_arm_strex(value, target); - if (memory_order_has_acquire(ord)) { - // Post-store acquire barrier - atomic_thread_fence(memory_order_acquire); - } -#else - if (memory_order_has_release(ord)) { - err = __builtin_arm_stlex(value, target); // stlxr - } else { - err = __builtin_arm_strex(value, target); // stxr - } -#endif // __arm__ - return !err; -} +#define os_atomic_load_exclusive(p, m) ({ \ + _os_atomic_basetypeof(p) _r; \ + _r = __builtin_arm_ldrex(p); \ + _os_memory_fence_after_atomic(m); \ + _os_compiler_barrier_after_atomic(m); \ + _r; \ +}) -__unused static uintptr_t -load_exclusive(uintptr_t *target, enum memory_order ord) -{ -#if !__LP64__ - return load_exclusive32((uint32_t *)target, ord); -#else - uintptr_t value; +#define os_atomic_store_exclusive(p, v, m) ({ \ + _os_compiler_barrier_before_atomic(m); \ + _os_memory_fence_before_atomic(m); \ + !__builtin_arm_strex(p, v); \ +}) - if (memory_order_has_acquire(ord)) { - value = __builtin_arm_ldaex(target); // ldaxr - } else { - value = __builtin_arm_ldrex(target); // ldxr - } - return value; -#endif // __arm__ -} - -__unused static uint8_t -load_exclusive_acquire8(uint8_t *target) -{ - uint8_t value; -#if __arm__ - value = __builtin_arm_ldrex(target); - __c11_atomic_thread_fence(__ATOMIC_ACQUIRE); -#else - value = __builtin_arm_ldaex(target); // ldaxr - /* "Compiler barrier", no barrier instructions are emitted */ - atomic_signal_fence(memory_order_acquire); -#endif - return value; -} - -__unused static boolean_t -store_exclusive(uintptr_t *target, uintptr_t value, enum memory_order ord) -{ -#if !__LP64__ - return store_exclusive32((uint32_t *)target, value, ord); -#else - boolean_t err; - - if (memory_order_has_release(ord)) { - err = __builtin_arm_stlex(value, target); // stlxr - } else { - err = __builtin_arm_strex(value, target); // stxr - } - return !err; -#endif -} - -__unused static boolean_t -atomic_compare_exchange(uintptr_t *target, uintptr_t oldval, uintptr_t newval, - enum memory_order orig_ord, boolean_t wait) -{ - enum memory_order ord = orig_ord; - uintptr_t value; - - -#if __arm__ - ord = memory_order_relaxed; - if (memory_order_has_release(orig_ord)) { - atomic_thread_fence(memory_order_release); - } -#endif - do { - value = load_exclusive(target, ord); - if (value != oldval) { - if (wait) { - wait_for_event(); // Wait with monitor held - } else { - clear_exclusive(); // Clear exclusive monitor - } - return FALSE; - } - } while (!store_exclusive(target, newval, ord)); -#if __arm__ - if (memory_order_has_acquire(orig_ord)) { - atomic_thread_fence(memory_order_acquire); - } -#endif - return TRUE; -} - -#endif // ATOMIC_PRIVATE +/* + * armv7 override of os_atomic_make_dependency + * documentation for os_atomic_make_dependency is in + */ +#undef os_atomic_make_dependency +#define os_atomic_make_dependency(v) ({ \ + os_atomic_dependency_t _dep; \ + __asm__ __volatile__("and %[_dep], %[_v], #0" \ + : [_dep] "=r" (_dep.__opaque_zero) : [_v] "r" (v)); \ + os_compiler_barrier(acquire); \ + _dep; \ +}) -#if __arm__ +/* + * armv7 override of os_atomic_rmw_loop + * documentation for os_atomic_rmw_loop is in + */ #undef os_atomic_rmw_loop #define os_atomic_rmw_loop(p, ov, nv, m, ...) ({ \ - boolean_t _result = FALSE; uint32_t _err = 0; \ - typeof(atomic_load(p)) *_p = (typeof(atomic_load(p)) *)(p); \ + int _result = 0; uint32_t _err = 0; \ + _os_atomic_basetypeof(p) *_p; \ + _p = (_os_atomic_basetypeof(p) *)(p); \ + _os_compiler_barrier_before_atomic(m); \ for (;;) { \ ov = __builtin_arm_ldrex(_p); \ __VA_ARGS__; \ - if (!_err && memory_order_has_release(memory_order_##m)) { \ - /* only done for the first loop iteration */ \ - atomic_thread_fence(memory_order_release); \ + if (!_err) { \ + /* release barrier only done for the first loop iteration */ \ + _os_memory_fence_before_atomic(m); \ } \ _err = __builtin_arm_strex(nv, _p); \ if (__builtin_expect(!_err, 1)) { \ - if (memory_order_has_acquire(memory_order_##m)) { \ - atomic_thread_fence(memory_order_acquire); \ - } \ - _result = TRUE; \ + _os_memory_fence_after_atomic(m); \ + _result = 1; \ break; \ } \ } \ + _os_compiler_barrier_after_atomic(m); \ _result; \ }) +/* + * armv7 override of os_atomic_rmw_loop_give_up + * documentation for os_atomic_rmw_loop_give_up is in + */ #undef os_atomic_rmw_loop_give_up -#define os_atomic_rmw_loop_give_up(expr) \ - ({ __builtin_arm_clrex(); expr; __builtin_trap(); }) +#define os_atomic_rmw_loop_give_up(...) \ + ({ os_atomic_clear_exclusive(); __VA_ARGS__; break; }) + +#else // __arm64__ + +#define os_atomic_load_exclusive(p, m) ({ \ + _os_atomic_basetypeof(p) _r; \ + if (memory_order_has_acquire(memory_order_##m##_smp)) { \ + _r = __builtin_arm_ldaex(p); \ + } else { \ + _r = __builtin_arm_ldrex(p); \ + } \ + _os_compiler_barrier_after_atomic(m); \ + _r; \ +}) -#else +#define os_atomic_store_exclusive(p, v, m) ({ \ + _os_compiler_barrier_before_atomic(m); \ + (memory_order_has_release(memory_order_##m##_smp) ? \ + !__builtin_arm_stlex(p, v) : !__builtin_arm_strex(p, v)); \ +}) +/* + * arm64 override of os_atomic_make_dependency + * documentation for os_atomic_make_dependency is in + */ +#undef os_atomic_make_dependency +#define os_atomic_make_dependency(v) ({ \ + os_atomic_dependency_t _dep; \ + __asm__ __volatile__("and %[_dep], %[_v], xzr" \ + : [_dep] "=r" (_dep.__opaque_zero) : [_v] "r" (v)); \ + os_compiler_barrier(acquire); \ + _dep; \ +}) + +#if OS_ATOMIC_USE_LLSC + +/* + * arm64 (without armv81 atomics) override of os_atomic_rmw_loop + * documentation for os_atomic_rmw_loop is in + */ #undef os_atomic_rmw_loop #define os_atomic_rmw_loop(p, ov, nv, m, ...) ({ \ - boolean_t _result = FALSE; \ - typeof(atomic_load(p)) *_p = (typeof(atomic_load(p)) *)(p); \ + int _result = 0; \ + _os_atomic_basetypeof(p) *_p; \ + _p = (_os_atomic_basetypeof(p) *)(p); \ + _os_compiler_barrier_before_atomic(m); \ do { \ - if (memory_order_has_acquire(memory_order_##m)) { \ + if (memory_order_has_acquire(memory_order_##m##_smp)) { \ ov = __builtin_arm_ldaex(_p); \ } else { \ ov = __builtin_arm_ldrex(_p); \ } \ __VA_ARGS__; \ - if (memory_order_has_release(memory_order_##m)) { \ + if (memory_order_has_release(memory_order_##m##_smp)) { \ _result = !__builtin_arm_stlex(nv, _p); \ } else { \ _result = !__builtin_arm_strex(nv, _p); \ } \ } while (__builtin_expect(!_result, 0)); \ + _os_compiler_barrier_after_atomic(m); \ _result; \ }) +/* + * arm64 override of os_atomic_rmw_loop_give_up + * documentation for os_atomic_rmw_loop_give_up is in + */ #undef os_atomic_rmw_loop_give_up -#define os_atomic_rmw_loop_give_up(expr) \ - ({ __builtin_arm_clrex(); expr; __builtin_trap(); }) -#endif +#define os_atomic_rmw_loop_give_up(...) \ + ({ os_atomic_clear_exclusive(); __VA_ARGS__; break; }) -#undef os_atomic_force_dependency_on -#if defined(__arm64__) -#define os_atomic_force_dependency_on(p, e) ({ \ - unsigned long _v; \ - __asm__("and %x[_v], %x[_e], xzr" : [_v] "=r" (_v) : [_e] "r" (e)); \ - (typeof(*(p)) *)((char *)(p) + _v); \ - }) -#else -#define os_atomic_force_dependency_on(p, e) ({ \ - unsigned long _v; \ - __asm__("and %[_v], %[_e], #0" : [_v] "=r" (_v) : [_e] "r" (e)); \ - (typeof(*(p)) *)((char *)(p) + _v); \ - }) -#endif // defined(__arm64__) +#endif // OS_ATOMIC_USE_LLSC + +#endif // __arm64__ #endif // _ARM_ATOMIC_H_ diff --git a/osfmk/arm/caches.c b/osfmk/arm/caches.c index f76a19edf..e5e64cff7 100644 --- a/osfmk/arm/caches.c +++ b/osfmk/arm/caches.c @@ -64,44 +64,30 @@ flush_dcache( boolean_t phys) { cpu_data_t *cpu_data_ptr = getCpuDatap(); - - if (phys) { - pmap_paddr_t paddr; - vm_offset_t vaddr; - - paddr = CAST_DOWN(pmap_paddr_t, addr); - if (!isphysmem(paddr)) { - return; - } - vaddr = phystokv(paddr); - FlushPoC_DcacheRegion((vm_offset_t) vaddr, length); - - if (cpu_data_ptr->cpu_cache_dispatch != (cache_dispatch_t) NULL) { - ((cache_dispatch_t) cpu_data_ptr->cpu_cache_dispatch)( - cpu_data_ptr->cpu_id, CacheCleanFlushRegion, (unsigned int) paddr, length); - } - return; - } - if (cpu_data_ptr->cpu_cache_dispatch == (cache_dispatch_t) NULL) { - FlushPoC_DcacheRegion((vm_offset_t) addr, length); - } else { - addr64_t paddr; - uint32_t count; - - while (length > 0) { + vm_offset_t vaddr; + addr64_t paddr; + vm_size_t count; + + while (length > 0) { + if (phys) { + count = length; + paddr = CAST_DOWN(pmap_paddr_t, addr); + vaddr = phystokv_range(paddr, &count); + } else { + paddr = kvtophys(addr); + vaddr = addr; count = PAGE_SIZE - (addr & PAGE_MASK); if (count > length) { count = length; } - FlushPoC_DcacheRegion((vm_offset_t) addr, count); - paddr = kvtophys(addr); - if (paddr) { - ((cache_dispatch_t) cpu_data_ptr->cpu_cache_dispatch)( - cpu_data_ptr->cpu_id, CacheCleanFlushRegion, (unsigned int) paddr, count); - } - addr += count; - length -= count; } + FlushPoC_DcacheRegion(vaddr, (unsigned)count); + if (paddr && (cpu_data_ptr->cpu_cache_dispatch != NULL)) { + ((cache_dispatch_t) cpu_data_ptr->cpu_cache_dispatch)( + cpu_data_ptr->cpu_id, CacheCleanFlushRegion, (unsigned int) paddr, (unsigned)count); + } + addr += count; + length -= count; } return; } @@ -113,46 +99,30 @@ clean_dcache( boolean_t phys) { cpu_data_t *cpu_data_ptr = getCpuDatap(); - - if (phys) { - pmap_paddr_t paddr; - vm_offset_t vaddr; - - paddr = CAST_DOWN(pmap_paddr_t, addr); - if (!isphysmem(paddr)) { - return; - } - - vaddr = phystokv(paddr); - CleanPoC_DcacheRegion((vm_offset_t) vaddr, length); - - if (cpu_data_ptr->cpu_cache_dispatch != (cache_dispatch_t) NULL) { - ((cache_dispatch_t) cpu_data_ptr->cpu_cache_dispatch)( - cpu_data_ptr->cpu_id, CacheCleanRegion, (unsigned int) paddr, length); - } - return; - } - - if (cpu_data_ptr->cpu_cache_dispatch == (cache_dispatch_t) NULL) { - CleanPoC_DcacheRegion((vm_offset_t) addr, length); - } else { - addr64_t paddr; - uint32_t count; - - while (length > 0) { + vm_offset_t vaddr; + addr64_t paddr; + vm_size_t count; + + while (length > 0) { + if (phys) { + count = length; + paddr = CAST_DOWN(pmap_paddr_t, addr); + vaddr = phystokv_range(paddr, &count); + } else { + paddr = kvtophys(addr); + vaddr = addr; count = PAGE_SIZE - (addr & PAGE_MASK); if (count > length) { count = length; } - CleanPoC_DcacheRegion((vm_offset_t) addr, count); - paddr = kvtophys(addr); - if (paddr) { - ((cache_dispatch_t) cpu_data_ptr->cpu_cache_dispatch)( - cpu_data_ptr->cpu_id, CacheCleanRegion, (unsigned int) paddr, count); - } - addr += count; - length -= count; } + CleanPoC_DcacheRegion(vaddr, (unsigned)count); + if (paddr && (cpu_data_ptr->cpu_cache_dispatch != NULL)) { + ((cache_dispatch_t) cpu_data_ptr->cpu_cache_dispatch)( + cpu_data_ptr->cpu_id, CacheCleanRegion, (unsigned int) paddr, (unsigned)count); + } + addr += count; + length -= count; } return; } @@ -184,8 +154,6 @@ dcache_incoherent_io_flush64( unsigned int remaining, unsigned int *res) { - vm_offset_t vaddr; - pmap_paddr_t paddr = CAST_DOWN(pmap_paddr_t, pa); cpu_data_t *cpu_data_ptr = getCpuDatap(); if ((cache_info()->c_bulksize_op != 0) && (remaining >= (cache_info()->c_bulksize_op))) { @@ -199,41 +167,38 @@ dcache_incoherent_io_flush64( #endif *res = BWOpDone; } else { - if (isphysmem(paddr)) { - vaddr = phystokv(pa); - { - FlushPoC_DcacheRegion((vm_offset_t) vaddr, size); - - if (cpu_data_ptr->cpu_cache_dispatch != (cache_dispatch_t) NULL) { - ((cache_dispatch_t) cpu_data_ptr->cpu_cache_dispatch)(cpu_data_ptr->cpu_id, CacheCleanFlushRegion, (unsigned int) pa, size); - } - } - } else { - /* slow path - pa isn't in the vtop region. Flush one page at a time via cpu_copywindows */ - unsigned int wimg_bits, index; - uint32_t count; - - mp_disable_preemption(); - - while (size > 0) { + vm_offset_t vaddr; + pmap_paddr_t paddr = CAST_DOWN(pmap_paddr_t, pa); + vm_size_t count; + unsigned int wimg_bits, index; + + while (size > 0) { + if (isphysmem(paddr)) { + count = size; + vaddr = phystokv_range(paddr, &count); + } else { count = PAGE_SIZE - (paddr & PAGE_MASK); if (count > size) { count = size; } wimg_bits = pmap_cache_attributes((ppnum_t) (paddr >> PAGE_SHIFT)); + mp_disable_preemption(); index = pmap_map_cpu_windows_copy((ppnum_t) (paddr >> PAGE_SHIFT), VM_PROT_READ | VM_PROT_WRITE, wimg_bits); vaddr = pmap_cpu_windows_copy_addr(cpu_number(), index) | (paddr & PAGE_MASK); - - CleanPoC_DcacheRegion((vm_offset_t) vaddr, count); - + } + FlushPoC_DcacheRegion(vaddr, (unsigned)count); + if (isphysmem(paddr)) { + if (cpu_data_ptr->cpu_cache_dispatch != NULL) { + ((cache_dispatch_t) cpu_data_ptr->cpu_cache_dispatch)( + cpu_data_ptr->cpu_id, CacheCleanFlushRegion, (unsigned int) paddr, (unsigned)count); + } + } else { pmap_unmap_cpu_windows_copy(index); - - paddr += count; - size -= count; + mp_enable_preemption(); } - - mp_enable_preemption(); + paddr += count; + size -= count; } } @@ -247,13 +212,12 @@ dcache_incoherent_io_store64( unsigned int remaining, unsigned int *res) { - vm_offset_t vaddr; pmap_paddr_t paddr = CAST_DOWN(pmap_paddr_t, pa); cpu_data_t *cpu_data_ptr = getCpuDatap(); if (isphysmem(paddr)) { unsigned int wimg_bits = pmap_cache_attributes((ppnum_t) (paddr >> PAGE_SHIFT)); - if ((wimg_bits == VM_WIMG_IO) || (wimg_bits == VM_WIMG_WCOMB)) { + if ((wimg_bits == VM_WIMG_IO) || (wimg_bits == VM_WIMG_WCOMB) || (wimg_bits == VM_WIMG_RT)) { return; } } @@ -272,41 +236,36 @@ dcache_incoherent_io_store64( #endif *res = BWOpDone; } else { - if (isphysmem(paddr)) { - vaddr = phystokv(pa); - { - CleanPoC_DcacheRegion((vm_offset_t) vaddr, size); - - if (cpu_data_ptr->cpu_cache_dispatch != (cache_dispatch_t) NULL) { - ((cache_dispatch_t) cpu_data_ptr->cpu_cache_dispatch)(cpu_data_ptr->cpu_id, CacheCleanRegion, (unsigned int) pa, size); - } - } - } else { - /* slow path - pa isn't in the vtop region. Flush one page at a time via cpu_copywindows */ - unsigned int wimg_bits, index; - uint32_t count; - - mp_disable_preemption(); - - while (size > 0) { + vm_offset_t vaddr; + vm_size_t count; + unsigned int wimg_bits, index; + + while (size > 0) { + if (isphysmem(paddr)) { + count = size; + vaddr = phystokv_range(paddr, &count); + } else { count = PAGE_SIZE - (paddr & PAGE_MASK); if (count > size) { count = size; } - wimg_bits = pmap_cache_attributes((ppnum_t) (paddr >> PAGE_SHIFT)); + mp_disable_preemption(); index = pmap_map_cpu_windows_copy((ppnum_t) (paddr >> PAGE_SHIFT), VM_PROT_READ | VM_PROT_WRITE, wimg_bits); vaddr = pmap_cpu_windows_copy_addr(cpu_number(), index) | (paddr & PAGE_MASK); - - CleanPoC_DcacheRegion((vm_offset_t) vaddr, count); - + } + CleanPoC_DcacheRegion(vaddr, (unsigned)count); + if (isphysmem(paddr)) { + if (cpu_data_ptr->cpu_cache_dispatch != NULL) { + ((cache_dispatch_t) cpu_data_ptr->cpu_cache_dispatch)( + cpu_data_ptr->cpu_id, CacheCleanRegion, (unsigned int) paddr, (unsigned)count); + } + } else { pmap_unmap_cpu_windows_copy(index); - - paddr += count; - size -= count; + mp_enable_preemption(); } - - mp_enable_preemption(); + paddr += count; + size -= count; } } @@ -322,13 +281,7 @@ cache_sync_page( if (isphysmem(paddr)) { vm_offset_t vaddr = phystokv(paddr); - - CleanPoU_DcacheRegion(vaddr, PAGE_SIZE); -#ifdef __ARM_IC_NOALIAS_ICACHE__ InvalidatePoU_IcacheRegion(vaddr, PAGE_SIZE); -#else - InvalidatePoU_Icache(); -#endif } else { FlushPoC_Dcache(); InvalidatePoU_Icache(); @@ -406,14 +359,10 @@ platform_cache_disable(void) uint32_t sctlr_value = 0; /* Disable dcache allocation. */ - __asm__ volatile ("mrc p15, 0, %0, c1, c0, 0" - : "=r"(sctlr_value)); - + sctlr_value = __builtin_arm_mrc(MRC_SCTLR); sctlr_value &= ~SCTLR_DCACHE; - - __asm__ volatile ("mcr p15, 0, %0, c1, c0, 0\n" - "isb" - :: "r"(sctlr_value)); + __builtin_arm_mcr(MCR_SCTLR(sctlr_value)); + __builtin_arm_isb(ISB_SY); #endif /* (__ARM_ARCH__ < 8) */ } @@ -421,7 +370,7 @@ void platform_cache_idle_enter( void) { -#if __ARM_SMP__ +#if __ARM_SMP__ platform_cache_disable(); /* @@ -438,42 +387,39 @@ platform_cache_idle_enter( #if (__ARM_ARCH__ < 8) cpu_data_t *cpu_data_ptr = getCpuDatap(); cpu_data_ptr->cpu_CLW_active = 0; - __asm__ volatile ("dmb ish"); + __builtin_arm_dmb(DMB_ISH); cpu_data_ptr->cpu_CLWFlush_req = 0; cpu_data_ptr->cpu_CLWClean_req = 0; CleanPoC_DcacheRegion((vm_offset_t) cpu_data_ptr, sizeof(cpu_data_t)); #endif /* (__ARM_ARCH__ < 8) */ } -#else +#else /* !__ARM_SMP__ */ CleanPoU_Dcache(); -#endif +#endif /* !__ARM_SMP__ */ -#if defined (__ARM_SMP__) && defined (ARMA7) +#if defined(__ARM_SMP__) && defined(ARMA7) uint32_t actlr_value = 0; /* Leave the coherency domain */ - __asm__ volatile ("clrex\n" - "mrc p15, 0, %0, c1, c0, 1\n" - : "=r"(actlr_value)); - + __builtin_arm_clrex(); + actlr_value = __builtin_arm_mrc(MRC_ACTLR); actlr_value &= ~0x40; - __asm__ volatile ("mcr p15, 0, %0, c1, c0, 1\n" - /* Ensures any pending fwd request gets serviced and ends up */ - "dsb\n" - /* Forces the processor to re-fetch, so any pending fwd request gets into the core */ - "isb\n" - /* Ensures the second possible pending fwd request ends up. */ - "dsb\n" - :: "r"(actlr_value)); -#endif + __builtin_arm_mcr(MCR_ACTLR(actlr_value)); + /* Ensures any pending fwd request gets serviced and ends up */ + __builtin_arm_dsb(DSB_SY); + /* Forces the processor to re-fetch, so any pending fwd request gets into the core */ + __builtin_arm_isb(ISB_SY); + /* Ensures the second possible pending fwd request ends up. */ + __builtin_arm_dsb(DSB_SY); +#endif /* defined(__ARM_SMP__) && defined(ARMA7) */ } void platform_cache_idle_exit( void) { -#if defined (ARMA7) +#if defined(ARMA7) uint32_t actlr_value = 0; /* Flush L1 caches and TLB before rejoining the coherency domain */ @@ -491,30 +437,22 @@ platform_cache_idle_exit( } /* Rejoin the coherency domain */ - __asm__ volatile ("mrc p15, 0, %0, c1, c0, 1\n" - : "=r"(actlr_value)); - + actlr_value = __builtin_arm_mrc(MRC_ACTLR); actlr_value |= 0x40; - - __asm__ volatile ("mcr p15, 0, %0, c1, c0, 1\n" - "isb\n" - :: "r"(actlr_value)); + __builtin_arm_mcr(MCR_ACTLR(actlr_value)); + __builtin_arm_isb(ISB_SY); #if __ARM_SMP__ uint32_t sctlr_value = 0; /* Enable dcache allocation. */ - __asm__ volatile ("mrc p15, 0, %0, c1, c0, 0\n" - : "=r"(sctlr_value)); - + sctlr_value = __builtin_arm_mrc(MRC_SCTLR); sctlr_value |= SCTLR_DCACHE; - - __asm__ volatile ("mcr p15, 0, %0, c1, c0, 0\n" - "isb" - :: "r"(sctlr_value)); + __builtin_arm_mcr(MCR_SCTLR(sctlr_value)); + __builtin_arm_isb(ISB_SY); getCpuDatap()->cpu_CLW_active = 1; -#endif -#endif +#endif /* __ARM_SMP__ */ +#endif /* defined(ARMA7) */ } boolean_t @@ -603,7 +541,7 @@ cache_xcall(unsigned int op) } else if (op == LWClean) { target_cdp->cpu_CLWClean_req = abstime; } - __asm__ volatile ("dmb ish"); + __builtin_arm_dmb(DMB_ISH); if (target_cdp->cpu_CLW_active == 0) { if (op == LWFlush) { target_cdp->cpu_CLWFlush_req = 0x0ULL; @@ -675,7 +613,7 @@ flush_dcache( __unused unsigned length, __unused boolean_t phys) { - __asm__ volatile ("dsb sy"); + __builtin_arm_dsb(DSB_SY); } void @@ -684,7 +622,7 @@ clean_dcache( __unused unsigned length, __unused boolean_t phys) { - __asm__ volatile ("dsb sy"); + __builtin_arm_dsb(DSB_SY); } void @@ -692,7 +630,7 @@ flush_dcache_syscall( __unused vm_offset_t va, __unused unsigned length) { - __asm__ volatile ("dsb sy"); + __builtin_arm_dsb(DSB_SY); } void @@ -702,7 +640,7 @@ dcache_incoherent_io_flush64( __unused unsigned int remaining, __unused unsigned int *res) { - __asm__ volatile ("dsb sy"); + __builtin_arm_dsb(DSB_SY); *res = LWOpDone; return; } @@ -714,7 +652,7 @@ dcache_incoherent_io_store64( __unused unsigned int remaining, __unused unsigned int *res) { - __asm__ volatile ("dsb sy"); + __builtin_arm_dsb(DSB_SY); *res = LWOpDone; return; } @@ -728,12 +666,7 @@ cache_sync_page( if (isphysmem(paddr)) { vm_offset_t vaddr = phystokv(paddr); - -#ifdef __ARM_IC_NOALIAS_ICACHE__ InvalidatePoU_IcacheRegion(vaddr, PAGE_SIZE); -#else - InvalidatePoU_Icache(); -#endif } } diff --git a/osfmk/arm/caches_asm.s b/osfmk/arm/caches_asm.s index b4e6a94c8..0b305f48f 100644 --- a/osfmk/arm/caches_asm.s +++ b/osfmk/arm/caches_asm.s @@ -43,7 +43,10 @@ .globl EXT(invalidate_mmu_cache) LEXT(invalidate_mmu_cache) mov r0, #0 + dsb mcr p15, 0, r0, c7, c7, 0 // Invalidate caches + dsb + isb bx lr /* @@ -56,7 +59,9 @@ LEXT(invalidate_mmu_cache) .globl EXT(invalidate_mmu_dcache) LEXT(invalidate_mmu_dcache) mov r0, #0 + dsb mcr p15, 0, r0, c7, c6, 0 // Invalidate dcache + dsb bx lr /* @@ -73,12 +78,13 @@ LEXT(invalidate_mmu_dcache_region) add r1, r1, r2 sub r1, r1, #1 mov r1, r1, LSR #MMU_CLINE // Set cache line counter + dsb fmdr_loop: mcr p15, 0, r0, c7, c14, 1 // Invalidate dcache line add r0, r0, #1< #include #include +#include #include #include #include @@ -70,6 +71,7 @@ SECURITY_READ_ONLY_LATE(uint32_t) _cpu_capabilities = 0; /* For sysctl access from BSD side */ extern int gARMv81Atomics; extern int gARMv8Crc32; +extern int gARMv82FHM; void commpage_populate( @@ -81,6 +83,14 @@ commpage_populate( sharedpage_rw_addr = pmap_create_sharedpage(); commPagePtr = (vm_address_t)_COMM_PAGE_BASE_ADDRESS; +#if __arm64__ + bcopy(_COMM_PAGE64_SIGNATURE_STRING, (void *)(_COMM_PAGE_SIGNATURE + _COMM_PAGE_RW_OFFSET), + MIN(_COMM_PAGE_SIGNATURELEN, strlen(_COMM_PAGE64_SIGNATURE_STRING))); +#else + bcopy(_COMM_PAGE32_SIGNATURE_STRING, (void *)(_COMM_PAGE_SIGNATURE + _COMM_PAGE_RW_OFFSET), + MIN(_COMM_PAGE_SIGNATURELEN, strlen(_COMM_PAGE32_SIGNATURE_STRING))); +#endif + *((uint16_t*)(_COMM_PAGE_VERSION + _COMM_PAGE_RW_OFFSET)) = (uint16_t) _COMM_PAGE_THIS_VERSION; commpage_init_cpu_capabilities(); @@ -108,14 +118,14 @@ commpage_populate( *((uint64_t*)(_COMM_PAGE_MEMORY_SIZE + _COMM_PAGE_RW_OFFSET)) = machine_info.max_mem; *((uint32_t*)(_COMM_PAGE_CPUFAMILY + _COMM_PAGE_RW_OFFSET)) = (uint32_t)cpufamily; *((uint32_t*)(_COMM_PAGE_DEV_FIRM + _COMM_PAGE_RW_OFFSET)) = (uint32_t)PE_i_can_has_debugger(NULL); - *((uint8_t*)(_COMM_PAGE_USER_TIMEBASE + _COMM_PAGE_RW_OFFSET)) = user_timebase_allowed(); + *((uint8_t*)(_COMM_PAGE_USER_TIMEBASE + _COMM_PAGE_RW_OFFSET)) = user_timebase_type(); *((uint8_t*)(_COMM_PAGE_CONT_HWCLOCK + _COMM_PAGE_RW_OFFSET)) = user_cont_hwclock_allowed(); *((uint8_t*)(_COMM_PAGE_KERNEL_PAGE_SHIFT + _COMM_PAGE_RW_OFFSET)) = (uint8_t) page_shift; #if __arm64__ *((uint8_t*)(_COMM_PAGE_USER_PAGE_SHIFT_32 + _COMM_PAGE_RW_OFFSET)) = (uint8_t) page_shift_user32; *((uint8_t*)(_COMM_PAGE_USER_PAGE_SHIFT_64 + _COMM_PAGE_RW_OFFSET)) = (uint8_t) SIXTEENK_PAGE_SHIFT; -#elif (__ARM_ARCH_7K__ >= 2) && defined(PLATFORM_WatchOS) +#elif (__ARM_ARCH_7K__ >= 2) /* enforce 16KB alignment for watch targets with new ABI */ *((uint8_t*)(_COMM_PAGE_USER_PAGE_SHIFT_32 + _COMM_PAGE_RW_OFFSET)) = (uint8_t) SIXTEENK_PAGE_SHIFT; *((uint8_t*)(_COMM_PAGE_USER_PAGE_SHIFT_64 + _COMM_PAGE_RW_OFFSET)) = (uint8_t) SIXTEENK_PAGE_SHIFT; @@ -243,6 +253,12 @@ commpage_cpus( void ) return cpus; } +int +_get_cpu_capabilities(void) +{ + return _cpu_capabilities; +} + vm_address_t _get_commpage_priv_address(void) { @@ -323,7 +339,15 @@ commpage_init_cpu_capabilities( void ) bits |= kHasARMv8Crc32; gARMv8Crc32 = 1; } + if ((isar0 & ID_AA64ISAR0_EL1_FHM_MASK) >= ID_AA64ISAR0_EL1_FHM_8_2) { + bits |= kHasARMv82FHM; + gARMv82FHM = 1; + } #endif + + + + _cpu_capabilities = bits; *((uint32_t *)(_COMM_PAGE_CPU_CAPABILITIES + _COMM_PAGE_RW_OFFSET)) = _cpu_capabilities; @@ -513,10 +537,32 @@ commpage_increment_cpu_quiescent_counter(void) * the cpu mask, relaxed loads and stores is more efficient. */ #if __LP64__ - old_gen = atomic_load_explicit(sched_gen, memory_order_relaxed); - atomic_store_explicit(sched_gen, old_gen + 1, memory_order_relaxed); + old_gen = os_atomic_load(sched_gen, relaxed); + os_atomic_store(sched_gen, old_gen + 1, relaxed); #else old_gen = atomic_fetch_add_explicit(sched_gen, 1, memory_order_relaxed); #endif return old_gen; } + +/* + * update the commpage with if dtrace user land probes are enabled + */ +void +commpage_update_dof(boolean_t enabled) +{ +#if CONFIG_DTRACE + *((uint8_t*)(_COMM_PAGE_DTRACE_DOF_ENABLED + _COMM_PAGE_RW_OFFSET)) = (enabled ? 1 : 0); +#else + (void)enabled; +#endif +} + +/* + * update the dyld global config flags + */ +void +commpage_update_dyld_flags(uint64_t value) +{ + *((uint64_t*)(_COMM_PAGE_DYLD_SYSTEM_FLAGS + _COMM_PAGE_RW_OFFSET)) = value; +} diff --git a/osfmk/arm/commpage/commpage.h b/osfmk/arm/commpage/commpage.h index 6eeb63799..ee124d4b1 100644 --- a/osfmk/arm/commpage/commpage.h +++ b/osfmk/arm/commpage/commpage.h @@ -47,5 +47,7 @@ extern void commpage_update_multiuser_config(uint32_t); extern void commpage_update_boottime(uint64_t boottime_usec); extern void commpage_set_remotetime_params(double rate, uint64_t base_local_ts, uint64_t base_remote_ts); extern uint64_t commpage_increment_cpu_quiescent_counter(void); +extern void commpage_update_dof(boolean_t enabled); +extern void commpage_update_dyld_flags(uint64_t value); #endif /* _ARM_COMMPAGE_H */ diff --git a/osfmk/arm/cpu.c b/osfmk/arm/cpu.c index 4109f698e..72e8c7800 100644 --- a/osfmk/arm/cpu.c +++ b/osfmk/arm/cpu.c @@ -305,11 +305,17 @@ cpu_stack_alloc(cpu_data_t *cpu_data_ptr) void cpu_data_free(cpu_data_t *cpu_data_ptr) { - if (cpu_data_ptr == &BootCpuData) { + if ((cpu_data_ptr == NULL) || (cpu_data_ptr == &BootCpuData)) { return; } cpu_processor_free( cpu_data_ptr->cpu_processor); + if (CpuDataEntries[cpu_data_ptr->cpu_number].cpu_data_vaddr == cpu_data_ptr) { + OSDecrementAtomic((SInt32*)&real_ncpus); + CpuDataEntries[cpu_data_ptr->cpu_number].cpu_data_vaddr = NULL; + CpuDataEntries[cpu_data_ptr->cpu_number].cpu_data_paddr = 0; + __builtin_arm_dmb(DMB_ISH); // Ensure prior stores to cpu array are visible + } (kfree)((void *)(cpu_data_ptr->intstack_top - INTSTACK_SIZE), INTSTACK_SIZE); (kfree)((void *)(cpu_data_ptr->fiqstack_top - FIQSTACK_SIZE), FIQSTACK_SIZE); kmem_free(kernel_map, (vm_offset_t)cpu_data_ptr, sizeof(cpu_data_t)); @@ -351,12 +357,6 @@ cpu_data_init(cpu_data_t *cpu_data_ptr) cpu_data_ptr->cpu_signal = SIGPdisabled; -#if DEBUG || DEVELOPMENT - cpu_data_ptr->failed_xcall = NULL; - cpu_data_ptr->failed_signal = 0; - cpu_data_ptr->failed_signal_count = 0; -#endif - cpu_data_ptr->cpu_get_fiq_handler = NULL; cpu_data_ptr->cpu_tbd_hardware_addr = NULL; cpu_data_ptr->cpu_tbd_hardware_val = NULL; @@ -366,6 +366,8 @@ cpu_data_init(cpu_data_t *cpu_data_ptr) cpu_data_ptr->cpu_sleep_token_last = 0x00000000UL; cpu_data_ptr->cpu_xcall_p0 = NULL; cpu_data_ptr->cpu_xcall_p1 = NULL; + cpu_data_ptr->cpu_imm_xcall_p0 = NULL; + cpu_data_ptr->cpu_imm_xcall_p1 = NULL; #if __ARM_SMP__ && defined(ARMA7) cpu_data_ptr->cpu_CLWFlush_req = 0x0ULL; @@ -398,6 +400,7 @@ cpu_data_register(cpu_data_t *cpu_data_ptr) } cpu_data_ptr->cpu_number = cpu; + __builtin_arm_dmb(DMB_ISH); // Ensure prior stores to cpu data are visible CpuDataEntries[cpu].cpu_data_vaddr = cpu_data_ptr; CpuDataEntries[cpu].cpu_data_paddr = (void *)ml_vtophys((vm_offset_t)cpu_data_ptr); return KERN_SUCCESS; @@ -420,8 +423,8 @@ cpu_start(int cpu) cpu_data_ptr->cpu_pmap_cpu_data.cpu_user_pmap = NULL; - if (cpu_data_ptr->cpu_processor->next_thread != THREAD_NULL) { - first_thread = cpu_data_ptr->cpu_processor->next_thread; + if (cpu_data_ptr->cpu_processor->startup_thread != THREAD_NULL) { + first_thread = cpu_data_ptr->cpu_processor->startup_thread; } else { first_thread = cpu_data_ptr->cpu_processor->idle_thread; } @@ -594,8 +597,8 @@ void machine_track_platform_idle(boolean_t entry) { if (entry) { - (void)__c11_atomic_fetch_add(&cpu_idle_count, 1, __ATOMIC_RELAXED); + os_atomic_inc(&cpu_idle_count, relaxed); } else { - (void)__c11_atomic_fetch_sub(&cpu_idle_count, 1, __ATOMIC_RELAXED); + os_atomic_dec(&cpu_idle_count, relaxed); } } diff --git a/osfmk/arm/cpu_capabilities.h b/osfmk/arm/cpu_capabilities.h index c43156ceb..b0f2b3fda 100644 --- a/osfmk/arm/cpu_capabilities.h +++ b/osfmk/arm/cpu_capabilities.h @@ -35,6 +35,9 @@ #include #endif +#define USER_TIMEBASE_NONE 0 +#define USER_TIMEBASE_SPEC 1 + /* * This is the authoritative way to determine from user mode what * implementation-specific processor features are available. @@ -45,6 +48,8 @@ /* * Bit definitions for _cpu_capabilities: */ +#define kHasICDSBShift 2 +#define kHasICDSB 0x00000004 // ICache Data Syncronization on DSB enabled (H13) #define kHasNeonFP16 0x00000008 // ARM v8.2 NEON FP16 supported #define kCache32 0x00000010 // cache line size is 32 bytes #define kCache64 0x00000020 // cache line size is 64 bytes @@ -53,16 +58,17 @@ #define kHasNeon 0x00000100 // Advanced SIMD is supported #define kHasNeonHPFP 0x00000200 // Advanced SIMD half-precision #define kHasVfp 0x00000400 // VFP is supported +#define kHasUCNormalMemory 0x00000800 // Uncacheable normal memory type supported #define kHasEvent 0x00001000 // WFE/SVE and period event wakeup #define kHasFMA 0x00002000 // Fused multiply add is supported +#define kHasARMv82FHM 0x00004000 // Optional ARMv8.2 FMLAL/FMLSL instructions (required in ARMv8.4) #define kUP 0x00008000 // set if (kNumCPUs == 1) #define kNumCPUs 0x00FF0000 // number of CPUs (see _NumCPUs() below) #define kHasARMv8Crypto 0x01000000 // Optional ARMv8 Crypto extensions #define kHasARMv81Atomics 0x02000000 // ARMv8.1 Atomic instructions supported #define kHasARMv8Crc32 0x04000000 // Optional ARMv8 crc32 instructions (required in ARMv8.1) -#define kNumCPUsShift 16 // see _NumCPUs() below - +#define kNumCPUsShift 16 // see _NumCPUs() below /* * Bit definitions for multiuser_config: */ @@ -72,7 +78,9 @@ #ifndef __ASSEMBLER__ #include +__BEGIN_DECLS extern int _get_cpu_capabilities( void ); +__END_DECLS __inline static int @@ -81,6 +89,7 @@ _NumCPUs( void ) return (_get_cpu_capabilities() & kNumCPUs) >> kNumCPUsShift; } + typedef struct { volatile uint64_t TimeBase; volatile uint32_t TimeStamp_sec; @@ -92,7 +101,9 @@ typedef struct { volatile uint32_t TimeBase_shift; } commpage_timeofday_data_t; +__BEGIN_DECLS extern vm_address_t _get_commpage_priv_address(void); +__END_DECLS #endif /* __ASSEMBLER__ */ @@ -166,6 +177,7 @@ extern vm_address_t _get_commpage_priv_address(void) * apply _COMM_PAGE_PRIV macro to use these in privileged mode */ #define _COMM_PAGE_SIGNATURE (_COMM_PAGE_START_ADDRESS+0x000) // first few bytes are a signature +#define _COMM_PAGE_SIGNATURELEN (0x10) #define _COMM_PAGE_VERSION (_COMM_PAGE_START_ADDRESS+0x01E) // 16-bit version# #define _COMM_PAGE_THIS_VERSION 3 // version of the commarea format @@ -188,7 +200,8 @@ extern vm_address_t _get_commpage_priv_address(void) #define _COMM_PAGE_TIMEBASE_OFFSET (_COMM_PAGE_START_ADDRESS+0x088) // uint64_t timebase offset for constructing mach_absolute_time() #define _COMM_PAGE_USER_TIMEBASE (_COMM_PAGE_START_ADDRESS+0x090) // uint8_t is userspace mach_absolute_time supported (can read the timebase) #define _COMM_PAGE_CONT_HWCLOCK (_COMM_PAGE_START_ADDRESS+0x091) // uint8_t is always-on hardware clock present for mach_continuous_time() -#define _COMM_PAGE_UNUSED0 (_COMM_PAGE_START_ADDRESS+0x092) // 6 unused bytes +#define _COMM_PAGE_DTRACE_DOF_ENABLED (_COMM_PAGE_START_ADDRESS+0x092) // uint8_t 0 if userspace DOF disable, 1 if enabled +#define _COMM_PAGE_UNUSED0 (_COMM_PAGE_START_ADDRESS+0x093) // 5 unused bytes #define _COMM_PAGE_CONT_TIMEBASE (_COMM_PAGE_START_ADDRESS+0x098) // uint64_t base for mach_continuous_time() #define _COMM_PAGE_BOOTTIME_USEC (_COMM_PAGE_START_ADDRESS+0x0A0) // uint64_t boottime in microseconds @@ -204,6 +217,7 @@ extern vm_address_t _get_commpage_priv_address(void) #define _COMM_PAGE_NEWTIMEOFDAY_DATA (_COMM_PAGE_START_ADDRESS+0x120) // used by gettimeofday(). Currently, sizeof(new_commpage_timeofday_data_t) = 40. #define _COMM_PAGE_REMOTETIME_PARAMS (_COMM_PAGE_START_ADDRESS+0x148) // used by mach_bridge_remote_time(). Currently, sizeof(struct bt_params) = 24 +#define _COMM_PAGE_DYLD_SYSTEM_FLAGS (_COMM_PAGE_START_ADDRESS+0x160) // uint64_t export kern.dyld_system_flags to userspace // aligning to 128 bytes for cacheline/fabric size #define _COMM_PAGE_CPU_QUIESCENT_COUNTER (_COMM_PAGE_START_ADDRESS+0x180) // uint64_t, but reserve the whole 128 (0x80) bytes diff --git a/osfmk/arm/cpu_common.c b/osfmk/arm/cpu_common.c index 85f1cf13b..327434ece 100644 --- a/osfmk/arm/cpu_common.c +++ b/osfmk/arm/cpu_common.c @@ -150,7 +150,9 @@ cpu_info(processor_flavor_t flavor, int slot_num, processor_info_t info, cpu_stat->vfp_shortv_cnt = 0; cpu_stat->data_ex_cnt = cpu_data_ptr->cpu_stat.data_ex_cnt; cpu_stat->instr_ex_cnt = cpu_data_ptr->cpu_stat.instr_ex_cnt; - cpu_stat->pmi_cnt = cpu_data_ptr->cpu_stat.pmi_cnt; +#if MONOTONIC + cpu_stat->pmi_cnt = cpu_data_ptr->cpu_monotonic.mtc_npmis; +#endif /* MONOTONIC */ *count = PROCESSOR_CPU_STAT64_COUNT; @@ -207,7 +209,7 @@ cpu_handle_xcall(cpu_data_t *cpu_data_ptr) broadcastFunc xfunc; void *xparam; - __c11_atomic_thread_fence(memory_order_acquire_smp); + os_atomic_thread_fence(acquire); /* Come back around if cpu_signal_internal is running on another CPU and has just * added SIGPxcall to the pending mask, but hasn't yet assigned the call params.*/ if (cpu_data_ptr->cpu_xcall_p0 != NULL && cpu_data_ptr->cpu_xcall_p1 != NULL) { @@ -215,14 +217,24 @@ cpu_handle_xcall(cpu_data_t *cpu_data_ptr) xparam = cpu_data_ptr->cpu_xcall_p1; cpu_data_ptr->cpu_xcall_p0 = NULL; cpu_data_ptr->cpu_xcall_p1 = NULL; - __c11_atomic_thread_fence(memory_order_acq_rel_smp); - hw_atomic_and_noret(&cpu_data_ptr->cpu_signal, ~SIGPxcall); + os_atomic_thread_fence(acq_rel); + os_atomic_andnot(&cpu_data_ptr->cpu_signal, SIGPxcall, relaxed); + xfunc(xparam); + } + if (cpu_data_ptr->cpu_imm_xcall_p0 != NULL && cpu_data_ptr->cpu_imm_xcall_p1 != NULL) { + xfunc = cpu_data_ptr->cpu_imm_xcall_p0; + xparam = cpu_data_ptr->cpu_imm_xcall_p1; + cpu_data_ptr->cpu_imm_xcall_p0 = NULL; + cpu_data_ptr->cpu_imm_xcall_p1 = NULL; + os_atomic_thread_fence(acq_rel); + os_atomic_andnot(&cpu_data_ptr->cpu_signal, SIGPxcallImm, relaxed); xfunc(xparam); } } -unsigned int -cpu_broadcast_xcall(uint32_t *synch, +static unsigned int +cpu_broadcast_xcall_internal(unsigned int signal, + uint32_t *synch, boolean_t self_xcall, broadcastFunc func, void *parm) @@ -232,7 +244,7 @@ cpu_broadcast_xcall(uint32_t *synch, cpu_data_t *target_cpu_datap; unsigned int failsig; int cpu; - int max_cpu; + int max_cpu = ml_get_max_cpu_number() + 1; intr = ml_set_interrupts_enabled(FALSE); cpu_data_ptr = getCpuDatap(); @@ -240,19 +252,19 @@ cpu_broadcast_xcall(uint32_t *synch, failsig = 0; if (synch != NULL) { - *synch = real_ncpus; + *synch = max_cpu; assert_wait((event_t)synch, THREAD_UNINT); } - max_cpu = ml_get_max_cpu_number(); - for (cpu = 0; cpu <= max_cpu; cpu++) { + for (cpu = 0; cpu < max_cpu; cpu++) { target_cpu_datap = (cpu_data_t *)CpuDataEntries[cpu].cpu_data_vaddr; - if ((target_cpu_datap == NULL) || (target_cpu_datap == cpu_data_ptr)) { + if (target_cpu_datap == cpu_data_ptr) { continue; } - if (KERN_SUCCESS != cpu_signal(target_cpu_datap, SIGPxcall, (void *)func, parm)) { + if ((target_cpu_datap == NULL) || + KERN_SUCCESS != cpu_signal(target_cpu_datap, signal, (void *)func, parm)) { failsig++; } } @@ -265,7 +277,7 @@ cpu_broadcast_xcall(uint32_t *synch, (void) ml_set_interrupts_enabled(intr); if (synch != NULL) { - if (hw_atomic_sub(synch, (!self_xcall)? failsig + 1 : failsig) == 0) { + if (os_atomic_sub(synch, (!self_xcall) ? failsig + 1 : failsig, relaxed) == 0) { clear_wait(current_thread(), THREAD_AWAKENED); } else { thread_block(THREAD_CONTINUE_NULL); @@ -273,14 +285,32 @@ cpu_broadcast_xcall(uint32_t *synch, } if (!self_xcall) { - return real_ncpus - failsig - 1; + return max_cpu - failsig - 1; } else { - return real_ncpus - failsig; + return max_cpu - failsig; } } -kern_return_t -cpu_xcall(int cpu_number, broadcastFunc func, void *param) +unsigned int +cpu_broadcast_xcall(uint32_t *synch, + boolean_t self_xcall, + broadcastFunc func, + void *parm) +{ + return cpu_broadcast_xcall_internal(SIGPxcall, synch, self_xcall, func, parm); +} + +unsigned int +cpu_broadcast_immediate_xcall(uint32_t *synch, + boolean_t self_xcall, + broadcastFunc func, + void *parm) +{ + return cpu_broadcast_xcall_internal(SIGPxcallImm, synch, self_xcall, func, parm); +} + +static kern_return_t +cpu_xcall_internal(unsigned int signal, int cpu_number, broadcastFunc func, void *param) { cpu_data_t *target_cpu_datap; @@ -288,12 +318,28 @@ cpu_xcall(int cpu_number, broadcastFunc func, void *param) return KERN_INVALID_ARGUMENT; } + if (func == NULL || param == NULL) { + return KERN_INVALID_ARGUMENT; + } + target_cpu_datap = (cpu_data_t*)CpuDataEntries[cpu_number].cpu_data_vaddr; if (target_cpu_datap == NULL) { return KERN_INVALID_ARGUMENT; } - return cpu_signal(target_cpu_datap, SIGPxcall, (void*)func, param); + return cpu_signal(target_cpu_datap, signal, (void*)func, param); +} + +kern_return_t +cpu_xcall(int cpu_number, broadcastFunc func, void *param) +{ + return cpu_xcall_internal(SIGPxcall, cpu_number, func, param); +} + +kern_return_t +cpu_immediate_xcall(int cpu_number, broadcastFunc func, void *param) +{ + return cpu_xcall_internal(SIGPxcallImm, cpu_number, func, param); } static kern_return_t @@ -320,39 +366,40 @@ cpu_signal_internal(cpu_data_t *target_proc, Check_SIGPdisabled = 0; } - if (signal == SIGPxcall) { + if ((signal == SIGPxcall) || (signal == SIGPxcallImm)) { do { current_signals = target_proc->cpu_signal; if ((current_signals & SIGPdisabled) == SIGPdisabled) { -#if DEBUG || DEVELOPMENT - target_proc->failed_signal = SIGPxcall; - target_proc->failed_xcall = p0; - OSIncrementAtomicLong(&target_proc->failed_signal_count); -#endif ml_set_interrupts_enabled(interruptible); return KERN_FAILURE; } - swap_success = OSCompareAndSwap(current_signals & (~SIGPxcall), current_signals | SIGPxcall, + swap_success = OSCompareAndSwap(current_signals & (~signal), current_signals | signal, &target_proc->cpu_signal); + if (!swap_success && (signal == SIGPxcallImm) && (target_proc->cpu_signal & SIGPxcallImm)) { + ml_set_interrupts_enabled(interruptible); + return KERN_ALREADY_WAITING; + } + /* Drain pending xcalls on this cpu; the CPU we're trying to xcall may in turn * be trying to xcall us. Since we have interrupts disabled that can deadlock, * so break the deadlock by draining pending xcalls. */ - if (!swap_success && (current_proc->cpu_signal & SIGPxcall)) { + if (!swap_success && (current_proc->cpu_signal & signal)) { cpu_handle_xcall(current_proc); } } while (!swap_success); - target_proc->cpu_xcall_p0 = p0; - target_proc->cpu_xcall_p1 = p1; + if (signal == SIGPxcallImm) { + target_proc->cpu_imm_xcall_p0 = p0; + target_proc->cpu_imm_xcall_p1 = p1; + } else { + target_proc->cpu_xcall_p0 = p0; + target_proc->cpu_xcall_p1 = p1; + } } else { do { current_signals = target_proc->cpu_signal; if ((Check_SIGPdisabled != 0) && (current_signals & Check_SIGPdisabled) == SIGPdisabled) { -#if DEBUG || DEVELOPMENT - target_proc->failed_signal = signal; - OSIncrementAtomicLong(&target_proc->failed_signal_count); -#endif ml_set_interrupts_enabled(interruptible); return KERN_FAILURE; } @@ -424,48 +471,48 @@ cpu_signal_handler_internal(boolean_t disable_signal) SCHED_STATS_IPI(current_processor()); - cpu_signal = hw_atomic_or(&cpu_data_ptr->cpu_signal, 0); + cpu_signal = os_atomic_or(&cpu_data_ptr->cpu_signal, 0, relaxed); if ((!(cpu_signal & SIGPdisabled)) && (disable_signal == TRUE)) { - (void)hw_atomic_or(&cpu_data_ptr->cpu_signal, SIGPdisabled); + os_atomic_or(&cpu_data_ptr->cpu_signal, SIGPdisabled, relaxed); } else if ((cpu_signal & SIGPdisabled) && (disable_signal == FALSE)) { - (void)hw_atomic_and(&cpu_data_ptr->cpu_signal, ~SIGPdisabled); + os_atomic_andnot(&cpu_data_ptr->cpu_signal, SIGPdisabled, relaxed); } while (cpu_signal & ~SIGPdisabled) { if (cpu_signal & SIGPdec) { - (void)hw_atomic_and(&cpu_data_ptr->cpu_signal, ~SIGPdec); + os_atomic_andnot(&cpu_data_ptr->cpu_signal, SIGPdec, relaxed); rtclock_intr(FALSE); } #if KPERF if (cpu_signal & SIGPkptimer) { - (void)hw_atomic_and(&cpu_data_ptr->cpu_signal, ~SIGPkptimer); + os_atomic_andnot(&cpu_data_ptr->cpu_signal, SIGPkptimer, relaxed); kperf_signal_handler((unsigned int)cpu_data_ptr->cpu_number); } #endif - if (cpu_signal & SIGPxcall) { + if (cpu_signal & (SIGPxcall | SIGPxcallImm)) { cpu_handle_xcall(cpu_data_ptr); } if (cpu_signal & SIGPast) { - (void)hw_atomic_and(&cpu_data_ptr->cpu_signal, ~SIGPast); + os_atomic_andnot(&cpu_data_ptr->cpu_signal, SIGPast, relaxed); ast_check(cpu_data_ptr->cpu_processor); } if (cpu_signal & SIGPdebug) { - (void)hw_atomic_and(&cpu_data_ptr->cpu_signal, ~SIGPdebug); + os_atomic_andnot(&cpu_data_ptr->cpu_signal, SIGPdebug, relaxed); DebuggerXCall(cpu_data_ptr->cpu_int_state); } #if __ARM_SMP__ && defined(ARMA7) if (cpu_signal & SIGPLWFlush) { - (void)hw_atomic_and(&cpu_data_ptr->cpu_signal, ~SIGPLWFlush); + os_atomic_andnot(&cpu_data_ptr->cpu_signal, SIGPLWFlush, relaxed); cache_xcall_handler(LWFlush); } if (cpu_signal & SIGPLWClean) { - (void)hw_atomic_and(&cpu_data_ptr->cpu_signal, ~SIGPLWClean); + os_atomic_andnot(&cpu_data_ptr->cpu_signal, SIGPLWClean, relaxed); cache_xcall_handler(LWClean); } #endif - cpu_signal = hw_atomic_or(&cpu_data_ptr->cpu_signal, 0); + cpu_signal = os_atomic_or(&cpu_data_ptr->cpu_signal, 0, relaxed); } } @@ -499,7 +546,10 @@ cpu_machine_init(void) if (cpu_data_ptr->cpu_cache_dispatch != (cache_dispatch_t) NULL) { platform_cache_init(); } + + /* Note: this calls IOCPURunPlatformActiveActions when resuming on boot cpu */ PE_cpu_machine_init(cpu_data_ptr->cpu_id, !started); + cpu_data_ptr->cpu_flags |= StartedState; ml_init_interrupt(); } diff --git a/osfmk/arm/cpu_data.h b/osfmk/arm/cpu_data.h index b99f054e1..7b001d176 100644 --- a/osfmk/arm/cpu_data.h +++ b/osfmk/arm/cpu_data.h @@ -48,7 +48,7 @@ #define current_thread() current_thread_fast() -static inline __pure2 thread_t +static inline __attribute__((const)) thread_t current_thread_fast(void) { #if defined(__arm64__) diff --git a/osfmk/arm/cpu_data_internal.h b/osfmk/arm/cpu_data_internal.h index ac6569f7c..8b29c711a 100644 --- a/osfmk/arm/cpu_data_internal.h +++ b/osfmk/arm/cpu_data_internal.h @@ -69,13 +69,15 @@ extern reset_handler_data_t ResetHandlerData; #define MAX_CPUS 1 #endif -#define CPUWINDOWS_MAX 4 +/* Put the static check for cpumap_t here as it's defined in */ +static_assert(sizeof(cpumap_t) * CHAR_BIT >= MAX_CPUS, "cpumap_t bitvector is too small for current MAX_CPUS value"); + #ifdef __arm__ -#define CPUWINDOWS_BASE 0xFFF00000UL +#define CPUWINDOWS_BASE_MASK 0xFFF00000UL #else #define CPUWINDOWS_BASE_MASK 0xFFFFFFFFFFF00000UL -#define CPUWINDOWS_BASE (VM_MAX_KERNEL_ADDRESS & CPUWINDOWS_BASE_MASK) #endif +#define CPUWINDOWS_BASE (VM_MAX_KERNEL_ADDRESS & CPUWINDOWS_BASE_MASK) #define CPUWINDOWS_TOP (CPUWINDOWS_BASE + (MAX_CPUS * CPUWINDOWS_MAX * PAGE_SIZE)) typedef struct cpu_data_entry { @@ -109,8 +111,9 @@ typedef struct { uint64_t ipi_cnt_wake; uint64_t timer_cnt; uint64_t timer_cnt_wake; - uint64_t pmi_cnt; +#if MONOTONIC uint64_t pmi_cnt_wake; +#endif /* MONOTONIC */ uint64_t undef_ex_cnt; uint64_t unaligned_cnt; uint64_t vfp_cnt; @@ -137,11 +140,6 @@ typedef struct cpu_data { unsigned int cpu_ident; cpu_id_t cpu_id; unsigned volatile int cpu_signal; -#if DEBUG || DEVELOPMENT - void *failed_xcall; - unsigned int failed_signal; - volatile long failed_signal_count; -#endif void *cpu_cache_dispatch; ast_t cpu_pending_ast; struct processor *cpu_processor; @@ -223,6 +221,8 @@ typedef struct cpu_data { void *cpu_xcall_p0; void *cpu_xcall_p1; + void *cpu_imm_xcall_p0; + void *cpu_imm_xcall_p1; #if __ARM_SMP__ && defined(ARMA7) volatile uint32_t cpu_CLW_active; @@ -278,6 +278,9 @@ typedef struct cpu_data { CPU_HALTED, CPU_HALTED_WITH_STATE } halt_status; +#if defined(HAS_APPLE_PAC) + uint64_t rop_key; +#endif /* defined(HAS_APPLE_PAC) */ } cpu_data_t; /* diff --git a/osfmk/arm/cpu_internal.h b/osfmk/arm/cpu_internal.h index f40941de5..7a9892600 100644 --- a/osfmk/arm/cpu_internal.h +++ b/osfmk/arm/cpu_internal.h @@ -64,6 +64,7 @@ extern void cpu_signal_cancel( #define SIGPLWFlush 0x00000020UL /* Request LWFlush call */ #define SIGPLWClean 0x00000040UL /* Request LWClean call */ #define SIGPkptimer 0x00000100U /* Request kperf timer */ +#define SIGPxcallImm 0x00000200U /* Send a cross-call, fail if already pending */ #define SIGPdisabled 0x80000000U /* Signal disabled */ diff --git a/osfmk/arm/cpuid.c b/osfmk/arm/cpuid.c index 147bfaa1d..73f9b0d83 100644 --- a/osfmk/arm/cpuid.c +++ b/osfmk/arm/cpuid.c @@ -42,36 +42,34 @@ typedef struct { uint32_t - - Ctype1:3, /* 2:0 */ - Ctype2:3, /* 5:3 */ - Ctype3:3, /* 8:6 */ - Ctypes:15, /* 6:23 - Don't Care */ - LoC:3, /* 26-24 - Level of Coherency */ - LoU:3, /* 29:27 - Level of Unification */ - RAZ:2; /* 31:30 - Read-As-Zero */ -} arm_cache_clidr_t; + Ctype1:3, /* 2:0 */ + Ctype2:3, /* 5:3 */ + Ctype3:3, /* 8:6 */ + Ctypes:15, /* 6:23 - Don't Care */ + LoC:3, /* 26-24 - Level of Coherency */ + LoU:3, /* 29:27 - Level of Unification */ + RAZ:2; /* 31:30 - Read-As-Zero */ +} arm_cache_clidr_t; typedef union { arm_cache_clidr_t bits; - uint32_t value; -} arm_cache_clidr_info_t; + uint32_t value; +} arm_cache_clidr_info_t; typedef struct { uint32_t - LineSize:3, /* 2:0 - Number of words in cache line */ - Assoc:10, /* 12:3 - Associativity of cache */ + Assoc:10, /* 12:3 - Associativity of cache */ NumSets:15, /* 27:13 - Number of sets in cache */ - c_type:4; /* 31:28 - Cache type */ -} arm_cache_ccsidr_t; + c_type:4; /* 31:28 - Cache type */ +} arm_cache_ccsidr_t; typedef union { arm_cache_ccsidr_t bits; - uint32_t value; -} arm_cache_ccsidr_info_t; + uint32_t value; +} arm_cache_ccsidr_info_t; /* Statics */ @@ -85,17 +83,21 @@ void do_cpuid(void) { cpuid_cpu_info.value = machine_read_midr(); -#if (__ARM_ARCH__ == 8) +#if (__ARM_ARCH__ == 8) +#if defined(HAS_APPLE_PAC) + cpuid_cpu_info.arm_info.arm_arch = CPU_ARCH_ARMv8E; +#else /* defined(HAS_APPLE_PAC) */ cpuid_cpu_info.arm_info.arm_arch = CPU_ARCH_ARMv8; +#endif /* defined(HAS_APPLE_PAC) */ -#elif (__ARM_ARCH__ == 7) - #ifdef __ARM_SUB_ARCH__ +#elif (__ARM_ARCH__ == 7) +#ifdef __ARM_SUB_ARCH__ cpuid_cpu_info.arm_info.arm_arch = __ARM_SUB_ARCH__; - #else +#else /* __ARM_SUB_ARCH__ */ cpuid_cpu_info.arm_info.arm_arch = CPU_ARCH_ARMv7; - #endif -#else +#endif /* __ARM_SUB_ARCH__ */ +#else /* (__ARM_ARCH__ != 7) && (__ARM_ARCH__ != 8) */ /* 1176 architecture lives in the extended feature register */ if (cpuid_cpu_info.arm_info.arm_arch == CPU_ARCH_EXTENDED) { arm_isa_feat1_reg isa = machine_read_isa_feat1(); @@ -108,7 +110,7 @@ do_cpuid(void) cpuid_cpu_info.arm_info.arm_arch = CPU_ARCH_ARMv6; } } -#endif +#endif /* (__ARM_ARCH__ != 7) && (__ARM_ARCH__ != 8) */ } arm_cpu_info_t * @@ -176,6 +178,13 @@ cpuid_get_cpufamily(void) case CPU_PART_MISTRAL: cpufamily = CPUFAMILY_ARM_MONSOON_MISTRAL; break; + case CPU_PART_VORTEX: + case CPU_PART_TEMPEST: + case CPU_PART_TEMPEST_M9: + case CPU_PART_VORTEX_ARUBA: + case CPU_PART_TEMPEST_ARUBA: + cpufamily = CPUFAMILY_ARM_VORTEX_TEMPEST; + break; default: cpufamily = CPUFAMILY_UNKNOWN; break; diff --git a/osfmk/arm/cpuid.h b/osfmk/arm/cpuid.h index bc6468a96..559cde9fc 100644 --- a/osfmk/arm/cpuid.h +++ b/osfmk/arm/cpuid.h @@ -41,46 +41,47 @@ #include typedef struct { - uint32_t arm_rev : 4,/* 00:03 revision number */ - arm_part : 12, /* 04:15 primary part number */ - arm_arch : 4, /* 16:19 architecture */ - arm_variant : 4, /* 20:23 variant */ - arm_implementor : 8; /* 24:31 implementor (0x41) */ + uint32_t arm_rev : 4, /* 00:03 revision number */ + arm_part : 12,/* 04:15 primary part number */ + arm_arch : 4,/* 16:19 architecture */ + arm_variant : 4,/* 20:23 variant */ + arm_implementor : 8;/* 24:31 implementor (0x41) */ } arm_cpuid_bits_t; typedef union { - arm_cpuid_bits_t arm_info; /* ARM9xx, ARM11xx, and later processors */ - uint32_t value; + arm_cpuid_bits_t arm_info; /* ARM9xx, ARM11xx, and later processors */ + uint32_t value; } arm_cpu_info_t; /* Implementor codes */ -#define CPU_VID_ARM 0x41 // ARM Limited -#define CPU_VID_DEC 0x44 // Digital Equipment Corporation -#define CPU_VID_MOTOROLA 0x4D // Motorola - Freescale Semiconductor Inc. -#define CPU_VID_MARVELL 0x56 // Marvell Semiconductor Inc. -#define CPU_VID_INTEL 0x69 // Intel ARM parts. -#define CPU_VID_APPLE 0x61 // Apple Inc. +#define CPU_VID_ARM 0x41 // ARM Limited +#define CPU_VID_DEC 0x44 // Digital Equipment Corporation +#define CPU_VID_MOTOROLA 0x4D // Motorola - Freescale Semiconductor Inc. +#define CPU_VID_MARVELL 0x56 // Marvell Semiconductor Inc. +#define CPU_VID_INTEL 0x69 // Intel ARM parts. +#define CPU_VID_APPLE 0x61 // Apple Inc. /* ARM Architecture Codes */ -#define CPU_ARCH_ARMv4 0x1 /* ARMv4 */ -#define CPU_ARCH_ARMv4T 0x2 /* ARMv4 + Thumb */ -#define CPU_ARCH_ARMv5 0x3 /* ARMv5 */ -#define CPU_ARCH_ARMv5T 0x4 /* ARMv5 + Thumb */ -#define CPU_ARCH_ARMv5TE 0x5 /* ARMv5 + Thumb + Extensions(?) */ -#define CPU_ARCH_ARMv5TEJ 0x6 /* ARMv5 + Thumb + Extensions(?) + //Jazelle(?) XXX */ -#define CPU_ARCH_ARMv6 0x7 /* ARMv6 */ -#define CPU_ARCH_ARMv7 0x8 /* ARMv7 */ -#define CPU_ARCH_ARMv7f 0x9 /* ARMv7 for Cortex A9 */ -#define CPU_ARCH_ARMv7s 0xa /* ARMv7 for Swift */ -#define CPU_ARCH_ARMv7k 0xb /* ARMv7 for Cortex A7 */ +#define CPU_ARCH_ARMv4 0x1 /* ARMv4 */ +#define CPU_ARCH_ARMv4T 0x2 /* ARMv4 + Thumb */ +#define CPU_ARCH_ARMv5 0x3 /* ARMv5 */ +#define CPU_ARCH_ARMv5T 0x4 /* ARMv5 + Thumb */ +#define CPU_ARCH_ARMv5TE 0x5 /* ARMv5 + Thumb + Extensions(?) */ +#define CPU_ARCH_ARMv5TEJ 0x6 /* ARMv5 + Thumb + Extensions(?) + //Jazelle(?) XXX */ +#define CPU_ARCH_ARMv6 0x7 /* ARMv6 */ +#define CPU_ARCH_ARMv7 0x8 /* ARMv7 */ +#define CPU_ARCH_ARMv7f 0x9 /* ARMv7 for Cortex A9 */ +#define CPU_ARCH_ARMv7s 0xa /* ARMv7 for Swift */ +#define CPU_ARCH_ARMv7k 0xb /* ARMv7 for Cortex A7 */ -#define CPU_ARCH_ARMv8 0xc /* Subtype for CPU_TYPE_ARM64 */ +#define CPU_ARCH_ARMv8 0xc /* Subtype for CPU_TYPE_ARM64 */ +#define CPU_ARCH_ARMv8E 0xd /* ARMv8.3a + Apple Private ISA Subtype for CPU_TYPE_ARM64 */ /* special code indicating we need to look somewhere else for the architecture version */ -#define CPU_ARCH_EXTENDED 0xF +#define CPU_ARCH_EXTENDED 0xF /* ARM Part Numbers */ /* @@ -89,54 +90,69 @@ typedef union { */ /* ARM9 (ARMv4T architecture) */ -#define CPU_PART_920T 0x920 -#define CPU_PART_926EJS 0x926 /* ARM926EJ-S */ +#define CPU_PART_920T 0x920 +#define CPU_PART_926EJS 0x926 /* ARM926EJ-S */ /* ARM11 (ARMv6 architecture) */ -#define CPU_PART_1136JFS 0xB36 /* ARM1136JF-S or ARM1136J-S */ -#define CPU_PART_1176JZFS 0xB76 /* ARM1176JZF-S */ +#define CPU_PART_1136JFS 0xB36 /* ARM1136JF-S or ARM1136J-S */ +#define CPU_PART_1176JZFS 0xB76 /* ARM1176JZF-S */ /* G1 (ARMv7 architecture) */ -#define CPU_PART_CORTEXA5 0xC05 +#define CPU_PART_CORTEXA5 0xC05 /* M7 (ARMv7 architecture) */ -#define CPU_PART_CORTEXA7 0xC07 +#define CPU_PART_CORTEXA7 0xC07 /* H2 H3 (ARMv7 architecture) */ -#define CPU_PART_CORTEXA8 0xC08 +#define CPU_PART_CORTEXA8 0xC08 /* H4 (ARMv7 architecture) */ -#define CPU_PART_CORTEXA9 0xC09 +#define CPU_PART_CORTEXA9 0xC09 /* H5 (SWIFT architecture) */ -#define CPU_PART_SWIFT 0x0 +#define CPU_PART_SWIFT 0x0 /* H6 (ARMv8 architecture) */ -#define CPU_PART_CYCLONE 0x1 +#define CPU_PART_CYCLONE 0x1 /* H7 (ARMv8 architecture) */ -#define CPU_PART_TYPHOON 0x2 +#define CPU_PART_TYPHOON 0x2 /* H7G (ARMv8 architecture) */ -#define CPU_PART_TYPHOON_CAPRI 0x3 +#define CPU_PART_TYPHOON_CAPRI 0x3 /* H8 (ARMv8 architecture) */ -#define CPU_PART_TWISTER 0x4 +#define CPU_PART_TWISTER 0x4 /* H8G H8M (ARMv8 architecture) */ -#define CPU_PART_TWISTER_ELBA_MALTA 0x5 +#define CPU_PART_TWISTER_ELBA_MALTA 0x5 /* H9 (ARMv8 architecture) */ -#define CPU_PART_HURRICANE 0x6 +#define CPU_PART_HURRICANE 0x6 /* H9G (ARMv8 architecture) */ -#define CPU_PART_HURRICANE_MYST 0x7 +#define CPU_PART_HURRICANE_MYST 0x7 /* H10 p-Core (ARMv8 architecture) */ -#define CPU_PART_MONSOON 0x8 +#define CPU_PART_MONSOON 0x8 /* H10 e-Core (ARMv8 architecture) */ -#define CPU_PART_MISTRAL 0x9 +#define CPU_PART_MISTRAL 0x9 + +/* H11 p-Core (ARMv8 architecture) */ +#define CPU_PART_VORTEX 0xB + +/* H11 e-Core (ARMv8 architecture) */ +#define CPU_PART_TEMPEST 0xC + +/* M9 e-Core (ARMv8 architecture) */ +#define CPU_PART_TEMPEST_M9 0xF + +/* H11G p-Core (ARMv8 architecture) */ +#define CPU_PART_VORTEX_ARUBA 0x10 + +/* H11G e-Core (ARMv8 architecture) */ +#define CPU_PART_TEMPEST_ARUBA 0x11 /* Cache type identification */ @@ -151,24 +167,23 @@ typedef enum { } cache_type_t; typedef struct { - boolean_t c_unified; /* unified I & D cache? */ - uint32_t c_isize; /* in Bytes (ARM caches can be 0.5 KB) */ - boolean_t c_i_ppage; /* protected page restriction for I cache - * (see B6-11 in ARM DDI 0100I document). */ - uint32_t c_dsize; /* in Bytes (ARM caches can be 0.5 KB) */ - boolean_t c_d_ppage; /* protected page restriction for I cache - * (see B6-11 in ARM DDI 0100I document). */ - cache_type_t c_type; /* WB or WT */ - uint32_t c_linesz; /* number of bytes */ - uint32_t c_assoc; /* n-way associativity */ - uint32_t c_l2size; /* L2 size, if present */ - uint32_t c_bulksize_op;/* bulk operation size limit. 0 if disabled */ - uint32_t c_inner_cache_size; /* inner dache size */ + boolean_t c_unified; /* unified I & D cache? */ + uint32_t c_isize; /* in Bytes (ARM caches can be 0.5 KB) */ + boolean_t c_i_ppage; /* protected page restriction for I cache + * (see B6-11 in ARM DDI 0100I document). */ + uint32_t c_dsize; /* in Bytes (ARM caches can be 0.5 KB) */ + boolean_t c_d_ppage; /* protected page restriction for I cache + * (see B6-11 in ARM DDI 0100I document). */ + cache_type_t c_type; /* WB or WT */ + uint32_t c_linesz; /* number of bytes */ + uint32_t c_assoc; /* n-way associativity */ + uint32_t c_l2size; /* L2 size, if present */ + uint32_t c_bulksize_op; /* bulk operation size limit. 0 if disabled */ + uint32_t c_inner_cache_size; /* inner dache size */ } cache_info_t; typedef struct { uint32_t - RB:4, /* 3:0 - 32x64-bit media register bank supported: 0x2 */ SP:4, /* 7:4 - Single precision supported in VFPv3: 0x2 */ DP:4, /* 8:11 - Double precision supported in VFPv3: 0x2 */ @@ -186,7 +201,6 @@ typedef union { typedef struct { uint32_t - FZ:4, /* 3:0 - Full denormal arithmetic supported for VFP: 0x1 */ DN:4, /* 7:4 - Propagation of NaN values supported for VFP: 0x1 */ LS:4, /* 11:8 - Load/store instructions supported for NEON: 0x1 */ @@ -202,14 +216,14 @@ typedef union { } arm_mvfr1_info_t; typedef struct { - uint32_t neon; - uint32_t neon_hpfp; - uint32_t neon_fp16; + uint32_t neon; + uint32_t neon_hpfp; + uint32_t neon_fp16; } arm_mvfp_info_t; #ifdef __cplusplus extern "C" { -#endif +#endif /* __cplusplus */ extern void do_cpuid(void); extern arm_cpu_info_t *cpuid_info(void); @@ -226,6 +240,6 @@ extern arm_mvfp_info_t *arm_mvfp_info(void); #ifdef __cplusplus } -#endif +#endif /* __cplusplus */ #endif // _MACHINE_CPUID_H_ diff --git a/osfmk/arm/genassym.c b/osfmk/arm/genassym.c index 435383706..5ebbf990b 100644 --- a/osfmk/arm/genassym.c +++ b/osfmk/arm/genassym.c @@ -355,11 +355,11 @@ main( DECLARE("BA_TOP_OF_KERNEL_DATA", offsetof(struct boot_args, topOfKernelData)); - DECLARE("ENTROPY_INDEX_PTR", - offsetof(entropy_data_t, index_ptr)); + DECLARE("ENTROPY_SAMPLE_COUNT", + offsetof(entropy_data_t, sample_count)); DECLARE("ENTROPY_BUFFER", offsetof(entropy_data_t, buffer)); - DECLARE("ENTROPY_DATA_SIZE", sizeof(struct entropy_data)); + DECLARE("ENTROPY_BUFFER_INDEX_MASK", ENTROPY_BUFFER_INDEX_MASK); return 0; } diff --git a/osfmk/arm/io_map.c b/osfmk/arm/io_map.c index bae84c780..1bb8b82d6 100644 --- a/osfmk/arm/io_map.c +++ b/osfmk/arm/io_map.c @@ -69,6 +69,16 @@ extern vm_offset_t virtual_space_start; /* Next available kernel VA */ */ vm_offset_t io_map(vm_map_offset_t phys_addr, vm_size_t size, unsigned int flags) +{ + return io_map_with_prot(phys_addr, size, flags, VM_PROT_READ | VM_PROT_WRITE); +} + +/* + * Allocate and map memory for devices that may need to be mapped before + * Mach VM is running. Allows caller to specify mapping protection + */ +vm_offset_t +io_map_with_prot(vm_map_offset_t phys_addr, vm_size_t size, unsigned int flags, vm_prot_t prot) { vm_offset_t start, start_offset; @@ -87,15 +97,15 @@ io_map(vm_map_offset_t phys_addr, vm_size_t size, unsigned int flags) if (flags == VM_WIMG_WCOMB) { (void) pmap_map_bd_with_options(start, phys_addr, phys_addr + round_page(size), - VM_PROT_READ | VM_PROT_WRITE, PMAP_MAP_BD_WCOMB); + prot, PMAP_MAP_BD_WCOMB); } else { (void) pmap_map_bd(start, phys_addr, phys_addr + round_page(size), - VM_PROT_READ | VM_PROT_WRITE); + prot); } } else { (void) kmem_alloc_pageable(kernel_map, &start, round_page(size), VM_KERN_MEMORY_IOKIT); (void) pmap_map(start, phys_addr, phys_addr + round_page(size), - VM_PROT_READ | VM_PROT_WRITE, flags); + prot, flags); } #if KASAN kasan_notify_address(start + start_offset, size); diff --git a/osfmk/arm/io_map_entries.h b/osfmk/arm/io_map_entries.h index 4b97c77f5..1c5ec79a6 100644 --- a/osfmk/arm/io_map_entries.h +++ b/osfmk/arm/io_map_entries.h @@ -40,6 +40,13 @@ extern vm_offset_t io_map( vm_map_offset_t phys_addr, vm_size_t size, unsigned int flags); + +extern vm_offset_t io_map_with_prot( + vm_map_offset_t phys_addr, + vm_size_t size, + unsigned int flags, + vm_prot_t prot); + extern vm_offset_t io_map_spec(vm_map_offset_t phys_addr, vm_size_t size, unsigned int flags); #endif /* __APPLE_API_PRIVATE */ diff --git a/osfmk/arm/kpc_arm.c b/osfmk/arm/kpc_arm.c index b5c060a8a..5d882c13b 100644 --- a/osfmk/arm/kpc_arm.c +++ b/osfmk/arm/kpc_arm.c @@ -282,7 +282,7 @@ kpc_set_running_xcall( void *vstate ) set_running_configurable(mp_config->cfg_target_mask, mp_config->cfg_state_mask); - if (hw_atomic_sub(&kpc_xcall_sync, 1) == 0) { + if (os_atomic_dec(&kpc_xcall_sync, relaxed) == 0) { thread_wakeup((event_t) &kpc_xcall_sync); } } @@ -674,7 +674,7 @@ kpc_set_reload_xcall(void *vmp_config) ml_set_interrupts_enabled(enabled); - if (hw_atomic_sub(&kpc_reload_sync, 1) == 0) { + if (os_atomic_dec(&kpc_reload_sync, relaxed) == 0) { thread_wakeup((event_t) &kpc_reload_sync); } } @@ -749,7 +749,7 @@ kpc_set_config_xcall(void *vmp_config) new_config += kpc_popcount(mp_config->pmc_mask); } - if (hw_atomic_sub(&kpc_config_sync, 1) == 0) { + if (os_atomic_dec(&kpc_config_sync, relaxed) == 0) { thread_wakeup((event_t) &kpc_config_sync); } } @@ -795,9 +795,9 @@ kpc_get_curcpu_counters_xcall(void *args) r = kpc_get_curcpu_counters(handler->classes, NULL, &handler->buf[offset]); /* number of counters added by this CPU, needs to be atomic */ - hw_atomic_add(&(handler->nb_counters), r); + os_atomic_add(&(handler->nb_counters), r, relaxed); - if (hw_atomic_sub(&kpc_xread_sync, 1) == 0) { + if (os_atomic_dec(&kpc_xread_sync, relaxed) == 0) { thread_wakeup((event_t) &kpc_xread_sync); } } diff --git a/osfmk/arm/locks.h b/osfmk/arm/locks.h index 41941a9d1..ce0e69e42 100644 --- a/osfmk/arm/locks.h +++ b/osfmk/arm/locks.h @@ -88,7 +88,7 @@ typedef struct _lck_mtx_ { union { struct { uint16_t lck_mtx_waiters;/* Number of waiters */ - uint8_t lck_mtx_pri; /* Priority to inherit */ + uint8_t lck_mtx_pri; /* unused */ uint8_t lck_mtx_type; /* Type */ }; struct { @@ -215,13 +215,13 @@ typedef struct { // 23-30 #define LCK_RW_TAG_VALID_BIT 31 -#define LCK_RW_INTERLOCK (1 << LCK_RW_INTERLOCK_BIT) -#define LCK_RW_R_WAITING (1 << LCK_RW_R_WAITING_BIT) -#define LCK_RW_W_WAITING (1 << LCK_RW_W_WAITING_BIT) -#define LCK_RW_WANT_UPGRADE (1 << LCK_RW_WANT_UPGRADE_BIT) -#define LCK_RW_WANT_EXCL (1 << LCK_RW_WANT_EXCL_BIT) -#define LCK_RW_TAG_VALID (1 << LCK_RW_TAG_VALID_BIT) -#define LCK_RW_PRIV_EXCL (1 << LCK_RW_PRIV_EXCL_BIT) +#define LCK_RW_INTERLOCK (1U << LCK_RW_INTERLOCK_BIT) +#define LCK_RW_R_WAITING (1U << LCK_RW_R_WAITING_BIT) +#define LCK_RW_W_WAITING (1U << LCK_RW_W_WAITING_BIT) +#define LCK_RW_WANT_UPGRADE (1U << LCK_RW_WANT_UPGRADE_BIT) +#define LCK_RW_WANT_EXCL (1U << LCK_RW_WANT_EXCL_BIT) +#define LCK_RW_TAG_VALID (1U << LCK_RW_TAG_VALID_BIT) +#define LCK_RW_PRIV_EXCL (1U << LCK_RW_PRIV_EXCL_BIT) #define LCK_RW_SHARED_MASK (0xffff << LCK_RW_SHARED_READER_OFFSET) #define LCK_RW_SHARED_READER (0x1 << LCK_RW_SHARED_READER_OFFSET) @@ -257,6 +257,9 @@ typedef struct { #define PLATFORM_LCK_ILOCK LCK_ILOCK +#if defined(__ARM_ARCH_8_2__) +#define __ARM_ATOMICS_8_1 1 // ARMv8.1 atomic instructions are available +#endif /* * Lock state to thread pointer @@ -273,8 +276,8 @@ typedef struct { */ #define LCK_MTX_THREAD_MASK (~(uintptr_t)(LCK_ILOCK | ARM_LCK_WAITERS)) -#define disable_preemption_for_thread(t) ((volatile thread_t)t)->machine.preemption_count++ -#define preemption_disabled_for_thread(t) (((volatile thread_t)t)->machine.preemption_count > 0) +#define disable_preemption_for_thread(t) os_atomic_store(&(t->machine.preemption_count), t->machine.preemption_count + 1, compiler_acq_rel) +#define preemption_disabled_for_thread(t) (t->machine.preemption_count > 0) __unused static void diff --git a/osfmk/arm/locks_arm.c b/osfmk/arm/locks_arm.c index 5b6917ac3..49a261f31 100644 --- a/osfmk/arm/locks_arm.c +++ b/osfmk/arm/locks_arm.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2007-2017 Apple Inc. All rights reserved. + * Copyright (c) 2007-2018 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -59,7 +59,6 @@ * Locking primitives implementation */ -#define ATOMIC_PRIVATE 1 #define LOCK_PRIVATE 1 #include @@ -71,7 +70,6 @@ #include #include #include -#include #include #include #include @@ -129,17 +127,6 @@ extern uint64_t dtrace_spin_threshold; /* Forwards */ - -#if USLOCK_DEBUG -/* - * Perform simple lock checks. - */ -int uslock_check = 1; -int max_lock_loops = 100000000; -decl_simple_lock_data(extern, printf_lock) -decl_simple_lock_data(extern, panic_lock) -#endif /* USLOCK_DEBUG */ - extern unsigned int not_in_kdp; /* @@ -165,19 +152,6 @@ typedef void *pc_t; * Portable lock package implementation of usimple_locks. */ -#if USLOCK_DEBUG -#define USLDBG(stmt) stmt -void usld_lock_init(usimple_lock_t, unsigned short); -void usld_lock_pre(usimple_lock_t, pc_t); -void usld_lock_post(usimple_lock_t, pc_t); -void usld_unlock(usimple_lock_t, pc_t); -void usld_lock_try_pre(usimple_lock_t, pc_t); -void usld_lock_try_post(usimple_lock_t, pc_t); -int usld_lock_common_checks(usimple_lock_t, const char *); -#else /* USLOCK_DEBUG */ -#define USLDBG(stmt) -#endif /* USLOCK_DEBUG */ - /* * Owner thread pointer when lock held in spin mode */ @@ -190,26 +164,24 @@ int usld_lock_common_checks(usimple_lock_t, const char *); #define lck_rw_ilk_lock(lock) hw_lock_bit ((hw_lock_bit_t*)(&(lock)->lck_rw_tag), LCK_RW_INTERLOCK_BIT, LCK_GRP_NULL) #define lck_rw_ilk_unlock(lock) hw_unlock_bit((hw_lock_bit_t*)(&(lock)->lck_rw_tag), LCK_RW_INTERLOCK_BIT) -#define memory_barrier() __c11_atomic_thread_fence(memory_order_acq_rel_smp) -#define load_memory_barrier() __c11_atomic_thread_fence(memory_order_acquire_smp) -#define store_memory_barrier() __c11_atomic_thread_fence(memory_order_release_smp) +#define load_memory_barrier() os_atomic_thread_fence(acquire) // Enforce program order of loads and stores. -#define ordered_load(target, type) \ - __c11_atomic_load((_Atomic type *)(target), memory_order_relaxed) -#define ordered_store(target, type, value) \ - __c11_atomic_store((_Atomic type *)(target), value, memory_order_relaxed) - -#define ordered_load_mtx(lock) ordered_load(&(lock)->lck_mtx_data, uintptr_t) -#define ordered_store_mtx(lock, value) ordered_store(&(lock)->lck_mtx_data, uintptr_t, (value)) -#define ordered_load_rw(lock) ordered_load(&(lock)->lck_rw_data, uint32_t) -#define ordered_store_rw(lock, value) ordered_store(&(lock)->lck_rw_data, uint32_t, (value)) -#define ordered_load_rw_owner(lock) ordered_load(&(lock)->lck_rw_owner, thread_t) -#define ordered_store_rw_owner(lock, value) ordered_store(&(lock)->lck_rw_owner, thread_t, (value)) -#define ordered_load_hw(lock) ordered_load(&(lock)->lock_data, uintptr_t) -#define ordered_store_hw(lock, value) ordered_store(&(lock)->lock_data, uintptr_t, (value)) -#define ordered_load_bit(lock) ordered_load((lock), uint32_t) -#define ordered_store_bit(lock, value) ordered_store((lock), uint32_t, (value)) +#define ordered_load(target) \ + os_atomic_load(target, compiler_acq_rel) +#define ordered_store(target, value) \ + os_atomic_store(target, value, compiler_acq_rel) + +#define ordered_load_mtx(lock) ordered_load(&(lock)->lck_mtx_data) +#define ordered_store_mtx(lock, value) ordered_store(&(lock)->lck_mtx_data, (value)) +#define ordered_load_rw(lock) ordered_load(&(lock)->lck_rw_data) +#define ordered_store_rw(lock, value) ordered_store(&(lock)->lck_rw_data, (value)) +#define ordered_load_rw_owner(lock) ordered_load(&(lock)->lck_rw_owner) +#define ordered_store_rw_owner(lock, value) ordered_store(&(lock)->lck_rw_owner, (value)) +#define ordered_load_hw(lock) ordered_load(&(lock)->lock_data) +#define ordered_store_hw(lock, value) ordered_store(&(lock)->lock_data, (value)) +#define ordered_load_bit(lock) ordered_load((lock)) +#define ordered_store_bit(lock, value) ordered_store((lock), (value)) // Prevent the compiler from reordering memory operations around this @@ -253,11 +225,56 @@ static boolean_t lck_rw_grab(lck_rw_t *lock, int mode, boolean_t wait); * atomic_exchange_complete() - conclude an exchange * atomic_exchange_abort() - cancel an exchange started with atomic_exchange_begin() */ +__unused static uint32_t +load_exclusive32(uint32_t *target, enum memory_order ord) +{ + uint32_t value; + +#if __arm__ + if (memory_order_has_release(ord)) { + // Pre-load release barrier + atomic_thread_fence(memory_order_release); + } + value = __builtin_arm_ldrex(target); +#else + if (memory_order_has_acquire(ord)) { + value = __builtin_arm_ldaex(target); // ldaxr + } else { + value = __builtin_arm_ldrex(target); // ldxr + } +#endif // __arm__ + return value; +} + +__unused static boolean_t +store_exclusive32(uint32_t *target, uint32_t value, enum memory_order ord) +{ + boolean_t err; + +#if __arm__ + err = __builtin_arm_strex(value, target); + if (memory_order_has_acquire(ord)) { + // Post-store acquire barrier + atomic_thread_fence(memory_order_acquire); + } +#else + if (memory_order_has_release(ord)) { + err = __builtin_arm_stlex(value, target); // stlxr + } else { + err = __builtin_arm_strex(value, target); // stxr + } +#endif // __arm__ + return !err; +} + static uint32_t atomic_exchange_begin32(uint32_t *target, uint32_t *previous, enum memory_order ord) { uint32_t val; +#if __ARM_ATOMICS_8_1 + ord = memory_order_relaxed; +#endif val = load_exclusive32(target, ord); *previous = val; return val; @@ -266,14 +283,18 @@ atomic_exchange_begin32(uint32_t *target, uint32_t *previous, enum memory_order static boolean_t atomic_exchange_complete32(uint32_t *target, uint32_t previous, uint32_t newval, enum memory_order ord) { +#if __ARM_ATOMICS_8_1 + return __c11_atomic_compare_exchange_strong((_Atomic uint32_t *)target, &previous, newval, ord, memory_order_relaxed); +#else (void)previous; // Previous not needed, monitor is held return store_exclusive32(target, newval, ord); +#endif } static void atomic_exchange_abort(void) { - clear_exclusive(); + os_atomic_clear_exclusive(); } static boolean_t @@ -298,260 +319,113 @@ atomic_test_and_set32(uint32_t *target, uint32_t test_mask, uint32_t set_mask, e } } +inline boolean_t +hw_atomic_test_and_set32(uint32_t *target, uint32_t test_mask, uint32_t set_mask, enum memory_order ord, boolean_t wait) +{ + return atomic_test_and_set32(target, test_mask, set_mask, ord, wait); +} + void _disable_preemption(void) { - thread_t thread = current_thread(); - unsigned int count; + thread_t thread = current_thread(); + unsigned int count = thread->machine.preemption_count; - count = thread->machine.preemption_count + 1; - ordered_store(&thread->machine.preemption_count, unsigned int, count); + count += 1; + if (__improbable(count == 0)) { + panic("Preemption count overflow"); + } + + os_atomic_store(&thread->machine.preemption_count, count, compiler_acq_rel); } -void -_enable_preemption(void) +/* + * This function checks whether an AST_URGENT has been pended. + * + * It is called once the preemption has been reenabled, which means the thread + * may have been preempted right before this was called, and when this function + * actually performs the check, we've changed CPU. + * + * This race is however benign: the point of AST_URGENT is to trigger a context + * switch, so if one happened, there's nothing left to check for, and AST_URGENT + * was cleared in the process. + * + * It follows that this check cannot have false negatives, which allows us + * to avoid fiddling with interrupt state for the vast majority of cases + * when the check will actually be negative. + */ +static NOINLINE void +kernel_preempt_check(thread_t thread) { - thread_t thread = current_thread(); - long state; - unsigned int count; + cpu_data_t *cpu_data_ptr; + long state; + #if __arm__ #define INTERRUPT_MASK PSR_IRQF #else // __arm__ #define INTERRUPT_MASK DAIF_IRQF #endif // __arm__ - count = thread->machine.preemption_count; - if (count == 0) { - panic("Preemption count negative"); // Count will go negative when released - } - count--; - if (count > 0) { - goto update_count; // Preemption is still disabled, just update - } - state = get_interrupts(); // Get interrupt state - if (state & INTERRUPT_MASK) { - goto update_count; // Interrupts are already masked, can't take AST here + /* + * This check is racy and could load from another CPU's pending_ast mask, + * but as described above, this can't have false negatives. + */ + cpu_data_ptr = os_atomic_load(&thread->machine.CpuDatap, compiler_acq_rel); + if (__probable((cpu_data_ptr->cpu_pending_ast & AST_URGENT) == 0)) { + return; } - disable_interrupts_noread(); // Disable interrupts - ordered_store(&thread->machine.preemption_count, unsigned int, count); - if (thread->machine.CpuDatap->cpu_pending_ast & AST_URGENT) { + + /* If interrupts are masked, we can't take an AST here */ + state = get_interrupts(); + if ((state & INTERRUPT_MASK) == 0) { + disable_interrupts_noread(); // Disable interrupts + + /* + * Reload cpu_data_ptr: a context switch would cause it to change. + * Now that interrupts are disabled, this will debounce false positives. + */ + cpu_data_ptr = os_atomic_load(&thread->machine.CpuDatap, compiler_acq_rel); + if (thread->machine.CpuDatap->cpu_pending_ast & AST_URGENT) { #if __arm__ #if __ARM_USER_PROTECT__ - uintptr_t up = arm_user_protect_begin(thread); + uintptr_t up = arm_user_protect_begin(thread); #endif // __ARM_USER_PROTECT__ - enable_fiq(); + enable_fiq(); #endif // __arm__ - ast_taken_kernel(); // Handle urgent AST + ast_taken_kernel(); // Handle urgent AST #if __arm__ #if __ARM_USER_PROTECT__ - arm_user_protect_end(thread, up, TRUE); + arm_user_protect_end(thread, up, TRUE); #endif // __ARM_USER_PROTECT__ - enable_interrupts(); - return; // Return early on arm only due to FIQ enabling + enable_interrupts(); + return; // Return early on arm only due to FIQ enabling #endif // __arm__ - } - restore_interrupts(state); // Enable interrupts - return; - -update_count: - ordered_store(&thread->machine.preemption_count, unsigned int, count); - return; -} - -int -get_preemption_level(void) -{ - return current_thread()->machine.preemption_count; -} - -#if __SMP__ -static unsigned int -hw_lock_bit_to_contended(hw_lock_bit_t *lock, uint32_t mask, uint32_t timeout LCK_GRP_ARG(lck_grp_t *grp)); -#endif - -static inline unsigned int -hw_lock_bit_to_internal(hw_lock_bit_t *lock, unsigned int bit, uint32_t timeout LCK_GRP_ARG(lck_grp_t *grp)) -{ - unsigned int success = 0; - uint32_t mask = (1 << bit); -#if !__SMP__ - uint32_t state; -#endif - -#if __SMP__ - if (__improbable(!atomic_test_and_set32(lock, mask, mask, memory_order_acquire, FALSE))) { - success = hw_lock_bit_to_contended(lock, mask, timeout LCK_GRP_ARG(grp)); - } else { - success = 1; - } -#else // __SMP__ - (void)timeout; - state = ordered_load_bit(lock); - if (!(mask & state)) { - ordered_store_bit(lock, state | mask); - success = 1; - } -#endif // __SMP__ - - if (success) { - lck_grp_spin_update_held(lock LCK_GRP_ARG(grp)); - } - - return success; -} - -unsigned -int -(hw_lock_bit_to)(hw_lock_bit_t * lock, unsigned int bit, uint32_t timeout LCK_GRP_ARG(lck_grp_t *grp)) -{ - _disable_preemption(); - return hw_lock_bit_to_internal(lock, bit, timeout LCK_GRP_ARG(grp)); -} - -#if __SMP__ -static unsigned int NOINLINE -hw_lock_bit_to_contended(hw_lock_bit_t *lock, uint32_t mask, uint32_t timeout LCK_GRP_ARG(lck_grp_t *grp)) -{ - uint64_t end = 0; - int i; -#if CONFIG_DTRACE || LOCK_STATS - uint64_t begin = 0; - boolean_t stat_enabled = lck_grp_spin_spin_enabled(lock LCK_GRP_ARG(grp)); -#endif /* CONFIG_DTRACE || LOCK_STATS */ - -#if LOCK_STATS || CONFIG_DTRACE - if (__improbable(stat_enabled)) { - begin = mach_absolute_time(); - } -#endif /* LOCK_STATS || CONFIG_DTRACE */ - for (;;) { - for (i = 0; i < LOCK_SNOOP_SPINS; i++) { - // Always load-exclusive before wfe - // This grabs the monitor and wakes up on a release event - if (atomic_test_and_set32(lock, mask, mask, memory_order_acquire, TRUE)) { - goto end; - } - } - if (end == 0) { - end = ml_get_timebase() + timeout; - } else if (ml_get_timebase() >= end) { - break; } + restore_interrupts(state); // Enable interrupts } - return 0; -end: -#if CONFIG_DTRACE || LOCK_STATS - if (__improbable(stat_enabled)) { - lck_grp_spin_update_spin(lock LCK_GRP_ARG(grp), mach_absolute_time() - begin); - } - lck_grp_spin_update_miss(lock LCK_GRP_ARG(grp)); -#endif /* CONFIG_DTRACE || LCK_GRP_STAT */ - - return 1; } -#endif // __SMP__ void -(hw_lock_bit)(hw_lock_bit_t * lock, unsigned int bit LCK_GRP_ARG(lck_grp_t *grp)) -{ - if (hw_lock_bit_to(lock, bit, LOCK_PANIC_TIMEOUT, LCK_GRP_PROBEARG(grp))) { - return; - } -#if __SMP__ - panic("hw_lock_bit(): timed out (%p)", lock); -#else - panic("hw_lock_bit(): interlock held (%p)", lock); -#endif -} - -void -(hw_lock_bit_nopreempt)(hw_lock_bit_t * lock, unsigned int bit LCK_GRP_ARG(lck_grp_t *grp)) -{ - if (__improbable(get_preemption_level() == 0)) { - panic("Attempt to take no-preempt bitlock %p in preemptible context", lock); - } - if (hw_lock_bit_to_internal(lock, bit, LOCK_PANIC_TIMEOUT LCK_GRP_ARG(grp))) { - return; - } -#if __SMP__ - panic("hw_lock_bit_nopreempt(): timed out (%p)", lock); -#else - panic("hw_lock_bit_nopreempt(): interlock held (%p)", lock); -#endif -} - -unsigned -int -(hw_lock_bit_try)(hw_lock_bit_t * lock, unsigned int bit LCK_GRP_ARG(lck_grp_t *grp)) +_enable_preemption(void) { - uint32_t mask = (1 << bit); -#if !__SMP__ - uint32_t state; -#endif - boolean_t success = FALSE; + thread_t thread = current_thread(); + unsigned int count = thread->machine.preemption_count; - _disable_preemption(); -#if __SMP__ - // TODO: consider weak (non-looping) atomic test-and-set - success = atomic_test_and_set32(lock, mask, mask, memory_order_acquire, FALSE); -#else - state = ordered_load_bit(lock); - if (!(mask & state)) { - ordered_store_bit(lock, state | mask); - success = TRUE; - } -#endif // __SMP__ - if (!success) { - _enable_preemption(); + if (__improbable(count == 0)) { + panic("Preemption count underflow"); } + count -= 1; - if (success) { - lck_grp_spin_update_held(lock LCK_GRP_ARG(grp)); + os_atomic_store(&thread->machine.preemption_count, count, compiler_acq_rel); + if (count == 0) { + kernel_preempt_check(thread); } - - return success; -} - -static inline void -hw_unlock_bit_internal(hw_lock_bit_t *lock, unsigned int bit) -{ - uint32_t mask = (1 << bit); -#if !__SMP__ - uint32_t state; -#endif - -#if __SMP__ - __c11_atomic_fetch_and((_Atomic uint32_t *)lock, ~mask, memory_order_release); - set_event(); -#else // __SMP__ - state = ordered_load_bit(lock); - ordered_store_bit(lock, state & ~mask); -#endif // __SMP__ -#if CONFIG_DTRACE - LOCKSTAT_RECORD(LS_LCK_SPIN_UNLOCK_RELEASE, lock, bit); -#endif -} - -/* - * Routine: hw_unlock_bit - * - * Release spin-lock. The second parameter is the bit number to test and set. - * Decrement the preemption level. - */ -void -hw_unlock_bit(hw_lock_bit_t * lock, unsigned int bit) -{ - hw_unlock_bit_internal(lock, bit); - _enable_preemption(); } -void -hw_unlock_bit_nopreempt(hw_lock_bit_t * lock, unsigned int bit) +int +get_preemption_level(void) { - if (__improbable(get_preemption_level() == 0)) { - panic("Attempt to release no-preempt bitlock %p in preemptible context", lock); - } - hw_unlock_bit_internal(lock, bit); + return current_thread()->machine.preemption_count; } #if __SMP__ @@ -618,11 +492,12 @@ lck_spin_init( lck_grp_t * grp, __unused lck_attr_t * attr) { - hw_lock_init(&lck->hwlock); lck->type = LCK_SPIN_TYPE; - lck_grp_reference(grp); - lck_grp_lckcnt_incr(grp, LCK_TYPE_SPIN); - store_memory_barrier(); + hw_lock_init(&lck->hwlock); + if (grp) { + lck_grp_reference(grp); + lck_grp_lckcnt_incr(grp, LCK_TYPE_SPIN); + } } /* @@ -633,7 +508,6 @@ arm_usimple_lock_init(simple_lock_t lck, __unused unsigned short initial_value) { lck->type = LCK_SPIN_TYPE; hw_lock_init(&lck->hwlock); - store_memory_barrier(); } @@ -767,8 +641,10 @@ lck_spin_destroy( return; } lck->lck_spin_data = LCK_SPIN_TAG_DESTROYED; - lck_grp_lckcnt_decr(grp, LCK_TYPE_SPIN); - lck_grp_deallocate(grp); + if (grp) { + lck_grp_lckcnt_decr(grp, LCK_TYPE_SPIN); + lck_grp_deallocate(grp); + } } /* @@ -794,12 +670,7 @@ usimple_lock_init( usimple_lock_t l, unsigned short tag) { -#ifndef MACHINE_SIMPLE_LOCK - USLDBG(usld_lock_init(l, tag)); - hw_lock_init(&l->lck_spin_data); -#else simple_lock_init((simple_lock_t) l, tag); -#endif } @@ -815,21 +686,7 @@ void usimple_lock_t l LCK_GRP_ARG(lck_grp_t *grp)) { -#ifndef MACHINE_SIMPLE_LOCK - pc_t pc; - - OBTAIN_PC(pc, l); - USLDBG(usld_lock_pre(l, pc)); - - if (!hw_lock_to(&l->lck_spin_data, LockTimeOut, LCK_GRP_ARG(grp))) { /* Try to get the lock - * with a timeout */ - panic("simple lock deadlock detection - l=%p, cpu=%d, ret=%p", &l, cpu_number(), pc); - } - - USLDBG(usld_lock_post(l, pc)); -#else simple_lock((simple_lock_t) l, LCK_GRP_PROBEARG(grp)); -#endif } @@ -846,16 +703,7 @@ void (usimple_unlock)( usimple_lock_t l) { -#ifndef MACHINE_SIMPLE_LOCK - pc_t pc; - - OBTAIN_PC(pc, l); - USLDBG(usld_unlock(l, pc)); - sync(); - hw_lock_unlock(&l->lck_spin_data); -#else simple_unlock((simple_lock_t)l); -#endif } @@ -877,299 +725,9 @@ int usimple_lock_t l LCK_GRP_ARG(lck_grp_t *grp)) { -#ifndef MACHINE_SIMPLE_LOCK - pc_t pc; - unsigned int success; - - OBTAIN_PC(pc, l); - USLDBG(usld_lock_try_pre(l, pc)); - if ((success = hw_lock_try(&l->lck_spin_data LCK_GRP_ARG(grp)))) { - USLDBG(usld_lock_try_post(l, pc)); - } - return success; -#else return simple_lock_try((simple_lock_t) l, grp); -#endif -} - -#if USLOCK_DEBUG -/* - * States of a usimple_lock. The default when initializing - * a usimple_lock is setting it up for debug checking. - */ -#define USLOCK_CHECKED 0x0001 /* lock is being checked */ -#define USLOCK_TAKEN 0x0002 /* lock has been taken */ -#define USLOCK_INIT 0xBAA0 /* lock has been initialized */ -#define USLOCK_INITIALIZED (USLOCK_INIT|USLOCK_CHECKED) -#define USLOCK_CHECKING(l) (uslock_check && \ - ((l)->debug.state & USLOCK_CHECKED)) - -/* - * Trace activities of a particularly interesting lock. - */ -void usl_trace(usimple_lock_t, int, pc_t, const char *); - - -/* - * Initialize the debugging information contained - * in a usimple_lock. - */ -void -usld_lock_init( - usimple_lock_t l, - __unused unsigned short tag) -{ - if (l == USIMPLE_LOCK_NULL) { - panic("lock initialization: null lock pointer"); - } - l->lock_type = USLOCK_TAG; - l->debug.state = uslock_check ? USLOCK_INITIALIZED : 0; - l->debug.lock_cpu = l->debug.unlock_cpu = 0; - l->debug.lock_pc = l->debug.unlock_pc = INVALID_PC; - l->debug.lock_thread = l->debug.unlock_thread = INVALID_THREAD; - l->debug.duration[0] = l->debug.duration[1] = 0; - l->debug.unlock_cpu = l->debug.unlock_cpu = 0; - l->debug.unlock_pc = l->debug.unlock_pc = INVALID_PC; - l->debug.unlock_thread = l->debug.unlock_thread = INVALID_THREAD; -} - - -/* - * These checks apply to all usimple_locks, not just - * those with USLOCK_CHECKED turned on. - */ -int -usld_lock_common_checks( - usimple_lock_t l, - const char *caller) -{ - if (l == USIMPLE_LOCK_NULL) { - panic("%s: null lock pointer", caller); - } - if (l->lock_type != USLOCK_TAG) { - panic("%s: 0x%x is not a usimple lock", caller, (integer_t) l); - } - if (!(l->debug.state & USLOCK_INIT)) { - panic("%s: 0x%x is not an initialized lock", - caller, (integer_t) l); - } - return USLOCK_CHECKING(l); -} - - -/* - * Debug checks on a usimple_lock just before attempting - * to acquire it. - */ -/* ARGSUSED */ -void -usld_lock_pre( - usimple_lock_t l, - pc_t pc) -{ - const char *caller = "usimple_lock"; - - - if (!usld_lock_common_checks(l, caller)) { - return; - } - - /* - * Note that we have a weird case where we are getting a lock when we are] - * in the process of putting the system to sleep. We are running with no - * current threads, therefore we can't tell if we are trying to retake a lock - * we have or someone on the other processor has it. Therefore we just - * ignore this test if the locking thread is 0. - */ - - if ((l->debug.state & USLOCK_TAKEN) && l->debug.lock_thread && - l->debug.lock_thread == (void *) current_thread()) { - printf("%s: lock 0x%x already locked (at %p) by", - caller, (integer_t) l, l->debug.lock_pc); - printf(" current thread %p (new attempt at pc %p)\n", - l->debug.lock_thread, pc); - panic("%s", caller); - } - mp_disable_preemption(); - usl_trace(l, cpu_number(), pc, caller); - mp_enable_preemption(); -} - - -/* - * Debug checks on a usimple_lock just after acquiring it. - * - * Pre-emption has been disabled at this point, - * so we are safe in using cpu_number. - */ -void -usld_lock_post( - usimple_lock_t l, - pc_t pc) -{ - int mycpu; - const char *caller = "successful usimple_lock"; - - - if (!usld_lock_common_checks(l, caller)) { - return; - } - - if (!((l->debug.state & ~USLOCK_TAKEN) == USLOCK_INITIALIZED)) { - panic("%s: lock 0x%x became uninitialized", - caller, (integer_t) l); - } - if ((l->debug.state & USLOCK_TAKEN)) { - panic("%s: lock 0x%x became TAKEN by someone else", - caller, (integer_t) l); - } - - mycpu = cpu_number(); - l->debug.lock_thread = (void *) current_thread(); - l->debug.state |= USLOCK_TAKEN; - l->debug.lock_pc = pc; - l->debug.lock_cpu = mycpu; - - usl_trace(l, mycpu, pc, caller); -} - - -/* - * Debug checks on a usimple_lock just before - * releasing it. Note that the caller has not - * yet released the hardware lock. - * - * Preemption is still disabled, so there's - * no problem using cpu_number. - */ -void -usld_unlock( - usimple_lock_t l, - pc_t pc) -{ - int mycpu; - const char *caller = "usimple_unlock"; - - - if (!usld_lock_common_checks(l, caller)) { - return; - } - - mycpu = cpu_number(); - - if (!(l->debug.state & USLOCK_TAKEN)) { - panic("%s: lock 0x%x hasn't been taken", - caller, (integer_t) l); - } - if (l->debug.lock_thread != (void *) current_thread()) { - panic("%s: unlocking lock 0x%x, owned by thread %p", - caller, (integer_t) l, l->debug.lock_thread); - } - if (l->debug.lock_cpu != mycpu) { - printf("%s: unlocking lock 0x%x on cpu 0x%x", - caller, (integer_t) l, mycpu); - printf(" (acquired on cpu 0x%x)\n", l->debug.lock_cpu); - panic("%s", caller); - } - usl_trace(l, mycpu, pc, caller); - - l->debug.unlock_thread = l->debug.lock_thread; - l->debug.lock_thread = INVALID_PC; - l->debug.state &= ~USLOCK_TAKEN; - l->debug.unlock_pc = pc; - l->debug.unlock_cpu = mycpu; } - -/* - * Debug checks on a usimple_lock just before - * attempting to acquire it. - * - * Preemption isn't guaranteed to be disabled. - */ -void -usld_lock_try_pre( - usimple_lock_t l, - pc_t pc) -{ - const char *caller = "usimple_lock_try"; - - if (!usld_lock_common_checks(l, caller)) { - return; - } - mp_disable_preemption(); - usl_trace(l, cpu_number(), pc, caller); - mp_enable_preemption(); -} - - -/* - * Debug checks on a usimple_lock just after - * successfully attempting to acquire it. - * - * Preemption has been disabled by the - * lock acquisition attempt, so it's safe - * to use cpu_number. - */ -void -usld_lock_try_post( - usimple_lock_t l, - pc_t pc) -{ - int mycpu; - const char *caller = "successful usimple_lock_try"; - - if (!usld_lock_common_checks(l, caller)) { - return; - } - - if (!((l->debug.state & ~USLOCK_TAKEN) == USLOCK_INITIALIZED)) { - panic("%s: lock 0x%x became uninitialized", - caller, (integer_t) l); - } - if ((l->debug.state & USLOCK_TAKEN)) { - panic("%s: lock 0x%x became TAKEN by someone else", - caller, (integer_t) l); - } - - mycpu = cpu_number(); - l->debug.lock_thread = (void *) current_thread(); - l->debug.state |= USLOCK_TAKEN; - l->debug.lock_pc = pc; - l->debug.lock_cpu = mycpu; - - usl_trace(l, mycpu, pc, caller); -} - - -/* - * For very special cases, set traced_lock to point to a - * specific lock of interest. The result is a series of - * XPRs showing lock operations on that lock. The lock_seq - * value is used to show the order of those operations. - */ -usimple_lock_t traced_lock; -unsigned int lock_seq; - -void -usl_trace( - usimple_lock_t l, - int mycpu, - pc_t pc, - const char *op_name) -{ - if (traced_lock == l) { - XPR(XPR_SLOCK, - "seq %d, cpu %d, %s @ %x\n", - (integer_t) lock_seq, (integer_t) mycpu, - (integer_t) op_name, (integer_t) pc, 0); - lock_seq++; - } -} - - -#endif /* USLOCK_DEBUG */ - /* * The C portion of the shared/exclusive locks package. */ @@ -1225,13 +783,13 @@ lck_rw_drain_status(lck_rw_t *lock, uint32_t status_mask, boolean_t wait __unuse if (wait) { wait_for_event(); } else { - clear_exclusive(); + os_atomic_clear_exclusive(); } if (!wait || (mach_absolute_time() >= deadline)) { return FALSE; } } - clear_exclusive(); + os_atomic_clear_exclusive(); return TRUE; #else uint32_t data; @@ -1259,7 +817,7 @@ lck_rw_interlock_spin(lck_rw_t *lock) if (data & LCK_RW_INTERLOCK) { wait_for_event(); } else { - clear_exclusive(); + os_atomic_clear_exclusive(); return; } } @@ -1495,6 +1053,8 @@ lck_rw_lock_shared(lck_rw_t *lock) /* * Routine: lck_rw_lock_shared_to_exclusive + * + * False returned upon failure, in this case the shared lock is dropped. */ boolean_t lck_rw_lock_shared_to_exclusive(lck_rw_t *lock) @@ -2505,7 +2065,6 @@ lck_mtx_init( { lck->lck_mtx_ptr = NULL; // Clear any padding in the union fields below lck->lck_mtx_waiters = 0; - lck->lck_mtx_pri = 0; lck->lck_mtx_type = LCK_MTX_TYPE; ordered_store_mtx(lck, 0); } @@ -2538,7 +2097,6 @@ lck_mtx_init_ext( lck->lck_mtx_type = LCK_MTX_TYPE; } else { lck->lck_mtx_waiters = 0; - lck->lck_mtx_pri = 0; lck->lck_mtx_type = LCK_MTX_TYPE; ordered_store_mtx(lck, 0); } @@ -2627,8 +2185,8 @@ lck_mtx_lock(lck_mtx_t *lock) lck_mtx_verify(lock); lck_mtx_check_preemption(lock); thread = current_thread(); - if (atomic_compare_exchange(&lock->lck_mtx_data, 0, LCK_MTX_THREAD_TO_STATE(thread), - memory_order_acquire_smp, FALSE)) { + if (os_atomic_cmpxchg(&lock->lck_mtx_data, + 0, LCK_MTX_THREAD_TO_STATE(thread), acquire)) { #if CONFIG_DTRACE LOCKSTAT_RECORD(LS_LCK_MTX_LOCK_ACQUIRE, lock, 0); #endif /* CONFIG_DTRACE */ @@ -2647,6 +2205,7 @@ lck_mtx_lock_contended(lck_mtx_t *lock, thread_t thread, boolean_t interlocked) uintptr_t state; int waiters = 0; spinwait_result_t sw_res; + struct turnstile *ts = NULL; /* Loop waiting until I see that the mutex is unowned */ for (;;) { @@ -2655,6 +2214,11 @@ lck_mtx_lock_contended(lck_mtx_t *lock, thread_t thread, boolean_t interlocked) switch (sw_res) { case SPINWAIT_ACQUIRED: + if (ts != NULL) { + interlock_lock(lock); + turnstile_complete((uintptr_t)lock, NULL, NULL, TURNSTILE_KERNEL_MUTEX); + interlock_unlock(lock); + } goto done; case SPINWAIT_INTERLOCK: goto set_owner; @@ -2668,7 +2232,7 @@ lck_mtx_lock_contended(lck_mtx_t *lock, thread_t thread, boolean_t interlocked) break; } ordered_store_mtx(lock, (state | LCK_ILOCK | ARM_LCK_WAITERS)); // Set waiters bit and wait - lck_mtx_lock_wait(lock, holding_thread); + lck_mtx_lock_wait(lock, holding_thread, &ts); /* returns interlock unlocked */ } @@ -2678,7 +2242,15 @@ set_owner: if (state & ARM_LCK_WAITERS) { /* Skip lck_mtx_lock_acquire if there are no waiters. */ - waiters = lck_mtx_lock_acquire(lock); + waiters = lck_mtx_lock_acquire(lock, ts); + /* + * lck_mtx_lock_acquire will call + * turnstile_complete + */ + } else { + if (ts != NULL) { + turnstile_complete((uintptr_t)lock, NULL, NULL, TURNSTILE_KERNEL_MUTEX); + } } state = LCK_MTX_THREAD_TO_STATE(thread); @@ -2697,6 +2269,12 @@ set_owner: done: load_memory_barrier(); + assert(thread->turnstile != NULL); + + if (ts != NULL) { + turnstile_cleanup(); + } + #if CONFIG_DTRACE LOCKSTAT_RECORD(LS_LCK_MTX_LOCK_ACQUIRE, lock, 0); #endif /* CONFIG_DTRACE */ @@ -2918,8 +2496,8 @@ lck_mtx_try_lock(lck_mtx_t *lock) thread_t thread = current_thread(); lck_mtx_verify(lock); - if (atomic_compare_exchange(&lock->lck_mtx_data, 0, LCK_MTX_THREAD_TO_STATE(thread), - memory_order_acquire_smp, FALSE)) { + if (os_atomic_cmpxchg(&lock->lck_mtx_data, + 0, LCK_MTX_THREAD_TO_STATE(thread), acquire)) { #if CONFIG_DTRACE LOCKSTAT_RECORD(LS_LCK_MTX_TRY_LOCK_ACQUIRE, lock, 0); #endif /* CONFIG_DTRACE */ @@ -2957,7 +2535,7 @@ lck_mtx_try_lock_contended(lck_mtx_t *lock, thread_t thread) state |= LCK_ILOCK; ordered_store_mtx(lock, state); #endif // __SMP__ - waiters = lck_mtx_lock_acquire(lock); + waiters = lck_mtx_lock_acquire(lock, NULL); state = LCK_MTX_THREAD_TO_STATE(thread); if (waiters != 0) { state |= ARM_LCK_WAITERS; @@ -2971,6 +2549,9 @@ lck_mtx_try_lock_contended(lck_mtx_t *lock, thread_t thread) enable_preemption(); #endif load_memory_barrier(); + + turnstile_cleanup(); + return TRUE; } @@ -3046,8 +2627,8 @@ lck_mtx_unlock(lck_mtx_t *lock) goto slow_case; } // Locked as a mutex - if (atomic_compare_exchange(&lock->lck_mtx_data, LCK_MTX_THREAD_TO_STATE(thread), 0, - memory_order_release_smp, FALSE)) { + if (os_atomic_cmpxchg(&lock->lck_mtx_data, + LCK_MTX_THREAD_TO_STATE(thread), 0, release)) { #if CONFIG_DTRACE LOCKSTAT_RECORD(LS_LCK_MTX_UNLOCK_RELEASE, lock, 0); #endif /* CONFIG_DTRACE */ @@ -3061,6 +2642,7 @@ static void NOINLINE lck_mtx_unlock_contended(lck_mtx_t *lock, thread_t thread, boolean_t ilk_held) { uintptr_t state; + boolean_t cleanup = FALSE; if (ilk_held) { state = ordered_load_mtx(lock); @@ -3084,13 +2666,17 @@ lck_mtx_unlock_contended(lck_mtx_t *lock, thread_t thread, boolean_t ilk_held) ordered_store_mtx(lock, state); #endif if (state & ARM_LCK_WAITERS) { - lck_mtx_unlock_wakeup(lock, thread); - state = ordered_load_mtx(lock); - } else { - assertf(lock->lck_mtx_pri == 0, "pri=0x%x", lock->lck_mtx_pri); + if (lck_mtx_unlock_wakeup(lock, thread)) { + state = ARM_LCK_WAITERS; + } else { + state = 0; + } + cleanup = TRUE; + goto unlock; } } state &= ARM_LCK_WAITERS; /* Clear state, retain waiters bit */ +unlock: #if __SMP__ state |= LCK_ILOCK; ordered_store_mtx(lock, state); @@ -3099,6 +2685,16 @@ lck_mtx_unlock_contended(lck_mtx_t *lock, thread_t thread, boolean_t ilk_held) ordered_store_mtx(lock, state); enable_preemption(); #endif + if (cleanup) { + /* + * Do not do any turnstile operations outside of this block. + * lock/unlock is called at early stage of boot with single thread, + * when turnstile is not yet initialized. + * Even without contention we can come throught the slow path + * if the mutex is acquired as a spin lock. + */ + turnstile_cleanup(); + } #if CONFIG_DTRACE LOCKSTAT_RECORD(LS_LCK_MTX_UNLOCK_RELEASE, lock, 0); @@ -3165,7 +2761,7 @@ lck_mtx_convert_spin(lck_mtx_t *lock) } state &= ~(LCK_MTX_THREAD_MASK); // Clear the spin tag ordered_store_mtx(lock, state); - waiters = lck_mtx_lock_acquire(lock); // Acquire to manage priority boosts + waiters = lck_mtx_lock_acquire(lock, NULL); // Acquire to manage priority boosts state = LCK_MTX_THREAD_TO_STATE(thread); if (waiters != 0) { state |= ARM_LCK_WAITERS; @@ -3178,6 +2774,7 @@ lck_mtx_convert_spin(lck_mtx_t *lock) ordered_store_mtx(lock, state); // Set ownership enable_preemption(); #endif + turnstile_cleanup(); } @@ -3232,13 +2829,8 @@ lck_spin_assert(lck_spin_t *lock, unsigned int type) if (holder != 0) { if (holder == thread) { panic("Lock owned by current thread %p = %lx", lock, state); - } else { - panic("Lock %p owned by thread %p", lock, holder); } } - if (state & LCK_ILOCK) { - panic("Lock bit set %p = %lx", lock, state); - } } else { panic("lck_spin_assert(): invalid arg (%u)", type); } diff --git a/osfmk/arm/locore.s b/osfmk/arm/locore.s index ce41150c1..1a544b0d8 100644 --- a/osfmk/arm/locore.s +++ b/osfmk/arm/locore.s @@ -720,11 +720,6 @@ icache_invalidate_trap: dsb ish isb #endif - mov r4, r0 - mov r5, r1 - bl EXT(CleanPoU_DcacheRegion) - mov r0, r4 - mov r1, r5 bl EXT(InvalidatePoU_IcacheRegion) mrc p15, 0, r9, c13, c0, 4 // Reload r9 from TPIDRPRW #if __ARM_USER_PROTECT__ @@ -1354,15 +1349,14 @@ fleh_irq_handler: mrc p15, 0, r9, c13, c0, 4 // Reload r9 from TPIDRPRW bl EXT(ml_get_timebase) // get current timebase LOAD_ADDR(r3, EntropyData) - ldr r2, [r3, ENTROPY_INDEX_PTR] - add r1, r3, ENTROPY_DATA_SIZE - add r2, r2, #4 - cmp r2, r1 - addge r2, r3, ENTROPY_BUFFER - ldr r4, [r2] - eor r0, r0, r4, ROR #9 - str r0, [r2] // Update gEntropie - str r2, [r3, ENTROPY_INDEX_PTR] + ldr r2, [r3, ENTROPY_SAMPLE_COUNT] + add r1, r2, 1 + str r1, [r3, ENTROPY_SAMPLE_COUNT] + and r2, r2, ENTROPY_BUFFER_INDEX_MASK + add r1, r3, ENTROPY_BUFFER + ldr r4, [r1, r2, lsl #2] + eor r0, r0, r4, ror #9 + str r0, [r1, r2, lsl #2] // Update gEntropie return_from_irq: mov r5, #0 diff --git a/osfmk/arm/loose_ends.c b/osfmk/arm/loose_ends.c index 883f8a1ea..cd9ff9021 100644 --- a/osfmk/arm/loose_ends.c +++ b/osfmk/arm/loose_ends.c @@ -53,6 +53,12 @@ #define INT_SIZE (BYTE_SIZE * sizeof (int)) +/* machine_routines_asm.s calls these */ +extern int copyin_validate(const user_addr_t, uintptr_t, vm_size_t); +extern int copyin_user_validate(const user_addr_t, uintptr_t, vm_size_t); +extern int copyout_validate(uintptr_t, const user_addr_t, vm_size_t); +extern int copyio_user_validate(int, int, user_addr_t, vm_size_t); +extern int copyoutstr_prevalidate(const void *, user_addr_t, size_t); void bcopy_phys(addr64_t src, addr64_t dst, vm_size_t bytes) @@ -572,6 +578,36 @@ copypv(addr64_t source, addr64_t sink, unsigned int size, int which) */ const int copysize_limit_panic = (64 * 1024 * 1024); +static inline bool +is_kernel_to_kernel_copy() +{ + return current_thread()->map->pmap == kernel_pmap; +} + +static int +copy_validate_user(const user_addr_t user_addr, vm_size_t nbytes, bool kern_to_kern_allowed) +{ + user_addr_t user_addr_last = user_addr + nbytes; + thread_t self = current_thread(); + + if (__improbable(!kern_to_kern_allowed && is_kernel_to_kernel_copy())) { + return EFAULT; + } + + if (__improbable((user_addr_last < user_addr) || + ((user_addr + nbytes) > vm_map_max(self->map)) || + (user_addr < vm_map_min(self->map)))) { + return EFAULT; + } + + if (__improbable(nbytes > copysize_limit_panic)) { + panic("%s(%p, ..., %u) - transfer too large", __func__, + (void *)user_addr, nbytes); + } + + return 0; +} + /* * Validate the arguments to copy{in,out} on this platform. * @@ -581,7 +617,7 @@ const int copysize_limit_panic = (64 * 1024 * 1024); */ static int copy_validate(const user_addr_t user_addr, - uintptr_t kernel_addr, vm_size_t nbytes) + uintptr_t kernel_addr, vm_size_t nbytes, bool kern_to_kern_allowed) { uintptr_t kernel_addr_last = kernel_addr + nbytes; @@ -593,31 +629,42 @@ copy_validate(const user_addr_t user_addr, (void *)user_addr, (void *)kernel_addr, nbytes); } - user_addr_t user_addr_last = user_addr + nbytes; + return copy_validate_user(user_addr, nbytes, kern_to_kern_allowed); +} - if (__improbable((user_addr_last < user_addr) || ((user_addr + nbytes) > vm_map_max(current_thread()->map)) || - (user_addr < vm_map_min(current_thread()->map)))) { - return EFAULT; - } +int +copyin_validate(const user_addr_t ua, uintptr_t ka, vm_size_t nbytes) +{ + return copy_validate(ua, ka, nbytes, true); +} - if (__improbable(nbytes > copysize_limit_panic)) { - panic("%s(%p, %p, %u) - transfer too large", __func__, - (void *)user_addr, (void *)kernel_addr, nbytes); - } +int +copyin_user_validate(const user_addr_t ua, uintptr_t ka, vm_size_t nbytes) +{ + return copy_validate(ua, ka, nbytes, false); +} - return 0; +int +copyout_validate(uintptr_t ka, const user_addr_t ua, vm_size_t nbytes) +{ + return copy_validate(ua, ka, nbytes, true); } int -copyin_validate(const user_addr_t ua, uintptr_t ka, vm_size_t nbytes) +copyio_user_validate(int a __unused, int b __unused, + user_addr_t ua, vm_size_t nbytes) { - return copy_validate(ua, ka, nbytes); + return copy_validate_user(ua, nbytes, false); } int -copyout_validate(uintptr_t ka, const user_addr_t ua, vm_size_t nbytes) +copyoutstr_prevalidate(const void *__unused kaddr, user_addr_t __unused uaddr, size_t __unused len) { - return copy_validate(ua, ka, nbytes); + if (__improbable(is_kernel_to_kernel_copy())) { + return EFAULT; + } + + return 0; } #if MACH_ASSERT diff --git a/osfmk/arm/machine_cpuid.c b/osfmk/arm/machine_cpuid.c index b79253632..a29074a2c 100644 --- a/osfmk/arm/machine_cpuid.c +++ b/osfmk/arm/machine_cpuid.c @@ -147,6 +147,9 @@ machine_do_mvfpid() #else cpuid_mvfp_info.neon = 1; cpuid_mvfp_info.neon_hpfp = 1; +#if defined(__ARM_ARCH_8_2__) + cpuid_mvfp_info.neon_fp16 = 1; +#endif /* defined(__ARM_ARCH_8_2__) */ #endif /* __arm__ */ } diff --git a/osfmk/arm/machine_routines.c b/osfmk/arm/machine_routines.c index f201ddcc8..df89b7500 100644 --- a/osfmk/arm/machine_routines.c +++ b/osfmk/arm/machine_routines.c @@ -50,6 +50,7 @@ #include #include #include +#include #include @@ -69,7 +70,9 @@ uint64_t TLockTimeOut; uint64_t MutexSpin; boolean_t is_clock_configured = FALSE; +#if CONFIG_NONFATAL_ASSERTS extern int mach_assert; +#endif extern volatile uint32_t debug_enabled; void machine_conf(void); @@ -79,7 +82,9 @@ machine_startup(__unused boot_args * args) { int boot_arg; +#if CONFIG_NONFATAL_ASSERTS PE_parse_boot_argn("assert", &mach_assert, sizeof(mach_assert)); +#endif if (PE_parse_boot_argn("preempt", &boot_arg, sizeof(boot_arg))) { default_preemption_rate = boot_arg; @@ -222,8 +227,8 @@ ml_init_lock_timeout(void) void ml_cpu_up(void) { - hw_atomic_add(&machine_info.physical_cpu, 1); - hw_atomic_add(&machine_info.logical_cpu, 1); + os_atomic_inc(&machine_info.physical_cpu, relaxed); + os_atomic_inc(&machine_info.logical_cpu, relaxed); } /* @@ -235,8 +240,8 @@ ml_cpu_down(void) { cpu_data_t *cpu_data_ptr; - hw_atomic_sub(&machine_info.physical_cpu, 1); - hw_atomic_sub(&machine_info.logical_cpu, 1); + os_atomic_dec(&machine_info.physical_cpu, relaxed); + os_atomic_dec(&machine_info.logical_cpu, relaxed); /* * If we want to deal with outstanding IPIs, we need to @@ -617,7 +622,7 @@ ml_processor_register(ml_processor_info_t *in_processor_info, #endif if (!is_boot_cpu) { - early_random_cpu_init(this_cpu_datap->cpu_number); + random_cpu_init(this_cpu_datap->cpu_number); } return KERN_SUCCESS; @@ -693,6 +698,16 @@ ml_io_map( return io_map(phys_addr, size, VM_WIMG_IO); } +/* Map memory map IO space (with protections specified) */ +vm_offset_t +ml_io_map_with_prot( + vm_offset_t phys_addr, + vm_size_t size, + vm_prot_t prot) +{ + return io_map_with_prot(phys_addr, size, VM_WIMG_IO, prot); +} + vm_offset_t ml_io_map_wcomb( vm_offset_t phys_addr, @@ -728,12 +743,28 @@ vm_offset_t ml_static_vtop( vm_offset_t vaddr) { - if (((vm_address_t)(vaddr) - gVirtBase) >= gPhysSize) { - panic("ml_static_ptovirt(): illegal vaddr: %p\n", (void*)vaddr); - } + assertf(((vm_address_t)(vaddr) - gVirtBase) < gPhysSize, "%s: illegal vaddr: %p", __func__, (void*)vaddr); return (vm_address_t)(vaddr) - gVirtBase + gPhysBase; } +/* + * Return the maximum contiguous KVA range that can be accessed from this + * physical address. For arm64, we employ a segmented physical aperture + * relocation table which can limit the available range for a given PA to + * something less than the extent of physical memory. But here, we still + * have a flat physical aperture, so no such requirement exists. + */ +vm_map_address_t +phystokv_range(pmap_paddr_t pa, vm_size_t *max_len) +{ + vm_size_t len = gPhysSize - (pa - gPhysBase); + if (*max_len > len) { + *max_len = len; + } + assertf((pa - gPhysBase) < gPhysSize, "%s: illegal PA: 0x%lx", __func__, (unsigned long)pa); + return pa - gPhysBase + gVirtBase; +} + vm_offset_t ml_static_slide( vm_offset_t vaddr) @@ -811,9 +842,6 @@ ml_static_protect( ptmp = (ptmp & ~(ARM_PTE_APMASK | ARM_PTE_NX_MASK)) | arm_prot; *pte_p = ptmp; -#ifndef __ARM_L1_PTW__ - FlushPoC_DcacheRegion((vm_offset_t) pte_p, sizeof(*pte_p)); -#endif } } @@ -1142,13 +1170,13 @@ user_cont_hwclock_allowed(void) return FALSE; } -boolean_t -user_timebase_allowed(void) +uint8_t +user_timebase_type(void) { #if __ARM_TIME__ - return TRUE; + return USER_TIMEBASE_SPEC; #else - return FALSE; + return USER_TIMEBASE_NONE; #endif } @@ -1156,7 +1184,7 @@ user_timebase_allowed(void) * The following are required for parts of the kernel * that cannot resolve these functions as inlines: */ -extern thread_t current_act(void); +extern thread_t current_act(void) __attribute__((const)); thread_t current_act(void) { @@ -1164,7 +1192,7 @@ current_act(void) } #undef current_thread -extern thread_t current_thread(void); +extern thread_t current_thread(void) __attribute__((const)); thread_t current_thread(void) { diff --git a/osfmk/arm/machine_routines.h b/osfmk/arm/machine_routines.h index 545403eee..db581e897 100644 --- a/osfmk/arm/machine_routines.h +++ b/osfmk/arm/machine_routines.h @@ -446,6 +446,11 @@ vm_offset_t ml_io_map_wcomb( vm_offset_t phys_addr, vm_size_t size); +vm_offset_t ml_io_map_with_prot( + vm_offset_t phys_addr, + vm_size_t size, + vm_prot_t prot); + void ml_get_bouncepool_info( vm_offset_t *phys_addr, vm_size_t *size); @@ -514,6 +519,17 @@ void bzero_phys( void bzero_phys_nc(addr64_t src64, vm_size_t bytes); +#if MACH_KERNEL_PRIVATE +#ifdef __arm64__ +/* Pattern-fill buffer with zeros or a 32-bit pattern; + * target must be 128-byte aligned and sized a multiple of 128 + * Both variants emit stores with non-temporal properties. + */ +void fill32_dczva(addr64_t, vm_size_t); +void fill32_nt(addr64_t, vm_size_t, uint32_t); +#endif +#endif + void ml_thread_policy( thread_t thread, unsigned policy_id, @@ -556,6 +572,14 @@ extern uint64_t ml_get_conttime_wake_time(void); /* Time since the system was reset (as part of boot/wake) */ uint64_t ml_get_time_since_reset(void); +/* + * Called by ApplePMGR to set wake time. Units and epoch are identical + * to mach_continuous_time(). Has no effect on !HAS_CONTINUOUS_HWCLOCK + * chips. If wake_time == UINT64_MAX, that means the wake time is + * unknown and calls to ml_get_time_since_reset() will return UINT64_MAX. + */ +void ml_set_reset_time(uint64_t wake_time); + #ifdef XNU_KERNEL_PRIVATE /* Just a stub on ARM */ extern kern_return_t ml_interrupt_prewarm(uint64_t deadline); @@ -608,6 +632,8 @@ extern int be_tracing(void); typedef void (*broadcastFunc) (void *); unsigned int cpu_broadcast_xcall(uint32_t *, boolean_t, broadcastFunc, void *); kern_return_t cpu_xcall(int, broadcastFunc, void *); +unsigned int cpu_broadcast_immediate_xcall(uint32_t *, boolean_t, broadcastFunc, void *); +kern_return_t cpu_immediate_xcall(int, broadcastFunc, void *); #ifdef KERNEL_PRIVATE @@ -932,6 +958,22 @@ typedef enum perfcontrol_callout_stat { uint64_t perfcontrol_callout_stat_avg(perfcontrol_callout_type_t type, perfcontrol_callout_stat_t stat); +#if defined(HAS_APPLE_PAC) +#define ONES(x) (BIT((x))-1) +#define PTR_MASK ONES(64-T1SZ_BOOT) +#define PAC_MASK ~PTR_MASK +#define SIGN(p) ((p) & BIT(55)) +#define UNSIGN_PTR(p) \ + SIGN(p) ? ((p) | PAC_MASK) : ((p) & ~PAC_MASK) + +void ml_task_set_rop_pid(task_t task, task_t parent_task, boolean_t inherit); +void ml_task_set_disable_user_jop(task_t task, boolean_t disable_user_jop); +void ml_thread_set_disable_user_jop(thread_t thread, boolean_t disable_user_jop); +void ml_set_kernelkey_enabled(boolean_t enable); +void *ml_auth_ptr_unchecked(void *ptr, unsigned key, uint64_t modifier); +#endif /* defined(HAS_APPLE_PAC) */ + + #endif /* KERNEL_PRIVATE */ @@ -940,7 +982,7 @@ void ml_get_power_state(boolean_t *, boolean_t *); uint32_t get_arm_cpu_version(void); boolean_t user_cont_hwclock_allowed(void); -boolean_t user_timebase_allowed(void); +uint8_t user_timebase_type(void); boolean_t ml_thread_is64bit(thread_t thread); #ifdef __arm64__ diff --git a/osfmk/arm/machine_routines_asm.s b/osfmk/arm/machine_routines_asm.s index d175af88d..7b7f41411 100644 --- a/osfmk/arm/machine_routines_asm.s +++ b/osfmk/arm/machine_routines_asm.s @@ -621,15 +621,21 @@ LEXT(set_context_id) isb bx lr -#define COPYIO_VALIDATE(NAME) \ - /* call NAME_validate to check the arguments */ ;\ - push {r0, r1, r2, r7, lr} ;\ - add r7, sp, #12 ;\ - blx EXT(NAME##_validate) ;\ - cmp r0, #0 ;\ - addne sp, #12 ;\ - popne {r7, pc} ;\ - pop {r0, r1, r2, r7, lr} ;\ +/* + * arg0: prefix of the external validator function (copyin or copyout) + * arg1: 0-based index of highest argument register that must be preserved + */ +.macro COPYIO_VALIDATE + /* call NAME_validate to check the arguments */ + push {r0-r$1, r7, lr} + add r7, sp, #(($1 + 1) * 4) + blx EXT($0_validate) + cmp r0, #0 + addne sp, #(($1 + 1) * 4) + popne {r7, pc} + pop {r0-r$1, r7, lr} +.endmacro + #define COPYIO_SET_RECOVER() \ /* set recovery address */ ;\ @@ -735,7 +741,7 @@ LEXT(copyinstr) moveq r12, #0 streq r12, [r3] bxeq lr - COPYIO_VALIDATE(copyin) + COPYIO_VALIDATE copyin_user, 3 stmfd sp!, { r4, r5, r6 } mov r6, r3 @@ -786,7 +792,7 @@ copyinstr_error: .globl EXT(copyin) LEXT(copyin) COPYIO_HEADER() - COPYIO_VALIDATE(copyin) + COPYIO_VALIDATE copyin, 2 COPYIO_TRY_KERNEL() COPYIO_SET_RECOVER() COPYIO_MAP_USER() @@ -803,7 +809,7 @@ LEXT(copyin) .globl EXT(copyout) LEXT(copyout) COPYIO_HEADER() - COPYIO_VALIDATE(copyout) + COPYIO_VALIDATE copyout, 2 COPYIO_TRY_KERNEL() COPYIO_SET_RECOVER() COPYIO_MAP_USER() @@ -814,34 +820,96 @@ LEXT(copyout) /* - * int copyin_word(const user_addr_t user_addr, uint64_t *kernel_addr, vm_size_t nbytes) + * int copyin_atomic32(const user_addr_t user_addr, uint32_t *kernel_addr) + * r0: user_addr + * r1: kernel_addr + */ + .text + .align 2 + .globl EXT(copyin_atomic32) +LEXT(copyin_atomic32) + tst r0, #3 // Test alignment of user address + bne 2f + + mov r2, #4 + COPYIO_VALIDATE copyin_user, 1 + COPYIO_SET_RECOVER() + COPYIO_MAP_USER() + + ldr r2, [r0] // Load word from user + str r2, [r1] // Store to kernel_addr + mov r0, #0 // Success + + COPYIO_UNMAP_USER() + COPYIO_RESTORE_RECOVER() + bx lr +2: // misaligned copyin + mov r0, #EINVAL + bx lr + +/* + * int copyin_atomic32_wait_if_equals(const char *src, uint32_t value) + * r0: user_addr + * r1: value + */ + .text + .align 2 + .globl EXT(copyin_atomic32_wait_if_equals) +LEXT(copyin_atomic32_wait_if_equals) + tst r0, #3 // Test alignment of user address + bne 2f + + mov r2, r0 + mov r3, #4 + COPYIO_VALIDATE copyio_user, 1 // validate user address (uses r2, r3) + COPYIO_SET_RECOVER() + COPYIO_MAP_USER() + + ldrex r2, [r0] + cmp r2, r1 + movne r0, ESTALE + bne 1f + mov r0, #0 + wfe +1: + clrex + + COPYIO_UNMAP_USER() + COPYIO_RESTORE_RECOVER() + bx lr +2: // misaligned copyin + mov r0, #EINVAL + bx lr + +/* + * int copyin_atomic64(const user_addr_t user_addr, uint64_t *kernel_addr) + * r0: user_addr + * r1: kernel_addr */ .text .align 2 - .globl EXT(copyin_word) -LEXT(copyin_word) - cmp r2, #4 // Test if size is 4 or 8 - cmpne r2, #8 - bne L_copyin_invalid - sub r3, r2, #1 - tst r0, r3 // Test alignment of user address - bne L_copyin_invalid - - COPYIO_VALIDATE(copyin) + .globl EXT(copyin_atomic64) +LEXT(copyin_atomic64) + tst r0, #7 // Test alignment of user address + bne 2f + + mov r2, #8 + COPYIO_VALIDATE copyin_user, 1 COPYIO_SET_RECOVER() COPYIO_MAP_USER() - mov r3, #0 // Clear high register - cmp r2, #4 // If size is 4 - ldreq r2, [r0] // Load word from user - ldrdne r2, r3, [r0] // Else Load double word from user +1: // ldrex/strex retry loop + ldrexd r2, r3, [r0] // Load double word from user + strexd r5, r2, r3, [r0] // (the COPYIO_*() macros make r5 safe to use as a scratch register here) + cmp r5, #0 + bne 1b stm r1, {r2, r3} // Store to kernel_addr mov r0, #0 // Success COPYIO_UNMAP_USER() COPYIO_RESTORE_RECOVER() bx lr -L_copyin_invalid: +2: // misaligned copyin mov r0, #EINVAL bx lr @@ -853,6 +921,69 @@ copyio_error: ldmfd sp!, { r4, r5, r6 } bx lr + +/* + * int copyout_atomic32(uint32_t value, user_addr_t user_addr) + * r0: value + * r1: user_addr + */ + .text + .align 2 + .globl EXT(copyout_atomic32) +LEXT(copyout_atomic32) + tst r1, #3 // Test alignment of user address + bne 2f + + mov r2, r1 + mov r3, #4 + COPYIO_VALIDATE copyio_user, 1 // validate user address (uses r2, r3) + COPYIO_SET_RECOVER() + COPYIO_MAP_USER() + + str r0, [r1] // Store word to user + mov r0, #0 // Success + + COPYIO_UNMAP_USER() + COPYIO_RESTORE_RECOVER() + bx lr +2: // misaligned copyout + mov r0, #EINVAL + bx lr + + +/* + * int copyout_atomic64(uint64_t value, user_addr_t user_addr) + * r0, r1: value + * r2: user_addr + */ + .text + .align 2 + .globl EXT(copyout_atomic64) +LEXT(copyout_atomic64) + tst r2, #7 // Test alignment of user address + bne 2f + + mov r3, #8 + COPYIO_VALIDATE copyio_user, 2 // validate user address (uses r2, r3) + COPYIO_SET_RECOVER() + COPYIO_MAP_USER() + +1: // ldrex/strex retry loop + ldrexd r4, r5, [r2] + strexd r3, r0, r1, [r2] // Atomically store double word to user + cmp r3, #0 + bne 1b + + mov r0, #0 // Success + + COPYIO_UNMAP_USER() + COPYIO_RESTORE_RECOVER() + bx lr +2: // misaligned copyout + mov r0, #EINVAL + bx lr + + /* * int copyin_kern(const user_addr_t user_addr, char *kernel_addr, vm_size_t nbytes) */ diff --git a/osfmk/arm/machine_routines_common.c b/osfmk/arm/machine_routines_common.c index 2cb596872..02f733910 100644 --- a/osfmk/arm/machine_routines_common.c +++ b/osfmk/arm/machine_routines_common.c @@ -40,6 +40,7 @@ #include #include #include +#include #include #if MONOTONIC @@ -262,13 +263,13 @@ perfcontrol_callout_counters_end(uint64_t *start_counters, { uint64_t end_counters[MT_CORE_NFIXED]; mt_fixed_counts(end_counters); - atomic_fetch_add_explicit(&perfcontrol_callout_stats[type][PERFCONTROL_STAT_CYCLES], - end_counters[MT_CORE_CYCLES] - start_counters[MT_CORE_CYCLES], memory_order_relaxed); + os_atomic_add(&perfcontrol_callout_stats[type][PERFCONTROL_STAT_CYCLES], + end_counters[MT_CORE_CYCLES] - start_counters[MT_CORE_CYCLES], relaxed); #ifdef MT_CORE_INSTRS - atomic_fetch_add_explicit(&perfcontrol_callout_stats[type][PERFCONTROL_STAT_INSTRS], - end_counters[MT_CORE_INSTRS] - start_counters[MT_CORE_INSTRS], memory_order_relaxed); + os_atomic_add(&perfcontrol_callout_stats[type][PERFCONTROL_STAT_INSTRS], + end_counters[MT_CORE_INSTRS] - start_counters[MT_CORE_INSTRS], relaxed); #endif /* defined(MT_CORE_INSTRS) */ - atomic_fetch_add_explicit(&perfcontrol_callout_count[type], 1, memory_order_relaxed); + os_atomic_inc(&perfcontrol_callout_count[type], relaxed); } #endif /* MONOTONIC */ @@ -279,7 +280,8 @@ perfcontrol_callout_stat_avg(perfcontrol_callout_type_t type, if (!perfcontrol_callout_stats_enabled) { return 0; } - return perfcontrol_callout_stats[type][stat] / perfcontrol_callout_count[type]; + return os_atomic_load_wide(&perfcontrol_callout_stats[type][stat], relaxed) / + os_atomic_load_wide(&perfcontrol_callout_count[type], relaxed); } void @@ -480,13 +482,16 @@ machine_perfcontrol_deadline_passed(uint64_t deadline) /* * ml_spin_debug_reset() * Reset the timestamp on a thread that has been unscheduled - * to avoid false alarms. Alarm will go off if interrupts are held + * to avoid false alarms. Alarm will go off if interrupts are held * disabled for too long, starting from now. + * + * Call ml_get_timebase() directly to prevent extra overhead on newer + * platforms that's enabled in DEVELOPMENT kernel configurations. */ void ml_spin_debug_reset(thread_t thread) { - thread->machine.intmask_timestamp = mach_absolute_time(); + thread->machine.intmask_timestamp = ml_get_timebase(); } /* @@ -519,7 +524,7 @@ ml_check_interrupts_disabled_duration(thread_t thread) start = thread->machine.intmask_timestamp; if (start != 0) { - now = mach_absolute_time(); + now = ml_get_timebase(); if ((now - start) > interrupt_masked_timeout * debug_cpu_performance_degradation_factor) { mach_timebase_info_data_t timebase; @@ -554,6 +559,7 @@ ml_set_interrupts_enabled(boolean_t enable) state = __builtin_arm_rsr("DAIF"); #endif if (enable && (state & INTERRUPT_MASK)) { + assert(getCpuDatap()->cpu_int_state == NULL); // Make sure we're not enabling interrupts from primary interrupt context #if INTERRUPT_MASKED_DEBUG if (interrupt_masked_debug) { // Interrupts are currently masked, we will enable them (after finishing this check) @@ -588,7 +594,7 @@ ml_set_interrupts_enabled(boolean_t enable) #if INTERRUPT_MASKED_DEBUG if (interrupt_masked_debug) { // Interrupts were enabled, we just masked them - current_thread()->machine.intmask_timestamp = mach_absolute_time(); + current_thread()->machine.intmask_timestamp = ml_get_timebase(); } #endif } @@ -690,6 +696,11 @@ ml_get_time_since_reset(void) return ml_get_hwclock(); } +void +ml_set_reset_time(__unused uint64_t wake_time) +{ +} + uint64_t ml_get_conttime_wake_time(void) { diff --git a/osfmk/arm/memory_types.h b/osfmk/arm/memory_types.h new file mode 100644 index 000000000..59458b67e --- /dev/null +++ b/osfmk/arm/memory_types.h @@ -0,0 +1,51 @@ +/* + * Copyright (c) 2018 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ +#ifndef _ARM_MEMORY_TYPES_H_ +#define _ARM_MEMORY_TYPES_H_ + +#include + +/* + * WIMG control + */ +#define VM_MEM_INNER 0x10 +#define VM_MEM_RT 0x10 // intentionally alias VM_MEM_INNER; will be used with mutually exclusive caching policies +#define VM_MEM_EARLY_ACK 0x20 + +#define VM_WIMG_DEFAULT (VM_MEM_COHERENT) // 0x2 +#define VM_WIMG_COPYBACK (VM_MEM_COHERENT) // 0x2 +#define VM_WIMG_INNERWBACK (VM_MEM_COHERENT | VM_MEM_INNER) // 0x12 +#define VM_WIMG_IO (VM_MEM_COHERENT | VM_MEM_NOT_CACHEABLE | VM_MEM_GUARDED) // 0x7 +#define VM_WIMG_POSTED (VM_MEM_COHERENT | VM_MEM_NOT_CACHEABLE | VM_MEM_GUARDED | VM_MEM_EARLY_ACK) // 0x27 +#define VM_WIMG_WTHRU (VM_MEM_WRITE_THROUGH | VM_MEM_COHERENT | VM_MEM_GUARDED) // 0xb +#define VM_WIMG_WCOMB (VM_MEM_NOT_CACHEABLE | VM_MEM_COHERENT) // 0x6 +#define VM_WIMG_RT (VM_WIMG_IO | VM_MEM_RT) // 0x17 +#define VM_WIMG_POSTED_REORDERED (VM_MEM_NOT_CACHEABLE | VM_MEM_COHERENT | VM_MEM_WRITE_THROUGH | VM_MEM_EARLY_ACK) // 0x2e +#define VM_WIMG_POSTED_COMBINED_REORDERED (VM_MEM_NOT_CACHEABLE | VM_MEM_COHERENT | VM_MEM_EARLY_ACK) // 0x26 + +#endif /* _ARM_MEMORY_TYPES_H_ */ diff --git a/osfmk/arm/misc_protos.h b/osfmk/arm/misc_protos.h index ca995fb37..3cd4964bc 100644 --- a/osfmk/arm/misc_protos.h +++ b/osfmk/arm/misc_protos.h @@ -44,24 +44,23 @@ extern void arm_vm_init(uint64_t memory_size, boot_args *args); extern void arm_vm_prot_init(boot_args *args); extern void arm_vm_prot_finalize(boot_args *args); - extern kern_return_t DebuggerXCallEnter(boolean_t); extern void DebuggerXCallReturn(void); #if __arm64__ && DEBUG extern void dump_kva_space(void); -#endif +#endif /* __arm64__ && DEBUG */ extern void Load_context(thread_t); extern void Idle_load_context(void) __attribute__((noreturn)); extern thread_t Switch_context(thread_t, thread_continue_t, thread_t); extern thread_t Shutdown_context(void (*doshutdown)(processor_t), processor_t processor); -extern void Call_continuation(thread_continue_t, void *, wait_result_t, boolean_t enable_interrupts); +extern void __dead2 Call_continuation(thread_continue_t, void *, wait_result_t, boolean_t enable_interrupts); + extern void DebuggerCall(unsigned int reason, void *ctx); extern void DebuggerXCall(void *ctx); -extern int _copyinstr(const user_addr_t user_addr, char *kernel_addr, vm_size_t max, vm_size_t *actual); extern int copyout_kern(const char *kernel_addr, user_addr_t user_addr, vm_size_t nbytes); extern int copyin_kern(const user_addr_t user_addr, char *kernel_addr, vm_size_t nbytes); @@ -85,12 +84,12 @@ extern int copyio_check_user_addr(user_addr_t user_addr, vm_size_t nbytes); /* Top-Byte-Ignore */ extern boolean_t user_tbi; -#define TBI_MASK 0xff00000000000000 -#define user_tbi_enabled() (user_tbi) -#define tbi_clear(addr) ((addr) & ~(TBI_MASK)) +#define TBI_MASK 0xff00000000000000 +#define user_tbi_enabled() (user_tbi) +#define tbi_clear(addr) ((addr) & ~(TBI_MASK)) -#else +#else /* !defined(__arm__) && !defined(__arm64__) */ #error Unknown architecture. -#endif +#endif /* defined(__arm__) */ #endif /* _ARM_MISC_PROTOS_H_ */ diff --git a/osfmk/arm/model_dep.c b/osfmk/arm/model_dep.c index f178db28e..42d753130 100644 --- a/osfmk/arm/model_dep.c +++ b/osfmk/arm/model_dep.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2007-2013 Apple Inc. All rights reserved. + * Copyright (c) 2007-2019 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -40,6 +40,9 @@ #include #include +#if defined(HAS_APPLE_PAC) +#include +#endif #include #include @@ -135,7 +138,7 @@ extern uint64_t last_hwaccess_thread; extern char gTargetTypeBuffer[8]; extern char gModelTypeBuffer[32]; -decl_simple_lock_data(extern, clock_lock) +decl_simple_lock_data(extern, clock_lock); extern struct timeval gIOLastSleepTime; extern struct timeval gIOLastWakeTime; extern boolean_t is_clock_configured; @@ -262,6 +265,10 @@ print_one_backtrace(pmap_t pmap, vm_offset_t topfp, const char *cur_marker, if (ppn != (ppnum_t)NULL) { if (is_64_bit) { lr = ml_phys_read_double_64(((((vm_offset_t)ppn) << PAGE_SHIFT)) | ((fp + FP_LR_OFFSET64) & PAGE_MASK)); +#if defined(HAS_APPLE_PAC) + /* return addresses on stack will be signed by arm64e ABI */ + lr = (addr64_t) ptrauth_strip((void *)lr, ptrauth_key_return_address); +#endif } else { lr = ml_phys_read_word(((((vm_offset_t)ppn) << PAGE_SHIFT)) | ((fp + FP_LR_OFFSET) & PAGE_MASK)); } @@ -309,8 +316,7 @@ print_one_backtrace(pmap_t pmap, vm_offset_t topfp, const char *cur_marker, extern void panic_print_vnodes(void); static void -do_print_all_backtraces( - const char *message) +do_print_all_backtraces(const char *message, uint64_t panic_options) { int logversion = PANICLOG_VERSION; thread_t cur_thread = current_thread(); @@ -337,7 +343,7 @@ do_print_all_backtraces( } panic_bt_depth++; - /* Truncate panic string to 1200 bytes -- WDT log can be ~1100 bytes */ + /* Truncate panic string to 1200 bytes */ paniclog_append_noflush("Debugger message: %.1200s\n", message); if (debug_enabled) { paniclog_append_noflush("Device: %s\n", @@ -437,8 +443,8 @@ do_print_all_backtraces( } #endif - // Just print threads with high CPU usage for WDT timeouts - if (strncmp(message, "WDT timeout", 11) == 0) { + // Highlight threads that used high amounts of CPU in the panic log if requested (historically requested for watchdog panics) + if (panic_options & DEBUGGER_OPTION_PRINT_CPU_USAGE_PANICLOG) { thread_t top_runnable[5] = {0}; thread_t thread; int total_cpu_usage = 0; @@ -483,7 +489,7 @@ do_print_all_backtraces( } } // Loop through highest priority runnable threads paniclog_append_noflush("\n"); - } // Check if message is "WDT timeout" + } // print current task info if (VALIDATE_PTR_LIST(cur_thread, cur_thread->task)) { @@ -557,7 +563,7 @@ do_print_all_backtraces( kdp_snapshot_preflight(-1, stackshot_begin_loc, bytes_remaining - end_marker_bytes, (STACKSHOT_GET_GLOBAL_MEM_STATS | STACKSHOT_SAVE_LOADINFO | STACKSHOT_KCDATA_FORMAT | STACKSHOT_ENABLE_BT_FAULTING | STACKSHOT_ENABLE_UUID_FAULTING | STACKSHOT_FROM_PANIC | - STACKSHOT_NO_IO_STATS | STACKSHOT_THREAD_WAITINFO), &kc_panic_data, 0); + STACKSHOT_NO_IO_STATS | STACKSHOT_THREAD_WAITINFO | STACKSHOT_COLLECT_SHAREDCACHE_LAYOUT), &kc_panic_data, 0); err = do_stackshot(NULL); bytes_traced = kdp_stack_snapshot_bytes_traced(); if (bytes_traced > 0 && !err) { @@ -605,7 +611,7 @@ do_print_all_backtraces( * Entry to print_all_backtraces is serialized by the debugger lock */ static void -print_all_backtraces(const char *message) +print_all_backtraces(const char *message, uint64_t panic_options) { unsigned int initial_not_in_kdp = not_in_kdp; @@ -620,7 +626,7 @@ print_all_backtraces(const char *message) * not_in_kdp. */ not_in_kdp = 0; - do_print_all_backtraces(message); + do_print_all_backtraces(message, panic_options); not_in_kdp = initial_not_in_kdp; @@ -663,10 +669,20 @@ panic_print_symbol_name(vm_address_t search) void SavePanicInfo( - const char *message, __unused void *panic_data, __unused uint64_t panic_options) + const char *message, __unused void *panic_data, uint64_t panic_options) { - /* This should be initialized by the time we get here */ - assert(panic_info->eph_panic_log_offset != 0); + /* + * This should be initialized by the time we get here, but + * if it is not, asserting about it will be of no use (it will + * come right back to here), so just loop right here and now. + * This prevents early-boot panics from becoming recursive and + * thus makes them easier to debug. If you attached to a device + * and see your PC here, look down a few frames to see your + * early-boot panic there. + */ + while (!panic_info || panic_info->eph_panic_log_offset == 0) { + ; + } if (panic_options & DEBUGGER_OPTION_PANICLOGANDREBOOT) { panic_info->eph_panic_flags |= EMBEDDED_PANIC_HEADER_FLAG_BUTTON_RESET_PANIC; @@ -699,7 +715,7 @@ SavePanicInfo( PanicInfoSaved = TRUE; - print_all_backtraces(message); + print_all_backtraces(message, panic_options); assert(panic_info->eph_panic_log_len != 0); panic_info->eph_other_log_len = PE_get_offset_into_panic_region(debug_buf_ptr) - panic_info->eph_other_log_offset; @@ -744,6 +760,20 @@ paniclog_flush() PE_sync_panic_buffers(); } +/* + * @function _was_in_userspace + * + * @abstract Unused function used to indicate that a CPU was in userspace + * before it was IPI'd to enter the Debugger context. + * + * @discussion This function should never actually be called. + */ +static void __attribute__((__noreturn__)) +_was_in_userspace(void) +{ + panic("%s: should not have been invoked.", __FUNCTION__); +} + /* * @function DebuggerXCallEnter * @@ -814,7 +844,7 @@ DebuggerXCallEnter( } if (KERN_SUCCESS == cpu_signal(target_cpu_datap, SIGPdebug, (void *)NULL, NULL)) { - (void)hw_atomic_add(&debugger_sync, 1); + os_atomic_inc(&debugger_sync, relaxed); } else { cpu_signal_failed = true; kprintf("cpu_signal failed in DebuggerXCallEnter\n"); @@ -951,16 +981,16 @@ DebuggerXCall( if (save_context) { /* Save the interrupted context before acknowledging the signal */ - *state = *regs; + copy_signed_thread_state(state, regs); } else if (regs) { /* zero old state so machine_trace_thread knows not to backtrace it */ set_saved_state_fp(state, 0); - set_saved_state_pc(state, 0); + set_saved_state_pc(state, (register_t)&_was_in_userspace); set_saved_state_lr(state, 0); set_saved_state_sp(state, 0); } - (void)hw_atomic_sub(&debugger_sync, 1); + os_atomic_dec(&debugger_sync, relaxed); __builtin_arm_dmb(DMB_ISH); while (mp_kdp_trap) { ; diff --git a/osfmk/arm/monotonic_arm.c b/osfmk/arm/monotonic_arm.c index 8ed24d4eb..b5825672b 100644 --- a/osfmk/arm/monotonic_arm.c +++ b/osfmk/arm/monotonic_arm.c @@ -27,6 +27,7 @@ */ #include +#include #include #include @@ -43,6 +44,12 @@ mt_core_snap(__unused unsigned int ctr) return 0; } +uint64_t +mt_count_pmis(void) +{ + return 0; +} + struct mt_cpu * mt_cur_cpu(void) { diff --git a/osfmk/arm/pal_routines.h b/osfmk/arm/pal_routines.h index c9056284e..76431251c 100644 --- a/osfmk/arm/pal_routines.h +++ b/osfmk/arm/pal_routines.h @@ -29,6 +29,7 @@ #define _ARM_PAL_ROUTINES_H #include +#include #if defined(__cplusplus) extern "C" { @@ -58,7 +59,7 @@ static inline void pal_get_resource_property(const char **property_name, int *property_value) { - *property_name = 0; + *property_name = NULL; (void) property_value; } diff --git a/osfmk/arm/pcb.c b/osfmk/arm/pcb.c index 2ec9f9dcb..c03e518b6 100644 --- a/osfmk/arm/pcb.c +++ b/osfmk/arm/pcb.c @@ -115,6 +115,12 @@ machine_switch_context( return retval; } +boolean_t +machine_thread_on_core(thread_t thread) +{ + return thread->machine.machine_thread_flags & MACHINE_THREAD_FLAGS_ON_CPU; +} + /* * Routine: machine_thread_create * @@ -143,7 +149,7 @@ machine_thread_create( struct pmap *new_pmap = vm_map_pmap(task->map); thread->machine.kptw_ttb = ((unsigned int) kernel_pmap->ttep) | TTBR_SETUP; - thread->machine.asid = new_pmap->asid; + thread->machine.asid = new_pmap->hw_asid; if (new_pmap->tte_index_max == NTTES) { thread->machine.uptw_ttc = 2; thread->machine.uptw_ttb = ((unsigned int) new_pmap->ttep) | TTBR_SETUP; diff --git a/osfmk/arm/pmap.c b/osfmk/arm/pmap.c index 8f33cff28..93921c0eb 100644 --- a/osfmk/arm/pmap.c +++ b/osfmk/arm/pmap.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2011-2018 Apple Inc. All rights reserved. + * Copyright (c) 2011-2019 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -37,14 +37,13 @@ #include #include +#include #include #include #include #include #include -#include #include -#include #include #include @@ -91,6 +90,428 @@ #include #include +#if defined(HAS_APPLE_PAC) +#include +#endif + +#define PMAP_TT_L0_LEVEL 0x0 +#define PMAP_TT_L1_LEVEL 0x1 +#define PMAP_TT_L2_LEVEL 0x2 +#define PMAP_TT_L3_LEVEL 0x3 +#if (__ARM_VMSA__ == 7) +#define PMAP_TT_MAX_LEVEL PMAP_TT_L2_LEVEL +#else +#define PMAP_TT_MAX_LEVEL PMAP_TT_L3_LEVEL +#endif +#define PMAP_TT_LEAF_LEVEL PMAP_TT_MAX_LEVEL +#define PMAP_TT_TWIG_LEVEL (PMAP_TT_MAX_LEVEL - 1) + +static bool alloc_asid(pmap_t pmap); +static void free_asid(pmap_t pmap); +static void flush_mmu_tlb_region_asid_async(vm_offset_t va, unsigned length, pmap_t pmap); +static void flush_mmu_tlb_tte_asid_async(vm_offset_t va, pmap_t pmap); +static void flush_mmu_tlb_full_asid_async(pmap_t pmap); +static pt_entry_t wimg_to_pte(unsigned int wimg); + +struct page_table_ops { + bool (*alloc_id)(pmap_t pmap); + void (*free_id)(pmap_t pmap); + void (*flush_tlb_region_async)(vm_offset_t va, unsigned length, pmap_t pmap); + void (*flush_tlb_tte_async)(vm_offset_t va, pmap_t pmap); + void (*flush_tlb_async)(pmap_t pmap); + pt_entry_t (*wimg_to_pte)(unsigned int wimg); +}; + +static const struct page_table_ops native_pt_ops = +{ + .alloc_id = alloc_asid, + .free_id = free_asid, + .flush_tlb_region_async = flush_mmu_tlb_region_asid_async, + .flush_tlb_tte_async = flush_mmu_tlb_tte_asid_async, + .flush_tlb_async = flush_mmu_tlb_full_asid_async, + .wimg_to_pte = wimg_to_pte, +}; + +#if (__ARM_VMSA__ > 7) +const struct page_table_level_info pmap_table_level_info_16k[] = +{ + [0] = { + .size = ARM_16K_TT_L0_SIZE, + .offmask = ARM_16K_TT_L0_OFFMASK, + .shift = ARM_16K_TT_L0_SHIFT, + .index_mask = ARM_16K_TT_L0_INDEX_MASK, + .valid_mask = ARM_TTE_VALID, + .type_mask = ARM_TTE_TYPE_MASK, + .type_block = ARM_TTE_TYPE_BLOCK + }, + [1] = { + .size = ARM_16K_TT_L1_SIZE, + .offmask = ARM_16K_TT_L1_OFFMASK, + .shift = ARM_16K_TT_L1_SHIFT, + .index_mask = ARM_16K_TT_L1_INDEX_MASK, + .valid_mask = ARM_TTE_VALID, + .type_mask = ARM_TTE_TYPE_MASK, + .type_block = ARM_TTE_TYPE_BLOCK + }, + [2] = { + .size = ARM_16K_TT_L2_SIZE, + .offmask = ARM_16K_TT_L2_OFFMASK, + .shift = ARM_16K_TT_L2_SHIFT, + .index_mask = ARM_16K_TT_L2_INDEX_MASK, + .valid_mask = ARM_TTE_VALID, + .type_mask = ARM_TTE_TYPE_MASK, + .type_block = ARM_TTE_TYPE_BLOCK + }, + [3] = { + .size = ARM_16K_TT_L3_SIZE, + .offmask = ARM_16K_TT_L3_OFFMASK, + .shift = ARM_16K_TT_L3_SHIFT, + .index_mask = ARM_16K_TT_L3_INDEX_MASK, + .valid_mask = ARM_PTE_TYPE_VALID, + .type_mask = ARM_PTE_TYPE_MASK, + .type_block = ARM_TTE_TYPE_L3BLOCK + } +}; + +const struct page_table_level_info pmap_table_level_info_4k[] = +{ + [0] = { + .size = ARM_4K_TT_L0_SIZE, + .offmask = ARM_4K_TT_L0_OFFMASK, + .shift = ARM_4K_TT_L0_SHIFT, + .index_mask = ARM_4K_TT_L0_INDEX_MASK, + .valid_mask = ARM_TTE_VALID, + .type_mask = ARM_TTE_TYPE_MASK, + .type_block = ARM_TTE_TYPE_BLOCK + }, + [1] = { + .size = ARM_4K_TT_L1_SIZE, + .offmask = ARM_4K_TT_L1_OFFMASK, + .shift = ARM_4K_TT_L1_SHIFT, + .index_mask = ARM_4K_TT_L1_INDEX_MASK, + .valid_mask = ARM_TTE_VALID, + .type_mask = ARM_TTE_TYPE_MASK, + .type_block = ARM_TTE_TYPE_BLOCK + }, + [2] = { + .size = ARM_4K_TT_L2_SIZE, + .offmask = ARM_4K_TT_L2_OFFMASK, + .shift = ARM_4K_TT_L2_SHIFT, + .index_mask = ARM_4K_TT_L2_INDEX_MASK, + .valid_mask = ARM_TTE_VALID, + .type_mask = ARM_TTE_TYPE_MASK, + .type_block = ARM_TTE_TYPE_BLOCK + }, + [3] = { + .size = ARM_4K_TT_L3_SIZE, + .offmask = ARM_4K_TT_L3_OFFMASK, + .shift = ARM_4K_TT_L3_SHIFT, + .index_mask = ARM_4K_TT_L3_INDEX_MASK, + .valid_mask = ARM_PTE_TYPE_VALID, + .type_mask = ARM_PTE_TYPE_MASK, + .type_block = ARM_TTE_TYPE_L3BLOCK + } +}; + +struct page_table_attr { + const struct page_table_level_info * const pta_level_info; + const struct page_table_ops * const pta_ops; + const uintptr_t ap_ro; + const uintptr_t ap_rw; + const uintptr_t ap_rona; + const uintptr_t ap_rwna; + const uintptr_t ap_xn; + const uintptr_t ap_x; + const unsigned int pta_root_level; + const unsigned int pta_max_level; +}; + +const struct page_table_attr pmap_pt_attr_4k = { + .pta_level_info = pmap_table_level_info_4k, + .pta_root_level = PMAP_TT_L1_LEVEL, + .pta_max_level = PMAP_TT_L3_LEVEL, + .pta_ops = &native_pt_ops, + .ap_ro = ARM_PTE_AP(AP_RORO), + .ap_rw = ARM_PTE_AP(AP_RWRW), + .ap_rona = ARM_PTE_AP(AP_RONA), + .ap_rwna = ARM_PTE_AP(AP_RWNA), + .ap_xn = ARM_PTE_PNX | ARM_PTE_NX, + .ap_x = ARM_PTE_PNX, +}; + +const struct page_table_attr pmap_pt_attr_16k = { + .pta_level_info = pmap_table_level_info_16k, + .pta_root_level = PMAP_TT_L1_LEVEL, + .pta_max_level = PMAP_TT_L3_LEVEL, + .pta_ops = &native_pt_ops, + .ap_ro = ARM_PTE_AP(AP_RORO), + .ap_rw = ARM_PTE_AP(AP_RWRW), + .ap_rona = ARM_PTE_AP(AP_RONA), + .ap_rwna = ARM_PTE_AP(AP_RWNA), + .ap_xn = ARM_PTE_PNX | ARM_PTE_NX, + .ap_x = ARM_PTE_PNX, +}; + +#if __ARM_16K_PG__ +const struct page_table_attr * const native_pt_attr = &pmap_pt_attr_16k; +#else /* !__ARM_16K_PG__ */ +const struct page_table_attr * const native_pt_attr = &pmap_pt_attr_4k; +#endif /* !__ARM_16K_PG__ */ + + +#else /* (__ARM_VMSA__ > 7) */ +/* + * We don't support pmap parameterization for VMSA7, so use an opaque + * page_table_attr structure. + */ +const struct page_table_attr * const native_pt_attr = NULL; +#endif /* (__ARM_VMSA__ > 7) */ + +typedef struct page_table_attr pt_attr_t; + +/* Macro for getting pmap attributes; not a function for const propagation. */ +#if ARM_PARAMETERIZED_PMAP +/* The page table attributes are linked to the pmap */ +#define pmap_get_pt_attr(pmap) ((pmap)->pmap_pt_attr) +#define pmap_get_pt_ops(pmap) ((pmap)->pmap_pt_attr->pta_ops) +#else /* !ARM_PARAMETERIZED_PMAP */ +/* The page table attributes are fixed (to allow for const propagation) */ +#define pmap_get_pt_attr(pmap) (native_pt_attr) +#define pmap_get_pt_ops(pmap) (&native_pt_ops) +#endif /* !ARM_PARAMETERIZED_PMAP */ + +#if (__ARM_VMSA__ > 7) +static inline uint64_t +pt_attr_ln_size(const pt_attr_t * const pt_attr, unsigned int level) +{ + return pt_attr->pta_level_info[level].size; +} + +__unused static inline uint64_t +pt_attr_ln_shift(const pt_attr_t * const pt_attr, unsigned int level) +{ + return pt_attr->pta_level_info[level].shift; +} + +__unused static inline uint64_t +pt_attr_ln_offmask(const pt_attr_t * const pt_attr, unsigned int level) +{ + return pt_attr->pta_level_info[level].offmask; +} + +static inline unsigned int +pt_attr_twig_level(const pt_attr_t * const pt_attr) +{ + return pt_attr->pta_max_level - 1; +} + +static inline unsigned int +pt_attr_root_level(const pt_attr_t * const pt_attr) +{ + return pt_attr->pta_root_level; +} + +static __unused inline uint64_t +pt_attr_leaf_size(const pt_attr_t * const pt_attr) +{ + return pt_attr->pta_level_info[pt_attr->pta_max_level].size; +} + +static __unused inline uint64_t +pt_attr_leaf_offmask(const pt_attr_t * const pt_attr) +{ + return pt_attr->pta_level_info[pt_attr->pta_max_level].offmask; +} + +static inline uint64_t +pt_attr_leaf_shift(const pt_attr_t * const pt_attr) +{ + return pt_attr->pta_level_info[pt_attr->pta_max_level].shift; +} + +static __unused inline uint64_t +pt_attr_leaf_index_mask(const pt_attr_t * const pt_attr) +{ + return pt_attr->pta_level_info[pt_attr->pta_max_level].index_mask; +} + +static inline uint64_t +pt_attr_twig_size(const pt_attr_t * const pt_attr) +{ + return pt_attr->pta_level_info[pt_attr->pta_max_level - 1].size; +} + +static inline uint64_t +pt_attr_twig_offmask(const pt_attr_t * const pt_attr) +{ + return pt_attr->pta_level_info[pt_attr->pta_max_level - 1].offmask; +} + +static inline uint64_t +pt_attr_twig_shift(const pt_attr_t * const pt_attr) +{ + return pt_attr->pta_level_info[pt_attr->pta_max_level - 1].shift; +} + +static __unused inline uint64_t +pt_attr_twig_index_mask(const pt_attr_t * const pt_attr) +{ + return pt_attr->pta_level_info[pt_attr->pta_max_level - 1].index_mask; +} + +static inline uint64_t +pt_attr_leaf_table_size(const pt_attr_t * const pt_attr) +{ + return pt_attr_twig_size(pt_attr); +} + +static inline uint64_t +pt_attr_leaf_table_offmask(const pt_attr_t * const pt_attr) +{ + return pt_attr_twig_offmask(pt_attr); +} + +static inline uintptr_t +pt_attr_leaf_rw(const pt_attr_t * const pt_attr) +{ + return pt_attr->ap_rw; +} + +static inline uintptr_t +pt_attr_leaf_ro(const pt_attr_t * const pt_attr) +{ + return pt_attr->ap_ro; +} + +static inline uintptr_t +pt_attr_leaf_rona(const pt_attr_t * const pt_attr) +{ + return pt_attr->ap_rona; +} + +static inline uintptr_t +pt_attr_leaf_rwna(const pt_attr_t * const pt_attr) +{ + return pt_attr->ap_rwna; +} + +static inline uintptr_t +pt_attr_leaf_xn(const pt_attr_t * const pt_attr) +{ + return pt_attr->ap_xn; +} + +static inline uintptr_t +pt_attr_leaf_x(const pt_attr_t * const pt_attr) +{ + return pt_attr->ap_x; +} + +#else /* (__ARM_VMSA__ > 7) */ + +static inline unsigned int +pt_attr_twig_level(__unused const pt_attr_t * const pt_attr) +{ + return PMAP_TT_L1_LEVEL; +} + +static inline uint64_t +pt_attr_twig_size(__unused const pt_attr_t * const pt_attr) +{ + return ARM_TT_TWIG_SIZE; +} + +static inline uint64_t +pt_attr_twig_offmask(__unused const pt_attr_t * const pt_attr) +{ + return ARM_TT_TWIG_OFFMASK; +} + +static inline uint64_t +pt_attr_twig_shift(__unused const pt_attr_t * const pt_attr) +{ + return ARM_TT_TWIG_SHIFT; +} + +static __unused inline uint64_t +pt_attr_twig_index_mask(__unused const pt_attr_t * const pt_attr) +{ + return ARM_TT_TWIG_INDEX_MASK; +} + +__unused static inline uint64_t +pt_attr_leaf_size(__unused const pt_attr_t * const pt_attr) +{ + return ARM_TT_LEAF_SIZE; +} + +__unused static inline uint64_t +pt_attr_leaf_offmask(__unused const pt_attr_t * const pt_attr) +{ + return ARM_TT_LEAF_OFFMASK; +} + +static inline uint64_t +pt_attr_leaf_shift(__unused const pt_attr_t * const pt_attr) +{ + return ARM_TT_LEAF_SHIFT; +} + +static __unused inline uint64_t +pt_attr_leaf_index_mask(__unused const pt_attr_t * const pt_attr) +{ + return ARM_TT_LEAF_INDEX_MASK; +} + +static inline uint64_t +pt_attr_leaf_table_size(__unused const pt_attr_t * const pt_attr) +{ + return ARM_TT_L1_PT_SIZE; +} + +static inline uint64_t +pt_attr_leaf_table_offmask(__unused const pt_attr_t * const pt_attr) +{ + return ARM_TT_L1_PT_OFFMASK; +} + +static inline uintptr_t +pt_attr_leaf_rw(__unused const pt_attr_t * const pt_attr) +{ + return ARM_PTE_AP(AP_RWRW); +} + +static inline uintptr_t +pt_attr_leaf_ro(__unused const pt_attr_t * const pt_attr) +{ + return ARM_PTE_AP(AP_RORO); +} + +static inline uintptr_t +pt_attr_leaf_rona(__unused const pt_attr_t * const pt_attr) +{ + return ARM_PTE_AP(AP_RONA); +} + +static inline uintptr_t +pt_attr_leaf_rwna(__unused const pt_attr_t * const pt_attr) +{ + return ARM_PTE_AP(AP_RWNA); +} + +static inline uintptr_t +pt_attr_leaf_xn(__unused const pt_attr_t * const pt_attr) +{ + return ARM_PTE_NX; +} + +#endif /* (__ARM_VMSA__ > 7) */ + +static inline void +pmap_sync_tlb(bool strong __unused) +{ + sync_tlb_flush(); +} #if MACH_ASSERT int vm_footprint_suspend_allowed = 1; @@ -128,11 +549,11 @@ int panic_on_unsigned_execute = 0; /* Virtual memory region for early allocation */ #if (__ARM_VMSA__ == 7) -#define VREGION1_START (VM_HIGH_KERNEL_WINDOW & ~ARM_TT_L1_PT_OFFMASK) +#define VREGION1_HIGH_WINDOW (0) #else #define VREGION1_HIGH_WINDOW (PE_EARLY_BOOT_VA) -#define VREGION1_START ((VM_MAX_KERNEL_ADDRESS & CPUWINDOWS_BASE_MASK) - VREGION1_HIGH_WINDOW) #endif +#define VREGION1_START ((VM_MAX_KERNEL_ADDRESS & CPUWINDOWS_BASE_MASK) - VREGION1_HIGH_WINDOW) #define VREGION1_SIZE (trunc_page(VM_MAX_KERNEL_ADDRESS - (VREGION1_START))) extern unsigned int not_in_kdp; @@ -146,7 +567,7 @@ extern vm_offset_t virtual_space_start; /* Next available kernel VA */ extern vm_offset_t virtual_space_end; /* End of kernel address space */ extern vm_offset_t static_memory_end; -extern int hard_maxproc; +extern int maxproc, hard_maxproc; #if (__ARM_VMSA__ > 7) /* The number of address bits one TTBR can cover. */ @@ -179,14 +600,15 @@ vm_object_t pmap_object = &pmap_object_store; static struct zone *pmap_zone; /* zone of pmap structures */ -decl_simple_lock_data(, pmaps_lock MARK_AS_PMAP_DATA) +decl_simple_lock_data(, pmaps_lock MARK_AS_PMAP_DATA); +decl_simple_lock_data(, tt1_lock MARK_AS_PMAP_DATA); unsigned int pmap_stamp MARK_AS_PMAP_DATA; queue_head_t map_pmap_list MARK_AS_PMAP_DATA; -decl_simple_lock_data(, pt_pages_lock MARK_AS_PMAP_DATA) +decl_simple_lock_data(, pt_pages_lock MARK_AS_PMAP_DATA); queue_head_t pt_page_list MARK_AS_PMAP_DATA; /* pt page ptd entries list */ -decl_simple_lock_data(, pmap_pages_lock MARK_AS_PMAP_DATA) +decl_simple_lock_data(, pmap_pages_lock MARK_AS_PMAP_DATA); typedef struct page_free_entry { struct page_free_entry *next; @@ -241,6 +663,7 @@ SECURITY_READ_ONLY_LATE(pmap_paddr_t) cpu_ttep = 0; /* set b int nx_enabled = 1; /* enable no-execute protection */ int allow_data_exec = 0; /* No apps may execute data */ int allow_stack_exec = 0; /* No apps may execute from the stack */ +unsigned long pmap_asid_flushes MARK_AS_PMAP_DATA = 0; #else /* DEVELOPMENT || DEBUG */ const int nx_enabled = 1; /* enable no-execute protection */ const int allow_data_exec = 0; /* No apps may execute data */ @@ -253,15 +676,16 @@ const int allow_stack_exec = 0; /* No apps may execute f typedef struct pv_entry { struct pv_entry *pve_next; /* next alias */ pt_entry_t *pve_ptep; /* page table entry */ +} #if __arm__ && (__BIGGEST_ALIGNMENT__ > 4) /* For the newer ARMv7k ABI where 64-bit types are 64-bit aligned, but pointers * are 32-bit: * Since pt_desc is 64-bit aligned and we cast often from pv_entry to * pt_desc. */ -} __attribute__ ((aligned(8))) pv_entry_t; +__attribute__ ((aligned(8))) pv_entry_t; #else -} pv_entry_t; +pv_entry_t; #endif #define PV_ENTRY_NULL ((pv_entry_t *) 0) @@ -298,10 +722,10 @@ SECURITY_READ_ONLY_LATE(pv_entry_t * *) pv_head_table; /* array of pv pv_entry_t *pv_free_list MARK_AS_PMAP_DATA; pv_entry_t *pv_kern_free_list MARK_AS_PMAP_DATA; -decl_simple_lock_data(, pv_free_list_lock MARK_AS_PMAP_DATA) -decl_simple_lock_data(, pv_kern_free_list_lock MARK_AS_PMAP_DATA) +decl_simple_lock_data(, pv_free_list_lock MARK_AS_PMAP_DATA); +decl_simple_lock_data(, pv_kern_free_list_lock MARK_AS_PMAP_DATA); -decl_simple_lock_data(, phys_backup_lock) +decl_simple_lock_data(, phys_backup_lock); /* * pt_desc - structure to keep info on page assigned to page tables @@ -321,6 +745,14 @@ decl_simple_lock_data(, phys_backup_lock) typedef struct pt_desc { queue_chain_t pt_page; + union { + struct pmap *pmap; + }; + /* + * Locate this struct towards the end of the pt_desc; our long term + * goal is to make this a VLA to avoid wasting memory if we don't need + * multiple entries. + */ struct { /* * For non-leaf pagetables, should always be PT_DESC_REFCOUNT @@ -334,13 +766,8 @@ typedef struct pt_desc { * For IOMMU pages, may optionally reflect a driver-defined refcount (IOMMU operations are implicitly wired) */ unsigned short wiredcnt; - } pt_cnt[PT_INDEX_MAX]; - union { - struct pmap *pmap; - }; - struct { vm_offset_t va; - } pt_map[PT_INDEX_MAX]; + } ptd_info[PT_INDEX_MAX]; } pt_desc_t; @@ -351,7 +778,7 @@ SECURITY_READ_ONLY_LATE(pt_desc_t *) ptd_root_table; pt_desc_t *ptd_free_list MARK_AS_PMAP_DATA = PTD_ENTRY_NULL; SECURITY_READ_ONLY_LATE(boolean_t) ptd_preboot = TRUE; unsigned int ptd_free_count MARK_AS_PMAP_DATA = 0; -decl_simple_lock_data(, ptd_free_list_lock MARK_AS_PMAP_DATA) +decl_simple_lock_data(, ptd_free_list_lock MARK_AS_PMAP_DATA); /* * physical page attribute @@ -377,8 +804,10 @@ SECURITY_READ_ONLY_LATE(pp_attr_t*) pp_attr_table; typedef struct pmap_io_range { uint64_t addr; - uint32_t len; - uint32_t wimg; // treated as pp_attr_t + uint64_t len; + #define PMAP_IO_RANGE_STRONG_SYNC (1UL << 31) // Strong DSB required for pages in this range + uint32_t wimg; // lower 16 bits treated as pp_attr_t, upper 16 bits contain additional mapping flags + uint32_t signature; // 4CC } __attribute__((packed)) pmap_io_range_t; SECURITY_READ_ONLY_LATE(pmap_io_range_t*) io_attr_table; @@ -386,8 +815,6 @@ SECURITY_READ_ONLY_LATE(pmap_io_range_t*) io_attr_table; SECURITY_READ_ONLY_LATE(pmap_paddr_t) vm_first_phys = (pmap_paddr_t) 0; SECURITY_READ_ONLY_LATE(pmap_paddr_t) vm_last_phys = (pmap_paddr_t) 0; -SECURITY_READ_ONLY_LATE(pmap_paddr_t) io_rgn_start = 0; -SECURITY_READ_ONLY_LATE(pmap_paddr_t) io_rgn_end = 0; SECURITY_READ_ONLY_LATE(unsigned int) num_io_rgns = 0; SECURITY_READ_ONLY_LATE(boolean_t) pmap_initialized = FALSE; /* Has pmap_init completed? */ @@ -400,8 +827,13 @@ SECURITY_READ_ONLY_LATE(vm_map_offset_t) arm_pmap_max_offset_default = 0x0; SECURITY_READ_ONLY_LATE(vm_map_offset_t) arm64_pmap_max_offset_default = 0x0; #endif -/* free address spaces (1 means free) */ -static uint32_t asid_bitmap[MAX_ASID / (sizeof(uint32_t) * NBBY)] MARK_AS_PMAP_DATA; +#define PMAP_MAX_SW_ASID ((MAX_ASID + MAX_HW_ASID - 1) / MAX_HW_ASID) +_Static_assert(PMAP_MAX_SW_ASID <= (UINT8_MAX + 1), + "VASID bits can't be represented by an 8-bit integer"); + +decl_simple_lock_data(, asid_lock MARK_AS_PMAP_DATA); +static bitmap_t asid_bitmap[BITMAP_LEN(MAX_ASID)] MARK_AS_PMAP_DATA; + #if (__ARM_VMSA__ > 7) SECURITY_READ_ONLY_LATE(pmap_t) sharedpage_pmap; @@ -425,7 +857,7 @@ SECURITY_READ_ONLY_LATE(pmap_t) sharedpage_pmap; #define pte_set_wired(ptep, wired) \ do { \ SInt16 *ptd_wiredcnt_ptr; \ - ptd_wiredcnt_ptr = (SInt16 *)&(ptep_get_ptd(ptep)->pt_cnt[ARM_PT_DESC_INDEX(ptep)].wiredcnt); \ + ptd_wiredcnt_ptr = (SInt16 *)&(ptep_get_ptd(ptep)->ptd_info[ARM_PT_DESC_INDEX(ptep)].wiredcnt); \ if (wired) { \ *ptep |= ARM_PTE_WIRED; \ OSAddAtomic16(1, ptd_wiredcnt_ptr); \ @@ -473,69 +905,52 @@ SECURITY_READ_ONLY_LATE(pmap_t) sharedpage_pmap; /* PTEP Define Macros */ -#if (__ARM_VMSA__ == 7) +/* mask for page descriptor index */ +#define ARM_TT_PT_INDEX_MASK ARM_PGMASK +#if (__ARM_VMSA__ == 7) #define ARM_PT_DESC_INDEX_MASK 0x00000 #define ARM_PT_DESC_INDEX_SHIFT 0 -/* - * mask for page descriptor index: 4MB per page table - */ -#define ARM_TT_PT_INDEX_MASK 0xfffU /* mask for page descriptor index: 4MB per page table */ - /* * Shift value used for reconstructing the virtual address for a PTE. */ #define ARM_TT_PT_ADDR_SHIFT (10U) #define ptep_get_va(ptep) \ - ((((pt_desc_t *) (pvh_list(pai_to_pvh(pa_index(ml_static_vtop((((vm_offset_t)(ptep) & ~0xFFF))))))))->pt_map[ARM_PT_DESC_INDEX(ptep)].va)+ ((((unsigned)(ptep)) & ARM_TT_PT_INDEX_MASK)<ptd_info[ARM_PT_DESC_INDEX(ptep)].va)+ ((((unsigned)(ptep)) & ARM_TT_PT_INDEX_MASK)<pmap)) + ((((pt_desc_t *) (pvh_list(pai_to_pvh(pa_index(ml_static_vtop((((vm_offset_t)(ptep) & ~ARM_PGMASK))))))))->pmap)) #else #if (ARM_PGSHIFT == 12) #define ARM_PT_DESC_INDEX_MASK ((PAGE_SHIFT_CONST == ARM_PGSHIFT )? 0x00000ULL : 0x03000ULL) #define ARM_PT_DESC_INDEX_SHIFT ((PAGE_SHIFT_CONST == ARM_PGSHIFT )? 0 : 12) -/* - * mask for page descriptor index: 2MB per page table - */ -#define ARM_TT_PT_INDEX_MASK (0x0fffULL) /* * Shift value used for reconstructing the virtual address for a PTE. */ #define ARM_TT_PT_ADDR_SHIFT (9ULL) - -/* TODO: Give this a better name/documentation than "other" */ -#define ARM_TT_PT_OTHER_MASK (0x0fffULL) - #else #define ARM_PT_DESC_INDEX_MASK (0x00000) #define ARM_PT_DESC_INDEX_SHIFT (0) -/* - * mask for page descriptor index: 32MB per page table - */ -#define ARM_TT_PT_INDEX_MASK (0x3fffULL) /* * Shift value used for reconstructing the virtual address for a PTE. */ #define ARM_TT_PT_ADDR_SHIFT (11ULL) - -/* TODO: Give this a better name/documentation than "other" */ -#define ARM_TT_PT_OTHER_MASK (0x3fffULL) #endif + #define ARM_PT_DESC_INDEX(ptep) \ (((unsigned)(ptep) & ARM_PT_DESC_INDEX_MASK) >> ARM_PT_DESC_INDEX_SHIFT) #define ptep_get_va(ptep) \ - ((((pt_desc_t *) (pvh_list(pai_to_pvh(pa_index(ml_static_vtop((((vm_offset_t)(ptep) & ~ARM_TT_PT_OTHER_MASK))))))))->pt_map[ARM_PT_DESC_INDEX(ptep)].va)+ ((((unsigned)(ptep)) & ARM_TT_PT_INDEX_MASK)<ptd_info[ARM_PT_DESC_INDEX(ptep)].va)+ ((((unsigned)(ptep)) & ARM_TT_PT_INDEX_MASK)<pmap)) + ((((pt_desc_t *) (pvh_list(pai_to_pvh(pa_index(ml_static_vtop((((vm_offset_t)(ptep) & ~ARM_PGMASK))))))))->pmap)) #endif @@ -558,8 +973,9 @@ SECURITY_READ_ONLY_LATE(pmap_t) sharedpage_pmap; #ifdef __arm64__ -#define PVH_FLAG_IOMMU 0x4UL -#define PVH_FLAG_IOMMU_TABLE (1ULL << 63) +/* All flags listed below are stored in the PV head pointer unless otherwise noted */ +#define PVH_FLAG_IOMMU 0x4UL /* Stored in each PTE, or in PV head for single-PTE PV heads */ +#define PVH_FLAG_IOMMU_TABLE (1ULL << 63) /* Stored in each PTE, or in PV head for single-PTE PV heads */ #define PVH_FLAG_CPU (1ULL << 62) #define PVH_LOCK_BIT 61 #define PVH_FLAG_LOCK (1ULL << PVH_LOCK_BIT) @@ -591,15 +1007,15 @@ SECURITY_READ_ONLY_LATE(pmap_t) sharedpage_pmap; #define pvh_set_flags(h, f) \ do { \ - __c11_atomic_store((_Atomic vm_offset_t *)(h), (*(vm_offset_t *)(h) & ~PVH_HIGH_FLAGS) | (f), \ - memory_order_relaxed); \ + os_atomic_store((vm_offset_t *)(h), (*(vm_offset_t *)(h) & ~PVH_HIGH_FLAGS) | (f), \ + relaxed); \ } while (0) #define pvh_update_head(h, e, t) \ do { \ assert(*(vm_offset_t *)(h) & PVH_FLAG_LOCK); \ - __c11_atomic_store((_Atomic vm_offset_t *)(h), (vm_offset_t)(e) | (t) | PVH_FLAG_LOCK, \ - memory_order_relaxed); \ + os_atomic_store((vm_offset_t *)(h), (vm_offset_t)(e) | (t) | PVH_FLAG_LOCK, \ + relaxed); \ } while (0) #define pvh_update_head_unlocked(h, e, t) \ @@ -740,25 +1156,34 @@ SECURITY_READ_ONLY_LATE(pmap_t) sharedpage_pmap; #if (__ARM_VMSA__ == 7) -#define tte_index(pmap, addr) \ +#define tte_index(pmap, pt_attr, addr) \ ttenum((addr)) +#define pte_index(pmap, pt_attr, addr) \ + ptenum((addr)) + #else -#define tt0_index(pmap, addr) \ - (((addr) & ARM_TT_L0_INDEX_MASK) >> ARM_TT_L0_SHIFT) +#define ttn_index(pmap, pt_attr, addr, pt_level) \ + (((addr) & (pt_attr)->pta_level_info[(pt_level)].index_mask) >> (pt_attr)->pta_level_info[(pt_level)].shift) + +#define tt0_index(pmap, pt_attr, addr) \ + ttn_index((pmap), (pt_attr), (addr), PMAP_TT_L0_LEVEL) -#define tt1_index(pmap, addr) \ - (((addr) & ARM_TT_L1_INDEX_MASK) >> ARM_TT_L1_SHIFT) +#define tt1_index(pmap, pt_attr, addr) \ + ttn_index((pmap), (pt_attr), (addr), PMAP_TT_L1_LEVEL) -#define tt2_index(pmap, addr) \ - (((addr) & ARM_TT_L2_INDEX_MASK) >> ARM_TT_L2_SHIFT) +#define tt2_index(pmap, pt_attr, addr) \ + ttn_index((pmap), (pt_attr), (addr), PMAP_TT_L2_LEVEL) -#define tt3_index(pmap, addr) \ - (((addr) & ARM_TT_L3_INDEX_MASK) >> ARM_TT_L3_SHIFT) +#define tt3_index(pmap, pt_attr, addr) \ + ttn_index((pmap), (pt_attr), (addr), PMAP_TT_L3_LEVEL) -#define tte_index(pmap, addr) \ - (((addr) & ARM_TT_L2_INDEX_MASK) >> ARM_TT_L2_SHIFT) +#define tte_index(pmap, pt_attr, addr) \ + tt2_index((pmap), (pt_attr), (addr)) + +#define pte_index(pmap, pt_attr, addr) \ + tt3_index((pmap), (pt_attr), (addr)) #endif @@ -810,13 +1235,11 @@ lck_grp_t pmap_lck_grp; pmap_unlock_bit((uint32_t*)(&pv_head_table[index]) + PVH_LOCK_WORD, PVH_LOCK_BIT - (PVH_LOCK_WORD * 32)); \ } while (0) -#define PMAP_UPDATE_TLBS(pmap, s, e) { \ - flush_mmu_tlb_region_asid_async(s, (unsigned)(e - s), pmap); \ - sync_tlb_flush(); \ +#define PMAP_UPDATE_TLBS(pmap, s, e, strong) { \ + pmap_get_pt_ops(pmap)->flush_tlb_region_async(s, (unsigned)(e - s), pmap); \ + pmap_sync_tlb(strong); \ } -#ifdef __ARM_L1_PTW__ - #define FLUSH_PTE_RANGE(spte, epte) \ __builtin_arm_dmb(DMB_ISH); @@ -829,32 +1252,15 @@ lck_grp_t pmap_lck_grp; #define FLUSH_PTE_RANGE_STRONG(spte, epte) \ __builtin_arm_dsb(DSB_ISH); -#else /* __ARM_L1_PTW */ - -#define FLUSH_PTE_RANGE(spte, epte) \ - CleanPoU_DcacheRegion((vm_offset_t)spte, \ - (vm_offset_t)epte - (vm_offset_t)spte); - -#define FLUSH_PTE(pte_p) \ - __unreachable_ok_push \ - if (TEST_PAGE_RATIO_4) \ - FLUSH_PTE_RANGE((pte_p), (pte_p) + 4); \ - else \ - FLUSH_PTE_RANGE((pte_p), (pte_p) + 1); \ - CleanPoU_DcacheRegion((vm_offset_t)pte_p, sizeof(pt_entry_t)); \ - __unreachable_ok_pop - -#define FLUSH_PTE_STRONG(pte_p) FLUSH_PTE(pte_p) - -#define FLUSH_PTE_RANGE_STRONG(spte, epte) FLUSH_PTE_RANGE(spte, epte) - -#endif /* !defined(__ARM_L1_PTW) */ - #define WRITE_PTE_FAST(pte_p, pte_entry) \ __unreachable_ok_push \ if (TEST_PAGE_RATIO_4) { \ - if (((unsigned)(pte_p)) & 0x1f) \ - panic("WRITE_PTE\n"); \ + if (((unsigned)(pte_p)) & 0x1f) { \ + panic("%s: WRITE_PTE_FAST is unaligned, " \ + "pte_p=%p, pte_entry=%p", \ + __FUNCTION__, \ + pte_p, (void*)pte_entry); \ + } \ if (((pte_entry) & ~ARM_PTE_COMPRESSED_MASK) == ARM_PTE_EMPTY) { \ *(pte_p) = (pte_entry); \ *((pte_p)+1) = (pte_entry); \ @@ -959,9 +1365,6 @@ ppnum_t pmap_vtophys( void pmap_switch_user_ttb( pmap_t pmap); -static void flush_mmu_tlb_region_asid_async( - vm_offset_t va, unsigned length, pmap_t pmap); - static kern_return_t pmap_expand( pmap_t, vm_map_address_t, unsigned int options, unsigned int level); @@ -969,7 +1372,7 @@ static int pmap_remove_range( pmap_t, vm_map_address_t, pt_entry_t *, pt_entry_t *, uint32_t *); static int pmap_remove_range_options( - pmap_t, vm_map_address_t, pt_entry_t *, pt_entry_t *, uint32_t *, int); + pmap_t, vm_map_address_t, pt_entry_t *, pt_entry_t *, uint32_t *, bool *, int); static tt_entry_t *pmap_tt1_allocate( pmap_t, vm_size_t, unsigned int); @@ -989,15 +1392,6 @@ static kern_return_t pmap_tt_allocate( static void pmap_tte_deallocate( pmap_t, tt_entry_t *, unsigned int); -#define PMAP_TT_L1_LEVEL 0x1 -#define PMAP_TT_L2_LEVEL 0x2 -#define PMAP_TT_L3_LEVEL 0x3 -#if (__ARM_VMSA__ == 7) -#define PMAP_TT_MAX_LEVEL PMAP_TT_L2_LEVEL -#else -#define PMAP_TT_MAX_LEVEL PMAP_TT_L3_LEVEL -#endif - #ifdef __ARM64_PMAP_SUBPAGE_L1__ #if (__ARM_VMSA__ <= 7) #error This is not supported for old-style page tables @@ -1024,6 +1418,9 @@ static inline tt_entry_t *pmap_tt2e( static inline pt_entry_t *pmap_tt3e( pmap_t, vm_map_address_t); +static inline pt_entry_t *pmap_ttne( + pmap_t, unsigned int, vm_map_address_t); + static void pmap_unmap_sharedpage( pmap_t pmap); @@ -1064,19 +1461,20 @@ static void pmap_pin_kernel_pages(vm_offset_t kva, size_t nbytes); static void pmap_unpin_kernel_pages(vm_offset_t kva, size_t nbytes); - static void pmap_trim_self(pmap_t pmap); static void pmap_trim_subord(pmap_t subord); + #define PMAP_SUPPORT_PROTOTYPES(__return_type, __function_name, __function_args, __function_index) \ - static __return_type __function_name##_internal __function_args; + static __return_type __function_name##_internal __function_args PMAP_SUPPORT_PROTOTYPES( kern_return_t, arm_fast_fault, (pmap_t pmap, vm_map_address_t va, vm_prot_t fault_type, - boolean_t from_user), ARM_FAST_FAULT_INDEX); + bool was_af_fault, + bool from_user), ARM_FAST_FAULT_INDEX); PMAP_SUPPORT_PROTOTYPES( @@ -1110,9 +1508,9 @@ PMAP_SUPPORT_PROTOTYPES( PMAP_SUPPORT_PROTOTYPES( pmap_t, - pmap_create, (ledger_t ledger, + pmap_create_options, (ledger_t ledger, vm_map_size_t size, - boolean_t is_64bit), PMAP_CREATE_INDEX); + unsigned int flags), PMAP_CREATE_INDEX); PMAP_SUPPORT_PROTOTYPES( void, @@ -1287,6 +1685,7 @@ PMAP_SUPPORT_PROTOTYPES( + void pmap_footprint_suspend(vm_map_t map, boolean_t suspend); PMAP_SUPPORT_PROTOTYPES( @@ -1389,6 +1788,7 @@ pmap_get_cpu_data(void) } + /* TODO */ pmap_paddr_t pmap_pages_reclaim( @@ -1398,7 +1798,6 @@ pmap_pages_reclaim( unsigned i; pt_desc_t *ptdp; - /* * pmap_pages_reclaim() is returning a page by freeing an active pt page. * To be eligible, a pt page is assigned to a user pmap. It doesn't have any wired pte @@ -1442,13 +1841,13 @@ pmap_pages_reclaim( unsigned wiredcnt_acc = 0; for (i = 0; i < PT_INDEX_MAX; i++) { - if (ptdp->pt_cnt[i].refcnt == PT_DESC_REFCOUNT) { + if (ptdp->ptd_info[i].refcnt == PT_DESC_REFCOUNT) { /* Do not attempt to free a page that contains an L2 table */ refcnt_acc = 0; break; } - refcnt_acc += ptdp->pt_cnt[i].refcnt; - wiredcnt_acc += ptdp->pt_cnt[i].wiredcnt; + refcnt_acc += ptdp->ptd_info[i].refcnt; + wiredcnt_acc += ptdp->ptd_info[i].wiredcnt; } if ((wiredcnt_acc == 0) && (refcnt_acc != 0)) { found_page = TRUE; @@ -1462,21 +1861,25 @@ pmap_pages_reclaim( ptdp = (pt_desc_t *)queue_next((queue_t)ptdp); } if (!found_page) { - panic("pmap_pages_reclaim(): No eligible page in pt_page_list\n"); + panic("%s: No eligible page in pt_page_list", __FUNCTION__); } else { - int remove_count = 0; + int remove_count = 0; + bool need_strong_sync = false; vm_map_address_t va; - pmap_t pmap; - pt_entry_t *bpte, *epte; - pt_entry_t *pte_p; - tt_entry_t *tte_p; - uint32_t rmv_spte = 0; + pmap_t pmap; + pt_entry_t *bpte, *epte; + pt_entry_t *pte_p; + tt_entry_t *tte_p; + uint32_t rmv_spte = 0; pmap_simple_unlock(&pt_pages_lock); pmap = ptdp->pmap; PMAP_ASSERT_LOCKED(pmap); // pmap lock should be held from loop above + + __unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap); + for (i = 0; i < PT_INDEX_MAX; i++) { - va = ptdp->pt_map[i].va; + va = ptdp->ptd_info[i].va; /* If the VA is bogus, this may represent an unallocated region * or one which is in transition (already being freed or expanded). @@ -1488,15 +1891,9 @@ pmap_pages_reclaim( tte_p = pmap_tte(pmap, va); if ((tte_p != (tt_entry_t *) NULL) && ((*tte_p & ARM_TTE_TYPE_MASK) == ARM_TTE_TYPE_TABLE)) { -#if (__ARM_VMSA__ == 7) - pte_p = (pt_entry_t *) ttetokv(*tte_p); - bpte = &pte_p[ptenum(va)]; - epte = bpte + PAGE_SIZE / sizeof(pt_entry_t); -#else pte_p = (pt_entry_t *) ttetokv(*tte_p); - bpte = &pte_p[tt3_index(pmap, va)]; + bpte = &pte_p[pte_index(pmap, pt_attr, va)]; epte = bpte + PAGE_SIZE / sizeof(pt_entry_t); -#endif /* * Use PMAP_OPTIONS_REMOVE to clear any * "compressed" markers and update the @@ -1510,33 +1907,23 @@ pmap_pages_reclaim( */ remove_count += pmap_remove_range_options( pmap, va, bpte, epte, - &rmv_spte, PMAP_OPTIONS_REMOVE); - if (ptdp->pt_cnt[ARM_PT_DESC_INDEX(pte_p)].refcnt != 0) { - panic("pmap_pages_reclaim(): ptdp %p, count %d\n", ptdp, ptdp->pt_cnt[ARM_PT_DESC_INDEX(pte_p)].refcnt); + &rmv_spte, &need_strong_sync, PMAP_OPTIONS_REMOVE); + if (ptdp->ptd_info[ARM_PT_DESC_INDEX(pte_p)].refcnt != 0) { + panic("%s: ptdp %p, count %d", __FUNCTION__, ptdp, ptdp->ptd_info[ARM_PT_DESC_INDEX(pte_p)].refcnt); } -#if (__ARM_VMSA__ == 7) - pmap_tte_deallocate(pmap, tte_p, PMAP_TT_L1_LEVEL); - flush_mmu_tlb_entry_async((va & ~ARM_TT_L1_PT_OFFMASK) | (pmap->asid & 0xff)); - flush_mmu_tlb_entry_async(((va & ~ARM_TT_L1_PT_OFFMASK) + ARM_TT_L1_SIZE) | (pmap->asid & 0xff)); - flush_mmu_tlb_entry_async(((va & ~ARM_TT_L1_PT_OFFMASK) + 2 * ARM_TT_L1_SIZE) | (pmap->asid & 0xff)); - flush_mmu_tlb_entry_async(((va & ~ARM_TT_L1_PT_OFFMASK) + 3 * ARM_TT_L1_SIZE) | (pmap->asid & 0xff)); -#else - pmap_tte_deallocate(pmap, tte_p, PMAP_TT_L2_LEVEL); - flush_mmu_tlb_entry_async(tlbi_addr(va & ~ARM_TT_L2_OFFMASK) | tlbi_asid(pmap->asid)); -#endif + + pmap_tte_deallocate(pmap, tte_p, PMAP_TT_TWIG_LEVEL); if (remove_count > 0) { -#if (__ARM_VMSA__ == 7) - flush_mmu_tlb_region_asid_async(va, 4 * ARM_TT_L1_SIZE, pmap); -#else - flush_mmu_tlb_region_asid_async(va, ARM_TT_L2_SIZE, pmap); -#endif + pmap_get_pt_ops(pmap)->flush_tlb_region_async(va, (unsigned int)pt_attr_leaf_table_size(pt_attr), pmap); + } else { + pmap_get_pt_ops(pmap)->flush_tlb_tte_async(va, pmap); } } } - sync_tlb_flush(); // Undo the lock we grabbed when we found ptdp above PMAP_UNLOCK(pmap); + pmap_sync_tlb(need_strong_sync); } pmap_simple_lock(&pmap_pages_lock); } @@ -1671,76 +2058,56 @@ pmap_tt_ledger_debit( } } -static unsigned int -alloc_asid( - void) +static bool +alloc_asid(pmap_t pmap) { - unsigned int asid_bitmap_index; - - pmap_simple_lock(&pmaps_lock); - for (asid_bitmap_index = 0; asid_bitmap_index < (MAX_ASID / (sizeof(uint32_t) * NBBY)); asid_bitmap_index++) { - unsigned int temp = ffs(asid_bitmap[asid_bitmap_index]); - if (temp > 0) { - temp -= 1; - asid_bitmap[asid_bitmap_index] &= ~(1 << temp); -#if __ARM_KERNEL_PROTECT__ - /* - * We need two ASIDs: n and (n | 1). n is used for EL0, - * (n | 1) for EL1. - */ - unsigned int temp2 = temp | 1; - assert(temp2 < MAX_ASID); - assert(temp2 < 32); - assert(temp2 != temp); - assert(asid_bitmap[asid_bitmap_index] & (1 << temp2)); - - /* Grab the second ASID. */ - asid_bitmap[asid_bitmap_index] &= ~(1 << temp2); -#endif /* __ARM_KERNEL_PROTECT__ */ - pmap_simple_unlock(&pmaps_lock); - - /* - * We should never vend out physical ASID 0 through this - * method, as it belongs to the kernel. - */ - assert(((asid_bitmap_index * sizeof(uint32_t) * NBBY + temp) % ARM_MAX_ASID) != 0); + int vasid; + uint16_t hw_asid; -#if __ARM_KERNEL_PROTECT__ - /* Or the kernel EL1 ASID. */ - assert(((asid_bitmap_index * sizeof(uint32_t) * NBBY + temp) % ARM_MAX_ASID) != 1); -#endif /* __ARM_KERNEL_PROTECT__ */ - - return asid_bitmap_index * sizeof(uint32_t) * NBBY + temp; - } + pmap_simple_lock(&asid_lock); + vasid = bitmap_first(&asid_bitmap[0], MAX_ASID); + if (vasid < 0) { + pmap_simple_unlock(&asid_lock); + return false; } - pmap_simple_unlock(&pmaps_lock); - /* - * ToDo: Add code to deal with pmap with no asid panic for now. Not - * an issue with the small config process hard limit - */ - panic("alloc_asid(): out of ASID number"); - return MAX_ASID; + assert(vasid < MAX_ASID); + bitmap_clear(&asid_bitmap[0], (unsigned int)vasid); + pmap_simple_unlock(&asid_lock); + // bitmap_first() returns highest-order bits first, but a 0-based scheme works + // slightly better with the collision detection scheme used by pmap_switch_internal(). + vasid = MAX_ASID - 1 - vasid; + hw_asid = vasid % MAX_HW_ASID; + pmap->sw_asid = vasid / MAX_HW_ASID; + hw_asid += 1; // Account for ASID 0, which is reserved for the kernel +#if __ARM_KERNEL_PROTECT__ + hw_asid <<= 1; // We're really handing out 2 hardware ASIDs, one for EL0 and one for EL1 access +#endif + pmap->hw_asid = hw_asid; + return true; } static void -free_asid( - int asid) +free_asid(pmap_t pmap) { - /* Don't free up any alias of physical ASID 0. */ - assert((asid % ARM_MAX_ASID) != 0); - - pmap_simple_lock(&pmaps_lock); - setbit(asid, (int *) asid_bitmap); + unsigned int vasid; + uint16_t hw_asid = pmap->hw_asid; + assert(hw_asid != 0); // Should not try to free kernel ASID #if __ARM_KERNEL_PROTECT__ - assert((asid | 1) < MAX_ASID); - assert((asid | 1) != asid); - setbit(asid | 1, (int *) asid_bitmap); -#endif /* __ARM_KERNEL_PROTECT__ */ + hw_asid >>= 1; +#endif + hw_asid -= 1; - pmap_simple_unlock(&pmaps_lock); + vasid = ((unsigned int)pmap->sw_asid * MAX_HW_ASID) + hw_asid; + vasid = MAX_ASID - 1 - vasid; + + pmap_simple_lock(&asid_lock); + assert(!bitmap_test(&asid_bitmap[0], vasid)); + bitmap_set(&asid_bitmap[0], vasid); + pmap_simple_unlock(&asid_lock); } + #ifndef PMAP_PV_LOAD_FACTOR #define PMAP_PV_LOAD_FACTOR 1 #endif @@ -1931,8 +2298,8 @@ pv_list_free( static inline void pv_water_mark_check(void) { - if ((pv_free_count < pv_low_water_mark) || (pv_kern_free_count < pv_kern_low_water_mark)) { - if (!mappingrecurse && hw_compare_and_store(0, 1, &mappingrecurse)) { + if (__improbable((pv_free_count < pv_low_water_mark) || (pv_kern_free_count < pv_kern_low_water_mark))) { + if (!mappingrecurse && os_atomic_cmpxchg(&mappingrecurse, 0, 1, acq_rel)) { thread_wakeup(&mapping_replenish_event); } } @@ -2134,7 +2501,8 @@ mapping_free_prime(void) kr = mapping_free_prime_internal(); if (kr != KERN_SUCCESS) { - panic("%s: failed, kr=%d", __FUNCTION__, kr); + panic("%s: failed, kr=%d", + __FUNCTION__, kr); } } @@ -2147,7 +2515,8 @@ mapping_adjust(void) mres = kernel_thread_start_priority((thread_continue_t)mapping_replenish, NULL, MAXPRI_KERNEL, &mapping_replenish_thread); if (mres != KERN_SUCCESS) { - panic("pmap: mapping_replenish thread creation failed"); + panic("%s: mapping_replenish thread creation failed", + __FUNCTION__); } thread_deallocate(mapping_replenish_thread); } @@ -2331,7 +2700,8 @@ ptd_alloc_unlinked(bool reclaim) ptd_free_list = (pt_desc_t *)(*(void **)ptdp); ptd_free_count--; } else { - panic("out of ptd entry\n"); + panic("%s: out of ptd entry", + __FUNCTION__); } if (!ptd_preboot) { @@ -2343,9 +2713,9 @@ ptd_alloc_unlinked(bool reclaim) ptdp->pmap = NULL; for (i = 0; i < PT_INDEX_MAX; i++) { - ptdp->pt_map[i].va = (vm_offset_t)-1; - ptdp->pt_cnt[i].refcnt = 0; - ptdp->pt_cnt[i].wiredcnt = 0; + ptdp->ptd_info[i].va = (vm_offset_t)-1; + ptdp->ptd_info[i].refcnt = 0; + ptdp->ptd_info[i].wiredcnt = 0; } return ptdp; @@ -2379,7 +2749,10 @@ ptd_deallocate(pt_desc_t *ptdp) pmap_t pmap = ptdp->pmap; if (ptd_preboot) { - panic("ptd_deallocate(): early boot\n"); + panic("%s: early boot, " + "ptdp=%p", + __FUNCTION__, + ptdp); } if (ptdp->pt_page.next != NULL) { @@ -2406,21 +2779,21 @@ ptd_init( pt_entry_t *pte_p) { if (ptdp->pmap != pmap) { - panic("ptd_init(): pmap mismatch\n"); + panic("%s: pmap mismatch, " + "ptdp=%p, pmap=%p, va=%p, level=%u, pte_p=%p", + __FUNCTION__, + ptdp, pmap, (void*)va, level, pte_p); } #if (__ARM_VMSA__ == 7) assert(level == 2); - ptdp->pt_map[ARM_PT_DESC_INDEX(pte_p)].va = (vm_offset_t) va & ~(ARM_TT_L1_PT_OFFMASK); + ptdp->ptd_info[ARM_PT_DESC_INDEX(pte_p)].va = (vm_offset_t) va & ~(ARM_TT_L1_PT_OFFMASK); #else - if (level == 3) { - ptdp->pt_map[ARM_PT_DESC_INDEX(pte_p)].va = (vm_offset_t) va & ~ARM_TT_L2_OFFMASK; - } else if (level == 2) { - ptdp->pt_map[ARM_PT_DESC_INDEX(pte_p)].va = (vm_offset_t) va & ~ARM_TT_L1_OFFMASK; - } + assert(level > pt_attr_root_level(pmap_get_pt_attr(pmap))); + ptdp->ptd_info[ARM_PT_DESC_INDEX(pte_p)].va = (vm_offset_t) va & ~(pt_attr_ln_offmask(pmap_get_pt_attr(pmap), level - 1)); #endif if (level < PMAP_TT_MAX_LEVEL) { - ptdp->pt_cnt[ARM_PT_DESC_INDEX(pte_p)].refcnt = PT_DESC_REFCOUNT; + ptdp->ptd_info[ARM_PT_DESC_INDEX(pte_p)].refcnt = PT_DESC_REFCOUNT; } } @@ -2442,10 +2815,12 @@ static inline tt_entry_t * pmap_tte(pmap_t pmap, vm_map_address_t addr) { - if (!(tte_index(pmap, addr) < pmap->tte_index_max)) { + __unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap); + + if (!(tte_index(pmap, pt_attr, addr) < pmap->tte_index_max)) { return (tt_entry_t *)NULL; } - return &pmap->tte[tte_index(pmap, addr)]; + return &pmap->tte[tte_index(pmap, pt_attr, addr)]; } @@ -2470,11 +2845,14 @@ pmap_pte( return PT_ENTRY_NULL; } tte = *ttp; - #if MACH_ASSERT +#if MACH_ASSERT if ((tte & ARM_TTE_TYPE_MASK) == ARM_TTE_TYPE_BLOCK) { - panic("Attempt to demote L1 block: pmap=%p, va=0x%llx, tte=0x%llx\n", pmap, (uint64_t)addr, (uint64_t)tte); + panic("%s: Attempt to demote L1 block, tte=0x%lx, " + "pmap=%p, addr=%p", + __FUNCTION__, (unsigned long)tte, + pmap, (void*)addr); } - #endif +#endif if ((tte & ARM_TTE_TYPE_MASK) != ARM_TTE_TYPE_TABLE) { return PT_ENTRY_NULL; } @@ -2482,8 +2860,75 @@ pmap_pte( return ptp; } +__unused static inline tt_entry_t * +pmap_ttne(pmap_t pmap, + unsigned int target_level, + vm_map_address_t addr) +{ + tt_entry_t * ret_ttep = NULL; + + switch (target_level) { + case 1: + ret_ttep = pmap_tte(pmap, addr); + break; + case 2: + ret_ttep = (tt_entry_t *)pmap_pte(pmap, addr); + break; + default: + panic("%s: bad level, " + "pmap=%p, target_level=%u, addr=%p", + __FUNCTION__, + pmap, target_level, (void *)addr); + } + + return ret_ttep; +} + #else +static inline tt_entry_t * +pmap_ttne(pmap_t pmap, + unsigned int target_level, + vm_map_address_t addr) +{ + tt_entry_t * ttp = NULL; + tt_entry_t * ttep = NULL; + tt_entry_t tte = ARM_TTE_EMPTY; + unsigned int cur_level; + + const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap); + + ttp = pmap->tte; + + assert(target_level <= pt_attr->pta_max_level); + + for (cur_level = pt_attr->pta_root_level; cur_level <= target_level; cur_level++) { + ttep = &ttp[ttn_index(pmap, pt_attr, addr, cur_level)]; + + if (cur_level == target_level) { + break; + } + + tte = *ttep; + +#if MACH_ASSERT + if ((tte & (ARM_TTE_TYPE_MASK | ARM_TTE_VALID)) == (ARM_TTE_TYPE_BLOCK | ARM_TTE_VALID)) { + panic("%s: Attempt to demote L%u block, tte=0x%llx, " + "pmap=%p, target_level=%u, addr=%p", + __FUNCTION__, cur_level, tte, + pmap, target_level, (void*)addr); + } +#endif + if ((tte & (ARM_TTE_TYPE_MASK | ARM_TTE_VALID)) != (ARM_TTE_TYPE_TABLE | ARM_TTE_VALID)) { + return TT_ENTRY_NULL; + } + + ttp = (tt_entry_t*)phystokv(tte & ARM_TTE_TABLE_MASK); + } + + return ttep; +} + /* * Given an offset and a map, compute the address of level 1 translation table entry. * If the tranlation is invalid then PT_ENTRY_NULL is returned. @@ -2492,14 +2937,7 @@ static inline tt_entry_t * pmap_tt1e(pmap_t pmap, vm_map_address_t addr) { - /* Level 0 currently unused */ -#if __ARM64_TWO_LEVEL_PMAP__ -#pragma unused(pmap, addr) - panic("pmap_tt1e called on a two level pmap"); - return NULL; -#else - return &pmap->tte[tt1_index(pmap, addr)]; -#endif + return pmap_ttne(pmap, PMAP_TT_L1_LEVEL, addr); } /* @@ -2510,26 +2948,7 @@ static inline tt_entry_t * pmap_tt2e(pmap_t pmap, vm_map_address_t addr) { -#if __ARM64_TWO_LEVEL_PMAP__ - return &pmap->tte[tt2_index(pmap, addr)]; -#else - tt_entry_t *ttp; - tt_entry_t tte; - - ttp = pmap_tt1e(pmap, addr); - tte = *ttp; - #if MACH_ASSERT - if ((tte & (ARM_TTE_TYPE_MASK | ARM_TTE_VALID)) == (ARM_TTE_TYPE_BLOCK | ARM_TTE_VALID)) { - panic("Attempt to demote L1 block (?!): pmap=%p, va=0x%llx, tte=0x%llx\n", pmap, (uint64_t)addr, (uint64_t)tte); - } - #endif - if ((tte & (ARM_TTE_TYPE_MASK | ARM_TTE_VALID)) != (ARM_TTE_TYPE_TABLE | ARM_TTE_VALID)) { - return PT_ENTRY_NULL; - } - - ttp = &((tt_entry_t*) phystokv(tte & ARM_TTE_TABLE_MASK))[tt2_index(pmap, addr)]; - return (tt_entry_t *)ttp; -#endif + return pmap_ttne(pmap, PMAP_TT_L2_LEVEL, addr); } @@ -2542,32 +2961,9 @@ pmap_tt3e( pmap_t pmap, vm_map_address_t addr) { - pt_entry_t *ptp; - tt_entry_t *ttp; - tt_entry_t tte; - - ttp = pmap_tt2e(pmap, addr); - if (ttp == PT_ENTRY_NULL) { - return PT_ENTRY_NULL; - } - - tte = *ttp; - -#if MACH_ASSERT - if ((tte & (ARM_TTE_TYPE_MASK | ARM_TTE_VALID)) == (ARM_TTE_TYPE_BLOCK | ARM_TTE_VALID)) { - panic("Attempt to demote L2 block: pmap=%p, va=0x%llx, tte=0x%llx\n", pmap, (uint64_t)addr, (uint64_t)tte); - } -#endif - if ((tte & (ARM_TTE_TYPE_MASK | ARM_TTE_VALID)) != (ARM_TTE_TYPE_TABLE | ARM_TTE_VALID)) { - return PT_ENTRY_NULL; - } - - /* Get third-level (4KB) entry */ - ptp = &(((pt_entry_t*) phystokv(tte & ARM_TTE_TABLE_MASK))[tt3_index(pmap, addr)]); - return ptp; + return (pt_entry_t*)pmap_ttne(pmap, PMAP_TT_L3_LEVEL, addr); } - static inline tt_entry_t * pmap_tte( pmap_t pmap, @@ -2576,7 +2972,6 @@ pmap_tte( return pmap_tt2e(pmap, addr); } - static inline pt_entry_t * pmap_pte( pmap_t pmap, @@ -2588,6 +2983,10 @@ pmap_pte( #endif + + + + /* * Map memory at initialization. The physical addresses being * mapped are not managed and are never unmapped. @@ -2650,6 +3049,12 @@ pmap_map_bd_with_options( case PMAP_MAP_BD_POSTED: mem_attr = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED); break; + case PMAP_MAP_BD_POSTED_REORDERED: + mem_attr = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED_REORDERED); + break; + case PMAP_MAP_BD_POSTED_COMBINED_REORDERED: + mem_attr = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED_COMBINED_REORDERED); + break; default: mem_attr = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_DISABLE); break; @@ -2666,9 +3071,13 @@ pmap_map_bd_with_options( while (paddr < end) { ptep = pmap_pte(kernel_pmap, vaddr); if (ptep == PT_ENTRY_NULL) { - panic("pmap_map_bd"); + panic("%s: no PTE for vaddr=%p, " + "virt=%p, start=%p, end=%p, prot=0x%x, options=0x%x", + __FUNCTION__, (void*)vaddr, + (void*)virt, (void*)start, (void*)end, prot, options); } - assert(!ARM_PTE_IS_COMPRESSED(*ptep)); + + assert(!ARM_PTE_IS_COMPRESSED(*ptep, ptep)); WRITE_PTE_STRONG(ptep, tmplate); pte_increment_pa(tmplate); @@ -2717,7 +3126,7 @@ pmap_map_bd( if (ptep == PT_ENTRY_NULL) { panic("pmap_map_bd"); } - assert(!ARM_PTE_IS_COMPRESSED(*ptep)); + assert(!ARM_PTE_IS_COMPRESSED(*ptep, ptep)); WRITE_PTE_STRONG(ptep, tmplate); pte_increment_pa(tmplate); @@ -2763,24 +3172,30 @@ pmap_map_high_window_bd( len += offset; if (len > (va_max - va_start)) { - panic("pmap_map_high_window_bd: area too large\n"); + panic("%s: area too large, " + "pa_start=%p, len=%p, prot=0x%x", + __FUNCTION__, + (void*)pa_start, (void*)len, prot); } scan: for (; va_start < va_max; va_start += PAGE_SIZE) { ptep = pmap_pte(kernel_pmap, va_start); - assert(!ARM_PTE_IS_COMPRESSED(*ptep)); + assert(!ARM_PTE_IS_COMPRESSED(*ptep, ptep)); if (*ptep == ARM_PTE_TYPE_FAULT) { break; } } if (va_start > va_max) { - panic("pmap_map_high_window_bd: insufficient pages\n"); + panic("%s: insufficient pages, " + "pa_start=%p, len=%p, prot=0x%x", + __FUNCTION__, + (void*)pa_start, (void*)len, prot); } for (va_end = va_start + PAGE_SIZE; va_end < va_start + len; va_end += PAGE_SIZE) { ptep = pmap_pte(kernel_pmap, va_end); - assert(!ARM_PTE_IS_COMPRESSED(*ptep)); + assert(!ARM_PTE_IS_COMPRESSED(*ptep, ptep)); if (*ptep != ARM_PTE_TYPE_FAULT) { va_start = va_end + PAGE_SIZE; goto scan; @@ -2803,7 +3218,7 @@ scan: #endif /* __ARM_KERNEL_PROTECT__ */ WRITE_PTE_STRONG(ptep, pte); } - PMAP_UPDATE_TLBS(kernel_pmap, va_start, va_start + len); + PMAP_UPDATE_TLBS(kernel_pmap, va_start, va_start + len, false); #if KASAN kasan_notify_address(va_start, len); #endif @@ -2835,32 +3250,18 @@ pmap_compute_io_rgns(void) panic("pmap I/O region %u addr 0x%llx is not page-aligned", i, ranges[i].addr); } if (ranges[i].len & PAGE_MASK) { - panic("pmap I/O region %u length 0x%x is not page-aligned", i, ranges[i].len); + panic("pmap I/O region %u length 0x%llx is not page-aligned", i, ranges[i].len); } if (os_add_overflow(ranges[i].addr, ranges[i].len, &rgn_end)) { - panic("pmap I/O region %u addr 0x%llx length 0x%x wraps around", i, ranges[i].addr, ranges[i].len); - } - if ((i == 0) || (ranges[i].addr < io_rgn_start)) { - io_rgn_start = ranges[i].addr; + panic("pmap I/O region %u addr 0x%llx length 0x%llx wraps around", i, ranges[i].addr, ranges[i].len); } - if ((i == 0) || (rgn_end > io_rgn_end)) { - io_rgn_end = rgn_end; + if (((ranges[i].addr <= gPhysBase) && (rgn_end > gPhysBase)) || + ((ranges[i].addr < avail_end) && (rgn_end >= avail_end)) || + ((ranges[i].addr > gPhysBase) && (rgn_end < avail_end))) { + panic("pmap I/O region %u addr 0x%llx length 0x%llx overlaps physical memory", i, ranges[i].addr, ranges[i].len); } - ++num_io_rgns; - } - if (io_rgn_start & PAGE_MASK) { - panic("pmap I/O region start is not page-aligned!\n"); - } - - if (io_rgn_end & PAGE_MASK) { - panic("pmap I/O region end is not page-aligned!\n"); - } - - if (((io_rgn_start <= gPhysBase) && (io_rgn_end > gPhysBase)) || - ((io_rgn_start < avail_end) && (io_rgn_end >= avail_end)) || - ((io_rgn_start > gPhysBase) && (io_rgn_end < avail_end))) { - panic("pmap I/O region overlaps physical memory!\n"); + ++num_io_rgns; } return num_io_rgns * sizeof(*ranges); @@ -2931,51 +3332,48 @@ pmap_get_arm64_prot( pmap_t pmap, vm_offset_t addr) { - uint64_t tte; - uint64_t tt_type, table_ap, table_xn, table_pxn; - uint64_t prot = 0; - - tte = *pmap_tt1e(pmap, addr); - - if (!(tte & ARM_TTE_VALID)) { - return 0; - } - - tt_type = tte & ARM_TTE_TYPE_MASK; + tt_entry_t tte = 0; + unsigned int level = 0; + uint64_t tte_type = 0; + uint64_t effective_prot_bits = 0; + uint64_t aggregate_tte = 0; + uint64_t table_ap_bits = 0, table_xn = 0, table_pxn = 0; + const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap); - if (tt_type == ARM_TTE_TYPE_BLOCK) { - return prot | (tte & ARM_TTE_BLOCK_NX) | (tte & ARM_TTE_BLOCK_PNX) | (tte & ARM_TTE_BLOCK_APMASK) | ARM_TTE_VALID; - } + for (level = pt_attr->pta_root_level; level <= pt_attr->pta_max_level; level++) { + tte = *pmap_ttne(pmap, level, addr); - table_ap = (tte >> ARM_TTE_TABLE_APSHIFT) & 0x3; - table_xn = tte & ARM_TTE_TABLE_XN; - table_pxn = tte & ARM_TTE_TABLE_PXN; + if (!(tte & ARM_TTE_VALID)) { + return 0; + } - prot |= (table_ap << ARM_TTE_BLOCK_APSHIFT) | (table_xn ? ARM_TTE_BLOCK_NX : 0) | (table_pxn ? ARM_TTE_BLOCK_PNX : 0); + tte_type = tte & ARM_TTE_TYPE_MASK; - tte = *pmap_tt2e(pmap, addr); - if (!(tte & ARM_TTE_VALID)) { - return 0; + if ((tte_type == ARM_TTE_TYPE_BLOCK) || + (level == pt_attr->pta_max_level)) { + /* Block or page mapping; both have the same protection bit layout. */ + break; + } else if (tte_type == ARM_TTE_TYPE_TABLE) { + /* All of the table bits we care about are overrides, so just OR them together. */ + aggregate_tte |= tte; + } } - tt_type = tte & ARM_TTE_TYPE_MASK; - - if (tt_type == ARM_TTE_TYPE_BLOCK) { - return prot | (tte & ARM_TTE_BLOCK_NX) | (tte & ARM_TTE_BLOCK_PNX) | (tte & ARM_TTE_BLOCK_APMASK) | ARM_TTE_VALID; - } + table_ap_bits = ((aggregate_tte >> ARM_TTE_TABLE_APSHIFT) & AP_MASK); + table_xn = (aggregate_tte & ARM_TTE_TABLE_XN); + table_pxn = (aggregate_tte & ARM_TTE_TABLE_PXN); - table_ap = (tte >> ARM_TTE_TABLE_APSHIFT) & 0x3; - table_xn = tte & ARM_TTE_TABLE_XN; - table_pxn = tte & ARM_TTE_TABLE_PXN; + /* Start with the PTE bits. */ + effective_prot_bits = tte & (ARM_PTE_APMASK | ARM_PTE_NX | ARM_PTE_PNX); - prot |= (table_ap << ARM_TTE_BLOCK_APSHIFT) | (table_xn ? ARM_TTE_BLOCK_NX : 0) | (table_pxn ? ARM_TTE_BLOCK_PNX : 0); + /* Table AP bits mask out block/page AP bits */ + effective_prot_bits &= ~(ARM_PTE_AP(table_ap_bits)); - tte = *pmap_tt3e(pmap, addr); - if (!(tte & ARM_TTE_VALID)) { - return 0; - } + /* XN/PXN bits can be OR'd in. */ + effective_prot_bits |= (table_xn ? ARM_PTE_NX : 0); + effective_prot_bits |= (table_pxn ? ARM_PTE_PNX : 0); - return prot | (tte & ARM_TTE_BLOCK_NX) | (tte & ARM_TTE_BLOCK_PNX) | (tte & ARM_TTE_BLOCK_APMASK) | ARM_TTE_VALID; + return effective_prot_bits; } #endif /* __arm64__ */ @@ -3012,7 +3410,6 @@ pmap_bootstrap( vm_size_t pp_attr_table_size; vm_size_t io_attr_table_size; unsigned int npages; - unsigned int i; vm_map_offset_t maxoffset; lck_grp_init(&pmap_lck_grp, "pmap", LCK_GRP_ATTR_NULL); @@ -3028,6 +3425,12 @@ pmap_bootstrap( * Initialize the kernel pmap. */ pmap_stamp = 1; +#if ARM_PARAMETERIZED_PMAP + kernel_pmap->pmap_pt_attr = native_pt_attr; +#endif /* ARM_PARAMETERIZED_PMAP */ +#if HAS_APPLE_PAC + kernel_pmap->disable_jop = 0; +#endif /* HAS_APPLE_PAC */ kernel_pmap->tte = cpu_tte; kernel_pmap->ttep = cpu_ttep; #if (__ARM_VMSA__ > 7) @@ -3036,7 +3439,7 @@ pmap_bootstrap( kernel_pmap->min = VM_MIN_KERNEL_AND_KEXT_ADDRESS; #endif kernel_pmap->max = VM_MAX_KERNEL_ADDRESS; - kernel_pmap->ref_count = 1; + os_atomic_init(&kernel_pmap->ref_count, 1); kernel_pmap->gc_status = 0; kernel_pmap->nx_enabled = TRUE; #ifdef __arm64__ @@ -3044,7 +3447,7 @@ pmap_bootstrap( #else kernel_pmap->is_64bit = FALSE; #endif - kernel_pmap->stamp = hw_atomic_add(&pmap_stamp, 1); + kernel_pmap->stamp = os_atomic_inc(&pmap_stamp, relaxed); kernel_pmap->nested_region_grand_addr = 0x0ULL; kernel_pmap->nested_region_subord_addr = 0x0ULL; @@ -3054,10 +3457,10 @@ pmap_bootstrap( #if (__ARM_VMSA__ == 7) kernel_pmap->tte_index_max = 4 * NTTES; -#else - kernel_pmap->tte_index_max = (ARM_PGBYTES / sizeof(tt_entry_t)); #endif kernel_pmap->prev_tte = (tt_entry_t *) NULL; + kernel_pmap->hw_asid = 0; + kernel_pmap->sw_asid = 0; PMAP_LOCK_INIT(kernel_pmap); #if (__ARM_VMSA__ == 7) @@ -3096,6 +3499,8 @@ pmap_bootstrap( vm_last_phys = trunc_page(avail_end); simple_lock_init(&pmaps_lock, 0); + simple_lock_init(&asid_lock, 0); + simple_lock_init(&tt1_lock, 0); queue_init(&map_pmap_list); queue_enter(&map_pmap_list, kernel_pmap, pmap_t, pmaps); free_page_size_tt_list = TT_FREE_ENTRY_NULL; @@ -3119,28 +3524,8 @@ pmap_bootstrap( virtual_space_start = vstart; virtual_space_end = VM_MAX_KERNEL_ADDRESS; - /* mark all the address spaces in use */ - for (i = 0; i < MAX_ASID / (sizeof(uint32_t) * NBBY); i++) { - asid_bitmap[i] = 0xffffffff; - } - - /* - * The kernel gets ASID 0, and all aliases of it. This is - * important because ASID 0 is global; if we vend ASID 0 - * out to a user pmap, those translations will show up in - * other processes through the TLB. - */ - for (i = 0; i < MAX_ASID; i += ARM_MAX_ASID) { - asid_bitmap[i / (sizeof(uint32_t) * NBBY)] &= ~(1 << (i % (sizeof(uint32_t) * NBBY))); - -#if __ARM_KERNEL_PROTECT__ - assert((i + 1) < MAX_ASID); - asid_bitmap[(i + 1) / (sizeof(uint32_t) * NBBY)] &= ~(1 << ((i + 1) % (sizeof(uint32_t) * NBBY))); -#endif /* __ARM_KERNEL_PROTECT__ */ - } + bitmap_full(&asid_bitmap[0], MAX_ASID); - kernel_pmap->asid = 0; - kernel_pmap->vasid = 0; if (PE_parse_boot_argn("arm_maxoffset", &maxoffset, sizeof(maxoffset))) { @@ -3287,7 +3672,8 @@ pmap_free_pages( boolean_t pmap_next_page_hi( - ppnum_t * pnum) + ppnum_t * pnum, + __unused boolean_t might_free) { return pmap_next_page(pnum); } @@ -3338,21 +3724,21 @@ pmap_init( pv_init(); /* - * The value of hard_maxproc may have been scaled, make sure - * it is still less than the value of MAX_ASID. + * The values of [hard_]maxproc may have been scaled, make sure + * they are still less than the value of MAX_ASID. */ - assert(hard_maxproc < MAX_ASID); + if (maxproc > MAX_ASID) { + maxproc = MAX_ASID; + } + if (hard_maxproc > MAX_ASID) { + hard_maxproc = MAX_ASID; + } #if CONFIG_PGTRACE pmap_pgtrace_init(); #endif } -void -pmap_pv_fixup(__unused vm_offset_t start, __unused vm_size_t length) -{ -} - boolean_t pmap_verify_free( ppnum_t ppnum) @@ -3398,7 +3784,7 @@ pmap_zone_init( PAGE_SIZE, "pmap"); } - +__dead2 void pmap_ledger_alloc_init(size_t size) { @@ -3407,17 +3793,15 @@ pmap_ledger_alloc_init(size_t size) __func__, size); } +__dead2 ledger_t pmap_ledger_alloc(void) { - ledger_t retval = NULL; - panic("%s: unsupported", __func__); - - return retval; } +__dead2 void pmap_ledger_free(ledger_t ledger) { @@ -3439,13 +3823,18 @@ pmap_ledger_free(ledger_t ledger) * is bounded by that size. */ MARK_AS_PMAP_TEXT static pmap_t -pmap_create_internal( +pmap_create_options_internal( ledger_t ledger, vm_map_size_t size, - boolean_t is_64bit) + unsigned int flags) { unsigned i; + unsigned tte_index_max; pmap_t p; + bool is_64bit = flags & PMAP_CREATE_64BIT; +#if defined(HAS_APPLE_PAC) + bool disable_jop = flags & PMAP_CREATE_DISABLE_JOP; +#endif /* defined(HAS_APPLE_PAC) */ /* * A software use-only map doesn't even need a pmap. @@ -3454,7 +3843,6 @@ pmap_create_internal( return PMAP_NULL; } - /* * Allocate a pmap struct from the pmap_zone. Then allocate * the translation table of the right size for the pmap. @@ -3463,7 +3851,7 @@ pmap_create_internal( return PMAP_NULL; } - if (is_64bit) { + if (flags & PMAP_CREATE_64BIT) { p->min = MACH_VM_MIN_ADDRESS; p->max = MACH_VM_MAX_ADDRESS; } else { @@ -3471,17 +3859,29 @@ pmap_create_internal( p->max = VM_MAX_ADDRESS; } +#if defined(HAS_APPLE_PAC) + p->disable_jop = disable_jop; +#endif /* defined(HAS_APPLE_PAC) */ + p->nested_region_true_start = 0; p->nested_region_true_end = ~0; - p->ref_count = 1; + os_atomic_init(&p->ref_count, 1); p->gc_status = 0; - p->stamp = hw_atomic_add(&pmap_stamp, 1); + p->stamp = os_atomic_inc(&pmap_stamp, relaxed); p->nx_enabled = TRUE; p->is_64bit = is_64bit; p->nested = FALSE; p->nested_pmap = PMAP_NULL; +#if ARM_PARAMETERIZED_PMAP + p->pmap_pt_attr = native_pt_attr; +#endif /* ARM_PARAMETERIZED_PMAP */ + + if (!pmap_get_pt_ops(p)->alloc_id(p)) { + goto id_alloc_fail; + } + p->ledger = ledger; @@ -3496,26 +3896,26 @@ pmap_create_internal( p->tt_entry_free = (tt_entry_t *)0; p->tte = pmap_tt1_allocate(p, PMAP_ROOT_ALLOC_SIZE, 0); + if (!(p->tte)) { + goto tt1_alloc_fail; + } + p->ttep = ml_static_vtop((vm_offset_t)p->tte); PMAP_TRACE(3, PMAP_CODE(PMAP__TTE), VM_KERNEL_ADDRHIDE(p), VM_KERNEL_ADDRHIDE(p->min), VM_KERNEL_ADDRHIDE(p->max), p->ttep); #if (__ARM_VMSA__ == 7) - p->tte_index_max = NTTES; + tte_index_max = p->tte_index_max = NTTES; #else - p->tte_index_max = (PMAP_ROOT_ALLOC_SIZE / sizeof(tt_entry_t)); + tte_index_max = (PMAP_ROOT_ALLOC_SIZE / sizeof(tt_entry_t)); #endif p->prev_tte = (tt_entry_t *) NULL; /* nullify the translation table */ - for (i = 0; i < p->tte_index_max; i++) { + for (i = 0; i < tte_index_max; i++) { p->tte[i] = ARM_TTE_TYPE_FAULT; } - FLUSH_PTE_RANGE(p->tte, p->tte + p->tte_index_max); - - /* assign a asid */ - p->vasid = alloc_asid(); - p->asid = p->vasid % ARM_MAX_ASID; + FLUSH_PTE_RANGE(p->tte, p->tte + tte_index_max); /* * initialize the rest of the structure @@ -3545,27 +3945,33 @@ pmap_create_internal( pmap_simple_unlock(&pmaps_lock); return p; + +tt1_alloc_fail: + pmap_get_pt_ops(p)->free_id(p); +id_alloc_fail: + zfree(pmap_zone, p); + return PMAP_NULL; } pmap_t -pmap_create( +pmap_create_options( ledger_t ledger, vm_map_size_t size, - boolean_t is_64bit) + unsigned int flags) { pmap_t pmap; - PMAP_TRACE(1, PMAP_CODE(PMAP__CREATE) | DBG_FUNC_START, size, is_64bit); + PMAP_TRACE(1, PMAP_CODE(PMAP__CREATE) | DBG_FUNC_START, size, flags); ledger_reference(ledger); - pmap = pmap_create_internal(ledger, size, is_64bit); + pmap = pmap_create_options_internal(ledger, size, flags); if (pmap == PMAP_NULL) { ledger_dereference(ledger); } - PMAP_TRACE(1, PMAP_CODE(PMAP__CREATE) | DBG_FUNC_END, VM_KERNEL_ADDRHIDE(pmap), pmap->vasid, pmap->asid); + PMAP_TRACE(1, PMAP_CODE(PMAP__CREATE) | DBG_FUNC_END, VM_KERNEL_ADDRHIDE(pmap), PMAP_VASID(pmap), pmap->hw_asid); return pmap; } @@ -3625,6 +4031,7 @@ pmap_set_process( { pmap_set_process_internal(pmap, pid, procname); } +#endif /* MACH_ASSERT */ /* * We maintain stats and ledgers so that a task's physical footprint is: @@ -3637,115 +4044,6 @@ pmap_set_process( * where "alternate_accounting" includes "iokit" and "purgeable" memory. */ -struct { - uint64_t num_pmaps_checked; - - int phys_footprint_over; - ledger_amount_t phys_footprint_over_total; - ledger_amount_t phys_footprint_over_max; - int phys_footprint_under; - ledger_amount_t phys_footprint_under_total; - ledger_amount_t phys_footprint_under_max; - - int internal_over; - ledger_amount_t internal_over_total; - ledger_amount_t internal_over_max; - int internal_under; - ledger_amount_t internal_under_total; - ledger_amount_t internal_under_max; - - int internal_compressed_over; - ledger_amount_t internal_compressed_over_total; - ledger_amount_t internal_compressed_over_max; - int internal_compressed_under; - ledger_amount_t internal_compressed_under_total; - ledger_amount_t internal_compressed_under_max; - - int iokit_mapped_over; - ledger_amount_t iokit_mapped_over_total; - ledger_amount_t iokit_mapped_over_max; - int iokit_mapped_under; - ledger_amount_t iokit_mapped_under_total; - ledger_amount_t iokit_mapped_under_max; - - int alternate_accounting_over; - ledger_amount_t alternate_accounting_over_total; - ledger_amount_t alternate_accounting_over_max; - int alternate_accounting_under; - ledger_amount_t alternate_accounting_under_total; - ledger_amount_t alternate_accounting_under_max; - - int alternate_accounting_compressed_over; - ledger_amount_t alternate_accounting_compressed_over_total; - ledger_amount_t alternate_accounting_compressed_over_max; - int alternate_accounting_compressed_under; - ledger_amount_t alternate_accounting_compressed_under_total; - ledger_amount_t alternate_accounting_compressed_under_max; - - int page_table_over; - ledger_amount_t page_table_over_total; - ledger_amount_t page_table_over_max; - int page_table_under; - ledger_amount_t page_table_under_total; - ledger_amount_t page_table_under_max; - - int purgeable_volatile_over; - ledger_amount_t purgeable_volatile_over_total; - ledger_amount_t purgeable_volatile_over_max; - int purgeable_volatile_under; - ledger_amount_t purgeable_volatile_under_total; - ledger_amount_t purgeable_volatile_under_max; - - int purgeable_nonvolatile_over; - ledger_amount_t purgeable_nonvolatile_over_total; - ledger_amount_t purgeable_nonvolatile_over_max; - int purgeable_nonvolatile_under; - ledger_amount_t purgeable_nonvolatile_under_total; - ledger_amount_t purgeable_nonvolatile_under_max; - - int purgeable_volatile_compressed_over; - ledger_amount_t purgeable_volatile_compressed_over_total; - ledger_amount_t purgeable_volatile_compressed_over_max; - int purgeable_volatile_compressed_under; - ledger_amount_t purgeable_volatile_compressed_under_total; - ledger_amount_t purgeable_volatile_compressed_under_max; - - int purgeable_nonvolatile_compressed_over; - ledger_amount_t purgeable_nonvolatile_compressed_over_total; - ledger_amount_t purgeable_nonvolatile_compressed_over_max; - int purgeable_nonvolatile_compressed_under; - ledger_amount_t purgeable_nonvolatile_compressed_under_total; - ledger_amount_t purgeable_nonvolatile_compressed_under_max; - - int network_volatile_over; - ledger_amount_t network_volatile_over_total; - ledger_amount_t network_volatile_over_max; - int network_volatile_under; - ledger_amount_t network_volatile_under_total; - ledger_amount_t network_volatile_under_max; - - int network_nonvolatile_over; - ledger_amount_t network_nonvolatile_over_total; - ledger_amount_t network_nonvolatile_over_max; - int network_nonvolatile_under; - ledger_amount_t network_nonvolatile_under_total; - ledger_amount_t network_nonvolatile_under_max; - - int network_volatile_compressed_over; - ledger_amount_t network_volatile_compressed_over_total; - ledger_amount_t network_volatile_compressed_over_max; - int network_volatile_compressed_under; - ledger_amount_t network_volatile_compressed_under_total; - ledger_amount_t network_volatile_compressed_under_max; - - int network_nonvolatile_compressed_over; - ledger_amount_t network_nonvolatile_compressed_over_total; - ledger_amount_t network_nonvolatile_compressed_over_max; - int network_nonvolatile_compressed_under; - ledger_amount_t network_nonvolatile_compressed_under_total; - ledger_amount_t network_nonvolatile_compressed_under_max; -} pmap_ledgers_drift; -#endif /* MACH_ASSERT */ /* * Retire the given physical map from service. @@ -3762,7 +4060,9 @@ pmap_destroy_internal( VALIDATE_PMAP(pmap); - int32_t ref_count = __c11_atomic_fetch_sub(&pmap->ref_count, 1, memory_order_relaxed) - 1; + __unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap); + + int32_t ref_count = os_atomic_dec(&pmap->ref_count, relaxed); if (ref_count > 0) { return; } else if (ref_count < 0) { @@ -3771,9 +4071,11 @@ pmap_destroy_internal( panic("pmap %p: attempt to destroy kernel pmap", pmap); } -#if (__ARM_VMSA__ == 7) pt_entry_t *ttep; - unsigned int i; + +#if (__ARM_VMSA__ > 7) + pmap_unmap_sharedpage(pmap); +#endif /* (__ARM_VMSA__ > 7) */ pmap_simple_lock(&pmaps_lock); while (pmap->gc_status & PMAP_GC_INFLIGHT) { @@ -3786,9 +4088,14 @@ pmap_destroy_internal( queue_remove(&map_pmap_list, pmap, pmap_t, pmaps); pmap_simple_unlock(&pmaps_lock); +#if (__ARM_VMSA__ == 7) if (pmap->cpu_ref != 0) { - panic("pmap_destroy(%p): cpu_ref = %u", pmap, pmap->cpu_ref); + panic("%s: cpu_ref=%u, " + "pmap=%p", + __FUNCTION__, pmap->cpu_ref, + pmap); } +#endif /* (__ARM_VMSA__ == 7) */ pmap_trim_self(pmap); @@ -3796,6 +4103,9 @@ pmap_destroy_internal( * Free the memory maps, then the * pmap structure. */ +#if (__ARM_VMSA__ == 7) + unsigned int i = 0; + PMAP_LOCK(pmap); for (i = 0; i < pmap->tte_index_max; i++) { ttep = &pmap->tte[i]; @@ -3804,91 +4114,57 @@ pmap_destroy_internal( } } PMAP_UNLOCK(pmap); +#else /* (__ARM_VMSA__ == 7) */ + vm_map_address_t c; + unsigned int level; + + for (level = pt_attr->pta_max_level - 1; level >= pt_attr->pta_root_level; level--) { + for (c = pmap->min; c < pmap->max; c += pt_attr_ln_size(pt_attr, level)) { + ttep = pmap_ttne(pmap, level, c); + + if ((ttep != PT_ENTRY_NULL) && (*ttep & ARM_TTE_TYPE_MASK) == ARM_TTE_TYPE_TABLE) { + PMAP_LOCK(pmap); + pmap_tte_deallocate(pmap, ttep, level); + PMAP_UNLOCK(pmap); + } + } + } +#endif /* (__ARM_VMSA__ == 7) */ + + if (pmap->tte) { +#if (__ARM_VMSA__ == 7) pmap_tt1_deallocate(pmap, pmap->tte, pmap->tte_index_max * sizeof(tt_entry_t), 0); + pmap->tte_index_max = 0; +#else /* (__ARM_VMSA__ == 7) */ + pmap_tt1_deallocate(pmap, pmap->tte, PMAP_ROOT_ALLOC_SIZE, 0); +#endif /* (__ARM_VMSA__ == 7) */ pmap->tte = (tt_entry_t *) NULL; pmap->ttep = 0; - pmap->tte_index_max = 0; } + +#if (__ARM_VMSA__ == 7) if (pmap->prev_tte) { pmap_tt1_deallocate(pmap, pmap->prev_tte, PMAP_ROOT_ALLOC_SIZE, 0); pmap->prev_tte = (tt_entry_t *) NULL; } +#endif /* (__ARM_VMSA__ == 7) */ + assert((tt_free_entry_t*)pmap->tt_entry_free == NULL); - flush_mmu_tlb_asid(pmap->asid); + pmap_get_pt_ops(pmap)->flush_tlb_async(pmap); + sync_tlb_flush(); + /* return its asid to the pool */ - free_asid(pmap->vasid); + pmap_get_pt_ops(pmap)->free_id(pmap); pmap_check_ledgers(pmap); - - if (pmap->nested_region_asid_bitmap) { - kfree(pmap->nested_region_asid_bitmap, pmap->nested_region_asid_bitmap_size * sizeof(unsigned int)); - } - zfree(pmap_zone, pmap); -#else /* __ARM_VMSA__ == 7 */ - pt_entry_t *ttep; - pmap_paddr_t pa; - vm_map_address_t c; - - pmap_unmap_sharedpage(pmap); - - pmap_simple_lock(&pmaps_lock); - while (pmap->gc_status & PMAP_GC_INFLIGHT) { - pmap->gc_status |= PMAP_GC_WAIT; - assert_wait((event_t) &pmap->gc_status, THREAD_UNINT); - pmap_simple_unlock(&pmaps_lock); - (void) thread_block(THREAD_CONTINUE_NULL); - pmap_simple_lock(&pmaps_lock); - } - queue_remove(&map_pmap_list, pmap, pmap_t, pmaps); - pmap_simple_unlock(&pmaps_lock); - - pmap_trim_self(pmap); - - /* - * Free the memory maps, then the - * pmap structure. - */ - for (c = pmap->min; c < pmap->max; c += ARM_TT_L2_SIZE) { - ttep = pmap_tt2e(pmap, c); - if ((ttep != PT_ENTRY_NULL) && (*ttep & ARM_TTE_TYPE_MASK) == ARM_TTE_TYPE_TABLE) { - PMAP_LOCK(pmap); - pmap_tte_deallocate(pmap, ttep, PMAP_TT_L2_LEVEL); - PMAP_UNLOCK(pmap); - } - } -#if !__ARM64_TWO_LEVEL_PMAP__ - for (c = pmap->min; c < pmap->max; c += ARM_TT_L1_SIZE) { - ttep = pmap_tt1e(pmap, c); - if ((ttep != PT_ENTRY_NULL) && (*ttep & ARM_TTE_TYPE_MASK) == ARM_TTE_TYPE_TABLE) { - PMAP_LOCK(pmap); - pmap_tte_deallocate(pmap, ttep, PMAP_TT_L1_LEVEL); - PMAP_UNLOCK(pmap); - } - } -#endif - - - if (pmap->tte) { - pa = pmap->ttep; - pmap_tt1_deallocate(pmap, (tt_entry_t *)phystokv(pa), PMAP_ROOT_ALLOC_SIZE, 0); - } - - assert((tt_free_entry_t*)pmap->tt_entry_free == NULL); - flush_mmu_tlb_asid((uint64_t)(pmap->asid) << TLBI_ASID_SHIFT); - free_asid(pmap->vasid); - if (pmap->nested_region_asid_bitmap) { kfree(pmap->nested_region_asid_bitmap, pmap->nested_region_asid_bitmap_size * sizeof(unsigned int)); } - pmap_check_ledgers(pmap); - zfree(pmap_zone, pmap); - -#endif /* __ARM_VMSA__ == 7 */ } void @@ -3897,7 +4173,7 @@ pmap_destroy( { ledger_t ledger; - PMAP_TRACE(1, PMAP_CODE(PMAP__DESTROY) | DBG_FUNC_START, VM_KERNEL_ADDRHIDE(pmap), pmap->vasid, pmap->asid); + PMAP_TRACE(1, PMAP_CODE(PMAP__DESTROY) | DBG_FUNC_START, VM_KERNEL_ADDRHIDE(pmap), PMAP_VASID(pmap), pmap->hw_asid); ledger = pmap->ledger; @@ -3918,7 +4194,7 @@ pmap_reference_internal( { if (pmap != PMAP_NULL) { VALIDATE_PMAP(pmap); - __c11_atomic_fetch_add(&pmap->ref_count, 1, memory_order_relaxed); + os_atomic_inc(&pmap->ref_count, relaxed); } } @@ -3935,43 +4211,35 @@ pmap_tt1_allocate( vm_size_t size, unsigned option) { - tt_entry_t *tt1; + tt_entry_t *tt1 = NULL; tt_free_entry_t *tt1_free; pmap_paddr_t pa; vm_address_t va; vm_address_t va_end; kern_return_t ret; - pmap_simple_lock(&pmaps_lock); + pmap_simple_lock(&tt1_lock); if ((size == PAGE_SIZE) && (free_page_size_tt_count != 0)) { free_page_size_tt_count--; tt1 = (tt_entry_t *)free_page_size_tt_list; free_page_size_tt_list = ((tt_free_entry_t *)tt1)->next; - pmap_simple_unlock(&pmaps_lock); - pmap_tt_ledger_credit(pmap, size); - return (tt_entry_t *)tt1; - } - ; - if ((size == 2 * PAGE_SIZE) && (free_two_page_size_tt_count != 0)) { + } else if ((size == 2 * PAGE_SIZE) && (free_two_page_size_tt_count != 0)) { free_two_page_size_tt_count--; tt1 = (tt_entry_t *)free_two_page_size_tt_list; free_two_page_size_tt_list = ((tt_free_entry_t *)tt1)->next; - pmap_simple_unlock(&pmaps_lock); - pmap_tt_ledger_credit(pmap, size); - return (tt_entry_t *)tt1; - } - ; - if (free_tt_count != 0) { + } else if ((size < PAGE_SIZE) && (free_tt_count != 0)) { free_tt_count--; tt1 = (tt_entry_t *)free_tt_list; free_tt_list = (tt_free_entry_t *)((tt_free_entry_t *)tt1)->next; - pmap_simple_unlock(&pmaps_lock); + } + + pmap_simple_unlock(&tt1_lock); + + if (tt1 != NULL) { pmap_tt_ledger_credit(pmap, size); return (tt_entry_t *)tt1; } - pmap_simple_unlock(&pmaps_lock); - ret = pmap_pages_alloc(&pa, (unsigned)((size < PAGE_SIZE)? PAGE_SIZE : size), ((option & PMAP_TT_ALLOCATE_NOWAIT)? PMAP_PAGES_ALLOCATE_NOWAIT : 0)); if (ret == KERN_RESOURCE_SHORTAGE) { @@ -3980,19 +4248,22 @@ pmap_tt1_allocate( if (size < PAGE_SIZE) { - pmap_simple_lock(&pmaps_lock); - - for (va_end = phystokv(pa) + PAGE_SIZE, va = phystokv(pa) + size; va < va_end; va = va + size) { + va = phystokv(pa) + size; + tt_free_entry_t *local_free_list = (tt_free_entry_t*)va; + tt_free_entry_t *next_free = NULL; + for (va_end = phystokv(pa) + PAGE_SIZE; va < va_end; va = va + size) { tt1_free = (tt_free_entry_t *)va; - tt1_free->next = free_tt_list; - free_tt_list = tt1_free; - free_tt_count++; + tt1_free->next = next_free; + next_free = tt1_free; } + pmap_simple_lock(&tt1_lock); + local_free_list->next = free_tt_list; + free_tt_list = next_free; + free_tt_count += ((PAGE_SIZE / size) - 1); if (free_tt_count > free_tt_max) { free_tt_max = free_tt_count; } - - pmap_simple_unlock(&pmaps_lock); + pmap_simple_unlock(&tt1_lock); } /* Always report root allocations in units of PMAP_ROOT_ALLOC_SIZE, which can be obtained by sysctl arm_pt_root_size. @@ -4014,9 +4285,8 @@ pmap_tt1_deallocate( tt_free_entry_t *tt_entry; tt_entry = (tt_free_entry_t *)tt; - if (not_in_kdp) { - pmap_simple_lock(&pmaps_lock); - } + assert(not_in_kdp); + pmap_simple_lock(&tt1_lock); if (size < PAGE_SIZE) { free_tt_count++; @@ -4045,10 +4315,8 @@ pmap_tt1_deallocate( free_two_page_size_tt_list = tt_entry; } - if ((option & PMAP_TT_DEALLOCATE_NOBLOCK) || (!not_in_kdp)) { - if (not_in_kdp) { - pmap_simple_unlock(&pmaps_lock); - } + if (option & PMAP_TT_DEALLOCATE_NOBLOCK) { + pmap_simple_unlock(&tt1_lock); pmap_tt_ledger_debit(pmap, size); return; } @@ -4058,13 +4326,13 @@ pmap_tt1_deallocate( tt = (tt_entry_t *)free_page_size_tt_list; free_page_size_tt_list = ((tt_free_entry_t *)tt)->next; - pmap_simple_unlock(&pmaps_lock); + pmap_simple_unlock(&tt1_lock); pmap_pages_free(ml_static_vtop((vm_offset_t)tt), PAGE_SIZE); OSAddAtomic(-(int32_t)(PAGE_SIZE / PMAP_ROOT_ALLOC_SIZE), (pmap == kernel_pmap ? &inuse_kernel_tteroot_count : &inuse_user_tteroot_count)); - pmap_simple_lock(&pmaps_lock); + pmap_simple_lock(&tt1_lock); } while (free_two_page_size_tt_count > FREE_TWO_PAGE_SIZE_TT_MAX) { @@ -4072,15 +4340,15 @@ pmap_tt1_deallocate( tt = (tt_entry_t *)free_two_page_size_tt_list; free_two_page_size_tt_list = ((tt_free_entry_t *)tt)->next; - pmap_simple_unlock(&pmaps_lock); + pmap_simple_unlock(&tt1_lock); pmap_pages_free(ml_static_vtop((vm_offset_t)tt), 2 * PAGE_SIZE); OSAddAtomic(-2 * (int32_t)(PAGE_SIZE / PMAP_ROOT_ALLOC_SIZE), (pmap == kernel_pmap ? &inuse_kernel_tteroot_count : &inuse_user_tteroot_count)); - pmap_simple_lock(&pmaps_lock); + pmap_simple_lock(&tt1_lock); } - pmap_simple_unlock(&pmaps_lock); + pmap_simple_unlock(&tt1_lock); pmap_tt_ledger_debit(pmap, size); } @@ -4177,20 +4445,20 @@ pmap_tt_deallocate( ptdp = ptep_get_ptd((vm_offset_t)ttp); - ptdp->pt_map[ARM_PT_DESC_INDEX(ttp)].va = (vm_offset_t)-1; + ptdp->ptd_info[ARM_PT_DESC_INDEX(ttp)].va = (vm_offset_t)-1; - if ((level < PMAP_TT_MAX_LEVEL) && (ptdp->pt_cnt[ARM_PT_DESC_INDEX(ttp)].refcnt == PT_DESC_REFCOUNT)) { - ptdp->pt_cnt[ARM_PT_DESC_INDEX(ttp)].refcnt = 0; + if ((level < PMAP_TT_MAX_LEVEL) && (ptdp->ptd_info[ARM_PT_DESC_INDEX(ttp)].refcnt == PT_DESC_REFCOUNT)) { + ptdp->ptd_info[ARM_PT_DESC_INDEX(ttp)].refcnt = 0; } - if (ptdp->pt_cnt[ARM_PT_DESC_INDEX(ttp)].refcnt != 0) { - panic("pmap_tt_deallocate(): ptdp %p, count %d\n", ptdp, ptdp->pt_cnt[ARM_PT_DESC_INDEX(ttp)].refcnt); + if (ptdp->ptd_info[ARM_PT_DESC_INDEX(ttp)].refcnt != 0) { + panic("pmap_tt_deallocate(): ptdp %p, count %d\n", ptdp, ptdp->ptd_info[ARM_PT_DESC_INDEX(ttp)].refcnt); } - ptdp->pt_cnt[ARM_PT_DESC_INDEX(ttp)].refcnt = 0; + ptdp->ptd_info[ARM_PT_DESC_INDEX(ttp)].refcnt = 0; for (i = 0, pt_acc_cnt = 0; i < max_pt_index; i++) { - pt_acc_cnt += ptdp->pt_cnt[i].refcnt; + pt_acc_cnt += ptdp->ptd_info[i].refcnt; } if (pt_acc_cnt == 0) { @@ -4261,9 +4529,9 @@ pmap_tte_remove( panic("pmap_tte_deallocate(): null tt_entry ttep==%p\n", ttep); } - if (((level + 1) == PMAP_TT_MAX_LEVEL) && (tte_get_ptd(tte)->pt_cnt[ARM_PT_DESC_INDEX(ttetokv(*ttep))].refcnt != 0)) { + if (((level + 1) == PMAP_TT_MAX_LEVEL) && (tte_get_ptd(tte)->ptd_info[ARM_PT_DESC_INDEX(ttetokv(*ttep))].refcnt != 0)) { panic("pmap_tte_deallocate(): pmap=%p ttep=%p ptd=%p refcnt=0x%x \n", pmap, ttep, - tte_get_ptd(tte), (tte_get_ptd(tte)->pt_cnt[ARM_PT_DESC_INDEX(ttetokv(*ttep))].refcnt)); + tte_get_ptd(tte), (tte_get_ptd(tte)->ptd_info[ARM_PT_DESC_INDEX(ttetokv(*ttep))].refcnt)); } #if (__ARM_VMSA__ == 7) @@ -4311,7 +4579,7 @@ pmap_tte_deallocate( unsigned i; for (i = 0; i < (ARM_PGBYTES / sizeof(*pte_p)); i++, pte_p++) { - if (ARM_PTE_IS_COMPRESSED(*pte_p)) { + if (ARM_PTE_IS_COMPRESSED(*pte_p, pte_p)) { panic("pmap_tte_deallocate: tte=0x%llx pmap=%p, pte_p=%p, pte=0x%llx compressed\n", (uint64_t)tte, pmap, pte_p, (uint64_t)(*pte_p)); } else if (((*pte_p) & ARM_PTE_TYPE_MASK) != ARM_PTE_TYPE_FAULT) { @@ -4353,8 +4621,13 @@ pmap_remove_range( pt_entry_t *epte, uint32_t *rmv_cnt) { - return pmap_remove_range_options(pmap, va, bpte, epte, rmv_cnt, - PMAP_OPTIONS_REMOVE); + bool need_strong_sync = false; + int num_changed = pmap_remove_range_options(pmap, va, bpte, epte, rmv_cnt, + &need_strong_sync, PMAP_OPTIONS_REMOVE); + if (num_changed > 0) { + PMAP_UPDATE_TLBS(pmap, va, va + (PAGE_SIZE * (epte - bpte)), need_strong_sync); + } + return num_changed; } @@ -4500,6 +4773,7 @@ pmap_remove_range_options( pt_entry_t *bpte, pt_entry_t *epte, uint32_t *rmv_cnt, + bool *need_strong_sync __unused, int options) { pt_entry_t *cpte; @@ -4539,7 +4813,7 @@ pmap_remove_range_options( while (!managed) { if (pmap != kernel_pmap && (options & PMAP_OPTIONS_REMOVE) && - (ARM_PTE_IS_COMPRESSED(spte))) { + (ARM_PTE_IS_COMPRESSED(spte, cpte))) { /* * "pmap" must be locked at this point, * so this should not race with another @@ -4560,7 +4834,7 @@ pmap_remove_range_options( * our "compressed" markers, * so let's update it here. */ - if (OSAddAtomic16(-1, (SInt16 *) &(ptep_get_ptd(cpte)->pt_cnt[ARM_PT_DESC_INDEX(cpte)].refcnt)) <= 0) { + if (OSAddAtomic16(-1, (SInt16 *) &(ptep_get_ptd(cpte)->ptd_info[ARM_PT_DESC_INDEX(cpte)].refcnt)) <= 0) { panic("pmap_remove_range_options: over-release of ptdp %p for pte %p\n", ptep_get_ptd(cpte), cpte); } spte = *cpte; @@ -4586,7 +4860,7 @@ pmap_remove_range_options( UNLOCK_PVH(pai); } - if (ARM_PTE_IS_COMPRESSED(*cpte)) { + if (ARM_PTE_IS_COMPRESSED(*cpte, cpte)) { /* * There used to be a valid mapping here but it * has already been removed when the page was @@ -4598,7 +4872,8 @@ pmap_remove_range_options( /* remove the translation, do not flush the TLB */ if (*cpte != ARM_PTE_TYPE_FAULT) { - assert(!ARM_PTE_IS_COMPRESSED(*cpte)); + assertf(!ARM_PTE_IS_COMPRESSED(*cpte, cpte), "unexpected compressed pte %p (=0x%llx)", cpte, (uint64_t)*cpte); + assertf((*cpte & ARM_PTE_TYPE_VALID) == ARM_PTE_TYPE, "invalid pte %p (=0x%llx)", cpte, (uint64_t)*cpte); #if MACH_ASSERT if (managed && (pmap != kernel_pmap) && (ptep_get_va(cpte) != va)) { panic("pmap_remove_range_options(): cpte=%p ptd=%p pte=0x%llx va=0x%llx\n", @@ -4611,8 +4886,9 @@ pmap_remove_range_options( if ((spte != ARM_PTE_TYPE_FAULT) && (pmap != kernel_pmap)) { - assert(!ARM_PTE_IS_COMPRESSED(spte)); - if (OSAddAtomic16(-1, (SInt16 *) &(ptep_get_ptd(cpte)->pt_cnt[ARM_PT_DESC_INDEX(cpte)].refcnt)) <= 0) { + assertf(!ARM_PTE_IS_COMPRESSED(spte, cpte), "unexpected compressed pte %p (=0x%llx)", cpte, (uint64_t)spte); + assertf((spte & ARM_PTE_TYPE_VALID) == ARM_PTE_TYPE, "invalid pte %p (=0x%llx)", cpte, (uint64_t)spte); + if (OSAddAtomic16(-1, (SInt16 *) &(ptep_get_ptd(cpte)->ptd_info[ARM_PT_DESC_INDEX(cpte)].refcnt)) <= 0) { panic("pmap_remove_range_options: over-release of ptdp %p for pte %p\n", ptep_get_ptd(cpte), cpte); } if (rmv_cnt) { @@ -4777,17 +5053,22 @@ pmap_remove_options_internal( vm_map_address_t end, int options) { - int remove_count = 0; + int remove_count = 0; pt_entry_t *bpte, *epte; pt_entry_t *pte_p; tt_entry_t *tte_p; uint32_t rmv_spte = 0; + bool need_strong_sync = false; + bool flush_tte = false; if (__improbable(end < start)) { panic("%s: invalid address range %p, %p", __func__, (void*)start, (void*)end); } VALIDATE_PMAP(pmap); + + __unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap); + PMAP_LOCK(pmap); tte_p = pmap_tte(pmap, start); @@ -4799,28 +5080,27 @@ pmap_remove_options_internal( if ((*tte_p & ARM_TTE_TYPE_MASK) == ARM_TTE_TYPE_TABLE) { pte_p = (pt_entry_t *) ttetokv(*tte_p); bpte = &pte_p[ptenum(start)]; - epte = bpte + ((end - start) >> ARM_TT_LEAF_SHIFT); + epte = bpte + ((end - start) >> pt_attr_leaf_shift(pt_attr)); remove_count += pmap_remove_range_options(pmap, start, bpte, epte, - &rmv_spte, options); + &rmv_spte, &need_strong_sync, options); -#if (__ARM_VMSA__ == 7) - if (rmv_spte && (ptep_get_ptd(pte_p)->pt_cnt[ARM_PT_DESC_INDEX(pte_p)].refcnt == 0) && - (pmap != kernel_pmap) && (pmap->nested == FALSE)) { - pmap_tte_deallocate(pmap, tte_p, PMAP_TT_L1_LEVEL); - flush_mmu_tlb_entry((start & ~ARM_TT_L1_OFFMASK) | (pmap->asid & 0xff)); - } -#else - if (rmv_spte && (ptep_get_ptd(pte_p)->pt_cnt[ARM_PT_DESC_INDEX(pte_p)].refcnt == 0) && + if (rmv_spte && (ptep_get_ptd(pte_p)->ptd_info[ARM_PT_DESC_INDEX(pte_p)].refcnt == 0) && (pmap != kernel_pmap) && (pmap->nested == FALSE)) { - pmap_tte_deallocate(pmap, tte_p, PMAP_TT_L2_LEVEL); - flush_mmu_tlb_entry(tlbi_addr(start & ~ARM_TT_L2_OFFMASK) | tlbi_asid(pmap->asid)); + pmap_tte_deallocate(pmap, tte_p, pt_attr_twig_level(pt_attr)); + flush_tte = true; } -#endif } done: PMAP_UNLOCK(pmap); + + if (remove_count > 0) { + PMAP_UPDATE_TLBS(pmap, start, end, need_strong_sync); + } else if (flush_tte > 0) { + pmap_get_pt_ops(pmap)->flush_tlb_tte_async(start, pmap); + sync_tlb_flush(); + } return remove_count; } @@ -4838,6 +5118,8 @@ pmap_remove_options( return; } + __unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap); + PMAP_TRACE(2, PMAP_CODE(PMAP__REMOVE) | DBG_FUNC_START, VM_KERNEL_ADDRHIDE(pmap), VM_KERNEL_ADDRHIDE(start), VM_KERNEL_ADDRHIDE(end)); @@ -4860,11 +5142,7 @@ pmap_remove_options( while (va < end) { vm_map_address_t l; -#if (__ARM_VMSA__ == 7) - l = ((va + ARM_TT_L1_SIZE) & ~ARM_TT_L1_OFFMASK); -#else - l = ((va + ARM_TT_L2_SIZE) & ~ARM_TT_L2_OFFMASK); -#endif + l = ((va + pt_attr_twig_size(pt_attr)) & ~pt_attr_twig_offmask(pt_attr)); if (l > end) { l = end; } @@ -4874,10 +5152,6 @@ pmap_remove_options( va = l; } - if (remove_count > 0) { - PMAP_UPDATE_TLBS(pmap, start, end); - } - PMAP_TRACE(2, PMAP_CODE(PMAP__REMOVE) | DBG_FUNC_END); } @@ -4905,12 +5179,11 @@ pmap_set_pmap( #if __ARM_USER_PROTECT__ if (pmap->tte_index_max == NTTES) { thread->machine.uptw_ttc = 2; - thread->machine.uptw_ttb = ((unsigned int) pmap->ttep) | TTBR_SETUP; } else { - thread->machine.uptw_ttc = 1; \ - thread->machine.uptw_ttb = ((unsigned int) pmap->ttep) | TTBR_SETUP; + thread->machine.uptw_ttc = 1; } - thread->machine.asid = pmap->asid; + thread->machine.uptw_ttb = ((unsigned int) pmap->ttep) | TTBR_SETUP; + thread->machine.asid = pmap->hw_asid; #endif } @@ -4918,9 +5191,9 @@ static void pmap_flush_core_tlb_asid(pmap_t pmap) { #if (__ARM_VMSA__ == 7) - flush_core_tlb_asid(pmap->asid); + flush_core_tlb_asid(pmap->hw_asid); #else - flush_core_tlb_asid(((uint64_t) pmap->asid) << TLBI_ASID_SHIFT); + flush_core_tlb_asid(((uint64_t) pmap->hw_asid) << TLBI_ASID_SHIFT); #endif } @@ -4930,34 +5203,42 @@ pmap_switch_internal( { VALIDATE_PMAP(pmap); pmap_cpu_data_t *cpu_data_ptr = pmap_get_cpu_data(); - uint32_t last_asid_high_bits, asid_high_bits; - boolean_t do_asid_flush = FALSE; + uint16_t asid_index = pmap->hw_asid; + boolean_t do_asid_flush = FALSE; + +#if __ARM_KERNEL_PROTECT__ + asid_index >>= 1; +#endif #if (__ARM_VMSA__ == 7) - if (not_in_kdp) { - pmap_simple_lock(&pmap->tt1_lock); - } + assert(not_in_kdp); + pmap_simple_lock(&pmap->tt1_lock); #else pmap_t last_nested_pmap = cpu_data_ptr->cpu_nested_pmap; #endif - /* Paranoia. */ - assert(pmap->asid < (sizeof(cpu_data_ptr->cpu_asid_high_bits) / sizeof(*cpu_data_ptr->cpu_asid_high_bits))); +#if MAX_ASID > MAX_HW_ASID + if (asid_index > 0) { + asid_index -= 1; + /* Paranoia. */ + assert(asid_index < (sizeof(cpu_data_ptr->cpu_asid_high_bits) / sizeof(*cpu_data_ptr->cpu_asid_high_bits))); - /* Extract the "virtual" bits of the ASIDs (which could cause us to alias). */ - asid_high_bits = pmap->vasid >> ARM_ASID_SHIFT; - last_asid_high_bits = (uint32_t) cpu_data_ptr->cpu_asid_high_bits[pmap->asid]; + /* Extract the "virtual" bits of the ASIDs (which could cause us to alias). */ + uint8_t asid_high_bits = pmap->sw_asid; + uint8_t last_asid_high_bits = cpu_data_ptr->cpu_asid_high_bits[asid_index]; - if (asid_high_bits != last_asid_high_bits) { - /* - * If the virtual ASID of the new pmap does not match the virtual ASID - * last seen on this CPU for the physical ASID (that was a mouthful), - * then this switch runs the risk of aliasing. We need to flush the - * TLB for this phyiscal ASID in this case. - */ - cpu_data_ptr->cpu_asid_high_bits[pmap->asid] = (uint8_t) asid_high_bits; - do_asid_flush = TRUE; + if (asid_high_bits != last_asid_high_bits) { + /* + * If the virtual ASID of the new pmap does not match the virtual ASID + * last seen on this CPU for the physical ASID (that was a mouthful), + * then this switch runs the risk of aliasing. We need to flush the + * TLB for this phyiscal ASID in this case. + */ + cpu_data_ptr->cpu_asid_high_bits[asid_index] = asid_high_bits; + do_asid_flush = TRUE; + } } +#endif /* MAX_ASID > MAX_HW_ASID */ pmap_switch_user_ttb_internal(pmap); @@ -4972,12 +5253,13 @@ pmap_switch_internal( #endif if (do_asid_flush) { pmap_flush_core_tlb_asid(pmap); +#if DEVELOPMENT || DEBUG + os_atomic_inc(&pmap_asid_flushes, relaxed); +#endif } #if (__ARM_VMSA__ == 7) - if (not_in_kdp) { - pmap_simple_unlock(&pmap->tt1_lock); - } + pmap_simple_unlock(&pmap->tt1_lock); #endif } @@ -4985,7 +5267,7 @@ void pmap_switch( pmap_t pmap) { - PMAP_TRACE(1, PMAP_CODE(PMAP__SWITCH) | DBG_FUNC_START, VM_KERNEL_ADDRHIDE(pmap), pmap->vasid, pmap->asid); + PMAP_TRACE(1, PMAP_CODE(PMAP__SWITCH) | DBG_FUNC_START, VM_KERNEL_ADDRHIDE(pmap), PMAP_VASID(pmap), pmap->hw_asid); pmap_switch_internal(pmap); PMAP_TRACE(1, PMAP_CODE(PMAP__SWITCH) | DBG_FUNC_END); } @@ -5156,7 +5438,7 @@ pmap_page_protect_options_internal( pmap != kernel_pmap && (options & PMAP_OPTIONS_COMPRESSOR) && IS_INTERNAL_PAGE(pai)) { - assert(!ARM_PTE_IS_COMPRESSED(*pte_p)); + assert(!ARM_PTE_IS_COMPRESSED(*pte_p, pte_p)); /* mark this PTE as having been "compressed" */ tmplate = ARM_PTE_COMPRESSED; if (is_altacct) { @@ -5170,7 +5452,7 @@ pmap_page_protect_options_internal( if ((*pte_p != ARM_PTE_TYPE_FAULT) && tmplate == ARM_PTE_TYPE_FAULT && (pmap != kernel_pmap)) { - if (OSAddAtomic16(-1, (SInt16 *) &(ptep_get_ptd(pte_p)->pt_cnt[ARM_PT_DESC_INDEX(pte_p)].refcnt)) <= 0) { + if (OSAddAtomic16(-1, (SInt16 *) &(ptep_get_ptd(pte_p)->ptd_info[ARM_PT_DESC_INDEX(pte_p)].refcnt)) <= 0) { panic("pmap_page_protect_options(): over-release of ptdp %p for pte %p\n", ptep_get_ptd(pte_p), pte_p); } } @@ -5264,58 +5546,29 @@ pmap_page_protect_options_internal( } } else { pt_entry_t spte; + const pt_attr_t *const pt_attr = pmap_get_pt_attr(pmap); spte = *pte_p; if (pmap == kernel_pmap) { tmplate = ((spte & ~ARM_PTE_APMASK) | ARM_PTE_AP(AP_RONA)); } else { - tmplate = ((spte & ~ARM_PTE_APMASK) | ARM_PTE_AP(AP_RORO)); + tmplate = ((spte & ~ARM_PTE_APMASK) | pt_attr_leaf_ro(pt_attr)); } pte_set_was_writeable(tmplate, false); - -#if (__ARM_VMSA__ == 7) - if (set_NX) { - tmplate |= ARM_PTE_NX; - } else { - /* - * While the naive implementation of this would serve to add execute - * permission, this is not how the VM uses this interface, or how - * x86_64 implements it. So ignore requests to add execute permissions. - */ -#if 0 - tmplate &= ~ARM_PTE_NX; -#else - ; -#endif - } -#else + /* + * While the naive implementation of this would serve to add execute + * permission, this is not how the VM uses this interface, or how + * x86_64 implements it. So ignore requests to add execute permissions. + */ if (set_NX) { - tmplate |= ARM_PTE_NX | ARM_PTE_PNX; - } else { - /* - * While the naive implementation of this would serve to add execute - * permission, this is not how the VM uses this interface, or how - * x86_64 implements it. So ignore requests to add execute permissions. - */ -#if 0 - if (pmap == kernel_pmap) { - tmplate &= ~ARM_PTE_PNX; - tmplate |= ARM_PTE_NX; - } else { - tmplate &= ~ARM_PTE_NX; - tmplate |= ARM_PTE_PNX; - } -#else - ; -#endif + tmplate |= pt_attr_leaf_xn(pt_attr); } -#endif if (*pte_p != ARM_PTE_TYPE_FAULT && - !ARM_PTE_IS_COMPRESSED(*pte_p) && + !ARM_PTE_IS_COMPRESSED(*pte_p, pte_p) && *pte_p != tmplate) { WRITE_PTE_STRONG(pte_p, tmplate); update = TRUE; @@ -5325,7 +5578,7 @@ pmap_page_protect_options_internal( /* Invalidate TLBs for all CPUs using it */ if (update) { tlb_flush_needed = TRUE; - flush_mmu_tlb_region_asid_async(va, PAGE_SIZE, pmap); + pmap_get_pt_ops(pmap)->flush_tlb_region_async(va, PAGE_SIZE, pmap); } #ifdef PVH_FLAG_IOMMU @@ -5436,18 +5689,16 @@ pmap_protect_options_internal( unsigned int options, __unused void *args) { - tt_entry_t *tte_p; - pt_entry_t *bpte_p, *epte_p; - pt_entry_t *pte_p; - boolean_t set_NX = TRUE; + const pt_attr_t *const pt_attr = pmap_get_pt_attr(pmap); + tt_entry_t *tte_p; + pt_entry_t *bpte_p, *epte_p; + pt_entry_t *pte_p; + boolean_t set_NX = TRUE; #if (__ARM_VMSA__ > 7) - boolean_t set_XO = FALSE; -#endif - boolean_t should_have_removed = FALSE; - -#ifndef __ARM_IC_NOALIAS_ICACHE__ - boolean_t InvalidatePoU_Icache_Done = FALSE; + boolean_t set_XO = FALSE; #endif + boolean_t should_have_removed = FALSE; + bool need_strong_sync = false; if (__improbable(end < start)) { panic("%s called with bogus range: %p, %p", __func__, (void*)start, (void*)end); @@ -5518,7 +5769,7 @@ pmap_protect_options_internal( spte = *pte_p; if ((spte == ARM_PTE_TYPE_FAULT) || - ARM_PTE_IS_COMPRESSED(spte)) { + ARM_PTE_IS_COMPRESSED(spte, pte_p)) { continue; } @@ -5549,7 +5800,7 @@ pmap_protect_options_internal( } if ((spte == ARM_PTE_TYPE_FAULT) || - ARM_PTE_IS_COMPRESSED(spte)) { + ARM_PTE_IS_COMPRESSED(spte, pte_p)) { continue; } @@ -5569,11 +5820,11 @@ pmap_protect_options_internal( #if DEVELOPMENT || DEBUG if ((options & PMAP_OPTIONS_PROTECT_IMMEDIATE) && (prot & VM_PROT_WRITE)) { force_write = TRUE; - tmplate = ((spte & ~ARM_PTE_APMASK) | ARM_PTE_AP(AP_RWRW)); + tmplate = ((spte & ~ARM_PTE_APMASK) | pt_attr_leaf_rw(pt_attr)); } else #endif { - tmplate = ((spte & ~ARM_PTE_APMASK) | ARM_PTE_AP(AP_RORO)); + tmplate = ((spte & ~ARM_PTE_APMASK) | pt_attr_leaf_ro(pt_attr)); } } @@ -5587,34 +5838,23 @@ pmap_protect_options_internal( * not allowed to increase * access permissions. */ -#if (__ARM_VMSA__ == 7) - if (set_NX) { - tmplate |= ARM_PTE_NX; - } else { - /* do NOT clear "NX"! */ - } -#else if (set_NX) { - tmplate |= ARM_PTE_NX | ARM_PTE_PNX; + tmplate |= pt_attr_leaf_xn(pt_attr); } else { +#if (__ARM_VMSA__ > 7) if (pmap == kernel_pmap) { - /* - * TODO: Run CS/Monitor checks here; - * should we be clearing PNX here? Is - * this just for dtrace? - */ - tmplate &= ~ARM_PTE_PNX; + /* do NOT clear "PNX"! */ tmplate |= ARM_PTE_NX; } else { /* do NOT clear "NX"! */ - tmplate |= ARM_PTE_PNX; + tmplate |= pt_attr_leaf_x(pt_attr); if (set_XO) { tmplate &= ~ARM_PTE_APMASK; - tmplate |= ARM_PTE_AP(AP_RONA); + tmplate |= pt_attr_leaf_rona(pt_attr); } } - } #endif + } #if DEVELOPMENT || DEBUG if (force_write) { @@ -5658,19 +5898,6 @@ pmap_protect_options_internal( /* We do not expect to write fast fault the entry. */ pte_set_was_writeable(tmplate, false); - /* TODO: Doesn't this need to worry about PNX? */ - if (((spte & ARM_PTE_NX) == ARM_PTE_NX) && (prot & VM_PROT_EXECUTE)) { - CleanPoU_DcacheRegion((vm_offset_t) phystokv(pa), PAGE_SIZE); -#ifdef __ARM_IC_NOALIAS_ICACHE__ - InvalidatePoU_IcacheRegion((vm_offset_t) phystokv(pa), PAGE_SIZE); -#else - if (!InvalidatePoU_Icache_Done) { - InvalidatePoU_Icache(); - InvalidatePoU_Icache_Done = TRUE; - } -#endif - } - WRITE_PTE_FAST(pte_p, tmplate); if (managed) { @@ -5678,9 +5905,8 @@ pmap_protect_options_internal( UNLOCK_PVH(pai); } } - FLUSH_PTE_RANGE_STRONG(bpte_p, epte_p); - PMAP_UPDATE_TLBS(pmap, start, end); + PMAP_UPDATE_TLBS(pmap, start, end, need_strong_sync); } PMAP_UNLOCK(pmap); @@ -5697,6 +5923,8 @@ pmap_protect_options( { vm_map_address_t l, beg; + __unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap); + if ((b | e) & PAGE_MASK) { panic("pmap_protect_options() pmap %p start 0x%llx end 0x%llx\n", pmap, (uint64_t)b, (uint64_t)e); @@ -5733,7 +5961,7 @@ pmap_protect_options( beg = b; while (beg < e) { - l = ((beg + ARM_TT_TWIG_SIZE) & ~ARM_TT_TWIG_OFFMASK); + l = ((beg + pt_attr_twig_size(pt_attr)) & ~pt_attr_twig_offmask(pt_attr)); if (l > e) { l = e; @@ -5817,7 +6045,7 @@ static inline void pmap_enter_pte(pmap_t pmap, pt_entry_t *pte_p, pt_entry_t pte, vm_map_address_t v) { if (pmap != kernel_pmap && ((pte & ARM_PTE_WIRED) != (*pte_p & ARM_PTE_WIRED))) { - SInt16 *ptd_wiredcnt_ptr = (SInt16 *)&(ptep_get_ptd(pte_p)->pt_cnt[ARM_PT_DESC_INDEX(pte_p)].wiredcnt); + SInt16 *ptd_wiredcnt_ptr = (SInt16 *)&(ptep_get_ptd(pte_p)->ptd_info[ARM_PT_DESC_INDEX(pte_p)].wiredcnt); if (pte & ARM_PTE_WIRED) { OSAddAtomic16(1, ptd_wiredcnt_ptr); pmap_ledger_credit(pmap, task_ledgers.wired_mem, PAGE_SIZE); @@ -5829,9 +6057,9 @@ pmap_enter_pte(pmap_t pmap, pt_entry_t *pte_p, pt_entry_t pte, vm_map_address_t } } if (*pte_p != ARM_PTE_TYPE_FAULT && - !ARM_PTE_IS_COMPRESSED(*pte_p)) { + !ARM_PTE_IS_COMPRESSED(*pte_p, pte_p)) { WRITE_PTE_STRONG(pte_p, pte); - PMAP_UPDATE_TLBS(pmap, v, v + PAGE_SIZE); + PMAP_UPDATE_TLBS(pmap, v, v + PAGE_SIZE, false); } else { WRITE_PTE(pte_p, pte); __builtin_arm_isb(ISB_SY); @@ -5840,7 +6068,7 @@ pmap_enter_pte(pmap_t pmap, pt_entry_t *pte_p, pt_entry_t pte, vm_map_address_t PMAP_TRACE(3, PMAP_CODE(PMAP__TTE), VM_KERNEL_ADDRHIDE(pmap), VM_KERNEL_ADDRHIDE(v), VM_KERNEL_ADDRHIDE(v + PAGE_SIZE), pte); } -static pt_entry_t +MARK_AS_PMAP_TEXT static pt_entry_t wimg_to_pte(unsigned int wimg) { pt_entry_t pte; @@ -5855,6 +6083,14 @@ wimg_to_pte(unsigned int wimg) pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED); pte |= ARM_PTE_NX | ARM_PTE_PNX; break; + case VM_WIMG_POSTED_REORDERED: + pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED_REORDERED); + pte |= ARM_PTE_NX | ARM_PTE_PNX; + break; + case VM_WIMG_POSTED_COMBINED_REORDERED: + pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED_COMBINED_REORDERED); + pte |= ARM_PTE_NX | ARM_PTE_PNX; + break; case VM_WIMG_WCOMB: pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_WRITECOMB); pte |= ARM_PTE_NX | ARM_PTE_PNX; @@ -6062,6 +6298,8 @@ pmap_enter_options_internal( VALIDATE_PMAP(pmap); + __unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap); + if ((v) & PAGE_MASK) { panic("pmap_enter_options() pmap %p v 0x%llx\n", pmap, (uint64_t)v); @@ -6123,7 +6361,7 @@ Pmap_enter_retry: spte = *pte_p; - if (ARM_PTE_IS_COMPRESSED(spte)) { + if (ARM_PTE_IS_COMPRESSED(spte, pte_p)) { /* * "pmap" should be locked at this point, so this should * not race with another pmap_enter() or pmap_remove_range(). @@ -6158,7 +6396,6 @@ Pmap_enter_retry: if ((spte != ARM_PTE_TYPE_FAULT) && (pte_to_pa(spte) != pa)) { pmap_remove_range(pmap, v, pte_p, pte_p + 1, 0); - PMAP_UPDATE_TLBS(pmap, v, v + PAGE_SIZE); } pte = pa_to_pte(pa) | ARM_PTE_TYPE; @@ -6171,21 +6408,17 @@ Pmap_enter_retry: pte |= ARM_PTE_WIRED; } -#if (__ARM_VMSA__ == 7) - if (set_NX) { - pte |= ARM_PTE_NX; - } -#else if (set_NX) { - pte |= ARM_PTE_NX | ARM_PTE_PNX; + pte |= pt_attr_leaf_xn(pt_attr); } else { +#if (__ARM_VMSA__ > 7) if (pmap == kernel_pmap) { pte |= ARM_PTE_NX; } else { - pte |= ARM_PTE_PNX; + pte |= pt_attr_leaf_x(pt_attr); } - } #endif + } if (pmap == kernel_pmap) { #if __ARM_KERNEL_PROTECT__ @@ -6204,12 +6437,12 @@ Pmap_enter_retry: } #endif } else { - if (!(pmap->nested)) { + if (!pmap->nested) { pte |= ARM_PTE_NG; } else if ((pmap->nested_region_asid_bitmap) && (v >= pmap->nested_region_subord_addr) && (v < (pmap->nested_region_subord_addr + pmap->nested_region_size))) { - unsigned int index = (unsigned int)((v - pmap->nested_region_subord_addr) >> ARM_TT_TWIG_SHIFT); + unsigned int index = (unsigned int)((v - pmap->nested_region_subord_addr) >> pt_attr_twig_shift(pt_attr)); if ((pmap->nested_region_asid_bitmap) && testbit(index, (int *)pmap->nested_region_asid_bitmap)) { @@ -6227,9 +6460,9 @@ Pmap_enter_retry: && (nest_vaddr < (pmap->nested_region_subord_addr + pmap->nested_region_size)) && ((nest_pte_p = pmap_pte(pmap->nested_pmap, nest_vaddr)) != PT_ENTRY_NULL) && (*nest_pte_p != ARM_PTE_TYPE_FAULT) - && (!ARM_PTE_IS_COMPRESSED(*nest_pte_p)) + && (!ARM_PTE_IS_COMPRESSED(*nest_pte_p, nest_pte_p)) && (((*nest_pte_p) & ARM_PTE_NG) != ARM_PTE_NG)) { - unsigned int index = (unsigned int)((v - pmap->nested_region_subord_addr) >> ARM_TT_TWIG_SHIFT); + unsigned int index = (unsigned int)((v - pmap->nested_region_subord_addr) >> pt_attr_twig_shift(pt_attr)); if ((pmap->nested_pmap->nested_region_asid_bitmap) && !testbit(index, (int *)pmap->nested_pmap->nested_region_asid_bitmap)) { @@ -6243,33 +6476,33 @@ Pmap_enter_retry: if (pa_valid(pa) && (!pa_test_bits(pa, PP_ATTR_MODIFIED))) { if (fault_type & VM_PROT_WRITE) { if (set_XO) { - pte |= ARM_PTE_AP(AP_RWNA); + pte |= pt_attr_leaf_rwna(pt_attr); } else { - pte |= ARM_PTE_AP(AP_RWRW); + pte |= pt_attr_leaf_rw(pt_attr); } pa_set_bits(pa, PP_ATTR_REFERENCED | PP_ATTR_MODIFIED); } else { if (set_XO) { - pte |= ARM_PTE_AP(AP_RONA); + pte |= pt_attr_leaf_rona(pt_attr); } else { - pte |= ARM_PTE_AP(AP_RORO); + pte |= pt_attr_leaf_ro(pt_attr); } pa_set_bits(pa, PP_ATTR_REFERENCED); pte_set_was_writeable(pte, true); } } else { if (set_XO) { - pte |= ARM_PTE_AP(AP_RWNA); + pte |= pt_attr_leaf_rwna(pt_attr); } else { - pte |= ARM_PTE_AP(AP_RWRW); + pte |= pt_attr_leaf_rw(pt_attr); } pa_set_bits(pa, PP_ATTR_REFERENCED); } } else { if (set_XO) { - pte |= ARM_PTE_AP(AP_RONA); + pte |= pt_attr_leaf_rona(pt_attr); } else { - pte |= ARM_PTE_AP(AP_RORO); + pte |= pt_attr_leaf_ro(pt_attr);; } pa_set_bits(pa, PP_ATTR_REFERENCED); } @@ -6280,8 +6513,8 @@ Pmap_enter_retry: volatile uint16_t *refcnt = NULL; volatile uint16_t *wiredcnt = NULL; if (pmap != kernel_pmap) { - refcnt = &(ptep_get_ptd(pte_p)->pt_cnt[ARM_PT_DESC_INDEX(pte_p)].refcnt); - wiredcnt = &(ptep_get_ptd(pte_p)->pt_cnt[ARM_PT_DESC_INDEX(pte_p)].wiredcnt); + refcnt = &(ptep_get_ptd(pte_p)->ptd_info[ARM_PT_DESC_INDEX(pte_p)].refcnt); + wiredcnt = &(ptep_get_ptd(pte_p)->ptd_info[ARM_PT_DESC_INDEX(pte_p)].wiredcnt); /* Bump the wired count to keep the PTE page from being reclaimed. We need this because * we may drop the PVH and pmap locks later in pmap_enter() if we need to allocate * a new PV entry. */ @@ -6318,7 +6551,7 @@ Pmap_enter_loop: * was dropped, so clear any cache attributes we may have previously set * in the PTE template. */ pte &= ~(ARM_PTE_ATTRINDXMASK | ARM_PTE_SHMASK); - pte |= wimg_to_pte(wimg_bits); + pte |= pmap_get_pt_ops(pmap)->wimg_to_pte(wimg_bits); @@ -6417,7 +6650,7 @@ Pmap_enter_loop: wimg_bits = (wimg_bits & (~VM_WIMG_MASK)) | (flags & (VM_WIMG_MASK | VM_WIMG_USE_DEFAULT)); } - pte |= wimg_to_pte(wimg_bits); + pte |= pmap_get_pt_ops(pmap)->wimg_to_pte(wimg_bits); pmap_enter_pte(pmap, pte_p, pte, v); } @@ -6511,8 +6744,19 @@ pmap_change_wiring_internal( pte_p = pmap_pte(pmap, v); assert(pte_p != PT_ENTRY_NULL); pa = pte_to_pa(*pte_p); - if (pa_valid(pa)) { + + while (pa_valid(pa)) { + pmap_paddr_t new_pa; + LOCK_PVH((int)pa_index(pa)); + new_pa = pte_to_pa(*pte_p); + + if (pa == new_pa) { + break; + } + + UNLOCK_PVH((int)pa_index(pa)); + pa = new_pa; } if (wired && !pte_is_wired(*pte_p)) { @@ -6631,7 +6875,7 @@ pmap_vtophys( ppn = (ppnum_t) atop(pte_to_pa(*pte_p) | (va & ARM_PGMASK)); #if DEVELOPMENT || DEBUG if (ppn != 0 && - ARM_PTE_IS_COMPRESSED(*pte_p)) { + ARM_PTE_IS_COMPRESSED(*pte_p, pte_p)) { panic("pmap_vtophys(%p,0x%llx): compressed pte_p=%p 0x%llx with ppn=0x%x\n", pmap, va, pte_p, (uint64_t) (*pte_p), ppn); } @@ -6650,13 +6894,10 @@ pmap_vtophys( tt_entry_t tte; ppnum_t ppn = 0; + __unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap); + /* Level 0 currently unused */ -#if __ARM64_TWO_LEVEL_PMAP__ - /* We have no L1 entry; go straight to the L2 entry */ - ttp = pmap_tt2e(pmap, va); - tte = *ttp; -#else /* Get first-level (1GB) entry */ ttp = pmap_tt1e(pmap, va); tte = *ttp; @@ -6664,8 +6905,8 @@ pmap_vtophys( return ppn; } - tte = ((tt_entry_t*) phystokv(tte & ARM_TTE_TABLE_MASK))[tt2_index(pmap, va)]; -#endif + tte = ((tt_entry_t*) phystokv(tte & ARM_TTE_TABLE_MASK))[tt2_index(pmap, pt_attr, va)]; + if ((tte & ARM_TTE_VALID) != (ARM_TTE_VALID)) { return ppn; } @@ -6674,7 +6915,7 @@ pmap_vtophys( ppn = (ppnum_t) atop((tte & ARM_TTE_BLOCK_L2_MASK) | (va & ARM_TT_L2_OFFMASK)); return ppn; } - tte = ((tt_entry_t*) phystokv(tte & ARM_TTE_TABLE_MASK))[tt3_index(pmap, va)]; + tte = ((tt_entry_t*) phystokv(tte & ARM_TTE_TABLE_MASK))[tt3_index(pmap, pt_attr, va)]; ppn = (ppnum_t) atop((tte & ARM_PTE_MASK) | (va & ARM_TT_L3_OFFMASK)); #endif @@ -6744,7 +6985,8 @@ pmap_init_pte_page( pt_entry_t *pte_p, vm_offset_t va, unsigned int ttlevel, - boolean_t alloc_ptd) + boolean_t alloc_ptd, + boolean_t clear) { pt_desc_t *ptdp = NULL; vm_offset_t *pvh; @@ -6769,10 +7011,12 @@ pmap_init_pte_page( panic("pmap_init_pte_page(): invalid PVH type for pte_p %p", pte_p); } - bzero(pte_p, ARM_PGBYTES); - // below barrier ensures the page zeroing is visible to PTW before - // it is linked to the PTE of previous level - __builtin_arm_dmb(DMB_ISHST); + if (clear) { + bzero(pte_p, ARM_PGBYTES); + // below barrier ensures the page zeroing is visible to PTW before + // it is linked to the PTE of previous level + __builtin_arm_dmb(DMB_ISHST); + } ptd_init(ptdp, pmap, va, ttlevel, pte_p); } @@ -6794,14 +7038,15 @@ pmap_expand( unsigned int options, unsigned int level) { + __unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap); + #if (__ARM_VMSA__ == 7) vm_offset_t pa; tt_entry_t *tte_p; tt_entry_t *tt_p; unsigned int i; - - while (tte_index(pmap, v) >= pmap->tte_index_max) { + while (tte_index(pmap, pt_attr, v) >= pmap->tte_index_max) { tte_p = pmap_tt1_allocate(pmap, 2 * ARM_PGBYTES, ((options & PMAP_OPTIONS_NOWAIT)? PMAP_TT_ALLOCATE_NOWAIT : 0)); if (tte_p == (tt_entry_t *)0) { return KERN_RESOURCE_SHORTAGE; @@ -6822,24 +7067,43 @@ pmap_expand( tte_p[i] = ARM_TTE_TYPE_FAULT; } - pmap->prev_tte = pmap->tte; + FLUSH_PTE_RANGE(tte_p, tte_p + (2 * NTTES)); // DMB + + /* Order is important here, so that pmap_switch_user_ttb() sees things + * in the correct sequence. + * --update of pmap->tte[p] must happen prior to updating pmap->tte_index_max, + * separated by at least a DMB, so that context switch does not see a 1 GB + * L1 table with a 2GB size. + * --update of pmap->tte[p] must also happen prior to setting pmap->prev_tte, + * separated by at least a DMB, so that context switch does not see an L1 + * table to be freed without also seeing its replacement.*/ + + tt_entry_t *prev_tte = pmap->tte; + pmap->tte = tte_p; pmap->ttep = ml_static_vtop((vm_offset_t)pmap->tte); - FLUSH_PTE_RANGE(pmap->tte, pmap->tte + (2 * NTTES)); + __builtin_arm_dmb(DMB_ISH); pmap->tte_index_max = 2 * NTTES; - pmap->stamp = hw_atomic_add(&pmap_stamp, 1); + pmap->stamp = os_atomic_inc(&pmap_stamp, relaxed); for (i = 0; i < NTTES; i++) { - pmap->prev_tte[i] = ARM_TTE_TYPE_FAULT; + prev_tte[i] = ARM_TTE_TYPE_FAULT; } - FLUSH_PTE_RANGE(pmap->prev_tte, pmap->prev_tte + NTTES); + /* We need a strong flush here because a TLB flush will be + * issued from pmap_switch_user_ttb() as soon as this pmap + * is no longer active on any CPU. We need to ensure all + * prior stores to the TTE region have retired before that. */ + FLUSH_PTE_RANGE_STRONG(prev_tte, prev_tte + NTTES); // DSB + pmap->prev_tte = prev_tte; pmap_simple_unlock(&pmap->tt1_lock); PMAP_UNLOCK(pmap); - pmap_set_pmap(pmap, current_thread()); + if (current_pmap() == pmap) { + pmap_set_pmap(pmap, current_thread()); + } } if (level == 1) { @@ -6896,11 +7160,8 @@ pmap_expand( if (pmap_pte(pmap, v) == PT_ENTRY_NULL) { tt_entry_t *tte_next_p; - pmap_init_pte_page(pmap, (pt_entry_t *) tt_p, v, PMAP_TT_L2_LEVEL, FALSE); + pmap_init_pte_page(pmap, (pt_entry_t *) tt_p, v, PMAP_TT_L2_LEVEL, FALSE, TRUE); pa = kvtophys((vm_offset_t)tt_p); -#ifndef __ARM_L1_PTW__ - CleanPoU_DcacheRegion((vm_offset_t) phystokv(pa), PAGE_SIZE); -#endif tte_p = &pmap->tte[ttenum(v)]; for (i = 0, tte_next_p = tte_p; i < 4; i++) { *tte_next_p = pa_to_tte(pa) | ARM_TTE_TYPE_TABLE; @@ -6923,13 +7184,7 @@ pmap_expand( return KERN_SUCCESS; #else pmap_paddr_t pa; -#if __ARM64_TWO_LEVEL_PMAP__ - /* If we are using a two level page table, we'll start at L2. */ - unsigned int ttlevel = 2; -#else - /* Otherwise, we start at L1 (we use 3 levels by default). */ - unsigned int ttlevel = 1; -#endif + unsigned int ttlevel = pt_attr_root_level(pt_attr); tt_entry_t *tte_p; tt_entry_t *tt_p; @@ -6939,50 +7194,24 @@ pmap_expand( for (; ttlevel < level; ttlevel++) { PMAP_LOCK(pmap); - if (ttlevel == 1) { - if ((pmap_tt2e(pmap, v) == PT_ENTRY_NULL)) { - PMAP_UNLOCK(pmap); - while (pmap_tt_allocate(pmap, &tt_p, PMAP_TT_L2_LEVEL, ((options & PMAP_TT_ALLOCATE_NOWAIT)? PMAP_PAGES_ALLOCATE_NOWAIT : 0)) != KERN_SUCCESS) { - if (options & PMAP_OPTIONS_NOWAIT) { - return KERN_RESOURCE_SHORTAGE; - } - VM_PAGE_WAIT(); - } - PMAP_LOCK(pmap); - if ((pmap_tt2e(pmap, v) == PT_ENTRY_NULL)) { - pmap_init_pte_page(pmap, (pt_entry_t *) tt_p, v, PMAP_TT_L2_LEVEL, FALSE); - pa = kvtophys((vm_offset_t)tt_p); - tte_p = pmap_tt1e( pmap, v); - *tte_p = (pa & ARM_TTE_TABLE_MASK) | ARM_TTE_TYPE_TABLE | ARM_TTE_VALID; - PMAP_TRACE(3, PMAP_CODE(PMAP__TTE), VM_KERNEL_ADDRHIDE(pmap), VM_KERNEL_ADDRHIDE(v & ~ARM_TT_L1_OFFMASK), - VM_KERNEL_ADDRHIDE((v & ~ARM_TT_L1_OFFMASK) + ARM_TT_L1_SIZE), *tte_p); - pa = 0x0ULL; - tt_p = (tt_entry_t *)NULL; - if ((pmap == kernel_pmap) && (VM_MIN_KERNEL_ADDRESS < 0x00000000FFFFFFFFULL)) { - current_pmap()->tte[v >> ARM_TT_L1_SHIFT] = kernel_pmap->tte[v >> ARM_TT_L1_SHIFT]; - } + if (pmap_ttne(pmap, ttlevel + 1, v) == PT_ENTRY_NULL) { + PMAP_UNLOCK(pmap); + while (pmap_tt_allocate(pmap, &tt_p, ttlevel + 1, ((options & PMAP_TT_ALLOCATE_NOWAIT)? PMAP_PAGES_ALLOCATE_NOWAIT : 0)) != KERN_SUCCESS) { + if (options & PMAP_OPTIONS_NOWAIT) { + return KERN_RESOURCE_SHORTAGE; } + VM_PAGE_WAIT(); } - } else if (ttlevel == 2) { - if (pmap_tt3e(pmap, v) == PT_ENTRY_NULL) { - PMAP_UNLOCK(pmap); - while (pmap_tt_allocate(pmap, &tt_p, PMAP_TT_L3_LEVEL, ((options & PMAP_TT_ALLOCATE_NOWAIT)? PMAP_PAGES_ALLOCATE_NOWAIT : 0)) != KERN_SUCCESS) { - if (options & PMAP_OPTIONS_NOWAIT) { - return KERN_RESOURCE_SHORTAGE; - } - VM_PAGE_WAIT(); - } - PMAP_LOCK(pmap); - if ((pmap_tt3e(pmap, v) == PT_ENTRY_NULL)) { - pmap_init_pte_page(pmap, (pt_entry_t *) tt_p, v, PMAP_TT_L3_LEVEL, FALSE); - pa = kvtophys((vm_offset_t)tt_p); - tte_p = pmap_tt2e( pmap, v); - *tte_p = (pa & ARM_TTE_TABLE_MASK) | ARM_TTE_TYPE_TABLE | ARM_TTE_VALID; - PMAP_TRACE(3, PMAP_CODE(PMAP__TTE), VM_KERNEL_ADDRHIDE(pmap), VM_KERNEL_ADDRHIDE(v & ~ARM_TT_L2_OFFMASK), - VM_KERNEL_ADDRHIDE((v & ~ARM_TT_L2_OFFMASK) + ARM_TT_L2_SIZE), *tte_p); - pa = 0x0ULL; - tt_p = (tt_entry_t *)NULL; - } + PMAP_LOCK(pmap); + if ((pmap_ttne(pmap, ttlevel + 1, v) == PT_ENTRY_NULL)) { + pmap_init_pte_page(pmap, (pt_entry_t *) tt_p, v, ttlevel + 1, FALSE, TRUE); + pa = kvtophys((vm_offset_t)tt_p); + tte_p = pmap_ttne(pmap, ttlevel, v); + *tte_p = (pa & ARM_TTE_TABLE_MASK) | ARM_TTE_TYPE_TABLE | ARM_TTE_VALID; + PMAP_TRACE(ttlevel + 1, PMAP_CODE(PMAP__TTE), VM_KERNEL_ADDRHIDE(pmap), VM_KERNEL_ADDRHIDE(v & ~pt_attr_ln_offmask(pt_attr, ttlevel)), + VM_KERNEL_ADDRHIDE((v & ~pt_attr_ln_offmask(pt_attr, ttlevel)) + pt_attr_ln_size(pt_attr, ttlevel)), *tte_p); + pa = 0x0ULL; + tt_p = (tt_entry_t *)NULL; } } @@ -7598,21 +7827,29 @@ pmap_switch_user_ttb_internal( if ((cpu_data_ptr->cpu_user_pmap != PMAP_NULL) && (cpu_data_ptr->cpu_user_pmap != kernel_pmap)) { unsigned int c; + tt_entry_t *tt_entry = cpu_data_ptr->cpu_user_pmap->prev_tte; - c = hw_atomic_sub((volatile uint32_t *)&cpu_data_ptr->cpu_user_pmap->cpu_ref, 1); - if ((c == 0) && (cpu_data_ptr->cpu_user_pmap->prev_tte != 0)) { + c = os_atomic_dec(&cpu_data_ptr->cpu_user_pmap->cpu_ref, acq_rel); + if ((c == 0) && (tt_entry != NULL)) { /* We saved off the old 1-page tt1 in pmap_expand() in case other cores were still using it. * Now that the user pmap's cpu_ref is 0, we should be able to safely free it.*/ - tt_entry_t *tt_entry; - tt_entry = cpu_data_ptr->cpu_user_pmap->prev_tte; - cpu_data_ptr->cpu_user_pmap->prev_tte = (tt_entry_t *) NULL; + cpu_data_ptr->cpu_user_pmap->prev_tte = NULL; +#if !__ARM_USER_PROTECT__ + set_mmu_ttb(kernel_pmap->ttep); + set_context_id(kernel_pmap->hw_asid); +#endif + /* Now that we can guarantee the old 1-page L1 table is no longer active on any CPU, + * flush any cached intermediate translations that may point to it. Note that to be truly + * safe from prefetch-related issues, this table PA must have been cleared from TTBR0 prior + * to this call. __ARM_USER_PROTECT__ effectively guarantees that for all current configurations.*/ + flush_mmu_tlb_asid(cpu_data_ptr->cpu_user_pmap->hw_asid); pmap_tt1_deallocate(cpu_data_ptr->cpu_user_pmap, tt_entry, ARM_PGBYTES, PMAP_TT_DEALLOCATE_NOBLOCK); } } cpu_data_ptr->cpu_user_pmap = pmap; cpu_data_ptr->cpu_user_pmap_stamp = pmap->stamp; - (void) hw_atomic_add((volatile uint32_t *)&pmap->cpu_ref, 1); + os_atomic_inc(&pmap->cpu_ref, acq_rel); #if MACH_ASSERT && __ARM_USER_PROTECT__ { @@ -7646,7 +7883,7 @@ pmap_switch_user_ttb_internal( } #if !__ARM_USER_PROTECT__ - set_context_id(pmap->asid); + set_context_id(pmap->hw_asid); #endif #else /* (__ARM_VMSA__ == 7) */ @@ -7658,16 +7895,33 @@ pmap_switch_user_ttb_internal( if (pmap == kernel_pmap) { pmap_clear_user_ttb_internal(); } else { - set_mmu_ttb((pmap->ttep & TTBR_BADDR_MASK) | (((uint64_t)pmap->asid) << TTBR_ASID_SHIFT)); + set_mmu_ttb((pmap->ttep & TTBR_BADDR_MASK) | (((uint64_t)pmap->hw_asid) << TTBR_ASID_SHIFT)); } -#endif + +#if defined(HAS_APPLE_PAC) && (__APCFG_SUPPORTED__ || __APSTS_SUPPORTED__) + if (!(BootArgs->bootFlags & kBootFlagsDisableJOP) && !(BootArgs->bootFlags & kBootFlagsDisableUserJOP)) { + uint64_t sctlr = __builtin_arm_rsr64("SCTLR_EL1"); + bool jop_enabled = sctlr & SCTLR_JOP_KEYS_ENABLED; + if (!jop_enabled && !pmap->disable_jop) { + // turn on JOP + sctlr |= SCTLR_JOP_KEYS_ENABLED; + __builtin_arm_wsr64("SCTLR_EL1", sctlr); + // no ISB necessary because this won't take effect until eret returns to EL0 + } else if (jop_enabled && pmap->disable_jop) { + // turn off JOP + sctlr &= ~SCTLR_JOP_KEYS_ENABLED; + __builtin_arm_wsr64("SCTLR_EL1", sctlr); + } + } +#endif /* HAS_APPLE_PAC && (__APCFG_SUPPORTED__ || __APSTS_SUPPORTED__) */ +#endif /* (__ARM_VMSA__ == 7) */ } void pmap_switch_user_ttb( pmap_t pmap) { - PMAP_TRACE(1, PMAP_CODE(PMAP__SWITCH_USER_TTB) | DBG_FUNC_START, VM_KERNEL_ADDRHIDE(pmap), pmap->vasid, pmap->asid); + PMAP_TRACE(1, PMAP_CODE(PMAP__SWITCH_USER_TTB) | DBG_FUNC_START, VM_KERNEL_ADDRHIDE(pmap), PMAP_VASID(pmap), pmap->hw_asid); pmap_switch_user_ttb_internal(pmap); PMAP_TRACE(1, PMAP_CODE(PMAP__SWITCH_USER_TTB) | DBG_FUNC_END); } @@ -7702,16 +7956,16 @@ arm_force_fast_fault_internal( vm_prot_t allow_mode, int options) { - pmap_paddr_t phys = ptoa(ppnum); - pv_entry_t *pve_p; - pt_entry_t *pte_p; - int pai; - boolean_t result; - pv_entry_t **pv_h; - boolean_t is_reusable, is_internal; - boolean_t tlb_flush_needed = FALSE; - boolean_t ref_fault; - boolean_t mod_fault; + pmap_paddr_t phys = ptoa(ppnum); + pv_entry_t *pve_p; + pt_entry_t *pte_p; + int pai; + boolean_t result; + pv_entry_t **pv_h; + boolean_t is_reusable, is_internal; + boolean_t tlb_flush_needed = FALSE; + boolean_t ref_fault; + boolean_t mod_fault; assert(ppnum != vm_page_fictitious_addr); @@ -7759,7 +8013,7 @@ arm_force_fast_fault_internal( if (*pte_p == ARM_PTE_EMPTY) { panic("pte is NULL: pte_p=%p ppnum=0x%x\n", pte_p, ppnum); } - if (ARM_PTE_IS_COMPRESSED(*pte_p)) { + if (ARM_PTE_IS_COMPRESSED(*pte_p, pte_p)) { panic("pte is COMPRESSED: pte_p=%p ppnum=0x%x\n", pte_p, ppnum); } @@ -7794,7 +8048,7 @@ arm_force_fast_fault_internal( } } else { if ((tmplate & ARM_PTE_APMASK) == ARM_PTE_AP(AP_RWRW)) { - tmplate = ((tmplate & ~ARM_PTE_APMASK) | ARM_PTE_AP(AP_RORO)); + tmplate = ((tmplate & ~ARM_PTE_APMASK) | pt_attr_leaf_ro(pmap_get_pt_attr(pmap))); pte_set_was_writeable(tmplate, true); update_pte = TRUE; mod_fault = TRUE; @@ -7805,9 +8059,9 @@ arm_force_fast_fault_internal( if (update_pte) { if (*pte_p != ARM_PTE_TYPE_FAULT && - !ARM_PTE_IS_COMPRESSED(*pte_p)) { + !ARM_PTE_IS_COMPRESSED(*pte_p, pte_p)) { WRITE_PTE_STRONG(pte_p, tmplate); - flush_mmu_tlb_region_asid_async(va, PAGE_SIZE, pmap); + pmap_get_pt_ops(pmap)->flush_tlb_region_async(va, PAGE_SIZE, pmap); tlb_flush_needed = TRUE; } else { WRITE_PTE(pte_p, tmplate); @@ -7990,7 +8244,7 @@ arm_clear_fast_fault( if (pmap == kernel_pmap) { tmplate = ((spte & ~ARM_PTE_APMASK) | ARM_PTE_AP(AP_RWNA)); } else { - tmplate = ((spte & ~ARM_PTE_APMASK) | ARM_PTE_AP(AP_RWRW)); + tmplate = ((spte & ~ARM_PTE_APMASK) | pt_attr_leaf_rw(pmap_get_pt_attr(pmap))); } } @@ -8010,7 +8264,7 @@ arm_clear_fast_fault( if (spte != tmplate) { if (spte != ARM_PTE_TYPE_FAULT) { WRITE_PTE_STRONG(pte_p, tmplate); - flush_mmu_tlb_region_asid_async(va, PAGE_SIZE, pmap); + pmap_get_pt_ops(pmap)->flush_tlb_region_async(va, PAGE_SIZE, pmap); tlb_flush_needed = TRUE; } else { WRITE_PTE(pte_p, tmplate); @@ -8052,14 +8306,14 @@ arm_fast_fault_internal( pmap_t pmap, vm_map_address_t va, vm_prot_t fault_type, - __unused boolean_t from_user) + __unused bool was_af_fault, + __unused bool from_user) { kern_return_t result = KERN_FAILURE; pt_entry_t *ptep; pt_entry_t spte = ARM_PTE_TYPE_FAULT; int pai; pmap_paddr_t pa; - VALIDATE_PMAP(pmap); PMAP_LOCK(pmap); @@ -8071,22 +8325,25 @@ arm_fast_fault_internal( ptep = pmap_pte(pmap, va); if (ptep != PT_ENTRY_NULL) { - spte = *ptep; + while (true) { + spte = *ptep; - pa = pte_to_pa(spte); + pa = pte_to_pa(spte); - if ((spte == ARM_PTE_TYPE_FAULT) || - ARM_PTE_IS_COMPRESSED(spte)) { - PMAP_UNLOCK(pmap); - return result; - } + if ((spte == ARM_PTE_TYPE_FAULT) || + ARM_PTE_IS_COMPRESSED(spte, ptep)) { + PMAP_UNLOCK(pmap); + return result; + } - if (!pa_valid(pa)) { - PMAP_UNLOCK(pmap); - return result; + if (!pa_valid(pa)) { + PMAP_UNLOCK(pmap); + return result; + } + pai = (int)pa_index(pa); + LOCK_PVH(pai); + break; } - pai = (int)pa_index(pa); - LOCK_PVH(pai); } else { PMAP_UNLOCK(pmap); return result; @@ -8132,7 +8389,8 @@ arm_fast_fault( pmap_t pmap, vm_map_address_t va, vm_prot_t fault_type, - __unused boolean_t from_user) + bool was_af_fault, + __unused bool from_user) { kern_return_t result = KERN_FAILURE; @@ -8163,7 +8421,7 @@ arm_fast_fault( } #endif - result = arm_fast_fault_internal(pmap, va, fault_type, from_user); + result = arm_fast_fault_internal(pmap, va, fault_type, was_af_fault, from_user); #if (__ARM_VMSA__ == 7) done: @@ -8260,7 +8518,7 @@ pmap_map_globals( #endif *ptep = pte; FLUSH_PTE_RANGE(ptep, (ptep + 1)); - PMAP_UPDATE_TLBS(kernel_pmap, LOWGLOBAL_ALIAS, LOWGLOBAL_ALIAS + PAGE_SIZE); + PMAP_UPDATE_TLBS(kernel_pmap, LOWGLOBAL_ALIAS, LOWGLOBAL_ALIAS + PAGE_SIZE, false); } vm_offset_t @@ -8279,16 +8537,19 @@ pmap_map_cpu_windows_copy_internal( unsigned int wimg_bits) { pt_entry_t *ptep = NULL, pte; + pmap_cpu_data_t *pmap_cpu_data = pmap_get_cpu_data(); unsigned int cpu_num; unsigned int i; vm_offset_t cpu_copywindow_vaddr = 0; + bool need_strong_sync = false; - cpu_num = pmap_get_cpu_data()->cpu_number; + + cpu_num = pmap_cpu_data->cpu_number; for (i = 0; i < CPUWINDOWS_MAX; i++) { cpu_copywindow_vaddr = pmap_cpu_windows_copy_addr(cpu_num, i); ptep = pmap_pte(kernel_pmap, cpu_copywindow_vaddr); - assert(!ARM_PTE_IS_COMPRESSED(*ptep)); + assert(!ARM_PTE_IS_COMPRESSED(*ptep, ptep)); if (*ptep == ARM_PTE_TYPE_FAULT) { break; } @@ -8316,7 +8577,8 @@ pmap_map_cpu_windows_copy_internal( * in pmap_unmap_cpu_windows_copy() after clearing the pte and before tlb invalidate. */ FLUSH_PTE_STRONG(ptep); - PMAP_UPDATE_TLBS(kernel_pmap, cpu_copywindow_vaddr, cpu_copywindow_vaddr + PAGE_SIZE); + PMAP_UPDATE_TLBS(kernel_pmap, cpu_copywindow_vaddr, cpu_copywindow_vaddr + PAGE_SIZE, pmap_cpu_data->copywindow_strong_sync[i]); + pmap_cpu_data->copywindow_strong_sync[i] = need_strong_sync; return i; } @@ -8337,8 +8599,9 @@ pmap_unmap_cpu_windows_copy_internal( pt_entry_t *ptep; unsigned int cpu_num; vm_offset_t cpu_copywindow_vaddr = 0; + pmap_cpu_data_t *pmap_cpu_data = pmap_get_cpu_data(); - cpu_num = pmap_get_cpu_data()->cpu_number; + cpu_num = pmap_cpu_data->cpu_number; cpu_copywindow_vaddr = pmap_cpu_windows_copy_addr(cpu_num, index); /* Issue full-system DSB to ensure prior operations on the per-CPU window @@ -8347,7 +8610,7 @@ pmap_unmap_cpu_windows_copy_internal( __builtin_arm_dsb(DSB_SY); ptep = pmap_pte(kernel_pmap, cpu_copywindow_vaddr); WRITE_PTE_STRONG(ptep, ARM_PTE_TYPE_FAULT); - PMAP_UPDATE_TLBS(kernel_pmap, cpu_copywindow_vaddr, cpu_copywindow_vaddr + PAGE_SIZE); + PMAP_UPDATE_TLBS(kernel_pmap, cpu_copywindow_vaddr, cpu_copywindow_vaddr + PAGE_SIZE, pmap_cpu_data->copywindow_strong_sync[index]); } void @@ -8400,6 +8663,7 @@ pmap_trim_range( addr64_t adjust_offmask; tt_entry_t * tte_p; pt_entry_t * pte_p; + __unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap); if (__improbable(end < start)) { panic("%s: invalid address range, " @@ -8419,19 +8683,13 @@ pmap_trim_range( } /* Contract the range to TT page boundaries. */ -#if (__ARM_VMSA__ > 7) - adjust_offmask = ARM_TT_TWIG_OFFMASK; -#else /* (__ARM_VMSA__ > 7) */ - adjust_offmask = ((ARM_TT_TWIG_SIZE * 4) - 1); -#endif /* (__ARM_VMSA__ > 7) */ - + adjust_offmask = pt_attr_leaf_table_offmask(pt_attr); adjusted_start = ((start + adjust_offmask) & ~adjust_offmask); adjusted_end = end & ~adjust_offmask; + bool modified = false; /* Iterate over the range, trying to remove TTEs. */ - for (cur = adjusted_start; (cur < adjusted_end) && (cur >= adjusted_start); cur += ARM_TT_TWIG_SIZE) { - bool modified = false; - + for (cur = adjusted_start; (cur < adjusted_end) && (cur >= adjusted_start); cur += pt_attr_twig_size(pt_attr)) { PMAP_LOCK(pmap); tte_p = pmap_tte(pmap, cur); @@ -8443,43 +8701,27 @@ pmap_trim_range( if ((*tte_p & ARM_TTE_TYPE_MASK) == ARM_TTE_TYPE_TABLE) { pte_p = (pt_entry_t *) ttetokv(*tte_p); -#if (__ARM_VMSA__ == 7) - if ((ptep_get_ptd(pte_p)->pt_cnt[ARM_PT_DESC_INDEX(pte_p)].refcnt == 0) && - (pmap != kernel_pmap)) { - if (pmap->nested == TRUE) { - /* Deallocate for the nested map. */ - pmap_tte_deallocate(pmap, tte_p, PMAP_TT_L1_LEVEL); - } else { - /* Just remove for the parent map. */ - pmap_tte_remove(pmap, tte_p, PMAP_TT_L1_LEVEL); - } - - flush_mmu_tlb_entry((cur & ~ARM_TT_L1_OFFMASK) | (pmap->asid & 0xff)); - modified = true; - } -#else - if ((ptep_get_ptd(pte_p)->pt_cnt[ARM_PT_DESC_INDEX(pte_p)].refcnt == 0) && + if ((ptep_get_ptd(pte_p)->ptd_info[ARM_PT_DESC_INDEX(pte_p)].refcnt == 0) && (pmap != kernel_pmap)) { if (pmap->nested == TRUE) { /* Deallocate for the nested map. */ - pmap_tte_deallocate(pmap, tte_p, PMAP_TT_L2_LEVEL); + pmap_tte_deallocate(pmap, tte_p, pt_attr_twig_level(pt_attr)); } else { /* Just remove for the parent map. */ - pmap_tte_remove(pmap, tte_p, PMAP_TT_L2_LEVEL); + pmap_tte_remove(pmap, tte_p, pt_attr_twig_level(pt_attr)); } - flush_mmu_tlb_entry(tlbi_addr(cur & ~ARM_TT_L2_OFFMASK) | tlbi_asid(pmap->asid)); + pmap_get_pt_ops(pmap)->flush_tlb_tte_async(cur, pmap); modified = true; } -#endif } done: PMAP_UNLOCK(pmap); + } - if (modified) { - PMAP_UPDATE_TLBS(pmap, cur, cur + PAGE_SIZE); - } + if (modified) { + sync_tlb_flush(); } #if (__ARM_VMSA__ > 7) @@ -8525,7 +8767,7 @@ done: if (remove_tt1e) { pmap_tte_deallocate(pmap, tt1e_p, PMAP_TT_L1_LEVEL); - PMAP_UPDATE_TLBS(pmap, cur, cur + PAGE_SIZE); + PMAP_UPDATE_TLBS(pmap, cur, cur + PAGE_SIZE, false); } PMAP_UNLOCK(pmap); @@ -8571,6 +8813,8 @@ pmap_trim_internal( VALIDATE_PMAP(grand); VALIDATE_PMAP(subord); + __unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(grand); + PMAP_LOCK(subord); if (!subord->nested) { @@ -8621,11 +8865,7 @@ pmap_trim_internal( } if ((!subord->nested_bounds_set) && size) { -#if (__ARM_VMSA__ > 7) - adjust_offmask = ARM_TT_TWIG_OFFMASK; -#else /* (__ARM_VMSA__ > 7) */ - adjust_offmask = ((ARM_TT_TWIG_SIZE * 4) - 1); -#endif /* (__ARM_VMSA__ > 7) */ + adjust_offmask = pt_attr_leaf_table_offmask(pt_attr); subord->nested_region_true_start = nstart; subord->nested_region_true_end = nend; @@ -8729,6 +8969,7 @@ pmap_trim( pmap_trim_internal(grand, subord, vstart, nstart, size); } + /* * kern_return_t pmap_nest(grand, subord, vstart, size) * @@ -8767,19 +9008,17 @@ pmap_nest_internal( if (__improbable(os_add_overflow(nstart, size, &nend))) { panic("%s: %p nested addr wraps around: 0x%llx + 0x%llx", __func__, subord, nstart, size); } + VALIDATE_PMAP(grand); VALIDATE_PMAP(subord); + __unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(grand); + assert(pmap_get_pt_attr(subord) == pt_attr); -#if (__ARM_VMSA__ == 7) - if (((size | vstart | nstart) & ARM_TT_L1_PT_OFFMASK) != 0x0ULL) { - return KERN_INVALID_VALUE; /* Nest 4MB region */ - } -#else - if (((size | vstart | nstart) & (ARM_TT_L2_OFFMASK)) != 0x0ULL) { + + if (((size | vstart | nstart) & (pt_attr_leaf_table_offmask(pt_attr))) != 0x0ULL) { panic("pmap_nest() pmap %p unaligned nesting request 0x%llx, 0x%llx, 0x%llx\n", grand, vstart, nstart, size); } -#endif if (!subord->nested) { panic("%s: subordinate pmap %p is not nestable", __func__, subord); @@ -8790,7 +9029,7 @@ pmap_nest_internal( } if (subord->nested_region_asid_bitmap == NULL) { - nested_region_asid_bitmap_size = (unsigned int)(size >> ARM_TT_TWIG_SHIFT) / (sizeof(unsigned int) * NBBY); + nested_region_asid_bitmap_size = (unsigned int)(size >> pt_attr_twig_shift(pt_attr)) / (sizeof(unsigned int) * NBBY); nested_region_asid_bitmap = kalloc(nested_region_asid_bitmap_size * sizeof(unsigned int)); bzero(nested_region_asid_bitmap, nested_region_asid_bitmap_size * sizeof(unsigned int)); @@ -8818,7 +9057,7 @@ pmap_nest_internal( new_size = nend - subord->nested_region_subord_addr; /* We explicitly add 1 to the bitmap allocation size in order to avoid issues with truncation. */ - new_nested_region_asid_bitmap_size = (unsigned int)((new_size >> ARM_TT_TWIG_SHIFT) / (sizeof(unsigned int) * NBBY)) + 1; + new_nested_region_asid_bitmap_size = (unsigned int)((new_size >> pt_attr_twig_shift(pt_attr)) / (sizeof(unsigned int) * NBBY)) + 1; new_nested_region_asid_bitmap = kalloc(new_nested_region_asid_bitmap_size * sizeof(unsigned int)); PMAP_LOCK(subord); @@ -8909,17 +9148,17 @@ expand_next: #else nvaddr = (vm_map_offset_t) nstart; - num_tte = (unsigned int)(size >> ARM_TT_L2_SHIFT); + num_tte = (unsigned int)(size >> pt_attr_twig_shift(pt_attr)); for (i = 0; i < num_tte; i++) { if (((subord->nested_region_true_start) > nvaddr) || ((subord->nested_region_true_end) <= nvaddr)) { goto expand_next; } - stte_p = pmap_tt2e(subord, nvaddr); + stte_p = pmap_tte(subord, nvaddr); if (stte_p == PT_ENTRY_NULL || *stte_p == ARM_TTE_EMPTY) { PMAP_UNLOCK(subord); - kr = pmap_expand(subord, nvaddr, expand_options, PMAP_TT_L3_LEVEL); + kr = pmap_expand(subord, nvaddr, expand_options, PMAP_TT_LEAF_LEVEL); if (kr != KERN_SUCCESS) { PMAP_LOCK(grand); @@ -8929,7 +9168,7 @@ expand_next: PMAP_LOCK(subord); } expand_next: - nvaddr += ARM_TT_L2_SIZE; + nvaddr += pt_attr_twig_size(pt_attr); } #endif PMAP_UNLOCK(subord); @@ -8963,11 +9202,11 @@ nest_next: goto nest_next; } - stte_p = pmap_tt2e(subord, nvaddr); - gtte_p = pmap_tt2e(grand, vaddr); + stte_p = pmap_tte(subord, nvaddr); + gtte_p = pmap_tte(grand, vaddr); if (gtte_p == PT_ENTRY_NULL) { PMAP_UNLOCK(grand); - kr = pmap_expand(grand, vaddr, expand_options, PMAP_TT_L2_LEVEL); + kr = pmap_expand(grand, vaddr, expand_options, PMAP_TT_TWIG_LEVEL); PMAP_LOCK(grand); if (kr != KERN_SUCCESS) { @@ -8979,8 +9218,8 @@ nest_next: *gtte_p = *stte_p; nest_next: - vaddr += ARM_TT_L2_SIZE; - nvaddr += ARM_TT_L2_SIZE; + vaddr += pt_attr_twig_size(pt_attr); + nvaddr += pt_attr_twig_size(pt_attr); } #endif @@ -8996,7 +9235,7 @@ done: */ assert((size & 0xFFFFFFFF00000000ULL) == 0); #endif - PMAP_UPDATE_TLBS(grand, vstart, vend); + PMAP_UPDATE_TLBS(grand, vstart, vend, false); PMAP_UNLOCK(grand); return kr; @@ -9064,15 +9303,11 @@ pmap_unnest_options_internal( VALIDATE_PMAP(grand); -#if (__ARM_VMSA__ == 7) - if (((size | vaddr) & ARM_TT_L1_PT_OFFMASK) != 0x0ULL) { - panic("pmap_unnest(): unaligned request\n"); - } -#else - if (((size | vaddr) & ARM_TT_L2_OFFMASK) != 0x0ULL) { - panic("pmap_unnest(): unaligned request\n"); + __unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(grand); + + if (((size | vaddr) & pt_attr_twig_offmask(pt_attr)) != 0x0ULL) { + panic("pmap_unnest(): unaligned request"); } -#endif if ((option & PMAP_UNNEST_CLEAN) == 0) { if (grand->nested_pmap == NULL) { @@ -9086,11 +9321,11 @@ pmap_unnest_options_internal( PMAP_LOCK(grand->nested_pmap); start = vaddr - grand->nested_region_grand_addr + grand->nested_region_subord_addr; - start_index = (unsigned int)((vaddr - grand->nested_region_grand_addr) >> ARM_TT_TWIG_SHIFT); - max_index = (unsigned int)(start_index + (size >> ARM_TT_TWIG_SHIFT)); - num_tte = (unsigned int)(size >> ARM_TT_TWIG_SHIFT); + start_index = (unsigned int)((vaddr - grand->nested_region_grand_addr) >> pt_attr_twig_shift(pt_attr)); + max_index = (unsigned int)(start_index + (size >> pt_attr_twig_shift(pt_attr))); + num_tte = (unsigned int)(size >> pt_attr_twig_shift(pt_attr)); - for (current_index = start_index, addr = start; current_index < max_index; current_index++, addr += ARM_TT_TWIG_SIZE) { + for (current_index = start_index, addr = start; current_index < max_index; current_index++, addr += pt_attr_twig_size(pt_attr)) { pt_entry_t *bpte, *epte, *cpte; if (addr < grand->nested_pmap->nested_region_true_start) { @@ -9104,7 +9339,7 @@ pmap_unnest_options_internal( } bpte = pmap_pte(grand->nested_pmap, addr); - epte = bpte + (ARM_TT_LEAF_INDEX_MASK >> ARM_TT_LEAF_SHIFT); + epte = bpte + (pt_attr_leaf_index_mask(pt_attr) >> pt_attr_leaf_shift(pt_attr)); if (!testbit(current_index, (int *)grand->nested_pmap->nested_region_asid_bitmap)) { setbit(current_index, (int *)grand->nested_pmap->nested_region_asid_bitmap); @@ -9116,7 +9351,7 @@ pmap_unnest_options_internal( pt_entry_t spte; if ((*cpte != ARM_PTE_TYPE_FAULT) - && (!ARM_PTE_IS_COMPRESSED(*cpte))) { + && (!ARM_PTE_IS_COMPRESSED(*cpte, cpte))) { spte = *cpte; while (!managed) { pa = pte_to_pa(spte); @@ -9163,9 +9398,9 @@ pmap_unnest_options_internal( start = vaddr; addr = vaddr; - num_tte = (unsigned int)(size >> ARM_TT_TWIG_SHIFT); + num_tte = (unsigned int)(size >> pt_attr_twig_shift(pt_attr)); - for (i = 0; i < num_tte; i++, addr += ARM_TT_TWIG_SIZE) { + for (i = 0; i < num_tte; i++, addr += pt_attr_twig_size(pt_attr)) { if (addr < grand->nested_pmap->nested_region_true_start) { /* We haven't reached the interesting range. */ continue; @@ -9182,7 +9417,7 @@ pmap_unnest_options_internal( tte_p = pmap_tte(grand, start); FLUSH_PTE_RANGE_STRONG(tte_p, tte_p + num_tte); - PMAP_UPDATE_TLBS(grand, start, vend); + PMAP_UPDATE_TLBS(grand, start, vend, false); PMAP_UNLOCK(grand); @@ -9267,8 +9502,14 @@ pt_fake_zone_info( * an ARM small page (4K). */ -#define ARM_FULL_TLB_FLUSH_THRESHOLD 64 +#define ARM_FULL_TLB_FLUSH_THRESHOLD 64 + +#if __ARM_RANGE_TLBI__ +#define ARM64_RANGE_TLB_FLUSH_THRESHOLD 1 +#define ARM64_FULL_TLB_FLUSH_THRESHOLD ARM64_16K_TLB_RANGE_PAGES +#else #define ARM64_FULL_TLB_FLUSH_THRESHOLD 256 +#endif // __ARM_RANGE_TLBI__ static void flush_mmu_tlb_region_asid_async( @@ -9280,7 +9521,7 @@ flush_mmu_tlb_region_asid_async( vm_offset_t end = va + length; uint32_t asid; - asid = pmap->asid; + asid = pmap->hw_asid; if (length / ARM_SMALL_PAGE_SIZE > ARM_FULL_TLB_FLUSH_THRESHOLD) { boolean_t flush_all = FALSE; @@ -9312,12 +9553,12 @@ flush_mmu_tlb_region_asid_async( flush_mmu_tlb_entries_async(va, end); #else - vm_offset_t end = va + length; - uint32_t asid; + unsigned npages = length >> pt_attr_leaf_shift(pmap_get_pt_attr(pmap)); + uint32_t asid; - asid = pmap->asid; + asid = pmap->hw_asid; - if ((length >> ARM_TT_L3_SHIFT) > ARM64_FULL_TLB_FLUSH_THRESHOLD) { + if (npages > ARM64_FULL_TLB_FLUSH_THRESHOLD) { boolean_t flush_all = FALSE; if ((asid == 0) || (pmap->nested == TRUE)) { @@ -9330,8 +9571,19 @@ flush_mmu_tlb_region_asid_async( } return; } +#if __ARM_RANGE_TLBI__ + if (npages > ARM64_RANGE_TLB_FLUSH_THRESHOLD) { + va = generate_rtlbi_param(npages, asid, va); + if (pmap->nested == TRUE) { + flush_mmu_tlb_allrange_async(va); + } else { + flush_mmu_tlb_range_async(va); + } + return; + } +#endif + vm_offset_t end = tlbi_asid(asid) | tlbi_addr(va + length); va = tlbi_asid(asid) | tlbi_addr(va); - end = tlbi_asid(asid) | tlbi_addr(end); if (pmap->nested == TRUE) { flush_mmu_tlb_allentries_async(va, end); } else { @@ -9341,6 +9593,29 @@ flush_mmu_tlb_region_asid_async( #endif } +MARK_AS_PMAP_TEXT static void +flush_mmu_tlb_tte_asid_async(vm_offset_t va, pmap_t pmap) +{ +#if (__ARM_VMSA__ == 7) + flush_mmu_tlb_entry_async((va & ~ARM_TT_L1_PT_OFFMASK) | (pmap->hw_asid & 0xff)); + flush_mmu_tlb_entry_async(((va & ~ARM_TT_L1_PT_OFFMASK) + ARM_TT_L1_SIZE) | (pmap->hw_asid & 0xff)); + flush_mmu_tlb_entry_async(((va & ~ARM_TT_L1_PT_OFFMASK) + 2 * ARM_TT_L1_SIZE) | (pmap->hw_asid & 0xff)); + flush_mmu_tlb_entry_async(((va & ~ARM_TT_L1_PT_OFFMASK) + 3 * ARM_TT_L1_SIZE) | (pmap->hw_asid & 0xff)); +#else + flush_mmu_tlb_entry_async(tlbi_addr(va & ~pt_attr_twig_offmask(pmap_get_pt_attr(pmap))) | tlbi_asid(pmap->hw_asid)); +#endif +} + +MARK_AS_PMAP_TEXT static void +flush_mmu_tlb_full_asid_async(pmap_t pmap) +{ +#if (__ARM_VMSA__ == 7) + flush_mmu_tlb_asid_async(pmap->hw_asid); +#else /* (__ARM_VMSA__ == 7) */ + flush_mmu_tlb_asid_async((uint64_t)(pmap->hw_asid) << TLBI_ASID_SHIFT); +#endif /* (__ARM_VMSA__ == 7) */ +} + void flush_mmu_tlb_region( vm_offset_t va, @@ -9350,18 +9625,21 @@ flush_mmu_tlb_region( sync_tlb_flush(); } -static unsigned int +static pmap_io_range_t* pmap_find_io_attr(pmap_paddr_t paddr) { - pmap_io_range_t find_range = {.addr = paddr, .len = PAGE_SIZE}; + pmap_io_range_t find_range = {.addr = paddr & ~PAGE_MASK, .len = PAGE_SIZE}; unsigned int begin = 0, end = num_io_rgns - 1; - assert(num_io_rgns > 0); + if ((num_io_rgns == 0) || (paddr < io_attr_table[begin].addr) || + (paddr >= (io_attr_table[end].addr + io_attr_table[end].len))) { + return NULL; + } for (;;) { unsigned int middle = (begin + end) / 2; int cmp = cmp_io_rgns(&find_range, &io_attr_table[middle]); if (cmp == 0) { - return io_attr_table[middle].wimg; + return &io_attr_table[middle]; } else if (begin == end) { break; } else if (cmp > 0) { @@ -9370,9 +9648,8 @@ pmap_find_io_attr(pmap_paddr_t paddr) end = middle; } } - ; - return VM_WIMG_IO; + return NULL; } unsigned int @@ -9386,21 +9663,11 @@ pmap_cache_attributes( paddr = ptoa(pn); - if ((paddr >= io_rgn_start) && (paddr < io_rgn_end)) { - return pmap_find_io_attr(paddr); - } - - if (!pmap_initialized) { - if ((paddr >= gPhysBase) && (paddr < gPhysBase + gPhysSize)) { - return VM_WIMG_DEFAULT; - } else { - return VM_WIMG_IO; - } - } - + assert(vm_last_phys > vm_first_phys); // Check that pmap has been bootstrapped if (!pa_valid(paddr)) { - return VM_WIMG_IO; + pmap_io_range_t *io_rgn = pmap_find_io_attr(paddr); + return (io_rgn == NULL) ? VM_WIMG_IO : io_rgn->wimg; } result = VM_WIMG_DEFAULT; @@ -9572,7 +9839,7 @@ pmap_batch_set_cache_attributes_internal( } return TRUE; -}; +} boolean_t pmap_batch_set_cache_attributes( @@ -9656,7 +9923,7 @@ pmap_set_cache_attributes( pmap_set_cache_attributes_internal(pn, cacheattr); } -void +MARK_AS_PMAP_TEXT void pmap_update_cache_attributes_locked( ppnum_t ppnum, unsigned attributes) @@ -9669,6 +9936,8 @@ pmap_update_cache_attributes_locked( unsigned int pai; boolean_t tlb_flush_needed = FALSE; + PMAP_TRACE(2, PMAP_CODE(PMAP__UPDATE_CACHING) | DBG_FUNC_START, ppnum, attributes); + #if __ARM_PTE_PHYSMAP__ vm_offset_t kva = phystokv(phys); pte_p = pmap_pte(kernel_pmap, kva); @@ -9717,10 +9986,10 @@ pmap_update_cache_attributes_locked( tmplate = *pte_p; tmplate &= ~(ARM_PTE_ATTRINDXMASK | ARM_PTE_SHMASK); - tmplate |= wimg_to_pte(attributes); + tmplate |= pmap_get_pt_ops(pmap)->wimg_to_pte(attributes); WRITE_PTE_STRONG(pte_p, tmplate); - flush_mmu_tlb_region_asid_async(va, PAGE_SIZE, pmap); + pmap_get_pt_ops(pmap)->flush_tlb_region_async(va, PAGE_SIZE, pmap); tlb_flush_needed = TRUE; #ifdef PVH_FLAG_IOMMU @@ -9734,6 +10003,8 @@ cache_skip_pve: if (tlb_flush_needed) { sync_tlb_flush(); } + + PMAP_TRACE(2, PMAP_CODE(PMAP__UPDATE_CACHING) | DBG_FUNC_END, ppnum, attributes); } #if (__ARM_VMSA__ == 7) @@ -9822,7 +10093,7 @@ pmap_create_sharedpage( * Note that we update parameters of the entry for our unique needs (NG * entry, etc.). */ - sharedpage_pmap = pmap_create(NULL, 0x0, FALSE); + sharedpage_pmap = pmap_create_options(NULL, 0x0, 0); assert(sharedpage_pmap != NULL); /* The user 64-bit mapping... */ @@ -9843,7 +10114,7 @@ pmap_create_sharedpage( * Asserts to ensure that the TTEs we nest to map the shared page do not overlap * with user controlled TTEs. */ -#if (ARM_PGSHIFT == 14) || __ARM64_TWO_LEVEL_PMAP__ +#if (ARM_PGSHIFT == 14) static_assert((_COMM_PAGE64_BASE_ADDRESS & ~ARM_TT_L2_OFFMASK) >= MACH_VM_MAX_ADDRESS); static_assert((_COMM_PAGE32_BASE_ADDRESS & ~ARM_TT_L2_OFFMASK) >= VM_MAX_ADDRESS); #elif (ARM_PGSHIFT == 12) @@ -9884,9 +10155,6 @@ pmap_insert_sharedpage_internal( * order to nest. */ #if (ARM_PGSHIFT == 12) -#if __ARM64_TWO_LEVEL_PMAP__ -#error A two level page table with a page shift of 12 is not currently supported -#endif (void)options; /* Just slam in the L1 entry. */ @@ -9898,7 +10166,6 @@ pmap_insert_sharedpage_internal( src_ttep = pmap_tt1e(sharedpage_pmap, sharedpage_vaddr); #elif (ARM_PGSHIFT == 14) -#if !__ARM64_TWO_LEVEL_PMAP__ /* Allocate for the L2 entry if necessary, and slam it into place. */ /* * As long as we are use a three level page table, the first level @@ -9917,7 +10184,6 @@ pmap_insert_sharedpage_internal( PMAP_LOCK(pmap); } -#endif ttep = pmap_tt2e(pmap, sharedpage_vaddr); @@ -9934,10 +10200,10 @@ pmap_insert_sharedpage_internal( /* TODO: Should we flush in the 64-bit case? */ flush_mmu_tlb_region_asid_async(sharedpage_vaddr, PAGE_SIZE, kernel_pmap); -#if (ARM_PGSHIFT == 12) && !__ARM64_TWO_LEVEL_PMAP__ - flush_mmu_tlb_entry_async(tlbi_addr(sharedpage_vaddr & ~ARM_TT_L1_OFFMASK) | tlbi_asid(pmap->asid)); +#if (ARM_PGSHIFT == 12) + flush_mmu_tlb_entry_async(tlbi_addr(sharedpage_vaddr & ~ARM_TT_L1_OFFMASK) | tlbi_asid(pmap->hw_asid)); #elif (ARM_PGSHIFT == 14) - flush_mmu_tlb_entry_async(tlbi_addr(sharedpage_vaddr & ~ARM_TT_L2_OFFMASK) | tlbi_asid(pmap->asid)); + flush_mmu_tlb_entry_async(tlbi_addr(sharedpage_vaddr & ~ARM_TT_L2_OFFMASK) | tlbi_asid(pmap->hw_asid)); #endif sync_tlb_flush(); @@ -9964,9 +10230,6 @@ pmap_unmap_sharedpage( } #if (ARM_PGSHIFT == 12) -#if __ARM64_TWO_LEVEL_PMAP__ -#error A two level page table with a page shift of 12 is not currently supported -#endif ttep = pmap_tt1e(pmap, sharedpage_vaddr); if (ttep == NULL) { @@ -9994,12 +10257,9 @@ pmap_unmap_sharedpage( flush_mmu_tlb_region_asid_async(sharedpage_vaddr, PAGE_SIZE, kernel_pmap); #if (ARM_PGSHIFT == 12) -#if __ARM64_TWO_LEVEL_PMAP__ -#error A two level page table with a page shift of 12 is not currently supported -#endif - flush_mmu_tlb_entry_async(tlbi_addr(sharedpage_vaddr & ~ARM_TT_L1_OFFMASK) | tlbi_asid(pmap->asid)); + flush_mmu_tlb_entry_async(tlbi_addr(sharedpage_vaddr & ~ARM_TT_L1_OFFMASK) | tlbi_asid(pmap->hw_asid)); #elif (ARM_PGSHIFT == 14) - flush_mmu_tlb_entry_async(tlbi_addr(sharedpage_vaddr & ~ARM_TT_L2_OFFMASK) | tlbi_asid(pmap->asid)); + flush_mmu_tlb_entry_async(tlbi_addr(sharedpage_vaddr & ~ARM_TT_L2_OFFMASK) | tlbi_asid(pmap->hw_asid)); #endif sync_tlb_flush(); } @@ -10045,69 +10305,44 @@ pmap_is_empty_internal( VALIDATE_PMAP(pmap); - if ((pmap != kernel_pmap) && (not_in_kdp)) { + __unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap); + unsigned int initial_not_in_kdp = not_in_kdp; + + if ((pmap != kernel_pmap) && (initial_not_in_kdp)) { PMAP_LOCK(pmap); } #if (__ARM_VMSA__ == 7) - if (tte_index(pmap, va_end) >= pmap->tte_index_max) { - if ((pmap != kernel_pmap) && (not_in_kdp)) { + if (tte_index(pmap, pt_attr, va_end) >= pmap->tte_index_max) { + if ((pmap != kernel_pmap) && (initial_not_in_kdp)) { PMAP_UNLOCK(pmap); } return TRUE; } +#endif - block_start = va_start; - tte_p = pmap_tte(pmap, block_start); - while (block_start < va_end) { - block_end = (block_start + ARM_TT_L1_SIZE) & ~(ARM_TT_L1_OFFMASK); - if (block_end > va_end) { - block_end = va_end; - } - - if ((*tte_p & ARM_TTE_TYPE_MASK) != 0) { - vm_map_offset_t offset; - ppnum_t phys_page = 0; - - for (offset = block_start; - offset < block_end; - offset += ARM_PGBYTES) { - // This does a pmap_find_phys() lookup but assumes lock is held - phys_page = pmap_vtophys(pmap, offset); - if (phys_page) { - if ((pmap != kernel_pmap) && (not_in_kdp)) { - PMAP_UNLOCK(pmap); - } - return FALSE; - } - } - } - - block_start = block_end; - tte_p++; - } -#else + /* TODO: This will be faster if we increment ttep at each level. */ block_start = va_start; while (block_start < va_end) { pt_entry_t *bpte_p, *epte_p; pt_entry_t *pte_p; - block_end = (block_start + ARM_TT_L2_SIZE) & ~ARM_TT_L2_OFFMASK; + block_end = (block_start + pt_attr_twig_size(pt_attr)) & ~pt_attr_twig_offmask(pt_attr); if (block_end > va_end) { block_end = va_end; } - tte_p = pmap_tt2e(pmap, block_start); + tte_p = pmap_tte(pmap, block_start); if ((tte_p != PT_ENTRY_NULL) && ((*tte_p & ARM_TTE_TYPE_MASK) == ARM_TTE_TYPE_TABLE)) { pte_p = (pt_entry_t *) ttetokv(*tte_p); - bpte_p = &pte_p[tt3_index(pmap, block_start)]; - epte_p = bpte_p + (((block_end - block_start) & ARM_TT_L3_INDEX_MASK) >> ARM_TT_L3_SHIFT); + bpte_p = &pte_p[pte_index(pmap, pt_attr, block_start)]; + epte_p = &pte_p[pte_index(pmap, pt_attr, block_end)]; for (pte_p = bpte_p; pte_p < epte_p; pte_p++) { if (*pte_p != ARM_PTE_EMPTY) { - if ((pmap != kernel_pmap) && (not_in_kdp)) { + if ((pmap != kernel_pmap) && (initial_not_in_kdp)) { PMAP_UNLOCK(pmap); } return FALSE; @@ -10116,9 +10351,8 @@ pmap_is_empty_internal( } block_start = block_end; } -#endif - if ((pmap != kernel_pmap) && (not_in_kdp)) { + if ((pmap != kernel_pmap) && (initial_not_in_kdp)) { PMAP_UNLOCK(pmap); } @@ -10308,18 +10542,13 @@ pmap_query_resident_internal( return PMAP_RESIDENT_INVALID; } if ((*tte_p & ARM_TTE_TYPE_MASK) == ARM_TTE_TYPE_TABLE) { -#if (__ARM_VMSA__ == 7) - pte_p = (pt_entry_t *) ttetokv(*tte_p); - bpte = &pte_p[ptenum(start)]; - epte = bpte + atop(end - start); -#else + __unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap); pte_p = (pt_entry_t *) ttetokv(*tte_p); - bpte = &pte_p[tt3_index(pmap, start)]; - epte = bpte + ((end - start) >> ARM_TT_L3_SHIFT); -#endif + bpte = &pte_p[pte_index(pmap, pt_attr, start)]; + epte = &pte_p[pte_index(pmap, pt_attr, end)]; for (; bpte < epte; bpte++) { - if (ARM_PTE_IS_COMPRESSED(*bpte)) { + if (ARM_PTE_IS_COMPRESSED(*bpte, bpte)) { compressed_bytes += ARM_PGBYTES; } else if (pa_valid(pte_to_pa(*bpte))) { resident_bytes += ARM_PGBYTES; @@ -10356,6 +10585,8 @@ pmap_query_resident( return 0; } + __unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap); + total_resident_bytes = 0; compressed_bytes = 0; @@ -10368,7 +10599,7 @@ pmap_query_resident( vm_map_address_t l; mach_vm_size_t resident_bytes; - l = ((va + ARM_TT_TWIG_SIZE) & ~ARM_TT_TWIG_OFFMASK); + l = ((va + pt_attr_twig_size(pt_attr)) & ~pt_attr_twig_offmask(pt_attr)); if (l > end) { l = end; @@ -10398,10 +10629,8 @@ static void pmap_check_ledgers( pmap_t pmap) { - ledger_amount_t bal; - int pid; - char *procname; - boolean_t do_panic; + int pid; + char *procname; if (pmap->pmap_pid == 0) { /* @@ -10419,73 +10648,10 @@ pmap_check_ledgers( return; } - do_panic = FALSE; pid = pmap->pmap_pid; procname = pmap->pmap_procname; - pmap_ledgers_drift.num_pmaps_checked++; - -#define LEDGER_CHECK_BALANCE(__LEDGER) \ -MACRO_BEGIN \ - int panic_on_negative = TRUE; \ - ledger_get_balance(pmap->ledger, \ - task_ledgers.__LEDGER, \ - &bal); \ - ledger_get_panic_on_negative(pmap->ledger, \ - task_ledgers.__LEDGER, \ - &panic_on_negative); \ - if (bal != 0) { \ - if (panic_on_negative || \ - (pmap_ledgers_panic && \ - pmap_ledgers_panic_leeway > 0 && \ - (bal > (pmap_ledgers_panic_leeway * PAGE_SIZE) || \ - bal < (-pmap_ledgers_panic_leeway * PAGE_SIZE)))) { \ - do_panic = TRUE; \ - } \ - printf("LEDGER BALANCE proc %d (%s) " \ - "\"%s\" = %lld\n", \ - pid, procname, #__LEDGER, bal); \ - if (bal > 0) { \ - pmap_ledgers_drift.__LEDGER##_over++; \ - pmap_ledgers_drift.__LEDGER##_over_total += bal; \ - if (bal > pmap_ledgers_drift.__LEDGER##_over_max) { \ - pmap_ledgers_drift.__LEDGER##_over_max = bal; \ - } \ - } else if (bal < 0) { \ - pmap_ledgers_drift.__LEDGER##_under++; \ - pmap_ledgers_drift.__LEDGER##_under_total += bal; \ - if (bal < pmap_ledgers_drift.__LEDGER##_under_max) { \ - pmap_ledgers_drift.__LEDGER##_under_max = bal; \ - } \ - } \ - } \ -MACRO_END - - LEDGER_CHECK_BALANCE(phys_footprint); - LEDGER_CHECK_BALANCE(internal); - LEDGER_CHECK_BALANCE(internal_compressed); - LEDGER_CHECK_BALANCE(iokit_mapped); - LEDGER_CHECK_BALANCE(alternate_accounting); - LEDGER_CHECK_BALANCE(alternate_accounting_compressed); - LEDGER_CHECK_BALANCE(page_table); - LEDGER_CHECK_BALANCE(purgeable_volatile); - LEDGER_CHECK_BALANCE(purgeable_nonvolatile); - LEDGER_CHECK_BALANCE(purgeable_volatile_compressed); - LEDGER_CHECK_BALANCE(purgeable_nonvolatile_compressed); - LEDGER_CHECK_BALANCE(network_volatile); - LEDGER_CHECK_BALANCE(network_nonvolatile); - LEDGER_CHECK_BALANCE(network_volatile_compressed); - LEDGER_CHECK_BALANCE(network_nonvolatile_compressed); - - if (do_panic) { - if (pmap_ledgers_panic) { - panic("pmap_destroy(%p) %d[%s] has imbalanced ledgers\n", - pmap, pid, procname); - } else { - printf("pmap_destroy(%p) %d[%s] has imbalanced ledgers\n", - pmap, pid, procname); - } - } + vm_map_pmap_check_ledgers(pmap, pmap->ledger, pid, procname); PMAP_STATS_ASSERTF(pmap->stats.resident_count == 0, pmap, "stats.resident_count %d", pmap->stats.resident_count); #if 00 @@ -10708,7 +10874,7 @@ pmap_pgtrace_enter_clone(pmap_t pmap, vm_map_offset_t va_page, vm_map_offset_t s } else { PGTRACE_WRITE_PTE(cptep, *ptep); } - PMAP_UPDATE_TLBS(kernel_pmap, map->cva[i], map->cva[i] + ARM_PGBYTES); + PMAP_UPDATE_TLBS(kernel_pmap, map->cva[i], map->cva[i] + ARM_PGBYTES, false); } // get ptes for original and clone @@ -10717,7 +10883,7 @@ pmap_pgtrace_enter_clone(pmap_t pmap, vm_map_offset_t va_page, vm_map_offset_t s // invalidate original pte and mark it as a pgtrace page PGTRACE_WRITE_PTE(ptep, (*ptep | ARM_PTE_PGTRACE) & ~ARM_PTE_TYPE_VALID); - PMAP_UPDATE_TLBS(pmap, map->ova, map->ova + ARM_PGBYTES); + PMAP_UPDATE_TLBS(pmap, map->ova, map->ova + ARM_PGBYTES, false); map->cloned = true; p->state = DEFINED; @@ -10766,14 +10932,14 @@ pmap_pgtrace_remove_clone(pmap_t pmap, pmap_paddr_t pa, vm_map_offset_t va) ptep = pmap_pte(pmap, map->ova); assert(ptep); PGTRACE_WRITE_PTE(ptep, *ptep | ARM_PTE_TYPE_VALID); - PMAP_UPDATE_TLBS(pmap, va, va + ARM_PGBYTES); + PMAP_UPDATE_TLBS(pmap, va, va + ARM_PGBYTES, false); // revert clone pages for (int i = 0; i < 3; i++) { ptep = pmap_pte(kernel_pmap, map->cva[i]); assert(ptep != NULL); PGTRACE_WRITE_PTE(ptep, map->cva_spte[i]); - PMAP_UPDATE_TLBS(kernel_pmap, map->cva[i], map->cva[i] + ARM_PGBYTES); + PMAP_UPDATE_TLBS(kernel_pmap, map->cva[i], map->cva[i] + ARM_PGBYTES, false); } } @@ -10828,14 +10994,14 @@ pmap_pgtrace_remove_all_clone(pmap_paddr_t pa) ptep = pmap_pte(map->pmap, map->ova); assert(ptep); PGTRACE_WRITE_PTE(ptep, *ptep | ARM_PTE_TYPE_VALID); - PMAP_UPDATE_TLBS(map->pmap, map->ova, map->ova + ARM_PGBYTES); + PMAP_UPDATE_TLBS(map->pmap, map->ova, map->ova + ARM_PGBYTES, false); // revert clone ptes for (int i = 0; i < 3; i++) { ptep = pmap_pte(kernel_pmap, map->cva[i]); assert(ptep != NULL); PGTRACE_WRITE_PTE(ptep, map->cva_spte[i]); - PMAP_UPDATE_TLBS(kernel_pmap, map->cva[i], map->cva[i] + ARM_PGBYTES); + PMAP_UPDATE_TLBS(kernel_pmap, map->cva[i], map->cva[i] + ARM_PGBYTES, false); } PMAP_UNLOCK(map->pmap); @@ -10894,6 +11060,7 @@ pmap_pgtrace_clone_from_pa(pmap_t pmap, pmap_paddr_t pa, vm_map_offset_t start_o pt_entry_t *ptep; tt_entry_t *ttep; tt_entry_t tte; + __unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap); pmap_pgtrace_get_search_space(pmap, &min, &max); @@ -10912,12 +11079,6 @@ pmap_pgtrace_clone_from_pa(pmap_t pmap, pmap_paddr_t pa, vm_map_offset_t start_o goto unlock_continue; } -#if __ARM64_TWO_LEVEL_PMAP__ - // check whether we can skip l2 - ttep = pmap_tt2e(pmap, cur_page); - assert(ttep); - tte = *ttep; -#else // check whether we can skip l1 ttep = pmap_tt1e(pmap, cur_page); assert(ttep); @@ -10928,15 +11089,15 @@ pmap_pgtrace_clone_from_pa(pmap_t pmap, pmap_paddr_t pa, vm_map_offset_t start_o } // how about l2 - tte = ((tt_entry_t*) phystokv(tte & ARM_TTE_TABLE_MASK))[tt2_index(pmap, cur_page)]; -#endif + tte = ((tt_entry_t*) phystokv(tte & ARM_TTE_TABLE_MASK))[tt2_index(pmap, pt_attr, cur_page)]; + if ((tte & (ARM_TTE_TYPE_MASK | ARM_TTE_VALID)) != (ARM_TTE_TYPE_TABLE | ARM_TTE_VALID)) { add = ARM_TT_L2_SIZE; goto unlock_continue; } // ptep finally - ptep = &(((pt_entry_t*) phystokv(tte & ARM_TTE_TABLE_MASK))[tt3_index(pmap, cur_page)]); + ptep = &(((pt_entry_t*) phystokv(tte & ARM_TTE_TABLE_MASK))[tt3_index(pmap, pt_attr, cur_page)]); if (ptep == PT_ENTRY_NULL) { add = ARM_TT_L3_SIZE; goto unlock_continue; @@ -11382,7 +11543,7 @@ pmap_pgtrace_fault(pmap_t pmap, vm_map_offset_t va, arm_saved_state_t *ss) } else if ((*ptep & ARM_PTE_TYPE_VALID) == ARM_PTE_TYPE_VALID) { // Somehow this cpu's tlb has not updated kprintf("%s Somehow this cpu's tlb has not updated?\n", __func__); - PMAP_UPDATE_TLBS(pmap, va, va + ARM_PGBYTES); + PMAP_UPDATE_TLBS(pmap, va, va + ARM_PGBYTES, false); PMAP_PGTRACE_UNLOCK(&ints); return KERN_SUCCESS; @@ -11437,7 +11598,7 @@ pmap_pgtrace_fault(pmap_t pmap, vm_map_offset_t va, arm_saved_state_t *ss) PMAP_PGTRACE_UNLOCK(&ints); // Return to next instruction - set_saved_state_pc(ss, get_saved_state_pc(ss) + sizeof(uint32_t)); + add_saved_state_pc(ss, sizeof(uint32_t)); return KERN_SUCCESS; } @@ -11502,7 +11663,7 @@ pmap_query_page_info_internal( pa = pte_to_pa(*pte); if (pa == 0) { - if (ARM_PTE_IS_COMPRESSED(*pte)) { + if (ARM_PTE_IS_COMPRESSED(*pte, pte)) { disp |= PMAP_QUERY_PAGE_COMPRESSED; if (*pte & ARM_PTE_COMPRESSED_ALT) { disp |= PMAP_QUERY_PAGE_COMPRESSED_ALTACCT; @@ -11566,6 +11727,7 @@ pmap_return(boolean_t do_panic, boolean_t do_recurse) + MARK_AS_PMAP_TEXT static void pmap_footprint_suspend_internal( vm_map_t map, @@ -11594,16 +11756,6 @@ pmap_footprint_suspend( #if defined(__arm64__) && (DEVELOPMENT || DEBUG) -struct page_table_level_info { - uint64_t size; - uint64_t offmask; - uint64_t shift; - uint64_t index_mask; - uint64_t valid_mask; - uint64_t type_mask; - uint64_t type_block; -}; - struct page_table_dump_header { uint64_t pa; uint64_t num_entries; @@ -11611,14 +11763,9 @@ struct page_table_dump_header { uint64_t end_va; }; -struct page_table_level_info page_table_levels[] = -{ { ARM_TT_L0_SIZE, ARM_TT_L0_OFFMASK, ARM_TT_L0_SHIFT, ARM_TT_L0_INDEX_MASK, ARM_TTE_VALID, ARM_TTE_TYPE_MASK, ARM_TTE_TYPE_BLOCK }, - { ARM_TT_L1_SIZE, ARM_TT_L1_OFFMASK, ARM_TT_L1_SHIFT, ARM_TT_L1_INDEX_MASK, ARM_TTE_VALID, ARM_TTE_TYPE_MASK, ARM_TTE_TYPE_BLOCK }, - { ARM_TT_L2_SIZE, ARM_TT_L2_OFFMASK, ARM_TT_L2_SHIFT, ARM_TT_L2_INDEX_MASK, ARM_TTE_VALID, ARM_TTE_TYPE_MASK, ARM_TTE_TYPE_BLOCK }, - { ARM_TT_L3_SIZE, ARM_TT_L3_OFFMASK, ARM_TT_L3_SHIFT, ARM_TT_L3_INDEX_MASK, ARM_PTE_TYPE_VALID, ARM_PTE_TYPE_MASK, ARM_TTE_TYPE_L3BLOCK } }; - static size_t -pmap_dump_page_tables_recurse(const tt_entry_t *ttp, +pmap_dump_page_tables_recurse(pmap_t pmap, + const tt_entry_t *ttp, unsigned int cur_level, uint64_t start_va, void *bufp, @@ -11626,10 +11773,12 @@ pmap_dump_page_tables_recurse(const tt_entry_t *ttp, { size_t bytes_used = 0; uint64_t num_entries = ARM_PGBYTES / sizeof(*ttp); - uint64_t size = page_table_levels[cur_level].size; - uint64_t valid_mask = page_table_levels[cur_level].valid_mask; - uint64_t type_mask = page_table_levels[cur_level].type_mask; - uint64_t type_block = page_table_levels[cur_level].type_block; + const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap); + + uint64_t size = pt_attr->pta_level_info[cur_level].size; + uint64_t valid_mask = pt_attr->pta_level_info[cur_level].valid_mask; + uint64_t type_mask = pt_attr->pta_level_info[cur_level].type_mask; + uint64_t type_block = pt_attr->pta_level_info[cur_level].type_block; if (cur_level == arm64_root_pgtable_level) { num_entries = arm64_root_pgtable_num_ttes; @@ -11671,7 +11820,7 @@ pmap_dump_page_tables_recurse(const tt_entry_t *ttp, const tt_entry_t *next_tt = (const tt_entry_t*)phystokv(tte & ARM_TTE_TABLE_MASK); - size_t recurse_result = pmap_dump_page_tables_recurse(next_tt, cur_level + 1, current_va, (uint8_t*)bufp + bytes_used, buf_end); + size_t recurse_result = pmap_dump_page_tables_recurse(pmap, next_tt, cur_level + 1, current_va, (uint8_t*)bufp + bytes_used, buf_end); if (recurse_result == 0) { return 0; @@ -11690,7 +11839,7 @@ pmap_dump_page_tables(pmap_t pmap, void *bufp, void *buf_end) if (not_in_kdp) { panic("pmap_dump_page_tables must only be called from kernel debugger context"); } - return pmap_dump_page_tables_recurse(pmap->tte, arm64_root_pgtable_level, pmap->min, bufp, buf_end); + return pmap_dump_page_tables_recurse(pmap, pmap->tte, arm64_root_pgtable_level, pmap->min, bufp, buf_end); } #else /* defined(__arm64__) && (DEVELOPMENT || DEBUG) */ diff --git a/osfmk/arm/pmap.h b/osfmk/arm/pmap.h index 50464cd10..3d45185eb 100644 --- a/osfmk/arm/pmap.h +++ b/osfmk/arm/pmap.h @@ -1,6 +1,5 @@ /* - * - * Copyright (c) 2007-2016 Apple Inc. All rights reserved. + * Copyright (c) 2007-2019 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -43,26 +42,41 @@ #ifndef ASSEMBLER #include +#include #include #include #include #include #include +#if defined(__arm64__) +#include +#else +#include +#endif + + +#define ASID_SHIFT (11) /* Shift for 2048 max virtual ASIDs (2048 pmaps) */ +#define MAX_ASID (1 << ASID_SHIFT) /* Max supported ASIDs (can be virtual) */ +#ifndef ARM_ASID_SHIFT +#define ARM_ASID_SHIFT (8) /* Shift for the maximum ARM ASID value (256) */ +#endif +#define ARM_MAX_ASID (1 << ARM_ASID_SHIFT) /* Max ASIDs supported by the hardware */ +#define NBBY 8 #if __ARM_KERNEL_PROTECT__ -/* - * For __ARM_KERNEL_PROTECT__, we need twice as many ASIDs to support having - * unique EL0 and EL1 ASIDs for each pmap. - */ -#define ASID_SHIFT (12) /* Shift for the maximum virtual ASID value (2048)*/ -#else /* __ARM_KERNEL_PROTECT__ */ -#define ASID_SHIFT (11) /* Shift for the maximum virtual ASID value (2048) */ -#endif /* __ARM_KERNEL_PROTECT__ */ -#define MAX_ASID (1 << ASID_SHIFT) /* Max supported ASIDs (can be virtual) */ -#define ARM_ASID_SHIFT (8) /* Shift for the maximum ARM ASID value (256) */ -#define ARM_MAX_ASID (1 << ARM_ASID_SHIFT) /* Max ASIDs supported by the hardware */ -#define ASID_VIRT_BITS (ASID_SHIFT - ARM_ASID_SHIFT) /* The number of virtual bits in a virtaul ASID */ -#define NBBY 8 +#define MAX_HW_ASID ((ARM_MAX_ASID >> 1) - 1) +#else +#define MAX_HW_ASID (ARM_MAX_ASID - 1) +#endif + +#ifndef ARM_VMID_SHIFT +#define ARM_VMID_SHIFT (8) +#endif +#define ARM_MAX_VMID (1 << ARM_VMID_SHIFT) + +/* XPRR virtual register map */ + +#define CPUWINDOWS_MAX 4 struct pmap_cpu_data { #if defined(__arm64__) @@ -72,7 +86,9 @@ struct pmap_cpu_data { unsigned int cpu_user_pmap_stamp; #endif unsigned int cpu_number; + bool copywindow_strong_sync[CPUWINDOWS_MAX]; +#if MAX_ASID > MAX_HW_ASID /* * This supports overloading of ARM ASIDs by the pmap. The field needs @@ -85,7 +101,8 @@ struct pmap_cpu_data { * memory by only having enough bits to support MAX_ASID. However, such * an implementation would be more error prone. */ - uint8_t cpu_asid_high_bits[ARM_MAX_ASID]; + uint8_t cpu_asid_high_bits[MAX_HW_ASID]; +#endif }; typedef struct pmap_cpu_data pmap_cpu_data_t; @@ -134,6 +151,16 @@ typedef uint32_t pt_entry_t; /* page #error unknown arch #endif +struct page_table_level_info { + const uint64_t size; + const uint64_t offmask; + const uint64_t shift; + const uint64_t index_mask; + const uint64_t valid_mask; + const uint64_t type_mask; + const uint64_t type_block; +}; + /* superpages */ #define SUPERPAGE_NBASEPAGES 1 /* No superpages support */ @@ -174,37 +201,6 @@ typedef uint32_t pt_entry_t; /* page #define NPTES (ARM_PGBYTES / sizeof(pt_entry_t)) #endif -extern void sync_tlb_flush(void); -extern void flush_mmu_tlb_async(void); -extern void flush_mmu_tlb(void); -extern void flush_core_tlb_async(void); -extern void flush_core_tlb(void); -#if defined(__arm64__) -extern void flush_mmu_tlb_allentries_async(uint64_t, uint64_t); -extern void flush_mmu_tlb_allentries(uint64_t, uint64_t); -extern void flush_mmu_tlb_entry_async(uint64_t); -extern void flush_mmu_tlb_entry(uint64_t); -extern void flush_mmu_tlb_entries_async(uint64_t, uint64_t); -extern void flush_mmu_tlb_entries(uint64_t, uint64_t); -extern void flush_mmu_tlb_asid_async(uint64_t); -extern void flush_mmu_tlb_asid(uint64_t); -extern void flush_core_tlb_asid_async(uint64_t); -extern void flush_core_tlb_asid(uint64_t); - -#define tlbi_addr(x) (((x) >> TLBI_ADDR_SHIFT) & TLBI_ADDR_MASK) -#define tlbi_asid(x) (((uint64_t)x << TLBI_ASID_SHIFT) & TLBI_ASID_MASK) -#else -extern void flush_mmu_tlb_entry_async(uint32_t); -extern void flush_mmu_tlb_entry(uint32_t); -extern void flush_mmu_tlb_entries_async(uint32_t, uint32_t); -extern void flush_mmu_tlb_entries(uint32_t, uint32_t); -extern void flush_mmu_tlb_mva_entries_async(uint32_t); -extern void flush_mmu_tlb_mva_entries(uint32_t); -extern void flush_mmu_tlb_asid_async(uint32_t); -extern void flush_mmu_tlb_asid(uint32_t); -extern void flush_core_tlb_asid_async(uint32_t); -extern void flush_core_tlb_asid(uint32_t); -#endif extern void flush_mmu_tlb_region(vm_offset_t va, unsigned length); #if defined(__arm64__) @@ -291,6 +287,9 @@ extern pmap_paddr_t mmu_uvtop(vm_offset_t va); #endif /* DEVELOPMENT || DEBUG */ +/* Forward struct declarations for the pmap data structure */ +struct page_table_attr; + /* * Convert translation/page table entry to kernel virtual address */ @@ -298,12 +297,15 @@ extern pmap_paddr_t mmu_uvtop(vm_offset_t va); #define ptetokv(a) (phystokv(pte_to_pa(a))) struct pmap { - tt_entry_t *tte; /* translation table entries */ + tt_entry_t *tte; /* translation table entries */ pmap_paddr_t ttep; /* translation table physical */ vm_map_address_t min; /* min address in pmap */ vm_map_address_t max; /* max address in pmap */ +#if ARM_PARAMETERIZED_PMAP + const struct page_table_attr * pmap_pt_attr; /* details about page table layout */ +#endif /* ARM_PARAMETERIZED_PMAP */ ledger_t ledger; /* ledger tracking phys mappings */ - decl_simple_lock_data(, lock) /* lock on map */ + decl_simple_lock_data(, lock); /* lock on map */ struct pmap_statistics stats; /* map statistics */ queue_chain_t pmaps; /* global list of pmaps */ tt_entry_t *tt_entry_free; /* free translation table entries */ @@ -317,19 +319,19 @@ struct pmap { unsigned int *nested_region_asid_bitmap; #if (__ARM_VMSA__ <= 7) - decl_simple_lock_data(, tt1_lock) /* lock on tt1 */ + decl_simple_lock_data(, tt1_lock); /* lock on tt1 */ unsigned int cpu_ref; /* number of cpus using pmap */ + unsigned int tte_index_max; /* max tte index in translation table entries */ #endif - unsigned int asid; /* address space id */ - unsigned int vasid; /* Virtual address space id */ unsigned int stamp; /* creation stamp */ _Atomic int32_t ref_count; /* pmap reference count */ unsigned int gc_status; /* gc status */ unsigned int nested_region_asid_bitmap_size; - unsigned int tte_index_max; /* max tte index in translation table entries */ uint32_t nested_no_bounds_refcnt;/* number of pmaps that nested this pmap without bounds set */ + uint16_t hw_asid; + uint8_t sw_asid; #if MACH_ASSERT int pmap_pid; @@ -340,32 +342,17 @@ struct pmap { bool footprint_suspended; bool footprint_was_suspended; #endif /* DEVELOPMENT || DEBUG */ - bool nx_enabled; /* no execute */ - bool nested; /* is nested */ - bool is_64bit; /* is 64bit */ + bool nx_enabled; /* no execute */ + bool nested; /* is nested */ + bool is_64bit; /* is 64bit */ bool nested_has_no_bounds_ref; /* nested a pmap when the bounds were not set */ bool nested_bounds_set; /* The nesting bounds have been set */ +#if HAS_APPLE_PAC + bool disable_jop; +#endif /* HAS_APPLE_PAC */ }; -/* typedef struct pmap *pmap_t; */ -#define PMAP_NULL ((pmap_t) 0) - - -/* - * WIMG control - */ -#define VM_MEM_INNER 0x10 -#define VM_MEM_RT 0x10 // intentionally alias VM_MEM_INNER; will be used with mutually exclusive caching policies -#define VM_MEM_EARLY_ACK 0x20 - -#define VM_WIMG_DEFAULT (VM_MEM_COHERENT) -#define VM_WIMG_COPYBACK (VM_MEM_COHERENT) -#define VM_WIMG_INNERWBACK (VM_MEM_COHERENT | VM_MEM_INNER) -#define VM_WIMG_IO (VM_MEM_COHERENT | VM_MEM_NOT_CACHEABLE | VM_MEM_GUARDED) -#define VM_WIMG_POSTED (VM_MEM_COHERENT | VM_MEM_NOT_CACHEABLE | VM_MEM_GUARDED | VM_MEM_EARLY_ACK) -#define VM_WIMG_WTHRU (VM_MEM_WRITE_THROUGH | VM_MEM_COHERENT | VM_MEM_GUARDED) -#define VM_WIMG_WCOMB (VM_MEM_NOT_CACHEABLE | VM_MEM_COHERENT) -#define VM_WIMG_RT (VM_WIMG_IO | VM_MEM_RT) +#define PMAP_VASID(pmap) (((uint32_t)((pmap)->sw_asid) << 16) | pmap->hw_asid) #if VM_DEBUG extern int pmap_list_resident_pages( @@ -460,15 +447,17 @@ extern vm_map_address_t pmap_map_high_window_bd( vm_offset_t pa, vm_size_t len, extern kern_return_t pmap_map_block(pmap_t pmap, addr64_t va, ppnum_t pa, uint32_t size, vm_prot_t prot, int attr, unsigned int flags); extern void pmap_map_globals(void); -#define PMAP_MAP_BD_DEVICE 0x1 -#define PMAP_MAP_BD_WCOMB 0x2 -#define PMAP_MAP_BD_POSTED 0x3 -#define PMAP_MAP_BD_MASK 0x3 +#define PMAP_MAP_BD_DEVICE 0x0 +#define PMAP_MAP_BD_WCOMB 0x1 +#define PMAP_MAP_BD_POSTED 0x2 +#define PMAP_MAP_BD_POSTED_REORDERED 0x3 +#define PMAP_MAP_BD_POSTED_COMBINED_REORDERED 0x4 +#define PMAP_MAP_BD_MASK 0x7 extern vm_map_address_t pmap_map_bd_with_options(vm_map_address_t va, vm_offset_t sa, vm_offset_t ea, vm_prot_t prot, int32_t options); extern vm_map_address_t pmap_map_bd(vm_map_address_t va, vm_offset_t sa, vm_offset_t ea, vm_prot_t prot); -extern void pmap_init_pte_page(pmap_t, pt_entry_t *, vm_offset_t, unsigned int ttlevel, boolean_t alloc_ptd); +extern void pmap_init_pte_page(pmap_t, pt_entry_t *, vm_offset_t, unsigned int ttlevel, boolean_t alloc_ptd, boolean_t clear); extern boolean_t pmap_valid_address(pmap_paddr_t addr); extern void pmap_disable_NX(pmap_t pmap); @@ -551,7 +540,9 @@ boolean_t pmap_enforces_execute_only(pmap_t pmap); #define PMAP_LEDGER_ALLOC_INDEX 66 #define PMAP_LEDGER_FREE_INDEX 67 -#define PMAP_COUNT 68 + + +#define PMAP_COUNT 71 #define PMAP_INVALID_CPU_NUM (~0U) diff --git a/osfmk/arm/proc_reg.h b/osfmk/arm/proc_reg.h index 45536d43f..192bc9d69 100644 --- a/osfmk/arm/proc_reg.h +++ b/osfmk/arm/proc_reg.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2007-2016 Apple Inc. All rights reserved. + * Copyright (c) 2007-2018 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -71,132 +71,116 @@ #endif #if defined (ARMA7) -#define __ARM_ARCH__ 7 -#define __ARM_SUB_ARCH__ CPU_ARCH_ARMv7k -#define __ARM_VMSA__ 7 -#define __ARM_VFP__ 3 +#define __ARM_ARCH__ 7 +#define __ARM_SUB_ARCH__ CPU_ARCH_ARMv7k +#define __ARM_VMSA__ 7 +#define __ARM_VFP__ 3 #if defined(__XNU_UP__) -#define __ARM_SMP__ 0 +#define __ARM_SMP__ 0 #else -#define __ARM_SMP__ 1 +#define __ARM_SMP__ 1 /* For SMP kernels, force physical aperture to be mapped at PTE level so that its mappings * can be updated to reflect cache attribute changes on alias mappings. This prevents * prefetched physical aperture cachelines from becoming dirty in L1 due to a write to * an uncached alias mapping on the same core. Subsequent uncached writes from another * core may not snoop this line, and the dirty line may end up being evicted later to * effectively overwrite the uncached writes from other cores. */ -#define __ARM_PTE_PHYSMAP__ 1 +#define __ARM_PTE_PHYSMAP__ 1 #endif /* __ARMA7_SMP__ controls whether we are consistent with the A7 MP_CORE spec; needed because entities other than * the xnu-managed processors may need to snoop our cache operations. */ -#define __ARMA7_SMP__ 1 -#define __ARM_COHERENT_CACHE__ 1 -#define __ARM_L1_PTW__ 1 -#define __ARM_DEBUG__ 7 -#define __ARM_USER_PROTECT__ 1 -#define __ARM_TIME_TIMEBASE_ONLY__ 1 - -#elif defined (APPLECYCLONE) -#define __ARM_ARCH__ 8 -#define __ARM_VMSA__ 8 -#define __ARM_SMP__ 1 -#define __ARM_VFP__ 4 -#define __ARM_COHERENT_CACHE__ 1 -#define __ARM_COHERENT_IO__ 1 -#define __ARM_IC_NOALIAS_ICACHE__ 1 -#define __ARM_L1_PTW__ 1 -#define __ARM_DEBUG__ 7 -#define __ARM_ENABLE_SWAP__ 1 -#define __ARM_V8_CRYPTO_EXTENSIONS__ 1 -#define __ARM64_PMAP_SUBPAGE_L1__ 1 -#define __ARM_KERNEL_PROTECT__ 1 +#define __ARMA7_SMP__ 1 +#define __ARM_COHERENT_CACHE__ 1 +#define __ARM_DEBUG__ 7 +#define __ARM_USER_PROTECT__ 1 +#define __ARM_TIME_TIMEBASE_ONLY__ 1 #elif defined (APPLETYPHOON) -#define __ARM_ARCH__ 8 -#define __ARM_VMSA__ 8 -#define __ARM_SMP__ 1 -#define __ARM_VFP__ 4 -#define __ARM_COHERENT_CACHE__ 1 -#define __ARM_COHERENT_IO__ 1 -#define __ARM_IC_NOALIAS_ICACHE__ 1 -#define __ARM_L1_PTW__ 1 -#define __ARM_DEBUG__ 7 -#define __ARM_ENABLE_SWAP__ 1 +#define __ARM_ARCH__ 8 +#define __ARM_VMSA__ 8 +#define __ARM_SMP__ 1 +#define __ARM_VFP__ 4 +#define __ARM_COHERENT_CACHE__ 1 +#define __ARM_COHERENT_IO__ 1 +#define __ARM_IC_NOALIAS_ICACHE__ 1 +#define __ARM_DEBUG__ 7 +#define __ARM_ENABLE_SWAP__ 1 #define __ARM_V8_CRYPTO_EXTENSIONS__ 1 -#define __ARM64_PMAP_SUBPAGE_L1__ 1 -#define __ARM_KERNEL_PROTECT__ 1 +#define __ARM64_PMAP_SUBPAGE_L1__ 1 +#define __ARM_KERNEL_PROTECT__ 1 #elif defined (APPLETWISTER) -#define __ARM_ARCH__ 8 -#define __ARM_VMSA__ 8 -#define __ARM_SMP__ 1 -#define __ARM_VFP__ 4 -#define __ARM_COHERENT_CACHE__ 1 -#define __ARM_COHERENT_IO__ 1 -#define __ARM_IC_NOALIAS_ICACHE__ 1 -#define __ARM_L1_PTW__ 1 -#define __ARM_DEBUG__ 7 -#define __ARM_ENABLE_SWAP__ 1 +#define __ARM_ARCH__ 8 +#define __ARM_VMSA__ 8 +#define __ARM_SMP__ 1 +#define __ARM_VFP__ 4 +#define __ARM_COHERENT_CACHE__ 1 +#define __ARM_COHERENT_IO__ 1 +#define __ARM_IC_NOALIAS_ICACHE__ 1 +#define __ARM_DEBUG__ 7 +#define __ARM_ENABLE_SWAP__ 1 #define __ARM_V8_CRYPTO_EXTENSIONS__ 1 -#define __ARM_16K_PG__ 1 -#define __ARM64_PMAP_SUBPAGE_L1__ 1 -#define __ARM_KERNEL_PROTECT__ 1 +#define __ARM_16K_PG__ 1 +#define __ARM64_PMAP_SUBPAGE_L1__ 1 +#define __ARM_KERNEL_PROTECT__ 1 #elif defined (APPLEHURRICANE) -#define __ARM_ARCH__ 8 -#define __ARM_VMSA__ 8 -#define __ARM_SMP__ 1 -#define __ARM_VFP__ 4 -#define __ARM_COHERENT_CACHE__ 1 -#define __ARM_COHERENT_IO__ 1 -#define __ARM_IC_NOALIAS_ICACHE__ 1 -#define __ARM_L1_PTW__ 1 -#define __ARM_DEBUG__ 7 -#define __ARM_ENABLE_SWAP__ 1 +#define __ARM_ARCH__ 8 +#define __ARM_VMSA__ 8 +#define __ARM_SMP__ 1 +#define __ARM_VFP__ 4 +#define __ARM_COHERENT_CACHE__ 1 +#define __ARM_COHERENT_IO__ 1 +#define __ARM_IC_NOALIAS_ICACHE__ 1 +#define __ARM_DEBUG__ 7 +#define __ARM_ENABLE_SWAP__ 1 #define __ARM_V8_CRYPTO_EXTENSIONS__ 1 -#define __ARM_16K_PG__ 1 -#define __ARM64_PMAP_SUBPAGE_L1__ 1 -#define __ARM_KERNEL_PROTECT__ 1 -#define __ARM_GLOBAL_SLEEP_BIT__ 1 -#define __ARM_PAN_AVAILABLE__ 1 +#define __ARM_16K_PG__ 1 +#define __ARM64_PMAP_SUBPAGE_L1__ 1 +#define __ARM_KERNEL_PROTECT__ 1 +#define __ARM_GLOBAL_SLEEP_BIT__ 1 +#define __ARM_PAN_AVAILABLE__ 1 #elif defined (APPLEMONSOON) -#define __ARM_ARCH__ 8 -#define __ARM_VMSA__ 8 -#define __ARM_SMP__ 1 -#define __ARM_AMP__ 1 -#define __ARM_VFP__ 4 -#define __ARM_COHERENT_CACHE__ 1 -#define __ARM_COHERENT_IO__ 1 -#define __ARM_IC_NOALIAS_ICACHE__ 1 -#define __ARM_L1_PTW__ 1 -#define __ARM_DEBUG__ 7 -#define __ARM_ENABLE_SWAP__ 1 -#define __ARM_V8_CRYPTO_EXTENSIONS__ 1 -#define __ARM_16K_PG__ 1 -#define __ARM64_PMAP_SUBPAGE_L1__ 1 -#define __ARM_KERNEL_PROTECT__ 1 -#define __ARM_GLOBAL_SLEEP_BIT__ 1 -#define __ARM_PAN_AVAILABLE__ 1 -#define __ARM_WKDM_ISA_AVAILABLE__ 1 -#define __PLATFORM_WKDM_ALIGNMENT_MASK__ (0x3FULL) +#define __ARM_ARCH__ 8 +#define __ARM_VMSA__ 8 +#define __ARM_SMP__ 1 +#define __ARM_AMP__ 1 +#define __ARM_VFP__ 4 +#define __ARM_COHERENT_CACHE__ 1 +#define __ARM_COHERENT_IO__ 1 +#define __ARM_IC_NOALIAS_ICACHE__ 1 +#define __ARM_DEBUG__ 7 +#define __ARM_ENABLE_SWAP__ 1 +#define __ARM_V8_CRYPTO_EXTENSIONS__ 1 +#define __ARM_16K_PG__ 1 +#define __ARM64_PMAP_SUBPAGE_L1__ 1 +#define __ARM_KERNEL_PROTECT__ 1 +#define __ARM_GLOBAL_SLEEP_BIT__ 1 +#define __ARM_PAN_AVAILABLE__ 1 +#define __ARM_WKDM_ISA_AVAILABLE__ 1 +#define __PLATFORM_WKDM_ALIGNMENT_MASK__ (0x3FULL) #define __PLATFORM_WKDM_ALIGNMENT_BOUNDARY__ (64) -#define __ARM_CLUSTER_COUNT__ 2 +#define __ARM_CLUSTER_COUNT__ 2 #elif defined (BCM2837) -#define __ARM_ARCH__ 8 -#define __ARM_VMSA__ 8 -#define __ARM_SMP__ 1 -#define __ARM_VFP__ 4 -#define __ARM_COHERENT_CACHE__ 1 -#define __ARM_L1_PTW__ 1 -#define __ARM_DEBUG__ 7 +#define __ARM_ARCH__ 8 +#define __ARM_VMSA__ 8 +#define __ARM_SMP__ 1 +#define __ARM_VFP__ 4 +#define __ARM_COHERENT_CACHE__ 1 +#define __ARM_DEBUG__ 7 #define __ARM64_PMAP_SUBPAGE_L1__ 1 #else #error processor not supported #endif +#if __ARM_42BIT_PA_SPACE__ +/* For now, force the issue! */ +#undef __ARM64_PMAP_SUBPAGE_L1__ +#endif /* __ARM_42BIT_PA_SPACE__ */ + #if __ARM_KERNEL_PROTECT__ /* * This feature is not currently implemented for 32-bit ARM CPU architectures. @@ -205,25 +189,37 @@ */ #if __arm__ #error __ARM_KERNEL_PROTECT__ is not supported on ARM32 -#endif +#endif /* __arm__ */ #endif /* __ARM_KERNEL_PROTECT__ */ #if defined(ARM_BOARD_WFE_TIMEOUT_NS) #define __ARM_ENABLE_WFE_ 1 -#else +#else /* defined(ARM_BOARD_WFE_TIMEOUT_NS) */ #define __ARM_ENABLE_WFE_ 0 -#endif +#endif /* defined(ARM_BOARD_WFE_TIMEOUT_NS) */ +/* + * The clutch scheduler is enabled only on non-AMP platforms for now. + */ +#if !__ARM_AMP__ && CONFIG_CLUTCH +#define CONFIG_SCHED_CLUTCH 1 +#else /* !__ARM_AMP__ && CONFIG_CLUTCH */ +#define CONFIG_SCHED_CLUTCH 0 +#endif /* !__ARM_AMP__ && CONFIG_CLUTCH */ + +#if __ARM_AMP__ || CONFIG_SCHED_CLUTCH +#define CONFIG_THREAD_GROUPS 1 +#else /* __ARM_AMP__ || CONFIG_SCHED_CLUTCH */ #define CONFIG_THREAD_GROUPS 0 +#endif +#ifdef XNU_KERNEL_PRIVATE -#ifdef XNU_KERNEL_PRIVATE - -#if __ARM_VFP__ -#define ARM_VFP_DEBUG 0 -#endif +#if __ARM_VFP__ +#define ARM_VFP_DEBUG 0 +#endif /* __ARM_VFP__ */ -#endif +#endif /* XNU_KERNEL_PRIVATE */ @@ -242,66 +238,66 @@ /* * Flags */ -#define PSR_NF 0x80000000 /* Negative/Less than */ -#define PSR_ZF 0x40000000 /* Zero */ -#define PSR_CF 0x20000000 /* Carry/Borrow/Extend */ -#define PSR_VF 0x10000000 /* Overflow */ -#define PSR_QF 0x08000000 /* saturation flag (QADD ARMv5) */ +#define PSR_NF 0x80000000 /* Negative/Less than */ +#define PSR_ZF 0x40000000 /* Zero */ +#define PSR_CF 0x20000000 /* Carry/Borrow/Extend */ +#define PSR_VF 0x10000000 /* Overflow */ +#define PSR_QF 0x08000000 /* saturation flag (QADD ARMv5) */ /* * Modified execution mode flags */ -#define PSR_JF 0x01000000 /* Jazelle flag (BXJ ARMv5) */ -#define PSR_EF 0x00000200 /* mixed-endian flag (SETEND ARMv6) */ -#define PSR_AF 0x00000100 /* precise abort flag (ARMv6) */ -#define PSR_TF 0x00000020 /* thumb flag (BX ARMv4T) */ -#define PSR_TFb 5 /* thumb flag (BX ARMv4T) */ +#define PSR_JF 0x01000000 /* Jazelle flag (BXJ ARMv5) */ +#define PSR_EF 0x00000200 /* mixed-endian flag (SETEND ARMv6) */ +#define PSR_AF 0x00000100 /* precise abort flag (ARMv6) */ +#define PSR_TF 0x00000020 /* thumb flag (BX ARMv4T) */ +#define PSR_TFb 5 /* thumb flag (BX ARMv4T) */ /* * Interrupts */ -#define PSR_IRQFb 7 /* IRQ : 0 = IRQ enable */ -#define PSR_IRQF 0x00000080 /* IRQ : 0 = IRQ enable */ -#define PSR_FIQF 0x00000040 /* FIQ : 0 = FIQ enable */ +#define PSR_IRQFb 7 /* IRQ : 0 = IRQ enable */ +#define PSR_IRQF 0x00000080 /* IRQ : 0 = IRQ enable */ +#define PSR_FIQF 0x00000040 /* FIQ : 0 = FIQ enable */ /* * CPU mode */ -#define PSR_USER_MODE 0x00000010 /* User mode */ -#define PSR_FIQ_MODE 0x00000011 /* FIQ mode */ -#define PSR_IRQ_MODE 0x00000012 /* IRQ mode */ -#define PSR_SVC_MODE 0x00000013 /* Supervisor mode */ -#define PSR_ABT_MODE 0x00000017 /* Abort mode */ -#define PSR_UND_MODE 0x0000001B /* Undefined mode */ +#define PSR_USER_MODE 0x00000010 /* User mode */ +#define PSR_FIQ_MODE 0x00000011 /* FIQ mode */ +#define PSR_IRQ_MODE 0x00000012 /* IRQ mode */ +#define PSR_SVC_MODE 0x00000013 /* Supervisor mode */ +#define PSR_ABT_MODE 0x00000017 /* Abort mode */ +#define PSR_UND_MODE 0x0000001B /* Undefined mode */ -#define PSR_MODE_MASK 0x0000001F -#define PSR_IS_KERNEL(psr) (((psr) & PSR_MODE_MASK) != PSR_USER_MODE) -#define PSR_IS_USER(psr) (((psr) & PSR_MODE_MASK) == PSR_USER_MODE) +#define PSR_MODE_MASK 0x0000001F +#define PSR_IS_KERNEL(psr) (((psr) & PSR_MODE_MASK) != PSR_USER_MODE) +#define PSR_IS_USER(psr) (((psr) & PSR_MODE_MASK) == PSR_USER_MODE) -#define PSR_USERDFLT PSR_USER_MODE -#define PSR_USER_MASK (PSR_AF | PSR_IRQF | PSR_FIQF | PSR_MODE_MASK) -#define PSR_USER_SET PSR_USER_MODE +#define PSR_USERDFLT PSR_USER_MODE +#define PSR_USER_MASK (PSR_AF | PSR_IRQF | PSR_FIQF | PSR_MODE_MASK) +#define PSR_USER_SET PSR_USER_MODE -#define PSR_INTMASK PSR_IRQF /* Interrupt disable */ +#define PSR_INTMASK PSR_IRQF /* Interrupt disable */ /* * FPEXC: Floating-Point Exception Register */ -#define FPEXC_EX 0x80000000 /* Exception status */ -#define FPEXC_EX_BIT 31 -#define FPEXC_EN 0x40000000 /* VFP : 1 = EN enable */ -#define FPEXC_EN_BIT 30 +#define FPEXC_EX 0x80000000 /* Exception status */ +#define FPEXC_EX_BIT 31 +#define FPEXC_EN 0x40000000 /* VFP : 1 = EN enable */ +#define FPEXC_EN_BIT 30 /* * FPSCR: Floating-point Status and Control Register */ -#define FPSCR_DN 0x02000000 /* Default NaN */ -#define FPSCR_FZ 0x01000000 /* Flush to zero */ +#define FPSCR_DN 0x02000000 /* Default NaN */ +#define FPSCR_FZ 0x01000000 /* Flush to zero */ -#define FPSCR_DEFAULT FPSCR_DN | FPSCR_FZ +#define FPSCR_DEFAULT FPSCR_DN | FPSCR_FZ /* @@ -310,34 +306,34 @@ * IFSR: Instruction Fault Status Register * DFSR: Data Fault Status Register */ -#define FSR_ALIGN 0x00000001 /* Alignment */ -#define FSR_DEBUG 0x00000002 /* Debug (watch/break) */ -#define FSR_ICFAULT 0x00000004 /* Fault on instruction cache maintenance */ -#define FSR_SFAULT 0x00000005 /* Translation Section */ -#define FSR_PFAULT 0x00000007 /* Translation Page */ -#define FSR_SACCESS 0x00000003 /* Section access */ -#define FSR_PACCESS 0x00000006 /* Page Access */ -#define FSR_SDOM 0x00000009 /* Domain Section */ -#define FSR_PDOM 0x0000000B /* Domain Page */ -#define FSR_SPERM 0x0000000D /* Permission Section */ -#define FSR_PPERM 0x0000000F /* Permission Page */ -#define FSR_EXT 0x00001000 /* External (Implementation Defined Classification) */ - -#define FSR_MASK 0x0000040F /* Valid bits */ -#define FSR_ALIGN_MASK 0x0000040D /* Valid bits to check align */ - -#define DFSR_WRITE 0x00000800 /* write data abort fault */ +#define FSR_ALIGN 0x00000001 /* Alignment */ +#define FSR_DEBUG 0x00000002 /* Debug (watch/break) */ +#define FSR_ICFAULT 0x00000004 /* Fault on instruction cache maintenance */ +#define FSR_SFAULT 0x00000005 /* Translation Section */ +#define FSR_PFAULT 0x00000007 /* Translation Page */ +#define FSR_SACCESS 0x00000003 /* Section access */ +#define FSR_PACCESS 0x00000006 /* Page Access */ +#define FSR_SDOM 0x00000009 /* Domain Section */ +#define FSR_PDOM 0x0000000B /* Domain Page */ +#define FSR_SPERM 0x0000000D /* Permission Section */ +#define FSR_PPERM 0x0000000F /* Permission Page */ +#define FSR_EXT 0x00001000 /* External (Implementation Defined Classification) */ + +#define FSR_MASK 0x0000040F /* Valid bits */ +#define FSR_ALIGN_MASK 0x0000040D /* Valid bits to check align */ + +#define DFSR_WRITE 0x00000800 /* write data abort fault */ #if defined (ARMA7) || defined (APPLE_ARM64_ARCH_FAMILY) || defined (BCM2837) -#define TEST_FSR_VMFAULT(status) \ - (((status) == FSR_PFAULT) \ - || ((status) == FSR_PPERM) \ - || ((status) == FSR_SFAULT) \ - || ((status) == FSR_SPERM) \ - || ((status) == FSR_ICFAULT) \ - || ((status) == FSR_SACCESS) \ - || ((status) == FSR_PACCESS)) +#define TEST_FSR_VMFAULT(status) \ + (((status) == FSR_PFAULT) \ + || ((status) == FSR_PPERM) \ + || ((status) == FSR_SFAULT) \ + || ((status) == FSR_SPERM) \ + || ((status) == FSR_ICFAULT) \ + || ((status) == FSR_SACCESS) \ + || ((status) == FSR_PACCESS)) #define TEST_FSR_TRANSLATION_FAULT(status) \ (((status) == FSR_SFAULT) \ @@ -356,101 +352,73 @@ #if defined (ARMA7) /* I-Cache */ -#define MMU_I_CLINE 5 /* cache line size as 1<>1)&0x1)<>1)&0x1)<>2)&0x1)<>2)&0x1)<> PTE_SHIFT) /* number of ptes per page */ - -#define ARM_PTE_EMPTY 0x00000000 /* unasigned - invalid entry */ - /* markers for (invalid) PTE for a page sent to compressor */ -#define ARM_PTE_COMPRESSED ARM_PTE_TEX1 /* compressed... */ -#define ARM_PTE_COMPRESSED_ALT ARM_PTE_TEX2 /* ... and was "alt_acct" */ +#define ARM_PTE_COMPRESSED ARM_PTE_TEX1 /* compressed... */ +#define ARM_PTE_COMPRESSED_ALT ARM_PTE_TEX2 /* ... and was "alt_acct" */ #define ARM_PTE_COMPRESSED_MASK (ARM_PTE_COMPRESSED | ARM_PTE_COMPRESSED_ALT) -#define ARM_PTE_IS_COMPRESSED(x) \ - ((((x) & 0x3) == 0) && /* PTE is not valid... */ \ - ((x) & ARM_PTE_COMPRESSED) && /* ...has "compressed" marker" */ \ - ((!((x) & ~ARM_PTE_COMPRESSED_MASK)) || /* ...no other bits */ \ - (panic("compressed PTE %p 0x%x has extra bits 0x%x: corrupted?", \ - &(x), (x), (x) & ~ARM_PTE_COMPRESSED_MASK), FALSE))) +#define ARM_PTE_IS_COMPRESSED(x, p) \ + ((((x) & 0x3) == 0) && /* PTE is not valid... */ \ + ((x) & ARM_PTE_COMPRESSED) && /* ...has "compressed" marker" */ \ + ((!((x) & ~ARM_PTE_COMPRESSED_MASK)) || /* ...no other bits */ \ + (panic("compressed PTE %p 0x%x has extra bits 0x%x: corrupted?", \ + (p), (x), (x) & ~ARM_PTE_COMPRESSED_MASK), FALSE))) + +#define PTE_SHIFT 2 /* shift width of a pte (sizeof(pte) == (1 << PTE_SHIFT)) */ +#define PTE_PGENTRIES (1024 >> PTE_SHIFT) /* number of ptes per page */ -#define ARM_PTE_TYPE_FAULT 0x00000000 /* fault entry type */ -#define ARM_PTE_TYPE 0x00000002 /* small page entry type */ -#define ARM_PTE_TYPE_MASK 0x00000002 /* mask to get pte type */ +#define ARM_PTE_EMPTY 0x00000000 /* unasigned - invalid entry */ -#define ARM_PTE_NG_MASK 0x00000800 /* mask to determine notGlobal bit */ -#define ARM_PTE_NG 0x00000800 /* value for a per-process mapping */ +#define ARM_PTE_TYPE_FAULT 0x00000000 /* fault entry type */ +#define ARM_PTE_TYPE_VALID 0x00000002 /* valid L2 entry */ +#define ARM_PTE_TYPE 0x00000002 /* small page entry type */ +#define ARM_PTE_TYPE_MASK 0x00000002 /* mask to get pte type */ -#define ARM_PTE_SHSHIFT 10 -#define ARM_PTE_SHMASK 0x00000400 /* shared (SMP) mapping mask */ -#define ARM_PTE_SH 0x00000400 /* shared (SMP) mapping */ +#define ARM_PTE_NG_MASK 0x00000800 /* mask to determine notGlobal bit */ +#define ARM_PTE_NG 0x00000800 /* value for a per-process mapping */ -#define ARM_PTE_CBSHIFT 2 -#define ARM_PTE_CB(x) ((x)<>1)&0x1)<>1)&0x1)<>2)&0x1)<>2)&0x1)<> 16) // ARM_TTE_TABLE_MASK top halfword + movt r7, #(ARM_TTE_TABLE_MASK >> 16) // ARM_TTE_TABLE_MASK top halfword and r11, r6, r7 // apply mask orr r11, r11, #ARM_TTE_TYPE_TABLE // mark it as a coarse page table str r11, [r5] // store tte entry for page table diff --git a/osfmk/arm/status.c b/osfmk/arm/status.c index 8fffe7c1c..bdfcf5a6b 100644 --- a/osfmk/arm/status.c +++ b/osfmk/arm/status.c @@ -63,10 +63,11 @@ void /* __private_extern__ */ unsigned int _MachineStateCount[] = { /* FLAVOR_LIST */ 0, - ARM_THREAD_STATE_COUNT, - ARM_VFP_STATE_COUNT, - ARM_EXCEPTION_STATE_COUNT, - ARM_DEBUG_STATE_COUNT + [ARM_THREAD_STATE] = ARM_THREAD_STATE_COUNT, + [ARM_VFP_STATE] = ARM_VFP_STATE_COUNT, + [ARM_EXCEPTION_STATE] = ARM_EXCEPTION_STATE_COUNT, + [ARM_DEBUG_STATE] = ARM_DEBUG_STATE_COUNT, + [ARM_PAGEIN_STATE] = ARM_PAGEIN_STATE_COUNT, }; extern zone_t ads_zone; @@ -139,6 +140,18 @@ machine_thread_get_state( *count = 4; break; + case THREAD_STATE_FLAVOR_LIST_10_15: + if (*count < 5) + return (KERN_INVALID_ARGUMENT); + + tstate[0] = ARM_THREAD_STATE; + tstate[1] = ARM_VFP_STATE; + tstate[2] = ARM_EXCEPTION_STATE; + tstate[3] = ARM_DEBUG_STATE; + tstate[4] = ARM_PAGEIN_STATE; + *count = 5; + break; + case ARM_THREAD_STATE:{ struct arm_thread_state *state; struct arm_saved_state *saved_state; @@ -237,6 +250,20 @@ machine_thread_get_state( break; } + case ARM_PAGEIN_STATE:{ + arm_pagein_state_t *state; + + if (*count < ARM_PAGEIN_STATE_COUNT) { + return (KERN_INVALID_ARGUMENT); + } + + state = (arm_pagein_state_t *)tstate; + state->__pagein_error = thread->t_pagein_error; + + *count = ARM_PAGEIN_STATE_COUNT; + break; + } + default: return (KERN_INVALID_ARGUMENT); } @@ -456,17 +483,30 @@ machine_thread_set_state( return (KERN_SUCCESS); } +mach_vm_address_t +machine_thread_pc(thread_t thread) +{ + struct arm_saved_state *ss = get_user_regs(thread); + return (mach_vm_address_t)get_saved_state_pc(ss); +} + +void +machine_thread_reset_pc(thread_t thread, mach_vm_address_t pc) +{ + set_saved_state_pc(get_user_regs(thread), (register_t)pc); +} + /* * Routine: machine_thread_state_initialize * */ kern_return_t machine_thread_state_initialize( - thread_t thread) + thread_t thread) { struct arm_saved_state *savestate; - savestate = (struct arm_saved_state *) & thread->machine.PcbData; + savestate = (struct arm_saved_state *) &thread->machine.PcbData; bzero((char *) savestate, sizeof(struct arm_saved_state)); savestate->cpsr = PSR_USERDFLT; diff --git a/osfmk/arm/task.h b/osfmk/arm/task.h index 2558ed3bb..7bdcec8af 100644 --- a/osfmk/arm/task.h +++ b/osfmk/arm/task.h @@ -59,5 +59,12 @@ * Machine dependant task fields */ +#if defined(HAS_APPLE_PAC) +#define MACHINE_TASK \ + void* task_debug; \ + uint64_t rop_pid; \ + boolean_t disable_user_jop; +#else #define MACHINE_TASK \ void* task_debug; +#endif diff --git a/osfmk/arm/thread.h b/osfmk/arm/thread.h index 46a603dcc..f17ae451d 100644 --- a/osfmk/arm/thread.h +++ b/osfmk/arm/thread.h @@ -64,29 +64,29 @@ #include #include -#ifdef MACH_KERNEL_PRIVATE +#ifdef MACH_KERNEL_PRIVATE #include #include #endif #if __ARM_VFP__ -#define VFPSAVE_ALIGN 16 -#define VFPSAVE_ATTRIB __attribute__ ((aligned (VFPSAVE_ALIGN))) -#define THREAD_ALIGN VFPSAVE_ALIGN +#define VFPSAVE_ALIGN 16 +#define VFPSAVE_ATTRIB __attribute__((aligned (VFPSAVE_ALIGN))) +#define THREAD_ALIGN VFPSAVE_ALIGN /* * vector floating point saved state */ struct arm_vfpsaved_state { - uint32_t r[64]; - uint32_t fpscr; - uint32_t fpexc; + uint32_t r[64]; + uint32_t fpscr; + uint32_t fpexc; }; #endif struct perfcontrol_state { - uint64_t opaque[8] __attribute__((aligned(8))); + uint64_t opaque[8] __attribute__((aligned(8))); }; /* @@ -94,7 +94,7 @@ struct perfcontrol_state { */ extern unsigned int _MachineStateCount[]; -#ifdef MACH_KERNEL_PRIVATE +#ifdef MACH_KERNEL_PRIVATE #if __arm64__ typedef arm_context_t machine_thread_kernel_state; #else @@ -104,104 +104,82 @@ typedef struct arm_saved_state machine_thread_kernel_state; struct machine_thread { #if __arm64__ - arm_context_t *contextData; /* allocated user context */ - arm_saved_state_t *upcb; /* pointer to user GPR state */ - arm_neon_saved_state_t *uNeon; /* pointer to user VFP state */ + arm_context_t * contextData; /* allocated user context */ + arm_saved_state_t * upcb; /* pointer to user GPR state */ + arm_neon_saved_state_t * uNeon; /* pointer to user VFP state */ #elif __arm__ - struct arm_saved_state PcbData; + struct arm_saved_state PcbData; #if __ARM_VFP__ - struct arm_vfpsaved_state uVFPdata VFPSAVE_ATTRIB; - struct arm_vfpsaved_state kVFPdata VFPSAVE_ATTRIB; + struct arm_vfpsaved_state uVFPdata VFPSAVE_ATTRIB; + struct arm_vfpsaved_state kVFPdata VFPSAVE_ATTRIB; #endif /* __ARM_VFP__ */ #else #error Unknown arch #endif + #if __ARM_USER_PROTECT__ - unsigned int uptw_ttc; - unsigned int uptw_ttb; - unsigned int kptw_ttb; - unsigned int asid; + unsigned int uptw_ttc; + unsigned int uptw_ttb; + unsigned int kptw_ttb; + unsigned int asid; #endif - vm_offset_t kstackptr; /* top of kernel stack */ - struct cpu_data *CpuDatap; /* current per cpu data */ - unsigned int preemption_count; /* preemption count */ + vm_offset_t kstackptr; /* top of kernel stack */ +#if defined(HAS_APPLE_PAC) + uint64_t rop_pid; + boolean_t disable_user_jop; +#endif + struct cpu_data * CpuDatap; /* current per cpu data */ + unsigned int preemption_count; /* preemption count */ #if __ARM_SMP__ #define MACHINE_THREAD_FLAGS_ON_CPU (0x1) - uint8_t machine_thread_flags; + uint8_t machine_thread_flags; #endif /* __ARM_SMP__ */ - arm_debug_state_t *DebugData; - mach_vm_address_t cthread_self; /* for use of cthread package */ - mach_vm_address_t cthread_data; /* for use of cthread package */ + arm_debug_state_t * DebugData; + mach_vm_address_t cthread_self; /* for use of cthread package */ + mach_vm_address_t cthread_data; /* for use of cthread package */ - struct perfcontrol_state perfctrl_state; + struct perfcontrol_state perfctrl_state; #if __arm64__ - uint64_t energy_estimate_nj; + uint64_t energy_estimate_nj; #endif #if INTERRUPT_MASKED_DEBUG - uint64_t intmask_timestamp; /* timestamp of when interrupts were masked */ + uint64_t intmask_timestamp; /* timestamp of when interrupts were masked */ #endif }; #endif -extern struct arm_saved_state *get_user_regs(thread_t); -extern struct arm_saved_state *find_user_regs(thread_t); -extern struct arm_saved_state *find_kern_regs(thread_t); -extern struct arm_vfpsaved_state *find_user_vfp(thread_t); +extern struct arm_saved_state * get_user_regs(thread_t); +extern struct arm_saved_state * find_user_regs(thread_t); +extern struct arm_saved_state * find_kern_regs(thread_t); +extern struct arm_vfpsaved_state * find_user_vfp(thread_t); #if defined(__arm__) -extern arm_debug_state_t *find_debug_state(thread_t); +extern arm_debug_state_t * find_debug_state(thread_t); #elif defined(__arm64__) -extern arm_debug_state32_t *find_debug_state32(thread_t); -extern arm_debug_state64_t *find_debug_state64(thread_t); -extern arm_neon_saved_state_t *get_user_neon_regs(thread_t); +extern arm_debug_state32_t * find_debug_state32(thread_t); +extern arm_debug_state64_t * find_debug_state64(thread_t); +extern arm_neon_saved_state_t * get_user_neon_regs(thread_t); #else #error unknown arch #endif #define FIND_PERFCONTROL_STATE(th) (&th->machine.perfctrl_state) -#ifdef MACH_KERNEL_PRIVATE +#ifdef MACH_KERNEL_PRIVATE #if __ARM_VFP__ -extern void vfp_state_initialize(struct arm_vfpsaved_state *vfp_state); -extern void vfp_save(struct arm_vfpsaved_state *vfp_ss); -extern void vfp_load(struct arm_vfpsaved_state *vfp_ss); -extern void toss_live_vfp(void *vfp_fc); +extern void vfp_state_initialize(struct arm_vfpsaved_state *vfp_state); +extern void vfp_save(struct arm_vfpsaved_state *vfp_ss); +extern void vfp_load(struct arm_vfpsaved_state *vfp_ss); #endif /* __ARM_VFP__ */ -extern void arm_debug_set(arm_debug_state_t *debug_state); +extern void arm_debug_set(arm_debug_state_t *debug_state); #if defined(__arm64__) -extern void arm_debug_set32(arm_debug_state_t *debug_state); -extern void arm_debug_set64(arm_debug_state_t *debug_state); - -kern_return_t handle_get_arm_thread_state( - thread_state_t tstate, - mach_msg_type_number_t * count, - const arm_saved_state_t *saved_state); -kern_return_t handle_get_arm32_thread_state( - thread_state_t tstate, - mach_msg_type_number_t * count, - const arm_saved_state_t *saved_state); -kern_return_t handle_get_arm64_thread_state( - thread_state_t tstate, - mach_msg_type_number_t * count, - const arm_saved_state_t *saved_state); - -kern_return_t handle_set_arm_thread_state( - const thread_state_t tstate, - mach_msg_type_number_t count, - arm_saved_state_t *saved_state); -kern_return_t handle_set_arm32_thread_state( - const thread_state_t tstate, - mach_msg_type_number_t count, - arm_saved_state_t *saved_state); -kern_return_t handle_set_arm64_thread_state( - const thread_state_t tstate, - mach_msg_type_number_t count, - arm_saved_state_t *saved_state); +extern void arm_debug_set32(arm_debug_state_t *debug_state); +extern void arm_debug_set64(arm_debug_state_t *debug_state); #endif #endif /* MACH_KERNEL_PRIVATE */ @@ -209,17 +187,11 @@ extern void *act_thread_csave(void); extern void act_thread_catt(void *ctx); extern void act_thread_cfree(void *ctx); -/* - * Return address of the function that called current function, given - * address of the first parameter of current function. - */ -#define GET_RETURN_PC(addr) (((vm_offset_t *)0)) /* - * Defining this indicates that MD code will supply an exception() - * routine, conformant with kern/exception.c (dependency alert!) - * but which does wonderfully fast, machine-dependent magic. + * Return address of the function that called current function, given + * address of the first parameter of current function. */ -#define MACHINE_FAST_EXCEPTION 1 +#define GET_RETURN_PC(addr) (__builtin_return_address(0)) -#endif /* _ARM_THREAD_H_ */ +#endif /* _ARM_THREAD_H_ */ diff --git a/osfmk/arm/tlb.h b/osfmk/arm/tlb.h new file mode 100644 index 000000000..793b50c4d --- /dev/null +++ b/osfmk/arm/tlb.h @@ -0,0 +1,50 @@ +/* + * Copyright (c) 2019 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#pragma once + +#include + +#define tlbi_addr(x) ((((x) >> 12) & TLBI_ADDR_MASK) << TLBI_ADDR_SHIFT) +#define tlbi_asid(x) (((uintptr_t)(x) & TLBI_ASID_MASK) << TLBI_ASID_SHIFT) + +extern void sync_tlb_flush(void); +extern void flush_mmu_tlb_async(void); +extern void flush_mmu_tlb(void); +extern void flush_core_tlb_async(void); +extern void flush_core_tlb(void); +extern void flush_mmu_tlb_entry_async(uint32_t); +extern void flush_mmu_tlb_entry(uint32_t); +extern void flush_mmu_tlb_entries_async(uint32_t, uint32_t); +extern void flush_mmu_tlb_entries(uint32_t, uint32_t); +extern void flush_mmu_tlb_mva_entries_async(uint32_t); +extern void flush_mmu_tlb_mva_entries(uint32_t); +extern void flush_mmu_tlb_asid_async(uint32_t); +extern void flush_mmu_tlb_asid(uint32_t); +extern void flush_core_tlb_asid_async(uint32_t); +extern void flush_core_tlb_asid(uint32_t); diff --git a/osfmk/arm/trap.c b/osfmk/arm/trap.c index 2605951b2..608593c86 100644 --- a/osfmk/arm/trap.c +++ b/osfmk/arm/trap.c @@ -102,7 +102,7 @@ perfCallback tempDTraceTrapHook = NULL; /* Pointer to DTrace fbt trap hook routi void sleh_undef(struct arm_saved_state *, struct arm_vfpsaved_state *); void sleh_abort(struct arm_saved_state *, int); static kern_return_t sleh_alignment(struct arm_saved_state *); -static void panic_with_thread_kernel_state(const char *msg, arm_saved_state_t *regs); +static void panic_with_thread_kernel_state(const char *msg, arm_saved_state_t *regs); int sleh_alignment_count = 0; int trap_on_alignment_fault = 0; @@ -243,7 +243,7 @@ sleh_undef(struct arm_saved_state * regs, struct arm_vfpsaved_state * vfp_ss __u * can see the original state of this thread). */ vm_offset_t kstackptr = current_thread()->machine.kstackptr; - *((arm_saved_state_t *) kstackptr) = *regs; + copy_signed_thread_state((arm_saved_state_t *)kstackptr, regs); DebuggerCall(exception, regs); (void) ml_set_interrupts_enabled(intr); @@ -274,7 +274,7 @@ sleh_abort(struct arm_saved_state * regs, int type) int status; int debug_status = 0; int spsr; - int exc; + int exc = EXC_BAD_ACCESS; mach_exception_data_type_t codes[2]; vm_map_t map; vm_map_address_t vaddr; @@ -309,7 +309,7 @@ sleh_abort(struct arm_saved_state * regs, int type) if (ml_at_interrupt_context()) { #if CONFIG_DTRACE - if (!(thread->options & TH_OPT_DTRACE)) + if (!(thread->t_dtrace_inprobe)) #endif /* CONFIG_DTRACE */ { panic_with_thread_kernel_state("sleh_abort at interrupt context", regs); @@ -404,7 +404,7 @@ sleh_abort(struct arm_saved_state * regs, int type) (void) ml_set_interrupts_enabled(intr); } else if (TEST_FSR_VMFAULT(status)) { #if CONFIG_DTRACE - if (thread->options & TH_OPT_DTRACE) { /* Executing under dtrace_probe? */ + if (thread->t_dtrace_inprobe) { /* Executing under dtrace_probe? */ if (dtrace_tally_fault(fault_addr)) { /* Should a fault under dtrace be ignored? */ /* Point to next instruction */ regs->pc += ((regs->cpsr & PSR_TF) && !IS_THUMB32(*((uint16_t*) (regs->pc)))) ? 2 : 4; @@ -428,7 +428,7 @@ sleh_abort(struct arm_saved_state * regs, int type) if (!TEST_FSR_TRANSLATION_FAULT(status)) { /* check to see if it is just a pmap ref/modify fault */ - result = arm_fast_fault(map->pmap, trunc_page(fault_addr), fault_type, FALSE); + result = arm_fast_fault(map->pmap, trunc_page(fault_addr), fault_type, (status == FSR_PACCESS), FALSE); if (result == KERN_SUCCESS) { goto exit; } @@ -470,22 +470,18 @@ sleh_abort(struct arm_saved_state * regs, int type) } intr = ml_set_interrupts_enabled(FALSE); - panic_plain("kernel abort type %d: fault_type=0x%x, fault_addr=0x%x\n" + panic_plain("kernel abort type %d at pc 0x%08x, lr 0x%08x: fault_type=0x%x, fault_addr=0x%x\n" "r0: 0x%08x r1: 0x%08x r2: 0x%08x r3: 0x%08x\n" "r4: 0x%08x r5: 0x%08x r6: 0x%08x r7: 0x%08x\n" "r8: 0x%08x r9: 0x%08x r10: 0x%08x r11: 0x%08x\n" "r12: 0x%08x sp: 0x%08x lr: 0x%08x pc: 0x%08x\n" "cpsr: 0x%08x fsr: 0x%08x far: 0x%08x\n", - type, fault_type, fault_addr, + type, regs->pc, regs->lr, fault_type, fault_addr, regs->r[0], regs->r[1], regs->r[2], regs->r[3], regs->r[4], regs->r[5], regs->r[6], regs->r[7], regs->r[8], regs->r[9], regs->r[10], regs->r[11], regs->r[12], regs->sp, regs->lr, regs->pc, regs->cpsr, regs->fsr, regs->far); - - (void) ml_set_interrupts_enabled(intr); - - goto exit; } /* Fault in user mode */ @@ -493,7 +489,7 @@ sleh_abort(struct arm_saved_state * regs, int type) map = thread->map; #if CONFIG_DTRACE - if (thread->options & TH_OPT_DTRACE) { /* Executing under dtrace_probe? */ + if (thread->t_dtrace_inprobe) { /* Executing under dtrace_probe? */ if (dtrace_tally_fault(fault_addr)) { /* Should a user mode fault under dtrace be ignored? */ if (recover) { regs->pc = recover; @@ -519,7 +515,7 @@ sleh_abort(struct arm_saved_state * regs, int type) if (!TEST_FSR_TRANSLATION_FAULT(status)) { /* check to see if it is just a pmap ref/modify fault */ - result = arm_fast_fault(map->pmap, trunc_page(fault_addr), fault_type, TRUE); + result = arm_fast_fault(map->pmap, trunc_page(fault_addr), fault_type, (status == FSR_PACCESS), TRUE); if (result == KERN_SUCCESS) { goto exception_return; } @@ -534,22 +530,27 @@ sleh_abort(struct arm_saved_state * regs, int type) if (result == KERN_SUCCESS || result == KERN_ABORTED) { goto exception_return; } - exc = EXC_BAD_ACCESS; + + /* + * KERN_FAILURE here means preemption was disabled when we called vm_fault. + * That should never happen for a page fault from user space. + */ + if (__improbable(result == KERN_FAILURE)) { + panic("vm_fault() KERN_FAILURE from user fault on thread %p", thread); + } + codes[0] = result; } else if ((status & FSR_ALIGN_MASK) == FSR_ALIGN) { if (sleh_alignment(regs) == KERN_SUCCESS) { goto exception_return; } - exc = EXC_BAD_ACCESS; codes[0] = EXC_ARM_DA_ALIGN; } else if (status == FSR_DEBUG) { exc = EXC_BREAKPOINT; codes[0] = EXC_ARM_DA_DEBUG; } else if ((status == FSR_SDOM) || (status == FSR_PDOM)) { - exc = EXC_BAD_ACCESS; - codes[0] = KERN_INVALID_ADDRESS; + panic_with_thread_kernel_state("Unexpected domain fault", regs); } else { - exc = EXC_BAD_ACCESS; codes[0] = KERN_FAILURE; } @@ -857,16 +858,17 @@ interrupt_stats(void) SCHED_STATS_INTERRUPT(current_processor()); } +__dead2 static void panic_with_thread_kernel_state(const char *msg, struct arm_saved_state *regs) { - panic_plain("%s (saved state:%p)\n" + panic_plain("%s at pc 0x%08x, lr 0x%08x (saved state:%p)\n" "r0: 0x%08x r1: 0x%08x r2: 0x%08x r3: 0x%08x\n" "r4: 0x%08x r5: 0x%08x r6: 0x%08x r7: 0x%08x\n" "r8: 0x%08x r9: 0x%08x r10: 0x%08x r11: 0x%08x\n" "r12: 0x%08x sp: 0x%08x lr: 0x%08x pc: 0x%08x\n" "cpsr: 0x%08x fsr: 0x%08x far: 0x%08x\n", - msg, regs, + msg, regs->pc, regs->lr, regs, regs->r[0], regs->r[1], regs->r[2], regs->r[3], regs->r[4], regs->r[5], regs->r[6], regs->r[7], regs->r[8], regs->r[9], regs->r[10], regs->r[11], diff --git a/osfmk/arm/trap.h b/osfmk/arm/trap.h index d3a07cb10..fa179c8b5 100644 --- a/osfmk/arm/trap.h +++ b/osfmk/arm/trap.h @@ -240,7 +240,7 @@ || (((op) & THUMB_SIMD_VFP_MASK3) == THUMB_SIMD_VFP_CODE3)) extern boolean_t arm_force_fast_fault(ppnum_t, vm_prot_t, int, void *); -extern kern_return_t arm_fast_fault(pmap_t, vm_map_address_t, vm_prot_t, boolean_t); +extern kern_return_t arm_fast_fault(pmap_t, vm_map_address_t, vm_prot_t, bool, bool); /* * Determines if the aborted instruction is read or write operation diff --git a/osfmk/arm/xpr.h b/osfmk/arm/xpr.h deleted file mode 100644 index 82904b1b0..000000000 --- a/osfmk/arm/xpr.h +++ /dev/null @@ -1,36 +0,0 @@ -/* - * Copyright (c) 2007 Apple Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* - * @OSF_COPYRIGHT@ - */ - -/* - * Machine dependent module for the XPR tracing facility. - */ - -#define XPR_TIMESTAMP (0) diff --git a/osfmk/arm64/Makefile b/osfmk/arm64/Makefile index f7ec7555e..ec8b11901 100644 --- a/osfmk/arm64/Makefile +++ b/osfmk/arm64/Makefile @@ -11,7 +11,8 @@ ARM_HEADER_FILES = \ lowglobals.h \ machine_cpuid.h \ machine_machdep.h \ - proc_reg.h + proc_reg.h \ + tlb.h \ INSTALL_MD_DIR = arm64 @@ -23,7 +24,7 @@ INSTALL_KF_MD_LIST = $(ARM_HEADER_FILES) INSTALL_KF_MD_LCL_LIST = machine_kpc.h machine_remote_time.h monotonic.h pgtrace.h $(ARM_HEADER_FILES) -EXPORT_MD_LIST = machine_cpuid.h machine_kpc.h machine_remote_time.h monotonic.h proc_reg.h pgtrace.h asm.h +EXPORT_MD_LIST = machine_cpuid.h machine_kpc.h machine_remote_time.h monotonic.h proc_reg.h pgtrace.h asm.h tlb.h EXPORT_MD_DIR = arm64 diff --git a/osfmk/arm64/arm_vm_init.c b/osfmk/arm64/arm_vm_init.c index bfad29bf5..8f6a0cbb7 100644 --- a/osfmk/arm64/arm_vm_init.c +++ b/osfmk/arm64/arm_vm_init.c @@ -30,15 +30,16 @@ #include #include +#include +#include +#include #include #include -#include -#include #include #include #include -#include +#include #include #include #include @@ -118,8 +119,8 @@ SECURITY_READ_ONLY_LATE(unsigned long) gPhysSize; SECURITY_READ_ONLY_LATE(unsigned long) gT0Sz = T0SZ_BOOT; SECURITY_READ_ONLY_LATE(unsigned long) gT1Sz = T1SZ_BOOT; -/* 23543331 - step 1 of kext / kernel __TEXT and __DATA colocation is to move - * all kexts before the kernel. This is only for arm64 devices and looks +/* 23543331 - step 1 of kext / kernel __TEXT and __DATA colocation is to move + * all kexts before the kernel. This is only for arm64 devices and looks * something like the following: * -- vmaddr order -- * 0xffffff8004004000 __PRELINK_TEXT @@ -188,6 +189,7 @@ SECURITY_READ_ONLY_LATE(vm_offset_t) segEXTRADATA; SECURITY_READ_ONLY_LATE(unsigned long) segSizeEXTRADATA; SECURITY_READ_ONLY_LATE(vm_offset_t) segLOWESTTEXT; +SECURITY_READ_ONLY_LATE(vm_offset_t) segLOWEST; SECURITY_READ_ONLY_LATE(static vm_offset_t) segTEXTB; SECURITY_READ_ONLY_LATE(static unsigned long) segSizeTEXT; @@ -266,19 +268,13 @@ SECURITY_READ_ONLY_LATE(vm_offset_t) static_memory_end; SECURITY_READ_ONLY_LATE(pmap_paddr_t) avail_start; SECURITY_READ_ONLY_LATE(pmap_paddr_t) avail_end; SECURITY_READ_ONLY_LATE(pmap_paddr_t) real_avail_end; +SECURITY_READ_ONLY_LATE(unsigned long) real_phys_size; #if __ARM_KERNEL_PROTECT__ extern void ExceptionVectorsBase; extern void ExceptionVectorsEnd; #endif /* __ARM_KERNEL_PROTECT__ */ -#if defined(KERNEL_INTEGRITY_KTRR) -#if __ARM64_TWO_LEVEL_PMAP__ -/* We could support this configuration, but it adds memory overhead. */ -#error This configuration is not supported -#endif -#endif - typedef struct { pmap_paddr_t pa; vm_map_address_t va; @@ -297,6 +293,7 @@ phystokv(pmap_paddr_t pa) if ((pa >= ptov_table[i].pa) && (pa < (ptov_table[i].pa + ptov_table[i].len))) return (pa - ptov_table[i].pa + ptov_table[i].va); } + assertf((pa - gPhysBase) < real_phys_size, "%s: illegal PA: 0x%llx", __func__, (uint64_t)pa); return (pa - gPhysBase + gVirtBase); } @@ -315,6 +312,7 @@ phystokv_range(pmap_paddr_t pa, vm_size_t *max_len) len = PAGE_SIZE - (pa & PAGE_MASK); if (*max_len > len) *max_len = len; + assertf((pa - gPhysBase) < real_phys_size, "%s: illegal PA: 0x%llx", __func__, (uint64_t)pa); return (pa - gPhysBase + gVirtBase); } @@ -325,8 +323,7 @@ ml_static_vtop(vm_offset_t va) if ((va >= ptov_table[i].va) && (va < (ptov_table[i].va + ptov_table[i].len))) return (va - ptov_table[i].va + ptov_table[i].pa); } - if (((vm_address_t)(va) - gVirtBase) >= gPhysSize) - panic("ml_static_vtop(): illegal VA: %p\n", (void*)va); + assertf(((vm_address_t)(va) - gVirtBase) < gPhysSize, "%s: illegal VA: %p", __func__, (void*)va); return ((vm_address_t)(va) - gVirtBase + gPhysBase); } @@ -446,7 +443,6 @@ void dump_kva_space() { printf("Root page table: %s\n", root_static ? "Static" : "Dynamic"); -#if !__ARM64_TWO_LEVEL_PMAP__ for(unsigned int i=0; i> 20, @@ -503,10 +496,8 @@ arm_vm_map(tt_entry_t * root_ttp, vm_offset_t vaddr, pt_entry_t pte) vm_offset_t ptpage = 0; tt_entry_t * ttp = root_ttp; -#if !__ARM64_TWO_LEVEL_PMAP__ tt_entry_t * l1_ttep = NULL; tt_entry_t l1_tte = 0; -#endif tt_entry_t * l2_ttep = NULL; tt_entry_t l2_tte = 0; @@ -517,7 +508,6 @@ arm_vm_map(tt_entry_t * root_ttp, vm_offset_t vaddr, pt_entry_t pte) * Walk the target page table to find the PTE for the given virtual * address. Allocate any page table pages needed to do this. */ -#if !__ARM64_TWO_LEVEL_PMAP__ l1_ttep = ttp + ((vaddr & ARM_TT_L1_INDEX_MASK) >> ARM_TT_L1_SHIFT); l1_tte = *l1_ttep; @@ -532,7 +522,6 @@ arm_vm_map(tt_entry_t * root_ttp, vm_offset_t vaddr, pt_entry_t pte) } ttp = (tt_entry_t *)phystokv(l1_tte & ARM_TTE_TABLE_MASK); -#endif l2_ttep = ttp + ((vaddr & ARM_TT_L2_INDEX_MASK) >> ARM_TT_L2_SHIFT); l2_tte = *l2_ttep; @@ -566,6 +555,10 @@ arm_vm_map(tt_entry_t * root_ttp, vm_offset_t vaddr, pt_entry_t pte) *ptep = pte; } +#endif // __ARM_KERNEL_PROTECT + +#if __ARM_KERNEL_PROTECT__ + /* * arm_vm_kernel_el0_map: * vaddr: The target virtual address @@ -611,7 +604,6 @@ arm_vm_kernel_pte(vm_offset_t vaddr) pt_entry_t * ptep = NULL; pt_entry_t pte = 0; -#if !__ARM64_TWO_LEVEL_PMAP__ ttep = ttp + ((vaddr & ARM_TT_L1_INDEX_MASK) >> ARM_TT_L1_SHIFT); tte = *ttep; @@ -627,7 +619,6 @@ arm_vm_kernel_pte(vm_offset_t vaddr) } ttp = (tt_entry_t *)phystokv(tte & ARM_TTE_TABLE_MASK); -#endif ttep = ttp + ((vaddr & ARM_TT_L2_INDEX_MASK) >> ARM_TT_L2_SHIFT); tte = *ttep; @@ -736,11 +727,9 @@ static void arm_replace_identity_map(boot_args * args) vm_offset_t addr; pmap_paddr_t paddr; -#if !__ARM64_TWO_LEVEL_PMAP__ pmap_paddr_t l1_ptp_phys = 0; tt_entry_t *l1_ptp_virt = NULL; tt_entry_t *tte1 = NULL; -#endif pmap_paddr_t l2_ptp_phys = 0; tt_entry_t *l2_ptp_virt = NULL; tt_entry_t *tte2 = NULL; @@ -795,18 +784,17 @@ tt_entry_t *arm_kva_to_tte(vm_offset_t); tt_entry_t * arm_kva_to_tte(vm_offset_t va) { -#if __ARM64_TWO_LEVEL_PMAP__ - tt_entry_t *tte2; - tte2 = cpu_tte + L2_TABLE_INDEX(va); -#else tt_entry_t *tte1, *tte2; tte1 = cpu_tte + L1_TABLE_INDEX(va); tte2 = L2_TABLE_VA(tte1) + L2_TABLE_INDEX(va); -#endif + return tte2; } +#define ARM64_GRANULE_ALLOW_BLOCK (1 << 0) +#define ARM64_GRANULE_ALLOW_HINT (1 << 1) + /* * arm_vm_page_granular_helper updates protections at the L3 level. It will (if * neccessary) allocate a page for the L3 table and update the corresponding L2 @@ -815,13 +803,13 @@ arm_kva_to_tte(vm_offset_t va) * not be invoked from a context that does not do L2 iteration separately (basically, * don't call this except from arm_vm_page_granular_prot). * - * bool force_page_granule: true: will force page level mappings for this entry - * false: will try to use block level mappings + * unsigned granule: 0 => force to page granule, or a combination of + * ARM64_GRANULE_* flags declared above. */ static void arm_vm_page_granular_helper(vm_offset_t start, vm_offset_t _end, vm_offset_t va, pmap_paddr_t pa_offset, - int pte_prot_APX, int pte_prot_XN, bool force_page_granule, + int pte_prot_APX, int pte_prot_XN, unsigned granule, pt_entry_t **deferred_pte, pt_entry_t *deferred_ptmp) { if (va & ARM_TT_L2_OFFMASK) { /* ragged edge hanging over a ARM_TT_L2_SIZE boundary */ @@ -886,9 +874,13 @@ arm_vm_page_granular_helper(vm_offset_t start, vm_offset_t _end, vm_offset_t va, * be fully covered by this mapping request. */ if ((va >= round_up_pte_hint_address(start)) && (round_up_pte_hint_address(va + 1) <= _end) && - !force_page_granule && use_contiguous_hint) { + (granule & ARM64_GRANULE_ALLOW_HINT) && use_contiguous_hint) { assert((va & ((1 << ARM_PTE_HINT_ADDR_SHIFT) - 1)) == ((pa & ((1 << ARM_PTE_HINT_ADDR_SHIFT) - 1)))); ptmp |= ARM_PTE_HINT; + /* Do not attempt to reapply the hint bit to an already-active mapping. + * This very likely means we're attempting to change attributes on an already-active mapping, + * which violates the requirement of the hint bit.*/ + assert(!kva_active || (ppte[i] == ARM_PTE_TYPE_FAULT)); } /* * Do not change the contiguous bit on an active mapping. Even in a single-threaded @@ -899,18 +891,18 @@ arm_vm_page_granular_helper(vm_offset_t start, vm_offset_t _end, vm_offset_t va, */ assert(!kva_active || (ppte[i] == ARM_PTE_TYPE_FAULT) || ((ppte[i] & ARM_PTE_HINT) == (ptmp & ARM_PTE_HINT))); - /* + /* * If we reach an entry that maps the current pte page, delay updating it until the very end. * Otherwise we might end up making the PTE page read-only, leading to a fault later on in * this function if we manage to outrun the TLB. This can happen on KTRR-enabled devices when * marking segDATACONST read-only. Mappings for this region may straddle a PT page boundary, * so we must also defer assignment of the following PTE. We will assume that if the region * were to require one or more full L3 pages, it would instead use L2 blocks where possible, - * therefore only requiring at most one L3 page at the beginning and one at the end. + * therefore only requiring at most one L3 page at the beginning and one at the end. */ if (kva_active && ((pt_entry_t*)(phystokv(pa)) == ppte)) { - assert(recursive_pte == NULL); - assert(!force_page_granule); + assert(recursive_pte == NULL); + assert(granule & ARM64_GRANULE_ALLOW_BLOCK); recursive_pte = &ppte[i]; recursive_ptmp = ptmp; } else if ((deferred_pte != NULL) && (&ppte[i] == &recursive_pte[1])) { @@ -940,7 +932,7 @@ arm_vm_page_granular_helper(vm_offset_t start, vm_offset_t _end, vm_offset_t va, static void arm_vm_page_granular_prot(vm_offset_t start, unsigned long size, pmap_paddr_t pa_offset, int tte_prot_XN, int pte_prot_APX, int pte_prot_XN, - bool force_page_granule) + unsigned granule) { pt_entry_t *deferred_pte = NULL, deferred_ptmp = 0; vm_offset_t _end = start + size; @@ -950,19 +942,19 @@ arm_vm_page_granular_prot(vm_offset_t start, unsigned long size, pmap_paddr_t pa return; if (align_start > _end) { - arm_vm_page_granular_helper(start, _end, start, pa_offset, pte_prot_APX, pte_prot_XN, force_page_granule, NULL, NULL); + arm_vm_page_granular_helper(start, _end, start, pa_offset, pte_prot_APX, pte_prot_XN, granule, NULL, NULL); return; } - arm_vm_page_granular_helper(start, align_start, start, pa_offset, pte_prot_APX, pte_prot_XN, force_page_granule, &deferred_pte, &deferred_ptmp); + arm_vm_page_granular_helper(start, align_start, start, pa_offset, pte_prot_APX, pte_prot_XN, granule, &deferred_pte, &deferred_ptmp); while ((_end - align_start) >= ARM_TT_L2_SIZE) { - if (force_page_granule) + if (!(granule & ARM64_GRANULE_ALLOW_BLOCK)) { arm_vm_page_granular_helper(align_start, align_start+ARM_TT_L2_SIZE, align_start + 1, pa_offset, - pte_prot_APX, pte_prot_XN, force_page_granule, NULL, NULL); - else { + pte_prot_APX, pte_prot_XN, granule, NULL, NULL); + } else { pmap_paddr_t pa = align_start - gVirtBase + gPhysBase - pa_offset; - assert((pa & ARM_TT_L2_OFFMASK) == 0); + assert((pa & ARM_TT_L2_OFFMASK) == 0); tt_entry_t *tte2; tt_entry_t tmplate; @@ -973,7 +965,7 @@ arm_vm_page_granular_prot(vm_offset_t start, unsigned long size, pmap_paddr_t pa | ARM_TTE_VALID | ARM_TTE_BLOCK_AF | ARM_TTE_BLOCK_NX | ARM_TTE_BLOCK_AP(pte_prot_APX) | ARM_TTE_BLOCK_SH(SH_OUTER_MEMORY) | ARM_TTE_BLOCK_ATTRINDX(CACHE_ATTRINDX_WRITEBACK); - + #if __ARM_KERNEL_PROTECT__ tmplate = tmplate | ARM_TTE_BLOCK_NG; #endif /* __ARM_KERNEL_PROTECT__ */ @@ -987,34 +979,28 @@ arm_vm_page_granular_prot(vm_offset_t start, unsigned long size, pmap_paddr_t pa } if (align_start < _end) - arm_vm_page_granular_helper(align_start, _end, _end, pa_offset, pte_prot_APX, pte_prot_XN, force_page_granule, &deferred_pte, &deferred_ptmp); + arm_vm_page_granular_helper(align_start, _end, _end, pa_offset, pte_prot_APX, pte_prot_XN, granule, &deferred_pte, &deferred_ptmp); if (deferred_pte != NULL) *deferred_pte = deferred_ptmp; } static inline void -arm_vm_page_granular_RNX(vm_offset_t start, unsigned long size, bool force_page_granule) -{ - arm_vm_page_granular_prot(start, size, 0, 1, AP_RONA, 1, force_page_granule); -} - -static inline void -arm_vm_page_granular_ROX(vm_offset_t start, unsigned long size, bool force_page_granule) +arm_vm_page_granular_RNX(vm_offset_t start, unsigned long size, unsigned granule) { - arm_vm_page_granular_prot(start, size, 0, 0, AP_RONA, 0, force_page_granule); + arm_vm_page_granular_prot(start, size, 0, 1, AP_RONA, 1, granule); } static inline void -arm_vm_page_granular_RWNX(vm_offset_t start, unsigned long size, bool force_page_granule) +arm_vm_page_granular_ROX(vm_offset_t start, unsigned long size, unsigned granule) { - arm_vm_page_granular_prot(start, size, 0, 1, AP_RWNA, 1, force_page_granule); + arm_vm_page_granular_prot(start, size, 0, 0, AP_RONA, 0, granule); } static inline void -arm_vm_page_granular_RWX(vm_offset_t start, unsigned long size, bool force_page_granule) +arm_vm_page_granular_RWNX(vm_offset_t start, unsigned long size, unsigned granule) { - arm_vm_page_granular_prot(start, size, 0, 0, AP_RWNA, 0, force_page_granule); + arm_vm_page_granular_prot(start, size, 0, 1, AP_RWNA, 1, granule); } /* used in the chosen/memory-map node, populated by iBoot. */ @@ -1023,7 +1009,6 @@ typedef struct MemoryMapFileInfo { size_t length; } MemoryMapFileInfo; - void arm_vm_prot_init(boot_args * args) { @@ -1037,6 +1022,8 @@ arm_vm_prot_init(boot_args * args) segEXTRADATA = segLOWESTTEXT; segSizeEXTRADATA = 0; + segLOWEST = segLOWESTTEXT; + DTEntry memory_map; MemoryMapFileInfo *trustCacheRange; unsigned int trustCacheRangeSize; @@ -1052,24 +1039,38 @@ arm_vm_prot_init(boot_args * args) segEXTRADATA = phystokv(trustCacheRange->paddr); segSizeEXTRADATA = trustCacheRange->length; - arm_vm_page_granular_RNX(segEXTRADATA, segSizeEXTRADATA, FALSE); + if (segEXTRADATA <= segLOWEST) { + segLOWEST = segEXTRADATA; + } +#if !(DEBUG || DEVELOPMENT) + + + else { + panic("EXTRADATA is in an unexpected place: %#lx > %#lx", segEXTRADATA, segLOWEST); + } +#endif /* !(DEBUG || DEVELOPMENT) */ + + arm_vm_page_granular_RNX(segEXTRADATA, segSizeEXTRADATA, ARM64_GRANULE_ALLOW_BLOCK | ARM64_GRANULE_ALLOW_HINT); + } /* Map coalesced kext TEXT segment RWNX for now */ - arm_vm_page_granular_RWNX(segPRELINKTEXTB, segSizePRELINKTEXT, FALSE); // Refined in OSKext::readPrelinkedExtensions + arm_vm_page_granular_RWNX(segPRELINKTEXTB, segSizePRELINKTEXT, ARM64_GRANULE_ALLOW_BLOCK); // Refined in OSKext::readPrelinkedExtensions /* Map coalesced kext DATA_CONST segment RWNX (could be empty) */ - arm_vm_page_granular_RWNX(segPLKDATACONSTB, segSizePLKDATACONST, FALSE); // Refined in OSKext::readPrelinkedExtensions + arm_vm_page_granular_RWNX(segPLKDATACONSTB, segSizePLKDATACONST, ARM64_GRANULE_ALLOW_BLOCK); // Refined in OSKext::readPrelinkedExtensions - /* Map coalesced kext TEXT_EXEC segment RWX (could be empty) */ - arm_vm_page_granular_ROX(segPLKTEXTEXECB, segSizePLKTEXTEXEC, FALSE); // Refined in OSKext::readPrelinkedExtensions + /* Map coalesced kext TEXT_EXEC segment RX (could be empty) */ + arm_vm_page_granular_ROX(segPLKTEXTEXECB, segSizePLKTEXTEXEC, ARM64_GRANULE_ALLOW_BLOCK | ARM64_GRANULE_ALLOW_HINT); // Refined in OSKext::readPrelinkedExtensions /* if new segments not present, set space between PRELINK_TEXT and xnu TEXT to RWNX * otherwise we no longer expect any space between the coalesced kext read only segments and xnu rosegments */ if (!segSizePLKDATACONST && !segSizePLKTEXTEXEC) { - if (segSizePRELINKTEXT) - arm_vm_page_granular_RWNX(segPRELINKTEXTB + segSizePRELINKTEXT, segTEXTB - (segPRELINKTEXTB + segSizePRELINKTEXT), FALSE); + if (segSizePRELINKTEXT) { + arm_vm_page_granular_RWNX(segPRELINKTEXTB + segSizePRELINKTEXT, segTEXTB - (segPRELINKTEXTB + segSizePRELINKTEXT), + ARM64_GRANULE_ALLOW_BLOCK | ARM64_GRANULE_ALLOW_HINT); + } } else { /* * If we have the new segments, we should still protect the gap between kext @@ -1077,7 +1078,8 @@ arm_vm_prot_init(boot_args * args) * exists. */ if ((segPLKDATACONSTB + segSizePLKDATACONST) < segTEXTB) { - arm_vm_page_granular_RWNX(segPLKDATACONSTB + segSizePLKDATACONST, segTEXTB - (segPLKDATACONSTB + segSizePLKDATACONST), FALSE); + arm_vm_page_granular_RWNX(segPLKDATACONSTB + segSizePLKDATACONST, segTEXTB - (segPLKDATACONSTB + segSizePLKDATACONST), + ARM64_GRANULE_ALLOW_BLOCK | ARM64_GRANULE_ALLOW_HINT); } } @@ -1088,39 +1090,38 @@ arm_vm_prot_init(boot_args * args) * * TEXT segment contains mach headers and other non-executable data. This will become RONX later. */ - arm_vm_page_granular_RNX(segTEXTB, segSizeTEXT, FALSE); + arm_vm_page_granular_RNX(segTEXTB, segSizeTEXT, ARM64_GRANULE_ALLOW_BLOCK | ARM64_GRANULE_ALLOW_HINT); /* Can DATACONST start out and stay RNX? * NO, stuff in this segment gets modified during startup (viz. mac_policy_init()/mac_policy_list) * Make RNX in prot_finalize */ - arm_vm_page_granular_RWNX(segDATACONSTB, segSizeDATACONST, FALSE); + arm_vm_page_granular_RWNX(segDATACONSTB, segSizeDATACONST, ARM64_GRANULE_ALLOW_BLOCK); - /* TEXTEXEC contains read only executable code: becomes ROX in prot_finalize */ - arm_vm_page_granular_RWX(segTEXTEXECB, segSizeTEXTEXEC, FALSE); + arm_vm_page_granular_ROX(segTEXTEXECB, segSizeTEXTEXEC, ARM64_GRANULE_ALLOW_BLOCK | ARM64_GRANULE_ALLOW_HINT); /* DATA segment will remain RWNX */ - arm_vm_page_granular_RWNX(segDATAB, segSizeDATA, FALSE); + arm_vm_page_granular_RWNX(segDATAB, segSizeDATA, ARM64_GRANULE_ALLOW_BLOCK | ARM64_GRANULE_ALLOW_HINT); - arm_vm_page_granular_RWNX(segBOOTDATAB, segSizeBOOTDATA, TRUE); - arm_vm_page_granular_RNX((vm_offset_t)&intstack_low_guard, PAGE_MAX_SIZE, TRUE); - arm_vm_page_granular_RNX((vm_offset_t)&intstack_high_guard, PAGE_MAX_SIZE, TRUE); - arm_vm_page_granular_RNX((vm_offset_t)&excepstack_high_guard, PAGE_MAX_SIZE, TRUE); + arm_vm_page_granular_RWNX(segBOOTDATAB, segSizeBOOTDATA, 0); + arm_vm_page_granular_RNX((vm_offset_t)&intstack_low_guard, PAGE_MAX_SIZE, 0); + arm_vm_page_granular_RNX((vm_offset_t)&intstack_high_guard, PAGE_MAX_SIZE, 0); + arm_vm_page_granular_RNX((vm_offset_t)&excepstack_high_guard, PAGE_MAX_SIZE, 0); - arm_vm_page_granular_ROX(segKLDB, segSizeKLD, FALSE); - arm_vm_page_granular_RWNX(segLINKB, segSizeLINK, FALSE); - arm_vm_page_granular_RWNX(segPLKLINKEDITB, segSizePLKLINKEDIT, FALSE); // Coalesced kext LINKEDIT segment - arm_vm_page_granular_ROX(segLASTB, segSizeLAST, FALSE); // __LAST may be empty, but we cannot assume this + arm_vm_page_granular_ROX(segKLDB, segSizeKLD, ARM64_GRANULE_ALLOW_BLOCK | ARM64_GRANULE_ALLOW_HINT); + arm_vm_page_granular_RWNX(segLINKB, segSizeLINK, ARM64_GRANULE_ALLOW_BLOCK | ARM64_GRANULE_ALLOW_HINT); + arm_vm_page_granular_RWNX(segPLKLINKEDITB, segSizePLKLINKEDIT, ARM64_GRANULE_ALLOW_BLOCK | ARM64_GRANULE_ALLOW_HINT); // Coalesced kext LINKEDIT segment + arm_vm_page_granular_ROX(segLASTB, segSizeLAST, ARM64_GRANULE_ALLOW_BLOCK); // __LAST may be empty, but we cannot assume this - arm_vm_page_granular_RWNX(segPRELINKDATAB, segSizePRELINKDATA, FALSE); // Prelink __DATA for kexts (RW data) + arm_vm_page_granular_RWNX(segPRELINKDATAB, segSizePRELINKDATA, ARM64_GRANULE_ALLOW_BLOCK | ARM64_GRANULE_ALLOW_HINT); // Prelink __DATA for kexts (RW data) if (segSizePLKLLVMCOV > 0) - arm_vm_page_granular_RWNX(segPLKLLVMCOVB, segSizePLKLLVMCOV, FALSE); // LLVM code coverage data + arm_vm_page_granular_RWNX(segPLKLLVMCOVB, segSizePLKLLVMCOV, ARM64_GRANULE_ALLOW_BLOCK | ARM64_GRANULE_ALLOW_HINT); // LLVM code coverage data - arm_vm_page_granular_RWNX(segPRELINKINFOB, segSizePRELINKINFO, FALSE); /* PreLinkInfoDictionary */ + arm_vm_page_granular_RWNX(segPRELINKINFOB, segSizePRELINKINFO, ARM64_GRANULE_ALLOW_BLOCK | ARM64_GRANULE_ALLOW_HINT); /* PreLinkInfoDictionary */ - arm_vm_page_granular_RNX(phystokv(args->topOfKernelData), BOOTSTRAP_TABLE_SIZE, FALSE); // Boot page tables; they should not be mutable. + arm_vm_page_granular_RNX(phystokv(args->topOfKernelData), BOOTSTRAP_TABLE_SIZE, ARM64_GRANULE_ALLOW_BLOCK | ARM64_GRANULE_ALLOW_HINT); // Boot page tables; they should not be mutable. } /* @@ -1152,7 +1153,7 @@ SECURITY_READ_ONLY_LATE(static unsigned int) ptov_index = 0; #define ROUND_TWIG(addr) (((addr) + ARM_TT_TWIG_OFFMASK) & ~(ARM_TT_TWIG_OFFMASK)) static void -arm_vm_physmap_slide(ptov_table_entry *temp_ptov_table, vm_map_address_t physmap_base, vm_map_address_t orig_va, vm_size_t len, int pte_prot_APX, boolean_t force_page_granule) +arm_vm_physmap_slide(ptov_table_entry *temp_ptov_table, vm_map_address_t physmap_base, vm_map_address_t orig_va, vm_size_t len, int pte_prot_APX, unsigned granule) { pmap_paddr_t pa_offset; @@ -1163,7 +1164,7 @@ arm_vm_physmap_slide(ptov_table_entry *temp_ptov_table, vm_map_address_t physmap temp_ptov_table[ptov_index].va = physmap_base; else temp_ptov_table[ptov_index].va = temp_ptov_table[ptov_index - 1].va + temp_ptov_table[ptov_index - 1].len; - if (!force_page_granule) { + if (granule & ARM64_GRANULE_ALLOW_BLOCK) { vm_map_address_t orig_offset = temp_ptov_table[ptov_index].pa & ARM_TT_TWIG_OFFMASK; vm_map_address_t new_offset = temp_ptov_table[ptov_index].va & ARM_TT_TWIG_OFFMASK; if (new_offset < orig_offset) @@ -1173,8 +1174,8 @@ arm_vm_physmap_slide(ptov_table_entry *temp_ptov_table, vm_map_address_t physmap } assert((temp_ptov_table[ptov_index].va & ARM_PGMASK) == 0); temp_ptov_table[ptov_index].len = round_page(len); - pa_offset = temp_ptov_table[ptov_index].va - orig_va; - arm_vm_page_granular_prot(temp_ptov_table[ptov_index].va, temp_ptov_table[ptov_index].len, pa_offset, 1, pte_prot_APX, 1, force_page_granule); + pa_offset = temp_ptov_table[ptov_index].va - orig_va; + arm_vm_page_granular_prot(temp_ptov_table[ptov_index].va, temp_ptov_table[ptov_index].len, pa_offset, 1, pte_prot_APX, 1, granule); ++ptov_index; } @@ -1186,18 +1187,20 @@ arm_vm_physmap_init(boot_args *args, vm_map_address_t physmap_base, vm_map_addre bzero(temp_ptov_table, sizeof(temp_ptov_table)); // Will be handed back to VM layer through ml_static_mfree() in arm_vm_prot_finalize() - arm_vm_physmap_slide(temp_ptov_table, physmap_base, gVirtBase, segEXTRADATA - gVirtBase, AP_RWNA, FALSE); + arm_vm_physmap_slide(temp_ptov_table, physmap_base, gVirtBase, segLOWEST - gVirtBase, AP_RWNA, + ARM64_GRANULE_ALLOW_BLOCK | ARM64_GRANULE_ALLOW_HINT); - arm_vm_page_granular_RWNX(end_kern, phystokv(args->topOfKernelData) - end_kern, FALSE); /* Device Tree, RAM Disk (if present), bootArgs */ + arm_vm_page_granular_RWNX(end_kern, phystokv(args->topOfKernelData) - end_kern, + ARM64_GRANULE_ALLOW_BLOCK | ARM64_GRANULE_ALLOW_HINT); /* Device Tree, RAM Disk (if present), bootArgs */ arm_vm_physmap_slide(temp_ptov_table, physmap_base, (args->topOfKernelData + BOOTSTRAP_TABLE_SIZE - gPhysBase + gVirtBase), - real_avail_end - (args->topOfKernelData + BOOTSTRAP_TABLE_SIZE), AP_RWNA, FALSE); // rest of physmem + real_avail_end - (args->topOfKernelData + BOOTSTRAP_TABLE_SIZE), AP_RWNA, ARM64_GRANULE_ALLOW_BLOCK | ARM64_GRANULE_ALLOW_HINT); // rest of physmem assert((temp_ptov_table[ptov_index - 1].va + temp_ptov_table[ptov_index - 1].len) <= dynamic_memory_begin); // Sort in descending order of segment length. LUT traversal is linear, so largest (most likely used) // segments should be placed earliest in the table to optimize lookup performance. - qsort(temp_ptov_table, PTOV_TABLE_SIZE, sizeof(temp_ptov_table[0]), cmp_ptov_entries); + qsort(temp_ptov_table, PTOV_TABLE_SIZE, sizeof(temp_ptov_table[0]), cmp_ptov_entries); memcpy(ptov_table, temp_ptov_table, sizeof(ptov_table)); } @@ -1222,7 +1225,7 @@ arm_vm_prot_finalize(boot_args * args __unused) * should be immediately followed by XNU's TEXT segment */ - ml_static_mfree(phystokv(gPhysBase), segEXTRADATA - gVirtBase); + ml_static_mfree(phystokv(gPhysBase), segLOWEST - gVirtBase); /* * KTRR support means we will be mucking with these pages and trying to @@ -1233,18 +1236,9 @@ arm_vm_prot_finalize(boot_args * args __unused) ml_static_mfree(segPRELINKTEXTB + segSizePRELINKTEXT, segTEXTB - (segPRELINKTEXTB + segSizePRELINKTEXT)); } - /* - * LowResetVectorBase patching should be done by now, so tighten executable - * protections. - */ - arm_vm_page_granular_ROX(segTEXTEXECB, segSizeTEXTEXEC, FALSE); - /* tighten permissions on kext read only data and code */ - if (segSizePLKDATACONST && segSizePLKTEXTEXEC) { - arm_vm_page_granular_RNX(segPRELINKTEXTB, segSizePRELINKTEXT, FALSE); - arm_vm_page_granular_ROX(segPLKTEXTEXECB, segSizePLKTEXTEXEC, FALSE); - arm_vm_page_granular_RNX(segPLKDATACONSTB, segSizePLKDATACONST, FALSE); - } + arm_vm_page_granular_RNX(segPRELINKTEXTB, segSizePRELINKTEXT, ARM64_GRANULE_ALLOW_BLOCK); + arm_vm_page_granular_RNX(segPLKDATACONSTB, segSizePLKDATACONST, ARM64_GRANULE_ALLOW_BLOCK); cpu_stack_alloc(&BootCpuData); arm64_replace_bootstack(&BootCpuData); @@ -1259,7 +1253,7 @@ arm_vm_prot_finalize(boot_args * args __unused) /* * __LAST,__pinst should no longer be executable. */ - arm_vm_page_granular_RNX(segLASTB, segSizeLAST, FALSE); + arm_vm_page_granular_RNX(segLASTB, segSizeLAST, ARM64_GRANULE_ALLOW_BLOCK); /* * Must wait until all other region permissions are set before locking down DATA_CONST @@ -1268,11 +1262,8 @@ arm_vm_prot_finalize(boot_args * args __unused) */ #endif - arm_vm_page_granular_RNX(segDATACONSTB, segSizeDATACONST, FALSE); + arm_vm_page_granular_RNX(segDATACONSTB, segSizeDATACONST, ARM64_GRANULE_ALLOW_BLOCK); -#ifndef __ARM_L1_PTW__ - FlushPoC_Dcache(); -#endif __builtin_arm_dsb(DSB_ISH); flush_mmu_tlb(); } @@ -1302,7 +1293,16 @@ set_tbi(void) user_tbi = ((tbi & TBI_USER) == TBI_USER); old_tcr = new_tcr = get_tcr(); new_tcr |= (user_tbi) ? TCR_TBI0_TOPBYTE_IGNORED : 0; + +#if !defined(HAS_APPLE_PAC) + /* + * arm_vm_init() runs after rebase_threaded_starts(), so enabling TBI1 + * at this point will break the computed pointer signatures. TBID1 + * could help mitigate this problem, but for now we'll just disable + * kernel TBI if PAC is being used. + */ new_tcr |= (tbi & TBI_KERNEL) ? TCR_TBI1_TOPBYTE_IGNORED : 0; +#endif if (old_tcr != new_tcr) { set_tcr(new_tcr); @@ -1317,19 +1317,8 @@ set_tbi(void) void arm_vm_init(uint64_t memory_size, boot_args * args) { -#if !__ARM64_TWO_LEVEL_PMAP__ vm_map_address_t va_l1, va_l1_end; tt_entry_t *cpu_l1_tte; -#else - /* - * If we are using two level page tables, rather than the - * 3 level page tables that xnu defaults to for ARM64, - * then a great deal of the code in this path becomes - * redundant. As a result, most of the logic having to - * do with L1 pages will be excluded from such - * configurations in this function. - */ -#endif vm_map_address_t va_l2, va_l2_end; tt_entry_t *cpu_l2_tte; pmap_paddr_t boot_ttep; @@ -1345,8 +1334,22 @@ arm_vm_init(uint64_t memory_size, boot_args * args) */ gVirtBase = args->virtBase; gPhysBase = args->physBase; - gPhysSize = args->memSize; - mem_size = args->memSize; +#if KASAN + real_phys_size = args->memSize + (shadow_ptop - shadow_pbase); +#else + real_phys_size = args->memSize; +#endif + /* + * Ensure the physical region we specify for the VM to manage ends on a + * software page boundary. Note that the software page size (PAGE_SIZE) + * may be a multiple of the hardware page size specified in ARM_PGBYTES. + * We must round the reported memory size down to the nearest PAGE_SIZE + * boundary to ensure the VM does not try to manage a page it does not + * completely own. The KASAN shadow region, if present, is managed entirely + * in units of the hardware page size and should not need similar treatment. + */ + gPhysSize = mem_size = ((gPhysBase + args->memSize) & ~PAGE_MASK) - gPhysBase; + if ((memory_size != 0) && (mem_size > memory_size)) mem_size = memory_size; if (mem_size >= ((VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS) / 4)) @@ -1403,21 +1406,14 @@ arm_vm_init(uint64_t memory_size, boot_args * args) /* * Initialize l1 page table page */ -#if __ARM64_TWO_LEVEL_PMAP__ - /* - * If we're using a two level page table, we still need to - * set the cpu_ttep to avail_start, as this will be the root - * of our page table regardless of how many levels we are - * using. - */ -#endif cpu_tte = (tt_entry_t *)alloc_ptpage(TRUE); cpu_ttep = kvtophys((vm_offset_t)cpu_tte); bzero(cpu_tte, ARM_PGBYTES); avail_end = gPhysBase + mem_size; + assert(!(avail_end & PAGE_MASK)); #if KASAN - real_avail_end = avail_end + (shadow_ptop - shadow_pbase); + real_avail_end = gPhysBase + real_phys_size; #else real_avail_end = avail_end; #endif @@ -1429,9 +1425,8 @@ arm_vm_init(uint64_t memory_size, boot_args * args) * * the so called physical aperture should be statically mapped */ -#if !__ARM64_TWO_LEVEL_PMAP__ va_l1 = gVirtBase; - va_l1_end = dynamic_memory_begin; + va_l1_end = dynamic_memory_begin; cpu_l1_tte = cpu_tte + ((va_l1 & ARM_TT_L1_INDEX_MASK) >> ARM_TT_L1_SHIFT); while (va_l1 < va_l1_end) { @@ -1450,7 +1445,6 @@ arm_vm_init(uint64_t memory_size, boot_args * args) va_l1 += ARM_TT_L1_SIZE; cpu_l1_tte++; } -#endif #if __ARM_KERNEL_PROTECT__ /* Expand the page tables to prepare for the EL0 mappings. */ @@ -1519,14 +1513,9 @@ arm_vm_init(uint64_t memory_size, boot_args * args) * cover this address range: * LOW_GLOBAL_BASE_ADDRESS + 2MB */ -#if __ARM64_TWO_LEVEL_PMAP__ - va_l2 = LOW_GLOBAL_BASE_ADDRESS; - cpu_l2_tte = cpu_tte + ((va_l2 & ARM_TT_L2_INDEX_MASK) >> ARM_TT_L2_SHIFT); -#else va_l1 = va_l2 = LOW_GLOBAL_BASE_ADDRESS; cpu_l1_tte = cpu_tte + ((va_l1 & ARM_TT_L1_INDEX_MASK) >> ARM_TT_L1_SHIFT); cpu_l2_tte = ((tt_entry_t *) phystokv(((*cpu_l1_tte) & ARM_TTE_TABLE_MASK))) + ((va_l2 & ARM_TT_L2_INDEX_MASK) >> ARM_TT_L2_SHIFT); -#endif ptpage_vaddr = alloc_ptpage(TRUE); *cpu_l2_tte = (kvtophys(ptpage_vaddr) & ARM_TTE_TABLE_MASK) | ARM_TTE_TYPE_TABLE | ARM_TTE_VALID | ARM_TTE_TABLE_PXN | ARM_TTE_TABLE_XN; bzero((void *)ptpage_vaddr, ARM_PGBYTES); @@ -1536,7 +1525,6 @@ arm_vm_init(uint64_t memory_size, boot_args * args) * cover this address range: * KERNEL_DYNAMIC_ADDR - VM_MAX_KERNEL_ADDRESS */ -#if !__ARM64_TWO_LEVEL_PMAP__ va_l1 = dynamic_memory_begin; va_l1_end = VM_MAX_KERNEL_ADDRESS; cpu_l1_tte = cpu_tte + ((va_l1 & ARM_TT_L1_INDEX_MASK) >> ARM_TT_L1_SHIFT); @@ -1557,20 +1545,26 @@ arm_vm_init(uint64_t memory_size, boot_args * args) va_l1 += ARM_TT_L1_SIZE; cpu_l1_tte++; } -#endif #if KASAN /* record the extent of the physmap */ physmap_vbase = physmap_base; physmap_vtop = static_memory_end; kasan_init(); -#endif +#endif /* KASAN */ + +#if MONOTONIC + mt_early_init(); +#endif /* MONOTONIC */ set_tbi(); - set_mmu_ttb(invalid_ttep & TTBR_BADDR_MASK); arm_vm_physmap_init(args, physmap_base, dynamic_memory_begin); set_mmu_ttb_alternate(cpu_ttep & TTBR_BADDR_MASK); + + + set_mmu_ttb(invalid_ttep & TTBR_BADDR_MASK); + flush_mmu_tlb(); kva_active = TRUE; // global table pointers may need to be different due to physical aperture remapping @@ -1600,7 +1594,6 @@ arm_vm_init(uint64_t memory_size, boot_args * args) mem_segments = (mem_size + 0x0FFFFFFF) >> 28; -#if !__ARM64_TWO_LEVEL_PMAP__ va_l1 = dynamic_memory_begin; va_l1_end = va_l1 + ((2 + (mem_segments * 10)) << 20); va_l1_end += round_page(args->Video.v_height * args->Video.v_rowBytes); @@ -1620,13 +1613,6 @@ arm_vm_init(uint64_t memory_size, boot_args * args) } cpu_l2_tte = ((tt_entry_t *) phystokv(((*cpu_l1_tte) & ARM_TTE_TABLE_MASK))) + ((va_l2 & ARM_TT_L2_INDEX_MASK) >> ARM_TT_L2_SHIFT); -#else - va_l2 = dynamic_memory_begin; - va_l2_end = va_l2 + ((2 + (mem_segments * 10)) << 20); - va_l2_end += round_page(args->Video.v_height * args->Video.v_rowBytes); - va_l2_end = (va_l2_end + 0x00000000007FFFFFULL) & 0xFFFFFFFFFF800000ULL; - cpu_l2_tte = cpu_tte + ((va_l2 & ARM_TT_L2_INDEX_MASK) >> ARM_TT_L2_SHIFT); -#endif while (va_l2 < va_l2_end) { pt_entry_t * ptp; @@ -1636,25 +1622,23 @@ arm_vm_init(uint64_t memory_size, boot_args * args) ptp = (pt_entry_t *) alloc_ptpage(FALSE); ptp_phys = (pmap_paddr_t)kvtophys((vm_offset_t)ptp); - pmap_init_pte_page(kernel_pmap, ptp, va_l2, 3, TRUE); + pmap_init_pte_page(kernel_pmap, ptp, va_l2, 3, TRUE, TRUE); *cpu_l2_tte = (pa_to_tte (ptp_phys)) | ARM_TTE_TYPE_TABLE | ARM_TTE_VALID | ARM_DYNAMIC_TABLE_XN; va_l2 += ARM_TT_L2_SIZE; cpu_l2_tte++; }; -#if !__ARM64_TWO_LEVEL_PMAP__ + va_l1 = va_l2_end; cpu_l1_tte++; } -#endif /* * Initialize l3 page table pages : * cover this address range: * (VM_MAX_KERNEL_ADDRESS & CPUWINDOWS_BASE_MASK) - VM_MAX_KERNEL_ADDRESS */ -#if !__ARM64_TWO_LEVEL_PMAP__ va_l1 = VM_MAX_KERNEL_ADDRESS & CPUWINDOWS_BASE_MASK; va_l1_end = VM_MAX_KERNEL_ADDRESS; @@ -1672,11 +1656,6 @@ arm_vm_init(uint64_t memory_size, boot_args * args) } cpu_l2_tte = ((tt_entry_t *) phystokv(((*cpu_l1_tte) & ARM_TTE_TABLE_MASK))) + ((va_l2 & ARM_TT_L2_INDEX_MASK) >> ARM_TT_L2_SHIFT); -#else - va_l2 = VM_MAX_KERNEL_ADDRESS & CPUWINDOWS_BASE_MASK; - va_l2_end = VM_MAX_KERNEL_ADDRESS; - cpu_l2_tte = cpu_tte + ((va_l2 & ARM_TT_L2_INDEX_MASK) >> ARM_TT_L2_SHIFT); -#endif while (va_l2 < va_l2_end) { pt_entry_t * ptp; @@ -1686,18 +1665,17 @@ arm_vm_init(uint64_t memory_size, boot_args * args) ptp = (pt_entry_t *) alloc_ptpage(FALSE); ptp_phys = (pmap_paddr_t)kvtophys((vm_offset_t)ptp); - pmap_init_pte_page(kernel_pmap, ptp, va_l2, 3, TRUE); + pmap_init_pte_page(kernel_pmap, ptp, va_l2, 3, TRUE, TRUE); *cpu_l2_tte = (pa_to_tte (ptp_phys)) | ARM_TTE_TYPE_TABLE | ARM_TTE_VALID | ARM_DYNAMIC_TABLE_XN; va_l2 += ARM_TT_L2_SIZE; cpu_l2_tte++; }; -#if !__ARM64_TWO_LEVEL_PMAP__ + va_l1 = va_l2_end; cpu_l1_tte++; } -#endif #if __ARM64_PMAP_SUBPAGE_L1__ && __ARM_16K_PG__ /* diff --git a/osfmk/arm64/asm.h b/osfmk/arm64/asm.h index fb2c1ea8a..3bced40de 100644 --- a/osfmk/arm64/asm.h +++ b/osfmk/arm64/asm.h @@ -152,6 +152,11 @@ movk $0, #((($1) >> 00) & 0x000000000000FFFF), lsl #00 .endmacro +.macro MOV32 + movz $0, #((($1) >> 16) & 0x000000000000FFFF), lsl #16 + movk $0, #((($1) >> 00) & 0x000000000000FFFF), lsl #00 +.endmacro + .macro ARM64_STACK_PROLOG #if __has_feature(ptrauth_returns) pacibsp @@ -178,7 +183,7 @@ #ifdef XNU_KERNEL_PRIVATE .macro PANIC_UNIMPLEMENTED - bl _panic_unimplemented + bl EXT(panic_unimplemented) .endmacro #endif diff --git a/osfmk/arm64/bsd_arm64.c b/osfmk/arm64/bsd_arm64.c index f40b6bfca..0a76b1caf 100644 --- a/osfmk/arm64/bsd_arm64.c +++ b/osfmk/arm64/bsd_arm64.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2007 Apple Inc. All rights reserved. + * Copyright (c) 2019 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -143,7 +143,17 @@ mach_syscall(struct arm_saved_state *state) { kern_return_t retval; mach_call_t mach_call; - struct mach_call_args args = { 0, 0, 0, 0, 0, 0, 0, 0, 0 }; + struct mach_call_args args = { + .arg1 = 0, + .arg2 = 0, + .arg3 = 0, + .arg4 = 0, + .arg5 = 0, + .arg6 = 0, + .arg7 = 0, + .arg8 = 0, + .arg9 = 0 + }; int call_number = get_saved_state_svc_number(state); int64_t exc_code; int argc; diff --git a/osfmk/arm64/caches_asm.s b/osfmk/arm64/caches_asm.s index a673abaf3..87caca6e8 100644 --- a/osfmk/arm64/caches_asm.s +++ b/osfmk/arm64/caches_asm.s @@ -43,9 +43,11 @@ .globl EXT(invalidate_mmu_icache) LEXT(InvalidatePoU_Icache) LEXT(invalidate_mmu_icache) + dsb sy ic ialluis // Invalidate icache dsb sy isb sy +L_imi_done: ret /* @@ -57,6 +59,10 @@ LEXT(invalidate_mmu_icache) .align 2 .globl EXT(InvalidatePoU_IcacheRegion) LEXT(InvalidatePoU_IcacheRegion) + ARM64_STACK_PROLOG + PUSH_FRAME + bl EXT(CleanPoU_DcacheRegion) +#if __ARM_IC_NOALIAS_ICACHE__ mov x9, #((1< kernel. + * One of COPYIO_IN or COPYIO_OUT should always be specified. + * + * @const COPYIO_OUT + * The copy is kernel -> user + * One of COPYIO_IN or COPYIO_OUT should always be specified. + * + * @const COPYIO_ALLOW_KERNEL_TO_KERNEL + * The "user_address" is allowed to be in the VA space of the kernel. + * + * @const COPYIO_VALIDATE_USER_ONLY + * There isn't really a kernel address used, and only the user address + * needs to be validated. + * + * @const COPYIO_ATOMIC + * The copyio operation is atomic, ensure that it is properly aligned. + */ +__options_decl(copyio_flags_t, uint32_t, { + COPYIO_IN = 0x0001, + COPYIO_OUT = 0x0002, + COPYIO_ALLOW_KERNEL_TO_KERNEL = 0x0004, + COPYIO_VALIDATE_USER_ONLY = 0x0008, + COPYIO_ATOMIC = 0x0010, +}); static inline void user_access_enable(void) { #if __ARM_PAN_AVAILABLE__ + assert(__builtin_arm_rsr("pan") != 0); __builtin_arm_wsr("pan", 0); #endif /* __ARM_PAN_AVAILABLE__ */ } @@ -70,78 +99,94 @@ user_access_disable(void) #endif /* __ARM_PAN_AVAILABLE__ */ } +/* + * Copy sizes bigger than this value will cause a kernel panic. + * + * Yes, this is an arbitrary fixed limit, but it's almost certainly + * a programming error to be copying more than this amount between + * user and wired kernel memory in a single invocation on this + * platform. + */ +const int copysize_limit_panic = (64 * 1024 * 1024); + +static inline bool +is_kernel_to_kernel_copy() +{ + return current_thread()->map->pmap == kernel_pmap; +} + +/* + * Validate the arguments to copy{in,out} on this platform. + * + * Returns EXDEV when the current thread pmap is the kernel's + * which is non fatal for certain routines. + */ static int -copyio(copyio_type_t copytype, const char *src, char *dst, - vm_size_t nbytes, vm_size_t *lencopied) +copy_validate(const user_addr_t user_addr, uintptr_t kernel_addr, + vm_size_t nbytes, copyio_flags_t flags) { - int result = 0; - vm_size_t bytes_copied = 0; - vm_size_t kernel_buf_size = 0; - void * kernel_addr = NULL; + thread_t self = current_thread(); - /* Reject TBI addresses */ - if (copytype == COPYIO_OUT) { - if ((uintptr_t)dst & TBI_MASK) { - return EINVAL; - } - } else { - if ((uintptr_t)src & TBI_MASK) { + user_addr_t user_addr_last; + uintptr_t kernel_addr_last; + + if (__improbable(nbytes > copysize_limit_panic)) { + panic("%s(%p, %p, %lu) - transfer too large", __func__, + (void *)user_addr, (void *)kernel_addr, nbytes); + } + + if (__improbable((user_addr < vm_map_min(self->map)) || + os_add_overflow(user_addr, nbytes, &user_addr_last) || + (user_addr_last > vm_map_max(self->map)))) { + return EFAULT; + } + + if (flags & COPYIO_ATOMIC) { + if (__improbable(user_addr & (nbytes - 1))) { return EINVAL; } } - if (__probable(copyio_zalloc_check)) { - if (copytype == COPYIO_IN || copytype == COPYIO_INSTR || copytype == COPYIO_IN_WORD) { - kernel_addr = (void*)dst; - } else if (copytype == COPYIO_OUT) { - kernel_addr = (void*)(uintptr_t)src; + if ((flags & COPYIO_VALIDATE_USER_ONLY) == 0) { + if (__improbable((kernel_addr < VM_MIN_KERNEL_ADDRESS) || + os_add_overflow(kernel_addr, nbytes, &kernel_addr_last) || + (kernel_addr_last > VM_MAX_KERNEL_ADDRESS))) { + panic("%s(%p, %p, %lu) - kaddr not in kernel", __func__, + (void *)user_addr, (void *)kernel_addr, nbytes); } - if (kernel_addr) { - kernel_buf_size = zone_element_size(kernel_addr, NULL); - } - if (__improbable(kernel_buf_size && kernel_buf_size < nbytes)) { - panic("copyio: kernel buffer %p has size %lu < nbytes %lu", kernel_addr, kernel_buf_size, nbytes); + } + + if (is_kernel_to_kernel_copy()) { + if (__improbable((flags & COPYIO_ALLOW_KERNEL_TO_KERNEL) == 0)) { + return EFAULT; } + return EXDEV; } -#if KASAN - /* For user copies, asan-check the kernel-side buffer */ - if (copytype == COPYIO_IN || copytype == COPYIO_INSTR || copytype == COPYIO_IN_WORD) { - __asan_storeN((uintptr_t)dst, nbytes); - } else if (copytype == COPYIO_OUT) { - __asan_loadN((uintptr_t)src, nbytes); + if (__improbable(user_addr & TBI_MASK)) { + return EINVAL; } -#endif - user_access_enable(); + if ((flags & COPYIO_VALIDATE_USER_ONLY) == 0) { + if (__probable(copyio_zalloc_check)) { + vm_size_t kernel_buf_size = zone_element_size((void *)kernel_addr, NULL); + if (__improbable(kernel_buf_size && kernel_buf_size < nbytes)) { + panic("copyio_preflight: kernel buffer 0x%lx has size %lu < nbytes %lu", + kernel_addr, kernel_buf_size, nbytes); + } + } - /* Select copy routines based on direction: - * COPYIO_IN - Use unprivileged loads to read from user address - * COPYIO_OUT - Use unprivleged stores to write to user address - */ - - switch (copytype) { - case COPYIO_IN: - result = _bcopyin(src, dst, nbytes); - break; - case COPYIO_INSTR: - result = _bcopyinstr(src, dst, nbytes, &bytes_copied); - if (result != EFAULT) { - *lencopied = bytes_copied; +#if KASAN + /* For user copies, asan-check the kernel-side buffer */ + if (flags & COPYIO_IN) { + __asan_storeN(kernel_addr, nbytes); + } else { + __asan_loadN(kernel_addr, nbytes); + kasan_check_uninitialized((vm_address_t)kernel_addr, nbytes); } - break; - case COPYIO_IN_WORD: - result = _copyin_word(src, (uint64_t *)(uintptr_t)dst, nbytes); - break; - case COPYIO_OUT: - result = _bcopyout(src, dst, nbytes); - break; - default: - result = EINVAL; +#endif } - - user_access_disable(); - return result; + return 0; } int @@ -165,67 +210,123 @@ copyin(const user_addr_t user_addr, void *kernel_addr, vm_size_t nbytes) { int result; - if (nbytes == 0) { + if (__improbable(nbytes == 0)) { return 0; } - result = copyin_validate(user_addr, (uintptr_t)kernel_addr, nbytes); - if (result) { + result = copy_validate(user_addr, (uintptr_t)kernel_addr, nbytes, + COPYIO_IN | COPYIO_ALLOW_KERNEL_TO_KERNEL); + if (result == EXDEV) { + return copyin_kern(user_addr, kernel_addr, nbytes); + } + if (__improbable(result)) { return result; } - if (current_thread()->map->pmap == kernel_pmap) { - return copyin_kern(user_addr, kernel_addr, nbytes); - } else { - return copyio(COPYIO_IN, (const char *)(uintptr_t)user_addr, kernel_addr, nbytes, NULL); - } + user_access_enable(); + result = _bcopyin((const char *)user_addr, kernel_addr, nbytes); + user_access_disable(); + return result; } /* - * copyin_word - * Read an aligned value from userspace as a single memory transaction. - * This function supports userspace synchronization features + * copy{in,out}_atomic{32,64} + * Read or store an aligned value from userspace as a single memory transaction. + * These functions support userspace synchronization features */ int -copyin_word(const user_addr_t user_addr, uint64_t *kernel_addr, vm_size_t nbytes) +copyin_atomic32(const user_addr_t user_addr, uint32_t *kernel_addr) { - int result; + int result = copy_validate(user_addr, (uintptr_t)kernel_addr, 4, + COPYIO_IN | COPYIO_ATOMIC); + if (__improbable(result)) { + return result; + } + user_access_enable(); + result = _copyin_atomic32((const char *)user_addr, kernel_addr); + user_access_disable(); + return result; +} - /* Verify sizes */ - if ((nbytes != 4) && (nbytes != 8)) { - return EINVAL; +int +copyin_atomic32_wait_if_equals(const user_addr_t user_addr, uint32_t value) +{ + int result = copy_validate(user_addr, 0, 4, + COPYIO_OUT | COPYIO_ATOMIC | COPYIO_VALIDATE_USER_ONLY); + if (__improbable(result)) { + return result; } + user_access_enable(); + result = _copyin_atomic32_wait_if_equals((const char *)user_addr, value); + user_access_disable(); + return result; +} - /* Test alignment */ - if (user_addr & (nbytes - 1)) { - return EINVAL; +int +copyin_atomic64(const user_addr_t user_addr, uint64_t *kernel_addr) +{ + int result = copy_validate(user_addr, (uintptr_t)kernel_addr, 8, + COPYIO_IN | COPYIO_ATOMIC); + if (__improbable(result)) { + return result; } + user_access_enable(); + result = _copyin_atomic64((const char *)user_addr, kernel_addr); + user_access_disable(); + return result; +} - result = copyin_validate(user_addr, (uintptr_t)kernel_addr, nbytes); - if (result) { +int +copyout_atomic32(uint32_t value, user_addr_t user_addr) +{ + int result = copy_validate(user_addr, 0, 4, + COPYIO_OUT | COPYIO_ATOMIC | COPYIO_VALIDATE_USER_ONLY); + if (__improbable(result)) { return result; } + user_access_enable(); + result = _copyout_atomic32(value, (const char *)user_addr); + user_access_disable(); + return result; +} - return copyio(COPYIO_IN_WORD, (const char *)user_addr, (char *)(uintptr_t)kernel_addr, nbytes, NULL); +int +copyout_atomic64(uint64_t value, user_addr_t user_addr) +{ + int result = copy_validate(user_addr, 0, 8, + COPYIO_OUT | COPYIO_ATOMIC | COPYIO_VALIDATE_USER_ONLY); + if (__improbable(result)) { + return result; + } + user_access_enable(); + result = _copyout_atomic64(value, (const char *)user_addr); + user_access_disable(); + return result; } int copyinstr(const user_addr_t user_addr, char *kernel_addr, vm_size_t nbytes, vm_size_t *lencopied) { int result; + vm_size_t bytes_copied = 0; *lencopied = 0; - if (nbytes == 0) { + if (__improbable(nbytes == 0)) { return ENAMETOOLONG; } - result = copyin_validate(user_addr, (uintptr_t)kernel_addr, nbytes); - - if (result) { + result = copy_validate(user_addr, (uintptr_t)kernel_addr, nbytes, COPYIO_IN); + if (__improbable(result)) { return result; } - - return copyio(COPYIO_INSTR, (const char *)(uintptr_t)user_addr, kernel_addr, nbytes, lencopied); + user_access_enable(); + result = _bcopyinstr((const char *)user_addr, kernel_addr, nbytes, + &bytes_copied); + user_access_disable(); + if (result != EFAULT) { + *lencopied = bytes_copied; + } + return result; } int @@ -237,69 +338,26 @@ copyout(const void *kernel_addr, user_addr_t user_addr, vm_size_t nbytes) return 0; } - result = copyout_validate((uintptr_t)kernel_addr, user_addr, nbytes); - if (result) { - return result; - } - - if (current_thread()->map->pmap == kernel_pmap) { + result = copy_validate(user_addr, (uintptr_t)kernel_addr, nbytes, + COPYIO_OUT | COPYIO_ALLOW_KERNEL_TO_KERNEL); + if (result == EXDEV) { return copyout_kern(kernel_addr, user_addr, nbytes); - } else { - return copyio(COPYIO_OUT, kernel_addr, (char *)(uintptr_t)user_addr, nbytes, NULL); } + if (__improbable(result)) { + return result; + } + user_access_enable(); + result = _bcopyout(kernel_addr, (char *)user_addr, nbytes); + user_access_disable(); + return result; } - -/* - * Copy sizes bigger than this value will cause a kernel panic. - * - * Yes, this is an arbitrary fixed limit, but it's almost certainly - * a programming error to be copying more than this amount between - * user and wired kernel memory in a single invocation on this - * platform. - */ -const int copysize_limit_panic = (64 * 1024 * 1024); - -/* - * Validate the arguments to copy{in,out} on this platform. - */ -static int -copy_validate(const user_addr_t user_addr, - uintptr_t kernel_addr, vm_size_t nbytes) +int +copyoutstr_prevalidate(const void *__unused kaddr, user_addr_t __unused uaddr, size_t __unused len) { - uintptr_t kernel_addr_last = kernel_addr + nbytes; - - if (__improbable(kernel_addr < VM_MIN_KERNEL_ADDRESS || - kernel_addr > VM_MAX_KERNEL_ADDRESS || - kernel_addr_last < kernel_addr || - kernel_addr_last > VM_MAX_KERNEL_ADDRESS)) { - panic("%s(%p, %p, %lu) - kaddr not in kernel", __func__, - (void *)user_addr, (void *)kernel_addr, nbytes); - } - - user_addr_t user_addr_last = user_addr + nbytes; - - if (__improbable((user_addr_last < user_addr) || ((user_addr + nbytes) > vm_map_max(current_thread()->map)) || - (user_addr < vm_map_min(current_thread()->map)))) { + if (__improbable(is_kernel_to_kernel_copy())) { return EFAULT; } - if (__improbable(nbytes > copysize_limit_panic)) { - panic("%s(%p, %p, %lu) - transfer too large", __func__, - (void *)user_addr, (void *)kernel_addr, nbytes); - } - return 0; } - -int -copyin_validate(const user_addr_t ua, uintptr_t ka, vm_size_t nbytes) -{ - return copy_validate(ua, ka, nbytes); -} - -int -copyout_validate(uintptr_t ka, const user_addr_t ua, vm_size_t nbytes) -{ - return copy_validate(ua, ka, nbytes); -} diff --git a/osfmk/arm64/cpu.c b/osfmk/arm64/cpu.c index 483d4673b..2360e6982 100644 --- a/osfmk/arm64/cpu.c +++ b/osfmk/arm64/cpu.c @@ -84,10 +84,10 @@ extern void exc_vectors_table; extern void __attribute__((noreturn)) arm64_prepare_for_sleep(void); extern void arm64_force_wfi_clock_gate(void); -#if (defined(APPLECYCLONE) || defined(APPLETYPHOON)) -// CPU1 Stuck in WFIWT Because of MMU Prefetch -extern void cyclone_typhoon_prepare_for_wfi(void); -extern void cyclone_typhoon_return_from_wfi(void); +#if defined(APPLETYPHOON) +// +extern void typhoon_prepare_for_wfi(void); +extern void typhoon_return_from_wfi(void); #endif @@ -116,6 +116,8 @@ static uint64_t wfi_delay = 0; #endif /* DEVELOPMENT || DEBUG */ +static bool idle_wfe_to_deadline = false; + #if __ARM_GLOBAL_SLEEP_BIT__ volatile boolean_t arm64_stall_sleep = TRUE; #endif @@ -136,6 +138,7 @@ static boolean_t coresight_debug_enabled = FALSE; #if defined(CONFIG_XNUPOST) void arm64_ipi_test_callback(void *); +void arm64_immediate_ipi_test_callback(void *); void arm64_ipi_test_callback(void *parm) @@ -148,12 +151,23 @@ arm64_ipi_test_callback(void *parm) *ipi_test_data = cpu_data->cpu_number; } -uint64_t arm64_ipi_test_data[MAX_CPUS]; +void +arm64_immediate_ipi_test_callback(void *parm) +{ + volatile uint64_t *ipi_test_data = parm; + cpu_data_t *cpu_data; + + cpu_data = getCpuDatap(); + + *ipi_test_data = cpu_data->cpu_number + MAX_CPUS; +} + +uint64_t arm64_ipi_test_data[MAX_CPUS * 2]; void arm64_ipi_test() { - volatile uint64_t *ipi_test_data; + volatile uint64_t *ipi_test_data, *immediate_ipi_test_data; uint32_t timeout_ms = 100; uint64_t then, now, delta; int current_cpu_number = getCpuDatap()->cpu_number; @@ -169,19 +183,34 @@ arm64_ipi_test() for (unsigned int i = 0; i < MAX_CPUS; ++i) { ipi_test_data = &arm64_ipi_test_data[i]; + immediate_ipi_test_data = &arm64_ipi_test_data[i + MAX_CPUS]; *ipi_test_data = ~i; kern_return_t error = cpu_xcall((int)i, (void *)arm64_ipi_test_callback, (void *)(uintptr_t)ipi_test_data); if (error != KERN_SUCCESS) { panic("CPU %d was unable to IPI CPU %u: error %d", current_cpu_number, i, error); } + while ((error = cpu_immediate_xcall((int)i, (void *)arm64_immediate_ipi_test_callback, + (void *)(uintptr_t)immediate_ipi_test_data)) == KERN_ALREADY_WAITING) { + now = mach_absolute_time(); + absolutetime_to_nanoseconds(now - then, &delta); + if ((delta / NSEC_PER_MSEC) > timeout_ms) { + panic("CPU %d was unable to immediate-IPI CPU %u within %dms", current_cpu_number, i, timeout_ms); + } + } + + if (error != KERN_SUCCESS) { + panic("CPU %d was unable to immediate-IPI CPU %u: error %d", current_cpu_number, i, error); + } + then = mach_absolute_time(); - while (*ipi_test_data != i) { + while ((*ipi_test_data != i) || (*immediate_ipi_test_data != (i + MAX_CPUS))) { now = mach_absolute_time(); absolutetime_to_nanoseconds(now - then, &delta); if ((delta / NSEC_PER_MSEC) > timeout_ms) { - panic("CPU %d tried to IPI CPU %d but didn't get correct response within %dms, respose: %llx", current_cpu_number, i, timeout_ms, *ipi_test_data); + panic("CPU %d tried to IPI CPU %d but didn't get correct responses within %dms, responses: %llx, %llx", + current_cpu_number, i, timeout_ms, *ipi_test_data, *immediate_ipi_test_data); } } } @@ -271,7 +300,29 @@ cpu_sleep(void) CleanPoC_Dcache(); + /* This calls: + * + * IOCPURunPlatformQuiesceActions when sleeping the boot cpu + * ml_arm_sleep() on all CPUs + * + * It does not return. + */ PE_cpu_machine_quiesce(cpu_data_ptr->cpu_id); + /*NOTREACHED*/ +} + +/* + * Routine: cpu_interrupt_is_pending + * Function: Returns the value of ISR. Due to how this register is + * is implemented, this returns 0 if there are no + * interrupts pending, so it can be used as a boolean test. + */ +static int +cpu_interrupt_is_pending(void) +{ + uint64_t isr_value; + isr_value = __builtin_arm_rsr64("ISR_EL1"); + return (int)isr_value; } /* @@ -287,9 +338,20 @@ cpu_idle(void) if ((!idle_enable) || (cpu_data_ptr->cpu_signal & SIGPdisabled)) { Idle_load_context(); } + if (!SetIdlePop()) { + /* If a deadline is pending, wait for it to elapse. */ + if (idle_wfe_to_deadline) { + if (arm64_wfe_allowed()) { + while (!cpu_interrupt_is_pending()) { + __builtin_arm_wfe(); + } + } + } + Idle_load_context(); } + lastPop = cpu_data_ptr->rtcPop; pmap_switch_user_ttb(kernel_pmap); @@ -335,16 +397,16 @@ cpu_idle(void) } #endif /* DEVELOPMENT || DEBUG */ -#if defined(APPLECYCLONE) || defined(APPLETYPHOON) +#if defined(APPLETYPHOON) // CPU1 Stuck in WFIWT Because of MMU Prefetch - cyclone_typhoon_prepare_for_wfi(); + typhoon_prepare_for_wfi(); #endif __builtin_arm_dsb(DSB_SY); __builtin_arm_wfi(); -#if defined(APPLECYCLONE) || defined(APPLETYPHOON) +#if defined(APPLETYPHOON) // CPU1 Stuck in WFIWT Because of MMU Prefetch - cyclone_typhoon_return_from_wfi(); + typhoon_return_from_wfi(); #endif #if DEVELOPMENT || DEBUG @@ -471,7 +533,9 @@ cpu_init(void) cdp->cpu_stat.irq_ex_cnt_wake = 0; cdp->cpu_stat.ipi_cnt_wake = 0; cdp->cpu_stat.timer_cnt_wake = 0; +#if MONOTONIC cdp->cpu_stat.pmi_cnt_wake = 0; +#endif /* MONOTONIC */ cdp->cpu_running = TRUE; cdp->cpu_sleep_token_last = cdp->cpu_sleep_token; cdp->cpu_sleep_token = 0x0UL; @@ -517,11 +581,16 @@ cpu_stack_alloc(cpu_data_t *cpu_data_ptr) void cpu_data_free(cpu_data_t *cpu_data_ptr) { - if (cpu_data_ptr == &BootCpuData) { + if ((cpu_data_ptr == NULL) || (cpu_data_ptr == &BootCpuData)) { return; } cpu_processor_free( cpu_data_ptr->cpu_processor); + if (CpuDataEntries[cpu_data_ptr->cpu_number].cpu_data_vaddr == cpu_data_ptr) { + CpuDataEntries[cpu_data_ptr->cpu_number].cpu_data_vaddr = NULL; + CpuDataEntries[cpu_data_ptr->cpu_number].cpu_data_paddr = 0; + __builtin_arm_dmb(DMB_ISH); // Ensure prior stores to cpu array are visible + } (kfree)((void *)(cpu_data_ptr->intstack_top - INTSTACK_SIZE), INTSTACK_SIZE); (kfree)((void *)(cpu_data_ptr->excepstack_top - EXCEPSTACK_SIZE), EXCEPSTACK_SIZE); kmem_free(kernel_map, (vm_offset_t)cpu_data_ptr, sizeof(cpu_data_t)); @@ -561,12 +630,6 @@ cpu_data_init(cpu_data_t *cpu_data_ptr) cpu_data_ptr->cpu_signal = SIGPdisabled; -#if DEBUG || DEVELOPMENT - cpu_data_ptr->failed_xcall = NULL; - cpu_data_ptr->failed_signal = 0; - cpu_data_ptr->failed_signal_count = 0; -#endif - cpu_data_ptr->cpu_get_fiq_handler = NULL; cpu_data_ptr->cpu_tbd_hardware_addr = NULL; cpu_data_ptr->cpu_tbd_hardware_val = NULL; @@ -576,6 +639,8 @@ cpu_data_init(cpu_data_t *cpu_data_ptr) cpu_data_ptr->cpu_sleep_token_last = 0x00000000UL; cpu_data_ptr->cpu_xcall_p0 = NULL; cpu_data_ptr->cpu_xcall_p1 = NULL; + cpu_data_ptr->cpu_imm_xcall_p0 = NULL; + cpu_data_ptr->cpu_imm_xcall_p1 = NULL; for (i = 0; i < CORESIGHT_REGIONS; ++i) { cpu_data_ptr->coresight_base[i] = 0; @@ -594,6 +659,9 @@ cpu_data_init(cpu_data_t *cpu_data_ptr) cpu_data_ptr->cpu_exc_vectors = (vm_offset_t)&exc_vectors_table; #endif /* __ARM_KERNEL_PROTECT__ */ +#if defined(HAS_APPLE_PAC) + cpu_data_ptr->rop_key = 0; +#endif } kern_return_t @@ -607,6 +675,7 @@ cpu_data_register(cpu_data_t *cpu_data_ptr) } #endif + __builtin_arm_dmb(DMB_ISH); // Ensure prior stores to cpu data are visible CpuDataEntries[cpu].cpu_data_vaddr = cpu_data_ptr; CpuDataEntries[cpu].cpu_data_paddr = (void *)ml_vtophys((vm_offset_t)cpu_data_ptr); return KERN_SUCCESS; @@ -630,8 +699,8 @@ cpu_start(int cpu) cpu_data_ptr->cpu_pmap_cpu_data.cpu_nested_pmap = NULL; - if (cpu_data_ptr->cpu_processor->next_thread != THREAD_NULL) { - first_thread = cpu_data_ptr->cpu_processor->next_thread; + if (cpu_data_ptr->cpu_processor->startup_thread != THREAD_NULL) { + first_thread = cpu_data_ptr->cpu_processor->startup_thread; } else { first_thread = cpu_data_ptr->cpu_processor->idle_thread; } @@ -675,6 +744,9 @@ cpu_timebase_init(boolean_t from_boot) * This ensures that mach_absolute_time() stops ticking across sleep. */ rtclock_base_abstime = wake_abstime - ml_get_hwclock(); + } else if (from_boot) { + /* On initial boot, initialize time_since_reset to CNTPCT_EL0. */ + ml_set_reset_time(ml_get_hwclock()); } cdp->cpu_decrementer = 0x7FFFFFFFUL; @@ -717,6 +789,7 @@ ml_arm_sleep(void) * the abstime value we'll use when we resume. */ wake_abstime = ml_get_timebase(); + ml_set_reset_time(UINT64_MAX); } else { CleanPoU_Dcache(); } @@ -841,6 +914,8 @@ cpu_machine_idle_init(boolean_t from_boot) break; } + PE_parse_boot_argn("idle_wfe_to_deadline", &idle_wfe_to_deadline, sizeof(idle_wfe_to_deadline)); + ResetHandlerData.assist_reset_handler = 0; ResetHandlerData.cpu_data_entries = ml_static_vtop((vm_offset_t)CpuDataEntries); @@ -898,9 +973,9 @@ void machine_track_platform_idle(boolean_t entry) { if (entry) { - (void)__c11_atomic_fetch_add(&cpu_idle_count, 1, __ATOMIC_RELAXED); + os_atomic_inc(&cpu_idle_count, relaxed); } else { - (void)__c11_atomic_fetch_sub(&cpu_idle_count, 1, __ATOMIC_RELAXED); + os_atomic_dec(&cpu_idle_count, relaxed); } } diff --git a/osfmk/arm64/cswitch.s b/osfmk/arm64/cswitch.s index 7aa9614a1..06aeca99e 100644 --- a/osfmk/arm64/cswitch.s +++ b/osfmk/arm64/cswitch.s @@ -27,6 +27,7 @@ */ #include #include +#include #include #include "assym.s" @@ -51,6 +52,30 @@ stp x25, x26, [$0, SS64_X25] stp x27, x28, [$0, SS64_X27] stp fp, lr, [$0, SS64_FP] +#ifdef HAS_APPLE_PAC + stp x0, x1, [sp, #-16]! + stp x2, x3, [sp, #-16]! + stp x4, x5, [sp, #-16]! + + /* + * Arg0: The ARM context pointer + * Arg1: PC value to sign + * Arg2: CPSR value to sign + * Arg3: LR to sign + */ + mov x0, $0 + ldr x1, [x0, SS64_PC] + ldr w2, [x0, SS64_CPSR] + mov x3, lr + mov x4, x16 + mov x5, x17 + bl EXT(ml_sign_thread_state) + + ldp x4, x5, [sp], #16 + ldp x2, x3, [sp], #16 + ldp x0, x1, [sp], #16 + ldp fp, lr, [$0, SS64_FP] +#endif /* defined(HAS_APPLE_PAC) */ mov $1, sp str $1, [$0, SS64_SP] @@ -78,14 +103,25 @@ * arg1 - Scratch register */ .macro load_general_registers + mov x20, x0 + mov x21, x1 + mov x22, x2 + + mov x0, $0 + AUTH_THREAD_STATE_IN_X0 x23, x24, x25, x26, x27 - ldp x16, x17, [$0, SS64_X16] + mov x0, x20 + mov x1, x21 + mov x2, x22 + + // Skip x16, x17 - already loaded + authed by AUTH_THREAD_STATE_IN_X0 ldp x19, x20, [$0, SS64_X19] ldp x21, x22, [$0, SS64_X21] ldp x23, x24, [$0, SS64_X23] ldp x25, x26, [$0, SS64_X25] ldp x27, x28, [$0, SS64_X27] - ldp fp, lr, [$0, SS64_FP] + ldr fp, [$0, SS64_FP] + // Skip lr - already loaded + authed by AUTH_THREAD_STATE_IN_X0 ldr $1, [$0, SS64_SP] mov sp, $1 @@ -99,6 +135,7 @@ ldr d15,[$0, NS64_D15] .endmacro + /* * set_thread_registers * @@ -120,6 +157,36 @@ mov x18, $1 // ... and trash reserved x18 .endmacro +#if defined(HAS_APPLE_PAC) +/* + * set_process_dependent_keys + * + * Updates process dependent keys during context switch if necessary + * Per CPU Data rop_key is initialized in arm_init() for bootstrap processor + * and in cpu_data_init for slave processors + * + * arg0 - New thread pointer/Current CPU key + * arg1 - Scratch register: New Thread Key + * arg2 - Scratch register: Current CPU Data pointer + */ +.macro set_process_dependent_keys + ldr $1, [$0, TH_ROP_PID] + ldr $2, [$0, ACT_CPUDATAP] + ldr $0, [$2, CPU_ROP_KEY] + cmp $0, $1 + b.eq 1f + str $1, [$2, CPU_ROP_KEY] + msr APIBKeyLo_EL1, $1 + add $1, $1, #1 + msr APIBKeyHi_EL1, $1 + add $1, $1, #1 + msr APDBKeyLo_EL1, $1 + add $1, $1, #1 + msr APDBKeyHi_EL1, $1 + isb sy +1: +.endmacro +#endif /* defined(HAS_APPLE_PAC) */ /* * void machine_load_context(thread_t thread) @@ -135,6 +202,9 @@ LEXT(machine_load_context) set_thread_registers x0, x1, x2 ldr x1, [x0, TH_KSTACKPTR] // Get top of kernel stack load_general_registers x1, x2 +#ifdef HAS_APPLE_PAC + set_process_dependent_keys x0, x1, x2 +#endif mov x0, #0 // Clear argument to thread_continue ret @@ -158,6 +228,9 @@ LEXT(Call_continuation) mov sp, x5 // Set stack pointer mov fp, #0 // Clear the frame pointer +#if defined(HAS_APPLE_PAC) + set_process_dependent_keys x4, x5, x6 +#endif mov x20, x0 //continuation mov x21, x1 //continuation parameter @@ -165,12 +238,16 @@ LEXT(Call_continuation) cbz x3, 1f mov x0, #1 - bl _ml_set_interrupts_enabled + bl EXT(ml_set_interrupts_enabled) 1: mov x0, x21 // Set the first parameter mov x1, x22 // Set the wait result arg +#ifdef HAS_APPLE_PAC + blraaz x20 // Branch to the continuation +#else blr x20 // Branch to the continuation +#endif mrs x0, TPIDR_EL1 // Get the current thread pointer b EXT(thread_terminate) // Kill the thread @@ -192,6 +269,9 @@ Lswitch_threads: set_thread_registers x2, x3, x4 ldr x3, [x2, TH_KSTACKPTR] load_general_registers x3, x4 +#if defined(HAS_APPLE_PAC) + set_process_dependent_keys x2, x3, x4 +#endif ret /* @@ -212,7 +292,6 @@ LEXT(Shutdown_context) mov sp, x12 b EXT(cpu_doshutdown) - /* * thread_t Idle_context(void) * @@ -242,6 +321,9 @@ LEXT(Idle_load_context) mrs x0, TPIDR_EL1 // Get thread pointer ldr x1, [x0, TH_KSTACKPTR] // Get the top of the kernel stack load_general_registers x1, x2 +#ifdef HAS_APPLE_PAC + set_process_dependent_keys x0, x1, x2 +#endif ret .align 2 @@ -249,3 +331,5 @@ LEXT(Idle_load_context) LEXT(machine_set_current_thread) set_thread_registers x0, x1, x2 ret + + diff --git a/osfmk/arm64/dbgwrap.c b/osfmk/arm64/dbgwrap.c index 666efc2d3..7aa70d824 100644 --- a/osfmk/arm64/dbgwrap.c +++ b/osfmk/arm64/dbgwrap.c @@ -115,7 +115,7 @@ ml_dbgwrap_halt_cpu(int cpu_index, uint64_t timeout_ns) return DBGWRAP_ERR_SELF_HALT; } - if (!hw_compare_and_store((uint32_t)-1, (unsigned int)curcpu, &halt_from_cpu) && + if (!os_atomic_cmpxchg(&halt_from_cpu, (uint32_t)-1, (unsigned int)curcpu, acq_rel) && (halt_from_cpu != (uint32_t)curcpu)) { return DBGWRAP_ERR_INPROGRESS; } @@ -155,7 +155,7 @@ ml_dbgwrap_stuff_instr(cpu_data_t *cdp, uint32_t instr, uint64_t timeout_ns, dbg uint64_t deadline = mach_absolute_time() + interval; #if DEVELOPMENT || DEBUG - uint32_t stuffed_instr_index = hw_atomic_add(&stuffed_instr_count, 1); + uint32_t stuffed_instr_index = os_atomic_inc(&stuffed_instr_count, relaxed); stuffed_instrs[(stuffed_instr_index - 1) % MAX_STUFFED_INSTRS] = instr; #endif diff --git a/osfmk/arm64/exception_asm.h b/osfmk/arm64/exception_asm.h new file mode 100644 index 000000000..41bfa1f68 --- /dev/null +++ b/osfmk/arm64/exception_asm.h @@ -0,0 +1,140 @@ +/* + * Copyright (c) 2018 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#ifndef _PEXPERT_ARM_BOARD_CONFIG_H +#include +#endif + + +/* + * INIT_SAVED_STATE_FLAVORS + * + * Initializes the saved state flavors of a new saved state structure + * arg0 - saved state pointer + * arg1 - 32-bit scratch reg + * arg2 - 32-bit scratch reg + */ +.macro INIT_SAVED_STATE_FLAVORS +mov $1, ARM_SAVED_STATE64 // Set saved state to 64-bit flavor +mov $2, ARM_SAVED_STATE64_COUNT +stp $1, $2, [$0, SS_FLAVOR] +mov $1, ARM_NEON_SAVED_STATE64 // Set neon state to 64-bit flavor +str $1, [$0, NS_FLAVOR] +mov $1, ARM_NEON_SAVED_STATE64_COUNT +str $1, [$0, NS_COUNT] +.endmacro + +/* + * SPILL_REGISTERS + * + * Spills the current set of registers (excluding x0, x1, sp, fp) to the specified + * save area. + * x0 - Address of the save area + */ + +.macro SPILL_REGISTERS +stp x2, x3, [x0, SS64_X2] // Save remaining GPRs +stp x4, x5, [x0, SS64_X4] +stp x6, x7, [x0, SS64_X6] +stp x8, x9, [x0, SS64_X8] +stp x10, x11, [x0, SS64_X10] +stp x12, x13, [x0, SS64_X12] +stp x14, x15, [x0, SS64_X14] +stp x16, x17, [x0, SS64_X16] +stp x18, x19, [x0, SS64_X18] +stp x20, x21, [x0, SS64_X20] +stp x22, x23, [x0, SS64_X22] +stp x24, x25, [x0, SS64_X24] +stp x26, x27, [x0, SS64_X26] +str x28, [x0, SS64_X28] + +/* Save arm_neon_saved_state64 */ + +stp q0, q1, [x0, NS64_Q0] +stp q2, q3, [x0, NS64_Q2] +stp q4, q5, [x0, NS64_Q4] +stp q6, q7, [x0, NS64_Q6] +stp q8, q9, [x0, NS64_Q8] +stp q10, q11, [x0, NS64_Q10] +stp q12, q13, [x0, NS64_Q12] +stp q14, q15, [x0, NS64_Q14] +stp q16, q17, [x0, NS64_Q16] +stp q18, q19, [x0, NS64_Q18] +stp q20, q21, [x0, NS64_Q20] +stp q22, q23, [x0, NS64_Q22] +stp q24, q25, [x0, NS64_Q24] +stp q26, q27, [x0, NS64_Q26] +stp q28, q29, [x0, NS64_Q28] +stp q30, q31, [x0, NS64_Q30] + +mrs lr, ELR_EL1 // Get exception link register +mrs x23, SPSR_EL1 // Load CPSR into var reg x23 +mrs x24, FPSR +mrs x25, FPCR + +#if defined(HAS_APPLE_PAC) +/* Save x1 and LR to preserve across call */ +mov x21, x1 +mov x20, lr + +/* + * Create thread state signature + * + * Arg0: The ARM context pointer + * Arg1: The PC value to sign + * Arg2: The CPSR value to sign + * Arg3: The LR value to sign + * Arg4: The X16 value to sign + * Arg5: The X17 value to sign + */ +mov x1, lr +mov w2, w23 +ldr x3, [x0, SS64_LR] +mov x4, x16 +mov x5, x17 +bl _ml_sign_thread_state + +mov lr, x20 +mov x1, x21 +#endif /* defined(HAS_APPLE_PAC) */ + +str lr, [x0, SS64_PC] // Save ELR to PCB +str w23, [x0, SS64_CPSR] // Save CPSR to PCB +str w24, [x0, NS64_FPSR] +str w25, [x0, NS64_FPCR] + +mrs x20, FAR_EL1 +mrs x21, ESR_EL1 + +str x20, [x0, SS64_FAR] +str w21, [x0, SS64_ESR] +.endmacro + +.macro DEADLOOP +b . +.endmacro diff --git a/osfmk/arm64/genassym.c b/osfmk/arm64/genassym.c index faf7f8843..8dfdecdda 100644 --- a/osfmk/arm64/genassym.c +++ b/osfmk/arm64/genassym.c @@ -100,82 +100,45 @@ */ #define DECLARE(SYM, VAL) \ - __asm("DEFINITION__define__" SYM ":\t .ascii \"%0\"" : : "n" ((u_long)(VAL))) + __asm("DEFINITION__define__" SYM ":\t .ascii \"%0\"" : : "i" ((u_long)(VAL))) -int main( - int argc, - char ** argv); +int main(int argc, + char ** argv); int -main( - int argc, - char **argv) +main(int argc, + char ** argv) { - DECLARE("T_PREFETCH_ABT", T_PREFETCH_ABT); - DECLARE("T_DATA_ABT", T_DATA_ABT); - DECLARE("AST_URGENT", AST_URGENT); - DECLARE("AST_PREEMPTION", AST_PREEMPTION); DECLARE("TH_RECOVER", offsetof(struct thread, recover)); - DECLARE("TH_CONTINUATION", offsetof(struct thread, continuation)); - DECLARE("TH_KERNEL_STACK", offsetof(struct thread, kernel_stack)); DECLARE("TH_KSTACKPTR", offsetof(struct thread, machine.kstackptr)); - DECLARE("THREAD_UTHREAD", offsetof(struct thread, uthread)); - - DECLARE("TASK_MACH_EXC_PORT", - offsetof(struct task, exc_actions[EXC_MACH_SYSCALL].port)); +#if defined(HAS_APPLE_PAC) + DECLARE("TH_ROP_PID", offsetof(struct thread, machine.rop_pid)); + DECLARE("TH_DISABLE_USER_JOP", offsetof(struct thread, machine.disable_user_jop)); +#endif /* defined(HAS_APPLE_PAC) */ /* These fields are being added on demand */ - DECLARE("ACT_TASK", offsetof(struct thread, task)); DECLARE("ACT_CONTEXT", offsetof(struct thread, machine.contextData)); - DECLARE("ACT_UPCB", offsetof(struct thread, machine.upcb)); -// DECLARE("ACT_PCBDATA", offsetof(struct thread, machine.contextData.ss)); - DECLARE("ACT_UNEON", offsetof(struct thread, machine.uNeon)); -// DECLARE("ACT_NEONDATA", offsetof(struct thread, machine.contextData.ns)); DECLARE("TH_CTH_SELF", offsetof(struct thread, machine.cthread_self)); DECLARE("TH_CTH_DATA", offsetof(struct thread, machine.cthread_data)); DECLARE("ACT_PREEMPT_CNT", offsetof(struct thread, machine.preemption_count)); DECLARE("ACT_CPUDATAP", offsetof(struct thread, machine.CpuDatap)); - DECLARE("ACT_MAP", offsetof(struct thread, map)); DECLARE("ACT_DEBUGDATA", offsetof(struct thread, machine.DebugData)); DECLARE("TH_IOTIER_OVERRIDE", offsetof(struct thread, iotier_override)); DECLARE("TH_RWLOCK_CNT", offsetof(struct thread, rwlock_count)); - DECLARE("TH_SCHED_FLAGS", offsetof(struct thread, sched_flags)); - DECLARE("TH_SFLAG_RW_PROMOTED_BIT", TH_SFLAG_RW_PROMOTED_BIT); - - DECLARE("TH_MACH_SYSCALLS", offsetof(struct thread, syscalls_mach)); - DECLARE("TH_UNIX_SYSCALLS", offsetof(struct thread, syscalls_unix)); - DECLARE("TASK_BSD_INFO", offsetof(struct task, bsd_info)); - DECLARE("MACH_TRAP_TABLE_COUNT", MACH_TRAP_TABLE_COUNT); - DECLARE("MACH_TRAP_TABLE_ENTRY_SIZE", sizeof(mach_trap_t)); - - DECLARE("MAP_PMAP", offsetof(struct _vm_map, pmap)); +#if defined(HAS_APPLE_PAC) + DECLARE("TASK_ROP_PID", offsetof(struct task, rop_pid)); +#endif /* defined(HAS_APPLE_PAC) */ DECLARE("ARM_CONTEXT_SIZE", sizeof(arm_context_t)); - DECLARE("CONTEXT_SS", offsetof(arm_context_t, ss)); DECLARE("SS_FLAVOR", offsetof(arm_context_t, ss.ash.flavor)); - DECLARE("ARM_SAVED_STATE32", ARM_SAVED_STATE32); DECLARE("ARM_SAVED_STATE64", ARM_SAVED_STATE64); DECLARE("ARM_SAVED_STATE64_COUNT", ARM_SAVED_STATE64_COUNT); - DECLARE("SS32_W0", offsetof(arm_context_t, ss.ss_32.r[0])); - DECLARE("SS32_W2", offsetof(arm_context_t, ss.ss_32.r[2])); - DECLARE("SS32_W4", offsetof(arm_context_t, ss.ss_32.r[4])); - DECLARE("SS32_W6", offsetof(arm_context_t, ss.ss_32.r[6])); - DECLARE("SS32_W8", offsetof(arm_context_t, ss.ss_32.r[8])); - DECLARE("SS32_W10", offsetof(arm_context_t, ss.ss_32.r[10])); - DECLARE("SS32_W12", offsetof(arm_context_t, ss.ss_32.r[12])); - DECLARE("SS32_SP", offsetof(arm_context_t, ss.ss_32.sp)); - DECLARE("SS32_LR", offsetof(arm_context_t, ss.ss_32.lr)); - DECLARE("SS32_PC", offsetof(arm_context_t, ss.ss_32.pc)); - DECLARE("SS32_CPSR", offsetof(arm_context_t, ss.ss_32.cpsr)); - DECLARE("SS32_VADDR", offsetof(arm_context_t, ss.ss_32.far)); - DECLARE("SS32_STATUS", offsetof(arm_context_t, ss.ss_32.esr)); - DECLARE("SS64_X0", offsetof(arm_context_t, ss.ss_64.x[0])); DECLARE("SS64_X2", offsetof(arm_context_t, ss.ss_64.x[2])); DECLARE("SS64_X4", offsetof(arm_context_t, ss.ss_64.x[4])); @@ -203,25 +166,15 @@ main( DECLARE("SS64_CPSR", offsetof(arm_context_t, ss.ss_64.cpsr)); DECLARE("SS64_FAR", offsetof(arm_context_t, ss.ss_64.far)); DECLARE("SS64_ESR", offsetof(arm_context_t, ss.ss_64.esr)); +#if defined(HAS_APPLE_PAC) + DECLARE("SS64_JOPHASH", offsetof(arm_context_t, ss.ss_64.jophash)); +#endif /* defined(HAS_APPLE_PAC) */ - DECLARE("CONTEXT_NS", offsetof(arm_context_t, ns)); DECLARE("NS_FLAVOR", offsetof(arm_context_t, ns.nsh.flavor)); DECLARE("NS_COUNT", offsetof(arm_context_t, ns.nsh.count)); - DECLARE("ARM_NEON_SAVED_STATE32", ARM_NEON_SAVED_STATE32); DECLARE("ARM_NEON_SAVED_STATE64", ARM_NEON_SAVED_STATE64); DECLARE("ARM_NEON_SAVED_STATE64_COUNT", ARM_NEON_SAVED_STATE64_COUNT); - DECLARE("NS32_Q0", offsetof(arm_context_t, ns.ns_32.v.q[0])); - DECLARE("NS32_Q2", offsetof(arm_context_t, ns.ns_32.v.q[2])); - DECLARE("NS32_Q4", offsetof(arm_context_t, ns.ns_32.v.q[4])); - DECLARE("NS32_Q6", offsetof(arm_context_t, ns.ns_32.v.q[6])); - DECLARE("NS32_Q8", offsetof(arm_context_t, ns.ns_32.v.q[8])); - DECLARE("NS32_Q10", offsetof(arm_context_t, ns.ns_32.v.q[10])); - DECLARE("NS32_Q12", offsetof(arm_context_t, ns.ns_32.v.q[12])); - DECLARE("NS32_Q14", offsetof(arm_context_t, ns.ns_32.v.q[14])); - DECLARE("NS32_FPSR", offsetof(arm_context_t, ns.ns_32.fpsr)); - DECLARE("NS32_FPCR", offsetof(arm_context_t, ns.ns_32.fpcr)); - DECLARE("NS64_D8", offsetof(arm_context_t, ns.ns_64.v.d[8])); DECLARE("NS64_D9", offsetof(arm_context_t, ns.ns_64.v.d[9])); DECLARE("NS64_D10", offsetof(arm_context_t, ns.ns_64.v.d[10])); @@ -250,126 +203,45 @@ main( DECLARE("NS64_FPSR", offsetof(arm_context_t, ns.ns_64.fpsr)); DECLARE("NS64_FPCR", offsetof(arm_context_t, ns.ns_64.fpcr)); + DECLARE("PGBYTES", ARM_PGBYTES); DECLARE("PGSHIFT", ARM_PGSHIFT); - DECLARE("PGMASK", ARM_PGMASK); - DECLARE("VM_MIN_ADDRESS", VM_MIN_ADDRESS); - DECLARE("VM_MAX_ADDRESS", VM_MAX_ADDRESS); DECLARE("VM_MIN_KERNEL_ADDRESS", VM_MIN_KERNEL_ADDRESS); - DECLARE("VM_MAX_KERNEL_ADDRESS", VM_MAX_KERNEL_ADDRESS); - DECLARE("KERNELBASE", VM_MIN_KERNEL_ADDRESS); DECLARE("KERNEL_STACK_SIZE", KERNEL_STACK_SIZE); DECLARE("TBI_MASK", TBI_MASK); - DECLARE("KERN_INVALID_ADDRESS", KERN_INVALID_ADDRESS); + DECLARE("MAX_CPUS", MAX_CPUS); + DECLARE("cdeSize", sizeof(struct cpu_data_entry)); - DECLARE("MAX_CPUS", MAX_CPUS); + DECLARE("cdSize", sizeof(struct cpu_data)); - DECLARE("cdeSize", - sizeof(struct cpu_data_entry)); - - DECLARE("cdSize", - sizeof(struct cpu_data)); - - DECLARE("CPU_ACTIVE_THREAD", - offsetof(cpu_data_t, cpu_active_thread)); - DECLARE("CPU_ACTIVE_STACK", - offsetof(cpu_data_t, cpu_active_stack)); - DECLARE("CPU_ISTACKPTR", - offsetof(cpu_data_t, istackptr)); - DECLARE("CPU_INTSTACK_TOP", - offsetof(cpu_data_t, intstack_top)); - DECLARE("CPU_EXCEPSTACKPTR", - offsetof(cpu_data_t, excepstackptr)); - DECLARE("CPU_EXCEPSTACK_TOP", - offsetof(cpu_data_t, excepstack_top)); + DECLARE("CPU_ACTIVE_THREAD", offsetof(cpu_data_t, cpu_active_thread)); + DECLARE("CPU_ISTACKPTR", offsetof(cpu_data_t, istackptr)); + DECLARE("CPU_INTSTACK_TOP", offsetof(cpu_data_t, intstack_top)); + DECLARE("CPU_EXCEPSTACK_TOP", offsetof(cpu_data_t, excepstack_top)); #if __ARM_KERNEL_PROTECT__ - DECLARE("CPU_EXC_VECTORS", - offsetof(cpu_data_t, cpu_exc_vectors)); + DECLARE("CPU_EXC_VECTORS", offsetof(cpu_data_t, cpu_exc_vectors)); #endif /* __ARM_KERNEL_PROTECT__ */ - DECLARE("CPU_NUMBER_GS", - offsetof(cpu_data_t, cpu_number)); - DECLARE("CPU_IDENT", - offsetof(cpu_data_t, cpu_ident)); - DECLARE("CPU_RUNNING", - offsetof(cpu_data_t, cpu_running)); - DECLARE("CPU_MCOUNT_OFF", - offsetof(cpu_data_t, cpu_mcount_off)); - DECLARE("CPU_PENDING_AST", - offsetof(cpu_data_t, cpu_pending_ast)); - DECLARE("CPU_PROCESSOR", - offsetof(cpu_data_t, cpu_processor)); - DECLARE("CPU_CACHE_DISPATCH", - offsetof(cpu_data_t, cpu_cache_dispatch)); - DECLARE("CPU_BASE_TIMEBASE", - offsetof(cpu_data_t, cpu_base_timebase)); - DECLARE("CPU_DECREMENTER", - offsetof(cpu_data_t, cpu_decrementer)); - DECLARE("CPU_GET_DECREMENTER_FUNC", - offsetof(cpu_data_t, cpu_get_decrementer_func)); - DECLARE("CPU_SET_DECREMENTER_FUNC", - offsetof(cpu_data_t, cpu_set_decrementer_func)); - DECLARE("CPU_GET_FIQ_HANDLER", - offsetof(cpu_data_t, cpu_get_fiq_handler)); - DECLARE("CPU_TBD_HARDWARE_ADDR", - offsetof(cpu_data_t, cpu_tbd_hardware_addr)); - DECLARE("CPU_TBD_HARDWARE_VAL", - offsetof(cpu_data_t, cpu_tbd_hardware_val)); - DECLARE("CPU_INT_STATE", - offsetof(cpu_data_t, cpu_int_state)); - DECLARE("INTERRUPT_HANDLER", - offsetof(cpu_data_t, interrupt_handler)); - DECLARE("INTERRUPT_TARGET", - offsetof(cpu_data_t, interrupt_target)); - DECLARE("INTERRUPT_REFCON", - offsetof(cpu_data_t, interrupt_refCon)); - DECLARE("INTERRUPT_NUB", - offsetof(cpu_data_t, interrupt_nub)); - DECLARE("INTERRUPT_SOURCE", - offsetof(cpu_data_t, interrupt_source)); - DECLARE("CPU_USER_DEBUG", - offsetof(cpu_data_t, cpu_user_debug)); - DECLARE("CPU_STAT_IRQ", - offsetof(cpu_data_t, cpu_stat.irq_ex_cnt)); - DECLARE("CPU_STAT_IRQ_WAKE", - offsetof(cpu_data_t, cpu_stat.irq_ex_cnt_wake)); - DECLARE("CPU_RESET_HANDLER", - offsetof(cpu_data_t, cpu_reset_handler)); - DECLARE("CPU_RESET_ASSIST", - offsetof(cpu_data_t, cpu_reset_assist)); - DECLARE("CPU_REGMAP_PADDR", - offsetof(cpu_data_t, cpu_regmap_paddr)); - DECLARE("CPU_PHYS_ID", - offsetof(cpu_data_t, cpu_phys_id)); - DECLARE("RTCLOCK_DATAP", - offsetof(cpu_data_t, rtclock_datap)); - DECLARE("CLUSTER_MASTER", - offsetof(cpu_data_t, cluster_master)); - - DECLARE("RTCLOCKDataSize", - sizeof(rtclock_data_t)); - DECLARE("RTCLOCK_ADJ_ABSTIME_LOW", - offsetof(rtclock_data_t, rtc_adj.abstime_val.low)); - DECLARE("RTCLOCK_ADJ_ABSTIME_HIGH", - offsetof(rtclock_data_t, rtc_adj.abstime_val.high)); - DECLARE("RTCLOCK_BASE_ABSTIME_LOW", - offsetof(rtclock_data_t, rtc_base.abstime_val.low)); - DECLARE("RTCLOCK_BASE_ABSTIME_HIGH", - offsetof(rtclock_data_t, rtc_base.abstime_val.high)); - - DECLARE("SIGPdec", SIGPdec); - - DECLARE("rhdSize", - sizeof(struct reset_handler_data)); + DECLARE("CPU_NUMBER_GS", offsetof(cpu_data_t, cpu_number)); + DECLARE("CPU_PENDING_AST", offsetof(cpu_data_t, cpu_pending_ast)); + DECLARE("CPU_INT_STATE", offsetof(cpu_data_t, cpu_int_state)); + DECLARE("CPU_USER_DEBUG", offsetof(cpu_data_t, cpu_user_debug)); + DECLARE("CPU_STAT_IRQ", offsetof(cpu_data_t, cpu_stat.irq_ex_cnt)); + DECLARE("CPU_STAT_IRQ_WAKE", offsetof(cpu_data_t, cpu_stat.irq_ex_cnt_wake)); + DECLARE("CPU_RESET_HANDLER", offsetof(cpu_data_t, cpu_reset_handler)); + DECLARE("CPU_PHYS_ID", offsetof(cpu_data_t, cpu_phys_id)); + DECLARE("CLUSTER_MASTER", offsetof(cpu_data_t, cluster_master)); + + DECLARE("RTCLOCKDataSize", sizeof(rtclock_data_t)); + + DECLARE("rhdSize", sizeof(struct reset_handler_data)); #if WITH_CLASSIC_S2R || !__arm64__ - DECLARE("stSize", - sizeof(SleepToken)); -#endif + DECLARE("stSize", sizeof(SleepToken)); +#endif /* WITH_CLASSIC_S2R || !__arm64__ */ DECLARE("CPU_DATA_ENTRIES", offsetof(struct reset_handler_data, cpu_data_entries)); - DECLARE("ASSIST_RESET_HANDLER", offsetof(struct reset_handler_data, assist_reset_handler)); DECLARE("CPU_DATA_PADDR", offsetof(struct cpu_data_entry, cpu_data_paddr)); @@ -378,53 +250,19 @@ main( DECLARE("PAGE_MAX_SIZE", PAGE_MAX_SIZE); - DECLARE("TIMER_TSTAMP", - offsetof(struct timer, tstamp)); - DECLARE("THREAD_TIMER", - offsetof(struct processor, processor_data.thread_timer)); - DECLARE("KERNEL_TIMER", - offsetof(struct processor, processor_data.kernel_timer)); - DECLARE("SYSTEM_STATE", - offsetof(struct processor, processor_data.system_state)); - DECLARE("USER_STATE", - offsetof(struct processor, processor_data.user_state)); - DECLARE("CURRENT_STATE", - offsetof(struct processor, processor_data.current_state)); - - DECLARE("SYSTEM_TIMER", - offsetof(struct thread, system_timer)); - DECLARE("USER_TIMER", - offsetof(struct thread, user_timer)); - -#if !CONFIG_SKIP_PRECISE_USER_KERNEL_TIME - DECLARE("PRECISE_USER_KERNEL_TIME", - offsetof(struct thread, precise_user_kernel_time)); -#endif - - DECLARE("BA_VIRT_BASE", - offsetof(struct boot_args, virtBase)); - DECLARE("BA_PHYS_BASE", - offsetof(struct boot_args, physBase)); - DECLARE("BA_MEM_SIZE", - offsetof(struct boot_args, memSize)); - DECLARE("BA_TOP_OF_KERNEL_DATA", - offsetof(struct boot_args, topOfKernelData)); - DECLARE("BA_DEVICE_TREE", - offsetof(struct boot_args, deviceTreeP)); - DECLARE("BA_DEVICE_TREE_LENGTH", - offsetof(struct boot_args, deviceTreeLength)); - DECLARE("BA_BOOT_FLAGS", - offsetof(struct boot_args, bootFlags)); - - DECLARE("ENTROPY_INDEX_PTR", - offsetof(entropy_data_t, index_ptr)); - DECLARE("ENTROPY_BUFFER", - offsetof(entropy_data_t, buffer)); - DECLARE("ENTROPY_DATA_SIZE", sizeof(struct entropy_data)); + DECLARE("BA_VIRT_BASE", offsetof(struct boot_args, virtBase)); + DECLARE("BA_PHYS_BASE", offsetof(struct boot_args, physBase)); + DECLARE("BA_MEM_SIZE", offsetof(struct boot_args, memSize)); + DECLARE("BA_TOP_OF_KERNEL_DATA", offsetof(struct boot_args, topOfKernelData)); + DECLARE("BA_BOOT_FLAGS", offsetof(struct boot_args, bootFlags)); DECLARE("SR_RESTORE_TCR_EL1", offsetof(struct sysreg_restore, tcr_el1)); +#if defined(HAS_APPLE_PAC) + DECLARE("CPU_ROP_KEY", offsetof(cpu_data_t, rop_key)); +#endif /* defined(HAS_APPLE_PAC) */ + return 0; } diff --git a/osfmk/arm64/gxf_exceptions.s b/osfmk/arm64/gxf_exceptions.s new file mode 100644 index 000000000..669f28883 --- /dev/null +++ b/osfmk/arm64/gxf_exceptions.s @@ -0,0 +1,39 @@ +/* + * Copyright (c) 2018 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + + +#include +#include +#include +#include +#include +#include "assym.s" +#include + + +/* vim: set ts=4: */ diff --git a/osfmk/arm64/kpc.c b/osfmk/arm64/kpc.c index 25a328da8..3a5a4d444 100644 --- a/osfmk/arm64/kpc.c +++ b/osfmk/arm64/kpc.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2012-2016 Apple Inc. All rights reserved. + * Copyright (c) 2012-2018 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -37,6 +37,8 @@ #include #include +#if APPLE_ARM64_ARCH_FAMILY + #if MONOTONIC #include #endif /* MONOTONIC */ @@ -180,12 +182,10 @@ void kpc_pmi_handler(unsigned int ctr); #define SREG_PMC8 "S3_2_c15_c9_0" #define SREG_PMC9 "S3_2_c15_c10_0" -#if !defined(APPLECYCLONE) #define SREG_PMMMAP "S3_2_c15_c15_0" #define SREG_PMTRHLD2 "S3_2_c15_c14_0" #define SREG_PMTRHLD4 "S3_2_c15_c13_0" #define SREG_PMTRHLD6 "S3_2_c15_c12_0" -#endif /* * The low 8 bits of a configuration words select the event to program on @@ -217,11 +217,11 @@ void kpc_pmi_handler(unsigned int ctr); * All: PMCR2-4, OPMAT0-1, OPMSK0-1. * Typhoon/Twister/Hurricane: PMMMAP, PMTRHLD2/4/6. */ -#if defined(APPLECYCLONE) +#if HAS_EARLY_APPLE_CPMU #define RAWPMU_CONFIG_COUNT 7 -#else +#else /* HAS_EARLY_APPLE_CPMU */ #define RAWPMU_CONFIG_COUNT 11 -#endif +#endif /* !HAS_EARLY_APPLE_CPMU */ /* TODO: allocate dynamically */ static uint64_t saved_PMCR[MAX_CPUS][2]; @@ -243,100 +243,89 @@ static boolean_t whitelist_disabled = TRUE; static boolean_t whitelist_disabled = FALSE; #endif -/* List of counter events that are allowed externally */ +#define CPMU_CORE_CYCLE 0x02 + +#if HAS_EARLY_APPLE_CPMU + +#define CPMU_BIU_UPSTREAM_CYCLE 0x19 +#define CPMU_BIU_DOWNSTREAM_CYCLE 0x1a +#define CPMU_L2C_AGENT_LD 0x22 +#define CPMU_L2C_AGENT_LD_MISS 0x23 +#define CPMU_L2C_AGENT_ST 0x24 +#define CPMU_L2C_AGENT_ST_MISS 0x25 +#define CPMU_INST_A32 0x78 +#define CPMU_INST_THUMB 0x79 +#define CPMU_INST_A64 0x7a +#define CPMU_INST_BRANCH 0x7b +#define CPMU_SYNC_DC_LOAD_MISS 0xb4 +#define CPMU_SYNC_DC_STORE_MISS 0xb5 +#define CPMU_SYNC_DTLB_MISS 0xb6 +#define CPMU_SYNC_ST_HIT_YNGR_LD 0xb9 +#define CPMU_SYNC_BR_ANY_MISP 0xc0 +#define CPMU_FED_IC_MISS_DEM 0xce +#define CPMU_FED_ITLB_MISS 0xcf + +#else /* HAS_EARLY_APPLE_CPMU */ + +#if HAS_CPMU_BIU_EVENTS +#define CPMU_BIU_UPSTREAM_CYCLE 0x13 +#define CPMU_BIU_DOWNSTREAM_CYCLE 0x14 +#endif /* HAS_CPMU_BIU_EVENTS */ + +#if HAS_CPMU_L2C_EVENTS +#define CPMU_L2C_AGENT_LD 0x1a +#define CPMU_L2C_AGENT_LD_MISS 0x1b +#define CPMU_L2C_AGENT_ST 0x1c +#define CPMU_L2C_AGENT_ST_MISS 0x1d +#endif /* HAS_CPMU_L2C_EVENTS */ + +#define CPMU_INST_A32 0x8a +#define CPMU_INST_THUMB 0x8b +#define CPMU_INST_A64 0x8c +#define CPMU_INST_BRANCH 0x8d +#define CPMU_SYNC_DC_LOAD_MISS 0xbf +#define CPMU_SYNC_DC_STORE_MISS 0xc0 +#define CPMU_SYNC_DTLB_MISS 0xc1 +#define CPMU_SYNC_ST_HIT_YNGR_LD 0xc4 +#define CPMU_SYNC_BR_ANY_MISP 0xcb +#define CPMU_FED_IC_MISS_DEM 0xd3 +#define CPMU_FED_ITLB_MISS 0xd4 + +#endif /* !HAS_EARLY_APPLE_CPMU */ + +/* List of counter events that are allowed to be used by 3rd-parties. */ static kpc_config_t whitelist[] = { - 0, /* NO_EVENT */ - -#if defined(APPLECYCLONE) - 0x02, /* CORE_CYCLE */ - 0x19, /* BIU_UPSTREAM_CYCLE */ - 0x1a, /* BIU_DOWNSTREAM_CYCLE */ - 0x22, /* L2C_AGENT_LD */ - 0x23, /* L2C_AGENT_LD_MISS */ - 0x24, /* L2C_AGENT_ST */ - 0x25, /* L2C_AGENT_ST_MISS */ - 0x78, /* INST_A32 */ - 0x79, /* INST_THUMB */ - 0x7a, /* INST_A64 */ - 0x7b, /* INST_BRANCH */ - 0xb4, /* SYNC_DC_LOAD_MISS */ - 0xb5, /* SYNC_DC_STORE_MISS */ - 0xb6, /* SYNC_DTLB_MISS */ - 0xb9, /* SYNC_ST_HIT_YNGR_LD */ - 0xc0, /* SYNC_BR_ANY_MISP */ - 0xce, /* FED_IC_MISS_DEM */ - 0xcf, /* FED_ITLB_MISS */ - -#elif defined(APPLETYPHOON) - 0x02, /* CORE_CYCLE */ - 0x13, /* BIU_UPSTREAM_CYCLE */ - 0x14, /* BIU_DOWNSTREAM_CYCLE */ - 0x1a, /* L2C_AGENT_LD */ - 0x1b, /* L2C_AGENT_LD_MISS */ - 0x1c, /* L2C_AGENT_ST */ - 0x1d, /* L2C_AGENT_ST_MISS */ - 0x8a, /* INST_A32 */ - 0x8b, /* INST_THUMB */ - 0x8c, /* INST_A64 */ - 0x8d, /* INST_BRANCH */ - 0xbf, /* SYNC_DC_LOAD_MISS */ - 0xc0, /* SYNC_DC_STORE_MISS */ - 0xc1, /* SYNC_DTLB_MISS */ - 0xc4, /* SYNC_ST_HIT_YNGR_LD */ - 0xcb, /* SYNC_BR_ANY_MISP */ - 0xd3, /* FED_IC_MISS_DEM */ - 0xd4, /* FED_ITLB_MISS */ - -#elif defined(APPLETWISTER) || defined(APPLEHURRICANE) - 0x02, /* CORE_CYCLE */ - 0x1a, /* L2C_AGENT_LD */ - 0x1b, /* L2C_AGENT_LD_MISS */ - 0x1c, /* L2C_AGENT_ST */ - 0x1d, /* L2C_AGENT_ST_MISS */ - 0x8a, /* INST_A32 */ - 0x8b, /* INST_THUMB */ - 0x8c, /* INST_A64 */ - 0x8d, /* INST_BRANCH */ - 0xbf, /* SYNC_DC_LOAD_MISS */ - 0xc0, /* SYNC_DC_STORE_MISS */ - 0xc1, /* SYNC_DTLB_MISS */ - 0xc4, /* SYNC_ST_HIT_YNGR_LD */ - 0xcb, /* SYNC_BR_ANY_MISP */ - 0xd3, /* FED_IC_MISS_DEM */ - 0xd4, /* FED_ITLB_MISS */ - -#elif defined(APPLEMONSOON) - 0x02, /* CORE_CYCLE */ - 0x8a, /* INST_A32 */ - 0x8b, /* INST_THUMB */ - 0x8c, /* INST_A64 */ - 0x8d, /* INST_BRANCH */ - 0xbf, /* SYNC_DC_LOAD_MISS */ - 0xc0, /* SYNC_DC_STORE_MISS */ - 0xc1, /* SYNC_DTLB_MISS */ - 0xc4, /* SYNC_ST_HIT_YNGR_LD */ - 0xcb, /* SYNC_BR_ANY_MISP */ - 0xd3, /* FED_IC_MISS_DEM */ - 0xd4, /* FED_ITLB_MISS */ + 0, /* NO_EVENT */ -#else - /* An unknown CPU gets a trivial { NO_EVENT } whitelist. */ -#endif + CPMU_CORE_CYCLE, + +#if HAS_CPMU_BIU_EVENTS + CPMU_BIU_UPSTREAM_CYCLE, CPMU_BIU_DOWNSTREAM_CYCLE, +#endif /* HAS_CPMU_BIU_EVENTS */ + +#if HAS_CPMU_L2C_EVENTS + CPMU_L2C_AGENT_LD, CPMU_L2C_AGENT_LD_MISS, CPMU_L2C_AGENT_ST, + CPMU_L2C_AGENT_ST_MISS, +#endif /* HAS_CPMU_L2C_EVENTS */ + + CPMU_INST_A32, CPMU_INST_THUMB, CPMU_INST_A64, CPMU_INST_BRANCH, + CPMU_SYNC_DC_LOAD_MISS, CPMU_SYNC_DC_STORE_MISS, + CPMU_SYNC_DTLB_MISS, CPMU_SYNC_ST_HIT_YNGR_LD, + CPMU_SYNC_BR_ANY_MISP, CPMU_FED_IC_MISS_DEM, CPMU_FED_ITLB_MISS, }; -#define WHITELIST_COUNT (sizeof(whitelist)/sizeof(*whitelist)) +#define WHITELIST_COUNT (sizeof(whitelist) / sizeof(whitelist[0])) +#define EVENT_MASK 0xff -static boolean_t +static bool config_in_whitelist(kpc_config_t cfg) { - unsigned int i; - - for (i = 0; i < WHITELIST_COUNT; i++) { - if (cfg == whitelist[i]) { - return TRUE; + for (unsigned int i = 0; i < WHITELIST_COUNT; i++) { + /* Strip off any EL configuration bits -- just look at the event. */ + if ((cfg & EVENT_MASK) == whitelist[i]) { + return true; } } - - return FALSE; + return false; } #ifdef KPC_DEBUG @@ -784,7 +773,7 @@ kpc_set_running_xcall( void *vstate ) set_running_configurable(mp_config->cfg_target_mask, mp_config->cfg_state_mask); - if (hw_atomic_sub(&kpc_xcall_sync, 1) == 0) { + if (os_atomic_dec(&kpc_xcall_sync, relaxed) == 0) { thread_wakeup((event_t) &kpc_xcall_sync); } } @@ -802,9 +791,9 @@ kpc_get_curcpu_counters_xcall(void *args) int r = kpc_get_curcpu_counters(handler->classes, NULL, &handler->buf[offset]); /* number of counters added by this CPU, needs to be atomic */ - hw_atomic_add(&(handler->nb_counters), r); + os_atomic_add(&(handler->nb_counters), r, relaxed); - if (hw_atomic_sub(&kpc_xread_sync, 1) == 0) { + if (os_atomic_dec(&kpc_xread_sync, relaxed) == 0) { thread_wakeup((event_t) &kpc_xread_sync); } } @@ -939,7 +928,7 @@ kpc_set_config_xcall(void *vmp_config) new_config += RAWPMU_CONFIG_COUNT; } - if (hw_atomic_sub(&kpc_config_sync, 1) == 0) { + if (os_atomic_dec(&kpc_config_sync, relaxed) == 0) { thread_wakeup((event_t) &kpc_config_sync); } } @@ -1010,7 +999,7 @@ kpc_set_reload_xcall(void *vmp_config) ml_set_interrupts_enabled(enabled); - if (hw_atomic_sub(&kpc_reload_sync, 1) == 0) { + if (os_atomic_dec(&kpc_reload_sync, relaxed) == 0) { thread_wakeup((event_t) &kpc_reload_sync); } } @@ -1124,3 +1113,165 @@ kpc_get_pmu_version(void) { return KPC_PMU_ARM_APPLE; } + +#else /* APPLE_ARM64_ARCH_FAMILY */ + +/* We don't currently support non-Apple arm64 PMU configurations like PMUv3 */ + +void +kpc_arch_init(void) +{ + /* No-op */ +} + +uint32_t +kpc_get_classes(void) +{ + return 0; +} + +uint32_t +kpc_fixed_count(void) +{ + return 0; +} + +uint32_t +kpc_configurable_count(void) +{ + return 0; +} + +uint32_t +kpc_fixed_config_count(void) +{ + return 0; +} + +uint32_t +kpc_configurable_config_count(uint64_t pmc_mask __unused) +{ + return 0; +} + +int +kpc_get_fixed_config(kpc_config_t *configv __unused) +{ + return 0; +} + +uint64_t +kpc_fixed_max(void) +{ + return 0; +} + +uint64_t +kpc_configurable_max(void) +{ + return 0; +} + +int +kpc_get_configurable_config(kpc_config_t *configv __unused, uint64_t pmc_mask __unused) +{ + return ENOTSUP; +} + +int +kpc_get_configurable_counters(uint64_t *counterv __unused, uint64_t pmc_mask __unused) +{ + return ENOTSUP; +} + +int +kpc_get_fixed_counters(uint64_t *counterv __unused) +{ + return 0; +} + +boolean_t +kpc_is_running_fixed(void) +{ + return FALSE; +} + +boolean_t +kpc_is_running_configurable(uint64_t pmc_mask __unused) +{ + return FALSE; +} + +int +kpc_set_running_arch(struct kpc_running_remote *mp_config __unused) +{ + return ENOTSUP; +} + +int +kpc_set_period_arch(struct kpc_config_remote *mp_config __unused) +{ + return ENOTSUP; +} + +int +kpc_set_config_arch(struct kpc_config_remote *mp_config __unused) +{ + return ENOTSUP; +} + +void +kpc_idle(void) +{ + // do nothing +} + +void +kpc_idle_exit(void) +{ + // do nothing +} + +int +kpc_get_all_cpus_counters(uint32_t classes __unused, int *curcpu __unused, uint64_t *buf __unused) +{ + return 0; +} + +int +kpc_set_sw_inc( uint32_t mask __unused ) +{ + return ENOTSUP; +} + +int +kpc_get_pmu_version(void) +{ + return KPC_PMU_ERROR; +} + +uint32_t +kpc_rawpmu_config_count(void) +{ + return 0; +} + +int +kpc_get_rawpmu_config(__unused kpc_config_t *configv) +{ + return 0; +} + +int +kpc_disable_whitelist( int val __unused ) +{ + return 0; +} + +int +kpc_get_whitelist_disabled( void ) +{ + return 0; +} + +#endif /* !APPLE_ARM64_ARCH_FAMILY */ diff --git a/osfmk/arm64/locore.s b/osfmk/arm64/locore.s index 6a8d109f7..5edaf67f1 100644 --- a/osfmk/arm64/locore.s +++ b/osfmk/arm64/locore.s @@ -27,97 +27,20 @@ */ #include +#include #include #include #include #include #include #include "assym.s" +#include #if __ARM_KERNEL_PROTECT__ #include #endif -/* - * INIT_SAVED_STATE_FLAVORS - * - * Initializes the saved state flavors of a new saved state structure - * arg0 - saved state pointer - * arg1 - 32-bit scratch reg - * arg2 - 32-bit scratch reg - */ -.macro INIT_SAVED_STATE_FLAVORS - mov $1, ARM_SAVED_STATE64 // Set saved state to 64-bit flavor - mov $2, ARM_SAVED_STATE64_COUNT - stp $1, $2, [$0, SS_FLAVOR] - mov $1, ARM_NEON_SAVED_STATE64 // Set neon state to 64-bit flavor - str $1, [$0, NS_FLAVOR] - mov $1, ARM_NEON_SAVED_STATE64_COUNT - str $1, [$0, NS_COUNT] -.endmacro - - -/* - * SPILL_REGISTERS - * - * Spills the current set of registers (excluding x0 and x1) to the specified - * save area. - * x0 - Address of the save area - */ -.macro SPILL_REGISTERS - stp x2, x3, [x0, SS64_X2] // Save remaining GPRs - stp x4, x5, [x0, SS64_X4] - stp x6, x7, [x0, SS64_X6] - stp x8, x9, [x0, SS64_X8] - stp x10, x11, [x0, SS64_X10] - stp x12, x13, [x0, SS64_X12] - stp x14, x15, [x0, SS64_X14] - stp x16, x17, [x0, SS64_X16] - stp x18, x19, [x0, SS64_X18] - stp x20, x21, [x0, SS64_X20] - stp x22, x23, [x0, SS64_X22] - stp x24, x25, [x0, SS64_X24] - stp x26, x27, [x0, SS64_X26] - str x28, [x0, SS64_X28] - - /* Save arm_neon_saved_state64 */ - - stp q0, q1, [x0, NS64_Q0] - stp q2, q3, [x0, NS64_Q2] - stp q4, q5, [x0, NS64_Q4] - stp q6, q7, [x0, NS64_Q6] - stp q8, q9, [x0, NS64_Q8] - stp q10, q11, [x0, NS64_Q10] - stp q12, q13, [x0, NS64_Q12] - stp q14, q15, [x0, NS64_Q14] - stp q16, q17, [x0, NS64_Q16] - stp q18, q19, [x0, NS64_Q18] - stp q20, q21, [x0, NS64_Q20] - stp q22, q23, [x0, NS64_Q22] - stp q24, q25, [x0, NS64_Q24] - stp q26, q27, [x0, NS64_Q26] - stp q28, q29, [x0, NS64_Q28] - stp q30, q31, [x0, NS64_Q30] - - mrs lr, ELR_EL1 // Get exception link register - mrs x23, SPSR_EL1 // Load CPSR into var reg x23 - mrs x24, FPSR - mrs x25, FPCR - - - str lr, [x0, SS64_PC] // Save ELR to PCB - str w23, [x0, SS64_CPSR] // Save CPSR to PCB - str w24, [x0, NS64_FPSR] - str w25, [x0, NS64_FPCR] - - mrs x20, FAR_EL1 - mrs x21, ESR_EL1 - str x20, [x0, SS64_FAR] - str w21, [x0, SS64_ESR] -.endmacro - - #define CBF_DISABLE 0 #define CBF_ENABLE 1 @@ -204,19 +127,21 @@ .align 3 .globl EXT(exc_vectors_table) LEXT(exc_vectors_table) - /* Table of exception handlers. */ - .quad Lel1_sp0_synchronous_vector_long - .quad Lel1_sp0_irq_vector_long - .quad Lel1_sp0_fiq_vector_long - .quad Lel1_sp0_serror_vector_long - .quad Lel1_sp1_synchronous_vector_long - .quad Lel1_sp1_irq_vector_long - .quad Lel1_sp1_fiq_vector_long - .quad Lel1_sp1_serror_vector_long - .quad Lel0_synchronous_vector_64_long - .quad Lel0_irq_vector_64_long - .quad Lel0_fiq_vector_64_long - .quad Lel0_serror_vector_64_long + /* Table of exception handlers. + * These handlers sometimes contain deadloops. + * It's nice to have symbols for them when debugging. */ + .quad el1_sp0_synchronous_vector_long + .quad el1_sp0_irq_vector_long + .quad el1_sp0_fiq_vector_long + .quad el1_sp0_serror_vector_long + .quad el1_sp1_synchronous_vector_long + .quad el1_sp1_irq_vector_long + .quad el1_sp1_fiq_vector_long + .quad el1_sp1_serror_vector_long + .quad el0_synchronous_vector_64_long + .quad el0_irq_vector_64_long + .quad el0_fiq_vector_64_long + .quad el0_serror_vector_64_long #endif /* __ARM_KERNEL_PROTECT__ */ .text @@ -234,66 +159,66 @@ LEXT(exc_vectors_table) .globl EXT(ExceptionVectorsBase) LEXT(ExceptionVectorsBase) Lel1_sp0_synchronous_vector: - BRANCH_TO_KVA_VECTOR Lel1_sp0_synchronous_vector_long, 0 + BRANCH_TO_KVA_VECTOR el1_sp0_synchronous_vector_long, 0 .text .align 7 Lel1_sp0_irq_vector: - BRANCH_TO_KVA_VECTOR Lel1_sp0_irq_vector_long, 1 + BRANCH_TO_KVA_VECTOR el1_sp0_irq_vector_long, 1 .text .align 7 Lel1_sp0_fiq_vector: - BRANCH_TO_KVA_VECTOR Lel1_sp0_fiq_vector_long, 2 + BRANCH_TO_KVA_VECTOR el1_sp0_fiq_vector_long, 2 .text .align 7 Lel1_sp0_serror_vector: - BRANCH_TO_KVA_VECTOR Lel1_sp0_serror_vector_long, 3 + BRANCH_TO_KVA_VECTOR el1_sp0_serror_vector_long, 3 .text .align 7 Lel1_sp1_synchronous_vector: - BRANCH_TO_KVA_VECTOR Lel1_sp1_synchronous_vector_long, 4 + BRANCH_TO_KVA_VECTOR el1_sp1_synchronous_vector_long, 4 .text .align 7 Lel1_sp1_irq_vector: - BRANCH_TO_KVA_VECTOR Lel1_sp1_irq_vector_long, 5 + BRANCH_TO_KVA_VECTOR el1_sp1_irq_vector_long, 5 .text .align 7 Lel1_sp1_fiq_vector: - BRANCH_TO_KVA_VECTOR Lel1_sp1_fiq_vector_long, 6 + BRANCH_TO_KVA_VECTOR el1_sp1_fiq_vector_long, 6 .text .align 7 Lel1_sp1_serror_vector: - BRANCH_TO_KVA_VECTOR Lel1_sp1_serror_vector, 7 + BRANCH_TO_KVA_VECTOR el1_sp1_serror_vector_long, 7 .text .align 7 Lel0_synchronous_vector_64: MAP_KERNEL - BRANCH_TO_KVA_VECTOR Lel0_synchronous_vector_64_long, 8 + BRANCH_TO_KVA_VECTOR el0_synchronous_vector_64_long, 8 .text .align 7 Lel0_irq_vector_64: MAP_KERNEL - BRANCH_TO_KVA_VECTOR Lel0_irq_vector_64_long, 9 + BRANCH_TO_KVA_VECTOR el0_irq_vector_64_long, 9 .text .align 7 Lel0_fiq_vector_64: MAP_KERNEL - BRANCH_TO_KVA_VECTOR Lel0_fiq_vector_64_long, 10 + BRANCH_TO_KVA_VECTOR el0_fiq_vector_64_long, 10 .text .align 7 Lel0_serror_vector_64: MAP_KERNEL - BRANCH_TO_KVA_VECTOR Lel0_serror_vector_64_long, 11 + BRANCH_TO_KVA_VECTOR el0_serror_vector_64_long, 11 /* Fill out the rest of the page */ .align 12 @@ -313,7 +238,7 @@ Lel0_serror_vector_64: mov x0, sp // Copy saved state pointer to x0 .endmacro -Lel1_sp0_synchronous_vector_long: +el1_sp0_synchronous_vector_long: sub sp, sp, ARM_CONTEXT_SIZE // Make space on the exception stack stp x0, x1, [sp, SS64_X0] // Save x0, x1 to the stack mrs x1, ESR_EL1 // Get the exception syndrome @@ -331,35 +256,35 @@ Lkernel_stack_valid: ldp x0, x1, [sp, SS64_X0] // Restore x0, x1 add sp, sp, ARM_CONTEXT_SIZE // Restore SP1 EL1_SP0_VECTOR - adrp x1, fleh_synchronous@page // Load address for fleh - add x1, x1, fleh_synchronous@pageoff + adrp x1, EXT(fleh_synchronous)@page // Load address for fleh + add x1, x1, EXT(fleh_synchronous)@pageoff b fleh_dispatch64 -Lel1_sp0_irq_vector_long: +el1_sp0_irq_vector_long: EL1_SP0_VECTOR mrs x1, TPIDR_EL1 ldr x1, [x1, ACT_CPUDATAP] ldr x1, [x1, CPU_ISTACKPTR] mov sp, x1 - adrp x1, fleh_irq@page // Load address for fleh - add x1, x1, fleh_irq@pageoff + adrp x1, EXT(fleh_irq)@page // Load address for fleh + add x1, x1, EXT(fleh_irq)@pageoff b fleh_dispatch64 -Lel1_sp0_fiq_vector_long: +el1_sp0_fiq_vector_long: // ARM64_TODO write optimized decrementer EL1_SP0_VECTOR mrs x1, TPIDR_EL1 ldr x1, [x1, ACT_CPUDATAP] ldr x1, [x1, CPU_ISTACKPTR] mov sp, x1 - adrp x1, fleh_fiq@page // Load address for fleh - add x1, x1, fleh_fiq@pageoff + adrp x1, EXT(fleh_fiq)@page // Load address for fleh + add x1, x1, EXT(fleh_fiq)@pageoff b fleh_dispatch64 -Lel1_sp0_serror_vector_long: +el1_sp0_serror_vector_long: EL1_SP0_VECTOR - adrp x1, fleh_serror@page // Load address for fleh - add x1, x1, fleh_serror@pageoff + adrp x1, EXT(fleh_serror)@page // Load address for fleh + add x1, x1, EXT(fleh_serror)@pageoff b fleh_dispatch64 .macro EL1_SP1_VECTOR @@ -372,7 +297,7 @@ Lel1_sp0_serror_vector_long: mov x0, sp // Copy saved state pointer to x0 .endmacro -Lel1_sp1_synchronous_vector_long: +el1_sp1_synchronous_vector_long: b check_exception_stack Lel1_sp1_synchronous_valid_stack: #if defined(KERNEL_INTEGRITY_KTRR) @@ -384,27 +309,60 @@ Lel1_sp1_synchronous_vector_continue: add x1, x1, fleh_synchronous_sp1@pageoff b fleh_dispatch64 -Lel1_sp1_irq_vector_long: +el1_sp1_irq_vector_long: EL1_SP1_VECTOR adrp x1, fleh_irq_sp1@page add x1, x1, fleh_irq_sp1@pageoff b fleh_dispatch64 -Lel1_sp1_fiq_vector_long: +el1_sp1_fiq_vector_long: EL1_SP1_VECTOR adrp x1, fleh_fiq_sp1@page add x1, x1, fleh_fiq_sp1@pageoff b fleh_dispatch64 -Lel1_sp1_serror_vector_long: +el1_sp1_serror_vector_long: EL1_SP1_VECTOR adrp x1, fleh_serror_sp1@page add x1, x1, fleh_serror_sp1@pageoff b fleh_dispatch64 +#if defined(HAS_APPLE_PAC) && !(__APCFG_SUPPORTED__ || __APSTS_SUPPORTED__) +/** + * On these CPUs, SCTLR_CP15BEN_ENABLED is res0, and SCTLR_{ITD,SED}_DISABLED are res1. + * The rest of the bits in SCTLR_EL1_DEFAULT | SCTLR_PACIB_ENABLED are set in common_start. + */ +#define SCTLR_EL1_INITIAL (SCTLR_EL1_DEFAULT | SCTLR_PACIB_ENABLED) +#define SCTLR_EL1_EXPECTED ((SCTLR_EL1_INITIAL | SCTLR_SED_DISABLED | SCTLR_ITD_DISABLED) & ~SCTLR_CP15BEN_ENABLED) +#endif + .macro EL0_64_VECTOR mov x18, #0 // Zero x18 to avoid leaking data to user SS stp x0, x1, [sp, #-16]! // Save x0 and x1 to the exception stack +#if defined(HAS_APPLE_PAC) && !(__APCFG_SUPPORTED__ || __APSTS_SUPPORTED__) + // enable JOP for kernel + adrp x0, EXT(const_boot_args)@page + add x0, x0, EXT(const_boot_args)@pageoff + ldr x0, [x0, BA_BOOT_FLAGS] + and x0, x0, BA_BOOT_FLAGS_DISABLE_JOP + cbnz x0, 1f + // if disable jop is set, don't touch SCTLR (it's already off) + // if (!boot_args->kernel_jop_disable) { + mrs x0, SCTLR_EL1 + tbnz x0, SCTLR_PACIA_ENABLED_SHIFT, 1f + // turn on jop for kernel if it isn't already on + // if (!jop_running) { + MOV64 x1, SCTLR_JOP_KEYS_ENABLED + orr x0, x0, x1 + msr SCTLR_EL1, x0 + isb sy + MOV64 x1, SCTLR_EL1_EXPECTED | SCTLR_JOP_KEYS_ENABLED + cmp x0, x1 + bne . + // } + // } +1: +#endif /* defined(HAS_APPLE_PAC) && !(__APCFG_SUPPORTED__ || __APSTS_SUPPORTED__) */ mrs x0, TPIDR_EL1 // Load the thread register mrs x1, SP_EL0 // Load the user stack pointer add x0, x0, ACT_CONTEXT // Calculate where we store the user context pointer @@ -421,42 +379,42 @@ Lel1_sp1_serror_vector_long: .endmacro -Lel0_synchronous_vector_64_long: +el0_synchronous_vector_64_long: EL0_64_VECTOR mrs x1, TPIDR_EL1 // Load the thread register ldr x1, [x1, TH_KSTACKPTR] // Load the top of the kernel stack to x1 mov sp, x1 // Set the stack pointer to the kernel stack - adrp x1, fleh_synchronous@page // Load address for fleh - add x1, x1, fleh_synchronous@pageoff + adrp x1, EXT(fleh_synchronous)@page // Load address for fleh + add x1, x1, EXT(fleh_synchronous)@pageoff b fleh_dispatch64 -Lel0_irq_vector_64_long: +el0_irq_vector_64_long: EL0_64_VECTOR mrs x1, TPIDR_EL1 ldr x1, [x1, ACT_CPUDATAP] ldr x1, [x1, CPU_ISTACKPTR] mov sp, x1 // Set the stack pointer to the kernel stack - adrp x1, fleh_irq@page // load address for fleh - add x1, x1, fleh_irq@pageoff + adrp x1, EXT(fleh_irq)@page // load address for fleh + add x1, x1, EXT(fleh_irq)@pageoff b fleh_dispatch64 -Lel0_fiq_vector_64_long: +el0_fiq_vector_64_long: EL0_64_VECTOR mrs x1, TPIDR_EL1 ldr x1, [x1, ACT_CPUDATAP] ldr x1, [x1, CPU_ISTACKPTR] mov sp, x1 // Set the stack pointer to the kernel stack - adrp x1, fleh_fiq@page // load address for fleh - add x1, x1, fleh_fiq@pageoff + adrp x1, EXT(fleh_fiq)@page // load address for fleh + add x1, x1, EXT(fleh_fiq)@pageoff b fleh_dispatch64 -Lel0_serror_vector_64_long: +el0_serror_vector_64_long: EL0_64_VECTOR mrs x1, TPIDR_EL1 // Load the thread register ldr x1, [x1, TH_KSTACKPTR] // Load the top of the kernel stack to x1 mov sp, x1 // Set the stack pointer to the kernel stack - adrp x1, fleh_serror@page // load address for fleh - add x1, x1, fleh_serror@pageoff + adrp x1, EXT(fleh_serror)@page // load address for fleh + add x1, x1, EXT(fleh_serror)@pageoff b fleh_dispatch64 @@ -583,7 +541,7 @@ check_ktrr_sctlr_trap: .align 2 fleh_dispatch64: /* Save arm_saved_state64 */ - SPILL_REGISTERS + SPILL_REGISTERS KERNEL_MODE /* If exception is from userspace, zero unused registers */ and x23, x23, #(PSR64_MODE_EL_MASK) @@ -640,7 +598,8 @@ fleh_dispatch64: .text .align 2 -fleh_synchronous: + .global EXT(fleh_synchronous) +LEXT(fleh_synchronous) mrs x1, ESR_EL1 // Load exception syndrome mrs x2, FAR_EL1 // Load fault address @@ -724,7 +683,8 @@ Lfleh_sync_load_lr: .text .align 2 -fleh_irq: + .global EXT(fleh_irq) +LEXT(fleh_irq) BEGIN_INTERRUPT_HANDLER PUSH_FRAME bl EXT(sleh_irq) @@ -742,7 +702,8 @@ LEXT(fleh_fiq_generic) .text .align 2 -fleh_fiq: + .global EXT(fleh_fiq) +LEXT(fleh_fiq) BEGIN_INTERRUPT_HANDLER PUSH_FRAME bl EXT(sleh_fiq) @@ -754,7 +715,8 @@ fleh_fiq: .text .align 2 -fleh_serror: + .global EXT(fleh_serror) +LEXT(fleh_serror) mrs x1, ESR_EL1 // Load exception syndrome mrs x2, FAR_EL1 // Load fault address @@ -820,31 +782,27 @@ Lsp1_serror_str: .text .align 2 exception_return_dispatch: - ldr w0, [x21, SS_FLAVOR] // x0 = (threadIs64Bit) ? ss_64.cpsr : ss_32.cpsr - cmp x0, ARM_SAVED_STATE64 - ldr w1, [x21, SS64_CPSR] - ldr w2, [x21, SS32_CPSR] - csel w0, w1, w2, eq - tbnz w0, PSR64_MODE_EL_SHIFT, return_to_kernel // Test for low bit of EL, return to kernel if set + ldr w0, [x21, SS64_CPSR] + tst w0, PSR64_MODE_EL_MASK + b.ne return_to_kernel // return to kernel if M[3:2] > 0 b return_to_user .text .align 2 return_to_kernel: - tbnz w0, #DAIF_IRQF_SHIFT, Lkernel_skip_ast_taken // Skip AST check if IRQ disabled - msr DAIFSet, #(DAIFSC_IRQF | DAIFSC_FIQF) // Disable interrupts - mrs x0, TPIDR_EL1 // Load thread pointer - ldr w1, [x0, ACT_PREEMPT_CNT] // Load preemption count - cbnz x1, Lkernel_skip_ast_taken // If preemption disabled, skip AST check - ldr x1, [x0, ACT_CPUDATAP] // Get current CPU data pointer - ldr x2, [x1, CPU_PENDING_AST] // Get ASTs - tst x2, AST_URGENT // If no urgent ASTs, skip ast_taken - b.eq Lkernel_skip_ast_taken - mov sp, x21 // Switch to thread stack for preemption + tbnz w0, #DAIF_IRQF_SHIFT, exception_return // Skip AST check if IRQ disabled + mrs x3, TPIDR_EL1 // Load thread pointer + ldr w1, [x3, ACT_PREEMPT_CNT] // Load preemption count + msr DAIFSet, #DAIFSC_ALL // Disable exceptions + cbnz x1, exception_return_unint_tpidr_x3 // If preemption disabled, skip AST check + ldr x1, [x3, ACT_CPUDATAP] // Get current CPU data pointer + ldr x2, [x1, CPU_PENDING_AST] // Get ASTs + tst x2, AST_URGENT // If no urgent ASTs, skip ast_taken + b.eq exception_return_unint_tpidr_x3 + mov sp, x21 // Switch to thread stack for preemption PUSH_FRAME - bl EXT(ast_taken_kernel) // Handle AST_URGENT + bl EXT(ast_taken_kernel) // Handle AST_URGENT POP_FRAME -Lkernel_skip_ast_taken: b exception_return .text @@ -870,26 +828,33 @@ LEXT(thread_exception_return) .text return_to_user: check_user_asts: - msr DAIFSet, #(DAIFSC_IRQF | DAIFSC_FIQF) // Disable interrupts mrs x3, TPIDR_EL1 // Load thread pointer movn w2, #0 str w2, [x3, TH_IOTIER_OVERRIDE] // Reset IO tier override to -1 before returning to user +#if MACH_ASSERT ldr w0, [x3, TH_RWLOCK_CNT] - cbz w0, 1f // Detect unbalance RW lock/unlock + cbz w0, 1f // Detect unbalance RW lock/unlock b rwlock_count_notzero 1: + ldr w0, [x3, ACT_PREEMPT_CNT] + cbz w0, 1f + b preempt_count_notzero +1: +#endif - ldr x4, [x3, ACT_CPUDATAP] // Get current CPU data pointer - ldr x0, [x4, CPU_PENDING_AST] // Get ASTs - cbnz x0, user_take_ast // If pending ASTs, go service them + msr DAIFSet, #DAIFSC_ALL // Disable exceptions + ldr x4, [x3, ACT_CPUDATAP] // Get current CPU data pointer + ldr x0, [x4, CPU_PENDING_AST] // Get ASTs + cbnz x0, user_take_ast // If pending ASTs, go service them #if !CONFIG_SKIP_PRECISE_USER_KERNEL_TIME + mov x19, x3 // Preserve thread pointer across function call PUSH_FRAME bl EXT(timer_state_event_kernel_to_user) POP_FRAME - mrs x3, TPIDR_EL1 // Reload thread pointer + mov x3, x19 #endif /* !CONFIG_SKIP_PRECISE_USER_KERNEL_TIME */ #if (CONFIG_KERNEL_INTEGRITY && KERNEL_INTEGRITY_WT) @@ -923,6 +888,7 @@ check_user_asts: ldr x0, [x3, ACT_DEBUGDATA] orr x1, x1, x0 // Thread debug state and live debug state both NULL? cbnz x1, user_set_debug_state_and_return // If one or the other non-null, go set debug state + b exception_return_unint_tpidr_x3 // // Fall through from return_to_user to exception_return. @@ -932,7 +898,9 @@ check_user_asts: exception_return: msr DAIFSet, #DAIFSC_ALL // Disable exceptions +exception_return_unint: mrs x3, TPIDR_EL1 // Load thread pointer +exception_return_unint_tpidr_x3: mov sp, x21 // Reload the pcb pointer /* ARM64_TODO Reserve x18 until we decide what to do with it */ @@ -960,18 +928,42 @@ Lskip_el0_eret_mapping: #endif /* __ARM_KERNEL_PROTECT__ */ Lexception_return_restore_registers: - /* Restore special register state */ - ldr x0, [sp, SS64_PC] // Get the return address - ldr w1, [sp, SS64_CPSR] // Get the return CPSR - ldr w2, [sp, NS64_FPSR] - ldr w3, [sp, NS64_FPCR] - - msr ELR_EL1, x0 // Load the return address into ELR - msr SPSR_EL1, x1 // Load the return CPSR into SPSR - msr FPSR, x2 - msr FPCR, x3 // Synchronized by ERET - mov x0, sp // x0 = &pcb + // Loads authed $x0->ss_64.pc into x1 and $x0->ss_64.cpsr into w2 + AUTH_THREAD_STATE_IN_X0 x20, x21, x22, x23, x24 + +/* Restore special register state */ + ldr w3, [sp, NS64_FPSR] + ldr w4, [sp, NS64_FPCR] + + msr ELR_EL1, x1 // Load the return address into ELR + msr SPSR_EL1, x2 // Load the return CPSR into SPSR + msr FPSR, x3 + msr FPCR, x4 // Synchronized by ERET + +#if defined(HAS_APPLE_PAC) && !(__APCFG_SUPPORTED__ || __APSTS_SUPPORTED__) + /* if eret to userspace, disable JOP */ + tbnz w2, PSR64_MODE_EL_SHIFT, Lskip_disable_jop + adrp x4, EXT(const_boot_args)@page + add x4, x4, EXT(const_boot_args)@pageoff + ldr x4, [x4, BA_BOOT_FLAGS] + and x1, x4, BA_BOOT_FLAGS_DISABLE_JOP + cbnz x1, Lskip_disable_jop // if global JOP disabled, don't touch SCTLR (kernel JOP is already off) + and x1, x4, BA_BOOT_FLAGS_DISABLE_USER_JOP + cbnz x1, Ldisable_jop // if global user JOP disabled, always turn off JOP regardless of thread flag (kernel running with JOP on) + mrs x2, TPIDR_EL1 + ldr x2, [x2, TH_DISABLE_USER_JOP] + cbz x2, Lskip_disable_jop // if thread has JOP enabled, leave it on (kernel running with JOP on) +Ldisable_jop: + MOV64 x1, SCTLR_JOP_KEYS_ENABLED + mrs x4, SCTLR_EL1 + bic x4, x4, x1 + msr SCTLR_EL1, x4 + MOV64 x1, SCTLR_EL1_EXPECTED + cmp x4, x1 + bne . +Lskip_disable_jop: +#endif /* defined(HAS_APPLE_PAC) && !(__APCFG_SUPPORTED__ || __APSTS_SUPPORTED__)*/ /* Restore arm_neon_saved_state64 */ ldp q0, q1, [x0, NS64_Q0] @@ -1001,14 +993,15 @@ Lexception_return_restore_registers: ldp x10, x11, [x0, SS64_X10] ldp x12, x13, [x0, SS64_X12] ldp x14, x15, [x0, SS64_X14] - ldp x16, x17, [x0, SS64_X16] + // Skip x16, x17 - already loaded + authed by AUTH_THREAD_STATE_IN_X0 ldp x18, x19, [x0, SS64_X18] ldp x20, x21, [x0, SS64_X20] ldp x22, x23, [x0, SS64_X22] ldp x24, x25, [x0, SS64_X24] ldp x26, x27, [x0, SS64_X26] ldr x28, [x0, SS64_X28] - ldp fp, lr, [x0, SS64_FP] + ldr fp, [x0, SS64_FP] + // Skip lr - already loaded + authed by AUTH_THREAD_STATE_IN_X0 // Restore stack pointer and our last two GPRs ldr x1, [x0, SS64_SP] @@ -1052,18 +1045,18 @@ user_take_ast: PUSH_FRAME bl EXT(ast_taken_user) // Handle all ASTs, may return via continuation POP_FRAME - mrs x3, TPIDR_EL1 // Reload thread pointer b check_user_asts // Now try again user_set_debug_state_and_return: + + ldr x4, [x3, ACT_CPUDATAP] // Get current CPU data pointer isb // Synchronize context PUSH_FRAME bl EXT(arm_debug_set) // Establish thread debug state in live regs POP_FRAME isb - mrs x3, TPIDR_EL1 // Reload thread pointer - b exception_return // And continue + b exception_return_unint // Continue, reloading the thread pointer .text .align 2 @@ -1077,6 +1070,7 @@ L_underflow_str: .asciz "Preemption count negative on thread %p" .align 2 +#if MACH_ASSERT .text .align 2 rwlock_count_notzero: @@ -1089,6 +1083,21 @@ rwlock_count_notzero: L_rwlock_count_notzero_str: .asciz "RW lock count not 0 on thread %p (%u)" + + .text + .align 2 +preempt_count_notzero: + mrs x0, TPIDR_EL1 + str x0, [sp, #-16]! // We'll print thread pointer + ldr w0, [x0, ACT_PREEMPT_CNT] + str w0, [sp, #8] + adr x0, L_preempt_count_notzero_str // Format string + CALL_EXTERN panic // Game over + +L_preempt_count_notzero_str: + .asciz "preemption count not 0 on thread %p (%u)" +#endif /* MACH_ASSERT */ + .align 2 #if __ARM_KERNEL_PROTECT__ diff --git a/osfmk/arm64/loose_ends.c b/osfmk/arm64/loose_ends.c index 00aae153f..495cc7c03 100644 --- a/osfmk/arm64/loose_ends.c +++ b/osfmk/arm64/loose_ends.c @@ -520,7 +520,7 @@ ml_phys_write_double_64(addr64_t paddr64, unsigned long long data) void setbit(int bitno, int *s) { - s[bitno / INT_SIZE] |= 1 << (bitno % INT_SIZE); + s[bitno / INT_SIZE] |= 1U << (bitno % INT_SIZE); } /* @@ -529,7 +529,7 @@ setbit(int bitno, int *s) void clrbit(int bitno, int *s) { - s[bitno / INT_SIZE] &= ~(1 << (bitno % INT_SIZE)); + s[bitno / INT_SIZE] &= ~(1U << (bitno % INT_SIZE)); } /* @@ -538,7 +538,7 @@ clrbit(int bitno, int *s) int testbit(int bitno, int *s) { - return s[bitno / INT_SIZE] & (1 << (bitno % INT_SIZE)); + return s[bitno / INT_SIZE] & (1U << (bitno % INT_SIZE)); } /* @@ -765,17 +765,18 @@ ml_thread_policy( // kprintf("ml_thread_policy() unimplemented\n"); } +__dead2 void -panic_unimplemented() +panic_unimplemented(void) { panic("Not yet implemented."); } /* ARM64_TODO */ -void abort(void); +void abort(void) __dead2; void -abort() +abort(void) { panic("Abort."); } diff --git a/osfmk/arm64/machine_remote_time.c b/osfmk/arm64/machine_remote_time.c index 095ec6dab..defc45b44 100644 --- a/osfmk/arm64/machine_remote_time.c +++ b/osfmk/arm64/machine_remote_time.c @@ -26,7 +26,7 @@ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ #include -#include +#include #include #include #include @@ -34,6 +34,7 @@ #include #include #include +#include lck_spin_t *bt_spin_lock = NULL; _Atomic uint32_t bt_init_flag = 0; @@ -41,14 +42,14 @@ _Atomic uint32_t bt_init_flag = 0; extern lck_spin_t *ts_conversion_lock; extern void mach_bridge_add_timestamp(uint64_t remote_timestamp, uint64_t local_timestamp); extern void bt_calibration_thread_start(void); +extern void bt_params_add(struct bt_params *params); void mach_bridge_init_timestamp(void) { /* This function should be called only once by the driver * implementing the interrupt handler for receiving timestamps */ - if (bt_init_flag) { - assert(!bt_init_flag); + if (os_atomic_load(&bt_init_flag, relaxed)) { return; } @@ -59,7 +60,7 @@ mach_bridge_init_timestamp(void) bt_spin_lock = lck_spin_alloc_init(bt_lck_grp, NULL); ts_conversion_lock = lck_spin_alloc_init(bt_lck_grp, NULL); - atomic_store(&bt_init_flag, 1); + os_atomic_store(&bt_init_flag, 1, release); /* Start the kernel thread only after all the locks have been initialized */ bt_calibration_thread_start(); @@ -74,8 +75,8 @@ mach_bridge_recv_timestamps(uint64_t remoteTimestamp, uint64_t localTimestamp) assert(ml_at_interrupt_context() == TRUE); /* Ensure the locks have been initialized */ - if (!bt_init_flag) { - assert(bt_init_flag != 0); + if (!os_atomic_load(&bt_init_flag, acquire)) { + panic("%s called before mach_bridge_init_timestamp", __func__); return; } @@ -87,3 +88,27 @@ mach_bridge_recv_timestamps(uint64_t remoteTimestamp, uint64_t localTimestamp) return; } + +/* + * This function is used to set parameters, calculated externally, + * needed for mach_bridge_remote_time. + */ +void +mach_bridge_set_params(uint64_t local_timestamp, uint64_t remote_timestamp, double rate) +{ + /* Ensure the locks have been initialized */ + if (!os_atomic_load(&bt_init_flag, acquire)) { + panic("%s called before mach_bridge_init_timestamp", __func__); + return; + } + + struct bt_params params = {}; + params.base_local_ts = local_timestamp; + params.base_remote_ts = remote_timestamp; + params.rate = rate; + lck_spin_lock(ts_conversion_lock); + bt_params_add(¶ms); + lck_spin_unlock(ts_conversion_lock); + KDBG(MACHDBG_CODE(DBG_MACH_CLOCK, MACH_BRIDGE_TS_PARAMS), params.base_local_ts, + params.base_remote_ts, *(uint64_t *)((void *)¶ms.rate)); +} diff --git a/osfmk/arm64/machine_remote_time.h b/osfmk/arm64/machine_remote_time.h index ee4db2f3b..1083a4b41 100644 --- a/osfmk/arm64/machine_remote_time.h +++ b/osfmk/arm64/machine_remote_time.h @@ -34,6 +34,7 @@ __BEGIN_DECLS void mach_bridge_recv_timestamps(uint64_t bridgeTimestamp, uint64_t localTimestamp); void mach_bridge_init_timestamp(void); +void mach_bridge_set_params(uint64_t local_timestamp, uint64_t remote_timestamp, double rate); __END_DECLS #endif /* MACHINE_ARM64_REMOTE_TIME_H */ diff --git a/osfmk/arm64/machine_routines.c b/osfmk/arm64/machine_routines.c index b426e7eb6..13aca14c1 100644 --- a/osfmk/arm64/machine_routines.c +++ b/osfmk/arm64/machine_routines.c @@ -36,13 +36,16 @@ #include #include #include +#include #include #include +#include #include #include #include #include #include +#include #include #include #include @@ -64,6 +67,7 @@ #endif + static int max_cpus_initialized = 0; #define MAX_CPUS_SET 0x1 #define MAX_CPUS_WAIT 0x2 @@ -76,10 +80,12 @@ boolean_t is_clock_configured = FALSE; uint32_t yield_delay_us = 0; /* Must be less than cpu_idle_latency to ensure ml_delay_should_spin is true */ +#if CONFIG_NONFATAL_ASSERTS extern int mach_assert; +#endif extern volatile uint32_t debug_enabled; -extern vm_offset_t segEXTRADATA; +extern vm_offset_t segLOWEST; extern vm_offset_t segLOWESTTEXT; extern vm_offset_t segLASTB; extern unsigned long segSizeLAST; @@ -108,12 +114,14 @@ void ml_lockdown_run_handler(void); uint32_t get_arm_cpu_version(void); +__dead2 void -ml_cpu_signal(unsigned int cpu_id __unused) +ml_cpu_signal(unsigned int cpu_mpidr __unused) { panic("Platform does not support ACC Fast IPI"); } +__dead2 void ml_cpu_signal_deferred_adjust_timer(uint64_t nanosecs) { @@ -127,14 +135,16 @@ ml_cpu_signal_deferred_get_timer() return 0; } +__dead2 void -ml_cpu_signal_deferred(unsigned int cpu_id __unused) +ml_cpu_signal_deferred(unsigned int cpu_mpidr __unused) { panic("Platform does not support ACC Fast IPI deferral"); } +__dead2 void -ml_cpu_signal_retract(unsigned int cpu_id __unused) +ml_cpu_signal_retract(unsigned int cpu_mpidr __unused) { panic("Platform does not support ACC Fast IPI retraction"); } @@ -142,9 +152,9 @@ ml_cpu_signal_retract(unsigned int cpu_id __unused) void machine_idle(void) { - __asm__ volatile ("msr DAIFSet, %[mask]" ::[mask] "i" (DAIFSC_IRQF | DAIFSC_FIQF)); + __builtin_arm_wsr("DAIFSet", (DAIFSC_IRQF | DAIFSC_FIQF)); Idle_context(); - __asm__ volatile ("msr DAIFClr, %[mask]" ::[mask] "i" (DAIFSC_IRQF | DAIFSC_FIQF)); + __builtin_arm_wsr("DAIFClr", (DAIFSC_IRQF | DAIFSC_FIQF)); } void @@ -234,15 +244,11 @@ user_cont_hwclock_allowed(void) return FALSE; } -/* - * user_timebase_allowed() - * - * Indicates whether we allow EL0 to read the physical timebase (CNTPCT_EL0). - */ -boolean_t -user_timebase_allowed(void) + +uint8_t +user_timebase_type(void) { - return TRUE; + return USER_TIMEBASE_SPEC; } boolean_t @@ -357,9 +363,9 @@ lock_mmu(uint64_t begin, uint64_t end) __builtin_arm_isb(ISB_SY); flush_mmu_tlb(); -#else +#else /* defined(KERNEL_INTEGRITY_KTRR) */ #error KERNEL_INTEGRITY config error -#endif +#endif /* defined(KERNEL_INTEGRITY_KTRR) */ } static void @@ -403,7 +409,7 @@ rorgn_lockdown(void) assert_unlocked(); /* [x] - Use final method of determining all kernel text range or expect crashes */ - ktrr_begin = segEXTRADATA; + ktrr_begin = segLOWEST; assert(ktrr_begin && gVirtBase && gPhysBase); ktrr_begin = kvtophys(ktrr_begin); @@ -451,7 +457,9 @@ machine_startup(__unused boot_args * args) int boot_arg; +#if CONFIG_NONFATAL_ASSERTS PE_parse_boot_argn("assert", &mach_assert, sizeof(mach_assert)); +#endif if (PE_parse_boot_argn("preempt", &boot_arg, sizeof(boot_arg))) { default_preemption_rate = boot_arg; @@ -649,8 +657,8 @@ ml_init_lock_timeout(void) void ml_cpu_up(void) { - hw_atomic_add(&machine_info.physical_cpu, 1); - hw_atomic_add(&machine_info.logical_cpu, 1); + os_atomic_inc(&machine_info.physical_cpu, relaxed); + os_atomic_inc(&machine_info.logical_cpu, relaxed); } /* @@ -662,8 +670,8 @@ ml_cpu_down(void) { cpu_data_t *cpu_data_ptr; - hw_atomic_sub(&machine_info.physical_cpu, 1); - hw_atomic_sub(&machine_info.logical_cpu, 1); + os_atomic_dec(&machine_info.physical_cpu, relaxed); + os_atomic_dec(&machine_info.logical_cpu, relaxed); /* * If we want to deal with outstanding IPIs, we need to @@ -678,6 +686,16 @@ ml_cpu_down(void) */ cpu_data_ptr = getCpuDatap(); cpu_data_ptr->cpu_running = FALSE; + + if (cpu_data_ptr != &BootCpuData) { + /* + * Move all of this cpu's timers to the master/boot cpu, + * and poke it in case there's a sooner deadline for it to schedule. + */ + timer_queue_shutdown(&cpu_data_ptr->rtclock_timer.queue); + cpu_xcall(BootCpuData.cpu_number, &timer_queue_expire_local, NULL); + } + cpu_signal_handler_internal(TRUE); } @@ -1085,7 +1103,7 @@ ml_processor_register(ml_processor_info_t *in_processor_info, #endif /* KPC */ if (!is_boot_cpu) { - early_random_cpu_init(this_cpu_datap->cpu_number); + random_cpu_init(this_cpu_datap->cpu_number); // now let next CPU register itself OSIncrementAtomic((SInt32*)&real_ncpus); } @@ -1164,6 +1182,16 @@ ml_io_map( return io_map(phys_addr, size, VM_WIMG_IO); } +/* Map memory map IO space (with protections specified) */ +vm_offset_t +ml_io_map_with_prot( + vm_offset_t phys_addr, + vm_size_t size, + vm_prot_t prot) +{ + return io_map_with_prot(phys_addr, size, VM_WIMG_IO, prot); +} + vm_offset_t ml_io_map_wcomb( vm_offset_t phys_addr, @@ -1308,9 +1336,6 @@ ml_static_protect( } } } -#ifndef __ARM_L1_PTW__ - FlushPoC_DcacheRegion( trunc_page_32(pte_p), 4 * sizeof(*pte_p)); -#endif } else { ptmp = *pte_p; @@ -1319,10 +1344,6 @@ ml_static_protect( ptmp = (ptmp & ~(ARM_PTE_APMASK | ARM_PTE_PNXMASK | ARM_PTE_NXMASK)) | arm_prot; *pte_p = ptmp; } - -#ifndef __ARM_L1_PTW__ - FlushPoC_DcacheRegion( trunc_page_32(pte_p), sizeof(*pte_p)); -#endif } __unreachable_ok_pop } @@ -1601,9 +1622,8 @@ ml_get_hwclock() // ISB required by ARMV7C.b section B8.1.2 & ARMv8 section D6.1.2 // "Reads of CNTPCT[_EL0] can occur speculatively and out of order relative // to other instructions executed on the same processor." - __asm__ volatile ("isb\n" - "mrs %0, CNTPCT_EL0" - : "=r"(timebase)); + __builtin_arm_isb(ISB_SY); + timebase = __builtin_arm_rsr64("CNTPCT_EL0"); return timebase; } @@ -1678,7 +1698,13 @@ cache_trap_recover() static void set_cache_trap_recover(thread_t thread) { +#if defined(HAS_APPLE_PAC) + thread->recover = (vm_address_t)ptrauth_auth_and_resign(&cache_trap_recover, + ptrauth_key_function_pointer, 0, + ptrauth_key_function_pointer, ptrauth_blend_discriminator(&thread->recover, PAC_DISCRIMINATOR_RECOVER)); +#else /* defined(HAS_APPLE_PAC) */ thread->recover = (vm_address_t)cache_trap_recover; +#endif /* defined(HAS_APPLE_PAC) */ } static void @@ -1742,14 +1768,8 @@ icache_invalidate_trap(vm_map_address_t start, vm_map_size_t size) set_cache_trap_recover(thread); - CleanPoU_DcacheRegion(start, (uint32_t) size); - /* Invalidate iCache to point of unification */ -#if __ARM_IC_NOALIAS_ICACHE__ InvalidatePoU_IcacheRegion(start, (uint32_t)size); -#else - InvalidatePoU_Icache(); -#endif /* Restore recovery function */ thread->recover = old_recover; @@ -1814,7 +1834,7 @@ _enable_timebase_event_stream(uint32_t bit_index) * If the SOC supports it (and it isn't broken), enable * EL0 access to the physical timebase register. */ - if (user_timebase_allowed()) { + if (user_timebase_type() != USER_TIMEBASE_NONE) { cntkctl |= CNTKCTL_EL1_PL0PCTEN; } @@ -1832,6 +1852,8 @@ _enable_virtual_timer(void) __asm__ volatile ("msr CNTP_CTL_EL0, %0" : : "r"(cntvctl)); } +uint64_t events_per_sec = 0; + void fiq_context_init(boolean_t enable_fiq __unused) { @@ -1847,16 +1869,10 @@ fiq_context_bootstrap(boolean_t enable_fiq) { #if defined(APPLE_ARM64_ARCH_FAMILY) || defined(BCM2837) /* Could fill in our own ops here, if we needed them */ - uint64_t ticks_per_sec, ticks_per_event, events_per_sec; + uint64_t ticks_per_sec, ticks_per_event; uint32_t bit_index; ticks_per_sec = gPEClockFrequencyInfo.timebase_frequency_hz; -#if defined(ARM_BOARD_WFE_TIMEOUT_NS) - events_per_sec = 1000000000 / ARM_BOARD_WFE_TIMEOUT_NS; -#else - /* Default to 1usec (or as close as we can get) */ - events_per_sec = 1000000; -#endif ticks_per_event = ticks_per_sec / events_per_sec; bit_index = flsll(ticks_per_event) - 1; /* Highest bit set */ @@ -1988,7 +2004,7 @@ timer_state_event_kernel_to_user(void) * The following are required for parts of the kernel * that cannot resolve these functions as inlines: */ -extern thread_t current_act(void); +extern thread_t current_act(void) __attribute__((const)); thread_t current_act(void) { @@ -1996,7 +2012,7 @@ current_act(void) } #undef current_thread -extern thread_t current_thread(void); +extern thread_t current_thread(void) __attribute__((const)); thread_t current_thread(void) { @@ -2057,3 +2073,59 @@ ex_cb_invoke( return EXCB_ACTION_NONE; } +#if defined(HAS_APPLE_PAC) +void +ml_task_set_disable_user_jop(task_t task, boolean_t disable_user_jop) +{ + assert(task); + task->disable_user_jop = disable_user_jop; +} + +void +ml_thread_set_disable_user_jop(thread_t thread, boolean_t disable_user_jop) +{ + assert(thread); + thread->machine.disable_user_jop = disable_user_jop; +} + +void +ml_task_set_rop_pid(task_t task, task_t parent_task, boolean_t inherit) +{ + if (inherit) { + task->rop_pid = parent_task->rop_pid; + } else { + task->rop_pid = early_random(); + } +} +#endif /* defined(HAS_APPLE_PAC) */ + + +#if defined(HAS_APPLE_PAC) + +/* + * ml_auth_ptr_unchecked: call this instead of ptrauth_auth_data + * instrinsic when you don't want to trap on auth fail. + * + */ + +void * +ml_auth_ptr_unchecked(void *ptr, ptrauth_key key, uint64_t modifier) +{ + switch (key & 0x3) { + case ptrauth_key_asia: + asm volatile ("autia %[ptr], %[modifier]" : [ptr] "+r"(ptr) : [modifier] "r"(modifier)); + break; + case ptrauth_key_asib: + asm volatile ("autib %[ptr], %[modifier]" : [ptr] "+r"(ptr) : [modifier] "r"(modifier)); + break; + case ptrauth_key_asda: + asm volatile ("autda %[ptr], %[modifier]" : [ptr] "+r"(ptr) : [modifier] "r"(modifier)); + break; + case ptrauth_key_asdb: + asm volatile ("autdb %[ptr], %[modifier]" : [ptr] "+r"(ptr) : [modifier] "r"(modifier)); + break; + } + + return ptr; +} +#endif /* defined(HAS_APPLE_PAC) */ diff --git a/osfmk/arm64/machine_routines_asm.h b/osfmk/arm64/machine_routines_asm.h new file mode 100644 index 000000000..7f5f8ed29 --- /dev/null +++ b/osfmk/arm64/machine_routines_asm.h @@ -0,0 +1,94 @@ +/* + * Copyright (c) 2019 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#include +#include "assym.s" + +#ifndef __ASSEMBLER__ +#error "This header should only be used in .s files" +#endif + +/** + * Loads the following values from the thread_kernel_state pointer in x0: + * + * x1: $x0->ss_64.pc + * w2: $x0->ss_64.cpsr + * x16: $x0->ss_64.x16 + * x17: $x0->ss_64.x17 + * lr: $x0->ss_64.lr + * + * On CPUs with PAC support, this macro will auth the above values with ml_check_signed_state(). + * + * arg0 - scratch register 1 + * arg1 - scratch register 2 + * arg2 - scratch register 3 + * arg3 - scratch register 4 + * arg4 - scratch register 5 + */ +/* BEGIN IGNORE CODESTYLE */ +.macro AUTH_THREAD_STATE_IN_X0 + ldr x1, [x0, SS64_PC] + ldr w2, [x0, SS64_CPSR] + ldp x16, x17, [x0, SS64_X16] + +#if defined(HAS_APPLE_PAC) + // Save x3-x5 to preserve across call + mov $2, x3 + mov $3, x4 + mov $4, x5 + + /* + * Arg0: The ARM context pointer (already in x0) + * Arg1: PC to check (loaded above) + * Arg2: CPSR to check (loaded above) + * Arg3: the LR to check + * + * Stash saved state PC and CPSR in other registers to avoid reloading potentially unauthed + * values from memory. (ml_check_signed_state will clobber x1 and x2.) + */ + mov $0, x1 + mov $1, x2 + ldr x3, [x0, SS64_LR] + mov x4, x16 + mov x5, x17 + bl EXT(ml_check_signed_state) + mov x1, $0 + mov x2, $1 + + // LR was already loaded/authed earlier, if we reload it we might be loading a potentially unauthed value + mov lr, x3 + mov x3, $2 + mov x4, $3 + mov x5, $4 +#else + ldr lr, [x0, SS64_LR] +#endif /* defined(HAS_APPLE_PAC) */ +.endmacro +/* END IGNORE CODESTYLE */ + +/* vim: set ft=asm: */ diff --git a/osfmk/arm64/machine_routines_asm.s b/osfmk/arm64/machine_routines_asm.s index 08756dc8d..64fd61152 100644 --- a/osfmk/arm64/machine_routines_asm.s +++ b/osfmk/arm64/machine_routines_asm.s @@ -35,6 +35,30 @@ #include "assym.s" +#if defined(HAS_APPLE_PAC) +/* + * void + * ml_set_kernelkey_enabled(boolean_t enable) + * + * Toggle pointer auth kernel domain key diversification. Assembly to prevent compiler reordering. + * + */ + + .align 2 + .globl EXT(ml_set_kernelkey_enabled) +LEXT(ml_set_kernelkey_enabled) + mrs x1, ARM64_REG_APCTL_EL1 + orr x2, x1, #APCTL_EL1_KernKeyEn + and x1, x1, #~APCTL_EL1_KernKeyEn + cmp w0, #0 + csel x1, x1, x2, eq + msr ARM64_REG_APCTL_EL1, x1 + isb + ret + +#endif /* defined(HAS_APPLE_PAC) */ + + /* uint32_t get_fpscr(void): * Returns (FPSR | FPCR). @@ -131,369 +155,6 @@ Lupdate_mdscr_panic_str: .asciz "MDSCR.KDE was set" -#if __ARM_KERNEL_PROTECT__ -/* - * __ARM_KERNEL_PROTECT__ adds two complications to TLB management: - * - * 1. As each pmap has two ASIDs, every TLB operation that targets an ASID must - * target both ASIDs for the pmap that owns the target ASID. - * - * 2. Any TLB operation targeting the kernel_pmap ASID (ASID 0) must target all - * ASIDs (as kernel_pmap mappings may be referenced while using an ASID that - * belongs to another pmap). We expect these routines to be called with the - * EL0 ASID for the target; not the EL1 ASID. - */ -#endif /* __ARM_KERNEL_PROTECT__ */ - -.macro SYNC_TLB_FLUSH - dsb ish - isb sy -.endmacro - - -/* - * void sync_tlb_flush(void) - * - * Synchronize one or more prior TLB flush operations - */ - .text - .align 2 - .globl EXT(sync_tlb_flush) -LEXT(sync_tlb_flush) - SYNC_TLB_FLUSH - ret - - -.macro FLUSH_MMU_TLB - tlbi vmalle1is -.endmacro -/* - * void flush_mmu_tlb_async(void) - * - * Flush all TLBs, don't wait for completion - */ - .text - .align 2 - .globl EXT(flush_mmu_tlb_async) -LEXT(flush_mmu_tlb_async) - FLUSH_MMU_TLB - ret - -/* - * void flush_mmu_tlb(void) - * - * Flush all TLBs - */ - .text - .align 2 - .globl EXT(flush_mmu_tlb) -LEXT(flush_mmu_tlb) - FLUSH_MMU_TLB - SYNC_TLB_FLUSH - ret - -.macro FLUSH_CORE_TLB - tlbi vmalle1 -.endmacro - -/* - * void flush_core_tlb_async(void) - * - * Flush local core TLB, don't wait for completion - */ - .text - .align 2 - .globl EXT(flush_core_tlb_async) -LEXT(flush_core_tlb_async) - FLUSH_CORE_TLB - ret - -/* - * void flush_core_tlb(void) - * - * Flush local core TLB - */ - .text - .align 2 - .globl EXT(flush_core_tlb) -LEXT(flush_core_tlb) - FLUSH_CORE_TLB - SYNC_TLB_FLUSH - ret - -.macro FLUSH_MMU_TLB_ALLENTRIES -#if __ARM_16K_PG__ - and x0, x0, #~0x3 - - /* - * The code below is not necessarily correct. From an overview of - * the client code, the expected contract for TLB flushes is that - * we will expand from an "address, length" pair to "start address, - * end address" in the course of a TLB flush. This suggests that - * a flush for "X, X+4" is actually only asking for a flush of a - * single 16KB page. At the same time, we'd like to be prepared - * for bad inputs (X, X+3), so add 3 and then truncate the 4KB page - * number to a 16KB page boundary. This should deal correctly with - * unaligned inputs. - * - * If our expecations about client behavior are wrong however, this - * will lead to occasional TLB corruption on platforms with 16KB - * pages. - */ - add x1, x1, #0x3 - and x1, x1, #~0x3 -#endif -1: // Lflush_mmu_tlb_allentries_loop: - tlbi vaae1is, x0 - add x0, x0, #(ARM_PGBYTES / 4096) // Units are 4KB pages, as defined by the ISA - cmp x0, x1 - b.lt 1b // Lflush_mmu_tlb_allentries_loop -.endmacro - -/* - * void flush_mmu_tlb_allentries_async(uint64_t, uint64_t) - * - * Flush TLB entries, don't wait for completion - */ - .text - .align 2 - .globl EXT(flush_mmu_tlb_allentries_async) -LEXT(flush_mmu_tlb_allentries_async) - FLUSH_MMU_TLB_ALLENTRIES - ret - -/* - * void flush_mmu_tlb_allentries(uint64_t, uint64_t) - * - * Flush TLB entries - */ - .globl EXT(flush_mmu_tlb_allentries) -LEXT(flush_mmu_tlb_allentries) - FLUSH_MMU_TLB_ALLENTRIES - SYNC_TLB_FLUSH - ret - -.macro FLUSH_MMU_TLB_ENTRY -#if __ARM_KERNEL_PROTECT__ - /* - * If we are flushing ASID 0, this is a kernel operation. With this - * ASID scheme, this means we should flush all ASIDs. - */ - lsr x2, x0, #TLBI_ASID_SHIFT - cmp x2, #0 - b.eq 1f // Lflush_mmu_tlb_entry_globally - - bic x0, x0, #(1 << TLBI_ASID_SHIFT) - tlbi vae1is, x0 - orr x0, x0, #(1 << TLBI_ASID_SHIFT) -#endif /* __ARM_KERNEL_PROTECT__ */ - tlbi vae1is, x0 -#if __ARM_KERNEL_PROTECT__ - b 2f // Lflush_mmu_tlb_entry_done -1: // Lflush_mmu_tlb_entry_globally: - tlbi vaae1is, x0 -2: // Lflush_mmu_tlb_entry_done -#endif /* __ARM_KERNEL_PROTECT__ */ -.endmacro -/* - * void flush_mmu_tlb_entry_async(uint64_t) - * - * Flush TLB entry, don't wait for completion - */ - .text - .align 2 - .globl EXT(flush_mmu_tlb_entry_async) -LEXT(flush_mmu_tlb_entry_async) - FLUSH_MMU_TLB_ENTRY - ret - -/* - * void flush_mmu_tlb_entry(uint64_t) - * - * Flush TLB entry - */ - .text - .align 2 - .globl EXT(flush_mmu_tlb_entry) -LEXT(flush_mmu_tlb_entry) - FLUSH_MMU_TLB_ENTRY - SYNC_TLB_FLUSH - ret - -.macro FLUSH_MMU_TLB_ENTRIES -#if __ARM_16K_PG__ - and x0, x0, #~0x3 - - /* - * The code below is not necessarily correct. From an overview of - * the client code, the expected contract for TLB flushes is that - * we will expand from an "address, length" pair to "start address, - * end address" in the course of a TLB flush. This suggests that - * a flush for "X, X+4" is actually only asking for a flush of a - * single 16KB page. At the same time, we'd like to be prepared - * for bad inputs (X, X+3), so add 3 and then truncate the 4KB page - * number to a 16KB page boundary. This should deal correctly with - * unaligned inputs. - * - * If our expecations about client behavior are wrong however, this - * will lead to occasional TLB corruption on platforms with 16KB - * pages. - */ - add x1, x1, #0x3 - and x1, x1, #~0x3 -#endif /* __ARM_16K_PG__ */ -#if __ARM_KERNEL_PROTECT__ - /* - * If we are flushing ASID 0, this is a kernel operation. With this - * ASID scheme, this means we should flush all ASIDs. - */ - lsr x2, x0, #TLBI_ASID_SHIFT - cmp x2, #0 - b.eq 2f // Lflush_mmu_tlb_entries_globally_loop - - bic x0, x0, #(1 << TLBI_ASID_SHIFT) -#endif /* __ARM_KERNEL_PROTECT__ */ -1: // Lflush_mmu_tlb_entries_loop - tlbi vae1is, x0 -#if __ARM_KERNEL_PROTECT__ - orr x0, x0, #(1 << TLBI_ASID_SHIFT) - tlbi vae1is, x0 - bic x0, x0, #(1 << TLBI_ASID_SHIFT) -#endif /* __ARM_KERNEL_PROTECT__ */ - add x0, x0, #(ARM_PGBYTES / 4096) // Units are pages - cmp x0, x1 - b.lt 1b // Lflush_mmu_tlb_entries_loop -#if __ARM_KERNEL_PROTECT__ - b 3f // Lflush_mmu_tlb_entries_done -2: // Lflush_mmu_tlb_entries_globally_loop: - tlbi vaae1is, x0 - add x0, x0, #(ARM_PGBYTES / 4096) // Units are pages - cmp x0, x1 - b.lt 2b // Lflush_mmu_tlb_entries_globally_loop -3: // Lflush_mmu_tlb_entries_done -#endif /* __ARM_KERNEL_PROTECT__ */ -.endmacro - -/* - * void flush_mmu_tlb_entries_async(uint64_t, uint64_t) - * - * Flush TLB entries, don't wait for completion - */ - .text - .align 2 - .globl EXT(flush_mmu_tlb_entries_async) -LEXT(flush_mmu_tlb_entries_async) - FLUSH_MMU_TLB_ENTRIES - ret - -/* - * void flush_mmu_tlb_entries(uint64_t, uint64_t) - * - * Flush TLB entries - */ - .text - .align 2 - .globl EXT(flush_mmu_tlb_entries) -LEXT(flush_mmu_tlb_entries) - FLUSH_MMU_TLB_ENTRIES - SYNC_TLB_FLUSH - ret - -.macro FLUSH_MMU_TLB_ASID -#if __ARM_KERNEL_PROTECT__ - /* - * If we are flushing ASID 0, this is a kernel operation. With this - * ASID scheme, this means we should flush all ASIDs. - */ - lsr x1, x0, #TLBI_ASID_SHIFT - cmp x1, #0 - b.eq 1f // Lflush_mmu_tlb_globally - - bic x0, x0, #(1 << TLBI_ASID_SHIFT) - tlbi aside1is, x0 - orr x0, x0, #(1 << TLBI_ASID_SHIFT) -#endif /* __ARM_KERNEL_PROTECT__ */ - tlbi aside1is, x0 -#if __ARM_KERNEL_PROTECT__ - b 2f // Lflush_mmu_tlb_asid_done -1: // Lflush_mmu_tlb_globally: - tlbi vmalle1is -2: // Lflush_mmu_tlb_asid_done: -#endif /* __ARM_KERNEL_PROTECT__ */ -.endmacro - -/* - * void flush_mmu_tlb_asid_async(uint64_t) - * - * Flush TLB entriesfor requested asid, don't wait for completion - */ - .text - .align 2 - .globl EXT(flush_mmu_tlb_asid_async) -LEXT(flush_mmu_tlb_asid_async) - FLUSH_MMU_TLB_ASID - ret - -/* - * void flush_mmu_tlb_asid(uint64_t) - * - * Flush TLB entriesfor requested asid - */ - .text - .align 2 - .globl EXT(flush_mmu_tlb_asid) -LEXT(flush_mmu_tlb_asid) - FLUSH_MMU_TLB_ASID - SYNC_TLB_FLUSH - ret - -.macro FLUSH_CORE_TLB_ASID -#if __ARM_KERNEL_PROTECT__ - /* - * If we are flushing ASID 0, this is a kernel operation. With this - * ASID scheme, this means we should flush all ASIDs. - */ - lsr x1, x0, #TLBI_ASID_SHIFT - cmp x1, #0 - b.eq 1f // Lflush_core_tlb_asid_globally - - bic x0, x0, #(1 << TLBI_ASID_SHIFT) - tlbi aside1, x0 - orr x0, x0, #(1 << TLBI_ASID_SHIFT) -#endif /* __ARM_KERNEL_PROTECT__ */ - tlbi aside1, x0 -#if __ARM_KERNEL_PROTECT__ - b 2f // Lflush_core_tlb_asid_done -1: // Lflush_core_tlb_asid_globally: - tlbi vmalle1 -2: // Lflush_core_tlb_asid_done: -#endif /* __ARM_KERNEL_PROTECT__ */ -.endmacro - -/* - * void flush_core_tlb_asid_async(uint64_t) - * - * Flush TLB entries for core for requested asid, don't wait for completion - */ - .text - .align 2 - .globl EXT(flush_core_tlb_asid_async) -LEXT(flush_core_tlb_asid_async) - FLUSH_CORE_TLB_ASID - ret -/* - * void flush_core_tlb_asid(uint64_t) - * - * Flush TLB entries for core for requested asid - */ - .text - .align 2 - .globl EXT(flush_core_tlb_asid) -LEXT(flush_core_tlb_asid) - FLUSH_CORE_TLB_ASID - SYNC_TLB_FLUSH - ret - /* * Set MMU Translation Table Base Alternate */ @@ -566,7 +227,7 @@ LEXT(set_tcr) cbnz x1, L_set_tcr_panic #if defined(KERNEL_INTEGRITY_KTRR) mov x1, lr - bl _pinst_set_tcr + bl EXT(pinst_set_tcr) mov lr, x1 #else msr TCR_EL1, x0 @@ -598,7 +259,7 @@ L_set_locked_reg_panic_str: #else #if defined(KERNEL_INTEGRITY_KTRR) mov x1, lr - bl _pinst_set_tcr + bl EXT(pinst_set_tcr) mov lr, x1 #else msr TCR_EL1, x0 @@ -683,6 +344,11 @@ L_mmu_kvtop_wpreflight_invalid: mrs $0, TPIDR_EL1 // Load thread pointer adrp $2, $3@page // Load the recovery handler address add $2, $2, $3@pageoff +#if defined(HAS_APPLE_PAC) + add $1, $0, TH_RECOVER + movk $1, #PAC_DISCRIMINATOR_RECOVER, lsl 48 + pacia $2, $1 // Sign with IAKey + blended discriminator +#endif ldr $1, [$0, TH_RECOVER] // Save previous recovery handler str $2, [$0, TH_RECOVER] // Set new signed recovery handler @@ -744,35 +410,94 @@ LEXT(_bcopyin) ARM64_STACK_EPILOG /* - * int _copyin_word(const char *src, uint64_t *dst, vm_size_t len) + * int _copyin_atomic32(const char *src, uint32_t *dst) */ .text .align 2 - .globl EXT(_copyin_word) -LEXT(_copyin_word) + .globl EXT(_copyin_atomic32) +LEXT(_copyin_atomic32) ARM64_STACK_PROLOG PUSH_FRAME SET_RECOVERY_HANDLER x10, x11, x3, copyio_error - cmp x2, #4 - b.eq L_copyin_word_4 - cmp x2, #8 - b.eq L_copyin_word_8 - mov x0, EINVAL - b L_copying_exit -L_copyin_word_4: ldr w8, [x0] - b L_copyin_word_store -L_copyin_word_8: + str w8, [x1] + mov x0, #0 + CLEAR_RECOVERY_HANDLER x10, x11 + POP_FRAME + ARM64_STACK_EPILOG + +/* + * int _copyin_atomic32_wait_if_equals(const char *src, uint32_t value) + */ + .text + .align 2 + .globl EXT(_copyin_atomic32_wait_if_equals) +LEXT(_copyin_atomic32_wait_if_equals) + ARM64_STACK_PROLOG + PUSH_FRAME + SET_RECOVERY_HANDLER x10, x11, x3, copyio_error + ldxr w8, [x0] + cmp w8, w1 + mov x0, ESTALE + b.ne 1f + mov x0, #0 + wfe +1: + clrex + CLEAR_RECOVERY_HANDLER x10, x11 + POP_FRAME + ARM64_STACK_EPILOG + +/* + * int _copyin_atomic64(const char *src, uint32_t *dst) + */ + .text + .align 2 + .globl EXT(_copyin_atomic64) +LEXT(_copyin_atomic64) + ARM64_STACK_PROLOG + PUSH_FRAME + SET_RECOVERY_HANDLER x10, x11, x3, copyio_error ldr x8, [x0] -L_copyin_word_store: str x8, [x1] mov x0, #0 CLEAR_RECOVERY_HANDLER x10, x11 -L_copying_exit: POP_FRAME ARM64_STACK_EPILOG +/* + * int _copyout_atomic32(uint32_t value, char *dst) + */ + .text + .align 2 + .globl EXT(_copyout_atomic32) +LEXT(_copyout_atomic32) + ARM64_STACK_PROLOG + PUSH_FRAME + SET_RECOVERY_HANDLER x10, x11, x3, copyio_error + str w0, [x1] + mov x0, #0 + CLEAR_RECOVERY_HANDLER x10, x11 + POP_FRAME + ARM64_STACK_EPILOG + +/* + * int _copyout_atomic64(uint64_t value, char *dst) + */ + .text + .align 2 + .globl EXT(_copyout_atomic64) +LEXT(_copyout_atomic64) + ARM64_STACK_PROLOG + PUSH_FRAME + SET_RECOVERY_HANDLER x10, x11, x3, copyio_error + str x0, [x1] + mov x0, #0 + CLEAR_RECOVERY_HANDLER x10, x11 + POP_FRAME + ARM64_STACK_EPILOG + /* * int _bcopyout(const char *src, char *dst, vm_size_t len) @@ -825,6 +550,11 @@ LEXT(_bcopyinstr) mrs x10, TPIDR_EL1 // Get thread pointer ldr x11, [x10, TH_RECOVER] // Save previous recover +#if defined(HAS_APPLE_PAC) + add x5, x10, TH_RECOVER // Sign new pointer with IAKey + blended discriminator + movk x5, #PAC_DISCRIMINATOR_RECOVER, lsl 48 + pacia x4, x5 +#endif str x4, [x10, TH_RECOVER] // Store new recover mov x4, #0 // x4 - total bytes copied @@ -940,8 +670,8 @@ LEXT(arm_debug_set_cp14) LEXT(arm64_prepare_for_sleep) PUSH_FRAME -#if defined(APPLECYCLONE) || defined(APPLETYPHOON) - // CPU1 Stuck in WFIWT Because of MMU Prefetch +#if defined(APPLETYPHOON) + // mrs x0, ARM64_REG_HID2 // Read HID2 orr x0, x0, #(ARM64_REG_HID2_disMMUmtlbPrefetch) // Set HID.DisableMTLBPrefetch msr ARM64_REG_HID2, x0 // Write HID2 @@ -1022,16 +752,16 @@ LEXT(arm64_force_wfi_clock_gate) -#if defined(APPLECYCLONE) || defined(APPLETYPHOON) +#if defined(APPLETYPHOON) .text .align 2 - .globl EXT(cyclone_typhoon_prepare_for_wfi) + .globl EXT(typhoon_prepare_for_wfi) -LEXT(cyclone_typhoon_prepare_for_wfi) +LEXT(typhoon_prepare_for_wfi) PUSH_FRAME - // CPU1 Stuck in WFIWT Because of MMU Prefetch + // mrs x0, ARM64_REG_HID2 // Read HID2 orr x0, x0, #(ARM64_REG_HID2_disMMUmtlbPrefetch) // Set HID.DisableMTLBPrefetch msr ARM64_REG_HID2, x0 // Write HID2 @@ -1044,11 +774,11 @@ LEXT(cyclone_typhoon_prepare_for_wfi) .text .align 2 - .globl EXT(cyclone_typhoon_return_from_wfi) -LEXT(cyclone_typhoon_return_from_wfi) + .globl EXT(typhoon_return_from_wfi) +LEXT(typhoon_return_from_wfi) PUSH_FRAME - // CPU1 Stuck in WFIWT Because of MMU Prefetch + // mrs x0, ARM64_REG_HID2 // Read HID2 mov x1, #(ARM64_REG_HID2_disMMUmtlbPrefetch) // bic x0, x0, x1 // Clear HID.DisableMTLBPrefetchMTLBPrefetch @@ -1204,7 +934,7 @@ LEXT(arm64_replace_bootstack) // Set SP_EL1 to exception stack #if defined(KERNEL_INTEGRITY_KTRR) mov x1, lr - bl _pinst_spsel_1 + bl EXT(pinst_spsel_1) mov lr, x1 #else msr SPSel, #1 @@ -1233,5 +963,84 @@ LEXT(monitor_call) ret #endif +#ifdef HAS_APPLE_PAC +/** + * void ml_sign_thread_state(arm_saved_state_t *ss, uint64_t pc, + * uint32_t cpsr, uint64_t lr, uint64_t x16, + * uint64_t x17) + */ + .text + .align 2 + .globl EXT(ml_sign_thread_state) +LEXT(ml_sign_thread_state) + pacga x1, x1, x0 /* PC hash (gkey + &arm_saved_state) */ + /* + * Mask off the carry flag so we don't need to re-sign when that flag is + * touched by the system call return path. + */ + bic x2, x2, PSR_CF + pacga x1, x2, x1 /* SPSR hash (gkey + pc hash) */ + pacga x1, x3, x1 /* LR Hash (gkey + spsr hash) */ + pacga x1, x4, x1 /* X16 hash (gkey + lr hash) */ + pacga x1, x5, x1 /* X17 hash (gkey + x16 hash) */ + str x1, [x0, SS64_JOPHASH] + ret + +/** + * void ml_check_signed_state(arm_saved_state_t *ss, uint64_t pc, + * uint32_t cpsr, uint64_t lr, uint64_t x16, + * uint64_t x17) + */ + .text + .align 2 + .globl EXT(ml_check_signed_state) +LEXT(ml_check_signed_state) + pacga x1, x1, x0 /* PC hash (gkey + &arm_saved_state) */ + /* + * Mask off the carry flag so we don't need to re-sign when that flag is + * touched by the system call return path. + */ + bic x2, x2, PSR_CF + pacga x1, x2, x1 /* SPSR hash (gkey + pc hash) */ + pacga x1, x3, x1 /* LR Hash (gkey + spsr hash) */ + pacga x1, x4, x1 /* X16 hash (gkey + lr hash) */ + pacga x1, x5, x1 /* X17 hash (gkey + x16 hash) */ + ldr x2, [x0, SS64_JOPHASH] + cmp x1, x2 + b.ne Lcheck_hash_panic + ret +Lcheck_hash_panic: + mov x1, x0 + adr x0, Lcheck_hash_str + CALL_EXTERN panic_with_thread_kernel_state +Lcheck_hash_str: + .asciz "JOP Hash Mismatch Detected (PC, CPSR, or LR corruption)" +#endif /* HAS_APPLE_PAC */ + + .text + .align 2 + .globl EXT(fill32_dczva) +LEXT(fill32_dczva) +0: + dc zva, x0 + add x0, x0, #64 + subs x1, x1, #64 + b.hi 0b + ret + + .text + .align 2 + .globl EXT(fill32_nt) +LEXT(fill32_nt) + dup.4s v0, w2 +0: + stnp q0, q0, [x0] + stnp q0, q0, [x0, #0x20] + stnp q0, q0, [x0, #0x40] + stnp q0, q0, [x0, #0x60] + add x0, x0, #128 + subs x1, x1, #128 + b.hi 0b + ret /* vim: set sw=4 ts=4: */ diff --git a/osfmk/arm64/monotonic_arm64.c b/osfmk/arm64/monotonic_arm64.c index 878c87973..25895247f 100644 --- a/osfmk/arm64/monotonic_arm64.c +++ b/osfmk/arm64/monotonic_arm64.c @@ -284,6 +284,11 @@ core_idle(__unused cpu_data_t *cpu) #pragma mark common hooks +void +mt_early_init(void) +{ +} + void mt_cpu_idle(cpu_data_t *cpu) { @@ -332,15 +337,26 @@ mt_wake_per_core(void) { } +uint64_t +mt_count_pmis(void) +{ + uint64_t npmis = 0; + int max_cpu = ml_get_max_cpu_number(); + for (int i = 0; i <= max_cpu; i++) { + cpu_data_t *cpu = (cpu_data_t *)CpuDataEntries[i].cpu_data_vaddr; + npmis += cpu->cpu_monotonic.mtc_npmis; + } + return npmis; +} + static void mt_cpu_pmi(cpu_data_t *cpu, uint64_t pmcr0) { assert(cpu != NULL); assert(ml_get_interrupts_enabled() == FALSE); - os_atomic_inc(&mt_pmis, relaxed); - cpu->cpu_stat.pmi_cnt++; - cpu->cpu_stat.pmi_cnt_wake++; + cpu->cpu_monotonic.mtc_npmis += 1; + cpu->cpu_stat.pmi_cnt_wake += 1; #if MONOTONIC_DEBUG if (!PMCR0_PMI(pmcr0)) { @@ -444,7 +460,7 @@ mt_microstackshot_start_remote(__unused void *arg) core_set_enabled(); - if (hw_atomic_sub(&mt_xc_sync, 1) == 0) { + if (os_atomic_dec(&mt_xc_sync, relaxed) == 0) { thread_wakeup((event_t)&mt_xc_sync); } } diff --git a/osfmk/arm64/pcb.c b/osfmk/arm64/pcb.c index 5904e612f..4303f45fe 100644 --- a/osfmk/arm64/pcb.c +++ b/osfmk/arm64/pcb.c @@ -61,7 +61,6 @@ #include - #define USER_SS_ZONE_ALLOC_SIZE (0x4000) extern int debug_task; @@ -70,7 +69,7 @@ zone_t ads_zone; /* zone for debug_state area */ zone_t user_ss_zone; /* zone for user arm_context_t allocations */ /* - * Routine: consider_machine_collect + * Routine: consider_machine_collect * */ void @@ -80,7 +79,7 @@ consider_machine_collect(void) } /* - * Routine: consider_machine_adjust + * Routine: consider_machine_adjust * */ void @@ -88,22 +87,22 @@ consider_machine_adjust(void) { } + /* - * Routine: machine_switch_context + * Routine: machine_switch_context * */ thread_t -machine_switch_context( - thread_t old, - thread_continue_t continuation, - thread_t new) +machine_switch_context(thread_t old, + thread_continue_t continuation, + thread_t new) { thread_t retval; - pmap_t new_pmap; - cpu_data_t *cpu_data_ptr; + pmap_t new_pmap; + cpu_data_t * cpu_data_ptr; -#define machine_switch_context_kprintf(x...) /* kprintf("machine_switch_con - * text: " x) */ +#define machine_switch_context_kprintf(x...) \ + /* kprintf("machine_switch_context: " x) */ cpu_data_ptr = getCpuDatap(); if (old == new) @@ -112,10 +111,12 @@ machine_switch_context( kpc_off_cpu(old); + new_pmap = new->map->pmap; if (old->map->pmap != new_pmap) pmap_switch(new_pmap); + new->machine.CpuDatap = cpu_data_ptr; /* TODO: Should this be ordered? */ @@ -130,19 +131,25 @@ machine_switch_context( return retval; } +boolean_t +machine_thread_on_core(thread_t thread) +{ + return thread->machine.machine_thread_flags & MACHINE_THREAD_FLAGS_ON_CPU; +} + /* - * Routine: machine_thread_create + * Routine: machine_thread_create * */ kern_return_t -machine_thread_create( - thread_t thread, - task_t task) +machine_thread_create(thread_t thread, + task_t task) { arm_context_t *thread_user_ss = NULL; kern_return_t result = KERN_SUCCESS; -#define machine_thread_create_kprintf(x...) /* kprintf("machine_thread_create: " x) */ +#define machine_thread_create_kprintf(x...) \ + /* kprintf("machine_thread_create: " x) */ machine_thread_create_kprintf("thread = %x\n", thread); @@ -152,6 +159,10 @@ machine_thread_create( thread->machine.preemption_count = 0; thread->machine.cthread_self = 0; thread->machine.cthread_data = 0; +#if defined(HAS_APPLE_PAC) + thread->machine.rop_pid = task->rop_pid; + thread->machine.disable_user_jop = task->disable_user_jop; +#endif if (task != kernel_task) { @@ -159,7 +170,8 @@ machine_thread_create( thread->machine.contextData = (arm_context_t *)zalloc(user_ss_zone); if (!thread->machine.contextData) { - return KERN_FAILURE; + result = KERN_FAILURE; + goto done; } thread->machine.upcb = &thread->machine.contextData->ss; @@ -176,34 +188,38 @@ machine_thread_create( thread->machine.uNeon->nsh.flavor = ARM_NEON_SAVED_STATE32; thread->machine.uNeon->nsh.count = ARM_NEON_SAVED_STATE32_COUNT; } + } else { thread->machine.upcb = NULL; thread->machine.uNeon = NULL; thread->machine.contextData = NULL; } - bzero(&thread->machine.perfctrl_state, sizeof(thread->machine.perfctrl_state)); + bzero(&thread->machine.perfctrl_state, sizeof(thread->machine.perfctrl_state)); result = machine_thread_state_initialize(thread); +done: if (result != KERN_SUCCESS) { thread_user_ss = thread->machine.contextData; - thread->machine.upcb = NULL; - thread->machine.uNeon = NULL; - thread->machine.contextData = NULL; - zfree(user_ss_zone, thread_user_ss); + + if (thread_user_ss) { + thread->machine.upcb = NULL; + thread->machine.uNeon = NULL; + thread->machine.contextData = NULL; + zfree(user_ss_zone, thread_user_ss); + } } return result; } /* - * Routine: machine_thread_destroy + * Routine: machine_thread_destroy * */ void -machine_thread_destroy( - thread_t thread) +machine_thread_destroy(thread_t thread) { arm_context_t *thread_user_ss; @@ -213,6 +229,8 @@ machine_thread_destroy( thread->machine.upcb = NULL; thread->machine.uNeon = NULL; thread->machine.contextData = NULL; + + zfree(user_ss_zone, thread_user_ss); } @@ -227,7 +245,7 @@ machine_thread_destroy( /* - * Routine: machine_thread_init + * Routine: machine_thread_init * */ void @@ -251,11 +269,12 @@ machine_thread_init(void) CONFIG_THREAD_MAX * (sizeof(arm_context_t)), USER_SS_ZONE_ALLOC_SIZE, "user save state"); + } /* - * Routine: get_useraddr + * Routine: get_useraddr * */ user_addr_t @@ -265,17 +284,16 @@ get_useraddr() } /* - * Routine: machine_stack_detach + * Routine: machine_stack_detach * */ vm_offset_t -machine_stack_detach( - thread_t thread) +machine_stack_detach(thread_t thread) { - vm_offset_t stack; + vm_offset_t stack; KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_STACK_DETACH), - (uintptr_t)thread_tid(thread), thread->priority, thread->sched_pri, 0, 0); + (uintptr_t)thread_tid(thread), thread->priority, thread->sched_pri, 0, 0); stack = thread->kernel_stack; thread->kernel_stack = 0; @@ -286,21 +304,22 @@ machine_stack_detach( /* - * Routine: machine_stack_attach + * Routine: machine_stack_attach * */ void -machine_stack_attach( - thread_t thread, - vm_offset_t stack) +machine_stack_attach(thread_t thread, + vm_offset_t stack) { struct arm_context *context; struct arm_saved_state64 *savestate; + uint32_t current_el; -#define machine_stack_attach_kprintf(x...) /* kprintf("machine_stack_attach: " x) */ +#define machine_stack_attach_kprintf(x...) \ + /* kprintf("machine_stack_attach: " x) */ KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_STACK_ATTACH), - (uintptr_t)thread_tid(thread), thread->priority, thread->sched_pri, 0, 0); + (uintptr_t)thread_tid(thread), thread->priority, thread->sched_pri, 0, 0); thread->kernel_stack = stack; thread->machine.kstackptr = stack + kernel_stack_size - sizeof(struct thread_kernel_state); @@ -308,28 +327,66 @@ machine_stack_attach( machine_stack_attach_kprintf("kstackptr: %lx\n", (vm_address_t)thread->machine.kstackptr); + current_el = (uint32_t) __builtin_arm_rsr64("CurrentEL"); context = &((thread_kernel_state_t) thread->machine.kstackptr)->machine; savestate = saved_state64(&context->ss); savestate->fp = 0; - savestate->lr = (uintptr_t)thread_continue; savestate->sp = thread->machine.kstackptr; - savestate->cpsr = PSR64_KERNEL_DEFAULT; +#if defined(HAS_APPLE_PAC) + /* Sign the initial kernel stack saved state */ + const uint32_t default_cpsr = PSR64_KERNEL_DEFAULT & ~PSR64_MODE_EL_MASK; + asm volatile ( + "mov x0, %[ss]" "\n" + + "mov x1, xzr" "\n" + "str x1, [x0, %[SS64_PC]]" "\n" + + "mov x2, %[default_cpsr_lo]" "\n" + "movk x2, %[default_cpsr_hi], lsl #16" "\n" + "mrs x3, CurrentEL" "\n" + "orr w2, w2, w3" "\n" + "str w2, [x0, %[SS64_CPSR]]" "\n" + + "adrp x3, _thread_continue@page" "\n" + "add x3, x3, _thread_continue@pageoff" "\n" + "str x3, [x0, %[SS64_LR]]" "\n" + + "mov x4, xzr" "\n" + "mov x5, xzr" "\n" + "stp x4, x5, [x0, %[SS64_X16]]" "\n" + + "mov x6, lr" "\n" + "bl _ml_sign_thread_state" "\n" + "mov lr, x6" "\n" + : + : [ss] "r"(&context->ss), + [default_cpsr_lo] "M"(default_cpsr & 0xFFFF), + [default_cpsr_hi] "M"(default_cpsr >> 16), + [SS64_X16] "i"(offsetof(struct arm_saved_state, ss_64.x[16])), + [SS64_PC] "i"(offsetof(struct arm_saved_state, ss_64.pc)), + [SS64_CPSR] "i"(offsetof(struct arm_saved_state, ss_64.cpsr)), + [SS64_LR] "i"(offsetof(struct arm_saved_state, ss_64.lr)) + : "x0", "x1", "x2", "x3", "x4", "x5", "x6" + ); +#else + savestate->lr = (uintptr_t)thread_continue; + savestate->cpsr = (PSR64_KERNEL_DEFAULT & ~PSR64_MODE_EL_MASK) | current_el; +#endif /* defined(HAS_APPLE_PAC) */ machine_stack_attach_kprintf("thread = %p pc = %llx, sp = %llx\n", thread, savestate->lr, savestate->sp); } /* - * Routine: machine_stack_handoff + * Routine: machine_stack_handoff * */ void -machine_stack_handoff( - thread_t old, - thread_t new) +machine_stack_handoff(thread_t old, + thread_t new) { - vm_offset_t stack; - pmap_t new_pmap; - cpu_data_t *cpu_data_ptr; + vm_offset_t stack; + pmap_t new_pmap; + cpu_data_t * cpu_data_ptr; kpc_off_cpu(old); @@ -344,10 +401,12 @@ machine_stack_handoff( } + new_pmap = new->map->pmap; if (old->map->pmap != new_pmap) pmap_switch(new_pmap); + new->machine.CpuDatap = cpu_data_ptr; /* TODO: Should this be ordered? */ @@ -362,17 +421,17 @@ machine_stack_handoff( /* - * Routine: call_continuation + * Routine: call_continuation * */ void -call_continuation( - thread_continue_t continuation, - void *parameter, - wait_result_t wresult, - boolean_t enable_interrupts) +call_continuation(thread_continue_t continuation, + void *parameter, + wait_result_t wresult, + boolean_t enable_interrupts) { -#define call_continuation_kprintf(x...) /* kprintf("call_continuation_kprintf:" x) */ +#define call_continuation_kprintf(x...) \ + /* kprintf("call_continuation_kprintf:" x) */ call_continuation_kprintf("thread = %p continuation = %p, stack = %p\n", current_thread(), continuation, current_thread()->machine.kstackptr); Call_continuation(continuation, parameter, wresult, enable_interrupts); @@ -398,12 +457,12 @@ call_continuation( void arm_debug_set32(arm_debug_state_t *debug_state) { - struct cpu_data *cpu_data_ptr; - arm_debug_info_t *debug_info = arm_debug_info(); - boolean_t intr, set_mde = 0; - arm_debug_state_t off_state; - uint32_t i; - uint64_t all_ctrls = 0; + struct cpu_data * cpu_data_ptr; + arm_debug_info_t * debug_info = arm_debug_info(); + boolean_t intr, set_mde = 0; + arm_debug_state_t off_state; + uint32_t i; + uint64_t all_ctrls = 0; intr = ml_set_interrupts_enabled(FALSE); cpu_data_ptr = getCpuDatap(); @@ -550,16 +609,14 @@ void arm_debug_set32(arm_debug_state_t *debug_state) } else { update_mdscr(0x8000, 0); } - + /* * Software debug single step enable */ if (debug_state->uds.ds32.mdscr_el1 & 0x1) { update_mdscr(0x8000, 1); // ~MDE | SS : no brk/watch while single stepping (which we've set) - set_saved_state_cpsr((current_thread()->machine.upcb), - get_saved_state_cpsr((current_thread()->machine.upcb)) | PSR64_SS); - + mask_saved_state_cpsr(current_thread()->machine.upcb, PSR64_SS, 0); } else { update_mdscr(0x1, 0); @@ -577,12 +634,12 @@ void arm_debug_set32(arm_debug_state_t *debug_state) void arm_debug_set64(arm_debug_state_t *debug_state) { - struct cpu_data *cpu_data_ptr; - arm_debug_info_t *debug_info = arm_debug_info(); - boolean_t intr, set_mde = 0; - arm_debug_state_t off_state; - uint32_t i; - uint64_t all_ctrls = 0; + struct cpu_data * cpu_data_ptr; + arm_debug_info_t * debug_info = arm_debug_info(); + boolean_t intr, set_mde = 0; + arm_debug_state_t off_state; + uint32_t i; + uint64_t all_ctrls = 0; intr = ml_set_interrupts_enabled(FALSE); cpu_data_ptr = getCpuDatap(); @@ -727,7 +784,7 @@ void arm_debug_set64(arm_debug_state_t *debug_state) if (set_mde) { update_mdscr(0, 0x8000); // MDSCR_EL1[MDE] } - + /* * Software debug single step enable */ @@ -735,9 +792,7 @@ void arm_debug_set64(arm_debug_state_t *debug_state) update_mdscr(0x8000, 1); // ~MDE | SS : no brk/watch while single stepping (which we've set) - set_saved_state_cpsr((current_thread()->machine.upcb), - get_saved_state_cpsr((current_thread()->machine.upcb)) | PSR64_SS); - + mask_saved_state_cpsr(current_thread()->machine.upcb, PSR64_SS, 0); } else { update_mdscr(0x1, 0); @@ -779,7 +834,7 @@ void arm_debug_set(arm_debug_state_t *debug_state) boolean_t debug_legacy_state_is_valid(arm_legacy_debug_state_t *debug_state) { - arm_debug_info_t *debug_info = arm_debug_info(); + arm_debug_info_t *debug_info = arm_debug_info(); uint32_t i; for (i = 0; i < debug_info->num_breakpoint_pairs; i++) { if (0 != debug_state->bcr[i] && VM_MAX_ADDRESS32 <= debug_state->bvr[i]) @@ -796,7 +851,7 @@ debug_legacy_state_is_valid(arm_legacy_debug_state_t *debug_state) boolean_t debug_state_is_valid32(arm_debug_state32_t *debug_state) { - arm_debug_info_t *debug_info = arm_debug_info(); + arm_debug_info_t *debug_info = arm_debug_info(); uint32_t i; for (i = 0; i < debug_info->num_breakpoint_pairs; i++) { if (0 != debug_state->bcr[i] && VM_MAX_ADDRESS32 <= debug_state->bvr[i]) @@ -813,7 +868,7 @@ debug_state_is_valid32(arm_debug_state32_t *debug_state) boolean_t debug_state_is_valid64(arm_debug_state64_t *debug_state) { - arm_debug_info_t *debug_info = arm_debug_info(); + arm_debug_info_t *debug_info = arm_debug_info(); uint32_t i; for (i = 0; i < debug_info->num_breakpoint_pairs; i++) { if (0 != debug_state->bcr[i] && MACH_VM_MAX_ADDRESS <= debug_state->bvr[i]) @@ -832,38 +887,33 @@ debug_state_is_valid64(arm_debug_state64_t *debug_state) * is ignored in the case of ARM -- Is this the right assumption? */ void -copy_legacy_debug_state( - arm_legacy_debug_state_t *src, - arm_legacy_debug_state_t *target, - __unused boolean_t all) +copy_legacy_debug_state(arm_legacy_debug_state_t * src, + arm_legacy_debug_state_t * target, + __unused boolean_t all) { bcopy(src, target, sizeof(arm_legacy_debug_state_t)); } void -copy_debug_state32( - arm_debug_state32_t *src, - arm_debug_state32_t *target, - __unused boolean_t all) +copy_debug_state32(arm_debug_state32_t * src, + arm_debug_state32_t * target, + __unused boolean_t all) { bcopy(src, target, sizeof(arm_debug_state32_t)); } void -copy_debug_state64( - arm_debug_state64_t *src, - arm_debug_state64_t *target, - __unused boolean_t all) +copy_debug_state64(arm_debug_state64_t * src, + arm_debug_state64_t * target, + __unused boolean_t all) { bcopy(src, target, sizeof(arm_debug_state64_t)); } kern_return_t -machine_thread_set_tsd_base( - thread_t thread, - mach_vm_offset_t tsd_base) +machine_thread_set_tsd_base(thread_t thread, + mach_vm_offset_t tsd_base) { - if (thread->task == kernel_task) { return KERN_INVALID_ARGUMENT; } diff --git a/osfmk/arm64/platform_tests.c b/osfmk/arm64/platform_tests.c index 5f6e474fb..9026e45f1 100644 --- a/osfmk/arm64/platform_tests.c +++ b/osfmk/arm64/platform_tests.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2011 Apple Inc. All rights reserved. + * Copyright (c) 2011-2018 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -63,7 +63,6 @@ #include #include #include -#include #include #include #include @@ -85,6 +84,15 @@ kern_return_t arm64_lock_test(void); kern_return_t arm64_munger_test(void); kern_return_t ex_cb_test(void); kern_return_t arm64_pan_test(void); +kern_return_t arm64_late_pan_test(void); +#if defined(HAS_APPLE_PAC) +#include +kern_return_t arm64_ropjop_test(void); +#endif +#if HAS_TWO_STAGE_SPR_LOCK +kern_return_t arm64_spr_lock_test(void); +extern void arm64_msr_lock_test(uint64_t); +#endif // exception handler ignores this fault address during PAN test #if __ARM_PAN_AVAILABLE__ @@ -1060,8 +1068,163 @@ ex_cb_test() return KERN_SUCCESS; } +#if defined(HAS_APPLE_PAC) + +/* + * + * arm64_ropjop_test - basic xnu ROP/JOP test plan + * + * - assert ROP/JOP configured and running status match + * - assert all AppleMode ROP/JOP features enabled + * - ensure ROP/JOP keys are set and diversified + * - sign a KVA (the address of this function),assert it was signed (changed) + * - authenticate the newly signed KVA + * - assert the authed KVA is the original KVA + * - corrupt a signed ptr, auth it, ensure auth failed + * - assert the failed authIB of corrupted pointer is tagged + * + */ + +kern_return_t +arm64_ropjop_test() +{ + T_LOG("Testing ROP/JOP"); + + /* how is ROP/JOP configured */ + boolean_t config_rop_enabled = TRUE; + boolean_t config_jop_enabled = !(BootArgs->bootFlags & kBootFlagsDisableJOP); + + + /* assert all AppleMode ROP/JOP features enabled */ + uint64_t apctl = __builtin_arm_rsr64(ARM64_REG_APCTL_EL1); +#if __APSTS_SUPPORTED__ + uint64_t apsts = __builtin_arm_rsr64(ARM64_REG_APSTS_EL1); + T_ASSERT(apsts & APSTS_EL1_MKEYVld, NULL); +#else + T_ASSERT(apctl & APCTL_EL1_MKEYVld, NULL); +#endif /* __APSTS_SUPPORTED__ */ + T_ASSERT(apctl & APCTL_EL1_AppleMode, NULL); + T_ASSERT(apctl & APCTL_EL1_KernKeyEn, NULL); + + /* ROP/JOP keys enabled current status */ + bool status_jop_enabled, status_rop_enabled; +#if __APSTS_SUPPORTED__ /* H13+ */ + // TODO: update unit test to understand ROP/JOP enabled config for H13+ + status_jop_enabled = status_rop_enabled = apctl & APCTL_EL1_EnAPKey1; +#elif __APCFG_SUPPORTED__ /* H12 */ + uint64_t apcfg_el1 = __builtin_arm_rsr64(APCFG_EL1); + status_jop_enabled = status_rop_enabled = apcfg_el1 & APCFG_EL1_ELXENKEY; +#else /* !__APCFG_SUPPORTED__ H11 */ + uint64_t sctlr_el1 = __builtin_arm_rsr64("SCTLR_EL1"); + status_jop_enabled = sctlr_el1 & SCTLR_PACIA_ENABLED; + status_rop_enabled = sctlr_el1 & SCTLR_PACIB_ENABLED; +#endif /* __APSTS_SUPPORTED__ */ + + /* assert configured and running status match */ + T_ASSERT(config_rop_enabled == status_rop_enabled, NULL); + T_ASSERT(config_jop_enabled == status_jop_enabled, NULL); + + + if (config_jop_enabled) { + /* jop key */ + uint64_t apiakey_hi = __builtin_arm_rsr64(ARM64_REG_APIAKEYHI_EL1); + uint64_t apiakey_lo = __builtin_arm_rsr64(ARM64_REG_APIAKEYLO_EL1); + + /* ensure JOP key is set and diversified */ + T_EXPECT(apiakey_hi != KERNEL_ROP_ID && apiakey_lo != KERNEL_ROP_ID, NULL); + T_EXPECT(apiakey_hi != 0 && apiakey_lo != 0, NULL); + } + + if (config_rop_enabled) { + /* rop key */ + uint64_t apibkey_hi = __builtin_arm_rsr64(ARM64_REG_APIBKEYHI_EL1); + uint64_t apibkey_lo = __builtin_arm_rsr64(ARM64_REG_APIBKEYLO_EL1); + + /* ensure ROP key is set and diversified */ + T_EXPECT(apibkey_hi != KERNEL_ROP_ID && apibkey_lo != KERNEL_ROP_ID, NULL); + T_EXPECT(apibkey_hi != 0 && apibkey_lo != 0, NULL); + + /* sign a KVA (the address of this function) */ + uint64_t kva_signed = (uint64_t) ptrauth_sign_unauthenticated((void *)&config_rop_enabled, ptrauth_key_asib, 0); + + /* assert it was signed (changed) */ + T_EXPECT(kva_signed != (uint64_t)&config_rop_enabled, NULL); + + /* authenticate the newly signed KVA */ + uint64_t kva_authed = (uint64_t) ml_auth_ptr_unchecked((void *)kva_signed, ptrauth_key_asib, 0); + + /* assert the authed KVA is the original KVA */ + T_EXPECT(kva_authed == (uint64_t)&config_rop_enabled, NULL); + + /* corrupt a signed ptr, auth it, ensure auth failed */ + uint64_t kva_corrupted = kva_signed ^ 1; + + /* authenticate the corrupted pointer */ + kva_authed = (uint64_t) ml_auth_ptr_unchecked((void *)kva_corrupted, ptrauth_key_asib, 0); + + /* when AuthIB fails, bits 63:62 will be set to 2'b10 */ + uint64_t auth_fail_mask = 3ULL << 61; + uint64_t authib_fail = 2ULL << 61; + + /* assert the failed authIB of corrupted pointer is tagged */ + T_EXPECT((kva_authed & auth_fail_mask) == authib_fail, NULL); + } + + return KERN_SUCCESS; +} +#endif /* defined(HAS_APPLE_PAC) */ #if __ARM_PAN_AVAILABLE__ + +struct pan_test_thread_args { + volatile bool join; +}; + +static void +arm64_pan_test_thread(void *arg, wait_result_t __unused wres) +{ + T_ASSERT(__builtin_arm_rsr("pan") != 0, NULL); + + struct pan_test_thread_args *args = arg; + + for (processor_t p = processor_list; p != NULL; p = p->processor_list) { + thread_bind(p); + thread_block(THREAD_CONTINUE_NULL); + kprintf("Running PAN test on cpu %d\n", p->cpu_id); + arm64_pan_test(); + } + + /* unbind thread from specific cpu */ + thread_bind(PROCESSOR_NULL); + thread_block(THREAD_CONTINUE_NULL); + + while (!args->join) { + ; + } + + thread_wakeup(args); +} + +kern_return_t +arm64_late_pan_test() +{ + thread_t thread; + kern_return_t kr; + + struct pan_test_thread_args args; + args.join = false; + + kr = kernel_thread_start(arm64_pan_test_thread, &args, &thread); + assert(kr == KERN_SUCCESS); + + thread_deallocate(thread); + + assert_wait(&args, THREAD_UNINT); + args.join = true; + thread_block(THREAD_CONTINUE_NULL); + return KERN_SUCCESS; +} + kern_return_t arm64_pan_test() { @@ -1069,6 +1232,9 @@ arm64_pan_test() T_LOG("Testing PAN."); + + T_ASSERT((__builtin_arm_rsr("SCTLR_EL1") & SCTLR_PAN_UNCHANGED) == 0, "SCTLR_EL1.SPAN must be cleared"); + T_ASSERT(__builtin_arm_rsr("pan") != 0, NULL); pan_exception_level = 0; @@ -1107,9 +1273,10 @@ arm64_pan_test() pan_ro_addr = 0; __builtin_arm_wsr("pan", 1); + return KERN_SUCCESS; } -#endif +#endif /* __ARM_PAN_AVAILABLE__ */ kern_return_t @@ -1125,3 +1292,44 @@ arm64_munger_test() return 0; } + +#if HAS_TWO_STAGE_SPR_LOCK + +#define STR1(x) #x +#define STR(x) STR1(x) + +volatile vm_offset_t spr_lock_test_addr; +volatile uint32_t spr_lock_exception_esr; + +kern_return_t +arm64_spr_lock_test() +{ + processor_t p; + + for (p = processor_list; p != NULL; p = p->processor_list) { + thread_bind(p); + thread_block(THREAD_CONTINUE_NULL); + T_LOG("Running SPR lock test on cpu %d\n", p->cpu_id); + + uint64_t orig_value = __builtin_arm_rsr64(STR(ARM64_REG_HID8)); + spr_lock_test_addr = (vm_offset_t)VM_KERNEL_STRIP_PTR(arm64_msr_lock_test); + spr_lock_exception_esr = 0; + arm64_msr_lock_test(~orig_value); + T_EXPECT(spr_lock_exception_esr != 0, "MSR write generated synchronous abort"); + + uint64_t new_value = __builtin_arm_rsr64(STR(ARM64_REG_HID8)); + T_EXPECT(orig_value == new_value, "MSR write did not succeed"); + + spr_lock_test_addr = 0; + } + + /* unbind thread from specific cpu */ + thread_bind(PROCESSOR_NULL); + thread_block(THREAD_CONTINUE_NULL); + + T_PASS("Done running SPR lock tests"); + + return KERN_SUCCESS; +} + +#endif /* HAS_TWO_STAGE_SPR_LOCK */ diff --git a/osfmk/arm64/platform_tests_asm.s b/osfmk/arm64/platform_tests_asm.s new file mode 100644 index 000000000..5ec159e48 --- /dev/null +++ b/osfmk/arm64/platform_tests_asm.s @@ -0,0 +1,39 @@ +/* + * Copyright (c) 2018 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#include +#include + +#if HAS_TWO_STAGE_SPR_LOCK + .text + .align 2 + .globl EXT(arm64_msr_lock_test) +LEXT(arm64_msr_lock_test) + msr ARM64_REG_HID8, x0 + ret +#endif diff --git a/osfmk/arm64/proc_reg.h b/osfmk/arm64/proc_reg.h index ee13e1844..10c7aa567 100644 --- a/osfmk/arm64/proc_reg.h +++ b/osfmk/arm64/proc_reg.h @@ -97,101 +97,107 @@ * +-+-+-+-+-+---+---+--+--+----------+-+-+-+-+-+-----+ * * where: - * NZCV Comparison flags - * PAN Privileged Access Never - * SS Single step - * IL Illegal state - * DAIF Interrupt masks - * M Mode field + * NZCV: Comparison flags + * PAN: Privileged Access Never + * SS: Single step + * IL: Illegal state + * DAIF: Interrupt masks + * M: Mode field */ -#define PSR64_NZCV_SHIFT 28 -#define PSR64_NZCV_MASK (1 << PSR64_NZCV_SHIFT) +#define PSR64_NZCV_SHIFT 28 +#define PSR64_NZCV_MASK (1 << PSR64_NZCV_SHIFT) -#define PSR64_N_SHIFT 31 -#define PSR64_N (1 << PSR64_N_SHIFT) +#define PSR64_N_SHIFT 31 +#define PSR64_N (1 << PSR64_N_SHIFT) -#define PSR64_Z_SHIFT 30 -#define PSR64_Z (1 << PSR64_Z_SHIFT) +#define PSR64_Z_SHIFT 30 +#define PSR64_Z (1 << PSR64_Z_SHIFT) -#define PSR64_C_SHIFT 29 -#define PSR64_C (1 << PSR64_C_SHIFT) +#define PSR64_C_SHIFT 29 +#define PSR64_C (1 << PSR64_C_SHIFT) -#define PSR64_V_SHIFT 28 -#define PSR64_V (1 << PSR64_V_SHIFT) +#define PSR64_V_SHIFT 28 +#define PSR64_V (1 << PSR64_V_SHIFT) -#define PSR64_PAN_SHIFT 22 -#define PSR64_PAN (1 << PSR64_PAN_SHIFT) +#define PSR64_PAN_SHIFT 22 +#define PSR64_PAN (1 << PSR64_PAN_SHIFT) -#define PSR64_SS_SHIFT 21 -#define PSR64_SS (1 << PSR64_SS_SHIFT) +#define PSR64_SS_SHIFT 21 +#define PSR64_SS (1 << PSR64_SS_SHIFT) -#define PSR64_IL_SHIFT 20 -#define PSR64_IL (1 << PSR64_IL_SHIFT) +#define PSR64_IL_SHIFT 20 +#define PSR64_IL (1 << PSR64_IL_SHIFT) /* * msr DAIF, Xn and mrs Xn, DAIF transfer into * and out of bits 9:6 */ -#define DAIF_DEBUG_SHIFT 9 -#define DAIF_DEBUGF (1 << DAIF_DEBUG_SHIFT) +#define DAIF_DEBUG_SHIFT 9 +#define DAIF_DEBUGF (1 << DAIF_DEBUG_SHIFT) -#define DAIF_ASYNC_SHIFT 8 -#define DAIF_ASYNCF (1 << DAIF_ASYNC_SHIFT) +#define DAIF_ASYNC_SHIFT 8 +#define DAIF_ASYNCF (1 << DAIF_ASYNC_SHIFT) -#define DAIF_IRQF_SHIFT 7 -#define DAIF_IRQF (1 << DAIF_IRQF_SHIFT) +#define DAIF_IRQF_SHIFT 7 +#define DAIF_IRQF (1 << DAIF_IRQF_SHIFT) -#define DAIF_FIQF_SHIFT 6 -#define DAIF_FIQF (1 << DAIF_FIQF_SHIFT) +#define DAIF_FIQF_SHIFT 6 +#define DAIF_FIQF (1 << DAIF_FIQF_SHIFT) -#define DAIF_ALL (DAIF_DEBUGF | DAIF_ASYNCF | DAIF_IRQF | DAIF_FIQF) -#define DAIF_STANDARD_DISABLE (DAIF_ASYNCF | DAIF_IRQF | DAIF_FIQF) +#define DAIF_ALL (DAIF_DEBUGF | DAIF_ASYNCF | DAIF_IRQF | DAIF_FIQF) +#define DAIF_STANDARD_DISABLE (DAIF_ASYNCF | DAIF_IRQF | DAIF_FIQF) -#define SPSR_INTERRUPTS_ENABLED(x) (!(x & DAIF_FIQF)) +#define SPSR_INTERRUPTS_ENABLED(x) (!(x & DAIF_FIQF)) /* * msr DAIFSet, Xn, and msr DAIFClr, Xn transfer * from bits 3:0. */ -#define DAIFSC_DEBUGF (1 << 3) -#define DAIFSC_ASYNCF (1 << 2) -#define DAIFSC_IRQF (1 << 1) -#define DAIFSC_FIQF (1 << 0) -#define DAIFSC_ALL (DAIFSC_DEBUGF | DAIFSC_ASYNCF | DAIFSC_IRQF | DAIFSC_FIQF) +#define DAIFSC_DEBUGF (1 << 3) +#define DAIFSC_ASYNCF (1 << 2) +#define DAIFSC_IRQF (1 << 1) +#define DAIFSC_FIQF (1 << 0) +#define DAIFSC_ALL (DAIFSC_DEBUGF | DAIFSC_ASYNCF | DAIFSC_IRQF | DAIFSC_FIQF) #define DAIFSC_STANDARD_DISABLE (DAIFSC_ASYNCF | DAIFSC_IRQF | DAIFSC_FIQF) /* * ARM64_TODO: unify with ARM? */ -#define PSR64_CF 0x20000000 /* Carry/Borrow/Extend */ +#define PSR64_CF 0x20000000 /* Carry/Borrow/Extend */ -#define PSR64_MODE_MASK 0x1F +#define PSR64_MODE_MASK 0x1F -#define PSR64_MODE_USER32_THUMB 0x20 +#define PSR64_MODE_USER32_THUMB 0x20 -#define PSR64_MODE_RW_SHIFT 4 -#define PSR64_MODE_RW_64 0 -#define PSR64_MODE_RW_32 (0x1 << PSR64_MODE_RW_SHIFT) +#define PSR64_MODE_RW_SHIFT 4 +#define PSR64_MODE_RW_64 0 +#define PSR64_MODE_RW_32 (0x1 << PSR64_MODE_RW_SHIFT) -#define PSR64_MODE_EL_SHIFT 2 -#define PSR64_MODE_EL_MASK (0x3 << PSR64_MODE_EL_SHIFT) -#define PSR64_MODE_EL3 (0x3 << PSR64_MODE_EL_SHIFT) -#define PSR64_MODE_EL1 (0x1 << PSR64_MODE_EL_SHIFT) -#define PSR64_MODE_EL0 0 +#define PSR64_MODE_EL_SHIFT 2 +#define PSR64_MODE_EL_MASK (0x3 << PSR64_MODE_EL_SHIFT) +#define PSR64_MODE_EL3 (0x3 << PSR64_MODE_EL_SHIFT) +#define PSR64_MODE_EL2 (0x2 << PSR64_MODE_EL_SHIFT) +#define PSR64_MODE_EL1 (0x1 << PSR64_MODE_EL_SHIFT) +#define PSR64_MODE_EL0 0 -#define PSR64_MODE_SPX 0x1 -#define PSR64_MODE_SP0 0 +#define PSR64_MODE_SPX 0x1 +#define PSR64_MODE_SP0 0 -#define PSR64_USER32_DEFAULT (PSR64_MODE_RW_32 | PSR64_MODE_EL0 | PSR64_MODE_SP0) -#define PSR64_USER64_DEFAULT (PSR64_MODE_RW_64 | PSR64_MODE_EL0 | PSR64_MODE_SP0) -#define PSR64_KERNEL_DEFAULT (DAIF_STANDARD_DISABLE | PSR64_MODE_RW_64 | PSR64_MODE_EL1 | PSR64_MODE_SP0) +#define PSR64_USER32_DEFAULT (PSR64_MODE_RW_32 | PSR64_MODE_EL0 | PSR64_MODE_SP0) +#define PSR64_USER64_DEFAULT (PSR64_MODE_RW_64 | PSR64_MODE_EL0 | PSR64_MODE_SP0) +#define PSR64_KERNEL_STANDARD (DAIF_STANDARD_DISABLE | PSR64_MODE_RW_64 | PSR64_MODE_EL1 | PSR64_MODE_SP0) +#if __ARM_PAN_AVAILABLE__ +#define PSR64_KERNEL_DEFAULT (PSR64_KERNEL_STANDARD | PSR64_PAN) +#else +#define PSR64_KERNEL_DEFAULT PSR64_KERNEL_STANDARD +#endif -#define PSR64_IS_KERNEL(x) ((x & PSR64_MODE_EL_MASK) == PSR64_MODE_EL1) -#define PSR64_IS_USER(x) ((x & PSR64_MODE_EL_MASK) == PSR64_MODE_EL0) +#define PSR64_IS_KERNEL(x) ((x & PSR64_MODE_EL_MASK) > PSR64_MODE_EL0) +#define PSR64_IS_USER(x) ((x & PSR64_MODE_EL_MASK) == PSR64_MODE_EL0) -#define PSR64_IS_USER32(x) (PSR64_IS_USER(x) && (x & PSR64_MODE_RW_32)) -#define PSR64_IS_USER64(x) (PSR64_IS_USER(x) && !(x & PSR64_MODE_RW_32)) +#define PSR64_IS_USER32(x) (PSR64_IS_USER(x) && (x & PSR64_MODE_RW_32)) +#define PSR64_IS_USER64(x) (PSR64_IS_USER(x) && !(x & PSR64_MODE_RW_32)) @@ -199,82 +205,95 @@ * System Control Register (SCTLR) */ -#define SCTLR_RESERVED ((3 << 28) | (1 << 22) | (1 << 20) | (1 << 11)) +#define SCTLR_RESERVED ((3ULL << 28) | (1ULL << 22) | (1ULL << 20) | (1ULL << 11)) +#if defined(HAS_APPLE_PAC) + +// 31 PACIA_ENABLED AddPACIA and AuthIA functions enabled +#define SCTLR_PACIA_ENABLED_SHIFT 31 +#define SCTLR_PACIA_ENABLED (1ULL << SCTLR_PACIA_ENABLED_SHIFT) +// 30 PACIB_ENABLED AddPACIB and AuthIB functions enabled +#define SCTLR_PACIB_ENABLED (1ULL << 30) +// 29:28 RES1 11 +// 27 PACDA_ENABLED AddPACDA and AuthDA functions enabled +#define SCTLR_PACDA_ENABLED (1ULL << 27) +// 13 PACDB_ENABLED AddPACDB and AuthDB functions enabled +#define SCTLR_PACDB_ENABLED (1ULL << 13) -// 26 UCI User Cache Instructions -#define SCTLR_UCI_ENABLED (1 << 26) +#define SCTLR_JOP_KEYS_ENABLED (SCTLR_PACIA_ENABLED | SCTLR_PACDA_ENABLED | SCTLR_PACDB_ENABLED) +#endif /* defined(HAS_APPLE_PAC) */ -// 25 EE Exception Endianness -#define SCTLR_EE_BIG_ENDIAN (1 << 25) +// 26 UCI User Cache Instructions +#define SCTLR_UCI_ENABLED (1ULL << 26) -// 24 E0E EL0 Endianness -#define SCTLR_E0E_BIG_ENDIAN (1 << 24) +// 25 EE Exception Endianness +#define SCTLR_EE_BIG_ENDIAN (1ULL << 25) -// 23 SPAN Set PAN -#define SCTLR_PAN_UNCHANGED (1 << 23) +// 24 E0E EL0 Endianness +#define SCTLR_E0E_BIG_ENDIAN (1ULL << 24) -// 22 RES1 1 -// 21 RES0 0 -// 20 RES1 1 +// 23 SPAN Set PAN +#define SCTLR_PAN_UNCHANGED (1ULL << 23) -// 19 WXN Writeable implies eXecute Never -#define SCTLR_WXN_ENABLED (1 << 19) +// 22 RES1 1 +// 21 RES0 0 +// 20 RES1 1 -// 18 nTWE Not trap WFE from EL0 -#define SCTLR_nTWE_WFE_ENABLED (1 << 18) +// 19 WXN Writeable implies eXecute Never +#define SCTLR_WXN_ENABLED (1ULL << 19) -// 17 RES0 0 +// 18 nTWE Not trap WFE from EL0 +#define SCTLR_nTWE_WFE_ENABLED (1ULL << 18) -// 16 nTWI Not trap WFI from EL0 -#define SCTRL_nTWI_WFI_ENABLED (1 << 16) +// 17 RES0 0 -// 15 UCT User Cache Type register (CTR_EL0) -#define SCTLR_UCT_ENABLED (1 << 15) +// 16 nTWI Not trap WFI from EL0 +#define SCTRL_nTWI_WFI_ENABLED (1ULL << 16) -// 14 DZE User Data Cache Zero (DC ZVA) -#define SCTLR_DZE_ENABLED (1 << 14) +// 15 UCT User Cache Type register (CTR_EL0) +#define SCTLR_UCT_ENABLED (1ULL << 15) -// 13 PACDB_ENABLED AddPACDB and AuthDB functions enabled -#define SCTLR_PACDB_ENABLED (1 << 13) +// 14 DZE User Data Cache Zero (DC ZVA) +#define SCTLR_DZE_ENABLED (1ULL << 14) -// 12 I Instruction cache enable -#define SCTLR_I_ENABLED (1 << 12) +// 12 I Instruction cache enable +#define SCTLR_I_ENABLED (1ULL << 12) -// 11 RES1 1 -// 10 RES0 0 +// 11 RES1 1 +// 10 RES0 0 -// 9 UMA User Mask Access -#define SCTLR_UMA_ENABLED (1 << 9) +// 9 UMA User Mask Access +#define SCTLR_UMA_ENABLED (1ULL << 9) -// 8 SED SETEND Disable -#define SCTLR_SED_DISABLED (1 << 8) +// 8 SED SETEND Disable +#define SCTLR_SED_DISABLED (1ULL << 8) -// 7 ITD IT Disable -#define SCTLR_ITD_DISABLED (1 << 7) +// 7 ITD IT Disable +#define SCTLR_ITD_DISABLED (1ULL << 7) -// 6 RES0 0 +// 6 RES0 0 -// 5 CP15BEN CP15 Barrier ENable -#define SCTLR_CP15BEN_ENABLED (1 << 5) +// 5 CP15BEN CP15 Barrier ENable +#define SCTLR_CP15BEN_ENABLED (1ULL << 5) -// 4 SA0 Stack Alignment check for EL0 -#define SCTLR_SA0_ENABLED (1 << 4) +// 4 SA0 Stack Alignment check for EL0 +#define SCTLR_SA0_ENABLED (1ULL << 4) -// 3 SA Stack Alignment check -#define SCTLR_SA_ENABLED (1 << 3) +// 3 SA Stack Alignment check +#define SCTLR_SA_ENABLED (1ULL << 3) -// 2 C Cache enable -#define SCTLR_C_ENABLED (1 << 2) +// 2 C Cache enable +#define SCTLR_C_ENABLED (1ULL << 2) -// 1 A Alignment check -#define SCTLR_A_ENABLED (1 << 1) +// 1 A Alignment check +#define SCTLR_A_ENABLED (1ULL << 1) -// 0 M MMU enable -#define SCTLR_M_ENABLED (1 << 0) +// 0 M MMU enable +#define SCTLR_M_ENABLED (1ULL << 0) -#define SCTLR_EL1_DEFAULT (SCTLR_RESERVED | SCTLR_UCI_ENABLED | SCTLR_nTWE_WFE_ENABLED | SCTLR_DZE_ENABLED | \ - SCTLR_I_ENABLED | SCTLR_SED_DISABLED | SCTLR_CP15BEN_ENABLED | \ - SCTLR_SA0_ENABLED | SCTLR_SA_ENABLED | SCTLR_C_ENABLED | SCTLR_M_ENABLED) +#define SCTLR_EL1_DEFAULT \ + (SCTLR_RESERVED | SCTLR_UCI_ENABLED | SCTLR_nTWE_WFE_ENABLED | SCTLR_DZE_ENABLED | \ + SCTLR_I_ENABLED | SCTLR_SED_DISABLED | SCTLR_CP15BEN_ENABLED | \ + SCTLR_SA0_ENABLED | SCTLR_SA_ENABLED | SCTLR_C_ENABLED | SCTLR_M_ENABLED) /* * Coprocessor Access Control Register (CPACR) @@ -285,15 +304,15 @@ * +---+---+------+------+--------------------+ * * where: - * TTA Trace trap - * FPEN Floating point enable + * TTA: Trace trap + * FPEN: Floating point enable */ -#define CPACR_TTA_SHIFT 28 -#define CPACR_TTA (1 << CPACR_TTA_SHIFT) +#define CPACR_TTA_SHIFT 28 +#define CPACR_TTA (1 << CPACR_TTA_SHIFT) -#define CPACR_FPEN_SHIFT 20 -#define CPACR_FPEN_EL0_TRAP (0x1 << CPACR_FPEN_SHIFT) -#define CPACR_FPEN_ENABLE (0x3 << CPACR_FPEN_SHIFT) +#define CPACR_FPEN_SHIFT 20 +#define CPACR_FPEN_EL0_TRAP (0x1 << CPACR_FPEN_SHIFT) +#define CPACR_FPEN_ENABLE (0x3 << CPACR_FPEN_SHIFT) /* * FPSR: Floating Point Status Register @@ -304,37 +323,37 @@ * +--+--+--+--+--+-------------------+---+--+---+---+---+---+---+ */ -#define FPSR_N_SHIFT 31 -#define FPSR_Z_SHIFT 30 -#define FPSR_C_SHIFT 29 -#define FPSR_V_SHIFT 28 -#define FPSR_QC_SHIFT 27 -#define FPSR_IDC_SHIFT 7 -#define FPSR_IXC_SHIFT 4 -#define FPSR_UFC_SHIFT 3 -#define FPSR_OFC_SHIFT 2 -#define FPSR_DZC_SHIFT 1 -#define FPSR_IOC_SHIFT 0 -#define FPSR_N (1 << FPSR_N_SHIFT) -#define FPSR_Z (1 << FPSR_Z_SHIFT) -#define FPSR_C (1 << FPSR_C_SHIFT) -#define FPSR_V (1 << FPSR_V_SHIFT) -#define FPSR_QC (1 << FPSR_QC_SHIFT) -#define FPSR_IDC (1 << FPSR_IDC_SHIFT) -#define FPSR_IXC (1 << FPSR_IXC_SHIFT) -#define FPSR_UFC (1 << FPSR_UFC_SHIFT) -#define FPSR_OFC (1 << FPSR_OFC_SHIFT) -#define FPSR_DZC (1 << FPSR_DZC_SHIFT) -#define FPSR_IOC (1 << FPSR_IOC_SHIFT) +#define FPSR_N_SHIFT 31 +#define FPSR_Z_SHIFT 30 +#define FPSR_C_SHIFT 29 +#define FPSR_V_SHIFT 28 +#define FPSR_QC_SHIFT 27 +#define FPSR_IDC_SHIFT 7 +#define FPSR_IXC_SHIFT 4 +#define FPSR_UFC_SHIFT 3 +#define FPSR_OFC_SHIFT 2 +#define FPSR_DZC_SHIFT 1 +#define FPSR_IOC_SHIFT 0 +#define FPSR_N (1 << FPSR_N_SHIFT) +#define FPSR_Z (1 << FPSR_Z_SHIFT) +#define FPSR_C (1 << FPSR_C_SHIFT) +#define FPSR_V (1 << FPSR_V_SHIFT) +#define FPSR_QC (1 << FPSR_QC_SHIFT) +#define FPSR_IDC (1 << FPSR_IDC_SHIFT) +#define FPSR_IXC (1 << FPSR_IXC_SHIFT) +#define FPSR_UFC (1 << FPSR_UFC_SHIFT) +#define FPSR_OFC (1 << FPSR_OFC_SHIFT) +#define FPSR_DZC (1 << FPSR_DZC_SHIFT) +#define FPSR_IOC (1 << FPSR_IOC_SHIFT) /* * A mask for all for all of the bits that are not RAZ for FPSR; this * is primarily for converting between a 32-bit view of NEON state * (FPSCR) and a 64-bit view of NEON state (FPSR, FPCR). */ -#define FPSR_MASK (FPSR_N | FPSR_Z | FPSR_C | FPSR_V | FPSR_QC | \ - FPSR_IDC | FPSR_IXC | FPSR_UFC | FPSR_OFC | \ - FPSR_DZC | FPSR_IOC) +#define FPSR_MASK \ + (FPSR_N | FPSR_Z | FPSR_C | FPSR_V | FPSR_QC | FPSR_IDC | FPSR_IXC | \ + FPSR_UFC | FPSR_OFC | FPSR_DZC | FPSR_IOC) /* * FPCR: Floating Point Control Register @@ -345,41 +364,41 @@ * +-----+---+--+--+-----+------+--+---+---+--+---+---+---+---+---+--------+ */ -#define FPCR_AHP_SHIFT 26 -#define FPCR_DN_SHIFT 25 -#define FPCR_FZ_SHIFT 24 -#define FPCR_RMODE_SHIFT 22 -#define FPCR_STRIDE_SHIFT 20 -#define FPCR_LEN_SHIFT 16 -#define FPCR_IDE_SHIFT 15 -#define FPCR_IXE_SHIFT 12 -#define FPCR_UFE_SHIFT 11 -#define FPCR_OFE_SHIFT 10 -#define FPCR_DZE_SHIFT 9 -#define FPCR_IOE_SHIFT 8 -#define FPCR_AHP (1 << FPCR_AHP_SHIFT) -#define FPCR_DN (1 << FPCR_DN_SHIFT) -#define FPCR_FZ (1 << FPCR_FZ_SHIFT) -#define FPCR_RMODE (0x3 << FPCR_RMODE_SHIFT) -#define FPCR_STRIDE (0x3 << FPCR_STRIDE_SHIFT) -#define FPCR_LEN (0x7 << FPCR_LEN_SHIFT) -#define FPCR_IDE (1 << FPCR_IDE_SHIFT) -#define FPCR_IXE (1 << FPCR_IXE_SHIFT) -#define FPCR_UFE (1 << FPCR_UFE_SHIFT) -#define FPCR_OFE (1 << FPCR_OFE_SHIFT) -#define FPCR_DZE (1 << FPCR_DZE_SHIFT) -#define FPCR_IOE (1 << FPCR_IOE_SHIFT) -#define FPCR_DEFAULT (FPCR_DN) -#define FPCR_DEFAULT_32 (FPCR_DN|FPCR_FZ) +#define FPCR_AHP_SHIFT 26 +#define FPCR_DN_SHIFT 25 +#define FPCR_FZ_SHIFT 24 +#define FPCR_RMODE_SHIFT 22 +#define FPCR_STRIDE_SHIFT 20 +#define FPCR_LEN_SHIFT 16 +#define FPCR_IDE_SHIFT 15 +#define FPCR_IXE_SHIFT 12 +#define FPCR_UFE_SHIFT 11 +#define FPCR_OFE_SHIFT 10 +#define FPCR_DZE_SHIFT 9 +#define FPCR_IOE_SHIFT 8 +#define FPCR_AHP (1 << FPCR_AHP_SHIFT) +#define FPCR_DN (1 << FPCR_DN_SHIFT) +#define FPCR_FZ (1 << FPCR_FZ_SHIFT) +#define FPCR_RMODE (0x3 << FPCR_RMODE_SHIFT) +#define FPCR_STRIDE (0x3 << FPCR_STRIDE_SHIFT) +#define FPCR_LEN (0x7 << FPCR_LEN_SHIFT) +#define FPCR_IDE (1 << FPCR_IDE_SHIFT) +#define FPCR_IXE (1 << FPCR_IXE_SHIFT) +#define FPCR_UFE (1 << FPCR_UFE_SHIFT) +#define FPCR_OFE (1 << FPCR_OFE_SHIFT) +#define FPCR_DZE (1 << FPCR_DZE_SHIFT) +#define FPCR_IOE (1 << FPCR_IOE_SHIFT) +#define FPCR_DEFAULT (FPCR_DN) +#define FPCR_DEFAULT_32 (FPCR_DN|FPCR_FZ) /* * A mask for all for all of the bits that are not RAZ for FPCR; this * is primarily for converting between a 32-bit view of NEON state * (FPSCR) and a 64-bit view of NEON state (FPSR, FPCR). */ -#define FPCR_MASK (FPCR_AHP | FPCR_DN | FPCR_FZ | FPCR_RMODE | \ - FPCR_STRIDE | FPCR_LEN | FPCR_IDE | FPCR_IXE | \ - FPCR_UFE | FPCR_OFE | FPCR_DZE | FPCR_IOE) +#define FPCR_MASK \ + (FPCR_AHP | FPCR_DN | FPCR_FZ | FPCR_RMODE | FPCR_STRIDE | FPCR_LEN | \ + FPCR_IDE | FPCR_IXE | FPCR_UFE | FPCR_OFE | FPCR_DZE | FPCR_IOE) /* * Translation Control Register (TCR) @@ -398,102 +417,110 @@ * | zero |TBI1|TBI0|AS|z| IPS | TG1 | SH1 |ORGN1|IRGN1|EPD1|A1| T1SZ | TG0 | SH0 |ORGN0|IRGN0|EPD0|z|T0SZ| * +------+----+----+--+-+-----+-----+-----+-----+-----+----+--+------+-----+-----+-----+-----+----+-+----+ * - * TBI1 Top Byte Ignored for TTBR1 region - * TBI0 Top Byte Ignored for TTBR0 region - * AS ASID Size - * IPS Physical Address Size limit - * TG1 Granule Size for TTBR1 region - * SH1 Shareability for TTBR1 region - * ORGN1 Outer Cacheability for TTBR1 region - * IRGN1 Inner Cacheability for TTBR1 region - * EPD1 Translation table walk disable for TTBR1 - * A1 ASID selection from TTBR1 enable - * T1SZ Virtual address size for TTBR1 - * TG0 Granule Size for TTBR0 region - * SH0 Shareability for TTBR0 region - * ORGN0 Outer Cacheability for TTBR0 region - * IRGN0 Inner Cacheability for TTBR0 region - * T0SZ Virtual address size for TTBR0 + * TBI1: Top Byte Ignored for TTBR1 region + * TBI0: Top Byte Ignored for TTBR0 region + * AS: ASID Size + * IPS: Physical Address Size limit + * TG1: Granule Size for TTBR1 region + * SH1: Shareability for TTBR1 region + * ORGN1: Outer Cacheability for TTBR1 region + * IRGN1: Inner Cacheability for TTBR1 region + * EPD1: Translation table walk disable for TTBR1 + * A1: ASID selection from TTBR1 enable + * T1SZ: Virtual address size for TTBR1 + * TG0: Granule Size for TTBR0 region + * SH0: Shareability for TTBR0 region + * ORGN0: Outer Cacheability for TTBR0 region + * IRGN0: Inner Cacheability for TTBR0 region + * T0SZ: Virtual address size for TTBR0 */ -#define TCR_T0SZ_SHIFT 0ULL -#define TCR_TSZ_BITS 6ULL -#define TCR_TSZ_MASK ((1ULL << TCR_TSZ_BITS) - 1ULL) +#define TCR_T0SZ_SHIFT 0ULL +#define TCR_TSZ_BITS 6ULL +#define TCR_TSZ_MASK ((1ULL << TCR_TSZ_BITS) - 1ULL) -#define TCR_IRGN0_SHIFT 8ULL -#define TCR_IRGN0_DISABLED (0ULL << TCR_IRGN0_SHIFT) -#define TCR_IRGN0_WRITEBACK (1ULL << TCR_IRGN0_SHIFT) -#define TCR_IRGN0_WRITETHRU (2ULL << TCR_IRGN0_SHIFT) -#define TCR_IRGN0_WRITEBACKNO (3ULL << TCR_IRGN0_SHIFT) +#define TCR_IRGN0_SHIFT 8ULL +#define TCR_IRGN0_DISABLED (0ULL << TCR_IRGN0_SHIFT) +#define TCR_IRGN0_WRITEBACK (1ULL << TCR_IRGN0_SHIFT) +#define TCR_IRGN0_WRITETHRU (2ULL << TCR_IRGN0_SHIFT) +#define TCR_IRGN0_WRITEBACKNO (3ULL << TCR_IRGN0_SHIFT) -#define TCR_ORGN0_SHIFT 10ULL -#define TCR_ORGN0_DISABLED (0ULL << TCR_ORGN0_SHIFT) -#define TCR_ORGN0_WRITEBACK (1ULL << TCR_ORGN0_SHIFT) -#define TCR_ORGN0_WRITETHRU (2ULL << TCR_ORGN0_SHIFT) -#define TCR_ORGN0_WRITEBACKNO (3ULL << TCR_ORGN0_SHIFT) +#define TCR_ORGN0_SHIFT 10ULL +#define TCR_ORGN0_DISABLED (0ULL << TCR_ORGN0_SHIFT) +#define TCR_ORGN0_WRITEBACK (1ULL << TCR_ORGN0_SHIFT) +#define TCR_ORGN0_WRITETHRU (2ULL << TCR_ORGN0_SHIFT) +#define TCR_ORGN0_WRITEBACKNO (3ULL << TCR_ORGN0_SHIFT) -#define TCR_SH0_SHIFT 12ULL -#define TCR_SH0_NONE (0ULL << TCR_SH0_SHIFT) -#define TCR_SH0_OUTER (2ULL << TCR_SH0_SHIFT) -#define TCR_SH0_INNER (3ULL << TCR_SH0_SHIFT) +#define TCR_SH0_SHIFT 12ULL +#define TCR_SH0_NONE (0ULL << TCR_SH0_SHIFT) +#define TCR_SH0_OUTER (2ULL << TCR_SH0_SHIFT) +#define TCR_SH0_INNER (3ULL << TCR_SH0_SHIFT) -#define TCR_TG0_GRANULE_SHIFT (14ULL) +#define TCR_TG0_GRANULE_SHIFT (14ULL) -#define TCR_TG0_GRANULE_4KB (0ULL << TCR_TG0_GRANULE_SHIFT) -#define TCR_TG0_GRANULE_64KB (1ULL << TCR_TG0_GRANULE_SHIFT) -#define TCR_TG0_GRANULE_16KB (2ULL << TCR_TG0_GRANULE_SHIFT) +#define TCR_TG0_GRANULE_4KB (0ULL << TCR_TG0_GRANULE_SHIFT) +#define TCR_TG0_GRANULE_64KB (1ULL << TCR_TG0_GRANULE_SHIFT) +#define TCR_TG0_GRANULE_16KB (2ULL << TCR_TG0_GRANULE_SHIFT) #if __ARM_16K_PG__ -#define TCR_TG0_GRANULE_SIZE (TCR_TG0_GRANULE_16KB) +#define TCR_TG0_GRANULE_SIZE (TCR_TG0_GRANULE_16KB) #else -#define TCR_TG0_GRANULE_SIZE (TCR_TG0_GRANULE_4KB) +#define TCR_TG0_GRANULE_SIZE (TCR_TG0_GRANULE_4KB) #endif -#define TCR_T1SZ_SHIFT 16ULL +#define TCR_T1SZ_SHIFT 16ULL -#define TCR_A1_ASID1 (1ULL << 22ULL) -#define TCR_EPD1_TTBR1_DISABLED (1ULL << 23ULL) +#define TCR_A1_ASID1 (1ULL << 22ULL) +#define TCR_EPD1_TTBR1_DISABLED (1ULL << 23ULL) -#define TCR_IRGN1_SHIFT 24ULL -#define TCR_IRGN1_DISABLED (0ULL << TCR_IRGN1_SHIFT) -#define TCR_IRGN1_WRITEBACK (1ULL << TCR_IRGN1_SHIFT) -#define TCR_IRGN1_WRITETHRU (2ULL << TCR_IRGN1_SHIFT) -#define TCR_IRGN1_WRITEBACKNO (3ULL << TCR_IRGN1_SHIFT) +#define TCR_IRGN1_SHIFT 24ULL +#define TCR_IRGN1_DISABLED (0ULL << TCR_IRGN1_SHIFT) +#define TCR_IRGN1_WRITEBACK (1ULL << TCR_IRGN1_SHIFT) +#define TCR_IRGN1_WRITETHRU (2ULL << TCR_IRGN1_SHIFT) +#define TCR_IRGN1_WRITEBACKNO (3ULL << TCR_IRGN1_SHIFT) -#define TCR_ORGN1_SHIFT 26ULL -#define TCR_ORGN1_DISABLED (0ULL << TCR_ORGN1_SHIFT) -#define TCR_ORGN1_WRITEBACK (1ULL << TCR_ORGN1_SHIFT) -#define TCR_ORGN1_WRITETHRU (2ULL << TCR_ORGN1_SHIFT) -#define TCR_ORGN1_WRITEBACKNO (3ULL << TCR_ORGN1_SHIFT) +#define TCR_ORGN1_SHIFT 26ULL +#define TCR_ORGN1_DISABLED (0ULL << TCR_ORGN1_SHIFT) +#define TCR_ORGN1_WRITEBACK (1ULL << TCR_ORGN1_SHIFT) +#define TCR_ORGN1_WRITETHRU (2ULL << TCR_ORGN1_SHIFT) +#define TCR_ORGN1_WRITEBACKNO (3ULL << TCR_ORGN1_SHIFT) -#define TCR_SH1_SHIFT 28ULL -#define TCR_SH1_NONE (0ULL << TCR_SH1_SHIFT) -#define TCR_SH1_OUTER (2ULL << TCR_SH1_SHIFT) -#define TCR_SH1_INNER (3ULL << TCR_SH1_SHIFT) +#define TCR_SH1_SHIFT 28ULL +#define TCR_SH1_NONE (0ULL << TCR_SH1_SHIFT) +#define TCR_SH1_OUTER (2ULL << TCR_SH1_SHIFT) +#define TCR_SH1_INNER (3ULL << TCR_SH1_SHIFT) -#define TCR_TG1_GRANULE_SHIFT 30ULL +#define TCR_TG1_GRANULE_SHIFT 30ULL -#define TCR_TG1_GRANULE_16KB (1ULL << TCR_TG1_GRANULE_SHIFT) -#define TCR_TG1_GRANULE_4KB (2ULL << TCR_TG1_GRANULE_SHIFT) -#define TCR_TG1_GRANULE_64KB (3ULL << TCR_TG1_GRANULE_SHIFT) +#define TCR_TG1_GRANULE_16KB (1ULL << TCR_TG1_GRANULE_SHIFT) +#define TCR_TG1_GRANULE_4KB (2ULL << TCR_TG1_GRANULE_SHIFT) +#define TCR_TG1_GRANULE_64KB (3ULL << TCR_TG1_GRANULE_SHIFT) #if __ARM_16K_PG__ -#define TCR_TG1_GRANULE_SIZE (TCR_TG1_GRANULE_16KB) +#define TCR_TG1_GRANULE_SIZE (TCR_TG1_GRANULE_16KB) #else -#define TCR_TG1_GRANULE_SIZE (TCR_TG1_GRANULE_4KB) +#define TCR_TG1_GRANULE_SIZE (TCR_TG1_GRANULE_4KB) #endif -#define TCR_IPS_SHIFT 32ULL -#define TCR_IPS_32BITS (0ULL << TCR_IPS_SHIFT) -#define TCR_IPS_36BITS (1ULL << TCR_IPS_SHIFT) -#define TCR_IPS_40BITS (2ULL << TCR_IPS_SHIFT) -#define TCR_IPS_42BITS (3ULL << TCR_IPS_SHIFT) -#define TCR_IPS_44BITS (4ULL << TCR_IPS_SHIFT) -#define TCR_IPS_48BITS (5ULL << TCR_IPS_SHIFT) - -#define TCR_AS_16BIT_ASID (1ULL << 36) -#define TCR_TBI0_TOPBYTE_IGNORED (1ULL << 37) -#define TCR_TBI1_TOPBYTE_IGNORED (1ULL << 38) +#define TCR_IPS_SHIFT 32ULL +#define TCR_IPS_32BITS (0ULL << TCR_IPS_SHIFT) +#define TCR_IPS_36BITS (1ULL << TCR_IPS_SHIFT) +#define TCR_IPS_40BITS (2ULL << TCR_IPS_SHIFT) +#define TCR_IPS_42BITS (3ULL << TCR_IPS_SHIFT) +#define TCR_IPS_44BITS (4ULL << TCR_IPS_SHIFT) +#define TCR_IPS_48BITS (5ULL << TCR_IPS_SHIFT) + +#define TCR_AS_16BIT_ASID (1ULL << 36) +#define TCR_TBI0_TOPBYTE_IGNORED (1ULL << 37) +#define TCR_TBI1_TOPBYTE_IGNORED (1ULL << 38) +#define TCR_TBID0_TBI_DATA_ONLY (1ULL << 51) +#define TCR_TBID1_TBI_DATA_ONLY (1ULL << 52) + +#if defined(HAS_APPLE_PAC) +#define TCR_TBID0_ENABLE TCR_TBID0_TBI_DATA_ONLY +#else +#define TCR_TBID0_ENABLE 0 +#endif /* * Multiprocessor Affinity Register (MPIDR_EL1) @@ -503,27 +530,21 @@ * +---------------------------------+--+-----+--+-----+----+----+ * * where - * U Uniprocessor - * MT Multi-threading at lowest affinity level - * Aff2 "1" - PCORE, "0" - ECORE - * Aff1 Cluster ID - * Aff0 CPU ID + * U: Uniprocessor + * MT: Multi-threading at lowest affinity level + * Aff2: "1" - PCORE, "0" - ECORE + * Aff1: Cluster ID + * Aff0: CPU ID */ -#define MPIDR_AFF0_MASK 0xFF -#define MPIDR_AFF1_MASK 0xFF00 -#define MPIDR_AFF1_SHIFT 8 -#define MPIDR_AFF2_MASK 0xFF0000 -#define MPIDR_AFF2_SHIFT 16 - -/* - * We currently use a 3 level page table (rather than the full 4 - * level page table). As a result, we do not have the full 48-bits - * of address space per TTBR (although the 16KB granule size lets us - * get very close). - */ -#if __ARM64_TWO_LEVEL_PMAP__ && !__ARM_16K_PG__ -#error ARM64 does not currently support a 2 level page table with 4KB pages -#endif /* __ARM64_TWO_LEVEL_PMAP__ */ +#define MPIDR_AFF0_SHIFT 0 +#define MPIDR_AFF0_WIDTH 8 +#define MPIDR_AFF0_MASK (((1 << MPIDR_AFF0_WIDTH) - 1) << MPIDR_AFF0_SHIFT) +#define MPIDR_AFF1_SHIFT 8 +#define MPIDR_AFF1_WIDTH 8 +#define MPIDR_AFF1_MASK (((1 << MPIDR_AFF1_WIDTH) - 1) << MPIDR_AFF1_SHIFT) +#define MPIDR_AFF2_SHIFT 16 +#define MPIDR_AFF2_WIDTH 8 +#define MPIDR_AFF2_MASK (((1 << MPIDR_AFF2_WIDTH) - 1) << MPIDR_AFF2_SHIFT) /* * TXSZ indicates the size of the range a TTBR covers. Currently, @@ -544,56 +565,60 @@ */ #endif /* __ARM_KERNEL_PROTECT__ */ #ifdef __ARM_16K_PG__ -#if __ARM64_TWO_LEVEL_PMAP__ -#define T0SZ_BOOT 28ULL -#elif __ARM64_PMAP_SUBPAGE_L1__ -#define T0SZ_BOOT 25ULL -#else /* __ARM64_TWO_LEVEL_PMAP__ */ -#define T0SZ_BOOT 17ULL -#endif /* __ARM64_TWO_LEVEL_PMAP__ */ +#if __ARM64_PMAP_SUBPAGE_L1__ +#define T0SZ_BOOT 25ULL +#else /* !__ARM64_PMAP_SUBPAGE_L1__ */ +#define T0SZ_BOOT 17ULL +#endif /* !__ARM64_PMAP_SUBPAGE_L1__ */ #else /* __ARM_16K_PG__ */ #if __ARM64_PMAP_SUBPAGE_L1__ -#define T0SZ_BOOT 26ULL +#define T0SZ_BOOT 26ULL #else /* __ARM64_PMAP_SUBPAGE_L1__ */ -#define T0SZ_BOOT 25ULL +#define T0SZ_BOOT 25ULL #endif /* __ARM64_PMAP_SUBPAGE_L1__ */ #endif /* __ARM_16K_PG__ */ #if defined(APPLE_ARM64_ARCH_FAMILY) /* T0SZ must be the same as T1SZ */ -#define T1SZ_BOOT T0SZ_BOOT +#define T1SZ_BOOT T0SZ_BOOT #else /* defined(APPLE_ARM64_ARCH_FAMILY) */ #ifdef __ARM_16K_PG__ -#if __ARM64_TWO_LEVEL_PMAP__ -#define T1SZ_BOOT 28ULL -#elif __ARM64_PMAP_SUBPAGE_L1__ -#define T1SZ_BOOT 25ULL -#else /* __ARM64_TWO_LEVEL_PMAP__ */ -#define T1SZ_BOOT 17ULL -#endif /* __ARM64_TWO_LEVEL_PMAP__ */ +#if __ARM64_PMAP_SUBPAGE_L1__ +#define T1SZ_BOOT 25ULL +#else /* !__ARM64_PMAP_SUBPAGE_L1__ */ +#define T1SZ_BOOT 17ULL +#endif /* !__ARM64_PMAP_SUBPAGE_L1__ */ #else /* __ARM_16K_PG__ */ #if __ARM64_PMAP_SUBPAGE_L1__ -#define T1SZ_BOOT 26ULL +#define T1SZ_BOOT 26ULL #else /* __ARM64_PMAP_SUBPAGE_L1__ */ -#define T1SZ_BOOT 25ULL +#define T1SZ_BOOT 25ULL #endif /*__ARM64_PMAP_SUBPAGE_L1__*/ #endif /* __ARM_16K_PG__ */ #endif /* defined(APPLE_ARM64_ARCH_FAMILY) */ -#define TCR_EL1_BASE (TCR_IPS_40BITS | \ - TCR_SH0_OUTER | TCR_ORGN0_WRITEBACK | TCR_IRGN0_WRITEBACK | (T0SZ_BOOT << TCR_T0SZ_SHIFT) | (TCR_TG0_GRANULE_SIZE) |\ - TCR_SH1_OUTER | TCR_ORGN1_WRITEBACK | TCR_IRGN1_WRITEBACK | (TCR_TG1_GRANULE_SIZE)) +#if __ARM_42BIT_PA_SPACE__ +#define TCR_IPS_VALUE TCR_IPS_42BITS +#else /* !__ARM_42BIT_PA_SPACE__ */ +#define TCR_IPS_VALUE TCR_IPS_40BITS +#endif /* !__ARM_42BIT_PA_SPACE__ */ + +#define TCR_EL1_BASE \ + (TCR_IPS_VALUE | TCR_SH0_OUTER | TCR_ORGN0_WRITEBACK | \ + TCR_IRGN0_WRITEBACK | (T0SZ_BOOT << TCR_T0SZ_SHIFT) | \ + (TCR_TG0_GRANULE_SIZE) | TCR_SH1_OUTER | TCR_ORGN1_WRITEBACK | \ + TCR_IRGN1_WRITEBACK | (TCR_TG1_GRANULE_SIZE) | \ + TCR_TBI0_TOPBYTE_IGNORED | (TCR_TBID0_ENABLE)) #if __ARM_KERNEL_PROTECT__ -#define TCR_EL1_BOOT (TCR_EL1_BASE | \ - (T1SZ_BOOT << TCR_T1SZ_SHIFT) | TCR_TBI0_TOPBYTE_IGNORED) -#define T1SZ_USER (T1SZ_BOOT + 1) -#define TCR_EL1_USER (TCR_EL1_BASE | (T1SZ_USER << TCR_T1SZ_SHIFT) | TCR_TBI0_TOPBYTE_IGNORED) +#define TCR_EL1_BOOT (TCR_EL1_BASE | (T1SZ_BOOT << TCR_T1SZ_SHIFT)) +#define T1SZ_USER (T1SZ_BOOT + 1) +#define TCR_EL1_USER (TCR_EL1_BASE | (T1SZ_USER << TCR_T1SZ_SHIFT)) #else -#define TCR_EL1_BOOT (TCR_EL1_BASE | \ - (T1SZ_BOOT << TCR_T1SZ_SHIFT)) +#define TCR_EL1_BOOT (TCR_EL1_BASE | (T1SZ_BOOT << TCR_T1SZ_SHIFT)) #endif /* __ARM_KERNEL_PROTECT__ */ + /* * Translation Table Base Register (TTBR) * @@ -603,10 +628,10 @@ * +--------+------------------+------+ * */ -#define TTBR_ASID_SHIFT 48 -#define TTBR_ASID_MASK 0xffff000000000000 +#define TTBR_ASID_SHIFT 48 +#define TTBR_ASID_MASK 0xffff000000000000 -#define TTBR_BADDR_MASK 0x0000ffffffffffff +#define TTBR_BADDR_MASK 0x0000ffffffffffff /* * Memory Attribute Indirection Register @@ -618,85 +643,93 @@ * */ -#define MAIR_ATTR_SHIFT(x) (8*(x)) +#define MAIR_ATTR_SHIFT(x) (8*(x)) /* Strongly ordered or device memory attributes */ -#define MAIR_OUTER_STRONGLY_ORDERED 0x0 -#define MAIR_OUTER_DEVICE 0x0 +#define MAIR_OUTER_STRONGLY_ORDERED 0x0 +#define MAIR_OUTER_DEVICE 0x0 -#define MAIR_INNER_STRONGLY_ORDERED 0x0 -#define MAIR_INNER_DEVICE 0x4 +#define MAIR_INNER_STRONGLY_ORDERED 0x0 +#define MAIR_INNER_DEVICE 0x4 /* Normal memory attributes */ -#define MAIR_OUTER_NON_CACHEABLE 0x40 -#define MAIR_OUTER_WRITE_THROUGH 0x80 -#define MAIR_OUTER_WRITE_BACK 0xc0 +#define MAIR_OUTER_NON_CACHEABLE 0x40 +#define MAIR_OUTER_WRITE_THROUGH 0x80 +#define MAIR_OUTER_WRITE_BACK 0xc0 -#define MAIR_INNER_NON_CACHEABLE 0x4 -#define MAIR_INNER_WRITE_THROUGH 0x8 -#define MAIR_INNER_WRITE_BACK 0xc +#define MAIR_INNER_NON_CACHEABLE 0x4 +#define MAIR_INNER_WRITE_THROUGH 0x8 +#define MAIR_INNER_WRITE_BACK 0xc /* Allocate policy for cacheable memory */ -#define MAIR_OUTER_WRITE_ALLOCATE 0x10 -#define MAIR_OUTER_READ_ALLOCATE 0x20 +#define MAIR_OUTER_WRITE_ALLOCATE 0x10 +#define MAIR_OUTER_READ_ALLOCATE 0x20 -#define MAIR_INNER_WRITE_ALLOCATE 0x1 -#define MAIR_INNER_READ_ALLOCATE 0x2 +#define MAIR_INNER_WRITE_ALLOCATE 0x1 +#define MAIR_INNER_READ_ALLOCATE 0x2 /* Memory Atribute Encoding */ -/* Device memory types: - * G (gathering): multiple reads/writes can be combined - * R (reordering): reads or writes may reach device out of program order - * E (early-acknowledge): writes may return immediately (e.g. PCIe posted writes) +/* + * Device memory types: + * G (gathering): multiple reads/writes can be combined + * R (reordering): reads or writes may reach device out of program order + * E (early-acknowledge): writes may return immediately (e.g. PCIe posted writes) */ -#define MAIR_DISABLE 0x00 /* Device Memory, nGnRnE (strongly ordered) */ -#define MAIR_POSTED 0x04 /* Device Memory, nGnRE (strongly ordered, posted writes) */ -#define MAIR_WRITECOMB 0x44 /* Normal Memory, Outer Non-Cacheable, Inner Non-Cacheable */ -#define MAIR_WRITETHRU 0xBB /* Normal Memory, Outer Write-through, Inner Write-through */ -#define MAIR_WRITEBACK 0xFF /* Normal Memory, Outer Write-back, Inner Write-back */ -#define MAIR_INNERWRITEBACK 0x4F /* Normal Memory, Outer Non-Cacheable, Inner Write-back */ +#define MAIR_DISABLE 0x00 /* Device Memory, nGnRnE (strongly ordered) */ +#define MAIR_POSTED 0x04 /* Device Memory, nGnRE (strongly ordered, posted writes) */ +#define MAIR_POSTED_REORDERED 0x08 /* Device Memory, nGRE (reorderable, posted writes) */ +#define MAIR_POSTED_COMBINED_REORDERED 0x0C /* Device Memory, GRE (reorderable, gathered writes, posted writes) */ +#define MAIR_WRITECOMB 0x44 /* Normal Memory, Outer Non-Cacheable, Inner Non-Cacheable */ +#define MAIR_WRITETHRU 0xBB /* Normal Memory, Outer Write-through, Inner Write-through */ +#define MAIR_WRITEBACK 0xFF /* Normal Memory, Outer Write-back, Inner Write-back */ +#define MAIR_INNERWRITEBACK 0x4F /* Normal Memory, Outer Non-Cacheable, Inner Write-back */ /* - * ARM 4-level Page Table support - 2*1024TB (2^48) of address space + * ARM 4-level Page Table support - 2*1024TB (2^48) of address space */ /* * Memory Attribute Index */ -#define CACHE_ATTRINDX_WRITEBACK 0x0 /* cache enabled, buffer enabled */ -#define CACHE_ATTRINDX_WRITECOMB 0x1 /* no cache, buffered writes */ -#define CACHE_ATTRINDX_WRITETHRU 0x2 /* cache enabled, buffer disabled */ -#define CACHE_ATTRINDX_DISABLE 0x3 /* no cache, no buffer */ -#define CACHE_ATTRINDX_INNERWRITEBACK 0x4 /* inner cache enabled, buffer enabled, write allocate */ -#define CACHE_ATTRINDX_POSTED 0x5 /* no cache, no buffer, posted writes */ -#define CACHE_ATTRINDX_DEFAULT CACHE_ATTRINDX_WRITEBACK +#define CACHE_ATTRINDX_WRITEBACK 0x0 /* cache enabled, buffer enabled (normal memory) */ +#define CACHE_ATTRINDX_WRITECOMB 0x1 /* no cache, buffered writes (normal memory) */ +#define CACHE_ATTRINDX_WRITETHRU 0x2 /* cache enabled, buffer disabled (normal memory) */ +#define CACHE_ATTRINDX_DISABLE 0x3 /* no cache, no buffer (device memory) */ +#define CACHE_ATTRINDX_INNERWRITEBACK 0x4 /* inner cache enabled, buffer enabled, write allocate (normal memory) */ +#define CACHE_ATTRINDX_POSTED 0x5 /* no cache, no buffer, posted writes (device memory) */ +#define CACHE_ATTRINDX_POSTED_REORDERED 0x6 /* no cache, reorderable access, posted writes (device memory) */ +#define CACHE_ATTRINDX_POSTED_COMBINED_REORDERED 0x7 /* no cache, write gathering, reorderable access, posted writes (device memory) */ +#define CACHE_ATTRINDX_DEFAULT CACHE_ATTRINDX_WRITEBACK + /* - * Access protection bit values (TTEs and PTEs) + * Access protection bit values (TTEs and PTEs), stage 1 + * + * Bit 1 controls access type (1=RO, 0=RW), bit 0 controls user (1=access, 0=no access) */ -#define AP_RWNA 0x0 /* priv=read-write, user=no-access */ -#define AP_RWRW 0x1 /* priv=read-write, user=read-write */ -#define AP_RONA 0x2 /* priv=read-only, user=no-access */ -#define AP_RORO 0x3 /* priv=read-only, user=read-only */ -#define AP_MASK 0x3 /* mask to find ap bits */ +#define AP_RWNA 0x0 /* priv=read-write, user=no-access */ +#define AP_RWRW 0x1 /* priv=read-write, user=read-write */ +#define AP_RONA 0x2 /* priv=read-only, user=no-access */ +#define AP_RORO 0x3 /* priv=read-only, user=read-only */ +#define AP_MASK 0x3 /* mask to find ap bits */ /* * Shareability attributes */ -#define SH_NONE 0x0 /* Non shareable */ -#define SH_NONE 0x0 /* Device shareable */ -#define SH_DEVICE 0x2 /* Normal memory Inner non shareable - Outer non shareable */ -#define SH_OUTER_MEMORY 0x2 /* Normal memory Inner shareable - Outer shareable */ -#define SH_INNER_MEMORY 0x3 /* Normal memory Inner shareable - Outer non shareable */ +#define SH_NONE 0x0 /* Non shareable */ +#define SH_NONE 0x0 /* Device shareable */ +#define SH_DEVICE 0x2 /* Normal memory Inner non shareable - Outer non shareable */ +#define SH_OUTER_MEMORY 0x2 /* Normal memory Inner shareable - Outer shareable */ +#define SH_INNER_MEMORY 0x3 /* Normal memory Inner shareable - Outer non shareable */ /* * ARM Page Granule */ -#ifdef __ARM_16K_PG__ +#ifdef __ARM_16K_PG__ #define ARM_PGSHIFT 14 #else #define ARM_PGSHIFT 12 @@ -704,7 +737,6 @@ #define ARM_PGBYTES (1 << ARM_PGSHIFT) #define ARM_PGMASK (ARM_PGBYTES-1) - /* * L0 Translation table * @@ -719,17 +751,17 @@ * Covers 256TB (2^48) of address space. */ -#ifdef __ARM_16K_PG__ -#define ARM_TT_L0_SIZE 0x0000800000000000ULL /* size of area covered by a tte */ -#define ARM_TT_L0_OFFMASK 0x00007fffffffffffULL /* offset within an L0 entry */ -#define ARM_TT_L0_SHIFT 47 /* page descriptor shift */ -#define ARM_TT_L0_INDEX_MASK 0x0000800000000000ULL /* mask for getting index in L0 table from virtual address */ -#else -#define ARM_TT_L0_SIZE 0x0000008000000000ULL /* size of area covered by a tte */ -#define ARM_TT_L0_OFFMASK 0x0000007fffffffffULL /* offset within an L0 entry */ -#define ARM_TT_L0_SHIFT 39 /* page descriptor shift */ -#define ARM_TT_L0_INDEX_MASK 0x0000ff8000000000ULL /* mask for getting index in L0 table from virtual address */ -#endif +/* 16K L0 */ +#define ARM_16K_TT_L0_SIZE 0x0000800000000000ULL /* size of area covered by a tte */ +#define ARM_16K_TT_L0_OFFMASK 0x00007fffffffffffULL /* offset within an L0 entry */ +#define ARM_16K_TT_L0_SHIFT 47 /* page descriptor shift */ +#define ARM_16K_TT_L0_INDEX_MASK 0x0000800000000000ULL /* mask for getting index in L0 table from virtual address */ + +/* 4K L0 */ +#define ARM_4K_TT_L0_SIZE 0x0000008000000000ULL /* size of area covered by a tte */ +#define ARM_4K_TT_L0_OFFMASK 0x0000007fffffffffULL /* offset within an L0 entry */ +#define ARM_4K_TT_L0_SHIFT 39 /* page descriptor shift */ +#define ARM_4K_TT_L0_INDEX_MASK 0x0000ff8000000000ULL /* mask for getting index in L0 table from virtual address */ /* * L1 Translation table @@ -745,27 +777,27 @@ * Covers 128TB (2^47) of address space. */ -#ifdef __ARM_16K_PG__ -#define ARM_TT_L1_SIZE 0x0000001000000000ULL /* size of area covered by a tte */ -#define ARM_TT_L1_OFFMASK 0x0000000fffffffffULL /* offset within an L1 entry */ -#define ARM_TT_L1_SHIFT 36 /* page descriptor shift */ +/* 16K L1 */ +#define ARM_16K_TT_L1_SIZE 0x0000001000000000ULL /* size of area covered by a tte */ +#define ARM_16K_TT_L1_OFFMASK 0x0000000fffffffffULL /* offset within an L1 entry */ +#define ARM_16K_TT_L1_SHIFT 36 /* page descriptor shift */ #ifdef __ARM64_PMAP_SUBPAGE_L1__ /* This config supports 512GB per TTBR. */ -#define ARM_TT_L1_INDEX_MASK 0x0000007000000000ULL /* mask for getting index into L1 table from virtual address */ +#define ARM_16K_TT_L1_INDEX_MASK 0x0000007000000000ULL /* mask for getting index into L1 table from virtual address */ #else /* __ARM64_PMAP_SUBPAGE_L1__ */ -#define ARM_TT_L1_INDEX_MASK 0x00007ff000000000ULL /* mask for getting index into L1 table from virtual address */ +#define ARM_16K_TT_L1_INDEX_MASK 0x00007ff000000000ULL /* mask for getting index into L1 table from virtual address */ #endif /* __ARM64_PMAP_SUBPAGE_L1__ */ -#else /* __ARM_16K_PG__ */ -#define ARM_TT_L1_SIZE 0x0000000040000000ULL /* size of area covered by a tte */ -#define ARM_TT_L1_OFFMASK 0x000000003fffffffULL /* offset within an L1 entry */ -#define ARM_TT_L1_SHIFT 30 /* page descriptor shift */ + +/* 4K L1 */ +#define ARM_4K_TT_L1_SIZE 0x0000000040000000ULL /* size of area covered by a tte */ +#define ARM_4K_TT_L1_OFFMASK 0x000000003fffffffULL /* offset within an L1 entry */ +#define ARM_4K_TT_L1_SHIFT 30 /* page descriptor shift */ #ifdef __ARM64_PMAP_SUBPAGE_L1__ /* This config supports 256GB per TTBR. */ -#define ARM_TT_L1_INDEX_MASK 0x0000003fc0000000ULL /* mask for getting index into L1 table from virtual address */ +#define ARM_4K_TT_L1_INDEX_MASK 0x0000003fc0000000ULL /* mask for getting index into L1 table from virtual address */ #else /* __ARM64_PMAP_SUBPAGE_L1__ */ -#define ARM_TT_L1_INDEX_MASK 0x0000007fc0000000ULL /* mask for getting index into L1 table from virtual address */ +#define ARM_4K_TT_L1_INDEX_MASK 0x0000007fc0000000ULL /* mask for getting index into L1 table from virtual address */ #endif /* __ARM64_PMAP_SUBPAGE_L1__ */ -#endif /* some sugar for getting pointers to page tables and entries */ @@ -773,7 +805,7 @@ #define L2_TABLE_INDEX(va) (((va) & ARM_TT_L2_INDEX_MASK) >> ARM_TT_L2_SHIFT) #define L3_TABLE_INDEX(va) (((va) & ARM_TT_L3_INDEX_MASK) >> ARM_TT_L3_SHIFT) -#define L2_TABLE_VA(tte) ((tt_entry_t*) phystokv((*(tte)) & ARM_TTE_TABLE_MASK)) +#define L2_TABLE_VA(tte) ((tt_entry_t*) phystokv((*(tte)) & ARM_TTE_TABLE_MASK)) #define L3_TABLE_VA(tte2) ((pt_entry_t*) phystokv((*(tte2)) & ARM_TTE_TABLE_MASK)) /* @@ -790,17 +822,17 @@ * Covers 64GB (2^36) of address space. */ -#ifdef __ARM_16K_PG__ -#define ARM_TT_L2_SIZE 0x0000000002000000ULL /* size of area covered by a tte */ -#define ARM_TT_L2_OFFMASK 0x0000000001ffffffULL /* offset within an L2 entry */ -#define ARM_TT_L2_SHIFT 25 /* page descriptor shift */ -#define ARM_TT_L2_INDEX_MASK 0x0000000ffe000000ULL /* mask for getting index in L2 table from virtual address */ -#else -#define ARM_TT_L2_SIZE 0x0000000000200000ULL /* size of area covered by a tte */ -#define ARM_TT_L2_OFFMASK 0x00000000001fffffULL /* offset within an L2 entry */ -#define ARM_TT_L2_SHIFT 21 /* page descriptor shift */ -#define ARM_TT_L2_INDEX_MASK 0x000000003fe00000ULL /* mask for getting index in L2 table from virtual address */ -#endif +/* 16K L2 */ +#define ARM_16K_TT_L2_SIZE 0x0000000002000000ULL /* size of area covered by a tte */ +#define ARM_16K_TT_L2_OFFMASK 0x0000000001ffffffULL /* offset within an L2 entry */ +#define ARM_16K_TT_L2_SHIFT 25 /* page descriptor shift */ +#define ARM_16K_TT_L2_INDEX_MASK 0x0000000ffe000000ULL /* mask for getting index in L2 table from virtual address */ + +/* 4K L2 */ +#define ARM_4K_TT_L2_SIZE 0x0000000000200000ULL /* size of area covered by a tte */ +#define ARM_4K_TT_L2_OFFMASK 0x00000000001fffffULL /* offset within an L2 entry */ +#define ARM_4K_TT_L2_SHIFT 21 /* page descriptor shift */ +#define ARM_4K_TT_L2_INDEX_MASK 0x000000003fe00000ULL /* mask for getting index in L2 table from virtual address */ /* * L3 Translation table @@ -816,17 +848,71 @@ * Covers 32MB (2^25) of address space. */ +/* 16K L3 */ +#define ARM_16K_TT_L3_SIZE 0x0000000000004000ULL /* size of area covered by a tte */ +#define ARM_16K_TT_L3_OFFMASK 0x0000000000003fffULL /* offset within L3 PTE */ +#define ARM_16K_TT_L3_SHIFT 14 /* page descriptor shift */ +#define ARM_16K_TT_L3_INDEX_MASK 0x0000000001ffc000ULL /* mask for page descriptor index */ + +/* 4K L3 */ +#define ARM_4K_TT_L3_SIZE 0x0000000000001000ULL /* size of area covered by a tte */ +#define ARM_4K_TT_L3_OFFMASK 0x0000000000000fffULL /* offset within L3 PTE */ +#define ARM_4K_TT_L3_SHIFT 12 /* page descriptor shift */ +#define ARM_4K_TT_L3_INDEX_MASK 0x00000000001ff000ULL /* mask for page descriptor index */ + #ifdef __ARM_16K_PG__ -#define ARM_TT_L3_SIZE 0x0000000000004000ULL /* size of area covered by a tte */ -#define ARM_TT_L3_OFFMASK 0x0000000000003fffULL /* offset within L3 PTE */ -#define ARM_TT_L3_SHIFT 14 /* page descriptor shift */ -#define ARM_TT_L3_INDEX_MASK 0x0000000001ffc000ULL /* mask for page descriptor index */ -#else -#define ARM_TT_L3_SIZE 0x0000000000001000ULL /* size of area covered by a tte */ -#define ARM_TT_L3_OFFMASK 0x0000000000000fffULL /* offset within L3 PTE */ -#define ARM_TT_L3_SHIFT 12 /* page descriptor shift */ -#define ARM_TT_L3_INDEX_MASK 0x00000000001ff000ULL /* mask for page descriptor index */ -#endif + +/* Native L0 defines */ +#define ARM_TT_L0_SIZE ARM_16K_TT_L0_SIZE +#define ARM_TT_L0_OFFMASK ARM_16K_TT_L0_OFFMASK +#define ARM_TT_L0_SHIFT ARM_16K_TT_L0_SHIFT +#define ARM_TT_L0_INDEX_MASK ARM_16K_TT_L0_INDEX_MASK + +/* Native L1 defines */ +#define ARM_TT_L1_SIZE ARM_16K_TT_L1_SIZE +#define ARM_TT_L1_OFFMASK ARM_16K_TT_L1_OFFMASK +#define ARM_TT_L1_SHIFT ARM_16K_TT_L1_SHIFT +#define ARM_TT_L1_INDEX_MASK ARM_16K_TT_L1_INDEX_MASK + +/* Native L2 defines */ +#define ARM_TT_L2_SIZE ARM_16K_TT_L2_SIZE +#define ARM_TT_L2_OFFMASK ARM_16K_TT_L2_OFFMASK +#define ARM_TT_L2_SHIFT ARM_16K_TT_L2_SHIFT +#define ARM_TT_L2_INDEX_MASK ARM_16K_TT_L2_INDEX_MASK + +/* Native L3 defines */ +#define ARM_TT_L3_SIZE ARM_16K_TT_L3_SIZE +#define ARM_TT_L3_OFFMASK ARM_16K_TT_L3_OFFMASK +#define ARM_TT_L3_SHIFT ARM_16K_TT_L3_SHIFT +#define ARM_TT_L3_INDEX_MASK ARM_16K_TT_L3_INDEX_MASK + +#else /* !__ARM_16K_PG__ */ + +/* Native L0 defines */ +#define ARM_TT_L0_SIZE ARM_4K_TT_L0_SIZE +#define ARM_TT_L0_OFFMASK ARM_4K_TT_L0_OFFMASK +#define ARM_TT_L0_SHIFT ARM_4K_TT_L0_SHIFT +#define ARM_TT_L0_INDEX_MASK ARM_4K_TT_L0_INDEX_MASK + +/* Native L1 defines */ +#define ARM_TT_L1_SIZE ARM_4K_TT_L1_SIZE +#define ARM_TT_L1_OFFMASK ARM_4K_TT_L1_OFFMASK +#define ARM_TT_L1_SHIFT ARM_4K_TT_L1_SHIFT +#define ARM_TT_L1_INDEX_MASK ARM_4K_TT_L1_INDEX_MASK + +/* Native L2 defines */ +#define ARM_TT_L2_SIZE ARM_4K_TT_L2_SIZE +#define ARM_TT_L2_OFFMASK ARM_4K_TT_L2_OFFMASK +#define ARM_TT_L2_SHIFT ARM_4K_TT_L2_SHIFT +#define ARM_TT_L2_INDEX_MASK ARM_4K_TT_L2_INDEX_MASK + +/* Native L3 defines */ +#define ARM_TT_L3_SIZE ARM_4K_TT_L3_SIZE +#define ARM_TT_L3_OFFMASK ARM_4K_TT_L3_OFFMASK +#define ARM_TT_L3_SHIFT ARM_4K_TT_L3_SHIFT +#define ARM_TT_L3_INDEX_MASK ARM_4K_TT_L3_INDEX_MASK + +#endif /* !__ARM_16K_PG__ */ /* * Convenience definitions for: @@ -836,27 +922,20 @@ * * My apologies to any botanists who may be reading this. */ -#define ARM_TT_LEAF_SIZE ARM_TT_L3_SIZE -#define ARM_TT_LEAF_OFFMASK ARM_TT_L3_OFFMASK -#define ARM_TT_LEAF_SHIFT ARM_TT_L3_SHIFT -#define ARM_TT_LEAF_INDEX_MASK ARM_TT_L3_INDEX_MASK - -#define ARM_TT_TWIG_SIZE ARM_TT_L2_SIZE -#define ARM_TT_TWIG_OFFMASK ARM_TT_L2_OFFMASK -#define ARM_TT_TWIG_SHIFT ARM_TT_L2_SHIFT -#define ARM_TT_TWIG_INDEX_MASK ARM_TT_L2_INDEX_MASK - -#if __ARM64_TWO_LEVEL_PMAP__ -#define ARM_TT_ROOT_SIZE ARM_TT_L2_SIZE -#define ARM_TT_ROOT_OFFMASK ARM_TT_L2_OFFMASK -#define ARM_TT_ROOT_SHIFT ARM_TT_L2_SHIFT -#define ARM_TT_ROOT_INDEX_MASK ARM_TT_L2_INDEX_MASK -#else -#define ARM_TT_ROOT_SIZE ARM_TT_L1_SIZE -#define ARM_TT_ROOT_OFFMASK ARM_TT_L1_OFFMASK -#define ARM_TT_ROOT_SHIFT ARM_TT_L1_SHIFT -#define ARM_TT_ROOT_INDEX_MASK ARM_TT_L1_INDEX_MASK -#endif +#define ARM_TT_LEAF_SIZE ARM_TT_L3_SIZE +#define ARM_TT_LEAF_OFFMASK ARM_TT_L3_OFFMASK +#define ARM_TT_LEAF_SHIFT ARM_TT_L3_SHIFT +#define ARM_TT_LEAF_INDEX_MASK ARM_TT_L3_INDEX_MASK + +#define ARM_TT_TWIG_SIZE ARM_TT_L2_SIZE +#define ARM_TT_TWIG_OFFMASK ARM_TT_L2_OFFMASK +#define ARM_TT_TWIG_SHIFT ARM_TT_L2_SHIFT +#define ARM_TT_TWIG_INDEX_MASK ARM_TT_L2_INDEX_MASK + +#define ARM_TT_ROOT_SIZE ARM_TT_L1_SIZE +#define ARM_TT_ROOT_OFFMASK ARM_TT_L1_OFFMASK +#define ARM_TT_ROOT_SHIFT ARM_TT_L1_SHIFT +#define ARM_TT_ROOT_INDEX_MASK ARM_TT_L1_INDEX_MASK /* * 4KB granule size: @@ -927,116 +1006,121 @@ * +-----+------+--+---+----+------+----------------------+------+--+--+----+----+--+-------+-+-+ * * where: - * 'nG' notGlobal bit - * 'SH' Shareability field - * 'AP' access protection - * 'XN' eXecute Never bit - * 'PXN' Privilege eXecute Never bit - * 'NS' Non-Secure bit - * 'HINT' 16 entry continuguous output hint - * 'AttrIdx' Memory Attribute Index + * nG: notGlobal bit + * SH: Shareability field + * AP: access protection + * XN: eXecute Never bit + * PXN: Privilege eXecute Never bit + * NS: Non-Secure bit + * HINT: 16 entry continuguous output hint + * AttrIdx: Memory Attribute Index */ -#define TTE_SHIFT 3 /* shift width of a tte (sizeof(tte) == (1 << TTE_SHIFT)) */ +#define TTE_SHIFT 3 /* shift width of a tte (sizeof(tte) == (1 << TTE_SHIFT)) */ #ifdef __ARM_16K_PG__ -#define TTE_PGENTRIES (16384 >> TTE_SHIFT) /* number of ttes per page */ +#define TTE_PGENTRIES (16384 >> TTE_SHIFT) /* number of ttes per page */ #else -#define TTE_PGENTRIES (4096 >> TTE_SHIFT) /* number of ttes per page */ +#define TTE_PGENTRIES (4096 >> TTE_SHIFT) /* number of ttes per page */ #endif -#define ARM_TTE_MAX (TTE_PGENTRIES) +#define ARM_TTE_MAX (TTE_PGENTRIES) -#define ARM_TTE_EMPTY 0x0000000000000000ULL /* unasigned - invalid entry */ -#define ARM_TTE_TYPE_FAULT 0x0000000000000000ULL /* unasigned - invalid entry */ +#define ARM_TTE_EMPTY 0x0000000000000000ULL /* unasigned - invalid entry */ +#define ARM_TTE_TYPE_FAULT 0x0000000000000000ULL /* unasigned - invalid entry */ -#define ARM_TTE_VALID 0x0000000000000001ULL /* valid entry */ +#define ARM_TTE_VALID 0x0000000000000001ULL /* valid entry */ -#define ARM_TTE_TYPE_MASK 0x0000000000000002ULL /* mask for extracting the type */ -#define ARM_TTE_TYPE_TABLE 0x0000000000000002ULL /* page table type */ -#define ARM_TTE_TYPE_BLOCK 0x0000000000000000ULL /* block entry type */ -#define ARM_TTE_TYPE_L3BLOCK 0x0000000000000002ULL -#define ARM_TTE_TYPE_MASK 0x0000000000000002ULL /* mask for extracting the type */ +#define ARM_TTE_TYPE_MASK 0x0000000000000002ULL /* mask for extracting the type */ +#define ARM_TTE_TYPE_TABLE 0x0000000000000002ULL /* page table type */ +#define ARM_TTE_TYPE_BLOCK 0x0000000000000000ULL /* block entry type */ +#define ARM_TTE_TYPE_L3BLOCK 0x0000000000000002ULL +#define ARM_TTE_TYPE_MASK 0x0000000000000002ULL /* mask for extracting the type */ #ifdef __ARM_16K_PG__ -/* Note that L0/L1 block entries are disallowed for the 16KB granule size; what are we doing with these? */ -#define ARM_TTE_BLOCK_SHIFT 12 /* entry shift for a 16KB L3 TTE entry */ -#define ARM_TTE_BLOCK_L0_SHIFT ARM_TT_L0_SHIFT /* block shift for 128TB section */ -#define ARM_TTE_BLOCK_L1_MASK 0x0000fff000000000ULL /* mask to extract phys address from L1 block entry */ -#define ARM_TTE_BLOCK_L1_SHIFT ARM_TT_L1_SHIFT /* block shift for 64GB section */ -#define ARM_TTE_BLOCK_L2_MASK 0x0000fffffe000000ULL /* mask to extract phys address from Level 2 Translation Block entry */ -#define ARM_TTE_BLOCK_L2_SHIFT ARM_TT_L2_SHIFT /* block shift for 32MB section */ +/* + * Note that L0/L1 block entries are disallowed for the 16KB granule size; what + * are we doing with these? + */ +#define ARM_TTE_BLOCK_SHIFT 12 /* entry shift for a 16KB L3 TTE entry */ +#define ARM_TTE_BLOCK_L0_SHIFT ARM_TT_L0_SHIFT /* block shift for 128TB section */ +#define ARM_TTE_BLOCK_L1_MASK 0x0000fff000000000ULL /* mask to extract phys address from L1 block entry */ +#define ARM_TTE_BLOCK_L1_SHIFT ARM_TT_L1_SHIFT /* block shift for 64GB section */ +#define ARM_TTE_BLOCK_L2_MASK 0x0000fffffe000000ULL /* mask to extract phys address from Level 2 Translation Block entry */ +#define ARM_TTE_BLOCK_L2_SHIFT ARM_TT_L2_SHIFT /* block shift for 32MB section */ #else -#define ARM_TTE_BLOCK_SHIFT 12 /* entry shift for a 4KB L3 TTE entry */ -#define ARM_TTE_BLOCK_L0_SHIFT ARM_TT_L0_SHIFT /* block shift for 2048GB section */ -#define ARM_TTE_BLOCK_L1_MASK 0x0000ffffc0000000ULL /* mask to extract phys address from L1 block entry */ -#define ARM_TTE_BLOCK_L1_SHIFT ARM_TT_L1_SHIFT /* block shift for 1GB section */ -#define ARM_TTE_BLOCK_L2_MASK 0x0000ffffffe00000ULL /* mask to extract phys address from Level 2 Translation Block entry */ -#define ARM_TTE_BLOCK_L2_SHIFT ARM_TT_L2_SHIFT /* block shift for 2MB section */ +#define ARM_TTE_BLOCK_SHIFT 12 /* entry shift for a 4KB L3 TTE entry */ +#define ARM_TTE_BLOCK_L0_SHIFT ARM_TT_L0_SHIFT /* block shift for 2048GB section */ +#define ARM_TTE_BLOCK_L1_MASK 0x0000ffffc0000000ULL /* mask to extract phys address from L1 block entry */ +#define ARM_TTE_BLOCK_L1_SHIFT ARM_TT_L1_SHIFT /* block shift for 1GB section */ +#define ARM_TTE_BLOCK_L2_MASK 0x0000ffffffe00000ULL /* mask to extract phys address from Level 2 Translation Block entry */ +#define ARM_TTE_BLOCK_L2_SHIFT ARM_TT_L2_SHIFT /* block shift for 2MB section */ #endif -#define ARM_TTE_BLOCK_APSHIFT 6 -#define ARM_TTE_BLOCK_AP(x) ((x)<> PTE_SHIFT) /* number of ptes per page */ +#define PTE_PGENTRIES (16384 >> PTE_SHIFT) /* number of ptes per page */ #else -#define PTE_PGENTRIES (4096 >> PTE_SHIFT) /* number of ptes per page */ +#define PTE_PGENTRIES (4096 >> PTE_SHIFT) /* number of ptes per page */ #endif -#define ARM_PTE_EMPTY 0x0000000000000000ULL /* unasigned - invalid entry */ +#define ARM_PTE_EMPTY 0x0000000000000000ULL /* unassigned - invalid entry */ /* markers for (invalid) PTE for a page sent to compressor */ -#define ARM_PTE_COMPRESSED 0x8000000000000000ULL /* compressed... */ -#define ARM_PTE_COMPRESSED_ALT 0x4000000000000000ULL /* ... and was "alt_acct" */ -#define ARM_PTE_COMPRESSED_MASK 0xC000000000000000ULL -#define ARM_PTE_IS_COMPRESSED(x) \ - ((((x) & 0x3) == 0) && /* PTE is not valid... */ \ - ((x) & ARM_PTE_COMPRESSED) && /* ...has "compressed" marker" */ \ - ((!((x) & ~ARM_PTE_COMPRESSED_MASK)) || /* ...no other bits */ \ - (panic("compressed PTE %p 0x%llx has extra bits 0x%llx: corrupted?", \ - &(x), (x), (x) & ~ARM_PTE_COMPRESSED_MASK), FALSE))) - -#define ARM_PTE_TYPE 0x0000000000000003ULL /* valid L3 entry: includes bit #1 (counterintuitively) */ -#define ARM_PTE_TYPE_VALID 0x0000000000000003ULL /* valid L3 entry: includes bit #1 (counterintuitively) */ -#define ARM_PTE_TYPE_FAULT 0x0000000000000000ULL /* invalid L3 entry */ -#define ARM_PTE_TYPE_MASK 0x0000000000000002ULL /* mask to get pte type */ +#define ARM_PTE_COMPRESSED 0x8000000000000000ULL /* compressed... */ +#define ARM_PTE_COMPRESSED_ALT 0x4000000000000000ULL /* ... and was "alt_acct" */ +#define ARM_PTE_COMPRESSED_MASK 0xC000000000000000ULL + +#define ARM_PTE_IS_COMPRESSED(x, p) \ + ((((x) & 0x3) == 0) && /* PTE is not valid... */ \ + ((x) & ARM_PTE_COMPRESSED) && /* ...has "compressed" marker" */ \ + ((!((x) & ~ARM_PTE_COMPRESSED_MASK)) || /* ...no other bits */ \ + (panic("compressed PTE %p 0x%llx has extra bits 0x%llx: corrupted?", \ + (p), (x), (x) & ~ARM_PTE_COMPRESSED_MASK), FALSE))) + +#define ARM_PTE_TYPE 0x0000000000000003ULL /* valid L3 entry: includes bit #1 (counterintuitively) */ +#define ARM_PTE_TYPE_VALID 0x0000000000000003ULL /* valid L3 entry: includes bit #1 (counterintuitively) */ +#define ARM_PTE_TYPE_FAULT 0x0000000000000000ULL /* invalid L3 entry */ +#define ARM_PTE_TYPE_MASK 0x0000000000000002ULL /* mask to get pte type */ #ifdef __ARM_16K_PG__ /* TODO: What does the shift mean here? */ -#define ARM_PTE_PAGE_MASK 0x0000FFFFFFFFC000ULL /* mask for 16KB page */ +#define ARM_PTE_PAGE_MASK 0x0000FFFFFFFFC000ULL /* mask for 16KB page */ #else -#define ARM_PTE_PAGE_MASK 0x0000FFFFFFFFF000ULL /* mask for 4KB page */ -#define ARM_PTE_PAGE_SHIFT 12 /* page shift for 4KB page */ +#define ARM_PTE_PAGE_MASK 0x0000FFFFFFFFF000ULL /* mask for 4KB page */ +#define ARM_PTE_PAGE_SHIFT 12 /* page shift for 4KB page */ #endif -#define ARM_PTE_AP(x) ((x) << 6) /* access protections */ -#define ARM_PTE_APMASK (0x3ULL << 6) /* mask access protections */ -#define ARM_PTE_EXTRACT_AP(x) (((x) >> 6) & 0x3ULL) /* extract access protections from PTE */ +#define ARM_PTE_AP(x) ((x) << 6) /* access protections */ +#define ARM_PTE_APMASK (0x3ULL << 6) /* mask access protections */ +#define ARM_PTE_EXTRACT_AP(x) (((x) >> 6) & 0x3ULL) /* extract access protections from PTE */ -#define ARM_PTE_ATTRINDX(x) ((x) << 2) /* memory attributes index */ -#define ARM_PTE_ATTRINDXMASK (0x7ULL << 2) /* mask memory attributes index */ +#define ARM_PTE_ATTRINDX(x) ((x) << 2) /* memory attributes index */ +#define ARM_PTE_ATTRINDXMASK (0x7ULL << 2) /* mask memory attributes index */ -#define ARM_PTE_SH(x) ((x) << 8) /* access shared */ -#define ARM_PTE_SHMASK (0x3ULL << 8) /* mask access shared */ +#define ARM_PTE_SH(x) ((x) << 8) /* access shared */ +#define ARM_PTE_SHMASK (0x3ULL << 8) /* mask access shared */ -#define ARM_PTE_AF 0x0000000000000400ULL /* value for access */ -#define ARM_PTE_AFMASK 0x0000000000000400ULL /* access mask */ +#define ARM_PTE_AF 0x0000000000000400ULL /* value for access */ +#define ARM_PTE_AFMASK 0x0000000000000400ULL /* access mask */ -#define ARM_PTE_NG 0x0000000000000800ULL /* value for a global mapping */ -#define ARM_PTE_NG_MASK 0x0000000000000800ULL /* notGlobal mapping mask */ +#define ARM_PTE_NG 0x0000000000000800ULL /* value for a global mapping */ +#define ARM_PTE_NG_MASK 0x0000000000000800ULL /* notGlobal mapping mask */ -#define ARM_PTE_NS 0x0000000000000020ULL /* value for a secure mapping */ -#define ARM_PTE_NS_MASK 0x0000000000000020ULL /* notSecure mapping mask */ +#define ARM_PTE_NS 0x0000000000000020ULL /* value for a secure mapping */ +#define ARM_PTE_NS_MASK 0x0000000000000020ULL /* notSecure mapping mask */ -#define ARM_PTE_HINT 0x0010000000000000ULL /* value for contiguous entries hint */ -#define ARM_PTE_HINT_MASK 0x0010000000000000ULL /* mask for contiguous entries hint */ +#define ARM_PTE_HINT 0x0010000000000000ULL /* value for contiguous entries hint */ +#define ARM_PTE_HINT_MASK 0x0010000000000000ULL /* mask for contiguous entries hint */ #if __ARM_16K_PG__ -#define ARM_PTE_HINT_ENTRIES 128ULL /* number of entries the hint covers */ -#define ARM_PTE_HINT_ENTRIES_SHIFT 7ULL /* shift to construct the number of entries */ -#define ARM_PTE_HINT_ADDR_MASK 0x0000FFFFFFE00000ULL /* mask to extract the starting hint address */ -#define ARM_PTE_HINT_ADDR_SHIFT 21 /* shift for the hint address */ -#define ARM_KVA_HINT_ADDR_MASK 0xFFFFFFFFFFE00000ULL /* mask to extract the starting hint address */ +#define ARM_PTE_HINT_ENTRIES 128ULL /* number of entries the hint covers */ +#define ARM_PTE_HINT_ENTRIES_SHIFT 7ULL /* shift to construct the number of entries */ +#define ARM_PTE_HINT_ADDR_MASK 0x0000FFFFFFE00000ULL /* mask to extract the starting hint address */ +#define ARM_PTE_HINT_ADDR_SHIFT 21 /* shift for the hint address */ +#define ARM_KVA_HINT_ADDR_MASK 0xFFFFFFFFFFE00000ULL /* mask to extract the starting hint address */ #else -#define ARM_PTE_HINT_ENTRIES 16ULL /* number of entries the hint covers */ -#define ARM_PTE_HINT_ENTRIES_SHIFT 4ULL /* shift to construct the number of entries */ -#define ARM_PTE_HINT_ADDR_MASK 0x0000FFFFFFFF0000ULL /* mask to extract the starting hint address */ -#define ARM_PTE_HINT_ADDR_SHIFT 16 /* shift for the hint address */ -#define ARM_KVA_HINT_ADDR_MASK 0xFFFFFFFFFFFF0000ULL /* mask to extract the starting hint address */ +#define ARM_PTE_HINT_ENTRIES 16ULL /* number of entries the hint covers */ +#define ARM_PTE_HINT_ENTRIES_SHIFT 4ULL /* shift to construct the number of entries */ +#define ARM_PTE_HINT_ADDR_MASK 0x0000FFFFFFFF0000ULL /* mask to extract the starting hint address */ +#define ARM_PTE_HINT_ADDR_SHIFT 16 /* shift for the hint address */ +#define ARM_KVA_HINT_ADDR_MASK 0xFFFFFFFFFFFF0000ULL /* mask to extract the starting hint address */ #endif -#define ARM_PTE_PNX 0x0020000000000000ULL /* value for privilege no execute bit */ -#define ARM_PTE_PNXMASK 0x0020000000000000ULL /* privilege no execute mask */ +#define ARM_PTE_PNX 0x0020000000000000ULL /* value for privilege no execute bit */ +#define ARM_PTE_PNXMASK 0x0020000000000000ULL /* privilege no execute mask */ -#define ARM_PTE_NX 0x0040000000000000ULL /* value for no execute bit */ -#define ARM_PTE_NXMASK 0x0040000000000000ULL /* no execute mask */ +#define ARM_PTE_NX 0x0040000000000000ULL /* value for no execute bit */ +#define ARM_PTE_NXMASK 0x0040000000000000ULL /* no execute mask */ -#define ARM_PTE_WIRED 0x0080000000000000ULL /* value for software wired bit */ -#define ARM_PTE_WIRED_MASK 0x0080000000000000ULL /* software wired mask */ +#define ARM_PTE_WIRED 0x0400000000000000ULL /* value for software wired bit */ +#define ARM_PTE_WIRED_MASK 0x0400000000000000ULL /* software wired mask */ -#define ARM_PTE_WRITEABLE 0x0100000000000000ULL /* value for software writeable bit */ -#define ARM_PTE_WRITEABLE_MASK 0x0100000000000000ULL /* software writeable mask */ +#define ARM_PTE_WRITEABLE 0x0800000000000000ULL /* value for software writeable bit */ +#define ARM_PTE_WRITEABLE_MASK 0x0800000000000000ULL /* software writeable mask */ #if CONFIG_PGTRACE -#define ARM_PTE_PGTRACE 0x0200000000000000ULL /* value for software trace bit */ -#define ARM_PTE_PGTRACE_MASK 0x0200000000000000ULL /* software trace mask */ +#define ARM_PTE_PGTRACE 0x0200000000000000ULL /* value for software trace bit */ +#define ARM_PTE_PGTRACE_MASK 0x0200000000000000ULL /* software trace mask */ #endif -#define ARM_PTE_BOOT_PAGE_BASE (ARM_PTE_TYPE_VALID | ARM_PTE_SH(SH_OUTER_MEMORY) \ - | ARM_PTE_ATTRINDX(CACHE_ATTRINDX_WRITEBACK) | ARM_PTE_AF) +#define ARM_PTE_BOOT_PAGE_BASE \ + (ARM_PTE_TYPE_VALID | ARM_PTE_SH(SH_OUTER_MEMORY) | \ + ARM_PTE_ATTRINDX(CACHE_ATTRINDX_WRITEBACK) | ARM_PTE_AF) #if __ARM_KERNEL_PROTECT__ -#define ARM_PTE_BOOT_PAGE (ARM_PTE_BOOT_PAGE_BASE | ARM_PTE_NG) +#define ARM_PTE_BOOT_PAGE (ARM_PTE_BOOT_PAGE_BASE | ARM_PTE_NG) #else /* __ARM_KERNEL_PROTECT__ */ -#define ARM_PTE_BOOT_PAGE (ARM_PTE_BOOT_PAGE_BASE) +#define ARM_PTE_BOOT_PAGE (ARM_PTE_BOOT_PAGE_BASE) #endif /* __ARM_KERNEL_PROTECT__ */ /* * TLBI appers to only deal in 4KB page addresses, so give * it an explicit shift of 12. */ +#define TLBI_ADDR_SHIFT (0) #define TLBI_ADDR_SIZE (44) #define TLBI_ADDR_MASK ((1ULL << TLBI_ADDR_SIZE) - 1) -#define TLBI_ADDR_SHIFT (12) #define TLBI_ASID_SHIFT (48) #define TLBI_ASID_SIZE (16) -#define TLBI_ASID_MASK (((1ULL << TLBI_ASID_SIZE) - 1) << TLBI_ASID_SHIFT) +#define TLBI_ASID_MASK (((1ULL << TLBI_ASID_SIZE) - 1)) + +#define RTLBI_ADDR_SIZE (37) +#define RTLBI_ADDR_MASK ((1ULL << RTLBI_ADDR_SIZE) - 1) +#define RTLBI_ADDR_SHIFT ARM_TT_L3_SHIFT +#define RTLBI_TG ((uint64_t)(((ARM_TT_L3_SHIFT - 12) >> 1) + 1) << 46) +#define RTLBI_SCALE_SHIFT (44) +#define RTLBI_NUM_SHIFT (39) /* * Exception Syndrome Register @@ -1205,85 +1298,85 @@ * | EC |IL| ISS | * +------+--+------------------+ * - * EC - Exception Class - * IL - Instruction Length - * ISS- Instruction Specific Syndrome + * EC - Exception Class + * IL - Instruction Length + * ISS - Instruction Specific Syndrome * * Note: The ISS can have many forms. These are defined separately below. */ -#define ESR_EC_SHIFT 26 -#define ESR_EC_MASK (0x3F << ESR_EC_SHIFT) -#define ESR_EC(x) ((x & ESR_EC_MASK) >> ESR_EC_SHIFT) +#define ESR_EC_SHIFT 26 +#define ESR_EC_MASK (0x3FULL << ESR_EC_SHIFT) +#define ESR_EC(x) ((x & ESR_EC_MASK) >> ESR_EC_SHIFT) -#define ESR_IL_SHIFT 25 -#define ESR_IL (1 << ESR_IL_SHIFT) +#define ESR_IL_SHIFT 25 +#define ESR_IL (1 << ESR_IL_SHIFT) -#define ESR_INSTR_IS_2BYTES(x) (!(x & ESR_IL)) +#define ESR_INSTR_IS_2BYTES(x) (!(x & ESR_IL)) -#define ESR_ISS_MASK 0x01FFFFFF -#define ESR_ISS(x) (x & ESR_ISS_MASK) +#define ESR_ISS_MASK 0x01FFFFFF +#define ESR_ISS(x) (x & ESR_ISS_MASK) #ifdef __ASSEMBLER__ /* Define only the classes we need to test in the exception vectors. */ -#define ESR_EC_IABORT_EL1 0x21 -#define ESR_EC_DABORT_EL1 0x25 -#define ESR_EC_SP_ALIGN 0x26 +#define ESR_EC_IABORT_EL1 0x21 +#define ESR_EC_DABORT_EL1 0x25 +#define ESR_EC_SP_ALIGN 0x26 #else typedef enum { - ESR_EC_UNCATEGORIZED = 0x00, - ESR_EC_WFI_WFE = 0x01, - ESR_EC_MCR_MRC_CP15_TRAP = 0x03, - ESR_EC_MCRR_MRRC_CP15_TRAP = 0x04, - ESR_EC_MCR_MRC_CP14_TRAP = 0x05, - ESR_EC_LDC_STC_CP14_TRAP = 0x06, - ESR_EC_TRAP_SIMD_FP = 0x07, - ESR_EC_MCRR_MRRC_CP14_TRAP = 0x0c, - ESR_EC_ILLEGAL_INSTR_SET = 0x0e, - ESR_EC_SVC_32 = 0x11, - ESR_EC_SVC_64 = 0x15, - ESR_EC_MSR_TRAP = 0x18, - ESR_EC_IABORT_EL0 = 0x20, - ESR_EC_IABORT_EL1 = 0x21, - ESR_EC_PC_ALIGN = 0x22, - ESR_EC_DABORT_EL0 = 0x24, - ESR_EC_DABORT_EL1 = 0x25, - ESR_EC_SP_ALIGN = 0x26, - ESR_EC_FLOATING_POINT_32 = 0x28, - ESR_EC_FLOATING_POINT_64 = 0x2C, - ESR_EC_BKPT_REG_MATCH_EL0 = 0x30, // Breakpoint Debug event taken to the EL from a lower EL. - ESR_EC_BKPT_REG_MATCH_EL1 = 0x31, // Breakpoint Debug event taken to the EL from the EL. - ESR_EC_SW_STEP_DEBUG_EL0 = 0x32, // Software Step Debug event taken to the EL from a lower EL. - ESR_EC_SW_STEP_DEBUG_EL1 = 0x33, // Software Step Debug event taken to the EL from the EL. - ESR_EC_WATCHPT_MATCH_EL0 = 0x34, // Watchpoint Debug event taken to the EL from a lower EL. - ESR_EC_WATCHPT_MATCH_EL1 = 0x35, // Watchpoint Debug event taken to the EL from the EL. - ESR_EC_BKPT_AARCH32 = 0x38, - ESR_EC_BRK_AARCH64 = 0x3C + ESR_EC_UNCATEGORIZED = 0x00, + ESR_EC_WFI_WFE = 0x01, + ESR_EC_MCR_MRC_CP15_TRAP = 0x03, + ESR_EC_MCRR_MRRC_CP15_TRAP = 0x04, + ESR_EC_MCR_MRC_CP14_TRAP = 0x05, + ESR_EC_LDC_STC_CP14_TRAP = 0x06, + ESR_EC_TRAP_SIMD_FP = 0x07, + ESR_EC_MCRR_MRRC_CP14_TRAP = 0x0c, + ESR_EC_ILLEGAL_INSTR_SET = 0x0e, + ESR_EC_SVC_32 = 0x11, + ESR_EC_SVC_64 = 0x15, + ESR_EC_MSR_TRAP = 0x18, + ESR_EC_IABORT_EL0 = 0x20, + ESR_EC_IABORT_EL1 = 0x21, + ESR_EC_PC_ALIGN = 0x22, + ESR_EC_DABORT_EL0 = 0x24, + ESR_EC_DABORT_EL1 = 0x25, + ESR_EC_SP_ALIGN = 0x26, + ESR_EC_FLOATING_POINT_32 = 0x28, + ESR_EC_FLOATING_POINT_64 = 0x2C, + ESR_EC_BKPT_REG_MATCH_EL0 = 0x30, // Breakpoint Debug event taken to the EL from a lower EL. + ESR_EC_BKPT_REG_MATCH_EL1 = 0x31, // Breakpoint Debug event taken to the EL from the EL. + ESR_EC_SW_STEP_DEBUG_EL0 = 0x32, // Software Step Debug event taken to the EL from a lower EL. + ESR_EC_SW_STEP_DEBUG_EL1 = 0x33, // Software Step Debug event taken to the EL from the EL. + ESR_EC_WATCHPT_MATCH_EL0 = 0x34, // Watchpoint Debug event taken to the EL from a lower EL. + ESR_EC_WATCHPT_MATCH_EL1 = 0x35, // Watchpoint Debug event taken to the EL from the EL. + ESR_EC_BKPT_AARCH32 = 0x38, + ESR_EC_BRK_AARCH64 = 0x3C, } esr_exception_class_t; typedef enum { - FSC_TRANSLATION_FAULT_L0 = 0x04, - FSC_TRANSLATION_FAULT_L1 = 0x05, - FSC_TRANSLATION_FAULT_L2 = 0x06, - FSC_TRANSLATION_FAULT_L3 = 0x07, - FSC_ACCESS_FLAG_FAULT_L1 = 0x09, - FSC_ACCESS_FLAG_FAULT_L2 = 0x0A, - FSC_ACCESS_FLAG_FAULT_L3 = 0x0B, - FSC_PERMISSION_FAULT_L1 = 0x0D, - FSC_PERMISSION_FAULT_L2 = 0x0E, - FSC_PERMISSION_FAULT_L3 = 0x0F, - FSC_SYNC_EXT_ABORT = 0x10, - FSC_ASYNC_EXT_ABORT = 0x11, - FSC_SYNC_EXT_ABORT_TT_L1 = 0x15, - FSC_SYNC_EXT_ABORT_TT_L2 = 0x16, - FSC_SYNC_EXT_ABORT_TT_L3 = 0x17, - FSC_SYNC_PARITY = 0x18, - FSC_ASYNC_PARITY = 0x19, - FSC_SYNC_PARITY_TT_L1 = 0x1D, - FSC_SYNC_PARITY_TT_L2 = 0x1E, - FSC_SYNC_PARITY_TT_L3 = 0x1F, - FSC_ALIGNMENT_FAULT = 0x21, - FSC_DEBUG_FAULT = 0x22 + FSC_TRANSLATION_FAULT_L0 = 0x04, + FSC_TRANSLATION_FAULT_L1 = 0x05, + FSC_TRANSLATION_FAULT_L2 = 0x06, + FSC_TRANSLATION_FAULT_L3 = 0x07, + FSC_ACCESS_FLAG_FAULT_L1 = 0x09, + FSC_ACCESS_FLAG_FAULT_L2 = 0x0A, + FSC_ACCESS_FLAG_FAULT_L3 = 0x0B, + FSC_PERMISSION_FAULT_L1 = 0x0D, + FSC_PERMISSION_FAULT_L2 = 0x0E, + FSC_PERMISSION_FAULT_L3 = 0x0F, + FSC_SYNC_EXT_ABORT = 0x10, + FSC_ASYNC_EXT_ABORT = 0x11, + FSC_SYNC_EXT_ABORT_TT_L1 = 0x15, + FSC_SYNC_EXT_ABORT_TT_L2 = 0x16, + FSC_SYNC_EXT_ABORT_TT_L3 = 0x17, + FSC_SYNC_PARITY = 0x18, + FSC_ASYNC_PARITY = 0x19, + FSC_SYNC_PARITY_TT_L1 = 0x1D, + FSC_SYNC_PARITY_TT_L2 = 0x1E, + FSC_SYNC_PARITY_TT_L3 = 0x1F, + FSC_ALIGNMENT_FAULT = 0x21, + FSC_DEBUG_FAULT = 0x22 } fault_status_t; #endif /* ASSEMBLER */ @@ -1295,19 +1388,19 @@ typedef enum { * +---+-----------------+--+------+ * * where: - * ISV Instruction syndrome valid - * EX Exclusive access - * IFSC Instruction Fault Status Code + * ISV: Instruction syndrome valid + * EX: Exclusive access + * IFSC: Instruction Fault Status Code */ -#define ISS_SSDE_ISV_SHIFT 24 -#define ISS_SSDE_ISV (0x1 << ISS_SSDE_ISV_SHIFT) +#define ISS_SSDE_ISV_SHIFT 24 +#define ISS_SSDE_ISV (0x1 << ISS_SSDE_ISV_SHIFT) -#define ISS_SSDE_EX_SHIFT 6 -#define ISS_SSDE_EX (0x1 << ISS_SSDE_EX_SHIFT) +#define ISS_SSDE_EX_SHIFT 6 +#define ISS_SSDE_EX (0x1 << ISS_SSDE_EX_SHIFT) -#define ISS_SSDE_FSC_MASK 0x3F -#define ISS_SSDE_FSC(x) (x & ISS_SSDE_FSC_MASK) +#define ISS_SSDE_FSC_MASK 0x3F +#define ISS_SSDE_FSC(x) (x & ISS_SSDE_FSC_MASK) /* * Instruction Abort ISS (EL1) @@ -1317,15 +1410,15 @@ typedef enum { * +---------------+--+---+------+ * * where: - * EA External Abort type - * IFSC Instruction Fault Status Code + * EA: External Abort type + * IFSC: Instruction Fault Status Code */ -#define ISS_IA_EA_SHIFT 9 -#define ISS_IA_EA (0x1 << ISS_IA_EA_SHIFT) +#define ISS_IA_EA_SHIFT 9 +#define ISS_IA_EA (0x1 << ISS_IA_EA_SHIFT) -#define ISS_IA_FSC_MASK 0x3F -#define ISS_IA_FSC(x) (x & ISS_IA_FSC_MASK) +#define ISS_IA_FSC_MASK 0x3F +#define ISS_IA_FSC(x) (x & ISS_IA_FSC_MASK) /* @@ -1337,59 +1430,98 @@ typedef enum { * +---------------+--+--+-+---+----+ * * where: - * EA External Abort type - * CM Cache Maintenance operation - * WnR Write not Read - * DFSC Data Fault Status Code + * EA: External Abort type + * CM: Cache Maintenance operation + * WnR: Write not Read + * DFSC: Data Fault Status Code */ -#define ISS_DA_EA_SHIFT 9 -#define ISS_DA_EA (0x1 << ISS_DA_EA_SHIFT) +#define ISS_DA_EA_SHIFT 9 +#define ISS_DA_EA (0x1 << ISS_DA_EA_SHIFT) -#define ISS_DA_CM_SHIFT 8 -#define ISS_DA_CM (0x1 << ISS_DA_CM_SHIFT) +#define ISS_DA_CM_SHIFT 8 +#define ISS_DA_CM (0x1 << ISS_DA_CM_SHIFT) -#define ISS_DA_WNR_SHIFT 6 -#define ISS_DA_WNR (0x1 << ISS_DA_WNR_SHIFT) +#define ISS_DA_WNR_SHIFT 6 +#define ISS_DA_WNR (0x1 << ISS_DA_WNR_SHIFT) + +#define ISS_DA_FSC_MASK 0x3F +#define ISS_DA_FSC(x) (x & ISS_DA_FSC_MASK) + +/* + * Floating Point Exception ISS (EL1) + * + * 24 23 22 8 7 4 3 2 1 0 + * +-+---+---------------+---+--+---+---+---+---+---+ + * |0|TFV|000000000000000|IDF|00|IXF|UFF|OFF|DZF|IOF| + * +-+---+---------------+---+--+---+---+---+---+---+ + * + * where: + * TFV: Trapped Fault Valid + * IDF: Input Denormal Exception + * IXF: Input Inexact Exception + * UFF: Underflow Exception + * OFF: Overflow Exception + * DZF: Divide by Zero Exception + * IOF: Invalid Operation Exception + */ +#define ISS_FP_TFV_SHIFT 23 +#define ISS_FP_TFV (0x1 << ISS_FP_TFV_SHIFT) + +#define ISS_FP_IDF_SHIFT 7 +#define ISS_FP_IDF (0x1 << ISS_FP_IDF_SHIFT) + +#define ISS_FP_IXF_SHIFT 4 +#define ISS_FP_IXF (0x1 << ISS_FP_IXF_SHIFT) + +#define ISS_FP_UFF_SHIFT 3 +#define ISS_FP_UFF (0x1 << ISS_FP_UFF_SHIFT) + +#define ISS_FP_OFF_SHIFT 2 +#define ISS_FP_OFF (0x1 << ISS_FP_OFF_SHIFT) + +#define ISS_FP_DZF_SHIFT 1 +#define ISS_FP_DZF (0x1 << ISS_FP_DZF_SHIFT) + +#define ISS_FP_IOF_SHIFT 0 +#define ISS_FP_IOF (0x1 << ISS_FP_IOF_SHIFT) -#define ISS_DA_FSC_MASK 0x3F -#define ISS_DA_FSC(x) (x & ISS_DA_FSC_MASK) /* * Physical Address Register (EL1) */ -#define PAR_F_SHIFT 0 -#define PAR_F (0x1 << PAR_F_SHIFT) +#define PAR_F_SHIFT 0 +#define PAR_F (0x1 << PAR_F_SHIFT) -#define PLATFORM_SYSCALL_TRAP_NO 0x80000000 +#define PLATFORM_SYSCALL_TRAP_NO 0x80000000 -#define ARM64_SYSCALL_CODE_REG_NUM (16) +#define ARM64_SYSCALL_CODE_REG_NUM (16) -#define ARM64_CLINE_SHIFT 6 +#define ARM64_CLINE_SHIFT 6 #if defined(APPLE_ARM64_ARCH_FAMILY) -#define L2CERRSTS_DATSBEESV (1ULL << 2) /* L2C data single bit ECC error */ -#define L2CERRSTS_DATDBEESV (1ULL << 4) /* L2C data double bit ECC error */ +#define L2CERRSTS_DATSBEESV (1ULL << 2) /* L2C data single bit ECC error */ +#define L2CERRSTS_DATDBEESV (1ULL << 4) /* L2C data double bit ECC error */ #endif /* * Timer definitions. */ -#define CNTKCTL_EL1_PL0PTEN (0x1 << 9) /* 1: EL0 access to physical timer regs permitted */ -#define CNTKCTL_EL1_PL0VTEN (0x1 << 8) /* 1: EL0 access to virtual timer regs permitted */ -#define CNTKCTL_EL1_EVENTI_MASK (0x000000f0) /* Mask for bits describing which bit to use for triggering event stream */ -#define CNTKCTL_EL1_EVENTI_SHIFT (0x4) /* Shift for same */ -#define CNTKCTL_EL1_EVENTDIR (0x1 << 3) /* 1: one-to-zero transition of specified bit causes event */ -#define CNTKCTL_EL1_EVNTEN (0x1 << 2) /* 1: enable event stream */ -#define CNTKCTL_EL1_PL0VCTEN (0x1 << 1) /* 1: EL0 access to physical timebase + frequency reg enabled */ -#define CNTKCTL_EL1_PL0PCTEN (0x1 << 0) /* 1: EL0 access to virtual timebase + frequency reg enabled */ - -#define CNTV_CTL_EL0_ISTATUS (0x1 << 2) /* (read only): whether interrupt asserted */ -#define CNTV_CTL_EL0_IMASKED (0x1 << 1) /* 1: interrupt masked */ -#define CNTV_CTL_EL0_ENABLE (0x1 << 0) /* 1: virtual timer enabled */ - -#define CNTP_CTL_EL0_ISTATUS CNTV_CTL_EL0_ISTATUS -#define CNTP_CTL_EL0_IMASKED CNTV_CTL_EL0_IMASKED -#define CNTP_CTL_EL0_ENABLE CNTV_CTL_EL0_ENABLE +#define CNTKCTL_EL1_PL0PTEN (0x1 << 9) /* 1: EL0 access to physical timer regs permitted */ +#define CNTKCTL_EL1_PL0VTEN (0x1 << 8) /* 1: EL0 access to virtual timer regs permitted */ +#define CNTKCTL_EL1_EVENTI_MASK (0x000000f0) /* Mask for bits describing which bit to use for triggering event stream */ +#define CNTKCTL_EL1_EVENTI_SHIFT (0x4) /* Shift for same */ +#define CNTKCTL_EL1_EVENTDIR (0x1 << 3) /* 1: one-to-zero transition of specified bit causes event */ +#define CNTKCTL_EL1_EVNTEN (0x1 << 2) /* 1: enable event stream */ +#define CNTKCTL_EL1_PL0VCTEN (0x1 << 1) /* 1: EL0 access to physical timebase + frequency reg enabled */ +#define CNTKCTL_EL1_PL0PCTEN (0x1 << 0) /* 1: EL0 access to virtual timebase + frequency reg enabled */ + +#define CNTV_CTL_EL0_ISTATUS (0x1 << 2) /* (read only): whether interrupt asserted */ +#define CNTV_CTL_EL0_IMASKED (0x1 << 1) /* 1: interrupt masked */ +#define CNTV_CTL_EL0_ENABLE (0x1 << 0) /* 1: virtual timer enabled */ + +#define CNTP_CTL_EL0_ISTATUS CNTV_CTL_EL0_ISTATUS +#define CNTP_CTL_EL0_IMASKED CNTV_CTL_EL0_IMASKED +#define CNTP_CTL_EL0_ENABLE CNTV_CTL_EL0_ENABLE /* * At present all other uses of ARM_DBG_* are shared bit compatibly with the 32bit definitons. @@ -1397,28 +1529,36 @@ typedef enum { */ #define ARM_DBG_VR_ADDRESS_MASK64 0xFFFFFFFFFFFFFFFCull /* BVR & WVR */ -#define MIDR_EL1_REV_SHIFT 0 -#define MIDR_EL1_REV_MASK (0xf << MIDR_EL1_REV_SHIFT) -#define MIDR_EL1_PNUM_SHIFT 4 -#define MIDR_EL1_PNUM_MASK (0xfff << MIDR_EL1_PNUM_SHIFT) -#define MIDR_EL1_ARCH_SHIFT 16 -#define MIDR_EL1_ARCH_MASK (0xf << MIDR_EL1_ARCH_SHIFT) -#define MIDR_EL1_VAR_SHIFT 20 -#define MIDR_EL1_VAR_MASK (0xf << MIDR_EL1_VAR_SHIFT) -#define MIDR_EL1_IMP_SHIFT 24 -#define MIDR_EL1_IMP_MASK (0xff << MIDR_EL1_IMP_SHIFT) +#define MIDR_EL1_REV_SHIFT 0 +#define MIDR_EL1_REV_MASK (0xf << MIDR_EL1_REV_SHIFT) +#define MIDR_EL1_PNUM_SHIFT 4 +#define MIDR_EL1_PNUM_MASK (0xfff << MIDR_EL1_PNUM_SHIFT) +#define MIDR_EL1_ARCH_SHIFT 16 +#define MIDR_EL1_ARCH_MASK (0xf << MIDR_EL1_ARCH_SHIFT) +#define MIDR_EL1_VAR_SHIFT 20 +#define MIDR_EL1_VAR_MASK (0xf << MIDR_EL1_VAR_SHIFT) +#define MIDR_EL1_IMP_SHIFT 24 +#define MIDR_EL1_IMP_MASK (0xff << MIDR_EL1_IMP_SHIFT) /* * CoreSight debug registers */ -#define CORESIGHT_ED 0 -#define CORESIGHT_CTI 1 -#define CORESIGHT_PMU 2 -#define CORESIGHT_UTT 3 /* Not truly a coresight thing, but at a fixed convenient location right after the coresight region */ +#define CORESIGHT_ED 0 +#define CORESIGHT_CTI 1 +#define CORESIGHT_PMU 2 +#define CORESIGHT_UTT 3 /* Not truly a coresight thing, but at a fixed convenient location right after the coresight region */ + +#define CORESIGHT_OFFSET(x) ((x) * 0x10000) +#define CORESIGHT_REGIONS 4 +#define CORESIGHT_SIZE 0x1000 + + + + + + + -#define CORESIGHT_OFFSET(x) ((x) * 0x10000) -#define CORESIGHT_REGIONS 4 -#define CORESIGHT_SIZE 0x1000 /* @@ -1430,30 +1570,75 @@ typedef enum { * +----------+--------+------+------+------+-----+------+ */ -#define ID_AA64ISAR0_EL1_ATOMIC_OFFSET 20 -#define ID_AA64ISAR0_EL1_ATOMIC_MASK (0xfull << ID_AA64ISAR0_EL1_ATOMIC_OFFSET) -#define ID_AA64ISAR0_EL1_ATOMIC_8_1 (2ull << ID_AA64ISAR0_EL1_ATOMIC_OFFSET) +#define ID_AA64ISAR0_EL1_FHM_OFFSET 48 +#define ID_AA64ISAR0_EL1_FHM_MASK (0xfull << ID_AA64ISAR0_EL1_FHM_OFFSET) +#define ID_AA64ISAR0_EL1_FHM_8_2 (1ull << ID_AA64ISAR0_EL1_FHM_OFFSET) + +#define ID_AA64ISAR0_EL1_ATOMIC_OFFSET 20 +#define ID_AA64ISAR0_EL1_ATOMIC_MASK (0xfull << ID_AA64ISAR0_EL1_ATOMIC_OFFSET) +#define ID_AA64ISAR0_EL1_ATOMIC_8_1 (2ull << ID_AA64ISAR0_EL1_ATOMIC_OFFSET) + +#define ID_AA64ISAR0_EL1_CRC32_OFFSET 16 +#define ID_AA64ISAR0_EL1_CRC32_MASK (0xfull << ID_AA64ISAR0_EL1_CRC32_OFFSET) +#define ID_AA64ISAR0_EL1_CRC32_EN (1ull << ID_AA64ISAR0_EL1_CRC32_OFFSET) + +#define ID_AA64ISAR0_EL1_SHA2_OFFSET 12 +#define ID_AA64ISAR0_EL1_SHA2_MASK (0xfull << ID_AA64ISAR0_EL1_SHA2_OFFSET) +#define ID_AA64ISAR0_EL1_SHA2_EN (1ull << ID_AA64ISAR0_EL1_SHA2_OFFSET) -#define ID_AA64ISAR0_EL1_CRC32_OFFSET 16 -#define ID_AA64ISAR0_EL1_CRC32_MASK (0xfull << ID_AA64ISAR0_EL1_CRC32_OFFSET) -#define ID_AA64ISAR0_EL1_CRC32_EN (1ull << ID_AA64ISAR0_EL1_CRC32_OFFSET) +#define ID_AA64ISAR0_EL1_SHA1_OFFSET 8 +#define ID_AA64ISAR0_EL1_SHA1_MASK (0xfull << ID_AA64ISAR0_EL1_SHA1_OFFSET) +#define ID_AA64ISAR0_EL1_SHA1_EN (1ull << ID_AA64ISAR0_EL1_SHA1_OFFSET) -#define ID_AA64ISAR0_EL1_SHA2_OFFSET 12 -#define ID_AA64ISAR0_EL1_SHA2_MASK (0xfull << ID_AA64ISAR0_EL1_SHA2_OFFSET) -#define ID_AA64ISAR0_EL1_SHA2_EN (1ull << ID_AA64ISAR0_EL1_SHA2_OFFSET) +#define ID_AA64ISAR0_EL1_AES_OFFSET 4 +#define ID_AA64ISAR0_EL1_AES_MASK (0xfull << ID_AA64ISAR0_EL1_AES_OFFSET) +#define ID_AA64ISAR0_EL1_AES_EN (1ull << ID_AA64ISAR0_EL1_AES_OFFSET) +#define ID_AA64ISAR0_EL1_AES_PMULL_EN (2ull << ID_AA64ISAR0_EL1_AES_OFFSET) -#define ID_AA64ISAR0_EL1_SHA1_OFFSET 8 -#define ID_AA64ISAR0_EL1_SHA1_MASK (0xfull << ID_AA64ISAR0_EL1_SHA1_OFFSET) -#define ID_AA64ISAR0_EL1_SHA1_EN (1ull << ID_AA64ISAR0_EL1_SHA1_OFFSET) -#define ID_AA64ISAR0_EL1_AES_OFFSET 4 -#define ID_AA64ISAR0_EL1_AES_MASK (0xfull << ID_AA64ISAR0_EL1_AES_OFFSET) -#define ID_AA64ISAR0_EL1_AES_EN (1ull << ID_AA64ISAR0_EL1_AES_OFFSET) -#define ID_AA64ISAR0_EL1_AES_PMULL_EN (2ull << ID_AA64ISAR0_EL1_AES_OFFSET) +#if __APCFG_SUPPORTED__ +/* + * APCFG_EL1 + * + * 63 2 1 0 + * +----------+-+-+ + * | reserved |K|R| + * +----------+-+-+ + * + * where: + * R: Reserved + * K: ElXEnKey - Enable ARMV8.3 defined {IA,IB,DA,DB} keys when CPU is + * operating in EL1 (or higher) and when under Apple-Mode + */ +#define APCFG_EL1_ELXENKEY_OFFSET 1 +#define APCFG_EL1_ELXENKEY_MASK (0x1ULL << APCFG_EL1_ELXENKEY_OFFSET) +#define APCFG_EL1_ELXENKEY APCFG_EL1_ELXENKEY_MASK +#endif /* __APCFG_SUPPORTED__ */ + +#define APSTATE_G_SHIFT (0) +#define APSTATE_P_SHIFT (1) +#define APSTATE_A_SHIFT (2) + +#ifdef __APSTS_SUPPORTED__ +#define APCTL_EL1_AppleMode (1ULL << 0) +#define APCTL_EL1_KernKeyEn (1ULL << 1) +#define APCTL_EL1_EnAPKey0 (1ULL << 2) +#define APCTL_EL1_EnAPKey1 (1ULL << 3) +#define APSTS_EL1_MKEYVld (1ULL << 0) +#else +#define APCTL_EL1_AppleMode (1ULL << 0) +#define APCTL_EL1_MKEYVld (1ULL << 1) +#define APCTL_EL1_KernKeyEn (1ULL << 2) +#endif +#if defined(HAS_APPLE_PAC) +// The value of ptrauth_string_discriminator("recover"), hardcoded so it can be used from assembly code +#define PAC_DISCRIMINATOR_RECOVER 0x1e02 +#endif + #ifdef __ASSEMBLER__ /* @@ -1463,14 +1648,14 @@ typedef enum { * Where the "variant" is the major number and the "revision" is the minor number. * * For example: - * Cyclone A0 is variant 0, revision 0, i.e. 0. - * Cyclone B0 is variant 1, revision 0, i.e. 0x10 + * Cyclone A0 is variant 0, revision 0, i.e. 0. + * Cyclone B0 is variant 1, revision 0, i.e. 0x10 * $0 - register to place value in */ .macro GET_MIDR_CPU_VERSION -mrs $0, MIDR_EL1 // Read MIDR_EL1 for CPUID -bfi $0, $0, #(MIDR_EL1_VAR_SHIFT - 4), #4 // move bits 3:0 (revision) to 19:16 (below variant) to get values adjacent -ubfx $0, $0, #(MIDR_EL1_VAR_SHIFT - 4), #8 // And extract the concatenated bitstring to beginning of register +mrs $0, MIDR_EL1 // Read MIDR_EL1 for CPUID +bfi $0, $0, #(MIDR_EL1_VAR_SHIFT - 4), #4 // move bits 3:0 (revision) to 19:16 (below variant) to get values adjacent +ubfx $0, $0, #(MIDR_EL1_VAR_SHIFT - 4), #8 // And extract the concatenated bitstring to beginning of register .endmacro /* @@ -1483,8 +1668,8 @@ ubfx $0, $0, #(MIDR_EL1_VAR_SHIFT - 4), #8 // And extract the conca */ .macro SKIP_IF_CPU_VERSION_GREATER_OR_EQUAL GET_MIDR_CPU_VERSION $0 -cmp $0, $1 -b.pl $2 // Unsigned "greater or equal" +cmp $0, $1 +b.pl $2 // Unsigned "greater or equal" .endmacro /* @@ -1497,8 +1682,8 @@ b.pl $2 // Unsigned "greater or equal" */ .macro SKIP_IF_CPU_VERSION_LESS_THAN GET_MIDR_CPU_VERSION $0 -cmp $0, $1 -b.mi $2 // Unsigned "strictly less than" +cmp $0, $1 +b.mi $2 // Unsigned "strictly less than" .endmacro #endif /* __ASSEMBLER__ */ diff --git a/osfmk/arm64/sleh.c b/osfmk/arm64/sleh.c index 2ad70b7c8..705e31444 100644 --- a/osfmk/arm64/sleh.c +++ b/osfmk/arm64/sleh.c @@ -44,7 +44,7 @@ #include #include -#include +#include #include @@ -79,30 +79,28 @@ assert(TEST_CONTEXT32_SANITY(context) || TEST_CONTEXT64_SANITY(context)) -#define COPYIN(src, dst, size) \ - (PSR64_IS_KERNEL(get_saved_state_cpsr(state))) ? \ - copyin_kern(src, dst, size) \ - : \ - copyin(src, dst, size) +#define COPYIN(src, dst, size) \ + (PSR64_IS_KERNEL(get_saved_state_cpsr(state))) ? \ + copyin_kern(src, dst, size) : \ + copyin(src, dst, size) -#define COPYOUT(src, dst, size) \ - (PSR64_IS_KERNEL(get_saved_state_cpsr(state))) ? \ - copyout_kern(src, dst, size) \ - : \ - copyout(src, dst, size) +#define COPYOUT(src, dst, size) \ + (PSR64_IS_KERNEL(get_saved_state_cpsr(state))) ? \ + copyout_kern(src, dst, size) : \ + copyout(src, dst, size) // Below is for concatenating a string param to a string literal #define STR1(x) #x #define STR(x) STR1(x) -void panic_with_thread_kernel_state(const char *msg, arm_saved_state_t *ss); +void panic_with_thread_kernel_state(const char *msg, arm_saved_state_t *ss) __abortlike; -void sleh_synchronous_sp1(arm_context_t *, uint32_t, vm_offset_t); +void sleh_synchronous_sp1(arm_context_t *, uint32_t, vm_offset_t) __abortlike; void sleh_synchronous(arm_context_t *, uint32_t, vm_offset_t); void sleh_irq(arm_saved_state_t *); void sleh_fiq(arm_saved_state_t *); void sleh_serror(arm_context_t *context, uint32_t esr, vm_offset_t far); -void sleh_invalid_stack(arm_context_t *context, uint32_t esr, vm_offset_t far); +void sleh_invalid_stack(arm_context_t *context, uint32_t esr, vm_offset_t far) __dead2; static void sleh_interrupt_handler_prologue(arm_saved_state_t *, unsigned int type); static void sleh_interrupt_handler_epilogue(void); @@ -113,10 +111,10 @@ static void handle_mach_continuous_time_trap(arm_saved_state_t *); static void handle_msr_trap(arm_saved_state_t *state, uint32_t iss); -extern kern_return_t arm_fast_fault(pmap_t, vm_map_address_t, vm_prot_t, boolean_t); +extern kern_return_t arm_fast_fault(pmap_t, vm_map_address_t, vm_prot_t, bool, bool); -static void handle_uncategorized(arm_saved_state_t *, boolean_t); -static void handle_breakpoint(arm_saved_state_t *); +static void handle_uncategorized(arm_saved_state_t *); +static void handle_breakpoint(arm_saved_state_t *) __dead2; typedef void (*abort_inspector_t)(uint32_t, fault_status_t *, vm_prot_t *); static void inspect_instruction_abort(uint32_t, fault_status_t *, vm_prot_t *); @@ -130,18 +128,19 @@ typedef void (*abort_handler_t)(arm_saved_state_t *, uint32_t, vm_offset_t, faul static void handle_user_abort(arm_saved_state_t *, uint32_t, vm_offset_t, fault_status_t, vm_prot_t, vm_offset_t); static void handle_kernel_abort(arm_saved_state_t *, uint32_t, vm_offset_t, fault_status_t, vm_prot_t, vm_offset_t); -static void handle_pc_align(arm_saved_state_t *ss); -static void handle_sp_align(arm_saved_state_t *ss); -static void handle_sw_step_debug(arm_saved_state_t *ss); -static void handle_wf_trap(arm_saved_state_t *ss); +static void handle_pc_align(arm_saved_state_t *ss) __dead2; +static void handle_sp_align(arm_saved_state_t *ss) __dead2; +static void handle_sw_step_debug(arm_saved_state_t *ss) __dead2; +static void handle_wf_trap(arm_saved_state_t *ss) __dead2; +static void handle_fp_trap(arm_saved_state_t *ss, uint32_t esr) __dead2; -static void handle_watchpoint(vm_offset_t fault_addr); +static void handle_watchpoint(vm_offset_t fault_addr) __dead2; static void handle_abort(arm_saved_state_t *, uint32_t, vm_offset_t, vm_offset_t, abort_inspector_t, abort_handler_t); -static void handle_user_trapped_instruction32(arm_saved_state_t *, uint32_t esr); +static void handle_user_trapped_instruction32(arm_saved_state_t *, uint32_t esr) __dead2; -static void handle_simd_trap(arm_saved_state_t *, uint32_t esr); +static void handle_simd_trap(arm_saved_state_t *, uint32_t esr) __dead2; extern void mach_kauth_cred_uthread_update(void); void mach_syscall_trace_exit(unsigned int retval, unsigned int call_number); @@ -160,8 +159,11 @@ mach_syscall(struct arm_saved_state*); extern kern_return_t dtrace_user_probe(arm_saved_state_t* regs); extern boolean_t dtrace_tally_fault(user_addr_t); -/* Traps for userland processing. Can't include bsd/sys/fasttrap_isa.h, so copy and paste the trap instructions - * over from that file. Need to keep these in sync! */ +/* + * Traps for userland processing. Can't include bsd/sys/fasttrap_isa.h, so copy + * and paste the trap instructions + * over from that file. Need to keep these in sync! + */ #define FASTTRAP_ARM32_INSTR 0xe7ffdefc #define FASTTRAP_THUMB32_INSTR 0xdefc #define FASTTRAP_ARM64_INSTR 0xe7eeee7e @@ -174,6 +176,7 @@ extern boolean_t dtrace_tally_fault(user_addr_t); perfCallback tempDTraceTrapHook = NULL; /* Pointer to DTrace fbt trap hook routine */ #endif + #if CONFIG_PGTRACE extern boolean_t pgtrace_enabled; #endif @@ -187,16 +190,21 @@ extern volatile char pan_fault_value; #endif #endif -#if defined(APPLECYCLONE) -#define CPU_NAME "Cyclone" -#elif defined(APPLETYPHOON) -#define CPU_NAME "Typhoon" +#if HAS_TWO_STAGE_SPR_LOCK +#ifdef CONFIG_XNUPOST +extern volatile vm_offset_t spr_lock_test_addr; +extern volatile uint32_t spr_lock_exception_esr; +#endif +#endif + +#if defined(APPLETYPHOON) +#define CPU_NAME "Typhoon" #elif defined(APPLETWISTER) -#define CPU_NAME "Twister" +#define CPU_NAME "Twister" #elif defined(APPLEHURRICANE) -#define CPU_NAME "Hurricane" +#define CPU_NAME "Hurricane" #else -#define CPU_NAME "Unknown" +#define CPU_NAME "Unknown" #endif #if (CONFIG_KERNEL_INTEGRITY && defined(KERNEL_INTEGRITY_WT)) @@ -224,6 +232,7 @@ __ror(unsigned value, unsigned shift) (unsigned)(value) << ((unsigned)(sizeof(unsigned) * CHAR_BIT) - (unsigned)(shift)); } +__dead2 static void arm64_implementation_specific_error(arm_saved_state_t *state, uint32_t esr, vm_offset_t far) { @@ -268,6 +277,9 @@ arm64_implementation_specific_error(arm_saved_state_t *state, uint32_t esr, vm_o (void *)l2c_err_sts, (void *)l2c_err_adr, (void *)l2c_err_inf); #else // !defined(NO_ECORE) && !defined(HAS_MIGSTS) uint64_t llc_err_sts, llc_err_adr, llc_err_inf, mpidr; +#if defined(HAS_DPC_ERR) + uint64_t dpc_err_sts = __builtin_arm_rsr64(STR(ARM64_REG_DPC_ERR_STS)); +#endif // defined(HAS_DPC_ERR) mpidr = __builtin_arm_rsr64("MPIDR_EL1"); @@ -286,10 +298,17 @@ arm64_implementation_specific_error(arm_saved_state_t *state, uint32_t esr, vm_o llc_err_inf = __builtin_arm_rsr64(STR(ARM64_REG_L2C_ERR_INF)); panic_plain("Unhandled " CPU_NAME - " implementation specific error. state=%p esr=%#x far=%p p-core?%d\n" + " implementation specific error. state=%p esr=%#x far=%p p-core?%d" +#if defined(HAS_DPC_ERR) + " dpc_err_sts:%p" +#endif + "\n" "\tlsu_err_sts:%p, fed_err_sts:%p, mmu_err_sts:%p\n" "\tllc_err_sts:%p, llc_err_adr:%p, llc_err_inf:%p\n", state, esr, (void *)far, !!(mpidr & MPIDR_PNE), +#if defined(HAS_DPC_ERR) + (void *)dpc_err_sts, +#endif (void *)lsu_err_sts, (void *)fed_err_sts, (void *)mmu_err_sts, (void *)llc_err_sts, (void *)llc_err_adr, (void *)llc_err_inf); #endif @@ -345,7 +364,7 @@ kernel_integrity_error_handler(uint32_t esr, vm_offset_t far) static void arm64_platform_error(arm_saved_state_t *state, uint32_t esr, vm_offset_t far) { - cpu_data_t *cdp = getCpuDatap(); + cpu_data_t *cdp = getCpuDatap(); #if CONFIG_KERNEL_INTEGRITY kernel_integrity_error_handler(esr, far); @@ -366,7 +385,7 @@ panic_with_thread_kernel_state(const char *msg, arm_saved_state_t *ss) ss_valid = is_saved_state64(ss); arm_saved_state64_t *state = saved_state64(ss); - panic_plain("%s (saved state: %p%s)\n" + panic_plain("%s at pc 0x%016llx, lr 0x%016llx (saved state: %p%s)\n" "\t x0: 0x%016llx x1: 0x%016llx x2: 0x%016llx x3: 0x%016llx\n" "\t x4: 0x%016llx x5: 0x%016llx x6: 0x%016llx x7: 0x%016llx\n" "\t x8: 0x%016llx x9: 0x%016llx x10: 0x%016llx x11: 0x%016llx\n" @@ -376,7 +395,7 @@ panic_with_thread_kernel_state(const char *msg, arm_saved_state_t *ss) "\t x24: 0x%016llx x25: 0x%016llx x26: 0x%016llx x27: 0x%016llx\n" "\t x28: 0x%016llx fp: 0x%016llx lr: 0x%016llx sp: 0x%016llx\n" "\t pc: 0x%016llx cpsr: 0x%08x esr: 0x%08x far: 0x%016llx\n", - msg, ss, (ss_valid ? "" : " INVALID"), + msg, state->pc, state->lr, ss, (ss_valid ? "" : " INVALID"), state->x[0], state->x[1], state->x[2], state->x[3], state->x[4], state->x[5], state->x[6], state->x[7], state->x[8], state->x[9], state->x[10], state->x[11], @@ -388,12 +407,11 @@ panic_with_thread_kernel_state(const char *msg, arm_saved_state_t *ss) state->pc, state->cpsr, state->esr, state->far); } - void sleh_synchronous_sp1(arm_context_t *context, uint32_t esr, vm_offset_t far __unused) { - esr_exception_class_t class = ESR_EC(esr); - arm_saved_state_t *state = &context->ss; + esr_exception_class_t class = ESR_EC(esr); + arm_saved_state_t * state = &context->ss; switch (class) { case ESR_EC_UNCATEGORIZED: @@ -409,19 +427,51 @@ sleh_synchronous_sp1(arm_context_t *context, uint32_t esr, vm_offset_t far __unu } } +#if defined(HAS_TWO_STAGE_SPR_LOCK) && defined(CONFIG_XNUPOST) +static bool +handle_msr_write_from_xnupost(arm_saved_state_t *state, uint32_t esr) +{ + user_addr_t pc = get_saved_state_pc(state); + if ((spr_lock_test_addr != 0) && (pc == spr_lock_test_addr)) { + spr_lock_exception_esr = esr; + set_saved_state_pc(state, pc + 4); + return true; + } + + return false; +} +#endif + void sleh_synchronous(arm_context_t *context, uint32_t esr, vm_offset_t far) { - esr_exception_class_t class = ESR_EC(esr); - arm_saved_state_t *state = &context->ss; - vm_offset_t recover = 0, recover_saved = 0; - thread_t thread = current_thread(); + esr_exception_class_t class = ESR_EC(esr); + arm_saved_state_t * state = &context->ss; + vm_offset_t recover = 0; + thread_t thread = current_thread(); +#if MACH_ASSERT + int preemption_level = get_preemption_level(); +#endif ASSERT_CONTEXT_SANITY(context); + if (__improbable(ESR_INSTR_IS_2BYTES(esr))) { + /* + * We no longer support 32-bit, which means no 2-byte + * instructions. + */ + if (PSR64_IS_USER(get_saved_state_cpsr(state))) { + panic("Exception on 2-byte instruction, " + "context=%p, esr=%#x, far=%p", + context, esr, (void *)far); + } else { + panic_with_thread_kernel_state("Exception on 2-byte instruction", state); + } + } + /* Don't run exception handler with recover handler set in case of double fault */ if (thread->recover) { - recover = recover_saved = thread->recover; + recover = thread->recover; thread->recover = (vm_offset_t)NULL; } @@ -441,7 +491,7 @@ sleh_synchronous(arm_context_t *context, uint32_t esr, vm_offset_t far) case ESR_EC_DABORT_EL0: handle_abort(state, esr, far, recover, inspect_data_abort, handle_user_abort); - assert(0); /* Unreachable */ + thread_exception_return(); case ESR_EC_MSR_TRAP: handle_msr_trap(state, ESR_ISS(esr)); @@ -449,7 +499,7 @@ sleh_synchronous(arm_context_t *context, uint32_t esr, vm_offset_t far) case ESR_EC_IABORT_EL0: handle_abort(state, esr, far, recover, inspect_instruction_abort, handle_user_abort); - assert(0); /* Unreachable */ + thread_exception_return(); case ESR_EC_IABORT_EL1: @@ -457,8 +507,7 @@ sleh_synchronous(arm_context_t *context, uint32_t esr, vm_offset_t far) case ESR_EC_PC_ALIGN: handle_pc_align(state); - assert(0); /* Unreachable */ - break; + __builtin_unreachable(); case ESR_EC_DABORT_EL1: handle_abort(state, esr, far, recover, inspect_data_abort, handle_kernel_abort); @@ -467,103 +516,61 @@ sleh_synchronous(arm_context_t *context, uint32_t esr, vm_offset_t far) case ESR_EC_UNCATEGORIZED: assert(!ESR_ISS(esr)); - handle_uncategorized(&context->ss, ESR_INSTR_IS_2BYTES(esr)); - /* TODO: Uncomment this after stackshot uses a brk instruction - * rather than an undefined instruction, as stackshot is the - * only case where we want to return to the first-level handler. - */ - //assert(0); /* Unreachable */ +#if defined(HAS_TWO_STAGE_SPR_LOCK) && defined(CONFIG_XNUPOST) + if (handle_msr_write_from_xnupost(state, esr)) { + break; + } +#endif + handle_uncategorized(&context->ss); break; case ESR_EC_SP_ALIGN: handle_sp_align(state); - assert(0); /* Unreachable */ - break; + __builtin_unreachable(); case ESR_EC_BKPT_AARCH32: handle_breakpoint(state); - assert(0); /* Unreachable */ - break; + __builtin_unreachable(); case ESR_EC_BRK_AARCH64: if (PSR64_IS_KERNEL(get_saved_state_cpsr(state))) { - kprintf("Breakpoint instruction exception from kernel. Hanging here (by design).\n"); - for (;;) { - ; - } - - __unreachable_ok_push - DebuggerCall(EXC_BREAKPOINT, &context->ss); - break; - __unreachable_ok_pop + panic_with_thread_kernel_state("Break instruction exception from kernel. Panic (by design)", state); } else { handle_breakpoint(state); - assert(0); /* Unreachable */ } + __builtin_unreachable(); case ESR_EC_BKPT_REG_MATCH_EL0: if (FSC_DEBUG_FAULT == ISS_SSDE_FSC(esr)) { handle_breakpoint(state); - assert(0); /* Unreachable */ } panic("Unsupported Class %u event code. state=%p class=%u esr=%u far=%p", class, state, class, esr, (void *)far); - assert(0); /* Unreachable */ - break; + __builtin_unreachable(); case ESR_EC_BKPT_REG_MATCH_EL1: - if (!PE_i_can_has_debugger(NULL) && FSC_DEBUG_FAULT == ISS_SSDE_FSC(esr)) { - kprintf("Hardware Breakpoint Debug exception from kernel. Hanging here (by design).\n"); - for (;;) { - ; - } - - __unreachable_ok_push - DebuggerCall(EXC_BREAKPOINT, &context->ss); - break; - __unreachable_ok_pop - } - panic("Unsupported Class %u event code. state=%p class=%u esr=%u far=%p", - class, state, class, esr, (void *)far); - assert(0); /* Unreachable */ - break; + panic_with_thread_kernel_state("Hardware Breakpoint Debug exception from kernel. Panic (by design)", state); + __builtin_unreachable(); case ESR_EC_SW_STEP_DEBUG_EL0: if (FSC_DEBUG_FAULT == ISS_SSDE_FSC(esr)) { handle_sw_step_debug(state); - assert(0); /* Unreachable */ } panic("Unsupported Class %u event code. state=%p class=%u esr=%u far=%p", class, state, class, esr, (void *)far); - assert(0); /* Unreachable */ - break; + __builtin_unreachable(); case ESR_EC_SW_STEP_DEBUG_EL1: - if (!PE_i_can_has_debugger(NULL) && FSC_DEBUG_FAULT == ISS_SSDE_FSC(esr)) { - kprintf("Software Step Debug exception from kernel. Hanging here (by design).\n"); - for (;;) { - ; - } - - __unreachable_ok_push - DebuggerCall(EXC_BREAKPOINT, &context->ss); - break; - __unreachable_ok_pop - } - panic("Unsupported Class %u event code. state=%p class=%u esr=%u far=%p", - class, state, class, esr, (void *)far); - assert(0); /* Unreachable */ - break; + panic_with_thread_kernel_state("Software Step Debug exception from kernel. Panic (by design)", state); + __builtin_unreachable(); case ESR_EC_WATCHPT_MATCH_EL0: if (FSC_DEBUG_FAULT == ISS_SSDE_FSC(esr)) { handle_watchpoint(far); - assert(0); /* Unreachable */ } panic("Unsupported Class %u event code. state=%p class=%u esr=%u far=%p", class, state, class, esr, (void *)far); - assert(0); /* Unreachable */ - break; + __builtin_unreachable(); case ESR_EC_WATCHPT_MATCH_EL1: /* @@ -576,13 +583,11 @@ sleh_synchronous(arm_context_t *context, uint32_t esr, vm_offset_t far) } panic("Unsupported Class %u event code. state=%p class=%u esr=%u far=%p", class, state, class, esr, (void *)far); - assert(0); /* Unreachable */ - break; + __builtin_unreachable(); case ESR_EC_TRAP_SIMD_FP: handle_simd_trap(state, esr); - assert(0); - break; + __builtin_unreachable(); case ESR_EC_ILLEGAL_INSTR_SET: if (EXCB_ACTION_RERUN != @@ -590,10 +595,9 @@ sleh_synchronous(arm_context_t *context, uint32_t esr, vm_offset_t far) // instruction is not re-executed panic("Illegal instruction set exception. state=%p class=%u esr=%u far=%p spsr=0x%x", state, class, esr, (void *)far, get_saved_state_cpsr(state)); - assert(0); } // must clear this fault in PSR to re-run - set_saved_state_cpsr(state, get_saved_state_cpsr(state) & (~PSR64_IL)); + mask_saved_state_cpsr(state, 0, PSR64_IL); break; case ESR_EC_MCR_MRC_CP15_TRAP: @@ -602,25 +606,32 @@ sleh_synchronous(arm_context_t *context, uint32_t esr, vm_offset_t far) case ESR_EC_LDC_STC_CP14_TRAP: case ESR_EC_MCRR_MRRC_CP14_TRAP: handle_user_trapped_instruction32(state, esr); - assert(0); - break; + __builtin_unreachable(); case ESR_EC_WFI_WFE: // Use of WFI or WFE instruction when they have been disabled for EL0 handle_wf_trap(state); - assert(0); /* Unreachable */ - break; + __builtin_unreachable(); + + case ESR_EC_FLOATING_POINT_64: + handle_fp_trap(state, esr); + __builtin_unreachable(); + default: panic("Unsupported synchronous exception. state=%p class=%u esr=%u far=%p", state, class, esr, (void *)far); - assert(0); /* Unreachable */ - break; + __builtin_unreachable(); } - if (recover_saved) { - thread->recover = recover_saved; + if (recover) { + thread->recover = recover; } +#if MACH_ASSERT + if (preemption_level != get_preemption_level()) { + panic("synchronous exception changed preemption level from %d to %d", preemption_level, get_preemption_level()); + } +#endif } /* @@ -628,21 +639,14 @@ sleh_synchronous(arm_context_t *context, uint32_t esr, vm_offset_t far) * ARM64_TODO: For now, we assume this is for undefined instruction exceptions. */ static void -handle_uncategorized(arm_saved_state_t *state, boolean_t instrLen2) +handle_uncategorized(arm_saved_state_t *state) { exception_type_t exception = EXC_BAD_INSTRUCTION; - mach_exception_data_type_t codes[2] = {EXC_ARM_UNDEFINED}; - mach_msg_type_number_t numcodes = 2; - uint32_t instr = 0; - - if (instrLen2) { - uint16_t instr16 = 0; - COPYIN(get_saved_state_pc(state), (char *)&instr16, sizeof(instr16)); + mach_exception_data_type_t codes[2] = {EXC_ARM_UNDEFINED}; + mach_msg_type_number_t numcodes = 2; + uint32_t instr = 0; - instr = instr16; - } else { - COPYIN(get_saved_state_pc(state), (char *)&instr, sizeof(instr)); - } + COPYIN(get_saved_state_pc(state), (char *)&instr, sizeof(instr)); #if CONFIG_DTRACE if (tempDTraceTrapHook && (tempDTraceTrapHook(exception, state, 0, 0) == KERN_SUCCESS)) { @@ -697,7 +701,7 @@ handle_uncategorized(arm_saved_state_t *state, boolean_t instrLen2) */ kstackptr = (vm_offset_t) current_thread()->machine.kstackptr; if (kstackptr) { - ((thread_kernel_state_t) kstackptr)->machine.ss = *state; + copy_signed_thread_state(&((thread_kernel_state_t) kstackptr)->machine.ss, state); } /* Hop into the debugger (typically either due to a @@ -714,68 +718,50 @@ handle_uncategorized(arm_saved_state_t *state, boolean_t instrLen2) } /* - * Check for GDB breakpoint via illegal opcode. + * Check for GDB breakpoint via illegal opcode. */ - if (instrLen2) { - if (IS_THUMB_GDB_TRAP(instr)) { - exception = EXC_BREAKPOINT; - codes[0] = EXC_ARM_BREAKPOINT; - codes[1] = instr; - } else { - codes[1] = instr; - } + if (IS_ARM_GDB_TRAP(instr)) { + exception = EXC_BREAKPOINT; + codes[0] = EXC_ARM_BREAKPOINT; + codes[1] = instr; } else { - if (IS_ARM_GDB_TRAP(instr)) { - exception = EXC_BREAKPOINT; - codes[0] = EXC_ARM_BREAKPOINT; - codes[1] = instr; - } else if (IS_THUMB_GDB_TRAP((instr & 0xFFFF))) { - exception = EXC_BREAKPOINT; - codes[0] = EXC_ARM_BREAKPOINT; - codes[1] = instr & 0xFFFF; - } else if (IS_THUMB_GDB_TRAP((instr >> 16))) { - exception = EXC_BREAKPOINT; - codes[0] = EXC_ARM_BREAKPOINT; - codes[1] = instr >> 16; - } else { - codes[1] = instr; - } + codes[1] = instr; } exception_triage(exception, codes, numcodes); - assert(0); /* NOTREACHED */ + __builtin_unreachable(); } static void handle_breakpoint(arm_saved_state_t *state) { - exception_type_t exception = EXC_BREAKPOINT; - mach_exception_data_type_t codes[2] = {EXC_ARM_BREAKPOINT}; - mach_msg_type_number_t numcodes = 2; + exception_type_t exception = EXC_BREAKPOINT; + mach_exception_data_type_t codes[2] = {EXC_ARM_BREAKPOINT}; + mach_msg_type_number_t numcodes = 2; codes[1] = get_saved_state_pc(state); exception_triage(exception, codes, numcodes); - assert(0); /* NOTREACHED */ + __builtin_unreachable(); } static void handle_watchpoint(vm_offset_t fault_addr) { - exception_type_t exception = EXC_BREAKPOINT; - mach_exception_data_type_t codes[2] = {EXC_ARM_DA_DEBUG}; - mach_msg_type_number_t numcodes = 2; + exception_type_t exception = EXC_BREAKPOINT; + mach_exception_data_type_t codes[2] = {EXC_ARM_DA_DEBUG}; + mach_msg_type_number_t numcodes = 2; codes[1] = fault_addr; exception_triage(exception, codes, numcodes); - assert(0); /* NOTREACHED */ + __builtin_unreachable(); } static void handle_abort(arm_saved_state_t *state, uint32_t esr, vm_offset_t fault_addr, vm_offset_t recover, abort_inspector_t inspect_abort, abort_handler_t handler) { - fault_status_t fault_code; - vm_prot_t fault_type; + fault_status_t fault_code; + vm_prot_t fault_type; inspect_abort(ESR_ISS(esr), &fault_code, &fault_type); handler(state, esr, fault_addr, fault_code, fault_type, recover); @@ -819,7 +805,7 @@ handle_pc_align(arm_saved_state_t *ss) codes[1] = get_saved_state_pc(ss); exception_triage(exc, codes, numcodes); - assert(0); /* NOTREACHED */ + __builtin_unreachable(); } static void @@ -838,22 +824,59 @@ handle_sp_align(arm_saved_state_t *ss) codes[1] = get_saved_state_sp(ss); exception_triage(exc, codes, numcodes); - assert(0); /* NOTREACHED */ + __builtin_unreachable(); } static void -handle_wf_trap(arm_saved_state_t *ss) +handle_wf_trap(arm_saved_state_t *state) { exception_type_t exc; mach_exception_data_type_t codes[2]; mach_msg_type_number_t numcodes = 2; + uint32_t instr = 0; + + COPYIN(get_saved_state_pc(state), (char *)&instr, sizeof(instr)); exc = EXC_BAD_INSTRUCTION; codes[0] = EXC_ARM_UNDEFINED; - codes[1] = get_saved_state_sp(ss); + codes[1] = instr; + + exception_triage(exc, codes, numcodes); + __builtin_unreachable(); +} + +static void +handle_fp_trap(arm_saved_state_t *state, uint32_t esr) +{ + exception_type_t exc = EXC_ARITHMETIC; + mach_exception_data_type_t codes[2]; + mach_msg_type_number_t numcodes = 2; + uint32_t instr = 0; + + /* The floating point trap flags are only valid if TFV is set. */ + if (!(esr & ISS_FP_TFV)) { + codes[0] = EXC_ARM_FP_UNDEFINED; + } else if (esr & ISS_FP_UFF) { + codes[0] = EXC_ARM_FP_UF; + } else if (esr & ISS_FP_OFF) { + codes[0] = EXC_ARM_FP_OF; + } else if (esr & ISS_FP_IOF) { + codes[0] = EXC_ARM_FP_IO; + } else if (esr & ISS_FP_DZF) { + codes[0] = EXC_ARM_FP_DZ; + } else if (esr & ISS_FP_IDF) { + codes[0] = EXC_ARM_FP_ID; + } else if (esr & ISS_FP_IXF) { + codes[0] = EXC_ARM_FP_IX; + } else { + panic("Unrecognized floating point exception, state=%p, esr=%#x", state, esr); + } + + COPYIN(get_saved_state_pc(state), (char *)&instr, sizeof(instr)); + codes[1] = instr; exception_triage(exc, codes, numcodes); - assert(0); /* NOTREACHED */ + __builtin_unreachable(); } @@ -876,8 +899,7 @@ handle_sw_step_debug(arm_saved_state_t *state) panic_with_thread_kernel_state("SW_STEP_DEBUG exception thread DebugData is NULL.", state); } - set_saved_state_cpsr((thread->machine.upcb), - get_saved_state_cpsr((thread->machine.upcb)) & ~(PSR64_SS | DAIF_IRQF | DAIF_FIQF)); + mask_saved_state_cpsr(thread->machine.upcb, 0, PSR64_SS | DAIF_IRQF | DAIF_FIQF); // Special encoding for gdb single step event on ARM exc = EXC_BREAKPOINT; @@ -885,7 +907,7 @@ handle_sw_step_debug(arm_saved_state_t *state) codes[1] = 0; exception_triage(exc, codes, numcodes); - assert(0); /* NOTREACHED */ + __builtin_unreachable(); } static int @@ -958,14 +980,48 @@ is_parity_error(fault_status_t status) } } +static void +set_saved_state_pc_to_recovery_handler(arm_saved_state_t *iss, vm_offset_t recover) +{ +#if defined(HAS_APPLE_PAC) + thread_t thread = current_thread(); + const uintptr_t disc = ptrauth_blend_discriminator(&thread->recover, PAC_DISCRIMINATOR_RECOVER); + const char *panic_msg = "Illegal thread->recover value %p"; + + MANIPULATE_SIGNED_THREAD_STATE(iss, + // recover = (vm_offset_t)ptrauth_auth_data((void *)recover, ptrauth_key_function_pointer, + // ptrauth_blend_discriminator(&thread->recover, PAC_DISCRIMINATOR_RECOVER)); + "mov x1, %[recover] \n" + "mov x6, %[disc] \n" + "autia x1, x6 \n" + // if (recover != (vm_offset_t)ptrauth_strip((void *)recover, ptrauth_key_function_pointer)) { + "mov x6, x1 \n" + "xpaci x6 \n" + "cmp x1, x6 \n" + "beq 1f \n" + // panic("Illegal thread->recover value %p", (void *)recover); + "mov x0, %[panic_msg] \n" + "bl _panic \n" + // } + "1: \n" + "str x1, [x0, %[SS64_PC]] \n", + [recover] "r"(recover), + [disc] "r"(disc), + [panic_msg] "r"(panic_msg) + ); +#else + set_saved_state_pc(iss, recover); +#endif +} + static void handle_user_abort(arm_saved_state_t *state, uint32_t esr, vm_offset_t fault_addr, fault_status_t fault_code, vm_prot_t fault_type, vm_offset_t recover) { - exception_type_t exc = EXC_BAD_ACCESS; - mach_exception_data_type_t codes[2]; - mach_msg_type_number_t numcodes = 2; - thread_t thread = current_thread(); + exception_type_t exc = EXC_BAD_ACCESS; + mach_exception_data_type_t codes[2]; + mach_msg_type_number_t numcodes = 2; + thread_t thread = current_thread(); (void)esr; (void)state; @@ -988,21 +1044,18 @@ handle_user_abort(arm_saved_state_t *state, uint32_t esr, vm_offset_t fault_addr } #if CONFIG_DTRACE - if (thread->options & TH_OPT_DTRACE) { /* Executing under dtrace_probe? */ + if (thread->t_dtrace_inprobe) { /* Executing under dtrace_probe? */ if (dtrace_tally_fault(vm_fault_addr)) { /* Should a user mode fault under dtrace be ignored? */ if (recover) { - set_saved_state_pc(state, recover); + set_saved_state_pc_to_recovery_handler(state, recover); } else { - boolean_t intr = ml_set_interrupts_enabled(FALSE); + ml_set_interrupts_enabled(FALSE); panic_with_thread_kernel_state("copyin/out has no recovery point", state); - (void) ml_set_interrupts_enabled(intr); } return; } else { - boolean_t intr = ml_set_interrupts_enabled(FALSE); + ml_set_interrupts_enabled(FALSE); panic_with_thread_kernel_state("Unexpected UMW page fault under dtrace_probe", state); - (void) ml_set_interrupts_enabled(intr); - return; } } #else @@ -1022,7 +1075,7 @@ handle_user_abort(arm_saved_state_t *state, uint32_t esr, vm_offset_t fault_addr /* check to see if it is just a pmap ref/modify fault */ if ((result != KERN_SUCCESS) && !is_translation_fault(fault_code)) { - result = arm_fast_fault(map->pmap, trunc_page(vm_fault_addr), fault_type, TRUE); + result = arm_fast_fault(map->pmap, trunc_page(vm_fault_addr), fault_type, (fault_code == FSC_ACCESS_FLAG_FAULT_L3), TRUE); } if (result != KERN_SUCCESS) { { @@ -1033,8 +1086,15 @@ handle_user_abort(arm_saved_state_t *state, uint32_t esr, vm_offset_t fault_addr } } if (result == KERN_SUCCESS || result == KERN_ABORTED) { - thread_exception_return(); - /* NOTREACHED */ + return; + } + + /* + * vm_fault() should never return KERN_FAILURE for page faults from user space. + * If it does, we're leaking preemption disables somewhere in the kernel. + */ + if (__improbable(result == KERN_FAILURE)) { + panic("vm_fault() KERN_FAILURE from user fault on thread %p", thread); } codes[0] = result; @@ -1044,8 +1104,7 @@ handle_user_abort(arm_saved_state_t *state, uint32_t esr, vm_offset_t fault_addr #if defined(APPLE_ARM64_ARCH_FAMILY) if (fault_code == FSC_SYNC_PARITY) { arm64_platform_error(state, esr, fault_addr); - thread_exception_return(); - /* NOTREACHED */ + return; } #else panic("User parity error."); @@ -1056,7 +1115,7 @@ handle_user_abort(arm_saved_state_t *state, uint32_t esr, vm_offset_t fault_addr codes[1] = fault_addr; exception_triage(exc, codes, numcodes); - assert(0); /* NOTREACHED */ + __builtin_unreachable(); } #if __ARM_PAN_AVAILABLE__ @@ -1091,26 +1150,24 @@ static void handle_kernel_abort(arm_saved_state_t *state, uint32_t esr, vm_offset_t fault_addr, fault_status_t fault_code, vm_prot_t fault_type, vm_offset_t recover) { - thread_t thread = current_thread(); + thread_t thread = current_thread(); (void)esr; #if CONFIG_DTRACE - if (is_vm_fault(fault_code) && thread->options & TH_OPT_DTRACE) { /* Executing under dtrace_probe? */ + if (is_vm_fault(fault_code) && thread->t_dtrace_inprobe) { /* Executing under dtrace_probe? */ if (dtrace_tally_fault(fault_addr)) { /* Should a fault under dtrace be ignored? */ /* * Point to next instruction, or recovery handler if set. */ if (recover) { - set_saved_state_pc(state, recover); + set_saved_state_pc_to_recovery_handler(state, recover); } else { - set_saved_state_pc(state, get_saved_state_pc(state) + 4); + add_saved_state_pc(state, 4); } return; } else { - boolean_t intr = ml_set_interrupts_enabled(FALSE); + ml_set_interrupts_enabled(FALSE); panic_with_thread_kernel_state("Unexpected page fault under dtrace_probe", state); - (void) ml_set_interrupts_enabled(intr); - return; } } #endif @@ -1122,9 +1179,9 @@ handle_kernel_abort(arm_saved_state_t *state, uint32_t esr, vm_offset_t fault_ad #endif if (is_vm_fault(fault_code)) { - kern_return_t result = KERN_FAILURE; - vm_map_t map; - int interruptible; + kern_return_t result = KERN_FAILURE; + vm_map_t map; + int interruptible; /* * Ensure no faults in the physical aperture. This could happen if @@ -1141,7 +1198,8 @@ handle_kernel_abort(arm_saved_state_t *state, uint32_t esr, vm_offset_t fault_ad // that PAN is re-enabled for the exception handler and that // accessing the test address produces a PAN fault. pan_fault_value = *(char *)pan_test_addr; - set_saved_state_pc(state, get_saved_state_pc(state) + 4); + __builtin_arm_wsr("pan", 1); // turn PAN back on after the nested exception cleared it for this context + add_saved_state_pc(state, 4); return; } #endif @@ -1174,7 +1232,7 @@ handle_kernel_abort(arm_saved_state_t *state, uint32_t esr, vm_offset_t fault_ad /* check to see if it is just a pmap ref/modify fault */ if (!is_translation_fault(fault_code)) { - result = arm_fast_fault(map->pmap, trunc_page(fault_addr), fault_type, FALSE); + result = arm_fast_fault(map->pmap, trunc_page(fault_addr), fault_type, (fault_code == FSC_ACCESS_FLAG_FAULT_L3), FALSE); if (result == KERN_SUCCESS) { return; } @@ -1197,7 +1255,7 @@ handle_kernel_abort(arm_saved_state_t *state, uint32_t esr, vm_offset_t fault_ad * If we have a recover handler, invoke it now. */ if (recover) { - set_saved_state_pc(state, recover); + set_saved_state_pc_to_recovery_handler(state, recover); return; } @@ -1211,11 +1269,11 @@ handle_kernel_abort(arm_saved_state_t *state, uint32_t esr, vm_offset_t fault_ad // the exception handler if (pan_exception_level == 1) { pan_fault_value = *(char *)pan_test_addr; + __builtin_arm_wsr("pan", 1); // turn PAN back on after the nested exception cleared it for this context } // this fault address is used for PAN test // disable PAN and rerun - set_saved_state_cpsr(state, - get_saved_state_cpsr(state) & (~PSR64_PAN)); + mask_saved_state_cpsr(state, 0, PSR64_PAN); return; } #endif @@ -1228,6 +1286,10 @@ handle_kernel_abort(arm_saved_state_t *state, uint32_t esr, vm_offset_t fault_ad panic_with_thread_kernel_state("Unexpected abort while on interrupt stack.", state); #endif } else if (is_alignment_fault(fault_code)) { + if (recover) { + set_saved_state_pc_to_recovery_handler(state, recover); + return; + } panic_with_thread_kernel_state("Unaligned kernel data abort.", state); } else if (is_parity_error(fault_code)) { #if defined(APPLE_ARM64_ARCH_FAMILY) @@ -1250,9 +1312,9 @@ extern void syscall_trace(struct arm_saved_state * regs); static void handle_svc(arm_saved_state_t *state) { - int trap_no = get_saved_state_svc_number(state); - thread_t thread = current_thread(); - struct proc *p; + int trap_no = get_saved_state_svc_number(state); + thread_t thread = current_thread(); + struct proc *p; #define handle_svc_kprintf(x...) /* kprintf("handle_svc: " x) */ @@ -1311,9 +1373,9 @@ static void handle_msr_trap(arm_saved_state_t *state, uint32_t iss) { exception_type_t exception = EXC_BAD_INSTRUCTION; - mach_exception_data_type_t codes[2] = {EXC_ARM_UNDEFINED}; - mach_msg_type_number_t numcodes = 2; - uint32_t instr = 0; + mach_exception_data_type_t codes[2] = {EXC_ARM_UNDEFINED}; + mach_msg_type_number_t numcodes = 2; + uint32_t instr = 0; (void)iss; @@ -1335,9 +1397,9 @@ static void handle_user_trapped_instruction32(arm_saved_state_t *state, uint32_t esr) { exception_type_t exception = EXC_BAD_INSTRUCTION; - mach_exception_data_type_t codes[2] = {EXC_ARM_UNDEFINED}; - mach_msg_type_number_t numcodes = 2; - uint32_t instr = 0; + mach_exception_data_type_t codes[2] = {EXC_ARM_UNDEFINED}; + mach_msg_type_number_t numcodes = 2; + uint32_t instr; if (is_saved_state64(state)) { panic("ESR (0x%x) for instruction trapped from U32, but saved state is 64-bit.", esr); @@ -1351,15 +1413,16 @@ handle_user_trapped_instruction32(arm_saved_state_t *state, uint32_t esr) codes[1] = instr; exception_triage(exception, codes, numcodes); + __builtin_unreachable(); } static void handle_simd_trap(arm_saved_state_t *state, uint32_t esr) { exception_type_t exception = EXC_BAD_INSTRUCTION; - mach_exception_data_type_t codes[2] = {EXC_ARM_UNDEFINED}; - mach_msg_type_number_t numcodes = 2; - uint32_t instr = 0; + mach_exception_data_type_t codes[2] = {EXC_ARM_UNDEFINED}; + mach_msg_type_number_t numcodes = 2; + uint32_t instr = 0; if (PSR64_IS_KERNEL(get_saved_state_cpsr(state))) { panic("ESR (0x%x) for SIMD trap from userland, actually came from kernel?", esr); @@ -1369,20 +1432,23 @@ handle_simd_trap(arm_saved_state_t *state, uint32_t esr) codes[1] = instr; exception_triage(exception, codes, numcodes); + __builtin_unreachable(); } void sleh_irq(arm_saved_state_t *state) { - uint64_t timestamp = 0; - uint32_t old_entropy_data = 0; - uint32_t * old_entropy_data_ptr = NULL; - uint32_t * new_entropy_data_ptr = NULL; - cpu_data_t * cdp = getCpuDatap(); -#if DEVELOPMENT || DEBUG + uint64_t timestamp = 0; + uint32_t old_entropy_data = 0; + uint32_t old_entropy_sample_count = 0; + size_t entropy_index = 0; + uint32_t * entropy_data_ptr = NULL; + cpu_data_t * cdp = getCpuDatap(); +#if MACH_ASSERT int preemption_level = get_preemption_level(); #endif + sleh_interrupt_handler_prologue(state, DBG_INTR_TYPE_OTHER); /* Run the registered interrupt handler. */ @@ -1401,21 +1467,18 @@ sleh_irq(arm_saved_state_t *state) * is the entire point of the entropy buffer, we will not worry about * these races for now. */ - old_entropy_data_ptr = EntropyData.index_ptr; - new_entropy_data_ptr = old_entropy_data_ptr + 1; - - if (new_entropy_data_ptr >= &EntropyData.buffer[ENTROPY_BUFFER_SIZE]) { - new_entropy_data_ptr = EntropyData.buffer; - } + old_entropy_sample_count = EntropyData.sample_count; + EntropyData.sample_count += 1; - EntropyData.index_ptr = new_entropy_data_ptr; + entropy_index = old_entropy_sample_count & ENTROPY_BUFFER_INDEX_MASK; + entropy_data_ptr = EntropyData.buffer + entropy_index; /* Mix the timestamp data and the old data together. */ - old_entropy_data = *old_entropy_data_ptr; - *old_entropy_data_ptr = (uint32_t)timestamp ^ __ror(old_entropy_data, 9); + old_entropy_data = *entropy_data_ptr; + *entropy_data_ptr = (uint32_t)timestamp ^ __ror(old_entropy_data, 9); sleh_interrupt_handler_epilogue(); -#if DEVELOPMENT || DEBUG +#if MACH_ASSERT if (preemption_level != get_preemption_level()) { panic("irq handler %p changed preemption level from %d to %d", cdp->interrupt_handler, preemption_level, get_preemption_level()); } @@ -1426,7 +1489,7 @@ void sleh_fiq(arm_saved_state_t *state) { unsigned int type = DBG_INTR_TYPE_UNKNOWN; -#if DEVELOPMENT || DEBUG +#if MACH_ASSERT int preemption_level = get_preemption_level(); #endif @@ -1469,7 +1532,7 @@ sleh_fiq(arm_saved_state_t *state) } sleh_interrupt_handler_epilogue(); -#if DEVELOPMENT || DEBUG +#if MACH_ASSERT if (preemption_level != get_preemption_level()) { panic("fiq type %u changed preemption level from %d to %d", type, preemption_level, get_preemption_level()); } @@ -1479,14 +1542,14 @@ sleh_fiq(arm_saved_state_t *state) void sleh_serror(arm_context_t *context, uint32_t esr, vm_offset_t far) { - arm_saved_state_t *state = &context->ss; -#if DEVELOPMENT || DEBUG + arm_saved_state_t *state = &context->ss; +#if MACH_ASSERT int preemption_level = get_preemption_level(); #endif ASSERT_CONTEXT_SANITY(context); arm64_platform_error(state, esr, far); -#if DEVELOPMENT || DEBUG +#if MACH_ASSERT if (preemption_level != get_preemption_level()) { panic("serror changed preemption level from %d to %d", preemption_level, get_preemption_level()); } @@ -1494,13 +1557,12 @@ sleh_serror(arm_context_t *context, uint32_t esr, vm_offset_t far) } void -mach_syscall_trace_exit( - unsigned int retval, - unsigned int call_number) +mach_syscall_trace_exit(unsigned int retval, + unsigned int call_number) { KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, - MACHDBG_CODE(DBG_MACH_EXCP_SC, (call_number)) | DBG_FUNC_END, - retval, 0, 0, 0, 0); + MACHDBG_CODE(DBG_MACH_EXCP_SC, (call_number)) | + DBG_FUNC_END, retval, 0, 0, 0, 0); } __attribute__((noreturn)) @@ -1516,11 +1578,11 @@ thread_syscall_return(kern_return_t error) assert(is_saved_state64(state)); saved_state64(state)->x[0] = error; -#if DEBUG || DEVELOPMENT +#if MACH_ASSERT kern_allocation_name_t prior __assert_only = thread_get_kernel_state(thread)->allocation_name; assertf(prior == NULL, "thread_set_allocation_name(\"%s\") not cleared", kern_allocation_get_name(prior)); -#endif /* DEBUG || DEVELOPMENT */ +#endif /* MACH_ASSERT */ if (kdebug_enable) { /* Invert syscall number (negative for a mach syscall) */ @@ -1540,7 +1602,7 @@ syscall_trace( static void sleh_interrupt_handler_prologue(arm_saved_state_t *state, unsigned int type) { - uint64_t is_user = PSR64_IS_USER(get_saved_state_cpsr(state)); + uint64_t is_user = PSR64_IS_USER(get_saved_state_cpsr(state)); uint64_t pc = is_user ? get_saved_state_pc(state) : VM_KERNEL_UNSLIDE(get_saved_state_pc(state)); diff --git a/osfmk/arm64/start.s b/osfmk/arm64/start.s index f709217cf..4e964ca8b 100644 --- a/osfmk/arm64/start.s +++ b/osfmk/arm64/start.s @@ -32,12 +32,14 @@ #include #include #include "assym.s" +#include #if __ARM_KERNEL_PROTECT__ #include #endif /* __ARM_KERNEL_PROTECT__ */ + .macro MSR_VBAR_EL1_X0 #if defined(KERNEL_INTEGRITY_KTRR) mov x1, lr @@ -52,7 +54,7 @@ #if defined(KERNEL_INTEGRITY_KTRR) mov x0, x1 mov x1, lr - bl _pinst_set_tcr + bl EXT(pinst_set_tcr) mov lr, x1 #else msr TCR_EL1, x1 @@ -62,7 +64,7 @@ .macro MSR_TTBR1_EL1_X0 #if defined(KERNEL_INTEGRITY_KTRR) mov x1, lr - bl _pinst_set_ttbr1 + bl EXT(pinst_set_ttbr1) mov lr, x1 #else msr TTBR1_EL1, x0 @@ -74,9 +76,9 @@ mov x1, lr // This may abort, do so on SP1 - bl _pinst_spsel_1 + bl EXT(pinst_spsel_1) - bl _pinst_set_sctlr + bl EXT(pinst_set_sctlr) msr SPSel, #0 // Back to SP0 mov lr, x1 #else @@ -98,8 +100,30 @@ .align 12 .globl EXT(LowResetVectorBase) LEXT(LowResetVectorBase) - // Preserve x0 for start_first_cpu, if called + /* + * On reset, both RVBAR_EL1 and VBAR_EL1 point here. SPSel.SP is 1, + * so on reset the CPU will jump to offset 0x0 and on exceptions + * the CPU will jump to offset 0x200, 0x280, 0x300, or 0x380. + * In order for both the reset vector and exception vectors to + * coexist in the same space, the reset code is moved to the end + * of the exception vector area. + */ + b EXT(reset_vector) + /* EL1 SP1: These vectors trap errors during early startup on non-boot CPUs. */ + .align 9 + b . + .align 7 + b . + .align 7 + b . + .align 7 + b . + + .align 7 + .globl EXT(reset_vector) +LEXT(reset_vector) + // Preserve x0 for start_first_cpu, if called // Unlock the core for debugging msr OSLAR_EL1, xzr msr DAIFSet, #(DAIFSC_ALL) // Disable all interrupts @@ -122,6 +146,11 @@ LEXT(LowResetVectorBase) * If either values are zero, we're debugging kernel so skip programming KTRR. */ + /* spin until bootstrap core has completed machine lockdown */ + adrp x17, EXT(lockdown_done)@page +1: + ldr x18, [x17, EXT(lockdown_done)@pageoff] + cbz x18, 1b // load stashed rorgn_begin adrp x17, EXT(rorgn_begin)@page @@ -144,7 +173,7 @@ LEXT(LowResetVectorBase) mov x17, #1 msr ARM64_REG_KTRR_LOCK_EL1, x17 Lskip_ktrr: -#endif /* defined(KERNEL_INTEGRITY_KTRR)*/ +#endif /* defined(KERNEL_INTEGRITY_KTRR) */ // Process reset handlers adrp x19, EXT(ResetHandlerData)@page // Get address of the reset handler data @@ -158,14 +187,14 @@ Lcheck_cpu_data_entry: cbz x21, Lnext_cpu_data_entry ldr w2, [x21, CPU_PHYS_ID] // Load ccc cpu phys id cmp x0, x2 // Compare cpu data phys cpu and MPIDR_EL1 phys cpu - b.eq Lfound_cpu_data_entry // Branch if match + b.eq Lfound_cpu_data_entry // Branch if match Lnext_cpu_data_entry: add x1, x1, #16 // Increment to the next cpu data entry cmp x1, x3 - b.eq Lskip_cpu_reset_handler // Not found + b.eq Lskip_cpu_reset_handler // Not found b Lcheck_cpu_data_entry // loop Lfound_cpu_data_entry: - adrp x20, EXT(const_boot_args)@page + adrp x20, EXT(const_boot_args)@page add x20, x20, EXT(const_boot_args)@pageoff ldr x0, [x21, CPU_RESET_HANDLER] // Call CPU reset handler cbz x0, Lskip_cpu_reset_handler @@ -178,7 +207,7 @@ Lfound_cpu_data_entry: adrp x2, EXT(start_cpu)@page add x2, x2, EXT(start_cpu)@pageoff cmp x0, x2 - bne Lskip_cpu_reset_handler + bne Lskip_cpu_reset_handler 1: @@ -196,12 +225,7 @@ Lfound_cpu_data_entry: Lskip_cpu_reset_handler: b . // Hang if the handler is NULL or returns - .align 3 - .globl EXT(ResetHandlerData) -LEXT(ResetHandlerData) - .space (rhdSize_NUM),0 // (filled with 0s) - - .align 3 + .align 3 .global EXT(LowResetVectorEnd) LEXT(LowResetVectorEnd) .global EXT(SleepToken) @@ -210,6 +234,13 @@ LEXT(SleepToken) .space (stSize_NUM),0 #endif + .section __DATA_CONST,__const + .align 3 + .globl EXT(ResetHandlerData) +LEXT(ResetHandlerData) + .space (rhdSize_NUM),0 // (filled with 0s) + .text + /* * __start trampoline is located at a position relative to LowResetVectorBase @@ -276,6 +307,7 @@ LEXT(LowExceptionVectorBase) .align ARM_PGSHIFT .globl EXT(bootstrap_instructions) LEXT(bootstrap_instructions) + #endif /* defined(KERNEL_INTEGRITY_KTRR)*/ .align 2 .globl EXT(resume_idle_cpu) @@ -311,6 +343,7 @@ start_cpu: ldr x25, [x20, BA_TOP_OF_KERNEL_DATA] // Get the top of the kernel data ldr x26, [x20, BA_BOOT_FLAGS] // Get the kernel boot flags + // Set TPIDRRO_EL0 with the CPU number ldr x0, [x21, CPU_NUMBER_GS] msr TPIDRRO_EL0, x0 @@ -322,7 +355,7 @@ start_cpu: // Set SP_EL1 to exception stack #if defined(KERNEL_INTEGRITY_KTRR) mov x1, lr - bl _pinst_spsel_1 + bl EXT(pinst_spsel_1) mov lr, x1 #else msr SPSel, #1 @@ -452,6 +485,7 @@ LEXT(start_first_cpu) // Unlock the core for debugging msr OSLAR_EL1, xzr msr DAIFSet, #(DAIFSC_ALL) // Disable all interrupts + mov x20, x0 mov x21, #0 @@ -481,7 +515,7 @@ LEXT(start_first_cpu) // Set SP_EL1 to exception stack #if defined(KERNEL_INTEGRITY_KTRR) - bl _pinst_spsel_1 + bl EXT(pinst_spsel_1) #else msr SPSel, #1 #endif @@ -511,23 +545,13 @@ LEXT(start_first_cpu) * Page 3 - KVA L1 table * Page 4 - KVA L2 table */ -#if __ARM64_TWO_LEVEL_PMAP__ - /* - * If we are using a two level scheme, we don't need the L1 entries, so: - * Page 1 - V=P L2 table - * Page 2 - KVA L2 table - */ -#endif // Invalidate all entries in the bootstrap page tables mov x0, #(ARM_TTE_EMPTY) // Load invalid entry template mov x1, x25 // Start at top of kernel mov x2, #(TTE_PGENTRIES) // Load number of entries per page -#if __ARM64_TWO_LEVEL_PMAP__ - lsl x2, x2, #1 // Shift by 1 for num entries on 2 pages -#else lsl x2, x2, #2 // Shift by 2 for num entries on 4 pages -#endif + Linvalidate_bootstrap: // do { str x0, [x1], #(1 << TTE_SHIFT) // Invalidate and advance subs x2, x2, #1 // entries-- @@ -603,6 +627,7 @@ Linvalidate_bootstrap: // do { /* Ensure TTEs are visible */ dsb ish + b common_start /* @@ -659,6 +684,10 @@ common_start: orr x0, x0, x1 mov x1, #(MAIR_POSTED << MAIR_ATTR_SHIFT(CACHE_ATTRINDX_POSTED)) orr x0, x0, x1 + mov x1, #(MAIR_POSTED_REORDERED << MAIR_ATTR_SHIFT(CACHE_ATTRINDX_POSTED_REORDERED)) + orr x0, x0, x1 + mov x1, #(MAIR_POSTED_COMBINED_REORDERED << MAIR_ATTR_SHIFT(CACHE_ATTRINDX_POSTED_COMBINED_REORDERED)) + orr x0, x0, x1 msr MAIR_EL1, x0 #if defined(APPLEHURRICANE) @@ -686,6 +715,7 @@ common_start: #endif + #ifndef __ARM_IC_NOALIAS_ICACHE__ /* Invalidate the TLB and icache on systems that do not guarantee that the * caches are invalidated on reset. @@ -723,14 +753,108 @@ common_start: 1: MSR_VBAR_EL1_X0 +1: +#ifdef HAS_APPLE_PAC +#ifdef __APSTS_SUPPORTED__ + mrs x0, ARM64_REG_APSTS_EL1 + and x1, x0, #(APSTS_EL1_MKEYVld) + cbz x1, 1b // Poll APSTS_EL1.MKEYVld + mrs x0, ARM64_REG_APCTL_EL1 + orr x0, x0, #(APCTL_EL1_AppleMode) + orr x0, x0, #(APCTL_EL1_KernKeyEn) + and x0, x0, #~(APCTL_EL1_EnAPKey0) + msr ARM64_REG_APCTL_EL1, x0 +#else + mrs x0, ARM64_REG_APCTL_EL1 + and x1, x0, #(APCTL_EL1_MKEYVld) + cbz x1, 1b // Poll APCTL_EL1.MKEYVld + orr x0, x0, #(APCTL_EL1_AppleMode) + orr x0, x0, #(APCTL_EL1_KernKeyEn) + msr ARM64_REG_APCTL_EL1, x0 +#endif /* APSTS_SUPPORTED */ + + /* ISB necessary to ensure APCTL_EL1_AppleMode logic enabled before proceeding */ + isb sy + /* Load static kernel key diversification values */ + ldr x0, =KERNEL_ROP_ID + /* set ROP key. must write at least once to pickup mkey per boot diversification */ + msr APIBKeyLo_EL1, x0 + add x0, x0, #1 + msr APIBKeyHi_EL1, x0 + add x0, x0, #1 + msr APDBKeyLo_EL1, x0 + add x0, x0, #1 + msr APDBKeyHi_EL1, x0 + add x0, x0, #1 + msr ARM64_REG_KERNELKEYLO_EL1, x0 + add x0, x0, #1 + msr ARM64_REG_KERNELKEYHI_EL1, x0 + /* set JOP key. must write at least once to pickup mkey per boot diversification */ + add x0, x0, #1 + msr APIAKeyLo_EL1, x0 + add x0, x0, #1 + msr APIAKeyHi_EL1, x0 + add x0, x0, #1 + msr APDAKeyLo_EL1, x0 + add x0, x0, #1 + msr APDAKeyHi_EL1, x0 + /* set G key */ + add x0, x0, #1 + msr APGAKeyLo_EL1, x0 + add x0, x0, #1 + msr APGAKeyHi_EL1, x0 + + // Enable caches, MMU, ROP and JOP + mov x0, #(SCTLR_EL1_DEFAULT & 0xFFFF) + mov x1, #(SCTLR_EL1_DEFAULT & 0xFFFF0000) + orr x0, x0, x1 + orr x0, x0, #(SCTLR_PACIB_ENABLED) /* IB is ROP */ + +#if DEBUG || DEVELOPMENT + and x2, x26, BA_BOOT_FLAGS_DISABLE_JOP +#if __APCFG_SUPPORTED__ + // for APCFG systems, JOP keys are always on for EL1 unless ELXENKEY is cleared. + // JOP keys for EL0 will be toggled on the first time we pmap_switch to a pmap that has JOP enabled + cbz x2, Lenable_mmu + mrs x3, APCFG_EL1 + and x3, x3, #~(APCFG_EL1_ELXENKEY) + msr APCFG_EL1, x3 +#else /* __APCFG_SUPPORTED__ */ + cbnz x2, Lenable_mmu +#endif /* __APCFG_SUPPORTED__ */ +#endif /* DEBUG || DEVELOPMENT */ + +#if !__APCFG_SUPPORTED__ + MOV64 x1, SCTLR_JOP_KEYS_ENABLED + orr x0, x0, x1 +#endif /* !__APCFG_SUPPORTED__ */ +Lenable_mmu: +#else /* HAS_APPLE_PAC */ // Enable caches and MMU mov x0, #(SCTLR_EL1_DEFAULT & 0xFFFF) mov x1, #(SCTLR_EL1_DEFAULT & 0xFFFF0000) orr x0, x0, x1 +#endif /* HAS_APPLE_PAC */ MSR_SCTLR_EL1_X0 isb sy + MOV32 x1, SCTLR_EL1_DEFAULT +#if HAS_APPLE_PAC + orr x1, x1, #(SCTLR_PACIB_ENABLED) +#if !__APCFG_SUPPORTED__ + MOV64 x2, SCTLR_JOP_KEYS_ENABLED +#if (DEBUG || DEVELOPMENT) + // Ignore the JOP bits, since we can't predict at compile time whether BA_BOOT_FLAGS_DISABLE_JOP is set + bic x0, x0, x2 +#else + orr x1, x1, x2 +#endif /* (DEBUG || DEVELOPMENT) */ +#endif /* !__APCFG_SUPPORTED__ */ +#endif /* HAS_APPLE_PAC */ + cmp x0, x1 + bne . + #if (!CONFIG_KERNEL_INTEGRITY || (CONFIG_KERNEL_INTEGRITY && !defined(KERNEL_INTEGRITY_WT))) /* Watchtower * @@ -756,27 +880,24 @@ common_start: ARM64_WRITE_EP_SPR x15, x12, ARM64_REG_EHID4, ARM64_REG_HID4 #endif // APPLE_ARM64_ARCH_FAMILY -#if defined(APPLECYCLONE) || defined(APPLETYPHOON) +#if defined(APPLETYPHOON) // - // Cyclone/Typhoon-Specific initialization - // For tunable summary, see Alcatraz/H6: Confirm Cyclone CPU tunables have been set + // Typhoon-Specific initialization + // For tunable summary, see // // // Disable LSP flush with context switch to work around bug in LSP - // that can cause Cyclone to wedge when CONTEXTIDR is written. - // Innsbruck11A175: panic(cpu 0 caller 0xffffff800024e30c): "wait queue deadlock - wq=0xffffff805a7a63c0, cpu=0\n" + // that can cause Typhoon to wedge when CONTEXTIDR is written. + // // mrs x12, ARM64_REG_HID0 orr x12, x12, ARM64_REG_HID0_LoopBuffDisb msr ARM64_REG_HID0, x12 - + mrs x12, ARM64_REG_HID1 orr x12, x12, ARM64_REG_HID1_rccDisStallInactiveIexCtl -#if defined(APPLECYCLONE) - orr x12, x12, ARM64_REG_HID1_disLspFlushWithContextSwitch -#endif msr ARM64_REG_HID1, x12 mrs x12, ARM64_REG_HID3 @@ -796,7 +917,7 @@ common_start: #endif // ARM64_BOARD_CONFIG_T7001 msr ARM64_REG_HID8, x12 isb sy -#endif // APPLECYCLONE || APPLETYPHOON +#endif // APPLETYPHOON #if defined(APPLETWISTER) @@ -955,6 +1076,11 @@ Lskip_skye_post_a1_workarounds: #endif /* defined(APPLEMONSOON) */ + + + + + // If x21 != 0, we're doing a warm reset, so we need to trampoline to the kernel pmap. cbnz x21, Ltrampoline @@ -969,7 +1095,7 @@ Lskip_skye_post_a1_workarounds: // x0: boot args // x1: KVA page table phys base mrs x1, TTBR1_EL1 - bl _kasan_bootstrap + bl EXT(kasan_bootstrap) mov x0, x20 mov lr, x21 @@ -1024,6 +1150,7 @@ arm_init_tramp: * +---Kernel Base---+ */ + mov x19, lr // Convert CPU data PA to VA and set as first argument mov x0, x21 diff --git a/osfmk/arm64/status.c b/osfmk/arm64/status.c index 5a69eabc4..41d213e69 100644 --- a/osfmk/arm64/status.c +++ b/osfmk/arm64/status.c @@ -34,16 +34,19 @@ #include #include #include +#if __has_feature(ptrauth_calls) +#include +#endif struct arm_vfpv2_state { - __uint32_t __r[32]; - __uint32_t __fpscr; + __uint32_t __r[32]; + __uint32_t __fpscr; }; -typedef struct arm_vfpv2_state arm_vfpv2_state_t; +typedef struct arm_vfpv2_state arm_vfpv2_state_t; -#define ARM_VFPV2_STATE_COUNT ((mach_msg_type_number_t) \ - (sizeof (arm_vfpv2_state_t)/sizeof(uint32_t))) +#define ARM_VFPV2_STATE_COUNT \ + ((mach_msg_type_number_t)(sizeof (arm_vfpv2_state_t)/sizeof(uint32_t))) /* * Forward definitions @@ -55,31 +58,19 @@ void thread_set_parent(thread_t parent, int pid); * Maps state flavor to number of words in the state: */ /* __private_extern__ */ -unsigned int _MachineStateCount[] = { - /* FLAVOR_LIST */ 0, - ARM_UNIFIED_THREAD_STATE_COUNT, - ARM_VFP_STATE_COUNT, - ARM_EXCEPTION_STATE_COUNT, - ARM_DEBUG_STATE_COUNT, - /* THREAD_STATE_NONE (legacy) */ 0, - ARM_THREAD_STATE64_COUNT, - ARM_EXCEPTION_STATE64_COUNT, - /* THREAD_STATE_LAST (legacy) */ 0, - ARM_THREAD_STATE32_COUNT, - /* UNALLOCATED */ 0, - /* UNALLOCATED */ 0, - /* UNALLOCATED */ 0, - /* UNALLOCATED */ 0, - ARM_DEBUG_STATE32_COUNT, - ARM_DEBUG_STATE64_COUNT, - ARM_NEON_STATE_COUNT, - ARM_NEON_STATE64_COUNT, - /* UNALLOCATED */ 0, - /* UNALLOCATED */ 0, - /* ARM_SAVED_STATE32_COUNT */ 0, - /* ARM_SAVED_STATE64_COUNT */ 0, - /* ARM_NEON_SAVED_STATE32_COUNT */ 0, - /* ARM_NEON_SAVED_STATE64_COUNT */ 0, +unsigned int _MachineStateCount[] = { + [ARM_UNIFIED_THREAD_STATE] = ARM_UNIFIED_THREAD_STATE_COUNT, + [ARM_VFP_STATE] = ARM_VFP_STATE_COUNT, + [ARM_EXCEPTION_STATE] = ARM_EXCEPTION_STATE_COUNT, + [ARM_DEBUG_STATE] = ARM_DEBUG_STATE_COUNT, + [ARM_THREAD_STATE64] = ARM_THREAD_STATE64_COUNT, + [ARM_EXCEPTION_STATE64] = ARM_EXCEPTION_STATE64_COUNT, + [ARM_THREAD_STATE32] = ARM_THREAD_STATE32_COUNT, + [ARM_DEBUG_STATE32] = ARM_DEBUG_STATE32_COUNT, + [ARM_DEBUG_STATE64] = ARM_DEBUG_STATE64_COUNT, + [ARM_NEON_STATE] = ARM_NEON_STATE_COUNT, + [ARM_NEON_STATE64] = ARM_NEON_STATE64_COUNT, + [ARM_PAGEIN_STATE] = ARM_PAGEIN_STATE_COUNT, }; extern zone_t ads_zone; @@ -89,7 +80,8 @@ extern zone_t ads_zone; * Copy values from saved_state to ts64. */ void -saved_state_to_thread_state64(const arm_saved_state_t *saved_state, arm_thread_state64_t *ts64) +saved_state_to_thread_state64(const arm_saved_state_t * saved_state, + arm_thread_state64_t * ts64) { uint32_t i; @@ -109,7 +101,8 @@ saved_state_to_thread_state64(const arm_saved_state_t *saved_state, arm_thread_s * Copy values from ts64 to saved_state */ void -thread_state64_to_saved_state(const arm_thread_state64_t *ts64, arm_saved_state_t *saved_state) +thread_state64_to_saved_state(const arm_thread_state64_t * ts64, + arm_saved_state_t * saved_state) { uint32_t i; @@ -124,13 +117,13 @@ thread_state64_to_saved_state(const arm_thread_state64_t *ts64, arm_saved_state_ set_saved_state_reg(saved_state, i, ts64->x[i]); } } -#endif -kern_return_t -handle_get_arm32_thread_state( - thread_state_t tstate, - mach_msg_type_number_t * count, - const arm_saved_state_t *saved_state) +#endif /* __arm64__ */ + +static kern_return_t +handle_get_arm32_thread_state(thread_state_t tstate, + mach_msg_type_number_t * count, + const arm_saved_state_t * saved_state) { if (*count < ARM_THREAD_STATE32_COUNT) { return KERN_INVALID_ARGUMENT; @@ -144,11 +137,10 @@ handle_get_arm32_thread_state( return KERN_SUCCESS; } -kern_return_t -handle_get_arm64_thread_state( - thread_state_t tstate, - mach_msg_type_number_t * count, - const arm_saved_state_t *saved_state) +static kern_return_t +handle_get_arm64_thread_state(thread_state_t tstate, + mach_msg_type_number_t * count, + const arm_saved_state_t * saved_state) { if (*count < ARM_THREAD_STATE64_COUNT) { return KERN_INVALID_ARGUMENT; @@ -163,11 +155,10 @@ handle_get_arm64_thread_state( } -kern_return_t -handle_get_arm_thread_state( - thread_state_t tstate, - mach_msg_type_number_t * count, - const arm_saved_state_t *saved_state) +static kern_return_t +handle_get_arm_thread_state(thread_state_t tstate, + mach_msg_type_number_t * count, + const arm_saved_state_t * saved_state) { /* In an arm64 world, this flavor can be used to retrieve the thread * state of a 32-bit or 64-bit thread into a unified structure, but we @@ -196,11 +187,11 @@ handle_get_arm_thread_state( return KERN_SUCCESS; } -kern_return_t -handle_set_arm32_thread_state( - const thread_state_t tstate, - mach_msg_type_number_t count, - arm_saved_state_t *saved_state) + +static kern_return_t +handle_set_arm32_thread_state(const thread_state_t tstate, + mach_msg_type_number_t count, + arm_saved_state_t * saved_state) { if (count != ARM_THREAD_STATE32_COUNT) { return KERN_INVALID_ARGUMENT; @@ -210,11 +201,10 @@ handle_set_arm32_thread_state( return KERN_SUCCESS; } -kern_return_t -handle_set_arm64_thread_state( - const thread_state_t tstate, - mach_msg_type_number_t count, - arm_saved_state_t *saved_state) +static kern_return_t +handle_set_arm64_thread_state(const thread_state_t tstate, + mach_msg_type_number_t count, + arm_saved_state_t * saved_state) { if (count != ARM_THREAD_STATE64_COUNT) { return KERN_INVALID_ARGUMENT; @@ -225,11 +215,10 @@ handle_set_arm64_thread_state( } -kern_return_t -handle_set_arm_thread_state( - const thread_state_t tstate, - mach_msg_type_number_t count, - arm_saved_state_t *saved_state) +static kern_return_t +handle_set_arm_thread_state(const thread_state_t tstate, + mach_msg_type_number_t count, + arm_saved_state_t * saved_state) { /* In an arm64 world, this flavor can be used to set the thread state of a * 32-bit or 64-bit thread from a unified structure, but we need to support @@ -262,6 +251,7 @@ handle_set_arm_thread_state( return KERN_SUCCESS; } + /* * Translate thread state arguments to userspace representation */ @@ -273,9 +263,80 @@ machine_thread_state_convert_to_user( thread_state_t tstate, mach_msg_type_number_t *count) { +#if __has_feature(ptrauth_calls) + arm_thread_state64_t *ts64; + + switch (flavor) { + case ARM_THREAD_STATE: + { + arm_unified_thread_state_t *unified_state = (arm_unified_thread_state_t *)tstate; + + if (*count < ARM_UNIFIED_THREAD_STATE_COUNT || !is_thread_state64(unified_state)) { + return KERN_SUCCESS; + } + ts64 = thread_state64(unified_state); + break; + } + case ARM_THREAD_STATE64: + { + if (*count < ARM_THREAD_STATE64_COUNT) { + return KERN_SUCCESS; + } + ts64 = (arm_thread_state64_t *)tstate; + break; + } + default: + return KERN_SUCCESS; + } + + // Note that kernel threads never have disable_user_jop set + if (current_thread()->machine.disable_user_jop || !thread_is_64bit_addr(current_thread()) || + thread->machine.disable_user_jop || !thread_is_64bit_addr(thread) || + (BootArgs->bootFlags & kBootFlagsDisableUserThreadStateJOP)) { + ts64->flags = __DARWIN_ARM_THREAD_STATE64_FLAGS_NO_PTRAUTH; + return KERN_SUCCESS; + } + + ts64->flags = 0; + if (ts64->lr) { + // lr might contain an IB-signed return address (strip is a no-op on unsigned addresses) + uintptr_t stripped_lr = (uintptr_t)ptrauth_strip((void *)ts64->lr, + ptrauth_key_return_address); + if (ts64->lr != stripped_lr) { + // Need to allow already-signed lr value to round-trip as is + ts64->flags |= __DARWIN_ARM_THREAD_STATE64_FLAGS_IB_SIGNED_LR; + } + // Note that an IB-signed return address that happens to have a 0 signature value + // will round-trip correctly even if IA-signed again below (and IA-authd later) + } + + if (BootArgs->bootFlags & kBootFlagsDisableUserJOP) { + return KERN_SUCCESS; + } + + if (ts64->pc) { + ts64->pc = (uintptr_t)pmap_sign_user_ptr((void*)ts64->pc, + ptrauth_key_process_independent_code, ptrauth_string_discriminator("pc")); + } + if (ts64->lr && !(ts64->flags & __DARWIN_ARM_THREAD_STATE64_FLAGS_IB_SIGNED_LR)) { + ts64->lr = (uintptr_t)pmap_sign_user_ptr((void*)ts64->lr, + ptrauth_key_process_independent_code, ptrauth_string_discriminator("lr")); + } + if (ts64->sp) { + ts64->sp = (uintptr_t)pmap_sign_user_ptr((void*)ts64->sp, + ptrauth_key_process_independent_data, ptrauth_string_discriminator("sp")); + } + if (ts64->fp) { + ts64->fp = (uintptr_t)pmap_sign_user_ptr((void*)ts64->fp, + ptrauth_key_process_independent_data, ptrauth_string_discriminator("fp")); + } + + return KERN_SUCCESS; +#else // No conversion to userspace representation on this platform (void)thread; (void)flavor; (void)tstate; (void)count; return KERN_SUCCESS; +#endif /* __has_feature(ptrauth_calls) */ } /* @@ -289,9 +350,94 @@ machine_thread_state_convert_from_user( thread_state_t tstate, mach_msg_type_number_t count) { +#if __has_feature(ptrauth_calls) + arm_thread_state64_t *ts64; + + switch (flavor) { + case ARM_THREAD_STATE: + { + arm_unified_thread_state_t *unified_state = (arm_unified_thread_state_t *)tstate; + + if (count < ARM_UNIFIED_THREAD_STATE_COUNT || !is_thread_state64(unified_state)) { + return KERN_SUCCESS; + } + ts64 = thread_state64(unified_state); + break; + } + case ARM_THREAD_STATE64: + { + if (count != ARM_THREAD_STATE64_COUNT) { + return KERN_SUCCESS; + } + ts64 = (arm_thread_state64_t *)tstate; + break; + } + default: + return KERN_SUCCESS; + } + + // Note that kernel threads never have disable_user_jop set + if (current_thread()->machine.disable_user_jop || !thread_is_64bit_addr(current_thread())) { + if (thread->machine.disable_user_jop || !thread_is_64bit_addr(thread)) { + ts64->flags = __DARWIN_ARM_THREAD_STATE64_FLAGS_NO_PTRAUTH; + return KERN_SUCCESS; + } + // A JOP-disabled process must not set thread state on a JOP-enabled process + return KERN_PROTECTION_FAILURE; + } + + if (ts64->flags & __DARWIN_ARM_THREAD_STATE64_FLAGS_NO_PTRAUTH) { + if (thread->machine.disable_user_jop || !thread_is_64bit_addr(thread) || + (BootArgs->bootFlags & kBootFlagsDisableUserThreadStateJOP)) { + return KERN_SUCCESS; + } + // Disallow setting unsigned thread state on JOP-enabled processes. + // Ignore flag and treat thread state arguments as signed, ptrauth + // poisoning will cause resulting thread state to be invalid + ts64->flags &= ~__DARWIN_ARM_THREAD_STATE64_FLAGS_NO_PTRAUTH; + } + + if (ts64->flags & __DARWIN_ARM_THREAD_STATE64_FLAGS_IB_SIGNED_LR) { + // lr might contain an IB-signed return address (strip is a no-op on unsigned addresses) + uintptr_t stripped_lr = (uintptr_t)ptrauth_strip((void *)ts64->lr, + ptrauth_key_return_address); + if (ts64->lr == stripped_lr) { + // Don't allow unsigned pointer to be passed through as is. Ignore flag and + // treat as IA-signed below (where auth failure may poison the value). + ts64->flags &= ~__DARWIN_ARM_THREAD_STATE64_FLAGS_IB_SIGNED_LR; + } + // Note that an IB-signed return address that happens to have a 0 signature value + // will also have been IA-signed (without this flag being set) and so will IA-auth + // correctly below. + } + + if (BootArgs->bootFlags & kBootFlagsDisableUserJOP) { + return KERN_SUCCESS; + } + + if (ts64->pc) { + ts64->pc = (uintptr_t)pmap_auth_user_ptr((void*)ts64->pc, + ptrauth_key_process_independent_code, ptrauth_string_discriminator("pc")); + } + if (ts64->lr && !(ts64->flags & __DARWIN_ARM_THREAD_STATE64_FLAGS_IB_SIGNED_LR)) { + ts64->lr = (uintptr_t)pmap_auth_user_ptr((void*)ts64->lr, + ptrauth_key_process_independent_code, ptrauth_string_discriminator("lr")); + } + if (ts64->sp) { + ts64->sp = (uintptr_t)pmap_auth_user_ptr((void*)ts64->sp, + ptrauth_key_process_independent_data, ptrauth_string_discriminator("sp")); + } + if (ts64->fp) { + ts64->fp = (uintptr_t)pmap_auth_user_ptr((void*)ts64->fp, + ptrauth_key_process_independent_data, ptrauth_string_discriminator("fp")); + } + + return KERN_SUCCESS; +#else // No conversion from userspace representation on this platform (void)thread; (void)flavor; (void)tstate; (void)count; return KERN_SUCCESS; +#endif /* __has_feature(ptrauth_calls) */ } /* @@ -303,9 +449,27 @@ machine_thread_siguctx_pointer_convert_to_user( __assert_only thread_t thread, user_addr_t *uctxp) { +#if __has_feature(ptrauth_calls) + if (current_thread()->machine.disable_user_jop || !thread_is_64bit_addr(current_thread())) { + assert(thread->machine.disable_user_jop || !thread_is_64bit_addr(thread)); + return KERN_SUCCESS; + } + + if (BootArgs->bootFlags & kBootFlagsDisableUserJOP) { + return KERN_SUCCESS; + } + + if (*uctxp) { + *uctxp = (uintptr_t)pmap_sign_user_ptr((void*)*uctxp, + ptrauth_key_process_independent_data, ptrauth_string_discriminator("uctx")); + } + + return KERN_SUCCESS; +#else // No conversion to userspace representation on this platform (void)thread; (void)uctxp; return KERN_SUCCESS; +#endif /* __has_feature(ptrauth_calls) */ } /* @@ -318,21 +482,41 @@ machine_thread_function_pointers_convert_from_user( user_addr_t *fptrs, uint32_t count) { +#if __has_feature(ptrauth_calls) + if (current_thread()->machine.disable_user_jop || !thread_is_64bit_addr(current_thread())) { + assert(thread->machine.disable_user_jop || !thread_is_64bit_addr(thread)); + return KERN_SUCCESS; + } + + if (BootArgs->bootFlags & kBootFlagsDisableUserJOP) { + return KERN_SUCCESS; + } + + while (count--) { + if (*fptrs) { + *fptrs = (uintptr_t)pmap_auth_user_ptr((void*)*fptrs, + ptrauth_key_function_pointer, 0); + } + fptrs++; + } + + return KERN_SUCCESS; +#else // No conversion from userspace representation on this platform (void)thread; (void)fptrs; (void)count; return KERN_SUCCESS; +#endif /* __has_feature(ptrauth_calls) */ } /* - * Routine: machine_thread_get_state + * Routine: machine_thread_get_state * */ kern_return_t -machine_thread_get_state( - thread_t thread, - thread_flavor_t flavor, - thread_state_t tstate, - mach_msg_type_number_t * count) +machine_thread_get_state(thread_t thread, + thread_flavor_t flavor, + thread_state_t tstate, + mach_msg_type_number_t * count) { switch (flavor) { case THREAD_STATE_FLAVOR_LIST: @@ -359,6 +543,19 @@ machine_thread_get_state( *count = 4; break; + case THREAD_STATE_FLAVOR_LIST_10_15: + if (*count < 5) { + return KERN_INVALID_ARGUMENT; + } + + tstate[0] = ARM_THREAD_STATE; + tstate[1] = ARM_VFP_STATE; + tstate[2] = thread_is_64bit_data(thread) ? ARM_EXCEPTION_STATE64 : ARM_EXCEPTION_STATE; + tstate[3] = thread_is_64bit_data(thread) ? ARM_DEBUG_STATE64 : ARM_DEBUG_STATE32; + tstate[4] = ARM_PAGEIN_STATE; + *count = 5; + break; + case ARM_THREAD_STATE: { kern_return_t rn = handle_get_arm_thread_state(tstate, count, thread->machine.upcb); @@ -512,7 +709,7 @@ machine_thread_get_state( case ARM_VFP_STATE:{ struct arm_vfp_state *state; arm_neon_saved_state32_t *thread_state; - unsigned int max; + unsigned int max; if (*count < ARM_VFP_STATE_COUNT) { if (*count < ARM_VFPV2_STATE_COUNT) { @@ -581,6 +778,22 @@ machine_thread_get_state( break; } + + case ARM_PAGEIN_STATE: { + arm_pagein_state_t *state; + + if (*count < ARM_PAGEIN_STATE_COUNT) { + return KERN_INVALID_ARGUMENT; + } + + state = (arm_pagein_state_t *)tstate; + state->__pagein_error = thread->t_pagein_error; + + *count = ARM_PAGEIN_STATE_COUNT; + break; + } + + default: return KERN_INVALID_ARGUMENT; } @@ -589,15 +802,14 @@ machine_thread_get_state( /* - * Routine: machine_thread_get_kern_state + * Routine: machine_thread_get_kern_state * */ kern_return_t -machine_thread_get_kern_state( - thread_t thread, - thread_flavor_t flavor, - thread_state_t tstate, - mach_msg_type_number_t * count) +machine_thread_get_kern_state(thread_t thread, + thread_flavor_t flavor, + thread_state_t tstate, + mach_msg_type_number_t * count) { /* * This works only for an interrupted kernel thread @@ -670,15 +882,14 @@ machine_thread_switch_addrmode(thread_t thread) extern long long arm_debug_get(void); /* - * Routine: machine_thread_set_state + * Routine: machine_thread_set_state * */ kern_return_t -machine_thread_set_state( - thread_t thread, - thread_flavor_t flavor, - thread_state_t tstate, - mach_msg_type_number_t count) +machine_thread_set_state(thread_t thread, + thread_flavor_t flavor, + thread_state_t tstate, + mach_msg_type_number_t count) { kern_return_t rn; @@ -762,7 +973,6 @@ machine_thread_set_state( } } - if (!enabled) { arm_debug_state32_t *thread_state = find_debug_state32(thread); if (thread_state != NULL) { @@ -902,7 +1112,7 @@ machine_thread_set_state( { arm_debug_state64_t *state; boolean_t enabled = FALSE; - unsigned int i; + unsigned int i; if (count != ARM_DEBUG_STATE64_COUNT) { return KERN_INVALID_ARGUMENT; @@ -957,7 +1167,7 @@ machine_thread_set_state( for (i = 0; i < 16; i++) { /* set appropriate privilege; mask out unknown bits */ thread_state->bcr[i] = (state->bcr[i] & (0 /* Was ARM_DBG_CR_ADDRESS_MASK_MASK deprecated in v8 */ - | 0 /* Was ARM_DBGBCR_MATCH_MASK, ignored in AArch64 state */ + | 0 /* Was ARM_DBGBCR_MATCH_MASK, ignored in AArch64 state */ | ARM_DBG_CR_BYTE_ADDRESS_SELECT_MASK | ARM_DBG_CR_ENABLE_MASK)) | ARM_DBGBCR_TYPE_IVA @@ -1055,19 +1265,32 @@ machine_thread_set_state( break; } + default: return KERN_INVALID_ARGUMENT; } return KERN_SUCCESS; } +mach_vm_address_t +machine_thread_pc(thread_t thread) +{ + struct arm_saved_state *ss = get_user_regs(thread); + return (mach_vm_address_t)get_saved_state_pc(ss); +} + +void +machine_thread_reset_pc(thread_t thread, mach_vm_address_t pc) +{ + set_saved_state_pc(get_user_regs(thread), (register_t)pc); +} + /* - * Routine: machine_thread_state_initialize + * Routine: machine_thread_state_initialize * */ kern_return_t -machine_thread_state_initialize( - thread_t thread) +machine_thread_state_initialize(thread_t thread) { arm_context_t *context = thread->machine.contextData; @@ -1090,19 +1313,24 @@ machine_thread_state_initialize( thread->machine.DebugData = NULL; +#if defined(HAS_APPLE_PAC) + /* Sign the initial user-space thread state */ + if (thread->machine.upcb != NULL) { + ml_sign_thread_state(thread->machine.upcb, 0, 0, 0, 0, 0); + } +#endif /* defined(HAS_APPLE_PAC) */ return KERN_SUCCESS; } /* - * Routine: machine_thread_dup + * Routine: machine_thread_dup * */ kern_return_t -machine_thread_dup( - thread_t self, - thread_t target, - __unused boolean_t is_corpse) +machine_thread_dup(thread_t self, + thread_t target, + __unused boolean_t is_corpse) { struct arm_saved_state *self_saved_state; struct arm_saved_state *target_saved_state; @@ -1113,46 +1341,47 @@ machine_thread_dup( self_saved_state = self->machine.upcb; target_saved_state = target->machine.upcb; bcopy(self_saved_state, target_saved_state, sizeof(struct arm_saved_state)); +#if defined(HAS_APPLE_PAC) + if (!is_corpse && is_saved_state64(self_saved_state)) { + check_and_sign_copied_thread_state(target_saved_state, self_saved_state); + } +#endif /* defined(HAS_APPLE_PAC) */ return KERN_SUCCESS; } /* - * Routine: get_user_regs + * Routine: get_user_regs * */ struct arm_saved_state * -get_user_regs( - thread_t thread) +get_user_regs(thread_t thread) { return thread->machine.upcb; } arm_neon_saved_state_t * -get_user_neon_regs( - thread_t thread) +get_user_neon_regs(thread_t thread) { return thread->machine.uNeon; } /* - * Routine: find_user_regs + * Routine: find_user_regs * */ struct arm_saved_state * -find_user_regs( - thread_t thread) +find_user_regs(thread_t thread) { return thread->machine.upcb; } /* - * Routine: find_kern_regs + * Routine: find_kern_regs * */ struct arm_saved_state * -find_kern_regs( - thread_t thread) +find_kern_regs(thread_t thread) { /* * This works only for an interrupted kernel thread @@ -1165,8 +1394,7 @@ find_kern_regs( } arm_debug_state32_t * -find_debug_state32( - thread_t thread) +find_debug_state32(thread_t thread) { if (thread && thread->machine.DebugData) { return &(thread->machine.DebugData->uds.ds32); @@ -1176,8 +1404,7 @@ find_debug_state32( } arm_debug_state64_t * -find_debug_state64( - thread_t thread) +find_debug_state64(thread_t thread) { if (thread && thread->machine.DebugData) { return &(thread->machine.DebugData->uds.ds64); @@ -1187,19 +1414,18 @@ find_debug_state64( } /* - * Routine: thread_userstack + * Routine: thread_userstack * */ kern_return_t -thread_userstack( - __unused thread_t thread, - int flavor, - thread_state_t tstate, - unsigned int count, - mach_vm_offset_t * user_stack, - int *customstack, - boolean_t is_64bit_data - ) +thread_userstack(__unused thread_t thread, + int flavor, + thread_state_t tstate, + unsigned int count, + mach_vm_offset_t * user_stack, + int * customstack, + boolean_t is_64bit_data + ) { register_t sp; @@ -1267,9 +1493,8 @@ thread_userstack( * thread, if otherwise unknown. */ kern_return_t -thread_userstackdefault( - mach_vm_offset_t *default_user_stack, - boolean_t is64bit) +thread_userstackdefault(mach_vm_offset_t * default_user_stack, + boolean_t is64bit) { if (is64bit) { *default_user_stack = USRSTACK64; @@ -1281,11 +1506,12 @@ thread_userstackdefault( } /* - * Routine: thread_setuserstack + * Routine: thread_setuserstack * */ void -thread_setuserstack(thread_t thread, mach_vm_address_t user_stack) +thread_setuserstack(thread_t thread, + mach_vm_address_t user_stack) { struct arm_saved_state *sv; @@ -1297,11 +1523,12 @@ thread_setuserstack(thread_t thread, mach_vm_address_t user_stack) } /* - * Routine: thread_adjuserstack + * Routine: thread_adjuserstack * */ uint64_t -thread_adjuserstack(thread_t thread, int adjust) +thread_adjuserstack(thread_t thread, + int adjust) { struct arm_saved_state *sv; uint64_t sp; @@ -1316,11 +1543,12 @@ thread_adjuserstack(thread_t thread, int adjust) } /* - * Routine: thread_setentrypoint + * Routine: thread_setentrypoint * */ void -thread_setentrypoint(thread_t thread, mach_vm_offset_t entry) +thread_setentrypoint(thread_t thread, + mach_vm_offset_t entry) { struct arm_saved_state *sv; @@ -1332,17 +1560,16 @@ thread_setentrypoint(thread_t thread, mach_vm_offset_t entry) } /* - * Routine: thread_entrypoint + * Routine: thread_entrypoint * */ kern_return_t -thread_entrypoint( - __unused thread_t thread, - int flavor, - thread_state_t tstate, - unsigned int count __unused, - mach_vm_offset_t * entry_point - ) +thread_entrypoint(__unused thread_t thread, + int flavor, + thread_state_t tstate, + unsigned int count __unused, + mach_vm_offset_t * entry_point + ) { switch (flavor) { case ARM_THREAD_STATE: @@ -1388,13 +1615,12 @@ thread_entrypoint( /* - * Routine: thread_set_child + * Routine: thread_set_child * */ void -thread_set_child( - thread_t child, - int pid) +thread_set_child(thread_t child, + int pid) { struct arm_saved_state *child_state; @@ -1406,13 +1632,12 @@ thread_set_child( /* - * Routine: thread_set_parent + * Routine: thread_set_parent * */ void -thread_set_parent( - thread_t parent, - int pid) +thread_set_parent(thread_t parent, + int pid) { struct arm_saved_state *parent_state; @@ -1431,10 +1656,10 @@ struct arm_act_context { }; /* - * Routine: act_thread_csave + * Routine: act_thread_csave * */ -void * +void * act_thread_csave(void) { struct arm_act_context *ic; @@ -1459,13 +1684,13 @@ act_thread_csave(void) val = ARM_NEON_STATE64_COUNT; kret = machine_thread_get_state(thread, ARM_NEON_STATE64, - (thread_state_t) &ic->ns, + (thread_state_t)&ic->ns, &val); } else { val = ARM_NEON_STATE_COUNT; kret = machine_thread_get_state(thread, ARM_NEON_STATE, - (thread_state_t) &ic->ns, + (thread_state_t)&ic->ns, &val); } if (kret != KERN_SUCCESS) { @@ -1477,11 +1702,11 @@ act_thread_csave(void) } /* - * Routine: act_thread_catt + * Routine: act_thread_catt * */ void -act_thread_catt(void *ctx) +act_thread_catt(void * ctx) { struct arm_act_context *ic; kern_return_t kret; @@ -1501,12 +1726,12 @@ act_thread_catt(void *ctx) if (thread_is_64bit_data(thread)) { kret = machine_thread_set_state(thread, ARM_NEON_STATE64, - (thread_state_t) &ic->ns, + (thread_state_t)&ic->ns, ARM_NEON_STATE64_COUNT); } else { kret = machine_thread_set_state(thread, ARM_NEON_STATE, - (thread_state_t) &ic->ns, + (thread_state_t)&ic->ns, ARM_NEON_STATE_COUNT); } if (kret != KERN_SUCCESS) { @@ -1518,7 +1743,7 @@ out: } /* - * Routine: act_thread_catt + * Routine: act_thread_catt * */ void @@ -1528,7 +1753,8 @@ act_thread_cfree(void *ctx) } kern_return_t -thread_set_wq_state32(thread_t thread, thread_state_t tstate) +thread_set_wq_state32(thread_t thread, + thread_state_t tstate) { arm_thread_state_t *state; struct arm_saved_state *saved_state; @@ -1565,7 +1791,8 @@ thread_set_wq_state32(thread_t thread, thread_state_t tstate) } kern_return_t -thread_set_wq_state64(thread_t thread, thread_state_t tstate) +thread_set_wq_state64(thread_t thread, + thread_state_t tstate) { arm_thread_state64_t *state; struct arm_saved_state *saved_state; diff --git a/osfmk/arm64/tlb.h b/osfmk/arm64/tlb.h new file mode 100644 index 000000000..eb1face77 --- /dev/null +++ b/osfmk/arm64/tlb.h @@ -0,0 +1,330 @@ +/* + * Copyright (c) 2019 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#pragma once + +#include +#include + +#define tlbi_addr(x) ((((x) >> 12) & TLBI_ADDR_MASK) << TLBI_ADDR_SHIFT) +#define tlbi_asid(x) (((uintptr_t)(x) & TLBI_ASID_MASK) << TLBI_ASID_SHIFT) + +#if __ARM_KERNEL_PROTECT__ +/* + * __ARM_KERNEL_PROTECT__ adds two complications to TLB management: + * + * 1. As each pmap has two ASIDs, every TLB operation that targets an ASID must + * target both ASIDs for the pmap that owns the target ASID. + * + * 2. Any TLB operation targeting the kernel_pmap ASID (ASID 0) must target all + * ASIDs (as kernel_pmap mappings may be referenced while using an ASID that + * belongs to another pmap). We expect these routines to be called with the + * EL0 ASID for the target; not the EL1 ASID. + */ +#endif /* __ARM_KERNEL_PROTECT__ */ + +static inline void +sync_tlb_flush(void) +{ + __builtin_arm_dsb(DSB_ISH); + __builtin_arm_isb(ISB_SY); +} + +// flush_mmu_tlb: full TLB flush on all cores +static inline void +flush_mmu_tlb_async(void) +{ + asm volatile ("tlbi vmalle1is"); +} + +static inline void +flush_mmu_tlb(void) +{ + flush_mmu_tlb_async(); + sync_tlb_flush(); +} + +// flush_core_tlb: full TLB flush on local core only +static inline void +flush_core_tlb_async(void) +{ + asm volatile ("tlbi vmalle1"); +} + +static inline void +flush_core_tlb(void) +{ + flush_core_tlb_async(); + sync_tlb_flush(); +} + +// flush_mmu_tlb_allentries_async: flush entries that map VA range, all ASIDS, all cores +// start and end are in units of 4K pages. +static inline void +flush_mmu_tlb_allentries_async(uint64_t start, uint64_t end) +{ +#if __ARM_16K_PG__ + start = start & ~0x3ULL; + + /* + * The code below is not necessarily correct. From an overview of + * the client code, the expected contract for TLB flushes is that + * we will expand from an "address, length" pair to "start address, + * end address" in the course of a TLB flush. This suggests that + * a flush for "X, X+4" is actually only asking for a flush of a + * single 16KB page. At the same time, we'd like to be prepared + * for bad inputs (X, X+3), so add 3 and then truncate the 4KB page + * number to a 16KB page boundary. This should deal correctly with + * unaligned inputs. + * + * If our expecations about client behavior are wrong however, this + * will lead to occasional TLB corruption on platforms with 16KB + * pages. + */ + end = (end + 0x3ULL) & ~0x3ULL; +#endif // __ARM_16K_PG__ + for (; start < end; start += (ARM_PGBYTES / 4096)) { + asm volatile ("tlbi vaae1is, %0" : : "r"(start)); + } +} + +static inline void +flush_mmu_tlb_allentries(uint64_t start, uint64_t end) +{ + flush_mmu_tlb_allentries_async(start, end); + sync_tlb_flush(); +} + +// flush_mmu_tlb_entry: flush TLB entries that map a VA and ASID, all cores +// Will also flush global entries that match the VA +static inline void +flush_mmu_tlb_entry_async(uint64_t val) +{ +#if __ARM_KERNEL_PROTECT__ + uint64_t asid = val >> TLBI_ASID_SHIFT; + if (asid == 0) { + asm volatile ("tlbi vaae1is, %0" : : "r"(val)); + return; + } + val = val & ~(1ULL << TLBI_ASID_SHIFT); + asm volatile ("tlbi vae1is, %0" : : "r"(val)); + val = val | (1ULL << TLBI_ASID_SHIFT); +#endif /* __ARM_KERNEL_PROTECT__ */ + asm volatile ("tlbi vae1is, %0" : : "r"(val)); +} + +static inline void +flush_mmu_tlb_entry(uint64_t val) +{ + flush_mmu_tlb_entry_async(val); + sync_tlb_flush(); +} + +// flush_mmu_tlb_entries: flush TLB entries that map a VA range and ASID, all cores +// start and end must have the ASID in the high 16 bits, with the VA in units of 4K in the lowest bits +// Will also flush global entries that match the VA range +static inline void +flush_mmu_tlb_entries_async(uint64_t start, uint64_t end) +{ +#if __ARM_16K_PG__ + start = start & ~0x3ULL; + + /* + * The code below is not necessarily correct. From an overview of + * the client code, the expected contract for TLB flushes is that + * we will expand from an "address, length" pair to "start address, + * end address" in the course of a TLB flush. This suggests that + * a flush for "X, X+4" is actually only asking for a flush of a + * single 16KB page. At the same time, we'd like to be prepared + * for bad inputs (X, X+3), so add 3 and then truncate the 4KB page + * number to a 16KB page boundary. This should deal correctly with + * unaligned inputs. + * + * If our expecations about client behavior are wrong however, this + * will lead to occasional TLB corruption on platforms with 16KB + * pages. + */ + end = (end + 0x3ULL) & ~0x3ULL; +#endif // __ARM_16K_PG__ +#if __ARM_KERNEL_PROTECT__ + uint64_t asid = start >> TLBI_ASID_SHIFT; + /* + * If we are flushing ASID 0, this is a kernel operation. With this + * ASID scheme, this means we should flush all ASIDs. + */ + if (asid == 0) { + for (; start < end; start += (ARM_PGBYTES / 4096)) { + asm volatile ("tlbi vaae1is, %0" : : "r"(start)); + } + return; + } + start = start | (1ULL << TLBI_ASID_SHIFT); + end = end | (1ULL << TLBI_ASID_SHIFT); + for (; start < end; start += (ARM_PGBYTES / 4096)) { + start = start & ~(1ULL << TLBI_ASID_SHIFT); + asm volatile ("tlbi vae1is, %0" : : "r"(start)); + start = start | (1ULL << TLBI_ASID_SHIFT); + asm volatile ("tlbi vae1is, %0" : : "r"(start)); + } +#else + for (; start < end; start += (ARM_PGBYTES / 4096)) { + asm volatile ("tlbi vae1is, %0" : : "r"(start)); + } +#endif /* __ARM_KERNEL_PROTECT__ */ +} + +static inline void +flush_mmu_tlb_entries(uint64_t start, uint64_t end) +{ + flush_mmu_tlb_entries_async(start, end); + sync_tlb_flush(); +} + +// flush_mmu_tlb_asid: flush all entries that match an ASID, on all cores +// ASID must be in high 16 bits of argument +// Will not flush global entries +static inline void +flush_mmu_tlb_asid_async(uint64_t val) +{ +#if __ARM_KERNEL_PROTECT__ + /* + * If we are flushing ASID 0, this is a kernel operation. With this + * ASID scheme, this means we should flush all ASIDs. + */ + uint64_t asid = val >> TLBI_ASID_SHIFT; + if (asid == 0) { + asm volatile ("tlbi vmalle1is"); + return; + } + val = val & ~(1ULL << TLBI_ASID_SHIFT); + asm volatile ("tlbi aside1is, %0" : : "r"(val)); + val = val | (1ULL << TLBI_ASID_SHIFT); +#endif /* __ARM_KERNEL_PROTECT__ */ + asm volatile ("tlbi aside1is, %0" : : "r"(val)); +} + +static inline void +flush_mmu_tlb_asid(uint64_t val) +{ + flush_mmu_tlb_asid_async(val); + sync_tlb_flush(); +} + +// flush_core_tlb_asid: flush all entries that match an ASID, local core only +// ASID must be in high 16 bits of argument +// Will not flush global entries +static inline void +flush_core_tlb_asid_async(uint64_t val) +{ +#if __ARM_KERNEL_PROTECT__ + /* + * If we are flushing ASID 0, this is a kernel operation. With this + * ASID scheme, this means we should flush all ASIDs. + */ + uint64_t asid = val >> TLBI_ASID_SHIFT; + if (asid == 0) { + asm volatile ("tlbi vmalle1"); + return; + } + val = val & ~(1ULL << TLBI_ASID_SHIFT); + asm volatile ("tlbi aside1, %0" : : "r"(val)); + val = val | (1ULL << TLBI_ASID_SHIFT); +#endif /* __ARM_KERNEL_PROTECT__ */ + asm volatile ("tlbi aside1, %0" : : "r"(val)); +} + +static inline void +flush_core_tlb_asid(uint64_t val) +{ + flush_core_tlb_asid_async(val); + sync_tlb_flush(); +} + +#if __ARM_RANGE_TLBI__ +#if __ARM_KERNEL_PROTECT__ + #error __ARM_RANGE_TLBI__ + __ARM_KERNEL_PROTECT__ is not currently supported +#endif + +#define ARM64_16K_TLB_RANGE_PAGES (1ULL << 21) +#define rtlbi_addr(x) (((x) >> RTLBI_ADDR_SHIFT) & RTLBI_ADDR_MASK) +#define rtlbi_scale(x) ((uint64_t)(x) << RTLBI_SCALE_SHIFT) +#define rtlbi_num(x) ((uint64_t)(x) << RTLBI_NUM_SHIFT) + +/** + * Given the number of pages to invalidate, generate the correct parameter to + * pass to any of the TLBI by range methods. + */ +static inline uint64_t +generate_rtlbi_param(ppnum_t npages, uint32_t asid, vm_offset_t va) +{ + /** + * Per the armv8.4 RTLBI extension spec, the range encoded in the rtlbi register operand is defined by: + * BaseADDR <= VA < BaseADDR+((NUM+1)*2^(5*SCALE+1) * Translation_Granule_Size) + */ + unsigned order = (sizeof(npages) * 8) - __builtin_clz(npages - 1) - 1; + unsigned scale = ((order ? order : 1) - 1) / 5; + unsigned granule = 1 << ((5 * scale) + 1); + unsigned num = (((npages + granule - 1) & ~(granule - 1)) / granule) - 1; + return tlbi_asid(asid) | RTLBI_TG | rtlbi_scale(scale) | rtlbi_num(num) | rtlbi_addr(va); +} + +// flush_mmu_tlb_range: flush TLB entries that map a VA range using a single instruction +// The argument should be encoded according to generate_rtlbi_param(). +// Follows the same ASID matching behavior as flush_mmu_tlb_entries() +static inline void +flush_mmu_tlb_range_async(uint64_t val) +{ + asm volatile ("tlbi rvae1is, %0" : : "r"(val)); +} + +static inline void +flush_mmu_tlb_range(uint64_t val) +{ + flush_mmu_tlb_range_async(val); + sync_tlb_flush(); +} + +// flush_mmu_tlb_allrange: flush TLB entries that map a VA range using a single instruction +// The argument should be encoded according to generate_rtlbi_param(). +// Follows the same ASID matching behavior as flush_mmu_tlb_allentries() +static inline void +flush_mmu_tlb_allrange_async(uint64_t val) +{ + asm volatile ("tlbi rvaae1is, %0" : : "r"(val)); +} + +static inline void +flush_mmu_tlb_allrange(uint64_t val) +{ + flush_mmu_tlb_allrange_async(val); + sync_tlb_flush(); +} + +#endif // __ARM_RANGE_TLBI__ + + diff --git a/osfmk/atm/Makefile b/osfmk/atm/Makefile index aa1f67f54..88863e3b2 100644 --- a/osfmk/atm/Makefile +++ b/osfmk/atm/Makefile @@ -54,7 +54,7 @@ ${MIGINCLUDES} : ${MIG_TYPES} ${MIG_UUHDRS} : \ %.h : %.defs - @echo "$(ColorM)MIG$(Color0) $(ColorF)$@$(Color0)" + $(call makelog,$(ColorM)MIG$(Color0) $(ColorF)$@$(Color0)) $(_v)$(MIG) $(MIGFLAGS) \ -server /dev/null \ -user /dev/null \ @@ -63,7 +63,7 @@ ${MIG_UUHDRS} : \ ${MIG_USHDRS} : \ %_server.h : %.defs - @echo "$(ColorM)MIG$(Color0) $(ColorF)$@$(Color0)" + $(call makelog,$(ColorM)MIG$(Color0) $(ColorF)$@$(Color0)) $(_v)$(MIG) $(MIGFLAGS) \ -server /dev/null \ -user /dev/null \ @@ -101,7 +101,7 @@ ${COMP_FILES} : ${MIG_TYPES} ${MIG_KUSRC} : \ %_user.c : %.defs - @echo "$(ColorM)MIG$(Color0) $(ColorF)$@$(Color0)" + $(call makelog,$(ColorM)MIG$(Color0) $(ColorF)$@$(Color0)) $(_v)${MIG} ${MIGFLAGS} ${MIGKUFLAGS} \ -user $*_user.c \ -header $*.h \ @@ -111,7 +111,7 @@ ${MIG_KUSRC} : \ ${MIG_KSSRC}: \ %_server.c : %.defs - @echo "$(ColorM)MIG$(Color0) $(ColorF)$@$(Color0)" + $(call makelog,$(ColorM)MIG$(Color0) $(ColorF)$@$(Color0)) $(_v)${MIG} ${MIGFLAGS} ${MIGKSFLAGS} \ -user /dev/null \ -header /dev/null \ diff --git a/osfmk/atm/atm.c b/osfmk/atm/atm.c index 69fc9f738..ff1111080 100644 --- a/osfmk/atm/atm.c +++ b/osfmk/atm/atm.c @@ -76,16 +76,13 @@ static void atm_hash_table_init(void); static kern_return_t atm_value_hash_table_insert(atm_value_t new_atm_value); static void atm_value_hash_table_delete(atm_value_t atm_value); static atm_value_t get_atm_value_from_aid(aid_t aid) __unused; -static void atm_value_get_ref(atm_value_t atm_value); static kern_return_t atm_listener_insert(atm_value_t atm_value, atm_task_descriptor_t task_descriptor, atm_guard_t guard); static void atm_listener_delete_all(atm_value_t atm_value); static atm_task_descriptor_t atm_task_descriptor_alloc_init(mach_port_t trace_buffer, uint64_t buffer_size, __assert_only task_t task); -static void atm_descriptor_get_reference(atm_task_descriptor_t task_descriptor); static void atm_task_descriptor_dealloc(atm_task_descriptor_t task_descriptor); static kern_return_t atm_value_unregister(atm_value_t atm_value, atm_task_descriptor_t task_descriptor, atm_guard_t guard); static kern_return_t atm_value_register(atm_value_t atm_value, atm_task_descriptor_t task_descriptor, atm_guard_t guard); static kern_return_t atm_listener_delete(atm_value_t atm_value, atm_task_descriptor_t task_descriptor, atm_guard_t guard); -static void atm_link_get_reference(atm_link_object_t link_object) __unused; static void atm_link_dealloc(atm_link_object_t link_object); kern_return_t @@ -136,7 +133,7 @@ atm_release(ipc_voucher_attr_manager_t __assert_only manager); /* * communication channel from voucher system to ATM */ -struct ipc_voucher_attr_manager atm_manager = { +const struct ipc_voucher_attr_manager atm_manager = { .ivam_release_value = atm_release_value, .ivam_get_value = atm_get_value, .ivam_extract_content = atm_extract_content, @@ -609,7 +606,7 @@ atm_value_alloc_init(aid_t aid) queue_init(&new_atm_value->listeners); new_atm_value->sync = 1; new_atm_value->listener_count = 0; - new_atm_value->reference_count = 1; + os_ref_init(&new_atm_value->reference_count, NULL); lck_mtx_init(&new_atm_value->listener_lock, &atm_lock_grp, &atm_lock_attr); #if DEVELOPMENT || DEBUG @@ -658,24 +655,19 @@ get_subaid() static void atm_value_dealloc(atm_value_t atm_value) { - if (0 < atm_value_release_internal(atm_value)) { - return; - } - - assert(atm_value->reference_count == 0); - - /* Free up the atm value and also remove all the listeners. */ - atm_listener_delete_all(atm_value); + if (os_ref_release(&atm_value->reference_count) == 0) { + /* Free up the atm value and also remove all the listeners. */ + atm_listener_delete_all(atm_value); - lck_mtx_destroy(&atm_value->listener_lock, &atm_lock_grp); + lck_mtx_destroy(&atm_value->listener_lock, &atm_lock_grp); #if DEVELOPMENT || DEBUG - lck_mtx_lock(&atm_values_list_lock); - queue_remove(&atm_values_list, atm_value, atm_value_t, value_elt); - lck_mtx_unlock(&atm_values_list_lock); + lck_mtx_lock(&atm_values_list_lock); + queue_remove(&atm_values_list, atm_value, atm_value_t, value_elt); + lck_mtx_unlock(&atm_values_list_lock); #endif - zfree(atm_value_zone, atm_value); - return; + zfree(atm_value_zone, atm_value); + } } @@ -780,7 +772,7 @@ get_atm_value_from_aid(aid_t aid) * Aid found. Incerease ref count and return * the atm value structure. */ - atm_value_get_ref(next); + os_ref_retain(&next->reference_count); lck_mtx_unlock(&hash_list_head->hash_list_lock); return next; } @@ -790,18 +782,6 @@ get_atm_value_from_aid(aid_t aid) } -/* - * Routine: atm_value_get_ref - * Purpose: Get a reference on atm value. - * Returns: None. - */ -static void -atm_value_get_ref(atm_value_t atm_value) -{ - atm_value_reference_internal(atm_value); -} - - /* * Routine: atm_listener_insert * Purpose: Insert a listener to an atm value. @@ -822,11 +802,11 @@ atm_listener_insert( new_link_object = (atm_link_object_t) zalloc(atm_link_objects_zone); new_link_object->descriptor = task_descriptor; - new_link_object->reference_count = 1; + os_ref_init(&new_link_object->reference_count, NULL); new_link_object->guard = guard; /* Get a reference on the task descriptor */ - atm_descriptor_get_reference(task_descriptor); + os_ref_retain(&task_descriptor->reference_count); queue_init(&free_listeners); listener_count = atm_value->listener_count; @@ -857,7 +837,7 @@ atm_listener_insert( if (elem->descriptor == task_descriptor) { /* Increment reference count on Link object. */ - atm_link_get_reference(elem); + os_ref_retain(&elem->reference_count); /* Replace the guard with the new one, the old guard is anyways on unregister path. */ elem->guard = guard; @@ -945,16 +925,16 @@ atm_listener_delete( if (elem->guard == guard) { KERNEL_DEBUG_CONSTANT((ATM_CODE(ATM_UNREGISTER_INFO, (ATM_VALUE_UNREGISTERED))) | DBG_FUNC_NONE, - VM_KERNEL_ADDRPERM(atm_value), atm_value->aid, guard, elem->reference_count, 0); + VM_KERNEL_ADDRPERM(atm_value), atm_value->aid, guard, os_ref_get_count(&elem->reference_count), 0); elem->guard = 0; kr = KERN_SUCCESS; } else { KERNEL_DEBUG_CONSTANT((ATM_CODE(ATM_UNREGISTER_INFO, (ATM_VALUE_DIFF_MAILBOX))) | DBG_FUNC_NONE, - VM_KERNEL_ADDRPERM(atm_value), atm_value->aid, elem->guard, elem->reference_count, 0); + VM_KERNEL_ADDRPERM(atm_value), atm_value->aid, elem->guard, os_ref_get_count(&elem->reference_count), 0); kr = KERN_INVALID_VALUE; } - if (0 == atm_link_object_release_internal(elem)) { + if (os_ref_release(&elem->reference_count) == 0) { queue_remove(&atm_value->listeners, elem, atm_link_object_t, listeners_element); queue_enter(&free_listeners, elem, atm_link_object_t, listeners_element); atm_listener_count_decr_internal(atm_value); @@ -992,7 +972,7 @@ atm_task_descriptor_alloc_init( new_task_descriptor->trace_buffer = trace_buffer; new_task_descriptor->trace_buffer_size = buffer_size; - new_task_descriptor->reference_count = 1; + os_ref_init(&new_task_descriptor->reference_count, NULL); new_task_descriptor->flags = 0; lck_mtx_init(&new_task_descriptor->lock, &atm_lock_grp, &atm_lock_attr); @@ -1007,18 +987,6 @@ atm_task_descriptor_alloc_init( } -/* - * Routine: atm_descriptor_get_reference - * Purpose: Get a reference count on task descriptor. - * Returns: None. - */ -static void -atm_descriptor_get_reference(atm_task_descriptor_t task_descriptor) -{ - atm_task_desc_reference_internal(task_descriptor); -} - - /* * Routine: atm_task_descriptor_dealloc * Prupose: Drops the reference on atm descriptor. @@ -1027,34 +995,17 @@ atm_descriptor_get_reference(atm_task_descriptor_t task_descriptor) static void atm_task_descriptor_dealloc(atm_task_descriptor_t task_descriptor) { - if (0 < atm_task_desc_release_internal(task_descriptor)) { - return; - } - - assert(task_descriptor->reference_count == 0); - + if (os_ref_release(&task_descriptor->reference_count) == 0) { #if DEVELOPMENT || DEBUG - lck_mtx_lock(&atm_descriptors_list_lock); - queue_remove(&atm_descriptors_list, task_descriptor, atm_task_descriptor_t, descriptor_elt); - lck_mtx_unlock(&atm_descriptors_list_lock); + lck_mtx_lock(&atm_descriptors_list_lock); + queue_remove(&atm_descriptors_list, task_descriptor, atm_task_descriptor_t, descriptor_elt); + lck_mtx_unlock(&atm_descriptors_list_lock); #endif - /* release the send right for the named memory entry */ - ipc_port_release_send(task_descriptor->trace_buffer); - lck_mtx_destroy(&task_descriptor->lock, &atm_lock_grp); - zfree(atm_descriptors_zone, task_descriptor); - return; -} - - -/* - * Routine: atm_link_get_reference - * Purpose: Get a reference count on atm link object. - * Returns: None. - */ -static void -atm_link_get_reference(atm_link_object_t link_object) -{ - atm_link_object_reference_internal(link_object); + /* release the send right for the named memory entry */ + ipc_port_release_send(task_descriptor->trace_buffer); + lck_mtx_destroy(&task_descriptor->lock, &atm_lock_grp); + zfree(atm_descriptors_zone, task_descriptor); + } } diff --git a/osfmk/atm/atm_internal.h b/osfmk/atm/atm_internal.h index ea1cbce7c..a8a4aace6 100644 --- a/osfmk/atm/atm_internal.h +++ b/osfmk/atm/atm_internal.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2012-2013 Apple Inc. All rights reserved. + * Copyright (c) 2012-2019 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -32,6 +32,7 @@ #include #include #include +#include #ifdef MACH_KERNEL_PRIVATE @@ -49,10 +50,10 @@ typedef mach_voucher_attr_value_handle_t atm_voucher_id_t; struct atm_task_descriptor { - decl_lck_mtx_data(, lock) /* lock to protect reference count */ + decl_lck_mtx_data(, lock); /* lock to protect reference count */ mach_port_t trace_buffer; /* named memory entry registered by user */ uint64_t trace_buffer_size; /* size of the trace_buffer registered */ - uint32_t reference_count; + os_refcnt_t reference_count; uint8_t flags; #if DEVELOPMENT || DEBUG task_t task; /* task pointer for debugging purposes */ @@ -60,42 +61,31 @@ struct atm_task_descriptor { #endif }; -#define atm_task_desc_reference_internal(elem) \ - (hw_atomic_add(&(elem)->reference_count, 1)) - -#define atm_task_desc_release_internal(elem) \ - (hw_atomic_sub(&(elem)->reference_count, 1)) - typedef struct atm_task_descriptor *atm_task_descriptor_t; #define ATM_TASK_DESCRIPTOR_NULL NULL struct atm_value { aid_t aid; /* activity id */ queue_head_t listeners; /* List of listeners who register for this activity */ - decl_lck_mtx_data(, listener_lock) /* Lock to protect listener list */ + decl_lck_mtx_data(, listener_lock); /* Lock to protect listener list */ queue_chain_t vid_hash_elt; /* Next hash element in the global hash table */ #if DEVELOPMENT || DEBUG queue_chain_t value_elt; /* global chain of all values */ #endif uint32_t sync; /* Made ref count given to voucher sub system. */ - uint32_t listener_count; /* Number of Listerners listening on the value. */ - uint32_t reference_count; /* use count on the atm value, 1 taken by the global hash table */ -}; - -#define atm_value_reference_internal(elem) \ - (hw_atomic_add(&(elem)->reference_count, 1)) -#define atm_value_release_internal(elem) \ - (hw_atomic_sub(&(elem)->reference_count, 1)) + uint32_t listener_count; + os_refcnt_t reference_count; /* use count on the atm value, 1 taken by the global hash table */ +}; #define atm_listener_count_incr_internal(elem) \ - (hw_atomic_add(&(elem)->listener_count, 1)) + (os_atomic_inc(&(elem)->listener_count, relaxed)) #define atm_listener_count_decr_internal(elem) \ - (hw_atomic_sub(&(elem)->listener_count, 1)) + (os_atomic_dec(&(elem)->listener_count, relaxed)) #define atm_sync_reference_internal(elem) \ - (hw_atomic_add(&(elem)->sync, 1)) + (os_atomic_inc(&(elem)->sync, relaxed)) typedef struct atm_value *atm_value_t; #define ATM_VALUE_NULL NULL @@ -107,20 +97,14 @@ struct atm_link_object { atm_task_descriptor_t descriptor; queue_chain_t listeners_element; /* Head is atm_value->listeners. */ atm_guard_t guard; /* Guard registered by the user for an activity. */ - uint32_t reference_count; /* Refernece count for link object */ + os_refcnt_t reference_count; }; typedef struct atm_link_object *atm_link_object_t; -#define atm_link_object_reference_internal(elem) \ - (hw_atomic_add(&(elem)->reference_count, 1)) - -#define atm_link_object_release_internal(elem) \ - (hw_atomic_sub(&(elem)->reference_count, 1)) - struct atm_value_hash { queue_head_t hash_list; - decl_lck_mtx_data(, hash_list_lock) /* lock to protect bucket list. */ + decl_lck_mtx_data(, hash_list_lock); /* lock to protect bucket list. */ }; typedef struct atm_value_hash *atm_value_hash_t; diff --git a/osfmk/bank/Makefile b/osfmk/bank/Makefile index 2f4246c0f..27dee2fdb 100644 --- a/osfmk/bank/Makefile +++ b/osfmk/bank/Makefile @@ -52,7 +52,7 @@ ${MIGINCLUDES} : ${MIG_TYPES} ${MIG_UUHDRS} : \ %.h : %.defs - @echo "$(ColorM)MIG$(Color0) $(ColorF)$@$(Color0)" + $(call makelog,$(ColorM)MIG$(Color0) $(ColorF)$@$(Color0)) $(_v)$(MIG) $(MIGFLAGS) \ -server /dev/null \ -user /dev/null \ @@ -61,7 +61,7 @@ ${MIG_UUHDRS} : \ ${MIG_USHDRS} : \ %_server.h : %.defs - @echo "$(ColorM)MIG$(Color0) $(ColorF)$@$(Color0)" + $(call makelog,$(ColorM)MIG$(Color0) $(ColorF)$@$(Color0)) $(_v)$(MIG) $(MIGFLAGS) \ -server /dev/null \ -user /dev/null \ @@ -97,7 +97,7 @@ ${COMP_FILES} : ${MIG_TYPES} ${MIG_KUSRC} : \ %_user.c : %.defs - @echo "$(ColorM)MIG$(Color0) $(ColorF)$@$(Color0)" + $(call makelog,$(ColorM)MIG$(Color0) $(ColorF)$@$(Color0)) $(_v)${MIG} ${MIGFLAGS} ${MIGKUFLAGS} \ -user $*_user.c \ -header $*.h \ @@ -107,7 +107,7 @@ ${MIG_KUSRC} : \ ${MIG_KSSRC}: \ %_server.c : %.defs - @echo "$(ColorM)MIG$(Color0) $(ColorF)$@$(Color0)" + $(call makelog,$(ColorM)MIG$(Color0) $(ColorF)$@$(Color0)) $(_v)${MIG} ${MIGFLAGS} ${MIGKSFLAGS} \ -user /dev/null \ -header /dev/null \ diff --git a/osfmk/bank/bank.c b/osfmk/bank/bank.c index ef4d2977d..a281a029a 100644 --- a/osfmk/bank/bank.c +++ b/osfmk/bank/bank.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2012-2016 Apple Inc. All rights reserved. + * Copyright (c) 2012-2019 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -59,6 +59,10 @@ static zone_t bank_task_zone, bank_account_zone; #define CAST_TO_BANK_ACCOUNT(x) ((bank_account_t)((void *)(x))) ipc_voucher_attr_control_t bank_voucher_attr_control; /* communication channel from ATM to voucher system */ +struct persona; +extern struct persona *system_persona, *proxy_system_persona; +uint32_t persona_get_id(struct persona *persona); +extern int unique_persona; #if DEVELOPMENT || DEBUG queue_head_t bank_tasks_list; @@ -66,11 +70,11 @@ queue_head_t bank_accounts_list; #endif static ledger_template_t bank_ledger_template = NULL; -struct _bank_ledger_indices bank_ledgers = { -1, -1 }; +struct _bank_ledger_indices bank_ledgers = { .cpu_time = -1, .energy = -1 }; static bank_task_t bank_task_alloc_init(task_t task); static bank_account_t bank_account_alloc_init(bank_task_t bank_holder, bank_task_t bank_merchant, - bank_task_t bank_secureoriginator, bank_task_t bank_proximateprocess, struct thread_group* banktg); + bank_task_t bank_secureoriginator, bank_task_t bank_proximateprocess, struct thread_group* banktg, uint32_t persona_id); static bank_task_t get_bank_task_context(task_t task, boolean_t initialize); static void bank_task_dealloc(bank_task_t bank_task, mach_voucher_attr_value_reference_t sync); static kern_return_t bank_account_dealloc_with_sync(bank_account_t bank_account, mach_voucher_attr_value_reference_t sync); @@ -80,11 +84,15 @@ static ledger_t bank_get_bank_task_ledger_with_ref(bank_task_t bank_task); static void bank_destroy_bank_task_ledger(bank_task_t bank_task); static void init_bank_ledgers(void); static boolean_t bank_task_is_propagate_entitled(task_t t); +static boolean_t bank_task_is_persona_modify_entitled(task_t t); static struct thread_group *bank_get_bank_task_thread_group(bank_task_t bank_task __unused); static struct thread_group *bank_get_bank_account_thread_group(bank_account_t bank_account __unused); +static boolean_t bank_verify_persona_id(uint32_t persona_id); static lck_spin_t g_bank_task_lock_data; /* lock to protect task->bank_context transition */ +static uint32_t disable_persona_propogate_check = 0; + #define global_bank_task_lock_init() \ lck_spin_init(&g_bank_task_lock_data, &bank_lock_grp, &bank_lock_attr) #define global_bank_task_lock_destroy() \ @@ -105,7 +113,8 @@ extern uint32_t proc_getgid(void *p); extern void proc_getexecutableuuid(void *p, unsigned char *uuidbuf, unsigned long size); extern int kauth_cred_issuser(void *cred); extern void* kauth_cred_get(void); - +extern void* persona_lookup(uint32_t id); +extern void persona_put(void* persona); kern_return_t bank_release_value( @@ -155,7 +164,7 @@ bank_release(ipc_voucher_attr_manager_t __assert_only manager); /* * communication channel from voucher system to ATM */ -struct ipc_voucher_attr_manager bank_manager = { +const struct ipc_voucher_attr_manager bank_manager = { .ivam_release_value = bank_release_value, .ivam_get_value = bank_get_value, .ivam_extract_content = bank_extract_content, @@ -232,6 +241,15 @@ bank_init() panic("BANK subsystem initialization failed"); } + +#if DEVELOPMENT || DEBUG + uint32_t disable_persona_propogate_check_bootarg = 0; + if (PE_parse_boot_argn("disable_persona_propogate_check", &disable_persona_propogate_check_bootarg, + sizeof(disable_persona_propogate_check_bootarg))) { + disable_persona_propogate_check = (disable_persona_propogate_check_bootarg != 0) ? 1 : 0; + } +#endif + kprintf("BANK subsystem is initialized\n"); return; } @@ -303,6 +321,8 @@ bank_release_value( /* * Routine: bank_get_value + * + * This function uses the recipe to create a bank attribute for a voucher. */ kern_return_t bank_get_value( @@ -311,13 +331,12 @@ bank_get_value( mach_voucher_attr_recipe_command_t command, mach_voucher_attr_value_handle_array_t prev_values, mach_msg_type_number_t prev_value_count, - mach_voucher_attr_content_t __unused recipe, - mach_voucher_attr_content_size_t __unused recipe_size, + mach_voucher_attr_content_t recipe, + mach_voucher_attr_content_size_t recipe_size, mach_voucher_attr_value_handle_t *out_value, mach_voucher_attr_value_flags_t *out_flags, ipc_voucher_t *out_value_voucher) { - bank_task_t bank_task = BANK_TASK_NULL; bank_task_t bank_holder = BANK_TASK_NULL; bank_task_t bank_merchant = BANK_TASK_NULL; bank_task_t bank_secureoriginator = BANK_TASK_NULL; @@ -331,6 +350,7 @@ bank_get_value( mach_msg_type_number_t i; struct thread_group *thread_group = NULL; struct thread_group *cur_thread_group = NULL; + uint32_t persona_id = proc_persona_id(NULL); assert(MACH_VOUCHER_ATTR_KEY_BANK == key); assert(manager == &bank_manager); @@ -342,13 +362,107 @@ bank_get_value( switch (command) { case MACH_VOUCHER_ATTR_BANK_CREATE: - /* Return the default task value instead of bank task */ + /* It returns the default task value. This value is replaced by + * an actual bank task reference, by using a recipe with + * MACH_VOUCHER_ATTR_SEND_PREPROCESS command. + */ *out_value = BANK_ELEMENT_TO_HANDLE(BANK_DEFAULT_TASK_VALUE); *out_flags = MACH_VOUCHER_ATTR_VALUE_FLAGS_PERSIST; break; + case MACH_VOUCHER_ATTR_BANK_MODIFY_PERSONA: + + /* It creates a bank account attribute value with a new persona id + * and auto-redeems it on behalf of the bank_holder. + */ + *out_value = BANK_ELEMENT_TO_HANDLE(BANK_DEFAULT_VALUE); + + for (i = 0; i < prev_value_count; i++) { + bank_handle = prev_values[i]; + bank_element = HANDLE_TO_BANK_ELEMENT(bank_handle); + + /* Expect a pre-processed attribute value */ + if (bank_element == BANK_DEFAULT_VALUE || bank_element == BANK_DEFAULT_TASK_VALUE) { + continue; + } + + if (!bank_task_is_persona_modify_entitled(current_task())) { + return KERN_NO_ACCESS; + } + + struct persona_modify_info pmi = {}; + if (recipe_size == sizeof(pmi)) { + memcpy((void *)&pmi, recipe, sizeof(pmi)); + persona_id = pmi.persona_id; + } else { + return KERN_INVALID_ARGUMENT; + } + + /* Verify if the persona id is valid */ + if (!bank_verify_persona_id(persona_id)) { + return KERN_INVALID_ARGUMENT; + } + + /* Update the persona id only if the bank element is a bank task. + * This ensures that the bank_holder can be trusted. + */ + if (bank_element->be_type == BANK_TASK) { + bank_holder = CAST_TO_BANK_TASK(bank_element); + /* Ensure that the requestor validated by userspace matches + * the bank_holder + */ + if (pmi.unique_pid != bank_holder->bt_unique_pid) { + return KERN_INVALID_CAPABILITY; + } + bank_merchant = bank_holder; + bank_secureoriginator = bank_holder; + bank_proximateprocess = bank_holder; + thread_group = bank_get_bank_task_thread_group(bank_holder); + } else if (bank_element->be_type == BANK_ACCOUNT) { + return KERN_INVALID_ARGUMENT; + } else { + panic("Bogus bank type: %d passed in get_value\n", bank_element->be_type); + } + + /* Change the persona-id to holder task's persona-id if the task is not spawned in system persona */ + if (unique_persona && + bank_merchant->bt_persona_id != persona_get_id(system_persona) && + bank_merchant->bt_persona_id != persona_get_id(proxy_system_persona)) { + persona_id = bank_merchant->bt_persona_id; + } + + if (bank_holder->bt_persona_id == persona_id) { + lck_mtx_lock(&bank_holder->bt_acc_to_pay_lock); + bank_task_made_reference(bank_holder); + if (bank_holder->bt_voucher_ref == 0) { + /* Take a ref for voucher system, if voucher system does not have a ref */ + bank_task_reference(bank_holder); + bank_holder->bt_voucher_ref = 1; + } + lck_mtx_unlock(&bank_holder->bt_acc_to_pay_lock); + + *out_value = BANK_ELEMENT_TO_HANDLE(bank_holder); + return kr; + } + + bank_account = bank_account_alloc_init(bank_holder, bank_merchant, + bank_secureoriginator, bank_proximateprocess, + thread_group, persona_id); + if (bank_account == BANK_ACCOUNT_NULL) { + return KERN_RESOURCE_SHORTAGE; + } + + *out_value = BANK_ELEMENT_TO_HANDLE(bank_account); + return kr; + } + break; + case MACH_VOUCHER_ATTR_AUTO_REDEEM: + /* It creates a bank account with the bank_merchant set to the current task. + * A bank attribute voucher needs to be redeemed before it can be adopted by + * it's threads. + */ for (i = 0; i < prev_value_count; i++) { bank_handle = prev_values[i]; bank_element = HANDLE_TO_BANK_ELEMENT(bank_handle); @@ -364,12 +478,14 @@ bank_get_value( bank_secureoriginator = bank_holder; bank_proximateprocess = bank_holder; thread_group = bank_get_bank_task_thread_group(bank_holder); + persona_id = bank_holder->bt_persona_id; } else if (bank_element->be_type == BANK_ACCOUNT) { old_bank_account = CAST_TO_BANK_ACCOUNT(bank_element); bank_holder = old_bank_account->ba_holder; bank_secureoriginator = old_bank_account->ba_secureoriginator; bank_proximateprocess = old_bank_account->ba_proximateprocess; thread_group = bank_get_bank_account_thread_group(old_bank_account); + persona_id = old_bank_account->ba_so_persona_id; } else { panic("Bogus bank type: %d passed in get_value\n", bank_element->be_type); } @@ -386,11 +502,19 @@ bank_get_value( thread_group = cur_thread_group; } + /* Change the persona-id to current task persona-id if the task is not spawned in system persona */ + if (unique_persona && + bank_merchant->bt_persona_id != persona_get_id(system_persona) && + bank_merchant->bt_persona_id != persona_get_id(proxy_system_persona)) { + persona_id = bank_merchant->bt_persona_id; + } + /* Check if trying to redeem for self task, return the default bank task */ if (bank_holder == bank_merchant && bank_holder == bank_secureoriginator && bank_holder == bank_proximateprocess && - thread_group == cur_thread_group) { + thread_group == cur_thread_group && + persona_id == bank_holder->bt_persona_id) { *out_value = BANK_ELEMENT_TO_HANDLE(BANK_DEFAULT_TASK_VALUE); *out_flags = MACH_VOUCHER_ATTR_VALUE_FLAGS_PERSIST; return kr; @@ -398,7 +522,7 @@ bank_get_value( bank_account = bank_account_alloc_init(bank_holder, bank_merchant, bank_secureoriginator, bank_proximateprocess, - thread_group); + thread_group, persona_id); if (bank_account == BANK_ACCOUNT_NULL) { return KERN_RESOURCE_SHORTAGE; } @@ -429,11 +553,13 @@ bank_get_value( bank_holder = CAST_TO_BANK_TASK(bank_element); bank_secureoriginator = bank_holder; thread_group = bank_get_bank_task_thread_group(bank_holder); + persona_id = bank_holder->bt_persona_id; } else if (bank_element->be_type == BANK_ACCOUNT) { old_bank_account = CAST_TO_BANK_ACCOUNT(bank_element); bank_holder = old_bank_account->ba_holder; bank_secureoriginator = old_bank_account->ba_secureoriginator; thread_group = bank_get_bank_account_thread_group(old_bank_account); + persona_id = old_bank_account->ba_so_persona_id; } else { panic("Bogus bank type: %d passed in get_value\n", bank_element->be_type); } @@ -448,21 +574,24 @@ bank_get_value( /* * If the process doesn't have secure persona entitlement, * then replace the secure originator to current task. + * Also update the persona_id to match that of the secure originator. */ if (bank_merchant->bt_hasentitlement == 0) { KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, (BANK_CODE(BANK_ACCOUNT_INFO, (BANK_SECURE_ORIGINATOR_CHANGED))) | DBG_FUNC_NONE, bank_secureoriginator->bt_pid, bank_merchant->bt_pid, 0, 0, 0); bank_secureoriginator = bank_merchant; + persona_id = bank_merchant->bt_persona_id; } bank_proximateprocess = bank_merchant; - /* Check if trying to redeem for self task, return the bank task */ + /* Check if trying to pre-process for self task, return the bank task */ if (bank_holder == bank_merchant && bank_holder == bank_secureoriginator && bank_holder == bank_proximateprocess && - thread_group == cur_thread_group) { + thread_group == cur_thread_group && + persona_id == bank_holder->bt_persona_id) { lck_mtx_lock(&bank_holder->bt_acc_to_pay_lock); bank_task_made_reference(bank_holder); if (bank_holder->bt_voucher_ref == 0) { @@ -477,7 +606,7 @@ bank_get_value( } bank_account = bank_account_alloc_init(bank_holder, bank_merchant, bank_secureoriginator, bank_proximateprocess, - thread_group); + thread_group, persona_id); if (bank_account == BANK_ACCOUNT_NULL) { return KERN_RESOURCE_SHORTAGE; } @@ -490,7 +619,9 @@ bank_get_value( break; case MACH_VOUCHER_ATTR_REDEEM: - + /* This command expects that the bank attribute has been auto-redeemed + * and returns a reference to that bank account value. + */ for (i = 0; i < prev_value_count; i++) { bank_handle = prev_values[i]; bank_element = HANDLE_TO_BANK_ELEMENT(bank_handle); @@ -499,24 +630,32 @@ bank_get_value( continue; } - task = current_task(); if (bank_element == BANK_DEFAULT_TASK_VALUE) { *out_value = BANK_ELEMENT_TO_HANDLE(BANK_DEFAULT_TASK_VALUE); *out_flags = MACH_VOUCHER_ATTR_VALUE_FLAGS_PERSIST; return kr; } - if (bank_element->be_type == BANK_TASK) { - bank_task = CAST_TO_BANK_TASK(bank_element); - panic("Found a bank task in MACH_VOUCHER_ATTR_REDEEM: %p", bank_task); + task = current_task(); + if (bank_element->be_type == BANK_TASK) { + bank_holder = CAST_TO_BANK_TASK(bank_element); + if (bank_holder == get_bank_task_context(task, FALSE)) { + *out_value = BANK_ELEMENT_TO_HANDLE(BANK_DEFAULT_TASK_VALUE); + *out_flags = MACH_VOUCHER_ATTR_VALUE_FLAGS_PERSIST; + } else { + kr = KERN_INVALID_CAPABILITY; + } return kr; } else if (bank_element->be_type == BANK_ACCOUNT) { bank_account = CAST_TO_BANK_ACCOUNT(bank_element); bank_merchant = bank_account->ba_merchant; if (bank_merchant != get_bank_task_context(task, FALSE)) { - panic("Found another bank task: %p as a bank merchant\n", bank_merchant); + /* This error can be used to verify if the task can + * adopt the voucher. + */ + kr = KERN_INVALID_CAPABILITY; + return kr; } - bank_account_made_reference(bank_account); *out_value = BANK_ELEMENT_TO_HANDLE(bank_account); return kr; @@ -591,14 +730,13 @@ bank_extract_content( bank_account->ba_holder->bt_pid, bank_account->ba_merchant->bt_pid, bank_account->ba_secureoriginator->bt_pid, - bank_account->ba_secureoriginator->bt_persona_id, + bank_account->ba_so_persona_id, bank_account->ba_proximateprocess->bt_pid, bank_account->ba_proximateprocess->bt_persona_id); } else { panic("Bogus bank type: %d passed in get_value\n", bank_element->be_type); } - memcpy(&out_recipe[0], buf, strlen(buf) + 1); *out_command = MACH_VOUCHER_ATTR_BANK_NULL; *in_out_recipe_size = (mach_voucher_attr_content_size_t)strlen(buf) + 1; @@ -610,7 +748,7 @@ bank_extract_content( /* * Routine: bank_command - * Purpose: Execute a command against a set of ATM values. + * Purpose: Execute a command against a set of bank values. * Returns: KERN_SUCCESS: On successful execution of command. * KERN_FAILURE: On failure. */ @@ -635,6 +773,7 @@ bank_command( mach_voucher_attr_value_handle_t bank_handle; mach_msg_type_number_t i; int32_t pid; + uint32_t persona_id; assert(MACH_VOUCHER_ATTR_KEY_BANK == key); assert(manager == &bank_manager); @@ -714,6 +853,42 @@ bank_command( *out_content_size = 0; return KERN_INVALID_VALUE; + case BANK_PERSONA_ID: + + if ((sizeof(persona_id)) > *out_content_size) { + *out_content_size = 0; + return KERN_NO_SPACE; + } + + for (i = 0; i < value_count; i++) { + bank_handle = values[i]; + bank_element = HANDLE_TO_BANK_ELEMENT(bank_handle); + if (bank_element == BANK_DEFAULT_VALUE) { + continue; + } + + if (bank_element == BANK_DEFAULT_TASK_VALUE) { + bank_element = CAST_TO_BANK_ELEMENT(get_bank_task_context(current_task(), FALSE)); + } + + if (bank_element->be_type == BANK_TASK) { + bank_task = CAST_TO_BANK_TASK(bank_element); + persona_id = bank_task->bt_persona_id; + } else if (bank_element->be_type == BANK_ACCOUNT) { + bank_account = CAST_TO_BANK_ACCOUNT(bank_element); + persona_id = bank_account->ba_so_persona_id; + } else { + panic("Bogus bank type: %d passed in voucher_command\n", bank_element->be_type); + } + + memcpy(out_content, &persona_id, sizeof(persona_id)); + *out_content_size = (mach_voucher_attr_content_size_t)sizeof(persona_id); + return KERN_SUCCESS; + } + /* In the case of no value, return error KERN_INVALID_VALUE */ + *out_content_size = 0; + return KERN_INVALID_VALUE; + default: return KERN_INVALID_ARGUMENT; } @@ -787,22 +962,40 @@ bank_task_alloc_init(task_t task) /* * Routine: proc_is_propagate_entitled - * Purpose: Check if the process has persona propagate entitlement. + * Purpose: Check if the process is allowed to propagate secure originator. * Returns: TRUE if entitled. * FALSE if not. */ static boolean_t bank_task_is_propagate_entitled(task_t t) { - /* Return TRUE if root process */ - if (0 == kauth_cred_issuser(kauth_cred_get())) { - /* If it's a non-root process, it needs to have the entitlement for secure originator propagation */ - boolean_t entitled = FALSE; - entitled = IOTaskHasEntitlement(t, ENTITLEMENT_PERSONA_PROPAGATE); - return entitled; - } else { + /* Check if it has an entitlement which disallows secure originator propagation */ + boolean_t entitled = FALSE; + entitled = IOTaskHasEntitlement(t, ENTITLEMENT_PERSONA_NO_PROPAGATE); + if (entitled) { + return FALSE; + } + + /* If it's a platform binary, allow propogation by default */ + if (disable_persona_propogate_check || (t->t_flags & TF_PLATFORM)) { return TRUE; } + + return FALSE; +} + +/* + * Routine: proc_is_persona_modify_entitled + * Purpose: Check if the process has persona modify entitlement. + * Returns: TRUE if entitled. + * FALSE if not. + */ +static boolean_t +bank_task_is_persona_modify_entitled(task_t t) +{ + boolean_t entitled = FALSE; + entitled = IOTaskHasEntitlement(t, ENTITLEMENT_PERSONA_MODIFY); + return entitled; } /* @@ -817,7 +1010,8 @@ bank_account_alloc_init( bank_task_t bank_merchant, bank_task_t bank_secureoriginator, bank_task_t bank_proximateprocess, - struct thread_group *thread_group) + struct thread_group *thread_group, + uint32_t persona_id) { bank_account_t new_bank_account; bank_account_t bank_account; @@ -845,6 +1039,7 @@ bank_account_alloc_init( new_bank_account->ba_holder = bank_holder; new_bank_account->ba_secureoriginator = bank_secureoriginator; new_bank_account->ba_proximateprocess = bank_proximateprocess; + new_bank_account->ba_so_persona_id = persona_id; /* Iterate through accounts need to pay list to find the existing entry */ lck_mtx_lock(&bank_holder->bt_acc_to_pay_lock); @@ -852,7 +1047,8 @@ bank_account_alloc_init( if (bank_account->ba_merchant != bank_merchant || bank_account->ba_secureoriginator != bank_secureoriginator || bank_account->ba_proximateprocess != bank_proximateprocess || - bank_get_bank_account_thread_group(bank_account) != thread_group) { + bank_get_bank_account_thread_group(bank_account) != thread_group || + bank_account->ba_so_persona_id != persona_id) { continue; } @@ -1405,8 +1601,7 @@ bank_serviced_balance(bank_task_t bank_task, uint64_t *cpu_time, uint64_t *energ * Routine: bank_get_voucher_bank_account * Purpose: Get the bank account from the voucher. * Returns: bank_account if bank_account attribute present in voucher. - * NULL on no attribute, no bank_element, or if holder and merchant bank accounts - * and voucher thread group and current thread group are the same. + * NULL on no attribute or no bank_element */ static bank_account_t bank_get_voucher_bank_account(ipc_voucher_t voucher) @@ -1439,23 +1634,7 @@ bank_get_voucher_bank_account(ipc_voucher_t voucher) return BANK_ACCOUNT_NULL; } else if (bank_element->be_type == BANK_ACCOUNT) { bank_account = CAST_TO_BANK_ACCOUNT(bank_element); - /* - * Return BANK_ACCOUNT_NULL if the ba_holder is same as ba_merchant - * and bank account thread group is same as current thread group - * i.e. ba_merchant's thread group. - * - * The bank account might have ba_holder same as ba_merchant but different - * thread group if daemon sends a voucher to an App and then App sends the - * same voucher back to the daemon (IPC code will replace thread group in the - * voucher to App's thread group when it gets auto redeemed by the App). - */ - if (bank_account->ba_holder != bank_account->ba_merchant || - bank_get_bank_account_thread_group(bank_account) != - bank_get_bank_task_thread_group(bank_account->ba_merchant)) { - return bank_account; - } else { - return BANK_ACCOUNT_NULL; - } + return bank_account; } else { panic("Bogus bank type: %d passed in bank_get_voucher_bank_account\n", bank_element->be_type); } @@ -1544,30 +1723,61 @@ bank_get_bank_account_thread_group(bank_account_t bank_account __unused) } /* - * Routine: bank_get_bank_ledger_and_thread_group - * Purpose: Get the bankledger (chit) and thread group from the voucher. - * Returns: bankledger and thread group if bank_account attribute present in voucher. - * + * Routine: bank_get_bank_ledger_thread_group_and_persona + * Purpose: Get the bankledger (chit), thread group and persona id from the voucher. + * Returns: bankledger, thread group if bank_account attribute present in voucher + * and persona_id */ kern_return_t -bank_get_bank_ledger_and_thread_group( +bank_get_bank_ledger_thread_group_and_persona( ipc_voucher_t voucher, ledger_t *bankledger, - struct thread_group **banktg) + struct thread_group **banktg, + uint32_t *persona_id) { bank_account_t bank_account; + bank_task_t bank_task; struct thread_group *thread_group = NULL; bank_account = bank_get_voucher_bank_account(voucher); - *bankledger = bank_get_bank_account_ledger(bank_account); - thread_group = bank_get_bank_account_thread_group(bank_account); + bank_task = get_bank_task_context(current_task(), FALSE); + if (persona_id != NULL) { + if (bank_account != BANK_ACCOUNT_NULL) { + *persona_id = bank_account->ba_so_persona_id; + } else { + *persona_id = bank_task->bt_persona_id; + } + } + /* + * Use BANK_ACCOUNT_NULL if the ba_holder is same as ba_merchant + * and bank account thread group is same as current thread group + * i.e. ba_merchant's thread group. + * + * The bank account might have ba_holder same as ba_merchant but different + * thread group if daemon sends a voucher to an App and then App sends the + * same voucher back to the daemon (IPC code will replace thread group in the + * voucher to App's thread group when it gets auto redeemed by the App). + */ + if ((bank_account != NULL) && + (bank_account->ba_holder == bank_account->ba_merchant) && + (bank_get_bank_account_thread_group(bank_account) == + bank_get_bank_task_thread_group(bank_account->ba_merchant))) { + bank_account = BANK_ACCOUNT_NULL; + } - /* Return NULL thread group if voucher has current task's thread group */ - if (thread_group == bank_get_bank_task_thread_group( - get_bank_task_context(current_task(), FALSE))) { - thread_group = NULL; + if (bankledger != NULL) { + *bankledger = bank_get_bank_account_ledger(bank_account); + } + + if (banktg != NULL) { + thread_group = bank_get_bank_account_thread_group(bank_account); + + /* Return NULL thread group if voucher has current task's thread group */ + if (thread_group == bank_get_bank_task_thread_group(bank_task)) { + thread_group = NULL; + } + *banktg = thread_group; } - *banktg = thread_group; return KERN_SUCCESS; } @@ -1645,3 +1855,23 @@ bank_swap_thread_bank_ledger(thread_t thread __unused, ledger_t new_ledger __unu effective_energy_consumed); } } + +/* + * Routine: bank_verify_persona_id + * Purpose: Verifies if the persona id is valid + * + * The caller should check if the task is entitled + * to do the lookup. + */ +static boolean_t +bank_verify_persona_id(uint32_t persona_id) +{ + /* A successful lookup implies that the persona id is valid */ + void *persona = persona_lookup(persona_id); + if (!persona) { + return FALSE; + } + persona_put(persona); + + return TRUE; +} diff --git a/osfmk/bank/bank_internal.h b/osfmk/bank/bank_internal.h index f20d09950..f78a64dda 100644 --- a/osfmk/bank/bank_internal.h +++ b/osfmk/bank/bank_internal.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2012-2013 Apple Inc. All rights reserved. + * Copyright (c) 2012-2019 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -69,8 +69,8 @@ struct bank_task { ledger_t bt_ledger; /* Ledger of the customer task */ queue_head_t bt_accounts_to_pay; /* List of accounts worked for me and need to pay */ queue_head_t bt_accounts_to_charge; /* List of accounts I did work and need to charge */ - decl_lck_mtx_data(, bt_acc_to_pay_lock) /* Lock to protect accounts to pay list */ - decl_lck_mtx_data(, bt_acc_to_charge_lock) /* Lock to protect accounts to charge list */ + decl_lck_mtx_data(, bt_acc_to_pay_lock); /* Lock to protect accounts to pay list */ + decl_lck_mtx_data(, bt_acc_to_charge_lock); /* Lock to protect accounts to charge list */ uint8_t bt_hasentitlement; /* If the secure persona entitlement is set on the task */ #if DEVELOPMENT || DEBUG queue_chain_t bt_global_elt; /* Element on the global bank task chain */ @@ -108,13 +108,13 @@ typedef struct bank_task * bank_task_t; (OSAddAtomic(-(num), &(elem)->bt_refs)) #define bank_task_made_reference(elem) \ - (hw_atomic_add(&(elem)->bt_made, 1) - 1) + (os_atomic_inc_orig(&(elem)->bt_made, relaxed)) #define bank_task_made_release(elem) \ - (hw_atomic_sub(&(elem)->bt_made, 1) + 1) + (os_atomic_dec_orig(&(elem)->bt_made, relaxed)) #define bank_task_made_release_num(elem, num) \ - (hw_atomic_sub(&(elem)->bt_made, (num)) + (num)) + (os_atomic_sub_orig(&(elem)->bt_made, (num), relaxed)) struct bank_account { @@ -129,6 +129,8 @@ struct bank_account { #if DEVELOPMENT || DEBUG queue_chain_t ba_global_elt; /* Element on the global account chain */ #endif + uint32_t ba_so_persona_id; /* Persona ID of ba_secureoriginator, + * unless modified by a entitled process */ }; #define ba_type ba_elem.be_type @@ -153,13 +155,13 @@ typedef struct bank_account * bank_account_t; (OSAddAtomic(-(num), &(elem)->ba_refs)) #define bank_account_made_reference(elem) \ - (hw_atomic_add(&(elem)->ba_made, 1) - 1) + (os_atomic_inc_orig(&(elem)->ba_made, relaxed)) #define bank_account_made_release(elem) \ - (hw_atomic_sub(&(elem)->ba_made, 1) + 1) + (os_atomic_dec_orig(&(elem)->ba_made, relaxed)) #define bank_account_made_release_num(elem, num) \ - (hw_atomic_sub(&(elem)->ba_made, (num)) + (num)) + (os_atomic_sub_orig(&(elem)->ba_made, (num), relaxed)) struct _bank_ledger_indices { int cpu_time; @@ -175,8 +177,8 @@ extern void bank_billed_balance_safe(task_t task, uint64_t *cpu_time, uint64_t * extern void bank_billed_balance(bank_task_t bank_task, uint64_t *cpu_time, uint64_t *energy); extern void bank_serviced_balance_safe(task_t task, uint64_t *cpu_time, uint64_t *energy); extern void bank_serviced_balance(bank_task_t bank_task, uint64_t *cpu_time, uint64_t *energy); -extern kern_return_t bank_get_bank_ledger_and_thread_group(ipc_voucher_t voucher, - ledger_t *bankledger, struct thread_group **banktg); +extern kern_return_t bank_get_bank_ledger_thread_group_and_persona(ipc_voucher_t voucher, + ledger_t *bankledger, struct thread_group **banktg, uint32_t *persona_id); extern void bank_swap_thread_bank_ledger(thread_t thread, ledger_t ledger); #endif /* MACH_KERNEL_PRIVATE */ diff --git a/osfmk/bank/bank_types.h b/osfmk/bank/bank_types.h index 51c40830d..7483b2a68 100644 --- a/osfmk/bank/bank_types.h +++ b/osfmk/bank/bank_types.h @@ -34,12 +34,14 @@ #define MACH_VOUCHER_ATTR_BANK_NULL ((mach_voucher_attr_recipe_command_t)601) #define MACH_VOUCHER_ATTR_BANK_CREATE ((mach_voucher_attr_recipe_command_t)610) +#define MACH_VOUCHER_ATTR_BANK_MODIFY_PERSONA ((mach_voucher_attr_recipe_command_t)611) #define MACH_VOUCHER_BANK_CONTENT_SIZE (500) typedef uint32_t bank_action_t; #define BANK_ORIGINATOR_PID 0x1 #define BANK_PERSONA_TOKEN 0x2 +#define BANK_PERSONA_ID 0x3 struct proc_persona_info { uint64_t unique_pid; @@ -57,8 +59,15 @@ struct persona_token { struct proc_persona_info proximate; }; +struct persona_modify_info { + uint32_t persona_id; + uint64_t unique_pid; +}; + #ifdef PRIVATE -#define ENTITLEMENT_PERSONA_PROPAGATE "com.apple.private.personas.propagate" +/* Redeem bank voucher on behalf of another process while changing the persona */ +#define ENTITLEMENT_PERSONA_MODIFY "com.apple.private.persona.modify" +#define ENTITLEMENT_PERSONA_NO_PROPAGATE "com.apple.private.personas.no.propagate" #endif /* PRIVATE */ #endif /* _BANK_BANK_TYPES_H_ */ diff --git a/osfmk/conf/Makefile b/osfmk/conf/Makefile index 7bd79d9ae..05c4b79cf 100644 --- a/osfmk/conf/Makefile +++ b/osfmk/conf/Makefile @@ -23,7 +23,7 @@ endif $(TARGET)/$(CURRENT_KERNEL_CONFIG)/Makefile: $(SRCROOT)/SETUP/config/doconf $(OBJROOT)/SETUP/config $(DOCONFDEPS) $(_v)$(MKDIR) $(TARGET)/$(CURRENT_KERNEL_CONFIG) - $(_v)$(SRCROOT)/SETUP/config/doconf -c -cpu $(DOCONF_ARCH_CONFIG_LC) -soc $(CURRENT_MACHINE_CONFIG_LC) -d $(TARGET)/$(CURRENT_KERNEL_CONFIG) -s $(SOURCE) -m $(MASTERCONFDIR) $(CURRENT_KERNEL_CONFIG); + $(_v)$(SRCROOT)/SETUP/config/doconf -c -cpu $(DOCONF_ARCH_CONFIG_LC) -soc $(CURRENT_MACHINE_CONFIG_LC) -d $(TARGET)/$(CURRENT_KERNEL_CONFIG) -s $(SOURCE) -m $(MASTERCONFDIR) $(CURRENT_KERNEL_CONFIG) do_all: $(TARGET)/$(CURRENT_KERNEL_CONFIG)/Makefile $(_v)${MAKE} \ @@ -35,7 +35,7 @@ do_all: $(TARGET)/$(CURRENT_KERNEL_CONFIG)/Makefile SOURCE=$(subst conf/,,$(SOURCE)) \ TARGET=${TARGET} \ OBJPATH=${OBJPATH} \ - build_all; + build_all do_build_all:: do_all diff --git a/osfmk/conf/Makefile.template b/osfmk/conf/Makefile.template index d296cb6a7..2db9fb566 100644 --- a/osfmk/conf/Makefile.template +++ b/osfmk/conf/Makefile.template @@ -76,6 +76,14 @@ OBJS_NO_SIGN_COMPARE = \ $(foreach file,$(OBJS_NO_CAST_ALIGN),$(eval $(call add_perfile_cflags,$(file),-Wno-cast-align))) $(foreach file,$(OBJS_NO_SIGN_COMPARE),$(eval $(call add_perfile_cflags,$(file),-Wno-sign-compare))) +ifeq ($(KSANCOV),1) +# Don't instrument functions called by the ksancov runtime. SanitizeCoverage does +# not support blacklists, so exclude the whole file. +machine_routines.o_CFLAGS_RM = $(KSANCOV_CFLAGS) +machine_routines_common.o_CFLAGS_RM = $(KSANCOV_CFLAGS) +pcb_native.o_CFLAGS_RM = $(KSANCOV_CFLAGS) +endif + # # XXX: INCFLAGS to include libsa prototypes # @@ -138,9 +146,9 @@ $(COMPONENT).filelist: $(OBJS) $(SEG_HACK) -n __HIB -o $${hib_file}__ $${hib_file} || exit 1; \ mv $${hib_file}__ $${hib_file} || exit 1; \ done - @echo "$(ColorL)LDFILELIST$(Color0) $(ColorLF)$(COMPONENT)$(Color0)" + $(call makelog,$(ColorL)LDFILELIST$(Color0) $(ColorLF)$(COMPONENT)$(Color0)) $(_v)for obj in ${OBJS}; do \ - echo $(TARGET)/$(CURRENT_KERNEL_CONFIG)/$${obj}; \ + $(ECHO) $(TARGET)/$(CURRENT_KERNEL_CONFIG)/$${obj}; \ done > $(COMPONENT).filelist do_all: $(COMPONENT).filelist @@ -162,7 +170,7 @@ endif -include genassym.d genassym.o: .CFLAGS $(firstword $(MAKEFILE_LIST)) genassym.o: $(SOURCE_DIR)/$(COMPONENT)/$(GENASSYM_LOCATION)/genassym.c - @echo "[$(CMD_MC)] $(ColorH)GENASSYM$(Color0) $(ColorLF)$<$(Color0)" + $(call makelog,[$(CMD_MC)] $(ColorH)GENASSYM$(Color0) $(ColorLF)$<$(Color0)) $(_v)${GENASSYM_KCC} ${CFLAGS} ${CFLAGS_NOLTO_FLAG} -MD -S -o ${@} ${INCFLAGS} $< assym.s: genassym.o diff --git a/osfmk/conf/Makefile.x86_64 b/osfmk/conf/Makefile.x86_64 index 57759351c..98df09944 100644 --- a/osfmk/conf/Makefile.x86_64 +++ b/osfmk/conf/Makefile.x86_64 @@ -2,7 +2,7 @@ #BEGIN Machine dependent Makefile fragment for x86_64 ###################################################################### -CWARNFLAGS = $(CWARNFLAGS_STD) -Wshorten-64-to-32 +CWARNFLAGS = $(CWARNFLAGS_STD) -Wshorten-64-to-32 -Wno-atomic-implicit-seq-cst # Files that must go in the __HIB segment: UNCONFIGURED_HIB_FILES= \ diff --git a/osfmk/conf/files b/osfmk/conf/files index d7a06429a..4c6f803a0 100644 --- a/osfmk/conf/files +++ b/osfmk/conf/files @@ -48,7 +48,6 @@ OPTIONS/mach_pagemap optional mach_pagemap OPTIONS/mach_vm_debug optional mach_vm_debug OPTIONS/mach_page_hash_stats optional mach_page_hash_stats OPTIONS/mig_debug optional mig_debug -OPTIONS/xpr_debug optional xpr_debug OPTIONS/zone_debug optional zone_debug OPTIONS/vm_cpm optional vm_cpm OPTIONS/task_swapper optional task_swapper @@ -112,6 +111,7 @@ osfmk/ipc/mach_msg.c standard osfmk/ipc/mach_port.c standard osfmk/ipc/mig_log.c optional mig_debug osfmk/kern/affinity.c standard +osfmk/kern/arcade.c optional config_arcade osfmk/kern/ast.c standard osfmk/kern/audit_sessionport.c optional config_audit osfmk/kern/backtrace.c standard @@ -123,6 +123,7 @@ osfmk/kern/coalition.c optional config_coalitions osfmk/kern/counters.c standard osfmk/kern/cpu_quiesce.c optional config_quiesce_counter osfmk/kern/debug.c standard +osfmk/kern/ecc_logging.c optional config_ecc_logging osfmk/kern/energy_perf.c standard osfmk/kern/exception.c standard osfmk/kern/extmod_statistics.c standard @@ -136,24 +137,26 @@ osfmk/kern/ipc_misc.c standard osfmk/kern/ipc_sync.c standard osfmk/kern/ipc_tt.c standard osfmk/kern/kalloc.c standard -osfmk/kern/ecc_logging.c optional config_ecc_logging osfmk/kern/ktrace_background_notify.c standard osfmk/kern/ledger.c standard osfmk/kern/locks.c standard osfmk/kern/tlock.c standard osfmk/kern/ltable.c standard -osfmk/kern/machine.c standard osfmk/kern/mach_node.c standard +osfmk/kern/machine.c standard osfmk/kern/mk_sp.c standard osfmk/kern/mk_timer.c standard +osfmk/kern/mpsc_queue.c standard osfmk/kern/page_decrypt.c standard osfmk/kern/printf.c standard osfmk/kern/priority.c standard osfmk/kern/priority_queue.c standard osfmk/kern/processor.c standard osfmk/kern/processor_data.c standard +osfmk/kern/restartable.c standard osfmk/kern/sched_average.c standard osfmk/kern/sched_dualq.c optional config_sched_multiq +osfmk/kern/sched_clutch.c optional config_clutch osfmk/kern/sched_prim.c standard osfmk/kern/sched_proto.c optional config_sched_proto osfmk/kern/sched_traditional.c optional config_sched_traditional @@ -172,7 +175,9 @@ osfmk/kern/task.c standard osfmk/kern/task_policy.c standard osfmk/kern/task_swap.c standard osfmk/kern/test_lock.c optional development -osfmk/kern/test_lock.c optional debug +osfmk/kern/test_lock.c optional debug +osfmk/kern/test_mpsc_queue.c optional development +osfmk/kern/test_mpsc_queue.c optional debug osfmk/kern/thread.c standard osfmk/kern/thread_act.c standard osfmk/kern/thread_call.c standard @@ -184,7 +189,6 @@ osfmk/kern/turnstile.c standard osfmk/kern/ux_handler.c standard osfmk/kern/waitq.c standard osfmk/kern/work_interval.c standard -osfmk/kern/xpr.c optional xpr_debug osfmk/kern/zalloc.c standard osfmk/kern/zcache.c optional config_zcache osfmk/kern/gzalloc.c optional config_gzalloc @@ -214,6 +218,7 @@ osfmk/kern/copyout_shim.c optional copyout_shim ./mach/memory_entry_server.c standard ./mach/memory_object_control_server.c standard ./mach/resource_notify_user.c standard +./mach/restartable_server.c standard ./mach/upl_server.c standard ./mach/audit_triggers_user.c standard ./mach/task_access_user.c standard @@ -234,6 +239,12 @@ osfmk/atm/atm.c optional config_atm osfmk/voucher/ipc_pthread_priority.c standard ./mach/coalition_notification_user.c optional config_coalitions ./mach/sysdiagnose_notification_user.c optional config_sysdiagnose +./mach/sysdiagnose_notification_user.c optional config_sysdiagnose +./mach/vfs_nspace_user.c standard +./mach/fairplayd_notification_user.c optional config_arcade +./mach/arcade_upcall_user.c optional config_arcade +./mach/arcade_register_server.c optional config_arcade + # # For now, no external pagers # @@ -317,19 +328,17 @@ osfmk/console/video_console.c optional video_console osfmk/kern/telemetry.c optional config_telemetry # Built-in corecrypto for early_random(): -osfmk/corecrypto/cc/src/cc_clear.c standard -osfmk/corecrypto/cc/src/cc_cmp_safe.c standard -osfmk/corecrypto/cc/src/cc_try_abort.c standard -osfmk/corecrypto/ccdbrg/src/ccdrbg_nisthmac.c standard -osfmk/corecrypto/ccdigest/src/ccdigest_init.c standard -osfmk/corecrypto/ccdigest/src/ccdigest_update.c standard -osfmk/corecrypto/cchmac/src/cchmac.c standard -osfmk/corecrypto/cchmac/src/cchmac_init.c standard -osfmk/corecrypto/cchmac/src/cchmac_update.c standard -osfmk/corecrypto/cchmac/src/cchmac_final.c standard +osfmk/corecrypto/cc/src/cc_clear.c standard +osfmk/corecrypto/cc/src/cc_cmp_safe.c standard +osfmk/corecrypto/cc/src/cc_abort.c standard +osfmk/corecrypto/ccdbrg/src/ccdrbg_nisthmac.c standard +osfmk/corecrypto/ccdigest/src/ccdigest_init.c standard +osfmk/corecrypto/ccdigest/src/ccdigest_update.c standard +osfmk/corecrypto/cchmac/src/cchmac.c standard +osfmk/corecrypto/cchmac/src/cchmac_init.c standard +osfmk/corecrypto/cchmac/src/cchmac_update.c standard +osfmk/corecrypto/cchmac/src/cchmac_final.c standard osfmk/corecrypto/ccsha1/src/ccdigest_final_64be.c standard -osfmk/corecrypto/ccsha1/src/ccsha1_eay.c standard -osfmk/corecrypto/ccsha1/src/ccsha1_initial_state.c standard osfmk/corecrypto/ccsha2/src/ccsha256_di.c standard osfmk/corecrypto/ccsha2/src/ccsha256_initial_state.c standard @@ -337,4 +346,4 @@ osfmk/corecrypto/ccsha2/src/ccsha256_K.c standard osfmk/corecrypto/ccsha2/src/ccsha256_ltc_compress.c standard osfmk/corecrypto/ccsha2/src/ccsha256_ltc_di.c standard -osfmk/prng/prng_random.c standard +osfmk/prng/prng_random.c standard diff --git a/osfmk/conf/files.arm b/osfmk/conf/files.arm index 11ca56627..777e24bc6 100644 --- a/osfmk/conf/files.arm +++ b/osfmk/conf/files.arm @@ -72,6 +72,3 @@ osfmk/kperf/arm/kperf_mp.c optional kperf osfmk/arm/kpc_arm.c optional kpc osfmk/arm/monotonic_arm.c optional monotonic - -# Support for early_random() -osfmk/corecrypto/ccn/src/arm/ccn_set.s standard diff --git a/osfmk/conf/files.arm64 b/osfmk/conf/files.arm64 index 239a78423..2fb849e8b 100644 --- a/osfmk/conf/files.arm64 +++ b/osfmk/conf/files.arm64 @@ -36,6 +36,7 @@ osfmk/arm/io_map.c standard osfmk/arm64/loose_ends.c standard osfmk/arm/locks_arm.c standard osfmk/arm64/locore.s standard +osfmk/arm64/gxf_exceptions.s standard osfmk/arm64/lowmem_vectors.c standard osfmk/arm64/sleh.c standard osfmk/arm64/start.s standard @@ -77,14 +78,12 @@ osfmk/arm64/kpc.c optional kpc osfmk/arm64/monotonic_arm64.c optional monotonic -osfmk/arm64/platform_tests.c optional config_xnupost +osfmk/arm64/platform_tests.c optional config_xnupost +osfmk/arm64/platform_tests_asm.s optional config_xnupost osfmk/arm64/alternate_debugger.c optional alternate_debugger osfmk/arm64/alternate_debugger_asm.s optional alternate_debugger -# Support for early_random() -osfmk/corecrypto/ccn/src/ccn_set.c standard - osfmk/arm64/pgtrace.c standard osfmk/arm64/pgtrace_decoder.c optional config_pgtrace_nonkext osfmk/arm64/machine_remote_time.c optional config_mach_bridge_recv_time diff --git a/osfmk/conf/files.x86_64 b/osfmk/conf/files.x86_64 index dfe06058d..dd89ec483 100644 --- a/osfmk/conf/files.x86_64 +++ b/osfmk/conf/files.x86_64 @@ -2,8 +2,6 @@ OPTIONS/fb optional fb OPTIONS/debug optional debug -OPTIONS/gprof optional gprof - osfmk/vm/vm_apple_protect.c standard #osfmk/x86_64/hi_res_clock_map.c optional hi_res_clock @@ -85,12 +83,6 @@ osfmk/i386/acpi.c standard osfmk/i386/mtrr.c optional config_mtrr - -#osfmk/profiling/x86_64/profile-md.c optional gprof -#osfmk/profiling/x86_64/profile-asm.s optional gprof -#osfmk/profiling/profile-kgmon.c optional gprof -#osfmk/profiling/profile-mk.c optional gprof - osfmk/kdp/ml/x86_64/kdp_machdep.c optional mach_kdp osfmk/kdp/ml/x86_64/kdp_vm.c optional mach_kdp osfmk/kdp/ml/i386/kdp_x86_common.c optional mach_kdp diff --git a/osfmk/console/serial_console.c b/osfmk/console/serial_console.c index 160ab8fb1..cc887d62d 100644 --- a/osfmk/console/serial_console.c +++ b/osfmk/console/serial_console.c @@ -310,10 +310,10 @@ _cnputs(char * c, int size) } while (size-- > 0) { - cons_ops[cons_ops_index].putc(0, 0, *c); if (*c == '\n') { cons_ops[cons_ops_index].putc(0, 0, '\r'); } + cons_ops[cons_ops_index].putc(0, 0, *c); c++; } @@ -407,7 +407,7 @@ console_ring_try_empty(void) boolean_t state = ml_set_interrupts_enabled(FALSE); /* Indicate that we're in the process of writing a block of data to the console. */ - (void)hw_atomic_add(&console_output, 1); + os_atomic_inc(&console_output, relaxed); simple_lock_try_lock_loop(&console_ring.write_lock, LCK_GRP_NULL); @@ -430,7 +430,7 @@ console_ring_try_empty(void) simple_unlock(&console_ring.write_lock); - (void)hw_atomic_sub(&console_output, 1); + os_atomic_dec(&console_output, relaxed); simple_unlock(&console_ring.read_lock); @@ -658,7 +658,7 @@ vcgetc(__unused int l, __unused int u, __unused boolean_t wait, __unused boolean { char c; - if (0 == (*PE_poll_input)(0, &c)) { + if (0 == PE_stub_poll_input(0, &c)) { return c; } else { return 0; @@ -681,7 +681,7 @@ alloc_free_func(void * arg, wait_result_t wres __unused) T_LOG("Doing %d iterations of console cpu alloc and free.", count); while (count-- > 0) { - (void)hw_atomic_add(&cons_test_ops_count, 1); + os_atomic_inc(&cons_test_ops_count, relaxed); cbp = (console_buf_t *)console_cpu_alloc(0); if (cbp == NULL) { T_ASSERT_NOTNULL(cbp, "cpu allocation failed"); @@ -702,7 +702,7 @@ log_to_console_func(void * arg __unused, wait_result_t wres __unused) uint64_t thread_id = current_thread()->thread_id; char somedata[10] = "123456789"; for (int i = 0; i < 26; i++) { - (void)hw_atomic_add(&cons_test_ops_count, 1); + os_atomic_inc(&cons_test_ops_count, relaxed); printf(" thid: %llu printf iteration %d\n", thread_id, i); cnputc_unbuffered((char)('A' + i)); cnputc_unbuffered('\n'); diff --git a/osfmk/console/serial_general.c b/osfmk/console/serial_general.c index fb2d0d6c5..7a8cfeb00 100644 --- a/osfmk/console/serial_general.c +++ b/osfmk/console/serial_general.c @@ -72,9 +72,9 @@ serial_keyboard_start(void) { /* Go see if there are any characters pending now */ serial_keyboard_poll(); - panic("serial_keyboard_start: we can't get back here\n"); } +__dead2 void serial_keyboard_poll(void) { diff --git a/osfmk/console/serial_protos.h b/osfmk/console/serial_protos.h index 68f4a21e7..e508ee10b 100644 --- a/osfmk/console/serial_protos.h +++ b/osfmk/console/serial_protos.h @@ -41,8 +41,8 @@ extern "C" { void serial_keyboard_init(void); -void serial_keyboard_start(void); -void serial_keyboard_poll(void); +void serial_keyboard_start(void) __dead2; +void serial_keyboard_poll(void) __dead2; extern uint32_t serialmode; diff --git a/osfmk/console/video_console.c b/osfmk/console/video_console.c index 2034f1e51..6a2131ca0 100644 --- a/osfmk/console/video_console.c +++ b/osfmk/console/video_console.c @@ -1,8 +1,8 @@ /* - * Copyright (c) 2000-2009 Apple Inc. All rights reserved. + * Copyright (c) 2000-2019 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * + * * This file contains Original Code and/or Modifications of Original Code * as defined in and that are subject to the Apple Public Source License * Version 2.0 (the 'License'). You may not use this file except in @@ -11,10 +11,10 @@ * unlawful or unlicensed copies of an Apple operating system, or to * circumvent, violate, or enable the circumvention or violation of, any * terms of an Apple operating system software license agreement. - * + * * Please obtain a copy of the License at * http://www.opensource.apple.com/apsl/ and read it before using this file. - * + * * The Original Code and all software distributed under the License are * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, @@ -22,18 +22,18 @@ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. * Please see the License for the specific language governing rights and * limitations under the License. - * + * * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ /* * @OSF_FREE_COPYRIGHT@ - * + * */ /* * @APPLE_FREE_COPYRIGHT@ */ /* - * NetBSD: ite.c,v 1.16 1995/07/17 01:24:34 briggs Exp + * NetBSD: ite.c,v 1.16 1995/07/17 01:24:34 briggs Exp * * Copyright (c) 1988 University of Utah. * Copyright (c) 1990, 1993 @@ -196,7 +196,7 @@ MACRO_END #endif /* -# Attribute codes: +# Attribute codes: # 00=none 01=bold 04=underscore 05=blink 07=reverse 08=concealed # Text color codes: # 30=black 31=red 32=green 33=yellow 34=blue 35=magenta 36=cyan 37=white @@ -241,7 +241,7 @@ enum vt100state_e { } gc_vt100state = ESnormal; -enum +enum { /* secs */ kProgressAcquireDelay = 0, @@ -295,7 +295,7 @@ static void gc_set_tab_stop(unsigned int column, boolean_t enabled); static void gc_show_cursor(unsigned int xx, unsigned int yy); static void gc_update_color(int color, boolean_t fore); -static void +static void gc_clear_line(unsigned int xx, unsigned int yy, int which) { unsigned int start, end, i; @@ -329,7 +329,7 @@ gc_clear_line(unsigned int xx, unsigned int yy, int which) } } -static void +static void gc_clear_screen(unsigned int xx, unsigned int yy, int top, unsigned int bottom, int which) { @@ -538,7 +538,7 @@ gc_paint_char(unsigned int xx, unsigned int yy, unsigned char ch, int attrs) if ( xx < gc_buffer_columns && yy < gc_buffer_rows ) { uint32_t index = (yy * gc_buffer_columns) + xx; - + gc_buffer_attributes[index] = attrs; gc_buffer_characters[index] = ch; gc_buffer_colorcodes[index] = gc_color_code; @@ -547,7 +547,7 @@ gc_paint_char(unsigned int xx, unsigned int yy, unsigned char ch, int attrs) gc_ops.paint_char(xx, yy, ch, attrs, 0, 0); } -static void +static void gc_putchar(char ch) { if (!ch) { @@ -1871,7 +1871,7 @@ static int8_t vc_uiscale = 1; vc_progress_user_options vc_progress_options; vc_progress_user_options vc_user_options; -decl_simple_lock_data(,vc_progress_lock) +decl_simple_lock_data(,vc_progress_lock); #if !CONFIG_EMBEDDED static int vc_progress_withmeter = 3; @@ -3306,6 +3306,3 @@ vc_set_progressmeter(int new_value) } #endif /* !CONFIG_EMBEDDED */ - - - diff --git a/osfmk/corecrypto/cc/src/cc_abort.c b/osfmk/corecrypto/cc/src/cc_abort.c new file mode 100644 index 000000000..726af1668 --- /dev/null +++ b/osfmk/corecrypto/cc/src/cc_abort.c @@ -0,0 +1,81 @@ +/* + * cc_abort.c + * corecrypto + * + * Created on 3/9/2019 + * + * Copyright (c) 2019 Apple Inc. All rights reserved. + * + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#include + +//cc_abort() is implemented to comply with by FIPS 140-2, when DRBG produces +//two equal consecutive blocks. See rdar://19129408 + +#if !CC_PROVIDES_ABORT + +#error "This environment does not provide an abort()/panic()-like function" + +#elif CC_KERNEL + +#include +void +cc_abort(const char * msg) +{ + panic("%s", msg); +} + +#elif CC_USE_L4 + +#include +#include +void +cc_abort(const char * msg) +{ + sys_panic(msg); +} + +#elif CC_RTKIT + +#include +void +cc_abort(const char * msg) +{ + RTK_abort("%s", msg); +} + +#else + +#include +void +cc_abort(const char * msg CC_UNUSED) +{ + abort(); +} + +#endif diff --git a/osfmk/corecrypto/cc/src/cc_clear.c b/osfmk/corecrypto/cc/src/cc_clear.c index 1733f9a53..db21af6c9 100644 --- a/osfmk/corecrypto/cc/src/cc_clear.c +++ b/osfmk/corecrypto/cc/src/cc_clear.c @@ -66,6 +66,6 @@ cc_clear(size_t len, void *dst) /* This is an altarnative for clang that should work * void cc_clear(size_t len, void *dst) __attribute__ ((optnone)) * { - * cc_zero(len,dst); + * cc_clear(len,dst); * } */ diff --git a/osfmk/corecrypto/cc/src/cc_cmp_safe.c b/osfmk/corecrypto/cc/src/cc_cmp_safe.c index ee9efab11..dcafb1e11 100644 --- a/osfmk/corecrypto/cc/src/cc_cmp_safe.c +++ b/osfmk/corecrypto/cc/src/cc_cmp_safe.c @@ -44,6 +44,6 @@ cc_cmp_safe(size_t num, const void * ptr1, const void * ptr2) for (i = 0; i < num; i++) { flag |= (s[i] ^ t[i]); } - HEAVISIDE_STEP_UINT8(flag, flag); // flag=(flag==0)?0:1; + CC_HEAVISIDE_STEP(flag, flag); // flag=(flag==0)?0:1; return flag; // 0 iff all bytes were equal, 1 if there is any difference } diff --git a/osfmk/corecrypto/cc/src/cc_try_abort.c b/osfmk/corecrypto/cc/src/cc_try_abort.c deleted file mode 100644 index 31a07bab3..000000000 --- a/osfmk/corecrypto/cc/src/cc_try_abort.c +++ /dev/null @@ -1,63 +0,0 @@ -/* - * cc_try_abort.c - * corecrypto - * - * Created on 7/16/2015 - * - * Copyright (c) 2014,2015 Apple Inc. All rights reserved. - * - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ - -#include - -//cc_try_abort() is implemented to comply with by FIPS 140-2, when DRBG produces -//two equal consecutive blocks. See radar 19129408 - -#if CC_KERNEL -#include -void -cc_try_abort(const char * msg CC_UNUSED, ...) -{ - panic("%s", msg); -} - -#elif CC_USE_SEPROM || CC_USE_S3 || CC_BASEBAND || CC_EFI || CC_IBOOT || CC_RTKIT || CC_RTKITROM -void -cc_try_abort(const char * msg CC_UNUSED, ...) -{ - //Do nothing and return because we don't have panic() in those - //environments. Make sure you return error, when using cc_try_abort() in above environments -} - -#else -#include -void -cc_try_abort(const char * msg CC_UNUSED, ...) -{ - abort(); -} -#endif diff --git a/osfmk/corecrypto/ccdbrg/src/ccdrbg_nisthmac.c b/osfmk/corecrypto/ccdbrg/src/ccdrbg_nisthmac.c index 5757bd413..bb85bbf14 100644 --- a/osfmk/corecrypto/ccdbrg/src/ccdrbg_nisthmac.c +++ b/osfmk/corecrypto/ccdbrg/src/ccdrbg_nisthmac.c @@ -32,488 +32,221 @@ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ +#include + +#include #include #include #include -#include #include -// Test vectors at: -// http://csrc.nist.gov/groups/STM/cavp/#05 -// http://csrc.nist.gov/groups/STM/cavp/documents/drbg/drbgtestvectors.zip -// - -/* - * This HMAC DBRG is described in: - * - * SP 800-90 A Rev. 1 (2nd Draft) - * DRAFT Recommendation for Random Number Generation Using Deterministic Random Bit Generators - * April 2014 - * - * - * See in particular - * - 10.1.2 HMAC_DRBG (p 45) - * - B.2 HMAC_DRBGExample (p 83) - * - * We support maximum security strength of 256 bits - * Note that the example in B.2 is very limited, refer to §10.1.2 for more - */ - -/* - * The Get_entropy_input function is specified in pseudocode in [SP 800-90C] for various RBG constructions; - * however, in general, the function has the following meaning: - * Get_entropy_input: A function that is used to obtain entropy input. The function call is: - * (status, entropy_input) = Get_entropy_input (min_entropy, min_ length, max_ length, prediction_resistance_request), - * which requests a string of bits (entropy_input) with at least min_entropy bits of entropy. The length for the string - * shall be equal to or greater than min_length bits, and less than or equal to max_length bits. The - * prediction_resistance_request parameter indicates whether or not prediction resistance is to be provided during the request - * (i.e., whether fresh entropy is required). A status code is also returned from the function. - */ - -/* - * Check the validity of the input parameters. - * 1. If (requested_instantiation_security_strength > 256), then Return (“Invalid - * requested_instantiation_security_strength”, −1). - * 2. If (len (personalization_string) > 160), then Return (“Personalization_string - * too long”, −1) - * Comment: Set the security_strength to one of the valid security strengths. - * 3. If (requested_security_strength ≤ 112), then security_strength = 112 Else (requested_ security_strength ≤ 128), then security_strength = 128 Else (requested_ security_strength ≤ 192), then security_strength = 192 Else security_strength = 256. - * Comment: Get the entropy_input and the nonce. - * 4. min_entropy = 1.5 × security_strength. - * 5. (status, entropy_input) = Get_entropy_input (min_entropy, 1000). - * 6. If (status ≠ “Success”), then Return (status, −1). - */ +// This HMAC DRBG is described in: -/* - * 1. highest_supported_security_strength = 256. - * 2. Output block (outlen) = 256 bits. - * 3. Required minimum entropy for the entropy input at instantiation = 3/2 security_strength (this includes the entropy required for the nonce). - * 4. Seed length (seedlen) = 440 bits. - * 5. Maximum number of bits per request (max_number_of_bits_per_request) = 7500 - * bits. - * 6. Reseed_interval (reseed_ interval) = 10,000 requests. - * 7. Maximum length of the personalization string (max_personalization_string_length) = 160 bits. - * 8. Maximum length of the entropy input (max _length) = 1000 bits. - */ +// NIST SP 800-90A Rev. 1 +// Recommendation for Random Number Generation Using Deterministic Random Bit Generators +// June 2015 -// -// Defines below based on 10.1, Table 2: Definitions for Hash-Based DRBG Mechanisms (p 39) -// +// See in particular: +// - 9 DRBG Mechanism Functions +// - 10.1.2 HMAC_DRBG +// - B.2 HMAC_DRBGExample -#define NH_MAX_OUTPUT_BLOCK_SIZE (CCSHA512_OUTPUT_SIZE) // 512 bits, i.e. 64 bytes (CCSHA512_OUTPUT_SIZE) -#define NH_MAX_KEY_SIZE (CCSHA512_OUTPUT_SIZE) // 512 bits, i.e. 64 bytes (CCSHA512_OUTPUT_SIZE) +#define NISTHMAC_MAX_OUTPUT_SIZE (CCSHA512_OUTPUT_SIZE) -#define MIN_REQ_ENTROPY(di) ((di)->output_size/2) +#define MIN_REQ_ENTROPY(di) ((di)->output_size / 2) struct ccdrbg_nisthmac_state { - const struct ccdrbg_nisthmac_custom *custom; //ccdrbg_nisthmac_state does not need to store ccdrbg_info. ccdrbg_nisthmac_custom is sufficient - size_t bytesLeft; - uint64_t reseed_counter; // the reseed counter should be able to hole 2^^48. size_t might be smaller than 48 bits - size_t vsize; - size_t keysize; - uint8_t v[2 * NH_MAX_OUTPUT_BLOCK_SIZE]; - uint8_t *vptr; - uint8_t *nextvptr; - uint8_t key[NH_MAX_KEY_SIZE]; + const struct ccdrbg_nisthmac_custom *custom; + uint8_t key[NISTHMAC_MAX_OUTPUT_SIZE]; + uint8_t V[NISTHMAC_MAX_OUTPUT_SIZE]; + uint64_t reseed_counter; }; #define DRBG_NISTHMAC_DEBUG 0 - #if DRBG_NISTHMAC_DEBUG -#include "cc_debug.h" +#include static void -dumpState(const char *label, struct ccdrbg_nisthmac_state *state) +dump_state(const char *label, struct ccdrbg_nisthmac_state *drbg_ctx) { - //cc_print(label, state->vsize, state->nextvptr); - cc_print(label, state->vsize, state->vptr); - cc_print(label, state->keysize, state->key); + size_t outlen = drbg_ctx->custom->di->output_size; + + cc_print(label, outlen, drbg_ctx->key); + cc_print(label, outlen, drbg_ctx->V); } #endif +// See NIST SP 800-90A, Rev. 1, 9.4 +static void +done(struct ccdrbg_state *ctx) +{ + cc_clear(sizeof(struct ccdrbg_nisthmac_state), ctx); +} + +// See NIST SP 800-90A, Rev. 1, 10.1.2.2 +static void +update(struct ccdrbg_state *ctx, unsigned ndata, ...) +{ + struct ccdrbg_nisthmac_state *drbg_ctx = (struct ccdrbg_nisthmac_state *)ctx; + const struct ccdigest_info *info = drbg_ctx->custom->di; + size_t outlen = info->output_size; + size_t data_nbytes = 0; + va_list args; -static void done(struct ccdrbg_state *drbg); + cchmac_di_decl(info, hmac_ctx); -/* - * NIST SP 800-90A, Rev. 1 HMAC_DRBG April 2014, p 46 - * - * HMAC_DRBG_Update (provided_data, K, V): - * 1. provided_data: The data to be used. - * 2. K: The current value of Key. - * 3. V: The current value of V. - * Output: - * 1. K: The new value for Key. - * 2. V: The new value for V. - * - * HMAC_DRBG Update Process: - * - * 1. K = HMAC (K, V || 0x00 || provided_data). - * 2. V=HMAC(K,V). - * 3. If (provided_data = Null), then return K and V. - * 4. K = HMAC (K, V || 0x01 || provided_data). - * 5. V=HMAC(K,V). - * 6. Return K and V. - */ + for (uint8_t b = 0; b < 2; b += 1) { + cchmac_init(info, hmac_ctx, outlen, drbg_ctx->key); -// was: size_t providedDataLength, const void *providedData + cchmac_update(info, hmac_ctx, outlen, drbg_ctx->V); -/* - * To handle the case where we have three strings that are concatenated, - * we pass in three (ptr, len) pairs - */ + cchmac_update(info, hmac_ctx, sizeof(b), &b); -static int -hmac_dbrg_update(struct ccdrbg_state *drbg, - size_t daLen, const void *da, - size_t dbLen, const void *db, - size_t dcLen, const void *dc - ) -{ - int rc = CCDRBG_STATUS_ERROR; - struct ccdrbg_nisthmac_state *state = (struct ccdrbg_nisthmac_state *)drbg; - const struct ccdigest_info *di = state->custom->di; + va_start(args, ndata); - const unsigned char cZero = 0x00; - const unsigned char cOne = 0x01; + for (unsigned i = 0; i < ndata; i += 1) { + size_t nbytes = va_arg(args, size_t); + const void *buf = va_arg(args, const void *); - cchmac_ctx_decl(di->state_size, di->block_size, ctx); - cchmac_init(di, ctx, state->keysize, state->key); + cchmac_update(info, hmac_ctx, nbytes, buf); - // 1. K = HMAC (K, V || 0x00 || provided_data). - cchmac_update(di, ctx, state->vsize, state->vptr); - cchmac_update(di, ctx, 1, &cZero); - if (da && daLen) { - cchmac_update(di, ctx, daLen, da); - } - if (db && dbLen) { - cchmac_update(di, ctx, dbLen, db); - } - if (dc && dcLen) { - cchmac_update(di, ctx, dcLen, dc); - } - cchmac_final(di, ctx, state->key); - - // One parameter must be non-empty, or return - if (((da && daLen) || (db && dbLen) || (dc && dcLen))) { - // 2. V=HMAC(K,V). - cchmac(di, state->keysize, state->key, state->vsize, state->vptr, state->vptr); - // 4. K = HMAC (K, V || 0x01 || provided_data). - cchmac_init(di, ctx, state->keysize, state->key); - cchmac_update(di, ctx, state->vsize, state->vptr); - cchmac_update(di, ctx, 1, &cOne); - if (da && daLen) { - cchmac_update(di, ctx, daLen, da); - } - if (db && dbLen) { - cchmac_update(di, ctx, dbLen, db); + data_nbytes += nbytes; } - if (dc && dcLen) { - cchmac_update(di, ctx, dcLen, dc); - } - cchmac_final(di, ctx, state->key); - } - // If additional data 5. V=HMAC(K,V) - // If no addtional data, this is step 2. V=HMAC(K,V). - state->bytesLeft = 0; - - // FIPS 140-2 4.9.2 Conditional Tests - // "the first n-bit block generated after power-up, initialization, or reset shall not be used, but shall be saved for comparison with the next n-bit block to be generated" - // Generate the first block and the second block. Compare for FIPS and discard the first block - // We keep the second block as the first set of data to be returned - cchmac(di, state->keysize, state->key, state->vsize, state->vptr, state->vptr); // First block - cchmac(di, state->keysize, state->key, state->vsize, state->vptr, state->nextvptr); // First to be returned - if (0 == cc_cmp_safe(state->vsize, state->vptr, state->nextvptr)) { - //The world as we know it has come to an end - //the DRBG data structure is zeroized. subsequent calls to - //DRBG ends up in NULL dereferencing and/or unpredictable state. - //catastrophic error in SP 800-90A - done(drbg); - rc = CCDRBG_STATUS_ABORT; - cc_try_abort(NULL); - goto errOut; - } - rc = CCDRBG_STATUS_OK; -errOut: - return rc; -} - -//make sure state is initialized, before calling this function -static int -validate_inputs(struct ccdrbg_nisthmac_state *state, - size_t entropyLength, - size_t additionalInputLength, - size_t psLength) -{ - int rc; - const struct ccdrbg_nisthmac_custom *custom = state->custom; - const struct ccdigest_info *di = custom->di; - rc = CCDRBG_STATUS_ERROR; - //buffer size checks - cc_require(di->output_size <= sizeof(state->v) / 2, end); //digest size too long - cc_require(di->output_size <= sizeof(state->key), end); //digest size too long + va_end(args); - //NIST SP800 compliance checks - //the following maximum checks are redundant if long is 32 bits. + cchmac_final(info, hmac_ctx, drbg_ctx->key); - rc = CCDRBG_STATUS_PARAM_ERROR; - cc_require(psLength <= CCDRBG_MAX_PSINPUT_SIZE, end); //personalization string too long - cc_require(entropyLength <= CCDRBG_MAX_ENTROPY_SIZE, end); //supplied too much entropy - cc_require(additionalInputLength <= CCDRBG_MAX_ADDITIONALINPUT_SIZE, end); //additional input too long - cc_require(entropyLength >= MIN_REQ_ENTROPY(di), end); //supplied too litle entropy + cchmac(info, outlen, drbg_ctx->key, outlen, drbg_ctx->V, drbg_ctx->V); - cc_require(di->output_size <= NH_MAX_OUTPUT_BLOCK_SIZE, end); //the requested security strength is not supported + if (data_nbytes == 0) { + break; + } + } - rc = CCDRBG_STATUS_OK; -end: - return rc; + cchmac_di_clear(info, hmac_ctx); } -/* - * NIST SP 800-90A, Rev. 1 April 2014 B.2.2, p 84 - * - * HMAC_DRBG_Instantiate_algorithm (...): - * Input: bitstring (entropy_input, personalization_string). - * Output: bitstring (V, Key), integer reseed_counter. - * - * Process: - * 1. seed_material = entropy_input || personalization_string. - * 2. Set Key to outlen bits of zeros. - * 3. Set V to outlen/8 bytes of 0x01. - * 4. (Key, V) = HMAC_DRBG_Update (seed_material, Key, V). - * 5. reseed_counter = 1. - * 6. Return (V, Key, reseed_counter). - */ - -// This version does not do memory allocation -//SP800-90 A: Required minimum entropy for instantiate and reseed=security_strength +static bool +entropy_isvalid(size_t entropy_nbytes, const struct ccdigest_info *info) +{ + return (entropy_nbytes <= CCDRBG_MAX_ENTROPY_SIZE) && (entropy_nbytes >= MIN_REQ_ENTROPY(info)); +} +// See NIST SP 800-90A, Rev. 1, 9.1 and 10.1.2.3 static int -hmac_dbrg_instantiate_algorithm(struct ccdrbg_state *drbg, - size_t entropyLength, const void *entropy, - size_t nonceLength, const void *nonce, - size_t psLength, const void *ps) +init(const struct ccdrbg_info *info, + struct ccdrbg_state *ctx, + size_t entropy_nbytes, + const void *entropy, + size_t nonce_nbytes, + const void *nonce, + size_t ps_nbytes, + const void *ps) { - // TODO: The NIST code passes nonce (i.e. HMAC key) to generate, but cc interface isn't set up that way - struct ccdrbg_nisthmac_state *state = (struct ccdrbg_nisthmac_state *)drbg; + struct ccdrbg_nisthmac_state *drbg_ctx = (struct ccdrbg_nisthmac_state *)ctx; + drbg_ctx->custom = info->custom; + const struct ccdigest_info *digest_info = drbg_ctx->custom->di; + size_t outlen = digest_info->output_size; - // 1. seed_material = entropy_input || nonce || personalization_string. + int status = CCDRBG_STATUS_PARAM_ERROR; + cc_require(outlen <= NISTHMAC_MAX_OUTPUT_SIZE, out); + cc_require(entropy_isvalid(entropy_nbytes, digest_info), out); + cc_require(ps_nbytes <= CCDRBG_MAX_PSINPUT_SIZE, out); - // 2. Set Key to outlen bits of zeros. - cc_zero(state->keysize, state->key); + status = CCDRBG_STATUS_OK; - // 3. Set V to outlen/8 bytes of 0x01. - CC_MEMSET(state->vptr, 0x01, state->vsize); + cc_memset(drbg_ctx->key, 0, outlen); + cc_memset(drbg_ctx->V, 1, outlen); - // 4. (Key, V) = HMAC_DRBG_Update (seed_material, Key, V). - hmac_dbrg_update(drbg, entropyLength, entropy, nonceLength, nonce, psLength, ps); + update(ctx, 3, entropy_nbytes, entropy, nonce_nbytes, nonce, ps_nbytes, ps); - // 5. reseed_counter = 1. - state->reseed_counter = 1; + drbg_ctx->reseed_counter = 1; - return CCDRBG_STATUS_OK; +out: + return status; } -// In NIST terminology, the nonce is the HMAC key and ps is the personalization string -// We assume that the caller has passed in -// min_entropy = NH_REQUIRED_MIN_ENTROPY(security_strength) -// bytes of entropy +static bool +add_isvalid(size_t add_nbytes) +{ + return add_nbytes <= CCDRBG_MAX_ADDITIONALINPUT_SIZE; +} +// See NIST SP 800-90A, Rev. 1, 9.2 and 10.1.2.4 static int -init(const struct ccdrbg_info *info, struct ccdrbg_state *drbg, - size_t entropyLength, const void* entropy, - size_t nonceLength, const void* nonce, - size_t psLength, const void* ps) +reseed(struct ccdrbg_state *ctx, size_t entropy_nbytes, const void *entropy, size_t add_nbytes, const void *add) { - struct ccdrbg_nisthmac_state *state = (struct ccdrbg_nisthmac_state *)drbg; - state->bytesLeft = 0; - state->custom = info->custom; //we only need to get the custom parameter from the info structure. - - int rc = validate_inputs(state, entropyLength, 0, psLength); - if (rc != CCDRBG_STATUS_OK) { - //clear everything if cannot initialize. The idea is that if the caller doesn't check the output of init() and init() fails, - //the system crashes by NULL dereferencing after a call to generate, rather than generating bad random numbers. - done(drbg); - return rc; - } - - const struct ccdigest_info *di = state->custom->di; - state->vsize = di->output_size; - state->keysize = di->output_size; - state->vptr = state->v; - state->nextvptr = state->v + state->vsize; - - // 7. (V, Key, reseed_counter) = HMAC_DRBG_Instantiate_algorithm (entropy_input, personalization_string). - hmac_dbrg_instantiate_algorithm(drbg, entropyLength, entropy, nonceLength, nonce, psLength, ps); + struct ccdrbg_nisthmac_state *drbg_ctx = (struct ccdrbg_nisthmac_state *)ctx; + const struct ccdigest_info *digest_info = drbg_ctx->custom->di; -#if DRBG_NISTHMAC_DEBUG - dumpState("Init: ", state); -#endif - return CCDRBG_STATUS_OK; -} + int status = CCDRBG_STATUS_PARAM_ERROR; + cc_require(entropy_isvalid(entropy_nbytes, digest_info), out); + cc_require(add_isvalid(add_nbytes), out); -/* - * 10.1.2.4 Reseeding an HMAC_DRBG Instantiation - * Notes for the reseed function specified in Section 9.2: - * The reseeding of an HMAC_DRBG instantiation requires a call to the Reseed_function specified in Section 9.2. - * Process step 6 of that function calls the reseed algorithm specified in this section. The values for min_length - * are provided in Table 2 of Section 10.1. - * - * The reseed algorithm: - * Let HMAC_DRBG_Update be the function specified in Section 10.1.2.2. The following process or its equivalent - * shall be used as the reseed algorithm for this DRBG mechanism (see step 6 of the reseed process in Section 9.2): - * - * HMAC_DRBG_Reseed_algorithm (working_state, entropy_input, additional_input): - * 1. working_state: The current values for V, Key and reseed_counter (see Section 10.1.2.1). - * 2. entropy_input: The string of bits obtained from the source of entropy input. - * 3. additional_input: The additional input string received from the consuming application. - * Note that the length of the additional_input string may be zero. - * - * Output: - * 1. new_working_state: The new values for V, Key and reseed_counter. HMAC_DRBG Reseed Process: - * 1. seed_material = entropy_input || additional_input. - * 2. (Key, V) = HMAC_DRBG_Update (seed_material, Key, V). 3. reseed_counter = 1. - * 4. Return V, Key and reseed_counter as the new_working_state. - */ + status = CCDRBG_STATUS_OK; -static int -reseed(struct ccdrbg_state *drbg, - size_t entropyLength, const void *entropy, - size_t additionalLength, const void *additional) -{ - struct ccdrbg_nisthmac_state *state = (struct ccdrbg_nisthmac_state *)drbg; - int rc = validate_inputs(state, entropyLength, additionalLength, 0); - if (rc != CCDRBG_STATUS_OK) { - return rc; - } + update(ctx, 2, entropy_nbytes, entropy, add_nbytes, add); - int rx = hmac_dbrg_update(drbg, entropyLength, entropy, additionalLength, additional, 0, NULL); - state->reseed_counter = 1; + drbg_ctx->reseed_counter = 1; -#if DRBG_NISTHMAC_DEBUG - dumpState("Reseed: ", state); -#endif - return rx; +out: + return status; } -/* - * HMAC_DRBG_Generate_algorithm: - * Input: bitstring (V, Key), integer (reseed_counter, requested_number_of_bits). - * Output: string status, bitstring (pseudorandom_bits, V, Key), integer reseed_counter. - * - * Process: - * 1. If (reseed_counter ≥ 10,000), then Return (“Reseed required”, Null, V, Key, reseed_counter). - * 2. temp = Null. - * 3. While (len (temp) < requested_no_of_bits) do: - * 3.1 V = HMAC (Key, V). - * 3.2 temp = temp || V. - * 4. pseudorandom_bits = Leftmost (requested_no_of_bits) of temp. - * 5. (Key, V) = HMAC_DRBG_Update (Null, Key, V). - * 6. reseed_counter = reseed_counter + 1. - * 7. Return (“Success”, pseudorandom_bits, V, Key, reseed_counter). - */ - +// See NIST SP 800-90A, Rev. 1, 9.3 and 10.1.2.5 static int -validate_gen_params(uint64_t reseed_counter, size_t dataOutLength, size_t additionalLength) +generate(struct ccdrbg_state *ctx, size_t out_nbytes, void *out, size_t add_nbytes, const void *add) { - int rc = CCDRBG_STATUS_PARAM_ERROR; - - // Zero byte in one request is a valid use-case (21208820) - cc_require(dataOutLength <= CCDRBG_MAX_REQUEST_SIZE, end); //Requested too many bytes in one request - cc_require(additionalLength <= CCDRBG_MAX_ADDITIONALINPUT_SIZE, end); //Additional input too long + struct ccdrbg_nisthmac_state *drbg_ctx = (struct ccdrbg_nisthmac_state *)ctx; + const struct ccdigest_info *info = drbg_ctx->custom->di; + size_t outlen = info->output_size; - // 1. If (reseed_counter > 2^^48), then Return (“Reseed required”, Null, V, Key, reseed_counter). - rc = CCDRBG_STATUS_NEED_RESEED; - cc_require(reseed_counter <= CCDRBG_RESEED_INTERVAL, end); //Reseed required + int status = CCDRBG_STATUS_PARAM_ERROR; + cc_require(out_nbytes <= CCDRBG_MAX_REQUEST_SIZE, out); + cc_require(add_isvalid(add_nbytes), out); - rc = CCDRBG_STATUS_OK; + status = CCDRBG_STATUS_NEED_RESEED; + cc_require(drbg_ctx->reseed_counter <= CCDRBG_RESEED_INTERVAL || !drbg_ctx->custom->strictFIPS, out); -end: - return rc; -} - -static int -generate(struct ccdrbg_state *drbg, size_t dataOutLength, void *dataOut, - size_t additionalLength, const void *additional) -{ - struct ccdrbg_nisthmac_state *state = (struct ccdrbg_nisthmac_state *)drbg; - const struct ccdrbg_nisthmac_custom *custom = state->custom; - const struct ccdigest_info *di = custom->di; + status = CCDRBG_STATUS_OK; - int rc = validate_gen_params(state->reseed_counter, dataOutLength, additional == NULL?0:additionalLength); - if (rc != CCDRBG_STATUS_OK) { - return rc; + if (add_nbytes > 0) { + update(ctx, 1, add_nbytes, add); } - // 2. If additional_input ≠ Null, then (Key, V) = HMAC_DRBG_Update (additional_input, Key, V). - if (additional && additionalLength) { - hmac_dbrg_update(drbg, additionalLength, additional, 0, NULL, 0, NULL); - } + uint8_t *out_bytes = out; + uint8_t Vprev[NISTHMAC_MAX_OUTPUT_SIZE]; - // hmac_dbrg_generate_algorithm - char *outPtr = (char *) dataOut; - while (dataOutLength > 0) { - if (!state->bytesLeft) { - // 5. V=HMAC(K,V). - cchmac(di, state->keysize, state->key, state->vsize, state->nextvptr, state->vptr); // Won't be returned - // FIPS 140-2 4.9.2 Conditional Tests - // "Each subsequent generation of an n-bit block shall be compared with the previously generated block. The test shall fail if any two compared n-bit blocks are equal." - if (0 == cc_cmp_safe(state->vsize, state->vptr, state->nextvptr)) { - //The world as we know it has come to an end - //the DRBG data structure is zeroized. subsequent calls to - //DRBG ends up in NULL dereferencing and/or unpredictable state. - //catastrophic error in SP 800-90A - done(drbg); - rc = CCDRBG_STATUS_ABORT; - cc_try_abort(NULL); - goto errOut; - } - CC_SWAP(state->nextvptr, state->vptr); - state->bytesLeft = state->vsize; -#if DRBG_NISTHMAC_DEBUG - cc_print("generate blk: ", state->vsize, state->vptr); -#endif + while (out_nbytes > 0) { + cc_memcpy(Vprev, drbg_ctx->V, outlen); + cchmac(info, outlen, drbg_ctx->key, outlen, drbg_ctx->V, drbg_ctx->V); + + // See FIPS 140-2, 4.9.2 Conditional Tests + if (cc_cmp_safe(outlen, Vprev, drbg_ctx->V) == 0) { + done(ctx); + status = CCDRBG_STATUS_ABORT; + cc_try_abort(NULL); + goto out; } - size_t outLength = dataOutLength > state->bytesLeft ? state->bytesLeft : dataOutLength; - CC_MEMCPY(outPtr, state->vptr, outLength); - state->bytesLeft -= outLength; - outPtr += outLength; - dataOutLength -= outLength; - } - // 6. (Key, V) = HMAC_DRBG_Update (additional_input, Key, V). - hmac_dbrg_update(drbg, additionalLength, additional, 0, NULL, 0, NULL); + size_t n = CC_MIN(out_nbytes, outlen); + cc_memcpy(out_bytes, drbg_ctx->V, n); - // 7. reseed_counter = reseed_counter + 1. - state->reseed_counter++; + out_bytes += n; + out_nbytes -= n; + } -#if DRBG_NISTHMAC_DEBUG - dumpState("generate end: ", state); - cc_print("generate end nxt: ", state->vsize, state->nextvptr); -#endif - rc = CCDRBG_STATUS_OK; -errOut: - return rc; -} + update(ctx, 1, add_nbytes, add); -static void -done(struct ccdrbg_state *drbg) -{ - struct ccdrbg_nisthmac_state *state = (struct ccdrbg_nisthmac_state *)drbg; - cc_clear(sizeof(struct ccdrbg_nisthmac_state), state); //clear v, key as well as internal variables -} + drbg_ctx->reseed_counter += 1; -struct ccdrbg_info ccdrbg_nisthmac_info = { - .size = sizeof(struct ccdrbg_nisthmac_state) + sizeof(struct ccdrbg_nisthmac_custom), - .init = init, - .reseed = reseed, - .generate = generate, - .done = done, - .custom = NULL -}; +out: + cc_clear(outlen, Vprev); + return status; +} -/* This initializes an info object with the right options */ void ccdrbg_factory_nisthmac(struct ccdrbg_info *info, const struct ccdrbg_nisthmac_custom *custom) { diff --git a/osfmk/corecrypto/ccdigest/src/ccdigest_init.c b/osfmk/corecrypto/ccdigest/src/ccdigest_init.c index 9dc776366..0ba754841 100644 --- a/osfmk/corecrypto/ccdigest/src/ccdigest_init.c +++ b/osfmk/corecrypto/ccdigest/src/ccdigest_init.c @@ -32,7 +32,7 @@ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ -#include +#include #include void diff --git a/osfmk/corecrypto/ccdigest/src/ccdigest_update.c b/osfmk/corecrypto/ccdigest/src/ccdigest_update.c index 94b29a172..6856c4e74 100644 --- a/osfmk/corecrypto/ccdigest/src/ccdigest_update.c +++ b/osfmk/corecrypto/ccdigest/src/ccdigest_update.c @@ -36,12 +36,16 @@ #include void -ccdigest_update(const struct ccdigest_info *di, ccdigest_ctx_t ctx, - size_t len, const void *data) +ccdigest_update(const struct ccdigest_info *di, ccdigest_ctx_t ctx, size_t len, const void *data) { const char * data_ptr = data; size_t nblocks, nbytes; + // Sanity check to recover from ctx corruptions. + if (ccdigest_num(di, ctx) >= di->block_size) { + ccdigest_num(di, ctx) = 0; + } + while (len > 0) { if (ccdigest_num(di, ctx) == 0 && len > di->block_size) { //low-end processors are slow on divison @@ -59,13 +63,10 @@ ccdigest_update(const struct ccdigest_info *di, ccdigest_ctx_t ctx, di->compress(ccdigest_state(di, ctx), nblocks, data_ptr); len -= nbytes; data_ptr += nbytes; - ccdigest_nbits(di, ctx) += nbytes * 8; + ccdigest_nbits(di, ctx) += (uint64_t) (nbytes) * 8; } else { - size_t n = di->block_size - ccdigest_num(di, ctx); - if (len < n) { - n = len; - } - CC_MEMCPY(ccdigest_data(di, ctx) + ccdigest_num(di, ctx), data_ptr, n); + size_t n = CC_MIN(di->block_size - ccdigest_num(di, ctx), len); + cc_memcpy(ccdigest_data(di, ctx) + ccdigest_num(di, ctx), data_ptr, n); /* typecast: less than block size, will always fit into an int */ ccdigest_num(di, ctx) += (unsigned int)n; len -= n; diff --git a/osfmk/corecrypto/cchmac/src/cchmac_final.c b/osfmk/corecrypto/cchmac/src/cchmac_final.c index dc72c7adb..bb25887fe 100644 --- a/osfmk/corecrypto/cchmac/src/cchmac_final.c +++ b/osfmk/corecrypto/cchmac/src/cchmac_final.c @@ -32,6 +32,7 @@ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ +#include #include #include #include diff --git a/osfmk/corecrypto/cchmac/src/cchmac_init.c b/osfmk/corecrypto/cchmac/src/cchmac_init.c index 4eba5b23a..1d5d799f5 100644 --- a/osfmk/corecrypto/cchmac/src/cchmac_init.c +++ b/osfmk/corecrypto/cchmac/src/cchmac_init.c @@ -32,6 +32,7 @@ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ +#include #include #include #include @@ -67,7 +68,7 @@ cchmac_init(const struct ccdigest_info *di, cchmac_ctx_t hc, } /* Fill remainder of cchmac_data(di, hc) with opad. */ if (key_len < di->block_size) { - CC_MEMSET(cchmac_data(di, hc) + key_len, 0x5c, di->block_size - key_len); + cc_memset(cchmac_data(di, hc) + key_len, 0x5c, di->block_size - key_len); } /* Set cchmac_ostate32(di, hc) to the state of the first round of the diff --git a/osfmk/corecrypto/ccn/src/ccn_set.c b/osfmk/corecrypto/ccn/src/ccn_set.c deleted file mode 100644 index 4cd06a506..000000000 --- a/osfmk/corecrypto/ccn/src/ccn_set.c +++ /dev/null @@ -1,44 +0,0 @@ -/* - * ccn_set.c - * corecrypto - * - * Created on 02/17/2012 - * - * Copyright (c) 2012,2014,2015 Apple Inc. All rights reserved. - * - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ - -#include -#include - -#if !CCN_SET_ASM -void -ccn_set(cc_size n, cc_unit *r, const cc_unit *s) -{ - CC_MEMMOVE(r, s, ccn_sizeof_n(n)); -} -#endif diff --git a/osfmk/corecrypto/ccsha1/src/ccdigest_final_64be.c b/osfmk/corecrypto/ccsha1/src/ccdigest_final_64be.c index 8b30793d5..56234ff33 100644 --- a/osfmk/corecrypto/ccsha1/src/ccdigest_final_64be.c +++ b/osfmk/corecrypto/ccsha1/src/ccdigest_final_64be.c @@ -38,32 +38,42 @@ /* This can be used for SHA1, SHA256 and SHA224 */ void -ccdigest_final_64be(const struct ccdigest_info *di, ccdigest_ctx_t ctx, - unsigned char *digest) +ccdigest_final_64be(const struct ccdigest_info *di, ccdigest_ctx_t ctx, unsigned char *digest) { - ccdigest_nbits(di, ctx) += ccdigest_num(di, ctx) * 8; - ccdigest_data(di, ctx)[ccdigest_num(di, ctx)++] = 0x80; + // Sanity check to recover from ctx corruptions. + if (ccdigest_num(di, ctx) >= di->block_size) { + ccdigest_num(di, ctx) = 0; + } + + // Clone the state. + ccdigest_di_decl(di, tmp); + cc_memcpy(tmp, ctx, ccdigest_di_size(di)); + + ccdigest_nbits(di, tmp) += ccdigest_num(di, tmp) * 8; + ccdigest_data(di, tmp)[ccdigest_num(di, tmp)++] = 0x80; /* If we don't have at least 8 bytes (for the length) left we need to add * a second block. */ - if (ccdigest_num(di, ctx) > 64 - 8) { - while (ccdigest_num(di, ctx) < 64) { - ccdigest_data(di, ctx)[ccdigest_num(di, ctx)++] = 0; + if (ccdigest_num(di, tmp) > 64 - 8) { + while (ccdigest_num(di, tmp) < 64) { + ccdigest_data(di, tmp)[ccdigest_num(di, tmp)++] = 0; } - di->compress(ccdigest_state(di, ctx), 1, ccdigest_data(di, ctx)); - ccdigest_num(di, ctx) = 0; + di->compress(ccdigest_state(di, tmp), 1, ccdigest_data(di, tmp)); + ccdigest_num(di, tmp) = 0; } /* pad upto block_size minus 8 with 0s */ - while (ccdigest_num(di, ctx) < 64 - 8) { - ccdigest_data(di, ctx)[ccdigest_num(di, ctx)++] = 0; + while (ccdigest_num(di, tmp) < 64 - 8) { + ccdigest_data(di, tmp)[ccdigest_num(di, tmp)++] = 0; } - CC_STORE64_BE(ccdigest_nbits(di, ctx), ccdigest_data(di, ctx) + 64 - 8); - di->compress(ccdigest_state(di, ctx), 1, ccdigest_data(di, ctx)); + CC_STORE64_BE(ccdigest_nbits(di, tmp), ccdigest_data(di, tmp) + 64 - 8); + di->compress(ccdigest_state(di, tmp), 1, ccdigest_data(di, tmp)); /* copy output */ for (unsigned int i = 0; i < di->output_size / 4; i++) { - CC_STORE32_BE(ccdigest_state_u32(di, ctx)[i], digest + (4 * i)); + CC_STORE32_BE(ccdigest_state_u32(di, tmp)[i], digest + (4 * i)); } + + ccdigest_di_clear(di, tmp); } diff --git a/osfmk/corecrypto/ccsha1/src/ccdigest_internal.h b/osfmk/corecrypto/ccsha1/src/ccdigest_internal.h index 59c6acc04..f055084b0 100644 --- a/osfmk/corecrypto/ccsha1/src/ccdigest_internal.h +++ b/osfmk/corecrypto/ccsha1/src/ccdigest_internal.h @@ -37,8 +37,6 @@ #include -void ccdigest_final_common(const struct ccdigest_info *di, - ccdigest_ctx_t ctx, void *digest); void ccdigest_final_64be(const struct ccdigest_info *di, ccdigest_ctx_t, unsigned char *digest); void ccdigest_final_64le(const struct ccdigest_info *di, ccdigest_ctx_t, diff --git a/osfmk/corecrypto/ccsha1/src/ccsha1_eay.c b/osfmk/corecrypto/ccsha1/src/ccsha1_eay.c deleted file mode 100644 index 22941eb91..000000000 --- a/osfmk/corecrypto/ccsha1/src/ccsha1_eay.c +++ /dev/null @@ -1,309 +0,0 @@ -/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) - * All rights reserved. - * - * This package is an SSL implementation written - * by Eric Young (eay@cryptsoft.com). - * The implementation was written so as to conform with Netscapes SSL. - * - * This library is free for commercial and non-commercial use as long as - * the following conditions are aheared to. The following conditions - * apply to all code found in this distribution, be it the RC4, RSA, - * lhash, DES, etc., code; not just the SSL code. The SSL documentation - * included with this distribution is covered by the same copyright terms - * except that the holder is Tim Hudson (tjh@cryptsoft.com). - * - * Copyright remains Eric Young's, and as such any Copyright notices in - * the code are not to be removed. - * If this package is used in a product, Eric Young should be given attribution - * as the author of the parts of the library used. - * This can be in the form of a textual message at program startup or - * in documentation (online or textual) provided with the package. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * "This product includes cryptographic software written by - * Eric Young (eay@cryptsoft.com)" - * The word 'cryptographic' can be left out if the rouines from the library - * being used are not cryptographic related :-). - * 4. If you include any Windows specific code (or a derivative thereof) from - * the apps directory (application code) you must include an acknowledgement: - * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)" - * - * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * The licence and distribution terms for any publically available version or - * derivative of this code cannot be changed. i.e. this code cannot simply be - * copied and put under another distribution licence - * [including the GNU Public Licence.] - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ - - -#include -#include "ccsha1_internal.h" -#include -#include -#include "ccdigest_internal.h" - - -#ifndef SHA_LONG_LOG2 -#define SHA_LONG_LOG2 2 /* default to 32 bits */ -#endif - - -#define ROTATE(b, n) CC_ROLc(b, n) - -#define Xupdate(a, ix, ia, ib, ic, id) ( (a)=(ia^ib^ic^id),\ - ix=(a)=ROTATE((a),1) \ - ) - -#define MD32_REG_T uint32_t - -#define HOST_c2l(data, l) CC_LOAD32_BE(l, data); data+=4; - -#define K_00_19 0x5a827999 -#define K_20_39 0x6ed9eba1 -#define K_40_59 0x8f1bbcdc -#define K_60_79 0xca62c1d6 - -/* As pointed out by Wei Dai , F() below can be - * simplified to the code in F_00_19. Wei attributes these optimisations - * to Peter Gutmann's SHS code, and he attributes it to Rich Schroeppel. - * #define F(x,y,z) (((x) & (y)) | ((~(x)) & (z))) - * I've just become aware of another tweak to be made, again from Wei Dai, - * in F_40_59, (x&a)|(y&a) -> (x|y)&a - */ -#define F_00_19(b, c, d) ((((c) ^ (d)) & (b)) ^ (d)) -#define F_20_39(b, c, d) ((b) ^ (c) ^ (d)) -#define F_40_59(b, c, d) (((b) & (c)) | (((b)|(c)) & (d))) -#define F_60_79(b, c, d) F_20_39(b,c,d) - -#define BODY_00_15(i, a, b, c, d, e, f, xi) \ - (f)=xi+(e)+K_00_19+ROTATE((a),5)+F_00_19((b),(c),(d)); \ - (b)=ROTATE((b),30); - -#define BODY_16_19(i, a, b, c, d, e, f, xi, xa, xb, xc, xd) \ - Xupdate(f,xi,xa,xb,xc,xd); \ - (f)+=(e)+K_00_19+ROTATE((a),5)+F_00_19((b),(c),(d)); \ - (b)=ROTATE((b),30); - -#define BODY_20_31(i, a, b, c, d, e, f, xi, xa, xb, xc, xd) \ - Xupdate(f,xi,xa,xb,xc,xd); \ - (f)+=(e)+K_20_39+ROTATE((a),5)+F_20_39((b),(c),(d)); \ - (b)=ROTATE((b),30); - -#define BODY_32_39(i, a, b, c, d, e, f, xa, xb, xc, xd) \ - Xupdate(f,xa,xa,xb,xc,xd); \ - (f)+=(e)+K_20_39+ROTATE((a),5)+F_20_39((b),(c),(d)); \ - (b)=ROTATE((b),30); - -#define BODY_40_59(i, a, b, c, d, e, f, xa, xb, xc, xd) \ - Xupdate(f,xa,xa,xb,xc,xd); \ - (f)+=(e)+K_40_59+ROTATE((a),5)+F_40_59((b),(c),(d)); \ - (b)=ROTATE((b),30); - -#define BODY_60_79(i, a, b, c, d, e, f, xa, xb, xc, xd) \ - Xupdate(f,xa,xa,xb,xc,xd); \ - (f)=xa+(e)+K_60_79+ROTATE((a),5)+F_60_79((b),(c),(d)); \ - (b)=ROTATE((b),30); - -#ifdef X -#undef X -#endif - -#ifndef MD32_XARRAY -/* - * Originally X was an array. As it's automatic it's natural - * to expect RISC compiler to accomodate at least part of it in - * the register bank, isn't it? Unfortunately not all compilers - * "find" this expectation reasonable:-( On order to make such - * compilers generate better code I replace X[] with a bunch of - * X0, X1, etc. See the function body below... - * - */ -# define X(i) XX##i -#else -/* - * However! Some compilers (most notably HP C) get overwhelmed by - * that many local variables so that we have to have the way to - * fall down to the original behavior. - */ -# define X(i) XX[i] -#endif - -static void -sha1_compress(ccdigest_state_t s, size_t num, const void *buf) -{ - const unsigned char *data = buf; - register uint32_t A, B, C, D, E, T, l; -#ifndef MD32_XARRAY - uint32_t XX0, XX1, XX2, XX3, XX4, XX5, XX6, XX7, - XX8, XX9, XX10, XX11, XX12, XX13, XX14, XX15; -#else - uint32_t XX[16]; -#endif - uint32_t *state = ccdigest_u32(s); - - A = state[0]; - B = state[1]; - C = state[2]; - D = state[3]; - E = state[4]; - - for (;;) { - HOST_c2l(data, l); X( 0) = l; HOST_c2l(data, l); X( 1) = l; - BODY_00_15( 0, A, B, C, D, E, T, X( 0)); HOST_c2l(data, l); X( 2) = l; - BODY_00_15( 1, T, A, B, C, D, E, X( 1)); HOST_c2l(data, l); X( 3) = l; - BODY_00_15( 2, E, T, A, B, C, D, X( 2)); HOST_c2l(data, l); X( 4) = l; - BODY_00_15( 3, D, E, T, A, B, C, X( 3)); HOST_c2l(data, l); X( 5) = l; - BODY_00_15( 4, C, D, E, T, A, B, X( 4)); HOST_c2l(data, l); X( 6) = l; - BODY_00_15( 5, B, C, D, E, T, A, X( 5)); HOST_c2l(data, l); X( 7) = l; - BODY_00_15( 6, A, B, C, D, E, T, X( 6)); HOST_c2l(data, l); X( 8) = l; - BODY_00_15( 7, T, A, B, C, D, E, X( 7)); HOST_c2l(data, l); X( 9) = l; - BODY_00_15( 8, E, T, A, B, C, D, X( 8)); HOST_c2l(data, l); X(10) = l; - BODY_00_15( 9, D, E, T, A, B, C, X( 9)); HOST_c2l(data, l); X(11) = l; - BODY_00_15(10, C, D, E, T, A, B, X(10)); HOST_c2l(data, l); X(12) = l; - BODY_00_15(11, B, C, D, E, T, A, X(11)); HOST_c2l(data, l); X(13) = l; - BODY_00_15(12, A, B, C, D, E, T, X(12)); HOST_c2l(data, l); X(14) = l; - BODY_00_15(13, T, A, B, C, D, E, X(13)); HOST_c2l(data, l); X(15) = l; - BODY_00_15(14, E, T, A, B, C, D, X(14)); - BODY_00_15(15, D, E, T, A, B, C, X(15)); - - BODY_16_19(16, C, D, E, T, A, B, X( 0), X( 0), X( 2), X( 8), X(13)); - BODY_16_19(17, B, C, D, E, T, A, X( 1), X( 1), X( 3), X( 9), X(14)); - BODY_16_19(18, A, B, C, D, E, T, X( 2), X( 2), X( 4), X(10), X(15)); - BODY_16_19(19, T, A, B, C, D, E, X( 3), X( 3), X( 5), X(11), X( 0)); - - BODY_20_31(20, E, T, A, B, C, D, X( 4), X( 4), X( 6), X(12), X( 1)); - BODY_20_31(21, D, E, T, A, B, C, X( 5), X( 5), X( 7), X(13), X( 2)); - BODY_20_31(22, C, D, E, T, A, B, X( 6), X( 6), X( 8), X(14), X( 3)); - BODY_20_31(23, B, C, D, E, T, A, X( 7), X( 7), X( 9), X(15), X( 4)); - BODY_20_31(24, A, B, C, D, E, T, X( 8), X( 8), X(10), X( 0), X( 5)); - BODY_20_31(25, T, A, B, C, D, E, X( 9), X( 9), X(11), X( 1), X( 6)); - BODY_20_31(26, E, T, A, B, C, D, X(10), X(10), X(12), X( 2), X( 7)); - BODY_20_31(27, D, E, T, A, B, C, X(11), X(11), X(13), X( 3), X( 8)); - BODY_20_31(28, C, D, E, T, A, B, X(12), X(12), X(14), X( 4), X( 9)); - BODY_20_31(29, B, C, D, E, T, A, X(13), X(13), X(15), X( 5), X(10)); - BODY_20_31(30, A, B, C, D, E, T, X(14), X(14), X( 0), X( 6), X(11)); - BODY_20_31(31, T, A, B, C, D, E, X(15), X(15), X( 1), X( 7), X(12)); - - BODY_32_39(32, E, T, A, B, C, D, X( 0), X( 2), X( 8), X(13)); - BODY_32_39(33, D, E, T, A, B, C, X( 1), X( 3), X( 9), X(14)); - BODY_32_39(34, C, D, E, T, A, B, X( 2), X( 4), X(10), X(15)); - BODY_32_39(35, B, C, D, E, T, A, X( 3), X( 5), X(11), X( 0)); - BODY_32_39(36, A, B, C, D, E, T, X( 4), X( 6), X(12), X( 1)); - BODY_32_39(37, T, A, B, C, D, E, X( 5), X( 7), X(13), X( 2)); - BODY_32_39(38, E, T, A, B, C, D, X( 6), X( 8), X(14), X( 3)); - BODY_32_39(39, D, E, T, A, B, C, X( 7), X( 9), X(15), X( 4)); - - BODY_40_59(40, C, D, E, T, A, B, X( 8), X(10), X( 0), X( 5)); - BODY_40_59(41, B, C, D, E, T, A, X( 9), X(11), X( 1), X( 6)); - BODY_40_59(42, A, B, C, D, E, T, X(10), X(12), X( 2), X( 7)); - BODY_40_59(43, T, A, B, C, D, E, X(11), X(13), X( 3), X( 8)); - BODY_40_59(44, E, T, A, B, C, D, X(12), X(14), X( 4), X( 9)); - BODY_40_59(45, D, E, T, A, B, C, X(13), X(15), X( 5), X(10)); - BODY_40_59(46, C, D, E, T, A, B, X(14), X( 0), X( 6), X(11)); - BODY_40_59(47, B, C, D, E, T, A, X(15), X( 1), X( 7), X(12)); - BODY_40_59(48, A, B, C, D, E, T, X( 0), X( 2), X( 8), X(13)); - BODY_40_59(49, T, A, B, C, D, E, X( 1), X( 3), X( 9), X(14)); - BODY_40_59(50, E, T, A, B, C, D, X( 2), X( 4), X(10), X(15)); - BODY_40_59(51, D, E, T, A, B, C, X( 3), X( 5), X(11), X( 0)); - BODY_40_59(52, C, D, E, T, A, B, X( 4), X( 6), X(12), X( 1)); - BODY_40_59(53, B, C, D, E, T, A, X( 5), X( 7), X(13), X( 2)); - BODY_40_59(54, A, B, C, D, E, T, X( 6), X( 8), X(14), X( 3)); - BODY_40_59(55, T, A, B, C, D, E, X( 7), X( 9), X(15), X( 4)); - BODY_40_59(56, E, T, A, B, C, D, X( 8), X(10), X( 0), X( 5)); - BODY_40_59(57, D, E, T, A, B, C, X( 9), X(11), X( 1), X( 6)); - BODY_40_59(58, C, D, E, T, A, B, X(10), X(12), X( 2), X( 7)); - BODY_40_59(59, B, C, D, E, T, A, X(11), X(13), X( 3), X( 8)); - - BODY_60_79(60, A, B, C, D, E, T, X(12), X(14), X( 4), X( 9)); - BODY_60_79(61, T, A, B, C, D, E, X(13), X(15), X( 5), X(10)); - BODY_60_79(62, E, T, A, B, C, D, X(14), X( 0), X( 6), X(11)); - BODY_60_79(63, D, E, T, A, B, C, X(15), X( 1), X( 7), X(12)); - BODY_60_79(64, C, D, E, T, A, B, X( 0), X( 2), X( 8), X(13)); - BODY_60_79(65, B, C, D, E, T, A, X( 1), X( 3), X( 9), X(14)); - BODY_60_79(66, A, B, C, D, E, T, X( 2), X( 4), X(10), X(15)); - BODY_60_79(67, T, A, B, C, D, E, X( 3), X( 5), X(11), X( 0)); - BODY_60_79(68, E, T, A, B, C, D, X( 4), X( 6), X(12), X( 1)); - BODY_60_79(69, D, E, T, A, B, C, X( 5), X( 7), X(13), X( 2)); - BODY_60_79(70, C, D, E, T, A, B, X( 6), X( 8), X(14), X( 3)); - BODY_60_79(71, B, C, D, E, T, A, X( 7), X( 9), X(15), X( 4)); - BODY_60_79(72, A, B, C, D, E, T, X( 8), X(10), X( 0), X( 5)); - BODY_60_79(73, T, A, B, C, D, E, X( 9), X(11), X( 1), X( 6)); - BODY_60_79(74, E, T, A, B, C, D, X(10), X(12), X( 2), X( 7)); - BODY_60_79(75, D, E, T, A, B, C, X(11), X(13), X( 3), X( 8)); - BODY_60_79(76, C, D, E, T, A, B, X(12), X(14), X( 4), X( 9)); - BODY_60_79(77, B, C, D, E, T, A, X(13), X(15), X( 5), X(10)); - BODY_60_79(78, A, B, C, D, E, T, X(14), X( 0), X( 6), X(11)); - BODY_60_79(79, T, A, B, C, D, E, X(15), X( 1), X( 7), X(12)); - - state[0] = (state[0] + E) & 0xffffffff; - state[1] = (state[1] + T) & 0xffffffff; - state[2] = (state[2] + A) & 0xffffffff; - state[3] = (state[3] + B) & 0xffffffff; - state[4] = (state[4] + C) & 0xffffffff; - - if (--num <= 0) { - break; - } - - A = state[0]; - B = state[1]; - C = state[2]; - D = state[3]; - E = state[4]; - } -} - -const struct ccdigest_info ccsha1_eay_di = { - .output_size = CCSHA1_OUTPUT_SIZE, - .state_size = CCSHA1_STATE_SIZE, - .block_size = CCSHA1_BLOCK_SIZE, - .oid_size = ccoid_sha1_len, - .oid = CC_DIGEST_OID_SHA1, - .initial_state = ccsha1_initial_state, - .compress = sha1_compress, - .final = ccdigest_final_64be, -}; diff --git a/osfmk/corecrypto/ccsha1/src/ccsha1_initial_state.c b/osfmk/corecrypto/ccsha1/src/ccsha1_initial_state.c deleted file mode 100644 index f72ecfd9a..000000000 --- a/osfmk/corecrypto/ccsha1/src/ccsha1_initial_state.c +++ /dev/null @@ -1,44 +0,0 @@ -/* - * ccsha1_initial_state.c - * corecrypto - * - * Created on 12/07/2010 - * - * Copyright (c) 2010,2015 Apple Inc. All rights reserved. - * - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ - -#include -#include - -const uint32_t ccsha1_initial_state[5] = { - 0x67452301, - 0xefcdab89, - 0x98badcfe, - 0x10325476, - 0xc3d2e1f0 -}; diff --git a/osfmk/corecrypto/ccsha1/src/ccsha1_internal.h b/osfmk/corecrypto/ccsha1/src/ccsha1_internal.h deleted file mode 100644 index 323bbb2cd..000000000 --- a/osfmk/corecrypto/ccsha1/src/ccsha1_internal.h +++ /dev/null @@ -1,48 +0,0 @@ -/* - * ccsha1_internal.h - * corecrypto - * - * Created on 12/19/2017 - * - * Copyright (c) 2017 Apple Inc. All rights reserved. - * - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ - -#ifndef _CORECRYPTO_CCSHA1_INTERNAL_H_ -#define _CORECRYPTO_CCSHA1_INTERNAL_H_ - -#include -#include - -extern const uint32_t ccsha1_initial_state[5]; - -#if CCSHA1_VNG_INTEL && defined(__x86_64__) -extern const struct ccdigest_info ccsha1_vng_intel_AVX2_di; -extern const struct ccdigest_info ccsha1_vng_intel_AVX1_di; -#endif - -#endif /* _CORECRYPTO_CCSHA1_INTERNAL_H_ */ diff --git a/osfmk/corecrypto/ccsha2/src/ccdigest_internal.h b/osfmk/corecrypto/ccsha2/src/ccdigest_internal.h index 59c6acc04..f055084b0 100644 --- a/osfmk/corecrypto/ccsha2/src/ccdigest_internal.h +++ b/osfmk/corecrypto/ccsha2/src/ccdigest_internal.h @@ -37,8 +37,6 @@ #include -void ccdigest_final_common(const struct ccdigest_info *di, - ccdigest_ctx_t ctx, void *digest); void ccdigest_final_64be(const struct ccdigest_info *di, ccdigest_ctx_t, unsigned char *digest); void ccdigest_final_64le(const struct ccdigest_info *di, ccdigest_ctx_t, diff --git a/osfmk/corecrypto/ccsha2/src/ccsha256_di.c b/osfmk/corecrypto/ccsha2/src/ccsha256_di.c index d31a9402c..2a61b8413 100644 --- a/osfmk/corecrypto/ccsha2/src/ccsha256_di.c +++ b/osfmk/corecrypto/ccsha2/src/ccsha256_di.c @@ -47,16 +47,20 @@ ccsha256_di(void) #if defined (__x86_64__) if (CC_HAS_AVX512_AND_IN_KERNEL()) { return &ccsha256_vng_intel_SupplementalSSE3_di; - } else { - return CC_HAS_AVX2() ? &ccsha256_vng_intel_AVX2_di : - ((CC_HAS_AVX1() ? &ccsha256_vng_intel_AVX1_di : - &ccsha256_vng_intel_SupplementalSSE3_di)); + } else +#if CC_ACCELERATECRYPTO + { return &ccsha256_vng_intel_di; // use AccelerateCrypto } +#else + { return CC_HAS_AVX2() ? &ccsha256_vng_intel_AVX2_di : + ((CC_HAS_AVX1() ? &ccsha256_vng_intel_AVX1_di : + &ccsha256_vng_intel_SupplementalSSE3_di)); } +#endif #else return &ccsha256_vng_intel_SupplementalSSE3_di; #endif -#elif CCSHA2_VNG_ARMV7NEON - return &ccsha256_vng_armv7neon_di; +#elif CCSHA2_VNG_ARM + return &ccsha256_vng_arm_di; #elif CCSHA256_ARMV6M_ASM return &ccsha256_v6m_di; #else diff --git a/osfmk/corecrypto/ccsha2/src/ccsha256_ltc_compress.c b/osfmk/corecrypto/ccsha2/src/ccsha256_ltc_compress.c index 6c84aa4d4..0b6ea34de 100644 --- a/osfmk/corecrypto/ccsha2/src/ccsha256_ltc_compress.c +++ b/osfmk/corecrypto/ccsha2/src/ccsha256_ltc_compress.c @@ -52,102 +52,98 @@ #if !CC_KERNEL || !CC_USE_ASM -// Various logical functions -#define Ch(x, y, z) (z ^ (x & (y ^ z))) -#define Maj(x, y, z) (((x | y) & z) | (x & y)) -#define S(x, n) ror((x),(n)) -#define R(x, n) ((x)>>(n)) - -#define Sigma0(x) (S(x, 2) ^ S(x, 13) ^ S(x, 22)) -#define Sigma1(x) (S(x, 6) ^ S(x, 11) ^ S(x, 25)) - -#define Gamma0(x) (S(x, 7) ^ S(x, 18) ^ R(x, 3)) -#define Gamma1(x) (S(x, 17) ^ S(x, 19) ^ R(x, 10)) - -//It is beter if the following macros are defined as inline functions, -//but I found some compilers do not inline them. -#ifdef __CC_ARM - #define ror(val, shift) __ror(val,shift) +#if CCSHA2_SHA256_USE_SHA512_K +#define K(i) ((uint32_t)(ccsha512_K[i] >> 32)) #else - #define ror(val, shift) ((val >> shift) | (val << (32 - shift))) +#define K(i) ccsha256_K[i] #endif -#ifdef __CC_ARM - #define byte_swap32(x) __rev(x) -#elif defined(__clang__) && !defined(_MSC_VER) - #define byte_swap32(x) __builtin_bswap32(x); -#else - #define byte_swap32(x) ((ror(x, 8) & 0xff00ff00) | (ror(x, 24) & 0x00ff00ff)) -#endif +// Various logical functions +#define Ch(x, y, z) (z ^ (x & (y ^ z))) +#define Maj(x, y, z) (((x | y) & z) | (x & y)) +#define S(x, n) CC_RORc(x, n) +#define R(x, n) ((x) >> (n)) +#define Sigma0(x) (S(x, 2) ^ S(x, 13) ^ S(x, 22)) +#define Sigma1(x) (S(x, 6) ^ S(x, 11) ^ S(x, 25)) +#define Gamma0(x) (S(x, 7) ^ S(x, 18) ^ R(x, 3)) +#define Gamma1(x) (S(x, 17) ^ S(x, 19) ^ R(x, 10)) -#if CC_HANDLE_UNALIGNED_DATA - #define set_W(i) CC_LOAD32_BE(W[i], buf + (4*(i))) -#else - #define set_W(i) W[i] = byte_swap32(buf[i]) -#endif +#define set_W(i) CC_LOAD32_BE(W[i], buf + (4 * (i))) // the round function -#define RND(a, b, c, d, e, f, g, h, i) \ - t0 = h + Sigma1(e) + Ch(e, f, g) + ccsha256_K[i] + W[i]; \ - t1 = Sigma0(a) + Maj(a, b, c); \ - d += t0; \ - h = t0 + t1; +#define RND(a, b, c, d, e, f, g, h, i) \ + t0 = h + Sigma1(e) + Ch(e, f, g) + K(i) + W[i]; \ + t1 = Sigma0(a) + Maj(a, b, c); \ + d += t0; \ + h = t0 + t1; // compress 512-bits void ccsha256_ltc_compress(ccdigest_state_t state, size_t nblocks, const void *in) { uint32_t W[64], t0, t1; - uint32_t S0, S1, S2, S3, S4, S5, S6, S7; + uint32_t S[8]; int i; uint32_t *s = ccdigest_u32(state); -#if CC_HANDLE_UNALIGNED_DATA const unsigned char *buf = in; -#else - const uint32_t *buf = in; -#endif while (nblocks--) { // schedule W 0..15 - set_W(0); set_W(1); set_W(2); set_W(3); set_W(4); set_W(5); set_W(6); set_W(7); - set_W(8); set_W(9); set_W(10); set_W(11); set_W(12); set_W(13); set_W(14); set_W(15); + for (i = 0; i < 16; i += 1) { + set_W(i); + } // schedule W 16..63 - for (i = 16; i < 64; i++) { + for (; i < 64; i++) { W[i] = Gamma1(W[i - 2]) + W[i - 7] + Gamma0(W[i - 15]) + W[i - 16]; } // copy state into S - S0 = s[0]; - S1 = s[1]; - S2 = s[2]; - S3 = s[3]; - S4 = s[4]; - S5 = s[5]; - S6 = s[6]; - S7 = s[7]; + S[0] = s[0]; + S[1] = s[1]; + S[2] = s[2]; + S[3] = s[3]; + S[4] = s[4]; + S[5] = s[5]; + S[6] = s[6]; + S[7] = s[7]; // Compress +#if CC_SMALL_CODE + for (i = 0; i < 64; i += 1) { + t0 = S[7] + Sigma1(S[4]) + Ch(S[4], S[5], S[6]) + K(i) + W[i]; + t1 = Sigma0(S[0]) + Maj(S[0], S[1], S[2]); + S[7] = S[6]; + S[6] = S[5]; + S[5] = S[4]; + S[4] = S[3] + t0; + S[3] = S[2]; + S[2] = S[1]; + S[1] = S[0]; + S[0] = t0 + t1; + } +#else for (i = 0; i < 64; i += 8) { - RND(S0, S1, S2, S3, S4, S5, S6, S7, i + 0); - RND(S7, S0, S1, S2, S3, S4, S5, S6, i + 1); - RND(S6, S7, S0, S1, S2, S3, S4, S5, i + 2); - RND(S5, S6, S7, S0, S1, S2, S3, S4, i + 3); - RND(S4, S5, S6, S7, S0, S1, S2, S3, i + 4); - RND(S3, S4, S5, S6, S7, S0, S1, S2, i + 5); - RND(S2, S3, S4, S5, S6, S7, S0, S1, i + 6); - RND(S1, S2, S3, S4, S5, S6, S7, S0, i + 7); + RND(S[0], S[1], S[2], S[3], S[4], S[5], S[6], S[7], i + 0); + RND(S[7], S[0], S[1], S[2], S[3], S[4], S[5], S[6], i + 1); + RND(S[6], S[7], S[0], S[1], S[2], S[3], S[4], S[5], i + 2); + RND(S[5], S[6], S[7], S[0], S[1], S[2], S[3], S[4], i + 3); + RND(S[4], S[5], S[6], S[7], S[0], S[1], S[2], S[3], i + 4); + RND(S[3], S[4], S[5], S[6], S[7], S[0], S[1], S[2], i + 5); + RND(S[2], S[3], S[4], S[5], S[6], S[7], S[0], S[1], i + 6); + RND(S[1], S[2], S[3], S[4], S[5], S[6], S[7], S[0], i + 7); } +#endif // feedback - s[0] += S0; - s[1] += S1; - s[2] += S2; - s[3] += S3; - s[4] += S4; - s[5] += S5; - s[6] += S6; - s[7] += S7; + s[0] += S[0]; + s[1] += S[1]; + s[2] += S[2]; + s[3] += S[3]; + s[4] += S[4]; + s[5] += S[5]; + s[6] += S[6]; + s[7] += S[7]; buf += CCSHA256_BLOCK_SIZE / sizeof(buf[0]); } diff --git a/osfmk/corecrypto/ccsha2/src/ccsha2_internal.h b/osfmk/corecrypto/ccsha2/src/ccsha2_internal.h index 7bf64bc04..5a174ab68 100644 --- a/osfmk/corecrypto/ccsha2/src/ccsha2_internal.h +++ b/osfmk/corecrypto/ccsha2/src/ccsha2_internal.h @@ -37,6 +37,12 @@ #include +#ifndef CCSHA2_DISABLE_SHA512 +#define CCSHA2_DISABLE_SHA512 0 +#endif + +#define CCSHA2_SHA256_USE_SHA512_K (CC_SMALL_CODE && !CCSHA2_DISABLE_SHA512) + extern const struct ccdigest_info ccsha256_v6m_di; void ccsha256_v6m_compress(ccdigest_state_t state, size_t nblocks, const void *buf); @@ -45,12 +51,20 @@ void ccsha512_ltc_compress(ccdigest_state_t state, size_t nblocks, const void *i #if CCSHA2_VNG_INTEL #if defined __x86_64__ -void ccsha256_vng_intel_avx2_compress(ccdigest_state_t state, size_t nblocks, const void *in); -void ccsha256_vng_intel_avx1_compress(ccdigest_state_t state, size_t nblocks, const void *in); -void ccsha256_vng_intel_ssse3_compress(ccdigest_state_t state, size_t nblocks, const void *in); -void ccsha512_vng_intel_avx2_compress(ccdigest_state_t state, size_t nblocks, const void *in); -void ccsha512_vng_intel_avx1_compress(ccdigest_state_t state, size_t nblocks, const void *in); -void ccsha512_vng_intel_ssse3_compress(ccdigest_state_t state, size_t nblocks, const void *in); +void ccsha256_vng_intel_avx2_compress(ccdigest_state_t state, size_t nblocks, const void *in) __asm__("_ccsha256_vng_intel_avx2_compress"); +void ccsha256_vng_intel_avx1_compress(ccdigest_state_t state, size_t nblocks, const void *in) __asm__("_ccsha256_vng_intel_avx1_compress"); +void ccsha256_vng_intel_ssse3_compress(ccdigest_state_t state, size_t nblocks, const void *in) __asm__("_ccsha256_vng_intel_sse3_compress"); +void ccsha512_vng_intel_avx2_compress(ccdigest_state_t state, size_t nblocks, const void *in) __asm__("_ccsha512_vng_intel_avx2_compress"); +void ccsha512_vng_intel_avx1_compress(ccdigest_state_t state, size_t nblocks, const void *in) __asm__("_ccsha512_vng_intel_avx1_compress"); +void ccsha512_vng_intel_ssse3_compress(ccdigest_state_t state, size_t nblocks, const void *in) __asm__("_ccsha512_vng_intel_ssse3_compress"); + +#if CC_ACCELERATECRYPTO +// AccelerateCrypto +extern const struct ccdigest_info ccsha224_vng_intel_di; +extern const struct ccdigest_info ccsha256_vng_intel_di; +extern const struct ccdigest_info ccsha384_vng_intel_di; +extern const struct ccdigest_info ccsha512_vng_intel_di; +#endif extern const struct ccdigest_info ccsha224_vng_intel_AVX2_di; extern const struct ccdigest_info ccsha224_vng_intel_AVX1_di; @@ -63,14 +77,7 @@ extern const struct ccdigest_info ccsha512_vng_intel_AVX2_di; extern const struct ccdigest_info ccsha512_vng_intel_AVX1_di; extern const struct ccdigest_info ccsha512_vng_intel_SupplementalSSE3_di; #endif -void ccsha256_vng_intel_sse3_compress(ccdigest_state_t state, size_t nblocks, const void *in); -#endif - -#if CCSHA2_VNG_ARMV7NEON -extern const struct ccdigest_info ccsha384_vng_arm64_di; -extern const struct ccdigest_info ccsha384_vng_armv7neon_di; -extern const struct ccdigest_info ccsha512_vng_arm64_di; -extern const struct ccdigest_info ccsha512_vng_armv7neon_di; +void ccsha256_vng_intel_sse3_compress(ccdigest_state_t state, size_t nblocks, const void *in) __asm__("_ccsha256_vng_intel_sse3_compress"); #endif extern const uint32_t ccsha256_K[64]; diff --git a/osfmk/corpses/corpse.c b/osfmk/corpses/corpse.c index 73448a1f5..040f331ce 100644 --- a/osfmk/corpses/corpse.c +++ b/osfmk/corpses/corpse.c @@ -407,7 +407,7 @@ task_generate_corpse( ipc_port_t corpse_port; ipc_port_t old_notify; - if (task == kernel_task || task == TASK_NULL || task == current_task()) { + if (task == kernel_task || task == TASK_NULL) { return KERN_INVALID_ARGUMENT; } @@ -446,7 +446,7 @@ task_generate_corpse( assert(IP_NULL != corpse_port); ip_lock(corpse_port); - assert(ip_active(corpse_port)); + require_ip_active(corpse_port); ipc_port_nsrequest(corpse_port, corpse_port->ip_mscount, ipc_port_make_sonce_locked(corpse_port), &old_notify); /* port unlocked */ @@ -579,6 +579,7 @@ task_generate_corpse_internal( is_64bit_data, t_flags, TPF_NONE, + TWF_NONE, &new_task); if (kr != KERN_SUCCESS) { goto error_task_generate_corpse; diff --git a/osfmk/corpses/task_corpse.h b/osfmk/corpses/task_corpse.h index a264a60a9..c51a3bbd0 100644 --- a/osfmk/corpses/task_corpse.h +++ b/osfmk/corpses/task_corpse.h @@ -75,7 +75,7 @@ extern kern_return_t task_crashinfo_destroy(kcdata_descriptor_t data); extern void corpses_init(void); -extern unsigned long total_corpses_count(void); +extern unsigned long total_corpses_count(void) __attribute__((pure)); extern boolean_t corpses_enabled(void); extern kern_return_t task_generate_corpse_internal( diff --git a/osfmk/default_pager/Makefile b/osfmk/default_pager/Makefile index f7cad85e2..1a6d194df 100644 --- a/osfmk/default_pager/Makefile +++ b/osfmk/default_pager/Makefile @@ -39,7 +39,7 @@ ${MIGINCLUDES} : ${MIG_TYPES} ${MIG_UUHDRS} : \ %.h : %.defs - @echo "$(ColorM)MIG$(Color0) $(ColorF)$@$(Color0)" + $(call makelog,$(ColorM)MIG$(Color0) $(ColorF)$@$(Color0)) $(_v)$(MIG) $(MIGFLAGS) \ -server /dev/null \ -user /dev/null \ @@ -48,7 +48,7 @@ ${MIG_UUHDRS} : \ ${MIG_USHDRS} : \ %_server.h : %.defs - @echo "$(ColorM)MIG$(Color0) $(ColorF)$@$(Color0)" + $(call makelog,$(ColorM)MIG$(Color0) $(ColorF)$@$(Color0)) $(_v)$(MIG) $(MIGFLAGS) \ -server /dev/null \ -user /dev/null \ @@ -99,7 +99,7 @@ ${COMP_FILES} : ${MIG_TYPES} ${MIG_KUSRC} : \ %_user.c : %.defs - @echo "$(ColorM)MIG$(Color0) $(ColorF)$@$(Color0)" + $(call makelog,$(ColorM)MIG$(Color0) $(ColorF)$@$(Color0)) $(_v)${MIG} ${MIGFLAGS} ${MIGKUFLAGS} \ -user $*_user.c \ -header $*.h \ @@ -109,7 +109,7 @@ ${MIG_KUSRC} : \ ${MIG_KSSRC}: \ %_server.c : %.defs - @echo "$(ColorM)MIG$(Color0) $(ColorF)$@$(Color0)" + $(call makelog,$(ColorM)MIG$(Color0) $(ColorF)$@$(Color0)) $(_v)${MIG} ${MIGFLAGS} ${MIGKSFLAGS} \ -user /dev/null \ -header /dev/null \ diff --git a/osfmk/device/Makefile b/osfmk/device/Makefile index c6e070a05..2f23d1231 100644 --- a/osfmk/device/Makefile +++ b/osfmk/device/Makefile @@ -45,7 +45,7 @@ COMP_FILES = ${DEVICE_FILES} do_build_all:: $(COMP_FILES) ${DEVICE_FILES}: device.defs - @echo "$(ColorM)MIG$(Color0) $(ColorF)$@$(Color0)" + $(call makelog,$(ColorM)MIG$(Color0) $(ColorF)$@$(Color0)) $(_v)${MIG} ${MIGFLAGS} ${MIGKSFLAGS} \ -header /dev/null \ -user /dev/null \ diff --git a/osfmk/device/device_init.c b/osfmk/device/device_init.c index c0a92c6e7..8b1fedd41 100644 --- a/osfmk/device/device_init.c +++ b/osfmk/device/device_init.c @@ -90,12 +90,10 @@ lck_mtx_t iokit_obj_to_port_binding_lock; void device_service_create(void) { - master_device_port = ipc_port_alloc_kernel(); - if (master_device_port == IP_NULL) { - panic("can't allocate master device port"); - } + master_device_port = ipc_kobject_alloc_port( + (ipc_kobject_t)&master_device_kobject, IKOT_MASTER_DEVICE, + IPC_KOBJECT_ALLOC_NONE); - ipc_kobject_set(master_device_port, (ipc_kobject_t)&master_device_kobject, IKOT_MASTER_DEVICE); kernel_set_special_port(host_priv_self(), HOST_IO_MASTER_PORT, ipc_port_make_send(master_device_port)); diff --git a/osfmk/device/device_port.h b/osfmk/device/device_port.h index 625eabe0a..e73e85c50 100644 --- a/osfmk/device/device_port.h +++ b/osfmk/device/device_port.h @@ -74,8 +74,21 @@ extern mach_port_t master_device_port; #define DEVICE_PAGER_COHERENT 0x2 #define DEVICE_PAGER_CACHE_INHIB 0x4 #define DEVICE_PAGER_WRITE_THROUGH 0x8 -#define DEVICE_PAGER_EARLY_ACK 0x20 #define DEVICE_PAGER_CONTIGUOUS 0x100 #define DEVICE_PAGER_NOPHYSCACHE 0x200 +#ifdef XNU_KERNEL_PRIVATE +#include + +_Static_assert(((DEVICE_PAGER_CONTIGUOUS | DEVICE_PAGER_NOPHYSCACHE) & VM_WIMG_MASK) == 0, + "device pager flags overlap WIMG mask"); + +/* Assert on the backwards-compatible DEVICE_PAGER* values */ +_Static_assert(DEVICE_PAGER_GUARDED == VM_MEM_GUARDED, "DEVICE_PAGER_GUARDED != VM_MEM_GUARDED"); +_Static_assert(DEVICE_PAGER_COHERENT == VM_MEM_COHERENT, "DEVICE_PAGER_COHERENT != VM_MEM_COHERENT"); +_Static_assert(DEVICE_PAGER_CACHE_INHIB == VM_MEM_NOT_CACHEABLE, "DEVICE_PAGER_CACHE_INHIB != VM_MEM_NOT_CACHEABLE"); +_Static_assert(DEVICE_PAGER_WRITE_THROUGH == VM_MEM_WRITE_THROUGH, "DEVICE_PAGER_WRITE_THROUGH != VM_MEM_WRITE_THROUGH"); + +#endif /* KERNEL_PRIVATE */ + #endif /* _DEVICE_DEVICE_PORT_H_ */ diff --git a/osfmk/device/device_types.h b/osfmk/device/device_types.h index 28a649ce8..f1cc26e1e 100644 --- a/osfmk/device/device_types.h +++ b/osfmk/device/device_types.h @@ -72,7 +72,7 @@ #include #if PRIVATE -#define IOKIT_SERVER_VERSION 20150715 +#define IOKIT_SERVER_VERSION 20190423 #endif @@ -121,12 +121,14 @@ typedef uint64_t io_async_ref64_t[8]; typedef struct IOObject * io_object_t; typedef io_object_t io_connect_t; +typedef io_object_t uext_object_t; extern void iokit_remove_reference( io_object_t obj ); extern void iokit_remove_connect_reference( io_object_t obj ); extern io_object_t iokit_lookup_object_port( ipc_port_t port ); extern io_connect_t iokit_lookup_connect_port( ipc_port_t port ); +extern uext_object_t iokit_lookup_uext_object_port( ipc_port_t port ); extern ipc_port_t iokit_make_object_port( io_object_t obj ); extern ipc_port_t iokit_make_connect_port( io_connect_t obj ); diff --git a/osfmk/device/iokit_rpc.c b/osfmk/device/iokit_rpc.c index 487eee336..15866381e 100644 --- a/osfmk/device/iokit_rpc.c +++ b/osfmk/device/iokit_rpc.c @@ -75,7 +75,7 @@ * Lookup a device by its port. * Doesn't consume the naked send right; produces a device reference. */ -static io_object_t +io_object_t iokit_lookup_io_object(ipc_port_t port, ipc_kobject_type_t type) { io_object_t obj; @@ -111,6 +111,13 @@ iokit_lookup_connect_port( return iokit_lookup_io_object(port, IKOT_IOKIT_CONNECT); } +MIGEXTERN io_object_t +iokit_lookup_uext_object_port( + ipc_port_t port) +{ + return iokit_lookup_io_object(port, IKOT_UEXT_OBJECT); +} + static io_object_t iokit_lookup_object_in_space_with_port_name(mach_port_name_t name, ipc_kobject_type_t type, ipc_space_t space) { @@ -120,16 +127,16 @@ iokit_lookup_object_in_space_with_port_name(mach_port_name_t name, ipc_kobject_t ipc_port_t port; kern_return_t kr; - kr = ipc_object_translate(space, name, MACH_PORT_RIGHT_SEND, (ipc_object_t *)&port); + kr = ipc_port_translate_send(space, name, &port); if (kr == KERN_SUCCESS) { assert(IP_VALID(port)); - + require_ip_active(port); ip_reference(port); ip_unlock(port); iokit_lock_port(port); - if (ip_active(port) && (ip_kotype(port) == type)) { + if (ip_kotype(port) == type) { obj = (io_object_t) port->ip_kobject; iokit_add_reference(obj, type); } @@ -154,6 +161,12 @@ iokit_lookup_connect_ref_current_task(mach_port_name_t name) return iokit_lookup_object_in_space_with_port_name(name, IKOT_IOKIT_CONNECT, current_space()); } +EXTERN io_object_t +iokit_lookup_uext_ref_current_task(mach_port_name_t name) +{ + return iokit_lookup_object_in_space_with_port_name(name, IKOT_UEXT_OBJECT, current_space()); +} + EXTERN void iokit_retain_port( ipc_port_t port ) { @@ -233,32 +246,15 @@ int gIOKitPortCount; EXTERN ipc_port_t iokit_alloc_object_port( io_object_t obj, ipc_kobject_type_t type ) { - ipc_port_t notify; - ipc_port_t port; - - do { - /* Allocate port, keeping a reference for it. */ - port = ipc_port_alloc_kernel(); - if (port == IP_NULL) { - continue; - } - - /* set kobject & type */ - ipc_kobject_set( port, (ipc_kobject_t) obj, type); - - /* Request no-senders notifications on the port. */ - ip_lock( port); - notify = ipc_port_make_sonce_locked( port); - ipc_port_nsrequest( port, 1, notify, ¬ify); - /* port unlocked */ - assert( notify == IP_NULL); - gIOKitPortCount++; - } while (FALSE); - - return port; + /* Allocate port, keeping a reference for it. */ + gIOKitPortCount++; + ipc_kobject_alloc_options_t options = IPC_KOBJECT_ALLOC_NSREQUEST; + if (type == IKOT_IOKIT_CONNECT) { + options |= IPC_KOBJECT_ALLOC_IMMOVABLE_SEND; + } + return ipc_kobject_alloc_port((ipc_kobject_t) obj, type, options); } - EXTERN kern_return_t iokit_destroy_object_port( ipc_port_t port ) { @@ -304,8 +300,12 @@ iokit_make_send_right( task_t task, io_object_t obj, ipc_kobject_type_t type ) if (IP_VALID( sendPort )) { kern_return_t kr; - kr = ipc_object_copyout( task->itk_space, (ipc_object_t) sendPort, - MACH_MSG_TYPE_PORT_SEND, TRUE, &name); + // Remove once is fixed. + // We need to make ith_knote NULL as ipc_object_copyout() uses + // thread-argument-passing and its value should not be garbage + current_thread()->ith_knote = ITH_KNOTE_NULL; + kr = ipc_object_copyout( task->itk_space, ip_to_object(sendPort), + MACH_MSG_TYPE_PORT_SEND, NULL, NULL, &name); if (kr != KERN_SUCCESS) { ipc_port_release_send( sendPort ); name = MACH_PORT_NULL; @@ -339,7 +339,7 @@ iokit_no_senders( mach_no_senders_notification_t * notification ) ipc_kobject_type_t type = IKOT_NONE; ipc_port_t notify; - port = (ipc_port_t) notification->not_header.msgh_remote_port; + port = notification->not_header.msgh_remote_port; // convert a port to io_object_t. if (IP_VALID(port)) { @@ -349,7 +349,8 @@ iokit_no_senders( mach_no_senders_notification_t * notification ) type = ip_kotype( port ); if ((IKOT_IOKIT_OBJECT == type) || (IKOT_IOKIT_CONNECT == type) - || (IKOT_IOKIT_IDENT == type)) { + || (IKOT_IOKIT_IDENT == type) + || (IKOT_UEXT_OBJECT == type)) { iokit_add_reference( obj, IKOT_IOKIT_OBJECT ); } else { obj = NULL; @@ -449,6 +450,10 @@ IOMapPages(vm_map_t map, mach_vm_address_t va, mach_vm_address_t pa, case kIOMapPostedWrite: flags = VM_WIMG_POSTED; break; + + case kIOMapRealTimeCache: + flags = VM_WIMG_RT; + break; } pmap_set_cache_attributes(pagenum, flags); @@ -513,6 +518,10 @@ IOProtectCacheMode(vm_map_t __unused map, mach_vm_address_t __unused va, case kIOMapPostedWrite: flags = VM_WIMG_POSTED; break; + + case kIOMapRealTimeCache: + flags = VM_WIMG_RT; + break; } pmap_flush_context_init(&pmap_flush_context_storage); diff --git a/osfmk/gssd/Makefile b/osfmk/gssd/Makefile index 83666b479..3f42fac82 100644 --- a/osfmk/gssd/Makefile +++ b/osfmk/gssd/Makefile @@ -41,7 +41,7 @@ COMP_FILES = ${MIG_KUSRC} do_build_all:: $(COMP_FILES) ${MIG_KUSRC} : gssd_mach.defs - @echo "$(ColorM)MIG$(Color0) $(ColorF)$@$(Color0)" + $(call makelog,$(ColorM)MIG$(Color0) $(ColorF)$@$(Color0)) $(_v)${MIG} ${MIGFLAGS} ${MIGKUFLAGS} \ -user gssd_mach.c \ -header gssd_mach.h \ diff --git a/osfmk/i386/AT386/model_dep.c b/osfmk/i386/AT386/model_dep.c index 19781ce3b..f6b970194 100644 --- a/osfmk/i386/AT386/model_dep.c +++ b/osfmk/i386/AT386/model_dep.c @@ -114,6 +114,7 @@ #include #include +#include #include #include @@ -198,11 +199,10 @@ extern void kdp_snapshot_preflight(int pid, void *tracebuf, boolean_t enable_faulting); extern int kdp_stack_snapshot_bytes_traced(void); -#if DEVELOPMENT || DEBUG vm_offset_t panic_stackshot_buf = 0; -size_t panic_stackshot_len = 0; -#endif +size_t panic_stackshot_buf_len = 0; +size_t panic_stackshot_len = 0; /* * Backtrace a single frame. */ @@ -765,7 +765,8 @@ uint64_t panic_restart_timeout = ~(0ULL); void RecordPanicStackshot() { - int err = 0, bytes_traced = 0, bytes_used = 0, bytes_remaining = 0; + int err = 0; + size_t bytes_traced = 0, bytes_used = 0, bytes_remaining = 0; char *stackshot_begin_loc = NULL; /* Don't re-enter this code if we panic here */ @@ -786,136 +787,125 @@ RecordPanicStackshot() return; } - /* - * Try to capture an in memory panic_stackshot (enabled during boot - * on systems with co-processors). - */ - if (extended_debug_log_enabled) { - if (stackshot_active()) { - panic_info->mph_panic_flags |= MACOS_PANIC_HEADER_FLAG_STACKSHOT_FAILED_NESTED; - panic_info->mph_other_log_offset = PE_get_offset_into_panic_region(debug_buf_ptr); - kdb_printf("Panicked during stackshot, skipping panic stackshot\n"); - return; - } else { - stackshot_begin_loc = debug_buf_ptr; - - bytes_remaining = debug_buf_size - (unsigned int)((uintptr_t)stackshot_begin_loc - (uintptr_t)debug_buf_base); - err = kcdata_memory_static_init(&kc_panic_data, (mach_vm_address_t)stackshot_begin_loc, - KCDATA_BUFFER_BEGIN_STACKSHOT, bytes_remaining, KCFLAG_USE_MEMCOPY); - if (err != KERN_SUCCESS) { - panic_info->mph_panic_flags |= MACOS_PANIC_HEADER_FLAG_STACKSHOT_FAILED_ERROR; - panic_info->mph_other_log_offset = PE_get_offset_into_panic_region(debug_buf_ptr); - kdb_printf("Failed to initialize kcdata buffer for in-memory panic stackshot, skipping ...\n"); - return; - } - - kdp_snapshot_preflight(-1, (void *) stackshot_begin_loc, bytes_remaining, - (STACKSHOT_SAVE_KEXT_LOADINFO | STACKSHOT_SAVE_LOADINFO | STACKSHOT_KCDATA_FORMAT | - STACKSHOT_ENABLE_BT_FAULTING | STACKSHOT_ENABLE_UUID_FAULTING | STACKSHOT_FROM_PANIC | - STACKSHOT_NO_IO_STATS | STACKSHOT_THREAD_WAITINFO), &kc_panic_data, 0); - err = do_stackshot(NULL); - bytes_traced = (int) kdp_stack_snapshot_bytes_traced(); - bytes_used = (int) kcdata_memory_get_used_bytes(&kc_panic_data); - - if ((err != KERN_SUCCESS) && (bytes_used > 0)) { - /* - * We ran out of space while trying to capture a stackshot, try again without user frames. - * It's not safe to log from here, but append a flag to the panic flags. - */ - panic_info->mph_panic_flags |= MACOS_PANIC_HEADER_FLAG_STACKSHOT_KERNEL_ONLY; - panic_stackshot_reset_state(); - - /* Erase the stackshot data (this region is pre-populated with the NULL character) */ - memset(stackshot_begin_loc, '\0', bytes_used); - - err = kcdata_memory_static_init(&kc_panic_data, (mach_vm_address_t)stackshot_begin_loc, - KCDATA_BUFFER_BEGIN_STACKSHOT, bytes_remaining, KCFLAG_USE_MEMCOPY); - if (err != KERN_SUCCESS) { - panic_info->mph_panic_flags |= MACOS_PANIC_HEADER_FLAG_STACKSHOT_FAILED_ERROR; - panic_info->mph_other_log_offset = PE_get_offset_into_panic_region(debug_buf_ptr); - kdb_printf("Failed to re-initialize kcdata buffer for kernel only in-memory panic stackshot, skipping ...\n"); - return; - } - - kdp_snapshot_preflight(-1, (void *) stackshot_begin_loc, bytes_remaining, (STACKSHOT_KCDATA_FORMAT | - STACKSHOT_NO_IO_STATS | STACKSHOT_SAVE_KEXT_LOADINFO | STACKSHOT_ACTIVE_KERNEL_THREADS_ONLY | - STACKSHOT_FROM_PANIC | STACKSHOT_THREAD_WAITINFO), &kc_panic_data, 0); - err = do_stackshot(NULL); - bytes_traced = (int) kdp_stack_snapshot_bytes_traced(); - bytes_used = (int) kcdata_memory_get_used_bytes(&kc_panic_data); - } - - if (err == KERN_SUCCESS) { - debug_buf_ptr += bytes_traced; - panic_info->mph_panic_flags |= MACOS_PANIC_HEADER_FLAG_STACKSHOT_SUCCEEDED; - panic_info->mph_stackshot_offset = PE_get_offset_into_panic_region(stackshot_begin_loc); - panic_info->mph_stackshot_len = bytes_traced; - - panic_info->mph_other_log_offset = PE_get_offset_into_panic_region(debug_buf_ptr); - kdb_printf("\n** In Memory Panic Stackshot Succeeded ** Bytes Traced %d **\n", bytes_traced); - } else { - if (bytes_used > 0) { - /* Erase the stackshot data (this region is pre-populated with the NULL character) */ - memset(stackshot_begin_loc, '\0', bytes_used); - panic_info->mph_panic_flags |= MACOS_PANIC_HEADER_FLAG_STACKSHOT_FAILED_INCOMPLETE; - - panic_info->mph_other_log_offset = PE_get_offset_into_panic_region(debug_buf_ptr); - kdb_printf("\n** In Memory Panic Stackshot Incomplete ** Bytes Filled %d ** Err %d\n", bytes_used, err); - } else { - bzero(stackshot_begin_loc, bytes_used); - panic_info->mph_panic_flags |= MACOS_PANIC_HEADER_FLAG_STACKSHOT_FAILED_ERROR; - - panic_info->mph_other_log_offset = PE_get_offset_into_panic_region(debug_buf_ptr); - kdb_printf("\n** In Memory Panic Stackshot Failed ** Bytes Traced %d, err %d\n", bytes_traced, err); - } - } - } -#if DEVELOPMENT || DEBUG - if (panic_stackshot_buf != 0) { - /* We're going to try to take another stackshot, reset the state. */ - panic_stackshot_reset_state(); - } -#endif /* DEVELOPMENT || DEBUG */ - } else { + if (stackshot_active()) { + panic_info->mph_panic_flags |= MACOS_PANIC_HEADER_FLAG_STACKSHOT_FAILED_NESTED; panic_info->mph_other_log_offset = PE_get_offset_into_panic_region(debug_buf_ptr); - } - -#if DEVELOPMENT || DEBUG - - if (panic_stackshot_buf == 0) { - kdb_printf("No stackshot buffer allocated for file backed panic stackshot, skipping...\n"); + kdb_printf("Panicked during stackshot, skipping panic stackshot\n"); return; } - if (stackshot_active()) { - kdb_printf("Panicked during stackshot, skipping file backed panic stackshot\n"); + /* Try to capture an in memory panic_stackshot */ + if (extended_debug_log_enabled) { + /* On coprocessor systems we write this into the extended debug log */ + stackshot_begin_loc = debug_buf_ptr; + bytes_remaining = debug_buf_size - (unsigned int)((uintptr_t)stackshot_begin_loc - (uintptr_t)debug_buf_base); + } else if (panic_stackshot_buf != 0) { + /* On other systems we use the panic stackshot_buf */ + stackshot_begin_loc = (char *) panic_stackshot_buf; + bytes_remaining = panic_stackshot_buf_len; + } else { + panic_info->mph_other_log_offset = PE_get_offset_into_panic_region(debug_buf_ptr); return; } - err = kcdata_memory_static_init(&kc_panic_data, (mach_vm_address_t)panic_stackshot_buf, KCDATA_BUFFER_BEGIN_STACKSHOT, - PANIC_STACKSHOT_BUFSIZE, KCFLAG_USE_MEMCOPY); + + err = kcdata_memory_static_init(&kc_panic_data, (mach_vm_address_t)stackshot_begin_loc, + KCDATA_BUFFER_BEGIN_STACKSHOT, (unsigned int) bytes_remaining, KCFLAG_USE_MEMCOPY); if (err != KERN_SUCCESS) { - kdb_printf("Failed to initialize kcdata buffer for file backed panic stackshot, skipping ...\n"); + panic_info->mph_panic_flags |= MACOS_PANIC_HEADER_FLAG_STACKSHOT_FAILED_ERROR; + panic_info->mph_other_log_offset = PE_get_offset_into_panic_region(debug_buf_ptr); + kdb_printf("Failed to initialize kcdata buffer for in-memory panic stackshot, skipping ...\n"); return; } - kdp_snapshot_preflight(-1, (void *) panic_stackshot_buf, PANIC_STACKSHOT_BUFSIZE, (STACKSHOT_GET_GLOBAL_MEM_STATS | STACKSHOT_SAVE_LOADINFO | STACKSHOT_KCDATA_FORMAT | - STACKSHOT_ENABLE_BT_FAULTING | STACKSHOT_ENABLE_UUID_FAULTING | STACKSHOT_FROM_PANIC | STACKSHOT_NO_IO_STATS - | STACKSHOT_THREAD_WAITINFO), &kc_panic_data, 0); + uint32_t stackshot_flags = (STACKSHOT_SAVE_KEXT_LOADINFO | STACKSHOT_SAVE_LOADINFO | STACKSHOT_KCDATA_FORMAT | + STACKSHOT_ENABLE_BT_FAULTING | STACKSHOT_ENABLE_UUID_FAULTING | STACKSHOT_FROM_PANIC | + STACKSHOT_NO_IO_STATS | STACKSHOT_THREAD_WAITINFO); +#if DEVELOPMENT + /* + * Include the shared cache layout in panic stackshots on DEVELOPMENT kernels so that we can symbolicate + * panic stackshots from corefiles. + */ + stackshot_flags |= STACKSHOT_COLLECT_SHAREDCACHE_LAYOUT; +#endif + + kdp_snapshot_preflight(-1, (void *) stackshot_begin_loc, (uint32_t) bytes_remaining, stackshot_flags, &kc_panic_data, 0); err = do_stackshot(NULL); bytes_traced = (int) kdp_stack_snapshot_bytes_traced(); - if (bytes_traced > 0 && !err) { + bytes_used = (int) kcdata_memory_get_used_bytes(&kc_panic_data); + + if ((err != KERN_SUCCESS) && (bytes_used > 0)) { + /* + * We ran out of space while trying to capture a stackshot, try again without user frames. + * It's not safe to log from here (in case we're writing in the middle of the debug buffer on coprocessor systems) + * but append a flag to the panic flags. + */ + panic_info->mph_panic_flags |= MACOS_PANIC_HEADER_FLAG_STACKSHOT_KERNEL_ONLY; + panic_stackshot_reset_state(); + + /* Erase the stackshot data (this region is pre-populated with the NULL character) */ + memset(stackshot_begin_loc, '\0', bytes_used); + + err = kcdata_memory_static_init(&kc_panic_data, (mach_vm_address_t)stackshot_begin_loc, + KCDATA_BUFFER_BEGIN_STACKSHOT, (unsigned int) bytes_remaining, KCFLAG_USE_MEMCOPY); + if (err != KERN_SUCCESS) { + panic_info->mph_panic_flags |= MACOS_PANIC_HEADER_FLAG_STACKSHOT_FAILED_ERROR; + panic_info->mph_other_log_offset = PE_get_offset_into_panic_region(debug_buf_ptr); + kdb_printf("Failed to re-initialize kcdata buffer for kernel only in-memory panic stackshot, skipping ...\n"); + return; + } + + stackshot_flags = (STACKSHOT_SAVE_KEXT_LOADINFO | STACKSHOT_KCDATA_FORMAT | STACKSHOT_FROM_PANIC | + STACKSHOT_NO_IO_STATS | STACKSHOT_THREAD_WAITINFO | STACKSHOT_ACTIVE_KERNEL_THREADS_ONLY); +#if DEVELOPMENT + /* + * Include the shared cache layout in panic stackshots on DEVELOPMENT kernels so that we can symbolicate + * panic stackshots from corefiles. + */ + stackshot_flags |= STACKSHOT_COLLECT_SHAREDCACHE_LAYOUT; +#endif + + kdp_snapshot_preflight(-1, (void *) stackshot_begin_loc, (uint32_t) bytes_remaining, stackshot_flags, &kc_panic_data, 0); + err = do_stackshot(NULL); + bytes_traced = (int) kdp_stack_snapshot_bytes_traced(); + bytes_used = (int) kcdata_memory_get_used_bytes(&kc_panic_data); + } + + if (err == KERN_SUCCESS) { + if (extended_debug_log_enabled) { + debug_buf_ptr += bytes_traced; + } + panic_info->mph_panic_flags |= MACOS_PANIC_HEADER_FLAG_STACKSHOT_SUCCEEDED; + panic_info->mph_stackshot_offset = PE_get_offset_into_panic_region(stackshot_begin_loc); + panic_info->mph_stackshot_len = (uint32_t) bytes_traced; + + panic_info->mph_other_log_offset = PE_get_offset_into_panic_region(debug_buf_ptr); + kdb_printf("\n** In Memory Panic Stackshot Succeeded ** Bytes Traced %zu **\n", bytes_traced); + + /* Used by the code that writes the buffer to disk */ + panic_stackshot_buf = (vm_offset_t) stackshot_begin_loc; panic_stackshot_len = bytes_traced; - kdb_printf("File backed panic stackshot succeeded, length: %u bytes\n", bytes_traced); + + if (!extended_debug_log_enabled && (gIOPolledCoreFileMode == kIOPolledCoreFileModeStackshot)) { + /* System configured to write panic stackshot to disk */ + kern_dump(KERN_DUMP_STACKSHOT_DISK); + } } else { - bytes_used = (int) kcdata_memory_get_used_bytes(&kc_panic_data); if (bytes_used > 0) { - kdb_printf("File backed panic stackshot incomplete, consumed %u bytes, error : %d \n", bytes_used, err); + /* Erase the stackshot data (this region is pre-populated with the NULL character) */ + memset(stackshot_begin_loc, '\0', bytes_used); + panic_info->mph_panic_flags |= MACOS_PANIC_HEADER_FLAG_STACKSHOT_FAILED_INCOMPLETE; + + panic_info->mph_other_log_offset = PE_get_offset_into_panic_region(debug_buf_ptr); + kdb_printf("\n** In Memory Panic Stackshot Incomplete ** Bytes Filled %zu ** Err %d\n", bytes_used, err); } else { - kdb_printf("File backed panic stackshot incomplete, consumed %u bytes, error : %d \n", bytes_used, err); + bzero(stackshot_begin_loc, bytes_used); + panic_info->mph_panic_flags |= MACOS_PANIC_HEADER_FLAG_STACKSHOT_FAILED_ERROR; + + panic_info->mph_other_log_offset = PE_get_offset_into_panic_region(debug_buf_ptr); + kdb_printf("\n** In Memory Panic Stackshot Failed ** Bytes Traced %zu, err %d\n", bytes_traced, err); } } -#endif /* DEVELOPMENT || DEBUG */ return; } @@ -991,9 +981,7 @@ SavePanicInfo( * Flush the panic log again with the stackshot or any relevant logging * from when we tried to capture it. */ - if (extended_debug_log_enabled) { - paniclog_flush_internal(kPaniclogFlushStackshot); - } + paniclog_flush_internal(kPaniclogFlushStackshot); } void @@ -1261,7 +1249,7 @@ panic_i386_backtrace(void *_frame, int nframes, const char *msg, boolean_t regdu boolean_t old_doprnt_hide_pointers = doprnt_hide_pointers; if (pbtcpu != cn) { - hw_atomic_add(&pbtcnt, 1); + os_atomic_inc(&pbtcnt, relaxed); /* Spin on print backtrace lock, which serializes output * Continue anyway if a timeout occurs. */ @@ -1382,7 +1370,7 @@ out: * event of panics on multiple processors. */ hw_lock_unlock(&pbtlock); - hw_atomic_sub(&pbtcnt, 1); + os_atomic_dec(&pbtcnt, relaxed); /* Wait for other processors to complete output * Timeout and continue after PBT_TIMEOUT_CYCLES. */ @@ -1562,7 +1550,7 @@ print_launchd_info(void) int cn = cpu_number(); if (pbtcpu != cn) { - hw_atomic_add(&pbtcnt, 1); + os_atomic_inc(&pbtcnt, relaxed); /* Spin on print backtrace lock, which serializes output * Continue anyway if a timeout occurs. */ @@ -1581,7 +1569,7 @@ print_launchd_info(void) * event of panics on multiple processors. */ hw_lock_unlock(&pbtlock); - hw_atomic_sub(&pbtcnt, 1); + os_atomic_dec(&pbtcnt, relaxed); /* Wait for other processors to complete output * Timeout and continue after PBT_TIMEOUT_CYCLES. */ diff --git a/osfmk/i386/Makefile b/osfmk/i386/Makefile index 12e9c6025..e46ad5748 100644 --- a/osfmk/i386/Makefile +++ b/osfmk/i386/Makefile @@ -25,6 +25,7 @@ EXPORT_ONLY_FILES = \ locks_i386_inlines.h \ machine_routines.h \ machine_cpu.h \ + memory_types.h \ mtrr.h \ mp.h \ mp_desc.h \ @@ -32,7 +33,7 @@ EXPORT_ONLY_FILES = \ pal_native.h \ pal_routines.h \ pal_hibernate.h \ - panic_hooks.h \ + panic_hooks.h \ pmCPU.h \ pmap.h \ proc_reg.h \ diff --git a/osfmk/i386/acpi.c b/osfmk/i386/acpi.c index 10ab92123..ee93c2eb8 100644 --- a/osfmk/i386/acpi.c +++ b/osfmk/i386/acpi.c @@ -367,17 +367,21 @@ acpi_sleep_kernel(acpi_sleep_callback func, void *refcon) /* Restart timer interrupts */ rtc_timer_start(); -#if HIBERNATION +#if MONOTONIC + mt_cpu_up(cdp); +#endif /* MONOTONIC */ + +#if HIBERNATION kprintf("ret from acpi_sleep_cpu hib=%d\n", did_hibernate); -#endif +#endif /* HIBERNATION */ #if CONFIG_SLEEP /* Becase we don't save the bootstrap page, and we share it * between sleep and mp slave init, we need to recreate it * after coming back from sleep or hibernate */ install_real_mode_bootstrap(slave_pstart); -#endif +#endif /* CONFIG_SLEEP */ } /* @@ -402,9 +406,17 @@ acpi_idle_kernel(acpi_sleep_callback func, void *refcon) assert(cpu_number() == master_cpu); +#if MONOTONIC + mt_cpu_down(cpu_datap(0)); +#endif /* MONOTONIC */ + /* Cancel any pending deadline */ setPop(0); - while (lapic_is_interrupting(LAPIC_TIMER_VECTOR)) { + while (lapic_is_interrupting(LAPIC_TIMER_VECTOR) +#if MONOTONIC + || lapic_is_interrupting(LAPIC_VECTOR(PERFCNT)) +#endif /* MONOTONIC */ + ) { (void) ml_set_interrupts_enabled(TRUE); setPop(0); ml_set_interrupts_enabled(FALSE); diff --git a/osfmk/i386/asm.h b/osfmk/i386/asm.h index b04ac6a7d..50905a62e 100644 --- a/osfmk/i386/asm.h +++ b/osfmk/i386/asm.h @@ -1,8 +1,8 @@ /* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2018 Apple Computer, Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * + * * This file contains Original Code and/or Modifications of Original Code * as defined in and that are subject to the Apple Public Source License * Version 2.0 (the 'License'). You may not use this file except in @@ -11,10 +11,10 @@ * unlawful or unlicensed copies of an Apple operating system, or to * circumvent, violate, or enable the circumvention or violation of, any * terms of an Apple operating system software license agreement. - * + * * Please obtain a copy of the License at * http://www.opensource.apple.com/apsl/ and read it before using this file. - * + * * The Original Code and all software distributed under the License are * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, @@ -22,34 +22,34 @@ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. * Please see the License for the specific language governing rights and * limitations under the License. - * + * * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ /* * @OSF_COPYRIGHT@ */ -/* +/* * Mach Operating System * Copyright (c) 1991,1990,1989 Carnegie Mellon University * All Rights Reserved. - * + * * Permission to use, copy, modify and distribute this software and its * documentation is hereby granted, provided that both the copyright * notice and this permission notice appear in all copies of the * software, derivative works or modified versions, and any portions * thereof, and that both notices appear in supporting documentation. - * + * * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. - * + * * Carnegie Mellon requests users of this software to return to - * + * * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU * School of Computer Science * Carnegie Mellon University * Pittsburgh PA 15213-3890 - * + * * any improvements or extensions that they make and grant Carnegie Mellon * the rights to redistribute these changes. */ @@ -57,14 +57,6 @@ #ifndef _I386_ASM_H_ #define _I386_ASM_H_ -#ifdef _KERNEL -#include -#endif /* _KERNEL */ - -#if defined(MACH_KERNEL) || defined(_KERNEL) -#include -#endif /* MACH_KERNEL || _KERNEL */ - #if defined(__i386__) #define S_PC (%esp) @@ -153,10 +145,9 @@ #define data16 .byte 0x66 #define addr16 .byte 0x67 -#if !GPROF #define MCOUNT -#elif defined(__SHARED__) +#if defined(__SHARED__) #define MCOUNT ; .data;\ .align ALIGN;\ LBc(x, 8) .long 0;\ @@ -167,10 +158,7 @@ Egaddr(%eax,_mcount_ptr);\ Gpop;\ call *(%eax); - -#else /* !GPROF, !__SHARED__ */ -#define MCOUNT ; call mcount; -#endif /* GPROF */ +#endif /* __SHARED__ */ #ifdef __ELF__ #define ELF_FUNC(x) .type x,@function @@ -363,7 +351,7 @@ leaq (%rsp), %rsi ;\ call EXT(fn) ;\ mov (%rsp), %rsp - + #define CCALL(fn) \ mov %rsp, %r12 ;\ and $0xFFFFFFFFFFFFFFF0, %rsp ;\ diff --git a/osfmk/i386/atomic.h b/osfmk/i386/atomic.h index 75ce5c5a9..bd2a0c703 100644 --- a/osfmk/i386/atomic.h +++ b/osfmk/i386/atomic.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015 Apple Inc. All rights reserved. + * Copyright (c) 2015-2018 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -26,47 +26,13 @@ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ -#ifndef _I386_ATOMIC_H_ -#define _I386_ATOMIC_H_ - -#include - -#if __SMP__ - -#define memory_order_consume_smp memory_order_consume -#define memory_order_acquire_smp memory_order_acquire -#define memory_order_release_smp memory_order_release -#define memory_order_acq_rel_smp memory_order_acq_rel -#define memory_order_seq_cst_smp memory_order_seq_cst - -#else - -#define memory_order_consume_smp memory_order_relaxed -#define memory_order_acquire_smp memory_order_relaxed -#define memory_order_release_smp memory_order_relaxed -#define memory_order_acq_rel_smp memory_order_relaxed -#define memory_order_seq_cst_smp memory_order_relaxed - +#ifndef _MACHINE_ATOMIC_H +#error "Do not include directly, use " #endif -#ifdef ATOMIC_PRIVATE - -static inline boolean_t -atomic_compare_exchange(uintptr_t *target, uintptr_t oldval, uintptr_t newval, - enum memory_order ord, boolean_t wait) -{ - (void)wait; - return __c11_atomic_compare_exchange_strong((_Atomic uintptr_t *)target, &oldval, newval, ord, memory_order_relaxed); -} - -static inline boolean_t -atomic_compare_exchange32(uint32_t *target, uint32_t oldval, uint32_t newval, - enum memory_order ord, boolean_t wait) -{ - (void)wait; - return __c11_atomic_compare_exchange_strong((_Atomic uint32_t *)target, &oldval, newval, ord, memory_order_relaxed); -} +#ifndef _I386_ATOMIC_H_ +#define _I386_ATOMIC_H_ -#endif // ATOMIC_PRIVATE +/* No special configuration for Intel */ #endif // _I386_ATOMIC_H_ diff --git a/osfmk/i386/bsd_i386.c b/osfmk/i386/bsd_i386.c index a1d3b4965..039a31bb6 100644 --- a/osfmk/i386/bsd_i386.c +++ b/osfmk/i386/bsd_i386.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2016 Apple Inc. All rights reserved. + * Copyright (c) 2000-2019 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -451,7 +451,17 @@ mach_call_munger(x86_saved_state_t *state) int call_number; mach_call_t mach_call; kern_return_t retval; - struct mach_call_args args = { 0, 0, 0, 0, 0, 0, 0, 0, 0 }; + struct mach_call_args args = { + .arg1 = 0, + .arg2 = 0, + .arg3 = 0, + .arg4 = 0, + .arg5 = 0, + .arg6 = 0, + .arg7 = 0, + .arg8 = 0, + .arg9 = 0 + }; x86_saved_state32_t *regs; struct uthread *ut = get_bsdthread_info(current_thread()); @@ -542,7 +552,17 @@ mach_call_munger64(x86_saved_state_t *state) int call_number; int argc; mach_call_t mach_call; - struct mach_call_args args = { 0, 0, 0, 0, 0, 0, 0, 0, 0 }; + struct mach_call_args args = { + .arg1 = 0, + .arg2 = 0, + .arg3 = 0, + .arg4 = 0, + .arg5 = 0, + .arg6 = 0, + .arg7 = 0, + .arg8 = 0, + .arg9 = 0 + }; x86_saved_state64_t *regs; struct uthread *ut = get_bsdthread_info(current_thread()); @@ -574,8 +594,7 @@ mach_call_munger64(x86_saved_state_t *state) argc = mach_trap_table[call_number].mach_trap_arg_count; if (argc) { int args_in_regs = MIN(6, argc); - - memcpy(&args.arg1, ®s->rdi, args_in_regs * sizeof(syscall_arg_t)); + __nochk_memcpy(&args.arg1, ®s->rdi, args_in_regs * sizeof(syscall_arg_t)); if (argc > 6) { int copyin_count; diff --git a/osfmk/i386/commpage/commpage.c b/osfmk/i386/commpage/commpage.c index 67f1ea084..2c4a40d83 100644 --- a/osfmk/i386/commpage/commpage.c +++ b/osfmk/i386/commpage/commpage.c @@ -375,6 +375,16 @@ commpage_init_cpu_capabilities( void ) CPUID_LEAF7_FEATURE_AVX512IFMA); setif(bits, kHasAVX512VBMI, cpuid_leaf7_features() & CPUID_LEAF7_FEATURE_AVX512VBMI); + setif(bits, kHasVAES, cpuid_leaf7_features() & + CPUID_LEAF7_FEATURE_VAES); + setif(bits, kHasVPCLMULQDQ, cpuid_leaf7_features() & + CPUID_LEAF7_FEATURE_VPCLMULQDQ); + setif(bits, kHasAVX512VNNI, cpuid_leaf7_features() & + CPUID_LEAF7_FEATURE_AVX512VNNI); + setif(bits, kHasAVX512BITALG, cpuid_leaf7_features() & + CPUID_LEAF7_FEATURE_AVX512BITALG); + setif(bits, kHasAVX512VPOPCNTDQ, cpuid_leaf7_features() & + CPUID_LEAF7_FEATURE_AVX512VPCDQ); } uint64_t misc_enable = rdmsr64(MSR_IA32_MISC_ENABLE); @@ -482,6 +492,7 @@ commpage_stuff_routine( commpage_stuff(rd->commpage_address, rd->code_address, rd->code_length); } + /* Fill in the 32- or 64-bit commpage. Called once for each. */ @@ -568,7 +579,7 @@ commpage_populate( void ) _COMM_PAGE32_BASE_ADDRESS, &time_data32, >od_time_data32, - "commpage 32-bit", + _COMM_PAGE32_SIGNATURE_STRING, VM_PROT_READ); #ifndef __LP64__ pmap_commpage32_init((vm_offset_t) commPagePtr32, _COMM_PAGE32_BASE_ADDRESS, @@ -584,7 +595,7 @@ commpage_populate( void ) _COMM_PAGE32_START_ADDRESS, /* commpage address are relative to 32-bit commpage placement */ &time_data64, >od_time_data64, - "commpage 64-bit", + _COMM_PAGE64_SIGNATURE_STRING, VM_PROT_READ); #ifndef __LP64__ pmap_commpage64_init((vm_offset_t) commPagePtr64, _COMM_PAGE64_BASE_ADDRESS, @@ -883,6 +894,54 @@ commpage_update_atm_diagnostic_config(uint32_t diagnostic_config) } } +/* + * update the commpage with if dtrace user land probes are enabled + */ +void +commpage_update_dof(boolean_t enabled) +{ +#if CONFIG_DTRACE + char *cp; + + cp = commPagePtr32; + if (cp) { + cp += (_COMM_PAGE_DTRACE_DOF_ENABLED - _COMM_PAGE32_BASE_ADDRESS); + *cp = (enabled ? 1 : 0); + } + + cp = commPagePtr64; + if (cp) { + cp += (_COMM_PAGE_DTRACE_DOF_ENABLED - _COMM_PAGE32_START_ADDRESS); + *cp = (enabled ? 1 : 0); + } +#else + (void)enabled; +#endif +} + + +/* + * update the dyld global config flags + */ +void +commpage_update_dyld_flags(uint64_t value) +{ + char *cp; + + cp = commPagePtr32; + if (cp) { + cp += (_COMM_PAGE_DYLD_SYSTEM_FLAGS - _COMM_PAGE32_BASE_ADDRESS); + *(uint64_t *)cp = value; + } + + cp = commPagePtr64; + if (cp) { + cp += (_COMM_PAGE_DYLD_SYSTEM_FLAGS - _COMM_PAGE32_BASE_ADDRESS); + *(uint64_t *)cp = value; + } +} + + /* * update the commpage data for last known value of mach_absolute_time() */ diff --git a/osfmk/i386/commpage/commpage.h b/osfmk/i386/commpage/commpage.h index 2dc782686..2bf2a41f9 100644 --- a/osfmk/i386/commpage/commpage.h +++ b/osfmk/i386/commpage/commpage.h @@ -157,6 +157,8 @@ extern void commpage_update_mach_continuous_time(uint64_t sleeptime); extern void commpage_update_boottime(uint64_t boottime_usec); extern void commpage_update_kdebug_state(void); extern void commpage_update_atm_diagnostic_config(uint32_t); +extern void commpage_update_dof(boolean_t enabled); +extern void commpage_update_dyld_flags(uint64_t value); extern void commpage_post_ucode_update(void); extern uint32_t commpage_is_in_pfz32(uint32_t); diff --git a/osfmk/i386/cpu.c b/osfmk/i386/cpu.c index 517c229c8..bad6b1016 100644 --- a/osfmk/i386/cpu.c +++ b/osfmk/i386/cpu.c @@ -93,6 +93,7 @@ cpu_sleep(void) { cpu_data_t *cdp = current_cpu_datap(); + /* This calls IOCPURunPlatformQuiesceActions when sleeping the boot cpu */ PE_cpu_machine_quiesce(cdp->cpu_id); cpu_thread_halt(); diff --git a/osfmk/i386/cpu_capabilities.h b/osfmk/i386/cpu_capabilities.h index 991bf1b34..89f8fc52d 100644 --- a/osfmk/i386/cpu_capabilities.h +++ b/osfmk/i386/cpu_capabilities.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2003-2009 Apple Inc. All rights reserved. + * Copyright (c) 2003-2019 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -73,7 +73,6 @@ #define kHasADX 0x0000000400000000ULL #define kHasMPX 0x0000001000000000ULL #define kHasSGX 0x0000002000000000ULL -#if !defined(RC_HIDE_XNU_J137) #define kHasAVX512F 0x0000004000000000ULL #define kHasAVX512CD 0x0000008000000000ULL #define kHasAVX512DQ 0x0000010000000000ULL @@ -81,7 +80,11 @@ #define kHasAVX512IFMA 0x0000040000000000ULL #define kHasAVX512VBMI 0x0000080000000000ULL #define kHasAVX512VL 0x0000100000000000ULL -#endif /* not RC_HIDE_XNU_J137 */ +#define kHasVAES 0x0000200000000000ULL +#define kHasVPCLMULQDQ 0x0000400000000000ULL +#define kHasAVX512VNNI 0x0000800000000000ULL +#define kHasAVX512BITALG 0x0001000000000000ULL +#define kHasAVX512VPOPCNTDQ 0x0002000000000000ULL #ifndef __ASSEMBLER__ @@ -192,7 +195,7 @@ _NumCPUs( void ) #define _COMM_PAGE_ACTIVE_CPUS (_COMM_PAGE_START_ADDRESS+0x034) /* uint8_t number of active CPUs (hw.activecpu) */ #define _COMM_PAGE_PHYSICAL_CPUS (_COMM_PAGE_START_ADDRESS+0x035) /* uint8_t number of physical CPUs (hw.physicalcpu_max) */ -#define _COMM_PAGE_LOGICAL_CPUS (_COMM_PAGE_START_ADDRESS+0x036) /* uint8_t number of logical CPUs (hw.logicalcpu_max) */ +#define _COMM_PAGE_LOGICAL_CPUS (_COMM_PAGE_START_ADDRESS+0x036) /* uint8_t number of logical CPUs (hw.logicalcpu_max) */ #define _COMM_PAGE_UNUSED1 (_COMM_PAGE_START_ADDRESS+0x037) /* 1 unused bytes */ #define _COMM_PAGE_MEMORY_SIZE (_COMM_PAGE_START_ADDRESS+0x038) /* uint64_t max memory size */ @@ -200,7 +203,8 @@ _NumCPUs( void ) #define _COMM_PAGE_KDEBUG_ENABLE (_COMM_PAGE_START_ADDRESS+0x044) /* uint32_t export "kdebug_enable" to userspace */ #define _COMM_PAGE_ATM_DIAGNOSTIC_CONFIG (_COMM_PAGE_START_ADDRESS+0x48) /* uint32_t export "atm_diagnostic_config" to userspace */ -#define _COMM_PAGE_UNUSED2 (_COMM_PAGE_START_ADDRESS+0x04C) /* [0x4C,0x50) unused */ +#define _COMM_PAGE_DTRACE_DOF_ENABLED (_COMM_PAGE_START_ADDRESS+0x04C) /* uint8_t 0 if userspace DOF disable, 1 if enabled */ +#define _COMM_PAGE_UNUSED2 (_COMM_PAGE_START_ADDRESS+0x04D) /* [0x4D,0x50) unused */ #define _COMM_PAGE_TIME_DATA_START (_COMM_PAGE_START_ADDRESS+0x050) /* base of offsets below (_NT_SCALE etc) */ #define _COMM_PAGE_NT_TSC_BASE (_COMM_PAGE_START_ADDRESS+0x050) /* used by nanotime() */ @@ -221,6 +225,9 @@ _NumCPUs( void ) #define _COMM_PAGE_BOOTTIME_USEC (_COMM_PAGE_START_ADDRESS+0x0C8) /* uint64_t boottime */ #define _COMM_PAGE_NEWTIMEOFDAY_DATA (_COMM_PAGE_START_ADDRESS+0x0D0) /* used by gettimeofday(). Currently, sizeof(new_commpage_timeofday_data_t) = 40 */ +/* Resume packed values to the next cacheline */ +#define _COMM_PAGE_DYLD_SYSTEM_FLAGS (_COMM_PAGE_START_ADDRESS+0x100) /* uint64_t export kern.dyld_system_flags to userspace */ + #define _COMM_PAGE_END (_COMM_PAGE_START_ADDRESS+0xfff) /* end of common page */ /* Warning: kernel commpage.h has a matching c typedef for the following. They must be kept in sync. */ diff --git a/osfmk/i386/cpu_data.h b/osfmk/i386/cpu_data.h index da7919cda..a479eaea8 100644 --- a/osfmk/i386/cpu_data.h +++ b/osfmk/i386/cpu_data.h @@ -34,6 +34,7 @@ #define I386_CPU_DATA #include +#include #include #include @@ -436,7 +437,7 @@ get_active_thread_volatile(void) CPU_DATA_GET(cpu_active_thread, thread_t) } -static inline __pure2 thread_t +static inline __attribute__((const)) thread_t get_active_thread(void) { CPU_DATA_GET(cpu_active_thread, thread_t) @@ -630,6 +631,7 @@ disable_preemption_internal(void) { assert(get_preemption_level() >= 0); + os_compiler_barrier(release); #if defined(__clang__) cpu_data_t GS_RELATIVE *cpu_data = (cpu_data_t GS_RELATIVE *)0UL; cpu_data->cpu_preemption_level++; @@ -638,6 +640,7 @@ disable_preemption_internal(void) : : "i" (offsetof(cpu_data_t, cpu_preemption_level))); #endif + os_compiler_barrier(acquire); pltrace(FALSE); } @@ -646,6 +649,7 @@ enable_preemption_internal(void) { assert(get_preemption_level() > 0); pltrace(TRUE); + os_compiler_barrier(release); #if defined(__clang__) cpu_data_t GS_RELATIVE *cpu_data = (cpu_data_t GS_RELATIVE *)0UL; if (0 == --cpu_data->cpu_preemption_level) { @@ -660,6 +664,7 @@ enable_preemption_internal(void) : "i" (offsetof(cpu_data_t, cpu_preemption_level)) : "eax", "ecx", "edx", "cc", "memory"); #endif + os_compiler_barrier(acquire); } static inline void @@ -668,6 +673,7 @@ enable_preemption_no_check(void) assert(get_preemption_level() > 0); pltrace(TRUE); + os_compiler_barrier(release); #if defined(__clang__) cpu_data_t GS_RELATIVE *cpu_data = (cpu_data_t GS_RELATIVE *)0UL; cpu_data->cpu_preemption_level--; @@ -677,6 +683,7 @@ enable_preemption_no_check(void) : "i" (offsetof(cpu_data_t, cpu_preemption_level)) : "cc", "memory"); #endif + os_compiler_barrier(acquire); } static inline void diff --git a/osfmk/i386/cpuid.c b/osfmk/i386/cpuid.c index 75da80e5f..0fafb3aad 100644 --- a/osfmk/i386/cpuid.c +++ b/osfmk/i386/cpuid.c @@ -255,8 +255,6 @@ do_cwas(i386_cpu_info_t *cpuinfo, boolean_t on_slave) * Workaround for reclaiming perf counter 3 due to TSX memory ordering erratum. * This workaround does not support being forcibly set (since an MSR must be * enumerated, lest we #GP when forced to access it.) - * When RTM_FORCE_FORCE is enabled all RTM transactions on the logical CPU will - * forcefully abort, but the general purpose counter 3 will report correct values. */ if (cpuid_wa_required(CPU_INTEL_TSXFA) == CWA_ON) { wrmsr64(MSR_IA32_TSX_FORCE_ABORT, @@ -929,43 +927,46 @@ cpuid_set_info(void) } /* cpuid_set_cache_info must be invoked after set_generic_info */ - if (info_p->cpuid_cpufamily == CPUFAMILY_INTEL_PENRYN) { - cpuid_set_cache_info(info_p); - } - /* * Find the number of enabled cores and threads * (which determines whether SMT/Hyperthreading is active). */ - switch (info_p->cpuid_cpufamily) { - case CPUFAMILY_INTEL_PENRYN: - info_p->core_count = info_p->cpuid_cores_per_package; - info_p->thread_count = info_p->cpuid_logical_per_package; - break; - case CPUFAMILY_INTEL_WESTMERE: { - uint64_t msr = rdmsr64(MSR_CORE_THREAD_COUNT); - info_p->core_count = bitfield32((uint32_t)msr, 19, 16); - info_p->thread_count = bitfield32((uint32_t)msr, 15, 0); - break; - } - default: { - uint64_t msr = rdmsr64(MSR_CORE_THREAD_COUNT); - if (msr == 0) { - /* Provide a non-zero default for some VMMs */ - msr = (1 << 16) + 1; - } - info_p->core_count = bitfield32((uint32_t)msr, 31, 16); - info_p->thread_count = bitfield32((uint32_t)msr, 15, 0); - break; - } - } - if (info_p->core_count == 0) { - info_p->core_count = info_p->cpuid_cores_per_package; - info_p->thread_count = info_p->cpuid_logical_per_package; - } - if (info_p->cpuid_cpufamily != CPUFAMILY_INTEL_PENRYN) { + if (0 != (info_p->cpuid_features & CPUID_FEATURE_VMM) && + PE_parse_boot_argn("-nomsr35h", NULL, 0)) { + info_p->core_count = 1; + info_p->thread_count = 1; cpuid_set_cache_info(info_p); + } else { + switch (info_p->cpuid_cpufamily) { + case CPUFAMILY_INTEL_PENRYN: + cpuid_set_cache_info(info_p); + info_p->core_count = info_p->cpuid_cores_per_package; + info_p->thread_count = info_p->cpuid_logical_per_package; + break; + case CPUFAMILY_INTEL_WESTMERE: { + uint64_t msr = rdmsr64(MSR_CORE_THREAD_COUNT); + if (0 == msr) { + /* Provide a non-zero default for some VMMs */ + msr = (1 << 16) | 1; + } + info_p->core_count = bitfield32((uint32_t)msr, 19, 16); + info_p->thread_count = bitfield32((uint32_t)msr, 15, 0); + cpuid_set_cache_info(info_p); + break; + } + default: { + uint64_t msr = rdmsr64(MSR_CORE_THREAD_COUNT); + if (0 == msr) { + /* Provide a non-zero default for some VMMs */ + msr = (1 << 16) | 1; + } + info_p->core_count = bitfield32((uint32_t)msr, 31, 16); + info_p->thread_count = bitfield32((uint32_t)msr, 15, 0); + cpuid_set_cache_info(info_p); + break; + } + } } DBG("cpuid_set_info():\n"); @@ -1093,7 +1094,11 @@ static struct table { {CPUID_LEAF7_FEATURE_OSPKE, "OSPKE"}, {CPUID_LEAF7_FEATURE_WAITPKG, "WAITPKG"}, {CPUID_LEAF7_FEATURE_GFNI, "GFNI"}, - {CPUID_LEAF7_FEATURE_AVX512VPCDQ, "AVX512VPCDQ"}, + {CPUID_LEAF7_FEATURE_VAES, "VAES"}, + {CPUID_LEAF7_FEATURE_VPCLMULQDQ, "VPCLMULQDQ"}, + {CPUID_LEAF7_FEATURE_AVX512VNNI, "AVX512VNNI"}, + {CPUID_LEAF7_FEATURE_AVX512BITALG, "AVX512BITALG"}, + {CPUID_LEAF7_FEATURE_AVX512VPCDQ, "AVX512VPOPCNTDQ"}, {CPUID_LEAF7_FEATURE_RDPID, "RDPID"}, {CPUID_LEAF7_FEATURE_CLDEMOTE, "CLDEMOTE"}, {CPUID_LEAF7_FEATURE_MOVDIRI, "MOVDIRI"}, @@ -1104,6 +1109,7 @@ static struct table { leaf7_extfeature_map[] = { { CPUID_LEAF7_EXTFEATURE_AVX5124VNNIW, "AVX5124VNNIW" }, { CPUID_LEAF7_EXTFEATURE_AVX5124FMAPS, "AVX5124FMAPS" }, + { CPUID_LEAF7_EXTFEATURE_FSREPMOV, "FSREPMOV" }, { CPUID_LEAF7_EXTFEATURE_MDCLEAR, "MDCLEAR" }, { CPUID_LEAF7_EXTFEATURE_TSXFA, "TSXFA" }, { CPUID_LEAF7_EXTFEATURE_IBRS, "IBRS" }, diff --git a/osfmk/i386/cpuid.h b/osfmk/i386/cpuid.h index 3af0e20ef..c80308084 100644 --- a/osfmk/i386/cpuid.h +++ b/osfmk/i386/cpuid.h @@ -129,55 +129,60 @@ * Leaf 7, subleaf 0 additional features. * Bits returned in %ebx:%ecx to a CPUID request with {%eax,%ecx} of (0x7,0x0}: */ -#define CPUID_LEAF7_FEATURE_RDWRFSGS _Bit(0) /* FS/GS base read/write */ -#define CPUID_LEAF7_FEATURE_TSCOFF _Bit(1) /* TSC thread offset */ -#define CPUID_LEAF7_FEATURE_SGX _Bit(2) /* Software Guard eXtensions */ -#define CPUID_LEAF7_FEATURE_BMI1 _Bit(3) /* Bit Manipulation Instrs, set 1 */ -#define CPUID_LEAF7_FEATURE_HLE _Bit(4) /* Hardware Lock Elision*/ -#define CPUID_LEAF7_FEATURE_AVX2 _Bit(5) /* AVX2 Instructions */ -#define CPUID_LEAF7_FEATURE_FDPEO _Bit(6) /* x87 FPU Data Pointer updated only on x87 exceptions */ -#define CPUID_LEAF7_FEATURE_SMEP _Bit(7) /* Supervisor Mode Execute Protect */ -#define CPUID_LEAF7_FEATURE_BMI2 _Bit(8) /* Bit Manipulation Instrs, set 2 */ -#define CPUID_LEAF7_FEATURE_ERMS _Bit(9) /* Enhanced Rep Movsb/Stosb */ -#define CPUID_LEAF7_FEATURE_INVPCID _Bit(10) /* INVPCID intruction, TDB */ -#define CPUID_LEAF7_FEATURE_RTM _Bit(11) /* RTM */ -#define CPUID_LEAF7_FEATURE_PQM _Bit(12) /* Platform Qos Monitoring */ -#define CPUID_LEAF7_FEATURE_FPU_CSDS _Bit(13) /* FPU CS/DS deprecation */ -#define CPUID_LEAF7_FEATURE_MPX _Bit(14) /* Memory Protection eXtensions */ -#define CPUID_LEAF7_FEATURE_PQE _Bit(15) /* Platform Qos Enforcement */ -#define CPUID_LEAF7_FEATURE_AVX512F _Bit(16) /* AVX512F instructions */ -#define CPUID_LEAF7_FEATURE_AVX512DQ _Bit(17) /* AVX512DQ instructions */ -#define CPUID_LEAF7_FEATURE_RDSEED _Bit(18) /* RDSEED Instruction */ -#define CPUID_LEAF7_FEATURE_ADX _Bit(19) /* ADX Instructions */ -#define CPUID_LEAF7_FEATURE_SMAP _Bit(20) /* Supervisor Mode Access Protect */ -#define CPUID_LEAF7_FEATURE_AVX512IFMA _Bit(21) /* AVX512IFMA instructions */ -#define CPUID_LEAF7_FEATURE_CLFSOPT _Bit(23) /* CLFSOPT */ -#define CPUID_LEAF7_FEATURE_CLWB _Bit(24) /* CLWB */ -#define CPUID_LEAF7_FEATURE_IPT _Bit(25) /* Intel Processor Trace */ -#define CPUID_LEAF7_FEATURE_AVX512CD _Bit(28) /* AVX512CD instructions */ -#define CPUID_LEAF7_FEATURE_SHA _Bit(29) /* SHA instructions */ -#define CPUID_LEAF7_FEATURE_AVX512BW _Bit(30) /* AVX512BW instructions */ -#define CPUID_LEAF7_FEATURE_AVX512VL _Bit(31) /* AVX512VL instructions */ - -#define CPUID_LEAF7_FEATURE_PREFETCHWT1 _HBit(0)/* Prefetch Write/T1 hint */ -#define CPUID_LEAF7_FEATURE_AVX512VBMI _HBit(1)/* AVX512VBMI instructions */ -#define CPUID_LEAF7_FEATURE_UMIP _HBit(2) /* User Mode Instruction Prevention */ -#define CPUID_LEAF7_FEATURE_PKU _HBit(3) /* Protection Keys for Usermode */ -#define CPUID_LEAF7_FEATURE_OSPKE _HBit(4) /* OS has enabled PKE */ -#define CPUID_LEAF7_FEATURE_WAITPKG _HBit(5) /* WAITPKG instructions */ -#define CPUID_LEAF7_FEATURE_GFNI _HBit(8) /* Galois Field New Instructions */ -#define CPUID_LEAF7_FEATURE_AVX512VPCDQ _HBit(14) /* AVX512 VPOPCNTDQ instruction */ -#define CPUID_LEAF7_FEATURE_RDPID _HBit(22) /* RDPID and IA32_TSC_AUX */ -#define CPUID_LEAF7_FEATURE_CLDEMOTE _HBit(25) /* Cache line demote */ -#define CPUID_LEAF7_FEATURE_MOVDIRI _HBit(27) /* MOVDIRI instruction */ -#define CPUID_LEAF7_FEATURE_MOVDIRI64B _HBit(28) /* MOVDIRI64B instruction */ -#define CPUID_LEAF7_FEATURE_SGXLC _HBit(30) /* SGX Launch Configuration */ +#define CPUID_LEAF7_FEATURE_RDWRFSGS _Bit(0) /* FS/GS base read/write */ +#define CPUID_LEAF7_FEATURE_TSCOFF _Bit(1) /* TSC thread offset */ +#define CPUID_LEAF7_FEATURE_SGX _Bit(2) /* Software Guard eXtensions */ +#define CPUID_LEAF7_FEATURE_BMI1 _Bit(3) /* Bit Manipulation Instrs, set 1 */ +#define CPUID_LEAF7_FEATURE_HLE _Bit(4) /* Hardware Lock Elision*/ +#define CPUID_LEAF7_FEATURE_AVX2 _Bit(5) /* AVX2 Instructions */ +#define CPUID_LEAF7_FEATURE_FDPEO _Bit(6) /* x87 FPU Data Pointer updated only on x87 exceptions */ +#define CPUID_LEAF7_FEATURE_SMEP _Bit(7) /* Supervisor Mode Execute Protect */ +#define CPUID_LEAF7_FEATURE_BMI2 _Bit(8) /* Bit Manipulation Instrs, set 2 */ +#define CPUID_LEAF7_FEATURE_ERMS _Bit(9) /* Enhanced Rep Movsb/Stosb */ +#define CPUID_LEAF7_FEATURE_INVPCID _Bit(10) /* INVPCID intruction, TDB */ +#define CPUID_LEAF7_FEATURE_RTM _Bit(11) /* RTM */ +#define CPUID_LEAF7_FEATURE_PQM _Bit(12) /* Platform Qos Monitoring */ +#define CPUID_LEAF7_FEATURE_FPU_CSDS _Bit(13) /* FPU CS/DS deprecation */ +#define CPUID_LEAF7_FEATURE_MPX _Bit(14) /* Memory Protection eXtensions */ +#define CPUID_LEAF7_FEATURE_PQE _Bit(15) /* Platform Qos Enforcement */ +#define CPUID_LEAF7_FEATURE_AVX512F _Bit(16) /* AVX512F instructions */ +#define CPUID_LEAF7_FEATURE_AVX512DQ _Bit(17) /* AVX512DQ instructions */ +#define CPUID_LEAF7_FEATURE_RDSEED _Bit(18) /* RDSEED Instruction */ +#define CPUID_LEAF7_FEATURE_ADX _Bit(19) /* ADX Instructions */ +#define CPUID_LEAF7_FEATURE_SMAP _Bit(20) /* Supervisor Mode Access Protect */ +#define CPUID_LEAF7_FEATURE_AVX512IFMA _Bit(21) /* AVX512IFMA instructions */ +#define CPUID_LEAF7_FEATURE_CLFSOPT _Bit(23) /* CLFSOPT */ +#define CPUID_LEAF7_FEATURE_CLWB _Bit(24) /* CLWB */ +#define CPUID_LEAF7_FEATURE_IPT _Bit(25) /* Intel Processor Trace */ +#define CPUID_LEAF7_FEATURE_AVX512CD _Bit(28) /* AVX512CD instructions */ +#define CPUID_LEAF7_FEATURE_SHA _Bit(29) /* SHA instructions */ +#define CPUID_LEAF7_FEATURE_AVX512BW _Bit(30) /* AVX512BW instructions */ +#define CPUID_LEAF7_FEATURE_AVX512VL _Bit(31) /* AVX512VL instructions */ + +#define CPUID_LEAF7_FEATURE_PREFETCHWT1 _HBit(0) /* Prefetch Write/T1 hint */ +#define CPUID_LEAF7_FEATURE_AVX512VBMI _HBit(1) /* AVX512VBMI instructions */ +#define CPUID_LEAF7_FEATURE_UMIP _HBit(2) /* User Mode Instruction Prevention */ +#define CPUID_LEAF7_FEATURE_PKU _HBit(3) /* Protection Keys for Usermode */ +#define CPUID_LEAF7_FEATURE_OSPKE _HBit(4) /* OS has enabled PKE */ +#define CPUID_LEAF7_FEATURE_WAITPKG _HBit(5) /* WAITPKG instructions */ +#define CPUID_LEAF7_FEATURE_GFNI _HBit(8) /* Galois Field New Instructions */ +#define CPUID_LEAF7_FEATURE_VAES _HBit(9) /* Vector-encoded AES */ +#define CPUID_LEAF7_FEATURE_VPCLMULQDQ _HBit(10) /* Vector Carryless-multiply */ +#define CPUID_LEAF7_FEATURE_AVX512VNNI _HBit(11) /* AVX512 Vector Neural Net Instructions */ +#define CPUID_LEAF7_FEATURE_AVX512BITALG _HBit(12) /* AVX512 VPOPCNT{B,W} and VPSHUFBITQMB */ +#define CPUID_LEAF7_FEATURE_AVX512VPCDQ _HBit(14) /* AVX512 VPOPCNTDQ instruction */ +#define CPUID_LEAF7_FEATURE_RDPID _HBit(22) /* RDPID and IA32_TSC_AUX */ +#define CPUID_LEAF7_FEATURE_CLDEMOTE _HBit(25) /* Cache line demote */ +#define CPUID_LEAF7_FEATURE_MOVDIRI _HBit(27) /* MOVDIRI instruction */ +#define CPUID_LEAF7_FEATURE_MOVDIRI64B _HBit(28) /* MOVDIRI64B instruction */ +#define CPUID_LEAF7_FEATURE_SGXLC _HBit(30) /* SGX Launch Configuration */ /* * Values in EDX returned by CPUID Leaf 7, subleaf 0 */ #define CPUID_LEAF7_EXTFEATURE_AVX5124VNNIW _Bit(2) /* AVX512_4VNNIW */ #define CPUID_LEAF7_EXTFEATURE_AVX5124FMAPS _Bit(3) /* AVX512_4FMAPS */ +#define CPUID_LEAF7_EXTFEATURE_FSREPMOV _Bit(4) /* Fast Short REP MOV */ #define CPUID_LEAF7_EXTFEATURE_MDCLEAR _Bit(10) /* Overloaded VERW / L1D_FLUSH */ #define CPUID_LEAF7_EXTFEATURE_TSXFA _Bit(13) /* TSX RTM_FORCE_ABORT MSR */ #define CPUID_LEAF7_EXTFEATURE_IBRS _Bit(26) /* IBRS / IBPB */ diff --git a/osfmk/i386/fpu.c b/osfmk/i386/fpu.c index 0ac53d48c..82ab4423f 100644 --- a/osfmk/i386/fpu.c +++ b/osfmk/i386/fpu.c @@ -259,21 +259,6 @@ vzeroupper(void) static boolean_t fpu_thread_promote_avx512(thread_t); /* Forward */ -/* - * Define a wrapper for bcopy to defeat destination size checka. - * This is needed to treat repeated objects such as - * _STRUCT_XMM_REG fpu_ymmh0; - * ... - * _STRUCT_XMM_REG fpu_ymmh7; - * as an array and to copy like so: - * bcopy_nockch(src,&dst->fpu_ymmh0,8*sizeof(_STRUCT_XMM_REG)); - * without the compiler throwing a __builtin__memmove_chk error. - */ -static inline void -bcopy_nochk(void *_src, void *_dst, size_t _len) -{ - bcopy(_src, _dst, _len); -} /* * Furthermore, make compile-time asserts that no padding creeps into structures @@ -878,7 +863,7 @@ Retry: state->fpu_mxcsr &= mxcsr_capability_mask; - bcopy((char *)&state->fpu_fcw, (char *)ifps, fp_state_size[FP]); + __nochk_bcopy((char *)&state->fpu_fcw, (char *)ifps, fp_state_size[FP]); switch (thread_xstate(thr_act)) { case UNDEFINED_FULL: @@ -906,9 +891,9 @@ Retry: iavx->_xh.xcomp_bv = 0; if (f == x86_AVX_STATE32) { - bcopy_nochk(&xs->fpu_ymmh0, iavx->x_YMM_Hi128, 8 * sizeof(_STRUCT_XMM_REG)); + __nochk_bcopy(&xs->fpu_ymmh0, iavx->x_YMM_Hi128, 8 * sizeof(_STRUCT_XMM_REG)); } else if (f == x86_AVX_STATE64) { - bcopy_nochk(&xs->fpu_ymmh0, iavx->x_YMM_Hi128, 16 * sizeof(_STRUCT_XMM_REG)); + __nochk_bcopy(&xs->fpu_ymmh0, iavx->x_YMM_Hi128, 16 * sizeof(_STRUCT_XMM_REG)); } else { iavx->_xh.xstate_bv = (XFEM_SSE | XFEM_X87); } @@ -932,23 +917,23 @@ Retry: switch (f) { case x86_AVX512_STATE32: - bcopy_nochk(&xs.s32->fpu_k0, iavx->x_Opmask, 8 * sizeof(_STRUCT_OPMASK_REG)); - bcopy_nochk(&xs.s32->fpu_zmmh0, iavx->x_ZMM_Hi256, 8 * sizeof(_STRUCT_YMM_REG)); - bcopy_nochk(&xs.s32->fpu_ymmh0, iavx->x_YMM_Hi128, 8 * sizeof(_STRUCT_XMM_REG)); + __nochk_bcopy(&xs.s32->fpu_k0, iavx->x_Opmask, 8 * sizeof(_STRUCT_OPMASK_REG)); + __nochk_bcopy(&xs.s32->fpu_zmmh0, iavx->x_ZMM_Hi256, 8 * sizeof(_STRUCT_YMM_REG)); + __nochk_bcopy(&xs.s32->fpu_ymmh0, iavx->x_YMM_Hi128, 8 * sizeof(_STRUCT_XMM_REG)); DBG_AVX512_STATE(iavx); break; case x86_AVX_STATE32: - bcopy_nochk(&xs.s32->fpu_ymmh0, iavx->x_YMM_Hi128, 8 * sizeof(_STRUCT_XMM_REG)); + __nochk_bcopy(&xs.s32->fpu_ymmh0, iavx->x_YMM_Hi128, 8 * sizeof(_STRUCT_XMM_REG)); break; case x86_AVX512_STATE64: - bcopy_nochk(&xs.s64->fpu_k0, iavx->x_Opmask, 8 * sizeof(_STRUCT_OPMASK_REG)); - bcopy_nochk(&xs.s64->fpu_zmm16, iavx->x_Hi16_ZMM, 16 * sizeof(_STRUCT_ZMM_REG)); - bcopy_nochk(&xs.s64->fpu_zmmh0, iavx->x_ZMM_Hi256, 16 * sizeof(_STRUCT_YMM_REG)); - bcopy_nochk(&xs.s64->fpu_ymmh0, iavx->x_YMM_Hi128, 16 * sizeof(_STRUCT_XMM_REG)); + __nochk_bcopy(&xs.s64->fpu_k0, iavx->x_Opmask, 8 * sizeof(_STRUCT_OPMASK_REG)); + __nochk_bcopy(&xs.s64->fpu_zmm16, iavx->x_Hi16_ZMM, 16 * sizeof(_STRUCT_ZMM_REG)); + __nochk_bcopy(&xs.s64->fpu_zmmh0, iavx->x_ZMM_Hi256, 16 * sizeof(_STRUCT_YMM_REG)); + __nochk_bcopy(&xs.s64->fpu_ymmh0, iavx->x_YMM_Hi128, 16 * sizeof(_STRUCT_XMM_REG)); DBG_AVX512_STATE(iavx); break; case x86_AVX_STATE64: - bcopy_nochk(&xs.s64->fpu_ymmh0, iavx->x_YMM_Hi128, 16 * sizeof(_STRUCT_XMM_REG)); + __nochk_bcopy(&xs.s64->fpu_ymmh0, iavx->x_YMM_Hi128, 16 * sizeof(_STRUCT_XMM_REG)); break; } break; @@ -1024,7 +1009,7 @@ fpu_get_fxstate( * No valid floating-point state. */ - bcopy((char *)&initial_fp_state, (char *)&state->fpu_fcw, + __nochk_bcopy((char *)&initial_fp_state, (char *)&state->fpu_fcw, fp_state_size[FP]); simple_unlock(&pcb->lock); @@ -1047,7 +1032,7 @@ fpu_get_fxstate( (void)ml_set_interrupts_enabled(intr); } if (ifps->fp_valid) { - bcopy((char *)ifps, (char *)&state->fpu_fcw, fp_state_size[FP]); + __nochk_bcopy((char *)ifps, (char *)&state->fpu_fcw, fp_state_size[FP]); switch (thread_xstate(thr_act)) { case UNDEFINED_FULL: case FP_FULL: @@ -1065,9 +1050,9 @@ fpu_get_fxstate( struct x86_avx_thread_state *iavx = (void *) ifps; x86_avx_state64_t *xs = (x86_avx_state64_t *) state; if (f == x86_AVX_STATE32) { - bcopy_nochk(iavx->x_YMM_Hi128, &xs->fpu_ymmh0, 8 * sizeof(_STRUCT_XMM_REG)); + __nochk_bcopy(iavx->x_YMM_Hi128, &xs->fpu_ymmh0, 8 * sizeof(_STRUCT_XMM_REG)); } else if (f == x86_AVX_STATE64) { - bcopy_nochk(iavx->x_YMM_Hi128, &xs->fpu_ymmh0, 16 * sizeof(_STRUCT_XMM_REG)); + __nochk_bcopy(iavx->x_YMM_Hi128, &xs->fpu_ymmh0, 16 * sizeof(_STRUCT_XMM_REG)); } break; } @@ -1081,23 +1066,23 @@ fpu_get_fxstate( } xs = { .ts = tstate }; switch (f) { case x86_AVX512_STATE32: - bcopy_nochk(iavx->x_Opmask, &xs.s32->fpu_k0, 8 * sizeof(_STRUCT_OPMASK_REG)); - bcopy_nochk(iavx->x_ZMM_Hi256, &xs.s32->fpu_zmmh0, 8 * sizeof(_STRUCT_YMM_REG)); - bcopy_nochk(iavx->x_YMM_Hi128, &xs.s32->fpu_ymmh0, 8 * sizeof(_STRUCT_XMM_REG)); + __nochk_bcopy(iavx->x_Opmask, &xs.s32->fpu_k0, 8 * sizeof(_STRUCT_OPMASK_REG)); + __nochk_bcopy(iavx->x_ZMM_Hi256, &xs.s32->fpu_zmmh0, 8 * sizeof(_STRUCT_YMM_REG)); + __nochk_bcopy(iavx->x_YMM_Hi128, &xs.s32->fpu_ymmh0, 8 * sizeof(_STRUCT_XMM_REG)); DBG_AVX512_STATE(iavx); break; case x86_AVX_STATE32: - bcopy_nochk(iavx->x_YMM_Hi128, &xs.s32->fpu_ymmh0, 8 * sizeof(_STRUCT_XMM_REG)); + __nochk_bcopy(iavx->x_YMM_Hi128, &xs.s32->fpu_ymmh0, 8 * sizeof(_STRUCT_XMM_REG)); break; case x86_AVX512_STATE64: - bcopy_nochk(iavx->x_Opmask, &xs.s64->fpu_k0, 8 * sizeof(_STRUCT_OPMASK_REG)); - bcopy_nochk(iavx->x_Hi16_ZMM, &xs.s64->fpu_zmm16, 16 * sizeof(_STRUCT_ZMM_REG)); - bcopy_nochk(iavx->x_ZMM_Hi256, &xs.s64->fpu_zmmh0, 16 * sizeof(_STRUCT_YMM_REG)); - bcopy_nochk(iavx->x_YMM_Hi128, &xs.s64->fpu_ymmh0, 16 * sizeof(_STRUCT_XMM_REG)); + __nochk_bcopy(iavx->x_Opmask, &xs.s64->fpu_k0, 8 * sizeof(_STRUCT_OPMASK_REG)); + __nochk_bcopy(iavx->x_Hi16_ZMM, &xs.s64->fpu_zmm16, 16 * sizeof(_STRUCT_ZMM_REG)); + __nochk_bcopy(iavx->x_ZMM_Hi256, &xs.s64->fpu_zmmh0, 16 * sizeof(_STRUCT_YMM_REG)); + __nochk_bcopy(iavx->x_YMM_Hi128, &xs.s64->fpu_ymmh0, 16 * sizeof(_STRUCT_XMM_REG)); DBG_AVX512_STATE(iavx); break; case x86_AVX_STATE64: - bcopy_nochk(iavx->x_YMM_Hi128, &xs.s64->fpu_ymmh0, 16 * sizeof(_STRUCT_XMM_REG)); + __nochk_bcopy(iavx->x_YMM_Hi128, &xs.s64->fpu_ymmh0, 16 * sizeof(_STRUCT_XMM_REG)); break; } break; @@ -1163,7 +1148,7 @@ fpu_dup_fxstate( if (ifps->fp_valid) { child->machine.ifps = new_ifps; child->machine.xstate = xstate; - bcopy((char *)(ppcb->ifps), + __nochk_bcopy((char *)(ppcb->ifps), (char *)(child->machine.ifps), fp_state_size[xstate]); @@ -1249,7 +1234,7 @@ fpnoextflt(void) if (pcb->ifps == 0 && !get_interrupt_level()) { ifps = fp_state_alloc(xstate); - bcopy((char *)&initial_fp_state, (char *)ifps, + __nochk_bcopy((char *)&initial_fp_state, (char *)ifps, fp_state_size[xstate]); if (!thread_is_64bit_addr(thr_act)) { ifps->fp_save_layout = fpu_YMM_capable ? XSAVE32 : FXSAVE32; @@ -1555,7 +1540,7 @@ fpu_savearea_promote_avx512(thread_t thread) /* Allocate an AVX512 savearea and copy AVX state into it */ if (pcb->xstate != AVX512) { - bcopy(ifps, ifps512, fp_state_size[AVX]); + __nochk_bcopy(ifps, ifps512, fp_state_size[AVX]); pcb->ifps = ifps512; pcb->xstate = AVX512; ifps512 = NULL; diff --git a/osfmk/i386/genassym.c b/osfmk/i386/genassym.c index f2e340ab7..b691ae36b 100644 --- a/osfmk/i386/genassym.c +++ b/osfmk/i386/genassym.c @@ -81,7 +81,6 @@ #include #include #include -#include #include #include #include @@ -104,7 +103,7 @@ __asm("DEFINITION__define__" SYM ":\t .ascii \"%0\"" : : "n" ((u_int)(VAL))) #define DECLAREULL(SYM, VAL) \ - __asm("DEFINITION__define__" SYM ":\t .ascii \"%0\"" : : "n" ((unsigned long long)(VAL))) + __asm("DEFINITION__define__" SYM ":\t .ascii \"%0\"" : : "i" ((unsigned long long)(VAL))) int main( int argc, @@ -153,7 +152,6 @@ main( DECLARE("TH_CONTINUATION", offsetof(struct thread, continuation)); DECLARE("TH_KERNEL_STACK", offsetof(struct thread, kernel_stack)); DECLARE("TH_MUTEX_COUNT", offsetof(struct thread, mutex_count)); - DECLARE("TH_WAS_PROMOTED_ON_WAKEUP", offsetof(struct thread, was_promoted_on_wakeup)); DECLARE("TH_IOTIER_OVERRIDE", offsetof(struct thread, iotier_override)); DECLARE("TH_SYSCALLS_MACH", offsetof(struct thread, syscalls_mach)); diff --git a/osfmk/i386/hibernate_i386.c b/osfmk/i386/hibernate_i386.c index d88bb1897..2c7a177f1 100644 --- a/osfmk/i386/hibernate_i386.c +++ b/osfmk/i386/hibernate_i386.c @@ -166,6 +166,7 @@ hibernate_page_list_allocate(boolean_t log) } if (num_banks >= MAX_BANKS) { + HIBLOG("%s error, num_banks exceed MAX_BANKS(0x%x)\n", __FUNCTION__, MAX_BANKS); return NULL; } diff --git a/osfmk/i386/i386_init.c b/osfmk/i386/i386_init.c index 1e8810b17..f1b9f1e12 100644 --- a/osfmk/i386/i386_init.c +++ b/osfmk/i386/i386_init.c @@ -70,7 +70,6 @@ #include #include #include -#include #include #include #include @@ -142,7 +141,12 @@ pml4_entry_t *IdlePML4; int kernPhysPML4Index; int kernPhysPML4EntryCount; -int allow_64bit_proc_LDT_ops; +/* + * These are 4K mapping page table pages from KPTphys[] that we wound + * up not using. They get ml_static_mfree()'d once the VM is initialized. + */ +ppnum_t released_PT_ppn = 0; +uint32_t released_PT_cnt = 0; char *physfree; void idt64_remap(void); @@ -397,6 +401,109 @@ Idle_PTs_init(void) set_cr3_raw((uintptr_t)ID_MAP_VTOP(IdlePML4)); } +/* + * Release any still unused, preallocated boot kernel page tables. + * start..end is the VA range currently unused. + */ +void +Idle_PTs_release(vm_offset_t start, vm_offset_t end) +{ + uint32_t i; + uint32_t index_start; + uint32_t index_limit; + ppnum_t pn_first; + ppnum_t pn; + uint32_t cnt; + + /* + * Align start to the next large page boundary + */ + start = ((start + I386_LPGMASK) & ~I386_LPGMASK); + + /* + * convert start into an index in KPTphys[] + */ + index_start = (uint32_t)((start - KERNEL_BASE) >> PAGE_SHIFT); + + /* + * Find the ending index in KPTphys[] + */ + index_limit = (uint32_t)((end - KERNEL_BASE) >> PAGE_SHIFT); + + if (index_limit > NKPT * PTE_PER_PAGE) { + index_limit = NKPT * PTE_PER_PAGE; + } + + /* + * Make sure all the 4K page tables are empty. + * If not, panic a development/debug kernel. + * On a production kernel, since this would stop us from booting, + * just abort the operation. + */ + for (i = index_start; i < index_limit; ++i) { + assert(KPTphys[i] == 0); + if (KPTphys[i] != 0) { + return; + } + } + + /* + * Now figure out the indices into the 2nd level page tables, IdlePTD[]. + */ + index_start >>= PTPGSHIFT; + index_limit >>= PTPGSHIFT; + if (index_limit > NPGPTD * PTE_PER_PAGE) { + index_limit = NPGPTD * PTE_PER_PAGE; + } + + if (index_limit <= index_start) { + return; + } + + + /* + * Now check the pages referenced from Level 2 tables. + * They should be contiguous, assert fail if not on development/debug. + * In production, just fail the removal to allow the system to boot. + */ + pn_first = 0; + cnt = 0; + for (i = index_start; i < index_limit; ++i) { + assert(IdlePTD[i] != 0); + if (IdlePTD[i] == 0) { + return; + } + + pn = (ppnum_t)((PG_FRAME & IdlePTD[i]) >> PTSHIFT); + if (cnt == 0) { + pn_first = pn; + } else { + assert(pn == pn_first + cnt); + if (pn != pn_first + cnt) { + return; + } + } + ++cnt; + } + + /* + * Good to go, clear the level 2 entries and invalidate the TLB + */ + for (i = index_start; i < index_limit; ++i) { + IdlePTD[i] = 0; + } + set_cr3_raw(get_cr3_raw()); + + /* + * Remember these PFNs to be released later in pmap_lowmem_finalize() + */ + released_PT_ppn = pn_first; + released_PT_cnt = cnt; +#if DEVELOPMENT || DEBUG + printf("Idle_PTs_release %d pages from PFN 0x%x\n", released_PT_cnt, released_PT_ppn); +#endif +} + extern void vstart_trap_handler; #define BOOT_TRAP_VECTOR(t) \ @@ -485,9 +592,8 @@ vstart(vm_offset_t boot_args_start) lphysfree = kernelBootArgs->kaddr + kernelBootArgs->ksize; physfree = (void *)(uintptr_t)((lphysfree + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1)); -#if DEVELOPMENT || DEBUG pal_serial_init(); -#endif + DBG("revision 0x%x\n", kernelBootArgs->Revision); DBG("version 0x%x\n", kernelBootArgs->Version); DBG("command line %s\n", kernelBootArgs->CommandLine); @@ -595,6 +701,14 @@ i386_init(void) #endif master_cpu = 0; + + lck_mod_init(); + + /* + * Initialize the timer callout world + */ + timer_call_init(); + cpu_init(); postcode(CPU_INIT_D); @@ -613,11 +727,6 @@ i386_init(void) dgWork.dgFlags = 0; } - if (!PE_parse_boot_argn("ldt64", &allow_64bit_proc_LDT_ops, - sizeof(allow_64bit_proc_LDT_ops))) { - allow_64bit_proc_LDT_ops = 0; - } - serialmode = 0; if (PE_parse_boot_argn("serial", &serialmode, sizeof(serialmode))) { /* We want a serial keyboard and/or console */ @@ -696,6 +805,11 @@ i386_init(void) kernel_debug_string_early("power_management_init"); power_management_init(); + +#if MONOTONIC + mt_cpu_up(cpu_datap(0)); +#endif /* MONOTONIC */ + processor_bootstrap(); thread_bootstrap(); @@ -705,7 +819,7 @@ i386_init(void) pstate_trace(); } -static void +static void __dead2 do_init_slave(boolean_t fast_restart) { void *init_param = FULL_SLAVE_INIT; @@ -761,6 +875,12 @@ do_init_slave(boolean_t fast_restart) cpu_thread_init(); /* not strictly necessary */ cpu_init(); /* Sets cpu_running which starter cpu waits for */ + + +#if MONOTONIC + mt_cpu_up(current_cpu_datap()); +#endif /* MONOTONIC */ + slave_main(init_param); panic("do_init_slave() returned from slave_main()"); @@ -861,7 +981,8 @@ doublemap_init(uint8_t randL3) */ dblmap_dist = dblmap_base - hdescb; - idt64_hndl_table0[1] = DBLMAP(idt64_hndl_table0[1]); + idt64_hndl_table0[1] = DBLMAP(idt64_hndl_table0[1]); /* 64-bit exit trampoline */ + idt64_hndl_table0[3] = DBLMAP(idt64_hndl_table0[3]); /* 32-bit exit trampoline */ idt64_hndl_table0[6] = (uint64_t)(uintptr_t)&kernel_stack_mask; extern cpu_data_t cpshadows[], scdatas[]; diff --git a/osfmk/i386/i386_vm_init.c b/osfmk/i386/i386_vm_init.c index 63a1b46ef..a4edd4259 100644 --- a/osfmk/i386/i386_vm_init.c +++ b/osfmk/i386/i386_vm_init.c @@ -77,6 +77,7 @@ #include #include #include +#include #include #include @@ -119,9 +120,19 @@ vm_offset_t vm_kernel_builtinkmod_text_end; #define MAXLORESERVE (32 * 1024 * 1024) ppnum_t max_ppnum = 0; -ppnum_t lowest_lo = 0; -ppnum_t lowest_hi = 0; -ppnum_t highest_hi = 0; + +/* + * pmap_high_used* are the highest range of physical memory used for kernel + * internals (page tables, vm_pages) via pmap_steal_memory() that don't + * need to be encrypted in hibernation images. There can be one gap in + * the middle of this due to fragmentation when using a mix of small + * and large pages. In that case, the fragment lives between the high + * and middle ranges. + */ +ppnum_t pmap_high_used_top = 0; +ppnum_t pmap_high_used_bottom = 0; +ppnum_t pmap_middle_used_top = 0; +ppnum_t pmap_middle_used_bottom = 0; enum {PMAP_MAX_RESERVED_RANGES = 32}; uint32_t pmap_reserved_pages_allocated = 0; @@ -168,6 +179,12 @@ uint64_t firmware_MMIO_bytes; */ extern void *last_kernel_symbol; +#define LG_PPNUM_PAGES (I386_LPGBYTES >> PAGE_SHIFT) +#define LG_PPNUM_MASK (I386_LPGMASK >> PAGE_SHIFT) + +/* set so no region large page fragment pages exist */ +#define RESET_FRAG(r) (((r)->alloc_frag_up = 1), ((r)->alloc_frag_down = 0)) + boolean_t memmap = FALSE; #if DEBUG || DEVELOPMENT static void @@ -181,11 +198,14 @@ kprint_memmap(vm_offset_t maddr, unsigned int msize, unsigned int mcount) addr64_t efi_start, efi_end; for (j = 0; j < pmap_memory_region_count; j++, p++) { - kprintf("pmap region %d type %d base 0x%llx alloc_up 0x%llx alloc_down 0x%llx top 0x%llx\n", + kprintf("pmap region %d type %d base 0x%llx alloc_up 0x%llx alloc_down 0x%llx" + " alloc_frag_up 0x%llx alloc_frag_down 0x%llx top 0x%llx\n", j, p->type, (addr64_t) p->base << I386_PGSHIFT, (addr64_t) p->alloc_up << I386_PGSHIFT, (addr64_t) p->alloc_down << I386_PGSHIFT, + (addr64_t) p->alloc_frag_up << I386_PGSHIFT, + (addr64_t) p->alloc_frag_down << I386_PGSHIFT, (addr64_t) p->end << I386_PGSHIFT); region_start = (addr64_t) p->base << I386_PGSHIFT; region_end = ((addr64_t) p->end << I386_PGSHIFT) - 1; @@ -314,7 +334,7 @@ i386_vm_init(uint64_t maxmem, segDATA = getsegbynamefromheader(&_mh_execute_header, "__DATA"); segCONST = getsegbynamefromheader(&_mh_execute_header, - "__CONST"); + "__DATA_CONST"); cursectTEXT = lastsectTEXT = firstsect(segTEXT); /* Discover the last TEXT section within the TEXT segment */ while ((cursectTEXT = nextsect(segTEXT, cursectTEXT)) != NULL) { @@ -554,6 +574,7 @@ i386_vm_init(uint64_t maxmem, (top < vm_kernel_base_page)) { pmptr->alloc_up = pmptr->base; pmptr->alloc_down = pmptr->end; + RESET_FRAG(pmptr); pmap_reserved_range_indices[pmap_last_reserved_range_index++] = pmap_memory_region_count; } else { /* @@ -561,6 +582,7 @@ i386_vm_init(uint64_t maxmem, */ pmptr->alloc_up = top + 1; pmptr->alloc_down = top; + RESET_FRAG(pmptr); } pmptr->type = pmap_type; pmptr->attribute = mptr->Attribute; @@ -574,6 +596,7 @@ i386_vm_init(uint64_t maxmem, pmptr->end = (fap - 1); pmptr->alloc_up = pmptr->end + 1; pmptr->alloc_down = pmptr->end; + RESET_FRAG(pmptr); pmptr->type = pmap_type; pmptr->attribute = mptr->Attribute; /* @@ -587,6 +610,7 @@ i386_vm_init(uint64_t maxmem, pmptr->type = pmap_type; pmptr->attribute = mptr->Attribute; pmptr->alloc_down = pmptr->end = top; + RESET_FRAG(pmptr); if (mptr->Attribute & EFI_MEMORY_KERN_RESERVED) { pmap_reserved_range_indices[pmap_last_reserved_range_index++] = pmap_memory_region_count; @@ -599,6 +623,7 @@ i386_vm_init(uint64_t maxmem, pmptr->type = pmap_type; pmptr->attribute = mptr->Attribute; pmptr->alloc_down = pmptr->end = top; + RESET_FRAG(pmptr); if (mptr->Attribute & EFI_MEMORY_KERN_RESERVED) { pmap_reserved_range_indices[pmap_last_reserved_range_index++] = pmap_memory_region_count; } @@ -621,6 +646,7 @@ i386_vm_init(uint64_t maxmem, (pmptr->base == (prev_pmptr->end + 1))) { prev_pmptr->end = pmptr->end; prev_pmptr->alloc_down = pmptr->alloc_down; + RESET_FRAG(pmptr); } else { pmap_memory_region_count++; prev_pmptr = pmptr; @@ -692,6 +718,7 @@ i386_vm_init(uint64_t maxmem, if (pages_to_use == 0) { pmap_memory_regions[cur_region].end = cur_end; pmap_memory_regions[cur_region].alloc_down = cur_end; + RESET_FRAG(&pmap_memory_regions[cur_region]); } cur_region++; @@ -772,113 +799,220 @@ pmap_free_pages(void) return (unsigned int)avail_remaining; } - boolean_t pmap_next_page_reserved(ppnum_t *); /* * Pick a page from a "kernel private" reserved range; works around - * errata on some hardware. + * errata on some hardware. EFI marks pages which can't be used for + * certain kinds of I/O-ish activities as reserved. We reserve them for + * kernel internal usage and prevent them from ever going on regular + * free list. */ boolean_t -pmap_next_page_reserved(ppnum_t *pn) +pmap_next_page_reserved( + ppnum_t *pn) { + uint32_t n; + pmap_memory_region_t *region; + uint32_t reserved_index; + if (pmap_reserved_ranges) { - uint32_t n; - pmap_memory_region_t *region; for (n = 0; n < pmap_last_reserved_range_index; n++) { - uint32_t reserved_index = pmap_reserved_range_indices[n]; + reserved_index = pmap_reserved_range_indices[n]; region = &pmap_memory_regions[reserved_index]; if (region->alloc_up <= region->alloc_down) { *pn = region->alloc_up++; - avail_remaining--; - - if (*pn > max_ppnum) { - max_ppnum = *pn; - } + } else if (region->alloc_frag_up <= region->alloc_frag_down) { + *pn = region->alloc_frag_up++; + } else { + continue; + } + avail_remaining--; - if (lowest_lo == 0 || *pn < lowest_lo) { - lowest_lo = *pn; - } + if (*pn > max_ppnum) { + max_ppnum = *pn; + } - pmap_reserved_pages_allocated++; + pmap_reserved_pages_allocated++; #if DEBUG - if (region->alloc_up > region->alloc_down) { - kprintf("Exhausted reserved range index: %u, base: 0x%x end: 0x%x, type: 0x%x, attribute: 0x%llx\n", reserved_index, region->base, region->end, region->type, region->attribute); - } -#endif - return TRUE; + if (region->alloc_up > region->alloc_down) { + kprintf("Exhausted reserved range index: %u, base: 0x%x end: 0x%x, type: 0x%x, attribute: 0x%llx\n", reserved_index, region->base, region->end, region->type, region->attribute); } +#endif + return TRUE; } } return FALSE; } +/* + * Return the highest large page available. Fails once there are no more large pages. + */ +kern_return_t +pmap_next_page_large( + ppnum_t *pn) +{ + int r; + pmap_memory_region_t *region; + ppnum_t frag_start; + ppnum_t lgpg; + + if (avail_remaining < LG_PPNUM_PAGES) { + return KERN_FAILURE; + } + + for (r = pmap_memory_region_count - 1; r >= 0; r--) { + region = &pmap_memory_regions[r]; + + /* + * First check if there is enough memory. + */ + if (region->alloc_down < region->alloc_up || + (region->alloc_down - region->alloc_up + 1) < LG_PPNUM_PAGES) { + continue; + } + + /* + * Find the starting large page, creating a fragment if needed. + */ + if ((region->alloc_down & LG_PPNUM_MASK) == LG_PPNUM_MASK) { + lgpg = (region->alloc_down & ~LG_PPNUM_MASK); + } else { + /* Can only have 1 fragment per region at a time */ + if (region->alloc_frag_up <= region->alloc_frag_down) { + continue; + } + + /* Check for enough room below any fragment. */ + frag_start = (region->alloc_down & ~LG_PPNUM_MASK); + if (frag_start < region->alloc_up || + frag_start - region->alloc_up < LG_PPNUM_PAGES) { + continue; + } + + lgpg = frag_start - LG_PPNUM_PAGES; + region->alloc_frag_up = frag_start; + region->alloc_frag_down = region->alloc_down; + } + + *pn = lgpg; + region->alloc_down = lgpg - 1; + + + avail_remaining -= LG_PPNUM_PAGES; + if (*pn + LG_PPNUM_MASK > max_ppnum) { + max_ppnum = *pn + LG_PPNUM_MASK; + } + + return KERN_SUCCESS; + } + return KERN_FAILURE; +} boolean_t pmap_next_page_hi( - ppnum_t *pn) + ppnum_t *pn, + boolean_t might_free) { pmap_memory_region_t *region; - int n; + int n; - if (pmap_next_page_reserved(pn)) { + if (!might_free && pmap_next_page_reserved(pn)) { return TRUE; } if (avail_remaining) { for (n = pmap_memory_region_count - 1; n >= 0; n--) { region = &pmap_memory_regions[n]; - - if (region->alloc_down >= region->alloc_up) { + if (region->alloc_frag_up <= region->alloc_frag_down) { + *pn = region->alloc_frag_down--; + } else if (region->alloc_down >= region->alloc_up) { *pn = region->alloc_down--; - avail_remaining--; - - if (*pn > max_ppnum) { - max_ppnum = *pn; - } - - if (lowest_lo == 0 || *pn < lowest_lo) { - lowest_lo = *pn; - } - - if (lowest_hi == 0 || *pn < lowest_hi) { - lowest_hi = *pn; - } + } else { + continue; + } - if (*pn > highest_hi) { - highest_hi = *pn; - } + avail_remaining--; - return TRUE; + if (*pn > max_ppnum) { + max_ppnum = *pn; } + + return TRUE; } } return FALSE; } +/* + * Record which high pages have been allocated so far, + * so that pmap_init() can mark them PMAP_NOENCRYPT, which + * makes hibernation faster. + * + * Because of the code in pmap_next_page_large(), we could + * theoretically have fragments in several regions. + * In practice that just doesn't happen. The last pmap region + * is normally the largest and will satisfy all pmap_next_hi/large() + * allocations. Since this information is used as an optimization + * and it's ok to be conservative, we'll just record the information + * for the final region. + */ +void +pmap_hi_pages_done(void) +{ + pmap_memory_region_t *r; + + r = &pmap_memory_regions[pmap_memory_region_count - 1]; + pmap_high_used_top = r->end; + if (r->alloc_frag_up <= r->alloc_frag_down) { + pmap_high_used_bottom = r->alloc_frag_down + 1; + pmap_middle_used_top = r->alloc_frag_up - 1; + if (r->alloc_up <= r->alloc_down) { + pmap_middle_used_bottom = r->alloc_down + 1; + } else { + pmap_high_used_bottom = r->base; + } + } else { + if (r->alloc_up <= r->alloc_down) { + pmap_high_used_bottom = r->alloc_down + 1; + } else { + pmap_high_used_bottom = r->base; + } + } +#if DEBUG || DEVELOPMENT + kprintf("pmap_high_used_top 0x%x\n", pmap_high_used_top); + kprintf("pmap_high_used_bottom 0x%x\n", pmap_high_used_bottom); + kprintf("pmap_middle_used_top 0x%x\n", pmap_middle_used_top); + kprintf("pmap_middle_used_bottom 0x%x\n", pmap_middle_used_bottom); +#endif +} +/* + * Return the next available page from lowest memory for general use. + */ boolean_t pmap_next_page( - ppnum_t *pn) + ppnum_t *pn) { + pmap_memory_region_t *region; + if (avail_remaining) { while (pmap_memory_region_current < pmap_memory_region_count) { - if (pmap_memory_regions[pmap_memory_region_current].alloc_up > - pmap_memory_regions[pmap_memory_region_current].alloc_down) { + region = &pmap_memory_regions[pmap_memory_region_current]; + if (region->alloc_up <= region->alloc_down) { + *pn = region->alloc_up++; + } else if (region->alloc_frag_up <= region->alloc_frag_down) { + *pn = region->alloc_frag_up++; + } else { pmap_memory_region_current++; continue; } - *pn = pmap_memory_regions[pmap_memory_region_current].alloc_up++; avail_remaining--; if (*pn > max_ppnum) { max_ppnum = *pn; } - if (lowest_lo == 0 || *pn < lowest_lo) { - lowest_lo = *pn; - } - return TRUE; } } diff --git a/osfmk/i386/lapic.h b/osfmk/i386/lapic.h index 9a046a2f5..9f06f52bc 100644 --- a/osfmk/i386/lapic.h +++ b/osfmk/i386/lapic.h @@ -183,11 +183,11 @@ typedef struct { extern lapic_ops_table_t *lapic_ops; #define LAPIC_INIT() lapic_ops->init(); -#define LAPIC_WRITE(reg, val) lapic_ops->write(reg, val) +#define LAPIC_WRITE(reg, val) lapic_ops->write(reg, val) #define LAPIC_READ(reg) lapic_ops->read(reg) -#define LAPIC_READ_OFFSET(reg, off) LAPIC_READ((reg)+(off)) +#define LAPIC_READ_OFFSET(reg, off) LAPIC_READ((lapic_register_t)((reg)+(off))) #define LAPIC_READ_ICR() lapic_ops->read_icr() -#define LAPIC_WRITE_ICR(dst, cmd) lapic_ops->write_icr(dst, cmd) +#define LAPIC_WRITE_ICR(dst, cmd) lapic_ops->write_icr(dst, cmd) typedef enum { periodic, diff --git a/osfmk/i386/locks.h b/osfmk/i386/locks.h index 21e74d712..e553bc4a0 100644 --- a/osfmk/i386/locks.h +++ b/osfmk/i386/locks.h @@ -81,10 +81,10 @@ typedef struct _lck_mtx_ { struct { volatile uint32_t lck_mtx_waiters:16, - lck_mtx_pri:8, + lck_mtx_pri:8, // unused lck_mtx_ilocked:1, lck_mtx_mlocked:1, - lck_mtx_promoted:1, + lck_mtx_promoted:1, // unused lck_mtx_spin:1, lck_mtx_is_ext:1, lck_mtx_pad3:3; @@ -107,7 +107,6 @@ typedef struct _lck_mtx_ { #define LCK_MTX_PRIORITY_MSK 0x00ff0000 #define LCK_MTX_ILOCKED_MSK 0x01000000 #define LCK_MTX_MLOCKED_MSK 0x02000000 -#define LCK_MTX_PROMOTED_MSK 0x04000000 #define LCK_MTX_SPIN_MSK 0x08000000 /* This pattern must subsume the interlocked, mlocked and spin bits */ @@ -124,7 +123,8 @@ typedef enum lck_mtx_spinwait_ret_type { } lck_mtx_spinwait_ret_type_t; extern lck_mtx_spinwait_ret_type_t lck_mtx_lock_spinwait_x86(lck_mtx_t *mutex); -extern void lck_mtx_lock_wait_x86(lck_mtx_t *mutex); +struct turnstile; +extern void lck_mtx_lock_wait_x86(lck_mtx_t *mutex, struct turnstile **ts); extern void lck_mtx_lock_acquire_x86(lck_mtx_t *mutex); extern void lck_mtx_lock_slow(lck_mtx_t *lock); diff --git a/osfmk/i386/locks_i386.c b/osfmk/i386/locks_i386.c index 5f693ff51..c5b0d3037 100644 --- a/osfmk/i386/locks_i386.c +++ b/osfmk/i386/locks_i386.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2012 Apple Inc. All rights reserved. + * Copyright (c) 2000-2019 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -61,7 +61,6 @@ * Locking primitives implementation */ -#define ATOMIC_PRIVATE 1 #define LOCK_PRIVATE 1 #include @@ -75,7 +74,6 @@ #include #include #include -#include #include #include @@ -87,32 +85,32 @@ #include #include -#if CONFIG_DTRACE -#define DTRACE_RW_SHARED 0x0 //reader -#define DTRACE_RW_EXCL 0x1 //writer -#define DTRACE_NO_FLAG 0x0 //not applicable +#if CONFIG_DTRACE +#define DTRACE_RW_SHARED 0x0 //reader +#define DTRACE_RW_EXCL 0x1 //writer +#define DTRACE_NO_FLAG 0x0 //not applicable #endif /* CONFIG_DTRACE */ -#define LCK_RW_LCK_EXCLUSIVE_CODE 0x100 -#define LCK_RW_LCK_EXCLUSIVE1_CODE 0x101 -#define LCK_RW_LCK_SHARED_CODE 0x102 -#define LCK_RW_LCK_SH_TO_EX_CODE 0x103 -#define LCK_RW_LCK_SH_TO_EX1_CODE 0x104 -#define LCK_RW_LCK_EX_TO_SH_CODE 0x105 +#define LCK_RW_LCK_EXCLUSIVE_CODE 0x100 +#define LCK_RW_LCK_EXCLUSIVE1_CODE 0x101 +#define LCK_RW_LCK_SHARED_CODE 0x102 +#define LCK_RW_LCK_SH_TO_EX_CODE 0x103 +#define LCK_RW_LCK_SH_TO_EX1_CODE 0x104 +#define LCK_RW_LCK_EX_TO_SH_CODE 0x105 -#define LCK_RW_LCK_EX_WRITER_SPIN_CODE 0x106 -#define LCK_RW_LCK_EX_WRITER_WAIT_CODE 0x107 -#define LCK_RW_LCK_EX_READER_SPIN_CODE 0x108 -#define LCK_RW_LCK_EX_READER_WAIT_CODE 0x109 -#define LCK_RW_LCK_SHARED_SPIN_CODE 0x110 -#define LCK_RW_LCK_SHARED_WAIT_CODE 0x111 -#define LCK_RW_LCK_SH_TO_EX_SPIN_CODE 0x112 -#define LCK_RW_LCK_SH_TO_EX_WAIT_CODE 0x113 +#define LCK_RW_LCK_EX_WRITER_SPIN_CODE 0x106 +#define LCK_RW_LCK_EX_WRITER_WAIT_CODE 0x107 +#define LCK_RW_LCK_EX_READER_SPIN_CODE 0x108 +#define LCK_RW_LCK_EX_READER_WAIT_CODE 0x109 +#define LCK_RW_LCK_SHARED_SPIN_CODE 0x110 +#define LCK_RW_LCK_SHARED_WAIT_CODE 0x111 +#define LCK_RW_LCK_SH_TO_EX_SPIN_CODE 0x112 +#define LCK_RW_LCK_SH_TO_EX_WAIT_CODE 0x113 -#define ANY_LOCK_DEBUG (USLOCK_DEBUG || LOCK_DEBUG || MUTEX_DEBUG) +#define ANY_LOCK_DEBUG (USLOCK_DEBUG || LOCK_DEBUG || MUTEX_DEBUG) -unsigned int LcksOpts = 0; +unsigned int LcksOpts=0; #if DEVELOPMENT || DEBUG unsigned int LckDisablePreemptCheck = 0; @@ -120,15 +118,15 @@ unsigned int LckDisablePreemptCheck = 0; /* Forwards */ -#if USLOCK_DEBUG +#if USLOCK_DEBUG /* * Perform simple lock checks. */ -int uslock_check = 1; -int max_lock_loops = 100000000; -decl_simple_lock_data(extern, printf_lock) -decl_simple_lock_data(extern, panic_lock) -#endif /* USLOCK_DEBUG */ +int uslock_check = 1; +int max_lock_loops = 100000000; +decl_simple_lock_data(extern , printf_lock); +decl_simple_lock_data(extern , panic_lock); +#endif /* USLOCK_DEBUG */ extern unsigned int not_in_kdp; @@ -137,23 +135,23 @@ extern unsigned int not_in_kdp; * of the various lock routines. However, this information * is only used for debugging and statistics. */ -typedef void *pc_t; -#define INVALID_PC ((void *) VM_MAX_KERNEL_ADDRESS) -#define INVALID_THREAD ((void *) VM_MAX_KERNEL_ADDRESS) -#if ANY_LOCK_DEBUG -#define OBTAIN_PC(pc) ((pc) = GET_RETURN_PC()) -#define DECL_PC(pc) pc_t pc; -#else /* ANY_LOCK_DEBUG */ +typedef void *pc_t; +#define INVALID_PC ((void *) VM_MAX_KERNEL_ADDRESS) +#define INVALID_THREAD ((void *) VM_MAX_KERNEL_ADDRESS) +#if ANY_LOCK_DEBUG +#define OBTAIN_PC(pc) ((pc) = GET_RETURN_PC()) +#define DECL_PC(pc) pc_t pc; +#else /* ANY_LOCK_DEBUG */ #define DECL_PC(pc) -#ifdef lint +#ifdef lint /* * Eliminate lint complaints about unused local pc variables. */ -#define OBTAIN_PC(pc) ++pc -#else /* lint */ -#define OBTAIN_PC(pc) -#endif /* lint */ -#endif /* USLOCK_DEBUG */ +#define OBTAIN_PC(pc) ++pc +#else /* lint */ +#define OBTAIN_PC(pc) +#endif /* lint */ +#endif /* USLOCK_DEBUG */ /* * atomic exchange API is a low level abstraction of the operations @@ -168,10 +166,10 @@ typedef void *pc_t; static uint32_t atomic_exchange_begin32(uint32_t *target, uint32_t *previous, enum memory_order ord) { - uint32_t val; + uint32_t val; - (void)ord; // Memory order not used - val = __c11_atomic_load((_Atomic uint32_t *)target, memory_order_relaxed); + (void)ord; // Memory order not used + val = os_atomic_load(target, relaxed); *previous = val; return val; } @@ -183,48 +181,50 @@ atomic_exchange_complete32(uint32_t *target, uint32_t previous, uint32_t newval, } static void -atomic_exchange_abort(void) -{ -} +atomic_exchange_abort(void) { } static boolean_t atomic_test_and_set32(uint32_t *target, uint32_t test_mask, uint32_t set_mask, enum memory_order ord, boolean_t wait) { - uint32_t value, prev; + uint32_t value, prev; - for (;;) { + for ( ; ; ) { value = atomic_exchange_begin32(target, &prev, ord); if (value & test_mask) { - if (wait) { + if (wait) cpu_pause(); - } else { + else atomic_exchange_abort(); - } return FALSE; } value |= set_mask; - if (atomic_exchange_complete32(target, prev, value, ord)) { + if (atomic_exchange_complete32(target, prev, value, ord)) return TRUE; - } } } +inline boolean_t +hw_atomic_test_and_set32(uint32_t *target, uint32_t test_mask, uint32_t set_mask, enum memory_order ord, boolean_t wait) +{ + return atomic_test_and_set32(target, test_mask, set_mask, ord, wait); +} + /* * Portable lock package implementation of usimple_locks. */ -#if USLOCK_DEBUG -#define USLDBG(stmt) stmt -void usld_lock_init(usimple_lock_t, unsigned short); -void usld_lock_pre(usimple_lock_t, pc_t); -void usld_lock_post(usimple_lock_t, pc_t); -void usld_unlock(usimple_lock_t, pc_t); -void usld_lock_try_pre(usimple_lock_t, pc_t); -void usld_lock_try_post(usimple_lock_t, pc_t); -int usld_lock_common_checks(usimple_lock_t, char *); -#else /* USLOCK_DEBUG */ -#define USLDBG(stmt) -#endif /* USLOCK_DEBUG */ +#if USLOCK_DEBUG +#define USLDBG(stmt) stmt +void usld_lock_init(usimple_lock_t, unsigned short); +void usld_lock_pre(usimple_lock_t, pc_t); +void usld_lock_post(usimple_lock_t, pc_t); +void usld_unlock(usimple_lock_t, pc_t); +void usld_lock_try_pre(usimple_lock_t, pc_t); +void usld_lock_try_post(usimple_lock_t, pc_t); +int usld_lock_common_checks(usimple_lock_t, char *); +#else /* USLOCK_DEBUG */ +#define USLDBG(stmt) +#endif /* USLOCK_DEBUG */ /* * Forward definitions @@ -240,7 +240,7 @@ void lck_rw_clear_promotions_x86(thread_t thread); static boolean_t lck_rw_held_read_or_upgrade(lck_rw_t *lock); static boolean_t lck_rw_grab_want(lck_rw_t *lock); static boolean_t lck_rw_grab_shared(lck_rw_t *lock); -static void lck_mtx_unlock_wakeup_tail(lck_mtx_t *mutex, int prior_lock_state, boolean_t indirect); +static void lck_mtx_unlock_wakeup_tail(lck_mtx_t *mutex, uint32_t state, boolean_t indirect); static void lck_mtx_interlock_lock(lck_mtx_t *mutex, uint32_t *new_state); static void lck_mtx_interlock_lock_clear_flags(lck_mtx_t *mutex, uint32_t and_flags, uint32_t *new_state); static int lck_mtx_interlock_try_lock(lck_mtx_t *mutex, uint32_t *new_state); @@ -254,16 +254,15 @@ static boolean_t lck_mtx_try_lock_wait_interlock_to_clear(lck_mtx_t *lock, uint3 */ lck_spin_t * lck_spin_alloc_init( - lck_grp_t *grp, - lck_attr_t *attr) + lck_grp_t *grp, + lck_attr_t *attr) { - lck_spin_t *lck; + lck_spin_t *lck; - if ((lck = (lck_spin_t *)kalloc(sizeof(lck_spin_t))) != 0) { + if ((lck = (lck_spin_t *)kalloc(sizeof(lck_spin_t))) != 0) lck_spin_init(lck, grp, attr); - } - return lck; + return(lck); } /* @@ -271,8 +270,8 @@ lck_spin_alloc_init( */ void lck_spin_free( - lck_spin_t *lck, - lck_grp_t *grp) + lck_spin_t *lck, + lck_grp_t *grp) { lck_spin_destroy(lck, grp); kfree(lck, sizeof(lck_spin_t)); @@ -283,13 +282,15 @@ lck_spin_free( */ void lck_spin_init( - lck_spin_t *lck, - lck_grp_t *grp, - __unused lck_attr_t *attr) + lck_spin_t *lck, + lck_grp_t *grp, + __unused lck_attr_t *attr) { usimple_lock_init((usimple_lock_t) lck, 0); - lck_grp_reference(grp); - lck_grp_lckcnt_incr(grp, LCK_TYPE_SPIN); + if (grp) { + lck_grp_reference(grp); + lck_grp_lckcnt_incr(grp, LCK_TYPE_SPIN); + } } /* @@ -297,15 +298,16 @@ lck_spin_init( */ void lck_spin_destroy( - lck_spin_t *lck, - lck_grp_t *grp) + lck_spin_t *lck, + lck_grp_t *grp) { - if (lck->interlock == LCK_SPIN_TAG_DESTROYED) { + if (lck->interlock == LCK_SPIN_TAG_DESTROYED) return; - } lck->interlock = LCK_SPIN_TAG_DESTROYED; - lck_grp_lckcnt_decr(grp, LCK_TYPE_SPIN); - lck_grp_deallocate(grp); + if (grp) { + lck_grp_lckcnt_decr(grp, LCK_TYPE_SPIN); + lck_grp_deallocate(grp); + } return; } @@ -314,8 +316,8 @@ lck_spin_destroy( */ void lck_spin_lock_grp( - lck_spin_t *lck, - lck_grp_t *grp) + lck_spin_t *lck, + lck_grp_t *grp) { #pragma unused(grp) usimple_lock((usimple_lock_t) lck, grp); @@ -323,7 +325,7 @@ lck_spin_lock_grp( void lck_spin_lock( - lck_spin_t *lck) + lck_spin_t *lck) { usimple_lock((usimple_lock_t) lck, NULL); } @@ -333,24 +335,24 @@ lck_spin_lock( */ void lck_spin_unlock( - lck_spin_t *lck) + lck_spin_t *lck) { usimple_unlock((usimple_lock_t) lck); } boolean_t lck_spin_try_lock_grp( - lck_spin_t *lck, - lck_grp_t *grp) + lck_spin_t *lck, + lck_grp_t *grp) { #pragma unused(grp) boolean_t lrval = (boolean_t)usimple_lock_try((usimple_lock_t) lck, grp); -#if DEVELOPMENT || DEBUG +#if DEVELOPMENT || DEBUG if (lrval) { pltrace(FALSE); } #endif - return lrval; + return(lrval); } @@ -359,15 +361,15 @@ lck_spin_try_lock_grp( */ boolean_t lck_spin_try_lock( - lck_spin_t *lck) + lck_spin_t *lck) { boolean_t lrval = (boolean_t)usimple_lock_try((usimple_lock_t) lck, LCK_GRP_NULL); -#if DEVELOPMENT || DEBUG +#if DEVELOPMENT || DEBUG if (lrval) { pltrace(FALSE); } #endif - return lrval; + return(lrval); } /* @@ -397,8 +399,6 @@ lck_spin_assert(lck_spin_t *lock, unsigned int type) if (__improbable(holder != THREAD_NULL)) { if (holder == thread) { panic("Lock owned by current thread %p = %lx", lock, state); - } else { - panic("Lock %p owned by thread %p", lock, holder); } } } @@ -410,8 +410,7 @@ lck_spin_assert(lck_spin_t *lock, unsigned int type) * Returns: TRUE if lock is acquired. */ boolean_t -kdp_lck_spin_is_acquired(lck_spin_t *lck) -{ +kdp_lck_spin_is_acquired(lck_spin_t *lck) { if (not_in_kdp) { panic("panic: spinlock acquired check done outside of kernel debugger"); } @@ -425,23 +424,21 @@ kdp_lck_spin_is_acquired(lck_spin_t *lck) */ void usimple_lock_init( - usimple_lock_t l, - __unused unsigned short tag) + usimple_lock_t l, + __unused unsigned short tag) { -#ifndef MACHINE_SIMPLE_LOCK +#ifndef MACHINE_SIMPLE_LOCK USLDBG(usld_lock_init(l, tag)); hw_lock_init(&l->interlock); #else - simple_lock_init((simple_lock_t)l, tag); + simple_lock_init((simple_lock_t)l,tag); #endif } volatile uint32_t spinlock_owner_cpu = ~0; volatile usimple_lock_t spinlock_timed_out; -uint32_t -spinlock_timeout_NMI(uintptr_t thread_addr) -{ +uint32_t spinlock_timeout_NMI(uintptr_t thread_addr) { uint32_t i; for (i = 0; i < real_ncpus; i++) { @@ -467,22 +464,21 @@ spinlock_timeout_NMI(uintptr_t thread_addr) */ void (usimple_lock)( - usimple_lock_t l + usimple_lock_t l LCK_GRP_ARG(lck_grp_t *grp)) { -#ifndef MACHINE_SIMPLE_LOCK +#ifndef MACHINE_SIMPLE_LOCK DECL_PC(pc); OBTAIN_PC(pc); USLDBG(usld_lock_pre(l, pc)); - if (__improbable(hw_lock_to(&l->interlock, LockTimeOutTSC, grp) == 0)) { + if(__improbable(hw_lock_to(&l->interlock, LockTimeOutTSC, grp) == 0)) { boolean_t uslock_acquired = FALSE; while (machine_timeout_suspended()) { enable_preemption(); - if ((uslock_acquired = hw_lock_to(&l->interlock, LockTimeOutTSC, grp))) { + if ((uslock_acquired = hw_lock_to(&l->interlock, LockTimeOutTSC, grp))) break; - } } if (uslock_acquired == FALSE) { @@ -491,11 +487,11 @@ void spinlock_timed_out = l; lock_cpu = spinlock_timeout_NMI(lowner); panic("Spinlock acquisition timed out: lock=%p, lock owner thread=0x%lx, current_thread: %p, lock owner active on CPU 0x%x, current owner: 0x%lx, time: %llu", - l, lowner, current_thread(), lock_cpu, (uintptr_t)l->interlock.lock_data, mach_absolute_time()); + l, lowner, current_thread(), lock_cpu, (uintptr_t)l->interlock.lock_data, mach_absolute_time()); } } #if DEVELOPMENT || DEBUG - pltrace(FALSE); + pltrace(FALSE); #endif USLDBG(usld_lock_post(l, pc)); @@ -517,15 +513,15 @@ void */ void usimple_unlock( - usimple_lock_t l) + usimple_lock_t l) { -#ifndef MACHINE_SIMPLE_LOCK +#ifndef MACHINE_SIMPLE_LOCK DECL_PC(pc); OBTAIN_PC(pc); USLDBG(usld_unlock(l, pc)); #if DEVELOPMENT || DEBUG - pltrace(TRUE); + pltrace(TRUE); #endif hw_lock_unlock(&l->interlock); #else @@ -548,11 +544,11 @@ usimple_unlock( */ unsigned int usimple_lock_try( - usimple_lock_t l, + usimple_lock_t l, lck_grp_t *grp) { -#ifndef MACHINE_SIMPLE_LOCK - unsigned int success; +#ifndef MACHINE_SIMPLE_LOCK + unsigned int success; DECL_PC(pc); OBTAIN_PC(pc); @@ -561,48 +557,81 @@ usimple_lock_try( #if DEVELOPMENT || DEBUG pltrace(FALSE); #endif - USLDBG(usld_lock_try_post(l, pc)); + USLDBG(usld_lock_try_post(l, pc)); } return success; #else - return simple_lock_try((simple_lock_t)l, grp); + return(simple_lock_try((simple_lock_t)l, grp)); #endif } /* - * Acquire a usimple_lock while polling for pending TLB flushes + * Acquire a usimple_lock while polling for pending cpu signals * and spinning on a lock. * */ -void -usimple_lock_try_lock_loop(usimple_lock_t l, lck_grp_t *grp) +unsigned int +(usimple_lock_try_lock_mp_signal_safe_loop_deadline)(usimple_lock_t l, + uint64_t deadline + LCK_GRP_ARG(lck_grp_t *grp)) { boolean_t istate = ml_get_interrupts_enabled(); + + if (deadline < mach_absolute_time()) { + return 0; + } + while (!simple_lock_try(l, grp)) { - if (!istate) { - handle_pending_TLB_flushes(); + if (!istate) + cpu_signal_handler(NULL); + + if (deadline < mach_absolute_time()) { + return 0; } + cpu_pause(); } + + return 1; +} + +void +(usimple_lock_try_lock_loop)(usimple_lock_t l + LCK_GRP_ARG(lck_grp_t *grp)) +{ + usimple_lock_try_lock_mp_signal_safe_loop_deadline(l, ULLONG_MAX, grp); } -#if USLOCK_DEBUG +unsigned int +(usimple_lock_try_lock_mp_signal_safe_loop_duration)(usimple_lock_t l, + uint64_t duration + LCK_GRP_ARG(lck_grp_t *grp)) +{ + uint64_t deadline; + uint64_t base_at = mach_absolute_time(); + uint64_t duration_at; + + nanoseconds_to_absolutetime(duration, &duration_at); + deadline = base_at + duration_at; + if (deadline < base_at) { + /* deadline has overflowed, make it saturate */ + deadline = ULLONG_MAX; + } + + return usimple_lock_try_lock_mp_signal_safe_loop_deadline(l, deadline, grp); +} + +#if USLOCK_DEBUG /* * States of a usimple_lock. The default when initializing * a usimple_lock is setting it up for debug checking. */ -#define USLOCK_CHECKED 0x0001 /* lock is being checked */ -#define USLOCK_TAKEN 0x0002 /* lock has been taken */ -#define USLOCK_INIT 0xBAA0 /* lock has been initialized */ -#define USLOCK_INITIALIZED (USLOCK_INIT|USLOCK_CHECKED) -#define USLOCK_CHECKING(l) (uslock_check && \ - ((l)->debug.state & USLOCK_CHECKED)) - -/* - * Trace activities of a particularly interesting lock. - */ -void usl_trace(usimple_lock_t, int, pc_t, const char *); - +#define USLOCK_CHECKED 0x0001 /* lock is being checked */ +#define USLOCK_TAKEN 0x0002 /* lock has been taken */ +#define USLOCK_INIT 0xBAA0 /* lock has been initialized */ +#define USLOCK_INITIALIZED (USLOCK_INIT|USLOCK_CHECKED) +#define USLOCK_CHECKING(l) (uslock_check && \ + ((l)->debug.state & USLOCK_CHECKED)) /* * Initialize the debugging information contained @@ -610,12 +639,11 @@ void usl_trace(usimple_lock_t, int, pc_t, const char *); */ void usld_lock_init( - usimple_lock_t l, - __unused unsigned short tag) + usimple_lock_t l, + __unused unsigned short tag) { - if (l == USIMPLE_LOCK_NULL) { + if (l == USIMPLE_LOCK_NULL) panic("lock initialization: null lock pointer"); - } l->lock_type = USLOCK_TAG; l->debug.state = uslock_check ? USLOCK_INITIALIZED : 0; l->debug.lock_cpu = l->debug.unlock_cpu = 0; @@ -634,18 +662,15 @@ usld_lock_init( */ int usld_lock_common_checks( - usimple_lock_t l, - char *caller) + usimple_lock_t l, + char *caller) { - if (l == USIMPLE_LOCK_NULL) { + if (l == USIMPLE_LOCK_NULL) panic("%s: null lock pointer", caller); - } - if (l->lock_type != USLOCK_TAG) { + if (l->lock_type != USLOCK_TAG) panic("%s: %p is not a usimple lock, 0x%x", caller, l, l->lock_type); - } - if (!(l->debug.state & USLOCK_INIT)) { + if (!(l->debug.state & USLOCK_INIT)) panic("%s: %p is not an initialized lock, 0x%x", caller, l, l->debug.state); - } return USLOCK_CHECKING(l); } @@ -657,15 +682,14 @@ usld_lock_common_checks( /* ARGSUSED */ void usld_lock_pre( - usimple_lock_t l, - pc_t pc) + usimple_lock_t l, + pc_t pc) { - char caller[] = "usimple_lock"; + char caller[] = "usimple_lock"; - if (!usld_lock_common_checks(l, caller)) { + if (!usld_lock_common_checks(l, caller)) return; - } /* * Note that we have a weird case where we are getting a lock when we are] @@ -678,13 +702,12 @@ usld_lock_pre( if ((l->debug.state & USLOCK_TAKEN) && l->debug.lock_thread && l->debug.lock_thread == (void *) current_thread()) { printf("%s: lock %p already locked (at %p) by", - caller, l, l->debug.lock_pc); + caller, l, l->debug.lock_pc); printf(" current thread %p (new attempt at pc %p)\n", - l->debug.lock_thread, pc); + l->debug.lock_thread, pc); panic("%s", caller); } mp_disable_preemption(); - usl_trace(l, cpu_number(), pc, caller); mp_enable_preemption(); } @@ -697,33 +720,28 @@ usld_lock_pre( */ void usld_lock_post( - usimple_lock_t l, - pc_t pc) + usimple_lock_t l, + pc_t pc) { - int mycpu; - char caller[] = "successful usimple_lock"; + int mycpu; + char caller[] = "successful usimple_lock"; - if (!usld_lock_common_checks(l, caller)) { + if (!usld_lock_common_checks(l, caller)) return; - } - if (!((l->debug.state & ~USLOCK_TAKEN) == USLOCK_INITIALIZED)) { + if (!((l->debug.state & ~USLOCK_TAKEN) == USLOCK_INITIALIZED)) panic("%s: lock %p became uninitialized", - caller, l); - } - if ((l->debug.state & USLOCK_TAKEN)) { + caller, l); + if ((l->debug.state & USLOCK_TAKEN)) panic("%s: lock 0x%p became TAKEN by someone else", - caller, l); - } + caller, l); mycpu = cpu_number(); l->debug.lock_thread = (void *)current_thread(); l->debug.state |= USLOCK_TAKEN; l->debug.lock_pc = pc; l->debug.lock_cpu = mycpu; - - usl_trace(l, mycpu, pc, caller); } @@ -737,34 +755,30 @@ usld_lock_post( */ void usld_unlock( - usimple_lock_t l, - pc_t pc) + usimple_lock_t l, + pc_t pc) { - int mycpu; - char caller[] = "usimple_unlock"; + int mycpu; + char caller[] = "usimple_unlock"; - if (!usld_lock_common_checks(l, caller)) { + if (!usld_lock_common_checks(l, caller)) return; - } mycpu = cpu_number(); - if (!(l->debug.state & USLOCK_TAKEN)) { + if (!(l->debug.state & USLOCK_TAKEN)) panic("%s: lock 0x%p hasn't been taken", - caller, l); - } - if (l->debug.lock_thread != (void *) current_thread()) { + caller, l); + if (l->debug.lock_thread != (void *) current_thread()) panic("%s: unlocking lock 0x%p, owned by thread %p", - caller, l, l->debug.lock_thread); - } + caller, l, l->debug.lock_thread); if (l->debug.lock_cpu != mycpu) { printf("%s: unlocking lock 0x%p on cpu 0x%x", - caller, l, mycpu); + caller, l, mycpu); printf(" (acquired on cpu 0x%x)\n", l->debug.lock_cpu); panic("%s", caller); } - usl_trace(l, mycpu, pc, caller); l->debug.unlock_thread = l->debug.lock_thread; l->debug.lock_thread = INVALID_PC; @@ -782,17 +796,13 @@ usld_unlock( */ void usld_lock_try_pre( - usimple_lock_t l, - pc_t pc) + usimple_lock_t l, + __unused pc_t pc) { - char caller[] = "usimple_lock_try"; + char caller[] = "usimple_lock_try"; - if (!usld_lock_common_checks(l, caller)) { + if (!usld_lock_common_checks(l, caller)) return; - } - mp_disable_preemption(); - usl_trace(l, cpu_number(), pc, caller); - mp_enable_preemption(); } @@ -806,79 +816,45 @@ usld_lock_try_pre( */ void usld_lock_try_post( - usimple_lock_t l, - pc_t pc) + usimple_lock_t l, + pc_t pc) { - int mycpu; - char caller[] = "successful usimple_lock_try"; + int mycpu; + char caller[] = "successful usimple_lock_try"; - if (!usld_lock_common_checks(l, caller)) { + if (!usld_lock_common_checks(l, caller)) return; - } - if (!((l->debug.state & ~USLOCK_TAKEN) == USLOCK_INITIALIZED)) { + if (!((l->debug.state & ~USLOCK_TAKEN) == USLOCK_INITIALIZED)) panic("%s: lock 0x%p became uninitialized", - caller, l); - } - if ((l->debug.state & USLOCK_TAKEN)) { + caller, l); + if ((l->debug.state & USLOCK_TAKEN)) panic("%s: lock 0x%p became TAKEN by someone else", - caller, l); - } + caller, l); mycpu = cpu_number(); l->debug.lock_thread = (void *) current_thread(); l->debug.state |= USLOCK_TAKEN; l->debug.lock_pc = pc; l->debug.lock_cpu = mycpu; - - usl_trace(l, mycpu, pc, caller); } - - -/* - * For very special cases, set traced_lock to point to a - * specific lock of interest. The result is a series of - * XPRs showing lock operations on that lock. The lock_seq - * value is used to show the order of those operations. - */ -usimple_lock_t traced_lock; -unsigned int lock_seq; - -void -usl_trace( - usimple_lock_t l, - int mycpu, - pc_t pc, - const char * op_name) -{ - if (traced_lock == l) { - XPR(XPR_SLOCK, - "seq %d, cpu %d, %s @ %x\n", - (uintptr_t) lock_seq, (uintptr_t) mycpu, - (uintptr_t) op_name, (uintptr_t) pc, 0); - lock_seq++; - } -} - - -#endif /* USLOCK_DEBUG */ +#endif /* USLOCK_DEBUG */ /* * Routine: lck_rw_alloc_init */ lck_rw_t * lck_rw_alloc_init( - lck_grp_t *grp, - lck_attr_t *attr) -{ - lck_rw_t *lck; + lck_grp_t *grp, + lck_attr_t *attr) { + lck_rw_t *lck; if ((lck = (lck_rw_t *)kalloc(sizeof(lck_rw_t))) != 0) { bzero(lck, sizeof(lck_rw_t)); lck_rw_init(lck, grp, attr); } - return lck; + return(lck); } /* @@ -886,9 +862,8 @@ lck_rw_alloc_init( */ void lck_rw_free( - lck_rw_t *lck, - lck_grp_t *grp) -{ + lck_rw_t *lck, + lck_grp_t *grp) { lck_rw_destroy(lck, grp); kfree(lck, sizeof(lck_rw_t)); } @@ -898,12 +873,12 @@ lck_rw_free( */ void lck_rw_init( - lck_rw_t *lck, - lck_grp_t *grp, - lck_attr_t *attr) + lck_rw_t *lck, + lck_grp_t *grp, + lck_attr_t *attr) { - lck_attr_t *lck_attr = (attr != LCK_ATTR_NULL) ? - attr : &LockDefaultLckAttr; + lck_attr_t *lck_attr = (attr != LCK_ATTR_NULL) ? + attr : &LockDefaultLckAttr; hw_lock_byte_init(&lck->lck_rw_interlock); lck->lck_rw_want_write = FALSE; @@ -913,7 +888,7 @@ lck_rw_init( lck->lck_r_waiting = lck->lck_w_waiting = 0; lck->lck_rw_tag = 0; lck->lck_rw_priv_excl = ((lck_attr->lck_attr_val & - LCK_ATTR_RW_SHARED_PRIORITY) == 0); + LCK_ATTR_RW_SHARED_PRIORITY) == 0); lck_grp_reference(grp); lck_grp_lckcnt_incr(grp, LCK_TYPE_RW); @@ -924,12 +899,11 @@ lck_rw_init( */ void lck_rw_destroy( - lck_rw_t *lck, - lck_grp_t *grp) + lck_rw_t *lck, + lck_grp_t *grp) { - if (lck->lck_rw_tag == LCK_RW_TAG_DESTROYED) { + if (lck->lck_rw_tag == LCK_RW_TAG_DESTROYED) return; - } #if MACH_LDEBUG lck_rw_assert(lck, LCK_RW_ASSERT_NOTHELD); #endif @@ -955,7 +929,7 @@ lck_rw_destroy( static inline boolean_t lck_interlock_lock(lck_rw_t *lck) { - boolean_t istate; + boolean_t istate; istate = ml_set_interrupts_enabled(FALSE); hw_lock_byte_lock(&lck->lck_rw_interlock); @@ -978,18 +952,16 @@ lck_interlock_unlock(lck_rw_t *lck, boolean_t istate) static inline void lck_rw_lock_pause(boolean_t interrupts_enabled) { - if (!interrupts_enabled) { + if (!interrupts_enabled) handle_pending_TLB_flushes(); - } cpu_pause(); } static inline boolean_t lck_rw_held_read_or_upgrade(lck_rw_t *lock) { - if (ordered_load(&lock->data) & (LCK_RW_SHARED_MASK | LCK_RW_INTERLOCK | LCK_RW_WANT_UPGRADE)) { + if (ordered_load(&lock->data) & (LCK_RW_SHARED_MASK | LCK_RW_INTERLOCK | LCK_RW_WANT_UPGRADE)) return TRUE; - } return FALSE; } @@ -1004,7 +976,7 @@ lck_rw_deadline_for_spin(lck_rw_t *lck) if (lck->lck_r_waiting || lck->lck_w_waiting || lck->lck_rw_shared_count > machine_info.max_cpus) { /* * there are already threads waiting on this lock... this - * implies that they have spun beyond their deadlines waiting for + * implies that they have spun beyond their deadlines waiting for * the desired state to show up so we will not bother spinning at this time... * or * the current number of threads sharing this lock exceeds our capacity to run them @@ -1012,12 +984,11 @@ lck_rw_deadline_for_spin(lck_rw_t *lck) * to be at 0, we'll not bother spinning since the latency for this to happen is * unpredictable... */ - return mach_absolute_time(); + return (mach_absolute_time()); } - return mach_absolute_time() + MutexSpin; - } else { - return mach_absolute_time() + (1LL * 1000000000LL); - } + return (mach_absolute_time() + MutexSpin); + } else + return (mach_absolute_time() + (100000LL * 1000000000LL)); } @@ -1036,13 +1007,12 @@ lck_rw_interlock_spin(lck_rw_t *lock) static boolean_t lck_rw_grab_want(lck_rw_t *lock) { - uint32_t data, prev; + uint32_t data, prev; - for (;;) { + for ( ; ; ) { data = atomic_exchange_begin32(&lock->data, &prev, memory_order_relaxed); - if ((data & LCK_RW_INTERLOCK) == 0) { + if ((data & LCK_RW_INTERLOCK) == 0) break; - } atomic_exchange_abort(); lck_rw_interlock_spin(lock); } @@ -1057,13 +1027,12 @@ lck_rw_grab_want(lck_rw_t *lock) static boolean_t lck_rw_grab_shared(lck_rw_t *lock) { - uint32_t data, prev; + uint32_t data, prev; - for (;;) { + for ( ; ; ) { data = atomic_exchange_begin32(&lock->data, &prev, memory_order_acquire_smp); - if ((data & LCK_RW_INTERLOCK) == 0) { + if ((data & LCK_RW_INTERLOCK) == 0) break; - } atomic_exchange_abort(); lck_rw_interlock_spin(lock); } @@ -1082,19 +1051,19 @@ lck_rw_grab_shared(lck_rw_t *lock) */ static void lck_rw_lock_exclusive_gen( - lck_rw_t *lck) + lck_rw_t *lck) { - __kdebug_only uintptr_t trace_lck = unslide_for_kdebug(lck); - uint64_t deadline = 0; - int slept = 0; - int gotlock = 0; - int lockheld = 0; - wait_result_t res = 0; - boolean_t istate = -1; + __kdebug_only uintptr_t trace_lck = unslide_for_kdebug(lck); + uint64_t deadline = 0; + int slept = 0; + int gotlock = 0; + int lockheld = 0; + wait_result_t res = 0; + boolean_t istate = -1; -#if CONFIG_DTRACE +#if CONFIG_DTRACE boolean_t dtrace_ls_initialized = FALSE; - boolean_t dtrace_rwl_excl_spin, dtrace_rwl_excl_block, dtrace_ls_enabled = FALSE; + boolean_t dtrace_rwl_excl_spin, dtrace_rwl_excl_block, dtrace_ls_enabled= FALSE; uint64_t wait_interval = 0; int readers_at_sleep = 0; #endif @@ -1102,8 +1071,9 @@ lck_rw_lock_exclusive_gen( /* * Try to acquire the lck_rw_want_write bit. */ - while (!lck_rw_grab_want(lck)) { -#if CONFIG_DTRACE + while ( !lck_rw_grab_want(lck)) { + +#if CONFIG_DTRACE if (dtrace_ls_initialized == FALSE) { dtrace_ls_initialized = TRUE; dtrace_rwl_excl_spin = (lockstat_probemap[LS_LCK_RW_LOCK_EXCL_SPIN] != 0); @@ -1119,39 +1089,38 @@ lck_rw_lock_exclusive_gen( } } #endif - if (istate == -1) { + if (istate == -1) istate = ml_get_interrupts_enabled(); - } deadline = lck_rw_deadline_for_spin(lck); KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_WRITER_SPIN_CODE) | DBG_FUNC_START, trace_lck, 0, 0, 0, 0); - while (((gotlock = lck_rw_grab_want(lck)) == 0) && mach_absolute_time() < deadline) { + while (((gotlock = lck_rw_grab_want(lck)) == 0) && mach_absolute_time() < deadline) lck_rw_lock_pause(istate); - } KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_WRITER_SPIN_CODE) | DBG_FUNC_END, trace_lck, 0, 0, gotlock, 0); - if (gotlock) { + if (gotlock) break; - } /* * if we get here, the deadline has expired w/o us * being able to grab the lock exclusively * check to see if we're allowed to do a thread_block */ if (lck->lck_rw_can_sleep) { + istate = lck_interlock_lock(lck); if (lck->lck_rw_want_write) { + KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_WRITER_WAIT_CODE) | DBG_FUNC_START, trace_lck, 0, 0, 0, 0); lck->lck_w_waiting = TRUE; thread_set_pending_block_hint(current_thread(), kThreadWaitKernelRWLockWrite); res = assert_wait(RW_LOCK_WRITER_EVENT(lck), - THREAD_UNINT | THREAD_WAIT_NOREPORT_USER); + THREAD_UNINT | THREAD_WAIT_NOREPORT_USER); lck_interlock_unlock(lck, istate); if (res == THREAD_WAITING) { @@ -1178,7 +1147,8 @@ lck_rw_lock_exclusive_gen( * and the interlock not held, we are safe to proceed */ while (lck_rw_held_read_or_upgrade(lck)) { -#if CONFIG_DTRACE + +#if CONFIG_DTRACE /* * Either sleeping or spinning is happening, start * a timing of our delay interval now. If we set it @@ -1200,29 +1170,27 @@ lck_rw_lock_exclusive_gen( } } #endif - if (istate == -1) { + if (istate == -1) istate = ml_get_interrupts_enabled(); - } deadline = lck_rw_deadline_for_spin(lck); KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_READER_SPIN_CODE) | DBG_FUNC_START, trace_lck, 0, 0, 0, 0); - while ((lockheld = lck_rw_held_read_or_upgrade(lck)) && mach_absolute_time() < deadline) { + while ((lockheld = lck_rw_held_read_or_upgrade(lck)) && mach_absolute_time() < deadline) lck_rw_lock_pause(istate); - } KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_READER_SPIN_CODE) | DBG_FUNC_END, trace_lck, 0, 0, lockheld, 0); - if (!lockheld) { + if ( !lockheld) break; - } /* * if we get here, the deadline has expired w/o us * being able to grab the lock exclusively * check to see if we're allowed to do a thread_block */ if (lck->lck_rw_can_sleep) { + istate = lck_interlock_lock(lck); if (lck->lck_rw_shared_count != 0 || lck->lck_rw_want_upgrade) { @@ -1232,7 +1200,7 @@ lck_rw_lock_exclusive_gen( thread_set_pending_block_hint(current_thread(), kThreadWaitKernelRWLockWrite); res = assert_wait(RW_LOCK_WRITER_EVENT(lck), - THREAD_UNINT | THREAD_WAIT_NOREPORT_USER); + THREAD_UNINT | THREAD_WAIT_NOREPORT_USER); lck_interlock_unlock(lck, istate); if (res == THREAD_WAITING) { @@ -1252,7 +1220,7 @@ lck_rw_lock_exclusive_gen( } } -#if CONFIG_DTRACE +#if CONFIG_DTRACE /* * Decide what latencies we suffered that are Dtrace events. * If we have set wait_interval, then we either spun or slept. @@ -1286,46 +1254,40 @@ lck_rw_lock_exclusive_gen( * Routine: lck_rw_done */ -lck_rw_type_t -lck_rw_done(lck_rw_t *lock) +lck_rw_type_t lck_rw_done(lck_rw_t *lock) { - uint32_t data, prev; + uint32_t data, prev; - for (;;) { + for ( ; ; ) { data = atomic_exchange_begin32(&lock->data, &prev, memory_order_release_smp); - if (data & LCK_RW_INTERLOCK) { /* wait for interlock to clear */ + if (data & LCK_RW_INTERLOCK) { /* wait for interlock to clear */ atomic_exchange_abort(); lck_rw_interlock_spin(lock); continue; } if (data & LCK_RW_SHARED_MASK) { data -= LCK_RW_SHARED_READER; - if ((data & LCK_RW_SHARED_MASK) == 0) { /* if reader count has now gone to 0, check for waiters */ + if ((data & LCK_RW_SHARED_MASK) == 0) /* if reader count has now gone to 0, check for waiters */ goto check_waiters; - } - } else { /* if reader count == 0, must be exclusive lock */ + } else { /* if reader count == 0, must be exclusive lock */ if (data & LCK_RW_WANT_UPGRADE) { data &= ~(LCK_RW_WANT_UPGRADE); } else { - if (data & LCK_RW_WANT_WRITE) { + if (data & LCK_RW_WANT_WRITE) data &= ~(LCK_RW_WANT_EXCL); - } else { /* lock is not 'owned', panic */ + else /* lock is not 'owned', panic */ panic("Releasing non-exclusive RW lock without a reader refcount!"); - } } check_waiters: if (prev & LCK_RW_W_WAITING) { data &= ~(LCK_RW_W_WAITING); - if ((prev & LCK_RW_PRIV_EXCL) == 0) { + if ((prev & LCK_RW_PRIV_EXCL) == 0) data &= ~(LCK_RW_R_WAITING); - } - } else { + } else data &= ~(LCK_RW_R_WAITING); - } } - if (atomic_exchange_complete32(&lock->data, prev, data, memory_order_release_smp)) { + if (atomic_exchange_complete32(&lock->data, prev, data, memory_order_release_smp)) break; - } cpu_pause(); } return lck_rw_done_gen(lock, prev); @@ -1336,13 +1298,13 @@ check_waiters: * * called from lck_rw_done() * prior_lock_state is the value in the 1st - * word of the lock at the time of a successful + * word of the lock at the time of a successful * atomic compare and exchange with the new value... - * it represents the state of the lock before we + * it represents the state of the lock before we * decremented the rw_shared_count or cleared either - * rw_want_upgrade or rw_want_write and + * rw_want_upgrade or rw_want_write and * the lck_x_waiting bits... since the wrapper - * routine has already changed the state atomically, + * routine has already changed the state atomically, * we just need to decide if we should * wake up anyone and what value to return... we do * this by examining the state of the lock before @@ -1410,16 +1372,15 @@ lck_rw_done_gen( */ void lck_rw_unlock( - lck_rw_t *lck, - lck_rw_type_t lck_rw_type) + lck_rw_t *lck, + lck_rw_type_t lck_rw_type) { - if (lck_rw_type == LCK_RW_TYPE_SHARED) { + if (lck_rw_type == LCK_RW_TYPE_SHARED) lck_rw_unlock_shared(lck); - } else if (lck_rw_type == LCK_RW_TYPE_EXCLUSIVE) { + else if (lck_rw_type == LCK_RW_TYPE_EXCLUSIVE) lck_rw_unlock_exclusive(lck); - } else { + else panic("lck_rw_unlock(): Invalid RW lock type: %d\n", lck_rw_type); - } } @@ -1428,16 +1389,15 @@ lck_rw_unlock( */ void lck_rw_unlock_shared( - lck_rw_t *lck) + lck_rw_t *lck) { - lck_rw_type_t ret; + lck_rw_type_t ret; assertf(lck->lck_rw_shared_count > 0, "lck %p has shared_count=0x%x", lck, lck->lck_rw_shared_count); ret = lck_rw_done(lck); - if (ret != LCK_RW_TYPE_SHARED) { + if (ret != LCK_RW_TYPE_SHARED) panic("lck_rw_unlock_shared(): lock %p held in mode: %d\n", lck, ret); - } } @@ -1446,15 +1406,14 @@ lck_rw_unlock_shared( */ void lck_rw_unlock_exclusive( - lck_rw_t *lck) + lck_rw_t *lck) { - lck_rw_type_t ret; + lck_rw_type_t ret; ret = lck_rw_done(lck); - if (ret != LCK_RW_TYPE_EXCLUSIVE) { + if (ret != LCK_RW_TYPE_EXCLUSIVE) panic("lck_rw_unlock_exclusive(): lock held in mode: %d\n", ret); - } } @@ -1463,16 +1422,15 @@ lck_rw_unlock_exclusive( */ void lck_rw_lock( - lck_rw_t *lck, - lck_rw_type_t lck_rw_type) + lck_rw_t *lck, + lck_rw_type_t lck_rw_type) { - if (lck_rw_type == LCK_RW_TYPE_SHARED) { + if (lck_rw_type == LCK_RW_TYPE_SHARED) lck_rw_lock_shared(lck); - } else if (lck_rw_type == LCK_RW_TYPE_EXCLUSIVE) { + else if (lck_rw_type == LCK_RW_TYPE_EXCLUSIVE) lck_rw_lock_exclusive(lck); - } else { + else panic("lck_rw_lock(): Invalid RW lock type: %x\n", lck_rw_type); - } } /* @@ -1481,10 +1439,10 @@ lck_rw_lock( void lck_rw_lock_shared(lck_rw_t *lock) { - uint32_t data, prev; + uint32_t data, prev; current_thread()->rwlock_count++; - for (;;) { + for ( ; ; ) { data = atomic_exchange_begin32(&lock->data, &prev, memory_order_acquire_smp); if (data & (LCK_RW_WANT_EXCL | LCK_RW_WANT_UPGRADE | LCK_RW_INTERLOCK)) { atomic_exchange_abort(); @@ -1497,14 +1455,13 @@ lck_rw_lock_shared(lck_rw_t *lock) break; } data += LCK_RW_SHARED_READER; - if (atomic_exchange_complete32(&lock->data, prev, data, memory_order_acquire_smp)) { + if (atomic_exchange_complete32(&lock->data, prev, data, memory_order_acquire_smp)) break; - } cpu_pause(); } -#if CONFIG_DTRACE +#if CONFIG_DTRACE LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_ACQUIRE, lock, DTRACE_RW_SHARED); -#endif /* CONFIG_DTRACE */ +#endif /* CONFIG_DTRACE */ return; } @@ -1517,24 +1474,25 @@ lck_rw_lock_shared(lck_rw_t *lock) */ static void lck_rw_lock_shared_gen( - lck_rw_t *lck) + lck_rw_t *lck) { - __kdebug_only uintptr_t trace_lck = unslide_for_kdebug(lck); - uint64_t deadline = 0; - int gotlock = 0; - int slept = 0; - wait_result_t res = 0; - boolean_t istate = -1; + __kdebug_only uintptr_t trace_lck = unslide_for_kdebug(lck); + uint64_t deadline = 0; + int gotlock = 0; + int slept = 0; + wait_result_t res = 0; + boolean_t istate = -1; -#if CONFIG_DTRACE +#if CONFIG_DTRACE uint64_t wait_interval = 0; int readers_at_sleep = 0; boolean_t dtrace_ls_initialized = FALSE; boolean_t dtrace_rwl_shared_spin, dtrace_rwl_shared_block, dtrace_ls_enabled = FALSE; #endif - while (!lck_rw_grab_shared(lck)) { -#if CONFIG_DTRACE + while ( !lck_rw_grab_shared(lck)) { + +#if CONFIG_DTRACE if (dtrace_ls_initialized == FALSE) { dtrace_ls_initialized = TRUE; dtrace_rwl_shared_spin = (lockstat_probemap[LS_LCK_RW_LOCK_SHARED_SPIN] != 0); @@ -1550,43 +1508,42 @@ lck_rw_lock_shared_gen( } } #endif - if (istate == -1) { + if (istate == -1) istate = ml_get_interrupts_enabled(); - } deadline = lck_rw_deadline_for_spin(lck); KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SHARED_SPIN_CODE) | DBG_FUNC_START, - trace_lck, lck->lck_rw_want_write, lck->lck_rw_want_upgrade, 0, 0); + trace_lck, lck->lck_rw_want_write, lck->lck_rw_want_upgrade, 0, 0); - while (((gotlock = lck_rw_grab_shared(lck)) == 0) && mach_absolute_time() < deadline) { + while (((gotlock = lck_rw_grab_shared(lck)) == 0) && mach_absolute_time() < deadline) lck_rw_lock_pause(istate); - } KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SHARED_SPIN_CODE) | DBG_FUNC_END, - trace_lck, lck->lck_rw_want_write, lck->lck_rw_want_upgrade, gotlock, 0); + trace_lck, lck->lck_rw_want_write, lck->lck_rw_want_upgrade, gotlock, 0); - if (gotlock) { + if (gotlock) break; - } /* * if we get here, the deadline has expired w/o us * being able to grab the lock for read * check to see if we're allowed to do a thread_block */ if (lck->lck_rw_can_sleep) { + istate = lck_interlock_lock(lck); if ((lck->lck_rw_want_write || lck->lck_rw_want_upgrade) && ((lck->lck_rw_shared_count == 0) || lck->lck_rw_priv_excl)) { + KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SHARED_WAIT_CODE) | DBG_FUNC_START, - trace_lck, lck->lck_rw_want_write, lck->lck_rw_want_upgrade, 0, 0); + trace_lck, lck->lck_rw_want_write, lck->lck_rw_want_upgrade, 0, 0); lck->lck_r_waiting = TRUE; thread_set_pending_block_hint(current_thread(), kThreadWaitKernelRWLockRead); res = assert_wait(RW_LOCK_READER_EVENT(lck), - THREAD_UNINT | THREAD_WAIT_NOREPORT_USER); + THREAD_UNINT | THREAD_WAIT_NOREPORT_USER); lck_interlock_unlock(lck, istate); if (res == THREAD_WAITING) { @@ -1594,7 +1551,7 @@ lck_rw_lock_shared_gen( slept++; } KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SHARED_WAIT_CODE) | DBG_FUNC_END, - trace_lck, res, slept, 0, 0); + trace_lck, res, slept, 0, 0); } else { lck->lck_rw_shared_count++; lck_interlock_unlock(lck, istate); @@ -1603,7 +1560,7 @@ lck_rw_lock_shared_gen( } } -#if CONFIG_DTRACE +#if CONFIG_DTRACE if (dtrace_ls_enabled == TRUE) { if (slept == 0) { LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_SPIN, lck, mach_absolute_time() - wait_interval, 0); @@ -1627,27 +1584,28 @@ lck_rw_lock_exclusive(lck_rw_t *lock) { current_thread()->rwlock_count++; if (atomic_test_and_set32(&lock->data, - (LCK_RW_SHARED_MASK | LCK_RW_WANT_EXCL | LCK_RW_WANT_UPGRADE | LCK_RW_INTERLOCK), - LCK_RW_WANT_EXCL, memory_order_acquire_smp, FALSE)) { -#if CONFIG_DTRACE + (LCK_RW_SHARED_MASK | LCK_RW_WANT_EXCL | LCK_RW_WANT_UPGRADE | LCK_RW_INTERLOCK), + LCK_RW_WANT_EXCL, memory_order_acquire_smp, FALSE)) { +#if CONFIG_DTRACE LOCKSTAT_RECORD(LS_LCK_RW_LOCK_EXCL_ACQUIRE, lock, DTRACE_RW_EXCL); -#endif /* CONFIG_DTRACE */ - } else { +#endif /* CONFIG_DTRACE */ + } else lck_rw_lock_exclusive_gen(lock); - } } /* * Routine: lck_rw_lock_shared_to_exclusive + * + * False returned upon failure, in this case the shared lock is dropped. */ boolean_t lck_rw_lock_shared_to_exclusive(lck_rw_t *lock) { - uint32_t data, prev; + uint32_t data, prev; - for (;;) { + for ( ; ; ) { data = atomic_exchange_begin32(&lock->data, &prev, memory_order_acquire_smp); if (data & LCK_RW_INTERLOCK) { atomic_exchange_abort(); @@ -1656,26 +1614,22 @@ lck_rw_lock_shared_to_exclusive(lck_rw_t *lock) } if (data & LCK_RW_WANT_UPGRADE) { data -= LCK_RW_SHARED_READER; - if ((data & LCK_RW_SHARED_MASK) == 0) { /* we were the last reader */ - data &= ~(LCK_RW_W_WAITING); /* so clear the wait indicator */ - } - if (atomic_exchange_complete32(&lock->data, prev, data, memory_order_acquire_smp)) { + if ((data & LCK_RW_SHARED_MASK) == 0) /* we were the last reader */ + data &= ~(LCK_RW_W_WAITING); /* so clear the wait indicator */ + if (atomic_exchange_complete32(&lock->data, prev, data, memory_order_acquire_smp)) return lck_rw_lock_shared_to_exclusive_failure(lock, prev); - } } else { - data |= LCK_RW_WANT_UPGRADE; /* ask for WANT_UPGRADE */ - data -= LCK_RW_SHARED_READER; /* and shed our read count */ - if (atomic_exchange_complete32(&lock->data, prev, data, memory_order_acquire_smp)) { + data |= LCK_RW_WANT_UPGRADE; /* ask for WANT_UPGRADE */ + data -= LCK_RW_SHARED_READER; /* and shed our read count */ + if (atomic_exchange_complete32(&lock->data, prev, data, memory_order_acquire_smp)) break; - } } cpu_pause(); } - /* we now own the WANT_UPGRADE */ - if (data & LCK_RW_SHARED_MASK) { /* check to see if all of the readers are drained */ - lck_rw_lock_shared_to_exclusive_success(lock); /* if not, we need to go wait */ - } -#if CONFIG_DTRACE + /* we now own the WANT_UPGRADE */ + if (data & LCK_RW_SHARED_MASK) /* check to see if all of the readers are drained */ + lck_rw_lock_shared_to_exclusive_success(lock); /* if not, we need to go wait */ +#if CONFIG_DTRACE LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_TO_EXCL_UPGRADE, lock, 0); #endif return TRUE; @@ -1692,12 +1646,12 @@ lck_rw_lock_shared_to_exclusive(lck_rw_t *lock) */ static boolean_t lck_rw_lock_shared_to_exclusive_failure( - lck_rw_t *lck, - uint32_t prior_lock_state) + lck_rw_t *lck, + uint32_t prior_lock_state) { - lck_rw_t *fake_lck; - thread_t thread = current_thread(); - uint32_t rwlock_count; + lck_rw_t *fake_lck; + thread_t thread = current_thread(); + uint32_t rwlock_count; /* Check if dropping the lock means that we need to unpromote */ rwlock_count = thread->rwlock_count--; @@ -1723,9 +1677,9 @@ lck_rw_lock_shared_to_exclusive_failure( } KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_CODE) | DBG_FUNC_NONE, - VM_KERNEL_UNSLIDE_OR_PERM(lck), lck->lck_rw_shared_count, lck->lck_rw_want_upgrade, 0, 0); + VM_KERNEL_UNSLIDE_OR_PERM(lck), lck->lck_rw_shared_count, lck->lck_rw_want_upgrade, 0, 0); - return FALSE; + return (FALSE); } @@ -1739,16 +1693,16 @@ lck_rw_lock_shared_to_exclusive_failure( */ static boolean_t lck_rw_lock_shared_to_exclusive_success( - lck_rw_t *lck) + lck_rw_t *lck) { - __kdebug_only uintptr_t trace_lck = unslide_for_kdebug(lck); - uint64_t deadline = 0; - int slept = 0; - int still_shared = 0; - wait_result_t res; - boolean_t istate = -1; + __kdebug_only uintptr_t trace_lck = unslide_for_kdebug(lck); + uint64_t deadline = 0; + int slept = 0; + int still_shared = 0; + wait_result_t res; + boolean_t istate = -1; -#if CONFIG_DTRACE +#if CONFIG_DTRACE uint64_t wait_interval = 0; int readers_at_sleep = 0; boolean_t dtrace_ls_initialized = FALSE; @@ -1756,7 +1710,8 @@ lck_rw_lock_shared_to_exclusive_success( #endif while (lck->lck_rw_shared_count != 0) { -#if CONFIG_DTRACE + +#if CONFIG_DTRACE if (dtrace_ls_initialized == FALSE) { dtrace_ls_initialized = TRUE; dtrace_rwl_shared_to_excl_spin = (lockstat_probemap[LS_LCK_RW_LOCK_SHARED_TO_EXCL_SPIN] != 0); @@ -1772,42 +1727,40 @@ lck_rw_lock_shared_to_exclusive_success( } } #endif - if (istate == -1) { + if (istate == -1) istate = ml_get_interrupts_enabled(); - } deadline = lck_rw_deadline_for_spin(lck); KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_SPIN_CODE) | DBG_FUNC_START, - trace_lck, lck->lck_rw_shared_count, 0, 0, 0); + trace_lck, lck->lck_rw_shared_count, 0, 0, 0); - while ((still_shared = lck->lck_rw_shared_count) && mach_absolute_time() < deadline) { + while ((still_shared = lck->lck_rw_shared_count) && mach_absolute_time() < deadline) lck_rw_lock_pause(istate); - } KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_SPIN_CODE) | DBG_FUNC_END, - trace_lck, lck->lck_rw_shared_count, 0, 0, 0); + trace_lck, lck->lck_rw_shared_count, 0, 0, 0); - if (!still_shared) { + if ( !still_shared) break; - } /* * if we get here, the deadline has expired w/o * the rw_shared_count having drained to 0 * check to see if we're allowed to do a thread_block */ if (lck->lck_rw_can_sleep) { + istate = lck_interlock_lock(lck); if (lck->lck_rw_shared_count != 0) { KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_WAIT_CODE) | DBG_FUNC_START, - trace_lck, lck->lck_rw_shared_count, 0, 0, 0); + trace_lck, lck->lck_rw_shared_count, 0, 0, 0); lck->lck_w_waiting = TRUE; thread_set_pending_block_hint(current_thread(), kThreadWaitKernelRWLockUpgrade); res = assert_wait(RW_LOCK_WRITER_EVENT(lck), - THREAD_UNINT | THREAD_WAIT_NOREPORT_USER); + THREAD_UNINT | THREAD_WAIT_NOREPORT_USER); lck_interlock_unlock(lck, istate); if (res == THREAD_WAITING) { @@ -1815,14 +1768,14 @@ lck_rw_lock_shared_to_exclusive_success( slept++; } KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_WAIT_CODE) | DBG_FUNC_END, - trace_lck, res, slept, 0, 0); + trace_lck, res, slept, 0, 0); } else { lck_interlock_unlock(lck, istate); break; } } } -#if CONFIG_DTRACE +#if CONFIG_DTRACE /* * We infer whether we took the sleep/spin path above by checking readers_at_sleep. */ @@ -1837,37 +1790,33 @@ lck_rw_lock_shared_to_exclusive_success( } LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_TO_EXCL_UPGRADE, lck, 1); #endif - return TRUE; + return (TRUE); } /* * Routine: lck_rw_lock_exclusive_to_shared */ -void -lck_rw_lock_exclusive_to_shared(lck_rw_t *lock) +void lck_rw_lock_exclusive_to_shared(lck_rw_t *lock) { - uint32_t data, prev; + uint32_t data, prev; - for (;;) { + for ( ; ; ) { data = atomic_exchange_begin32(&lock->data, &prev, memory_order_release_smp); if (data & LCK_RW_INTERLOCK) { atomic_exchange_abort(); - lck_rw_interlock_spin(lock); /* wait for interlock to clear */ + lck_rw_interlock_spin(lock); /* wait for interlock to clear */ continue; } data += LCK_RW_SHARED_READER; - if (data & LCK_RW_WANT_UPGRADE) { + if (data & LCK_RW_WANT_UPGRADE) data &= ~(LCK_RW_WANT_UPGRADE); - } else { + else data &= ~(LCK_RW_WANT_EXCL); - } - if (!((prev & LCK_RW_W_WAITING) && (prev & LCK_RW_PRIV_EXCL))) { + if (!((prev & LCK_RW_W_WAITING) && (prev & LCK_RW_PRIV_EXCL))) data &= ~(LCK_RW_W_WAITING); - } - if (atomic_exchange_complete32(&lock->data, prev, data, memory_order_release_smp)) { + if (atomic_exchange_complete32(&lock->data, prev, data, memory_order_release_smp)) break; - } cpu_pause(); } return lck_rw_lock_exclusive_to_shared_gen(lock, prev); @@ -1876,7 +1825,7 @@ lck_rw_lock_exclusive_to_shared(lck_rw_t *lock) /* * Routine: lck_rw_lock_exclusive_to_shared_gen - * Function: + * Function: * assembly fast path has already dropped * our exclusive state and bumped lck_rw_shared_count * all we need to do here is determine if anyone @@ -1884,16 +1833,16 @@ lck_rw_lock_exclusive_to_shared(lck_rw_t *lock) */ static void lck_rw_lock_exclusive_to_shared_gen( - lck_rw_t *lck, - uint32_t prior_lock_state) + lck_rw_t *lck, + uint32_t prior_lock_state) { - __kdebug_only uintptr_t trace_lck = unslide_for_kdebug(lck); - lck_rw_t *fake_lck; + __kdebug_only uintptr_t trace_lck = unslide_for_kdebug(lck); + lck_rw_t *fake_lck; fake_lck = (lck_rw_t *)&prior_lock_state; KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_TO_SH_CODE) | DBG_FUNC_START, - trace_lck, fake_lck->lck_rw_want_write, fake_lck->lck_rw_want_upgrade, 0, 0); + trace_lck, fake_lck->lck_rw_want_write, fake_lck->lck_rw_want_upgrade, 0, 0); /* * don't wake up anyone waiting to take the lock exclusively @@ -1903,12 +1852,11 @@ lck_rw_lock_exclusive_to_shared_gen( * wake up any waiting readers if we don't have any writers waiting, * or the lock is NOT marked as rw_priv_excl (writers have privilege) */ - if (!(fake_lck->lck_rw_priv_excl && fake_lck->lck_w_waiting) && fake_lck->lck_r_waiting) { + if (!(fake_lck->lck_rw_priv_excl && fake_lck->lck_w_waiting) && fake_lck->lck_r_waiting) thread_wakeup(RW_LOCK_READER_EVENT(lck)); - } KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_TO_SH_CODE) | DBG_FUNC_END, - trace_lck, lck->lck_rw_want_write, lck->lck_rw_want_upgrade, lck->lck_rw_shared_count, 0); + trace_lck, lck->lck_rw_want_write, lck->lck_rw_want_upgrade, lck->lck_rw_shared_count, 0); #if CONFIG_DTRACE LOCKSTAT_RECORD(LS_LCK_RW_LOCK_EXCL_TO_SHARED_DOWNGRADE, lck, 0); @@ -1921,29 +1869,27 @@ lck_rw_lock_exclusive_to_shared_gen( */ boolean_t lck_rw_try_lock( - lck_rw_t *lck, - lck_rw_type_t lck_rw_type) -{ - if (lck_rw_type == LCK_RW_TYPE_SHARED) { - return lck_rw_try_lock_shared(lck); - } else if (lck_rw_type == LCK_RW_TYPE_EXCLUSIVE) { - return lck_rw_try_lock_exclusive(lck); - } else { + lck_rw_t *lck, + lck_rw_type_t lck_rw_type) +{ + if (lck_rw_type == LCK_RW_TYPE_SHARED) + return(lck_rw_try_lock_shared(lck)); + else if (lck_rw_type == LCK_RW_TYPE_EXCLUSIVE) + return(lck_rw_try_lock_exclusive(lck)); + else panic("lck_rw_try_lock(): Invalid rw lock type: %x\n", lck_rw_type); - } - return FALSE; + return(FALSE); } /* * Routine: lck_rw_try_lock_shared */ -boolean_t -lck_rw_try_lock_shared(lck_rw_t *lock) +boolean_t lck_rw_try_lock_shared(lck_rw_t *lock) { - uint32_t data, prev; + uint32_t data, prev; - for (;;) { + for ( ; ; ) { data = atomic_exchange_begin32(&lock->data, &prev, memory_order_acquire_smp); if (data & LCK_RW_INTERLOCK) { atomic_exchange_abort(); @@ -1952,19 +1898,18 @@ lck_rw_try_lock_shared(lck_rw_t *lock) } if (data & (LCK_RW_WANT_EXCL | LCK_RW_WANT_UPGRADE)) { atomic_exchange_abort(); - return FALSE; /* lock is busy */ + return FALSE; /* lock is busy */ } - data += LCK_RW_SHARED_READER; /* Increment reader refcount */ - if (atomic_exchange_complete32(&lock->data, prev, data, memory_order_acquire_smp)) { + data += LCK_RW_SHARED_READER; /* Increment reader refcount */ + if (atomic_exchange_complete32(&lock->data, prev, data, memory_order_acquire_smp)) break; - } cpu_pause(); } current_thread()->rwlock_count++; /* There is a 3 instr window where preemption may not notice rwlock_count after cmpxchg */ -#if CONFIG_DTRACE +#if CONFIG_DTRACE LOCKSTAT_RECORD(LS_LCK_RW_TRY_LOCK_SHARED_ACQUIRE, lock, DTRACE_RW_SHARED); -#endif /* CONFIG_DTRACE */ +#endif /* CONFIG_DTRACE */ return TRUE; } @@ -1973,12 +1918,11 @@ lck_rw_try_lock_shared(lck_rw_t *lock) * Routine: lck_rw_try_lock_exclusive */ -boolean_t -lck_rw_try_lock_exclusive(lck_rw_t *lock) +boolean_t lck_rw_try_lock_exclusive(lck_rw_t *lock) { - uint32_t data, prev; + uint32_t data, prev; - for (;;) { + for ( ; ; ) { data = atomic_exchange_begin32(&lock->data, &prev, memory_order_acquire_smp); if (data & LCK_RW_INTERLOCK) { atomic_exchange_abort(); @@ -1987,27 +1931,26 @@ lck_rw_try_lock_exclusive(lck_rw_t *lock) } if (data & (LCK_RW_SHARED_MASK | LCK_RW_WANT_EXCL | LCK_RW_WANT_UPGRADE)) { atomic_exchange_abort(); - return FALSE; /* can't get it */ + return FALSE; /* can't get it */ } data |= LCK_RW_WANT_EXCL; - if (atomic_exchange_complete32(&lock->data, prev, data, memory_order_acquire_smp)) { + if (atomic_exchange_complete32(&lock->data, prev, data, memory_order_acquire_smp)) break; - } cpu_pause(); } current_thread()->rwlock_count++; -#if CONFIG_DTRACE +#if CONFIG_DTRACE LOCKSTAT_RECORD(LS_LCK_RW_TRY_LOCK_EXCL_ACQUIRE, lock, DTRACE_RW_EXCL); -#endif /* CONFIG_DTRACE */ +#endif /* CONFIG_DTRACE */ return TRUE; } void lck_rw_assert( - lck_rw_t *lck, - unsigned int type) + lck_rw_t *lck, + unsigned int type) { switch (type) { case LCK_RW_ASSERT_SHARED: @@ -2017,7 +1960,7 @@ lck_rw_assert( break; case LCK_RW_ASSERT_EXCLUSIVE: if ((lck->lck_rw_want_write || - lck->lck_rw_want_upgrade) && + lck->lck_rw_want_upgrade) && lck->lck_rw_shared_count == 0) { return; } @@ -2031,8 +1974,8 @@ lck_rw_assert( break; case LCK_RW_ASSERT_NOTHELD: if (!(lck->lck_rw_want_write || - lck->lck_rw_want_upgrade || - lck->lck_rw_shared_count != 0)) { + lck->lck_rw_want_upgrade || + lck->lck_rw_shared_count != 0)) { return; } break; @@ -2044,6 +1987,9 @@ lck_rw_assert( } /* On return to userspace, this routine is called if the rwlock_count is somehow imbalanced */ +#if MACH_LDEBUG +__dead2 +#endif void lck_rw_clear_promotions_x86(thread_t thread) { @@ -2077,8 +2023,7 @@ lck_rw_lock_yield_shared(lck_rw_t *lck, boolean_t force_yield) * NOT SAFE: To be used only by kernel debugger to avoid deadlock. */ boolean_t -kdp_lck_rw_lock_is_acquired_exclusive(lck_rw_t *lck) -{ +kdp_lck_rw_lock_is_acquired_exclusive(lck_rw_t *lck) { if (not_in_kdp) { panic("panic: rw lock exclusive check done outside of kernel debugger"); } @@ -2112,10 +2057,6 @@ kdp_lck_rw_lock_is_acquired_exclusive(lck_rw_t *lck) * Intel lock invariants: * * lck_mtx_waiters: contains the count of threads currently in the mutex waitqueue - * lck_mtx_pri: contains the max priority of all waiters during a contention period - * not cleared on last unlock, but stomped over on next first contention - * lck_mtx_promoted: set when the current lock owner has been promoted - * cleared when lock owner unlocks, set on acquire or wait. * * The lock owner is promoted to the max priority of all its waiters only if it * was a lower priority when it acquired or was an owner when a waiter waited. @@ -2131,7 +2072,7 @@ kdp_lck_rw_lock_is_acquired_exclusive(lck_rw_t *lck) * on acquire. */ -#ifdef MUTEX_ZONE +#ifdef MUTEX_ZONE extern zone_t lck_mtx_zone; #endif @@ -2140,20 +2081,18 @@ extern zone_t lck_mtx_zone; */ lck_mtx_t * lck_mtx_alloc_init( - lck_grp_t *grp, - lck_attr_t *attr) + lck_grp_t *grp, + lck_attr_t *attr) { - lck_mtx_t *lck; -#ifdef MUTEX_ZONE - if ((lck = (lck_mtx_t *)zalloc(lck_mtx_zone)) != 0) { + lck_mtx_t *lck; +#ifdef MUTEX_ZONE + if ((lck = (lck_mtx_t *)zalloc(lck_mtx_zone)) != 0) lck_mtx_init(lck, grp, attr); - } #else - if ((lck = (lck_mtx_t *)kalloc(sizeof(lck_mtx_t))) != 0) { + if ((lck = (lck_mtx_t *)kalloc(sizeof(lck_mtx_t))) != 0) lck_mtx_init(lck, grp, attr); - } #endif - return lck; + return(lck); } /* @@ -2161,11 +2100,11 @@ lck_mtx_alloc_init( */ void lck_mtx_free( - lck_mtx_t *lck, - lck_grp_t *grp) + lck_mtx_t *lck, + lck_grp_t *grp) { lck_mtx_destroy(lck, grp); -#ifdef MUTEX_ZONE +#ifdef MUTEX_ZONE zfree(lck_mtx_zone, lck); #else kfree(lck, sizeof(lck_mtx_t)); @@ -2177,9 +2116,9 @@ lck_mtx_free( */ static void lck_mtx_ext_init( - lck_mtx_ext_t *lck, - lck_grp_t *grp, - lck_attr_t *attr) + lck_mtx_ext_t *lck, + lck_grp_t *grp, + lck_attr_t *attr) { bzero((void *)lck, sizeof(lck_mtx_ext_t)); @@ -2190,9 +2129,8 @@ lck_mtx_ext_init( lck->lck_mtx_grp = grp; - if (grp->lck_grp_attr & LCK_GRP_ATTR_STAT) { + if (grp->lck_grp_attr & LCK_GRP_ATTR_STAT) lck->lck_mtx_attr |= LCK_MTX_ATTR_STAT; - } lck->lck_mtx.lck_mtx_is_ext = 1; lck->lck_mtx.lck_mtx_pad32 = 0xFFFFFFFF; @@ -2203,18 +2141,17 @@ lck_mtx_ext_init( */ void lck_mtx_init( - lck_mtx_t *lck, - lck_grp_t *grp, - lck_attr_t *attr) + lck_mtx_t *lck, + lck_grp_t *grp, + lck_attr_t *attr) { - lck_mtx_ext_t *lck_ext; - lck_attr_t *lck_attr; + lck_mtx_ext_t *lck_ext; + lck_attr_t *lck_attr; - if (attr != LCK_ATTR_NULL) { + if (attr != LCK_ATTR_NULL) lck_attr = attr; - } else { + else lck_attr = &LockDefaultLckAttr; - } if ((lck_attr->lck_attr_val) & LCK_ATTR_DEBUG) { if ((lck_ext = (lck_mtx_ext_t *)kalloc(sizeof(lck_mtx_ext_t))) != 0) { @@ -2236,18 +2173,17 @@ lck_mtx_init( */ void lck_mtx_init_ext( - lck_mtx_t *lck, - lck_mtx_ext_t *lck_ext, - lck_grp_t *grp, - lck_attr_t *attr) + lck_mtx_t *lck, + lck_mtx_ext_t *lck_ext, + lck_grp_t *grp, + lck_attr_t *attr) { - lck_attr_t *lck_attr; + lck_attr_t *lck_attr; - if (attr != LCK_ATTR_NULL) { + if (attr != LCK_ATTR_NULL) lck_attr = attr; - } else { + else lck_attr = &LockDefaultLckAttr; - } if ((lck_attr->lck_attr_val) & LCK_ATTR_DEBUG) { lck_mtx_ext_init(lck_ext, grp, lck_attr); @@ -2289,14 +2225,13 @@ lck_mtx_lock_mark_destroyed( */ void lck_mtx_destroy( - lck_mtx_t *lck, - lck_grp_t *grp) + lck_mtx_t *lck, + lck_grp_t *grp) { boolean_t indirect; - if (lck->lck_mtx_tag == LCK_MTX_TAG_DESTROYED) { + if (lck->lck_mtx_tag == LCK_MTX_TAG_DESTROYED) return; - } #if MACH_LDEBUG lck_mtx_assert(lck, LCK_MTX_ASSERT_NOTOWNED); #endif @@ -2304,9 +2239,8 @@ lck_mtx_destroy( lck_mtx_lock_mark_destroyed(lck, indirect); - if (indirect) { + if (indirect) kfree(lck->lck_mtx_ptr, sizeof(lck_mtx_ext_t)); - } lck_grp_lckcnt_decr(grp, LCK_TYPE_MTX); lck_grp_deallocate(grp); return; @@ -2328,7 +2262,7 @@ __attribute__((always_inline)) static boolean_t get_indirect_mutex( lck_mtx_t **lock, - uint32_t *state) + uint32_t *state) { *lock = &((*lock)->lck_mtx_ptr->lck_mtx); *state = ordered_load_mtx_state(*lock); @@ -2336,23 +2270,22 @@ get_indirect_mutex( } /* - * Routine: lck_mtx_unlock_slow + * Routine: lck_mtx_unlock_slow * * Unlocks a mutex held by current thread. * - * It will wake up waiters if necessary and - * drop promotions. + * It will wake up waiters if necessary. * * Interlock can be held. */ __attribute__((noinline)) void lck_mtx_unlock_slow( - lck_mtx_t *lock) + lck_mtx_t *lock) { - thread_t thread; - uint32_t state, prev; - boolean_t indirect = FALSE; + thread_t thread; + uint32_t state, prev; + boolean_t indirect = FALSE; state = ordered_load_mtx_state(lock); @@ -2365,15 +2298,13 @@ lck_mtx_unlock_slow( #if DEVELOPMENT | DEBUG thread_t owner = (thread_t)lock->lck_mtx_owner; - if (__improbable(owner != thread)) { - return lck_mtx_owner_check_panic(lock); - } + if(__improbable(owner != thread)) + lck_mtx_owner_check_panic(lock); #endif /* check if it is held as a spinlock */ - if (__improbable((state & LCK_MTX_MLOCKED_MSK) == 0)) { + if (__improbable((state & LCK_MTX_MLOCKED_MSK) == 0)) goto unlock; - } lck_mtx_interlock_lock_clear_flags(lock, LCK_MTX_MLOCKED_MSK, &state); @@ -2384,24 +2315,24 @@ unlock: ordered_store_mtx_owner(lock, 0); /* keep original state in prev for later evaluation */ prev = state; - /* release interlock, promotion and clear spin flag */ - state &= (~(LCK_MTX_ILOCKED_MSK | LCK_MTX_SPIN_MSK | LCK_MTX_PROMOTED_MSK)); - if ((state & LCK_MTX_WAITERS_MSK)) { - state -= LCK_MTX_WAITER; /* decrement waiter count */ - } - ordered_store_mtx_state_release(lock, state); /* since I own the interlock, I don't need an atomic update */ + if (__improbable(state & LCK_MTX_WAITERS_MSK)) { #if MACH_LDEBUG - /* perform lock statistics after drop to prevent delay */ - if (thread) { - thread->mutex_count--; /* lock statistic */ + if (thread) + thread->mutex_count--; +#endif + return lck_mtx_unlock_wakeup_tail(lock, state, indirect); } -#endif /* MACH_LDEBUG */ - /* check if there are waiters to wake up or priority to drop */ - if ((prev & (LCK_MTX_PROMOTED_MSK | LCK_MTX_WAITERS_MSK))) { - return lck_mtx_unlock_wakeup_tail(lock, prev, indirect); - } + /* release interlock, promotion and clear spin flag */ + state &= (~(LCK_MTX_ILOCKED_MSK | LCK_MTX_SPIN_MSK)); + ordered_store_mtx_state_release(lock, state); /* since I own the interlock, I don't need an atomic update */ + +#if MACH_LDEBUG + /* perform lock statistics after drop to prevent delay */ + if (thread) + thread->mutex_count--; /* lock statistic */ +#endif /* MACH_LDEBUG */ /* re-enable preemption */ lck_mtx_unlock_finish_inline(lock, FALSE); @@ -2409,19 +2340,18 @@ unlock: return; } -#define LCK_MTX_LCK_WAIT_CODE 0x20 -#define LCK_MTX_LCK_WAKEUP_CODE 0x21 -#define LCK_MTX_LCK_SPIN_CODE 0x22 -#define LCK_MTX_LCK_ACQUIRE_CODE 0x23 -#define LCK_MTX_LCK_DEMOTE_CODE 0x24 +#define LCK_MTX_LCK_WAIT_CODE 0x20 +#define LCK_MTX_LCK_WAKEUP_CODE 0x21 +#define LCK_MTX_LCK_SPIN_CODE 0x22 +#define LCK_MTX_LCK_ACQUIRE_CODE 0x23 +#define LCK_MTX_LCK_DEMOTE_CODE 0x24 /* * Routine: lck_mtx_unlock_wakeup_tail * * Invoked on unlock when there is * contention, i.e. the assembly routine sees - * that mutex->lck_mtx_waiters != 0 or - * that mutex->lck_mtx_promoted != 0 + * that mutex->lck_mtx_waiters != 0 * * neither the mutex or interlock is held * @@ -2431,7 +2361,6 @@ unlock: * * assembly routine previously did the following to mutex: * (after saving the state in prior_lock_state) - * cleared lck_mtx_promoted * decremented lck_mtx_waiters if nonzero * * This function needs to be called as a tail call @@ -2439,151 +2368,94 @@ unlock: */ __attribute__((noinline)) static void -lck_mtx_unlock_wakeup_tail( - lck_mtx_t *mutex, - int prior_lock_state, - boolean_t indirect) +lck_mtx_unlock_wakeup_tail ( + lck_mtx_t *mutex, + uint32_t state, + boolean_t indirect) { - __kdebug_only uintptr_t trace_lck = unslide_for_kdebug(mutex); - lck_mtx_t fake_lck; + struct turnstile *ts; - /* - * prior_lock state is a snapshot of the 2nd word of the - * lock in question... we'll fake up a lock with the bits - * copied into place and carefully not access anything - * beyond whats defined in the second word of a lck_mtx_t - */ - fake_lck.lck_mtx_state = prior_lock_state; + __kdebug_only uintptr_t trace_lck = unslide_for_kdebug(mutex); + kern_return_t did_wake; KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_WAKEUP_CODE) | DBG_FUNC_START, - trace_lck, fake_lck.lck_mtx_promoted, fake_lck.lck_mtx_waiters, fake_lck.lck_mtx_pri, 0); + trace_lck, 0, mutex->lck_mtx_waiters, 0, 0); - if (__probable(fake_lck.lck_mtx_waiters)) { - kern_return_t did_wake; + ts = turnstile_prepare((uintptr_t)mutex, NULL, TURNSTILE_NULL, TURNSTILE_KERNEL_MUTEX); - if (fake_lck.lck_mtx_waiters > 1) { - did_wake = thread_wakeup_one_with_pri(LCK_MTX_EVENT(mutex), fake_lck.lck_mtx_pri); - } else { - did_wake = thread_wakeup_one(LCK_MTX_EVENT(mutex)); - } - /* - * The waiters count always precisely matches the number of threads on the waitqueue. - * i.e. we should never see ret == KERN_NOT_WAITING. - */ - assert(did_wake == KERN_SUCCESS); + if (mutex->lck_mtx_waiters > 1) { + /* WAITQ_PROMOTE_ON_WAKE will call turnstile_update_inheritor on the wokenup thread */ + did_wake = waitq_wakeup64_one(&ts->ts_waitq, CAST_EVENT64_T(LCK_MTX_EVENT(mutex)), THREAD_AWAKENED, WAITQ_PROMOTE_ON_WAKE); + } else { + did_wake = waitq_wakeup64_one(&ts->ts_waitq, CAST_EVENT64_T(LCK_MTX_EVENT(mutex)), THREAD_AWAKENED, WAITQ_ALL_PRIORITIES); + turnstile_update_inheritor(ts, NULL, TURNSTILE_IMMEDIATE_UPDATE); } + assert(did_wake == KERN_SUCCESS); - /* When lck_mtx_promoted was set, then I as the owner definitely have a promotion */ - if (__improbable(fake_lck.lck_mtx_promoted)) { - thread_t thread = current_thread(); - - spl_t s = splsched(); - thread_lock(thread); - - KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_DEMOTE_CODE) | DBG_FUNC_NONE, - thread_tid(thread), thread->promotions, thread->sched_flags & TH_SFLAG_PROMOTED, 0, 0); - assert(thread->was_promoted_on_wakeup == 0); - assert(thread->promotions > 0); - - assert_promotions_invariant(thread); + turnstile_update_inheritor_complete(ts, TURNSTILE_INTERLOCK_HELD); + turnstile_complete((uintptr_t)mutex, NULL, NULL, TURNSTILE_KERNEL_MUTEX); - if (--thread->promotions == 0) { - sched_thread_unpromote(thread, trace_lck); - } + state -= LCK_MTX_WAITER; + state &= (~(LCK_MTX_SPIN_MSK | LCK_MTX_ILOCKED_MSK)); + ordered_store_mtx_state_release(mutex, state); - assert_promotions_invariant(thread); + assert(current_thread()->turnstile != NULL); - thread_unlock(thread); - splx(s); - } + turnstile_cleanup(); KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_WAKEUP_CODE) | DBG_FUNC_END, - trace_lck, 0, mutex->lck_mtx_waiters, 0, 0); + trace_lck, 0, mutex->lck_mtx_waiters, 0, 0); lck_mtx_unlock_finish_inline(mutex, indirect); } /* - * Routine: lck_mtx_lock_acquire_x86 + * Routine: lck_mtx_lock_acquire_x86 * * Invoked on acquiring the mutex when there is * contention (i.e. the assembly routine sees that - * that mutex->lck_mtx_waiters != 0 or - * thread->was_promoted_on_wakeup != 0)... + * that mutex->lck_mtx_waiters != 0 * * mutex is owned... interlock is held... preemption is disabled */ __attribute__((always_inline)) static void lck_mtx_lock_acquire_inline( - lck_mtx_t *mutex) + lck_mtx_t *mutex, + struct turnstile *ts) { - __kdebug_only uintptr_t trace_lck = unslide_for_kdebug(mutex); - integer_t priority; + __kdebug_only uintptr_t trace_lck = unslide_for_kdebug(mutex); KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_ACQUIRE_CODE) | DBG_FUNC_START, - trace_lck, thread->was_promoted_on_wakeup, mutex->lck_mtx_waiters, mutex->lck_mtx_pri, 0); - - if (mutex->lck_mtx_waiters) { - priority = mutex->lck_mtx_pri; - } else { - priority = 0; /* not worth resetting lck_mtx_pri here, it will be reset by next waiter */ - } - /* the priority must have been set correctly by wait */ - assert(priority <= MAXPRI_PROMOTE); - assert(priority == 0 || priority >= BASEPRI_DEFAULT); - - /* if the mutex wasn't owned, then the owner wasn't promoted */ - assert(mutex->lck_mtx_promoted == 0); + trace_lck, 0, mutex->lck_mtx_waiters, 0, 0); thread_t thread = (thread_t)mutex->lck_mtx_owner; /* faster than current_thread() */ + assert(thread->waiting_for_mutex == NULL); - if (thread->sched_pri < priority || thread->was_promoted_on_wakeup) { - spl_t s = splsched(); - thread_lock(thread); - - if (thread->was_promoted_on_wakeup) { - assert(thread->promotions > 0); + if (mutex->lck_mtx_waiters > 0) { + if (ts == NULL) { + ts = turnstile_prepare((uintptr_t)mutex, NULL, TURNSTILE_NULL, TURNSTILE_KERNEL_MUTEX); } - /* Intel only promotes if priority goes up */ - if (thread->sched_pri < priority && thread->promotion_priority < priority) { - /* Remember that I need to drop this promotion on unlock */ - mutex->lck_mtx_promoted = 1; + turnstile_update_inheritor(ts, thread, (TURNSTILE_IMMEDIATE_UPDATE | TURNSTILE_INHERITOR_THREAD)); + turnstile_update_inheritor_complete(ts, TURNSTILE_INTERLOCK_HELD); + } - if (thread->promotions++ == 0) { - /* This is the first promotion for the owner */ - sched_thread_promote_to_pri(thread, priority, trace_lck); - } else { - /* - * Holder was previously promoted due to a different mutex, - * raise to match this one. - * Or, this thread was promoted on wakeup but someone else - * later contended on mutex at higher priority before we got here - */ - sched_thread_update_promotion_to_pri(thread, priority, trace_lck); - } - } + if (ts != NULL) { + turnstile_complete((uintptr_t)mutex, NULL, NULL, TURNSTILE_KERNEL_MUTEX); + } - if (thread->was_promoted_on_wakeup) { - thread->was_promoted_on_wakeup = 0; - if (--thread->promotions == 0) { - sched_thread_unpromote(thread, trace_lck); - } - } + assert(current_thread()->turnstile != NULL); - thread_unlock(thread); - splx(s); - } KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_ACQUIRE_CODE) | DBG_FUNC_END, - trace_lck, 0, mutex->lck_mtx_waiters, 0, 0); + trace_lck, 0, mutex->lck_mtx_waiters, 0, 0); } void lck_mtx_lock_acquire_x86( - lck_mtx_t *mutex) + lck_mtx_t *mutex) { - return lck_mtx_lock_acquire_inline(mutex); + return lck_mtx_lock_acquire_inline(mutex, NULL); } /* @@ -2595,19 +2467,20 @@ lck_mtx_lock_acquire_x86( __attribute__((noinline)) static void lck_mtx_lock_acquire_tail( - lck_mtx_t *mutex, - boolean_t indirect) + lck_mtx_t *mutex, + boolean_t indirect, + struct turnstile *ts) { - lck_mtx_lock_acquire_inline(mutex); - lck_mtx_lock_finish_inline(mutex, ordered_load_mtx_state(mutex), indirect); + lck_mtx_lock_acquire_inline(mutex, ts); + lck_mtx_lock_finish_inline_with_cleanup(mutex, ordered_load_mtx_state(mutex), indirect); } __attribute__((noinline)) static boolean_t lck_mtx_try_lock_acquire_tail( - lck_mtx_t *mutex) + lck_mtx_t *mutex) { - lck_mtx_lock_acquire_inline(mutex); + lck_mtx_lock_acquire_inline(mutex, NULL); lck_mtx_try_lock_finish_inline(mutex, ordered_load_mtx_state(mutex)); return TRUE; @@ -2616,9 +2489,9 @@ lck_mtx_try_lock_acquire_tail( __attribute__((noinline)) static void lck_mtx_convert_spin_acquire_tail( - lck_mtx_t *mutex) + lck_mtx_t *mutex) { - lck_mtx_lock_acquire_inline(mutex); + lck_mtx_lock_acquire_inline(mutex, NULL); lck_mtx_convert_spin_finish_inline(mutex, ordered_load_mtx_state(mutex)); } @@ -2640,7 +2513,7 @@ lck_mtx_interlock_lock_set_and_clear_flags( uint32_t state, prev; state = *new_state; - for (;;) { + for ( ; ; ) { /* have to wait for interlock to clear */ while (__improbable(state & (LCK_MTX_ILOCKED_MSK | xor_flags))) { cpu_pause(); @@ -2648,12 +2521,11 @@ lck_mtx_interlock_lock_set_and_clear_flags( } prev = state; /* prev contains snapshot for exchange */ state |= LCK_MTX_ILOCKED_MSK | xor_flags; /* pick up interlock */ - state &= ~and_flags; /* clear flags */ + state &= ~and_flags; /* clear flags */ disable_preemption(); - if (atomic_compare_exchange32(&mutex->lck_mtx_state, prev, state, memory_order_acquire_smp, FALSE)) { + if (os_atomic_cmpxchg(&mutex->lck_mtx_state, prev, state, acquire)) break; - } enable_preemption(); cpu_pause(); state = ordered_load_mtx_state(mutex); @@ -2692,12 +2564,12 @@ lck_mtx_interlock_try_lock_set_flags( if (state & (LCK_MTX_ILOCKED_MSK | or_flags)) { return 0; } - prev = state; /* prev contains snapshot for exchange */ - state |= LCK_MTX_ILOCKED_MSK | or_flags; /* pick up interlock */ + prev = state; /* prev contains snapshot for exchange */ + state |= LCK_MTX_ILOCKED_MSK | or_flags; /* pick up interlock */ disable_preemption(); - if (atomic_compare_exchange32(&mutex->lck_mtx_state, prev, state, memory_order_acquire_smp, FALSE)) { - *new_state = state; - return 1; + if (os_atomic_cmpxchg(&mutex->lck_mtx_state, prev, state, acquire)) { + *new_state = state; + return 1; } enable_preemption(); @@ -2717,7 +2589,7 @@ lck_mtx_interlock_try_lock_disable_interrupts( lck_mtx_t *mutex, boolean_t *istate) { - uint32_t state; + uint32_t state; *istate = ml_set_interrupts_enabled(FALSE); state = ordered_load_mtx_state(mutex); @@ -2749,6 +2621,7 @@ lck_mtx_lock_contended( lck_mtx_spinwait_ret_type_t ret; uint32_t state; thread_t thread; + struct turnstile *ts = NULL; try_again: @@ -2768,7 +2641,7 @@ try_again: lck_grp_mtx_update_direct_wait((struct _lck_mtx_ext_*)lock); } - /* just fall through case LCK_MTX_SPINWAIT_SPUN */ + /* just fall through case LCK_MTX_SPINWAIT_SPUN */ case LCK_MTX_SPINWAIT_SPUN: /* * mutex not acquired but lck_mtx_lock_spinwait_x86 tried to spin @@ -2781,12 +2654,13 @@ try_again: if (indirect) { lck_grp_mtx_update_wait((struct _lck_mtx_ext_*)lock, first_miss); } - lck_mtx_lock_wait_x86(lock); + lck_mtx_lock_wait_x86(lock, &ts); /* * interlock is not held here. */ goto try_again; } else { + /* grab the mutex */ state |= LCK_MTX_MLOCKED_MSK; ordered_store_mtx_state_release(lock, state); @@ -2818,12 +2692,22 @@ try_again: /* mutex has been acquired */ thread = (thread_t)lock->lck_mtx_owner; - if (state & LCK_MTX_WAITERS_MSK || thread->was_promoted_on_wakeup) { - return lck_mtx_lock_acquire_tail(lock, indirect); + if (state & LCK_MTX_WAITERS_MSK) { + /* + * lck_mtx_lock_acquire_tail will call + * turnstile_complete. + */ + return lck_mtx_lock_acquire_tail(lock, indirect, ts); } + if (ts != NULL) { + turnstile_complete((uintptr_t)lock, NULL, NULL, TURNSTILE_KERNEL_MUTEX); + } + + assert(current_thread()->turnstile != NULL); + /* release the interlock */ - lck_mtx_lock_finish_inline(lock, ordered_load_mtx_state(lock), indirect); + lck_mtx_lock_finish_inline_with_cleanup(lock, ordered_load_mtx_state(lock), indirect); } /* @@ -2831,7 +2715,7 @@ try_again: * panic to optimize compiled code. */ -__attribute__((noinline)) +__attribute__((noinline)) __abortlike static void lck_mtx_destroyed( lck_mtx_t *lock) @@ -2856,7 +2740,7 @@ lck_mtx_lock_wait_interlock_to_clear( { uint32_t state; - for (;;) { + for ( ; ; ) { cpu_pause(); state = ordered_load_mtx_state(lock); if (!(state & (LCK_MTX_ILOCKED_MSK | LCK_MTX_MLOCKED_MSK))) { @@ -2878,7 +2762,7 @@ lck_mtx_try_lock_wait_interlock_to_clear( { uint32_t state; - for (;;) { + for ( ; ; ) { cpu_pause(); state = ordered_load_mtx_state(lock); if (state & (LCK_MTX_MLOCKED_MSK | LCK_MTX_SPIN_MSK)) { @@ -2906,9 +2790,9 @@ void lck_mtx_lock_slow( lck_mtx_t *lock) { - boolean_t indirect = FALSE; - uint32_t state; - int first_miss = 0; + boolean_t indirect = FALSE; + uint32_t state; + int first_miss = 0; state = ordered_load_mtx_state(lock); @@ -2922,14 +2806,14 @@ lck_mtx_lock_slow( /* is the mutex already held and not indirect */ - if (__improbable(!(state & LCK_MTX_ILOCKED_MSK))) { + if (__improbable(!(state & LCK_MTX_ILOCKED_MSK))){ /* no, must have been the mutex */ return lck_mtx_lock_contended(lock, indirect, &first_miss); } /* check to see if it is marked destroyed */ if (__improbable(state == LCK_MTX_TAG_DESTROYED)) { - return lck_mtx_destroyed(lock); + lck_mtx_destroyed(lock); } /* Is this an indirect mutex? */ @@ -2940,7 +2824,7 @@ lck_mtx_lock_slow( lck_grp_mtx_update_held((struct _lck_mtx_ext_*)lock); if (state & LCK_MTX_SPIN_MSK) { - /* M_SPIN_MSK was set, so M_ILOCKED_MSK must also be present */ + /* M_SPIN_MSK was set, so M_ILOCKED_MSK must also be present */ assert(state & LCK_MTX_ILOCKED_MSK); lck_grp_mtx_update_miss((struct _lck_mtx_ext_*)lock, &first_miss); } @@ -2966,7 +2850,7 @@ lck_mtx_lock_slow( #if MACH_LDEBUG if (thread) { - thread->mutex_count++; /* lock statistic */ + thread->mutex_count++; /* lock statistic */ } #endif /* @@ -2974,7 +2858,7 @@ lck_mtx_lock_slow( * inherit their priority. */ if (__improbable(state & LCK_MTX_WAITERS_MSK)) { - return lck_mtx_lock_acquire_tail(lock, indirect); + return lck_mtx_lock_acquire_tail(lock, indirect, NULL); } /* release the interlock */ @@ -3003,13 +2887,13 @@ lck_mtx_try_lock_slow( */ /* is the mutex already held and not indirect */ - if (__improbable(!(state & LCK_MTX_ILOCKED_MSK))) { + if (__improbable(!(state & LCK_MTX_ILOCKED_MSK))){ return FALSE; } /* check to see if it is marked destroyed */ if (__improbable(state == LCK_MTX_TAG_DESTROYED)) { - return lck_mtx_try_destroyed(lock); + lck_mtx_try_destroyed(lock); } /* Is this an indirect mutex? */ @@ -3021,9 +2905,8 @@ lck_mtx_try_lock_slow( } if (!lck_mtx_try_lock_wait_interlock_to_clear(lock, &state)) { - if (indirect) { + if (indirect) lck_grp_mtx_update_miss((struct _lck_mtx_ext_*)lock, &first_miss); - } return FALSE; } } @@ -3031,9 +2914,8 @@ lck_mtx_try_lock_slow( /* no - can't be INDIRECT, DESTROYED or locked */ while (__improbable(!lck_mtx_interlock_try_lock_set_flags(lock, LCK_MTX_MLOCKED_MSK, &state))) { if (!lck_mtx_try_lock_wait_interlock_to_clear(lock, &state)) { - if (indirect) { + if (indirect) lck_grp_mtx_update_miss((struct _lck_mtx_ext_*)lock, &first_miss); - } return FALSE; } } @@ -3046,7 +2928,7 @@ lck_mtx_try_lock_slow( #if MACH_LDEBUG if (thread) { - thread->mutex_count++; /* lock statistic */ + thread->mutex_count++; /* lock statistic */ } #endif /* @@ -3061,12 +2943,13 @@ lck_mtx_try_lock_slow( lck_mtx_try_lock_finish_inline(lock, ordered_load_mtx_state(lock)); return TRUE; + } __attribute__((noinline)) void lck_mtx_lock_spin_slow( - lck_mtx_t *lock) + lck_mtx_t *lock) { boolean_t indirect = FALSE; uint32_t state; @@ -3084,14 +2967,14 @@ lck_mtx_lock_spin_slow( /* is the mutex already held and not indirect */ - if (__improbable(!(state & LCK_MTX_ILOCKED_MSK))) { + if (__improbable(!(state & LCK_MTX_ILOCKED_MSK))){ /* no, must have been the mutex */ return lck_mtx_lock_contended(lock, indirect, &first_miss); } /* check to see if it is marked destroyed */ if (__improbable(state == LCK_MTX_TAG_DESTROYED)) { - return lck_mtx_destroyed(lock); + lck_mtx_destroyed(lock); } /* Is this an indirect mutex? */ @@ -3102,7 +2985,7 @@ lck_mtx_lock_spin_slow( lck_grp_mtx_update_held((struct _lck_mtx_ext_*)lock); if (state & LCK_MTX_SPIN_MSK) { - /* M_SPIN_MSK was set, so M_ILOCKED_MSK must also be present */ + /* M_SPIN_MSK was set, so M_ILOCKED_MSK must also be present */ assert(state & LCK_MTX_ILOCKED_MSK); lck_grp_mtx_update_miss((struct _lck_mtx_ext_*)lock, &first_miss); } @@ -3114,7 +2997,7 @@ lck_mtx_lock_spin_slow( } /* no - can't be INDIRECT, DESTROYED or locked */ - while (__improbable(!lck_mtx_interlock_try_lock_set_flags(lock, LCK_MTX_SPIN_MSK, &state))) { + while (__improbable(!lck_mtx_interlock_try_lock_set_flags(lock, LCK_MTX_SPIN_MSK, &state) )) { if (!lck_mtx_lock_wait_interlock_to_clear(lock, &state)) { return lck_mtx_lock_contended(lock, indirect, &first_miss); } @@ -3132,7 +3015,7 @@ lck_mtx_lock_spin_slow( } #endif -#if CONFIG_DTRACE +#if CONFIG_DTRACE LOCKSTAT_RECORD(LS_LCK_MTX_LOCK_SPIN_ACQUIRE, lock, 0); #endif /* return with the interlock held and preemption disabled */ @@ -3159,13 +3042,13 @@ lck_mtx_try_lock_spin_slow( */ /* is the mutex already held and not indirect */ - if (__improbable(!(state & LCK_MTX_ILOCKED_MSK))) { + if (__improbable(!(state & LCK_MTX_ILOCKED_MSK))){ return FALSE; } /* check to see if it is marked destroyed */ if (__improbable(state == LCK_MTX_TAG_DESTROYED)) { - return lck_mtx_try_destroyed(lock); + lck_mtx_try_destroyed(lock); } /* Is this an indirect mutex? */ @@ -3177,9 +3060,8 @@ lck_mtx_try_lock_spin_slow( } if (!lck_mtx_try_lock_wait_interlock_to_clear(lock, &state)) { - if (indirect) { + if (indirect) lck_grp_mtx_update_miss((struct _lck_mtx_ext_*)lock, &first_miss); - } return FALSE; } } @@ -3187,9 +3069,8 @@ lck_mtx_try_lock_spin_slow( /* no - can't be INDIRECT, DESTROYED or locked */ while (__improbable(!lck_mtx_interlock_try_lock_set_flags(lock, LCK_MTX_SPIN_MSK, &state))) { if (!lck_mtx_try_lock_wait_interlock_to_clear(lock, &state)) { - if (indirect) { + if (indirect) lck_grp_mtx_update_miss((struct _lck_mtx_ext_*)lock, &first_miss); - } return FALSE; } } @@ -3202,7 +3083,7 @@ lck_mtx_try_lock_spin_slow( #if MACH_LDEBUG if (thread) { - thread->mutex_count++; /* lock statistic */ + thread->mutex_count++; /* lock statistic */ } #endif @@ -3210,12 +3091,13 @@ lck_mtx_try_lock_spin_slow( LOCKSTAT_RECORD(LS_LCK_MTX_TRY_SPIN_LOCK_ACQUIRE, lock, 0); #endif return TRUE; + } __attribute__((noinline)) void lck_mtx_convert_spin( - lck_mtx_t *lock) + lck_mtx_t *lock) { uint32_t state; @@ -3253,7 +3135,7 @@ lck_mtx_convert_spin( static inline boolean_t lck_mtx_lock_grab_mutex( - lck_mtx_t *lock) + lck_mtx_t *lock) { uint32_t state; @@ -3271,7 +3153,7 @@ lck_mtx_lock_grab_mutex( #if MACH_LDEBUG if (thread) { - thread->mutex_count++; /* lock statistic */ + thread->mutex_count++; /* lock statistic */ } #endif return TRUE; @@ -3280,8 +3162,8 @@ lck_mtx_lock_grab_mutex( __attribute__((noinline)) void lck_mtx_assert( - lck_mtx_t *lock, - unsigned int type) + lck_mtx_t *lock, + unsigned int type) { thread_t thread, owner; uint32_t state; @@ -3296,19 +3178,17 @@ lck_mtx_assert( owner = (thread_t)lock->lck_mtx_owner; if (type == LCK_MTX_ASSERT_OWNED) { - if (owner != thread || !(state & (LCK_MTX_ILOCKED_MSK | LCK_MTX_MLOCKED_MSK))) { + if (owner != thread || !(state & (LCK_MTX_ILOCKED_MSK | LCK_MTX_MLOCKED_MSK))) panic("mutex (%p) not owned\n", lock); - } } else { - assert(type == LCK_MTX_ASSERT_NOTOWNED); - if (owner == thread) { + assert (type == LCK_MTX_ASSERT_NOTOWNED); + if (owner == thread) panic("mutex (%p) owned\n", lock); - } } } /* - * Routine: lck_mtx_lock_spinwait_x86 + * Routine: lck_mtx_lock_spinwait_x86 * * Invoked trying to acquire a mutex when there is contention but * the holder is running on another processor. We spin for up to a maximum @@ -3322,18 +3202,18 @@ lck_mtx_assert( __attribute__((noinline)) lck_mtx_spinwait_ret_type_t lck_mtx_lock_spinwait_x86( - lck_mtx_t *mutex) + lck_mtx_t *mutex) { - __kdebug_only uintptr_t trace_lck = unslide_for_kdebug(mutex); - thread_t holder; - uint64_t overall_deadline; - uint64_t check_owner_deadline; - uint64_t cur_time; - lck_mtx_spinwait_ret_type_t retval = LCK_MTX_SPINWAIT_SPUN; - int loopcount = 0; + __kdebug_only uintptr_t trace_lck = unslide_for_kdebug(mutex); + thread_t holder; + uint64_t overall_deadline; + uint64_t check_owner_deadline; + uint64_t cur_time; + lck_mtx_spinwait_ret_type_t retval = LCK_MTX_SPINWAIT_SPUN; + int loopcount = 0; KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_SPIN_CODE) | DBG_FUNC_START, - trace_lck, VM_KERNEL_UNSLIDE_OR_PERM(mutex->lck_mtx_owner), mutex->lck_mtx_waiters, 0, 0); + trace_lck, VM_KERNEL_UNSLIDE_OR_PERM(mutex->lck_mtx_owner), mutex->lck_mtx_waiters, 0, 0); cur_time = mach_absolute_time(); overall_deadline = cur_time + MutexSpin; @@ -3354,12 +3234,11 @@ lck_mtx_lock_spinwait_x86( } cur_time = mach_absolute_time(); - if (cur_time >= overall_deadline) { + if (cur_time >= overall_deadline) break; - } if (cur_time >= check_owner_deadline && mutex->lck_mtx_owner) { - boolean_t istate; + boolean_t istate; /* * We will repeatedly peek at the state of the lock while spinning, @@ -3372,16 +3251,18 @@ lck_mtx_lock_spinwait_x86( * This is safe because it is a "try_lock", if we can't acquire * the interlock we re-enable the interrupts and fail, so it is * ok to call it even if the interlock was already held. - */ + */ if (lck_mtx_interlock_try_lock_disable_interrupts(mutex, &istate)) { + if ((holder = (thread_t) mutex->lck_mtx_owner) != NULL) { - if (!(holder->machine.specFlags & OnProc) || - (holder->state & TH_IDLE)) { + + if ( !(holder->machine.specFlags & OnProc) || + (holder->state & TH_IDLE)) { + lck_mtx_interlock_unlock_enable_interrupts(mutex, istate); - if (loopcount == 0) { + if (loopcount == 0) retval = LCK_MTX_SPINWAIT_NO_SPIN; - } break; } } @@ -3393,31 +3274,32 @@ lck_mtx_lock_spinwait_x86( cpu_pause(); loopcount++; + } while (TRUE); -#if CONFIG_DTRACE +#if CONFIG_DTRACE /* * We've already kept a count via overall_deadline of how long we spun. * If dtrace is active, then we compute backwards to decide how * long we spun. * * Note that we record a different probe id depending on whether - * this is a direct or indirect mutex. This allows us to + * this is a direct or indirect mutex. This allows us to * penalize only lock groups that have debug/stats enabled * with dtrace processing if desired. */ if (__probable(mutex->lck_mtx_is_ext == 0)) { LOCKSTAT_RECORD(LS_LCK_MTX_LOCK_SPIN, mutex, - mach_absolute_time() - (overall_deadline - MutexSpin)); + mach_absolute_time() - (overall_deadline - MutexSpin)); } else { LOCKSTAT_RECORD(LS_LCK_MTX_EXT_LOCK_SPIN, mutex, - mach_absolute_time() - (overall_deadline - MutexSpin)); + mach_absolute_time() - (overall_deadline - MutexSpin)); } /* The lockstat acquire event is recorded by the assembly code beneath us. */ #endif KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_SPIN_CODE) | DBG_FUNC_END, - trace_lck, VM_KERNEL_UNSLIDE_OR_PERM(mutex->lck_mtx_owner), mutex->lck_mtx_waiters, retval, 0); + trace_lck, VM_KERNEL_UNSLIDE_OR_PERM(mutex->lck_mtx_owner), mutex->lck_mtx_waiters, retval, 0); return retval; } @@ -3425,7 +3307,7 @@ lck_mtx_lock_spinwait_x86( /* - * Routine: lck_mtx_lock_wait_x86 + * Routine: lck_mtx_lock_wait_x86 * * Invoked in order to wait on contention. * @@ -3452,100 +3334,60 @@ lck_mtx_lock_spinwait_x86( */ __attribute__((noinline)) void -lck_mtx_lock_wait_x86( - lck_mtx_t *mutex) +lck_mtx_lock_wait_x86 ( + lck_mtx_t *mutex, + struct turnstile **ts) { -#if CONFIG_DTRACE + thread_t self = current_thread(); + +#if CONFIG_DTRACE uint64_t sleep_start = 0; if (lockstat_probemap[LS_LCK_MTX_LOCK_BLOCK] || lockstat_probemap[LS_LCK_MTX_EXT_LOCK_BLOCK]) { sleep_start = mach_absolute_time(); } #endif - thread_t self = current_thread(); - assert(self->waiting_for_mutex == NULL); - - self->waiting_for_mutex = mutex; - __kdebug_only uintptr_t trace_lck = unslide_for_kdebug(mutex); KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_WAIT_CODE) | DBG_FUNC_START, - trace_lck, VM_KERNEL_UNSLIDE_OR_PERM(mutex->lck_mtx_owner), - mutex->lck_mtx_waiters, mutex->lck_mtx_pri, 0); - - integer_t waiter_pri = self->sched_pri; - waiter_pri = MAX(waiter_pri, self->base_pri); - waiter_pri = MAX(waiter_pri, BASEPRI_DEFAULT); - waiter_pri = MIN(waiter_pri, MAXPRI_PROMOTE); - - assert(mutex->lck_mtx_pri <= MAXPRI_PROMOTE); + trace_lck, VM_KERNEL_UNSLIDE_OR_PERM(mutex->lck_mtx_owner), + mutex->lck_mtx_waiters, 0, 0); - /* Re-initialize lck_mtx_pri if this is the first contention */ - if (mutex->lck_mtx_waiters == 0 || mutex->lck_mtx_pri <= waiter_pri) { - mutex->lck_mtx_pri = waiter_pri; - } + assert(self->waiting_for_mutex == NULL); + self->waiting_for_mutex = mutex; + mutex->lck_mtx_waiters++; thread_t holder = (thread_t)mutex->lck_mtx_owner; - assert(holder != NULL); /* - * Intel only causes a promotion when priority needs to change, - * reducing thread lock holds but leaving us vulnerable to the holder - * dropping priority. + * lck_mtx_lock_wait_x86 might be called on a loop. Call prepare just once and reuse + * the same turnstile while looping, the matching turnstile compleate will be called + * by lck_mtx_lock_contended when finally acquiring the lock. */ - if (holder->sched_pri < mutex->lck_mtx_pri) { - int promote_pri = mutex->lck_mtx_pri; - - spl_t s = splsched(); - thread_lock(holder); - - /* Check again in case sched_pri changed */ - if (holder->sched_pri < promote_pri && holder->promotion_priority < promote_pri) { - if (mutex->lck_mtx_promoted == 0) { - /* This is the first promotion for this mutex */ - mutex->lck_mtx_promoted = 1; - - if (holder->promotions++ == 0) { - /* This is the first promotion for holder */ - sched_thread_promote_to_pri(holder, promote_pri, trace_lck); - } else { - /* - * Holder was previously promoted due to a different mutex, - * check if it needs to raise to match this one - */ - sched_thread_update_promotion_to_pri(holder, promote_pri, - trace_lck); - } - } else { - /* - * Holder was previously promoted due to this mutex, - * check if the pri needs to go up - */ - sched_thread_update_promotion_to_pri(holder, promote_pri, trace_lck); - } - } - - thread_unlock(holder); - splx(s); + if (*ts == NULL) { + *ts = turnstile_prepare((uintptr_t)mutex, NULL, TURNSTILE_NULL, TURNSTILE_KERNEL_MUTEX); } - mutex->lck_mtx_waiters++; - + struct turnstile *turnstile = *ts; thread_set_pending_block_hint(self, kThreadWaitKernelMutex); - assert_wait(LCK_MTX_EVENT(mutex), THREAD_UNINT | THREAD_WAIT_NOREPORT_USER); + turnstile_update_inheritor(turnstile, holder, (TURNSTILE_DELAYED_UPDATE | TURNSTILE_INHERITOR_THREAD)); + + waitq_assert_wait64(&turnstile->ts_waitq, CAST_EVENT64_T(LCK_MTX_EVENT(mutex)), THREAD_UNINT | THREAD_WAIT_NOREPORT_USER, TIMEOUT_WAIT_FOREVER); lck_mtx_ilk_unlock(mutex); + turnstile_update_inheritor_complete(turnstile, TURNSTILE_INTERLOCK_NOT_HELD); + thread_block(THREAD_CONTINUE_NULL); self->waiting_for_mutex = NULL; KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_WAIT_CODE) | DBG_FUNC_END, - trace_lck, VM_KERNEL_UNSLIDE_OR_PERM(mutex->lck_mtx_owner), - mutex->lck_mtx_waiters, mutex->lck_mtx_pri, 0); + trace_lck, VM_KERNEL_UNSLIDE_OR_PERM(mutex->lck_mtx_owner), + mutex->lck_mtx_waiters, 0, 0); -#if CONFIG_DTRACE +#if CONFIG_DTRACE /* * Record the Dtrace lockstat probe for blocking, block time * measured from when we were entered. @@ -3568,7 +3410,7 @@ lck_mtx_lock_wait_x86( * Returns: TRUE if lock is acquired. */ boolean_t -kdp_lck_mtx_lock_spin_is_acquired(lck_mtx_t *lck) +kdp_lck_mtx_lock_spin_is_acquired(lck_mtx_t *lck) { if (not_in_kdp) { panic("panic: kdp_lck_mtx_lock_spin_is_acquired called outside of kernel debugger"); @@ -3594,17 +3436,17 @@ void kdp_rwlck_find_owner(__unused struct waitq * waitq, event64_t event, thread_waitinfo_t * waitinfo) { lck_rw_t *rwlck = NULL; - switch (waitinfo->wait_type) { - case kThreadWaitKernelRWLockRead: - rwlck = READ_EVENT_TO_RWLOCK(event); - break; - case kThreadWaitKernelRWLockWrite: - case kThreadWaitKernelRWLockUpgrade: - rwlck = WRITE_EVENT_TO_RWLOCK(event); - break; - default: - panic("%s was called with an invalid blocking type", __FUNCTION__); - break; + switch(waitinfo->wait_type) { + case kThreadWaitKernelRWLockRead: + rwlck = READ_EVENT_TO_RWLOCK(event); + break; + case kThreadWaitKernelRWLockWrite: + case kThreadWaitKernelRWLockUpgrade: + rwlck = WRITE_EVENT_TO_RWLOCK(event); + break; + default: + panic("%s was called with an invalid blocking type", __FUNCTION__); + break; } waitinfo->context = VM_KERNEL_UNSLIDE_OR_PERM(rwlck); waitinfo->owner = 0; diff --git a/osfmk/i386/locks_i386_inlines.h b/osfmk/i386/locks_i386_inlines.h index a7e188072..b10b70feb 100644 --- a/osfmk/i386/locks_i386_inlines.h +++ b/osfmk/i386/locks_i386_inlines.h @@ -31,25 +31,22 @@ #include #include +#include // Enforce program order of loads and stores. -#define ordered_load(target) _Generic( (target),\ - uint32_t* : __c11_atomic_load((_Atomic uint32_t* )(target), memory_order_relaxed), \ - uintptr_t*: __c11_atomic_load((_Atomic uintptr_t*)(target), memory_order_relaxed) ) -#define ordered_store_release(target, value) _Generic( (target),\ - uint32_t* : __c11_atomic_store((_Atomic uint32_t* )(target), (value), memory_order_release_smp), \ - uintptr_t*: __c11_atomic_store((_Atomic uintptr_t*)(target), (value), memory_order_release_smp) ) -#define ordered_store_volatile(target, value) _Generic( (target),\ - volatile uint32_t* : __c11_atomic_store((_Atomic volatile uint32_t* )(target), (value), memory_order_relaxed), \ - volatile uintptr_t*: __c11_atomic_store((_Atomic volatile uintptr_t*)(target), (value), memory_order_relaxed) ) +#define ordered_load(target) os_atomic_load(target, compiler_acq_rel) +#define ordered_store_release(target, value) ({ \ + os_atomic_store(target, value, release); \ + os_compiler_barrier(); \ +}) /* Enforce program order of loads and stores. */ #define ordered_load_mtx_state(lock) ordered_load(&(lock)->lck_mtx_state) #define ordered_store_mtx_state_release(lock, value) ordered_store_release(&(lock)->lck_mtx_state, (value)) -#define ordered_store_mtx_owner(lock, value) ordered_store_volatile(&(lock)->lck_mtx_owner, (value)) +#define ordered_store_mtx_owner(lock, value) os_atomic_store(&(lock)->lck_mtx_owner, (value), compiler_acq_rel) #if DEVELOPMENT | DEBUG -void lck_mtx_owner_check_panic(lck_mtx_t *mutex); +void lck_mtx_owner_check_panic(lck_mtx_t *mutex) __abortlike; #endif __attribute__((always_inline)) @@ -85,6 +82,29 @@ lck_mtx_lock_finish_inline( #endif } +__attribute__((always_inline)) +static inline void +lck_mtx_lock_finish_inline_with_cleanup( + lck_mtx_t *mutex, + uint32_t state, + boolean_t indirect) +{ + assert(state & LCK_MTX_ILOCKED_MSK); + + /* release the interlock and re-enable preemption */ + lck_mtx_ilk_unlock_inline(mutex, state); + + turnstile_cleanup(); + +#if CONFIG_DTRACE + if (indirect) { + LOCKSTAT_RECORD(LS_LCK_MTX_EXT_LOCK_ACQUIRE, mutex, 0); + } else { + LOCKSTAT_RECORD(LS_LCK_MTX_LOCK_ACQUIRE, mutex, 0); + } +#endif +} + __attribute__((always_inline)) static inline void lck_mtx_try_lock_finish_inline( diff --git a/osfmk/i386/locks_i386_opt.c b/osfmk/i386/locks_i386_opt.c index fb0562fe8..5720cf7e2 100644 --- a/osfmk/i386/locks_i386_opt.c +++ b/osfmk/i386/locks_i386_opt.c @@ -26,7 +26,6 @@ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ -#define ATOMIC_PRIVATE 1 #define LOCK_PRIVATE 1 #include @@ -39,7 +38,6 @@ #include #include #include -#include #include #include @@ -138,7 +136,7 @@ lck_mtx_lock( state = prev | LCK_MTX_ILOCKED_MSK | LCK_MTX_MLOCKED_MSK; disable_preemption(); - if (!atomic_compare_exchange32(&lock->lck_mtx_state, prev, state, memory_order_acquire_smp, FALSE)) { + if (!os_atomic_cmpxchg(&lock->lck_mtx_state, prev, state, acquire)) { enable_preemption(); return lck_mtx_lock_slow(lock); } @@ -192,7 +190,7 @@ lck_mtx_try_lock( state = prev | LCK_MTX_ILOCKED_MSK | LCK_MTX_MLOCKED_MSK; disable_preemption(); - if (!atomic_compare_exchange32(&lock->lck_mtx_state, prev, state, memory_order_acquire_smp, FALSE)) { + if (!os_atomic_cmpxchg(&lock->lck_mtx_state, prev, state, acquire)) { enable_preemption(); return lck_mtx_try_lock_slow(lock); } @@ -255,7 +253,7 @@ lck_mtx_lock_spin_always( state = prev | LCK_MTX_ILOCKED_MSK | LCK_MTX_SPIN_MSK; disable_preemption(); - if (!atomic_compare_exchange32(&lock->lck_mtx_state, prev, state, memory_order_acquire_smp, FALSE)) { + if (!os_atomic_cmpxchg(&lock->lck_mtx_state, prev, state, acquire)) { enable_preemption(); return lck_mtx_lock_spin_slow(lock); } @@ -342,7 +340,7 @@ lck_mtx_try_lock_spin_always( state = prev | LCK_MTX_ILOCKED_MSK | LCK_MTX_SPIN_MSK; disable_preemption(); - if (!atomic_compare_exchange32(&lock->lck_mtx_state, prev, state, memory_order_acquire_smp, FALSE)) { + if (!os_atomic_cmpxchg(&lock->lck_mtx_state, prev, state, acquire)) { enable_preemption(); return lck_mtx_try_lock_spin_slow(lock); } @@ -395,7 +393,7 @@ lck_mtx_try_lock_spin( * Unlocks a mutex held by current thread. * It tries the fast path first, and falls * through the slow path in case waiters need to - * be woken up or promotions need to be dropped. + * be woken up. * * Interlock can be held, and the slow path will * unlock the mutex for this case. @@ -417,7 +415,7 @@ lck_mtx_unlock( * Only full mutex will go through the fast path * (if the lock was acquired as a spinlock it will * fall through the slow path). - * If there are waiters or promotions it will fall + * If there are waiters it will fall * through the slow path. * If it is indirect it will fall through the slow path. */ @@ -426,7 +424,7 @@ lck_mtx_unlock( * Fast path state: * interlock not held, no waiters, no promotion and mutex held. */ - prev = state & ~(LCK_MTX_ILOCKED_MSK | LCK_MTX_WAITERS_MSK | LCK_MTX_PROMOTED_MSK); + prev = state & ~(LCK_MTX_ILOCKED_MSK | LCK_MTX_WAITERS_MSK); prev |= LCK_MTX_MLOCKED_MSK; state = prev | LCK_MTX_ILOCKED_MSK; @@ -435,7 +433,7 @@ lck_mtx_unlock( disable_preemption(); /* the memory order needs to be acquire because it is acquiring the interlock */ - if (!atomic_compare_exchange32(&lock->lck_mtx_state, prev, state, memory_order_acquire_smp, FALSE)) { + if (!os_atomic_cmpxchg(&lock->lck_mtx_state, prev, state, acquire)) { enable_preemption(); return lck_mtx_unlock_slow(lock); } @@ -445,7 +443,7 @@ lck_mtx_unlock( #if DEVELOPMENT | DEBUG thread_t owner = (thread_t)lock->lck_mtx_owner; if (__improbable(owner != current_thread())) { - return lck_mtx_owner_check_panic(lock); + lck_mtx_owner_check_panic(lock); } #endif diff --git a/osfmk/i386/machine_routines.c b/osfmk/i386/machine_routines.c index 7d4568ed9..84bfb4c40 100644 --- a/osfmk/i386/machine_routines.c +++ b/osfmk/i386/machine_routines.c @@ -147,10 +147,9 @@ ml_static_unslide( return VM_KERNEL_UNSLIDE(vaddr); } - /* - * Routine: ml_static_mfree - * Function: + * Reclaim memory, by virtual address, that was used in early boot that is no longer needed + * by the kernel. */ void ml_static_mfree( @@ -160,28 +159,43 @@ ml_static_mfree( addr64_t vaddr_cur; ppnum_t ppn; uint32_t freed_pages = 0; + vm_size_t map_size; assert(vaddr >= VM_MIN_KERNEL_ADDRESS); assert((vaddr & (PAGE_SIZE - 1)) == 0); /* must be page aligned */ - for (vaddr_cur = vaddr; - vaddr_cur < round_page_64(vaddr + size); - vaddr_cur += PAGE_SIZE) { + for (vaddr_cur = vaddr; vaddr_cur < round_page_64(vaddr + size);) { + map_size = pmap_query_pagesize(kernel_pmap, vaddr_cur); + + /* just skip if nothing mapped here */ + if (map_size == 0) { + vaddr_cur += PAGE_SIZE; + continue; + } + + /* + * Can't free from the middle of a large page. + */ + assert((vaddr_cur & (map_size - 1)) == 0); + ppn = pmap_find_phys(kernel_pmap, vaddr_cur); - if (ppn != (vm_offset_t)NULL) { - kernel_pmap->stats.resident_count++; - if (kernel_pmap->stats.resident_count > - kernel_pmap->stats.resident_max) { - kernel_pmap->stats.resident_max = - kernel_pmap->stats.resident_count; + assert(ppn != (ppnum_t)NULL); + + pmap_remove(kernel_pmap, vaddr_cur, vaddr_cur + map_size); + while (map_size > 0) { + if (++kernel_pmap->stats.resident_count > kernel_pmap->stats.resident_max) { + kernel_pmap->stats.resident_max = kernel_pmap->stats.resident_count; } - pmap_remove(kernel_pmap, vaddr_cur, vaddr_cur + PAGE_SIZE); + assert(pmap_valid_page(ppn)); if (IS_MANAGED_PAGE(ppn)) { vm_page_create(ppn, (ppn + 1)); freed_pages++; } + map_size -= PAGE_SIZE; + vaddr_cur += PAGE_SIZE; + ppn++; } } vm_page_lockspin_queues(); @@ -371,6 +385,7 @@ ml_get_power_state(boolean_t *icp, boolean_t *pidlep) } /* Generate a fake interrupt */ +__dead2 void ml_cause_interrupt(void) { @@ -429,6 +444,7 @@ machine_signal_idle( cpu_interrupt(processor->cpu_id); } +__dead2 void machine_signal_idle_deferred( __unused processor_t processor) @@ -436,6 +452,7 @@ machine_signal_idle_deferred( panic("Unimplemented"); } +__dead2 void machine_signal_idle_cancel( __unused processor_t processor) @@ -567,7 +584,7 @@ ml_processor_register( /* allocate and initialize other per-cpu structures */ if (!boot_cpu) { mp_cpus_call_cpu_init(cpunum); - early_random_cpu_init(cpunum); + random_cpu_init(cpunum); } /* output arg */ @@ -868,7 +885,7 @@ ml_cpu_down(void) * The following are required for parts of the kernel * that cannot resolve these functions as inlines: */ -extern thread_t current_act(void); +extern thread_t current_act(void) __attribute__((const)); thread_t current_act(void) { @@ -876,7 +893,7 @@ current_act(void) } #undef current_thread -extern thread_t current_thread(void); +extern thread_t current_thread(void) __attribute__((const)); thread_t current_thread(void) { @@ -1045,11 +1062,8 @@ ml_entropy_collect(void) assert(cpu_number() == master_cpu); /* update buffer pointer cyclically */ - if (EntropyData.index_ptr - EntropyData.buffer == ENTROPY_BUFFER_SIZE) { - ep = EntropyData.index_ptr = EntropyData.buffer; - } else { - ep = EntropyData.index_ptr++; - } + ep = EntropyData.buffer + (EntropyData.sample_count & ENTROPY_BUFFER_INDEX_MASK); + EntropyData.sample_count += 1; rdtsc_nofence(tsc_lo, tsc_hi); *ep = ror32(*ep, 9) ^ tsc_lo; diff --git a/osfmk/i386/machine_routines.h b/osfmk/i386/machine_routines.h index 28018871b..b2f1e478f 100644 --- a/osfmk/i386/machine_routines.h +++ b/osfmk/i386/machine_routines.h @@ -381,7 +381,6 @@ void interrupt_reset_latency_stats(void); void interrupt_populate_latency_stats(char *, unsigned); void ml_get_power_state(boolean_t *, boolean_t *); -void timer_queue_expire_local(void*); void timer_queue_expire_rescan(void*); void ml_timer_evaluate(void); boolean_t ml_timer_forced_evaluation(void); diff --git a/osfmk/i386/memory_types.h b/osfmk/i386/memory_types.h new file mode 100644 index 000000000..808a4a74b --- /dev/null +++ b/osfmk/i386/memory_types.h @@ -0,0 +1,46 @@ +/* + * Copyright (c) 2018 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ +#ifndef _I386_MEMORY_TYPES_H_ +#define _I386_MEMORY_TYPES_H_ + +#define VM_WIMG_COPYBACK VM_MEM_COHERENT +#define VM_WIMG_COPYBACKLW VM_WIMG_COPYBACK +#define VM_WIMG_DEFAULT VM_MEM_COHERENT +/* ?? intel ?? */ +#define VM_WIMG_IO (VM_MEM_COHERENT | \ + VM_MEM_NOT_CACHEABLE | VM_MEM_GUARDED) +#define VM_WIMG_POSTED VM_WIMG_IO +#define VM_WIMG_POSTED_REORDERED VM_WIMG_IO +#define VM_WIMG_POSTED_COMBINED_REORDERED VM_WIMG_IO +#define VM_WIMG_WTHRU (VM_MEM_WRITE_THROUGH | VM_MEM_COHERENT | VM_MEM_GUARDED) +/* write combining mode, aka store gather */ +#define VM_WIMG_WCOMB (VM_MEM_NOT_CACHEABLE | VM_MEM_COHERENT) +#define VM_WIMG_INNERWBACK VM_MEM_COHERENT +#define VM_WIMG_RT VM_WIMG_WCOMB + +#endif /* _I386_MEMORY_TYPES_H_ */ diff --git a/osfmk/i386/misc_protos.h b/osfmk/i386/misc_protos.h index 8a0165905..c12a9aaf7 100644 --- a/osfmk/i386/misc_protos.h +++ b/osfmk/i386/misc_protos.h @@ -117,19 +117,15 @@ extern void rtc_sleep_wakeup(uint64_t base); extern void rtc_timer_start(void); -extern void rtc_clock_stepping( - uint32_t new_frequency, - uint32_t old_frequency); -extern void rtc_clock_stepped( - uint32_t new_frequency, - uint32_t old_frequency); extern void rtc_clock_napped(uint64_t, uint64_t); extern void rtc_clock_adjust(uint64_t); extern void pmap_lowmem_finalize(void); thread_t Switch_context(thread_t, thread_continue_t, thread_t); -thread_t Shutdown_context(thread_t thread, void (*doshutdown)(processor_t), processor_t processor); + +__not_tail_called thread_t +Shutdown_context(thread_t thread, void (*doshutdown)(processor_t), processor_t processor); #ifdef __x86_64__ uint64_t x86_64_pre_sleep(void); @@ -150,6 +146,10 @@ copy_debug_state64(x86_debug_state64_t *src, x86_debug_state64_t *target, boolea extern void act_machine_switch_pcb(thread_t old, thread_t new); +extern void Idle_PTs_release(vm_offset_t start, vm_offset_t end); +extern ppnum_t released_PT_ppn; +extern uint32_t released_PT_cnt; + /* Fast-restart parameters */ #define FULL_SLAVE_INIT (NULL) #define FAST_SLAVE_INIT ((void *)(uintptr_t)1) diff --git a/osfmk/i386/mp.c b/osfmk/i386/mp.c index 428f6151c..b6654cc39 100644 --- a/osfmk/i386/mp.c +++ b/osfmk/i386/mp.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2012 Apple Inc. All rights reserved. + * Copyright (c) 2000-2018 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -32,7 +32,6 @@ #include #include #include -#include #include #include @@ -56,8 +55,6 @@ #include #include -#include - #include #include #include @@ -188,24 +185,6 @@ boolean_t i386_smp_init(int nmi_vector, i386_intr_func_t nmi_handler, void i386_start_cpu(int lapic_id, int cpu_num); void i386_send_NMI(int cpu); void NMIPI_enable(boolean_t); -#if GPROF -/* - * Initialize dummy structs for profiling. These aren't used but - * allows hertz_tick() to be built with GPROF defined. - */ -struct profile_vars _profile_vars; -struct profile_vars *_profile_vars_cpus[MAX_CPUS] = { &_profile_vars }; -#define GPROF_INIT() \ -{ \ - int i; \ - \ - /* Hack to initialize pointers to unused profiling structs */ \ - for (i = 1; i < MAX_CPUS; i++) \ - _profile_vars_cpus[i] = &_profile_vars; \ -} -#else -#define GPROF_INIT() -#endif /* GPROF */ static lck_grp_t smp_lck_grp; static lck_grp_attr_t smp_lck_grp_attr; @@ -245,7 +224,6 @@ smp_init(void) cpu_thread_init(); - GPROF_INIT(); DBGLOG_CPU_INIT(master_cpu); mp_cpus_call_init(); @@ -1500,12 +1478,9 @@ mp_broadcast( * signal other processors, which will call mp_broadcast_action() */ mp_bc_count = real_ncpus; /* assume max possible active */ - mp_bc_ncpus = mp_cpus_call(CPUMASK_OTHERS, NOSYNC, *mp_broadcast_action, NULL) + 1; + mp_bc_ncpus = mp_cpus_call(CPUMASK_ALL, NOSYNC, *mp_broadcast_action, NULL); atomic_decl(&mp_bc_count, real_ncpus - mp_bc_ncpus); /* subtract inactive */ - /* call executor function on this cpu */ - mp_broadcast_action(NULL); - /* block for other cpus to have run action_func */ if (mp_bc_ncpus > 1) { thread_block(THREAD_CONTINUE_NULL); diff --git a/osfmk/i386/mp.h b/osfmk/i386/mp.h index e63c9f4c4..43e8085e0 100644 --- a/osfmk/i386/mp.h +++ b/osfmk/i386/mp.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2012 Apple Inc. All rights reserved. + * Copyright (c) 2000-2019 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -65,6 +65,7 @@ #include #include +#include #define MAX_CPUS 64 /* 8 * sizeof(cpumask_t) */ @@ -76,13 +77,14 @@ #include #include #include +#include __BEGIN_DECLS extern kern_return_t intel_startCPU(int slot_num); extern kern_return_t intel_startCPU_fast(int slot_num); -extern void i386_init_slave(void); -extern void i386_init_slave_fast(void); +extern void i386_init_slave(void) __dead2; +extern void i386_init_slave_fast(void) __dead2; extern void smp_init(void); extern void cpu_interrupt(int cpu); @@ -90,7 +92,7 @@ __END_DECLS extern unsigned int real_ncpus; /* real number of cpus */ extern unsigned int max_ncpus; /* max number of cpus */ -decl_simple_lock_data(extern, kdb_lock) /* kdb lock */ +decl_simple_lock_data(extern, kdb_lock); /* kdb lock */ __BEGIN_DECLS @@ -153,6 +155,9 @@ typedef enum {KDP_XCPU_NONE = 0xffff, KDP_CURRENT_LCPU = 0xfffe} kdp_cpu_t; typedef uint32_t cpu_t; typedef volatile uint64_t cpumask_t; + +static_assert(sizeof(cpumask_t) * CHAR_BIT >= MAX_CPUS, "cpumask_t bitvector is too small for current MAX_CPUS value"); + static inline cpumask_t cpu_to_cpumask(cpu_t cpu) { diff --git a/osfmk/i386/mp_desc.c b/osfmk/i386/mp_desc.c index ad97efdaa..ac756a7f9 100644 --- a/osfmk/i386/mp_desc.c +++ b/osfmk/i386/mp_desc.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2012 Apple Inc. All rights reserved. + * Copyright (c) 2000-2019 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -224,11 +224,11 @@ cldt_t *dyn_ldts; * in the uber-space remapping window on the kernel. */ struct fake_descriptor64 kernel_ldt_desc64 = { - 0, - LDTSZ_MIN*sizeof(struct fake_descriptor) - 1, - 0, - ACC_P | ACC_PL_K | ACC_LDT, - 0 + .offset64 = 0, + .lim_or_seg = LDTSZ_MIN * sizeof(struct fake_descriptor) - 1, + .size_or_IST = 0, + .access = ACC_P | ACC_PL_K | ACC_LDT, + .reserved = 0 }; /* @@ -236,11 +236,11 @@ struct fake_descriptor64 kernel_ldt_desc64 = { * It is follows pattern of the KERNEL_LDT. */ struct fake_descriptor64 kernel_tss_desc64 = { - 0, - sizeof(struct x86_64_tss) - 1, - 0, - ACC_P | ACC_PL_K | ACC_TSS, - 0 + .offset64 = 0, + .lim_or_seg = sizeof(struct x86_64_tss) - 1, + .size_or_IST = 0, + .access = ACC_P | ACC_PL_K | ACC_TSS, + .reserved = 0 }; /* @@ -499,9 +499,6 @@ cpu_desc_load(cpu_data_t *cdp) postcode(CPU_DESC_LOAD_TSS); set_tr(KERNEL_TSS); -#if GPROF // Hack to enable mcount to work on K64 - __asm__ volatile ("mov %0, %%gs" : : "rm" ((unsigned short)(KERNEL_DS))); -#endif postcode(CPU_DESC_LOAD_EXIT); } @@ -511,11 +508,8 @@ cpu_desc_load(cpu_data_t *cdp) void cpu_syscall_init(cpu_data_t *cdp) { -#if MONOTONIC - mt_cpu_up(cdp); -#else /* MONOTONIC */ #pragma unused(cdp) -#endif /* !MONOTONIC */ + wrmsr64(MSR_IA32_SYSENTER_CS, SYSENTER_CS); wrmsr64(MSR_IA32_SYSENTER_EIP, DBLMAP((uintptr_t) hi64_sysenter)); wrmsr64(MSR_IA32_SYSENTER_ESP, current_cpu_datap()->cpu_desc_index.cdi_sstku); diff --git a/osfmk/i386/pal_routines.h b/osfmk/i386/pal_routines.h index 03972d40b..bfb34d31b 100644 --- a/osfmk/i386/pal_routines.h +++ b/osfmk/i386/pal_routines.h @@ -123,7 +123,7 @@ void pal_thread_terminate_self(thread_t thread); void pal_ast_check(thread_t thread); /* Called by sync_iss_to_iks */ -extern void pal_get_kern_regs( x86_saved_state_t *state ); +extern void pal_get_kern_regs( x86_saved_state_t *state ) __dead2; /* * Platform-specific hlt/sti. diff --git a/osfmk/i386/pcb.c b/osfmk/i386/pcb.c index fe5d56b8e..9ece881bd 100644 --- a/osfmk/i386/pcb.c +++ b/osfmk/i386/pcb.c @@ -103,34 +103,31 @@ * Maps state flavor to number of words in the state: */ unsigned int _MachineStateCount[] = { - [x86_THREAD_STATE32] = x86_THREAD_STATE32_COUNT, - [x86_THREAD_STATE64] = x86_THREAD_STATE64_COUNT, - [x86_THREAD_FULL_STATE64] = x86_THREAD_FULL_STATE64_COUNT, - [x86_THREAD_STATE] = x86_THREAD_STATE_COUNT, - [x86_FLOAT_STATE32] = x86_FLOAT_STATE32_COUNT, - [x86_FLOAT_STATE64] = x86_FLOAT_STATE64_COUNT, - [x86_FLOAT_STATE] = x86_FLOAT_STATE_COUNT, - [x86_EXCEPTION_STATE32] = x86_EXCEPTION_STATE32_COUNT, - [x86_EXCEPTION_STATE64] = x86_EXCEPTION_STATE64_COUNT, - [x86_EXCEPTION_STATE] = x86_EXCEPTION_STATE_COUNT, - [x86_DEBUG_STATE32] = x86_DEBUG_STATE32_COUNT, - [x86_DEBUG_STATE64] = x86_DEBUG_STATE64_COUNT, - [x86_DEBUG_STATE] = x86_DEBUG_STATE_COUNT, - [x86_AVX_STATE32] = x86_AVX_STATE32_COUNT, - [x86_AVX_STATE64] = x86_AVX_STATE64_COUNT, - [x86_AVX_STATE] = x86_AVX_STATE_COUNT, -#if !defined(RC_HIDE_XNU_J137) - [x86_AVX512_STATE32] = x86_AVX512_STATE32_COUNT, - [x86_AVX512_STATE64] = x86_AVX512_STATE64_COUNT, - [x86_AVX512_STATE] = x86_AVX512_STATE_COUNT, -#endif /* not RC_HIDE_XNU_J137 */ + [x86_THREAD_STATE32] = x86_THREAD_STATE32_COUNT, + [x86_THREAD_STATE64] = x86_THREAD_STATE64_COUNT, + [x86_THREAD_FULL_STATE64] = x86_THREAD_FULL_STATE64_COUNT, + [x86_THREAD_STATE] = x86_THREAD_STATE_COUNT, + [x86_FLOAT_STATE32] = x86_FLOAT_STATE32_COUNT, + [x86_FLOAT_STATE64] = x86_FLOAT_STATE64_COUNT, + [x86_FLOAT_STATE] = x86_FLOAT_STATE_COUNT, + [x86_EXCEPTION_STATE32] = x86_EXCEPTION_STATE32_COUNT, + [x86_EXCEPTION_STATE64] = x86_EXCEPTION_STATE64_COUNT, + [x86_EXCEPTION_STATE] = x86_EXCEPTION_STATE_COUNT, + [x86_DEBUG_STATE32] = x86_DEBUG_STATE32_COUNT, + [x86_DEBUG_STATE64] = x86_DEBUG_STATE64_COUNT, + [x86_DEBUG_STATE] = x86_DEBUG_STATE_COUNT, + [x86_AVX_STATE32] = x86_AVX_STATE32_COUNT, + [x86_AVX_STATE64] = x86_AVX_STATE64_COUNT, + [x86_AVX_STATE] = x86_AVX_STATE_COUNT, + [x86_AVX512_STATE32] = x86_AVX512_STATE32_COUNT, + [x86_AVX512_STATE64] = x86_AVX512_STATE64_COUNT, + [x86_AVX512_STATE] = x86_AVX512_STATE_COUNT, + [x86_PAGEIN_STATE] = x86_PAGEIN_STATE_COUNT }; zone_t iss_zone; /* zone for saved_state area */ zone_t ids_zone; /* zone for debug_state area */ -extern int allow_64bit_proc_LDT_ops; - /* Forward */ extern void Thread_continue(void); @@ -485,6 +482,12 @@ machine_switch_context( return Switch_context(old, continuation, new); } +boolean_t +machine_thread_on_core(thread_t thread) +{ + return thread->machine.specFlags & OnProc; +} + thread_t machine_processor_shutdown( thread_t thread, @@ -855,15 +858,10 @@ machine_thread_set_state( state = (x86_saved_state32_t *) tstate; /* - * Allow a thread in a 64-bit process to set - * 32-bit state iff the code segment originates - * in the LDT (the implication is that only - * 32-bit code segments are allowed there, so - * setting 32-bit state implies a switch to - * compatibility mode on resume-to-user). + * Refuse to allow 64-bit processes to set + * 32-bit state. */ - if (thread_is_64bit_addr(thr_act) && - thr_act->task->i386_ldt == 0) { + if (thread_is_64bit_addr(thr_act)) { return KERN_INVALID_ARGUMENT; } @@ -996,38 +994,34 @@ machine_thread_set_state( case x86_FLOAT_STATE32: case x86_AVX_STATE32: -#if !defined(RC_HIDE_XNU_J137) case x86_AVX512_STATE32: -#endif /* not RC_HIDE_XNU_J137 */ - { - if (count != _MachineStateCount[flavor]) { - return KERN_INVALID_ARGUMENT; - } - - if (thread_is_64bit_addr(thr_act)) { - return KERN_INVALID_ARGUMENT; - } + { + if (count != _MachineStateCount[flavor]) { + return KERN_INVALID_ARGUMENT; + } - return fpu_set_fxstate(thr_act, tstate, flavor); + if (thread_is_64bit_addr(thr_act)) { + return KERN_INVALID_ARGUMENT; } + return fpu_set_fxstate(thr_act, tstate, flavor); + } + case x86_FLOAT_STATE64: case x86_AVX_STATE64: -#if !defined(RC_HIDE_XNU_J137) case x86_AVX512_STATE64: -#endif /* not RC_HIDE_XNU_J137 */ - { - if (count != _MachineStateCount[flavor]) { - return KERN_INVALID_ARGUMENT; - } - - if (!thread_is_64bit_addr(thr_act)) { - return KERN_INVALID_ARGUMENT; - } + { + if (count != _MachineStateCount[flavor]) { + return KERN_INVALID_ARGUMENT; + } - return fpu_set_fxstate(thr_act, tstate, flavor); + if (!thread_is_64bit_addr(thr_act)) { + return KERN_INVALID_ARGUMENT; } + return fpu_set_fxstate(thr_act, tstate, flavor); + } + case x86_FLOAT_STATE: { x86_float_state_t *state; @@ -1049,37 +1043,35 @@ machine_thread_set_state( } case x86_AVX_STATE: -#if !defined(RC_HIDE_XNU_J137) case x86_AVX512_STATE: -#endif - { - x86_avx_state_t *state; - - if (count != _MachineStateCount[flavor]) { - return KERN_INVALID_ARGUMENT; - } + { + x86_avx_state_t *state; - state = (x86_avx_state_t *)tstate; - /* Flavors are defined to have sequential values: 32-bit, 64-bit, non-specific */ - /* 64-bit flavor? */ - if (state->ash.flavor == (flavor - 1) && - state->ash.count == _MachineStateCount[flavor - 1] && - thread_is_64bit_addr(thr_act)) { - return fpu_set_fxstate(thr_act, - (thread_state_t)&state->ufs.as64, - flavor - 1); - } - /* 32-bit flavor? */ - if (state->ash.flavor == (flavor - 2) && - state->ash.count == _MachineStateCount[flavor - 2] && - !thread_is_64bit_addr(thr_act)) { - return fpu_set_fxstate(thr_act, - (thread_state_t)&state->ufs.as32, - flavor - 2); - } + if (count != _MachineStateCount[flavor]) { return KERN_INVALID_ARGUMENT; } + state = (x86_avx_state_t *)tstate; + /* Flavors are defined to have sequential values: 32-bit, 64-bit, non-specific */ + /* 64-bit flavor? */ + if (state->ash.flavor == (flavor - 1) && + state->ash.count == _MachineStateCount[flavor - 1] && + thread_is_64bit_addr(thr_act)) { + return fpu_set_fxstate(thr_act, + (thread_state_t)&state->ufs.as64, + flavor - 1); + } + /* 32-bit flavor? */ + if (state->ash.flavor == (flavor - 2) && + state->ash.count == _MachineStateCount[flavor - 2] && + !thread_is_64bit_addr(thr_act)) { + return fpu_set_fxstate(thr_act, + (thread_state_t)&state->ufs.as32, + flavor - 2); + } + return KERN_INVALID_ARGUMENT; + } + case x86_THREAD_STATE32: { if (count != x86_THREAD_STATE32_COUNT) { @@ -1108,15 +1100,16 @@ machine_thread_set_state( case x86_THREAD_FULL_STATE64: { - if (!allow_64bit_proc_LDT_ops) { + if (count != x86_THREAD_FULL_STATE64_COUNT) { return KERN_INVALID_ARGUMENT; } - if (count != x86_THREAD_FULL_STATE64_COUNT) { + if (!thread_is_64bit_addr(thr_act)) { return KERN_INVALID_ARGUMENT; } - if (!thread_is_64bit_addr(thr_act)) { + /* If this process does not have a custom LDT, return failure */ + if (thr_act->task->i386_ldt == 0) { return KERN_INVALID_ARGUMENT; } @@ -1139,7 +1132,7 @@ machine_thread_set_state( return set_thread_state64(thr_act, &state->uts.ts64, FALSE); } else if (state->tsh.flavor == x86_THREAD_FULL_STATE64 && state->tsh.count == x86_THREAD_FULL_STATE64_COUNT && - thread_is_64bit_addr(thr_act)) { + thread_is_64bit_addr(thr_act) && thr_act->task->i386_ldt != 0) { return set_thread_state64(thr_act, &state->uts.ts64, TRUE); } else if (state->tsh.flavor == x86_THREAD_STATE32 && state->tsh.count == x86_THREAD_STATE32_COUNT && @@ -1207,6 +1200,30 @@ machine_thread_set_state( return KERN_SUCCESS; } +mach_vm_address_t +machine_thread_pc(thread_t thr_act) +{ + if (thread_is_64bit_addr(thr_act)) { + return (mach_vm_address_t)USER_REGS64(thr_act)->isf.rip; + } else { + return (mach_vm_address_t)USER_REGS32(thr_act)->eip; + } +} + +void +machine_thread_reset_pc(thread_t thr_act, mach_vm_address_t pc) +{ + pal_register_cache_state(thr_act, DIRTY); + + if (thread_is_64bit_addr(thr_act)) { + if (!IS_USERADDR64_CANONICAL(pc)) { + pc = 0; + } + USER_REGS64(thr_act)->isf.rip = (uint64_t)pc; + } else { + USER_REGS32(thr_act)->eip = (uint32_t)pc; + } +} /* @@ -1268,7 +1285,6 @@ machine_thread_get_state( break; } -#if !defined(RC_HIDE_XNU_J137) case THREAD_STATE_FLAVOR_LIST_10_13: { if (*count < 6) { @@ -1286,7 +1302,24 @@ machine_thread_get_state( break; } -#endif + case THREAD_STATE_FLAVOR_LIST_10_15: + { + if (*count < 7) { + return KERN_INVALID_ARGUMENT; + } + + tstate[0] = x86_THREAD_STATE; + tstate[1] = x86_FLOAT_STATE; + tstate[2] = x86_EXCEPTION_STATE; + tstate[3] = x86_DEBUG_STATE; + tstate[4] = x86_AVX_STATE; + tstate[5] = x86_AVX512_STATE; + tstate[6] = x86_PAGEIN_STATE; + + *count = 7; + break; + } + case x86_SAVED_STATE32: { x86_saved_state32_t *state; @@ -1407,70 +1440,64 @@ machine_thread_get_state( } case x86_AVX_STATE32: -#if !defined(RC_HIDE_XNU_J137) case x86_AVX512_STATE32: -#endif - { - if (*count != _MachineStateCount[flavor]) { - return KERN_INVALID_ARGUMENT; - } + { + if (*count != _MachineStateCount[flavor]) { + return KERN_INVALID_ARGUMENT; + } - if (thread_is_64bit_addr(thr_act)) { - return KERN_INVALID_ARGUMENT; - } + if (thread_is_64bit_addr(thr_act)) { + return KERN_INVALID_ARGUMENT; + } - *count = _MachineStateCount[flavor]; + *count = _MachineStateCount[flavor]; - return fpu_get_fxstate(thr_act, tstate, flavor); - } + return fpu_get_fxstate(thr_act, tstate, flavor); + } case x86_AVX_STATE64: -#if !defined(RC_HIDE_XNU_J137) case x86_AVX512_STATE64: -#endif - { - if (*count != _MachineStateCount[flavor]) { - return KERN_INVALID_ARGUMENT; - } + { + if (*count != _MachineStateCount[flavor]) { + return KERN_INVALID_ARGUMENT; + } - if (!thread_is_64bit_addr(thr_act)) { - return KERN_INVALID_ARGUMENT; - } + if (!thread_is_64bit_addr(thr_act)) { + return KERN_INVALID_ARGUMENT; + } - *count = _MachineStateCount[flavor]; + *count = _MachineStateCount[flavor]; - return fpu_get_fxstate(thr_act, tstate, flavor); - } + return fpu_get_fxstate(thr_act, tstate, flavor); + } case x86_AVX_STATE: -#if !defined(RC_HIDE_XNU_J137) case x86_AVX512_STATE: -#endif - { - x86_avx_state_t *state; - thread_state_t fstate; - - if (*count < _MachineStateCount[flavor]) { - return KERN_INVALID_ARGUMENT; - } + { + x86_avx_state_t *state; + thread_state_t fstate; - *count = _MachineStateCount[flavor]; - state = (x86_avx_state_t *)tstate; + if (*count < _MachineStateCount[flavor]) { + return KERN_INVALID_ARGUMENT; + } - bzero((char *)state, *count * sizeof(int)); + *count = _MachineStateCount[flavor]; + state = (x86_avx_state_t *)tstate; - if (thread_is_64bit_addr(thr_act)) { - flavor -= 1; /* 64-bit flavor */ - fstate = (thread_state_t) &state->ufs.as64; - } else { - flavor -= 2; /* 32-bit flavor */ - fstate = (thread_state_t) &state->ufs.as32; - } - state->ash.flavor = flavor; - state->ash.count = _MachineStateCount[flavor]; + bzero((char *)state, *count * sizeof(int)); - return fpu_get_fxstate(thr_act, fstate, flavor); + if (thread_is_64bit_addr(thr_act)) { + flavor -= 1; /* 64-bit flavor */ + fstate = (thread_state_t) &state->ufs.as64; + } else { + flavor -= 2; /* 32-bit flavor */ + fstate = (thread_state_t) &state->ufs.as32; } + state->ash.flavor = flavor; + state->ash.count = _MachineStateCount[flavor]; + + return fpu_get_fxstate(thr_act, fstate, flavor); + } case x86_THREAD_STATE32: { @@ -1506,15 +1533,16 @@ machine_thread_get_state( case x86_THREAD_FULL_STATE64: { - if (!allow_64bit_proc_LDT_ops) { + if (*count < x86_THREAD_FULL_STATE64_COUNT) { return KERN_INVALID_ARGUMENT; } - if (*count < x86_THREAD_FULL_STATE64_COUNT) { + if (!thread_is_64bit_addr(thr_act)) { return KERN_INVALID_ARGUMENT; } - if (!thread_is_64bit_addr(thr_act)) { + /* If this process does not have a custom LDT, return failure */ + if (thr_act->task->i386_ldt == 0) { return KERN_INVALID_ARGUMENT; } @@ -1680,6 +1708,20 @@ machine_thread_get_state( *count = x86_DEBUG_STATE_COUNT; break; } + + case x86_PAGEIN_STATE: + { + if (*count < x86_PAGEIN_STATE_COUNT) { + return KERN_INVALID_ARGUMENT; + } + + x86_pagein_state_t *state = (void *)tstate; + + state->__pagein_error = thr_act->t_pagein_error; + + *count = x86_PAGEIN_STATE_COUNT; + break; + } default: return KERN_INVALID_ARGUMENT; } @@ -1981,15 +2023,16 @@ machine_stack_attach( thread_initialize_kernel_state(thread); statep = STACK_IKS(stack); -#if defined(__x86_64__) - statep->k_rip = (unsigned long) Thread_continue; - statep->k_rbx = (unsigned long) thread_continue; - statep->k_rsp = (unsigned long) STACK_IKS(stack); -#else - statep->k_eip = (unsigned long) Thread_continue; - statep->k_ebx = (unsigned long) thread_continue; - statep->k_esp = (unsigned long) STACK_IKS(stack); -#endif + + /* + * Reset the state of the thread to resume from a continuation, + * including resetting the stack and frame pointer to avoid backtracers + * seeing this temporary state and attempting to walk the defunct stack. + */ + statep->k_rbp = (uint64_t) 0; + statep->k_rip = (uint64_t) Thread_continue; + statep->k_rbx = (uint64_t) thread_continue; + statep->k_rsp = (uint64_t) STACK_IKS(stack); return; } diff --git a/osfmk/i386/pmCPU.c b/osfmk/i386/pmCPU.c index ff49e3fe2..d0f1040fe 100644 --- a/osfmk/i386/pmCPU.c +++ b/osfmk/i386/pmCPU.c @@ -750,7 +750,7 @@ pmThreadGetUrgency(uint64_t *rt_period, uint64_t *rt_deadline) thread_urgency_t urgency; uint64_t arg1, arg2; - urgency = thread_get_urgency(current_processor()->next_thread, &arg1, &arg2); + urgency = thread_get_urgency(THREAD_NULL, &arg1, &arg2); if (urgency == THREAD_URGENCY_REAL_TIME) { if (rt_period != NULL) { diff --git a/osfmk/i386/pmap.h b/osfmk/i386/pmap.h index bd932f8e2..06f61e536 100644 --- a/osfmk/i386/pmap.h +++ b/osfmk/i386/pmap.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2017 Apple Inc. All rights reserved. + * Copyright (c) 2000-2019 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -226,17 +226,6 @@ extern int kernPhysPML4EntryCount; #define KERNEL_BASE (0ULL - (NBPML4 * KERNEL_PML4_COUNT)) #define KERNEL_BASEMENT (KERNEL_BASE - NBPML4) /* Basement uses one PML4 entry */ -#define VM_WIMG_COPYBACK VM_MEM_COHERENT -#define VM_WIMG_COPYBACKLW VM_WIMG_COPYBACK -#define VM_WIMG_DEFAULT VM_MEM_COHERENT -/* ?? intel ?? */ -#define VM_WIMG_IO (VM_MEM_COHERENT | \ - VM_MEM_NOT_CACHEABLE | VM_MEM_GUARDED) -#define VM_WIMG_POSTED VM_WIMG_IO -#define VM_WIMG_WTHRU (VM_MEM_WRITE_THROUGH | VM_MEM_COHERENT | VM_MEM_GUARDED) -/* write combining mode, aka store gather */ -#define VM_WIMG_WCOMB (VM_MEM_NOT_CACHEABLE | VM_MEM_COHERENT) -#define VM_WIMG_INNERWBACK VM_MEM_COHERENT /* * Pte related macros */ @@ -324,37 +313,11 @@ extern int kernPhysPML4EntryCount; #define INTEL_PTE_COMPRESSED_MASK (INTEL_PTE_COMPRESSED | \ INTEL_PTE_COMPRESSED_ALT | INTEL_PTE_SWLOCK) -#define PTE_IS_COMPRESSED(x, ptep) \ - ((((x) & INTEL_PTE_VALID) == 0) && /* PTE is not valid... */ \ +#define PTE_IS_COMPRESSED(x, ptep, pmap, vaddr) \ + ((((x) & INTEL_PTE_VALID) == 0) && /* PTE is not valid... */ \ ((x) & INTEL_PTE_COMPRESSED) && /* ...has "compressed" marker" */ \ - ((!((x) & ~INTEL_PTE_COMPRESSED_MASK)) || /* ...no other bits */ \ - (panic_compressed_pte_corrupt((x), &(x), (ptep)), FALSE))) - -static inline void -panic_compressed_pte_corrupt(uint64_t pte, uint64_t *pte_addr, uint64_t *ptep) -{ - uint64_t *adj_pteps[2]; - int pteidx = ((uintptr_t)ptep & INTEL_OFFMASK) / sizeof(pt_entry_t); - /* - * Grab pointers to PTEs on either side of the PTE in question, unless we're at the start of - * a PT (grab pointers to the next and next-next PTEs) or the end of a PT (grab the previous - * 2 PTEs). - */ - if (pteidx == 0) { - adj_pteps[0] = ptep + 1; - adj_pteps[1] = ptep + 2; - } else if (pteidx == (NPTPG - 1)) { - adj_pteps[0] = ptep - 2; - adj_pteps[1] = ptep - 1; - } else { - adj_pteps[0] = ptep - 1; - adj_pteps[1] = ptep + 1; - } - - panic("compressed PTE %p 0x%llx has extra bits 0x%llx: corrupted? Adjacent PTEs: 0x%llx@%p, 0x%llx@%p", - pte_addr, pte, pte & ~INTEL_PTE_COMPRESSED_MASK, *adj_pteps[0], adj_pteps[0], *adj_pteps[1], adj_pteps[1]); - /*NOTREACHED*/ -} + ((!((x) & ~INTEL_PTE_COMPRESSED_MASK)) || /* ...no other bits */ \ + pmap_compressed_pte_corruption_repair((x), &(x), (ptep), (pmap), (vaddr)))) #define pa_to_pte(a) ((a) & INTEL_PTE_PFN) /* XXX */ #define pte_to_pa(p) ((p) & INTEL_PTE_PFN) /* XXX */ @@ -519,6 +482,7 @@ PHYSMAP_PTOV_check(void *paddr) } #define PHYSMAP_PTOV(x) (PHYSMAP_PTOV_check((void*) (x))) +#define phystokv(x) ((vm_offset_t)(PHYSMAP_PTOV(x))) #if MACH_KERNEL_PRIVATE extern uint64_t dblmap_base, dblmap_max, dblmap_dist; @@ -580,21 +544,24 @@ struct pmap { pml4_entry_t *pm_pml4; /* VKA of top level */ pml4_entry_t *pm_upml4; /* Shadow VKA of top level */ pmap_paddr_t pm_eptp; /* EPTP */ + task_map_t pm_task_map; boolean_t pagezero_accessible; #define PMAP_PCID_MAX_CPUS MAX_CPUS /* Must be a multiple of 8 */ pcid_t pmap_pcid_cpus[PMAP_PCID_MAX_CPUS]; volatile uint8_t pmap_pcid_coherency_vector[PMAP_PCID_MAX_CPUS]; boolean_t pm_shared; + os_refcnt_t ref_count; + pdpt_entry_t *pm_pdpt; /* KVA of 3rd level page */ vm_object_t pm_obj; /* object to hold pde's */ vm_object_t pm_obj_pdpt; /* holds pdpt pages */ vm_object_t pm_obj_pml4; /* holds pml4 pages */ #if DEVELOPMENT || DEBUG int nx_enabled; #endif - int ref_count; ledger_t ledger; /* ledger tracking phys mappings */ struct pmap_statistics stats; /* map statistics */ + uint64_t corrected_compressed_ptes_count; #if MACH_ASSERT boolean_t pmap_stats_assert; int pmap_pid; @@ -647,10 +614,12 @@ extern void pmap_put_mapwindow(mapwindow_t *map); #endif typedef struct pmap_memory_regions { - ppnum_t base; /* first page of this region */ - ppnum_t alloc_up; /* pages below this one have been "stolen" */ - ppnum_t alloc_down; /* pages above this one have been "stolen" */ - ppnum_t end; /* last page of this region */ + ppnum_t base; /* first page of this region */ + ppnum_t alloc_up; /* pages below this one have been "stolen" */ + ppnum_t alloc_down; /* pages above this one have been "stolen" */ + ppnum_t alloc_frag_up; /* low page of fragment after large page alloc */ + ppnum_t alloc_frag_down; /* high page of fragment after large page alloc */ + ppnum_t end; /* last page of this region */ uint32_t type; uint64_t attribute; } pmap_memory_region_t; @@ -786,6 +755,7 @@ extern void pt_fake_zone_info(int *, vm_size_t *, vm_size_t *, vm_size_t *, vm_s uint64_t *, int *, int *, int *); extern void pmap_pagetable_corruption_msg_log(int (*)(const char * fmt, ...)__printflike(1, 2)); +extern void x86_64_protect_data_const(void); /* * Macros for speed. */ diff --git a/osfmk/i386/pmap_common.c b/osfmk/i386/pmap_common.c index 17c6e2947..9bfec8a48 100644 --- a/osfmk/i386/pmap_common.c +++ b/osfmk/i386/pmap_common.c @@ -291,10 +291,21 @@ __private_extern__ void pmap_pagetable_corruption_msg_log(int (*log_func)(const char * fmt, ...)__printflike(1, 2)) { if (pmap_pagetable_corruption_incidents > 0) { - int i, e = MIN(pmap_pagetable_corruption_incidents, PMAP_PAGETABLE_CORRUPTION_MAX_LOG); + int i, j, e = MIN(pmap_pagetable_corruption_incidents, PMAP_PAGETABLE_CORRUPTION_MAX_LOG); (*log_func)("%u pagetable corruption incident(s) detected, timeout: %u\n", pmap_pagetable_corruption_incidents, pmap_pagetable_corruption_timeout); for (i = 0; i < e; i++) { - (*log_func)("Incident 0x%x, reason: 0x%x, action: 0x%x, time: 0x%llx\n", pmap_pagetable_corruption_records[i].incident, pmap_pagetable_corruption_records[i].reason, pmap_pagetable_corruption_records[i].action, pmap_pagetable_corruption_records[i].abstime); + (*log_func)("Incident 0x%x, reason: 0x%x, action: 0x%x, time: 0x%llx\n", + pmap_pagetable_corruption_records[i].incident, + pmap_pagetable_corruption_records[i].reason, + pmap_pagetable_corruption_records[i].action, + pmap_pagetable_corruption_records[i].abstime); + + if (pmap_pagetable_corruption_records[i].adj_ptes_count > 0) { + for (j = 0; j < pmap_pagetable_corruption_records[i].adj_ptes_count; j++) { + (*log_func)("\tAdjacent PTE[%d] = 0x%llx\n", j, + pmap_pagetable_corruption_records[i].adj_ptes[j]); + } + } } } } diff --git a/osfmk/i386/pmap_internal.h b/osfmk/i386/pmap_internal.h index abf263a1f..5928bda3f 100644 --- a/osfmk/i386/pmap_internal.h +++ b/osfmk/i386/pmap_internal.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2012 Apple Inc. All rights reserved. + * Copyright (c) 2000-2019 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -303,10 +303,10 @@ extern uint32_t npvhashmask; extern pv_hashed_entry_t *pv_hash_table; /* hash lists */ extern pv_hashed_entry_t pv_hashed_free_list; extern pv_hashed_entry_t pv_hashed_kern_free_list; -decl_simple_lock_data(extern, pv_hashed_free_list_lock) -decl_simple_lock_data(extern, pv_hashed_kern_free_list_lock) -decl_simple_lock_data(extern, pv_hash_table_lock) -decl_simple_lock_data(extern, phys_backup_lock) +decl_simple_lock_data(extern, pv_hashed_free_list_lock); +decl_simple_lock_data(extern, pv_hashed_kern_free_list_lock); +decl_simple_lock_data(extern, pv_hash_table_lock); +decl_simple_lock_data(extern, phys_backup_lock); extern zone_t pv_hashed_list_zone; /* zone of pv_hashed_entry * structures */ @@ -342,7 +342,7 @@ PV_HASHED_ALLOC(pv_hashed_entry_t *pvh_ep) simple_unlock(&pv_hashed_free_list_lock); if (pv_hashed_free_count <= pv_hashed_low_water_mark) { - if (!mappingrecurse && hw_compare_and_store(0, 1, &mappingrecurse)) { + if (!mappingrecurse && os_atomic_cmpxchg(&mappingrecurse, 0, 1, acq_rel)) { thread_wakeup(&mapping_replenish_event); } } @@ -375,7 +375,7 @@ PV_HASHED_KERN_ALLOC(pv_hashed_entry_t *pvh_e) simple_unlock(&pv_hashed_kern_free_list_lock); if (pv_hashed_kern_free_count < pv_hashed_kern_low_water_mark) { - if (!mappingrecurse && hw_compare_and_store(0, 1, &mappingrecurse)) { + if (!mappingrecurse && os_atomic_cmpxchg(&mappingrecurse, 0, 1, acq_rel)) { thread_wakeup(&mapping_replenish_event); } } @@ -507,9 +507,14 @@ extern uint64_t pde_mapped_size; extern char *pmap_phys_attributes; extern ppnum_t last_managed_page; -extern ppnum_t lowest_lo; -extern ppnum_t lowest_hi; -extern ppnum_t highest_hi; +/* + * Used to record high memory allocated to kernel before + * pmap_init() gets called. + */ +extern ppnum_t pmap_high_used_top; +extern ppnum_t pmap_high_used_bottom; +extern ppnum_t pmap_middle_used_top; +extern ppnum_t pmap_middle_used_bottom; /* * when spinning through pmap_remove @@ -643,13 +648,14 @@ popcnt1(uint64_t distance) */ typedef enum { - PTE_VALID = 0x0, - PTE_INVALID = 0x1, - PTE_RSVD = 0x2, - PTE_SUPERVISOR = 0x4, - PTE_BITFLIP = 0x8, - PV_BITFLIP = 0x10, - PTE_INVALID_CACHEABILITY = 0x20 + PTE_VALID = 0x0, + PTE_INVALID = 0x1, + PTE_RSVD = 0x2, + PTE_SUPERVISOR = 0x4, + PTE_BITFLIP = 0x8, + PV_BITFLIP = 0x10, + PTE_INVALID_CACHEABILITY = 0x20, + PTE_NXBITFLIP = 0x40 } pmap_pagetable_corruption_t; typedef enum { @@ -680,6 +686,9 @@ typedef struct { pmap_t pvpmap; vm_map_offset_t pvva; uint64_t abstime; + int adj_ptes_count; +#define PMPTCR_MAX_ADJ_PTES (2) + uint64_t adj_ptes[PMPTCR_MAX_ADJ_PTES]; } pmap_pagetable_corruption_record_t; extern pmap_pagetable_corruption_record_t pmap_pagetable_corruption_records[]; @@ -687,10 +696,21 @@ extern uint64_t pmap_pagetable_corruption_last_abstime; extern thread_call_t pmap_pagetable_corruption_log_call; extern boolean_t pmap_pagetable_corruption_timeout; -static inline void -pmap_pagetable_corruption_log(pmap_pv_assertion_t incident, pmap_pagetable_corruption_t suppress_reason, pmap_pagetable_corruption_action_t action, pmap_t pmap, vm_map_offset_t vaddr, pt_entry_t *ptep, ppnum_t ppn, pmap_t pvpmap, vm_map_offset_t pvva) +static inline pmap_pagetable_corruption_action_t +pmap_pagetable_corruption_log(pmap_pv_assertion_t incident, pmap_pagetable_corruption_t suppress_reason, + pmap_pagetable_corruption_action_t action, pmap_t pmap, vm_map_offset_t vaddr, pt_entry_t *ptep, + ppnum_t ppn, pmap_t pvpmap, vm_map_offset_t pvva, int adj_pteps_cnt, uint64_t **adj_pteps) { uint32_t pmap_pagetable_corruption_log_index; + uint64_t curtime = mach_absolute_time(); + + if ((curtime - pmap_pagetable_corruption_last_abstime) < pmap_pagetable_corruption_interval_abstime) { + pmap_pagetable_corruption_timeout = TRUE; + action = PMAP_ACTION_ASSERT; + } else { + pmap_pagetable_corruption_last_abstime = curtime; + } + pmap_pagetable_corruption_log_index = pmap_pagetable_corruption_incidents++ % PMAP_PAGETABLE_CORRUPTION_MAX_LOG; pmap_pagetable_corruption_records[pmap_pagetable_corruption_log_index].incident = incident; pmap_pagetable_corruption_records[pmap_pagetable_corruption_log_index].reason = suppress_reason; @@ -701,9 +721,17 @@ pmap_pagetable_corruption_log(pmap_pv_assertion_t incident, pmap_pagetable_corru pmap_pagetable_corruption_records[pmap_pagetable_corruption_log_index].ppn = ppn; pmap_pagetable_corruption_records[pmap_pagetable_corruption_log_index].pvpmap = pvpmap; pmap_pagetable_corruption_records[pmap_pagetable_corruption_log_index].pvva = pvva; - pmap_pagetable_corruption_records[pmap_pagetable_corruption_log_index].abstime = mach_absolute_time(); + pmap_pagetable_corruption_records[pmap_pagetable_corruption_log_index].abstime = curtime; + if (adj_pteps_cnt > 0 && adj_pteps != NULL) { + pmap_pagetable_corruption_records[pmap_pagetable_corruption_log_index].adj_ptes_count = MIN(adj_pteps_cnt, PMPTCR_MAX_ADJ_PTES); + for (int i = 0; i < pmap_pagetable_corruption_records[pmap_pagetable_corruption_log_index].adj_ptes_count; i++) { + pmap_pagetable_corruption_records[pmap_pagetable_corruption_log_index].adj_ptes[i] = *adj_pteps[i]; + } + } /* Asynchronously log */ thread_call_enter(pmap_pagetable_corruption_log_call); + + return action; } static inline pmap_pagetable_corruption_action_t @@ -797,14 +825,49 @@ pmap_cpc_exit: action = PMAP_ACTION_ASSERT; } - if ((mach_absolute_time() - pmap_pagetable_corruption_last_abstime) < pmap_pagetable_corruption_interval_abstime) { - action = PMAP_ACTION_ASSERT; - pmap_pagetable_corruption_timeout = TRUE; + return pmap_pagetable_corruption_log(incident, suppress_reason, action, pmap, vaddr, &cpte, *ppnp, pvpmap, pvva, 0, 0); +} + +static inline boolean_t +pmap_compressed_pte_corruption_repair(uint64_t pte, uint64_t *pte_addr, uint64_t *ptep, pmap_t pmap, + vm_map_offset_t vaddr) +{ + uint64_t *adj_pteps[2]; + int pteidx = ((uintptr_t)ptep & INTEL_OFFMASK) / sizeof(pt_entry_t); + pmap_pagetable_corruption_action_t action = PMAP_ACTION_IGNORE; + + /* + * Grab pointers to PTEs on either side of the PTE in question, unless we're at the start of + * a PT (grab pointers to the next and next-next PTEs) or the end of a PT (grab the previous + * 2 PTEs). + */ + if (pteidx == 0) { + adj_pteps[0] = ptep + 1; + adj_pteps[1] = ptep + 2; + } else if (pteidx == (NPTPG - 1)) { + adj_pteps[0] = ptep - 2; + adj_pteps[1] = ptep - 1; } else { - pmap_pagetable_corruption_last_abstime = mach_absolute_time(); + adj_pteps[0] = ptep - 1; + adj_pteps[1] = ptep + 1; } - pmap_pagetable_corruption_log(incident, suppress_reason, action, pmap, vaddr, &cpte, *ppnp, pvpmap, pvva); - return action; + + /* + * Since the compressed PTE no longer has a PTE associated, we cannot pass in the pv data to + * pmap_pagetable_corruption_log, so instead supply adjacent PTEs for logging. + */ + if (pmap_pagetable_corruption_log(ROOT_ABSENT, (pte & INTEL_PTE_NX) ? PTE_NXBITFLIP : PTE_BITFLIP, + action, pmap, vaddr, ptep, (ppnum_t)~0UL, 0, 0, sizeof(adj_pteps) / sizeof(adj_pteps[0]), + adj_pteps) != PMAP_ACTION_ASSERT) { + /* Correct the flipped bit(s) and continue */ + pmap_store_pte(ptep, pte & INTEL_PTE_COMPRESSED_MASK); + pmap->corrected_compressed_ptes_count++; + return TRUE; /* Returning TRUE to indicate this is a now a valid compressed PTE (we hope) */ + } + + panic("compressed PTE %p 0x%llx has extra bits 0x%llx: corrupted? Adjacent PTEs: 0x%llx@%p, 0x%llx@%p", + pte_addr, pte, pte & ~INTEL_PTE_COMPRESSED_MASK, *adj_pteps[0], adj_pteps[0], *adj_pteps[1], adj_pteps[1]); + /*NOTREACHED*/ } /* diff --git a/osfmk/i386/pmap_x86_common.c b/osfmk/i386/pmap_x86_common.c index 93169df60..eae2bf321 100644 --- a/osfmk/i386/pmap_x86_common.c +++ b/osfmk/i386/pmap_x86_common.c @@ -342,7 +342,7 @@ pmap_find_phys(pmap_t pmap, addr64_t va) mp_disable_preemption(); } - if (!pmap->ref_count) { + if (os_ref_get_count(&pmap->ref_count) == 0) { goto pfp_exit; } @@ -640,7 +640,7 @@ Retry: old_pa_locked = FALSE; if (old_pa == 0 && - PTE_IS_COMPRESSED(*pte, pte)) { + PTE_IS_COMPRESSED(*pte, pte, pmap, vaddr)) { /* * "pmap" should be locked at this point, so this should * not race with another pmap_enter() or pmap_remove_range(). @@ -1261,7 +1261,7 @@ pmap_remove_range_options( pa = pte_to_pa(p); if (pa == 0) { if ((options & PMAP_OPTIONS_REMOVE) && - (PTE_IS_COMPRESSED(p, cpte))) { + (PTE_IS_COMPRESSED(p, cpte, pmap, vaddr))) { assert(pmap != kernel_pmap); /* one less "compressed"... */ stats_compressed++; @@ -1322,7 +1322,7 @@ check_pte_for_compressed_marker: * loop above, so check again. */ if ((options & PMAP_OPTIONS_REMOVE) && - (PTE_IS_COMPRESSED(*cpte, cpte))) { + (PTE_IS_COMPRESSED(*cpte, cpte, pmap, vaddr))) { assert(pmap != kernel_pmap); /* one less "compressed"... */ stats_compressed++; @@ -1724,7 +1724,7 @@ pmap_page_protect_options( if (pmap != kernel_pmap && (options & PMAP_OPTIONS_COMPRESSOR) && IS_INTERNAL_PAGE(pai)) { - assert(!PTE_IS_COMPRESSED(*pte, pte)); + assert(!PTE_IS_COMPRESSED(*pte, pte, pmap, vaddr)); /* mark this PTE as having been "compressed" */ new_pte_value = PTE_COMPRESSED; if (IS_ALTACCT_PAGE(pai, pv_e)) { @@ -2525,7 +2525,7 @@ pmap_query_page_info( pa = pte_to_pa(*pte); if (pa == 0) { - if (PTE_IS_COMPRESSED(*pte, pte)) { + if (PTE_IS_COMPRESSED(*pte, pte, pmap, va)) { disp |= PMAP_QUERY_PAGE_COMPRESSED; if (*pte & PTE_COMPRESSED_ALT) { disp |= PMAP_QUERY_PAGE_COMPRESSED_ALTACCT; @@ -2581,6 +2581,7 @@ pmap_trim(__unused pmap_t grand, __unused pmap_t subord, __unused addr64_t vstar return; } +__dead2 void pmap_ledger_alloc_init(size_t size) { @@ -2589,15 +2590,15 @@ pmap_ledger_alloc_init(size_t size) __func__, size); } +__dead2 ledger_t pmap_ledger_alloc(void) { panic("%s: unsupported", __func__); - - return NULL; } +__dead2 void pmap_ledger_free(ledger_t ledger) { diff --git a/osfmk/i386/rtclock.c b/osfmk/i386/rtclock.c index fa269748c..bc6fa6524 100644 --- a/osfmk/i386/rtclock.c +++ b/osfmk/i386/rtclock.c @@ -227,20 +227,6 @@ rtc_clock_adjust(uint64_t tsc_base_delta) rtc_nanotime_set_commpage(rntp); } -void -rtc_clock_stepping(__unused uint32_t new_frequency, - __unused uint32_t old_frequency) -{ - panic("rtc_clock_stepping unsupported"); -} - -void -rtc_clock_stepped(__unused uint32_t new_frequency, - __unused uint32_t old_frequency) -{ - panic("rtc_clock_stepped unsupported"); -} - /* * rtc_sleep_wakeup: * diff --git a/osfmk/i386/rtclock_native.c b/osfmk/i386/rtclock_native.c index 720b743e8..2fe8a8462 100644 --- a/osfmk/i386/rtclock_native.c +++ b/osfmk/i386/rtclock_native.c @@ -145,14 +145,14 @@ rtc_lapic_set_tsc_deadline_timer(uint64_t deadline, uint64_t now) * Definitions for timer operations table */ -rtc_timer_t rtc_timer_lapic = { - rtc_lapic_config_timer, - rtc_lapic_set_timer +rtc_timer_t rtc_timer_lapic = { + .rtc_config = rtc_lapic_config_timer, + .rtc_set = rtc_lapic_set_timer, }; -rtc_timer_t rtc_timer_tsc_deadline = { - rtc_lapic_config_tsc_deadline_timer, - rtc_lapic_set_tsc_deadline_timer +rtc_timer_t rtc_timer_tsc_deadline = { + .rtc_config = rtc_lapic_config_tsc_deadline_timer, + .rtc_set = rtc_lapic_set_tsc_deadline_timer, }; rtc_timer_t *rtc_timer = &rtc_timer_lapic; /* defaults to LAPIC timer */ diff --git a/osfmk/i386/simple_lock.h b/osfmk/i386/simple_lock.h index b9298397d..4fd4d0677 100644 --- a/osfmk/i386/simple_lock.h +++ b/osfmk/i386/simple_lock.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2019 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -120,7 +120,7 @@ typedef usimple_lock_data_t *simple_lock_t; typedef usimple_lock_data_t simple_lock_data_t; #define decl_simple_lock_data(class, name) \ - class simple_lock_data_t name; + class simple_lock_data_t name #endif /* !defined(decl_simple_lock_data) */ diff --git a/osfmk/i386/thread.h b/osfmk/i386/thread.h index 74da242cf..1c1e8e926 100644 --- a/osfmk/i386/thread.h +++ b/osfmk/i386/thread.h @@ -157,6 +157,8 @@ struct machine_thread { int physwindow_busy; #endif + uint32_t last_xcpm_ttd; + uint8_t last_xcpm_index; int mthr_do_segchk; }; typedef struct machine_thread *pcb_t; diff --git a/osfmk/i386/trap.c b/osfmk/i386/trap.c index e9ef4dea7..bfc24c4aa 100644 --- a/osfmk/i386/trap.c +++ b/osfmk/i386/trap.c @@ -117,7 +117,7 @@ extern void kprint_state(x86_saved_state64_t *saved_state); * Forward declarations */ static void user_page_fault_continue(kern_return_t kret); -static void panic_trap(x86_saved_state64_t *saved_state, uint32_t pl, kern_return_t fault_result); +static void panic_trap(x86_saved_state64_t *saved_state, uint32_t pl, kern_return_t fault_result) __dead2; static void set_recovery_ip(x86_saved_state64_t *saved_state, vm_offset_t ip); #if CONFIG_DTRACE @@ -709,7 +709,7 @@ kernel_trap( case T_PAGE_FAULT: #if CONFIG_DTRACE - if (thread != THREAD_NULL && thread->options & TH_OPT_DTRACE) { /* Executing under dtrace_probe? */ + if (thread != THREAD_NULL && thread->t_dtrace_inprobe) { /* Executing under dtrace_probe? */ if (dtrace_tally_fault(vaddr)) { /* Should a fault under dtrace be ignored? */ /* * DTrace has "anticipated" the possibility of this fault, and has @@ -878,12 +878,6 @@ panic_trap(x86_saved_state64_t *regs, uint32_t pl, kern_return_t fault_result) potential_smap_fault ? " SMAP fault" : "", pl, fault_result); - /* - * This next statement is not executed, - * but it's needed to stop the compiler using tail call optimization - * for the panic call - which confuses the subsequent backtrace. - */ - cr0 = 0; } #if CONFIG_DTRACE @@ -1124,6 +1118,14 @@ user_trap( /*NOTREACHED*/ } + /* + * For a user trap, vm_fault() should never return KERN_FAILURE. + * If it does, we're leaking preemption disables somewhere in the kernel. + */ + if (__improbable(kret == KERN_FAILURE)) { + panic("vm_fault() KERN_FAILURE from user fault on thread %p", thread); + } + user_page_fault_continue(kret); } /* NOTREACHED */ break; @@ -1153,7 +1155,6 @@ user_trap( default: panic("Unexpected user trap, type %d", type); - return; } /* Note: Codepaths that directly return from user_trap() have pending * ASTs processed in locore diff --git a/osfmk/i386/trap.h b/osfmk/i386/trap.h index fc7df3dfb..5601e64f6 100644 --- a/osfmk/i386/trap.h +++ b/osfmk/i386/trap.h @@ -132,8 +132,8 @@ extern void user_trap(x86_saved_state_t *regs); extern void interrupt(x86_saved_state_t *regs); -extern void panic_double_fault64(x86_saved_state_t *regs); -extern void panic_machine_check64(x86_saved_state_t *regs); +extern void panic_double_fault64(x86_saved_state_t *regs) __abortlike; +extern void panic_machine_check64(x86_saved_state_t *regs) __abortlike; typedef kern_return_t (*perfCallback)( int trapno, diff --git a/osfmk/i386/trap_native.c b/osfmk/i386/trap_native.c index 82f5c5168..b5613be39 100644 --- a/osfmk/i386/trap_native.c +++ b/osfmk/i386/trap_native.c @@ -100,7 +100,6 @@ extern void kprintf_break_lock(void); extern void kprint_state(x86_saved_state64_t *saved_state); -void panic_64(x86_saved_state_t *, int, const char *, boolean_t); extern volatile int panic_double_fault_cpu; @@ -109,7 +108,7 @@ extern volatile int panic_double_fault_cpu; /* * K64 debug - fatal handler for debug code in the trap vectors. */ -extern void +extern void __dead2 panic_idt64(x86_saved_state_t *rsp); void panic_idt64(x86_saved_state_t *rsp) @@ -120,7 +119,8 @@ panic_idt64(x86_saved_state_t *rsp) #endif -void +__dead2 +static void panic_64(x86_saved_state_t *sp, __unused int pc, __unused const char *msg, boolean_t do_mca_dump) { /* Set postcode (DEBUG only) */ diff --git a/osfmk/i386/user_ldt.c b/osfmk/i386/user_ldt.c index 29334339a..fa5c0ce23 100644 --- a/osfmk/i386/user_ldt.c +++ b/osfmk/i386/user_ldt.c @@ -77,6 +77,9 @@ #include #include +#include /* for IOTaskHasEntitlement */ +#include /* for csr_check */ + #include static void user_ldt_set_action(void *); @@ -85,7 +88,7 @@ static int i386_set_ldt_impl(uint32_t *retval, uint64_t start_sel, uint64_t desc static int i386_get_ldt_impl(uint32_t *retval, uint64_t start_sel, uint64_t descs, uint64_t num_sels); -extern int allow_64bit_proc_LDT_ops; +#define LDT_IN_64BITPROC_ENTITLEMENT "com.apple.security.ldt-in-64bit-process" /* * Add the descriptors to the LDT, starting with @@ -441,8 +444,9 @@ i386_set_ldt64( uint64_t descs, /* out */ uint64_t num_sels) { - if (!allow_64bit_proc_LDT_ops) { - return EINVAL; + if (csr_check(CSR_ALLOW_UNTRUSTED_KEXTS) != 0 && + !IOTaskHasEntitlement(current_task(), LDT_IN_64BITPROC_ENTITLEMENT)) { + return EPERM; } return i386_set_ldt_impl(retval, start_sel, descs, num_sels); @@ -468,8 +472,9 @@ i386_get_ldt64( uint64_t descs, /* out */ uint64_t num_sels) { - if (!allow_64bit_proc_LDT_ops) { - return EINVAL; + if (csr_check(CSR_ALLOW_UNTRUSTED_KEXTS) != 0 && + !IOTaskHasEntitlement(current_task(), LDT_IN_64BITPROC_ENTITLEMENT)) { + return EPERM; } return i386_get_ldt_impl(retval, start_sel, descs, num_sels); diff --git a/osfmk/i386/xpr.h b/osfmk/i386/xpr.h deleted file mode 100644 index 3c7449a28..000000000 --- a/osfmk/i386/xpr.h +++ /dev/null @@ -1,65 +0,0 @@ -/* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* - * @OSF_COPYRIGHT@ - */ -/* - * Mach Operating System - * Copyright (c) 1991,1990,1989 Carnegie Mellon University - * All Rights Reserved. - * - * Permission to use, copy, modify and distribute this software and its - * documentation is hereby granted, provided that both the copyright - * notice and this permission notice appear in all copies of the - * software, derivative works or modified versions, and any portions - * thereof, and that both notices appear in supporting documentation. - * - * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" - * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR - * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. - * - * Carnegie Mellon requests users of this software to return to - * - * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU - * School of Computer Science - * Carnegie Mellon University - * Pittsburgh PA 15213-3890 - * - * any improvements or extensions that they make and grant Carnegie Mellon - * the rights to redistribute these changes. - */ -/* - */ - -/* - * File: xpr.h - * - * Machine dependent module for the XPR tracing facility. - */ - -#define XPR_TIMESTAMP (0) diff --git a/osfmk/ipc/ipc_entry.h b/osfmk/ipc/ipc_entry.h index f63a3f546..dab496ef8 100644 --- a/osfmk/ipc/ipc_entry.h +++ b/osfmk/ipc/ipc_entry.h @@ -125,6 +125,8 @@ struct ipc_entry { #define IE_BITS_TYPE_MASK 0x001f0000 /* 5 bits of capability type */ #define IE_BITS_TYPE(bits) ((bits) & IE_BITS_TYPE_MASK) +#define IE_BITS_EXTYPE_MASK 0x00200000 /* 1 bit for extended capability */ + #ifndef NO_PORT_GEN #define IE_BITS_GEN_MASK 0xff000000 /* 8 bits for generation */ #define IE_BITS_GEN(bits) ((bits) & IE_BITS_GEN_MASK) diff --git a/osfmk/ipc/ipc_importance.c b/osfmk/ipc/ipc_importance.c index 86d03a586..44d1efed8 100644 --- a/osfmk/ipc/ipc_importance.c +++ b/osfmk/ipc/ipc_importance.c @@ -88,14 +88,14 @@ static lck_spin_t ipc_importance_lock_data; /* single lock for now */ lck_spin_assert(&ipc_importance_lock_data, LCK_ASSERT_OWNED) #if IIE_REF_DEBUG -#define incr_ref_counter(x) (hw_atomic_add(&(x), 1)) +#define incr_ref_counter(x) (os_atomic_inc(&(x), relaxed)) static inline uint32_t ipc_importance_reference_internal(ipc_importance_elem_t elem) { incr_ref_counter(elem->iie_refs_added); - return hw_atomic_add(&elem->iie_bits, 1) & IIE_REFS_MASK; + return os_atomic_inc(&elem->iie_bits, relaxed) & IIE_REFS_MASK; } static inline @@ -103,7 +103,7 @@ uint32_t ipc_importance_release_internal(ipc_importance_elem_t elem) { incr_ref_counter(elem->iie_refs_dropped); - return hw_atomic_sub(&elem->iie_bits, 1) & IIE_REFS_MASK; + return os_atomic_dec(&elem->iie_bits, relaxed) & IIE_REFS_MASK; } static inline @@ -730,7 +730,7 @@ ipc_importance_task_propagate_helper( } /* determine the task importance to adjust as result (if any) */ - port = (ipc_port_t) hdr->msgh_remote_port; + port = hdr->msgh_remote_port; assert(IP_VALID(port)); ip_lock(port); temp_task_imp = IIT_NULL; @@ -1477,7 +1477,7 @@ ipc_importance_task_drop_legacy_external_assertion(ipc_importance_task_t task_im } - +#if LEGACY_IMPORTANCE_DELIVERY /* Transfer an assertion to legacy userspace responsibility */ static kern_return_t ipc_importance_task_externalize_legacy_assertion(ipc_importance_task_t task_imp, uint32_t count, __unused int sender_pid) @@ -1515,6 +1515,7 @@ ipc_importance_task_externalize_legacy_assertion(ipc_importance_task_t task_imp, return KERN_SUCCESS; } +#endif /* LEGACY_IMPORTANCE_DELIVERY */ /* * Routine: ipc_importance_task_update_live_donor @@ -2221,6 +2222,7 @@ ipc_importance_check_circularity( int assertcnt = 0; ipc_port_t base; struct turnstile *send_turnstile = TURNSTILE_NULL; + struct task_watchport_elem *watchport_elem = NULL; assert(port != IP_NULL); assert(dest != IP_NULL); @@ -2308,7 +2310,7 @@ ipc_importance_check_circularity( /* port (== base) is in limbo */ - assert(ip_active(port)); + require_ip_active(port); assert(port->ip_receiver_name == MACH_PORT_NULL); assert(port->ip_destination == IP_NULL); @@ -2318,7 +2320,7 @@ ipc_importance_check_circularity( /* base is in transit or in limbo */ - assert(ip_active(base)); + require_ip_active(base); assert(base->ip_receiver_name == MACH_PORT_NULL); next = base->ip_destination; @@ -2347,10 +2349,18 @@ not_circular: /* port is in limbo */ imq_lock(&port->ip_messages); - assert(ip_active(port)); + require_ip_active(port); assert(port->ip_receiver_name == MACH_PORT_NULL); assert(port->ip_destination == IP_NULL); + /* Port is being enqueued in a kmsg, remove the watchport boost in order to push on destination port */ + watchport_elem = ipc_port_clear_watchport_elem_internal(port); + + /* Check if the port is being enqueued as a part of sync bootstrap checkin */ + if (dest->ip_specialreply && dest->ip_sync_bootstrap_checkin) { + port->ip_sync_bootstrap_checkin = 1; + } + ip_reference(dest); port->ip_destination = dest; @@ -2403,7 +2413,7 @@ not_circular: /* port is in transit */ - assert(ip_active(dest)); + require_ip_active(dest); assert(dest->ip_receiver_name == MACH_PORT_NULL); assert(dest->ip_destination != IP_NULL); assert(dest->ip_tempowner == 0); @@ -2451,6 +2461,18 @@ not_circular: ip_unlock(base); + /* All locks dropped, call turnstile_update_inheritor_complete for source port's turnstile */ + if (send_turnstile) { + turnstile_update_inheritor_complete(send_turnstile, TURNSTILE_INTERLOCK_NOT_HELD); + + /* Take the mq lock to call turnstile complete */ + imq_lock(&port->ip_messages); + turnstile_complete((uintptr_t)port, port_send_turnstile_address(port), NULL, TURNSTILE_SYNC_IPC); + send_turnstile = TURNSTILE_NULL; + imq_unlock(&port->ip_messages); + turnstile_cleanup(); + } + /* * Transfer assertions now that the ports are unlocked. * Avoid extra overhead if transferring to/from the same task. @@ -2480,18 +2502,6 @@ not_circular: ipc_importance_unlock(); } - /* All locks dropped, call turnstile_update_inheritor_complete for source port's turnstile */ - if (send_turnstile) { - turnstile_update_inheritor_complete(send_turnstile, TURNSTILE_INTERLOCK_NOT_HELD); - - /* Take the mq lock to call turnstile complete */ - imq_lock(&port->ip_messages); - turnstile_complete((uintptr_t)port, port_send_turnstile_address(port), NULL); - send_turnstile = TURNSTILE_NULL; - imq_unlock(&port->ip_messages); - turnstile_cleanup(); - } - if (imp_task != IIT_NULL) { ipc_importance_task_release(imp_task); } @@ -2500,6 +2510,10 @@ not_circular: ipc_importance_task_release(release_imp_task); } + if (watchport_elem) { + task_watchport_elem_deallocate(watchport_elem); + } + return FALSE; } @@ -2518,7 +2532,7 @@ ipc_importance_send( ipc_kmsg_t kmsg, mach_msg_option_t option) { - ipc_port_t port = (ipc_port_t) kmsg->ikm_header->msgh_remote_port; + ipc_port_t port = kmsg->ikm_header->msgh_remote_port; boolean_t port_lock_dropped = FALSE; ipc_importance_elem_t elem; task_t task; @@ -3154,11 +3168,14 @@ ipc_importance_receive( ipc_kmsg_t kmsg, mach_msg_option_t option) { + int impresult = -1; + +#if IMPORTANCE_TRACE || LEGACY_IMPORTANCE_DELIVERY + task_t task_self = current_task(); unsigned int sender_pid = ((mach_msg_max_trailer_t *) ((vm_offset_t)kmsg->ikm_header + round_msg(kmsg->ikm_header->msgh_size)))->msgh_audit.val[5]; - task_t task_self = current_task(); - int impresult = -1; +#endif /* convert to a voucher with an inherit importance attribute? */ if ((option & MACH_RCV_VOUCHER) != 0) { @@ -3239,14 +3256,17 @@ ipc_importance_receive( /* With kmsg unlinked, can safely examine message importance attribute. */ if (MACH_MSGH_BITS_RAISED_IMPORTANCE(kmsg->ikm_header->msgh_bits)) { - ipc_importance_task_t task_imp = task_self->task_imp_base; ipc_port_t port = kmsg->ikm_header->msgh_remote_port; +#if LEGACY_IMPORTANCE_DELIVERY + ipc_importance_task_t task_imp = task_self->task_imp_base; /* The owner of receive right might have changed, take the internal assertion */ if (KERN_SUCCESS == ipc_importance_task_hold_internal_assertion(task_imp, 1)) { ipc_importance_task_externalize_legacy_assertion(task_imp, 1, sender_pid); impresult = 1; - } else { + } else +#endif + { /* The importance boost never applied to task (clear the bit) */ kmsg->ikm_header->msgh_bits &= ~MACH_MSGH_BITS_RAISEIMP; impresult = 0; @@ -3409,7 +3429,7 @@ static void ipc_importance_manager_release( ipc_voucher_attr_manager_t manager); -struct ipc_voucher_attr_manager ipc_importance_manager = { +const struct ipc_voucher_attr_manager ipc_importance_manager = { .ivam_release_value = ipc_importance_release_value, .ivam_get_value = ipc_importance_get_value, .ivam_extract_content = ipc_importance_extract_content, @@ -3792,6 +3812,7 @@ ipc_importance_command( * reference granted back at registration time, and that reference is never * dropped, this should never be called. */ +__abortlike static void ipc_importance_manager_release( ipc_voucher_attr_manager_t __assert_only manager) diff --git a/osfmk/ipc/ipc_importance.h b/osfmk/ipc/ipc_importance.h index 9f69a6af1..16ca8ed40 100644 --- a/osfmk/ipc/ipc_importance.h +++ b/osfmk/ipc/ipc_importance.h @@ -95,10 +95,10 @@ struct ipc_importance_elem { #if !IIE_REF_DEBUG #define ipc_importance_reference_internal(elem) \ - (hw_atomic_add(&(elem)->iie_bits, 1) & IIE_REFS_MASK) + (os_atomic_inc(&(elem)->iie_bits, relaxed) & IIE_REFS_MASK) #define ipc_importance_release_internal(elem) \ - (hw_atomic_sub(&(elem)->iie_bits, 1) & IIE_REFS_MASK) + (os_atomic_dec(&(elem)->iie_bits, relaxed) & IIE_REFS_MASK) #endif struct ipc_importance_task { diff --git a/osfmk/ipc/ipc_init.c b/osfmk/ipc/ipc_init.c index 4e45ca60e..ca4bcee84 100644 --- a/osfmk/ipc/ipc_init.c +++ b/osfmk/ipc/ipc_init.c @@ -77,6 +77,7 @@ #include #include +#include #include #include #include @@ -125,6 +126,8 @@ vm_size_t ipc_kmsg_max_body_space = ((IPC_KMSG_MAX_SPACE * 3) / 4 - MAX_TRAILER_ int ipc_space_max; int ipc_port_max; int ipc_pset_max; +int prioritize_launch = 1; +int enforce_strict_reply = 0; lck_grp_t ipc_lck_grp; @@ -143,6 +146,8 @@ void ipc_bootstrap(void) { kern_return_t kr; + int prioritize_launch_bootarg; + int strict_reply_bootarg; lck_grp_attr_setdefault(&ipc_lck_grp_attr); lck_grp_init(&ipc_lck_grp, "ipc", &ipc_lck_grp_attr); @@ -171,6 +176,7 @@ ipc_bootstrap(void) /* cant charge callers for port allocations (references passed) */ zone_change(ipc_object_zones[IOT_PORT], Z_CALLERACCT, FALSE); zone_change(ipc_object_zones[IOT_PORT], Z_NOENCRYPT, TRUE); + zone_change(ipc_object_zones[IOT_PORT], Z_CLEARMEMORY, TRUE); ipc_object_zones[IOT_PORT_SET] = zinit(sizeof(struct ipc_pset), @@ -178,6 +184,7 @@ ipc_bootstrap(void) sizeof(struct ipc_pset), "ipc port sets"); zone_change(ipc_object_zones[IOT_PORT_SET], Z_NOENCRYPT, TRUE); + zone_change(ipc_object_zones[IOT_PORT_SET], Z_CLEARMEMORY, TRUE); /* * Create the basic ipc_kmsg_t zone (the one we also cache) @@ -216,6 +223,17 @@ ipc_bootstrap(void) semaphore_init(); mk_timer_init(); host_notify_init(); + +#if CONFIG_ARCADE + arcade_init(); +#endif + + if (PE_parse_boot_argn("prioritize_launch", &prioritize_launch_bootarg, sizeof(prioritize_launch_bootarg))) { + prioritize_launch = !!prioritize_launch_bootarg; + } + if (PE_parse_boot_argn("ipc_strict_reply", &strict_reply_bootarg, sizeof(strict_reply_bootarg))) { + enforce_strict_reply = !!strict_reply_bootarg; + } } /* diff --git a/osfmk/ipc/ipc_kmsg.c b/osfmk/ipc/ipc_kmsg.c index 803b25bc2..f1611fc82 100644 --- a/osfmk/ipc/ipc_kmsg.c +++ b/osfmk/ipc/ipc_kmsg.c @@ -96,7 +96,7 @@ #include -#include +#include #include #include @@ -168,6 +168,7 @@ typedef union{ mach_msg_legacy_port_descriptor_t port; mach_msg_ool_descriptor32_t out_of_line32; mach_msg_ool_ports_descriptor32_t ool_ports32; + mach_msg_guarded_port_descriptor32_t guarded_port32; mach_msg_type_descriptor_t type; } mach_msg_legacy_descriptor_t; @@ -471,7 +472,15 @@ ipc_msg_print_untyped64( dsc->deallocate ? "DEALLOC" : ""); break; } + case MACH_MSG_GUARDED_PORT_DESCRIPTOR: { + mach_msg_guarded_port_descriptor_t *dsc; + dsc = (mach_msg_guarded_port_descriptor_t *)&saddr->guarded_port; + kprintf(" GUARDED_PORT name = %p flags = 0x%x disp = ", dsc->name, dsc->flags); + ipc_print_type_name64(dsc->disposition); + kprintf("\n"); + break; + } default: { kprintf(" UNKNOWN DESCRIPTOR 0x%x\n", type); break; @@ -568,8 +577,9 @@ MACRO_END #define KMSG_TRACE_FLAG_TIMER 0x200000 #define KMSG_TRACE_FLAG_SEMA 0x400000 #define KMSG_TRACE_FLAG_DTMPOWNER 0x800000 +#define KMSG_TRACE_FLAG_GUARDED_DESC 0x1000000 -#define KMSG_TRACE_FLAGS_MASK 0xffffff +#define KMSG_TRACE_FLAGS_MASK 0x1ffffff #define KMSG_TRACE_FLAGS_SHIFT 8 #define KMSG_TRACE_PORTS_MASK 0xff @@ -577,7 +587,6 @@ MACRO_END #if (KDEBUG_LEVEL >= KDEBUG_LEVEL_STANDARD) #include -extern boolean_t kdebug_debugid_enabled(uint32_t debugid); void ipc_kmsg_trace_send(ipc_kmsg_t kmsg, @@ -591,7 +600,7 @@ ipc_kmsg_trace_send(ipc_kmsg_t kmsg, int kotype = 0; uint32_t msg_size = 0; - uint32_t msg_flags = KMSG_TRACE_FLAG_TRACED; + uint64_t msg_flags = KMSG_TRACE_FLAG_TRACED; uint32_t num_ports = 0; uint32_t send_pid, dst_pid; @@ -610,7 +619,7 @@ ipc_kmsg_trace_send(ipc_kmsg_t kmsg, msg = kmsg->ikm_header; - dst_port = (ipc_port_t)(msg->msgh_remote_port); + dst_port = msg->msgh_remote_port; if (!IPC_PORT_VALID(dst_port)) { return; } @@ -658,7 +667,7 @@ ipc_kmsg_trace_send(ipc_kmsg_t kmsg, msg_flags |= KMSG_TRACE_FLAG_SND64; } - src_port = (ipc_port_t)(msg->msgh_local_port); + src_port = msg->msgh_local_port; if (src_port) { if (src_port->ip_messages.imq_qlimit != MACH_PORT_QLIMIT_DEFAULT) { msg_flags |= KMSG_TRACE_FLAG_SRC_NDFLTQ; @@ -735,6 +744,7 @@ ipc_kmsg_trace_send(ipc_kmsg_t kmsg, case IKOT_IOKIT_CONNECT: case IKOT_IOKIT_OBJECT: case IKOT_IOKIT_IDENT: + case IKOT_UEXT_OBJECT: msg_flags |= KMSG_TRACE_FLAG_IOKIT; break; default: @@ -806,6 +816,12 @@ ipc_kmsg_trace_send(ipc_kmsg_t kmsg, msg_size -= 16; } } break; + case MACH_MSG_GUARDED_PORT_DESCRIPTOR: + num_ports++; + msg_flags |= KMSG_TRACE_FLAG_GUARDED_DESC; + if (is_task_64bit) { + msg_size -= 16; + } default: break; } @@ -818,7 +834,7 @@ ipc_kmsg_trace_send(ipc_kmsg_t kmsg, trailer = (mach_msg_trailer_t *)((vm_offset_t)msg + round_msg((vm_offset_t)msg->msgh_size)); if (trailer->msgh_trailer_size <= sizeof(mach_msg_security_trailer_t)) { - extern security_token_t KERNEL_SECURITY_TOKEN; + extern const security_token_t KERNEL_SECURITY_TOKEN; mach_msg_security_trailer_t *strailer; strailer = (mach_msg_security_trailer_t *)trailer; /* @@ -873,6 +889,31 @@ mach_msg_return_t ipc_kmsg_copyin_body( vm_map_t map, mach_msg_option_t *optionp); + +extern int enforce_strict_reply; + +static void +ipc_kmsg_link_reply_context_locked( + ipc_port_t reply_port, + ipc_port_t voucher_port); + +static kern_return_t +ipc_kmsg_validate_reply_port_locked( + ipc_port_t reply_port, + mach_msg_option_t options); + +static mach_msg_return_t +ipc_kmsg_validate_reply_context_locked( + mach_msg_option_t option, + ipc_port_t dest_port, + ipc_voucher_t voucher, + mach_port_name_t voucher_name); + +/* we can't include the BSD header here... */ +#ifndef PERSONA_ID_NONE +#define PERSONA_ID_NONE ((uint32_t)-1) +#endif + /* * We keep a per-processor cache of kernel message buffers. * The cache saves the overhead/locking of using kalloc/kfree. @@ -899,7 +940,7 @@ ipc_kmsg_alloc( /* * LP64support - * Pad the allocation in case we need to expand the - * message descrptors for user spaces with pointers larger than + * message descriptors for user spaces with pointers larger than * the kernel's own, or vice versa. We don't know how many descriptors * there are yet, so just assume the whole body could be * descriptors (if there could be any at all). @@ -1298,10 +1339,10 @@ ipc_kmsg_clean_body( /* * Destroy port rights carried in the message */ - if (!IO_VALID((ipc_object_t) dsc->name)) { + if (!IP_VALID(dsc->name)) { continue; } - ipc_object_destroy((ipc_object_t) dsc->name, dsc->disposition); + ipc_object_destroy(ip_to_object(dsc->name), dsc->disposition); break; } case MACH_MSG_OOL_VOLATILE_DESCRIPTOR: @@ -1354,8 +1395,20 @@ ipc_kmsg_clean_body( (vm_size_t) dsc->count * sizeof(mach_port_t)); break; } + case MACH_MSG_GUARDED_PORT_DESCRIPTOR: { + mach_msg_guarded_port_descriptor_t *dsc = (typeof(dsc)) & saddr->guarded_port; + + /* + * Destroy port rights carried in the message + */ + if (!IP_VALID(dsc->name)) { + continue; + } + ipc_object_destroy(ip_to_object(dsc->name), dsc->disposition); + break; + } default: { - _ipc_kmsg_clean_invalid_desc++; /* don't understand this type of descriptor */ + _ipc_kmsg_clean_invalid_desc++; /* don't understand this type of descriptor */ } } } @@ -1388,16 +1441,16 @@ ipc_kmsg_clean_partial( /* deal with importance chain while we still have dest and voucher references */ ipc_importance_clean(kmsg); - object = (ipc_object_t) kmsg->ikm_header->msgh_remote_port; + object = ip_to_object(kmsg->ikm_header->msgh_remote_port); assert(IO_VALID(object)); ipc_object_destroy_dest(object, MACH_MSGH_BITS_REMOTE(mbits)); - object = (ipc_object_t) kmsg->ikm_header->msgh_local_port; + object = ip_to_object(kmsg->ikm_header->msgh_local_port); if (IO_VALID(object)) { ipc_object_destroy(object, MACH_MSGH_BITS_LOCAL(mbits)); } - object = (ipc_object_t) kmsg->ikm_voucher; + object = ip_to_object(kmsg->ikm_voucher); if (IO_VALID(object)) { assert(MACH_MSGH_BITS_VOUCHER(mbits) == MACH_MSG_TYPE_MOVE_SEND); ipc_object_destroy(object, MACH_MSG_TYPE_PORT_SEND); @@ -1431,17 +1484,17 @@ ipc_kmsg_clean( ipc_importance_clean(kmsg); mbits = kmsg->ikm_header->msgh_bits; - object = (ipc_object_t) kmsg->ikm_header->msgh_remote_port; + object = ip_to_object(kmsg->ikm_header->msgh_remote_port); if (IO_VALID(object)) { ipc_object_destroy_dest(object, MACH_MSGH_BITS_REMOTE(mbits)); } - object = (ipc_object_t) kmsg->ikm_header->msgh_local_port; + object = ip_to_object(kmsg->ikm_header->msgh_local_port); if (IO_VALID(object)) { ipc_object_destroy(object, MACH_MSGH_BITS_LOCAL(mbits)); } - object = (ipc_object_t) kmsg->ikm_voucher; + object = ip_to_object(kmsg->ikm_voucher); if (IO_VALID(object)) { assert(MACH_MSGH_BITS_VOUCHER(mbits) == MACH_MSG_TYPE_MOVE_SEND); ipc_object_destroy(object, MACH_MSG_TYPE_PORT_SEND); @@ -1688,7 +1741,7 @@ ipc_kmsg_get_from_kernel( assert(size >= sizeof(mach_msg_header_t)); assert((size & 3) == 0); - dest_port = (ipc_port_t)msg->msgh_remote_port; + dest_port = msg->msgh_remote_port; msg_and_trailer_size = size + MAX_TRAILER_SIZE; @@ -1812,10 +1865,26 @@ ipc_kmsg_send( ipc_voucher_send_preprocessing(kmsg); - port = (ipc_port_t) kmsg->ikm_header->msgh_remote_port; + port = kmsg->ikm_header->msgh_remote_port; assert(IP_VALID(port)); ip_lock(port); + /* + * If the destination has been guarded with a reply context, and the + * sender is consuming a send-once right, then assume this is a reply + * to an RPC and we need to validate that this sender is currently in + * the correct context. + */ + if (enforce_strict_reply && port->ip_reply_context != 0 && + ((option & MACH_SEND_KERNEL) == 0) && + MACH_MSGH_BITS_REMOTE(kmsg->ikm_header->msgh_bits) == MACH_MSG_TYPE_PORT_SEND_ONCE) { + error = ipc_kmsg_validate_reply_context_locked(option, port, th->ith_voucher, th->ith_voucher_name); + if (error != MACH_MSG_SUCCESS) { + ip_unlock(port); + return error; + } + } + #if IMPORTANCE_INHERITANCE retry: #endif /* IMPORTANCE_INHERITANCE */ @@ -1856,7 +1925,7 @@ retry: * ipc_port_dealloc_kernel clears ip_receiver * before destroying a kernel port. */ - assert(ip_active(port)); + require_ip_active(port); port->ip_messages.imq_seqno++; ip_unlock(port); @@ -1872,7 +1941,7 @@ retry: /* restart the KMSG_INFO tracing for the reply message */ KDBG(MACHDBG_CODE(DBG_MACH_IPC, MACH_IPC_KMSG_INFO) | DBG_FUNC_START); - port = (ipc_port_t) kmsg->ikm_header->msgh_remote_port; + port = kmsg->ikm_header->msgh_remote_port; assert(IP_VALID(port)); ip_lock(port); /* fall thru with reply - same options */ @@ -1906,7 +1975,7 @@ retry: */ imq_lock(&port->ip_messages); - set_ip_srp_msg_sent(port); + ipc_special_reply_port_msg_sent(port); ip_unlock(port); @@ -2159,19 +2228,207 @@ ipc_kmsg_set_qos( } kr = KERN_SUCCESS; - if ((options & MACH_SEND_SYNC_OVERRIDE)) { - if (IP_VALID(special_reply_port) && - MACH_MSGH_BITS_LOCAL(kmsg->ikm_header->msgh_bits) == MACH_MSG_TYPE_PORT_SEND_ONCE) { + + if (IP_VALID(special_reply_port) && + MACH_MSGH_BITS_LOCAL(kmsg->ikm_header->msgh_bits) == MACH_MSG_TYPE_PORT_SEND_ONCE) { + if ((options & MACH_SEND_SYNC_OVERRIDE)) { + boolean_t sync_bootstrap_checkin = !!(options & MACH_SEND_SYNC_BOOTSTRAP_CHECKIN); /* * Link the destination port to special reply port and make sure that * dest port has a send turnstile, else allocate one. */ - ipc_port_link_special_reply_port(special_reply_port, dest_port); + ipc_port_link_special_reply_port(special_reply_port, dest_port, sync_bootstrap_checkin); } } return kr; } +static inline void +ipc_kmsg_allow_immovable_send( + ipc_kmsg_t kmsg, + ipc_entry_t dest_entry) +{ + ipc_object_t object = dest_entry->ie_object; + /* + * If the dest port is a kobject, allow copyin of immovable send + * rights in the message body to succeed + */ + if (IO_VALID(object) && io_is_kobject(object)) { + kmsg->ikm_flags |= IPC_KMSG_FLAGS_ALLOW_IMMOVABLE_SEND; + } +} + +/* + * Routine: ipc_kmsg_link_reply_context_locked + * Purpose: + * Link any required context from the sending voucher + * to the reply port. The ipc_kmsg_copyin function will + * enforce that the sender calls mach_msg in this context. + * Conditions: + * reply port is locked + */ +static void +ipc_kmsg_link_reply_context_locked( + ipc_port_t reply_port, + ipc_port_t voucher_port) +{ + kern_return_t __assert_only kr; + uint32_t persona_id = 0; + ipc_voucher_t voucher; + + ip_lock_held(reply_port); + + if (!ip_active(reply_port)) { + return; + } + + voucher = convert_port_to_voucher(voucher_port); + + kr = bank_get_bank_ledger_thread_group_and_persona(voucher, NULL, NULL, &persona_id); + assert(kr == KERN_SUCCESS); + ipc_voucher_release(voucher); + + if (persona_id == 0 || persona_id == PERSONA_ID_NONE) { + /* there was no persona context to record */ + return; + } + + /* + * Set the persona_id as the context on the reply port. + * This will force the thread that replies to have adopted a voucher + * with a matching persona. + */ + reply_port->ip_reply_context = persona_id; + + return; +} + +static kern_return_t +ipc_kmsg_validate_reply_port_locked(ipc_port_t reply_port, mach_msg_option_t options) +{ + ip_lock_held(reply_port); + + if (!ip_active(reply_port)) { + /* + * Ideally, we would enforce that the reply receive right is + * active, but asynchronous XPC cancellation destroys the + * receive right, so we just have to return success here. + */ + return KERN_SUCCESS; + } + + if (options & MACH_SEND_MSG) { + /* + * If the rely port is active, then it should not be + * in-transit, and the receive right should be in the caller's + * IPC space. + */ + if (!reply_port->ip_receiver_name || reply_port->ip_receiver != current_task()->itk_space) { + return KERN_INVALID_CAPABILITY; + } + + /* + * A port used as a reply port in an RPC should have exactly 1 + * extant send-once right which we either just made or are + * moving as part of the IPC. + */ + if (reply_port->ip_sorights != 1) { + return KERN_INVALID_CAPABILITY; + } + /* + * XPC uses an extra send-right to keep the name of the reply + * right around through cancellation. That makes it harder to + * enforce a particular semantic kere, so for now, we say that + * you can have a maximum of 1 send right (in addition to your + * send once right). In the future, it would be great to lock + * this down even further. + */ + if (reply_port->ip_srights > 1) { + return KERN_INVALID_CAPABILITY; + } + + /* + * The sender can also specify that the receive right should + * be immovable. Note that this check only applies to + * send-only operations. Combined send/receive or rcv-only + * operations can specify an immovable receive right by + * opt-ing into guarded descriptors (MACH_RCV_GUARDED_DESC) + * and using the MACH_MSG_STRICT_REPLY options flag. + */ + if (MACH_SEND_REPLY_IS_IMMOVABLE(options)) { + if (!reply_port->ip_immovable_receive) { + return KERN_INVALID_CAPABILITY; + } + } + } + + /* + * don't enforce this yet: need a better way of indicating the + * receiver wants this... + */ +#if 0 + if (MACH_RCV_WITH_IMMOVABLE_REPLY(options)) { + if (!reply_port->ip_immovable_receive) { + return KERN_INVALID_CAPABILITY; + } + } +#endif /* 0 */ + + return KERN_SUCCESS; +} + +/* + * Routine: ipc_kmsg_validate_reply_context_locked + * Purpose: + * Validate that the current thread is running in the context + * required by the destination port. + * Conditions: + * dest_port is locked + * Returns: + * MACH_MSG_SUCCESS on success. + * On error, an EXC_GUARD exception is also raised. + * This function *always* resets the port reply context. + */ +static mach_msg_return_t +ipc_kmsg_validate_reply_context_locked( + mach_msg_option_t option, + ipc_port_t dest_port, + ipc_voucher_t voucher, + mach_port_name_t voucher_name) +{ + uint32_t dest_ctx = dest_port->ip_reply_context; + dest_port->ip_reply_context = 0; + + if (!ip_active(dest_port)) { + return MACH_MSG_SUCCESS; + } + + if (voucher == IPC_VOUCHER_NULL || !MACH_PORT_VALID(voucher_name)) { + if ((option & MACH_SEND_KERNEL) == 0) { + mach_port_guard_exception(voucher_name, 0, + (MPG_FLAGS_STRICT_REPLY_INVALID_VOUCHER | dest_ctx), + kGUARD_EXC_STRICT_REPLY); + } + return MACH_SEND_INVALID_CONTEXT; + } + + kern_return_t __assert_only kr; + uint32_t persona_id = 0; + kr = bank_get_bank_ledger_thread_group_and_persona(voucher, NULL, NULL, &persona_id); + assert(kr == KERN_SUCCESS); + + if (dest_ctx != persona_id) { + if ((option & MACH_SEND_KERNEL) == 0) { + mach_port_guard_exception(voucher_name, 0, + (MPG_FLAGS_STRICT_REPLY_MISMATCHED_PERSONA | ((((uint64_t)persona_id << 32) & MPG_FLAGS_STRICT_REPLY_MASK) | dest_ctx)), + kGUARD_EXC_STRICT_REPLY); + } + return MACH_SEND_INVALID_CONTEXT; + } + + return MACH_MSG_SUCCESS; +} + /* * Routine: ipc_kmsg_copyin_header * Purpose: @@ -2283,6 +2540,23 @@ ipc_kmsg_copyin_header( } } + if (enforce_strict_reply && MACH_SEND_WITH_STRICT_REPLY(*optionp) && + (!MACH_PORT_VALID(reply_name) || + ((reply_type != MACH_MSG_TYPE_MAKE_SEND_ONCE) && (reply_type != MACH_MSG_TYPE_MOVE_SEND_ONCE)) + )) { + /* + * The caller cannot enforce a reply context with an invalid + * reply port name, or a non-send_once reply disposition. + */ + is_write_unlock(space); + if ((*optionp & MACH_SEND_KERNEL) == 0) { + mach_port_guard_exception(reply_name, 0, + (MPG_FLAGS_STRICT_REPLY_INVALID_REPLY_DISP | reply_type), + kGUARD_EXC_STRICT_REPLY); + } + return MACH_SEND_INVALID_REPLY; + } + /* * Handle combinations of validating destination and reply; along * with copying in destination, reply, and voucher in an atomic way. @@ -2298,6 +2572,8 @@ ipc_kmsg_copyin_header( if (dest_entry == IE_NULL) { goto invalid_dest; } + /* Check if dest port allows immovable send rights to be sent in the kmsg body */ + ipc_kmsg_allow_immovable_send(kmsg, dest_entry); /* * Make sure a future copyin of the reply port will succeed. @@ -2316,7 +2592,7 @@ ipc_kmsg_copyin_header( goto invalid_reply; } assert(dest_entry != reply_entry); /* names are not equal */ - if (!ipc_right_copyin_check(space, reply_name, reply_entry, reply_type)) { + if (!ipc_right_copyin_check_reply(space, reply_name, reply_entry, reply_type)) { goto invalid_reply; } } @@ -2329,14 +2605,13 @@ ipc_kmsg_copyin_header( * the copyins can be blamed on the destination. */ kr = ipc_right_copyin_two(space, dest_name, dest_entry, - dest_type, voucher_type, - &dest_port, &dest_soright, + dest_type, voucher_type, &dest_port, &dest_soright, &release_port); if (kr != KERN_SUCCESS) { assert(kr != KERN_INVALID_CAPABILITY); goto invalid_dest; } - voucher_port = (ipc_port_t)dest_port; + voucher_port = ip_object_to_port(dest_port); /* * could not have been one of these dispositions, @@ -2354,7 +2629,7 @@ ipc_kmsg_copyin_header( kr = ipc_right_copyin(space, reply_name, reply_entry, reply_type, IPC_RIGHT_COPYIN_FLAGS_DEADOK, &reply_port, &reply_soright, - &release_port, &assertcnt); + &release_port, &assertcnt, 0, NULL); assert(assertcnt == 0); assert(kr == KERN_SUCCESS); } @@ -2371,16 +2646,24 @@ ipc_kmsg_copyin_header( if (dest_entry == IE_NULL) { goto invalid_dest; } + ipc_kmsg_allow_immovable_send(kmsg, dest_entry); + reply_entry = dest_entry; assert(reply_type != 0); /* because name not null */ + /* + * Pre-validate that the reply right can be copied in by itself + */ + if (!ipc_right_copyin_check_reply(space, reply_name, reply_entry, reply_type)) { + goto invalid_reply; + } + /* * Do the joint copyin of the dest disposition and * reply disposition from the one entry/port. */ kr = ipc_right_copyin_two(space, dest_name, dest_entry, - dest_type, reply_type, - &dest_port, &dest_soright, + dest_type, reply_type, &dest_port, &dest_soright, &release_port); if (kr == KERN_INVALID_CAPABILITY) { goto invalid_reply; @@ -2420,6 +2703,7 @@ ipc_kmsg_copyin_header( goto invalid_dest; } assert(dest_entry != voucher_entry); + ipc_kmsg_allow_immovable_send(kmsg, dest_entry); /* * Make sure reply port entry is valid before dest copyin. @@ -2435,7 +2719,7 @@ ipc_kmsg_copyin_header( assert(dest_entry != reply_entry); /* names are not equal */ assert(reply_type != 0); /* because reply_name not null */ - if (!ipc_right_copyin_check(space, reply_name, reply_entry, reply_type)) { + if (!ipc_right_copyin_check_reply(space, reply_name, reply_entry, reply_type)) { goto invalid_reply; } } @@ -2444,9 +2728,10 @@ ipc_kmsg_copyin_header( * copyin the destination. */ kr = ipc_right_copyin(space, dest_name, dest_entry, - dest_type, IPC_RIGHT_COPYIN_FLAGS_ALLOW_DEAD_SEND_ONCE, + dest_type, (IPC_RIGHT_COPYIN_FLAGS_ALLOW_IMMOVABLE_SEND | + IPC_RIGHT_COPYIN_FLAGS_ALLOW_DEAD_SEND_ONCE), &dest_port, &dest_soright, - &release_port, &assertcnt); + &release_port, &assertcnt, 0, NULL); assert(assertcnt == 0); if (kr != KERN_SUCCESS) { goto invalid_dest; @@ -2462,12 +2747,12 @@ ipc_kmsg_copyin_header( kr = ipc_right_copyin(space, reply_name, reply_entry, reply_type, IPC_RIGHT_COPYIN_FLAGS_DEADOK, &reply_port, &reply_soright, - &release_port, &assertcnt); + &release_port, &assertcnt, 0, NULL); assert(assertcnt == 0); assert(kr == KERN_SUCCESS); } else { /* convert invalid name to equivalent ipc_object type */ - reply_port = (ipc_object_t)CAST_MACH_NAME_TO_PORT(reply_name); + reply_port = ip_to_object(CAST_MACH_NAME_TO_PORT(reply_name)); } } @@ -2481,11 +2766,11 @@ ipc_kmsg_copyin_header( (ipc_object_t *)&voucher_port, &voucher_soright, &voucher_release_port, - &assertcnt); + &assertcnt, 0, NULL); assert(assertcnt == 0); assert(KERN_SUCCESS == kr); assert(IP_VALID(voucher_port)); - assert(ip_active(voucher_port)); + require_ip_active(voucher_port); } } @@ -2538,7 +2823,7 @@ ipc_kmsg_copyin_header( if (((*optionp & MACH_SEND_NOTIFY) != 0) && dest_type != MACH_MSG_TYPE_PORT_SEND_ONCE && dest_entry != IE_NULL && dest_entry->ie_request != IE_REQ_NONE) { - ipc_port_t dport = (ipc_port_t)dest_port; + ipc_port_t dport = ip_object_to_port(dest_port); assert(dport != IP_NULL); ip_lock(dport); @@ -2577,7 +2862,7 @@ ipc_kmsg_copyin_header( * destination port. */ if (needboost == TRUE) { - ipc_port_t dport = (ipc_port_t)dest_port; + ipc_port_t dport = ip_object_to_port(dest_port); /* dport still locked from above */ if (ipc_port_importance_delta(dport, IPID_OPTION_SENDPOSSIBLE, 1) == FALSE) { @@ -2607,8 +2892,8 @@ ipc_kmsg_copyin_header( } msg->msgh_bits = MACH_MSGH_BITS_SET(dest_type, reply_type, voucher_type, mbits); - msg->msgh_remote_port = (ipc_port_t)dest_port; - msg->msgh_local_port = (ipc_port_t)reply_port; + msg->msgh_remote_port = ip_object_to_port(dest_port); + msg->msgh_local_port = ip_object_to_port(reply_port); /* capture the qos value(s) for the kmsg */ ipc_kmsg_set_qos(kmsg, *optionp, override); @@ -2621,6 +2906,37 @@ ipc_kmsg_copyin_header( ip_release(voucher_release_port); } + if (enforce_strict_reply && MACH_SEND_WITH_STRICT_REPLY(*optionp) && IP_VALID(msg->msgh_local_port)) { + /* + * We've already validated that the reply disposition is a + * [make/move] send-once. Ideally, we should enforce that the + * reply port is also not dead, but XPC asynchronous + * cancellation can make the reply port dead before we + * actually make it to the mach_msg send. + * + * Here, we ensure that if we have a non-dead reply port, then + * the reply port's receive right should not be in-transit, + * and should live in the caller's IPC space. + */ + ipc_port_t rport = msg->msgh_local_port; + ip_lock(rport); + kr = ipc_kmsg_validate_reply_port_locked(rport, *optionp); + ip_unlock(rport); + if (kr != KERN_SUCCESS) { + /* + * no descriptors have been copied in yet, but the + * full header has been copied in: clean it up + */ + ipc_kmsg_clean_partial(kmsg, 0, NULL, 0, 0); + if ((*optionp & MACH_SEND_KERNEL) == 0) { + mach_port_guard_exception(reply_name, 0, + (MPG_FLAGS_STRICT_REPLY_INVALID_REPLY_PORT | kr), + kGUARD_EXC_STRICT_REPLY); + } + return MACH_SEND_INVALID_REPLY; + } + } + return MACH_MSG_SUCCESS; invalid_reply: @@ -2655,19 +2971,7 @@ invalid_dest: return MACH_SEND_INVALID_DEST; } -mach_msg_descriptor_t *ipc_kmsg_copyin_port_descriptor( - volatile mach_msg_port_descriptor_t *dsc, - mach_msg_legacy_port_descriptor_t *user_dsc, - ipc_space_t space, - ipc_object_t dest, - ipc_kmsg_t kmsg, - mach_msg_option_t *optionp, - mach_msg_return_t *mr); - -void ipc_print_type_name( - int type_name); - -mach_msg_descriptor_t * +static mach_msg_descriptor_t * ipc_kmsg_copyin_port_descriptor( volatile mach_msg_port_descriptor_t *dsc, mach_msg_legacy_port_descriptor_t *user_dsc_in, @@ -2688,9 +2992,9 @@ ipc_kmsg_copyin_port_descriptor( name = (mach_port_name_t)user_dsc->name; if (MACH_PORT_VALID(name)) { - kern_return_t kr = ipc_object_copyin(space, name, user_disp, &object); + kern_return_t kr = ipc_object_copyin(space, name, user_disp, &object, 0, NULL, kmsg->ikm_flags); if (kr != KERN_SUCCESS) { - if ((*optionp & MACH_SEND_KERNEL) == 0) { + if (((*optionp & MACH_SEND_KERNEL) == 0) && (kr == KERN_INVALID_RIGHT)) { mach_port_guard_exception(name, 0, 0, kGUARD_EXC_SEND_INVALID_RIGHT); } *mr = MACH_SEND_INVALID_RIGHT; @@ -2698,34 +3002,23 @@ ipc_kmsg_copyin_port_descriptor( } if ((result_disp == MACH_MSG_TYPE_PORT_RECEIVE) && - ipc_port_check_circularity((ipc_port_t) object, - (ipc_port_t) dest)) { + ipc_port_check_circularity(ip_object_to_port(object), + ip_object_to_port(dest))) { kmsg->ikm_header->msgh_bits |= MACH_MSGH_BITS_CIRCULAR; } - dsc->name = (ipc_port_t) object; + dsc->name = ip_object_to_port(object); } else { dsc->name = CAST_MACH_NAME_TO_PORT(name); } dsc->disposition = result_disp; dsc->type = MACH_MSG_PORT_DESCRIPTOR; - dsc->pad_end = 0; // debug, unnecessary + dsc->pad_end = 0; // debug, unnecessary return (mach_msg_descriptor_t *)(user_dsc_in + 1); } -mach_msg_descriptor_t * ipc_kmsg_copyin_ool_descriptor( - mach_msg_ool_descriptor_t *dsc, - mach_msg_descriptor_t *user_dsc, - int is_64bit, - vm_offset_t *paddr, - vm_map_copy_t *copy, - vm_size_t *space_needed, - vm_map_t map, - mach_msg_option_t *optionp, - mach_msg_return_t *mr); - -mach_msg_descriptor_t * +static mach_msg_descriptor_t * ipc_kmsg_copyin_ool_descriptor( mach_msg_ool_descriptor_t *dsc, mach_msg_descriptor_t *user_dsc, @@ -2828,18 +3121,7 @@ ipc_kmsg_copyin_ool_descriptor( return user_dsc; } -mach_msg_descriptor_t * ipc_kmsg_copyin_ool_ports_descriptor( - mach_msg_ool_ports_descriptor_t *dsc, - mach_msg_descriptor_t *user_dsc, - int is_64bit, - vm_map_t map, - ipc_space_t space, - ipc_object_t dest, - ipc_kmsg_t kmsg, - mach_msg_option_t *optionp, - mach_msg_return_t *mr); - -mach_msg_descriptor_t * +static mach_msg_descriptor_t * ipc_kmsg_copyin_ool_ports_descriptor( mach_msg_ool_ports_descriptor_t *dsc, mach_msg_descriptor_t *user_dsc, @@ -2950,11 +3232,11 @@ ipc_kmsg_copyin_ool_ports_descriptor( ipc_object_t object; if (!MACH_PORT_VALID(name)) { - objects[i] = (ipc_object_t)CAST_MACH_NAME_TO_PORT(name); + objects[i] = ip_to_object(CAST_MACH_NAME_TO_PORT(name)); continue; } - kern_return_t kr = ipc_object_copyin(space, name, user_disp, &object); + kern_return_t kr = ipc_object_copyin(space, name, user_disp, &object, 0, NULL, kmsg->ikm_flags); if (kr != KERN_SUCCESS) { unsigned int j; @@ -2967,7 +3249,7 @@ ipc_kmsg_copyin_ool_ports_descriptor( } kfree(data, ports_length); dsc->address = NULL; - if ((*optionp & MACH_SEND_KERNEL) == 0) { + if (((*optionp & MACH_SEND_KERNEL) == 0) && (kr == KERN_INVALID_RIGHT)) { mach_port_guard_exception(name, 0, 0, kGUARD_EXC_SEND_INVALID_RIGHT); } *mr = MACH_SEND_INVALID_RIGHT; @@ -2975,9 +3257,8 @@ ipc_kmsg_copyin_ool_ports_descriptor( } if ((dsc->disposition == MACH_MSG_TYPE_PORT_RECEIVE) && - ipc_port_check_circularity( - (ipc_port_t) object, - (ipc_port_t) dest)) { + ipc_port_check_circularity(ip_object_to_port(object), + ip_object_to_port(dest))) { kmsg->ikm_header->msgh_bits |= MACH_MSGH_BITS_CIRCULAR; } @@ -2987,6 +3268,74 @@ ipc_kmsg_copyin_ool_ports_descriptor( return user_dsc; } +static mach_msg_descriptor_t * +ipc_kmsg_copyin_guarded_port_descriptor( + mach_msg_guarded_port_descriptor_t *dsc, + mach_msg_descriptor_t *user_addr, + int is_64bit, + ipc_space_t space, + ipc_object_t dest, + ipc_kmsg_t kmsg, + mach_msg_option_t *optionp, + mach_msg_return_t *mr) +{ + mach_msg_descriptor_t *user_dsc; + mach_msg_type_name_t disp; + mach_msg_type_name_t result_disp; + mach_port_name_t name; + mach_msg_guard_flags_t guard_flags; + ipc_object_t object; + mach_port_context_t context; + + if (!is_64bit) { + mach_msg_guarded_port_descriptor32_t *user_gp_dsc = (typeof(user_gp_dsc))user_addr; + name = user_gp_dsc->name; + guard_flags = user_gp_dsc->flags; + disp = user_gp_dsc->disposition; + context = user_gp_dsc->context; + user_dsc = (mach_msg_descriptor_t *)(user_gp_dsc + 1); + } else { + mach_msg_guarded_port_descriptor64_t *user_gp_dsc = (typeof(user_gp_dsc))user_addr; + name = user_gp_dsc->name; + guard_flags = user_gp_dsc->flags; + disp = user_gp_dsc->disposition; + context = user_gp_dsc->context; + user_dsc = (mach_msg_descriptor_t *)(user_gp_dsc + 1); + } + + guard_flags &= MACH_MSG_GUARD_FLAGS_MASK; + result_disp = ipc_object_copyin_type(disp); + + if (MACH_PORT_VALID(name)) { + kern_return_t kr = ipc_object_copyin(space, name, disp, &object, context, &guard_flags, kmsg->ikm_flags); + if (kr != KERN_SUCCESS) { + if (((*optionp & MACH_SEND_KERNEL) == 0) && (kr == KERN_INVALID_RIGHT)) { + mach_port_guard_exception(name, 0, 0, kGUARD_EXC_SEND_INVALID_RIGHT); + } + *mr = MACH_SEND_INVALID_RIGHT; + return NULL; + } + + if ((result_disp == MACH_MSG_TYPE_PORT_RECEIVE) && + ipc_port_check_circularity(ip_object_to_port(object), + ip_object_to_port(dest))) { + kmsg->ikm_header->msgh_bits |= MACH_MSGH_BITS_CIRCULAR; + } + dsc->name = ip_object_to_port(object); + } else { + dsc->name = CAST_MACH_NAME_TO_PORT(name); + } + dsc->flags = guard_flags; + dsc->disposition = result_disp; + dsc->type = MACH_MSG_GUARDED_PORT_DESCRIPTOR; + +#if __LP64__ + dsc->pad_end = 0; // debug, unnecessary +#endif + return user_dsc; +} + + /* * Routine: ipc_kmsg_copyin_body * Purpose: @@ -3007,6 +3356,7 @@ ipc_kmsg_copyin_ool_ports_descriptor( * MACH_SEND_MSG_TOO_SMALL Body is too small for types/data. * MACH_SEND_INVALID_RT_OOL_SIZE OOL Buffer too large for RT * MACH_MSG_INVALID_RT_DESCRIPTOR Dealloc and RT are incompatible + * MACH_SEND_NO_GRANT_DEST Dest port doesn't accept ports in body */ mach_msg_return_t @@ -3018,27 +3368,33 @@ ipc_kmsg_copyin_body( { ipc_object_t dest; mach_msg_body_t *body; - mach_msg_descriptor_t *daddr, *naddr; + mach_msg_descriptor_t *daddr, *naddr, *end; mach_msg_descriptor_t *user_addr, *kern_addr; mach_msg_type_number_t dsc_count; boolean_t is_task_64bit = (map->max_offset > VM_MAX_ADDRESS); boolean_t complex = FALSE; + boolean_t contains_port_desc = FALSE; vm_size_t space_needed = 0; vm_offset_t paddr = 0; vm_map_copy_t copy = VM_MAP_COPY_NULL; mach_msg_type_number_t i; mach_msg_return_t mr = MACH_MSG_SUCCESS; + ipc_port_t remote_port = kmsg->ikm_header->msgh_remote_port; vm_size_t descriptor_size = 0; mach_msg_type_number_t total_ool_port_count = 0; + mach_msg_guard_flags_t guard_flags = 0; + mach_port_context_t context; + mach_msg_type_name_t disp; /* * Determine if the target is a kernel port. */ - dest = (ipc_object_t) kmsg->ikm_header->msgh_remote_port; + dest = ip_to_object(remote_port); body = (mach_msg_body_t *) (kmsg->ikm_header + 1); naddr = (mach_msg_descriptor_t *) (body + 1); + end = (mach_msg_descriptor_t *) ((vm_offset_t)kmsg->ikm_header + kmsg->ikm_header->msgh_size); dsc_count = body->msgh_descriptor_count; if (dsc_count == 0) { @@ -3059,10 +3415,16 @@ ipc_kmsg_copyin_body( /* make sure the descriptor fits in the message */ if (is_task_64bit) { + if ((mach_msg_descriptor_t*)((vm_offset_t)daddr + 12) > end) { + mr = MACH_SEND_MSG_TOO_SMALL; + goto clean_message; + } + switch (daddr->type.type) { case MACH_MSG_OOL_DESCRIPTOR: case MACH_MSG_OOL_VOLATILE_DESCRIPTOR: case MACH_MSG_OOL_PORTS_DESCRIPTOR: + case MACH_MSG_GUARDED_PORT_DESCRIPTOR: descriptor_size += 16; naddr = (typeof(naddr))((vm_offset_t)daddr + 16); break; @@ -3076,8 +3438,7 @@ ipc_kmsg_copyin_body( naddr = (typeof(naddr))((vm_offset_t)daddr + 12); } - if (naddr > (mach_msg_descriptor_t *) - ((vm_offset_t)kmsg->ikm_header + kmsg->ikm_header->msgh_size)) { + if (naddr > end) { mr = MACH_SEND_MSG_TOO_SMALL; goto clean_message; } @@ -3125,6 +3486,7 @@ ipc_kmsg_copyin_body( mr = MACH_SEND_TOO_LARGE; goto clean_message; } + contains_port_desc = TRUE; break; case MACH_MSG_OOL_PORTS_DESCRIPTOR: ool_port_count = (is_task_64bit) ? @@ -3142,6 +3504,35 @@ ipc_kmsg_copyin_body( mr = MACH_SEND_TOO_LARGE; goto clean_message; } + contains_port_desc = TRUE; + break; + case MACH_MSG_GUARDED_PORT_DESCRIPTOR: + guard_flags = (is_task_64bit) ? + ((mach_msg_guarded_port_descriptor64_t *)daddr)->flags : + ((mach_msg_guarded_port_descriptor32_t *)daddr)->flags; + context = (is_task_64bit) ? + ((mach_msg_guarded_port_descriptor64_t *)daddr)->context : + ((mach_msg_guarded_port_descriptor32_t *)daddr)->context; + disp = (is_task_64bit) ? + ((mach_msg_guarded_port_descriptor64_t *)daddr)->disposition : + ((mach_msg_guarded_port_descriptor32_t *)daddr)->disposition; + + /* Only MACH_MSG_TYPE_MOVE_RECEIVE is supported for now */ + if (!guard_flags || ((guard_flags & ~MACH_MSG_GUARD_FLAGS_MASK) != 0) || + ((guard_flags & MACH_MSG_GUARD_FLAGS_UNGUARDED_ON_SEND) && (context != 0)) || + (disp != MACH_MSG_TYPE_MOVE_RECEIVE)) { + /* + * Invalid guard flags, context or disposition + */ + mr = MACH_SEND_INVALID_TYPE; + goto clean_message; + } + if (os_add_overflow(total_ool_port_count, 1, &total_ool_port_count)) { + /* Overflow detected */ + mr = MACH_SEND_TOO_LARGE; + goto clean_message; + } + contains_port_desc = TRUE; break; } } @@ -3152,6 +3543,16 @@ ipc_kmsg_copyin_body( goto clean_message; } + /* + * Check if dest is a no-grant port; Since this bit is set only on + * port construction and cannot be unset later, we can peek at the + * bit without paying the cost of locking the port. + */ + if (contains_port_desc && remote_port->ip_no_grant) { + mr = MACH_SEND_NO_GRANT_DEST; + goto clean_message; + } + /* * Allocate space in the pageable kernel ipc copy map for all the * ool data that is to be physically copied. Map is marked wait for @@ -3168,7 +3569,8 @@ ipc_kmsg_copyin_body( /* user_addr = just after base as it was copied in */ user_addr = (mach_msg_descriptor_t *)((vm_offset_t)kmsg->ikm_header + sizeof(mach_msg_base_t)); - /* Shift the mach_msg_base_t down to make room for dsc_count*16bytes of descriptors */ + /* Shift the mach_msg_base_t down to make room for dsc_count*16bytes of descriptors on 64 bit kernels + */ if (descriptor_size != 16 * dsc_count) { vm_offset_t dsc_adjust = 16 * dsc_count - descriptor_size; @@ -3205,6 +3607,12 @@ ipc_kmsg_copyin_body( kern_addr++; complex = TRUE; break; + case MACH_MSG_GUARDED_PORT_DESCRIPTOR: + user_addr = ipc_kmsg_copyin_guarded_port_descriptor((mach_msg_guarded_port_descriptor_t *)kern_addr, + user_addr, is_task_64bit, space, dest, kmsg, optionp, &mr); + kern_addr++; + complex = TRUE; + break; default: /* Invalid descriptor */ mr = MACH_SEND_INVALID_TYPE; @@ -3218,7 +3626,7 @@ ipc_kmsg_copyin_body( paddr, space_needed); goto out; } - } /* End of loop */ + } /* End of loop */ if (!complex) { kmsg->ikm_header->msgh_bits &= ~MACH_MSGH_BITS_COMPLEX; @@ -3295,7 +3703,6 @@ ipc_kmsg_copyin( } mr = ipc_kmsg_copyin_body( kmsg, space, map, optionp); - /* unreachable if !DEBUG */ __unreachable_ok_push if (DEBUG_KPRINT_SYSCALL_PREDICATE(DEBUG_KPRINT_SYSCALL_IPC_MASK)) { @@ -3330,8 +3737,9 @@ ipc_kmsg_copyin_from_kernel( mach_msg_bits_t bits = kmsg->ikm_header->msgh_bits; mach_msg_type_name_t rname = MACH_MSGH_BITS_REMOTE(bits); mach_msg_type_name_t lname = MACH_MSGH_BITS_LOCAL(bits); - ipc_object_t remote = (ipc_object_t) kmsg->ikm_header->msgh_remote_port; - ipc_object_t local = (ipc_object_t) kmsg->ikm_header->msgh_local_port; + ipc_object_t remote = ip_to_object(kmsg->ikm_header->msgh_remote_port); + ipc_object_t local = ip_to_object(kmsg->ikm_header->msgh_local_port); + ipc_port_t dest = kmsg->ikm_header->msgh_remote_port; /* translate the destination and reply ports */ if (!IO_VALID(remote)) { @@ -3364,6 +3772,30 @@ ipc_kmsg_copyin_from_kernel( return MACH_MSG_SUCCESS; } } + + /* + * Check if the remote port accepts ports in the body. + */ + if (dest->ip_no_grant) { + mach_msg_descriptor_t *saddr; + mach_msg_body_t *body; + mach_msg_type_number_t i, count; + + body = (mach_msg_body_t *) (kmsg->ikm_header + 1); + saddr = (mach_msg_descriptor_t *) (body + 1); + count = body->msgh_descriptor_count; + + for (i = 0; i < count; i++, saddr++) { + switch (saddr->type.type) { + case MACH_MSG_PORT_DESCRIPTOR: + case MACH_MSG_OOL_PORTS_DESCRIPTOR: + case MACH_MSG_GUARDED_PORT_DESCRIPTOR: + /* no descriptors have been copied in yet */ + ipc_kmsg_clean_partial(kmsg, 0, NULL, 0, 0); + return MACH_SEND_NO_GRANT_DEST; + } + } + } { mach_msg_descriptor_t *saddr; mach_msg_body_t *body; @@ -3384,7 +3816,7 @@ ipc_kmsg_copyin_from_kernel( /* this is really the type SEND, SEND_ONCE, etc. */ name = dsc->disposition; - object = (ipc_object_t) dsc->name; + object = ip_to_object(dsc->name); dsc->disposition = ipc_object_copyin_type(name); if (!IO_VALID(object)) { @@ -3398,10 +3830,10 @@ ipc_kmsg_copyin_from_kernel( /* assert when the new kobject model is in place since*/ /* ports will not be used in kernel to kernel chats */ - if (((ipc_port_t)remote)->ip_receiver != ipc_space_kernel) { + if (ip_object_to_port(remote)->ip_receiver != ipc_space_kernel) { if ((dsc->disposition == MACH_MSG_TYPE_PORT_RECEIVE) && - ipc_port_check_circularity((ipc_port_t) object, - (ipc_port_t) remote)) { + ipc_port_check_circularity(ip_object_to_port(object), + ip_object_to_port(remote))) { kmsg->ikm_header->msgh_bits |= MACH_MSGH_BITS_CIRCULAR; } @@ -3440,9 +3872,36 @@ ipc_kmsg_copyin_from_kernel( ipc_object_copyin_from_kernel(object, name); if ((dsc->disposition == MACH_MSG_TYPE_PORT_RECEIVE) && - ipc_port_check_circularity( - (ipc_port_t) object, - (ipc_port_t) remote)) { + ipc_port_check_circularity(ip_object_to_port(object), + ip_object_to_port(remote))) { + kmsg->ikm_header->msgh_bits |= MACH_MSGH_BITS_CIRCULAR; + } + } + break; + } + case MACH_MSG_GUARDED_PORT_DESCRIPTOR: { + mach_msg_guarded_port_descriptor_t *dsc = (typeof(dsc)) & saddr->guarded_port; + mach_msg_type_name_t disp = dsc->disposition; + ipc_object_t object = ip_to_object(dsc->name); + dsc->disposition = ipc_object_copyin_type(disp); + assert(dsc->flags == 0); + + if (!IO_VALID(object)) { + break; + } + + ipc_object_copyin_from_kernel(object, disp); + /* + * avoid circularity when the destination is also + * the kernel. This check should be changed into an + * assert when the new kobject model is in place since + * ports will not be used in kernel to kernel chats + */ + + if (ip_object_to_port(remote)->ip_receiver != ipc_space_kernel) { + if ((dsc->disposition == MACH_MSG_TYPE_PORT_RECEIVE) && + ipc_port_check_circularity(ip_object_to_port(object), + ip_object_to_port(remote))) { kmsg->ikm_header->msgh_bits |= MACH_MSGH_BITS_CIRCULAR; } } @@ -3467,8 +3926,9 @@ ipc_kmsg_copyin_from_kernel_legacy( mach_msg_bits_t bits = kmsg->ikm_header->msgh_bits; mach_msg_type_name_t rname = MACH_MSGH_BITS_REMOTE(bits); mach_msg_type_name_t lname = MACH_MSGH_BITS_LOCAL(bits); - ipc_object_t remote = (ipc_object_t) kmsg->ikm_header->msgh_remote_port; - ipc_object_t local = (ipc_object_t) kmsg->ikm_header->msgh_local_port; + ipc_object_t remote = ip_to_object(kmsg->ikm_header->msgh_remote_port); + ipc_object_t local = ip_to_object(kmsg->ikm_header->msgh_local_port); + ipc_port_t dest = kmsg->ikm_header->msgh_remote_port; /* translate the destination and reply ports */ if (!IO_VALID(remote)) { @@ -3501,6 +3961,28 @@ ipc_kmsg_copyin_from_kernel_legacy( return MACH_MSG_SUCCESS; } } + + if (dest->ip_no_grant) { + mach_msg_descriptor_t *saddr; + mach_msg_body_t *body; + mach_msg_type_number_t i, count; + + body = (mach_msg_body_t *) (kmsg->ikm_header + 1); + saddr = (mach_msg_descriptor_t *) (body + 1); + count = body->msgh_descriptor_count; + + for (i = 0; i < count; i++, saddr++) { + switch (saddr->type.type) { + case MACH_MSG_PORT_DESCRIPTOR: + case MACH_MSG_OOL_PORTS_DESCRIPTOR: + case MACH_MSG_GUARDED_PORT_DESCRIPTOR: + /* no descriptors have been copied in yet */ + ipc_kmsg_clean_partial(kmsg, 0, NULL, 0, 0); + return MACH_SEND_NO_GRANT_DEST; + } + } + } + { mach_msg_legacy_descriptor_t *saddr; mach_msg_descriptor_t *daddr; @@ -3533,9 +4015,9 @@ ipc_kmsg_copyin_from_kernel_legacy( /* this is really the type SEND, SEND_ONCE, etc. */ name = dsc->disposition; - object = (ipc_object_t) CAST_MACH_NAME_TO_PORT(dsc->name); + object = ip_to_object(CAST_MACH_NAME_TO_PORT(dsc->name)); dest_dsc->disposition = ipc_object_copyin_type(name); - dest_dsc->name = (mach_port_t)object; + dest_dsc->name = ip_object_to_port(object); dest_dsc->type = MACH_MSG_PORT_DESCRIPTOR; if (!IO_VALID(object)) { @@ -3549,10 +4031,10 @@ ipc_kmsg_copyin_from_kernel_legacy( /* assert when the new kobject model is in place since*/ /* ports will not be used in kernel to kernel chats */ - if (((ipc_port_t)remote)->ip_receiver != ipc_space_kernel) { + if (ip_object_to_port(remote)->ip_receiver != ipc_space_kernel) { if ((dest_dsc->disposition == MACH_MSG_TYPE_PORT_RECEIVE) && - ipc_port_check_circularity((ipc_port_t) object, - (ipc_port_t) remote)) { + ipc_port_check_circularity(ip_object_to_port(object), + ip_object_to_port(remote))) { kmsg->ikm_header->msgh_bits |= MACH_MSGH_BITS_CIRCULAR; } @@ -3610,9 +4092,8 @@ ipc_kmsg_copyin_from_kernel_legacy( ipc_object_copyin_from_kernel(object, name); if ((disposition == MACH_MSG_TYPE_PORT_RECEIVE) && - ipc_port_check_circularity( - (ipc_port_t) object, - (ipc_port_t) remote)) { + ipc_port_check_circularity(ip_object_to_port(object), + ip_object_to_port(remote))) { kmsg->ikm_header->msgh_bits |= MACH_MSGH_BITS_CIRCULAR; } } @@ -3625,6 +4106,46 @@ ipc_kmsg_copyin_from_kernel_legacy( dest_dsc->count = port_count; break; } + case MACH_MSG_GUARDED_PORT_DESCRIPTOR: { + mach_msg_type_name_t disp; + ipc_object_t object; + mach_msg_guarded_port_descriptor32_t *dsc; + mach_msg_guarded_port_descriptor_t *dest_dsc; + + dsc = (typeof(dsc)) & saddr->guarded_port32; + dest_dsc = &daddr->guarded_port; + + disp = dsc->disposition; + object = ip_to_object(CAST_MACH_NAME_TO_PORT(dsc->name)); + assert(dsc->flags == 0); + assert(dsc->context == 0); + + dest_dsc->disposition = ipc_object_copyin_type(disp); + dest_dsc->name = ip_object_to_port(object); + dest_dsc->type = MACH_MSG_GUARDED_PORT_DESCRIPTOR; + dest_dsc->flags = 0; + + if (!IO_VALID(object)) { + break; + } + + ipc_object_copyin_from_kernel(object, disp); + + /* CDY avoid circularity when the destination is also */ + /* the kernel. This check should be changed into an */ + /* assert when the new kobject model is in place since*/ + /* ports will not be used in kernel to kernel chats */ + + if (ip_object_to_port(remote)->ip_receiver != ipc_space_kernel) { + if ((dest_dsc->disposition == MACH_MSG_TYPE_PORT_RECEIVE) && + ipc_port_check_circularity(ip_object_to_port(object), + ip_object_to_port(remote))) { + kmsg->ikm_header->msgh_bits |= + MACH_MSGH_BITS_CIRCULAR; + } + } + break; + } default: { #if MACH_ASSERT panic("ipc_kmsg_copyin_from_kernel: bad descriptor"); @@ -3671,7 +4192,7 @@ ipc_kmsg_copyout_header( { mach_msg_header_t *msg = kmsg->ikm_header; mach_msg_bits_t mbits = msg->msgh_bits; - ipc_port_t dest = (ipc_port_t) msg->msgh_remote_port; + ipc_port_t dest = msg->msgh_remote_port; assert(IP_VALID(dest)); @@ -3744,12 +4265,14 @@ ipc_kmsg_copyout_header( /* Is there already an entry we can use? */ if ((reply_type != MACH_MSG_TYPE_PORT_SEND_ONCE) && - ipc_right_reverse(space, (ipc_object_t) reply, &reply_name, &entry)) { + ipc_right_reverse(space, ip_to_object(reply), &reply_name, &entry)) { /* reply port is locked and active */ assert(entry->ie_bits & MACH_PORT_TYPE_SEND_RECEIVE); } else { ip_lock(reply); if (!ip_active(reply)) { + /* clear the context value */ + reply->ip_reply_context = 0; ip_unlock(reply); release_reply_port = reply; @@ -3764,14 +4287,42 @@ ipc_kmsg_copyout_header( ipc_entry_claim(space, &reply_name, &entry); assert(IE_BITS_TYPE(entry->ie_bits) == MACH_PORT_TYPE_NONE); assert(entry->ie_object == IO_NULL); - entry->ie_object = (ipc_object_t) reply; + entry->ie_object = ip_to_object(reply); } /* space and reply port are locked and active */ - ip_reference(reply); /* hold onto the reply port */ + ip_reference(reply); /* hold onto the reply port */ + + /* + * If the receiver would like to enforce strict reply + * semantics, and the message looks like it expects a reply, + * and contains a voucher, then link the context in the + * voucher with the reply port so that the next message sent + * to the reply port must come from a thread that has a + * matching context (voucher). + */ + if (enforce_strict_reply && MACH_RCV_WITH_STRICT_REPLY(option) && IP_VALID(voucher)) { + if (ipc_kmsg_validate_reply_port_locked(reply, option) != KERN_SUCCESS) { + /* if the receiver isn't happy with the reply port: fail the receive. */ + ip_unlock(reply); + ipc_entry_dealloc(space, reply_name, entry); + is_write_unlock(space); + ip_release(reply); + return MACH_RCV_INVALID_REPLY; + } + ipc_kmsg_link_reply_context_locked(reply, voucher); + } else { + /* + * if the receive did not choose to participate + * in the strict reply/RPC, then don't enforce + * anything (as this could lead to booby-trapped + * messages that kill the server). + */ + reply->ip_reply_context = 0; + } kr = ipc_right_copyout(space, reply_name, entry, - reply_type, TRUE, (ipc_object_t) reply); + reply_type, NULL, NULL, ip_to_object(reply)); assert(kr == KERN_SUCCESS); /* reply port is unlocked */ } else { @@ -3798,7 +4349,7 @@ done_with_reply: if ((option & MACH_RCV_VOUCHER) != 0) { ipc_entry_t entry; - if (ipc_right_reverse(space, (ipc_object_t) voucher, + if (ipc_right_reverse(space, ip_to_object(voucher), &voucher_name, &entry)) { /* voucher port locked */ assert(entry->ie_bits & MACH_PORT_TYPE_SEND); @@ -3808,16 +4359,15 @@ done_with_reply: ipc_entry_claim(space, &voucher_name, &entry); assert(IE_BITS_TYPE(entry->ie_bits) == MACH_PORT_TYPE_NONE); assert(entry->ie_object == IO_NULL); - entry->ie_object = (ipc_object_t) voucher; + entry->ie_object = ip_to_object(voucher); ip_lock(voucher); } /* space is locked and active */ - - assert(ip_active(voucher)); + require_ip_active(voucher); assert(ip_kotype(voucher) == IKOT_VOUCHER); kr = ipc_right_copyout(space, voucher_name, entry, - MACH_MSG_TYPE_MOVE_SEND, TRUE, - (ipc_object_t) voucher); + MACH_MSG_TYPE_MOVE_SEND, NULL, NULL, + ip_to_object(voucher)); /* voucher port is unlocked */ } else { voucher_type = MACH_MSGH_BITS_ZERO; @@ -3909,7 +4459,7 @@ done_with_voucher: */ if (ip_active(dest)) { - ipc_object_copyout_dest(space, (ipc_object_t) dest, + ipc_object_copyout_dest(space, ip_to_object(dest), dest_type, &dest_name); /* dest is unlocked */ } else { @@ -3946,11 +4496,6 @@ done_with_voucher: } } - if (IP_VALID(release_voucher_port)) { - ipc_port_release_send(release_voucher_port); - } - - if ((option & MACH_RCV_VOUCHER) != 0) { KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_IPC, MACH_IPC_MSG_RECV) | DBG_FUNC_NONE, VM_KERNEL_ADDRPERM((uintptr_t)kmsg), @@ -3967,6 +4512,10 @@ done_with_voucher: 0); } + if (IP_VALID(release_voucher_port)) { + ipc_port_release_send(release_voucher_port); + } + msg->msgh_bits = MACH_MSGH_BITS_SET(reply_type, dest_type, voucher_type, mbits); msg->msgh_local_port = CAST_MACH_NAME_TO_PORT(dest_name); @@ -3999,6 +4548,8 @@ ipc_kmsg_copyout_object( ipc_space_t space, ipc_object_t object, mach_msg_type_name_t msgt_name, + mach_port_context_t *context, + mach_msg_guard_flags_t *guard_flags, mach_port_name_t *namep) { kern_return_t kr; @@ -4008,7 +4559,7 @@ ipc_kmsg_copyout_object( return MACH_MSG_SUCCESS; } - kr = ipc_object_copyout(space, object, msgt_name, TRUE, namep); + kr = ipc_object_copyout(space, object, msgt_name, context, guard_flags, namep); if (kr != KERN_SUCCESS) { ipc_object_destroy(object, msgt_name); @@ -4028,12 +4579,7 @@ ipc_kmsg_copyout_object( return MACH_MSG_SUCCESS; } -mach_msg_descriptor_t * -ipc_kmsg_copyout_port_descriptor(mach_msg_descriptor_t *dsc, - mach_msg_descriptor_t *user_dsc, - ipc_space_t space, - kern_return_t *mr); -mach_msg_descriptor_t * +static mach_msg_descriptor_t * ipc_kmsg_copyout_port_descriptor(mach_msg_descriptor_t *dsc, mach_msg_descriptor_t *dest_dsc, ipc_space_t space, @@ -4043,18 +4589,15 @@ ipc_kmsg_copyout_port_descriptor(mach_msg_descriptor_t *dsc, mach_port_name_t name; mach_msg_type_name_t disp; - /* Copyout port right carried in the message */ port = dsc->port.name; disp = dsc->port.disposition; *mr |= ipc_kmsg_copyout_object(space, - (ipc_object_t)port, - disp, - &name); + ip_to_object(port), disp, NULL, NULL, &name); if (current_task() == kernel_task) { mach_msg_port_descriptor_t *user_dsc = (typeof(user_dsc))dest_dsc; - user_dsc--; // point to the start of this port descriptor + user_dsc--; // point to the start of this port descriptor bzero((void *)user_dsc, sizeof(*user_dsc)); user_dsc->name = CAST_MACH_NAME_TO_PORT(name); user_dsc->disposition = disp; @@ -4062,7 +4605,7 @@ ipc_kmsg_copyout_port_descriptor(mach_msg_descriptor_t *dsc, dest_dsc = (typeof(dest_dsc))user_dsc; } else { mach_msg_legacy_port_descriptor_t *user_dsc = (typeof(user_dsc))dest_dsc; - user_dsc--; // point to the start of this port descriptor + user_dsc--; // point to the start of this port descriptor bzero((void *)user_dsc, sizeof(*user_dsc)); user_dsc->name = CAST_MACH_PORT_TO_NAME(name); user_dsc->disposition = disp; @@ -4252,16 +4795,16 @@ ipc_kmsg_copyout_ool_ports_descriptor(mach_msg_ool_ports_descriptor_t *dsc, * for those rights out to user-space. */ if (rcv_addr != 0) { - mach_port_t *objects = (mach_port_t *) dsc->address; + ipc_object_t *objects = (ipc_object_t *) dsc->address; mach_port_name_t *names = (mach_port_name_t *) dsc->address; /* copyout port rights carried in the message */ for (i = 0; i < count; i++) { - ipc_object_t object = (ipc_object_t)objects[i]; + ipc_object_t object = objects[i]; *mr |= ipc_kmsg_copyout_object(space, object, - disp, &names[i]); + disp, NULL, NULL, &names[i]); } /* copyout to memory allocated above */ @@ -4325,6 +4868,81 @@ ipc_kmsg_copyout_ool_ports_descriptor(mach_msg_ool_ports_descriptor_t *dsc, return user_dsc; } +static mach_msg_descriptor_t * +ipc_kmsg_copyout_guarded_port_descriptor( + mach_msg_guarded_port_descriptor_t *dsc, + mach_msg_descriptor_t *dest_dsc, + int is_64bit, + __unused ipc_kmsg_t kmsg, + ipc_space_t space, + mach_msg_option_t option, + kern_return_t *mr) +{ + mach_port_t port; + mach_port_name_t name = MACH_PORT_NULL; + mach_msg_type_name_t disp; + mach_msg_guard_flags_t guard_flags; + mach_port_context_t context; + + /* Copyout port right carried in the message */ + port = dsc->name; + disp = dsc->disposition; + guard_flags = dsc->flags; + context = 0; + + /* Currently kernel_task doesnt support receiving guarded port descriptors */ + struct knote *kn = current_thread()->ith_knote; + if ((kn != ITH_KNOTE_PSEUDO) && (((option & MACH_RCV_GUARDED_DESC) == 0) || + (current_task() == kernel_task))) { +#if DEVELOPMENT || DEBUG + if (current_task() != kernel_task) { + /* + * Simulated crash needed for debugging, notifies the receiver to opt into receiving + * guarded descriptors. + */ + mach_port_guard_exception(current_thread()->ith_receiver_name, 0, 0, kGUARD_EXC_RCV_GUARDED_DESC); + } +#endif + KDBG(MACHDBG_CODE(DBG_MACH_IPC, MACH_IPC_DESTROY_GUARDED_DESC), current_thread()->ith_receiver_name, + VM_KERNEL_ADDRPERM(port), disp, guard_flags); + ipc_object_destroy(ip_to_object(port), disp); + mach_msg_legacy_port_descriptor_t *user_dsc = (typeof(user_dsc))dest_dsc; + user_dsc--; // point to the start of this port descriptor + bzero((void *)user_dsc, sizeof(*user_dsc)); + user_dsc->name = name; + user_dsc->disposition = disp; + user_dsc->type = MACH_MSG_PORT_DESCRIPTOR; + dest_dsc = (typeof(dest_dsc))user_dsc; + } else { + *mr |= ipc_kmsg_copyout_object(space, + ip_to_object(port), disp, &context, &guard_flags, &name); + + if (!is_64bit) { + mach_msg_guarded_port_descriptor32_t *user_dsc = (typeof(user_dsc))dest_dsc; + user_dsc--; // point to the start of this port descriptor + bzero((void *)user_dsc, sizeof(*user_dsc)); + user_dsc->name = name; + user_dsc->flags = guard_flags; + user_dsc->disposition = disp; + user_dsc->type = MACH_MSG_GUARDED_PORT_DESCRIPTOR; + user_dsc->context = CAST_DOWN_EXPLICIT(uint32_t, context); + dest_dsc = (typeof(dest_dsc))user_dsc; + } else { + mach_msg_guarded_port_descriptor64_t *user_dsc = (typeof(user_dsc))dest_dsc; + user_dsc--; // point to the start of this port descriptor + bzero((void *)user_dsc, sizeof(*user_dsc)); + user_dsc->name = name; + user_dsc->flags = guard_flags; + user_dsc->disposition = disp; + user_dsc->type = MACH_MSG_GUARDED_PORT_DESCRIPTOR; + user_dsc->context = context; + dest_dsc = (typeof(dest_dsc))user_dsc; + } + } + + return (mach_msg_descriptor_t *)dest_dsc; +} + /* * Routine: ipc_kmsg_copyout_body * Purpose: @@ -4349,6 +4967,7 @@ ipc_kmsg_copyout_body( ipc_kmsg_t kmsg, ipc_space_t space, vm_map_t map, + mach_msg_option_t option, mach_msg_body_t *slist) { mach_msg_body_t *body; @@ -4390,6 +5009,10 @@ ipc_kmsg_copyout_body( user_dsc = ipc_kmsg_copyout_ool_ports_descriptor( (mach_msg_ool_ports_descriptor_t *)&kern_dsc[i], user_dsc, is_task_64bit, map, space, kmsg, &mr); break; + case MACH_MSG_GUARDED_PORT_DESCRIPTOR: + user_dsc = ipc_kmsg_copyout_guarded_port_descriptor( + (mach_msg_guarded_port_descriptor_t *)&kern_dsc[i], user_dsc, is_task_64bit, kmsg, space, option, &mr); + break; default: { panic("untyped IPC copyout body: invalid message descriptor"); } @@ -4448,6 +5071,7 @@ ipc_kmsg_copyout_size( case MACH_MSG_OOL_DESCRIPTOR: case MACH_MSG_OOL_VOLATILE_DESCRIPTOR: case MACH_MSG_OOL_PORTS_DESCRIPTOR: + case MACH_MSG_GUARDED_PORT_DESCRIPTOR: if (!is_task_64bit) { send_size -= DESC_SIZE_ADJUSTMENT; } @@ -4495,7 +5119,7 @@ ipc_kmsg_copyout( } if (kmsg->ikm_header->msgh_bits & MACH_MSGH_BITS_COMPLEX) { - mr = ipc_kmsg_copyout_body(kmsg, space, map, slist); + mr = ipc_kmsg_copyout_body(kmsg, space, map, option, slist); if (mr != MACH_MSG_SUCCESS) { mr |= MACH_RCV_BODY_ERROR; @@ -4533,9 +5157,9 @@ ipc_kmsg_copyout_pseudo( mach_msg_body_t *slist) { mach_msg_bits_t mbits = kmsg->ikm_header->msgh_bits; - ipc_object_t dest = (ipc_object_t) kmsg->ikm_header->msgh_remote_port; - ipc_object_t reply = (ipc_object_t) kmsg->ikm_header->msgh_local_port; - ipc_object_t voucher = (ipc_object_t) kmsg->ikm_voucher; + ipc_object_t dest = ip_to_object(kmsg->ikm_header->msgh_remote_port); + ipc_object_t reply = ip_to_object(kmsg->ikm_header->msgh_local_port); + ipc_object_t voucher = ip_to_object(kmsg->ikm_voucher); mach_msg_type_name_t dest_type = MACH_MSGH_BITS_REMOTE(mbits); mach_msg_type_name_t reply_type = MACH_MSGH_BITS_LOCAL(mbits); mach_msg_type_name_t voucher_type = MACH_MSGH_BITS_VOUCHER(mbits); @@ -4560,8 +5184,8 @@ ipc_kmsg_copyout_pseudo( ipc_importance_assert_clean(kmsg); #endif - mr = (ipc_kmsg_copyout_object(space, dest, dest_type, &dest_name) | - ipc_kmsg_copyout_object(space, reply, reply_type, &reply_name)); + mr = (ipc_kmsg_copyout_object(space, dest, dest_type, NULL, NULL, &dest_name) | + ipc_kmsg_copyout_object(space, reply, reply_type, NULL, NULL, &reply_name)); kmsg->ikm_header->msgh_bits = mbits & MACH_MSGH_BITS_USER; kmsg->ikm_header->msgh_remote_port = CAST_MACH_NAME_TO_PORT(dest_name); @@ -4571,12 +5195,12 @@ ipc_kmsg_copyout_pseudo( assert(voucher_type == MACH_MSG_TYPE_MOVE_SEND); kmsg->ikm_voucher = IP_NULL; - mr |= ipc_kmsg_copyout_object(space, voucher, voucher_type, &voucher_name); + mr |= ipc_kmsg_copyout_object(space, voucher, voucher_type, NULL, NULL, &voucher_name); kmsg->ikm_header->msgh_voucher_port = voucher_name; } if (mbits & MACH_MSGH_BITS_COMPLEX) { - mr |= ipc_kmsg_copyout_body(kmsg, space, map, slist); + mr |= ipc_kmsg_copyout_body(kmsg, space, map, 0, slist); } return mr; @@ -4606,9 +5230,9 @@ ipc_kmsg_copyout_dest( mach_port_name_t dest_name, reply_name, voucher_name; mbits = kmsg->ikm_header->msgh_bits; - dest = (ipc_object_t) kmsg->ikm_header->msgh_remote_port; - reply = (ipc_object_t) kmsg->ikm_header->msgh_local_port; - voucher = (ipc_object_t) kmsg->ikm_voucher; + dest = ip_to_object(kmsg->ikm_header->msgh_remote_port); + reply = ip_to_object(kmsg->ikm_header->msgh_local_port); + voucher = ip_to_object(kmsg->ikm_voucher); voucher_name = kmsg->ikm_header->msgh_voucher_port; dest_type = MACH_MSGH_BITS_REMOTE(mbits); reply_type = MACH_MSGH_BITS_LOCAL(mbits); @@ -4639,7 +5263,7 @@ ipc_kmsg_copyout_dest( assert(voucher_type == MACH_MSG_TYPE_MOVE_SEND); kmsg->ikm_voucher = IP_NULL; - ipc_object_destroy((ipc_object_t)voucher, voucher_type); + ipc_object_destroy(voucher, voucher_type); voucher_name = MACH_PORT_NULL; } @@ -4682,7 +5306,7 @@ ipc_kmsg_copyout_to_kernel( mach_msg_type_name_t reply_type; mach_port_name_t dest_name; - dest = (ipc_object_t) kmsg->ikm_header->msgh_remote_port; + dest = ip_to_object(kmsg->ikm_header->msgh_remote_port); reply = kmsg->ikm_header->msgh_local_port; dest_type = MACH_MSGH_BITS_REMOTE(kmsg->ikm_header->msgh_bits); reply_type = MACH_MSGH_BITS_LOCAL(kmsg->ikm_header->msgh_bits); @@ -4741,7 +5365,7 @@ ipc_kmsg_copyout_to_kernel_legacy( mach_msg_type_name_t reply_type; mach_port_name_t dest_name; - dest = (ipc_object_t) kmsg->ikm_header->msgh_remote_port; + dest = ip_to_object(kmsg->ikm_header->msgh_remote_port); reply = kmsg->ikm_header->msgh_local_port; dest_type = MACH_MSGH_BITS_REMOTE(kmsg->ikm_header->msgh_bits); reply_type = MACH_MSGH_BITS_LOCAL(kmsg->ikm_header->msgh_bits); @@ -4843,9 +5467,20 @@ ipc_kmsg_copyout_to_kernel_legacy( dest_dsc->type = type; break; } + case MACH_MSG_GUARDED_PORT_DESCRIPTOR: { + mach_msg_guarded_port_descriptor_t *source_dsc = (typeof(source_dsc)) & saddr->guarded_port; + mach_msg_guarded_port_descriptor32_t *dest_dsc = &daddr->guarded_port32; + + dest_dsc->name = CAST_MACH_PORT_TO_NAME(source_dsc->name); + dest_dsc->disposition = source_dsc->disposition; + dest_dsc->flags = 0; + dest_dsc->type = MACH_MSG_GUARDED_PORT_DESCRIPTOR; + dest_dsc->context = 0; + break; + } default: { #if MACH_ASSERT - panic("ipc_kmsg_copyin_from_kernel: bad descriptor"); + panic("ipc_kmsg_copyout_to_kernel_legacy: bad descriptor"); #endif /* MACH_ASSERT */ } } @@ -4945,3 +5580,12 @@ done: return trailer->msgh_trailer_size; } + +mach_msg_header_t * +ipc_kmsg_msg_header(ipc_kmsg_t kmsg) +{ + if (NULL == kmsg) { + return NULL; + } + return kmsg->ikm_header; +} diff --git a/osfmk/ipc/ipc_kmsg.h b/osfmk/ipc/ipc_kmsg.h index 74c31f1b4..68b7c4016 100644 --- a/osfmk/ipc/ipc_kmsg.h +++ b/osfmk/ipc/ipc_kmsg.h @@ -82,6 +82,10 @@ #include #include +typedef uint32_t ipc_kmsg_flags_t; + +#define IPC_KMSG_FLAGS_ALLOW_IMMOVABLE_SEND 0x1 /* Dest port contains an immovable send right */ + /* * This structure is only the header for a kmsg buffer; * the actual buffer is normally larger. The rest of the buffer @@ -99,6 +103,7 @@ struct ipc_kmsg { mach_msg_size_t ikm_size; + ipc_kmsg_flags_t ikm_flags; struct ipc_kmsg *ikm_next; /* next message on port/discard queue */ struct ipc_kmsg *ikm_prev; /* prev message on port/discard queue */ mach_msg_header_t *ikm_header; @@ -165,6 +170,7 @@ MACRO_END #define ikm_init(kmsg, size) \ MACRO_BEGIN \ (kmsg)->ikm_size = (size); \ + (kmsg)->ikm_flags = 0; \ (kmsg)->ikm_prealloc = IP_NULL; \ (kmsg)->ikm_voucher = IP_NULL; \ (kmsg)->ikm_importance = IIE_NULL; \ @@ -348,6 +354,8 @@ extern mach_msg_return_t ipc_kmsg_copyout_object( ipc_space_t space, ipc_object_t object, mach_msg_type_name_t msgt_name, + mach_port_context_t *context, + mach_msg_guard_flags_t *guard_flags, mach_port_name_t *namep); /* Copyout the header and body to a user message */ @@ -363,6 +371,7 @@ extern mach_msg_return_t ipc_kmsg_copyout_body( ipc_kmsg_t kmsg, ipc_space_t space, vm_map_t map, + mach_msg_option_t option, mach_msg_body_t *slist); /* Copyout port rights and out-of-line memory to a user message, @@ -407,4 +416,7 @@ extern void ipc_kmsg_trace_send(ipc_kmsg_t kmsg, #define ipc_kmsg_trace_send(a, b) do { } while (0) #endif +extern mach_msg_header_t * + ipc_kmsg_msg_header(ipc_kmsg_t); + #endif /* _IPC_IPC_KMSG_H_ */ diff --git a/osfmk/ipc/ipc_mqueue.c b/osfmk/ipc/ipc_mqueue.c index 2a6642598..4bec21084 100644 --- a/osfmk/ipc/ipc_mqueue.c +++ b/osfmk/ipc/ipc_mqueue.c @@ -84,6 +84,7 @@ #include #include +#include #include #include #include @@ -106,7 +107,7 @@ int ipc_mqueue_full; /* address is event for queue space */ int ipc_mqueue_rcv; /* address is event for message arrival */ /* forward declarations */ -void ipc_mqueue_receive_results(wait_result_t result); +static void ipc_mqueue_receive_results(wait_result_t result); static void ipc_mqueue_peek_on_thread( ipc_mqueue_t port_mq, mach_msg_option_t option, @@ -132,6 +133,7 @@ ipc_mqueue_init( mqueue->imq_seqno = 0; mqueue->imq_msgcount = 0; mqueue->imq_qlimit = MACH_PORT_QLIMIT_DEFAULT; + mqueue->imq_context = 0; mqueue->imq_fullwaiters = FALSE; #if MACH_FLIPC mqueue->imq_fport = FPORT_NULL; @@ -417,6 +419,26 @@ leave: return KERN_SUCCESS; } + +/* + * Routine: ipc_mqueue_has_klist + * Purpose: + * Returns whether the given mqueue imq_klist field can be used as a klist. + */ +static inline bool +ipc_mqueue_has_klist(ipc_mqueue_t mqueue) +{ + ipc_object_t object = imq_to_object(mqueue); + if (io_otype(object) != IOT_PORT) { + return true; + } + ipc_port_t port = ip_from_mq(mqueue); + if (port->ip_specialreply) { + return false; + } + return port->ip_sync_link_state == PORT_SYNC_LINK_ANY; +} + /* * Routine: ipc_mqueue_changed * Purpose: @@ -429,7 +451,7 @@ ipc_mqueue_changed( ipc_space_t space, ipc_mqueue_t mqueue) { - if (IMQ_KLIST_VALID(mqueue) && SLIST_FIRST(&mqueue->imq_klist)) { + if (ipc_mqueue_has_klist(mqueue) && SLIST_FIRST(&mqueue->imq_klist)) { /* * Indicate that this message queue is vanishing * @@ -440,7 +462,7 @@ ipc_mqueue_changed( * The new process may want to register the port it gets back with an * EVFILT_MACHPORT filter again, and may have pending sync IPC on this * port pending already, in which case we want the imq_klist field to be - * reusable for nefarious purposes (see IMQ_SET_INHERITOR). + * reusable for nefarious purposes. * * Fortunately, we really don't need this linkage anymore after this * point as EV_VANISHED / EV_EOF will be the last thing delivered ever. @@ -458,6 +480,11 @@ ipc_mqueue_changed( */ assert(space); knote_vanish(&mqueue->imq_klist, is_active(space)); + } + + if (io_otype(imq_to_object(mqueue)) == IOT_PORT) { + ipc_port_adjust_sync_link_state_locked(ip_from_mq(mqueue), PORT_SYNC_LINK_ANY, NULL); + } else { klist_init(&mqueue->imq_klist); } @@ -516,7 +543,6 @@ ipc_mqueue_send( thread_t cur_thread = current_thread(); ipc_port_t port = ip_from_mq(mqueue); struct turnstile *send_turnstile = TURNSTILE_NULL; - turnstile_inheritor_t inheritor = TURNSTILE_INHERITOR_NULL; uint64_t deadline; /* @@ -544,17 +570,8 @@ ipc_mqueue_send( port_send_turnstile_address(port), TURNSTILE_NULL, TURNSTILE_SYNC_IPC); - /* Check if the port in is in transit, get the destination port's turnstile */ - if (ip_active(port) && - port->ip_receiver_name == MACH_PORT_NULL && - port->ip_destination != NULL) { - inheritor = port_send_turnstile(port->ip_destination); - } else { - inheritor = ipc_port_get_inheritor(port); - } - - turnstile_update_inheritor(send_turnstile, inheritor, - TURNSTILE_DELAYED_UPDATE | TURNSTILE_INHERITOR_TURNSTILE); + ipc_port_send_update_inheritor(port, send_turnstile, + TURNSTILE_DELAYED_UPDATE); wresult = waitq_assert_wait64_leeway( &send_turnstile->ts_waitq, @@ -575,7 +592,7 @@ ipc_mqueue_send( /* Call turnstile complete with interlock held */ imq_lock(mqueue); - turnstile_complete((uintptr_t)port, port_send_turnstile_address(port), NULL); + turnstile_complete((uintptr_t)port, port_send_turnstile_address(port), NULL, TURNSTILE_SYNC_IPC); imq_unlock(mqueue); /* Call cleanup after dropping the interlock */ @@ -636,11 +653,13 @@ ipc_mqueue_override_send( ipc_kmsg_t first = ipc_kmsg_queue_first(&mqueue->imq_messages); if (first && ipc_kmsg_override_qos(&mqueue->imq_messages, first, override)) { - ipc_port_t port = ip_from_mq(mqueue); + ipc_object_t object = imq_to_object(mqueue); + assert(io_otype(object) == IOT_PORT); + ipc_port_t port = ip_object_to_port(object); if (ip_active(port) && port->ip_receiver_name != MACH_PORT_NULL && is_active(port->ip_receiver) && - IMQ_KLIST_VALID(mqueue)) { + ipc_mqueue_has_klist(mqueue)) { KNOTE(&mqueue->imq_klist, 0); } } @@ -787,11 +806,13 @@ ipc_mqueue_post( if (mqueue->imq_msgcount > 0) { if (ipc_kmsg_enqueue_qos(&mqueue->imq_messages, kmsg)) { /* if the space is dead there is no point calling KNOTE */ - ipc_port_t port = ip_from_mq(mqueue); + ipc_object_t object = imq_to_object(mqueue); + assert(io_otype(object) == IOT_PORT); + ipc_port_t port = ip_object_to_port(object); if (ip_active(port) && port->ip_receiver_name != MACH_PORT_NULL && is_active(port->ip_receiver) && - IMQ_KLIST_VALID(mqueue)) { + ipc_mqueue_has_klist(mqueue)) { KNOTE(&mqueue->imq_klist, 0); } } @@ -902,7 +923,7 @@ out_unlock: } -/* static */ void +static void ipc_mqueue_receive_results(wait_result_t saved_wait_result) { thread_t self = current_thread(); @@ -1077,7 +1098,6 @@ ipc_mqueue_receive_on_thread( wait_result_t wresult; uint64_t deadline; struct turnstile *rcv_turnstile = TURNSTILE_NULL; - turnstile_inheritor_t inheritor = NULL; /* called with mqueue locked */ @@ -1179,8 +1199,10 @@ ipc_mqueue_receive_on_thread( } /* - * Threads waiting on a port (not portset) - * will wait on port's receive turnstile. + * Threads waiting on a special reply port + * (not portset or regular ports) + * will wait on its receive turnstile. + * * Donate waiting thread's turnstile and * setup inheritor for special reply port. * Based on the state of the special reply @@ -1195,18 +1217,14 @@ ipc_mqueue_receive_on_thread( * will be converted to to turnstile waitq * in waitq_assert_wait instead of global waitqs. */ - if (imq_is_queue(mqueue)) { + if (imq_is_queue(mqueue) && ip_from_mq(mqueue)->ip_specialreply) { ipc_port_t port = ip_from_mq(mqueue); rcv_turnstile = turnstile_prepare((uintptr_t)port, port_rcv_turnstile_address(port), TURNSTILE_NULL, TURNSTILE_SYNC_IPC); - if (port->ip_specialreply) { - inheritor = ipc_port_get_special_reply_port_inheritor(port); - } - - turnstile_update_inheritor(rcv_turnstile, inheritor, - (TURNSTILE_INHERITOR_TURNSTILE | TURNSTILE_DELAYED_UPDATE)); + ipc_port_recv_update_inheritor(port, rcv_turnstile, + TURNSTILE_DELAYED_UPDATE); } thread_set_pending_block_hint(thread, kThreadWaitPortReceive); @@ -1592,7 +1610,7 @@ ipc_mqueue_set_gather_member_names( /* only receive rights can be members of port sets */ if ((entry->ie_bits & MACH_PORT_TYPE_RECEIVE) != MACH_PORT_TYPE_NONE) { - __IGNORE_WCASTALIGN(ipc_port_t port = (ipc_port_t)entry->ie_object); + ipc_port_t port = ip_object_to_port(entry->ie_object); ipc_mqueue_t mq = &port->ip_messages; assert(IP_VALID(port)); @@ -1780,6 +1798,7 @@ ipc_mqueue_copyin( ipc_object_t *objectp) { ipc_entry_t entry; + ipc_entry_bits_t bits; ipc_object_t object; ipc_mqueue_t mqueue; @@ -1795,24 +1814,23 @@ ipc_mqueue_copyin( return MACH_RCV_INVALID_NAME; } + bits = entry->ie_bits; object = entry->ie_object; - if (entry->ie_bits & MACH_PORT_TYPE_RECEIVE) { - ipc_port_t port; + if (bits & MACH_PORT_TYPE_RECEIVE) { + ipc_port_t port = ip_object_to_port(object); - __IGNORE_WCASTALIGN(port = (ipc_port_t) object); assert(port != IP_NULL); ip_lock(port); - assert(ip_active(port)); + require_ip_active(port); assert(port->ip_receiver_name == name); assert(port->ip_receiver == space); is_read_unlock(space); mqueue = &port->ip_messages; - } else if (entry->ie_bits & MACH_PORT_TYPE_PORT_SET) { - ipc_pset_t pset; + } else if (bits & MACH_PORT_TYPE_PORT_SET) { + ipc_pset_t pset = ips_object_to_pset(object); - __IGNORE_WCASTALIGN(pset = (ipc_pset_t) object); assert(pset != IPS_NULL); ips_lock(pset); @@ -1822,6 +1840,10 @@ ipc_mqueue_copyin( mqueue = &pset->ips_messages; } else { is_read_unlock(space); + /* guard exception if we never held the receive right in this entry */ + if ((bits & MACH_PORT_TYPE_EX_RECEIVE) == 0) { + mach_port_guard_exception(name, 0, 0, kGUARD_EXC_RCV_INVALID_NAME); + } return MACH_RCV_INVALID_NAME; } @@ -1837,3 +1859,19 @@ ipc_mqueue_copyin( *mqueuep = mqueue; return MACH_MSG_SUCCESS; } + +void +imq_lock(ipc_mqueue_t mq) +{ + ipc_object_t object = imq_to_object(mq); + ipc_object_validate(object); + waitq_lock(&(mq)->imq_wait_queue); +} + +unsigned int +imq_lock_try(ipc_mqueue_t mq) +{ + ipc_object_t object = imq_to_object(mq); + ipc_object_validate(object); + return waitq_lock_try(&(mq)->imq_wait_queue); +} diff --git a/osfmk/ipc/ipc_mqueue.h b/osfmk/ipc/ipc_mqueue.h index 140ce1dfd..4e6fb3240 100644 --- a/osfmk/ipc/ipc_mqueue.h +++ b/osfmk/ipc/ipc_mqueue.h @@ -90,6 +90,9 @@ typedef struct ipc_mqueue { mach_port_name_t receiver_name; uint16_t msgcount; uint16_t qlimit; +#ifdef __LP64__ + uint32_t qcontext; +#endif #if MACH_FLIPC struct flipc_port *fport; // Null for local port, or ptr to flipc port #endif @@ -99,30 +102,32 @@ typedef struct ipc_mqueue { } pset; } data; union { + /* + * Port Sets: + * only use imq_klist + * + * Special Reply Ports (ip_specialreply == true): + * only use imq_srp_owner_thread + * + * Ports, based on ip_sync_link_state, use: + * - PORT_SYNC_LINK_ANY: imq_klist + * - PORT_SYNC_LINK_WORKLOOP_KNOTE: imq_inheritor_knote + * - PORT_SYNC_LINK_WORKLOOP_STASH: imq_inheritor_turnstile + * - PORT_SYNC_LINK_RCV_THREAD: imq_inheritor_thread_ref + */ struct klist imq_klist; - uintptr_t imq_inheritor; + struct knote *imq_inheritor_knote; + struct turnstile *imq_inheritor_turnstile; + thread_t imq_inheritor_thread_ref; + thread_t imq_srp_owner_thread; }; +#ifndef __LP64__ + uint32_t qcontext; +#endif } *ipc_mqueue_t; #define IMQ_NULL ((ipc_mqueue_t) 0) -/* - * When a receive right is in flight, before it can ever be registered with - * a new knote, its imq_klist field can be overloaded to hold a pointer - * to the knote that the port is pushing on through his turnstile. - * - * if IMQ_KLIST_VALID() returns true, then the imq_klist field can be used, - * else IMQ_INHERITOR() can be used to get the pointer to the knote currently - * being the port turnstile inheritor. - */ -#define IMQ_KLIST_VALID(imq) (((imq)->imq_inheritor & 1) == 0) -#define IMQ_INHERITOR(imq) ((struct turnstile *)((imq)->imq_inheritor ^ 1)) -#define IMQ_SET_INHERITOR(imq, inheritor) \ -MACRO_BEGIN \ - assert(((imq)->imq_inheritor & 1) || SLIST_EMPTY(&(imq)->imq_klist)); \ - ((imq)->imq_inheritor = (uintptr_t)(inheritor) | 1); \ -MACRO_END - #define imq_wait_queue data.port.waitq #define imq_messages data.port.messages #define imq_msgcount data.port.msgcount @@ -133,6 +138,16 @@ MACRO_END #define imq_fport data.port.fport #endif +/* + * The qcontext structure member fills in a 32-bit padding gap in ipc_mqueue. + * However, the 32-bits are in slightly different places on 32 and 64 bit systems. + */ +#ifdef __LP64__ +#define imq_context data.port.qcontext +#else +#define imq_context qcontext +#endif + /* * we can use the 'eventmask' bits of the waitq b/c * they are only used by global queues @@ -146,28 +161,24 @@ MACRO_END #define imq_is_queue(mq) waitq_is_queue(&(mq)->imq_wait_queue) #define imq_is_valid(mq) waitq_is_valid(&(mq)->imq_wait_queue) -#define imq_lock(mq) waitq_lock(&(mq)->imq_wait_queue) -#define imq_lock_try(mq) waitq_lock_try(&(mq)->imq_wait_queue) #define imq_unlock(mq) waitq_unlock(&(mq)->imq_wait_queue) #define imq_held(mq) waitq_held(&(mq)->imq_wait_queue) #define imq_valid(mq) waitq_valid(&(mq)->imq_wait_queue) +extern void imq_lock(ipc_mqueue_t mq); +extern unsigned int imq_lock_try(ipc_mqueue_t mq); + /* * Get an ipc_mqueue pointer from a waitq pointer. These are traditionally the * same pointer, but this conversion makes no assumptions on union structure * member positions - it should allow the waitq to move around in either the * port-set mqueue or the port mqueue independently. */ -#define imq_from_waitq(waitq) (waitq_is_set(waitq) ? \ - ((struct ipc_mqueue *)((void *)( \ - (uintptr_t)(waitq) - \ - __offsetof(struct ipc_mqueue, imq_set_queue)) \ - )) : \ - ((struct ipc_mqueue *)((void *)( \ - (uintptr_t)(waitq) - \ - __offsetof(struct ipc_mqueue, imq_wait_queue)) \ - )) \ - ) +#define imq_from_waitq(waitq) (waitq_is_set(waitq) ? \ + __container_of(waitq, struct ipc_mqueue, imq_set_queue.wqset_q) : \ + __container_of(waitq, struct ipc_mqueue, imq_wait_queue)) + +#define imq_to_object(mq) ip_to_object(ip_from_mq(mq)) extern void imq_reserve_and_lock(ipc_mqueue_t mq, uint64_t *reserved_prepost); diff --git a/osfmk/ipc/ipc_notify.c b/osfmk/ipc/ipc_notify.c index 1730c5b41..f677c6e28 100644 --- a/osfmk/ipc/ipc_notify.c +++ b/osfmk/ipc/ipc_notify.c @@ -158,7 +158,7 @@ void ipc_notify_send_once( ipc_port_t port) { - ipc_port_adjust_special_reply_port(port, IPC_PORT_ADJUST_SR_NONE, FALSE); + ipc_port_adjust_special_reply_port(port, IPC_PORT_ADJUST_RESET_BOOSTRAP_CHECKIN, FALSE); (void)mach_notify_send_once(port); /* send-once right consumed */ diff --git a/osfmk/ipc/ipc_object.c b/osfmk/ipc/ipc_object.c index 27a1cca4a..76fc96b8e 100644 --- a/osfmk/ipc/ipc_object.c +++ b/osfmk/ipc/ipc_object.c @@ -128,7 +128,7 @@ ipc_object_release( * Look up an object in a space. * Conditions: * Nothing locked before. If successful, the object - * is returned locked. The caller doesn't get a ref. + * is returned active and locked. The caller doesn't get a ref. * Returns: * KERN_SUCCESS Object returned locked. * KERN_INVALID_TASK The space is dead. @@ -146,6 +146,10 @@ ipc_object_translate( ipc_object_t object; kern_return_t kr; + if (!MACH_PORT_RIGHT_VALID_TRANSLATE(right)) { + return KERN_INVALID_RIGHT; + } + kr = ipc_right_lookup_read(space, name, &entry); if (kr != KERN_SUCCESS) { return kr; @@ -163,6 +167,11 @@ ipc_object_translate( io_lock(object); is_read_unlock(space); + if (!io_active(object)) { + io_unlock(object); + return KERN_INVALID_NAME; + } + *objectp = object; return KERN_SUCCESS; } @@ -193,8 +202,9 @@ ipc_object_translate_two( { ipc_entry_t entry1; ipc_entry_t entry2; - ipc_object_t object; + ipc_object_t object1, object2; kern_return_t kr; + boolean_t doguard = TRUE; kr = ipc_right_lookup_two_read(space, name1, &entry1, name2, &entry2); if (kr != KERN_SUCCESS) { @@ -203,26 +213,52 @@ ipc_object_translate_two( /* space is read-locked and active */ if ((entry1->ie_bits & MACH_PORT_TYPE(right1)) == MACH_PORT_TYPE_NONE) { + /* If looking for receive, and the entry used to hold one, give a pass on EXC_GUARD */ + if ((right1 & MACH_PORT_RIGHT_RECEIVE) == MACH_PORT_RIGHT_RECEIVE && + (entry1->ie_bits & MACH_PORT_TYPE_EX_RECEIVE) == MACH_PORT_TYPE_EX_RECEIVE) { + doguard = FALSE; + } is_read_unlock(space); - mach_port_guard_exception(name1, 0, 0, kGUARD_EXC_INVALID_RIGHT); + if (doguard) { + mach_port_guard_exception(name1, 0, 0, kGUARD_EXC_INVALID_RIGHT); + } return KERN_INVALID_RIGHT; } if ((entry2->ie_bits & MACH_PORT_TYPE(right2)) == MACH_PORT_TYPE_NONE) { + /* If looking for receive, and the entry used to hold one, give a pass on EXC_GUARD */ + if ((right2 & MACH_PORT_RIGHT_RECEIVE) == MACH_PORT_RIGHT_RECEIVE && + (entry2->ie_bits & MACH_PORT_TYPE_EX_RECEIVE) == MACH_PORT_TYPE_EX_RECEIVE) { + doguard = FALSE; + } is_read_unlock(space); - mach_port_guard_exception(name2, 0, 0, kGUARD_EXC_INVALID_RIGHT); + if (doguard) { + mach_port_guard_exception(name2, 0, 0, kGUARD_EXC_INVALID_RIGHT); + } return KERN_INVALID_RIGHT; } - object = entry1->ie_object; - assert(object != IO_NULL); - io_lock(object); - *objectp1 = object; + object1 = entry1->ie_object; + assert(object1 != IO_NULL); + io_lock(object1); + if (!io_active(object1)) { + io_unlock(object1); + is_read_unlock(space); + return KERN_INVALID_NAME; + } - object = entry2->ie_object; - assert(object != IO_NULL); - io_lock(object); - *objectp2 = object; + object2 = entry2->ie_object; + assert(object2 != IO_NULL); + io_lock(object2); + if (!io_active(object2)) { + io_unlock(object1); + io_unlock(object2); + is_read_unlock(space); + return KERN_INVALID_NAME; + } + + *objectp1 = object1; + *objectp2 = object2; is_read_unlock(space); return KERN_SUCCESS; @@ -343,11 +379,11 @@ ipc_object_alloc( } if (otype == IOT_PORT) { - ipc_port_t port = (ipc_port_t)object; + ipc_port_t port = ip_object_to_port(object); bzero((char *)port, sizeof(*port)); } else if (otype == IOT_PORT_SET) { - ipc_pset_t pset = (ipc_pset_t)object; + ipc_pset_t pset = ips_object_to_pset(object); bzero((char *)pset, sizeof(*pset)); } @@ -365,10 +401,10 @@ ipc_object_alloc( entry->ie_object = object; ipc_entry_modified(space, *namep, entry); + object->io_bits = io_makebits(TRUE, otype, 0); io_lock(object); object->io_references = 1; /* for entry, not caller */ - object->io_bits = io_makebits(TRUE, otype, 0); *objectp = object; return KERN_SUCCESS; @@ -412,11 +448,11 @@ ipc_object_alloc_name( } if (otype == IOT_PORT) { - ipc_port_t port = (ipc_port_t)object; + ipc_port_t port = ip_object_to_port(object); bzero((char *)port, sizeof(*port)); } else if (otype == IOT_PORT_SET) { - ipc_pset_t pset = (ipc_pset_t)object; + ipc_pset_t pset = ips_object_to_pset(object); bzero((char *)pset, sizeof(*pset)); } @@ -438,16 +474,31 @@ ipc_object_alloc_name( entry->ie_object = object; ipc_entry_modified(space, name, entry); + object->io_bits = io_makebits(TRUE, otype, 0); + io_lock(object); is_write_unlock(space); object->io_references = 1; /* for entry, not caller */ - object->io_bits = io_makebits(TRUE, otype, 0); *objectp = object; return KERN_SUCCESS; } +/* Routine: ipc_object_validate + * Purpose: + * Validates an ipc port or port set as belonging to the correct + * zone. + */ + +void +ipc_object_validate( + ipc_object_t object) +{ + int otype = (io_otype(object) == IOT_PORT_SET) ? IOT_PORT_SET : IOT_PORT; + zone_require(object, ipc_object_zones[otype]); +} + /* * Routine: ipc_object_copyin_type * Purpose: @@ -500,7 +551,10 @@ ipc_object_copyin( ipc_space_t space, mach_port_name_t name, mach_msg_type_name_t msgt_name, - ipc_object_t *objectp) + ipc_object_t *objectp, + mach_port_context_t context, + mach_msg_guard_flags_t *guard_flags, + ipc_kmsg_flags_t kmsg_flags) { ipc_entry_t entry; ipc_port_t soright; @@ -508,6 +562,11 @@ ipc_object_copyin( kern_return_t kr; int assertcnt = 0; + ipc_right_copyin_flags_t irc_flags = IPC_RIGHT_COPYIN_FLAGS_DEADOK; + if (kmsg_flags & IPC_KMSG_FLAGS_ALLOW_IMMOVABLE_SEND) { + irc_flags |= IPC_RIGHT_COPYIN_FLAGS_ALLOW_IMMOVABLE_SEND; + } + /* * Could first try a read lock when doing * MACH_MSG_TYPE_COPY_SEND, MACH_MSG_TYPE_MAKE_SEND, @@ -522,10 +581,12 @@ ipc_object_copyin( release_port = IP_NULL; kr = ipc_right_copyin(space, name, entry, - msgt_name, IPC_RIGHT_COPYIN_FLAGS_DEADOK, + msgt_name, irc_flags, objectp, &soright, &release_port, - &assertcnt); + &assertcnt, + context, + guard_flags); if (IE_BITS_TYPE(entry->ie_bits) == MACH_PORT_TYPE_NONE) { ipc_entry_dealloc(space, name, entry); } @@ -585,17 +646,17 @@ ipc_object_copyin_from_kernel( switch (msgt_name) { case MACH_MSG_TYPE_MOVE_RECEIVE: { - ipc_port_t port = (ipc_port_t) object; + ipc_port_t port = ip_object_to_port(object); ip_lock(port); imq_lock(&port->ip_messages); - assert(ip_active(port)); + require_ip_active(port); if (port->ip_destination != IP_NULL) { assert(port->ip_receiver == ipc_space_kernel); + assert(port->ip_immovable_receive == 0); /* relevant part of ipc_port_clear_receiver */ - ipc_port_set_mscount(port, 0); - + port->ip_mscount = 0; port->ip_receiver_name = MACH_PORT_NULL; port->ip_destination = IP_NULL; } @@ -605,7 +666,7 @@ ipc_object_copyin_from_kernel( } case MACH_MSG_TYPE_COPY_SEND: { - ipc_port_t port = (ipc_port_t) object; + ipc_port_t port = ip_object_to_port(object); ip_lock(port); if (ip_active(port)) { @@ -618,7 +679,7 @@ ipc_object_copyin_from_kernel( } case MACH_MSG_TYPE_MAKE_SEND: { - ipc_port_t port = (ipc_port_t) object; + ipc_port_t port = ip_object_to_port(object); ip_lock(port); if (ip_active(port)) { @@ -636,26 +697,25 @@ ipc_object_copyin_from_kernel( case MACH_MSG_TYPE_MOVE_SEND: { /* move naked send right into the message */ - assert(((ipc_port_t)object)->ip_srights); + assert(ip_object_to_port(object)->ip_srights); break; } case MACH_MSG_TYPE_MAKE_SEND_ONCE: { - ipc_port_t port = (ipc_port_t) object; + ipc_port_t port = ip_object_to_port(object); ip_lock(port); if (ip_active(port)) { assert(port->ip_receiver_name != MACH_PORT_NULL); } - port->ip_sorights++; - ip_reference(port); + ipc_port_make_sonce_locked(port); ip_unlock(port); break; } case MACH_MSG_TYPE_MOVE_SEND_ONCE: { /* move naked send-once right into the message */ - assert(((ipc_port_t)object)->ip_sorights); + assert(ip_object_to_port(object)->ip_sorights); break; } @@ -685,15 +745,15 @@ ipc_object_destroy( switch (msgt_name) { case MACH_MSG_TYPE_PORT_SEND: - ipc_port_release_send((ipc_port_t) object); + ipc_port_release_send(ip_object_to_port(object)); break; case MACH_MSG_TYPE_PORT_SEND_ONCE: - ipc_notify_send_once((ipc_port_t) object); + ipc_notify_send_once(ip_object_to_port(object)); break; case MACH_MSG_TYPE_PORT_RECEIVE: - ipc_port_release_receive((ipc_port_t) object); + ipc_port_release_receive(ip_object_to_port(object)); break; default: @@ -721,15 +781,15 @@ ipc_object_destroy_dest( switch (msgt_name) { case MACH_MSG_TYPE_PORT_SEND: - ipc_port_release_send((ipc_port_t) object); + ipc_port_release_send(ip_object_to_port(object)); break; case MACH_MSG_TYPE_PORT_SEND_ONCE: if (io_active(object) && - !ip_full_kernel((ipc_port_t) object)) { - ipc_notify_send_once((ipc_port_t) object); + !ip_full_kernel(ip_object_to_port(object))) { + ipc_notify_send_once(ip_object_to_port(object)); } else { - ipc_port_release_sonce((ipc_port_t) object); + ipc_port_release_sonce(ip_object_to_port(object)); } break; @@ -738,6 +798,98 @@ ipc_object_destroy_dest( } } +/* + * Routine: ipc_object_insert_send_right + * Purpose: + * Insert a send right into an object already in the space. + * The specified name must already point to a valid object. + * + * Note: This really is a combined copyin()/copyout(), + * that avoids most of the overhead of being implemented that way. + * + * This is the fastpath for mach_port_insert_right. + * + * Conditions: + * Nothing locked. + * + * msgt_name must be MACH_MSG_TYPE_MAKE_SEND_ONCE or + * MACH_MSG_TYPE_MOVE_SEND_ONCE. + * + * Returns: + * KERN_SUCCESS Copied out object, consumed ref. + * KERN_INVALID_TASK The space is dead. + * KERN_INVALID_NAME Name doesn't exist in space. + * KERN_INVALID_CAPABILITY The object is dead. + * KERN_RIGHT_EXISTS Space has rights under another name. + */ +kern_return_t +ipc_object_insert_send_right( + ipc_space_t space, + mach_port_name_t name, + mach_msg_type_name_t msgt_name) +{ + ipc_entry_bits_t bits; + ipc_object_t object; + ipc_entry_t entry; + kern_return_t kr; + + assert(msgt_name == MACH_MSG_TYPE_MAKE_SEND || + msgt_name == MACH_MSG_TYPE_COPY_SEND); + + kr = ipc_right_lookup_write(space, name, &entry); + if (kr != KERN_SUCCESS) { + return kr; + } + /* space is write-locked and active */ + + if (!IO_VALID(entry->ie_object)) { + is_write_unlock(space); + return KERN_INVALID_CAPABILITY; + } + + bits = entry->ie_bits; + object = entry->ie_object; + + io_lock(object); + if (!io_active(object)) { + kr = KERN_INVALID_CAPABILITY; + } else if (msgt_name == MACH_MSG_TYPE_MAKE_SEND) { + if (bits & MACH_PORT_TYPE_RECEIVE) { + ipc_port_t port = ip_object_to_port(object); + port->ip_mscount++; + if ((bits & MACH_PORT_TYPE_SEND) == 0) { + port->ip_srights++; + bits |= MACH_PORT_TYPE_SEND; + } + /* leave urefs pegged to maximum if it overflowed */ + if (IE_BITS_UREFS(bits) < MACH_PORT_UREFS_MAX) { + bits += 1; /* increment urefs */ + } + entry->ie_bits = bits; + ipc_entry_modified(space, name, entry); + kr = KERN_SUCCESS; + } else { + kr = KERN_INVALID_RIGHT; + } + } else { // MACH_MSG_TYPE_COPY_SEND + if (bits & MACH_PORT_TYPE_SEND) { + /* leave urefs pegged to maximum if it overflowed */ + if (IE_BITS_UREFS(bits) < MACH_PORT_UREFS_MAX) { + entry->ie_bits = bits + 1; /* increment urefs */ + } + ipc_entry_modified(space, name, entry); + kr = KERN_SUCCESS; + } else { + kr = KERN_INVALID_RIGHT; + } + } + + io_unlock(object); + is_write_unlock(space); + + return kr; +} + /* * Routine: ipc_object_copyout * Purpose: @@ -760,7 +912,8 @@ ipc_object_copyout( ipc_space_t space, ipc_object_t object, mach_msg_type_name_t msgt_name, - boolean_t overflow, + mach_port_context_t *context, + mach_msg_guard_flags_t *guard_flags, mach_port_name_t *namep) { struct knote *kn = current_thread()->ith_knote; @@ -773,7 +926,7 @@ ipc_object_copyout( if (ITH_KNOTE_VALID(kn, msgt_name)) { filt_machport_turnstile_prepare_lazily(kn, - msgt_name, (ipc_port_t)object); + msgt_name, ip_object_to_port(object)); } is_write_lock(space); @@ -822,7 +975,7 @@ ipc_object_copyout( /* space is write-locked and active, object is locked and active */ kr = ipc_right_copyout(space, name, entry, - msgt_name, overflow, object); + msgt_name, context, guard_flags, object); /* object is unlocked */ is_write_unlock(space); @@ -857,14 +1010,12 @@ ipc_object_copyout_name( ipc_space_t space, ipc_object_t object, mach_msg_type_name_t msgt_name, - boolean_t overflow, mach_port_name_t name) { mach_port_name_t oname; ipc_entry_t oentry; ipc_entry_t entry; kern_return_t kr; - struct knote *kn = current_thread()->ith_knote; #if IMPORTANCE_INHERITANCE int assertcnt = 0; @@ -874,11 +1025,6 @@ ipc_object_copyout_name( assert(IO_VALID(object)); assert(io_otype(object) == IOT_PORT); - if (ITH_KNOTE_VALID(kn, msgt_name)) { - filt_machport_turnstile_prepare_lazily(kn, - msgt_name, (ipc_port_t)object); - } - kr = ipc_entry_alloc_name(space, name, &entry); if (kr != KERN_SUCCESS) { return kr; @@ -931,7 +1077,7 @@ ipc_object_copyout_name( * port has assertions (and the task wants them). */ if (msgt_name == MACH_MSG_TYPE_PORT_RECEIVE) { - ipc_port_t port = (ipc_port_t)object; + ipc_port_t port = ip_object_to_port(object); if (space->is_task != TASK_NULL) { task_imp = space->is_task->task_imp_base; @@ -951,7 +1097,7 @@ ipc_object_copyout_name( #endif /* IMPORTANCE_INHERITANCE */ kr = ipc_right_copyout(space, name, entry, - msgt_name, overflow, object); + msgt_name, NULL, NULL, object); /* object is unlocked */ is_write_unlock(space); @@ -994,8 +1140,6 @@ ipc_object_copyout_dest( assert(IO_VALID(object)); assert(io_active(object)); - io_release(object); - /* * If the space is the receiver/owner of the object, * then we quietly consume the right and return @@ -1005,7 +1149,7 @@ ipc_object_copyout_dest( switch (msgt_name) { case MACH_MSG_TYPE_PORT_SEND: { - ipc_port_t port = (ipc_port_t) object; + ipc_port_t port = ip_object_to_port(object); ipc_port_t nsrequest = IP_NULL; mach_port_mscount_t mscount; @@ -1021,16 +1165,20 @@ ipc_object_copyout_dest( nsrequest = port->ip_nsrequest; port->ip_nsrequest = IP_NULL; mscount = port->ip_mscount; - ip_unlock(port); + ipc_port_clear_sync_rcv_thread_boost_locked(port); + /* port unlocked */ ipc_notify_no_senders(nsrequest, mscount); } else { - ip_unlock(port); + ipc_port_clear_sync_rcv_thread_boost_locked(port); + /* port unlocked */ } + + ip_release(port); break; } case MACH_MSG_TYPE_PORT_SEND_ONCE: { - ipc_port_t port = (ipc_port_t) object; + ipc_port_t port = ip_object_to_port(object); assert(port->ip_sorights > 0); @@ -1039,7 +1187,9 @@ ipc_object_copyout_dest( port->ip_sorights--; name = port->ip_receiver_name; - ip_unlock(port); + ipc_port_clear_sync_rcv_thread_boost_locked(port); + /* port unlocked */ + ip_release(port); } else { /* * A very bizarre case. The message @@ -1050,7 +1200,6 @@ ipc_object_copyout_dest( * so generate a send-once notification. */ - ip_reference(port); /* restore ref */ ip_unlock(port); ipc_notify_send_once(port); @@ -1069,52 +1218,30 @@ ipc_object_copyout_dest( } /* - * Routine: ipc_object_rename + * Routine: io_lock * Purpose: - * Rename an entry in a space. - * Conditions: - * Nothing locked. - * Returns: - * KERN_SUCCESS Renamed the entry. - * KERN_INVALID_TASK The space was dead. - * KERN_INVALID_NAME oname didn't denote an entry. - * KERN_NAME_EXISTS nname already denoted an entry. - * KERN_RESOURCE_SHORTAGE Couldn't allocate new entry. + * Validate, then acquire a lock on an ipc object */ -kern_return_t -ipc_object_rename( - ipc_space_t space, - mach_port_name_t oname, - mach_port_name_t nname) +void +io_lock(ipc_object_t io) { - ipc_entry_t oentry, nentry; - kern_return_t kr; - - kr = ipc_entry_alloc_name(space, nname, &nentry); - if (kr != KERN_SUCCESS) { - return kr; - } - - /* space is write-locked and active */ - - if (ipc_right_inuse(space, nname, nentry)) { - /* space is unlocked */ - return KERN_NAME_EXISTS; - } - - /* don't let ipc_entry_lookup see the uninitialized new entry */ + ipc_object_validate(io); + lck_spin_lock_grp(&(io)->io_lock_data, &ipc_lck_grp); +} - if ((oname == nname) || - ((oentry = ipc_entry_lookup(space, oname)) == IE_NULL)) { - ipc_entry_dealloc(space, nname, nentry); - is_write_unlock(space); - return KERN_INVALID_NAME; - } +/* + * Routine: io_lock_try + * Purpose: + * Validate, then try to acquire a lock on an object, + * fail if there is an existing busy lock + */ - kr = ipc_right_rename(space, oname, oentry, nname, nentry); - /* space is unlocked */ - return kr; +boolean_t +io_lock_try(ipc_object_t io) +{ + ipc_object_validate(io); + return lck_spin_try_lock_grp(&(io)->io_lock_data, &ipc_lck_grp); } /* @@ -1126,11 +1253,8 @@ io_free( unsigned int otype, ipc_object_t object) { - ipc_port_t port; - if (otype == IOT_PORT) { - port = (ipc_port_t) object; - ipc_port_finalize(port); + ipc_port_finalize(ip_object_to_port(object)); } io_lock_destroy(object); zfree(ipc_object_zones[otype], object); diff --git a/osfmk/ipc/ipc_object.h b/osfmk/ipc/ipc_object.h index 2e23f5681..77ddc1333 100644 --- a/osfmk/ipc/ipc_object.h +++ b/osfmk/ipc/ipc_object.h @@ -99,8 +99,8 @@ typedef natural_t ipc_object_type_t; struct ipc_object { ipc_object_bits_t io_bits; ipc_object_refs_t io_references; - lck_spin_t io_lock_data; -}; + lck_spin_t io_lock_data; +} __attribute__((aligned(8))); /* * If another object type needs to participate in io_kotype()-based @@ -131,7 +131,8 @@ struct ipc_object_header { * definitions in ipc_port.h. */ #define IO_BITS_PORT_INFO 0x0000f000 /* stupid port tricks */ -#define IO_BITS_KOTYPE 0x00000fff /* used by the object */ +#define IO_BITS_KOTYPE 0x000007ff /* used by the object */ +#define IO_BITS_KOBJECT 0x00000800 /* port belongs to a kobject */ #define IO_BITS_OTYPE 0x7fff0000 /* determines a zone */ #define IO_BITS_ACTIVE 0x80000000 /* is object alive? */ @@ -139,6 +140,7 @@ struct ipc_object_header { #define io_otype(io) (((io)->io_bits & IO_BITS_OTYPE) >> 16) #define io_kotype(io) ((io)->io_bits & IO_BITS_KOTYPE) +#define io_is_kobject(io) (((io)->io_bits & IO_BITS_KOBJECT) != IKOT_NONE) #define io_makebits(active, otype, kotype) \ (((active) ? IO_BITS_ACTIVE : 0) | ((otype) << 16) | (kotype)) @@ -151,6 +153,7 @@ struct ipc_object_header { #define IOT_NUMBER 2 /* number of types used */ extern zone_t ipc_object_zones[IOT_NUMBER]; +extern lck_grp_t ipc_lck_grp; #define io_alloc(otype) \ ((ipc_object_t) zalloc(ipc_object_zones[(otype)])) @@ -167,15 +170,18 @@ extern void io_free( lck_spin_init(&(io)->io_lock_data, &ipc_lck_grp, &ipc_lck_attr) #define io_lock_destroy(io) \ lck_spin_destroy(&(io)->io_lock_data, &ipc_lck_grp) -#define io_lock(io) \ - lck_spin_lock_grp(&(io)->io_lock_data, &ipc_lck_grp) -#define io_lock_try(io) \ - lck_spin_try_lock_grp(&(io)->io_lock_data, &ipc_lck_grp) +#define io_lock_held(io) \ + LCK_SPIN_ASSERT(&(io)->io_lock_data, LCK_ASSERT_OWNED) #define io_lock_held_kdp(io) \ kdp_lck_spin_is_acquired(&(io)->io_lock_data) #define io_unlock(io) \ lck_spin_unlock(&(io)->io_lock_data) +extern void io_lock( + ipc_object_t io); +extern boolean_t io_lock_try( + ipc_object_t io); + #define _VOLATILE_ volatile /* Sanity check the ref count. If it is 0, we may be doubly zfreeing. @@ -191,7 +197,7 @@ extern void io_free( * and zfree modifies that to point to the next free zone element. */ #define IO_MAX_REFERENCES \ - (unsigned)(~0 ^ (1 << (sizeof(int)*BYTE_SIZE - 1))) + (unsigned)(~0 ^ (1U << (sizeof(int)*BYTE_SIZE - 1))) static inline void io_reference(ipc_object_t io) @@ -199,8 +205,10 @@ io_reference(ipc_object_t io) ipc_object_refs_t new_io_references; ipc_object_refs_t old_io_references; - assert((io)->io_references > 0 && - (io)->io_references < IO_MAX_REFERENCES); + if ((io)->io_references == 0 || + (io)->io_references >= IO_MAX_REFERENCES) { + panic("%s: reference count %u is invalid\n", __func__, (io)->io_references); + } do { old_io_references = (io)->io_references; @@ -219,8 +227,10 @@ io_release(ipc_object_t io) ipc_object_refs_t new_io_references; ipc_object_refs_t old_io_references; - assert((io)->io_references > 0 && - (io)->io_references < IO_MAX_REFERENCES); + if ((io)->io_references == 0 || + (io)->io_references >= IO_MAX_REFERENCES) { + panic("%s: reference count %u is invalid\n", __func__, (io)->io_references); + } do { old_io_references = (io)->io_references; @@ -277,6 +287,10 @@ extern kern_return_t ipc_object_translate_two( mach_port_right_t right2, ipc_object_t *objectp2); +/* Validate an object as belonging to the correct zone */ +extern void ipc_object_validate( + ipc_object_t object); + /* Allocate a dead-name entry */ extern kern_return_t ipc_object_alloc_dead( @@ -315,7 +329,10 @@ extern kern_return_t ipc_object_copyin( ipc_space_t space, mach_port_name_t name, mach_msg_type_name_t msgt_name, - ipc_object_t *objectp); + ipc_object_t *objectp, + mach_port_context_t context, + mach_msg_guard_flags_t *guard_flags, + uint32_t kmsg_flags); /* Copyin a naked capability from the kernel */ extern void ipc_object_copyin_from_kernel( @@ -332,12 +349,19 @@ extern void ipc_object_destroy_dest( ipc_object_t object, mach_msg_type_name_t msgt_name); +/* Insert a send right into an object already in the current space */ +extern kern_return_t ipc_object_insert_send_right( + ipc_space_t space, + mach_port_name_t name, + mach_msg_type_name_t msgt_name); + /* Copyout a capability, placing it into a space */ extern kern_return_t ipc_object_copyout( ipc_space_t space, ipc_object_t object, mach_msg_type_name_t msgt_name, - boolean_t overflow, + mach_port_context_t *context, + mach_msg_guard_flags_t *guard_flags, mach_port_name_t *namep); /* Copyout a capability with a name, placing it into a space */ @@ -345,7 +369,6 @@ extern kern_return_t ipc_object_copyout_name( ipc_space_t space, ipc_object_t object, mach_msg_type_name_t msgt_name, - boolean_t overflow, mach_port_name_t name); /* Translate/consume the destination right of a message */ @@ -355,10 +378,4 @@ extern void ipc_object_copyout_dest( mach_msg_type_name_t msgt_name, mach_port_name_t *namep); -/* Rename an entry in a space */ -extern kern_return_t ipc_object_rename( - ipc_space_t space, - mach_port_name_t oname, - mach_port_name_t nname); - #endif /* _IPC_IPC_OBJECT_H_ */ diff --git a/osfmk/ipc/ipc_port.c b/osfmk/ipc/ipc_port.c index ee9a7571e..cd8c04b81 100644 --- a/osfmk/ipc/ipc_port.c +++ b/osfmk/ipc/ipc_port.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2007 Apple Inc. All rights reserved. + * Copyright (c) 2000-2019 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -91,16 +91,17 @@ #include #include #include -#include +#include #include #include #include -decl_lck_spin_data(, ipc_port_multiple_lock_data) +decl_lck_spin_data(, ipc_port_multiple_lock_data); ipc_port_timestamp_t ipc_port_timestamp_data; int ipc_portbt; +extern int prioritize_launch; #if MACH_ASSERT void ipc_port_init_debug( @@ -114,6 +115,14 @@ void ipc_port_callstack_init_debug( #endif /* MACH_ASSERT */ +static void +ipc_port_send_turnstile_recompute_push_locked( + ipc_port_t port); + +static thread_t +ipc_port_get_watchport_inheritor( + ipc_port_t port); + void ipc_port_release(ipc_port_t port) { @@ -180,7 +189,7 @@ ipc_port_request_alloc( *importantp = FALSE; #endif /* IMPORTANCE_INHERITANCE */ - assert(ip_active(port)); + require_ip_active(port); assert(name != MACH_PORT_NULL); assert(soright != IP_NULL); @@ -248,8 +257,7 @@ ipc_port_request_grow( { ipc_table_size_t its; ipc_port_request_t otable, ntable; - - assert(ip_active(port)); + require_ip_active(port); otable = port->ip_requests; if (otable == IPR_NULL) { @@ -366,7 +374,7 @@ ipc_port_request_sparm( if (index != IE_REQ_NONE) { ipc_port_request_t ipr, table; - assert(ip_active(port)); + require_ip_active(port); table = port->ip_requests; assert(table != IPR_NULL); @@ -456,7 +464,7 @@ ipc_port_request_cancel( ipc_port_request_t ipr, table; ipc_port_t request = IP_NULL; - assert(ip_active(port)); + require_ip_active(port); table = port->ip_requests; assert(table != IPR_NULL); @@ -492,8 +500,7 @@ ipc_port_pdrequest( ipc_port_t *previousp) { ipc_port_t previous; - - assert(ip_active(port)); + require_ip_active(port); previous = port->ip_pdrequest; port->ip_pdrequest = notify; @@ -523,8 +530,7 @@ ipc_port_nsrequest( { ipc_port_t previous; mach_port_mscount_t mscount; - - assert(ip_active(port)); + require_ip_active(port); previous = port->ip_nsrequest; mscount = port->ip_mscount; @@ -579,7 +585,7 @@ ipc_port_clear_receiver( /* * Send anyone waiting on the port's queue directly away. - * Also clear the mscount and seqno. + * Also clear the mscount, seqno, guard bits */ imq_lock(mqueue); if (port->ip_receiver_name) { @@ -590,6 +596,11 @@ ipc_port_clear_receiver( port->ip_mscount = 0; mqueue->imq_seqno = 0; port->ip_context = port->ip_guarded = port->ip_strict_guard = 0; + /* + * clear the immovable bit so the port can move back to anyone listening + * for the port destroy notification + */ + port->ip_immovable_receive = 0; if (should_destroy) { /* @@ -644,6 +655,7 @@ ipc_port_init( port->ip_premsg = IKM_NULL; port->ip_context = 0; + port->ip_reply_context = 0; port->ip_sprequests = 0; port->ip_spimportant = 0; @@ -652,12 +664,17 @@ ipc_port_init( port->ip_guarded = 0; port->ip_strict_guard = 0; + port->ip_immovable_receive = 0; + port->ip_no_grant = 0; + port->ip_immovable_send = 0; port->ip_impcount = 0; port->ip_specialreply = 0; port->ip_sync_link_state = PORT_SYNC_LINK_ANY; + port->ip_sync_bootstrap_checkin = 0; + port->ip_watchport_elem = NULL; - reset_ip_srp_bits(port); + ipc_special_reply_port_bits_reset(port); port->ip_send_turnstile = TURNSTILE_NULL; @@ -682,20 +699,26 @@ ipc_port_init( kern_return_t ipc_port_alloc( ipc_space_t space, + bool make_send_right, mach_port_name_t *namep, ipc_port_t *portp) { ipc_port_t port; mach_port_name_t name; kern_return_t kr; + mach_port_type_t type = MACH_PORT_TYPE_RECEIVE; + mach_port_urefs_t urefs = 0; #if MACH_ASSERT uintptr_t buf[IP_CALLSTACK_MAX]; ipc_port_callstack_init_debug(&buf[0], IP_CALLSTACK_MAX); #endif /* MACH_ASSERT */ - kr = ipc_object_alloc(space, IOT_PORT, - MACH_PORT_TYPE_RECEIVE, 0, + if (make_send_right) { + type |= MACH_PORT_TYPE_SEND; + urefs = 1; + } + kr = ipc_object_alloc(space, IOT_PORT, type, urefs, &name, (ipc_object_t *) &port); if (kr != KERN_SUCCESS) { return kr; @@ -704,6 +727,12 @@ ipc_port_alloc( /* port and space are locked */ ipc_port_init(port, space, name); + if (make_send_right) { + /* ipc_object_alloc() already made the entry reference */ + port->ip_srights++; + port->ip_mscount++; + } + #if MACH_ASSERT ipc_port_init_debug(port, &buf[0], IP_CALLSTACK_MAX); #endif /* MACH_ASSERT */ @@ -898,6 +927,7 @@ ipc_port_destroy(ipc_port_t port) ipc_mqueue_t mqueue; ipc_kmsg_t kmsg; boolean_t special_reply = port->ip_specialreply; + struct task_watchport_elem *watchport_elem = NULL; #if IMPORTANCE_INHERITANCE ipc_importance_task_t release_imp_task = IIT_NULL; @@ -906,10 +936,13 @@ ipc_port_destroy(ipc_port_t port) natural_t assertcnt = 0; #endif /* IMPORTANCE_INHERITANCE */ - assert(ip_active(port)); + require_ip_active(port); /* port->ip_receiver_name is garbage */ /* port->ip_receiver/port->ip_destination is garbage */ + /* clear any reply-port context */ + port->ip_reply_context = 0; + /* check for a backup port */ pdrequest = port->ip_pdrequest; @@ -944,20 +977,27 @@ ipc_port_destroy(ipc_port_t port) /* we assume the ref for pdrequest */ port->ip_pdrequest = IP_NULL; - ip_unlock(port); + + imq_lock(&port->ip_messages); + watchport_elem = ipc_port_clear_watchport_elem_internal(port); + ipc_port_send_turnstile_recompute_push_locked(port); + /* mqueue and port unlocked */ if (special_reply) { ipc_port_adjust_special_reply_port(port, IPC_PORT_ADJUST_SR_ALLOW_SYNC_LINKAGE, FALSE); } + + if (watchport_elem) { + task_watchport_elem_deallocate(watchport_elem); + watchport_elem = NULL; + } /* consumes our refs for port and pdrequest */ ipc_notify_port_destroyed(pdrequest, port); goto drop_assertions; } - nsrequest = port->ip_nsrequest; - /* * The mach_msg_* paths don't hold a port lock, they only hold a * reference to the port object. If a thread raced us and is now @@ -973,6 +1013,11 @@ ipc_port_destroy(ipc_port_t port) assert(port->ip_in_pset == 0); assert(port->ip_mscount == 0); + imq_lock(&port->ip_messages); + watchport_elem = ipc_port_clear_watchport_elem_internal(port); + imq_unlock(&port->ip_messages); + nsrequest = port->ip_nsrequest; + /* * If the port has a preallocated message buffer and that buffer * is not inuse, free it. If it has an inuse one, then the kmsg @@ -988,14 +1033,26 @@ ipc_port_destroy(ipc_port_t port) assert(kmsg != IKM_NULL); inuse_port = ikm_prealloc_inuse_port(kmsg); ipc_kmsg_clear_prealloc(kmsg, port); - ip_unlock(port); + + imq_lock(&port->ip_messages); + ipc_port_send_turnstile_recompute_push_locked(port); + /* mqueue and port unlocked */ + if (inuse_port != IP_NULL) { assert(inuse_port == port); } else { ipc_kmsg_free(kmsg); } } else { - ip_unlock(port); + imq_lock(&port->ip_messages); + ipc_port_send_turnstile_recompute_push_locked(port); + /* mqueue and port unlocked */ + } + + /* Deallocate the watchport element */ + if (watchport_elem) { + task_watchport_elem_deallocate(watchport_elem); + watchport_elem = NULL; } /* unlink the kmsg from special reply port */ @@ -1077,6 +1134,7 @@ ipc_port_check_circularity( return ipc_importance_check_circularity(port, dest); #else ipc_port_t base; + struct task_watchport_elem *watchport_elem = NULL; assert(port != IP_NULL); assert(dest != IP_NULL); @@ -1134,8 +1192,7 @@ ipc_port_check_circularity( ipc_port_multiple_unlock(); /* port (== base) is in limbo */ - - assert(ip_active(port)); + require_ip_active(port); assert(port->ip_receiver_name == MACH_PORT_NULL); assert(port->ip_destination == IP_NULL); @@ -1144,8 +1201,7 @@ ipc_port_check_circularity( ipc_port_t next; /* dest is in transit or in limbo */ - - assert(ip_active(base)); + require_ip_active(base); assert(base->ip_receiver_name == MACH_PORT_NULL); next = base->ip_destination; @@ -1170,11 +1226,18 @@ not_circular: imq_lock(&port->ip_messages); /* port is in limbo */ - - assert(ip_active(port)); + require_ip_active(port); assert(port->ip_receiver_name == MACH_PORT_NULL); assert(port->ip_destination == IP_NULL); + /* Clear the watchport boost */ + watchport_elem = ipc_port_clear_watchport_elem_internal(port); + + /* Check if the port is being enqueued as a part of sync bootstrap checkin */ + if (dest->ip_specialreply && dest->ip_sync_bootstrap_checkin) { + port->ip_sync_bootstrap_checkin = 1; + } + ip_reference(dest); port->ip_destination = dest; @@ -1185,6 +1248,13 @@ not_circular: port_send_turnstile_address(port), TURNSTILE_NULL, TURNSTILE_SYNC_IPC); + /* + * What ipc_port_adjust_port_locked would do, + * but we need to also drop even more locks before + * calling turnstile_update_inheritor_complete(). + */ + ipc_port_adjust_sync_link_state_locked(port, PORT_SYNC_LINK_ANY, NULL); + turnstile_update_inheritor(send_turnstile, port_send_turnstile(dest), (TURNSTILE_INHERITOR_TURNSTILE | TURNSTILE_IMMEDIATE_UPDATE)); @@ -1204,8 +1274,7 @@ not_circular: } /* port is in transit */ - - assert(ip_active(dest)); + require_ip_active(dest); assert(dest->ip_receiver_name == MACH_PORT_NULL); assert(dest->ip_destination != IP_NULL); @@ -1227,35 +1296,153 @@ not_circular: /* Take the mq lock to call turnstile complete */ imq_lock(&port->ip_messages); - turnstile_complete((uintptr_t)port, port_send_turnstile_address(port), NULL); + turnstile_complete((uintptr_t)port, port_send_turnstile_address(port), NULL, TURNSTILE_SYNC_IPC); send_turnstile = TURNSTILE_NULL; imq_unlock(&port->ip_messages); turnstile_cleanup(); } + if (watchport_elem) { + task_watchport_elem_deallocate(watchport_elem); + } + return FALSE; #endif /* !IMPORTANCE_INHERITANCE */ } -struct turnstile * -ipc_port_get_inheritor(ipc_port_t port) +/* + * Update the recv turnstile inheritor for a port. + * + * Sync IPC through the port receive turnstile only happens for the special + * reply port case. It has three sub-cases: + * + * 1. a send-once right is in transit, and pushes on the send turnstile of its + * destination mqueue. + * + * 2. a send-once right has been stashed on a knote it was copied out "through", + * as the first such copied out port. + * + * 3. a send-once right has been stashed on a knote it was copied out "through", + * as the second or more copied out port. + */ +void +ipc_port_recv_update_inheritor( + ipc_port_t port, + struct turnstile *rcv_turnstile, + turnstile_update_flags_t flags) { - ipc_mqueue_t mqueue = &port->ip_messages; + struct turnstile *inheritor = TURNSTILE_NULL; struct knote *kn; - assert(imq_held(mqueue)); + if (ip_active(port) && port->ip_specialreply) { + imq_held(&port->ip_messages); + + switch (port->ip_sync_link_state) { + case PORT_SYNC_LINK_PORT: + if (port->ip_sync_inheritor_port != NULL) { + inheritor = port_send_turnstile(port->ip_sync_inheritor_port); + } + break; - if (!IMQ_KLIST_VALID(mqueue)) { - return IMQ_INHERITOR(mqueue); + case PORT_SYNC_LINK_WORKLOOP_KNOTE: + kn = port->ip_sync_inheritor_knote; + inheritor = filt_ipc_kqueue_turnstile(kn); + break; + + case PORT_SYNC_LINK_WORKLOOP_STASH: + inheritor = port->ip_sync_inheritor_ts; + break; + } } - SLIST_FOREACH(kn, &port->ip_messages.imq_klist, kn_selnext) { - if ((kn->kn_sfflags & MACH_RCV_MSG) && (kn->kn_status & KN_DISPATCH)) { - return filt_machport_kqueue_turnstile(kn); + turnstile_update_inheritor(rcv_turnstile, inheritor, + flags | TURNSTILE_INHERITOR_TURNSTILE); +} + +/* + * Update the send turnstile inheritor for a port. + * + * Sync IPC through the port send turnstile has 7 possible reasons to be linked: + * + * 1. a special reply port is part of sync ipc for bootstrap checkin and needs + * to push on thread doing the sync ipc. + * + * 2. a receive right is in transit, and pushes on the send turnstile of its + * destination mqueue. + * + * 3. port was passed as an exec watchport and port is pushing on main thread + * of the task. + * + * 4. a receive right has been stashed on a knote it was copied out "through", + * as the first such copied out port (same as PORT_SYNC_LINK_WORKLOOP_KNOTE + * for the special reply port) + * + * 5. a receive right has been stashed on a knote it was copied out "through", + * as the second or more copied out port (same as + * PORT_SYNC_LINK_WORKLOOP_STASH for the special reply port) + * + * 6. a receive right has been copied out as a part of sync bootstrap checkin + * and needs to push on thread doing the sync bootstrap checkin. + * + * 7. the receive right is monitored by a knote, and pushes on any that is + * registered on a workloop. filt_machport makes sure that if such a knote + * exists, it is kept as the first item in the knote list, so we never need + * to walk. + */ +void +ipc_port_send_update_inheritor( + ipc_port_t port, + struct turnstile *send_turnstile, + turnstile_update_flags_t flags) +{ + ipc_mqueue_t mqueue = &port->ip_messages; + turnstile_inheritor_t inheritor = TURNSTILE_INHERITOR_NULL; + struct knote *kn; + turnstile_update_flags_t inheritor_flags = TURNSTILE_INHERITOR_TURNSTILE; + + assert(imq_held(mqueue)); + + if (!ip_active(port)) { + /* this port is no longer active, it should not push anywhere */ + } else if (port->ip_specialreply) { + /* Case 1. */ + if (port->ip_sync_bootstrap_checkin && prioritize_launch) { + inheritor = port->ip_messages.imq_srp_owner_thread; + inheritor_flags = TURNSTILE_INHERITOR_THREAD; + } + } else if (port->ip_receiver_name == MACH_PORT_NULL && + port->ip_destination != NULL) { + /* Case 2. */ + inheritor = port_send_turnstile(port->ip_destination); + } else if (port->ip_watchport_elem != NULL) { + /* Case 3. */ + if (prioritize_launch) { + assert(port->ip_sync_link_state == PORT_SYNC_LINK_ANY); + inheritor = ipc_port_get_watchport_inheritor(port); + inheritor_flags = TURNSTILE_INHERITOR_THREAD; + } + } else if (port->ip_sync_link_state == PORT_SYNC_LINK_WORKLOOP_KNOTE) { + /* Case 4. */ + inheritor = filt_ipc_kqueue_turnstile(mqueue->imq_inheritor_knote); + } else if (port->ip_sync_link_state == PORT_SYNC_LINK_WORKLOOP_STASH) { + /* Case 5. */ + inheritor = mqueue->imq_inheritor_turnstile; + } else if (port->ip_sync_link_state == PORT_SYNC_LINK_RCV_THREAD) { + /* Case 6. */ + if (prioritize_launch) { + inheritor = port->ip_messages.imq_inheritor_thread_ref; + inheritor_flags = TURNSTILE_INHERITOR_THREAD; + } + } else if ((kn = SLIST_FIRST(&mqueue->imq_klist))) { + /* Case 7. Push on a workloop that is interested */ + if (filt_machport_kqueue_has_turnstile(kn)) { + assert(port->ip_sync_link_state == PORT_SYNC_LINK_ANY); + inheritor = filt_ipc_kqueue_turnstile(kn); } } - return TURNSTILE_NULL; + turnstile_update_inheritor(send_turnstile, inheritor, + flags | inheritor_flags); } /* @@ -1271,7 +1458,6 @@ void ipc_port_send_turnstile_prepare(ipc_port_t port) { struct turnstile *turnstile = TURNSTILE_NULL; - struct turnstile *inheritor = TURNSTILE_NULL; struct turnstile *send_turnstile = TURNSTILE_NULL; retry_alloc: @@ -1290,22 +1476,9 @@ retry_alloc: turnstile, TURNSTILE_SYNC_IPC); turnstile = TURNSTILE_NULL; - /* - * if port in transit, setup linkage for its turnstile, - * otherwise the link it to WL turnstile. - */ - if (ip_active(port) && - port->ip_receiver_name == MACH_PORT_NULL && - port->ip_destination != IP_NULL) { - assert(port->ip_receiver_name == MACH_PORT_NULL); - assert(port->ip_destination != IP_NULL); + ipc_port_send_update_inheritor(port, send_turnstile, + TURNSTILE_IMMEDIATE_UPDATE); - inheritor = port_send_turnstile(port->ip_destination); - } else { - inheritor = ipc_port_get_inheritor(port); - } - turnstile_update_inheritor(send_turnstile, inheritor, - TURNSTILE_INHERITOR_TURNSTILE | TURNSTILE_IMMEDIATE_UPDATE); /* turnstile complete will be called in ipc_port_send_turnstile_complete */ } @@ -1343,7 +1516,7 @@ ipc_port_send_turnstile_complete(ipc_port_t port) port_send_turnstile(port)->ts_port_ref--; if (port_send_turnstile(port)->ts_port_ref == 0) { turnstile_complete((uintptr_t)port, port_send_turnstile_address(port), - &turnstile); + &turnstile, TURNSTILE_SYNC_IPC); assert(turnstile != TURNSTILE_NULL); } imq_unlock(&port->ip_messages); @@ -1355,6 +1528,20 @@ ipc_port_send_turnstile_complete(ipc_port_t port) } } +/* + * Routine: ipc_port_rcv_turnstile + * Purpose: + * Get the port's receive turnstile + * + * Conditions: + * mqueue locked or thread waiting on turnstile is locked. + */ +static struct turnstile * +ipc_port_rcv_turnstile(ipc_port_t port) +{ + return turnstile_lookup_by_proprietor((uintptr_t)port, TURNSTILE_SYNC_IPC); +} + /* * Routine: ipc_port_rcv_turnstile_waitq @@ -1384,21 +1571,6 @@ ipc_port_rcv_turnstile_waitq(struct waitq *waitq) } -/* - * Routine: ipc_port_rcv_turnstile - * Purpose: - * Get the port's receive turnstile - * - * Conditions: - * mqueue locked or thread waiting on turnstile is locked. - */ -struct turnstile * -ipc_port_rcv_turnstile(ipc_port_t port) -{ - return turnstile_lookup_by_proprietor((uintptr_t)port); -} - - /* * Routine: ipc_port_link_special_reply_port * Purpose: @@ -1411,7 +1583,8 @@ ipc_port_rcv_turnstile(ipc_port_t port) void ipc_port_link_special_reply_port( ipc_port_t special_reply_port, - ipc_port_t dest_port) + ipc_port_t dest_port, + boolean_t sync_bootstrap_checkin) { boolean_t drop_turnstile_ref = FALSE; @@ -1422,6 +1595,10 @@ ipc_port_link_special_reply_port( ip_lock(special_reply_port); imq_lock(&special_reply_port->ip_messages); + if (sync_bootstrap_checkin && special_reply_port->ip_specialreply) { + special_reply_port->ip_sync_bootstrap_checkin = 1; + } + /* Check if we need to drop the acquired turnstile ref on dest port */ if (!special_reply_port->ip_specialreply || special_reply_port->ip_sync_link_state != PORT_SYNC_LINK_ANY || @@ -1446,14 +1623,14 @@ ipc_port_link_special_reply_port( #if DEVELOPMENT || DEBUG inline void -reset_ip_srp_bits(ipc_port_t special_reply_port) +ipc_special_reply_port_bits_reset(ipc_port_t special_reply_port) { special_reply_port->ip_srp_lost_link = 0; special_reply_port->ip_srp_msg_sent = 0; } -inline void -reset_ip_srp_msg_sent(ipc_port_t special_reply_port) +static inline void +ipc_special_reply_port_msg_sent_reset(ipc_port_t special_reply_port) { if (special_reply_port->ip_specialreply == 1) { special_reply_port->ip_srp_msg_sent = 0; @@ -1461,15 +1638,15 @@ reset_ip_srp_msg_sent(ipc_port_t special_reply_port) } inline void -set_ip_srp_msg_sent(ipc_port_t special_reply_port) +ipc_special_reply_port_msg_sent(ipc_port_t special_reply_port) { if (special_reply_port->ip_specialreply == 1) { special_reply_port->ip_srp_msg_sent = 1; } } -inline void -set_ip_srp_lost_link(ipc_port_t special_reply_port) +static inline void +ipc_special_reply_port_lost_link(ipc_port_t special_reply_port) { if (special_reply_port->ip_specialreply == 1 && special_reply_port->ip_srp_msg_sent == 0) { special_reply_port->ip_srp_lost_link = 1; @@ -1478,25 +1655,25 @@ set_ip_srp_lost_link(ipc_port_t special_reply_port) #else /* DEVELOPMENT || DEBUG */ inline void -reset_ip_srp_bits(__unused ipc_port_t special_reply_port) +ipc_special_reply_port_bits_reset(__unused ipc_port_t special_reply_port) { return; } -inline void -reset_ip_srp_msg_sent(__unused ipc_port_t special_reply_port) +static inline void +ipc_special_reply_port_msg_sent_reset(__unused ipc_port_t special_reply_port) { return; } inline void -set_ip_srp_msg_sent(__unused ipc_port_t special_reply_port) +ipc_special_reply_port_msg_sent(__unused ipc_port_t special_reply_port) { return; } -inline void -set_ip_srp_lost_link(__unused ipc_port_t special_reply_port) +static inline void +ipc_special_reply_port_lost_link(__unused ipc_port_t special_reply_port) { return; } @@ -1505,10 +1682,11 @@ set_ip_srp_lost_link(__unused ipc_port_t special_reply_port) /* * Routine: ipc_port_adjust_special_reply_port_locked * Purpose: - * If the special port has a turnstile, update it's inheritor. + * If the special port has a turnstile, update its inheritor. * Condition: * Special reply port locked on entry. * Special reply port unlocked on return. + * The passed in port is a special reply port. * Returns: * None. */ @@ -1522,21 +1700,30 @@ ipc_port_adjust_special_reply_port_locked( ipc_port_t dest_port = IPC_PORT_NULL; int sync_link_state = PORT_SYNC_LINK_NO_LINKAGE; turnstile_inheritor_t inheritor = TURNSTILE_INHERITOR_NULL; - struct turnstile *dest_ts = TURNSTILE_NULL, *ts = TURNSTILE_NULL; + struct turnstile *ts = TURNSTILE_NULL; + assert(special_reply_port->ip_specialreply); + + ip_lock_held(special_reply_port); // ip_sync_link_state is touched imq_lock(&special_reply_port->ip_messages); if (flags & IPC_PORT_ADJUST_SR_RECEIVED_MSG) { - reset_ip_srp_msg_sent(special_reply_port); + ipc_special_reply_port_msg_sent_reset(special_reply_port); + } + + if (flags & IPC_PORT_ADJUST_UNLINK_THREAD) { + special_reply_port->ip_messages.imq_srp_owner_thread = NULL; + } + + if (flags & IPC_PORT_ADJUST_RESET_BOOSTRAP_CHECKIN) { + special_reply_port->ip_sync_bootstrap_checkin = 0; } /* Check if the special reply port is marked non-special */ - if (special_reply_port->ip_specialreply == 0 || - special_reply_port->ip_sync_link_state == PORT_SYNC_LINK_ANY) { + if (special_reply_port->ip_sync_link_state == PORT_SYNC_LINK_ANY) { if (get_turnstile) { turnstile_complete((uintptr_t)special_reply_port, - port_rcv_turnstile_address(special_reply_port), - NULL); + port_rcv_turnstile_address(special_reply_port), NULL, TURNSTILE_SYNC_IPC); } imq_unlock(&special_reply_port->ip_messages); ip_unlock(special_reply_port); @@ -1546,32 +1733,23 @@ ipc_port_adjust_special_reply_port_locked( return; } - /* Clear thread's special reply port and clear linkage */ - if (flags & IPC_PORT_ADJUST_SR_CLEAR_SPECIAL_REPLY) { - /* This option should only be specified by a non blocking thread */ - assert(get_turnstile == FALSE); - special_reply_port->ip_specialreply = 0; - - reset_ip_srp_bits(special_reply_port); - - /* Check if need to break linkage */ - if (special_reply_port->ip_sync_link_state == PORT_SYNC_LINK_NO_LINKAGE) { - imq_unlock(&special_reply_port->ip_messages); - ip_unlock(special_reply_port); - return; - } - } else if (flags & IPC_PORT_ADJUST_SR_LINK_WORKLOOP) { - if (special_reply_port->ip_sync_link_state == PORT_SYNC_LINK_ANY || - special_reply_port->ip_sync_link_state == PORT_SYNC_LINK_PORT) { - if (ITH_KNOTE_VALID(kn, MACH_MSG_TYPE_PORT_SEND_ONCE)) { - inheritor = filt_machport_stash_port(kn, special_reply_port, - &sync_link_state); - } + if (flags & IPC_PORT_ADJUST_SR_LINK_WORKLOOP) { + if (ITH_KNOTE_VALID(kn, MACH_MSG_TYPE_PORT_SEND_ONCE)) { + inheritor = filt_machport_stash_port(kn, special_reply_port, + &sync_link_state); } } else if (flags & IPC_PORT_ADJUST_SR_ALLOW_SYNC_LINKAGE) { sync_link_state = PORT_SYNC_LINK_ANY; } + /* Check if need to break linkage */ + if (!get_turnstile && sync_link_state == PORT_SYNC_LINK_NO_LINKAGE && + special_reply_port->ip_sync_link_state == PORT_SYNC_LINK_NO_LINKAGE) { + imq_unlock(&special_reply_port->ip_messages); + ip_unlock(special_reply_port); + return; + } + switch (special_reply_port->ip_sync_link_state) { case PORT_SYNC_LINK_PORT: dest_port = special_reply_port->ip_sync_inheritor_port; @@ -1581,7 +1759,6 @@ ipc_port_adjust_special_reply_port_locked( special_reply_port->ip_sync_inheritor_knote = NULL; break; case PORT_SYNC_LINK_WORKLOOP_STASH: - dest_ts = special_reply_port->ip_sync_inheritor_ts; special_reply_port->ip_sync_inheritor_ts = NULL; break; } @@ -1593,12 +1770,11 @@ ipc_port_adjust_special_reply_port_locked( special_reply_port->ip_sync_inheritor_knote = kn; break; case PORT_SYNC_LINK_WORKLOOP_STASH: - turnstile_reference(inheritor); special_reply_port->ip_sync_inheritor_ts = inheritor; break; case PORT_SYNC_LINK_NO_LINKAGE: if (flags & IPC_PORT_ADJUST_SR_ENABLE_EVENT) { - set_ip_srp_lost_link(special_reply_port); + ipc_special_reply_port_lost_link(special_reply_port); } break; } @@ -1606,14 +1782,13 @@ ipc_port_adjust_special_reply_port_locked( /* Get thread's turnstile donated to special reply port */ if (get_turnstile) { turnstile_complete((uintptr_t)special_reply_port, - port_rcv_turnstile_address(special_reply_port), - NULL); + port_rcv_turnstile_address(special_reply_port), NULL, TURNSTILE_SYNC_IPC); } else { ts = ipc_port_rcv_turnstile(special_reply_port); if (ts) { turnstile_reference(ts); - turnstile_update_inheritor(ts, inheritor, - (TURNSTILE_INHERITOR_TURNSTILE | TURNSTILE_IMMEDIATE_UPDATE)); + ipc_port_recv_update_inheritor(special_reply_port, ts, + TURNSTILE_IMMEDIATE_UPDATE); } } @@ -1628,22 +1803,18 @@ ipc_port_adjust_special_reply_port_locked( turnstile_deallocate_safe(ts); } - /* Release the ref on the dest port and it's turnstile */ + /* Release the ref on the dest port and its turnstile */ if (dest_port) { ipc_port_send_turnstile_complete(dest_port); /* release the reference on the dest port */ ip_release(dest_port); } - - if (dest_ts) { - turnstile_deallocate_safe(dest_ts); - } } /* * Routine: ipc_port_adjust_special_reply_port * Purpose: - * If the special port has a turnstile, update it's inheritor. + * If the special port has a turnstile, update its inheritor. * Condition: * Nothing locked. * Returns: @@ -1655,39 +1826,310 @@ ipc_port_adjust_special_reply_port( uint8_t flags, boolean_t get_turnstile) { - ip_lock(special_reply_port); - ipc_port_adjust_special_reply_port_locked(special_reply_port, NULL, flags, get_turnstile); - /* special_reply_port unlocked */ + if (special_reply_port->ip_specialreply) { + ip_lock(special_reply_port); + ipc_port_adjust_special_reply_port_locked(special_reply_port, NULL, + flags, get_turnstile); + /* special_reply_port unlocked */ + } + if (get_turnstile) { + assert(current_thread()->turnstile != TURNSTILE_NULL); + } } /* - * Routine: ipc_port_get_special_reply_port_inheritor + * Routine: ipc_port_adjust_sync_link_state_locked * Purpose: - * Returns the current inheritor of the special reply port + * Update the sync link state of the port and the + * turnstile inheritor. * Condition: - * mqueue is locked, port is a special reply port + * Port and mqueue locked on entry. + * Port and mqueue locked on return. * Returns: - * the current inheritor + * None. */ -turnstile_inheritor_t -ipc_port_get_special_reply_port_inheritor( - ipc_port_t port) +void +ipc_port_adjust_sync_link_state_locked( + ipc_port_t port, + int sync_link_state, + turnstile_inheritor_t inheritor) { - assert(port->ip_specialreply); - imq_held(&port->ip_messages); - switch (port->ip_sync_link_state) { - case PORT_SYNC_LINK_PORT: - if (port->ip_sync_inheritor_port != NULL) { - return port_send_turnstile(port->ip_sync_inheritor_port); - } - break; + case PORT_SYNC_LINK_RCV_THREAD: + /* deallocate the thread reference for the inheritor */ + thread_deallocate_safe(port->ip_messages.imq_inheritor_thread_ref); + /* Fall through */ + + default: + klist_init(&port->ip_messages.imq_klist); + } + + switch (sync_link_state) { case PORT_SYNC_LINK_WORKLOOP_KNOTE: - return filt_machport_stashed_special_reply_port_turnstile(port); + port->ip_messages.imq_inheritor_knote = inheritor; + break; case PORT_SYNC_LINK_WORKLOOP_STASH: - return port->ip_sync_inheritor_ts; + port->ip_messages.imq_inheritor_turnstile = inheritor; + break; + case PORT_SYNC_LINK_RCV_THREAD: + /* The thread could exit without clearing port state, take a thread ref */ + thread_reference((thread_t)inheritor); + port->ip_messages.imq_inheritor_thread_ref = inheritor; + break; + default: + klist_init(&port->ip_messages.imq_klist); + sync_link_state = PORT_SYNC_LINK_ANY; + } + + port->ip_sync_link_state = sync_link_state; +} + + +/* + * Routine: ipc_port_adjust_port_locked + * Purpose: + * If the port has a turnstile, update its inheritor. + * Condition: + * Port locked on entry. + * Port unlocked on return. + * Returns: + * None. + */ +void +ipc_port_adjust_port_locked( + ipc_port_t port, + struct knote *kn, + boolean_t sync_bootstrap_checkin) +{ + int sync_link_state = PORT_SYNC_LINK_ANY; + turnstile_inheritor_t inheritor = TURNSTILE_INHERITOR_NULL; + + ip_lock_held(port); // ip_sync_link_state is touched + imq_held(&port->ip_messages); + + assert(!port->ip_specialreply); + + if (kn) { + inheritor = filt_machport_stash_port(kn, port, &sync_link_state); + if (sync_link_state == PORT_SYNC_LINK_WORKLOOP_KNOTE) { + inheritor = kn; + } + } else if (sync_bootstrap_checkin) { + inheritor = current_thread(); + sync_link_state = PORT_SYNC_LINK_RCV_THREAD; } - return TURNSTILE_INHERITOR_NULL; + + ipc_port_adjust_sync_link_state_locked(port, sync_link_state, inheritor); + port->ip_sync_bootstrap_checkin = 0; + + ipc_port_send_turnstile_recompute_push_locked(port); + /* port and mqueue unlocked */ +} + +/* + * Routine: ipc_port_clear_sync_rcv_thread_boost_locked + * Purpose: + * If the port is pushing on rcv thread, clear it. + * Condition: + * Port locked on entry + * mqueue is not locked. + * Port unlocked on return. + * Returns: + * None. + */ +void +ipc_port_clear_sync_rcv_thread_boost_locked( + ipc_port_t port) +{ + ip_lock_held(port); // ip_sync_link_state is touched + + if (port->ip_sync_link_state != PORT_SYNC_LINK_RCV_THREAD) { + ip_unlock(port); + return; + } + + imq_lock(&port->ip_messages); + ipc_port_adjust_sync_link_state_locked(port, PORT_SYNC_LINK_ANY, NULL); + + ipc_port_send_turnstile_recompute_push_locked(port); + /* port and mqueue unlocked */ +} + +/* + * Routine: ipc_port_add_watchport_elem_locked + * Purpose: + * Transfer the turnstile boost of watchport to task calling exec. + * Condition: + * Port locked on entry. + * Port unlocked on return. + * Returns: + * KERN_SUCESS on success. + * KERN_FAILURE otherwise. + */ +kern_return_t +ipc_port_add_watchport_elem_locked( + ipc_port_t port, + struct task_watchport_elem *watchport_elem, + struct task_watchport_elem **old_elem) +{ + ip_lock_held(port); + imq_held(&port->ip_messages); + + /* Watchport boost only works for non-special active ports mapped in an ipc space */ + if (!ip_active(port) || port->ip_specialreply || + port->ip_receiver_name == MACH_PORT_NULL) { + imq_unlock(&port->ip_messages); + ip_unlock(port); + return KERN_FAILURE; + } + + if (port->ip_sync_link_state != PORT_SYNC_LINK_ANY) { + /* Sever the linkage if the port was pushing on knote */ + ipc_port_adjust_sync_link_state_locked(port, PORT_SYNC_LINK_ANY, NULL); + } + + *old_elem = port->ip_watchport_elem; + port->ip_watchport_elem = watchport_elem; + + ipc_port_send_turnstile_recompute_push_locked(port); + /* port and mqueue unlocked */ + return KERN_SUCCESS; +} + +/* + * Routine: ipc_port_clear_watchport_elem_internal_conditional_locked + * Purpose: + * Remove the turnstile boost of watchport and recompute the push. + * Condition: + * Port locked on entry. + * Port unlocked on return. + * Returns: + * KERN_SUCESS on success. + * KERN_FAILURE otherwise. + */ +kern_return_t +ipc_port_clear_watchport_elem_internal_conditional_locked( + ipc_port_t port, + struct task_watchport_elem *watchport_elem) +{ + ip_lock_held(port); + imq_held(&port->ip_messages); + + if (port->ip_watchport_elem != watchport_elem) { + imq_unlock(&port->ip_messages); + ip_unlock(port); + return KERN_FAILURE; + } + + ipc_port_clear_watchport_elem_internal(port); + ipc_port_send_turnstile_recompute_push_locked(port); + /* port and mqueue unlocked */ + return KERN_SUCCESS; +} + +/* + * Routine: ipc_port_replace_watchport_elem_conditional_locked + * Purpose: + * Replace the turnstile boost of watchport and recompute the push. + * Condition: + * Port locked on entry. + * Port unlocked on return. + * Returns: + * KERN_SUCESS on success. + * KERN_FAILURE otherwise. + */ +kern_return_t +ipc_port_replace_watchport_elem_conditional_locked( + ipc_port_t port, + struct task_watchport_elem *old_watchport_elem, + struct task_watchport_elem *new_watchport_elem) +{ + ip_lock_held(port); + imq_held(&port->ip_messages); + + if (port->ip_watchport_elem != old_watchport_elem) { + imq_unlock(&port->ip_messages); + ip_unlock(port); + return KERN_FAILURE; + } + + port->ip_watchport_elem = new_watchport_elem; + ipc_port_send_turnstile_recompute_push_locked(port); + /* port and mqueue unlocked */ + return KERN_SUCCESS; +} + +/* + * Routine: ipc_port_clear_watchport_elem_internal + * Purpose: + * Remove the turnstile boost of watchport. + * Condition: + * Port locked on entry. + * Port locked on return. + * Returns: + * Old task_watchport_elem returned. + */ +struct task_watchport_elem * +ipc_port_clear_watchport_elem_internal( + ipc_port_t port) +{ + struct task_watchport_elem *watchport_elem; + + ip_lock_held(port); + imq_held(&port->ip_messages); + + watchport_elem = port->ip_watchport_elem; + port->ip_watchport_elem = NULL; + + return watchport_elem; +} + +/* + * Routine: ipc_port_send_turnstile_recompute_push_locked + * Purpose: + * Update send turnstile inheritor of port and recompute the push. + * Condition: + * Port locked on entry. + * Port unlocked on return. + * Returns: + * None. + */ +static void +ipc_port_send_turnstile_recompute_push_locked( + ipc_port_t port) +{ + struct turnstile *send_turnstile = port_send_turnstile(port); + if (send_turnstile) { + turnstile_reference(send_turnstile); + ipc_port_send_update_inheritor(port, send_turnstile, + TURNSTILE_IMMEDIATE_UPDATE); + } + imq_unlock(&port->ip_messages); + ip_unlock(port); + + if (send_turnstile) { + turnstile_update_inheritor_complete(send_turnstile, + TURNSTILE_INTERLOCK_NOT_HELD); + turnstile_deallocate_safe(send_turnstile); + } +} + +/* + * Routine: ipc_port_get_watchport_inheritor + * Purpose: + * Returns inheritor for watchport. + * + * Conditions: + * mqueue locked. + * Returns: + * watchport inheritor. + */ +static thread_t +ipc_port_get_watchport_inheritor( + ipc_port_t port) +{ + imq_held(&port->ip_messages); + return port->ip_watchport_elem->twe_task->watchports->tw_thread; } /* @@ -1951,51 +2393,6 @@ ipc_port_importance_delta( } #endif /* IMPORTANCE_INHERITANCE */ -/* - * Routine: ipc_port_lookup_notify - * Purpose: - * Make a send-once notify port from a receive right. - * Returns IP_NULL if name doesn't denote a receive right. - * Conditions: - * The space must be locked (read or write) and active. - * Being the active space, we can rely on thread server_id - * context to give us the proper server level sub-order - * within the space. - */ - -ipc_port_t -ipc_port_lookup_notify( - ipc_space_t space, - mach_port_name_t name) -{ - ipc_port_t port; - ipc_entry_t entry; - - assert(is_active(space)); - - entry = ipc_entry_lookup(space, name); - if (entry == IE_NULL) { - return IP_NULL; - } - if ((entry->ie_bits & MACH_PORT_TYPE_RECEIVE) == 0) { - return IP_NULL; - } - - __IGNORE_WCASTALIGN(port = (ipc_port_t) entry->ie_object); - assert(port != IP_NULL); - - ip_lock(port); - assert(ip_active(port)); - assert(port->ip_receiver_name == name); - assert(port->ip_receiver == space); - - ip_reference(port); - port->ip_sorights++; - ip_unlock(port); - - return port; -} - /* * Routine: ipc_port_make_send_locked * Purpose: @@ -2008,7 +2405,7 @@ ipc_port_t ipc_port_make_send_locked( ipc_port_t port) { - assert(ip_active(port)); + require_ip_active(port); port->ip_mscount++; port->ip_srights++; ip_reference(port); @@ -2031,9 +2428,7 @@ ipc_port_make_send( ip_lock(port); if (ip_active(port)) { - port->ip_mscount++; - port->ip_srights++; - ip_reference(port); + ipc_port_make_send_locked(port); ip_unlock(port); return port; } @@ -2041,6 +2436,22 @@ ipc_port_make_send( return IP_DEAD; } +/* + * Routine: ipc_port_copy_send_locked + * Purpose: + * Make a naked send right from another naked send right. + * Conditions: + * port locked and active. + */ +void +ipc_port_copy_send_locked( + ipc_port_t port) +{ + assert(port->ip_srights > 0); + port->ip_srights++; + ip_reference(port); +} + /* * Routine: ipc_port_copy_send * Purpose: @@ -2065,10 +2476,7 @@ ipc_port_copy_send( ip_lock(port); if (ip_active(port)) { - assert(port->ip_srights > 0); - - ip_reference(port); - port->ip_srights++; + ipc_port_copy_send_locked(port); sright = port; } else { sright = IP_DEAD; @@ -2097,44 +2505,8 @@ ipc_port_copyout_send( if (IP_VALID(sright)) { kern_return_t kr; - kr = ipc_object_copyout(space, (ipc_object_t) sright, - MACH_MSG_TYPE_PORT_SEND, TRUE, &name); - if (kr != KERN_SUCCESS) { - ipc_port_release_send(sright); - - if (kr == KERN_INVALID_CAPABILITY) { - name = MACH_PORT_DEAD; - } else { - name = MACH_PORT_NULL; - } - } - } else { - name = CAST_MACH_PORT_TO_NAME(sright); - } - - return name; -} - -/* - * Routine: ipc_port_copyout_name_send - * Purpose: - * Copyout a naked send right (possibly null/dead) to given name, - * or if that fails, destroy the right. - * Conditions: - * Nothing locked. - */ - -mach_port_name_t -ipc_port_copyout_name_send( - ipc_port_t sright, - ipc_space_t space, - mach_port_name_t name) -{ - if (IP_VALID(sright)) { - kern_return_t kr; - - kr = ipc_object_copyout_name(space, (ipc_object_t) sright, - MACH_MSG_TYPE_PORT_SEND, TRUE, name); + kr = ipc_object_copyout(space, ip_to_object(sright), + MACH_MSG_TYPE_PORT_SEND, NULL, NULL, &name); if (kr != KERN_SUCCESS) { ipc_port_release_send(sright); @@ -2212,7 +2584,7 @@ ipc_port_t ipc_port_make_sonce_locked( ipc_port_t port) { - assert(ip_active(port)); + require_ip_active(port); port->ip_sorights++; ip_reference(port); return port; @@ -2236,8 +2608,7 @@ ipc_port_make_sonce( ip_lock(port); if (ip_active(port)) { - port->ip_sorights++; - ip_reference(port); + ipc_port_make_sonce_locked(port); ip_unlock(port); return port; } @@ -2267,7 +2638,7 @@ ipc_port_release_sonce( return; } - ipc_port_adjust_special_reply_port(port, IPC_PORT_ADJUST_SR_NONE, FALSE); + ipc_port_adjust_special_reply_port(port, IPC_PORT_ADJUST_RESET_BOOSTRAP_CHECKIN, FALSE); ip_lock(port); @@ -2302,7 +2673,7 @@ ipc_port_release_receive( } ip_lock(port); - assert(ip_active(port)); + require_ip_active(port); assert(port->ip_receiver_name == MACH_PORT_NULL); dest = port->ip_destination; @@ -2330,7 +2701,7 @@ ipc_port_alloc_special( { ipc_port_t port; - __IGNORE_WCASTALIGN(port = (ipc_port_t) io_alloc(IOT_PORT)); + port = ip_object_to_port(io_alloc(IOT_PORT)); if (port == IP_NULL) { return IP_NULL; } @@ -2341,7 +2712,7 @@ ipc_port_alloc_special( #endif /* MACH_ASSERT */ bzero((char *)port, sizeof(*port)); - io_lock_init(&port->ip_object); + io_lock_init(ip_to_object(port)); port->ip_references = 1; port->ip_object.io_bits = io_makebits(TRUE, IOT_PORT, 0); @@ -2369,7 +2740,7 @@ ipc_port_dealloc_special( __assert_only ipc_space_t space) { ip_lock(port); - assert(ip_active(port)); + require_ip_active(port); // assert(port->ip_receiver_name != MACH_PORT_NULL); assert(port->ip_receiver == space); @@ -2384,7 +2755,7 @@ ipc_port_dealloc_special( imq_unlock(&port->ip_messages); /* relevant part of ipc_port_clear_receiver */ - ipc_port_set_mscount(port, 0); + port->ip_mscount = 0; port->ip_messages.imq_seqno = 0; ipc_port_destroy(port); @@ -2447,7 +2818,7 @@ kdp_mqueue_send_find_owner(struct waitq * waitq, __assert_only event64_t event, assert(waitq_is_turnstile_queue(waitq)); turnstile = waitq_to_turnstile(waitq); - ipc_port_t port = (ipc_port_t)turnstile->ts_proprietor; /* we are blocking on send */ + ipc_port_t port = (ipc_port_t)turnstile->ts_proprietor; /* we are blocking on send */ assert(kdp_is_in_zone(port, "ipc ports")); waitinfo->owner = 0; diff --git a/osfmk/ipc/ipc_port.h b/osfmk/ipc/ipc_port.h index 971f77821..3fccd2460 100644 --- a/osfmk/ipc/ipc_port.h +++ b/osfmk/ipc/ipc_port.h @@ -139,9 +139,10 @@ struct ipc_port { union { struct ipc_kmsg *premsg; struct turnstile *send_turnstile; - SLIST_ENTRY(ipc_port) dealloc_elm; } kdata2; + struct task_watchport_elem *ip_watchport_elem; + mach_vm_address_t ip_context; natural_t ip_sprequests:1, /* send-possible requests outstanding */ @@ -151,8 +152,12 @@ struct ipc_port { ip_guarded:1, /* port guarded (use context value as guard) */ ip_strict_guard:1, /* Strict guarding; Prevents user manipulation of context values directly */ ip_specialreply:1, /* port is a special reply port */ - ip_sync_link_state:3, /* link the special reply port to destination port/ Workloop */ - ip_impcount:22; /* number of importance donations in nested queue */ + ip_sync_link_state:3, /* link the port to destination port/ Workloop */ + ip_sync_bootstrap_checkin:1,/* port part of sync bootstrap checkin, push on thread doing the checkin */ + ip_immovable_receive:1, /* the receive right cannot be moved out of a space, until it is destroyed */ + ip_no_grant:1, /* Port wont accept complex messages containing (ool) port descriptors */ + ip_immovable_send:1, /* No send(once) rights to this port can be moved out of a space */ + ip_impcount:18; /* number of importance donations in nested queue */ mach_port_mscount_t ip_mscount; mach_port_rights_t ip_srights; @@ -175,10 +180,10 @@ struct ipc_port { #define ip_references ip_object.io_references -#define ip_bits ip_object.io_bits #define ip_receiver_name ip_messages.imq_receiver_name #define ip_in_pset ip_messages.imq_in_pset +#define ip_reply_context ip_messages.imq_context #define ip_receiver data.receiver #define ip_destination data.destination @@ -192,7 +197,6 @@ struct ipc_port { #define ip_premsg kdata2.premsg #define ip_send_turnstile kdata2.send_turnstile -#define ip_dealloc_elm kdata2.dealloc_elm #define port_send_turnstile(port) (IP_PREALLOC(port) ? (port)->ip_premsg->ikm_turnstile : (port)->ip_send_turnstile) @@ -208,11 +212,12 @@ MACRO_END #define port_send_turnstile_address(port) \ (IP_PREALLOC(port) ? &((port)->ip_premsg->ikm_turnstile) : &((port)->ip_send_turnstile)) -#define port_rcv_turnstile_address(port) (NULL) +#define port_rcv_turnstile_address(port) \ + (NULL) /* - * SYNC IPC state flags for special reply port. + * SYNC IPC state flags for special reply port/ rcv right. * * PORT_SYNC_LINK_ANY * Special reply port is not linked to any other port @@ -237,26 +242,34 @@ MACRO_END * Message sent to special reply port, do * not allow any linkages till receive is * complete. + * + * PORT_SYNC_LINK_RCV_THREAD + * Receive right copied out as a part of bootstrap check in, + * push on the thread which copied out the port. */ #define PORT_SYNC_LINK_ANY (0) #define PORT_SYNC_LINK_PORT (0x1) #define PORT_SYNC_LINK_WORKLOOP_KNOTE (0x2) #define PORT_SYNC_LINK_WORKLOOP_STASH (0x3) #define PORT_SYNC_LINK_NO_LINKAGE (0x4) +#define PORT_SYNC_LINK_RCV_THREAD (0x5) #define IP_NULL IPC_PORT_NULL #define IP_DEAD IPC_PORT_DEAD #define IP_VALID(port) IPC_PORT_VALID(port) -#define ip_active(port) io_active(&(port)->ip_object) -#define ip_lock_init(port) io_lock_init(&(port)->ip_object) -#define ip_lock(port) io_lock(&(port)->ip_object) -#define ip_lock_try(port) io_lock_try(&(port)->ip_object) -#define ip_lock_held_kdp(port) io_lock_held_kdp(&(port)->ip_object) -#define ip_unlock(port) io_unlock(&(port)->ip_object) +#define ip_object_to_port(io) __container_of(io, struct ipc_port, ip_object) +#define ip_to_object(port) (&(port)->ip_object) +#define ip_active(port) io_active(ip_to_object(port)) +#define ip_lock_init(port) io_lock_init(ip_to_object(port)) +#define ip_lock_held(port) io_lock_held(ip_to_object(port)) +#define ip_lock(port) io_lock(ip_to_object(port)) +#define ip_lock_try(port) io_lock_try(ip_to_object(port)) +#define ip_lock_held_kdp(port) io_lock_held_kdp(ip_to_object(port)) +#define ip_unlock(port) io_unlock(ip_to_object(port)) -#define ip_reference(port) io_reference(&(port)->ip_object) -#define ip_release(port) io_release(&(port)->ip_object) +#define ip_reference(port) io_reference(ip_to_object(port)) +#define ip_release(port) io_release(ip_to_object(port)) /* get an ipc_port pointer from an ipc_mqueue pointer */ #define ip_from_mq(mq) \ @@ -265,7 +278,8 @@ MACRO_END #define ip_reference_mq(mq) ip_reference(ip_from_mq(mq)) #define ip_release_mq(mq) ip_release(ip_from_mq(mq)) -#define ip_kotype(port) io_kotype(&(port)->ip_object) +#define ip_kotype(port) io_kotype(ip_to_object(port)) +#define ip_is_kobject(port) io_is_kobject(ip_to_object(port)) #define ip_full_kernel(port) imq_full_kernel(&(port)->ip_messages) #define ip_full(port) imq_full(&(port)->ip_messages) @@ -279,18 +293,18 @@ MACRO_END * therefore cannot be blocked waiting for memory themselves). */ #define IP_BIT_PREALLOC 0x00008000 /* preallocated mesg */ -#define IP_PREALLOC(port) ((port)->ip_bits & IP_BIT_PREALLOC) +#define IP_PREALLOC(port) ((port)->ip_object.io_bits & IP_BIT_PREALLOC) #define IP_SET_PREALLOC(port, kmsg) \ MACRO_BEGIN \ - (port)->ip_bits |= IP_BIT_PREALLOC; \ + (port)->ip_object.io_bits |= IP_BIT_PREALLOC; \ (port)->ip_premsg = (kmsg); \ MACRO_END #define IP_CLEAR_PREALLOC(port, kmsg) \ MACRO_BEGIN \ assert((port)->ip_premsg == kmsg); \ - (port)->ip_bits &= ~IP_BIT_PREALLOC; \ + (port)->ip_object.io_bits &= ~IP_BIT_PREALLOC; \ (port)->ip_premsg = IKM_NULL; \ MACRO_END @@ -364,15 +378,34 @@ extern ipc_port_timestamp_t ipc_port_timestamp(void); #define IP_TIMESTAMP_ORDER(one, two) ((int) ((one) - (two)) < 0) +static inline void +require_ip_active(ipc_port_t port) +{ + if (!ip_active(port)) { + panic("Using inactive port %p", port); + } +} + +static inline kern_return_t +ipc_port_translate( + ipc_space_t space, + mach_port_name_t name, + mach_port_right_t right, + ipc_port_t *portp) +{ + ipc_object_t object; + kern_return_t kr; + + kr = ipc_object_translate(space, name, right, &object); + *portp = (kr == KERN_SUCCESS) ? ip_object_to_port(object) : IP_NULL; + return kr; +} + #define ipc_port_translate_receive(space, name, portp) \ - ipc_object_translate((space), (name), \ - MACH_PORT_RIGHT_RECEIVE, \ - (ipc_object_t *) (portp)) + ipc_port_translate((space), (name), MACH_PORT_RIGHT_RECEIVE, portp) #define ipc_port_translate_send(space, name, portp) \ - ipc_object_translate((space), (name), \ - MACH_PORT_RIGHT_SEND, \ - (ipc_object_t *) (portp)) + ipc_port_translate((space), (name), MACH_PORT_RIGHT_SEND, portp) /* Allocate a notification request slot */ #if IMPORTANCE_INHERITANCE @@ -421,23 +454,6 @@ extern boolean_t ipc_port_request_sparm( mach_msg_option_t option, mach_msg_priority_t override); -/* Macros for manipulating a port's dead name notificaiton requests */ -#define ipc_port_request_rename(port, index, oname, nname) \ -MACRO_BEGIN \ - ipc_port_request_t ipr, table; \ - \ - assert(ip_active(port)); \ - \ - table = port->ip_requests; \ - assert(table != IPR_NULL); \ - \ - ipr = &table[index]; \ - assert(ipr->ipr_name == oname); \ - \ - ipr->ipr_name = nname; \ -MACRO_END - - /* Make a port-deleted request */ extern void ipc_port_pdrequest( ipc_port_t port, @@ -451,13 +467,6 @@ extern void ipc_port_nsrequest( ipc_port_t notify, ipc_port_t *previousp); -#define ipc_port_set_mscount(port, mscount) \ -MACRO_BEGIN \ - assert(ip_active(port)); \ - \ - (port)->ip_mscount = (mscount); \ -MACRO_END - /* Prepare a receive right for transmission/destruction */ extern boolean_t ipc_port_clear_receiver( ipc_port_t port, @@ -472,6 +481,7 @@ extern void ipc_port_init( /* Allocate a port */ extern kern_return_t ipc_port_alloc( ipc_space_t space, + bool make_send_right, mach_port_name_t *namep, ipc_port_t *portp); @@ -511,35 +521,40 @@ enum { void ipc_port_link_special_reply_port( ipc_port_t special_reply_port, - ipc_port_t dest_port); + ipc_port_t dest_port, + boolean_t sync_bootstrap_checkin); #define IPC_PORT_ADJUST_SR_NONE 0 -#define IPC_PORT_ADJUST_SR_CLEAR_SPECIAL_REPLY 0x1 -#define IPC_PORT_ADJUST_SR_ALLOW_SYNC_LINKAGE 0x2 -#define IPC_PORT_ADJUST_SR_LINK_WORKLOOP 0x4 - +#define IPC_PORT_ADJUST_SR_ALLOW_SYNC_LINKAGE 0x1 +#define IPC_PORT_ADJUST_SR_LINK_WORKLOOP 0x2 +#define IPC_PORT_ADJUST_UNLINK_THREAD 0x4 #define IPC_PORT_ADJUST_SR_RECEIVED_MSG 0x8 #define IPC_PORT_ADJUST_SR_ENABLE_EVENT 0x10 +#define IPC_PORT_ADJUST_RESET_BOOSTRAP_CHECKIN 0x20 void -reset_ip_srp_bits(ipc_port_t special_reply_port); - -void -reset_ip_srp_msg_sent(ipc_port_t special_reply_port); +ipc_special_reply_port_bits_reset(ipc_port_t special_reply_port); void -set_ip_srp_msg_sent(ipc_port_t special_reply_port); +ipc_special_reply_port_msg_sent(ipc_port_t special_reply_port); void -set_ip_srp_lost_link(ipc_port_t special_reply_port); +ipc_special_reply_port_msg_sent(ipc_port_t special_reply_port); /* Adjust special reply port linkage */ -void ipc_port_adjust_special_reply_port_locked( +void +ipc_port_adjust_special_reply_port_locked( ipc_port_t special_reply_port, struct knote *kn, uint8_t flags, boolean_t get_turnstile); +void +ipc_port_adjust_sync_link_state_locked( + ipc_port_t port, + int sync_link_state, + turnstile_inheritor_t inheritor); + /* Adjust special reply port linkage */ void ipc_port_adjust_special_reply_port( @@ -547,9 +562,36 @@ ipc_port_adjust_special_reply_port( uint8_t flags, boolean_t get_turnstile); -turnstile_inheritor_t -ipc_port_get_special_reply_port_inheritor( - ipc_port_t special_reply_port); +void +ipc_port_adjust_port_locked( + ipc_port_t port, + struct knote *kn, + boolean_t sync_bootstrap_checkin); + +void +ipc_port_clear_sync_rcv_thread_boost_locked( + ipc_port_t port); + +kern_return_t +ipc_port_add_watchport_elem_locked( + ipc_port_t port, + struct task_watchport_elem *watchport_elem, + struct task_watchport_elem **old_elem); + +kern_return_t +ipc_port_clear_watchport_elem_internal_conditional_locked( + ipc_port_t port, + struct task_watchport_elem *watchport_elem); + +kern_return_t +ipc_port_replace_watchport_elem_conditional_locked( + ipc_port_t port, + struct task_watchport_elem *old_watchport_elem, + struct task_watchport_elem *new_watchport_elem); + +struct task_watchport_elem * +ipc_port_clear_watchport_elem_internal( + ipc_port_t port); void ipc_port_send_turnstile_prepare(ipc_port_t port); @@ -560,9 +602,6 @@ ipc_port_send_turnstile_complete(ipc_port_t port); struct waitq * ipc_port_rcv_turnstile_waitq(struct waitq *waitq); -struct turnstile * -ipc_port_rcv_turnstile(ipc_port_t port); - /* apply importance delta to port only */ extern mach_port_delta_t ipc_port_impcount_delta( @@ -586,11 +625,6 @@ ipc_port_importance_delta( mach_port_delta_t delta); #endif /* IMPORTANCE_INHERITANCE */ -/* Make a send-once notify port from a receive right */ -extern ipc_port_t ipc_port_lookup_notify( - ipc_space_t space, - mach_port_name_t name); - /* Make a naked send right from a receive right - port locked and active */ extern ipc_port_t ipc_port_make_send_locked( ipc_port_t port); @@ -599,6 +633,10 @@ extern ipc_port_t ipc_port_make_send_locked( extern ipc_port_t ipc_port_make_send( ipc_port_t port); +/* Make a naked send right from another naked send right - port locked and active */ +extern void ipc_port_copy_send_locked( + ipc_port_t port); + /* Make a naked send right from another naked send right */ extern ipc_port_t ipc_port_copy_send( ipc_port_t port); @@ -608,12 +646,6 @@ extern mach_port_name_t ipc_port_copyout_send( ipc_port_t sright, ipc_space_t space); -/* Copyout a naked send right to given name */ -extern mach_port_name_t ipc_port_copyout_name_send( - ipc_port_t sright, - ipc_space_t space, - mach_port_name_t name); - #endif /* MACH_KERNEL_PRIVATE */ #if KERNEL_PRIVATE @@ -670,8 +702,13 @@ extern void ipc_port_track_dealloc( extern void ipc_port_debug_init(void); #endif /* MACH_ASSERT */ -extern struct turnstile *ipc_port_get_inheritor( - ipc_port_t port); +extern void ipc_port_recv_update_inheritor(ipc_port_t port, + struct turnstile *turnstile, + turnstile_update_flags_t flags); + +extern void ipc_port_send_update_inheritor(ipc_port_t port, + struct turnstile *turnstile, + turnstile_update_flags_t flags); #define ipc_port_alloc_kernel() \ ipc_port_alloc_special(ipc_space_kernel) diff --git a/osfmk/ipc/ipc_pset.c b/osfmk/ipc/ipc_pset.c index c14e98a79..523c49660 100644 --- a/osfmk/ipc/ipc_pset.c +++ b/osfmk/ipc/ipc_pset.c @@ -175,14 +175,14 @@ ipc_pset_alloc_special( assert(space->is_table == IE_NULL); assert(!is_active(space)); - __IGNORE_WCASTALIGN(pset = (ipc_pset_t)io_alloc(IOT_PORT_SET)); + pset = ips_object_to_pset(io_alloc(IOT_PORT_SET)); if (pset == IPS_NULL) { return IPS_NULL; } bzero((char *)pset, sizeof(*pset)); - io_lock_init(&pset->ips_object); + io_lock_init(ips_to_object(pset)); pset->ips_references = 1; pset->ips_object.io_bits = io_makebits(TRUE, IOT_PORT_SET, 0); @@ -205,7 +205,7 @@ ipc_pset_member( ipc_pset_t pset, ipc_port_t port) { - assert(ip_active(port)); + require_ip_active(port); return ipc_mqueue_member(&port->ip_messages, &pset->ips_messages); } @@ -230,7 +230,7 @@ ipc_pset_add( kern_return_t kr; assert(ips_active(pset)); - assert(ip_active(port)); + require_ip_active(port); kr = ipc_mqueue_add(&port->ip_messages, &pset->ips_messages, reserved_link, reserved_prepost); @@ -256,8 +256,7 @@ ipc_pset_remove( ipc_port_t port) { kern_return_t kr; - - assert(ip_active(port)); + require_ip_active(port); if (port->ip_in_pset == 0) { return KERN_NOT_IN_SET; @@ -299,7 +298,7 @@ ipc_pset_lazy_allocate( } psobj = entry->ie_object; - __IGNORE_WCASTALIGN(pset = (ipc_pset_t) psobj); + pset = ips_object_to_pset(psobj); assert(pset != NULL); ipc_mqueue_t set_mqueue = &pset->ips_messages; struct waitq_set *wqset = &set_mqueue->imq_set_queue; @@ -384,7 +383,7 @@ ipc_pset_destroy( /* * Kqueue EVFILT_MACHPORT support * - * - kn_ptr.p_mqueue points to the monitored mqueue + * - kn_mqueue points to the monitored mqueue * * - (in/out) ext[0] holds a mach_vm_address_t to a userspace buffer * that can be used to direct-deliver messages when @@ -422,12 +421,18 @@ filt_machport_adjust_qos(struct knote *kn, ipc_kmsg_t first) } struct turnstile * -filt_machport_kqueue_turnstile(struct knote *kn) +filt_ipc_kqueue_turnstile(struct knote *kn) { - if ((kn->kn_sfflags & MACH_RCV_MSG) && (kn->kn_status & KN_DISPATCH)) { - return kqueue_turnstile(knote_get_kq(kn)); - } - return TURNSTILE_NULL; + assert(kn->kn_filter == EVFILT_MACHPORT || kn->kn_filter == EVFILT_WORKLOOP); + return kqueue_turnstile(knote_get_kq(kn)); +} + +bool +filt_machport_kqueue_has_turnstile(struct knote *kn) +{ + assert(kn->kn_filter == EVFILT_MACHPORT); + return ((kn->kn_sfflags & MACH_RCV_MSG) || (kn->kn_sfflags & MACH_RCV_SYNC_PEEK)) + && (kn->kn_flags & EV_DISPATCH); } /* @@ -444,15 +449,24 @@ filt_machport_kqueue_turnstile(struct knote *kn) struct turnstile * filt_machport_stash_port(struct knote *kn, ipc_port_t port, int *link) { - struct turnstile *ts = filt_machport_kqueue_turnstile(kn); + struct turnstile *ts = TURNSTILE_NULL; - if (!ts) { + if (kn->kn_filter == EVFILT_WORKLOOP) { + assert(kn->kn_mqueue == NULL); + kn->kn_mqueue = &port->ip_messages; + ip_reference(port); + if (link) { + *link = PORT_SYNC_LINK_WORKLOOP_KNOTE; + } + ts = filt_ipc_kqueue_turnstile(kn); + } else if (!filt_machport_kqueue_has_turnstile(kn)) { if (link) { *link = PORT_SYNC_LINK_NO_LINKAGE; } } else if (kn->kn_ext[3] == 0) { ip_reference(port); kn->kn_ext[3] = (uintptr_t)port; + ts = filt_ipc_kqueue_turnstile(kn); if (link) { *link = PORT_SYNC_LINK_WORKLOOP_KNOTE; } @@ -466,19 +480,6 @@ filt_machport_stash_port(struct knote *kn, ipc_port_t port, int *link) return ts; } -struct turnstile * -filt_machport_stashed_special_reply_port_turnstile(ipc_port_t port) -{ - struct knote *kn = port->ip_sync_inheritor_knote; - - assert(port->ip_specialreply); - assert(port->ip_sync_link_state == PORT_SYNC_LINK_WORKLOOP_KNOTE); - if (kn->kn_ext[3] == (uint64_t)port) { - return kqueue_turnstile(knote_get_kq(kn)); - } - return kn->kn_hook; -} - /* * Lazily prepare a turnstile so that filt_machport_stash_port() * can be called with the mqueue lock held. @@ -500,11 +501,15 @@ filt_machport_turnstile_prepare_lazily( /* This is called from within filt_machportprocess */ assert((kn->kn_status & KN_SUPPRESSED) && (kn->kn_status & KN_LOCKED)); - struct turnstile *ts = filt_machport_kqueue_turnstile(kn); - if (ts == TURNSTILE_NULL || kn->kn_ext[3] == 0 || kn->kn_hook) { + if (!filt_machport_kqueue_has_turnstile(kn)) { + return; + } + + if (kn->kn_ext[3] == 0 || kn->kn_hook) { return; } + struct turnstile *ts = filt_ipc_kqueue_turnstile(kn); if ((msgt_name == MACH_MSG_TYPE_PORT_SEND_ONCE && port->ip_specialreply) || (msgt_name == MACH_MSG_TYPE_PORT_RECEIVE)) { struct turnstile *kn_ts = turnstile_alloc(); @@ -516,6 +521,67 @@ filt_machport_turnstile_prepare_lazily( } } +static void +filt_machport_turnstile_complete_port(struct knote *kn, ipc_port_t port, + ipc_mqueue_t mqueue) +{ + struct turnstile *ts = TURNSTILE_NULL; + + ip_lock(port); + if (port->ip_specialreply) { + /* + * If the reply has been sent to the special reply port already, + * then the special reply port may already be reused to do something + * entirely different. + * + * However, the only reason for it to still point to this knote is + * that it's still waiting for a reply, so when this is the case, + * neuter the linkage. + */ + if (port->ip_sync_link_state == PORT_SYNC_LINK_WORKLOOP_KNOTE && + port->ip_sync_inheritor_knote == kn) { + ipc_port_adjust_special_reply_port_locked(port, NULL, + (IPC_PORT_ADJUST_SR_NONE | IPC_PORT_ADJUST_SR_ENABLE_EVENT), FALSE); + } else { + ip_unlock(port); + } + } else { + /* + * For receive rights, if their IMQ_KNOTE() is still this + * knote, then sever the link. + */ + imq_lock(mqueue); + if (port->ip_sync_link_state == PORT_SYNC_LINK_WORKLOOP_KNOTE && + mqueue->imq_inheritor_knote == kn) { + ipc_port_adjust_sync_link_state_locked(port, PORT_SYNC_LINK_ANY, NULL); + ts = port_send_turnstile(port); + } + if (ts) { + turnstile_reference(ts); + turnstile_update_inheritor(ts, TURNSTILE_INHERITOR_NULL, + TURNSTILE_IMMEDIATE_UPDATE); + } + imq_unlock(mqueue); + ip_unlock(port); + + if (ts) { + turnstile_update_inheritor_complete(ts, + TURNSTILE_INTERLOCK_NOT_HELD); + turnstile_deallocate(ts); + } + } + + ip_release(port); +} + +void +filt_wldetach_sync_ipc(struct knote *kn) +{ + ipc_mqueue_t mqueue = kn->kn_mqueue; + filt_machport_turnstile_complete_port(kn, ip_from_mq(mqueue), mqueue); + kn->kn_mqueue = NULL; +} + /* * Other half of filt_machport_turnstile_prepare_lazily() * @@ -524,75 +590,20 @@ filt_machport_turnstile_prepare_lazily( static void filt_machport_turnstile_complete(struct knote *kn) { - struct turnstile *ts = TURNSTILE_NULL; - if (kn->kn_ext[3]) { ipc_port_t port = (ipc_port_t)kn->kn_ext[3]; - ipc_mqueue_t mqueue = &port->ip_messages; - - ip_lock(port); - if (port->ip_specialreply) { - /* - * If the reply has been sent to the special reply port already, - * then the special reply port may already be reused to do something - * entirely different. - * - * However, the only reason for it to still point to this knote is - * that it's still waiting for a reply, so when this is the case, - * neuter the linkage. - */ - if (port->ip_sync_link_state == PORT_SYNC_LINK_WORKLOOP_KNOTE && - port->ip_sync_inheritor_knote == kn) { - ipc_port_adjust_special_reply_port_locked(port, NULL, - (IPC_PORT_ADJUST_SR_NONE | IPC_PORT_ADJUST_SR_ENABLE_EVENT), FALSE); - } else { - ip_unlock(port); - } - } else { - struct turnstile *kq_ts = kqueue_turnstile(knote_get_kq(kn)); - - /* - * For receive rights, if their IMQ_INHERITOR() is still this - * workloop, then sever the link. - * - * It has a theoretical hole: if the port is sent again to a new - * receive right that is also monitored by the same kqueue, - * we would sever the link incorrectly. - * - * However this would be a REALLY cumbersome thing to do. - */ - imq_lock(mqueue); - if (!IMQ_KLIST_VALID(mqueue) && IMQ_INHERITOR(mqueue) == kq_ts) { - turnstile_deallocate_safe(kq_ts); - klist_init(&mqueue->imq_klist); - ts = port_send_turnstile(port); - } - if (ts) { - turnstile_update_inheritor(ts, TURNSTILE_INHERITOR_NULL, - TURNSTILE_IMMEDIATE_UPDATE); - turnstile_reference(ts); - } - imq_unlock(mqueue); - ip_unlock(port); - - if (ts) { - turnstile_update_inheritor_complete(ts, TURNSTILE_INTERLOCK_NOT_HELD); - turnstile_deallocate(ts); - } - } - - ip_release(port); + filt_machport_turnstile_complete_port(kn, port, &port->ip_messages); kn->kn_ext[3] = 0; } if (kn->kn_hook) { - ts = kn->kn_hook; + struct turnstile *ts = kn->kn_hook; turnstile_update_inheritor(ts, TURNSTILE_INHERITOR_NULL, TURNSTILE_IMMEDIATE_UPDATE); turnstile_update_inheritor_complete(ts, TURNSTILE_INTERLOCK_HELD); - turnstile_complete((uintptr_t)kn, (struct turnstile **)&kn->kn_hook, &ts); + turnstile_complete((uintptr_t)kn, (struct turnstile **)&kn->kn_hook, &ts, TURNSTILE_KNOTE); turnstile_cleanup(); assert(ts); @@ -600,16 +611,105 @@ filt_machport_turnstile_complete(struct knote *kn) } } +static void +filt_machport_link(ipc_mqueue_t mqueue, struct knote *kn) +{ + struct knote *hd = SLIST_FIRST(&mqueue->imq_klist); + + if (hd && filt_machport_kqueue_has_turnstile(kn)) { + SLIST_INSERT_AFTER(hd, kn, kn_selnext); + } else { + SLIST_INSERT_HEAD(&mqueue->imq_klist, kn, kn_selnext); + } +} + +static void +filt_machport_unlink(ipc_mqueue_t mqueue, struct knote *kn) +{ + struct knote **knprev; + + KNOTE_DETACH(&mqueue->imq_klist, kn); + + /* make sure the first knote is a knote we can push on */ + SLIST_FOREACH_PREVPTR(kn, knprev, &mqueue->imq_klist, kn_selnext) { + if (filt_machport_kqueue_has_turnstile(kn)) { + *knprev = SLIST_NEXT(kn, kn_selnext); + SLIST_INSERT_HEAD(&mqueue->imq_klist, kn, kn_selnext); + break; + } + } +} + +int +filt_wlattach_sync_ipc(struct knote *kn) +{ + mach_port_name_t name = (mach_port_name_t)kn->kn_id; + ipc_space_t space = current_space(); + ipc_entry_t entry; + ipc_port_t port = IP_NULL; + int error = 0; + + if (ipc_right_lookup_read(space, name, &entry) != KERN_SUCCESS) { + return ENOENT; + } + + /* space is read-locked */ + + if (entry->ie_bits & MACH_PORT_TYPE_RECEIVE) { + port = ip_object_to_port(entry->ie_object); + if (port->ip_specialreply) { + error = ENOENT; + } + } else if (entry->ie_bits & MACH_PORT_TYPE_SEND_ONCE) { + port = ip_object_to_port(entry->ie_object); + if (!port->ip_specialreply) { + error = ENOENT; + } + } else { + error = ENOENT; + } + if (error) { + is_read_unlock(space); + return error; + } + + ip_lock(port); + is_read_unlock(space); + + if (port->ip_sync_link_state == PORT_SYNC_LINK_ANY) { + ip_unlock(port); + /* + * We cannot start a sync IPC inheritance chain, only further one + * Note: this can also happen if the inheritance chain broke + * because the original requestor died. + */ + return ENOENT; + } + + if (port->ip_specialreply) { + ipc_port_adjust_special_reply_port_locked(port, kn, + IPC_PORT_ADJUST_SR_LINK_WORKLOOP, FALSE); + } else { + ipc_port_adjust_port_locked(port, kn, FALSE); + } + + /* make sure the port was stashed */ + assert(kn->kn_mqueue == &port->ip_messages); + + /* port has been unlocked by ipc_port_adjust_* */ + + return 0; +} + static int filt_machportattach( struct knote *kn, - __unused struct kevent_internal_s *kev) + __unused struct kevent_qos_s *kev) { - mach_port_name_t name = (mach_port_name_t)kn->kn_kevent.ident; + mach_port_name_t name = (mach_port_name_t)kn->kn_id; uint64_t wq_link_id = waitq_link_reserve(NULL); ipc_space_t space = current_space(); ipc_kmsg_t first; - struct turnstile *turnstile = TURNSTILE_NULL; struct turnstile *send_turnstile = TURNSTILE_NULL; int error; @@ -621,132 +721,174 @@ filt_machportattach( kn->kn_flags &= ~EV_EOF; kn->kn_ext[3] = 0; - if ((kn->kn_sfflags & MACH_RCV_MSG) && (kn->kn_status & KN_DISPATCH)) { + if (filt_machport_kqueue_has_turnstile(kn)) { /* * If the filter is likely to support sync IPC override, * and it happens to be attaching to a workloop, * make sure the workloop has an allocated turnstile. */ - turnstile = kqueue_alloc_turnstile(knote_get_kq(kn)); + kqueue_alloc_turnstile(knote_get_kq(kn)); } +lookup_again: kr = ipc_right_lookup_read(space, name, &entry); -check_lookup: - if (kr == KERN_SUCCESS) { - /* space is read-locked and active */ - - if (entry->ie_bits & MACH_PORT_TYPE_PORT_SET) { - ipc_pset_t pset; - - if (knote_link_waitqset_should_lazy_alloc(kn)) { - is_read_unlock(space); - - /* - * We need to link the portset of the kn, - * to insure that the link is allocated before taking - * any spinlocks. - */ - knote_link_waitqset_lazy_alloc(kn); - - /* - * We had to drop the space lock because knote_link_waitqset_lazy_alloc() - * could have allocated memory. The ipc_right_lookup_read() - * function returns with the space locked, so we need to revalidate state. - */ - kr = ipc_right_lookup_read(space, name, &entry); - if (!(kr == KERN_SUCCESS) || !(entry->ie_bits & MACH_PORT_TYPE_PORT_SET)) { - goto check_lookup; - } - } + if (kr != KERN_SUCCESS) { + error = ENOENT; + goto out; + } - __IGNORE_WCASTALIGN(pset = (ipc_pset_t)entry->ie_object); - mqueue = &pset->ips_messages; - ips_reference(pset); + /* space is read-locked and active */ - imq_lock(mqueue); - kn->kn_ptr.p_mqueue = mqueue; + if ((entry->ie_bits & MACH_PORT_TYPE_PORT_SET) && + knote_link_waitqset_should_lazy_alloc(kn)) { + is_read_unlock(space); + /* + * We need to link the portset of the kn, + * to insure that the link is allocated before taking + * any spinlocks. + * + * Because we have to drop the space lock so that + * knote_link_waitqset_lazy_alloc() can allocate memory, + * we will need to redo the lookup. + */ + knote_link_waitqset_lazy_alloc(kn); + goto lookup_again; + } + + if (entry->ie_bits & MACH_PORT_TYPE_PORT_SET) { + ipc_pset_t pset; + + pset = ips_object_to_pset(entry->ie_object); + mqueue = &pset->ips_messages; + ips_reference(pset); + + imq_lock(mqueue); + kn->kn_mqueue = mqueue; + + /* + * Bind the portset wait queue directly to knote/kqueue. + * This allows us to just use wait_queue foo to effect a wakeup, + * rather than having to call knote() from the Mach code on each + * message. We still attach the knote to the mqueue klist for + * NOTE_REVOKE purposes only. + */ + error = knote_link_waitq(kn, &mqueue->imq_wait_queue, &wq_link_id); + if (!error) { + filt_machport_link(mqueue, kn); + imq_unlock(mqueue); + } else { + kn->kn_mqueue = IMQ_NULL; + imq_unlock(mqueue); + ips_release(pset); + } + + is_read_unlock(space); + + /* + * linked knotes are marked stay-active and therefore don't + * need an indication of their fired state to be returned + * from the attach operation. + */ + } else if (entry->ie_bits & MACH_PORT_TYPE_RECEIVE) { + ipc_port_t port = ip_object_to_port(entry->ie_object); + + if (port->ip_specialreply) { /* - * Bind the portset wait queue directly to knote/kqueue. - * This allows us to just use wait_queue foo to effect a wakeup, - * rather than having to call knote() from the Mach code on each - * message. We still attach the knote to the mqueue klist for - * NOTE_REVOKE purposes only. + * Registering for kevents on special reply ports + * isn't supported for two reasons: + * + * 1. it really makes very little sense for a port that + * is supposed to be used synchronously + * + * 2. their mqueue's imq_klist field will be used to + * store the receive turnstile, so we can't possibly + * attach them anyway. */ - error = knote_link_waitq(kn, &mqueue->imq_wait_queue, &wq_link_id); - if (!error) { - assert(IMQ_KLIST_VALID(mqueue)); - KNOTE_ATTACH(&mqueue->imq_klist, kn); - imq_unlock(mqueue); - } else { - kn->kn_ptr.p_mqueue = IMQ_NULL; - imq_unlock(mqueue); - ips_release(pset); - } - is_read_unlock(space); + error = ENOTSUP; + goto out; + } + + mqueue = &port->ip_messages; + ip_reference(port); + + /* + * attach knote to port and determine result + * If the filter requested direct message receipt, + * we may need to adjust the qos of the knote to + * reflect the requested and override qos of the + * first message in the queue. + */ + ip_lock(port); + imq_lock(mqueue); + kn->kn_mqueue = mqueue; + if (port->ip_sync_link_state != PORT_SYNC_LINK_ANY) { /* - * linked knotes are marked stay-active and therefore don't - * need an indication of their fired state to be returned - * from the attach operation. + * We're attaching a port that used to have an IMQ_KNOTE, + * clobber this state, we'll fixup its turnstile inheritor below. */ - } else if (entry->ie_bits & MACH_PORT_TYPE_RECEIVE) { - ipc_port_t port; + ipc_port_adjust_sync_link_state_locked(port, PORT_SYNC_LINK_ANY, NULL); + } + filt_machport_link(mqueue, kn); + + if ((first = ipc_kmsg_queue_first(&mqueue->imq_messages)) != IKM_NULL) { + result = FILTER_ACTIVE | filt_machport_adjust_qos(kn, first); + } - __IGNORE_WCASTALIGN(port = (ipc_port_t)entry->ie_object); - mqueue = &port->ip_messages; - ip_reference(port); + /* + * Update the port's turnstile inheritor + * + * Unlike filt_machportdetach(), we don't have to care about races for + * turnstile_workloop_pusher_info(): filt_machport_link() doesn't affect + * already pushing knotes, and if the current one becomes the new + * pusher, it'll only be visible when turnstile_workloop_pusher_info() + * returns. + */ + send_turnstile = port_send_turnstile(port); + if (send_turnstile) { + turnstile_reference(send_turnstile); + ipc_port_send_update_inheritor(port, send_turnstile, + TURNSTILE_IMMEDIATE_UPDATE); /* - * attach knote to port and determine result - * If the filter requested direct message receipt, - * we may need to adjust the qos of the knote to - * reflect the requested and override qos of the - * first message in the queue. + * rdar://problem/48861190 + * + * When a listener connection resumes a peer, + * updating the inheritor above has moved the push + * from the current thread to the workloop. + * + * However, we haven't told the workloop yet + * that it needs a thread request, and we risk + * to be preeempted as soon as we drop the space + * lock below. + * + * To avoid this disable preemption and let kevent + * reenable it after it takes the kqlock. */ - imq_lock(mqueue); - kn->kn_ptr.p_mqueue = mqueue; - if (!IMQ_KLIST_VALID(mqueue)) { - /* - * We're attaching a port that used to have an IMQ_INHERITOR, - * clobber this state, and set the inheritor of its turnstile - * to the kqueue it's now attached to. - */ - turnstile_deallocate_safe(IMQ_INHERITOR(mqueue)); - klist_init(&mqueue->imq_klist); - } - KNOTE_ATTACH(&mqueue->imq_klist, kn); - - /* Update the port's turnstile inheritor */ - send_turnstile = port_send_turnstile(port); - if (send_turnstile) { - turnstile_reference(send_turnstile); - turnstile_update_inheritor(send_turnstile, turnstile, - (TURNSTILE_INHERITOR_TURNSTILE | TURNSTILE_IMMEDIATE_UPDATE)); - } + disable_preemption(); + result |= FILTER_THREADREQ_NODEFEER; + } - if ((first = ipc_kmsg_queue_first(&mqueue->imq_messages)) != IKM_NULL) { - result = FILTER_ACTIVE | filt_machport_adjust_qos(kn, first); - } - imq_unlock(mqueue); - is_read_unlock(space); - if (send_turnstile) { - turnstile_update_inheritor_complete(send_turnstile, - TURNSTILE_INTERLOCK_NOT_HELD); - turnstile_deallocate(send_turnstile); - } + imq_unlock(mqueue); + ip_unlock(port); - error = 0; - } else { - is_read_unlock(space); - error = ENOTSUP; + is_read_unlock(space); + if (send_turnstile) { + turnstile_update_inheritor_complete(send_turnstile, + TURNSTILE_INTERLOCK_NOT_HELD); + turnstile_deallocate_safe(send_turnstile); } + + error = 0; } else { - error = ENOENT; + is_read_unlock(space); + error = ENOTSUP; } +out: waitq_link_release(wq_link_id); /* bail out on errors */ @@ -758,18 +900,17 @@ check_lookup: return result; } -/* NOT proud of these - we should have a stricter relationship between mqueue and ipc object */ -#define mqueue_to_pset(mq) ((ipc_pset_t)((uintptr_t)mq-offsetof(struct ipc_pset, ips_messages))) -#define mqueue_to_port(mq) ((ipc_port_t)((uintptr_t)mq-offsetof(struct ipc_port, ip_messages))) -#define mqueue_to_object(mq) (((ipc_object_t)(mq)) - 1) - +/* Validate imq_to_object implementation "works" */ +_Static_assert(offsetof(struct ipc_pset, ips_messages) == + offsetof(struct ipc_port, ip_messages), + "Make sure the mqueue aliases in both ports and psets"); static void filt_machportdetach( struct knote *kn) { - ipc_mqueue_t mqueue = kn->kn_ptr.p_mqueue; - ipc_object_t object = mqueue_to_object(mqueue); + ipc_mqueue_t mqueue = kn->kn_mqueue; + ipc_object_t object = imq_to_object(mqueue); struct turnstile *send_turnstile = TURNSTILE_NULL; filt_machport_turnstile_complete(kn); @@ -780,24 +921,36 @@ filt_machportdetach( * ipc_mqueue_changed() already unhooked this knote from the mqueue, */ } else { - assert(IMQ_KLIST_VALID(mqueue)); - KNOTE_DETACH(&mqueue->imq_klist, kn); - } + ipc_port_t port = IP_NULL; - if (io_otype(object) == IOT_PORT) { - ipc_port_t port = ip_from_mq(mqueue); + /* + * When the knote being detached is the first one in the list, + * then unlinking the knote *and* updating the turnstile inheritor + * need to happen atomically with respect to the callers of + * turnstile_workloop_pusher_info(). + * + * The caller of turnstile_workloop_pusher_info() will use the kq req + * lock (and hence the kqlock), so we just need to hold the kqlock too. + */ + if (io_otype(object) == IOT_PORT) { + port = ip_object_to_port(object); + assert(port->ip_sync_link_state == PORT_SYNC_LINK_ANY); + if (kn == SLIST_FIRST(&mqueue->imq_klist)) { + send_turnstile = port_send_turnstile(port); + } + } + + filt_machport_unlink(mqueue, kn); - send_turnstile = port_send_turnstile(port); if (send_turnstile) { turnstile_reference(send_turnstile); - turnstile_update_inheritor(send_turnstile, - ipc_port_get_inheritor(port), - TURNSTILE_INHERITOR_TURNSTILE | TURNSTILE_IMMEDIATE_UPDATE); + ipc_port_send_update_inheritor(port, send_turnstile, + TURNSTILE_IMMEDIATE_UPDATE); } } /* Clear the knote pointer once the knote has been removed from turnstile */ - kn->kn_ptr.p_mqueue = IMQ_NULL; + kn->kn_mqueue = IMQ_NULL; imq_unlock(mqueue); if (send_turnstile) { @@ -833,7 +986,7 @@ filt_machportdetach( static int filt_machportevent(struct knote *kn, long hint __assert_only) { - ipc_mqueue_t mqueue = kn->kn_ptr.p_mqueue; + ipc_mqueue_t mqueue = kn->kn_mqueue; ipc_kmsg_t first; int result = 0; @@ -853,9 +1006,9 @@ filt_machportevent(struct knote *kn, long hint __assert_only) static int filt_machporttouch( struct knote *kn, - struct kevent_internal_s *kev) + struct kevent_qos_s *kev) { - ipc_mqueue_t mqueue = kn->kn_ptr.p_mqueue; + ipc_mqueue_t mqueue = kn->kn_mqueue; ipc_kmsg_t first; int result = 0; @@ -892,15 +1045,12 @@ filt_machporttouch( } static int -filt_machportprocess( - struct knote *kn, - struct filt_process_s *process_data, - struct kevent_internal_s *kev) +filt_machportprocess(struct knote *kn, struct kevent_qos_s *kev) { - ipc_mqueue_t mqueue = kn->kn_ptr.p_mqueue; - ipc_object_t object = mqueue_to_object(mqueue); + ipc_mqueue_t mqueue = kn->kn_mqueue; + ipc_object_t object = imq_to_object(mqueue); thread_t self = current_thread(); - boolean_t used_filtprocess_data = FALSE; + kevent_ctx_t kectx = NULL; wait_result_t wresult; mach_msg_option_t option; @@ -908,7 +1058,7 @@ filt_machportprocess( mach_msg_size_t size; /* Capture current state */ - *kev = kn->kn_kevent; + knote_fill_kevent(kn, kev, MACH_PORT_NULL); kev->ext[3] = 0; /* hide our port reference from userspace */ /* If already deallocated/moved return one last EOF event */ @@ -922,7 +1072,7 @@ filt_machportprocess( * name of the port and sizeof the waiting message. */ option = kn->kn_sfflags & (MACH_RCV_MSG | MACH_RCV_LARGE | MACH_RCV_LARGE_IDENTITY | - MACH_RCV_TRAILER_MASK | MACH_RCV_VOUCHER); + MACH_RCV_TRAILER_MASK | MACH_RCV_VOUCHER | MACH_MSG_STRICT_REPLY); if (option & MACH_RCV_MSG) { addr = (mach_vm_address_t) kn->kn_ext[0]; @@ -932,13 +1082,12 @@ filt_machportprocess( * If the kevent didn't specify a buffer and length, carve a buffer * from the filter processing data according to the flags. */ - if (size == 0 && process_data != NULL) { - used_filtprocess_data = TRUE; - - addr = (mach_vm_address_t)process_data->fp_data_out; - size = (mach_msg_size_t)process_data->fp_data_resid; + if (size == 0) { + kectx = kevent_get_context(self); + addr = (mach_vm_address_t)kectx->kec_data_out; + size = (mach_msg_size_t)kectx->kec_data_resid; option |= (MACH_RCV_LARGE | MACH_RCV_LARGE_IDENTITY); - if (process_data->fp_flags & KEVENT_FLAG_STACK_DATA) { + if (kectx->kec_process_flags & KEVENT_FLAG_STACK_DATA) { option |= MACH_RCV_STACK; } } @@ -1037,16 +1186,15 @@ filt_machportprocess( * store the address used in the knote and adjust the residual and * other parameters for future use. */ - if (used_filtprocess_data) { - assert(process_data->fp_data_resid >= size); - process_data->fp_data_resid -= size; - if ((process_data->fp_flags & KEVENT_FLAG_STACK_DATA) == 0) { - kev->ext[0] = process_data->fp_data_out; - process_data->fp_data_out += size; + if (kectx) { + assert(kectx->kec_data_resid >= size); + kectx->kec_data_resid -= size; + if ((kectx->kec_process_flags & KEVENT_FLAG_STACK_DATA) == 0) { + kev->ext[0] = kectx->kec_data_out; + kectx->kec_data_out += size; } else { assert(option & MACH_RCV_STACK); - kev->ext[0] = process_data->fp_data_out + - process_data->fp_data_resid; + kev->ext[0] = kectx->kec_data_out + kectx->kec_data_resid; } } @@ -1081,7 +1229,7 @@ filt_machportprocess( static int filt_machportpeek(struct knote *kn) { - ipc_mqueue_t mqueue = kn->kn_ptr.p_mqueue; + ipc_mqueue_t mqueue = kn->kn_mqueue; return ipc_mqueue_set_peek(mqueue) ? FILTER_ACTIVE : 0; } diff --git a/osfmk/ipc/ipc_pset.h b/osfmk/ipc/ipc_pset.h index f0e5df942..efdcbcf81 100644 --- a/osfmk/ipc/ipc_pset.h +++ b/osfmk/ipc/ipc_pset.h @@ -84,13 +84,15 @@ struct ipc_pset { #define ips_references ips_object.io_references -#define ips_active(pset) io_active(&(pset)->ips_object) -#define ips_lock(pset) io_lock(&(pset)->ips_object) -#define ips_lock_try(pset) io_lock_try(&(pset)->ips_object) -#define ips_lock_held_kdp(pset) io_lock_held_kdp(&(pset)->ips_object) -#define ips_unlock(pset) io_unlock(&(pset)->ips_object) -#define ips_reference(pset) io_reference(&(pset)->ips_object) -#define ips_release(pset) io_release(&(pset)->ips_object) +#define ips_object_to_pset(io) __container_of(io, struct ipc_pset, ips_object) +#define ips_to_object(pset) (&(pset)->ips_object) +#define ips_active(pset) io_active(ips_to_object(pset)) +#define ips_lock(pset) io_lock(ips_to_object(pset)) +#define ips_lock_try(pset) io_lock_try(ips_to_object(pset)) +#define ips_lock_held_kdp(pset) io_lock_held_kdp(ips_to_object(pset)) +#define ips_unlock(pset) io_unlock(ips_to_object(pset)) +#define ips_reference(pset) io_reference(ips_to_object(pset)) +#define ips_release(pset) io_release(ips_to_object(pset)) /* get an ipc_pset pointer from an ipc_mqueue pointer */ #define ips_from_mq(mq) \ @@ -144,11 +146,11 @@ extern void ipc_pset_destroy( ipc_pset_t pset); #if MACH_KERNEL_PRIVATE -extern struct turnstile *filt_machport_kqueue_turnstile( +extern struct turnstile *filt_ipc_kqueue_turnstile( + struct knote *kn); +bool +filt_machport_kqueue_has_turnstile( struct knote *kn); - -extern struct turnstile *filt_machport_stashed_special_reply_port_turnstile( - ipc_port_t port); extern void filt_machport_turnstile_prepare_lazily( struct knote *kn, diff --git a/osfmk/ipc/ipc_right.c b/osfmk/ipc/ipc_right.c index 08a79c300..00a256e17 100644 --- a/osfmk/ipc/ipc_right.c +++ b/osfmk/ipc/ipc_right.c @@ -204,7 +204,7 @@ ipc_right_reverse( assert(is_active(space)); assert(io_otype(object) == IOT_PORT); - port = (ipc_port_t) object; + port = ip_object_to_port(object); ip_lock(port); if (!ip_active(port)) { @@ -221,17 +221,17 @@ ipc_right_reverse( assert(entry != IE_NULL); assert(entry->ie_bits & MACH_PORT_TYPE_RECEIVE); - assert(port == (ipc_port_t) entry->ie_object); + assert(port == ip_object_to_port(entry->ie_object)); *namep = name; *entryp = entry; return TRUE; } - if (ipc_hash_lookup(space, (ipc_object_t) port, namep, entryp)) { + if (ipc_hash_lookup(space, ip_to_object(port), namep, entryp)) { assert((entry = *entryp) != IE_NULL); assert(IE_BITS_TYPE(entry->ie_bits) == MACH_PORT_TYPE_SEND); - assert(port == (ipc_port_t) entry->ie_object); + assert(port == ip_object_to_port(entry->ie_object)); return TRUE; } @@ -301,7 +301,7 @@ ipc_right_request_alloc( if (entry->ie_bits & MACH_PORT_TYPE_PORT_RIGHTS) { ipc_port_request_index_t new_request; - port = (ipc_port_t) entry->ie_object; + port = ip_object_to_port(entry->ie_object); assert(port != IP_NULL); if (!ipc_right_check(space, port, name, entry, IPC_RIGHT_COPYIN_FLAGS_NONE)) { @@ -448,8 +448,8 @@ ipc_right_request_cancel( { ipc_port_t previous; - assert(ip_active(port)); - assert(port == (ipc_port_t) entry->ie_object); + require_ip_active(port); + assert(port == ip_object_to_port(entry->ie_object)); if (entry->ie_request == IE_REQ_NONE) { return IP_NULL; @@ -511,7 +511,7 @@ ipc_right_check( ipc_entry_bits_t bits; assert(is_active(space)); - assert(port == (ipc_port_t) entry->ie_object); + assert(port == ip_object_to_port(entry->ie_object)); ip_lock(port); if (ip_active(port) || @@ -545,7 +545,7 @@ ipc_right_check( */ if ((bits & MACH_PORT_TYPE_SEND) != 0) { - ipc_hash_delete(space, (ipc_object_t)port, name, entry); + ipc_hash_delete(space, ip_to_object(port), name, entry); } /* convert entry to dead name */ @@ -622,7 +622,7 @@ ipc_right_terminate( break; case MACH_PORT_TYPE_PORT_SET: { - ipc_pset_t pset = (ipc_pset_t) entry->ie_object; + ipc_pset_t pset = ips_object_to_pset(entry->ie_object); assert(entry->ie_request == IE_REQ_NONE); assert(pset != IPS_NULL); @@ -637,7 +637,7 @@ ipc_right_terminate( case MACH_PORT_TYPE_RECEIVE: case MACH_PORT_TYPE_SEND_RECEIVE: case MACH_PORT_TYPE_SEND_ONCE: { - ipc_port_t port = (ipc_port_t) entry->ie_object; + ipc_port_t port = ip_object_to_port(entry->ie_object); ipc_port_t request; ipc_port_t nsrequest = IP_NULL; mach_port_mscount_t mscount = 0; @@ -673,6 +673,7 @@ ipc_right_terminate( ipc_port_destroy(port); /* clears receiver, consumes our ref, unlocks */ } else if (type & MACH_PORT_TYPE_SEND_ONCE) { assert(port->ip_sorights > 0); + port->ip_reply_context = 0; ip_unlock(port); ipc_notify_send_once(port); /* consumes our ref */ @@ -736,7 +737,7 @@ ipc_right_destroy( break; case MACH_PORT_TYPE_PORT_SET: { - ipc_pset_t pset = (ipc_pset_t) entry->ie_object; + ipc_pset_t pset = ips_object_to_pset(entry->ie_object); assert(entry->ie_request == IE_REQ_NONE); assert(pset != IPS_NULL); @@ -756,7 +757,7 @@ ipc_right_destroy( case MACH_PORT_TYPE_RECEIVE: case MACH_PORT_TYPE_SEND_RECEIVE: case MACH_PORT_TYPE_SEND_ONCE: { - ipc_port_t port = (ipc_port_t) entry->ie_object; + ipc_port_t port = ip_object_to_port(entry->ie_object); ipc_port_t nsrequest = IP_NULL; mach_port_mscount_t mscount = 0; ipc_port_t request; @@ -764,8 +765,7 @@ ipc_right_destroy( assert(port != IP_NULL); if (type == MACH_PORT_TYPE_SEND) { - ipc_hash_delete(space, (ipc_object_t) port, - name, entry); + ipc_hash_delete(space, ip_to_object(port), name, entry); } ip_lock(port); @@ -813,12 +813,13 @@ ipc_right_destroy( } if (type & MACH_PORT_TYPE_RECEIVE) { - assert(ip_active(port)); + require_ip_active(port); assert(port->ip_receiver == space); ipc_port_destroy(port); /* clears receiver, consumes our ref, unlocks */ } else if (type & MACH_PORT_TYPE_SEND_ONCE) { assert(port->ip_sorights > 0); + port->ip_reply_context = 0; ip_unlock(port); ipc_notify_send_once(port); /* consumes our ref */ @@ -885,7 +886,7 @@ ipc_right_dealloc( assert(IE_BITS_UREFS(bits) == 0); assert(entry->ie_request == IE_REQ_NONE); - pset = (ipc_pset_t) entry->ie_object; + pset = ips_object_to_pset(entry->ie_object); assert(pset != IPS_NULL); entry->ie_object = IO_NULL; @@ -929,7 +930,7 @@ dead_name: assert(IE_BITS_UREFS(bits) == 1); - port = (ipc_port_t) entry->ie_object; + port = ip_object_to_port(entry->ie_object); assert(port != IP_NULL); if (ipc_right_check(space, port, name, entry, IPC_RIGHT_COPYIN_FLAGS_NONE)) { @@ -941,6 +942,13 @@ dead_name: assert(port->ip_sorights > 0); + /* + * clear any reply context: + * no one will be sending the response b/c we are destroying + * the single, outstanding send once right. + */ + port->ip_reply_context = 0; + request = ipc_right_request_cancel_macro(space, port, name, entry); ip_unlock(port); @@ -965,7 +973,7 @@ dead_name: assert(IE_BITS_UREFS(bits) > 0); - port = (ipc_port_t) entry->ie_object; + port = ip_object_to_port(entry->ie_object); assert(port != IP_NULL); if (ipc_right_check(space, port, name, entry, IPC_RIGHT_COPYIN_FLAGS_NONE)) { @@ -988,8 +996,7 @@ dead_name: request = ipc_right_request_cancel_macro(space, port, name, entry); - ipc_hash_delete(space, (ipc_object_t) port, - name, entry); + ipc_hash_delete(space, ip_to_object(port), name, entry); ip_unlock(port); entry->ie_object = IO_NULL; @@ -1022,11 +1029,11 @@ dead_name: assert(IE_BITS_UREFS(bits) > 0); - port = (ipc_port_t) entry->ie_object; + port = ip_object_to_port(entry->ie_object); assert(port != IP_NULL); ip_lock(port); - assert(ip_active(port)); + require_ip_active(port); assert(port->ip_receiver_name == name); assert(port->ip_receiver == space); assert(port->ip_srights > 0); @@ -1129,7 +1136,7 @@ ipc_right_delta( goto invalid_value; } - pset = (ipc_pset_t) entry->ie_object; + pset = ips_object_to_pset(entry->ie_object); assert(pset != IPS_NULL); entry->ie_object = IO_NULL; @@ -1147,7 +1154,9 @@ ipc_right_delta( ipc_port_t request = IP_NULL; if ((bits & MACH_PORT_TYPE_RECEIVE) == 0) { - mach_port_guard_exception(name, 0, 0, kGUARD_EXC_INVALID_RIGHT); + if ((bits & MACH_PORT_TYPE_EX_RECEIVE) == 0) { + mach_port_guard_exception(name, 0, 0, kGUARD_EXC_INVALID_RIGHT); + } goto invalid_right; } @@ -1159,7 +1168,7 @@ ipc_right_delta( goto invalid_value; } - port = (ipc_port_t) entry->ie_object; + port = ip_object_to_port(entry->ie_object); assert(port != IP_NULL); /* @@ -1169,7 +1178,7 @@ ipc_right_delta( */ ip_lock(port); - assert(ip_active(port)); + require_ip_active(port); assert(port->ip_receiver_name == name); assert(port->ip_receiver == space); @@ -1202,7 +1211,8 @@ ipc_right_delta( */ ipc_entry_modified(space, name, entry); entry->ie_bits &= ~MACH_PORT_TYPE_RECEIVE; - ipc_hash_insert(space, (ipc_object_t) port, + entry->ie_bits |= MACH_PORT_TYPE_EX_RECEIVE; + ipc_hash_insert(space, ip_to_object(port), name, entry); ip_reference(port); } else { @@ -1214,7 +1224,7 @@ ipc_right_delta( * port is destroyed "first". */ bits &= ~IE_BITS_TYPE_MASK; - bits |= MACH_PORT_TYPE_DEAD_NAME; + bits |= (MACH_PORT_TYPE_DEAD_NAME | MACH_PORT_TYPE_EX_RECEIVE); if (entry->ie_request) { entry->ie_request = IE_REQ_NONE; /* if urefs are pegged due to overflow, leave them pegged */ @@ -1255,7 +1265,7 @@ ipc_right_delta( assert(IE_BITS_TYPE(bits) == MACH_PORT_TYPE_SEND_ONCE); assert(IE_BITS_UREFS(bits) == 1); - port = (ipc_port_t) entry->ie_object; + port = ip_object_to_port(entry->ie_object); assert(port != IP_NULL); if (ipc_right_check(space, port, name, entry, IPC_RIGHT_COPYIN_FLAGS_NONE)) { @@ -1277,6 +1287,13 @@ ipc_right_delta( goto success; } + /* + * clear any reply context: + * no one will be sending the response b/c we are destroying + * the single, outstanding send once right. + */ + port->ip_reply_context = 0; + request = ipc_right_request_cancel_macro(space, port, name, entry); ip_unlock(port); @@ -1298,7 +1315,7 @@ ipc_right_delta( mach_port_urefs_t urefs; if (bits & MACH_PORT_TYPE_SEND_RIGHTS) { - port = (ipc_port_t) entry->ie_object; + port = ip_object_to_port(entry->ie_object); assert(port != IP_NULL); if (!ipc_right_check(space, port, name, entry, IPC_RIGHT_COPYIN_FLAGS_NONE)) { @@ -1372,7 +1389,15 @@ ipc_right_delta( if ((bits & MACH_PORT_TYPE_SEND) == 0) { /* invalid right exception only when not live/dead confusion */ - if ((bits & MACH_PORT_TYPE_DEAD_NAME) == 0) { + if ((bits & MACH_PORT_TYPE_DEAD_NAME) == 0 +#if !defined(AE_MAKESENDRIGHT_FIXED) + /* + * AE tries to add single send right without knowing if it already owns one. + * But if it doesn't, it should own the receive right and delta should be 1. + */ + && (((bits & MACH_PORT_TYPE_RECEIVE) == 0) || (delta != 1)) +#endif + ) { mach_port_guard_exception(name, 0, 0, kGUARD_EXC_INVALID_RIGHT); } goto invalid_right; @@ -1380,7 +1405,7 @@ ipc_right_delta( /* maximum urefs for send is MACH_PORT_UREFS_MAX */ - port = (ipc_port_t) entry->ie_object; + port = ip_object_to_port(entry->ie_object); assert(port != IP_NULL); if (ipc_right_check(space, port, name, entry, IPC_RIGHT_COPYIN_FLAGS_NONE)) { @@ -1444,7 +1469,7 @@ ipc_right_delta( request = ipc_right_request_cancel_macro(space, port, name, entry); - ipc_hash_delete(space, (ipc_object_t) port, + ipc_hash_delete(space, ip_to_object(port), name, entry); ip_unlock(port); @@ -1541,8 +1566,17 @@ ipc_right_destruct( assert(is_active(space)); - if (((bits & MACH_PORT_TYPE_RECEIVE) == 0) || - (srdelta && ((bits & MACH_PORT_TYPE_SEND) == 0))) { + if ((bits & MACH_PORT_TYPE_RECEIVE) == 0) { + is_write_unlock(space); + + /* No exception if we used to have receive and held entry since */ + if ((bits & MACH_PORT_TYPE_EX_RECEIVE) == 0) { + mach_port_guard_exception(name, 0, 0, kGUARD_EXC_INVALID_RIGHT); + } + return KERN_INVALID_RIGHT; + } + + if (srdelta && (bits & MACH_PORT_TYPE_SEND) == 0) { is_write_unlock(space); mach_port_guard_exception(name, 0, 0, kGUARD_EXC_INVALID_RIGHT); return KERN_INVALID_RIGHT; @@ -1552,11 +1586,11 @@ ipc_right_destruct( goto invalid_value; } - port = (ipc_port_t) entry->ie_object; + port = ip_object_to_port(entry->ie_object); assert(port != IP_NULL); ip_lock(port); - assert(ip_active(port)); + require_ip_active(port); assert(port->ip_receiver_name == name); assert(port->ip_receiver == space); @@ -1639,7 +1673,8 @@ ipc_right_destruct( */ ipc_entry_modified(space, name, entry); entry->ie_bits &= ~MACH_PORT_TYPE_RECEIVE; - ipc_hash_insert(space, (ipc_object_t) port, + entry->ie_bits |= MACH_PORT_TYPE_EX_RECEIVE; + ipc_hash_insert(space, ip_to_object(port), name, entry); ip_reference(port); } else { @@ -1651,7 +1686,7 @@ ipc_right_destruct( * port is destroyed "first". */ bits &= ~IE_BITS_TYPE_MASK; - bits |= MACH_PORT_TYPE_DEAD_NAME; + bits |= (MACH_PORT_TYPE_DEAD_NAME | MACH_PORT_TYPE_EX_RECEIVE); if (entry->ie_request) { entry->ie_request = IE_REQ_NONE; if (IE_BITS_UREFS(bits) < MACH_PORT_UREFS_MAX) { @@ -1719,14 +1754,14 @@ ipc_right_info( bits = entry->ie_bits; request = entry->ie_request; - port = (ipc_port_t) entry->ie_object; + port = ip_object_to_port(entry->ie_object); if (bits & MACH_PORT_TYPE_RECEIVE) { assert(IP_VALID(port)); if (request != IE_REQ_NONE) { ip_lock(port); - assert(ip_active(port)); + require_ip_active(port); type |= ipc_port_request_type(port, name, request); ip_unlock(port); } @@ -1761,27 +1796,29 @@ ipc_right_info( } /* - * Routine: ipc_right_copyin_check + * Routine: ipc_right_copyin_check_reply * Purpose: - * Check if a subsequent ipc_right_copyin would succeed. + * Check if a subsequent ipc_right_copyin would succeed. Used only + * by ipc_kmsg_copyin_header to check if reply_port can be copied in. + * If the reply port is an immovable send right, it errors out. * Conditions: * The space is locked (read or write) and active. */ boolean_t -ipc_right_copyin_check( +ipc_right_copyin_check_reply( __assert_only ipc_space_t space, - __unused mach_port_name_t name, - ipc_entry_t entry, - mach_msg_type_name_t msgt_name) + mach_port_name_t reply_name, + ipc_entry_t reply_entry, + mach_msg_type_name_t reply_type) { ipc_entry_bits_t bits; - ipc_port_t port; + ipc_port_t reply_port; - bits = entry->ie_bits; + bits = reply_entry->ie_bits; assert(is_active(space)); - switch (msgt_name) { + switch (reply_type) { case MACH_MSG_TYPE_MAKE_SEND: if ((bits & MACH_PORT_TYPE_RECEIVE) == 0) { return FALSE; @@ -1795,17 +1832,8 @@ ipc_right_copyin_check( break; case MACH_MSG_TYPE_MOVE_RECEIVE: - if ((bits & MACH_PORT_TYPE_RECEIVE) == 0) { - return FALSE; - } - if (io_kotype(entry->ie_object) != IKOT_NONE) { - return FALSE; - } - port = (ipc_port_t) entry->ie_object; - if (port->ip_specialreply) { - return FALSE; - } - break; + /* ipc_kmsg_copyin_header already filters it out */ + return FALSE; case MACH_MSG_TYPE_COPY_SEND: case MACH_MSG_TYPE_MOVE_SEND: @@ -1818,19 +1846,29 @@ ipc_right_copyin_check( return FALSE; } - port = (ipc_port_t) entry->ie_object; - assert(port != IP_NULL); + reply_port = ip_object_to_port(reply_entry->ie_object); + assert(reply_port != IP_NULL); /* * active status peek to avoid checks that will be skipped * on copyin for dead ports. Lock not held, so will not be * atomic (but once dead, there's no going back). */ - if (!ip_active(port)) { + if (!ip_active(reply_port)) { break; } - if (msgt_name == MACH_MSG_TYPE_MOVE_SEND_ONCE) { + /* + * Can't copyin a send right that is marked immovable. This bit + * is set only during port creation and never unset. So it can + * be read without a lock. + */ + if (reply_port->ip_immovable_send) { + mach_port_guard_exception(reply_name, 0, 0, kGUARD_EXC_IMMOVABLE); + return FALSE; + } + + if (reply_type == MACH_MSG_TYPE_MOVE_SEND_ONCE) { if ((bits & MACH_PORT_TYPE_SEND_ONCE) == 0) { return FALSE; } @@ -1850,6 +1888,40 @@ ipc_right_copyin_check( return TRUE; } +/* + * Routine: ipc_right_copyin_check_guard_locked + * Purpose: + * Check if the port is guarded and the guard + * value matches the one passed in the arguments. + * If MACH_MSG_GUARD_FLAGS_UNGUARDED_ON_SEND is set, + * check if the port is unguarded. + * Conditions: + * The port is locked. + * Returns: + * KERN_SUCCESS Port is either unguarded + * or guarded with expected value + * KERN_INVALID_ARGUMENT Port is either unguarded already or guard mismatch. + * This also raises a EXC_GUARD exception. + */ +static kern_return_t +ipc_right_copyin_check_guard_locked( + mach_port_name_t name, + ipc_port_t port, + mach_port_context_t context, + mach_msg_guard_flags_t *guard_flags) +{ + mach_msg_guard_flags_t flags = *guard_flags; + if ((flags & MACH_MSG_GUARD_FLAGS_UNGUARDED_ON_SEND) && !port->ip_guarded && !context) { + return KERN_SUCCESS; + } else if (port->ip_guarded && (port->ip_context == context)) { + return KERN_SUCCESS; + } + + /* Incorrect guard; Raise exception */ + mach_port_guard_exception(name, context, port->ip_context, kGUARD_EXC_INCORRECT_GUARD); + return KERN_INVALID_ARGUMENT; +} + /* * Routine: ipc_right_copyin * Purpose: @@ -1871,26 +1943,32 @@ ipc_right_copyin_check( * Returns: * KERN_SUCCESS Acquired an object, possibly IO_DEAD. * KERN_INVALID_RIGHT Name doesn't denote correct right. + * KERN_INVALID_CAPABILITY Trying to move an kobject port or an immovable right + * KERN_INVALID_ARGUMENT Port is unguarded or guard mismatch */ kern_return_t ipc_right_copyin( - ipc_space_t space, - mach_port_name_t name, - ipc_entry_t entry, - mach_msg_type_name_t msgt_name, - ipc_right_copyin_flags_t flags, - ipc_object_t *objectp, - ipc_port_t *sorightp, - ipc_port_t *releasep, - int *assertcntp) + ipc_space_t space, + mach_port_name_t name, + ipc_entry_t entry, + mach_msg_type_name_t msgt_name, + ipc_right_copyin_flags_t flags, + ipc_object_t *objectp, + ipc_port_t *sorightp, + ipc_port_t *releasep, + int *assertcntp, + mach_port_context_t context, + mach_msg_guard_flags_t *guard_flags) { ipc_entry_bits_t bits; ipc_port_t port; + kern_return_t kr; + boolean_t deadok = flags & IPC_RIGHT_COPYIN_FLAGS_DEADOK? TRUE : FALSE; + boolean_t allow_imm_send = flags & IPC_RIGHT_COPYIN_FLAGS_ALLOW_IMMOVABLE_SEND? TRUE : FALSE; *releasep = IP_NULL; *assertcntp = 0; - boolean_t deadok = (flags & IPC_RIGHT_COPYIN_FLAGS_DEADOK) ? TRUE : FALSE; bits = entry->ie_bits; @@ -1902,20 +1980,17 @@ ipc_right_copyin( goto invalid_right; } - port = (ipc_port_t) entry->ie_object; + port = ip_object_to_port(entry->ie_object); assert(port != IP_NULL); ip_lock(port); - assert(ip_active(port)); assert(port->ip_receiver_name == name); assert(port->ip_receiver == space); - port->ip_mscount++; - port->ip_srights++; - ip_reference(port); + ipc_port_make_send_locked(port); ip_unlock(port); - *objectp = (ipc_object_t) port; + *objectp = ip_to_object(port); *sorightp = IP_NULL; break; } @@ -1925,19 +2000,18 @@ ipc_right_copyin( goto invalid_right; } - port = (ipc_port_t) entry->ie_object; + port = ip_object_to_port(entry->ie_object); assert(port != IP_NULL); ip_lock(port); - assert(ip_active(port)); + require_ip_active(port); assert(port->ip_receiver_name == name); assert(port->ip_receiver == space); - port->ip_sorights++; - ip_reference(port); + ipc_port_make_sonce_locked(port); ip_unlock(port); - *objectp = (ipc_object_t) port; + *objectp = ip_to_object(port); *sorightp = IP_NULL; break; } @@ -1963,24 +2037,41 @@ ipc_right_copyin( * situation which is, "This is a valid receive right, * but it's also a kobject and you can't move it." */ + mach_port_guard_exception(name, 0, 0, kGUARD_EXC_IMMOVABLE); return KERN_INVALID_CAPABILITY; } - port = (ipc_port_t) entry->ie_object; + port = ip_object_to_port(entry->ie_object); assert(port != IP_NULL); ip_lock(port); - assert(ip_active(port)); + require_ip_active(port); assert(port->ip_receiver_name == name); assert(port->ip_receiver == space); + if (port->ip_immovable_receive) { + assert(port->ip_receiver != ipc_space_kernel); + ip_unlock(port); + assert(current_task() != kernel_task); + mach_port_guard_exception(name, 0, 0, kGUARD_EXC_IMMOVABLE); + return KERN_INVALID_CAPABILITY; + } + + if (guard_flags != NULL) { + kr = ipc_right_copyin_check_guard_locked(name, port, context, guard_flags); + if (kr != KERN_SUCCESS) { + ip_unlock(port); + return kr; + } + } + if (bits & MACH_PORT_TYPE_SEND) { assert(IE_BITS_TYPE(bits) == MACH_PORT_TYPE_SEND_RECEIVE); assert(IE_BITS_UREFS(bits) > 0); assert(port->ip_srights > 0); - ipc_hash_insert(space, (ipc_object_t) port, + ipc_hash_insert(space, ip_to_object(port), name, entry); ip_reference(port); } else { @@ -1992,9 +2083,15 @@ ipc_right_copyin( entry->ie_object = IO_NULL; } entry->ie_bits = bits & ~MACH_PORT_TYPE_RECEIVE; + entry->ie_bits |= MACH_PORT_TYPE_EX_RECEIVE; ipc_entry_modified(space, name, entry); + /* ipc_port_clear_receiver unguards the port and clears the ip_immovable_receive bit */ (void)ipc_port_clear_receiver(port, FALSE); /* don't destroy the port/mqueue */ + if (guard_flags != NULL) { + /* this flag will be cleared during copyout */ + *guard_flags = *guard_flags | MACH_MSG_GUARD_FLAGS_UNGUARDED_ON_SEND; + } #if IMPORTANCE_INHERITANCE /* @@ -2017,7 +2114,7 @@ ipc_right_copyin( ip_unlock(port); - *objectp = (ipc_object_t) port; + *objectp = ip_to_object(port); *sorightp = request; break; } @@ -2035,7 +2132,7 @@ ipc_right_copyin( assert(IE_BITS_UREFS(bits) > 0); - port = (ipc_port_t) entry->ie_object; + port = ip_object_to_port(entry->ie_object); assert(port != IP_NULL); if (ipc_right_check(space, port, name, entry, IPC_RIGHT_COPYIN_FLAGS_NONE)) { @@ -2053,13 +2150,16 @@ ipc_right_copyin( goto invalid_right; } - assert(port->ip_srights > 0); + if (!allow_imm_send && port->ip_immovable_send) { + ip_unlock(port); + mach_port_guard_exception(name, 0, 0, kGUARD_EXC_IMMOVABLE); + return KERN_INVALID_CAPABILITY; + } - port->ip_srights++; - ip_reference(port); + ipc_port_copy_send_locked(port); ip_unlock(port); - *objectp = (ipc_object_t) port; + *objectp = ip_to_object(port); *sorightp = IP_NULL; break; } @@ -2079,7 +2179,7 @@ ipc_right_copyin( assert(IE_BITS_UREFS(bits) > 0); - port = (ipc_port_t) entry->ie_object; + port = ip_object_to_port(entry->ie_object); assert(port != IP_NULL); if (ipc_right_check(space, port, name, entry, IPC_RIGHT_COPYIN_FLAGS_NONE)) { @@ -2097,9 +2197,14 @@ ipc_right_copyin( goto invalid_right; } - assert(port->ip_srights > 0); + if (!allow_imm_send && port->ip_immovable_send) { + ip_unlock(port); + mach_port_guard_exception(name, 0, 0, kGUARD_EXC_IMMOVABLE); + return KERN_INVALID_CAPABILITY; + } if (IE_BITS_UREFS(bits) == 1) { + assert(port->ip_srights > 0); if (bits & MACH_PORT_TYPE_RECEIVE) { assert(port->ip_receiver_name == name); assert(port->ip_receiver == space); @@ -2113,7 +2218,7 @@ ipc_right_copyin( request = ipc_right_request_cancel_macro(space, port, name, entry); - ipc_hash_delete(space, (ipc_object_t) port, + ipc_hash_delete(space, ip_to_object(port), name, entry); entry->ie_object = IO_NULL; /* transfer entry's reference to caller */ @@ -2121,8 +2226,7 @@ ipc_right_copyin( entry->ie_bits = bits & ~ (IE_BITS_UREFS_MASK | MACH_PORT_TYPE_SEND); } else { - port->ip_srights++; - ip_reference(port); + ipc_port_copy_send_locked(port); /* if urefs are pegged due to overflow, leave them pegged */ if (IE_BITS_UREFS(bits) < MACH_PORT_UREFS_MAX) { entry->ie_bits = bits - 1; /* decrement urefs */ @@ -2132,7 +2236,7 @@ ipc_right_copyin( ipc_entry_modified(space, name, entry); ip_unlock(port); - *objectp = (ipc_object_t) port; + *objectp = ip_to_object(port); *sorightp = request; break; } @@ -2152,7 +2256,7 @@ ipc_right_copyin( assert(IE_BITS_UREFS(bits) > 0); - port = (ipc_port_t) entry->ie_object; + port = ip_object_to_port(entry->ie_object); assert(port != IP_NULL); if (ipc_right_check(space, port, name, entry, flags)) { @@ -2175,6 +2279,12 @@ ipc_right_copyin( goto invalid_right; } + if (!allow_imm_send && port->ip_immovable_send) { + ip_unlock(port); + mach_port_guard_exception(name, 0, 0, kGUARD_EXC_IMMOVABLE); + return KERN_INVALID_CAPABILITY; + } + assert(IE_BITS_TYPE(bits) == MACH_PORT_TYPE_SEND_ONCE); assert(IE_BITS_UREFS(bits) == 1); assert(port->ip_sorights > 0); @@ -2186,7 +2296,7 @@ ipc_right_copyin( entry->ie_bits = bits & ~ (IE_BITS_UREFS_MASK | MACH_PORT_TYPE_SEND_ONCE); ipc_entry_modified(space, name, entry); - *objectp = (ipc_object_t) port; + *objectp = ip_to_object(port); *sorightp = request; break; } @@ -2235,93 +2345,6 @@ move_dead: return KERN_SUCCESS; } -/* - * Routine: ipc_right_copyin_undo - * Purpose: - * Undoes the effects of an ipc_right_copyin - * of a send/send-once right that is dead. - * (Object is either IO_DEAD or a dead port.) - * Conditions: - * The space is write-locked and active. - */ - -void -ipc_right_copyin_undo( - ipc_space_t space, - mach_port_name_t name, - ipc_entry_t entry, - mach_msg_type_name_t msgt_name, - ipc_object_t object, - ipc_port_t soright) -{ - ipc_entry_bits_t bits; - - bits = entry->ie_bits; - - assert(is_active(space)); - - assert((msgt_name == MACH_MSG_TYPE_MOVE_SEND) || - (msgt_name == MACH_MSG_TYPE_COPY_SEND) || - (msgt_name == MACH_MSG_TYPE_MOVE_SEND_ONCE)); - - if (soright != IP_NULL) { - assert((msgt_name == MACH_MSG_TYPE_MOVE_SEND) || - (msgt_name == MACH_MSG_TYPE_MOVE_SEND_ONCE)); - assert(IE_BITS_TYPE(bits) == MACH_PORT_TYPE_NONE); - assert(object != IO_DEAD); - - entry->ie_bits = ((bits & ~IE_BITS_RIGHT_MASK) | - MACH_PORT_TYPE_DEAD_NAME | 2); - } else if (IE_BITS_TYPE(bits) == MACH_PORT_TYPE_NONE) { - assert((msgt_name == MACH_MSG_TYPE_MOVE_SEND) || - (msgt_name == MACH_MSG_TYPE_MOVE_SEND_ONCE)); - - entry->ie_bits = ((bits & ~IE_BITS_RIGHT_MASK) | - MACH_PORT_TYPE_DEAD_NAME | 1); - } else if (IE_BITS_TYPE(bits) == MACH_PORT_TYPE_DEAD_NAME) { - assert(object == IO_DEAD); - assert(IE_BITS_UREFS(bits) > 0); - - if (msgt_name != MACH_MSG_TYPE_COPY_SEND) { - assert(IE_BITS_UREFS(bits) <= MACH_PORT_UREFS_MAX); - /* if urefs are pegged due to overflow, leave them pegged */ - if (IE_BITS_UREFS(bits) < MACH_PORT_UREFS_MAX) { - entry->ie_bits = bits + 1; /* increment urefs */ - } - } - } else { - assert((msgt_name == MACH_MSG_TYPE_MOVE_SEND) || - (msgt_name == MACH_MSG_TYPE_COPY_SEND)); - assert(IE_BITS_TYPE(bits) == MACH_PORT_TYPE_SEND); - assert(object != IO_DEAD); - assert(entry->ie_object == object); - assert(IE_BITS_UREFS(bits) > 0); - - if (msgt_name != MACH_MSG_TYPE_COPY_SEND) { - assert(IE_BITS_UREFS(bits) <= MACH_PORT_UREFS_MAX); - /* if urefs are pegged due to overflow, leave them pegged */ - if (IE_BITS_UREFS(bits) < MACH_PORT_UREFS_MAX) { - entry->ie_bits = bits + 1; /* increment urefs */ - } - } - - /* - * May as well convert the entry to a dead name. - * (Or if it is a compat entry, destroy it.) - */ - - (void) ipc_right_check(space, (ipc_port_t) object, - name, entry, IPC_RIGHT_COPYIN_FLAGS_NONE); - /* object is dead so it is not locked */ - } - ipc_entry_modified(space, name, entry); - /* release the reference acquired by copyin */ - - if (object != IO_DEAD) { - io_release(object); - } -} - /* * Routine: ipc_right_copyin_two_move_sends * Purpose: @@ -2365,7 +2388,7 @@ ipc_right_copyin_two_move_sends( goto invalid_right; } - port = (ipc_port_t) entry->ie_object; + port = ip_object_to_port(entry->ie_object); assert(port != IP_NULL); if (ipc_right_check(space, port, name, entry, IPC_RIGHT_COPYIN_FLAGS_NONE)) { @@ -2374,45 +2397,57 @@ ipc_right_copyin_two_move_sends( } /* port is locked and active */ - assert(port->ip_srights > 0); + if (urefs > 2) { + /* + * We are moving 2 urefs as naked send rights, which is decomposed as: + * - two copy sends (which doesn't affect the make send count) + * - decrementing the local urefs twice. + */ + ipc_port_copy_send_locked(port); + ipc_port_copy_send_locked(port); + /* if urefs are pegged due to overflow, leave them pegged */ + if (IE_BITS_UREFS(bits) < MACH_PORT_UREFS_MAX) { + entry->ie_bits = bits - 2; /* decrement urefs */ + } + } else { + /* + * We have exactly 2 send rights for this port in this space, + * which means that we will liberate the naked send right held + * by this entry. + * + * However refcounting rules around entries are that naked send rights + * on behalf of spaces do not have an associated port reference, + * so we need to donate one ... + */ + ipc_port_copy_send_locked(port); - if (urefs == 2) { if (bits & MACH_PORT_TYPE_RECEIVE) { assert(port->ip_receiver_name == name); assert(port->ip_receiver == space); assert(IE_BITS_TYPE(bits) == MACH_PORT_TYPE_SEND_RECEIVE); - port->ip_srights++; - ip_reference(port); + /* ... that we inject manually when the entry stays alive */ ip_reference(port); } else { assert(IE_BITS_TYPE(bits) == MACH_PORT_TYPE_SEND); + /* ... that we steal from the entry when it dies */ request = ipc_right_request_cancel_macro(space, port, name, entry); - port->ip_srights++; - ip_reference(port); - ipc_hash_delete(space, (ipc_object_t) port, + ipc_hash_delete(space, ip_to_object(port), name, entry); entry->ie_object = IO_NULL; } + entry->ie_bits = bits & ~(IE_BITS_UREFS_MASK | MACH_PORT_TYPE_SEND); - } else { - port->ip_srights += 2; - ip_reference(port); - ip_reference(port); - /* if urefs are pegged due to overflow, leave them pegged */ - if (IE_BITS_UREFS(bits) < MACH_PORT_UREFS_MAX) { - entry->ie_bits = bits - 2; /* decrement urefs */ - } } ipc_entry_modified(space, name, entry); ip_unlock(port); - *objectp = (ipc_object_t) port; + *objectp = ip_to_object(port); *sorightp = request; return KERN_SUCCESS; @@ -2430,6 +2465,7 @@ invalid_right: * Conditions: * The space is write-locked and active. * The object is returned with two refs/rights. + * Msgt_one refers to the dest_type * Returns: * KERN_SUCCESS Acquired an object. * KERN_INVALID_RIGHT Name doesn't denote correct right(s). @@ -2437,14 +2473,14 @@ invalid_right: */ kern_return_t ipc_right_copyin_two( - ipc_space_t space, - mach_port_name_t name, - ipc_entry_t entry, - mach_msg_type_name_t msgt_one, - mach_msg_type_name_t msgt_two, - ipc_object_t *objectp, - ipc_port_t *sorightp, - ipc_port_t *releasep) + ipc_space_t space, + mach_port_name_t name, + ipc_entry_t entry, + mach_msg_type_name_t msgt_one, + mach_msg_type_name_t msgt_two, + ipc_object_t *objectp, + ipc_port_t *sorightp, + ipc_port_t *releasep) { kern_return_t kr; int assertcnt = 0; @@ -2452,14 +2488,6 @@ ipc_right_copyin_two( assert(MACH_MSG_TYPE_PORT_ANY_SEND(msgt_one)); assert(MACH_MSG_TYPE_PORT_ANY_SEND(msgt_two)); - - /* - * Pre-validate the second disposition is possible all by itself. - */ - if (!ipc_right_copyin_check(space, name, entry, msgt_two)) { - return KERN_INVALID_CAPABILITY; - } - /* * This is a little tedious to make atomic, because * there are 25 combinations of valid dispositions. @@ -2491,9 +2519,9 @@ ipc_right_copyin_two( ipc_object_t object_two; kr = ipc_right_copyin(space, name, entry, - msgt_one, IPC_RIGHT_COPYIN_FLAGS_NONE, + msgt_one, IPC_RIGHT_COPYIN_FLAGS_ALLOW_IMMOVABLE_SEND, objectp, sorightp, releasep, - &assertcnt); + &assertcnt, 0, NULL); assert(assertcnt == 0); if (kr != KERN_SUCCESS) { return kr; @@ -2512,7 +2540,7 @@ ipc_right_copyin_two( kr = ipc_right_copyin(space, name, entry, msgt_two, IPC_RIGHT_COPYIN_FLAGS_NONE, &object_two, sorightp, releasep, - &assertcnt); + &assertcnt, 0, NULL); assert(assertcnt == 0); assert(kr == KERN_SUCCESS); assert(*sorightp == IP_NULL); @@ -2550,9 +2578,9 @@ ipc_right_copyin_two( } kr = ipc_right_copyin(space, name, entry, - msgt_name, IPC_RIGHT_COPYIN_FLAGS_NONE, + msgt_name, IPC_RIGHT_COPYIN_FLAGS_ALLOW_IMMOVABLE_SEND, objectp, sorightp, releasep, - &assertcnt); + &assertcnt, 0, NULL); assert(assertcnt == 0); if (kr != KERN_SUCCESS) { return kr; @@ -2563,7 +2591,7 @@ ipc_right_copyin_two( * that's OK. Neither right will be usable to send * a message anyway. */ - (void)ipc_port_copy_send((ipc_port_t)*objectp); + (void)ipc_port_copy_send(ip_object_to_port(*objectp)); } return KERN_SUCCESS; @@ -2580,9 +2608,9 @@ ipc_right_copyin_two( * because user-reference overflow isn't a possibility. * * If copying out the object would cause the user-reference - * count in the entry to overflow, and overflow is TRUE, - * then instead the user-reference count is left pegged - * to its maximum value and the copyout succeeds anyway. + * count in the entry to overflow, then the user-reference + * count is left pegged to its maximum value and the copyout + * succeeds anyway. * Conditions: * The space is write-locked and active. * The object is locked and active. @@ -2597,7 +2625,8 @@ ipc_right_copyout( mach_port_name_t name, ipc_entry_t entry, mach_msg_type_name_t msgt_name, - __unused boolean_t overflow, + mach_port_context_t *context, + mach_msg_guard_flags_t *guard_flags, ipc_object_t object) { ipc_entry_bits_t bits; @@ -2610,7 +2639,7 @@ ipc_right_copyout( assert(io_active(object)); assert(entry->ie_object == object); - port = (ipc_port_t) object; + port = ip_object_to_port(object); switch (msgt_name) { case MACH_MSG_TYPE_PORT_SEND_ONCE: @@ -2673,8 +2702,7 @@ ipc_right_copyout( /* entry is locked holding ref, so can use port */ - ipc_hash_insert(space, (ipc_object_t) port, - name, entry); + ipc_hash_insert(space, ip_to_object(port), name, entry); } entry->ie_bits = (bits | MACH_PORT_TYPE_SEND) + 1; /* increment urefs */ @@ -2683,9 +2711,6 @@ ipc_right_copyout( case MACH_MSG_TYPE_PORT_RECEIVE: { ipc_port_t dest; - turnstile_inheritor_t inheritor = TURNSTILE_INHERITOR_NULL; - struct turnstile *ts = TURNSTILE_NULL; - #if IMPORTANCE_INHERITANCE natural_t assertcnt = port->ip_impcount; #endif /* IMPORTANCE_INHERITANCE */ @@ -2699,50 +2724,49 @@ ipc_right_copyout( port->ip_receiver_name = name; port->ip_receiver = space; - assert((bits & MACH_PORT_TYPE_RECEIVE) == 0); + struct knote *kn = current_thread()->ith_knote; - /* Update the port's turnstile linkage to WL turnstile */ - ts = port_send_turnstile(port); - if (ts) { - struct knote *kn = current_thread()->ith_knote; - if (ITH_KNOTE_VALID(kn, MACH_MSG_TYPE_PORT_RECEIVE)) { - inheritor = filt_machport_stash_port(kn, port, NULL); - if (inheritor) { - turnstile_reference(inheritor); - IMQ_SET_INHERITOR(&port->ip_messages, inheritor); - } + if ((guard_flags != NULL) && ((*guard_flags & MACH_MSG_GUARD_FLAGS_IMMOVABLE_RECEIVE) != 0)) { + assert(port->ip_immovable_receive == 0); + port->ip_guarded = 1; + port->ip_strict_guard = 0; + /* pseudo receive shouldn't set the receive right as immovable in the sender's space */ + if (kn != ITH_KNOTE_PSEUDO) { + port->ip_immovable_receive = 1; } - turnstile_reference(ts); - turnstile_update_inheritor(ts, inheritor, - (TURNSTILE_IMMEDIATE_UPDATE | TURNSTILE_INHERITOR_TURNSTILE)); + port->ip_context = current_thread()->ith_msg_addr; + *context = port->ip_context; + *guard_flags = *guard_flags & ~MACH_MSG_GUARD_FLAGS_UNGUARDED_ON_SEND; } - imq_unlock(&port->ip_messages); - + assert((bits & MACH_PORT_TYPE_RECEIVE) == 0); if (bits & MACH_PORT_TYPE_SEND) { assert(IE_BITS_TYPE(bits) == MACH_PORT_TYPE_SEND); assert(IE_BITS_UREFS(bits) > 0); assert(port->ip_srights > 0); - - ip_unlock(port); - ip_release(port); - - /* entry is locked holding ref, so can use port */ - ipc_hash_delete(space, (ipc_object_t) port, name, entry); } else { assert(IE_BITS_TYPE(bits) == MACH_PORT_TYPE_NONE); assert(IE_BITS_UREFS(bits) == 0); + } - /* transfer ref to entry */ - ip_unlock(port); + boolean_t sync_bootstrap_checkin = FALSE; + if (kn != ITH_KNOTE_PSEUDO && port->ip_sync_bootstrap_checkin) { + sync_bootstrap_checkin = TRUE; } - entry->ie_bits = bits | MACH_PORT_TYPE_RECEIVE; - ipc_entry_modified(space, name, entry); + if (!ITH_KNOTE_VALID(kn, MACH_MSG_TYPE_PORT_RECEIVE)) { + kn = NULL; + } + ipc_port_adjust_port_locked(port, kn, sync_bootstrap_checkin); + /* port & message queue are unlocked */ - if (ts) { - turnstile_update_inheritor_complete(ts, TURNSTILE_INTERLOCK_NOT_HELD); - turnstile_deallocate_safe(ts); + if (bits & MACH_PORT_TYPE_SEND) { + ip_release(port); + + /* entry is locked holding ref, so can use port */ + ipc_hash_delete(space, ip_to_object(port), name, entry); } + entry->ie_bits = bits | MACH_PORT_TYPE_RECEIVE; + ipc_entry_modified(space, name, entry); if (dest != IP_NULL) { #if IMPORTANCE_INHERITANCE @@ -2769,137 +2793,3 @@ ipc_right_copyout( } return KERN_SUCCESS; } - -/* - * Routine: ipc_right_rename - * Purpose: - * Transfer an entry from one name to another. - * The old entry is deallocated. - * Conditions: - * The space is write-locked and active. - * The new entry is unused. Upon return, - * the space is unlocked. - * Returns: - * KERN_SUCCESS Moved entry to new name. - */ - -kern_return_t -ipc_right_rename( - ipc_space_t space, - mach_port_name_t oname, - ipc_entry_t oentry, - mach_port_name_t nname, - ipc_entry_t nentry) -{ - ipc_port_request_index_t request = oentry->ie_request; - ipc_entry_bits_t bits = oentry->ie_bits; - ipc_object_t object = oentry->ie_object; - ipc_port_t release_port = IP_NULL; - - assert(is_active(space)); - assert(oname != nname); - - /* - * If IE_BITS_COMPAT, we can't allow the entry to be renamed - * if the port is dead. (This would foil ipc_port_destroy.) - * Instead we should fail because oentry shouldn't exist. - * Note IE_BITS_COMPAT implies ie_request != 0. - */ - - if (request != IE_REQ_NONE) { - ipc_port_t port; - - assert(bits & MACH_PORT_TYPE_PORT_RIGHTS); - port = (ipc_port_t) object; - assert(port != IP_NULL); - - if (ipc_right_check(space, port, oname, oentry, IPC_RIGHT_COPYIN_FLAGS_NONE)) { - request = IE_REQ_NONE; - object = IO_NULL; - bits = oentry->ie_bits; - release_port = port; - assert(IE_BITS_TYPE(bits) == MACH_PORT_TYPE_DEAD_NAME); - assert(oentry->ie_request == IE_REQ_NONE); - } else { - /* port is locked and active */ - - ipc_port_request_rename(port, request, oname, nname); - ip_unlock(port); - oentry->ie_request = IE_REQ_NONE; - } - } - - /* initialize nentry before letting ipc_hash_insert see it */ - - assert((nentry->ie_bits & IE_BITS_RIGHT_MASK) == 0); - nentry->ie_bits |= bits & IE_BITS_RIGHT_MASK; - nentry->ie_request = request; - nentry->ie_object = object; - - switch (IE_BITS_TYPE(bits)) { - case MACH_PORT_TYPE_SEND: { - ipc_port_t port; - - port = (ipc_port_t) object; - assert(port != IP_NULL); - - /* remember, there are no other share entries possible */ - /* or we can't do the rename. Therefore we do not need */ - /* to check the other subspaces */ - ipc_hash_delete(space, (ipc_object_t) port, oname, oentry); - ipc_hash_insert(space, (ipc_object_t) port, nname, nentry); - break; - } - - case MACH_PORT_TYPE_RECEIVE: - case MACH_PORT_TYPE_SEND_RECEIVE: { - ipc_port_t port; - - port = (ipc_port_t) object; - assert(port != IP_NULL); - - ip_lock(port); - imq_lock(&port->ip_messages); - assert(ip_active(port)); - assert(port->ip_receiver_name == oname); - assert(port->ip_receiver == space); - - port->ip_receiver_name = nname; - imq_unlock(&port->ip_messages); - ip_unlock(port); - break; - } - - case MACH_PORT_TYPE_PORT_SET: { - ipc_pset_t pset; - - pset = (ipc_pset_t) object; - assert(pset != IPS_NULL); - - ips_lock(pset); - assert(ips_active(pset)); - - ips_unlock(pset); - break; - } - - case MACH_PORT_TYPE_SEND_ONCE: - case MACH_PORT_TYPE_DEAD_NAME: - break; - - default: - panic("ipc_right_rename: strange rights"); - } - - assert(oentry->ie_request == IE_REQ_NONE); - oentry->ie_object = IO_NULL; - ipc_entry_dealloc(space, oname, oentry); - ipc_entry_modified(space, nname, nentry); - is_write_unlock(space); - - if (release_port != IP_NULL) { - ip_release(release_port); - } - - return KERN_SUCCESS; -} diff --git a/osfmk/ipc/ipc_right.h b/osfmk/ipc/ipc_right.h index d995aef3a..a3049efc7 100644 --- a/osfmk/ipc/ipc_right.h +++ b/osfmk/ipc/ipc_right.h @@ -78,7 +78,7 @@ typedef uint32_t ipc_right_copyin_flags_t; #define IPC_RIGHT_COPYIN_FLAGS_NONE 0x0 #define IPC_RIGHT_COPYIN_FLAGS_DEADOK 0x1 -#define IPC_RIGHT_COPYIN_FLAGS_RESERVED 0x2 +#define IPC_RIGHT_COPYIN_FLAGS_ALLOW_IMMOVABLE_SEND 0x2 #define IPC_RIGHT_COPYIN_FLAGS_ALLOW_DEAD_SEND_ONCE 0x4 /* allow copyin of a send once right to a dead port with no dead name requests */ /* Find an entry in a space, given the name */ @@ -180,44 +180,37 @@ extern kern_return_t ipc_right_info( mach_port_type_t *typep, mach_port_urefs_t *urefsp); -/* Check if a subsequent ipc_right_copyin would succeed */ -extern boolean_t ipc_right_copyin_check( - ipc_space_t space, - mach_port_name_t name, - ipc_entry_t entry, - mach_msg_type_name_t msgt_name); +/* Check if a subsequent ipc_right_copyin of the reply port will succeed */ +extern boolean_t ipc_right_copyin_check_reply( + ipc_space_t space, + mach_port_name_t reply_name, + ipc_entry_t reply_entry, + mach_msg_type_name_t reply_type); /* Copyin a capability from a space */ extern kern_return_t ipc_right_copyin( - ipc_space_t space, - mach_port_name_t name, - ipc_entry_t entry, - mach_msg_type_name_t msgt_name, - ipc_right_copyin_flags_t flags, - ipc_object_t *objectp, - ipc_port_t *sorightp, - ipc_port_t *releasep, - int *assertcntp); - -/* Undo the effects of an ipc_right_copyin */ -extern void ipc_right_copyin_undo( - ipc_space_t space, - mach_port_name_t name, - ipc_entry_t entry, - mach_msg_type_name_t msgt_name, - ipc_object_t object, - ipc_port_t soright); + ipc_space_t space, + mach_port_name_t name, + ipc_entry_t entry, + mach_msg_type_name_t msgt_name, + ipc_right_copyin_flags_t flags, + ipc_object_t *objectp, + ipc_port_t *sorightp, + ipc_port_t *releasep, + int *assertcntp, + mach_port_context_t context, + mach_msg_guard_flags_t *guard_flags); /* Copyin a pair of dispositions from a space */ extern kern_return_t ipc_right_copyin_two( - ipc_space_t space, - mach_port_name_t name, - ipc_entry_t entry, - mach_msg_type_name_t msgt_one, - mach_msg_type_name_t msgt_two, - ipc_object_t *objectp, - ipc_port_t *sorightp, - ipc_port_t *releasep); + ipc_space_t space, + mach_port_name_t name, + ipc_entry_t entry, + mach_msg_type_name_t msgt_one, + mach_msg_type_name_t msgt_two, + ipc_object_t *objectp, + ipc_port_t *sorightp, + ipc_port_t *releasep); /* Copyout a capability to a space */ extern kern_return_t ipc_right_copyout( @@ -225,15 +218,8 @@ extern kern_return_t ipc_right_copyout( mach_port_name_t name, ipc_entry_t entry, mach_msg_type_name_t msgt_name, - boolean_t overflow, + mach_port_context_t *context, + mach_msg_guard_flags_t *guard_flags, ipc_object_t object); -/* Reanme a capability */ -extern kern_return_t ipc_right_rename( - ipc_space_t space, - mach_port_name_t oname, - ipc_entry_t oentry, - mach_port_name_t nname, - ipc_entry_t nentry); - #endif /* _IPC_IPC_RIGHT_H_ */ diff --git a/osfmk/ipc/ipc_space.h b/osfmk/ipc/ipc_space.h index c0a2d1d15..161c55403 100644 --- a/osfmk/ipc/ipc_space.h +++ b/osfmk/ipc/ipc_space.h @@ -185,7 +185,6 @@ extern lck_attr_t ipc_lck_attr; &ipc_lck_grp) #define is_write_lock(is) lck_spin_lock_grp(&(is)->is_lock_data, &ipc_lck_grp) -#define is_write_lock_try(is) lck_spin_try_lock_grp(&(is)->is_lock_data, &ipc_lck_grp) #define is_write_unlock(is) lck_spin_unlock(&(is)->is_lock_data) #define is_write_sleep(is) lck_spin_sleep_grp(&(is)->is_lock_data, \ LCK_SLEEP_DEFAULT, \ @@ -245,6 +244,14 @@ extern void ipc_space_rand_freelist( /* Generate a new gencount rollover point from a space's entropy pool */ extern ipc_entry_bits_t ipc_space_get_rollpoint(ipc_space_t space); + +/* Allocate a new port/set/dead-name in a space */ +extern kern_return_t mach_port_allocate_internal( + ipc_space_t space, + mach_port_right_t right, + mach_port_qos_t *qosp, + mach_port_name_t *namep); + #endif /* MACH_KERNEL_PRIVATE */ #endif /* __APPLE_API_PRIVATE */ diff --git a/osfmk/ipc/ipc_voucher.c b/osfmk/ipc/ipc_voucher.c index ff6da5605..eeb226a87 100644 --- a/osfmk/ipc/ipc_voucher.c +++ b/osfmk/ipc/ipc_voucher.c @@ -208,12 +208,14 @@ ipc_voucher_init(void) sizeof(struct ipc_voucher), "ipc vouchers"); zone_change(ipc_voucher_zone, Z_NOENCRYPT, TRUE); + zone_change(ipc_voucher_zone, Z_CLEARMEMORY, TRUE); ipc_voucher_attr_control_zone = zinit(sizeof(struct ipc_voucher_attr_control), attr_manager_max * sizeof(struct ipc_voucher_attr_control), sizeof(struct ipc_voucher_attr_control), "ipc voucher attr controls"); zone_change(ipc_voucher_attr_control_zone, Z_NOENCRYPT, TRUE); + zone_change(ipc_voucher_attr_control_zone, Z_CLEARMEMORY, TRUE); /* initialize voucher hash */ ivht_lock_init(); @@ -318,7 +320,7 @@ iv_dealloc(ipc_voucher_t iv, boolean_t unhash) * is gone. We can just discard it now. */ if (IP_VALID(port)) { - assert(ip_active(port)); + require_ip_active(port); assert(port->ip_srights == 0); ipc_port_dealloc_kernel(port); @@ -404,6 +406,7 @@ convert_port_to_voucher( ipc_port_t port) { if (IP_VALID(port)) { + zone_require(port, ipc_object_zones[IOT_PORT]); ipc_voucher_t voucher = (ipc_voucher_t) port->ip_kobject; /* @@ -415,8 +418,9 @@ convert_port_to_voucher( return IV_NULL; } - assert(ip_active(port)); + require_ip_active(port); + zone_require(voucher, ipc_voucher_zone); ipc_voucher_reference(voucher); return voucher; } @@ -477,26 +481,19 @@ ipc_voucher_release(ipc_voucher_t voucher) * Purpose: * Called whenever the Mach port system detects no-senders * on the voucher port. - * - * Each time the send-right count goes positive, a no-senders - * notification is armed (and a voucher reference is donated). - * So, each notification that comes in must release a voucher - * reference. If more send rights have been added since it - * fired (asynchronously), they will be protected by a different - * reference hold. */ void ipc_voucher_notify(mach_msg_header_t *msg) { mach_no_senders_notification_t *notification = (void *)msg; ipc_port_t port = notification->not_header.msgh_remote_port; - ipc_voucher_t iv; - assert(ip_active(port)); + require_ip_active(port); assert(IKOT_VOUCHER == ip_kotype(port)); - iv = (ipc_voucher_t)port->ip_kobject; - ipc_voucher_release(iv); + /* consume the reference donated by convert_voucher_to_port */ + zone_require((ipc_voucher_t)port->ip_kobject, ipc_voucher_zone); + ipc_voucher_release((ipc_voucher_t)port->ip_kobject); } /* @@ -505,48 +502,22 @@ ipc_voucher_notify(mach_msg_header_t *msg) ipc_port_t convert_voucher_to_port(ipc_voucher_t voucher) { - ipc_port_t port, send; - if (IV_NULL == voucher) { return IP_NULL; } + zone_require(voucher, ipc_voucher_zone); assert(os_ref_get_count(&voucher->iv_refs) > 0); - /* create a port if needed */ - port = voucher->iv_port; - if (!IP_VALID(port)) { - port = ipc_port_alloc_kernel(); - assert(IP_VALID(port)); - ipc_kobject_set_atomically(port, (ipc_kobject_t) voucher, IKOT_VOUCHER); - - /* If we lose the race, deallocate and pick up the other guy's port */ - if (!OSCompareAndSwapPtr(IP_NULL, port, &voucher->iv_port)) { - ipc_port_dealloc_kernel(port); - port = voucher->iv_port; - assert(ip_kotype(port) == IKOT_VOUCHER); - assert(port->ip_kobject == (ipc_kobject_t)voucher); - } - } - - ip_lock(port); - assert(ip_active(port)); - send = ipc_port_make_send_locked(port); - - if (1 == port->ip_srights) { - ipc_port_t old_notify; - - /* transfer our ref to the port, and arm the no-senders notification */ - assert(IP_NULL == port->ip_nsrequest); - ipc_port_nsrequest(port, port->ip_mscount, ipc_port_make_sonce_locked(port), &old_notify); - /* port unlocked */ - assert(IP_NULL == old_notify); - } else { - /* piggyback on the existing port reference, so consume ours */ - ip_unlock(port); + /* + * make a send right and donate our reference for ipc_voucher_notify + * if this is the first send right + */ + if (!ipc_kobject_make_send_lazy_alloc_port(&voucher->iv_port, + (ipc_kobject_t)voucher, IKOT_VOUCHER)) { ipc_voucher_release(voucher); } - return send; + return voucher->iv_port; } #define ivace_reset_data(ivace_elem, next_index) { \ @@ -650,7 +621,7 @@ ivac_dealloc(ipc_voucher_attr_control_t ivac) * is gone. We can just discard it now. */ if (IP_VALID(port)) { - assert(ip_active(port)); + require_ip_active(port); assert(port->ip_srights == 0); ipc_port_dealloc_kernel(port); @@ -699,6 +670,7 @@ convert_port_to_voucher_attr_control( ipc_port_t port) { if (IP_VALID(port)) { + zone_require(port, ipc_object_zones[IOT_PORT]); ipc_voucher_attr_control_t ivac = (ipc_voucher_attr_control_t) port->ip_kobject; /* @@ -710,35 +682,32 @@ convert_port_to_voucher_attr_control( if (ip_kotype(port) != IKOT_VOUCHER_ATTR_CONTROL) { return IVAC_NULL; } + require_ip_active(port); - assert(ip_active(port)); - + zone_require(ivac, ipc_voucher_attr_control_zone); ivac_reference(ivac); return ivac; } return IVAC_NULL; } +/* + * Routine: ipc_voucher_notify + * Purpose: + * Called whenever the Mach port system detects no-senders + * on the voucher attr control port. + */ void ipc_voucher_attr_control_notify(mach_msg_header_t *msg) { mach_no_senders_notification_t *notification = (void *)msg; ipc_port_t port = notification->not_header.msgh_remote_port; - ipc_voucher_attr_control_t ivac; + require_ip_active(port); assert(IKOT_VOUCHER_ATTR_CONTROL == ip_kotype(port)); - ip_lock(port); - assert(ip_active(port)); - /* if no new send rights, drop a control reference */ - if (port->ip_mscount == notification->not_count) { - ivac = (ipc_voucher_attr_control_t)port->ip_kobject; - ip_unlock(port); - - ivac_release(ivac); - } else { - ip_unlock(port); - } + /* release the reference donated by convert_voucher_attr_control_to_port */ + ivac_release((ipc_voucher_attr_control_t)port->ip_kobject); } /* @@ -747,48 +716,21 @@ ipc_voucher_attr_control_notify(mach_msg_header_t *msg) ipc_port_t convert_voucher_attr_control_to_port(ipc_voucher_attr_control_t control) { - ipc_port_t port, send; - if (IVAC_NULL == control) { return IP_NULL; } - /* create a port if needed */ - port = control->ivac_port; - if (!IP_VALID(port)) { - port = ipc_port_alloc_kernel(); - assert(IP_VALID(port)); - if (OSCompareAndSwapPtr(IP_NULL, port, &control->ivac_port)) { - ip_lock(port); - ipc_kobject_set_atomically(port, (ipc_kobject_t) control, IKOT_VOUCHER_ATTR_CONTROL); - } else { - ipc_port_dealloc_kernel(port); - port = control->ivac_port; - ip_lock(port); - assert(ip_kotype(port) == IKOT_VOUCHER_ATTR_CONTROL); - assert(port->ip_kobject == (ipc_kobject_t)control); - } - } else { - ip_lock(port); - } + zone_require(control, ipc_voucher_attr_control_zone); - assert(ip_active(port)); - send = ipc_port_make_send_locked(port); - - if (1 == port->ip_srights) { - ipc_port_t old_notify; - - /* transfer our ref to the port, and arm the no-senders notification */ - assert(IP_NULL == port->ip_nsrequest); - ipc_port_nsrequest(port, port->ip_mscount, ipc_port_make_sonce_locked(port), &old_notify); - assert(IP_NULL == old_notify); - /* ipc_port_nsrequest unlocks the port */ - } else { - /* piggyback on the existing port reference, so consume ours */ - ip_unlock(port); + /* + * make a send right and donate our reference for + * ipc_voucher_attr_control_notify if this is the first send right + */ + if (!ipc_kobject_make_send_lazy_alloc_port(&control->ivac_port, + (ipc_kobject_t)control, IKOT_VOUCHER_ATTR_CONTROL)) { ivac_release(control); } - return send; + return control->ivac_port; } /* @@ -1213,7 +1155,7 @@ ivgt_lookup(iv_index_t key_index, } /* - * Routine: ipc_replace_voucher_value + * Routine: ipc_replace_voucher_value * Purpose: * Replace the value with the results of * running the supplied command through the resource @@ -1307,7 +1249,7 @@ ipc_replace_voucher_value( } /* - * Routine: ipc_directly_replace_voucher_value + * Routine: ipc_directly_replace_voucher_value * Purpose: * Replace the value with the value-handle * supplied directly by the attribute manager. @@ -1513,8 +1455,7 @@ ipc_execute_voucher_recipe_command( new_value = *(mach_voucher_attr_value_handle_t *)(void *)content; kr = ipc_directly_replace_voucher_value(voucher, - key, - new_value); + key, new_value); if (KERN_SUCCESS != kr) { return kr; } @@ -1592,7 +1533,7 @@ ipc_execute_voucher_recipe_command( } /* - * Routine: iv_checksum + * Routine: iv_checksum * Purpose: * Compute the voucher sum. This is more position- * relevant than many other checksums - important for @@ -1622,7 +1563,7 @@ iv_checksum(ipc_voucher_t voucher, boolean_t *emptyp) } /* - * Routine: iv_dedup + * Routine: iv_dedup * Purpose: * See if the set of values represented by this new voucher * already exist in another voucher. If so return a reference @@ -1787,7 +1728,7 @@ iv_dedup(ipc_voucher_t new_iv) } /* - * Routine: ipc_create_mach_voucher + * Routine: ipc_create_mach_voucher * Purpose: * Create a new mach voucher and initialize it with the * value(s) created by having the appropriate resource @@ -1858,7 +1799,7 @@ ipc_create_mach_voucher( } /* - * Routine: ipc_voucher_attr_control_create_mach_voucher + * Routine: ipc_voucher_attr_control_create_mach_voucher * Purpose: * Create a new mach voucher and initialize it with the * value(s) created by having the appropriate resource @@ -1945,7 +1886,7 @@ ipc_voucher_attr_control_create_mach_voucher( } /* - * ipc_register_well_known_mach_voucher_attr_manager + * ipc_register_well_known_mach_voucher_attr_manager * * Register the resource manager responsible for a given key value. */ @@ -2007,7 +1948,7 @@ ipc_register_well_known_mach_voucher_attr_manager( } /* - * Routine: mach_voucher_extract_attr_content + * Routine: mach_voucher_extract_attr_content * Purpose: * Extract the content for a given pair. * @@ -2070,14 +2011,12 @@ mach_voucher_extract_attr_content( /* callout to manager */ kr = (manager->ivam_extract_content)(manager, key, - vals, vals_count, - &command, - content, in_out_size); + vals, vals_count, &command, content, in_out_size); return kr; } /* - * Routine: mach_voucher_extract_attr_recipe + * Routine: mach_voucher_extract_attr_recipe * Purpose: * Extract a recipe for a given pair. * @@ -2163,7 +2102,7 @@ mach_voucher_extract_attr_recipe( /* - * Routine: mach_voucher_extract_all_attr_recipes + * Routine: mach_voucher_extract_all_attr_recipes * Purpose: * Extract all the (non-default) contents for a given voucher, * building up a recipe that could be provided to a future @@ -2253,7 +2192,7 @@ mach_voucher_extract_all_attr_recipes( } /* - * Routine: mach_voucher_debug_info + * Routine: mach_voucher_debug_info * Purpose: * Extract all the (non-default) contents for a given mach port name, * building up a recipe that could be provided to a future @@ -2284,6 +2223,10 @@ mach_voucher_debug_info( kern_return_t kr; ipc_port_t port = MACH_PORT_NULL; + if (space == IS_NULL) { + return KERN_INVALID_TASK; + } + if (!MACH_PORT_VALID(voucher_name)) { return KERN_INVALID_ARGUMENT; } @@ -2307,7 +2250,7 @@ mach_voucher_debug_info( #endif /* - * Routine: mach_voucher_attr_command + * Routine: mach_voucher_attr_command * Purpose: * Invoke an attribute-specific command through this voucher. * @@ -2380,7 +2323,7 @@ mach_voucher_attr_command( } /* - * Routine: mach_voucher_attr_control_get_values + * Routine: mach_voucher_attr_control_get_values * Purpose: * For a given voucher, get the value handle associated with the * specified attribute manager. @@ -2416,7 +2359,7 @@ mach_voucher_attr_control_get_values( } /* - * Routine: mach_voucher_attr_control_create_mach_voucher + * Routine: mach_voucher_attr_control_create_mach_voucher * Purpose: * Create a new mach voucher and initialize it by processing the * supplied recipe(s). @@ -2510,7 +2453,7 @@ mach_voucher_attr_control_create_mach_voucher( } /* - * Routine: host_create_mach_voucher + * Routine: host_create_mach_voucher * Purpose: * Create a new mach voucher and initialize it by processing the * supplied recipe(s). @@ -2598,10 +2541,10 @@ host_create_mach_voucher( } /* - * Routine: host_register_well_known_mach_voucher_attr_manager + * Routine: host_register_well_known_mach_voucher_attr_manager * Purpose: * Register the user-level resource manager responsible for a given - * key value. + * key value. * Conditions: * The manager port passed in has to be converted/wrapped * in an ipc_voucher_attr_manager_t structure and then call the @@ -2650,7 +2593,7 @@ host_register_well_known_mach_voucher_attr_manager( } /* - * Routine: host_register_mach_voucher_attr_manager + * Routine: host_register_mach_voucher_attr_manager * Purpose: * Register the user-space resource manager and return a * dynamically allocated key. @@ -3025,7 +2968,7 @@ static void user_data_release( ipc_voucher_attr_manager_t manager); -struct ipc_voucher_attr_manager user_data_manager = { +const struct ipc_voucher_attr_manager user_data_manager = { .ivam_release_value = user_data_release_value, .ivam_get_value = user_data_get_value, .ivam_extract_content = user_data_extract_content, @@ -3048,7 +2991,7 @@ ipc_voucher_attr_control_t test_control; #endif /* - * Routine: user_data_release_value + * Routine: user_data_release_value * Purpose: * Release a made reference on a specific value managed by * this voucher attribute manager. @@ -3086,7 +3029,7 @@ user_data_release_value( } /* - * Routine: user_data_checksum + * Routine: user_data_checksum * Purpose: * Provide a rudimentary checksum for the data presented * to these voucher attribute managers. @@ -3107,7 +3050,7 @@ user_data_checksum( } /* - * Routine: user_data_dedup + * Routine: user_data_dedup * Purpose: * See if the content represented by this request already exists * in another user data element. If so return a made reference diff --git a/osfmk/ipc/mach_debug.c b/osfmk/ipc/mach_debug.c index 19df67bf7..cf1d90c0b 100644 --- a/osfmk/ipc/mach_debug.c +++ b/osfmk/ipc/mach_debug.c @@ -256,7 +256,7 @@ mach_port_space_info( iin->iin_type = IE_BITS_TYPE(bits); if ((entry->ie_bits & MACH_PORT_TYPE_PORT_RIGHTS) != MACH_PORT_TYPE_NONE && entry->ie_request != IE_REQ_NONE) { - __IGNORE_WCASTALIGN(ipc_port_t port = (ipc_port_t) entry->ie_object); + ipc_port_t port = ip_object_to_port(entry->ie_object); assert(IP_VALID(port)); ip_lock(port); @@ -488,7 +488,7 @@ mach_port_kobject( return KERN_INVALID_RIGHT; } - __IGNORE_WCASTALIGN(port = (ipc_port_t) entry->ie_object); + port = ip_object_to_port(entry->ie_object); assert(port != IP_NULL); ip_lock(port); @@ -501,18 +501,18 @@ mach_port_kobject( *typep = (unsigned int) ip_kotype(port); kaddr = (mach_vm_address_t)port->ip_kobject; - ip_unlock(port); - + *addrp = 0; #if (DEVELOPMENT || DEBUG) - if (0 != kaddr && is_ipc_kobject(*typep)) { + if (kaddr && ip_is_kobject(port)) { *addrp = VM_KERNEL_UNSLIDE_OR_PERM(kaddr); - } else + } #endif - *addrp = 0; + ip_unlock(port); return KERN_SUCCESS; } #endif /* MACH_IPC_DEBUG */ + /* * Routine: mach_port_kernel_object [Legacy kernel call] * Purpose: diff --git a/osfmk/ipc/mach_kernelrpc.c b/osfmk/ipc/mach_kernelrpc.c index 52bf7b11c..603f841d1 100644 --- a/osfmk/ipc/mach_kernelrpc.c +++ b/osfmk/ipc/mach_kernelrpc.c @@ -277,18 +277,30 @@ _kernelrpc_mach_port_insert_right_trap(struct _kernelrpc_mach_port_insert_right_ goto done; } + if (args->name == args->poly) { + switch (args->polyPoly) { + case MACH_MSG_TYPE_MAKE_SEND: + case MACH_MSG_TYPE_COPY_SEND: + /* fastpath MAKE_SEND / COPY_SEND which is the most common case */ + rv = ipc_object_insert_send_right(task->itk_space, args->poly, + args->polyPoly); + goto done; + + default: + break; + } + } + rv = ipc_object_copyin(task->itk_space, args->poly, args->polyPoly, - (ipc_object_t *)&port); + (ipc_object_t *)&port, 0, NULL, IPC_KMSG_FLAGS_ALLOW_IMMOVABLE_SEND); if (rv != KERN_SUCCESS) { goto done; } disp = ipc_object_copyin_type(args->polyPoly); rv = mach_port_insert_right(task->itk_space, args->name, port, disp); - if (rv != KERN_SUCCESS) { - if (IO_VALID((ipc_object_t)port)) { - ipc_object_destroy((ipc_object_t)port, disp); - } + if (rv != KERN_SUCCESS && IP_VALID(port)) { + ipc_object_destroy(ip_to_object(port), disp); } done: @@ -472,6 +484,88 @@ done: return rv; } +int +_kernelrpc_mach_port_type_trap(struct _kernelrpc_mach_port_type_args *args) +{ + task_t task = port_name_to_task(args->target); + int rv = MACH_SEND_INVALID_DEST; + mach_port_type_t type; + + if (task != current_task()) { + goto done; + } + + rv = mach_port_type(task->itk_space, args->name, &type); + if (rv == KERN_SUCCESS) { + rv = copyout(&type, args->ptype, sizeof(type)); + } + +done: + if (task) { + task_deallocate(task); + } + return rv; +} + +int +_kernelrpc_mach_port_request_notification_trap( + struct _kernelrpc_mach_port_request_notification_args *args) +{ + task_t task = port_name_to_task(args->target); + int rv = MACH_SEND_INVALID_DEST; + ipc_port_t notify, previous; + mach_msg_type_name_t disp; + mach_port_name_t previous_name = MACH_PORT_NULL; + + if (task != current_task()) { + goto done; + } + + disp = ipc_object_copyin_type(args->notifyPoly); + if (disp != MACH_MSG_TYPE_PORT_SEND_ONCE) { + goto done; + } + + if (MACH_PORT_VALID(args->notify)) { + rv = ipc_object_copyin(task->itk_space, args->notify, args->notifyPoly, + (ipc_object_t *)¬ify, 0, NULL, 0); + } else { + notify = CAST_MACH_NAME_TO_PORT(args->notify); + } + if (rv != KERN_SUCCESS) { + goto done; + } + + rv = mach_port_request_notification(task->itk_space, args->name, + args->msgid, args->sync, notify, &previous); + if (rv != KERN_SUCCESS) { + ipc_object_destroy(ip_to_object(notify), disp); + goto done; + } + + if (IP_VALID(previous)) { + // Remove once is fixed. + // We need to make ith_knote NULL as ipc_object_copyout() uses + // thread-argument-passing and its value should not be garbage + current_thread()->ith_knote = ITH_KNOTE_NULL; + rv = ipc_object_copyout(task->itk_space, ip_to_object(previous), + MACH_MSG_TYPE_PORT_SEND_ONCE, NULL, NULL, &previous_name); + if (rv != KERN_SUCCESS) { + ipc_object_destroy(ip_to_object(previous), + MACH_MSG_TYPE_PORT_SEND_ONCE); + goto done; + } + } + + rv = copyout(&previous_name, args->previous, sizeof(previous_name)); + +done: + if (task) { + task_deallocate(task); + } + return rv; +} + kern_return_t host_create_mach_voucher_trap(struct host_create_mach_voucher_args *args) { diff --git a/osfmk/ipc/mach_msg.c b/osfmk/ipc/mach_msg.c index c66b20303..e4a901230 100644 --- a/osfmk/ipc/mach_msg.c +++ b/osfmk/ipc/mach_msg.c @@ -155,8 +155,8 @@ mach_msg_rcv_link_special_reply_port( void mach_msg_receive_results_complete(ipc_object_t object); -security_token_t KERNEL_SECURITY_TOKEN = KERNEL_SECURITY_TOKEN_VALUE; -audit_token_t KERNEL_AUDIT_TOKEN = KERNEL_AUDIT_TOKEN_VALUE; +const security_token_t KERNEL_SECURITY_TOKEN = KERNEL_SECURITY_TOKEN_VALUE; +const audit_token_t KERNEL_AUDIT_TOKEN = KERNEL_AUDIT_TOKEN_VALUE; mach_msg_format_0_trailer_t trailer_template = { /* mach_msg_trailer_type_t */ MACH_MSG_TRAILER_FORMAT_0, @@ -568,14 +568,13 @@ mach_msg_overwrite_trap( mr = ipc_mqueue_copyin(space, rcv_name, &mqueue, &object); if (mr != MACH_MSG_SUCCESS) { - mach_port_guard_exception(rcv_name, 0, 0, kGUARD_EXC_RCV_INVALID_NAME); return mr; } /* hold ref for object */ if ((option & MACH_RCV_SYNC_WAIT) && !(option & MACH_SEND_SYNC_OVERRIDE)) { ipc_port_t special_reply_port; - __IGNORE_WCASTALIGN(special_reply_port = (ipc_port_t) object); + special_reply_port = ip_object_to_port(object); /* link the special reply port to the destination */ mr = mach_msg_rcv_link_special_reply_port(special_reply_port, (mach_port_name_t)override); @@ -635,20 +634,19 @@ mach_msg_rcv_link_special_reply_port( return MACH_RCV_INVALID_NOTIFY; } - kr = ipc_object_copyin(current_space(), - dest_name_port, MACH_MSG_TYPE_COPY_SEND, - (ipc_object_t *) &dest_port); + kr = ipc_port_translate_send(current_space(), dest_name_port, &dest_port); + if (kr == KERN_SUCCESS) { + ip_reference(dest_port); + ip_unlock(dest_port); - /* - * The receive right of dest port might have gone away, - * do not fail the receive in that case. - */ - if (kr == KERN_SUCCESS && IP_VALID(dest_port)) { + /* + * The receive right of dest port might have gone away, + * do not fail the receive in that case. + */ ipc_port_link_special_reply_port(special_reply_port, - dest_port); + dest_port, FALSE); - /* release the send right */ - ipc_port_release_send(dest_port); + ip_release(dest_port); } return MACH_MSG_SUCCESS; } @@ -672,7 +670,7 @@ mach_msg_receive_results_complete(ipc_object_t object) boolean_t get_turnstile = self->turnstile ? FALSE : TRUE; if (io_otype(object) == IOT_PORT) { - __IGNORE_WCASTALIGN(port = (ipc_port_t) object); + port = ip_object_to_port(object); } else { assert(self->turnstile != TURNSTILE_NULL); return; diff --git a/osfmk/ipc/mach_port.c b/osfmk/ipc/mach_port.c index 41089c941..9f4d8b677 100644 --- a/osfmk/ipc/mach_port.c +++ b/osfmk/ipc/mach_port.c @@ -104,28 +104,6 @@ #endif -/* - * Forward declarations - */ -void mach_port_names_helper( - ipc_port_timestamp_t timestamp, - ipc_entry_t entry, - mach_port_name_t name, - mach_port_name_t *names, - mach_port_type_t *types, - ipc_entry_num_t *actualp); - -void mach_port_gst_helper( - ipc_pset_t pset, - ipc_entry_num_t maxnames, - mach_port_name_t *names, - ipc_entry_num_t *actualp); - -/* Needs port locked */ -void mach_port_get_status_helper( - ipc_port_t port, - mach_port_status_t *status); - /* Zeroed template of qos flags */ static mach_port_qos_t qos_template; @@ -138,8 +116,7 @@ static mach_port_qos_t qos_template; * Conditions: * Space containing entry is [at least] read-locked. */ - -void +static void mach_port_names_helper( ipc_port_timestamp_t timestamp, ipc_entry_t entry, @@ -156,14 +133,14 @@ mach_port_names_helper( bits = entry->ie_bits; request = entry->ie_request; - __IGNORE_WCASTALIGN(port = (ipc_port_t) entry->ie_object); + port = ip_object_to_port(entry->ie_object); if (bits & MACH_PORT_TYPE_RECEIVE) { assert(IP_VALID(port)); if (request != IE_REQ_NONE) { ip_lock(port); - assert(ip_active(port)); + require_ip_active(port); type |= ipc_port_request_type(port, name, request); ip_unlock(port); } @@ -462,7 +439,6 @@ mach_port_type( kr = ipc_right_lookup_write(space, name, &entry); if (kr != KERN_SUCCESS) { - mach_port_guard_exception(name, 0, 0, kGUARD_EXC_INVALID_NAME); return kr; } @@ -634,11 +610,15 @@ mach_port_allocate_qos( /* * Routine: mach_port_allocate_full [kernel call] * Purpose: - * Allocates a right in a space. Supports all of the - * special cases, such as specifying a subsystem, - * a specific name, a real-time port, etc. - * The name may be any legal name in the space that doesn't + * Allocates a right in a space. Supports the + * special case of specifying a name. The name may + * be any legal name in the space that doesn't * currently denote a right. + * + * While we no longer support users requesting + * preallocated message for the port, we still + * check for errors in such requests and then + * just clear the request. * Conditions: * Nothing locked. * Returns: @@ -658,7 +638,6 @@ mach_port_allocate_full( mach_port_qos_t *qosp, mach_port_name_t *namep) { - ipc_kmsg_t kmsg = IKM_NULL; kern_return_t kr; if (space == IS_NULL) { @@ -675,38 +654,85 @@ mach_port_allocate_full( } } + /* + * Don't actually honor prealloc requests from user-space + * (for security reasons, and because it isn't guaranteed anyway). + * Keep old errors for legacy reasons. + */ if (qosp->prealloc) { if (qosp->len > MACH_MSG_SIZE_MAX - MAX_TRAILER_SIZE) { return KERN_RESOURCE_SHORTAGE; - } else { - mach_msg_size_t size = qosp->len + MAX_TRAILER_SIZE; - - if (right != MACH_PORT_RIGHT_RECEIVE) { - return KERN_INVALID_VALUE; - } - - kmsg = (ipc_kmsg_t)ipc_kmsg_prealloc(size); - if (kmsg == IKM_NULL) { - return KERN_RESOURCE_SHORTAGE; - } } + if (right != MACH_PORT_RIGHT_RECEIVE) { + return KERN_INVALID_VALUE; + } + qosp->prealloc = 0; } + kr = mach_port_allocate_internal(space, right, qosp, namep); + return kr; +} + + +/* + * Routine: mach_port_allocate_internal [kernel private] + * Purpose: + * Allocates a right in a space. Supports all of the + * special cases, a specific name, a real-time port, etc. + * The name may be any legal name in the space that doesn't + * currently denote a right. + * Conditions: + * Nothing locked. + * Returns: + * KERN_SUCCESS The right is allocated. + * KERN_INVALID_TASK The space is null. + * KERN_INVALID_TASK The space is dead. + * KERN_INVALID_VALUE "right" isn't a legal kind of right. + * KERN_RESOURCE_SHORTAGE Couldn't allocate memory. + * KERN_NO_SPACE No room in space for another right. + */ +kern_return_t +mach_port_allocate_internal( + ipc_space_t space, + mach_port_right_t right, + mach_port_qos_t *qosp, + mach_port_name_t *namep) +{ + kern_return_t kr; + + assert(space != IS_NULL); + switch (right) { case MACH_PORT_RIGHT_RECEIVE: { + ipc_kmsg_t kmsg = IKM_NULL; ipc_port_t port; + /* + * For in-kernel uses, only allow small (from the kmsg zone) + * preallocated messages for the port. + */ + if (qosp->prealloc) { + mach_msg_size_t size = qosp->len; + + if (size > IKM_SAVED_MSG_SIZE - MAX_TRAILER_SIZE) { + panic("mach_port_allocate_internal: too large a prealloc kmsg"); + } + kmsg = (ipc_kmsg_t)ipc_kmsg_prealloc(size + MAX_TRAILER_SIZE); + if (kmsg == IKM_NULL) { + return KERN_RESOURCE_SHORTAGE; + } + } + if (qosp->name) { kr = ipc_port_alloc_name(space, *namep, &port); } else { - kr = ipc_port_alloc(space, namep, &port); + kr = ipc_port_alloc(space, FALSE, namep, &port); } if (kr == KERN_SUCCESS) { if (kmsg != IKM_NULL) { ipc_kmsg_set_prealloc(kmsg, port); } - ip_unlock(port); } else if (kmsg != IKM_NULL) { ipc_kmsg_free(kmsg); @@ -878,7 +904,6 @@ mach_port_get_refs( kr = ipc_right_lookup_write(space, name, &entry); if (kr != KERN_SUCCESS) { - mach_port_guard_exception(name, 0, 0, kGUARD_EXC_INVALID_NAME); return kr; } @@ -1112,8 +1137,7 @@ mach_port_set_mscount( } /* port is locked and active */ - ipc_port_set_mscount(port, mscount); - + port->ip_mscount = mscount; ip_unlock(port); return KERN_SUCCESS; } @@ -1327,7 +1351,7 @@ mach_port_get_set_status( } /* just use a portset reference from here on out */ - __IGNORE_WCASTALIGN(pset = (ipc_pset_t) psobj); + pset = ips_object_to_pset(psobj); ips_reference(pset); ips_unlock(pset); @@ -1420,9 +1444,9 @@ mach_port_move_member( mach_port_name_t member, mach_port_name_t after) { - ipc_entry_t entry; + ipc_object_t port_obj, ps_obj; ipc_port_t port; - ipc_pset_t nset; + ipc_pset_t nset = IPS_NULL; kern_return_t kr; uint64_t wq_link_id = 0; uint64_t wq_reserved_prepost = 0; @@ -1458,54 +1482,34 @@ mach_port_move_member( } } - kr = ipc_right_lookup_read(space, member, &entry); + if (after != MACH_PORT_NULL) { + kr = ipc_object_translate_two(space, + member, MACH_PORT_RIGHT_RECEIVE, &port_obj, + after, MACH_PORT_RIGHT_PORT_SET, &ps_obj); + } else { + kr = ipc_object_translate(space, + member, MACH_PORT_RIGHT_RECEIVE, &port_obj); + } if (kr != KERN_SUCCESS) { goto done; } - /* space is read-locked and active */ - if ((entry->ie_bits & MACH_PORT_TYPE_RECEIVE) == 0) { - is_read_unlock(space); - kr = KERN_INVALID_RIGHT; - goto done; + port = ip_object_to_port(port_obj); + if (after != MACH_PORT_NULL) { + nset = ips_object_to_pset(ps_obj); } + /* port and nset are locked */ - __IGNORE_WCASTALIGN(port = (ipc_port_t) entry->ie_object); - assert(port != IP_NULL); - - if (after == MACH_PORT_NULL) { - nset = IPS_NULL; - } else { - entry = ipc_entry_lookup(space, after); - if (entry == IE_NULL) { - is_read_unlock(space); - kr = KERN_INVALID_NAME; - goto done; - } - - if ((entry->ie_bits & MACH_PORT_TYPE_PORT_SET) == 0) { - is_read_unlock(space); - kr = KERN_INVALID_RIGHT; - goto done; - } - - __IGNORE_WCASTALIGN(nset = (ipc_pset_t) entry->ie_object); - assert(nset != IPS_NULL); - } - ip_lock(port); - assert(ip_active(port)); ipc_pset_remove_from_all(port); - if (nset != IPS_NULL) { - ips_lock(nset); + if (after != MACH_PORT_NULL) { kr = ipc_pset_add(nset, port, &wq_link_id, &wq_reserved_prepost); ips_unlock(nset); } + ip_unlock(port); - is_read_unlock(space); done: - /* * on success the ipc_pset_add() will consume the wq_link_id * value (resetting it to 0), so this function is always safe to call. @@ -1552,6 +1556,7 @@ done: * KERN_INVALID_CAPABILITY The notify port is dead. * MACH_NOTIFY_PORT_DESTROYED: * KERN_INVALID_VALUE Sync isn't zero. + * KERN_FAILURE Re-registering for this notification * MACH_NOTIFY_DEAD_NAME: * KERN_RESOURCE_SHORTAGE Couldn't allocate memory. * KERN_INVALID_ARGUMENT Name denotes dead name, but @@ -1592,7 +1597,7 @@ mach_port_request_notification( return kr; } - port = (ipc_port_t) entry->ie_object; + port = ip_object_to_port(entry->ie_object); if (port->ip_subsystem != NULL) { is_write_unlock(space); @@ -1606,7 +1611,7 @@ mach_port_request_notification( switch (id) { case MACH_NOTIFY_PORT_DESTROYED: { - ipc_port_t port, previous; + ipc_port_t port; if (sync != 0) { return KERN_INVALID_VALUE; @@ -1628,10 +1633,16 @@ mach_port_request_notification( return KERN_INVALID_RIGHT; } - ipc_port_pdrequest(port, notify, &previous); - /* port is unlocked */ + /* Allow only one registeration of this notification */ + if (port->ip_pdrequest != IP_NULL) { + ip_unlock(port); + mach_port_guard_exception(name, 0, 0, kGUARD_EXC_KERN_FAILURE); + return KERN_FAILURE; + } - *previousp = previous; + ipc_port_pdrequest(port, notify, previousp); + /* port is unlocked */ + assert(*previousp == IP_NULL); break; } @@ -1728,12 +1739,12 @@ mach_port_insert_right( return KERN_INVALID_VALUE; } - if (!IO_VALID((ipc_object_t) poly)) { + if (!IP_VALID(poly)) { return KERN_INVALID_CAPABILITY; } - return ipc_object_copyout_name(space, (ipc_object_t) poly, - polyPoly, FALSE, name); + return ipc_object_copyout_name(space, ip_to_object(poly), + polyPoly, name); } /* @@ -1779,7 +1790,8 @@ mach_port_extract_right( return KERN_INVALID_RIGHT; } - kr = ipc_object_copyin(space, name, msgt_name, (ipc_object_t *) poly); + kr = ipc_object_copyin(space, name, msgt_name, (ipc_object_t *) poly, 0, NULL, + IPC_KMSG_FLAGS_ALLOW_IMMOVABLE_SEND); if (kr == KERN_SUCCESS) { *polyPoly = ipc_object_copyin_type(msgt_name); @@ -1797,7 +1809,7 @@ mach_port_extract_right( * Returns: * None. */ -void +static void mach_port_get_status_helper( ipc_port_t port, mach_port_status_t *statusp) @@ -1830,6 +1842,12 @@ mach_port_get_status_helper( if (port->ip_strict_guard) { statusp->mps_flags |= MACH_PORT_STATUS_FLAG_STRICT_GUARD; } + if (port->ip_immovable_receive) { + statusp->mps_flags |= MACH_PORT_STATUS_FLAG_GUARD_IMMOVABLE_RECEIVE; + } + } + if (port->ip_no_grant) { + statusp->mps_flags |= MACH_PORT_STATUS_FLAG_NO_GRANT; } return; } @@ -2038,7 +2056,7 @@ mach_port_set_attributes( * associated it with a kobject already (timer, host_notify target), * or is a special reply port. */ - if (is_ipc_kobject(ip_kotype(port)) || port->ip_specialreply) { + if (ip_is_kobject(port) || port->ip_specialreply) { ip_unlock(port); return KERN_INVALID_ARGUMENT; } @@ -2097,7 +2115,7 @@ mach_port_set_attributes( * it with a kobject already (timer, host_notify target), * or is a special reply port. */ - if (is_ipc_kobject(ip_kotype(port)) || port->ip_specialreply) { + if (ip_is_kobject(port) || port->ip_specialreply) { ip_unlock(port); return KERN_INVALID_ARGUMENT; } @@ -2175,8 +2193,8 @@ mach_port_insert_member( assert(psobj != IO_NULL); assert(obj != IO_NULL); - __IGNORE_WCASTALIGN(kr = ipc_pset_add((ipc_pset_t)psobj, (ipc_port_t)obj, - &wq_link_id, &wq_reserved_prepost)); + kr = ipc_pset_add(ips_object_to_pset(psobj), ip_object_to_port(obj), + &wq_link_id, &wq_reserved_prepost); io_unlock(psobj); io_unlock(obj); @@ -2236,7 +2254,7 @@ mach_port_extract_member( assert(psobj != IO_NULL); assert(obj != IO_NULL); - __IGNORE_WCASTALIGN(kr = ipc_pset_remove((ipc_pset_t)psobj, (ipc_port_t)obj)); + kr = ipc_pset_remove(ips_object_to_pset(psobj), ip_object_to_port(obj)); io_unlock(psobj); io_unlock(obj); @@ -2288,15 +2306,25 @@ static kern_return_t mach_port_guard_locked( ipc_port_t port, uint64_t guard, - boolean_t strict) + uint64_t flags) { if (port->ip_context) { return KERN_INVALID_ARGUMENT; } + int strict = (flags & MPG_STRICT)? 1 : 0; + int immovable_receive = (flags & MPG_IMMOVABLE_RECEIVE)? 1 : 0; + + imq_lock(&port->ip_messages); port->ip_context = guard; port->ip_guarded = 1; - port->ip_strict_guard = (strict)?1:0; + port->ip_strict_guard = strict; + /* ip_immovable_receive bit is sticky and can't be un-guarded */ + if (!port->ip_immovable_receive) { + port->ip_immovable_receive = immovable_receive; + } + imq_unlock(&port->ip_messages); + return KERN_SUCCESS; } @@ -2330,8 +2358,12 @@ mach_port_unguard_locked( return KERN_INVALID_ARGUMENT; } + imq_lock(&port->ip_messages); port->ip_context = 0; port->ip_guarded = port->ip_strict_guard = 0; + /* Don't clear the ip_immovable_receive bit */ + imq_unlock(&port->ip_messages); + return KERN_SUCCESS; } @@ -2359,7 +2391,13 @@ mach_port_guard_exception( EXC_GUARD_ENCODE_TARGET(code, name); mach_exception_subcode_t subcode = (uint64_t)portguard; thread_t t = current_thread(); - thread_guard_violation(t, code, subcode); + boolean_t fatal = FALSE; + if (t->task->task_exc_guard & TASK_EXC_GUARD_MP_FATAL) { + fatal = TRUE; + } else if (reason <= MAX_FATAL_kGUARD_EXC_CODE) { + fatal = TRUE; + } + thread_guard_violation(t, code, subcode, fatal); } @@ -2392,6 +2430,8 @@ mach_port_guard_ast(thread_t t, case kGUARD_EXC_SET_CONTEXT: case kGUARD_EXC_UNGUARDED: case kGUARD_EXC_INCORRECT_GUARD: + case kGUARD_EXC_IMMOVABLE: + case kGUARD_EXC_STRICT_REPLY: task_exception_notify(EXC_GUARD, code, subcode); task_bsdtask_kill(task); break; @@ -2465,14 +2505,22 @@ mach_port_construct( } /* Allocate a new port in the IPC space */ - kr = ipc_port_alloc(space, name, &port); + kr = ipc_port_alloc(space, (options->flags & MPO_INSERT_SEND_RIGHT), + name, &port); if (kr != KERN_SUCCESS) { return kr; } /* Port locked and active */ if (options->flags & MPO_CONTEXT_AS_GUARD) { - kr = mach_port_guard_locked(port, (uint64_t) context, (options->flags & MPO_STRICT)); + uint64_t flags = 0; + if (options->flags & MPO_STRICT) { + flags |= MPG_STRICT; + } + if (options->flags & MPO_IMMOVABLE_RECEIVE) { + flags |= MPG_IMMOVABLE_RECEIVE; + } + kr = mach_port_guard_locked(port, (uint64_t) context, flags); /* A newly allocated and locked port should always be guarded successfully */ assert(kr == KERN_SUCCESS); } else { @@ -2513,23 +2561,12 @@ mach_port_construct( } } - if (options->flags & MPO_INSERT_SEND_RIGHT) { - kr = ipc_object_copyin(space, *name, MACH_MSG_TYPE_MAKE_SEND, (ipc_object_t *)&port); - if (kr != KERN_SUCCESS) { - goto cleanup; - } - - kr = mach_port_insert_right(space, *name, port, MACH_MSG_TYPE_PORT_SEND); - if (kr != KERN_SUCCESS) { - goto cleanup; - } - } - return KERN_SUCCESS; cleanup: /* Attempt to destroy port. If its already destroyed by some other thread, we're done */ - (void) mach_port_destruct(space, *name, 0, context); + (void) mach_port_destruct(space, *name, + (options->flags & MPO_INSERT_SEND_RIGHT) ? -1 : 0, context); return kr; } @@ -2604,6 +2641,7 @@ mach_port_guard( { kern_return_t kr; ipc_port_t port; + uint64_t flags = 0; if (space == IS_NULL) { return KERN_INVALID_TASK; @@ -2624,7 +2662,11 @@ mach_port_guard( } /* Port locked and active */ - kr = mach_port_guard_locked(port, guard, strict); + if (strict) { + flags = MPG_STRICT; + } + + kr = mach_port_guard_locked(port, guard, flags); ip_unlock(port); if (KERN_INVALID_ARGUMENT == kr) { @@ -2681,3 +2723,131 @@ mach_port_unguard( return kr; } + +/* + * Routine: mach_port_guard_with_flags [kernel call] + * Purpose: + * Guard a mach port with specified guard value and guard flags. + * The context field of the port is used as the guard. + * Conditions: + * Should hold receive right for that port + * Returns: + * KERN_SUCCESS The name is destroyed. + * KERN_INVALID_TASK The space is null. + * KERN_INVALID_TASK The space is dead. + * KERN_INVALID_NAME The name doesn't denote a right. + * KERN_INVALID_RIGHT The right isn't correct. + * KERN_INVALID_ARGUMENT Port already contains a context/guard. + * KERN_INVALID_CAPABILITY Cannot set MPG_IMMOVABLE_RECEIVE flag for a port with + * a movable port-destroyed notification port + */ +kern_return_t +mach_port_guard_with_flags( + ipc_space_t space, + mach_port_name_t name, + uint64_t guard, + uint64_t flags) +{ + kern_return_t kr; + ipc_port_t port; + + if (space == IS_NULL) { + return KERN_INVALID_TASK; + } + + if (!MACH_PORT_VALID(name)) { + return KERN_INVALID_NAME; + } + + kr = ipc_port_translate_receive(space, name, &port); + if (kr != KERN_SUCCESS) { + mach_port_guard_exception(name, 0, 0, + ((KERN_INVALID_NAME == kr) ? + kGUARD_EXC_INVALID_NAME : + kGUARD_EXC_INVALID_RIGHT)); + return kr; + } + + /* Port locked and active */ + kr = mach_port_guard_locked(port, guard, flags); + ip_unlock(port); + + if (KERN_INVALID_ARGUMENT == kr) { + mach_port_guard_exception(name, 0, 0, kGUARD_EXC_INVALID_ARGUMENT); + } + + return kr; +} + +/* + * Routine: mach_port_swap_guard [kernel call] + * Purpose: + * Swap guard value. + * Conditions: + * Port should already be guarded. + * Returns: + * KERN_SUCCESS The name is destroyed. + * KERN_INVALID_TASK The space is null. + * KERN_INVALID_TASK The space is dead. + * KERN_INVALID_NAME The name doesn't denote a right. + * KERN_INVALID_RIGHT The right isn't correct. + * KERN_INVALID_ARGUMENT Port doesn't contain a guard; is strictly guarded + * or the old_guard doesnt match the context + */ +kern_return_t +mach_port_swap_guard( + ipc_space_t space, + mach_port_name_t name, + uint64_t old_guard, + uint64_t new_guard) +{ + kern_return_t kr; + ipc_port_t port; + + if (space == IS_NULL) { + return KERN_INVALID_TASK; + } + + if (!MACH_PORT_VALID(name)) { + return KERN_INVALID_NAME; + } + + kr = ipc_port_translate_receive(space, name, &port); + if (kr != KERN_SUCCESS) { + mach_port_guard_exception(name, 0, 0, + ((KERN_INVALID_NAME == kr) ? + kGUARD_EXC_INVALID_NAME : + kGUARD_EXC_INVALID_RIGHT)); + return kr; + } + + /* Port locked and active */ + if (!port->ip_guarded) { + ip_unlock(port); + mach_port_guard_exception(name, old_guard, 0, kGUARD_EXC_UNGUARDED); + return KERN_INVALID_ARGUMENT; + } + + if (port->ip_strict_guard) { + uint64_t portguard = port->ip_context; + ip_unlock(port); + /* For strictly guarded ports, disallow overwriting context; Raise Exception */ + mach_port_guard_exception(name, old_guard, portguard, kGUARD_EXC_SET_CONTEXT); + return KERN_INVALID_ARGUMENT; + } + + if (port->ip_context != old_guard) { + uint64_t portguard = port->ip_context; + ip_unlock(port); + mach_port_guard_exception(name, old_guard, portguard, kGUARD_EXC_INCORRECT_GUARD); + return KERN_INVALID_ARGUMENT; + } + + imq_lock(&port->ip_messages); + port->ip_context = new_guard; + imq_unlock(&port->ip_messages); + + ip_unlock(port); + + return KERN_SUCCESS; +} diff --git a/osfmk/kdp/kdp_callout.h b/osfmk/kdp/kdp_callout.h index 9646da05f..eb0431b95 100644 --- a/osfmk/kdp/kdp_callout.h +++ b/osfmk/kdp/kdp_callout.h @@ -26,12 +26,15 @@ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ +#include + typedef enum { KDP_EVENT_ENTER, KDP_EVENT_EXIT, KDP_EVENT_PANICLOG } kdp_event_t; +__BEGIN_DECLS typedef void (*kdp_callout_fn_t)(void *arg, kdp_event_t event); /* @@ -43,3 +46,5 @@ typedef void (*kdp_callout_fn_t)(void *arg, kdp_event_t event); * non-trivial service. */ extern void kdp_register_callout(kdp_callout_fn_t fn, void *arg); + +__END_DECLS diff --git a/osfmk/kdp/kdp_core.c b/osfmk/kdp/kdp_core.c index 08edfb7c6..5971b1cfe 100644 --- a/osfmk/kdp/kdp_core.c +++ b/osfmk/kdp/kdp_core.c @@ -737,12 +737,11 @@ kernel_pmap_present_mapping(uint64_t vaddr, uint64_t * pvincr, uintptr_t * pvphy if (ppn && pvphysaddr) { uint64_t phys = ptoa_64(ppn); -#if defined(__arm__) || defined(__arm64__) - if (isphysmem(phys)) *pvphysaddr = phystokv(phys); -#else - if (physmap_enclosed(phys)) *pvphysaddr = (uintptr_t)PHYSMAP_PTOV(phys); -#endif - else ppn = 0; + if (physmap_enclosed(phys)) { + *pvphysaddr = phystokv(phys); + } else { + ppn = 0; + } } return (ppn); @@ -758,16 +757,18 @@ pmap_traverse_present_mappings(pmap_t __unused pmap, IOReturn ret; vm_map_offset_t vcurstart, vcur; uint64_t vincr = 0; - vm_map_offset_t debug_start; - vm_map_offset_t debug_end; + vm_map_offset_t debug_start = trunc_page((vm_map_offset_t) debug_buf_base); + vm_map_offset_t debug_end = round_page((vm_map_offset_t) (debug_buf_base + debug_buf_size)); +#if defined(XNU_TARGET_OS_BRIDGE) + vm_map_offset_t macos_panic_start = trunc_page((vm_map_offset_t) macos_panic_base); + vm_map_offset_t macos_panic_end = round_page((vm_map_offset_t) (macos_panic_base + macos_panic_size)); +#endif + boolean_t lastvavalid; #if defined(__arm__) || defined(__arm64__) vm_page_t m = VM_PAGE_NULL; #endif - debug_start = trunc_page((vm_map_offset_t) debug_buf_base); - debug_end = round_page((vm_map_offset_t) (debug_buf_base + debug_buf_size)); - #if defined(__x86_64__) assert(!is_ept_pmap(pmap)); #endif @@ -827,8 +828,12 @@ pmap_traverse_present_mappings(pmap_t __unused pmap, if (ppn != 0) { if (((vcur < debug_start) || (vcur >= debug_end)) - && !(EFI_VALID_PAGE(ppn) || - pmap_valid_page(ppn))) + && !(EFI_VALID_PAGE(ppn) || pmap_valid_page(ppn)) +#if defined(XNU_TARGET_OS_BRIDGE) + // include the macOS panic region if it's mapped + && ((vcur < macos_panic_start) || (vcur >= macos_panic_end)) +#endif + ) { /* not something we want */ ppn = 0; @@ -1170,7 +1175,7 @@ do_kern_dump(kern_dump_output_proc outproc, enum kern_dump_type kd_variant) assert (existing_log_size <= debug_buf_size); - if (kd_variant == KERN_DUMP_DISK) { + if ((kd_variant == KERN_DUMP_DISK) || (kd_variant == KERN_DUMP_STACKSHOT_DISK)) { /* Open the file for output */ if ((ret = (*outproc)(KDP_WRQ, NULL, 0, NULL)) != kIOReturnSuccess) { kern_coredump_log(NULL, "outproc(KDP_WRQ, NULL, 0, NULL) returned 0x%x\n", ret); @@ -1184,7 +1189,7 @@ do_kern_dump(kern_dump_output_proc outproc, enum kern_dump_type kd_variant) bzero(&outvars, sizeof(outvars)); outvars.outproc = outproc; - if (kd_variant == KERN_DUMP_DISK) { + if ((kd_variant == KERN_DUMP_DISK) || (kd_variant == KERN_DUMP_STACKSHOT_DISK)) { outvars.zoutput = kdp_core_zoutput; /* Space for file header, panic log, core log */ foffset = (KERN_COREDUMP_HEADERSIZE + existing_log_size + KERN_COREDUMP_MAXDEBUGLOGSIZE + @@ -1215,6 +1220,35 @@ do_kern_dump(kern_dump_output_proc outproc, enum kern_dump_type kd_variant) kern_coredump_log(NULL, "%s", (kd_variant == KERN_DUMP_DISK) ? "Writing local cores..." : "Transmitting kernel state, please wait:\n"); + +#if defined(__x86_64__) + if (((kd_variant == KERN_DUMP_STACKSHOT_DISK) || (kd_variant == KERN_DUMP_DISK)) && ((panic_stackshot_buf != 0) && (panic_stackshot_len != 0))) { + uint64_t compressed_stackshot_len = 0; + + if ((ret = kdp_reset_output_vars(&outvars, panic_stackshot_len)) != KERN_SUCCESS) { + kern_coredump_log(NULL, "Failed to reset outvars for stackshot with len 0x%zx, returned 0x%x\n", panic_stackshot_len, ret); + dump_succeeded = FALSE; + } else if ((ret = kdp_core_output(&outvars, panic_stackshot_len, (void *)panic_stackshot_buf)) != KERN_SUCCESS) { + kern_coredump_log(NULL, "Failed to write panic stackshot to file, kdp_coreoutput(outvars, %lu, %p) returned 0x%x\n", + panic_stackshot_len, (void *) panic_stackshot_buf, ret); + dump_succeeded = FALSE; + } else if ((ret = kdp_core_output(&outvars, 0, NULL)) != KERN_SUCCESS) { + kern_coredump_log(NULL, "Failed to flush stackshot data : kdp_core_output(%p, 0, NULL) returned 0x%x\n", &outvars, ret); + dump_succeeded = FALSE; + } else if ((ret = kern_dump_record_file(&outvars, "panic_stackshot.kcdata", foffset, &compressed_stackshot_len)) != KERN_SUCCESS) { + kern_coredump_log(NULL, "Failed to record panic stackshot in corefile header, kern_dump_record_file returned 0x%x\n", ret); + dump_succeeded = FALSE; + } else { + kern_coredump_log(NULL, "Recorded panic stackshot in corefile at offset 0x%llx, compressed to %llu bytes\n", foffset, compressed_stackshot_len); + foffset = roundup((foffset + compressed_stackshot_len), KERN_COREDUMP_BEGIN_FILEBYTES_ALIGN); + if ((ret = kern_dump_seek_to_next_file(&outvars, foffset)) != kIOReturnSuccess) { + kern_coredump_log(NULL, "Failed to seek to stackshot file offset 0x%llx, kern_dump_seek_to_next_file returned 0x%x\n", foffset, ret); + dump_succeeded = FALSE; + } + } + } +#endif + if (kd_variant == KERN_DUMP_DISK) { /* * Dump co-processors as well, foffset will be overwritten with the @@ -1223,7 +1257,7 @@ do_kern_dump(kern_dump_output_proc outproc, enum kern_dump_type kd_variant) if (kern_do_coredump(&outvars, FALSE, foffset, &foffset) != 0) { dump_succeeded = FALSE; } - } else { + } else if (kd_variant != KERN_DUMP_STACKSHOT_DISK) { /* Only the kernel */ if (kern_do_coredump(&outvars, TRUE, foffset, &foffset) != 0) { dump_succeeded = FALSE; @@ -1231,34 +1265,6 @@ do_kern_dump(kern_dump_output_proc outproc, enum kern_dump_type kd_variant) } if (kd_variant == KERN_DUMP_DISK) { -#if defined(__x86_64__) && (DEVELOPMENT || DEBUG) - /* Write the macOS panic stackshot on its own to a separate 'corefile' */ - if (panic_stackshot_buf && panic_stackshot_len) { - uint64_t compressed_stackshot_len = 0; - - /* Seek to the offset of the next 'file' (foffset provided/updated from kern_do_coredump) */ - if ((ret = kern_dump_seek_to_next_file(&outvars, foffset)) != kIOReturnSuccess) { - kern_coredump_log(NULL, "Failed to seek to stackshot file offset 0x%llx, kern_dump_seek_to_next_file returned 0x%x\n", foffset, ret); - dump_succeeded = FALSE; - } else if ((ret = kdp_reset_output_vars(&outvars, panic_stackshot_len)) != KERN_SUCCESS) { - kern_coredump_log(NULL, "Failed to reset outvars for stackshot with len 0x%zx, returned 0x%x\n", panic_stackshot_len, ret); - dump_succeeded = FALSE; - } else if ((ret = kdp_core_output(&outvars, panic_stackshot_len, (void *)panic_stackshot_buf)) != KERN_SUCCESS) { - kern_coredump_log(NULL, "Failed to write panic stackshot to file, kdp_coreoutput(outvars, %lu, %p) returned 0x%x\n", - panic_stackshot_len, (void *) panic_stackshot_buf, ret); - dump_succeeded = FALSE; - } else if ((ret = kdp_core_output(&outvars, 0, NULL)) != KERN_SUCCESS) { - kern_coredump_log(NULL, "Failed to flush stackshot data : kdp_core_output(%p, 0, NULL) returned 0x%x\n", &outvars, ret); - dump_succeeded = FALSE; - } else if ((ret = kern_dump_record_file(&outvars, "panic_stackshot.kcdata", foffset, &compressed_stackshot_len)) != KERN_SUCCESS) { - kern_coredump_log(NULL, "Failed to record panic stackshot in corefile header, kern_dump_record_file returned 0x%x\n", ret); - dump_succeeded = FALSE; - } else { - kern_coredump_log(NULL, "Recorded panic stackshot in corefile at offset 0x%llx, compressed to %llu bytes\n", foffset, compressed_stackshot_len); - } - } -#endif /* defined(__x86_64__) && (DEVELOPMENT || DEBUG) */ - /* Write the debug log -- first seek to the end of the corefile header */ foffset = KERN_COREDUMP_HEADERSIZE; if ((ret = (*outproc)(KDP_SEEK, NULL, sizeof(foffset), &foffset)) != kIOReturnSuccess) { @@ -1356,14 +1362,14 @@ kern_dump(enum kern_dump_type kd_variant) #if KASAN kasan_disable(); #endif - if (kd_variant == KERN_DUMP_DISK) { + if ((kd_variant == KERN_DUMP_DISK) || (kd_variant == KERN_DUMP_STACKSHOT_DISK)) { if (dumped_local) return (0); if (local_dump_in_progress) return (-1); local_dump_in_progress = TRUE; #if CONFIG_EMBEDDED hwsd_info->xhsdci_status = XHSDCI_STATUS_KERNEL_BUSY; #endif - ret = do_kern_dump(&kern_dump_disk_proc, KERN_DUMP_DISK); + ret = do_kern_dump(&kern_dump_disk_proc, kd_variant); if (ret == 0) { dumped_local = TRUE; kern_dump_successful = TRUE; @@ -1548,12 +1554,6 @@ kdp_core_init(void) PE_consistent_debug_register(kDbgIdAstrisConnection, kvtophys((vm_offset_t) hwsd_info), sizeof(pmap_paddr_t)); PE_consistent_debug_register(kDbgIdAstrisConnectionVers, CUR_XNU_HWSDCI_STRUCT_VERS, sizeof(uint32_t)); #endif /* CONFIG_EMBEDDED */ - -#if defined(__x86_64__) && (DEVELOPMENT || DEBUG) - /* Allocate space in the kernel map for the panic stackshot */ - kr = kmem_alloc(kernel_map, &panic_stackshot_buf, PANIC_STACKSHOT_BUFSIZE, VM_KERN_MEMORY_DIAG); - assert (KERN_SUCCESS == kr); -#endif /* defined(__x86_64__) && (DEVELOPMENT || DEBUG) */ } #endif /* CONFIG_KDP_INTERACTIVE_DEBUGGING */ diff --git a/osfmk/kdp/kdp_core.h b/osfmk/kdp/kdp_core.h index d69d92b5f..a297107f3 100644 --- a/osfmk/kdp/kdp_core.h +++ b/osfmk/kdp/kdp_core.h @@ -137,6 +137,7 @@ enum kern_dump_type { #if CONFIG_EMBEDDED KERN_DUMP_HW_SHMEM_DBG, /* coordinated hardware shared memory debugger core dump */ #endif + KERN_DUMP_STACKSHOT_DISK, /* local, stackshot on device coredump */ }; int kern_dump(enum kern_dump_type kd_variant); diff --git a/osfmk/kdp/kdp_en_debugger.h b/osfmk/kdp/kdp_en_debugger.h index edb879981..ea5d3d500 100644 --- a/osfmk/kdp/kdp_en_debugger.h +++ b/osfmk/kdp/kdp_en_debugger.h @@ -26,10 +26,13 @@ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ +#include + /* * Ethernet debugger header file */ +__BEGIN_DECLS typedef void (*kdp_send_t)(void * pkt, unsigned int pkt_len); typedef void (*kdp_receive_t)(void * pkt, unsigned int * pkt_len, unsigned int timeout); @@ -39,3 +42,4 @@ kdp_register_send_receive(kdp_send_t send, kdp_receive_t receive); void kdp_unregister_send_receive(kdp_send_t send, kdp_receive_t receive); +__END_DECLS diff --git a/osfmk/kdp/kdp_udp.c b/osfmk/kdp/kdp_udp.c index c8636563a..260f10ddb 100644 --- a/osfmk/kdp/kdp_udp.c +++ b/osfmk/kdp/kdp_udp.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2016 Apple Inc. All rights reserved. + * Copyright (c) 2000-2019 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -265,7 +265,7 @@ static void kdp_serial_send(void *rpkt, unsigned int rpkt_len); #endif static uint32_t kdp_current_ip_address = 0; -static struct kdp_ether_addr kdp_current_mac_address = {{0, 0, 0, 0, 0, 0}}; +static struct kdp_ether_addr kdp_current_mac_address = {.ether_addr_octet = {0, 0, 0, 0, 0, 0}}; static void *kdp_current_ifp; static void kdp_handler( void *); @@ -282,12 +282,12 @@ static boolean_t router_specified = FALSE; static boolean_t corename_specified = FALSE; static unsigned int panicd_port = CORE_REMOTE_PORT; -static struct kdp_ether_addr etherbroadcastaddr = {{0xff, 0xff, 0xff, 0xff, 0xff, 0xff}}; +static struct kdp_ether_addr etherbroadcastaddr = {.ether_addr_octet = {0xff, 0xff, 0xff, 0xff, 0xff, 0xff}}; -static struct kdp_ether_addr router_mac = {{0, 0, 0, 0, 0, 0}}; -static struct kdp_ether_addr destination_mac = {{0, 0, 0, 0, 0, 0}}; -static struct kdp_ether_addr temp_mac = {{0, 0, 0, 0, 0, 0}}; -static struct kdp_ether_addr current_resolved_MAC = {{0, 0, 0, 0, 0, 0}}; +static struct kdp_ether_addr router_mac = {.ether_addr_octet = {0, 0, 0, 0, 0, 0}}; +static struct kdp_ether_addr destination_mac = {.ether_addr_octet = {0, 0, 0, 0, 0, 0}}; +static struct kdp_ether_addr temp_mac = {.ether_addr_octet = {0, 0, 0, 0, 0, 0}}; +static struct kdp_ether_addr current_resolved_MAC = {.ether_addr_octet = {0, 0, 0, 0, 0, 0}}; static boolean_t flag_panic_dump_in_progress = FALSE; static boolean_t flag_router_mac_initialized = FALSE; @@ -703,7 +703,7 @@ void kdp_set_interface(void *ifp, const struct kdp_ether_addr *macaddr) { char kdpstr[80]; - struct kdp_in_addr addr = { 0 }; + struct kdp_in_addr addr = { .s_addr = 0 }; unsigned int len; kdp_current_ifp = ifp; @@ -1556,18 +1556,33 @@ create_panic_header(unsigned int request, const char *corename, if (request == KDP_WRQ) { char *cp; + size_t length_remaining = (sizeof(pkt.data) - pkt.off), bytes_filled = 0; cp = coreh->th_u.tu_rpl; - cp += strlcpy(cp, corename, KDP_MAXPACKET); + bytes_filled = strlcpy(cp, corename, length_remaining); + cp += bytes_filled; *cp++ = '\0'; - cp += strlcpy(cp, mode, KDP_MAXPACKET - strlen(corename)); + /* account for the extra NULL character that has been added historically */ + length_remaining -= (bytes_filled + 1); + + bytes_filled = strlcpy(cp, mode, length_remaining); + cp += bytes_filled; + *cp++ = '\0'; + /* account for the extra NULL character that has been added historically */ + length_remaining -= (bytes_filled + 1); + + bytes_filled = strlcpy(cp, KDP_FEATURE_MASK_STRING, length_remaining); + cp += bytes_filled; *cp++ = '\0'; - cp += strlcpy(cp, KDP_FEATURE_MASK_STRING, sizeof(KDP_FEATURE_MASK_STRING)); - *cp++ = '\0'; /* Redundant */ + /* account for the extra NULL character that has been added historically */ + length_remaining -= (bytes_filled + 1); + bcopy(&kdp_crashdump_feature_mask, cp, sizeof(kdp_crashdump_feature_mask)); kdp_crashdump_pkt_size = KDP_LARGE_CRASHDUMP_PKT_SIZE; - PE_parse_boot_argn("kdp_crashdump_pkt_size", &kdp_crashdump_pkt_size, sizeof(kdp_crashdump_pkt_size)); cp += sizeof(kdp_crashdump_feature_mask); + length_remaining -= sizeof(kdp_crashdump_feature_mask); + + PE_parse_boot_argn("kdp_crashdump_pkt_size", &kdp_crashdump_pkt_size, sizeof(kdp_crashdump_pkt_size)); *(uint32_t *)cp = htonl(kdp_crashdump_pkt_size); } else { coreh->th_block = htonl((unsigned int) block); @@ -1803,6 +1818,7 @@ kdp_get_xnu_version(char *versionbuf) char vstr[20]; int retval = -1; char *vptr; + size_t length_remaining = (sizeof(pkt.data) - pkt.off); strlcpy(vstr, "custom", 10); if (kdp_machine_vm_read((mach_vm_address_t)(uintptr_t)version, versionbuf, 128)) { @@ -1823,7 +1839,7 @@ kdp_get_xnu_version(char *versionbuf) retval = 0; } } - strlcpy(versionbuf, vstr, KDP_MAXPACKET); + strlcpy(versionbuf, vstr, length_remaining); return retval; } @@ -2279,7 +2295,7 @@ kdp_init(void) #endif /* CONFIG_KDP_INTERACTIVE_DEBUGGING */ #if !(MACH_KDP && CONFIG_KDP_INTERACTIVE_DEBUGGING) -static struct kdp_ether_addr kdp_current_mac_address = {{0, 0, 0, 0, 0, 0}}; +static struct kdp_ether_addr kdp_current_mac_address = {.ether_addr_octet = {0, 0, 0, 0, 0, 0}}; /* XXX ugly forward declares to stop warnings */ void *kdp_get_interface(void); diff --git a/osfmk/kdp/ml/arm/kdp_machdep.c b/osfmk/kdp/ml/arm/kdp_machdep.c index 1e1cb028c..a324da172 100644 --- a/osfmk/kdp/ml/arm/kdp_machdep.c +++ b/osfmk/kdp/ml/arm/kdp_machdep.c @@ -41,6 +41,9 @@ #include #include +#if defined(HAS_APPLE_PAC) +#include +#endif #define KDP_TEST_HARNESS 0 #if KDP_TEST_HARNESS @@ -364,7 +367,7 @@ kdp_trap(unsigned int exception, struct arm_saved_state * saved_state) * increment for both of them. */ if ((instr == GDB_TRAP_INSTR1) || (instr == GDB_TRAP_INSTR2)) { - set_saved_state_pc(saved_state, get_saved_state_pc(saved_state) + 4); + add_saved_state_pc(saved_state, 4); } #else #error Unknown architecture. @@ -722,6 +725,10 @@ machine_trace_thread64(thread_t thread, } prevlr = *(uint64_t *)kern_virt_addr; +#if defined(HAS_APPLE_PAC) + /* return addresses on stack signed by arm64e ABI */ + prevlr = (uint64_t) ptrauth_strip((void *)prevlr, ptrauth_key_return_address); +#endif if (!user_p) { prevlr = VM_KERNEL_UNSLIDE(prevlr); } diff --git a/osfmk/kdp/processor_core.c b/osfmk/kdp/processor_core.c index 6050ad502..d0c41d90a 100644 --- a/osfmk/kdp/processor_core.c +++ b/osfmk/kdp/processor_core.c @@ -94,7 +94,7 @@ uint32_t coredump_registered_count = 0; struct kern_coredump_core *kernel_helper = NULL; static struct kern_coredump_core * -kern_register_coredump_helper_internal(int kern_coredump_config_vers, kern_coredump_callback_config *kc_callbacks, +kern_register_coredump_helper_internal(int kern_coredump_config_vers, const kern_coredump_callback_config *kc_callbacks, void *refcon, const char *core_description, boolean_t xnu_callback, boolean_t is64bit, uint32_t mh_magic, cpu_type_t cpu_type, cpu_subtype_t cpu_subtype) { @@ -166,7 +166,7 @@ kern_register_coredump_helper_internal(int kern_coredump_config_vers, kern_cored } kern_return_t -kern_register_coredump_helper(int kern_coredump_config_vers, kern_coredump_callback_config *kc_callbacks, +kern_register_coredump_helper(int kern_coredump_config_vers, const kern_coredump_callback_config *kc_callbacks, void *refcon, const char *core_description, boolean_t is64bit, uint32_t mh_magic, cpu_type_t cpu_type, cpu_subtype_t cpu_subtype) { @@ -720,7 +720,7 @@ kern_do_coredump(void *core_outvars, boolean_t kernel_only, uint64_t first_file_ #else /* CONFIG_KDP_INTERACTIVE_DEBUGGING */ kern_return_t -kern_register_coredump_helper(int kern_coredump_config_vers, kern_coredump_callback_config *kc_callbacks, void* refcon, +kern_register_coredump_helper(int kern_coredump_config_vers, const kern_coredump_callback_config *kc_callbacks, void* refcon, const char *core_description, boolean_t is64bit, uint32_t mh_magic, cpu_type_t cpu_type, cpu_subtype_t cpu_subtype) { diff --git a/osfmk/kdp/processor_core.h b/osfmk/kdp/processor_core.h index 46bf717d3..3d736c1f8 100644 --- a/osfmk/kdp/processor_core.h +++ b/osfmk/kdp/processor_core.h @@ -169,7 +169,7 @@ typedef struct { * coredump infrastructure. In addition to the callback config and version of the config * structure, a description of the core should be provided -- i.e.: AP */ -kern_return_t kern_register_coredump_helper(int kern_coredump_config_vers, kern_coredump_callback_config *kc_callbacks, void *refcon, +kern_return_t kern_register_coredump_helper(int kern_coredump_config_vers, const kern_coredump_callback_config *kc_callbacks, void *refcon, const char *core_description, boolean_t is64bit, uint32_t mh_magic, cpu_type_t cpu_type, cpu_subtype_t cpu_subtype); #if PRIVATE diff --git a/osfmk/kern/Makefile b/osfmk/kern/Makefile index c58d9a7f1..9f53a26ec 100644 --- a/osfmk/kern/Makefile +++ b/osfmk/kern/Makefile @@ -14,16 +14,19 @@ DATAFILES = \ kcdata.h PRIVATE_DATAFILES = \ + arithmetic_128.h \ + block_hint.h \ cs_blobs.h \ - trustcache.h \ debug.h \ ecc.h \ - block_hint.h \ lock_stat.h \ monotonic.h \ - arithmetic_128.h \ - turnstile.h \ - remote_time.h + remote_time.h \ + restartable.h \ + sched_clutch.h \ + trustcache.h \ + turnstile.h + EXPORT_FILES = \ affinity.h \ @@ -33,6 +36,7 @@ EXPORT_FILES = \ bits.h \ btlog.h \ call_entry.h \ + circle_queue.h \ clock.h \ coalition.h \ cpu_number.h \ @@ -59,6 +63,7 @@ EXPORT_FILES = \ policy_internal.h \ processor.h \ queue.h \ + mpsc_queue.h \ priority_queue.h \ sched_prim.h \ sfi.h \ @@ -80,6 +85,7 @@ PRIVATE_EXPORT_FILES = \ copyout_shim.h XNU_ONLY_EXPORTS = \ + arcade.h \ cpu_quiesce.h \ ipc_kobject.h \ ux_handler.h diff --git a/osfmk/kern/arcade.c b/osfmk/kern/arcade.c new file mode 100644 index 000000000..48c4b6014 --- /dev/null +++ b/osfmk/kern/arcade.c @@ -0,0 +1,369 @@ +/* + * Copyright (c) 2019 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include +#include + +#include + +#if !defined(MAXPATHLEN) +#define MAXPATHLEN 4096 +#endif + +extern struct proc *current_proc(void); +extern int proc_pidpathinfo_internal(struct proc *p, uint64_t arg, + char *buffer, uint32_t buffersize, + int32_t *retval); +extern off_t proc_getexecutableoffset(struct proc *p); + +/* + * Simple structure to represent a handle for the Arcade registration. + * + * This registration is done with an independent kobject callback, rather + * than a reply, so that we execute it in the context of the user-space + * server replying (in order to do an entitlement check on the reply). + * + * We cache the resulting upcall port until it fails, and then we go + * get another one. + */ +struct arcade_register { + ipc_port_t ar_port; +}; +typedef struct arcade_register *arcade_register_t; + +static struct arcade_register arcade_register_global; + +void +arcade_prepare(task_t task, thread_t thread) +{ + /* Platform binaries are exempt */ + if (task->t_flags & TF_PLATFORM) { + return; + } + + /* Check to see if the task has the arcade entitlement */ + if (!IOTaskHasEntitlement(task, "com.apple.developer.arcade-operations")) { + return; + } + + /* Others will stop in the AST to make an upcall */ + thread_ast_set(thread, AST_ARCADE); +} + +static lck_grp_attr_t *arcade_upcall_lck_grp_attr; +static lck_grp_t *arcade_upcall_lck_grp; +static lck_mtx_t arcade_upcall_mutex; + +static ipc_port_t arcade_upcall_port = IP_NULL; +static boolean_t arcade_upcall_refresh_in_progress = FALSE; +static boolean_t arcade_upcall_refresh_waiters = FALSE; + +void +arcade_init(void) +{ + ipc_port_t port; + + arcade_upcall_lck_grp_attr = lck_grp_attr_alloc_init(); + arcade_upcall_lck_grp = lck_grp_alloc_init("arcade_upcall", arcade_upcall_lck_grp_attr); + lck_mtx_init(&arcade_upcall_mutex, arcade_upcall_lck_grp, NULL); + + /* Initialize the global arcade_register kobject and associated port */ + port = ipc_kobject_alloc_port((ipc_kobject_t)&arcade_register_global, + IKOT_ARCADE_REG, IPC_KOBJECT_ALLOC_MAKE_SEND); + arcade_register_global.ar_port = port; +} + +arcade_register_t +convert_port_to_arcade_register( + ipc_port_t port) +{ + arcade_register_t arcade_reg = ARCADE_REG_NULL; + + if (IP_VALID(port)) { + /* No need to lock port because of how refs managed */ + if (ip_kotype(port) == IKOT_ARCADE_REG) { + assert(ip_active(port)); + arcade_reg = (arcade_register_t)port->ip_kobject; + assert(arcade_reg == &arcade_register_global); + assert(arcade_reg->ar_port == port); + } + } + return arcade_reg; +} + +ipc_port_t +convert_arcade_register_to_port( + arcade_register_t arcade_reg) +{ + ipc_port_t port = IP_NULL; + + if (arcade_reg == &arcade_register_global) { + port = arcade_reg->ar_port; + } + return port; +} + +kern_return_t +arcade_register_new_upcall( + arcade_register_t arcade_reg, + mach_port_t port) +{ + if (arcade_reg == ARCADE_REG_NULL) { + return KERN_INVALID_ARGUMENT; + } + assert(arcade_reg == &arcade_register_global); + + /* Check to see if this is the real arcade subscription service */ + if (!IOTaskHasEntitlement(current_task(), "com.apple.arcade.fpsd")) { + return KERN_INVALID_VALUE; + } + + lck_mtx_lock(&arcade_upcall_mutex); + + if (arcade_upcall_refresh_in_progress) { + /* If we have an old arcade upcall port, discard it */ + if (IP_VALID(arcade_upcall_port)) { + ipc_port_release_send(arcade_upcall_port); + arcade_upcall_port = IP_NULL; + } + arcade_upcall_port = port; /* owns send right */ + + /* Wake up anyone waiting for the update */ + lck_mtx_unlock(&arcade_upcall_mutex); + thread_wakeup(&arcade_upcall_port); + return KERN_SUCCESS; + } + + lck_mtx_unlock(&arcade_upcall_mutex); + return KERN_FAILURE; +} + + +static kern_return_t +arcade_upcall_refresh(uint64_t deadline) +{ + ipc_port_t fairplayd_port = IP_NULL; + wait_result_t wr = THREAD_NOT_WAITING; + kern_return_t kr; + + LCK_MTX_ASSERT(&arcade_upcall_mutex, LCK_MTX_ASSERT_OWNED); + + /* If someone else is doing the update, wait for them */ + if (arcade_upcall_refresh_in_progress) { + arcade_upcall_refresh_waiters = TRUE; + wr = lck_mtx_sleep(&arcade_upcall_mutex, LCK_SLEEP_DEFAULT, + &arcade_upcall_refresh_in_progress, THREAD_INTERRUPTIBLE); + goto out; + } + + arcade_upcall_refresh_in_progress = TRUE; + + /* If we have an old arcade upcall port, discard it */ + if (IP_VALID(arcade_upcall_port)) { + ipc_port_release_send(arcade_upcall_port); + arcade_upcall_port = IP_NULL; + } + +#if 0 + if (host_get_fairplayd_port(host_priv_self(), &fairplayd_port) != KERN_SUCCESS) { + panic("arcade_upcall_refresh(get fairplayd)"); + } +#else + /* Temporary hack because launchd is rejecting the other special port number */ + if (host_get_unfreed_port(host_priv_self(), &fairplayd_port) != KERN_SUCCESS) { + panic("arcade_upcall_refresh(get fairplayd)"); + } +#endif + + /* If no valid fairplayd port registered, we're done */ + if (!IP_VALID(fairplayd_port)) { + goto finish_in_progress; + } + + /* + * Send a fairplayd notification to request a new arcade upcall port. + * Pass along a send right to the arcade_register kobject to complete + * the registration. + */ + ipc_port_t port = convert_arcade_register_to_port(&arcade_register_global); + kr = fairplayd_arcade_request(fairplayd_port, port); + + ipc_port_release_send(fairplayd_port); + + switch (kr) { + case MACH_MSG_SUCCESS: + break; + default: + goto finish_in_progress; + } + + /* + * Wait on the arcade upcall port to get registered through the + * registration kobject waiting with a deadline here. + */ + wr = lck_mtx_sleep_deadline(&arcade_upcall_mutex, LCK_SLEEP_DEFAULT, + &arcade_upcall_port, THREAD_INTERRUPTIBLE, deadline); + +finish_in_progress: + arcade_upcall_refresh_in_progress = FALSE; + + /* Wakeup any waiters */ + if (arcade_upcall_refresh_waiters) { + arcade_upcall_refresh_waiters = FALSE; + thread_wakeup_with_result(&arcade_upcall_refresh_in_progress, wr); + } + +out: + switch (wr) { + case THREAD_AWAKENED: + return KERN_SUCCESS; + default: + return KERN_FAILURE; + } +} + +static kern_return_t +__MAKING_UPCALL_TO_ARCADE_VALIDATION_SERVICE__(mach_port_t port, + vm_map_copy_t path, + vm_size_t pathlen, + off_t offset, + boolean_t *should_killp) +{ + mach_msg_type_number_t len = (mach_msg_type_number_t)pathlen; + return arcade_upcall(port, (vm_offset_t)path, len, offset, should_killp); +} + +void +arcade_ast(__unused thread_t thread) +{ + ipc_port_t port; + uint64_t deadline; + kern_return_t kr; + int retval; + + /* Determine the deadline */ + clock_interval_to_deadline(10, NSEC_PER_SEC, &deadline); + +restart: + lck_mtx_lock(&arcade_upcall_mutex); + port = ipc_port_copy_send(arcade_upcall_port); + /* + * if the arcade_upcall_port was inactive, "port" will be IP_DEAD. + * Otherwise, it holds a send right to the arcade_upcall_port. + */ + + while (!IP_VALID(port)) { + /* + * Refresh the arcade upcall port. If that gives up, + * give up ourselves. + */ + kr = arcade_upcall_refresh(deadline); + if (kr != KERN_SUCCESS) { + lck_mtx_unlock(&arcade_upcall_mutex); + goto fail; + } + port = ipc_port_copy_send(arcade_upcall_port); + } + lck_mtx_unlock(&arcade_upcall_mutex); + + /* We have an upcall port send right */ + + /* Gather the data we need to send in the upcall */ + off_t offset; + struct proc *p = current_proc(); + char *path; + vm_map_copy_t copy; + + kr = kmem_alloc(ipc_kernel_map, (vm_offset_t *)&path, MAXPATHLEN, VM_KERN_MEMORY_IPC); + if (kr != KERN_SUCCESS) { + ipc_port_release_send(port); + return; + } + bzero(path, MAXPATHLEN); + retval = proc_pidpathinfo_internal(p, 0, path, MAXPATHLEN, NULL); + assert(!retval); + kr = vm_map_unwire(ipc_kernel_map, + vm_map_trunc_page((vm_offset_t)path, VM_MAP_PAGE_MASK(ipc_kernel_map)), + vm_map_round_page((vm_offset_t)path + MAXPATHLEN, VM_MAP_PAGE_MASK(ipc_kernel_map)), + FALSE); + assert(kr == KERN_SUCCESS); + kr = vm_map_copyin(ipc_kernel_map, (vm_map_address_t)path, MAXPATHLEN, TRUE, ©); + assert(kr == KERN_SUCCESS); + + offset = proc_getexecutableoffset(p); + + /* MAKE THE UPCALL */ + boolean_t should_kill = TRUE; + kr = __MAKING_UPCALL_TO_ARCADE_VALIDATION_SERVICE__(port, copy, MAXPATHLEN, offset, &should_kill); + ipc_port_release_send(port); + + switch (kr) { + case MACH_SEND_INVALID_DEST: + vm_map_copy_discard(copy); + /* fall thru */ + case MIG_SERVER_DIED: + goto restart; + case KERN_SUCCESS: + if (should_kill == TRUE) { + /* + * Invalid subscription. UI already presented as to why it did not + * launch. + */ + task_terminate_internal(current_task()); + } + break; + default: +fail: + /* + * Failure of the subscription validation mechanism, not a rejection. + * for a missing subscription. There will be no indication WHY this + * process didn't launch. We might want this to be an exit_with_reason() + * in the future. + */ + task_terminate_internal(current_task()); + break; + } +} diff --git a/osfmk/kern/arcade.h b/osfmk/kern/arcade.h new file mode 100644 index 000000000..b3a230ffc --- /dev/null +++ b/osfmk/kern/arcade.h @@ -0,0 +1,59 @@ +/* + * Copyright (c) 2019 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ +#ifndef _KERN_ARCADE_H_ +#define _KERN_ARCADE_H_ + +#include +#include + +#include + + +#if XNU_KERNEL_PRIVATE + +struct arcade_register; + +extern void arcade_init(void); + +extern void arcade_ast(thread_t thread); + +extern void arcade_prepare(task_t task, thread_t thread); + +extern void arcade_register_notify(mach_msg_header_t *msg); + +extern void arcade_register_reference(arcade_register_t arcade_reg); + +extern void arcade_register_release(arcade_register_t arcade_reg); + +extern mach_port_t convert_arcade_register_to_port(arcade_register_t arcade_reg); + +extern arcade_register_t convert_port_to_arcade_register(mach_port_t port); + +#endif /* XNU_KERNEL_PRIVATE */ + +#endif /* _KERN_ARCADE_H_ */ diff --git a/osfmk/kern/assert.h b/osfmk/kern/assert.h index e6da6c5a3..80ca1193e 100644 --- a/osfmk/kern/assert.h +++ b/osfmk/kern/assert.h @@ -70,6 +70,9 @@ __BEGIN_DECLS /* Assert error */ +#if !CONFIG_NONFATAL_ASSERTS +__abortlike +#endif extern void Assert( const char *file, int line, diff --git a/osfmk/kern/ast.c b/osfmk/kern/ast.c index 21fb3f554..d0e341529 100644 --- a/osfmk/kern/ast.c +++ b/osfmk/kern/ast.c @@ -62,6 +62,7 @@ #include #include #include +#include #include #include #if CONFIG_TELEMETRY @@ -75,6 +76,10 @@ #include // for MACF AST hook #include +#if CONFIG_ARCADE +#include +#endif + static void __attribute__((noinline, noreturn, disable_tail_calls)) thread_preempted(__unused void* parameter, __unused wait_result_t result) { @@ -217,6 +222,13 @@ ast_taken_user(void) } #endif +#if CONFIG_ARCADE + if (reasons & AST_ARCADE) { + thread_ast_clear(thread, AST_ARCADE); + arcade_ast(thread); + } +#endif + if (reasons & AST_APC) { thread_ast_clear(thread, AST_APC); thread_apc_ast(thread); @@ -237,6 +249,11 @@ ast_taken_user(void) kperf_kpc_thread_ast(thread); } + if (reasons & AST_RESET_PCS) { + thread_ast_clear(thread, AST_RESET_PCS); + thread_reset_pcs_ast(thread); + } + if (reasons & AST_KEVENT) { thread_ast_clear(thread, AST_KEVENT); uint16_t bits = atomic_exchange(&thread->kevent_ast_bits, 0); @@ -319,8 +336,7 @@ ast_taken_user(void) assert((thread->sched_flags & TH_SFLAG_PROMOTED) == 0); assert((thread->sched_flags & TH_SFLAG_DEPRESS) == 0); - assert(thread->promotions == 0); - assert(thread->was_promoted_on_wakeup == 0); + assert(thread->kern_promotion_schedpri == 0); assert(thread->waiting_for_mutex == NULL); assert(thread->rwlock_count == 0); } diff --git a/osfmk/kern/ast.h b/osfmk/kern/ast.h index 23fe59218..8487484ab 100644 --- a/osfmk/kern/ast.h +++ b/osfmk/kern/ast.h @@ -118,7 +118,8 @@ typedef uint32_t ast_t; #define AST_BSD 0x80 #define AST_KPERF 0x100 /* kernel profiling */ #define AST_MACF 0x200 /* MACF user ret pending */ -/* 0x400, 0x800 unused */ +#define AST_RESET_PCS 0x400 /* restartable ranges */ +#define AST_ARCADE 0x800 /* arcade subsciption support */ #define AST_GUARD 0x1000 #define AST_TELEMETRY_USER 0x2000 /* telemetry sample requested on interrupt from userspace */ #define AST_TELEMETRY_KERNEL 0x4000 /* telemetry sample requested on interrupt from kernel */ @@ -140,7 +141,8 @@ typedef uint32_t ast_t; AST_TELEMETRY_PMI | AST_TELEMETRY_IO) /* Per-thread ASTs follow the thread at context-switch time. */ -#define AST_PER_THREAD (AST_APC | AST_BSD | AST_MACF | AST_LEDGER | AST_GUARD | AST_TELEMETRY_ALL | AST_KEVENT) +#define AST_PER_THREAD (AST_APC | AST_BSD | AST_MACF | AST_RESET_PCS | \ + AST_ARCADE | AST_LEDGER | AST_GUARD | AST_TELEMETRY_ALL | AST_KEVENT) /* Handle AST_URGENT detected while in the kernel */ extern void ast_taken_kernel(void); @@ -180,8 +182,8 @@ extern void ast_propagate(thread_t thread); * * See act_set_ast() for an example. */ -#define thread_ast_set(act, reason) (hw_atomic_or_noret(&(act)->ast, (reason))) -#define thread_ast_clear(act, reason) (hw_atomic_and_noret(&(act)->ast, ~(reason))) +#define thread_ast_set(act, reason) ((void)os_atomic_or(&(act)->ast, (reason), relaxed)) +#define thread_ast_clear(act, reason) ((void)os_atomic_andnot(&(act)->ast, (reason), relaxed)) #ifdef MACH_BSD @@ -197,5 +199,7 @@ extern void dtrace_ast(void); extern void kevent_ast(thread_t thread, uint16_t bits); extern void act_set_astkevent(thread_t thread, uint16_t bits); +extern uint16_t act_clear_astkevent(thread_t thread, uint16_t bits); +extern void act_set_ast_reset_pcs(thread_t thread); #endif /* _KERN_AST_H_ */ diff --git a/osfmk/kern/audit_sessionport.c b/osfmk/kern/audit_sessionport.c index d67c3edaf..eb51597a2 100644 --- a/osfmk/kern/audit_sessionport.c +++ b/osfmk/kern/audit_sessionport.c @@ -57,53 +57,13 @@ ipc_port_t audit_session_mksend(struct auditinfo_addr *aia_p, ipc_port_t *sessionport) { - ipc_port_t sendport = IPC_PORT_NULL; - ipc_port_t port; - - /* - * If we don't have an existing session port, then create one. - */ - port = *sessionport; - if (!IP_VALID(port)) { - ipc_port_t new_port = ipc_port_alloc_kernel(); - if (!IP_VALID(new_port)) { - return new_port; - } - ipc_kobject_set(new_port, (ipc_kobject_t)aia_p, IKOT_AU_SESSIONPORT); - if (!OSCompareAndSwapPtr(port, new_port, sessionport)) { - ipc_port_dealloc_kernel(new_port); - } - port = *sessionport; - } - - assert(ip_active(port) && IKOT_AU_SESSIONPORT == ip_kotype(port)); - sendport = ipc_port_make_send(port); - - /* - * If we don't have a no-senders notification outstanding against - * the port, take a reference on the session and request one. - */ - if (IP_NULL == port->ip_nsrequest) { - ipc_port_t notifyport; - - audit_session_aiaref(aia_p); - - - ip_lock(port); - /* Need a send-once right for the target of the notification */ - notifyport = ipc_port_make_sonce_locked(port); - /* Request a no-senders notification (at the new make-send threshold) */ - ipc_port_nsrequest(port, port->ip_mscount, notifyport, ¬ifyport); - /* port unlocked */ - - if (IP_NULL != notifyport) { - /* race requesting notification */ - audit_session_aiaunref(aia_p); - ipc_port_release_sonce(notifyport); - } + audit_session_aiaref(aia_p); + if (!ipc_kobject_make_send_lazy_alloc_port(sessionport, + (ipc_kobject_t)aia_p, IKOT_AU_SESSIONPORT)) { + audit_session_aiaunref(aia_p); } - return sendport; + return *sessionport; } @@ -129,7 +89,7 @@ audit_session_porttoaia(ipc_port_t port) if (IP_VALID(port)) { ip_lock(port); if (IKOT_AU_SESSIONPORT == ip_kotype(port)) { - assert(ip_active(port)); + require_ip_active(port); aia_p = (struct auditinfo_addr *)port->ip_kobject; } ip_unlock(port); @@ -147,53 +107,21 @@ audit_session_porttoaia(ipc_port_t port) * Parameters: msg A Mach no-senders notification message. * * Notes: It is possible that new send rights are created after a - * no-senders notification has been sent (i.e. via audit_session_mksend). - * We check the port's mscount against the notification's not_count - * to detect when this happens, and re-arm the notification in that - * case. - * - * In the normal case (no new senders), we first mark the port - * as dying by setting its object type to IKOT_NONE so that - * audit_session_mksend will no longer use it to create - * additional send rights. We can then safely call - * audit_session_port_destroy with no locks. + * no-senders notification has been sent, but they will be protected + * by another aia reference. */ void audit_session_nosenders(mach_msg_header_t *msg) { mach_no_senders_notification_t *notification = (void *)msg; ipc_port_t port = notification->not_header.msgh_remote_port; - ipc_port_t notifyport; struct auditinfo_addr *port_aia_p = NULL; + require_ip_active(port); assert(IKOT_AU_SESSIONPORT == ip_kotype(port)); - ip_lock(port); - assert(ip_active(port)); port_aia_p = (struct auditinfo_addr *)port->ip_kobject; assert(NULL != port_aia_p); - /* - * if new send rights have been made since the last notify - * request, re-arm the notification with the new threshold. - */ - if (port->ip_mscount > notification->not_count) { - notifyport = ipc_port_make_sonce_locked(port); - ipc_port_nsrequest(port, port->ip_mscount, notifyport, ¬ifyport); - /* port unlocked */ - - if (IP_NULL != notifyport) { - /* race re-arming the notification */ - ipc_port_release_sonce(notifyport); - audit_session_aiaunref(port_aia_p); - } - return; - } - - /* - * Otherwise, no more extant send rights, so release the - * reference held on the session by those send rights. - */ - ip_unlock(port); audit_session_aiaunref(port_aia_p); } @@ -203,7 +131,7 @@ audit_session_portdestroy(ipc_port_t *sessionport) ipc_port_t port = *sessionport; if (IP_VALID(port)) { - assert(ip_active(port)); + require_ip_active(port); assert(IKOT_AU_SESSIONPORT == ip_kotype(port)); ipc_kobject_set_atomically(port, IKO_NULL, IKOT_NONE); ipc_port_dealloc_kernel(port); diff --git a/osfmk/kern/backtrace.c b/osfmk/kern/backtrace.c index d99787543..82daadce6 100644 --- a/osfmk/kern/backtrace.c +++ b/osfmk/kern/backtrace.c @@ -40,12 +40,16 @@ #include #endif +#if defined(HAS_APPLE_PAC) +#include +#endif -uint32_t __attribute__((noinline)) -backtrace(uintptr_t *bt, uint32_t max_frames) +unsigned int __attribute__((noinline)) +backtrace(uintptr_t *bt, unsigned int max_frames, bool *was_truncated_out) { - return backtrace_frame(bt, max_frames, __builtin_frame_address(0)); + return backtrace_frame(bt, max_frames, __builtin_frame_address(0), + was_truncated_out); } /* @@ -57,12 +61,13 @@ backtrace(uintptr_t *bt, uint32_t max_frames) * inlined, it doesn't record the frame of the function it's inside (because * there's no stack frame). */ -uint32_t __attribute__((noinline, not_tail_called)) -backtrace_frame(uintptr_t *bt, uint32_t max_frames, void *start_frame) +unsigned int __attribute__((noinline, not_tail_called)) +backtrace_frame(uintptr_t *bt, unsigned int max_frames, void *start_frame, + bool *was_truncated_out) { thread_t thread = current_thread(); uintptr_t *fp; - uint32_t frame_index = 0; + unsigned int frame_index = 0; uintptr_t top, bottom; bool in_valid_stack; @@ -98,7 +103,12 @@ backtrace_frame(uintptr_t *bt, uint32_t max_frames, void *start_frame) break; } +#if defined(HAS_APPLE_PAC) + /* return addresses signed by arm64e ABI */ + bt[frame_index++] = (uintptr_t) ptrauth_strip((void *)ret_addr, ptrauth_key_return_address); +#else /* defined(HAS_APPLE_PAC) */ bt[frame_index++] = ret_addr; +#endif /* !defined(HAS_APPLE_PAC) */ /* stacks grow down; backtracing should be moving to higher addresses */ if (next_fp <= fp) { @@ -107,6 +117,15 @@ backtrace_frame(uintptr_t *bt, uint32_t max_frames, void *start_frame) fp = next_fp; } + /* NULL-terminate the list, if space is available */ + if (frame_index != max_frames) { + bt[frame_index] = 0; + } + + if (fp != NULL && frame_index == max_frames && was_truncated_out) { + *was_truncated_out = true; + } + return frame_index; #undef IN_STK_BOUNDS } @@ -197,8 +216,9 @@ interrupted_kernel_pc_fp(uintptr_t *pc, uintptr_t *fp) #error "interrupted_kernel_pc_fp: unsupported architecture" #endif /* !defined(__arm__) */ -uint32_t -backtrace_interrupted(uintptr_t *bt, uint32_t max_frames) +unsigned int +backtrace_interrupted(uintptr_t *bt, unsigned int max_frames, + bool *was_truncated_out) { uintptr_t pc; uintptr_t fp; @@ -218,32 +238,32 @@ backtrace_interrupted(uintptr_t *bt, uint32_t max_frames) return 1; } - return backtrace_frame(bt + 1, max_frames - 1, (void *)fp) + 1; + return backtrace_frame(bt + 1, max_frames - 1, (void *)fp, + was_truncated_out) + 1; } int -backtrace_user(uintptr_t *bt, uint32_t max_frames, uint32_t *frames_out, - bool *user_64_out) +backtrace_user(uintptr_t *bt, unsigned int max_frames, + unsigned int *frames_out, bool *user_64_out, bool *was_truncated_out) { - return backtrace_thread_user(current_thread(), bt, max_frames, frames_out, - user_64_out); + return backtrace_thread_user(current_thread(), bt, max_frames, + frames_out, user_64_out, was_truncated_out); } int -backtrace_thread_user(void *thread, uintptr_t *bt, uint32_t max_frames, - uint32_t *frames_out, bool *user_64_out) +backtrace_thread_user(void *thread, uintptr_t *bt, unsigned int max_frames, + unsigned int *frames_out, bool *user_64_out, bool *was_truncated_out) { bool user_64; - uintptr_t pc, fp, next_fp; + uintptr_t pc = 0, fp = 0, next_fp = 0; vm_map_t map = NULL, old_map = NULL; - uint32_t frame_index = 0; + unsigned int frame_index = 0; int err = 0; - size_t frame_size; + size_t frame_size = 0; assert(bt != NULL); assert(max_frames > 0); assert(frames_out != NULL); - assert(user_64_out != NULL); #if defined(__x86_64__) @@ -297,10 +317,6 @@ backtrace_thread_user(void *thread, uintptr_t *bt, uint32_t max_frames, #error "backtrace_thread_user: unsupported architecture" #endif /* !defined(__arm__) */ - if (max_frames == 0) { - goto out; - } - bt[frame_index++] = pc; if (frame_index >= max_frames) { @@ -327,7 +343,7 @@ backtrace_thread_user(void *thread, uintptr_t *bt, uint32_t max_frames, } u32; } frame; - frame_size = 2 * (user_64 ? sizeof(uint64_t) : sizeof(uint32_t)); + frame_size = 2 * (user_64 ? 8 : 4); /* switch to the correct map, for copyin */ if (thread != current_thread()) { @@ -343,6 +359,9 @@ backtrace_thread_user(void *thread, uintptr_t *bt, uint32_t max_frames, while (fp != 0 && frame_index < max_frames) { err = copyin(fp, (char *)&frame, frame_size); if (err) { + if (was_truncated_out) { + *was_truncated_out = true; + } goto out; } @@ -353,7 +372,13 @@ backtrace_thread_user(void *thread, uintptr_t *bt, uint32_t max_frames, } uintptr_t ret_addr = user_64 ? frame.u64.ret : frame.u32.ret; +#if defined(HAS_APPLE_PAC) + /* return addresses signed by arm64e ABI */ + bt[frame_index++] = (uintptr_t)ptrauth_strip((void *)ret_addr, + ptrauth_key_return_address); +#else /* defined(HAS_APPLE_PAC) */ bt[frame_index++] = ret_addr; +#endif /* !defined(HAS_APPLE_PAC) */ /* stacks grow down; backtracing should be moving to higher addresses */ if (next_fp <= fp) { @@ -368,7 +393,19 @@ out: vm_map_deallocate(map); } - *user_64_out = user_64; + /* NULL-terminate the list, if space is available */ + if (frame_index != max_frames) { + bt[frame_index] = 0; + } + + if (fp != 0 && frame_index == max_frames && was_truncated_out) { + *was_truncated_out = true; + } + + if (user_64_out) { + *user_64_out = user_64; + } + *frames_out = frame_index; return err; #undef INVALID_USER_FP diff --git a/osfmk/kern/backtrace.h b/osfmk/kern/backtrace.h index 8bdafcddb..8b56b26df 100644 --- a/osfmk/kern/backtrace.h +++ b/osfmk/kern/backtrace.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016 Apple Inc. All rights reserved. + * Copyright (c) 2016-2019 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -26,8 +26,8 @@ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ -#ifndef BACKTRACE_H -#define BACKTRACE_H +#ifndef KERN_BACKTRACE_H +#define KERN_BACKTRACE_H #include #include @@ -35,55 +35,108 @@ __BEGIN_DECLS -/* - * Backtrace the current thread, storing up to max_frames return addresses in - * bt. Returns the number of return addresses stored. +/*! + * @function backtrace + * + * @abstract backtrace the current thread's kernel stack + * + * @discussion Backtrace the kernel stack of the current thread, storing up + * to btlen return addresses in bt. Returns the number of return addresses + * stored and sets was_truncated to true if it is non-NULL and the backtrace was + * truncated to fit in the provided space. The backtrace starts at the calling + * function. A zero will be stored after the return addresses in the buffer, + * if space allows. + * + * @param bt Clients must provide a buffer in which to store the return + * addresses. + * + * @param btlen Along with the buffer, its length (in terms of uintptr_t) must + * also be provided. + * + * @param was_truncated Optionally, clients can provide a boolean out-parameter + * that will be set to true if the backtrace was truncated due to a lack of + * buffer space. + * + * @return The number of return addresses written to bt is returned. The + * function cannot return an error. */ -uint32_t backtrace(uintptr_t *bt, uint32_t max_frames) +unsigned int backtrace(uintptr_t *bt, unsigned int btlen, bool *was_truncated) __attribute__((noinline)); -/* - * Backtrace the current thread starting at the frame pointer start_fp, storing - * up to max_frames return addresses in bt. Returns the number of return - * addresses stored. +/*! + * @function backtrace_from + * + * @abstract backtrace the current thread's kernel stack from a frame pointer + * + * @discussion Backtrace the kernel stack of the current thread from the given + * frame pointer startfp, storing up to btlen return addresses in bt. Returns + * the number of return addresses written and sets trunc to true if trunc is + * non-NULL and the backtrace was truncated to fit in the provided space. The + * frame pointer provided must point to a valid frame on the current thread's + * stack. + * + * @param startfp The frame pointer to start backtracing from is required, and + * must be point to a valid frame on the current thread's stack. + * + * @seealso backtrace */ -uint32_t backtrace_frame(uintptr_t *bt, uint32_t max_frames, void *start_frame) +unsigned int backtrace_frame(uintptr_t *bt, unsigned int btlen, void *startfp, + bool *was_truncated) __attribute__((noinline, not_tail_called)); -/* - * Backtrace the kernel stack of the context that was interrupted, storing up - * to max_frames return addresses in bt. Returns 0 on success, and non-zero - * otherwise. On success, the number of frames written is stored at the value - * pointed to by frames_out. +/*! + * @function backtrace_interrupted + * + * @abstract backtrace the interrupted context + * + * @discussion Backtrace the kernel stack of the interrupted thread, storing up + * to btlen return addresses in bt. This function must be called from interrupt + * context. * - * Must be called from interrupt context. + * @seealso backtrace */ -uint32_t backtrace_interrupted(uintptr_t *bt, uint32_t max_frames); +unsigned int backtrace_interrupted(uintptr_t *bt, unsigned int btlen, + bool *was_truncated); -/* - * Backtrace the user stack of the current thread, storing up to max_frames - * return addresses in bt. Returns 0 on success, and non-zero otherwise. On - * success, the number of frames written is stored at the value pointed to by - * frames_out and the value pointed to by user_64_out is set true if the user - * space thread was running in 64-bit mode, and false otherwise. +/*! + * @function backtrace_user + * + * @abstract backtrace the current thread's user space stack + * + * @discussion Backtrace the user stack of the current thread, storing up to + * btlen return addresses in bt. This function cannot be called on a kernel + * thread, nor can it be called from interrupt context or with interrupts + * disabled. * - * Must not be called from interrupt context or with interrupts disabled. + * @param btwritten On success, the number of return addresses written is stored + * here. + * + * @param user64 On success, true is stored here if user space was running in + * 64-bit mode, and false is stored otherwise. + * + * @return Returns 0 on success and an errno value on error. + * + * @seealso backtrace */ -int backtrace_user(uintptr_t *bt, uint32_t max_frames, uint32_t *frames_out, - bool *user_64_out); +int backtrace_user(uintptr_t *bt, unsigned int btlen, unsigned int *btwritten, + bool *user64, bool *was_truncated); /* - * Backtrace the user stack of the given thread, storing up to max_frames return - * addresses in bt. Returns 0 on success, and non-zero otherwise. On success, - * the number of frames written is stored at the value pointed to by frames_out - * and the value pointed to by user_64_out is set true if the user space thread - * was running in 64-bit mode, and false otherwise. + * @function backtrace_thread_user + * + * @abstract backtrace a given thread's user space stack + * + * @discussion Backtrace the user stack of the given thread, storing up to btlen + * return addresses in bt. This function cannot be called on a kernel thread, + * nor can it be called from interrupt context or with interrupts disabled. + * + * @param thread The user thread to backtrace is required. * - * Must not be called from interrupt context or with interrupts disabled. + * @see backtrace_user */ -int backtrace_thread_user(void *thread, uintptr_t *bt, uint32_t max_frames, - uint32_t *frames_out, bool *user_64_out); +int backtrace_thread_user(void *thread, uintptr_t *bt, unsigned int btlen, + unsigned int *btwritten, bool *user64, bool *was_truncated); __END_DECLS -#endif /* !defined(BACKTRACE_H) */ +#endif /* !defined(KERN_BACKTRACE_H) */ diff --git a/osfmk/kern/bits.h b/osfmk/kern/bits.h index 47db873e1..00dbc4b78 100644 --- a/osfmk/kern/bits.h +++ b/osfmk/kern/bits.h @@ -31,6 +31,7 @@ #ifndef __BITS_H__ #define __BITS_H__ +#include #include #include #include diff --git a/osfmk/kern/block_hint.h b/osfmk/kern/block_hint.h index 7f351fe98..25fb8477e 100644 --- a/osfmk/kern/block_hint.h +++ b/osfmk/kern/block_hint.h @@ -48,6 +48,7 @@ typedef enum thread_snapshot_wait_flags { kThreadWaitParkedWorkQueue = 0x0f, kThreadWaitWorkloopSyncWait = 0x10, kThreadWaitOnProcess = 0x11, + kThreadWaitSleepWithInheritor = 0x12, } __attribute__((packed)) block_hint_t; _Static_assert(sizeof(block_hint_t) <= sizeof(short), @@ -70,6 +71,8 @@ extern void kdp_pthread_find_owner(thread_t thread, thread_waitinfo_t *waitinfo) extern void *kdp_pthread_get_thread_kwq(thread_t thread); extern void kdp_workloop_sync_wait_find_owner(thread_t thread, event64_t event, thread_waitinfo_t *waitinfo); extern void kdp_wait4_find_process(thread_t thread, event64_t event, thread_waitinfo_t *waitinfo); +extern void kdp_sleep_with_inheritor_find_owner(struct waitq * waitq, __unused event64_t event, thread_waitinfo_t * waitinfo); +extern void kdp_turnstile_fill_tsinfo(struct turnstile *ts, thread_turnstileinfo_t *tsinfo); #endif /* XNU_KERNEL_PRIVATE */ diff --git a/osfmk/kern/bsd_kern.c b/osfmk/kern/bsd_kern.c index add2c1d51..e89c8cfb8 100644 --- a/osfmk/kern/bsd_kern.c +++ b/osfmk/kern/bsd_kern.c @@ -44,6 +44,7 @@ #include /* last */ #include #include +#include #if MONOTONIC #include @@ -51,6 +52,7 @@ #endif /* MONOTONIC */ #include +#include /* CS_CDHASH_LEN */ #undef thread_should_halt @@ -68,13 +70,15 @@ int fill_task_rusage(task_t task, rusage_info_current *ri); int fill_task_io_rusage(task_t task, rusage_info_current *ri); int fill_task_qos_rusage(task_t task, rusage_info_current *ri); void fill_task_monotonic_rusage(task_t task, rusage_info_current *ri); -uint64_t get_task_logical_writes(task_t task); +uint64_t get_task_logical_writes(task_t task, boolean_t external); void fill_task_billed_usage(task_t task, rusage_info_current *ri); void task_bsdtask_kill(task_t); extern uint64_t get_dispatchqueue_serialno_offset_from_proc(void *p); +extern uint64_t get_dispatchqueue_label_offset_from_proc(void *p); extern uint64_t proc_uniqueid(void *p); extern int proc_pidversion(void *p); +extern int proc_getcdhash(void *p, char *cdhash); #if MACH_BSD extern void psignal(void *, int); @@ -124,6 +128,20 @@ get_bsdthread_info(thread_t th) return th->uthread; } +/* + * This is used to remember any FS error from VNOP_PAGEIN code when + * invoked under vm_fault(). The value is an errno style value. It can + * be retrieved by exception handlers using thread_get_state(). + */ +void +set_thread_pagein_error(thread_t th, int error) +{ + assert(th == current_thread()); + if (error == 0 || th->t_pagein_error == 0) { + th->t_pagein_error = error; + } +} + #if defined(__x86_64__) /* * Returns non-zero if the thread has a non-NULL task @@ -313,23 +331,6 @@ get_task_ipcspace(task_t t) return t->itk_space; } -int -get_task_numactivethreads(task_t task) -{ - thread_t inc; - int num_active_thr = 0; - task_lock(task); - - for (inc = (thread_t)(void *)queue_first(&task->threads); - !queue_end(&task->threads, (queue_entry_t)inc); inc = (thread_t)(void *)queue_next(&inc->task_threads)) { - if (inc->active) { - num_active_thr++; - } - } - task_unlock(task); - return num_active_thr; -} - int get_task_numacts(task_t t) { @@ -689,6 +690,18 @@ get_task_cpu_time(task_t task) return 0; } +uint32_t +get_task_loadTag(task_t task) +{ + return os_atomic_load(&task->loadTag, relaxed); +} + +uint32_t +set_task_loadTag(task_t task, uint32_t loadTag) +{ + return os_atomic_xchg(&task->loadTag, loadTag, relaxed); +} + /* * */ @@ -1007,8 +1020,8 @@ fill_taskthreadinfo(task_t task, uint64_t thaddr, bool thuniqueid, struct proc_t err = 1; goto out; } - ptinfo->pth_user_time = ((basic_info.user_time.seconds * (integer_t)NSEC_PER_SEC) + (basic_info.user_time.microseconds * (integer_t)NSEC_PER_USEC)); - ptinfo->pth_system_time = ((basic_info.system_time.seconds * (integer_t)NSEC_PER_SEC) + (basic_info.system_time.microseconds * (integer_t)NSEC_PER_USEC)); + ptinfo->pth_user_time = (((uint64_t)basic_info.user_time.seconds * NSEC_PER_SEC) + ((uint64_t)basic_info.user_time.microseconds * NSEC_PER_USEC)); + ptinfo->pth_system_time = (((uint64_t)basic_info.system_time.seconds * NSEC_PER_SEC) + ((uint64_t)basic_info.system_time.microseconds * NSEC_PER_USEC)); ptinfo->pth_cpu_usage = basic_info.cpu_usage; ptinfo->pth_policy = basic_info.policy; @@ -1078,14 +1091,17 @@ fill_task_rusage(task_t task, rusage_info_current *ri) { struct task_power_info powerinfo; + uint64_t runnable_time = 0; + assert(task != TASK_NULL); task_lock(task); - task_power_info_locked(task, &powerinfo, NULL, NULL); + task_power_info_locked(task, &powerinfo, NULL, NULL, &runnable_time); ri->ri_pkg_idle_wkups = powerinfo.task_platform_idle_wakeups; ri->ri_interrupt_wkups = powerinfo.task_interrupt_wakeups; ri->ri_user_time = powerinfo.total_user; ri->ri_system_time = powerinfo.total_system; + ri->ri_runnable_time = runnable_time; ledger_get_balance(task->ledger, task_ledgers.phys_footprint, (ledger_amount_t *)&ri->ri_phys_footprint); @@ -1175,12 +1191,19 @@ fill_task_monotonic_rusage(task_t task, rusage_info_current *ri) } uint64_t -get_task_logical_writes(task_t task) +get_task_logical_writes(task_t task, boolean_t external) { assert(task != TASK_NULL); struct ledger_entry_info lei; task_lock(task); + + if (external == FALSE) { + ledger_get_entry_info(task->ledger, task_ledgers.logical_writes, &lei); + } else { + ledger_get_entry_info(task->ledger, task_ledgers.logical_writes_to_external, &lei); + } + ledger_get_entry_info(task->ledger, task_ledgers.logical_writes, &lei); task_unlock(task); @@ -1199,6 +1222,18 @@ get_task_dispatchqueue_serialno_offset(task_t task) return dq_serialno_offset; } +uint64_t +get_task_dispatchqueue_label_offset(task_t task) +{ + uint64_t dq_label_offset = 0; + + if (task->bsd_info) { + dq_label_offset = get_dispatchqueue_label_offset_from_proc(task->bsd_info); + } + + return dq_label_offset; +} + uint64_t get_task_uniqueid(task_t task) { @@ -1226,3 +1261,37 @@ get_task_crash_label(task_t task) return task->crash_label; } #endif + +int +fill_taskipctableinfo(task_t task, uint32_t *table_size, uint32_t *table_free) +{ + ipc_space_t space = task->itk_space; + if (space == NULL) { + return -1; + } + + is_read_lock(space); + if (!is_active(space)) { + is_read_unlock(space); + return -1; + } + + *table_size = space->is_table_size; + *table_free = space->is_table_free; + + is_read_unlock(space); + + return 0; +} + +int +get_task_cdhash(task_t task, char cdhash[static CS_CDHASH_LEN]) +{ + int result = 0; + + task_lock(task); + result = task->bsd_info ? proc_getcdhash(task->bsd_info, cdhash) : ESRCH; + task_unlock(task); + + return result; +} diff --git a/osfmk/kern/btlog.c b/osfmk/kern/btlog.c index 584be02cf..93f6e3117 100644 --- a/osfmk/kern/btlog.c +++ b/osfmk/kern/btlog.c @@ -157,7 +157,6 @@ lookup_btrecord_byhash(btlog_t *btlog, uint32_t md5_hash, void *bt[], size_t btc recindex = btlog->head; record = lookup_btrecord(btlog, recindex); while (recindex != BTLOG_RECORDINDEX_NONE) { - assert(record->bthash); assert(!TAILQ_EMPTY(&record->element_record_queue)); if (record->bthash == md5_hash) { /* @@ -677,8 +676,6 @@ retry: hashidx = calculate_hashidx_for_element((uintptr_t)element, btlog); hashelem = btlog_get_elem_from_freelist(btlog); - assert(record->bthash); - hashelem->elem = ~((uintptr_t)element); hashelem->operation = record->operation; hashelem->recindex = recindex; diff --git a/osfmk/kern/circle_queue.h b/osfmk/kern/circle_queue.h new file mode 100644 index 000000000..4ec2af237 --- /dev/null +++ b/osfmk/kern/circle_queue.h @@ -0,0 +1,308 @@ +/* + * Copyright (c) 2019 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#ifndef _KERN_CIRCLE_QUEUE_H_ +#define _KERN_CIRCLE_QUEUE_H_ + +#include +#include + +__BEGIN_DECLS + +/* + * Circle Queue Management APIs + * + * These are similar to the queues from queue.h, + * but the circle queue head is a single pointer to the first element + * of the queue. + */ + +typedef struct circle_queue_head { + queue_entry_t head; +} circle_queue_head_t, *circle_queue_t; + +static inline bool +circle_queue_empty(circle_queue_t cq) +{ + return cq->head == NULL; +} + +static inline queue_entry_t +circle_queue_first(circle_queue_t cq) +{ + return cq->head; +} + +static inline queue_entry_t +circle_queue_last(circle_queue_t cq) +{ + queue_entry_t elt = circle_queue_first(cq); + if (elt) { + __builtin_assume(elt->prev != NULL); + return elt->prev; + } + return NULL; +} + +static inline queue_entry_t +circle_queue_next(circle_queue_t cq, queue_entry_t elt) +{ + return elt->next == cq->head ? NULL : elt->next; +} + +static inline size_t +circle_queue_length(circle_queue_t cq) +{ + queue_entry_t elt = circle_queue_first(cq); + size_t n = 0; + + for (; elt; elt = circle_queue_next(cq, elt)) { + n++; + } + return n; +} + +static inline void +circle_enqueue_tail(circle_queue_t cq, queue_entry_t elt) +{ + queue_entry_t head = circle_queue_first(cq); + queue_entry_t tail = circle_queue_last(cq); + + if (head == NULL) { + cq->head = elt->next = elt->prev = elt; + } else { + elt->next = head; + elt->prev = tail; + tail->next = elt; + head->prev = elt; + } +} + +static inline void +circle_enqueue_head(circle_queue_t cq, queue_entry_t elt) +{ + circle_enqueue_tail(cq, elt); + cq->head = elt; +} + +static inline void +circle_dequeue(circle_queue_t cq, queue_entry_t elt) +{ + queue_entry_t elt_prev = elt->prev; + queue_entry_t elt_next = elt->next; + + if (elt == elt_next) { + assert(cq->head == elt); + cq->head = NULL; + } else { + elt_prev->next = elt_next; + elt_next->prev = elt_prev; + if (cq->head == elt) { + cq->head = elt_next; + } + } + __DEQUEUE_ELT_CLEANUP(elt); +} + +static inline queue_entry_t +circle_dequeue_head(circle_queue_t cq) +{ + queue_entry_t elt = circle_queue_first(cq); + if (elt) { + circle_dequeue(cq, elt); + } + return elt; +} + +static inline queue_entry_t +circle_dequeue_tail(circle_queue_t cq) +{ + queue_entry_t elt = circle_queue_last(cq); + if (elt) { + circle_dequeue(cq, elt); + } + return elt; +} + +/* + * Macro: cqe_element + * Function: + * Convert a cirle_queue_entry_t pointer to a queue element pointer. + * Get a pointer to the user-defined element containing + * a given cirle_queue_entry_t + * Header: + * * cqe_element(cirle_queue_entry_t qe, , field) + * qe - queue entry to convert + * - what's in the queue (e.g., struct some_data) + * - is the chain field in + * Note: + * Do not use pointer types for + */ +#define cqe_element(qe, type, field) __container_of(qe, type, field) + +/* + * Macro: cqe_foreach + * Function: + * Iterate over each queue_entry_t structure. + * Generates a 'for' loop, setting 'qe' to + * each queue_entry_t in the queue. + * Header: + * cqe_foreach(queue_entry_t qe, queue_t head) + * qe - iteration variable + * head - pointer to queue_head_t (head of queue) + * Note: + * This should only be used with Method 1 queue iteration (linkage chains) + */ +#define cqe_foreach(qe, head) \ + for (qe = circle_queue_first(head); qe; qe = circle_queue_next(head, qe)) + +/* + * Macro: cqe_foreach_safe + * Function: + * Safely iterate over each queue_entry_t structure. + * + * Use this iterator macro if you plan to remove the + * queue_entry_t, qe, from the queue during the + * iteration. + * Header: + * cqe_foreach_safe(queue_entry_t qe, queue_t head) + * qe - iteration variable + * head - pointer to queue_head_t (head of queue) + * Note: + * This should only be used with Method 1 queue iteration (linkage chains) + */ +#define cqe_foreach_safe(qe, head) \ + for (queue_entry_t _ne, _qe = circle_queue_first(head); \ + (qe = _qe) && (_ne = circle_queue_next(head, _qe), 1); \ + _qe = _ne) + +/* + * Macro: cqe_foreach_element + * Function: + * Iterate over each _element_ in a queue + * where each queue_entry_t points to another + * queue_entry_t, i.e., managed by the [de|en]queue_head/ + * [de|en]queue_tail / remqueue / etc. function. + * Header: + * cqe_foreach_element( *elt, queue_t head, ) + * elt - iteration variable + * - what's in the queue (e.g., struct some_data) + * - is the chain field in + * Note: + * This should only be used with Method 1 queue iteration (linkage chains) + */ +#define cqe_foreach_element(elt, head, field) \ + for (queue_entry_t _qe = circle_queue_first(head); \ + _qe && (elt = cqe_element(_qe, typeof(*(elt)), field), 1); \ + _qe = circle_queue_next(head, _qe)) + +/* + * Macro: cqe_foreach_element_safe + * Function: + * Safely iterate over each _element_ in a queue + * where each queue_entry_t points to another + * queue_entry_t, i.e., managed by the [de|en]queue_head/ + * [de|en]queue_tail / remqueue / etc. function. + * + * Use this iterator macro if you plan to remove the + * element, elt, from the queue during the iteration. + * Header: + * cqe_foreach_element_safe( *elt, queue_t head, ) + * elt - iteration variable + * - what's in the queue (e.g., struct some_data) + * - is the chain field in + * Note: + * This should only be used with Method 1 queue iteration (linkage chains) + */ +#define cqe_foreach_element_safe(elt, head, field) \ + for (queue_entry_t _ne, _qe = circle_queue_first(head); \ + _qe && (elt = cqe_element(_qe, typeof(*(elt)), field), \ + _ne = circle_queue_next(head, _qe), 1); \ + _qe = _ne) + +/* Dequeue an element from head, or return NULL if the queue is empty */ +#define cqe_dequeue_head(head, type, field) ({ \ + queue_entry_t _tmp_entry = circle_dequeue_head((head)); \ + type *_tmp_element = (type*) NULL; \ + if (_tmp_entry != (queue_entry_t) NULL) \ + _tmp_element = cqe_element(_tmp_entry, type, field); \ + _tmp_element; \ +}) + +/* Dequeue an element from tail, or return NULL if the queue is empty */ +#define cqe_dequeue_tail(head, type, field) ({ \ + queue_entry_t _tmp_entry = circle_dequeue_tail((head)); \ + type *_tmp_element = (type*) NULL; \ + if (_tmp_entry != (queue_entry_t) NULL) \ + _tmp_element = cqe_element(_tmp_entry, type, field); \ + _tmp_element; \ +}) + +/* Peek at the first element, or return NULL if the queue is empty */ +#define cqe_queue_first(head, type, field) ({ \ + queue_entry_t _tmp_entry = circle_queue_first((head)); \ + type *_tmp_element = (type*) NULL; \ + if (_tmp_entry != (queue_entry_t) NULL) \ + _tmp_element = cqe_element(_tmp_entry, type, field); \ + _tmp_element; \ +}) + +/* Peek at the next element, or return NULL if it is last */ +#define cqe_queue_next(elt, head, type, field) ({ \ + queue_entry_t _tmp_entry = circle_queue_next((head), (elt)); \ + type *_tmp_element = (type*) NULL; \ + if (_tmp_entry != (queue_entry_t) NULL) \ + _tmp_element = cqe_element(_tmp_entry, type, field); \ + _tmp_element; \ +}) + +/* Peek at the tail element, or return NULL if the queue is empty */ +#define cqe_queue_last(head, type, field) ({ \ + queue_entry_t _tmp_entry = circle_queue_last((head)); \ + type *_tmp_element = (type*) NULL; \ + if (_tmp_entry != (queue_entry_t) NULL) \ + _tmp_element = cqe_element(_tmp_entry, type, field); \ + _tmp_element; \ +}) + +/* + * Macro: circle_queue_init + * Function: + * Initialize the given circle queue. + * Header: + * void circle_queue_init(q) + * circle_queue_t q; \* MODIFIED *\ + */ +#define circle_queue_init(q) \ +MACRO_BEGIN \ + (q)->head = NULL; \ +MACRO_END + +__END_DECLS + +#endif /* _KERN_QUEUE_H_ */ diff --git a/osfmk/kern/clock.c b/osfmk/kern/clock.c index 578a7f6a6..6801e0f31 100644 --- a/osfmk/kern/clock.c +++ b/osfmk/kern/clock.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2008 Apple Inc. All rights reserved. + * Copyright (c) 2000-2019 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -89,7 +89,7 @@ uint32_t hz_tick_interval = 1; static uint64_t has_monotonic_clock = 0; -decl_simple_lock_data(, clock_lock) +decl_simple_lock_data(, clock_lock); lck_grp_attr_t * settime_lock_grp_attr; lck_grp_t * settime_lock_grp; lck_attr_t * settime_lock_attr; @@ -295,7 +295,7 @@ static void print_all_clock_variables_internal(const char *, struct clock_calend * * The trick is to use a generation count and set the low bit when it is * being updated/read; by doing this, we guarantee, through use of the - * hw_atomic functions, that the generation is incremented when the bit + * os_atomic functions, that the generation is incremented when the bit * is cleared atomically (by using a 1 bit add). */ static struct unlocked_clock_calend { @@ -1673,7 +1673,7 @@ clock_get_calendar_nanotime_nowait( * off the "in progress" bit to get the current generation * count. */ - (void)hw_atomic_and(&stable.gen, ~(uint32_t)1); + os_atomic_andnot(&stable.gen, 1, relaxed); /* * If an update _is_ in progress, the generation count will be @@ -1712,7 +1712,7 @@ clock_track_calend_nowait(void) * will flag an update in progress to an async caller trying * to examine the contents. */ - (void)hw_atomic_or(&flipflop[i].gen, 1); + os_atomic_or(&flipflop[i].gen, 1, relaxed); flipflop[i].calend = tmp; @@ -1722,7 +1722,7 @@ clock_track_calend_nowait(void) * count after taking a copy while in progress, the count * will be off by two. */ - (void)hw_atomic_add(&flipflop[i].gen, 1); + os_atomic_inc(&flipflop[i].gen, relaxed); } } diff --git a/osfmk/kern/clock_oldops.c b/osfmk/kern/clock_oldops.c index 430a2da53..3ec6264d8 100644 --- a/osfmk/kern/clock_oldops.c +++ b/osfmk/kern/clock_oldops.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2008 Apple Inc. All rights reserved. + * Copyright (c) 2000-2019 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -96,7 +96,7 @@ typedef struct alarm alarm_data_t; #define ALARM_DONE 4 /* alarm has expired */ /* local data declarations */ -decl_simple_lock_data(static, alarm_lock) /* alarm synchronization */ +decl_simple_lock_data(static, alarm_lock); /* alarm synchronization */ static struct zone *alarm_zone; /* zone for user alarms */ static struct alarm *alrmfree; /* alarm free list pointer */ static struct alarm *alrmdone; /* alarm done list pointer */ @@ -139,9 +139,10 @@ kern_return_t rtclock_getattr( mach_msg_type_number_t *count); SECURITY_READ_ONLY_EARLY(struct clock_ops) sysclk_ops = { - NULL, rtclock_init, - rtclock_gettime, - rtclock_getattr, + .c_config = NULL, + .c_init = rtclock_init, + .c_gettime = rtclock_gettime, + .c_getattr = rtclock_getattr, }; kern_return_t calend_gettime( @@ -153,20 +154,26 @@ kern_return_t calend_getattr( mach_msg_type_number_t *count); SECURITY_READ_ONLY_EARLY(struct clock_ops) calend_ops = { - NULL, NULL, - calend_gettime, - calend_getattr, + .c_config = NULL, + .c_init = NULL, + .c_gettime = calend_gettime, + .c_getattr = calend_getattr, }; /* * List of clock devices. */ -SECURITY_READ_ONLY_LATE(struct clock) clock_list[] = { - /* SYSTEM_CLOCK */ - { &sysclk_ops, 0, 0 }, - - /* CALENDAR_CLOCK */ - { &calend_ops, 0, 0 } +SECURITY_READ_ONLY_LATE(struct clock) clock_list[] = { + [SYSTEM_CLOCK] = { + .cl_ops = &sysclk_ops, + .cl_service = IPC_PORT_NULL, + .cl_control = IPC_PORT_NULL, + }, + [CALENDAR_CLOCK] = { + .cl_ops = &calend_ops, + .cl_service = IPC_PORT_NULL, + .cl_control = IPC_PORT_NULL, + }, }; int clock_count = sizeof(clock_list) / sizeof(clock_list[0]); diff --git a/osfmk/kern/coalition.c b/osfmk/kern/coalition.c index 0db480817..025a2c3f1 100644 --- a/osfmk/kern/coalition.c +++ b/osfmk/kern/coalition.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2013 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2019 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -31,10 +31,15 @@ #include #include +#include #include #include #include #include /* for TASK_CHUNK */ +#if MONOTONIC +#include +#endif /* MONOTONIC */ +#include #include #include #include @@ -45,13 +50,16 @@ #include #include +#include + #include /* * BSD interface functions */ int coalitions_get_list(int type, struct procinfo_coalinfo *coal_list, int list_sz); -boolean_t coalition_is_leader(task_t task, int coal_type, coalition_t *coal); +coalition_t task_get_coalition(task_t task, int type); +boolean_t coalition_is_leader(task_t task, coalition_t coal); task_t coalition_get_leader(coalition_t coal); int coalition_get_task_count(coalition_t coal); uint64_t coalition_get_page_count(coalition_t coal, int *ntasks); @@ -61,6 +69,14 @@ int coalition_get_pid_list(coalition_t coal, uint32_t rolemask, int sort_order, /* defined in task.c */ extern ledger_template_t task_ledger_template; +/* + * Templates; task template is copied due to potential allocation limits on + * task ledgers. + */ +ledger_template_t coalition_task_ledger_template = NULL; +ledger_template_t coalition_ledger_template = NULL; + +extern int proc_selfpid(void); /* * Coalition zone needs limits. We expect there will be as many coalitions as * tasks (same order of magnitude), so use the task zone's limits. @@ -175,6 +191,10 @@ static void i_coal_resource_iterate_tasks(coalition_t coal, void *ctx, static_assert(COALITION_NUM_THREAD_QOS_TYPES == THREAD_QOS_LAST); struct i_resource_coalition { + /* + * This keeps track of resource utilization of tasks that are no longer active + * in the coalition and is updated when a task is removed from the coalition. + */ ledger_t ledger; uint64_t bytesread; uint64_t byteswritten; @@ -184,9 +204,15 @@ struct i_resource_coalition { uint64_t logical_deferred_writes; uint64_t logical_invalidated_writes; uint64_t logical_metadata_writes; + uint64_t logical_immediate_writes_to_external; + uint64_t logical_deferred_writes_to_external; + uint64_t logical_invalidated_writes_to_external; + uint64_t logical_metadata_writes_to_external; uint64_t cpu_ptime; uint64_t cpu_time_eqos[COALITION_NUM_THREAD_QOS_TYPES]; /* cpu time per effective QoS class */ uint64_t cpu_time_rqos[COALITION_NUM_THREAD_QOS_TYPES]; /* cpu time per requested QoS class */ + uint64_t cpu_instructions; + uint64_t cpu_cycles; uint64_t task_count; /* tasks that have started in this coalition */ uint64_t dead_task_count; /* tasks that have exited in this coalition; @@ -200,6 +226,11 @@ struct i_resource_coalition { uint64_t time_nonempty; queue_head_t tasks; /* List of active tasks in the coalition */ + /* + * This ledger is used for triggering resource exception. For the tracked resources, this is updated + * when the member tasks' resource usage changes. + */ + ledger_t resource_monitor_ledger; }; /* @@ -212,7 +243,7 @@ static kern_return_t i_coal_jetsam_adopt_task(coalition_t coal, task_t task); static kern_return_t i_coal_jetsam_remove_task(coalition_t coal, task_t task); static kern_return_t i_coal_jetsam_set_taskrole(coalition_t coal, task_t task, int role); -static int i_coal_jetsam_get_taskrole(coalition_t coal, task_t task); +int i_coal_jetsam_get_taskrole(coalition_t coal, task_t task); static void i_coal_jetsam_iterate_tasks(coalition_t coal, void *ctx, void (*callback)(coalition_t, void *, task_t)); @@ -256,7 +287,7 @@ struct coalition { queue_chain_t coalitions; /* global list of coalitions */ - decl_lck_mtx_data(, lock) /* Coalition lock. */ + decl_lck_mtx_data(, lock); /* Coalition lock. */ /* put coalition type-specific structures here */ union { @@ -316,6 +347,178 @@ static const struct coalition_type #endif /* CONFIG_EMBEDDED */ +/* + * + * Coalition ledger implementation + * + */ + +struct coalition_ledger_indices coalition_ledgers = +{.logical_writes = -1, }; +void __attribute__((noinline)) SENDING_NOTIFICATION__THIS_COALITION_IS_CAUSING_TOO_MUCH_IO(int flavor); + +ledger_t +coalition_ledger_get_from_task(task_t task) +{ + ledger_t ledger = LEDGER_NULL; + coalition_t coal = task->coalition[COALITION_TYPE_RESOURCE]; + + if (coal != NULL && (!queue_empty(&task->task_coalition[COALITION_TYPE_RESOURCE]))) { + ledger = coal->r.resource_monitor_ledger; + ledger_reference(ledger); + } + return ledger; +} + + +enum { + COALITION_IO_LEDGER_ENABLE, + COALITION_IO_LEDGER_DISABLE +}; + +void +coalition_io_monitor_ctl(struct coalition *coalition, uint32_t flags, int64_t limit) +{ + ledger_t ledger = coalition->r.resource_monitor_ledger; + + if (flags == COALITION_IO_LEDGER_ENABLE) { + /* Configure the logical I/O ledger */ + ledger_set_limit(ledger, coalition_ledgers.logical_writes, (limit * 1024 * 1024), 0); + ledger_set_period(ledger, coalition_ledgers.logical_writes, (COALITION_LEDGER_MONITOR_INTERVAL_SECS * NSEC_PER_SEC)); + } else if (flags == COALITION_IO_LEDGER_DISABLE) { + ledger_disable_refill(ledger, coalition_ledgers.logical_writes); + ledger_disable_callback(ledger, coalition_ledgers.logical_writes); + } +} + +int +coalition_ledger_set_logical_writes_limit(struct coalition *coalition, int64_t limit) +{ + int error = 0; + + /* limit = -1 will be used to disable the limit and the callback */ + if (limit > COALITION_MAX_LOGICAL_WRITES_LIMIT || limit == 0 || limit < -1) { + error = EINVAL; + goto out; + } + + coalition_lock(coalition); + if (limit == -1) { + coalition_io_monitor_ctl(coalition, COALITION_IO_LEDGER_DISABLE, limit); + } else { + coalition_io_monitor_ctl(coalition, COALITION_IO_LEDGER_ENABLE, limit); + } + coalition_unlock(coalition); +out: + return error; +} + +void __attribute__((noinline)) +SENDING_NOTIFICATION__THIS_COALITION_IS_CAUSING_TOO_MUCH_IO(int flavor) +{ + int pid = proc_selfpid(); + ledger_amount_t new_limit; + task_t task = current_task(); + struct ledger_entry_info lei; + kern_return_t kr; + ledger_t ledger; + struct coalition *coalition = task->coalition[COALITION_TYPE_RESOURCE]; + + assert(coalition != NULL); + ledger = coalition->r.resource_monitor_ledger; + + switch (flavor) { + case FLAVOR_IO_LOGICAL_WRITES: + ledger_get_entry_info(ledger, coalition_ledgers.logical_writes, &lei); + trace_resource_violation(RMON_LOGWRITES_VIOLATED, &lei); + break; + default: + goto Exit; + } + + os_log(OS_LOG_DEFAULT, "Coalition [%lld] caught causing excessive I/O (flavor: %d). Task I/O: %lld MB. [Limit : %lld MB per %lld secs]. Triggered by process [%d]\n", + coalition->id, flavor, (lei.lei_balance / (1024 * 1024)), (lei.lei_limit / (1024 * 1024)), + (lei.lei_refill_period / NSEC_PER_SEC), pid); + + kr = send_resource_violation(send_disk_writes_violation, task, &lei, kRNFlagsNone); + if (kr) { + os_log(OS_LOG_DEFAULT, "ERROR %#x returned from send_resource_violation(disk_writes, ...)\n", kr); + } + + /* + * Continue to monitor the coalition after it hits the initital limit, but increase + * the limit exponentially so that we don't spam the listener. + */ + new_limit = (lei.lei_limit / 1024 / 1024) * 4; + coalition_lock(coalition); + if (new_limit > COALITION_MAX_LOGICAL_WRITES_LIMIT) { + coalition_io_monitor_ctl(coalition, COALITION_IO_LEDGER_DISABLE, -1); + } else { + coalition_io_monitor_ctl(coalition, COALITION_IO_LEDGER_ENABLE, new_limit); + } + coalition_unlock(coalition); + +Exit: + return; +} + +void +coalition_io_rate_exceeded(int warning, const void *param0, __unused const void *param1) +{ + if (warning == 0) { + SENDING_NOTIFICATION__THIS_COALITION_IS_CAUSING_TOO_MUCH_IO((int)param0); + } +} + +void +init_coalition_ledgers(void) +{ + ledger_template_t t; + assert(coalition_ledger_template == NULL); + + if ((t = ledger_template_create("Per-coalition ledgers")) == NULL) { + panic("couldn't create coalition ledger template"); + } + + coalition_ledgers.logical_writes = ledger_entry_add(t, "logical_writes", "res", "bytes"); + + if (coalition_ledgers.logical_writes < 0) { + panic("couldn't create entries for coaliton ledger template"); + } + + ledger_set_callback(t, coalition_ledgers.logical_writes, coalition_io_rate_exceeded, (void *)FLAVOR_IO_LOGICAL_WRITES, NULL); + ledger_template_complete(t); + + coalition_task_ledger_template = ledger_template_copy(task_ledger_template, "Coalition task ledgers"); + + if (coalition_task_ledger_template == NULL) { + panic("couldn't create coalition task ledger template"); + } + + ledger_template_complete(coalition_task_ledger_template); + + coalition_ledger_template = t; +} + +void +coalition_io_ledger_update(task_t task, int32_t flavor, boolean_t is_credit, uint32_t io_size) +{ + ledger_t ledger; + coalition_t coal = task->coalition[COALITION_TYPE_RESOURCE]; + + assert(coal != NULL); + ledger = coal->r.resource_monitor_ledger; + if (LEDGER_VALID(ledger)) { + if (flavor == FLAVOR_IO_LOGICAL_WRITES) { + if (is_credit) { + ledger_credit(ledger, coalition_ledgers.logical_writes, io_size); + } else { + ledger_debit(ledger, coalition_ledgers.logical_writes, io_size); + } + } + } +} + static void coalition_notify_user(uint64_t id, uint32_t flags) { @@ -341,12 +544,18 @@ i_coal_resource_init(coalition_t coal, boolean_t privileged) { (void)privileged; assert(coal && coal->type == COALITION_TYPE_RESOURCE); - coal->r.ledger = ledger_instantiate(task_ledger_template, + coal->r.ledger = ledger_instantiate(coalition_task_ledger_template, LEDGER_CREATE_ACTIVE_ENTRIES); if (coal->r.ledger == NULL) { return KERN_RESOURCE_SHORTAGE; } + coal->r.resource_monitor_ledger = ledger_instantiate(coalition_ledger_template, + LEDGER_CREATE_ACTIVE_ENTRIES); + if (coal->r.resource_monitor_ledger == NULL) { + return KERN_RESOURCE_SHORTAGE; + } + queue_init(&coal->r.tasks); return KERN_SUCCESS; @@ -356,7 +565,9 @@ static void i_coal_resource_dealloc(coalition_t coal) { assert(coal && coal->type == COALITION_TYPE_RESOURCE); + ledger_dereference(coal->r.ledger); + ledger_dereference(coal->r.resource_monitor_ledger); } static kern_return_t @@ -429,12 +640,24 @@ i_coal_resource_remove_task(coalition_t coal, task_t task) #else cr->energy += task_energy(task); #endif - cr->logical_immediate_writes += task->task_immediate_writes; - cr->logical_deferred_writes += task->task_deferred_writes; - cr->logical_invalidated_writes += task->task_invalidated_writes; - cr->logical_metadata_writes += task->task_metadata_writes; + cr->logical_immediate_writes += task->task_writes_counters_internal.task_immediate_writes; + cr->logical_deferred_writes += task->task_writes_counters_internal.task_deferred_writes; + cr->logical_invalidated_writes += task->task_writes_counters_internal.task_invalidated_writes; + cr->logical_metadata_writes += task->task_writes_counters_internal.task_metadata_writes; + cr->logical_immediate_writes_to_external += task->task_writes_counters_external.task_immediate_writes; + cr->logical_deferred_writes_to_external += task->task_writes_counters_external.task_deferred_writes; + cr->logical_invalidated_writes_to_external += task->task_writes_counters_external.task_invalidated_writes; + cr->logical_metadata_writes_to_external += task->task_writes_counters_external.task_metadata_writes; cr->cpu_ptime += task_cpu_ptime(task); task_update_cpu_time_qos_stats(task, cr->cpu_time_eqos, cr->cpu_time_rqos); +#if MONOTONIC + uint64_t counts[MT_CORE_NFIXED] = {}; + (void)mt_fixed_task_counts(task, counts); + cr->cpu_cycles += counts[MT_CORE_CYCLES]; +#if defined(MT_CORE_INSTRS) + cr->cpu_instructions += counts[MT_CORE_INSTRS]; +#endif /* defined(MT_CORE_INSTRS) */ +#endif /* MONOTONIC */ } /* remove the task from the coalition's list */ @@ -498,7 +721,7 @@ coalition_resource_usage_internal(coalition_t coal, struct coalition_resource_us } } - ledger_t sum_ledger = ledger_instantiate(task_ledger_template, LEDGER_CREATE_ACTIVE_ENTRIES); + ledger_t sum_ledger = ledger_instantiate(coalition_task_ledger_template, LEDGER_CREATE_ACTIVE_ENTRIES); if (sum_ledger == LEDGER_NULL) { return KERN_RESOURCE_SHORTAGE; } @@ -518,6 +741,10 @@ coalition_resource_usage_internal(coalition_t coal, struct coalition_resource_us uint64_t logical_deferred_writes = coal->r.logical_deferred_writes; uint64_t logical_invalidated_writes = coal->r.logical_invalidated_writes; uint64_t logical_metadata_writes = coal->r.logical_metadata_writes; + uint64_t logical_immediate_writes_to_external = coal->r.logical_immediate_writes_to_external; + uint64_t logical_deferred_writes_to_external = coal->r.logical_deferred_writes_to_external; + uint64_t logical_invalidated_writes_to_external = coal->r.logical_invalidated_writes_to_external; + uint64_t logical_metadata_writes_to_external = coal->r.logical_metadata_writes_to_external; int64_t cpu_time_billed_to_me = 0; int64_t cpu_time_billed_to_others = 0; int64_t energy_billed_to_me = 0; @@ -527,6 +754,9 @@ coalition_resource_usage_internal(coalition_t coal, struct coalition_resource_us memcpy(cpu_time_eqos, coal->r.cpu_time_eqos, sizeof(cpu_time_eqos)); uint64_t cpu_time_rqos[COALITION_NUM_THREAD_QOS_TYPES]; memcpy(cpu_time_rqos, coal->r.cpu_time_rqos, sizeof(cpu_time_rqos)); + uint64_t cpu_instructions = coal->r.cpu_instructions; + uint64_t cpu_cycles = coal->r.cpu_cycles; + /* * Add to that all the active tasks' ledgers. Tasks cannot deallocate * out from under us, since we hold the coalition lock. @@ -549,12 +779,25 @@ coalition_resource_usage_internal(coalition_t coal, struct coalition_resource_us #else energy += task_energy(task); #endif - logical_immediate_writes += task->task_immediate_writes; - logical_deferred_writes += task->task_deferred_writes; - logical_invalidated_writes += task->task_invalidated_writes; - logical_metadata_writes += task->task_metadata_writes; + logical_immediate_writes += task->task_writes_counters_internal.task_immediate_writes; + logical_deferred_writes += task->task_writes_counters_internal.task_deferred_writes; + logical_invalidated_writes += task->task_writes_counters_internal.task_invalidated_writes; + logical_metadata_writes += task->task_writes_counters_internal.task_metadata_writes; + logical_immediate_writes_to_external += task->task_writes_counters_external.task_immediate_writes; + logical_deferred_writes_to_external += task->task_writes_counters_external.task_deferred_writes; + logical_invalidated_writes_to_external += task->task_writes_counters_external.task_invalidated_writes; + logical_metadata_writes_to_external += task->task_writes_counters_external.task_metadata_writes; + cpu_ptime += task_cpu_ptime(task); task_update_cpu_time_qos_stats(task, cpu_time_eqos, cpu_time_rqos); +#if MONOTONIC + uint64_t counts[MT_CORE_NFIXED] = {}; + (void)mt_fixed_task_counts(task, counts); + cpu_cycles += counts[MT_CORE_CYCLES]; +#if defined(MT_CORE_INSTRS) + cpu_instructions += counts[MT_CORE_INSTRS]; +#endif /* defined(MT_CORE_INSTRS) */ +#endif /* MONOTONIC */ } kr = ledger_get_balance(sum_ledger, task_ledgers.cpu_time_billed_to_me, (int64_t *)&cpu_time_billed_to_me); @@ -620,9 +863,15 @@ coalition_resource_usage_internal(coalition_t coal, struct coalition_resource_us cru_out->logical_deferred_writes = logical_deferred_writes; cru_out->logical_invalidated_writes = logical_invalidated_writes; cru_out->logical_metadata_writes = logical_metadata_writes; + cru_out->logical_immediate_writes_to_external = logical_immediate_writes_to_external; + cru_out->logical_deferred_writes_to_external = logical_deferred_writes_to_external; + cru_out->logical_invalidated_writes_to_external = logical_invalidated_writes_to_external; + cru_out->logical_metadata_writes_to_external = logical_metadata_writes_to_external; cru_out->cpu_ptime = cpu_ptime; cru_out->cpu_time_eqos_len = COALITION_NUM_THREAD_QOS_TYPES; memcpy(cru_out->cpu_time_eqos, cpu_time_eqos, sizeof(cru_out->cpu_time_eqos)); + cru_out->cpu_cycles = cpu_cycles; + cru_out->cpu_instructions = cpu_instructions; ledger_dereference(sum_ledger); sum_ledger = LEDGER_NULL; @@ -776,7 +1025,7 @@ i_coal_jetsam_set_taskrole(coalition_t coal, task_t task, int role) return KERN_SUCCESS; } -static int +int i_coal_jetsam_get_taskrole(coalition_t coal, task_t task) { struct i_jetsam_coalition *cj; @@ -1176,7 +1425,7 @@ task_coalition_adjust_focal_count(task_t task, int count, uint32_t *new_count) return FALSE; } - *new_count = hw_atomic_add(&coal->focal_task_count, count); + *new_count = os_atomic_add(&coal->focal_task_count, count, relaxed); assert(*new_count != UINT32_MAX); return TRUE; } @@ -1200,7 +1449,7 @@ task_coalition_adjust_nonfocal_count(task_t task, int count, uint32_t *new_count return FALSE; } - *new_count = hw_atomic_add(&coal->nonfocal_task_count, count); + *new_count = os_atomic_add(&coal->nonfocal_task_count, count, relaxed); assert(*new_count != UINT32_MAX); return TRUE; } @@ -1672,6 +1921,8 @@ coalitions_init(void) init_task_ledgers(); + init_coalition_ledgers(); + for (i = 0, ctype = &s_coalition_types[0]; i < COALITION_NUM_TYPES; ctype++, i++) { /* verify the entry in the global coalition types array */ if (ctype->type != i || @@ -1735,47 +1986,38 @@ coalitions_get_list(int type, struct procinfo_coalinfo *coal_list, int list_sz) } /* - * Jetsam coalition interface - * + * Return the coaltion of the given type to which the task belongs. */ -boolean_t -coalition_is_leader(task_t task, int coal_type, coalition_t *coal) +coalition_t +task_get_coalition(task_t task, int coal_type) { coalition_t c; - boolean_t ret; - - if (coal) { /* handle the error cases gracefully */ - *coal = COALITION_NULL; - } - - if (!task) { - return FALSE; - } - if (coal_type > COALITION_TYPE_MAX) { - return FALSE; + if (task == NULL || coal_type > COALITION_TYPE_MAX) { + return COALITION_NULL; } c = task->coalition[coal_type]; - if (!c) { - return FALSE; - } + assert(c == COALITION_NULL || (int)c->type == coal_type); + return c; +} - assert((int)c->type == coal_type); +/* + * Report if the given task is the leader of the given jetsam coalition. + */ +boolean_t +coalition_is_leader(task_t task, coalition_t coal) +{ + boolean_t ret = FALSE; - coalition_lock(c); + if (coal != COALITION_NULL) { + coalition_lock(coal); - if (coal) { - *coal = c; - } + ret = (coal->type == COALITION_TYPE_JETSAM && coal->j.leader == task); - ret = FALSE; - if (c->type == COALITION_TYPE_JETSAM && c->j.leader == task) { - ret = TRUE; + coalition_unlock(coal); } - coalition_unlock(c); - return ret; } diff --git a/osfmk/kern/coalition.h b/osfmk/kern/coalition.h index 29da7719c..5afbf004b 100644 --- a/osfmk/kern/coalition.h +++ b/osfmk/kern/coalition.h @@ -79,6 +79,23 @@ void coalition_for_each_task(coalition_t coal, void *ctx, void coalition_set_efficient(coalition_t coal); +/* Coalition ledger */ +struct coalition_ledger_indices { + int logical_writes; +}; +void init_coalition_ledgers(void); +int coalition_ledger_set_logical_writes_limit(coalition_t coal, int64_t limit); +void coalition_io_monitor_ctl(struct coalition *coalition, uint32_t flags, int64_t limit); +ledger_t coalition_ledger_get_from_task(task_t task); +void coalition_io_rate_exceeded(int warning, const void *param0, __unused const void *param1); +void coalition_io_ledger_update(task_t task, int32_t flavor, boolean_t is_credit, uint32_t io_size); + +/* Max limit for coalition logical_writes ledger in MB. Setting to 16 TB */ +#define COALITION_MAX_LOGICAL_WRITES_LIMIT ((ledger_amount_t)(1ULL << 24)) +/* logical_writes ledger's refill time interval */ +#define COALITION_LEDGER_MONITOR_INTERVAL_SECS (24 * 60 * 60) + + typedef void (*coalition_iterate_fn_t)(void*, int, coalition_t); kern_return_t coalition_iterate_stackshot(coalition_iterate_fn_t callout, void *arg, uint32_t coalition_type); diff --git a/osfmk/kern/cpu_quiesce.c b/osfmk/kern/cpu_quiesce.c index bd04dc7da..57c43f5b4 100644 --- a/osfmk/kern/cpu_quiesce.c +++ b/osfmk/kern/cpu_quiesce.c @@ -80,7 +80,7 @@ static uint64_t cpu_checkin_last_commit; #define CPU_CHECKIN_MIN_INTERVAL_US 4000 /* 4ms */ #define CPU_CHECKIN_MIN_INTERVAL_MAX_US USEC_PER_SEC /* 1s */ static uint64_t cpu_checkin_min_interval; -uint32_t cpu_checkin_min_interval_us; +static uint32_t cpu_checkin_min_interval_us; #if __LP64__ static_assert(MAX_CPUS <= 32); @@ -134,6 +134,12 @@ cpu_quiescent_counter_set_min_interval_us(uint32_t new_value_us) cpu_checkin_min_interval = abstime; } +uint32_t +cpu_quiescent_counter_get_min_interval_us(void) +{ + return cpu_checkin_min_interval_us; +} + /* * Called when all running CPUs have checked in. @@ -151,7 +157,7 @@ cpu_quiescent_counter_commit(uint64_t ctime) cpu_checkin_last_commit = ctime; - old_state = os_atomic_and(&cpu_quiescing_checkin_state, ~CPU_CHECKIN_MASK, release); + old_state = os_atomic_andnot(&cpu_quiescing_checkin_state, CPU_CHECKIN_MASK, release); KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_QUIESCENT_COUNTER), old_gen, old_state, ctime, 0); } @@ -272,8 +278,8 @@ cpu_quiescent_counter_leave(uint64_t ctime) checkin_mask_t mask = cpu_checked_in_bit(cpuid) | cpu_expected_bit(cpuid); - checkin_mask_t orig_state = os_atomic_and_orig(&cpu_quiescing_checkin_state, - ~mask, acq_rel); + checkin_mask_t orig_state = os_atomic_andnot_orig(&cpu_quiescing_checkin_state, + mask, acq_rel); assert((orig_state & cpu_expected_bit(cpuid))); diff --git a/osfmk/kern/cpu_quiesce.h b/osfmk/kern/cpu_quiesce.h index 1c9537042..261669a2d 100644 --- a/osfmk/kern/cpu_quiesce.h +++ b/osfmk/kern/cpu_quiesce.h @@ -54,8 +54,8 @@ extern void cpu_quiescent_counter_ast(void); extern void cpu_quiescent_counter_init(void); /* use of these is guarded by the config */ -extern uint32_t cpu_checkin_min_interval_us; extern void cpu_quiescent_counter_set_min_interval_us(uint32_t new_value); +extern uint32_t cpu_quiescent_counter_get_min_interval_us(void); #else /* CONFIG_QUIESCE_COUNTER */ diff --git a/osfmk/kern/debug.c b/osfmk/kern/debug.c index dd7a28996..8578f687a 100644 --- a/osfmk/kern/debug.c +++ b/osfmk/kern/debug.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2016 Apple Inc. All rights reserved. + * Copyright (c) 2000-2019 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -73,6 +73,7 @@ #include #include #include +#include #include #include #include @@ -85,6 +86,8 @@ #include #if defined(__i386__) || defined(__x86_64__) +#include + #include #include #endif @@ -173,24 +176,27 @@ uint64_t debugger_panic_options = 0; const char *debugger_message = NULL; unsigned long debugger_panic_caller = 0; -void panic_trap_to_debugger(const char *panic_format_str, va_list *panic_args, unsigned int reason, void *ctx, - uint64_t panic_options_mask, void *panic_data, unsigned long panic_caller); -static void kdp_machine_reboot_type(unsigned int type); -__attribute__((noreturn)) void panic_spin_forever(void); +void panic_trap_to_debugger(const char *panic_format_str, va_list *panic_args, + unsigned int reason, void *ctx, uint64_t panic_options_mask, void *panic_data, + unsigned long panic_caller) __dead2; +static void kdp_machine_reboot_type(unsigned int type, uint64_t debugger_flags); +void panic_spin_forever(void) __dead2; extern kern_return_t do_stackshot(void); +extern void PE_panic_hook(const char*); +#if CONFIG_NONFATAL_ASSERTS int mach_assert = 1; +#endif #define NESTEDDEBUGGERENTRYMAX 5 +static unsigned int max_debugger_entry_count = NESTEDDEBUGGERENTRYMAX; #if CONFIG_EMBEDDED #define DEBUG_BUF_SIZE (4096) #define KDBG_TRACE_PANIC_FILENAME "/var/log/panic.trace" #else -/* - * EXTENDED_/DEBUG_BUF_SIZE can't grow without updates to SMC and iBoot to store larger panic logs on co-processor systems */ #define DEBUG_BUF_SIZE ((3 * PAGE_SIZE) + offsetof(struct macos_panic_header, mph_data)) -#define EXTENDED_DEBUG_BUF_SIZE 0x0013ff80 +/* EXTENDED_DEBUG_BUF_SIZE definition is now in debug.h */ static_assert(((EXTENDED_DEBUG_BUF_SIZE % PANIC_FLUSH_BOUNDARY) == 0), "Extended debug buf size must match SMC alignment requirements"); #define KDBG_TRACE_PANIC_FILENAME "/var/tmp/panic.trace" #endif @@ -257,6 +263,14 @@ int kext_assertions_enable = FALSE; #endif +/* + * Maintain the physically-contiguous carveout for the `phys_carveout_mb` + * boot-arg. + */ +SECURITY_READ_ONLY_LATE(vm_offset_t) phys_carveout = 0; +SECURITY_READ_ONLY_LATE(uintptr_t) phys_carveout_pa = 0; +SECURITY_READ_ONLY_LATE(size_t) phys_carveout_size = 0; + void panic_init(void) { @@ -269,9 +283,11 @@ panic_init(void) uuid_unparse_upper(*(uuid_t *)uuid, kernel_uuid_string); } +#if CONFIG_NONFATAL_ASSERTS if (!PE_parse_boot_argn("assertions", &mach_assert, sizeof(mach_assert))) { mach_assert = 1; } +#endif /* * Initialize the value of the debug boot-arg @@ -298,6 +314,11 @@ panic_init(void) #endif #endif /* CONFIG_EMBEDDED */ } + + if (!PE_parse_boot_argn("nested_panic_max", &max_debugger_entry_count, sizeof(max_debugger_entry_count))) { + max_debugger_entry_count = NESTEDDEBUGGERENTRYMAX; + } + #endif /* ((CONFIG_EMBEDDED && MACH_KDP) || defined(__x86_64__)) */ #if DEVELOPMENT || DEBUG @@ -342,6 +363,15 @@ extended_debug_log_init(void) debug_buf_size = (EXTENDED_DEBUG_BUF_SIZE - offsetof(struct macos_panic_header, mph_data)); extended_debug_log_enabled = TRUE; + + /* + * Insert a compiler barrier so we don't free the other panic stackshot buffer + * until after we've marked the new one as available + */ + __compiler_barrier(); + kmem_free(kernel_map, panic_stackshot_buf, panic_stackshot_buf_len); + panic_stackshot_buf = 0; + panic_stackshot_buf_len = 0; } #endif /* defined (__x86_64__) */ @@ -358,14 +388,64 @@ debug_log_init(void) debug_buf_ptr = debug_buf_base; debug_buf_size = gPanicSize - sizeof(struct embedded_panic_header); #else + kern_return_t kr = KERN_SUCCESS; bzero(panic_info, DEBUG_BUF_SIZE); assert(debug_buf_base != NULL); assert(debug_buf_ptr != NULL); assert(debug_buf_size != 0); + + /* + * We allocate a buffer to store a panic time stackshot. If we later discover that this is a + * system that supports flushing a stackshot via an extended debug log (see above), we'll free this memory + * as it's not necessary on this platform. This information won't be available until the IOPlatform has come + * up. + */ + kr = kmem_alloc(kernel_map, &panic_stackshot_buf, PANIC_STACKSHOT_BUFSIZE, VM_KERN_MEMORY_DIAG); + assert(kr == KERN_SUCCESS); + if (kr == KERN_SUCCESS) { + panic_stackshot_buf_len = PANIC_STACKSHOT_BUFSIZE; + } #endif } +void +phys_carveout_init(void) +{ + if (!PE_i_can_has_debugger(NULL)) { + return; + } + + unsigned int phys_carveout_mb = 0; + + if (!PE_parse_boot_argn("phys_carveout_mb", &phys_carveout_mb, + sizeof(phys_carveout_mb))) { + return; + } + if (phys_carveout_mb == 0) { + return; + } + + size_t size = 0; + if (os_mul_overflow(phys_carveout_mb, 1024 * 1024, &size)) { + printf("phys_carveout_mb size overflowed (%uMB)\n", + phys_carveout_mb); + return; + } + + kern_return_t kr = kmem_alloc_contig(kernel_map, &phys_carveout, size, + VM_MAP_PAGE_MASK(kernel_map), 0, 0, KMA_NOPAGEWAIT, + VM_KERN_MEMORY_DIAG); + if (kr != KERN_SUCCESS) { + printf("failed to allocate %uMB for phys_carveout_mb: %u\n", + phys_carveout_mb, (unsigned int)kr); + return; + } + + phys_carveout_pa = kvtophys(phys_carveout); + phys_carveout_size = size; +} + static void DebuggerLock() { @@ -373,7 +453,7 @@ DebuggerLock() int debugger_exp_cpu = DEBUGGER_NO_CPU; assert(ml_get_interrupts_enabled() == FALSE); - if (debugger_cpu == my_cpu) { + if (atomic_load(&debugger_cpu) == my_cpu) { return; } @@ -387,7 +467,7 @@ DebuggerLock() static void DebuggerUnlock() { - assert(debugger_cpu == cpu_number()); + assert(atomic_load_explicit(&debugger_cpu, memory_order_relaxed) == cpu_number()); /* * We don't do an atomic exchange here in case @@ -396,7 +476,7 @@ DebuggerUnlock() * lock so we can simply store DEBUGGER_NO_CPU and follow with * a barrier. */ - debugger_cpu = DEBUGGER_NO_CPU; + atomic_store(&debugger_cpu, DEBUGGER_NO_CPU); OSMemoryBarrier(); return; @@ -486,10 +566,12 @@ Assert( const char *expression ) { +#if CONFIG_NONFATAL_ASSERTS if (!mach_assert) { kprintf("%s:%d non-fatal Assertion: %s", file, line, expression); return; } +#endif panic_plain("%s:%d Assertion failed: %s", file, line, expression); } @@ -513,7 +595,7 @@ DebuggerWithContext(unsigned int reason, void *ctx, const char *message, CPUDEBUGGERCOUNT++; - if (CPUDEBUGGERCOUNT > NESTEDDEBUGGERENTRYMAX) { + if (CPUDEBUGGERCOUNT > max_debugger_entry_count) { static boolean_t in_panic_kprintf = FALSE; /* Notify any listeners that we've started a panic */ @@ -522,12 +604,12 @@ DebuggerWithContext(unsigned int reason, void *ctx, const char *message, if (!in_panic_kprintf) { in_panic_kprintf = TRUE; kprintf("Detected nested debugger entry count exceeding %d\n", - NESTEDDEBUGGERENTRYMAX); + max_debugger_entry_count); in_panic_kprintf = FALSE; } if (!panicDebugging) { - kdp_machine_reboot_type(kPEPanicRestartCPU); + kdp_machine_reboot_type(kPEPanicRestartCPU, debugger_options_mask); } panic_spin_forever(); @@ -689,8 +771,11 @@ void panic_with_thread_context(unsigned int reason, void *ctx, uint64_t debugger_options_mask, thread_t thread, const char *str, ...) { va_list panic_str_args; + __assert_only os_ref_count_t th_ref_count; assert_thread_magic(thread); + th_ref_count = os_ref_get_count(&thread->ref_count); + assertf(th_ref_count > 0, "panic_with_thread_context called with invalid thread %p with refcount %u", thread, th_ref_count); /* Take a reference on the thread so it doesn't disappear by the time we try to backtrace it */ thread_reference(thread); @@ -718,17 +803,12 @@ panic_trap_to_debugger(const char *panic_format_str, va_list *panic_args, unsign if (ml_wants_panic_trap_to_debugger()) { ml_panic_trap_to_debugger(panic_format_str, panic_args, reason, ctx, panic_options_mask, panic_caller); - - /* - * This should not return, but we return here for the tail call - * as it simplifies the backtrace. - */ - return; + __builtin_trap(); } CPUDEBUGGERCOUNT++; - if (CPUDEBUGGERCOUNT > NESTEDDEBUGGERENTRYMAX) { + if (CPUDEBUGGERCOUNT > max_debugger_entry_count) { static boolean_t in_panic_kprintf = FALSE; /* Notify any listeners that we've started a panic */ @@ -737,12 +817,12 @@ panic_trap_to_debugger(const char *panic_format_str, va_list *panic_args, unsign if (!in_panic_kprintf) { in_panic_kprintf = TRUE; kprintf("Detected nested debugger entry count exceeding %d\n", - NESTEDDEBUGGERENTRYMAX); + max_debugger_entry_count); in_panic_kprintf = FALSE; } if (!panicDebugging) { - kdp_machine_reboot_type(kPEPanicRestartCPU); + kdp_machine_reboot_type(kPEPanicRestartCPU, panic_options_mask); } panic_spin_forever(); @@ -752,11 +832,7 @@ panic_trap_to_debugger(const char *panic_format_str, va_list *panic_args, unsign DEBUGGER_DEBUGGING_NESTED_PANIC_IF_REQUESTED((panic_options_mask & DEBUGGER_OPTION_RECURPANIC_ENTRY)); #endif -#if CONFIG_EMBEDDED - if (PE_arm_debug_panic_hook) { - PE_arm_debug_panic_hook(panic_format_str); - } -#endif + PE_panic_hook(panic_format_str); #if defined (__x86_64__) plctrace_disable(); @@ -805,11 +881,11 @@ panic_trap_to_debugger(const char *panic_format_str, va_list *panic_args, unsign * Not reached. */ panic_stop(); + __builtin_unreachable(); } -__attribute__((noreturn)) void -panic_spin_forever() +panic_spin_forever(void) { paniclog_append_noflush("\nPlease go to https://panic.apple.com to report this panic\n"); @@ -818,17 +894,21 @@ panic_spin_forever() } static void -kdp_machine_reboot_type(unsigned int type) +kdp_machine_reboot_type(unsigned int type, uint64_t debugger_flags) { printf("Attempting system restart..."); - PEHaltRestart(type); + if ((type == kPEPanicRestartCPU) && (debugger_flags & DEBUGGER_OPTION_SKIP_PANICEND_CALLOUTS)) { + PEHaltRestart(kPEPanicRestartCPUNoPanicEndCallouts); + } else { + PEHaltRestart(type); + } halt_all_cpus(TRUE); } void kdp_machine_reboot(void) { - kdp_machine_reboot_type(kPEPanicRestartCPU); + kdp_machine_reboot_type(kPEPanicRestartCPU, 0); } /* @@ -930,7 +1010,7 @@ debugger_collect_diagnostics(unsigned int exception, unsigned int code, unsigned /* DEBUGGER_OPTION_PANICLOGANDREBOOT is used for two finger resets on embedded so we get a paniclog */ if (debugger_panic_options & DEBUGGER_OPTION_PANICLOGANDREBOOT) { - PEHaltRestart(kPEPanicRestartCPU); + PEHaltRestart(kPEPanicRestartCPUNoCallouts); } } @@ -942,14 +1022,14 @@ debugger_collect_diagnostics(unsigned int exception, unsigned int code, unsigned */ if ((debugger_panic_options & DEBUGGER_OPTION_SKIP_LOCAL_COREDUMP) && (debug_boot_arg & DB_REBOOT_POST_CORE)) { - kdp_machine_reboot_type(kPEPanicRestartCPU); + kdp_machine_reboot_type(kPEPanicRestartCPU, debugger_panic_options); } /* * Consider generating a local corefile if the infrastructure is configured * and we haven't disabled on-device coredumps. */ - if (!(debug_boot_arg & DB_DISABLE_LOCAL_CORE)) { + if (on_device_corefile_enabled()) { if (!kdp_has_polled_corefile()) { if (debug_boot_arg & (DB_KERN_DUMP_ON_PANIC | DB_KERN_DUMP_ON_NMI)) { paniclog_append_noflush("skipping local kernel core because core file could not be opened prior to panic (error : 0x%x)", @@ -992,13 +1072,13 @@ debugger_collect_diagnostics(unsigned int exception, unsigned int code, unsigned */ if ((debug_boot_arg & DB_REBOOT_POST_CORE) && ((ret == 0) || (debugger_panic_options & DEBUGGER_OPTION_ATTEMPTCOREDUMPANDREBOOT))) { - kdp_machine_reboot_type(kPEPanicRestartCPU); + kdp_machine_reboot_type(kPEPanicRestartCPU, debugger_panic_options); } } } if (debug_boot_arg & DB_REBOOT_ALWAYS) { - kdp_machine_reboot_type(kPEPanicRestartCPU); + kdp_machine_reboot_type(kPEPanicRestartCPU, debugger_panic_options); } /* If KDP is configured, try to trap to the debugger */ @@ -1025,7 +1105,7 @@ debugger_collect_diagnostics(unsigned int exception, unsigned int code, unsigned #endif /* CONFIG_KDP_INTERACTIVE_DEBUGGING */ if (!panicDebugging) { - kdp_machine_reboot_type(kPEPanicRestartCPU); + kdp_machine_reboot_type(kPEPanicRestartCPU, debugger_panic_options); } panic_spin_forever(); @@ -1372,7 +1452,24 @@ panic_display_disk_errors(void) panic_disk_error_description[sizeof(panic_disk_error_description) - 1] = '\0'; paniclog_append_noflush("Root disk errors: \"%s\"\n", panic_disk_error_description); } -}; +} + +static void +panic_display_shutdown_status(void) +{ +#if defined(__i386__) || defined(__x86_64__) + paniclog_append_noflush("System shutdown begun: %s\n", IOPMRootDomainGetWillShutdown() ? "YES" : "NO"); + if (gIOPolledCoreFileMode == kIOPolledCoreFileModeNotInitialized) { + paniclog_append_noflush("Panic diags file unavailable, panic occurred prior to initialization\n"); + } else if (gIOPolledCoreFileMode != kIOPolledCoreFileModeDisabled) { + /* + * If we haven't marked the corefile as explicitly disabled, and we've made it past initialization, then we know the current + * system was configured to use disk based diagnostics at some point. + */ + paniclog_append_noflush("Panic diags file available: %s (0x%x)\n", (gIOPolledCoreFileMode != kIOPolledCoreFileModeClosed) ? "YES" : "NO", kdp_polled_corefile_error()); + } +#endif +} extern const char version[]; extern char osversion[]; @@ -1401,6 +1498,7 @@ panic_display_system_configuration(boolean_t launchd_exit) } panic_display_model_name(); panic_display_disk_errors(); + panic_display_shutdown_status(); if (!launchd_exit) { panic_display_uptime(); panic_display_zprint(); @@ -1528,7 +1626,8 @@ kern_feature_override(uint32_t fmask) { if (kern_feature_overrides == 0) { uint32_t fdisables = 0; - /* Expected to be first invoked early, in a single-threaded + /* + * Expected to be first invoked early, in a single-threaded * environment */ if (PE_parse_boot_argn("validation_disables", &fdisables, sizeof(fdisables))) { @@ -1540,3 +1639,32 @@ kern_feature_override(uint32_t fmask) } return (kern_feature_overrides & fmask) == fmask; } + +boolean_t +on_device_corefile_enabled(void) +{ + assert(debug_boot_arg_inited); +#if CONFIG_KDP_INTERACTIVE_DEBUGGING + if ((debug_boot_arg != 0) && !(debug_boot_arg & DB_DISABLE_LOCAL_CORE)) { + return TRUE; + } +#endif + return FALSE; +} + +boolean_t +panic_stackshot_to_disk_enabled(void) +{ + assert(debug_boot_arg_inited); +#if defined(__x86_64__) + if (PEGetCoprocessorVersion() < kCoprocessorVersion2) { + /* Only enabled on pre-Gibraltar machines where it hasn't been disabled explicitly */ + if ((debug_boot_arg != 0) && (debug_boot_arg & DB_DISABLE_STACKSHOT_TO_DISK)) { + return FALSE; + } + + return TRUE; + } +#endif + return FALSE; +} diff --git a/osfmk/kern/debug.h b/osfmk/kern/debug.h index 7e82f1b34..57effae1c 100644 --- a/osfmk/kern/debug.h +++ b/osfmk/kern/debug.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2016 Apple Inc. All rights reserved. + * Copyright (c) 2000-2019 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -217,8 +217,8 @@ enum micro_snapshot_flags { * Flags used in the following assortment of snapshots. */ enum generic_snapshot_flags { - kUser64_p = 0x1, - kKernel64_p = 0x2 + kUser64_p = 0x1, /* Userspace uses 64 bit pointers */ + kKernel64_p = 0x2 /* The kernel uses 64 bit pointers */ }; #define VM_PRESSURE_TIME_WINDOW 5 /* seconds */ @@ -270,6 +270,7 @@ enum { #define KF_MATV_OVRD (0x8) #define KF_STACKSHOT_OVRD (0x10) #define KF_COMPRSV_OVRD (0x20) +#define KF_INTERRUPT_MASKED_DEBUG_OVRD (0x40) boolean_t kern_feature_override(uint32_t fmask); @@ -351,6 +352,35 @@ struct macos_panic_header { #define MACOS_PANIC_HEADER_FLAG_COREDUMP_FAILED 0x200 #define MACOS_PANIC_HEADER_FLAG_STACKSHOT_KERNEL_ONLY 0x400 +/* + * Any change to the below structure should mirror the structure defined in MacEFIFirmware + * (and vice versa) + */ + +struct efi_aurr_panic_header { + uint32_t efi_aurr_magic; + uint32_t efi_aurr_crc; + uint32_t efi_aurr_version; + uint32_t efi_aurr_reset_cause; + uint32_t efi_aurr_reset_log_offset; + uint32_t efi_aurr_reset_log_len; + char efi_aurr_panic_data[]; +} __attribute__((packed)); + +/* + * EXTENDED_/DEBUG_BUF_SIZE can't grow without updates to SMC and iBoot to store larger panic logs on co-processor systems + */ +#define EXTENDED_DEBUG_BUF_SIZE 0x0013ff80 + +#define EFI_AURR_PANIC_STRING_MAX_LEN 112 +#define EFI_AURR_EXTENDED_LOG_SIZE (EXTENDED_DEBUG_BUF_SIZE - sizeof(struct efi_aurr_panic_header) - EFI_AURR_PANIC_STRING_MAX_LEN) + +struct efi_aurr_extended_panic_log { + char efi_aurr_extended_log_buf[EFI_AURR_EXTENDED_LOG_SIZE]; + uint32_t efi_aurr_log_tail; /* Circular buffer indices */ + uint32_t efi_aurr_log_head; /* ditto.. */ +} __attribute__((packed)); + #endif /* __APPLE_API_UNSTABLE */ #endif /* __APPLE_API_PRIVATE */ @@ -358,7 +388,8 @@ struct macos_panic_header { __BEGIN_DECLS -extern void panic(const char *string, ...) __printflike(1, 2); +__abortlike __printflike(1, 2) +extern void panic(const char *string, ...); __END_DECLS @@ -445,6 +476,7 @@ enum { * release bridgeOS. */ #define DB_REBOOT_ALWAYS 0x100000 /* Don't wait for debugger connection */ +#define DB_DISABLE_STACKSHOT_TO_DISK 0x200000 /* Disable writing stackshot to local disk */ /* * Values for a 64-bit mask that's passed to the debugger. @@ -460,6 +492,8 @@ enum { #define DEBUGGER_OPTION_SKIP_LOCAL_COREDUMP 0x80ULL /* don't try to save local coredumps for this panic */ #define DEBUGGER_OPTION_ATTEMPTCOREDUMPANDREBOOT 0x100ULL /* attempt to save coredump. always reboot */ #define DEBUGGER_INTERNAL_OPTION_THREAD_BACKTRACE 0x200ULL /* backtrace the specified thread in the paniclog (x86 only) */ +#define DEBUGGER_OPTION_PRINT_CPU_USAGE_PANICLOG 0x400ULL /* print extra CPU usage data in the panic log */ +#define DEBUGGER_OPTION_SKIP_PANICEND_CALLOUTS 0x800ULL /* (bridgeOS) skip the kPEPanicEnd callouts -- don't wait for x86 to finish sending panic data */ #define DEBUGGER_INTERNAL_OPTIONS_MASK (DEBUGGER_INTERNAL_OPTION_THREAD_BACKTRACE) @@ -472,12 +506,20 @@ __BEGIN_DECLS #define PANIC_LOCATION __FILE__ ":" LINE_NUMBER(__LINE__) #if CONFIG_EMBEDDED -#define panic(ex, ...) (panic)(# ex, ## __VA_ARGS__) +#define panic(ex, ...) ({ \ + __asm__("" ::: "memory"); \ + (panic)(# ex, ## __VA_ARGS__); \ + }) #else -#define panic(ex, ...) (panic)(# ex "@" PANIC_LOCATION, ## __VA_ARGS__) +#define panic(ex, ...) ({ \ + __asm__("" ::: "memory"); \ + (panic)(# ex "@" PANIC_LOCATION, ## __VA_ARGS__); \ + }) #endif -void panic_with_options(unsigned int reason, void *ctx, uint64_t debugger_options_mask, const char *str, ...); +__abortlike __printflike(4, 5) +void panic_with_options(unsigned int reason, void *ctx, + uint64_t debugger_options_mask, const char *str, ...); void Debugger(const char * message); void populate_model_name(char *); @@ -497,7 +539,9 @@ __END_DECLS #if defined (__x86_64__) struct thread; -void panic_with_thread_context(unsigned int reason, void *ctx, uint64_t debugger_options_mask, struct thread* th, const char *str, ...); +__abortlike __printflike(5, 6) +void panic_with_thread_context(unsigned int reason, void *ctx, + uint64_t debugger_options_mask, struct thread* th, const char *str, ...); #endif /* limit the max size to a reasonable length */ @@ -535,6 +579,19 @@ extern "C" { kern_return_t stack_snapshot_from_kernel(int pid, void *buf, uint32_t size, uint32_t flags, uint64_t delta_since_timestamp, unsigned *bytes_traced); + +/* + * Returns whether on device corefiles are enabled based on the build + * and boot configuration. + */ +boolean_t on_device_corefile_enabled(void); + +/* + * Returns whether panic stackshot to disk is enabled based on the build + * and boot configuration. + */ +boolean_t panic_stackshot_to_disk_enabled(void); + #ifdef __cplusplus } #endif @@ -542,11 +599,16 @@ stack_snapshot_from_kernel(int pid, void *buf, uint32_t size, uint32_t flags, #if !CONFIG_EMBEDDED extern char debug_buf[]; extern boolean_t coprocessor_paniclog_flush; -extern boolean_t extended_debug_log_enabled;; +extern boolean_t extended_debug_log_enabled; #endif /* !CONFIG_EMBEDDED */ extern char *debug_buf_base; +#if defined(XNU_TARGET_OS_BRIDGE) +extern uint64_t macos_panic_base; +extern unsigned int macos_panic_size; +#endif /* defined(XNU_TARGET_OS_BRIDGE) */ + extern char kernel_uuid_string[]; extern char panic_disk_error_description[]; extern size_t panic_disk_error_description_size; @@ -587,10 +649,22 @@ extern const char *debugger_panic_str; extern char *debug_buf_ptr; extern unsigned int debug_buf_size; -extern void debug_log_init(void); -extern void debug_putc(char); +extern void debug_log_init(void); +extern void debug_putc(char); + +extern void panic_init(void); + +/* + * Initialize the physical carveout requested with the `phys_carveout_mb` + * boot-arg. This should only be called at kernel startup, when physically + * contiguous pages are plentiful. + */ +extern void phys_carveout_init(void); + +extern uintptr_t phys_carveout_pa; +extern size_t phys_carveout_size; + -extern void panic_init(void); #if defined (__x86_64__) extern void extended_debug_log_init(void); @@ -598,12 +672,12 @@ extern void extended_debug_log_init(void); int packA(char *inbuf, uint32_t length, uint32_t buflen); void unpackA(char *inbuf, uint32_t length); -#if DEVELOPMENT || DEBUG #define PANIC_STACKSHOT_BUFSIZE (1024 * 1024) extern uintptr_t panic_stackshot_buf; +extern size_t panic_stackshot_buf_len; + extern size_t panic_stackshot_len; -#endif /* DEVELOPMENT || DEBUG */ #endif /* defined (__x86_64__) */ void SavePanicInfo(const char *message, void *panic_data, uint64_t panic_options); diff --git a/osfmk/kern/exc_guard.h b/osfmk/kern/exc_guard.h index 18ec56256..1084cdf0d 100644 --- a/osfmk/kern/exc_guard.h +++ b/osfmk/kern/exc_guard.h @@ -56,9 +56,9 @@ */ #define EXC_GUARD_DECODE_GUARD_TYPE(code) \ - (((code) >> 61) & 0x7ull) + ((((uint64_t)(code)) >> 61) & 0x7ull) #define EXC_GUARD_DECODE_GUARD_FLAVOR(code) \ - (((code) >> 32) & 0x1fffffff) + ((((uint64_t)(code)) >> 32) & 0x1fffffff) #define EXC_GUARD_DECODE_GUARD_TARGET(code) \ ((uint32_t)(code)) diff --git a/osfmk/kern/exception.c b/osfmk/kern/exception.c index d6d6ffbf2..c059f1c50 100644 --- a/osfmk/kern/exception.c +++ b/osfmk/kern/exception.c @@ -92,7 +92,7 @@ #include -extern int panic_on_exception_triage; +bool panic_on_exception_triage = false; unsigned long c_thr_exc_raise = 0; unsigned long c_thr_exc_raise_state = 0; @@ -123,6 +123,23 @@ kern_return_t bsd_exception( mach_msg_type_number_t codeCnt); #endif /* MACH_BSD */ +/* + * Routine: exception_init + * Purpose: + * Global initialization of state for exceptions. + * Conditions: + * None. + */ +void +exception_init(void) +{ + int tmp = 0; + + if (PE_parse_boot_argn("-panic_on_exception_triage", &tmp, sizeof(tmp))) { + panic_on_exception_triage = true; + } +} + /* * Routine: exception_deliver * Purpose: @@ -204,7 +221,7 @@ exception_deliver( lck_mtx_unlock(mutex); code64 = (behavior & MACH_EXCEPTION_CODES); - behavior &= ~MACH_EXCEPTION_CODES; + behavior &= ~MACH_EXCEPTION_MASK; if (!code64) { small_code[0] = CAST_DOWN_EXPLICIT(exception_data_type_t, code[0]); @@ -230,17 +247,12 @@ exception_deliver( #endif if (behavior != EXCEPTION_STATE) { - if (thread != current_thread() || exception == EXC_CORPSE_NOTIFY) { - task_reference(task); - task_port = convert_task_to_port(task); - /* task ref consumed */ - thread_reference(thread); - thread_port = convert_thread_to_port(thread); - /* thread ref consumed */ - } else { - task_port = retrieve_task_self_fast(thread->task); - thread_port = retrieve_thread_self_fast(thread); - } + task_reference(task); + task_port = convert_task_to_port(task); + /* task ref consumed */ + thread_reference(thread); + thread_port = convert_thread_to_port(thread); + /* thread ref consumed */ } switch (behavior) { diff --git a/osfmk/kern/exception.h b/osfmk/kern/exception.h index 0f5a81eff..163994656 100644 --- a/osfmk/kern/exception.h +++ b/osfmk/kern/exception.h @@ -50,6 +50,9 @@ struct exception_action { struct label *label; /* MAC label associated with action */ }; +/* Initialize global state needed for exceptions. */ +extern void exception_init(void); + /* Make an up-call to a thread's exception server */ extern kern_return_t exception_triage( exception_type_t exception, diff --git a/osfmk/kern/hibernate.c b/osfmk/kern/hibernate.c index 9e0cd1e96..71ef32fae 100644 --- a/osfmk/kern/hibernate.c +++ b/osfmk/kern/hibernate.c @@ -60,13 +60,14 @@ hibernate_alloc_page_lists( page_list = hibernate_page_list_allocate(TRUE); if (!page_list) { + HIBLOG("%s: failed for page_list\n", __FUNCTION__); retval = KERN_RESOURCE_SHORTAGE; goto done; } page_list_wired = hibernate_page_list_allocate(FALSE); if (!page_list_wired) { kfree(page_list, page_list->list_size); - + HIBLOG("%s: failed for page_list_wired\n", __FUNCTION__); retval = KERN_RESOURCE_SHORTAGE; goto done; } @@ -74,7 +75,7 @@ hibernate_alloc_page_lists( if (!page_list_pal) { kfree(page_list, page_list->list_size); kfree(page_list_wired, page_list_wired->list_size); - + HIBLOG("%s: failed for page_list_pal\n", __FUNCTION__); retval = KERN_RESOURCE_SHORTAGE; goto done; } diff --git a/osfmk/kern/host.c b/osfmk/kern/host.c index 3d3e853ef..e336fcc09 100644 --- a/osfmk/kern/host.c +++ b/osfmk/kern/host.c @@ -95,6 +95,9 @@ #include #include +#include // IOTaskHasEntitlement +#include // DriverKit entitlement strings + #if CONFIG_ATM #include @@ -340,11 +343,19 @@ host_info(host_t host, host_flavor_t flavor, host_info_t info, mach_msg_type_num user_arch_info = (host_preferred_user_arch_t)info; #if defined(PREFERRED_USER_CPU_TYPE) && defined(PREFERRED_USER_CPU_SUBTYPE) - user_arch_info->cpu_type = PREFERRED_USER_CPU_TYPE; - user_arch_info->cpu_subtype = PREFERRED_USER_CPU_SUBTYPE; + cpu_type_t preferred_cpu_type; + cpu_subtype_t preferred_cpu_subtype; + if (!PE_get_default("kern.preferred_cpu_type", &preferred_cpu_type, sizeof(cpu_type_t))) { + preferred_cpu_type = PREFERRED_USER_CPU_TYPE; + } + if (!PE_get_default("kern.preferred_cpu_subtype", &preferred_cpu_subtype, sizeof(cpu_subtype_t))) { + preferred_cpu_subtype = PREFERRED_USER_CPU_SUBTYPE; + } + user_arch_info->cpu_type = preferred_cpu_type; + user_arch_info->cpu_subtype = preferred_cpu_subtype; #else - int master_id = master_processor->cpu_id; - user_arch_info->cpu_type = slot_type(master_id); + int master_id = master_processor->cpu_id; + user_arch_info->cpu_type = slot_type(master_id); user_arch_info->cpu_subtype = slot_subtype(master_id); #endif @@ -1314,6 +1325,10 @@ host_set_special_port(host_priv_t host_priv, int id, ipc_port_t port) return KERN_INVALID_ARGUMENT; } + if (task_is_driver(current_task())) { + return KERN_NO_ACCESS; + } + #if CONFIG_MACF if (mac_task_check_set_host_special_port(current_task(), id, port) != 0) { return KERN_NO_ACCESS; @@ -1341,6 +1356,17 @@ host_get_special_port(host_priv_t host_priv, __unused int node, int id, ipc_port return KERN_INVALID_ARGUMENT; } + task_t task = current_task(); + if (task && task_is_driver(task) && id > HOST_MAX_SPECIAL_KERNEL_PORT) { + /* allow HID drivers to get the sysdiagnose port for keychord handling */ + if (IOTaskHasEntitlement(task, kIODriverKitHIDFamilyEventServiceEntitlementKey) && + id == HOST_SYSDIAGNOSE_PORT) { + goto get_special_port; + } + return KERN_NO_ACCESS; + } + +get_special_port: host_lock(host_priv); port = realhost.special[id]; *portp = ipc_port_copy_send(port); diff --git a/osfmk/kern/host.h b/osfmk/kern/host.h index 480eb4bf8..8ada4462a 100644 --- a/osfmk/kern/host.h +++ b/osfmk/kern/host.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2005 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2019 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -77,7 +77,7 @@ #include struct host { - decl_lck_mtx_data(, lock) /* lock to protect exceptions */ + decl_lck_mtx_data(, lock); /* lock to protect exceptions */ ipc_port_t special[HOST_MAX_SPECIAL_PORT + 1]; struct exception_action exc_actions[EXC_TYPES_COUNT]; }; diff --git a/osfmk/kern/host_notify.c b/osfmk/kern/host_notify.c index 27c8bc750..dfb2703b4 100644 --- a/osfmk/kern/host_notify.c +++ b/osfmk/kern/host_notify.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2003-2009 Apple Inc. All rights reserved. + * Copyright (c) 2003-2019 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -37,7 +37,7 @@ #include "mach/host_notify_reply.h" -decl_lck_mtx_data(, host_notify_lock) +decl_lck_mtx_data(, host_notify_lock); lck_mtx_ext_t host_notify_lock_ext; lck_grp_t host_notify_lock_grp; diff --git a/osfmk/kern/hv_support.c b/osfmk/kern/hv_support.c index 683076b2f..74a06ea76 100644 --- a/osfmk/kern/hv_support.c +++ b/osfmk/kern/hv_support.c @@ -26,6 +26,7 @@ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ +#include #include #include #include @@ -230,3 +231,9 @@ hv_thread_trap(uint64_t index, uint64_t arg) { return HV_TRAP_DISPATCH(HV_THREAD_TRAP, index, hv_get_thread_target(), arg); } + +boolean_t +hv_ast_pending(void) +{ + return current_cpu_datap()->cpu_pending_ast & (AST_APC | AST_BSD); +} diff --git a/osfmk/kern/hv_support.h b/osfmk/kern/hv_support.h index 72d5bd2cf..a945a18b9 100644 --- a/osfmk/kern/hv_support.h +++ b/osfmk/kern/hv_support.h @@ -80,6 +80,7 @@ extern void hv_release_callbacks(void); extern void hv_suspend(void); extern kern_return_t hv_task_trap(uint64_t index, uint64_t arg); extern kern_return_t hv_thread_trap(uint64_t index, uint64_t arg); +extern boolean_t hv_ast_pending(void); #if defined(__cplusplus) } diff --git a/osfmk/kern/ipc_clock.c b/osfmk/kern/ipc_clock.c index 502f876a4..800c7b857 100644 --- a/osfmk/kern/ipc_clock.c +++ b/osfmk/kern/ipc_clock.c @@ -194,7 +194,7 @@ port_name_to_clock( if (ipc_port_translate_send(space, clock_name, &port) != KERN_SUCCESS) { return clock; } - if (ip_active(port) && (ip_kotype(port) == IKOT_CLOCK)) { + if (ip_kotype(port) == IKOT_CLOCK) { clock = (clock_t) port->ip_kobject; } ip_unlock(port); diff --git a/osfmk/kern/ipc_host.c b/osfmk/kern/ipc_host.c index c3b4a4516..2b1b29008 100644 --- a/osfmk/kern/ipc_host.c +++ b/osfmk/kern/ipc_host.c @@ -106,32 +106,17 @@ ipc_host_init(void) /* * Allocate and set up the two host ports. */ - port = ipc_port_alloc_kernel(); - if (port == IP_NULL) { - panic("ipc_host_init"); - } - - ipc_kobject_set(port, (ipc_kobject_t) &realhost, IKOT_HOST_SECURITY); - kernel_set_special_port(&realhost, HOST_SECURITY_PORT, - ipc_port_make_send(port)); - - port = ipc_port_alloc_kernel(); - if (port == IP_NULL) { - panic("ipc_host_init"); - } - - ipc_kobject_set(port, (ipc_kobject_t) &realhost, IKOT_HOST); - kernel_set_special_port(&realhost, HOST_PORT, - ipc_port_make_send(port)); + port = ipc_kobject_alloc_port((ipc_kobject_t) &realhost, IKOT_HOST_SECURITY, + IPC_KOBJECT_ALLOC_MAKE_SEND); + kernel_set_special_port(&realhost, HOST_SECURITY_PORT, port); - port = ipc_port_alloc_kernel(); - if (port == IP_NULL) { - panic("ipc_host_init"); - } + port = ipc_kobject_alloc_port((ipc_kobject_t) &realhost, IKOT_HOST, + IPC_KOBJECT_ALLOC_MAKE_SEND); + kernel_set_special_port(&realhost, HOST_PORT, port); - ipc_kobject_set(port, (ipc_kobject_t) &realhost, IKOT_HOST_PRIV); - kernel_set_special_port(&realhost, HOST_PRIV_PORT, - ipc_port_make_send(port)); + port = ipc_kobject_alloc_port((ipc_kobject_t) &realhost, IKOT_HOST_PRIV, + IPC_KOBJECT_ALLOC_MAKE_SEND); + kernel_set_special_port(&realhost, HOST_PRIV_PORT, port); /* the rest of the special ports will be set up later */ @@ -297,7 +282,7 @@ convert_port_to_host( if (ip_kotype(port) == IKOT_HOST || ip_kotype(port) == IKOT_HOST_PRIV) { host = (host_t) port->ip_kobject; - assert(ip_active(port)); + require_ip_active(port); } } return host; @@ -583,7 +568,7 @@ host_set_exception_ports( } if (IP_VALID(new_port)) { - switch (new_behavior & ~MACH_EXCEPTION_CODES) { + switch (new_behavior & ~MACH_EXCEPTION_MASK) { case EXCEPTION_DEFAULT: case EXCEPTION_STATE: case EXCEPTION_STATE_IDENTITY: diff --git a/osfmk/kern/ipc_kobject.c b/osfmk/kern/ipc_kobject.c index 2d63117cf..d2e0c1746 100644 --- a/osfmk/kern/ipc_kobject.c +++ b/osfmk/kern/ipc_kobject.c @@ -101,6 +101,7 @@ #include #endif #include +#include #include #include @@ -110,6 +111,10 @@ #include +#if CONFIG_ARCADE +#include +#endif + #if CONFIG_AUDIT #include #endif @@ -128,6 +133,11 @@ #include #include #include + +#if CONFIG_ARCADE +#include +#endif /* CONFIG_ARCADE */ + #include #include #include @@ -195,6 +205,7 @@ const struct mig_subsystem *mig_e[] = { (const struct mig_subsystem *)&mach_voucher_subsystem, (const struct mig_subsystem *)&mach_voucher_attr_control_subsystem, (const struct mig_subsystem *)&memory_entry_subsystem, + (const struct mig_subsystem *)&task_restartable_subsystem, #if XK_PROXY (const struct mig_subsystem *)&do_uproxy_xk_uproxy_subsystem, @@ -207,6 +218,9 @@ const struct mig_subsystem *mig_e[] = { #endif /* MCMSG && iPSC860 */ (const struct mig_subsystem *)&catch_exc_subsystem, (const struct mig_subsystem *)&catch_mach_exc_subsystem, +#if CONFIG_ARCADE + (const struct mig_subsystem *)&arcade_register_subsystem, +#endif }; void @@ -272,7 +286,6 @@ ipc_kobject_server( mach_msg_size_t reply_size; ipc_kmsg_t reply; kern_return_t kr; - ipc_port_t *destp; ipc_port_t replyp = IPC_PORT_NULL; mach_msg_format_0_trailer_t *trailer; mig_hash_t *ptr; @@ -280,9 +293,25 @@ ipc_kobject_server( uint32_t exec_token; boolean_t exec_token_changed = FALSE; int request_msgh_id = request->ikm_header->msgh_id; + natural_t ikot; + ipc_port_t port; + reply = NULL; + port = request->ikm_header->msgh_remote_port; + if (IP_VALID(port)) { + ikot = ip_kotype(port); + } else { + ikot = IKOT_UNKNOWN; + } + if (ikot == IKOT_UEXT_OBJECT) { + kr = uext_server(request, &reply); + if ((MIG_NO_REPLY == kr) || (KERN_SUCCESS == kr)) { + ipc_kmsg_trace_send(request, option); + goto msgdone; + } + } /* - * Find out corresponding mig_hash entry if any + * Find corresponding mig_hash entry if any */ { unsigned int i = (unsigned int)MIG_HASH(request_msgh_id); @@ -297,7 +326,7 @@ ipc_kobject_server( reply_size = mig_reply_size; } else { reply_size = ptr->size; -#if MACH_COUNTER +#if MACH_COUNTERS ptr->callcount++; #endif } @@ -353,8 +382,7 @@ ipc_kobject_server( * Check if the port is a task port, if its a task port then * snapshot the task exec token before the mig routine call. */ - ipc_port_t port = request->ikm_header->msgh_remote_port; - if (IP_VALID(port) && ip_kotype(port) == IKOT_TASK) { + if (ikot == IKOT_TASK) { task = convert_port_to_task_with_exec_token(port, &exec_token); } @@ -386,31 +414,39 @@ ipc_kobject_server( kernel_task->messages_sent++; } + if (!(reply->ikm_header->msgh_bits & MACH_MSGH_BITS_COMPLEX) && + ((mig_reply_error_t *) reply->ikm_header)->RetCode != KERN_SUCCESS) { + kr = ((mig_reply_error_t *) reply->ikm_header)->RetCode; + } else { + kr = KERN_SUCCESS; + } + +msgdone: /* * Destroy destination. The following code differs from * ipc_object_destroy in that we release the send-once * right instead of generating a send-once notification - * (which would bring us here again, creating a loop). + * (which would bring us here again, creating a loop). * It also differs in that we only expect send or * send-once rights, never receive rights. * * We set msgh_remote_port to IP_NULL so that the kmsg * destroy routines don't try to destroy the port twice. */ - destp = (ipc_port_t *) &request->ikm_header->msgh_remote_port; switch (MACH_MSGH_BITS_REMOTE(request->ikm_header->msgh_bits)) { case MACH_MSG_TYPE_PORT_SEND: - ipc_port_release_send(*destp); + ipc_port_release_send(request->ikm_header->msgh_remote_port); + request->ikm_header->msgh_remote_port = IP_NULL; break; case MACH_MSG_TYPE_PORT_SEND_ONCE: - ipc_port_release_sonce(*destp); + ipc_port_release_sonce(request->ikm_header->msgh_remote_port); + request->ikm_header->msgh_remote_port = IP_NULL; break; default: panic("ipc_kobject_server: strange destination rights"); } - *destp = IP_NULL; /* * Destroy voucher. The kernel MIG servers never take ownership @@ -423,13 +459,6 @@ ipc_kobject_server( request->ikm_voucher = IP_NULL; } - if (!(reply->ikm_header->msgh_bits & MACH_MSGH_BITS_COMPLEX) && - ((mig_reply_error_t *) reply->ikm_header)->RetCode != KERN_SUCCESS) { - kr = ((mig_reply_error_t *) reply->ikm_header)->RetCode; - } else { - kr = KERN_SUCCESS; - } - if ((kr == KERN_SUCCESS) || (kr == MIG_NO_REPLY)) { /* * The server function is responsible for the contents @@ -449,18 +478,23 @@ ipc_kobject_server( ipc_kmsg_destroy(request); } - replyp = (ipc_port_t)reply->ikm_header->msgh_remote_port; - if (kr == MIG_NO_REPLY) { /* * The server function will send a reply message * using the reply port right, which it has saved. */ - ipc_kmsg_free(reply); - + if (reply) { + ipc_kmsg_free(reply); + } return IKM_NULL; - } else if (!IP_VALID(replyp)) { + } + + if (reply) { + replyp = reply->ikm_header->msgh_remote_port; + } + + if (!IP_VALID(replyp)) { /* * Can't queue the reply message if the destination * (the reply port) isn't valid. @@ -567,10 +601,122 @@ ipc_kobject_set_atomically( { assert(type == IKOT_NONE || ip_active(port)); #if MACH_ASSERT - port->ip_spares[2] = (port->ip_bits & IO_BITS_KOTYPE); + port->ip_spares[2] = (port->ip_object.io_bits & IO_BITS_KOTYPE); #endif /* MACH_ASSERT */ - port->ip_bits = (port->ip_bits & ~IO_BITS_KOTYPE) | type; + port->ip_object.io_bits = (port->ip_object.io_bits & ~IO_BITS_KOTYPE) | type; port->ip_kobject = kobject; + if (type != IKOT_NONE) { + /* Once set, this bit can never be unset */ + port->ip_object.io_bits |= IO_BITS_KOBJECT; + } +} + +/* + * Routine: ipc_kobject_alloc_port + * Purpose: + * Allocate a kobject port in the kernel space of the specified type. + * + * This function never fails. + * + * Conditions: + * No locks held (memory is allocated) + */ +ipc_port_t +ipc_kobject_alloc_port( + ipc_kobject_t kobject, + ipc_kobject_type_t type, + ipc_kobject_alloc_options_t options) +{ + ipc_port_t port = ipc_port_alloc_kernel(); + + if (port == IP_NULL) { + panic("ipc_kobject_alloc_port(): failed to allocate port"); + } + + ipc_kobject_set_atomically(port, kobject, type); + + if (options & IPC_KOBJECT_ALLOC_MAKE_SEND) { + ipc_port_make_send_locked(port); + } + if (options & IPC_KOBJECT_ALLOC_NSREQUEST) { + ipc_port_make_sonce_locked(port); + port->ip_nsrequest = port; + } + if (options & IPC_KOBJECT_ALLOC_NO_GRANT) { + port->ip_no_grant = 1; + } + if (options & IPC_KOBJECT_ALLOC_IMMOVABLE_SEND) { + port->ip_immovable_send = 1; + } + + return port; +} + +/* + * Routine: ipc_kobject_make_send_lazy_alloc_port + * Purpose: + * Make a send once for a kobject port. + * + * A location owning this port is passed in port_store. + * If no port exists, a port is made lazily. + * + * A send right is made for the port, and if this is the first one + * (possibly not for the first time), then the no-more-senders + * notification is rearmed. + * + * When a notification is armed, the kobject must donate + * one of its references to the port. It is expected + * the no-more-senders notification will consume this reference. + * + * Returns: + * TRUE if a notification was armed + * FALSE else + * + * Conditions: + * Nothing is locked, memory can be allocated. + * The caller must be able to donate a kobject reference to the port. + */ +boolean_t +ipc_kobject_make_send_lazy_alloc_port( + ipc_port_t *port_store, + ipc_kobject_t kobject, + ipc_kobject_type_t type) +{ + ipc_port_t port, previous; + boolean_t rc = FALSE; + + port = os_atomic_load(port_store, dependency); + + if (!IP_VALID(port)) { + port = ipc_kobject_alloc_port(kobject, type, + IPC_KOBJECT_ALLOC_MAKE_SEND | IPC_KOBJECT_ALLOC_NSREQUEST); + if (os_atomic_cmpxchgv(port_store, IP_NULL, port, &previous, release)) { + return TRUE; + } + + // undo what ipc_kobject_alloc_port() did above + port->ip_nsrequest = IP_NULL; + port->ip_mscount = 0; + port->ip_sorights = 0; + port->ip_srights = 0; + ip_release(port); + ip_release(port); + ipc_port_dealloc_kernel(port); + + port = previous; + } + + ip_lock(port); + ipc_port_make_send_locked(port); + if (port->ip_srights == 1) { + ipc_port_make_sonce_locked(port); + assert(port->ip_nsrequest == IP_NULL); + port->ip_nsrequest = port; + rc = TRUE; + } + ip_unlock(port); + + return rc; } /* @@ -616,7 +762,7 @@ ipc_kobject_notify( mach_msg_header_t *reply_header) { mach_msg_max_trailer_t * trailer; - ipc_port_t port = (ipc_port_t) request_header->msgh_remote_port; + ipc_port_t port = request_header->msgh_remote_port; ((mig_reply_error_t *) reply_header)->RetCode = MIG_NO_REPLY; @@ -706,6 +852,7 @@ ipc_kobject_notify( case IKOT_IOKIT_OBJECT: case IKOT_IOKIT_CONNECT: case IKOT_IOKIT_IDENT: + case IKOT_UEXT_OBJECT: { return iokit_notify(request_header); } diff --git a/osfmk/kern/ipc_kobject.h b/osfmk/kern/ipc_kobject.h index 95f150776..4431f29ca 100644 --- a/osfmk/kern/ipc_kobject.h +++ b/osfmk/kern/ipc_kobject.h @@ -130,17 +130,16 @@ typedef natural_t ipc_kobject_type_t; #define IKOT_VOUCHER_ATTR_CONTROL 38 #define IKOT_WORK_INTERVAL 39 #define IKOT_UX_HANDLER 40 +#define IKOT_UEXT_OBJECT 41 +#define IKOT_ARCADE_REG 42 /* * Add new entries here and adjust IKOT_UNKNOWN. * Please keep ipc/ipc_object.c:ikot_print_array up to date. */ -#define IKOT_UNKNOWN 41 /* magic catchall */ +#define IKOT_UNKNOWN 43 /* magic catchall */ #define IKOT_MAX_TYPE (IKOT_UNKNOWN+1) /* # of IKOT_ types */ - -#define is_ipc_kobject(ikot) ((ikot) != IKOT_NONE) - #ifdef MACH_KERNEL_PRIVATE /* @@ -149,27 +148,56 @@ typedef natural_t ipc_kobject_type_t; */ /* Dispatch a kernel server function */ -extern ipc_kmsg_t ipc_kobject_server( - ipc_kmsg_t request, - mach_msg_option_t option); +extern ipc_kmsg_t ipc_kobject_server( + ipc_kmsg_t request, + mach_msg_option_t option); /* Make a port represent a kernel object of the given type */ -extern void ipc_kobject_set( - ipc_port_t port, - ipc_kobject_t kobject, - ipc_kobject_type_t type); +extern void ipc_kobject_set( + ipc_port_t port, + ipc_kobject_t kobject, + ipc_kobject_type_t type); + +extern void ipc_kobject_set_atomically( + ipc_port_t port, + ipc_kobject_t kobject, + ipc_kobject_type_t type); + +__options_decl(ipc_kobject_alloc_options_t, uint32_t, { + /* Just make the naked port */ + IPC_KOBJECT_ALLOC_NONE = 0x00000000, + /* Make a send right */ + IPC_KOBJECT_ALLOC_MAKE_SEND = 0x00000001, + /* Register for no-more-senders */ + IPC_KOBJECT_ALLOC_NSREQUEST = 0x00000002, + /* Make it no grant port */ + IPC_KOBJECT_ALLOC_NO_GRANT = 0x00000004, + /* Make all the send rights immovable */ + IPC_KOBJECT_ALLOC_IMMOVABLE_SEND = 0x00000008, +}); + +/* Allocates a kobject port, never fails */ +extern ipc_port_t ipc_kobject_alloc_port( + ipc_kobject_t kobject, + ipc_kobject_type_t type, + ipc_kobject_alloc_options_t options); + +/* Makes a send right, lazily allocating a kobject port, arming for no-senders, never fails */ +extern boolean_t ipc_kobject_make_send_lazy_alloc_port( + ipc_port_t *port_store, + ipc_kobject_t kobject, + ipc_kobject_type_t type) __result_use_check; -extern void ipc_kobject_set_atomically( - ipc_port_t port, - ipc_kobject_t kobject, - ipc_kobject_type_t type); /* Release any kernel object resources associated with a port */ -extern void ipc_kobject_destroy( - ipc_port_t port); +extern void ipc_kobject_destroy( + ipc_port_t port); #define null_conversion(port) (port) +extern kern_return_t +uext_server(ipc_kmsg_t request, ipc_kmsg_t * reply); + #endif /* MACH_KERNEL_PRIVATE */ #endif /* KERNEL_PRIVATE */ diff --git a/osfmk/kern/ipc_mig.c b/osfmk/kern/ipc_mig.c index 4770d8b87..722384a00 100644 --- a/osfmk/kern/ipc_mig.c +++ b/osfmk/kern/ipc_mig.c @@ -420,10 +420,9 @@ mach_msg_rpc_from_kernel_body( for (;;) { ipc_mqueue_t mqueue; - ipc_object_t object; assert(reply->ip_in_pset == 0); - assert(ip_active(reply)); + require_ip_active(reply); /* JMM - why this check? */ if (!self->active && !self->inspection) { @@ -445,8 +444,7 @@ mach_msg_rpc_from_kernel_body( kmsg = self->ith_kmsg; seqno = self->ith_seqno; - __IGNORE_WCASTALIGN(object = (ipc_object_t) reply); - mach_msg_receive_results_complete(object); + mach_msg_receive_results_complete(ip_to_object(reply)); if (mr == MACH_MSG_SUCCESS) { break; @@ -586,6 +584,13 @@ mach_msg_destroy_from_kernel_proper(mach_msg_header_t *msg) kfree(dsc->address, (vm_size_t) dsc->count * sizeof(mach_port_t)); break; } + case MACH_MSG_GUARDED_PORT_DESCRIPTOR: { + mach_msg_guarded_port_descriptor_t *dsc = (mach_msg_guarded_port_descriptor_t *)&daddr->guarded_port; + if (IO_VALID((ipc_object_t) dsc->name)) { + ipc_object_destroy((ipc_object_t) dsc->name, dsc->disposition); + } + break; + } default: break; } @@ -633,7 +638,7 @@ mach_msg_overwrite( if ((send_size & 3) || send_size < sizeof(mach_msg_header_t) || - (send_size < sizeof(mach_msg_body_t) && (msg->msgh_bits & MACH_MSGH_BITS_COMPLEX))) { + (send_size < sizeof(mach_msg_base_t) && (msg->msgh_bits & MACH_MSGH_BITS_COMPLEX))) { return MACH_SEND_MSG_TOO_SMALL; } @@ -962,7 +967,13 @@ mig_object_deallocate( mig_object_t mig_object) { assert(mig_object != MIG_OBJECT_NULL); - mig_object->pVtbl->Release((IMIGObject *)mig_object); + ipc_port_t port = mig_object->port; + if (mig_object->pVtbl->Release((IMIGObject *)mig_object) == 0) { + if (IP_VALID(port)) { + assert(!port->ip_srights); + ipc_port_dealloc_kernel(port); + } + } } /* @@ -981,56 +992,20 @@ ipc_port_t convert_mig_object_to_port( mig_object_t mig_object) { - ipc_port_t port; - boolean_t deallocate = TRUE; - if (mig_object == MIG_OBJECT_NULL) { return IP_NULL; } - port = mig_object->port; - while ((port == IP_NULL) || - ((port = ipc_port_make_send(port)) == IP_NULL)) { - ipc_port_t previous; - - /* - * Either the port was never set up, or it was just - * deallocated out from under us by the no-senders - * processing. In either case, we must: - * Attempt to make one - * Arrange for no senders - * Try to atomically register it with the object - * Destroy it if we are raced. - */ - port = ipc_port_alloc_kernel(); - ip_lock(port); - ipc_kobject_set_atomically(port, - (ipc_kobject_t) mig_object, - IKOT_MIG); - - /* make a sonce right for the notification */ - port->ip_sorights++; - ip_reference(port); - - ipc_port_nsrequest(port, 1, port, &previous); - /* port unlocked */ - - assert(previous == IP_NULL); - - if (OSCompareAndSwapPtr((void *)IP_NULL, (void *)port, - (void * volatile *)&mig_object->port)) { - deallocate = FALSE; - } else { - ipc_port_dealloc_kernel(port); - port = mig_object->port; - } - } - - if (deallocate) { - mig_object->pVtbl->Release((IMIGObject *)mig_object); + /* + * make a send right and donate our reference for mig_object_no_senders + * if this is the first send right + */ + if (!ipc_kobject_make_send_lazy_alloc_port(&mig_object->port, + (ipc_kobject_t) mig_object, IKOT_MIG)) { + mig_object_deallocate(mig_object); } - return port; + return mig_object->port; } @@ -1082,59 +1057,18 @@ convert_port_to_mig_object( * Base implementation of a no-senders notification handler * for MIG objects. If there truly are no more senders, must * destroy the port and drop its reference on the object. - * Returns: - * TRUE - port deallocate and reference dropped - * FALSE - more senders arrived, re-registered for notification * Conditions: * Nothing locked. */ - -boolean_t +void mig_object_no_senders( - ipc_port_t port, - mach_port_mscount_t mscount) + ipc_port_t port) { - mig_object_t mig_object; - - ip_lock(port); - if (port->ip_mscount > mscount) { - ipc_port_t previous; - - /* - * Somebody created new send rights while the - * notification was in-flight. Just create a - * new send-once right and re-register with - * the new (higher) mscount threshold. - */ - /* make a sonce right for the notification */ - port->ip_sorights++; - ip_reference(port); - ipc_port_nsrequest(port, mscount, port, &previous); - /* port unlocked */ - - assert(previous == IP_NULL); - return FALSE; - } - - /* - * Clear the port pointer while we have it locked. - */ - mig_object = (mig_object_t)port->ip_kobject; - mig_object->port = IP_NULL; + require_ip_active(port); + assert(IKOT_MIG == ip_kotype(port)); - /* - * Bring the sequence number and mscount in - * line with ipc_port_destroy assertion. - */ - port->ip_mscount = 0; - port->ip_messages.imq_seqno = 0; - ipc_port_destroy(port); /* releases lock */ - - /* - * Release the port's reference on the object. - */ - mig_object->pVtbl->Release((IMIGObject *)mig_object); - return TRUE; + /* consume the reference donated by convert_mig_object_to_port */ + mig_object_deallocate((mig_object_t)port->ip_kobject); } /* diff --git a/osfmk/kern/ipc_mig.h b/osfmk/kern/ipc_mig.h index 3fb5a8cba..a4fad67e1 100644 --- a/osfmk/kern/ipc_mig.h +++ b/osfmk/kern/ipc_mig.h @@ -167,13 +167,20 @@ extern mach_msg_return_t mach_msg_send_from_kernel_with_options_legacy( mach_msg_size_t send_size, mach_msg_option_t option, mach_msg_timeout_t timeout_val); -#endif /* XNU_KERNEL_PRIVATE */ +extern mach_msg_return_t mach_msg_send_from_kernel_with_options( + mach_msg_header_t *msg, + mach_msg_size_t send_size, + mach_msg_option_t option, + mach_msg_timeout_t timeout_val) +__XNU_INTERNAL(mach_msg_send_from_kernel_with_options); +#else extern mach_msg_return_t mach_msg_send_from_kernel_with_options( mach_msg_header_t *msg, mach_msg_size_t send_size, mach_msg_option_t option, mach_msg_timeout_t timeout_val); +#endif /* XNU_KERNEL_PRIVATE */ __END_DECLS @@ -229,9 +236,8 @@ extern mig_object_t convert_port_to_mig_object( ipc_port_t port, const MIGIID *iid); -boolean_t mig_object_no_senders( - ipc_port_t port, - mach_port_mscount_t mscount); +extern void mig_object_no_senders( + ipc_port_t port); #endif /* MACH_KERNEL_PRIVATE */ diff --git a/osfmk/kern/ipc_misc.c b/osfmk/kern/ipc_misc.c index 655d385e8..16c3c5a51 100644 --- a/osfmk/kern/ipc_misc.c +++ b/osfmk/kern/ipc_misc.c @@ -52,27 +52,8 @@ extern void fileport_releasefg(struct fileglob *); ipc_port_t fileport_alloc(struct fileglob *fg) { - ipc_port_t fileport; - ipc_port_t sendport; - ipc_port_t notifyport; - - fileport = ipc_port_alloc_kernel(); - if (fileport == IP_NULL) { - goto out; - } - - ipc_kobject_set(fileport, (ipc_kobject_t)fg, IKOT_FILEPORT); - ip_lock(fileport); /* unlocked by ipc_port_nsrequest */ - notifyport = ipc_port_make_sonce_locked(fileport); - ipc_port_nsrequest(fileport, 1, notifyport, ¬ifyport); - - sendport = ipc_port_make_send(fileport); - if (!IP_VALID(sendport)) { - panic("Couldn't allocate send right for fileport!\n"); - } - -out: - return fileport; + return ipc_kobject_alloc_port((ipc_kobject_t)fg, IKOT_FILEPORT, + IPC_KOBJECT_ALLOC_MAKE_SEND | IPC_KOBJECT_ALLOC_NSREQUEST); } @@ -174,7 +155,8 @@ fileport_invoke(task_t task, mach_port_name_t name, struct fileglob *fg; kr = ipc_object_copyin(task->itk_space, name, - MACH_MSG_TYPE_COPY_SEND, (ipc_object_t *)&fileport); + MACH_MSG_TYPE_COPY_SEND, (ipc_object_t *)&fileport, 0, NULL, + IPC_KMSG_FLAGS_ALLOW_IMMOVABLE_SEND); if (kr != KERN_SUCCESS) { return kr; } diff --git a/osfmk/kern/ipc_sync.c b/osfmk/kern/ipc_sync.c index d09b42157..7f65888d5 100644 --- a/osfmk/kern/ipc_sync.c +++ b/osfmk/kern/ipc_sync.c @@ -65,8 +65,7 @@ port_name_to_semaphore( return KERN_INVALID_NAME; } - kr = ipc_object_translate(current_space(), name, MACH_PORT_RIGHT_SEND, - (ipc_object_t *) &kern_port); + kr = ipc_port_translate_send(current_space(), name, &kern_port); if (kr != KERN_SUCCESS) { *semaphorep = SEMAPHORE_NULL; return kr; @@ -108,7 +107,7 @@ convert_port_to_semaphore(ipc_port_t port) * keeps the semaphore bound to the port (and active). */ if (ip_kotype(port) == IKOT_SEMAPHORE) { - assert(ip_active(port)); + require_ip_active(port); semaphore = (semaphore_t) port->ip_kobject; semaphore_reference(semaphore); return semaphore; @@ -132,47 +131,19 @@ convert_port_to_semaphore(ipc_port_t port) ipc_port_t convert_semaphore_to_port(semaphore_t semaphore) { - ipc_port_t port, send; - if (semaphore == SEMAPHORE_NULL) { return IP_NULL; } - /* caller is donating a reference */ - port = semaphore->port; - - if (!IP_VALID(port)) { - port = ipc_port_alloc_kernel(); - assert(IP_VALID(port)); - ipc_kobject_set_atomically(port, (ipc_kobject_t) semaphore, IKOT_SEMAPHORE); - - /* If we lose the race, deallocate and pick up the other guy's port */ - if (!OSCompareAndSwapPtr(IP_NULL, port, &semaphore->port)) { - ipc_port_dealloc_kernel(port); - port = semaphore->port; - assert(ip_kotype(port) == IKOT_SEMAPHORE); - assert(port->ip_kobject == (ipc_kobject_t)semaphore); - } - } - - ip_lock(port); - assert(ip_active(port)); - send = ipc_port_make_send_locked(port); - - if (1 == port->ip_srights) { - ipc_port_t old_notify; - - /* transfer our ref to the port, and arm the no-senders notification */ - assert(IP_NULL == port->ip_nsrequest); - ipc_port_nsrequest(port, port->ip_mscount, ipc_port_make_sonce_locked(port), &old_notify); - /* port unlocked */ - assert(IP_NULL == old_notify); - } else { - /* piggyback on the existing port reference, so consume ours */ - ip_unlock(port); + /* + * make a send right and donate our reference for + * semaphore_notify if this is the first send right + */ + if (!ipc_kobject_make_send_lazy_alloc_port(&semaphore->port, + (ipc_kobject_t) semaphore, IKOT_SEMAPHORE)) { semaphore_dereference(semaphore); } - return send; + return semaphore->port; } /* @@ -194,13 +165,11 @@ semaphore_notify(mach_msg_header_t *msg) { mach_no_senders_notification_t *notification = (void *)msg; ipc_port_t port = notification->not_header.msgh_remote_port; - semaphore_t semaphore; - assert(ip_active(port)); + require_ip_active(port); assert(IKOT_SEMAPHORE == ip_kotype(port)); - semaphore = (semaphore_t)port->ip_kobject; - semaphore_dereference(semaphore); + semaphore_dereference((semaphore_t)port->ip_kobject); } lock_set_t diff --git a/osfmk/kern/ipc_tt.c b/osfmk/kern/ipc_tt.c index 03fdc53bf..7d0384cf2 100644 --- a/osfmk/kern/ipc_tt.c +++ b/osfmk/kern/ipc_tt.c @@ -431,10 +431,8 @@ ipc_task_reset( struct label *unset_label = mac_exc_create_label(); #endif - new_kport = ipc_port_alloc_kernel(); - if (new_kport == IP_NULL) { - panic("ipc_task_reset"); - } + new_kport = ipc_kobject_alloc_port((ipc_kobject_t)task, IKOT_TASK, + IPC_KOBJECT_ALLOC_MAKE_SEND); itk_lock(task); @@ -443,6 +441,7 @@ ipc_task_reset( if (old_kport == IP_NULL) { /* the task is already terminated (can this happen?) */ itk_unlock(task); + ipc_port_release_send(new_kport); ipc_port_dealloc_kernel(new_kport); #if CONFIG_MACF mac_exc_free_label(unset_label); @@ -450,9 +449,8 @@ ipc_task_reset( return; } - task->itk_self = new_kport; old_sself = task->itk_sself; - task->itk_sself = ipc_port_make_send(new_kport); + task->itk_sself = task->itk_self = new_kport; /* Set the old kport to IKOT_NONE and update the exec token while under the port lock */ ip_lock(old_kport); @@ -460,8 +458,6 @@ ipc_task_reset( task->exec_token += 1; ip_unlock(old_kport); - ipc_kobject_set(new_kport, (ipc_kobject_t) task, IKOT_TASK); - for (i = FIRST_EXCEPTION; i < EXC_TYPES_COUNT; i++) { old_exc_actions[i] = IP_NULL; @@ -519,18 +515,13 @@ ipc_thread_init( { ipc_port_t kport; - kport = ipc_port_alloc_kernel(); - if (kport == IP_NULL) { - panic("ipc_thread_init"); - } + kport = ipc_kobject_alloc_port((ipc_kobject_t)thread, IKOT_THREAD, + IPC_KOBJECT_ALLOC_MAKE_SEND); - thread->ith_self = kport; - thread->ith_sself = ipc_port_make_send(kport); + thread->ith_sself = thread->ith_self = kport; thread->ith_special_reply_port = NULL; thread->exc_actions = NULL; - ipc_kobject_set(kport, (ipc_kobject_t)thread, IKOT_THREAD); - #if IMPORTANCE_INHERITANCE thread->ith_assertions = 0; #endif @@ -582,6 +573,11 @@ ipc_thread_disable( if (kport != IP_NULL) { ipc_kobject_set(kport, IKO_NULL, IKOT_NONE); } + + /* unbind the thread special reply port */ + if (IP_VALID(thread->ith_special_reply_port)) { + ipc_port_unbind_special_reply_port(thread, TRUE); + } } /* @@ -623,11 +619,6 @@ ipc_thread_terminate( assert(thread->ith_assertions == 0); #endif - /* unbind the thread special reply port */ - if (IP_VALID(thread->ith_special_reply_port)) { - ipc_port_unbind_special_reply_port(thread, TRUE); - } - assert(ipc_kmsg_queue_empty(&thread->ith_messages)); if (thread->ith_rpc_reply != IP_NULL) { @@ -663,18 +654,18 @@ ipc_thread_reset( struct label *new_label = mac_exc_create_label(); #endif - new_kport = ipc_port_alloc_kernel(); - if (new_kport == IP_NULL) { - panic("ipc_task_reset"); - } + new_kport = ipc_kobject_alloc_port((ipc_kobject_t)thread, IKOT_THREAD, + IPC_KOBJECT_ALLOC_MAKE_SEND); thread_mtx_lock(thread); old_kport = thread->ith_self; + old_sself = thread->ith_sself; if (old_kport == IP_NULL && thread->inspection == FALSE) { /* the is already terminated (can this happen?) */ thread_mtx_unlock(thread); + ipc_port_release_send(new_kport); ipc_port_dealloc_kernel(new_kport); #if CONFIG_MACF mac_exc_free_label(new_label); @@ -682,13 +673,10 @@ ipc_thread_reset( return; } - thread->ith_self = new_kport; - old_sself = thread->ith_sself; - thread->ith_sself = ipc_port_make_send(new_kport); + thread->ith_sself = thread->ith_self = new_kport; if (old_kport != IP_NULL) { ipc_kobject_set(old_kport, IKO_NULL, IKOT_NONE); } - ipc_kobject_set(new_kport, (ipc_kobject_t) thread, IKOT_THREAD); /* * Only ports that were set by root-owned processes @@ -754,6 +742,7 @@ ipc_port_t retrieve_task_self_fast( task_t task) { + __assert_only ipc_port_t sright; ipc_port_t port; assert(task == current_task()); @@ -763,12 +752,8 @@ retrieve_task_self_fast( if ((port = task->itk_sself) == task->itk_self) { /* no interposing */ - - ip_lock(port); - assert(ip_active(port)); - ip_reference(port); - port->ip_srights++; - ip_unlock(port); + sright = ipc_port_copy_send(port); + assert(sright == port); } else { port = ipc_port_copy_send(port); } @@ -793,6 +778,7 @@ ipc_port_t retrieve_thread_self_fast( thread_t thread) { + __assert_only ipc_port_t sright; ipc_port_t port; assert(thread == current_thread()); @@ -803,12 +789,8 @@ retrieve_thread_self_fast( if ((port = thread->ith_sself) == thread->ith_self) { /* no interposing */ - - ip_lock(port); - assert(ip_active(port)); - ip_reference(port); - port->ip_srights++; - ip_unlock(port); + sright = ipc_port_copy_send(port); + assert(sright == port); } else { port = ipc_port_copy_send(port); } @@ -886,7 +868,7 @@ mach_reply_port( mach_port_name_t name; kern_return_t kr; - kr = ipc_port_alloc(current_task()->itk_space, &name, &port); + kr = ipc_port_alloc(current_task()->itk_space, FALSE, &name, &port); if (kr == KERN_SUCCESS) { ip_unlock(port); } else { @@ -913,7 +895,6 @@ thread_get_special_reply_port( { ipc_port_t port; mach_port_name_t name; - mach_port_name_t send_name; kern_return_t kr; thread_t thread = current_thread(); @@ -925,25 +906,10 @@ thread_get_special_reply_port( } } - kr = ipc_port_alloc(current_task()->itk_space, &name, &port); + kr = ipc_port_alloc(current_task()->itk_space, TRUE, &name, &port); if (kr == KERN_SUCCESS) { ipc_port_bind_special_reply_port_locked(port); - - /* Make a send right and insert it in the space at specified name */ - ipc_port_make_send_locked(port); ip_unlock(port); - send_name = ipc_port_copyout_name_send(port, current_task()->itk_space, name); - /* - * If insertion of send right failed, userland is doing something bad, error out. - * The space was marked inactive or the receive right just inserted above at the - * given name was moved, in either case do not try to deallocate the receive right. - */ - if (send_name == MACH_PORT_NULL || send_name == MACH_PORT_DEAD) { - if (IP_VALID(thread->ith_special_reply_port)) { - ipc_port_unbind_special_reply_port(thread, TRUE); - } - name = MACH_PORT_NULL; - } } else { name = MACH_PORT_NULL; } @@ -971,8 +937,9 @@ ipc_port_bind_special_reply_port_locked( thread->ith_special_reply_port = port; port->ip_specialreply = 1; port->ip_sync_link_state = PORT_SYNC_LINK_ANY; + port->ip_messages.imq_srp_owner_thread = thread; - reset_ip_srp_bits(port); + ipc_special_reply_port_bits_reset(port); } /* @@ -1003,7 +970,7 @@ ipc_port_unbind_special_reply_port( thread->ith_special_reply_port = NULL; ipc_port_adjust_special_reply_port_locked(special_reply_port, NULL, - IPC_PORT_ADJUST_SR_CLEAR_SPECIAL_REPLY, FALSE); + IPC_PORT_ADJUST_UNLINK_THREAD, FALSE); /* port unlocked */ ip_release(special_reply_port); @@ -1214,6 +1181,10 @@ task_set_special_port( return KERN_INVALID_ARGUMENT; } + if (task_is_driver(current_task())) { + return KERN_NO_ACCESS; + } + switch (which) { case TASK_KERNEL_PORT: whichp = &task->itk_sself; @@ -1546,21 +1517,32 @@ convert_port_to_locked_task_inspect(ipc_port_t port) return TASK_INSPECT_NULL; } - -/* - * Routine: convert_port_to_task - * Purpose: - * Convert from a port to a task. - * Doesn't consume the port ref; produces a task ref, - * which may be null. - * Conditions: - * Nothing locked. - */ -task_t -convert_port_to_task( - ipc_port_t port) +static task_t +convert_port_to_task_locked( + ipc_port_t port, + uint32_t *exec_token) { - return convert_port_to_task_with_exec_token(port, NULL); + task_t task = TASK_NULL; + + ip_lock_held(port); + require_ip_active(port); + + if (ip_kotype(port) == IKOT_TASK) { + task_t ct = current_task(); + task = (task_t)port->ip_kobject; + assert(task != TASK_NULL); + + if (task_conversion_eval(ct, task)) { + return TASK_NULL; + } + + if (exec_token) { + *exec_token = task->exec_token; + } + task_reference_internal(task); + } + + return task; } /* @@ -1582,30 +1564,32 @@ convert_port_to_task_with_exec_token( if (IP_VALID(port)) { ip_lock(port); - - if (ip_active(port) && - ip_kotype(port) == IKOT_TASK) { - task_t ct = current_task(); - task = (task_t)port->ip_kobject; - assert(task != TASK_NULL); - - if (task_conversion_eval(ct, task)) { - ip_unlock(port); - return TASK_NULL; - } - - if (exec_token) { - *exec_token = task->exec_token; - } - task_reference_internal(task); + if (ip_active(port)) { + task = convert_port_to_task_locked(port, exec_token); } - ip_unlock(port); } return task; } +/* + * Routine: convert_port_to_task + * Purpose: + * Convert from a port to a task. + * Doesn't consume the port ref; produces a task ref, + * which may be null. + * Conditions: + * Nothing locked. + */ +task_t +convert_port_to_task( + ipc_port_t port) +{ + return convert_port_to_task_with_exec_token(port, NULL); +} + + /* * Routine: convert_port_to_task_name * Purpose: @@ -1639,6 +1623,25 @@ convert_port_to_task_name( return task; } +static task_inspect_t +convert_port_to_task_inspect_locked( + ipc_port_t port) +{ + task_inspect_t task = TASK_INSPECT_NULL; + + ip_lock_held(port); + require_ip_active(port); + + if (ip_kotype(port) == IKOT_TASK) { + task = (task_inspect_t)port->ip_kobject; + assert(task != TASK_INSPECT_NULL); + + task_reference_internal(task); + } + + return task; +} + /* * Routine: convert_port_to_task_inspect * Purpose: @@ -1656,15 +1659,9 @@ convert_port_to_task_inspect( if (IP_VALID(port)) { ip_lock(port); - - if (ip_active(port) && - ip_kotype(port) == IKOT_TASK) { - task = (task_inspect_t)port->ip_kobject; - assert(task != TASK_INSPECT_NULL); - - task_reference_internal(task); + if (ip_active(port)) { + task = convert_port_to_task_inspect_locked(port); } - ip_unlock(port); } @@ -1814,29 +1811,54 @@ convert_port_to_map( * Nothing locked. */ -thread_t -convert_port_to_thread( - ipc_port_t port) +static thread_t +convert_port_to_thread_locked( + ipc_port_t port, + port_to_thread_options_t options) { thread_t thread = THREAD_NULL; - if (IP_VALID(port)) { - ip_lock(port); + ip_lock_held(port); + require_ip_active(port); - if (ip_active(port) && - ip_kotype(port) == IKOT_THREAD) { - thread = (thread_t)port->ip_kobject; - assert(thread != THREAD_NULL); + if (ip_kotype(port) == IKOT_THREAD) { + thread = (thread_t)port->ip_kobject; + assert(thread != THREAD_NULL); + if (options & PORT_TO_THREAD_NOT_CURRENT_THREAD) { + if (thread == current_thread()) { + return THREAD_NULL; + } + } + + if (options & PORT_TO_THREAD_IN_CURRENT_TASK) { + if (thread->task != current_task()) { + return THREAD_NULL; + } + } else { /* Use task conversion rules for thread control conversions */ if (task_conversion_eval(current_task(), thread->task) != KERN_SUCCESS) { - ip_unlock(port); return THREAD_NULL; } - - thread_reference_internal(thread); } + thread_reference_internal(thread); + } + + return thread; +} + +thread_t +convert_port_to_thread( + ipc_port_t port) +{ + thread_t thread = THREAD_NULL; + + if (IP_VALID(port)) { + ip_lock(port); + if (ip_active(port)) { + thread = convert_port_to_thread_locked(port, PORT_TO_THREAD_NONE); + } ip_unlock(port); } @@ -1899,28 +1921,21 @@ convert_thread_inspect_to_port(thread_inspect_t thread) * A name of MACH_PORT_NULL is valid for the null thread. * Conditions: * Nothing locked. - * - * TODO: Could this be faster if it were ipc_port_translate_send based, like thread_switch? - * We could avoid extra lock/unlock and extra ref operations on the port. */ thread_t port_name_to_thread( - mach_port_name_t name) + mach_port_name_t name, + port_to_thread_options_t options) { thread_t thread = THREAD_NULL; ipc_port_t kport; + kern_return_t kr; if (MACH_PORT_VALID(name)) { - if (ipc_object_copyin(current_space(), name, - MACH_MSG_TYPE_COPY_SEND, - (ipc_object_t *)&kport) != KERN_SUCCESS) { - return THREAD_NULL; - } - - thread = convert_port_to_thread(kport); - - if (IP_VALID(kport)) { - ipc_port_release_send(kport); + kr = ipc_port_translate_send(current_space(), name, &kport); + if (kr == KERN_SUCCESS) { + thread = convert_port_to_thread_locked(kport, options); + ip_unlock(kport); } } @@ -1931,22 +1946,15 @@ task_t port_name_to_task( mach_port_name_t name) { - ipc_port_t kern_port; + ipc_port_t kport; kern_return_t kr; task_t task = TASK_NULL; if (MACH_PORT_VALID(name)) { - kr = ipc_object_copyin(current_space(), name, - MACH_MSG_TYPE_COPY_SEND, - (ipc_object_t *) &kern_port); - if (kr != KERN_SUCCESS) { - return TASK_NULL; - } - - task = convert_port_to_task(kern_port); - - if (IP_VALID(kern_port)) { - ipc_port_release_send(kern_port); + kr = ipc_port_translate_send(current_space(), name, &kport); + if (kr == KERN_SUCCESS) { + task = convert_port_to_task_locked(kport, NULL); + ip_unlock(kport); } } return task; @@ -1956,22 +1964,15 @@ task_inspect_t port_name_to_task_inspect( mach_port_name_t name) { - ipc_port_t kern_port; + ipc_port_t kport; kern_return_t kr; task_inspect_t ti = TASK_INSPECT_NULL; if (MACH_PORT_VALID(name)) { - kr = ipc_object_copyin(current_space(), name, - MACH_MSG_TYPE_COPY_SEND, - (ipc_object_t *)&kern_port); - if (kr != KERN_SUCCESS) { - return TASK_NULL; - } - - ti = convert_port_to_task_inspect(kern_port); - - if (IP_VALID(kern_port)) { - ipc_port_release_send(kern_port); + kr = ipc_port_translate_send(current_space(), name, &kport); + if (kr == KERN_SUCCESS) { + ti = convert_port_to_task_inspect_locked(kport); + ip_unlock(kport); } } return ti; @@ -2070,12 +2071,8 @@ convert_task_suspension_token_to_port( task_lock(task); if (task->active) { if (task->itk_resume == IP_NULL) { - task->itk_resume = ipc_port_alloc_kernel(); - if (!IP_VALID(task->itk_resume)) { - panic("failed to create resume port"); - } - - ipc_kobject_set(task->itk_resume, (ipc_kobject_t) task, IKOT_TASK_RESUME); + task->itk_resume = ipc_kobject_alloc_port((ipc_kobject_t) task, + IKOT_TASK_RESUME, IPC_KOBJECT_ALLOC_NONE); } /* @@ -2232,7 +2229,7 @@ thread_set_exception_ports( } if (IP_VALID(new_port)) { - switch (new_behavior & ~MACH_EXCEPTION_CODES) { + switch (new_behavior & ~MACH_EXCEPTION_MASK) { case EXCEPTION_DEFAULT: case EXCEPTION_STATE: case EXCEPTION_STATE_IDENTITY: @@ -2327,7 +2324,7 @@ task_set_exception_ports( } if (IP_VALID(new_port)) { - switch (new_behavior & ~MACH_EXCEPTION_CODES) { + switch (new_behavior & ~MACH_EXCEPTION_MASK) { case EXCEPTION_DEFAULT: case EXCEPTION_STATE: case EXCEPTION_STATE_IDENTITY: @@ -2452,7 +2449,7 @@ thread_swap_exception_ports( } if (IP_VALID(new_port)) { - switch (new_behavior & ~MACH_EXCEPTION_CODES) { + switch (new_behavior & ~MACH_EXCEPTION_MASK) { case EXCEPTION_DEFAULT: case EXCEPTION_STATE: case EXCEPTION_STATE_IDENTITY: @@ -2573,7 +2570,7 @@ task_swap_exception_ports( } if (IP_VALID(new_port)) { - switch (new_behavior & ~MACH_EXCEPTION_CODES) { + switch (new_behavior & ~MACH_EXCEPTION_MASK) { case EXCEPTION_DEFAULT: case EXCEPTION_STATE: case EXCEPTION_STATE_IDENTITY: diff --git a/osfmk/kern/ipc_tt.h b/osfmk/kern/ipc_tt.h index ce6d746e3..5ad86b999 100644 --- a/osfmk/kern/ipc_tt.h +++ b/osfmk/kern/ipc_tt.h @@ -175,8 +175,15 @@ extern thread_t convert_port_to_thread( extern thread_inspect_t convert_port_to_thread_inspect( ipc_port_t port); +__options_decl(port_to_thread_options_t, uint32_t, { + PORT_TO_THREAD_NONE = 0x0000, + PORT_TO_THREAD_IN_CURRENT_TASK = 0x0001, + PORT_TO_THREAD_NOT_CURRENT_THREAD = 0x0002, +}); + extern thread_t port_name_to_thread( - mach_port_name_t port_name); + mach_port_name_t port_name, + port_to_thread_options_t options); /* Deallocate a space ref produced by convert_port_to_space */ extern void space_deallocate( diff --git a/osfmk/kern/kalloc.c b/osfmk/kern/kalloc.c index 63b8aaffb..ffe8d7658 100644 --- a/osfmk/kern/kalloc.c +++ b/osfmk/kern/kalloc.c @@ -96,11 +96,11 @@ vm_size_t kalloc_kernmap_size; /* size of kallocs that can come from kernel map /* how many times we couldn't allocate out of kalloc_map and fell back to kernel_map */ unsigned long kalloc_fallback_count; -unsigned int kalloc_large_inuse; -vm_size_t kalloc_large_total; -vm_size_t kalloc_large_max; -vm_size_t kalloc_largest_allocated = 0; -uint64_t kalloc_large_sum; +uint_t kalloc_large_inuse; +vm_size_t kalloc_large_total; +vm_size_t kalloc_large_max; +vm_size_t kalloc_largest_allocated = 0; +uint64_t kalloc_large_sum; int kalloc_fake_zone_index = -1; /* index of our fake zone in statistics arrays */ @@ -191,8 +191,9 @@ KALLOC_ZINFO_SFREE(vm_size_t bytes) * 4096 Y N N * 6144 N N N * 8192 Y N N + * 12288 N X X * 16384 N N N - * 32768 N N N + * 32768 X N N * */ static const struct kalloc_zone_config { @@ -300,8 +301,8 @@ static const struct kalloc_zone_config { KZC_ENTRY(4096, true), KZC_ENTRY(6144, false), KZC_ENTRY(8192, true), - KZC_ENTRY(16384, false), - KZC_ENTRY(32768, false), + KZC_ENTRY(12288, false), + KZC_ENTRY(16384, false) #endif /* CONFIG_EMBEDDED */ @@ -407,13 +408,7 @@ kalloc_init( kalloc_map_min = min; kalloc_map_max = min + kalloc_map_size - 1; - /* - * Create zones up to a least 4 pages because small page-multiples are - * common allocations. Also ensure that zones up to size 16KB bytes exist. - * This is desirable because messages are allocated with kalloc(), and - * messages up through size 8192 are common. - */ - kalloc_max = PAGE_SIZE << 2; + kalloc_max = (k_zone_config[MAX_K_ZONE - 1].kzc_size << 1); if (kalloc_max < KiB(16)) { kalloc_max = KiB(16); } @@ -674,6 +669,7 @@ vm_size_t DTRACE_VM3(kfree, vm_size_t, -1, vm_size_t, size, void*, addr); kalloc_spin_lock(); + assert(kalloc_large_total >= size); kalloc_large_total -= size; kalloc_large_inuse--; kalloc_unlock(); @@ -685,9 +681,9 @@ vm_size_t void * kalloc_canblock( - vm_size_t * psize, - boolean_t canblock, - vm_allocation_site_t * site) + vm_size_t *psize, + boolean_t canblock, + vm_allocation_site_t *site) { zone_t z; vm_size_t size; @@ -724,6 +720,8 @@ kalloc_canblock( /* large allocation - use guard pages instead of small redzones */ size = round_page(req_size + 2 * PAGE_SIZE); assert(size >= MAX_SIZE_ZDLUT && size >= kalloc_max_prerounded); +#else + size = round_page(size); #endif if (size >= kalloc_kernmap_size) { @@ -760,6 +758,7 @@ kalloc_canblock( } kalloc_large_inuse++; + assert(kalloc_large_total + size >= kalloc_large_total); /* no wrap around */ kalloc_large_total += size; kalloc_large_sum += size; @@ -775,7 +774,7 @@ kalloc_canblock( /* fixup the return address to skip the redzone */ addr = (void *)kasan_alloc((vm_offset_t)addr, size, req_size, PAGE_SIZE); #else - *psize = round_page(size); + *psize = size; #endif DTRACE_VM3(kalloc, vm_size_t, size, vm_size_t, *psize, void*, addr); return addr; @@ -863,6 +862,7 @@ void kmem_free(alloc_map, (vm_offset_t)data, size); kalloc_spin_lock(); + assert(kalloc_large_total >= size); kalloc_large_total -= size; kalloc_large_inuse--; @@ -949,7 +949,7 @@ OSMalloc_Tagref( panic("OSMalloc_Tagref():'%s' has bad state 0x%08X\n", tag->OSMT_name, tag->OSMT_state); } - (void)hw_atomic_add(&tag->OSMT_refcnt, 1); + os_atomic_inc(&tag->OSMT_refcnt, relaxed); } void @@ -960,8 +960,8 @@ OSMalloc_Tagrele( panic("OSMalloc_Tagref():'%s' has bad state 0x%08X\n", tag->OSMT_name, tag->OSMT_state); } - if (hw_atomic_sub(&tag->OSMT_refcnt, 1) == 0) { - if (hw_compare_and_store(OSMT_VALID | OSMT_RELEASED, OSMT_VALID | OSMT_RELEASED, &tag->OSMT_state)) { + if (os_atomic_dec(&tag->OSMT_refcnt, relaxed) == 0) { + if (os_atomic_cmpxchg(&tag->OSMT_state, OSMT_VALID | OSMT_RELEASED, OSMT_VALID | OSMT_RELEASED, acq_rel)) { OSMalloc_tag_spin_lock(); (void)remque((queue_entry_t)tag); OSMalloc_tag_unlock(); @@ -976,11 +976,11 @@ void OSMalloc_Tagfree( OSMallocTag tag) { - if (!hw_compare_and_store(OSMT_VALID, OSMT_VALID | OSMT_RELEASED, &tag->OSMT_state)) { + if (!os_atomic_cmpxchg(&tag->OSMT_state, OSMT_VALID, OSMT_VALID | OSMT_RELEASED, acq_rel)) { panic("OSMalloc_Tagfree():'%s' has bad state 0x%08X \n", tag->OSMT_name, tag->OSMT_state); } - if (hw_atomic_sub(&tag->OSMT_refcnt, 1) == 0) { + if (os_atomic_dec(&tag->OSMT_refcnt, relaxed) == 0) { OSMalloc_tag_spin_lock(); (void)remque((queue_entry_t)tag); OSMalloc_tag_unlock(); diff --git a/osfmk/kern/kcdata.h b/osfmk/kern/kcdata.h index 85cf4998b..f00a3be8f 100644 --- a/osfmk/kern/kcdata.h +++ b/osfmk/kern/kcdata.h @@ -436,45 +436,47 @@ struct kcdata_type_definition { * NOTE: Please update kcdata/libkdd/kcdtypes.c if you make any changes * in STACKSHOT_KCTYPE_* types. */ -#define STACKSHOT_KCTYPE_IOSTATS 0x901u /* io_stats_snapshot */ -#define STACKSHOT_KCTYPE_GLOBAL_MEM_STATS 0x902u /* struct mem_and_io_snapshot */ -#define STACKSHOT_KCCONTAINER_TASK 0x903u -#define STACKSHOT_KCCONTAINER_THREAD 0x904u -#define STACKSHOT_KCTYPE_TASK_SNAPSHOT 0x905u /* task_snapshot_v2 */ -#define STACKSHOT_KCTYPE_THREAD_SNAPSHOT 0x906u /* thread_snapshot_v2, thread_snapshot_v3 */ -#define STACKSHOT_KCTYPE_DONATING_PIDS 0x907u /* int[] */ -#define STACKSHOT_KCTYPE_SHAREDCACHE_LOADINFO 0x908u /* same as KCDATA_TYPE_LIBRARY_LOADINFO64 */ -#define STACKSHOT_KCTYPE_THREAD_NAME 0x909u /* char[] */ -#define STACKSHOT_KCTYPE_KERN_STACKFRAME 0x90Au /* struct stack_snapshot_frame32 */ -#define STACKSHOT_KCTYPE_KERN_STACKFRAME64 0x90Bu /* struct stack_snapshot_frame64 */ -#define STACKSHOT_KCTYPE_USER_STACKFRAME 0x90Cu /* struct stack_snapshot_frame32 */ -#define STACKSHOT_KCTYPE_USER_STACKFRAME64 0x90Du /* struct stack_snapshot_frame64 */ -#define STACKSHOT_KCTYPE_BOOTARGS 0x90Eu /* boot args string */ -#define STACKSHOT_KCTYPE_OSVERSION 0x90Fu /* os version string */ -#define STACKSHOT_KCTYPE_KERN_PAGE_SIZE 0x910u /* kernel page size in uint32_t */ -#define STACKSHOT_KCTYPE_JETSAM_LEVEL 0x911u /* jetsam level in uint32_t */ -#define STACKSHOT_KCTYPE_DELTA_SINCE_TIMESTAMP 0x912u /* timestamp used for the delta stackshot */ -#define STACKSHOT_KCTYPE_KERN_STACKLR 0x913u /* uint32_t */ -#define STACKSHOT_KCTYPE_KERN_STACKLR64 0x914u /* uint64_t */ -#define STACKSHOT_KCTYPE_USER_STACKLR 0x915u /* uint32_t */ -#define STACKSHOT_KCTYPE_USER_STACKLR64 0x916u /* uint64_t */ -#define STACKSHOT_KCTYPE_NONRUNNABLE_TIDS 0x917u /* uint64_t */ -#define STACKSHOT_KCTYPE_NONRUNNABLE_TASKS 0x918u /* uint64_t */ -#define STACKSHOT_KCTYPE_CPU_TIMES 0x919u /* struct stackshot_cpu_times or stackshot_cpu_times_v2 */ -#define STACKSHOT_KCTYPE_STACKSHOT_DURATION 0x91au /* struct stackshot_duration */ -#define STACKSHOT_KCTYPE_STACKSHOT_FAULT_STATS 0x91bu /* struct stackshot_fault_stats */ -#define STACKSHOT_KCTYPE_KERNELCACHE_LOADINFO 0x91cu /* kernelcache UUID -- same as KCDATA_TYPE_LIBRARY_LOADINFO64 */ -#define STACKSHOT_KCTYPE_THREAD_WAITINFO 0x91du /* struct stackshot_thread_waitinfo */ -#define STACKSHOT_KCTYPE_THREAD_GROUP_SNAPSHOT 0x91eu /* struct thread_group_snapshot or thread_group_snapshot_v2 */ -#define STACKSHOT_KCTYPE_THREAD_GROUP 0x91fu /* uint64_t */ -#define STACKSHOT_KCTYPE_JETSAM_COALITION_SNAPSHOT 0x920u /* struct jetsam_coalition_snapshot */ -#define STACKSHOT_KCTYPE_JETSAM_COALITION 0x921u /* uint64_t */ -#define STACKSHOT_KCTYPE_THREAD_POLICY_VERSION 0x922u /* THREAD_POLICY_INTERNAL_STRUCT_VERSION in uint32 */ -#define STACKSHOT_KCTYPE_INSTRS_CYCLES 0x923u /* struct instrs_cycles_snapshot */ -#define STACKSHOT_KCTYPE_USER_STACKTOP 0x924u /* struct stack_snapshot_stacktop */ -#define STACKSHOT_KCTYPE_ASID 0x925u /* uint32_t */ -#define STACKSHOT_KCTYPE_PAGE_TABLES 0x926u /* uint64_t */ -#define STACKSHOT_KCTYPE_SYS_SHAREDCACHE_LAYOUT 0x927u /* same as KCDATA_TYPE_LIBRARY_LOADINFO64 */ +#define STACKSHOT_KCTYPE_IOSTATS 0x901u /* io_stats_snapshot */ +#define STACKSHOT_KCTYPE_GLOBAL_MEM_STATS 0x902u /* struct mem_and_io_snapshot */ +#define STACKSHOT_KCCONTAINER_TASK 0x903u +#define STACKSHOT_KCCONTAINER_THREAD 0x904u +#define STACKSHOT_KCTYPE_TASK_SNAPSHOT 0x905u /* task_snapshot_v2 */ +#define STACKSHOT_KCTYPE_THREAD_SNAPSHOT 0x906u /* thread_snapshot_v2, thread_snapshot_v3 */ +#define STACKSHOT_KCTYPE_DONATING_PIDS 0x907u /* int[] */ +#define STACKSHOT_KCTYPE_SHAREDCACHE_LOADINFO 0x908u /* same as KCDATA_TYPE_LIBRARY_LOADINFO64 */ +#define STACKSHOT_KCTYPE_THREAD_NAME 0x909u /* char[] */ +#define STACKSHOT_KCTYPE_KERN_STACKFRAME 0x90Au /* struct stack_snapshot_frame32 */ +#define STACKSHOT_KCTYPE_KERN_STACKFRAME64 0x90Bu /* struct stack_snapshot_frame64 */ +#define STACKSHOT_KCTYPE_USER_STACKFRAME 0x90Cu /* struct stack_snapshot_frame32 */ +#define STACKSHOT_KCTYPE_USER_STACKFRAME64 0x90Du /* struct stack_snapshot_frame64 */ +#define STACKSHOT_KCTYPE_BOOTARGS 0x90Eu /* boot args string */ +#define STACKSHOT_KCTYPE_OSVERSION 0x90Fu /* os version string */ +#define STACKSHOT_KCTYPE_KERN_PAGE_SIZE 0x910u /* kernel page size in uint32_t */ +#define STACKSHOT_KCTYPE_JETSAM_LEVEL 0x911u /* jetsam level in uint32_t */ +#define STACKSHOT_KCTYPE_DELTA_SINCE_TIMESTAMP 0x912u /* timestamp used for the delta stackshot */ +#define STACKSHOT_KCTYPE_KERN_STACKLR 0x913u /* uint32_t */ +#define STACKSHOT_KCTYPE_KERN_STACKLR64 0x914u /* uint64_t */ +#define STACKSHOT_KCTYPE_USER_STACKLR 0x915u /* uint32_t */ +#define STACKSHOT_KCTYPE_USER_STACKLR64 0x916u /* uint64_t */ +#define STACKSHOT_KCTYPE_NONRUNNABLE_TIDS 0x917u /* uint64_t */ +#define STACKSHOT_KCTYPE_NONRUNNABLE_TASKS 0x918u /* uint64_t */ +#define STACKSHOT_KCTYPE_CPU_TIMES 0x919u /* struct stackshot_cpu_times or stackshot_cpu_times_v2 */ +#define STACKSHOT_KCTYPE_STACKSHOT_DURATION 0x91au /* struct stackshot_duration */ +#define STACKSHOT_KCTYPE_STACKSHOT_FAULT_STATS 0x91bu /* struct stackshot_fault_stats */ +#define STACKSHOT_KCTYPE_KERNELCACHE_LOADINFO 0x91cu /* kernelcache UUID -- same as KCDATA_TYPE_LIBRARY_LOADINFO64 */ +#define STACKSHOT_KCTYPE_THREAD_WAITINFO 0x91du /* struct stackshot_thread_waitinfo */ +#define STACKSHOT_KCTYPE_THREAD_GROUP_SNAPSHOT 0x91eu /* struct thread_group_snapshot or thread_group_snapshot_v2 */ +#define STACKSHOT_KCTYPE_THREAD_GROUP 0x91fu /* uint64_t */ +#define STACKSHOT_KCTYPE_JETSAM_COALITION_SNAPSHOT 0x920u /* struct jetsam_coalition_snapshot */ +#define STACKSHOT_KCTYPE_JETSAM_COALITION 0x921u /* uint64_t */ +#define STACKSHOT_KCTYPE_THREAD_POLICY_VERSION 0x922u /* THREAD_POLICY_INTERNAL_STRUCT_VERSION in uint32 */ +#define STACKSHOT_KCTYPE_INSTRS_CYCLES 0x923u /* struct instrs_cycles_snapshot */ +#define STACKSHOT_KCTYPE_USER_STACKTOP 0x924u /* struct stack_snapshot_stacktop */ +#define STACKSHOT_KCTYPE_ASID 0x925u /* uint32_t */ +#define STACKSHOT_KCTYPE_PAGE_TABLES 0x926u /* uint64_t */ +#define STACKSHOT_KCTYPE_SYS_SHAREDCACHE_LAYOUT 0x927u /* same as KCDATA_TYPE_LIBRARY_LOADINFO64 */ +#define STACKSHOT_KCTYPE_THREAD_DISPATCH_QUEUE_LABEL 0x928u /* dispatch queue label */ +#define STACKSHOT_KCTYPE_THREAD_TURNSTILEINFO 0x929u /* struct stackshot_thread_turnstileinfo */ #define STACKSHOT_KCTYPE_TASK_DELTA_SNAPSHOT 0x940u /* task_delta_snapshot_v2 */ #define STACKSHOT_KCTYPE_THREAD_DELTA_SNAPSHOT 0x941u /* thread_delta_snapshot_v* */ @@ -517,6 +519,7 @@ struct user64_dyld_uuid_info { }; enum task_snapshot_flags { + /* k{User,Kernel}64_p (values 0x1 and 0x2) are defined in generic_snapshot_flags */ kTaskRsrcFlagged = 0x4, // In the EXC_RESOURCE danger zone? kTerminatedSnapshot = 0x8, kPidSuspended = 0x10, // true for suspended task @@ -546,6 +549,7 @@ enum task_snapshot_flags { }; enum thread_snapshot_flags { + /* k{User,Kernel}64_p (values 0x1 and 0x2) are defined in generic_snapshot_flags */ kHasDispatchSerial = 0x4, kStacksPCOnly = 0x8, /* Stack traces have no frame pointers. */ kThreadDarwinBG = 0x10, /* Thread is darwinbg */ @@ -814,6 +818,18 @@ typedef struct stackshot_thread_waitinfo { uint8_t wait_type; /* The type of object that the thread is waiting on */ } __attribute__((packed)) thread_waitinfo_t; +typedef struct stackshot_thread_turnstileinfo { + uint64_t waiter; /* The thread that's waiting on the object */ + uint64_t turnstile_context; /* Associated data (either thread id, or workq addr) */ + uint8_t turnstile_priority; + uint8_t number_of_hops; +#define STACKSHOT_TURNSTILE_STATUS_UNKNOWN (1 << 0) /* The final inheritor is unknown (bug?) */ +#define STACKSHOT_TURNSTILE_STATUS_LOCKED_WAITQ (1 << 1) /* A waitq was found to be locked */ +#define STACKSHOT_TURNSTILE_STATUS_WORKQUEUE (1 << 2) /* The final inheritor is a workqueue */ +#define STACKSHOT_TURNSTILE_STATUS_THREAD (1 << 3) /* The final inheritor is a thread */ + uint64_t turnstile_flags; +} __attribute__((packed)) thread_turnstileinfo_t; + #define STACKSHOT_WAITOWNER_KERNEL (UINT64_MAX - 1) #define STACKSHOT_WAITOWNER_PORT_LOCKED (UINT64_MAX - 2) #define STACKSHOT_WAITOWNER_PSET_LOCKED (UINT64_MAX - 3) @@ -895,6 +911,8 @@ struct crashinfo_proc_uniqidentifierinfo { #define TASK_CRASHINFO_LEDGER_NETWORK_NONVOLATILE 0x828 /* uint64_t */ #define TASK_CRASHINFO_LEDGER_NETWORK_NONVOLATILE_COMPRESSED 0x829 /* uint64_t */ #define TASK_CRASHINFO_LEDGER_WIRED_MEM 0x82A /* uint64_t */ +#define TASK_CRASHINFO_PROC_PERSONA_ID 0x82B /* uid_t */ +#define TASK_CRASHINFO_MEMORY_LIMIT_INCREASE 0x82C /* uint32_t */ @@ -971,7 +989,7 @@ kcdata_iter_unsafe(void *buffer) return iter; } -static const kcdata_iter_t kcdata_invalid_iter = { .item = 0, .end = 0 }; +static const kcdata_iter_t kcdata_invalid_iter = { .item = NULL, .end = NULL }; static inline int diff --git a/osfmk/kern/kern_stackshot.c b/osfmk/kern/kern_stackshot.c index 05ab16ce9..8a9c4cd75 100644 --- a/osfmk/kern/kern_stackshot.c +++ b/osfmk/kern/kern_stackshot.c @@ -125,10 +125,13 @@ static int kdp_stackshot_kcdata_format(int pid, uint32_t trace_flag uint32_t kdp_stack_snapshot_bytes_traced(void); static void kdp_mem_and_io_snapshot(struct mem_and_io_snapshot *memio_snap); static boolean_t kdp_copyin(vm_map_t map, uint64_t uaddr, void *dest, size_t size, boolean_t try_fault, uint32_t *kdp_fault_result); +static int kdp_copyin_string(task_t task, uint64_t addr, char *buf, int buf_sz, boolean_t try_fault, uint32_t *kdp_fault_results); static boolean_t kdp_copyin_word(task_t task, uint64_t addr, uint64_t *result, boolean_t try_fault, uint32_t *kdp_fault_results); static uint64_t proc_was_throttled_from_task(task_t task); static void stackshot_thread_wait_owner_info(thread_t thread, thread_waitinfo_t * waitinfo); static int stackshot_thread_has_valid_waitinfo(thread_t thread); +static void stackshot_thread_turnstileinfo(thread_t thread, thread_turnstileinfo_t *tsinfo); +static int stackshot_thread_has_valid_turnstileinfo(thread_t thread); #if CONFIG_COALITIONS static void stackshot_coalition_jetsam_count(void *arg, int i, coalition_t coal); @@ -148,6 +151,7 @@ static uint64_t proc_did_throttle_from_task(task_t task); extern void proc_name_kdp(task_t task, char * buf, int size); extern int proc_threadname_kdp(void * uth, char * buf, size_t size); extern void proc_starttime_kdp(void * p, uint64_t * tv_sec, uint64_t * tv_usec, uint64_t * abstime); +extern boolean_t proc_binary_uuid_kdp(task_t task, uuid_t uuid); extern int memorystatus_get_pressure_status_kdp(void); extern void memorystatus_proc_flags_unsafe(void * v, boolean_t *is_dirty, boolean_t *is_dirty_tracked, boolean_t *allow_idle_exit); @@ -238,6 +242,8 @@ SECURITY_READ_ONLY_LATE(static uint32_t) max_tracebuf_size = SANE_TRACEBUF_SIZE; #define ROUNDUP(x, y) ((((x)+(y)-1)/(y))*(y)) #endif +#define STACKSHOT_QUEUE_LABEL_MAXSIZE 64 + /* * Initialize the mutex governing access to the stack snapshot subsystem * and other stackshot related bits. @@ -1023,18 +1029,48 @@ kcdata_record_uuid_info(kcdata_descriptor_t kcd, task_t task, uint32_t trace_fla } } - if (task_pid > 0 && uuid_info_count > 0 && uuid_info_count < MAX_LOADINFOS) { - uint32_t uuid_info_size = (uint32_t)(task_64bit_addr ? sizeof(struct user64_dyld_uuid_info) : sizeof(struct user32_dyld_uuid_info)); - uint32_t uuid_info_array_size = uuid_info_count * uuid_info_size; + if (save_loadinfo_p && task_pid > 0 && (uuid_info_count < MAX_LOADINFOS)) { + uint32_t copied_uuid_count = 0; + uint32_t uuid_info_size = (uint32_t)(task_64bit_addr ? sizeof(struct user64_dyld_uuid_info) : sizeof(struct user32_dyld_uuid_info)); + uint32_t uuid_info_array_size = 0; - kcd_exit_on_error(kcdata_get_memory_addr_for_array(kcd, (task_64bit_addr ? KCDATA_TYPE_LIBRARY_LOADINFO64 : KCDATA_TYPE_LIBRARY_LOADINFO), - uuid_info_size, uuid_info_count, &out_addr)); + /* If we found some UUID information, first try to copy it in -- this will only be non-zero if we had a pmap above */ + if (uuid_info_count > 0) { + uuid_info_array_size = uuid_info_count * uuid_info_size; - /* Copy in the UUID info array - * It may be nonresident, in which case just fix up nloadinfos to 0 in the task_snap - */ - if (have_pmap && !kdp_copyin(task->map, uuid_info_addr, (void *)out_addr, uuid_info_array_size, should_fault, &kdp_fault_results)) { - bzero((void *)out_addr, uuid_info_array_size); + kcd_exit_on_error(kcdata_get_memory_addr_for_array(kcd, (task_64bit_addr ? KCDATA_TYPE_LIBRARY_LOADINFO64 : KCDATA_TYPE_LIBRARY_LOADINFO), + uuid_info_size, uuid_info_count, &out_addr)); + + if (!kdp_copyin(task->map, uuid_info_addr, (void *)out_addr, uuid_info_array_size, should_fault, &kdp_fault_results)) { + bzero((void *)out_addr, uuid_info_array_size); + } else { + copied_uuid_count = uuid_info_count; + } + } + + uuid_t binary_uuid; + if (!copied_uuid_count && proc_binary_uuid_kdp(task, binary_uuid)) { + /* We failed to copyin the UUID information, try to store the UUID of the main binary we have in the proc */ + if (uuid_info_array_size == 0) { + /* We just need to store one UUID */ + uuid_info_array_size = uuid_info_size; + kcd_exit_on_error(kcdata_get_memory_addr_for_array(kcd, (task_64bit_addr ? KCDATA_TYPE_LIBRARY_LOADINFO64 : KCDATA_TYPE_LIBRARY_LOADINFO), + uuid_info_size, 1, &out_addr)); + } + + if (task_64bit_addr) { + struct user64_dyld_uuid_info *uuid_info = (struct user64_dyld_uuid_info *)out_addr; + uint64_t image_load_address = task->mach_header_vm_address; + + stackshot_memcpy(&uuid_info->imageUUID, binary_uuid, sizeof(uuid_t)); + stackshot_memcpy(&uuid_info->imageLoadAddress, &image_load_address, sizeof(image_load_address)); + } else { + struct user32_dyld_uuid_info *uuid_info = (struct user32_dyld_uuid_info *)out_addr; + uint32_t image_load_address = (uint32_t) task->mach_header_vm_address; + + stackshot_memcpy(&uuid_info->imageUUID, binary_uuid, sizeof(uuid_t)); + stackshot_memcpy(&uuid_info->imageLoadAddress, &image_load_address, sizeof(image_load_address)); + } } } else if (task_pid == 0 && uuid_info_count > 0 && uuid_info_count < MAX_LOADINFOS) { uintptr_t image_load_address; @@ -1197,7 +1233,7 @@ kcdata_record_task_snapshot(kcdata_descriptor_t kcd, task_t task, uint32_t trace #if __arm__ || __arm64__ if (collect_asid && have_pmap) { - uint32_t asid = task->map->pmap->asid; + uint32_t asid = PMAP_VASID(task->map->pmap); kcd_exit_on_error(kcdata_get_memory_addr(kcd, STACKSHOT_KCTYPE_ASID, sizeof(uint32_t), &out_addr)); stackshot_memcpy((void*)out_addr, &asid, sizeof(asid)); } @@ -1300,7 +1336,7 @@ kcdata_record_task_delta_snapshot(kcdata_descriptor_t kcd, task_t task, uint32_t #if __arm__ || __arm64__ if (collect_asid && have_pmap) { - uint32_t asid = task->map->pmap->asid; + uint32_t asid = PMAP_VASID(task->map->pmap); kcd_exit_on_error(kcdata_get_memory_addr(kcd, STACKSHOT_KCTYPE_ASID, sizeof(uint32_t), &out_addr)); stackshot_memcpy((void*)out_addr, &asid, sizeof(asid)); } @@ -1375,7 +1411,8 @@ kcdata_record_thread_snapshot( cur_thread_snap = (struct thread_snapshot_v4 *)out_addr; /* Populate the thread snapshot header */ - cur_thread_snap->ths_thread_id = thread_tid(thread); + cur_thread_snap->ths_ss_flags = 0; + cur_thread_snap->ths_thread_id = thread_tid(thread); cur_thread_snap->ths_wait_event = VM_KERNEL_UNSLIDE_OR_PERM(thread->wait_event); cur_thread_snap->ths_continuation = VM_KERNEL_UNSLIDE(thread->continuation); cur_thread_snap->ths_total_syscalls = thread->syscalls_mach + thread->syscalls_unix; @@ -1400,6 +1437,27 @@ kcdata_record_thread_snapshot( cur_thread_snap->ths_ss_flags |= kHasDispatchSerial; cur_thread_snap->ths_dqserialnum = dqserialnum; } + + /* try copying in the queue label */ + uint64_t label_offs = get_task_dispatchqueue_label_offset(task); + if (label_offs) { + uint64_t dqlabeladdr = dqaddr + label_offs; + uint64_t actual_dqlabeladdr = 0; + + copyin_ok = kdp_copyin_word(task, dqlabeladdr, &actual_dqlabeladdr, FALSE, NULL); + if (copyin_ok && actual_dqlabeladdr != 0) { + char label_buf[STACKSHOT_QUEUE_LABEL_MAXSIZE]; + int len; + + bzero(label_buf, STACKSHOT_QUEUE_LABEL_MAXSIZE * sizeof(char)); + len = kdp_copyin_string(task, actual_dqlabeladdr, label_buf, STACKSHOT_QUEUE_LABEL_MAXSIZE, FALSE, NULL); + if (len > 0) { + mach_vm_address_t label_addr = 0; + kcd_exit_on_error(kcdata_get_memory_addr(kcd, STACKSHOT_KCTYPE_THREAD_DISPATCH_QUEUE_LABEL, len, &label_addr)); + stackshot_strlcpy((char*)label_addr, &label_buf[0], len); + } + } + } } } } @@ -1415,7 +1473,6 @@ kcdata_record_thread_snapshot( cur_thread_snap->ths_sys_time = 0; } - cur_thread_snap->ths_ss_flags = 0; if (thread->thread_tag & THREAD_TAG_MAINTHREAD) { cur_thread_snap->ths_ss_flags |= kThreadMain; } @@ -1658,7 +1715,9 @@ classify_thread(thread_t thread, boolean_t * thread_on_core_p, uint32_t trace_fl processor_t last_processor = thread->last_processor; boolean_t thread_on_core = - (last_processor != PROCESSOR_NULL && last_processor->state == PROCESSOR_RUNNING && last_processor->active_thread == thread); + (last_processor != PROCESSOR_NULL && + (last_processor->state == PROCESSOR_SHUTDOWN || last_processor->state == PROCESSOR_RUNNING) && + last_processor->active_thread == thread); *thread_on_core_p = thread_on_core; @@ -1694,6 +1753,7 @@ kdp_stackshot_record_task(struct stackshot_context *ctx, task_t task) int num_delta_thread_snapshots = 0; int num_nonrunnable_threads = 0; int num_waitinfo_threads = 0; + int num_turnstileinfo_threads = 0; uint64_t task_start_abstime = 0; boolean_t task_delta_stackshot = FALSE; @@ -1701,6 +1761,13 @@ kdp_stackshot_record_task(struct stackshot_context *ctx, task_t task) boolean_t some_thread_ran = FALSE; unaligned_u64 *task_snap_ss_flags = NULL; +#if INTERRUPT_MASKED_DEBUG && MONOTONIC + uint64_t task_begin_cpu_cycle_count = 0; + if (!panic_stackshot) { + task_begin_cpu_cycle_count = mt_cur_cpu_cycles(); + } +#endif + if ((task == NULL) || !ml_validate_nofault((vm_offset_t)task, sizeof(struct task))) { error = KERN_FAILURE; goto error_exit; @@ -1783,8 +1850,14 @@ kdp_stackshot_record_task(struct stackshot_context *ctx, task_t task) /* We want to report owner information regardless of whether a thread * has changed since the last delta, whether it's a normal stackshot, * or whether it's nonrunnable */ - if (save_owner_info && stackshot_thread_has_valid_waitinfo(thread)) { - num_waitinfo_threads++; + if (save_owner_info) { + if (stackshot_thread_has_valid_waitinfo(thread)) { + num_waitinfo_threads++; + } + + if (stackshot_thread_has_valid_turnstileinfo(thread)) { + num_turnstileinfo_threads++; + } } } @@ -1806,8 +1879,10 @@ kdp_stackshot_record_task(struct stackshot_context *ctx, task_t task) nonrunnable_tids = (uint64_t *)out_addr; } - thread_waitinfo_t *thread_waitinfo = NULL; - int current_waitinfo_index = 0; + thread_waitinfo_t *thread_waitinfo = NULL; + thread_turnstileinfo_t *thread_turnstileinfo = NULL; + int current_waitinfo_index = 0; + int current_turnstileinfo_index = 0; if (num_waitinfo_threads > 0) { kcd_exit_on_error(kcdata_get_memory_addr_for_array(stackshot_kcdata_p, STACKSHOT_KCTYPE_THREAD_WAITINFO, @@ -1815,7 +1890,15 @@ kdp_stackshot_record_task(struct stackshot_context *ctx, task_t task) thread_waitinfo = (thread_waitinfo_t *)out_addr; } - if (num_delta_thread_snapshots > 0 || num_nonrunnable_threads > 0 || num_waitinfo_threads > 0) { + if (num_turnstileinfo_threads > 0) { + /* get space for the turnstile info */ + kcd_exit_on_error(kcdata_get_memory_addr_for_array(stackshot_kcdata_p, STACKSHOT_KCTYPE_THREAD_TURNSTILEINFO, + sizeof(thread_turnstileinfo_t), num_turnstileinfo_threads, &out_addr)); + thread_turnstileinfo = (thread_turnstileinfo_t *)out_addr; + } + + if (num_delta_thread_snapshots > 0 || num_nonrunnable_threads > 0 || + num_waitinfo_threads > 0 || num_turnstileinfo_threads > 0) { queue_iterate(&task->threads, thread, thread_t, task_threads) { if (active_kthreads_only_p && thread->kernel_stack == 0) { @@ -1823,10 +1906,18 @@ kdp_stackshot_record_task(struct stackshot_context *ctx, task_t task) } /* If we want owner info, we should capture it regardless of its classification */ - if (save_owner_info && stackshot_thread_has_valid_waitinfo(thread)) { - stackshot_thread_wait_owner_info( - thread, - &thread_waitinfo[current_waitinfo_index++]); + if (save_owner_info) { + if (stackshot_thread_has_valid_waitinfo(thread)) { + stackshot_thread_wait_owner_info( + thread, + &thread_waitinfo[current_waitinfo_index++]); + } + + if (stackshot_thread_has_valid_turnstileinfo(thread)) { + stackshot_thread_turnstileinfo( + thread, + &thread_turnstileinfo[current_turnstileinfo_index++]); + } } boolean_t thread_on_core; @@ -1883,6 +1974,13 @@ kdp_stackshot_record_task(struct stackshot_context *ctx, task_t task) kcd_exit_on_error(kcdata_record_shared_cache_info(stackshot_kcdata_p, task, task_snap_ss_flags)); kcd_exit_on_error(kcdata_record_uuid_info(stackshot_kcdata_p, task, ctx->trace_flags, have_pmap, task_snap_ss_flags)); } + +#if INTERRUPT_MASKED_DEBUG && MONOTONIC + if (!panic_stackshot) { + kcd_exit_on_error(kcdata_add_uint64_with_description(stackshot_kcdata_p, (mt_cur_cpu_cycles() - task_begin_cpu_cycle_count), + "task_cpu_cycle_count")); + } +#endif /* mark end of task snapshot data */ kcd_exit_on_error(kcdata_add_container_marker(stackshot_kcdata_p, KCDATA_TYPE_CONTAINER_END, STACKSHOT_KCCONTAINER_TASK, task_uniqueid)); @@ -1906,6 +2004,14 @@ kdp_stackshot_kcdata_format(int pid, uint32_t trace_flags, uint32_t * pBytesTrac uint32_t length_to_copy = 0, tmp32 = 0; abs_time = mach_absolute_time(); +#if INTERRUPT_MASKED_DEBUG && MONOTONIC + uint64_t stackshot_begin_cpu_cycle_count = 0; + + if (!panic_stackshot) { + stackshot_begin_cpu_cycle_count = mt_cur_cpu_cycles(); + } +#endif + /* process the flags */ boolean_t collect_delta_stackshot = ((trace_flags & STACKSHOT_COLLECT_DELTA_SNAPSHOT) != 0); boolean_t use_fault_path = ((trace_flags & (STACKSHOT_ENABLE_UUID_FAULTING | STACKSHOT_ENABLE_BT_FAULTING)) != 0); @@ -2020,6 +2126,15 @@ kdp_stackshot_kcdata_format(int pid, uint32_t trace_flags, uint32_t * pBytesTrac #if CONFIG_COALITIONS int num_coalitions = 0; struct jetsam_coalition_snapshot *coalitions = NULL; + +#if INTERRUPT_MASKED_DEBUG && MONOTONIC + uint64_t coalition_begin_cpu_cycle_count = 0; + + if (!panic_stackshot && (trace_flags & STACKSHOT_SAVE_JETSAM_COALITIONS)) { + coalition_begin_cpu_cycle_count = mt_cur_cpu_cycles(); + } +#endif + /* Iterate over coalitions */ if (trace_flags & STACKSHOT_SAVE_JETSAM_COALITIONS) { if (coalition_iterate_stackshot(stackshot_coalition_jetsam_count, &num_coalitions, COALITION_TYPE_JETSAM) != KERN_SUCCESS) { @@ -2037,6 +2152,12 @@ kdp_stackshot_kcdata_format(int pid, uint32_t trace_flags, uint32_t * pBytesTrac goto error_exit; } } +#if INTERRUPT_MASKED_DEBUG && MONOTONIC + if (!panic_stackshot && (coalition_begin_cpu_cycle_count != 0)) { + kcd_exit_on_error(kcdata_add_uint64_with_description(stackshot_kcdata_p, (mt_cur_cpu_cycles() - coalition_begin_cpu_cycle_count), + "coalitions_cpu_cycle_count")); + } +#endif #else trace_flags &= ~(STACKSHOT_SAVE_JETSAM_COALITIONS); #endif /* CONFIG_COALITIONS */ @@ -2089,6 +2210,13 @@ kdp_stackshot_kcdata_format(int pid, uint32_t trace_flags, uint32_t * pBytesTrac kcd_exit_on_error(kcdata_add_uint32_with_description(stackshot_kcdata_p, trace_flags, "stackshot_out_flags")); +#if INTERRUPT_MASKED_DEBUG && MONOTONIC + if (!panic_stackshot) { + kcd_exit_on_error(kcdata_add_uint64_with_description(stackshot_kcdata_p, (mt_cur_cpu_cycles() - stackshot_begin_cpu_cycle_count), + "stackshot_total_cpu_cycle_cnt")); + } +#endif + kcd_exit_on_error(kcdata_write_buffer_end(stackshot_kcdata_p)); /* === END of populating stackshot data === */ @@ -2294,7 +2422,7 @@ boolean_t kdp_copyin_word( task_t task, uint64_t addr, uint64_t *result, boolean_t try_fault, uint32_t *kdp_fault_results) { - if (task_has_64Bit_data(task)) { + if (task_has_64Bit_addr(task)) { return kdp_copyin(task->map, addr, result, sizeof(uint64_t), try_fault, kdp_fault_results); } else { uint32_t buf; @@ -2304,6 +2432,46 @@ kdp_copyin_word( } } +int +kdp_copyin_string( + task_t task, uint64_t addr, char *buf, int buf_sz, boolean_t try_fault, uint32_t *kdp_fault_results) +{ + int i; + uint64_t validated = 0, valid_from; + uint64_t phys_src, phys_dest; + + for (i = 0; i < buf_sz; i++) { + if (validated == 0) { + valid_from = i; + phys_src = kdp_find_phys(task->map, addr + i, try_fault, kdp_fault_results); + phys_dest = kvtophys((vm_offset_t)&buf[i]); + uint64_t src_rem = PAGE_SIZE - (phys_src & PAGE_MASK); + uint64_t dst_rem = PAGE_SIZE - (phys_dest & PAGE_MASK); + if (phys_src && phys_dest) { + validated = MIN(src_rem, dst_rem); + if (validated) { + bcopy_phys(phys_src, phys_dest, 1); + validated--; + } else { + return 0; + } + } else { + return 0; + } + } else { + bcopy_phys(phys_src + (i - valid_from), phys_dest + (i - valid_from), 1); + validated--; + } + + if (buf[i] == '\0') { + return i + 1; + } + } + + /* ran out of space */ + return -1; +} + boolean_t kdp_copyin(vm_map_t map, uint64_t uaddr, void *dest, size_t size, boolean_t try_fault, uint32_t *kdp_fault_results) { @@ -2416,13 +2584,7 @@ machine_trace_thread_get_kva(vm_offset_t cur_target_addr, vm_map_t map, uint32_t if (cur_phys_addr == 0) { return 0; } -#if __x86_64__ - kern_virt_target_addr = (vm_offset_t) PHYSMAP_PTOV(cur_phys_addr); -#elif __arm__ || __arm64__ kern_virt_target_addr = phystokv(cur_phys_addr); -#else -#error Oh come on... we should really unify the physical -> kernel virtual interface -#endif prev_target_page = cur_target_page; prev_target_kva = (kern_virt_target_addr & ~PAGE_MASK); validate_next_addr = FALSE; @@ -2526,11 +2688,33 @@ stackshot_thread_has_valid_waitinfo(thread_t thread) } } +/* Determine if a thread has turnstileinfo that stackshot can provide */ +static int +stackshot_thread_has_valid_turnstileinfo(thread_t thread) +{ + struct turnstile *ts = thread_get_waiting_turnstile(thread); + + return stackshot_thread_has_valid_waitinfo(thread) && + ts != TURNSTILE_NULL; +} + +static void +stackshot_thread_turnstileinfo(thread_t thread, thread_turnstileinfo_t *tsinfo) +{ + struct turnstile *ts; + + /* acquire turnstile information and store it in the stackshot */ + ts = thread_get_waiting_turnstile(thread); + tsinfo->waiter = thread_tid(thread); + kdp_turnstile_fill_tsinfo(ts, tsinfo); +} + static void stackshot_thread_wait_owner_info(thread_t thread, thread_waitinfo_t *waitinfo) { - waitinfo->waiter = thread_tid(thread); - waitinfo->wait_type = thread->block_hint; + waitinfo->waiter = thread_tid(thread); + waitinfo->wait_type = thread->block_hint; + switch (waitinfo->wait_type) { case kThreadWaitKernelMutex: kdp_lck_mtx_find_owner(thread->waitq, thread->wait_event, waitinfo); @@ -2564,6 +2748,9 @@ stackshot_thread_wait_owner_info(thread_t thread, thread_waitinfo_t *waitinfo) case kThreadWaitOnProcess: kdp_wait4_find_process(thread, thread->wait_event, waitinfo); break; + case kThreadWaitSleepWithInheritor: + kdp_sleep_with_inheritor_find_owner(thread->waitq, thread->wait_event, waitinfo); + break; default: waitinfo->owner = 0; waitinfo->context = 0; diff --git a/osfmk/kern/kern_types.h b/osfmk/kern/kern_types.h index 556f67273..3e9f12902 100644 --- a/osfmk/kern/kern_types.h +++ b/osfmk/kern/kern_types.h @@ -81,7 +81,7 @@ typedef int wait_result_t; #define THREAD_NOT_WAITING 10 /* thread didn't need to wait */ typedef void (*thread_continue_t)(void *, wait_result_t); -#define THREAD_CONTINUE_NULL ((thread_continue_t) 0) +#define THREAD_CONTINUE_NULL ((thread_continue_t) NULL) /* * Interruptible flag for waits. diff --git a/osfmk/kern/kmod.c b/osfmk/kern/kmod.c index 287a21382..10ff9f8de 100644 --- a/osfmk/kern/kmod.c +++ b/osfmk/kern/kmod.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2012 Apple Inc. All rights reserved. + * Copyright (c) 2000-2019 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -126,7 +126,7 @@ kmod_control( { NOT_SUPPORTED_KERNEL(); return KERN_NOT_SUPPORTED; -}; +} /********************************************************************/ kern_return_t diff --git a/osfmk/kern/ledger.c b/osfmk/kern/ledger.c index 7f7bba139..a0d925872 100644 --- a/osfmk/kern/ledger.c +++ b/osfmk/kern/ledger.c @@ -34,6 +34,7 @@ #include #include #include +#include #include #include @@ -96,6 +97,7 @@ struct entry_template { }; lck_grp_t ledger_lck_grp; +os_refgrp_decl(static, ledger_refgrp, "ledger", NULL); /* * Modifying the reference count, table size, or table contents requires @@ -206,6 +208,41 @@ ledger_template_create(const char *name) return template; } +ledger_template_t +ledger_template_copy(ledger_template_t template, const char *name) +{ + struct entry_template * new_entries = NULL; + ledger_template_t new_template = ledger_template_create(name); + + if (new_template == NULL) { + return new_template; + } + + template_lock(template); + assert(template->lt_initialized); + + new_entries = (struct entry_template *) + kalloc(sizeof(struct entry_template) * template->lt_table_size); + + if (new_entries) { + /* Copy the template entries. */ + bcopy(template->lt_entries, new_entries, sizeof(struct entry_template) * template->lt_table_size); + kfree(new_template->lt_entries, sizeof(struct entry_template) * new_template->lt_table_size); + + new_template->lt_entries = new_entries; + new_template->lt_table_size = template->lt_table_size; + new_template->lt_cnt = template->lt_cnt; + } else { + /* Tear down the new template; we've failed. :( */ + ledger_template_dereference(new_template); + new_template = NULL; + } + + template_unlock(template); + + return new_template; +} + void ledger_template_dereference(ledger_template_t template) { @@ -214,6 +251,8 @@ ledger_template_dereference(ledger_template_t template) template_unlock(template); if (template->lt_refs == 0) { + kfree(template->lt_entries, sizeof(struct entry_template) * template->lt_table_size); + lck_mtx_destroy(&template->lt_lock, &ledger_lck_grp); kfree(template, sizeof(*template)); } } @@ -385,7 +424,7 @@ ledger_instantiate(ledger_template_t template, int entry_type) ledger->l_template = template; ledger->l_id = ledger_cnt++; - os_ref_init(&ledger->l_refs, NULL); + os_ref_init(&ledger->l_refs, &ledger_refgrp); ledger->l_size = (int32_t)cnt; template_lock(template); @@ -433,35 +472,25 @@ flag_clear(volatile uint32_t *flags, uint32_t bit) /* * Take a reference on a ledger */ -kern_return_t +void ledger_reference(ledger_t ledger) { if (!LEDGER_VALID(ledger)) { - return KERN_INVALID_ARGUMENT; - } - os_ref_retain(&ledger->l_refs); - return KERN_SUCCESS; -} - -int -ledger_reference_count(ledger_t ledger) -{ - if (!LEDGER_VALID(ledger)) { - return -1; + return; } - return os_ref_get_count(&ledger->l_refs); + os_ref_retain(&ledger->l_refs); } /* * Remove a reference on a ledger. If this is the last reference, * deallocate the unused ledger. */ -kern_return_t +void ledger_dereference(ledger_t ledger) { if (!LEDGER_VALID(ledger)) { - return KERN_INVALID_ARGUMENT; + return; } if (os_ref_release(&ledger->l_refs) == 0) { @@ -471,8 +500,6 @@ ledger_dereference(ledger_t ledger) pmap_ledger_free(ledger); } } - - return KERN_SUCCESS; } /* @@ -828,7 +855,7 @@ ledger_rollup(ledger_t to_ledger, ledger_t from_ledger) { int i; - assert(to_ledger->l_template == from_ledger->l_template); + assert(to_ledger->l_template->lt_cnt == from_ledger->l_template->lt_cnt); for (i = 0; i < to_ledger->l_size; i++) { ledger_rollup_entry(to_ledger, from_ledger, i); @@ -847,7 +874,7 @@ ledger_rollup_entry(ledger_t to_ledger, ledger_t from_ledger, int entry) { struct ledger_entry *from_le, *to_le; - assert(to_ledger->l_template == from_ledger->l_template); + assert(to_ledger->l_template->lt_cnt == from_ledger->l_template->lt_cnt); if (ENTRY_VALID(from_ledger, entry) && ENTRY_VALID(to_ledger, entry)) { from_le = &from_ledger->l_entries[entry]; to_le = &to_ledger->l_entries[entry]; @@ -1305,6 +1332,7 @@ ledger_ast(thread_t thread) { struct ledger *l = thread->t_ledger; struct ledger *thl; + struct ledger *coalition_ledger; uint32_t block; uint64_t now; uint8_t task_flags; @@ -1388,6 +1416,11 @@ top: } block |= ledger_check_needblock(l, now); + coalition_ledger = coalition_ledger_get_from_task(task); + if (LEDGER_VALID(coalition_ledger)) { + block |= ledger_check_needblock(coalition_ledger, now); + } + ledger_dereference(coalition_ledger); /* * If we are supposed to block on the availability of one or more * resources, find the first entry in deficit for which we should wait. @@ -1453,7 +1486,7 @@ ledger_check_needblock(ledger_t l, uint64_t now) if (le->le_flags & LF_REFILL_SCHEDULED) { assert(!(le->le_flags & LF_TRACKING_MAX)); - if ((le->_le.le_refill.le_last_refill + le->_le.le_refill.le_refill_period) > now) { + if ((le->_le.le_refill.le_last_refill + le->_le.le_refill.le_refill_period) <= now) { ledger_refill(now, l, i); if (limit_exceeded(le) == FALSE) { continue; diff --git a/osfmk/kern/ledger.h b/osfmk/kern/ledger.h index 3e3e6c323..9be77bb0c 100644 --- a/osfmk/kern/ledger.h +++ b/osfmk/kern/ledger.h @@ -96,7 +96,7 @@ struct ledger_entry { struct ledger { uint64_t l_id; - struct os_refcnt l_refs; + os_refcnt_t l_refs; int32_t l_size; struct ledger_template *l_template; struct ledger_entry l_entries[0] __attribute__((aligned(8))); @@ -141,6 +141,7 @@ typedef void (*ledger_callback_t)(int warning, const void * param0, const void * extern void ledger_init(void); extern ledger_template_t ledger_template_create(const char *name); +extern ledger_template_t ledger_template_copy(ledger_template_t template, const char *name); extern void ledger_template_dereference(ledger_template_t template); extern int ledger_entry_add(ledger_template_t template, const char *key, const char *group, const char *units); @@ -207,9 +208,8 @@ extern kern_return_t ledger_rollup_entry(ledger_t to_ledger, ledger_t from_ledge extern void ledger_ast(thread_t thread); -extern int ledger_reference_count(ledger_t ledger); -extern kern_return_t ledger_reference(ledger_t ledger); -extern kern_return_t ledger_dereference(ledger_t ledger); +extern void ledger_reference(ledger_t ledger); +extern void ledger_dereference(ledger_t ledger); /* Support for ledger() syscall */ #ifdef LEDGER_DEBUG diff --git a/osfmk/kern/lock_group.h b/osfmk/kern/lock_group.h index 56472c560..e677ded1a 100644 --- a/osfmk/kern/lock_group.h +++ b/osfmk/kern/lock_group.h @@ -31,8 +31,7 @@ #include #include -#define LCK_GRP_NULL (lck_grp_t *)0 - +#define LCK_GRP_NULL (lck_grp_t *)NULL typedef unsigned int lck_type_t; @@ -42,6 +41,7 @@ typedef unsigned int lck_type_t; #if XNU_KERNEL_PRIVATE +#include /* * Arguments wrapped in LCK_GRP_ARG() will be elided * when LOCK_STATS is not set. @@ -86,7 +86,7 @@ typedef struct _lck_grp_stats_ { typedef struct _lck_grp_ { queue_chain_t lck_grp_link; - uint32_t lck_grp_refcnt; + os_refcnt_t lck_grp_refcnt; uint32_t lck_grp_spincnt; uint32_t lck_grp_mtxcnt; uint32_t lck_grp_rwcnt; @@ -99,6 +99,7 @@ typedef struct _lck_grp_ { typedef struct _lck_grp_ lck_grp_t; #endif /* XNU_KERNEL_PRIVATE */ + #ifdef MACH_KERNEL_PRIVATE typedef struct _lck_grp_attr_ { uint32_t grp_attr_val; @@ -113,7 +114,7 @@ extern lck_grp_attr_t LockDefaultGroupAttr; typedef struct __lck_grp_attr__ lck_grp_attr_t; #endif /* MACH_KERNEL_PRIVATE */ -#define LCK_GRP_ATTR_NULL (lck_grp_attr_t *)0 +#define LCK_GRP_ATTR_NULL (lck_grp_attr_t *)NULL __BEGIN_DECLS @@ -157,7 +158,6 @@ extern void lck_grp_lckcnt_incr( extern void lck_grp_lckcnt_decr( lck_grp_t *grp, lck_type_t lck_type); - #endif /* MACH_KERNEL_PRIVATE */ #endif /* _KERN_LOCK_GROUP_H */ diff --git a/osfmk/kern/locks.c b/osfmk/kern/locks.c index 04106709b..78aee369c 100644 --- a/osfmk/kern/locks.c +++ b/osfmk/kern/locks.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2016 Apple Inc. All rights reserved. + * Copyright (c) 2000-2019 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -54,7 +54,6 @@ * the rights to redistribute these changes. */ -#define ATOMIC_PRIVATE 1 #define LOCK_PRIVATE 1 #include @@ -90,25 +89,16 @@ #define ALIGN_TEST(p, t) do{}while(0) #endif -/* Silence the volatile to _Atomic cast warning */ -#define ATOMIC_CAST(t, p) ((_Atomic t*)(uintptr_t)(p)) - -/* Enforce program order of loads and stores. */ -#define ordered_load(target, type) \ - __c11_atomic_load((_Atomic type *)(target), memory_order_relaxed) -#define ordered_store(target, type, value) \ - __c11_atomic_store((_Atomic type *)(target), value, memory_order_relaxed) - -#define ordered_load_hw(lock) ordered_load(&(lock)->lock_data, uintptr_t) -#define ordered_store_hw(lock, value) ordered_store(&(lock)->lock_data, uintptr_t, (value)) - #define NOINLINE __attribute__((noinline)) +#define ordered_load_hw(lock) os_atomic_load(&(lock)->lock_data, compiler_acq_rel) +#define ordered_store_hw(lock, value) os_atomic_store(&(lock)->lock_data, (value), compiler_acq_rel) + queue_head_t lck_grp_queue; unsigned int lck_grp_cnt; -decl_lck_mtx_data(, lck_grp_lock) +decl_lck_mtx_data(, lck_grp_lock); static lck_mtx_ext_t lck_grp_lock_ext; SECURITY_READ_ONLY_LATE(boolean_t) spinlock_timeout_panic = TRUE; @@ -175,7 +165,7 @@ lck_mod_init( LockCompatGroup.lck_grp_attr |= LCK_GRP_ATTR_TIME_STAT; } - LockCompatGroup.lck_grp_refcnt = 1; + os_ref_init(&LockCompatGroup.lck_grp_refcnt, NULL); enqueue_tail(&lck_grp_queue, (queue_entry_t)&LockCompatGroup); lck_grp_cnt = 1; @@ -228,7 +218,7 @@ void lck_grp_attr_setstat( lck_grp_attr_t *attr) { - (void)hw_atomic_or(&attr->grp_attr_val, LCK_GRP_ATTR_STAT); + os_atomic_or(&attr->grp_attr_val, LCK_GRP_ATTR_STAT, relaxed); } @@ -307,7 +297,7 @@ lck_grp_init(lck_grp_t * grp, const char * grp_name, lck_grp_attr_t * attr) #endif /* LOCK_STATS */ } - grp->lck_grp_refcnt = 1; + os_ref_init(&grp->lck_grp_refcnt, NULL); lck_mtx_lock(&lck_grp_lock); enqueue_tail(&lck_grp_queue, (queue_entry_t)grp); @@ -339,7 +329,7 @@ void lck_grp_reference( lck_grp_t *grp) { - (void)hw_atomic_add(&grp->lck_grp_refcnt, 1); + os_ref_retain(&grp->lck_grp_refcnt); } @@ -351,9 +341,11 @@ void lck_grp_deallocate( lck_grp_t *grp) { - if (hw_atomic_sub(&grp->lck_grp_refcnt, 1) == 0) { - kfree(grp, sizeof(lck_grp_t)); + if (os_ref_release(&grp->lck_grp_refcnt) != 0) { + return; } + + kfree(grp, sizeof(lck_grp_t)); } /* @@ -381,7 +373,7 @@ lck_grp_lckcnt_incr( return panic("lck_grp_lckcnt_incr(): invalid lock type: %d\n", lck_type); } - (void)hw_atomic_add(lckcnt, 1); + os_atomic_inc(lckcnt, relaxed); } /* @@ -411,7 +403,7 @@ lck_grp_lckcnt_decr( return; } - updated = (int)hw_atomic_sub(lckcnt, 1); + updated = os_atomic_dec(lckcnt, relaxed); assert(updated >= 0); } @@ -467,7 +459,7 @@ void lck_attr_setdebug( lck_attr_t *attr) { - (void)hw_atomic_or(&attr->lck_attr_val, LCK_ATTR_DEBUG); + os_atomic_or(&attr->lck_attr_val, LCK_ATTR_DEBUG, relaxed); } /* @@ -477,7 +469,7 @@ void lck_attr_cleardebug( lck_attr_t *attr) { - (void)hw_atomic_and(&attr->lck_attr_val, ~LCK_ATTR_DEBUG); + os_atomic_andnot(&attr->lck_attr_val, LCK_ATTR_DEBUG, relaxed); } @@ -488,7 +480,7 @@ void lck_attr_rw_shared_priority( lck_attr_t *attr) { - (void)hw_atomic_or(&attr->lck_attr_val, LCK_ATTR_RW_SHARED_PRIORITY); + os_atomic_or(&attr->lck_attr_val, LCK_ATTR_RW_SHARED_PRIORITY, relaxed); } @@ -513,6 +505,31 @@ hw_lock_init(hw_lock_t lock) ordered_store_hw(lock, 0); } +#if __SMP__ +static inline bool +hw_lock_trylock_contended(hw_lock_t lock, uintptr_t newval) +{ +#if OS_ATOMIC_USE_LLSC + uintptr_t oldval; + os_atomic_rmw_loop(&lock->lock_data, oldval, newval, acquire, { + if (oldval != 0) { + wait_for_event(); // clears the monitor so we don't need give_up() + return false; + } + }); + return true; +#else // !OS_ATOMIC_USE_LLSC +#if OS_ATOMIC_HAS_LLSC + uintptr_t oldval = os_atomic_load_exclusive(&lock->lock_data, relaxed); + if (oldval != 0) { + wait_for_event(); // clears the monitor so we don't need give_up() + return false; + } +#endif // OS_ATOMIC_HAS_LLSC + return os_atomic_cmpxchg(&lock->lock_data, 0, newval, acquire); +#endif // !OS_ATOMIC_USE_LLSC +} + /* * Routine: hw_lock_lock_contended * @@ -520,8 +537,6 @@ hw_lock_init(hw_lock_t lock) * timeout is in mach_absolute_time ticks. Called with * preemption disabled. */ - -#if __SMP__ static unsigned int NOINLINE hw_lock_lock_contended(hw_lock_t lock, uintptr_t data, uint64_t timeout, boolean_t do_panic LCK_GRP_ARG(lck_grp_t *grp)) { @@ -551,8 +566,7 @@ hw_lock_lock_contended(hw_lock_t lock, uintptr_t data, uint64_t timeout, boolean continue; } #endif - if (atomic_compare_exchange(&lock->lock_data, 0, data, - memory_order_acquire_smp, TRUE)) { + if (hw_lock_trylock_contended(lock, data)) { #if CONFIG_DTRACE || LOCK_STATS if (__improbable(stat_enabled)) { lck_grp_spin_update_spin(lock LCK_GRP_ARG(grp), mach_absolute_time() - begin); @@ -578,6 +592,42 @@ hw_lock_lock_contended(hw_lock_t lock, uintptr_t data, uint64_t timeout, boolean } #endif // __SMP__ +void * +hw_wait_while_equals(void **address, void *current) +{ +#if __SMP__ + void *v; + uint64_t end = 0; + + for (;;) { + for (int i = 0; i < LOCK_SNOOP_SPINS; i++) { + cpu_pause(); +#if OS_ATOMIC_HAS_LLSC + v = os_atomic_load_exclusive(address, relaxed); + if (__probable(v != current)) { + os_atomic_clear_exclusive(); + return v; + } + wait_for_event(); +#else + v = os_atomic_load(address, relaxed); + if (__probable(v != current)) { + return v; + } +#endif // OS_ATOMIC_HAS_LLSC + } + if (end == 0) { + end = ml_get_timebase() + LOCK_PANIC_TIMEOUT; + } else if (ml_get_timebase() >= end) { + panic("Wait while equals timeout @ *%p == %p", address, v); + } + } +#else // !__SMP__ + panic("Value at %p is %p", address, current); + __builtin_unreachable(); +#endif // !__SMP__ +} + static inline void hw_lock_lock_internal(hw_lock_t lock, thread_t thread LCK_GRP_ARG(lck_grp_t *grp)) { @@ -585,14 +635,12 @@ hw_lock_lock_internal(hw_lock_t lock, thread_t thread LCK_GRP_ARG(lck_grp_t *grp state = LCK_MTX_THREAD_TO_STATE(thread) | PLATFORM_LCK_ILOCK; #if __SMP__ - #if LOCK_PRETEST if (ordered_load_hw(lock)) { goto contended; } #endif // LOCK_PRETEST - if (atomic_compare_exchange(&lock->lock_data, 0, state, - memory_order_acquire_smp, TRUE)) { + if (hw_lock_trylock_contended(lock, state)) { goto end; } #if LOCK_PRETEST @@ -659,14 +707,12 @@ int disable_preemption_for_thread(thread); state = LCK_MTX_THREAD_TO_STATE(thread) | PLATFORM_LCK_ILOCK; #if __SMP__ - #if LOCK_PRETEST if (ordered_load_hw(lock)) { goto contended; } #endif // LOCK_PRETEST - if (atomic_compare_exchange(&lock->lock_data, 0, state, - memory_order_acquire_smp, TRUE)) { + if (hw_lock_trylock_contended(lock, state)) { success = 1; goto end; } @@ -704,8 +750,8 @@ hw_lock_try_internal(hw_lock_t lock, thread_t thread LCK_GRP_ARG(lck_grp_t *grp) goto failed; } #endif // LOCK_PRETEST - success = atomic_compare_exchange(&lock->lock_data, 0, LCK_MTX_THREAD_TO_STATE(thread) | PLATFORM_LCK_ILOCK, - memory_order_acquire_smp, FALSE); + success = os_atomic_cmpxchg(&lock->lock_data, 0, + LCK_MTX_THREAD_TO_STATE(thread) | PLATFORM_LCK_ILOCK, acquire); #else if (lock->lock_data == 0) { lock->lock_data = LCK_MTX_THREAD_TO_STATE(thread) | PLATFORM_LCK_ILOCK; @@ -754,7 +800,7 @@ int static inline void hw_lock_unlock_internal(hw_lock_t lock) { - __c11_atomic_store((_Atomic uintptr_t *)&lock->lock_data, 0, memory_order_release_smp); + os_atomic_store(&lock->lock_data, 0, release); #if __arm__ || __arm64__ // ARM tests are only for open-source exclusion set_event(); @@ -790,6 +836,198 @@ hw_lock_held(hw_lock_t lock) return ordered_load_hw(lock) != 0; } +#if __SMP__ +static unsigned int +hw_lock_bit_to_contended(hw_lock_bit_t *lock, uint32_t mask, uint32_t timeout LCK_GRP_ARG(lck_grp_t *grp)); +#endif + +static inline unsigned int +hw_lock_bit_to_internal(hw_lock_bit_t *lock, unsigned int bit, uint32_t timeout LCK_GRP_ARG(lck_grp_t *grp)) +{ + unsigned int success = 0; + uint32_t mask = (1 << bit); +#if !__SMP__ + uint32_t state; +#endif + +#if __SMP__ + if (__improbable(!hw_atomic_test_and_set32(lock, mask, mask, memory_order_acquire, FALSE))) { + success = hw_lock_bit_to_contended(lock, mask, timeout LCK_GRP_ARG(grp)); + } else { + success = 1; + } +#else // __SMP__ + (void)timeout; + state = ordered_load_bit(lock); + if (!(mask & state)) { + ordered_store_bit(lock, state | mask); + success = 1; + } +#endif // __SMP__ + + if (success) { + lck_grp_spin_update_held(lock LCK_GRP_ARG(grp)); + } + + return success; +} + +unsigned +int +(hw_lock_bit_to)(hw_lock_bit_t * lock, unsigned int bit, uint32_t timeout LCK_GRP_ARG(lck_grp_t *grp)) +{ + _disable_preemption(); + return hw_lock_bit_to_internal(lock, bit, timeout LCK_GRP_ARG(grp)); +} + +#if __SMP__ +static unsigned int NOINLINE +hw_lock_bit_to_contended(hw_lock_bit_t *lock, uint32_t mask, uint32_t timeout LCK_GRP_ARG(lck_grp_t *grp)) +{ + uint64_t end = 0; + int i; +#if CONFIG_DTRACE || LOCK_STATS + uint64_t begin = 0; + boolean_t stat_enabled = lck_grp_spin_spin_enabled(lock LCK_GRP_ARG(grp)); +#endif /* CONFIG_DTRACE || LOCK_STATS */ + +#if LOCK_STATS || CONFIG_DTRACE + if (__improbable(stat_enabled)) { + begin = mach_absolute_time(); + } +#endif /* LOCK_STATS || CONFIG_DTRACE */ + for (;;) { + for (i = 0; i < LOCK_SNOOP_SPINS; i++) { + // Always load-exclusive before wfe + // This grabs the monitor and wakes up on a release event + if (hw_atomic_test_and_set32(lock, mask, mask, memory_order_acquire, TRUE)) { + goto end; + } + } + if (end == 0) { + end = ml_get_timebase() + timeout; + } else if (ml_get_timebase() >= end) { + break; + } + } + return 0; +end: +#if CONFIG_DTRACE || LOCK_STATS + if (__improbable(stat_enabled)) { + lck_grp_spin_update_spin(lock LCK_GRP_ARG(grp), mach_absolute_time() - begin); + } + lck_grp_spin_update_miss(lock LCK_GRP_ARG(grp)); +#endif /* CONFIG_DTRACE || LCK_GRP_STAT */ + + return 1; +} +#endif // __SMP__ + +void +(hw_lock_bit)(hw_lock_bit_t * lock, unsigned int bit LCK_GRP_ARG(lck_grp_t *grp)) +{ + if (hw_lock_bit_to(lock, bit, LOCK_PANIC_TIMEOUT, LCK_GRP_PROBEARG(grp))) { + return; + } +#if __SMP__ + panic("hw_lock_bit(): timed out (%p)", lock); +#else + panic("hw_lock_bit(): interlock held (%p)", lock); +#endif +} + +void +(hw_lock_bit_nopreempt)(hw_lock_bit_t * lock, unsigned int bit LCK_GRP_ARG(lck_grp_t *grp)) +{ + if (__improbable(get_preemption_level() == 0)) { + panic("Attempt to take no-preempt bitlock %p in preemptible context", lock); + } + if (hw_lock_bit_to_internal(lock, bit, LOCK_PANIC_TIMEOUT LCK_GRP_ARG(grp))) { + return; + } +#if __SMP__ + panic("hw_lock_bit_nopreempt(): timed out (%p)", lock); +#else + panic("hw_lock_bit_nopreempt(): interlock held (%p)", lock); +#endif +} + +unsigned +int +(hw_lock_bit_try)(hw_lock_bit_t * lock, unsigned int bit LCK_GRP_ARG(lck_grp_t *grp)) +{ + uint32_t mask = (1 << bit); +#if !__SMP__ + uint32_t state; +#endif + boolean_t success = FALSE; + + _disable_preemption(); +#if __SMP__ + // TODO: consider weak (non-looping) atomic test-and-set + success = hw_atomic_test_and_set32(lock, mask, mask, memory_order_acquire, FALSE); +#else + state = ordered_load_bit(lock); + if (!(mask & state)) { + ordered_store_bit(lock, state | mask); + success = TRUE; + } +#endif // __SMP__ + if (!success) { + _enable_preemption(); + } + + if (success) { + lck_grp_spin_update_held(lock LCK_GRP_ARG(grp)); + } + + return success; +} + +static inline void +hw_unlock_bit_internal(hw_lock_bit_t *lock, unsigned int bit) +{ + uint32_t mask = (1 << bit); +#if !__SMP__ + uint32_t state; +#endif + +#if __SMP__ + os_atomic_andnot(lock, mask, release); +#if __arm__ + set_event(); +#endif +#else // __SMP__ + state = ordered_load_bit(lock); + ordered_store_bit(lock, state & ~mask); +#endif // __SMP__ +#if CONFIG_DTRACE + LOCKSTAT_RECORD(LS_LCK_SPIN_UNLOCK_RELEASE, lock, bit); +#endif +} + +/* + * Routine: hw_unlock_bit + * + * Release spin-lock. The second parameter is the bit number to test and set. + * Decrement the preemption level. + */ +void +hw_unlock_bit(hw_lock_bit_t * lock, unsigned int bit) +{ + hw_unlock_bit_internal(lock, bit); + _enable_preemption(); +} + +void +hw_unlock_bit_nopreempt(hw_lock_bit_t * lock, unsigned int bit) +{ + if (__improbable(get_preemption_level() == 0)) { + panic("Attempt to release no-preempt bitlock %p in preemptible context", lock); + } + hw_unlock_bit_internal(lock, bit); +} + /* * Routine: lck_spin_sleep */ @@ -983,37 +1221,9 @@ lck_mtx_sleep_deadline( * The lock owner is always promoted to the max priority of all its waiters. * Max priority is capped at MAXPRI_PROMOTE. * - * lck_mtx_pri being set implies that the lock owner is promoted to at least lck_mtx_pri - * This prevents the thread from dropping in priority while holding a mutex - * (note: Intel locks currently don't do this, to avoid thread lock churn) - * - * thread->promotions has a +1 for every mutex currently promoting the thread - * and 1 for was_promoted_on_wakeup being set. - * TH_SFLAG_PROMOTED is set on a thread whenever it has any promotions - * from any mutex (i.e. thread->promotions != 0) - * - * was_promoted_on_wakeup is set on a thread which is woken up by a mutex when - * it raises the priority of the woken thread to match lck_mtx_pri. - * It can be set for multiple iterations of wait, fail to acquire, re-wait, etc - * was_promoted_on_wakeup being set always implies a +1 promotions count. - * * The last waiter is not given a promotion when it wakes up or acquires the lock. * When the last waiter is waking up, a new contender can always come in and * steal the lock without having to wait for the last waiter to make forward progress. - * - * lck_mtx_waiters has a +1 for every waiter currently between wait and acquire - * This prevents us from asserting that every wakeup wakes up a thread. - * This also causes excess thread_wakeup calls in the unlock path. - * It can only be fooled into thinking there are more waiters than are - * actually blocked, not less. - * It does allows us to reduce the complexity of the lock state. - * - * This also means that a starved bg thread as the last waiter could end up - * keeping the lock in the contended state for a long period of time, which - * may keep lck_mtx_pri artificially high for a very long time even though - * it is not participating or blocking anyone else. - * Intel locks don't have this problem because they can go uncontended - * as soon as there are no blocked threads involved. */ /* @@ -1034,9 +1244,10 @@ lck_mtx_sleep_deadline( void lck_mtx_lock_wait( lck_mtx_t *lck, - thread_t holder) + thread_t holder, + struct turnstile **ts) { - thread_t self = current_thread(); + thread_t thread = current_thread(); lck_mtx_t *mutex; __kdebug_only uintptr_t trace_lck = unslide_for_kdebug(lck); @@ -1057,64 +1268,27 @@ lck_mtx_lock_wait( KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_WAIT_CODE) | DBG_FUNC_START, trace_lck, (uintptr_t)thread_tid(thread), 0, 0, 0); - spl_t s = splsched(); - thread_lock(holder); - - assert_promotions_invariant(holder); - - if ((holder->sched_flags & TH_SFLAG_DEPRESS) == 0) { - assert(holder->sched_pri >= mutex->lck_mtx_pri); - } - - integer_t priority = self->sched_pri; - priority = MAX(priority, self->base_pri); - priority = MAX(priority, BASEPRI_DEFAULT); - priority = MIN(priority, MAXPRI_PROMOTE); - - if (mutex->lck_mtx_pri == 0) { - /* This is the first promotion for this mutex */ - if (holder->promotions++ == 0) { - /* This is the first promotion for holder */ - sched_thread_promote_to_pri(holder, priority, trace_lck); - } else { - /* Holder was previously promoted due to a different mutex, raise to match this one */ - sched_thread_update_promotion_to_pri(holder, priority, trace_lck); - } - } else { - /* Holder was previously promoted due to this mutex, check if the pri needs to go up */ - sched_thread_update_promotion_to_pri(holder, priority, trace_lck); - } - - assert(holder->promotions > 0); - assert(holder->promotion_priority >= priority); - - if ((holder->sched_flags & TH_SFLAG_DEPRESS) == 0) { - assert(holder->sched_pri >= mutex->lck_mtx_pri); - } - - assert_promotions_invariant(holder); - - thread_unlock(holder); - splx(s); + assert(thread->waiting_for_mutex == NULL); + thread->waiting_for_mutex = mutex; + mutex->lck_mtx_waiters++; - if (mutex->lck_mtx_pri < priority) { - mutex->lck_mtx_pri = priority; + if (*ts == NULL) { + *ts = turnstile_prepare((uintptr_t)mutex, NULL, TURNSTILE_NULL, TURNSTILE_KERNEL_MUTEX); } - if (self->waiting_for_mutex == NULL) { - self->waiting_for_mutex = mutex; - mutex->lck_mtx_waiters++; - } + struct turnstile *turnstile = *ts; + thread_set_pending_block_hint(thread, kThreadWaitKernelMutex); + turnstile_update_inheritor(turnstile, holder, (TURNSTILE_DELAYED_UPDATE | TURNSTILE_INHERITOR_THREAD)); - assert(self->waiting_for_mutex == mutex); + waitq_assert_wait64(&turnstile->ts_waitq, CAST_EVENT64_T(LCK_MTX_EVENT(mutex)), THREAD_UNINT | THREAD_WAIT_NOREPORT_USER, TIMEOUT_WAIT_FOREVER); - thread_set_pending_block_hint(self, kThreadWaitKernelMutex); - assert_wait(LCK_MTX_EVENT(mutex), THREAD_UNINT | THREAD_WAIT_NOREPORT_USER); lck_mtx_ilk_unlock(mutex); + turnstile_update_inheritor_complete(turnstile, TURNSTILE_INTERLOCK_NOT_HELD); + thread_block(THREAD_CONTINUE_NULL); - assert(mutex->lck_mtx_waiters > 0); + thread->waiting_for_mutex = NULL; KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_WAIT_CODE) | DBG_FUNC_END, 0, 0, 0, 0, 0); #if CONFIG_DTRACE @@ -1146,11 +1320,11 @@ lck_mtx_lock_wait( */ int lck_mtx_lock_acquire( - lck_mtx_t *lck) + lck_mtx_t *lck, + struct turnstile *ts) { thread_t thread = current_thread(); lck_mtx_t *mutex; - integer_t priority; if (lck->lck_mtx_tag != LCK_MTX_TAG_INDIRECT) { mutex = lck; @@ -1158,79 +1332,19 @@ lck_mtx_lock_acquire( mutex = &lck->lck_mtx_ptr->lck_mtx; } - /* - * If waiting_for_mutex is set, then this thread was previously blocked waiting on this lock - * If it's un-set, then this thread stole the lock from another waiter. - */ - if (thread->waiting_for_mutex == mutex) { - assert(mutex->lck_mtx_waiters > 0); - - thread->waiting_for_mutex = NULL; - mutex->lck_mtx_waiters--; - } - assert(thread->waiting_for_mutex == NULL); if (mutex->lck_mtx_waiters > 0) { - priority = mutex->lck_mtx_pri; - } else { - /* I was the last waiter, so the mutex is no longer promoted or contended */ - mutex->lck_mtx_pri = 0; - priority = 0; - } - - if (priority || thread->was_promoted_on_wakeup) { - __kdebug_only uintptr_t trace_lck = unslide_for_kdebug(lck); - - /* - * Note: was_promoted_on_wakeup can happen for multiple wakeups in a row without - * an intervening acquire if a thread keeps failing to acquire the lock - * - * If priority is true but not promoted on wakeup, - * then this is a lock steal of a promoted mutex, so it needs a ++ of promotions. - * - * If promoted on wakeup is true, but priority is not, - * then this is the last owner, and the last owner does not need a promotion. - */ - - spl_t s = splsched(); - thread_lock(thread); - - assert_promotions_invariant(thread); - - if (thread->was_promoted_on_wakeup) { - assert(thread->promotions > 0); + if (ts == NULL) { + ts = turnstile_prepare((uintptr_t)mutex, NULL, TURNSTILE_NULL, TURNSTILE_KERNEL_MUTEX); } - if (priority) { - if (thread->promotions++ == 0) { - /* This is the first promotion for holder */ - sched_thread_promote_to_pri(thread, priority, trace_lck); - } else { - /* - * Holder was previously promoted due to a different mutex, raise to match this one - * Or, this thread was promoted on wakeup but someone else later contended on mutex - * at higher priority before we got here - */ - sched_thread_update_promotion_to_pri(thread, priority, trace_lck); - } - } - - if (thread->was_promoted_on_wakeup) { - thread->was_promoted_on_wakeup = 0; - if (--thread->promotions == 0) { - sched_thread_unpromote(thread, trace_lck); - } - } - - assert_promotions_invariant(thread); - - if (priority && (thread->sched_flags & TH_SFLAG_DEPRESS) == 0) { - assert(thread->sched_pri >= priority); - } + turnstile_update_inheritor(ts, thread, (TURNSTILE_IMMEDIATE_UPDATE | TURNSTILE_INHERITOR_THREAD)); + turnstile_update_inheritor_complete(ts, TURNSTILE_INTERLOCK_HELD); + } - thread_unlock(thread); - splx(s); + if (ts != NULL) { + turnstile_complete((uintptr_t)mutex, NULL, NULL, TURNSTILE_KERNEL_MUTEX); } return mutex->lck_mtx_waiters; @@ -1243,11 +1357,10 @@ lck_mtx_lock_acquire( * * Called with the interlock locked. * - * TODO: the 'waiters' flag does not indicate waiters exist on the waitqueue, - * it indicates waiters exist between wait and acquire. - * This means that here we may do extra unneeded wakeups. + * NOTE: callers should call turnstile_clenup after + * dropping the interlock. */ -void +boolean_t lck_mtx_unlock_wakeup( lck_mtx_t *lck, thread_t holder) @@ -1255,6 +1368,8 @@ lck_mtx_unlock_wakeup( thread_t thread = current_thread(); lck_mtx_t *mutex; __kdebug_only uintptr_t trace_lck = unslide_for_kdebug(lck); + struct turnstile *ts; + kern_return_t did_wake; if (lck->lck_mtx_tag != LCK_MTX_TAG_INDIRECT) { mutex = lck; @@ -1270,88 +1385,29 @@ lck_mtx_unlock_wakeup( trace_lck, (uintptr_t)thread_tid(thread), 0, 0, 0); assert(mutex->lck_mtx_waiters > 0); - assert(thread->was_promoted_on_wakeup == 0); assert(thread->waiting_for_mutex == NULL); - /* - * The waiters count does not precisely match the number of threads on the waitqueue, - * therefore we cannot assert that we actually wake up a thread here - */ + ts = turnstile_prepare((uintptr_t)mutex, NULL, TURNSTILE_NULL, TURNSTILE_KERNEL_MUTEX); + if (mutex->lck_mtx_waiters > 1) { - thread_wakeup_one_with_pri(LCK_MTX_EVENT(lck), lck->lck_mtx_pri); + /* WAITQ_PROMOTE_ON_WAKE will call turnstile_update_inheritor on the wokenup thread */ + did_wake = waitq_wakeup64_one(&ts->ts_waitq, CAST_EVENT64_T(LCK_MTX_EVENT(mutex)), THREAD_AWAKENED, WAITQ_PROMOTE_ON_WAKE); } else { - thread_wakeup_one(LCK_MTX_EVENT(lck)); + did_wake = waitq_wakeup64_one(&ts->ts_waitq, CAST_EVENT64_T(LCK_MTX_EVENT(mutex)), THREAD_AWAKENED, WAITQ_ALL_PRIORITIES); + turnstile_update_inheritor(ts, NULL, TURNSTILE_IMMEDIATE_UPDATE); } + assert(did_wake == KERN_SUCCESS); - /* When mutex->lck_mtx_pri is set, it means means I as the owner have a promotion. */ - if (mutex->lck_mtx_pri) { - spl_t s = splsched(); - thread_lock(thread); - - assert(thread->promotions > 0); - - assert_promotions_invariant(thread); + turnstile_update_inheritor_complete(ts, TURNSTILE_INTERLOCK_HELD); + turnstile_complete((uintptr_t)mutex, NULL, NULL, TURNSTILE_KERNEL_MUTEX); - if (--thread->promotions == 0) { - sched_thread_unpromote(thread, trace_lck); - } - - assert_promotions_invariant(thread); - - thread_unlock(thread); - splx(s); - } + mutex->lck_mtx_waiters--; KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_UNLCK_WAKEUP_CODE) | DBG_FUNC_END, 0, 0, 0, 0, 0); -} - -/* - * Callout from the waitqueue code from inside thread_wakeup_one_with_pri - * At splsched, thread is pulled from waitq, still locked, not on runqueue yet - * - * We always make sure to set the promotion flag, even if the thread is already at this priority, - * so that it doesn't go down. - */ -void -lck_mtx_wakeup_adjust_pri(thread_t thread, integer_t priority) -{ - assert(priority <= MAXPRI_PROMOTE); - assert(thread->waiting_for_mutex != NULL); - - __kdebug_only uintptr_t trace_lck = unslide_for_kdebug(thread->waiting_for_mutex); - - assert_promotions_invariant(thread); - - if (thread->was_promoted_on_wakeup) { - /* Thread was previously promoted, but contended again */ - sched_thread_update_promotion_to_pri(thread, priority, trace_lck); - return; - } - - if (thread->promotions > 0 && priority <= thread->promotion_priority) { - /* - * Thread is already promoted to the right level, no need to do more - * I can draft off of another promotion here, which is OK - * because I know the thread will soon run acquire to get its own promotion - */ - assert((thread->sched_flags & TH_SFLAG_PROMOTED) == TH_SFLAG_PROMOTED); - return; - } - - thread->was_promoted_on_wakeup = 1; - - if (thread->promotions++ == 0) { - /* This is the first promotion for this thread */ - sched_thread_promote_to_pri(thread, priority, trace_lck); - } else { - /* Holder was previously promoted due to a different mutex, raise to match this one */ - sched_thread_update_promotion_to_pri(thread, priority, trace_lck); - } - assert_promotions_invariant(thread); + return mutex->lck_mtx_waiters > 0; } - /* * Routine: mutex_pause * @@ -1703,56 +1759,1495 @@ host_lockgroup_info( } /* - * Atomic primitives, prototyped in kern/simple_lock.h - * Noret versions are more efficient on some architectures + * sleep_with_inheritor and wakeup_with_inheritor KPI + * + * Functions that allow to sleep on an event and use turnstile to propagate the priority of the sleeping threads to + * the latest thread specified as inheritor. + * + * The inheritor management is delegated to the caller, the caller needs to store a thread identifier to provide to this functions to specified upon whom + * direct the push. The inheritor cannot run in user space while holding a push from an event. Therefore is the caller responsibility to call a + * wakeup_with_inheritor from inheritor before running in userspace or specify another inheritor before letting the old inheritor run in userspace. + * + * sleep_with_inheritor requires to hold a locking primitive while invoked, but wakeup_with_inheritor and change_sleep_inheritor don't require it. + * + * Turnstile requires a non blocking primitive as interlock to synchronize the turnstile data structure manipulation, threfore sleep_with_inheritor, change_sleep_inheritor and + * wakeup_with_inheritor will require the same interlock to manipulate turnstiles. + * If sleep_with_inheritor is associated with a locking primitive that can block (like lck_mtx_t or lck_rw_t), an handoff to a non blocking primitive is required before + * invoking any turnstile operation. + * + * All functions will save the turnstile associated with the event on the turnstile kernel hash table and will use the the turnstile kernel hash table bucket + * spinlock as the turnstile interlock. Because we do not want to hold interrupt disabled while holding the bucket interlock a new turnstile kernel hash table + * is instantiated for this KPI to manage the hash without interrupt disabled. + * Also: + * - all events on the system that hash on the same bucket will contend on the same spinlock. + * - every event will have a dedicated wait_queue. + * + * Different locking primitives can be associated with sleep_with_inheritor as long as the primitive_lock() and primitive_unlock() functions are provided to + * sleep_with_inheritor_turnstile to perform the handoff with the bucket spinlock. */ -uint32_t -hw_atomic_add(volatile uint32_t *dest, uint32_t delt) +kern_return_t +wakeup_with_inheritor_and_turnstile_type(event_t event, turnstile_type_t type, wait_result_t result, bool wake_one, lck_wake_action_t action, thread_t *thread_wokenup) { - ALIGN_TEST(dest, uint32_t); - return __c11_atomic_fetch_add(ATOMIC_CAST(uint32_t, dest), delt, memory_order_relaxed) + delt; -} + uint32_t index; + struct turnstile *ts = NULL; + kern_return_t ret = KERN_NOT_WAITING; + int priority; + thread_t wokeup; -uint32_t -hw_atomic_sub(volatile uint32_t *dest, uint32_t delt) -{ - ALIGN_TEST(dest, uint32_t); - return __c11_atomic_fetch_sub(ATOMIC_CAST(uint32_t, dest), delt, memory_order_relaxed) - delt; -} + /* + * the hash bucket spinlock is used as turnstile interlock + */ + turnstile_hash_bucket_lock((uintptr_t)event, &index, type); -uint32_t -hw_atomic_or(volatile uint32_t *dest, uint32_t mask) -{ - ALIGN_TEST(dest, uint32_t); - return __c11_atomic_fetch_or(ATOMIC_CAST(uint32_t, dest), mask, memory_order_relaxed) | mask; -} + ts = turnstile_prepare((uintptr_t)event, NULL, TURNSTILE_NULL, type); + if (wake_one) { + if (action == LCK_WAKE_DEFAULT) { + priority = WAITQ_PROMOTE_ON_WAKE; + } else { + assert(action == LCK_WAKE_DO_NOT_TRANSFER_PUSH); + priority = WAITQ_ALL_PRIORITIES; + } + + /* + * WAITQ_PROMOTE_ON_WAKE will call turnstile_update_inheritor + * if it finds a thread + */ + wokeup = waitq_wakeup64_identify(&ts->ts_waitq, CAST_EVENT64_T(event), result, priority); + if (wokeup != NULL) { + if (thread_wokenup != NULL) { + *thread_wokenup = wokeup; + } else { + thread_deallocate_safe(wokeup); + } + ret = KERN_SUCCESS; + if (action == LCK_WAKE_DO_NOT_TRANSFER_PUSH) { + goto complete; + } + } else { + if (thread_wokenup != NULL) { + *thread_wokenup = NULL; + } + turnstile_update_inheritor(ts, TURNSTILE_INHERITOR_NULL, TURNSTILE_IMMEDIATE_UPDATE); + ret = KERN_NOT_WAITING; + } + } else { + ret = waitq_wakeup64_all(&ts->ts_waitq, CAST_EVENT64_T(event), result, WAITQ_ALL_PRIORITIES); + turnstile_update_inheritor(ts, TURNSTILE_INHERITOR_NULL, TURNSTILE_IMMEDIATE_UPDATE); + } + + /* + * turnstile_update_inheritor_complete could be called while holding the interlock. + * In this case the new inheritor or is null, or is a thread that is just been woken up + * and have not blocked because it is racing with the same interlock used here + * after the wait. + * So there is no chain to update for the new inheritor. + * + * However unless the current thread is the old inheritor, + * old inheritor can be blocked and requires a chain update. + * + * The chain should be short because kernel turnstiles cannot have user turnstiles + * chained after them. + * + * We can anyway optimize this by asking turnstile to tell us + * if old inheritor needs an update and drop the lock + * just in that case. + */ + turnstile_hash_bucket_unlock((uintptr_t)NULL, &index, type, 0); + + turnstile_update_inheritor_complete(ts, TURNSTILE_INTERLOCK_NOT_HELD); + + turnstile_hash_bucket_lock((uintptr_t)NULL, &index, type); + +complete: + turnstile_complete((uintptr_t)event, NULL, NULL, type); + + turnstile_hash_bucket_unlock((uintptr_t)NULL, &index, type, 0); + + turnstile_cleanup(); + + return ret; +} + +static wait_result_t +sleep_with_inheritor_and_turnstile_type(event_t event, + thread_t inheritor, + wait_interrupt_t interruptible, + uint64_t deadline, + turnstile_type_t type, + void (^primitive_lock)(void), + void (^primitive_unlock)(void)) +{ + wait_result_t ret; + uint32_t index; + struct turnstile *ts = NULL; + + /* + * the hash bucket spinlock is used as turnstile interlock, + * lock it before releasing the primitive lock + */ + turnstile_hash_bucket_lock((uintptr_t)event, &index, type); + + primitive_unlock(); + + ts = turnstile_prepare((uintptr_t)event, NULL, TURNSTILE_NULL, type); + + thread_set_pending_block_hint(current_thread(), kThreadWaitSleepWithInheritor); + /* + * We need TURNSTILE_DELAYED_UPDATE because we will call + * waitq_assert_wait64 after. + */ + turnstile_update_inheritor(ts, inheritor, (TURNSTILE_DELAYED_UPDATE | TURNSTILE_INHERITOR_THREAD)); + + ret = waitq_assert_wait64(&ts->ts_waitq, CAST_EVENT64_T(event), interruptible, deadline); + + turnstile_hash_bucket_unlock((uintptr_t)NULL, &index, type, 0); + + /* + * Update new and old inheritor chains outside the interlock; + */ + turnstile_update_inheritor_complete(ts, TURNSTILE_INTERLOCK_NOT_HELD); + + if (ret == THREAD_WAITING) { + ret = thread_block(THREAD_CONTINUE_NULL); + } + + turnstile_hash_bucket_lock((uintptr_t)NULL, &index, type); + + turnstile_complete((uintptr_t)event, NULL, NULL, type); + + turnstile_hash_bucket_unlock((uintptr_t)NULL, &index, type, 0); + + turnstile_cleanup(); + + primitive_lock(); + + return ret; +} + +kern_return_t +change_sleep_inheritor_and_turnstile_type(event_t event, + thread_t inheritor, + turnstile_type_t type) +{ + uint32_t index; + struct turnstile *ts = NULL; + kern_return_t ret = KERN_SUCCESS; + /* + * the hash bucket spinlock is used as turnstile interlock + */ + turnstile_hash_bucket_lock((uintptr_t)event, &index, type); + + ts = turnstile_prepare((uintptr_t)event, NULL, TURNSTILE_NULL, type); + + if (!turnstile_has_waiters(ts)) { + ret = KERN_NOT_WAITING; + } + + /* + * We will not call an assert_wait later so use TURNSTILE_IMMEDIATE_UPDATE + */ + turnstile_update_inheritor(ts, inheritor, (TURNSTILE_IMMEDIATE_UPDATE | TURNSTILE_INHERITOR_THREAD)); + + turnstile_hash_bucket_unlock((uintptr_t)NULL, &index, type, 0); + + /* + * update the chains outside the interlock + */ + turnstile_update_inheritor_complete(ts, TURNSTILE_INTERLOCK_NOT_HELD); + + turnstile_hash_bucket_lock((uintptr_t)NULL, &index, type); + + turnstile_complete((uintptr_t)event, NULL, NULL, type); + + turnstile_hash_bucket_unlock((uintptr_t)NULL, &index, type, 0); + + turnstile_cleanup(); + + return ret; +} + +typedef void (^void_block_void)(void); + +/* + * sleep_with_inheritor functions with lck_mtx_t as locking primitive. + */ + +wait_result_t +lck_mtx_sleep_with_inheritor_and_turnstile_type(lck_mtx_t *lock, lck_sleep_action_t lck_sleep_action, event_t event, thread_t inheritor, wait_interrupt_t interruptible, uint64_t deadline, turnstile_type_t type) +{ + LCK_MTX_ASSERT(lock, LCK_MTX_ASSERT_OWNED); + + if (lck_sleep_action & LCK_SLEEP_UNLOCK) { + return sleep_with_inheritor_and_turnstile_type(event, + inheritor, + interruptible, + deadline, + type, + ^{;}, + ^{lck_mtx_unlock(lock);}); + } else if (lck_sleep_action & LCK_SLEEP_SPIN) { + return sleep_with_inheritor_and_turnstile_type(event, + inheritor, + interruptible, + deadline, + type, + ^{lck_mtx_lock_spin(lock);}, + ^{lck_mtx_unlock(lock);}); + } else if (lck_sleep_action & LCK_SLEEP_SPIN_ALWAYS) { + return sleep_with_inheritor_and_turnstile_type(event, + inheritor, + interruptible, + deadline, + type, + ^{lck_mtx_lock_spin_always(lock);}, + ^{lck_mtx_unlock(lock);}); + } else { + return sleep_with_inheritor_and_turnstile_type(event, + inheritor, + interruptible, + deadline, + type, + ^{lck_mtx_lock(lock);}, + ^{lck_mtx_unlock(lock);}); + } +} + +/* + * Name: lck_spin_sleep_with_inheritor + * + * Description: deschedule the current thread and wait on the waitq associated with event to be woken up. + * While waiting, the sched priority of the waiting thread will contribute to the push of the event that will + * be directed to the inheritor specified. + * An interruptible mode and deadline can be specified to return earlier from the wait. + * + * Args: + * Arg1: lck_spin_t lock used to protect the sleep. The lock will be dropped while sleeping and reaquired before returning according to the sleep action specified. + * Arg2: sleep action. LCK_SLEEP_DEFAULT, LCK_SLEEP_UNLOCK. + * Arg3: event to wait on. + * Arg4: thread to propagate the event push to. + * Arg5: interruptible flag for wait. + * Arg6: deadline for wait. + * + * Conditions: Lock must be held. Returns with the lock held according to the sleep action specified. + * Lock will be dropped while waiting. + * The inheritor specified cannot run in user space until another inheritor is specified for the event or a + * wakeup for the event is called. + * + * Returns: result of the wait. + */ +wait_result_t +lck_spin_sleep_with_inheritor( + lck_spin_t *lock, + lck_sleep_action_t lck_sleep_action, + event_t event, + thread_t inheritor, + wait_interrupt_t interruptible, + uint64_t deadline) +{ + if (lck_sleep_action & LCK_SLEEP_UNLOCK) { + return sleep_with_inheritor_and_turnstile_type(event, inheritor, + interruptible, deadline, TURNSTILE_SLEEP_INHERITOR, + ^{}, ^{ lck_spin_unlock(lock); }); + } else { + return sleep_with_inheritor_and_turnstile_type(event, inheritor, + interruptible, deadline, TURNSTILE_SLEEP_INHERITOR, + ^{ lck_spin_lock(lock); }, ^{ lck_spin_unlock(lock); }); + } +} + +/* + * Name: lck_mtx_sleep_with_inheritor + * + * Description: deschedule the current thread and wait on the waitq associated with event to be woken up. + * While waiting, the sched priority of the waiting thread will contribute to the push of the event that will + * be directed to the inheritor specified. + * An interruptible mode and deadline can be specified to return earlier from the wait. + * + * Args: + * Arg1: lck_mtx_t lock used to protect the sleep. The lock will be dropped while sleeping and reaquired before returning according to the sleep action specified. + * Arg2: sleep action. LCK_SLEEP_DEFAULT, LCK_SLEEP_UNLOCK, LCK_SLEEP_SPIN, LCK_SLEEP_SPIN_ALWAYS. + * Arg3: event to wait on. + * Arg4: thread to propagate the event push to. + * Arg5: interruptible flag for wait. + * Arg6: deadline for wait. + * + * Conditions: Lock must be held. Returns with the lock held according to the sleep action specified. + * Lock will be dropped while waiting. + * The inheritor specified cannot run in user space until another inheritor is specified for the event or a + * wakeup for the event is called. + * + * Returns: result of the wait. + */ +wait_result_t +lck_mtx_sleep_with_inheritor(lck_mtx_t *lock, lck_sleep_action_t lck_sleep_action, event_t event, thread_t inheritor, wait_interrupt_t interruptible, uint64_t deadline) +{ + return lck_mtx_sleep_with_inheritor_and_turnstile_type(lock, lck_sleep_action, event, inheritor, interruptible, deadline, TURNSTILE_SLEEP_INHERITOR); +} + +/* + * sleep_with_inheritor functions with lck_rw_t as locking primitive. + */ + +wait_result_t +lck_rw_sleep_with_inheritor_and_turnstile_type(lck_rw_t *lock, lck_sleep_action_t lck_sleep_action, event_t event, thread_t inheritor, wait_interrupt_t interruptible, uint64_t deadline, turnstile_type_t type) +{ + __block lck_rw_type_t lck_rw_type = LCK_RW_TYPE_EXCLUSIVE; + + LCK_RW_ASSERT(lock, LCK_RW_ASSERT_HELD); + + if (lck_sleep_action & LCK_SLEEP_UNLOCK) { + return sleep_with_inheritor_and_turnstile_type(event, + inheritor, + interruptible, + deadline, + type, + ^{;}, + ^{lck_rw_type = lck_rw_done(lock);}); + } else if (!(lck_sleep_action & (LCK_SLEEP_SHARED | LCK_SLEEP_EXCLUSIVE))) { + return sleep_with_inheritor_and_turnstile_type(event, + inheritor, + interruptible, + deadline, + type, + ^{lck_rw_lock(lock, lck_rw_type);}, + ^{lck_rw_type = lck_rw_done(lock);}); + } else if (lck_sleep_action & LCK_SLEEP_EXCLUSIVE) { + return sleep_with_inheritor_and_turnstile_type(event, + inheritor, + interruptible, + deadline, + type, + ^{lck_rw_lock_exclusive(lock);}, + ^{lck_rw_type = lck_rw_done(lock);}); + } else { + return sleep_with_inheritor_and_turnstile_type(event, + inheritor, + interruptible, + deadline, + type, + ^{lck_rw_lock_shared(lock);}, + ^{lck_rw_type = lck_rw_done(lock);}); + } +} + +/* + * Name: lck_rw_sleep_with_inheritor + * + * Description: deschedule the current thread and wait on the waitq associated with event to be woken up. + * While waiting, the sched priority of the waiting thread will contribute to the push of the event that will + * be directed to the inheritor specified. + * An interruptible mode and deadline can be specified to return earlier from the wait. + * + * Args: + * Arg1: lck_rw_t lock used to protect the sleep. The lock will be dropped while sleeping and reaquired before returning according to the sleep action specified. + * Arg2: sleep action. LCK_SLEEP_DEFAULT, LCK_SLEEP_SHARED, LCK_SLEEP_EXCLUSIVE. + * Arg3: event to wait on. + * Arg4: thread to propagate the event push to. + * Arg5: interruptible flag for wait. + * Arg6: deadline for wait. + * + * Conditions: Lock must be held. Returns with the lock held according to the sleep action specified. + * Lock will be dropped while waiting. + * The inheritor specified cannot run in user space until another inheritor is specified for the event or a + * wakeup for the event is called. + * + * Returns: result of the wait. + */ +wait_result_t +lck_rw_sleep_with_inheritor(lck_rw_t *lock, lck_sleep_action_t lck_sleep_action, event_t event, thread_t inheritor, wait_interrupt_t interruptible, uint64_t deadline) +{ + return lck_rw_sleep_with_inheritor_and_turnstile_type(lock, lck_sleep_action, event, inheritor, interruptible, deadline, TURNSTILE_SLEEP_INHERITOR); +} + +/* + * wakeup_with_inheritor functions are independent from the locking primitive. + */ + +/* + * Name: wakeup_one_with_inheritor + * + * Description: wake up one waiter for event if any. The thread woken up will be the one with the higher sched priority waiting on event. + * The push for the event will be transferred from the last inheritor to the woken up thread if LCK_WAKE_DEFAULT is specified. + * If LCK_WAKE_DO_NOT_TRANSFER_PUSH is specified the push will not be transferred. + * + * Args: + * Arg1: event to wake from. + * Arg2: wait result to pass to the woken up thread. + * Arg3: wake flag. LCK_WAKE_DEFAULT or LCK_WAKE_DO_NOT_TRANSFER_PUSH. + * Arg4: pointer for storing the thread wokenup. + * + * Returns: KERN_NOT_WAITING if no threads were waiting, KERN_SUCCESS otherwise. + * + * Conditions: The new inheritor wokenup cannot run in user space until another inheritor is specified for the event or a + * wakeup for the event is called. + * A reference for the wokenup thread is acquired. + * NOTE: this cannot be called from interrupt context. + */ +kern_return_t +wakeup_one_with_inheritor(event_t event, wait_result_t result, lck_wake_action_t action, thread_t *thread_wokenup) +{ + return wakeup_with_inheritor_and_turnstile_type(event, + TURNSTILE_SLEEP_INHERITOR, + result, + TRUE, + action, + thread_wokenup); +} + +/* + * Name: wakeup_all_with_inheritor + * + * Description: wake up all waiters waiting for event. The old inheritor will lose the push. + * + * Args: + * Arg1: event to wake from. + * Arg2: wait result to pass to the woken up threads. + * + * Returns: KERN_NOT_WAITING if no threads were waiting, KERN_SUCCESS otherwise. + * + * Conditions: NOTE: this cannot be called from interrupt context. + */ +kern_return_t +wakeup_all_with_inheritor(event_t event, wait_result_t result) +{ + return wakeup_with_inheritor_and_turnstile_type(event, + TURNSTILE_SLEEP_INHERITOR, + result, + FALSE, + 0, + NULL); +} + +/* + * change_sleep_inheritor is independent from the locking primitive. + */ + +/* + * Name: change_sleep_inheritor + * + * Description: Redirect the push of the waiting threads of event to the new inheritor specified. + * + * Args: + * Arg1: event to redirect the push. + * Arg2: new inheritor for event. + * + * Returns: KERN_NOT_WAITING if no threads were waiting, KERN_SUCCESS otherwise. + * + * Conditions: In case of success, the new inheritor cannot run in user space until another inheritor is specified for the event or a + * wakeup for the event is called. + * NOTE: this cannot be called from interrupt context. + */ +kern_return_t +change_sleep_inheritor(event_t event, thread_t inheritor) +{ + return change_sleep_inheritor_and_turnstile_type(event, + inheritor, + TURNSTILE_SLEEP_INHERITOR); +} + +void +kdp_sleep_with_inheritor_find_owner(struct waitq * waitq, __unused event64_t event, thread_waitinfo_t * waitinfo) +{ + assert(waitinfo->wait_type == kThreadWaitSleepWithInheritor); + assert(waitq_is_turnstile_queue(waitq)); + waitinfo->owner = 0; + waitinfo->context = 0; + + if (waitq_held(waitq)) { + return; + } + + struct turnstile *turnstile = waitq_to_turnstile(waitq); + assert(turnstile->ts_inheritor_flags & TURNSTILE_INHERITOR_THREAD); + waitinfo->owner = thread_tid(turnstile->ts_inheritor); +} + +typedef void (*void_func_void)(void); + +static kern_return_t +gate_try_close(gate_t *gate) +{ + uintptr_t state; + thread_t holder; + kern_return_t ret; + __assert_only bool waiters; + thread_t thread = current_thread(); + + if (os_atomic_cmpxchg(&gate->gate_data, 0, GATE_THREAD_TO_STATE(thread), acquire)) { + return KERN_SUCCESS; + } + + gate_ilock(gate); + state = ordered_load_gate(gate); + holder = GATE_STATE_TO_THREAD(state); + + if (holder == NULL) { + waiters = gate_has_waiters(state); + assert(waiters == FALSE); + + state = GATE_THREAD_TO_STATE(current_thread()); + state |= GATE_ILOCK; + ordered_store_gate(gate, state); + ret = KERN_SUCCESS; + } else { + if (holder == current_thread()) { + panic("Trying to close a gate already owned by current thread %p", current_thread()); + } + ret = KERN_FAILURE; + } + + gate_iunlock(gate); + return ret; +} + +static void +gate_close(gate_t* gate) +{ + uintptr_t state; + thread_t holder; + __assert_only bool waiters; + thread_t thread = current_thread(); + + if (os_atomic_cmpxchg(&gate->gate_data, 0, GATE_THREAD_TO_STATE(thread), acquire)) { + return; + } + + gate_ilock(gate); + state = ordered_load_gate(gate); + holder = GATE_STATE_TO_THREAD(state); + + if (holder != NULL) { + panic("Closing a gate already owned by %p from current thread %p", holder, current_thread()); + } + + waiters = gate_has_waiters(state); + assert(waiters == FALSE); + + state = GATE_THREAD_TO_STATE(thread); + state |= GATE_ILOCK; + ordered_store_gate(gate, state); + + gate_iunlock(gate); +} + +static void +gate_open_turnstile(gate_t *gate) +{ + struct turnstile *ts = NULL; + + ts = turnstile_prepare((uintptr_t)gate, &gate->turnstile, TURNSTILE_NULL, TURNSTILE_KERNEL_MUTEX); + waitq_wakeup64_all(&ts->ts_waitq, CAST_EVENT64_T(GATE_EVENT(gate)), THREAD_AWAKENED, WAITQ_ALL_PRIORITIES); + turnstile_update_inheritor(ts, TURNSTILE_INHERITOR_NULL, TURNSTILE_IMMEDIATE_UPDATE); + turnstile_update_inheritor_complete(ts, TURNSTILE_INTERLOCK_HELD); + turnstile_complete((uintptr_t)gate, &gate->turnstile, NULL, TURNSTILE_KERNEL_MUTEX); + /* + * We can do the cleanup while holding the interlock. + * It is ok because: + * 1. current_thread is the previous inheritor and it is running + * 2. new inheritor is NULL. + * => No chain of turnstiles needs to be updated. + */ + turnstile_cleanup(); +} + +static void +gate_open(gate_t *gate) +{ + uintptr_t state; + thread_t holder; + bool waiters; + thread_t thread = current_thread(); + + if (os_atomic_cmpxchg(&gate->gate_data, GATE_THREAD_TO_STATE(thread), 0, release)) { + return; + } + + gate_ilock(gate); + state = ordered_load_gate(gate); + holder = GATE_STATE_TO_THREAD(state); + waiters = gate_has_waiters(state); + + if (holder != thread) { + panic("Opening gate owned by %p from current thread %p", holder, thread); + } + + if (waiters) { + gate_open_turnstile(gate); + } + + state = GATE_ILOCK; + ordered_store_gate(gate, state); + + gate_iunlock(gate); +} + +static kern_return_t +gate_handoff_turnstile(gate_t *gate, + int flags, + thread_t *thread_woken_up, + bool *waiters) +{ + struct turnstile *ts = NULL; + kern_return_t ret = KERN_FAILURE; + thread_t hp_thread; + + ts = turnstile_prepare((uintptr_t)gate, &gate->turnstile, TURNSTILE_NULL, TURNSTILE_KERNEL_MUTEX); + /* + * Wake up the higest priority thread waiting on the gate + */ + hp_thread = waitq_wakeup64_identify(&ts->ts_waitq, CAST_EVENT64_T(GATE_EVENT(gate)), THREAD_AWAKENED, WAITQ_PROMOTE_ON_WAKE); + + if (hp_thread != NULL) { + /* + * In this case waitq_wakeup64_identify has called turnstile_update_inheritor for us + */ + turnstile_update_inheritor_complete(ts, TURNSTILE_INTERLOCK_HELD); + *thread_woken_up = hp_thread; + *waiters = turnstile_has_waiters(ts); + /* + * Note: hp_thread is the new holder and the new inheritor. + * In case there are no more waiters, it doesn't need to be the inheritor + * and it shouldn't be it by the time it finishes the wait, so that its next open or + * handoff can go through the fast path. + * We could set the inheritor to NULL here, or the new holder itself can set it + * on its way back from the sleep. In the latter case there are more chanses that + * new waiters will come by, avoiding to do the opearation at all. + */ + ret = KERN_SUCCESS; + } else { + /* + * waiters can have been woken up by an interrupt and still not + * have updated gate->waiters, so we couldn't find them on the waitq. + * Update the inheritor to NULL here, so that the current thread can return to userspace + * indipendently from when the interrupted waiters will finish the wait. + */ + if (flags == GATE_HANDOFF_OPEN_IF_NO_WAITERS) { + turnstile_update_inheritor(ts, TURNSTILE_INHERITOR_NULL, TURNSTILE_IMMEDIATE_UPDATE); + turnstile_update_inheritor_complete(ts, TURNSTILE_INTERLOCK_HELD); + } + // there are no waiters. + ret = KERN_NOT_WAITING; + } + + turnstile_complete((uintptr_t)gate, &gate->turnstile, NULL, TURNSTILE_KERNEL_MUTEX); + + /* + * We can do the cleanup while holding the interlock. + * It is ok because: + * 1. current_thread is the previous inheritor and it is running + * 2. new inheritor is NULL or it is a just wokenup thread that will race acquiring the lock + * of the gate before trying to sleep. + * => No chain of turnstiles needs to be updated. + */ + turnstile_cleanup(); + + return ret; +} + +static kern_return_t +gate_handoff(gate_t *gate, + int flags) +{ + kern_return_t ret; + thread_t new_holder = NULL; + uintptr_t state; + thread_t holder; + bool waiters; + thread_t thread = current_thread(); + + assert(flags == GATE_HANDOFF_OPEN_IF_NO_WAITERS || flags == GATE_HANDOFF_DEFAULT); + + if (flags == GATE_HANDOFF_OPEN_IF_NO_WAITERS) { + if (os_atomic_cmpxchg(&gate->gate_data, GATE_THREAD_TO_STATE(thread), 0, release)) { + //gate opened but there were no waiters, so return KERN_NOT_WAITING. + return KERN_NOT_WAITING; + } + } + + gate_ilock(gate); + state = ordered_load_gate(gate); + holder = GATE_STATE_TO_THREAD(state); + waiters = gate_has_waiters(state); + + if (holder != current_thread()) { + panic("Handing off gate owned by %p from current thread %p", holder, current_thread()); + } + + if (waiters) { + ret = gate_handoff_turnstile(gate, flags, &new_holder, &waiters); + if (ret == KERN_SUCCESS) { + state = GATE_THREAD_TO_STATE(new_holder); + if (waiters) { + state |= GATE_WAITERS; + } + } else { + if (flags == GATE_HANDOFF_OPEN_IF_NO_WAITERS) { + state = 0; + } + } + } else { + if (flags == GATE_HANDOFF_OPEN_IF_NO_WAITERS) { + state = 0; + } + ret = KERN_NOT_WAITING; + } + state |= GATE_ILOCK; + ordered_store_gate(gate, state); + + gate_iunlock(gate); + + if (new_holder) { + thread_deallocate(new_holder); + } + return ret; +} + +static void_func_void +gate_steal_turnstile(gate_t *gate, + thread_t new_inheritor) +{ + struct turnstile *ts = NULL; + + ts = turnstile_prepare((uintptr_t)gate, &gate->turnstile, TURNSTILE_NULL, TURNSTILE_KERNEL_MUTEX); + + turnstile_update_inheritor(ts, new_inheritor, (TURNSTILE_IMMEDIATE_UPDATE | TURNSTILE_INHERITOR_THREAD)); + turnstile_update_inheritor_complete(ts, TURNSTILE_INTERLOCK_HELD); + turnstile_complete((uintptr_t)gate, &gate->turnstile, NULL, TURNSTILE_KERNEL_MUTEX); + + /* + * turnstile_cleanup might need to update the chain of the old holder. + * This operation should happen without the turnstile interlock held. + */ + return turnstile_cleanup; +} + +static void +gate_steal(gate_t *gate) +{ + uintptr_t state; + thread_t holder; + thread_t thread = current_thread(); + bool waiters; + + void_func_void func_after_interlock_unlock; + + gate_ilock(gate); + state = ordered_load_gate(gate); + holder = GATE_STATE_TO_THREAD(state); + waiters = gate_has_waiters(state); + + assert(holder != NULL); + state = GATE_THREAD_TO_STATE(thread) | GATE_ILOCK; + if (waiters) { + state |= GATE_WAITERS; + ordered_store_gate(gate, state); + func_after_interlock_unlock = gate_steal_turnstile(gate, thread); + gate_iunlock(gate); + + func_after_interlock_unlock(); + } else { + ordered_store_gate(gate, state); + gate_iunlock(gate); + } +} + +static void_func_void +gate_wait_turnstile(gate_t *gate, + wait_interrupt_t interruptible, + uint64_t deadline, + thread_t holder, + wait_result_t* wait, + bool* waiters) +{ + struct turnstile *ts; + uintptr_t state; + + ts = turnstile_prepare((uintptr_t)gate, &gate->turnstile, TURNSTILE_NULL, TURNSTILE_KERNEL_MUTEX); + + turnstile_update_inheritor(ts, holder, (TURNSTILE_DELAYED_UPDATE | TURNSTILE_INHERITOR_THREAD)); + waitq_assert_wait64(&ts->ts_waitq, CAST_EVENT64_T(GATE_EVENT(gate)), interruptible, deadline); + + gate_iunlock(gate); + + turnstile_update_inheritor_complete(ts, TURNSTILE_INTERLOCK_NOT_HELD); + + *wait = thread_block(THREAD_CONTINUE_NULL); + + gate_ilock(gate); + + *waiters = turnstile_has_waiters(ts); + + if (!*waiters) { + /* + * We want to enable the fast path as soon as we see that there are no more waiters. + * On the fast path the holder will not do any turnstile operations. + * Set the inheritor as NULL here. + * + * NOTE: if it was an open operation that woke this thread up, the inheritor has + * already been set to NULL. + */ + state = ordered_load_gate(gate); + holder = GATE_STATE_TO_THREAD(state); + if (holder && + ((*wait != THREAD_AWAKENED) || // thread interrupted or timedout + holder == current_thread())) { // thread was woken up and it is the new holder + turnstile_update_inheritor(ts, TURNSTILE_INHERITOR_NULL, TURNSTILE_IMMEDIATE_UPDATE); + turnstile_update_inheritor_complete(ts, TURNSTILE_INTERLOCK_NOT_HELD); + } + } + + turnstile_complete((uintptr_t)gate, &gate->turnstile, NULL, TURNSTILE_KERNEL_MUTEX); + + /* + * turnstile_cleanup might need to update the chain of the old holder. + * This operation should happen without the turnstile primitive interlock held. + */ + return turnstile_cleanup; +} + +static gate_wait_result_t +gate_wait(gate_t* gate, + wait_interrupt_t interruptible, + uint64_t deadline, + void (^primitive_unlock)(void), + void (^primitive_lock)(void)) +{ + gate_wait_result_t ret; + void_func_void func_after_interlock_unlock; + wait_result_t wait_result; + uintptr_t state; + thread_t holder; + bool waiters; + + + gate_ilock(gate); + state = ordered_load_gate(gate); + holder = GATE_STATE_TO_THREAD(state); + + if (holder == NULL) { + panic("Trying to wait on open gate thread %p gate %p", current_thread(), gate); + } + + state |= GATE_WAITERS; + ordered_store_gate(gate, state); + + /* + * Release the primitive lock before any + * turnstile operation. Turnstile + * does not support a blocking primitive as + * interlock. + * + * In this way, concurrent threads will be + * able to acquire the primitive lock + * but still will wait for me through the + * gate interlock. + */ + primitive_unlock(); + + func_after_interlock_unlock = gate_wait_turnstile( gate, + interruptible, + deadline, + holder, + &wait_result, + &waiters); + + state = ordered_load_gate(gate); + holder = GATE_STATE_TO_THREAD(state); + + switch (wait_result) { + case THREAD_INTERRUPTED: + case THREAD_TIMED_OUT: + assert(holder != current_thread()); + + if (waiters) { + state |= GATE_WAITERS; + } else { + state &= ~GATE_WAITERS; + } + ordered_store_gate(gate, state); + + if (wait_result == THREAD_INTERRUPTED) { + ret = GATE_INTERRUPTED; + } else { + ret = GATE_TIMED_OUT; + } + break; + default: + /* + * Note it is possible that even if the gate was handed off to + * me, someone called gate_steal() before I woke up. + * + * As well as it is possible that the gate was opened, but someone + * closed it while I was waking up. + * + * In both cases we return GATE_OPENED, as the gate was opened to me + * at one point, it is the caller responsibility to check again if + * the gate is open. + */ + if (holder == current_thread()) { + ret = GATE_HANDOFF; + } else { + ret = GATE_OPENED; + } + break; + } + + gate_iunlock(gate); + + /* + * turnstile func that needs to be executed without + * holding the primitive interlock + */ + func_after_interlock_unlock(); + + primitive_lock(); + + return ret; +} +static void +gate_assert(gate_t *gate, int flags) +{ + uintptr_t state; + thread_t holder; + + gate_ilock(gate); + state = ordered_load_gate(gate); + holder = GATE_STATE_TO_THREAD(state); + + switch (flags) { + case GATE_ASSERT_CLOSED: + assert(holder != NULL); + break; + case GATE_ASSERT_OPEN: + assert(holder == NULL); + break; + case GATE_ASSERT_HELD: + assert(holder == current_thread()); + break; + default: + panic("invalid %s flag %d", __func__, flags); + } + + gate_iunlock(gate); +} + +static void +gate_init(gate_t *gate) +{ + gate->gate_data = 0; + gate->turnstile = NULL; +} + +static void +gate_destroy(__assert_only gate_t *gate) +{ + assert(gate->gate_data == 0); + assert(gate->turnstile == NULL); +} + +/* + * Name: lck_rw_gate_init + * + * Description: initializes a variable declared with decl_lck_rw_gate_data. + * + * Args: + * Arg1: lck_rw_t lock used to protect the gate. + * Arg2: pointer to the gate data declared with decl_lck_rw_gate_data. + */ +void +lck_rw_gate_init(lck_rw_t *lock, gate_t *gate) +{ + (void) lock; + gate_init(gate); +} + +/* + * Name: lck_rw_gate_destroy + * + * Description: destroys a variable previously initialized. + * + * Args: + * Arg1: lck_rw_t lock used to protect the gate. + * Arg2: pointer to the gate data declared with decl_lck_rw_gate_data. + */ +void +lck_rw_gate_destroy(lck_rw_t *lock, gate_t *gate) +{ + (void) lock; + gate_destroy(gate); +} + +/* + * Name: lck_rw_gate_try_close + * + * Description: Tries to close the gate. + * In case of success the current thread will be set as + * the holder of the gate. + * + * Args: + * Arg1: lck_rw_t lock used to protect the gate. + * Arg2: pointer to the gate data declared with decl_lck_rw_gate_data. + * + * Conditions: Lock must be held. Returns with the lock held. + * + * Returns: + * KERN_SUCCESS in case the gate was successfully closed. The current thread is the new holder + * of the gate. + * A matching lck_rw_gate_open() or lck_rw_gate_handoff() needs to be called later on + * to wake up possible waiters on the gate before returning to userspace. + * If the intent is to conditionally probe the gate before waiting, the lock must not be dropped + * between the calls to lck_rw_gate_try_close() and lck_rw_gate_wait(). + * + * KERN_FAILURE in case the gate was already closed. Will panic if the current thread was already the holder of the gate. + * lck_rw_gate_wait() should be called instead if the intent is to unconditionally wait on this gate. + * The calls to lck_rw_gate_try_close() and lck_rw_gate_wait() should + * be done without dropping the lock that is protecting the gate in between. + */ +int +lck_rw_gate_try_close(__assert_only lck_rw_t *lock, gate_t *gate) +{ + LCK_RW_ASSERT(lock, LCK_RW_ASSERT_HELD); + + return gate_try_close(gate); +} + +/* + * Name: lck_rw_gate_close + * + * Description: Closes the gate. The current thread will be set as + * the holder of the gate. Will panic if the gate is already closed. + * A matching lck_rw_gate_open() or lck_rw_gate_handoff() needs to be called later on + * to wake up possible waiters on the gate before returning to userspace. + * + * Args: + * Arg1: lck_rw_t lock used to protect the gate. + * Arg2: pointer to the gate data declared with decl_lck_rw_gate_data. + * + * Conditions: Lock must be held. Returns with the lock held. + * The gate must be open. + * + */ +void +lck_rw_gate_close(__assert_only lck_rw_t *lock, gate_t *gate) +{ + LCK_RW_ASSERT(lock, LCK_RW_ASSERT_HELD); + + return gate_close(gate); +} + +/* + * Name: lck_rw_gate_open + * + * Description: Opens the gate and wakes up possible waiters. + * + * Args: + * Arg1: lck_rw_t lock used to protect the gate. + * Arg2: pointer to the gate data declared with decl_lck_rw_gate_data. + * + * Conditions: Lock must be held. Returns with the lock held. + * The current thread must be the holder of the gate. + * + */ +void +lck_rw_gate_open(__assert_only lck_rw_t *lock, gate_t *gate) +{ + LCK_RW_ASSERT(lock, LCK_RW_ASSERT_HELD); + + gate_open(gate); +} + +/* + * Name: lck_rw_gate_handoff + * + * Description: Tries to transfer the ownership of the gate. The waiter with highest sched + * priority will be selected as the new holder of the gate, and woken up, + * with the gate remaining in the closed state throughout. + * If no waiters are present, the gate will be kept closed and KERN_NOT_WAITING + * will be returned. + * GATE_HANDOFF_OPEN_IF_NO_WAITERS flag can be used to specify if the gate should be opened in + * case no waiters were found. + * + * + * Args: + * Arg1: lck_rw_t lock used to protect the gate. + * Arg2: pointer to the gate data declared with decl_lck_rw_gate_data. + * Arg3: flags - GATE_HANDOFF_DEFAULT or GATE_HANDOFF_OPEN_IF_NO_WAITERS + * + * Conditions: Lock must be held. Returns with the lock held. + * The current thread must be the holder of the gate. + * + * Returns: + * KERN_SUCCESS in case one of the waiters became the new holder. + * KERN_NOT_WAITING in case there were no waiters. + * + */ +kern_return_t +lck_rw_gate_handoff(__assert_only lck_rw_t *lock, gate_t *gate, int flags) +{ + LCK_RW_ASSERT(lock, LCK_RW_ASSERT_HELD); + + return gate_handoff(gate, flags); +} + +/* + * Name: lck_rw_gate_steal + * + * Description: Set the current ownership of the gate. It sets the current thread as the + * new holder of the gate. + * A matching lck_rw_gate_open() or lck_rw_gate_handoff() needs to be called later on + * to wake up possible waiters on the gate before returning to userspace. + * NOTE: the previous holder should not call lck_rw_gate_open() or lck_rw_gate_handoff() + * anymore. + * + * + * Args: + * Arg1: lck_rw_t lock used to protect the gate. + * Arg2: pointer to the gate data declared with decl_lck_rw_gate_data. + * + * Conditions: Lock must be held. Returns with the lock held. + * The gate must be closed and the current thread must not already be the holder. + * + */ +void +lck_rw_gate_steal(__assert_only lck_rw_t *lock, gate_t *gate) +{ + LCK_RW_ASSERT(lock, LCK_RW_ASSERT_HELD); + + gate_steal(gate); +} + +/* + * Name: lck_rw_gate_wait + * + * Description: Waits for the current thread to become the holder of the gate or for the + * gate to become open. An interruptible mode and deadline can be specified + * to return earlier from the wait. + * + * Args: + * Arg1: lck_rw_t lock used to protect the gate. + * Arg2: pointer to the gate data declared with decl_lck_rw_gate_data. + * Arg3: sleep action. LCK_SLEEP_DEFAULT, LCK_SLEEP_SHARED, LCK_SLEEP_EXCLUSIVE. + * Arg3: interruptible flag for wait. + * Arg4: deadline + * + * Conditions: Lock must be held. Returns with the lock held according to the sleep action specified. + * Lock will be dropped while waiting. + * The gate must be closed. + * + * Returns: Reason why the thread was woken up. + * GATE_HANDOFF - the current thread was handed off the ownership of the gate. + * A matching lck_rw_gate_open() or lck_rw_gate_handoff() needs to be called later on + * to wake up possible waiters on the gate before returning to userspace. + * GATE_OPENED - the gate was opened by the holder. + * GATE_TIMED_OUT - the thread was woken up by a timeout. + * GATE_INTERRUPTED - the thread was interrupted while sleeping. + * + */ +gate_wait_result_t +lck_rw_gate_wait(lck_rw_t *lock, gate_t *gate, lck_sleep_action_t lck_sleep_action, wait_interrupt_t interruptible, uint64_t deadline) +{ + __block lck_rw_type_t lck_rw_type = LCK_RW_TYPE_EXCLUSIVE; + + LCK_RW_ASSERT(lock, LCK_RW_ASSERT_HELD); + + if (lck_sleep_action & LCK_SLEEP_UNLOCK) { + return gate_wait(gate, + interruptible, + deadline, + ^{lck_rw_type = lck_rw_done(lock);}, + ^{;}); + } else if (!(lck_sleep_action & (LCK_SLEEP_SHARED | LCK_SLEEP_EXCLUSIVE))) { + return gate_wait(gate, + interruptible, + deadline, + ^{lck_rw_type = lck_rw_done(lock);}, + ^{lck_rw_lock(lock, lck_rw_type);}); + } else if (lck_sleep_action & LCK_SLEEP_EXCLUSIVE) { + return gate_wait(gate, + interruptible, + deadline, + ^{lck_rw_type = lck_rw_done(lock);}, + ^{lck_rw_lock_exclusive(lock);}); + } else { + return gate_wait(gate, + interruptible, + deadline, + ^{lck_rw_type = lck_rw_done(lock);}, + ^{lck_rw_lock_shared(lock);}); + } +} + +/* + * Name: lck_rw_gate_assert + * + * Description: asserts that the gate is in the specified state. + * + * Args: + * Arg1: lck_rw_t lock used to protect the gate. + * Arg2: pointer to the gate data declared with decl_lck_rw_gate_data. + * Arg3: flags to specified assert type. + * GATE_ASSERT_CLOSED - the gate is currently closed + * GATE_ASSERT_OPEN - the gate is currently opened + * GATE_ASSERT_HELD - the gate is currently closed and the current thread is the holder + */ +void +lck_rw_gate_assert(__assert_only lck_rw_t *lock, gate_t *gate, int flags) +{ + LCK_RW_ASSERT(lock, LCK_RW_ASSERT_HELD); + + gate_assert(gate, flags); + return; +} + +/* + * Name: lck_mtx_gate_init + * + * Description: initializes a variable declared with decl_lck_mtx_gate_data. + * + * Args: + * Arg1: lck_mtx_t lock used to protect the gate. + * Arg2: pointer to the gate data declared with decl_lck_mtx_gate_data. + */ +void +lck_mtx_gate_init(lck_mtx_t *lock, gate_t *gate) +{ + (void) lock; + gate_init(gate); +} + +/* + * Name: lck_mtx_gate_destroy + * + * Description: destroys a variable previously initialized + * + * Args: + * Arg1: lck_mtx_t lock used to protect the gate. + * Arg2: pointer to the gate data declared with decl_lck_mtx_gate_data. + */ +void +lck_mtx_gate_destroy(lck_mtx_t *lock, gate_t *gate) +{ + (void) lock; + gate_destroy(gate); +} + +/* + * Name: lck_mtx_gate_try_close + * + * Description: Tries to close the gate. + * In case of success the current thread will be set as + * the holder of the gate. + * + * Args: + * Arg1: lck_mtx_t lock used to protect the gate. + * Arg2: pointer to the gate data declared with decl_lck_mtx_gate_data. + * + * Conditions: Lock must be held. Returns with the lock held. + * + * Returns: + * KERN_SUCCESS in case the gate was successfully closed. The current thread is the new holder + * of the gate. + * A matching lck_mtx_gate_open() or lck_mtx_gate_handoff() needs to be called later on + * to wake up possible waiters on the gate before returning to userspace. + * If the intent is to conditionally probe the gate before waiting, the lock must not be dropped + * between the calls to lck_mtx_gate_try_close() and lck_mtx_gate_wait(). + * + * KERN_FAILURE in case the gate was already closed. Will panic if the current thread was already the holder of the gate. + * lck_mtx_gate_wait() should be called instead if the intent is to unconditionally wait on this gate. + * The calls to lck_mtx_gate_try_close() and lck_mtx_gate_wait() should + * be done without dropping the lock that is protecting the gate in between. + */ +int +lck_mtx_gate_try_close(__assert_only lck_mtx_t *lock, gate_t *gate) +{ + LCK_MTX_ASSERT(lock, LCK_MTX_ASSERT_OWNED); + + return gate_try_close(gate); +} + +/* + * Name: lck_mtx_gate_close + * + * Description: Closes the gate. The current thread will be set as + * the holder of the gate. Will panic if the gate is already closed. + * A matching lck_mtx_gate_open() or lck_mtx_gate_handoff() needs to be called later on + * to wake up possible waiters on the gate before returning to userspace. + * + * Args: + * Arg1: lck_mtx_t lock used to protect the gate. + * Arg2: pointer to the gate data declared with decl_lck_mtx_gate_data. + * + * Conditions: Lock must be held. Returns with the lock held. + * The gate must be open. + * + */ void -hw_atomic_or_noret(volatile uint32_t *dest, uint32_t mask) +lck_mtx_gate_close(__assert_only lck_mtx_t *lock, gate_t *gate) { - ALIGN_TEST(dest, uint32_t); - __c11_atomic_fetch_or(ATOMIC_CAST(uint32_t, dest), mask, memory_order_relaxed); + LCK_MTX_ASSERT(lock, LCK_MTX_ASSERT_OWNED); + + return gate_close(gate); +} + +/* + * Name: lck_mtx_gate_open + * + * Description: Opens of the gate and wakes up possible waiters. + * + * Args: + * Arg1: lck_mtx_t lock used to protect the gate. + * Arg2: pointer to the gate data declared with decl_lck_mtx_gate_data. + * + * Conditions: Lock must be held. Returns with the lock held. + * The current thread must be the holder of the gate. + * + */ +void +lck_mtx_gate_open(__assert_only lck_mtx_t *lock, gate_t *gate) +{ + LCK_MTX_ASSERT(lock, LCK_MTX_ASSERT_OWNED); + + gate_open(gate); } -uint32_t -hw_atomic_and(volatile uint32_t *dest, uint32_t mask) +/* + * Name: lck_mtx_gate_handoff + * + * Description: Set the current ownership of the gate. The waiter with highest sched + * priority will be selected as the new holder of the gate, and woken up, + * with the gate remaining in the closed state throughout. + * If no waiters are present, the gate will be kept closed and KERN_NOT_WAITING + * will be returned. + * OPEN_ON_FAILURE flag can be used to specify if the gate should be opened in + * case no waiters were found. + * + * + * Args: + * Arg1: lck_mtx_t lock used to protect the gate. + * Arg2: pointer to the gate data declared with decl_lck_mtx_gate_data. + * Arg3: flags - GATE_NO_FALGS or OPEN_ON_FAILURE + * + * Conditions: Lock must be held. Returns with the lock held. + * The current thread must be the holder of the gate. + * + * Returns: + * KERN_SUCCESS in case one of the waiters became the new holder. + * KERN_NOT_WAITING in case there were no waiters. + * + */ +kern_return_t +lck_mtx_gate_handoff(__assert_only lck_mtx_t *lock, gate_t *gate, int flags) { - ALIGN_TEST(dest, uint32_t); - return __c11_atomic_fetch_and(ATOMIC_CAST(uint32_t, dest), mask, memory_order_relaxed) & mask; + LCK_MTX_ASSERT(lock, LCK_MTX_ASSERT_OWNED); + + return gate_handoff(gate, flags); } +/* + * Name: lck_mtx_gate_steal + * + * Description: Steals the ownership of the gate. It sets the current thread as the + * new holder of the gate. + * A matching lck_mtx_gate_open() or lck_mtx_gate_handoff() needs to be called later on + * to wake up possible waiters on the gate before returning to userspace. + * NOTE: the previous holder should not call lck_mtx_gate_open() or lck_mtx_gate_handoff() + * anymore. + * + * + * Args: + * Arg1: lck_mtx_t lock used to protect the gate. + * Arg2: pointer to the gate data declared with decl_lck_mtx_gate_data. + * + * Conditions: Lock must be held. Returns with the lock held. + * The gate must be closed and the current thread must not already be the holder. + * + */ void -hw_atomic_and_noret(volatile uint32_t *dest, uint32_t mask) +lck_mtx_gate_steal(__assert_only lck_mtx_t *lock, gate_t *gate) +{ + LCK_MTX_ASSERT(lock, LCK_MTX_ASSERT_OWNED); + + gate_steal(gate); +} + +/* + * Name: lck_mtx_gate_wait + * + * Description: Waits for the current thread to become the holder of the gate or for the + * gate to become open. An interruptible mode and deadline can be specified + * to return earlier from the wait. + * + * Args: + * Arg1: lck_mtx_t lock used to protect the gate. + * Arg2: pointer to the gate data declared with decl_lck_mtx_gate_data. + * Arg3: sleep action. LCK_SLEEP_DEFAULT, LCK_SLEEP_UNLOCK, LCK_SLEEP_SPIN, LCK_SLEEP_SPIN_ALWAYS. + * Arg3: interruptible flag for wait. + * Arg4: deadline + * + * Conditions: Lock must be held. Returns with the lock held according to the sleep action specified. + * Lock will be dropped while waiting. + * The gate must be closed. + * + * Returns: Reason why the thread was woken up. + * GATE_HANDOFF - the current thread was handed off the ownership of the gate. + * A matching lck_mtx_gate_open() or lck_mtx_gate_handoff() needs to be called later on + * to wake up possible waiters on the gate before returning to userspace. + * GATE_OPENED - the gate was opened by the holder. + * GATE_TIMED_OUT - the thread was woken up by a timeout. + * GATE_INTERRUPTED - the thread was interrupted while sleeping. + * + */ +gate_wait_result_t +lck_mtx_gate_wait(lck_mtx_t *lock, gate_t *gate, lck_sleep_action_t lck_sleep_action, wait_interrupt_t interruptible, uint64_t deadline) { - ALIGN_TEST(dest, uint32_t); - __c11_atomic_fetch_and(ATOMIC_CAST(uint32_t, dest), mask, memory_order_relaxed); + LCK_MTX_ASSERT(lock, LCK_MTX_ASSERT_OWNED); + + if (lck_sleep_action & LCK_SLEEP_UNLOCK) { + return gate_wait(gate, + interruptible, + deadline, + ^{lck_mtx_unlock(lock);}, + ^{;}); + } else if (lck_sleep_action & LCK_SLEEP_SPIN) { + return gate_wait(gate, + interruptible, + deadline, + ^{lck_mtx_unlock(lock);}, + ^{lck_mtx_lock_spin(lock);}); + } else if (lck_sleep_action & LCK_SLEEP_SPIN_ALWAYS) { + return gate_wait(gate, + interruptible, + deadline, + ^{lck_mtx_unlock(lock);}, + ^{lck_mtx_lock_spin_always(lock);}); + } else { + return gate_wait(gate, + interruptible, + deadline, + ^{lck_mtx_unlock(lock);}, + ^{lck_mtx_lock(lock);}); + } } -uint32_t -hw_compare_and_store(uint32_t oldval, uint32_t newval, volatile uint32_t *dest) +/* + * Name: lck_mtx_gate_assert + * + * Description: asserts that the gate is in the specified state. + * + * Args: + * Arg1: lck_mtx_t lock used to protect the gate. + * Arg2: pointer to the gate data declared with decl_lck_mtx_gate_data. + * Arg3: flags to specified assert type. + * GATE_ASSERT_CLOSED - the gate is currently closed + * GATE_ASSERT_OPEN - the gate is currently opened + * GATE_ASSERT_HELD - the gate is currently closed and the current thread is the holder + */ +void +lck_mtx_gate_assert(__assert_only lck_mtx_t *lock, gate_t *gate, int flags) { - ALIGN_TEST(dest, uint32_t); - return __c11_atomic_compare_exchange_strong(ATOMIC_CAST(uint32_t, dest), &oldval, newval, - memory_order_acq_rel_smp, memory_order_relaxed); + LCK_MTX_ASSERT(lock, LCK_MTX_ASSERT_OWNED); + + gate_assert(gate, flags); } diff --git a/osfmk/kern/locks.h b/osfmk/kern/locks.h index dd5f3a54a..51c1da4c1 100644 --- a/osfmk/kern/locks.h +++ b/osfmk/kern/locks.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2003-2012 Apple Inc. All rights reserved. + * Copyright (c) 2003-2019 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -58,6 +58,10 @@ typedef unsigned int lck_sleep_action_t; #define LCK_SLEEP_MASK 0x3f /* Valid actions */ +typedef unsigned int lck_wake_action_t; + +#define LCK_WAKE_DEFAULT 0x00 /* If waiters are present, transfer their push to the wokenup thread */ +#define LCK_WAKE_DO_NOT_TRANSFER_PUSH 0x01 /* Do not transfer waiters push when waking up */ #ifdef MACH_KERNEL_PRIVATE typedef struct _lck_attr_ { @@ -75,7 +79,7 @@ extern lck_attr_t LockDefaultLckAttr; typedef struct __lck_attr__ lck_attr_t; #endif -#define LCK_ATTR_NULL (lck_attr_t *)0 +#define LCK_ATTR_NULL (lck_attr_t *)NULL __BEGIN_DECLS @@ -118,7 +122,7 @@ extern void lck_attr_rw_shared_priority( extern void lck_attr_free( lck_attr_t *attr); -#define decl_lck_spin_data(class, name) class lck_spin_t name; +#define decl_lck_spin_data(class, name) class lck_spin_t name extern lck_spin_t *lck_spin_alloc_init( lck_grp_t *grp, @@ -191,7 +195,7 @@ extern void lck_mtx_init_ext(lck_mtx_t *lck, struct _lck_mtx_ext_ *lck_ext, #endif -#define decl_lck_mtx_data(class, name) class lck_mtx_t name; +#define decl_lck_mtx_data(class, name) class lck_mtx_t name extern lck_mtx_t *lck_mtx_alloc_init( lck_grp_t *grp, @@ -227,16 +231,594 @@ extern wait_result_t lck_mtx_sleep_deadline( event_t event, wait_interrupt_t interruptible, uint64_t deadline); + +#ifdef KERNEL_PRIVATE +/* + * Name: lck_spin_sleep_with_inheritor + * + * Description: deschedule the current thread and wait on the waitq associated with event to be woken up. + * While waiting, the sched priority of the waiting thread will contribute to the push of the event that will + * be directed to the inheritor specified. + * An interruptible mode and deadline can be specified to return earlier from the wait. + * + * Args: + * Arg1: lck_spin_t lock used to protect the sleep. The lock will be dropped while sleeping and reaquired before returning according to the sleep action specified. + * Arg2: sleep action. LCK_SLEEP_DEFAULT, LCK_SLEEP_UNLOCK. + * Arg3: event to wait on. + * Arg4: thread to propagate the event push to. + * Arg5: interruptible flag for wait. + * Arg6: deadline for wait. + * + * Conditions: Lock must be held. Returns with the lock held according to the sleep action specified. + * Lock will be dropped while waiting. + * The inheritor specified cannot run in user space until another inheritor is specified for the event or a + * wakeup for the event is called. + * + * Returns: result of the wait. + */ +extern wait_result_t lck_spin_sleep_with_inheritor(lck_spin_t *lock, lck_sleep_action_t lck_sleep_action, event_t event, thread_t inheritor, wait_interrupt_t interruptible, uint64_t deadline); + +/* + * Name: lck_mtx_sleep_with_inheritor + * + * Description: deschedule the current thread and wait on the waitq associated with event to be woken up. + * While waiting, the sched priority of the waiting thread will contribute to the push of the event that will + * be directed to the inheritor specified. + * An interruptible mode and deadline can be specified to return earlier from the wait. + * + * Args: + * Arg1: lck_mtx_t lock used to protect the sleep. The lock will be dropped while sleeping and reaquired before returning according to the sleep action specified. + * Arg2: sleep action. LCK_SLEEP_DEFAULT, LCK_SLEEP_UNLOCK, LCK_SLEEP_SPIN, LCK_SLEEP_SPIN_ALWAYS. + * Arg3: event to wait on. + * Arg4: thread to propagate the event push to. + * Arg5: interruptible flag for wait. + * Arg6: deadline for wait. + * + * Conditions: Lock must be held. Returns with the lock held according to the sleep action specified. + * Lock will be dropped while waiting. + * The inheritor specified cannot run in user space until another inheritor is specified for the event or a + * wakeup for the event is called. + * + * Returns: result of the wait. + */ +extern wait_result_t lck_mtx_sleep_with_inheritor(lck_mtx_t *lock, lck_sleep_action_t lck_sleep_action, event_t event, thread_t inheritor, wait_interrupt_t interruptible, uint64_t deadline); + +/* + * Name: lck_mtx_sleep_with_inheritor + * + * Description: deschedule the current thread and wait on the waitq associated with event to be woken up. + * While waiting, the sched priority of the waiting thread will contribute to the push of the event that will + * be directed to the inheritor specified. + * An interruptible mode and deadline can be specified to return earlier from the wait. + * + * Args: + * Arg1: lck_rw_t lock used to protect the sleep. The lock will be dropped while sleeping and reaquired before returning according to the sleep action specified. + * Arg2: sleep action. LCK_SLEEP_DEFAULT, LCK_SLEEP_SHARED, LCK_SLEEP_EXCLUSIVE. + * Arg3: event to wait on. + * Arg4: thread to propagate the event push to. + * Arg5: interruptible flag for wait. + * Arg6: deadline for wait. + * + * Conditions: Lock must be held. Returns with the lock held according to the sleep action specified. + * Lock will be dropped while waiting. + * The inheritor specified cannot run in user space until another inheritor is specified for the event or a + * wakeup for the event is called. + * + * Returns: result of the wait. + */ +extern wait_result_t lck_rw_sleep_with_inheritor(lck_rw_t *lock, lck_sleep_action_t lck_sleep_action, event_t event, thread_t inheritor, wait_interrupt_t interruptible, uint64_t deadline); + +/* + * Name: wakeup_one_with_inheritor + * + * Description: wake up one waiter for event if any. The thread woken up will be the one with the higher sched priority waiting on event. + * The push for the event will be transferred from the last inheritor to the woken up thread. + * + * Args: + * Arg1: event to wake from. + * Arg2: wait result to pass to the woken up thread. + * Arg3: pointer for storing the thread wokenup. + * + * Returns: KERN_NOT_WAITING if no threads were waiting, KERN_SUCCESS otherwise. + * + * Conditions: The new inheritor wokenup cannot run in user space until another inheritor is specified for the event or a + * wakeup for the event is called. + * A reference for the wokenup thread is acquired. + * NOTE: this cannot be called from interrupt context. + */ +extern kern_return_t wakeup_one_with_inheritor(event_t event, wait_result_t result, lck_wake_action_t action, thread_t *thread_wokenup); + +/* + * Name: wakeup_all_with_inheritor + * + * Description: wake up all waiters waiting for event. The old inheritor will lose the push. + * + * Args: + * Arg1: event to wake from. + * Arg2: wait result to pass to the woken up threads. + * + * Returns: KERN_NOT_WAITING if no threads were waiting, KERN_SUCCESS otherwise. + * + * Conditions: NOTE: this cannot be called from interrupt context. + */ +extern kern_return_t wakeup_all_with_inheritor(event_t event, wait_result_t result); + +/* + * Name: change_sleep_inheritor + * + * Description: Redirect the push of the waiting threads of event to the new inheritor specified. + * + * Args: + * Arg1: event to redirect the push. + * Arg2: new inheritor for event. + * + * Returns: KERN_NOT_WAITING if no threads were waiting, KERN_SUCCESS otherwise. + * + * Conditions: In case of success, the new inheritor cannot run in user space until another inheritor is specified for the event or a + * wakeup for the event is called. + * NOTE: this cannot be called from interrupt context. + */ +extern kern_return_t change_sleep_inheritor(event_t event, thread_t inheritor); + +/* + * gate structure + */ +typedef struct gate { + uintptr_t gate_data; // thread holder, interlock bit and waiter bit + struct turnstile *turnstile; // protected by the interlock bit +} gate_t; + +#define GATE_ILOCK_BIT 0 +#define GATE_WAITERS_BIT 1 + +#define GATE_ILOCK (1 << GATE_ILOCK_BIT) +#define GATE_WAITERS (1 << GATE_WAITERS_BIT) + +#define gate_ilock(gate) hw_lock_bit((hw_lock_bit_t*)(&(gate)->gate_data), GATE_ILOCK_BIT, LCK_GRP_NULL) +#define gate_iunlock(gate) hw_unlock_bit((hw_lock_bit_t*)(&(gate)->gate_data), GATE_ILOCK_BIT) +#define gate_has_waiters(state) ((state & GATE_WAITERS) != 0) +#define ordered_load_gate(gate) os_atomic_load(&(gate)->gate_data, compiler_acq_rel) +#define ordered_store_gate(gate, value) os_atomic_store(&(gate)->gate_data, value, compiler_acq_rel) + +#define GATE_THREAD_MASK (~(uintptr_t)(GATE_ILOCK | GATE_WAITERS)) +#define GATE_STATE_TO_THREAD(state) (thread_t)(state & GATE_THREAD_MASK) +#define GATE_THREAD_TO_STATE(thread) ((uintptr_t)thread) + +/* + * Possible gate_wait_result_t values. + */ +typedef int gate_wait_result_t; +#define GATE_HANDOFF 0 +#define GATE_OPENED 1 +#define GATE_TIMED_OUT 2 +#define GATE_INTERRUPTED 3 + +/* + * Gate flags used by gate_assert + */ +#define GATE_ASSERT_CLOSED 0 +#define GATE_ASSERT_OPEN 1 +#define GATE_ASSERT_HELD 2 + +/* + * Gate flags used by gate_handoff + */ +#define GATE_HANDOFF_DEFAULT 0 +#define GATE_HANDOFF_OPEN_IF_NO_WAITERS 1 + +#define GATE_EVENT(gate) ((event_t) gate) +#define EVENT_TO_GATE(event) ((gate_t *) event) + +/* + * Name: decl_lck_rw_gate_data + * + * Description: declares a gate variable with specified storage class. + * The gate itself will be stored in this variable and it is the caller's responsibility + * to ensure that this variable's memory is going to be accessible by all threads that will use + * the gate. + * Every gate function will require a pointer to this variable as parameter. The same pointer should + * be used in every thread. + * + * The variable needs to be initialized once with lck_rw_gate_init() and destroyed once with + * lck_rw_gate_destroy() when not needed anymore. + * + * The gate will be used in conjunction with a lck_rw_t. + * + * Args: + * Arg1: storage class. + * Arg2: variable name. + */ +#define decl_lck_rw_gate_data(class, name) class gate_t name + +/* + * Name: lck_rw_gate_init + * + * Description: initializes a variable declared with decl_lck_rw_gate_data. + * + * Args: + * Arg1: lck_rw_t lock used to protect the gate. + * Arg2: pointer to the gate data declared with decl_lck_rw_gate_data. + */ +extern void lck_rw_gate_init(lck_rw_t *lock, gate_t *gate); + +/* + * Name: lck_rw_gate_destroy + * + * Description: destroys a variable previously initialized. + * + * Args: + * Arg1: lck_rw_t lock used to protect the gate. + * Arg2: pointer to the gate data declared with decl_lck_rw_gate_data. + */ +extern void lck_rw_gate_destroy(lck_rw_t *lock, gate_t *gate); + +/* + * Name: lck_rw_gate_try_close + * + * Description: Tries to close the gate. + * In case of success the current thread will be set as + * the holder of the gate. + * + * Args: + * Arg1: lck_rw_t lock used to protect the gate. + * Arg2: pointer to the gate data declared with decl_lck_rw_gate_data. + * + * Conditions: Lock must be held. Returns with the lock held. + * + * Returns: + * KERN_SUCCESS in case the gate was successfully closed. The current thread is the new holder + * of the gate. + * A matching lck_rw_gate_open() or lck_rw_gate_handoff() needs to be called later on + * to wake up possible waiters on the gate before returning to userspace. + * If the intent is to conditionally probe the gate before waiting, the lock must not be dropped + * between the calls to lck_rw_gate_try_close() and lck_rw_gate_wait(). + * + * KERN_FAILURE in case the gate was already closed. Will panic if the current thread was already the holder of the gate. + * lck_rw_gate_wait() should be called instead if the intent is to unconditionally wait on this gate. + * The calls to lck_rw_gate_try_close() and lck_rw_gate_wait() should + * be done without dropping the lock that is protecting the gate in between. + */ +extern kern_return_t lck_rw_gate_try_close(lck_rw_t *lock, gate_t *gate); + +/* + * Name: lck_rw_gate_close + * + * Description: Closes the gate. The current thread will be set as + * the holder of the gate. Will panic if the gate is already closed. + * A matching lck_rw_gate_open() or lck_rw_gate_handoff() needs to be called later on + * to wake up possible waiters on the gate before returning to userspace. + * + * Args: + * Arg1: lck_rw_t lock used to protect the gate. + * Arg2: pointer to the gate data declared with decl_lck_rw_gate_data. + * + * Conditions: Lock must be held. Returns with the lock held. + * The gate must be open. + * + */ +extern void lck_rw_gate_close(lck_rw_t *lock, gate_t *gate); + + +/* + * Name: lck_rw_gate_open + * + * Description: Opens the gate and wakes up possible waiters. + * + * Args: + * Arg1: lck_rw_t lock used to protect the gate. + * Arg2: pointer to the gate data declared with decl_lck_rw_gate_data. + * + * Conditions: Lock must be held. Returns with the lock held. + * The current thread must be the holder of the gate. + * + */ +extern void lck_rw_gate_open(lck_rw_t *lock, gate_t *gate); + +/* + * Name: lck_rw_gate_handoff + * + * Description: Tries to transfer the ownership of the gate. The waiter with highest sched + * priority will be selected as the new holder of the gate, and woken up, + * with the gate remaining in the closed state throughout. + * If no waiters are present, the gate will be kept closed and KERN_NOT_WAITING + * will be returned. + * GATE_HANDOFF_OPEN_IF_NO_WAITERS flag can be used to specify if the gate should be opened in + * case no waiters were found. + * + * + * Args: + * Arg1: lck_rw_t lock used to protect the gate. + * Arg2: pointer to the gate data declared with decl_lck_rw_gate_data. + * Arg3: flags - GATE_HANDOFF_DEFAULT or GATE_HANDOFF_OPEN_IF_NO_WAITERS + * + * Conditions: Lock must be held. Returns with the lock held. + * The current thread must be the holder of the gate. + * + * Returns: + * KERN_SUCCESS in case one of the waiters became the new holder. + * KERN_NOT_WAITING in case there were no waiters. + * + */ +extern kern_return_t lck_rw_gate_handoff(lck_rw_t *lock, gate_t *gate, int flags); + +/* + * Name: lck_rw_gate_steal + * + * Description: Set the current ownership of the gate. It sets the current thread as the + * new holder of the gate. + * A matching lck_rw_gate_open() or lck_rw_gate_handoff() needs to be called later on + * to wake up possible waiters on the gate before returning to userspace. + * NOTE: the previous holder should not call lck_rw_gate_open() or lck_rw_gate_handoff() + * anymore. + * + * + * Args: + * Arg1: lck_rw_t lock used to protect the gate. + * Arg2: pointer to the gate data declared with decl_lck_rw_gate_data. + * + * Conditions: Lock must be held. Returns with the lock held. + * The gate must be closed and the current thread must not already be the holder. + * + */ +extern void lck_rw_gate_steal(lck_rw_t *lock, gate_t *gate); + +/* + * Name: lck_rw_gate_wait + * + * Description: Waits for the current thread to become the holder of the gate or for the + * gate to become open. An interruptible mode and deadline can be specified + * to return earlier from the wait. + * + * Args: + * Arg1: lck_rw_t lock used to protect the gate. + * Arg2: pointer to the gate data declared with decl_lck_rw_gate_data. + * Arg3: sleep action. LCK_SLEEP_DEFAULT, LCK_SLEEP_SHARED, LCK_SLEEP_EXCLUSIVE. + * Arg3: interruptible flag for wait. + * Arg4: deadline + * + * Conditions: Lock must be held. Returns with the lock held according to the sleep action specified. + * Lock will be dropped while waiting. + * The gate must be closed. + * + * Returns: Reason why the thread was woken up. + * GATE_HANDOFF - the current thread was handed off the ownership of the gate. + * A matching lck_rw_gate_open() or lck_rw_gate_handoff() needs to be called later on + * to wake up possible waiters on the gate before returning to userspace. + * GATE_OPENED - the gate was opened by the holder. + * GATE_TIMED_OUT - the thread was woken up by a timeout. + * GATE_INTERRUPTED - the thread was interrupted while sleeping. + * + */ +extern gate_wait_result_t lck_rw_gate_wait(lck_rw_t *lock, gate_t *gate, lck_sleep_action_t lck_sleep_action, wait_interrupt_t interruptible, uint64_t deadline); + +/* + * Name: lck_rw_gate_assert + * + * Description: asserts that the gate is in the specified state. + * + * Args: + * Arg1: lck_rw_t lock used to protect the gate. + * Arg2: pointer to the gate data declared with decl_lck_rw_gate_data. + * Arg3: flags to specified assert type. + * GATE_ASSERT_CLOSED - the gate is currently closed + * GATE_ASSERT_OPEN - the gate is currently opened + * GATE_ASSERT_HELD - the gate is currently closed and the current thread is the holder + */ +extern void lck_rw_gate_assert(lck_rw_t *lock, gate_t *gate, int flags); + +/* + * Name: decl_lck_mtx_gate_data + * + * Description: declares a gate variable with specified storage class. + * The gate itself will be stored in this variable and it is the caller's responsibility + * to ensure that this variable's memory is going to be accessible by all threads that will use + * the gate. + * Every gate function will require a pointer to this variable as parameter. The same pointer should + * be used in every thread. + * + * The variable needs to be initialized once with lck_mtx_gate_init() and destroyed once with + * lck_mtx_gate_destroy() when not needed anymore. + * + * The gate will be used in conjunction with a lck_mtx_t. + * + * Args: + * Arg1: storage class. + * Arg2: variable name. + */ +#define decl_lck_mtx_gate_data(class, name) class gate_t name + +/* + * Name: lck_mtx_gate_init + * + * Description: initializes a variable declared with decl_lck_mtx_gate_data. + * + * Args: + * Arg1: lck_mtx_t lock used to protect the gate. + * Arg2: pointer to the gate data declared with decl_lck_mtx_gate_data. + */ +extern void lck_mtx_gate_init(lck_mtx_t *lock, gate_t *gate); + +/* + * Name: lck_mtx_gate_destroy + * + * Description: destroys a variable previously initialized + * + * Args: + * Arg1: lck_mtx_t lock used to protect the gate. + * Arg2: pointer to the gate data declared with decl_lck_mtx_gate_data. + */ +extern void lck_mtx_gate_destroy(lck_mtx_t *lock, gate_t *gate); + +/* + * Name: lck_mtx_gate_try_close + * + * Description: Tries to close the gate. + * In case of success the current thread will be set as + * the holder of the gate. + * + * Args: + * Arg1: lck_mtx_t lock used to protect the gate. + * Arg2: pointer to the gate data declared with decl_lck_mtx_gate_data. + * + * Conditions: Lock must be held. Returns with the lock held. + * + * Returns: + * KERN_SUCCESS in case the gate was successfully closed. The current thread is the new holder + * of the gate. + * A matching lck_mtx_gate_open() or lck_mtx_gate_handoff() needs to be called later on + * to wake up possible waiters on the gate before returning to userspace. + * If the intent is to conditionally probe the gate before waiting, the lock must not be dropped + * between the calls to lck_mtx_gate_try_close() and lck_mtx_gate_wait(). + * + * KERN_FAILURE in case the gate was already closed. Will panic if the current thread was already the holder of the gate. + * lck_mtx_gate_wait() should be called instead if the intent is to unconditionally wait on this gate. + * The calls to lck_mtx_gate_try_close() and lck_mtx_gate_wait() should + * be done without dropping the lock that is protecting the gate in between. + */ +extern kern_return_t lck_mtx_gate_try_close(lck_mtx_t *lock, gate_t *gate); + +/* + * Name: lck_mtx_gate_close + * + * Description: Closes the gate. The current thread will be set as + * the holder of the gate. Will panic if the gate is already closed. + * A matching lck_mtx_gate_open() or lck_mtx_gate_handoff() needs to be called later on + * to wake up possible waiters on the gate before returning to userspace. + * + * Args: + * Arg1: lck_mtx_t lock used to protect the gate. + * Arg2: pointer to the gate data declared with decl_lck_mtx_gate_data. + * + * Conditions: Lock must be held. Returns with the lock held. + * The gate must be open. + * + */ +extern void lck_mtx_gate_close(lck_mtx_t *lock, gate_t *gate); + +/* + * Name: lck_mtx_gate_open + * + * Description: Opens of the gate and wakes up possible waiters. + * + * Args: + * Arg1: lck_mtx_t lock used to protect the gate. + * Arg2: pointer to the gate data declared with decl_lck_mtx_gate_data. + * + * Conditions: Lock must be held. Returns with the lock held. + * The current thread must be the holder of the gate. + * + */ +extern void lck_mtx_gate_open(lck_mtx_t *lock, gate_t *gate); + +/* + * Name: lck_mtx_gate_handoff + * + * Description: Set the current ownership of the gate. The waiter with highest sched + * priority will be selected as the new holder of the gate, and woken up, + * with the gate remaining in the closed state throughout. + * If no waiters are present, the gate will be kept closed and KERN_NOT_WAITING + * will be returned. + * OPEN_ON_FAILURE flag can be used to specify if the gate should be opened in + * case no waiters were found. + * + * + * Args: + * Arg1: lck_mtx_t lock used to protect the gate. + * Arg2: pointer to the gate data declared with decl_lck_mtx_gate_data. + * Arg3: flags - GATE_NO_FALGS or OPEN_ON_FAILURE + * + * Conditions: Lock must be held. Returns with the lock held. + * The current thread must be the holder of the gate. + * + * Returns: + * KERN_SUCCESS in case one of the waiters became the new holder. + * KERN_NOT_WAITING in case there were no waiters. + * + */ +extern kern_return_t lck_mtx_gate_handoff(lck_mtx_t *lock, gate_t *gate, int flags); + +/* + * Name: lck_mtx_gate_steal + * + * Description: Steals the ownership of the gate. It sets the current thread as the + * new holder of the gate. + * A matching lck_mtx_gate_open() or lck_mtx_gate_handoff() needs to be called later on + * to wake up possible waiters on the gate before returning to userspace. + * NOTE: the previous holder should not call lck_mtx_gate_open() or lck_mtx_gate_handoff() + * anymore. + * + * + * Args: + * Arg1: lck_mtx_t lock used to protect the gate. + * Arg2: pointer to the gate data declared with decl_lck_mtx_gate_data. + * + * Conditions: Lock must be held. Returns with the lock held. + * The gate must be closed and the current thread must not already be the holder. + * + */ +extern void lck_mtx_gate_steal(lck_mtx_t *lock, gate_t *gate); + +/* + * Name: lck_mtx_gate_wait + * + * Description: Waits for the current thread to become the holder of the gate or for the + * gate to become open. An interruptible mode and deadline can be specified + * to return earlier from the wait. + * + * Args: + * Arg1: lck_mtx_t lock used to protect the gate. + * Arg2: pointer to the gate data declared with decl_lck_mtx_gate_data. + * Arg3: sleep action. LCK_SLEEP_DEFAULT, LCK_SLEEP_UNLOCK, LCK_SLEEP_SPIN, LCK_SLEEP_SPIN_ALWAYS. + * Arg3: interruptible flag for wait. + * Arg4: deadline + * + * Conditions: Lock must be held. Returns with the lock held according to the sleep action specified. + * Lock will be dropped while waiting. + * The gate must be closed. + * + * Returns: Reason why the thread was woken up. + * GATE_HANDOFF - the current thread was handed off the ownership of the gate. + * A matching lck_mtx_gate_open() or lck_mtx_gate_handoff() needs to be called later on + * to wake up possible waiters on the gate before returning to userspace. + * GATE_OPENED - the gate was opened by the holder. + * GATE_TIMED_OUT - the thread was woken up by a timeout. + * GATE_INTERRUPTED - the thread was interrupted while sleeping. + * + */ +extern gate_wait_result_t lck_mtx_gate_wait(lck_mtx_t *lock, gate_t *gate, lck_sleep_action_t lck_sleep_action, wait_interrupt_t interruptible, uint64_t deadline); + +/* + * Name: lck_mtx_gate_assert + * + * Description: asserts that the gate is in the specified state. + * + * Args: + * Arg1: lck_mtx_t lock used to protect the gate. + * Arg2: pointer to the gate data declared with decl_lck_mtx_gate_data. + * Arg3: flags to specified assert type. + * GATE_ASSERT_CLOSED - the gate is currently closed + * GATE_ASSERT_OPEN - the gate is currently opened + * GATE_ASSERT_HELD - the gate is currently closed and the current thread is the holder + */ +extern void lck_mtx_gate_assert(lck_mtx_t *lock, gate_t *gate, int flags); + + +#endif //KERNEL_PRIVATE + #if DEVELOPMENT || DEBUG +#define FULL_CONTENDED 0 +#define HALF_CONTENDED 1 +#define MAX_CONDENDED 2 + extern void erase_all_test_mtx_stats(void); extern int get_test_mtx_stats_string(char* buffer, int buffer_size); extern void lck_mtx_test_init(void); extern void lck_mtx_test_lock(void); extern void lck_mtx_test_unlock(void); extern int lck_mtx_test_mtx_uncontended(int iter, char* buffer, int buffer_size); -extern int lck_mtx_test_mtx_contended(int iter, char* buffer, int buffer_size); +extern int lck_mtx_test_mtx_contended(int iter, char* buffer, int buffer_size, int type); extern int lck_mtx_test_mtx_uncontended_loop_time(int iter, char* buffer, int buffer_size); -extern int lck_mtx_test_mtx_contended_loop_time(int iter, char* buffer, int buffer_size); +extern int lck_mtx_test_mtx_contended_loop_time(int iter, char* buffer, int buffer_size, int type); #endif #ifdef KERNEL_PRIVATE @@ -310,14 +892,17 @@ __END_DECLS #define LCK_MTX_ASSERT_NOTOWNED LCK_ASSERT_NOTOWNED #ifdef MACH_KERNEL_PRIVATE +struct turnstile; extern void lck_mtx_lock_wait( lck_mtx_t *lck, - thread_t holder); + thread_t holder, + struct turnstile **ts); extern int lck_mtx_lock_acquire( - lck_mtx_t *lck); + lck_mtx_t *lck, + struct turnstile *ts); -extern void lck_mtx_unlock_wakeup( +extern boolean_t lck_mtx_unlock_wakeup( lck_mtx_t *lck, thread_t holder); @@ -331,7 +916,7 @@ extern void lck_mtx_wakeup_adjust_pri(thread_t thread, integer_t priority); #endif -#define decl_lck_rw_data(class, name) class lck_rw_t name; +#define decl_lck_rw_data(class, name) class lck_rw_t name typedef unsigned int lck_rw_type_t; diff --git a/osfmk/kern/ltable.h b/osfmk/kern/ltable.h index 9b1e47c72..bc05894c3 100644 --- a/osfmk/kern/ltable.h +++ b/osfmk/kern/ltable.h @@ -85,15 +85,15 @@ struct lt_elem { }; /* reference count bits should _always_ be the low-order bits */ -#define LT_BITS_REFCNT_MASK (0x1FFFFFFF) +#define LT_BITS_REFCNT_MASK (0x1FFFFFFFU) #define LT_BITS_REFCNT_SHIFT (0) #define LT_BITS_REFCNT (LT_BITS_REFCNT_MASK << LT_BITS_REFCNT_SHIFT) -#define LT_BITS_TYPE_MASK (0x3) +#define LT_BITS_TYPE_MASK (0x3U) #define LT_BITS_TYPE_SHIFT (29) #define LT_BITS_TYPE (LT_BITS_TYPE_MASK << LT_BITS_TYPE_SHIFT) -#define LT_BITS_VALID_MASK (0x1) +#define LT_BITS_VALID_MASK (0x1U) #define LT_BITS_VALID_SHIFT (31) #define LT_BITS_VALID (LT_BITS_VALID_MASK << LT_BITS_VALID_SHIFT) diff --git a/osfmk/kern/machine.c b/osfmk/kern/machine.c index a61ee71db..c90a8ea6d 100644 --- a/osfmk/kern/machine.c +++ b/osfmk/kern/machine.c @@ -85,6 +85,7 @@ #include #include #include +#include #include #include @@ -111,8 +112,14 @@ extern void (*dtrace_cpu_state_changed_hook)(int, boolean_t); struct machine_info machine_info; /* Forwards */ -void processor_doshutdown( - processor_t processor); +static void +processor_doshutdown(processor_t processor); + +static void +processor_offline(void * parameter, __unused wait_result_t result); + +static void +processor_offline_intstack(processor_t processor) __dead2; /* * processor_up: @@ -126,19 +133,32 @@ processor_up( { processor_set_t pset; spl_t s; + boolean_t pset_online = false; s = splsched(); init_ast_check(processor); pset = processor->processor_set; pset_lock(pset); + if (pset->online_processor_count == 0) { + /* About to bring the first processor of a pset online */ + pset_online = true; + } ++pset->online_processor_count; pset_update_processor_state(pset, processor, PROCESSOR_RUNNING); - (void)hw_atomic_add(&processor_avail_count, 1); + os_atomic_inc(&processor_avail_count, relaxed); if (processor->is_recommended) { - (void)hw_atomic_add(&processor_avail_count_user, 1); + os_atomic_inc(&processor_avail_count_user, relaxed); } commpage_update_active_cpus(); - pset_unlock(pset); + if (pset_online) { + /* New pset is coming up online; callout to the + * scheduler in case it wants to adjust runqs. + */ + SCHED(pset_made_schedulable)(processor, pset, true); + /* pset lock dropped */ + } else { + pset_unlock(pset); + } ml_cpu_up(); splx(s); @@ -252,20 +272,22 @@ processor_shutdown( /* * Called with interrupts disabled. */ -void +static void processor_doshutdown( - processor_t processor) + processor_t processor) { - thread_t old_thread, self = current_thread(); - processor_t prev; - processor_set_t pset; + thread_t self = current_thread(); /* * Get onto the processor to shutdown */ - prev = thread_bind(processor); + processor_t prev = thread_bind(processor); thread_block(THREAD_CONTINUE_NULL); + /* interrupts still disabled */ + assert(ml_get_interrupts_enabled() == FALSE); + + assert(processor == current_processor()); assert(processor->state == PROCESSOR_SHUTDOWN); #if CONFIG_DTRACE @@ -283,88 +305,127 @@ processor_doshutdown( } #endif - pset = processor->processor_set; + processor_set_t pset = processor->processor_set; + pset_lock(pset); pset_update_processor_state(pset, processor, PROCESSOR_OFF_LINE); --pset->online_processor_count; - (void)hw_atomic_sub(&processor_avail_count, 1); + os_atomic_dec(&processor_avail_count, relaxed); if (processor->is_recommended) { - (void)hw_atomic_sub(&processor_avail_count_user, 1); + os_atomic_dec(&processor_avail_count_user, relaxed); } commpage_update_active_cpus(); SCHED(processor_queue_shutdown)(processor); /* pset lock dropped */ SCHED(rt_queue_shutdown)(processor); + thread_bind(prev); + + /* interrupts still disabled */ + /* - * Continue processor shutdown in shutdown context. - * - * We save the current context in machine_processor_shutdown in such a way - * that when this thread is next invoked it will return from here instead of - * from the machine_switch_context() in thread_invoke like a normal context switch. - * - * As such, 'old_thread' is neither the idle thread nor the current thread - it's whatever - * thread invoked back to this one. (Usually, it's another processor's idle thread.) - * - * TODO: Make this a real thread_run of the idle_thread, so we don't have to keep this in sync - * with thread_invoke. + * Continue processor shutdown on the processor's idle thread. + * The handoff won't fail because the idle thread has a reserved stack. + * Switching to the idle thread leaves interrupts disabled, + * so we can't accidentally take an interrupt after the context switch. */ - thread_bind(prev); - old_thread = machine_processor_shutdown(self, processor_offline, processor); + thread_t shutdown_thread = processor->idle_thread; + shutdown_thread->continuation = processor_offline; + shutdown_thread->parameter = processor; - thread_dispatch(old_thread, self); + thread_run(self, NULL, NULL, shutdown_thread); } /* - * Complete the shutdown and place the processor offline. - * - * Called at splsched in the shutdown context. - * This performs a minimal thread_invoke() to the idle thread, - * so it needs to be kept in sync with what thread_invoke() does. + * Called in the context of the idle thread to shut down the processor * - * The onlining half of this is done in load_context(). + * A shut-down processor looks like it's 'running' the idle thread parked + * in this routine, but it's actually been powered off and has no hardware state. */ -void +static void processor_offline( - processor_t processor) + void * parameter, + __unused wait_result_t result) { + processor_t processor = (processor_t) parameter; + thread_t self = current_thread(); + __assert_only thread_t old_thread = THREAD_NULL; + assert(processor == current_processor()); - assert(processor->active_thread == current_thread()); + assert(self->state & TH_IDLE); + assert(processor->idle_thread == self); + assert(ml_get_interrupts_enabled() == FALSE); + assert(self->continuation == NULL); + assert(processor->processor_offlined == false); - thread_t old_thread = processor->active_thread; - thread_t new_thread = processor->idle_thread; + bool enforce_quiesce_safety = gEnforceQuiesceSafety; - if (!new_thread->kernel_stack) { - /* the idle thread has a reserved stack, so this will never fail */ - if (!stack_alloc_try(new_thread)) { - panic("processor_offline"); - } + /* + * Scheduling is now disabled for this processor. + * Ensure that primitives that need scheduling (like mutexes) know this. + */ + if (enforce_quiesce_safety) { + disable_preemption(); } - processor->active_thread = new_thread; - processor_state_update_idle(processor); - processor->starting_pri = IDLEPRI; - processor->deadline = UINT64_MAX; - new_thread->last_processor = processor; + /* convince slave_main to come back here */ + processor->processor_offlined = true; + + /* + * Switch to the interrupt stack and shut down the processor. + * + * When the processor comes back, it will eventually call load_context which + * restores the context saved by machine_processor_shutdown, returning here. + */ + old_thread = machine_processor_shutdown(self, processor_offline_intstack, processor); + + /* old_thread should be NULL because we got here through Load_context */ + assert(old_thread == THREAD_NULL); + + assert(processor == current_processor()); + assert(processor->idle_thread == current_thread()); - uint64_t ctime = mach_absolute_time(); + assert(ml_get_interrupts_enabled() == FALSE); + assert(self->continuation == NULL); - processor->last_dispatch = ctime; - old_thread->last_run_time = ctime; + /* Extract the machine_param value stashed by slave_main */ + void * machine_param = self->parameter; + self->parameter = NULL; - /* Update processor->thread_timer and ->kernel_timer to point to the new thread */ - processor_timer_switch_thread(ctime, &new_thread->system_timer); - PROCESSOR_DATA(processor, kernel_timer) = &new_thread->system_timer; - timer_stop(PROCESSOR_DATA(processor, current_state), ctime); + /* Re-initialize the processor */ + slave_machine_init(machine_param); - KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, - MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED) | DBG_FUNC_NONE, - old_thread->reason, (uintptr_t)thread_tid(new_thread), - old_thread->sched_pri, new_thread->sched_pri, 0); + assert(processor->processor_offlined == true); + processor->processor_offlined = false; - machine_set_current_thread(new_thread); + if (enforce_quiesce_safety) { + enable_preemption(); + } + + /* + * Now that the processor is back, invoke the idle thread to find out what to do next. + * idle_thread will enable interrupts. + */ + thread_block(idle_thread); + /*NOTREACHED*/ +} + +/* + * Complete the shutdown and place the processor offline. + * + * Called at splsched in the shutdown context + * (i.e. on the idle thread, on the interrupt stack) + * + * The onlining half of this is done in load_context(). + */ +static void +processor_offline_intstack( + processor_t processor) +{ + assert(processor == current_processor()); + assert(processor->active_thread == current_thread()); - thread_dispatch(old_thread, new_thread); + timer_stop(PROCESSOR_DATA(processor, current_state), processor->last_dispatch); cpu_quiescent_counter_leave(processor->last_dispatch); diff --git a/osfmk/kern/machine.h b/osfmk/kern/machine.h index 9dbb6eb2b..2c285e5c4 100644 --- a/osfmk/kern/machine.h +++ b/osfmk/kern/machine.h @@ -45,13 +45,11 @@ * Machine support declarations. */ -extern void processor_up( - processor_t processor); +extern void processor_up( + processor_t processor); -extern void processor_offline( - processor_t processor); - -extern void processor_start_thread(void *machine_param); +extern void processor_start_thread(void *machine_param, + wait_result_t result); /* * Must be implemented in machine dependent code. diff --git a/osfmk/kern/memset_s.c b/osfmk/kern/memset_s.c index 37b89450d..9f51a4b6f 100644 --- a/osfmk/kern/memset_s.c +++ b/osfmk/kern/memset_s.c @@ -64,3 +64,17 @@ memset_s(void *s, size_t smax, int c, size_t n) return err; } + +int +timingsafe_bcmp(const void *b1, const void *b2, size_t n) +{ + const unsigned char *p1 = b1, *p2 = b2; + unsigned char ret = 0; + + for (; n > 0; n--) { + ret |= *p1++ ^ *p2++; + } + + /* map zero to zero and nonzero to one */ + return (ret + 0xff) >> 8; +} diff --git a/osfmk/kern/misc_protos.h b/osfmk/kern/misc_protos.h index c1dee4267..a2585e473 100644 --- a/osfmk/kern/misc_protos.h +++ b/osfmk/kern/misc_protos.h @@ -86,17 +86,44 @@ extern int testbit( int which, int *bitmap); -/* Move an aligned 32 or 64-bit word from user space to kernel space +/* + * Move an aligned 32 or 64-bit word from user space to kernel space * using a single read instruction + */ +extern int copyin_atomic32( + const user_addr_t user_addr, + uint32_t *kernel_addr); + +extern int copyin_atomic64( + const user_addr_t user_addr, + uint64_t *kernel_addr); + +/* + * Does an atomic copyin at the specified user_address and compares + * it to the passed in value, and if it matches, waits. * - * when reading a 32-bit word, the value is 0-extended into the kernel space - * 64-bit buffer passed as `kernel_addr` - * (think `*kernel_addr = *(uint32_t *)user_addr`) + * This is used to implement adaptive spinning for userspace synchronization + * + * Returns: + * 0: the value mached, and it paused efficiently for the platform + * ESTALE: the value didn't match, and it returned immediately + * other: the copyin failed (EFAULT, EINVAL, ...) */ -extern int copyin_word( +extern int copyin_atomic32_wait_if_equals( const user_addr_t user_addr, - uint64_t *kernel_addr, - vm_size_t nbytes); + uint32_t value); + +/* + * Move a 32 or 64-bit word from kernel space to user space + * using a single write instruction + */ +extern int copyout_atomic32( + uint32_t u32, + user_addr_t user_addr); + +extern int copyout_atomic64( + uint64_t u64, + user_addr_t user_addr); /* Move a NUL-terminated string from a user space to kernel space */ extern int copyinstr( @@ -121,9 +148,6 @@ extern int copyoutmsg( extern void inval_copy_windows(thread_t); extern void copy_window_fault(thread_t, vm_map_t, int); -extern int copyin_validate(const user_addr_t, uintptr_t, vm_size_t); -extern int copyout_validate(uintptr_t, const user_addr_t, vm_size_t); - extern int sscanf(const char *input, const char *fmt, ...) __scanflike(2, 3); /* sprintf() is being deprecated. Please use snprintf() instead. */ diff --git a/osfmk/kern/mk_timer.c b/osfmk/kern/mk_timer.c index 883a1e31b..8de9c9012 100644 --- a/osfmk/kern/mk_timer.c +++ b/osfmk/kern/mk_timer.c @@ -49,7 +49,9 @@ static zone_t mk_timer_zone; static mach_port_qos_t mk_timer_qos = { - FALSE, TRUE, 0, sizeof(mk_timer_expire_msg_t) + .name = FALSE, + .prealloc = TRUE, + .len = sizeof(mk_timer_expire_msg_t), }; static void mk_timer_expire( @@ -71,7 +73,7 @@ mk_timer_create_trap( return MACH_PORT_NULL; } - result = mach_port_allocate_qos(myspace, MACH_PORT_RIGHT_RECEIVE, + result = mach_port_allocate_internal(myspace, MACH_PORT_RIGHT_RECEIVE, &mk_timer_qos, &name); if (result == KERN_SUCCESS) { result = ipc_port_translate_receive(myspace, name, &port); diff --git a/osfmk/kern/monotonic.h b/osfmk/kern/monotonic.h index e8fcde164..9b744407b 100644 --- a/osfmk/kern/monotonic.h +++ b/osfmk/kern/monotonic.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017 Apple Inc. All rights reserved. + * Copyright (c) 2017-2019 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -28,6 +28,8 @@ #ifndef KERN_MONOTONIC_H #define KERN_MONOTONIC_H +#if MONOTONIC + #include #include #include @@ -156,4 +158,6 @@ __END_DECLS #endif /* MACH_KERNEL_PRIVATE */ +#endif /* MONOTONIC */ + #endif /* !defined(KERN_MONOTONIC_H) */ diff --git a/osfmk/kern/mpsc_queue.c b/osfmk/kern/mpsc_queue.c new file mode 100644 index 000000000..4784b0dc1 --- /dev/null +++ b/osfmk/kern/mpsc_queue.c @@ -0,0 +1,442 @@ +/* + * Copyright (c) 2018 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#include +#include +#include +#include + +#pragma mark Single Consumer calls + +__attribute__((noinline)) +static mpsc_queue_chain_t +_mpsc_queue_wait_for_enqueuer(struct mpsc_queue_chain *_Atomic *ptr) +{ + return hw_wait_while_equals((void **)ptr, NULL); +} + +void +mpsc_queue_restore_batch(mpsc_queue_head_t q, mpsc_queue_chain_t first, + mpsc_queue_chain_t last) +{ + mpsc_queue_chain_t head = os_atomic_load(&q->mpqh_head.mpqc_next, relaxed); + + os_atomic_store(&last->mpqc_next, head, relaxed); + + if (head == NULL && + !os_atomic_cmpxchg(&q->mpqh_tail, &q->mpqh_head, last, release)) { + head = os_atomic_load(&q->mpqh_head.mpqc_next, relaxed); + if (__improbable(head == NULL)) { + head = _mpsc_queue_wait_for_enqueuer(&q->mpqh_head.mpqc_next); + } + os_atomic_store(&last->mpqc_next, head, relaxed); + } + + os_atomic_store(&q->mpqh_head.mpqc_next, first, relaxed); +} + +mpsc_queue_chain_t +mpsc_queue_dequeue_batch(mpsc_queue_head_t q, mpsc_queue_chain_t *tail_out, + os_atomic_dependency_t dependency) +{ + mpsc_queue_chain_t head, tail; + + q = os_atomic_inject_dependency(q, dependency); + + tail = os_atomic_load(&q->mpqh_tail, relaxed); + if (__improbable(tail == &q->mpqh_head)) { + *tail_out = NULL; + return NULL; + } + + head = os_atomic_load(&q->mpqh_head.mpqc_next, relaxed); + if (__improbable(head == NULL)) { + head = _mpsc_queue_wait_for_enqueuer(&q->mpqh_head.mpqc_next); + } + os_atomic_store(&q->mpqh_head.mpqc_next, NULL, relaxed); + /* + * 22708742: set tail to &q->mpqh_head with release, so that NULL write + * to head above doesn't clobber the head set by concurrent enqueuer + * + * The other half of the seq_cst is required to pair with any enqueuer that + * contributed to an element in this list (pairs with the release fence in + * __mpsc_queue_append_update_tail(). + * + * Making this seq_cst instead of acq_rel makes mpsc_queue_append*() + * visibility transitive (when items hop from one queue to the next) + * which is expected by clients implicitly. + * + * Note that this is the same number of fences that a traditional lock + * would have, but as a once-per-batch cost. + */ + *tail_out = os_atomic_xchg(&q->mpqh_tail, &q->mpqh_head, seq_cst); + + return head; +} + +mpsc_queue_chain_t +mpsc_queue_batch_next(mpsc_queue_chain_t cur, mpsc_queue_chain_t tail) +{ + mpsc_queue_chain_t elm = NULL; + if (cur == tail || cur == NULL) { + return elm; + } + + elm = os_atomic_load(&cur->mpqc_next, relaxed); + if (__improbable(elm == NULL)) { + elm = _mpsc_queue_wait_for_enqueuer(&cur->mpqc_next); + } + return elm; +} + +#pragma mark "GCD"-like facilities + +static void _mpsc_daemon_queue_drain(mpsc_daemon_queue_t, thread_t); +static void _mpsc_daemon_queue_enqueue(mpsc_daemon_queue_t, mpsc_queue_chain_t); + +/* thread based queues */ + +static void +_mpsc_queue_thread_continue(void *param, wait_result_t wr __unused) +{ + mpsc_daemon_queue_t dq = param; + + assert(dq->mpd_thread == current_thread()); + _mpsc_daemon_queue_drain(dq, dq->mpd_thread); + thread_block_parameter(_mpsc_queue_thread_continue, dq); +} + +static void +_mpsc_queue_thread_wakeup(mpsc_daemon_queue_t dq) +{ + thread_wakeup_thread((event_t)dq, dq->mpd_thread); +} + +static kern_return_t +_mpsc_daemon_queue_init_with_thread(mpsc_daemon_queue_t dq, + mpsc_daemon_invoke_fn_t invoke, int pri, const char *name, + mpsc_daemon_queue_kind_t kind) +{ + kern_return_t kr; + + *dq = (struct mpsc_daemon_queue){ + .mpd_kind = kind, + .mpd_invoke = invoke, + .mpd_queue = MPSC_QUEUE_INITIALIZER(dq->mpd_queue), + .mpd_chain = { MPSC_QUEUE_NOTQUEUED_MARKER }, + }; + + kr = kernel_thread_create(_mpsc_queue_thread_continue, dq, pri, + &dq->mpd_thread); + if (kr == KERN_SUCCESS) { + thread_set_thread_name(dq->mpd_thread, name); + thread_start_in_assert_wait(dq->mpd_thread, (event_t)dq, THREAD_UNINT); + thread_deallocate(dq->mpd_thread); + } + return kr; +} + +kern_return_t +mpsc_daemon_queue_init_with_thread(mpsc_daemon_queue_t dq, + mpsc_daemon_invoke_fn_t invoke, int pri, const char *name) +{ + return _mpsc_daemon_queue_init_with_thread(dq, invoke, pri, name, + MPSC_QUEUE_KIND_THREAD); +} + +/* thread-call based queues */ + +static void +_mpsc_queue_thread_call_drain(thread_call_param_t arg0, + thread_call_param_t arg1 __unused) +{ + _mpsc_daemon_queue_drain((mpsc_daemon_queue_t)arg0, NULL); +} + +static void +_mpsc_queue_thread_call_wakeup(mpsc_daemon_queue_t dq) +{ + thread_call_enter(dq->mpd_call); +} + +void +mpsc_daemon_queue_init_with_thread_call(mpsc_daemon_queue_t dq, + mpsc_daemon_invoke_fn_t invoke, thread_call_priority_t pri) +{ + *dq = (struct mpsc_daemon_queue){ + .mpd_kind = MPSC_QUEUE_KIND_THREAD_CALL, + .mpd_invoke = invoke, + .mpd_queue = MPSC_QUEUE_INITIALIZER(dq->mpd_queue), + .mpd_chain = { MPSC_QUEUE_NOTQUEUED_MARKER }, + }; + dq->mpd_call = thread_call_allocate_with_options( + _mpsc_queue_thread_call_drain, dq, pri, THREAD_CALL_OPTIONS_ONCE); +} + +/* nested queues */ + +void +mpsc_daemon_queue_nested_invoke(mpsc_queue_chain_t elm, + __unused mpsc_daemon_queue_t tq) +{ + mpsc_daemon_queue_t dq; + dq = mpsc_queue_element(elm, struct mpsc_daemon_queue, mpd_chain); + _mpsc_daemon_queue_drain(dq, NULL); +} + +static void +_mpsc_daemon_queue_nested_wakeup(mpsc_daemon_queue_t dq) +{ + _mpsc_daemon_queue_enqueue(dq->mpd_target, &dq->mpd_chain); +} + +void +mpsc_daemon_queue_init_with_target(mpsc_daemon_queue_t dq, + mpsc_daemon_invoke_fn_t invoke, mpsc_daemon_queue_t target) +{ + *dq = (struct mpsc_daemon_queue){ + .mpd_kind = MPSC_QUEUE_KIND_NESTED, + .mpd_invoke = invoke, + .mpd_target = target, + .mpd_queue = MPSC_QUEUE_INITIALIZER(dq->mpd_queue), + .mpd_chain = { MPSC_QUEUE_NOTQUEUED_MARKER }, + }; +} + +/* enqueue, drain & cancelation */ + +static void +_mpsc_daemon_queue_drain(mpsc_daemon_queue_t dq, thread_t self) +{ + mpsc_daemon_invoke_fn_t invoke = dq->mpd_invoke; + mpsc_daemon_queue_kind_t kind = dq->mpd_kind; + mpsc_queue_chain_t head, cur, tail; + mpsc_daemon_queue_state_t st; + + if (kind == MPSC_QUEUE_KIND_THREAD_CRITICAL) { + self->options |= TH_OPT_SYSTEM_CRITICAL; + } + +again: + /* + * Most of the time we're woken up because we're dirty, + * This atomic xor sets DRAINING and clears WAKEUP in a single atomic + * in that case. + * + * However, if we're woken up for cancelation, the state may be reduced to + * the CANCELED bit set only, and then the xor will actually set WAKEUP. + * We need to correct this and clear it back to avoid looping below. + * This is safe to do as no one is allowed to enqueue more work after + * cancelation has happened. + * + * We use `st` as a dependency token to pair with the release fence in + * _mpsc_daemon_queue_enqueue() which gives us the guarantee that the update + * to the tail of the MPSC queue that made it non empty is visible to us. + */ + st = os_atomic_xor(&dq->mpd_state, + MPSC_QUEUE_STATE_DRAINING | MPSC_QUEUE_STATE_WAKEUP, dependency); + assert(st & MPSC_QUEUE_STATE_DRAINING); + if (__improbable(st & MPSC_QUEUE_STATE_WAKEUP)) { + assert(st & MPSC_QUEUE_STATE_CANCELED); + os_atomic_andnot(&dq->mpd_state, MPSC_QUEUE_STATE_WAKEUP, relaxed); + } + + os_atomic_dependency_t dep = os_atomic_make_dependency((uintptr_t)st); + while ((head = mpsc_queue_dequeue_batch(&dq->mpd_queue, &tail, dep))) { + mpsc_queue_batch_foreach_safe(cur, head, tail) { + os_atomic_store(&cur->mpqc_next, + MPSC_QUEUE_NOTQUEUED_MARKER, relaxed); + invoke(cur, dq); + } + } + + if (self) { + assert_wait((event_t)dq, THREAD_UNINT); + } + + /* + * Unlike GCD no fence is necessary here: there is no concept similar + * to "dispatch_sync()" that would require changes this thread made to be + * visible to other threads as part of the mpsc_daemon_queue machinery. + * + * Making updates that happened on the daemon queue visible to other threads + * is the responsibility of the client. + */ + st = os_atomic_andnot(&dq->mpd_state, MPSC_QUEUE_STATE_DRAINING, relaxed); + + /* + * A wakeup has happened while we were draining, + * which means that the queue did an [ empty -> non empty ] + * transition during our drain. + * + * Chances are we already observed and drained everything, + * but we need to be absolutely sure, so start a drain again + * as the enqueuer observed the DRAINING bit and has skipped calling + * _mpsc_daemon_queue_wakeup(). + */ + if (__improbable(st & MPSC_QUEUE_STATE_WAKEUP)) { + if (self) { + clear_wait(self, THREAD_AWAKENED); + } + goto again; + } + + /* dereferencing `dq` past this point is unsafe */ + + if (kind == MPSC_QUEUE_KIND_THREAD_CRITICAL) { + self->options &= ~TH_OPT_SYSTEM_CRITICAL; + } + + if (__improbable(st & MPSC_QUEUE_STATE_CANCELED)) { + thread_wakeup(&dq->mpd_state); + if (self) { + clear_wait(self, THREAD_AWAKENED); + thread_terminate_self(); + __builtin_unreachable(); + } + } +} + +static void +_mpsc_daemon_queue_wakeup(mpsc_daemon_queue_t dq) +{ + switch (dq->mpd_kind) { + case MPSC_QUEUE_KIND_NESTED: + _mpsc_daemon_queue_nested_wakeup(dq); + break; + case MPSC_QUEUE_KIND_THREAD: + case MPSC_QUEUE_KIND_THREAD_CRITICAL: + _mpsc_queue_thread_wakeup(dq); + break; + case MPSC_QUEUE_KIND_THREAD_CALL: + _mpsc_queue_thread_call_wakeup(dq); + break; + default: + panic("mpsc_queue[%p]: invalid kind (%d)", dq, dq->mpd_kind); + } +} + +static void +_mpsc_daemon_queue_enqueue(mpsc_daemon_queue_t dq, mpsc_queue_chain_t elm) +{ + mpsc_daemon_queue_state_t st; + + if (mpsc_queue_append(&dq->mpd_queue, elm)) { + /* + * Pairs with the acquire fence in _mpsc_daemon_queue_drain(). + */ + st = os_atomic_or_orig(&dq->mpd_state, MPSC_QUEUE_STATE_WAKEUP, release); + if (__improbable(st & MPSC_QUEUE_STATE_CANCELED)) { + panic("mpsc_queue[%p]: use after cancelation", dq); + } + + if ((st & (MPSC_QUEUE_STATE_DRAINING | MPSC_QUEUE_STATE_WAKEUP)) == 0) { + _mpsc_daemon_queue_wakeup(dq); + } + } +} + +void +mpsc_daemon_enqueue(mpsc_daemon_queue_t dq, mpsc_queue_chain_t elm, + mpsc_queue_options_t options) +{ + if (options & MPSC_QUEUE_DISABLE_PREEMPTION) { + disable_preemption(); + } + + _mpsc_daemon_queue_enqueue(dq, elm); + + if (options & MPSC_QUEUE_DISABLE_PREEMPTION) { + enable_preemption(); + } +} + +void +mpsc_daemon_queue_cancel_and_wait(mpsc_daemon_queue_t dq) +{ + mpsc_daemon_queue_state_t st; + + assert_wait((event_t)&dq->mpd_state, THREAD_UNINT); + + st = os_atomic_or_orig(&dq->mpd_state, MPSC_QUEUE_STATE_CANCELED, relaxed); + if (__improbable(st & MPSC_QUEUE_STATE_CANCELED)) { + panic("mpsc_queue[%p]: cancelled twice (%x)", dq, st); + } + + if (dq->mpd_kind == MPSC_QUEUE_KIND_NESTED && st == 0) { + clear_wait(current_thread(), THREAD_AWAKENED); + } else { + disable_preemption(); + _mpsc_daemon_queue_wakeup(dq); + enable_preemption(); + thread_block(THREAD_CONTINUE_NULL); + } + + switch (dq->mpd_kind) { + case MPSC_QUEUE_KIND_NESTED: + dq->mpd_target = NULL; + break; + case MPSC_QUEUE_KIND_THREAD: + case MPSC_QUEUE_KIND_THREAD_CRITICAL: + dq->mpd_thread = NULL; + break; + case MPSC_QUEUE_KIND_THREAD_CALL: + thread_call_cancel_wait(dq->mpd_call); + thread_call_free(dq->mpd_call); + dq->mpd_call = NULL; + break; + default: + panic("mpsc_queue[%p]: invalid kind (%d)", dq, dq->mpd_kind); + } + dq->mpd_kind = MPSC_QUEUE_KIND_UNKNOWN; +} + +#pragma mark deferred deallocation daemon + +static struct mpsc_daemon_queue thread_deferred_deallocation_queue; + +void +thread_deallocate_daemon_init(void) +{ + kern_return_t kr; + + kr = _mpsc_daemon_queue_init_with_thread(&thread_deferred_deallocation_queue, + mpsc_daemon_queue_nested_invoke, MINPRI_KERNEL, + "daemon.deferred-deallocation", MPSC_QUEUE_KIND_THREAD_CRITICAL); + if (kr != KERN_SUCCESS) { + panic("thread_deallocate_daemon_init: creating daemon failed (%d)", kr); + } +} + +void +thread_deallocate_daemon_register_queue(mpsc_daemon_queue_t dq, + mpsc_daemon_invoke_fn_t invoke) +{ + mpsc_daemon_queue_init_with_target(dq, invoke, + &thread_deferred_deallocation_queue); +} diff --git a/osfmk/kern/mpsc_queue.h b/osfmk/kern/mpsc_queue.h new file mode 100644 index 000000000..a2a6218ec --- /dev/null +++ b/osfmk/kern/mpsc_queue.h @@ -0,0 +1,671 @@ +/* + * Copyright (c) 2018 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#ifndef _KERN_MPSC_QUEUE_H_ +#define _KERN_MPSC_QUEUE_H_ + +#ifdef XNU_KERNEL_PRIVATE + +#include +#include +#include + +#endif // XNU_KERNEL_PRIVATE + +#include + +__BEGIN_DECLS + +/*! + * @typedef struct mpsc_queue_chain + * + * @brief + * Type for the intrusive linkage used by MPSC queues. + */ +typedef struct mpsc_queue_chain { + struct mpsc_queue_chain *_Atomic mpqc_next; +} *mpsc_queue_chain_t; + +/*! + * @typedef struct mpsc_queue_head + * + * @brief + * The type for a multi-producer single-consumer queue. + * + * @discussion + * MPSC queues allow for producers to not be affected by other producers or the + * consumer. Which means in turn that having producers in interrupt context + * does not require that other producers disable interrupts like a traditional + * spinlock based approach would require. + * + * These queues shine when data is produced from the entire system and is + * consumed from a single serial context (logging, tracing, ...). + * mpsc_daemon_queue_t is provided as a fully ready/easy-to-use pre-packaged + * solution for these common use cases. + * + * - mpsc_queue_append() can be used to append a single item + * - mpsc_queue_append_list() can be used to append a batch of items at once. + * + * Functions for the consumer side assume proper serialization that is not + * provided by the MPSC queue itself. Dequeuing doesn't require preemption + * to be disabled. + * + *

Algorithm

+ * + * The base of the enqueue algorithm is a single atomic exchange (first half, + * called __mpsc_queue_append_update_tail) and a list fixup (2nd half, called + * __mpsc_queue_append_update_prev). + * + * Graphically, enqueuing `X` looks like this, with each step being done + * atomically (for the empty queue case, `tail` points to `head`): + * + * | orig state | update_tail | update_prev | + * +---------------------+---------------------+---------------------+ + * | | | | + * | head -> e1 -> e2 -. | head -> e1 -> e2 -. | head -> e1 -> e2 -. | + * | | | | | | | + * | ,- ... <--' | ,- ... <--' | ,- ... <--' | + * | | | | | | | + * | v | v | v | + * | tail -> eN -> NULL | tail eN -> NULL | tail eN | + * | | | | | | | + * | | | | | v | + * | X -> NULL | `---> X -> NULL | '---> X -> NULL | + * | | | | + * +---------------------+---------------------+---------------------+ + * + * + * There is a small 1-instruction gap of inconsistency which makes the chosen + * algorithm non linearizable, and requires enqueuers to disable preemption + * during the enqueue so as not to starve the consumer forever. + * + * As far as memory visibility is concerned, enqueuing uses a release fence in + * update_tail which pairs with memory fences in mpsc_queue_dequeue_batch(). + * + * Note: as far as the data structure in memory, its layout is equivalent to + * a BSD STAILQ. However because of this inconsistency + * window and memory ordering concerns, it is incorrect to use STAILQ + * macros on an MPSC queue. + */ +typedef struct mpsc_queue_head { + struct mpsc_queue_chain mpqh_head; + struct mpsc_queue_chain *_Atomic mpqh_tail; +} *mpsc_queue_head_t; + +/*! + * @macro MPSC_QUEUE_INITIALIZER + * + * @brief + * Macro to use in static initializers for mpsc queues. + * + * @param head + * The name of the variable to initialize. + */ +#define MPSC_QUEUE_INITIALIZER(head) { .mpqh_tail = &(head).mpqh_head } + +#ifdef XNU_KERNEL_PRIVATE + +/*! + * @function mpsc_queue_init + * + * @brief + * Dynamically initialize an mpsc queue. + * + * @discussion + * This initialization assumes that the object holding the queue head + * is initialized before it can be made visible to other threads/cores. + * + * @param q + * The queue to initialize. + */ +static inline void +mpsc_queue_init(mpsc_queue_head_t q) +{ + os_atomic_init(&q->mpqh_head.mpqc_next, NULL); + os_atomic_init(&q->mpqh_tail, &q->mpqh_head); +} + +/*! + * @typedef enum mpsc_queue_options + */ +typedef enum mpsc_queue_options { + MPSC_QUEUE_NONE = 0, + MPSC_QUEUE_DISABLE_PREEMPTION = 1 << 0, +} mpsc_queue_options_t; + +/*! + * @const MPSC_QUEUE_NOTQUEUED_MARKER + * + * @brief + * Magical marker that implementations can use to poison the chain pointer of + * elements not on any MPSC queue. + */ +#define MPSC_QUEUE_NOTQUEUED_MARKER ((mpsc_queue_chain_t)~0ul) + +/*! + * @macro mpsc_queue_element + * + * @brief + * Macro to find the pointer of an element back from its MPSC chain linkage. + */ +#define mpsc_queue_element(ptr, type, field) __container_of(ptr, type, field) + + +#pragma mark Advanced Multi Producer calls + +/** + * @function __mpsc_queue_append_update_tail + * + * @brief + * First half of the enqueue operation onto a multi-producer single-consumer + * queue. + * + * @discussion + * This function is available for algorithms that need to do things (such as + * taking a refcount) before calling __mpsc_queue_append_update_prev(). + * + * Preemption should be disabled before calling + * __mpsc_queue_append_update_tail(), and until + * __mpsc_queue_append_update_prev() has returned. + * + * @param q + * The queue to update. + * + * @param elm + * The element to append to `q`. + * + * @returns + * A token to later pass to __mpsc_queue_append_update_prev() + * to complete the enqueue. + */ +static inline mpsc_queue_chain_t +__mpsc_queue_append_update_tail(mpsc_queue_head_t q, mpsc_queue_chain_t elm) +{ + os_atomic_store(&elm->mpqc_next, NULL, relaxed); + return os_atomic_xchg(&q->mpqh_tail, elm, release); +} + +/** + * @function __mpsc_queue_append_was_empty + * + * @brief + * Tests whether the queue was empty at the time + * __mpsc_queue_append_update_tail() was called. + * + * @param q + * The queue to test emptiness for. + * + * @param prev + * The token returned by __mpsc_queue_append_update_tail(). + * + * @returns + * Whether the queue was empty (true) or not (false). + */ +static inline bool +__mpsc_queue_append_was_empty(mpsc_queue_head_t q, mpsc_queue_chain_t prev) +{ + return &q->mpqh_head == prev; +} + +/** + * @function __mpsc_queue_append_update_prev + * + * @brief + * Second half of the enqueue operation onto a multi-producer single-consumer + * queue. + * + * @discussion + * This function is available for algorithms that need to do things (such as + * taking a refcount) before calling __mpsc_queue_append_update_prev(). + * + * Preemption should be disabled before calling + * __mpsc_queue_append_update_tail(), and until + * __mpsc_queue_append_update_prev() has returned. + * + * @param prev + * The token returned by __mpsc_queue_append_update_tail(). + * + * @param elm + * The element to append to the queue. + */ +static inline void +__mpsc_queue_append_update_prev(mpsc_queue_chain_t prev, mpsc_queue_chain_t elm) +{ + os_atomic_store(&prev->mpqc_next, elm, relaxed); +} + + +#pragma mark Multi Producer calls + +/** + * @function mpsc_queue_append_list + * + * @brief + * Enqueues a list of elements onto a queue. + * + * @discussion + * This enqueues a list that has to be fully formed from `first` to `last` + * at the end of `q`. + * + * Preemption should be disabled when calling mpsc_queue_append_list(). + * + * @param q + * The queue to update. + * + * @param first + * The first of the list elements being appended. + * + * @param last + * The last of the list elements being appended. + */ +static inline bool +mpsc_queue_append_list(mpsc_queue_head_t q, mpsc_queue_chain_t first, + mpsc_queue_chain_t last) +{ + mpsc_queue_chain_t prev = __mpsc_queue_append_update_tail(q, last); + __mpsc_queue_append_update_prev(prev, first); + return __mpsc_queue_append_was_empty(q, prev); +} + +/** + * @function __mpsc_queue_append_update_tail + * + * @brief + * Enqueues an element onto a queue. + * + * @discussion + * Preemption should be disabled when calling mpsc_queue_append(). + * + * @param q the queue to update + * @param elm the element to append + */ +static inline bool +mpsc_queue_append(mpsc_queue_head_t q, mpsc_queue_chain_t elm) +{ + return mpsc_queue_append_list(q, elm, elm); +} + + +#pragma mark Single Consumer calls + +/** + * @function mpsc_queue_dequeue_batch() + * + * @brief + * Atomically empty a queue at once and return the batch head and tail. + * + * @discussion + * Consumer function, must be called in a serialized way with respect to any + * other consumer function. + * + * @param q + * The queue + * + * @param tail + * An out pointer filled with the last element captured. + * + * @param dependency + * A dependency token (to rely on consume / hardware dependencies) + * When not trying to take advantage of hardware dependencies, just pass NULL. + * + * @returns + * The first element of the batch if any, or NULL the queue was empty. + */ +mpsc_queue_chain_t +mpsc_queue_dequeue_batch(mpsc_queue_head_t q, mpsc_queue_chain_t *tail, + os_atomic_dependency_t dependency); + +/** + * @function mpsc_queue_batch_next() + * + * @brief + * Function used to consume an element from a batch dequeued with + * mpsc_queue_dequeue_batch(). + * + * @discussion + * Once a batch has been dequeued, there is no need to hold the consumer lock + * anymore to consume it. + * + * mpsc_queue_batch_foreach_safe() is the preferred interface to consume + * the whole batch. + * + * @param cur + * The current inspected element of the batch (must be the batch head or + * a value returned by mpsc_queue_batch_next()). + * + * @param tail + * The last element of the batch. + * + * @returns + * The next element if any. + */ +mpsc_queue_chain_t +mpsc_queue_batch_next(mpsc_queue_chain_t cur, mpsc_queue_chain_t tail); + +/** + * @macro mpsc_queue_batch_foreach_safe + * + * @brief + * Macro used to enumerate a batch dequeued with mpsc_queue_dequeue_batch(). + * + * @param item + * The item being currently visited. + * + * @param head + * The first element of the batch. + * + * @param tail + * The last element of the batch. + */ +#define mpsc_queue_batch_foreach_safe(item, head, tail) \ + for (mpsc_queue_chain_t __tmp, __item = (head), __tail = (tail); \ + __tmp = mpsc_queue_batch_next(__item, __tail), (item) = __item; \ + __item = __tmp) + +/** + * @function mpsc_queue_restore_batch() + * + * @brief + * "Restore"s a batch at the head of the queue. + * + * @discussion + * Consumer function, must be called in a serialized way with respect to any + * other consumer function. + * + * @param q + * The queue + * + * @param first + * The first element to put back. + * + * @param last + * The last element to put back. + * It is the responsibility of the caller to ensure the linkages from first to + * last are properly set up before calling this function. + */ +void +mpsc_queue_restore_batch(mpsc_queue_head_t q, mpsc_queue_chain_t first, + mpsc_queue_chain_t last); + + +#pragma mark "GCD"-like facilities + +/*! + * @typedef struct mpsc_daemon_queue + * + * @brief + * Daemon queues are a ready-to use packaging of the low level MPSC queue + * primitive. + * + * @discussion + * mpsc_queue_t requires handling of state transitions of the queue and + * dequeuing yourself, which is a non trivial task. + * + * Daemon queues are a simple packaged solution that allows for mpsc_queue_t to + * form hierarchies (mostly for layering purposes), and be serviced at the + * bottom of such a hierarchy by a thread or a thread call. + * + * Daemon queues assume homogenous items, and are setup with an `invoke` + * callback that is called in the dequeuer on every item as they are dequeued. + */ +typedef struct mpsc_daemon_queue *mpsc_daemon_queue_t; + +/*! + * @typedef struct mpsc_daemon_queue + * + * @brief + * The type for MPSC Daemon Queues invoke callbacks. + */ +typedef void (*mpsc_daemon_invoke_fn_t)(mpsc_queue_chain_t elm, + mpsc_daemon_queue_t dq); + +/*! + * @enum mpsc_daemon_queue_kind + * + * @brief + * Internal type, not to be used by clients. + */ +typedef enum mpsc_daemon_queue_kind { + MPSC_QUEUE_KIND_UNKNOWN, + MPSC_QUEUE_KIND_NESTED, + MPSC_QUEUE_KIND_THREAD, + MPSC_QUEUE_KIND_THREAD_CRITICAL, + MPSC_QUEUE_KIND_THREAD_CALL, +} mpsc_daemon_queue_kind_t; + +/*! + * @enum mpsc_daemon_queue_state + * + * @brief + * Internal type, not to be used by clients. + */ +typedef enum mpsc_daemon_queue_state { + MPSC_QUEUE_STATE_DRAINING = 0x0001, + MPSC_QUEUE_STATE_WAKEUP = 0x0002, + MPSC_QUEUE_STATE_CANCELED = 0x0004, +} mpsc_daemon_queue_state_t; + +struct mpsc_daemon_queue { + mpsc_daemon_queue_kind_t mpd_kind; + mpsc_daemon_queue_state_t _Atomic mpd_state; + mpsc_daemon_invoke_fn_t mpd_invoke; + union { + mpsc_daemon_queue_t mpd_target; + struct thread *mpd_thread; + struct thread_call *mpd_call; + }; + struct mpsc_queue_head mpd_queue; + struct mpsc_queue_chain mpd_chain; +}; + +/*! + * @function mpsc_daemon_queue_init_with_thread + * + * @brief + * Sets up a daemon queue to be a base queue drained by a kernel thread. + * + * @discussion + * The function will allocate the thread and start it in assert_wait. + * + * @param dq + * The queue to initialize + * + * @param invoke + * The invoke function called on individual items on the queue during drain. + * + * @param pri + * The scheduler priority for the created thread. + * + * @param name + * The name to give to the created thread. + * + * @returns + * Whether creating the thread was successful. + */ +kern_return_t +mpsc_daemon_queue_init_with_thread(mpsc_daemon_queue_t dq, + mpsc_daemon_invoke_fn_t invoke, int pri, const char *name); + + +/*! + * @function mpsc_daemon_queue_init_with_thread_call + * + * @brief + * Sets up a daemon queue to be a base queue drained by a thread call. + * + * @param dq + * The queue to initialize + * + * @param invoke + * The invoke function called on individual items on the queue during drain. + * + * @param pri + * The priority the thread call will run at. + */ +void +mpsc_daemon_queue_init_with_thread_call(mpsc_daemon_queue_t dq, + mpsc_daemon_invoke_fn_t invoke, thread_call_priority_t pri); + +/*! + * @function mpsc_daemon_queue_init_with_target + * + * @brief + * Sets up a daemon queue to target another daemon queue. + * + * @discussion + * The targetting relationship is useful for subsystem layering purposes only. + * Because draining a given queue is atomic with respect to its target, target + * queue hierarchies are prone to starvation. + * + * @param dq + * The queue to initialize + * + * @param invoke + * The invoke function called on individual items on the queue during drain. + * + * @param target + * The target queue of the initialized queue, which has to be initialized with + * the mpsc_daemon_queue_nested_invoke invoke handler. + */ +void +mpsc_daemon_queue_init_with_target(mpsc_daemon_queue_t dq, + mpsc_daemon_invoke_fn_t invoke, mpsc_daemon_queue_t target); + +/*! + * @function mpsc_daemon_queue_nested_invoke + * + * @brief + * The invoke function to pass to mpsc_daemon_queue_init_* when a queue is meant + * to be targeted by other queues. + */ +void +mpsc_daemon_queue_nested_invoke(mpsc_queue_chain_t elm, + mpsc_daemon_queue_t dq); + +/*! + * @function mpsc_daemon_queue_cancel_and_wait + * + * @brief + * Cancels the queue so that the object owning it can be destroyed. + * + * @discussion + * This interface will cancel the queue and wait synchronously for the + * cancelation to have taken effect, possibly waiting on elements currently + * draining. + * + * Sending objects to the daemon queue after cancelation is undefined. + * + * Calling this function multiple times is undefined. + * + * Tearing down daemon queue hierarchies is the responsibility of the adopter. + */ +void +mpsc_daemon_queue_cancel_and_wait(mpsc_daemon_queue_t dq); + +/*! + * @function mpsc_daemon_enqueue + * + * @brief + * Send ("async") an item to a given daemon on a given queue. + * + * @discussion + * It is the responsibility of the caller to ensure preemption is disabled when + * this call is made. + * + * @param dq + * The daemon queue to enqueue the element onto. + * + * @param elm + * The item to enqueue. + * + * @param options + * Options applicable to the enqueue. In particupar passing + * MPSC_QUEUE_DISABLE_PREEMPTION makes sure preemption is properly disabled + * during the enqueue. + */ +void +mpsc_daemon_enqueue(mpsc_daemon_queue_t dq, mpsc_queue_chain_t elm, + mpsc_queue_options_t options); + + +#pragma mark Deferred deallocation daemon + +/*! + * @function thread_deallocate_daemon_init + * + * @brief + * Initializes the deferred deallocation daemon, called by thread_daemon_init(). + * + * @discussion + * The deferred deallocation daemon is a kernel thread based daemon queue that + * is targeted by nested daemon queues. + * + * It is used to perform deferred deallocation for objects that can't safely be + * deallocated from the context where the deallocation should normally occur. + * + * Subsystems using it are for example: turnstiles, workqueues, threads. + * + * @warning + * New queues should be added to this daemon with great care, + * as abusing it can lead to unbounded amount of kernel work. + */ +void +thread_deallocate_daemon_init(void); + +/*! + * @function thread_deallocate_daemon_register_queue + * + * @brief + * Dynamically register a queue for deferred deletion with the deferred + * deallocation daemon. + * + * @param dq + * The daemon queue to register with the deferred deallocation daemon. + * + * @param invoke + * The callback called on every element of this queue by the deallocation + * daemon. + */ +void +thread_deallocate_daemon_register_queue(mpsc_daemon_queue_t dq, + mpsc_daemon_invoke_fn_t invoke); + + +#pragma mark tests +#if DEBUG || DEVELOPMENT + +int +mpsc_test_pingpong(uint64_t count, uint64_t *out); + +#endif /* DEBUG || DEVELOPMENT */ + +#endif /* XNU_KERNEL_PRIVATE */ + +__END_DECLS + +#endif /* _KERN_MPSC_QUEUE_H_ */ diff --git a/osfmk/kern/policy_internal.h b/osfmk/kern/policy_internal.h index 0a2e47e35..094113569 100644 --- a/osfmk/kern/policy_internal.h +++ b/osfmk/kern/policy_internal.h @@ -41,7 +41,8 @@ #include #include #include - +#include +#include /* ****************************** * XNU-internal functionality @@ -74,42 +75,41 @@ extern kern_return_t task_importance(task_t task, integer_t importance); /* flavors (also DBG_IMPORTANCE subclasses 0x20 - 0x3F) */ /* internal or external, thread or task */ -#define TASK_POLICY_DARWIN_BG 0x21 -#define TASK_POLICY_IOPOL 0x22 -#define TASK_POLICY_IO 0x23 -#define TASK_POLICY_PASSIVE_IO 0x24 +#define TASK_POLICY_DARWIN_BG IMP_TASK_POLICY_DARWIN_BG +#define TASK_POLICY_IOPOL IMP_TASK_POLICY_IOPOL +#define TASK_POLICY_IO IMP_TASK_POLICY_IO +#define TASK_POLICY_PASSIVE_IO IMP_TASK_POLICY_PASSIVE_IO /* internal, task only */ -#define TASK_POLICY_DARWIN_BG_IOPOL 0x27 +#define TASK_POLICY_DARWIN_BG_IOPOL IMP_TASK_POLICY_DARWIN_BG_IOPOL /* task-only attributes */ -#define TASK_POLICY_TAL 0x28 -#define TASK_POLICY_BOOST 0x29 -#define TASK_POLICY_ROLE 0x2A +#define TASK_POLICY_TAL IMP_TASK_POLICY_TAL +#define TASK_POLICY_BOOST IMP_TASK_POLICY_BOOST +#define TASK_POLICY_ROLE IMP_TASK_POLICY_ROLE /* unused 0x2B */ -#define TASK_POLICY_TERMINATED 0x2C -#define TASK_POLICY_NEW_SOCKETS_BG 0x2D -#define TASK_POLICY_SUP_ACTIVE 0x2E -#define TASK_POLICY_LATENCY_QOS 0x2F -#define TASK_POLICY_THROUGH_QOS 0x30 -#define TASK_POLICY_WATCHERS_BG 0x31 - -#define TASK_POLICY_SFI_MANAGED 0x34 -#define TASK_POLICY_ALL_SOCKETS_BG 0x37 - -#define TASK_POLICY_BASE_LATENCY_AND_THROUGHPUT_QOS 0x39 /* latency as value1, throughput as value2 */ -#define TASK_POLICY_OVERRIDE_LATENCY_AND_THROUGHPUT_QOS 0x3A /* latency as value1, throughput as value2 */ +#define TASK_POLICY_TERMINATED IMP_TASK_POLICY_TERMINATED +#define TASK_POLICY_NEW_SOCKETS_BG IMP_TASK_POLICY_NEW_SOCKETS_BG +#define TASK_POLICY_SUP_ACTIVE IMP_TASK_POLICY_SUP_ACTIVE +#define TASK_POLICY_LATENCY_QOS IMP_TASK_POLICY_LATENCY_QOS +#define TASK_POLICY_THROUGH_QOS IMP_TASK_POLICY_THROUGH_QOS +#define TASK_POLICY_WATCHERS_BG IMP_TASK_POLICY_WATCHERS_BG +#define TASK_POLICY_SFI_MANAGED IMP_TASK_POLICY_SFI_MANAGED +#define TASK_POLICY_ALL_SOCKETS_BG IMP_TASK_POLICY_ALL_SOCKETS_BG + +#define TASK_POLICY_BASE_LATENCY_AND_THROUGHPUT_QOS IMP_TASK_POLICY_BASE_LATENCY_AND_THROUGHPUT_QOS /* latency as value1, throughput as value2 */ +#define TASK_POLICY_OVERRIDE_LATENCY_AND_THROUGHPUT_QOS IMP_TASK_POLICY_OVERRIDE_LATENCY_AND_THROUGHPUT_QOS /* latency as value1, throughput as value2 */ /* thread-only attributes */ -#define TASK_POLICY_PIDBIND_BG 0x32 +#define TASK_POLICY_PIDBIND_BG IMP_TASK_POLICY_PIDBIND_BG /* unused 0x33 */ -#define TASK_POLICY_QOS 0x35 -#define TASK_POLICY_QOS_OVERRIDE 0x36 -#define TASK_POLICY_QOS_AND_RELPRIO 0x38 /* QoS as value1, relative priority as value2 */ -#define TASK_POLICY_QOS_WORKQ_OVERRIDE 0x3B -#define TASK_POLICY_QOS_PROMOTE 0x3C -#define TASK_POLICY_QOS_IPC_OVERRIDE 0x3D -// was TASK_POLICY_QOS_SYNC_IPC_OVERRIDE 0x3E +#define TASK_POLICY_QOS 0x35 /* Used only as a convenience for getter */ +#define TASK_POLICY_QOS_OVERRIDE IMP_TASK_POLICY_QOS_OVERRIDE +#define TASK_POLICY_QOS_AND_RELPRIO IMP_TASK_POLICY_QOS_AND_RELPRIO /* QoS as value1, relative priority as value2 */ +#define TASK_POLICY_QOS_WORKQ_OVERRIDE IMP_TASK_POLICY_QOS_WORKQ_OVERRIDE +#define TASK_POLICY_QOS_PROMOTE IMP_TASK_POLICY_QOS_PROMOTE +#define TASK_POLICY_QOS_KEVENT_OVERRIDE IMP_TASK_POLICY_QOS_KEVENT_OVERRIDE +#define TASK_POLICY_QOS_SERVICER_OVERRIDE IMP_TASK_POLICY_QOS_SERVICER_OVERRIDE #define TASK_POLICY_MAX 0x3F @@ -133,8 +133,8 @@ extern int proc_task_role_to_darwin_role(int task_role); /* Functions used by kern_exec.c */ extern void task_set_main_thread_qos(task_t task, thread_t main_thread); -extern void proc_set_task_spawnpolicy(task_t task, int apptype, int qos_clamp, int role, - ipc_port_t * portwatch_ports, int portwatch_count); +extern void proc_set_task_spawnpolicy(task_t task, thread_t thread, int apptype, int qos_clamp, int role, + ipc_port_t * portwatch_ports, uint32_t portwatch_count); extern void proc_inherit_task_role(task_t new_task, task_t old_task); /* IO Throttle tiers */ @@ -167,6 +167,9 @@ extern int task_get_apptype(task_t); extern void proc_apply_task_networkbg(void * bsd_info, thread_t thread); #endif /* MACH_BSD */ +extern void thread_freeze_base_pri(thread_t thread); +extern bool thread_unfreeze_base_pri(thread_t thread); + /* Functions used by pthread_shims.c */ extern int proc_thread_qos_add_override(task_t task, thread_t thread, uint64_t tid, int override_qos, boolean_t first_override_for_resource, @@ -245,15 +248,21 @@ extern kern_return_t thread_policy_set_internal(thread_t thread, thread_policy_f thread_policy_t policy_info, mach_msg_type_number_t count); extern boolean_t thread_recompute_user_promotion_locked(thread_t thread); +extern boolean_t thread_recompute_kernel_promotion_locked(thread_t thread); extern thread_qos_t thread_user_promotion_qos_for_pri(int priority); extern void thread_set_exec_promotion(thread_t thread); extern void thread_clear_exec_promotion(thread_t thread); -/* for IPC override management */ -extern void thread_add_ipc_override(thread_t thread, uint32_t qos_override); -extern void thread_update_ipc_override(thread_t thread, uint32_t qos_override); -extern void thread_drop_ipc_override(thread_t thread); +/* for servicer override management (workloops only) */ +extern void thread_add_servicer_override(thread_t thread, uint32_t qos_override); +extern void thread_update_servicer_override(thread_t thread, uint32_t qos_override); +extern void thread_drop_servicer_override(thread_t thread); + +/* for generic kevent override management */ +extern void thread_add_kevent_override(thread_t thread, uint32_t qos_override); +extern void thread_update_kevent_override(thread_t thread, uint32_t qos_override); +extern void thread_drop_kevent_override(thread_t thread); /* for ipc_pset.c */ extern thread_qos_t thread_get_requested_qos(thread_t thread, int *relpri); @@ -280,7 +289,8 @@ typedef struct task_pend_token { tpt_update_throttle :1, tpt_update_thread_sfi :1, tpt_force_recompute_pri :1, - tpt_update_tg_ui_flag :1; + tpt_update_tg_ui_flag :1, + tpt_update_turnstile :1; } *task_pend_token_t; extern void task_policy_update_complete_unlocked(task_t task, task_pend_token_t pend_token); diff --git a/osfmk/kern/printf.c b/osfmk/kern/printf.c index 9fb14d262..0feea0aeb 100644 --- a/osfmk/kern/printf.c +++ b/osfmk/kern/printf.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2006 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2019 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -179,6 +179,10 @@ #include #endif +#ifdef HAS_APPLE_PAC +#include +#include +#endif /* HAS_APPLE_PAC */ #define isdigit(d) ((d) >= '0' && (d) <= '9') #define Ctod(c) ((c) - '0') @@ -256,6 +260,11 @@ __doprnt( char c; int capitals; int long_long; + enum { + INT, + SHORT, + CHAR, + } numeric_type = INT; int nprinted = 0; while ((c = *fmt) != '\0') { @@ -269,6 +278,7 @@ __doprnt( fmt++; long_long = 0; + numeric_type = INT; length = 0; prec = -1; ladjust = FALSE; @@ -337,6 +347,13 @@ __doprnt( long_long = 1; c = *++fmt; } + } else if (c == 'h') { + c = *++fmt; + numeric_type = SHORT; + if (c == 'h') { + numeric_type = CHAR; + c = *++fmt; + } } else if (c == 'q' || c == 'L') { long_long = 1; c = *++fmt; @@ -521,6 +538,7 @@ __doprnt( case 'u': truncate = _doprnt_truncates; + /* FALLTHROUGH */ case 'U': base = 10; goto print_unsigned; @@ -530,6 +548,7 @@ __doprnt( if (sizeof(int) < sizeof(void *)) { long_long = 1; } + /* FALLTHROUGH */ case 'x': truncate = _doprnt_truncates; base = 16; @@ -542,12 +561,14 @@ __doprnt( case 'r': truncate = _doprnt_truncates; + /* FALLTHROUGH */ case 'R': base = radix; goto print_signed; case 'n': truncate = _doprnt_truncates; + /* FALLTHROUGH */ case 'N': base = radix; goto print_unsigned; @@ -558,6 +579,16 @@ print_signed: } else { n = va_arg(argp, int); } + switch (numeric_type) { + case SHORT: + n = (short)n; + break; + case CHAR: + n = (char)n; + break; + default: + break; + } if (n >= 0) { u = n; sign_char = plus_sign; @@ -573,6 +604,16 @@ print_unsigned: } else { u = va_arg(argp, unsigned int); } + switch (numeric_type) { + case SHORT: + u = (unsigned short)u; + break; + case CHAR: + u = (unsigned char)u; + break; + default: + break; + } goto print_num; print_num: @@ -591,6 +632,13 @@ print_num: const char* strp = str; int strl = sizeof(str) - 1; +#ifdef HAS_APPLE_PAC + /** + * Strip out the pointer authentication code before + * checking whether the pointer is a kernel address. + */ + u = (unsigned long long)VM_KERNEL_STRIP_PTR(u); +#endif /* HAS_APPLE_PAC */ if (u >= VM_MIN_KERNEL_AND_KEXT_ADDRESS && u <= VM_MAX_KERNEL_ADDRESS) { while (*strp != '\0') { @@ -681,7 +729,19 @@ dummy_putc(int ch, void *arg) { void (*real_putc)(char) = arg; - real_putc(ch); + /* + * Attempts to panic (or otherwise log to console) during early boot + * can result in _doprnt() and _doprnt_log() being called from + * _kprintf() before PE_init_kprintf() has been called. This causes + * the "putc" param to _doprnt() and _doprnt_log() to be passed as + * NULL. That NULL makes its way here, and we would try jump to it. + * Given that this is a poor idea, and this happens at very early + * boot, there is not a way to report this easily (we are likely + * already panicing), so we'll just do nothing instead of crashing. + */ + if (real_putc) { + real_putc(ch); + } } void @@ -710,11 +770,11 @@ _doprnt_log( boolean_t new_printf_cpu_number = FALSE; #endif /* MP_PRINTF */ -decl_simple_lock_data(, printf_lock) -decl_simple_lock_data(, bsd_log_spinlock) +decl_simple_lock_data(, printf_lock); +decl_simple_lock_data(, bsd_log_spinlock); lck_grp_t oslog_stream_lock_grp; -decl_lck_spin_data(, oslog_stream_lock) +decl_lck_spin_data(, oslog_stream_lock); void oslog_lock_init(void); extern void bsd_log_init(void); diff --git a/osfmk/kern/priority.c b/osfmk/kern/priority.c index 26c60c043..5ac1ce756 100644 --- a/osfmk/kern/priority.c +++ b/osfmk/kern/priority.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2010 Apple Inc. All rights reserved. + * Copyright (c) 2000-2019 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -76,6 +76,8 @@ #include #include #include +#include +#include #ifdef CONFIG_MACH_APPROXIMATE_TIME #include /* for commpage_update_mach_approximate_time */ @@ -85,8 +87,6 @@ #include #endif /* MONOTONIC */ -static void sched_update_thread_bucket(thread_t thread); - /* * thread_quantum_expire: * @@ -156,6 +156,7 @@ thread_quantum_expire( */ if ((thread->sched_mode == TH_MODE_REALTIME || thread->sched_mode == TH_MODE_FIXED) && !(thread->sched_flags & TH_SFLAG_PROMOTED) && + !(thread->kern_promotion_schedpri != 0) && !(thread->sched_flags & TH_SFLAG_PROMOTE_REASON_MASK) && !(thread->options & TH_OPT_SYSTEM_CRITICAL)) { uint64_t new_computation; @@ -278,6 +279,10 @@ sched_set_thread_base_priority(thread_t thread, int priority) } int old_base_pri = thread->base_pri; + thread->req_base_pri = priority; + if (thread->sched_flags & TH_SFLAG_BASE_PRI_FROZEN) { + priority = MAX(priority, old_base_pri); + } thread->base_pri = priority; if ((thread->state & TH_RUN) == TH_RUN) { @@ -301,11 +306,49 @@ sched_set_thread_base_priority(thread_t thread, int priority) machine_switch_perfcontrol_state_update(PERFCONTROL_ATTR_UPDATE, ctime, PERFCONTROL_CALLOUT_WAKE_UNSAFE, thread); } - sched_update_thread_bucket(thread); +#if !CONFIG_SCHED_CLUTCH + /* For the clutch scheduler, this operation is done in set_sched_pri() */ + SCHED(update_thread_bucket)(thread); +#endif /* !CONFIG_SCHED_CLUTCH */ thread_recompute_sched_pri(thread, SETPRI_DEFAULT); } +/* + * sched_set_kernel_thread_priority: + * + * Set the absolute base priority of the thread + * and reset its scheduled priority. + * + * Called with the thread unlocked. + */ +void +sched_set_kernel_thread_priority(thread_t thread, int new_priority) +{ + spl_t s = splsched(); + + thread_lock(thread); + + assert(thread->sched_mode != TH_MODE_REALTIME); + assert(thread->effective_policy.thep_qos == THREAD_QOS_UNSPECIFIED); + + if (new_priority > thread->max_priority) { + new_priority = thread->max_priority; + } +#if CONFIG_EMBEDDED + if (new_priority < MAXPRI_THROTTLE) { + new_priority = MAXPRI_THROTTLE; + } +#endif /* CONFIG_EMBEDDED */ + + thread->importance = new_priority - thread->task_priority; + + sched_set_thread_base_priority(thread, new_priority); + + thread_unlock(thread); + splx(s); +} + /* * thread_recompute_sched_pri: * @@ -342,6 +385,14 @@ thread_recompute_sched_pri(thread_t thread, set_sched_pri_options_t options) priority = DEPRESSPRI; } + if (thread->kern_promotion_schedpri > 0) { + priority = MAX(priority, thread->kern_promotion_schedpri); + + if (sched_mode != TH_MODE_REALTIME) { + priority = MIN(priority, MAXPRI_PROMOTE); + } + } + if (sched_flags & TH_SFLAG_PROMOTED) { priority = MAX(priority, thread->promotion_priority); @@ -412,6 +463,15 @@ lightweight_update_priority(thread_t thread) thread->cpu_delta += delta; +#if CONFIG_SCHED_CLUTCH + /* + * Update the CPU usage for the thread group to which the thread belongs. + * The implementation assumes that the thread ran for the entire delta + * as part of the same thread group. + */ + sched_clutch_cpu_usage_update(thread, delta); +#endif /* CONFIG_SCHED_CLUTCH */ + priority = sched_compute_timeshare_priority(thread); if (priority != thread->sched_pri) { @@ -427,17 +487,40 @@ lightweight_update_priority(thread_t thread) * is usage = (usage >> shift1) +/- (usage >> abs(shift2)) where the * +/- is determined by the sign of shift 2. */ -struct shift_data { - int shift1; - int shift2; -}; -#define SCHED_DECAY_TICKS 32 -static struct shift_data sched_decay_shifts[SCHED_DECAY_TICKS] = { - {1, 1}, {1, 3}, {1, -3}, {2, -7}, {3, 5}, {3, -5}, {4, -8}, {5, 7}, - {5, -7}, {6, -10}, {7, 10}, {7, -9}, {8, -11}, {9, 12}, {9, -11}, {10, -13}, - {11, 14}, {11, -13}, {12, -15}, {13, 17}, {13, -15}, {14, -17}, {15, 19}, {16, 18}, - {16, -19}, {17, 22}, {18, 20}, {18, -20}, {19, 26}, {20, 22}, {20, -22}, {21, -27} +const struct shift_data sched_decay_shifts[SCHED_DECAY_TICKS] = { + { .shift1 = 1, .shift2 = 1 }, + { .shift1 = 1, .shift2 = 3 }, + { .shift1 = 1, .shift2 = -3 }, + { .shift1 = 2, .shift2 = -7 }, + { .shift1 = 3, .shift2 = 5 }, + { .shift1 = 3, .shift2 = -5 }, + { .shift1 = 4, .shift2 = -8 }, + { .shift1 = 5, .shift2 = 7 }, + { .shift1 = 5, .shift2 = -7 }, + { .shift1 = 6, .shift2 = -10 }, + { .shift1 = 7, .shift2 = 10 }, + { .shift1 = 7, .shift2 = -9 }, + { .shift1 = 8, .shift2 = -11 }, + { .shift1 = 9, .shift2 = 12 }, + { .shift1 = 9, .shift2 = -11 }, + { .shift1 = 10, .shift2 = -13 }, + { .shift1 = 11, .shift2 = 14 }, + { .shift1 = 11, .shift2 = -13 }, + { .shift1 = 12, .shift2 = -15 }, + { .shift1 = 13, .shift2 = 17 }, + { .shift1 = 13, .shift2 = -15 }, + { .shift1 = 14, .shift2 = -17 }, + { .shift1 = 15, .shift2 = 19 }, + { .shift1 = 16, .shift2 = 18 }, + { .shift1 = 16, .shift2 = -19 }, + { .shift1 = 17, .shift2 = 22 }, + { .shift1 = 18, .shift2 = 20 }, + { .shift1 = 18, .shift2 = -20 }, + { .shift1 = 19, .shift2 = 26 }, + { .shift1 = 20, .shift2 = 22 }, + { .shift1 = 20, .shift2 = -22 }, + { .shift1 = 21, .shift2 = -27 } }; /* @@ -447,7 +530,9 @@ static struct shift_data sched_decay_shifts[SCHED_DECAY_TICKS] = { */ extern int sched_pri_decay_band_limit; -#ifdef CONFIG_EMBEDDED + +/* Only use the decay floor logic on embedded non-clutch schedulers */ +#if CONFIG_EMBEDDED && !CONFIG_SCHED_CLUTCH int sched_compute_timeshare_priority(thread_t thread) @@ -479,7 +564,7 @@ sched_compute_timeshare_priority(thread_t thread) return priority; } -#else /* CONFIG_EMBEDDED */ +#else /* CONFIG_EMBEDDED && !CONFIG_SCHED_CLUTCH */ int sched_compute_timeshare_priority(thread_t thread) @@ -496,7 +581,7 @@ sched_compute_timeshare_priority(thread_t thread) return priority; } -#endif /* CONFIG_EMBEDDED */ +#endif /* CONFIG_EMBEDDED && !CONFIG_SCHED_CLUTCH */ /* * can_update_priority @@ -556,7 +641,16 @@ update_priority( thread->cpu_usage += delta + thread->cpu_delta; thread->cpu_delta = 0; - struct shift_data *shiftp = &sched_decay_shifts[ticks]; +#if CONFIG_SCHED_CLUTCH + /* + * Update the CPU usage for the thread group to which the thread belongs. + * The implementation assumes that the thread ran for the entire delta + * as part of the same thread group. + */ + sched_clutch_cpu_usage_update(thread, delta); +#endif /* CONFIG_SCHED_CLUTCH */ + + const struct shift_data *shiftp = &sched_decay_shifts[ticks]; if (shiftp->shift2 > 0) { thread->cpu_usage = (thread->cpu_usage >> shiftp->shift1) + @@ -589,7 +683,11 @@ update_priority( * values. The updated pri_shift would be used to calculate the * new priority of the thread. */ +#if CONFIG_SCHED_CLUTCH + thread->pri_shift = sched_clutch_thread_pri_shift(thread, thread->th_sched_bucket); +#else /* CONFIG_SCHED_CLUTCH */ thread->pri_shift = sched_pri_shifts[thread->th_sched_bucket]; +#endif /* CONFIG_SCHED_CLUTCH */ /* Recompute scheduled priority if appropriate. */ if (thread->sched_mode == TH_MODE_TIMESHARE) { @@ -603,9 +701,13 @@ update_priority( /* * TH_BUCKET_RUN is a count of *all* runnable non-idle threads. * Each other bucket is a count of the runnable non-idle threads - * with that property. + * with that property. All updates to these counts should be + * performed with os_atomic_* operations. + * + * For the clutch scheduler, this global bucket is used only for + * keeping the total global run count. */ -volatile uint32_t sched_run_buckets[TH_BUCKET_MAX]; +uint32_t sched_run_buckets[TH_BUCKET_MAX]; static void sched_incr_bucket(sched_bucket_t bucket) @@ -613,7 +715,7 @@ sched_incr_bucket(sched_bucket_t bucket) assert(bucket >= TH_BUCKET_FIXPRI && bucket <= TH_BUCKET_SHARE_BG); - hw_atomic_add(&sched_run_buckets[bucket], 1); + os_atomic_inc(&sched_run_buckets[bucket], relaxed); } static void @@ -622,19 +724,17 @@ sched_decr_bucket(sched_bucket_t bucket) assert(bucket >= TH_BUCKET_FIXPRI && bucket <= TH_BUCKET_SHARE_BG); - assert(sched_run_buckets[bucket] > 0); + assert(os_atomic_load(&sched_run_buckets[bucket], relaxed) > 0); - hw_atomic_sub(&sched_run_buckets[bucket], 1); + os_atomic_dec(&sched_run_buckets[bucket], relaxed); } -/* TH_RUN & !TH_IDLE controls whether a thread has a run count */ - uint32_t sched_run_incr(thread_t thread) { assert((thread->state & (TH_RUN | TH_IDLE)) == TH_RUN); - uint32_t new_count = hw_atomic_add(&sched_run_buckets[TH_BUCKET_RUN], 1); + uint32_t new_count = os_atomic_inc(&sched_run_buckets[TH_BUCKET_RUN], relaxed); sched_incr_bucket(thread->th_sched_bucket); @@ -648,12 +748,12 @@ sched_run_decr(thread_t thread) sched_decr_bucket(thread->th_sched_bucket); - uint32_t new_count = hw_atomic_sub(&sched_run_buckets[TH_BUCKET_RUN], 1); + uint32_t new_count = os_atomic_dec(&sched_run_buckets[TH_BUCKET_RUN], relaxed); return new_count; } -static void +void sched_update_thread_bucket(thread_t thread) { sched_bucket_t old_bucket = thread->th_sched_bucket; @@ -718,7 +818,7 @@ sched_set_thread_mode(thread_t thread, sched_mode_t new_mode) thread->sched_mode = new_mode; - sched_update_thread_bucket(thread); + SCHED(update_thread_bucket)(thread); } /* @@ -789,95 +889,6 @@ sched_thread_mode_undemote(thread_t thread, uint32_t reason) } } -/* - * Promote thread to a specific priority - * - * Promotion must not last past syscall boundary - * Clients must always pair promote and unpromote 1:1 - * - * Called at splsched with thread locked - */ -void -sched_thread_promote_to_pri(thread_t thread, - int priority, - __kdebug_only uintptr_t trace_obj /* already unslid */) -{ - assert((thread->sched_flags & TH_SFLAG_PROMOTED) != TH_SFLAG_PROMOTED); - assert(thread->promotion_priority == 0); - assert(priority <= MAXPRI_PROMOTE); - assert(priority > 0); - - KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_PROMOTED), - thread_tid(thread), trace_obj, priority); - - thread->sched_flags |= TH_SFLAG_PROMOTED; - thread->promotion_priority = priority; - - thread_recompute_sched_pri(thread, SETPRI_DEFAULT); -} - - -/* - * Update a pre-existing priority promotion to have a higher priority floor - * Priority can only go up from the previous value - * Update must occur while a promotion is active - * - * Called at splsched with thread locked - */ -void -sched_thread_update_promotion_to_pri(thread_t thread, - int priority, - __kdebug_only uintptr_t trace_obj /* already unslid */) -{ - assert(thread->promotions > 0); - assert((thread->sched_flags & TH_SFLAG_PROMOTED) == TH_SFLAG_PROMOTED); - assert(thread->promotion_priority > 0); - assert(priority <= MAXPRI_PROMOTE); - - if (thread->promotion_priority < priority) { - KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_PROMOTED_UPDATE), - thread_tid(thread), trace_obj, priority); - - thread->promotion_priority = priority; - thread_recompute_sched_pri(thread, SETPRI_DEFAULT); - } -} - -/* - * End a priority promotion - * Demotes a thread back to its expected priority without the promotion in place - * - * Called at splsched with thread locked - */ -void -sched_thread_unpromote(thread_t thread, - __kdebug_only uintptr_t trace_obj /* already unslid */) -{ - assert((thread->sched_flags & TH_SFLAG_PROMOTED) == TH_SFLAG_PROMOTED); - assert(thread->promotion_priority > 0); - - KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_UNPROMOTED), - thread_tid(thread), trace_obj, 0); - - thread->sched_flags &= ~TH_SFLAG_PROMOTED; - thread->promotion_priority = 0; - - thread_recompute_sched_pri(thread, SETPRI_DEFAULT); -} - -/* called with thread locked */ -void -assert_promotions_invariant(thread_t thread) -{ - if (thread->promotions > 0) { - assert((thread->sched_flags & TH_SFLAG_PROMOTED) == TH_SFLAG_PROMOTED); - } - - if (thread->promotions == 0) { - assert((thread->sched_flags & TH_SFLAG_PROMOTED) != TH_SFLAG_PROMOTED); - } -} - /* * Promote thread to have a sched pri floor for a specific reason * diff --git a/osfmk/kern/priority_queue.h b/osfmk/kern/priority_queue.h index dcb7d76a8..fc35f70a3 100644 --- a/osfmk/kern/priority_queue.h +++ b/osfmk/kern/priority_queue.h @@ -741,8 +741,8 @@ priority_queue_entry_increase(struct priority_queue *que, priority_queue_entry_t * min element */ #define priority_queue_min(q, type, field) ({ \ - assert(pqueue_is_min_heap(que)); \ - priority_queue_entry_key(pqueue_unpack_root(q), type, field); \ + assert(pqueue_is_min_heap(q)); \ + pqe_element(pqueue_unpack_root(q), type, field); \ }) /* @@ -807,7 +807,7 @@ priority_queue_entry_increase(struct priority_queue *que, priority_queue_entry_t * min element */ #define priority_queue_remove_min(q, type, field, cmp_fn) ({ \ - assert(pqueue_is_min_heap(que)); \ + assert(pqueue_is_min_heap(q)); \ pqe_element(pqueue_remove_root(q, pqueue_unpack_root(q), cmp_fn), type, field); \ }) diff --git a/osfmk/kern/processor.c b/osfmk/kern/processor.c index 486efc100..85c506f04 100644 --- a/osfmk/kern/processor.c +++ b/osfmk/kern/processor.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2009 Apple Inc. All rights reserved. + * Copyright (c) 2000-2019 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -94,7 +94,7 @@ struct processor_set pset0; struct pset_node pset_node0; -decl_simple_lock_data(static, pset_node_lock) +decl_simple_lock_data(static, pset_node_lock); lck_grp_t pset_lck_grp; @@ -105,13 +105,13 @@ int tasks_count; int terminated_tasks_count; queue_head_t threads; int threads_count; -decl_lck_mtx_data(, tasks_threads_lock) -decl_lck_mtx_data(, tasks_corpse_lock) +decl_lck_mtx_data(, tasks_threads_lock); +decl_lck_mtx_data(, tasks_corpse_lock); processor_t processor_list; unsigned int processor_count; static processor_t processor_list_tail; -decl_simple_lock_data(, processor_list_lock) +decl_simple_lock_data(, processor_list_lock); uint32_t processor_avail_count; uint32_t processor_avail_count_user; @@ -198,7 +198,7 @@ processor_init( assert(cpu_id < MAX_SCHED_CPUS); processor->state = PROCESSOR_OFF_LINE; - processor->active_thread = processor->next_thread = processor->idle_thread = THREAD_NULL; + processor->active_thread = processor->startup_thread = processor->idle_thread = THREAD_NULL; processor->processor_set = pset; processor_state_update_idle(processor); processor->starting_pri = MINPRI; @@ -207,10 +207,11 @@ processor_init( processor->quantum_end = UINT64_MAX; processor->deadline = UINT64_MAX; processor->first_timeslice = FALSE; + processor->processor_offlined = false; processor->processor_primary = processor; /* no SMT relationship known at this point */ processor->processor_secondary = NULL; - processor->is_SMT = FALSE; - processor->is_recommended = (pset->recommended_bitmask & (1ULL << cpu_id)) ? TRUE : FALSE; + processor->is_SMT = false; + processor->is_recommended = true; processor->processor_self = IP_NULL; processor_data_init(processor); processor->processor_list = NULL; @@ -221,6 +222,9 @@ processor_init( s = splsched(); pset_lock(pset); bit_set(pset->cpu_bitmask, cpu_id); + bit_set(pset->recommended_bitmask, cpu_id); + bit_set(pset->primary_map, cpu_id); + bit_set(pset->cpu_state_map[PROCESSOR_OFF_LINE], cpu_id); if (pset->cpu_set_count++ == 0) { pset->cpu_set_low = pset->cpu_set_hi = cpu_id; } else { @@ -402,10 +406,9 @@ pset_init( pset->cpu_set_count = 0; pset->last_chosen = -1; pset->cpu_bitmask = 0; - pset->recommended_bitmask = ~0ULL; - pset->primary_map = ~0ULL; - pset->cpu_state_map[PROCESSOR_OFF_LINE] = ~0ULL; - for (uint i = PROCESSOR_SHUTDOWN; i < PROCESSOR_STATE_LEN; i++) { + pset->recommended_bitmask = 0; + pset->primary_map = 0; + for (uint i = 0; i < PROCESSOR_STATE_LEN; i++) { pset->cpu_state_map[i] = 0; } pset->pending_AST_URGENT_cpu_mask = 0; @@ -662,8 +665,8 @@ processor_start( * start up thread. */ if (processor->active_thread == THREAD_NULL && - processor->next_thread == THREAD_NULL) { - result = kernel_thread_create((thread_continue_t)processor_start_thread, NULL, MAXPRI_KERNEL, &thread); + processor->startup_thread == THREAD_NULL) { + result = kernel_thread_create(processor_start_thread, NULL, MAXPRI_KERNEL, &thread); if (result != KERN_SUCCESS) { s = splsched(); pset_lock(pset); @@ -677,7 +680,7 @@ processor_start( s = splsched(); thread_lock(thread); thread->bound_processor = processor; - processor->next_thread = thread; + processor->startup_thread = thread; thread->state = TH_RUN; thread->last_made_runnable_time = mach_absolute_time(); thread_unlock(thread); @@ -1416,9 +1419,39 @@ pset_reference( return; } + +#if CONFIG_SCHED_CLUTCH + +/* + * The clutch scheduler decides the recommendation of a thread based + * on its thread group's properties and recommendations. The only thread + * level property it looks at is the bucket for the thread to implement + * the policy of not running Utility & BG buckets on the P-cores. Any + * other policy being added to this routine might need to be reflected + * in places such as sched_clutch_hierarchy_thread_pset() & + * sched_clutch_migrate_thread_group() which rely on getting the recommendations + * right. + * + * Note: The current implementation does not support TH_SFLAG_ECORE_ONLY & + * TH_SFLAG_PCORE_ONLY flags which are used for debugging utilities. A similar + * version of that functionality can be implemented by putting these flags + * on a thread group instead of individual thread basis. + * + */ pset_cluster_type_t recommended_pset_type(thread_t thread) { (void)thread; return PSET_SMP; } + +#else /* CONFIG_SCHED_CLUTCH */ + +pset_cluster_type_t +recommended_pset_type(thread_t thread) +{ + (void)thread; + return PSET_SMP; +} + +#endif /* CONFIG_SCHED_CLUTCH */ diff --git a/osfmk/kern/processor.h b/osfmk/kern/processor.h index 223aae3b7..06e54544c 100644 --- a/osfmk/kern/processor.h +++ b/osfmk/kern/processor.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2009 Apple Inc. All rights reserved. + * Copyright (c) 2000-2019 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -83,6 +83,9 @@ #include #include #include +#include +#include +#include /* * Processor state is accessed by locking the scheduling lock @@ -131,14 +134,16 @@ */ #endif -#define PROCESSOR_OFF_LINE 0 /* Not available */ -#define PROCESSOR_SHUTDOWN 1 /* Going off-line */ -#define PROCESSOR_START 2 /* Being started */ -/* 3 Formerly Inactive (unavailable) */ -#define PROCESSOR_IDLE 4 /* Idle (available) */ -#define PROCESSOR_DISPATCHING 5 /* Dispatching (idle -> active) */ -#define PROCESSOR_RUNNING 6 /* Normal execution */ -#define PROCESSOR_STATE_LEN (PROCESSOR_RUNNING+1) +typedef enum { + PROCESSOR_OFF_LINE = 0, /* Not available */ + PROCESSOR_SHUTDOWN = 1, /* Going off-line */ + PROCESSOR_START = 2, /* Being started */ + PROCESSOR_UNUSED = 3, /* Formerly Inactive (unavailable) */ + PROCESSOR_IDLE = 4, /* Idle (available) */ + PROCESSOR_DISPATCHING = 5, /* Dispatching (idle -> active) */ + PROCESSOR_RUNNING = 6, /* Normal execution */ + PROCESSOR_STATE_LEN = (PROCESSOR_RUNNING + 1) +} processor_state_t; typedef enum { PSET_SMP, @@ -160,10 +165,10 @@ struct processor_set { #define SCHED_PSET_TLOCK (1) #if __SMP__ #if defined(SCHED_PSET_TLOCK) - /* TODO: reorder struct for temporal cache locality */ +/* TODO: reorder struct for temporal cache locality */ __attribute__((aligned(128))) lck_ticket_t sched_lock; #else /* SCHED_PSET_TLOCK*/ - __attribute__((aligned(128))) simple_lock_data_t sched_lock; + __attribute__((aligned(128))) lck_spin_t sched_lock; /* lock for above */ #endif /* SCHED_PSET_TLOCK*/ #endif @@ -171,6 +176,9 @@ struct processor_set { struct run_queue pset_runq; /* runq for this processor set */ #endif struct rt_queue rt_runq; /* realtime runq for this processor set */ +#if CONFIG_SCHED_CLUTCH + struct sched_clutch_root pset_clutch_root; /* clutch hierarchy root */ +#endif /* CONFIG_SCHED_CLUTCH */ #if defined(CONFIG_SCHED_TRADITIONAL) int pset_runq_bound_count; @@ -221,16 +229,16 @@ extern struct pset_node pset_node0; extern queue_head_t tasks, terminated_tasks, threads, corpse_tasks; /* Terminated tasks are ONLY for stackshot */ extern int tasks_count, terminated_tasks_count, threads_count; -decl_lck_mtx_data(extern, tasks_threads_lock) -decl_lck_mtx_data(extern, tasks_corpse_lock) +decl_lck_mtx_data(extern, tasks_threads_lock); +decl_lck_mtx_data(extern, tasks_corpse_lock); struct processor { - int state; /* See above */ + processor_state_t state; /* See above */ bool is_SMT; bool is_recommended; struct thread *active_thread; /* thread running on processor */ - struct thread *next_thread; /* next thread when dispatched */ struct thread *idle_thread; /* this processor's idle thread. */ + struct thread *startup_thread; processor_set_t processor_set; /* assigned set */ @@ -255,6 +263,7 @@ struct processor { uint64_t deadline; /* current deadline */ bool first_timeslice; /* has the quantum expired since context switch */ + bool processor_offlined; /* has the processor been explicitly processor_offline'ed */ bool must_idle; /* Needs to be forced idle as next selected thread is allowed on this processor */ processor_t processor_primary; /* pointer to primary processor for @@ -279,7 +288,7 @@ struct processor { }; extern processor_t processor_list; -decl_simple_lock_data(extern, processor_list_lock) +decl_simple_lock_data(extern, processor_list_lock); #define MAX_SCHED_CPUS 64 /* Maximum number of CPUs supported by the scheduler. bits.h:bitmap_*() macros need to be used to support greater than 64 */ extern processor_t processor_array[MAX_SCHED_CPUS]; /* array indexed by cpuid */ @@ -304,20 +313,16 @@ extern lck_grp_t pset_lck_grp; #define pset_unlock(p) lck_ticket_unlock(&(p)->sched_lock) #define pset_assert_locked(p) lck_ticket_assert_owned(&(p)->sched_lock) #else /* SCHED_PSET_TLOCK*/ -#define pset_lock(p) simple_lock(&(p)->sched_lock, &pset_lck_grp) -#define pset_unlock(p) simple_unlock(&(p)->sched_lock) -#define pset_lock_init(p) simple_lock_init(&(p)->sched_lock, 0) -#if defined(__arm__) || defined(__arm64__) +#define pset_lock_init(p) lck_spin_init(&(p)->sched_lock, &pset_lck_grp, NULL) +#define pset_lock(p) lck_spin_lock_grp(&(p)->sched_lock, &pset_lck_grp) +#define pset_unlock(p) lck_spin_unlock(&(p)->sched_lock) #define pset_assert_locked(p) LCK_SPIN_ASSERT(&(p)->sched_lock, LCK_ASSERT_OWNED) -#else /* arm || arm64 */ -/* See pset_lock() should be converted to use lck_spin_lock() instead of simple_lock() */ -#define pset_assert_locked(p) do { (void)p; } while(0) -#endif /* !arm && !arm64 */ -#endif /* !SCHED_PSET_TLOCK */ +#endif /*!SCHED_PSET_TLOCK*/ + #define rt_lock_lock(p) simple_lock(&SCHED(rt_runq)(p)->rt_lock, &pset_lck_grp) #define rt_lock_unlock(p) simple_unlock(&SCHED(rt_runq)(p)->rt_lock) #define rt_lock_init(p) simple_lock_init(&SCHED(rt_runq)(p)->rt_lock, 0) -#else /* !SMP */ +#else #define pset_lock(p) do { (void)p; } while(0) #define pset_unlock(p) do { (void)p; } while(0) #define pset_lock_init(p) do { (void)p; } while(0) @@ -468,6 +473,8 @@ extern unsigned int processor_count; extern processor_t cpu_to_processor(int cpu); extern kern_return_t enable_smt_processors(bool enable); + +extern boolean_t processor_in_panic_context(processor_t processor); __END_DECLS #endif /* KERNEL_PRIVATE */ diff --git a/osfmk/kern/processor_data.c b/osfmk/kern/processor_data.c index 01a738675..b658db17f 100644 --- a/osfmk/kern/processor_data.c +++ b/osfmk/kern/processor_data.c @@ -47,3 +47,10 @@ processor_data_init( PROCESSOR_DATA(processor, debugger_state).db_current_op = DBOP_NONE; } + +boolean_t +processor_in_panic_context( + processor_t processor) +{ + return PROCESSOR_DATA(processor, debugger_state).db_entry_count > 0; +} diff --git a/osfmk/kern/processor_data.h b/osfmk/kern/processor_data.h index 6c2f21ec5..bee13ded0 100644 --- a/osfmk/kern/processor_data.h +++ b/osfmk/kern/processor_data.h @@ -36,6 +36,8 @@ * #include kern/processor.h instead of this file. */ +#ifdef XNU_KERNEL_PRIVATE + #ifdef MACH_KERNEL_PRIVATE #include @@ -150,4 +152,8 @@ MACRO_END #endif /* MACH_KERNEL_PRIVATE */ +extern boolean_t processor_in_panic_context(processor_t processor); + +#endif /* XNU_KERNEL_PRIVATE */ + #endif /* _KERN_PROCESSOR_DATA_H_ */ diff --git a/osfmk/kern/queue.h b/osfmk/kern/queue.h index 6af62629f..68032cbe7 100644 --- a/osfmk/kern/queue.h +++ b/osfmk/kern/queue.h @@ -71,6 +71,7 @@ #include #include +#include __BEGIN_DECLS @@ -231,14 +232,14 @@ __QUEUE_ELT_VALIDATE(queue_entry_t elt) { queue_entry_t elt_next, elt_prev; - if (__improbable(elt == (queue_entry_t)0)) { + if (__improbable(elt == (queue_entry_t)NULL)) { panic("Invalid queue element %p", elt); } elt_next = elt->next; elt_prev = elt->prev; - if (__improbable(elt_next == (queue_entry_t)0 || elt_prev == (queue_entry_t)0)) { + if (__improbable(elt_next == (queue_entry_t)NULL || elt_prev == (queue_entry_t)NULL)) { panic("Invalid queue element pointers for %p: next %p prev %p", elt, elt_next, elt_prev); } if (__improbable(elt_next->prev != elt || elt_prev->next != elt)) { @@ -250,8 +251,8 @@ __QUEUE_ELT_VALIDATE(queue_entry_t elt) static inline void __DEQUEUE_ELT_CLEANUP(queue_entry_t elt) { - (elt)->next = (queue_entry_t) 0; - (elt)->prev = (queue_entry_t) 0; + (elt)->next = (queue_entry_t)NULL; + (elt)->prev = (queue_entry_t)NULL; } #else #define __QUEUE_ELT_VALIDATE(elt) do { } while (0) @@ -292,7 +293,7 @@ static __inline__ queue_entry_t dequeue_head( queue_t que) { - queue_entry_t elt = (queue_entry_t) 0; + queue_entry_t elt = (queue_entry_t)NULL; queue_entry_t new_head; if (que->next != que) { @@ -311,7 +312,7 @@ static __inline__ queue_entry_t dequeue_tail( queue_t que) { - queue_entry_t elt = (queue_entry_t) 0; + queue_entry_t elt = (queue_entry_t)NULL; queue_entry_t new_tail; if (que->prev != que) { @@ -449,8 +450,7 @@ re_queue_tail(queue_t que, queue_entry_t elt) * Note: * Do not use pointer types for */ -#define qe_element(qe, type, field) \ - ((type *)((void *)((char *)(qe) - __offsetof(type, field)))) +#define qe_element(qe, type, field) __container_of(qe, type, field) /* * Macro: qe_foreach diff --git a/osfmk/kern/remote_time.c b/osfmk/kern/remote_time.c index 11022ed4d..0fa8aa491 100644 --- a/osfmk/kern/remote_time.c +++ b/osfmk/kern/remote_time.c @@ -25,7 +25,6 @@ * * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ -#include #include #include #include @@ -39,6 +38,7 @@ #include #include #include +#include #if CONFIG_MACH_BRIDGE_SEND_TIME @@ -58,9 +58,9 @@ uint32_t mach_bridge_timer_enable(uint32_t new_value, int change); extern void mach_bridge_send_timestamp(uint64_t); void -mach_bridge_timer_maintenance() +mach_bridge_timer_maintenance(void) { - if (!bt_init_flag) { + if (!os_atomic_load(&bt_init_flag, acquire)) { return; } @@ -81,7 +81,7 @@ done: void mach_bridge_timer_init(void) { - assert(!bt_init_flag); + assert(!os_atomic_load(&bt_init_flag, relaxed)); /* Initialize the lock */ static lck_grp_t *bt_lck_grp = NULL; bt_lck_grp = lck_grp_alloc_init("bridgetimestamp", LCK_GRP_ATTR_NULL); @@ -97,7 +97,7 @@ uint32_t mach_bridge_timer_enable(uint32_t new_value, int change) { uint32_t current_value = 0; - assert(bt_init_flag == 1); + assert(os_atomic_load(&bt_init_flag, relaxed)); lck_spin_lock(bt_maintenance_lock); if (change) { bt_enable_flag = new_value; @@ -119,6 +119,7 @@ mach_bridge_timer_enable(uint32_t new_value, int change) void mach_bridge_add_timestamp(uint64_t remote_timestamp, uint64_t local_timestamp); void bt_calibration_thread_start(void); lck_spin_t *ts_conversion_lock = NULL; +void bt_params_add(struct bt_params *params); /* function called by sysctl */ struct bt_params bt_params_get_latest(void); @@ -140,7 +141,7 @@ static uint64_t received_remote_timestamp = 0; static struct bt_params bt_params_hist[BT_PARAMS_COUNT] = {}; static int bt_params_idx = -1; -static inline void +void bt_params_add(struct bt_params *params) { lck_spin_assert(ts_conversion_lock, LCK_ASSERT_OWNED); @@ -149,6 +150,7 @@ bt_params_add(struct bt_params *params) bt_params_hist[bt_params_idx] = *params; } +#if defined(XNU_TARGET_OS_BRIDGE) static inline struct bt_params* bt_params_find(uint64_t local_ts) { @@ -169,6 +171,20 @@ bt_params_find(uint64_t local_ts) return NULL; } +#endif /* defined(XNU_TARGET_OS_BRIDGE) */ + +static inline struct bt_params +bt_params_get_latest_locked(void) +{ + lck_spin_assert(ts_conversion_lock, LCK_ASSERT_OWNED); + + struct bt_params latest_params = {}; + if (bt_params_idx >= 0) { + latest_params = bt_params_hist[bt_params_idx]; + } + + return latest_params; +} struct bt_params bt_params_get_latest(void) @@ -176,11 +192,9 @@ bt_params_get_latest(void) struct bt_params latest_params = {}; /* Check if ts_converison_lock has been initialized */ - if (atomic_load(&bt_init_flag)) { + if (os_atomic_load(&bt_init_flag, acquire)) { lck_spin_lock(ts_conversion_lock); - if (bt_params_idx >= 0) { - latest_params = bt_params_hist[bt_params_idx]; - } + latest_params = bt_params_get_latest_locked(); lck_spin_unlock(ts_conversion_lock); } return latest_params; @@ -472,7 +486,9 @@ bt_calibration_thread_start(void) * the local time. * * If local_timestamp = 0, then the remote_timestamp is calculated - * corresponding to the current mach_absolute_time. Monotonicity of + * corresponding to the current mach_absolute_time. + * + * If XNU_TARGET_OS_BRIDGE is defined, then monotonicity of * predicted time is guaranteed only for recent local_timestamp values * lesser than the current mach_absolute_time upto 1 second. * @@ -499,27 +515,31 @@ mach_bridge_remote_time(uint64_t local_timestamp) /* neither the send or receive side of the bridge is defined: echo the input */ return local_timestamp; #else - if (!atomic_load(&bt_init_flag)) { + if (!os_atomic_load(&bt_init_flag, acquire)) { return 0; } + uint64_t remote_timestamp = 0; + lck_spin_lock(ts_conversion_lock); uint64_t now = mach_absolute_time(); - - uint64_t remote_timestamp = 0; - uint64_t local_timestamp_ns = 0; if (!local_timestamp) { local_timestamp = now; - } else if (local_timestamp > now) { - goto out_unlock; } - absolutetime_to_nanoseconds(local_timestamp, &local_timestamp_ns); - struct bt_params *params = bt_params_find(local_timestamp_ns); - remote_timestamp = mach_bridge_compute_timestamp(local_timestamp_ns, params); - -out_unlock: +#if defined(XNU_TARGET_OS_BRIDGE) + uint64_t local_timestamp_ns = 0; + if (local_timestamp < now) { + absolutetime_to_nanoseconds(local_timestamp, &local_timestamp_ns); + struct bt_params *params = bt_params_find(local_timestamp_ns); + remote_timestamp = mach_bridge_compute_timestamp(local_timestamp_ns, params); + } +#else + struct bt_params params = bt_params_get_latest_locked(); + remote_timestamp = mach_bridge_compute_timestamp(local_timestamp, ¶ms); +#endif /* defined(XNU_TARGET_OS_BRIDGE) */ lck_spin_unlock(ts_conversion_lock); KDBG(MACHDBG_CODE(DBG_MACH_CLOCK, MACH_BRIDGE_REMOTE_TIME), local_timestamp, remote_timestamp, now); + return remote_timestamp; #endif /* !defined(CONFIG_MACH_BRIDGE_RECV_TIME) */ #endif /* defined(CONFIG_MACH_BRIDGE_SEND_TIME) */ diff --git a/osfmk/kern/remote_time.h b/osfmk/kern/remote_time.h index dc1d04a61..020e845b4 100644 --- a/osfmk/kern/remote_time.h +++ b/osfmk/kern/remote_time.h @@ -55,7 +55,12 @@ mach_bridge_compute_timestamp(uint64_t local_ts_ns, struct bt_params *params) */ int64_t remote_ts = 0; int64_t rate_prod = 0; - rate_prod = (int64_t)(params->rate * (double)((int64_t)local_ts_ns - (int64_t)params->base_local_ts)); + /* To avoid precision loss due to typecasting from int64_t to double */ + if (params->rate != 1.0) { + rate_prod = (int64_t)(params->rate * (double)((int64_t)local_ts_ns - (int64_t)params->base_local_ts)); + } else { + rate_prod = (int64_t)local_ts_ns - (int64_t)params->base_local_ts; + } if (os_add_overflow((int64_t)params->base_remote_ts, rate_prod, &remote_ts)) { return 0; } diff --git a/osfmk/kern/restartable.c b/osfmk/kern/restartable.c new file mode 100644 index 000000000..c4e0a7aa0 --- /dev/null +++ b/osfmk/kern/restartable.c @@ -0,0 +1,450 @@ +/* + * Copyright (c) 2019 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +/** + * @file osfmk/kern/restartable.c + * + * @brief + * This module implements restartable userspace functions. + * + * @discussion + * task_restartable_ranges_register() allows task to configure + * the restartable ranges, only once per task, + * before it has made its second thread. + * + * task_restartable_ranges_synchronize() can later be used to trigger + * restarts for threads with a PC in a restartable region. + * + * It is implemented with an AST (AST_RESET_PCS) that will cause threads + * as they return to userspace to reset PCs in a restartable region + * to the recovery offset of this region. + * + * Because signal delivery would mask the proper saved PC for threads, + * sigreturn also forcefully sets the AST and will go through the logic + * every single time. + */ + +typedef int (*cmpfunc_t)(const void *a, const void *b); +extern void qsort(void *a, size_t n, size_t es, cmpfunc_t cmp); + +struct restartable_ranges { + queue_chain_t rr_link; + os_refcnt_t rr_ref; + uint32_t rr_count; + uint32_t rr_hash; + task_restartable_range_t rr_ranges[]; +}; + +#if DEBUG || DEVELOPMENT +#define RR_HASH_SIZE 256 +#else +// Release kernel userspace should have shared caches and a single registration +#define RR_HASH_SIZE 16 +#endif + +static queue_head_t rr_hash[RR_HASH_SIZE]; +lck_spin_t rr_spinlock; +lck_grp_t rr_lock_grp; + +#define rr_lock() lck_spin_lock_grp(&rr_spinlock, &rr_lock_grp) +#define rr_unlock() lck_spin_unlock(&rr_spinlock); + +#pragma mark internals + +/** + * @function _ranges_cmp + * + * @brief + * Compares two ranges together. + */ +static int +_ranges_cmp(const void *_r1, const void *_r2) +{ + const task_restartable_range_t *r1 = _r1; + const task_restartable_range_t *r2 = _r2; + + if (r1->location != r2->location) { + return r1->location < r2->location ? -1 : 1; + } + if (r1->length == r2->length) { + return 0; + } + return r1->length < r2->length ? -1 : 1; +} + +/** + * @function _ranges_validate + * + * @brief + * Validates an array of PC ranges for wraps and intersections. + * + * @discussion + * This sorts and modifies the input. + * + * The ranges must: + * - not wrap around, + * - have a length/recovery offset within a page of the range start + * + * @returns + * - KERN_SUCCESS: ranges are valid + * - KERN_INVALID_ARGUMENT: ranges are invalid + */ +static kern_return_t +_ranges_validate(task_t task, task_restartable_range_t *ranges, uint32_t count) +{ + qsort(ranges, count, sizeof(task_restartable_range_t), _ranges_cmp); + uint64_t limit = task_has_64Bit_data(task) ? UINT64_MAX : UINT32_MAX; + uint64_t end, recovery; + + for (size_t i = 0; i < count; i++) { + if (ranges[i].length > TASK_RESTARTABLE_OFFSET_MAX || + ranges[i].recovery_offs > TASK_RESTARTABLE_OFFSET_MAX) { + return KERN_INVALID_ARGUMENT; + } + if (ranges[i].flags) { + return KERN_INVALID_ARGUMENT; + } + if (os_add_overflow(ranges[i].location, ranges[i].length, &end)) { + return KERN_INVALID_ARGUMENT; + } + if (os_add_overflow(ranges[i].location, ranges[i].recovery_offs, &recovery)) { + return KERN_INVALID_ARGUMENT; + } + if (ranges[i].location > limit || end > limit || recovery > limit) { + return KERN_INVALID_ARGUMENT; + } + if (i + 1 < count && end > ranges[i + 1].location) { + return KERN_INVALID_ARGUMENT; + } + } + + return KERN_SUCCESS; +} + +/** + * @function _ranges_lookup + * + * @brief + * Lookup the left side of a range for a given PC within a set of ranges. + * + * @returns + * - 0: no PC range found + * - the left-side of the range. + */ +__attribute__((always_inline)) +static mach_vm_address_t +_ranges_lookup(struct restartable_ranges *rr, mach_vm_address_t pc) +{ + task_restartable_range_t *ranges = rr->rr_ranges; + uint32_t l = 0, r = rr->rr_count; + + if (pc <= ranges[0].location) { + return 0; + } + if (pc >= ranges[r - 1].location + ranges[r - 1].length) { + return 0; + } + + while (l < r) { + uint32_t i = (r + l) / 2; + mach_vm_address_t location = ranges[i].location; + + if (pc <= location) { + /* if the PC is exactly at pc_start, no reset is needed */ + r = i; + } else if (location + ranges[i].length <= pc) { + /* if the PC is exactly at the end, it's out of the function */ + l = i + 1; + } else { + /* else it's strictly in the range, return the recovery pc */ + return location + ranges[i].recovery_offs; + } + } + + return 0; +} + +/** + * @function _restartable_ranges_dispose + * + * @brief + * Helper to dispose of a range that has reached a 0 refcount. + */ +__attribute__((noinline)) +static void +_restartable_ranges_dispose(struct restartable_ranges *rr, bool hash_remove) +{ + if (hash_remove) { + rr_lock(); + remqueue(&rr->rr_link); + rr_unlock(); + } + kfree(rr, sizeof(*rr) + rr->rr_count * sizeof(task_restartable_range_t)); +} + +/** + * @function _restartable_ranges_equals + * + * @brief + * Helper to compare two restartable ranges. + */ +static bool +_restartable_ranges_equals( + const struct restartable_ranges *rr1, + const struct restartable_ranges *rr2) +{ + size_t rr1_size = rr1->rr_count * sizeof(task_restartable_range_t); + return rr1->rr_hash == rr2->rr_hash && + rr1->rr_count == rr2->rr_count && + memcmp(rr1->rr_ranges, rr2->rr_ranges, rr1_size) == 0; +} + +/** + * @function _restartable_ranges_create + * + * @brief + * Helper to create a uniqued restartable range. + * + * @returns + * - KERN_SUCCESS + * - KERN_INVALID_ARGUMENT: the validation of the new ranges failed. + * - KERN_RESOURCE_SHORTAGE: too many ranges, out of memory + */ +static kern_return_t +_restartable_ranges_create(task_t task, task_restartable_range_t *ranges, + uint32_t count, struct restartable_ranges **rr_storage) +{ + struct restartable_ranges *rr, *rr_found, *rr_base; + queue_head_t *head; + uint32_t base_count, total_count; + size_t base_size, size; + kern_return_t kr; + + rr_base = *rr_storage; + base_count = rr_base ? rr_base->rr_count : 0; + base_size = sizeof(task_restartable_range_t) * base_count; + size = sizeof(task_restartable_range_t) * count; + + if (os_add_overflow(base_count, count, &total_count)) { + return KERN_INVALID_ARGUMENT; + } + if (total_count > 1024) { + return KERN_RESOURCE_SHORTAGE; + } + + rr = kalloc(sizeof(*rr) + base_size + size); + if (rr == NULL) { + return KERN_RESOURCE_SHORTAGE; + } + + queue_chain_init(rr->rr_link); + os_ref_init(&rr->rr_ref, NULL); + rr->rr_count = total_count; + if (base_size) { + memcpy(rr->rr_ranges, rr_base->rr_ranges, base_size); + } + memcpy(rr->rr_ranges + base_count, ranges, size); + kr = _ranges_validate(task, rr->rr_ranges, total_count); + if (kr) { + _restartable_ranges_dispose(rr, false); + return kr; + } + rr->rr_hash = os_hash_jenkins(rr->rr_ranges, + rr->rr_count * sizeof(task_restartable_range_t)); + + head = &rr_hash[rr->rr_hash % RR_HASH_SIZE]; + + rr_lock(); + queue_iterate(head, rr_found, struct restartable_ranges *, rr_link) { + if (_restartable_ranges_equals(rr, rr_found) && + os_ref_retain_try(&rr_found->rr_ref)) { + goto found; + } + } + + enqueue_tail(head, &rr->rr_link); + rr_found = rr; + +found: + if (rr_base && os_ref_release_relaxed(&rr_base->rr_ref) == 0) { + remqueue(&rr_base->rr_link); + } else { + rr_base = NULL; + } + rr_unlock(); + + *rr_storage = rr_found; + + if (rr_found != rr) { + _restartable_ranges_dispose(rr, false); + } + if (rr_base) { + _restartable_ranges_dispose(rr_base, false); + } + return KERN_SUCCESS; +} + +#pragma mark extern interfaces + +void +restartable_ranges_release(struct restartable_ranges *rr) +{ + if (os_ref_release_relaxed(&rr->rr_ref) == 0) { + _restartable_ranges_dispose(rr, true); + } +} + +void +thread_reset_pcs_ast(thread_t thread) +{ + task_t task = thread->task; + struct restartable_ranges *rr; + mach_vm_address_t pc; + + /* + * Because restartable_ranges are set while the task only has on thread + * and can't be mutated outside of this, no lock is required to read this. + */ + rr = task->restartable_ranges; + if (rr) { + /* pairs with the barrier in task_restartable_ranges_synchronize() */ + os_atomic_thread_fence(acquire); + + pc = _ranges_lookup(rr, machine_thread_pc(thread)); + + if (pc) { + machine_thread_reset_pc(thread, pc); + } + } +} + +void +restartable_init(void) +{ + lck_grp_init(&rr_lock_grp, "restartable ranges", LCK_GRP_ATTR_NULL); + lck_spin_init(&rr_spinlock, &rr_lock_grp, LCK_ATTR_NULL); + for (size_t i = 0; i < RR_HASH_SIZE; i++) { + queue_head_init(rr_hash[i]); + } +} + +#pragma mark MiG interfaces + +kern_return_t +task_restartable_ranges_register( + task_t task, + task_restartable_range_t *ranges, + mach_msg_type_number_t count) +{ + kern_return_t kr; + thread_t th; + + if (task != current_task()) { + return KERN_FAILURE; + } + + kr = _ranges_validate(task, ranges, count); + + if (kr == KERN_SUCCESS) { + task_lock(task); + + queue_iterate(&task->threads, th, thread_t, task_threads) { + if (th != current_thread()) { + kr = KERN_NOT_SUPPORTED; + break; + } + } +#if !DEBUG && !DEVELOPMENT + /* + * For security reasons, on release kernels, only allow for this to be + * configured once. + * + * But to be able to test the feature we need to relax this for + * dev kernels. + */ + if (task->restartable_ranges) { + kr = KERN_NOT_SUPPORTED; + } +#endif + if (kr == KERN_SUCCESS) { + kr = _restartable_ranges_create(task, ranges, count, + &task->restartable_ranges); + } + task_unlock(task); + } + + return kr; +} + +kern_return_t +task_restartable_ranges_synchronize(task_t task) +{ + thread_t thread; + + if (task != current_task()) { + return KERN_FAILURE; + } + + /* pairs with the barrier in thread_reset_pcs_ast() */ + os_atomic_thread_fence(release); + + task_lock(task); + + if (task->restartable_ranges) { + queue_iterate(&task->threads, thread, thread_t, task_threads) { + if (thread != current_thread()) { + thread_mtx_lock(thread); + act_set_ast_reset_pcs(thread); + thread_mtx_unlock(thread); + } + } + } + + task_unlock(task); + + return KERN_SUCCESS; +} diff --git a/osfmk/kern/restartable.h b/osfmk/kern/restartable.h new file mode 100644 index 000000000..af6ba4d78 --- /dev/null +++ b/osfmk/kern/restartable.h @@ -0,0 +1,148 @@ +/* + * Copyright (c) 2019 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#ifndef _KERN_RESTARTABLE_H_ +#define _KERN_RESTARTABLE_H_ + +#include +#include +#include + +__BEGIN_DECLS + +/*! + * @typedef task_restartable_range_t + * + * @brief + * Describes a userspace recoverable range. + * + * @field location + * The pointer to the beginning of a restartable section. + * + * @field length + * The length of the critical section anchored at location. + * + * @field recovery_offs + * The offset from the initial location that should be used for the recovery + * codepath. + * + * @field flags + * Currently unused, pass 0. + */ +typedef struct { + mach_vm_address_t location; + unsigned short length; + unsigned short recovery_offs; + unsigned int flags; +} task_restartable_range_t; + +typedef task_restartable_range_t *task_restartable_range_array_t; + +/*! + * @function task_restartable_ranges_register + * + * @brief + * Register a set of restartable ranges for the current task. + * + * @param task + * The task to operate on + * + * @param ranges + * An array of address ranges for which PC resets are performed. + * + * @param count + * The number of address ranges. + * + * @returns + * - KERN_SUCCESS on success + * - KERN_FAILURE if the task isn't the current one + * - KERN_INVALID_ARGUMENT for various invalid inputs + * - KERN_NOT_SUPPORTED the request is not supported (second registration on + * release kernels, registration when the task has gone wide) + * - KERN_RESOURCE_SHORTAGE if not enough memory + */ +extern kern_return_t task_restartable_ranges_register( + task_t task, + task_restartable_range_array_t ranges, + mach_msg_type_number_t count); + +/*! + * @function task_restartable_ranges_synchronize + * + * @brief + * Require for all threads in the task to reset their PC + * if within a restartable range. + * + * @param task + * The task to operate on (needs to be current task) + * + * @returns + * - KERN_SUCCESS + * - KERN_FAILURE if the task isn't the current one + */ +extern kern_return_t task_restartable_ranges_synchronize(task_t task); + +/*! + * @const TASK_RESTARTABLE_OFFSET_MAX + * The maximum value length / recovery_offs can have. + */ +#define TASK_RESTARTABLE_OFFSET_MAX 4096u + +#ifdef KERNEL_PRIVATE + +struct restartable_ranges; + +/** + * @function restartable_init + * + * @brief + * Initializes the restartable module. + */ +extern void restartable_init(void); + +/** + * @function restartable_ranges_release + * + * @brief + * Release a reference on a restartable range. + */ +extern void restartable_ranges_release(struct restartable_ranges *ranges); + +/** + * @function thread_reset_pcs_ast + * + * @brief + * Perform the work at the AST boundary to reset thread PCS. + */ +extern void thread_reset_pcs_ast(struct thread *thread); + +#endif // KERNEL_PRIVATE + +__END_DECLS + +#endif /* _KERN_RESTARTABLE_H_ */ diff --git a/osfmk/kern/sched.h b/osfmk/kern/sched.h index f3c1c88a2..be8727dd0 100644 --- a/osfmk/kern/sched.h +++ b/osfmk/kern/sched.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2009 Apple Inc. All rights reserved. + * Copyright (c) 2000-2019 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -70,16 +70,16 @@ #include #include #include -#include +#include #include #include #include #include #include -#define NRQS 128 /* 128 levels per run queue */ +#define NRQS_MAX (128) /* maximum number of priority levels */ -#define MAXPRI (NRQS-1) +#define MAXPRI (NRQS_MAX-1) #define MINPRI 0 /* lowest legal priority schedulable */ #define IDLEPRI MINPRI /* idle thread priority */ #define NOPRI -1 @@ -142,7 +142,7 @@ */ #define BASEPRI_RTQUEUES (BASEPRI_REALTIME + 1) /* 97 */ -#define BASEPRI_REALTIME (MAXPRI - (NRQS / 4) + 1) /* 96 */ +#define BASEPRI_REALTIME (MAXPRI - (NRQS_MAX / 4) + 1) /* 96 */ #define MAXPRI_KERNEL (BASEPRI_REALTIME - 1) /* 95 */ #define BASEPRI_PREEMPT_HIGH (BASEPRI_PREEMPT + 1) /* 93 */ @@ -150,18 +150,18 @@ #define BASEPRI_VM (BASEPRI_PREEMPT - 1) /* 91 */ #define BASEPRI_KERNEL (MINPRI_KERNEL + 1) /* 81 */ -#define MINPRI_KERNEL (MAXPRI_KERNEL - (NRQS / 8) + 1) /* 80 */ +#define MINPRI_KERNEL (MAXPRI_KERNEL - (NRQS_MAX / 8) + 1) /* 80 */ #define MAXPRI_RESERVED (MINPRI_KERNEL - 1) /* 79 */ #define BASEPRI_GRAPHICS (MAXPRI_RESERVED - 3) /* 76 */ -#define MINPRI_RESERVED (MAXPRI_RESERVED - (NRQS / 8) + 1) /* 64 */ +#define MINPRI_RESERVED (MAXPRI_RESERVED - (NRQS_MAX / 8) + 1) /* 64 */ #define MAXPRI_USER (MINPRI_RESERVED - 1) /* 63 */ #define BASEPRI_CONTROL (BASEPRI_DEFAULT + 17) /* 48 */ #define BASEPRI_FOREGROUND (BASEPRI_DEFAULT + 16) /* 47 */ #define BASEPRI_BACKGROUND (BASEPRI_DEFAULT + 15) /* 46 */ #define BASEPRI_USER_INITIATED (BASEPRI_DEFAULT + 6) /* 37 */ -#define BASEPRI_DEFAULT (MAXPRI_USER - (NRQS / 4)) /* 31 */ +#define BASEPRI_DEFAULT (MAXPRI_USER - (NRQS_MAX / 4)) /* 31 */ #define MAXPRI_SUPPRESSED (BASEPRI_DEFAULT - 3) /* 28 */ #define BASEPRI_UTILITY (BASEPRI_DEFAULT - 11) /* 20 */ #define MAXPRI_THROTTLE (MINPRI + 4) /* 4 */ @@ -174,6 +174,10 @@ #define MINPRI_EXEC (BASEPRI_DEFAULT) /* floor when in exec state */ #define MINPRI_WAITQ (BASEPRI_DEFAULT) /* floor when in waitq handover state */ +#define NRQS (BASEPRI_REALTIME) /* Non-realtime levels for runqs */ + +/* Ensure that NRQS is large enough to represent all non-realtime threads; even promoted ones */ +_Static_assert((NRQS == (MAXPRI_PROMOTE + 1)), "Runqueues are too small to hold all non-realtime threads"); /* Type used for thread->sched_mode and saved_mode */ typedef enum { @@ -183,14 +187,25 @@ typedef enum { TH_MODE_TIMESHARE, /* use timesharing algorithm */ } sched_mode_t; +/* + * Since the clutch scheduler organizes threads based on the thread group + * and the scheduling bucket, its important to not mix threads from multiple + * priority bands into the same bucket. To achieve that, in the clutch bucket + * world, there is a scheduling bucket per QoS effectively. + */ + /* Buckets used for load calculation */ typedef enum { - TH_BUCKET_RUN = 0, /* All runnable threads */ - TH_BUCKET_FIXPRI, /* Fixed-priority */ - TH_BUCKET_SHARE_FG, /* Timeshare thread above BASEPRI_DEFAULT */ - TH_BUCKET_SHARE_DF, /* Timeshare thread between BASEPRI_DEFAULT and BASEPRI_UTILITY */ - TH_BUCKET_SHARE_UT, /* Timeshare thread between BASEPRI_UTILITY and MAXPRI_THROTTLE */ - TH_BUCKET_SHARE_BG, /* Timeshare thread between MAXPRI_THROTTLE and MINPRI */ + TH_BUCKET_FIXPRI = 0, /* Fixed-priority */ + TH_BUCKET_SHARE_FG, /* Timeshare thread above BASEPRI_DEFAULT */ +#if CONFIG_SCHED_CLUTCH + TH_BUCKET_SHARE_IN, /* Timeshare thread between BASEPRI_USER_INITIATED and BASEPRI_DEFAULT */ +#endif /* CONFIG_SCHED_CLUTCH */ + TH_BUCKET_SHARE_DF, /* Timeshare thread between BASEPRI_DEFAULT and BASEPRI_UTILITY */ + TH_BUCKET_SHARE_UT, /* Timeshare thread between BASEPRI_UTILITY and MAXPRI_THROTTLE */ + TH_BUCKET_SHARE_BG, /* Timeshare thread between MAXPRI_THROTTLE and MINPRI */ + TH_BUCKET_RUN, /* All runnable threads */ + TH_BUCKET_SCHED_MAX = TH_BUCKET_RUN, /* Maximum schedulable buckets */ TH_BUCKET_MAX, } sched_bucket_t; @@ -200,18 +215,18 @@ typedef enum { #define invalid_pri(pri) ((pri) < MINPRI || (pri) > MAXPRI) struct runq_stats { - uint64_t count_sum; - uint64_t last_change_timestamp; + uint64_t count_sum; + uint64_t last_change_timestamp; }; #if defined(CONFIG_SCHED_TIMESHARE_CORE) || defined(CONFIG_SCHED_PROTO) struct run_queue { - int highq; /* highest runnable queue */ - bitmap_t bitmap[BITMAP_LEN(NRQS)]; /* run queue bitmap array */ - int count; /* # of threads total */ - int urgency; /* level of preemption urgency */ - queue_head_t queues[NRQS]; /* one for each priority */ + int highq; /* highest runnable queue */ + bitmap_t bitmap[BITMAP_LEN(NRQS)]; /* run queue bitmap array */ + int count; /* # of threads total */ + int urgency; /* level of preemption urgency */ + circle_queue_head_t queues[NRQS]; /* one for each priority */ struct runq_stats runq_stats; }; @@ -236,7 +251,7 @@ struct rt_queue { _Atomic int count; /* # of threads total */ queue_head_t queue; /* all runnable RT threads */ #if __SMP__ - decl_simple_lock_data(, rt_lock) + decl_simple_lock_data(, rt_lock); #endif struct runq_stats runq_stats; }; @@ -393,10 +408,17 @@ extern uint32_t avenrun[3], mach_factor[3]; extern uint64_t max_unsafe_computation; extern uint64_t max_poll_computation; -extern volatile uint32_t sched_run_buckets[TH_BUCKET_MAX]; +extern uint32_t sched_run_buckets[TH_BUCKET_MAX]; extern uint32_t sched_run_incr(thread_t thread); extern uint32_t sched_run_decr(thread_t thread); +extern void sched_update_thread_bucket(thread_t thread); + +#define SCHED_DECAY_TICKS 32 +struct shift_data { + int shift1; + int shift2; +}; /* * thread_timer_delta macro takes care of both thread timers. diff --git a/osfmk/kern/sched_average.c b/osfmk/kern/sched_average.c index 709803b9e..a6a855c9f 100644 --- a/osfmk/kern/sched_average.c +++ b/osfmk/kern/sched_average.c @@ -174,7 +174,8 @@ static_assert((SCHED_LOAD_EWMA_ALPHA_OLD + SCHED_LOAD_EWMA_ALPHA_NEW) == (1ul << #define SCHED_LOAD_EWMA_UNSCALE(load) (((load) >> SCHED_LOAD_EWMA_ALPHA_SHIFT) + SCHED_LOAD_EWMA_ROUNDUP(load)) /* - * Routine to capture the latest runnable counts and update sched_load */ + * Routine to capture the latest runnable counts and update sched_load (only used for non-clutch schedulers) + */ void compute_sched_load(void) { @@ -187,12 +188,12 @@ compute_sched_load(void) uint32_t ncpus = processor_avail_count; uint32_t load_now[TH_BUCKET_MAX]; - load_now[TH_BUCKET_RUN] = sched_run_buckets[TH_BUCKET_RUN]; - load_now[TH_BUCKET_FIXPRI] = sched_run_buckets[TH_BUCKET_FIXPRI]; - load_now[TH_BUCKET_SHARE_FG] = sched_run_buckets[TH_BUCKET_SHARE_FG]; - load_now[TH_BUCKET_SHARE_DF] = sched_run_buckets[TH_BUCKET_SHARE_DF]; - load_now[TH_BUCKET_SHARE_UT] = sched_run_buckets[TH_BUCKET_SHARE_UT]; - load_now[TH_BUCKET_SHARE_BG] = sched_run_buckets[TH_BUCKET_SHARE_BG]; + load_now[TH_BUCKET_RUN] = os_atomic_load(&sched_run_buckets[TH_BUCKET_RUN], relaxed); + load_now[TH_BUCKET_FIXPRI] = os_atomic_load(&sched_run_buckets[TH_BUCKET_FIXPRI], relaxed); + load_now[TH_BUCKET_SHARE_FG] = os_atomic_load(&sched_run_buckets[TH_BUCKET_SHARE_FG], relaxed); + load_now[TH_BUCKET_SHARE_DF] = os_atomic_load(&sched_run_buckets[TH_BUCKET_SHARE_DF], relaxed); + load_now[TH_BUCKET_SHARE_UT] = os_atomic_load(&sched_run_buckets[TH_BUCKET_SHARE_UT], relaxed); + load_now[TH_BUCKET_SHARE_BG] = os_atomic_load(&sched_run_buckets[TH_BUCKET_SHARE_BG], relaxed); assert(load_now[TH_BUCKET_RUN] >= 0); assert(load_now[TH_BUCKET_FIXPRI] >= 0); @@ -285,7 +286,7 @@ compute_sched_load(void) void compute_averages(uint64_t stdelta) { - uint32_t nthreads = sched_run_buckets[TH_BUCKET_RUN] - 1; + uint32_t nthreads = os_atomic_load(&sched_run_buckets[TH_BUCKET_RUN], relaxed) - 1; uint32_t ncpus = processor_avail_count; /* Update the global pri_shifts based on the latest values */ diff --git a/osfmk/kern/sched_clutch.c b/osfmk/kern/sched_clutch.c new file mode 100644 index 000000000..7a246a05e --- /dev/null +++ b/osfmk/kern/sched_clutch.c @@ -0,0 +1,2174 @@ +/* + * Copyright (c) 2018 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +#if CONFIG_SCHED_CLUTCH + +/* Forward declarations of static routines */ + +/* Root level hierarchy management */ +static void sched_clutch_root_init(sched_clutch_root_t, processor_set_t); +static void sched_clutch_root_bucket_init(sched_clutch_root_bucket_t, sched_bucket_t); +static void sched_clutch_root_pri_update(sched_clutch_root_t); +static sched_clutch_root_bucket_t sched_clutch_root_highest_root_bucket(sched_clutch_root_t, uint64_t); +static void sched_clutch_root_urgency_inc(sched_clutch_root_t, thread_t); +static void sched_clutch_root_urgency_dec(sched_clutch_root_t, thread_t); + +/* Root bucket level hierarchy management */ +static uint64_t sched_clutch_root_bucket_deadline_calculate(sched_clutch_root_bucket_t, uint64_t); +static void sched_clutch_root_bucket_deadline_update(sched_clutch_root_bucket_t, sched_clutch_root_t, uint64_t); +static int sched_clutch_root_bucket_pri_compare(sched_clutch_root_bucket_t, sched_clutch_root_bucket_t); + +/* Clutch bucket level hierarchy management */ +static void sched_clutch_bucket_hierarchy_insert(sched_clutch_root_t, sched_clutch_bucket_t, sched_bucket_t, uint64_t); +static void sched_clutch_bucket_hierarchy_remove(sched_clutch_root_t, sched_clutch_bucket_t, sched_bucket_t, uint64_t); +static boolean_t sched_clutch_bucket_runnable(sched_clutch_bucket_t, sched_clutch_root_t, uint64_t); +static boolean_t sched_clutch_bucket_update(sched_clutch_bucket_t, sched_clutch_root_t, uint64_t); +static void sched_clutch_bucket_empty(sched_clutch_bucket_t, sched_clutch_root_t, uint64_t); + +static void sched_clutch_bucket_cpu_usage_update(sched_clutch_bucket_t, uint64_t); +static void sched_clutch_bucket_cpu_blocked_update(sched_clutch_bucket_t, uint64_t); +static uint8_t sched_clutch_bucket_pri_calculate(sched_clutch_bucket_t, uint64_t); +static sched_clutch_bucket_t sched_clutch_root_bucket_highest_clutch_bucket(sched_clutch_root_bucket_t); + +/* Clutch timeshare properties updates */ +static uint32_t sched_clutch_run_bucket_incr(sched_clutch_t, sched_bucket_t); +static uint32_t sched_clutch_run_bucket_decr(sched_clutch_t, sched_bucket_t); +static void sched_clutch_bucket_cpu_adjust(sched_clutch_bucket_t); +static void sched_clutch_bucket_timeshare_update(sched_clutch_bucket_t); +static boolean_t sched_thread_sched_pri_promoted(thread_t); +/* Clutch membership management */ +static boolean_t sched_clutch_thread_insert(sched_clutch_root_t, thread_t, integer_t); +static void sched_clutch_thread_remove(sched_clutch_root_t, thread_t, uint64_t); +static thread_t sched_clutch_thread_highest(sched_clutch_root_t); + +/* Clutch properties updates */ +static uint32_t sched_clutch_root_urgency(sched_clutch_root_t); +static uint32_t sched_clutch_root_count_sum(sched_clutch_root_t); +static int sched_clutch_root_priority(sched_clutch_root_t); + + +/* Helper debugging routines */ +static inline void sched_clutch_hierarchy_locked_assert(sched_clutch_root_t); + + + +/* + * Global priority queue comparator routine for root buckets. The + * routine implements the priority queue as a minimum deadline queue + * to achieve EDF scheduling. + */ +priority_queue_compare_fn_t sched_clutch_root_bucket_compare; + + +/* + * Special markers for buckets that have invalid WCELs/quantums etc. + */ +#define SCHED_CLUTCH_INVALID_TIME_32 ((uint32_t)~0) +#define SCHED_CLUTCH_INVALID_TIME_64 ((uint64_t)~0) + +/* + * Root level bucket WCELs + * + * The root level bucket selection algorithm is an Earliest Deadline + * First (EDF) algorithm where the deadline for buckets are defined + * by the worst-case-execution-latency and the make runnable timestamp + * for the bucket. + * + */ +static uint32_t sched_clutch_root_bucket_wcel_us[TH_BUCKET_SCHED_MAX] = { + SCHED_CLUTCH_INVALID_TIME_32, /* FIXPRI */ + 0, /* FG */ + 37500, /* IN (37.5ms) */ + 75000, /* DF (75ms) */ + 150000, /* UT (150ms) */ + 250000 /* BG (250ms) */ +}; +static uint64_t sched_clutch_root_bucket_wcel[TH_BUCKET_SCHED_MAX] = {0}; + +/* + * Root level bucket warp + * + * Each root level bucket has a warp value associated with it as well. + * The warp value allows the root bucket to effectively warp ahead of + * lower priority buckets for a limited time even if it has a later + * deadline. The warping behavior provides extra (but limited) + * opportunity for high priority buckets to remain responsive. + */ + +/* Special warp deadline value to indicate that the bucket has not used any warp yet */ +#define SCHED_CLUTCH_ROOT_BUCKET_WARP_UNUSED (SCHED_CLUTCH_INVALID_TIME_64) + +/* Warp window durations for various tiers */ +static uint32_t sched_clutch_root_bucket_warp_us[TH_BUCKET_SCHED_MAX] = { + SCHED_CLUTCH_INVALID_TIME_32, /* FIXPRI */ + 8000, /* FG (8ms)*/ + 4000, /* IN (4ms) */ + 2000, /* DF (2ms) */ + 1000, /* UT (1ms) */ + 0 /* BG (0ms) */ +}; +static uint64_t sched_clutch_root_bucket_warp[TH_BUCKET_SCHED_MAX] = {0}; + +/* + * Thread level quantum + * + * The algorithm defines quantums for threads at various buckets. This + * (combined with the root level bucket quantums) restricts how much + * the lower priority levels can preempt the higher priority threads. + */ +static uint32_t sched_clutch_thread_quantum_us[TH_BUCKET_SCHED_MAX] = { + 10000, /* FIXPRI (10ms) */ + 10000, /* FG (10ms) */ + 8000, /* IN (8ms) */ + 6000, /* DF (6ms) */ + 4000, /* UT (4ms) */ + 2000 /* BG (2ms) */ +}; +static uint64_t sched_clutch_thread_quantum[TH_BUCKET_SCHED_MAX] = {0}; + +enum sched_clutch_state { + SCHED_CLUTCH_STATE_EMPTY = 0, + SCHED_CLUTCH_STATE_RUNNABLE, +}; + +/* + * sched_clutch_us_to_abstime() + * + * Initializer for converting all durations in usec to abstime + */ +static void +sched_clutch_us_to_abstime(uint32_t *us_vals, uint64_t *abstime_vals) +{ + for (int i = 0; i < TH_BUCKET_SCHED_MAX; i++) { + if (us_vals[i] == SCHED_CLUTCH_INVALID_TIME_32) { + abstime_vals[i] = SCHED_CLUTCH_INVALID_TIME_64; + } else { + clock_interval_to_absolutetime_interval(us_vals[i], + NSEC_PER_USEC, &abstime_vals[i]); + } + } +} + +#if DEVELOPMENT || DEBUG + +/* + * sched_clutch_hierarchy_locked_assert() + * + * Debugging helper routine. Asserts that the hierarchy is locked. The locking + * for the hierarchy depends on where the hierarchy is hooked. The current + * implementation hooks the hierarchy at the pset, so the hierarchy is locked + * using the pset lock. + */ +static inline void +sched_clutch_hierarchy_locked_assert( + sched_clutch_root_t root_clutch) +{ + pset_assert_locked(root_clutch->scr_pset); +} + +#else /* DEVELOPMENT || DEBUG */ + +static inline void +sched_clutch_hierarchy_locked_assert( + __unused sched_clutch_root_t root_clutch) +{ +} + +#endif /* DEVELOPMENT || DEBUG */ + +/* + * sched_clutch_thr_count_inc() + * + * Increment thread count at a hierarchy level with overflow checks. + */ +static void +sched_clutch_thr_count_inc( + uint16_t *thr_count) +{ + if (__improbable(os_inc_overflow(thr_count))) { + panic("sched_clutch thread count overflowed!"); + } +} + +/* + * sched_clutch_thr_count_dec() + * + * Decrement thread count at a hierarchy level with underflow checks. + */ +static void +sched_clutch_thr_count_dec( + uint16_t *thr_count) +{ + if (__improbable(os_dec_overflow(thr_count))) { + panic("sched_clutch thread count underflowed!"); + } +} + + +/* + * sched_clutch_root_init() + * + * Routine to initialize the scheduler hierarchy root. + */ +static void +sched_clutch_root_init( + sched_clutch_root_t root_clutch, + processor_set_t pset) +{ + root_clutch->scr_thr_count = 0; + root_clutch->scr_priority = NOPRI; + root_clutch->scr_urgency = 0; + root_clutch->scr_pset = pset; + + /* Initialize the queue which maintains all runnable clutch_buckets for timesharing purposes */ + queue_init(&root_clutch->scr_clutch_buckets); + + /* Initialize the queue which maintains all runnable foreign clutch buckets */ + queue_init(&root_clutch->scr_foreign_buckets); + + /* Initialize the bitmap and priority queue of runnable root buckets */ + sched_clutch_root_bucket_compare = priority_heap_make_comparator(a, b, struct sched_clutch_root_bucket, scrb_pqlink, { + return (a->scrb_deadline < b->scrb_deadline) ? 1 : ((a->scrb_deadline == b->scrb_deadline) ? 0 : -1); + }); + priority_queue_init(&root_clutch->scr_root_buckets, PRIORITY_QUEUE_GENERIC_KEY | PRIORITY_QUEUE_MIN_HEAP); + bitmap_zero(root_clutch->scr_runnable_bitmap, TH_BUCKET_SCHED_MAX); + bitmap_zero(root_clutch->scr_warp_available, TH_BUCKET_SCHED_MAX); + + /* Initialize all the root buckets */ + for (uint32_t i = 0; i < TH_BUCKET_SCHED_MAX; i++) { + sched_clutch_root_bucket_init(&root_clutch->scr_buckets[i], i); + } +} + +/* + * sched_clutch_root_bucket_init() + * + * Routine to initialize root buckets. + */ +static void +sched_clutch_root_bucket_init( + sched_clutch_root_bucket_t root_bucket, + sched_bucket_t bucket) +{ + root_bucket->scrb_bucket = bucket; + priority_queue_init(&root_bucket->scrb_clutch_buckets, PRIORITY_QUEUE_BUILTIN_KEY | PRIORITY_QUEUE_MAX_HEAP); + priority_queue_entry_init(&root_bucket->scrb_pqlink); + root_bucket->scrb_deadline = SCHED_CLUTCH_INVALID_TIME_64; + root_bucket->scrb_warped_deadline = 0; + root_bucket->scrb_warp_remaining = sched_clutch_root_bucket_warp[root_bucket->scrb_bucket]; +} + +/* + * sched_clutch_root_bucket_pri_compare() + * + * Routine to compare root buckets based on the highest runnable clutch + * bucket priorities in the root buckets. + */ +static int +sched_clutch_root_bucket_pri_compare( + sched_clutch_root_bucket_t a, + sched_clutch_root_bucket_t b) +{ + sched_clutch_bucket_t a_highest = sched_clutch_root_bucket_highest_clutch_bucket(a); + sched_clutch_bucket_t b_highest = sched_clutch_root_bucket_highest_clutch_bucket(b); + return (a_highest->scb_priority > b_highest->scb_priority) ? + 1 : ((a_highest->scb_priority == b_highest->scb_priority) ? 0 : -1); +} + +/* + * sched_clutch_root_select_aboveui() + * + * Special case scheduling for Above UI bucket. + * + * AboveUI threads are typically system critical threads that need low latency + * which is why they are handled specially. + * + * Since the priority range for AboveUI and FG Timeshare buckets overlap, it is + * important to maintain some native priority order between those buckets. The policy + * implemented here is to compare the highest clutch buckets of both buckets; if the + * Above UI bucket is higher, schedule it immediately. Otherwise fall through to the + * deadline based scheduling which should pickup the timeshare buckets. + * + * The implementation allows extremely low latency CPU access for Above UI threads + * while supporting the use case of high priority timeshare threads contending with + * lower priority fixed priority threads. + */ +static boolean_t +sched_clutch_root_select_aboveui( + sched_clutch_root_t root_clutch) +{ + if (bitmap_test(root_clutch->scr_runnable_bitmap, TH_BUCKET_FIXPRI)) { + sched_clutch_root_bucket_t root_bucket_aboveui = &root_clutch->scr_buckets[TH_BUCKET_FIXPRI]; + sched_clutch_root_bucket_t root_bucket_sharefg = &root_clutch->scr_buckets[TH_BUCKET_SHARE_FG]; + + if (!bitmap_test(root_clutch->scr_runnable_bitmap, TH_BUCKET_SHARE_FG)) { + /* If the timeshare FG bucket is not runnable, pick the aboveUI bucket for scheduling */ + return true; + } + if (sched_clutch_root_bucket_pri_compare(root_bucket_aboveui, root_bucket_sharefg) >= 0) { + /* If the aboveUI bucket has a higher native clutch bucket priority, schedule it */ + return true; + } + } + return false; +} + + +/* + * sched_clutch_root_highest_root_bucket() + * + * Main routine to find the highest runnable root level bucket. + * This routine is called from performance sensitive contexts; so it is + * crucial to keep this O(1). + * + */ +static sched_clutch_root_bucket_t +sched_clutch_root_highest_root_bucket( + sched_clutch_root_t root_clutch, + uint64_t timestamp) +{ + sched_clutch_hierarchy_locked_assert(root_clutch); + if (bitmap_lsb_first(root_clutch->scr_runnable_bitmap, TH_BUCKET_SCHED_MAX) == -1) { + return NULL; + } + + if (sched_clutch_root_select_aboveui(root_clutch)) { + return &root_clutch->scr_buckets[TH_BUCKET_FIXPRI]; + } + + /* + * Above UI bucket is not runnable or has a low priority clutch bucket; use the earliest deadline model + * to schedule threads. The idea is that as the timeshare buckets use CPU, they will drop their + * interactivity score and allow low priority AboveUI clutch buckets to be scheduled. + */ + + /* Find the earliest deadline bucket */ + sched_clutch_root_bucket_t edf_bucket = priority_queue_min(&root_clutch->scr_root_buckets, struct sched_clutch_root_bucket, scrb_pqlink); + + sched_clutch_root_bucket_t warp_bucket = NULL; + int warp_bucket_index = -1; +evaluate_warp_buckets: + /* Check if any higher runnable buckets have warp available */ + warp_bucket_index = bitmap_lsb_first(root_clutch->scr_warp_available, TH_BUCKET_SCHED_MAX); + + if ((warp_bucket_index == -1) || (warp_bucket_index >= edf_bucket->scrb_bucket)) { + /* No higher buckets have warp available; choose the edf bucket and replenish its warp */ + sched_clutch_root_bucket_deadline_update(edf_bucket, root_clutch, timestamp); + edf_bucket->scrb_warp_remaining = sched_clutch_root_bucket_warp[edf_bucket->scrb_bucket]; + edf_bucket->scrb_warped_deadline = SCHED_CLUTCH_ROOT_BUCKET_WARP_UNUSED; + bitmap_set(root_clutch->scr_warp_available, edf_bucket->scrb_bucket); + return edf_bucket; + } + + /* + * Looks like there is a root bucket which is higher in the natural priority + * order than edf_bucket and might have some warp remaining. + */ + warp_bucket = &root_clutch->scr_buckets[warp_bucket_index]; + if (warp_bucket->scrb_warped_deadline == SCHED_CLUTCH_ROOT_BUCKET_WARP_UNUSED) { + /* Root bucket has not used any of its warp; set a deadline to expire its warp and return it */ + warp_bucket->scrb_warped_deadline = timestamp + warp_bucket->scrb_warp_remaining; + sched_clutch_root_bucket_deadline_update(warp_bucket, root_clutch, timestamp); + return warp_bucket; + } + if (warp_bucket->scrb_warped_deadline > timestamp) { + /* Root bucket already has a warp window open with some warp remaining */ + sched_clutch_root_bucket_deadline_update(warp_bucket, root_clutch, timestamp); + return warp_bucket; + } + + /* For this bucket, warp window was opened sometime in the past but has now + * expired. Mark the bucket as not avilable for warp anymore and re-run the + * warp bucket selection logic. + */ + warp_bucket->scrb_warp_remaining = 0; + bitmap_clear(root_clutch->scr_warp_available, warp_bucket->scrb_bucket); + goto evaluate_warp_buckets; +} + +/* + * sched_clutch_root_bucket_deadline_calculate() + * + * Calculate the deadline for the bucket based on its WCEL + */ +static uint64_t +sched_clutch_root_bucket_deadline_calculate( + sched_clutch_root_bucket_t root_bucket, + uint64_t timestamp) +{ + /* For fixpri AboveUI bucket always return it as the earliest deadline */ + if (root_bucket->scrb_bucket < TH_BUCKET_SHARE_FG) { + return 0; + } + + /* For all timeshare buckets set the deadline as current time + worst-case-execution-latency */ + return timestamp + sched_clutch_root_bucket_wcel[root_bucket->scrb_bucket]; +} + +/* + * sched_clutch_root_bucket_deadline_update() + * + * Routine to update the deadline of the root bucket when it is selected. + * Updating the deadline also moves the root_bucket in the EDF priority + * queue. + */ +static void +sched_clutch_root_bucket_deadline_update( + sched_clutch_root_bucket_t root_bucket, + sched_clutch_root_t root_clutch, + uint64_t timestamp) +{ + if (root_bucket->scrb_bucket == TH_BUCKET_FIXPRI) { + /* The algorithm never uses the deadlines for scheduling TH_BUCKET_FIXPRI bucket */ + return; + } + uint64_t old_deadline = root_bucket->scrb_deadline; + uint64_t new_deadline = sched_clutch_root_bucket_deadline_calculate(root_bucket, timestamp); + assert(old_deadline <= new_deadline); + if (old_deadline != new_deadline) { + root_bucket->scrb_deadline = new_deadline; + /* Since the priority queue is a min-heap, use the decrease routine even though the deadline has a larger value now */ + priority_queue_entry_decrease(&root_clutch->scr_root_buckets, &root_bucket->scrb_pqlink, PRIORITY_QUEUE_KEY_NONE, sched_clutch_root_bucket_compare); + } +} + +/* + * sched_clutch_root_bucket_runnable() + * + * Routine to insert a newly runnable root bucket into the hierarchy. + * Also updates the deadline and warp parameters as necessary. + */ +static void +sched_clutch_root_bucket_runnable( + sched_clutch_root_bucket_t root_bucket, + sched_clutch_root_t root_clutch, + uint64_t timestamp) +{ + /* Mark the root bucket as runnable */ + bitmap_set(root_clutch->scr_runnable_bitmap, root_bucket->scrb_bucket); + KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_CLUTCH_ROOT_BUCKET_STATE) | DBG_FUNC_NONE, + root_bucket->scrb_bucket, SCHED_CLUTCH_STATE_RUNNABLE, 0, 0, 0); + + if (root_bucket->scrb_bucket == TH_BUCKET_FIXPRI) { + /* Since the TH_BUCKET_FIXPRI bucket is not scheduled based on deadline, nothing more needed here */ + return; + } + + root_bucket->scrb_deadline = sched_clutch_root_bucket_deadline_calculate(root_bucket, timestamp); + priority_queue_insert(&root_clutch->scr_root_buckets, &root_bucket->scrb_pqlink, PRIORITY_QUEUE_KEY_NONE, sched_clutch_root_bucket_compare); + + if (root_bucket->scrb_warp_remaining) { + /* Since the bucket has some warp remaining and its now runnable, mark it as available for warp */ + bitmap_set(root_clutch->scr_warp_available, root_bucket->scrb_bucket); + } +} + +/* + * sched_clutch_root_bucket_empty() + * + * Routine to remove an empty root bucket from the hierarchy. + * Also updates the deadline and warp parameters as necessary. + */ +static void +sched_clutch_root_bucket_empty( + sched_clutch_root_bucket_t root_bucket, + sched_clutch_root_t root_clutch, + uint64_t timestamp) +{ + bitmap_clear(root_clutch->scr_runnable_bitmap, root_bucket->scrb_bucket); + KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_CLUTCH_ROOT_BUCKET_STATE) | DBG_FUNC_NONE, + root_bucket->scrb_bucket, SCHED_CLUTCH_STATE_EMPTY, 0, 0, 0); + + if (root_bucket->scrb_bucket == TH_BUCKET_FIXPRI) { + /* Since the TH_BUCKET_FIXPRI bucket is not scheduled based on deadline, nothing more needed here */ + return; + } + + priority_queue_remove(&root_clutch->scr_root_buckets, &root_bucket->scrb_pqlink, sched_clutch_root_bucket_compare); + + bitmap_clear(root_clutch->scr_warp_available, root_bucket->scrb_bucket); + if (root_bucket->scrb_warped_deadline > timestamp) { + /* + * For root buckets that were using the warp, check if the warp + * deadline is in the future. If yes, remove the wall time the + * warp was active and update the warp remaining. This allows + * the root bucket to use the remaining warp the next time it + * becomes runnable. + */ + root_bucket->scrb_warp_remaining = root_bucket->scrb_warped_deadline - timestamp; + } else if (root_bucket->scrb_warped_deadline != SCHED_CLUTCH_ROOT_BUCKET_WARP_UNUSED) { + /* + * If the root bucket's warped deadline is in the past, it has used up + * all the warp it was assigned. Empty out its warp remaining. + */ + root_bucket->scrb_warp_remaining = 0; + } +} + +/* + * sched_clutch_root_pri_update() + * + * The root level priority is used for thread selection and preemption + * logic. + */ +static void +sched_clutch_root_pri_update( + sched_clutch_root_t root_clutch) +{ + sched_clutch_hierarchy_locked_assert(root_clutch); + if (bitmap_lsb_first(root_clutch->scr_runnable_bitmap, TH_BUCKET_SCHED_MAX) == -1) { + /* No runnable root buckets */ + root_clutch->scr_priority = NOPRI; + assert(root_clutch->scr_urgency == 0); + return; + } + sched_clutch_root_bucket_t root_bucket = NULL; + /* Special case for AboveUI (uses same logic as thread selection) */ + if (sched_clutch_root_select_aboveui(root_clutch)) { + root_bucket = &root_clutch->scr_buckets[TH_BUCKET_FIXPRI]; + } else { + /* + * AboveUI bucket is not runnable or has a low clutch bucket priority, + * select the next runnable root bucket in natural priority order. This logic + * is slightly different from thread selection, because thread selection + * considers deadlines, warps etc. to decide the most optimal bucket at a + * given timestamp. Since the priority value is used for preemption decisions + * only, it needs to be based on the highest runnable thread available in + * the timeshare domain. + */ + int root_bucket_index = bitmap_lsb_next(root_clutch->scr_runnable_bitmap, TH_BUCKET_SCHED_MAX, TH_BUCKET_FIXPRI); + assert(root_bucket_index != -1); + root_bucket = &root_clutch->scr_buckets[root_bucket_index]; + } + /* For the selected root bucket, find the highest priority clutch bucket */ + sched_clutch_bucket_t clutch_bucket = sched_clutch_root_bucket_highest_clutch_bucket(root_bucket); + root_clutch->scr_priority = priority_queue_max_key(&clutch_bucket->scb_clutchpri_prioq); +} + +/* + * sched_clutch_root_urgency_inc() + * + * Routine to increment the urgency at the root level based on the thread + * priority that is being inserted into the hierarchy. The root urgency + * counter is updated based on the urgency of threads in any of the + * clutch buckets which are part of the hierarchy. + * + * Always called with the pset lock held. + */ +static void +sched_clutch_root_urgency_inc( + sched_clutch_root_t root_clutch, + thread_t thread) +{ + if (SCHED(priority_is_urgent)(thread->sched_pri)) { + root_clutch->scr_urgency++; + } +} + +/* + * sched_clutch_root_urgency_dec() + * + * Routine to decrement the urgency at the root level based on the thread + * priority that is being removed from the hierarchy. The root urgency + * counter is updated based on the urgency of threads in any of the + * clutch buckets which are part of the hierarchy. + * + * Always called with the pset lock held. + */ +static void +sched_clutch_root_urgency_dec( + sched_clutch_root_t root_clutch, + thread_t thread) +{ + if (SCHED(priority_is_urgent)(thread->sched_pri)) { + root_clutch->scr_urgency--; + } +} + +/* + * Clutch bucket level scheduling + * + * The second level of scheduling is the clutch bucket level scheduling + * which tries to schedule thread groups within root_buckets. Each + * clutch represents a thread group and a clutch_bucket represents + * threads at a particular sched_bucket within that thread group. The + * goal of this level of scheduling is to allow interactive thread + * groups low latency access to the CPU. It also provides slight + * scheduling preference for App and unrestricted thread groups. + * + * The clutch bucket scheduling algorithm measures an interactivity + * score for all clutch buckets. The interactivity score is based + * on the ratio of the CPU used and the voluntary blocking of threads + * within the clutch bucket. The algorithm is very close to the ULE + * scheduler on FreeBSD in terms of calculations. The interactivity + * score provides an interactivity boost in the range of + * [0:SCHED_CLUTCH_BUCKET_INTERACTIVE_PRI * 2] which allows interactive + * thread groups to win over CPU spinners. + */ + +/* Priority boost range for interactivity */ +#define SCHED_CLUTCH_BUCKET_INTERACTIVE_PRI_DEFAULT (8) +uint8_t sched_clutch_bucket_interactive_pri = SCHED_CLUTCH_BUCKET_INTERACTIVE_PRI_DEFAULT; + +/* window to scale the cpu usage and blocked values (currently 500ms). Its the threshold of used+blocked */ +uint64_t sched_clutch_bucket_adjust_threshold = 0; +#define SCHED_CLUTCH_BUCKET_ADJUST_THRESHOLD_USECS (500000) + +/* The ratio to scale the cpu/blocked time per window */ +#define SCHED_CLUTCH_BUCKET_ADJUST_RATIO (10) + +/* rate at which interactivity score is recalculated. This keeps the score smooth in terms of extremely bursty behavior */ +uint64_t sched_clutch_bucket_interactivity_delta = 0; +#define SCHED_CLUTCH_BUCKET_INTERACTIVITY_DELTA_USECS_DEFAULT (25000) + +/* + * In order to allow App thread groups some preference over daemon thread + * groups, the App clutch_buckets get a 8 point boost. The boost value should + * be chosen such that badly behaved apps are still penalized over well + * behaved interactive daemon clutch_buckets. + */ +#define SCHED_CLUTCH_BUCKET_PRI_BOOST_DEFAULT (8) +uint8_t sched_clutch_bucket_pri_boost = SCHED_CLUTCH_BUCKET_PRI_BOOST_DEFAULT; + +/* Initial value for voluntary blocking time for the clutch_bucket */ +#define SCHED_CLUTCH_BUCKET_BLOCKED_TS_INVALID (uint32_t)(~0) + +/* + * sched_clutch_bucket_init() + * + * Initializer for clutch buckets. + */ +static void +sched_clutch_bucket_init( + sched_clutch_bucket_t clutch_bucket, + sched_clutch_t clutch, + sched_bucket_t bucket) +{ + bzero(clutch_bucket, sizeof(struct sched_clutch_bucket)); + + clutch_bucket->scb_bucket = bucket; + /* scb_priority will be recalculated when a thread is inserted in the clutch bucket */ + clutch_bucket->scb_priority = 0; + /* + * All thread groups should be initialized to be interactive; this allows the newly launched + * thread groups to fairly compete with already running thread groups. + */ + clutch_bucket->scb_interactivity_score = (sched_clutch_bucket_interactive_pri * 2); + clutch_bucket->scb_foreign = false; + + os_atomic_store(&clutch_bucket->scb_timeshare_tick, 0, relaxed); + os_atomic_store(&clutch_bucket->scb_pri_shift, INT8_MAX, relaxed); + + clutch_bucket->scb_interactivity_ts = 0; + clutch_bucket->scb_blocked_ts = SCHED_CLUTCH_BUCKET_BLOCKED_TS_INVALID; + priority_queue_entry_init(&clutch_bucket->scb_pqlink); + clutch_bucket->scb_clutch = clutch; + clutch_bucket->scb_root = NULL; + priority_queue_init(&clutch_bucket->scb_clutchpri_prioq, PRIORITY_QUEUE_BUILTIN_KEY | PRIORITY_QUEUE_MAX_HEAP); + run_queue_init(&clutch_bucket->scb_runq); +} + +/* + * sched_clutch_init_with_thread_group() + * + * Initialize the sched_clutch when the thread group is being created + */ +void +sched_clutch_init_with_thread_group( + sched_clutch_t clutch, + struct thread_group *tg) +{ + os_atomic_store(&clutch->sc_thr_count, 0, relaxed); + + /* Initialize all the clutch buckets */ + for (uint32_t i = 0; i < TH_BUCKET_SCHED_MAX; i++) { + sched_clutch_bucket_init(&(clutch->sc_clutch_buckets[i]), clutch, i); + } + + /* Grouping specific fields */ + clutch->sc_tg = tg; + os_atomic_store(&clutch->sc_tg_priority, 0, relaxed); +} + +/* + * sched_clutch_destroy() + * + * Destructor for clutch; called from thread group release code. + */ +void +sched_clutch_destroy( + __unused sched_clutch_t clutch) +{ + assert(os_atomic_load(&clutch->sc_thr_count, relaxed) == 0); +} + + +/* + * sched_clutch_bucket_hierarchy_insert() + * + * Routine to insert a newly runnable clutch_bucket into the root hierarchy. + */ +static void +sched_clutch_bucket_hierarchy_insert( + sched_clutch_root_t root_clutch, + sched_clutch_bucket_t clutch_bucket, + sched_bucket_t bucket, + uint64_t timestamp) +{ + sched_clutch_hierarchy_locked_assert(root_clutch); + if (bucket > TH_BUCKET_FIXPRI) { + /* Enqueue the timeshare clutch buckets into the global runnable clutch_bucket list; used for sched tick operations */ + enqueue_tail(&root_clutch->scr_clutch_buckets, &clutch_bucket->scb_listlink); + } + sched_clutch_root_bucket_t root_bucket = &root_clutch->scr_buckets[bucket]; + + /* If this is the first clutch bucket in the root bucket, insert the root bucket into the root priority queue */ + if (priority_queue_empty(&root_bucket->scrb_clutch_buckets)) { + sched_clutch_root_bucket_runnable(root_bucket, root_clutch, timestamp); + } + + /* Insert the clutch bucket into the root bucket priority queue */ + priority_queue_insert(&root_bucket->scrb_clutch_buckets, &clutch_bucket->scb_pqlink, clutch_bucket->scb_priority, PRIORITY_QUEUE_SCHED_PRI_MAX_HEAP_COMPARE); + os_atomic_store(&clutch_bucket->scb_root, root_clutch, relaxed); + KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_CLUTCH_TG_BUCKET_STATE) | DBG_FUNC_NONE, + thread_group_get_id(clutch_bucket->scb_clutch->sc_tg), clutch_bucket->scb_bucket, SCHED_CLUTCH_STATE_RUNNABLE, clutch_bucket->scb_priority, 0); +} + +/* + * sched_clutch_bucket_hierarchy_remove() + * + * Rotuine to remove a empty clutch bucket from the root hierarchy. + */ +static void +sched_clutch_bucket_hierarchy_remove( + sched_clutch_root_t root_clutch, + sched_clutch_bucket_t clutch_bucket, + sched_bucket_t bucket, + uint64_t timestamp) +{ + sched_clutch_hierarchy_locked_assert(root_clutch); + if (bucket > TH_BUCKET_FIXPRI) { + /* Remove the timeshare clutch bucket from the globally runnable clutch_bucket list */ + remqueue(&clutch_bucket->scb_listlink); + } + + sched_clutch_root_bucket_t root_bucket = &root_clutch->scr_buckets[bucket]; + + /* Remove the clutch bucket from the root bucket priority queue */ + priority_queue_remove(&root_bucket->scrb_clutch_buckets, &clutch_bucket->scb_pqlink, PRIORITY_QUEUE_SCHED_PRI_MAX_HEAP_COMPARE); + os_atomic_store(&clutch_bucket->scb_root, NULL, relaxed); + clutch_bucket->scb_blocked_ts = timestamp; + KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_CLUTCH_TG_BUCKET_STATE) | DBG_FUNC_NONE, + thread_group_get_id(clutch_bucket->scb_clutch->sc_tg), clutch_bucket->scb_bucket, SCHED_CLUTCH_STATE_EMPTY, 0, 0); + + /* If the root bucket priority queue is now empty, remove it from the root priority queue */ + if (priority_queue_empty(&root_bucket->scrb_clutch_buckets)) { + sched_clutch_root_bucket_empty(root_bucket, root_clutch, timestamp); + } +} + +/* + * sched_clutch_bucket_base_pri() + * + * Calculates the "base" priority of the clutch bucket. The base + * priority of the clutch bucket is the sum of the max of highest + * base_pri and highest sched_pri in the clutch bucket and any + * grouping specific (App/Daemon...) boosts applicable to the + * clutch_bucket. + */ +static uint8_t +sched_clutch_bucket_base_pri( + sched_clutch_bucket_t clutch_bucket) +{ + uint8_t clutch_boost = 0; + assert(clutch_bucket->scb_runq.count != 0); + + sched_clutch_t clutch = clutch_bucket->scb_clutch; + + /* + * Since the clutch bucket can contain threads that are members of the group due + * to the sched_pri being promoted or due to their base pri, the base priority of + * the entire clutch bucket should be based on the highest thread (promoted or base) + * in the clutch bucket. + */ + uint8_t max_pri = priority_queue_empty(&clutch_bucket->scb_clutchpri_prioq) ? 0 : priority_queue_max_key(&clutch_bucket->scb_clutchpri_prioq); + + /* + * For all AboveUI clutch buckets and clutch buckets for thread groups that + * havent been specified as SCHED_CLUTCH_TG_PRI_LOW, give a priority boost + */ + if ((clutch_bucket->scb_bucket == TH_BUCKET_FIXPRI) || + (os_atomic_load(&clutch->sc_tg_priority, relaxed) != SCHED_CLUTCH_TG_PRI_LOW)) { + clutch_boost = sched_clutch_bucket_pri_boost; + } + return max_pri + clutch_boost; +} + +/* + * sched_clutch_bucket_interactivity_score_calculate() + * + * Routine to calculate the interactivity score for the clutch bucket. The + * interactivity score is based on the ratio of CPU used by all threads in + * the bucket and the blocked time of the bucket as a whole. + */ +static uint8_t +sched_clutch_bucket_interactivity_score_calculate( + sched_clutch_bucket_t clutch_bucket, + uint64_t timestamp) +{ + if (clutch_bucket->scb_bucket == TH_BUCKET_FIXPRI) { + /* + * Since the root bucket selection algorithm for Above UI looks at clutch bucket + * priorities, make sure all AboveUI buckets are marked interactive. + */ + assert(clutch_bucket->scb_interactivity_score == (2 * sched_clutch_bucket_interactive_pri)); + return clutch_bucket->scb_interactivity_score; + } + + if (clutch_bucket->scb_interactivity_ts == 0) { + /* + * This indicates a newly initialized clutch bucket; return the default interactivity score + * and update timestamp. + */ + clutch_bucket->scb_interactivity_ts = timestamp; + return clutch_bucket->scb_interactivity_score; + } + + if (timestamp < (clutch_bucket->scb_interactivity_ts + sched_clutch_bucket_interactivity_delta)) { + return clutch_bucket->scb_interactivity_score; + } + + /* Check if the clutch bucket accounting needs to be scaled */ + sched_clutch_bucket_cpu_adjust(clutch_bucket); + clutch_bucket->scb_interactivity_ts = timestamp; + + sched_clutch_bucket_cpu_data_t scb_cpu_data; + scb_cpu_data.scbcd_cpu_data_packed = os_atomic_load_wide(&clutch_bucket->scb_cpu_data.scbcd_cpu_data_packed, relaxed); + clutch_cpu_data_t cpu_used = scb_cpu_data.cpu_data.scbcd_cpu_used; + clutch_cpu_data_t cpu_blocked = scb_cpu_data.cpu_data.scbcd_cpu_blocked; + + /* + * In extremely CPU contended cases, it is possible that the clutch bucket has been runnable + * for a long time but none of its threads have been picked up for execution. In that case, both + * the CPU used and blocked would be 0. + */ + if ((cpu_blocked == 0) && (cpu_used == 0)) { + return clutch_bucket->scb_interactivity_score; + } + + /* + * For all timeshare buckets, calculate the interactivity score of the bucket + * and add it to the base priority + */ + uint8_t interactive_score = 0; + if (cpu_blocked > cpu_used) { + /* Interactive clutch_bucket case */ + interactive_score = sched_clutch_bucket_interactive_pri + + ((sched_clutch_bucket_interactive_pri * (cpu_blocked - cpu_used)) / cpu_blocked); + } else { + /* Non-interactive clutch_bucket case */ + interactive_score = ((sched_clutch_bucket_interactive_pri * cpu_blocked) / cpu_used); + } + clutch_bucket->scb_interactivity_score = interactive_score; + return interactive_score; +} + +/* + * sched_clutch_bucket_pri_calculate() + * + * The priority calculation algorithm for the clutch_bucket is a slight + * modification on the ULE interactivity score. It uses the base priority + * of the clutch bucket and applies an interactivity score boost to the + * highly responsive clutch buckets. + */ + +static uint8_t +sched_clutch_bucket_pri_calculate( + sched_clutch_bucket_t clutch_bucket, + uint64_t timestamp) +{ + /* For empty clutch buckets, return priority 0 */ + if (clutch_bucket->scb_thr_count == 0) { + return 0; + } + + uint8_t base_pri = sched_clutch_bucket_base_pri(clutch_bucket); + uint8_t interactive_score = sched_clutch_bucket_interactivity_score_calculate(clutch_bucket, timestamp); + + assert(((uint64_t)base_pri + interactive_score) <= UINT8_MAX); + uint8_t pri = base_pri + interactive_score; + KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_CLUTCH_TG_BUCKET_PRI) | DBG_FUNC_NONE, + thread_group_get_id(clutch_bucket->scb_clutch->sc_tg), clutch_bucket->scb_bucket, pri, interactive_score, 0); + return pri; +} + +/* + * sched_clutch_root_bucket_highest_clutch_bucket() + * + * Routine to find the highest priority clutch bucket + * within the root bucket. + */ +static sched_clutch_bucket_t +sched_clutch_root_bucket_highest_clutch_bucket( + sched_clutch_root_bucket_t root_bucket) +{ + if (priority_queue_empty(&root_bucket->scrb_clutch_buckets)) { + return NULL; + } + return priority_queue_max(&root_bucket->scrb_clutch_buckets, struct sched_clutch_bucket, scb_pqlink); +} + +/* + * sched_clutch_bucket_runnable() + * + * Perform all operations needed when a new clutch bucket becomes runnable. + * It involves inserting the clutch_bucket into the hierarchy and updating the + * root priority appropriately. + */ +static boolean_t +sched_clutch_bucket_runnable( + sched_clutch_bucket_t clutch_bucket, + sched_clutch_root_t root_clutch, + uint64_t timestamp) +{ + sched_clutch_hierarchy_locked_assert(root_clutch); + sched_clutch_bucket_cpu_blocked_update(clutch_bucket, timestamp); + clutch_bucket->scb_priority = sched_clutch_bucket_pri_calculate(clutch_bucket, timestamp); + sched_clutch_bucket_hierarchy_insert(root_clutch, clutch_bucket, clutch_bucket->scb_bucket, timestamp); + /* Update the timesharing properties of this clutch_bucket; also done every sched_tick */ + sched_clutch_bucket_timeshare_update(clutch_bucket); + int16_t root_old_pri = root_clutch->scr_priority; + sched_clutch_root_pri_update(root_clutch); + return root_clutch->scr_priority > root_old_pri; +} + +/* + * sched_clutch_bucket_update() + * + * Update the clutch_bucket's position in the hierarchy based on whether + * the newly runnable thread changes its priority. Also update the root + * priority accordingly. + */ +static boolean_t +sched_clutch_bucket_update( + sched_clutch_bucket_t clutch_bucket, + sched_clutch_root_t root_clutch, + uint64_t timestamp) +{ + sched_clutch_hierarchy_locked_assert(root_clutch); + uint64_t new_pri = sched_clutch_bucket_pri_calculate(clutch_bucket, timestamp); + if (new_pri == clutch_bucket->scb_priority) { + return false; + } + struct priority_queue *bucket_prioq = &root_clutch->scr_buckets[clutch_bucket->scb_bucket].scrb_clutch_buckets; + + if (new_pri < clutch_bucket->scb_priority) { + clutch_bucket->scb_priority = new_pri; + priority_queue_entry_decrease(bucket_prioq, &clutch_bucket->scb_pqlink, + clutch_bucket->scb_priority, PRIORITY_QUEUE_SCHED_PRI_MAX_HEAP_COMPARE); + } else { + clutch_bucket->scb_priority = new_pri; + priority_queue_entry_increase(bucket_prioq, &clutch_bucket->scb_pqlink, + clutch_bucket->scb_priority, PRIORITY_QUEUE_SCHED_PRI_MAX_HEAP_COMPARE); + } + + int16_t root_old_pri = root_clutch->scr_priority; + sched_clutch_root_pri_update(root_clutch); + return root_clutch->scr_priority > root_old_pri; +} + +/* + * sched_clutch_bucket_empty() + * + * Perform all the operations needed when a clutch_bucket is no longer runnable. + * It involves removing the clutch bucket from the hierarchy and updaing the root + * priority appropriately. + */ +static void +sched_clutch_bucket_empty( + sched_clutch_bucket_t clutch_bucket, + sched_clutch_root_t root_clutch, + uint64_t timestamp) +{ + sched_clutch_hierarchy_locked_assert(root_clutch); + sched_clutch_bucket_hierarchy_remove(root_clutch, clutch_bucket, clutch_bucket->scb_bucket, timestamp); + clutch_bucket->scb_priority = sched_clutch_bucket_pri_calculate(clutch_bucket, timestamp); + sched_clutch_root_pri_update(root_clutch); +} + +/* + * sched_clutch_cpu_usage_update() + * + * Routine to update CPU usage of the thread in the hierarchy. + */ +void +sched_clutch_cpu_usage_update( + thread_t thread, + uint64_t delta) +{ + if (!SCHED_CLUTCH_THREAD_ELIGIBLE(thread)) { + return; + } + sched_clutch_t clutch = sched_clutch_for_thread(thread); + sched_clutch_bucket_t clutch_bucket = &(clutch->sc_clutch_buckets[thread->th_sched_bucket]); + sched_clutch_bucket_cpu_usage_update(clutch_bucket, delta); +} + +/* + * sched_clutch_bucket_cpu_usage_update() + * + * Routine to update the CPU usage of the clutch_bucket. + */ +static void +sched_clutch_bucket_cpu_usage_update( + sched_clutch_bucket_t clutch_bucket, + uint64_t delta) +{ + if (clutch_bucket->scb_bucket == TH_BUCKET_FIXPRI) { + /* Since Above UI bucket has maximum interactivity score always, nothing to do here */ + return; + } + + /* + * The CPU usage should not overflow the clutch_cpu_data_t type. Since the usage is used to + * calculate interactivity score, it is safe to restrict it to CLUTCH_CPU_DATA_MAX. + */ + delta = MIN(delta, CLUTCH_CPU_DATA_MAX); + os_atomic_add_orig(&(clutch_bucket->scb_cpu_data.cpu_data.scbcd_cpu_used), (clutch_cpu_data_t)delta, relaxed); +} + +/* + * sched_clutch_bucket_cpu_blocked_update() + * + * Routine to update CPU blocked time for clutch_bucket. + */ +static void +sched_clutch_bucket_cpu_blocked_update( + sched_clutch_bucket_t clutch_bucket, + uint64_t timestamp) +{ + if ((clutch_bucket->scb_bucket == TH_BUCKET_FIXPRI) || + (clutch_bucket->scb_blocked_ts == SCHED_CLUTCH_BUCKET_BLOCKED_TS_INVALID)) { + /* For Above UI bucket and a newly initialized clutch bucket, nothing to do here */ + return; + } + + uint64_t blocked_time = timestamp - clutch_bucket->scb_blocked_ts; + if (blocked_time > sched_clutch_bucket_adjust_threshold) { + blocked_time = sched_clutch_bucket_adjust_threshold; + } + + /* + * The CPU blocked should not overflow the clutch_cpu_data_t type. Since the blocked is used to + * calculate interactivity score, it is safe to restrict it to CLUTCH_CPU_DATA_MAX. + */ + blocked_time = MIN(blocked_time, CLUTCH_CPU_DATA_MAX); + clutch_cpu_data_t __assert_only cpu_blocked_orig = os_atomic_add_orig(&(clutch_bucket->scb_cpu_data.cpu_data.scbcd_cpu_blocked), (clutch_cpu_data_t)blocked_time, relaxed); + /* The blocked time is scaled every so often, it should never overflow */ + assert(blocked_time <= (CLUTCH_CPU_DATA_MAX - cpu_blocked_orig)); +} + +/* + * sched_clutch_bucket_cpu_adjust() + * + * Routine to scale the cpu usage and blocked time once the sum gets bigger + * than sched_clutch_bucket_adjust_threshold. Allows the values to remain + * manageable and maintain the same ratio while allowing clutch buckets to + * adjust behavior and reflect in the interactivity score in a reasonable + * amount of time. + */ +static void +sched_clutch_bucket_cpu_adjust( + sched_clutch_bucket_t clutch_bucket) +{ + sched_clutch_bucket_cpu_data_t old_cpu_data = {}; + sched_clutch_bucket_cpu_data_t new_cpu_data = {}; + do { + old_cpu_data.scbcd_cpu_data_packed = os_atomic_load_wide(&clutch_bucket->scb_cpu_data.scbcd_cpu_data_packed, relaxed); + clutch_cpu_data_t cpu_used = old_cpu_data.cpu_data.scbcd_cpu_used; + clutch_cpu_data_t cpu_blocked = old_cpu_data.cpu_data.scbcd_cpu_blocked; + if ((cpu_used + cpu_blocked) < sched_clutch_bucket_adjust_threshold) { + return; + } + + /* + * The accumulation of CPU used and blocked is past the threshold; scale it + * down to lose old history. + */ + new_cpu_data.cpu_data.scbcd_cpu_used = cpu_used / SCHED_CLUTCH_BUCKET_ADJUST_RATIO; + new_cpu_data.cpu_data.scbcd_cpu_blocked = cpu_blocked / SCHED_CLUTCH_BUCKET_ADJUST_RATIO; + } while (!os_atomic_cmpxchg(&clutch_bucket->scb_cpu_data.scbcd_cpu_data_packed, old_cpu_data.scbcd_cpu_data_packed, new_cpu_data.scbcd_cpu_data_packed, relaxed)); +} + +/* + * Thread level scheduling algorithm + * + * The thread level scheduling algorithm uses the mach timeshare + * decay based algorithm to achieve sharing between threads within the + * same clutch bucket. The load/priority shifts etc. are all maintained + * at the clutch bucket level and used for decay calculation of the + * threads. The load sampling is still driven off the scheduler tick + * for runnable clutch buckets (it does not use the new higher frequency + * EWMA based load calculation). The idea is that the contention and load + * within clutch_buckets should be limited enough to not see heavy decay + * and timeshare effectively. + */ + +/* + * sched_clutch_thread_run_bucket_incr() / sched_clutch_run_bucket_incr() + * + * Increment the run count for the clutch bucket associated with the + * thread. + */ +uint32_t +sched_clutch_thread_run_bucket_incr( + thread_t thread, + sched_bucket_t bucket) +{ + if (!SCHED_CLUTCH_THREAD_ELIGIBLE(thread)) { + return 0; + } + sched_clutch_t clutch = sched_clutch_for_thread(thread); + return sched_clutch_run_bucket_incr(clutch, bucket); +} + +static uint32_t +sched_clutch_run_bucket_incr( + sched_clutch_t clutch, + sched_bucket_t bucket) +{ + assert(bucket != TH_BUCKET_RUN); + sched_clutch_bucket_t clutch_bucket = &(clutch->sc_clutch_buckets[bucket]); + uint32_t result = os_atomic_inc(&(clutch_bucket->scb_run_count), relaxed); + return result; +} + +/* + * sched_clutch_thread_run_bucket_decr() / sched_clutch_run_bucket_decr() + * + * Decrement the run count for the clutch bucket associated with the + * thread. + */ +uint32_t +sched_clutch_thread_run_bucket_decr( + thread_t thread, + sched_bucket_t bucket) +{ + if (!SCHED_CLUTCH_THREAD_ELIGIBLE(thread)) { + return 0; + } + sched_clutch_t clutch = sched_clutch_for_thread(thread); + return sched_clutch_run_bucket_decr(clutch, bucket); +} + +static uint32_t +sched_clutch_run_bucket_decr( + sched_clutch_t clutch, + sched_bucket_t bucket) +{ + assert(bucket != TH_BUCKET_RUN); + sched_clutch_bucket_t clutch_bucket = &(clutch->sc_clutch_buckets[bucket]); + uint32_t result = os_atomic_dec(&(clutch_bucket->scb_run_count), relaxed); + return result; +} + +/* + * sched_clutch_bucket_timeshare_update() + * + * Routine to update the load and priority shift for the clutch_bucket every + * sched_tick. For runnable clutch_buckets, the sched tick handling code + * iterates the clutch buckets and calls this routine. For all others, the + * clutch_bucket maintains a "last updated schedtick" parameter. As threads + * become runnable in the clutch bucket, if this value is outdated, the load + * and shifts are updated. + * + * Possible optimization: + * - The current algorithm samples the load every sched tick (125ms). + * This is prone to spikes in runnable counts; if that turns out to be + * a problem, a simple solution would be to do the EWMA trick to sample + * load at every load_tick (30ms) and use the averaged value for the pri + * shift calculation. + */ +static void +sched_clutch_bucket_timeshare_update( + sched_clutch_bucket_t clutch_bucket) +{ + if (clutch_bucket->scb_bucket < TH_BUCKET_SHARE_FG) { + return; + } + + /* + * Update the timeshare parameters for the clutch bucket if they havent been updated + * in this tick. + */ + uint32_t bucket_sched_ts = os_atomic_load(&clutch_bucket->scb_timeshare_tick, relaxed); + uint32_t current_sched_ts = sched_tick; + if (bucket_sched_ts != current_sched_ts) { + os_atomic_store(&clutch_bucket->scb_timeshare_tick, current_sched_ts, relaxed); + uint32_t bucket_load = (os_atomic_load(&clutch_bucket->scb_run_count, relaxed) / processor_avail_count); + bucket_load = MIN(bucket_load, NRQS - 1); + uint32_t pri_shift = sched_fixed_shift - sched_load_shifts[bucket_load]; + os_atomic_store(&clutch_bucket->scb_pri_shift, pri_shift, relaxed); + } +} + +/* + * sched_clutch_thread_clutch_update() + * + * Routine called when the thread changes its thread group. The current + * implementation relies on the fact that the thread group is changed only + * from the context of the thread itself. Due to this fact, the thread + * group change causes only counter updates in the old & new clutch + * buckets and no hierarchy changes. The routine also attributes the CPU + * used so far to the old clutch. + */ +void +sched_clutch_thread_clutch_update( + thread_t thread, + sched_clutch_t old_clutch, + sched_clutch_t new_clutch) +{ + uint32_t cpu_delta; + assert(current_thread() == thread); + + if (old_clutch) { + sched_clutch_run_bucket_decr(old_clutch, thread->th_sched_bucket); + /* + * Calculate the CPU used by this thread in the old bucket and + * add it to the old clutch bucket. This uses the same CPU usage + * logic as update_priority etc. + */ + thread_timer_delta(thread, cpu_delta); + if (thread->pri_shift < INT8_MAX) { + thread->sched_usage += cpu_delta; + } + thread->cpu_delta += cpu_delta; + sched_clutch_bucket_cpu_usage_update(&(old_clutch->sc_clutch_buckets[thread->th_sched_bucket]), cpu_delta); + } + + if (new_clutch) { + sched_clutch_run_bucket_incr(new_clutch, thread->th_sched_bucket); + } +} + +/* Thread Insertion/Removal/Selection routines */ + +/* + * sched_clutch_thread_insert() + * + * Routine to insert a thread into the sched clutch hierarchy. + * Update the counts at all levels of the hierarchy and insert the nodes + * as they become runnable. Always called with the pset lock held. + */ +static boolean_t +sched_clutch_thread_insert( + sched_clutch_root_t root_clutch, + thread_t thread, + integer_t options) +{ + boolean_t result = FALSE; + + sched_clutch_hierarchy_locked_assert(root_clutch); + sched_clutch_t clutch = sched_clutch_for_thread(thread); + assert(thread->thread_group == clutch->sc_tg); + + uint64_t current_timestamp = mach_absolute_time(); + sched_clutch_bucket_t clutch_bucket = &(clutch->sc_clutch_buckets[thread->th_sched_bucket]); + assert((clutch_bucket->scb_root == NULL) || (clutch_bucket->scb_root == root_clutch)); + + /* Insert thread into the clutch_bucket runq using sched_pri */ + run_queue_enqueue(&clutch_bucket->scb_runq, thread, options); + /* Increment the urgency counter for the root if necessary */ + sched_clutch_root_urgency_inc(root_clutch, thread); + + /* Insert thread into clutch_bucket priority queue based on the promoted or base priority */ + priority_queue_insert(&clutch_bucket->scb_clutchpri_prioq, &thread->sched_clutchpri_link, + sched_thread_sched_pri_promoted(thread) ? thread->sched_pri : thread->base_pri, + PRIORITY_QUEUE_SCHED_PRI_MAX_HEAP_COMPARE); + os_atomic_inc(&clutch->sc_thr_count, relaxed); + KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_CLUTCH_THREAD_STATE) | DBG_FUNC_NONE, + thread_group_get_id(clutch_bucket->scb_clutch->sc_tg), clutch_bucket->scb_bucket, thread_tid(thread), SCHED_CLUTCH_STATE_RUNNABLE, 0); + + /* Enqueue the clutch into the hierarchy (if needed) and update properties */ + if (clutch_bucket->scb_thr_count == 0) { + sched_clutch_thr_count_inc(&clutch_bucket->scb_thr_count); + sched_clutch_thr_count_inc(&root_clutch->scr_thr_count); + /* Insert the newly runnable clutch bucket into the hierarchy */ + result = sched_clutch_bucket_runnable(clutch_bucket, root_clutch, current_timestamp); + } else { + sched_clutch_thr_count_inc(&clutch_bucket->scb_thr_count); + sched_clutch_thr_count_inc(&root_clutch->scr_thr_count); + /* Update the position of the clutch bucket in the hierarchy */ + result = sched_clutch_bucket_update(clutch_bucket, root_clutch, current_timestamp); + } + return result; +} + +/* + * sched_clutch_thread_remove() + * + * Routine to remove a thread from the sched clutch hierarchy. + * Update the counts at all levels of the hierarchy and remove the nodes + * as they become empty. Always called with the pset lock held. + */ +static void +sched_clutch_thread_remove( + sched_clutch_root_t root_clutch, + thread_t thread, + uint64_t current_timestamp) +{ + sched_clutch_hierarchy_locked_assert(root_clutch); + sched_clutch_t clutch = sched_clutch_for_thread(thread); + assert(thread->thread_group == clutch->sc_tg); + assert(thread->runq != PROCESSOR_NULL); + + sched_clutch_bucket_t clutch_bucket = &(clutch->sc_clutch_buckets[thread->th_sched_bucket]); + assert(clutch_bucket->scb_root == root_clutch); + + /* Decrement the urgency counter for the root if necessary */ + sched_clutch_root_urgency_dec(root_clutch, thread); + /* Remove thread from the clutch_bucket */ + run_queue_remove(&clutch_bucket->scb_runq, thread); + + priority_queue_remove(&clutch_bucket->scb_clutchpri_prioq, &thread->sched_clutchpri_link, + PRIORITY_QUEUE_SCHED_PRI_MAX_HEAP_COMPARE); + KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_CLUTCH_THREAD_STATE) | DBG_FUNC_NONE, + thread_group_get_id(clutch_bucket->scb_clutch->sc_tg), clutch_bucket->scb_bucket, thread_tid(thread), SCHED_CLUTCH_STATE_EMPTY, 0); + + /* Update counts at various levels of the hierarchy */ + os_atomic_dec(&clutch->sc_thr_count, relaxed); + sched_clutch_thr_count_dec(&root_clutch->scr_thr_count); + sched_clutch_thr_count_dec(&clutch_bucket->scb_thr_count); + + /* Remove the clutch from hierarchy (if needed) and update properties */ + if (clutch_bucket->scb_thr_count == 0) { + sched_clutch_bucket_empty(clutch_bucket, root_clutch, current_timestamp); + } else { + sched_clutch_bucket_update(clutch_bucket, root_clutch, current_timestamp); + } +} + +/* + * sched_clutch_thread_highest() + * + * Routine to find and remove the highest priority thread + * from the sched clutch hierarchy. The algorithm looks at the + * hierarchy for the most eligible runnable thread and calls + * sched_clutch_thread_remove(). Always called with the + * pset lock held. + */ +static thread_t +sched_clutch_thread_highest( + sched_clutch_root_t root_clutch) +{ + sched_clutch_hierarchy_locked_assert(root_clutch); + uint64_t current_timestamp = mach_absolute_time(); + + /* Select the highest priority root bucket */ + sched_clutch_root_bucket_t root_bucket = sched_clutch_root_highest_root_bucket(root_clutch, current_timestamp); + if (root_bucket == NULL) { + return THREAD_NULL; + } + /* Since a thread is being picked from this root bucket, update its deadline */ + sched_clutch_root_bucket_deadline_update(root_bucket, root_clutch, current_timestamp); + + /* Find the highest priority clutch bucket in this root bucket */ + sched_clutch_bucket_t clutch_bucket = sched_clutch_root_bucket_highest_clutch_bucket(root_bucket); + assert(clutch_bucket != NULL); + + /* Find the highest priority runnable thread in this clutch bucket */ + thread_t thread = run_queue_peek(&clutch_bucket->scb_runq); + assert(thread != NULL); + + /* Remove and return the thread from the hierarchy */ + sched_clutch_thread_remove(root_clutch, thread, current_timestamp); + KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_CLUTCH_THREAD_SELECT) | DBG_FUNC_NONE, + thread_tid(thread), thread_group_get_id(clutch_bucket->scb_clutch->sc_tg), clutch_bucket->scb_bucket, 0, 0); + return thread; +} + + +/* High level global accessor routines */ + +/* + * sched_clutch_root_urgency() + * + * Routine to get the urgency of the highest runnable + * thread in the hierarchy. + */ +static uint32_t +sched_clutch_root_urgency( + sched_clutch_root_t root_clutch) +{ + return root_clutch->scr_urgency; +} + +/* + * sched_clutch_root_count_sum() + * + * The count_sum mechanism is used for scheduler runq + * statistics calculation. Its only useful for debugging + * purposes; since it takes a mach_absolute_time() on + * other scheduler implementations, its better to avoid + * populating this until absolutely necessary. + */ +static uint32_t +sched_clutch_root_count_sum( + __unused sched_clutch_root_t root_clutch) +{ + return 0; +} + +/* + * sched_clutch_root_priority() + * + * Routine to get the priority of the highest runnable + * thread in the hierarchy. + */ +static int +sched_clutch_root_priority( + sched_clutch_root_t root_clutch) +{ + return root_clutch->scr_priority; +} + +/* + * sched_clutch_root_count() + * + * Returns total number of runnable threads in the hierarchy. + */ +uint32_t +sched_clutch_root_count( + sched_clutch_root_t root_clutch) +{ + return root_clutch->scr_thr_count; +} + +/* + * sched_clutch_thread_pri_shift() + * + * Routine to get the priority shift value for a thread. + * Since the timesharing is done at the clutch_bucket level, + * this routine gets the clutch_bucket and retrieves the + * values from there. + */ +uint32_t +sched_clutch_thread_pri_shift( + thread_t thread, + sched_bucket_t bucket) +{ + if (!SCHED_CLUTCH_THREAD_ELIGIBLE(thread)) { + return UINT8_MAX; + } + assert(bucket != TH_BUCKET_RUN); + sched_clutch_t clutch = sched_clutch_for_thread(thread); + sched_clutch_bucket_t clutch_bucket = &(clutch->sc_clutch_buckets[bucket]); + return os_atomic_load(&clutch_bucket->scb_pri_shift, relaxed); +} + +#pragma mark -- Clutch Scheduler Algorithm + +static void +sched_clutch_init(void); + +static void +sched_clutch_timebase_init(void); + +static thread_t +sched_clutch_steal_thread(processor_set_t pset); + +static void +sched_clutch_thread_update_scan(sched_update_scan_context_t scan_context); + +static boolean_t +sched_clutch_processor_enqueue(processor_t processor, thread_t thread, + sched_options_t options); + +static boolean_t +sched_clutch_processor_queue_remove(processor_t processor, thread_t thread); + +static ast_t +sched_clutch_processor_csw_check(processor_t processor); + +static boolean_t +sched_clutch_processor_queue_has_priority(processor_t processor, int priority, boolean_t gte); + +static int +sched_clutch_runq_count(processor_t processor); + +static boolean_t +sched_clutch_processor_queue_empty(processor_t processor); + +static uint64_t +sched_clutch_runq_stats_count_sum(processor_t processor); + +static int +sched_clutch_processor_bound_count(processor_t processor); + +static void +sched_clutch_pset_init(processor_set_t pset); + +static void +sched_clutch_processor_init(processor_t processor); + +static thread_t +sched_clutch_choose_thread(processor_t processor, int priority, ast_t reason); + +static void +sched_clutch_processor_queue_shutdown(processor_t processor); + +static sched_mode_t +sched_clutch_initial_thread_sched_mode(task_t parent_task); + +static uint32_t +sched_clutch_initial_quantum_size(thread_t thread); + +static bool +sched_clutch_thread_avoid_processor(processor_t processor, thread_t thread); + +static uint32_t +sched_clutch_run_incr(thread_t thread); + +static uint32_t +sched_clutch_run_decr(thread_t thread); + +static void +sched_clutch_update_thread_bucket(thread_t thread); + +const struct sched_dispatch_table sched_clutch_dispatch = { + .sched_name = "clutch", + .init = sched_clutch_init, + .timebase_init = sched_clutch_timebase_init, + .processor_init = sched_clutch_processor_init, + .pset_init = sched_clutch_pset_init, + .maintenance_continuation = sched_timeshare_maintenance_continue, + .choose_thread = sched_clutch_choose_thread, + .steal_thread_enabled = sched_steal_thread_enabled, + .steal_thread = sched_clutch_steal_thread, + .compute_timeshare_priority = sched_compute_timeshare_priority, + .choose_processor = choose_processor, + .processor_enqueue = sched_clutch_processor_enqueue, + .processor_queue_shutdown = sched_clutch_processor_queue_shutdown, + .processor_queue_remove = sched_clutch_processor_queue_remove, + .processor_queue_empty = sched_clutch_processor_queue_empty, + .priority_is_urgent = priority_is_urgent, + .processor_csw_check = sched_clutch_processor_csw_check, + .processor_queue_has_priority = sched_clutch_processor_queue_has_priority, + .initial_quantum_size = sched_clutch_initial_quantum_size, + .initial_thread_sched_mode = sched_clutch_initial_thread_sched_mode, + .can_update_priority = can_update_priority, + .update_priority = update_priority, + .lightweight_update_priority = lightweight_update_priority, + .quantum_expire = sched_default_quantum_expire, + .processor_runq_count = sched_clutch_runq_count, + .processor_runq_stats_count_sum = sched_clutch_runq_stats_count_sum, + .processor_bound_count = sched_clutch_processor_bound_count, + .thread_update_scan = sched_clutch_thread_update_scan, + .multiple_psets_enabled = TRUE, + .sched_groups_enabled = FALSE, + .avoid_processor_enabled = TRUE, + .thread_avoid_processor = sched_clutch_thread_avoid_processor, + .processor_balance = sched_SMT_balance, + + .rt_runq = sched_rtglobal_runq, + .rt_init = sched_rtglobal_init, + .rt_queue_shutdown = sched_rtglobal_queue_shutdown, + .rt_runq_scan = sched_rtglobal_runq_scan, + .rt_runq_count_sum = sched_rtglobal_runq_count_sum, + + .qos_max_parallelism = sched_qos_max_parallelism, + .check_spill = sched_check_spill, + .ipi_policy = sched_ipi_policy, + .thread_should_yield = sched_thread_should_yield, + .run_count_incr = sched_clutch_run_incr, + .run_count_decr = sched_clutch_run_decr, + .update_thread_bucket = sched_clutch_update_thread_bucket, + .pset_made_schedulable = sched_pset_made_schedulable, +}; + +__attribute__((always_inline)) +static inline run_queue_t +sched_clutch_bound_runq(processor_t processor) +{ + return &processor->runq; +} + +__attribute__((always_inline)) +static inline sched_clutch_root_t +sched_clutch_processor_root_clutch(processor_t processor) +{ + return &processor->processor_set->pset_clutch_root; +} + +__attribute__((always_inline)) +static inline run_queue_t +sched_clutch_thread_bound_runq(processor_t processor, __assert_only thread_t thread) +{ + assert(thread->bound_processor == processor); + return sched_clutch_bound_runq(processor); +} + +static uint32_t +sched_clutch_initial_quantum_size(thread_t thread) +{ + if (thread == THREAD_NULL) { + return std_quantum; + } + assert(sched_clutch_thread_quantum[thread->th_sched_bucket] <= UINT32_MAX); + return (uint32_t)sched_clutch_thread_quantum[thread->th_sched_bucket]; +} + +static sched_mode_t +sched_clutch_initial_thread_sched_mode(task_t parent_task) +{ + if (parent_task == kernel_task) { + return TH_MODE_FIXED; + } else { + return TH_MODE_TIMESHARE; + } +} + +static void +sched_clutch_processor_init(processor_t processor) +{ + run_queue_init(&processor->runq); +} + +static void +sched_clutch_pset_init(processor_set_t pset) +{ + sched_clutch_root_init(&pset->pset_clutch_root, pset); +} + +static void +sched_clutch_init(void) +{ + if (!PE_parse_boot_argn("sched_clutch_bucket_interactive_pri", &sched_clutch_bucket_interactive_pri, sizeof(sched_clutch_bucket_interactive_pri))) { + sched_clutch_bucket_interactive_pri = SCHED_CLUTCH_BUCKET_INTERACTIVE_PRI_DEFAULT; + } + if (!PE_parse_boot_argn("sched_clutch_bucket_pri_boost", &sched_clutch_bucket_pri_boost, sizeof(sched_clutch_bucket_pri_boost))) { + sched_clutch_bucket_pri_boost = SCHED_CLUTCH_BUCKET_PRI_BOOST_DEFAULT; + } + sched_timeshare_init(); +} + +static void +sched_clutch_timebase_init(void) +{ + sched_timeshare_timebase_init(); + sched_clutch_us_to_abstime(sched_clutch_root_bucket_wcel_us, sched_clutch_root_bucket_wcel); + sched_clutch_us_to_abstime(sched_clutch_root_bucket_warp_us, sched_clutch_root_bucket_warp); + sched_clutch_us_to_abstime(sched_clutch_thread_quantum_us, sched_clutch_thread_quantum); + clock_interval_to_absolutetime_interval(SCHED_CLUTCH_BUCKET_ADJUST_THRESHOLD_USECS, + NSEC_PER_USEC, &sched_clutch_bucket_adjust_threshold); + + uint32_t interactivity_delta = 0; + if (!PE_parse_boot_argn("sched_clutch_bucket_interactivity_delta_usecs", &interactivity_delta, sizeof(interactivity_delta))) { + interactivity_delta = SCHED_CLUTCH_BUCKET_INTERACTIVITY_DELTA_USECS_DEFAULT; + } + clock_interval_to_absolutetime_interval(interactivity_delta, NSEC_PER_USEC, &sched_clutch_bucket_interactivity_delta); +} + +static thread_t +sched_clutch_choose_thread( + processor_t processor, + int priority, + __unused ast_t reason) +{ + int clutch_pri = sched_clutch_root_priority(sched_clutch_processor_root_clutch(processor)); + uint32_t clutch_count = sched_clutch_root_count(sched_clutch_processor_root_clutch(processor)); + run_queue_t bound_runq = sched_clutch_bound_runq(processor); + boolean_t choose_from_boundq = false; + + if (bound_runq->highq < priority && + clutch_pri < priority) { + return THREAD_NULL; + } + + if (bound_runq->count && clutch_count) { + if (bound_runq->highq >= clutch_pri) { + choose_from_boundq = true; + } + } else if (bound_runq->count) { + choose_from_boundq = true; + } else if (clutch_count) { + choose_from_boundq = false; + } else { + return THREAD_NULL; + } + + thread_t thread = THREAD_NULL; + if (choose_from_boundq == false) { + sched_clutch_root_t pset_clutch_root = sched_clutch_processor_root_clutch(processor); + thread = sched_clutch_thread_highest(pset_clutch_root); + } else { + thread = run_queue_dequeue(bound_runq, SCHED_HEADQ); + } + return thread; +} + +static boolean_t +sched_clutch_processor_enqueue( + processor_t processor, + thread_t thread, + sched_options_t options) +{ + boolean_t result; + + thread->runq = processor; + if (SCHED_CLUTCH_THREAD_ELIGIBLE(thread)) { + sched_clutch_root_t pset_clutch_root = sched_clutch_processor_root_clutch(processor); + result = sched_clutch_thread_insert(pset_clutch_root, thread, options); + } else { + run_queue_t rq = sched_clutch_thread_bound_runq(processor, thread); + result = run_queue_enqueue(rq, thread, options); + } + return result; +} + +static boolean_t +sched_clutch_processor_queue_empty(processor_t processor) +{ + return sched_clutch_root_count(sched_clutch_processor_root_clutch(processor)) == 0 && + sched_clutch_bound_runq(processor)->count == 0; +} + +static ast_t +sched_clutch_processor_csw_check(processor_t processor) +{ + boolean_t has_higher; + int pri; + + if (sched_clutch_thread_avoid_processor(processor, current_thread())) { + return AST_PREEMPT | AST_URGENT; + } + + run_queue_t bound_runq = sched_clutch_bound_runq(processor); + int clutch_pri = sched_clutch_root_priority(sched_clutch_processor_root_clutch(processor)); + + assert(processor->active_thread != NULL); + + pri = MAX(clutch_pri, bound_runq->highq); + + if (processor->first_timeslice) { + has_higher = (pri > processor->current_pri); + } else { + has_higher = (pri >= processor->current_pri); + } + + if (has_higher) { + if (sched_clutch_root_urgency(sched_clutch_processor_root_clutch(processor)) > 0) { + return AST_PREEMPT | AST_URGENT; + } + + if (bound_runq->urgency > 0) { + return AST_PREEMPT | AST_URGENT; + } + + return AST_PREEMPT; + } + + return AST_NONE; +} + +static boolean_t +sched_clutch_processor_queue_has_priority(processor_t processor, + int priority, + boolean_t gte) +{ + run_queue_t bound_runq = sched_clutch_bound_runq(processor); + + int qpri = MAX(sched_clutch_root_priority(sched_clutch_processor_root_clutch(processor)), bound_runq->highq); + + if (gte) { + return qpri >= priority; + } else { + return qpri > priority; + } +} + +static int +sched_clutch_runq_count(processor_t processor) +{ + return (int)sched_clutch_root_count(sched_clutch_processor_root_clutch(processor)) + sched_clutch_bound_runq(processor)->count; +} + +static uint64_t +sched_clutch_runq_stats_count_sum(processor_t processor) +{ + uint64_t bound_sum = sched_clutch_bound_runq(processor)->runq_stats.count_sum; + + if (processor->cpu_id == processor->processor_set->cpu_set_low) { + return bound_sum + sched_clutch_root_count_sum(sched_clutch_processor_root_clutch(processor)); + } else { + return bound_sum; + } +} +static int +sched_clutch_processor_bound_count(processor_t processor) +{ + return sched_clutch_bound_runq(processor)->count; +} + +static void +sched_clutch_processor_queue_shutdown(processor_t processor) +{ + processor_set_t pset = processor->processor_set; + sched_clutch_root_t pset_clutch_root = sched_clutch_processor_root_clutch(processor); + thread_t thread; + queue_head_t tqueue; + + /* We only need to migrate threads if this is the last active processor in the pset */ + if (pset->online_processor_count > 0) { + pset_unlock(pset); + return; + } + + queue_init(&tqueue); + while (sched_clutch_root_count(pset_clutch_root) > 0) { + thread = sched_clutch_thread_highest(pset_clutch_root); + enqueue_tail(&tqueue, &thread->runq_links); + } + + pset_unlock(pset); + + qe_foreach_element_safe(thread, &tqueue, runq_links) { + remqueue(&thread->runq_links); + + thread_lock(thread); + + thread_setrun(thread, SCHED_TAILQ); + + thread_unlock(thread); + } +} + +static boolean_t +sched_clutch_processor_queue_remove( + processor_t processor, + thread_t thread) +{ + run_queue_t rq; + processor_set_t pset = processor->processor_set; + + pset_lock(pset); + + if (processor == thread->runq) { + /* + * Thread is on a run queue and we have a lock on + * that run queue. + */ + if (SCHED_CLUTCH_THREAD_ELIGIBLE(thread)) { + sched_clutch_root_t pset_clutch_root = sched_clutch_processor_root_clutch(processor); + sched_clutch_thread_remove(pset_clutch_root, thread, mach_absolute_time()); + } else { + rq = sched_clutch_thread_bound_runq(processor, thread); + run_queue_remove(rq, thread); + } + } else { + /* + * The thread left the run queue before we could + * lock the run queue. + */ + assert(thread->runq == PROCESSOR_NULL); + processor = PROCESSOR_NULL; + } + + pset_unlock(pset); + + return processor != PROCESSOR_NULL; +} + +static thread_t +sched_clutch_steal_thread(processor_set_t pset) +{ + processor_set_t nset, cset = pset; + thread_t thread; + + do { + sched_clutch_root_t pset_clutch_root = &cset->pset_clutch_root; + if (sched_clutch_root_count(pset_clutch_root) > 0) { + thread = sched_clutch_thread_highest(pset_clutch_root); + pset_unlock(cset); + return thread; + } + + nset = next_pset(cset); + + if (nset != pset) { + pset_unlock(cset); + + cset = nset; + pset_lock(cset); + } + } while (nset != pset); + + pset_unlock(cset); + + return THREAD_NULL; +} + +static void +sched_clutch_thread_update_scan(sched_update_scan_context_t scan_context) +{ + boolean_t restart_needed = FALSE; + processor_t processor = processor_list; + processor_set_t pset; + thread_t thread; + spl_t s; + + /* + * We update the threads associated with each processor (bound and idle threads) + * and then update the threads in each pset runqueue. + */ + + do { + do { + pset = processor->processor_set; + + s = splsched(); + pset_lock(pset); + + restart_needed = runq_scan(sched_clutch_bound_runq(processor), scan_context); + + pset_unlock(pset); + splx(s); + + if (restart_needed) { + break; + } + + thread = processor->idle_thread; + if (thread != THREAD_NULL && thread->sched_stamp != sched_tick) { + if (thread_update_add_thread(thread) == FALSE) { + restart_needed = TRUE; + break; + } + } + } while ((processor = processor->processor_list) != NULL); + + /* Ok, we now have a collection of candidates -- fix them. */ + thread_update_process_threads(); + } while (restart_needed); + + pset = &pset0; + + do { + do { + s = splsched(); + pset_lock(pset); + + if (sched_clutch_root_count(&pset->pset_clutch_root) > 0) { + queue_t clutch_bucket_list = &pset->pset_clutch_root.scr_clutch_buckets; + sched_clutch_bucket_t clutch_bucket; + qe_foreach_element(clutch_bucket, clutch_bucket_list, scb_listlink) { + sched_clutch_bucket_timeshare_update(clutch_bucket); + restart_needed = runq_scan(&clutch_bucket->scb_runq, scan_context); + if (restart_needed) { + break; + } + } + } + + pset_unlock(pset); + splx(s); + if (restart_needed) { + break; + } + } while ((pset = pset->pset_list) != NULL); + + /* Ok, we now have a collection of candidates -- fix them. */ + thread_update_process_threads(); + } while (restart_needed); +} + +extern int sched_allow_rt_smt; + +/* Return true if this thread should not continue running on this processor */ +static bool +sched_clutch_thread_avoid_processor(processor_t processor, thread_t thread) +{ + if (processor->processor_primary != processor) { + /* + * This is a secondary SMT processor. If the primary is running + * a realtime thread, only allow realtime threads on the secondary. + */ + if ((processor->processor_primary->current_pri >= BASEPRI_RTQUEUES) && ((thread->sched_pri < BASEPRI_RTQUEUES) || !sched_allow_rt_smt)) { + return true; + } + } + + return false; +} + +/* + * For the clutch scheduler, the run counts are maintained in the clutch + * buckets (i.e thread group scheduling structure). + */ +static uint32_t +sched_clutch_run_incr(thread_t thread) +{ + assert((thread->state & (TH_RUN | TH_IDLE)) == TH_RUN); + uint32_t new_count = os_atomic_inc(&sched_run_buckets[TH_BUCKET_RUN], relaxed); + sched_clutch_thread_run_bucket_incr(thread, thread->th_sched_bucket); + return new_count; +} + +static uint32_t +sched_clutch_run_decr(thread_t thread) +{ + assert((thread->state & (TH_RUN | TH_IDLE)) != TH_RUN); + uint32_t new_count = os_atomic_dec(&sched_run_buckets[TH_BUCKET_RUN], relaxed); + sched_clutch_thread_run_bucket_decr(thread, thread->th_sched_bucket); + return new_count; +} + +static sched_bucket_t +sched_convert_pri_to_bucket(uint8_t priority) +{ + sched_bucket_t bucket = TH_BUCKET_RUN; + + if (priority > BASEPRI_USER_INITIATED) { + bucket = TH_BUCKET_SHARE_FG; + } else if (priority > BASEPRI_DEFAULT) { + bucket = TH_BUCKET_SHARE_IN; + } else if (priority > BASEPRI_UTILITY) { + bucket = TH_BUCKET_SHARE_DF; + } else if (priority > MAXPRI_THROTTLE) { + bucket = TH_BUCKET_SHARE_UT; + } else { + bucket = TH_BUCKET_SHARE_BG; + } + return bucket; +} + +/* + * For threads that have changed sched_pri without changing the + * base_pri for any reason other than decay, use the sched_pri + * as the bucketizing priority instead of base_pri. All such + * changes are typically due to kernel locking primitives boosts + * or demotions. + */ +static boolean_t +sched_thread_sched_pri_promoted(thread_t thread) +{ + return (thread->sched_flags & TH_SFLAG_PROMOTED) || + (thread->sched_flags & TH_SFLAG_PROMOTE_REASON_MASK) || + (thread->sched_flags & TH_SFLAG_DEMOTED_MASK) || + (thread->sched_flags & TH_SFLAG_DEPRESSED_MASK) || + (thread->kern_promotion_schedpri != 0); +} + +/* + * Routine to update the scheduling bucket for the thread. + * + * In the clutch scheduler implementation, the thread's bucket + * is based on sched_pri if it was promoted due to a kernel + * primitive; otherwise its based on the thread base_pri. This + * enhancement allows promoted threads to reach a higher priority + * bucket and potentially get selected sooner for scheduling. + * + * Also, the clutch scheduler does not honor fixed priority below + * FG priority. It simply puts those threads in the corresponding + * timeshare bucket. The reason for to do that is because it is + * extremely hard to define the scheduling properties of such threads + * and they typically lead to performance issues. + */ + +void +sched_clutch_update_thread_bucket(thread_t thread) +{ + sched_bucket_t old_bucket = thread->th_sched_bucket; + sched_bucket_t new_bucket = TH_BUCKET_RUN; + assert(thread->runq == PROCESSOR_NULL); + + int pri = (sched_thread_sched_pri_promoted(thread)) ? thread->sched_pri : thread->base_pri; + + switch (thread->sched_mode) { + case TH_MODE_FIXED: + if (pri >= BASEPRI_FOREGROUND) { + new_bucket = TH_BUCKET_FIXPRI; + } else { + new_bucket = sched_convert_pri_to_bucket(pri); + } + break; + + case TH_MODE_REALTIME: + new_bucket = TH_BUCKET_FIXPRI; + break; + + case TH_MODE_TIMESHARE: + new_bucket = sched_convert_pri_to_bucket(pri); + break; + + default: + panic("unexpected mode: %d", thread->sched_mode); + break; + } + + if (old_bucket == new_bucket) { + return; + } + + thread->th_sched_bucket = new_bucket; + thread->pri_shift = sched_clutch_thread_pri_shift(thread, new_bucket); + + /* + * Since this is called after the thread has been removed from the runq, + * only the run counts need to be updated. The re-insert into the runq + * would put the thread into the correct new bucket's runq. + */ + if ((thread->state & (TH_RUN | TH_IDLE)) == TH_RUN) { + sched_clutch_thread_run_bucket_decr(thread, old_bucket); + sched_clutch_thread_run_bucket_incr(thread, new_bucket); + } +} + + +#endif /* CONFIG_SCHED_CLUTCH */ diff --git a/osfmk/kern/sched_clutch.h b/osfmk/kern/sched_clutch.h new file mode 100644 index 000000000..4cfad12f5 --- /dev/null +++ b/osfmk/kern/sched_clutch.h @@ -0,0 +1,291 @@ +/* + * Copyright (c) 2018 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#ifndef _KERN_SCHED_CLUTCH_H_ +#define _KERN_SCHED_CLUTCH_H_ + +#include +#include +#include +#include +#include + +#if CONFIG_SCHED_CLUTCH + +/* + * Clutch ordering based on thread group flags (specified + * by the thread grouping mechanism). These properties + * define a thread group specific priority boost. + * + * The current implementation gives a slight boost to + * HIGH & MED thread groups which effectively deprioritizes + * daemon thread groups which are marked "Efficient" on AMP + * systems. + */ +#define SCHED_CLUTCH_TG_PRI_LOW 0x0 +#define SCHED_CLUTCH_TG_PRI_MED 0x1 +#define SCHED_CLUTCH_TG_PRI_HIGH 0x2 + +/* + * For the current implementation, bound threads are not managed + * in the clutch hierarchy. This helper macro is used to indicate + * if the thread should be in the hierarchy. + */ +#define SCHED_CLUTCH_THREAD_ELIGIBLE(thread) ((thread->bound_processor) == PROCESSOR_NULL) + +/* + * + * Clutch hierarchy locking protocol + * + * The scheduler clutch hierarchy is protected by a combination of + * atomics and pset lock. + * - All fields protected by the pset lock are annotated with (P) + * - All fields updated using atomics are annotated with (A) + * - All fields that are unprotected and are not updated after + * initialization are annotated with (I) + */ + +/* + * struct sched_clutch_root_bucket + * + * A clutch_root_bucket represents all threads across all thread groups + * that are in the same scheduler bucket (FG/IN/...). The clutch_root_bucket + * is selected for execution by the root level bucket selection algorithm + * which bases the decision on the clutch_root_bucket's deadline (EDF). The + * deadline for a root bucket is calculated based on its runnable timestamp + * and the worst-case-execution-latency values specied in sched_clutch_root_bucket_wcel[] + */ +struct sched_clutch_root_bucket { + /* (I) sched bucket represented by this root bucket */ + uint8_t scrb_bucket; + /* (P) priority queue for all clutch buckets in this sched bucket */ + struct priority_queue scrb_clutch_buckets; + /* (P) priority queue entry to use for enqueueing root bucket into root prioq */ + struct priority_queue_entry scrb_pqlink; + /* (P) ageout deadline for this root bucket */ + uint64_t scrb_deadline; + /* (P) warped deadline for root bucket */ + uint64_t scrb_warped_deadline; + /* (P) warp remaining for root bucket */ + uint64_t scrb_warp_remaining; +}; +typedef struct sched_clutch_root_bucket *sched_clutch_root_bucket_t; + +/* + * struct sched_clutch_root + * + * A clutch_root represents the root of the hierarchy. It maintains a + * priority queue of all runnable root buckets. The clutch_root also + * maintains the information about the last clutch_root_bucket scheduled + * in order to implement bucket level quantum. The bucket level quantums + * allow low priority buckets to get a "fair" chance of using the CPU even + * if they contain a bunch of short executing threads. The bucket quantums + * are configured using sched_clutch_root_bucket_quantum[] + */ +struct sched_clutch_root { + /* (P) root level priority; represents the highest runnable thread in the hierarchy */ + int16_t scr_priority; + /* (P) total number of runnable threads in the hierarchy */ + uint16_t scr_thr_count; + /* (P) root level urgency; represents the urgency of the whole hierarchy for pre-emption purposes */ + int16_t scr_urgency; + + /* (I) processor set this hierarchy belongs to */ + processor_set_t scr_pset; + /* + * (P) list of all runnable clutch buckets across the system; + * allows easy iteration in the sched tick based timesharing code + */ + queue_head_t scr_clutch_buckets; + /* + * (P) list of all runnable foreign buckets in this hierarchy; + * used for tracking thread groups which need to be migrated when + * psets are available + */ + queue_head_t scr_foreign_buckets; + + /* Root level bucket management */ + + /* (P) bitmap of all runnable clutch_root_buckets; used for root pri calculation */ + bitmap_t scr_runnable_bitmap[BITMAP_LEN(TH_BUCKET_SCHED_MAX)]; + /* (P) bitmap of all runnable root buckets which have warps remaining */ + bitmap_t scr_warp_available[BITMAP_LEN(TH_BUCKET_SCHED_MAX)]; + /* (P) priority queue of all runnable clutch_root_buckets */ + struct priority_queue scr_root_buckets; + /* (P) storage for all possible clutch_root_buckets */ + struct sched_clutch_root_bucket scr_buckets[TH_BUCKET_SCHED_MAX]; +}; +typedef struct sched_clutch_root *sched_clutch_root_t; + +/* forward declaration for sched_clutch */ +struct sched_clutch; + +/* + * sched_clutch_bucket_cpu_data_t + * + * Used for maintaining clutch bucket used and blocked time. The + * values are used for calculating the interactivity score for the + * clutch bucket. + * + * Since the CPU used/blocked calculation uses wide atomics, the data + * types used are different based on the platform. + */ + +#if __arm64__ + +#define CLUTCH_CPU_DATA_MAX (UINT64_MAX) +typedef uint64_t clutch_cpu_data_t; +typedef unsigned __int128 clutch_cpu_data_wide_t; + +#else /* __arm64__ */ + +#define CLUTCH_CPU_DATA_MAX (UINT32_MAX) +typedef uint32_t clutch_cpu_data_t; +typedef uint64_t clutch_cpu_data_wide_t; + +#endif /* __arm64__ */ + +typedef union sched_clutch_bucket_cpu_data { + struct { + /* Clutch bucket CPU used across all threads */ + clutch_cpu_data_t scbcd_cpu_used; + /* Clutch bucket voluntary blocked time */ + clutch_cpu_data_t scbcd_cpu_blocked; + } cpu_data; + clutch_cpu_data_wide_t scbcd_cpu_data_packed; +} sched_clutch_bucket_cpu_data_t; + +/* + * struct sched_clutch_bucket + * + * A sched_clutch_bucket represents the set of threads for a thread + * group at a particular scheduling bucket. It maintains information + * about the CPU usage & blocking behavior of all threads part of + * the clutch_bucket and maintains the timesharing attributes for + * threads in its runq. It uses the decay based algorithm to timeshare + * among threads in the runq. + */ +struct sched_clutch_bucket { + /* (I) bucket for the clutch_bucket */ + uint8_t scb_bucket; + /* (P) priority of the clutch bucket */ + uint8_t scb_priority; + /* (P) interactivity score of the clutch bucket */ + uint8_t scb_interactivity_score; + /* (P) flag to indicate if the bucket is a foreign bucket */ + bool scb_foreign; + + /* Properties used for timesharing threads in this clutch_bucket */ + + /* (P) number of threads in this clutch_bucket; should match runq.count */ + uint16_t scb_thr_count; + /* (A) run count (running + runnable) for this clutch_bucket */ + uint16_t _Atomic scb_run_count; + /* (A) sched tick when the clutch bucket load/shifts were updated */ + uint32_t _Atomic scb_timeshare_tick; + /* (A) priority shifts for threads in the clutch_bucket */ + uint32_t _Atomic scb_pri_shift; + /* (P) linkage for all clutch_buckets in a root bucket; used for tick operations */ + queue_chain_t scb_listlink; + + + /* (P) timestamp for the last time the interactivity score was updated */ + uint64_t scb_interactivity_ts; + /* (P) timestamp for the last time the clutch_bucket blocked */ + uint64_t scb_blocked_ts; + + /* (A) CPU usage information for the clutch bucket */ + sched_clutch_bucket_cpu_data_t scb_cpu_data; + + /* (P) linkage for clutch_bucket in root_bucket priority queue */ + struct priority_queue_entry scb_pqlink; + /* (I) clutch to which this clutch bucket belongs */ + struct sched_clutch *scb_clutch; + /* (A) pointer to the root of the hierarchy this bucket is in */ + struct sched_clutch_root *scb_root; + /* (P) priority queue of threads based on their promoted/base priority */ + struct priority_queue scb_clutchpri_prioq; + /* (P) runq of threads in clutch_bucket */ + struct run_queue scb_runq; +}; +typedef struct sched_clutch_bucket *sched_clutch_bucket_t; + + +/* + * struct sched_clutch + * + * A sched_clutch is a 1:1 mapping to a thread group. It maintains the + * storage for all clutch buckets for this thread group and some properties + * of the thread group (such as flags etc.) + */ +struct sched_clutch { + /* + * (A) number of runnable threads in sched_clutch; needs to be atomic + * to support cross cluster sched_clutch migrations. + */ + uint16_t _Atomic sc_thr_count; + /* + * Grouping specific parameters. Currently the implementation only + * supports thread_group based grouping. + */ + union { + /* (A) priority specified by the thread grouping mechanism */ + uint8_t _Atomic sc_tg_priority; + }; + union { + /* (I) Pointer to thread group */ + struct thread_group *sc_tg; + }; + /* (I) storage for all clutch_buckets for this clutch */ + struct sched_clutch_bucket sc_clutch_buckets[TH_BUCKET_SCHED_MAX]; +}; +typedef struct sched_clutch *sched_clutch_t; + + +/* Clutch lifecycle management */ +void sched_clutch_init_with_thread_group(sched_clutch_t, struct thread_group *); +void sched_clutch_destroy(sched_clutch_t); + +/* Clutch thread membership management */ +void sched_clutch_thread_clutch_update(thread_t, sched_clutch_t, sched_clutch_t); + +/* Clutch timesharing stats management */ +uint32_t sched_clutch_thread_run_bucket_incr(thread_t, sched_bucket_t); +uint32_t sched_clutch_thread_run_bucket_decr(thread_t, sched_bucket_t); +void sched_clutch_cpu_usage_update(thread_t, uint64_t); +uint32_t sched_clutch_thread_pri_shift(thread_t, sched_bucket_t); + +/* Clutch properties accessors */ +uint32_t sched_clutch_root_count(sched_clutch_root_t); + +/* Grouping specific external routines */ +extern sched_clutch_t sched_clutch_for_thread(thread_t); + +#endif /* CONFIG_SCHED_CLUTCH */ + +#endif /* _KERN_SCHED_CLUTCH_H_ */ diff --git a/osfmk/kern/sched_clutch.md b/osfmk/kern/sched_clutch.md new file mode 100644 index 000000000..64da1a58f --- /dev/null +++ b/osfmk/kern/sched_clutch.md @@ -0,0 +1,151 @@ +# Clutch Scheduler + +## Background + +The XNU kernel runs on a variety of platforms with strong requirements for being dynamic and efficient. It needs to deliver on a wide range of requirements; from quick access to CPU for latency sensitive workloads (eg. UI interactions, multimedia recording/playback) to starvation avoidance for lower priority batch workloads (eg. photos sync, source compilation). The traditional Mach scheduler attempts to achieve these goals by expecting all threads in the system to be tagged with a priority number and treating high priority threads as interactive threads and low priority threads as batch threads. It then uses a timesharing model based on priority decay to penalize threads as they use CPU to achieve fairshare and starvation avoidance. This approach however loses the relationship between threads and higher level user workloads, making it impossible for the scheduler to reason about the workload as a whole which is what the end user cares about. One artifact of this thread based timesharing approach is that threads at the same priority level are treated similarly irrespective of which user workload they are servicing, which often leads to non-optimal decisions. It ultimately leads to priority inflation across the platform with individual subsystems raising their priority to avoid starvation and timesharing with other unrelated threads. The traditional thread level scheduling model also suffers from the following issues: + +* **Inaccurate accounting**: CPU accounting at the thread level incentivizes creating more threads on the system. Also in the world of GCD and workqueues where threads are created and destroyed rapidly, thread level accounting is inaccurate and allows excessive CPU usage. +* **Poor isolation**: In the Mach scheduler, timesharing is achieved by decaying the priority of threads depending on global system load. This property could lead to a burst of activity at the same or lower priority band causing decay for the App/UI thread leading to poor performance and responsiveness. The scheduler offers very limited isolation between threads working on latency sensitive UI workloads and threads performing bulk non-latency sensitive operations. + +## Clutch Scheduler Design + +In order to reason about higher level user workloads, the clutch scheduler schedules groups of threads instead of individual threads. Breaking away from the traditional single-tier scheduling model, it implements a hierarchical scheduler which makes optimal decisions at various thread grouping levels. The hierarchical scheduler, as its implemented today, has 3 levels: + +* Scheduling Bucket Level +* Thread Group Level +* Thread Level + +### Scheduling Bucket Level + +The highest level is the scheduling bucket level which decides which class of threads should be picked for execution. The kernel maintains a notion of scheduling bucket per thread which are defined based on the base/scheduling priority of the threads. These scheduling buckets roughly map to the QoS classes used by the OS runtime to define performance expectations for various pieces of work. All runnable threads with the same scheduling bucket are represented by a single entry at this level. These entries are known as *root buckets* throughout the implementation. The goal of this level is to provide low latency access to the CPU for high QoS classes while ensuring starvation avoidance for the low QoS classes. + +**Implementation** + +The scheduling bucket level uses an Earliest Deadline First (EDF) algorithm to decide which root bucket should be selected next for execution. Each root bucket with runnable threads is represented as an entry in a priority queue which is ordered by the bucket's deadline. The bucket selection algorithm simply selects the root bucket with the earliest deadline in the priority queue. The deadline for a root bucket is calculated based on its first-runnable timestamp and its **Worst Case Execution Latency (WCEL)** value which is pre-defined for each bucket. The WCEL values are picked based on the decay curve followed by the Mach timesharing algorithm to allow the system to function similar to the existing scheduler from a higher level perspective. + +``` +static uint32_t sched_clutch_root_bucket_wcel_us[TH_BUCKET_SCHED_MAX] = { + SCHED_CLUTCH_INVALID_TIME_32, /* FIXPRI */ + 0, /* FG */ + 37500, /* IN (37.5ms) */ + 75000, /* DF (75ms) */ + 150000, /* UT (150ms) */ + 250000 /* BG (250ms) */ +}; +``` + +Whenever a root bucket transitions from non-runnable to runnable, its deadline is set to (now + WCEL[bucket]). This ensures that the bucket would be scheduled at WCEL[bucket] even in a heavily loaded system. Once the root bucket is picked for execution, its deadline is pushed by WCEL[bucket] into the future. This basic implementation of EDF suffers from one major issue. In a heavily loaded system, it is possible that the higher buckets have used up enough CPU in the recent past such that its behind the lower buckets in deadline order. Now, if a small burst of user-critical workload shows up, the high bucket has to wait for the lower buckets to run before it can get CPU which might lead to performance issues. In order to address that, the bucket level scheduler implements a root bucket warp mechanism. Each bucket is provided a warp value which is refreshed whenever the bucket is selected due to its deadline expiring. + +``` +static uint32_t sched_clutch_root_bucket_warp_us[TH_BUCKET_SCHED_MAX] = { + SCHED_CLUTCH_INVALID_TIME_32, /* FIXPRI */ + 8000, /* FG (8ms)*/ + 4000, /* IN (4ms) */ + 2000, /* DF (2ms) */ + 1000, /* UT (1ms) */ + 0 /* BG (0ms) */ +}; +``` +The root bucket selection logic finds the earliest deadline bucket and then checks if there are any higher (in natural priority order) buckets that have warp remaining. If there is such a higher bucket, it would select that bucket and effectively open a warp window. During this warp window the scheduler would continue to select this warping bucket over lower priority buckets. Once the warping bucket is drained or the warp window expires, the scheduler goes back to scheduling buckets in deadline order. This mechanism provides a bounded advantage to higher level buckets to allow them to remain responsive in the presence of bursty workloads. + +The FIXPRI bucket is special cased since it contains extremely latency sensitive threads. Since the priority range for AboveUI and FG Timeshare buckets overlap, it is important to maintain some native priority order between those buckets. The policy implemented here is to compare the highest clutch buckets of both buckets; if the Above UI bucket is higher, schedule it immediately. Otherwise fall through to the deadline based scheduling as described above. The implementation allows extremely low latency CPU access for Above UI threads while supporting the use case of high priority timeshare threads contending with lower priority fixed priority threads which is observed in some media workloads. Since the timeshare bucket will eventually drop in priority as it consumes CPU, this model provides the desired behavior for timeshare threads above UI. + +The scheduling bucket level also maintains a bitmap of runnable root buckets to allow quick checks for empty hierarchy and root level priority calculation. + +The EDF algorithm is the best choice for this level due to the following reasons: + +* Deadline based scheduling allows the scheduler to define strict bounds on worst case execution latencies for all scheduling buckets. +* The EDF algorithm is dynamic based on bucket runnability and selection. Since all deadline updates are computationally cheap, the algorithm can maintain up-to-date information without measurable overhead. +* It achieves the goals of maintaining low scheduling latency for high buckets and starvation avoidance for low buckets efficiently. +* Since the bucket level scheduler deals with a fixed small number of runnable buckets in the worst case, it is easy to configure in terms of defining deadlines, warps etc. + +### Thread Group Level + +The second level is the “thread group” level which decides which thread group within a bucket should be selected next for execution. Thread groups are a mechanism introduced with the AMP scheduler which represent a collection of threads working on behalf of a specific workload. Each thread group with runnable threads within a bucket is represented as an entry at this level. These entries are known as *clutch buckets* throughout the implementation. The goal of this level is to share the CPU among various user workloads with preference to interactive applications over compute-intensive batch workloads. + +**Implementation** + +The thread group level implements a variation of the FreeBSD ULE scheduler to decide which clutch bucket should be selected next for execution. Each clutch bucket with runnable threads is represented as an entry in a priority queue which is ordered by clutch bucket priorities. The clutch bucket selection algorithm simply selects the clutch bucket with the highest priority in the priority queue. The priority calculation for the clutch buckets is based on the following factors: + +* **Highest runnable thread in the clutch bucket**: The clutch bucket maintains a priority queue which contains threads ordered by their promoted or base priority (whichever property made the thread eligible to be part of that clutch bucket). It uses the highest of these threads to calculate the base priority of the clutch bucket. The use of both base and sched priority allows the scheduler to honor priority differences specified from userspace via SPIs, priority boosts due to priority inheritance mechanisms like turnstiles and other priority affecting mechanisms outside the core scheduler. +* **Interactivity score**: The scheduler calculates an interactivity score based on the ratio of voluntary blocking time and CPU usage time for the clutch bucket as a whole. This score allows the scheduler to prefer highly interactive thread groups over batch processing compute intensive thread groups. +* **Thread Group Type**: In order to improve battery life on AMP devices, the OS marks daemon thread groups as “Efficient”. These thread groups typically represent work that is not directly related to the user requested workload. The scheduler de-prioritizes these thread groups over others by factoring this into the priority calculation. + +The interactivity score based algorithm is well suited for this level due to the following reasons: + +* It allows for a fair sharing of CPU among thread groups based on their recent behavior. Since the algorithm only looks at recent CPU usage history, it also adapts to changing behavior quickly. +* Since the priority calculation is fairly cheap, the scheduler is able to maintain up-to-date information about all thread groups which leads to more optimal decisions. +* Thread groups provide a convenient abstraction for groups of threads working together for a user workload. Basing scheduling decisions on this abstraction allows the system to make interesting choices such as preferring Apps over daemons which is typically better for system responsiveness. + +### Thread Level + +At the lowest level the scheduler decides which thread within a clutch bucket should be selected next for execution. Each runnable thread in the clutch bucket is represented as an entry in a runqueue which is organized based on the schedpri of threads. The thread selection algorithm simply selects the highest priority thread in the runqueue. The schedpri calculation for the threads is based on the traditional Mach scheduling algorithm which uses load & CPU usage to decay priority for a thread. The thread decay model is more suited at this level as compared to the global scheduler because the load calculation only accounts for threads in the same clutch bucket. Since all threads in the same clutch bucket belong to the same thread group and scheduling bucket, this algorithm provides quick CPU access for latency sensitive threads within the clutch bucket without impacting other non-related threads in the system. + +**Implementation** + +The thread level scheduler implements the Mach timesharing algorithm to decide which thread within the clutch bucket should be selected next for execution. All runnable threads in a clutch bucket are inserted into the runqueue based on the schedpri. The scheduler calculates the schedpri of the threads in a clutch bucket based on the number of runnable threads in the clutch bucket and the CPU usage of individual threads. The load information is updated every scheduler tick and the threads use this information for priority decay calculation as they use CPU. The priority decay algorithm attempts to reward bursty interactive threads and penalize CPU intensive threads. Once a thread is selected for running, it is assigned a quantum which is based on the scheduling bucket it belongs to. The quanta for various buckets are defined statically as: + +``` +static uint32_t sched_clutch_thread_quantum_us[TH_BUCKET_SCHED_MAX] = { + 10000, /* FIXPRI (10ms) */ + 10000, /* FG (10ms) */ + 8000, /* IN (8ms) */ + 6000, /* DF (6ms) */ + 4000, /* UT (4ms) */ + 2000 /* BG (2ms) */ +}; +``` + +The per-bucket thread quantum allows the scheduler to bound the worst case execution latency for a low priority thread which has been starved by higher priority threads. + +##Scheduler Priority Calculations + +###Root Priority Calculation + +The scheduler maintains a root level priority for the hierarchy in order to make decisions regarding pre-emptions and thread selection. The root priority is updated as threads are inserted/removed from the hierarchy. The root level also maintains the urgency bits to help with pre-emption decisions. Since the root level priority/urgency is used for pre-emption decisions, it is based on the threads in the hierarchy and is calculated as follows: + +``` +Root Priority Calculation: +* If AboveUI bucket is runnable, +* Compare priority of AboveUI highest clutch bucket (CBUI) with Timeshare FG highest clutch bucket (CBFG) +* If pri(CBUI) >= pri(CBFG), select CBUI +* Otherwise find the (non-AboveUI) highest priority root bucket that is runnable and select its highest clutch bucket +* Find the highest priority (promoted or base pri) thread within that clutch bucket and assign that as root priority + +Root Urgency Calculation: +* On thread insertion into the hierarchy, increment the root level urgency based on thread's sched_pri +* On thread removal from the hierarchy, decrement the root level urgency based on thread's sched_pri + +``` + +###Root Bucket Priority Calculation + +The root bucket priority is simply the deadline of the root bucket which is calculated by adding the WCEL of the bucket to the timestamp of the root bucket becoming runnable. + +``` +root-bucket priority = now + WCEL[bucket] +``` + +###Clutch Bucket Priority Calculation + +As mentioned earlier, the priority value of a clutch bucket is calculated based on the highest runnable thread, interactivity score and the thread group type. The actual calculation algorithm is as follows: + +``` +* Find the highest runnable thread (promoted or basepri) in the clutch bucket (maxpri) +* Check if the thread group for this clutch bucket is marked Efficient. +* If not, assign a positive boost value (clutch_boost) +* Calculate the ratio of CPU blocked and CPU used for the clutch bucket. +* If blocked > used, assign a score (interactivity_score) in the higher range. +* Else, assign a score (interactivity_score) in the lower range. +* clutch-bucket priority = maxpri + clutch_boost + interactivity_score +``` + +###Thread Priority Calculation + +The thread priority calculation is based on the Mach timesharing algorithm. It is calculated in the following manner: + +``` +* Every scheduler tick, snapshot the load for the clutch bucket +* Use the load value to calculate the priority shift values for all threads in the clutch bucket +* thread priority = base priority - (thread CPU usage >> priority shift) +``` diff --git a/osfmk/kern/sched_dualq.c b/osfmk/kern/sched_dualq.c index e7f506c06..0f04cd427 100644 --- a/osfmk/kern/sched_dualq.c +++ b/osfmk/kern/sched_dualq.c @@ -56,7 +56,8 @@ static void sched_dualq_thread_update_scan(sched_update_scan_context_t scan_context); static boolean_t -sched_dualq_processor_enqueue(processor_t processor, thread_t thread, integer_t options); +sched_dualq_processor_enqueue(processor_t processor, thread_t thread, + sched_options_t options); static boolean_t sched_dualq_processor_queue_remove(processor_t processor, thread_t thread); @@ -126,7 +127,6 @@ const struct sched_dispatch_table sched_dualq_dispatch = { .processor_runq_stats_count_sum = sched_dualq_runq_stats_count_sum, .processor_bound_count = sched_dualq_processor_bound_count, .thread_update_scan = sched_dualq_thread_update_scan, - .direct_dispatch_to_idle_processors = FALSE, .multiple_psets_enabled = TRUE, .sched_groups_enabled = FALSE, .avoid_processor_enabled = TRUE, @@ -143,6 +143,10 @@ const struct sched_dispatch_table sched_dualq_dispatch = { .check_spill = sched_check_spill, .ipi_policy = sched_ipi_policy, .thread_should_yield = sched_thread_should_yield, + .run_count_incr = sched_run_incr, + .run_count_decr = sched_run_decr, + .update_thread_bucket = sched_update_thread_bucket, + .pset_made_schedulable = sched_pset_made_schedulable, }; __attribute__((always_inline)) @@ -238,7 +242,7 @@ sched_dualq_choose_thread( } if (processor->is_SMT) { - thread_t potential_thread = run_queue_dequeue(chosen_runq, SCHED_PEEK | SCHED_HEADQ); + thread_t potential_thread = run_queue_peek(chosen_runq); if (potential_thread == THREAD_NULL) { return THREAD_NULL; } @@ -280,7 +284,7 @@ static boolean_t sched_dualq_processor_enqueue( processor_t processor, thread_t thread, - integer_t options) + sched_options_t options) { run_queue_t rq = dualq_runq_for_thread(processor, thread); boolean_t result; diff --git a/osfmk/kern/sched_grrr.c b/osfmk/kern/sched_grrr.c index af61fd552..5f663aba7 100644 --- a/osfmk/kern/sched_grrr.c +++ b/osfmk/kern/sched_grrr.c @@ -138,7 +138,7 @@ static boolean_t sched_grrr_processor_enqueue( processor_t processor, thread_t thread, - integer_t options); + sched_options_t options); static void sched_grrr_processor_queue_shutdown( @@ -219,7 +219,6 @@ const struct sched_dispatch_table sched_grrr_dispatch = { .processor_runq_stats_count_sum = sched_grrr_processor_runq_stats_count_sum, .processor_bound_count = sched_grrr_processor_bound_count, .thread_update_scan = sched_grrr_thread_update_scan, - .direct_dispatch_to_idle_processors = TRUE, .multiple_psets_enabled = TRUE, .sched_groups_enabled = FALSE, .avoid_processor_enabled = FALSE, @@ -236,6 +235,10 @@ const struct sched_dispatch_table sched_grrr_dispatch = { .check_spill = sched_check_spill, .ipi_policy = sched_ipi_policy, .thread_should_yield = sched_thread_should_yield, + .run_count_incr = sched_run_incr, + .run_count_decr = sched_run_decr, + .update_thread_bucket = sched_update_thread_bucket, + .pset_made_schedulable = sched_pset_made_schedulable, }; extern int max_unsafe_quanta; @@ -348,7 +351,7 @@ static boolean_t sched_grrr_processor_enqueue( processor_t processor, thread_t thread, - integer_t options __unused) + sched_options_t options __unused) { grrr_run_queue_t rq = &processor->grrr_runq; boolean_t result; diff --git a/osfmk/kern/sched_multiq.c b/osfmk/kern/sched_multiq.c index 9b4084777..a96f7bd63 100644 --- a/osfmk/kern/sched_multiq.c +++ b/osfmk/kern/sched_multiq.c @@ -246,7 +246,8 @@ static void sched_multiq_thread_update_scan(sched_update_scan_context_t scan_context); static boolean_t -sched_multiq_processor_enqueue(processor_t processor, thread_t thread, integer_t options); +sched_multiq_processor_enqueue(processor_t processor, thread_t thread, + sched_options_t options); static boolean_t sched_multiq_processor_queue_remove(processor_t processor, thread_t thread); @@ -319,7 +320,6 @@ const struct sched_dispatch_table sched_multiq_dispatch = { .processor_runq_stats_count_sum = sched_multiq_runq_stats_count_sum, .processor_bound_count = sched_multiq_processor_bound_count, .thread_update_scan = sched_multiq_thread_update_scan, - .direct_dispatch_to_idle_processors = FALSE, .multiple_psets_enabled = FALSE, .sched_groups_enabled = TRUE, .avoid_processor_enabled = TRUE, @@ -336,6 +336,10 @@ const struct sched_dispatch_table sched_multiq_dispatch = { .check_spill = sched_check_spill, .ipi_policy = sched_ipi_policy, .thread_should_yield = sched_thread_should_yield, + .run_count_incr = sched_run_incr, + .run_count_decr = sched_run_decr, + .update_thread_bucket = sched_update_thread_bucket, + .pset_made_schedulable = sched_pset_made_schedulable, }; @@ -494,9 +498,9 @@ entry_queue_first_entry(entry_queue_t rq) { assert(rq->count != 0); - queue_t queue = &rq->queues[rq->highq]; + circle_queue_t queue = &rq->queues[rq->highq]; - sched_entry_t entry = qe_queue_first(queue, struct sched_entry, entry_links); + sched_entry_t entry = cqe_queue_first(queue, struct sched_entry, entry_links); assert(entry->sched_pri == rq->highq); @@ -527,9 +531,9 @@ group_first_thread(sched_group_t group) assert(rq->count != 0); - queue_t queue = &rq->queues[rq->highq]; + circle_queue_t queue = &rq->queues[rq->highq]; - thread_t thread = qe_queue_first(queue, struct thread, runq_links); + thread_t thread = cqe_queue_first(queue, struct thread, runq_links); assert(thread != THREAD_NULL); assert_thread_magic(thread); @@ -546,7 +550,7 @@ group_first_thread(sched_group_t group) static void entry_queue_check_entry(entry_queue_t runq, sched_entry_t entry, int expected_pri) { - queue_t q; + circle_queue_t q; sched_entry_t elem; assert(queue_chain_linked(&entry->entry_links)); @@ -554,7 +558,7 @@ entry_queue_check_entry(entry_queue_t runq, sched_entry_t entry, int expected_pr q = &runq->queues[expected_pri]; - qe_foreach_element(elem, q, entry_links) { + cqe_foreach_element(elem, q, entry_links) { if (elem == entry) { return; } @@ -567,7 +571,7 @@ entry_queue_check_entry(entry_queue_t runq, sched_entry_t entry, int expected_pr static void sched_group_check_thread(sched_group_t group, thread_t thread) { - queue_t q; + circle_queue_t q; thread_t elem; int pri = thread->sched_pri; @@ -575,7 +579,7 @@ sched_group_check_thread(sched_group_t group, thread_t thread) q = &group->runq.queues[pri]; - qe_foreach_element(elem, q, runq_links) { + cqe_foreach_element(elem, q, runq_links) { if (elem == thread) { return; } @@ -635,19 +639,19 @@ static sched_entry_t entry_queue_dequeue_entry(entry_queue_t rq) { sched_entry_t sched_entry; - queue_t queue = &rq->queues[rq->highq]; + circle_queue_t queue = &rq->queues[rq->highq]; assert(rq->count > 0); - assert(!queue_empty(queue)); + assert(!circle_queue_empty(queue)); - sched_entry = qe_dequeue_head(queue, struct sched_entry, entry_links); + sched_entry = cqe_dequeue_head(queue, struct sched_entry, entry_links); SCHED_STATS_RUNQ_CHANGE(&rq->runq_stats, rq->count); rq->count--; if (SCHED(priority_is_urgent)(rq->highq)) { rq->urgency--; assert(rq->urgency >= 0); } - if (queue_empty(queue)) { + if (circle_queue_empty(queue)) { rq_bitmap_clear(rq->bitmap, rq->highq); rq->highq = bitmap_first(rq->bitmap, NRQS); } @@ -667,13 +671,13 @@ entry_queue_enqueue_entry( integer_t options) { int sched_pri = entry->sched_pri; - queue_t queue = &rq->queues[sched_pri]; + circle_queue_t queue = &rq->queues[sched_pri]; boolean_t result = FALSE; assert(entry->runq == 0); - if (queue_empty(queue)) { - enqueue_tail(queue, &entry->entry_links); + if (circle_queue_empty(queue)) { + circle_enqueue_tail(queue, &entry->entry_links); rq_bitmap_set(rq->bitmap, sched_pri); if (sched_pri > rq->highq) { @@ -682,9 +686,9 @@ entry_queue_enqueue_entry( } } else { if (options & SCHED_TAILQ) { - enqueue_tail(queue, &entry->entry_links); + circle_enqueue_tail(queue, &entry->entry_links); } else { - enqueue_head(queue, &entry->entry_links); + circle_enqueue_head(queue, &entry->entry_links); } } if (SCHED(priority_is_urgent)(sched_pri)) { @@ -722,7 +726,7 @@ entry_queue_remove_entry( rq->urgency--; assert(rq->urgency >= 0); } - if (queue_empty(&rq->queues[sched_pri])) { + if (circle_queue_empty(&rq->queues[sched_pri])) { /* update run queue status */ rq_bitmap_clear(rq->bitmap, sched_pri); rq->highq = bitmap_first(rq->bitmap, NRQS); @@ -737,8 +741,8 @@ entry_queue_change_entry( sched_entry_t entry, integer_t options) { - int sched_pri = entry->sched_pri; - queue_t queue = &rq->queues[sched_pri]; + int sched_pri = entry->sched_pri; + circle_queue_t queue = &rq->queues[sched_pri]; #if defined(MULTIQ_SANITY_CHECK) if (multiq_sanity_check) { @@ -746,10 +750,11 @@ entry_queue_change_entry( } #endif + circle_dequeue(queue, &entry->entry_links); if (options & SCHED_TAILQ) { - re_queue_tail(queue, &entry->entry_links); + circle_enqueue_tail(queue, &entry->entry_links); } else { - re_queue_head(queue, &entry->entry_links); + circle_enqueue_head(queue, &entry->entry_links); } } /* @@ -764,14 +769,14 @@ group_run_queue_dequeue_thread( boolean_t *queue_empty) { thread_t thread; - queue_t queue = &rq->queues[rq->highq]; + circle_queue_t queue = &rq->queues[rq->highq]; assert(rq->count > 0); - assert(!queue_empty(queue)); + assert(!circle_queue_empty(queue)); *thread_pri = rq->highq; - thread = qe_dequeue_head(queue, struct thread, runq_links); + thread = cqe_dequeue_head(queue, struct thread, runq_links); assert_thread_magic(thread); SCHED_STATS_RUNQ_CHANGE(&rq->runq_stats, rq->count); @@ -779,7 +784,7 @@ group_run_queue_dequeue_thread( if (SCHED(priority_is_urgent)(rq->highq)) { rq->urgency--; assert(rq->urgency >= 0); } - if (queue_empty(queue)) { + if (circle_queue_empty(queue)) { rq_bitmap_clear(rq->bitmap, rq->highq); rq->highq = bitmap_first(rq->bitmap, NRQS); *queue_empty = TRUE; @@ -801,14 +806,14 @@ group_run_queue_enqueue_thread( integer_t thread_pri, integer_t options) { - queue_t queue = &rq->queues[thread_pri]; + circle_queue_t queue = &rq->queues[thread_pri]; boolean_t result = FALSE; assert(thread->runq == PROCESSOR_NULL); assert_thread_magic(thread); - if (queue_empty(queue)) { - enqueue_tail(queue, &thread->runq_links); + if (circle_queue_empty(queue)) { + circle_enqueue_tail(queue, &thread->runq_links); rq_bitmap_set(rq->bitmap, thread_pri); if (thread_pri > rq->highq) { @@ -817,9 +822,9 @@ group_run_queue_enqueue_thread( result = TRUE; } else { if (options & SCHED_TAILQ) { - enqueue_tail(queue, &thread->runq_links); + circle_enqueue_tail(queue, &thread->runq_links); } else { - enqueue_head(queue, &thread->runq_links); + circle_enqueue_head(queue, &thread->runq_links); } } if (SCHED(priority_is_urgent)(thread_pri)) { @@ -841,12 +846,13 @@ group_run_queue_remove_thread( thread_t thread, integer_t thread_pri) { + circle_queue_t queue = &rq->queues[thread_pri]; boolean_t result = FALSE; assert_thread_magic(thread); assert(thread->runq != PROCESSOR_NULL); - remqueue(&thread->runq_links); + circle_dequeue(queue, &thread->runq_links); SCHED_STATS_RUNQ_CHANGE(&rq->runq_stats, rq->count); rq->count--; @@ -854,7 +860,7 @@ group_run_queue_remove_thread( rq->urgency--; assert(rq->urgency >= 0); } - if (queue_empty(&rq->queues[thread_pri])) { + if (circle_queue_empty(queue)) { /* update run queue status */ rq_bitmap_clear(rq->bitmap, thread_pri); rq->highq = bitmap_first(rq->bitmap, NRQS); @@ -1148,7 +1154,7 @@ static boolean_t sched_multiq_processor_enqueue( processor_t processor, thread_t thread, - integer_t options) + sched_options_t options) { boolean_t result; @@ -1415,7 +1421,7 @@ group_scan(entry_queue_t runq, sched_update_scan_context_t scan_context) queue_index = bitmap_next(runq->bitmap, queue_index)) { sched_entry_t entry; - qe_foreach_element(entry, &runq->queues[queue_index], entry_links) { + cqe_foreach_element(entry, &runq->queues[queue_index], entry_links) { assert(count > 0); sched_group_t group = group_for_entry(entry); diff --git a/osfmk/kern/sched_prim.c b/osfmk/kern/sched_prim.c index e5a3d2e2e..c59175da0 100644 --- a/osfmk/kern/sched_prim.c +++ b/osfmk/kern/sched_prim.c @@ -75,7 +75,7 @@ #include #include #include -#include +#include #include #include @@ -235,12 +235,6 @@ static void preempt_pri_init(void); #endif /* CONFIG_SCHED_TIMESHARE_CORE */ -#if CONFIG_SCHED_IDLE_IN_PLACE -static thread_t thread_select_idle( - thread_t thread, - processor_t processor); -#endif - thread_t processor_idle( thread_t thread, processor_t processor); @@ -280,11 +274,9 @@ sched_vm_group_maintenance(void); #if defined(CONFIG_SCHED_TIMESHARE_CORE) int8_t sched_load_shifts[NRQS]; -bitmap_t sched_preempt_pri[BITMAP_LEN(NRQS)]; +bitmap_t sched_preempt_pri[BITMAP_LEN(NRQS_MAX)]; #endif /* CONFIG_SCHED_TIMESHARE_CORE */ -const struct sched_dispatch_table *sched_current_dispatch = NULL; - /* * Statically allocate a buffer to hold the longest possible * scheduler description string, as currently implemented. @@ -309,67 +301,10 @@ uint32_t sched_debug_flags = SCHED_DEBUG_FLAG_CHOOSE_PROCESSOR_TRACEPOINTS; /* Global flag which indicates whether Background Stepper Context is enabled */ static int cpu_throttle_enabled = 1; -#if DEBUG - -/* Since using the indirect function dispatch table has a negative impact on - * context switch performance, only allow DEBUG kernels to use that mechanism. - */ -static void -sched_init_override(void) -{ - char sched_arg[SCHED_STRING_MAX_LENGTH] = { '\0' }; - - /* Check for runtime selection of the scheduler algorithm */ - if (!PE_parse_boot_argn("sched", sched_arg, sizeof(sched_arg))) { - sched_arg[0] = '\0'; - } - if (strlen(sched_arg) > 0) { - if (0) { - /* Allow pattern below */ -#if defined(CONFIG_SCHED_TRADITIONAL) - } else if (0 == strcmp(sched_arg, sched_traditional_dispatch.sched_name)) { - sched_current_dispatch = &sched_traditional_dispatch; - } else if (0 == strcmp(sched_arg, sched_traditional_with_pset_runqueue_dispatch.sched_name)) { - sched_current_dispatch = &sched_traditional_with_pset_runqueue_dispatch; -#endif -#if defined(CONFIG_SCHED_MULTIQ) - } else if (0 == strcmp(sched_arg, sched_multiq_dispatch.sched_name)) { - sched_current_dispatch = &sched_multiq_dispatch; - } else if (0 == strcmp(sched_arg, sched_dualq_dispatch.sched_name)) { - sched_current_dispatch = &sched_dualq_dispatch; -#endif - } else { -#if defined(CONFIG_SCHED_TRADITIONAL) - printf("Unrecognized scheduler algorithm: %s\n", sched_arg); - printf("Scheduler: Using instead: %s\n", sched_traditional_with_pset_runqueue_dispatch.sched_name); - sched_current_dispatch = &sched_traditional_with_pset_runqueue_dispatch; -#else - panic("Unrecognized scheduler algorithm: %s", sched_arg); -#endif - } - kprintf("Scheduler: Runtime selection of %s\n", SCHED(sched_name)); - } else { -#if defined(CONFIG_SCHED_MULTIQ) - sched_current_dispatch = &sched_dualq_dispatch; -#elif defined(CONFIG_SCHED_TRADITIONAL) - sched_current_dispatch = &sched_traditional_with_pset_runqueue_dispatch; -#else -#error No default scheduler implementation -#endif - kprintf("Scheduler: Default of %s\n", SCHED(sched_name)); - } -} - -#endif /* DEBUG */ - void sched_init(void) { -#if DEBUG - sched_init_override(); -#else /* DEBUG */ kprintf("Scheduler: Default of %s\n", SCHED(sched_name)); -#endif /* DEBUG */ if (!PE_parse_boot_argn("sched_pri_decay_limit", &sched_pri_decay_band_limit, sizeof(sched_pri_decay_band_limit))) { /* No boot-args, check in device tree */ @@ -505,7 +440,7 @@ pset_rt_init(processor_set_t pset) { rt_lock_init(pset); - pset->rt_runq.count = 0; + os_atomic_init(&pset->rt_runq.count, 0); queue_init(&pset->rt_runq.queue); memset(&pset->rt_runq.runq_stats, 0, sizeof pset->rt_runq.runq_stats); } @@ -734,28 +669,18 @@ thread_unblock( } /* Update the runnable thread count */ - new_run_count = sched_run_incr(thread); + new_run_count = SCHED(run_count_incr)(thread); } else { /* * Either the thread is idling in place on another processor, * or it hasn't finished context switching yet. */ -#if CONFIG_SCHED_IDLE_IN_PLACE - if (thread->state & TH_IDLE) { - processor_t processor = thread->last_processor; - - if (processor != current_processor()) { - machine_signal_idle(processor); - } - } -#else assert((thread->state & TH_IDLE) == 0); -#endif /* * The run count is only dropped after the context switch completes * and the thread is still waiting, so we should not run_incr here */ - new_run_count = sched_run_buckets[TH_BUCKET_RUN]; + new_run_count = os_atomic_load(&sched_run_buckets[TH_BUCKET_RUN], relaxed); } @@ -875,7 +800,7 @@ thread_go( if (thread_unblock(thread, wresult)) { #if SCHED_TRACE_THREAD_WAKEUPS backtrace(&thread->thread_wakeup_bt[0], - (sizeof(thread->thread_wakeup_bt) / sizeof(uintptr_t))); + (sizeof(thread->thread_wakeup_bt) / sizeof(uintptr_t)), NULL); #endif thread_setrun(thread, SCHED_PREEMPT | SCHED_TAILQ); } @@ -901,6 +826,10 @@ thread_mark_wait_locked( boolean_t at_safe_point; wait_interrupt_t interruptible = interruptible_orig; + if (thread->state & TH_IDLE) { + panic("Invalid attempt to wait while running the idle thread"); + } + assert(!(thread->state & (TH_WAIT | TH_IDLE | TH_UNINT | TH_TERMINATE2 | TH_WAIT_REPORT))); /* @@ -2237,139 +2166,12 @@ idle: pset_unlock(pset); #endif -#if CONFIG_SCHED_IDLE_IN_PLACE - /* - * Choose idle thread if fast idle is not possible. - */ - if (processor->processor_primary != processor) { - return processor->idle_thread; - } - - if ((thread->state & (TH_IDLE | TH_TERMINATE | TH_SUSP)) || !(thread->state & TH_WAIT) || thread->wake_active || thread->sched_pri >= BASEPRI_RTQUEUES) { - return processor->idle_thread; - } - - /* - * Perform idling activities directly without a - * context switch. Return dispatched thread, - * else check again for a runnable thread. - */ - new_thread = thread_select_idle(thread, processor); - -#else /* !CONFIG_SCHED_IDLE_IN_PLACE */ - - /* - * Do a full context switch to idle so that the current - * thread can start running on another processor without - * waiting for the fast-idled processor to wake up. - */ new_thread = processor->idle_thread; - -#endif /* !CONFIG_SCHED_IDLE_IN_PLACE */ } while (new_thread == THREAD_NULL); return new_thread; } -#if CONFIG_SCHED_IDLE_IN_PLACE -/* - * thread_select_idle: - * - * Idle the processor using the current thread context. - * - * Called with thread locked, then dropped and relocked. - */ -static thread_t -thread_select_idle( - thread_t thread, - processor_t processor) -{ - thread_t new_thread; - uint64_t arg1, arg2; - int urgency; - - sched_run_decr(thread); - - thread->state |= TH_IDLE; - processor_state_update_idle(procssor); - - /* Reload precise timing global policy to thread-local policy */ - thread->precise_user_kernel_time = use_precise_user_kernel_time(thread); - - thread_unlock(thread); - - /* - * Switch execution timing to processor idle thread. - */ - processor->last_dispatch = mach_absolute_time(); - -#ifdef CONFIG_MACH_APPROXIMATE_TIME - commpage_update_mach_approximate_time(processor->last_dispatch); -#endif - - thread->last_run_time = processor->last_dispatch; - processor_timer_switch_thread(processor->last_dispatch, - &processor->idle_thread->system_timer); - PROCESSOR_DATA(processor, kernel_timer) = &processor->idle_thread->system_timer; - - - /* - * Cancel the quantum timer while idling. - */ - timer_call_quantum_timer_cancel(&processor->quantum_timer); - processor->first_timeslice = FALSE; - - if (thread->sched_call) { - (*thread->sched_call)(SCHED_CALL_BLOCK, thread); - } - - thread_tell_urgency(THREAD_URGENCY_NONE, 0, 0, 0, NULL); - - /* - * Enable interrupts and perform idling activities. No - * preemption due to TH_IDLE being set. - */ - spllo(); new_thread = processor_idle(thread, processor); - - /* - * Return at splsched. - */ - if (thread->sched_call) { - (*thread->sched_call)(SCHED_CALL_UNBLOCK, thread); - } - - thread_lock(thread); - - /* - * If awakened, switch to thread timer and start a new quantum. - * Otherwise skip; we will context switch to another thread or return here. - */ - if (!(thread->state & TH_WAIT)) { - uint64_t time_now = processor->last_dispatch = mach_absolute_time(); - processor_timer_switch_thread(time_now, &thread->system_timer); - timer_update(&thread->runnable_timer, time_now); - PROCESSOR_DATA(processor, kernel_timer) = &thread->system_timer; - thread_quantum_init(thread); - processor->quantum_end = time_now + thread->quantum_remaining; - timer_call_quantum_timer_enter(&processor->quantum_timer, - thread, processor->quantum_end, time_now); - processor->first_timeslice = TRUE; - - thread->computation_epoch = time_now; - } - - thread->state &= ~TH_IDLE; - - urgency = thread_get_urgency(thread, &arg1, &arg2); - - thread_tell_urgency(urgency, arg1, arg2, 0, new_thread); - - sched_run_incr(thread); - - return new_thread; -} -#endif /* CONFIG_SCHED_IDLE_IN_PLACE */ - /* * thread_invoke * @@ -2468,6 +2270,7 @@ thread_invoke( /* * Context switch by performing a stack handoff. + * Requires both threads to be parked in a continuation. */ continuation = thread->continuation; parameter = thread->parameter; @@ -2521,6 +2324,12 @@ thread_invoke( kperf_off_cpu(self); #endif /* KPERF */ + /* + * This is where we actually switch thread identity, + * and address space if required. However, register + * state is not switched - this routine leaves the + * stack and register state active on the current CPU. + */ TLOG(1, "thread_invoke: calling stack_handoff\n"); stack_handoff(self, thread); @@ -2545,8 +2354,16 @@ thread_invoke( counter(c_thread_invoke_hits++); + boolean_t enable_interrupts = TRUE; + + /* idle thread needs to stay interrupts-disabled */ + if ((thread->state & TH_IDLE)) { + enable_interrupts = FALSE; + } + assert(continuation); - call_continuation(continuation, parameter, thread->wait_result, TRUE); + call_continuation(continuation, parameter, + thread->wait_result, enable_interrupts); /*NOTREACHED*/ } else if (thread == self) { /* same thread but with continuation */ @@ -2573,7 +2390,15 @@ thread_invoke( self->continuation = self->parameter = NULL; - call_continuation(continuation, parameter, self->wait_result, TRUE); + boolean_t enable_interrupts = TRUE; + + /* idle thread needs to stay interrupts-disabled */ + if ((self->state & TH_IDLE)) { + enable_interrupts = FALSE; + } + + call_continuation(continuation, parameter, + self->wait_result, enable_interrupts); /*NOTREACHED*/ } } else { @@ -2669,30 +2494,33 @@ need_stack: * been stored on the stack or a non-volatile register, but a stale idea of * what was on the CPU is newly-accurate because that thread is again * running on the CPU. + * + * If one of the threads is using a continuation, thread_continue + * is used to stitch up its context. + * + * If we are invoking a thread which is resuming from a continuation, + * the CPU will invoke thread_continue next. + * + * If the current thread is parking in a continuation, then its state + * won't be saved and the stack will be discarded. When the stack is + * re-allocated, it will be configured to resume from thread_continue. */ assert(continuation == self->continuation); thread = machine_switch_context(self, continuation, thread); assert(self == current_thread_volatile()); TLOG(1, "thread_invoke: returning machine_switch_context: self %p continuation %p thread %p\n", self, continuation, thread); + assert(continuation == NULL && self->continuation == NULL); + DTRACE_SCHED(on__cpu); #if KPERF kperf_on_cpu(self, NULL, __builtin_frame_address(0)); #endif /* KPERF */ - /* - * We have been resumed and are set to run. - */ + /* We have been resumed and are set to run. */ thread_dispatch(thread, self); - if (continuation) { - self->continuation = self->parameter = NULL; - - call_continuation(continuation, parameter, self->wait_result, TRUE); - /*NOTREACHED*/ - } - return TRUE; } @@ -2716,7 +2544,7 @@ pset_cancel_deferred_dispatch( uint32_t sampled_sched_run_count; pset_lock(pset); - sampled_sched_run_count = (volatile uint32_t) sched_run_buckets[TH_BUCKET_RUN]; + sampled_sched_run_count = os_atomic_load(&sched_run_buckets[TH_BUCKET_RUN], relaxed); /* * If we have emptied the run queue, and our current thread is runnable, we @@ -2768,7 +2596,6 @@ pset_cancel_deferred_dispatch( * reasonable facsimile of PROCESSOR_IDLE. */ - assert(active_processor->next_thread == THREAD_NULL); processor_state_update_idle(active_processor); active_processor->deadline = UINT64_MAX; pset_update_processor_state(pset, active_processor, PROCESSOR_IDLE); @@ -2809,6 +2636,7 @@ thread_csw_callout( * "self" is the new current thread that we have context switched to * * Called at splsched. + * */ void thread_dispatch( @@ -3048,7 +2876,7 @@ thread_dispatch( thread->last_made_runnable_time = thread->last_basepri_change_time = THREAD_NOT_RUNNABLE; thread->chosen_processor = PROCESSOR_NULL; - new_run_count = sched_run_decr(thread); + new_run_count = SCHED(run_count_decr)(thread); #if CONFIG_SCHED_SFI if (thread->reason & AST_SFI) { @@ -3153,8 +2981,7 @@ thread_dispatch( * TODO: Can we state that redispatching our old thread is also * uninteresting? */ - if ((((volatile uint32_t)sched_run_buckets[TH_BUCKET_RUN]) == 1) && - !(self->state & TH_IDLE)) { + if ((os_atomic_load(&sched_run_buckets[TH_BUCKET_RUN], relaxed) == 1) && !(self->state & TH_IDLE)) { pset_cancel_deferred_dispatch(processor->processor_set, processor); } #endif @@ -3291,6 +3118,9 @@ thread_run( * * Called at splsched when a thread first receives * a new stack after a continuation. + * + * Called with THREAD_NULL as the old thread when + * invoked by machine_load_context. */ void thread_continue( @@ -3305,6 +3135,8 @@ thread_continue( continuation = self->continuation; parameter = self->parameter; + assert(continuation != NULL); + #if KPERF kperf_on_cpu(self, continuation, NULL); #endif @@ -3320,7 +3152,13 @@ thread_continue( TLOG(1, "thread_continue: calling call_continuation\n"); - boolean_t enable_interrupts = thread != THREAD_NULL; + boolean_t enable_interrupts = TRUE; + + /* bootstrap thread, idle thread need to stay interrupts-disabled */ + if (thread == THREAD_NULL || (self->state & TH_IDLE)) { + enable_interrupts = FALSE; + } + call_continuation(continuation, parameter, self->wait_result, enable_interrupts); /*NOTREACHED*/ } @@ -3360,7 +3198,7 @@ run_queue_init( } rq->urgency = rq->count = 0; for (int i = 0; i < NRQS; i++) { - queue_init(&rq->queues[i]); + circle_queue_init(&rq->queues[i]); } } @@ -3375,25 +3213,16 @@ run_queue_init( */ thread_t run_queue_dequeue( - run_queue_t rq, - integer_t options) + run_queue_t rq, + sched_options_t options) { - thread_t thread; - queue_t queue = &rq->queues[rq->highq]; - - if (options & SCHED_PEEK) { - if (options & SCHED_HEADQ) { - thread = qe_queue_first(queue, struct thread, runq_links); - } else { - thread = qe_queue_last(queue, struct thread, runq_links); - } - return thread; - } + thread_t thread; + circle_queue_t queue = &rq->queues[rq->highq]; if (options & SCHED_HEADQ) { - thread = qe_dequeue_head(queue, struct thread, runq_links); + thread = cqe_dequeue_head(queue, struct thread, runq_links); } else { - thread = qe_dequeue_tail(queue, struct thread, runq_links); + thread = cqe_dequeue_tail(queue, struct thread, runq_links); } assert(thread != THREAD_NULL); @@ -3405,7 +3234,7 @@ run_queue_dequeue( if (SCHED(priority_is_urgent)(rq->highq)) { rq->urgency--; assert(rq->urgency >= 0); } - if (queue_empty(queue)) { + if (circle_queue_empty(queue)) { bitmap_clear(rq->bitmap, rq->highq); rq->highq = bitmap_first(rq->bitmap, NRQS); } @@ -3423,17 +3252,17 @@ run_queue_dequeue( */ boolean_t run_queue_enqueue( - run_queue_t rq, - thread_t thread, - integer_t options) + run_queue_t rq, + thread_t thread, + sched_options_t options) { - queue_t queue = &rq->queues[thread->sched_pri]; - boolean_t result = FALSE; + circle_queue_t queue = &rq->queues[thread->sched_pri]; + boolean_t result = FALSE; assert_thread_magic(thread); - if (queue_empty(queue)) { - enqueue_tail(queue, &thread->runq_links); + if (circle_queue_empty(queue)) { + circle_enqueue_tail(queue, &thread->runq_links); rq_bitmap_set(rq->bitmap, thread->sched_pri); if (thread->sched_pri > rq->highq) { @@ -3442,9 +3271,9 @@ run_queue_enqueue( } } else { if (options & SCHED_TAILQ) { - enqueue_tail(queue, &thread->runq_links); + circle_enqueue_tail(queue, &thread->runq_links); } else { - enqueue_head(queue, &thread->runq_links); + circle_enqueue_head(queue, &thread->runq_links); } } if (SCHED(priority_is_urgent)(thread->sched_pri)) { @@ -3468,17 +3297,19 @@ run_queue_remove( run_queue_t rq, thread_t thread) { + circle_queue_t queue = &rq->queues[thread->sched_pri]; + assert(thread->runq != PROCESSOR_NULL); assert_thread_magic(thread); - remqueue(&thread->runq_links); + circle_dequeue(queue, &thread->runq_links); SCHED_STATS_RUNQ_CHANGE(&rq->runq_stats, rq->count); rq->count--; if (SCHED(priority_is_urgent)(thread->sched_pri)) { rq->urgency--; assert(rq->urgency >= 0); } - if (queue_empty(&rq->queues[thread->sched_pri])) { + if (circle_queue_empty(queue)) { /* update run queue status */ bitmap_clear(rq->bitmap, thread->sched_pri); rq->highq = bitmap_first(rq->bitmap, NRQS); @@ -3487,6 +3318,28 @@ run_queue_remove( thread->runq = PROCESSOR_NULL; } +/* + * run_queue_peek + * + * Peek at the runq and return the highest + * priority thread from the runq. + * + * The run queue must be locked. + */ +thread_t +run_queue_peek( + run_queue_t rq) +{ + if (rq->count > 0) { + circle_queue_t queue = &rq->queues[rq->highq]; + thread_t thread = cqe_queue_first(queue, struct thread, runq_links); + assert_thread_magic(thread); + return thread; + } else { + return THREAD_NULL; + } +} + /* Assumes RT lock is not held, and acquires splsched/rt_lock itself */ void sched_rtglobal_runq_scan(sched_update_scan_context_t scan_context) @@ -3585,22 +3438,6 @@ realtime_setrun( /* */ assert(thread->bound_processor == PROCESSOR_NULL); - /* - * Dispatch directly onto idle processor. - */ - if ((thread->bound_processor == processor) - && processor->state == PROCESSOR_IDLE) { - processor->next_thread = thread; - processor_state_update_from_thread(processor, thread); - processor->deadline = thread->realtime.deadline; - pset_update_processor_state(pset, processor, PROCESSOR_DISPATCHING); - - ipi_type = sched_ipi_action(processor, thread, true, SCHED_IPI_EVENT_BOUND_THR); - pset_unlock(pset); - sched_ipi_perform(processor, ipi_type); - return; - } - if (processor->current_pri < BASEPRI_RTQUEUES) { preempt = (AST_PREEMPT | AST_URGENT); } else if (thread->realtime.deadline < processor->deadline) { @@ -3614,7 +3451,6 @@ realtime_setrun( ipi_type = SCHED_IPI_NONE; if (preempt != AST_NONE) { if (processor->state == PROCESSOR_IDLE) { - processor->next_thread = THREAD_NULL; processor_state_update_from_thread(processor, thread); processor->deadline = thread->realtime.deadline; pset_update_processor_state(pset, processor, PROCESSOR_DISPATCHING); @@ -3624,7 +3460,7 @@ realtime_setrun( ipi_type = sched_ipi_action(processor, thread, true, SCHED_IPI_EVENT_PREEMPT); } } else if (processor->state == PROCESSOR_DISPATCHING) { - if ((processor->next_thread == THREAD_NULL) && ((processor->current_pri < thread->sched_pri) || (processor->deadline > thread->realtime.deadline))) { + if ((processor->current_pri < thread->sched_pri) || (processor->deadline > thread->realtime.deadline)) { processor_state_update_from_thread(processor, thread); processor->deadline = thread->realtime.deadline; } @@ -3833,13 +3669,12 @@ processor_setrun( if (preempt != AST_NONE) { if (processor->state == PROCESSOR_IDLE) { - processor->next_thread = THREAD_NULL; processor_state_update_from_thread(processor, thread); processor->deadline = UINT64_MAX; pset_update_processor_state(pset, processor, PROCESSOR_DISPATCHING); ipi_action = eExitIdle; } else if (processor->state == PROCESSOR_DISPATCHING) { - if ((processor->next_thread == THREAD_NULL) && (processor->current_pri < thread->sched_pri)) { + if (processor->current_pri < thread->sched_pri) { processor_state_update_from_thread(processor, thread); processor->deadline = UINT64_MAX; } @@ -3857,7 +3692,6 @@ processor_setrun( thread->sched_pri >= processor->current_pri) { ipi_action = eInterruptRunning; } else if (processor->state == PROCESSOR_IDLE) { - processor->next_thread = THREAD_NULL; processor_state_update_from_thread(processor, thread); processor->deadline = UINT64_MAX; pset_update_processor_state(pset, processor, PROCESSOR_DISPATCHING); @@ -4237,6 +4071,25 @@ choose_processor( } /* + * lc_processor is used to indicate the best processor set run queue + * on which to enqueue a thread when all available CPUs are busy with + * higher priority threads, so try to make sure it is initialized. + */ + if (lc_processor == PROCESSOR_NULL) { + cpumap_t available_map = ((pset->cpu_state_map[PROCESSOR_IDLE] | + pset->cpu_state_map[PROCESSOR_RUNNING] | + pset->cpu_state_map[PROCESSOR_DISPATCHING]) & + pset->recommended_bitmask); + cpuid = lsb_first(available_map); + if (cpuid >= 0) { + lc_processor = processor_array[cpuid]; + lowest_count = SCHED(processor_runq_count)(lc_processor); + } + } + + /* + * Move onto the next processor set. + * * If all primary processors in this pset are running a higher * priority thread, move on to next pset. Only when we have * exhausted the search for primary processors do we @@ -4263,6 +4116,7 @@ choose_processor( * the secondary processor that would perturb the least priority * primary, or the least busy primary. */ + boolean_t fallback_processor = false; do { /* lowest_priority is evaluated in the main loops above */ if (lp_idle_secondary_processor != PROCESSOR_NULL) { @@ -4276,14 +4130,15 @@ choose_processor( lc_processor = PROCESSOR_NULL; } else { /* - * All processors are executing higher - * priority threads, and the lowest_count - * candidate was not usable, so we pick a processor - * to give this thread somewhere to be enqueued. + * All processors are executing higher priority threads, and + * the lowest_count candidate was not usable. * - * TODO: Need tracepoint or something to show when this happens - * TODO: Prefer a processor in the original pset + * For AMP platforms running the clutch scheduler always + * return a processor from the requested pset to allow the + * thread to be enqueued in the correct runq. For non-AMP + * platforms, simply return the master_processor. */ + fallback_processor = true; processor = master_processor; } @@ -4299,12 +4154,16 @@ choose_processor( /* * We must verify that the chosen processor is still available. - * master_processor is an exception, since we may need to preempt - * a running thread on it during processor shutdown (for sleep), - * and that thread needs to be enqueued on its runqueue to run - * when the processor is restarted. + * The cases where we pick the master_processor or the fallback + * processor are execptions, since we may need enqueue a thread + * on its runqueue if this is the last remaining processor + * during pset shutdown. + * + * would really help here since it + * gets rid of the weird last processor SHUTDOWN case where + * the pset is still schedulable. */ - if (processor != master_processor && (processor->state == PROCESSOR_SHUTDOWN || processor->state == PROCESSOR_OFF_LINE)) { + if (processor != master_processor && (fallback_processor == false) && (processor->state == PROCESSOR_SHUTDOWN || processor->state == PROCESSOR_OFF_LINE)) { processor = PROCESSOR_NULL; } } while (processor == PROCESSOR_NULL); @@ -4325,7 +4184,7 @@ choose_processor( void thread_setrun( thread_t thread, - integer_t options) + sched_options_t options) { processor_t processor; processor_set_t pset; @@ -4656,6 +4515,18 @@ set_sched_pri( /* If we're already at this priority, no need to mess with the runqueue */ if (new_priority == old_priority) { +#if CONFIG_SCHED_CLUTCH + /* For the first thread in the system, the priority is correct but + * th_sched_bucket is still TH_BUCKET_RUN. Since the clutch + * scheduler relies on the bucket being set for all threads, update + * its bucket here. + */ + if (thread->th_sched_bucket == TH_BUCKET_RUN) { + assert(is_current_thread); + SCHED(update_thread_bucket)(thread); + } +#endif /* CONFIG_SCHED_CLUTCH */ + return; } @@ -4668,6 +4539,16 @@ set_sched_pri( thread->sched_pri = new_priority; +#if CONFIG_SCHED_CLUTCH + /* + * Since for the clutch scheduler, the thread's bucket determines its runq + * in the hierarchy it is important to update the bucket when the thread + * lock is held and the thread has been removed from the runq hierarchy. + */ + SCHED(update_thread_bucket)(thread); + +#endif /* CONFIG_SCHED_CLUTCH */ + KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_CHANGE_PRIORITY), (uintptr_t)thread_tid(thread), thread->base_pri, @@ -4853,7 +4734,7 @@ thread_run_queue_remove( * thread locked, at splsched */ void -thread_run_queue_reinsert(thread_t thread, integer_t options) +thread_run_queue_reinsert(thread_t thread, sched_options_t options) { assert(thread->runq == PROCESSOR_NULL); assert(thread->state & (TH_RUN)); @@ -5026,21 +4907,13 @@ processor_idle( } } -#if CONFIG_SCHED_IDLE_IN_PLACE - if (thread != THREAD_NULL) { - /* Did idle-in-place thread wake up */ - if ((thread->state & (TH_WAIT | TH_SUSP)) != TH_WAIT || thread->wake_active) { - break; - } - } -#endif - IDLE_KERNEL_DEBUG_CONSTANT( MACHDBG_CODE(DBG_MACH_SCHED, MACH_IDLE) | DBG_FUNC_NONE, (uintptr_t)thread_tid(thread), rt_runq_count(pset), SCHED(processor_runq_count)(processor), -1, 0); machine_track_platform_idle(TRUE); machine_idle(); + /* returns with interrupts enabled */ machine_track_platform_idle(FALSE); @@ -5077,8 +4950,6 @@ processor_idle( cpu_quiescent_counter_join(ctime); - assert(processor->next_thread == NULL); - ast_t reason = AST_NONE; /* We're handling all scheduling AST's */ @@ -5105,20 +4976,42 @@ processor_idle( * Each processor has a dedicated thread which * executes the idle loop when there is no suitable * previous context. + * + * This continuation is entered with interrupts disabled. */ void -idle_thread(void) +idle_thread(__assert_only void* parameter, + __unused wait_result_t result) { - processor_t processor = current_processor(); - thread_t new_thread; + assert(ml_get_interrupts_enabled() == FALSE); + assert(parameter == NULL); + + processor_t processor = current_processor(); + + /* + * Ensure that anything running in idle context triggers + * preemption-disabled checks. + */ + disable_preemption(); + + /* + * Enable interrupts temporarily to handle any pending interrupts + * or IPIs before deciding to sleep + */ + spllo(); + + thread_t new_thread = processor_idle(THREAD_NULL, processor); + /* returns with interrupts disabled */ + + enable_preemption(); - new_thread = processor_idle(THREAD_NULL, processor); if (new_thread != THREAD_NULL) { - thread_run(processor->idle_thread, (thread_continue_t)idle_thread, NULL, new_thread); + thread_run(processor->idle_thread, + idle_thread, NULL, new_thread); /*NOTREACHED*/ } - thread_block((thread_continue_t)idle_thread); + thread_block(idle_thread); /*NOTREACHED*/ } @@ -5131,7 +5024,7 @@ idle_thread_create( spl_t s; char name[MAXTHREADNAMESIZE]; - result = kernel_thread_create((thread_continue_t)idle_thread, NULL, MAXPRI_KERNEL, &thread); + result = kernel_thread_create(idle_thread, NULL, MAXPRI_KERNEL, &thread); if (result != KERN_SUCCESS) { return result; } @@ -5345,31 +5238,35 @@ sched_timeshare_consider_maintenance(uint64_t ctime) uint64_t ndeadline = ctime + sched_tick_interval; - if (__probable(__sync_bool_compare_and_swap(&sched_maintenance_deadline, deadline, ndeadline))) { + if (__probable(os_atomic_cmpxchg(&sched_maintenance_deadline, deadline, ndeadline, seq_cst))) { thread_wakeup((event_t)sched_timeshare_maintenance_continue); sched_maintenance_wakeups++; } } - uint64_t load_compute_deadline = __c11_atomic_load(&sched_load_compute_deadline, memory_order_relaxed); +#if !CONFIG_SCHED_CLUTCH + /* + * Only non-clutch schedulers use the global load calculation EWMA algorithm. For clutch + * scheduler, the load is maintained at the thread group and bucket level. + */ + uint64_t load_compute_deadline = os_atomic_load_wide(&sched_load_compute_deadline, relaxed); if (__improbable(load_compute_deadline && ctime >= load_compute_deadline)) { uint64_t new_deadline = 0; - if (__c11_atomic_compare_exchange_strong(&sched_load_compute_deadline, &load_compute_deadline, new_deadline, - memory_order_relaxed, memory_order_relaxed)) { + if (os_atomic_cmpxchg(&sched_load_compute_deadline, load_compute_deadline, new_deadline, relaxed)) { compute_sched_load(); new_deadline = ctime + sched_load_compute_interval_abs; - __c11_atomic_store(&sched_load_compute_deadline, new_deadline, memory_order_relaxed); + os_atomic_store_wide(&sched_load_compute_deadline, new_deadline, relaxed); } } +#endif /* CONFIG_SCHED_CLUTCH */ #if __arm64__ - uint64_t perf_deadline = __c11_atomic_load(&sched_perfcontrol_callback_deadline, memory_order_relaxed); + uint64_t perf_deadline = os_atomic_load(&sched_perfcontrol_callback_deadline, relaxed); if (__improbable(perf_deadline && ctime >= perf_deadline)) { /* CAS in 0, if success, make callback. Otherwise let the next context switch check again. */ - if (__c11_atomic_compare_exchange_strong(&sched_perfcontrol_callback_deadline, &perf_deadline, 0, - memory_order_relaxed, memory_order_relaxed)) { + if (os_atomic_cmpxchg(&sched_perfcontrol_callback_deadline, perf_deadline, 0, relaxed)) { machine_perfcontrol_deadline_passed(perf_deadline); } } @@ -5477,9 +5374,9 @@ runq_scan( queue_index >= 0; queue_index = bitmap_next(runq->bitmap, queue_index)) { thread_t thread; - queue_t queue = &runq->queues[queue_index]; + circle_queue_t queue = &runq->queues[queue_index]; - qe_foreach_element(thread, queue, runq_links) { + cqe_foreach_element(thread, queue, runq_links) { assert(count > 0); assert_thread_magic(thread); @@ -5983,6 +5880,8 @@ sched_update_recommended_cores(uint64_t recommended_cores) bit_set(recommended_cores, master_processor->cpu_id); /* add boot processor or we hang */ } + boolean_t pset_newly_recommended = false; + /* First set recommended cores */ pset_lock(pset); avail_count = 0; @@ -5991,11 +5890,15 @@ sched_update_recommended_cores(uint64_t recommended_cores) if (nset != pset) { pset_unlock(pset); pset = nset; + pset_newly_recommended = false; pset_lock(pset); } if (bit_test(recommended_cores, processor->cpu_id)) { processor->is_recommended = TRUE; + if (bit_first(pset->recommended_bitmask) == -1) { + pset_newly_recommended = true; + } bit_set(pset->recommended_bitmask, processor->cpu_id); if (processor->state == PROCESSOR_IDLE) { @@ -6006,6 +5909,9 @@ sched_update_recommended_cores(uint64_t recommended_cores) if (processor->state != PROCESSOR_OFF_LINE) { avail_count++; } + if (pset_newly_recommended) { + SCHED(pset_made_schedulable)(processor, pset, false); + } } } while ((processor = processor->processor_list) != NULL); pset_unlock(pset); @@ -6103,13 +6009,6 @@ sched_qos_max_parallelism(__unused int qos, uint64_t options) (host_info_t)&hinfo, &count); assert(kret == KERN_SUCCESS); - /* We would not want multiple realtime threads running on the - * same physical core; even for SMT capable machines. - */ - if (options & QOS_PARALLELISM_REALTIME) { - return hinfo.physical_cpu; - } - if (options & QOS_PARALLELISM_COUNT_LOGICAL) { return hinfo.logical_cpu; } else { @@ -6149,20 +6048,8 @@ sched_perfcontrol_update_callback_deadline(uint64_t new_deadline) * then I cancelled the callback, otherwise I didn't */ - uint64_t old_deadline = __c11_atomic_load(&sched_perfcontrol_callback_deadline, - memory_order_relaxed); - - - while (!__c11_atomic_compare_exchange_weak(&sched_perfcontrol_callback_deadline, - &old_deadline, new_deadline, - memory_order_relaxed, memory_order_relaxed)) { - ; - } - - - /* now old_deadline contains previous value, which might not be the same if it raced */ - - return (old_deadline != 0) ? TRUE : FALSE; + return os_atomic_xchg(&sched_perfcontrol_callback_deadline, new_deadline, + relaxed) != 0; } #endif /* __arm64__ */ @@ -6170,7 +6057,13 @@ sched_perfcontrol_update_callback_deadline(uint64_t new_deadline) void sched_update_pset_load_average(processor_set_t pset) { - int load = ((bit_count(pset->cpu_state_map[PROCESSOR_RUNNING]) + pset->pset_runq.count + rt_runq_count(pset)) << PSET_LOAD_NUMERATOR_SHIFT); +#if CONFIG_SCHED_CLUTCH + int non_rt_load = sched_clutch_root_count(&pset->pset_clutch_root); +#else /* CONFIG_SCHED_CLUTCH */ + int non_rt_load = pset->pset_runq.count; +#endif /* CONFIG_SCHED_CLUTCH */ + + int load = ((bit_count(pset->cpu_state_map[PROCESSOR_RUNNING]) + non_rt_load + rt_runq_count(pset)) << PSET_LOAD_NUMERATOR_SHIFT); int new_load_average = (pset->load_average + load) >> 1; pset->load_average = new_load_average; @@ -6316,6 +6209,14 @@ sched_ok_to_run_realtime_thread(processor_set_t pset, processor_t processor) return ok_to_run_realtime_thread; } +void +sched_pset_made_schedulable(__unused processor_t processor, processor_set_t pset, boolean_t drop_lock) +{ + if (drop_lock) { + pset_unlock(pset); + } +} + void thread_set_no_smt(bool set) { @@ -6365,4 +6266,4 @@ sysctl_task_get_no_smt(void) } return '0'; } -#endif +#endif /* DEVELOPMENT || DEBUG */ diff --git a/osfmk/kern/sched_prim.h b/osfmk/kern/sched_prim.h index 12776a617..2f806bdd0 100644 --- a/osfmk/kern/sched_prim.h +++ b/osfmk/kern/sched_prim.h @@ -78,6 +78,7 @@ #ifdef MACH_KERNEL_PRIVATE #include +#include /* Initialization */ extern void sched_init(void); @@ -140,24 +141,23 @@ extern int thread_run( thread_t new_thread); /* Resume thread with new stack */ -extern void thread_continue( - thread_t old_thread); +extern __dead2 void thread_continue(thread_t old_thread); /* Invoke continuation */ -extern void call_continuation( +extern __dead2 void call_continuation( thread_continue_t continuation, - void *parameter, + void *parameter, wait_result_t wresult, - boolean_t enable_interrupts); + boolean_t enable_interrupts); /* * Flags that can be passed to set_sched_pri * to skip side effects */ -typedef enum { +__options_decl(set_sched_pri_options_t, uint32_t, { SETPRI_DEFAULT = 0x0, SETPRI_LAZY = 0x1, /* Avoid setting AST flags or sending IPIs */ -} set_sched_pri_options_t; +}); /* Set the current scheduled priority */ extern void set_sched_pri( @@ -170,6 +170,12 @@ extern void sched_set_thread_base_priority( thread_t thread, int priority); +/* Set absolute base priority of the specified thread */ +extern void sched_set_kernel_thread_priority( + thread_t thread, + int priority); + + /* Set the thread's true scheduling mode */ extern void sched_set_thread_mode(thread_t thread, sched_mode_t mode); @@ -180,12 +186,6 @@ extern void sched_thread_mode_demote(thread_t thread, extern void sched_thread_mode_undemote(thread_t thread, uint32_t reason); -extern void sched_thread_promote_to_pri(thread_t thread, int priority, uintptr_t trace_obj); -extern void sched_thread_update_promotion_to_pri(thread_t thread, int priority, uintptr_t trace_obj); -extern void sched_thread_unpromote(thread_t thread, uintptr_t trace_obj); - -extern void assert_promotions_invariant(thread_t thread); - extern void sched_thread_promote_reason(thread_t thread, uint32_t reason, uintptr_t trace_obj); extern void sched_thread_unpromote_reason(thread_t thread, uint32_t reason, uintptr_t trace_obj); @@ -212,8 +212,10 @@ extern void lightweight_update_priority( extern void sched_default_quantum_expire(thread_t thread); -/* Idle processor thread */ -extern void idle_thread(void); +/* Idle processor thread continuation */ +extern void idle_thread( + void* parameter, + wait_result_t result); extern kern_return_t idle_thread_create( processor_t processor); @@ -228,19 +230,18 @@ extern wait_result_t thread_block_reason( void *parameter, ast_t reason); -/* Reschedule thread for execution */ -extern void thread_setrun( - thread_t thread, - integer_t options); - -typedef enum { +__options_decl(sched_options_t, uint32_t, { SCHED_NONE = 0x0, SCHED_TAILQ = 0x1, SCHED_HEADQ = 0x2, SCHED_PREEMPT = 0x4, SCHED_REBALANCE = 0x8, - SCHED_PEEK = 0x10, -} sched_options_t; +}); + +/* Reschedule thread for execution */ +extern void thread_setrun( + thread_t thread, + sched_options_t options); extern processor_set_t task_choose_pset( task_t task); @@ -267,17 +268,20 @@ extern void run_queue_init( extern thread_t run_queue_dequeue( run_queue_t runq, - integer_t options); + sched_options_t options); extern boolean_t run_queue_enqueue( run_queue_t runq, - thread_t thread, - integer_t options); + thread_t thread, + sched_options_t options); extern void run_queue_remove( run_queue_t runq, thread_t thread); +extern thread_t run_queue_peek( + run_queue_t runq); + struct sched_update_scan_context { uint64_t earliest_bg_make_runnable_time; uint64_t earliest_normal_make_runnable_time; @@ -287,6 +291,11 @@ typedef struct sched_update_scan_context *sched_update_scan_context_t; extern void sched_rtglobal_runq_scan(sched_update_scan_context_t scan_context); +extern void sched_pset_made_schedulable( + processor_t processor, + processor_set_t pset, + boolean_t drop_lock); + /* * Enum to define various events which need IPIs. The IPI policy * engine decides what kind of IPI to use based on destination @@ -351,7 +360,7 @@ extern boolean_t thread_run_queue_remove(thread_t thread); thread_t thread_run_queue_remove_for_handoff(thread_t thread); /* Put a thread back in the run queue after being yanked */ -extern void thread_run_queue_reinsert(thread_t thread, integer_t options); +extern void thread_run_queue_reinsert(thread_t thread, sched_options_t options); extern void thread_timer_expire( void *thread, @@ -469,8 +478,6 @@ extern void thread_exception_return(void) __dead2; /* String declaring the name of the current scheduler */ extern char sched_string[SCHED_STRING_MAX_LENGTH]; -extern thread_t port_name_to_thread_for_ulock(mach_port_name_t thread_name); - /* Attempt to context switch to a specific runnable thread */ extern wait_result_t thread_handoff_deallocate(thread_t thread); @@ -572,22 +579,19 @@ extern boolean_t preemption_enabled(void); * a function pointer table. */ -#if !defined(CONFIG_SCHED_TRADITIONAL) && !defined(CONFIG_SCHED_PROTO) && !defined(CONFIG_SCHED_GRRR) && !defined(CONFIG_SCHED_MULTIQ) +#if !defined(CONFIG_SCHED_TRADITIONAL) && !defined(CONFIG_SCHED_PROTO) && !defined(CONFIG_SCHED_GRRR) && !defined(CONFIG_SCHED_MULTIQ) && !defined(CONFIG_SCHED_CLUTCH) #error Enable at least one scheduler algorithm in osfmk/conf/MASTER.XXX #endif -#if DEBUG -#define SCHED(f) (sched_current_dispatch->f) -#else /* DEBUG */ -/* - * For DEV & REL kernels, use a static dispatch table instead of - * using the indirect function table. - */ +#if CONFIG_SCHED_CLUTCH +extern const struct sched_dispatch_table sched_clutch_dispatch; +#define SCHED(f) (sched_clutch_dispatch.f) +#else /* CONFIG_SCHED_CLUTCH */ extern const struct sched_dispatch_table sched_dualq_dispatch; #define SCHED(f) (sched_dualq_dispatch.f) +#endif /* CONFIG_SCHED_CLUTCH */ -#endif /* DEBUG */ struct sched_dispatch_table { const char *sched_name; @@ -636,7 +640,7 @@ struct sched_dispatch_table { boolean_t (*processor_enqueue)( processor_t processor, thread_t thread, - integer_t options); + sched_options_t options); /* Migrate threads away in preparation for processor shutdown */ void (*processor_queue_shutdown)( @@ -713,13 +717,6 @@ struct sched_dispatch_table { void (*thread_update_scan)(sched_update_scan_context_t scan_context); - /* - * Use processor->next_thread to pin a thread to an idle - * processor. If FALSE, threads are enqueued and can - * be stolen by other processors. - */ - boolean_t direct_dispatch_to_idle_processors; - /* Supports more than one pset */ boolean_t multiple_psets_enabled; /* Supports scheduler groups */ @@ -747,6 +744,16 @@ struct sched_dispatch_table { void (*check_spill)(processor_set_t pset, thread_t thread); sched_ipi_type_t (*ipi_policy)(processor_t dst, thread_t thread, boolean_t dst_idle, sched_ipi_event_t event); bool (*thread_should_yield)(processor_t processor, thread_t thread); + + /* Routine to update run counts */ + uint32_t (*run_count_incr)(thread_t thread); + uint32_t (*run_count_decr)(thread_t thread); + + /* Routine to update scheduling bucket for a thread */ + void (*update_thread_bucket)(thread_t thread); + + /* Routine to inform the scheduler when a new pset becomes schedulable */ + void (*pset_made_schedulable)(processor_t processor, processor_set_t pset, boolean_t drop_lock); }; #if defined(CONFIG_SCHED_TRADITIONAL) @@ -767,11 +774,9 @@ extern const struct sched_dispatch_table sched_proto_dispatch; extern const struct sched_dispatch_table sched_grrr_dispatch; #endif -/* - * It is an error to invoke any scheduler-related code - * before this is set up - */ -extern const struct sched_dispatch_table *sched_current_dispatch; +#if defined(CONFIG_SCHED_CLUTCH) +extern const struct sched_dispatch_table sched_clutch_dispatch; +#endif #endif /* MACH_KERNEL_PRIVATE */ diff --git a/osfmk/kern/sched_proto.c b/osfmk/kern/sched_proto.c index f1297189d..ce5d226f1 100644 --- a/osfmk/kern/sched_proto.c +++ b/osfmk/kern/sched_proto.c @@ -97,7 +97,7 @@ static boolean_t sched_proto_processor_enqueue( processor_t processor, thread_t thread, - integer_t options); + sched_options_t options); static void sched_proto_processor_queue_shutdown( @@ -182,7 +182,6 @@ const struct sched_dispatch_table sched_proto_dispatch = { .processor_runq_stats_count_sum = sched_proto_processor_runq_stats_count_sum, .processor_bound_count = sched_proto_processor_bound_count, .thread_update_scan = sched_proto_thread_update_scan, - .direct_dispatch_to_idle_processors = TRUE, .multiple_psets_enabled = TRUE, .sched_groups_enabled = FALSE, .avoid_processor_enabled = FALSE, @@ -199,6 +198,10 @@ const struct sched_dispatch_table sched_proto_dispatch = { .check_spill = sched_check_spill, .ipi_policy = sched_ipi_policy, .thread_should_yield = sched_thread_should_yield, + .run_count_incr = sched_run_incr, + .run_count_decr = sched_run_decr, + .update_thread_bucket = sched_update_thread_bucket, + .pset_made_schedulable = sched_pset_made_schedulable, }; static struct run_queue *global_runq; @@ -307,7 +310,7 @@ sched_proto_choose_thread(processor_t processor, ast_t reason __unused) { run_queue_t rq = global_runq; - queue_t queue; + circle_queue_t queue; int pri, count; thread_t thread; @@ -329,18 +332,17 @@ sched_proto_choose_thread(processor_t processor, */ while (count > 0 && pri >= priority) { - thread = (thread_t)queue_first(queue); - while (!queue_end(queue, (queue_entry_t)thread)) { + cqe_foreach_element_safe(thread, queue, runq_links) { if ((thread->bound_processor == PROCESSOR_NULL || thread->bound_processor == processor) && runqueue_generation != thread->runqueue_generation) { - remqueue((queue_entry_t)thread); + circle_dequeue(queue, &thread->runq_links); thread->runq = PROCESSOR_NULL; thread->runqueue_generation = runqueue_generation; SCHED_STATS_RUNQ_CHANGE(&rq->runq_stats, rq->count); rq->count--; - if (queue_empty(queue)) { + if (circle_queue_empty(queue)) { bitmap_clear(rq->bitmap, pri); rq->highq = bitmap_first(rq->bitmap, NRQS); } @@ -401,7 +403,7 @@ static boolean_t sched_proto_processor_enqueue( processor_t processor __unused, thread_t thread, - integer_t options) + sched_options_t options) { run_queue_t rq = global_runq; boolean_t result; @@ -439,20 +441,7 @@ sched_proto_processor_queue_remove( * Thread is on a run queue and we have a lock on * that run queue. */ - remqueue((queue_entry_t)thread); - SCHED_STATS_RUNQ_CHANGE(&rq->runq_stats, rq->count); - rq->count--; - if (SCHED(priority_is_urgent)(thread->sched_pri)) { - rq->urgency--; assert(rq->urgency >= 0); - } - - if (queue_empty(rq->queues + thread->sched_pri)) { - /* update run queue status */ - bitmap_clear(rq->bitmap, thread->sched_pri); - rq->highq = bitmap_first(rq->bitmap, NRQS); - } - - thread->runq = PROCESSOR_NULL; + run_queue_remove(rq, thread); } else { /* * The thread left the run queue before we could diff --git a/osfmk/kern/sched_traditional.c b/osfmk/kern/sched_traditional.c index e91504583..3297904d0 100644 --- a/osfmk/kern/sched_traditional.c +++ b/osfmk/kern/sched_traditional.c @@ -81,7 +81,8 @@ static void sched_traditional_processor_queue_shutdown(processor_t processor); static boolean_t -sched_traditional_processor_enqueue(processor_t processor, thread_t thread, integer_t options); +sched_traditional_processor_enqueue(processor_t processor, thread_t thread, + sched_options_t options); static boolean_t sched_traditional_processor_queue_remove(processor_t processor, thread_t thread); @@ -160,7 +161,6 @@ const struct sched_dispatch_table sched_traditional_dispatch = { .processor_runq_stats_count_sum = sched_traditional_processor_runq_stats_count_sum, .processor_bound_count = sched_traditional_processor_bound_count, .thread_update_scan = sched_traditional_thread_update_scan, - .direct_dispatch_to_idle_processors = TRUE, .multiple_psets_enabled = TRUE, .sched_groups_enabled = FALSE, .avoid_processor_enabled = FALSE, @@ -177,6 +177,10 @@ const struct sched_dispatch_table sched_traditional_dispatch = { .check_spill = sched_check_spill, .ipi_policy = sched_ipi_policy, .thread_should_yield = sched_thread_should_yield, + .run_count_incr = sched_run_incr, + .run_count_decr = sched_run_decr, + .update_thread_bucket = sched_update_thread_bucket, + .pset_made_schedulable = sched_pset_made_schedulable, }; const struct sched_dispatch_table sched_traditional_with_pset_runqueue_dispatch = { @@ -208,7 +212,6 @@ const struct sched_dispatch_table sched_traditional_with_pset_runqueue_dispatch .processor_runq_stats_count_sum = sched_traditional_with_pset_runqueue_processor_runq_stats_count_sum, .processor_bound_count = sched_traditional_processor_bound_count, .thread_update_scan = sched_traditional_thread_update_scan, - .direct_dispatch_to_idle_processors = FALSE, .multiple_psets_enabled = TRUE, .sched_groups_enabled = FALSE, .avoid_processor_enabled = FALSE, @@ -225,6 +228,10 @@ const struct sched_dispatch_table sched_traditional_with_pset_runqueue_dispatch .check_spill = sched_check_spill, .ipi_policy = sched_ipi_policy, .thread_should_yield = sched_thread_should_yield, + .run_count_incr = sched_run_incr, + .run_count_decr = sched_run_decr, + .update_thread_bucket = sched_update_thread_bucket, + .pset_made_schedulable = sched_pset_made_schedulable, }; static void @@ -337,17 +344,16 @@ sched_traditional_choose_thread_from_runq( run_queue_t rq, int priority) { - queue_t queue = rq->queues + rq->highq; + circle_queue_t queue = rq->queues + rq->highq; int pri = rq->highq; int count = rq->count; thread_t thread; while (count > 0 && pri >= priority) { - thread = (thread_t)(uintptr_t)queue_first(queue); - while (!queue_end(queue, (queue_entry_t)thread)) { + cqe_foreach_element_safe(thread, queue, runq_links) { if (thread->bound_processor == PROCESSOR_NULL || thread->bound_processor == processor) { - remqueue((queue_entry_t)thread); + circle_dequeue(queue, &thread->runq_links); thread->runq = PROCESSOR_NULL; SCHED_STATS_RUNQ_CHANGE(&rq->runq_stats, rq->count); @@ -355,16 +361,13 @@ sched_traditional_choose_thread_from_runq( if (SCHED(priority_is_urgent)(pri)) { rq->urgency--; assert(rq->urgency >= 0); } - if (queue_empty(queue)) { + if (circle_queue_empty(queue)) { bitmap_clear(rq->bitmap, pri); rq->highq = bitmap_first(rq->bitmap, NRQS); } - return thread; } count--; - - thread = (thread_t)(uintptr_t)queue_next((queue_entry_t)thread); } queue--; pri--; @@ -397,8 +400,8 @@ sched_traditional_initial_thread_sched_mode(task_t parent_task) */ static boolean_t sched_traditional_processor_enqueue(processor_t processor, - thread_t thread, - integer_t options) + thread_t thread, + sched_options_t options) { run_queue_t rq = runq_for_processor(processor); boolean_t result; @@ -521,21 +524,18 @@ sched_traditional_processor_queue_shutdown(processor_t processor) { processor_set_t pset = processor->processor_set; run_queue_t rq = runq_for_processor(processor); - queue_t queue = rq->queues + rq->highq; + circle_queue_t queue = rq->queues + rq->highq; int pri = rq->highq; int count = rq->count; - thread_t next, thread; - queue_head_t tqueue; + thread_t thread; + circle_queue_head_t tqueue; - queue_init(&tqueue); + circle_queue_init(&tqueue); while (count > 0) { - thread = (thread_t)(uintptr_t)queue_first(queue); - while (!queue_end(queue, (queue_entry_t)thread)) { - next = (thread_t)(uintptr_t)queue_next((queue_entry_t)thread); - + cqe_foreach_element_safe(thread, queue, runq_links) { if (thread->bound_processor == PROCESSOR_NULL) { - remqueue((queue_entry_t)thread); + circle_dequeue(queue, &thread->runq_links); thread->runq = PROCESSOR_NULL; SCHED_STATS_RUNQ_CHANGE(&rq->runq_stats, rq->count); @@ -544,16 +544,14 @@ sched_traditional_processor_queue_shutdown(processor_t processor) if (SCHED(priority_is_urgent)(pri)) { rq->urgency--; assert(rq->urgency >= 0); } - if (queue_empty(queue)) { + if (circle_queue_empty(queue)) { bitmap_clear(rq->bitmap, pri); rq->highq = bitmap_first(rq->bitmap, NRQS); } - enqueue_tail(&tqueue, (queue_entry_t)thread); + circle_enqueue_tail(&tqueue, &thread->runq_links); } count--; - - thread = next; } queue--; pri--; @@ -561,7 +559,7 @@ sched_traditional_processor_queue_shutdown(processor_t processor) pset_unlock(pset); - while ((thread = (thread_t)(uintptr_t)dequeue_head(&tqueue)) != THREAD_NULL) { + while ((thread = cqe_dequeue_head(&tqueue, struct thread, runq_links)) != THREAD_NULL) { thread_lock(thread); thread_setrun(thread, SCHED_TAILQ); @@ -652,16 +650,15 @@ static thread_t sched_traditional_steal_processor_thread(processor_t processor) { run_queue_t rq = runq_for_processor(processor); - queue_t queue = rq->queues + rq->highq; + circle_queue_t queue = rq->queues + rq->highq; int pri = rq->highq; int count = rq->count; thread_t thread; while (count > 0) { - thread = (thread_t)(uintptr_t)queue_first(queue); - while (!queue_end(queue, (queue_entry_t)thread)) { + cqe_foreach_element_safe(thread, queue, runq_links) { if (thread->bound_processor == PROCESSOR_NULL) { - remqueue((queue_entry_t)thread); + circle_dequeue(queue, &thread->runq_links); thread->runq = PROCESSOR_NULL; SCHED_STATS_RUNQ_CHANGE(&rq->runq_stats, rq->count); @@ -670,7 +667,7 @@ sched_traditional_steal_processor_thread(processor_t processor) if (SCHED(priority_is_urgent)(pri)) { rq->urgency--; assert(rq->urgency >= 0); } - if (queue_empty(queue)) { + if (circle_queue_empty(queue)) { bitmap_clear(rq->bitmap, pri); rq->highq = bitmap_first(rq->bitmap, NRQS); } @@ -678,8 +675,6 @@ sched_traditional_steal_processor_thread(processor_t processor) return thread; } count--; - - thread = (thread_t)(uintptr_t)queue_next((queue_entry_t)thread); } queue--; pri--; diff --git a/osfmk/kern/simple_lock.h b/osfmk/kern/simple_lock.h index 258d323db..d791567b0 100644 --- a/osfmk/kern/simple_lock.h +++ b/osfmk/kern/simple_lock.h @@ -77,6 +77,7 @@ #include #ifdef MACH_KERNEL_PRIVATE +#include #include extern void hw_lock_init( @@ -141,61 +142,14 @@ extern void hw_lock_unlock_nopreempt( extern unsigned int hw_lock_held( hw_lock_t); +extern boolean_t hw_atomic_test_and_set32(uint32_t *target, uint32_t test_mask, uint32_t set_mask, enum memory_order ord, boolean_t wait); #endif /* MACH_KERNEL_PRIVATE */ __BEGIN_DECLS -extern uint32_t hw_atomic_add( - volatile uint32_t *dest, - uint32_t delt); - -extern uint32_t hw_atomic_sub( - volatile uint32_t *dest, - uint32_t delt); - -extern uint32_t hw_atomic_or( - volatile uint32_t *dest, - uint32_t mask); - -extern uint32_t hw_atomic_and( - volatile uint32_t *dest, - uint32_t mask); - -/* - * Variant of hw_atomic_or which doesn't return a value; potentially - * more efficient on some platforms. - */ -extern void hw_atomic_or_noret( - volatile uint32_t *dest, - uint32_t mask); -/* - * Variant of hw_atomic_and which doesn't return a value; potentially - * more efficient on some platforms. - */ - -extern void hw_atomic_and_noret( - volatile uint32_t *dest, - uint32_t mask); - -extern uint32_t hw_compare_and_store( - uint32_t oldval, - uint32_t newval, - volatile uint32_t *dest); - -extern void hw_queue_atomic( - unsigned int *anchor, - unsigned int *elem, - unsigned int disp); - -extern void hw_queue_atomic_list( - unsigned int *anchor, - unsigned int *first, - unsigned int *last, - unsigned int disp); - -extern unsigned int *hw_dequeue_atomic( - unsigned int *anchor, - unsigned int disp); +extern void * hw_wait_while_equals( + void **address, + void *current); extern void usimple_lock_init( usimple_lock_t, @@ -213,6 +167,19 @@ extern unsigned int usimple_lock_try( extern void usimple_lock_try_lock_loop( usimple_lock_t, lck_grp_t*); + +#if defined(__x86_64__) +extern unsigned int usimple_lock_try_lock_mp_signal_safe_loop_deadline( + usimple_lock_t, + uint64_t, + lck_grp_t*); + +extern unsigned int usimple_lock_try_lock_mp_signal_safe_loop_duration( + usimple_lock_t, + uint64_t, + lck_grp_t*); +#endif + #else extern void usimple_lock( usimple_lock_t); @@ -228,6 +195,18 @@ extern void usimple_lock_try_lock_loop( usimple_lock_t); #define usimple_lock_try_lock_loop(lck, grp) usimple_lock_try_lock_loop(lck) +#if defined(__x86_64__) +extern unsigned int usimple_lock_try_lock_mp_signal_safe_loop_deadline( + usimple_lock_t, + uint64_t); +#define usimple_lock_try_lock_mp_signal_safe_loop_deadline(lck, ddl, grp) usimple_lock_try_lock_mp_signal_safe_loop_deadline(lck, ddl) + +extern unsigned int usimple_lock_try_lock_mp_signal_safe_loop_duration( + usimple_lock_t, + uint64_t); +#define usimple_lock_try_lock_mp_signal_safe_loop_duration(lck, dur, grp) usimple_lock_try_lock_mp_signal_safe_loop_duration(lck, dur) +#endif + #endif /* LOCK_STATS */ extern void usimple_unlock( @@ -250,9 +229,73 @@ __END_DECLS #define simple_unlock(l) usimple_unlock(l) #define simple_lock_try(l, grp) usimple_lock_try(l, grp) #define simple_lock_try_lock_loop(l, grp) usimple_lock_try_lock_loop(l, grp) +#define simple_lock_try_lock_mp_signal_safe_loop_deadline(l, ddl, grp) usimple_lock_try_lock_mp_signal_safe_loop_deadline(l, ddl, grp) +#define simple_lock_try_lock_mp_signal_safe_loop_duration(l, dur, grp) usimple_lock_try_lock_mp_signal_safe_loop_duration(l, dur, grp) #define simple_lock_addr(l) (&(l)) #endif /* !defined(simple_lock_init) */ +#ifdef MACH_KERNEL_PRIVATE + +typedef uint32_t hw_lock_bit_t; + +#if LOCK_STATS +extern void hw_lock_bit( + hw_lock_bit_t *, + unsigned int, + lck_grp_t*); + +extern void hw_lock_bit_nopreempt( + hw_lock_bit_t *, + unsigned int, + lck_grp_t*); + +extern unsigned int hw_lock_bit_try( + hw_lock_bit_t *, + unsigned int, + lck_grp_t*); + +extern unsigned int hw_lock_bit_to( + hw_lock_bit_t *, + unsigned int, + uint32_t, + lck_grp_t*); + +#else +extern void hw_lock_bit( + hw_lock_bit_t *, + unsigned int); +#define hw_lock_bit(lck, bit, grp) hw_lock_bit(lck, bit) + +extern void hw_lock_bit_nopreempt( + hw_lock_bit_t *, + unsigned int); +#define hw_lock_bit_nopreempt(lck, bit, grp) hw_lock_bit_nopreempt(lck, bit) + +extern unsigned int hw_lock_bit_try( + hw_lock_bit_t *, + unsigned int); +#define hw_lock_bit_try(lck, bit, grp) hw_lock_bit_try(lck, bit) + +extern unsigned int hw_lock_bit_to( + hw_lock_bit_t *, + unsigned int, + uint32_t); +#define hw_lock_bit_to(lck, bit, timeout, grp) hw_lock_bit_to(lck, bit, timeout) + +#endif /* LOCK_STATS */ + +extern void hw_unlock_bit( + hw_lock_bit_t *, + unsigned int); + +extern void hw_unlock_bit_nopreempt( + hw_lock_bit_t *, + unsigned int); + +#define hw_lock_bit_held(l, b) (((*(l))&(1< -#include #include #include @@ -86,6 +85,7 @@ #include #include #include +#include #include #include #if CONFIG_SCHED_SFI @@ -98,7 +98,6 @@ #if CONFIG_TELEMETRY #include #endif -#include #include #include #include @@ -220,11 +219,6 @@ unsigned int trace_wrap = 0; boolean_t trace_serial = FALSE; boolean_t early_boot_complete = FALSE; -/* physically contiguous carveouts */ -SECURITY_READ_ONLY_LATE(uintptr_t) phys_carveout = 0; -SECURITY_READ_ONLY_LATE(uintptr_t) phys_carveout_pa = 0; -SECURITY_READ_ONLY_LATE(size_t) phys_carveout_size = 0; - /* mach leak logging */ int log_leaks = 0; @@ -250,13 +244,6 @@ kernel_early_bootstrap(void) serverperfmode = 1; } - lck_mod_init(); - - /* - * Initialize the timer callout world - */ - timer_call_init(); - #if CONFIG_SCHED_SFI /* * Configure SFI classes @@ -387,6 +374,9 @@ kernel_bootstrap(void) kernel_bootstrap_log("thread_init"); thread_init(); + kernel_bootstrap_log("restartable_init"); + restartable_init(); + kernel_bootstrap_log("workq_init"); workq_init(); @@ -414,6 +404,9 @@ kernel_bootstrap(void) /* initialize host_statistics */ host_statistics_init(); + /* initialize exceptions */ + exception_init(); + /* * Create a kernel thread to execute the kernel bootstrap. */ @@ -431,6 +424,7 @@ kernel_bootstrap(void) /* TODO: do a proper thread_start() (without the thread_setrun()) */ thread->state = TH_RUN; thread->last_made_runnable_time = mach_absolute_time(); + thread_set_thread_name(thread, "kernel_bootstrap_thread"); thread_deallocate(thread); @@ -522,28 +516,12 @@ kernel_bootstrap_thread(void) #if (defined(__i386__) || defined(__x86_64__)) && NCOPY_WINDOWS > 0 /* * Create and initialize the physical copy window for processor 0 - * This is required before starting kicking off IOKit. + * This is required before starting kicking off IOKit. */ cpu_physwindow_init(0); #endif - if (PE_i_can_has_debugger(NULL)) { - unsigned int phys_carveout_mb = 0; - if (PE_parse_boot_argn("phys_carveout_mb", &phys_carveout_mb, - sizeof(phys_carveout_mb)) && phys_carveout_mb > 0) { - phys_carveout_size = phys_carveout_mb * 1024 * 1024; - kern_return_t kr = kmem_alloc_contig(kernel_map, - (vm_offset_t *)&phys_carveout, phys_carveout_size, - VM_MAP_PAGE_MASK(kernel_map), 0, 0, KMA_NOPAGEWAIT, - VM_KERN_MEMORY_DIAG); - if (kr != KERN_SUCCESS) { - kprintf("failed to allocate %uMB for phys_carveout_mb: %u\n", - phys_carveout_mb, (unsigned int)kr); - } else { - phys_carveout_pa = kvtophys((vm_offset_t)phys_carveout); - } - } - } + phys_carveout_init(); #if MACH_KDP kernel_bootstrap_log("kdp_init"); @@ -700,6 +678,10 @@ kernel_bootstrap_thread(void) bsd_init(); #endif +#if defined (__x86_64__) + x86_64_protect_data_const(); +#endif + /* * Get rid of pages used for early boot tracing. @@ -730,6 +712,8 @@ kernel_bootstrap_thread(void) * slave_main: * * Load the first thread to start a processor. + * This path will also be used by the master processor + * after being offlined. */ void slave_main(void *machine_param) @@ -741,13 +725,19 @@ slave_main(void *machine_param) * Use the idle processor thread if there * is no dedicated start up thread. */ - if (processor->next_thread == THREAD_NULL) { + if (processor->processor_offlined == true) { + /* Return to the saved processor_offline context */ + assert(processor->startup_thread == THREAD_NULL); + thread = processor->idle_thread; - thread->continuation = (thread_continue_t)processor_start_thread; thread->parameter = machine_param; + } else if (processor->startup_thread) { + thread = processor->startup_thread; + processor->startup_thread = THREAD_NULL; } else { - thread = processor->next_thread; - processor->next_thread = THREAD_NULL; + thread = processor->idle_thread; + thread->continuation = processor_start_thread; + thread->parameter = machine_param; } load_context(thread); @@ -762,7 +752,8 @@ slave_main(void *machine_param) * Called at splsched. */ void -processor_start_thread(void *machine_param) +processor_start_thread(void *machine_param, + __unused wait_result_t result) { processor_t processor = current_processor(); thread_t self = current_thread(); @@ -774,7 +765,7 @@ processor_start_thread(void *machine_param) * reenter the idle loop, else terminate. */ if (self == processor->idle_thread) { - thread_block((thread_continue_t)idle_thread); + thread_block(idle_thread); } thread_terminate(self); @@ -785,6 +776,8 @@ processor_start_thread(void *machine_param) * load_context: * * Start the first thread on a processor. + * This may be the first thread ever run on a processor, or + * it could be a processor that was previously offlined. */ static void __attribute__((noreturn)) load_context( @@ -799,7 +792,6 @@ load_context( machine_set_current_thread(thread); load_context_kprintf("processor_up\n"); - processor_up(processor); PMAP_ACTIVATE_KERNEL(processor->cpu_id); @@ -822,7 +814,7 @@ load_context( * running for load calculations. */ if (!(thread->state & TH_IDLE)) { - sched_run_incr(thread); + SCHED(run_count_incr)(thread); } processor->active_thread = thread; @@ -834,6 +826,8 @@ load_context( processor->deadline = UINT64_MAX; thread->last_processor = processor; + processor_up(processor); + processor->last_dispatch = mach_absolute_time(); timer_start(&thread->system_timer, processor->last_dispatch); PROCESSOR_DATA(processor, thread_timer) = PROCESSOR_DATA(processor, kernel_timer) = &thread->system_timer; diff --git a/osfmk/kern/sync_sema.c b/osfmk/kern/sync_sema.c index 898a5e8bb..062c9cb4b 100644 --- a/osfmk/kern/sync_sema.c +++ b/osfmk/kern/sync_sema.c @@ -480,7 +480,7 @@ semaphore_signal_thread_trap( * pre-post the semaphore. */ if (thread_name != MACH_PORT_NULL) { - thread = port_name_to_thread(thread_name); + thread = port_name_to_thread(thread_name, PORT_TO_THREAD_NONE); if (thread == THREAD_NULL) { return KERN_INVALID_ARGUMENT; } diff --git a/osfmk/kern/syscall_subr.c b/osfmk/kern/syscall_subr.c index 0bb52436d..ef2316883 100644 --- a/osfmk/kern/syscall_subr.c +++ b/osfmk/kern/syscall_subr.c @@ -205,15 +205,20 @@ thread_switch( thread_t thread = THREAD_NULL; thread_t self = current_thread(); mach_port_name_t thread_name = args->thread_name; - int option = args->option; + int option = args->option; mach_msg_timeout_t option_time = args->option_time; - uint32_t scale_factor = NSEC_PER_MSEC; - boolean_t depress_option = FALSE; - boolean_t wait_option = FALSE; + uint32_t scale_factor = NSEC_PER_MSEC; + boolean_t depress_option = FALSE; + boolean_t wait_option = FALSE; wait_interrupt_t interruptible = THREAD_ABORTSAFE; + port_to_thread_options_t ptt_options = PORT_TO_THREAD_NOT_CURRENT_THREAD; /* * Validate and process option. + * + * OSLock boosting only applies to other threads + * in your same task (even if you have a port for + * a thread in another task) */ switch (option) { case SWITCH_OPTION_NONE: @@ -232,10 +237,12 @@ thread_switch( case SWITCH_OPTION_OSLOCK_DEPRESS: depress_option = TRUE; interruptible |= THREAD_WAIT_NOREPORT; + ptt_options |= PORT_TO_THREAD_IN_CURRENT_TASK; break; case SWITCH_OPTION_OSLOCK_WAIT: wait_option = TRUE; interruptible |= THREAD_WAIT_NOREPORT; + ptt_options |= PORT_TO_THREAD_IN_CURRENT_TASK; break; default: return KERN_INVALID_ARGUMENT; @@ -245,46 +252,21 @@ thread_switch( * Translate the port name if supplied. */ if (thread_name != MACH_PORT_NULL) { - ipc_port_t port; - - if (ipc_port_translate_send(self->task->itk_space, - thread_name, &port) == KERN_SUCCESS) { - ip_reference(port); - ip_unlock(port); - - thread = convert_port_to_thread(port); - ip_release(port); - - if (thread == self) { - thread_deallocate(thread); - thread = THREAD_NULL; - } - } + thread = port_name_to_thread(thread_name, ptt_options); } if (option == SWITCH_OPTION_OSLOCK_DEPRESS || option == SWITCH_OPTION_OSLOCK_WAIT) { if (thread != THREAD_NULL) { - if (thread->task != self->task) { - /* - * OSLock boosting only applies to other threads - * in your same task (even if you have a port for - * a thread in another task) - */ - - thread_deallocate(thread); - thread = THREAD_NULL; - } else { - /* - * Attempt to kick the lock owner up to our same IO throttling tier. - * If the thread is currently blocked in throttle_lowpri_io(), - * it will immediately break out. - * - * TODO: SFI break out? - */ - int new_policy = proc_get_effective_thread_policy(self, TASK_POLICY_IO); - - set_thread_iotier_override(thread, new_policy); - } + /* + * Attempt to kick the lock owner up to our same IO throttling tier. + * If the thread is currently blocked in throttle_lowpri_io(), + * it will immediately break out. + * + * TODO: SFI break out? + */ + int new_policy = proc_get_effective_thread_policy(self, TASK_POLICY_IO); + + set_thread_iotier_override(thread, new_policy); } } @@ -353,41 +335,6 @@ thread_yield_with_continuation( } -/* Returns a +1 thread reference */ -thread_t -port_name_to_thread_for_ulock(mach_port_name_t thread_name) -{ - thread_t thread = THREAD_NULL; - thread_t self = current_thread(); - - /* - * Translate the port name if supplied. - */ - if (thread_name != MACH_PORT_NULL) { - ipc_port_t port; - - if (ipc_port_translate_send(self->task->itk_space, - thread_name, &port) == KERN_SUCCESS) { - ip_reference(port); - ip_unlock(port); - - thread = convert_port_to_thread(port); - ip_release(port); - - if (thread == THREAD_NULL) { - return thread; - } - - if ((thread == self) || (thread->task != self->task)) { - thread_deallocate(thread); - thread = THREAD_NULL; - } - } - } - - return thread; -} - /* This function is called after an assert_wait(), therefore it must not * cause another wait until after the thread_run() or thread_block() * @@ -531,6 +478,9 @@ thread_depress_expire(void *p0, if (--thread->depress_timer_active == 0) { thread->sched_flags &= ~TH_SFLAG_DEPRESSED_MASK; + if ((thread->state & TH_RUN) == TH_RUN) { + thread->last_basepri_change_time = mach_absolute_time(); + } thread_recompute_sched_pri(thread, SETPRI_DEFAULT); } @@ -579,6 +529,9 @@ thread_depress_abort_locked(thread_t thread) assert((thread->sched_flags & TH_SFLAG_DEPRESSED_MASK) != TH_SFLAG_DEPRESSED_MASK); thread->sched_flags &= ~TH_SFLAG_DEPRESSED_MASK; + if ((thread->state & TH_RUN) == TH_RUN) { + thread->last_basepri_change_time = mach_absolute_time(); + } thread_recompute_sched_pri(thread, SETPRI_LAZY); diff --git a/osfmk/kern/syscall_sw.c b/osfmk/kern/syscall_sw.c index b6dee96ef..571530ed4 100644 --- a/osfmk/kern/syscall_sw.c +++ b/osfmk/kern/syscall_sw.c @@ -179,8 +179,8 @@ const mach_trap_t mach_trap_table[MACH_TRAP_TABLE_COUNT] = { /* 73 */ MACH_TRAP(kern_invalid, 0, 0, NULL), /* 74 */ MACH_TRAP(kern_invalid, 0, 0, NULL), /* 75 */ MACH_TRAP(kern_invalid, 0, 0, NULL), -/* 76 */ MACH_TRAP(kern_invalid, 0, 0, NULL), -/* 77 */ MACH_TRAP(kern_invalid, 0, 0, NULL), +/* 76 */ MACH_TRAP(_kernelrpc_mach_port_type_trap, 3, 3, munge_wwww), +/* 77 */ MACH_TRAP(_kernelrpc_mach_port_request_notification_trap, 7, 7, munge_wwwwwww), /* 78 */ MACH_TRAP(kern_invalid, 0, 0, NULL), /* 79 */ MACH_TRAP(kern_invalid, 0, 0, NULL), /* 80 */ MACH_TRAP(kern_invalid, 0, 0, NULL), @@ -200,7 +200,7 @@ const mach_trap_t mach_trap_table[MACH_TRAP_TABLE_COUNT] = { /* 94 */ MACH_TRAP(mk_timer_cancel_trap, 2, 2, munge_ww), /* 95 */ MACH_TRAP(mk_timer_arm_leeway_trap, 4, 6, munge_wlll), /* traps 64 - 95 reserved (debo) */ -/* 96 */ MACH_TRAP(kern_invalid, 0, 0, NULL), +/* 96 */ MACH_TRAP(debug_control_port_for_pid, 3, 3, munge_www), /* 97 */ MACH_TRAP(kern_invalid, 0, 0, NULL), /* 98 */ MACH_TRAP(kern_invalid, 0, 0, NULL), /* 99 */ MACH_TRAP(kern_invalid, 0, 0, NULL), @@ -315,8 +315,8 @@ const char * mach_syscall_name_table[MACH_TRAP_TABLE_COUNT] = { /* 74 */ "kern_invalid", /* 75 */ "kern_invalid", /* 76 */ "kern_invalid", -/* 77 */ "kern_invalid", -/* 78 */ "kern_invalid", +/* 77 */ "_kernelrpc_mach_port_type_trap", +/* 78 */ "_kernelrpc_mach_port_request_notification_trap", /* 79 */ "kern_invalid", /* 80 */ "kern_invalid", /* 81 */ "kern_invalid", @@ -335,7 +335,7 @@ const char * mach_syscall_name_table[MACH_TRAP_TABLE_COUNT] = { /* 94 */ "mk_timer_cancel_trap", /* 95 */ "kern_invalid", /* traps 64 - 95 reserved (debo) */ -/* 96 */ "kern_invalid", +/* 96 */ "debug_control_port_for_pid", /* 97 */ "kern_invalid", /* 98 */ "kern_invalid", /* 99 */ "kern_invalid", diff --git a/osfmk/kern/task.c b/osfmk/kern/task.c index 708ef1787..0374456e1 100644 --- a/osfmk/kern/task.c +++ b/osfmk/kern/task.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2016 Apple Inc. All rights reserved. + * Copyright (c) 2000-2019 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -125,6 +125,7 @@ #include #include #include +#include #include #if CONFIG_TELEMETRY @@ -144,10 +145,11 @@ #include #include #include +#include #include #include /* for coredump */ - +#include /* * Exported interfaces */ @@ -163,6 +165,8 @@ #include #include +#include + #if CONFIG_ATM #include #endif @@ -216,10 +220,26 @@ SECURITY_READ_ONLY_LATE(struct _task_ledger_indices) task_ledgers __attribute__( .purgeable_nonvolatile = -1, .purgeable_volatile_compressed = -1, .purgeable_nonvolatile_compressed = -1, + .tagged_nofootprint = -1, + .tagged_footprint = -1, + .tagged_nofootprint_compressed = -1, + .tagged_footprint_compressed = -1, .network_volatile = -1, .network_nonvolatile = -1, .network_volatile_compressed = -1, .network_nonvolatile_compressed = -1, + .media_nofootprint = -1, + .media_footprint = -1, + .media_nofootprint_compressed = -1, + .media_footprint_compressed = -1, + .graphics_nofootprint = -1, + .graphics_footprint = -1, + .graphics_nofootprint_compressed = -1, + .graphics_footprint_compressed = -1, + .neural_nofootprint = -1, + .neural_footprint = -1, + .neural_nofootprint_compressed = -1, + .neural_footprint_compressed = -1, .platform_idle_wakeups = -1, .interrupt_wakeups = -1, #if !CONFIG_EMBEDDED @@ -229,12 +249,15 @@ SECURITY_READ_ONLY_LATE(struct _task_ledger_indices) task_ledgers __attribute__( .cpu_time_billed_to_others = -1, .physical_writes = -1, .logical_writes = -1, - .energy_billed_to_me = -1, - .energy_billed_to_others = -1, + .logical_writes_to_external = -1, +#if DEBUG || DEVELOPMENT .pages_grabbed = -1, .pages_grabbed_kern = -1, .pages_grabbed_iopl = -1, - .pages_grabbed_upl = -1}; + .pages_grabbed_upl = -1, +#endif + .energy_billed_to_me = -1, + .energy_billed_to_others = -1}; /* System sleep state */ boolean_t tasks_suspend_state; @@ -253,6 +276,7 @@ kern_return_t task_resume_internal(task_t); static kern_return_t task_start_halt_locked(task_t task, boolean_t should_mark_corpse); extern kern_return_t iokit_task_terminate(task_t task); +extern void iokit_task_app_suspended_changed(task_t task); extern kern_return_t exception_deliver(thread_t, exception_type_t, mach_exception_data_t, mach_msg_type_number_t, struct exception_action *, lck_mtx_t *); extern void bsd_copythreadname(void *dst_uth, void *src_uth); @@ -293,7 +317,8 @@ uint64_t task_iomon_interval_secs; /* Per-task I/O monitor interval in secs #define IO_TELEMETRY_DEFAULT_LIMIT (10ll * 1024ll * 1024ll) int64_t io_telemetry_limit; /* Threshold to take a microstackshot (0 indicated I/O telemetry is turned off) */ int64_t global_logical_writes_count = 0; /* Global count for logical writes */ -static boolean_t global_update_logical_writes(int64_t); +int64_t global_logical_writes_to_external_count = 0; /* Global count for logical writes to external storage*/ +static boolean_t global_update_logical_writes(int64_t, int64_t*); #define TASK_MAX_THREAD_LIMIT 256 @@ -309,6 +334,8 @@ int hwm_user_cores = 0; /* high watermark violations generate user core files */ #endif #ifdef MACH_BSD +extern uint32_t proc_platform(struct proc *); +extern uint32_t proc_sdk(struct proc *); extern void proc_getexecutableuuid(void *, unsigned char *, unsigned long); extern int proc_pid(struct proc *p); extern int proc_selfpid(void); @@ -324,6 +351,7 @@ extern void proc_memstat_terminated(struct proc* p, boolean_t set); extern void memorystatus_on_ledger_footprint_exceeded(int warning, boolean_t memlimit_is_active, boolean_t memlimit_is_fatal); extern void memorystatus_log_exception(const int max_footprint_mb, boolean_t memlimit_is_active, boolean_t memlimit_is_fatal); extern boolean_t memorystatus_allowed_vm_map_fork(task_t task); +extern uint64_t memorystatus_available_memory_internal(proc_t p); #if DEVELOPMENT || DEBUG extern void memorystatus_abort_vm_map_fork(task_t); @@ -337,9 +365,9 @@ extern void memorystatus_abort_vm_map_fork(task_t); int exc_resource_threads_enabled; #endif /* DEVELOPMENT || DEBUG */ -#if (DEVELOPMENT || DEBUG) && TASK_EXC_GUARD_DELIVER_CORPSE -uint32_t task_exc_guard_default = TASK_EXC_GUARD_MP_DELIVER | TASK_EXC_GUARD_MP_CORPSE | - TASK_EXC_GUARD_VM_DELIVER | TASK_EXC_GUARD_VM_CORPSE; +#if (DEVELOPMENT || DEBUG) +uint32_t task_exc_guard_default = TASK_EXC_GUARD_MP_DELIVER | TASK_EXC_GUARD_MP_ONCE | TASK_EXC_GUARD_MP_CORPSE | + TASK_EXC_GUARD_VM_DELIVER | TASK_EXC_GUARD_VM_ONCE | TASK_EXC_GUARD_VM_CORPSE; #else uint32_t task_exc_guard_default = 0; #endif @@ -351,7 +379,29 @@ static void task_wait_locked(task_t task, boolean_t until_not_runnable); static void task_release_locked(task_t task); static void task_synchronizer_destroy_all(task_t task); +static os_ref_count_t +task_add_turnstile_watchports_locked( + task_t task, + struct task_watchports *watchports, + struct task_watchport_elem **previous_elem_array, + ipc_port_t *portwatch_ports, + uint32_t portwatch_count); + +static os_ref_count_t +task_remove_turnstile_watchports_locked( + task_t task, + struct task_watchports *watchports, + ipc_port_t *port_freelist); + +static struct task_watchports * +task_watchports_alloc_init( + task_t task, + thread_t thread, + uint32_t count); +static void +task_watchports_deallocate( + struct task_watchports *watchports); void task_set_64bit( @@ -453,8 +503,12 @@ task_set_platform_binary( task_lock(task); if (is_platform) { task->t_flags |= TF_PLATFORM; + /* set exc guard default behavior for first-party code */ + task->task_exc_guard = (task_exc_guard_default & TASK_EXC_GUARD_ALL); } else { task->t_flags &= ~(TF_PLATFORM); + /* set exc guard default behavior for third-party code */ + task->task_exc_guard = ((task_exc_guard_default >> TASK_EXC_GUARD_THIRD_PARTY_DEFAULT_SHIFT) & TASK_EXC_GUARD_ALL); } task_unlock(task); } @@ -496,6 +550,16 @@ task_set_dyld_info( task_unlock(task); } +void +task_set_mach_header_address( + task_t task, + mach_vm_address_t addr) +{ + task_lock(task); + task->mach_header_vm_address = addr; + task_unlock(task); +} + void task_atm_reset(__unused task_t task) { @@ -541,54 +605,80 @@ task_clear_exec_copy_flag(task_t task) task->t_procflags &= ~TPF_EXEC_COPY; } -/* - * This wait event is t_procflags instead of t_flags because t_flags is volatile - * - * TODO: store the flags in the same place as the event - * rdar://problem/28501994 - */ event_t task_get_return_wait_event(task_t task) { - return (event_t)&task->t_procflags; + return (event_t)&task->returnwait_inheritor; } void -task_clear_return_wait(task_t task) +task_clear_return_wait(task_t task, uint32_t flags) { - task_lock(task); - - task->t_flags &= ~TF_LRETURNWAIT; - - if (task->t_flags & TF_LRETURNWAITER) { + if (flags & TCRW_CLEAR_INITIAL_WAIT) { thread_wakeup(task_get_return_wait_event(task)); - task->t_flags &= ~TF_LRETURNWAITER; } - task_unlock(task); + if (flags & TCRW_CLEAR_FINAL_WAIT) { + is_write_lock(task->itk_space); + + task->t_returnwaitflags &= ~TRW_LRETURNWAIT; + task->returnwait_inheritor = NULL; + + if (task->t_returnwaitflags & TRW_LRETURNWAITER) { + struct turnstile *turnstile = turnstile_prepare((uintptr_t) task_get_return_wait_event(task), + NULL, TURNSTILE_NULL, TURNSTILE_ULOCK); + + waitq_wakeup64_all(&turnstile->ts_waitq, + CAST_EVENT64_T(task_get_return_wait_event(task)), + THREAD_AWAKENED, 0); + + turnstile_update_inheritor(turnstile, NULL, + TURNSTILE_IMMEDIATE_UPDATE | TURNSTILE_INHERITOR_THREAD); + turnstile_update_inheritor_complete(turnstile, TURNSTILE_INTERLOCK_HELD); + + turnstile_complete((uintptr_t) task_get_return_wait_event(task), NULL, NULL, TURNSTILE_ULOCK); + turnstile_cleanup(); + task->t_returnwaitflags &= ~TRW_LRETURNWAITER; + } + is_write_unlock(task->itk_space); + } } void __attribute__((noreturn)) task_wait_to_return(void) { - task_t task; + task_t task = current_task(); - task = current_task(); - task_lock(task); + is_write_lock(task->itk_space); + + if (task->t_returnwaitflags & TRW_LRETURNWAIT) { + struct turnstile *turnstile = turnstile_prepare((uintptr_t) task_get_return_wait_event(task), + NULL, TURNSTILE_NULL, TURNSTILE_ULOCK); - if (task->t_flags & TF_LRETURNWAIT) { do { - task->t_flags |= TF_LRETURNWAITER; - assert_wait(task_get_return_wait_event(task), THREAD_UNINT); - task_unlock(task); + task->t_returnwaitflags |= TRW_LRETURNWAITER; + turnstile_update_inheritor(turnstile, task->returnwait_inheritor, + (TURNSTILE_DELAYED_UPDATE | TURNSTILE_INHERITOR_THREAD)); + + waitq_assert_wait64(&turnstile->ts_waitq, + CAST_EVENT64_T(task_get_return_wait_event(task)), + THREAD_UNINT, TIMEOUT_WAIT_FOREVER); + + is_write_unlock(task->itk_space); + + turnstile_update_inheritor_complete(turnstile, TURNSTILE_INTERLOCK_NOT_HELD); thread_block(THREAD_CONTINUE_NULL); - task_lock(task); - } while (task->t_flags & TF_LRETURNWAIT); + is_write_lock(task->itk_space); + } while (task->t_returnwaitflags & TRW_LRETURNWAIT); + + turnstile_complete((uintptr_t) task_get_return_wait_event(task), NULL, NULL, TURNSTILE_ULOCK); } - task_unlock(task); + is_write_unlock(task->itk_space); + turnstile_cleanup(); + #if CONFIG_MACF /* @@ -843,12 +933,18 @@ task_init(void) * Create the kernel task as the first task. */ #ifdef __LP64__ - if (task_create_internal(TASK_NULL, NULL, FALSE, TRUE, TRUE, TF_NONE, TPF_NONE, &kernel_task) != KERN_SUCCESS) + if (task_create_internal(TASK_NULL, NULL, FALSE, TRUE, TRUE, TF_NONE, TPF_NONE, TWF_NONE, &kernel_task) != KERN_SUCCESS) #else - if (task_create_internal(TASK_NULL, NULL, FALSE, FALSE, FALSE, TF_NONE, TPF_NONE, &kernel_task) != KERN_SUCCESS) + if (task_create_internal(TASK_NULL, NULL, FALSE, FALSE, FALSE, TF_NONE, TPF_NONE, TWF_NONE, &kernel_task) != KERN_SUCCESS) #endif { panic("task_init\n");} +#if defined(HAS_APPLE_PAC) + kernel_task->rop_pid = KERNEL_ROP_ID; + // kernel_task never runs at EL0, but machine_thread_state_convert_from/to_user() relies on + // disable_user_jop to be false for kernel threads (e.g. in exception delivery on thread_exception_daemon) + ml_task_set_disable_user_jop(kernel_task, FALSE); +#endif vm_map_deallocate(kernel_task->map); kernel_task->map = kernel_map; @@ -997,15 +1093,33 @@ init_task_ledgers(void) task_ledgers.purgeable_nonvolatile = ledger_entry_add(t, "purgeable_nonvolatile", "physmem", "bytes"); task_ledgers.purgeable_volatile_compressed = ledger_entry_add(t, "purgeable_volatile_compress", "physmem", "bytes"); task_ledgers.purgeable_nonvolatile_compressed = ledger_entry_add(t, "purgeable_nonvolatile_compress", "physmem", "bytes"); +#if DEBUG || DEVELOPMENT task_ledgers.pages_grabbed = ledger_entry_add(t, "pages_grabbed", "physmem", "count"); task_ledgers.pages_grabbed_kern = ledger_entry_add(t, "pages_grabbed_kern", "physmem", "count"); task_ledgers.pages_grabbed_iopl = ledger_entry_add(t, "pages_grabbed_iopl", "physmem", "count"); task_ledgers.pages_grabbed_upl = ledger_entry_add(t, "pages_grabbed_upl", "physmem", "count"); - +#endif + task_ledgers.tagged_nofootprint = ledger_entry_add(t, "tagged_nofootprint", "physmem", "bytes"); + task_ledgers.tagged_footprint = ledger_entry_add(t, "tagged_footprint", "physmem", "bytes"); + task_ledgers.tagged_nofootprint_compressed = ledger_entry_add(t, "tagged_nofootprint_compressed", "physmem", "bytes"); + task_ledgers.tagged_footprint_compressed = ledger_entry_add(t, "tagged_footprint_compressed", "physmem", "bytes"); task_ledgers.network_volatile = ledger_entry_add(t, "network_volatile", "physmem", "bytes"); task_ledgers.network_nonvolatile = ledger_entry_add(t, "network_nonvolatile", "physmem", "bytes"); task_ledgers.network_volatile_compressed = ledger_entry_add(t, "network_volatile_compressed", "physmem", "bytes"); task_ledgers.network_nonvolatile_compressed = ledger_entry_add(t, "network_nonvolatile_compressed", "physmem", "bytes"); + task_ledgers.media_nofootprint = ledger_entry_add(t, "media_nofootprint", "physmem", "bytes"); + task_ledgers.media_footprint = ledger_entry_add(t, "media_footprint", "physmem", "bytes"); + task_ledgers.media_nofootprint_compressed = ledger_entry_add(t, "media_nofootprint_compressed", "physmem", "bytes"); + task_ledgers.media_footprint_compressed = ledger_entry_add(t, "media_footprint_compressed", "physmem", "bytes"); + task_ledgers.graphics_nofootprint = ledger_entry_add(t, "graphics_nofootprint", "physmem", "bytes"); + task_ledgers.graphics_footprint = ledger_entry_add(t, "graphics_footprint", "physmem", "bytes"); + task_ledgers.graphics_nofootprint_compressed = ledger_entry_add(t, "graphics_nofootprint_compressed", "physmem", "bytes"); + task_ledgers.graphics_footprint_compressed = ledger_entry_add(t, "graphics_footprint_compressed", "physmem", "bytes"); + task_ledgers.neural_nofootprint = ledger_entry_add(t, "neural_nofootprint", "physmem", "bytes"); + task_ledgers.neural_footprint = ledger_entry_add(t, "neural_footprint", "physmem", "bytes"); + task_ledgers.neural_nofootprint_compressed = ledger_entry_add(t, "neural_nofootprint_compressed", "physmem", "bytes"); + task_ledgers.neural_footprint_compressed = ledger_entry_add(t, "neural_footprint_compressed", "physmem", "bytes"); + task_ledgers.platform_idle_wakeups = ledger_entry_add(t, "platform_idle_wakeups", "power", "count"); @@ -1045,6 +1159,7 @@ init_task_ledgers(void) task_ledgers.cpu_time_billed_to_others = ledger_entry_add(t, "cpu_time_billed_to_others", "sched", "ns"); task_ledgers.physical_writes = ledger_entry_add(t, "physical_writes", "res", "bytes"); task_ledgers.logical_writes = ledger_entry_add(t, "logical_writes", "res", "bytes"); + task_ledgers.logical_writes_to_external = ledger_entry_add(t, "logical_writes_to_external", "res", "bytes"); task_ledgers.energy_billed_to_me = ledger_entry_add(t, "energy_billed_to_me", "power", "nj"); task_ledgers.energy_billed_to_others = ledger_entry_add(t, "energy_billed_to_others", "power", "nj"); @@ -1064,15 +1179,32 @@ init_task_ledgers(void) (task_ledgers.purgeable_nonvolatile < 0) || (task_ledgers.purgeable_volatile_compressed < 0) || (task_ledgers.purgeable_nonvolatile_compressed < 0) || + (task_ledgers.tagged_nofootprint < 0) || + (task_ledgers.tagged_footprint < 0) || + (task_ledgers.tagged_nofootprint_compressed < 0) || + (task_ledgers.tagged_footprint_compressed < 0) || (task_ledgers.network_volatile < 0) || (task_ledgers.network_nonvolatile < 0) || (task_ledgers.network_volatile_compressed < 0) || (task_ledgers.network_nonvolatile_compressed < 0) || + (task_ledgers.media_nofootprint < 0) || + (task_ledgers.media_footprint < 0) || + (task_ledgers.media_nofootprint_compressed < 0) || + (task_ledgers.media_footprint_compressed < 0) || + (task_ledgers.graphics_nofootprint < 0) || + (task_ledgers.graphics_footprint < 0) || + (task_ledgers.graphics_nofootprint_compressed < 0) || + (task_ledgers.graphics_footprint_compressed < 0) || + (task_ledgers.neural_nofootprint < 0) || + (task_ledgers.neural_footprint < 0) || + (task_ledgers.neural_nofootprint_compressed < 0) || + (task_ledgers.neural_footprint_compressed < 0) || (task_ledgers.platform_idle_wakeups < 0) || (task_ledgers.interrupt_wakeups < 0) || (task_ledgers.cpu_time_billed_to_me < 0) || (task_ledgers.cpu_time_billed_to_others < 0) || (task_ledgers.physical_writes < 0) || (task_ledgers.logical_writes < 0) || + (task_ledgers.logical_writes_to_external < 0) || (task_ledgers.energy_billed_to_me < 0) || (task_ledgers.energy_billed_to_others < 0) ) { @@ -1090,15 +1222,32 @@ init_task_ledgers(void) ledger_track_credit_only(t, task_ledgers.purgeable_nonvolatile); ledger_track_credit_only(t, task_ledgers.purgeable_volatile_compressed); ledger_track_credit_only(t, task_ledgers.purgeable_nonvolatile_compressed); +#if DEBUG || DEVELOPMENT ledger_track_credit_only(t, task_ledgers.pages_grabbed); ledger_track_credit_only(t, task_ledgers.pages_grabbed_kern); ledger_track_credit_only(t, task_ledgers.pages_grabbed_iopl); ledger_track_credit_only(t, task_ledgers.pages_grabbed_upl); - +#endif + ledger_track_credit_only(t, task_ledgers.tagged_nofootprint); + ledger_track_credit_only(t, task_ledgers.tagged_footprint); + ledger_track_credit_only(t, task_ledgers.tagged_nofootprint_compressed); + ledger_track_credit_only(t, task_ledgers.tagged_footprint_compressed); ledger_track_credit_only(t, task_ledgers.network_volatile); ledger_track_credit_only(t, task_ledgers.network_nonvolatile); ledger_track_credit_only(t, task_ledgers.network_volatile_compressed); ledger_track_credit_only(t, task_ledgers.network_nonvolatile_compressed); + ledger_track_credit_only(t, task_ledgers.media_nofootprint); + ledger_track_credit_only(t, task_ledgers.media_footprint); + ledger_track_credit_only(t, task_ledgers.media_nofootprint_compressed); + ledger_track_credit_only(t, task_ledgers.media_footprint_compressed); + ledger_track_credit_only(t, task_ledgers.graphics_nofootprint); + ledger_track_credit_only(t, task_ledgers.graphics_footprint); + ledger_track_credit_only(t, task_ledgers.graphics_nofootprint_compressed); + ledger_track_credit_only(t, task_ledgers.graphics_footprint_compressed); + ledger_track_credit_only(t, task_ledgers.neural_nofootprint); + ledger_track_credit_only(t, task_ledgers.neural_footprint); + ledger_track_credit_only(t, task_ledgers.neural_nofootprint_compressed); + ledger_track_credit_only(t, task_ledgers.neural_footprint_compressed); ledger_track_maximum(t, task_ledgers.phys_footprint, 60); #if MACH_ASSERT @@ -1115,10 +1264,26 @@ init_task_ledgers(void) ledger_panic_on_negative(t, task_ledgers.purgeable_volatile_compressed); ledger_panic_on_negative(t, task_ledgers.purgeable_nonvolatile_compressed); + ledger_panic_on_negative(t, task_ledgers.tagged_nofootprint); + ledger_panic_on_negative(t, task_ledgers.tagged_footprint); + ledger_panic_on_negative(t, task_ledgers.tagged_nofootprint_compressed); + ledger_panic_on_negative(t, task_ledgers.tagged_footprint_compressed); ledger_panic_on_negative(t, task_ledgers.network_volatile); ledger_panic_on_negative(t, task_ledgers.network_nonvolatile); ledger_panic_on_negative(t, task_ledgers.network_volatile_compressed); ledger_panic_on_negative(t, task_ledgers.network_nonvolatile_compressed); + ledger_panic_on_negative(t, task_ledgers.media_nofootprint); + ledger_panic_on_negative(t, task_ledgers.media_footprint); + ledger_panic_on_negative(t, task_ledgers.media_nofootprint_compressed); + ledger_panic_on_negative(t, task_ledgers.media_footprint_compressed); + ledger_panic_on_negative(t, task_ledgers.graphics_nofootprint); + ledger_panic_on_negative(t, task_ledgers.graphics_footprint); + ledger_panic_on_negative(t, task_ledgers.graphics_nofootprint_compressed); + ledger_panic_on_negative(t, task_ledgers.graphics_footprint_compressed); + ledger_panic_on_negative(t, task_ledgers.neural_nofootprint); + ledger_panic_on_negative(t, task_ledgers.neural_footprint); + ledger_panic_on_negative(t, task_ledgers.neural_nofootprint_compressed); + ledger_panic_on_negative(t, task_ledgers.neural_footprint_compressed); } #endif /* MACH_ASSERT */ @@ -1129,7 +1294,6 @@ init_task_ledgers(void) ledger_set_callback(t, task_ledgers.interrupt_wakeups, task_wakeups_rate_exceeded, NULL, NULL); ledger_set_callback(t, task_ledgers.physical_writes, task_io_rate_exceeded, (void *)FLAVOR_IO_PHYSICAL_WRITES, NULL); - ledger_set_callback(t, task_ledgers.logical_writes, task_io_rate_exceeded, (void *)FLAVOR_IO_LOGICAL_WRITES, NULL); ledger_template_complete(t); task_ledger_template = t; @@ -1146,6 +1310,7 @@ task_create_internal( boolean_t is_64bit_data, uint32_t t_flags, uint32_t t_procflags, + uint8_t t_returnwaitflags, task_t *child_task) /* OUT */ { task_t new_task; @@ -1169,6 +1334,10 @@ task_create_internal( return KERN_RESOURCE_SHORTAGE; } +#if defined(HAS_APPLE_PAC) + ml_task_set_rop_pid(new_task, parent_task, inherit_memory); + ml_task_set_disable_user_jop(new_task, inherit_memory ? parent_task->disable_user_jop : FALSE); +#endif new_task->ledger = ledger; @@ -1180,7 +1349,8 @@ task_create_internal( if (!(t_flags & TF_CORPSE_FORK) && inherit_memory) { new_task->map = vm_map_fork(ledger, parent_task->map, 0); } else { - new_task->map = vm_map_create(pmap_create(ledger, 0, is_64bit), + unsigned int pmap_flags = is_64bit ? PMAP_CREATE_64BIT : 0; + new_task->map = vm_map_create(pmap_create_options(ledger, 0, pmap_flags), (vm_map_offset_t)(VM_MIN_ADDRESS), (vm_map_offset_t)(VM_MAX_ADDRESS), TRUE); } @@ -1202,11 +1372,14 @@ task_create_internal( new_task->priv_flags = 0; new_task->t_flags = t_flags; new_task->t_procflags = t_procflags; + new_task->t_returnwaitflags = t_returnwaitflags; + new_task->returnwait_inheritor = current_thread(); new_task->importance = 0; new_task->crashed_thread_id = 0; new_task->exec_token = 0; - - new_task->task_exc_guard = task_exc_guard_default; + new_task->watchports = NULL; + new_task->restartable_ranges = NULL; + new_task->task_exc_guard = 0; #if CONFIG_ATM new_task->atm_context = NULL; @@ -1308,6 +1481,7 @@ task_create_internal( new_task->all_image_info_addr = parent_task->all_image_info_addr; new_task->all_image_info_size = parent_task->all_image_info_size; + new_task->mach_header_vm_address = 0; if (inherit_memory && parent_task->affinity_space) { task_affinity_create(parent_task, new_task); @@ -1386,6 +1560,7 @@ task_create_internal( new_task->c_switch = 0; new_task->p_switch = 0; new_task->ps_switch = 0; + new_task->decompressions = 0; new_task->low_mem_notified_warn = 0; new_task->low_mem_notified_critical = 0; new_task->purged_memory_warn = 0; @@ -1398,10 +1573,15 @@ task_create_internal( new_task->task_timer_wakeups_bin_1 = 0; new_task->task_timer_wakeups_bin_2 = 0; new_task->task_gpu_ns = 0; - new_task->task_immediate_writes = 0; - new_task->task_deferred_writes = 0; - new_task->task_invalidated_writes = 0; - new_task->task_metadata_writes = 0; + new_task->task_writes_counters_internal.task_immediate_writes = 0; + new_task->task_writes_counters_internal.task_deferred_writes = 0; + new_task->task_writes_counters_internal.task_invalidated_writes = 0; + new_task->task_writes_counters_internal.task_metadata_writes = 0; + new_task->task_writes_counters_external.task_immediate_writes = 0; + new_task->task_writes_counters_external.task_deferred_writes = 0; + new_task->task_writes_counters_external.task_invalidated_writes = 0; + new_task->task_writes_counters_external.task_metadata_writes = 0; + new_task->task_energy = 0; #if MONOTONIC memset(&new_task->task_monotonic, 0, sizeof(new_task->task_monotonic)); @@ -1448,15 +1628,18 @@ task_create_internal( new_task->dispatchqueue_offset = parent_task->dispatchqueue_offset; } + new_task->task_can_transfer_memory_ownership = FALSE; new_task->task_volatile_objects = 0; new_task->task_nonvolatile_objects = 0; - new_task->task_purgeable_disowning = FALSE; - new_task->task_purgeable_disowned = FALSE; + new_task->task_objects_disowning = FALSE; + new_task->task_objects_disowned = FALSE; + new_task->task_owned_objects = 0; queue_init(&new_task->task_objq); task_objq_lock_init(new_task); #if __arm64__ new_task->task_legacy_footprint = FALSE; + new_task->task_extra_footprint_limit = FALSE; #endif /* __arm64__ */ new_task->task_region_footprint = FALSE; new_task->task_has_crossed_thread_limit = FALSE; @@ -1476,6 +1659,7 @@ task_create_internal( new_task->t_flags &= ~(TF_DARKWAKE_MODE); queue_init(&new_task->io_user_clients); + new_task->loadTag = 0; ipc_task_enable(new_task); @@ -1509,6 +1693,7 @@ task_rollup_accounting_info(task_t to_task, task_t from_task) to_task->faults = from_task->faults; to_task->pageins = from_task->pageins; to_task->cow_faults = from_task->cow_faults; + to_task->decompressions = from_task->decompressions; to_task->messages_sent = from_task->messages_sent; to_task->messages_received = from_task->messages_received; to_task->syscalls_mach = from_task->syscalls_mach; @@ -1528,10 +1713,14 @@ task_rollup_accounting_info(task_t to_task, task_t from_task) to_task->task_timer_wakeups_bin_1 = from_task->task_timer_wakeups_bin_1; to_task->task_timer_wakeups_bin_2 = from_task->task_timer_wakeups_bin_2; to_task->task_gpu_ns = from_task->task_gpu_ns; - to_task->task_immediate_writes = from_task->task_immediate_writes; - to_task->task_deferred_writes = from_task->task_deferred_writes; - to_task->task_invalidated_writes = from_task->task_invalidated_writes; - to_task->task_metadata_writes = from_task->task_metadata_writes; + to_task->task_writes_counters_internal.task_immediate_writes = from_task->task_writes_counters_internal.task_immediate_writes; + to_task->task_writes_counters_internal.task_deferred_writes = from_task->task_writes_counters_internal.task_deferred_writes; + to_task->task_writes_counters_internal.task_invalidated_writes = from_task->task_writes_counters_internal.task_invalidated_writes; + to_task->task_writes_counters_internal.task_metadata_writes = from_task->task_writes_counters_internal.task_metadata_writes; + to_task->task_writes_counters_external.task_immediate_writes = from_task->task_writes_counters_external.task_immediate_writes; + to_task->task_writes_counters_external.task_deferred_writes = from_task->task_writes_counters_external.task_deferred_writes; + to_task->task_writes_counters_external.task_invalidated_writes = from_task->task_writes_counters_external.task_invalidated_writes; + to_task->task_writes_counters_external.task_metadata_writes = from_task->task_writes_counters_external.task_metadata_writes; to_task->task_energy = from_task->task_energy; /* Skip ledger roll up for memory accounting entries */ @@ -1591,7 +1780,18 @@ task_deallocate( return; } + /* + * The task should be dead at this point. Ensure other resources + * like threads, are gone before we trash the world. + */ + assert(queue_empty(&task->threads)); + assert(task->bsd_info == NULL); + assert(!is_active(task->itk_space)); + assert(!task->active); + assert(task->active_thread_count == 0); + lck_mtx_lock(&tasks_threads_lock); + assert(terminated_tasks_count > 0); queue_remove(&terminated_tasks, task, task_t, tasks); terminated_tasks_count--; lck_mtx_unlock(&tasks_threads_lock); @@ -1635,19 +1835,24 @@ task_deallocate( } #endif /* MACH_ASSERT */ - vm_purgeable_disown(task); - assert(task->task_purgeable_disowned); + vm_owned_objects_disown(task); + assert(task->task_objects_disowned); if (task->task_volatile_objects != 0 || - task->task_nonvolatile_objects != 0) { + task->task_nonvolatile_objects != 0 || + task->task_owned_objects != 0) { panic("task_deallocate(%p): " - "volatile_objects=%d nonvolatile_objects=%d\n", + "volatile_objects=%d nonvolatile_objects=%d owned=%d\n", task, task->task_volatile_objects, - task->task_nonvolatile_objects); + task->task_nonvolatile_objects, + task->task_owned_objects); } vm_map_deallocate(task->map); is_release(task->itk_space); + if (task->restartable_ranges) { + restartable_ranges_release(task->restartable_ranges); + } ledger_get_entries(task->ledger, task_ledgers.interrupt_wakeups, &interrupt_wakeups, &debit); @@ -1898,7 +2103,7 @@ task_deliver_crash_notification( task_reference(task); task_port = convert_task_to_port(task); ip_lock(task_port); - assert(ip_active(task_port)); + require_ip_active(task_port); ipc_port_nsrequest(task_port, task_port->ip_mscount, ipc_port_make_sonce_locked(task_port), &old_notify); /* port unlocked */ assert(IP_NULL == old_notify); @@ -2070,7 +2275,7 @@ task_port_notify(mach_msg_header_t *msg) ipc_port_t port = notification->not_header.msgh_remote_port; task_t task; - assert(ip_active(port)); + require_ip_active(port); assert(IKOT_TASK == ip_kotype(port)); task = (task_t) port->ip_kobject; @@ -2417,6 +2622,11 @@ task_terminate_internal( */ task_synchronizer_destroy_all(task); + /* + * Clear the watchport boost on the task. + */ + task_remove_turnstile_watchports(task); + /* * Destroy the IPC space, leaving just a reference for it. */ @@ -2806,6 +3016,12 @@ task_wait_locked( } } +boolean_t +task_is_app_suspended(task_t task) +{ + return task->pidsuspended; +} + /* * task_release_locked: * @@ -3103,6 +3319,11 @@ release_task_hold( return KERN_SUCCESS; } +boolean_t +get_task_suspended(task_t task) +{ + return 0 != task->user_stop_count; +} /* * task_suspend: @@ -3124,7 +3345,7 @@ task_suspend( task_t task) { kern_return_t kr; - mach_port_t port, send, old_notify; + mach_port_t port; mach_port_name_t name; if (task == TASK_NULL || task == kernel_task) { @@ -3133,43 +3354,23 @@ task_suspend( task_lock(task); - /* - * Claim a send right on the task resume port, and request a no-senders - * notification on that port (if none outstanding). - */ - if (task->itk_resume == IP_NULL) { - task->itk_resume = ipc_port_alloc_kernel(); - if (!IP_VALID(task->itk_resume)) { - panic("failed to create resume port"); - } - ipc_kobject_set(task->itk_resume, (ipc_kobject_t)task, IKOT_TASK_RESUME); - } - - port = task->itk_resume; - ip_lock(port); - assert(ip_active(port)); - - send = ipc_port_make_send_locked(port); - assert(IP_VALID(send)); - - if (port->ip_nsrequest == IP_NULL) { - ipc_port_nsrequest(port, port->ip_mscount, ipc_port_make_sonce_locked(port), &old_notify); - assert(old_notify == IP_NULL); - /* port unlocked */ - } else { - ip_unlock(port); - } - /* * place a legacy hold on the task. */ kr = place_task_hold(task, TASK_HOLD_LEGACY); if (kr != KERN_SUCCESS) { task_unlock(task); - ipc_port_release_send(send); return kr; } + /* + * Claim a send right on the task resume port, and request a no-senders + * notification on that port (if none outstanding). + */ + (void)ipc_kobject_make_send_lazy_alloc_port(&task->itk_resume, + (ipc_kobject_t)task, IKOT_TASK_RESUME); + port = task->itk_resume; + task_unlock(task); /* @@ -3177,8 +3378,8 @@ task_suspend( * but we'll look it up when calling a traditional resume. Any IPC operations that * deallocate the send right will auto-release the suspension. */ - if ((kr = ipc_kmsg_copyout_object(current_task()->itk_space, (ipc_object_t)send, - MACH_MSG_TYPE_MOVE_SEND, &name)) != KERN_SUCCESS) { + if ((kr = ipc_kmsg_copyout_object(current_task()->itk_space, ip_to_object(port), + MACH_MSG_TYPE_MOVE_SEND, NULL, NULL, &name)) != KERN_SUCCESS) { printf("warning: %s(%d) failed to copyout suspension token for pid %d with error: %d\n", proc_name_address(current_task()->bsd_info), proc_pid(current_task()->bsd_info), task_pid(task), kr); @@ -3215,7 +3416,7 @@ task_resume( is_write_lock(space); if (is_active(space) && IP_VALID(task->itk_resume) && - ipc_hash_lookup(space, (ipc_object_t)task->itk_resume, &resume_port_name, &resume_port_entry) == TRUE) { + ipc_hash_lookup(space, ip_to_object(task->itk_resume), &resume_port_name, &resume_port_entry) == TRUE) { /* * We found a suspension token in the caller's IPC space. Release a send right to indicate that * we are holding one less legacy hold on the task from this caller. If the release failed, @@ -3325,7 +3526,7 @@ task_resume2( boolean_t task_suspension_notify(mach_msg_header_t *request_header) { - ipc_port_t port = (ipc_port_t) request_header->msgh_remote_port; + ipc_port_t port = request_header->msgh_remote_port; task_t task = convert_port_to_task_suspension_token(port); mach_msg_type_number_t not_count; @@ -3373,7 +3574,7 @@ task_suspension_notify(mach_msg_header_t *request_header) return TRUE; } -kern_return_t +static kern_return_t task_pidsuspend_locked(task_t task) { kern_return_t kr; @@ -3418,6 +3619,10 @@ task_pidsuspend( task_unlock(task); + if ((KERN_SUCCESS == kr) && task->message_app_suspended) { + iokit_task_app_suspended_changed(task); + } + return kr; } @@ -3456,6 +3661,10 @@ task_pidresume( task_unlock(task); + if ((KERN_SUCCESS == kr) && task->message_app_suspended) { + iokit_task_app_suspended_changed(task); + } + #if CONFIG_FREEZE task_lock(task); @@ -3472,6 +3681,436 @@ task_pidresume( return kr; } +os_refgrp_decl(static, task_watchports_refgrp, "task_watchports", NULL); + +/* + * task_add_turnstile_watchports: + * Setup watchports to boost the main thread of the task. + * + * Arguments: + * task: task being spawned + * thread: main thread of task + * portwatch_ports: array of watchports + * portwatch_count: number of watchports + * + * Conditions: + * Nothing locked. + */ +void +task_add_turnstile_watchports( + task_t task, + thread_t thread, + ipc_port_t *portwatch_ports, + uint32_t portwatch_count) +{ + struct task_watchports *watchports = NULL; + struct task_watchport_elem *previous_elem_array[TASK_MAX_WATCHPORT_COUNT] = {}; + os_ref_count_t refs; + + /* Check if the task has terminated */ + if (!task->active) { + return; + } + + assert(portwatch_count <= TASK_MAX_WATCHPORT_COUNT); + + watchports = task_watchports_alloc_init(task, thread, portwatch_count); + + /* Lock the ipc space */ + is_write_lock(task->itk_space); + + /* Setup watchports to boost the main thread */ + refs = task_add_turnstile_watchports_locked(task, + watchports, previous_elem_array, portwatch_ports, + portwatch_count); + + /* Drop the space lock */ + is_write_unlock(task->itk_space); + + if (refs == 0) { + task_watchports_deallocate(watchports); + } + + /* Drop the ref on previous_elem_array */ + for (uint32_t i = 0; i < portwatch_count && previous_elem_array[i] != NULL; i++) { + task_watchport_elem_deallocate(previous_elem_array[i]); + } +} + +/* + * task_remove_turnstile_watchports: + * Clear all turnstile boost on the task from watchports. + * + * Arguments: + * task: task being terminated + * + * Conditions: + * Nothing locked. + */ +void +task_remove_turnstile_watchports( + task_t task) +{ + os_ref_count_t refs = TASK_MAX_WATCHPORT_COUNT; + struct task_watchports *watchports = NULL; + ipc_port_t port_freelist[TASK_MAX_WATCHPORT_COUNT] = {}; + uint32_t portwatch_count; + + /* Lock the ipc space */ + is_write_lock(task->itk_space); + + /* Check if watchport boost exist */ + if (task->watchports == NULL) { + is_write_unlock(task->itk_space); + return; + } + watchports = task->watchports; + portwatch_count = watchports->tw_elem_array_count; + + refs = task_remove_turnstile_watchports_locked(task, watchports, + port_freelist); + + is_write_unlock(task->itk_space); + + /* Drop all the port references */ + for (uint32_t i = 0; i < portwatch_count && port_freelist[i] != NULL; i++) { + ip_release(port_freelist[i]); + } + + /* Clear the task and thread references for task_watchport */ + if (refs == 0) { + task_watchports_deallocate(watchports); + } +} + +/* + * task_transfer_turnstile_watchports: + * Transfer all watchport turnstile boost from old task to new task. + * + * Arguments: + * old_task: task calling exec + * new_task: new exec'ed task + * thread: main thread of new task + * + * Conditions: + * Nothing locked. + */ +void +task_transfer_turnstile_watchports( + task_t old_task, + task_t new_task, + thread_t new_thread) +{ + struct task_watchports *old_watchports = NULL; + struct task_watchports *new_watchports = NULL; + os_ref_count_t old_refs = TASK_MAX_WATCHPORT_COUNT; + os_ref_count_t new_refs = TASK_MAX_WATCHPORT_COUNT; + uint32_t portwatch_count; + + if (old_task->watchports == NULL || !new_task->active) { + return; + } + + /* Get the watch port count from the old task */ + is_write_lock(old_task->itk_space); + if (old_task->watchports == NULL) { + is_write_unlock(old_task->itk_space); + return; + } + + portwatch_count = old_task->watchports->tw_elem_array_count; + is_write_unlock(old_task->itk_space); + + new_watchports = task_watchports_alloc_init(new_task, new_thread, portwatch_count); + + /* Lock the ipc space for old task */ + is_write_lock(old_task->itk_space); + + /* Lock the ipc space for new task */ + is_write_lock(new_task->itk_space); + + /* Check if watchport boost exist */ + if (old_task->watchports == NULL || !new_task->active) { + is_write_unlock(new_task->itk_space); + is_write_unlock(old_task->itk_space); + (void)task_watchports_release(new_watchports); + task_watchports_deallocate(new_watchports); + return; + } + + old_watchports = old_task->watchports; + assert(portwatch_count == old_task->watchports->tw_elem_array_count); + + /* Setup new task watchports */ + new_task->watchports = new_watchports; + + for (uint32_t i = 0; i < portwatch_count; i++) { + ipc_port_t port = old_watchports->tw_elem[i].twe_port; + + if (port == NULL) { + task_watchport_elem_clear(&new_watchports->tw_elem[i]); + continue; + } + + /* Lock the port and check if it has the entry */ + ip_lock(port); + imq_lock(&port->ip_messages); + + task_watchport_elem_init(&new_watchports->tw_elem[i], new_task, port); + + if (ipc_port_replace_watchport_elem_conditional_locked(port, + &old_watchports->tw_elem[i], &new_watchports->tw_elem[i]) == KERN_SUCCESS) { + task_watchport_elem_clear(&old_watchports->tw_elem[i]); + + task_watchports_retain(new_watchports); + old_refs = task_watchports_release(old_watchports); + + /* Check if all ports are cleaned */ + if (old_refs == 0) { + old_task->watchports = NULL; + } + } else { + task_watchport_elem_clear(&new_watchports->tw_elem[i]); + } + /* mqueue and port unlocked by ipc_port_replace_watchport_elem_conditional_locked */ + } + + /* Drop the reference on new task_watchports struct returned by task_watchports_alloc_init */ + new_refs = task_watchports_release(new_watchports); + if (new_refs == 0) { + new_task->watchports = NULL; + } + + is_write_unlock(new_task->itk_space); + is_write_unlock(old_task->itk_space); + + /* Clear the task and thread references for old_watchport */ + if (old_refs == 0) { + task_watchports_deallocate(old_watchports); + } + + /* Clear the task and thread references for new_watchport */ + if (new_refs == 0) { + task_watchports_deallocate(new_watchports); + } +} + +/* + * task_add_turnstile_watchports_locked: + * Setup watchports to boost the main thread of the task. + * + * Arguments: + * task: task to boost + * watchports: watchport structure to be attached to the task + * previous_elem_array: an array of old watchport_elem to be returned to caller + * portwatch_ports: array of watchports + * portwatch_count: number of watchports + * + * Conditions: + * ipc space of the task locked. + * returns array of old watchport_elem in previous_elem_array + */ +static os_ref_count_t +task_add_turnstile_watchports_locked( + task_t task, + struct task_watchports *watchports, + struct task_watchport_elem **previous_elem_array, + ipc_port_t *portwatch_ports, + uint32_t portwatch_count) +{ + os_ref_count_t refs = TASK_MAX_WATCHPORT_COUNT; + + /* Check if the task is still active */ + if (!task->active) { + refs = task_watchports_release(watchports); + return refs; + } + + assert(task->watchports == NULL); + task->watchports = watchports; + + for (uint32_t i = 0, j = 0; i < portwatch_count; i++) { + ipc_port_t port = portwatch_ports[i]; + + task_watchport_elem_init(&watchports->tw_elem[i], task, port); + if (port == NULL) { + task_watchport_elem_clear(&watchports->tw_elem[i]); + continue; + } + + ip_lock(port); + imq_lock(&port->ip_messages); + + /* Check if port is in valid state to be setup as watchport */ + if (ipc_port_add_watchport_elem_locked(port, &watchports->tw_elem[i], + &previous_elem_array[j]) != KERN_SUCCESS) { + task_watchport_elem_clear(&watchports->tw_elem[i]); + continue; + } + /* port and mqueue unlocked on return */ + + ip_reference(port); + task_watchports_retain(watchports); + if (previous_elem_array[j] != NULL) { + j++; + } + } + + /* Drop the reference on task_watchport struct returned by os_ref_init */ + refs = task_watchports_release(watchports); + if (refs == 0) { + task->watchports = NULL; + } + + return refs; +} + +/* + * task_remove_turnstile_watchports_locked: + * Clear all turnstile boost on the task from watchports. + * + * Arguments: + * task: task to remove watchports from + * watchports: watchports structure for the task + * port_freelist: array of ports returned with ref to caller + * + * + * Conditions: + * ipc space of the task locked. + * array of ports with refs are returned in port_freelist + */ +static os_ref_count_t +task_remove_turnstile_watchports_locked( + task_t task, + struct task_watchports *watchports, + ipc_port_t *port_freelist) +{ + os_ref_count_t refs = TASK_MAX_WATCHPORT_COUNT; + + for (uint32_t i = 0, j = 0; i < watchports->tw_elem_array_count; i++) { + ipc_port_t port = watchports->tw_elem[i].twe_port; + if (port == NULL) { + continue; + } + + /* Lock the port and check if it has the entry */ + ip_lock(port); + imq_lock(&port->ip_messages); + if (ipc_port_clear_watchport_elem_internal_conditional_locked(port, + &watchports->tw_elem[i]) == KERN_SUCCESS) { + task_watchport_elem_clear(&watchports->tw_elem[i]); + port_freelist[j++] = port; + refs = task_watchports_release(watchports); + + /* Check if all ports are cleaned */ + if (refs == 0) { + task->watchports = NULL; + break; + } + } + /* mqueue and port unlocked by ipc_port_clear_watchport_elem_internal_conditional_locked */ + } + return refs; +} + +/* + * task_watchports_alloc_init: + * Allocate and initialize task watchport struct. + * + * Conditions: + * Nothing locked. + */ +static struct task_watchports * +task_watchports_alloc_init( + task_t task, + thread_t thread, + uint32_t count) +{ + struct task_watchports *watchports = kalloc(sizeof(struct task_watchports) + + count * sizeof(struct task_watchport_elem)); + + task_reference(task); + thread_reference(thread); + watchports->tw_task = task; + watchports->tw_thread = thread; + watchports->tw_elem_array_count = count; + os_ref_init(&watchports->tw_refcount, &task_watchports_refgrp); + + return watchports; +} + +/* + * task_watchports_deallocate: + * Deallocate task watchport struct. + * + * Conditions: + * Nothing locked. + */ +static void +task_watchports_deallocate( + struct task_watchports *watchports) +{ + uint32_t portwatch_count = watchports->tw_elem_array_count; + + task_deallocate(watchports->tw_task); + thread_deallocate(watchports->tw_thread); + kfree(watchports, sizeof(struct task_watchports) + portwatch_count * sizeof(struct task_watchport_elem)); +} + +/* + * task_watchport_elem_deallocate: + * Deallocate task watchport element and release its ref on task_watchport. + * + * Conditions: + * Nothing locked. + */ +void +task_watchport_elem_deallocate( + struct task_watchport_elem *watchport_elem) +{ + os_ref_count_t refs = TASK_MAX_WATCHPORT_COUNT; + task_t task = watchport_elem->twe_task; + struct task_watchports *watchports = NULL; + ipc_port_t port = NULL; + + assert(task != NULL); + + /* Take the space lock to modify the elememt */ + is_write_lock(task->itk_space); + + watchports = task->watchports; + assert(watchports != NULL); + + port = watchport_elem->twe_port; + assert(port != NULL); + + task_watchport_elem_clear(watchport_elem); + refs = task_watchports_release(watchports); + + if (refs == 0) { + task->watchports = NULL; + } + + is_write_unlock(task->itk_space); + + ip_release(port); + if (refs == 0) { + task_watchports_deallocate(watchports); + } +} + +/* + * task_has_watchports: + * Return TRUE if task has watchport boosts. + * + * Conditions: + * Nothing locked. + */ +boolean_t +task_has_watchports(task_t task) +{ + return task->watchports != NULL; +} #if DEVELOPMENT || DEBUG @@ -3601,7 +4240,7 @@ task_freeze( task_unlock(task); - kr = vm_map_freeze(task->map, + kr = vm_map_freeze(task, purgeable_count, wired_count, clean_count, @@ -4322,7 +4961,7 @@ task_info( break; } - task_power_info_locked(task, (task_power_info_t)task_info_out, NULL, NULL); + task_power_info_locked(task, (task_power_info_t)task_info_out, NULL, NULL, NULL); break; } @@ -4333,7 +4972,7 @@ task_info( break; } task_power_info_v2_t tpiv2 = (task_power_info_v2_t) task_info_out; - task_power_info_locked(task, &tpiv2->cpu_energy, &tpiv2->gpu_energy, tpiv2); + task_power_info_locked(task, &tpiv2->cpu_energy, &tpiv2->gpu_energy, tpiv2, NULL); break; } @@ -4343,6 +4982,39 @@ task_info( task_vm_info_t vm_info; vm_map_t map; +#if __arm64__ + struct proc *p; + uint32_t platform, sdk; + p = current_proc(); + platform = proc_platform(p); + sdk = proc_sdk(p); + if (original_task_info_count > TASK_VM_INFO_REV2_COUNT && + platform == PLATFORM_IOS && + sdk != 0 && + (sdk >> 16) <= 12) { + /* + * Some iOS apps pass an incorrect value for + * task_info_count, expressed in number of bytes + * instead of number of "natural_t" elements. + * For the sake of backwards binary compatibility + * for apps built with an iOS12 or older SDK and using + * the "rev2" data structure, let's fix task_info_count + * for them, to avoid stomping past the actual end + * of their buffer. + */ +#if DEVELOPMENT || DEBUG + printf("%s:%d %d[%s] rdar://49484582 task_info_count %d -> %d platform %d sdk %d.%d.%d\n", __FUNCTION__, __LINE__, proc_pid(p), proc_name_address(p), original_task_info_count, TASK_VM_INFO_REV2_COUNT, platform, (sdk >> 16), ((sdk >> 8) & 0xff), (sdk & 0xff)); +#endif /* DEVELOPMENT || DEBUG */ + DTRACE_VM4(workaround_task_vm_info_count, + mach_msg_type_number_t, original_task_info_count, + mach_msg_type_number_t, TASK_VM_INFO_REV2_COUNT, + uint32_t, platform, + uint32_t, sdk); + original_task_info_count = TASK_VM_INFO_REV2_COUNT; + *task_info_count = original_task_info_count; + } +#endif /* __arm64__ */ + if (*task_info_count < TASK_VM_INFO_REV0_COUNT) { error = KERN_INVALID_ARGUMENT; break; @@ -4445,6 +5117,90 @@ task_info( vm_info->max_address = map->max_offset; *task_info_count = TASK_VM_INFO_REV2_COUNT; } + if (original_task_info_count >= TASK_VM_INFO_REV3_COUNT) { + ledger_get_lifetime_max(task->ledger, + task_ledgers.phys_footprint, + &vm_info->ledger_phys_footprint_peak); + ledger_get_balance(task->ledger, + task_ledgers.purgeable_nonvolatile, + &vm_info->ledger_purgeable_nonvolatile); + ledger_get_balance(task->ledger, + task_ledgers.purgeable_nonvolatile_compressed, + &vm_info->ledger_purgeable_novolatile_compressed); + ledger_get_balance(task->ledger, + task_ledgers.purgeable_volatile, + &vm_info->ledger_purgeable_volatile); + ledger_get_balance(task->ledger, + task_ledgers.purgeable_volatile_compressed, + &vm_info->ledger_purgeable_volatile_compressed); + ledger_get_balance(task->ledger, + task_ledgers.network_nonvolatile, + &vm_info->ledger_tag_network_nonvolatile); + ledger_get_balance(task->ledger, + task_ledgers.network_nonvolatile_compressed, + &vm_info->ledger_tag_network_nonvolatile_compressed); + ledger_get_balance(task->ledger, + task_ledgers.network_volatile, + &vm_info->ledger_tag_network_volatile); + ledger_get_balance(task->ledger, + task_ledgers.network_volatile_compressed, + &vm_info->ledger_tag_network_volatile_compressed); + ledger_get_balance(task->ledger, + task_ledgers.media_footprint, + &vm_info->ledger_tag_media_footprint); + ledger_get_balance(task->ledger, + task_ledgers.media_footprint_compressed, + &vm_info->ledger_tag_media_footprint_compressed); + ledger_get_balance(task->ledger, + task_ledgers.media_nofootprint, + &vm_info->ledger_tag_media_nofootprint); + ledger_get_balance(task->ledger, + task_ledgers.media_nofootprint_compressed, + &vm_info->ledger_tag_media_nofootprint_compressed); + ledger_get_balance(task->ledger, + task_ledgers.graphics_footprint, + &vm_info->ledger_tag_graphics_footprint); + ledger_get_balance(task->ledger, + task_ledgers.graphics_footprint_compressed, + &vm_info->ledger_tag_graphics_footprint_compressed); + ledger_get_balance(task->ledger, + task_ledgers.graphics_nofootprint, + &vm_info->ledger_tag_graphics_nofootprint); + ledger_get_balance(task->ledger, + task_ledgers.graphics_nofootprint_compressed, + &vm_info->ledger_tag_graphics_nofootprint_compressed); + ledger_get_balance(task->ledger, + task_ledgers.neural_footprint, + &vm_info->ledger_tag_neural_footprint); + ledger_get_balance(task->ledger, + task_ledgers.neural_footprint_compressed, + &vm_info->ledger_tag_neural_footprint_compressed); + ledger_get_balance(task->ledger, + task_ledgers.neural_nofootprint, + &vm_info->ledger_tag_neural_nofootprint); + ledger_get_balance(task->ledger, + task_ledgers.neural_nofootprint_compressed, + &vm_info->ledger_tag_neural_nofootprint_compressed); + *task_info_count = TASK_VM_INFO_REV3_COUNT; + } + if (original_task_info_count >= TASK_VM_INFO_REV4_COUNT) { + if (task->bsd_info) { + vm_info->limit_bytes_remaining = + memorystatus_available_memory_internal(task->bsd_info); + } else { + vm_info->limit_bytes_remaining = 0; + } + *task_info_count = TASK_VM_INFO_REV4_COUNT; + } + if (original_task_info_count >= TASK_VM_INFO_REV5_COUNT) { + thread_t thread; + integer_t total = task->decompressions; + queue_iterate(&task->threads, thread, thread_t, task_threads) { + total += thread->decompressions; + } + vm_info->decompressions = total; + *task_info_count = TASK_VM_INFO_REV5_COUNT; + } if (task != kernel_task) { vm_map_unlock_read(map); @@ -4546,6 +5302,7 @@ task_info( { #if DEVELOPMENT || DEBUG task_debug_info_internal_t dbg_info; + ipc_space_t space = task->itk_space; if (*task_info_count < TASK_DEBUG_INFO_INTERNAL_COUNT) { error = KERN_NOT_SUPPORTED; break; @@ -4557,8 +5314,11 @@ task_info( } dbg_info = (task_debug_info_internal_t) task_info_out; dbg_info->ipc_space_size = 0; - if (task->itk_space) { - dbg_info->ipc_space_size = task->itk_space->is_table_size; + + if (space) { + is_read_lock(space); + dbg_info->ipc_space_size = space->is_table_size; + is_read_unlock(space); } dbg_info->suspend_count = task->suspend_count; @@ -4626,11 +5386,14 @@ task_power_info_locked( task_t task, task_power_info_t info, gpu_energy_data_t ginfo, - task_power_info_v2_t infov2) + task_power_info_v2_t infov2, + uint64_t *runnable_time) { thread_t thread; ledger_amount_t tmp; + uint64_t runnable_time_sum = 0; + task_lock_assert_owned(task); ledger_get_entries(task->ledger, task_ledgers.interrupt_wakeups, @@ -4643,6 +5406,7 @@ task_power_info_locked( info->total_user = task->total_user_time; info->total_system = task->total_system_time; + runnable_time_sum = task->total_runnable_time; #if CONFIG_EMBEDDED if (infov2) { @@ -4696,12 +5460,20 @@ task_power_info_locked( info->total_user += tval; } + tval = timer_grab(&thread->runnable_timer); + + runnable_time_sum += tval; + if (ginfo) { ginfo->task_gpu_utilisation += ml_gpu_stat(thread); } thread_unlock(thread); splx(x); } + + if (runnable_time) { + *runnable_time = runnable_time_sum; + } } /* @@ -5482,8 +6254,13 @@ task_set_phys_footprint_limit_internal( boolean_t memlimit_is_fatal) { ledger_amount_t old; + kern_return_t ret; - ledger_get_limit(task->ledger, task_ledgers.phys_footprint, &old); + ret = ledger_get_limit(task->ledger, task_ledgers.phys_footprint, &old); + + if (ret != KERN_SUCCESS) { + return ret; + } /* * Check that limit >> 20 will not give an "unexpected" 32-bit @@ -5550,8 +6327,13 @@ task_get_phys_footprint_limit( int *limit_mb) { ledger_amount_t limit; + kern_return_t ret; + + ret = ledger_get_limit(task->ledger, task_ledgers.phys_footprint, &limit); + if (ret != KERN_SUCCESS) { + return ret; + } - ledger_get_limit(task->ledger, task_ledgers.phys_footprint, &limit); /* * Check that limit >> 20 will not give an "unexpected" signed, 32-bit * result. There are, however, implicit assumptions that -1 mb limit @@ -5902,13 +6684,13 @@ SENDING_NOTIFICATION__THIS_PROCESS_IS_CAUSING_TOO_MANY_WAKEUPS(void) } static boolean_t -global_update_logical_writes(int64_t io_delta) +global_update_logical_writes(int64_t io_delta, int64_t *global_write_count) { int64_t old_count, new_count; boolean_t needs_telemetry; do { - new_count = old_count = global_logical_writes_count; + new_count = old_count = *global_write_count; new_count += io_delta; if (new_count >= io_telemetry_limit) { new_count = 0; @@ -5916,7 +6698,7 @@ global_update_logical_writes(int64_t io_delta) } else { needs_telemetry = FALSE; } - } while (!OSCompareAndSwap64(old_count, new_count, &global_logical_writes_count)); + } while (!OSCompareAndSwap64(old_count, new_count, global_write_count)); return needs_telemetry; } @@ -5924,7 +6706,10 @@ void task_update_logical_writes(task_t task, uint32_t io_size, int flags, void *vp) { int64_t io_delta = 0; + int64_t * global_counter_to_update; boolean_t needs_telemetry = FALSE; + int ledger_to_update = 0; + struct task_writes_counters * writes_counters_to_update; if ((!task) || (!io_size) || (!vp)) { return; @@ -5933,29 +6718,45 @@ task_update_logical_writes(task_t task, uint32_t io_size, int flags, void *vp) KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, VM_DATA_WRITE)) | DBG_FUNC_NONE, task_pid(task), io_size, flags, (uintptr_t)VM_KERNEL_ADDRPERM(vp), 0); DTRACE_IO4(logical_writes, struct task *, task, uint32_t, io_size, int, flags, vnode *, vp); + + // Is the drive backing this vnode internal or external to the system? + if (vnode_isonexternalstorage(vp) == false) { + global_counter_to_update = &global_logical_writes_count; + ledger_to_update = task_ledgers.logical_writes; + writes_counters_to_update = &task->task_writes_counters_internal; + } else { + global_counter_to_update = &global_logical_writes_to_external_count; + ledger_to_update = task_ledgers.logical_writes_to_external; + writes_counters_to_update = &task->task_writes_counters_external; + } + switch (flags) { case TASK_WRITE_IMMEDIATE: - OSAddAtomic64(io_size, (SInt64 *)&(task->task_immediate_writes)); - ledger_credit(task->ledger, task_ledgers.logical_writes, io_size); + OSAddAtomic64(io_size, (SInt64 *)&(writes_counters_to_update->task_immediate_writes)); + ledger_credit(task->ledger, ledger_to_update, io_size); + coalition_io_ledger_update(task, FLAVOR_IO_LOGICAL_WRITES, TRUE, io_size); break; case TASK_WRITE_DEFERRED: - OSAddAtomic64(io_size, (SInt64 *)&(task->task_deferred_writes)); - ledger_credit(task->ledger, task_ledgers.logical_writes, io_size); + OSAddAtomic64(io_size, (SInt64 *)&(writes_counters_to_update->task_deferred_writes)); + ledger_credit(task->ledger, ledger_to_update, io_size); + coalition_io_ledger_update(task, FLAVOR_IO_LOGICAL_WRITES, TRUE, io_size); break; case TASK_WRITE_INVALIDATED: - OSAddAtomic64(io_size, (SInt64 *)&(task->task_invalidated_writes)); - ledger_debit(task->ledger, task_ledgers.logical_writes, io_size); + OSAddAtomic64(io_size, (SInt64 *)&(writes_counters_to_update->task_invalidated_writes)); + ledger_debit(task->ledger, ledger_to_update, io_size); + coalition_io_ledger_update(task, FLAVOR_IO_LOGICAL_WRITES, FALSE, io_size); break; case TASK_WRITE_METADATA: - OSAddAtomic64(io_size, (SInt64 *)&(task->task_metadata_writes)); - ledger_credit(task->ledger, task_ledgers.logical_writes, io_size); + OSAddAtomic64(io_size, (SInt64 *)&(writes_counters_to_update->task_metadata_writes)); + ledger_credit(task->ledger, ledger_to_update, io_size); + coalition_io_ledger_update(task, FLAVOR_IO_LOGICAL_WRITES, TRUE, io_size); break; } io_delta = (flags == TASK_WRITE_INVALIDATED) ? ((int64_t)io_size * -1ll) : ((int64_t)io_size); if (io_telemetry_limit != 0) { /* If io_telemetry_limit is 0, disable global updates and I/O telemetry */ - needs_telemetry = global_update_logical_writes(io_delta); + needs_telemetry = global_update_logical_writes(io_delta, global_counter_to_update); if (needs_telemetry) { act_set_io_telemetry_ast(current_thread()); } @@ -5975,18 +6776,12 @@ task_io_monitor_ctl(task_t task, uint32_t *flags) /* Configure the physical I/O ledger */ ledger_set_limit(ledger, task_ledgers.physical_writes, (task_iomon_limit_mb * 1024 * 1024), 0); ledger_set_period(ledger, task_ledgers.physical_writes, (task_iomon_interval_secs * NSEC_PER_SEC)); - - /* Configure the logical I/O ledger */ - ledger_set_limit(ledger, task_ledgers.logical_writes, (task_iomon_limit_mb * 1024 * 1024), 0); - ledger_set_period(ledger, task_ledgers.logical_writes, (task_iomon_interval_secs * NSEC_PER_SEC)); } else if (*flags & IOMON_DISABLE) { /* * Caller wishes to disable I/O monitor on the task. */ ledger_disable_refill(ledger, task_ledgers.physical_writes); ledger_disable_callback(ledger, task_ledgers.physical_writes); - ledger_disable_refill(ledger, task_ledgers.logical_writes); - ledger_disable_callback(ledger, task_ledgers.logical_writes); } task_unlock(task); @@ -6023,9 +6818,6 @@ SENDING_NOTIFICATION__THIS_PROCESS_IS_CAUSING_TOO_MUCH_IO(int flavor) case FLAVOR_IO_PHYSICAL_WRITES: ledger_get_entry_info(task->ledger, task_ledgers.physical_writes, &lei); break; - case FLAVOR_IO_LOGICAL_WRITES: - ledger_get_entry_info(task->ledger, task_ledgers.logical_writes, &lei); - break; } @@ -6403,6 +7195,13 @@ task_could_use_secluded_mem( { return task->task_could_use_secluded_mem; } + +boolean_t +task_could_also_use_secluded_mem( + task_t task) +{ + return task->task_could_also_use_secluded_mem; +} #endif /* CONFIG_SECLUDED_MEMORY */ queue_head_t * @@ -6411,6 +7210,12 @@ task_io_user_clients(task_t task) return &task->io_user_clients; } +void +task_set_message_app_suspended(task_t task, boolean_t enable) +{ + task->message_app_suspended = enable; +} + void task_copy_fields_for_exec(task_t dst_task, task_t src_task) { @@ -6472,14 +7277,162 @@ task_get_darkwake_mode(task_t task) return (task->t_flags & TF_DARKWAKE_MODE) != 0; } +kern_return_t +task_get_exc_guard_behavior( + task_t task, + task_exc_guard_behavior_t *behaviorp) +{ + if (task == TASK_NULL) { + return KERN_INVALID_TASK; + } + *behaviorp = task->task_exc_guard; + return KERN_SUCCESS; +} + +#ifndef TASK_EXC_GUARD_ALL +/* Temporary define until two branches are merged */ +#define TASK_EXC_GUARD_ALL (TASK_EXC_GUARD_VM_ALL | 0xf0) +#endif + +kern_return_t +task_set_exc_guard_behavior( + task_t task, + task_exc_guard_behavior_t behavior) +{ + if (task == TASK_NULL) { + return KERN_INVALID_TASK; + } + if (behavior & ~TASK_EXC_GUARD_ALL) { + return KERN_INVALID_VALUE; + } + task->task_exc_guard = behavior; + return KERN_SUCCESS; +} + #if __arm64__ +extern int legacy_footprint_entitlement_mode; +extern void memorystatus_act_on_legacy_footprint_entitlement(proc_t, boolean_t); + void task_set_legacy_footprint( - task_t task, - boolean_t new_val) + task_t task) { task_lock(task); - task->task_legacy_footprint = new_val; + task->task_legacy_footprint = TRUE; + task_unlock(task); +} + +void +task_set_extra_footprint_limit( + task_t task) +{ + if (task->task_extra_footprint_limit) { + return; + } + task_lock(task); + if (!task->task_extra_footprint_limit) { + memorystatus_act_on_legacy_footprint_entitlement(task->bsd_info, TRUE); + task->task_extra_footprint_limit = TRUE; + } task_unlock(task); } #endif /* __arm64__ */ + +static inline ledger_amount_t +task_ledger_get_balance( + ledger_t ledger, + int ledger_idx) +{ + ledger_amount_t amount; + amount = 0; + ledger_get_balance(ledger, ledger_idx, &amount); + return amount; +} + +/* + * Gather the amount of memory counted in a task's footprint due to + * being in a specific set of ledgers. + */ +void +task_ledgers_footprint( + ledger_t ledger, + ledger_amount_t *ledger_resident, + ledger_amount_t *ledger_compressed) +{ + *ledger_resident = 0; + *ledger_compressed = 0; + + /* purgeable non-volatile memory */ + *ledger_resident += task_ledger_get_balance(ledger, task_ledgers.purgeable_nonvolatile); + *ledger_compressed += task_ledger_get_balance(ledger, task_ledgers.purgeable_nonvolatile_compressed); + + /* "default" tagged memory */ + *ledger_resident += task_ledger_get_balance(ledger, task_ledgers.tagged_footprint); + *ledger_compressed += task_ledger_get_balance(ledger, task_ledgers.tagged_footprint_compressed); + + /* "network" currently never counts in the footprint... */ + + /* "media" tagged memory */ + *ledger_resident += task_ledger_get_balance(ledger, task_ledgers.media_footprint); + *ledger_compressed += task_ledger_get_balance(ledger, task_ledgers.media_footprint_compressed); + + /* "graphics" tagged memory */ + *ledger_resident += task_ledger_get_balance(ledger, task_ledgers.graphics_footprint); + *ledger_compressed += task_ledger_get_balance(ledger, task_ledgers.graphics_footprint_compressed); + + /* "neural" tagged memory */ + *ledger_resident += task_ledger_get_balance(ledger, task_ledgers.neural_footprint); + *ledger_compressed += task_ledger_get_balance(ledger, task_ledgers.neural_footprint_compressed); +} + +void +task_set_memory_ownership_transfer( + task_t task, + boolean_t value) +{ + task_lock(task); + task->task_can_transfer_memory_ownership = value; + task_unlock(task); +} + +void +task_copy_vmobjects(task_t task, vm_object_query_t query, int len, int64_t* num) +{ + vm_object_t find_vmo; + int64_t size = 0; + + task_objq_lock(task); + if (query != NULL) { + queue_iterate(&task->task_objq, find_vmo, vm_object_t, task_objq) + { + int byte_size; + vm_object_query_t p = &query[size++]; + + p->object_id = (vm_object_id_t) VM_KERNEL_ADDRPERM(find_vmo); + p->virtual_size = find_vmo->internal ? find_vmo->vo_size : 0; + p->resident_size = find_vmo->resident_page_count * PAGE_SIZE; + p->wired_size = find_vmo->wired_page_count * PAGE_SIZE; + p->reusable_size = find_vmo->reusable_page_count * PAGE_SIZE; + p->vo_no_footprint = find_vmo->vo_no_footprint; + p->vo_ledger_tag = find_vmo->vo_ledger_tag; + p->purgable = find_vmo->purgable; + + if (find_vmo->internal && find_vmo->pager_created && find_vmo->pager != NULL) { + p->compressed_size = vm_compressor_pager_get_count(find_vmo->pager) * PAGE_SIZE; + } else { + p->compressed_size = 0; + } + + /* make sure to not overrun */ + byte_size = (int) size * sizeof(vm_object_query_data_t); + if ((int)(byte_size + sizeof(vm_object_query_data_t)) > len) { + break; + } + } + } else { + size = task->task_owned_objects; + } + task_objq_unlock(task); + + *num = size; +} diff --git a/osfmk/kern/task.h b/osfmk/kern/task.h index 00f16e0e4..1db8fb09d 100644 --- a/osfmk/kern/task.h +++ b/osfmk/kern/task.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2018 Apple Inc. All rights reserved. + * Copyright (c) 2000-2019 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -99,6 +99,7 @@ #include #include #include +#include #endif /* XNU_KERNEL_PRIVATE */ #ifdef MACH_KERNEL_PRIVATE @@ -143,20 +144,32 @@ struct _cpu_time_qos_stats { uint64_t cpu_time_qos_user_interactive; }; +struct task_writes_counters { + uint64_t task_immediate_writes; + uint64_t task_deferred_writes; + uint64_t task_invalidated_writes; + uint64_t task_metadata_writes; +}; + +struct task_watchports; #include struct task { /* Synchronization/destruction information */ - decl_lck_mtx_data(, lock) /* Task's lock */ + decl_lck_mtx_data(, lock); /* Task's lock */ os_refcnt_t ref_count; /* Number of references to me */ boolean_t active; /* Task has not been terminated */ boolean_t halting; /* Task is being halted */ + boolean_t message_app_suspended; /* Let iokit know when pidsuspended */ + /* Virtual timers */ uint32_t vtimers; /* Miscellaneous */ vm_map_t map; /* Address space description */ queue_chain_t tasks; /* global list of tasks */ + struct task_watchports *watchports; /* watchports passed in spawn */ + turnstile_inheritor_t returnwait_inheritor; /* inheritor for task_wait */ #if defined(CONFIG_SCHED_MULTIQ) sched_group_t sched_group; @@ -164,6 +177,7 @@ struct task { /* Threads in this task */ queue_head_t threads; + struct restartable_ranges *restartable_ranges; processor_set_t pset_hint; struct affinity_space *affinity_space; @@ -192,7 +206,7 @@ struct task { uint64_t total_runnable_time; /* IPC structures */ - decl_lck_mtx_data(, itk_lock_data) + decl_lck_mtx_data(, itk_lock_data); struct ipc_port *itk_self; /* not a right, doesn't hold ref */ struct ipc_port *itk_nself; /* not a right, doesn't hold ref */ struct ipc_port *itk_sself; /* a send right */ @@ -221,6 +235,7 @@ struct task { MACHINE_TASK integer_t faults; /* faults counter */ + integer_t decompressions; /* decompression counter */ integer_t pageins; /* pageins counter */ integer_t cow_faults; /* copy on write fault counter */ integer_t messages_sent; /* messages sent counter */ @@ -252,8 +267,6 @@ struct task { #define TF_CORPSE 0x00000020 /* task is a corpse */ #define TF_PENDING_CORPSE 0x00000040 /* task corpse has not been reported yet */ #define TF_CORPSE_FORK 0x00000080 /* task is a forked corpse */ -#define TF_LRETURNWAIT 0x00000100 /* task is waiting for fork/posix_spawn/exec to complete */ -#define TF_LRETURNWAITER 0x00000200 /* task is waiting for TF_LRETURNWAIT to get cleared */ #define TF_PLATFORM 0x00000400 /* task is a platform binary */ #define TF_CA_CLIENT_WI 0x00000800 /* task has CA_CLIENT work interval */ #define TF_DARKWAKE_MODE 0x00001000 /* task is in darkwake mode */ @@ -311,6 +324,11 @@ struct task { #define task_is_exec_copy_internal(task) \ (((task)->t_procflags & TPF_EXEC_COPY) != 0) + uint8_t t_returnwaitflags; +#define TWF_NONE 0 +#define TRW_LRETURNWAIT 0x01 /* task is waiting for fork/posix_spawn/exec to complete */ +#define TRW_LRETURNWAITER 0x02 /* task is waiting for TRW_LRETURNWAIT to get cleared */ + mach_vm_address_t all_image_info_addr; /* dyld __all_image_info */ mach_vm_size_t all_image_info_size; /* section location and size */ @@ -373,10 +391,9 @@ struct task { memlimit_attrs_reserved :28; /* reserved for future use */ io_stat_info_t task_io_stats; - uint64_t task_immediate_writes __attribute__((aligned(8))); - uint64_t task_deferred_writes __attribute__((aligned(8))); - uint64_t task_invalidated_writes __attribute__((aligned(8))); - uint64_t task_metadata_writes __attribute__((aligned(8))); + + struct task_writes_counters task_writes_counters_internal; + struct task_writes_counters task_writes_counters_external; /* * The cpu_time_qos_stats fields are protected by the task lock @@ -395,18 +412,21 @@ struct task { struct mt_task task_monotonic; #endif /* MONOTONIC */ + uint8_t task_can_transfer_memory_ownership; + uint8_t task_objects_disowning; + uint8_t task_objects_disowned; /* # of purgeable volatile VM objects owned by this task: */ int task_volatile_objects; /* # of purgeable but not volatile VM objects owned by this task: */ int task_nonvolatile_objects; - boolean_t task_purgeable_disowning; - boolean_t task_purgeable_disowned; + int task_owned_objects; queue_head_t task_objq; - decl_lck_mtx_data(, task_objq_lock) /* protects "task_objq" */ + decl_lck_mtx_data(, task_objq_lock); /* protects "task_objq" */ unsigned int task_thread_limit:16; #if __arm64__ unsigned int task_legacy_footprint:1; + unsigned int task_extra_footprint_limit:1; #endif /* __arm64__ */ unsigned int task_region_footprint:1; unsigned int task_has_crossed_thread_limit:1; @@ -438,23 +458,20 @@ struct task { uint8_t task_suppressed_secluded; #endif /* CONFIG_SECLUDED_MEMORY */ - uint32_t task_exc_guard; + task_exc_guard_behavior_t task_exc_guard; queue_head_t io_user_clients; -}; -#define TASK_EXC_GUARD_VM_DELIVER 0x01 /* Deliver virtual memory EXC_GUARD exceptions */ -#define TASK_EXC_GUARD_VM_ONCE 0x02 /* Deliver them only once */ -#define TASK_EXC_GUARD_VM_CORPSE 0x04 /* Deliver them via a forked corpse */ -#define TASK_EXC_GUARD_VM_FATAL 0x08 /* Virtual Memory EXC_GUARD delivery is fatal */ -#define TASK_EXC_GUARD_VM_ALL 0x0f + mach_vm_address_t mach_header_vm_address; -#define TASK_EXC_GUARD_MP_DELIVER 0x10 /* Deliver mach port EXC_GUARD exceptions */ -#define TASK_EXC_GUARD_MP_ONCE 0x20 /* Deliver them only once */ -#define TASK_EXC_GUARD_MP_CORPSE 0x04 /* Deliver them via a forked corpse */ -#define TASK_EXC_GUARD_MP_FATAL 0x80 /* mach port EXC_GUARD delivery is fatal */ + uint32_t loadTag; /* dext ID used for logging identity */ +}; -extern uint32_t task_exc_guard_default; +/* + * EXC_GUARD default delivery behavior for optional Mach port and VM guards. + * Applied to new tasks at creation time. + */ +extern task_exc_guard_behavior_t task_exc_guard_default; extern kern_return_t task_violated_guard(mach_exception_code_t, mach_exception_subcode_t, void *); @@ -475,6 +492,11 @@ extern kern_return_t #define itk_lock(task) lck_mtx_lock(&(task)->itk_lock_data) #define itk_unlock(task) lck_mtx_unlock(&(task)->itk_lock_data) +/* task clear return wait flags */ +#define TCRW_CLEAR_INITIAL_WAIT 0x1 +#define TCRW_CLEAR_FINAL_WAIT 0x2 +#define TCRW_CLEAR_ALL_WAIT (TCRW_CLEAR_INITIAL_WAIT | TCRW_CLEAR_FINAL_WAIT) + #define TASK_REFERENCE_LEAK_DEBUG 0 #if TASK_REFERENCE_LEAK_DEBUG @@ -506,9 +528,49 @@ extern void init_task_ledgers(void); #define current_task_fast() (current_thread()->task) #define current_task() current_task_fast() +extern bool task_is_driver(task_t task); + extern lck_attr_t task_lck_attr; extern lck_grp_t task_lck_grp; +struct task_watchport_elem { + task_t twe_task; + ipc_port_t twe_port; /* (Space lock) */ +}; + +struct task_watchports { + os_refcnt_t tw_refcount; /* (Space lock) */ + task_t tw_task; /* (Space lock) & tw_refcount == 0 */ + thread_t tw_thread; /* (Space lock) & tw_refcount == 0 */ + uint32_t tw_elem_array_count; /* (Space lock) */ + struct task_watchport_elem tw_elem[]; /* (Space lock) & (Portlock) & (mq lock) */ +}; + +#define task_watchports_retain(x) (os_ref_retain(&(x)->tw_refcount)) +#define task_watchports_release(x) (os_ref_release(&(x)->tw_refcount)) + +#define task_watchport_elem_init(elem, task, port) \ +do { \ + (elem)->twe_task = (task); \ + (elem)->twe_port = (port); \ +} while(0) + +#define task_watchport_elem_clear(elem) task_watchport_elem_init((elem), NULL, NULL) + +extern void +task_add_turnstile_watchports( + task_t task, + thread_t thread, + ipc_port_t *portwatch_ports, + uint32_t portwatch_count); + +extern void +task_watchport_elem_deallocate( + struct task_watchport_elem *watchport_elem); + +extern boolean_t +task_has_watchports(task_t task); + #else /* MACH_KERNEL_PRIVATE */ __BEGIN_DECLS @@ -516,10 +578,19 @@ __BEGIN_DECLS extern task_t current_task(void); extern void task_reference(task_t task); +extern bool task_is_driver(task_t task); #define TF_NONE 0 -#define TF_LRETURNWAIT 0x00000100 /* task is waiting for fork/posix_spawn/exec to complete */ -#define TF_LRETURNWAITER 0x00000200 /* task is waiting for TF_LRETURNWAIT to get cleared */ + +#define TWF_NONE 0 +#define TRW_LRETURNWAIT 0x01 /* task is waiting for fork/posix_spawn/exec to complete */ +#define TRW_LRETURNWAITER 0x02 /* task is waiting for TRW_LRETURNWAIT to get cleared */ + +/* task clear return wait flags */ +#define TCRW_CLEAR_INITIAL_WAIT 0x1 +#define TCRW_CLEAR_FINAL_WAIT 0x2 +#define TCRW_CLEAR_ALL_WAIT (TCRW_CLEAR_INITIAL_WAIT | TCRW_CLEAR_FINAL_WAIT) + #define TPF_NONE 0 #define TPF_EXEC_COPY 0x00000002 /* task is the new copy of an exec */ @@ -531,6 +602,10 @@ __END_DECLS __BEGIN_DECLS +#ifdef KERNEL_PRIVATE +extern boolean_t task_is_app_suspended(task_t task); +#endif + #ifdef XNU_KERNEL_PRIVATE /* Hold all threads in a task */ @@ -553,8 +628,6 @@ extern kern_return_t task_resume_internal( task_t task); /* Suspends a task by placing a hold on its threads */ extern kern_return_t task_pidsuspend( task_t task); -extern kern_return_t task_pidsuspend_locked( - task_t task); /* Resumes a previously paused task */ extern kern_return_t task_pidresume( @@ -565,6 +638,14 @@ extern kern_return_t task_send_trace_memory( uint32_t pid, uint64_t uniqueid); +extern void task_remove_turnstile_watchports( + task_t task); + +extern void task_transfer_turnstile_watchports( + task_t old_task, + task_t new_task, + thread_t new_thread); + #if DEVELOPMENT || DEBUG extern kern_return_t task_disconnect_page_mappings( @@ -612,6 +693,7 @@ extern kern_return_t task_create_internal( boolean_t is_64bit_data, uint32_t flags, uint32_t procflags, + uint8_t t_returnwaitflags, task_t *child_task); /* OUT */ extern kern_return_t task_info( @@ -624,7 +706,8 @@ extern void task_power_info_locked( task_t task, task_power_info_t info, gpu_energy_data_t gpu_energy, - task_power_info_v2_t infov2); + task_power_info_v2_t infov2, + uint64_t *runnable_time); extern uint64_t task_gpu_utilisation( task_t task); @@ -676,12 +759,14 @@ extern void task_set_dyld_info( mach_vm_address_t addr, mach_vm_size_t size); +extern void task_set_mach_header_address( + task_t task, + mach_vm_address_t addr); + /* Get number of activations in a task */ extern int get_task_numacts( task_t task); -extern int get_task_numactivethreads(task_t task); - struct label; extern kern_return_t task_collect_crash_info( task_t task, @@ -694,6 +779,7 @@ void task_wait_till_threads_terminate_locked(task_t task); /* JMM - should just be temporary (implementation in bsd_kern still) */ extern void set_bsdtask_info(task_t, void *); +extern uint32_t set_task_loadTag(task_t task, uint32_t loadTag); extern vm_map_t get_task_map_reference(task_t); extern vm_map_t swap_task_map(task_t, thread_t, vm_map_t); extern pmap_t get_task_pmap(task_t); @@ -710,6 +796,7 @@ extern uint64_t get_task_purgeable_size(task_t); extern uint64_t get_task_cpu_time(task_t); extern uint64_t get_task_dispatchqueue_offset(task_t); extern uint64_t get_task_dispatchqueue_serialno_offset(task_t); +extern uint64_t get_task_dispatchqueue_label_offset(task_t); extern uint64_t get_task_uniqueid(task_t task); extern int get_task_version(task_t task); @@ -725,6 +812,7 @@ extern uint64_t get_task_page_table(task_t); extern uint64_t get_task_network_nonvolatile(task_t); extern uint64_t get_task_network_nonvolatile_compressed(task_t); extern uint64_t get_task_wired_mem(task_t); +extern uint32_t get_task_loadTag(task_t task); extern kern_return_t task_convert_phys_footprint_limit(int, int *); extern kern_return_t task_set_phys_footprint_limit_internal(task_t, int, int *, boolean_t, boolean_t); @@ -777,10 +865,26 @@ struct _task_ledger_indices { int purgeable_nonvolatile; int purgeable_volatile_compressed; int purgeable_nonvolatile_compressed; + int tagged_nofootprint; + int tagged_footprint; + int tagged_nofootprint_compressed; + int tagged_footprint_compressed; int network_volatile; int network_nonvolatile; int network_volatile_compressed; int network_nonvolatile_compressed; + int media_nofootprint; + int media_footprint; + int media_nofootprint_compressed; + int media_footprint_compressed; + int graphics_nofootprint; + int graphics_footprint; + int graphics_nofootprint_compressed; + int graphics_footprint_compressed; + int neural_nofootprint; + int neural_footprint; + int neural_nofootprint_compressed; + int neural_footprint_compressed; int platform_idle_wakeups; int interrupt_wakeups; #if CONFIG_SCHED_SFI @@ -790,12 +894,15 @@ struct _task_ledger_indices { int cpu_time_billed_to_others; int physical_writes; int logical_writes; + int logical_writes_to_external; int energy_billed_to_me; int energy_billed_to_others; +#if DEBUG || DEVELOPMENT int pages_grabbed; int pages_grabbed_kern; int pages_grabbed_iopl; int pages_grabbed_upl; +#endif }; extern struct _task_ledger_indices task_ledgers; @@ -817,7 +924,7 @@ extern void task_set_32bit_log_flag(task_t task); #endif /* CONFIG_32BIT_TELEMETRY */ extern boolean_t task_is_active(task_t task); extern boolean_t task_is_halting(task_t task); -extern void task_clear_return_wait(task_t task); +extern void task_clear_return_wait(task_t task, uint32_t flags); extern void task_wait_to_return(void) __attribute__((noreturn)); extern event_t task_get_return_wait_event(task_t task); @@ -825,6 +932,10 @@ extern void task_atm_reset(task_t task); extern void task_bank_reset(task_t task); extern void task_bank_init(task_t task); +#if CONFIG_ARCADE +extern void task_prep_arcade(task_t task, thread_t thread); +#endif /* CONFIG_ARCADE */ + extern int task_pid(task_t task); extern boolean_t task_has_assertions(task_t task); /* End task_policy */ @@ -833,9 +944,12 @@ extern void task_set_gpu_denied(task_t task, boolean_t denied); extern boolean_t task_is_gpu_denied(task_t task); extern queue_head_t * task_io_user_clients(task_t task); +extern void task_set_message_app_suspended(task_t task, boolean_t enable); extern void task_copy_fields_for_exec(task_t dst_task, task_t src_task); +extern void task_copy_vmobjects(task_t task, vm_object_query_t query, int len, int64_t* num); + #endif /* XNU_KERNEL_PRIVATE */ #ifdef KERNEL_PRIVATE @@ -847,6 +961,7 @@ extern vm_map_t get_task_map(task_t); extern ledger_t get_task_ledger(task_t); extern boolean_t get_task_pidsuspended(task_t); +extern boolean_t get_task_suspended(task_t); extern boolean_t get_task_frozen(task_t); /* Convert from a task to a port */ @@ -860,10 +975,10 @@ extern task_suspension_token_t convert_port_to_task_suspension_token(ipc_port_t extern boolean_t task_suspension_notify(mach_msg_header_t *); -#define TASK_WRITE_IMMEDIATE 0x1 -#define TASK_WRITE_DEFERRED 0x2 -#define TASK_WRITE_INVALIDATED 0x4 -#define TASK_WRITE_METADATA 0x8 +#define TASK_WRITE_IMMEDIATE 0x1 +#define TASK_WRITE_DEFERRED 0x2 +#define TASK_WRITE_INVALIDATED 0x4 +#define TASK_WRITE_METADATA 0x8 extern void task_update_logical_writes(task_t task, uint32_t io_size, int flags, void *vp); #if CONFIG_SECLUDED_MEMORY @@ -880,19 +995,23 @@ extern boolean_t task_can_use_secluded_mem( task_t task, boolean_t is_allocate); extern boolean_t task_could_use_secluded_mem(task_t task); +extern boolean_t task_could_also_use_secluded_mem(task_t task); #endif /* CONFIG_SECLUDED_MEMORY */ extern void task_set_darkwake_mode(task_t, boolean_t); extern boolean_t task_get_darkwake_mode(task_t); #if __arm64__ -extern void task_set_legacy_footprint(task_t task, boolean_t new_val); +extern void task_set_legacy_footprint(task_t task); +extern void task_set_extra_footprint_limit(task_t task); #endif /* __arm64__ */ #if CONFIG_MACF extern struct label *get_task_crash_label(task_t task); #endif /* CONFIG_MACF */ +extern int get_task_cdhash(task_t task, char cdhash[]); + #endif /* KERNEL_PRIVATE */ extern task_t kernel_task; @@ -911,6 +1030,12 @@ extern void task_suspension_token_deallocate( extern boolean_t task_self_region_footprint(void); extern void task_self_region_footprint_set(boolean_t newval); +extern void task_ledgers_footprint(ledger_t ledger, + ledger_amount_t *ledger_resident, + ledger_amount_t *ledger_compressed); +extern void task_set_memory_ownership_transfer( + task_t task, + boolean_t value); __END_DECLS diff --git a/osfmk/kern/task_policy.c b/osfmk/kern/task_policy.c index 2faf0f7cd..e51b5b7bc 100644 --- a/osfmk/kern/task_policy.c +++ b/osfmk/kern/task_policy.c @@ -865,7 +865,7 @@ task_policy_update_internal_locked(task_t task, boolean_t in_create, task_pend_t break; } } else { - /* Daemons get USER_INTERACTIVE squashed to USER_INITIATED */ + /* Daemons and dext get USER_INTERACTIVE squashed to USER_INITIATED */ next.tep_qos_ceiling = THREAD_QOS_USER_INITIATED; } @@ -1077,6 +1077,7 @@ task_policy_update_internal_locked(task_t task, boolean_t in_create, task_pend_t case TASK_APPTYPE_DAEMON_STANDARD: case TASK_APPTYPE_DAEMON_ADAPTIVE: case TASK_APPTYPE_DAEMON_BACKGROUND: + case TASK_APPTYPE_DRIVER: default: next.tep_live_donor = 0; break; @@ -1907,8 +1908,8 @@ extern boolean_t ipc_importance_interactive_receiver; * TODO: Make this function more table-driven instead of ad-hoc */ void -proc_set_task_spawnpolicy(task_t task, int apptype, int qos_clamp, int role, - ipc_port_t * portwatch_ports, int portwatch_count) +proc_set_task_spawnpolicy(task_t task, thread_t thread, int apptype, int qos_clamp, int role, + ipc_port_t * portwatch_ports, uint32_t portwatch_count) { struct task_pend_token pend_token = {}; @@ -1968,6 +1969,13 @@ proc_set_task_spawnpolicy(task_t task, int apptype, int qos_clamp, int role, task_importance_mark_denap_receiver(task, FALSE); break; + case TASK_APPTYPE_DRIVER: + task_importance_mark_donor(task, FALSE); + task_importance_mark_live_donor(task, FALSE); + task_importance_mark_receiver(task, FALSE); + task_importance_mark_denap_receiver(task, FALSE); + break; + case TASK_APPTYPE_NONE: break; } @@ -1975,10 +1983,10 @@ proc_set_task_spawnpolicy(task_t task, int apptype, int qos_clamp, int role, if (portwatch_ports != NULL && apptype == TASK_APPTYPE_DAEMON_ADAPTIVE) { int portwatch_boosts = 0; - for (int i = 0; i < portwatch_count; i++) { + for (uint32_t i = 0; i < portwatch_count; i++) { ipc_port_t port = NULL; - if ((port = portwatch_ports[i]) != NULL) { + if (IP_VALID(port = portwatch_ports[i])) { int boost = 0; task_add_importance_watchport(task, port, &boost); portwatch_boosts += boost; @@ -1990,6 +1998,11 @@ proc_set_task_spawnpolicy(task_t task, int apptype, int qos_clamp, int role, } } + /* Redirect the turnstile push of watchports to task */ + if (portwatch_count && portwatch_ports != NULL) { + task_add_turnstile_watchports(task, thread, portwatch_ports, portwatch_count); + } + task_lock(task); if (apptype == TASK_APPTYPE_APP_TAL) { @@ -2065,6 +2078,7 @@ task_compute_main_thread_qos(task_t task) case TASK_APPTYPE_DAEMON_INTERACTIVE: case TASK_APPTYPE_DAEMON_STANDARD: case TASK_APPTYPE_DAEMON_ADAPTIVE: + case TASK_APPTYPE_DRIVER: primordial_qos = THREAD_QOS_LEGACY; break; @@ -2117,6 +2131,15 @@ task_is_daemon(task_t task) } } +bool +task_is_driver(task_t task) +{ + if (!task) { + return FALSE; + } + return task->requested_policy.trp_apptype == TASK_APPTYPE_DRIVER; +} + boolean_t task_is_app(task_t task) { diff --git a/osfmk/kern/telemetry.c b/osfmk/kern/telemetry.c index 7df3ce15b..b723f0b7c 100644 --- a/osfmk/kern/telemetry.c +++ b/osfmk/kern/telemetry.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2012-2013 Apple Inc. All rights reserved. + * Copyright (c) 2012-2019 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -106,7 +106,12 @@ uint32_t telemetry_timestamp = 0; * compute_averages(). It will notify its client (if one * exists) when it has enough data to be worth flushing. */ -struct micro_snapshot_buffer telemetry_buffer = {0, 0, 0, 0}; +struct micro_snapshot_buffer telemetry_buffer = { + .buffer = 0, + .size = 0, + .current_position = 0, + .end_point = 0 +}; int telemetry_bytes_since_last_mark = -1; // How much data since buf was last marked? int telemetry_buffer_notify_at = 0; @@ -478,20 +483,23 @@ telemetry_take_sample(thread_t thread, uint8_t microsnapshot_flags, struct micro * buffer with the global telemetry lock held -- so we must do our (possibly faulting) * copies from userland here, before taking the lock. */ - uintptr_t frames[MAX_CALLSTACK_FRAMES] = {}; - bool user64; - int backtrace_error = backtrace_user(frames, MAX_CALLSTACK_FRAMES, &btcount, &user64); + + uintptr_t frames[128]; + bool user64_regs = false; + int backtrace_error = backtrace_user(frames, + sizeof(frames) / sizeof(frames[0]), &btcount, &user64_regs, NULL); if (backtrace_error) { return; } + bool user64_va = task_has_64Bit_addr(task); /* * Find the actual [slid] address of the shared cache's UUID, and copy it in from userland. */ - int shared_cache_uuid_valid = 0; - uint64_t shared_cache_base_address; - struct _dyld_cache_header shared_cache_header; - uint64_t shared_cache_slide; + int shared_cache_uuid_valid = 0; + uint64_t shared_cache_base_address = 0; + struct _dyld_cache_header shared_cache_header = {}; + uint64_t shared_cache_slide = 0; /* * Don't copy in the entire shared cache header; we only need the UUID. Calculate the @@ -516,15 +524,18 @@ telemetry_take_sample(thread_t thread, uint8_t microsnapshot_flags, struct micro * * XXX - make this common with kdp? */ - uint32_t uuid_info_count = 0; - mach_vm_address_t uuid_info_addr = 0; - if (task_has_64Bit_addr(task)) { + uint32_t uuid_info_count = 0; + mach_vm_address_t uuid_info_addr = 0; + uint32_t uuid_info_size = 0; + if (user64_va) { + uuid_info_size = sizeof(struct user64_dyld_uuid_info); struct user64_dyld_all_image_infos task_image_infos; if (copyin(task->all_image_info_addr, (char *)&task_image_infos, sizeof(task_image_infos)) == 0) { uuid_info_count = (uint32_t)task_image_infos.uuidArrayCount; uuid_info_addr = task_image_infos.uuidArray; } } else { + uuid_info_size = sizeof(struct user32_dyld_uuid_info); struct user32_dyld_all_image_infos task_image_infos; if (copyin(task->all_image_info_addr, (char *)&task_image_infos, sizeof(task_image_infos)) == 0) { uuid_info_count = task_image_infos.uuidArrayCount; @@ -549,7 +560,6 @@ telemetry_take_sample(thread_t thread, uint8_t microsnapshot_flags, struct micro uuid_info_count = TELEMETRY_MAX_UUID_COUNT; } - uint32_t uuid_info_size = (uint32_t)(task_has_64Bit_addr(thread->task) ? sizeof(struct user64_dyld_uuid_info) : sizeof(struct user32_dyld_uuid_info)); uint32_t uuid_info_array_size = uuid_info_count * uuid_info_size; char *uuid_info_array = NULL; @@ -579,10 +589,10 @@ telemetry_take_sample(thread_t thread, uint8_t microsnapshot_flags, struct micro if (dqkeyaddr != 0) { uint64_t dqaddr = 0; uint64_t dq_serialno_offset = get_task_dispatchqueue_serialno_offset(task); - if ((copyin(dqkeyaddr, (char *)&dqaddr, (task_has_64Bit_addr(task) ? 8 : 4)) == 0) && + if ((copyin(dqkeyaddr, (char *)&dqaddr, (user64_va ? 8 : 4)) == 0) && (dqaddr != 0) && (dq_serialno_offset != 0)) { uint64_t dqserialnumaddr = dqaddr + dq_serialno_offset; - if (copyin(dqserialnumaddr, (char *)&dqserialnum, (task_has_64Bit_addr(task) ? 8 : 4)) == 0) { + if (copyin(dqserialnumaddr, (char *)&dqserialnum, (user64_va ? 8 : 4)) == 0) { dqserialnum_valid = 1; } } @@ -694,7 +704,7 @@ copytobuffer: tsnap->latency_qos = task_grab_latency_qos(task); strlcpy(tsnap->p_comm, proc_name_address(p), sizeof(tsnap->p_comm)); - if (task_has_64Bit_addr(thread->task)) { + if (user64_va) { tsnap->ss_flags |= kUser64_p; } @@ -796,7 +806,7 @@ copytobuffer: current_buffer->current_position += sizeof(dqserialnum); } - if (user64) { + if (user64_regs) { framesize = 8; thsnap->ss_flags |= kUser64_p; } else { @@ -1182,9 +1192,9 @@ bootprofile_timer_call( if (bootprofile_buffer_current_position < bootprofile_buffer_size) { uint32_t flags = STACKSHOT_KCDATA_FORMAT | STACKSHOT_TRYLOCK | STACKSHOT_SAVE_LOADINFO | STACKSHOT_GET_GLOBAL_MEM_STATS; -#if __x86_64__ +#if !CONFIG_EMBEDDED flags |= STACKSHOT_SAVE_KEXT_LOADINFO; -#endif /* __x86_64__ */ +#endif /* OR on flags specified in boot-args */ diff --git a/osfmk/kern/test_lock.c b/osfmk/kern/test_lock.c index 2e1069bb9..9f88ebefa 100644 --- a/osfmk/kern/test_lock.c +++ b/osfmk/kern/test_lock.c @@ -648,6 +648,7 @@ struct lck_mtx_thread_arg { int my_locked; int* other_locked; thread_t other_thread; + int type; }; static void @@ -660,6 +661,8 @@ test_mtx_lock_unlock_contended_thread( thread_t other_thread; int* my_locked; int* other_locked; + int type; + uint64_t start, stop; printf("Starting thread %p\n", current_thread()); @@ -672,6 +675,7 @@ test_mtx_lock_unlock_contended_thread( my_locked = &info->my_locked; other_locked = info->other_locked; + type = info->type; *my_locked = 0; val = os_atomic_inc(&synch, relaxed); @@ -682,19 +686,26 @@ test_mtx_lock_unlock_contended_thread( //warming up the test for (i = 0; i < WARMUP_ITER; i++) { lck_mtx_test_lock(); - - os_atomic_xchg(my_locked, 1, relaxed); + int prev = os_atomic_load(other_locked, relaxed); + os_atomic_add(my_locked, 1, relaxed); if (i != WARMUP_ITER - 1) { - while (os_atomic_load(&other_thread->state, relaxed) & TH_RUN) { - ; + if (type == FULL_CONTENDED) { + while (os_atomic_load(&other_thread->state, relaxed) & TH_RUN) { + ; + } + } else { + start = mach_absolute_time(); + stop = start + (MutexSpin / 2); + while (mach_absolute_time() < stop) { + ; + } } - os_atomic_xchg(my_locked, 0, relaxed); } lck_mtx_test_unlock(); if (i != WARMUP_ITER - 1) { - while (os_atomic_load(other_locked, relaxed) == 0) { + while (os_atomic_load(other_locked, relaxed) == prev) { ; } } @@ -723,18 +734,25 @@ test_mtx_lock_unlock_contended_thread( for (i = 0; i < iterations; i++) { lck_mtx_test_lock(); - - os_atomic_xchg(my_locked, 1, relaxed); + int prev = os_atomic_load(other_locked, relaxed); + os_atomic_add(my_locked, 1, relaxed); if (i != iterations - 1) { - while (os_atomic_load(&other_thread->state, relaxed) & TH_RUN) { - ; + if (type == FULL_CONTENDED) { + while (os_atomic_load(&other_thread->state, relaxed) & TH_RUN) { + ; + } + } else { + start = mach_absolute_time(); + stop = start + (MutexSpin / 2); + while (mach_absolute_time() < stop) { + ; + } } - os_atomic_xchg(my_locked, 0, relaxed); } lck_mtx_test_unlock_mtx(); if (i != iterations - 1) { - while (os_atomic_load(other_locked, relaxed) == 0) { + while (os_atomic_load(other_locked, relaxed) == prev) { ; } } @@ -750,7 +768,8 @@ kern_return_t lck_mtx_test_mtx_contended( int iter, char* buffer, - int buffer_size) + int buffer_size, + int type) { thread_t thread1, thread2; kern_return_t result; @@ -759,10 +778,17 @@ lck_mtx_test_mtx_contended( wait_barrier = 0; iterations = iter; + if (type < 0 || type > MAX_CONDENDED) { + printf("%s invalid type %d\n", __func__, type); + return 0; + } + erase_all_test_mtx_stats(); targs[0].other_thread = NULL; targs[1].other_thread = NULL; + targs[0].type = type; + targs[1].type = type; result = kernel_thread_start((thread_continue_t)test_mtx_lock_unlock_contended_thread, &targs[0], &thread1); if (result != KERN_SUCCESS) { @@ -812,6 +838,8 @@ test_mtx_lck_unlock_contended_loop_time_thread( thread_t other_thread; int* my_locked; int* other_locked; + int type; + uint64_t start, stop; printf("Starting thread %p\n", current_thread()); @@ -824,6 +852,7 @@ test_mtx_lck_unlock_contended_loop_time_thread( my_locked = &info->my_locked; other_locked = info->other_locked; + type = info->type; *my_locked = 0; val = os_atomic_inc(&synch, relaxed); @@ -835,18 +864,26 @@ test_mtx_lck_unlock_contended_loop_time_thread( for (i = 0; i < WARMUP_ITER; i++) { lck_mtx_lock(&test_mtx); - os_atomic_xchg(my_locked, 1, relaxed); + int prev = os_atomic_load(other_locked, relaxed); + os_atomic_add(my_locked, 1, relaxed); if (i != WARMUP_ITER - 1) { - while (os_atomic_load(&other_thread->state, relaxed) & TH_RUN) { - ; + if (type == FULL_CONTENDED) { + while (os_atomic_load(&other_thread->state, relaxed) & TH_RUN) { + ; + } + } else { + start = mach_absolute_time(); + stop = start + (MutexSpin / 2); + while (mach_absolute_time() < stop) { + ; + } } - os_atomic_xchg(my_locked, 0, relaxed); } lck_mtx_unlock(&test_mtx); if (i != WARMUP_ITER - 1) { - while (os_atomic_load(other_locked, relaxed) == 0) { + while (os_atomic_load(other_locked, relaxed) == prev) { ; } } @@ -878,18 +915,26 @@ test_mtx_lck_unlock_contended_loop_time_thread( for (i = 0; i < iterations; i++) { lck_mtx_lock(&test_mtx); - os_atomic_xchg(my_locked, 1, relaxed); + int prev = os_atomic_load(other_locked, relaxed); + os_atomic_add(my_locked, 1, relaxed); if (i != iterations - 1) { - while (os_atomic_load(&other_thread->state, relaxed) & TH_RUN) { - ; + if (type == FULL_CONTENDED) { + while (os_atomic_load(&other_thread->state, relaxed) & TH_RUN) { + ; + } + } else { + start = mach_absolute_time(); + stop = start + (MutexSpin / 2); + while (mach_absolute_time() < stop) { + ; + } } - os_atomic_xchg(my_locked, 0, relaxed); } lck_mtx_unlock(&test_mtx); if (i != iterations - 1) { - while (os_atomic_load(other_locked, relaxed) == 0) { + while (os_atomic_load(other_locked, relaxed) == prev) { ; } } @@ -910,7 +955,8 @@ int lck_mtx_test_mtx_contended_loop_time( int iter, char *buffer, - int buffer_size) + int buffer_size, + int type) { thread_t thread1, thread2; kern_return_t result; @@ -921,6 +967,11 @@ lck_mtx_test_mtx_contended_loop_time( iterations = iter; uint64_t time, time_run; + if (type < 0 || type > MAX_CONDENDED) { + printf("%s invalid type %d\n", __func__, type); + return 0; + } + targs[0].other_thread = NULL; targs[1].other_thread = NULL; @@ -938,6 +989,8 @@ lck_mtx_test_mtx_contended_loop_time( /* this are t1 args */ targs[0].my_locked = 0; targs[0].other_locked = &targs[1].my_locked; + targs[0].type = type; + targs[1].type = type; os_atomic_xchg(&targs[0].other_thread, thread2, release); diff --git a/osfmk/kern/test_mpsc_queue.c b/osfmk/kern/test_mpsc_queue.c new file mode 100644 index 000000000..d369ed1bb --- /dev/null +++ b/osfmk/kern/test_mpsc_queue.c @@ -0,0 +1,128 @@ +/* + * Copyright (c) 2018 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#include +#include +#include +#include + +#if !DEBUG && !DEVELOPMENT +#error "Test only file" +#endif + +#include + +struct mpsc_test_pingpong_queue { + struct mpsc_daemon_queue queue; + struct mpsc_queue_chain link; + struct mpsc_test_pingpong_queue *other; + uint64_t *count, *end; +}; + +static void +mpsc_test_pingpong_invoke(mpsc_queue_chain_t elm, mpsc_daemon_queue_t dq) +{ + struct mpsc_test_pingpong_queue *q; + q = mpsc_queue_element(elm, struct mpsc_test_pingpong_queue, link); + assert(&q->queue == dq); + + if (*q->count % 10000 == 0) { + printf("mpsc_test_pingpong: %lld asyncs left\n", *q->count); + } + if ((*q->count)-- > 0) { + mpsc_daemon_enqueue(&q->other->queue, &q->other->link, + MPSC_QUEUE_DISABLE_PREEMPTION); + } else { + *q->end = mach_absolute_time(); + thread_wakeup(&mpsc_test_pingpong_invoke); + } +} + +/* + * The point of this test is to exercise the enqueue/unlock-drain race + * since the MPSC queue tries to mimize wakeups when it knows it's useless. + * + * It also ensures basic enqueue properties, + * and will panic if anything goes wrong to help debugging state. + * + * Performance wise, we will always go through the wakeup codepath, + * hence this is mostly a benchmark of + * assert_wait()/clear_wait()/thread_block()/thread_wakeup() + * rather than a benchmark of the MPSC queues. + */ +int +mpsc_test_pingpong(uint64_t count, uint64_t *out) +{ + struct mpsc_test_pingpong_queue ping, pong; + kern_return_t kr; + wait_result_t wr; + + if (count < 1000 || count > 1000 * 1000) { + return EINVAL; + } + + printf("mpsc_test_pingpong: START\n"); + + kr = mpsc_daemon_queue_init_with_thread(&ping.queue, + mpsc_test_pingpong_invoke, MINPRI_KERNEL, "ping"); + if (kr != KERN_SUCCESS) { + panic("mpsc_test_pingpong: unable to create pong: %x", kr); + } + + kr = mpsc_daemon_queue_init_with_thread(&pong.queue, + mpsc_test_pingpong_invoke, MINPRI_KERNEL, "pong"); + if (kr != KERN_SUCCESS) { + panic("mpsc_test_pingpong: unable to create ping: %x", kr); + } + + uint64_t n = count, start, end; + ping.count = pong.count = &n; + ping.end = pong.end = &end; + ping.other = &pong; + pong.other = &ping; + + assert_wait_timeout(&mpsc_test_pingpong_invoke, THREAD_UNINT, + 5000, 1000 * NSEC_PER_USEC); + start = mach_absolute_time(); + mpsc_daemon_enqueue(&ping.queue, &ping.link, MPSC_QUEUE_DISABLE_PREEMPTION); + + wr = thread_block(THREAD_CONTINUE_NULL); + if (wr == THREAD_TIMED_OUT) { + panic("mpsc_test_pingpong: timed out: ping:%p pong:%p", &ping, &pong); + } + + printf("mpsc_test_pingpong: CLEANUP\n"); + + mpsc_daemon_queue_cancel_and_wait(&ping.queue); + mpsc_daemon_queue_cancel_and_wait(&pong.queue); + absolutetime_to_nanoseconds(end - start, out); + + printf("mpsc_test_pingpong: %lld ping-pongs in %lld ns (%lld.%03lld us/async)\n", + count, *out, (*out / count) / 1000, (*out / count) % 1000); + return 0; +} diff --git a/osfmk/kern/thread.c b/osfmk/kern/thread.c index 7dfcb02b3..f554222b1 100644 --- a/osfmk/kern/thread.c +++ b/osfmk/kern/thread.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2015 Apple Inc. All rights reserved. + * Copyright (c) 2000-2019 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -122,6 +122,7 @@ #include #include #include +#include #include #if KPC @@ -144,9 +145,16 @@ #include #include #include +#if CONFIG_KSANCOV +#include +#endif #include +#if defined(HAS_APPLE_PAC) +#include +#include +#endif /* defined(HAS_APPLE_PAC) */ /* * Exported interfaces @@ -165,25 +173,16 @@ lck_grp_t thread_lck_grp; struct zone *thread_qos_override_zone; -decl_simple_lock_data(static, thread_stack_lock) -static queue_head_t thread_stack_queue; - -decl_simple_lock_data(static, thread_terminate_lock) -static queue_head_t thread_terminate_queue; - -static queue_head_t thread_deallocate_queue; - -static queue_head_t turnstile_deallocate_queue; +static struct mpsc_daemon_queue thread_stack_queue; +static struct mpsc_daemon_queue thread_terminate_queue; +static struct mpsc_daemon_queue thread_deallocate_queue; +static struct mpsc_daemon_queue thread_exception_queue; +decl_simple_lock_data(static, crashed_threads_lock); static queue_head_t crashed_threads_queue; -static queue_head_t workq_deallocate_queue; - -decl_simple_lock_data(static, thread_exception_lock) -static queue_head_t thread_exception_queue; - struct thread_exception_elt { - queue_chain_t elt; + struct mpsc_queue_chain link; exception_type_t exception_type; task_t exception_task; thread_t exception_thread; @@ -211,7 +210,7 @@ int task_threadmax = CONFIG_THREAD_MAX; static uint64_t thread_unique_id = 100; -struct _thread_ledger_indices thread_ledgers = { -1 }; +struct _thread_ledger_indices thread_ledgers = { .cpu_time = -1 }; static ledger_template_t thread_ledger_template = NULL; static void init_thread_ledgers(void); @@ -290,7 +289,6 @@ thread_bootstrap(void) thread_template.sched_pri = 0; thread_template.max_priority = 0; thread_template.task_priority = 0; - thread_template.promotions = 0; thread_template.rwlock_count = 0; thread_template.waiting_for_mutex = NULL; @@ -394,7 +392,7 @@ thread_bootstrap(void) thread_template.effective_policy = (struct thread_effective_policy) {}; bzero(&thread_template.overrides, sizeof(thread_template.overrides)); - thread_template.sync_ipc_overrides = 0; + thread_template.kevent_overrides = 0; thread_template.iotier_override = THROTTLE_LEVEL_NONE; thread_template.thread_io_stats = NULL; @@ -405,6 +403,7 @@ thread_bootstrap(void) thread_template.thread_timer_wakeups_bin_1 = thread_template.thread_timer_wakeups_bin_2 = 0; thread_template.callout_woken_from_icontext = thread_template.callout_woken_from_platform_idle = 0; + thread_template.guard_exc_fatal = 0; thread_template.thread_tag = 0; @@ -413,6 +412,7 @@ thread_bootstrap(void) thread_template.th_work_interval = NULL; + thread_template.decompressions = 0; init_thread = thread_template; /* fiddle with init thread to skip asserts in set_sched_pri */ @@ -491,6 +491,7 @@ thread_corpse_continue(void) /*NOTREACHED*/ } +__dead2 static void thread_terminate_continue(void) { @@ -562,12 +563,12 @@ thread_terminate_self(void) /* * After this subtraction, this thread should never access - * task->bsd_info unless it got 0 back from the hw_atomic_sub. It + * task->bsd_info unless it got 0 back from the os_atomic_dec. It * could be racing with other threads to be the last thread in the * process, and the last thread in the process will tear down the proc * structure and zero-out task->bsd_info. */ - threadcnt = hw_atomic_sub(&task->active_thread_count, 1); + threadcnt = os_atomic_dec(&task->active_thread_count, relaxed); /* * If we are the last thread to terminate and the task is @@ -683,8 +684,7 @@ thread_terminate_self(void) assert((thread->sched_flags & TH_SFLAG_RW_PROMOTED) == 0); assert((thread->sched_flags & TH_SFLAG_EXEC_PROMOTED) == 0); assert((thread->sched_flags & TH_SFLAG_PROMOTED) == 0); - assert(thread->promotions == 0); - assert(thread->was_promoted_on_wakeup == 0); + assert(thread->kern_promotion_schedpri == 0); assert(thread->waiting_for_mutex == NULL); assert(thread->rwlock_count == 0); @@ -735,8 +735,6 @@ thread_deallocate_complete( assert(os_ref_get_count(&thread->ref_count) == 0); - assert(thread_owned_workloops_count(thread) == 0); - if (!(thread->state & TH_TERMINATE2)) { panic("thread_deallocate: thread not properly terminated\n"); } @@ -799,29 +797,6 @@ thread_deallocate_complete( zfree(thread_zone, thread); } -void -thread_starts_owning_workloop(thread_t thread) -{ - atomic_fetch_add_explicit(&thread->kqwl_owning_count, 1, - memory_order_relaxed); -} - -void -thread_ends_owning_workloop(thread_t thread) -{ - __assert_only uint32_t count; - count = atomic_fetch_sub_explicit(&thread->kqwl_owning_count, 1, - memory_order_relaxed); - assert(count > 0); -} - -uint32_t -thread_owned_workloops_count(thread_t thread) -{ - return atomic_load_explicit(&thread->kqwl_owning_count, - memory_order_relaxed); -} - /* * thread_inspect_deallocate: * @@ -835,49 +810,41 @@ thread_inspect_deallocate( } /* - * thread_exception_daemon: + * thread_exception_queue_invoke: * * Deliver EXC_{RESOURCE,GUARD} exception */ static void -thread_exception_daemon(void) +thread_exception_queue_invoke(mpsc_queue_chain_t elm, + __assert_only mpsc_daemon_queue_t dq) { struct thread_exception_elt *elt; task_t task; thread_t thread; exception_type_t etype; - simple_lock(&thread_exception_lock, LCK_GRP_NULL); - while ((elt = (struct thread_exception_elt *)dequeue_head(&thread_exception_queue)) != NULL) { - simple_unlock(&thread_exception_lock); - - etype = elt->exception_type; - task = elt->exception_task; - thread = elt->exception_thread; - assert_thread_magic(thread); + assert(dq == &thread_exception_queue); + elt = mpsc_queue_element(elm, struct thread_exception_elt, link); - kfree(elt, sizeof(*elt)); - - /* wait for all the threads in the task to terminate */ - task_lock(task); - task_wait_till_threads_terminate_locked(task); - task_unlock(task); - - /* Consumes the task ref returned by task_generate_corpse_internal */ - task_deallocate(task); - /* Consumes the thread ref returned by task_generate_corpse_internal */ - thread_deallocate(thread); + etype = elt->exception_type; + task = elt->exception_task; + thread = elt->exception_thread; + assert_thread_magic(thread); - /* Deliver the notification, also clears the corpse. */ - task_deliver_crash_notification(task, thread, etype, 0); + kfree(elt, sizeof(*elt)); - simple_lock(&thread_exception_lock, LCK_GRP_NULL); - } + /* wait for all the threads in the task to terminate */ + task_lock(task); + task_wait_till_threads_terminate_locked(task); + task_unlock(task); - assert_wait((event_t)&thread_exception_queue, THREAD_UNINT); - simple_unlock(&thread_exception_lock); + /* Consumes the task ref returned by task_generate_corpse_internal */ + task_deallocate(task); + /* Consumes the thread ref returned by task_generate_corpse_internal */ + thread_deallocate(thread); - thread_block((thread_continue_t)thread_exception_daemon); + /* Deliver the notification, also clears the corpse. */ + task_deliver_crash_notification(task, thread, etype, 0); } /* @@ -897,11 +864,8 @@ thread_exception_enqueue( elt->exception_task = task; elt->exception_thread = thread; - simple_lock(&thread_exception_lock, LCK_GRP_NULL); - enqueue_tail(&thread_exception_queue, (queue_entry_t)elt); - simple_unlock(&thread_exception_lock); - - thread_wakeup((event_t)&thread_exception_queue); + mpsc_daemon_enqueue(&thread_exception_queue, &elt->link, + MPSC_QUEUE_DISABLE_PREEMPTION); } /* @@ -934,150 +898,94 @@ thread_copy_resource_info( *dst_thread->thread_io_stats = *src_thread->thread_io_stats; } -/* - * thread_terminate_daemon: - * - * Perform final clean up for terminating threads. - */ static void -thread_terminate_daemon(void) +thread_terminate_queue_invoke(mpsc_queue_chain_t e, + __assert_only mpsc_daemon_queue_t dq) { - thread_t self, thread; - task_t task; - - self = current_thread(); - self->options |= TH_OPT_SYSTEM_CRITICAL; - - (void)splsched(); - simple_lock(&thread_terminate_lock, LCK_GRP_NULL); - -thread_terminate_start: - while ((thread = qe_dequeue_head(&thread_terminate_queue, struct thread, runq_links)) != THREAD_NULL) { - assert_thread_magic(thread); - - /* - * if marked for crash reporting, skip reaping. - * The corpse delivery thread will clear bit and enqueue - * for reaping when done - */ - if (thread->inspection) { - enqueue_tail(&crashed_threads_queue, &thread->runq_links); - continue; - } - - simple_unlock(&thread_terminate_lock); - (void)spllo(); - - task = thread->task; - - task_lock(task); - task->total_user_time += timer_grab(&thread->user_timer); - task->total_ptime += timer_grab(&thread->ptime); - task->total_runnable_time += timer_grab(&thread->runnable_timer); - if (thread->precise_user_kernel_time) { - task->total_system_time += timer_grab(&thread->system_timer); - } else { - task->total_user_time += timer_grab(&thread->system_timer); - } - - task->c_switch += thread->c_switch; - task->p_switch += thread->p_switch; - task->ps_switch += thread->ps_switch; - - task->syscalls_unix += thread->syscalls_unix; - task->syscalls_mach += thread->syscalls_mach; - - task->task_timer_wakeups_bin_1 += thread->thread_timer_wakeups_bin_1; - task->task_timer_wakeups_bin_2 += thread->thread_timer_wakeups_bin_2; - task->task_gpu_ns += ml_gpu_stat(thread); - task->task_energy += ml_energy_stat(thread); - -#if MONOTONIC - mt_terminate_update(task, thread); -#endif /* MONOTONIC */ + thread_t thread = mpsc_queue_element(e, struct thread, mpsc_links); + task_t task = thread->task; - thread_update_qos_cpu_time(thread); + assert(dq == &thread_terminate_queue); - queue_remove(&task->threads, thread, thread_t, task_threads); - task->thread_count--; - - /* - * If the task is being halted, and there is only one thread - * left in the task after this one, then wakeup that thread. - */ - if (task->thread_count == 1 && task->halting) { - thread_wakeup((event_t)&task->halting); - } + task_lock(task); + /* + * if marked for crash reporting, skip reaping. + * The corpse delivery thread will clear bit and enqueue + * for reaping when done + * + * Note: the inspection field is set under the task lock + * + * FIXME[mad]: why enqueue for termination before `inspection` is false ? + */ + if (__improbable(thread->inspection)) { + simple_lock(&crashed_threads_lock, &thread_lck_grp); task_unlock(task); - lck_mtx_lock(&tasks_threads_lock); - queue_remove(&threads, thread, thread_t, threads); - threads_count--; - lck_mtx_unlock(&tasks_threads_lock); - - thread_deallocate(thread); - - (void)splsched(); - simple_lock(&thread_terminate_lock, LCK_GRP_NULL); + enqueue_tail(&crashed_threads_queue, &thread->runq_links); + simple_unlock(&crashed_threads_lock); + return; } - while ((thread = qe_dequeue_head(&thread_deallocate_queue, struct thread, runq_links)) != THREAD_NULL) { - assert_thread_magic(thread); - simple_unlock(&thread_terminate_lock); - (void)spllo(); + task->total_user_time += timer_grab(&thread->user_timer); + task->total_ptime += timer_grab(&thread->ptime); + task->total_runnable_time += timer_grab(&thread->runnable_timer); + if (thread->precise_user_kernel_time) { + task->total_system_time += timer_grab(&thread->system_timer); + } else { + task->total_user_time += timer_grab(&thread->system_timer); + } - thread_deallocate_complete(thread); + task->c_switch += thread->c_switch; + task->p_switch += thread->p_switch; + task->ps_switch += thread->ps_switch; - (void)splsched(); - simple_lock(&thread_terminate_lock, LCK_GRP_NULL); - } + task->syscalls_unix += thread->syscalls_unix; + task->syscalls_mach += thread->syscalls_mach; - struct turnstile *turnstile; - while ((turnstile = qe_dequeue_head(&turnstile_deallocate_queue, struct turnstile, ts_deallocate_link)) != TURNSTILE_NULL) { - simple_unlock(&thread_terminate_lock); - (void)spllo(); + task->task_timer_wakeups_bin_1 += thread->thread_timer_wakeups_bin_1; + task->task_timer_wakeups_bin_2 += thread->thread_timer_wakeups_bin_2; + task->task_gpu_ns += ml_gpu_stat(thread); + task->task_energy += ml_energy_stat(thread); + task->decompressions += thread->decompressions; - turnstile_destroy(turnstile); +#if MONOTONIC + mt_terminate_update(task, thread); +#endif /* MONOTONIC */ - (void)splsched(); - simple_lock(&thread_terminate_lock, LCK_GRP_NULL); - } + thread_update_qos_cpu_time(thread); - queue_entry_t qe; + queue_remove(&task->threads, thread, thread_t, task_threads); + task->thread_count--; /* - * see workq_deallocate_enqueue: struct workqueue is opaque to thread.c and - * we just link pieces of memory here + * If the task is being halted, and there is only one thread + * left in the task after this one, then wakeup that thread. */ - while ((qe = dequeue_head(&workq_deallocate_queue))) { - simple_unlock(&thread_terminate_lock); - (void)spllo(); + if (task->thread_count == 1 && task->halting) { + thread_wakeup((event_t)&task->halting); + } + + task_unlock(task); - workq_destroy((struct workqueue *)qe); + lck_mtx_lock(&tasks_threads_lock); + queue_remove(&threads, thread, thread_t, threads); + threads_count--; + lck_mtx_unlock(&tasks_threads_lock); - (void)splsched(); - simple_lock(&thread_terminate_lock, LCK_GRP_NULL); - } + thread_deallocate(thread); +} - /* - * Check if something enqueued in thread terminate/deallocate queue - * while processing workq deallocate queue - */ - if (!queue_empty(&thread_terminate_queue) || - !queue_empty(&thread_deallocate_queue) || - !queue_empty(&turnstile_deallocate_queue)) { - goto thread_terminate_start; - } +static void +thread_deallocate_queue_invoke(mpsc_queue_chain_t e, + __assert_only mpsc_daemon_queue_t dq) +{ + thread_t thread = mpsc_queue_element(e, struct thread, mpsc_links); - assert_wait((event_t)&thread_terminate_queue, THREAD_UNINT); - simple_unlock(&thread_terminate_lock); - /* splsched */ + assert(dq == &thread_deallocate_queue); - self->options &= ~TH_OPT_SYSTEM_CRITICAL; - thread_block((thread_continue_t)thread_terminate_daemon); - /*NOTREACHED*/ + thread_deallocate_complete(thread); } /* @@ -1093,11 +1001,8 @@ thread_terminate_enqueue( { KDBG_RELEASE(TRACE_DATA_THREAD_TERMINATE, thread->thread_id); - simple_lock(&thread_terminate_lock, LCK_GRP_NULL); - enqueue_tail(&thread_terminate_queue, &thread->runq_links); - simple_unlock(&thread_terminate_lock); - - thread_wakeup((event_t)&thread_terminate_queue); + mpsc_daemon_enqueue(&thread_terminate_queue, &thread->mpsc_links, + MPSC_QUEUE_DISABLE_PREEMPTION); } /* @@ -1109,56 +1014,8 @@ static void thread_deallocate_enqueue( thread_t thread) { - spl_t s = splsched(); - - simple_lock(&thread_terminate_lock, LCK_GRP_NULL); - enqueue_tail(&thread_deallocate_queue, &thread->runq_links); - simple_unlock(&thread_terminate_lock); - - thread_wakeup((event_t)&thread_terminate_queue); - splx(s); -} - -/* - * turnstile_deallocate_enqueue: - * - * Enqueue a turnstile for final deallocation. - */ -void -turnstile_deallocate_enqueue( - struct turnstile *turnstile) -{ - spl_t s = splsched(); - - simple_lock(&thread_terminate_lock, LCK_GRP_NULL); - enqueue_tail(&turnstile_deallocate_queue, &turnstile->ts_deallocate_link); - simple_unlock(&thread_terminate_lock); - - thread_wakeup((event_t)&thread_terminate_queue); - splx(s); -} - -/* - * workq_deallocate_enqueue: - * - * Enqueue a workqueue for final deallocation. - */ -void -workq_deallocate_enqueue( - struct workqueue *wq) -{ - spl_t s = splsched(); - - simple_lock(&thread_terminate_lock, LCK_GRP_NULL); - /* - * this is just to delay a zfree(), so we link the memory with no regards - * for how the struct looks like. - */ - enqueue_tail(&workq_deallocate_queue, (queue_entry_t)wq); - simple_unlock(&thread_terminate_lock); - - thread_wakeup((event_t)&thread_terminate_queue); - splx(s); + mpsc_daemon_enqueue(&thread_deallocate_queue, &thread->mpsc_links, + MPSC_QUEUE_DISABLE_PREEMPTION); } /* @@ -1167,13 +1024,11 @@ workq_deallocate_enqueue( * who are no longer being inspected. */ void -thread_terminate_crashed_threads() +thread_terminate_crashed_threads(void) { thread_t th_remove; - boolean_t should_wake_terminate_queue = FALSE; - spl_t s = splsched(); - simple_lock(&thread_terminate_lock, LCK_GRP_NULL); + simple_lock(&crashed_threads_lock, &thread_lck_grp); /* * loop through the crashed threads queue * to put any threads that are not being inspected anymore @@ -1184,58 +1039,39 @@ thread_terminate_crashed_threads() assert(th_remove != current_thread()); if (th_remove->inspection == FALSE) { - re_queue_tail(&thread_terminate_queue, &th_remove->runq_links); - should_wake_terminate_queue = TRUE; + remqueue(&th_remove->runq_links); + mpsc_daemon_enqueue(&thread_terminate_queue, &th_remove->mpsc_links, + MPSC_QUEUE_NONE); } } - simple_unlock(&thread_terminate_lock); - splx(s); - if (should_wake_terminate_queue == TRUE) { - thread_wakeup((event_t)&thread_terminate_queue); - } + simple_unlock(&crashed_threads_lock); } /* - * thread_stack_daemon: + * thread_stack_queue_invoke: * * Perform stack allocation as required due to * invoke failures. */ static void -thread_stack_daemon(void) +thread_stack_queue_invoke(mpsc_queue_chain_t elm, + __assert_only mpsc_daemon_queue_t dq) { - thread_t thread; - spl_t s; - - s = splsched(); - simple_lock(&thread_stack_lock, LCK_GRP_NULL); - - while ((thread = qe_dequeue_head(&thread_stack_queue, struct thread, runq_links)) != THREAD_NULL) { - assert_thread_magic(thread); - - simple_unlock(&thread_stack_lock); - splx(s); - - /* allocate stack with interrupts enabled so that we can call into VM */ - stack_alloc(thread); + thread_t thread = mpsc_queue_element(elm, struct thread, mpsc_links); - KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_STACK_WAIT) | DBG_FUNC_END, thread_tid(thread), 0, 0, 0, 0); + assert(dq == &thread_stack_queue); - s = splsched(); - thread_lock(thread); - thread_setrun(thread, SCHED_PREEMPT | SCHED_TAILQ); - thread_unlock(thread); + /* allocate stack with interrupts enabled so that we can call into VM */ + stack_alloc(thread); - simple_lock(&thread_stack_lock, LCK_GRP_NULL); - } + KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_STACK_WAIT) | DBG_FUNC_END, thread_tid(thread), 0, 0, 0, 0); - assert_wait((event_t)&thread_stack_queue, THREAD_UNINT); - simple_unlock(&thread_stack_lock); + spl_t s = splsched(); + thread_lock(thread); + thread_setrun(thread, SCHED_PREEMPT | SCHED_TAILQ); + thread_unlock(thread); splx(s); - - thread_block((thread_continue_t)thread_stack_daemon); - /*NOTREACHED*/ } /* @@ -1252,52 +1088,39 @@ thread_stack_enqueue( KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_STACK_WAIT) | DBG_FUNC_START, thread_tid(thread), 0, 0, 0, 0); assert_thread_magic(thread); - simple_lock(&thread_stack_lock, LCK_GRP_NULL); - enqueue_tail(&thread_stack_queue, &thread->runq_links); - simple_unlock(&thread_stack_lock); - - thread_wakeup((event_t)&thread_stack_queue); + mpsc_daemon_enqueue(&thread_stack_queue, &thread->mpsc_links, + MPSC_QUEUE_DISABLE_PREEMPTION); } void thread_daemon_init(void) { kern_return_t result; - thread_t thread = NULL; - simple_lock_init(&thread_terminate_lock, 0); - queue_init(&thread_terminate_queue); - queue_init(&thread_deallocate_queue); - queue_init(&workq_deallocate_queue); - queue_init(&turnstile_deallocate_queue); - queue_init(&crashed_threads_queue); + thread_deallocate_daemon_init(); - result = kernel_thread_start_priority((thread_continue_t)thread_terminate_daemon, NULL, MINPRI_KERNEL, &thread); - if (result != KERN_SUCCESS) { - panic("thread_daemon_init: thread_terminate_daemon"); - } + thread_deallocate_daemon_register_queue(&thread_terminate_queue, + thread_terminate_queue_invoke); - thread_deallocate(thread); + thread_deallocate_daemon_register_queue(&thread_deallocate_queue, + thread_deallocate_queue_invoke); - simple_lock_init(&thread_stack_lock, 0); - queue_init(&thread_stack_queue); + simple_lock_init(&crashed_threads_lock, 0); + queue_init(&crashed_threads_queue); - result = kernel_thread_start_priority((thread_continue_t)thread_stack_daemon, NULL, BASEPRI_PREEMPT_HIGH, &thread); + result = mpsc_daemon_queue_init_with_thread(&thread_stack_queue, + thread_stack_queue_invoke, BASEPRI_PREEMPT_HIGH, + "daemon.thread-stack"); if (result != KERN_SUCCESS) { panic("thread_daemon_init: thread_stack_daemon"); } - thread_deallocate(thread); - - simple_lock_init(&thread_exception_lock, 0); - queue_init(&thread_exception_queue); - - result = kernel_thread_start_priority((thread_continue_t)thread_exception_daemon, NULL, MINPRI_KERNEL, &thread); + result = mpsc_daemon_queue_init_with_thread(&thread_exception_queue, + thread_exception_queue_invoke, MINPRI_KERNEL, + "daemon.thread-exception"); if (result != KERN_SUCCESS) { panic("thread_daemon_init: thread_exception_daemon"); } - - thread_deallocate(thread); } #define TH_OPTION_NONE 0x00 @@ -1384,19 +1207,27 @@ thread_create_internal( new_thread->continuation = continuation; new_thread->parameter = parameter; new_thread->inheritor_flags = TURNSTILE_UPDATE_FLAGS_NONE; - priority_queue_init(&new_thread->inheritor_queue, + priority_queue_init(&new_thread->sched_inheritor_queue, + PRIORITY_QUEUE_BUILTIN_MAX_HEAP); + priority_queue_init(&new_thread->base_inheritor_queue, PRIORITY_QUEUE_BUILTIN_MAX_HEAP); +#if CONFIG_SCHED_CLUTCH + priority_queue_entry_init(&new_thread->sched_clutchpri_link); +#endif /* CONFIG_SCHED_CLUTCH */ /* Allocate I/O Statistics structure */ new_thread->thread_io_stats = (io_stat_info_t)kalloc(sizeof(struct io_stat_info)); assert(new_thread->thread_io_stats != NULL); bzero(new_thread->thread_io_stats, sizeof(struct io_stat_info)); - new_thread->sync_ipc_overrides = 0; #if KASAN kasan_init_thread(&new_thread->kasan_data); #endif +#if CONFIG_KSANCOV + new_thread->ksancov_data = NULL; +#endif + #if CONFIG_IOSCHED /* Clear out the I/O Scheduling info for AppleFSCompression */ new_thread->decmp_upl = NULL; @@ -1503,6 +1334,7 @@ thread_create_internal( new_thread->max_priority = parent_task->max_priority; new_thread->task_priority = parent_task->priority; + int new_priority = (priority < 0) ? parent_task->priority: priority; new_priority = (priority < 0)? parent_task->priority: priority; if (new_priority > new_thread->max_priority) { @@ -1520,7 +1352,11 @@ thread_create_internal( #if defined(CONFIG_SCHED_TIMESHARE_CORE) new_thread->sched_stamp = sched_tick; +#if CONFIG_SCHED_CLUTCH + new_thread->pri_shift = sched_clutch_thread_pri_shift(new_thread, new_thread->th_sched_bucket); +#else /* CONFIG_SCHED_CLUTCH */ new_thread->pri_shift = sched_pri_shifts[new_thread->th_sched_bucket]; +#endif /* CONFIG_SCHED_CLUTCH */ #endif /* defined(CONFIG_SCHED_TIMESHARE_CORE) */ #if CONFIG_EMBEDDED @@ -1536,8 +1372,7 @@ thread_create_internal( parent_task->thread_count++; /* So terminating threads don't need to take the task lock to decrement */ - hw_atomic_add(&parent_task->active_thread_count, 1); - + os_atomic_inc(&parent_task->active_thread_count, relaxed); queue_enter(&threads, new_thread, thread_t, threads); threads_count++; @@ -2119,8 +1954,8 @@ thread_info_internal( * the PROC_PIDTHREADINFO flavor (which can't be used on corpses) */ retrieve_thread_basic_info(thread, &basic_info); - extended_info->pth_user_time = ((basic_info.user_time.seconds * (integer_t)NSEC_PER_SEC) + (basic_info.user_time.microseconds * (integer_t)NSEC_PER_USEC)); - extended_info->pth_system_time = ((basic_info.system_time.seconds * (integer_t)NSEC_PER_SEC) + (basic_info.system_time.microseconds * (integer_t)NSEC_PER_USEC)); + extended_info->pth_user_time = (((uint64_t)basic_info.user_time.seconds * NSEC_PER_SEC) + ((uint64_t)basic_info.user_time.microseconds * NSEC_PER_USEC)); + extended_info->pth_system_time = (((uint64_t)basic_info.system_time.seconds * NSEC_PER_SEC) + ((uint64_t)basic_info.system_time.microseconds * NSEC_PER_USEC)); extended_info->pth_cpu_usage = basic_info.cpu_usage; extended_info->pth_policy = basic_info.policy; @@ -2359,33 +2194,43 @@ clear_thread_rwlock_boost(void) } } - /* * XXX assuming current thread only, for now... */ void thread_guard_violation(thread_t thread, - mach_exception_data_type_t code, mach_exception_data_type_t subcode) + mach_exception_data_type_t code, mach_exception_data_type_t subcode, boolean_t fatal) { assert(thread == current_thread()); - /* don't set up the AST for kernel threads */ + /* Don't set up the AST for kernel threads; this check is needed to ensure + * that the guard_exc_* fields in the thread structure are set only by the + * current thread and therefore, don't require a lock. + */ if (thread->task == kernel_task) { return; } - spl_t s = splsched(); + assert(EXC_GUARD_DECODE_GUARD_TYPE(code)); + /* * Use the saved state area of the thread structure * to store all info required to handle the AST when - * returning to userspace + * returning to userspace. It's possible that there is + * already a pending guard exception. If it's non-fatal, + * it can only be over-written by a fatal exception code. */ - assert(EXC_GUARD_DECODE_GUARD_TYPE(code)); + if (thread->guard_exc_info.code && (thread->guard_exc_fatal || !fatal)) { + return; + } + thread->guard_exc_info.code = code; thread->guard_exc_info.subcode = subcode; + thread->guard_exc_fatal = fatal ? 1 : 0; + + spl_t s = splsched(); thread_ast_set(thread, AST_GUARD); ast_propagate(thread); - splx(s); } @@ -2407,6 +2252,7 @@ guard_ast(thread_t t) t->guard_exc_info.code = 0; t->guard_exc_info.subcode = 0; + t->guard_exc_fatal = 0; switch (EXC_GUARD_DECODE_GUARD_TYPE(code)) { case GUARD_TYPE_NONE: @@ -2534,20 +2380,17 @@ SENDING_NOTIFICATION__THIS_THREAD_IS_CONSUMING_TOO_MUCH_CPU(void) } /* TODO: show task total runtime (via TASK_ABSOLUTETIME_INFO)? */ - printf("process %s[%d] thread %llu caught burning CPU! " - "It used more than %d%% CPU over %u seconds " - "(actual recent usage: %d%% over ~%llu seconds). " - "Thread lifetime cpu usage %d.%06ds, (%d.%06d user, %d.%06d sys) " - "ledger balance: %lld mabs credit: %lld mabs debit: %lld mabs " - "limit: %llu mabs period: %llu ns last refill: %llu ns%s.\n", - procname, pid, tid, - percentage, interval_sec, - usage_percent, - (lei.lei_last_refill + NSEC_PER_SEC / 2) / NSEC_PER_SEC, + printf("process %s[%d] thread %llu caught burning CPU! It used more than %d%% CPU over %u seconds\n", + procname, pid, tid, percentage, interval_sec); + printf(" (actual recent usage: %d%% over ~%llu seconds)\n", + usage_percent, (lei.lei_last_refill + NSEC_PER_SEC / 2) / NSEC_PER_SEC); + printf(" Thread lifetime cpu usage %d.%06ds, (%d.%06d user, %d.%06d sys)\n", thread_total_time.seconds, thread_total_time.microseconds, thread_user_time.seconds, thread_user_time.microseconds, - thread_system_time.seconds, thread_system_time.microseconds, - lei.lei_balance, lei.lei_credit, lei.lei_debit, + thread_system_time.seconds, thread_system_time.microseconds); + printf(" Ledger balance: %lld; mabs credit: %lld; mabs debit: %lld\n", + lei.lei_balance, lei.lei_credit, lei.lei_debit); + printf(" mabs limit: %llu; mabs period: %llu ns; last refill: %llu ns%s.\n", lei.lei_limit, lei.lei_refill_period, lei.lei_last_refill, (fatal ? " [fatal violation]" : "")); @@ -3008,10 +2851,6 @@ thread_should_halt( * thread_set_voucher_name - reset the voucher port name bound to this thread * * Conditions: nothing locked - * - * If we already converted the previous name to a cached voucher - * reference, then we discard that reference here. The next lookup - * will cache it again. */ kern_return_t @@ -3022,6 +2861,7 @@ thread_set_voucher_name(mach_port_name_t voucher_name) ipc_voucher_t voucher; ledger_t bankledger = NULL; struct thread_group *banktg = NULL; + uint32_t persona_id = 0; if (MACH_PORT_DEAD == voucher_name) { return KERN_INVALID_RIGHT; @@ -3036,7 +2876,7 @@ thread_set_voucher_name(mach_port_name_t voucher_name) return KERN_INVALID_ARGUMENT; } } - bank_get_bank_ledger_and_thread_group(new_voucher, &bankledger, &banktg); + bank_get_bank_ledger_thread_group_and_persona(new_voucher, &bankledger, &banktg, &persona_id); thread_mtx_lock(thread); voucher = thread->ith_voucher; @@ -3051,7 +2891,7 @@ thread_set_voucher_name(mach_port_name_t voucher_name) (uintptr_t)thread_tid(thread), (uintptr_t)voucher_name, VM_KERNEL_ADDRPERM((uintptr_t)new_voucher), - 1, 0); + persona_id, 0); if (IPC_VOUCHER_NULL != voucher) { ipc_voucher_release(voucher); @@ -3065,10 +2905,6 @@ thread_set_voucher_name(mach_port_name_t voucher_name) * * Conditions: nothing locked * - * A reference to the voucher may be lazily pending, if someone set the voucher name - * but nobody has done a lookup yet. In that case, we'll have to do the equivalent - * lookup here. - * * NOTE: At the moment, there is no distinction between the current and effective * vouchers because we only set them at the thread level currently. */ @@ -3079,7 +2915,6 @@ thread_get_mach_voucher( ipc_voucher_t *voucherp) { ipc_voucher_t voucher; - mach_port_name_t voucher_name; if (THREAD_NULL == thread) { return KERN_INVALID_ARGUMENT; @@ -3088,7 +2923,6 @@ thread_get_mach_voucher( thread_mtx_lock(thread); voucher = thread->ith_voucher; - /* if already cached, just return a ref */ if (IPC_VOUCHER_NULL != voucher) { ipc_voucher_reference(voucher); thread_mtx_unlock(thread); @@ -3096,41 +2930,9 @@ thread_get_mach_voucher( return KERN_SUCCESS; } - voucher_name = thread->ith_voucher_name; - - /* convert the name to a port, then voucher reference */ - if (MACH_PORT_VALID(voucher_name)) { - ipc_port_t port; - - if (KERN_SUCCESS != - ipc_object_copyin(thread->task->itk_space, voucher_name, - MACH_MSG_TYPE_COPY_SEND, (ipc_object_t *)&port)) { - thread->ith_voucher_name = MACH_PORT_NULL; - thread_mtx_unlock(thread); - *voucherp = IPC_VOUCHER_NULL; - return KERN_SUCCESS; - } - - /* convert to a voucher ref to return, and cache a ref on thread */ - voucher = convert_port_to_voucher(port); - ipc_voucher_reference(voucher); - thread->ith_voucher = voucher; - thread_mtx_unlock(thread); - - KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, - MACHDBG_CODE(DBG_MACH_IPC, MACH_THREAD_SET_VOUCHER) | DBG_FUNC_NONE, - (uintptr_t)thread_tid(thread), - (uintptr_t)port, - VM_KERNEL_ADDRPERM((uintptr_t)voucher), - 2, 0); - - - ipc_port_release_send(port); - } else { - thread_mtx_unlock(thread); - } + thread_mtx_unlock(thread); - *voucherp = voucher; + *voucherp = IPC_VOUCHER_NULL; return KERN_SUCCESS; } @@ -3140,8 +2942,8 @@ thread_get_mach_voucher( * Conditions: callers holds a reference on the voucher. * nothing locked. * - * We grab another reference to the voucher and bind it to the thread. Any lazy - * binding is erased. The old voucher reference associated with the thread is + * We grab another reference to the voucher and bind it to the thread. + * The old voucher reference associated with the thread is * discarded. */ kern_return_t @@ -3152,6 +2954,7 @@ thread_set_mach_voucher( ipc_voucher_t old_voucher; ledger_t bankledger = NULL; struct thread_group *banktg = NULL; + uint32_t persona_id = 0; if (THREAD_NULL == thread) { return KERN_INVALID_ARGUMENT; @@ -3162,7 +2965,7 @@ thread_set_mach_voucher( } ipc_voucher_reference(voucher); - bank_get_bank_ledger_and_thread_group(voucher, &bankledger, &banktg); + bank_get_bank_ledger_thread_group_and_persona(voucher, &bankledger, &banktg, &persona_id); thread_mtx_lock(thread); old_voucher = thread->ith_voucher; @@ -3177,7 +2980,7 @@ thread_set_mach_voucher( (uintptr_t)thread_tid(thread), (uintptr_t)MACH_PORT_NULL, VM_KERNEL_ADDRPERM((uintptr_t)voucher), - 3, 0); + persona_id, 0); ipc_voucher_release(old_voucher); @@ -3296,12 +3099,44 @@ thread_set_allocation_name(kern_allocation_name_t new_name) return ret; } +void * +thread_iokit_tls_get(uint32_t index) +{ + assert(index < THREAD_SAVE_IOKIT_TLS_COUNT); + return current_thread()->saved.iokit.tls[index]; +} + +void +thread_iokit_tls_set(uint32_t index, void * data) +{ + assert(index < THREAD_SAVE_IOKIT_TLS_COUNT); + current_thread()->saved.iokit.tls[index] = data; +} + uint64_t thread_get_last_wait_duration(thread_t thread) { return thread->last_made_runnable_time - thread->last_run_time; } +integer_t +thread_kern_get_pri(thread_t thr) +{ + return thr->base_pri; +} + +void +thread_kern_set_pri(thread_t thr, integer_t pri) +{ + sched_set_kernel_thread_priority(thr, pri); +} + +integer_t +thread_kern_get_kernel_maxpri(void) +{ + return MAXPRI_KERNEL; +} + #if CONFIG_DTRACE uint32_t dtrace_get_thread_predcache(thread_t thread) @@ -3343,11 +3178,11 @@ dtrace_get_thread_tracing(thread_t thread) } } -boolean_t -dtrace_get_thread_reentering(thread_t thread) +uint16_t +dtrace_get_thread_inprobe(thread_t thread) { if (thread != THREAD_NULL) { - return (thread->options & TH_OPT_DTRACE) ? TRUE : FALSE; + return thread->t_dtrace_inprobe; } else { return 0; } @@ -3371,6 +3206,14 @@ kasan_get_thread_data(thread_t thread) } #endif +#if CONFIG_KSANCOV +void ** +__sanitizer_get_thread_data(thread_t thread) +{ + return &thread->ksancov_data; +} +#endif + int64_t dtrace_calc_thread_recent_vtime(thread_t thread) { @@ -3413,14 +3256,10 @@ dtrace_set_thread_tracing(thread_t thread, int64_t accum) } void -dtrace_set_thread_reentering(thread_t thread, boolean_t vbool) +dtrace_set_thread_inprobe(thread_t thread, uint16_t inprobe) { if (thread != THREAD_NULL) { - if (vbool) { - thread->options |= TH_OPT_DTRACE; - } else { - thread->options &= (~TH_OPT_DTRACE); - } + thread->t_dtrace_inprobe = inprobe; } } @@ -3439,7 +3278,14 @@ dtrace_set_thread_recover(thread_t thread, vm_offset_t recover) vm_offset_t dtrace_sign_and_set_thread_recover(thread_t thread, vm_offset_t recover) { +#if defined(HAS_APPLE_PAC) + return dtrace_set_thread_recover(thread, + (vm_address_t)ptrauth_sign_unauthenticated((void *)recover, + ptrauth_key_function_pointer, + ptrauth_blend_discriminator(&thread->recover, PAC_DISCRIMINATOR_RECOVER))); +#else /* defined(HAS_APPLE_PAC) */ return dtrace_set_thread_recover(thread, recover); +#endif /* defined(HAS_APPLE_PAC) */ } void diff --git a/osfmk/kern/thread.h b/osfmk/kern/thread.h index 39b9c4fcf..833466376 100644 --- a/osfmk/kern/thread.h +++ b/osfmk/kern/thread.h @@ -125,6 +125,7 @@ #include #include #include +#include #include #include @@ -132,6 +133,7 @@ #include +#include #include #include @@ -172,6 +174,7 @@ struct thread { union { queue_chain_t runq_links; /* run queue links */ queue_chain_t wait_links; /* wait queue links */ + struct mpsc_queue_chain mpsc_links; /* thread daemon mpsc links */ struct priority_queue_entry wait_prioq_links; /* priority ordered waitq links */ }; @@ -181,17 +184,29 @@ struct thread { struct waitq *waitq; /* wait queue this thread is enqueued on */ struct turnstile *turnstile; /* thread's turnstile, protected by primitives interlock */ void *inheritor; /* inheritor of the primitive the thread will block on */ - struct priority_queue inheritor_queue; /* Inheritor queue */ + struct priority_queue sched_inheritor_queue; /* Inheritor queue for kernel promotion */ + struct priority_queue base_inheritor_queue; /* Inheritor queue for user promotion */ + +#if CONFIG_SCHED_CLUTCH + /* + * In the clutch scheduler, the threads are maintained in runqs at the clutch_bucket + * level (clutch_bucket defines a unique thread group and scheduling bucket pair). In + * order to determine the priority of the clutch bucket as a whole, it is necessary to + * find the highest thread in it. The thread could be present in the clutch bucket due + * to its base_pri or its promoted pri. This link is used to maintain that queue. + */ + struct priority_queue_entry sched_clutchpri_link; + +#endif /* CONFIG_SCHED_CLUTCH */ /* Data updated during assert_wait/thread_wakeup */ #if __SMP__ - decl_simple_lock_data(, sched_lock) /* scheduling lock (thread_lock()) */ - decl_simple_lock_data(, wake_lock) /* for thread stop / wait (wake_lock()) */ + decl_simple_lock_data(, sched_lock); /* scheduling lock (thread_lock()) */ + decl_simple_lock_data(, wake_lock); /* for thread stop / wait (wake_lock()) */ #endif integer_t options; /* options set by thread itself */ #define TH_OPT_INTMASK 0x0003 /* interrupt / abort level */ #define TH_OPT_VMPRIV 0x0004 /* may allocate reserved memory */ -#define TH_OPT_DTRACE 0x0008 /* executing under dtrace_probe */ #define TH_OPT_SYSTEM_CRITICAL 0x0010 /* Thread must always be allowed to run - even under heavy load */ #define TH_OPT_PROC_CPULIMIT 0x0020 /* Thread has a task-wide CPU limit applied to it */ #define TH_OPT_PRVT_CPULIMIT 0x0040 /* Thread has a thread-private CPU limit applied to it */ @@ -220,6 +235,10 @@ struct thread { struct kasan_thread_data kasan_data; #endif +#if CONFIG_KSANCOV + void *ksancov_data; +#endif + /* Thread state: */ int state; /* @@ -262,7 +281,7 @@ struct thread { /* unused TH_SFLAG_PRI_UPDATE 0x0100 */ #define TH_SFLAG_EAGERPREEMPT 0x0200 /* Any preemption of this thread should be treated as if AST_URGENT applied */ #define TH_SFLAG_RW_PROMOTED 0x0400 /* promote reason: blocking with RW lock held */ -/* unused TH_SFLAG_THROTTLE_DEMOTED 0x0800 */ +#define TH_SFLAG_BASE_PRI_FROZEN 0x0800 /* (effective) base_pri is frozen */ #define TH_SFLAG_WAITQ_PROMOTED 0x1000 /* promote reason: waitq wakeup (generally for IPC receive) */ @@ -274,7 +293,8 @@ struct thread { #define TH_SFLAG_RW_PROMOTED_BIT (10) /* 0x400 */ int16_t sched_pri; /* scheduled (current) priority */ - int16_t base_pri; /* base priority */ + int16_t base_pri; /* effective base priority (equal to req_base_pri unless TH_SFLAG_BASE_PRI_FROZEN) */ + int16_t req_base_pri; /* requested base priority */ int16_t max_priority; /* copy of max base priority */ int16_t task_priority; /* copy of task base priority */ int16_t promotion_priority; /* priority thread is currently promoted to */ @@ -285,16 +305,14 @@ struct thread { #endif #endif - int16_t promotions; /* level of promotion */ int iotier_override; /* atomic operations to set, cleared on ret to user */ - struct os_refcnt ref_count; /* number of references to me */ + os_refcnt_t ref_count; /* number of references to me */ lck_mtx_t* waiting_for_mutex; /* points to mutex we're waiting for until we acquire it */ uint32_t rwlock_count; /* Number of lck_rw_t locks held by thread */ integer_t importance; /* task-relative importance */ - uint32_t was_promoted_on_wakeup; /* thread promoted on wakeup to acquire mutex */ /* Priority depression expiration */ integer_t depress_timer_active; @@ -412,6 +430,10 @@ struct thread { kern_return_t result; /* primary result */ mach_msg_continue_t continuation; } sema; + struct { +#define THREAD_SAVE_IOKIT_TLS_COUNT 8 + void *tls[THREAD_SAVE_IOKIT_TLS_COUNT]; + } iokit; } saved; /* Only user threads can cause guard exceptions, only kernel threads can be thread call threads */ @@ -456,7 +478,7 @@ struct thread { boolean_t pmap_footprint_suspended; #endif /* DEVELOPMENT || DEBUG */ - decl_lck_mtx_data(, mutex) + decl_lck_mtx_data(, mutex); /* Pending thread ast(s) */ @@ -484,8 +506,9 @@ struct thread { #endif #if CONFIG_DTRACE - uint32_t t_dtrace_flags; /* DTrace thread states */ + uint16_t t_dtrace_flags; /* DTrace thread states */ #define TH_DTRACE_EXECSUCCESS 0x01 + uint16_t t_dtrace_inprobe; /* Executing under dtrace_probe */ uint32_t t_dtrace_predcache; /* DTrace per thread predicate value hint */ int64_t t_dtrace_tracing; /* Thread time under dtrace_probe() */ int64_t t_dtrace_vtime; @@ -498,10 +521,11 @@ struct thread { uint64_t t_page_creation_throttled_hard; uint64_t t_page_creation_throttled_soft; #endif /* DEVELOPMENT || DEBUG */ + int t_pagein_error; /* for vm_fault(), holds error from vnop_pagein() */ #ifdef KPERF -/* The high 7 bits are the number of frames to sample of a user callstack. */ -#define T_KPERF_CALLSTACK_DEPTH_OFFSET (25) +/* The high 8 bits are the number of frames to sample of a user callstack. */ +#define T_KPERF_CALLSTACK_DEPTH_OFFSET (24) #define T_KPERF_SET_CALLSTACK_DEPTH(DEPTH) (((uint32_t)(DEPTH)) << T_KPERF_CALLSTACK_DEPTH_OFFSET) #define T_KPERF_GET_CALLSTACK_DEPTH(FLAGS) ((FLAGS) >> T_KPERF_CALLSTACK_DEPTH_OFFSET) #endif @@ -559,10 +583,9 @@ struct thread { user_addr_t override_resource; } *overrides; - uint32_t ipc_overrides; - _Atomic uint32_t kqwl_owning_count; - uint32_t sync_ipc_overrides; + uint32_t kevent_overrides; uint16_t user_promotion_basepri; + uint16_t kern_promotion_schedpri; _Atomic uint16_t kevent_ast_bits; io_stat_info_t thread_io_stats; /* per-thread I/O statistics */ @@ -576,10 +599,16 @@ struct thread { uint32_t thread_timer_wakeups_bin_1; uint32_t thread_timer_wakeups_bin_2; uint16_t thread_tag; + /* + * callout_* fields are only set for thread call threads whereas guard_exc_fatal is set + * by user threads on themselves while taking a guard exception. So it's okay for them to + * share this bitfield. + */ uint16_t callout_woken_from_icontext:1, callout_woken_from_platform_idle:1, callout_woke_thread:1, - thread_bitfield_unused:13; + guard_exc_fatal:1, + thread_bitfield_unused:12; mach_port_name_t ith_voucher_name; ipc_voucher_t ith_voucher; @@ -596,6 +625,7 @@ struct thread { turnstile_update_flags_t inheritor_flags; /* inheritor flags for inheritor field */ block_hint_t pending_block_hint; block_hint_t block_hint; /* What type of primitive last caused us to block. */ + integer_t decompressions; /* Per-thread decompressions counter to be added to per-task decompressions counter */ }; #define ith_state saved.receive.state @@ -657,9 +687,6 @@ MACRO_END extern void thread_deallocate( thread_t thread); -extern void thread_deallocate_safe( - thread_t thread); - extern void thread_inspect_deallocate( thread_inspect_t thread); @@ -690,9 +717,6 @@ extern void thread_copy_resource_info( extern void thread_terminate_crashed_threads(void); -extern void turnstile_deallocate_enqueue( - struct turnstile *turnstile); - extern void thread_stack_enqueue( thread_t thread); @@ -702,7 +726,7 @@ extern void thread_hold( extern void thread_release( thread_t thread); -extern void thread_corpse_continue(void); +extern void thread_corpse_continue(void) __dead2; extern boolean_t thread_is_active(thread_t thread); @@ -789,7 +813,6 @@ extern thread_t machine_switch_context( extern void machine_load_context( thread_t thread) __attribute__((noreturn)); - extern kern_return_t machine_thread_state_initialize( thread_t thread); @@ -799,6 +822,16 @@ extern kern_return_t machine_thread_set_state( thread_state_t state, mach_msg_type_number_t count); +extern mach_vm_address_t machine_thread_pc( + thread_t thread); + +extern void machine_thread_reset_pc( + thread_t thread, + mach_vm_address_t pc); + +extern boolean_t machine_thread_on_core( + thread_t thread); + extern kern_return_t machine_thread_get_state( thread_t thread, thread_flavor_t flavor, @@ -866,7 +899,7 @@ vm_offset_t max_valid_stack_address(void); static inline uint16_t thread_set_tag_internal(thread_t thread, uint16_t tag) { - return __sync_fetch_and_or(&thread->thread_tag, tag); + return os_atomic_or_orig(&thread->thread_tag, tag, relaxed); } static inline uint16_t @@ -889,7 +922,7 @@ extern void thread_mtx_lock(thread_t thread); extern void thread_mtx_unlock(thread_t thread); -extern thread_t current_thread(void); +extern thread_t current_thread(void) __attribute__((const)); extern void thread_reference( thread_t thread); @@ -897,6 +930,19 @@ extern void thread_reference( extern void thread_deallocate( thread_t thread); +#if BSD_KERNEL_PRIVATE +/* Duplicated from osfmk/kern/ipc_tt.h */ +__options_decl(port_to_thread_options_t, uint32_t, { + PORT_TO_THREAD_NONE = 0x0000, + PORT_TO_THREAD_IN_CURRENT_TASK = 0x0001, + PORT_TO_THREAD_NOT_CURRENT_THREAD = 0x0002, +}); + +extern thread_t port_name_to_thread( + mach_port_name_t port_name, + port_to_thread_options_t options); +#endif /* BSD_KERNEL_PRIVATE */ + __END_DECLS #endif /* MACH_KERNEL_PRIVATE */ @@ -905,22 +951,21 @@ __END_DECLS __BEGIN_DECLS -extern void thread_starts_owning_workloop( - thread_t thread); - -extern void thread_ends_owning_workloop( - thread_t thread); - -extern uint32_t thread_owned_workloops_count( +extern void thread_deallocate_safe( thread_t thread); - extern uint64_t thread_dispatchqaddr( thread_t thread); extern uint64_t thread_rettokern_addr( thread_t thread); +extern integer_t thread_kern_get_pri(thread_t thr) __attribute__((const)); + +extern void thread_kern_set_pri(thread_t thr, integer_t pri); + +extern integer_t thread_kern_get_kernel_maxpri(void) __attribute__((const)); + __END_DECLS #endif /* KERNEL_PRIVATE */ @@ -1114,6 +1159,7 @@ extern int thread_task_has_ldt(thread_t); #endif extern void *get_bsdthread_info(thread_t); extern void set_bsdthread_info(thread_t, void *); +extern void set_thread_pagein_error(thread_t, int); extern void *uthread_alloc(task_t, thread_t, int); extern event_t workq_thread_init_and_wq_lock(task_t, thread_t); // bsd/pthread/ extern void uthread_cleanup_name(void *uthread); @@ -1143,13 +1189,13 @@ extern void act_set_io_telemetry_ast(thread_t); extern uint32_t dtrace_get_thread_predcache(thread_t); extern int64_t dtrace_get_thread_vtime(thread_t); extern int64_t dtrace_get_thread_tracing(thread_t); -extern boolean_t dtrace_get_thread_reentering(thread_t); +extern uint16_t dtrace_get_thread_inprobe(thread_t); extern int dtrace_get_thread_last_cpu_id(thread_t); extern vm_offset_t dtrace_get_kernel_stack(thread_t); extern void dtrace_set_thread_predcache(thread_t, uint32_t); extern void dtrace_set_thread_vtime(thread_t, int64_t); extern void dtrace_set_thread_tracing(thread_t, int64_t); -extern void dtrace_set_thread_reentering(thread_t, boolean_t); +extern void dtrace_set_thread_inprobe(thread_t, uint16_t); extern vm_offset_t dtrace_set_thread_recover(thread_t, vm_offset_t); extern vm_offset_t dtrace_sign_and_set_thread_recover(thread_t, vm_offset_t); extern void dtrace_thread_bootstrap(void); @@ -1182,7 +1228,7 @@ extern void mach_port_guard_ast(thread_t, extern void virt_memory_guard_ast(thread_t, mach_exception_code_t, mach_exception_subcode_t); extern void thread_guard_violation(thread_t, - mach_exception_code_t, mach_exception_subcode_t); + mach_exception_code_t, mach_exception_subcode_t, boolean_t); extern void thread_update_io_stats(thread_t, int size, int io_flags); extern kern_return_t thread_set_voucher_name(mach_port_name_t name); @@ -1191,22 +1237,6 @@ extern kern_return_t thread_get_current_voucher_origin_pid(int32_t *pid); extern void set_thread_rwlock_boost(void); extern void clear_thread_rwlock_boost(void); -/*! @function thread_has_thread_name - * @abstract Checks if a thread has a name. - * @discussion This function takes one input, a thread, and returns a boolean value indicating if that thread already has a name associated with it. - * @param th The thread to inspect. - * @result TRUE if the thread has a name, FALSE otherwise. - */ -extern boolean_t thread_has_thread_name(thread_t th); - -/*! @function thread_set_thread_name - * @abstract Set a thread's name. - * @discussion This function takes two input parameters: a thread to name, and the name to apply to the thread. The name will be attached to the thread in order to better identify the thread. - * @param th The thread to be named. - * @param name The name to apply to the thread. - */ -extern void thread_set_thread_name(thread_t th, const char* name); - extern void thread_enable_send_importance(thread_t thread, boolean_t enable); /* @@ -1268,6 +1298,21 @@ extern bool thread_get_no_smt(void); #endif /* XNU_KERNEL_PRIVATE */ +/*! @function thread_has_thread_name + * @abstract Checks if a thread has a name. + * @discussion This function takes one input, a thread, and returns a boolean value indicating if that thread already has a name associated with it. + * @param th The thread to inspect. + * @result TRUE if the thread has a name, FALSE otherwise. + */ +extern boolean_t thread_has_thread_name(thread_t th); + +/*! @function thread_set_thread_name + * @abstract Set a thread's name. + * @discussion This function takes two input parameters: a thread to name, and the name to apply to the thread. The name will be copied over to the thread in order to better identify the thread. If the name is longer than MAXTHREADNAMESIZE - 1, it will be truncated. + * @param th The thread to be named. + * @param name The name to apply to the thread. + */ +extern void thread_set_thread_name(thread_t th, const char* name); /*! @function kernel_thread_start * @abstract Create a kernel thread. @@ -1293,6 +1338,8 @@ extern ipc_port_t convert_thread_inspect_to_port(thread_inspect_t); extern boolean_t is_vm_privileged(void); extern boolean_t set_vm_privilege(boolean_t); extern kern_allocation_name_t thread_set_allocation_name(kern_allocation_name_t new_name); +extern void *thread_iokit_tls_get(uint32_t index); +extern void thread_iokit_tls_set(uint32_t index, void * data); #endif /* KERNEL_PRIVATE */ __END_DECLS diff --git a/osfmk/kern/thread_act.c b/osfmk/kern/thread_act.c index c93dda8e3..944d61d99 100644 --- a/osfmk/kern/thread_act.c +++ b/osfmk/kern/thread_act.c @@ -1132,12 +1132,32 @@ act_set_astbsd( void act_set_astkevent(thread_t thread, uint16_t bits) { - atomic_fetch_or(&thread->kevent_ast_bits, bits); + os_atomic_or(&thread->kevent_ast_bits, bits, relaxed); /* kevent AST shouldn't send immediate IPIs */ act_set_ast_async(thread, AST_KEVENT); } +uint16_t +act_clear_astkevent(thread_t thread, uint16_t bits) +{ + /* + * avoid the atomic operation if none of the bits is set, + * which will be the common case. + */ + uint16_t cur = os_atomic_load(&thread->kevent_ast_bits, relaxed); + if (cur & bits) { + cur = os_atomic_andnot_orig(&thread->kevent_ast_bits, bits, relaxed); + } + return cur & bits; +} + +void +act_set_ast_reset_pcs(thread_t thread) +{ + act_set_ast(thread, AST_RESET_PCS); +} + void act_set_kperf( thread_t thread) diff --git a/osfmk/kern/thread_call.c b/osfmk/kern/thread_call.c index 5f2676de7..7c8be9695 100644 --- a/osfmk/kern/thread_call.c +++ b/osfmk/kern/thread_call.c @@ -63,11 +63,11 @@ typedef enum { TCF_COUNT = 2, } thread_call_flavor_t; -typedef enum { +__options_decl(thread_call_group_flags_t, uint32_t, { TCG_NONE = 0x0, TCG_PARALLEL = 0x1, TCG_DEALLOC_ACTIVE = 0x2, -} thread_call_group_flags_t; +}); static struct thread_call_group { const char * tcg_name; diff --git a/osfmk/kern/thread_group.c b/osfmk/kern/thread_group.c index f67111223..49f212298 100644 --- a/osfmk/kern/thread_group.c +++ b/osfmk/kern/thread_group.c @@ -40,6 +40,7 @@ #include #include #include +#include #if CONFIG_EMBEDDED diff --git a/osfmk/kern/thread_policy.c b/osfmk/kern/thread_policy.c index 75f81a456..3ba515bef 100644 --- a/osfmk/kern/thread_policy.c +++ b/osfmk/kern/thread_policy.c @@ -128,7 +128,7 @@ static void proc_set_thread_policy_spinlocked(thread_t thread, int category, int flavor, int value, int value2, task_pend_token_t pend_token); static void -thread_set_requested_policy_spinlocked(thread_t thread, int category, int flavor, int value, int value2); +thread_set_requested_policy_spinlocked(thread_t thread, int category, int flavor, int value, int value2, task_pend_token_t pend_token); static int thread_get_requested_policy_spinlocked(thread_t thread, int category, int flavor, int* value2); @@ -644,6 +644,50 @@ unlock: return kr; } +void +thread_freeze_base_pri(thread_t thread) +{ + assert(thread == current_thread()); + + spl_t s = splsched(); + thread_lock(thread); + + assert((thread->sched_flags & TH_SFLAG_BASE_PRI_FROZEN) == 0); + thread->sched_flags |= TH_SFLAG_BASE_PRI_FROZEN; + + thread_unlock(thread); + splx(s); +} + +bool +thread_unfreeze_base_pri(thread_t thread) +{ + assert(thread == current_thread()); + integer_t base_pri; + ast_t ast = 0; + + spl_t s = splsched(); + thread_lock(thread); + + assert(thread->sched_flags & TH_SFLAG_BASE_PRI_FROZEN); + thread->sched_flags &= ~TH_SFLAG_BASE_PRI_FROZEN; + + base_pri = thread->req_base_pri; + if (base_pri != thread->base_pri) { + /* + * This function returns "true" if the base pri change + * is the most likely cause for the preemption. + */ + sched_set_thread_base_priority(thread, base_pri); + ast = ast_peek(AST_PREEMPT); + } + + thread_unlock(thread); + splx(s); + + return ast != 0; +} + uint8_t thread_workq_pri_for_qos(thread_qos_t qos) { @@ -938,6 +982,9 @@ thread_update_qos_cpu_time(thread_t thread) * * Called with thread_lock and thread mutex held. */ +extern thread_t vm_pageout_scan_thread; +extern boolean_t vps_dynamic_priority_enabled; + void thread_recompute_priority( thread_t thread) @@ -1301,7 +1348,7 @@ thread_policy_get( info->thps_user_promotions = 0; info->thps_user_promotion_basepri = thread->user_promotion_basepri; - info->thps_ipc_overrides = thread->ipc_overrides; + info->thps_ipc_overrides = thread->kevent_overrides; proc_get_thread_policy_bitfield(thread, info); @@ -1464,7 +1511,8 @@ thread_policy_update_internal_spinlocked(thread_t thread, boolean_t recompute_pr if (requested.thrp_qos != THREAD_QOS_UNSPECIFIED) { next_qos = MAX(requested.thrp_qos_override, next_qos); next_qos = MAX(requested.thrp_qos_promote, next_qos); - next_qos = MAX(requested.thrp_qos_ipc_override, next_qos); + next_qos = MAX(requested.thrp_qos_kevent_override, next_qos); + next_qos = MAX(requested.thrp_qos_wlsvc_override, next_qos); next_qos = MAX(requested.thrp_qos_workq_override, next_qos); } @@ -1658,6 +1706,8 @@ thread_policy_update_internal_spinlocked(thread_t thread, boolean_t recompute_pr pend_token->tpt_update_thread_sfi = 1; } + integer_t old_base_pri = thread->base_pri; + /* * Step 5: * Update other subsystems as necessary if something has changed @@ -1672,6 +1722,20 @@ thread_policy_update_internal_spinlocked(thread_t thread, boolean_t recompute_pr recompute_priority) { thread_recompute_priority(thread); } + + /* + * Check if the thread is waiting on a turnstile and needs priority propagation. + */ + if (pend_token->tpt_update_turnstile && + ((old_base_pri == thread->base_pri) || + !thread_get_waiting_turnstile(thread))) { + /* + * Reset update turnstile pend token since either + * the thread priority did not change or thread is + * not blocked on a turnstile. + */ + pend_token->tpt_update_turnstile = 0; + } } @@ -1750,6 +1814,10 @@ thread_policy_update_complete_unlocked(thread_t thread, task_pend_token_t pend_t if (pend_token->tpt_update_thread_sfi) { sfi_reevaluate(thread); } + + if (pend_token->tpt_update_turnstile) { + turnstile_update_thread_priority_chain(thread); + } } /* @@ -1790,7 +1858,7 @@ proc_set_thread_policy_spinlocked(thread_t thread, thread_tid(thread), threquested_0(thread), threquested_1(thread), value, 0); - thread_set_requested_policy_spinlocked(thread, category, flavor, value, value2); + thread_set_requested_policy_spinlocked(thread, category, flavor, value, value2, pend_token); thread_policy_update_spinlocked(thread, FALSE, pend_token); @@ -1805,10 +1873,11 @@ proc_set_thread_policy_spinlocked(thread_t thread, */ static void thread_set_requested_policy_spinlocked(thread_t thread, - int category, - int flavor, - int value, - int value2) + int category, + int flavor, + int value, + int value2, + task_pend_token_t pend_token) { int tier, passive; @@ -1869,26 +1938,24 @@ thread_set_requested_policy_spinlocked(thread_t thread, requested.thrp_through_qos = value; break; - case TASK_POLICY_QOS: - assert(category == TASK_POLICY_ATTRIBUTE); - requested.thrp_qos = value; - break; - case TASK_POLICY_QOS_OVERRIDE: assert(category == TASK_POLICY_ATTRIBUTE); requested.thrp_qos_override = value; + pend_token->tpt_update_turnstile = 1; break; case TASK_POLICY_QOS_AND_RELPRIO: assert(category == TASK_POLICY_ATTRIBUTE); requested.thrp_qos = value; requested.thrp_qos_relprio = value2; + pend_token->tpt_update_turnstile = 1; DTRACE_BOOST3(qos_set, uint64_t, thread->thread_id, int, requested.thrp_qos, int, requested.thrp_qos_relprio); break; case TASK_POLICY_QOS_WORKQ_OVERRIDE: assert(category == TASK_POLICY_ATTRIBUTE); requested.thrp_qos_workq_override = value; + pend_token->tpt_update_turnstile = 1; break; case TASK_POLICY_QOS_PROMOTE: @@ -1896,9 +1963,16 @@ thread_set_requested_policy_spinlocked(thread_t thread, requested.thrp_qos_promote = value; break; - case TASK_POLICY_QOS_IPC_OVERRIDE: + case TASK_POLICY_QOS_KEVENT_OVERRIDE: assert(category == TASK_POLICY_ATTRIBUTE); - requested.thrp_qos_ipc_override = value; + requested.thrp_qos_kevent_override = value; + pend_token->tpt_update_turnstile = 1; + break; + + case TASK_POLICY_QOS_SERVICER_OVERRIDE: + assert(category == TASK_POLICY_ATTRIBUTE); + requested.thrp_qos_wlsvc_override = value; + pend_token->tpt_update_turnstile = 1; break; case TASK_POLICY_TERMINATED: @@ -2023,9 +2097,13 @@ thread_get_requested_policy_spinlocked(thread_t thread, assert(category == TASK_POLICY_ATTRIBUTE); value = requested.thrp_qos_promote; break; - case TASK_POLICY_QOS_IPC_OVERRIDE: + case TASK_POLICY_QOS_KEVENT_OVERRIDE: + assert(category == TASK_POLICY_ATTRIBUTE); + value = requested.thrp_qos_kevent_override; + break; + case TASK_POLICY_QOS_SERVICER_OVERRIDE: assert(category == TASK_POLICY_ATTRIBUTE); - value = requested.thrp_qos_ipc_override; + value = requested.thrp_qos_wlsvc_override; break; case TASK_POLICY_TERMINATED: assert(category == TASK_POLICY_ATTRIBUTE); @@ -2644,10 +2722,9 @@ void proc_thread_qos_deallocate(thread_t thread) { /* This thread must have no more IPC overrides. */ - assert(thread->ipc_overrides == 0); - assert(thread->requested_policy.thrp_qos_ipc_override == THREAD_QOS_UNSPECIFIED); - assert(thread->sync_ipc_overrides == 0); - assert(thread->requested_policy.thrp_qos_sync_ipc_override == THREAD_QOS_UNSPECIFIED); + assert(thread->kevent_overrides == 0); + assert(thread->requested_policy.thrp_qos_kevent_override == THREAD_QOS_UNSPECIFIED); + assert(thread->requested_policy.thrp_qos_wlsvc_override == THREAD_QOS_UNSPECIFIED); /* * Clear out any lingering override objects. @@ -2688,7 +2765,7 @@ task_set_main_thread_qos(task_t task, thread_t thread) int primordial_qos = task_compute_main_thread_qos(task); - proc_set_thread_policy_locked(thread, TASK_POLICY_ATTRIBUTE, TASK_POLICY_QOS, + proc_set_thread_policy_locked(thread, TASK_POLICY_ATTRIBUTE, TASK_POLICY_QOS_AND_RELPRIO, primordial_qos, 0, &pend_token); thread_mtx_unlock(thread); @@ -2719,6 +2796,46 @@ task_get_default_manager_qos(task_t task) return primordial_qos; } +/* + * Check if the kernel promotion on thread has changed + * and apply it. + * + * thread locked on entry and exit + */ +boolean_t +thread_recompute_kernel_promotion_locked(thread_t thread) +{ + boolean_t needs_update = FALSE; + int kern_promotion_schedpri = thread_get_inheritor_turnstile_sched_priority(thread); + + /* + * For now just assert that kern_promotion_schedpri <= MAXPRI_PROMOTE. + * TURNSTILE_KERNEL_PROMOTE adds threads on the waitq already capped to MAXPRI_PROMOTE + * and propagates the priority through the chain with the same cap, because as of now it does + * not differenciate on the kernel primitive. + * + * If this assumption will change with the adoption of a kernel primitive that does not + * cap the when adding/propagating, + * then here is the place to put the generic cap for all kernel primitives + * (converts the assert to kern_promotion_schedpri = MIN(priority, MAXPRI_PROMOTE)) + */ + assert(kern_promotion_schedpri <= MAXPRI_PROMOTE); + + if (kern_promotion_schedpri != thread->kern_promotion_schedpri) { + KDBG(MACHDBG_CODE( + DBG_MACH_SCHED, MACH_TURNSTILE_KERNEL_CHANGE) | DBG_FUNC_NONE, + thread_tid(thread), + kern_promotion_schedpri, + thread->kern_promotion_schedpri); + + needs_update = TRUE; + thread->kern_promotion_schedpri = kern_promotion_schedpri; + thread_recompute_sched_pri(thread, SETPRI_DEFAULT); + } + + return needs_update; +} + /* * Check if the user promotion on thread has changed * and apply it. @@ -2731,7 +2848,7 @@ thread_recompute_user_promotion_locked(thread_t thread) { boolean_t needs_update = FALSE; struct task_pend_token pend_token = {}; - int user_promotion_basepri = MIN(thread_get_inheritor_turnstile_priority(thread), MAXPRI_USER); + int user_promotion_basepri = MIN(thread_get_inheritor_turnstile_base_priority(thread), MAXPRI_USER); int old_base_pri = thread->base_pri; thread_qos_t qos_promotion; @@ -2745,6 +2862,11 @@ thread_recompute_user_promotion_locked(thread_t thread) user_promotion_basepri, thread->user_promotion_basepri, 0, 0); + KDBG(MACHDBG_CODE( + DBG_MACH_SCHED, MACH_TURNSTILE_USER_CHANGE) | DBG_FUNC_NONE, + thread_tid(thread), + user_promotion_basepri, + thread->user_promotion_basepri); } /* Update the user promotion base pri */ @@ -2791,8 +2913,8 @@ thread_user_promotion_qos_for_pri(int priority) } /* - * Set the thread's QoS IPC override - * Owned by the IPC subsystem + * Set the thread's QoS Kevent override + * Owned by the Kevent subsystem * * May be called with spinlocks held, but not spinlocks * that may deadlock against the thread lock, the throttle lock, or the SFI lock. @@ -2802,7 +2924,7 @@ thread_user_promotion_qos_for_pri(int priority) * Before the thread is deallocated, there must be 0 remaining overrides. */ static void -thread_ipc_override(thread_t thread, +thread_kevent_override(thread_t thread, uint32_t qos_override, boolean_t is_new_override) { @@ -2812,13 +2934,13 @@ thread_ipc_override(thread_t thread, spl_t s = splsched(); thread_lock(thread); - uint32_t old_override = thread->requested_policy.thrp_qos_ipc_override; + uint32_t old_override = thread->requested_policy.thrp_qos_kevent_override; assert(qos_override > THREAD_QOS_UNSPECIFIED); assert(qos_override < THREAD_QOS_LAST); if (is_new_override) { - if (thread->ipc_overrides++ == 0) { + if (thread->kevent_overrides++ == 0) { /* This add is the first override for this thread */ assert(old_override == THREAD_QOS_UNSPECIFIED); } else { @@ -2827,7 +2949,7 @@ thread_ipc_override(thread_t thread, } } else { /* There must be at least one override (the previous add call) in effect */ - assert(thread->ipc_overrides > 0); + assert(thread->kevent_overrides > 0); assert(old_override > THREAD_QOS_UNSPECIFIED); } @@ -2835,7 +2957,7 @@ thread_ipc_override(thread_t thread, * We can't allow lowering if there are several IPC overrides because * the caller can't possibly know the whole truth */ - if (thread->ipc_overrides == 1) { + if (thread->kevent_overrides == 1) { needs_update = qos_override != old_override; } else { needs_update = qos_override > old_override; @@ -2843,7 +2965,7 @@ thread_ipc_override(thread_t thread, if (needs_update) { proc_set_thread_policy_spinlocked(thread, TASK_POLICY_ATTRIBUTE, - TASK_POLICY_QOS_IPC_OVERRIDE, + TASK_POLICY_QOS_KEVENT_OVERRIDE, qos_override, 0, &pend_token); assert(pend_token.tpt_update_sockets == 0); } @@ -2855,37 +2977,35 @@ thread_ipc_override(thread_t thread, } void -thread_add_ipc_override(thread_t thread, - uint32_t qos_override) +thread_add_kevent_override(thread_t thread, uint32_t qos_override) { - thread_ipc_override(thread, qos_override, TRUE); + thread_kevent_override(thread, qos_override, TRUE); } void -thread_update_ipc_override(thread_t thread, - uint32_t qos_override) +thread_update_kevent_override(thread_t thread, uint32_t qos_override) { - thread_ipc_override(thread, qos_override, FALSE); + thread_kevent_override(thread, qos_override, FALSE); } void -thread_drop_ipc_override(thread_t thread) +thread_drop_kevent_override(thread_t thread) { struct task_pend_token pend_token = {}; spl_t s = splsched(); thread_lock(thread); - assert(thread->ipc_overrides > 0); + assert(thread->kevent_overrides > 0); - if (--thread->ipc_overrides == 0) { + if (--thread->kevent_overrides == 0) { /* * There are no more overrides for this thread, so we should * clear out the saturated override value */ proc_set_thread_policy_spinlocked(thread, TASK_POLICY_ATTRIBUTE, - TASK_POLICY_QOS_IPC_OVERRIDE, THREAD_QOS_UNSPECIFIED, + TASK_POLICY_QOS_KEVENT_OVERRIDE, THREAD_QOS_UNSPECIFIED, 0, &pend_token); } @@ -2895,6 +3015,69 @@ thread_drop_ipc_override(thread_t thread) thread_policy_update_complete_unlocked(thread, &pend_token); } +/* + * Set the thread's QoS Workloop Servicer override + * Owned by the Kevent subsystem + * + * May be called with spinlocks held, but not spinlocks + * that may deadlock against the thread lock, the throttle lock, or the SFI lock. + * + * One 'add' must be balanced by one 'drop'. + * Between 'add' and 'drop', the overide QoS value may be updated with an 'update'. + * Before the thread is deallocated, there must be 0 remaining overrides. + */ +static void +thread_servicer_override(thread_t thread, + uint32_t qos_override, + boolean_t is_new_override) +{ + struct task_pend_token pend_token = {}; + + spl_t s = splsched(); + thread_lock(thread); + + if (is_new_override) { + assert(!thread->requested_policy.thrp_qos_wlsvc_override); + } else { + assert(thread->requested_policy.thrp_qos_wlsvc_override); + } + + proc_set_thread_policy_spinlocked(thread, TASK_POLICY_ATTRIBUTE, + TASK_POLICY_QOS_SERVICER_OVERRIDE, + qos_override, 0, &pend_token); + + thread_unlock(thread); + splx(s); + + assert(pend_token.tpt_update_sockets == 0); + thread_policy_update_complete_unlocked(thread, &pend_token); +} + +void +thread_add_servicer_override(thread_t thread, uint32_t qos_override) +{ + assert(qos_override > THREAD_QOS_UNSPECIFIED); + assert(qos_override < THREAD_QOS_LAST); + + thread_servicer_override(thread, qos_override, TRUE); +} + +void +thread_update_servicer_override(thread_t thread, uint32_t qos_override) +{ + assert(qos_override > THREAD_QOS_UNSPECIFIED); + assert(qos_override < THREAD_QOS_LAST); + + thread_servicer_override(thread, qos_override, FALSE); +} + +void +thread_drop_servicer_override(thread_t thread) +{ + thread_servicer_override(thread, THREAD_QOS_UNSPECIFIED, FALSE); +} + + /* Get current requested qos / relpri, may be called from spinlock context */ thread_qos_t thread_get_requested_qos(thread_t thread, int *relpri) diff --git a/osfmk/kern/timer_queue.h b/osfmk/kern/timer_queue.h index 08c841925..cba8edd6d 100644 --- a/osfmk/kern/timer_queue.h +++ b/osfmk/kern/timer_queue.h @@ -176,6 +176,8 @@ extern int setPop(uint64_t time); extern void timer_resync_deadlines(void); +extern void timer_queue_expire_local(void *arg); + extern void timer_set_deadline(uint64_t deadline); extern void quantum_timer_set_deadline(uint64_t deadline); diff --git a/osfmk/kern/tlock.c b/osfmk/kern/tlock.c index 22c57cc56..1c75a2ecc 100644 --- a/osfmk/kern/tlock.c +++ b/osfmk/kern/tlock.c @@ -95,6 +95,23 @@ tlock_mark_owned(lck_ticket_t *tlock, thread_t cthread) __c11_atomic_store((_Atomic thread_t *)&tlock->lck_owner, cthread, __ATOMIC_RELAXED); } +#if __arm__ || __arm64__ +__unused static uint8_t +load_exclusive_acquire8(uint8_t *target) +{ + uint8_t value; +#if __arm__ + value = __builtin_arm_ldrex(target); + __c11_atomic_thread_fence(__ATOMIC_ACQUIRE); +#else + value = __builtin_arm_ldaex(target); // ldaxr + /* "Compiler barrier", no barrier instructions are emitted */ + atomic_signal_fence(memory_order_acquire); +#endif + return value; +} +#endif + /* On contention, poll for ownership * Returns when the current ticket is observed equal to "mt" */ @@ -117,7 +134,7 @@ tlock_contended(uint8_t *tp, uint8_t mt, lck_ticket_t *tlock, thread_t cthread) * TODO: determine specific micro-architectures * which benefit, modern CPUs may not */ - clear_exclusive(); + os_atomic_clear_exclusive(); tlock_mark_owned(tlock, cthread); return; } diff --git a/osfmk/kern/trustcache.h b/osfmk/kern/trustcache.h index 355039d84..7017c7fa8 100644 --- a/osfmk/kern/trustcache.h +++ b/osfmk/kern/trustcache.h @@ -35,6 +35,7 @@ #include +#ifdef PLATFORM_BridgeOS /* Version 0 trust caches: No defined sorting order (thus only suitable for small trust caches). * Used for loadable trust caches only, until phasing out support. */ typedef uint8_t trust_cache_hash0[CS_CDHASH_LEN]; @@ -44,6 +45,7 @@ struct trust_cache_module0 { uint32_t num_hashes; trust_cache_hash0 hashes[]; } __attribute__((__packed__)); +#endif /* Version 1 trust caches: Always sorted by cdhash, added hash type and flags field. @@ -65,6 +67,22 @@ struct trust_cache_module1 { // Trust Cache Entry Flags #define CS_TRUST_CACHE_AMFID 0x1 // valid cdhash for amfid +/* Trust Cache lookup functions return their result as a 32bit value + * comprised of subfields, for straightforward passing through layers. + * + * Format: + * + * 0xXXCCBBAA + * + * AA: 0-7: lookup result + * bit 0: TC_LOOKUP_FOUND: set if any entry found + * bit 1: (obsolete) TC_LOOKUP_FALLBACK: set if found in legacy static trust cache + * bit 2-7: reserved + * BB: 8-15: entry flags pass-through, see "Trust Cache Entry Flags" above + * CC: 16-23: code directory hash type of entry, see CS_HASHTYPE_* in cs_blobs.h + * XX: 24-31: reserved + */ + #define TC_LOOKUP_HASH_TYPE_SHIFT 16 #define TC_LOOKUP_HASH_TYPE_MASK 0xff0000L; #define TC_LOOKUP_FLAGS_SHIFT 8 @@ -73,7 +91,6 @@ struct trust_cache_module1 { #define TC_LOOKUP_RESULT_MASK 0xffL #define TC_LOOKUP_FOUND 1 -// #define TC_LOOKUP_FALLBACK 2 /* obsolete with removal of legacy static trust caches */ #ifdef XNU_KERNEL_PRIVATE diff --git a/osfmk/kern/turnstile.c b/osfmk/kern/turnstile.c index ea58c5477..6375a3704 100644 --- a/osfmk/kern/turnstile.c +++ b/osfmk/kern/turnstile.c @@ -40,7 +40,7 @@ #include #include #include -#include +#include #include #include @@ -49,11 +49,12 @@ static zone_t turnstiles_zone; static int turnstile_max_hop; +static struct mpsc_daemon_queue turnstile_deallocate_queue; #define MAX_TURNSTILES (thread_max) #define TURNSTILES_CHUNK (THREAD_CHUNK) /* Global table for turnstile promote policy for all type of turnstiles */ -turnstile_promote_policy_t turnstile_promote_policy[TURNSTILE_TOTAL_TYPES] = { +static const turnstile_promote_policy_t turnstile_promote_policy[TURNSTILE_TOTAL_TYPES] = { [TURNSTILE_NONE] = TURNSTILE_PROMOTE_NONE, [TURNSTILE_KERNEL_MUTEX] = TURNSTILE_KERNEL_PROMOTE, [TURNSTILE_ULOCK] = TURNSTILE_USER_PROMOTE, @@ -62,6 +63,20 @@ turnstile_promote_policy_t turnstile_promote_policy[TURNSTILE_TOTAL_TYPES] = { [TURNSTILE_WORKLOOPS] = TURNSTILE_USER_IPC_PROMOTE, [TURNSTILE_WORKQS] = TURNSTILE_USER_IPC_PROMOTE, [TURNSTILE_KNOTE] = TURNSTILE_USER_IPC_PROMOTE, + [TURNSTILE_SLEEP_INHERITOR] = TURNSTILE_KERNEL_PROMOTE, +}; + +/* Global table for turnstile hash lock policy for all type of turnstiles */ +static const turnstile_hash_lock_policy_t turnstile_hash_lock_policy[TURNSTILE_TOTAL_TYPES] = { + [TURNSTILE_NONE] = TURNSTILE_HASH_LOCK_POLICY_NONE, + [TURNSTILE_KERNEL_MUTEX] = TURNSTILE_HASH_LOCK_POLICY_NONE, + [TURNSTILE_ULOCK] = TURNSTILE_HASH_LOCK_POLICY_NONE, + [TURNSTILE_PTHREAD_MUTEX] = TURNSTILE_HASH_LOCK_POLICY_NONE, + [TURNSTILE_SYNC_IPC] = TURNSTILE_HASH_LOCK_POLICY_NONE, + [TURNSTILE_WORKLOOPS] = TURNSTILE_HASH_LOCK_POLICY_NONE, + [TURNSTILE_WORKQS] = TURNSTILE_HASH_LOCK_POLICY_NONE, + [TURNSTILE_KNOTE] = TURNSTILE_HASH_LOCK_POLICY_NONE, + [TURNSTILE_SLEEP_INHERITOR] = (TURNSTILE_IRQ_UNSAFE_HASH | TURNSTILE_LOCKED_HASH), }; os_refgrp_decl(static, turnstile_refgrp, "turnstile", NULL); @@ -90,8 +105,7 @@ static struct turnstile_stats turnstile_boost_stats[TURNSTILE_MAX_HOP_DEFAULT] = static struct turnstile_stats turnstile_unboost_stats[TURNSTILE_MAX_HOP_DEFAULT] = {}; uint64_t thread_block_on_turnstile_count; uint64_t thread_block_on_regular_waitq_count; - -#endif +#endif /* DEVELOPMENT || DEBUG */ #ifndef max #define max(a, b) (((a) > (b)) ? (a) : (b)) @@ -161,6 +175,9 @@ static turnstile_stats_update_flags_t thread_get_update_flags_for_turnstile_propagation_stoppage(thread_t thread); static turnstile_stats_update_flags_t turnstile_get_update_flags_for_above_UI_pri_change(struct turnstile *turnstile); +static void turnstile_stash_inheritor(turnstile_inheritor_t new_inheritor, + turnstile_update_flags_t flags); +static int turnstile_compute_thread_push(struct turnstile *turnstile, thread_t thread); #if DEVELOPMENT || DEBUG /* Test primitives and interfaces for testing turnstiles */ @@ -173,6 +190,9 @@ struct tstile_test_prim { struct tstile_test_prim *test_prim_ts_inline; struct tstile_test_prim *test_prim_global_htable; +struct tstile_test_prim *test_prim_global_ts_kernel; +struct tstile_test_prim *test_prim_global_ts_kernel_hash; + static void tstile_test_prim_init(struct tstile_test_prim **test_prim_ptr); #endif @@ -235,9 +255,12 @@ struct turnstile_htable_bucket { }; SECURITY_READ_ONLY_LATE(static uint32_t) ts_htable_buckets; -/* Global hashtable for turnstiles */ +/* Global hashtable for turnstiles managed with interrupts disabled */ +SECURITY_READ_ONLY_LATE(static struct turnstile_htable_bucket *)turnstile_htable_irq_safe; +/* Global hashtable for turnstiles managed with interrupts enabled */ SECURITY_READ_ONLY_LATE(static struct turnstile_htable_bucket *)turnstile_htable; + /* Bucket locks for turnstile hashtable */ lck_grp_t turnstiles_htable_lock_grp; lck_attr_t turnstiles_htable_lock_attr; @@ -250,6 +273,9 @@ lck_grp_attr_t turnstiles_htable_lock_grp_attr; #define turnstile_bucket_unlock(bucket) \ lck_spin_unlock(&bucket->ts_ht_bucket_lock) +#define kdp_turnstile_bucket_is_locked(bucket) \ + kdp_lck_spin_is_acquired(&bucket->ts_ht_bucket_lock) + /* * Name: turnstiles_hashtable_init * @@ -271,18 +297,26 @@ turnstiles_hashtable_init(void) assert(ts_htable_buckets <= TURNSTILE_HTABLE_BUCKETS_MAX); uint32_t ts_htable_size = ts_htable_buckets * sizeof(struct turnstile_htable_bucket); + turnstile_htable_irq_safe = (struct turnstile_htable_bucket *)kalloc(ts_htable_size); + if (turnstile_htable_irq_safe == NULL) { + panic("Turnstiles hash table memory allocation failed!"); + } + turnstile_htable = (struct turnstile_htable_bucket *)kalloc(ts_htable_size); if (turnstile_htable == NULL) { panic("Turnstiles hash table memory allocation failed!"); } - lck_grp_attr_setdefault(&turnstiles_htable_lock_grp_attr); lck_grp_init(&turnstiles_htable_lock_grp, "turnstiles_htable_locks", &turnstiles_htable_lock_grp_attr); lck_attr_setdefault(&turnstiles_htable_lock_attr); - /* Initialize all the buckets of the hashtable */ + /* Initialize all the buckets of the hashtables */ for (uint32_t i = 0; i < ts_htable_buckets; i++) { - struct turnstile_htable_bucket *ts_bucket = &(turnstile_htable[i]); + struct turnstile_htable_bucket *ts_bucket = &(turnstile_htable_irq_safe[i]); + turnstile_bucket_lock_init(ts_bucket); + SLIST_INIT(&ts_bucket->ts_ht_bucket_list); + + ts_bucket = &(turnstile_htable[i]); turnstile_bucket_lock_init(ts_bucket); SLIST_INIT(&ts_bucket->ts_ht_bucket_list); } @@ -377,6 +411,110 @@ turnstile_hash(uintptr_t proprietor) return hash & (ts_htable_buckets - 1); } +static inline struct turnstile_htable_bucket * +turnstile_get_bucket(uint32_t index, turnstile_type_t type) +{ + struct turnstile_htable_bucket *ts_bucket; + int hash_policy = turnstile_hash_lock_policy[type]; + + if (hash_policy & TURNSTILE_IRQ_UNSAFE_HASH) { + ts_bucket = &(turnstile_htable[index]); + } else { + ts_bucket = &(turnstile_htable_irq_safe[index]); + } + + return ts_bucket; +} + +/* + * Name: turnstile_hash_bucket_lock + * + * Description: locks the spinlock associated with proprietor's bucket. + * if proprietor is specified the index for the hash will be + * recomputed and returned in index_proprietor, + * otherwise the value save in index_proprietor is used as index. + * + * Args: + * Arg1: proprietor (key) for hashing + * Arg2: index for proprietor in the hash + * Arg3: turnstile type + * + * Returns: old value of irq if irq were disabled before acquiring the lock. + */ +unsigned +turnstile_hash_bucket_lock(uintptr_t proprietor, uint32_t *index_proprietor, turnstile_type_t type) +{ + struct turnstile_htable_bucket *ts_bucket; + int hash_policy = turnstile_hash_lock_policy[type]; + bool irq_safe = !(hash_policy & TURNSTILE_IRQ_UNSAFE_HASH); + spl_t ret = 0; + uint32_t index; + + /* + * If the proprietor is specified, the caller doesn't know + * the index in the hash, so compute it. + * Otherwise use the value of index provided. + */ + if (proprietor) { + index = turnstile_hash(proprietor); + *index_proprietor = index; + } else { + index = *index_proprietor; + } + + ts_bucket = turnstile_get_bucket(index, type); + + if (irq_safe) { + ret = splsched(); + } + + turnstile_bucket_lock(ts_bucket); + + return ret; +} + +/* + * Name: turnstile_hash_bucket_unlock + * + * Description: unlocks the spinlock associated with proprietor's bucket. + * if proprietor is specified the index for the hash will be + * recomputed and returned in index_proprietor, + * otherwise the value save in index_proprietor is used as index. + * + * Args: + * Arg1: proprietor (key) for hashing + * Arg2: index for proprietor in the hash + * Arg3: turnstile type + * Arg4: irq value returned by turnstile_hash_bucket_lock + * + */ +void +turnstile_hash_bucket_unlock(uintptr_t proprietor, uint32_t *index_proprietor, turnstile_type_t type, unsigned s) +{ + struct turnstile_htable_bucket *ts_bucket; + int hash_policy = turnstile_hash_lock_policy[type]; + bool irq_safe = !(hash_policy & TURNSTILE_IRQ_UNSAFE_HASH); + uint32_t index; + + /* + * If the proprietor is specified, the caller doesn't know + * the index in the hash, so compute it. + * Otherwise use the value of index provided. + */ + if (proprietor) { + index = turnstile_hash(proprietor); + *index_proprietor = index; + } else { + index = *index_proprietor; + } + ts_bucket = turnstile_get_bucket(index, type); + + turnstile_bucket_unlock(ts_bucket); + if (irq_safe) { + splx(s); + } +} + /* * Name: turnstile_htable_lookup_add * @@ -389,6 +527,7 @@ turnstile_hash(uintptr_t proprietor) * Args: * Arg1: proprietor * Arg2: new turnstile for primitive + * Arg3: turnstile_type_t type * * Returns: * Previous turnstile for proprietor in the hash table @@ -396,15 +535,26 @@ turnstile_hash(uintptr_t proprietor) static struct turnstile * turnstile_htable_lookup_add( uintptr_t proprietor, - struct turnstile *new_turnstile) + struct turnstile *new_turnstile, + turnstile_type_t type) { uint32_t index = turnstile_hash(proprietor); assert(index < ts_htable_buckets); - struct turnstile_htable_bucket *ts_bucket = &(turnstile_htable[index]); + struct turnstile_htable_bucket *ts_bucket; + int hash_policy = turnstile_hash_lock_policy[type]; + bool needs_lock = !(hash_policy & TURNSTILE_LOCKED_HASH); + bool irq_safe = !(hash_policy & TURNSTILE_IRQ_UNSAFE_HASH); spl_t s; - s = splsched(); - turnstile_bucket_lock(ts_bucket); + ts_bucket = turnstile_get_bucket(index, type); + + if (needs_lock) { + if (irq_safe) { + s = splsched(); + } + turnstile_bucket_lock(ts_bucket); + } + struct turnstile *ts; SLIST_FOREACH(ts, &ts_bucket->ts_ht_bucket_list, ts_htable_link) { @@ -413,8 +563,12 @@ turnstile_htable_lookup_add( * Found an entry in the hashtable for this proprietor; add thread turnstile to freelist * and return this turnstile */ - turnstile_bucket_unlock(ts_bucket); - splx(s); + if (needs_lock) { + turnstile_bucket_unlock(ts_bucket); + if (irq_safe) { + splx(s); + } + } turnstile_freelist_insert(ts, new_turnstile); return ts; } @@ -423,8 +577,12 @@ turnstile_htable_lookup_add( /* No entry for this proprietor; add the new turnstile in the hash table */ SLIST_INSERT_HEAD(&ts_bucket->ts_ht_bucket_list, new_turnstile, ts_htable_link); turnstile_state_add(new_turnstile, TURNSTILE_STATE_HASHTABLE); - turnstile_bucket_unlock(ts_bucket); - splx(s); + if (needs_lock) { + turnstile_bucket_unlock(ts_bucket); + if (irq_safe) { + splx(s); + } + } /* Since there was no previous entry for this proprietor, return TURNSTILE_NULL */ return TURNSTILE_NULL; } @@ -442,6 +600,7 @@ turnstile_htable_lookup_add( * Args: * Arg1: proprietor * Arg2: free turnstile to be returned + * Arg3: turnstile_type_t type * * Returns: * turnstile for this proprietor in the hashtable after the removal @@ -449,16 +608,27 @@ turnstile_htable_lookup_add( static struct turnstile * turnstable_htable_lookup_remove( uintptr_t proprietor, - struct turnstile **free_turnstile) + struct turnstile **free_turnstile, + turnstile_type_t type) { uint32_t index = turnstile_hash(proprietor); assert(index < ts_htable_buckets); - struct turnstile_htable_bucket *ts_bucket = &(turnstile_htable[index]); + struct turnstile_htable_bucket *ts_bucket; struct turnstile *ret_turnstile = TURNSTILE_NULL; + int hash_policy = turnstile_hash_lock_policy[type]; + bool needs_lock = !(hash_policy & TURNSTILE_LOCKED_HASH); + bool irq_safe = !(hash_policy & TURNSTILE_IRQ_UNSAFE_HASH); spl_t s; - s = splsched(); - turnstile_bucket_lock(ts_bucket); + ts_bucket = turnstile_get_bucket(index, type); + + if (needs_lock) { + if (irq_safe) { + s = splsched(); + } + turnstile_bucket_lock(ts_bucket); + } + struct turnstile *ts, **prev_tslink; /* Find the turnstile for the given proprietor in the hashtable */ SLIST_FOREACH_PREVPTR(ts, prev_tslink, &ts_bucket->ts_ht_bucket_list, ts_htable_link) { @@ -474,8 +644,12 @@ turnstable_htable_lookup_remove( /* No turnstiles on the freelist; remove the turnstile from the hashtable and mark it freed */ *prev_tslink = SLIST_NEXT(ret_turnstile, ts_htable_link); turnstile_state_remove(ret_turnstile, TURNSTILE_STATE_HASHTABLE); - turnstile_bucket_unlock(ts_bucket); - splx(s); + if (needs_lock) { + turnstile_bucket_unlock(ts_bucket); + if (irq_safe) { + splx(s); + } + } *free_turnstile = ret_turnstile; return TURNSTILE_NULL; } else { @@ -483,8 +657,12 @@ turnstable_htable_lookup_remove( * Turnstile has free turnstiles on its list; leave the hashtable unchanged * and return the first turnstile in the freelist as the free turnstile */ - turnstile_bucket_unlock(ts_bucket); - splx(s); + if (needs_lock) { + turnstile_bucket_unlock(ts_bucket); + if (irq_safe) { + splx(s); + } + } *free_turnstile = turnstile_freelist_remove(ret_turnstile); return ret_turnstile; } @@ -499,21 +677,39 @@ turnstable_htable_lookup_remove( * * Args: * Arg1: proprietor + * Arg2: turnstile_type_t type * * Returns: * Turnstile for proprietor in the hash table */ static struct turnstile * turnstile_htable_lookup( - uintptr_t proprietor) + uintptr_t proprietor, + turnstile_type_t type) { uint32_t index = turnstile_hash(proprietor); assert(index < ts_htable_buckets); - struct turnstile_htable_bucket *ts_bucket = &(turnstile_htable[index]); + bool kdp_ctx = !not_in_kdp; + struct turnstile_htable_bucket *ts_bucket = turnstile_get_bucket(index, type); + int hash_policy = turnstile_hash_lock_policy[type]; + bool needs_lock = !(hash_policy & TURNSTILE_LOCKED_HASH); + bool irq_safe = !(hash_policy & TURNSTILE_IRQ_UNSAFE_HASH); spl_t s; - s = splsched(); - turnstile_bucket_lock(ts_bucket); + if (needs_lock) { + if (irq_safe && !kdp_ctx) { + s = splsched(); + } + + if (kdp_ctx) { + if (kdp_turnstile_bucket_is_locked(ts_bucket)) { + /* This should move to TURNSTILE_BUSY once 51725781 is in the build */ + return TURNSTILE_NULL; + } + } else { + turnstile_bucket_lock(ts_bucket); + } + } struct turnstile *ts = TURNSTILE_NULL; struct turnstile *ret_turnstile = TURNSTILE_NULL; @@ -525,11 +721,39 @@ turnstile_htable_lookup( } } - turnstile_bucket_unlock(ts_bucket); - splx(s); + if (needs_lock && !kdp_ctx) { + turnstile_bucket_unlock(ts_bucket); + if (irq_safe) { + splx(s); + } + } + return ret_turnstile; } +/* + * Name: turnstile_deallocate_queue_invoke + * + * Description: invoke function for the asynchronous turnstile deallocation + * queue + * + * Arg1: &turnstile_deallocate_queue + * Arg2: a pointer to the turnstile ts_deallocate_link member of a tunrstile to + * destroy. + * + * Returns: None. + */ +static void +turnstile_deallocate_queue_invoke(mpsc_queue_chain_t e, + __assert_only mpsc_daemon_queue_t dq) +{ + struct turnstile *ts; + + ts = mpsc_queue_element(e, struct turnstile, ts_deallocate_link); + assert(dq == &turnstile_deallocate_queue); + turnstile_destroy(ts); +} + /* * Name: turnstiles_init * @@ -553,6 +777,9 @@ turnstiles_init(void) turnstiles_hashtable_init(); + thread_deallocate_daemon_register_queue(&turnstile_deallocate_queue, + turnstile_deallocate_queue_invoke); + #if DEVELOPMENT || DEBUG /* Initialize the global turnstile locks and lock group */ @@ -566,6 +793,8 @@ turnstiles_init(void) /* Initialize turnstile test primitive */ tstile_test_prim_init(&test_prim_ts_inline); tstile_test_prim_init(&test_prim_global_htable); + tstile_test_prim_init(&test_prim_global_ts_kernel); + tstile_test_prim_init(&test_prim_global_ts_kernel_hash); #endif return; } @@ -620,12 +849,12 @@ turnstile_init(struct turnstile *turnstile) turnstile->ts_inheritor = TURNSTILE_INHERITOR_NULL; SLIST_INIT(&turnstile->ts_free_turnstiles); - turnstile->ts_type_gencount = 0; + os_atomic_init(&turnstile->ts_type_gencount, 0); turnstile_set_type_and_increment_gencount(turnstile, TURNSTILE_NONE); turnstile_state_init(turnstile, TURNSTILE_STATE_THREAD); os_ref_init_count(&turnstile->ts_refcount, &turnstile_refgrp, 1); turnstile->ts_proprietor = TURNSTILE_PROPRIETOR_NULL; - turnstile->ts_priority = MAXPRI_THROTTLE; + turnstile->ts_priority = 0; turnstile->ts_inheritor_flags = TURNSTILE_UPDATE_FLAGS_NONE; turnstile->ts_port_ref = 0; priority_queue_init(&turnstile->ts_inheritor_queue, @@ -694,8 +923,8 @@ turnstile_deallocate_safe(struct turnstile *turnstile) } if (__improbable(os_ref_release(&turnstile->ts_refcount) == 0)) { - /* enqueue the turnstile for thread deallocate deamon to call turnstile_destroy */ - turnstile_deallocate_enqueue(turnstile); + mpsc_daemon_enqueue(&turnstile_deallocate_queue, + &turnstile->ts_deallocate_link, MPSC_QUEUE_DISABLE_PREEMPTION); } } @@ -772,7 +1001,7 @@ turnstile_prepare( thread_turnstile->ts_proprietor = proprietor; turnstile_state_remove(thread_turnstile, TURNSTILE_STATE_THREAD); - thread_turnstile->ts_priority = MAXPRI_THROTTLE; + thread_turnstile->ts_priority = 0; #if DEVELOPMENT || DEBUG thread_turnstile->ts_prev_thread = thread_turnstile->ts_thread; thread_turnstile->ts_thread = NULL; @@ -802,7 +1031,7 @@ turnstile_prepare( /* * Lookup the primitive in the turnstile hash table and see if it already has an entry. */ - ret_turnstile = turnstile_htable_lookup_add(proprietor, thread_turnstile); + ret_turnstile = turnstile_htable_lookup_add(proprietor, thread_turnstile, type); if (ret_turnstile == NULL) { ret_turnstile = thread_turnstile; KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, @@ -820,13 +1049,13 @@ turnstile_prepare( * Name: turnstile_complete * * Description: Transfer the primitive's turnstile or from it's freelist to current thread. - * Function is called holding the interlock (spinlock) of the primitive. * Current thread will have a turnstile attached to it after this call. * * Args: * Arg1: proprietor * Arg2: pointer in primitive struct to update turnstile * Arg3: pointer to store the returned turnstile instead of attaching it to thread + * Arg4: type of primitive * * Returns: * None. @@ -835,7 +1064,8 @@ void turnstile_complete( uintptr_t proprietor, struct turnstile **tstore, - struct turnstile **out_turnstile) + struct turnstile **out_turnstile, + turnstile_type_t type) { thread_t thread = current_thread(); struct turnstile *primitive_turnstile = TURNSTILE_NULL; @@ -861,7 +1091,7 @@ turnstile_complete( primitive_turnstile = *tstore; } else { /* Use the global hash to find and remove a turnstile */ - primitive_turnstile = turnstable_htable_lookup_remove(proprietor, &thread_turnstile); + primitive_turnstile = turnstable_htable_lookup_remove(proprietor, &thread_turnstile, type); } if (primitive_turnstile == NULL) { /* @@ -910,6 +1140,42 @@ turnstile_complete( return; } +/* + * Name: turnstile_kernel_update_inheritor_on_wake_locked + * + * Description: Set thread as the inheritor of the turnstile and + * boost the inheritor. + * Args: + * Arg1: turnstile + * Arg2: new_inheritor + * Arg3: flags + * + * Called with turnstile locked + */ +void +turnstile_kernel_update_inheritor_on_wake_locked( + struct turnstile *turnstile, + turnstile_inheritor_t new_inheritor, + turnstile_update_flags_t flags __assert_only) +{ + /* for now only kernel primitives are allowed to call this function */ + __assert_only turnstile_promote_policy_t policy = + turnstile_promote_policy[turnstile_get_type(turnstile)]; + + assert(flags & TURNSTILE_INHERITOR_THREAD); + assert(policy == TURNSTILE_KERNEL_PROMOTE || policy == TURNSTILE_USER_PROMOTE); + + turnstile_stash_inheritor((thread_t)new_inheritor, TURNSTILE_INHERITOR_THREAD); + /* + * new_inheritor has just been removed from the turnstile waitq, + * the turnstile new priority needs to be recomputed so that + * when new_inheritor will become this turnstile inheritor can + * inherit the correct priority. + */ + turnstile_recompute_priority_locked(turnstile); + turnstile_update_inheritor_locked(turnstile); +} + /* * Name: turnstile_update_inheritor_locked * @@ -947,116 +1213,124 @@ turnstile_update_inheritor_locked( switch (turnstile_promote_policy[turnstile_get_type(turnstile)]) { case TURNSTILE_USER_PROMOTE: case TURNSTILE_USER_IPC_PROMOTE: + break; + case TURNSTILE_KERNEL_PROMOTE: + /* some sanity checks, turnstile kernel can push just between threads */ + if (old_inheritor) { + assert(old_inheritor_flags & TURNSTILE_INHERITOR_THREAD); + } - /* Check if update is needed */ - if (old_inheritor == new_inheritor && old_inheritor == NULL) { - break; + if (new_inheritor) { + assert(new_inheritor_flags & TURNSTILE_INHERITOR_THREAD); } - if (old_inheritor == new_inheritor) { - if (new_inheritor_flags & TURNSTILE_INHERITOR_THREAD) { - thread_t thread_inheritor = (thread_t)new_inheritor; + break; + default: + panic("turnstile promotion for type %d not yet implemented", turnstile_get_type(turnstile)); + } - assert(old_inheritor_flags & TURNSTILE_INHERITOR_THREAD); + /* Check if update is needed */ + if (old_inheritor == new_inheritor && old_inheritor == NULL) { + goto done; + } - /* adjust turnstile position in the thread's inheritor list */ - new_inheritor_needs_update = thread_update_turnstile_promotion( - thread_inheritor, turnstile); - } else if (new_inheritor_flags & TURNSTILE_INHERITOR_TURNSTILE) { - struct turnstile *inheritor_turnstile = new_inheritor; + if (old_inheritor == new_inheritor) { + if (new_inheritor_flags & TURNSTILE_INHERITOR_THREAD) { + thread_t thread_inheritor = (thread_t)new_inheritor; - assert(old_inheritor_flags & TURNSTILE_INHERITOR_TURNSTILE); + assert(old_inheritor_flags & TURNSTILE_INHERITOR_THREAD); - new_inheritor_needs_update = turnstile_update_turnstile_promotion( - inheritor_turnstile, turnstile); - } else if (new_inheritor_flags & TURNSTILE_INHERITOR_WORKQ) { - /* - * When we are still picking "WORKQ" then possible racing - * updates will call redrive through their own propagation - * and we don't need to update anything here. - */ - turnstile_stats_update(1, TSU_NO_PRI_CHANGE_NEEDED | - TSU_TURNSTILE_ARG | TSU_BOOST_ARG, turnstile); - } else { - panic("Inheritor flags lost along the way"); - } + /* adjust turnstile position in the thread's inheritor list */ + new_inheritor_needs_update = thread_update_turnstile_promotion( + thread_inheritor, turnstile); + } else if (new_inheritor_flags & TURNSTILE_INHERITOR_TURNSTILE) { + struct turnstile *inheritor_turnstile = new_inheritor; - /* Update turnstile stats */ - if (!new_inheritor_needs_update) { - turnstile_stats_update(1, TSU_PRI_PROPAGATION | - TSU_TURNSTILE_ARG | TSU_BOOST_ARG | tsu_flags, turnstile); - } - break; + assert(old_inheritor_flags & TURNSTILE_INHERITOR_TURNSTILE); + + new_inheritor_needs_update = turnstile_update_turnstile_promotion( + inheritor_turnstile, turnstile); + } else if (new_inheritor_flags & TURNSTILE_INHERITOR_WORKQ) { + /* + * When we are still picking "WORKQ" then possible racing + * updates will call redrive through their own propagation + * and we don't need to update anything here. + */ + turnstile_stats_update(1, TSU_NO_PRI_CHANGE_NEEDED | + TSU_TURNSTILE_ARG | TSU_BOOST_ARG, turnstile); + } else { + panic("Inheritor flags lost along the way"); } - if (old_inheritor != NULL) { - if (old_inheritor_flags & TURNSTILE_INHERITOR_THREAD) { - thread_t thread_inheritor = (thread_t)old_inheritor; - - /* remove turnstile from thread's inheritor list */ - old_inheritor_needs_update = thread_remove_turnstile_promotion(thread_inheritor, turnstile); - } else if (old_inheritor_flags & TURNSTILE_INHERITOR_TURNSTILE) { - struct turnstile *old_turnstile = old_inheritor; - - old_inheritor_needs_update = turnstile_remove_turnstile_promotion( - old_turnstile, turnstile); - } else if (old_inheritor_flags & TURNSTILE_INHERITOR_WORKQ) { - /* - * We don't need to do anything when the push was WORKQ - * because nothing is pushed on in the first place. - */ - turnstile_stats_update(1, TSU_NO_PRI_CHANGE_NEEDED | - TSU_TURNSTILE_ARG, turnstile); - } else { - panic("Inheritor flags lost along the way"); - } - /* Update turnstile stats */ - if (!old_inheritor_needs_update) { - turnstile_stats_update(1, TSU_PRI_PROPAGATION | TSU_TURNSTILE_ARG, - turnstile); - } + /* Update turnstile stats */ + if (!new_inheritor_needs_update) { + turnstile_stats_update(1, TSU_PRI_PROPAGATION | + TSU_TURNSTILE_ARG | TSU_BOOST_ARG | tsu_flags, turnstile); } + goto done; + } - if (new_inheritor != NULL) { - if (new_inheritor_flags & TURNSTILE_INHERITOR_THREAD) { - thread_t thread_inheritor = (thread_t)new_inheritor; - - assert(new_inheritor_flags & TURNSTILE_INHERITOR_THREAD); - /* add turnstile to thread's inheritor list */ - new_inheritor_needs_update = thread_add_turnstile_promotion( - thread_inheritor, turnstile); - } else if (new_inheritor_flags & TURNSTILE_INHERITOR_TURNSTILE) { - struct turnstile *new_turnstile = new_inheritor; - - new_inheritor_needs_update = turnstile_add_turnstile_promotion( - new_turnstile, turnstile); - } else if (new_inheritor_flags & TURNSTILE_INHERITOR_WORKQ) { - struct workqueue *wq_inheritor = new_inheritor; - - new_inheritor_needs_update = workq_add_turnstile_promotion( - wq_inheritor, turnstile); - if (!new_inheritor_needs_update) { - turnstile_stats_update(1, TSU_NO_PRI_CHANGE_NEEDED | - TSU_TURNSTILE_ARG | TSU_BOOST_ARG, turnstile); - } - } else { - panic("Inheritor flags lost along the way"); - } - /* Update turnstile stats */ - if (!new_inheritor_needs_update) { - turnstile_stats_update(1, TSU_PRI_PROPAGATION | - TSU_TURNSTILE_ARG | TSU_BOOST_ARG | tsu_flags, turnstile); - } + if (old_inheritor != NULL) { + if (old_inheritor_flags & TURNSTILE_INHERITOR_THREAD) { + thread_t thread_inheritor = (thread_t)old_inheritor; + + /* remove turnstile from thread's inheritor list */ + old_inheritor_needs_update = thread_remove_turnstile_promotion(thread_inheritor, turnstile); + } else if (old_inheritor_flags & TURNSTILE_INHERITOR_TURNSTILE) { + struct turnstile *old_turnstile = old_inheritor; + + old_inheritor_needs_update = turnstile_remove_turnstile_promotion( + old_turnstile, turnstile); + } else if (old_inheritor_flags & TURNSTILE_INHERITOR_WORKQ) { + /* + * We don't need to do anything when the push was WORKQ + * because nothing is pushed on in the first place. + */ + turnstile_stats_update(1, TSU_NO_PRI_CHANGE_NEEDED | + TSU_TURNSTILE_ARG, turnstile); + } else { + panic("Inheritor flags lost along the way"); + } + /* Update turnstile stats */ + if (!old_inheritor_needs_update) { + turnstile_stats_update(1, TSU_PRI_PROPAGATION | TSU_TURNSTILE_ARG, + turnstile); } + } - break; + if (new_inheritor != NULL) { + if (new_inheritor_flags & TURNSTILE_INHERITOR_THREAD) { + thread_t thread_inheritor = (thread_t)new_inheritor; - case TURNSTILE_KERNEL_PROMOTE: - break; - default: - panic("turnstile promotion for type %d not yet implemented", turnstile_get_type(turnstile)); + assert(new_inheritor_flags & TURNSTILE_INHERITOR_THREAD); + /* add turnstile to thread's inheritor list */ + new_inheritor_needs_update = thread_add_turnstile_promotion( + thread_inheritor, turnstile); + } else if (new_inheritor_flags & TURNSTILE_INHERITOR_TURNSTILE) { + struct turnstile *new_turnstile = new_inheritor; + + new_inheritor_needs_update = turnstile_add_turnstile_promotion( + new_turnstile, turnstile); + } else if (new_inheritor_flags & TURNSTILE_INHERITOR_WORKQ) { + struct workqueue *wq_inheritor = new_inheritor; + + new_inheritor_needs_update = workq_add_turnstile_promotion( + wq_inheritor, turnstile); + if (!new_inheritor_needs_update) { + turnstile_stats_update(1, TSU_NO_PRI_CHANGE_NEEDED | + TSU_TURNSTILE_ARG | TSU_BOOST_ARG, turnstile); + } + } else { + panic("Inheritor flags lost along the way"); + } + /* Update turnstile stats */ + if (!new_inheritor_needs_update) { + turnstile_stats_update(1, TSU_PRI_PROPAGATION | + TSU_TURNSTILE_ARG | TSU_BOOST_ARG | tsu_flags, turnstile); + } } +done: if (old_inheritor_needs_update) { old_inheritor_flags |= TURNSTILE_INHERITOR_NEEDS_PRI_UPDATE; } @@ -1077,28 +1351,25 @@ turnstile_update_inheritor_locked( } /* - * Name: turnstile_update_inheritor + * Name: turnstile_stash_inheritor * - * Description: Update the inheritor of the turnstile and boost the - * inheritor. It will take a thread reference on the inheritor. + * Description: Save the new inheritor reference of the turnstile on the + * current thread. It will take a thread reference on the inheritor. * Called with the interlock of the primitive held. * * Args: - * Arg1: turnstile - * Arg2: inheritor - * Arg3: flags - TURNSTILE_DELAYED_UPDATE - update will happen later in assert_wait + * Arg1: inheritor + * Arg2: flags * * Returns: * old inheritor reference is stashed on current thread's struct. */ -void -turnstile_update_inheritor( - struct turnstile *turnstile, +static void +turnstile_stash_inheritor( turnstile_inheritor_t new_inheritor, turnstile_update_flags_t flags) { thread_t thread = current_thread(); - spl_t spl; /* * Set the inheritor on calling thread struct, no need @@ -1123,6 +1394,32 @@ turnstile_update_inheritor( panic("Missing type in flags (%x) for inheritor (%p)", flags, new_inheritor); } +} + +/* + * Name: turnstile_update_inheritor + * + * Description: Update the inheritor of the turnstile and boost the + * inheritor. It will take a thread reference on the inheritor. + * Called with the interlock of the primitive held. + * + * Args: + * Arg1: turnstile + * Arg2: inheritor + * Arg3: flags - TURNSTILE_DELAYED_UPDATE - update will happen later in assert_wait + * + * Returns: + * old inheritor reference is stashed on current thread's struct. + */ +void +turnstile_update_inheritor( + struct turnstile *turnstile, + turnstile_inheritor_t new_inheritor, + turnstile_update_flags_t flags) +{ + spl_t spl; + + turnstile_stash_inheritor(new_inheritor, flags); /* Do not perform the update if delayed update is specified */ if (flags & TURNSTILE_DELAYED_UPDATE) { @@ -1157,7 +1454,7 @@ turnstile_update_inheritor( */ static boolean_t turnstile_need_thread_promotion_update( - struct turnstile *dst_turnstile __assert_only, + struct turnstile *dst_turnstile, thread_t thread) { int thread_link_priority; @@ -1166,7 +1463,10 @@ turnstile_need_thread_promotion_update( thread_link_priority = priority_queue_entry_key(&(dst_turnstile->ts_waitq.waitq_prio_queue), &(thread->wait_prioq_links)); - needs_update = (thread_link_priority == thread->base_pri) ? FALSE : TRUE; + int priority = turnstile_compute_thread_push(dst_turnstile, thread); + + needs_update = (thread_link_priority == priority) ? FALSE : TRUE; + return needs_update; } @@ -1221,21 +1521,25 @@ turnstile_update_thread_promotion_locked( struct turnstile *dst_turnstile, thread_t thread) { - int thread_link_priority = priority_queue_entry_key(&(dst_turnstile->ts_waitq.waitq_prio_queue), + int thread_link_priority; + + int priority = turnstile_compute_thread_push(dst_turnstile, thread); + + thread_link_priority = priority_queue_entry_key(&(dst_turnstile->ts_waitq.waitq_prio_queue), &(thread->wait_prioq_links)); - if (thread->base_pri != thread_link_priority) { + if (priority != thread_link_priority) { KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, (TURNSTILE_CODE(TURNSTILE_HEAP_OPERATIONS, (THREAD_MOVED_IN_TURNSTILE_WAITQ))) | DBG_FUNC_NONE, VM_KERNEL_UNSLIDE_OR_PERM(dst_turnstile), thread_tid(thread), - thread->base_pri, + priority, thread_link_priority, 0); } if (!turnstile_priority_queue_update_entry_key( &dst_turnstile->ts_waitq.waitq_prio_queue, - &thread->wait_prioq_links, thread->base_pri)) { + &thread->wait_prioq_links, priority)) { return FALSE; } @@ -1243,7 +1547,6 @@ turnstile_update_thread_promotion_locked( return turnstile_recompute_priority_locked(dst_turnstile); } - /* * Name: thread_add_turnstile_promotion * @@ -1273,12 +1576,30 @@ thread_add_turnstile_promotion( VM_KERNEL_UNSLIDE_OR_PERM(turnstile), turnstile->ts_priority, 0, 0); - priority_queue_entry_init(&(turnstile->ts_inheritor_links)); - if (priority_queue_insert(&thread->inheritor_queue, - &turnstile->ts_inheritor_links, turnstile->ts_priority, - PRIORITY_QUEUE_SCHED_PRI_MAX_HEAP_COMPARE)) { - /* Update thread priority */ - needs_update = thread_recompute_user_promotion_locked(thread); + priority_queue_entry_init(&turnstile->ts_inheritor_links); + + switch (turnstile_promote_policy[turnstile_get_type(turnstile)]) { + case TURNSTILE_USER_PROMOTE: + case TURNSTILE_USER_IPC_PROMOTE: + + if (priority_queue_insert(&(thread->base_inheritor_queue), + &turnstile->ts_inheritor_links, turnstile->ts_priority, + PRIORITY_QUEUE_SCHED_PRI_MAX_HEAP_COMPARE)) { + needs_update = thread_recompute_user_promotion_locked(thread); + } + + break; + case TURNSTILE_KERNEL_PROMOTE: + + if (priority_queue_insert(&(thread->sched_inheritor_queue), + &turnstile->ts_inheritor_links, turnstile->ts_priority, + PRIORITY_QUEUE_SCHED_PRI_MAX_HEAP_COMPARE)) { + needs_update = thread_recompute_kernel_promotion_locked(thread); + } + + break; + default: + panic("turnstile promotion for type %d not yet implemented", turnstile_get_type(turnstile)); } /* Update turnstile stats */ @@ -1290,10 +1611,10 @@ thread_add_turnstile_promotion( } thread_unlock(thread); + return needs_update; } - /* * Name: thread_remove_turnstile_promotion * @@ -1314,7 +1635,6 @@ thread_remove_turnstile_promotion( { boolean_t needs_update = FALSE; - /* Update the pairing heap */ thread_lock(thread); KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, @@ -1323,11 +1643,26 @@ thread_remove_turnstile_promotion( VM_KERNEL_UNSLIDE_OR_PERM(turnstile), 0, 0, 0); - if (priority_queue_remove(&thread->inheritor_queue, - &turnstile->ts_inheritor_links, - PRIORITY_QUEUE_SCHED_PRI_MAX_HEAP_COMPARE)) { - /* Update thread priority */ - needs_update = thread_recompute_user_promotion_locked(thread); + /* Update the pairing heap */ + + switch (turnstile_promote_policy[turnstile_get_type(turnstile)]) { + case TURNSTILE_USER_PROMOTE: + case TURNSTILE_USER_IPC_PROMOTE: + if (priority_queue_remove(&(thread->base_inheritor_queue), + &turnstile->ts_inheritor_links, + PRIORITY_QUEUE_SCHED_PRI_MAX_HEAP_COMPARE)) { + needs_update = thread_recompute_user_promotion_locked(thread); + } + break; + case TURNSTILE_KERNEL_PROMOTE: + if (priority_queue_remove(&(thread->sched_inheritor_queue), + &turnstile->ts_inheritor_links, + PRIORITY_QUEUE_SCHED_PRI_MAX_HEAP_COMPARE)) { + needs_update = thread_recompute_kernel_promotion_locked(thread); + } + break; + default: + panic("turnstile promotion for type %d not yet implemented", turnstile_get_type(turnstile)); } /* Update turnstile stats */ @@ -1338,6 +1673,7 @@ thread_remove_turnstile_promotion( } thread_unlock(thread); + return needs_update; } @@ -1360,11 +1696,21 @@ thread_needs_turnstile_promotion_update( struct turnstile *turnstile) { boolean_t needs_update = FALSE; - int turnstile_link_priority; + int turnstile_link_priority = 0; - /* Update the pairing heap */ - turnstile_link_priority = priority_queue_entry_key(&(thread->inheritor_queue), - &(turnstile->ts_inheritor_links)); + switch (turnstile_promote_policy[turnstile_get_type(turnstile)]) { + case TURNSTILE_USER_PROMOTE: + case TURNSTILE_USER_IPC_PROMOTE: + turnstile_link_priority = priority_queue_entry_key(&(thread->base_inheritor_queue), + &(turnstile->ts_inheritor_links)); + break; + case TURNSTILE_KERNEL_PROMOTE: + turnstile_link_priority = priority_queue_entry_key(&(thread->sched_inheritor_queue), + &(turnstile->ts_inheritor_links)); + break; + default: + panic("turnstile promotion for type %d not yet implemented", turnstile_get_type(turnstile)); + } needs_update = (turnstile_link_priority == turnstile->ts_priority) ? FALSE : TRUE; return needs_update; @@ -1388,8 +1734,30 @@ thread_update_turnstile_promotion_locked( thread_t thread, struct turnstile *turnstile) { - int turnstile_link_priority = priority_queue_entry_key(&(thread->inheritor_queue), - &(turnstile->ts_inheritor_links)); + boolean_t needs_update = FALSE; + int turnstile_link_priority = 0; + + switch (turnstile_promote_policy[turnstile_get_type(turnstile)]) { + case TURNSTILE_USER_PROMOTE: + case TURNSTILE_USER_IPC_PROMOTE: + turnstile_link_priority = priority_queue_entry_key(&(thread->base_inheritor_queue), &turnstile->ts_inheritor_links); + + if (turnstile_priority_queue_update_entry_key(&(thread->base_inheritor_queue), + &turnstile->ts_inheritor_links, turnstile->ts_priority)) { + needs_update = thread_recompute_user_promotion_locked(thread); + } + break; + case TURNSTILE_KERNEL_PROMOTE: + turnstile_link_priority = priority_queue_entry_key(&(thread->sched_inheritor_queue), &turnstile->ts_inheritor_links); + + if (turnstile_priority_queue_update_entry_key(&(thread->sched_inheritor_queue), + &turnstile->ts_inheritor_links, turnstile->ts_priority)) { + needs_update = thread_recompute_kernel_promotion_locked(thread); + } + break; + default: + panic("turnstile promotion for type %d not yet implemented", turnstile_get_type(turnstile)); + } if (turnstile->ts_priority != turnstile_link_priority) { KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, @@ -1400,13 +1768,7 @@ thread_update_turnstile_promotion_locked( turnstile_link_priority, 0); } - if (!turnstile_priority_queue_update_entry_key(&thread->inheritor_queue, - &turnstile->ts_inheritor_links, turnstile->ts_priority)) { - return FALSE; - } - - /* Update thread priority */ - return thread_recompute_user_promotion_locked(thread); + return needs_update; } @@ -1437,8 +1799,9 @@ thread_update_turnstile_promotion( return needs_update; } - /* Update the pairing heap */ thread_lock(thread); + + /* Update the pairing heap */ needs_update = thread_update_turnstile_promotion_locked(thread, turnstile); /* Update turnstile stats */ @@ -1448,36 +1811,65 @@ thread_update_turnstile_promotion( TSU_TURNSTILE_ARG | TSU_BOOST_ARG, turnstile); } + thread_unlock(thread); + return needs_update; } /* - * Name: thread_get_inheritor_turnstile_priority + * Name: thread_get_inheritor_turnstile_sched_priority + * + * Description: Get the max sched priority of all the inheritor turnstiles + * + * Arg1: thread + * + * Returns: Max sched priority of all the inheritor turnstiles. + * + * Condition: thread locked + */ +int +thread_get_inheritor_turnstile_sched_priority(thread_t thread) +{ + struct turnstile *max_turnstile; + + max_turnstile = priority_queue_max(&thread->sched_inheritor_queue, + struct turnstile, ts_inheritor_links); + + if (max_turnstile) { + return priority_queue_entry_key(&thread->sched_inheritor_queue, + &max_turnstile->ts_inheritor_links); + } + + return 0; +} + +/* + * Name: thread_get_inheritor_turnstile_base_priority * - * Description: Get the max priority of all the inheritor turnstiles + * Description: Get the max base priority of all the inheritor turnstiles * * Arg1: thread * - * Returns: Max priority of all the inheritor turnstiles. + * Returns: Max base priority of all the inheritor turnstiles. * * Condition: thread locked */ int -thread_get_inheritor_turnstile_priority(thread_t thread) +thread_get_inheritor_turnstile_base_priority(thread_t thread) { struct turnstile *max_turnstile; - max_turnstile = priority_queue_max(&thread->inheritor_queue, + max_turnstile = priority_queue_max(&thread->base_inheritor_queue, struct turnstile, ts_inheritor_links); if (max_turnstile) { - return priority_queue_entry_key(&thread->inheritor_queue, + return priority_queue_entry_key(&thread->base_inheritor_queue, &max_turnstile->ts_inheritor_links); } - return MAXPRI_THROTTLE; + return 0; } @@ -1516,7 +1908,6 @@ thread_get_waiting_turnstile(thread_t thread) return turnstile; } - /* * Name: turnstile_lookup_by_proprietor * @@ -1524,6 +1915,7 @@ thread_get_waiting_turnstile(thread_t thread) * turnstile hash. * * Arg1: port + * Arg2: turnstile_type_t type * * Returns: turnstile: if the proprietor has a turnstile. * TURNSTILE_NULL: otherwise. @@ -1531,12 +1923,11 @@ thread_get_waiting_turnstile(thread_t thread) * Condition: proprietor interlock held. */ struct turnstile * -turnstile_lookup_by_proprietor(uintptr_t proprietor) +turnstile_lookup_by_proprietor(uintptr_t proprietor, turnstile_type_t type) { - return turnstile_htable_lookup(proprietor); + return turnstile_htable_lookup(proprietor, type); } - /* * Name: thread_get_update_flags_for_turnstile_propagation_stoppage * @@ -1831,6 +2222,88 @@ turnstile_remove_turnstile_promotion( return needs_update; } +/* + * Name: turnstile_compute_thread_push + * + * Description: Compute the priority at which the thread will push + * on the turnstile. + * + * Arg1: turnstile + * Arg2: thread + * + * Condition: wq locked + */ +static int +turnstile_compute_thread_push( + struct turnstile *turnstile, + thread_t thread) +{ + int priority = 0; + switch (turnstile_promote_policy[turnstile_get_type(turnstile)]) { + case TURNSTILE_USER_PROMOTE: + case TURNSTILE_USER_IPC_PROMOTE: + priority = thread->base_pri; + break; + case TURNSTILE_KERNEL_PROMOTE: + /* + * Ideally this should be policy based + * according to the turnstile type. + * + * The priority with which each thread pushes on + * a primitive should be primitive dependent. + */ + priority = thread->sched_pri; + priority = MAX(priority, thread->base_pri); + priority = MAX(priority, BASEPRI_DEFAULT); + priority = MIN(priority, MAXPRI_PROMOTE); + break; + default: + panic("turnstile promotion for type %d not yet implemented", turnstile_get_type(turnstile)); + } + + return priority; +} + +/* + * Name: turnstile_waitq_add_thread_priority_queue + * + * Description: add thread to the turnstile wq + * + * Arg1: turnstile wq + * Arg2: thread to add + * + * Condition: wq locked + */ +void +turnstile_waitq_add_thread_priority_queue( + struct waitq *wq, + thread_t thread) +{ + struct turnstile *turnstile = waitq_to_turnstile(wq); + int priority = turnstile_compute_thread_push(turnstile, thread); + + KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, + (TURNSTILE_CODE(TURNSTILE_HEAP_OPERATIONS, (THREAD_ADDED_TO_TURNSTILE_WAITQ))) | DBG_FUNC_NONE, + VM_KERNEL_UNSLIDE_OR_PERM(turnstile), + thread_tid(thread), + priority, 0, 0); + /* + * For turnstile queues (which use priority queues), + * insert the thread in the heap based on its priority. + * Note that the priority queue implementation + * is currently not stable, so does not maintain fifo for + * threads at the same pri. Also, if the pri + * of the thread changes while its blocked in the waitq, + * the thread position should be updated in the priority + * queue by calling priority queue increase/decrease + * operations. + */ + priority_queue_entry_init(&(thread->wait_prioq_links)); + priority_queue_insert(&wq->waitq_prio_queue, + &thread->wait_prioq_links, priority, + PRIORITY_QUEUE_SCHED_PRI_MAX_HEAP_COMPARE); +} + /* * Name: turnstile_recompute_priority_locked * @@ -1854,12 +2327,13 @@ turnstile_recompute_priority_locked( boolean_t needs_priority_update = FALSE; thread_t max_thread = THREAD_NULL; struct turnstile *max_turnstile; - int thread_max_pri = MAXPRI_THROTTLE; - int turnstile_max_pri = MAXPRI_THROTTLE; + int thread_max_pri = 0; + int turnstile_max_pri = 0; switch (turnstile_promote_policy[turnstile_get_type(turnstile)]) { case TURNSTILE_USER_PROMOTE: case TURNSTILE_USER_IPC_PROMOTE: + case TURNSTILE_KERNEL_PROMOTE: old_priority = turnstile->ts_priority; @@ -1875,6 +2349,7 @@ turnstile_recompute_priority_locked( struct turnstile, ts_inheritor_links); if (max_turnstile) { + assert(turnstile_promote_policy[turnstile_get_type(turnstile)] != TURNSTILE_KERNEL_PROMOTE); turnstile_max_pri = priority_queue_entry_key(&turnstile->ts_inheritor_queue, &max_turnstile->ts_inheritor_links); } @@ -1896,8 +2371,6 @@ turnstile_recompute_priority_locked( break; case TURNSTILE_PROMOTE_NONE: - case TURNSTILE_KERNEL_PROMOTE: - /* The turnstile was repurposed, do nothing */ break; @@ -1990,6 +2463,139 @@ turnstile_workq_proprietor_of_max_turnstile( return max_priority; } +/* + * Name: turnstile_workloop_pusher_info + * + * Description: Returns the priority of the turnstile push for a workloop, + * and the thread or knote responsible for this push. + * + * Args: workloop turnstile + * + * Returns: + * Priority of the push or 0 + * Thread (with a +1 reference) with that push or THREAD_NULL. + * Port (with a +1 reference) with that push, or IP_NULL. + * Sync IPC knote with the highest push (or NULL) + */ +int +turnstile_workloop_pusher_info( + struct turnstile *turnstile, + thread_t *thread_out, + ipc_port_t *port_out, + struct knote **knote_out) +{ + struct turnstile *max_ts; + thread_t max_thread; + int max_thread_pri = 0; + int max_ts_pri = 0; + ipc_port_t port; + + assert(turnstile_get_type(turnstile) == TURNSTILE_WORKLOOPS); + + spl_t s = splsched(); + waitq_lock(&turnstile->ts_waitq); + + max_thread = priority_queue_max(&turnstile->ts_waitq.waitq_prio_queue, + struct thread, wait_prioq_links); + if (max_thread) { + max_thread_pri = priority_queue_entry_key( + &turnstile->ts_waitq.waitq_prio_queue, + &max_thread->wait_prioq_links); + } + + max_ts = priority_queue_max(&turnstile->ts_inheritor_queue, + struct turnstile, ts_inheritor_links); + if (max_ts) { + max_ts_pri = priority_queue_entry_key(&turnstile->ts_inheritor_queue, + &max_ts->ts_inheritor_links); + } + + /* + * Reasons to push on a workloop turnstile are: + * + * 1. threads in dispatch sync + * + * 2. sync IPC pushes, which in turn have 4 sub-cases: + * + * 2.a. special reply port or receive right pushing through a knote + * turnstile, + * + * 2.b. special reply port stashed on a knote, pushing on the workloop + * directly, + * + * 2.c. receive right stashed on a knote, pushing on the workloop + * directly, + * + * 2.d. a receive right monitored by a knote, pushing on the workloop + * directly. + * + * See ipc_port_send_update_inheritor(), ipc_port_recv_update_inheritor(). + * + * Note: dereferencing the knote in the caller is safe provided this + * function i scalled under the proper interlocks (the filt_wllock + req + * lock) which serializes with the knote going away. + */ + if (max_thread_pri > max_ts_pri) { + thread_reference(max_thread); + *thread_out = max_thread; + *port_out = NULL; + *knote_out = NULL; + } else if (max_ts_pri) { + switch (turnstile_get_type(max_ts)) { + case TURNSTILE_KNOTE: + /* 2.a. */ + *thread_out = THREAD_NULL; + *port_out = IP_NULL; + *knote_out = (struct knote *)max_ts->ts_proprietor; + break; + + case TURNSTILE_SYNC_IPC: + /* 2.[bcd] */ + port = (ipc_port_t)max_ts->ts_proprietor; + ip_reference(port); + *thread_out = THREAD_NULL; + *port_out = port; + *knote_out = NULL; + break; + + default: + panic("Unexpected type for turnstile %p", max_ts); + } + } else { + *thread_out = THREAD_NULL; + *port_out = IP_NULL; + *knote_out = NULL; + } + + waitq_unlock(&turnstile->ts_waitq); + splx(s); + + return max(max_thread_pri, max_ts_pri); +} + +/* + * Name: turnstile_has_waiters + * + * Description: returns if there are waiters on the turnstile + * + * Arg1: turnstile: turnstile + * + * Returns: TRUE if there are waiters, FALSE otherwise. + */ + +boolean_t +turnstile_has_waiters(struct turnstile *turnstile) +{ + boolean_t ret; + + spl_t s = splsched(); + waitq_lock(&turnstile->ts_waitq); + ret = !priority_queue_empty(&turnstile->ts_waitq.waitq_prio_queue); + waitq_unlock(&turnstile->ts_waitq); + splx(s); + + return ret; +} /* * Name: turnstile_update_inheritor_priority_chain @@ -2023,8 +2629,8 @@ turnstile_update_inheritor_priority_chain( if (turnstile_flags & TURNSTILE_INHERITOR_THREAD) { thread = inheritor; thread_lock(thread); - //TODO: Need to call sched promotion for kernel mutex. thread_recompute_user_promotion_locked(thread); + thread_recompute_kernel_promotion_locked(thread); } else if (turnstile_flags & TURNSTILE_INHERITOR_TURNSTILE) { turnstile = inheritor; waitq_lock(&turnstile->ts_waitq); @@ -2151,6 +2757,23 @@ turnstile_cleanup(void) } } +/* + * Name: turnstile_update_thread_priority_chain + * + * Description: Priority of a thread blocked on a turnstile + * has changed, update the turnstile priority. + * + * Arg1: thread: thread whose priority has changed. + * + * Returns: None. + */ +void +turnstile_update_thread_priority_chain(thread_t thread) +{ + turnstile_update_inheritor_priority_chain(thread, + TURNSTILE_INHERITOR_THREAD | TURNSTILE_UPDATE_BOOST); +} + /* * Name: turnstile_update_inheritor_workq_priority_chain * @@ -2177,6 +2800,7 @@ turnstile_update_inheritor_workq_priority_chain(struct turnstile *turnstile, spl if (!workq_lock_held) { workq_reference(wq); + disable_preemption(); } waitq_unlock(&turnstile->ts_waitq); splx(s); @@ -2184,6 +2808,7 @@ turnstile_update_inheritor_workq_priority_chain(struct turnstile *turnstile, spl workq_schedule_creator_turnstile_redrive(wq, workq_lock_held); if (!workq_lock_held) { + enable_preemption(); workq_deallocate_safe(wq); } } @@ -2478,14 +3103,14 @@ turnstile_stats_update( /* * Check if turnstile stats needs to be updated. * Bail out if the turnstile or thread does not - * have any user promotion, i.e. pri 4. + * have any user promotion. * Bail out if it is the first hop of WQ turnstile * since WQ's use of a turnstile for the admission check * introduces a lot of noise due to state changes. */ if (flags & TSU_TURNSTILE_ARG) { struct turnstile *ts = (struct turnstile *)inheritor; - if (ts->ts_priority <= MAXPRI_THROTTLE) { + if (ts->ts_priority == 0) { return; } @@ -2494,7 +3119,7 @@ turnstile_stats_update( } } else if (flags & TSU_THREAD_ARG) { thread_t thread = (thread_t)inheritor; - if (thread->user_promotion_basepri <= MAXPRI_THROTTLE) { + if (thread->user_promotion_basepri == 0) { return; } } else { @@ -2534,6 +3159,60 @@ turnstile_stats_update( #endif } +static uint64_t +kdp_turnstile_traverse_inheritor_chain(struct turnstile *ts, uint64_t *flags, uint8_t *hops) +{ + if (waitq_held(&ts->ts_waitq)) { + *flags |= STACKSHOT_TURNSTILE_STATUS_LOCKED_WAITQ; + return 0; + } + + *hops = *hops + 1; + + if (ts->ts_inheritor_flags & TURNSTILE_INHERITOR_TURNSTILE) { + return kdp_turnstile_traverse_inheritor_chain(ts->ts_inheritor, flags, hops); + } + + if (ts->ts_inheritor_flags & TURNSTILE_INHERITOR_THREAD) { + *flags |= STACKSHOT_TURNSTILE_STATUS_THREAD; + return (uint64_t) thread_tid(ts->ts_inheritor); + } + + if (ts->ts_inheritor_flags & TURNSTILE_INHERITOR_WORKQ) { + *flags |= STACKSHOT_TURNSTILE_STATUS_WORKQUEUE; + return VM_KERNEL_UNSLIDE_OR_PERM(ts->ts_inheritor); + } + + *flags |= STACKSHOT_TURNSTILE_STATUS_UNKNOWN; + return 0; +} + +void +kdp_turnstile_fill_tsinfo(struct turnstile *ts, thread_turnstileinfo_t *tsinfo) +{ + uint64_t final_inheritor; + uint64_t flags = 0; + uint8_t hops = 0; + + tsinfo->turnstile_context = 0; + tsinfo->number_of_hops = 0; + tsinfo->turnstile_priority = 0; + + assert(ts != TURNSTILE_NULL); + + if (waitq_held(&ts->ts_waitq)) { + tsinfo->turnstile_flags |= STACKSHOT_TURNSTILE_STATUS_LOCKED_WAITQ; + return; + } + + final_inheritor = kdp_turnstile_traverse_inheritor_chain(ts, &flags, &hops); + + /* store some metadata about the turnstile itself */ + tsinfo->turnstile_flags = flags; + tsinfo->number_of_hops = hops; + tsinfo->turnstile_priority = ts->ts_priority; + tsinfo->turnstile_context = final_inheritor; +} #if DEVELOPMENT || DEBUG @@ -2592,10 +3271,45 @@ tstile_test_prim_init(struct tstile_test_prim **test_prim_ptr) } int -tstile_test_prim_lock(boolean_t use_hashtable) +tstile_test_prim_lock(int val) { - struct tstile_test_prim *test_prim = use_hashtable ? test_prim_global_htable : test_prim_ts_inline; + struct tstile_test_prim *test_prim; + boolean_t use_hashtable; + turnstile_type_t type; + wait_interrupt_t wait_type; + + switch (val) { + case SYSCTL_TURNSTILE_TEST_USER_DEFAULT: + test_prim = test_prim_ts_inline; + use_hashtable = FALSE; + wait_type = THREAD_ABORTSAFE; + type = TURNSTILE_ULOCK; + break; + case SYSCTL_TURNSTILE_TEST_USER_HASHTABLE: + test_prim = test_prim_global_htable; + use_hashtable = TRUE; + wait_type = THREAD_ABORTSAFE; + type = TURNSTILE_ULOCK; + break; + case SYSCTL_TURNSTILE_TEST_KERNEL_DEFAULT: + test_prim = test_prim_global_ts_kernel; + use_hashtable = FALSE; + wait_type = THREAD_UNINT | THREAD_WAIT_NOREPORT_USER; + type = TURNSTILE_KERNEL_MUTEX; + break; + case SYSCTL_TURNSTILE_TEST_KERNEL_HASHTABLE: + test_prim = test_prim_global_ts_kernel_hash; + use_hashtable = TRUE; + wait_type = THREAD_UNINT | THREAD_WAIT_NOREPORT_USER; + type = TURNSTILE_KERNEL_MUTEX; + break; + + default: + return -1; + } + lock_start: + /* take the interlock of the primitive */ tstile_test_prim_lock_interlock(test_prim); @@ -2612,7 +3326,7 @@ lock_start: /* primitive locked, get a turnstile */ prim_turnstile = turnstile_prepare((uintptr_t)test_prim, use_hashtable ? NULL : &test_prim->ttprim_turnstile, - TURNSTILE_NULL, TURNSTILE_ULOCK); + TURNSTILE_NULL, type); assert(prim_turnstile != TURNSTILE_NULL); @@ -2629,12 +3343,11 @@ lock_start: turnstile_update_inheritor_complete(prim_turnstile, TURNSTILE_INTERLOCK_HELD); turnstile_complete((uintptr_t)test_prim, - use_hashtable ? NULL : &test_prim->ttprim_turnstile, NULL); + use_hashtable ? NULL : &test_prim->ttprim_turnstile, NULL, type); tstile_test_prim_unlock_interlock(test_prim); turnstile_cleanup(); - return 0; } @@ -2644,7 +3357,7 @@ lock_start: (TURNSTILE_DELAYED_UPDATE | TURNSTILE_INHERITOR_THREAD)); waitq_assert_wait64(&prim_turnstile->ts_waitq, - CAST_EVENT64_T(test_prim), THREAD_ABORTSAFE, + CAST_EVENT64_T(test_prim), wait_type, TIMEOUT_WAIT_FOREVER); /* drop the interlock */ @@ -2659,7 +3372,7 @@ lock_start: tstile_test_prim_lock_interlock(test_prim); test_prim->tt_prim_waiters--; turnstile_complete((uintptr_t)test_prim, - use_hashtable ? NULL : &test_prim->ttprim_turnstile, NULL); + use_hashtable ? NULL : &test_prim->ttprim_turnstile, NULL, type); tstile_test_prim_unlock_interlock(test_prim); @@ -2674,9 +3387,37 @@ lock_start: } int -tstile_test_prim_unlock(boolean_t use_hashtable) +tstile_test_prim_unlock(int val) { - struct tstile_test_prim *test_prim = use_hashtable ? test_prim_global_htable : test_prim_ts_inline; + struct tstile_test_prim *test_prim; + boolean_t use_hashtable; + turnstile_type_t type; + + switch (val) { + case SYSCTL_TURNSTILE_TEST_USER_DEFAULT: + test_prim = test_prim_ts_inline; + use_hashtable = FALSE; + type = TURNSTILE_ULOCK; + break; + case SYSCTL_TURNSTILE_TEST_USER_HASHTABLE: + test_prim = test_prim_global_htable; + use_hashtable = TRUE; + type = TURNSTILE_ULOCK; + break; + case SYSCTL_TURNSTILE_TEST_KERNEL_DEFAULT: + test_prim = test_prim_global_ts_kernel; + use_hashtable = FALSE; + type = TURNSTILE_KERNEL_MUTEX; + break; + case SYSCTL_TURNSTILE_TEST_KERNEL_HASHTABLE: + test_prim = test_prim_global_ts_kernel_hash; + use_hashtable = TRUE; + type = TURNSTILE_KERNEL_MUTEX; + break; + default: + return -1; + } + /* take the interlock of the primitive */ tstile_test_prim_lock_interlock(test_prim); @@ -2704,7 +3445,7 @@ tstile_test_prim_unlock(boolean_t use_hashtable) /* primitive locked, get a turnstile */ prim_turnstile = turnstile_prepare((uintptr_t)test_prim, use_hashtable ? NULL : &test_prim->ttprim_turnstile, - TURNSTILE_NULL, TURNSTILE_ULOCK); + TURNSTILE_NULL, type); assert(prim_turnstile != TURNSTILE_NULL); @@ -2715,12 +3456,12 @@ tstile_test_prim_unlock(boolean_t use_hashtable) waitq_wakeup64_one(&prim_turnstile->ts_waitq, CAST_EVENT64_T(test_prim), - THREAD_AWAKENED, WAITQ_SELECT_MAX_PRI); + THREAD_AWAKENED, WAITQ_ALL_PRIORITIES); turnstile_update_inheritor_complete(prim_turnstile, TURNSTILE_INTERLOCK_HELD); turnstile_complete((uintptr_t)test_prim, - use_hashtable ? NULL : &test_prim->ttprim_turnstile, NULL); + use_hashtable ? NULL : &test_prim->ttprim_turnstile, NULL, type); tstile_test_prim_unlock_interlock(test_prim); diff --git a/osfmk/kern/turnstile.h b/osfmk/kern/turnstile.h index b67497581..6050fa3d9 100644 --- a/osfmk/kern/turnstile.h +++ b/osfmk/kern/turnstile.h @@ -53,6 +53,8 @@ struct turnstile_stats { #include #include #include +#include +#include /* * turnstile_type_t : Indicates the type of primitive the turnstile is associated with @@ -67,7 +69,8 @@ typedef enum __attribute__((packed)) turnstile_type { TURNSTILE_WORKLOOPS = 5, TURNSTILE_WORKQS = 6, TURNSTILE_KNOTE = 7, - TURNSTILE_TOTAL_TYPES = 8, + TURNSTILE_SLEEP_INHERITOR = 8, + TURNSTILE_TOTAL_TYPES = 9, } turnstile_type_t; /* @@ -112,6 +115,12 @@ typedef enum __attribute__((packed)) turnstile_type { * TURNSTILE_KNOTE * Interlock: the knote lock * Inheritor: WL turnstile + * + * TURNSTILE_SLEEP_INHERITOR + * Interlock: turnstile_htable bucket spinlock. + * Inheritor: threads. + * Lock order: turnstile lock, thread lock. + * */ typedef enum __attribute__((flag_enum)) turnstile_promote_policy { @@ -121,6 +130,12 @@ typedef enum __attribute__((flag_enum)) turnstile_promote_policy { TURNSTILE_USER_IPC_PROMOTE = 0x4, } turnstile_promote_policy_t; +typedef enum __attribute__((flag_enum)) turnstile_hash_lock_policy { + TURNSTILE_HASH_LOCK_POLICY_NONE = 0, + TURNSTILE_IRQ_UNSAFE_HASH = 0x1, + TURNSTILE_LOCKED_HASH = 0x2, +} turnstile_hash_lock_policy_t; + /* * Turnstile state flags * @@ -178,7 +193,7 @@ MACRO_END #endif /* DEVELOPMENT || DEBUG */ -/* Foward declaration of turnstile */ +struct knote; struct turnstile; /* @@ -311,7 +326,7 @@ struct turnstile { struct priority_queue ts_inheritor_queue; /* Queue of turnstile with us as an inheritor (WL) */ union { struct priority_queue_entry ts_inheritor_links; /* Inheritor queue links */ - queue_chain_t ts_deallocate_link; /* thread deallocate link */ + struct mpsc_queue_chain ts_deallocate_link; /* thread deallocate link */ }; SLIST_ENTRY(turnstile) ts_htable_link; /* linkage for turnstile in global hash table */ uintptr_t ts_proprietor; /* hash key lookup turnstile (IL) */ @@ -333,7 +348,7 @@ struct turnstile { /* IL - interlock, WL - turnstile lock i.e. waitq lock */ -#define TURNSTILE_PROPRIETOR_NULL 0 +#define TURNSTILE_PROPRIETOR_NULL 0ul /* * Name: turnstiles_init @@ -398,6 +413,21 @@ turnstile_reference(struct turnstile *turnstile); void turnstile_deallocate(struct turnstile *turnstile); +/* + * Name: turnstile_waitq_add_thread_priority_queue + * + * Description: add thread to the turnstile waitq + * + * Arg1: waitq + * Arg2: thread + * + * Conditions: waitq locked + */ +void +turnstile_waitq_add_thread_priority_queue( + struct waitq* wq, + thread_t thread); + /* * Name: turnstile_deallocate_safe * @@ -464,6 +494,27 @@ turnstile_workq_proprietor_of_max_turnstile( struct turnstile *turnstile, uintptr_t *proprietor); +/* + * Name: turnstile_workloop_pusher_info + * + * Description: Returns the priority of the turnstile push for a workloop, + * and the thread or knote responsible for this push. + * + * Args: workloop turnstile + * + * Returns: + * Priority of the push or 0 + * Thread (with a +1 reference) with that push or THREAD_NULL. + * Port (with a +1 reference) with that push, or IP_NULL. + * Sync IPC knote with the highest push (or NULL) + */ +int +turnstile_workloop_pusher_info( + struct turnstile *turnstile, + thread_t *thread, + ipc_port_t *port, + struct knote **knote_out); + /* * Name: turnstile_cleanup * @@ -477,6 +528,19 @@ turnstile_workq_proprietor_of_max_turnstile( void turnstile_cleanup(void); +/* + * Name: turnstile_update_thread_priority_chain + * + * Description: Priority of a thread blocked on a turnstile + * has changed, update the turnstile priority. + * + * Arg1: thread: thread whose priority has changed. + * + * Returns: None. + */ +void +turnstile_update_thread_priority_chain(thread_t thread); + /* * Name: turnstile_update_inheritor_locked * @@ -494,18 +558,32 @@ void turnstile_update_inheritor_locked(struct turnstile *turnstile); /* - * Name: thread_get_inheritor_turnstile_priority + * Name: thread_get_inheritor_turnstile_base_priority * - * Description: Get the max priority of all the inheritor turnstiles + * Description: Get the max base priority of all the inheritor turnstiles * * Arg1: thread * - * Returns: Max priority of all the inheritor turnstiles. + * Returns: Max base priority of all the inheritor turnstiles. * * Condition: thread locked */ int -thread_get_inheritor_turnstile_priority(thread_t thread); +thread_get_inheritor_turnstile_base_priority(thread_t thread); + +/* + * Name: thread_get_inheritor_turnstile_sched_priority + * + * Description: Get the max sched priority of all the inheritor turnstiles + * + * Arg1: thread + * + * Returns: Max sched priority of all the inheritor turnstiles. + * + * Condition: thread locked + */ +int +thread_get_inheritor_turnstile_sched_priority(thread_t thread); /* * Name: thread_get_waiting_turnstile @@ -529,6 +607,7 @@ thread_get_waiting_turnstile(thread_t thread); * turnstile hash. * * Arg1: port + * Arg2: turnstile_type_t type * * Returns: turnstile: if the proprietor has a turnstile. * TURNSTILE_NULL: otherwise. @@ -536,7 +615,20 @@ thread_get_waiting_turnstile(thread_t thread); * Condition: proprietor interlock held. */ struct turnstile * -turnstile_lookup_by_proprietor(uintptr_t proprietor); +turnstile_lookup_by_proprietor(uintptr_t proprietor, turnstile_type_t type); + +/* + * Name: turnstile_has_waiters + * + * Description: returns if there are waiters on the turnstile + * + * Arg1: turnstile: turnstile + * + * Returns: TRUE if there are waiters, FALSE otherwise. + */ + +boolean_t +turnstile_has_waiters(struct turnstile *turnstile); /* * Name: turnstile_stats_update @@ -557,12 +649,17 @@ turnstile_stats_update( #if DEVELOPMENT || DEBUG +#define SYSCTL_TURNSTILE_TEST_USER_DEFAULT 1 +#define SYSCTL_TURNSTILE_TEST_USER_HASHTABLE 2 +#define SYSCTL_TURNSTILE_TEST_KERNEL_DEFAULT 3 +#define SYSCTL_TURNSTILE_TEST_KERNEL_HASHTABLE 4 + /* Functions used by debug test primitive exported by sysctls */ int -tstile_test_prim_lock(boolean_t use_hashtable); +tstile_test_prim_lock(int val); int -tstile_test_prim_unlock(boolean_t use_hashtable); +tstile_test_prim_unlock(int val); int turnstile_get_boost_stats_sysctl(void *req); @@ -573,6 +670,42 @@ turnstile_get_unboost_stats_sysctl(void *req); /* Interface */ +/* + * Name: turnstile_hash_bucket_lock + * + * Description: locks the spinlock associated with proprietor's bucket. + * if proprietor is specified the index for the hash will be + * recomputed and returned in index_proprietor, + * otherwise the value save in index_proprietor is used as index. + * + * Args: + * Arg1: proprietor (key) for hashing + * Arg2: index for proprietor in the hash + * Arg3: turnstile type + * + * Returns: old value of irq if irq were disabled before acquiring the lock. + */ +unsigned +turnstile_hash_bucket_lock(uintptr_t proprietor, uint32_t *index_proprietor, turnstile_type_t type); + +/* + * Name: turnstile_hash_bucket_unlock + * + * Description: unlocks the spinlock associated with proprietor's bucket. + * if proprietor is specified the index for the hash will be + * recomputed and returned in index_proprietor, + * otherwise the value save in index_proprietor is used as index. + * + * Args: + * Arg1: proprietor (key) for hashing + * Arg2: index for proprietor in the hash + * Arg3: turnstile type + * Arg4: irq value returned by turnstile_hash_bucket_lock + * + */ +void +turnstile_hash_bucket_unlock(uintptr_t proprietor, uint32_t *index_proprietor, turnstile_type_t type, unsigned s); + /* * Name: turnstile_prepare * @@ -609,6 +742,7 @@ turnstile_prepare( * Arg1: proprietor * Arg2: pointer in primitive struct to update turnstile * Arg3: pointer to store the returned turnstile instead of attaching it to thread + * Arg4: type of primitive * * Returns: * None. @@ -617,7 +751,8 @@ void turnstile_complete( uintptr_t proprietor, struct turnstile **tstore, - struct turnstile **turnstile); + struct turnstile **turnstile, + turnstile_type_t type); /* * Name: turnstile_update_inheritor @@ -665,6 +800,46 @@ turnstile_update_inheritor_complete( struct turnstile *turnstile, turnstile_update_complete_flags_t flags); + +/* + * Name: turnstile_kernel_update_inheritor_on_wake_locked + * + * Description: Set thread as the inheritor of the turnstile and + * boost the inheritor. + * Args: + * Arg1: turnstile + * Arg2: new_inheritor + * Arg3: flags + * + * Called with turnstile locked + */ +void +turnstile_kernel_update_inheritor_on_wake_locked( + struct turnstile *turnstile, + turnstile_inheritor_t new_inheritor, + turnstile_update_flags_t flags); + +/* + * Internal KPI for sleep_with_inheritor, wakeup_with_inheritor, change_sleep_inheritor + * meant to allow specifing the turnstile type to use to have different policy + * on how to push on the inheritor. + * + * Differently from the "standard" KPI in locks.h these are meant to be used only + * if you know what you are doing with turnstile. + */ + +extern wait_result_t +lck_mtx_sleep_with_inheritor_and_turnstile_type(lck_mtx_t *lock, lck_sleep_action_t lck_sleep_action, event_t event, thread_t inheritor, wait_interrupt_t interruptible, uint64_t deadline, turnstile_type_t type); + +extern wait_result_t +lck_rw_sleep_with_inheritor_and_turnstile_type(lck_rw_t *lock, lck_sleep_action_t lck_sleep_action, event_t event, thread_t inheritor, wait_interrupt_t interruptible, uint64_t deadline, turnstile_type_t type); + +extern kern_return_t +wakeup_with_inheritor_and_turnstile_type(event_t event, turnstile_type_t type, wait_result_t result, bool wake_one, lck_wake_action_t action, thread_t *thread_wokenup); + +extern kern_return_t +change_sleep_inheritor_and_turnstile_type(event_t event, thread_t inheritor, turnstile_type_t type); + #endif /* KERNEL_PRIVATE */ #if XNU_KERNEL_PRIVATE @@ -673,14 +848,10 @@ struct workqueue; /* pthread_workqueue.c */ extern void workq_reference(struct workqueue *wq); extern void workq_deallocate_safe(struct workqueue *wq); -extern void workq_destroy(struct workqueue *wq); extern bool workq_is_current_thread_updating_turnstile(struct workqueue *wq); extern void workq_schedule_creator_turnstile_redrive(struct workqueue *wq, bool locked); -/* thread.c */ -extern void workq_deallocate_enqueue(struct workqueue *wq); - #endif /* XNU_KERNEL_PRIVATE */ #endif /* _TURNSTILE_H_ */ diff --git a/osfmk/kern/ux_handler.c b/osfmk/kern/ux_handler.c index a20237379..0329eeea6 100644 --- a/osfmk/kern/ux_handler.c +++ b/osfmk/kern/ux_handler.c @@ -68,13 +68,8 @@ SECURITY_READ_ONLY_LATE(ipc_port_t) ux_handler_port = IP_NULL; void ux_handler_init(void) { - ux_handler_port = ipc_port_alloc_kernel(); - - if (ux_handler_port == IP_NULL) { - panic("can't allocate unix exception port"); - } - - ipc_kobject_set(ux_handler_port, (ipc_kobject_t)&ux_handler_kobject, IKOT_UX_HANDLER); + ux_handler_port = ipc_kobject_alloc_port((ipc_kobject_t)&ux_handler_kobject, + IKOT_UX_HANDLER, IPC_KOBJECT_ALLOC_NONE); } /* diff --git a/osfmk/kern/waitq.c b/osfmk/kern/waitq.c index 1a38d76fe..2348ef572 100644 --- a/osfmk/kern/waitq.c +++ b/osfmk/kern/waitq.c @@ -164,7 +164,7 @@ lck_grp_t waitq_lck_grp; * Prepost callback function for specially marked waitq sets * (prepost alternative) */ -extern void waitq_set__CALLING_PREPOST_HOOK__(void *ctx, void *memberctx, int priority); +extern void waitq_set__CALLING_PREPOST_HOOK__(waitq_set_prepost_hook_t *ctx); #define DEFAULT_MIN_FREE_TABLE_ELEM 100 static uint32_t g_min_free_table_elem; @@ -1706,7 +1706,7 @@ waitq_grab_backtrace(uintptr_t bt[NWAITQ_BTFRAMES], int skip) skip = 0; } memset(buf, 0, (NWAITQ_BTFRAMES + skip) * sizeof(uintptr_t)); - backtrace(buf, g_nwaitq_btframes + skip); + backtrace(buf, g_nwaitq_btframes + skip, NULL); memcpy(&bt[0], &buf[skip], NWAITQ_BTFRAMES * sizeof(uintptr_t)); } #else /* no stats */ @@ -1850,29 +1850,8 @@ waitq_thread_insert(struct waitq *wq, thread_t thread, boolean_t fifo) { if (waitq_is_turnstile_queue(wq)) { - KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, - (TURNSTILE_CODE(TURNSTILE_HEAP_OPERATIONS, (THREAD_ADDED_TO_TURNSTILE_WAITQ))) | DBG_FUNC_NONE, - VM_KERNEL_UNSLIDE_OR_PERM(waitq_to_turnstile(wq)), - thread_tid(thread), - thread->base_pri, 0, 0); - turnstile_stats_update(0, TSU_TURNSTILE_BLOCK_COUNT, NULL); - - /* - * For turnstile queues (which use priority queues), - * insert the thread in the heap based on its current - * base_pri. Note that the priority queue implementation - * is currently not stable, so does not maintain fifo for - * threads at the same base_pri. Also, if the base_pri - * of the thread changes while its blocked in the waitq, - * the thread position should be updated in the priority - * queue by calling priority queue increase/decrease - * operations. - */ - priority_queue_entry_init(&(thread->wait_prioq_links)); - priority_queue_insert(&wq->waitq_prio_queue, - &thread->wait_prioq_links, thread->base_pri, - PRIORITY_QUEUE_SCHED_PRI_MAX_HEAP_COMPARE); + turnstile_waitq_add_thread_priority_queue(wq, thread); } else { turnstile_stats_update(0, TSU_REGULAR_WAITQ_BLOCK_COUNT, NULL); if (fifo) { @@ -2059,6 +2038,7 @@ struct waitq_select_args { event64_t event; waitq_select_cb select_cb; void *select_ctx; + int priority; uint64_t *reserved_preposts; @@ -2119,16 +2099,13 @@ waitq_select_walk_cb(struct waitq *waitq, void *ctx, */ do_waitq_select_n_locked(&args); - if (*(args.nthreads) > 0 || - (args.threadq && !queue_empty(args.threadq))) { + if (*args.nthreads > 0 || (args.threadq && !queue_empty(args.threadq))) { /* at least 1 thread was selected and returned: don't prepost */ - if (args.max_threads > 0 && - *(args.nthreads) >= args.max_threads) { + if (args.max_threads > 0 && *args.nthreads >= args.max_threads) { /* break out of the setid walk */ ret = WQ_ITERATE_FOUND; } - goto out_unlock; - } else { + } else if (args.event == NO_EVENT64) { /* * No thread selected: prepost 'waitq' to 'wqset' * if wqset can handle preposts and the event is set to 0. @@ -2139,14 +2116,39 @@ waitq_select_walk_cb(struct waitq *waitq, void *ctx, * callout function and pass the set's 'prepost_hook.' This * could potentially release another thread to handle events. */ - if (args.event == NO_EVENT64) { - if (waitq_set_can_prepost(wqset)) { - wq_prepost_do_post_locked( - wqset, waitq, args.reserved_preposts); - } else if (waitq_set_has_prepost_hook(wqset)) { - waitq_set__CALLING_PREPOST_HOOK__( - wqset->wqset_prepost_hook, waitq, 0); - } + if (waitq_set_can_prepost(wqset)) { + wq_prepost_do_post_locked( + wqset, waitq, args.reserved_preposts); + } else if (waitq_set_has_prepost_hook(wqset)) { + waitq_set_prepost_hook_t *hook = wqset->wqset_prepost_hook; + + /* + * When calling out to the prepost hook, + * we drop the waitq lock, to allow for the kevent + * subsytem to call into the waitq subsystem again, + * without risking a deadlock. + * + * However, we need to guard against wqset going away, + * so we increment the prepost hook use count + * while the lock is dropped. + * + * This lets waitq_set_deinit() know to wait for the + * prepost hook call to be done before it can proceed. + * + * Note: we need to keep preemption disabled the whole + * time as waitq_set_deinit will spin on this. + */ + + disable_preemption(); + os_atomic_inc(hook, relaxed); + waitq_set_unlock(wqset); + + waitq_set__CALLING_PREPOST_HOOK__(hook); + + /* Note: after this decrement, the wqset may be deallocated */ + os_atomic_dec(hook, relaxed); + enable_preemption(); + return ret; } } @@ -2324,6 +2326,13 @@ waitq_prioq_iterate_locked(struct waitq *safeq, struct waitq *waitq, if (first_thread == THREAD_NULL) { first_thread = thread; + /* + * turnstile_kernel_update_inheritor_on_wake_locked will lock + * first_thread, so call it before locking it. + */ + if (args->priority == WAITQ_PROMOTE_ON_WAKE && first_thread != THREAD_NULL && waitq_is_turnstile_queue(safeq)) { + turnstile_kernel_update_inheritor_on_wake_locked(waitq_to_turnstile(safeq), (turnstile_inheritor_t)first_thread, TURNSTILE_INHERITOR_THREAD); + } } /* For the peek operation, break out early */ @@ -2431,6 +2440,7 @@ do_waitq_select_n_locked(struct waitq_select_args *args) /* we know this is the first (and only) thread */ ++(*nthreads); *(args->spl) = (safeq != waitq) ? spl : splsched(); + thread_lock(first_thread); thread_clear_waitq_state(first_thread); waitq_thread_remove(safeq, first_thread); @@ -2510,7 +2520,8 @@ waitq_select_n_locked(struct waitq *waitq, void *select_ctx, uint64_t *reserved_preposts, queue_t threadq, - int max_threads, spl_t *spl) + int max_threads, spl_t *spl, + int priority) { int nthreads = 0; @@ -2520,6 +2531,7 @@ waitq_select_n_locked(struct waitq *waitq, .event = event, .select_cb = select_cb, .select_ctx = select_ctx, + .priority = priority, .reserved_preposts = reserved_preposts, .threadq = threadq, .max_threads = max_threads, @@ -2547,14 +2559,13 @@ waitq_select_one_locked(struct waitq *waitq, event64_t event, uint64_t *reserved_preposts, int priority, spl_t *spl) { - (void)priority; int nthreads; queue_head_t threadq; queue_init(&threadq); nthreads = waitq_select_n_locked(waitq, event, NULL, NULL, - reserved_preposts, &threadq, 1, spl); + reserved_preposts, &threadq, 1, spl, priority); /* if we selected a thread, return it (still locked) */ if (!queue_empty(&threadq)) { @@ -2569,96 +2580,6 @@ waitq_select_one_locked(struct waitq *waitq, event64_t event, return THREAD_NULL; } -struct find_max_pri_ctx { - integer_t max_sched_pri; - integer_t max_base_pri; - thread_t highest_thread; -}; - -/** - * callback function that finds the max priority thread - * - * Conditions: - * 'waitq' is locked - * 'thread' is not locked - */ -static thread_t -waitq_find_max_pri_cb(void *ctx_in, - __unused struct waitq *waitq, - __unused int is_global, - thread_t thread) -{ - struct find_max_pri_ctx *ctx = (struct find_max_pri_ctx *)ctx_in; - - /* - * thread is not locked, use pri as a hint only - * wake up the highest base pri, and find the highest sched pri at that base pri - */ - integer_t sched_pri = *(volatile int16_t *)&thread->sched_pri; - integer_t base_pri = *(volatile int16_t *)&thread->base_pri; - - if (ctx->highest_thread == THREAD_NULL || - (base_pri > ctx->max_base_pri) || - (base_pri == ctx->max_base_pri && sched_pri > ctx->max_sched_pri)) { - /* don't select the thread, just update ctx */ - - ctx->max_sched_pri = sched_pri; - ctx->max_base_pri = base_pri; - ctx->highest_thread = thread; - } - - return THREAD_NULL; -} - -/** - * select from a waitq the highest priority thread waiting for a given event - * - * Conditions: - * 'waitq' is locked - * - * Returns: - * A locked thread that's been removed from the waitq, but has not - * yet been put on a run queue. Caller is responsible to call splx - * with the '*spl' value. - */ -static thread_t -waitq_select_max_locked(struct waitq *waitq, event64_t event, - uint64_t *reserved_preposts, - spl_t *spl) -{ - __assert_only int nthreads; - assert(!waitq->waitq_set_id); /* doesn't support recursive sets */ - - struct find_max_pri_ctx ctx = { - .max_sched_pri = 0, - .max_base_pri = 0, - .highest_thread = THREAD_NULL, - }; - - /* - * Scan the waitq to find the highest priority thread. - * This doesn't remove any thread from the queue - */ - nthreads = waitq_select_n_locked(waitq, event, - waitq_find_max_pri_cb, - &ctx, reserved_preposts, NULL, 1, spl); - - assert(nthreads == 0); - - if (ctx.highest_thread != THREAD_NULL) { - __assert_only kern_return_t ret; - - /* Remove only the thread we just found */ - ret = waitq_select_thread_locked(waitq, event, ctx.highest_thread, spl); - - assert(ret == KERN_SUCCESS); - return ctx.highest_thread; - } - - return THREAD_NULL; -} - - struct select_thread_ctx { thread_t thread; event64_t event; @@ -3051,9 +2972,6 @@ maybe_adjust_thread_pri(thread_t thread, } sched_thread_promote_reason(thread, TH_SFLAG_WAITQ_PROMOTED, trace_waitq); - } else if (priority > 0) { - /* Mutex subsystem wants to see this thread before we 'go' it */ - lck_mtx_wakeup_adjust_pri(thread, priority); } } @@ -3123,7 +3041,7 @@ waitq_wakeup64_all_locked(struct waitq *waitq, nthreads = waitq_select_n_locked(waitq, wake_event, NULL, NULL, reserved_preposts, - &wakeup_queue, -1, &th_spl); + &wakeup_queue, -1, &th_spl, priority); /* set each thread running */ ret = KERN_NOT_WAITING; @@ -3175,16 +3093,9 @@ waitq_wakeup64_one_locked(struct waitq *waitq, assert(waitq_held(waitq)); - if (priority == WAITQ_SELECT_MAX_PRI) { - thread = waitq_select_max_locked(waitq, wake_event, - reserved_preposts, - &th_spl); - } else { - thread = waitq_select_one_locked(waitq, wake_event, - reserved_preposts, - priority, &th_spl); - } - + thread = waitq_select_one_locked(waitq, wake_event, + reserved_preposts, + priority, &th_spl); if (thread != THREAD_NULL) { waitq_stats_count_wakeup(waitq); @@ -3233,15 +3144,9 @@ waitq_wakeup64_identify_locked(struct waitq *waitq, assert(waitq_held(waitq)); - if (priority == WAITQ_SELECT_MAX_PRI) { - thread = waitq_select_max_locked(waitq, wake_event, - reserved_preposts, - spl); - } else { - thread = waitq_select_one_locked(waitq, wake_event, - reserved_preposts, - priority, spl); - } + thread = waitq_select_one_locked(waitq, wake_event, + reserved_preposts, + priority, spl); if (thread != THREAD_NULL) { waitq_stats_count_wakeup(waitq); @@ -3508,7 +3413,7 @@ wqset_clear_prepost_chain_cb(struct waitq_set __unused *wqset, * NULL on failure */ struct waitq_set * -waitq_set_alloc(int policy, void *prepost_hook) +waitq_set_alloc(int policy, waitq_set_prepost_hook_t *prepost_hook) { struct waitq_set *wqset; @@ -3537,7 +3442,7 @@ waitq_set_alloc(int policy, void *prepost_hook) kern_return_t waitq_set_init(struct waitq_set *wqset, int policy, uint64_t *reserved_link, - void *prepost_hook) + waitq_set_prepost_hook_t *prepost_hook) { struct waitq_link *link; kern_return_t ret; @@ -3677,6 +3582,20 @@ waitq_set_deinit(struct waitq_set *wqset) waitq_set_lock(wqset); + if (waitq_set_has_prepost_hook(wqset)) { + waitq_set_prepost_hook_t *hook = wqset->wqset_prepost_hook; + /* + * If the wqset_prepost_hook value is non 0, + * then another core is currently posting to this waitq set + * and we need for it to finish what it's doing. + */ + while (os_atomic_load(hook, relaxed) != 0) { + waitq_set_unlock(wqset); + delay(1); + waitq_set_lock(wqset); + } + } + set_id = wqset->wqset_id; if (waitqs_is_linked(wqset) || set_id == 0) { diff --git a/osfmk/kern/waitq.h b/osfmk/kern/waitq.h index 9eb863a7b..2d8975733 100644 --- a/osfmk/kern/waitq.h +++ b/osfmk/kern/waitq.h @@ -49,7 +49,7 @@ */ #define WAITQ_ALL_PRIORITIES (-1) #define WAITQ_PROMOTE_PRIORITY (-2) -#define WAITQ_SELECT_MAX_PRI (-3) +#define WAITQ_PROMOTE_ON_WAKE (-3) typedef enum e_waitq_lock_state { WAITQ_KEEP_LOCKED = 0x01, @@ -175,7 +175,7 @@ struct waitq_set { }; }; -#define WQSET_NOT_LINKED ((uint64_t)(~0)) +#define WQSET_NOT_LINKED ((uint64_t)(~0)) static_assert(sizeof(struct waitq_set) == WQS_OPAQUE_SIZE, "waitq_set structure size mismatch"); static_assert(__alignof(struct waitq_set) == WQS_OPAQUE_ALIGN, "waitq_set structure alignment mismatch"); @@ -388,14 +388,17 @@ extern struct waitq *_global_eventq(char *event, size_t event_length); extern struct waitq *global_waitq(int index); +typedef uint16_t waitq_set_prepost_hook_t; + /* * set alloc/init/free */ -extern struct waitq_set *waitq_set_alloc(int policy, void *prepost_hook); +extern struct waitq_set *waitq_set_alloc(int policy, + waitq_set_prepost_hook_t *prepost_hook); extern kern_return_t waitq_set_init(struct waitq_set *wqset, int policy, uint64_t *reserved_link, - void *prepost_hook); + waitq_set_prepost_hook_t *prepost_hook); extern void waitq_set_deinit(struct waitq_set *wqset); diff --git a/osfmk/kern/work_interval.c b/osfmk/kern/work_interval.c index 4c1d4cbda..5986b975d 100644 --- a/osfmk/kern/work_interval.c +++ b/osfmk/kern/work_interval.c @@ -106,49 +106,6 @@ wi_release(struct work_interval *work_interval) } } -/* - * work_interval_port_alloc - * - * Description: Obtain a send right for the given work interval struct. - * - * Parameters: work_interval - A work_interval struct - * Consumes a +1 ref count on work_interval, now owned by the port. - * - * Returns: Port of type IKOT_WORK_INTERVAL with work_interval set as its kobject. - * Returned with a +1 send right and no-senders notification armed. - * Work interval struct reference is held by the port. - */ -static ipc_port_t -work_interval_port_alloc(struct work_interval *work_interval) -{ - ipc_port_t work_interval_port = ipc_port_alloc_kernel(); - - if (work_interval_port == IP_NULL) { - panic("failed to allocate work interval port"); - } - - assert(work_interval->wi_port == IP_NULL); - - ip_lock(work_interval_port); - ipc_kobject_set_atomically(work_interval_port, (ipc_kobject_t)work_interval, - IKOT_WORK_INTERVAL); - - ipc_port_t notify_port = ipc_port_make_sonce_locked(work_interval_port); - ipc_port_t old_notify_port = IP_NULL; - ipc_port_nsrequest(work_interval_port, 1, notify_port, &old_notify_port); - /* port unlocked */ - - assert(old_notify_port == IP_NULL); - - /* This is the only make-send that will happen on this port */ - ipc_port_t send_port = ipc_port_make_send(work_interval_port); - assert(IP_VALID(send_port)); - - work_interval->wi_port = work_interval_port; - - return send_port; -} - /* * work_interval_port_convert * @@ -390,12 +347,11 @@ kern_work_interval_create(thread_t thread, task_t creating_task = current_task(); if ((create_flags & WORK_INTERVAL_TYPE_MASK) == WORK_INTERVAL_TYPE_CA_CLIENT) { /* - * CA_CLIENT work intervals do not create new thread groups - * and are non-joinable. - * There can only be one CA_CLIENT work interval (created by UIKit) + * CA_CLIENT work intervals do not create new thread groups. + * There can only be one CA_CLIENT work interval (created by UIKit or AppKit) * per each application task */ - if (create_flags & (WORK_INTERVAL_FLAG_JOINABLE | WORK_INTERVAL_FLAG_GROUP)) { + if (create_flags & WORK_INTERVAL_FLAG_GROUP) { return KERN_FAILURE; } if (!task_is_app(creating_task)) { @@ -417,11 +373,14 @@ kern_work_interval_create(thread_t thread, if (create_flags & WORK_INTERVAL_FLAG_JOINABLE) { - /* work_interval has a +1 ref, moves to the port */ - ipc_port_t port = work_interval_port_alloc(work_interval); mach_port_name_t name = MACH_PORT_NULL; - name = ipc_port_copyout_send(port, current_space()); + /* work_interval has a +1 ref, moves to the port */ + work_interval->wi_port = ipc_kobject_alloc_port( + (ipc_kobject_t)work_interval, IKOT_WORK_INTERVAL, + IPC_KOBJECT_ALLOC_MAKE_SEND | IPC_KOBJECT_ALLOC_NSREQUEST); + + name = ipc_port_copyout_send(work_interval->wi_port, current_space()); if (!MACH_PORT_VALID(name)) { /* diff --git a/osfmk/kern/xpr.c b/osfmk/kern/xpr.c deleted file mode 100644 index 0c28eabab..000000000 --- a/osfmk/kern/xpr.c +++ /dev/null @@ -1,125 +0,0 @@ -/* - * Copyright (c) 2000-2005 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* - * @OSF_COPYRIGHT@ - */ -/* - * Mach Operating System - * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University - * All Rights Reserved. - * - * Permission to use, copy, modify and distribute this software and its - * documentation is hereby granted, provided that both the copyright - * notice and this permission notice appear in all copies of the - * software, derivative works or modified versions, and any portions - * thereof, and that both notices appear in supporting documentation. - * - * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" - * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR - * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. - * - * Carnegie Mellon requests users of this software to return to - * - * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU - * School of Computer Science - * Carnegie Mellon University - * Pittsburgh PA 15213-3890 - * - * any improvements or extensions that they make and grant Carnegie Mellon - * the rights to redistribute these changes. - */ -/* - * xpr silent tracing circular buffer. - */ - -#include -#include -#include -#include -#include -#include -#include -#include - -/* - * After a spontaneous reboot, it is desirable to look - * at the old xpr buffer. Assuming xprbootstrap allocates - * the buffer in the same place in physical memory and - * the reboot doesn't clear memory, this should work. - * xprptr will be reset, but the saved value should be OK. - * Just set xprenable false so the buffer isn't overwritten. - */ - -decl_simple_lock_data(, xprlock) -boolean_t xprenable = TRUE; /* Enable xpr tracing */ -int nxprbufs = 0; /* Number of contiguous xprbufs allocated */ -int xprflags = 0; /* Bit mask of xpr flags enabled */ -struct xprbuf *xprbase; /* Pointer to circular buffer nxprbufs*sizeof(xprbuf)*/ -struct xprbuf *xprptr; /* Currently allocated xprbuf */ -struct xprbuf *xprlast; /* Pointer to end of circular buffer */ - -void -xpr( - const char *msg, - long arg1, - long arg2, - long arg3, - long arg4, - long arg5) -{ - spl_t s; - struct xprbuf *x; - - /* If we aren't initialized, ignore trace request */ - if (!xprenable || (xprptr == 0)) { - return; - } - /* Guard against all interrupts and allocate next buffer. */ - - s = splhigh(); - simple_lock(&xprlock, LCK_GRP_NULL); - x = xprptr++; - if (xprptr >= xprlast) { - /* wrap around */ - xprptr = xprbase; - } - /* Save xprptr in allocated memory. */ - *(struct xprbuf **)xprlast = xprptr; - simple_unlock(&xprlock); - x->timestamp = XPR_TIMESTAMP; - splx(s); - x->msg = msg; - x->arg1 = arg1; - x->arg2 = arg2; - x->arg3 = arg3; - x->arg4 = arg4; - x->arg5 = arg5; - mp_disable_preemption(); - x->cpuinfo = cpu_number(); - mp_enable_preemption(); -} diff --git a/osfmk/kern/xpr.h b/osfmk/kern/xpr.h deleted file mode 100644 index e63d9e6d3..000000000 --- a/osfmk/kern/xpr.h +++ /dev/null @@ -1,150 +0,0 @@ -/* - * Copyright (c) 2000-2004 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* - * @OSF_COPYRIGHT@ - */ -/* - * Mach Operating System - * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University - * All Rights Reserved. - * - * Permission to use, copy, modify and distribute this software and its - * documentation is hereby granted, provided that both the copyright - * notice and this permission notice appear in all copies of the - * software, derivative works or modified versions, and any portions - * thereof, and that both notices appear in supporting documentation. - * - * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" - * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR - * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. - * - * Carnegie Mellon requests users of this software to return to - * - * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU - * School of Computer Science - * Carnegie Mellon University - * Pittsburgh PA 15213-3890 - * - * any improvements or extensions that they make and grant Carnegie Mellon - * the rights to redistribute these changes. - */ -/* - */ -/* - * Include file for xpr circular buffer silent tracing. - * - */ -/* - * If the kernel flag XPRDEBUG is set, the XPR macro is enabled. The - * macro should be invoked something like the following: - * XPR(XPR_SYSCALLS, "syscall: %d, 0x%x\n", syscallno, arg1, 0,0,0); - * which will expand into the following code: - * if (xprflags & XPR_SYSCALLS) - * xpr("syscall: %d, 0x%x\n", syscallno, arg1, 0,0,0); - * Xpr will log the pointer to the printf string and up to 5 arguements, - * along with a timestamp and cpuinfo (for multi-processor systems), into - * a circular buffer. The actual printf processing is delayed until after - * the buffer has been collected. It is assumed that the text/data segments - * of the kernel can easily be reconstructed in a post-processor which - * performs the printf processing. - * - * If the XPRDEBUG compilation switch is not set, the XPR macro expands - * to nothing. - */ - -#ifndef _KERN_XPR_H_ -#define _KERN_XPR_H_ - -#ifdef MACH_KERNEL -#include -#else /* MACH_KERNEL */ -#include -#endif /* MACH_KERNEL */ - -#include - -#if XPR_DEBUG - -#define XPR(flags, msg, arg1, arg2, arg3, arg4, arg5) \ -MACRO_BEGIN \ - if (xprflags & (flags)) { \ - xpr((msg), (long)(arg1), (long)(arg2), \ - (long)(arg3), (long)(arg4), (long)(arg5)); \ - } \ -MACRO_END - -extern int xprflags; - -/* - * flags for message types. - */ -#define XPR_TRAPS (1 << 1) -#define XPR_SCHED (1 << 2) -#define XPR_LOCK (1 << 3) -#define XPR_SLOCK (1 << 4) -#define XPR_PMAP (1 << 6) -#define XPR_VM_MAP (1 << 7) -#define XPR_VM_OBJECT (1 << 8) -#define XPR_VM_OBJECT_CACHE (1 << 9) -#define XPR_VM_PAGE (1 << 10) -#define XPR_VM_PAGEOUT (1 << 11) -#define XPR_MEMORY_OBJECT (1 << 12) -#define XPR_VM_FAULT (1 << 13) -#define XPR_VM_OBJECT_REP (1 << 14) -#define XPR_DEFAULT_PAGER (1 << 15) -#define XPR_INODE_PAGER (1 << 16) -#define XPR_INODE_PAGER_DATA (1 << 17) -#define XPR_XMM (1 << 18) - -#else /* XPR_DEBUG */ -#define XPR(flags, msg, arg1, arg2, arg3, arg4, arg5) -#endif /* XPR_DEBUG */ - -struct xprbuf { - const char *msg; - long arg1, arg2, arg3, arg4, arg5; - int timestamp; - int cpuinfo; -}; - -/* Bootstrap XPR facility */ -extern void xprbootstrap(void); - -/* Enable XPR facility */ -extern void xprinit(void); - -/* Log an XPR message */ -extern void xpr( - const char *msg, - long arg1, - long arg2, - long arg3, - long arg4, - long arg5); - -#endif /* _KERN_XPR_H_ */ diff --git a/osfmk/kern/zalloc.c b/osfmk/kern/zalloc.c index 19562002c..f25e40407 100644 --- a/osfmk/kern/zalloc.c +++ b/osfmk/kern/zalloc.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2016 Apple Inc. All rights reserved. + * Copyright (c) 2000-2019 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -370,7 +370,7 @@ vm_offset_t zone_map_max_address = 0; /* VM region for all metadata structures */ vm_offset_t zone_metadata_region_min = 0; vm_offset_t zone_metadata_region_max = 0; -decl_lck_mtx_data(static, zone_metadata_region_lck) +decl_lck_mtx_data(static, zone_metadata_region_lck); lck_attr_t zone_metadata_lock_attr; lck_mtx_ext_t zone_metadata_region_lck_ext; @@ -383,12 +383,6 @@ struct zone_free_element { #if CONFIG_ZCACHE -#if !CONFIG_GZALLOC -bool use_caching = TRUE; -#else -bool use_caching = FALSE; -#endif /* !CONFIG_GZALLOC */ - /* * Decides whether per-cpu zone caching is to be enabled for all zones. * Can be set to TRUE via the boot-arg '-zcache_all'. @@ -412,11 +406,15 @@ zone_caching_enabled(zone_t z) /* * Protects zone_array, num_zones, num_zones_in_use, and zone_empty_bitmap */ -decl_simple_lock_data(, all_zones_lock) +decl_simple_lock_data(, all_zones_lock); unsigned int num_zones_in_use; unsigned int num_zones; +#if KASAN +#define MAX_ZONES 512 +#else /* !KASAN */ #define MAX_ZONES 320 +#endif/* !KASAN */ struct zone zone_array[MAX_ZONES]; /* Used to keep track of empty slots in the zone_array */ @@ -428,7 +426,7 @@ bitmap_t zone_empty_bitmap[BITMAP_LEN(MAX_ZONES)]; * Or we can end up with multiple test zones (if a second zinit() comes through before zdestroy()), which could lead us to * run out of zones. */ -decl_simple_lock_data(, zone_test_lock) +decl_simple_lock_data(, zone_test_lock); static boolean_t zone_test_running = FALSE; static zone_t test_zone_ptr = NULL; #endif /* DEBUG || DEVELOPMENT */ @@ -636,6 +634,31 @@ get_zone_page(struct zone_page_metadata *page_meta) } } +/* + * Routine to panic if a pointer is not mapped to an expected zone. + * This can be used as a means of pinning an object to the zone it is expected + * to be a part of. Causes a panic if the address does not belong to any + * specified zone, does not belong to any zone, has been freed and therefore + * unmapped from the zone, or the pointer contains an uninitialized value that + * does not belong to any zone. + */ + +void +zone_require(void *addr, zone_t expected_zone) +{ + struct zone *src_zone = NULL; + struct zone_page_metadata *page_meta = get_zone_page_metadata((struct zone_free_element *)addr, FALSE); + + src_zone = PAGE_METADATA_GET_ZONE(page_meta); + if (__improbable(src_zone == NULL)) { + panic("Address not in a zone for zone_require check (addr: %p)", addr); + } + + if (__improbable(src_zone != expected_zone)) { + panic("Address not in expected zone for zone_require check (addr: %p, zone: %s)", addr, src_zone->zone_name); + } +} + /* * ZTAGS */ @@ -677,7 +700,7 @@ static vm_map_t zone_tags_map; // simple heap allocator for allocating the tags for new memory -decl_lck_mtx_data(, ztLock) /* heap lock */ +decl_lck_mtx_data(, ztLock); /* heap lock */ enum{ ztFreeIndexCount = 8, ztFreeIndexMax = (ztFreeIndexCount - 1), @@ -1186,8 +1209,8 @@ is_sane_zone_element(zone_t zone, } /* Someone wrote to freed memory. */ +__dead2 static inline void -/* noreturn */ zone_element_was_modified_panic(zone_t zone, vm_offset_t element, vm_offset_t found, @@ -1210,10 +1233,9 @@ zone_element_was_modified_panic(zone_t zone, * The primary and backup pointers don't match. * Determine which one was likely the corrupted pointer, find out what it * probably should have been, and panic. - * I would like to mark this as noreturn, but panic() isn't marked noreturn. */ +__dead2 static void -/* noreturn */ backup_ptr_mismatch_panic(zone_t zone, vm_offset_t element, vm_offset_t primary, @@ -1517,7 +1539,7 @@ MACRO_END /* * Exclude more than one concurrent garbage collection */ -decl_lck_mtx_data(, zone_gc_lock) +decl_lck_mtx_data(, zone_gc_lock); lck_attr_t zone_gc_lck_attr; lck_grp_t zone_gc_lck_grp; @@ -1532,7 +1554,10 @@ vm_size_t panic_kext_memory_size = 0; #define ZALLOC_DEBUG_ZONEGC 0x00000001 #define ZALLOC_DEBUG_ZCRAM 0x00000002 -uint32_t zalloc_debug = 0; + +#if DEBUG || DEVELOPMENT +static uint32_t zalloc_debug = 0; +#endif /* * Zone leak debugging code @@ -2294,6 +2319,7 @@ zinit( bitmap_clear(zone_empty_bitmap, index); num_zones_in_use++; z->zone_valid = TRUE; + z->zone_destruction = FALSE; /* All other state is already set up since the zone was previously in use. Return early. */ simple_unlock(&all_zones_lock); @@ -2380,7 +2406,9 @@ zinit( z->zp_count = 0; z->kasan_quarantine = TRUE; z->zone_valid = TRUE; + z->zone_destruction = FALSE; z->cpu_cache_enabled = FALSE; + z->clear_memory = FALSE; #if CONFIG_ZLEAKS z->zleak_capture = 0; @@ -2544,7 +2572,7 @@ static void zone_replenish_thread(zone_t); /* High priority VM privileged thread used to asynchronously refill a designated * zone, such as the reserved VM map entry zone. */ -__attribute__((noreturn)) +__dead2 static void zone_replenish_thread(zone_t z) { @@ -2576,6 +2604,10 @@ zone_replenish_thread(zone_t z) zflags |= KMA_NOENCRYPT; } + if (z->clear_memory) { + zflags |= KMA_ZERO; + } + /* Trigger jetsams via the vm_pageout_garbage_collect thread if we're running out of zone memory */ if (is_zone_map_nearing_exhaustion()) { thread_wakeup((event_t) &vm_pageout_garbage_collect); @@ -2655,6 +2687,7 @@ zdestroy(zone_t z) */ z->zone_valid = FALSE; #endif + z->zone_destruction = TRUE; unlock_zone(z); #if CONFIG_ZCACHE @@ -2796,10 +2829,12 @@ zcram( assert((zone->allows_foreign == TRUE) && (zone->elem_size <= (PAGE_SIZE - sizeof(struct zone_page_metadata)))); } +#if DEBUG || DEVELOPMENT if (zalloc_debug & ZALLOC_DEBUG_ZCRAM) { kprintf("zcram(%p[%s], 0x%lx%s, 0x%lx)\n", zone, zone->zone_name, (unsigned long)newmem, from_zm ? "" : "[F]", (unsigned long)size); } +#endif /* DEBUG || DEVELOPMENT */ ZONE_PAGE_COUNT_INCR(zone, (size / PAGE_SIZE)); @@ -2878,6 +2913,11 @@ zfill( vm_size_t alloc_size = zone->alloc_size; vm_size_t elem_per_alloc = alloc_size / zone->elem_size; vm_size_t nalloc = (nelem + elem_per_alloc - 1) / elem_per_alloc; + int zflags = KMA_KOBJECT; + + if (zone->clear_memory) { + zflags |= KMA_ZERO; + } /* Don't mix-and-match zfill with foreign memory */ assert(!zone->allows_foreign); @@ -2887,7 +2927,7 @@ zfill( thread_wakeup((event_t) &vm_pageout_garbage_collect); } - kr = kernel_memory_allocate(zone_map, &memory, nalloc * alloc_size, 0, KMA_KOBJECT, VM_KERN_MEMORY_ZONE); + kr = kernel_memory_allocate(zone_map, &memory, nalloc * alloc_size, 0, zflags, VM_KERN_MEMORY_ZONE); if (kr != KERN_SUCCESS) { printf("%s: kernel_memory_allocate() of %lu bytes failed\n", __func__, (unsigned long)(nalloc * alloc_size)); @@ -2911,9 +2951,11 @@ zone_bootstrap(void) { char temp_buf[16]; +#if DEBUG || DEVELOPMENT if (!PE_parse_boot_argn("zalloc_debug", &zalloc_debug, sizeof(zalloc_debug))) { zalloc_debug = 0; } +#endif /* DEBUG || DEVELOPMENT */ /* Set up zone element poisoning */ zp_init(); @@ -3204,6 +3246,18 @@ zalloc_poison_element(boolean_t check_poison, zone_t zone, vm_offset_t addr) } } +/* + * When deleting page mappings from the kernel map, it might be necessary to split + * apart an existing vm_map_entry. That means that a "free" operation, will need to + * *allocate* new vm_map_entry structures before it can free a page. + * + * This reserve here is the number of elements which are held back from everyone except + * the zone_gc thread. This is done so the zone_gc thread should never have to wait for + * the zone replenish thread for vm_map_entry structs. If it did, it could wind up + * in a deadlock. + */ +#define VM_MAP_ENTRY_RESERVE_CNT 8 + /* * zalloc returns an element from the specified zone. */ @@ -3222,9 +3276,8 @@ zalloc_internal( vm_offset_t addr = 0; kern_return_t retval; uintptr_t zbt[MAX_ZTRACE_DEPTH]; /* used in zone leak logging and zone leak detection */ - unsigned int numsaved = 0; - boolean_t zone_replenish_wakeup = FALSE, zone_alloc_throttle = FALSE; - thread_t thr = current_thread(); + unsigned int numsaved = 0; + thread_t thr = current_thread(); boolean_t check_poison = FALSE; boolean_t set_doing_alloc_with_vm_priv = FALSE; @@ -3268,7 +3321,7 @@ zalloc_internal( if (__improbable(zone->zleak_on && sample_counter(&zone->zleak_capture, zleak_sample_factor) == TRUE)) { /* Avoid backtracing twice if zone logging is on */ if (numsaved == 0) { - zleak_tracedepth = backtrace(zbt, MAX_ZTRACE_DEPTH); + zleak_tracedepth = backtrace(zbt, MAX_ZTRACE_DEPTH, NULL); } else { zleak_tracedepth = numsaved; } @@ -3289,6 +3342,10 @@ zalloc_internal( #if KASAN_ZALLOC addr = kasan_fixup_allocated_element_address(zone, addr); #endif + if (__improbable(DO_LOGGING(zone) && addr)) { + btlog_add_entry(zone->zlog_btlog, (void *)addr, + ZOP_ALLOC, (void **)zbt, numsaved); + } DTRACE_VM2(zalloc, zone_t, zone, void*, addr); return (void *)addr; } @@ -3299,48 +3356,56 @@ zalloc_internal( lock_zone(zone); assert(zone->zone_valid); + /* + * Check if we need another thread to replenish the zone. + * This is used for elements, like vm_map_entry, which are + * needed themselves to implement zalloc(). + */ if (zone->async_prio_refill && zone->zone_replenish_thread) { - vm_size_t zfreec = (zone->cur_size - (zone->count * zone->elem_size)); - vm_size_t zrefillwm = zone->prio_refill_watermark * zone->elem_size; - zone_replenish_wakeup = (zfreec < zrefillwm); - zone_alloc_throttle = (((zfreec < (zrefillwm / 2)) && ((thr->options & TH_OPT_VMPRIV) == 0)) || (zfreec == 0)); - - do { - if (zone_replenish_wakeup) { - zone_replenish_wakeups_initiated++; - /* Signal the potentially waiting - * refill thread. - */ - thread_wakeup(&zone->zone_replenish_thread); + vm_size_t curr_free; + vm_size_t refill_level; + const vm_size_t reserved_min = VM_MAP_ENTRY_RESERVE_CNT * zone->elem_size; - /* We don't want to wait around for zone_replenish_thread to bump up the free count - * if we're in zone_gc(). This keeps us from deadlocking with zone_replenish_thread. - */ - if (thr->options & TH_OPT_ZONE_GC) { - break; - } + for (;;) { + curr_free = (zone->cur_size - (zone->count * zone->elem_size)); + refill_level = zone->prio_refill_watermark * zone->elem_size; - unlock_zone(zone); - /* Scheduling latencies etc. may prevent - * the refill thread from keeping up - * with demand. Throttle consumers - * when we fall below half the - * watermark, unless VM privileged - */ - if (zone_alloc_throttle) { - zone_replenish_throttle_count++; - assert_wait_timeout(zone, THREAD_UNINT, 1, NSEC_PER_MSEC); - thread_block(THREAD_CONTINUE_NULL); - } - lock_zone(zone); - assert(zone->zone_valid); + /* + * Nothing to do if there are plenty of elements. + */ + if (curr_free > refill_level) { + break; + } + + /* + * Wakeup the replenish thread. + */ + zone_replenish_wakeups_initiated++; + thread_wakeup(&zone->zone_replenish_thread); + + /* + * If we: + * - still have head room, more than half the refill amount, or + * - this is a VMPRIV thread and we're still above reserved, or + * - this is the zone garbage collection thread which may use the reserve + * then we don't have to wait for the replenish thread. + * + * The reserve for the garbage collection thread is to avoid a deadlock + * on the zone_map_lock between the replenish thread and GC thread. + */ + if (curr_free > refill_level / 2 || + ((thr->options & TH_OPT_VMPRIV) && curr_free > reserved_min) || + (thr->options & TH_OPT_ZONE_GC)) { + break; } + zone_replenish_throttle_count++; + unlock_zone(zone); + assert_wait_timeout(zone, THREAD_UNINT, 1, NSEC_PER_MSEC); + thread_block(THREAD_CONTINUE_NULL); + lock_zone(zone); - zfreec = (zone->cur_size - (zone->count * zone->elem_size)); - zrefillwm = zone->prio_refill_watermark * zone->elem_size; - zone_replenish_wakeup = (zfreec < zrefillwm); - zone_alloc_throttle = (((zfreec < (zrefillwm / 2)) && ((thr->options & TH_OPT_VMPRIV) == 0)) || (zfreec == 0)); - } while (zone_alloc_throttle == TRUE); + assert(zone->zone_valid); + } } if (__probable(addr == 0)) { @@ -3350,9 +3415,10 @@ zalloc_internal( /* If we're here because of zone_gc(), we didn't wait for zone_replenish_thread to finish. * So we need to ensure that we did successfully grab an element. And we only need to assert * this for zones that have a replenish thread configured (in this case, the Reserved VM map - * entries zone). + * entries zone). The value of reserved_min in the previous bit of code should have given us + * headroom even though the GC thread didn't wait. */ - if (thr->options & TH_OPT_ZONE_GC && zone->async_prio_refill) { + if ((thr->options & TH_OPT_ZONE_GC) && zone->async_prio_refill) { assert(addr != 0); } @@ -3444,6 +3510,10 @@ zalloc_internal( zflags |= KMA_NOENCRYPT; } + if (zone->clear_memory) { + zflags |= KMA_ZERO; + } + /* Trigger jetsams via the vm_pageout_garbage_collect thread if we're running out of zone memory */ if (is_zone_map_nearing_exhaustion()) { thread_wakeup((event_t) &vm_pageout_garbage_collect); @@ -3573,7 +3643,7 @@ zalloc_internal( unsigned int count, idx; /* Fill element, from tail, with backtrace in reverse order */ if (numsaved == 0) { - numsaved = backtrace(zbt, MAX_ZTRACE_DEPTH); + numsaved = backtrace(zbt, MAX_ZTRACE_DEPTH, NULL); } count = (unsigned int)(zone->elem_size / sizeof(uintptr_t)); if (count >= numsaved) { @@ -3976,7 +4046,16 @@ zone_change( break; case Z_CACHING_ENABLED: #if CONFIG_ZCACHE - if (value == TRUE && use_caching) { + if (value == TRUE) { +#if CONFIG_GZALLOC + /* + * Per cpu zone caching should be + * disabled if gzalloc is enabled. + */ + if (gzalloc_enabled()) { + break; + } +#endif if (zcache_ready()) { zcache_init(zone); } else { @@ -3985,6 +4064,9 @@ zone_change( } #endif break; + case Z_CLEARMEMORY: + zone->clear_memory = value; + break; default: panic("Zone_change: Wrong Item Type!"); /* break; */ @@ -4012,72 +4094,81 @@ zone_free_count(zone_t zone) return free_count; } -/* Drops the elements in the free queue of a zone. Called by zone_gc() on each zone, and when a zone is zdestroy'ed. */ +/* + * Drops (i.e. frees) the elements in the all free pages queue of a zone. + * Called by zone_gc() on each zone and when a zone is zdestroy()ed. + */ void drop_free_elements(zone_t z) { - vm_size_t elt_size, size_freed; - unsigned int total_freed_pages = 0; - uint64_t old_all_free_count; - struct zone_page_metadata *page_meta; - queue_head_t page_meta_head; + vm_size_t elt_size; + unsigned int total_freed_pages = 0; + struct zone_page_metadata *page_meta; + vm_address_t free_page_address; + vm_size_t size_to_free; lock_zone(z); - if (queue_empty(&z->pages.all_free)) { - unlock_zone(z); - return; - } - /* - * Snatch all of the free elements away from the zone. - */ elt_size = z->elem_size; - old_all_free_count = z->count_all_free_pages; - queue_new_head(&z->pages.all_free, &page_meta_head, struct zone_page_metadata *, pages); - queue_init(&z->pages.all_free); - z->count_all_free_pages = 0; - unlock_zone(z); - /* Iterate through all elements to find out size and count of elements we snatched */ - size_freed = 0; - queue_iterate(&page_meta_head, page_meta, struct zone_page_metadata *, pages) { + while (!queue_empty(&z->pages.all_free)) { + page_meta = (struct zone_page_metadata *)queue_first(&z->pages.all_free); assert(from_zone_map((vm_address_t)page_meta, sizeof(*page_meta))); /* foreign elements should be in any_free_foreign */ - size_freed += elt_size * page_meta->free_count; - } + /* + * Don't drain zones with async refill to below the refill threshold, + * as they need some reserve to function properly. + */ + if (!z->zone_destruction && + z->async_prio_refill && z->zone_replenish_thread && + (vm_size_t)(page_meta->free_count - z->countfree) < z->prio_refill_watermark) { + break; + } - /* Update the zone size and free element count */ - lock_zone(z); - z->cur_size -= size_freed; - z->countfree -= size_freed / elt_size; - unlock_zone(z); + (void)dequeue_head(&z->pages.all_free); + + assert(z->countfree >= page_meta->free_count); + z->countfree -= page_meta->free_count; + + assert(z->count_all_free_pages >= page_meta->page_count); + z->count_all_free_pages -= page_meta->page_count; + + assert(z->cur_size >= page_meta->free_count * elt_size); + z->cur_size -= page_meta->free_count * elt_size; + + ZONE_PAGE_COUNT_DECR(z, page_meta->page_count); + unlock_zone(z); - while ((page_meta = (struct zone_page_metadata *)dequeue_head(&page_meta_head)) != NULL) { - vm_address_t free_page_address; /* Free the pages for metadata and account for them */ free_page_address = get_zone_page(page_meta); - ZONE_PAGE_COUNT_DECR(z, page_meta->page_count); total_freed_pages += page_meta->page_count; - old_all_free_count -= page_meta->page_count; + size_to_free = page_meta->page_count * PAGE_SIZE; #if KASAN_ZALLOC - kasan_poison_range(free_page_address, page_meta->page_count * PAGE_SIZE, ASAN_VALID); + kasan_poison_range(free_page_address, size_to_free, ASAN_VALID); #endif #if VM_MAX_TAG_ZONES if (z->tags) { - ztMemoryRemove(z, free_page_address, (page_meta->page_count * PAGE_SIZE)); + ztMemoryRemove(z, free_page_address, size_to_free); } #endif /* VM_MAX_TAG_ZONES */ - kmem_free(zone_map, free_page_address, (page_meta->page_count * PAGE_SIZE)); + kmem_free(zone_map, free_page_address, size_to_free); if (current_thread()->options & TH_OPT_ZONE_GC) { thread_yield_to_preemption(); } + lock_zone(z); + } + if (z->zone_destruction) { + assert(queue_empty(&z->pages.all_free)); + assert(z->count_all_free_pages == 0); } + unlock_zone(z); - /* We freed all the pages from the all_free list for this zone */ - assert(old_all_free_count == 0); +#if DEBUG || DEVELOPMENT if (zalloc_debug & ZALLOC_DEBUG_ZONEGC) { - kprintf("zone_gc() of zone %s freed %lu elements, %d pages\n", z->zone_name, (unsigned long)size_freed / elt_size, total_freed_pages); + kprintf("zone_gc() of zone %s freed %lu elements, %d pages\n", z->zone_name, + (unsigned long)((total_freed_pages * PAGE_SIZE) / elt_size), total_freed_pages); } +#endif /* DEBUG || DEVELOPMENT */ } /* Zone garbage collection @@ -4113,9 +4204,11 @@ zone_gc(boolean_t consider_jetsams) max_zones = num_zones; simple_unlock(&all_zones_lock); +#if DEBUG || DEVELOPMENT if (zalloc_debug & ZALLOC_DEBUG_ZONEGC) { kprintf("zone_gc() starting...\n"); } +#endif /* DEBUG || DEVELOPMENT */ for (i = 0; i < max_zones; i++) { z = &(zone_array[i]); @@ -4710,11 +4803,6 @@ mach_zone_force_gc( extern unsigned int stack_total; extern unsigned long long stack_allocs; -#if defined(__i386__) || defined (__x86_64__) -extern unsigned int inuse_ptepages_count; -extern long long alloc_ptepages_count; -#endif - zone_t zone_find_largest(void) { diff --git a/osfmk/kern/zalloc.h b/osfmk/kern/zalloc.h index 412390316..c5f356ff9 100644 --- a/osfmk/kern/zalloc.h +++ b/osfmk/kern/zalloc.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2014 Apple Inc. All rights reserved. + * Copyright (c) 2000-2019 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -122,7 +122,7 @@ struct zone { int countfree; /* Number of free elements */ int count_all_free_pages; /* Number of pages collectable by GC */ lck_attr_t lock_attr; /* zone lock attribute */ - decl_lck_mtx_data(, lock) /* zone lock */ + decl_lck_mtx_data(, lock); /* zone lock */ lck_mtx_ext_t lock_ext; /* placeholder for indirect mutex */ vm_size_t cur_size; /* current memory utilization */ vm_size_t max_size; /* how large can this zone grow */ @@ -130,7 +130,7 @@ struct zone { vm_size_t alloc_size; /* size used for more memory */ uint64_t page_count __attribute__((aligned(8))); /* number of pages used by this zone */ uint64_t sum_count; /* count of allocs (life of zone) */ - uint32_t + uint64_t /* boolean_t */ exhaustible :1, /* (F) merely return if empty? */ /* boolean_t */ collectable :1, /* (F) garbage collect empty pages */ /* boolean_t */ expandable :1, /* (T) expand zone (with message)? */ @@ -155,7 +155,9 @@ struct zone { /* boolean_t */ zone_valid :1, /* boolean_t */ cpu_cache_enable_when_ready :1, /* boolean_t */ cpu_cache_enabled :1, - /* future */ _reserved :3; + /* boolean_t */ clear_memory :1, + /* boolean_t */ zone_destruction :1, + /* future */ _reserved :33; int index; /* index into zone_info arrays for this zone */ const char *zone_name; /* a name for the zone */ @@ -278,6 +280,7 @@ __BEGIN_DECLS #define Z_TAGS_ENABLED 11 /* Store tags */ #endif /* XNU_KERNEL_PRIVATE */ #define Z_CACHING_ENABLED 12 /*enable and initialize per-cpu caches for the zone*/ +#define Z_CLEARMEMORY 13 /* Use KMA_ZERO on new allocations */ #ifdef XNU_KERNEL_PRIVATE @@ -463,6 +466,15 @@ extern void zone_change( extern void zdestroy( zone_t zone); +#ifdef XNU_KERNEL_PRIVATE + +/* Panic if a pointer is not mapped to the zone specified */ +extern void zone_require( + void *addr, + zone_t expected_zone); + +#endif /* XNU_KERNEL_PRIVATE */ + __END_DECLS #endif /* _KERN_ZALLOC_H_ */ diff --git a/osfmk/kern/zcache.c b/osfmk/kern/zcache.c index 0ca209fe2..bd0a50dc8 100644 --- a/osfmk/kern/zcache.c +++ b/osfmk/kern/zcache.c @@ -654,13 +654,13 @@ zcache_canary_validate(zone_t zone, void *element) vm_offset_t primary_value = (*primary ^ (uintptr_t)element); if (primary_value != zcache_canary) { - panic("Zone cache element was used after free! Element %p was corrupted at beginning; Expected %p but found %p; canary %p", - element, (void *)(zcache_canary ^ (uintptr_t)element), (void *)(*primary), (void *)zcache_canary); + panic("Zone cache element was used after free! Element %p was corrupted at beginning; Expected %p but found %p; canary %p; zone %p (%s)", + element, (void *)(zcache_canary ^ (uintptr_t)element), (void *)(*primary), (void *)zcache_canary, zone, zone->zone_name); } vm_offset_t backup_value = (*backup ^ (uintptr_t)element); if (backup_value != zcache_canary) { - panic("Zone cache element was used after free! Element %p was corrupted at end; Expected %p but found %p; canary %p", - element, (void *)(zcache_canary ^ (uintptr_t)element), (void *)(*backup), (void *)zcache_canary); + panic("Zone cache element was used after free! Element %p was corrupted at end; Expected %p but found %p; canary %p; zone %p (%s)", + element, (void *)(zcache_canary ^ (uintptr_t)element), (void *)(*backup), (void *)zcache_canary, zone, zone->zone_name); } } diff --git a/osfmk/kextd/Makefile b/osfmk/kextd/Makefile index a49df09c0..624f8c42d 100644 --- a/osfmk/kextd/Makefile +++ b/osfmk/kextd/Makefile @@ -41,7 +41,7 @@ COMP_FILES = ${MIG_KUSRC} do_build_all:: $(COMP_FILES) ${MIG_KUSRC} : kextd_mach.defs - @echo "$(ColorM)MIG$(Color0) $(ColorF)$@$(Color0)" + $(call makelog,$(ColorM)MIG$(Color0) $(ColorF)$@$(Color0)) $(_v)${MIG} ${MIGFLAGS} ${MIGKUFLAGS} \ -user kextd_mach.c \ -header kextd_mach.h \ diff --git a/osfmk/kperf/action.c b/osfmk/kperf/action.c index 2ff723f93..90d8e341f 100644 --- a/osfmk/kperf/action.c +++ b/osfmk/kperf/action.c @@ -124,7 +124,7 @@ static kern_return_t kperf_sample_internal(struct kperf_sample *sbuf, struct kperf_context *context, unsigned sample_what, unsigned sample_flags, - unsigned actionid, uint32_t ucallstack_depth) + unsigned actionid, unsigned ucallstack_depth) { int pended_ucallstack = 0; int pended_th_dispatch = 0; @@ -164,29 +164,24 @@ kperf_sample_internal(struct kperf_sample *sbuf, bool is_kernel = (context->cur_pid == 0); if (actionid && actionid <= actionc) { - sbuf->kcallstack.nframes = actionv[actionid - 1].kcallstack_depth; + sbuf->kcallstack.kpkc_nframes = + actionv[actionid - 1].kcallstack_depth; } else { - sbuf->kcallstack.nframes = MAX_CALLSTACK_FRAMES; + sbuf->kcallstack.kpkc_nframes = MAX_KCALLSTACK_FRAMES; } if (ucallstack_depth) { - sbuf->ucallstack.nframes = ucallstack_depth; + sbuf->ucallstack.kpuc_nframes = ucallstack_depth; } else { - sbuf->ucallstack.nframes = MAX_CALLSTACK_FRAMES; + sbuf->ucallstack.kpuc_nframes = MAX_UCALLSTACK_FRAMES; } - sbuf->kcallstack.flags = CALLSTACK_VALID; - sbuf->ucallstack.flags = CALLSTACK_VALID; + sbuf->kcallstack.kpkc_flags = 0; + sbuf->ucallstack.kpuc_flags = 0; - /* an event occurred. Sample everything and dump it in a - * buffer. - */ - - /* collect data from samplers */ if (sample_what & SAMPLER_TH_INFO) { kperf_thread_info_sample(&sbuf->th_info, context); - /* See if we should drop idle thread samples */ if (!(sample_flags & SAMPLE_FLAG_IDLE_THREADS)) { if (sbuf->th_info.kpthi_runmode & 0x40) { on_idle_thread = true; @@ -223,7 +218,7 @@ kperf_sample_internal(struct kperf_sample *sbuf, if (sample_flags & SAMPLE_FLAG_PEND_USER) { if (sample_what & SAMPLER_USTACK) { - pended_ucallstack = kperf_ucallstack_pend(context, sbuf->ucallstack.nframes); + pended_ucallstack = kperf_ucallstack_pend(context, sbuf->ucallstack.kpuc_nframes); } if (sample_what & SAMPLER_TH_DISPATCH) { @@ -323,6 +318,9 @@ log_sample: } } + if (sample_what & SAMPLER_PMC_CONFIG) { + kperf_kpc_config_log(&(sbuf->kpcdata)); + } if (sample_what & SAMPLER_PMC_THREAD) { kperf_kpc_thread_log(&(sbuf->kpcdata)); } else if (sample_what & SAMPLER_PMC_CPU) { @@ -483,12 +481,12 @@ void kperf_ast_set_callstack_depth(thread_t thread, uint32_t depth) { uint32_t ast_flags = kperf_get_thread_flags(thread); - uint32_t existing_callstack_depth = T_KPERF_GET_CALLSTACK_DEPTH(ast_flags); + uint32_t existing_callstack_depth = + T_KPERF_GET_CALLSTACK_DEPTH(ast_flags); - if (existing_callstack_depth != depth) { + if (existing_callstack_depth < depth) { ast_flags &= ~T_KPERF_SET_CALLSTACK_DEPTH(depth); ast_flags |= T_KPERF_SET_CALLSTACK_DEPTH(depth); - kperf_set_thread_flags(thread, ast_flags); } } @@ -614,8 +612,8 @@ kperf_action_reset(void) kperf_action_set_samplers(i + 1, 0); kperf_action_set_userdata(i + 1, 0); kperf_action_set_filter(i + 1, -1); - kperf_action_set_ucallstack_depth(i + 1, MAX_CALLSTACK_FRAMES); - kperf_action_set_kcallstack_depth(i + 1, MAX_CALLSTACK_FRAMES); + kperf_action_set_ucallstack_depth(i + 1, MAX_UCALLSTACK_FRAMES); + kperf_action_set_kcallstack_depth(i + 1, MAX_KCALLSTACK_FRAMES); } } @@ -667,8 +665,8 @@ kperf_action_set_count(unsigned count) for (unsigned int i = old_count; i < count; i++) { new_actionv[i].pid_filter = -1; - new_actionv[i].ucallstack_depth = MAX_CALLSTACK_FRAMES; - new_actionv[i].kcallstack_depth = MAX_CALLSTACK_FRAMES; + new_actionv[i].ucallstack_depth = MAX_UCALLSTACK_FRAMES; + new_actionv[i].kcallstack_depth = MAX_KCALLSTACK_FRAMES; } actionv = new_actionv; @@ -688,7 +686,7 @@ kperf_action_set_ucallstack_depth(unsigned action_id, uint32_t depth) return EINVAL; } - if (depth > MAX_CALLSTACK_FRAMES) { + if (depth > MAX_UCALLSTACK_FRAMES) { return EINVAL; } @@ -704,7 +702,7 @@ kperf_action_set_kcallstack_depth(unsigned action_id, uint32_t depth) return EINVAL; } - if (depth > MAX_CALLSTACK_FRAMES) { + if (depth > MAX_KCALLSTACK_FRAMES) { return EINVAL; } @@ -723,7 +721,7 @@ kperf_action_get_ucallstack_depth(unsigned action_id, uint32_t * depth_out) assert(depth_out); if (action_id == 0) { - *depth_out = MAX_CALLSTACK_FRAMES; + *depth_out = MAX_UCALLSTACK_FRAMES; } else { *depth_out = actionv[action_id - 1].ucallstack_depth; } @@ -741,7 +739,7 @@ kperf_action_get_kcallstack_depth(unsigned action_id, uint32_t * depth_out) assert(depth_out); if (action_id == 0) { - *depth_out = MAX_CALLSTACK_FRAMES; + *depth_out = MAX_KCALLSTACK_FRAMES; } else { *depth_out = actionv[action_id - 1].kcallstack_depth; } diff --git a/osfmk/kperf/callstack.c b/osfmk/kperf/callstack.c index 228cd9fe0..d6f0fb9a9 100644 --- a/osfmk/kperf/callstack.c +++ b/osfmk/kperf/callstack.c @@ -44,10 +44,10 @@ #endif static void -callstack_fixup_user(struct callstack *cs, thread_t thread) +callstack_fixup_user(struct kp_ucallstack *cs, thread_t thread) { uint64_t fixup_val = 0; - assert(cs->nframes < MAX_CALLSTACK_FRAMES); + assert(cs->kpuc_nframes < MAX_UCALLSTACK_FRAMES); #if defined(__x86_64__) user_addr_t sp_user; @@ -83,7 +83,7 @@ callstack_fixup_user(struct callstack *cs, thread_t thread) /* encode thumb mode into low bit of PC */ if (get_saved_state_cpsr(state) & PSR_TF) { - cs->frames[0] |= 1ULL; + cs->kpuc_frames[0] |= 1ULL; } fixup_val = get_saved_state_lr(state); @@ -93,7 +93,7 @@ callstack_fixup_user(struct callstack *cs, thread_t thread) #endif out: - cs->frames[cs->nframes++] = fixup_val; + cs->kpuc_frames[cs->kpuc_nframes++] = fixup_val; } #if defined(__x86_64__) @@ -186,10 +186,10 @@ interrupted_kernel_lr(uintptr_t *lr) static void -callstack_fixup_interrupted(struct callstack *cs) +callstack_fixup_interrupted(struct kp_kcallstack *cs) { uintptr_t fixup_val = 0; - assert(cs->nframes < MAX_CALLSTACK_FRAMES); + assert(cs->kpkc_nframes < MAX_KCALLSTACK_FRAMES); /* * Only provide arbitrary data on development or debug kernels. @@ -202,12 +202,12 @@ callstack_fixup_interrupted(struct callstack *cs) #endif /* defined(__x86_64__) */ #endif /* DEVELOPMENT || DEBUG */ - assert(cs->flags & CALLSTACK_KERNEL); - cs->frames[cs->nframes++] = fixup_val; + assert(cs->kpkc_flags & CALLSTACK_KERNEL); + cs->kpkc_frames[cs->kpkc_nframes++] = fixup_val; } void -kperf_continuation_sample(struct callstack *cs, struct kperf_context *context) +kperf_continuation_sample(struct kp_kcallstack *cs, struct kperf_context *context) { thread_t thread; @@ -218,42 +218,46 @@ kperf_continuation_sample(struct callstack *cs, struct kperf_context *context) assert(thread != NULL); assert(thread->continuation != NULL); - cs->flags = CALLSTACK_CONTINUATION | CALLSTACK_VALID | CALLSTACK_KERNEL; + cs->kpkc_flags = CALLSTACK_CONTINUATION | CALLSTACK_VALID | CALLSTACK_KERNEL; #ifdef __LP64__ - cs->flags |= CALLSTACK_64BIT; + cs->kpkc_flags |= CALLSTACK_64BIT; #endif - cs->nframes = 1; - cs->frames[0] = VM_KERNEL_UNSLIDE(thread->continuation); + cs->kpkc_nframes = 1; + cs->kpkc_frames[0] = VM_KERNEL_UNSLIDE(thread->continuation); } void -kperf_backtrace_sample(struct callstack *cs, struct kperf_context *context) +kperf_backtrace_sample(struct kp_kcallstack *cs, struct kperf_context *context) { assert(cs != NULL); assert(context != NULL); assert(context->cur_thread == current_thread()); - cs->flags = CALLSTACK_KERNEL | CALLSTACK_KERNEL_WORDS; + cs->kpkc_flags = CALLSTACK_KERNEL | CALLSTACK_KERNEL_WORDS; #ifdef __LP64__ - cs->flags |= CALLSTACK_64BIT; + cs->kpkc_flags |= CALLSTACK_64BIT; #endif BUF_VERB(PERF_CS_BACKTRACE | DBG_FUNC_START, 1); - cs->nframes = backtrace_frame((uintptr_t *)&(cs->frames), cs->nframes - 1, - context->starting_fp); - if (cs->nframes > 0) { - cs->flags |= CALLSTACK_VALID; + bool trunc = false; + cs->kpkc_nframes = backtrace_frame(cs->kpkc_word_frames, + cs->kpkc_nframes - 1, context->starting_fp, &trunc); + if (cs->kpkc_nframes > 0) { + cs->kpkc_flags |= CALLSTACK_VALID; /* * Fake the value pointed to by the stack pointer or the link * register for symbolicators. */ - cs->frames[cs->nframes + 1] = 0; - cs->nframes += 1; + cs->kpkc_word_frames[cs->kpkc_nframes + 1] = 0; + cs->kpkc_nframes += 1; + } + if (trunc) { + cs->kpkc_nframes |= CALLSTACK_TRUNCATED; } - BUF_VERB(PERF_CS_BACKTRACE | DBG_FUNC_END, cs->nframes); + BUF_VERB(PERF_CS_BACKTRACE | DBG_FUNC_END, cs->kpkc_nframes); } kern_return_t chudxnu_thread_get_callstack64_kperf(thread_t thread, @@ -261,96 +265,96 @@ kern_return_t chudxnu_thread_get_callstack64_kperf(thread_t thread, boolean_t user_only); void -kperf_kcallstack_sample(struct callstack *cs, struct kperf_context *context) +kperf_kcallstack_sample(struct kp_kcallstack *cs, struct kperf_context *context) { thread_t thread; assert(cs != NULL); assert(context != NULL); - assert(cs->nframes <= MAX_CALLSTACK_FRAMES); + assert(cs->kpkc_nframes <= MAX_KCALLSTACK_FRAMES); thread = context->cur_thread; assert(thread != NULL); BUF_INFO(PERF_CS_KSAMPLE | DBG_FUNC_START, (uintptr_t)thread_tid(thread), - cs->nframes); - - cs->flags = CALLSTACK_KERNEL; + cs->kpkc_nframes); + cs->kpkc_flags = CALLSTACK_KERNEL; #ifdef __LP64__ - cs->flags |= CALLSTACK_64BIT; + cs->kpkc_flags |= CALLSTACK_64BIT; #endif if (ml_at_interrupt_context()) { assert(thread == current_thread()); - cs->flags |= CALLSTACK_KERNEL_WORDS; - cs->nframes = backtrace_interrupted((uintptr_t *)cs->frames, - cs->nframes - 1); - if (cs->nframes != 0) { + cs->kpkc_flags |= CALLSTACK_KERNEL_WORDS; + bool trunc = false; + cs->kpkc_nframes = backtrace_interrupted( + cs->kpkc_word_frames, cs->kpkc_nframes - 1, &trunc); + if (cs->kpkc_nframes != 0) { callstack_fixup_interrupted(cs); } + if (trunc) { + cs->kpkc_flags |= CALLSTACK_TRUNCATED; + } } else { /* * Rely on legacy CHUD backtracer to backtrace kernel stacks on * other threads. */ kern_return_t kr; - kr = chudxnu_thread_get_callstack64_kperf(thread, cs->frames, - &cs->nframes, FALSE); + kr = chudxnu_thread_get_callstack64_kperf(thread, + cs->kpkc_frames, &cs->kpkc_nframes, FALSE); if (kr == KERN_SUCCESS) { - cs->flags |= CALLSTACK_VALID; + cs->kpkc_flags |= CALLSTACK_VALID; } else if (kr == KERN_RESOURCE_SHORTAGE) { - cs->flags |= CALLSTACK_VALID; - cs->flags |= CALLSTACK_TRUNCATED; + cs->kpkc_flags |= CALLSTACK_VALID; + cs->kpkc_flags |= CALLSTACK_TRUNCATED; } else { - cs->nframes = 0; + cs->kpkc_nframes = 0; } } - if (cs->nframes == 0) { + if (!(cs->kpkc_flags & CALLSTACK_VALID)) { BUF_INFO(PERF_CS_ERROR, ERR_GETSTACK); } - BUF_INFO(PERF_CS_KSAMPLE | DBG_FUNC_END, (uintptr_t)thread_tid(thread), cs->flags, cs->nframes); + BUF_INFO(PERF_CS_KSAMPLE | DBG_FUNC_END, (uintptr_t)thread_tid(thread), + cs->kpkc_flags, cs->kpkc_nframes); } void -kperf_ucallstack_sample(struct callstack *cs, struct kperf_context *context) +kperf_ucallstack_sample(struct kp_ucallstack *cs, struct kperf_context *context) { - thread_t thread; - bool user_64 = false; - int err; - - assert(cs != NULL); - assert(context != NULL); - assert(cs->nframes <= MAX_CALLSTACK_FRAMES); assert(ml_get_interrupts_enabled() == TRUE); - thread = context->cur_thread; + thread_t thread = context->cur_thread; assert(thread != NULL); - BUF_INFO(PERF_CS_USAMPLE | DBG_FUNC_START, (uintptr_t)thread_tid(thread), - cs->nframes); + BUF_INFO(PERF_CS_USAMPLE | DBG_FUNC_START, + (uintptr_t)thread_tid(thread), cs->kpuc_nframes); - cs->flags = 0; - - err = backtrace_thread_user(thread, (uintptr_t *)cs->frames, - cs->nframes - 1, &cs->nframes, &user_64); - cs->flags |= CALLSTACK_KERNEL_WORDS; - if (user_64) { - cs->flags |= CALLSTACK_64BIT; + bool user64 = false; + bool trunc = false; + int err = backtrace_thread_user(thread, cs->kpuc_frames, + cs->kpuc_nframes - 1, &cs->kpuc_nframes, &user64, &trunc); + cs->kpuc_flags = CALLSTACK_KERNEL_WORDS; + if (user64) { + cs->kpuc_flags |= CALLSTACK_64BIT; + } + if (trunc) { + cs->kpuc_flags |= CALLSTACK_TRUNCATED; } if (!err || err == EFAULT) { callstack_fixup_user(cs, thread); - cs->flags |= CALLSTACK_VALID; + cs->kpuc_flags |= CALLSTACK_VALID; } else { - cs->nframes = 0; + cs->kpuc_nframes = 0; BUF_INFO(PERF_CS_ERROR, ERR_GETSTACK, err); } BUF_INFO(PERF_CS_USAMPLE | DBG_FUNC_END, (uintptr_t)thread_tid(thread), - cs->flags, cs->nframes); + cs->kpuc_flags, cs->kpuc_nframes); } static inline uintptr_t @@ -378,38 +382,36 @@ scrub_frame(uint64_t *bt, int n_frames, int frame) } static void -callstack_log(struct callstack *cs, uint32_t hcode, uint32_t dcode) +callstack_log(uint32_t hdrid, uint32_t dataid, void *vframes, + unsigned int nframes, unsigned int flags) { - BUF_VERB(PERF_CS_LOG | DBG_FUNC_START, cs->flags, cs->nframes); + BUF_VERB(PERF_CS_LOG | DBG_FUNC_START, flags, nframes); - /* framing information for the stack */ - BUF_DATA(hcode, cs->flags, cs->nframes); + BUF_DATA(hdrid, flags, nframes); - /* how many batches of 4 */ - unsigned int nframes = cs->nframes; - unsigned int n = nframes / 4; + unsigned int nevts = nframes / 4; unsigned int ovf = nframes % 4; if (ovf != 0) { - n++; + nevts++; } - bool kern = cs->flags & CALLSTACK_KERNEL; + bool kern = flags & CALLSTACK_KERNEL; - if (cs->flags & CALLSTACK_KERNEL_WORDS) { - uintptr_t *frames = (uintptr_t *)cs->frames; - for (unsigned int i = 0; i < n; i++) { + if (flags & CALLSTACK_KERNEL_WORDS) { + uintptr_t *frames = vframes; + for (unsigned int i = 0; i < nevts; i++) { unsigned int j = i * 4; - BUF_DATA(dcode, + BUF_DATA(dataid, scrub_word(frames, nframes, j + 0, kern), scrub_word(frames, nframes, j + 1, kern), scrub_word(frames, nframes, j + 2, kern), scrub_word(frames, nframes, j + 3, kern)); } } else { - for (unsigned int i = 0; i < n; i++) { - uint64_t *frames = cs->frames; + for (unsigned int i = 0; i < nevts; i++) { + uint64_t *frames = vframes; unsigned int j = i * 4; - BUF_DATA(dcode, + BUF_DATA(dataid, scrub_frame(frames, nframes, j + 0), scrub_frame(frames, nframes, j + 1), scrub_frame(frames, nframes, j + 2), @@ -417,19 +419,21 @@ callstack_log(struct callstack *cs, uint32_t hcode, uint32_t dcode) } } - BUF_VERB(PERF_CS_LOG | DBG_FUNC_END, cs->flags, cs->nframes); + BUF_VERB(PERF_CS_LOG | DBG_FUNC_END, flags, nframes); } void -kperf_kcallstack_log( struct callstack *cs ) +kperf_kcallstack_log(struct kp_kcallstack *cs) { - callstack_log(cs, PERF_CS_KHDR, PERF_CS_KDATA); + callstack_log(PERF_CS_KHDR, PERF_CS_KDATA, cs->kpkc_frames, + cs->kpkc_nframes, cs->kpkc_flags); } void -kperf_ucallstack_log( struct callstack *cs ) +kperf_ucallstack_log(struct kp_ucallstack *cs) { - callstack_log(cs, PERF_CS_UHDR, PERF_CS_UDATA); + callstack_log(PERF_CS_UHDR, PERF_CS_UDATA, cs->kpuc_frames, + cs->kpuc_nframes, cs->kpuc_flags); } int @@ -662,6 +666,9 @@ chudxnu_thread_get_callstack64_kperf( } #elif __arm64__ +#if defined(HAS_APPLE_PAC) +#include +#endif // chudxnu_thread_get_callstack gathers a raw callstack along with any information needed to // fix it up later (in case we stopped program as it was saving values into prev stack frame, etc.) @@ -789,7 +796,12 @@ chudxnu_thread_get_callstack64_internal( (vm_offset_t)fp, (vm_size_t)sizeof(frame)); if (kr == KERN_SUCCESS) { +#if defined(HAS_APPLE_PAC) + /* return addresses on stack will be signed by arm64e ABI */ + pc = (uint64_t)ptrauth_strip((void *)frame[1], ptrauth_key_return_address); +#else pc = frame[1]; +#endif nextFramePointer = (uint64_t *)frame[0]; } else { pc = 0ULL; @@ -803,7 +815,12 @@ chudxnu_thread_get_callstack64_internal( (vm_offset_t)fp, (vm_size_t)sizeof(frame)); if (kr == KERN_SUCCESS) { +#if defined(HAS_APPLE_PAC) + /* return addresses on stack will be signed by arm64e ABI */ + pc = (uint64_t)ptrauth_strip((void *)frame[1], ptrauth_key_return_address); +#else pc = frame[1]; +#endif nextFramePointer = (uint64_t *)(frame[0]); } else { pc = 0ULL; diff --git a/osfmk/kperf/callstack.h b/osfmk/kperf/callstack.h index 76d442ced..a144a8b95 100644 --- a/osfmk/kperf/callstack.h +++ b/osfmk/kperf/callstack.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2011 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2011-2019 Apple Computer, Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -29,7 +29,8 @@ #ifndef KPERF_CALLSTACK_H #define KPERF_CALLSTACK_H -#define MAX_CALLSTACK_FRAMES (128) +#define MAX_KCALLSTACK_FRAMES (128) +#define MAX_UCALLSTACK_FRAMES (256) /* the callstack contains valid data */ #define CALLSTACK_VALID (1U << 0) @@ -46,22 +47,30 @@ /* the frames field is filled with uintptr_t, not uint64_t */ #define CALLSTACK_KERNEL_WORDS (1U << 6) -struct callstack { - uint32_t flags; - uint32_t nframes; - /* WARNING this can be uintptr_t instead if CALLSTACK_KERNEL_WORDS is set */ - uint64_t frames[MAX_CALLSTACK_FRAMES]; +struct kp_ucallstack { + uint32_t kpuc_flags; + uint32_t kpuc_nframes; + uintptr_t kpuc_frames[MAX_UCALLSTACK_FRAMES]; +}; + +struct kp_kcallstack { + uint32_t kpkc_flags; + uint32_t kpkc_nframes; + union { + uintptr_t kpkc_word_frames[MAX_KCALLSTACK_FRAMES]; + uint64_t kpkc_frames[MAX_KCALLSTACK_FRAMES]; + }; }; struct kperf_context; -void kperf_kcallstack_sample(struct callstack *cs, struct kperf_context *); -void kperf_kcallstack_log(struct callstack *cs); -void kperf_continuation_sample(struct callstack *cs, struct kperf_context *); -void kperf_backtrace_sample(struct callstack *cs, struct kperf_context *context); +void kperf_kcallstack_sample(struct kp_kcallstack *cs, struct kperf_context *); +void kperf_kcallstack_log(struct kp_kcallstack *cs); +void kperf_continuation_sample(struct kp_kcallstack *cs, struct kperf_context *); +void kperf_backtrace_sample(struct kp_kcallstack *cs, struct kperf_context *context); -void kperf_ucallstack_sample(struct callstack *cs, struct kperf_context *); +void kperf_ucallstack_sample(struct kp_ucallstack *cs, struct kperf_context *); int kperf_ucallstack_pend(struct kperf_context *, uint32_t depth); -void kperf_ucallstack_log(struct callstack *cs); +void kperf_ucallstack_log(struct kp_ucallstack *cs); #endif /* !defined(KPERF_CALLSTACK_H) */ diff --git a/osfmk/kperf/kperf.c b/osfmk/kperf/kperf.c index bd5c582f0..17a94be8e 100644 --- a/osfmk/kperf/kperf.c +++ b/osfmk/kperf/kperf.c @@ -342,9 +342,11 @@ kperf_port_to_pid(mach_port_name_t portname) if (task == TASK_NULL) { return -1; } + pid_t pid = task_pid(task); - /* drop the ref taken by port_name_to_task */ - (void)task_deallocate_internal(task); + + os_ref_count_t __assert_only count = task_deallocate_internal(task); + assert(count != 0); return pid; } diff --git a/osfmk/kperf/kperf_kpc.c b/osfmk/kperf/kperf_kpc.c index 26b7b777e..43df937a2 100644 --- a/osfmk/kperf/kperf_kpc.c +++ b/osfmk/kperf/kperf_kpc.c @@ -90,7 +90,7 @@ kperf_kpc_cpu_sample(struct kpcdata *kpcd, int sample_config) BUF_INFO(PERF_KPC_CPU_SAMPLE | DBG_FUNC_END, kpcd->running, kpcd->counterc); } -static void +void kperf_kpc_config_log(const struct kpcdata *kpcd) { BUF_DATA(PERF_KPC_CONFIG, @@ -98,64 +98,66 @@ kperf_kpc_config_log(const struct kpcdata *kpcd) kpcd->counterc, kpc_get_counter_count(KPC_CLASS_FIXED_MASK), kpcd->configc); + +#if __LP64__ + unsigned int max = (kpcd->configc + 3) / 4; + for (unsigned int i = 0; i < max; i++) { + uint32_t flag = (i == 0) ? DBG_FUNC_START : ((i == (max - 1)) ? DBG_FUNC_END : DBG_FUNC_NONE); + BUF_DATA(PERF_KPC_CFG_REG | flag, + kpcd->configv[0 + i * 4], kpcd->configv[1 + i * 4], + kpcd->configv[2 + i * 4], kpcd->configv[3 + i * 4]); + } +#else /* __LP64__ */ + unsigned int max = (kpcd->configc + 1) / 2; + for (unsigned int i = 0; i < max; i++) { + uint32_t flag = (i == 0) ? DBG_FUNC_START : ((i == (max - 1)) ? DBG_FUNC_END : DBG_FUNC_NONE); + BUF_DATA(PERF_KPC_CFG_REG32 | flag, + kpcd->configv[i * 2] >> 32ULL, + kpcd->configv[i * 2] & 0xffffffffULL, + kpcd->configv[i * 2 + 1] >> 32ULL, + kpcd->configv[i * 2 + 1] & 0xffffffffULL); + } +#endif /* !__LP64__ */ } static void kperf_kpc_log(uint32_t code, uint32_t code32, const struct kpcdata *kpcd) { - unsigned i; - #if __LP64__ - (void)code32; - /* config registers */ - for (i = 0; i < ((kpcd->configc + 3) / 4); i++) { - BUF_DATA(PERF_KPC_CFG_REG, - kpcd->configv[0 + i * 4], - kpcd->configv[1 + i * 4], - kpcd->configv[2 + i * 4], - kpcd->configv[3 + i * 4]); - } - +#pragma unused(code32) + unsigned int max = (kpcd->counterc + 3) / 4; /* and the actual counts with one 64-bit argument each */ - for (i = 0; i < ((kpcd->counterc + 3) / 4); i++) { - BUF_DATA(code, + for (unsigned int i = 0; i < max; i++) { + uint32_t flag = (i == 0) ? DBG_FUNC_START : ((i == (max - 1)) ? DBG_FUNC_END : DBG_FUNC_NONE); + BUF_DATA(code | flag, kpcd->counterv[0 + i * 4], kpcd->counterv[1 + i * 4], kpcd->counterv[2 + i * 4], kpcd->counterv[3 + i * 4]); } -#else - (void)code; - /* config registers */ - for (i = 0; i < ((kpcd->configc + 1) / 2); i++) { - BUF_DATA(PERF_KPC_CFG_REG32, - (kpcd->configv[0 + i * 2] >> 32ULL), - kpcd->configv[0 + i * 2] & 0xffffffffULL, - (kpcd->configv[1 + i * 2] >> 32ULL), - kpcd->configv[1 + i * 2] & 0xffffffffULL); - } - +#else /* __LP64__ */ +#pragma unused(code) + unsigned int max = (kpcd->counterc + 1) / 2; /* and the actual counts with two 32-bit trace arguments each */ - for (i = 0; i < ((kpcd->counterc + 1) / 2); i++) { - BUF_DATA(code32, + for (unsigned int i = 0; i < max; i++) { + uint32_t flag = (i == 0) ? DBG_FUNC_START : ((i == (max - 1)) ? DBG_FUNC_END : DBG_FUNC_NONE); + BUF_DATA(code32 | flag, (kpcd->counterv[0 + i * 2] >> 32ULL), kpcd->counterv[0 + i * 2] & 0xffffffffULL, (kpcd->counterv[1 + i * 2] >> 32ULL), kpcd->counterv[1 + i * 2] & 0xffffffffULL); } -#endif +#endif /* !__LP64__ */ } void kperf_kpc_cpu_log(const struct kpcdata *kpcd) { - kperf_kpc_config_log(kpcd); kperf_kpc_log(PERF_KPC_DATA, PERF_KPC_DATA32, kpcd); } void kperf_kpc_thread_log(const struct kpcdata *kpcd) { - kperf_kpc_config_log(kpcd); kperf_kpc_log(PERF_KPC_DATA_THREAD, PERF_KPC_DATA_THREAD32, kpcd); } diff --git a/osfmk/kperf/kperf_kpc.h b/osfmk/kperf/kperf_kpc.h index 9b5d58b71..a65fe14fd 100644 --- a/osfmk/kperf/kperf_kpc.h +++ b/osfmk/kperf/kperf_kpc.h @@ -49,5 +49,6 @@ void kperf_kpc_thread_sample(struct kpcdata *, int); void kperf_kpc_cpu_sample(struct kpcdata *, int); void kperf_kpc_thread_log(const struct kpcdata *); void kperf_kpc_cpu_log(const struct kpcdata *); +void kperf_kpc_config_log(const struct kpcdata *); #endif /* __KPERF_KPC_H__ */ diff --git a/osfmk/kperf/kperf_timer.c b/osfmk/kperf/kperf_timer.c index 7d1c478ba..a6287f39c 100644 --- a/osfmk/kperf/kperf_timer.c +++ b/osfmk/kperf/kperf_timer.c @@ -177,7 +177,7 @@ kperf_timer_handler(void *param0, __unused void *param1) uint32_t actionid = KPERF_TMR_ACTION(action_state); if (actionid == 0) { - return; + goto deactivate; } #if DEVELOPMENT || DEBUG diff --git a/osfmk/kperf/sample.h b/osfmk/kperf/sample.h index 35e186ebb..9af5ba5b5 100644 --- a/osfmk/kperf/sample.h +++ b/osfmk/kperf/sample.h @@ -43,8 +43,8 @@ struct kperf_sample { struct kperf_task_snapshot tk_snapshot; - struct callstack kcallstack; - struct callstack ucallstack; + struct kp_kcallstack kcallstack; + struct kp_ucallstack ucallstack; struct meminfo meminfo; #if KPC diff --git a/osfmk/kperf/thread_samplers.c b/osfmk/kperf/thread_samplers.c index 91ebb5026..901e500f7 100644 --- a/osfmk/kperf/thread_samplers.c +++ b/osfmk/kperf/thread_samplers.c @@ -159,8 +159,10 @@ kperf_thread_scheduling_sample(struct kperf_thread_scheduling *thsc, thsc->kpthsc_requested_qos_override = MAX(thread->requested_policy.thrp_qos_override, thread->requested_policy.thrp_qos_workq_override); thsc->kpthsc_requested_qos_promote = thread->requested_policy.thrp_qos_promote; - thsc->kpthsc_requested_qos_ipc_override = thread->requested_policy.thrp_qos_ipc_override; - thsc->kpthsc_requested_qos_sync_ipc_override = thread->requested_policy.thrp_qos_sync_ipc_override; + thsc->kpthsc_requested_qos_kevent_override = MAX( + thread->requested_policy.thrp_qos_kevent_override, + thread->requested_policy.thrp_qos_wlsvc_override); + thsc->kpthsc_requested_qos_sync_ipc_override = THREAD_QOS_UNSPECIFIED; thsc->kpthsc_effective_latency_qos = thread->effective_policy.thep_latency_qos; BUF_INFO(PERF_TI_SCHEDSAMPLE | DBG_FUNC_END); @@ -182,8 +184,7 @@ kperf_thread_scheduling_log(struct kperf_thread_scheduling *thsc) | thsc->kpthsc_requested_qos_override, ((uint64_t)thsc->kpthsc_effective_latency_qos << 61) | ((uint64_t)thsc->kpthsc_requested_qos_promote << 58) - | ((uint64_t)thsc->kpthsc_requested_qos_ipc_override << 55) - | ((uint64_t)thsc->kpthsc_requested_qos_sync_ipc_override << 52) + | ((uint64_t)thsc->kpthsc_requested_qos_kevent_override << 55) ); BUF_DATA(PERF_TI_SCHEDDATA_3, thsc->kpthsc_runnable_time); #else @@ -200,8 +201,7 @@ kperf_thread_scheduling_log(struct kperf_thread_scheduling *thsc) | thsc->kpthsc_requested_qos_override, ((uint32_t)thsc->kpthsc_effective_latency_qos << 29) | ((uint32_t)thsc->kpthsc_requested_qos_promote << 26) - | ((uint32_t)thsc->kpthsc_requested_qos_ipc_override << 23) - | ((uint32_t)thsc->kpthsc_requested_qos_sync_ipc_override << 20) + | ((uint32_t)thsc->kpthsc_requested_qos_kevent_override << 23) ); BUF_DATA(PERF_TI_SCHEDDATA3_32, UPPER_32(thsc->kpthsc_runnable_time), LOWER_32(thsc->kpthsc_runnable_time)); @@ -353,12 +353,8 @@ kperf_thread_inscyc_log(struct kperf_context *context) return; } - uint64_t counts[MT_CORE_NFIXED]; - - int ret = mt_fixed_thread_counts(cur_thread, counts); - if (ret) { - return; - } + uint64_t counts[MT_CORE_NFIXED] = { 0 }; + mt_cur_thread_fixed_counts(counts); #if defined(__LP64__) BUF_DATA(PERF_TI_INSCYCDATA, counts[MT_CORE_INSTRS], counts[MT_CORE_CYCLES]); diff --git a/osfmk/kperf/thread_samplers.h b/osfmk/kperf/thread_samplers.h index 9696dfec5..09a188554 100644 --- a/osfmk/kperf/thread_samplers.h +++ b/osfmk/kperf/thread_samplers.h @@ -43,6 +43,9 @@ void kperf_thread_info_sample(struct kperf_thread_info *, struct kperf_context *); void kperf_thread_info_log(struct kperf_thread_info *); +// legacy names +#define kpthsc_requested_qos_ipc_override kpthsc_requested_qos_kevent_override + /* scheduling information */ struct kperf_thread_scheduling { uint64_t kpthsc_user_time; @@ -55,8 +58,8 @@ struct kperf_thread_scheduling { kpthsc_requested_qos :3, kpthsc_requested_qos_override :3, kpthsc_requested_qos_promote :3, - kpthsc_requested_qos_ipc_override :3, - kpthsc_requested_qos_sync_ipc_override :3, + kpthsc_requested_qos_kevent_override :3, + kpthsc_requested_qos_sync_ipc_override :3, /* obsolete */ kpthsc_effective_latency_qos :3; }; diff --git a/osfmk/libsa/string.h b/osfmk/libsa/string.h index 2a1b67746..1bcf828cc 100644 --- a/osfmk/libsa/string.h +++ b/osfmk/libsa/string.h @@ -50,7 +50,11 @@ extern "C" { #ifndef NULL #if defined (__cplusplus) +#if __cplusplus >= 201103L +#define NULL nullptr +#else #define NULL 0 +#endif #else #define NULL ((void *)0) #endif @@ -93,50 +97,85 @@ extern int strprefix(const char *s1, const char *s2); extern int bcmp(const void *, const void *, size_t); extern void bcopy(const void *, void *, size_t); extern void bzero(void *, size_t); +extern int timingsafe_bcmp(const void *b1, const void *b2, size_t n); #ifdef PRIVATE #include #endif +#if __has_builtin(__builtin_dynamic_object_size) +#define XNU_BOS __builtin_dynamic_object_size +#else +#define XNU_BOS __builtin_object_size +#endif + + +/* __nochk_ functions for opting out of type 1 bounds checking */ +__attribute__((always_inline)) static inline void * +__nochk_memcpy(void *dest, const void *src, size_t len) +{ + return __builtin___memcpy_chk(dest, src, len, XNU_BOS(dest, 0)); +} +__attribute__((always_inline)) static inline void * +__nochk_memmove(void *dest, const void *src, size_t len) +{ + return __builtin___memmove_chk(dest, src, len, XNU_BOS(dest, 0)); +} +__attribute__((always_inline)) static inline void +__nochk_bcopy(const void *src, void *dest, size_t len) +{ + __builtin___memmove_chk(dest, src, len, XNU_BOS(dest, 0)); +} + #if defined(__MAC_OS_X_VERSION_MIN_REQUIRED) && __MAC_OS_X_VERSION_MIN_REQUIRED < __MAC_10_13 /* older deployment target */ #elif defined(KASAN) || (defined (_FORTIFY_SOURCE) && _FORTIFY_SOURCE == 0) -/* FORTIFY_SOURCE disabled */ +/* _FORTIFY_SOURCE disabled */ #else /* _chk macros */ + +#ifdef XNU_KERNEL_PRIVATE +/* Stricter checking in xnu than kexts. When type is set to 1, __builtin_object_size + * returns the size of the closest surrounding sub-object, which would detect copying past + * the end of a struct member. */ +#define BOS_COPY_TYPE 1 +#else +#define BOS_COPY_TYPE 0 +#endif + #if __has_builtin(__builtin___memcpy_chk) -#define memcpy(dest, src, len) __builtin___memcpy_chk(dest, src, len, __builtin_object_size(dest, 0)) +#define memcpy(dest, src, len) __builtin___memcpy_chk(dest, src, len, XNU_BOS(dest, BOS_COPY_TYPE)) #endif #if __has_builtin(__builtin___memmove_chk) -#define memmove(dest, src, len) __builtin___memmove_chk(dest, src, len, __builtin_object_size(dest, 0)) +#define memmove(dest, src, len) __builtin___memmove_chk(dest, src, len, XNU_BOS(dest, BOS_COPY_TYPE)) #endif #if __has_builtin(__builtin___strncpy_chk) -#define strncpy(dest, src, len) __builtin___strncpy_chk(dest, src, len, __builtin_object_size(dest, 1)) +#define strncpy(dest, src, len) __builtin___strncpy_chk(dest, src, len, XNU_BOS(dest, 1)) #endif #if __has_builtin(__builtin___strncat_chk) -#define strncat(dest, src, len) __builtin___strncat_chk(dest, src, len, __builtin_object_size(dest, 1)) +#define strncat(dest, src, len) __builtin___strncat_chk(dest, src, len, XNU_BOS(dest, 1)) #endif #if __has_builtin(__builtin___strlcat_chk) -#define strlcat(dest, src, len) __builtin___strlcat_chk(dest, src, len, __builtin_object_size(dest, 1)) +#define strlcat(dest, src, len) __builtin___strlcat_chk(dest, src, len, XNU_BOS(dest, 1)) #endif #if __has_builtin(__builtin___strlcpy_chk) -#define strlcpy(dest, src, len) __builtin___strlcpy_chk(dest, src, len, __builtin_object_size(dest, 1)) +#define strlcpy(dest, src, len) __builtin___strlcpy_chk(dest, src, len, XNU_BOS(dest, 1)) #endif #if __has_builtin(__builtin___strcpy_chk) -#define strcpy(dest, src, len) __builtin___strcpy_chk(dest, src, __builtin_object_size(dest, 1)) +#define strcpy(dest, src, len) __builtin___strcpy_chk(dest, src, XNU_BOS(dest, 1)) #endif #if __has_builtin(__builtin___strcat_chk) -#define strcat(dest, src) __builtin___strcat_chk(dest, src, __builtin_object_size(dest, 1)) +#define strcat(dest, src) __builtin___strcat_chk(dest, src, XNU_BOS(dest, 1)) #endif #if __has_builtin(__builtin___memmove_chk) -#define bcopy(src, dest, len) __builtin___memmove_chk(dest, src, len, __builtin_object_size(dest, 0)) +#define bcopy(src, dest, len) __builtin___memmove_chk(dest, src, len, XNU_BOS(dest, BOS_COPY_TYPE)) #endif #endif /* _chk macros */ diff --git a/osfmk/libsa/types.h b/osfmk/libsa/types.h index 737206913..9a9326fc7 100644 --- a/osfmk/libsa/types.h +++ b/osfmk/libsa/types.h @@ -99,7 +99,7 @@ typedef volatile unsigned long vulong_t; * Deprecation macro */ #if __GNUC__ >= 3 -#define __deprecated __attribute__((deprecated)) +#define __deprecated __attribute__((__deprecated__)) #else #define __deprecated /* nothing */ #endif diff --git a/osfmk/lockd/Makefile b/osfmk/lockd/Makefile index 8ad03c5eb..a2591e477 100644 --- a/osfmk/lockd/Makefile +++ b/osfmk/lockd/Makefile @@ -42,7 +42,7 @@ COMP_FILES = ${MIG_KUSRC} do_build_all:: $(COMP_FILES) ${MIG_KUSRC} : lockd_mach.defs - @echo "$(ColorM)MIG$(Color0) $(ColorF)$@$(Color0)" + $(call makelog,$(ColorM)MIG$(Color0) $(ColorF)$@$(Color0)) $(_v)${MIG} ${MIGFLAGS} ${MIGKUFLAGS} \ -user lockd_mach.c \ -header lockd_mach.h \ diff --git a/osfmk/mach/Makefile b/osfmk/mach/Makefile index e728f0a4f..310027b65 100644 --- a/osfmk/mach/Makefile +++ b/osfmk/mach/Makefile @@ -49,14 +49,20 @@ MIG_DEFS = \ thread_act.defs \ vm_map.defs +MIG_PRIVATE_DEFS = \ + restartable.defs + MACH_PRIVATE_DEFS = \ coalition_notification.defs \ + fairplayd_notification.defs \ + arcade_upcall.defs \ ktrace_background.defs \ mach_notify.defs \ memory_object_control.defs \ memory_object_default.defs \ sysdiagnose_notification.defs \ upl.defs \ + vfs_nspace.defs \ vm32_map.defs # @@ -68,12 +74,15 @@ MIG_USHDRS = \ clock_reply_server.h \ coalition_notification_server.h \ exc_server.h \ + fairplayd_notification_server.h \ + arcade_upcall_server.h \ mach_exc_server.h \ memory_object_default_server.h \ notify_server.h \ task_access_server.h \ telemetry_notification_server.h \ - sysdiagnose_notification_server.h + sysdiagnose_notification_server.h \ + vfs_nspace_server.h MIG_UUHDRS = \ clock.h \ @@ -95,7 +104,8 @@ MIG_UUHDRS = \ task_access.h \ thread_act.h \ upl.h \ - vm_map.h + vm_map.h \ + vfs_nspace.h MIGINCLUDES = ${MIG_UUHDRS} ${MIG_USHDRS} @@ -170,6 +180,8 @@ PRIVATE_DATAFILES = \ bootstrap.h \ coalition.h \ coalition_notification.defs \ + fairplayd_notification.defs \ + arcade_upcall.defs \ host_info.h \ ktrace_background.defs \ mach_host.defs \ @@ -189,7 +201,9 @@ PRIVATE_DATAFILES = \ task_policy.h \ thread_policy.h \ thread_switch.h \ - vm_prot.h + vfs_nspace.defs \ + vm_prot.h \ + ${MIG_PRIVATE_DEFS} INSTALL_MI_LCL_LIST = ${PRIVATE_DATAFILES} @@ -222,7 +236,7 @@ ${MIGINCLUDES} : ${MIG_TYPES} ${MIG_UUHDRS} : \ %.h : %.defs - @echo "$(ColorM)MIG$(Color0) $(ColorF)$@$(Color0)" + $(call makelog,$(ColorM)MIG$(Color0) $(ColorF)$@$(Color0)) $(_v)$(MIG) $(MIGFLAGS) \ -server /dev/null \ -user /dev/null \ @@ -231,7 +245,7 @@ ${MIG_UUHDRS} : \ ${MIG_USHDRS} : \ %_server.h : %.defs - @echo "$(ColorM)MIG$(Color0) $(ColorF)$@$(Color0)" + $(call makelog,$(ColorM)MIG$(Color0) $(ColorF)$@$(Color0)) $(_v)$(MIG) $(MIGFLAGS) \ -server /dev/null \ -user /dev/null \ @@ -275,6 +289,8 @@ MIG_KUSRC = \ clock_reply_user.c \ coalition_notification_user.c \ exc_user.c \ + fairplayd_notification_user.c \ + arcade_upcall_user.c \ host_notify_reply_user.c \ ktrace_background_user.c \ mach_exc_user.c \ @@ -286,10 +302,12 @@ MIG_KUSRC = \ task_access_user.c \ telemetry_notification_user.c \ upl_user.c \ + vfs_nspace_user.c \ vm_map_user.c \ sysdiagnose_notification_user.c MIG_KSHDRS = \ + arcade_register_server.h \ clock_server.h \ clock_priv_server.h \ exc_server.h \ @@ -308,6 +326,7 @@ MIG_KSHDRS = \ memory_object_default_server.h \ processor_server.h \ processor_set_server.h \ + restartable_server.h \ task_server.h \ thread_act_server.h \ upl_server.h \ @@ -315,6 +334,7 @@ MIG_KSHDRS = \ vm32_map_server.h MIG_KSSRC = \ + arcade_register_server.c \ clock_server.c \ clock_priv_server.c \ exc_server.c \ @@ -333,6 +353,7 @@ MIG_KSSRC = \ memory_object_default_server.c \ processor_server.c \ processor_set_server.c \ + restartable_server.c \ task_server.c \ thread_act_server.c \ upl_server.c \ @@ -363,7 +384,7 @@ ${COMP_FILES} : ${MIG_TYPES} ${MIG_KUSRC} : \ %_user.c : %.defs - @echo "$(ColorM)MIG$(Color0) $(ColorF)$@$(Color0)" + $(call makelog,$(ColorM)MIG$(Color0) $(ColorF)$@$(Color0)) $(_v)${MIG} ${MIGFLAGS} ${MIGKUFLAGS} \ -user $*_user.c \ -header $*.h \ @@ -373,7 +394,7 @@ ${MIG_KUSRC} : \ ${MIG_KSSRC}: \ %_server.c : %.defs - @echo "$(ColorM)MIG$(Color0) $(ColorF)$@$(Color0)" + $(call makelog,$(ColorM)MIG$(Color0) $(ColorF)$@$(Color0)) $(_v)${MIG} ${MIGFLAGS} ${MIGKSFLAGS} \ -user /dev/null \ -header /dev/null \ diff --git a/osfmk/mach/arcade_register.defs b/osfmk/mach/arcade_register.defs new file mode 100644 index 000000000..78f4eec5f --- /dev/null +++ b/osfmk/mach/arcade_register.defs @@ -0,0 +1,45 @@ +/* + * Copyright (c) 2019 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +/* + * Interface definition for the fairplay upcall mechanism. + */ + +subsystem +#if KERNEL_SERVER + KernelServer +#endif /* KERNEL_SERVER */ + arcade_register 51471; + +#include +#include + +routine arcade_register_new_upcall( + arcade_register : arcade_register_t; + arcade_upcall : mach_port_t); + diff --git a/osfmk/mach/arcade_upcall.defs b/osfmk/mach/arcade_upcall.defs new file mode 100644 index 000000000..db724f0e5 --- /dev/null +++ b/osfmk/mach/arcade_upcall.defs @@ -0,0 +1,50 @@ +/* + * Copyright (c) 2019 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + +* @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +/* + * Interface definition for the fairplay upcall mechanism. + */ + +subsystem +#if KERNEL_USER + KernelUser +#endif /* KERNEL_USER */ + arcade_upcall 61471; + +#include +#include + +routine arcade_upcall( + arcade_upcall : mach_port_t; + path : pointer_t; + offset : uint64_t; + out should_kill : boolean_t); + +/* vim: set ft=c : */ + diff --git a/osfmk/mach/arm/_structs.h b/osfmk/mach/arm/_structs.h index cc815f80b..d5f4d864d 100644 --- a/osfmk/mach/arm/_structs.h +++ b/osfmk/mach/arm/_structs.h @@ -35,76 +35,69 @@ #include /* __uint32_t */ #if __DARWIN_UNIX03 -#define _STRUCT_ARM_EXCEPTION_STATE struct __darwin_arm_exception_state +#define _STRUCT_ARM_EXCEPTION_STATE struct __darwin_arm_exception_state _STRUCT_ARM_EXCEPTION_STATE { - __uint32_t __exception; /* number of arm exception taken */ - __uint32_t __fsr; /* Fault status */ - __uint32_t __far; /* Virtual Fault Address */ + __uint32_t __exception; /* number of arm exception taken */ + __uint32_t __fsr; /* Fault status */ + __uint32_t __far; /* Virtual Fault Address */ }; #else /* !__DARWIN_UNIX03 */ -#define _STRUCT_ARM_EXCEPTION_STATE struct arm_exception_state +#define _STRUCT_ARM_EXCEPTION_STATE struct arm_exception_state _STRUCT_ARM_EXCEPTION_STATE { - __uint32_t exception; /* number of arm exception taken */ - __uint32_t fsr; /* Fault status */ - __uint32_t far; /* Virtual Fault Address */ + __uint32_t exception; /* number of arm exception taken */ + __uint32_t fsr; /* Fault status */ + __uint32_t far; /* Virtual Fault Address */ }; #endif /* __DARWIN_UNIX03 */ #if __DARWIN_UNIX03 -#define _STRUCT_ARM_EXCEPTION_STATE64 struct __darwin_arm_exception_state64 +#define _STRUCT_ARM_EXCEPTION_STATE64 struct __darwin_arm_exception_state64 _STRUCT_ARM_EXCEPTION_STATE64 { - __uint64_t __far; /* Virtual Fault Address */ - __uint32_t __esr; /* Exception syndrome */ - __uint32_t __exception; /* number of arm exception taken */ + __uint64_t __far; /* Virtual Fault Address */ + __uint32_t __esr; /* Exception syndrome */ + __uint32_t __exception; /* number of arm exception taken */ }; #else /* !__DARWIN_UNIX03 */ -#define _STRUCT_ARM_EXCEPTION_STATE64 struct arm_exception_state64 +#define _STRUCT_ARM_EXCEPTION_STATE64 struct arm_exception_state64 _STRUCT_ARM_EXCEPTION_STATE64 { - __uint64_t far; /* Virtual Fault Address */ - __uint32_t esr; /* Exception syndrome */ - __uint32_t exception; /* number of arm exception taken */ + __uint64_t far; /* Virtual Fault Address */ + __uint32_t esr; /* Exception syndrome */ + __uint32_t exception; /* number of arm exception taken */ }; #endif /* __DARWIN_UNIX03 */ #if __DARWIN_UNIX03 -#define _STRUCT_ARM_THREAD_STATE struct __darwin_arm_thread_state +#define _STRUCT_ARM_THREAD_STATE struct __darwin_arm_thread_state _STRUCT_ARM_THREAD_STATE { - __uint32_t __r[13]; /* General purpose register r0-r12 */ - __uint32_t __sp; /* Stack pointer r13 */ - __uint32_t __lr; /* Link register r14 */ - __uint32_t __pc; /* Program counter r15 */ - __uint32_t __cpsr; /* Current program status register */ + __uint32_t __r[13]; /* General purpose register r0-r12 */ + __uint32_t __sp; /* Stack pointer r13 */ + __uint32_t __lr; /* Link register r14 */ + __uint32_t __pc; /* Program counter r15 */ + __uint32_t __cpsr; /* Current program status register */ }; #else /* !__DARWIN_UNIX03 */ -#define _STRUCT_ARM_THREAD_STATE struct arm_thread_state +#define _STRUCT_ARM_THREAD_STATE struct arm_thread_state _STRUCT_ARM_THREAD_STATE { - __uint32_t r[13]; /* General purpose register r0-r12 */ - __uint32_t sp; /* Stack pointer r13 */ - __uint32_t lr; /* Link register r14 */ - __uint32_t pc; /* Program counter r15 */ - __uint32_t cpsr; /* Current program status register */ + __uint32_t r[13]; /* General purpose register r0-r12 */ + __uint32_t sp; /* Stack pointer r13 */ + __uint32_t lr; /* Link register r14 */ + __uint32_t pc; /* Program counter r15 */ + __uint32_t cpsr; /* Current program status register */ }; #endif /* __DARWIN_UNIX03 */ -#if __DARWIN_UNIX03 -#define _STRUCT_ARM_THREAD_STATE64 struct __darwin_arm_thread_state64 -_STRUCT_ARM_THREAD_STATE64 -{ - __uint64_t __x[29]; /* General purpose registers x0-x28 */ - __uint64_t __fp; /* Frame pointer x29 */ - __uint64_t __lr; /* Link register x30 */ - __uint64_t __sp; /* Stack pointer x31 */ - __uint64_t __pc; /* Program counter */ - __uint32_t __cpsr; /* Current program status register */ - __uint32_t __pad; /* Same size for 32-bit or 64-bit clients */ -}; -#else /* !__DARWIN_UNIX03 */ +#if defined(KERNEL) + +#define __DARWIN_OPAQUE_ARM_THREAD_STATE64 0 +#define __DARWIN_ARM_THREAD_STATE64_FLAGS_NO_PTRAUTH 0x1 +#define __DARWIN_ARM_THREAD_STATE64_FLAGS_IB_SIGNED_LR 0x2 + #define _STRUCT_ARM_THREAD_STATE64 struct arm_thread_state64 _STRUCT_ARM_THREAD_STATE64 { @@ -114,92 +107,342 @@ _STRUCT_ARM_THREAD_STATE64 __uint64_t sp; /* Stack pointer x31 */ __uint64_t pc; /* Program counter */ __uint32_t cpsr; /* Current program status register */ - __uint32_t __pad; /* Same size for 32-bit or 64-bit clients */ + __uint32_t flags; /* Flags describing structure format */ }; + +#else /* defined(KERNEL) */ + +/* + * By default, the pointer fields in the arm_thread_state64_t structure are + * opaque on the arm64e architecture and require the use of accessor macros. + * This mode can also be enabled on the arm64 architecture by building with + * -D__DARWIN_OPAQUE_ARM_THREAD_STATE64=1. + */ +#if defined(__arm64__) && defined(__LP64__) + +#if __has_feature(ptrauth_calls) +#define __DARWIN_OPAQUE_ARM_THREAD_STATE64 1 +#define __DARWIN_PTRAUTH_ARM_THREAD_STATE64 1 +#endif /* __has_feature(ptrauth_calls) */ + +#ifndef __DARWIN_OPAQUE_ARM_THREAD_STATE64 +#define __DARWIN_OPAQUE_ARM_THREAD_STATE64 0 +#endif + +#else /* defined(__arm64__) && defined(__LP64__) */ + +#undef __DARWIN_OPAQUE_ARM_THREAD_STATE64 +#define __DARWIN_OPAQUE_ARM_THREAD_STATE64 0 + +#endif /* defined(__arm64__) && defined(__LP64__) */ + +#if __DARWIN_UNIX03 +#define _STRUCT_ARM_THREAD_STATE64 struct __darwin_arm_thread_state64 +#if __DARWIN_OPAQUE_ARM_THREAD_STATE64 +_STRUCT_ARM_THREAD_STATE64 +{ + __uint64_t __x[29]; /* General purpose registers x0-x28 */ + void* __opaque_fp; /* Frame pointer x29 */ + void* __opaque_lr; /* Link register x30 */ + void* __opaque_sp; /* Stack pointer x31 */ + void* __opaque_pc; /* Program counter */ + __uint32_t __cpsr; /* Current program status register */ + __uint32_t __opaque_flags; /* Flags describing structure format */ +}; +#else /* __DARWIN_OPAQUE_ARM_THREAD_STATE64 */ +_STRUCT_ARM_THREAD_STATE64 +{ + __uint64_t __x[29]; /* General purpose registers x0-x28 */ + __uint64_t __fp; /* Frame pointer x29 */ + __uint64_t __lr; /* Link register x30 */ + __uint64_t __sp; /* Stack pointer x31 */ + __uint64_t __pc; /* Program counter */ + __uint32_t __cpsr; /* Current program status register */ + __uint32_t __pad; /* Same size for 32-bit or 64-bit clients */ +}; +#endif /* __DARWIN_OPAQUE_ARM_THREAD_STATE64 */ +#else /* !__DARWIN_UNIX03 */ +#define _STRUCT_ARM_THREAD_STATE64 struct arm_thread_state64 +#if __DARWIN_OPAQUE_ARM_THREAD_STATE64 +_STRUCT_ARM_THREAD_STATE64 +{ + __uint64_t x[29]; /* General purpose registers x0-x28 */ + void* __opaque_fp; /* Frame pointer x29 */ + void* __opaque_lr; /* Link register x30 */ + void* __opaque_sp; /* Stack pointer x31 */ + void* __opaque_pc; /* Program counter */ + __uint32_t cpsr; /* Current program status register */ + __uint32_t __opaque_flags; /* Flags describing structure format */ +}; +#else /* __DARWIN_OPAQUE_ARM_THREAD_STATE64 */ +_STRUCT_ARM_THREAD_STATE64 +{ + __uint64_t x[29]; /* General purpose registers x0-x28 */ + __uint64_t fp; /* Frame pointer x29 */ + __uint64_t lr; /* Link register x30 */ + __uint64_t sp; /* Stack pointer x31 */ + __uint64_t pc; /* Program counter */ + __uint32_t cpsr; /* Current program status register */ + __uint32_t __pad; /* Same size for 32-bit or 64-bit clients */ +}; +#endif /* __DARWIN_OPAQUE_ARM_THREAD_STATE64 */ #endif /* __DARWIN_UNIX03 */ -#if !defined(KERNEL) + #if __DARWIN_C_LEVEL >= __DARWIN_C_FULL && defined(__arm64__) + +/* Accessor macros for arm_thread_state64_t pointer fields */ + +#if __has_feature(ptrauth_calls) && defined(__LP64__) +#include + +#if !__DARWIN_OPAQUE_ARM_THREAD_STATE64 || !__DARWIN_PTRAUTH_ARM_THREAD_STATE64 +#error "Invalid configuration" +#endif + +#define __DARWIN_ARM_THREAD_STATE64_FLAGS_NO_PTRAUTH 0x1 +#define __DARWIN_ARM_THREAD_STATE64_FLAGS_IB_SIGNED_LR 0x2 + +/* Return pc field of arm_thread_state64_t as a data pointer value */ +#define __darwin_arm_thread_state64_get_pc(ts) \ + __extension__ ({ const _STRUCT_ARM_THREAD_STATE64 *__tsp = &(ts); \ + (uintptr_t)(__tsp->__opaque_pc && !(__tsp->__opaque_flags & \ + __DARWIN_ARM_THREAD_STATE64_FLAGS_NO_PTRAUTH) ? \ + ptrauth_auth_data(__tsp->__opaque_pc, \ + ptrauth_key_process_independent_code, \ + ptrauth_string_discriminator("pc")) : __tsp->__opaque_pc); }) +/* Return pc field of arm_thread_state64_t as a function pointer. May return + * NULL if a valid function pointer cannot be constructed, the caller should + * fall back to the __darwin_arm_thread_state64_get_pc() macro in that case. */ +#define __darwin_arm_thread_state64_get_pc_fptr(ts) \ + __extension__ ({ const _STRUCT_ARM_THREAD_STATE64 *__tsp = &(ts); \ + (__tsp->__opaque_pc && !(__tsp->__opaque_flags & \ + __DARWIN_ARM_THREAD_STATE64_FLAGS_NO_PTRAUTH) ? \ + ptrauth_auth_function(__tsp->__opaque_pc, \ + ptrauth_key_process_independent_code, \ + ptrauth_string_discriminator("pc")) : NULL); }) +/* Set pc field of arm_thread_state64_t to a function pointer */ +#define __darwin_arm_thread_state64_set_pc_fptr(ts, fptr) \ + __extension__ ({ _STRUCT_ARM_THREAD_STATE64 *__tsp = &(ts); \ + __typeof__(fptr) __f = (fptr); __tsp->__opaque_pc = \ + (__f ? (!(__tsp->__opaque_flags & \ + __DARWIN_ARM_THREAD_STATE64_FLAGS_NO_PTRAUTH) ? \ + ptrauth_auth_and_resign(__f, ptrauth_key_function_pointer, 0, \ + ptrauth_key_process_independent_code, \ + ptrauth_string_discriminator("pc")) : ptrauth_auth_data(__f, \ + ptrauth_key_function_pointer, 0)) : __f); }) +/* Return lr field of arm_thread_state64_t as a data pointer value */ +#define __darwin_arm_thread_state64_get_lr(ts) \ + __extension__ ({ const _STRUCT_ARM_THREAD_STATE64 *__tsp = &(ts); \ + (uintptr_t)(__tsp->__opaque_lr && !(__tsp->__opaque_flags & ( \ + __DARWIN_ARM_THREAD_STATE64_FLAGS_NO_PTRAUTH | \ + __DARWIN_ARM_THREAD_STATE64_FLAGS_IB_SIGNED_LR)) ? \ + ptrauth_auth_data(__tsp->__opaque_lr, \ + ptrauth_key_process_independent_code, \ + ptrauth_string_discriminator("lr")) : __tsp->__opaque_lr); }) +/* Return lr field of arm_thread_state64_t as a function pointer. May return + * NULL if a valid function pointer cannot be constructed, the caller should + * fall back to the __darwin_arm_thread_state64_get_lr() macro in that case. */ +#define __darwin_arm_thread_state64_get_lr_fptr(ts) \ + __extension__ ({ const _STRUCT_ARM_THREAD_STATE64 *__tsp = &(ts); \ + (__tsp->__opaque_lr && !(__tsp->__opaque_flags & ( \ + __DARWIN_ARM_THREAD_STATE64_FLAGS_NO_PTRAUTH | \ + __DARWIN_ARM_THREAD_STATE64_FLAGS_IB_SIGNED_LR)) ? \ + ptrauth_auth_function(__tsp->__opaque_lr, \ + ptrauth_key_process_independent_code, \ + ptrauth_string_discriminator("lr")) : NULL); }) +/* Set lr field of arm_thread_state64_t to a function pointer */ +#define __darwin_arm_thread_state64_set_lr_fptr(ts, fptr) \ + __extension__ ({ _STRUCT_ARM_THREAD_STATE64 *__tsp = &(ts); \ + __typeof__(fptr) __f = (fptr); __tsp->__opaque_lr = \ + (__f ? (!(__tsp->__opaque_flags & \ + __DARWIN_ARM_THREAD_STATE64_FLAGS_NO_PTRAUTH) ? (__tsp->__opaque_flags \ + &= ~__DARWIN_ARM_THREAD_STATE64_FLAGS_IB_SIGNED_LR , \ + ptrauth_auth_and_resign(__f, ptrauth_key_function_pointer, 0, \ + ptrauth_key_process_independent_code, \ + ptrauth_string_discriminator("lr"))) : ptrauth_auth_data(__f, \ + ptrauth_key_function_pointer, 0)) : __f); }) +/* Return sp field of arm_thread_state64_t as a data pointer value */ +#define __darwin_arm_thread_state64_get_sp(ts) \ + __extension__ ({ const _STRUCT_ARM_THREAD_STATE64 *__tsp = &(ts); \ + (uintptr_t)(__tsp->__opaque_sp && !(__tsp->__opaque_flags & \ + __DARWIN_ARM_THREAD_STATE64_FLAGS_NO_PTRAUTH) ? \ + ptrauth_auth_data(__tsp->__opaque_sp, \ + ptrauth_key_process_independent_data, \ + ptrauth_string_discriminator("sp")) : __tsp->__opaque_sp); }) +/* Set sp field of arm_thread_state64_t to a data pointer value */ +#define __darwin_arm_thread_state64_set_sp(ts, ptr) \ + __extension__ ({ _STRUCT_ARM_THREAD_STATE64 *__tsp = &(ts); \ + void *__p = (void*)(uintptr_t)(ptr); __tsp->__opaque_sp = \ + (__p && !(__tsp->__opaque_flags & \ + __DARWIN_ARM_THREAD_STATE64_FLAGS_NO_PTRAUTH) ? \ + ptrauth_sign_unauthenticated(__p, \ + ptrauth_key_process_independent_data, \ + ptrauth_string_discriminator("sp")) : __p); }) +/* Return fp field of arm_thread_state64_t as a data pointer value */ +#define __darwin_arm_thread_state64_get_fp(ts) \ + __extension__ ({ const _STRUCT_ARM_THREAD_STATE64 *__tsp = &(ts); \ + (uintptr_t)(__tsp->__opaque_fp && !(__tsp->__opaque_flags & \ + __DARWIN_ARM_THREAD_STATE64_FLAGS_NO_PTRAUTH) ? \ + ptrauth_auth_data(__tsp->__opaque_fp, \ + ptrauth_key_process_independent_data, \ + ptrauth_string_discriminator("fp")) : __tsp->__opaque_fp); }) +/* Set fp field of arm_thread_state64_t to a data pointer value */ +#define __darwin_arm_thread_state64_set_fp(ts, ptr) \ + __extension__ ({ _STRUCT_ARM_THREAD_STATE64 *__tsp = &(ts); \ + void *__p = (void*)(uintptr_t)(ptr); __tsp->__opaque_fp = \ + (__p && !(__tsp->__opaque_flags & \ + __DARWIN_ARM_THREAD_STATE64_FLAGS_NO_PTRAUTH) ? \ + ptrauth_sign_unauthenticated(__p, \ + ptrauth_key_process_independent_data, \ + ptrauth_string_discriminator("fp")) : __p); }) + +#else /* __has_feature(ptrauth_calls) && defined(__LP64__) */ + +#if __DARWIN_OPAQUE_ARM_THREAD_STATE64 + +#ifndef __LP64__ +#error "Invalid configuration" +#endif + +/* Return pc field of arm_thread_state64_t as a data pointer value */ +#define __darwin_arm_thread_state64_get_pc(ts) \ + ((uintptr_t)((ts).__opaque_pc)) +/* Return pc field of arm_thread_state64_t as a function pointer */ +#define __darwin_arm_thread_state64_get_pc_fptr(ts) \ + ((ts).__opaque_pc) +/* Set pc field of arm_thread_state64_t to a function pointer */ +#define __darwin_arm_thread_state64_set_pc_fptr(ts, fptr) \ + ((ts).__opaque_pc = (fptr)) +/* Return lr field of arm_thread_state64_t as a data pointer value */ +#define __darwin_arm_thread_state64_get_lr(ts) \ + ((uintptr_t)((ts).__opaque_lr)) +/* Return lr field of arm_thread_state64_t as a function pointer */ +#define __darwin_arm_thread_state64_get_lr_fptr(ts) \ + ((ts).__opaque_lr) +/* Set lr field of arm_thread_state64_t to a function pointer */ +#define __darwin_arm_thread_state64_set_lr_fptr(ts, fptr) \ + ((ts).__opaque_lr = (fptr)) +/* Return sp field of arm_thread_state64_t as a data pointer value */ +#define __darwin_arm_thread_state64_get_sp(ts) \ + ((uintptr_t)((ts).__opaque_sp)) +/* Set sp field of arm_thread_state64_t to a data pointer value */ +#define __darwin_arm_thread_state64_set_sp(ts, ptr) \ + ((ts).__opaque_sp = (void*)(uintptr_t)(ptr)) +/* Return fp field of arm_thread_state64_t as a data pointer value */ +#define __darwin_arm_thread_state64_get_fp(ts) \ + ((uintptr_t)((ts).__opaque_fp)) +/* Set fp field of arm_thread_state64_t to a data pointer value */ +#define __darwin_arm_thread_state64_set_fp(ts, ptr) \ + ((ts).__opaque_fp = (void*)(uintptr_t)(ptr)) + +#else /* __DARWIN_OPAQUE_ARM_THREAD_STATE64 */ #if __DARWIN_UNIX03 + +/* Return pc field of arm_thread_state64_t as a data pointer value */ #define __darwin_arm_thread_state64_get_pc(ts) \ - ((ts).__pc) + ((ts).__pc) +/* Return pc field of arm_thread_state64_t as a function pointer */ #define __darwin_arm_thread_state64_get_pc_fptr(ts) \ - ((void*)(uintptr_t)((ts).__pc)) + ((void*)(uintptr_t)((ts).__pc)) +/* Set pc field of arm_thread_state64_t to a function pointer */ #define __darwin_arm_thread_state64_set_pc_fptr(ts, fptr) \ - ((ts).__pc = (uintptr_t)(fptr)) + ((ts).__pc = (uintptr_t)(fptr)) +/* Return lr field of arm_thread_state64_t as a data pointer value */ #define __darwin_arm_thread_state64_get_lr(ts) \ - ((ts).__lr) + ((ts).__lr) +/* Return lr field of arm_thread_state64_t as a function pointer */ #define __darwin_arm_thread_state64_get_lr_fptr(ts) \ - ((void*)(uintptr_t)((ts).__lr)) + ((void*)(uintptr_t)((ts).__lr)) +/* Set lr field of arm_thread_state64_t to a function pointer */ #define __darwin_arm_thread_state64_set_lr_fptr(ts, fptr) \ - ((ts).__lr = (uintptr_t)(fptr)) + ((ts).__lr = (uintptr_t)(fptr)) +/* Return sp field of arm_thread_state64_t as a data pointer value */ #define __darwin_arm_thread_state64_get_sp(ts) \ - ((ts).__sp) + ((ts).__sp) +/* Set sp field of arm_thread_state64_t to a data pointer value */ #define __darwin_arm_thread_state64_set_sp(ts, ptr) \ - ((ts).__sp = (uintptr_t)(ptr)) + ((ts).__sp = (uintptr_t)(ptr)) +/* Return fp field of arm_thread_state64_t as a data pointer value */ #define __darwin_arm_thread_state64_get_fp(ts) \ - ((ts).__fp) + ((ts).__fp) +/* Set fp field of arm_thread_state64_t to a data pointer value */ #define __darwin_arm_thread_state64_set_fp(ts, ptr) \ - ((ts).__fp = (uintptr_t)(ptr)) -#else /* !__DARWIN_UNIX03 */ + ((ts).__fp = (uintptr_t)(ptr)) + +#else /* __DARWIN_UNIX03 */ + +/* Return pc field of arm_thread_state64_t as a data pointer value */ #define __darwin_arm_thread_state64_get_pc(ts) \ - ((ts).pc) + ((ts).pc) +/* Return pc field of arm_thread_state64_t as a function pointer */ #define __darwin_arm_thread_state64_get_pc_fptr(ts) \ - ((void*)(uintptr_t)((ts).pc)) + ((void*)(uintptr_t)((ts).pc)) +/* Set pc field of arm_thread_state64_t to a function pointer */ #define __darwin_arm_thread_state64_set_pc_fptr(ts, fptr) \ - ((ts).pc = (uintptr_t)(fptr)) + ((ts).pc = (uintptr_t)(fptr)) +/* Return lr field of arm_thread_state64_t as a data pointer value */ #define __darwin_arm_thread_state64_get_lr(ts) \ - ((ts).lr) + ((ts).lr) +/* Return lr field of arm_thread_state64_t as a function pointer */ #define __darwin_arm_thread_state64_get_lr_fptr(ts) \ - ((void*)(uintptr_t)((ts).lr)) + ((void*)(uintptr_t)((ts).lr)) +/* Set lr field of arm_thread_state64_t to a function pointer */ #define __darwin_arm_thread_state64_set_lr_fptr(ts, fptr) \ - ((ts).lr = (uintptr_t)(fptr)) + ((ts).lr = (uintptr_t)(fptr)) +/* Return sp field of arm_thread_state64_t as a data pointer value */ #define __darwin_arm_thread_state64_get_sp(ts) \ - ((ts).sp) + ((ts).sp) +/* Set sp field of arm_thread_state64_t to a data pointer value */ #define __darwin_arm_thread_state64_set_sp(ts, ptr) \ - ((ts).sp = (uintptr_t)(ptr)) + ((ts).sp = (uintptr_t)(ptr)) +/* Return fp field of arm_thread_state64_t as a data pointer value */ #define __darwin_arm_thread_state64_get_fp(ts) \ - ((ts).fp) + ((ts).fp) +/* Set fp field of arm_thread_state64_t to a data pointer value */ #define __darwin_arm_thread_state64_set_fp(ts, ptr) \ - ((ts).fp = (uintptr_t)(ptr)) + ((ts).fp = (uintptr_t)(ptr)) + #endif /* __DARWIN_UNIX03 */ +#endif /* __DARWIN_OPAQUE_ARM_THREAD_STATE64 */ + +#endif /* __has_feature(ptrauth_calls) && defined(__LP64__) */ #endif /* __DARWIN_C_LEVEL >= __DARWIN_C_FULL && defined(__arm64__) */ #endif /* !defined(KERNEL) */ #if __DARWIN_UNIX03 -#define _STRUCT_ARM_VFP_STATE struct __darwin_arm_vfp_state +#define _STRUCT_ARM_VFP_STATE struct __darwin_arm_vfp_state _STRUCT_ARM_VFP_STATE { - __uint32_t __r[64]; - __uint32_t __fpscr; + __uint32_t __r[64]; + __uint32_t __fpscr; }; #else /* !__DARWIN_UNIX03 */ -#define _STRUCT_ARM_VFP_STATE struct arm_vfp_state +#define _STRUCT_ARM_VFP_STATE struct arm_vfp_state _STRUCT_ARM_VFP_STATE { - __uint32_t r[64]; - __uint32_t fpscr; + __uint32_t r[64]; + __uint32_t fpscr; }; #endif /* __DARWIN_UNIX03 */ #if __DARWIN_UNIX03 -#define _STRUCT_ARM_NEON_STATE64 struct __darwin_arm_neon_state64 -#define _STRUCT_ARM_NEON_STATE struct __darwin_arm_neon_state +#define _STRUCT_ARM_NEON_STATE64 struct __darwin_arm_neon_state64 +#define _STRUCT_ARM_NEON_STATE struct __darwin_arm_neon_state #if defined(__arm64__) _STRUCT_ARM_NEON_STATE64 { - __uint128_t __v[32]; - __uint32_t __fpsr; - __uint32_t __fpcr; + __uint128_t __v[32]; + __uint32_t __fpsr; + __uint32_t __fpcr; }; _STRUCT_ARM_NEON_STATE { - __uint128_t __v[16]; - __uint32_t __fpsr; - __uint32_t __fpcr; + __uint128_t __v[16]; + __uint32_t __fpsr; + __uint32_t __fpcr; }; - #elif defined(__arm__) /* * No 128-bit intrinsic for ARM; leave it opaque for now. @@ -225,15 +468,16 @@ _STRUCT_ARM_NEON_STATE #if defined(__arm64__) _STRUCT_ARM_NEON_STATE64 { - __uint128_t q[32]; - uint32_t fpsr; - uint32_t fpcr; + __uint128_t q[32]; + uint32_t fpsr; + uint32_t fpcr; }; + _STRUCT_ARM_NEON_STATE { - __uint128_t q[16]; - uint32_t fpsr; - uint32_t fpcr; + __uint128_t q[16]; + uint32_t fpsr; + uint32_t fpcr; }; #elif defined(__arm__) /* @@ -255,6 +499,13 @@ _STRUCT_ARM_NEON_STATE #endif /* __DARWIN_UNIX03 */ + +#define _STRUCT_ARM_PAGEIN_STATE struct __arm_pagein_state +_STRUCT_ARM_PAGEIN_STATE +{ + int __pagein_error; +}; + /* * Debug State */ @@ -265,19 +516,19 @@ _STRUCT_ARM_NEON_STATE #define _STRUCT_ARM_DEBUG_STATE struct __darwin_arm_debug_state _STRUCT_ARM_DEBUG_STATE { - __uint32_t __bvr[16]; - __uint32_t __bcr[16]; - __uint32_t __wvr[16]; - __uint32_t __wcr[16]; + __uint32_t __bvr[16]; + __uint32_t __bcr[16]; + __uint32_t __wvr[16]; + __uint32_t __wcr[16]; }; #else /* !__DARWIN_UNIX03 */ #define _STRUCT_ARM_DEBUG_STATE struct arm_debug_state _STRUCT_ARM_DEBUG_STATE { - __uint32_t bvr[16]; - __uint32_t bcr[16]; - __uint32_t wvr[16]; - __uint32_t wcr[16]; + __uint32_t bvr[16]; + __uint32_t bcr[16]; + __uint32_t wvr[16]; + __uint32_t wcr[16]; }; #endif /* __DARWIN_UNIX03 */ @@ -286,22 +537,22 @@ _STRUCT_ARM_DEBUG_STATE /* ARM's arm_debug_state is ARM64's arm_legacy_debug_state */ #if __DARWIN_UNIX03 -#define _STRUCT_ARM_LEGACY_DEBUG_STATE struct arm_legacy_debug_state +#define _STRUCT_ARM_LEGACY_DEBUG_STATE struct arm_legacy_debug_state _STRUCT_ARM_LEGACY_DEBUG_STATE { - __uint32_t __bvr[16]; - __uint32_t __bcr[16]; - __uint32_t __wvr[16]; - __uint32_t __wcr[16]; + __uint32_t __bvr[16]; + __uint32_t __bcr[16]; + __uint32_t __wvr[16]; + __uint32_t __wcr[16]; }; #else /* __DARWIN_UNIX03 */ -#define _STRUCT_ARM_LEGACY_DEBUG_STATE struct arm_legacy_debug_state +#define _STRUCT_ARM_LEGACY_DEBUG_STATE struct arm_legacy_debug_state _STRUCT_ARM_LEGACY_DEBUG_STATE { - __uint32_t bvr[16]; - __uint32_t bcr[16]; - __uint32_t wvr[16]; - __uint32_t wcr[16]; + __uint32_t bvr[16]; + __uint32_t bcr[16]; + __uint32_t wvr[16]; + __uint32_t wcr[16]; }; #endif /* __DARWIN_UNIX03 */ #else @@ -309,55 +560,55 @@ _STRUCT_ARM_LEGACY_DEBUG_STATE #endif #if __DARWIN_UNIX03 -#define _STRUCT_ARM_DEBUG_STATE32 struct __darwin_arm_debug_state32 +#define _STRUCT_ARM_DEBUG_STATE32 struct __darwin_arm_debug_state32 _STRUCT_ARM_DEBUG_STATE32 { - __uint32_t __bvr[16]; - __uint32_t __bcr[16]; - __uint32_t __wvr[16]; - __uint32_t __wcr[16]; - __uint64_t __mdscr_el1; /* Bit 0 is SS (Hardware Single Step) */ + __uint32_t __bvr[16]; + __uint32_t __bcr[16]; + __uint32_t __wvr[16]; + __uint32_t __wcr[16]; + __uint64_t __mdscr_el1; /* Bit 0 is SS (Hardware Single Step) */ }; -#define _STRUCT_ARM_DEBUG_STATE64 struct __darwin_arm_debug_state64 +#define _STRUCT_ARM_DEBUG_STATE64 struct __darwin_arm_debug_state64 _STRUCT_ARM_DEBUG_STATE64 { - __uint64_t __bvr[16]; - __uint64_t __bcr[16]; - __uint64_t __wvr[16]; - __uint64_t __wcr[16]; - __uint64_t __mdscr_el1; /* Bit 0 is SS (Hardware Single Step) */ + __uint64_t __bvr[16]; + __uint64_t __bcr[16]; + __uint64_t __wvr[16]; + __uint64_t __wcr[16]; + __uint64_t __mdscr_el1; /* Bit 0 is SS (Hardware Single Step) */ }; #else /* !__DARWIN_UNIX03 */ -#define _STRUCT_ARM_DEBUG_STATE32 struct arm_debug_state32 +#define _STRUCT_ARM_DEBUG_STATE32 struct arm_debug_state32 _STRUCT_ARM_DEBUG_STATE32 { - __uint32_t bvr[16]; - __uint32_t bcr[16]; - __uint32_t wvr[16]; - __uint32_t wcr[16]; - __uint64_t mdscr_el1; /* Bit 0 is SS (Hardware Single Step) */ + __uint32_t bvr[16]; + __uint32_t bcr[16]; + __uint32_t wvr[16]; + __uint32_t wcr[16]; + __uint64_t mdscr_el1; /* Bit 0 is SS (Hardware Single Step) */ }; -#define _STRUCT_ARM_DEBUG_STATE64 struct arm_debug_state64 +#define _STRUCT_ARM_DEBUG_STATE64 struct arm_debug_state64 _STRUCT_ARM_DEBUG_STATE64 { - __uint64_t bvr[16]; - __uint64_t bcr[16]; - __uint64_t wvr[16]; - __uint64_t wcr[16]; - __uint64_t mdscr_el1; /* Bit 0 is SS (Hardware Single Step) */ + __uint64_t bvr[16]; + __uint64_t bcr[16]; + __uint64_t wvr[16]; + __uint64_t wcr[16]; + __uint64_t mdscr_el1; /* Bit 0 is SS (Hardware Single Step) */ }; #endif /* __DARWIN_UNIX03 */ #if __DARWIN_UNIX03 -#define _STRUCT_ARM_CPMU_STATE64 struct __darwin_arm_cpmu_state64 +#define _STRUCT_ARM_CPMU_STATE64 struct __darwin_arm_cpmu_state64 _STRUCT_ARM_CPMU_STATE64 { __uint64_t __ctrs[16]; }; #else /* __DARWIN_UNIX03 */ -#define _STRUCT_ARM_CPMU_STATE64 struct arm_cpmu_state64 +#define _STRUCT_ARM_CPMU_STATE64 struct arm_cpmu_state64 _STRUCT_ARM_CPMU_STATE64 { __uint64_t ctrs[16]; diff --git a/osfmk/mach/arm/exception.h b/osfmk/mach/arm/exception.h index 14478d091..06658bc1e 100644 --- a/osfmk/mach/arm/exception.h +++ b/osfmk/mach/arm/exception.h @@ -45,6 +45,17 @@ #define EXC_ARM_UNDEFINED 1 /* Undefined */ +/* + * EXC_ARITHMETIC + */ + +#define EXC_ARM_FP_UNDEFINED 0 /* Undefined Floating Point Exception */ +#define EXC_ARM_FP_IO 1 /* Invalid Floating Point Operation */ +#define EXC_ARM_FP_DZ 2 /* Floating Point Divide by Zero */ +#define EXC_ARM_FP_OF 3 /* Floating Point Overflow */ +#define EXC_ARM_FP_UF 4 /* Floating Point Underflow */ +#define EXC_ARM_FP_IX 5 /* Inexact Floating Point Result */ +#define EXC_ARM_FP_ID 6 /* Floating Point Denormal Input */ /* * EXC_BAD_ACCESS @@ -54,7 +65,7 @@ #define EXC_ARM_DA_ALIGN 0x101 /* Alignment Fault */ #define EXC_ARM_DA_DEBUG 0x102 /* Debug (watch/break) Fault */ #define EXC_ARM_SP_ALIGN 0x103 /* SP Alignment Fault */ -#define EXC_ARM_SWP 0x104 /* SWP instruction */ +#define EXC_ARM_SWP 0x104 /* SWP instruction */ /* * EXC_BREAKPOINT diff --git a/osfmk/mach/arm/sdt_isa.h b/osfmk/mach/arm/sdt_isa.h index 0751024a2..519e1d935 100644 --- a/osfmk/mach/arm/sdt_isa.h +++ b/osfmk/mach/arm/sdt_isa.h @@ -30,8 +30,6 @@ #ifndef _MACH_ARM_SDT_ISA_H #define _MACH_ARM_SDT_ISA_H -/* #pragma ident "@(#)sdt.h 1.7 05/06/08 SMI" */ - /* * Only define when testing. This makes the calls into actual calls to * test functions. diff --git a/osfmk/mach/arm/thread_state.h b/osfmk/mach/arm/thread_state.h index 3a6369f8f..bedc4090a 100644 --- a/osfmk/mach/arm/thread_state.h +++ b/osfmk/mach/arm/thread_state.h @@ -33,7 +33,7 @@ #define _MACH_ARM_THREAD_STATE_H_ /* Size of maximum exported thread state in words */ -#define ARM_THREAD_STATE_MAX (144) /* Size of biggest state possible */ +#define ARM_THREAD_STATE_MAX (1296) /* Size of biggest state possible */ #if defined (__arm__) || defined(__arm64__) #define THREAD_STATE_MAX ARM_THREAD_STATE_MAX diff --git a/osfmk/mach/arm/thread_status.h b/osfmk/mach/arm/thread_status.h index 27a5441d8..b12c02b5b 100644 --- a/osfmk/mach/arm/thread_status.h +++ b/osfmk/mach/arm/thread_status.h @@ -46,45 +46,52 @@ * Flavors */ -#define ARM_THREAD_STATE 1 +#define ARM_THREAD_STATE 1 #define ARM_UNIFIED_THREAD_STATE ARM_THREAD_STATE -#define ARM_VFP_STATE 2 -#define ARM_EXCEPTION_STATE 3 -#define ARM_DEBUG_STATE 4 /* pre-armv8 */ -#define THREAD_STATE_NONE 5 -#define ARM_THREAD_STATE64 6 -#define ARM_EXCEPTION_STATE64 7 -// ARM_THREAD_STATE_LAST (legacy) 8 -#define ARM_THREAD_STATE32 9 +#define ARM_VFP_STATE 2 +#define ARM_EXCEPTION_STATE 3 +#define ARM_DEBUG_STATE 4 /* pre-armv8 */ +#define THREAD_STATE_NONE 5 +#define ARM_THREAD_STATE64 6 +#define ARM_EXCEPTION_STATE64 7 +// ARM_THREAD_STATE_LAST 8 /* legacy */ +#define ARM_THREAD_STATE32 9 /* API */ -#define ARM_DEBUG_STATE32 14 -#define ARM_DEBUG_STATE64 15 -#define ARM_NEON_STATE 16 -#define ARM_NEON_STATE64 17 -#define ARM_CPMU_STATE64 18 +#define ARM_DEBUG_STATE32 14 +#define ARM_DEBUG_STATE64 15 +#define ARM_NEON_STATE 16 +#define ARM_NEON_STATE64 17 +#define ARM_CPMU_STATE64 18 #ifdef XNU_KERNEL_PRIVATE /* For kernel use */ -#define ARM_SAVED_STATE32 20 -#define ARM_SAVED_STATE64 21 -#define ARM_NEON_SAVED_STATE32 22 -#define ARM_NEON_SAVED_STATE64 23 +#define ARM_SAVED_STATE32 20 +#define ARM_SAVED_STATE64 21 +#define ARM_NEON_SAVED_STATE32 22 +#define ARM_NEON_SAVED_STATE64 23 #endif /* XNU_KERNEL_PRIVATE */ + +#define ARM_STATE_FLAVOR_IS_OTHER_VALID(_flavor_) 0 + +#define ARM_PAGEIN_STATE 27 + #define VALID_THREAD_STATE_FLAVOR(x) \ -((x == ARM_THREAD_STATE) || \ - (x == ARM_VFP_STATE) || \ - (x == ARM_EXCEPTION_STATE) || \ - (x == ARM_DEBUG_STATE) || \ - (x == THREAD_STATE_NONE) || \ - (x == ARM_THREAD_STATE32) || \ - (x == ARM_THREAD_STATE64) || \ - (x == ARM_EXCEPTION_STATE64) || \ - (x == ARM_NEON_STATE) || \ - (x == ARM_NEON_STATE64) || \ - (x == ARM_DEBUG_STATE32) || \ - (x == ARM_DEBUG_STATE64)) + ((x == ARM_THREAD_STATE) || \ + (x == ARM_VFP_STATE) || \ + (x == ARM_EXCEPTION_STATE) || \ + (x == ARM_DEBUG_STATE) || \ + (x == THREAD_STATE_NONE) || \ + (x == ARM_THREAD_STATE32) || \ + (x == ARM_THREAD_STATE64) || \ + (x == ARM_EXCEPTION_STATE64) || \ + (x == ARM_NEON_STATE) || \ + (x == ARM_NEON_STATE64) || \ + (x == ARM_DEBUG_STATE32) || \ + (x == ARM_DEBUG_STATE64) || \ + (x == ARM_PAGEIN_STATE) || \ + (ARM_STATE_FLAVOR_IS_OTHER_VALID(x))) struct arm_state_hdr { uint32_t flavor; @@ -92,32 +99,50 @@ struct arm_state_hdr { }; typedef struct arm_state_hdr arm_state_hdr_t; -typedef _STRUCT_ARM_THREAD_STATE arm_thread_state_t; -typedef _STRUCT_ARM_THREAD_STATE arm_thread_state32_t; -typedef _STRUCT_ARM_THREAD_STATE64 arm_thread_state64_t; +typedef _STRUCT_ARM_THREAD_STATE arm_thread_state_t; +typedef _STRUCT_ARM_THREAD_STATE arm_thread_state32_t; +typedef _STRUCT_ARM_THREAD_STATE64 arm_thread_state64_t; #if !defined(KERNEL) #if __DARWIN_C_LEVEL >= __DARWIN_C_FULL && defined(__arm64__) + +/* Accessor macros for arm_thread_state64_t pointer fields */ + +/* Return pc field of arm_thread_state64_t as a data pointer value */ #define arm_thread_state64_get_pc(ts) \ __darwin_arm_thread_state64_get_pc(ts) +/* Return pc field of arm_thread_state64_t as a function pointer. May return + * NULL if a valid function pointer cannot be constructed, the caller should + * fall back to the arm_thread_state64_get_pc() macro in that case. */ #define arm_thread_state64_get_pc_fptr(ts) \ __darwin_arm_thread_state64_get_pc_fptr(ts) +/* Set pc field of arm_thread_state64_t to a function pointer */ #define arm_thread_state64_set_pc_fptr(ts, fptr) \ __darwin_arm_thread_state64_set_pc_fptr(ts, fptr) +/* Return lr field of arm_thread_state64_t as a data pointer value */ #define arm_thread_state64_get_lr(ts) \ __darwin_arm_thread_state64_get_lr(ts) +/* Return lr field of arm_thread_state64_t as a function pointer. May return + * NULL if a valid function pointer cannot be constructed, the caller should + * fall back to the arm_thread_state64_get_lr() macro in that case. */ #define arm_thread_state64_get_lr_fptr(ts) \ __darwin_arm_thread_state64_get_lr_fptr(ts) +/* Set lr field of arm_thread_state64_t to a function pointer */ #define arm_thread_state64_set_lr_fptr(ts, fptr) \ __darwin_arm_thread_state64_set_lr_fptr(ts, fptr) +/* Return sp field of arm_thread_state64_t as a data pointer value */ #define arm_thread_state64_get_sp(ts) \ __darwin_arm_thread_state64_get_sp(ts) +/* Set sp field of arm_thread_state64_t to a data pointer value */ #define arm_thread_state64_set_sp(ts, ptr) \ __darwin_arm_thread_state64_set_sp(ts, ptr) +/* Return fp field of arm_thread_state64_t as a data pointer value */ #define arm_thread_state64_get_fp(ts) \ __darwin_arm_thread_state64_get_fp(ts) +/* Set fp field of arm_thread_state64_t to a data pointer value */ #define arm_thread_state64_set_fp(ts, ptr) \ __darwin_arm_thread_state64_set_fp(ts, ptr) + #endif /* __DARWIN_C_LEVEL >= __DARWIN_C_FULL && defined(__arm64__) */ #endif /* !defined(KERNEL) */ @@ -128,79 +153,86 @@ struct arm_unified_thread_state { arm_thread_state64_t ts_64; } uts; }; -#define ts_32 uts.ts_32 -#define ts_64 uts.ts_64 +#define ts_32 uts.ts_32 +#define ts_64 uts.ts_64 typedef struct arm_unified_thread_state arm_unified_thread_state_t; #define ARM_THREAD_STATE_COUNT ((mach_msg_type_number_t) \ - (sizeof (arm_thread_state_t)/sizeof(uint32_t))) + (sizeof (arm_thread_state_t)/sizeof(uint32_t))) #define ARM_THREAD_STATE32_COUNT ((mach_msg_type_number_t) \ - (sizeof (arm_thread_state32_t)/sizeof(uint32_t))) + (sizeof (arm_thread_state32_t)/sizeof(uint32_t))) #define ARM_THREAD_STATE64_COUNT ((mach_msg_type_number_t) \ - (sizeof (arm_thread_state64_t)/sizeof(uint32_t))) + (sizeof (arm_thread_state64_t)/sizeof(uint32_t))) #define ARM_UNIFIED_THREAD_STATE_COUNT ((mach_msg_type_number_t) \ - (sizeof (arm_unified_thread_state_t)/sizeof(uint32_t))) + (sizeof (arm_unified_thread_state_t)/sizeof(uint32_t))) + +typedef _STRUCT_ARM_VFP_STATE arm_vfp_state_t; +typedef _STRUCT_ARM_NEON_STATE arm_neon_state_t; +typedef _STRUCT_ARM_NEON_STATE arm_neon_state32_t; +typedef _STRUCT_ARM_NEON_STATE64 arm_neon_state64_t; -typedef _STRUCT_ARM_VFP_STATE arm_vfp_state_t; -typedef _STRUCT_ARM_NEON_STATE arm_neon_state_t; -typedef _STRUCT_ARM_NEON_STATE arm_neon_state32_t; -typedef _STRUCT_ARM_NEON_STATE64 arm_neon_state64_t; -typedef _STRUCT_ARM_EXCEPTION_STATE arm_exception_state_t; -typedef _STRUCT_ARM_EXCEPTION_STATE arm_exception_state32_t; -typedef _STRUCT_ARM_EXCEPTION_STATE64 arm_exception_state64_t; +typedef _STRUCT_ARM_EXCEPTION_STATE arm_exception_state_t; +typedef _STRUCT_ARM_EXCEPTION_STATE arm_exception_state32_t; +typedef _STRUCT_ARM_EXCEPTION_STATE64 arm_exception_state64_t; -typedef _STRUCT_ARM_DEBUG_STATE32 arm_debug_state32_t; -typedef _STRUCT_ARM_DEBUG_STATE64 arm_debug_state64_t; +typedef _STRUCT_ARM_DEBUG_STATE32 arm_debug_state32_t; +typedef _STRUCT_ARM_DEBUG_STATE64 arm_debug_state64_t; + +typedef _STRUCT_ARM_PAGEIN_STATE arm_pagein_state_t; #if defined(XNU_KERNEL_PRIVATE) && defined(__arm64__) /* See below for ARM64 kernel structure definition for arm_debug_state. */ -#else +#else /* defined(XNU_KERNEL_PRIVATE) && defined(__arm64__) */ /* * Otherwise not ARM64 kernel and we must preserve legacy ARM definitions of * arm_debug_state for binary compatability of userland consumers of this file. */ #if defined(__arm__) -typedef _STRUCT_ARM_DEBUG_STATE arm_debug_state_t; +typedef _STRUCT_ARM_DEBUG_STATE arm_debug_state_t; #elif defined(__arm64__) -typedef _STRUCT_ARM_LEGACY_DEBUG_STATE arm_debug_state_t; -#else +typedef _STRUCT_ARM_LEGACY_DEBUG_STATE arm_debug_state_t; +#else /* defined(__arm__) */ #error Undefined architecture -#endif -#endif +#endif /* defined(__arm__) */ +#endif /* defined(XNU_KERNEL_PRIVATE) && defined(__arm64__) */ #define ARM_VFP_STATE_COUNT ((mach_msg_type_number_t) \ - (sizeof (arm_vfp_state_t)/sizeof(uint32_t))) + (sizeof (arm_vfp_state_t)/sizeof(uint32_t))) #define ARM_EXCEPTION_STATE_COUNT ((mach_msg_type_number_t) \ - (sizeof (arm_exception_state_t)/sizeof(uint32_t))) + (sizeof (arm_exception_state_t)/sizeof(uint32_t))) #define ARM_EXCEPTION_STATE64_COUNT ((mach_msg_type_number_t) \ - (sizeof (arm_exception_state64_t)/sizeof(uint32_t))) + (sizeof (arm_exception_state64_t)/sizeof(uint32_t))) #define ARM_DEBUG_STATE_COUNT ((mach_msg_type_number_t) \ - (sizeof (arm_debug_state_t)/sizeof(uint32_t))) + (sizeof (arm_debug_state_t)/sizeof(uint32_t))) #define ARM_DEBUG_STATE32_COUNT ((mach_msg_type_number_t) \ - (sizeof (arm_debug_state32_t)/sizeof(uint32_t))) + (sizeof (arm_debug_state32_t)/sizeof(uint32_t))) + +#define ARM_PAGEIN_STATE_COUNT ((mach_msg_type_number_t) \ + (sizeof (arm_pagein_state_t)/sizeof(uint32_t))) #define ARM_DEBUG_STATE64_COUNT ((mach_msg_type_number_t) \ - (sizeof (arm_debug_state64_t)/sizeof(uint32_t))) + (sizeof (arm_debug_state64_t)/sizeof(uint32_t))) #define ARM_NEON_STATE_COUNT ((mach_msg_type_number_t) \ - (sizeof (arm_neon_state_t)/sizeof(uint32_t))) + (sizeof (arm_neon_state_t)/sizeof(uint32_t))) #define ARM_NEON_STATE64_COUNT ((mach_msg_type_number_t) \ - (sizeof (arm_neon_state64_t)/sizeof(uint32_t))) + (sizeof (arm_neon_state64_t)/sizeof(uint32_t))) + +#define MACHINE_THREAD_STATE ARM_THREAD_STATE +#define MACHINE_THREAD_STATE_COUNT ARM_UNIFIED_THREAD_STATE_COUNT -#define MACHINE_THREAD_STATE ARM_THREAD_STATE -#define MACHINE_THREAD_STATE_COUNT ARM_UNIFIED_THREAD_STATE_COUNT /* * Largest state on this machine: */ -#define THREAD_MACHINE_STATE_MAX THREAD_STATE_MAX +#define THREAD_MACHINE_STATE_MAX THREAD_STATE_MAX #ifdef XNU_KERNEL_PRIVATE @@ -243,17 +275,17 @@ const_thread_state64(const arm_unified_thread_state_t *its) #if defined(__arm__) #include -#define ARM_SAVED_STATE THREAD_STATE_NONE + 1 +#define ARM_SAVED_STATE (THREAD_STATE_NONE + 1) struct arm_saved_state { - uint32_t r[13]; /* General purpose register r0-r12 */ - uint32_t sp; /* Stack pointer r13 */ - uint32_t lr; /* Link register r14 */ - uint32_t pc; /* Program counter r15 */ - uint32_t cpsr; /* Current program status register */ - uint32_t fsr; /* Fault status */ - uint32_t far; /* Virtual Fault Address */ - uint32_t exception;/* exception number */ + uint32_t r[13]; /* General purpose register r0-r12 */ + uint32_t sp; /* Stack pointer r13 */ + uint32_t lr; /* Link register r14 */ + uint32_t pc; /* Program counter r15 */ + uint32_t cpsr; /* Current program status register */ + uint32_t fsr; /* Fault status */ + uint32_t far; /* Virtual Fault Address */ + uint32_t exception; /* exception number */ }; typedef struct arm_saved_state arm_saved_state_t; @@ -262,6 +294,12 @@ typedef struct arm_saved_state arm_saved_state_t; */ typedef struct arm_saved_state arm_saved_state32_t; +static inline void +copy_signed_thread_state(arm_saved_state_t *dst, const arm_saved_state_t *src) +{ + *dst = *src; +} + static inline arm_saved_state32_t* saved_state32(arm_saved_state_t *iss) { @@ -276,13 +314,13 @@ is_saved_state32(const arm_saved_state_t *iss __unused) struct arm_saved_state_tagged { - uint32_t tag; - struct arm_saved_state state; + uint32_t tag; + struct arm_saved_state state; }; typedef struct arm_saved_state_tagged arm_saved_state_tagged_t; #define ARM_SAVED_STATE32_COUNT ((mach_msg_type_number_t) \ - (sizeof (arm_saved_state_t)/sizeof(unsigned int))) + (sizeof (arm_saved_state_t)/sizeof(unsigned int))) static inline register_t @@ -291,6 +329,12 @@ get_saved_state_pc(const arm_saved_state_t *iss) return iss->pc; } +static inline void +add_saved_state_pc(arm_saved_state_t *iss, int diff) +{ + iss->pc += diff; +} + static inline void set_saved_state_pc(arm_saved_state_t *iss, register_t pc) { @@ -339,6 +383,13 @@ get_saved_state_cpsr(const arm_saved_state_t *iss) return iss->cpsr; } +static inline void +mask_saved_state_cpsr(arm_saved_state_t *iss, uint32_t set_bits, uint32_t clear_bits) +{ + iss->cpsr |= set_bits; + iss->cpsr &= clear_bits; +} + static inline void set_saved_state_cpsr(arm_saved_state_t *iss, register_t cpsr) { @@ -368,46 +419,49 @@ set_saved_state_reg(arm_saved_state_t *iss, unsigned regno, register_t val) */ struct arm_saved_state32 { - uint32_t r[13]; /* General purpose register r0-r12 */ - uint32_t sp; /* Stack pointer r13 */ - uint32_t lr; /* Link register r14 */ - uint32_t pc; /* Program counter r15 */ - uint32_t cpsr; /* Current program status register */ - uint32_t far; /* Virtual fault address */ - uint32_t esr; /* Exception syndrome register */ - uint32_t exception; /* Exception number */ + uint32_t r[13]; /* General purpose register r0-r12 */ + uint32_t sp; /* Stack pointer r13 */ + uint32_t lr; /* Link register r14 */ + uint32_t pc; /* Program counter r15 */ + uint32_t cpsr; /* Current program status register */ + uint32_t far; /* Virtual fault address */ + uint32_t esr; /* Exception syndrome register */ + uint32_t exception; /* Exception number */ }; typedef struct arm_saved_state32 arm_saved_state32_t; struct arm_saved_state32_tagged { - uint32_t tag; - struct arm_saved_state32 state; + uint32_t tag; + struct arm_saved_state32 state; }; typedef struct arm_saved_state32_tagged arm_saved_state32_tagged_t; #define ARM_SAVED_STATE32_COUNT ((mach_msg_type_number_t) \ - (sizeof (arm_saved_state32_t)/sizeof(unsigned int))) + (sizeof(arm_saved_state32_t)/sizeof(unsigned int))) struct arm_saved_state64 { - uint64_t x[29]; /* General purpose registers x0-x28 */ - uint64_t fp; /* Frame pointer x29 */ - uint64_t lr; /* Link register x30 */ - uint64_t sp; /* Stack pointer x31 */ - uint64_t pc; /* Program counter */ - uint32_t cpsr; /* Current program status register */ - uint32_t reserved; /* Reserved padding */ - uint64_t far; /* Virtual fault address */ - uint32_t esr; /* Exception syndrome register */ - uint32_t exception; /* Exception number */ + uint64_t x[29]; /* General purpose registers x0-x28 */ + uint64_t fp; /* Frame pointer x29 */ + uint64_t lr; /* Link register x30 */ + uint64_t sp; /* Stack pointer x31 */ + uint64_t pc; /* Program counter */ + uint32_t cpsr; /* Current program status register */ + uint32_t reserved; /* Reserved padding */ + uint64_t far; /* Virtual fault address */ + uint32_t esr; /* Exception syndrome register */ + uint32_t exception; /* Exception number */ +#if defined(HAS_APPLE_PAC) + uint64_t jophash; +#endif /* defined(HAS_APPLE_PAC) */ }; typedef struct arm_saved_state64 arm_saved_state64_t; #define ARM_SAVED_STATE64_COUNT ((mach_msg_type_number_t) \ - (sizeof (arm_saved_state64_t)/sizeof(unsigned int))) + (sizeof(arm_saved_state64_t)/sizeof(unsigned int))) struct arm_saved_state64_tagged { - uint32_t tag; - struct arm_saved_state64 state; + uint32_t tag; + struct arm_saved_state64 state; }; typedef struct arm_saved_state64_tagged arm_saved_state64_tagged_t; @@ -418,11 +472,85 @@ struct arm_saved_state { struct arm_saved_state64 ss_64; } uss; } __attribute__((aligned(16))); -#define ss_32 uss.ss_32 -#define ss_64 uss.ss_64 +#define ss_32 uss.ss_32 +#define ss_64 uss.ss_64 typedef struct arm_saved_state arm_saved_state_t; +#if defined(XNU_KERNEL_PRIVATE) +#if defined(HAS_APPLE_PAC) +/* + * Methods used to sign and check thread state to detect corruptions of saved + * thread state across exceptions and context switches. + */ +extern void ml_sign_thread_state(arm_saved_state_t *, uint64_t, uint32_t, uint64_t, uint64_t, uint64_t); + +extern void ml_check_signed_state(const arm_saved_state_t *, uint64_t, uint32_t, uint64_t, uint64_t, uint64_t); + +/* XXX: including stddef.f here breaks ctfmerge on some builds, so use __builtin_offsetof() instead of offsetof() */ +#define ss64_offsetof(x) __builtin_offsetof(struct arm_saved_state, ss_64.x) + +/** + * Verify the signed thread state in _iss, execute the assembly instructions + * _instr, and re-sign the modified thread state. Varargs specify additional + * inputs. + * + * _instr may read or modify the thread state in the following registers: + * + * x0: _iss + * x1: authed _iss->ss_64.pc + * w2: authed _iss->ss_64.cpsr + * x3: authed _iss->ss_64.lr + * x4: authed _iss->ss_64.x16 + * x5: authed _iss->ss_64.x17 + * x6: scratch register + * x7: scratch register + */ +#define MANIPULATE_SIGNED_THREAD_STATE(_iss, _instr, ...) \ + asm volatile ( \ + "mov x8, lr" "\n" \ + "mov x0, %[iss]" "\n" \ + "ldp x4, x5, [x0, %[SS64_X16]]" "\n" \ + "ldr x6, [x0, %[SS64_PC]]" "\n" \ + "ldr w7, [x0, %[SS64_CPSR]]" "\n" \ + "ldr x3, [x0, %[SS64_LR]]" "\n" \ + "mov x1, x6" "\n" \ + "mov w2, w7" "\n" \ + "bl _ml_check_signed_state" "\n" \ + "mov x1, x6" "\n" \ + "mov w2, w7" "\n" \ + _instr "\n" \ + "bl _ml_sign_thread_state" "\n" \ + "mov lr, x8" "\n" \ + : \ + : [iss] "r"(_iss), \ + [SS64_X16] "i"(ss64_offsetof(x[16])), \ + [SS64_PC] "i"(ss64_offsetof(pc)), \ + [SS64_CPSR] "i"(ss64_offsetof(cpsr)), \ + [SS64_LR] "i"(ss64_offsetof(lr)),##__VA_ARGS__ \ + : "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8" \ + ) + +static inline void +check_and_sign_copied_thread_state(arm_saved_state_t *dst, const arm_saved_state_t *src) +{ + MANIPULATE_SIGNED_THREAD_STATE(src, + "mov x0, %[dst]", + [dst] "r"(dst) + ); +} +#endif /* defined(HAS_APPLE_PAC) */ + +static inline void +copy_signed_thread_state(arm_saved_state_t *dst, const arm_saved_state_t *src) +{ + *dst = *src; +#if defined(HAS_APPLE_PAC) + check_and_sign_copied_thread_state(dst, src); +#endif +} + +#endif /* defined(XNU_KERNEL_PRIVATE) */ static inline boolean_t is_saved_state32(const arm_saved_state_t *iss) @@ -466,13 +594,41 @@ get_saved_state_pc(const arm_saved_state_t *iss) return is_saved_state32(iss) ? const_saved_state32(iss)->pc : const_saved_state64(iss)->pc; } +static inline void +add_saved_state_pc(arm_saved_state_t *iss, int diff) +{ + if (is_saved_state32(iss)) { + uint64_t pc = saved_state32(iss)->pc + diff; + saved_state32(iss)->pc = CAST_ASSERT_SAFE(uint32_t, pc); + } else { +#if defined(XNU_KERNEL_PRIVATE) && defined(HAS_APPLE_PAC) + MANIPULATE_SIGNED_THREAD_STATE(iss, + "mov w6, %w[diff] \n" + "add x1, x1, w6, sxtw \n" + "str x1, [x0, %[SS64_PC]] \n", + [diff] "r"(diff) + ); +#else + saved_state64(iss)->pc += diff; +#endif /* defined(XNU_KERNEL_PRIVATE) && defined(HAS_APPLE_PAC) */ + } +} + static inline void set_saved_state_pc(arm_saved_state_t *iss, register_t pc) { if (is_saved_state32(iss)) { saved_state32(iss)->pc = CAST_ASSERT_SAFE(uint32_t, pc); } else { +#if defined(XNU_KERNEL_PRIVATE) && defined(HAS_APPLE_PAC) + MANIPULATE_SIGNED_THREAD_STATE(iss, + "mov x1, %[pc] \n" + "str x1, [x0, %[SS64_PC]] \n", + [pc] "r"(pc) + ); +#else saved_state64(iss)->pc = pc; +#endif /* defined(XNU_KERNEL_PRIVATE) && defined(HAS_APPLE_PAC) */ } } @@ -504,7 +660,15 @@ set_saved_state_lr(arm_saved_state_t *iss, register_t lr) if (is_saved_state32(iss)) { saved_state32(iss)->lr = CAST_ASSERT_SAFE(uint32_t, lr); } else { +#if defined(XNU_KERNEL_PRIVATE) && defined(HAS_APPLE_PAC) + MANIPULATE_SIGNED_THREAD_STATE(iss, + "mov x3, %[lr] \n" + "str x3, [x0, %[SS64_LR]] \n", + [lr] "r"(lr) + ); +#else saved_state64(iss)->lr = lr; +#endif /* defined(XNU_KERNEL_PRIVATE) && defined(HAS_APPLE_PAC) */ } } @@ -550,6 +714,25 @@ set_saved_state_reg(arm_saved_state_t *iss, unsigned reg, register_t value) if (is_saved_state32(iss)) { saved_state32(iss)->r[reg] = CAST_ASSERT_SAFE(uint32_t, value); } else { +#if defined(XNU_KERNEL_PRIVATE) && defined(HAS_APPLE_PAC) + /* x16 and x17 are part of the jophash */ + if (reg == 16) { + MANIPULATE_SIGNED_THREAD_STATE(iss, + "mov x4, %[value] \n" + "str x4, [x0, %[SS64_X16]] \n", + [value] "r"(value) + ); + return; + } else if (reg == 17) { + MANIPULATE_SIGNED_THREAD_STATE(iss, + "mov x5, %[value] \n" + "str x5, [x0, %[SS64_X17]] \n", + [value] "r"(value), + [SS64_X17] "i"(ss64_offsetof(x[17])) + ); + return; + } +#endif saved_state64(iss)->x[reg] = value; } } @@ -560,13 +743,45 @@ get_saved_state_cpsr(const arm_saved_state_t *iss) return is_saved_state32(iss) ? const_saved_state32(iss)->cpsr : const_saved_state64(iss)->cpsr; } +static inline void +mask_saved_state_cpsr(arm_saved_state_t *iss, uint32_t set_bits, uint32_t clear_bits) +{ + if (is_saved_state32(iss)) { + saved_state32(iss)->cpsr |= set_bits; + saved_state32(iss)->cpsr &= ~clear_bits; + } else { +#if defined(XNU_KERNEL_PRIVATE) && defined(HAS_APPLE_PAC) + MANIPULATE_SIGNED_THREAD_STATE(iss, + "mov w6, %w[set_bits] \n" + "orr w2, w2, w6, lsl #0 \n" + "mov w6, %w[clear_bits] \n" + "bic w2, w2, w6, lsl #0 \n" + "str w2, [x0, %[SS64_CPSR]] \n", + [set_bits] "r"(set_bits), + [clear_bits] "r"(clear_bits) + ); +#else + saved_state64(iss)->cpsr |= set_bits; + saved_state64(iss)->cpsr &= ~clear_bits; +#endif /* defined(XNU_KERNEL_PRIVATE) && defined(HAS_APPLE_PAC) */ + } +} + static inline void set_saved_state_cpsr(arm_saved_state_t *iss, uint32_t cpsr) { if (is_saved_state32(iss)) { saved_state32(iss)->cpsr = cpsr; } else { +#if defined(XNU_KERNEL_PRIVATE) && defined(HAS_APPLE_PAC) + MANIPULATE_SIGNED_THREAD_STATE(iss, + "mov w2, %w[cpsr] \n" + "str w2, [x0, %[SS64_CPSR]] \n", + [cpsr] "r"(cpsr) + ); +#else saved_state64(iss)->cpsr = cpsr; +#endif /* defined(XNU_KERNEL_PRIVATE) && defined(HAS_APPLE_PAC) */ } } @@ -626,10 +841,10 @@ get_saved_state_svc_number(const arm_saved_state_t *iss) return is_saved_state32(iss) ? (int)const_saved_state32(iss)->r[12] : (int)const_saved_state64(iss)->x[ARM64_SYSCALL_CODE_REG_NUM]; /* Only first word counts here */ } -typedef _STRUCT_ARM_LEGACY_DEBUG_STATE arm_legacy_debug_state_t; +typedef _STRUCT_ARM_LEGACY_DEBUG_STATE arm_legacy_debug_state_t; struct arm_debug_aggregate_state { - arm_state_hdr_t dsh; + arm_state_hdr_t dsh; union { arm_debug_state32_t ds32; arm_debug_state64_t ds64; @@ -639,7 +854,7 @@ struct arm_debug_aggregate_state { typedef struct arm_debug_aggregate_state arm_debug_state_t; #define ARM_LEGACY_DEBUG_STATE_COUNT ((mach_msg_type_number_t) \ - (sizeof (arm_legacy_debug_state_t)/sizeof(uint32_t))) + (sizeof (arm_legacy_debug_state_t)/sizeof(uint32_t))) /* * NEON context @@ -650,31 +865,31 @@ typedef uint32_t uint32x4_t __attribute__((ext_vector_type(4))); struct arm_neon_saved_state32 { union { - uint128_t q[16]; - uint64_t d[32]; - uint32_t s[32]; + uint128_t q[16]; + uint64_t d[32]; + uint32_t s[32]; } v; - uint32_t fpsr; - uint32_t fpcr; + uint32_t fpsr; + uint32_t fpcr; }; typedef struct arm_neon_saved_state32 arm_neon_saved_state32_t; #define ARM_NEON_SAVED_STATE32_COUNT ((mach_msg_type_number_t) \ - (sizeof (arm_neon_saved_state32_t)/sizeof(unsigned int))) + (sizeof (arm_neon_saved_state32_t)/sizeof(unsigned int))) struct arm_neon_saved_state64 { union { - uint128_t q[32]; - uint64x2_t d[32]; - uint32x4_t s[32]; + uint128_t q[32]; + uint64x2_t d[32]; + uint32x4_t s[32]; } v; - uint32_t fpsr; - uint32_t fpcr; + uint32_t fpsr; + uint32_t fpcr; }; typedef struct arm_neon_saved_state64 arm_neon_saved_state64_t; #define ARM_NEON_SAVED_STATE64_COUNT ((mach_msg_type_number_t) \ - (sizeof (arm_neon_saved_state64_t)/sizeof(unsigned int))) + (sizeof (arm_neon_saved_state64_t)/sizeof(unsigned int))) struct arm_neon_saved_state { arm_state_hdr_t nsh; @@ -684,8 +899,8 @@ struct arm_neon_saved_state { } uns; }; typedef struct arm_neon_saved_state arm_neon_saved_state_t; -#define ns_32 uns.ns_32 -#define ns_64 uns.ns_64 +#define ns_32 uns.ns_32 +#define ns_64 uns.ns_64 static inline boolean_t is_neon_saved_state32(const arm_neon_saved_state_t *state) @@ -725,13 +940,13 @@ typedef struct arm_context arm_context_t; extern void saved_state_to_thread_state64(const arm_saved_state_t*, arm_thread_state64_t*); extern void thread_state64_to_saved_state(const arm_thread_state64_t*, arm_saved_state_t*); -#else +#else /* defined(__arm__) */ #error Unknown arch -#endif +#endif /* defined(__arm__) */ extern void saved_state_to_thread_state32(const arm_saved_state_t*, arm_thread_state32_t*); extern void thread_state32_to_saved_state(const arm_thread_state32_t*, arm_saved_state_t*); #endif /* XNU_KERNEL_PRIVATE */ -#endif /* _ARM_THREAD_STATUS_H_ */ +#endif /* _ARM_THREAD_STATUS_H_ */ diff --git a/osfmk/mach/arm/vm_param.h b/osfmk/mach/arm/vm_param.h index 8f43cebb4..12939b6ee 100644 --- a/osfmk/mach/arm/vm_param.h +++ b/osfmk/mach/arm/vm_param.h @@ -153,8 +153,11 @@ extern unsigned PAGE_SHIFT_CONST; #define VM_MAX_ADDRESS ((vm_address_t) 0x0000000080000000ULL) /* system-wide values */ -#define MACH_VM_MIN_ADDRESS ((mach_vm_offset_t) 0x0ULL) -#define MACH_VM_MAX_ADDRESS ((mach_vm_offset_t) 0x0000000FC0000000ULL) +#define MACH_VM_MIN_ADDRESS_RAW 0x0ULL +#define MACH_VM_MAX_ADDRESS_RAW 0x0000000FC0000000ULL +#define MACH_VM_MIN_ADDRESS ((mach_vm_offset_t) MACH_VM_MIN_ADDRESS_RAW) +#define MACH_VM_MAX_ADDRESS ((mach_vm_offset_t) MACH_VM_MAX_ADDRESS_RAW) + #else #error architecture not supported @@ -177,7 +180,7 @@ extern unsigned PAGE_SHIFT_CONST; */ #define VM_KERNEL_POINTER_SIGNIFICANT_BITS 37 #define VM_MIN_KERNEL_ADDRESS ((vm_address_t) 0xffffffe000000000ULL) -#define VM_MAX_KERNEL_ADDRESS ((vm_address_t) 0xfffffff3ffffffffULL) +#define VM_MAX_KERNEL_ADDRESS ((vm_address_t) 0xfffffffbffffffffULL) #else #error architecture not supported #endif @@ -185,7 +188,12 @@ extern unsigned PAGE_SHIFT_CONST; #define VM_MIN_KERNEL_AND_KEXT_ADDRESS \ VM_MIN_KERNEL_ADDRESS +#if __has_feature(ptrauth_calls) +#include +#define VM_KERNEL_STRIP_PTR(_v) (ptrauth_strip((void *)(uintptr_t)(_v), ptrauth_key_asia)) +#else /* !ptrauth_calls */ #define VM_KERNEL_STRIP_PTR(_v) (_v) +#endif /* ptrauth_calls */ #define VM_KERNEL_ADDRESS(_va) \ ((((vm_address_t)VM_KERNEL_STRIP_PTR(_va)) >= VM_MIN_KERNEL_ADDRESS) && \ @@ -198,11 +206,18 @@ extern unsigned PAGE_SHIFT_CONST; extern unsigned long gVirtBase, gPhysBase, gPhysSize; #define isphysmem(a) (((vm_address_t)(a) - gPhysBase) < gPhysSize) +#define physmap_enclosed(a) isphysmem(a) #if KASAN /* Increase the stack sizes to account for the redzones that get added to every * stack object. */ # define KERNEL_STACK_SIZE (4*4*4096) +#elif DEBUG +/** + * Increase the stack size to account for less efficient use of stack space when + * compiling with -O0. + */ +# define KERNEL_STACK_SIZE (2*4*4096) #else # define KERNEL_STACK_SIZE (4*4096) #endif diff --git a/osfmk/mach/coalition.h b/osfmk/mach/coalition.h index 2974440ff..82be1cf5a 100644 --- a/osfmk/mach/coalition.h +++ b/osfmk/mach/coalition.h @@ -138,11 +138,17 @@ struct coalition_resource_usage { uint64_t logical_deferred_writes; uint64_t logical_invalidated_writes; uint64_t logical_metadata_writes; + uint64_t logical_immediate_writes_to_external; + uint64_t logical_deferred_writes_to_external; + uint64_t logical_invalidated_writes_to_external; + uint64_t logical_metadata_writes_to_external; uint64_t energy_billed_to_me; uint64_t energy_billed_to_others; uint64_t cpu_ptime; uint64_t cpu_time_eqos_len; /* Stores the number of thread QoS types */ uint64_t cpu_time_eqos[COALITION_NUM_THREAD_QOS_TYPES]; + uint64_t cpu_instructions; + uint64_t cpu_cycles; }; #ifdef PRIVATE @@ -158,6 +164,9 @@ struct coalition_resource_usage { #define COALITION_INFO_SET_NAME 2 #define COALITION_INFO_SET_EFFICIENCY 3 +/* coalition_ledger_set operations */ +#define COALITION_LEDGER_SET_LOGICAL_WRITES_LIMIT 1 + #define COALITION_EFFICIENCY_VALID_FLAGS (COALITION_FLAGS_EFFICIENT) /* structure returned from libproc coalition listing interface */ diff --git a/osfmk/mach/exception_types.h b/osfmk/mach/exception_types.h index 83c8c90e7..31ee691b7 100644 --- a/osfmk/mach/exception_types.h +++ b/osfmk/mach/exception_types.h @@ -123,9 +123,13 @@ * the thread identity and state. */ +#define MACH_EXCEPTION_ERRORS 0x40000000 +/* include additional exception specific errors, not used yet. */ + #define MACH_EXCEPTION_CODES 0x80000000 /* Send 64-bit code and subcode in the exception header */ +#define MACH_EXCEPTION_MASK (MACH_EXCEPTION_CODES | MACH_EXCEPTION_ERRORS) /* * Masks for exception definitions, above * bit zero is unused, therefore 1 word = 31 exception types diff --git a/osfmk/mach/fairplayd_notification.defs b/osfmk/mach/fairplayd_notification.defs new file mode 100644 index 000000000..31250ed7f --- /dev/null +++ b/osfmk/mach/fairplayd_notification.defs @@ -0,0 +1,51 @@ +/* + * Copyright (c) 2019 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +/* + * Interface definition for the fairplay upcall mechanism. + */ + +subsystem +#if KERNEL_USER + KernelUser +#endif /* KERNEL_USER */ + fairplay 41471; + +#include +#include + +/* + * Notification from the kernel requesting a new arcade service + * port from fairplayd. Instead of replying with a port, the + * new port passed to the arcade_provider port sent here via + * arcade_set_upcall_port(). + */ +simpleroutine fairplayd_arcade_request( + fairplayd_port : mach_port_t; + arcade_reg_port : mach_port_t); + diff --git a/osfmk/mach/host_info.h b/osfmk/mach/host_info.h index 2bca65e0d..12d8b3e45 100644 --- a/osfmk/mach/host_info.h +++ b/osfmk/mach/host_info.h @@ -125,7 +125,7 @@ typedef struct host_can_has_debugger_info *host_can_has_debugger_info_t; #define HOST_CAN_HAS_DEBUGGER_COUNT ((mach_msg_type_number_t) \ (sizeof(host_can_has_debugger_info_data_t)/sizeof(integer_t))) -#pragma pack(4) +#pragma pack(push, 4) struct host_basic_info { integer_t max_cpus; /* max number of CPUs possible */ @@ -141,7 +141,7 @@ struct host_basic_info { uint64_t max_mem; /* actual size of physical memory */ }; -#pragma pack() +#pragma pack(pop) typedef struct host_basic_info host_basic_info_data_t; typedef struct host_basic_info *host_basic_info_t; diff --git a/osfmk/mach/host_special_ports.h b/osfmk/mach/host_special_ports.h index f4632ed13..d09b44b6b 100644 --- a/osfmk/mach/host_special_ports.h +++ b/osfmk/mach/host_special_ports.h @@ -106,8 +106,10 @@ #define HOST_RESOURCE_NOTIFY_PORT (20 + HOST_MAX_SPECIAL_KERNEL_PORT) #define HOST_CLOSURED_PORT (21 + HOST_MAX_SPECIAL_KERNEL_PORT) #define HOST_SYSPOLICYD_PORT (22 + HOST_MAX_SPECIAL_KERNEL_PORT) +#define HOST_FILECOORDINATIOND_PORT (23 + HOST_MAX_SPECIAL_KERNEL_PORT) +#define HOST_FAIRPLAYD_PORT (24 + HOST_MAX_SPECIAL_KERNEL_PORT) -#define HOST_MAX_SPECIAL_PORT HOST_SYSPOLICYD_PORT +#define HOST_MAX_SPECIAL_PORT HOST_FAIRPLAYD_PORT /* MAX = last since rdar://35861175 */ /* obsolete name */ @@ -260,6 +262,18 @@ #define host_set_syspolicyd_port(host, port) \ (host_set_special_port((host), HOST_SYSPOLICYD_PORT, (port))) +#define host_get_filecoordinationd_port(host, port) \ + (host_get_special_port((host), \ + HOST_LOCAL_NODE, HOST_FILECOORDINATIOND_PORT, (port))) +#define host_set_filecoordinationd_port(host, port) \ + (host_set_special_port((host), HOST_FILECOORDINATIOND_PORT, (port))) + +#define host_get_fairplayd_port(host, port) \ + (host_get_special_port((host), \ + HOST_LOCAL_NODE, HOST_FAIRPLAYD_PORT, (port))) +#define host_set_fairplayd_port(host, port) \ + (host_set_special_port((host), HOST_FAIRPLAYD_PORT, (port))) + /* HOST_RESOURCE_NOTIFY_PORT doesn't #defines these conveniences. * All lookups go through send_resource_violation() */ diff --git a/osfmk/mach/i386/_structs.h b/osfmk/mach/i386/_structs.h index 3f63a0058..b998ba056 100644 --- a/osfmk/mach/i386/_structs.h +++ b/osfmk/mach/i386/_structs.h @@ -624,6 +624,12 @@ _STRUCT_X86_DEBUG_STATE32 }; #endif /* !__DARWIN_UNIX03 */ +#define _STRUCT_X86_PAGEIN_STATE struct __x86_pagein_state +_STRUCT_X86_PAGEIN_STATE +{ + int __pagein_error; +}; + /* * 64 bit versions of the above */ @@ -690,7 +696,7 @@ _STRUCT_X86_THREAD_STATE64 #define _STRUCT_X86_THREAD_FULL_STATE64 struct __darwin_x86_thread_full_state64 _STRUCT_X86_THREAD_FULL_STATE64 { - _STRUCT_X86_THREAD_STATE64 ss64; + _STRUCT_X86_THREAD_STATE64 __ss64; __uint64_t __ds; __uint64_t __es; __uint64_t __ss; diff --git a/osfmk/mach/i386/thread_status.h b/osfmk/mach/i386/thread_status.h index 324ac645b..2744c0be6 100644 --- a/osfmk/mach/i386/thread_status.h +++ b/osfmk/mach/i386/thread_status.h @@ -119,11 +119,10 @@ #define x86_AVX_STATE32 16 #define x86_AVX_STATE64 (x86_AVX_STATE32 + 1) #define x86_AVX_STATE (x86_AVX_STATE32 + 2) -#if !defined(RC_HIDE_XNU_J137) #define x86_AVX512_STATE32 19 #define x86_AVX512_STATE64 (x86_AVX512_STATE32 + 1) #define x86_AVX512_STATE (x86_AVX512_STATE32 + 2) -#endif /* not RC_HIDE_XNU_J137 */ +#define x86_PAGEIN_STATE 22 #define x86_THREAD_FULL_STATE64 23 /* @@ -138,47 +137,28 @@ * platform. The macro must be manually updated to include all of the valid * exception flavors as defined above. */ -#if !defined(RC_HIDE_XNU_J137) -#define VALID_THREAD_STATE_FLAVOR(x) \ - ((x == x86_THREAD_STATE32) || \ - (x == x86_FLOAT_STATE32) || \ - (x == x86_EXCEPTION_STATE32) || \ - (x == x86_DEBUG_STATE32) || \ - (x == x86_THREAD_STATE64) || \ - (x == x86_THREAD_FULL_STATE64) || \ - (x == x86_FLOAT_STATE64) || \ - (x == x86_EXCEPTION_STATE64) || \ - (x == x86_DEBUG_STATE64) || \ - (x == x86_THREAD_STATE) || \ - (x == x86_FLOAT_STATE) || \ - (x == x86_EXCEPTION_STATE) || \ - (x == x86_DEBUG_STATE) || \ - (x == x86_AVX_STATE32) || \ - (x == x86_AVX_STATE64) || \ - (x == x86_AVX_STATE) || \ - (x == x86_AVX512_STATE32) || \ - (x == x86_AVX512_STATE64) || \ - (x == x86_AVX512_STATE) || \ +#define VALID_THREAD_STATE_FLAVOR(x) \ + ((x == x86_THREAD_STATE32) || \ + (x == x86_FLOAT_STATE32) || \ + (x == x86_EXCEPTION_STATE32) || \ + (x == x86_DEBUG_STATE32) || \ + (x == x86_THREAD_STATE64) || \ + (x == x86_THREAD_FULL_STATE64) || \ + (x == x86_FLOAT_STATE64) || \ + (x == x86_EXCEPTION_STATE64) || \ + (x == x86_DEBUG_STATE64) || \ + (x == x86_THREAD_STATE) || \ + (x == x86_FLOAT_STATE) || \ + (x == x86_EXCEPTION_STATE) || \ + (x == x86_DEBUG_STATE) || \ + (x == x86_AVX_STATE32) || \ + (x == x86_AVX_STATE64) || \ + (x == x86_AVX_STATE) || \ + (x == x86_AVX512_STATE32) || \ + (x == x86_AVX512_STATE64) || \ + (x == x86_AVX512_STATE) || \ + (x == x86_PAGEIN_STATE) || \ (x == THREAD_STATE_NONE)) -#else -#define VALID_THREAD_STATE_FLAVOR(x) \ - ((x == x86_THREAD_STATE32) || \ - (x == x86_FLOAT_STATE32) || \ - (x == x86_EXCEPTION_STATE32) || \ - (x == x86_DEBUG_STATE32) || \ - (x == x86_THREAD_STATE64) || \ - (x == x86_FLOAT_STATE64) || \ - (x == x86_EXCEPTION_STATE64) || \ - (x == x86_DEBUG_STATE64) || \ - (x == x86_THREAD_STATE) || \ - (x == x86_FLOAT_STATE) || \ - (x == x86_EXCEPTION_STATE) || \ - (x == x86_DEBUG_STATE) || \ - (x == x86_AVX_STATE32) || \ - (x == x86_AVX_STATE64) || \ - (x == x86_AVX_STATE) || \ - (x == THREAD_STATE_NONE)) -#endif /* not RC_HIDE_XNU_J137 */ struct x86_state_hdr { uint32_t flavor; @@ -221,11 +201,9 @@ typedef _STRUCT_X86_AVX_STATE32 x86_avx_state32_t; #define x86_AVX_STATE32_COUNT ((mach_msg_type_number_t) \ (sizeof(x86_avx_state32_t)/sizeof(unsigned int))) -#if !defined(RC_HIDE_XNU_J137) typedef _STRUCT_X86_AVX512_STATE32 x86_avx512_state32_t; #define x86_AVX512_STATE32_COUNT ((mach_msg_type_number_t) \ (sizeof(x86_avx512_state32_t)/sizeof(unsigned int))) -#endif /* not RC_HIDE_XNU_J137 */ /* * to be deprecated in the future @@ -262,11 +240,9 @@ typedef _STRUCT_X86_AVX_STATE64 x86_avx_state64_t; #define x86_AVX_STATE64_COUNT ((mach_msg_type_number_t) \ (sizeof(x86_avx_state64_t)/sizeof(unsigned int))) -#if !defined(RC_HIDE_XNU_J137) typedef _STRUCT_X86_AVX512_STATE64 x86_avx512_state64_t; #define x86_AVX512_STATE64_COUNT ((mach_msg_type_number_t) \ (sizeof(x86_avx512_state64_t)/sizeof(unsigned int))) -#endif /* not RC_HIDE_XNU_J137 */ typedef _STRUCT_X86_EXCEPTION_STATE64 x86_exception_state64_t; #define x86_EXCEPTION_STATE64_COUNT ((mach_msg_type_number_t) \ @@ -280,6 +256,12 @@ typedef _STRUCT_X86_DEBUG_STATE64 x86_debug_state64_t; #define X86_DEBUG_STATE64_COUNT x86_DEBUG_STATE64_COUNT +typedef _STRUCT_X86_PAGEIN_STATE x86_pagein_state_t; +#define x86_PAGEIN_STATE_COUNT \ + ((mach_msg_type_number_t)(sizeof(x86_pagein_state_t) / sizeof(int))) + +#define X86_PAGEIN_STATE_COUNT x86_PAGEIN_STATE_COUNT + /* * Combined thread, float and exception states */ @@ -323,7 +305,6 @@ struct x86_avx_state { } ufs; }; -#if !defined(RC_HIDE_XNU_J137) struct x86_avx512_state { x86_state_hdr_t ash; union { @@ -331,7 +312,6 @@ struct x86_avx512_state { x86_avx512_state64_t as64; } ufs; }; -#endif /* not RC_HIDE_XNU_J137 */ typedef struct x86_thread_state x86_thread_state_t; #define x86_THREAD_STATE_COUNT ((mach_msg_type_number_t) \ @@ -353,11 +333,9 @@ typedef struct x86_avx_state x86_avx_state_t; #define x86_AVX_STATE_COUNT ((mach_msg_type_number_t) \ (sizeof(x86_avx_state_t)/sizeof(unsigned int))) -#if !defined(RC_HIDE_XNU_J137) typedef struct x86_avx512_state x86_avx512_state_t; #define x86_AVX512_STATE_COUNT ((mach_msg_type_number_t) \ (sizeof(x86_avx512_state_t)/sizeof(unsigned int))) -#endif /* not RC_HIDE_XNU_J137 */ /* * Machine-independent way for servers and Mach's exception mechanism to diff --git a/osfmk/mach/kmod.h b/osfmk/mach/kmod.h index 361cfdfd2..a4fba6d3a 100644 --- a/osfmk/mach/kmod.h +++ b/osfmk/mach/kmod.h @@ -67,7 +67,7 @@ typedef kern_return_t kmod_stop_func_t(struct kmod_info * ki, void * data); * * All structures must be #pragma pack(4). ***********************************************************************/ -#pragma pack(4) +#pragma pack(push, 4) /* Run-time struct only; never saved to a file */ typedef struct kmod_reference { @@ -133,7 +133,7 @@ typedef struct kmod_info_64_v1 { uint64_t stop_addr; } kmod_info_64_v1_t; -#pragma pack() +#pragma pack(pop) #if PRAGMA_MARK #pragma mark Kmod structure declaration macros diff --git a/osfmk/mach/mach_param.h b/osfmk/mach/mach_param.h index 18e2cb68b..fb4261301 100644 --- a/osfmk/mach/mach_param.h +++ b/osfmk/mach/mach_param.h @@ -70,4 +70,7 @@ #define TASK_PORT_REGISTER_MAX 3 +/* Number of watchport for task */ +#define TASK_MAX_WATCHPORT_COUNT 32 + #endif /* _MACH_MACH_PARAM_H_ */ diff --git a/osfmk/mach/mach_port.defs b/osfmk/mach/mach_port.defs index 5bc503421..ea3328933 100644 --- a/osfmk/mach/mach_port.defs +++ b/osfmk/mach/mach_port.defs @@ -623,4 +623,39 @@ routine mach_port_special_reply_port_reset_link( #else skip; #endif + +/* + * Guard an already existing port. Allows guarding + * receive rights only. Uses the context field in the + * port structure to store the guard. + */ +routine mach_port_guard_with_flags( + task : ipc_space_t; + name : mach_port_name_t; +#ifdef LIBSYSCALL_INTERFACE + guard : mach_port_context_t; +#else + guard : uint64_t; +#endif + flags : uint64_t); + +/* + * Swap guard value of an existing guarded port. Works + * only if it is not a strict guard. + */ +routine mach_port_swap_guard( + task : ipc_space_t; + name : mach_port_name_t; +#ifdef LIBSYSCALL_INTERFACE + old_guard : mach_port_context_t; +#else + old_guard : uint64_t; +#endif + +#ifdef LIBSYSCALL_INTERFACE + new_guard : mach_port_context_t); +#else + new_guard : uint64_t); +#endif + /* vim: set ft=c : */ diff --git a/osfmk/mach/mach_time.h b/osfmk/mach/mach_time.h index b41206e1f..f2601ade4 100644 --- a/osfmk/mach/mach_time.h +++ b/osfmk/mach/mach_time.h @@ -54,7 +54,7 @@ kern_return_t mach_wait_until( uint64_t mach_absolute_time(void); -__OSX_AVAILABLE_STARTING(__MAC_10_9, __IPHONE_8_0) +__OSX_AVAILABLE_STARTING(__MAC_10_10, __IPHONE_8_0) uint64_t mach_approximate_time(void); /* diff --git a/osfmk/mach/mach_traps.h b/osfmk/mach/mach_traps.h index 2639dbfb1..064514ebc 100644 --- a/osfmk/mach/mach_traps.h +++ b/osfmk/mach/mach_traps.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2007 Apple Inc. All rights reserved. + * Copyright (c) 2000-2019 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -320,6 +320,20 @@ extern kern_return_t mach_voucher_extract_attr_recipe_trap( mach_voucher_attr_raw_recipe_t recipe, mach_msg_type_number_t *recipe_size); +extern kern_return_t _kernelrpc_mach_port_type_trap( + ipc_space_t task, + mach_port_name_t name, + mach_port_type_t *ptype); + +extern kern_return_t _kernelrpc_mach_port_request_notification_trap( + ipc_space_t task, + mach_port_name_t name, + mach_msg_id_t msgid, + mach_port_mscount_t sync, + mach_port_name_t notify, + mach_msg_type_name_t notifyPoly, + mach_port_name_t *previous); + /* * Obsolete interfaces. */ @@ -338,6 +352,11 @@ extern kern_return_t pid_for_task( mach_port_name_t t, int *x); +extern kern_return_t debug_control_port_for_pid( + mach_port_name_t target_tport, + int pid, + mach_port_name_t *t); + #else /* KERNEL */ #ifdef XNU_KERNEL_PRIVATE @@ -370,7 +389,7 @@ extern kern_return_t pid_for_task( #endif #define PAD_ARG_(arg_type, arg_name) \ - char arg_name##_l_[PADL_(arg_type)]; arg_type arg_name; char arg_name##_r_[PADR_(arg_type)]; + char arg_name##_l_[PADL_(arg_type)]; arg_type arg_name; char arg_name##_r_[PADR_(arg_type)] /* * To support 32-bit clients as well as 64-bit clients, argument @@ -503,6 +522,14 @@ struct pid_for_task_args { extern kern_return_t pid_for_task( struct pid_for_task_args *args); +struct debug_control_port_for_pid_args { + PAD_ARG_(mach_port_name_t, target_tport); + PAD_ARG_(int, pid); + PAD_ARG_(user_addr_t, t); +}; +extern kern_return_t debug_control_port_for_pid( + struct debug_control_port_for_pid_args *args); + struct macx_swapon_args { PAD_ARG_(uint64_t, filename); PAD_ARG_(int, flags); @@ -814,6 +841,26 @@ struct mach_voucher_extract_attr_recipe_args { extern kern_return_t mach_voucher_extract_attr_recipe_trap( struct mach_voucher_extract_attr_recipe_args *args); +struct _kernelrpc_mach_port_type_args { + PAD_ARG_(mach_port_name_t, target); + PAD_ARG_(mach_port_right_t, name); + PAD_ARG_(user_addr_t, ptype); +}; +extern kern_return_t _kernelrpc_mach_port_type_trap( + struct _kernelrpc_mach_port_type_args *args); + +struct _kernelrpc_mach_port_request_notification_args { + PAD_ARG_(mach_port_name_t, target); + PAD_ARG_(mach_port_name_t, name); + PAD_ARG_(mach_msg_id_t, msgid); + PAD_ARG_(mach_port_mscount_t, sync); + PAD_ARG_(mach_port_name_t, notify); + PAD_ARG_(mach_msg_type_name_t, notifyPoly); + PAD_ARG_(user_addr_t, previous); +}; +extern kern_return_t _kernelrpc_mach_port_request_notification_trap( + struct _kernelrpc_mach_port_request_notification_args *args); + /* not published to LP64 clients yet */ struct iokit_user_client_trap_args { diff --git a/osfmk/mach/mach_types.defs b/osfmk/mach/mach_types.defs index 27dbd26a6..d2e9fb0b4 100644 --- a/osfmk/mach/mach_types.defs +++ b/osfmk/mach/mach_types.defs @@ -218,6 +218,12 @@ type ipc_space_inspect_t = mach_port_t #endif /* KERNEL_SERVER */ ; +type arcade_register_t = mach_port_t +#if KERNEL_SERVER + intran: arcade_register_t convert_port_to_arcade_register(mach_port_t) +#endif /* KERNEL_SERVER */ + ; + type vm_prot_t = int; type vm_inherit_t = int; type vm_purgable_t = int; @@ -258,11 +264,12 @@ type thread_policy_t = array[*:16] of integer_t; * task_basic_info_64_2_t * mach_task_basic_info_t (12 ints) * task_power_info_t (18 ints) + * task_vm_info_t (87 ints) * If other task_info flavors are added, this * definition may need to be changed. (See * mach/task_info.h and mach/policy.h) */ type task_flavor_t = int; -type task_info_t = array[*:52] of integer_t; +type task_info_t = array[*:87] of integer_t; type task_purgable_info_t = struct[68] of integer_t; @@ -272,6 +279,8 @@ type task_policy_t = array[*:16] of integer_t; type task_inspect_flavor_t = natural_t; type task_inspect_info_t = array[*:4] of integer_t; +type task_exc_guard_behavior_t = uint32_t; + type mem_entry_name_port_t = mach_port_t #if KERNEL_SERVER intran: mem_entry_name_port_t null_conversion(mach_port_t) @@ -549,6 +558,8 @@ type task_suspension_token_t = mach_port_move_send_once_t #endif /* KERNEL_SERVER */ ; +type vfs_path_t = c_string[4096]; +type nspace_path_t = c_string[1024]; /* 1024 == PATH_MAX */ /* public voucher types */ @@ -624,6 +635,9 @@ simport ; /* for lock-set conversions */ simport ; /* for semaphore conversions */ simport ; /* for memory object type conversions */ simport ; /* for vm_map conversions */ +#if CONFIG_ARCADE +simport ; /* for arcade_register conversions */ +#endif #endif /* MACH_KERNEL_PRIVATE */ simport ; /* pick up kernel-specific MIG things */ diff --git a/osfmk/mach/mach_types.h b/osfmk/mach/mach_types.h index 480c60768..5430caaeb 100644 --- a/osfmk/mach/mach_types.h +++ b/osfmk/mach/mach_types.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2010 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2018 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -134,6 +134,7 @@ typedef struct alarm *alarm_t; typedef struct clock *clock_serv_t; typedef struct clock *clock_ctrl_t; +typedef struct arcade_register *arcade_register_t; /* * OBSOLETE: lock_set interfaces are obsolete. @@ -155,6 +156,8 @@ struct ledger; struct alarm; struct clock; +struct arcade_register; + __END_DECLS #endif /* MACH_KERNEL_PRIVATE */ @@ -188,6 +191,7 @@ typedef mach_port_t alarm_t; typedef mach_port_t clock_serv_t; typedef mach_port_t clock_ctrl_t; +typedef mach_port_t arcade_register_t; #endif /* KERNEL */ /* @@ -254,21 +258,46 @@ typedef clock_serv_t clock_serv_port_t; typedef clock_ctrl_t clock_ctrl_port_t; typedef exception_handler_t exception_port_t; typedef exception_handler_array_t exception_port_arrary_t; +typedef char vfs_path_t[4096]; +typedef char nspace_path_t[1024]; /* 1024 == PATH_MAX */ - +#ifdef KERNEL +#define TASK_NULL ((task_t) NULL) +#define TASK_NAME_NULL ((task_name_t) NULL) +#define TASK_INSPECT_NULL ((task_inspect_t) NULL) +#define THREAD_NULL ((thread_t) NULL) +#define THREAD_INSPECT_NULL ((thread_inspect_t)NULL) +#define TID_NULL ((uint64_t) NULL) +#define THR_ACT_NULL ((thread_act_t) NULL) +#define IPC_SPACE_NULL ((ipc_space_t) NULL) +#define IPC_SPACE_INSPECT_NULL ((ipc_space_inspect_t) NULL) +#define COALITION_NULL ((coalition_t) NULL) +#define HOST_NULL ((host_t) NULL) +#define HOST_PRIV_NULL ((host_priv_t)NULL) +#define HOST_SECURITY_NULL ((host_security_t)NULL) +#define PROCESSOR_SET_NULL ((processor_set_t) NULL) +#define PROCESSOR_NULL ((processor_t) NULL) +#define SEMAPHORE_NULL ((semaphore_t) NULL) +#define LOCK_SET_NULL ((lock_set_t) NULL) +#define LEDGER_NULL ((ledger_t) NULL) +#define ALARM_NULL ((alarm_t) NULL) +#define CLOCK_NULL ((clock_t) NULL) +#define UND_SERVER_NULL ((UNDServerRef) NULL) +#define ARCADE_REG_NULL ((arcade_register_t) NULL) +#else #define TASK_NULL ((task_t) 0) #define TASK_NAME_NULL ((task_name_t) 0) #define TASK_INSPECT_NULL ((task_inspect_t) 0) #define THREAD_NULL ((thread_t) 0) -#define THREAD_INSPECT_NULL ((thread_inspect_t)0) +#define THREAD_INSPECT_NULL ((thread_inspect_t) 0) #define TID_NULL ((uint64_t) 0) #define THR_ACT_NULL ((thread_act_t) 0) #define IPC_SPACE_NULL ((ipc_space_t) 0) #define IPC_SPACE_INSPECT_NULL ((ipc_space_inspect_t) 0) #define COALITION_NULL ((coalition_t) 0) #define HOST_NULL ((host_t) 0) -#define HOST_PRIV_NULL ((host_priv_t)0) -#define HOST_SECURITY_NULL ((host_security_t)0) +#define HOST_PRIV_NULL ((host_priv_t) 0) +#define HOST_SECURITY_NULL ((host_security_t) 0) #define PROCESSOR_SET_NULL ((processor_set_t) 0) #define PROCESSOR_NULL ((processor_t) 0) #define SEMAPHORE_NULL ((semaphore_t) 0) @@ -277,6 +306,8 @@ typedef exception_handler_array_t exception_port_arrary_t; #define ALARM_NULL ((alarm_t) 0) #define CLOCK_NULL ((clock_t) 0) #define UND_SERVER_NULL ((UNDServerRef) 0) +#define ARCADE_REG_NULL ((arcade_register_t) 0) +#endif /* DEPRECATED */ typedef natural_t ledger_item_t; diff --git a/osfmk/mach/mach_voucher_types.h b/osfmk/mach/mach_voucher_types.h index 6181a64e4..f7a7afcbd 100644 --- a/osfmk/mach/mach_voucher_types.h +++ b/osfmk/mach/mach_voucher_types.h @@ -158,7 +158,7 @@ typedef mach_voucher_attr_recipe_command_t *mach_voucher_attr_recipe_command_arr * * An element in a recipe list to create a voucher. */ -#pragma pack(1) +#pragma pack(push, 1) typedef struct mach_voucher_attr_recipe_data { mach_voucher_attr_key_t key; @@ -179,7 +179,7 @@ typedef mach_msg_type_number_t mach_voucher_attr_raw_recipe_array_size_t; #define MACH_VOUCHER_ATTR_MAX_RAW_RECIPE_ARRAY_SIZE 5120 #define MACH_VOUCHER_TRAP_STACK_LIMIT 256 -#pragma pack() +#pragma pack(pop) /* * VOUCHER ATTRIBUTE MANAGER Writer types @@ -216,7 +216,7 @@ typedef mach_port_t ipc_voucher_attr_control_t; struct ipc_voucher_attr_manager; struct ipc_voucher_attr_control; #endif -typedef struct ipc_voucher_attr_manager *ipc_voucher_attr_manager_t; +typedef const struct ipc_voucher_attr_manager *ipc_voucher_attr_manager_t; typedef struct ipc_voucher_attr_control *ipc_voucher_attr_control_t; #endif #define IPC_VOUCHER_ATTR_MANAGER_NULL ((ipc_voucher_attr_manager_t) 0) diff --git a/osfmk/mach/machine.h b/osfmk/mach/machine.h index 672bd17cb..654bfc30d 100644 --- a/osfmk/mach/machine.h +++ b/osfmk/mach/machine.h @@ -129,6 +129,7 @@ __END_DECLS */ #define CPU_ARCH_MASK 0xff000000 /* mask for architecture bits */ #define CPU_ARCH_ABI64 0x01000000 /* 64 bit ABI */ +#define CPU_ARCH_ABI64_32 0x02000000 /* ABI for 64-bit hardware with 32-bit types; LP32 */ /* * Machine types known by all. @@ -152,6 +153,7 @@ __END_DECLS #define CPU_TYPE_HPPA ((cpu_type_t) 11) #define CPU_TYPE_ARM ((cpu_type_t) 12) #define CPU_TYPE_ARM64 (CPU_TYPE_ARM | CPU_ARCH_ABI64) +#define CPU_TYPE_ARM64_32 (CPU_TYPE_ARM | CPU_ARCH_ABI64_32) #define CPU_TYPE_MC88000 ((cpu_type_t) 13) #define CPU_TYPE_SPARC ((cpu_type_t) 14) #define CPU_TYPE_I860 ((cpu_type_t) 15) @@ -159,6 +161,7 @@ __END_DECLS /* skip ((cpu_type_t) 17) */ #define CPU_TYPE_POWERPC ((cpu_type_t) 18) #define CPU_TYPE_POWERPC64 (CPU_TYPE_POWERPC | CPU_ARCH_ABI64) +/* skip ((cpu_type_t) 19) */ /* * Machine subtypes (these are defined here, instead of in a machine @@ -352,22 +355,32 @@ __END_DECLS #define CPU_SUBTYPE_ARM_V6 ((cpu_subtype_t) 6) #define CPU_SUBTYPE_ARM_V5TEJ ((cpu_subtype_t) 7) #define CPU_SUBTYPE_ARM_XSCALE ((cpu_subtype_t) 8) -#define CPU_SUBTYPE_ARM_V7 ((cpu_subtype_t) 9) +#define CPU_SUBTYPE_ARM_V7 ((cpu_subtype_t) 9) /* ARMv7-A and ARMv7-R */ #define CPU_SUBTYPE_ARM_V7F ((cpu_subtype_t) 10) /* Cortex A9 */ #define CPU_SUBTYPE_ARM_V7S ((cpu_subtype_t) 11) /* Swift */ #define CPU_SUBTYPE_ARM_V7K ((cpu_subtype_t) 12) +#define CPU_SUBTYPE_ARM_V8 ((cpu_subtype_t) 13) #define CPU_SUBTYPE_ARM_V6M ((cpu_subtype_t) 14) /* Not meant to be run under xnu */ #define CPU_SUBTYPE_ARM_V7M ((cpu_subtype_t) 15) /* Not meant to be run under xnu */ #define CPU_SUBTYPE_ARM_V7EM ((cpu_subtype_t) 16) /* Not meant to be run under xnu */ - -#define CPU_SUBTYPE_ARM_V8 ((cpu_subtype_t) 13) +#define CPU_SUBTYPE_ARM_V8M ((cpu_subtype_t) 17) /* Not meant to be run under xnu */ /* * ARM64 subtypes */ #define CPU_SUBTYPE_ARM64_ALL ((cpu_subtype_t) 0) #define CPU_SUBTYPE_ARM64_V8 ((cpu_subtype_t) 1) +#define CPU_SUBTYPE_ARM64E ((cpu_subtype_t) 2) +/* CPU subtype feature flags for ptrauth on arm64e platforms */ +#define CPU_SUBTYPE_ARM64_PTR_AUTH_MASK 0x0f000000 +#define CPU_SUBTYPE_ARM64_PTR_AUTH_VERSION(x) (((x) & CPU_SUBTYPE_ARM64_PTR_AUTH_MASK) >> 24) + +/* + * ARM64_32 subtypes + */ +#define CPU_SUBTYPE_ARM64_32_ALL ((cpu_subtype_t) 0) +#define CPU_SUBTYPE_ARM64_32_V8 ((cpu_subtype_t) 1) #endif /* !__ASSEMBLER__ */ @@ -409,6 +422,7 @@ __END_DECLS #define CPUFAMILY_ARM_TWISTER 0x92fb37c8 #define CPUFAMILY_ARM_HURRICANE 0x67ceee93 #define CPUFAMILY_ARM_MONSOON_MISTRAL 0xe81e7ef6 +#define CPUFAMILY_ARM_VORTEX_TEMPEST 0x07d34b9f /* The following synonyms are deprecated: */ #define CPUFAMILY_INTEL_6_23 CPUFAMILY_INTEL_PENRYN diff --git a/osfmk/mach/machine/sdt.h b/osfmk/mach/machine/sdt.h index cceb7b419..9c24d5db7 100644 --- a/osfmk/mach/machine/sdt.h +++ b/osfmk/mach/machine/sdt.h @@ -48,106 +48,106 @@ #define DTRACE_PROBE1(provider, name, arg0) { \ uintptr_t __dtrace_args[ARG1_EXTENT] __attribute__ ((aligned (16))); \ - __dtrace_args[0] = (uintptr_t)arg0; \ + __dtrace_args[0] = (uintptr_t)(arg0); \ DTRACE_CALL1ARG(provider, name) \ } #define DTRACE_PROBE2(provider, name, arg0, arg1) { \ uintptr_t __dtrace_args[ARGS2_EXTENT] __attribute__ ((aligned (16))); \ - __dtrace_args[0] = (uintptr_t)arg0; \ - __dtrace_args[1] = (uintptr_t)arg1; \ + __dtrace_args[0] = (uintptr_t)(arg0); \ + __dtrace_args[1] = (uintptr_t)(arg1); \ DTRACE_CALL2ARGS(provider, name) \ } #define DTRACE_PROBE3(provider, name, arg0, arg1, arg2) { \ uintptr_t __dtrace_args[ARGS3_EXTENT] __attribute__ ((aligned (16))); \ - __dtrace_args[0] = (uintptr_t)arg0; \ - __dtrace_args[1] = (uintptr_t)arg1; \ - __dtrace_args[2] = (uintptr_t)arg2; \ + __dtrace_args[0] = (uintptr_t)(arg0); \ + __dtrace_args[1] = (uintptr_t)(arg1); \ + __dtrace_args[2] = (uintptr_t)(arg2); \ DTRACE_CALL3ARGS(provider, name) \ } #define DTRACE_PROBE4(provider, name, arg0, arg1, arg2, arg3) { \ uintptr_t __dtrace_args[ARGS4_EXTENT] __attribute__ ((aligned (16))); \ - __dtrace_args[0] = (uintptr_t)arg0; \ - __dtrace_args[1] = (uintptr_t)arg1; \ - __dtrace_args[2] = (uintptr_t)arg2; \ - __dtrace_args[3] = (uintptr_t)arg3; \ + __dtrace_args[0] = (uintptr_t)(arg0); \ + __dtrace_args[1] = (uintptr_t)(arg1); \ + __dtrace_args[2] = (uintptr_t)(arg2); \ + __dtrace_args[3] = (uintptr_t)(arg3); \ DTRACE_CALL4ARGS(provider, name) \ } #define DTRACE_PROBE5(provider, name, arg0, arg1, arg2, arg3, arg4) { \ uintptr_t __dtrace_args[ARGS5_EXTENT] __attribute__ ((aligned (16))); \ - __dtrace_args[0] = (uintptr_t)arg0; \ - __dtrace_args[1] = (uintptr_t)arg1; \ - __dtrace_args[2] = (uintptr_t)arg2; \ - __dtrace_args[3] = (uintptr_t)arg3; \ - __dtrace_args[4] = (uintptr_t)arg4; \ + __dtrace_args[0] = (uintptr_t)(arg0); \ + __dtrace_args[1] = (uintptr_t)(arg1); \ + __dtrace_args[2] = (uintptr_t)(arg2); \ + __dtrace_args[3] = (uintptr_t)(arg3); \ + __dtrace_args[4] = (uintptr_t)(arg4); \ DTRACE_CALL5ARGS(provider, name) \ } #define DTRACE_PROBE6(provider, name, arg0, arg1, arg2, arg3, arg4, arg5) { \ uintptr_t __dtrace_args[ARGS6_EXTENT] __attribute__ ((aligned (16))); \ - __dtrace_args[0] = (uintptr_t)arg0; \ - __dtrace_args[1] = (uintptr_t)arg1; \ - __dtrace_args[2] = (uintptr_t)arg2; \ - __dtrace_args[3] = (uintptr_t)arg3; \ - __dtrace_args[4] = (uintptr_t)arg4; \ - __dtrace_args[5] = (uintptr_t)arg5; \ + __dtrace_args[0] = (uintptr_t)(arg0); \ + __dtrace_args[1] = (uintptr_t)(arg1); \ + __dtrace_args[2] = (uintptr_t)(arg2); \ + __dtrace_args[3] = (uintptr_t)(arg3); \ + __dtrace_args[4] = (uintptr_t)(arg4); \ + __dtrace_args[5] = (uintptr_t)(arg5); \ DTRACE_CALL6ARGS(provider, name) \ } #define DTRACE_PROBE7(provider, name, arg0, arg1, arg2, arg3, arg4, arg5, arg6) { \ uintptr_t __dtrace_args[ARGS7_EXTENT] __attribute__ ((aligned (16))); \ - __dtrace_args[0] = (uintptr_t)arg0; \ - __dtrace_args[1] = (uintptr_t)arg1; \ - __dtrace_args[2] = (uintptr_t)arg2; \ - __dtrace_args[3] = (uintptr_t)arg3; \ - __dtrace_args[4] = (uintptr_t)arg4; \ - __dtrace_args[5] = (uintptr_t)arg5; \ - __dtrace_args[6] = (uintptr_t)arg6; \ + __dtrace_args[0] = (uintptr_t)(arg0); \ + __dtrace_args[1] = (uintptr_t)(arg1); \ + __dtrace_args[2] = (uintptr_t)(arg2); \ + __dtrace_args[3] = (uintptr_t)(arg3); \ + __dtrace_args[4] = (uintptr_t)(arg4); \ + __dtrace_args[5] = (uintptr_t)(arg5); \ + __dtrace_args[6] = (uintptr_t)(arg6); \ DTRACE_CALL7ARGS(provider, name) \ } #define DTRACE_PROBE8(provider, name, arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7) { \ uintptr_t __dtrace_args[ARGS8_EXTENT] __attribute__ ((aligned (16))); \ - __dtrace_args[0] = (uintptr_t)arg0; \ - __dtrace_args[1] = (uintptr_t)arg1; \ - __dtrace_args[2] = (uintptr_t)arg2; \ - __dtrace_args[3] = (uintptr_t)arg3; \ - __dtrace_args[4] = (uintptr_t)arg4; \ - __dtrace_args[5] = (uintptr_t)arg5; \ - __dtrace_args[6] = (uintptr_t)arg6; \ - __dtrace_args[7] = (uintptr_t)arg7; \ + __dtrace_args[0] = (uintptr_t)(arg0); \ + __dtrace_args[1] = (uintptr_t)(arg1); \ + __dtrace_args[2] = (uintptr_t)(arg2); \ + __dtrace_args[3] = (uintptr_t)(arg3); \ + __dtrace_args[4] = (uintptr_t)(arg4); \ + __dtrace_args[5] = (uintptr_t)(arg5); \ + __dtrace_args[6] = (uintptr_t)(arg6); \ + __dtrace_args[7] = (uintptr_t)(arg7); \ DTRACE_CALL8ARGS(provider, name) \ } #define DTRACE_PROBE9(provider, name, arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8) { \ uintptr_t __dtrace_args[ARGS9_EXTENT] __attribute__ ((aligned (16))); \ - __dtrace_args[0] = (uintptr_t)arg0; \ - __dtrace_args[1] = (uintptr_t)arg1; \ - __dtrace_args[2] = (uintptr_t)arg2; \ - __dtrace_args[3] = (uintptr_t)arg3; \ - __dtrace_args[4] = (uintptr_t)arg4; \ - __dtrace_args[5] = (uintptr_t)arg5; \ - __dtrace_args[6] = (uintptr_t)arg6; \ - __dtrace_args[7] = (uintptr_t)arg7; \ - __dtrace_args[8] = (uintptr_t)arg8; \ + __dtrace_args[0] = (uintptr_t)(arg0); \ + __dtrace_args[1] = (uintptr_t)(arg1); \ + __dtrace_args[2] = (uintptr_t)(arg2); \ + __dtrace_args[3] = (uintptr_t)(arg3); \ + __dtrace_args[4] = (uintptr_t)(arg4); \ + __dtrace_args[5] = (uintptr_t)(arg5); \ + __dtrace_args[6] = (uintptr_t)(arg6); \ + __dtrace_args[7] = (uintptr_t)(arg7); \ + __dtrace_args[8] = (uintptr_t)(arg8); \ DTRACE_CALL9ARGS(provider, name) \ } #define DTRACE_PROBE10(provider, name, arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9) { \ uintptr_t __dtrace_args[ARGS10_EXTENT] __attribute__ ((aligned (16))); \ - __dtrace_args[0] = (uintptr_t)arg0; \ - __dtrace_args[1] = (uintptr_t)arg1; \ - __dtrace_args[2] = (uintptr_t)arg2; \ - __dtrace_args[3] = (uintptr_t)arg3; \ - __dtrace_args[4] = (uintptr_t)arg4; \ - __dtrace_args[5] = (uintptr_t)arg5; \ - __dtrace_args[6] = (uintptr_t)arg6; \ - __dtrace_args[7] = (uintptr_t)arg7; \ - __dtrace_args[8] = (uintptr_t)arg8; \ - __dtrace_args[9] = (uintptr_t)arg9; \ + __dtrace_args[0] = (uintptr_t)(arg0); \ + __dtrace_args[1] = (uintptr_t)(arg1); \ + __dtrace_args[2] = (uintptr_t)(arg2); \ + __dtrace_args[3] = (uintptr_t)(arg3); \ + __dtrace_args[4] = (uintptr_t)(arg4); \ + __dtrace_args[5] = (uintptr_t)(arg5); \ + __dtrace_args[6] = (uintptr_t)(arg6); \ + __dtrace_args[7] = (uintptr_t)(arg7); \ + __dtrace_args[8] = (uintptr_t)(arg8); \ + __dtrace_args[9] = (uintptr_t)(arg9); \ DTRACE_CALL10ARGS(provider, name) \ } @@ -224,6 +224,9 @@ #define DTRACE_MEMORYSTATUS3(name, type1, arg1, type2, arg2, type3, arg3) \ DTRACE_PROBE3(__sdt_, name, arg1, arg2, arg3); +#define DTRACE_MEMORYSTATUS4(name, type1, arg1, type2, arg2, type3, arg3, type4, arg4) \ + DTRACE_PROBE4(__sdt_, name, arg1, arg2, arg3, arg4); + #define DTRACE_MEMORYSTATUS6(name, type1, arg1, type2, arg2, \ type3, arg3, type4, arg4, type5, arg5, type6, arg6) \ DTRACE_PROBE6(__vminfo_, name, arg1, arg2, arg3, arg4, arg5, arg6) @@ -276,6 +279,10 @@ type3, arg3, type4, arg4, type5, arg5, type6, arg6) \ DTRACE_PROBE6(__vminfo_, name, arg1, arg2, arg3, arg4, arg5, arg6) +#define DTRACE_VM7(name, type1, arg1, type2, arg2, \ + type3, arg3, type4, arg4, type5, arg5, type6, arg6, type7, arg7) \ + DTRACE_PROBE7(__vminfo_, name, arg1, arg2, arg3, arg4, arg5, arg6, arg7) + #define DTRACE_IP(name) \ DTRACE_PROBE(__ip_, name) @@ -304,6 +311,34 @@ type4, arg4, type5, arg5, type6, arg6, type7, arg7) \ DTRACE_PROBE7(__ip_, name, arg1, arg2, arg3, arg4, arg5, arg6, arg7) +#define DTRACE_ROUTE(name) \ + DTRACE_PROBE(__route_, name) + +#define DTRACE_ROUTE1(name, type1, arg1) \ + DTRACE_PROBE1(__route_, name, arg1) + +#define DTRACE_ROUTE2(name, type1, arg1, type2, arg2) \ + DTRACE_PROBE2(__route_, name, arg1, arg2) + +#define DTRACE_ROUTE3(name, type1, arg1, type2, arg2, type3, arg3) \ + DTRACE_PROBE3(__route_, name, arg1, arg2, arg3) + +#define DTRACE_ROUTE4(name, type1, arg1, type2, arg2, \ + type3, arg3, type4, arg4) \ + DTRACE_PROBE4(__route_, name, arg1, arg2, arg3, arg4) + +#define DTRACE_ROUTE5(name, typ1, arg1, type2, arg2, type3, arg3, \ + type4, arg4, type5, arg5) \ + DTRACE_PROBE5(__route_, name, arg1, arg2, arg3, arg4, arg5) + +#define DTRACE_ROUTE6(name, type1, arg1, type2, arg2, type3, arg3, \ + type4, arg4, type5, arg5, type6, arg6) \ + DTRACE_PROBE6(__route_, name, arg1, arg2, arg3, arg4, arg5, arg6) + +#define DTRACE_ROUTE7(name, type1, arg1, type2, arg2, type3, arg3, \ + type4, arg4, type5, arg5, type6, arg6, type7, arg7) \ + DTRACE_PROBE7(__route_, name, arg1, arg2, arg3, arg4, arg5, arg6, arg7) + #define DTRACE_TCP(name) \ DTRACE_PROBE(__tcp_, name) @@ -384,6 +419,28 @@ type3, arg3, type4, arg4, type5, arg5, type6, arg6) \ DTRACE_PROBE6(__boost_, name, arg1, arg2, arg3, arg4, arg5, arg6); +#if KASAN +#define DTRACE_KASAN(name) \ + DTRACE_PROBE(__kasan_, name); + +#define DTRACE_KASAN1(name, type1, arg1) \ + DTRACE_PROBE1(__kasan_, name, arg1); + +#define DTRACE_KASAN2(name, type1, arg1, type2, arg2) \ + DTRACE_PROBE2(__kasan_, name, arg1, arg2); + +#define DTRACE_KASAN3(name, type1, arg1, type2, arg2, type3, arg3) \ + DTRACE_PROBE3(__kasan_, name, arg1, arg2, arg3); + +#define DTRACE_KASAN4(name, type1, arg1, type2, arg2, \ + type3, arg3, type4, arg4) \ + DTRACE_PROBE4(__kasan_, name, arg1, arg2, arg3, arg4); + +#define DTRACE_KASAN5(name, type1, arg1, type2, arg2, \ + type3, arg3, type4, arg4, type5, arg5) \ + DTRACE_PROBE5(__kasan_, name, arg1, arg2, arg3, arg4, arg5); +#endif /* KASAN */ + #if PRIVATE #endif /* PRIVATE */ diff --git a/osfmk/mach/memory_entry.defs b/osfmk/mach/memory_entry.defs index 07e8fa454..bcc83f26c 100644 --- a/osfmk/mach/memory_entry.defs +++ b/osfmk/mach/memory_entry.defs @@ -32,10 +32,6 @@ subsystem #endif /* KERNEL_SERVER */ memory_entry 4900; -#if !KERNEL && !LIBSYSCALL_INTERFACE - UserPrefix _kernelrpc_; -#endif - #include #include #include @@ -50,3 +46,9 @@ routine mach_memory_entry_access_tracking( inout access_tracking : int; out access_tracking_reads : uint32_t; out access_tracking_writes : uint32_t); + +routine mach_memory_entry_ownership( + mem_entry : mem_entry_name_port_t; + owner : task_t; + ledger_tag : int; + ledger_flags : int); diff --git a/osfmk/mach/memory_object_types.h b/osfmk/mach/memory_object_types.h index fec1df84a..6c5cdd941 100644 --- a/osfmk/mach/memory_object_types.h +++ b/osfmk/mach/memory_object_types.h @@ -369,14 +369,17 @@ typedef struct memory_object_attr_info memory_object_attr_info_data_t; /* named entry processor mapping options */ /* enumerated */ -#define MAP_MEM_NOOP 0 -#define MAP_MEM_COPYBACK 1 -#define MAP_MEM_IO 2 -#define MAP_MEM_WTHRU 3 -#define MAP_MEM_WCOMB 4 /* Write combining mode */ - /* aka store gather */ -#define MAP_MEM_INNERWBACK 5 -#define MAP_MEM_POSTED 6 +#define MAP_MEM_NOOP 0 +#define MAP_MEM_COPYBACK 1 +#define MAP_MEM_IO 2 +#define MAP_MEM_WTHRU 3 +#define MAP_MEM_WCOMB 4 /* Write combining mode */ + /* aka store gather */ +#define MAP_MEM_INNERWBACK 5 +#define MAP_MEM_POSTED 6 +#define MAP_MEM_RT 7 +#define MAP_MEM_POSTED_REORDERED 8 +#define MAP_MEM_POSTED_COMBINED_REORDERED 9 #define GET_MAP_MEM(flags) \ ((((unsigned int)(flags)) >> 24) & 0xFF) @@ -386,7 +389,7 @@ typedef struct memory_object_attr_info memory_object_attr_info_data_t; & 0xFF000000) | ((flags) & 0xFFFFFF)); /* leave room for vm_prot bits (0xFF ?) */ -#define MAP_MEM_LEDGER_TAG_NETWORK 0x002000 /* charge to "network" ledger */ +#define MAP_MEM_LEDGER_TAGGED 0x002000 /* object owned by a specific task and ledger */ #define MAP_MEM_PURGABLE_KERNEL_ONLY 0x004000 /* volatility controlled by kernel */ #define MAP_MEM_GRAB_SECLUDED 0x008000 /* can grab secluded pages */ #define MAP_MEM_ONLY 0x010000 /* change processor caching */ @@ -409,9 +412,9 @@ typedef struct memory_object_attr_info memory_object_attr_info_data_t; MAP_MEM_USE_DATA_ADDR | \ MAP_MEM_VM_COPY | \ MAP_MEM_VM_SHARE | \ + MAP_MEM_LEDGER_TAGGED | \ MAP_MEM_4K_DATA_ADDR) #define MAP_MEM_FLAGS_ALL ( \ - MAP_MEM_LEDGER_TAG_NETWORK | \ MAP_MEM_FLAGS_USER) #ifdef KERNEL diff --git a/osfmk/mach/message.h b/osfmk/mach/message.h index ceb069a6a..a1a3a0325 100644 --- a/osfmk/mach/message.h +++ b/osfmk/mach/message.h @@ -251,6 +251,12 @@ typedef unsigned int mach_msg_copy_options_t; #define MACH_MSG_KALLOC_COPY_T 4 #endif /* MACH_KERNEL */ +#define MACH_MSG_GUARD_FLAGS_NONE 0x0000 +#define MACH_MSG_GUARD_FLAGS_IMMOVABLE_RECEIVE 0x0001 /* Move the receive right and mark it as immovable */ +#define MACH_MSG_GUARD_FLAGS_UNGUARDED_ON_SEND 0x0002 /* Verify that the port is unguarded */ +#define MACH_MSG_GUARD_FLAGS_MASK 0x0003 /* Valid flag bits */ +typedef unsigned int mach_msg_guard_flags_t; + /* * In a complex mach message, the mach_msg_header_t is followed by * a descriptor count, then an array of that number of descriptors @@ -269,8 +275,9 @@ typedef unsigned int mach_msg_descriptor_type_t; #define MACH_MSG_OOL_DESCRIPTOR 1 #define MACH_MSG_OOL_PORTS_DESCRIPTOR 2 #define MACH_MSG_OOL_VOLATILE_DESCRIPTOR 3 +#define MACH_MSG_GUARDED_PORT_DESCRIPTOR 4 -#pragma pack(4) +#pragma pack(push, 4) typedef struct{ natural_t pad1; @@ -363,6 +370,48 @@ typedef struct{ #endif } mach_msg_ool_ports_descriptor_t; +typedef struct{ + uint32_t context; + mach_port_name_t name; + mach_msg_guard_flags_t flags : 16; + mach_msg_type_name_t disposition : 8; + mach_msg_descriptor_type_t type : 8; +} mach_msg_guarded_port_descriptor32_t; + +typedef struct{ + uint64_t context; + mach_msg_guard_flags_t flags : 16; + mach_msg_type_name_t disposition : 8; + mach_msg_descriptor_type_t type : 8; + mach_port_name_t name; +} mach_msg_guarded_port_descriptor64_t; + +typedef struct{ +#if defined(KERNEL) + mach_port_t name; +#if !defined(__LP64__) + uint32_t pad1; +#endif + mach_msg_guard_flags_t flags : 16; + mach_msg_type_name_t disposition : 8; + mach_msg_descriptor_type_t type : 8; +#if defined(__LP64__) + uint32_t pad_end; +#endif /* defined(__LP64__) */ +#else + mach_port_context_t context; +#if !defined(__LP64__) + mach_port_name_t name; +#endif + mach_msg_guard_flags_t flags : 16; + mach_msg_type_name_t disposition : 8; + mach_msg_descriptor_type_t type : 8; +#if defined(__LP64__) + mach_port_name_t name; +#endif /* defined(__LP64__) */ +#endif /* defined(KERNEL) */ +} mach_msg_guarded_port_descriptor_t; + /* * LP64support - This union definition is not really * appropriate in LP64 mode because not all descriptors @@ -374,6 +423,7 @@ typedef union{ mach_msg_ool_descriptor32_t out_of_line; mach_msg_ool_ports_descriptor32_t ool_ports; mach_msg_type_descriptor_t type; + mach_msg_guarded_port_descriptor32_t guarded_port; } mach_msg_descriptor_t; #else typedef union{ @@ -381,6 +431,7 @@ typedef union{ mach_msg_ool_descriptor_t out_of_line; mach_msg_ool_ports_descriptor_t ool_ports; mach_msg_type_descriptor_t type; + mach_msg_guarded_port_descriptor_t guarded_port; } mach_msg_descriptor_t; #endif @@ -576,10 +627,10 @@ typedef mach_msg_security_trailer_t mach_msg_format_0_trailer_t; #define MACH_MSG_TRAILER_FORMAT_0_SIZE sizeof(mach_msg_format_0_trailer_t) #define KERNEL_SECURITY_TOKEN_VALUE { {0, 1} } -extern security_token_t KERNEL_SECURITY_TOKEN; +extern const security_token_t KERNEL_SECURITY_TOKEN; #define KERNEL_AUDIT_TOKEN_VALUE { {0, 0, 0, 0, 0, 0, 0, 0} } -extern audit_token_t KERNEL_AUDIT_TOKEN; +extern const audit_token_t KERNEL_AUDIT_TOKEN; typedef integer_t mach_msg_options_t; @@ -597,7 +648,7 @@ typedef union{ mach_msg_empty_rcv_t rcv; } mach_msg_empty_t; -#pragma pack() +#pragma pack(pop) /* utility to round the message size - will become machine dependent */ #define round_msg(x) (((mach_msg_size_t)(x) + sizeof (natural_t) - 1) & \ @@ -683,7 +734,7 @@ typedef integer_t mach_msg_option_t; #define MACH_RCV_LARGE_IDENTITY 0x00000008 /* identify source of large messages */ #define MACH_SEND_TIMEOUT 0x00000010 /* timeout value applies to send */ -#define MACH_SEND_OVERRIDE 0x00000020 /* priority override for send */ +#define MACH_SEND_OVERRIDE 0x00000020 /* priority override for send */ #define MACH_SEND_INTERRUPT 0x00000040 /* don't restart interrupted sends */ #define MACH_SEND_NOTIFY 0x00000080 /* arm send-possible notify */ #define MACH_SEND_ALWAYS 0x00010000 /* ignore qlimits - kernel only */ @@ -692,16 +743,23 @@ typedef integer_t mach_msg_option_t; #define MACH_SEND_NODENAP MACH_SEND_NOIMPORTANCE #define MACH_SEND_IMPORTANCE 0x00080000 /* msg carries importance - kernel only */ #define MACH_SEND_SYNC_OVERRIDE 0x00100000 /* msg should do sync ipc override */ -#define MACH_SEND_PROPAGATE_QOS 0x00200000 /* IPC should propagate the caller's QoS */ +#define MACH_SEND_PROPAGATE_QOS 0x00200000 /* IPC should propagate the caller's QoS */ #define MACH_SEND_SYNC_USE_THRPRI MACH_SEND_PROPAGATE_QOS /* obsolete name */ -#define MACH_SEND_KERNEL 0x00400000 /* full send from kernel space - kernel only */ +#define MACH_SEND_KERNEL 0x00400000 /* full send from kernel space - kernel only */ +#define MACH_SEND_SYNC_BOOTSTRAP_CHECKIN 0x00800000 /* special reply port should boost thread doing sync bootstrap checkin */ #define MACH_RCV_TIMEOUT 0x00000100 /* timeout value applies to receive */ -#define MACH_RCV_NOTIFY 0x00000200 /* reserved - legacy */ +#define MACH_RCV_NOTIFY 0x00000000 /* legacy name (value was: 0x00000200) */ #define MACH_RCV_INTERRUPT 0x00000400 /* don't restart interrupted receive */ #define MACH_RCV_VOUCHER 0x00000800 /* willing to receive voucher port */ -#define MACH_RCV_OVERWRITE 0x00001000 /* scatter receive (deprecated) */ +#define MACH_RCV_OVERWRITE 0x00000000 /* scatter receive (deprecated) */ +#define MACH_RCV_GUARDED_DESC 0x00001000 /* Can receive new guarded descriptor */ #define MACH_RCV_SYNC_WAIT 0x00004000 /* sync waiter waiting for rcv */ +#define MACH_RCV_SYNC_PEEK 0x00008000 /* sync waiter waiting to peek */ + +#define MACH_MSG_STRICT_REPLY 0x00000200 /* Enforce specific properties about the reply port, and + * the context in which a thread replies to a message. + * This flag must be passed on both the SEND and RCV */ #ifdef XNU_KERNEL_PRIVATE @@ -745,12 +803,15 @@ typedef integer_t mach_msg_option_t; #define MACH_SEND_USER (MACH_SEND_MSG | MACH_SEND_TIMEOUT | \ MACH_SEND_NOTIFY | MACH_SEND_OVERRIDE | \ MACH_SEND_TRAILER | MACH_SEND_NOIMPORTANCE | \ - MACH_SEND_SYNC_OVERRIDE | MACH_SEND_PROPAGATE_QOS) + MACH_SEND_SYNC_OVERRIDE | MACH_SEND_PROPAGATE_QOS | \ + MACH_SEND_SYNC_BOOTSTRAP_CHECKIN | \ + MACH_MSG_STRICT_REPLY | MACH_RCV_GUARDED_DESC) #define MACH_RCV_USER (MACH_RCV_MSG | MACH_RCV_TIMEOUT | \ MACH_RCV_LARGE | MACH_RCV_LARGE_IDENTITY | \ MACH_RCV_VOUCHER | MACH_RCV_TRAILER_MASK | \ - MACH_RCV_SYNC_WAIT) + MACH_RCV_SYNC_WAIT | MACH_RCV_SYNC_PEEK | \ + MACH_RCV_GUARDED_DESC | MACH_MSG_STRICT_REPLY) #define MACH_MSG_OPTION_USER (MACH_SEND_USER | MACH_RCV_USER) @@ -768,6 +829,21 @@ typedef integer_t mach_msg_option_t; #define MACH_SEND_KERNEL_DEFAULT (MACH_SEND_MSG | \ MACH_SEND_ALWAYS | MACH_SEND_NOIMPORTANCE) +#define MACH_SEND_WITH_STRICT_REPLY(_opts) (((_opts) & (MACH_MSG_STRICT_REPLY | MACH_SEND_MSG)) == \ + (MACH_MSG_STRICT_REPLY | MACH_SEND_MSG)) + +#define MACH_SEND_REPLY_IS_IMMOVABLE(_opts) (((_opts) & (MACH_MSG_STRICT_REPLY | \ + MACH_SEND_MSG | MACH_RCV_MSG | \ + MACH_RCV_GUARDED_DESC)) == \ + (MACH_MSG_STRICT_REPLY | MACH_SEND_MSG | MACH_RCV_GUARDED_DESC)) + +#define MACH_RCV_WITH_STRICT_REPLY(_opts) (((_opts) & (MACH_MSG_STRICT_REPLY | MACH_RCV_MSG)) == \ + (MACH_MSG_STRICT_REPLY | MACH_RCV_MSG)) + +#define MACH_RCV_WITH_IMMOVABLE_REPLY(_opts) (((_opts) & (MACH_MSG_STRICT_REPLY | \ + MACH_RCV_MSG | MACH_RCV_GUARDED_DESC)) == \ + (MACH_MSG_STRICT_REPLY | MACH_RCV_MSG | MACH_RCV_GUARDED_DESC)) + #endif /* MACH_KERNEL_PRIVATE */ /* @@ -881,8 +957,12 @@ typedef kern_return_t mach_msg_return_t; /* A field in the header had a bad value. */ #define MACH_SEND_INVALID_TRAILER 0x10000011 /* The trailer to be sent does not match kernel format. */ +#define MACH_SEND_INVALID_CONTEXT 0x10000012 +/* The sending thread context did not match the context on the dest port */ #define MACH_SEND_INVALID_RT_OOL_SIZE 0x10000015 /* compatibility: no longer a returned error */ +#define MACH_SEND_NO_GRANT_DEST 0x10000016 +/* The destination port doesn't accept ports in body */ #define MACH_RCV_IN_PROGRESS 0x10004001 /* Thread is waiting for receive. (Internal use only.) */ @@ -916,6 +996,8 @@ typedef kern_return_t mach_msg_return_t; /* trailer type or number of trailer elements not supported */ #define MACH_RCV_IN_PROGRESS_TIMED 0x10004011 /* Waiting for receive with timeout. (Internal use only.) */ +#define MACH_RCV_INVALID_REPLY 0x10004012 +/* invalid reply port used in a STRICT_REPLY message */ #ifdef XNU_KERNEL_PRIVATE #define MACH_PEEK_IN_PROGRESS 0x10008001 diff --git a/osfmk/mach/mig.h b/osfmk/mach/mig.h index ee94955bc..74d2d0109 100644 --- a/osfmk/mach/mig.h +++ b/osfmk/mach/mig.h @@ -140,6 +140,17 @@ typedef struct mig_symtab { */ } mig_symtab_t; +/* + * A compiler attribute for annotating all MIG server routines and other + * functions that should behave similarly. Allows the compiler to perform + * additional static bug-finding over them. + */ +#if __has_attribute(mig_server_routine) +#define MIG_SERVER_ROUTINE __attribute__((mig_server_routine)) +#else +#define MIG_SERVER_ROUTINE +#endif + #ifdef PRIVATE /* MIG object runtime - not ready for public consumption */ diff --git a/osfmk/mach/port.h b/osfmk/mach/port.h index db15ea83f..2eb09af2e 100644 --- a/osfmk/mach/port.h +++ b/osfmk/mach/port.h @@ -140,7 +140,7 @@ struct ipc_port; typedef struct ipc_port *ipc_port_t; -#define IPC_PORT_NULL ((ipc_port_t) 0UL) +#define IPC_PORT_NULL ((ipc_port_t) NULL) #define IPC_PORT_DEAD ((ipc_port_t)~0UL) #define IPC_PORT_VALID(port) \ ((port) != IPC_PORT_NULL && (port) != IPC_PORT_DEAD) @@ -189,7 +189,11 @@ typedef mach_port_t *mach_port_array_t; * that a port right was present, but it died. */ +#if defined(XNU_KERNEL_PRIVATE) && defined(__cplusplus) +#define MACH_PORT_NULL NULL +#else #define MACH_PORT_NULL 0 /* intentional loose typing */ +#endif #define MACH_PORT_DEAD ((mach_port_name_t) ~0) #define MACH_PORT_VALID(name) \ (((name) != MACH_PORT_NULL) && \ @@ -243,8 +247,13 @@ typedef natural_t mach_port_right_t; #define MACH_PORT_RIGHT_SEND_ONCE ((mach_port_right_t) 2) #define MACH_PORT_RIGHT_PORT_SET ((mach_port_right_t) 3) #define MACH_PORT_RIGHT_DEAD_NAME ((mach_port_right_t) 4) -#define MACH_PORT_RIGHT_LABELH ((mach_port_right_t) 5) -#define MACH_PORT_RIGHT_NUMBER ((mach_port_right_t) 6) +#define MACH_PORT_RIGHT_LABELH ((mach_port_right_t) 5) /* obsolete right */ +#define MACH_PORT_RIGHT_NUMBER ((mach_port_right_t) 6) /* right not implemented */ + +#ifdef MACH_KERNEL_PRIVATE +#define MACH_PORT_RIGHT_VALID_TRANSLATE(right) \ + ((right) >= MACH_PORT_RIGHT_SEND && (right) <= MACH_PORT_RIGHT_DEAD_NAME) +#endif typedef natural_t mach_port_type_t; typedef mach_port_type_t *mach_port_type_array_t; @@ -258,7 +267,13 @@ typedef mach_port_type_t *mach_port_type_array_t; #define MACH_PORT_TYPE_SEND_ONCE MACH_PORT_TYPE(MACH_PORT_RIGHT_SEND_ONCE) #define MACH_PORT_TYPE_PORT_SET MACH_PORT_TYPE(MACH_PORT_RIGHT_PORT_SET) #define MACH_PORT_TYPE_DEAD_NAME MACH_PORT_TYPE(MACH_PORT_RIGHT_DEAD_NAME) -#define MACH_PORT_TYPE_LABELH MACH_PORT_TYPE(MACH_PORT_RIGHT_LABELH) +#define MACH_PORT_TYPE_LABELH MACH_PORT_TYPE(MACH_PORT_RIGHT_LABELH) /* obsolete */ + + +#ifdef MACH_KERNEL_PRIVATE +/* Holder used to have a receive right - remembered to filter exceptions */ +#define MACH_PORT_TYPE_EX_RECEIVE MACH_PORT_TYPE_LABELH +#endif /* Convenient combinations. */ @@ -332,6 +347,8 @@ typedef struct mach_port_limits { #define MACH_PORT_STATUS_FLAG_IMP_DONATION 0x08 #define MACH_PORT_STATUS_FLAG_REVIVE 0x10 #define MACH_PORT_STATUS_FLAG_TASKPTR 0x20 +#define MACH_PORT_STATUS_FLAG_GUARD_IMMOVABLE_RECEIVE 0x40 +#define MACH_PORT_STATUS_FLAG_NO_GRANT 0x80 typedef struct mach_port_info_ext { mach_port_status_t mpie_status; @@ -384,6 +401,7 @@ typedef struct mach_port_qos { #define MPO_INSERT_SEND_RIGHT 0x10 /* Insert a send right for the port */ #define MPO_STRICT 0x20 /* Apply strict guarding for port */ #define MPO_DENAP_RECEIVER 0x40 /* Mark the port as App de-nap receiver */ +#define MPO_IMMOVABLE_RECEIVE 0x80 /* Mark the port as immovable; protected by the guard context */ /* * Structure to define optional attributes for a newly * constructed port. @@ -410,7 +428,9 @@ enum mach_port_guard_exception_codes { kGUARD_EXC_SET_CONTEXT = 1u << 2, kGUARD_EXC_UNGUARDED = 1u << 3, kGUARD_EXC_INCORRECT_GUARD = 1u << 4, - /* start of non-fatal guards */ + kGUARD_EXC_IMMOVABLE = 1u << 5, + kGUARD_EXC_STRICT_REPLY = 1u << 6, + /* start of [optionally] non-fatal guards */ kGUARD_EXC_INVALID_RIGHT = 1u << 8, kGUARD_EXC_INVALID_NAME = 1u << 9, kGUARD_EXC_INVALID_VALUE = 1u << 10, @@ -420,12 +440,31 @@ enum mach_port_guard_exception_codes { kGUARD_EXC_KERN_FAILURE = 1u << 14, kGUARD_EXC_KERN_RESOURCE = 1u << 15, kGUARD_EXC_SEND_INVALID_REPLY = 1u << 16, - kGUARD_EXC_SEND_INVALID_VOUCHER = 1u << 16, - kGUARD_EXC_SEND_INVALID_RIGHT = 1u << 17, - kGUARD_EXC_RCV_INVALID_NAME = 1u << 18, - kGUARD_EXC_RCV_INVALID_NOTIFY = 1u << 19 + kGUARD_EXC_SEND_INVALID_VOUCHER = 1u << 17, + kGUARD_EXC_SEND_INVALID_RIGHT = 1u << 18, + kGUARD_EXC_RCV_INVALID_NAME = 1u << 19, + kGUARD_EXC_RCV_GUARDED_DESC = 1u << 20, /* should never be fatal; for development only */ }; +#define MAX_FATAL_kGUARD_EXC_CODE (1u << 6) + +/* + * These flags are used as bits in the subcode of kGUARD_EXC_STRICT_REPLY exceptions. + */ +#define MPG_FLAGS_STRICT_REPLY_INVALID_REPLY_DISP (0x01ull << 56) +#define MPG_FLAGS_STRICT_REPLY_INVALID_REPLY_PORT (0x02ull << 56) +#define MPG_FLAGS_STRICT_REPLY_INVALID_VOUCHER (0x04ull << 56) +#define MPG_FLAGS_STRICT_REPLY_NO_BANK_ATTR (0x08ull << 56) +#define MPG_FLAGS_STRICT_REPLY_MISMATCHED_PERSONA (0x10ull << 56) +#define MPG_FLAGS_STRICT_REPLY_MASK (0xffull << 56) + +/* + * Flags for mach_port_guard_with_flags. These flags extend + * the attributes associated with a guarded port. + */ +#define MPG_STRICT 0x01 /* Apply strict guarding for a port */ +#define MPG_IMMOVABLE_RECEIVE 0x02 /* Receive right cannot be moved out of the space */ + #if !__DARWIN_UNIX03 && !defined(_NO_PORT_T_FROM_MACH) /* * Mach 3.0 renamed everything to have mach_ in front of it. diff --git a/osfmk/mach/restartable.defs b/osfmk/mach/restartable.defs new file mode 100644 index 000000000..74c1125a5 --- /dev/null +++ b/osfmk/mach/restartable.defs @@ -0,0 +1,52 @@ +/* + * Copyright (c) 2019 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +subsystem +#if KERNEL_SERVER + KernelServer +#endif /* KERNEL_SERVER */ + task_restartable 8000; + +#include +#include +#include + +import ; + +type task_restartable_range_t = array[2] of uint64_t; +type task_restartable_range_array_t = array[*:64] of task_restartable_range_t; + +routine task_restartable_ranges_register( + target_task : task_t; + ranges : task_restartable_range_array_t); + +routine task_restartable_ranges_synchronize( + target_task : task_t); + +/* vim: set ft=c : */ + diff --git a/osfmk/mach/shared_region.h b/osfmk/mach/shared_region.h index be70167dc..0faf73ee8 100644 --- a/osfmk/mach/shared_region.h +++ b/osfmk/mach/shared_region.h @@ -70,6 +70,12 @@ #define SHARED_REGION_NESTING_MIN_ARM ? #define SHARED_REGION_NESTING_MAX_ARM ? +#define SHARED_REGION_BASE_ARM64_32 0x1A000000ULL +#define SHARED_REGION_SIZE_ARM64_32 0x40000000ULL +#define SHARED_REGION_NESTING_BASE_ARM64_32 0x1A000000ULL +#define SHARED_REGION_NESTING_SIZE_ARM64_32 0x40000000ULL +#define SHARED_REGION_NESTING_MIN_ARM64_32 ? +#define SHARED_REGION_NESTING_MAX_ARM64_32 ? #ifdef XNU_KERNEL_PRIVATE /* ARM64_TODO: move to higher memory */ @@ -102,6 +108,13 @@ #define SHARED_REGION_NESTING_SIZE SHARED_REGION_NESTING_SIZE_ARM #define SHARED_REGION_NESTING_MIN SHARED_REGION_NESTING_MIN_ARM #define SHARED_REGION_NESTING_MAX SHARED_REGION_NESTING_MAX_ARM +#elif defined(__arm64__) && !defined(__LP64__) +#define SHARED_REGION_BASE SHARED_REGION_BASE_ARM64_32 +#define SHARED_REGION_SIZE SHARED_REGION_SIZE_ARM64_32 +#define SHARED_REGION_NESTING_BASE SHARED_REGION_NESTING_BASE_ARM64_32 +#define SHARED_REGION_NESTING_SIZE SHARED_REGION_NESTING_SIZE_ARM64_32 +#define SHARED_REGION_NESTING_MIN SHARED_REGION_NESTING_MIN_ARM64_32 +#define SHARED_REGION_NESTING_MAX SHARED_REGION_NESTING_MAX_ARM64_32 #elif defined(__arm64__) && defined(__LP64__) #define SHARED_REGION_BASE SHARED_REGION_BASE_ARM64 #define SHARED_REGION_SIZE SHARED_REGION_SIZE_ARM64 diff --git a/osfmk/mach/syscall_sw.h b/osfmk/mach/syscall_sw.h index b8c1c4d32..f1e419809 100644 --- a/osfmk/mach/syscall_sw.h +++ b/osfmk/mach/syscall_sw.h @@ -151,6 +151,10 @@ kernel_trap(mach_voucher_extract_attr_recipe_trap,-72,4) /* mach_voucher_attr_command */ /* mach_voucher_debug_info */ +/* more mach_port traps */ +kernel_trap(_kernelrpc_mach_port_type_trap,-76,3) +kernel_trap(_kernelrpc_mach_port_request_notification_trap,-77,7) + kernel_trap(mach_timebase_info_trap,-89,1) #if defined(__LP64__) @@ -176,6 +180,8 @@ kernel_trap(mk_timer_arm_leeway,-95,4) #else kernel_trap(mk_timer_arm_leeway,-95,7) #endif +kernel_trap(debug_control_port_for_pid,-96,3) + /* * N.B: Trap #-100 is in use by IOTrap.s in the IOKit Framework * (iokit_user_client_trap) diff --git a/osfmk/mach/task.defs b/osfmk/mach/task.defs index 5ac64e7d5..378fe2039 100644 --- a/osfmk/mach/task.defs +++ b/osfmk/mach/task.defs @@ -504,5 +504,13 @@ routine task_inspect( flavor : task_inspect_flavor_t; out info_out : task_inspect_info_t, CountInOut); +routine task_get_exc_guard_behavior( + task : task_inspect_t; + out behavior : task_exc_guard_behavior_t); + +routine task_set_exc_guard_behavior( + task : task_t; + behavior : task_exc_guard_behavior_t); + /* vim: set ft=c : */ diff --git a/osfmk/mach/task_info.h b/osfmk/mach/task_info.h index 1248749f1..93fa357c9 100644 --- a/osfmk/mach/task_info.h +++ b/osfmk/mach/task_info.h @@ -88,7 +88,7 @@ typedef integer_t task_info_data_t[TASK_INFO_MAX]; * Currently defined information structures. */ -#pragma pack(4) +#pragma pack(push, 4) /* Don't use this, use MACH_TASK_BASIC_INFO instead */ #define TASK_BASIC_INFO_32 4 /* basic information */ @@ -390,12 +390,47 @@ struct task_vm_info { /* added for rev2 */ mach_vm_address_t min_address; mach_vm_address_t max_address; + + /* added for rev3 */ + int64_t ledger_phys_footprint_peak; + int64_t ledger_purgeable_nonvolatile; + int64_t ledger_purgeable_novolatile_compressed; + int64_t ledger_purgeable_volatile; + int64_t ledger_purgeable_volatile_compressed; + int64_t ledger_tag_network_nonvolatile; + int64_t ledger_tag_network_nonvolatile_compressed; + int64_t ledger_tag_network_volatile; + int64_t ledger_tag_network_volatile_compressed; + int64_t ledger_tag_media_footprint; + int64_t ledger_tag_media_footprint_compressed; + int64_t ledger_tag_media_nofootprint; + int64_t ledger_tag_media_nofootprint_compressed; + int64_t ledger_tag_graphics_footprint; + int64_t ledger_tag_graphics_footprint_compressed; + int64_t ledger_tag_graphics_nofootprint; + int64_t ledger_tag_graphics_nofootprint_compressed; + int64_t ledger_tag_neural_footprint; + int64_t ledger_tag_neural_footprint_compressed; + int64_t ledger_tag_neural_nofootprint; + int64_t ledger_tag_neural_nofootprint_compressed; + + /* added for rev4 */ + uint64_t limit_bytes_remaining; + + /* added for rev5 */ + integer_t decompressions; }; typedef struct task_vm_info task_vm_info_data_t; typedef struct task_vm_info *task_vm_info_t; #define TASK_VM_INFO_COUNT ((mach_msg_type_number_t) \ (sizeof (task_vm_info_data_t) / sizeof (natural_t))) -#define TASK_VM_INFO_REV2_COUNT TASK_VM_INFO_COUNT +#define TASK_VM_INFO_REV5_COUNT TASK_VM_INFO_COUNT +#define TASK_VM_INFO_REV4_COUNT /* doesn't include decompressions */ \ + ((mach_msg_type_number_t) (TASK_VM_INFO_REV5_COUNT - 1)) +#define TASK_VM_INFO_REV3_COUNT /* doesn't include limit bytes */ \ + ((mach_msg_type_number_t) (TASK_VM_INFO_REV4_COUNT - 2)) +#define TASK_VM_INFO_REV2_COUNT /* doesn't include extra ledgers info */ \ + ((mach_msg_type_number_t) (TASK_VM_INFO_REV3_COUNT - 42)) #define TASK_VM_INFO_REV1_COUNT /* doesn't include min and max address */ \ ((mach_msg_type_number_t) (TASK_VM_INFO_REV2_COUNT - 4)) #define TASK_VM_INFO_REV0_COUNT /* doesn't include phys_footprint */ \ @@ -496,6 +531,35 @@ typedef struct task_debug_info_internal task_debug_info_internal_data_t; #endif /* PRIVATE */ +/* + * Type to control EXC_GUARD delivery options for a task + * via task_get/set_exc_guard_behavior interface(s). + */ +typedef uint32_t task_exc_guard_behavior_t; + +/* EXC_GUARD optional delivery settings on a per-task basis */ +#define TASK_EXC_GUARD_VM_DELIVER 0x01 /* Deliver virtual memory EXC_GUARD exceptions */ +#define TASK_EXC_GUARD_VM_ONCE 0x02 /* Deliver them only once */ +#define TASK_EXC_GUARD_VM_CORPSE 0x04 /* Deliver them via a forked corpse */ +#define TASK_EXC_GUARD_VM_FATAL 0x08 /* Virtual Memory EXC_GUARD delivery is fatal */ +#define TASK_EXC_GUARD_VM_ALL 0x0f + +#define TASK_EXC_GUARD_MP_DELIVER 0x10 /* Deliver mach port EXC_GUARD exceptions */ +#define TASK_EXC_GUARD_MP_ONCE 0x20 /* Deliver them only once */ +#define TASK_EXC_GUARD_MP_CORPSE 0x40 /* Deliver them via a forked corpse */ +#define TASK_EXC_GUARD_MP_FATAL 0x80 /* mach port EXC_GUARD delivery is fatal */ +#define TASK_EXC_GUARD_MP_ALL 0xf0 + +#define TASK_EXC_GUARD_ALL 0xff /* All optional deliver settings */ + +#ifdef PRIVATE +/* + * Experimental mode of setting default guard behavior for non-Apple processes + * The default for 3rd party guards is shifted up 8 bits - but otherwise the same values as above. + */ +#define TASK_EXC_GUARD_THIRD_PARTY_DEFAULT_SHIFT 0x8 /* 3rd party default shifted up in boot-arg */ +#endif + /* * Obsolete interfaces. */ @@ -506,6 +570,6 @@ typedef struct task_debug_info_internal task_debug_info_internal_data_t; #define TASK_SCHED_INFO 14 -#pragma pack() +#pragma pack(pop) #endif /* _MACH_TASK_INFO_H_ */ diff --git a/osfmk/mach/task_policy.h b/osfmk/mach/task_policy.h index f1b7cc0c5..6aa6e9180 100644 --- a/osfmk/mach/task_policy.h +++ b/osfmk/mach/task_policy.h @@ -337,6 +337,7 @@ typedef struct task_policy_state *task_policy_state_t; #define TASK_APPTYPE_DAEMON_BACKGROUND 4 #define TASK_APPTYPE_APP_DEFAULT 5 #define TASK_APPTYPE_APP_TAL 6 +#define TASK_APPTYPE_DRIVER 7 /* task policy state flags */ #define TASK_IMP_RECEIVER 0x00000001 diff --git a/osfmk/mach/task_special_ports.h b/osfmk/mach/task_special_ports.h index 779071686..d1e5ec465 100644 --- a/osfmk/mach/task_special_ports.h +++ b/osfmk/mach/task_special_ports.h @@ -128,4 +128,8 @@ typedef int task_special_port_t; #define task_set_task_debug_control_port(task, port) \ (task_set_special_port((task), TASK_DEBUG_CONTROL_PORT, (port))) +#ifdef XNU_KERNEL_PRIVATE +#define DEBUG_PORT_ENTITLEMENT "com.apple.private.debug_port" +#endif /* XNU_KERNEL_PRIVATE */ + #endif /* _MACH_TASK_SPECIAL_PORTS_H_ */ diff --git a/osfmk/mach/thread_policy.h b/osfmk/mach/thread_policy.h index 7f6ac49ff..b0c82bdc5 100644 --- a/osfmk/mach/thread_policy.h +++ b/osfmk/mach/thread_policy.h @@ -375,6 +375,9 @@ typedef struct thread_qos_policy *thread_qos_policy_t; #define THREAD_POLICY_INTERNAL_STRUCT_VERSION 5 +// legacy names +#define thrp_qos_ipc_override thrp_qos_kevent_override + struct thread_requested_policy { uint64_t thrp_int_darwinbg :1, /* marked as darwinbg via setpriority */ thrp_ext_darwinbg :1, @@ -390,12 +393,13 @@ struct thread_requested_policy { thrp_qos_relprio :4, /* thread qos relative priority (store as inverse, -10 -> 0xA) */ thrp_qos_override :3, /* thread qos class override */ thrp_qos_promote :3, /* thread qos class from promotion */ - thrp_qos_ipc_override :3, /* thread qos class from ipc override */ + thrp_qos_kevent_override:3, /* thread qos class from kevent override */ thrp_terminated :1, /* heading for termination */ thrp_qos_sync_ipc_override:3, /* now unused */ thrp_qos_workq_override :3, /* thread qos class override (workq) */ + thrp_qos_wlsvc_override :3, /* workloop servicer qos class override */ - thrp_reserved :26; + thrp_reserved :23; }; struct thread_effective_policy { diff --git a/osfmk/mach/thread_status.h b/osfmk/mach/thread_status.h index 90ff1e0cb..a91b936eb 100644 --- a/osfmk/mach/thread_status.h +++ b/osfmk/mach/thread_status.h @@ -89,6 +89,7 @@ typedef natural_t thread_state_data_t[THREAD_STATE_MAX]; #define THREAD_STATE_FLAVOR_LIST_NEW 128 #define THREAD_STATE_FLAVOR_LIST_10_9 129 #define THREAD_STATE_FLAVOR_LIST_10_13 130 +#define THREAD_STATE_FLAVOR_LIST_10_15 131 typedef int thread_state_flavor_t; typedef thread_state_flavor_t *thread_state_flavor_array_t; diff --git a/osfmk/mach/vfs_nspace.defs b/osfmk/mach/vfs_nspace.defs new file mode 100644 index 000000000..aaca5bf61 --- /dev/null +++ b/osfmk/mach/vfs_nspace.defs @@ -0,0 +1,68 @@ +/* + * Copyright (c) 2013 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +/* + * Interface definition for the namespace facility. + */ + +subsystem +#if KERNEL_USER + KernelUser +#endif /* KERNEL_USER */ + vfs_nspace 867800; /* 'V''N'00 */ + +#include +#include +#include + +ServerPrefix receive_; +UserPrefix send_; + +/* DEPRECATED */ +routine nspace_handle( + nspace_handler_port : mach_port_t; + pid : uint32_t; + in path : vfs_path_t; + out handler_error : int +); + +routine nspace_resolve_cancel( + nspace_handler_port : mach_port_t; + req_id : uint32_t +); + +routine nspace_resolve_path( + nspace_handler_port : mach_port_t; + req_id : uint32_t; + pid : uint32_t; + op : uint32_t; + in path : nspace_path_t; + out xxx_rdar44371223 : int +); + +/* vim: set ft=c : */ diff --git a/osfmk/mach/vm_param.h b/osfmk/mach/vm_param.h index 2bb038e21..deef9ffd8 100644 --- a/osfmk/mach/vm_param.h +++ b/osfmk/mach/vm_param.h @@ -327,7 +327,12 @@ extern vm_offset_t vm_kernel_builtinkmod_text_end; */ __BEGIN_DECLS +#if XNU_KERNEL_PRIVATE +extern vm_offset_t vm_kernel_addrhash(vm_offset_t addr) +__XNU_INTERNAL(vm_kernel_addrhash); +#else extern vm_offset_t vm_kernel_addrhash(vm_offset_t addr); +#endif __END_DECLS #define __DO_UNSLIDE(_v) ((vm_offset_t)VM_KERNEL_STRIP_PTR(_v) - vm_kernel_slide) diff --git a/osfmk/mach/vm_region.h b/osfmk/mach/vm_region.h index 416699cb8..21abcad6e 100644 --- a/osfmk/mach/vm_region.h +++ b/osfmk/mach/vm_region.h @@ -50,7 +50,7 @@ #include -#pragma pack(4) +#pragma pack(push, 4) // LP64todo: all the current tools are 32bit, obviously never worked for 64b // so probably should be a real 32b ID vs. ptr. @@ -270,17 +270,24 @@ struct vm_region_submap_info_64 { vm32_object_id_t object_id; /* obj/map name, not a handle */ unsigned short user_wired_count; unsigned int pages_reusable; + vm_object_id_t object_id_full; }; typedef struct vm_region_submap_info_64 *vm_region_submap_info_64_t; typedef struct vm_region_submap_info_64 vm_region_submap_info_data_64_t; -#define VM_REGION_SUBMAP_INFO_V1_SIZE \ +#define VM_REGION_SUBMAP_INFO_V2_SIZE \ (sizeof (vm_region_submap_info_data_64_t)) +#define VM_REGION_SUBMAP_INFO_V1_SIZE \ + (VM_REGION_SUBMAP_INFO_V2_SIZE - \ + sizeof (vm_object_id_t) /* object_id_full */ ) #define VM_REGION_SUBMAP_INFO_V0_SIZE \ (VM_REGION_SUBMAP_INFO_V1_SIZE - \ sizeof (unsigned int) /* pages_reusable */ ) +#define VM_REGION_SUBMAP_INFO_V2_COUNT_64 \ + ((mach_msg_type_number_t) \ + (VM_REGION_SUBMAP_INFO_V2_SIZE / sizeof (natural_t))) #define VM_REGION_SUBMAP_INFO_V1_COUNT_64 \ ((mach_msg_type_number_t) \ (VM_REGION_SUBMAP_INFO_V1_SIZE / sizeof (natural_t))) @@ -289,7 +296,7 @@ typedef struct vm_region_submap_info_64 vm_region_submap_info_data_64_t (VM_REGION_SUBMAP_INFO_V0_SIZE / sizeof (natural_t))) /* set this to the latest version */ -#define VM_REGION_SUBMAP_INFO_COUNT_64 VM_REGION_SUBMAP_INFO_V1_COUNT_64 +#define VM_REGION_SUBMAP_INFO_COUNT_64 VM_REGION_SUBMAP_INFO_V2_COUNT_64 struct vm_region_submap_short_info_64 { vm_prot_t protection; /* present access protection */ @@ -314,8 +321,6 @@ typedef struct vm_region_submap_short_info_64 vm_region_submap_short_info_dat ((mach_msg_type_number_t) \ (sizeof (vm_region_submap_short_info_data_64_t) / sizeof (natural_t))) - - struct mach_vm_read_entry { mach_vm_address_t address; mach_vm_size_t size; @@ -342,7 +347,7 @@ typedef struct vm_read_entry vm_read_entry_t[VM_MAP_ENTRY_MAX]; typedef struct vm32_read_entry vm32_read_entry_t[VM_MAP_ENTRY_MAX]; #endif -#pragma pack() +#pragma pack(pop) #define VM_PAGE_INFO_MAX diff --git a/osfmk/mach/vm_statistics.h b/osfmk/mach/vm_statistics.h index 9e72c81d2..267d5df2f 100644 --- a/osfmk/mach/vm_statistics.h +++ b/osfmk/mach/vm_statistics.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2016 Apple Inc. All rights reserved. + * Copyright (c) 2000-2019 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -223,6 +223,7 @@ typedef struct vm_purgeable_info *vm_purgeable_info_t; #define VM_PAGE_QUERY_PAGE_CS_VALIDATED 0x100 #define VM_PAGE_QUERY_PAGE_CS_TAINTED 0x200 #define VM_PAGE_QUERY_PAGE_CS_NX 0x400 +#define VM_PAGE_QUERY_PAGE_REUSABLE 0x800 #ifdef MACH_KERNEL_PRIVATE @@ -333,12 +334,13 @@ typedef struct pmap_statistics *pmap_statistics_t; #define VM_FLAGS_USER_MAP (VM_FLAGS_USER_ALLOCATE | \ VM_FLAGS_RETURN_4K_DATA_ADDR | \ VM_FLAGS_RETURN_DATA_ADDR) -#define VM_FLAGS_USER_REMAP (VM_FLAGS_FIXED | \ - VM_FLAGS_ANYWHERE | \ - VM_FLAGS_RANDOM_ADDR | \ - VM_FLAGS_OVERWRITE| \ - VM_FLAGS_RETURN_DATA_ADDR |\ - VM_FLAGS_RESILIENT_CODESIGN) +#define VM_FLAGS_USER_REMAP (VM_FLAGS_FIXED | \ + VM_FLAGS_ANYWHERE | \ + VM_FLAGS_RANDOM_ADDR | \ + VM_FLAGS_OVERWRITE| \ + VM_FLAGS_RETURN_DATA_ADDR | \ + VM_FLAGS_RESILIENT_CODESIGN | \ + VM_FLAGS_RESILIENT_MEDIA) #define VM_FLAGS_SUPERPAGE_SHIFT 16 #define SUPERPAGE_NONE 0 /* no superpages, if all bits are 0 */ @@ -379,7 +381,14 @@ typedef struct { vmkf_remap_prot_copy:1, vmkf_cs_enforcement_override:1, vmkf_cs_enforcement:1, - __vmkf_unused:16; + vmkf_nested_pmap:1, + vmkf_no_copy_on_read:1, +#if !defined(CONFIG_EMBEDDED) + vmkf_32bit_map_va:1, + __vmkf_unused:13; +#else + __vmkf_unused:14; +#endif } vm_map_kernel_flags_t; #define VM_MAP_KERNEL_FLAGS_NONE (vm_map_kernel_flags_t) { \ .vmkf_atomic_entry = 0, /* keep entry atomic (no coalescing) */ \ @@ -398,10 +407,39 @@ typedef struct { .vmkf_remap_prot_copy = 0, /* vm_remap for VM_PROT_COPY */ \ .vmkf_cs_enforcement_override = 0, /* override CS_ENFORCEMENT */ \ .vmkf_cs_enforcement = 0, /* new value for CS_ENFORCEMENT */ \ + .vmkf_nested_pmap = 0, /* use a nested pmap */ \ + .vmkf_no_copy_on_read = 0, /* do not use copy_on_read */ \ .__vmkf_unused = 0 \ } + +typedef struct { + unsigned int + vmnekf_ledger_tag:3, + vmnekf_ledger_no_footprint:1, + __vmnekf_unused:28; +} vm_named_entry_kernel_flags_t; +#define VM_NAMED_ENTRY_KERNEL_FLAGS_NONE (vm_named_entry_kernel_flags_t) { \ + .vmnekf_ledger_tag = 0, \ + .vmnekf_ledger_no_footprint = 0, \ + .__vmnekf_unused = 0 \ +} + #endif /* KERNEL_PRIVATE */ +/* current accounting postmark */ +#define __VM_LEDGER_ACCOUNTING_POSTMARK 2019032600 + +/* discrete values: */ +#define VM_LEDGER_TAG_NONE 0x00000000 +#define VM_LEDGER_TAG_DEFAULT 0x00000001 +#define VM_LEDGER_TAG_NETWORK 0x00000002 +#define VM_LEDGER_TAG_MEDIA 0x00000003 +#define VM_LEDGER_TAG_GRAPHICS 0x00000004 +#define VM_LEDGER_TAG_NEURAL 0x00000005 +#define VM_LEDGER_TAG_MAX 0x00000005 +/* individual bits: */ +#define VM_LEDGER_FLAG_NO_FOOTPRINT 0x00000001 +#define VM_LEDGER_FLAGS (VM_LEDGER_FLAG_NO_FOOTPRINT) #define VM_MEMORY_MALLOC 1 @@ -569,6 +607,15 @@ typedef struct { /* memory allocated by Accounts framework */ #define VM_MEMORY_ACCOUNTS 98 +/* memory allocated by Sanitizer runtime libraries */ +#define VM_MEMORY_SANITIZER 99 + +/* Differentiate memory needed by GPU drivers and frameworks from generic IOKit allocations */ +#define VM_MEMORY_IOACCELERATOR 100 + +/* memory allocated by CoreMedia for global image registration of frames */ +#define VM_MEMORY_CM_REGWARP 101 + /* Reserve 240-255 for application */ #define VM_MEMORY_APPLICATION_SPECIFIC_1 240 #define VM_MEMORY_APPLICATION_SPECIFIC_16 255 diff --git a/osfmk/mach/vm_types.h b/osfmk/mach/vm_types.h index 057c00533..95eaafd5e 100644 --- a/osfmk/mach/vm_types.h +++ b/osfmk/mach/vm_types.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2004 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2018 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -95,8 +95,8 @@ typedef struct _vm_map *vm_map_t; typedef struct vm_object *vm_object_t; typedef struct vm_object_fault_info *vm_object_fault_info_t; -#define PMAP_NULL ((pmap_t) 0) -#define VM_OBJECT_NULL ((vm_object_t) 0) +#define PMAP_NULL ((pmap_t) NULL) +#define VM_OBJECT_NULL ((vm_object_t) NULL) #else /* KERNEL_PRIVATE */ @@ -104,7 +104,11 @@ typedef mach_port_t vm_map_t; #endif /* KERNEL_PRIVATE */ +#ifdef KERNEL +#define VM_MAP_NULL ((vm_map_t) NULL) +#else #define VM_MAP_NULL ((vm_map_t) 0) +#endif /* * Evolving definitions, likely to change. @@ -166,7 +170,7 @@ struct vm_allocation_site { uint16_t flags; uint16_t subtotalscount; struct vm_allocation_total subtotals[0]; - char name[0]; + /* char name[0]; -- this is placed after subtotals, see KA_NAME() */ }; typedef struct vm_allocation_site vm_allocation_site_t; @@ -197,7 +201,7 @@ typedef struct upl *upl_t; typedef struct vm_map_copy *vm_map_copy_t; typedef struct vm_named_entry *vm_named_entry_t; -#define VM_MAP_COPY_NULL ((vm_map_copy_t) 0) +#define VM_MAP_COPY_NULL ((vm_map_copy_t) NULL) #else /* KERNEL_PRIVATE */ @@ -206,8 +210,14 @@ typedef mach_port_t vm_named_entry_t; #endif /* KERNEL_PRIVATE */ +#ifdef KERNEL +#define UPL_NULL ((upl_t) NULL) +#define VM_NAMED_ENTRY_NULL ((vm_named_entry_t) NULL) +#else #define UPL_NULL ((upl_t) 0) #define VM_NAMED_ENTRY_NULL ((vm_named_entry_t) 0) +#endif + #ifdef PRIVATE typedef struct { uint64_t rtfabstime; // mach_continuous_time at start of fault diff --git a/osfmk/machine/Makefile b/osfmk/machine/Makefile index 8542493e9..0e31820d1 100644 --- a/osfmk/machine/Makefile +++ b/osfmk/machine/Makefile @@ -10,7 +10,9 @@ PRIVATE_DATAFILES = \ cpu_capabilities.h KERNELFILES = \ + atomic_impl.h \ atomic.h \ + config.h \ cpu_capabilities.h \ cpu_number.h \ io_map_entries.h \ @@ -20,10 +22,12 @@ KERNELFILES = \ machine_remote_time.h \ machine_routines.h \ machine_kpc.h \ + memory_types.h \ monotonic.h \ pal_routines.h \ pal_hibernate.h \ - simple_lock.h + simple_lock.h \ + smp.h EXPORT_FILES = \ machine_remote_time.h diff --git a/osfmk/machine/atomic.h b/osfmk/machine/atomic.h index 3c3676248..ab11c7004 100644 --- a/osfmk/machine/atomic.h +++ b/osfmk/machine/atomic.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015 Apple Inc. All rights reserved. + * Copyright (c) 2015-2018 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -29,82 +29,735 @@ #ifndef _MACHINE_ATOMIC_H #define _MACHINE_ATOMIC_H -#include +/* + * Internal implementation details are in a separate header + */ +#include + +/*! + * @file + * + * @brief + * This file defines nicer (terser and safer) wrappers for C11's . + * + * @discussion + * @see xnu.git::doc/atomics.md which provides more extensive documentation + * about this header. + * + * Note that some of the macros defined in this file may be overridden by + * architecture specific headers. + * + * All the os_atomic* functions take an operation ordering argument that can be: + * - C11 memory orders: relaxed, acquire, release, acq_rel or seq_cst which + * imply a memory fence on SMP machines, and always carry the matching + * compiler barrier semantics. + * + * - the os_atomic-specific `dependency` memory ordering that is used to + * document intent to a carry a data or address dependency. + * See doc/atomics.md for more information. + * + * - a compiler barrier: compiler_acquire, compiler_release, compiler_acq_rel + * without a corresponding memory fence. + */ + +/*! + * @function os_compiler_barrier + * + * @brief + * Provide a compiler barrier according to the specified ordering. + * + * @param m + * An optional ordering among `acquire`, `release` or `acq_rel` which defaults + * to `acq_rel` when not specified. + * These are equivalent to the `compiler_acquire`, `compiler_release` and + * `compiler_acq_rel` orderings taken by the os_atomic* functions + */ +#define os_compiler_barrier(b...) \ + atomic_signal_fence(_os_compiler_barrier_##b) + +/*! + * @function os_atomic_thread_fence + * + * @brief + * Memory fence which is elided in non-SMP mode, but always carries the + * corresponding compiler barrier. + * + * @param m + * The ordering for this fence. + */ +#define os_atomic_thread_fence(m) ({ \ + atomic_thread_fence(memory_order_##m##_smp); \ + atomic_signal_fence(memory_order_##m); \ +}) -#define _os_atomic_c11_atomic(p) \ - ((typeof(*(p)) _Atomic *)(p)) +/*! + * @function os_atomic_init + * + * @brief + * Wrapper for C11 atomic_init() + * + * @discussion + * This initialization is not performed atomically, and so must only be used as + * part of object initialization before the object is made visible to other + * threads/cores. + * + * @param p + * A pointer to an atomic variable. + * + * @param v + * The value to initialize the variable with. + * + * @returns + * The value loaded from @a p. + */ +#define os_atomic_init(p, v) \ + atomic_init(_os_atomic_c11_atomic(p), v) -#define _os_atomic_basetypeof(p) \ - typeof(atomic_load(((typeof(*(p)) _Atomic *)(p)))) +/*! + * @function os_atomic_load_is_plain, os_atomic_store_is_plain + * + * @brief + * Return whether a relaxed atomic load (resp. store) to an atomic variable + * is implemented as a single plain load (resp. store) instruction. + * + * @discussion + * Non-relaxed loads/stores may involve additional memory fence instructions + * or more complex atomic instructions. + * + * This is a construct that can safely be used in static asserts. + * + * @param p + * A pointer to an atomic variable. + * + * @returns + * True when relaxed atomic loads (resp. stores) compile to a plain load + * (resp. store) instruction, false otherwise. + */ +#define os_atomic_load_is_plain(p) (sizeof(*(p)) <= sizeof(void *)) +#define os_atomic_store_is_plain(p) os_atomic_load_is_plain(p) -#define _os_atomic_c11_op_orig(p, v, m, o) \ - atomic_##o##_explicit(_os_atomic_c11_atomic(p), v, \ - memory_order_##m) +/*! + * @function os_atomic_load + * + * @brief + * Wrapper for C11 atomic_load_explicit(), guaranteed to compile to a single + * plain load instruction (when @a m is `relaxed`). + * + * @param p + * A pointer to an atomic variable. + * + * @param m + * The ordering to use. + * + * @returns + * The value loaded from @a p. + */ +#define os_atomic_load(p, m) ({ \ + _Static_assert(os_atomic_load_is_plain(p), "Load is wide"); \ + _os_atomic_basetypeof(p) _r; \ + _os_compiler_barrier_before_atomic(m); \ + _r = atomic_load_explicit(_os_atomic_c11_atomic(p), \ + memory_order_##m##_smp); \ + _os_compiler_barrier_after_atomic(m); \ + _r; \ +}) -#define _os_atomic_c11_op(p, v, m, o, op) \ - ({ typeof(v) _v = (v); _os_atomic_c11_op_orig(p, v, m, o) op _v; }) +/*! + * @function os_atomic_load_wide + * + * @brief + * Wrapper for C11 atomic_load_explicit(), which may be implemented by a + * compare-exchange loop for double-wide variables. + * + * @param p + * A pointer to an atomic variable. + * + * @param m + * The ordering to use. + * + * @returns + * The value loaded from @a p. + */ +#define os_atomic_load_wide(p, m) ({ \ + _os_atomic_basetypeof(p) _r; \ + _os_compiler_barrier_before_atomic(m); \ + _r = atomic_load_explicit(_os_atomic_c11_atomic(p), \ + memory_order_##m##_smp); \ + _os_compiler_barrier_after_atomic(m); \ + _r; \ +}) -#define os_atomic_thread_fence(m) atomic_thread_fence(memory_order_##m) +/*! + * @function os_atomic_store + * + * @brief + * Wrapper for C11 atomic_store_explicit(), guaranteed to compile to a single + * plain store instruction (when @a m is `relaxed`). + * + * @param p + * A pointer to an atomic variable. + * + * @param v + * The value to store. + * + * @param m + * The ordering to use. + * + * @returns + * The value stored at @a p. + */ +#define os_atomic_store(p, v, m) ({ \ + _Static_assert(os_atomic_store_is_plain(p), "Store is wide"); \ + _os_atomic_basetypeof(p) _v = (v); \ + _os_compiler_barrier_before_atomic(m); \ + atomic_store_explicit(_os_atomic_c11_atomic(p), _v, \ + memory_order_##m##_smp); \ + _os_compiler_barrier_after_atomic(m); \ + _v; \ +}) -#define os_atomic_load(p, m) \ - atomic_load_explicit(_os_atomic_c11_atomic(p), memory_order_##m) -#define os_atomic_store(p, v, m) _os_atomic_c11_op_orig(p, v, m, store) +/*! + * @function os_atomic_store_wide + * + * @brief + * Wrapper for C11 atomic_store_explicit(), which may be implemented by a + * compare-exchange loop for double-wide variables. + * + * @param p + * A pointer to an atomic variable. + * + * @param v + * The value to store. + * + * @param m + * The ordering to use. + * + * @returns + * The value stored at @a p. + */ +#define os_atomic_store_wide(p, v, m) ({ \ + _os_atomic_basetypeof(p) _v = (v); \ + _os_compiler_barrier_before_atomic(m); \ + atomic_store_explicit(_os_atomic_c11_atomic(p), _v, \ + memory_order_##m##_smp); \ + _os_compiler_barrier_after_atomic(m); \ + _v; \ +}) +/*! + * @function os_atomic_add, os_atomic_add_orig + * + * @brief + * Wrappers for C11 atomic_fetch_add_explicit(). + * + * @param p + * A pointer to an atomic variable. + * + * @param v + * The value to add. + * + * @param m + * The ordering to use. + * + * @returns + * os_atomic_add_orig returns the value of the variable before the atomic add, + * os_atomic_add returns the value of the variable after the atomic add. + */ #define os_atomic_add_orig(p, v, m) _os_atomic_c11_op_orig(p, v, m, fetch_add) #define os_atomic_add(p, v, m) _os_atomic_c11_op(p, v, m, fetch_add, +) +/*! + * @function os_atomic_inc, os_atomic_inc_orig + * + * @brief + * Perform an atomic increment. + * + * @param p + * A pointer to an atomic variable. + * + * @param m + * The ordering to use. + * + * @returns + * os_atomic_inc_orig returns the value of the variable before the atomic increment, + * os_atomic_inc returns the value of the variable after the atomic increment. + */ #define os_atomic_inc_orig(p, m) _os_atomic_c11_op_orig(p, 1, m, fetch_add) #define os_atomic_inc(p, m) _os_atomic_c11_op(p, 1, m, fetch_add, +) +/*! + * @function os_atomic_sub, os_atomic_sub_orig + * + * @brief + * Wrappers for C11 atomic_fetch_sub_explicit(). + * + * @param p + * A pointer to an atomic variable. + * + * @param v + * The value to subtract. + * + * @param m + * The ordering to use. + * + * @returns + * os_atomic_sub_orig returns the value of the variable before the atomic subtract, + * os_atomic_sub returns the value of the variable after the atomic subtract. + */ #define os_atomic_sub_orig(p, v, m) _os_atomic_c11_op_orig(p, v, m, fetch_sub) #define os_atomic_sub(p, v, m) _os_atomic_c11_op(p, v, m, fetch_sub, -) +/*! + * @function os_atomic_dec, os_atomic_dec_orig + * + * @brief + * Perform an atomic decrement. + * + * @param p + * A pointer to an atomic variable. + * + * @param m + * The ordering to use. + * + * @returns + * os_atomic_dec_orig returns the value of the variable before the atomic decrement, + * os_atomic_dec returns the value of the variable after the atomic decrement. + */ #define os_atomic_dec_orig(p, m) _os_atomic_c11_op_orig(p, 1, m, fetch_sub) #define os_atomic_dec(p, m) _os_atomic_c11_op(p, 1, m, fetch_sub, -) +/*! + * @function os_atomic_and, os_atomic_and_orig + * + * @brief + * Wrappers for C11 atomic_fetch_and_explicit(). + * + * @param p + * A pointer to an atomic variable. + * + * @param v + * The value to and. + * + * @param m + * The ordering to use. + * + * @returns + * os_atomic_and_orig returns the value of the variable before the atomic and, + * os_atomic_and returns the value of the variable after the atomic and. + */ #define os_atomic_and_orig(p, v, m) _os_atomic_c11_op_orig(p, v, m, fetch_and) #define os_atomic_and(p, v, m) _os_atomic_c11_op(p, v, m, fetch_and, &) +/*! + * @function os_atomic_andnot, os_atomic_andnot_orig + * + * @brief + * Wrappers for C11 atomic_fetch_and_explicit(p, ~value). + * + * @param p + * A pointer to an atomic variable. + * + * @param v + * The value whose complement to and. + * + * @param m + * The ordering to use. + * + * @returns + * os_atomic_andnot_orig returns the value of the variable before the atomic andnot, + * os_atomic_andnot returns the value of the variable after the atomic andnot. + */ +#define os_atomic_andnot_orig(p, v, m) _os_atomic_c11_op_orig(p, ~(v), m, fetch_and) +#define os_atomic_andnot(p, v, m) _os_atomic_c11_op(p, ~(v), m, fetch_and, &) + +/*! + * @function os_atomic_or, os_atomic_or_orig + * + * @brief + * Wrappers for C11 atomic_fetch_or_explicit(). + * + * @param p + * A pointer to an atomic variable. + * + * @param v + * The value to or. + * + * @param m + * The ordering to use. + * + * @returns + * os_atomic_or_orig returns the value of the variable before the atomic or, + * os_atomic_or returns the value of the variable after the atomic or. + */ #define os_atomic_or_orig(p, v, m) _os_atomic_c11_op_orig(p, v, m, fetch_or) #define os_atomic_or(p, v, m) _os_atomic_c11_op(p, v, m, fetch_or, |) +/*! + * @function os_atomic_xor, os_atomic_xor_orig + * + * @brief + * Wrappers for C11 atomic_fetch_xor_explicit(). + * + * @param p + * A pointer to an atomic variable. + * + * @param v + * The value to xor. + * + * @param m + * The ordering to use. + * + * @returns + * os_atomic_xor_orig returns the value of the variable before the atomic xor, + * os_atomic_xor returns the value of the variable after the atomic xor. + */ #define os_atomic_xor_orig(p, v, m) _os_atomic_c11_op_orig(p, v, m, fetch_xor) #define os_atomic_xor(p, v, m) _os_atomic_c11_op(p, v, m, fetch_xor, ^) +/*! + * @function os_atomic_min, os_atomic_min_orig + * + * @brief + * Wrappers for Clang's __atomic_fetch_min() + * + * @param p + * A pointer to an atomic variable. + * + * @param v + * The value to minimize. + * + * @param m + * The ordering to use. + * + * @returns + * os_atomic_min_orig returns the value of the variable before the atomic min, + * os_atomic_min returns the value of the variable after the atomic min. + */ +#define os_atomic_min_orig(p, v, m) _os_atomic_clang_op_orig(p, v, m, fetch_min) +#define os_atomic_min(p, v, m) _os_atomic_clang_op(p, v, m, fetch_min, MIN) + +/*! + * @function os_atomic_max, os_atomic_max_orig + * + * @brief + * Wrappers for Clang's __atomic_fetch_max() + * + * @param p + * A pointer to an atomic variable. + * + * @param v + * The value to maximize. + * + * @param m + * The ordering to use. + * + * @returns + * os_atomic_max_orig returns the value of the variable before the atomic max, + * os_atomic_max returns the value of the variable after the atomic max. + */ +#define os_atomic_max_orig(p, v, m) _os_atomic_clang_op_orig(p, v, m, fetch_max) +#define os_atomic_max(p, v, m) _os_atomic_clang_op(p, v, m, fetch_max, MAX) + +/*! + * @function os_atomic_xchg + * + * @brief + * Wrapper for C11 atomic_exchange_explicit(). + * + * @param p + * A pointer to an atomic variable. + * + * @param v + * The value to exchange with. + * + * @param m + * The ordering to use. + * + * @returns + * The value of the variable before the exchange. + */ #define os_atomic_xchg(p, v, m) _os_atomic_c11_op_orig(p, v, m, exchange) -#define os_atomic_cmpxchg(p, e, v, m) \ - ({ _os_atomic_basetypeof(p) _r = (e); \ - atomic_compare_exchange_strong_explicit(_os_atomic_c11_atomic(p), \ - &_r, v, memory_order_##m, memory_order_relaxed); }) -#define os_atomic_cmpxchgv(p, e, v, g, m) \ - ({ _os_atomic_basetypeof(p) _r = (e); int _b = \ - atomic_compare_exchange_strong_explicit(_os_atomic_c11_atomic(p), \ - &_r, v, memory_order_##m, memory_order_relaxed); *(g) = _r; _b; }) -#define os_atomic_cmpxchgvw(p, e, v, g, m) \ - ({ _os_atomic_basetypeof(p) _r = (e); int _b = \ - atomic_compare_exchange_weak_explicit(_os_atomic_c11_atomic(p), \ - &_r, v, memory_order_##m, memory_order_relaxed); *(g) = _r; _b; }) +/*! + * @function os_atomic_cmpxchg + * + * @brief + * Wrapper for C11 atomic_compare_exchange_strong_explicit(). + * + * @discussion + * Loops around os_atomic_cmpxchg() may want to consider using the + * os_atomic_rmw_loop() construct instead to take advantage of the C11 weak + * compare-exchange operation. + * + * @param p + * A pointer to an atomic variable. + * + * @param e + * The value expected in the atomic variable. + * + * @param v + * The value to store if the atomic variable has the expected value @a e. + * + * @param m + * The ordering to use in case of success. + * The ordering in case of failure is always `relaxed`. + * + * @returns + * 0 if the compare-exchange failed. + * 1 if the compare-exchange succeeded. + */ +#define os_atomic_cmpxchg(p, e, v, m) ({ \ + _os_atomic_basetypeof(p) _r = (e); int _b; \ + _os_compiler_barrier_before_atomic(m); \ + _b = atomic_compare_exchange_strong_explicit(_os_atomic_c11_atomic(p), \ + &_r, v, memory_order_##m##_smp, memory_order_relaxed); \ + _os_compiler_barrier_after_atomic(m); \ + _b; \ +}) + +/*! + * @function os_atomic_cmpxchgv + * + * @brief + * Wrapper for C11 atomic_compare_exchange_strong_explicit(). + * + * @discussion + * Loops around os_atomic_cmpxchgv() may want to consider using the + * os_atomic_rmw_loop() construct instead to take advantage of the C11 weak + * compare-exchange operation. + * + * @param p + * A pointer to an atomic variable. + * + * @param e + * The value expected in the atomic variable. + * + * @param v + * The value to store if the atomic variable has the expected value @a e. + * + * @param g + * A pointer to a location that is filled with the value that was present in + * the atomic variable before the compare-exchange (whether successful or not). + * This can be used to redrive compare-exchange loops. + * + * @param m + * The ordering to use in case of success. + * The ordering in case of failure is always `relaxed`. + * + * @returns + * 0 if the compare-exchange failed. + * 1 if the compare-exchange succeeded. + */ +#define os_atomic_cmpxchgv(p, e, v, g, m) ({ \ + _os_atomic_basetypeof(p) _r = (e); int _b; \ + _os_compiler_barrier_before_atomic(m); \ + _b = atomic_compare_exchange_strong_explicit(_os_atomic_c11_atomic(p), \ + &_r, v, memory_order_##m##_smp, memory_order_relaxed); \ + _os_compiler_barrier_after_atomic(m); \ + *(g) = _r; _b; \ +}) +/*! + * @function os_atomic_rmw_loop + * + * @brief + * Advanced read-modify-write construct to wrap compare-exchange loops. + * + * @param p + * A pointer to an atomic variable to be modified. + * + * @param ov + * The name of the variable that will contain the original value of the atomic + * variable (reloaded every iteration of the loop). + * + * @param nv + * The name of the variable that will contain the new value to compare-exchange + * the atomic variable to (typically computed from @a ov every iteration of the + * loop). + * + * @param m + * The ordering to use in case of success. + * The ordering in case of failure is always `relaxed`. + * + * @param ... + * Code block that validates the value of @p ov and computes the new value of + * @p nv that the atomic variable will be compare-exchanged to in an iteration + * of the loop. + * + * The loop can be aborted using os_atomic_rmw_loop_give_up(), e.g. when the + * value of @p ov is found to be "invalid" for the ovarall operation. + * `continue` cannot be used in this context. + * + * No stores to memory should be performed within the code block as it may cause + * LL/SC transactions used to implement compare-exchange to fail persistently. + * + * @returns + * 0 if the loop was aborted with os_atomic_rmw_loop_give_up(). + * 1 if the loop completed. + */ #define os_atomic_rmw_loop(p, ov, nv, m, ...) ({ \ - bool _result = false; \ + int _result = 0; \ typeof(p) _p = (p); \ - ov = os_atomic_load(_p, relaxed); \ + _os_compiler_barrier_before_atomic(m); \ + ov = atomic_load_explicit(_os_atomic_c11_atomic(_p), \ + memory_order_relaxed); \ do { \ __VA_ARGS__; \ - _result = os_atomic_cmpxchgvw(_p, ov, nv, &ov, m); \ - } while (!_result); \ + _result = atomic_compare_exchange_weak_explicit( \ + _os_atomic_c11_atomic(_p), &ov, nv, \ + memory_order_##m##_smp, memory_order_relaxed); \ + } while (__builtin_expect(!_result, 0)); \ + _os_compiler_barrier_after_atomic(m); \ _result; \ }) -#define os_atomic_rmw_loop_give_up_with_fence(m, expr) \ - ({ os_atomic_thread_fence(m); expr; __builtin_unreachable(); }) -#define os_atomic_rmw_loop_give_up(expr) \ - os_atomic_rmw_loop_give_up_with_fence(relaxed, expr) +/*! + * @function os_atomic_rmw_loop_give_up + * + * @brief + * Abort an os_atomic_rmw_loop() loop. + * + * @param ... + * Optional code block to execute before the `break` out of the loop. May + * further alter the control flow (e.g. using `return`, `goto`, ...). + */ +#define os_atomic_rmw_loop_give_up(...) ({ __VA_ARGS__; break; }) + +/*! + * @typedef os_atomic_dependency_t + * + * @brief + * Type for dependency tokens that can be derived from loads with dependency + * and injected into various expressions. + * + * @warning + * The implementation of atomic dependencies makes painstakingly sure that the + * compiler doesn't know that os_atomic_dependency_t::__opaque_zero is always 0. + * + * Users of os_atomic_dependency_t MUST NOT test its value (even with an + * assert), as doing so would allow the compiler to reason about the value and + * elide its use to inject hardware dependencies (thwarting the entire purpose + * of the construct). + */ +typedef struct { unsigned long __opaque_zero; } os_atomic_dependency_t; + +/*! + * @const OS_ATOMIC_DEPENDENCY_NONE + * + * @brief + * A value to pass to functions that can carry dependencies, to indicate that + * no dependency should be carried. + */ +#define OS_ATOMIC_DEPENDENCY_NONE \ + ((os_atomic_dependency_t){ 0UL }) + +/*! + * @function os_atomic_make_dependency + * + * @brief + * Create a dependency token that can be injected into expressions to force a + * hardware dependency. + * + * @discussion + * This function is only useful for cases where the dependency needs to be used + * several times. + * + * os_atomic_load_with_dependency_on() and os_atomic_inject_dependency() are + * otherwise capable of automatically creating dependency tokens. + * + * @param v + * The result of: + * - an os_atomic_load(..., dependency), + * - an os_atomic_inject_dependency(), + * - an os_atomic_load_with_dependency_on(). + * + * Note that due to implementation limitations, the type of @p v must be + * register-sized, if necessary an explicit cast is required. + * + * @returns + * An os_atomic_dependency_t token that can be used to prolongate dependency + * chains. + * + * The token value is always 0, but the compiler must never be able to reason + * about that fact (c.f. os_atomic_dependency_t) + */ +#define os_atomic_make_dependency(v) \ + ((void)(v), OS_ATOMIC_DEPENDENCY_NONE) + +/*! + * @function os_atomic_inject_dependency + * + * @brief + * Inject a hardware dependency resulting from a `dependency` load into a + * specified pointer. + * + * @param p + * A pointer to inject the dependency into. + * + * @param e + * - a dependency token returned from os_atomic_make_dependency(), + * + * - OS_ATOMIC_DEPENDENCY_NONE, which turns this operation into a no-op, + * + * - any value accepted by os_atomic_make_dependency(). + * + * @returns + * A value equal to @a p but that prolongates the dependency chain rooted at + * @a e. + */ +#define os_atomic_inject_dependency(p, e) \ + ((typeof(*(p)) *)((p) + _os_atomic_auto_dependency(e).__opaque_zero)) -#define os_atomic_force_dependency_on(p, e) (p) +/*! + * @function os_atomic_load_with_dependency_on + * + * @brief + * Load that prolongates the dependency chain rooted at `v`. + * + * @discussion + * This is shorthand for: + * + * + * os_atomic_load(os_atomic_inject_dependency(p, e), dependency) + * + * + * @param p + * A pointer to an atomic variable. + * + * @param e + * - a dependency token returned from os_atomic_make_dependency(), + * + * - OS_ATOMIC_DEPENDENCY_NONE, which turns this operation into a no-op, + * + * - any value accepted by os_atomic_make_dependency(). + * + * @returns + * The value loaded from @a p. + */ #define os_atomic_load_with_dependency_on(p, e) \ - os_atomic_load(os_atomic_force_dependency_on(p, e), relaxed) + os_atomic_load(os_atomic_inject_dependency(p, e), dependency) + +/*! + * @const OS_ATOMIC_HAS_LLSC + * + * @brief + * Whether the platform has LL/SC features. + * + * @discussion + * When set, the os_atomic_*_exclusive() macros are defined. + */ +#define OS_ATOMIC_HAS_LLSC 0 + +/*! + * @const OS_ATOMIC_USE_LLSC + * + * @brief + * Whether os_atomic* use LL/SC internally. + * + * @discussion + * OS_ATOMIC_USE_LLSC implies OS_ATOMIC_HAS_LLSC. + */ +#define OS_ATOMIC_USE_LLSC 0 #if defined (__x86_64__) #include "i386/atomic.h" diff --git a/osfmk/machine/atomic_impl.h b/osfmk/machine/atomic_impl.h new file mode 100644 index 000000000..9e646f80e --- /dev/null +++ b/osfmk/machine/atomic_impl.h @@ -0,0 +1,220 @@ +/* + * Copyright (c) 2018 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +/* + * This header provides some gory details to implement the + * interfaces. Nothing in this header should be called directly, no promise is + * made to keep this interface stable. + */ + +#ifndef _MACHINE_ATOMIC_H +#error "Do not include directly, use " +#endif + +#ifndef _MACHINE_ATOMIC_IMPL_H +#define _MACHINE_ATOMIC_IMPL_H + +#include +#include + +static inline int +memory_order_has_acquire(enum memory_order ord) +{ + switch (ord) { + case memory_order_consume: + case memory_order_acquire: + case memory_order_acq_rel: + case memory_order_seq_cst: + return 1; + default: + return 0; + } +} + +static inline int +memory_order_has_release(enum memory_order ord) +{ + switch (ord) { + case memory_order_release: + case memory_order_acq_rel: + case memory_order_seq_cst: + return 1; + default: + return 0; + } +} + +#if __SMP__ + +#define memory_order_relaxed_smp memory_order_relaxed +#define memory_order_compiler_acquire_smp memory_order_relaxed +#define memory_order_compiler_release_smp memory_order_relaxed +#define memory_order_compiler_acq_rel_smp memory_order_relaxed +#define memory_order_consume_smp memory_order_consume +#define memory_order_dependency_smp memory_order_acquire +#define memory_order_acquire_smp memory_order_acquire +#define memory_order_release_smp memory_order_release +#define memory_order_acq_rel_smp memory_order_acq_rel +#define memory_order_seq_cst_smp memory_order_seq_cst + +#else + +#define memory_order_relaxed_smp memory_order_relaxed +#define memory_order_compiler_acquire_smp memory_order_relaxed +#define memory_order_compiler_release_smp memory_order_relaxed +#define memory_order_compiler_acq_rel_smp memory_order_relaxed +#define memory_order_consume_smp memory_order_relaxed +#define memory_order_dependency_smp memory_order_relaxed +#define memory_order_acquire_smp memory_order_relaxed +#define memory_order_release_smp memory_order_relaxed +#define memory_order_acq_rel_smp memory_order_relaxed +#define memory_order_seq_cst_smp memory_order_relaxed + +#endif + +/* + * Hack needed for os_compiler_barrier() to work (including with empty argument) + */ +#define _os_compiler_barrier_relaxed memory_order_relaxed +#define _os_compiler_barrier_acquire memory_order_acquire +#define _os_compiler_barrier_release memory_order_release +#define _os_compiler_barrier_acq_rel memory_order_acq_rel +#define _os_compiler_barrier_ memory_order_acq_rel + +/* + * Mapping between compiler barrier/memory orders and: + * - compiler barriers before atomics ("rel_barrier") + * - compiler barriers after atomics ("acq_barrier") + */ +#define _os_rel_barrier_relaxed memory_order_relaxed +#define _os_rel_barrier_compiler_acquire memory_order_relaxed +#define _os_rel_barrier_compiler_release memory_order_release +#define _os_rel_barrier_compiler_acq_rel memory_order_release +#define _os_rel_barrier_consume memory_order_relaxed +#define _os_rel_barrier_dependency memory_order_relaxed +#define _os_rel_barrier_acquire memory_order_relaxed +#define _os_rel_barrier_release memory_order_release +#define _os_rel_barrier_acq_rel memory_order_release +#define _os_rel_barrier_seq_cst memory_order_release + +#define _os_acq_barrier_relaxed memory_order_relaxed +#define _os_acq_barrier_compiler_acquire memory_order_acquire +#define _os_acq_barrier_compiler_release memory_order_relaxed +#define _os_acq_barrier_compiler_acq_rel memory_order_acquire +#define _os_acq_barrier_consume memory_order_acquire +#define _os_acq_barrier_dependency memory_order_acquire +#define _os_acq_barrier_acquire memory_order_acquire +#define _os_acq_barrier_release memory_order_relaxed +#define _os_acq_barrier_acq_rel memory_order_acquire +#define _os_acq_barrier_seq_cst memory_order_acquire + +#define _os_compiler_barrier_before_atomic(m) \ + atomic_signal_fence(_os_rel_barrier_##m) +#define _os_compiler_barrier_after_atomic(m) \ + atomic_signal_fence(_os_acq_barrier_##m) + +/* + * Mapping between compiler barrier/memmory orders and: + * - memory fences before atomics ("rel_fence") + * - memory fences after atomics ("acq_fence") + */ +#define _os_rel_fence_relaxed memory_order_relaxed +#define _os_rel_fence_compiler_acquire memory_order_relaxed +#define _os_rel_fence_compiler_release memory_order_release +#define _os_rel_fence_compiler_acq_rel memory_order_release +#define _os_rel_fence_consume memory_order_relaxed_smp +#define _os_rel_fence_dependency memory_order_relaxed_smp +#define _os_rel_fence_acquire memory_order_relaxed_smp +#define _os_rel_fence_release memory_order_release_smp +#define _os_rel_fence_acq_rel memory_order_release_smp +#define _os_rel_fence_seq_cst memory_order_release_smp + +#define _os_acq_fence_relaxed memory_order_relaxed +#define _os_acq_fence_compiler_acquire memory_order_relaxed +#define _os_acq_fence_compiler_release memory_order_relaxed +#define _os_acq_fence_compiler_acq_rel memory_order_relaxed +#define _os_acq_fence_consume memory_order_acquire_smp +#define _os_acq_fence_dependency memory_order_dependency_smp +#define _os_acq_fence_acquire memory_order_acquire_smp +#define _os_acq_fence_release memory_order_relaxed_smp +#define _os_acq_fence_acq_rel memory_order_acquire_smp +#define _os_acq_fence_seq_cst memory_order_acquire_smp + +#define _os_memory_fence_before_atomic(m) \ + atomic_thread_fence(_os_rel_fence_##m) +#define _os_memory_fence_after_atomic(m) \ + atomic_thread_fence(_os_acq_fence_##m) + +/* + * Misc. helpers + */ + +/* + * For this implementation, we make sure the compiler cannot coalesce any of the + * os_atomic calls by casting all atomic variables to `volatile _Atomic`. + * + * At the time this decision was taken, clang has been treating all `_Atomic` + * accesses as if qualified `volatile _Atomic`, so the cast below freezes that + * aspect of the codegen in time. + * + * When/if clang starts coalescing non-volatile _Atomics, we may decide to add + * coalescing orderings, e.g. {relaxed,acquire,release,acq_rel,seq_cst}_nv. + */ +#define _os_atomic_c11_atomic(p) \ + ((typeof(*(p)) volatile _Atomic *)(p)) + +#define _os_atomic_basetypeof(p) \ + typeof(atomic_load(_os_atomic_c11_atomic(p))) + +#define _os_atomic_op_orig(p, v, m, o) ({ \ + _os_atomic_basetypeof(p) _r; \ + _os_compiler_barrier_before_atomic(m); \ + _r = o(_os_atomic_c11_atomic(p), v, memory_order_##m##_smp); \ + _os_compiler_barrier_after_atomic(m); \ + _r; \ +}) + +#define _os_atomic_c11_op_orig(p, v, m, o) \ + _os_atomic_op_orig(p, v, m, atomic_##o##_explicit) + +#define _os_atomic_c11_op(p, v, m, o, op) \ + ({ typeof(v) _v = (v); _os_atomic_c11_op_orig(p, _v, m, o) op _v; }) + +#define _os_atomic_clang_op_orig(p, v, m, o) \ + _os_atomic_op_orig(p, v, m, __atomic_##o) + +#define _os_atomic_clang_op(p, v, m, o, op) \ + ({ typeof(v) _v = (v); _os_atomic_basetypeof(p) _r = \ + _os_atomic_clang_op_orig(p, _v, m, o); op(_r, _v); }) + +#define _os_atomic_auto_dependency(e) \ + _Generic(e, \ + os_atomic_dependency_t: (e), \ + default: os_atomic_make_dependency(e)) + +#endif /* _MACHINE_ATOMIC_IMPL_H */ diff --git a/osfmk/machine/memory_types.h b/osfmk/machine/memory_types.h new file mode 100644 index 000000000..bb0d63789 --- /dev/null +++ b/osfmk/machine/memory_types.h @@ -0,0 +1,39 @@ +/* + * Copyright (c) 2018 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ +#ifndef _MACHINE_MEMORY_TYPES_H +#define _MACHINE_MEMORY_TYPES_H + +#if defined (__i386__) || defined(__x86_64__) +#include "i386/memory_types.h" +#elif defined (__arm__) || defined (__arm64__) +#include "arm/memory_types.h" +#else +#error architecture not supported +#endif + +#endif /* _MACHINE_MEMORY_TYPES_H */ diff --git a/osfmk/machine/monotonic.h b/osfmk/machine/monotonic.h index 4b3d9df31..5ee28781e 100644 --- a/osfmk/machine/monotonic.h +++ b/osfmk/machine/monotonic.h @@ -48,6 +48,11 @@ struct mt_cpu { uint64_t mtc_snaps[MT_CORE_NFIXED]; uint64_t mtc_counts[MT_CORE_NFIXED]; uint64_t mtc_counts_last[MT_CORE_NFIXED]; + uint64_t mtc_npmis; + /* + * Whether this CPU should be using PMCs. + */ + bool mtc_active; }; struct mt_thread { @@ -60,6 +65,8 @@ struct mt_task { }; struct mt_cpu *mt_cur_cpu(void); + +uint64_t mt_count_pmis(void); void mt_mtc_update_fixed_counts(struct mt_cpu *mtc, uint64_t *counts, uint64_t *counts_since); uint64_t mt_mtc_update_count(struct mt_cpu *mtc, unsigned int ctr); diff --git a/osfmk/machine/xpr.h b/osfmk/machine/xpr.h deleted file mode 100644 index ee3be2d26..000000000 --- a/osfmk/machine/xpr.h +++ /dev/null @@ -1,39 +0,0 @@ -/* - * Copyright (c) 2000-2007 Apple Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -#ifndef _MACHINE_XPR_H -#define _MACHINE_XPR_H - -#if defined (__i386__) || defined (__x86_64__) -#include "i386/xpr.h" -#elif defined (__arm__) || defined (__arm64__) -#include "arm/xpr.h" -#else -#error architecture not supported -#endif - -#endif /* _MACHINE_XPR_H */ diff --git a/osfmk/prng/prng_random.c b/osfmk/prng/prng_random.c index 44c865a17..4e000828b 100644 --- a/osfmk/prng/prng_random.c +++ b/osfmk/prng/prng_random.c @@ -26,583 +26,329 @@ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ -#include +#include #include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include +#include +#include +#include +#include +#include #include -#include -#include - +#include #include #include #include -#include #include -#include -#include -#include -#include +static struct cckprng_ctx *prng_ctx; -#include +static SECURITY_READ_ONLY_LATE(struct cckprng_funcs) prng_funcs; +static SECURITY_READ_ONLY_LATE(int) prng_ready; -#if defined(__arm__) || defined(__arm64__) -#include // For MAX_CPUS -#endif - -#if defined(__x86_64__) -#include +entropy_data_t EntropyData = {}; -static int -rdseed_step(uint64_t * seed) -{ - uint8_t ok; +#define SEED_SIZE (SHA256_DIGEST_LENGTH) +static uint8_t bootseed[SEED_SIZE]; - asm volatile ("rdseed %0; setc %1" : "=r"(*seed), "=qm"(ok)); - - return (int)ok; -} - -static int -rdseed_retry(uint64_t * seed, size_t nretries) +static void +bootseed_init_bootloader(const struct ccdigest_info * di, ccdigest_ctx_t ctx) { - size_t i; + uint8_t seed[64]; + uint32_t n; - for (i = 0; i < nretries; i += 1) { - if (rdseed_step(seed)) { - return 1; - } else { - asm volatile ("pause"); - } + n = PE_get_random_seed(seed, sizeof(seed)); + if (n < sizeof(seed)) { + /* + * Insufficient entropy is fatal. We must fill the + * entire entropy buffer during initializaton. + */ + panic("Expected %lu seed bytes from bootloader, but got %u.\n", sizeof(seed), n); } - return 0; + ccdigest_update(di, ctx, sizeof(seed), seed); + cc_clear(sizeof(seed), seed); } -static size_t -rdseed_seed(void * buf, size_t nwords) -{ - uint64_t * buf_words; - size_t i; - - if (nwords > 8) { - nwords = 8; - } - - buf_words = buf; - for (i = 0; i < nwords; i += 1) { - if (!rdseed_retry(buf_words + i, 10)) { - return i; - } - } - - return nwords; -} +#if defined(__x86_64__) +#include -static int -rdrand_step(uint64_t * rand) +static void +bootseed_init_native(const struct ccdigest_info * di, ccdigest_ctx_t ctx) { + uint64_t x; uint8_t ok; + size_t i = 0; + size_t n; - asm volatile ("rdrand %0; setc %1" : "=r"(*rand), "=qm"(ok)); - - return (int)ok; -} - -static int -rdrand_retry(uint64_t * rand, size_t nretries) -{ - size_t i; - - for (i = 0; i < nretries; i += 1) { - if (rdrand_step(rand)) { - return 1; + if (cpuid_leaf7_features() & CPUID_LEAF7_FEATURE_RDSEED) { + n = SEED_SIZE / sizeof(x); + + while (i < n) { + asm volatile ("rdseed %0; setc %1" : "=r"(x), "=qm"(ok) : : "cc"); + if (ok) { + ccdigest_update(di, ctx, sizeof(x), &x); + i += 1; + } else { + // Intel recommends to pause between unsuccessful rdseed attempts. + cpu_pause(); + } } - } - - return 0; -} - -static size_t -rdrand_seed(void * buf, size_t nwords) -{ - size_t i; - uint64_t w; - uint8_t hash[CCSHA256_OUTPUT_SIZE]; - const struct ccdigest_info * di = &ccsha256_ltc_di; - - ccdigest_di_decl(di, ctx); - ccdigest_init(di, ctx); - - for (i = 0; i < 1023; i += 1) { - if (!rdrand_retry(&w, 10)) { - nwords = 0; - goto out; + } else if (cpuid_features() & CPUID_FEATURE_RDRAND) { + // The Intel documentation guarantees a reseed every 512 rdrand calls. + n = (SEED_SIZE / sizeof(x)) * 512; + + while (i < n) { + asm volatile ("rdrand %0; setc %1" : "=r"(x), "=qm"(ok) : : "cc"); + if (ok) { + ccdigest_update(di, ctx, sizeof(x), &x); + i += 1; + } else { + // Intel does not recommend pausing between unsuccessful rdrand attempts. + } } - ccdigest_update(di, ctx, sizeof w, &w); } - ccdigest_final(di, ctx, hash); - - if (nwords > 2) { - nwords = 2; - } - - memcpy(buf, hash, nwords * sizeof(uint64_t)); - -out: - ccdigest_di_clear(di, ctx); - bzero(hash, sizeof hash); - bzero(&w, sizeof w); - - return nwords; + cc_clear(sizeof(x), &x); } +#else + static void -intel_entropysource(void * buf, size_t * nbytes) +bootseed_init_native(__unused const struct ccdigest_info * di, __unused ccdigest_ctx_t ctx) { - size_t nwords; - - /* only handle complete words */ - assert(*nbytes % sizeof(uint64_t) == 0); - - nwords = (*nbytes) / sizeof(uint64_t); - if (cpuid_leaf7_features() & CPUID_LEAF7_FEATURE_RDSEED) { - nwords = rdseed_seed(buf, nwords); - *nbytes = nwords * sizeof(uint64_t); - } else if (cpuid_features() & CPUID_FEATURE_RDRAND) { - nwords = rdrand_seed(buf, nwords); - *nbytes = nwords * sizeof(uint64_t); - } else { - *nbytes = 0; - } } -#endif /* defined(__x86_64__) */ - -void entropy_buffer_read(void * buffer, size_t * count); - -typedef void (*entropysource)(void * buf, size_t * nbytes); - -static const entropysource entropysources[] = { - entropy_buffer_read, -#if defined(__x86_64__) - intel_entropysource, #endif -}; - -static const size_t nsources = sizeof entropysources / sizeof entropysources[0]; - -static size_t -entropy_readall(void * buf, size_t nbytes_persource) -{ - uint8_t * buf_bytes = buf; - size_t i; - size_t nbytes_total = 0; - - for (i = 0; i < nsources; i += 1) { - size_t nbytes = nbytes_persource; - entropysources[i](buf_bytes, &nbytes); - bzero(buf_bytes + nbytes, nbytes_persource - nbytes); - nbytes_total += nbytes; - buf_bytes += nbytes_persource; - } - return nbytes_total; -} - -static struct { - struct cckprng_ctx ctx; - struct { - lck_grp_t * group; - lck_attr_t * attrs; - lck_grp_attr_t * group_attrs; - lck_mtx_t * mutex; - } lock; -} prng; - -static SECURITY_READ_ONLY_LATE(prng_fns_t) prng_fns = NULL; - -static int -prng_init(cckprng_ctx_t ctx, size_t nbytes, const void * seed) +static void +bootseed_init(void) { - int err = prng_fns->init(ctx, nbytes, seed); - if (err == CCKPRNG_ABORT) { - panic("prng_init"); - } - return err; -} - -#define PERMIT_WRITE_RANDOM 0 + const struct ccdigest_info * di = &ccsha256_ltc_di; -#if PERMIT_WRITE_RANDOM -static int -prng_reseed(cckprng_ctx_t ctx, size_t nbytes, const void * seed) -{ - int err = prng_fns->reseed(ctx, nbytes, seed); - if (err == CCKPRNG_ABORT) { - panic("prng_reseed"); - } - return err; -} -#endif + ccdigest_di_decl(di, ctx); + ccdigest_init(di, ctx); -static int -prng_addentropy(cckprng_ctx_t ctx, size_t nbytes, const void * entropy) -{ - int err = prng_fns->addentropy(ctx, nbytes, entropy); - if (err == CCKPRNG_ABORT) { - panic("prng_addentropy"); - } - return err; -} + bootseed_init_bootloader(di, ctx); + bootseed_init_native(di, ctx); -static int -prng_generate(cckprng_ctx_t ctx, size_t nbytes, void * out) -{ - int err = prng_fns->generate(ctx, nbytes, out); - if (err == CCKPRNG_ABORT) { - panic("prng_generate"); - } - return err; + ccdigest_final(di, ctx, bootseed); + ccdigest_di_clear(di, ctx); } -entropy_data_t EntropyData = {.index_ptr = EntropyData.buffer}; +#define EARLY_RANDOM_STATE_STATIC_SIZE (264) static struct { - uint8_t seed[nsources][EARLY_RANDOM_SEED_SIZE]; - int seedset; - uint8_t master_drbg_state[EARLY_RANDOM_STATE_STATIC_SIZE]; - struct ccdrbg_state * drbg_states[MAX_CPUS]; + uint8_t drbg_state[EARLY_RANDOM_STATE_STATIC_SIZE]; struct ccdrbg_info drbg_info; const struct ccdrbg_nisthmac_custom drbg_custom; } erandom = {.drbg_custom = { - .di = &ccsha1_eay_di, + .di = &ccsha256_ltc_di, .strictFIPS = 0, }}; static void read_erandom(void * buf, uint32_t nbytes); -void -entropy_buffer_read(void * buffer, size_t * count) -{ - boolean_t current_state; - unsigned int i, j; - - if (!erandom.seedset) { - panic("early_random was never invoked"); - } - - if (*count > ENTROPY_BUFFER_BYTE_SIZE) { - *count = ENTROPY_BUFFER_BYTE_SIZE; - } - - current_state = ml_early_set_interrupts_enabled(FALSE); - - memcpy(buffer, EntropyData.buffer, *count); - - /* Consider removing this mixing step rdar://problem/31668239 */ - for (i = 0, j = (ENTROPY_BUFFER_SIZE - 1); i < ENTROPY_BUFFER_SIZE; j = i, i++) { - EntropyData.buffer[i] = EntropyData.buffer[i] ^ EntropyData.buffer[j]; - } - - (void) ml_early_set_interrupts_enabled(current_state); - -#if DEVELOPMENT || DEBUG - uint32_t * word = buffer; - /* Good for both 32-bit and 64-bit kernels. */ - for (i = 0; i < ENTROPY_BUFFER_SIZE; i += 4) { - /* - * We use "EARLY" here so that we can grab early entropy on - * ARM, where tracing is not started until after PRNG is - * initialized. - */ - KERNEL_DEBUG_EARLY(ENTROPY_READ(i / 4), word[i + 0], word[i + 1], word[i + 2], word[i + 3]); - } -#endif -} - /* * Return a uniformly distributed 64-bit random number. * - * This interface should have minimal dependencies on kernel - * services, and thus be available very early in the life - * of the kernel. - * This provides cryptographically secure randomness. - * Each processor has its own generator instance. - * It is seeded (lazily) with entropy provided by the Booter. + * This interface should have minimal dependencies on kernel services, + * and thus be available very early in the life of the kernel. * - * For the algorithm switched from LCG to - * NIST HMAC DBRG as follows: - * - When first called (on OSX this is very early while page tables are being - * built) early_random() calls ccdrbg_factory_hmac() to set-up a ccdbrg info - * structure. - * - The boot processor's ccdrbg state structure is a statically allocated area - * which is then initialized by calling the ccdbrg_init method. - * The initial entropy is 16 bytes of boot entropy. - * The nonce is the first 8 bytes of entropy xor'ed with a timestamp - * from ml_get_timebase(). - * The personalization data provided is null. - * - The first 64-bit random value is returned on the boot processor from - * an invocation of the ccdbrg_generate method. - * - Non-boot processor's DRBG state structures are allocated dynamically - * from prng_init(). Each is initialized with the same 16 bytes of entropy - * but with a different timestamped nonce and cpu number as personalization. - * - Subsequent calls to early_random() pass to read_erandom() to generate - * an 8-byte random value. read_erandom() ensures that pre-emption is - * disabled and selects the DBRG state from the current processor. - * The ccdbrg_generate method is called for the required random output. - * If this method returns CCDRBG_STATUS_NEED_RESEED, the erandom.seed buffer - * is re-filled with kernel-harvested entropy and the ccdbrg_reseed method is - * called with this new entropy. The kernel panics if a reseed fails. + * This provides cryptographically secure randomness contingent on the + * quality of the seed. It is seeded (lazily) with entropy provided by + * the Booter. + * + * The implementation is a NIST HMAC-SHA256 DRBG instance used as + * follows: + * + * - When first called (on macOS this is very early while page tables + * are being built) early_random() calls ccdrbg_factory_hmac() to + * set-up a ccdbrg info structure. + * + * - The boot seed (64 bytes) is hashed with SHA256. Where available, + * hardware RNG outputs are mixed into the seed. (See + * bootseed_init.) The resulting seed is 32 bytes. + * + * - The ccdrbg state structure is a statically allocated area which + * is then initialized by calling the ccdbrg_init method. The + * initial entropy is the 32-byte seed described above. The nonce + * is an 8-byte timestamp from ml_get_timebase(). The + * personalization data provided is a fixed string. + * + * - 64-bit outputs are generated via read_erandom, a wrapper around + * the ccdbrg_generate method. (Since "strict FIPS" is disabled, + * the DRBG will never request a reseed.) + * + * - After the kernel PRNG is initialized, read_erandom defers + * generation to it via read_random_generate. (Note that this + * function acquires a per-processor mutex.) */ uint64_t early_random(void) { - uint32_t cnt = 0; uint64_t result; uint64_t nonce; int rc; - int ps; - struct ccdrbg_state * state; - - if (!erandom.seedset) { - erandom.seedset = 1; - cnt = PE_get_random_seed((unsigned char *)EntropyData.buffer, sizeof(EntropyData.buffer)); - - if (cnt < sizeof(EntropyData.buffer)) { - /* - * Insufficient entropy is fatal. We must fill the - * entire entropy buffer during initializaton. - */ - panic("EntropyData needed %lu bytes, but got %u.\n", sizeof(EntropyData.buffer), cnt); - } + const char ps[] = "xnu early random"; + static int init = 0; - entropy_readall(&erandom.seed, EARLY_RANDOM_SEED_SIZE); + if (init == 0) { + bootseed_init(); /* Init DRBG for NIST HMAC */ ccdrbg_factory_nisthmac(&erandom.drbg_info, &erandom.drbg_custom); - assert(erandom.drbg_info.size <= sizeof(erandom.master_drbg_state)); - state = (struct ccdrbg_state *)erandom.master_drbg_state; - erandom.drbg_states[master_cpu] = state; + assert(erandom.drbg_info.size <= sizeof(erandom.drbg_state)); /* * Init our DBRG from the boot entropy and a timestamp as nonce * and the cpu number as personalization. */ - assert(sizeof(erandom.seed) > sizeof(nonce)); + assert(sizeof(bootseed) > sizeof(nonce)); nonce = ml_get_timebase(); - ps = 0; /* boot cpu */ - rc = ccdrbg_init(&erandom.drbg_info, state, sizeof(erandom.seed), erandom.seed, sizeof(nonce), &nonce, sizeof(ps), &ps); - cc_clear(sizeof(nonce), &nonce); + rc = ccdrbg_init(&erandom.drbg_info, (struct ccdrbg_state *)erandom.drbg_state, sizeof(bootseed), bootseed, sizeof(nonce), &nonce, sizeof(ps) - 1, ps); if (rc != CCDRBG_STATUS_OK) { panic("ccdrbg_init() returned %d", rc); } - /* Generate output */ - rc = ccdrbg_generate(&erandom.drbg_info, state, sizeof(result), &result, 0, NULL); - if (rc != CCDRBG_STATUS_OK) { - panic("ccdrbg_generate() returned %d", rc); - } + cc_clear(sizeof(nonce), &nonce); - return result; + init = 1; } - ; -#if defined(__x86_64__) - /* - * Calling read_erandom() before gsbase is initialized is potentially - * catastrophic, so assert that it's not set to the magic value set - * in i386_init.c before proceeding with the call. We cannot use - * assert here because it ultimately calls panic, which executes - * operations that involve accessing %gs-relative data (and additionally - * causes a debug trap which will not work properly this early in boot.) - */ - if (rdmsr64(MSR_IA32_GS_BASE) == EARLY_GSBASE_MAGIC) { - kprintf("[early_random] Cannot proceed: GSBASE is not initialized\n"); - hlt(); - /*NOTREACHED*/ - } -#endif read_erandom(&result, sizeof(result)); return result; } static void -read_erandom(void * buffer, u_int numBytes) +read_random_generate(uint8_t *buffer, u_int numbytes); + +static void +read_erandom(void * buf, uint32_t nbytes) { - int cpu; + uint8_t * buffer_bytes = buf; + size_t n; int rc; - size_t nbytes; - struct ccdrbg_state * state; - - mp_disable_preemption(); - cpu = cpu_number(); - state = erandom.drbg_states[cpu]; - assert(state); - for (;;) { - /* Generate output */ - rc = ccdrbg_generate(&erandom.drbg_info, state, numBytes, buffer, 0, NULL); - if (rc == CCDRBG_STATUS_OK) { - break; - } - if (rc == CCDRBG_STATUS_NEED_RESEED) { - /* It's time to reseed. Get more entropy */ - nbytes = entropy_readall(erandom.seed, EARLY_RANDOM_SEED_SIZE); - assert(nbytes >= EARLY_RANDOM_SEED_SIZE); - rc = ccdrbg_reseed(&erandom.drbg_info, state, sizeof(erandom.seed), erandom.seed, 0, NULL); - cc_clear(sizeof(erandom.seed), erandom.seed); - if (rc == CCDRBG_STATUS_OK) { - continue; - } - panic("read_erandom reseed error %d\n", rc); - } - panic("read_erandom ccdrbg error %d\n", rc); + + // We defer to the kernel PRNG after it has been installed and + // initialized. This happens during corecrypto kext + // initialization. + if (prng_ready) { + read_random_generate(buf, nbytes); + return; } - mp_enable_preemption(); -} -void -read_frandom(void * buffer, u_int numBytes) -{ - uint8_t * buffer_bytes = buffer; - int nbytes; + // The DBRG request size is limited, so we break the request into + // chunks. + while (nbytes > 0) { + n = MIN(nbytes, PAGE_SIZE); - /* - * Split up into requests for blocks smaller than - * than the DBRG request limit. iThis limit is private but - * for NISTHMAC it's known to be greater then 4096. - */ - while (numBytes) { - nbytes = MIN(numBytes, PAGE_SIZE); - read_erandom(buffer_bytes, nbytes); - buffer_bytes += nbytes; - numBytes -= nbytes; + // Since "strict FIPS" is disabled, the DRBG will never + // request a reseed; therefore, we panic on any error + rc = ccdrbg_generate(&erandom.drbg_info, (struct ccdrbg_state *)erandom.drbg_state, n, buffer_bytes, 0, NULL); + if (rc != CCDRBG_STATUS_OK) { + panic("read_erandom ccdrbg error %d\n", rc); + } + + buffer_bytes += n; + nbytes -= n; } } void -early_random_cpu_init(int cpu) +read_frandom(void * buffer, u_int numBytes) { - uint64_t nonce; - int rc; - struct ccdrbg_state * state; - - /* - * Allocate state and initialize DBRG state for early_random() - * for this processor. - */ - assert(cpu != master_cpu); - assert(erandom.drbg_states[cpu] == NULL); - - state = kalloc(erandom.drbg_info.size); - if (state == NULL) { - panic("prng_init kalloc failed\n"); - } - erandom.drbg_states[cpu] = state; - - /* - * Init our DBRG from boot entropy, nonce as timestamp - * and use the cpu number as the personalization parameter. - */ - nonce = ml_get_timebase(); - rc = ccdrbg_init(&erandom.drbg_info, state, sizeof(erandom.seed), erandom.seed, sizeof(nonce), &nonce, sizeof(cpu), &cpu); - cc_clear(sizeof(nonce), &nonce); - if (rc != CCDRBG_STATUS_OK) { - panic("ccdrbg_init() returned %d", rc); - } + read_erandom(buffer, numBytes); } void -register_and_init_prng(prng_fns_t fns) +register_and_init_prng(struct cckprng_ctx *ctx, const struct cckprng_funcs *funcs) { - uint8_t buf[nsources][ENTROPY_BUFFER_BYTE_SIZE]; - size_t nbytes; - assert(cpu_number() == master_cpu); - assert(prng_fns == NULL); + assert(!prng_ready); - prng_fns = fns; + prng_ctx = ctx; + prng_funcs = *funcs; - /* make a mutex to control access */ - prng.lock.group_attrs = lck_grp_attr_alloc_init(); - prng.lock.group = lck_grp_alloc_init("random", prng.lock.group_attrs); - prng.lock.attrs = lck_attr_alloc_init(); - prng.lock.mutex = lck_mtx_alloc_init(prng.lock.group, prng.lock.attrs); + uint64_t nonce = ml_get_timebase(); + prng_funcs.init(prng_ctx, MAX_CPUS, sizeof(EntropyData.buffer), EntropyData.buffer, &EntropyData.sample_count, sizeof(bootseed), bootseed, sizeof(nonce), &nonce); + prng_funcs.initgen(prng_ctx, master_cpu); + prng_ready = 1; - nbytes = entropy_readall(buf, ENTROPY_BUFFER_BYTE_SIZE); - (void)prng_init(&prng.ctx, nbytes, buf); - cc_clear(sizeof(buf), buf); + cc_clear(sizeof(bootseed), bootseed); + cc_clear(sizeof(erandom), &erandom); } -static void -Reseed(void) +void +random_cpu_init(int cpu) { - uint8_t buf[nsources][ENTROPY_BUFFER_BYTE_SIZE]; - size_t nbytes; + assert(cpu != master_cpu); - lck_mtx_assert(prng.lock.mutex, LCK_MTX_ASSERT_OWNED); + if (!prng_ready) { + panic("random_cpu_init: kernel prng has not been installed"); + } - nbytes = entropy_readall(buf, ENTROPY_BUFFER_BYTE_SIZE); - PRNG_CCKPRNG((void)prng_addentropy(&prng.ctx, nbytes, buf)); - cc_clear(sizeof(buf), buf); + prng_funcs.initgen(prng_ctx, cpu); } /* export good random numbers to the rest of the kernel */ void read_random(void * buffer, u_int numbytes) { - int err; - - lck_mtx_lock(prng.lock.mutex); + prng_funcs.refresh(prng_ctx); + read_random_generate(buffer, numbytes); +} +static void +ensure_gsbase(void) +{ +#if defined(__x86_64__) && (DEVELOPMENT || DEBUG) /* - * Call PRNG, reseeding and retrying if requested. + * Calling cpu_number() before gsbase is initialized is potentially + * catastrophic, so assert that it's not set to the magic value set + * in i386_init.c before proceeding with the call. We cannot use + * assert here because it ultimately calls panic, which executes + * operations that involve accessing %gs-relative data (and additionally + * causes a debug trap which will not work properly this early in boot.) */ - for (;;) { - PRNG_CCKPRNG(err = prng_generate(&prng.ctx, numbytes, buffer)); - if (err == CCKPRNG_OK) { - break; - } - if (err == CCKPRNG_NEED_ENTROPY) { - Reseed(); - continue; - } - panic("read_random() error %d\n", err); + if (rdmsr64(MSR_IA32_GS_BASE) == EARLY_GSBASE_MAGIC) { + kprintf("[early_random] Cannot proceed: GSBASE is not initialized\n"); + hlt(); + /*NOTREACHED*/ } +#endif +} + +static void +read_random_generate(uint8_t *buffer, u_int numbytes) +{ + ensure_gsbase(); + + while (numbytes > 0) { + size_t n = MIN(numbytes, CCKPRNG_GENERATE_MAX_NBYTES); - lck_mtx_unlock(prng.lock.mutex); + prng_funcs.generate(prng_ctx, cpu_number(), n, buffer); + + buffer += n; + numbytes -= n; + } } int write_random(void * buffer, u_int numbytes) { -#if PERMIT_WRITE_RANDOM - int err; + uint8_t seed[SHA256_DIGEST_LENGTH]; + SHA256_CTX ctx; - lck_mtx_lock(prng.lock.mutex); - err = prng_reseed(&prng.ctx, numbytes, buffer); - lck_mtx_unlock(prng.lock.mutex); + /* hash the input to minimize the time we need to hold the lock */ + SHA256_Init(&ctx); + SHA256_Update(&ctx, buffer, numbytes); + SHA256_Final(seed, &ctx); + + prng_funcs.reseed(prng_ctx, sizeof(seed), seed); + cc_clear(sizeof(seed), seed); - return err ? EIO : 0; -#else -#pragma unused(buffer, numbytes) return 0; -#endif } /* @@ -620,9 +366,7 @@ void random_bool_init(struct bool_gen * bg) { /* Seed the random boolean generator */ - for (int i = 0; i < RANDOM_BOOL_GEN_SEED_COUNT; i++) { - bg->seed[i] = (unsigned int)early_random(); - } + read_frandom(bg->seed, sizeof(bg->seed)); bg->state = 0; simple_lock_init(&bg->lock, 0); } diff --git a/osfmk/prng/random.h b/osfmk/prng/random.h index a49b6c730..61432793b 100644 --- a/osfmk/prng/random.h +++ b/osfmk/prng/random.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2013 Apple Inc. All rights reserved. + * Copyright (c) 2019 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -31,18 +31,30 @@ __BEGIN_DECLS +#include + #ifdef XNU_KERNEL_PRIVATE -#define ENTROPY_BUFFER_BYTE_SIZE 64 +#define ENTROPY_BUFFER_BYTE_SIZE 32 #define ENTROPY_BUFFER_SIZE (ENTROPY_BUFFER_BYTE_SIZE / sizeof(uint32_t)) +// This mask can be applied to EntropyData.sample_count to get an +// index suitable for storing the next sample in +// EntropyData.buffer. Note that ENTROPY_BUFFER_SIZE must be a power +// of two for the following mask calculation to be valid. +#define ENTROPY_BUFFER_INDEX_MASK (ENTROPY_BUFFER_SIZE - 1) + typedef struct entropy_data { /* - * TODO: Should index_ptr be volatile? Are we exposed to any races that + * TODO: Should sample_count be volatile? Are we exposed to any races that * we care about if it is not? */ - uint32_t * index_ptr; + + // At 32 bits, this counter can overflow. Since we're primarily + // interested in the delta from one read to the next, we don't + // worry about this too much. + uint32_t sample_count; uint32_t buffer[ENTROPY_BUFFER_SIZE]; } entropy_data_t; @@ -51,55 +63,12 @@ extern entropy_data_t EntropyData; /* Trace codes for DBG_SEC_KERNEL: */ #define ENTROPY_READ(n) SECURITYDBG_CODE(DBG_SEC_KERNEL, n) /* n: 0 .. 3 */ -/* - * Early_random implementation params: */ -#define EARLY_RANDOM_SEED_SIZE (16) -#define EARLY_RANDOM_STATE_STATIC_SIZE (264) +void random_cpu_init(int cpu); -void early_random_cpu_init(int cpu); - -/* - * Wrapper for requesting a CCKPRNG operation. - * This macro makes the DRBG call with pre-emption disabled to ensure that - * any attempt to block will cause a panic. And the operation is timed and - * cannot exceed 10msec (for development kernels). - * But skip this while we retain Yarrow. - */ -#define YARROW 1 -#if YARROW -#define PRNG_CCKPRNG(op) \ - MACRO_BEGIN \ - op; \ - MACRO_END -#else -#define PRNG_CCKPRNG(op) \ - MACRO_BEGIN \ - uint64_t start; \ - uint64_t stop; \ - disable_preemption(); \ - start = mach_absolute_time(); \ - op; \ - stop = mach_absolute_time(); \ - enable_preemption(); \ - assert(stop - start < 10 * NSEC_PER_MSEC || machine_timeout_suspended()); \ - (void)start; \ - (void)stop; \ - MACRO_END -#endif #endif /* XNU_KERNEL_PRIVATE */ -#include - -/* kernel prng */ -typedef const struct prng_fns { - int (*init)(cckprng_ctx_t ctx, size_t nbytes, const void * seed); - int (*reseed)(cckprng_ctx_t ctx, size_t nbytes, const void * seed); - int (*addentropy)(cckprng_ctx_t ctx, size_t nbytes, const void * entropy); - int (*generate)(cckprng_ctx_t ctx, size_t nbytes, void * out); -} * prng_fns_t; - -void register_and_init_prng(prng_fns_t fns); +void register_and_init_prng(struct cckprng_ctx *ctx, const struct cckprng_funcs *funcs); #include /* Definitions for boolean PRNG */ @@ -107,7 +76,7 @@ void register_and_init_prng(prng_fns_t fns); struct bool_gen { unsigned int seed[RANDOM_BOOL_GEN_SEED_COUNT]; unsigned int state; - decl_simple_lock_data(, lock) + decl_simple_lock_data(, lock); }; extern void random_bool_init(struct bool_gen * bg); diff --git a/osfmk/profiling/Makefile b/osfmk/profiling/Makefile deleted file mode 100644 index b7dc25208..000000000 --- a/osfmk/profiling/Makefile +++ /dev/null @@ -1,55 +0,0 @@ -export MakeInc_cmd=${SRCROOT}/makedefs/MakeInc.cmd -export MakeInc_def=${SRCROOT}/makedefs/MakeInc.def -export MakeInc_rule=${SRCROOT}/makedefs/MakeInc.rule -export MakeInc_dir=${SRCROOT}/makedefs/MakeInc.dir - -include $(MakeInc_cmd) -include $(MakeInc_def) - -INSTINC_SUBDIRS = \ - machine - -INSTINC_SUBDIRS_X86_64 = \ - x86_64 - -INSTINC_SUBDIRS_X86_64H = \ - x86_64 - -INSTINC_SUBDIRS_ARM = \ - arm - -INSTINC_SUBDIRS_ARM64 = \ - arm - -EXPINC_SUBDIRS = \ - machine - -EXPINC_SUBDIRS_ARM = \ - arm - -EXPINC_SUBDIRS_ARM64 = \ - arm - -EXPINC_SUBDIRS_X86_64 = \ - x86_64 - -EXPINC_SUBDIRS_X86_64H = \ - x86_64 - -DATAFILES = \ - profile-internal.h profile-mk.h profile-kgmon.c - -MIGINCLUDES = \ - -INSTALL_MI_LIST = ${DATAFILES} ${_MIG_HDRS_} ${MIGINCLUDES} - -INSTALL_MI_DIR = profile - -EXPORT_MI_LIST = ${DATAFILES} ${_MIG_HDRS_} ${MIGINCLUDES} - -EXPORT_MI_DIR = profile - -.ORDER: ${_MIG_HDRS_} ${MIGINCLUDES} - -include $(MakeInc_rule) -include $(MakeInc_dir) diff --git a/osfmk/profiling/i386/Makefile b/osfmk/profiling/i386/Makefile deleted file mode 100644 index 1253a004b..000000000 --- a/osfmk/profiling/i386/Makefile +++ /dev/null @@ -1,21 +0,0 @@ -export MakeInc_cmd=${SRCROOT}/makedefs/MakeInc.cmd -export MakeInc_def=${SRCROOT}/makedefs/MakeInc.def -export MakeInc_rule=${SRCROOT}/makedefs/MakeInc.rule -export MakeInc_dir=${SRCROOT}/makedefs/MakeInc.dir - -include $(MakeInc_cmd) -include $(MakeInc_def) - -DATAFILES = \ - profile-md.h profile-md.c profile-asm.s - -INSTALL_MD_LIST = ${DATAFILES} - -INSTALL_MD_DIR = profile/i386 - -EXPORT_MD_LIST = ${DATAFILES} - -EXPORT_MD_DIR = profile/i386 - -include $(MakeInc_rule) -include $(MakeInc_dir) diff --git a/osfmk/profiling/i386/profile-md.h b/osfmk/profiling/i386/profile-md.h deleted file mode 100644 index 942a5438e..000000000 --- a/osfmk/profiling/i386/profile-md.h +++ /dev/null @@ -1,370 +0,0 @@ -/* - * Copyright (c) 2000-2012 Apple Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* - * @OSF_COPYRIGHT@ - */ -/* - * HISTORY - * - * Revision 1.1.1.1 1998/09/22 21:05:49 wsanchez - * Import of Mac OS X kernel (~semeria) - * - * Revision 1.1.1.1 1998/03/07 02:26:08 wsanchez - * Import of OSF Mach kernel (~mburg) - * - * Revision 1.1.5.2 1996/07/31 09:57:36 paire - * Added some more constraints to __asm__ functions for compilation - * under gcc2.7.1 for PROF_CNT_[L]{ADD|SUB} macros - * [96/06/14 paire] - * - * Revision 1.1.5.1 1995/01/06 19:53:52 devrcs - * mk6 CR668 - 1.3b26 merge - * new file for mk6 - * [1994/10/12 22:25:27 dwm] - * - * Revision 1.1.2.2 1994/05/16 19:19:26 meissner - * Add {,L}PROF_CNT_{SUB,LSUB,OVERFLOW} macros for gprof command. - * [1994/05/10 10:36:06 meissner] - * - * Correct 64-bit integer asms to specify result values as inputs, and use =g instead of =m. - * Cast the integer argument to PROF_CNT_ADD to unsigned long, so a short register is widened. - * Add more support for writing the gprof command. - * PROF_CNT_{EQ,NE} should not use ^=, it just uses ^. - * Round PROF_CNT_DIGITS up to 24 bytes so it is word aligned. - * _profile_cnt_to_decimal now takes the low/high values as separate arguments. - * Delete _profile_cnt_to_hex. - * [1994/04/28 21:45:07 meissner] - * - * Add more 64 bit arithmetic macros to support writing gprof. - * [1994/04/20 15:47:05 meissner] - * - * Revision 1.1.2.1 1994/04/08 17:51:56 meissner - * Correct spelling on LPROF_CNT_TO_LDOUBLE macro. - * [1994/04/08 16:18:06 meissner] - * - * Make LHISTCOUNTER be 64 bits. - * Define LPROF_CNT_INC to increment LHISTCOUNTER. - * [1994/04/08 12:40:32 meissner] - * - * Make most stats 64 bits, except for things like memory allocation. - * [1994/04/02 14:58:34 meissner] - * - * Add overflow support for {gprof,prof,old,dummy}_mcount counters. - * [1994/03/17 20:13:37 meissner] - * - * Add gprof/prof overflow support - * [1994/03/17 14:56:56 meissner] - * - * Define LHISTCOUNTER. - * [1994/02/28 12:05:16 meissner] - * - * Set HISTFRACTION to 4, so new lprofil call takes the same space. - * [1994/02/24 16:15:34 meissner] - * - * Add too_low/too_high to profile_stats. - * [1994/02/16 22:38:23 meissner] - * - * Make prof_cnt_t unsigned long. - * [1994/02/11 16:52:09 meissner] - * - * Remember function unique ptr in gfuncs structure to reset profiling. - * Add support for range checking gprof arc {from,self}pc addresses. - * Add counter for # times acontext was locked. - * Expand copyright. - * [1994/02/07 12:41:08 meissner] - * - * Keep track of the number of times the kernel overflows the HISTCOUNTER counter. - * [1994/02/03 20:13:31 meissner] - * - * Add stats for {user,kernel,idle} mode in the kernel. - * [1994/02/03 15:17:36 meissner] - * - * No change. - * [1994/02/03 00:58:59 meissner] - * - * Combine _profile_{vars,stats,md}; Allow more than one _profile_vars. - * [1994/02/01 12:04:04 meissner] - * - * Split # records to # gprof and # prof records. - * Add my_cpu/max_cpu fields. - * [1994/01/28 23:33:30 meissner] - * - * Eliminate hash_{size,mask} from gfuncs structure. - * [1994/01/26 20:23:41 meissner] - * - * Add structure size fields to _profile_{vars,stats,md}. - * Add major/minor version number to _profile_md. - * Move allocation context block pointer to main structure. - * Delete shift count for allocation contexts. - * [1994/01/25 01:46:08 meissner] - * - * Add HASHFRACTION - * [1994/01/22 01:14:02 meissner] - * - * Split profile-md.h into profile-internal.h and profile-md. - * [1994/01/20 20:57:18 meissner] - * - * Fixup copyright. - * [1994/01/18 23:08:14 meissner] - * - * Make flags byte-sized. - * Add have_bb flag. - * Add init_format flag. - * [1994/01/18 21:57:18 meissner] - * - * CR 10198 - Initial version. - * [1994/01/18 19:44:59 meissner] - * - * $EndLog$ - */ - -#ifndef _PROFILE_MD_H -#define _PROFILE_MD_H - -#include - -/* - * Define the interfaces between the assembly language profiling support - * that is common between the kernel, mach servers, and user space library. - */ - -/* - * Integer types used. - */ - -/* - * These hold either a pointer or a signed/unsigned int. - * They are 32 bit on i386 and 64 bit on x86_64. - */ -typedef long prof_ptrint_t; -typedef unsigned long prof_uptrint_t; - -typedef long prof_lock_t; /* lock word type */ -typedef unsigned char prof_flag_t; /* type for boolean flags */ - -/* - * Double precision counter. - */ - -/* These are 64 bit on both i386 and x86_64 */ -typedef unsigned long prof_cnt_t; - -/* x86_64 */ -#define PROF_CNT_INC(cnt) (cnt++) -#define PROF_CNT_ADD(cnt, val) (cnt+=val) -#define PROF_CNT_LADD(cnt, val) (cnt+=val) -#define PROF_CNT_SUB(cnt, val) (cnt-=val) -#define PROF_CNT_LSUB(cnt, val) (cnt-=val) - -#define PROF_ULONG_TO_CNT(cnt, val) (((cnt).high = 0), ((cnt).low = val)) -#define PROF_CNT_OVERFLOW(cnt, high, low) (((high) = (cnt).high), ((low) = (cnt).low)) -#define PROF_CNT_TO_ULONG(cnt) (((cnt).high == 0) ? (cnt).low : 0xffffffffu) -#define PROF_CNT_TO_LDOUBLE(cnt) ((((long double)(cnt).high) * 4294967296.0L) + (long double)(cnt).low) -#define PROF_CNT_TO_DECIMAL(buf, cnt) _profile_cnt_to_decimal(buf, (cnt).low, (cnt).high) -#define PROF_CNT_EQ_0(cnt) (((cnt).high | (cnt).low) == 0) -#define PROF_CNT_NE_0(cnt) (((cnt).high | (cnt).low) != 0) -#define PROF_CNT_EQ(cnt1, cnt2) ((((cnt1).high ^ (cnt2).high) | ((cnt1).low ^ (cnt2).low)) == 0) -#define PROF_CNT_NE(cnt1, cnt2) ((((cnt1).high ^ (cnt2).high) | ((cnt1).low ^ (cnt2).low)) != 0) -#define PROF_CNT_GT(cnt1, cnt2) (((cnt1).high > (cnt2).high) || ((cnt1).low > (cnt2).low)) -#define PROF_CNT_LT(cnt1, cnt2) (((cnt1).high < (cnt2).high) || ((cnt1).low < (cnt2).low)) - -/* max # digits + null to hold prof_cnt_t values (round up to multiple of 4) */ -#define PROF_CNT_DIGITS 24 - -/* - * Types of the profil counter. - */ - -typedef unsigned short HISTCOUNTER; /* profil */ -typedef prof_cnt_t LHISTCOUNTER; /* lprofil */ - -#define LPROF_ULONG_TO_CNT(cnt, val) PROF_ULONG_TO_CNT(cnt,val) -#define LPROF_CNT_INC(lp) PROF_CNT_INC(lp) -#define LPROF_CNT_ADD(lp, val) PROF_CNT_ADD(lp,val) -#define LPROF_CNT_LADD(lp, val) PROF_CNT_LADD(lp,val) -#define LPROF_CNT_SUB(lp, val) PROF_CNT_SUB(lp,val) -#define LPROF_CNT_LSUB(lp, val) PROF_CNT_LSUB(lp,val) -#define LPROF_CNT_OVERFLOW(lp, high, low) PROF_CNT_OVERFLOW(lp,high,low) -#define LPROF_CNT_TO_ULONG(lp) PROF_CNT_TO_ULONG(lp) -#define LPROF_CNT_TO_LDOUBLE(lp) PROF_CNT_TO_LDOUBLE(lp) -#define LPROF_CNT_TO_DECIMAL(buf, cnt) PROF_CNT_TO_DECIMAL(buf,cnt) -#define LPROF_CNT_EQ_0(cnt) PROF_CNT_EQ_0(cnt) -#define LPROF_CNT_NE_0(cnt) PROF_CNT_NE_0(cnt) -#define LPROF_CNT_EQ(cnt1, cnt2) PROF_CNT_EQ(cnt1,cnt2) -#define LPROF_CNT_NE(cnt1, cnt2) PROF_CNT_NE(cnt1,cnt2) -#define LPROF_CNT_GT(cnt1, cnt2) PROF_CNT_GT(cnt1,cnt2) -#define LPROF_CNT_LT(cnt1, cnt2) PROF_CNT_LT(cnt1,cnt2) -#define LPROF_CNT_DIGITS PROF_CNT_DIGITS - -/* - * fraction of text space to allocate for histogram counters - */ - -#define HISTFRACTION 4 - -/* - * Fraction of text space to allocate for from hash buckets. - */ - -#define HASHFRACTION HISTFRACTION - -/* - * Prof call count, external format. - */ - -struct prof_ext { - prof_uptrint_t cvalue; /* caller address */ - prof_uptrint_t cncall; /* # of calls */ -}; - -/* - * Prof call count, internal format. - */ - -struct prof_int { - struct prof_ext prof; /* external prof struct */ - prof_uptrint_t overflow; /* # times prof counter overflowed */ -}; - -/* - * Gprof arc, external format. - */ - -struct gprof_arc { - prof_uptrint_t frompc; /* caller's caller */ - prof_uptrint_t selfpc; /* caller's address */ - prof_uptrint_t count; /* # times arc traversed */ -}; - -/* - * Gprof arc, internal format. - */ - -struct hasharc { - struct hasharc *next; /* next gprof record */ - struct gprof_arc arc; /* gprof record */ - prof_uptrint_t overflow; /* # times counter overflowed */ -}; - -/* - * Linked list of all function profile blocks. - */ - -#define MAX_CACHE 3 /* # cache table entries */ - -struct gfuncs { - struct hasharc **hash_ptr; /* gprof hash table */ - struct hasharc **unique_ptr; /* function unique pointer */ - struct prof_int prof; /* -p stats for elf */ - struct hasharc *cache_ptr[MAX_CACHE]; /* cache element pointers */ -}; - -/* - * Profile information which might be written out in ELF {,g}mon.out files. - */ - -#define MAX_BUCKETS 9 /* max bucket chain to print out */ - -struct profile_stats { /* Debugging counters */ - prof_uptrint_t major_version; /* major version number */ - prof_uptrint_t minor_version; /* minor version number */ - prof_uptrint_t stats_size; /* size of profile_vars structure */ - prof_uptrint_t profil_buckets; /* # profil buckets */ - prof_uptrint_t my_cpu; /* identify current cpu/thread */ - prof_uptrint_t max_cpu; /* identify max cpu/thread */ - prof_uptrint_t prof_records; /* # of functions profiled */ - prof_uptrint_t gprof_records; /* # of gprof arcs */ - prof_uptrint_t hash_buckets; /* # gprof hash buckets */ - prof_uptrint_t bogus_count; /* # of bogus functions found in gprof */ - - prof_cnt_t cnt; /* # of calls to _{,g}prof_mcount */ - prof_cnt_t dummy; /* # of calls to _dummy_mcount */ - prof_cnt_t old_mcount; /* # of calls to old mcount */ - prof_cnt_t hash_search; /* # hash buckets searched */ - prof_cnt_t hash_num; /* # times hash table searched */ - prof_cnt_t user_ticks; /* # ticks in user space */ - prof_cnt_t kernel_ticks; /* # ticks in kernel space */ - prof_cnt_t idle_ticks; /* # ticks in idle mode */ - prof_cnt_t overflow_ticks; /* # ticks where HISTCOUNTER overflowed */ - prof_cnt_t acontext_locked; /* # times an acontext was locked */ - prof_cnt_t too_low; /* # times a histogram tick was too low */ - prof_cnt_t too_high; /* # times a histogram tick was too high */ - prof_cnt_t prof_overflow; /* # times a prof count field overflowed */ - prof_cnt_t gprof_overflow; /* # times a gprof count field overflowed */ - - /* allocation statistics */ - prof_uptrint_t num_alloc[(int)ACONTEXT_MAX]; /* # allocations */ - prof_uptrint_t bytes_alloc[(int)ACONTEXT_MAX]; /* bytes allocated */ - prof_uptrint_t num_context[(int)ACONTEXT_MAX]; /* # contexts */ - prof_uptrint_t wasted[(int)ACONTEXT_MAX]; /* wasted bytes */ - prof_uptrint_t overhead[(int)ACONTEXT_MAX]; /* overhead bytes */ - - prof_uptrint_t buckets[MAX_BUCKETS + 1]; /* # hash indexes that have n buckets */ - prof_cnt_t cache_hits[MAX_CACHE]; /* # times nth cache entry matched */ - - prof_cnt_t stats_unused[64]; /* reserved for future use */ -}; - -#define PROFILE_MAJOR_VERSION 1 -#define PROFILE_MINOR_VERSION 1 - -/* - * Machine dependent fields. - */ - -struct profile_md { - int major_version; /* major version number */ - int minor_version; /* minor version number */ - size_t md_size; /* size of profile_md structure */ - struct hasharc **hash_ptr; /* gprof hash table */ - size_t hash_size; /* size of hash table */ - prof_uptrint_t num_cache; /* # of cache entries */ - void (*save_mcount_ptr)(void); /* save for _mcount_ptr */ - void(**mcount_ptr_ptr)(void); /* pointer to _mcount_ptr */ - struct hasharc *dummy_ptr; /* pointer to dummy gprof record */ - void *(*alloc_pages)(size_t); /* pointer to _profile_alloc_pages */ - char num_buffer[PROF_CNT_DIGITS]; /* convert 64 bit ints to string */ - long md_unused[58]; /* add unused fields */ -}; - -/* - * Record information about each function call. Specify - * caller, caller's caller, and a unique label for use by - * the profiling routines. - */ -extern void _prof_mcount(void); -extern void _gprof_mcount(void); -extern void _dummy_mcount(void); -extern void (*_mcount_ptr)(void); - -/* - * Function in profile-md.c to convert prof_cnt_t to string format (decimal & hex). - */ -extern char *_profile_cnt_to_decimal(char *, prof_uptrint_t, prof_uptrint_t); - -#endif /* _PROFILE_MD_H */ diff --git a/osfmk/profiling/machine/Makefile b/osfmk/profiling/machine/Makefile deleted file mode 100644 index 3ee985875..000000000 --- a/osfmk/profiling/machine/Makefile +++ /dev/null @@ -1,21 +0,0 @@ -export MakeInc_cmd=${SRCROOT}/makedefs/MakeInc.cmd -export MakeInc_def=${SRCROOT}/makedefs/MakeInc.def -export MakeInc_rule=${SRCROOT}/makedefs/MakeInc.rule -export MakeInc_dir=${SRCROOT}/makedefs/MakeInc.dir - -include $(MakeInc_cmd) -include $(MakeInc_def) - -DATAFILES = \ - profile-md.h - -INSTALL_MI_LIST = ${DATAFILES} - -INSTALL_MI_DIR = profile/machine - -EXPORT_MI_LIST = ${DATAFILES} - -EXPORT_MI_DIR = profile/machine - -include $(MakeInc_rule) -include $(MakeInc_dir) diff --git a/osfmk/profiling/machine/profile-md.h b/osfmk/profiling/machine/profile-md.h deleted file mode 100644 index 11a08f978..000000000 --- a/osfmk/profiling/machine/profile-md.h +++ /dev/null @@ -1,39 +0,0 @@ -/* - * Copyright (c) 2000-2007 Apple Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -#ifndef _MACH_MACHINE_PROFILE_MD_H_ -#define _MACH_MACHINE_PROFILE_MD_H_ - -#if defined (__i386__) || defined (__x86_64__) -#include "profiling/i386/profile-md.h" -#elif defined (__arm__) || defined (__arm64__) -#include "profiling/arm/profile-md.h" -#else -#error architecture not supported -#endif - -#endif /* _MACH_MACHINE_PROFILE_MD_H_ */ diff --git a/osfmk/profiling/profile-internal.h b/osfmk/profiling/profile-internal.h deleted file mode 100644 index 8f6cdfeef..000000000 --- a/osfmk/profiling/profile-internal.h +++ /dev/null @@ -1,374 +0,0 @@ -/* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* - * @OSF_COPYRIGHT@ - */ -/* - * Define the internal interfaces between the profiling support that is - * common between the kernel, mach servers, and user space library. - */ - -#ifndef _PROFILE_INTERNAL_H -#define _PROFILE_INTERNAL_H - -/* - * Allow us not to require stdio.h in kernel/server space, but - * use it in user space. - */ - -#if !defined(MACH_KERNEL) && !defined(_KERNEL) -#include -#endif - -/* - * Scaling factor for the profil system call. - */ - -#define SCALE_1_TO_1 0x10000L - - -/* - * Forward reference to structures used. - */ - -struct profile_vars; -struct profile_stats; -struct profile_md; -struct profile_dci; -struct profile_profil; -struct callback; -struct gprof_arc; -struct prof_ext; - -/* - * Profiling type - */ - -typedef enum profile_type { - PROFILE_NONE, - PROFILE_GPROF, - PROFILE_PROF -} profile_type_t; - -/* - * Whether to allocate memory in _profile_md_init. - */ - -typedef enum profile_alloc_mem { - PROFILE_ALLOC_MEM_NO, - PROFILE_ALLOC_MEM_YES -} profile_alloc_mem_t; - -/* - * Allocation context block types. - */ - -typedef enum acontext_type { - ACONTEXT_PROF, /* 0: prof records */ - ACONTEXT_GPROF, /* 1: gprof arcs */ - ACONTEXT_GFUNC, /* 2: gprof function headers */ - ACONTEXT_MISC, /* 3: misc. allocations */ - ACONTEXT_PROFIL, /* 4: profil based allocations */ - ACONTEXT_DCI, /* 5: dci based allocations */ - ACONTEXT_BASIC_BLOCK, /* 6: basic block allocations */ - ACONTEXT_CALLBACK, /* 7: callback structures */ - ACONTEXT_MAX = 32 /* # allocation contexts */ -} acontext_type_t; - -#define ACONTEXT_FIRST ACONTEXT_PROF - -#define ACONTEXT_NAMES { \ - "prof", \ - "gprof", \ - "gfunc", \ - "misc", \ - "profil", \ - "dci", \ - "bb", \ - "callback", \ - "#8", \ - "#9", \ - "#10", \ - "#11", \ - "#12", \ - "#13", \ - "#14", \ - "#15", \ - "#16", \ - "#17", \ - "#18", \ - "#19", \ - "#20", \ - "#21", \ - "#22", \ - "#23", \ - "#24", \ - "#25", \ - "#26", \ - "#27", \ - "#28", \ - "#29", \ - "#30", \ - "#31", \ - } - -/* - * Kgmon control codes - */ - -typedef enum kgmon_control { - KGMON_UNUSED, /* insure no 0 is ever used */ - KGMON_GET_STATUS, /* return whether or not profiling is active */ - KGMON_GET_PROFILE_VARS, /* return the _profile_vars structure */ - KGMON_GET_PROFILE_STATS, /* return the _profile_stats structure */ - KGMON_GET_DEBUG, /* return whether or not debugging is on */ - - KGMON_SET_PROFILE_ON = 50, /* turn on profiling */ - KGMON_SET_PROFILE_OFF, /* turn off profiling */ - KGMON_SET_PROFILE_RESET, /* reset profiling tables */ - KGMON_SET_DEBUG_ON, /* turn on debugging */ - KGMON_SET_DEBUG_OFF /* turn off debugging */ -} kgmon_control_t; - -#define KGMON_GET_MIN KGMON_GET_STATUS -#define KGMON_GET_MAX KGMON_GET_DEBUG -#define KGMON_SET_MIN KGMON_SET_PROFILE_ON -#define KGMON_SET_MAX KGMON_SET_DEBUG_OFF - -#define ENCODE_KGMON(num, control, cpu_thread) \ - ((num) = ((cpu_thread) << 8) | (control)) - -#define DECODE_KGMON(num, control, cpu_thread) \ -do { \ - control = (num) & 0xff; \ - cpu_thread = (num) >> 8; \ -} while (0) - -#define LEGAL_KGMON(num) (((unsigned long)(num)) <= 0xffff) - -/* - * Pull in all of the machine dependent types now after defining the enums. - */ - -#include - -/* - * general rounding functions. - */ - -#define ROUNDDOWN(x, y) (((x)/(y))*(y)) -#define ROUNDUP(x, y) ((((x)+(y)-1)/(y))*(y)) - -/* - * Linked list of pages allocated for a particular allocation context block. - */ - -struct page_list { - void *first; /* pointer to first byte available */ - void *ptr; /* pointer to next available byte */ - struct page_list *next; /* next page allocated */ - size_t bytes_free; /* # bytes available */ - size_t bytes_allocated; /* # bytes allocates so far */ - size_t num_allocations; /* # of allocations */ -}; - -/* - * Allocation context block. - */ - -struct alloc_context { - struct alloc_context *next; /* next allocation context block */ - struct page_list *plist; /* head of page list */ - prof_lock_t lock; /* lock field available to asm */ -}; - - -/* - * Callback structure that records information for one record in the - * profiling output. - */ - -#define STR_MAX 32 - -struct callback { - void *sec_ptr; /* callback user data */ - /* callback function */ - size_t (*callback)(struct profile_vars *, struct callback *); - long sec_val1; /* section specific value */ - long sec_val2; /* section specific value */ - size_t sec_recsize; /* record size */ - size_t sec_length; /* total length */ - char sec_name[STR_MAX]; /* section name */ -}; - -/* - * Basic profil information (except for the profil buffer). - */ - -struct profile_profil { - prof_uptrint_t lowpc; /* lowest address */ - prof_uptrint_t highpc; /* highest address */ - size_t text_len; /* highpc-lowpc */ - size_t profil_len; /* length of the profil buffer */ - size_t counter_size; /* size of indivual counters (HISTCOUNTER) */ - unsigned long scale; /* scaling factor (65536 / scale) */ - unsigned long profil_unused[8]; /* currently unused */ -}; - -/* - * Profiling internal variables. This structure is intended to be machine independent. - */ - -struct profile_vars { - int major_version; /* major version number */ - int minor_version; /* minor version number */ - size_t vars_size; /* size of profile_vars structure */ - size_t plist_size; /* size of page_list structure */ - size_t acontext_size; /* size of allocation context struct */ - size_t callback_size; /* size of callback structure */ - profile_type_t type; /* profile type */ - const char *error_msg; /* error message for perror */ - const char *filename; /* filename to write to */ - char *str_ptr; /* string table */ - -#if !defined(MACH_KERNEL) && !defined(_KERNEL) - FILE *stream; /* stdio stream to write to */ - FILE *diag_stream; /* stdio stream to write diagnostics to */ - /* function to write out some bytes */ - size_t (*fwrite_func)(const void *, size_t, size_t, FILE *); -#else - void *stream; /* pointer passed to fwrite_func */ - void *diag_stream; /* stdio stream to write diagnostics to */ - /* function to write out some bytes */ - size_t (*fwrite_func)(const void *, size_t, size_t, void *); -#endif - - size_t page_size; /* machine pagesize */ - size_t str_bytes; /* # bytes in string table */ - size_t str_total; /* # bytes allocated total for string table */ - long clock_ticks; /* # clock ticks per second */ - - /* profil related variables */ - struct profile_profil profil_info; /* profil information */ - HISTCOUNTER *profil_buf; /* profil buffer */ - - /* Profiling output selection */ - void (*output_init)(struct profile_vars *); /* output init function */ - void (*output)(struct profile_vars *); /* output function */ - void *output_ptr; /* output specific info */ - - /* allocation contexts */ - struct alloc_context *acontext[(int)ACONTEXT_MAX]; - - void (*bogus_func)(void); /* Function to use if address out of bounds */ - prof_uptrint_t vars_unused[63]; /* future growth */ - - /* Various flags */ - prof_flag_t init; /* != 0 if initialized */ - prof_flag_t active; /* != 0 if profiling is active */ - prof_flag_t do_profile; /* != 0 if profiling is being done */ - prof_flag_t use_dci; /* != 0 if using DCI */ - - prof_flag_t use_profil; /* != 0 if using profil */ - prof_flag_t recursive_alloc; /* != 0 if alloc taking place */ - prof_flag_t output_uarea; /* != 0 if output the uarea */ - prof_flag_t output_stats; /* != 0 if output the stats */ - - prof_flag_t output_clock; /* != 0 if output the clock ticks */ - prof_flag_t multiple_sections; /* != 0 if output allows multiple sections */ - prof_flag_t have_bb; /* != 0 if we have basic block data */ - prof_flag_t init_format; /* != 0 if output format has been chosen */ - - prof_flag_t debug; /* != 0 if debugging */ - prof_flag_t check_funcs; /* != 0 if check gprof arcs for being in range */ - prof_flag_t flag_unused[62]; /* space for more flags */ - - struct profile_stats stats; /* profiling statistics */ - struct profile_md md; /* machine dependent info */ -}; - -/* - * Profiling static data. - */ - -extern struct profile_vars _profile_vars; - -/* - * Functions called by the machine dependent routines, and provided by - * specific routines to the kernel, server, and user space library. - */ - -#if (__GNUC__ < 2) || (__GNUC__ == 2 && __GNUC_MINOR__ < 5) || defined(lint) -#define __attribute__(arg) -#endif - -#if defined(_KERNEL) || defined(MACH_KERNEL) -#define _profile_printf printf -#else -extern int _profile_printf(const char *, ...) __attribute__((format(printf, 1, 2))); -#endif - -extern void *_profile_alloc_pages(size_t); -extern void _profile_free_pages(void *, size_t); -extern void _profile_error(struct profile_vars *); - -/* - * Functions provided by the machine dependent files. - */ - -extern void _profile_md_init(struct profile_vars *, profile_type_t, profile_alloc_mem_t); -extern int _profile_md_start(void); -extern int _profile_md_stop(void); -extern void *_profile_alloc(struct profile_vars *, size_t, acontext_type_t); -extern size_t _gprof_write(struct profile_vars *, struct callback *); -extern size_t _prof_write(struct profile_vars *, struct callback *); -extern void _profile_update_stats(struct profile_vars *); -extern void _profile_reset(struct profile_vars *); - -#if !defined(_KERNEL) && !defined(MACH_KERNEL) -extern void _profile_print_stats(FILE *, const struct profile_stats *, const struct profile_profil *); -extern void _profile_merge_stats(struct profile_stats *, const struct profile_stats *); -#else - -/* - * Functions defined in profile-kgmon.c - */ - -extern long _profile_kgmon(int, - size_t, - long, - int, - void **, - void (*)(kgmon_control_t)); -#ifdef _KERNEL -extern void kgmon_server_control(kgmon_control_t); - -#endif /* _KERNEL */ -#endif /* _KERNEL or MACH_KERNEL */ - -#endif /* _PROFILE_INTERNAL_H */ diff --git a/osfmk/profiling/profile-kgmon.c b/osfmk/profiling/profile-kgmon.c deleted file mode 100644 index c29f61037..000000000 --- a/osfmk/profiling/profile-kgmon.c +++ /dev/null @@ -1,384 +0,0 @@ -/* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* - * @OSF_COPYRIGHT@ - */ -/* - * HISTORY - * - * Revision 1.1.1.1 1998/09/22 21:05:49 wsanchez - * Import of Mac OS X kernel (~semeria) - * - * Revision 1.1.1.1 1998/03/07 02:26:08 wsanchez - * Import of OSF Mach kernel (~mburg) - * - * Revision 1.1.5.1 1995/01/06 19:54:04 devrcs - * mk6 CR668 - 1.3b26 merge - * new file for mk6 - * [1994/10/12 22:25:34 dwm] - * - * Revision 1.1.2.1 1994/04/08 17:52:05 meissner - * Add callback function to _profile_kgmon. - * [1994/02/16 22:38:31 meissner] - * - * _profile_kgmon now returns pointer to area, doesn't do move itself. - * [1994/02/11 16:52:17 meissner] - * - * Move all printfs into if (pv->debug) { ... } blocks. - * Add debug printfs protected by if (pv->debug) for all error conditions. - * Add code to reset profiling information. - * Add code to get/set debug flag. - * Expand copyright. - * [1994/02/07 12:41:14 meissner] - * - * Add support to copy arbitrary regions. - * Delete several of the KGMON_GET commands, now that arb. regions are supported. - * Explicitly call _profile_update_stats before dumping vars or stats. - * [1994/02/03 00:59:05 meissner] - * - * Combine _profile_{vars,stats,md}; Allow more than one _profile_vars. - * [1994/02/01 12:04:09 meissner] - * - * CR 10198 - Initial version. - * [1994/01/28 23:33:37 meissner] - * - * $EndLog$ - */ - -#include - -#ifdef MACH_KERNEL -#include -#endif - -#ifndef PROFILE_VARS -#define PROFILE_VARS(cpu) (&_profile_vars) -#endif - -/* - * Kgmon interface. This returns the count of bytes moved if everything was ok, - * or -1 if there were errors. - */ - -long -_profile_kgmon(int write, - size_t count, - long indx, - int max_cpus, - void **p_ptr, - void (*control_func)(kgmon_control_t)) -{ - kgmon_control_t kgmon; - int cpu; - int error = 0; - int i; - struct profile_vars *pv; - static struct callback dummy_callback; - - *p_ptr = (void *)0; - - /* - * If the number passed is not within bounds, just copy the data directly. - */ - - if (!LEGAL_KGMON(indx)) { - *p_ptr = (void *)indx; - if (!write) { - if (PROFILE_VARS(0)->debug) { - printf("_profile_kgmon: copy %5ld bytes, from 0x%lx\n", - (long)count, - (long)indx); - } - } else { - if (PROFILE_VARS(0)->debug) { - printf("_profile_kgmon: copy %5ld bytes, to 0x%lx\n", - (long)count, - (long)indx); - } - } - - return count; - } - - /* - * Decode the record number into the component pieces. - */ - - DECODE_KGMON(indx, kgmon, cpu); - - if (PROFILE_VARS(0)->debug) { - printf("_profile_kgmon: start: kgmon control = %2d, cpu = %d, count = %ld\n", - kgmon, cpu, (long)count); - } - - /* Validate the CPU number */ - if (cpu < 0 || cpu >= max_cpus) { - if (PROFILE_VARS(0)->debug) { - printf("KGMON, bad cpu %d\n", cpu); - } - - return -1; - } else { - pv = PROFILE_VARS(cpu); - - if (!write) { - switch (kgmon) { - default: - if (PROFILE_VARS(0)->debug) { - printf("Unknown KGMON read command\n"); - } - - error = -1; - break; - - case KGMON_GET_STATUS: /* return whether or not profiling is active */ - if (cpu != 0) { - if (PROFILE_VARS(0)->debug) { - printf("KGMON_GET_STATUS: cpu = %d\n", cpu); - } - - error = -1; - break; - } - - if (count != sizeof(pv->active)) { - if (PROFILE_VARS(0)->debug) { - printf("KGMON_GET_STATUS: count = %ld, should be %ld\n", - (long)count, - (long)sizeof(pv->active)); - } - - error = -1; - break; - } - - *p_ptr = (void *)&pv->active; - break; - - case KGMON_GET_DEBUG: /* return whether or not debugging is active */ - if (cpu != 0) { - if (PROFILE_VARS(0)->debug) { - printf("KGMON_GET_DEBUG: cpu = %d\n", cpu); - } - - error = -1; - break; - } - - if (count != sizeof(pv->debug)) { - if (PROFILE_VARS(0)->debug) { - printf("KGMON_GET_DEBUG: count = %ld, should be %ld\n", - (long)count, - (long)sizeof(pv->active)); - } - - error = -1; - break; - } - - *p_ptr = (void *)&pv->debug; - break; - - case KGMON_GET_PROFILE_VARS: /* return the _profile_vars structure */ - if (count != sizeof(struct profile_vars)) { - if (PROFILE_VARS(0)->debug) { - printf("KGMON_GET_PROFILE_VARS: count = %ld, should be %ld\n", - (long)count, - (long)sizeof(struct profile_vars)); - } - - error = -1; - break; - } - - _profile_update_stats(pv); - *p_ptr = (void *)pv; - break; - - case KGMON_GET_PROFILE_STATS: /* return the _profile_stats structure */ - if (count != sizeof(struct profile_stats)) { - if (PROFILE_VARS(0)->debug) { - printf("KGMON_GET_PROFILE_STATS: count = %ld, should be = %ld\n", - (long)count, - (long)sizeof(struct profile_stats)); - } - - error = -1; - break; - } - - _profile_update_stats(pv); - *p_ptr = (void *)&pv->stats; - break; - } - } else { - switch (kgmon) { - default: - if (PROFILE_VARS(0)->debug) { - printf("Unknown KGMON write command\n"); - } - - error = -1; - break; - - case KGMON_SET_PROFILE_ON: /* turn on profiling */ - if (cpu != 0) { - if (PROFILE_VARS(0)->debug) { - printf("KGMON_SET_PROFILE_ON, cpu = %d\n", cpu); - } - - error = -1; - break; - } - - if (!PROFILE_VARS(0)->active) { - for (i = 0; i < max_cpus; i++) { - PROFILE_VARS(i)->active = 1; - } - - if (control_func) { - (*control_func)(kgmon); - } - - _profile_md_start(); - } - - count = 0; - break; - - case KGMON_SET_PROFILE_OFF: /* turn off profiling */ - if (cpu != 0) { - if (PROFILE_VARS(0)->debug) { - printf("KGMON_SET_PROFILE_OFF, cpu = %d\n", cpu); - } - - error = -1; - break; - } - - if (PROFILE_VARS(0)->active) { - for (i = 0; i < max_cpus; i++) { - PROFILE_VARS(i)->active = 0; - } - - _profile_md_stop(); - - if (control_func) { - (*control_func)(kgmon); - } - } - - count = 0; - break; - - case KGMON_SET_PROFILE_RESET: /* reset profiling */ - if (cpu != 0) { - if (PROFILE_VARS(0)->debug) { - printf("KGMON_SET_PROFILE_RESET, cpu = %d\n", cpu); - } - - error = -1; - break; - } - - for (i = 0; i < max_cpus; i++) { - _profile_reset(PROFILE_VARS(i)); - } - - if (control_func) { - (*control_func)(kgmon); - } - - count = 0; - break; - - case KGMON_SET_DEBUG_ON: /* turn on profiling */ - if (cpu != 0) { - if (PROFILE_VARS(0)->debug) { - printf("KGMON_SET_DEBUG_ON, cpu = %d\n", cpu); - } - - error = -1; - break; - } - - if (!PROFILE_VARS(0)->debug) { - for (i = 0; i < max_cpus; i++) { - PROFILE_VARS(i)->debug = 1; - } - - if (control_func) { - (*control_func)(kgmon); - } - } - - count = 0; - break; - - case KGMON_SET_DEBUG_OFF: /* turn off profiling */ - if (cpu != 0) { - if (PROFILE_VARS(0)->debug) { - printf("KGMON_SET_DEBUG_OFF, cpu = %d\n", cpu); - } - - error = -1; - break; - } - - if (PROFILE_VARS(0)->debug) { - for (i = 0; i < max_cpus; i++) { - PROFILE_VARS(i)->debug = 0; - } - - if (control_func) { - (*control_func)(kgmon); - } - } - - count = 0; - break; - } - } - } - - if (error) { - if (PROFILE_VARS(0)->debug) { - printf("_profile_kgmon: done: kgmon control = %2d, cpu = %d, error = %d\n", - kgmon, cpu, error); - } - - return -1; - } - - if (PROFILE_VARS(0)->debug) { - printf("_profile_kgmon: done: kgmon control = %2d, cpu = %d, count = %ld\n", - kgmon, cpu, (long)count); - } - - return count; -} diff --git a/osfmk/profiling/profile-mk.c b/osfmk/profiling/profile-mk.c deleted file mode 100644 index 4111735ab..000000000 --- a/osfmk/profiling/profile-mk.c +++ /dev/null @@ -1,232 +0,0 @@ -/* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* - * @OSF_COPYRIGHT@ - */ -/* - * Microkernel interface to common profiling. - */ - -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include - -extern char etext[], pstart[]; - -void * -_profile_alloc_pages(size_t size) -{ - vm_offset_t addr; - - /* - * For the MK, we can't support allocating pages at runtime, because we - * might be at interrupt level, so abort if we didn't size the table - * properly. - */ - - if (PROFILE_VARS(0)->active) { - panic("Call to _profile_alloc_pages while profiling is running."); - } - - if (kmem_alloc(kernel_map, &addr, size)) { - panic("Could not allocate memory for profiling"); - } - - memset((void *)addr, '\0', size); - if (PROFILE_VARS(0)->debug) { - printf("Allocated %d bytes for profiling, address 0x%x\n", (int)size, (int)addr); - } - - return (caddr_t)addr; -} - -void -_profile_free_pages(void *addr, size_t size) -{ - if (PROFILE_VARS(0)->debug) { - printf("Freed %d bytes for profiling, address 0x%x\n", (int)size, (int)addr); - } - - kmem_free(kernel_map, (vm_offset_t)addr, size); - return; -} - -void -_profile_error(struct profile_vars *pv) -{ - panic("Fatal error in profiling"); -} - -void -kmstartup(void) -{ - prof_uptrint_t textsize; - prof_uptrint_t monsize; - prof_uptrint_t lowpc; - prof_uptrint_t highpc; - struct profile_vars *pv; - - /* - * round lowpc and highpc to multiples of the density we're using - * so the rest of the scaling (here and in gprof) stays in ints. - */ - - lowpc = ROUNDDOWN((prof_uptrint_t)&pstart[0], HISTFRACTION * sizeof(LHISTCOUNTER)); - highpc = ROUNDUP((prof_uptrint_t)&etext[0], HISTFRACTION * sizeof(LHISTCOUNTER)); - textsize = highpc - lowpc; - monsize = (textsize / HISTFRACTION) * sizeof(LHISTCOUNTER); - - pv = PROFILE_VARS(0); - -#ifdef DEBUG_PROFILE - pv->debug = 1; -#endif - pv->page_size = PAGE_SIZE; - _profile_md_init(pv, PROFILE_GPROF, PROFILE_ALLOC_MEM_YES); - - /* Profil related variables */ - pv->profil_buf = _profile_alloc(pv, monsize, ACONTEXT_PROFIL); - pv->profil_info.highpc = highpc; - pv->profil_info.lowpc = lowpc; - pv->profil_info.text_len = textsize; - pv->profil_info.profil_len = monsize; - pv->profil_info.counter_size = sizeof(LHISTCOUNTER); - pv->profil_info.scale = 0x10000 / HISTFRACTION; - pv->stats.profil_buckets = monsize / sizeof(LHISTCOUNTER); - - /* Other gprof variables */ - pv->stats.my_cpu = 0; - pv->stats.max_cpu = 1; /* initial number of cpus */ - pv->init = 1; - pv->active = 1; - pv->use_dci = 0; - pv->use_profil = 1; - pv->check_funcs = 1; /* for now */ - - if (pv->debug) { - printf("Profiling kernel, s_textsize=%ld, monsize=%ld [0x%lx..0x%lx], cpu = %d\n", - (long)textsize, - (long)monsize, - (long)lowpc, - (long)highpc, - 0); - } - - _profile_md_start(); -} - -/* driver component */ - -int -gprofprobe(caddr_t port, void *ctlr) -{ - return 1; -} - -void -gprofattach(void) -{ - kmstartup(); - return; -} - -/* struct bus_device *gprofinfo[NGPROF]; */ -struct bus_device *gprofinfo[1]; - -struct bus_driver gprof_driver = { - gprofprobe, 0, gprofattach, 0, 0, "gprof", gprofinfo, "gprofc", 0, 0 -}; - - -io_return_t -gprofopen(dev_t dev, - int flags, - io_req_t ior) -{ - ior->io_error = D_SUCCESS; - return 0; -} - -void -gprofclose(dev_t dev) -{ - return; -} - -void -gprofstrategy(io_req_t ior) -{ - void *sys_ptr = (void *)0; - - long count = _profile_kgmon(!(ior->io_op & IO_READ), - ior->io_count, - ior->io_recnum, - 1, - &sys_ptr, - (void (*)(kgmon_control_t))0); - - if (count < 0) { - ior->io_error = D_INVALID_RECNUM; - } else { - if (count > 0 && sys_ptr != (void *)0) { - if (ior->io_op & IO_READ) { - memcpy((void *)ior->io_data, sys_ptr, count); - } else { - memcpy(sys_ptr, (void *)ior->io_data, count); - } - } - - ior->io_error = D_SUCCESS; - ior->io_residual = ior->io_count - count; - } - - iodone(ior); -} - -io_return_t -gprofread(dev_t dev, - io_req_t ior) -{ - return block_io(gprofstrategy, minphys, ior); -} - -io_return_t -gprofwrite(dev_t dev, - io_req_t ior) -{ - return block_io(gprofstrategy, minphys, ior); -} diff --git a/osfmk/profiling/profile-mk.h b/osfmk/profiling/profile-mk.h deleted file mode 100644 index 8e3690085..000000000 --- a/osfmk/profiling/profile-mk.h +++ /dev/null @@ -1,58 +0,0 @@ -/* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* - * @OSF_COPYRIGHT@ - */ -/* - * Microkernel interface to common profiling. - */ - -#include -#include -#include -#include - -/* - * JMM - We don't use these, just the BSD interfaces. - */ -#if 0 -extern void kmstartup(void); -extern int gprofprobe(caddr_t, void *); -extern void gprofattach(void); -extern int gprofopen(dev_t, int, io_req_t); -extern void gprofclose(dev_t); -extern void gprofstrategy(io_req_t); -extern int gprofread(dev_t, io_req_t); -extern int gprofwrite(dev_t, io_req_t); -#endif - -/* - * Macros to access the nth cpu's profile variable structures. - */ - -#define PROFILE_VARS(cpu) (&_profile_vars) diff --git a/osfmk/tests/kernel_tests.c b/osfmk/tests/kernel_tests.c index 01669bf9c..4748deb1c 100644 --- a/osfmk/tests/kernel_tests.c +++ b/osfmk/tests/kernel_tests.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015 Apple Inc. All rights reserved. + * Copyright (c) 2019 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -45,6 +45,7 @@ #include #include #include +#include #if !(DEVELOPMENT || DEBUG) #error "Testing is not enabled on RELEASE configurations" @@ -63,6 +64,13 @@ kern_return_t zalloc_test(void); kern_return_t RandomULong_test(void); kern_return_t kcdata_api_test(void); kern_return_t priority_queue_test(void); +kern_return_t ts_kernel_primitive_test(void); +kern_return_t ts_kernel_sleep_inheritor_test(void); +kern_return_t ts_kernel_gate_test(void); +kern_return_t ts_kernel_turnstile_chain_test(void); +kern_return_t ts_kernel_timingsafe_bcmp_test(void); + +extern kern_return_t kprintf_hhx_test(void); #if defined(__arm__) || defined(__arm64__) kern_return_t pmap_coredump_test(void); @@ -81,12 +89,18 @@ extern kern_return_t ex_cb_test(void); #if __ARM_PAN_AVAILABLE__ extern kern_return_t arm64_pan_test(void); #endif +#if defined(HAS_APPLE_PAC) +extern kern_return_t arm64_ropjop_test(void); +#endif /* defined(HAS_APPLE_PAC) */ #endif /* __arm64__ */ extern kern_return_t test_thread_call(void); -struct xnupost_panic_widget xt_panic_widgets = {NULL, NULL, NULL, NULL}; +struct xnupost_panic_widget xt_panic_widgets = {.xtp_context_p = NULL, + .xtp_outval_p = NULL, + .xtp_func_name = NULL, + .xtp_func = NULL}; struct xnupost_test kernel_post_tests[] = {XNUPOST_TEST_CONFIG_BASIC(zalloc_test), XNUPOST_TEST_CONFIG_BASIC(RandomULong_test), @@ -98,6 +112,9 @@ struct xnupost_test kernel_post_tests[] = {XNUPOST_TEST_CONFIG_BASIC(zalloc_test #if __ARM_PAN_AVAILABLE__ XNUPOST_TEST_CONFIG_BASIC(arm64_pan_test), #endif +#if defined(HAS_APPLE_PAC) + XNUPOST_TEST_CONFIG_BASIC(arm64_ropjop_test), +#endif /* defined(HAS_APPLE_PAC) */ #endif /* __arm64__ */ XNUPOST_TEST_CONFIG_BASIC(kcdata_api_test), XNUPOST_TEST_CONFIG_BASIC(console_serial_test), @@ -109,7 +126,13 @@ struct xnupost_test kernel_post_tests[] = {XNUPOST_TEST_CONFIG_BASIC(zalloc_test XNUPOST_TEST_CONFIG_BASIC(bitmap_post_test), //XNUPOST_TEST_CONFIG_TEST_PANIC(kcdata_api_assert_tests) XNUPOST_TEST_CONFIG_BASIC(test_thread_call), - XNUPOST_TEST_CONFIG_BASIC(priority_queue_test), }; + XNUPOST_TEST_CONFIG_BASIC(priority_queue_test), + XNUPOST_TEST_CONFIG_BASIC(ts_kernel_primitive_test), + XNUPOST_TEST_CONFIG_BASIC(ts_kernel_sleep_inheritor_test), + XNUPOST_TEST_CONFIG_BASIC(ts_kernel_gate_test), + XNUPOST_TEST_CONFIG_BASIC(ts_kernel_turnstile_chain_test), + XNUPOST_TEST_CONFIG_BASIC(ts_kernel_timingsafe_bcmp_test), + XNUPOST_TEST_CONFIG_BASIC(kprintf_hhx_test), }; uint32_t kernel_post_tests_count = sizeof(kernel_post_tests) / sizeof(xnupost_test_data_t); @@ -685,10 +708,34 @@ struct sample_disk_io_stats { } __attribute__((packed)); struct kcdata_subtype_descriptor test_disk_io_stats_def[] = { - {KCS_SUBTYPE_FLAGS_NONE, KC_ST_UINT64, 0 * sizeof(uint64_t), sizeof(uint64_t), "disk_reads_count"}, - {KCS_SUBTYPE_FLAGS_NONE, KC_ST_UINT64, 1 * sizeof(uint64_t), sizeof(uint64_t), "disk_reads_size"}, - {KCS_SUBTYPE_FLAGS_ARRAY, KC_ST_UINT64, 2 * sizeof(uint64_t), KCS_SUBTYPE_PACK_SIZE(4, sizeof(uint64_t)), "io_priority_count"}, - {KCS_SUBTYPE_FLAGS_ARRAY, KC_ST_UINT64, (2 + 4) * sizeof(uint64_t), sizeof(uint64_t), "io_priority_size"}, + { + .kcs_flags = KCS_SUBTYPE_FLAGS_NONE, + .kcs_elem_type = KC_ST_UINT64, + .kcs_elem_offset = 0 * sizeof(uint64_t), + .kcs_elem_size = sizeof(uint64_t), + .kcs_name = "disk_reads_count" + }, + { + .kcs_flags = KCS_SUBTYPE_FLAGS_NONE, + .kcs_elem_type = KC_ST_UINT64, + .kcs_elem_offset = 1 * sizeof(uint64_t), + .kcs_elem_size = sizeof(uint64_t), + .kcs_name = "disk_reads_size" + }, + { + .kcs_flags = KCS_SUBTYPE_FLAGS_ARRAY, + .kcs_elem_type = KC_ST_UINT64, + .kcs_elem_offset = 2 * sizeof(uint64_t), + .kcs_elem_size = KCS_SUBTYPE_PACK_SIZE(4, sizeof(uint64_t)), + .kcs_name = "io_priority_count" + }, + { + .kcs_flags = KCS_SUBTYPE_FLAGS_ARRAY, + .kcs_elem_type = KC_ST_UINT64, + .kcs_elem_offset = (2 + 4) * sizeof(uint64_t), + .kcs_elem_size = sizeof(uint64_t), + .kcs_name = "io_priority_size" + }, }; kern_return_t @@ -926,3 +973,1679 @@ pmap_coredump_test(void) return KERN_SUCCESS; } #endif + +struct ts_kern_prim_test_args { + int *end_barrier; + int *notify_b; + int *wait_event_b; + int before_num; + int *notify_a; + int *wait_event_a; + int after_num; + int priority_to_check; +}; + +static void +wait_threads( + int* var, + int num) +{ + if (var != NULL) { + while (os_atomic_load(var, acquire) != num) { + assert_wait((event_t) var, THREAD_UNINT); + if (os_atomic_load(var, acquire) != num) { + (void) thread_block(THREAD_CONTINUE_NULL); + } else { + clear_wait(current_thread(), THREAD_AWAKENED); + } + } + } +} + +static void +wake_threads( + int* var) +{ + if (var) { + os_atomic_inc(var, relaxed); + thread_wakeup((event_t) var); + } +} + +extern void IOSleep(int); + +static void +thread_lock_unlock_kernel_primitive( + void *args, + __unused wait_result_t wr) +{ + thread_t thread = current_thread(); + struct ts_kern_prim_test_args *info = (struct ts_kern_prim_test_args*) args; + int pri; + + thread_lock(thread); + pri = thread->sched_pri; + thread_unlock(thread); + + wait_threads(info->wait_event_b, info->before_num); + wake_threads(info->notify_b); + + tstile_test_prim_lock(SYSCTL_TURNSTILE_TEST_KERNEL_DEFAULT); + + wake_threads(info->notify_a); + wait_threads(info->wait_event_a, info->after_num); + + IOSleep(100); + + if (info->priority_to_check) { + thread_lock(thread); + pri = thread->sched_pri; + thread_unlock(thread); + T_ASSERT(pri == info->priority_to_check, "Priority thread: current sched %d sched wanted %d", pri, info->priority_to_check); + } + + tstile_test_prim_unlock(SYSCTL_TURNSTILE_TEST_KERNEL_DEFAULT); + + wake_threads(info->end_barrier); + thread_terminate_self(); +} + +kern_return_t +ts_kernel_primitive_test(void) +{ + thread_t owner, thread1, thread2; + struct ts_kern_prim_test_args targs[2] = {}; + kern_return_t result; + int end_barrier = 0; + int owner_locked = 0; + int waiters_ready = 0; + + T_LOG("Testing turnstile kernel primitive"); + + targs[0].notify_b = NULL; + targs[0].wait_event_b = NULL; + targs[0].before_num = 0; + targs[0].notify_a = &owner_locked; + targs[0].wait_event_a = &waiters_ready; + targs[0].after_num = 2; + targs[0].priority_to_check = 90; + targs[0].end_barrier = &end_barrier; + + // Start owner with priority 80 + result = kernel_thread_start_priority((thread_continue_t)thread_lock_unlock_kernel_primitive, &targs[0], 80, &owner); + T_ASSERT(result == KERN_SUCCESS, "Starting owner"); + + targs[1].notify_b = &waiters_ready; + targs[1].wait_event_b = &owner_locked; + targs[1].before_num = 1; + targs[1].notify_a = NULL; + targs[1].wait_event_a = NULL; + targs[1].after_num = 0; + targs[1].priority_to_check = 0; + targs[1].end_barrier = &end_barrier; + + // Start waiters with priority 85 and 90 + result = kernel_thread_start_priority((thread_continue_t)thread_lock_unlock_kernel_primitive, &targs[1], 85, &thread1); + T_ASSERT(result == KERN_SUCCESS, "Starting thread1"); + + result = kernel_thread_start_priority((thread_continue_t)thread_lock_unlock_kernel_primitive, &targs[1], 90, &thread2); + T_ASSERT(result == KERN_SUCCESS, "Starting thread2"); + + wait_threads(&end_barrier, 3); + + return KERN_SUCCESS; +} + +#define MTX_LOCK 0 +#define RW_LOCK 1 + +#define NUM_THREADS 4 + +struct synch_test_common { + unsigned int nthreads; + thread_t *threads; + int max_pri; + int test_done; +}; + +static kern_return_t +init_synch_test_common(struct synch_test_common *info, unsigned int nthreads) +{ + info->nthreads = nthreads; + info->threads = kalloc(sizeof(thread_t) * nthreads); + if (!info->threads) { + return ENOMEM; + } + + return KERN_SUCCESS; +} + +static void +destroy_synch_test_common(struct synch_test_common *info) +{ + kfree(info->threads, sizeof(thread_t) * info->nthreads); +} + +static void +start_threads(thread_continue_t func, struct synch_test_common *info, bool sleep_after_first) +{ + thread_t thread; + kern_return_t result; + uint i; + int priority = 75; + + info->test_done = 0; + + for (i = 0; i < info->nthreads; i++) { + info->threads[i] = NULL; + } + + info->max_pri = priority + (info->nthreads - 1) * 5; + if (info->max_pri > 95) { + info->max_pri = 95; + } + + for (i = 0; i < info->nthreads; i++) { + result = kernel_thread_start_priority((thread_continue_t)func, info, priority, &thread); + os_atomic_store(&info->threads[i], thread, release); + T_ASSERT(result == KERN_SUCCESS, "Starting thread %d, priority %d, %p", i, priority, thread); + + priority += 5; + + if (i == 0 && sleep_after_first) { + IOSleep(100); + } + } +} + +static unsigned int +get_max_pri(struct synch_test_common * info) +{ + return info->max_pri; +} + +static void +wait_all_thread(struct synch_test_common * info) +{ + wait_threads(&info->test_done, info->nthreads); +} + +static void +notify_waiter(struct synch_test_common * info) +{ + wake_threads(&info->test_done); +} + +static void +wait_for_waiters(struct synch_test_common *info) +{ + uint i, j; + thread_t thread; + + for (i = 0; i < info->nthreads; i++) { + j = 0; + while (os_atomic_load(&info->threads[i], acquire) == NULL) { + if (j % 100 == 0) { + IOSleep(10); + } + j++; + } + + if (info->threads[i] != current_thread()) { + j = 0; + do { + thread = os_atomic_load(&info->threads[i], relaxed); + if (thread == (thread_t) 1) { + break; + } + + if (!(thread->state & TH_RUN)) { + break; + } + + if (j % 100 == 0) { + IOSleep(100); + } + j++; + + if (thread->started == FALSE) { + continue; + } + } while (thread->state & TH_RUN); + } + } +} + +static void +exclude_current_waiter(struct synch_test_common *info) +{ + uint i, j; + + for (i = 0; i < info->nthreads; i++) { + j = 0; + while (os_atomic_load(&info->threads[i], acquire) == NULL) { + if (j % 100 == 0) { + IOSleep(10); + } + j++; + } + + if (os_atomic_load(&info->threads[i], acquire) == current_thread()) { + os_atomic_store(&info->threads[i], (thread_t)1, release); + return; + } + } +} + +struct info_sleep_inheritor_test { + struct synch_test_common head; + lck_mtx_t mtx_lock; + lck_rw_t rw_lock; + decl_lck_mtx_gate_data(, gate); + boolean_t gate_closed; + int prim_type; + boolean_t work_to_do; + unsigned int max_pri; + unsigned int steal_pri; + int synch_value; + int synch; + int value; + int handoff_failure; + thread_t thread_inheritor; +}; + +static void +primitive_lock(struct info_sleep_inheritor_test *info) +{ + switch (info->prim_type) { + case MTX_LOCK: + lck_mtx_lock(&info->mtx_lock); + break; + case RW_LOCK: + lck_rw_lock(&info->rw_lock, LCK_RW_TYPE_EXCLUSIVE); + break; + default: + panic("invalid type %d", info->prim_type); + } +} + +static void +primitive_unlock(struct info_sleep_inheritor_test *info) +{ + switch (info->prim_type) { + case MTX_LOCK: + lck_mtx_unlock(&info->mtx_lock); + break; + case RW_LOCK: + lck_rw_unlock(&info->rw_lock, LCK_RW_TYPE_EXCLUSIVE); + break; + default: + panic("invalid type %d", info->prim_type); + } +} + +static wait_result_t +primitive_sleep_with_inheritor(struct info_sleep_inheritor_test *info) +{ + wait_result_t ret = KERN_SUCCESS; + switch (info->prim_type) { + case MTX_LOCK: + ret = lck_mtx_sleep_with_inheritor(&info->mtx_lock, LCK_SLEEP_DEFAULT, (event_t) &info->thread_inheritor, info->thread_inheritor, THREAD_UNINT | THREAD_WAIT_NOREPORT_USER, TIMEOUT_WAIT_FOREVER); + break; + case RW_LOCK: + ret = lck_rw_sleep_with_inheritor(&info->rw_lock, LCK_SLEEP_DEFAULT, (event_t) &info->thread_inheritor, info->thread_inheritor, THREAD_UNINT | THREAD_WAIT_NOREPORT_USER, TIMEOUT_WAIT_FOREVER); + break; + default: + panic("invalid type %d", info->prim_type); + } + + return ret; +} + +static void +primitive_wakeup_one_with_inheritor(struct info_sleep_inheritor_test *info) +{ + switch (info->prim_type) { + case MTX_LOCK: + case RW_LOCK: + wakeup_one_with_inheritor((event_t) &info->thread_inheritor, THREAD_AWAKENED, LCK_WAKE_DEFAULT, &info->thread_inheritor); + break; + default: + panic("invalid type %d", info->prim_type); + } +} + +static void +primitive_wakeup_all_with_inheritor(struct info_sleep_inheritor_test *info) +{ + switch (info->prim_type) { + case MTX_LOCK: + case RW_LOCK: + wakeup_all_with_inheritor((event_t) &info->thread_inheritor, THREAD_AWAKENED); + break; + default: + panic("invalid type %d", info->prim_type); + } + return; +} + +static void +primitive_change_sleep_inheritor(struct info_sleep_inheritor_test *info) +{ + switch (info->prim_type) { + case MTX_LOCK: + case RW_LOCK: + change_sleep_inheritor((event_t) &info->thread_inheritor, info->thread_inheritor); + break; + default: + panic("invalid type %d", info->prim_type); + } + return; +} + +static kern_return_t +primitive_gate_try_close(struct info_sleep_inheritor_test *info) +{ + kern_return_t ret = KERN_SUCCESS; + switch (info->prim_type) { + case MTX_LOCK: + ret = lck_mtx_gate_try_close(&info->mtx_lock, &info->gate); + break; + case RW_LOCK: + ret = lck_rw_gate_try_close(&info->rw_lock, &info->gate); + break; + default: + panic("invalid type %d", info->prim_type); + } + return ret; +} + +static gate_wait_result_t +primitive_gate_wait(struct info_sleep_inheritor_test *info) +{ + gate_wait_result_t ret = GATE_OPENED; + switch (info->prim_type) { + case MTX_LOCK: + ret = lck_mtx_gate_wait(&info->mtx_lock, &info->gate, LCK_SLEEP_DEFAULT, THREAD_UNINT | THREAD_WAIT_NOREPORT_USER, TIMEOUT_WAIT_FOREVER); + break; + case RW_LOCK: + ret = lck_rw_gate_wait(&info->rw_lock, &info->gate, LCK_SLEEP_DEFAULT, THREAD_UNINT | THREAD_WAIT_NOREPORT_USER, TIMEOUT_WAIT_FOREVER); + break; + default: + panic("invalid type %d", info->prim_type); + } + return ret; +} + +static void +primitive_gate_open(struct info_sleep_inheritor_test *info) +{ + switch (info->prim_type) { + case MTX_LOCK: + lck_mtx_gate_open(&info->mtx_lock, &info->gate); + break; + case RW_LOCK: + lck_rw_gate_open(&info->rw_lock, &info->gate); + break; + default: + panic("invalid type %d", info->prim_type); + } +} + +static void +primitive_gate_close(struct info_sleep_inheritor_test *info) +{ + switch (info->prim_type) { + case MTX_LOCK: + lck_mtx_gate_close(&info->mtx_lock, &info->gate); + break; + case RW_LOCK: + lck_rw_gate_close(&info->rw_lock, &info->gate); + break; + default: + panic("invalid type %d", info->prim_type); + } +} + +static void +primitive_gate_steal(struct info_sleep_inheritor_test *info) +{ + switch (info->prim_type) { + case MTX_LOCK: + lck_mtx_gate_steal(&info->mtx_lock, &info->gate); + break; + case RW_LOCK: + lck_rw_gate_steal(&info->rw_lock, &info->gate); + break; + default: + panic("invalid type %d", info->prim_type); + } +} + +static kern_return_t +primitive_gate_handoff(struct info_sleep_inheritor_test *info, int flags) +{ + kern_return_t ret = KERN_SUCCESS; + switch (info->prim_type) { + case MTX_LOCK: + ret = lck_mtx_gate_handoff(&info->mtx_lock, &info->gate, flags); + break; + case RW_LOCK: + ret = lck_rw_gate_handoff(&info->rw_lock, &info->gate, flags); + break; + default: + panic("invalid type %d", info->prim_type); + } + return ret; +} + +static void +primitive_gate_assert(struct info_sleep_inheritor_test *info, int type) +{ + switch (info->prim_type) { + case MTX_LOCK: + lck_mtx_gate_assert(&info->mtx_lock, &info->gate, type); + break; + case RW_LOCK: + lck_rw_gate_assert(&info->rw_lock, &info->gate, type); + break; + default: + panic("invalid type %d", info->prim_type); + } +} + +static void +primitive_gate_init(struct info_sleep_inheritor_test *info) +{ + switch (info->prim_type) { + case MTX_LOCK: + lck_mtx_gate_init(&info->mtx_lock, &info->gate); + break; + case RW_LOCK: + lck_rw_gate_init(&info->rw_lock, &info->gate); + break; + default: + panic("invalid type %d", info->prim_type); + } +} + +static void +primitive_gate_destroy(struct info_sleep_inheritor_test *info) +{ + switch (info->prim_type) { + case MTX_LOCK: + lck_mtx_gate_destroy(&info->mtx_lock, &info->gate); + break; + case RW_LOCK: + lck_rw_gate_destroy(&info->rw_lock, &info->gate); + break; + default: + panic("invalid type %d", info->prim_type); + } +} + +static void +thread_inheritor_like_mutex( + void *args, + __unused wait_result_t wr) +{ + wait_result_t wait; + + struct info_sleep_inheritor_test *info = (struct info_sleep_inheritor_test*) args; + uint my_pri = current_thread()->sched_pri; + + T_LOG("Started thread pri %d %p", my_pri, current_thread()); + + /* + * spin here to start concurrently + */ + wake_threads(&info->synch); + wait_threads(&info->synch, info->synch_value); + + primitive_lock(info); + + if (info->thread_inheritor == NULL) { + info->thread_inheritor = current_thread(); + } else { + wait = primitive_sleep_with_inheritor(info); + T_ASSERT(wait == THREAD_AWAKENED || wait == THREAD_NOT_WAITING, "sleep_with_inheritor return"); + } + primitive_unlock(info); + + IOSleep(100); + info->value++; + + primitive_lock(info); + + T_ASSERT(info->thread_inheritor == current_thread(), "thread_inheritor is %p", info->thread_inheritor); + primitive_wakeup_one_with_inheritor(info); + T_LOG("woken up %p", info->thread_inheritor); + + if (info->thread_inheritor == NULL) { + T_ASSERT(info->handoff_failure == 0, "handoff failures"); + info->handoff_failure++; + } else { + T_ASSERT(info->thread_inheritor != current_thread(), "thread_inheritor is %p", info->thread_inheritor); + thread_deallocate(info->thread_inheritor); + } + + primitive_unlock(info); + + assert(current_thread()->kern_promotion_schedpri == 0); + notify_waiter((struct synch_test_common *)info); + + thread_terminate_self(); +} + +static void +thread_just_inheritor_do_work( + void *args, + __unused wait_result_t wr) +{ + struct info_sleep_inheritor_test *info = (struct info_sleep_inheritor_test*) args; + uint my_pri = current_thread()->sched_pri; + uint max_pri; + + T_LOG("Started thread pri %d %p", my_pri, current_thread()); + primitive_lock(info); + + if (info->thread_inheritor == NULL) { + info->thread_inheritor = current_thread(); + primitive_unlock(info); + T_LOG("Thread pri %d first to run %p", my_pri, current_thread()); + + wait_threads(&info->synch, info->synch_value - 1); + + wait_for_waiters((struct synch_test_common *)info); + + max_pri = get_max_pri((struct synch_test_common *) info); + T_ASSERT((uint) current_thread()->sched_pri == max_pri, "sleep_inheritor inheritor priority current is %d, should be %d", current_thread()->sched_pri, max_pri); + + os_atomic_store(&info->synch, 0, relaxed); + primitive_lock(info); + primitive_wakeup_all_with_inheritor(info); + } else { + wake_threads(&info->synch); + primitive_sleep_with_inheritor(info); + } + + primitive_unlock(info); + + assert(current_thread()->kern_promotion_schedpri == 0); + notify_waiter((struct synch_test_common *)info); + + thread_terminate_self(); +} + +static void +thread_steal_work( + void *args, + __unused wait_result_t wr) +{ + struct info_sleep_inheritor_test *info = (struct info_sleep_inheritor_test*) args; + uint my_pri = current_thread()->sched_pri; + + T_LOG("Started thread pri %d %p", my_pri, current_thread()); + primitive_lock(info); + + if (info->thread_inheritor == NULL) { + info->thread_inheritor = current_thread(); + exclude_current_waiter((struct synch_test_common *)info); + + T_LOG("Thread pri %d first to run %p", my_pri, current_thread()); + primitive_unlock(info); + + wait_threads(&info->synch, info->synch_value - 2); + + wait_for_waiters((struct synch_test_common *)info); + T_LOG("Thread pri %d first to run %p", my_pri, current_thread()); + primitive_lock(info); + if (info->thread_inheritor == current_thread()) { + primitive_wakeup_all_with_inheritor(info); + } + } else { + if (info->steal_pri == 0) { + info->steal_pri = my_pri; + info->thread_inheritor = current_thread(); + primitive_change_sleep_inheritor(info); + exclude_current_waiter((struct synch_test_common *)info); + + primitive_unlock(info); + + wait_threads(&info->synch, info->synch_value - 2); + + T_LOG("Thread pri %d stole push %p", my_pri, current_thread()); + wait_for_waiters((struct synch_test_common *)info); + + T_ASSERT((uint) current_thread()->sched_pri == info->steal_pri, "sleep_inheritor inheritor priority current is %d, should be %d", current_thread()->sched_pri, info->steal_pri); + + primitive_lock(info); + primitive_wakeup_all_with_inheritor(info); + } else { + if (my_pri > info->steal_pri) { + info->steal_pri = my_pri; + } + wake_threads(&info->synch); + primitive_sleep_with_inheritor(info); + exclude_current_waiter((struct synch_test_common *)info); + } + } + primitive_unlock(info); + + assert(current_thread()->kern_promotion_schedpri == 0); + notify_waiter((struct synch_test_common *)info); + + thread_terminate_self(); +} + +static void +thread_no_inheritor_work( + void *args, + __unused wait_result_t wr) +{ + struct info_sleep_inheritor_test *info = (struct info_sleep_inheritor_test*) args; + uint my_pri = current_thread()->sched_pri; + + T_LOG("Started thread pri %d %p", my_pri, current_thread()); + primitive_lock(info); + + info->value--; + if (info->value == 0) { + primitive_wakeup_all_with_inheritor(info); + } else { + info->thread_inheritor = NULL; + primitive_sleep_with_inheritor(info); + } + + primitive_unlock(info); + + assert(current_thread()->kern_promotion_schedpri == 0); + notify_waiter((struct synch_test_common *)info); + + thread_terminate_self(); +} + +static void +thread_mtx_work( + void *args, + __unused wait_result_t wr) +{ + struct info_sleep_inheritor_test *info = (struct info_sleep_inheritor_test*) args; + uint my_pri = current_thread()->sched_pri; + int i; + u_int8_t rand; + unsigned int mod_rand; + uint max_pri; + + T_LOG("Started thread pri %d %p", my_pri, current_thread()); + + for (i = 0; i < 10; i++) { + lck_mtx_lock(&info->mtx_lock); + if (info->thread_inheritor == NULL) { + info->thread_inheritor = current_thread(); + lck_mtx_unlock(&info->mtx_lock); + + T_LOG("Thread pri %d first to run %p", my_pri, current_thread()); + + wait_threads(&info->synch, info->synch_value - 1); + wait_for_waiters((struct synch_test_common *)info); + max_pri = get_max_pri((struct synch_test_common *) info); + T_ASSERT((uint) current_thread()->sched_pri == max_pri, "sleep_inheritor inheritor priority current is %d, should be %d", current_thread()->sched_pri, max_pri); + + os_atomic_store(&info->synch, 0, relaxed); + + lck_mtx_lock(&info->mtx_lock); + info->thread_inheritor = NULL; + wakeup_all_with_inheritor((event_t) &info->thread_inheritor, THREAD_AWAKENED); + lck_mtx_unlock(&info->mtx_lock); + continue; + } + + read_random(&rand, sizeof(rand)); + mod_rand = rand % 2; + + wake_threads(&info->synch); + switch (mod_rand) { + case 0: + lck_mtx_sleep_with_inheritor(&info->mtx_lock, LCK_SLEEP_DEFAULT, (event_t) &info->thread_inheritor, info->thread_inheritor, THREAD_UNINT | THREAD_WAIT_NOREPORT_USER, TIMEOUT_WAIT_FOREVER); + lck_mtx_unlock(&info->mtx_lock); + break; + case 1: + lck_mtx_sleep_with_inheritor(&info->mtx_lock, LCK_SLEEP_UNLOCK, (event_t) &info->thread_inheritor, info->thread_inheritor, THREAD_UNINT | THREAD_WAIT_NOREPORT_USER, TIMEOUT_WAIT_FOREVER); + break; + default: + panic("rand()mod4 returned %u (random %u)", mod_rand, rand); + } + } + + /* + * spin here to stop using the lock as mutex + */ + wake_threads(&info->synch); + wait_threads(&info->synch, info->synch_value); + + for (i = 0; i < 10; i++) { + /* read_random might sleep so read it before acquiring the mtx as spin */ + read_random(&rand, sizeof(rand)); + + lck_mtx_lock_spin(&info->mtx_lock); + if (info->thread_inheritor == NULL) { + info->thread_inheritor = current_thread(); + lck_mtx_unlock(&info->mtx_lock); + + T_LOG("Thread pri %d first to run %p", my_pri, current_thread()); + wait_for_waiters((struct synch_test_common *)info); + max_pri = get_max_pri((struct synch_test_common *) info); + T_ASSERT((uint) current_thread()->sched_pri == max_pri, "sleep_inheritor inheritor priority current is %d, should be %d", current_thread()->sched_pri, max_pri); + + lck_mtx_lock_spin(&info->mtx_lock); + info->thread_inheritor = NULL; + wakeup_all_with_inheritor((event_t) &info->thread_inheritor, THREAD_AWAKENED); + lck_mtx_unlock(&info->mtx_lock); + continue; + } + + mod_rand = rand % 2; + switch (mod_rand) { + case 0: + lck_mtx_sleep_with_inheritor(&info->mtx_lock, LCK_SLEEP_SPIN, (event_t) &info->thread_inheritor, info->thread_inheritor, THREAD_UNINT | THREAD_WAIT_NOREPORT_USER, TIMEOUT_WAIT_FOREVER); + lck_mtx_unlock(&info->mtx_lock); + break; + case 1: + lck_mtx_sleep_with_inheritor(&info->mtx_lock, LCK_SLEEP_SPIN_ALWAYS, (event_t) &info->thread_inheritor, info->thread_inheritor, THREAD_UNINT | THREAD_WAIT_NOREPORT_USER, TIMEOUT_WAIT_FOREVER); + lck_mtx_unlock(&info->mtx_lock); + break; + default: + panic("rand()mod4 returned %u (random %u)", mod_rand, rand); + } + } + assert(current_thread()->kern_promotion_schedpri == 0); + notify_waiter((struct synch_test_common *)info); + + thread_terminate_self(); +} + +static void +thread_rw_work( + void *args, + __unused wait_result_t wr) +{ + struct info_sleep_inheritor_test *info = (struct info_sleep_inheritor_test*) args; + uint my_pri = current_thread()->sched_pri; + int i; + lck_rw_type_t type; + u_int8_t rand; + unsigned int mod_rand; + uint max_pri; + + T_LOG("Started thread pri %d %p", my_pri, current_thread()); + + for (i = 0; i < 10; i++) { +try_again: + type = LCK_RW_TYPE_SHARED; + lck_rw_lock(&info->rw_lock, type); + if (info->thread_inheritor == NULL) { + type = LCK_RW_TYPE_EXCLUSIVE; + + if (lck_rw_lock_shared_to_exclusive(&info->rw_lock)) { + if (info->thread_inheritor == NULL) { + info->thread_inheritor = current_thread(); + lck_rw_unlock(&info->rw_lock, type); + wait_threads(&info->synch, info->synch_value - 1); + + T_LOG("Thread pri %d first to run %p", my_pri, current_thread()); + wait_for_waiters((struct synch_test_common *)info); + max_pri = get_max_pri((struct synch_test_common *) info); + T_ASSERT((uint) current_thread()->sched_pri == max_pri, "sleep_inheritor inheritor priority current is %d, should be %d", current_thread()->sched_pri, max_pri); + + os_atomic_store(&info->synch, 0, relaxed); + + lck_rw_lock(&info->rw_lock, type); + info->thread_inheritor = NULL; + wakeup_all_with_inheritor((event_t) &info->thread_inheritor, THREAD_AWAKENED); + lck_rw_unlock(&info->rw_lock, type); + continue; + } + } else { + goto try_again; + } + } + + read_random(&rand, sizeof(rand)); + mod_rand = rand % 4; + + wake_threads(&info->synch); + switch (mod_rand) { + case 0: + lck_rw_sleep_with_inheritor(&info->rw_lock, LCK_SLEEP_DEFAULT, (event_t) &info->thread_inheritor, info->thread_inheritor, THREAD_UNINT | THREAD_WAIT_NOREPORT_USER, TIMEOUT_WAIT_FOREVER); + lck_rw_unlock(&info->rw_lock, type); + break; + case 1: + lck_rw_sleep_with_inheritor(&info->rw_lock, LCK_SLEEP_UNLOCK, (event_t) &info->thread_inheritor, info->thread_inheritor, THREAD_UNINT | THREAD_WAIT_NOREPORT_USER, TIMEOUT_WAIT_FOREVER); + break; + case 2: + lck_rw_sleep_with_inheritor(&info->rw_lock, LCK_SLEEP_SHARED, (event_t) &info->thread_inheritor, info->thread_inheritor, THREAD_UNINT | THREAD_WAIT_NOREPORT_USER, TIMEOUT_WAIT_FOREVER); + lck_rw_unlock(&info->rw_lock, LCK_RW_TYPE_SHARED); + break; + case 3: + lck_rw_sleep_with_inheritor(&info->rw_lock, LCK_SLEEP_EXCLUSIVE, (event_t) &info->thread_inheritor, info->thread_inheritor, THREAD_UNINT | THREAD_WAIT_NOREPORT_USER, TIMEOUT_WAIT_FOREVER); + lck_rw_unlock(&info->rw_lock, LCK_RW_TYPE_EXCLUSIVE); + break; + default: + panic("rand()mod4 returned %u (random %u)", mod_rand, rand); + } + } + + assert(current_thread()->kern_promotion_schedpri == 0); + notify_waiter((struct synch_test_common *)info); + + thread_terminate_self(); +} + +static void +test_sleep_with_wake_all(struct info_sleep_inheritor_test *info, int prim_type) +{ + info->prim_type = prim_type; + info->synch = 0; + info->synch_value = info->head.nthreads; + + info->thread_inheritor = NULL; + + start_threads((thread_continue_t)thread_just_inheritor_do_work, (struct synch_test_common *)info, TRUE); + wait_all_thread((struct synch_test_common *)info); +} + +static void +test_sleep_with_wake_one(struct info_sleep_inheritor_test *info, int prim_type) +{ + info->prim_type = prim_type; + + info->synch = 0; + info->synch_value = info->head.nthreads; + info->value = 0; + info->handoff_failure = 0; + info->thread_inheritor = NULL; + + start_threads((thread_continue_t)thread_inheritor_like_mutex, (struct synch_test_common *)info, FALSE); + wait_all_thread((struct synch_test_common *)info); + + T_ASSERT(info->value == (int)info->head.nthreads, "value protected by sleep"); + T_ASSERT(info->handoff_failure == 1, "handoff failures"); +} + +static void +test_change_sleep_inheritor(struct info_sleep_inheritor_test *info, int prim_type) +{ + info->prim_type = prim_type; + + info->thread_inheritor = NULL; + info->steal_pri = 0; + info->synch = 0; + info->synch_value = info->head.nthreads; + + start_threads((thread_continue_t)thread_steal_work, (struct synch_test_common *)info, FALSE); + wait_all_thread((struct synch_test_common *)info); +} + +static void +test_no_inheritor(struct info_sleep_inheritor_test *info, int prim_type) +{ + info->prim_type = prim_type; + info->synch = 0; + info->synch_value = info->head.nthreads; + + info->thread_inheritor = NULL; + info->value = info->head.nthreads; + + start_threads((thread_continue_t)thread_no_inheritor_work, (struct synch_test_common *)info, FALSE); + wait_all_thread((struct synch_test_common *)info); +} + +static void +test_rw_lock(struct info_sleep_inheritor_test *info) +{ + info->thread_inheritor = NULL; + info->value = info->head.nthreads; + info->synch = 0; + info->synch_value = info->head.nthreads; + + start_threads((thread_continue_t)thread_rw_work, (struct synch_test_common *)info, FALSE); + wait_all_thread((struct synch_test_common *)info); +} + +static void +test_mtx_lock(struct info_sleep_inheritor_test *info) +{ + info->thread_inheritor = NULL; + info->value = info->head.nthreads; + info->synch = 0; + info->synch_value = info->head.nthreads; + + start_threads((thread_continue_t)thread_mtx_work, (struct synch_test_common *)info, FALSE); + wait_all_thread((struct synch_test_common *)info); +} + +kern_return_t +ts_kernel_sleep_inheritor_test(void) +{ + struct info_sleep_inheritor_test info = {}; + + init_synch_test_common((struct synch_test_common *)&info, NUM_THREADS); + + lck_attr_t* lck_attr = lck_attr_alloc_init(); + lck_grp_attr_t* lck_grp_attr = lck_grp_attr_alloc_init(); + lck_grp_t* lck_grp = lck_grp_alloc_init("test sleep_inheritor", lck_grp_attr); + + lck_mtx_init(&info.mtx_lock, lck_grp, lck_attr); + lck_rw_init(&info.rw_lock, lck_grp, lck_attr); + + /* + * Testing lck_mtx_sleep_with_inheritor and wakeup_all_with_inheritor + */ + T_LOG("Testing mtx sleep with inheritor and wake_all_with_inheritor"); + test_sleep_with_wake_all(&info, MTX_LOCK); + + /* + * Testing rw_mtx_sleep_with_inheritor and wakeup_all_with_inheritor + */ + T_LOG("Testing rw sleep with inheritor and wake_all_with_inheritor"); + test_sleep_with_wake_all(&info, RW_LOCK); + + /* + * Testing lck_mtx_sleep_with_inheritor and wakeup_one_with_inheritor + */ + T_LOG("Testing mtx sleep with inheritor and wake_one_with_inheritor"); + test_sleep_with_wake_one(&info, MTX_LOCK); + + /* + * Testing lck_rw_sleep_with_inheritor and wakeup_one_with_inheritor + */ + T_LOG("Testing rw sleep with inheritor and wake_one_with_inheritor"); + test_sleep_with_wake_one(&info, RW_LOCK); + + /* + * Testing lck_mtx_sleep_with_inheritor and wakeup_all_with_inheritor + * and change_sleep_inheritor + */ + T_LOG("Testing change_sleep_inheritor with mxt sleep"); + test_change_sleep_inheritor(&info, MTX_LOCK); + + /* + * Testing lck_mtx_sleep_with_inheritor and wakeup_all_with_inheritor + * and change_sleep_inheritor + */ + T_LOG("Testing change_sleep_inheritor with rw sleep"); + test_change_sleep_inheritor(&info, RW_LOCK); + + /* + * Testing lck_mtx_sleep_with_inheritor and wakeup_all_with_inheritor + * with inheritor NULL + */ + T_LOG("Testing inheritor NULL"); + test_no_inheritor(&info, MTX_LOCK); + + /* + * Testing lck_mtx_sleep_with_inheritor and wakeup_all_with_inheritor + * with inheritor NULL + */ + T_LOG("Testing inheritor NULL"); + test_no_inheritor(&info, RW_LOCK); + + /* + * Testing mtx locking combinations + */ + T_LOG("Testing mtx locking combinations"); + test_mtx_lock(&info); + + /* + * Testing rw locking combinations + */ + T_LOG("Testing rw locking combinations"); + test_rw_lock(&info); + + destroy_synch_test_common((struct synch_test_common *)&info); + + lck_attr_free(lck_attr); + lck_grp_attr_free(lck_grp_attr); + lck_rw_destroy(&info.rw_lock, lck_grp); + lck_mtx_destroy(&info.mtx_lock, lck_grp); + lck_grp_free(lck_grp); + + return KERN_SUCCESS; +} + +static void +thread_gate_aggressive( + void *args, + __unused wait_result_t wr) +{ + struct info_sleep_inheritor_test *info = (struct info_sleep_inheritor_test*) args; + uint my_pri = current_thread()->sched_pri; + + T_LOG("Started thread pri %d %p", my_pri, current_thread()); + + primitive_lock(info); + if (info->thread_inheritor == NULL) { + info->thread_inheritor = current_thread(); + primitive_gate_assert(info, GATE_ASSERT_OPEN); + primitive_gate_close(info); + exclude_current_waiter((struct synch_test_common *)info); + + primitive_unlock(info); + + wait_threads(&info->synch, info->synch_value - 2); + wait_for_waiters((struct synch_test_common *)info); + T_LOG("Thread pri %d first to run %p", my_pri, current_thread()); + + primitive_lock(info); + if (info->thread_inheritor == current_thread()) { + primitive_gate_open(info); + } + } else { + if (info->steal_pri == 0) { + info->steal_pri = my_pri; + info->thread_inheritor = current_thread(); + primitive_gate_steal(info); + exclude_current_waiter((struct synch_test_common *)info); + + primitive_unlock(info); + wait_threads(&info->synch, info->synch_value - 2); + + T_LOG("Thread pri %d stole push %p", my_pri, current_thread()); + wait_for_waiters((struct synch_test_common *)info); + T_ASSERT((uint) current_thread()->sched_pri == info->steal_pri, "gate keeper priority current is %d, should be %d", current_thread()->sched_pri, info->steal_pri); + + primitive_lock(info); + primitive_gate_open(info); + } else { + if (my_pri > info->steal_pri) { + info->steal_pri = my_pri; + } + wake_threads(&info->synch); + primitive_gate_wait(info); + exclude_current_waiter((struct synch_test_common *)info); + } + } + primitive_unlock(info); + + assert(current_thread()->kern_promotion_schedpri == 0); + notify_waiter((struct synch_test_common *)info); + + thread_terminate_self(); +} + +static void +thread_gate_like_mutex( + void *args, + __unused wait_result_t wr) +{ + gate_wait_result_t wait; + kern_return_t ret; + uint my_pri = current_thread()->sched_pri; + + struct info_sleep_inheritor_test *info = (struct info_sleep_inheritor_test*) args; + + T_LOG("Started thread pri %d %p", my_pri, current_thread()); + + /* + * spin here to start concurrently + */ + wake_threads(&info->synch); + wait_threads(&info->synch, info->synch_value); + + primitive_lock(info); + + if (primitive_gate_try_close(info) != KERN_SUCCESS) { + wait = primitive_gate_wait(info); + T_ASSERT(wait == GATE_HANDOFF, "gate_wait return"); + } + + primitive_gate_assert(info, GATE_ASSERT_HELD); + + primitive_unlock(info); + + IOSleep(100); + info->value++; + + primitive_lock(info); + + ret = primitive_gate_handoff(info, GATE_HANDOFF_DEFAULT); + if (ret == KERN_NOT_WAITING) { + T_ASSERT(info->handoff_failure == 0, "handoff failures"); + primitive_gate_handoff(info, GATE_HANDOFF_OPEN_IF_NO_WAITERS); + info->handoff_failure++; + } + + primitive_unlock(info); + notify_waiter((struct synch_test_common *)info); + + thread_terminate_self(); +} + +static void +thread_just_one_do_work( + void *args, + __unused wait_result_t wr) +{ + struct info_sleep_inheritor_test *info = (struct info_sleep_inheritor_test*) args; + uint my_pri = current_thread()->sched_pri; + uint max_pri; + + T_LOG("Started thread pri %d %p", my_pri, current_thread()); + + primitive_lock(info); +check_again: + if (info->work_to_do) { + if (primitive_gate_try_close(info) == KERN_SUCCESS) { + primitive_gate_assert(info, GATE_ASSERT_HELD); + primitive_unlock(info); + + T_LOG("Thread pri %d acquired the gate %p", my_pri, current_thread()); + wait_threads(&info->synch, info->synch_value - 1); + wait_for_waiters((struct synch_test_common *)info); + max_pri = get_max_pri((struct synch_test_common *) info); + T_ASSERT((uint) current_thread()->sched_pri == max_pri, "gate owner priority current is %d, should be %d", current_thread()->sched_pri, max_pri); + os_atomic_store(&info->synch, 0, relaxed); + + primitive_lock(info); + info->work_to_do = FALSE; + primitive_gate_open(info); + } else { + primitive_gate_assert(info, GATE_ASSERT_CLOSED); + wake_threads(&info->synch); + primitive_gate_wait(info); + goto check_again; + } + } + primitive_unlock(info); + + assert(current_thread()->kern_promotion_schedpri == 0); + notify_waiter((struct synch_test_common *)info); + thread_terminate_self(); +} + +static void +test_gate_push(struct info_sleep_inheritor_test *info, int prim_type) +{ + info->prim_type = prim_type; + + primitive_gate_init(info); + info->work_to_do = TRUE; + info->synch = 0; + info->synch_value = NUM_THREADS; + + start_threads((thread_continue_t)thread_just_one_do_work, (struct synch_test_common *) info, TRUE); + wait_all_thread((struct synch_test_common *)info); + + primitive_gate_destroy(info); +} + +static void +test_gate_handoff(struct info_sleep_inheritor_test *info, int prim_type) +{ + info->prim_type = prim_type; + + primitive_gate_init(info); + + info->synch = 0; + info->synch_value = NUM_THREADS; + info->value = 0; + info->handoff_failure = 0; + + start_threads((thread_continue_t)thread_gate_like_mutex, (struct synch_test_common *)info, false); + wait_all_thread((struct synch_test_common *)info); + + T_ASSERT(info->value == NUM_THREADS, "value protected by gate"); + T_ASSERT(info->handoff_failure == 1, "handoff failures"); + + primitive_gate_destroy(info); +} + +static void +test_gate_steal(struct info_sleep_inheritor_test *info, int prim_type) +{ + info->prim_type = prim_type; + + primitive_gate_init(info); + + info->synch = 0; + info->synch_value = NUM_THREADS; + info->thread_inheritor = NULL; + info->steal_pri = 0; + + start_threads((thread_continue_t)thread_gate_aggressive, (struct synch_test_common *)info, FALSE); + wait_all_thread((struct synch_test_common *)info); + + primitive_gate_destroy(info); +} + +kern_return_t +ts_kernel_gate_test(void) +{ + struct info_sleep_inheritor_test info = {}; + + T_LOG("Testing gate primitive"); + + init_synch_test_common((struct synch_test_common *)&info, NUM_THREADS); + + lck_attr_t* lck_attr = lck_attr_alloc_init(); + lck_grp_attr_t* lck_grp_attr = lck_grp_attr_alloc_init(); + lck_grp_t* lck_grp = lck_grp_alloc_init("test gate", lck_grp_attr); + + lck_mtx_init(&info.mtx_lock, lck_grp, lck_attr); + lck_rw_init(&info.rw_lock, lck_grp, lck_attr); + + /* + * Testing the priority inherited by the keeper + * lck_mtx_gate_try_close, lck_mtx_gate_open, lck_mtx_gate_wait + */ + T_LOG("Testing gate push, lck"); + test_gate_push(&info, MTX_LOCK); + + T_LOG("Testing gate push, rw"); + test_gate_push(&info, RW_LOCK); + + /* + * Testing the handoff + * lck_mtx_gate_wait, lck_mtx_gate_handoff + */ + T_LOG("Testing gate handoff, lck"); + test_gate_handoff(&info, MTX_LOCK); + + T_LOG("Testing gate handoff, rw"); + test_gate_handoff(&info, RW_LOCK); + + /* + * Testing the steal + * lck_mtx_gate_close, lck_mtx_gate_wait, lck_mtx_gate_steal, lck_mtx_gate_handoff + */ + T_LOG("Testing gate steal, lck"); + test_gate_steal(&info, MTX_LOCK); + + T_LOG("Testing gate steal, rw"); + test_gate_steal(&info, RW_LOCK); + + destroy_synch_test_common((struct synch_test_common *)&info); + + lck_attr_free(lck_attr); + lck_grp_attr_free(lck_grp_attr); + lck_mtx_destroy(&info.mtx_lock, lck_grp); + lck_grp_free(lck_grp); + + return KERN_SUCCESS; +} + +#define NUM_THREAD_CHAIN 6 + +struct turnstile_chain_test { + struct synch_test_common head; + lck_mtx_t mtx_lock; + int synch_value; + int synch; + int synch2; + gate_t gates[NUM_THREAD_CHAIN]; +}; + +static void +thread_sleep_gate_chain_work( + void *args, + __unused wait_result_t wr) +{ + struct turnstile_chain_test *info = (struct turnstile_chain_test*) args; + thread_t self = current_thread(); + uint my_pri = self->sched_pri; + uint max_pri; + uint i; + thread_t inheritor = NULL, woken_up; + event_t wait_event, wake_event; + kern_return_t ret; + + T_LOG("Started thread pri %d %p", my_pri, self); + + /* + * Need to use the threads ids, wait for all of them to be populated + */ + + while (os_atomic_load(&info->head.threads[info->head.nthreads - 1], acquire) == NULL) { + IOSleep(10); + } + + max_pri = get_max_pri((struct synch_test_common *) info); + + for (i = 0; i < info->head.nthreads; i = i + 2) { + // even threads will close a gate + if (info->head.threads[i] == self) { + lck_mtx_lock(&info->mtx_lock); + lck_mtx_gate_close(&info->mtx_lock, &info->gates[i]); + lck_mtx_unlock(&info->mtx_lock); + break; + } + } + + wake_threads(&info->synch2); + wait_threads(&info->synch2, info->synch_value); + + if (self == os_atomic_load(&info->head.threads[0], acquire)) { + wait_threads(&info->synch, info->synch_value - 1); + wait_for_waiters((struct synch_test_common *)info); + + T_ASSERT((uint) self->sched_pri == max_pri, "sleep_inheritor inheritor priority current is %d, should be %d", self->sched_pri, max_pri); + + lck_mtx_lock(&info->mtx_lock); + lck_mtx_gate_open(&info->mtx_lock, &info->gates[0]); + lck_mtx_unlock(&info->mtx_lock); + } else { + wait_event = NULL; + wake_event = NULL; + for (i = 0; i < info->head.nthreads; i++) { + if (info->head.threads[i] == self) { + inheritor = info->head.threads[i - 1]; + wait_event = (event_t) &info->head.threads[i - 1]; + wake_event = (event_t) &info->head.threads[i]; + break; + } + } + assert(wait_event != NULL); + + lck_mtx_lock(&info->mtx_lock); + wake_threads(&info->synch); + + if (i % 2 != 0) { + lck_mtx_gate_wait(&info->mtx_lock, &info->gates[i - 1], LCK_SLEEP_UNLOCK, THREAD_UNINT | THREAD_WAIT_NOREPORT_USER, TIMEOUT_WAIT_FOREVER); + T_ASSERT((uint) self->sched_pri == max_pri, "sleep_inheritor inheritor priority current is %d, should be %d", self->sched_pri, max_pri); + + ret = wakeup_one_with_inheritor(wake_event, THREAD_AWAKENED, LCK_WAKE_DO_NOT_TRANSFER_PUSH, &woken_up); + if (ret == KERN_SUCCESS) { + T_ASSERT(i != (info->head.nthreads - 1), "thread id"); + T_ASSERT(woken_up == info->head.threads[i + 1], "wakeup_one_with_inheritor woke next"); + } else { + T_ASSERT(i == (info->head.nthreads - 1), "thread id"); + } + + // i am still the inheritor, wake all to drop inheritership + ret = wakeup_all_with_inheritor(wake_event, LCK_WAKE_DEFAULT); + T_ASSERT(ret == KERN_NOT_WAITING, "waiters on event"); + } else { + // I previously closed a gate + lck_mtx_sleep_with_inheritor(&info->mtx_lock, LCK_SLEEP_UNLOCK, wait_event, inheritor, THREAD_UNINT | THREAD_WAIT_NOREPORT_USER, TIMEOUT_WAIT_FOREVER); + T_ASSERT((uint) self->sched_pri == max_pri, "sleep_inheritor inheritor priority current is %d, should be %d", self->sched_pri, max_pri); + + lck_mtx_lock(&info->mtx_lock); + lck_mtx_gate_open(&info->mtx_lock, &info->gates[i]); + lck_mtx_unlock(&info->mtx_lock); + } + } + + assert(current_thread()->kern_promotion_schedpri == 0); + notify_waiter((struct synch_test_common *)info); + + thread_terminate_self(); +} + +static void +thread_gate_chain_work( + void *args, + __unused wait_result_t wr) +{ + struct turnstile_chain_test *info = (struct turnstile_chain_test*) args; + thread_t self = current_thread(); + uint my_pri = self->sched_pri; + uint max_pri; + uint i; + T_LOG("Started thread pri %d %p", my_pri, self); + + + /* + * Need to use the threads ids, wait for all of them to be populated + */ + while (os_atomic_load(&info->head.threads[info->head.nthreads - 1], acquire) == NULL) { + IOSleep(10); + } + + max_pri = get_max_pri((struct synch_test_common *) info); + + for (i = 0; i < info->head.nthreads; i++) { + if (info->head.threads[i] == self) { + lck_mtx_lock(&info->mtx_lock); + lck_mtx_gate_close(&info->mtx_lock, &info->gates[i]); + lck_mtx_unlock(&info->mtx_lock); + break; + } + } + assert(i != info->head.nthreads); + + wake_threads(&info->synch2); + wait_threads(&info->synch2, info->synch_value); + + if (self == os_atomic_load(&info->head.threads[0], acquire)) { + wait_threads(&info->synch, info->synch_value - 1); + + wait_for_waiters((struct synch_test_common *)info); + + T_ASSERT((uint) self->sched_pri == max_pri, "sleep_inheritor inheritor priority current is %d, should be %d", self->sched_pri, max_pri); + + lck_mtx_lock(&info->mtx_lock); + lck_mtx_gate_open(&info->mtx_lock, &info->gates[0]); + lck_mtx_unlock(&info->mtx_lock); + } else { + lck_mtx_lock(&info->mtx_lock); + wake_threads(&info->synch); + lck_mtx_gate_wait(&info->mtx_lock, &info->gates[i - 1], LCK_SLEEP_UNLOCK, THREAD_UNINT | THREAD_WAIT_NOREPORT_USER, TIMEOUT_WAIT_FOREVER); + + T_ASSERT((uint) self->sched_pri == max_pri, "sleep_inheritor inheritor priority current is %d, should be %d", self->sched_pri, max_pri); + + lck_mtx_lock(&info->mtx_lock); + lck_mtx_gate_open(&info->mtx_lock, &info->gates[i]); + lck_mtx_unlock(&info->mtx_lock); + } + + assert(current_thread()->kern_promotion_schedpri == 0); + notify_waiter((struct synch_test_common *)info); + + thread_terminate_self(); +} + +static void +thread_sleep_chain_work( + void *args, + __unused wait_result_t wr) +{ + struct turnstile_chain_test *info = (struct turnstile_chain_test*) args; + thread_t self = current_thread(); + uint my_pri = self->sched_pri; + uint max_pri; + event_t wait_event, wake_event; + uint i; + thread_t inheritor = NULL, woken_up = NULL; + kern_return_t ret; + + T_LOG("Started thread pri %d %p", my_pri, self); + + /* + * Need to use the threads ids, wait for all of them to be populated + */ + while (os_atomic_load(&info->head.threads[info->head.nthreads - 1], acquire) == NULL) { + IOSleep(10); + } + + max_pri = get_max_pri((struct synch_test_common *) info); + + if (self == os_atomic_load(&info->head.threads[0], acquire)) { + wait_threads(&info->synch, info->synch_value - 1); + + wait_for_waiters((struct synch_test_common *)info); + + T_ASSERT((uint) self->sched_pri == max_pri, "sleep_inheritor inheritor priority current is %d, should be %d", self->sched_pri, max_pri); + + ret = wakeup_one_with_inheritor((event_t) &info->head.threads[0], THREAD_AWAKENED, LCK_WAKE_DO_NOT_TRANSFER_PUSH, &woken_up); + T_ASSERT(ret == KERN_SUCCESS, "wakeup_one_with_inheritor woke next"); + T_ASSERT(woken_up == info->head.threads[1], "thread woken up"); + + // i am still the inheritor, wake all to drop inheritership + ret = wakeup_all_with_inheritor((event_t) &info->head.threads[0], LCK_WAKE_DEFAULT); + T_ASSERT(ret == KERN_NOT_WAITING, "waiters on event"); + } else { + wait_event = NULL; + wake_event = NULL; + for (i = 0; i < info->head.nthreads; i++) { + if (info->head.threads[i] == self) { + inheritor = info->head.threads[i - 1]; + wait_event = (event_t) &info->head.threads[i - 1]; + wake_event = (event_t) &info->head.threads[i]; + break; + } + } + + assert(wait_event != NULL); + lck_mtx_lock(&info->mtx_lock); + wake_threads(&info->synch); + + lck_mtx_sleep_with_inheritor(&info->mtx_lock, LCK_SLEEP_UNLOCK, wait_event, inheritor, THREAD_UNINT | THREAD_WAIT_NOREPORT_USER, TIMEOUT_WAIT_FOREVER); + + T_ASSERT((uint) self->sched_pri == max_pri, "sleep_inheritor inheritor priority current is %d, should be %d", self->sched_pri, max_pri); + + ret = wakeup_one_with_inheritor(wake_event, THREAD_AWAKENED, LCK_WAKE_DO_NOT_TRANSFER_PUSH, &woken_up); + if (ret == KERN_SUCCESS) { + T_ASSERT(i != (info->head.nthreads - 1), "thread id"); + T_ASSERT(woken_up == info->head.threads[i + 1], "wakeup_one_with_inheritor woke next"); + } else { + T_ASSERT(i == (info->head.nthreads - 1), "thread id"); + } + + // i am still the inheritor, wake all to drop inheritership + ret = wakeup_all_with_inheritor(wake_event, LCK_WAKE_DEFAULT); + T_ASSERT(ret == KERN_NOT_WAITING, "waiters on event"); + } + + assert(current_thread()->kern_promotion_schedpri == 0); + notify_waiter((struct synch_test_common *)info); + + thread_terminate_self(); +} + +static void +test_sleep_chain(struct turnstile_chain_test *info) +{ + info->synch = 0; + info->synch_value = info->head.nthreads; + + start_threads((thread_continue_t)thread_sleep_chain_work, (struct synch_test_common *)info, FALSE); + wait_all_thread((struct synch_test_common *)info); +} + +static void +test_gate_chain(struct turnstile_chain_test *info) +{ + info->synch = 0; + info->synch2 = 0; + info->synch_value = info->head.nthreads; + + start_threads((thread_continue_t)thread_gate_chain_work, (struct synch_test_common *)info, FALSE); + wait_all_thread((struct synch_test_common *)info); +} + +static void +test_sleep_gate_chain(struct turnstile_chain_test *info) +{ + info->synch = 0; + info->synch2 = 0; + info->synch_value = info->head.nthreads; + + start_threads((thread_continue_t)thread_sleep_gate_chain_work, (struct synch_test_common *)info, FALSE); + wait_all_thread((struct synch_test_common *)info); +} + +kern_return_t +ts_kernel_turnstile_chain_test(void) +{ + struct turnstile_chain_test info = {}; + int i; + + init_synch_test_common((struct synch_test_common *)&info, NUM_THREAD_CHAIN); + lck_attr_t* lck_attr = lck_attr_alloc_init(); + lck_grp_attr_t* lck_grp_attr = lck_grp_attr_alloc_init(); + lck_grp_t* lck_grp = lck_grp_alloc_init("test gate", lck_grp_attr); + + lck_mtx_init(&info.mtx_lock, lck_grp, lck_attr); + for (i = 0; i < NUM_THREAD_CHAIN; i++) { + lck_mtx_gate_init(&info.mtx_lock, &info.gates[i]); + } + + T_LOG("Testing sleep chain, lck"); + test_sleep_chain(&info); + + T_LOG("Testing gate chain, lck"); + test_gate_chain(&info); + + T_LOG("Testing sleep and gate chain, lck"); + test_sleep_gate_chain(&info); + + destroy_synch_test_common((struct synch_test_common *)&info); + for (i = 0; i < NUM_THREAD_CHAIN; i++) { + lck_mtx_gate_destroy(&info.mtx_lock, &info.gates[i]); + } + lck_attr_free(lck_attr); + lck_grp_attr_free(lck_grp_attr); + lck_mtx_destroy(&info.mtx_lock, lck_grp); + lck_grp_free(lck_grp); + + return KERN_SUCCESS; +} + +kern_return_t +ts_kernel_timingsafe_bcmp_test(void) +{ + int i, buf_size; + char *buf = NULL; + + // empty + T_ASSERT(timingsafe_bcmp(NULL, NULL, 0) == 0, NULL); + T_ASSERT(timingsafe_bcmp("foo", "foo", 0) == 0, NULL); + T_ASSERT(timingsafe_bcmp("foo", "bar", 0) == 0, NULL); + + // equal + T_ASSERT(timingsafe_bcmp("foo", "foo", strlen("foo")) == 0, NULL); + + // unequal + T_ASSERT(timingsafe_bcmp("foo", "bar", strlen("foo")) == 1, NULL); + T_ASSERT(timingsafe_bcmp("foo", "goo", strlen("foo")) == 1, NULL); + T_ASSERT(timingsafe_bcmp("foo", "fpo", strlen("foo")) == 1, NULL); + T_ASSERT(timingsafe_bcmp("foo", "fop", strlen("foo")) == 1, NULL); + + // all possible bitwise differences + for (i = 1; i < 256; i += 1) { + unsigned char a = 0; + unsigned char b = (unsigned char)i; + + T_ASSERT(timingsafe_bcmp(&a, &b, sizeof(a)) == 1, NULL); + } + + // large + buf_size = 1024 * 16; + buf = kalloc(buf_size); + T_EXPECT_NOTNULL(buf, "kalloc of buf"); + + read_random(buf, buf_size); + T_ASSERT(timingsafe_bcmp(buf, buf, buf_size) == 0, NULL); + T_ASSERT(timingsafe_bcmp(buf, buf + 1, buf_size - 1) == 1, NULL); + T_ASSERT(timingsafe_bcmp(buf, buf + 128, 128) == 1, NULL); + + memcpy(buf + 128, buf, 128); + T_ASSERT(timingsafe_bcmp(buf, buf + 128, 128) == 0, NULL); + + kfree(buf, buf_size); + + return KERN_SUCCESS; +} + +kern_return_t +kprintf_hhx_test(void) +{ + printf("POST hhx test %hx%hx%hx%hx %hhx%hhx%hhx%hhx - %llx", + (unsigned short)0xfeed, (unsigned short)0xface, + (unsigned short)0xabad, (unsigned short)0xcafe, + (unsigned char)'h', (unsigned char)'h', (unsigned char)'x', + (unsigned char)'!', + 0xfeedfaceULL); + return KERN_SUCCESS; +} diff --git a/osfmk/tests/pmap_tests.c b/osfmk/tests/pmap_tests.c index ee73016ad..99624e77e 100644 --- a/osfmk/tests/pmap_tests.c +++ b/osfmk/tests/pmap_tests.c @@ -30,12 +30,18 @@ #include #include #include - +#if defined(__arm64__) +#include +#endif extern ledger_template_t task_ledger_template; +extern boolean_t arm_force_fast_fault(ppnum_t, vm_prot_t, int, void*); +extern kern_return_t arm_fast_fault(pmap_t, vm_map_address_t, vm_prot_t, bool, bool); + kern_return_t test_pmap_enter_disconnect(unsigned int num_loops); kern_return_t test_pmap_iommu_disconnect(void); +kern_return_t test_pmap_extended(void); #define PMAP_TEST_VA (0xDEAD << PAGE_SHIFT) @@ -46,7 +52,7 @@ typedef struct { } pmap_test_thread_args; static pmap_t -pmap_create_wrapper() +pmap_create_wrapper(unsigned int flags) { pmap_t new_pmap = NULL; ledger_t ledger; @@ -54,7 +60,7 @@ pmap_create_wrapper() if ((ledger = ledger_instantiate(task_ledger_template, LEDGER_CREATE_ACTIVE_ENTRIES)) == NULL) { return NULL; } - new_pmap = pmap_create(ledger, 0, FALSE); + new_pmap = pmap_create_options(ledger, 0, flags); ledger_dereference(ledger); return new_pmap; } @@ -74,7 +80,7 @@ test_pmap_enter_disconnect(unsigned int num_loops) { kern_return_t kr = KERN_SUCCESS; thread_t disconnect_thread; - pmap_t new_pmap = pmap_create_wrapper(); + pmap_t new_pmap = pmap_create_wrapper(0); if (new_pmap == NULL) { return KERN_FAILURE; } @@ -118,3 +124,9 @@ test_pmap_iommu_disconnect(void) { return KERN_SUCCESS; } + +kern_return_t +test_pmap_extended(void) +{ + return KERN_SUCCESS; +} diff --git a/osfmk/tests/xnupost.h b/osfmk/tests/xnupost.h index cee9312b9..4d22f2639 100644 --- a/osfmk/tests/xnupost.h +++ b/osfmk/tests/xnupost.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015 Apple Inc. All rights reserved. + * Copyright (c) 2019 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -90,13 +90,29 @@ extern uint32_t kernel_post_tests_count; extern uint32_t total_post_tests_count; #define XNUPOST_TEST_CONFIG_BASIC(func) \ - { \ - XT_CONFIG_RUN, 0, -1, T_STATE_PASS, 0, 0, 0, (func), "xnu."#func \ + { \ + .xt_config = XT_CONFIG_RUN, \ + .xt_test_num = 0, \ + .xt_retval = -1, \ + .xt_expected_retval = T_STATE_PASS, \ + .xt_begin_time = 0, \ + .xt_end_time = 0, \ + .xt_test_actions = 0, \ + .xt_func = (func), \ + .xt_name = "xnu."#func \ } #define XNUPOST_TEST_CONFIG_TEST_PANIC(func) \ - { \ - XT_CONFIG_EXPECT_PANIC, 0, -1, T_STATE_PASS, 0, 0, 0, (func), "xnu."#func \ + { \ + .xt_config = XT_CONFIG_EXPECT_PANIC, \ + .xt_test_num = 0, \ + .xt_retval = -1, \ + .xt_expected_retval = T_STATE_PASS, \ + .xt_begin_time = 0, \ + .xt_end_time = 0, \ + .xt_test_actions = 0, \ + .xt_func = (func), \ + .xt_name = "xnu."#func \ } void xnupost_init(void); diff --git a/osfmk/vm/Makefile b/osfmk/vm/Makefile index 0453c363e..d9f9cc209 100644 --- a/osfmk/vm/Makefile +++ b/osfmk/vm/Makefile @@ -9,6 +9,7 @@ include $(MakeInc_def) DATAFILES = EXPORT_ONLY_FILES = \ + memory_types.h \ pmap.h \ vm_fault.h \ vm_kern.h \ diff --git a/osfmk/vm/bsd_vm.c b/osfmk/vm/bsd_vm.c index 707f58aa9..cedf45dbc 100644 --- a/osfmk/vm/bsd_vm.c +++ b/osfmk/vm/bsd_vm.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2006 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2019 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -97,19 +97,19 @@ mach_get_vm_end(vm_map_t map) */ const struct memory_object_pager_ops vnode_pager_ops = { - vnode_pager_reference, - vnode_pager_deallocate, - vnode_pager_init, - vnode_pager_terminate, - vnode_pager_data_request, - vnode_pager_data_return, - vnode_pager_data_initialize, - vnode_pager_data_unlock, - vnode_pager_synchronize, - vnode_pager_map, - vnode_pager_last_unmap, - NULL, /* data_reclaim */ - "vnode pager" + .memory_object_reference = vnode_pager_reference, + .memory_object_deallocate = vnode_pager_deallocate, + .memory_object_init = vnode_pager_init, + .memory_object_terminate = vnode_pager_terminate, + .memory_object_data_request = vnode_pager_data_request, + .memory_object_data_return = vnode_pager_data_return, + .memory_object_data_initialize = vnode_pager_data_initialize, + .memory_object_data_unlock = vnode_pager_data_unlock, + .memory_object_synchronize = vnode_pager_synchronize, + .memory_object_map = vnode_pager_map, + .memory_object_last_unmap = vnode_pager_last_unmap, + .memory_object_data_reclaim = NULL, + .memory_object_pager_name = "vnode pager" }; typedef struct vnode_pager { @@ -985,7 +985,6 @@ vnode_pager_lookup_vnode( static int fill_vnodeinfoforaddr( vm_map_entry_t entry, uintptr_t * vnodeaddr, uint32_t * vid); - int fill_procregioninfo(task_t task, uint64_t arg, struct proc_regioninfo_internal *pinfo, uintptr_t *vnodeaddr, uint32_t *vid) { @@ -1017,30 +1016,27 @@ fill_procregioninfo(task_t task, uint64_t arg, struct proc_regioninfo_internal * if ((entry = tmp_entry->vme_next) == vm_map_to_entry(map)) { if (do_region_footprint && address == tmp_entry->vme_end) { - ledger_amount_t nonvol, nonvol_compressed; + ledger_amount_t ledger_resident; + ledger_amount_t ledger_compressed; /* * This request is right after the last valid * memory region; instead of reporting the * end of the address space, report a fake * memory region to account for non-volatile - * purgeable memory owned by this task. + * purgeable and/or ledger-tagged memory + * owned by this task. */ - - ledger_get_balance( - task->ledger, - task_ledgers.purgeable_nonvolatile, - &nonvol); - ledger_get_balance( - task->ledger, - task_ledgers.purgeable_nonvolatile_compressed, - &nonvol_compressed); - if (nonvol + nonvol_compressed == 0) { + task_ledgers_footprint(task->ledger, + &ledger_resident, + &ledger_compressed); + if (ledger_resident + ledger_compressed == 0) { /* nothing to report */ vm_map_unlock_read(map); vm_map_deallocate(map); return 0; } + /* provide fake region for purgeable */ pinfo->pri_offset = address; pinfo->pri_protection = VM_PROT_DEFAULT; @@ -1050,22 +1046,22 @@ fill_procregioninfo(task_t task, uint64_t arg, struct proc_regioninfo_internal * pinfo->pri_user_wired_count = 0; pinfo->pri_user_tag = -1; pinfo->pri_pages_resident = - (uint32_t) (nonvol / PAGE_SIZE); + (uint32_t) (ledger_resident / PAGE_SIZE); pinfo->pri_pages_shared_now_private = 0; pinfo->pri_pages_swapped_out = - (uint32_t) (nonvol_compressed / PAGE_SIZE); + (uint32_t) (ledger_compressed / PAGE_SIZE); pinfo->pri_pages_dirtied = - (uint32_t) (nonvol / PAGE_SIZE); + (uint32_t) (ledger_resident / PAGE_SIZE); pinfo->pri_ref_count = 1; pinfo->pri_shadow_depth = 0; pinfo->pri_share_mode = SM_PRIVATE; pinfo->pri_private_pages_resident = - (uint32_t) (nonvol / PAGE_SIZE); + (uint32_t) (ledger_resident / PAGE_SIZE); pinfo->pri_shared_pages_resident = 0; pinfo->pri_obj_id = INFO_MAKE_FAKE_OBJECT_ID(map, task_ledgers.purgeable_nonvolatile); pinfo->pri_address = address; pinfo->pri_size = - (uint64_t) (nonvol + nonvol_compressed); + (uint64_t) (ledger_resident + ledger_compressed); pinfo->pri_depth = 0; vm_map_unlock_read(map); @@ -1228,6 +1224,58 @@ fill_procregioninfo_onlymappedvnodes(task_t task, uint64_t arg, struct proc_regi return 0; } +int +find_region_details(task_t task, vm_map_offset_t offset, + uintptr_t *vnodeaddr, uint32_t *vid, + uint64_t *start, uint64_t *len) +{ + vm_map_t map; + vm_map_entry_t tmp_entry, entry; + int rc = 0; + + task_lock(task); + map = task->map; + if (map == VM_MAP_NULL) { + task_unlock(task); + return 0; + } + vm_map_reference(map); + task_unlock(task); + + vm_map_lock_read(map); + if (!vm_map_lookup_entry(map, offset, &tmp_entry)) { + if ((entry = tmp_entry->vme_next) == vm_map_to_entry(map)) { + rc = 0; + goto ret; + } + } else { + entry = tmp_entry; + } + + while (entry != vm_map_to_entry(map)) { + *vnodeaddr = 0; + *vid = 0; + *start = 0; + *len = 0; + + if (entry->is_sub_map == 0) { + if (fill_vnodeinfoforaddr(entry, vnodeaddr, vid)) { + *start = entry->vme_start; + *len = entry->vme_end - entry->vme_start; + rc = 1; + goto ret; + } + } + + entry = entry->vme_next; + } + +ret: + vm_map_unlock_read(map); + vm_map_deallocate(map); + return rc; +} + static int fill_vnodeinfoforaddr( vm_map_entry_t entry, diff --git a/osfmk/vm/device_vm.c b/osfmk/vm/device_vm.c index 6b478027a..377d1aacc 100644 --- a/osfmk/vm/device_vm.c +++ b/osfmk/vm/device_vm.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2006 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2019 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -66,19 +66,19 @@ /* until component support available */ const struct memory_object_pager_ops device_pager_ops = { - device_pager_reference, - device_pager_deallocate, - device_pager_init, - device_pager_terminate, - device_pager_data_request, - device_pager_data_return, - device_pager_data_initialize, - device_pager_data_unlock, - device_pager_synchronize, - device_pager_map, - device_pager_last_unmap, - NULL, /* data_reclaim */ - "device pager" + .memory_object_reference = device_pager_reference, + .memory_object_deallocate = device_pager_deallocate, + .memory_object_init = device_pager_init, + .memory_object_terminate = device_pager_terminate, + .memory_object_data_request = device_pager_data_request, + .memory_object_data_return = device_pager_data_return, + .memory_object_data_initialize = device_pager_data_initialize, + .memory_object_data_unlock = device_pager_data_unlock, + .memory_object_synchronize = device_pager_synchronize, + .memory_object_map = device_pager_map, + .memory_object_last_unmap = device_pager_last_unmap, + .memory_object_data_reclaim = NULL, + .memory_object_pager_name = "device pager" }; typedef uintptr_t device_port_t; @@ -179,6 +179,8 @@ device_pager_setup( &control); object = memory_object_control_to_vm_object(control); + memory_object_mark_trusted(control); + assert(object != VM_OBJECT_NULL); vm_object_lock(object); object->true_share = TRUE; @@ -383,7 +385,6 @@ device_pager_reference( device_object = device_pager_lookup(mem_obj); os_ref_retain(&device_object->ref_count); - DTRACE_VM2(device_pager_reference, device_pager_t, device_object, unsigned int, os_ref_get_count(&device_object->ref_count)); diff --git a/osfmk/vm/lz4.h b/osfmk/vm/lz4.h index 190f4a26f..512efd04b 100644 --- a/osfmk/vm/lz4.h +++ b/osfmk/vm/lz4.h @@ -30,7 +30,7 @@ #include #include #include -#include +#include #include "lz4_assembly_select.h" #include "lz4_constants.h" diff --git a/osfmk/vm/memory_object.c b/osfmk/vm/memory_object.c index 9a35734bc..db1574d06 100644 --- a/osfmk/vm/memory_object.c +++ b/osfmk/vm/memory_object.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2008 Apple Inc. All rights reserved. + * Copyright (c) 2000-2019 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -84,7 +84,6 @@ */ #include /* For memcpy() */ -#include #include #include /* For current_thread() */ #include @@ -106,7 +105,7 @@ #include memory_object_default_t memory_manager_default = MEMORY_OBJECT_DEFAULT_NULL; -decl_lck_mtx_data(, memory_manager_default_lock) +decl_lck_mtx_data(, memory_manager_default_lock); /* @@ -166,11 +165,6 @@ memory_object_lock_page( boolean_t should_flush, vm_prot_t prot) { - XPR(XPR_MEMORY_OBJECT, - "m_o_lock_page, page 0x%X rtn %d flush %d prot %d\n", - m, should_return, should_flush, prot, 0); - - if (m->vmp_busy || m->vmp_cleaning) { return MEMORY_OBJECT_LOCK_RESULT_MUST_BLOCK; } @@ -447,10 +441,6 @@ vm_object_sync( boolean_t rv; int flags; - XPR(XPR_VM_OBJECT, - "vm_o_sync, object 0x%X, offset 0x%X size 0x%x flush %d rtn %d\n", - object, offset, size, should_flush, should_return); - /* * Lock the object, and acquire a paging reference to * prevent the memory_object and control ports from @@ -1058,10 +1048,6 @@ vm_object_set_attributes_common( { boolean_t object_became_ready; - XPR(XPR_MEMORY_OBJECT, - "m_o_set_attr_com, object 0x%X flg %x strat %d\n", - object, (may_cache & 1), copy_strategy, 0, 0); - if (object == VM_OBJECT_NULL) { return KERN_INVALID_ARGUMENT; } @@ -1879,6 +1865,24 @@ memory_object_mark_io_tracking( } } +void +memory_object_mark_trusted( + memory_object_control_t control) +{ + vm_object_t object; + + if (control == NULL) { + return; + } + object = memory_object_control_to_vm_object(control); + + if (object != VM_OBJECT_NULL) { + vm_object_lock(object); + object->pager_trusted = TRUE; + vm_object_unlock(object); + } +} + #if CONFIG_SECLUDED_MEMORY void memory_object_mark_eligible_for_secluded( diff --git a/osfmk/vm/memory_object.h b/osfmk/vm/memory_object.h index e70c96b02..cc4eba042 100644 --- a/osfmk/vm/memory_object.h +++ b/osfmk/vm/memory_object.h @@ -145,6 +145,9 @@ extern void memory_object_mark_unused( extern void memory_object_mark_io_tracking( memory_object_control_t control); +extern void memory_object_mark_trusted( + memory_object_control_t control); + #if CONFIG_SECLUDED_MEMORY extern void memory_object_mark_eligible_for_secluded( memory_object_control_t control, diff --git a/osfmk/vm/memory_types.h b/osfmk/vm/memory_types.h new file mode 100644 index 000000000..a91846275 --- /dev/null +++ b/osfmk/vm/memory_types.h @@ -0,0 +1,43 @@ +/* + * Copyright (c) 2018 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ +/* machine independent WIMG bits */ + +#ifndef _VM_MEMORY_TYPES_H_ +#define _VM_MEMORY_TYPES_H_ + +#include + +#define VM_MEM_GUARDED 0x1 /* (G) Guarded Storage */ +#define VM_MEM_COHERENT 0x2 /* (M) Memory Coherency */ +#define VM_MEM_NOT_CACHEABLE 0x4 /* (I) Cache Inhibit */ +#define VM_MEM_WRITE_THROUGH 0x8 /* (W) Write-Through */ + +#define VM_WIMG_USE_DEFAULT 0x80 +#define VM_WIMG_MASK 0xFF + +#endif /* _VM_MEMORY_TYPES_H_ */ diff --git a/osfmk/vm/pmap.h b/osfmk/vm/pmap.h index af1838296..873bae998 100644 --- a/osfmk/vm/pmap.h +++ b/osfmk/vm/pmap.h @@ -119,6 +119,7 @@ extern boolean_t pmap_has_managed_page(ppnum_t first, ppnum_t last); #include #include +#include /* * Routines used for initialization. @@ -133,6 +134,7 @@ extern boolean_t pmap_has_managed_page(ppnum_t first, ppnum_t last); */ extern void *pmap_steal_memory(vm_size_t size); /* Early memory allocation */ +extern void *pmap_steal_freeable_memory(vm_size_t size); /* Early memory allocation */ extern uint_t pmap_free_pages(void); /* report remaining unused physical pages */ @@ -140,8 +142,6 @@ extern void pmap_startup(vm_offset_t *startp, vm_offset_t *endp); /* allocate vm extern void pmap_init(void); /* Initialization, once we have kernel virtual memory. */ -extern void pmap_pv_fixup(vm_offset_t start, vm_size_t size); - extern void mapping_adjust(void); /* Adjust free mapping count */ extern void mapping_free_prime(void); /* Primes the mapping block release list */ @@ -150,7 +150,7 @@ extern void mapping_free_prime(void); /* Primes the mapping block release list * /* * If machine/pmap.h defines MACHINE_PAGES, it must implement * the above functions. The pmap module has complete control. - * Otherwise, it must implement + * Otherwise, it must implement the following functions: * pmap_free_pages * pmap_virtual_space * pmap_next_page @@ -163,34 +163,31 @@ extern void mapping_free_prime(void); /* Primes the mapping block release list * * However, for best performance pmap_free_pages should be accurate. */ -extern boolean_t pmap_next_page(ppnum_t *pnum); -extern boolean_t pmap_next_page_hi(ppnum_t *pnum); -/* During VM initialization, - * return the next unused - * physical page. +/* + * Routines to return the next unused physical page. + */ +extern boolean_t pmap_next_page(ppnum_t *pnum); +extern boolean_t pmap_next_page_hi(ppnum_t *pnum, boolean_t might_free); +#ifdef __x86_64__ +extern kern_return_t pmap_next_page_large(ppnum_t *pnum); +extern void pmap_hi_pages_done(void); +#endif + +/* + * Report virtual space available for the kernel. */ -extern void pmap_virtual_space( +extern void pmap_virtual_space( vm_offset_t *virtual_start, vm_offset_t *virtual_end); -/* During VM initialization, - * report virtual space - * available for the kernel. - */ #endif /* MACHINE_PAGES */ /* - * Routines to manage the physical map data structure. + * Routines to manage the physical map data structure. */ -extern pmap_t pmap_create( /* Create a pmap_t. */ +extern pmap_t pmap_create_options( /* Create a pmap_t. */ ledger_t ledger, vm_map_size_t size, - boolean_t is_64bit); -#if __x86_64__ -extern pmap_t pmap_create_options( - ledger_t ledger, - vm_map_size_t size, - int flags); -#endif + unsigned int flags); extern pmap_t(pmap_kernel)(void); /* Return the kernel's pmap */ extern void pmap_reference(pmap_t pmap); /* Gain a reference. */ @@ -330,9 +327,9 @@ extern void pmap_sync_page_attributes_phys(ppnum_t pa); * the given physical page is mapped into no pmap. * pmap_assert_free() will panic() if pn is not free. */ -extern boolean_t pmap_verify_free(ppnum_t pn); +extern boolean_t pmap_verify_free(ppnum_t pn); #if MACH_ASSERT -extern void pmap_assert_free(ppnum_t pn); +extern void pmap_assert_free(ppnum_t pn); #endif /* @@ -649,24 +646,14 @@ extern void pmap_clear_noencrypt(ppnum_t pn); extern pmap_t kernel_pmap; /* The kernel's map */ #define pmap_kernel() (kernel_pmap) -/* machine independent WIMG bits */ - -#define VM_MEM_GUARDED 0x1 /* (G) Guarded Storage */ -#define VM_MEM_COHERENT 0x2 /* (M) Memory Coherency */ -#define VM_MEM_NOT_CACHEABLE 0x4 /* (I) Cache Inhibit */ -#define VM_MEM_WRITE_THROUGH 0x8 /* (W) Write-Through */ - -#define VM_WIMG_USE_DEFAULT 0x80 -#define VM_WIMG_MASK 0xFF - #define VM_MEM_SUPERPAGE 0x100 /* map a superpage instead of a base page */ #define VM_MEM_STACK 0x200 -#if __x86_64__ /* N.B. These use the same numerical space as the PMAP_EXPAND_OPTIONS * definitions in i386/pmap_internal.h */ #define PMAP_CREATE_64BIT 0x1 +#if __x86_64__ #define PMAP_CREATE_EPT 0x2 #define PMAP_CREATE_KNOWN_FLAGS (PMAP_CREATE_64BIT | PMAP_CREATE_EPT) #endif @@ -718,7 +705,9 @@ extern void pmap_remove_options( /* Remove mappings. */ extern void fillPage(ppnum_t pa, unsigned int fill); #if defined(__LP64__) -void pmap_pre_expand(pmap_t pmap, vm_map_offset_t vaddr); +extern void pmap_pre_expand(pmap_t pmap, vm_map_offset_t vaddr); +extern kern_return_t pmap_pre_expand_large(pmap_t pmap, vm_map_offset_t vaddr); +extern vm_size_t pmap_query_pagesize(pmap_t map, vm_map_offset_t vaddr); #endif mach_vm_size_t pmap_query_resident(pmap_t pmap, @@ -774,12 +763,16 @@ kern_return_t pmap_pgtrace_fault(pmap_t pmap, vm_map_offset_t va, arm_saved_stat #endif +#ifdef PLATFORM_BridgeOS struct pmap_legacy_trust_cache { struct pmap_legacy_trust_cache *next; uuid_t uuid; uint32_t num_hashes; uint8_t hashes[][CS_CDHASH_LEN]; }; +#else +struct pmap_legacy_trust_cache; +#endif extern kern_return_t pmap_load_legacy_trust_cache(struct pmap_legacy_trust_cache *trust_cache, const vm_size_t trust_cache_len); @@ -815,6 +808,15 @@ extern pmap_tc_ret_t pmap_load_image4_trust_cache( vm_size_t img4_manifest_actual_len, bool dry_run); +extern bool pmap_is_trust_cache_loaded(const uuid_t uuid); +extern uint32_t pmap_lookup_in_static_trust_cache(const uint8_t cdhash[CS_CDHASH_LEN]); +extern bool pmap_lookup_in_loaded_trust_caches(const uint8_t cdhash[CS_CDHASH_LEN]); + +extern bool pmap_in_ppl(void); + +extern void *pmap_claim_reserved_ppl_page(void); +extern void pmap_free_reserved_ppl_page(void *kva); + extern void pmap_ledger_alloc_init(size_t); extern ledger_t pmap_ledger_alloc(void); extern void pmap_ledger_free(ledger_t); diff --git a/osfmk/vm/vm_apple_protect.c b/osfmk/vm/vm_apple_protect.c index 416e90fa2..1b7922574 100644 --- a/osfmk/vm/vm_apple_protect.c +++ b/osfmk/vm/vm_apple_protect.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2006 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2019 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -123,19 +123,19 @@ void crypt_info_deallocate(struct pager_crypt_info *crypt_info); * These routines are invoked by VM via the memory_object_*() interfaces. */ const struct memory_object_pager_ops apple_protect_pager_ops = { - apple_protect_pager_reference, - apple_protect_pager_deallocate, - apple_protect_pager_init, - apple_protect_pager_terminate, - apple_protect_pager_data_request, - apple_protect_pager_data_return, - apple_protect_pager_data_initialize, - apple_protect_pager_data_unlock, - apple_protect_pager_synchronize, - apple_protect_pager_map, - apple_protect_pager_last_unmap, - NULL, /* data_reclaim */ - "apple_protect" + .memory_object_reference = apple_protect_pager_reference, + .memory_object_deallocate = apple_protect_pager_deallocate, + .memory_object_init = apple_protect_pager_init, + .memory_object_terminate = apple_protect_pager_terminate, + .memory_object_data_request = apple_protect_pager_data_request, + .memory_object_data_return = apple_protect_pager_data_return, + .memory_object_data_initialize = apple_protect_pager_data_initialize, + .memory_object_data_unlock = apple_protect_pager_data_unlock, + .memory_object_synchronize = apple_protect_pager_synchronize, + .memory_object_map = apple_protect_pager_map, + .memory_object_last_unmap = apple_protect_pager_last_unmap, + .memory_object_data_reclaim = NULL, + .memory_object_pager_name = "apple_protect" }; /* @@ -167,7 +167,7 @@ typedef struct apple_protect_pager { int apple_protect_pager_count = 0; /* number of pagers */ int apple_protect_pager_count_mapped = 0; /* number of unmapped pagers */ queue_head_t apple_protect_pager_queue; -decl_lck_mtx_data(, apple_protect_pager_lock) +decl_lck_mtx_data(, apple_protect_pager_lock); /* * Maximum number of unmapped pagers we're willing to keep around. @@ -511,24 +511,13 @@ retry_src_fault: dst_pnum = (ppnum_t) upl_phys_page(upl_pl, (int)(cur_offset / PAGE_SIZE)); assert(dst_pnum != 0); -#if __x86_64__ - src_vaddr = (vm_map_offset_t) - PHYSMAP_PTOV((pmap_paddr_t)VM_PAGE_GET_PHYS_PAGE(src_page) - << PAGE_SHIFT); - dst_vaddr = (vm_map_offset_t) - PHYSMAP_PTOV((pmap_paddr_t)dst_pnum << PAGE_SHIFT); -#elif __arm__ || __arm64__ src_vaddr = (vm_map_offset_t) phystokv((pmap_paddr_t)VM_PAGE_GET_PHYS_PAGE(src_page) << PAGE_SHIFT); dst_vaddr = (vm_map_offset_t) phystokv((pmap_paddr_t)dst_pnum << PAGE_SHIFT); -#else -#error "vm_paging_map_object: no 1-to-1 kernel mapping of physical memory..." - src_vaddr = 0; - dst_vaddr = 0; -#endif + src_page_object = VM_PAGE_OBJECT(src_page); /* @@ -1164,6 +1153,8 @@ apple_protect_pager_create( &control); assert(kr == KERN_SUCCESS); + memory_object_mark_trusted(control); + lck_mtx_lock(&apple_protect_pager_lock); /* the new pager is now ready to be used */ pager->is_ready = TRUE; diff --git a/osfmk/vm/vm_compressor.c b/osfmk/vm/vm_compressor.c index ee77679da..6c5a42214 100644 --- a/osfmk/vm/vm_compressor.c +++ b/osfmk/vm/vm_compressor.c @@ -44,9 +44,12 @@ #include #include -#if !CONFIG_EMBEDDED +#if defined(__x86_64__) #include #endif +#if defined(__arm64__) +#include +#endif #include @@ -595,7 +598,12 @@ vm_compressor_init(void) PE_parse_boot_argn("vm_compression_limit", &vm_compression_limit, sizeof(vm_compression_limit)); #ifdef CONFIG_EMBEDDED +#if XNU_TARGET_OS_WATCH + // rdar://problem/51012698 + vm_compressor_minorcompact_threshold_divisor = 40; +#else vm_compressor_minorcompact_threshold_divisor = 20; +#endif vm_compressor_majorcompact_threshold_divisor = 30; vm_compressor_unthrottle_threshold_divisor = 40; vm_compressor_catchup_threshold_divisor = 60; @@ -641,7 +649,7 @@ vm_compressor_init(void) compressor_pool_max_size = C_SEG_MAX_LIMIT; compressor_pool_max_size *= C_SEG_BUFSIZE; -#if defined(__x86_64__) +#if !CONFIG_EMBEDDED if (vm_compression_limit == 0) { if (max_mem <= (4ULL * 1024ULL * 1024ULL * 1024ULL)) { @@ -873,7 +881,7 @@ c_seg_validate(c_segment_t c_seg, boolean_t must_be_compact) if (c_size) { uintptr_t csvaddr = (uintptr_t) &c_seg->c_store.c_buffer[cs->c_offset]; if (cs->c_pop_cdata != (csvpop = vmc_pop(csvaddr, c_size))) { - panic("Compressed data popcount doesn't match original, bit distance: %d %p (phys: %p) %p %p 0x%llx 0x%x 0x%x 0x%x", (csvpop - cs->c_pop_cdata), (void *)csvaddr, (void *) kvtophys(csvaddr), c_seg, cs, cs->c_offset, c_size, csvpop, cs->c_pop_cdata); + panic("Compressed data popcount doesn't match original, bit distance: %d %p (phys: %p) %p %p 0x%llx 0x%x 0x%x 0x%x", (csvpop - cs->c_pop_cdata), (void *)csvaddr, (void *) kvtophys(csvaddr), c_seg, cs, (uint64_t)cs->c_offset, c_size, csvpop, cs->c_pop_cdata); } } #endif @@ -1088,12 +1096,14 @@ c_seg_switch_state(c_segment_t c_seg, int new_state, boolean_t insert_head) { int old_state = c_seg->c_state; -#if __i386__ || __x86_64__ +#if !CONFIG_EMBEDDED +#if DEVELOPMENT || DEBUG if (new_state != C_IS_FILLING) { LCK_MTX_ASSERT(&c_seg->c_lock, LCK_MTX_ASSERT_OWNED); } LCK_MTX_ASSERT(c_list_lock, LCK_MTX_ASSERT_OWNED); #endif +#endif /* !CONFIG_EMBEDDED */ switch (old_state) { case C_IS_EMPTY: assert(new_state == C_IS_FILLING || new_state == C_IS_FREE); @@ -3048,16 +3058,6 @@ c_current_seg_filled(c_segment_t c_seg, c_segment_t *current_chead) unused_bytes = trunc_page_32(C_SEG_OFFSET_TO_BYTES(c_seg->c_populated_offset - c_seg->c_nextoffset)); -#ifndef _OPEN_SOURCE - /* TODO: The HW codec can generate, lazily, a '2nd page not mapped' - * exception. So on such a platform, or platforms where we're confident - * the codec does not require a buffer page to absorb trailing writes, - * we can create an unmapped hole at the tail of the segment, rather - * than a populated mapping. This will also guarantee that the codec - * does not overwrite valid data past the edge of the segment and - * thus eliminate the depopulation overhead. - */ -#endif if (unused_bytes) { offset_to_depopulate = C_SEG_BYTES_TO_OFFSET(round_page_32(C_SEG_OFFSET_TO_BYTES(c_seg->c_nextoffset))); @@ -3561,8 +3561,16 @@ sv_compression: static inline void sv_decompress(int32_t *ddst, int32_t pattern) { -#if __x86_64__ +// assert(__builtin_constant_p(PAGE_SIZE) != 0); +#if defined(__x86_64__) memset_word(ddst, pattern, PAGE_SIZE / sizeof(int32_t)); +#elif defined(__arm64__) + assert((PAGE_SIZE % 128) == 0); + if (pattern == 0) { + fill32_dczva((addr64_t)ddst, PAGE_SIZE); + } else { + fill32_nt((addr64_t)ddst, PAGE_SIZE, pattern); + } #else size_t i; @@ -3570,9 +3578,10 @@ sv_decompress(int32_t *ddst, int32_t pattern) * compiler to emit NEON stores, cf. * Loop autovectorization * anomalies. - * We use separate loops for each PAGE_SIZE + */ + /* * We use separate loops for each PAGE_SIZE * to allow the autovectorizer to engage, as PAGE_SIZE - * is currently not a constant. + * may not be a constant. */ __unreachable_ok_push @@ -3758,7 +3767,7 @@ bypass_busy_check: unsigned csvpop; uintptr_t csvaddr = (uintptr_t) &c_seg->c_store.c_buffer[cs->c_offset]; if (cs->c_pop_cdata != (csvpop = vmc_pop(csvaddr, c_size))) { - panic("Compressed data popcount doesn't match original, bit distance: %d %p (phys: %p) %p %p 0x%llx 0x%x 0x%x 0x%x", (csvpop - cs->c_pop_cdata), (void *)csvaddr, (void *) kvtophys(csvaddr), c_seg, cs, cs->c_offset, c_size, csvpop, cs->c_pop_cdata); + panic("Compressed data popcount doesn't match original, bit distance: %d %p (phys: %p) %p %p 0x%x 0x%x 0x%x 0x%x", (csvpop - cs->c_pop_cdata), (void *)csvaddr, (void *) kvtophys(csvaddr), c_seg, cs, cs->c_offset, c_size, csvpop, cs->c_pop_cdata); } #endif @@ -3987,17 +3996,7 @@ vm_compressor_get(ppnum_t pn, int *slot, int flags) */ dptr = (int32_t *)(uintptr_t)dst; data = c_segment_sv_hash_table[slot_ptr->s_cindx].he_data; -#if __x86_64__ - memset_word(dptr, data, PAGE_SIZE / sizeof(int32_t)); -#else - { - int i; - - for (i = 0; i < (int)(PAGE_SIZE / sizeof(int32_t)); i++) { - *dptr++ = data; - } - } -#endif + sv_decompress(dptr, data); if (!(flags & C_KEEP)) { c_segment_sv_hash_drop_ref(slot_ptr->s_cindx); diff --git a/osfmk/vm/vm_compressor_backing_store.c b/osfmk/vm/vm_compressor_backing_store.c index 8f6971fb4..4874789d5 100644 --- a/osfmk/vm/vm_compressor_backing_store.c +++ b/osfmk/vm/vm_compressor_backing_store.c @@ -425,6 +425,16 @@ vm_compressor_swap_init() panic("vm_swapfile_gc_thread: create failed"); } thread_set_thread_name(thread, "VM_swapfile_gc"); + + /* + * Swapfile garbage collection will need to allocate memory + * to complete its swap reclaim and in-memory compaction. + * So allow it to dip into the reserved VM page pool. + */ + thread_lock(thread); + thread->options |= TH_OPT_VMPRIV; + thread_unlock(thread); + thread_deallocate(thread); proc_set_thread_policy_with_tid(kernel_task, thread->thread_id, @@ -679,6 +689,10 @@ vm_swapfile_create_thread(void) break; } + if (compressor_store_stop_compaction == TRUE) { + break; + } + clock_get_system_nanotime(&sec, &nsec); if (VM_SWAP_SHOULD_CREATE(sec) == 0) { @@ -700,6 +714,10 @@ vm_swapfile_create_thread(void) thread_wakeup((event_t)&hibernate_in_progress_with_pinned_swap); } + if (compressor_store_stop_compaction == TRUE) { + thread_wakeup((event_t)&compressor_store_stop_compaction); + } + assert_wait((event_t)&vm_swapfile_create_needed, THREAD_UNINT); lck_mtx_unlock(&vm_swap_data_lock); @@ -813,6 +831,10 @@ vm_swapfile_gc_thread(void) thread_wakeup((event_t)&hibernate_in_progress_with_pinned_swap); } + if (compressor_store_stop_compaction == TRUE) { + thread_wakeup((event_t)&compressor_store_stop_compaction); + } + assert_wait((event_t)&vm_swapfile_gc_needed, THREAD_UNINT); lck_mtx_unlock(&vm_swap_data_lock); @@ -1110,8 +1132,6 @@ again: soc->swp_io_busy = 1; vm_swapout_soc_busy++; } - vm_swapout_thread_throttle_adjust(); - vm_pageout_io_throttle(); c_seg_is_empty: if (c_swapout_count == 0) { @@ -1123,6 +1143,12 @@ c_seg_is_empty: if ((soc = vm_swapout_find_done_soc())) { vm_swapout_complete_soc(soc); } + lck_mtx_unlock_always(c_list_lock); + + vm_swapout_thread_throttle_adjust(); + vm_pageout_io_throttle(); + + lck_mtx_lock_spin_always(c_list_lock); } if ((soc = vm_swapout_find_done_soc())) { vm_swapout_complete_soc(soc); diff --git a/osfmk/vm/vm_compressor_pager.c b/osfmk/vm/vm_compressor_pager.c index 6eda97684..a0a93f882 100644 --- a/osfmk/vm/vm_compressor_pager.c +++ b/osfmk/vm/vm_compressor_pager.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2013 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2019 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -63,6 +63,8 @@ #include #include +#include + #include #include #include @@ -119,19 +121,19 @@ kern_return_t compressor_memory_object_data_reclaim( __unused boolean_t reclaim_backing_store); const struct memory_object_pager_ops compressor_pager_ops = { - compressor_memory_object_reference, - compressor_memory_object_deallocate, - compressor_memory_object_init, - compressor_memory_object_terminate, - compressor_memory_object_data_request, - compressor_memory_object_data_return, - compressor_memory_object_data_initialize, - compressor_memory_object_data_unlock, - compressor_memory_object_synchronize, - compressor_memory_object_map, - compressor_memory_object_last_unmap, - compressor_memory_object_data_reclaim, - "compressor pager" + .memory_object_reference = compressor_memory_object_reference, + .memory_object_deallocate = compressor_memory_object_deallocate, + .memory_object_init = compressor_memory_object_init, + .memory_object_terminate = compressor_memory_object_terminate, + .memory_object_data_request = compressor_memory_object_data_request, + .memory_object_data_return = compressor_memory_object_data_return, + .memory_object_data_initialize = compressor_memory_object_data_initialize, + .memory_object_data_unlock = compressor_memory_object_data_unlock, + .memory_object_synchronize = compressor_memory_object_synchronize, + .memory_object_map = compressor_memory_object_map, + .memory_object_last_unmap = compressor_memory_object_last_unmap, + .memory_object_data_reclaim = compressor_memory_object_data_reclaim, + .memory_object_pager_name = "compressor pager" }; /* internal data structures */ @@ -662,7 +664,7 @@ compressor_pager_slot_lookup( * This memory barrier should take care of this * according to the platform requirements. */ - __c11_atomic_thread_fence(memory_order_release); + os_atomic_thread_fence(release); chunk = pager->cpgr_slots.cpgr_islots[chunk_idx] = t_chunk; t_chunk = NULL; diff --git a/osfmk/vm/vm_fault.c b/osfmk/vm/vm_fault.c index 331777917..277c96487 100644 --- a/osfmk/vm/vm_fault.c +++ b/osfmk/vm/vm_fault.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2009 Apple Inc. All rights reserved. + * Copyright (c) 2000-2018 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -82,7 +82,6 @@ #include #include #include -#include #include #include #include @@ -114,6 +113,8 @@ #define TRACEFAULTPAGE 0 /* (TEST/DEBUG) */ +int vm_protect_privileged_from_untrusted = 1; + unsigned int vm_object_pagein_throttle = 16; /* @@ -151,6 +152,12 @@ uint64_t vm_hard_throttle_threshold; #define VM_PAGE_CREATION_THROTTLE_RATE_PER_SEC 20000 +#define VM_STAT_DECOMPRESSIONS() \ +MACRO_BEGIN \ + VM_STAT_INCR(decompressions); \ + current_thread()->decompressions++; \ +MACRO_END + boolean_t current_thread_aborted(void); /* Forward declarations of internal routines. */ @@ -203,7 +210,7 @@ uint64_t vm_cs_defer_to_pmap_cs = 0; uint64_t vm_cs_defer_to_pmap_cs_not = 0; #endif /* PMAP_CS */ -void vm_pre_fault(vm_map_offset_t); +void vm_pre_fault(vm_map_offset_t, vm_prot_t); extern char *kdp_compressor_decompressed_page; extern addr64_t kdp_compressor_decompressed_page_paddr; @@ -270,6 +277,8 @@ vm_fault_init(void) PE_get_default("kern.vm_compressor", &vm_compressor_mode, sizeof(vm_compressor_mode)); } printf("\"vm_compressor_mode\" is %d\n", vm_compressor_mode); + + PE_parse_boot_argn("vm_protect_privileged_from_untrusted", &vm_protect_privileged_from_untrusted, sizeof(vm_protect_privileged_from_untrusted)); } void @@ -987,11 +996,6 @@ vm_fault_page( first_m = VM_PAGE_NULL; access_required = fault_type; - - XPR(XPR_VM_FAULT, - "vm_f_page: obj 0x%X, offset 0x%X, type %d, prot %d\n", - object, offset, fault_type, *protection, 0); - /* * default type of fault */ @@ -1081,10 +1085,6 @@ vm_fault_page( #endif wait_result = PAGE_SLEEP(object, m, interruptible); - XPR(XPR_VM_FAULT, - "vm_f_page: block busy obj 0x%X, offset 0x%X, page 0x%X\n", - object, offset, - m, 0, 0); counter(c_vm_fault_page_block_busy_kernel++); if (wait_result != THREAD_AWAKENED) { @@ -1207,12 +1207,6 @@ vm_fault_page( return error; } - XPR(XPR_VM_FAULT, - "vm_f_page: zero obj 0x%X, off 0x%X, page 0x%X, first_obj 0x%X\n", - object, offset, - m, - first_object, 0); - if (object != first_object) { /* * free the absent page we just found @@ -1270,11 +1264,6 @@ vm_fault_page( vm_page_queues_remove(m, FALSE); vm_page_unlock_queues(); } - XPR(XPR_VM_FAULT, - "vm_f_page: unavail obj 0x%X, off 0x%X, next_obj 0x%X, newoff 0x%X\n", - object, offset, - next_object, - offset + object->vo_shadow_offset, 0); offset += object->vo_shadow_offset; fault_info->lo_offset += object->vo_shadow_offset; @@ -1310,10 +1299,6 @@ vm_fault_page( #if TRACEFAULTPAGE dbgTrace(0xBEEF0009, (unsigned int) m, (unsigned int) offset); /* (TEST/DEBUG) */ #endif - XPR(XPR_VM_FAULT, - "vm_f_page: cleaning obj 0x%X, offset 0x%X, page 0x%X\n", - object, offset, - m, 0, 0); /* * take an extra ref so that object won't die */ @@ -1391,9 +1376,6 @@ vm_fault_page( #if TRACEFAULTPAGE dbgTrace(0xBEEF000B, (unsigned int) m, (unsigned int) 0); /* (TEST/DEBUG) */ #endif - XPR(XPR_VM_FAULT, - "vm_f_page: found page obj 0x%X, offset 0x%X, page 0x%X\n", - object, offset, m, 0, 0); assert(!m->vmp_busy); assert(!m->vmp_absent); @@ -1476,10 +1458,6 @@ vm_fault_page( VM_PAGE_FREE(m); } - XPR(XPR_VM_FAULT, - "vm_f_page: ready wait obj 0x%X, offset 0x%X\n", - object, offset, 0, 0, 0); - /* * take an extra ref so object won't die */ @@ -1729,11 +1707,6 @@ vm_fault_page( wants_copy_flag = VM_PROT_NONE; } - XPR(XPR_VM_FAULT, - "vm_f_page: data_req obj 0x%X, offset 0x%X, page 0x%X, acc %d\n", - object, offset, m, - access_required | wants_copy_flag, 0); - if (object->copy == first_object) { /* * if we issue the memory_object_data_request in @@ -1878,11 +1851,6 @@ dont_look_for_page: assert(m == VM_PAGE_NULL); } - XPR(XPR_VM_FAULT, - "vm_f_page: no pager obj 0x%X, offset 0x%X, page 0x%X, next_obj 0x%X\n", - object, offset, m, - object->shadow, 0); - next_object = object->shadow; if (next_object == VM_OBJECT_NULL) { @@ -1985,11 +1953,6 @@ dont_look_for_page: !first_m->vmp_active && !first_m->vmp_inactive && !first_m->vmp_secluded)); #endif /* EXTRA_ASSERTIONS */ - XPR(XPR_VM_FAULT, - "vm_f_page: FOUND obj 0x%X, off 0x%X, page 0x%X, 1_obj 0x%X, 1_m 0x%X\n", - object, offset, m, - first_object, first_m); - /* * If the page is being written, but isn't * already owned by the top-level object, @@ -2038,10 +2001,6 @@ dont_look_for_page: return VM_FAULT_MEMORY_SHORTAGE; } - XPR(XPR_VM_FAULT, - "vm_f_page: page_copy obj 0x%X, offset 0x%X, m 0x%X, copy_m 0x%X\n", - object, offset, - m, copy_m, 0); vm_page_copy(m, copy_m); @@ -2373,10 +2332,6 @@ done: *result_page = m; *top_page = first_m; - XPR(XPR_VM_FAULT, - "vm_f_page: DONE obj 0x%X, offset 0x%X, m 0x%X, first_m 0x%X\n", - object, offset, m, first_m, 0); - if (m != VM_PAGE_NULL) { assert(VM_PAGE_OBJECT(m) == object); @@ -2410,7 +2365,7 @@ done: vm_fault_is_sequential(object, offset, fault_info->behavior); vm_fault_deactivate_behind(object, offset, fault_info->behavior); } else if (my_fault == DBG_COMPRESSOR_FAULT || my_fault == DBG_COMPRESSOR_SWAPIN_FAULT) { - VM_STAT_INCR(decompressions); + VM_STAT_DECOMPRESSIONS(); } if (type_of_fault) { *type_of_fault = my_fault; @@ -2778,21 +2733,29 @@ vm_fault_enter(vm_page_t m, pathname_len = __PATH_MAX; filename = pathname + pathname_len; filename_len = __PATH_MAX; + + if (vnode_pager_get_object_name(file_object->pager, + pathname, + pathname_len, + filename, + filename_len, + &truncated_path) == KERN_SUCCESS) { + /* safety first... */ + pathname[__PATH_MAX - 1] = '\0'; + filename[__PATH_MAX - 1] = '\0'; + + vnode_pager_get_object_mtime(file_object->pager, + &mtime, + &cs_mtime); + } else { + kfree(pathname, __PATH_MAX * 2); + pathname = NULL; + filename = NULL; + pathname_len = 0; + filename_len = 0; + truncated_path = FALSE; + } } - vnode_pager_get_object_name(file_object->pager, - pathname, - pathname_len, - filename, - filename_len, - &truncated_path); - if (pathname) { - /* safety first... */ - pathname[__PATH_MAX - 1] = '\0'; - filename[__PATH_MAX - 1] = '\0'; - } - vnode_pager_get_object_mtime(file_object->pager, - &mtime, - &cs_mtime); } printf("CODE SIGNING: process %d[%s]: " "rejecting invalid page at address 0x%llx " @@ -2886,13 +2849,21 @@ vm_fault_enter(vm_page_t m, } if (panic_on_cs_killed && object->object_is_shared_cache) { + char *tainted_contents; + vm_map_offset_t src_vaddr; + src_vaddr = (vm_map_offset_t) phystokv((pmap_paddr_t)VM_PAGE_GET_PHYS_PAGE(m) << PAGE_SHIFT); + tainted_contents = kalloc(PAGE_SIZE); + bcopy((const char *)src_vaddr, tainted_contents, PAGE_SIZE); + printf("CODE SIGNING: tainted page %p phys 0x%x phystokv 0x%llx copied to %p\n", m, VM_PAGE_GET_PHYS_PAGE(m), (uint64_t)src_vaddr, tainted_contents); panic("CODE SIGNING: process %d[%s]: " - "rejecting invalid page at address 0x%llx " + "rejecting invalid page (phys#0x%x) at address 0x%llx " "from offset 0x%llx in file \"%s%s%s\" " "(cs_mtime:%lu.%ld %s mtime:%lu.%ld) " "(signed:%d validated:%d tainted:%d nx:%d" "wpmapped:%d dirty:%d depth:%d)\n", - pid, procname, (addr64_t) vaddr, + pid, procname, + VM_PAGE_GET_PHYS_PAGE(m), + (addr64_t) vaddr, file_offset, (pathname ? pathname : ""), (truncated_path ? "/.../" : ""), @@ -3261,20 +3232,10 @@ MACRO_END } #endif /* VM_OBJECT_ACCESS_TRACKING */ + #if PMAP_CS - /* - * If CS enforcement is on, we don't ask for an executable page if the - * fault does not call for execution, because that can fail in - * situations where the caller only actually wanted read access. - * However, it may be better to instead retry without execute on - * failure, or pass a flag into pmap_enter to do the right thing. - */ - // TODO: maybe do something better than masking out VM_PROT_EXECUTE on non-execute faults - if (pmap_cs_enforced(pmap) && !(caller_prot & VM_PROT_EXECUTE)) { - prot &= ~VM_PROT_EXECUTE; - } +pmap_enter_retry: #endif - /* Prevent a deadlock by not * holding the object lock if we need to wait for a page in * pmap_enter() - */ @@ -3282,6 +3243,18 @@ MACRO_END wired, pmap_options | PMAP_OPTIONS_NOWAIT, pe_result); +#if PMAP_CS + /* + * Retry without execute permission if we encountered a codesigning + * failure on a non-execute fault. This allows applications which + * don't actually need to execute code to still map it for read access. + */ + if ((pe_result == KERN_CODESIGN_ERROR) && pmap_cs_enforced(pmap) && + (prot & VM_PROT_EXECUTE) && !(caller_prot & VM_PROT_EXECUTE)) { + prot &= ~VM_PROT_EXECUTE; + goto pmap_enter_retry; + } +#endif #if __x86_64__ if (pe_result == KERN_INVALID_ARGUMENT && pmap == PMAP_NULL && @@ -3351,12 +3324,12 @@ after_the_pmap_enter: } void -vm_pre_fault(vm_map_offset_t vaddr) +vm_pre_fault(vm_map_offset_t vaddr, vm_prot_t prot) { if (pmap_find_phys(current_map()->pmap, vaddr) == 0) { vm_fault(current_map(), /* map */ vaddr, /* vaddr */ - VM_PROT_READ, /* fault_type */ + prot, /* fault_type */ FALSE, /* change_wiring */ VM_KERN_MEMORY_NONE, /* tag - not wiring */ THREAD_UNINT, /* interruptible */ @@ -3418,6 +3391,14 @@ vm_fault( NULL); } +static boolean_t +current_proc_is_privileged(void) +{ + return csproc_get_platform_binary(current_proc()); +} + +uint64_t vm_copied_on_read = 0; + kern_return_t vm_fault_internal( vm_map_t map, @@ -3468,13 +3449,16 @@ vm_fault_internal( int throttle_delay; int compressed_count_delta; int grab_options; + boolean_t need_copy; + boolean_t need_copy_on_read; vm_map_offset_t trace_vaddr; vm_map_offset_t trace_real_vaddr; -#if DEVELOPMENT || DEBUG vm_map_offset_t real_vaddr; + boolean_t resilient_media_retry = FALSE; + vm_object_t resilient_media_object = VM_OBJECT_NULL; + vm_object_offset_t resilient_media_offset = (vm_object_offset_t)-1; real_vaddr = vaddr; -#endif /* DEVELOPMENT || DEBUG */ trace_real_vaddr = vaddr; vaddr = vm_map_trunc_page(vaddr, PAGE_MASK); @@ -3521,7 +3505,12 @@ vm_fault_internal( current_task()->faults++; original_fault_type = fault_type; + need_copy = FALSE; if (fault_type & VM_PROT_WRITE) { + need_copy = TRUE; + } + + if (need_copy) { object_lock_type = OBJECT_LOCK_EXCLUSIVE; } else { object_lock_type = OBJECT_LOCK_SHARED; @@ -3554,7 +3543,20 @@ RetryFault: map = original_map; vm_map_lock_read(map); - kr = vm_map_lookup_locked(&map, vaddr, fault_type, + if (resilient_media_retry) { + /* + * If we have to insert a fake zero-filled page to hide + * a media failure to provide the real page, we need to + * resolve any pending copy-on-write on this mapping. + * VM_PROT_COPY tells vm_map_lookup_locked() to deal + * with that even if this is not a "write" fault. + */ + need_copy = TRUE; + object_lock_type = OBJECT_LOCK_EXCLUSIVE; + } + + kr = vm_map_lookup_locked(&map, vaddr, + (fault_type | (need_copy ? VM_PROT_COPY : 0)), object_lock_type, &version, &object, &offset, &prot, &wired, &fault_info, @@ -3571,12 +3573,49 @@ RetryFault: fault_info.mark_zf_absent = FALSE; fault_info.batch_pmap_op = FALSE; + if (resilient_media_retry) { + /* + * We're retrying this fault after having detected a media + * failure from a "resilient_media" mapping. + * Check that the mapping is still pointing at the object + * that just failed to provide a page. + */ + assert(resilient_media_object != VM_OBJECT_NULL); + assert(resilient_media_offset != (vm_object_offset_t)-1); + if (object != VM_OBJECT_NULL && + object == resilient_media_object && + offset == resilient_media_offset && + fault_info.resilient_media) { + /* + * This mapping still points at the same object + * and is still "resilient_media": proceed in + * "recovery-from-media-failure" mode, where we'll + * insert a zero-filled page in the top object. + */ +// printf("RESILIENT_MEDIA %s:%d recovering for object %p offset 0x%llx\n", __FUNCTION__, __LINE__, object, offset); + } else { + /* not recovering: reset state */ +// printf("RESILIENT_MEDIA %s:%d no recovery resilient %d object %p/%p offset 0x%llx/0x%llx\n", __FUNCTION__, __LINE__, fault_info.resilient_media, object, resilient_media_object, offset, resilient_media_offset); + resilient_media_retry = FALSE; + /* release our extra reference on failed object */ +// printf("FBDP %s:%d resilient_media_object %p deallocate\n", __FUNCTION__, __LINE__, resilient_media_object); + vm_object_deallocate(resilient_media_object); + resilient_media_object = VM_OBJECT_NULL; + resilient_media_offset = (vm_object_offset_t)-1; + } + } else { + assert(resilient_media_object == VM_OBJECT_NULL); + resilient_media_offset = (vm_object_offset_t)-1; + } + /* * If the page is wired, we must fault for the current protection * value, to avoid further faults. */ if (wired) { fault_type = prot | VM_PROT_WRITE; + } + if (wired || need_copy) { /* * since we're treating this fault as a 'write' * we must hold the top object lock exclusively @@ -3851,7 +3890,7 @@ reclaimed_from_pageout: if (VM_FAULT_NEED_CS_VALIDATION(map->pmap, m, m_object) || (physpage_p != NULL && (prot & VM_PROT_WRITE))) { -upgrade_for_validation: +upgrade_lock_and_retry: /* * We might need to validate this page * against its code signature, so we @@ -3901,7 +3940,58 @@ upgrade_for_validation: goto FastPmapEnter; } - if ((fault_type & VM_PROT_WRITE) == 0) { + if (!need_copy && + !fault_info.no_copy_on_read && + cur_object != object && + !cur_object->internal && + !cur_object->pager_trusted && + vm_protect_privileged_from_untrusted && + !((prot & VM_PROT_EXECUTE) && + cur_object->code_signed && + cs_process_enforcement(NULL)) && + current_proc_is_privileged()) { + /* + * We're faulting on a page in "object" and + * went down the shadow chain to "cur_object" + * to find out that "cur_object"'s pager + * is not "trusted", i.e. we can not trust it + * to always return the same contents. + * Since the target is a "privileged" process, + * let's treat this as a copy-on-read fault, as + * if it was a copy-on-write fault. + * Once "object" gets a copy of this page, it + * won't have to rely on "cur_object" to + * provide the contents again. + * + * This is done by setting "need_copy" and + * retrying the fault from the top with the + * appropriate locking. + * + * Special case: if the mapping is executable + * and the untrusted object is code-signed and + * the process is "cs_enforced", we do not + * copy-on-read because that would break + * code-signing enforcement expectations (an + * executable page must belong to a code-signed + * object) and we can rely on code-signing + * to re-validate the page if it gets evicted + * and paged back in. + */ +// printf("COPY-ON-READ %s:%d map %p va 0x%llx page %p object %p offset 0x%llx UNTRUSTED: need copy-on-read!\n", __FUNCTION__, __LINE__, map, (uint64_t)vaddr, m, VM_PAGE_OBJECT(m), m->vmp_offset); + vm_copied_on_read++; + need_copy = TRUE; + + vm_object_unlock(object); + vm_object_unlock(cur_object); + object_lock_type = OBJECT_LOCK_EXCLUSIVE; + vm_map_unlock_read(map); + if (real_map != map) { + vm_map_unlock(real_map); + } + goto RetryFault; + } + + if (!(fault_type & VM_PROT_WRITE) && !need_copy) { if (!pmap_has_prot_policy(prot)) { prot &= ~VM_PROT_WRITE; } else { @@ -3986,7 +4076,6 @@ FastPmapEnter: need_retry_ptr, &type_of_fault); } -#if DEVELOPMENT || DEBUG { int event_code = 0; @@ -4002,7 +4091,6 @@ FastPmapEnter: DTRACE_VM6(real_fault, vm_map_offset_t, real_vaddr, vm_map_offset_t, m->vmp_offset, int, event_code, int, caller_prot, int, type_of_fault, int, fault_info.user_tag); } -#endif if (kr == KERN_SUCCESS && physpage_p != NULL) { /* for vm_map_wire_and_extract() */ @@ -4111,7 +4199,7 @@ FastPmapEnter: if ((cur_object_lock_type == OBJECT_LOCK_SHARED) && VM_FAULT_NEED_CS_VALIDATION(NULL, m, m_object)) { - goto upgrade_for_validation; + goto upgrade_lock_and_retry; } /* @@ -4417,7 +4505,7 @@ FastPmapEnter: type_of_fault = my_fault_type; - VM_STAT_INCR(decompressions); + VM_STAT_DECOMPRESSIONS(); if (cur_object != object) { if (insert_cur_object) { @@ -4439,7 +4527,8 @@ FastPmapEnter: * that the pager doesn't have this page */ } - if (cur_object->shadow == VM_OBJECT_NULL) { + if (cur_object->shadow == VM_OBJECT_NULL || + resilient_media_retry) { /* * Zero fill fault. Page gets * inserted into the original object. @@ -4485,6 +4574,9 @@ FastPmapEnter: goto RetryFault; } } + if (!object->internal) { + panic("%s:%d should not zero-fill page at offset 0x%llx in external object %p", __FUNCTION__, __LINE__, (uint64_t)offset, object); + } m = vm_page_alloc(object, offset); m_object = NULL; @@ -4588,6 +4680,22 @@ handle_copy_delay: assert(object != kernel_object); assert(object != vm_submap_object); + if (resilient_media_retry) { + /* + * We could get here if we failed to get a free page + * to zero-fill and had to take the slow path again. + * Reset our "recovery-from-failed-media" state. + */ + assert(resilient_media_object != VM_OBJECT_NULL); + assert(resilient_media_offset != (vm_object_offset_t)-1); + /* release our extra reference on failed object */ +// printf("FBDP %s:%d resilient_media_object %p deallocate\n", __FUNCTION__, __LINE__, resilient_media_object); + vm_object_deallocate(resilient_media_object); + resilient_media_object = VM_OBJECT_NULL; + resilient_media_offset = (vm_object_offset_t)-1; + resilient_media_retry = FALSE; + } + /* * Make a reference to this object to * prevent its disposal while we are messing with @@ -4598,8 +4706,7 @@ handle_copy_delay: vm_object_reference_locked(object); vm_object_paging_begin(object); - XPR(XPR_VM_FAULT, "vm_fault -> vm_fault_page\n", 0, 0, 0, 0, 0); - + set_thread_pagein_error(cthread, 0); error_code = 0; result_page = VM_PAGE_NULL; @@ -4627,10 +4734,35 @@ handle_copy_delay: */ if (kr != VM_FAULT_SUCCESS && kr != VM_FAULT_SUCCESS_NO_VM_PAGE) { - /* - * we didn't succeed, lose the object reference immediately. - */ - vm_object_deallocate(object); + if (kr == VM_FAULT_MEMORY_ERROR && + fault_info.resilient_media) { + assertf(object->internal, "object %p", object); + /* + * This fault failed but the mapping was + * "media resilient", so we'll retry the fault in + * recovery mode to get a zero-filled page in the + * top object. + * Keep the reference on the failing object so + * that we can check that the mapping is still + * pointing to it when we retry the fault. + */ +// printf("RESILIENT_MEDIA %s:%d: object %p offset 0x%llx recover from media error 0x%x kr 0x%x top_page %p result_page %p\n", __FUNCTION__, __LINE__, object, offset, error_code, kr, top_page, result_page); + assert(!resilient_media_retry); /* no double retry */ + assert(resilient_media_object == VM_OBJECT_NULL); + assert(resilient_media_offset == (vm_object_offset_t)-1); + resilient_media_retry = TRUE; + resilient_media_object = object; + resilient_media_offset = offset; +// printf("FBDP %s:%d resilient_media_object %p offset 0x%llx kept reference\n", __FUNCTION__, __LINE__, resilient_media_object, resilient_mmedia_offset); + goto RetryFault; + } else { + /* + * we didn't succeed, lose the object reference + * immediately. + */ + vm_object_deallocate(object); + object = VM_OBJECT_NULL; /* no longer valid */ + } /* * See why we failed, and take corrective action. @@ -4848,11 +4980,47 @@ handle_copy_delay: object_locks_dropped = FALSE; } + if (!need_copy && + !fault_info.no_copy_on_read && + m != VM_PAGE_NULL && + VM_PAGE_OBJECT(m) != object && + !VM_PAGE_OBJECT(m)->pager_trusted && + vm_protect_privileged_from_untrusted && + !((prot & VM_PROT_EXECUTE) && + VM_PAGE_OBJECT(m)->code_signed && + cs_process_enforcement(NULL)) && + current_proc_is_privileged()) { + /* + * We found the page we want in an "untrusted" VM object + * down the shadow chain. Since the target is "privileged" + * we want to perform a copy-on-read of that page, so that the + * mapped object gets a stable copy and does not have to + * rely on the "untrusted" object to provide the same + * contents if the page gets reclaimed and has to be paged + * in again later on. + * + * Special case: if the mapping is executable and the untrusted + * object is code-signed and the process is "cs_enforced", we + * do not copy-on-read because that would break code-signing + * enforcement expectations (an executable page must belong + * to a code-signed object) and we can rely on code-signing + * to re-validate the page if it gets evicted and paged back in. + */ +// printf("COPY-ON-READ %s:%d map %p vaddr 0x%llx obj %p offset 0x%llx found page %p (obj %p offset 0x%llx) UNTRUSTED -> need copy-on-read\n", __FUNCTION__, __LINE__, map, (uint64_t)vaddr, object, offset, m, VM_PAGE_OBJECT(m), m->vmp_offset); + vm_copied_on_read++; + need_copy_on_read = TRUE; + need_copy = TRUE; + } else { + need_copy_on_read = FALSE; + } + /* * If we want to wire down this page, but no longer have * adequate permissions, we must start all over. + * If we decided to copy-on-read, we must also start all over. */ - if (wired && (fault_type != (prot | VM_PROT_WRITE))) { + if ((wired && (fault_type != (prot | VM_PROT_WRITE))) || + need_copy_on_read) { vm_map_unlock_read(map); if (real_map != map) { vm_map_unlock(real_map); @@ -4907,7 +5075,6 @@ handle_copy_delay: } assert(VM_PAGE_OBJECT(m) == m_object); -#if DEVELOPMENT || DEBUG { int event_code = 0; @@ -4923,7 +5090,6 @@ handle_copy_delay: DTRACE_VM6(real_fault, vm_map_offset_t, real_vaddr, vm_map_offset_t, m->vmp_offset, int, event_code, int, caller_prot, int, type_of_fault, int, fault_info.user_tag); } -#endif if (kr != KERN_SUCCESS) { /* abort this page fault */ vm_map_unlock_read(map); @@ -5094,6 +5260,18 @@ cleanup: done: thread_interrupt_level(interruptible_state); + if (resilient_media_object != VM_OBJECT_NULL) { + assert(resilient_media_retry); + assert(resilient_media_offset != (vm_object_offset_t)-1); + /* release extra reference on failed object */ +// printf("FBDP %s:%d resilient_media_object %p deallocate\n", __FUNCTION__, __LINE__, resilient_media_object); + vm_object_deallocate(resilient_media_object); + resilient_media_object = VM_OBJECT_NULL; + resilient_media_offset = (vm_object_offset_t)-1; + resilient_media_retry = FALSE; + } + assert(!resilient_media_retry); + /* * Only I/O throttle on faults which cause a pagein/swapin. */ @@ -5289,9 +5467,6 @@ vm_fault_unwire( vm_object_lock(object); vm_object_paging_begin(object); - XPR(XPR_VM_FAULT, - "vm_fault_unwire -> vm_fault_page\n", - 0, 0, 0, 0, 0); result_page = VM_PAGE_NULL; result = vm_fault_page( object, @@ -5744,7 +5919,6 @@ RetryDestinationFault:; } fault_info_dst.cluster_size = cluster_size; - XPR(XPR_VM_FAULT, "vm_fault_copy -> vm_fault_page\n", 0, 0, 0, 0, 0); dst_page = VM_PAGE_NULL; result = vm_fault_page(dst_object, vm_object_trunc_page(dst_offset), @@ -5839,9 +6013,6 @@ RetrySourceFault:; } fault_info_src.cluster_size = cluster_size; - XPR(XPR_VM_FAULT, - "vm_fault_copy(2) -> vm_fault_page\n", - 0, 0, 0, 0, 0); result_page = VM_PAGE_NULL; result = vm_fault_page( src_object, @@ -6516,7 +6687,7 @@ vm_record_rtfault(thread_t cthread, uint64_t fstart, vm_map_offset_t fault_vaddr * further user stack traversals, thus avoiding copyin()s and further * faults. */ - int btr = backtrace_thread_user(cthread, &bpc, 1U, &bfrs, &u64); + int btr = backtrace_thread_user(cthread, &bpc, 1U, &bfrs, &u64, NULL); if ((btr == 0) && (bfrs > 0)) { cfpc = bpc; diff --git a/osfmk/vm/vm_fault.h b/osfmk/vm/vm_fault.h index 973851680..8fe4c76d8 100644 --- a/osfmk/vm/vm_fault.h +++ b/osfmk/vm/vm_fault.h @@ -96,9 +96,13 @@ extern kern_return_t vm_fault( #endif int interruptible, pmap_t pmap, - vm_map_offset_t pmap_addr); + vm_map_offset_t pmap_addr) +#if XNU_KERNEL_PRIVATE +__XNU_INTERNAL(vm_fault) +#endif +; -extern void vm_pre_fault(vm_map_offset_t); +extern void vm_pre_fault(vm_map_offset_t, vm_prot_t); #ifdef MACH_KERNEL_PRIVATE diff --git a/osfmk/vm/vm_fourk_pager.c b/osfmk/vm/vm_fourk_pager.c index cdc379909..4a9e7a43e 100644 --- a/osfmk/vm/vm_fourk_pager.c +++ b/osfmk/vm/vm_fourk_pager.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2014 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2019 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -118,19 +118,19 @@ kern_return_t fourk_pager_last_unmap(memory_object_t mem_obj); * These routines are invoked by VM via the memory_object_*() interfaces. */ const struct memory_object_pager_ops fourk_pager_ops = { - fourk_pager_reference, - fourk_pager_deallocate, - fourk_pager_init, - fourk_pager_terminate, - fourk_pager_data_request, - fourk_pager_data_return, - fourk_pager_data_initialize, - fourk_pager_data_unlock, - fourk_pager_synchronize, - fourk_pager_map, - fourk_pager_last_unmap, - NULL, /* data_reclaim */ - "fourk_pager" + .memory_object_reference = fourk_pager_reference, + .memory_object_deallocate = fourk_pager_deallocate, + .memory_object_init = fourk_pager_init, + .memory_object_terminate = fourk_pager_terminate, + .memory_object_data_request = fourk_pager_data_request, + .memory_object_data_return = fourk_pager_data_return, + .memory_object_data_initialize = fourk_pager_data_initialize, + .memory_object_data_unlock = fourk_pager_data_unlock, + .memory_object_synchronize = fourk_pager_synchronize, + .memory_object_map = fourk_pager_map, + .memory_object_last_unmap = fourk_pager_last_unmap, + .memory_object_data_reclaim = NULL, + .memory_object_pager_name = "fourk_pager" }; /* @@ -163,7 +163,7 @@ typedef struct fourk_pager { int fourk_pager_count = 0; /* number of pagers */ int fourk_pager_count_mapped = 0; /* number of unmapped pagers */ queue_head_t fourk_pager_queue; -decl_lck_mtx_data(, fourk_pager_lock) +decl_lck_mtx_data(, fourk_pager_lock); /* * Maximum number of unmapped pagers we're willing to keep around. @@ -759,6 +759,8 @@ fourk_pager_create(void) &control); assert(kr == KERN_SUCCESS); + memory_object_mark_trusted(control); + lck_mtx_lock(&fourk_pager_lock); /* the new pager is now ready to be used */ pager->is_ready = TRUE; @@ -892,23 +894,8 @@ fourk_pager_data_request( dst_pnum = (ppnum_t) upl_phys_page(upl_pl, (int)(cur_offset / PAGE_SIZE)); assert(dst_pnum != 0); -#if __x86_64__ - dst_vaddr = (vm_map_offset_t) - PHYSMAP_PTOV((pmap_paddr_t)dst_pnum << PAGE_SHIFT); -#elif __arm__ || __arm64__ dst_vaddr = (vm_map_offset_t) phystokv((pmap_paddr_t)dst_pnum << PAGE_SHIFT); -#else - kr = pmap_enter(kernel_pmap, - dst_vaddr, - dst_pnum, - VM_PROT_READ | VM_PROT_WRITE, - VM_PROT_NONE, - 0, - TRUE); - - assert(kr == KERN_SUCCESS); -#endif /* retrieve appropriate data for each 4K-page in this page */ if (PAGE_SHIFT == FOURK_PAGE_SHIFT && @@ -1084,29 +1071,9 @@ retry_src_fault: vm_page_unlock_queues(); } -#if __x86_64__ - src_vaddr = (vm_map_offset_t) - PHYSMAP_PTOV((pmap_paddr_t)VM_PAGE_GET_PHYS_PAGE(src_page) - << PAGE_SHIFT); -#elif __arm__ || __arm64__ src_vaddr = (vm_map_offset_t) phystokv((pmap_paddr_t)VM_PAGE_GET_PHYS_PAGE(src_page) << PAGE_SHIFT); -#else - /* - * Establish an explicit mapping of the source - * physical page. - */ - kr = pmap_enter(kernel_pmap, - src_vaddr, - VM_PAGE_GET_PHYS_PAGE(src_page), - VM_PROT_READ, - VM_PROT_NONE, - 0, - TRUE); - - assert(kr == KERN_SUCCESS); -#endif /* * Validate the 4K page we want from diff --git a/osfmk/vm/vm_init.c b/osfmk/vm/vm_init.c index 2c2454821..e20fd75a0 100644 --- a/osfmk/vm/vm_init.c +++ b/osfmk/vm/vm_init.c @@ -93,6 +93,7 @@ boolean_t vm_kernel_ready = FALSE; boolean_t kmem_ready = FALSE; boolean_t kmem_alloc_ready = FALSE; boolean_t zlog_ready = FALSE; +boolean_t iokit_iomd_setownership_enabled = TRUE; vm_offset_t kmapoff_kaddr; unsigned int kmapoff_pgcnt; @@ -180,7 +181,7 @@ vm_mem_bootstrap(void) zsize += zsize >> 1; #endif /* __LP64__ */ -#if defined(__x86_64__) +#if !CONFIG_EMBEDDED /* * The max_zonemap_size was based on physical memory and might make the * end of the zone go beyond what vm_page_[un]pack_ptr() can handle. @@ -249,6 +250,13 @@ vm_mem_bootstrap(void) zcache_bootstrap(); #endif vm_rtfault_record_init(); + + PE_parse_boot_argn("iokit_iomd_setownership_enabled", &iokit_iomd_setownership_enabled, sizeof(iokit_iomd_setownership_enabled)); + if (!iokit_iomd_setownership_enabled) { + kprintf("IOKit IOMD setownership DISABLED\n"); + } else { + kprintf("IOKit IOMD setownership ENABLED\n"); + } } void diff --git a/osfmk/vm/vm_init.h b/osfmk/vm/vm_init.h index a0ba80a9b..86b1c0128 100644 --- a/osfmk/vm/vm_init.h +++ b/osfmk/vm/vm_init.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2019 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -34,6 +34,6 @@ extern void vm_mem_bootstrap(void); extern void vm_mem_init(void); -extern void vm_map_steal_memory(void);; +extern void vm_map_steal_memory(void); #endif /* VM_INIT_H */ diff --git a/osfmk/vm/vm_kern.c b/osfmk/vm/vm_kern.c index 29f44cd6d..d20642916 100644 --- a/osfmk/vm/vm_kern.c +++ b/osfmk/vm/vm_kern.c @@ -81,6 +81,7 @@ #include #include +#include #include #include @@ -89,8 +90,8 @@ * Variables exported by this module. */ -vm_map_t kernel_map; -vm_map_t kernel_pageable_map; +SECURITY_READ_ONLY_LATE(vm_map_t) kernel_map; +vm_map_t kernel_pageable_map; extern boolean_t vm_kernel_ready; @@ -370,8 +371,6 @@ kernel_memory_allocate( if (!(flags & (KMA_VAONLY | KMA_PAGEABLE))) { for (i = 0; i < wired_page_count; i++) { - uint64_t unavailable; - for (;;) { if (flags & KMA_LOMEM) { mem = vm_page_grablo(); @@ -391,8 +390,11 @@ kernel_memory_allocate( kr = KERN_RESOURCE_SHORTAGE; goto out; } - unavailable = (vm_page_wire_count + vm_page_free_target) * PAGE_SIZE; + /* VM privileged threads should have waited in vm_page_grab() and not get here. */ + assert(!(current_thread()->options & TH_OPT_VMPRIV)); + + uint64_t unavailable = (vm_page_wire_count + vm_page_free_target) * PAGE_SIZE; if (unavailable > max_mem || map_size > (max_mem - unavailable)) { kr = KERN_RESOURCE_SHORTAGE; goto out; diff --git a/osfmk/vm/vm_kern.h b/osfmk/vm/vm_kern.h index 7d78e27cd..9d25f2777 100644 --- a/osfmk/vm/vm_kern.h +++ b/osfmk/vm/vm_kern.h @@ -108,7 +108,7 @@ extern kern_return_t kmem_alloc( vm_map_t map, vm_offset_t *addrp, vm_size_t size, - vm_tag_t tag); + vm_tag_t tag) __XNU_INTERNAL(kmem_alloc); extern kern_return_t kmem_alloc_contig( vm_map_t map, @@ -131,7 +131,7 @@ extern kern_return_t kmem_alloc_pageable( vm_map_t map, vm_offset_t *addrp, vm_size_t size, - vm_tag_t tag); + vm_tag_t tag) __XNU_INTERNAL(kmem_alloc_pageable); extern kern_return_t kmem_alloc_aligned( vm_map_t map, @@ -166,7 +166,7 @@ extern kern_return_t kmem_alloc_kobject( vm_map_t map, vm_offset_t *addrp, vm_size_t size, - vm_tag_t tag); + vm_tag_t tag) __XNU_INTERNAL(kmem_alloc_kobject); extern kern_return_t kernel_memory_populate( vm_map_t map, @@ -434,7 +434,12 @@ extern vm_map_t ipc_kernel_map; #ifdef KERNEL __BEGIN_DECLS +#if MACH_KERNEL_PRIVATE +extern vm_offset_t vm_kernel_addrhash(vm_offset_t addr) +__XNU_INTERNAL(vm_kernel_addrhash); +#else extern vm_offset_t vm_kernel_addrhash(vm_offset_t addr); +#endif __END_DECLS extern void vm_kernel_addrhide( diff --git a/osfmk/vm/vm_map.c b/osfmk/vm/vm_map.c index bc5c093d2..031cb8298 100644 --- a/osfmk/vm/vm_map.c +++ b/osfmk/vm/vm_map.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2012 Apple Inc. All rights reserved. + * Copyright (c) 2000-2019 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -102,7 +102,6 @@ #include #include #include -#include #include #include @@ -141,6 +140,8 @@ int vm_map_debug_fourk = 0; SECURITY_READ_ONLY_LATE(int) vm_map_executable_immutable = 1; int vm_map_executable_immutable_verbose = 0; +os_refgrp_decl(static, map_refgrp, "vm_map", NULL); + extern u_int32_t random(void); /* from */ /* Internal prototypes */ @@ -397,6 +398,7 @@ boolean_t _vmec_reserved = (NEW)->from_reserved_zone; \ (NEW)->vme_resilient_codesign = FALSE; \ (NEW)->vme_resilient_media = FALSE; \ (NEW)->vme_atomic = FALSE; \ + (NEW)->vme_no_copy_on_read = FALSE; \ MACRO_END #define vm_map_entry_copy_full(NEW, OLD) \ @@ -406,6 +408,43 @@ boolean_t _vmecf_reserved = (NEW)->from_reserved_zone; \ (NEW)->from_reserved_zone = _vmecf_reserved; \ MACRO_END +/* + * Normal lock_read_to_write() returns FALSE/0 on failure. + * These functions evaluate to zero on success and non-zero value on failure. + */ +__attribute__((always_inline)) +int +vm_map_lock_read_to_write(vm_map_t map) +{ + if (lck_rw_lock_shared_to_exclusive(&(map)->lock)) { + DTRACE_VM(vm_map_lock_upgrade); + return 0; + } + return 1; +} + +__attribute__((always_inline)) +boolean_t +vm_map_try_lock(vm_map_t map) +{ + if (lck_rw_try_lock_exclusive(&(map)->lock)) { + DTRACE_VM(vm_map_lock_w); + return TRUE; + } + return FALSE; +} + +__attribute__((always_inline)) +boolean_t +vm_map_try_lock_read(vm_map_t map) +{ + if (lck_rw_try_lock_shared(&(map)->lock)) { + DTRACE_VM(vm_map_lock_r); + return TRUE; + } + return FALSE; +} + /* * Decide if we want to allow processes to execute from their data or stack areas. * override_nx() returns true if we do. Data/stack execution can be enabled independently @@ -640,9 +679,6 @@ vm_map_apple_protected( * properly page-aligned) or a "fourk_pager", itself backed by a * vnode pager (if 4K-aligned but not page-aligned). */ -#else /* __arm64__ */ - assert(start_aligned == start); - assert(end_aligned == end); #endif /* __arm64__ */ map_addr = start_aligned; @@ -1129,10 +1165,10 @@ vm_map_create_options( result->size = 0; result->user_wire_limit = MACH_VM_MAX_ADDRESS; /* default limit is unlimited */ result->user_wire_size = 0; -#if __x86_64__ +#if !CONFIG_EMBEDDED result->vmmap_high_start = 0; -#endif /* __x86_64__ */ - result->map_refcnt = 1; +#endif + os_ref_init_count(&result->map_refcnt, &map_refgrp, 1); #if TASK_SWAPPER result->res_count = 1; result->sw_state = MAP_SW_IN; @@ -1230,7 +1266,7 @@ _vm_map_entry_create( #if MAP_ENTRY_CREATION_DEBUG entry->vme_creation_maphdr = map_header; backtrace(&entry->vme_creation_bt[0], - (sizeof(entry->vme_creation_bt) / sizeof(uintptr_t))); + (sizeof(entry->vme_creation_bt) / sizeof(uintptr_t)), NULL); #endif return entry; } @@ -1310,7 +1346,7 @@ vm_map_res_reference(vm_map_t map) { /* assert map is locked */ assert(map->res_count >= 0); - assert(map->map_refcnt >= map->res_count); + assert(os_ref_get_count(&map->map_refcnt) >= map->res_count); if (map->res_count == 0) { lck_mtx_unlock(&map->s_lock); vm_map_lock(map); @@ -1337,8 +1373,8 @@ vm_map_reference_swap(vm_map_t map) assert(map != VM_MAP_NULL); lck_mtx_lock(&map->s_lock); assert(map->res_count >= 0); - assert(map->map_refcnt >= map->res_count); - map->map_refcnt++; + assert(os_ref_get_count(&map->map_refcnt) >= map->res_count); + os_ref_retain_locked(&map->map_refcnt); vm_map_res_reference(map); lck_mtx_unlock(&map->s_lock); } @@ -1364,7 +1400,7 @@ vm_map_res_deallocate(vm_map_t map) vm_map_unlock(map); lck_mtx_lock(&map->s_lock); } - assert(map->map_refcnt >= map->res_count); + assert(os_ref_get_count(&map->map_refcnt) >= map->res_count); } #endif /* MACH_ASSERT && TASK_SWAPPER */ @@ -2093,6 +2129,10 @@ vm_memory_malloc_no_cow( { uint64_t alias_mask; + if (alias > 63) { + return FALSE; + } + alias_mask = 1ULL << alias; if (alias_mask & vm_memory_malloc_no_cow_mask) { return TRUE; @@ -2148,6 +2188,7 @@ vm_map_enter( boolean_t no_cache = ((flags & VM_FLAGS_NO_CACHE) != 0); boolean_t is_submap = vmk_flags.vmkf_submap; boolean_t permanent = vmk_flags.vmkf_permanent; + boolean_t no_copy_on_read = vmk_flags.vmkf_no_copy_on_read; boolean_t entry_for_jit = vmk_flags.vmkf_map_jit; boolean_t iokit_acct = vmk_flags.vmkf_iokit_acct; boolean_t resilient_codesign = ((flags & VM_FLAGS_RESILIENT_CODESIGN) != 0); @@ -2241,13 +2282,32 @@ vm_map_enter( } } - if (resilient_codesign || resilient_media) { + if (resilient_codesign) { + assert(!is_submap); if ((cur_protection & (VM_PROT_WRITE | VM_PROT_EXECUTE)) || (max_protection & (VM_PROT_WRITE | VM_PROT_EXECUTE))) { return KERN_PROTECTION_FAILURE; } } + if (resilient_media) { + assert(!is_submap); +// assert(!needs_copy); + if (object != VM_OBJECT_NULL && + !object->internal) { + /* + * This mapping is directly backed by an external + * memory manager (e.g. a vnode pager for a file): + * we would not have any safe place to inject + * a zero-filled page if an actual page is not + * available, without possibly impacting the actual + * contents of the mapped object (e.g. the file), + * so we can't provide any media resiliency here. + */ + return KERN_INVALID_ARGUMENT; + } + } + if (is_submap) { if (purgable) { /* submaps can not be purgeable */ @@ -2285,7 +2345,15 @@ vm_map_enter( #endif /* __arm__ */ effective_max_offset = 0x00000000FFFFF000ULL; } else { +#if !defined(CONFIG_EMBEDDED) + if (__improbable(vmk_flags.vmkf_32bit_map_va)) { + effective_max_offset = MIN(map->max_offset, 0x00000000FFFFF000ULL); + } else { + effective_max_offset = map->max_offset; + } +#else effective_max_offset = map->max_offset; +#endif } if (size == 0 || @@ -2392,13 +2460,13 @@ StartAgain:; } start = *address; } -#if __x86_64__ +#if !CONFIG_EMBEDDED else if ((start == 0 || start == vm_map_min(map)) && !map->disable_vmentry_reuse && map->vmmap_high_start != 0) { start = map->vmmap_high_start; } -#endif /* __x86_64__ */ +#endif /* @@ -2815,6 +2883,7 @@ StartAgain:; (!entry->vme_resilient_codesign) && (!entry->vme_resilient_media) && (!entry->vme_atomic) && + (entry->vme_no_copy_on_read == no_copy_on_read) && ((entry->vme_end - entry->vme_start) + size <= (user_alias == VM_MEMORY_REALLOC ? @@ -2888,6 +2957,7 @@ StartAgain:; 0, no_cache, permanent, + no_copy_on_read, superpage_size, clear_map_aligned, is_submap, @@ -2903,8 +2973,8 @@ StartAgain:; } if (resilient_media && - !((cur_protection | max_protection) & - (VM_PROT_WRITE | VM_PROT_EXECUTE))) { + (object == VM_OBJECT_NULL || + object->internal)) { new_entry->vme_resilient_media = TRUE; } @@ -2955,13 +3025,13 @@ StartAgain:; assert(!new_entry->iokit_acct); submap = (vm_map_t) object; submap_is_64bit = vm_map_is_64bit(submap); - use_pmap = (user_alias == VM_MEMORY_SHARED_PMAP); + use_pmap = vmk_flags.vmkf_nested_pmap; #ifndef NO_NESTED_PMAP if (use_pmap && submap->pmap == NULL) { ledger_t ledger = map->pmap->ledger; /* we need a sub pmap to nest... */ - submap->pmap = pmap_create(ledger, 0, - submap_is_64bit); + submap->pmap = pmap_create_options(ledger, 0, + submap_is_64bit ? PMAP_CREATE_64BIT : 0); if (submap->pmap == NULL) { /* let's proceed without nesting... */ } @@ -3264,6 +3334,7 @@ vm_map_enter_fourk( boolean_t no_cache = ((flags & VM_FLAGS_NO_CACHE) != 0); boolean_t is_submap = vmk_flags.vmkf_submap; boolean_t permanent = vmk_flags.vmkf_permanent; + boolean_t no_copy_on_read = vmk_flags.vmkf_permanent; boolean_t entry_for_jit = vmk_flags.vmkf_map_jit; // boolean_t iokit_acct = vmk_flags.vmkf_iokit_acct; unsigned int superpage_size = ((flags & VM_FLAGS_SUPERPAGE_MASK) >> VM_FLAGS_SUPERPAGE_SHIFT); @@ -3532,7 +3603,8 @@ vm_map_enter_fourk( copy_object, 0, /* offset */ FALSE, /* needs_copy */ - FALSE, FALSE, + FALSE, + FALSE, cur_protection, max_protection, VM_BEHAVIOR_DEFAULT, ((entry_for_jit) @@ -3541,6 +3613,7 @@ vm_map_enter_fourk( 0, no_cache, permanent, + no_copy_on_read, superpage_size, clear_map_aligned, is_submap, @@ -5194,7 +5267,7 @@ vm_map_clip_unnest( pmap_unnest(map->pmap, entry->vme_start, entry->vme_end - entry->vme_start); - if ((map->mapped_in_other_pmaps) && (map->map_refcnt)) { + if ((map->mapped_in_other_pmaps) && os_ref_get_count(&map->map_refcnt) != 0) { /* clean up parent map/maps */ vm_map_submap_pmap_clean( map, entry->vme_start, @@ -5599,8 +5672,8 @@ vm_map_submap( /* nest if platform code will allow */ if (submap->pmap == NULL) { ledger_t ledger = map->pmap->ledger; - submap->pmap = pmap_create(ledger, - (vm_map_size_t) 0, FALSE); + submap->pmap = pmap_create_options(ledger, + (vm_map_size_t) 0, 0); if (submap->pmap == PMAP_NULL) { vm_map_unlock(map); return KERN_NO_SPACE; @@ -5652,10 +5725,6 @@ vm_map_protect( int pmap_options = 0; kern_return_t kr; - XPR(XPR_VM_MAP, - "vm_map_protect, 0x%X start 0x%X end 0x%X, new 0x%X %d", - map, start, end, new_prot, set_max); - if (new_prot & VM_PROT_COPY) { vm_map_offset_t new_start; vm_prot_t cur_prot, max_prot; @@ -7349,8 +7418,9 @@ vm_map_submap_pmap_clean( VME_SUBMAP(entry), VME_OFFSET(entry)); } else { - if ((map->mapped_in_other_pmaps) && (map->map_refcnt) - && (VME_OBJECT(entry) != NULL)) { + if (map->mapped_in_other_pmaps && + os_ref_get_count(&map->map_refcnt) != 0 && + VME_OBJECT(entry) != NULL) { vm_object_pmap_protect_options( VME_OBJECT(entry), (VME_OFFSET(entry) + @@ -7385,8 +7455,9 @@ vm_map_submap_pmap_clean( VME_SUBMAP(entry), VME_OFFSET(entry)); } else { - if ((map->mapped_in_other_pmaps) && (map->map_refcnt) - && (VME_OBJECT(entry) != NULL)) { + if (map->mapped_in_other_pmaps && + os_ref_get_count(&map->map_refcnt) != 0 && + VME_OBJECT(entry) != NULL) { vm_object_pmap_protect_options( VME_OBJECT(entry), VME_OFFSET(entry), @@ -7479,16 +7550,23 @@ vm_map_guard_exception( unsigned int guard_type = GUARD_TYPE_VIRT_MEMORY; unsigned int target = 0; /* should we pass in pid associated with map? */ mach_exception_data_type_t subcode = (uint64_t)gap_start; + boolean_t fatal = FALSE; + + task_t task = current_task(); /* Can't deliver exceptions to kernel task */ - if (current_task() == kernel_task) { + if (task == kernel_task) { return; } EXC_GUARD_ENCODE_TYPE(code, guard_type); EXC_GUARD_ENCODE_FLAVOR(code, reason); EXC_GUARD_ENCODE_TARGET(code, target); - thread_guard_violation(current_thread(), code, subcode); + + if (task->task_exc_guard & TASK_EXC_GUARD_VM_FATAL) { + fatal = TRUE; + } + thread_guard_violation(current_thread(), code, subcode, fatal); } /* @@ -7518,8 +7596,8 @@ vm_map_delete( unsigned int last_timestamp = ~0; /* unlikely value */ int interruptible; vm_map_offset_t gap_start; - vm_map_offset_t save_start = start; - vm_map_offset_t save_end = end; + __unused vm_map_offset_t save_start = start; + __unused vm_map_offset_t save_end = end; const vm_map_offset_t FIND_GAP = 1; /* a not page aligned value */ const vm_map_offset_t GAPS_OK = 2; /* a different not page aligned value */ @@ -7609,7 +7687,7 @@ vm_map_delete( SAVE_HINT_MAP_WRITE(map, entry->vme_prev); } else { if (map->pmap == kernel_pmap && - map->map_refcnt != 0) { + os_ref_get_count(&map->map_refcnt) != 0) { panic("vm_map_delete(%p,0x%llx,0x%llx): " "no map entry at 0x%llx\n", map, @@ -8041,7 +8119,8 @@ vm_map_delete( entry->vme_end - entry->vme_start, pmap_flags); #endif /* NO_NESTED_PMAP */ - if ((map->mapped_in_other_pmaps) && (map->map_refcnt)) { + if (map->mapped_in_other_pmaps && + os_ref_get_count(&map->map_refcnt) != 0) { /* clean up parent map/maps */ vm_map_submap_pmap_clean( map, entry->vme_start, @@ -8058,7 +8137,8 @@ vm_map_delete( } else if (VME_OBJECT(entry) != kernel_object && VME_OBJECT(entry) != compressor_object) { object = VME_OBJECT(entry); - if ((map->mapped_in_other_pmaps) && (map->map_refcnt)) { + if (map->mapped_in_other_pmaps && + os_ref_get_count(&map->map_refcnt) != 0) { vm_object_pmap_protect_options( object, VME_OFFSET(entry), entry->vme_end - entry->vme_start, @@ -8113,7 +8193,7 @@ vm_map_delete( next = entry->vme_next; if (map->pmap == kernel_pmap && - map->map_refcnt != 0 && + os_ref_get_count(&map->map_refcnt) != 0 && entry->vme_end < end && (next == vm_map_to_entry(map) || next->vme_start != entry->vme_end)) { @@ -8229,18 +8309,6 @@ vm_map_delete( vm_map_offset_t, save_start, vm_map_offset_t, save_end); if (!(flags & VM_MAP_REMOVE_GAPS_OK)) { -#if defined(DEVELOPMENT) || defined(DEBUG) - /* log just once if not checking, otherwise log each one */ - if (!map->warned_delete_gap || - (task_exc_guard_default & TASK_EXC_GUARD_VM_ALL) != 0) { - printf("vm_map_delete: map %p [%p...%p] nothing at %p\n", - (void *)map, (void *)save_start, (void *)save_end, - (void *)gap_start); - if (!map->warned_delete_gap) { - map->warned_delete_gap = 1; - } - } -#endif vm_map_guard_exception(gap_start, kGUARD_EXC_DEALLOC_GAP); } } @@ -8931,7 +8999,7 @@ start_overwrite: entry->is_sub_map = FALSE; vm_map_deallocate( VME_SUBMAP(entry)); - VME_OBJECT_SET(entry, NULL); + VME_OBJECT_SET(entry, VM_OBJECT_NULL); VME_OFFSET_SET(entry, 0); entry->is_shared = FALSE; entry->needs_copy = FALSE; @@ -9611,7 +9679,7 @@ vm_map_copy_overwrite_unaligned( } dst_object = vm_object_allocate((vm_map_size_t) entry->vme_end - entry->vme_start); - VME_OBJECT(entry) = dst_object; + VME_OBJECT_SET(entry, dst_object); VME_OFFSET_SET(entry, 0); assert(entry->use_pmap); vm_map_lock_write_to_read(dst_map); @@ -10735,6 +10803,7 @@ StartAgain:; while (entry != vm_map_copy_to_entry(copy)) { new = vm_map_copy_entry_create(copy, !copy->cpy_hdr.entries_pageable); vm_map_entry_copy_full(new, entry); + new->vme_no_copy_on_read = FALSE; assert(!new->iokit_acct); if (new->is_sub_map) { /* clr address space specifics */ @@ -11080,8 +11149,6 @@ vm_map_copyin_internal( src_destroy, copy_result); } - XPR(XPR_VM_MAP, "vm_map_copyin_common map 0x%x addr 0x%x len 0x%x dest %d\n", src_map, src_addr, len, src_destroy, 0); - /* * Allocate a header element for the list. * @@ -11342,13 +11409,10 @@ vm_map_copyin_internal( RestartCopy: - XPR(XPR_VM_MAP, "vm_map_copyin_common src_obj 0x%x ent 0x%x obj 0x%x was_wired %d\n", - src_object, new_entry, VME_OBJECT(new_entry), - was_wired, 0); if ((src_object == VM_OBJECT_NULL || (!was_wired && !map_share && !tmp_entry->is_shared)) && vm_object_copy_quickly( - &VME_OBJECT(new_entry), + VME_OBJECT_PTR(new_entry), src_offset, src_size, &src_needs_copy, @@ -11425,7 +11489,7 @@ CopySlowly: src_offset, src_size, THREAD_UNINT, - &VME_OBJECT(new_entry)); + VME_OBJECT_PTR(new_entry)); VME_OFFSET_SET(new_entry, 0); new_entry->needs_copy = FALSE; } else if (src_object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC && @@ -11455,7 +11519,7 @@ CopySlowly: result = vm_object_copy_strategically(src_object, src_offset, src_size, - &VME_OBJECT(new_entry), + VME_OBJECT_PTR(new_entry), &new_offset, &new_entry_needs_copy); if (new_offset != VME_OFFSET(new_entry)) { @@ -12368,7 +12432,12 @@ vm_map_fork( #error Unknown architecture. #endif - new_pmap = pmap_create(ledger, (vm_map_size_t) 0, pmap_is64bit); + unsigned int pmap_flags = 0; + pmap_flags |= pmap_is64bit ? PMAP_CREATE_64BIT : 0; +#if defined(HAS_APPLE_PAC) + pmap_flags |= old_map->pmap->disable_jop ? PMAP_CREATE_DISABLE_JOP : 0; +#endif + new_pmap = pmap_create_options(ledger, (vm_map_size_t) 0, pmap_flags); vm_map_reference_swap(old_map); vm_map_lock(old_map); @@ -12473,7 +12542,7 @@ vm_map_fork( } if (!vm_object_copy_quickly( - &VME_OBJECT(new_entry), + VME_OBJECT_PTR(new_entry), VME_OFFSET(old_entry), (old_entry->vme_end - old_entry->vme_start), @@ -12711,6 +12780,7 @@ submap_recurse: vm_map_entry_t submap_entry; vm_prot_t subentry_protection; vm_prot_t subentry_max_protection; + boolean_t subentry_no_copy_on_read; boolean_t mapped_needs_copy = FALSE; local_vaddr = vaddr; @@ -12920,6 +12990,7 @@ RetrySubMap: subentry_protection = submap_entry->protection; subentry_max_protection = submap_entry->max_protection; + subentry_no_copy_on_read = submap_entry->vme_no_copy_on_read; vm_map_unlock(map); submap_entry = NULL; /* not valid after map unlock */ @@ -12996,6 +13067,8 @@ RetrySubMap: entry->protection |= subentry_protection; } entry->max_protection |= subentry_max_protection; + /* propagate no_copy_on_read */ + entry->vme_no_copy_on_read = subentry_no_copy_on_read; if ((entry->protection & VM_PROT_WRITE) && (entry->protection & VM_PROT_EXECUTE) && @@ -13209,6 +13282,8 @@ protection_failure: #endif /* CONFIG_PMAP_CS */ fault_info->mark_zf_absent = FALSE; fault_info->batch_pmap_op = FALSE; + fault_info->resilient_media = entry->vme_resilient_media; + fault_info->no_copy_on_read = entry->vme_no_copy_on_read; } /* @@ -13348,6 +13423,9 @@ vm_map_region_recurse_64( if (original_count >= VM_REGION_SUBMAP_INFO_V1_COUNT_64) { *count = VM_REGION_SUBMAP_INFO_V1_COUNT_64; } + if (original_count >= VM_REGION_SUBMAP_INFO_V2_COUNT_64) { + *count = VM_REGION_SUBMAP_INFO_V2_COUNT_64; + } } user_address = *address; @@ -13534,23 +13612,19 @@ recurse_again: next_entry == NULL && /* & there are no more regions */ /* & we haven't already provided our fake region: */ user_address <= vm_map_last_entry(map)->vme_end) { - ledger_amount_t nonvol, nonvol_compressed; + ledger_amount_t ledger_resident, ledger_compressed; + /* * Add a fake memory region to account for - * purgeable memory that counts towards this - * task's memory footprint, i.e. the resident - * compressed pages of non-volatile objects - * owned by that task. + * purgeable and/or ledger-tagged memory that + * counts towards this task's memory footprint, + * i.e. the resident/compressed pages of non-volatile + * objects owned by that task. */ - ledger_get_balance( - map->pmap->ledger, - task_ledgers.purgeable_nonvolatile, - &nonvol); - ledger_get_balance( - map->pmap->ledger, - task_ledgers.purgeable_nonvolatile_compressed, - &nonvol_compressed); - if (nonvol + nonvol_compressed == 0) { + task_ledgers_footprint(map->pmap->ledger, + &ledger_resident, + &ledger_compressed); + if (ledger_resident + ledger_compressed == 0) { /* no purgeable memory usage to report */ return KERN_INVALID_ADDRESS; } @@ -13561,9 +13635,9 @@ recurse_again: submap_info->inheritance = VM_INHERIT_DEFAULT; submap_info->offset = 0; submap_info->user_tag = -1; - submap_info->pages_resident = (unsigned int) (nonvol / PAGE_SIZE); + submap_info->pages_resident = (unsigned int) (ledger_resident / PAGE_SIZE); submap_info->pages_shared_now_private = 0; - submap_info->pages_swapped_out = (unsigned int) (nonvol_compressed / PAGE_SIZE); + submap_info->pages_swapped_out = (unsigned int) (ledger_compressed / PAGE_SIZE); submap_info->pages_dirtied = submap_info->pages_resident; submap_info->ref_count = 1; submap_info->shadow_depth = 0; @@ -13590,7 +13664,7 @@ recurse_again: short_info->ref_count = 1; } *nesting_depth = 0; - *size = (vm_map_size_t) (nonvol + nonvol_compressed); + *size = (vm_map_size_t) (ledger_resident + ledger_compressed); // *address = user_address; *address = vm_map_last_entry(map)->vme_end; return KERN_SUCCESS; @@ -13706,7 +13780,7 @@ recurse_again: } else { extended.share_mode = SM_PRIVATE; } - extended.ref_count = VME_SUBMAP(curr_entry)->map_refcnt; + extended.ref_count = os_ref_get_count(&VME_SUBMAP(curr_entry)->map_refcnt); } } @@ -13724,6 +13798,9 @@ recurse_again: if (original_count >= VM_REGION_SUBMAP_INFO_V1_COUNT_64) { submap_info->pages_reusable = extended.pages_reusable; } + if (original_count >= VM_REGION_SUBMAP_INFO_V2_COUNT_64) { + submap_info->object_id_full = (vm_object_id_t) (VME_OBJECT(curr_entry) != NULL) ? VM_KERNEL_ADDRPERM(VME_OBJECT(curr_entry)) : 0ULL; + } } else { short_info->external_pager = extended.external_pager; short_info->shadow_depth = extended.shadow_depth; @@ -14039,7 +14116,7 @@ vm_map_region_top_walk( OBJ_RESIDENT_COUNT(obj, entry_size); } else { if (ref_count == 1 || - (ref_count == 2 && !(obj->pager_trusted) && !(obj->internal))) { + (ref_count == 2 && obj->named)) { top->share_mode = SM_PRIVATE; top->private_pages_resident = OBJ_RESIDENT_COUNT(obj, @@ -14235,7 +14312,7 @@ collect_object_info: shadow_object = obj->shadow; shadow_depth = 0; - if (!(obj->pager_trusted) && !(obj->internal)) { + if (!(obj->internal)) { extended->external_pager = 1; } @@ -14246,8 +14323,7 @@ collect_object_info: shadow_depth++) { vm_object_t next_shadow; - if (!(shadow_object->pager_trusted) && - !(shadow_object->internal)) { + if (!(shadow_object->internal)) { extended->external_pager = 1; } @@ -14342,7 +14418,7 @@ vm_map_region_look_for_page( while (TRUE) { - if (!(object->pager_trusted) && !(object->internal)) { + if (!(object->internal)) { extended->external_pager = 1; } @@ -14506,6 +14582,7 @@ vm_map_simplify_entry( this_entry->vme_resilient_codesign) && (prev_entry->vme_resilient_media == this_entry->vme_resilient_media) && + (prev_entry->vme_no_copy_on_read == this_entry->vme_no_copy_on_read) && (prev_entry->wired_count == this_entry->wired_count) && (prev_entry->user_wired_count == this_entry->user_wired_count) && @@ -14751,10 +14828,6 @@ vm_map_behavior_set( vm_map_entry_t entry; vm_map_entry_t temp_entry; - XPR(XPR_VM_MAP, - "vm_map_behavior_set, 0x%X start 0x%X end 0x%X behavior %d", - map, start, end, new_behavior, 0); - if (start > end || start < vm_map_min(map) || end > vm_map_max(map)) { @@ -14847,9 +14920,9 @@ vm_map_behavior_set( /* * Internals for madvise(MADV_WILLNEED) system call. * - * The present implementation is to do a read-ahead if the mapping corresponds - * to a mapped regular file. If it's an anonymous mapping, then we do nothing - * and basically ignore the "advice" (which we are always free to do). + * The implementation is to do:- + * a) read-ahead if the mapping corresponds to a mapped regular file + * b) or, fault in the pages (zero-fill, decompress etc) if it's an anonymous mapping */ @@ -14929,69 +15002,98 @@ vm_map_willneed( } /* - * If there's no read permission to this mapping, then just - * skip it. + * If the entry is a submap OR there's no read permission + * to this mapping, then just skip it. */ - if ((entry->protection & VM_PROT_READ) == 0) { + if ((entry->is_sub_map) || (entry->protection & VM_PROT_READ) == 0) { entry = entry->vme_next; start = entry->vme_start; continue; } - /* - * Find the file object backing this map entry. If there is - * none, then we simply ignore the "will need" advice for this - * entry and go on to the next one. - */ - if ((object = find_vnode_object(entry)) == VM_OBJECT_NULL) { - entry = entry->vme_next; - start = entry->vme_start; - continue; - } + object = VME_OBJECT(entry); - /* - * The data_request() could take a long time, so let's - * release the map lock to avoid blocking other threads. - */ - vm_map_unlock_read(map); + if (object == NULL || + (object && object->internal)) { + /* + * Memory range backed by anonymous memory. + */ + vm_size_t region_size = 0, effective_page_size = 0; + vm_map_offset_t addr = 0, effective_page_mask = 0; - vm_object_paging_begin(object); - pager = object->pager; - vm_object_unlock(object); + region_size = len; + addr = start; - /* - * Get the data from the object asynchronously. - * - * Note that memory_object_data_request() places limits on the - * amount of I/O it will do. Regardless of the len we - * specified, it won't do more than MAX_UPL_TRANSFER_BYTES and it - * silently truncates the len to that size. This isn't - * necessarily bad since madvise shouldn't really be used to - * page in unlimited amounts of data. Other Unix variants - * limit the willneed case as well. If this turns out to be an - * issue for developers, then we can always adjust the policy - * here and still be backwards compatible since this is all - * just "advice". - */ - kr = memory_object_data_request( - pager, - offset + object->paging_offset, - 0, /* ignored */ - VM_PROT_READ, - (memory_object_fault_info_t)&fault_info); + effective_page_mask = MAX(vm_map_page_mask(current_map()), PAGE_MASK); + effective_page_size = effective_page_mask + 1; - vm_object_lock(object); - vm_object_paging_end(object); - vm_object_unlock(object); + vm_map_unlock_read(map); - /* - * If we couldn't do the I/O for some reason, just give up on - * the madvise. We still return success to the user since - * madvise isn't supposed to fail when the advice can't be - * taken. - */ - if (kr != KERN_SUCCESS) { - return KERN_SUCCESS; + while (region_size) { + vm_pre_fault( + vm_map_trunc_page(addr, effective_page_mask), + VM_PROT_READ | VM_PROT_WRITE); + + region_size -= effective_page_size; + addr += effective_page_size; + } + } else { + /* + * Find the file object backing this map entry. If there is + * none, then we simply ignore the "will need" advice for this + * entry and go on to the next one. + */ + if ((object = find_vnode_object(entry)) == VM_OBJECT_NULL) { + entry = entry->vme_next; + start = entry->vme_start; + continue; + } + + vm_object_paging_begin(object); + pager = object->pager; + vm_object_unlock(object); + + /* + * The data_request() could take a long time, so let's + * release the map lock to avoid blocking other threads. + */ + vm_map_unlock_read(map); + + /* + * Get the data from the object asynchronously. + * + * Note that memory_object_data_request() places limits on the + * amount of I/O it will do. Regardless of the len we + * specified, it won't do more than MAX_UPL_TRANSFER_BYTES and it + * silently truncates the len to that size. This isn't + * necessarily bad since madvise shouldn't really be used to + * page in unlimited amounts of data. Other Unix variants + * limit the willneed case as well. If this turns out to be an + * issue for developers, then we can always adjust the policy + * here and still be backwards compatible since this is all + * just "advice". + */ + kr = memory_object_data_request( + pager, + offset + object->paging_offset, + 0, /* ignored */ + VM_PROT_READ, + (memory_object_fault_info_t)&fault_info); + + vm_object_lock(object); + vm_object_paging_end(object); + vm_object_unlock(object); + + /* + * If we couldn't do the I/O for some reason, just give up on + * the madvise. We still return success to the user since + * madvise isn't supposed to fail when the advice can't be + * taken. + */ + + if (kr != KERN_SUCCESS) { + return KERN_SUCCESS; + } } start += len; @@ -15480,6 +15582,7 @@ vm_map_entry_insert( unsigned wired_count, boolean_t no_cache, boolean_t permanent, + boolean_t no_copy_on_read, unsigned int superpage_size, boolean_t clear_map_aligned, boolean_t is_submap, @@ -15563,9 +15666,6 @@ vm_map_entry_insert( { new_entry->used_for_jit = TRUE; map->jit_entry_exists = TRUE; - - /* Tell the pmap that it supports JIT. */ - pmap_set_jit_entitled(map->pmap); } } else { new_entry->used_for_jit = FALSE; @@ -15575,6 +15675,7 @@ vm_map_entry_insert( new_entry->vme_resilient_codesign = FALSE; new_entry->vme_resilient_media = FALSE; new_entry->vme_atomic = FALSE; + new_entry->vme_no_copy_on_read = no_copy_on_read; /* * Insert the new entry into the list. @@ -15706,7 +15807,8 @@ vm_map_remap_extract( * This entry uses "IOKit accounting". */ } else if (object != VM_OBJECT_NULL && - object->purgable != VM_PURGABLE_DENY) { + (object->purgable != VM_PURGABLE_DENY || + object->vo_ledger_tag != VM_LEDGER_TAG_NONE)) { /* * Purgeable objects have their own accounting: * no pmap accounting for them. @@ -15852,16 +15954,20 @@ vm_map_remap_extract( */ RestartCopy: if (!copy) { - /* - * Cannot allow an entry describing a JIT - * region to be shared across address spaces. - */ - if (src_entry->used_for_jit == TRUE && !same_map) { + if (src_entry->used_for_jit == TRUE) { + if (same_map) { + } else { #if CONFIG_EMBEDDED - result = KERN_INVALID_ARGUMENT; - break; + /* + * Cannot allow an entry describing a JIT + * region to be shared across address spaces. + */ + result = KERN_INVALID_ARGUMENT; + break; #endif /* CONFIG_EMBEDDED */ + } } + src_entry->is_shared = TRUE; new_entry->is_shared = TRUE; if (!(new_entry->is_sub_map)) { @@ -15873,7 +15979,7 @@ RestartCopy: new_entry->needs_copy = TRUE; object = VM_OBJECT_NULL; } else if (src_entry->wired_count == 0 && - vm_object_copy_quickly(&VME_OBJECT(new_entry), + vm_object_copy_quickly(VME_OBJECT_PTR(new_entry), VME_OFFSET(new_entry), (new_entry->vme_end - new_entry->vme_start), @@ -15946,7 +16052,7 @@ RestartCopy: (new_entry->vme_end - new_entry->vme_start), THREAD_UNINT, - &VME_OBJECT(new_entry)); + VME_OBJECT_PTR(new_entry)); VME_OFFSET_SET(new_entry, 0); new_entry->needs_copy = FALSE; @@ -15959,7 +16065,7 @@ RestartCopy: offset, (new_entry->vme_end - new_entry->vme_start), - &VME_OBJECT(new_entry), + VME_OBJECT_PTR(new_entry), &new_offset, &new_entry_needs_copy); if (new_offset != VME_OFFSET(new_entry)) { @@ -16126,6 +16232,13 @@ vm_map_remap( return KERN_INVALID_ARGUMENT; } + if (flags & VM_FLAGS_RESILIENT_MEDIA) { + /* must be copy-on-write to be "media resilient" */ + if (!copy) { + return KERN_INVALID_ARGUMENT; + } + } + result = vm_map_remap_extract(src_map, memory_address, size, copy, &map_header, cur_protection, @@ -16165,6 +16278,12 @@ vm_map_remap( entry->vme_start += *address; entry->vme_end += *address; assert(!entry->map_aligned); + if ((flags & VM_FLAGS_RESILIENT_MEDIA) && + !entry->is_sub_map && + (VME_OBJECT(entry) == VM_OBJECT_NULL || + VME_OBJECT(entry)->internal)) { + entry->vme_resilient_media = TRUE; + } vm_map_store_entry_link(target_map, insp_entry, entry, vmk_flags); insp_entry = entry; @@ -16876,6 +16995,7 @@ vm_map_page_range_info_internal( vm_map_offset_t offset_in_page = 0, offset_in_object = 0, curr_offset_in_object = 0; vm_map_offset_t start = 0, end = 0, curr_s_offset = 0, curr_e_offset = 0; boolean_t do_region_footprint; + ledger_amount_t ledger_resident, ledger_compressed; switch (flavor) { case VM_PAGE_INFO_BASIC: @@ -16913,6 +17033,8 @@ vm_map_page_range_info_internal( vm_map_lock_read(map); + task_ledgers_footprint(map->pmap->ledger, &ledger_resident, &ledger_compressed); + for (curr_s_offset = start; curr_s_offset < end;) { /* * New lookup needs reset of these variables. @@ -16924,8 +17046,6 @@ vm_map_page_range_info_internal( if (do_region_footprint && curr_s_offset >= vm_map_last_entry(map)->vme_end) { - ledger_amount_t nonvol_compressed; - /* * Request for "footprint" info about a page beyond * the end of address space: this must be for @@ -16934,13 +17054,9 @@ vm_map_page_range_info_internal( * memory owned by this task. */ disposition = 0; - nonvol_compressed = 0; - ledger_get_balance( - map->pmap->ledger, - task_ledgers.purgeable_nonvolatile_compressed, - &nonvol_compressed); + if (curr_s_offset - vm_map_last_entry(map)->vme_end <= - (unsigned) nonvol_compressed) { + (unsigned) ledger_compressed) { /* * We haven't reported all the "non-volatile * compressed" pages yet, so report this fake @@ -17214,6 +17330,9 @@ vm_map_page_range_info_internal( } else { disposition |= VM_PAGE_QUERY_PAGE_EXTERNAL; } + if (pmap_disp & PMAP_QUERY_PAGE_REUSABLE) { + disposition |= VM_PAGE_QUERY_PAGE_REUSABLE; + } } else if (pmap_disp & PMAP_QUERY_PAGE_COMPRESSED) { assertf(map_entry->use_pmap, "offset 0x%llx map_entry %p", (uint64_t) curr_s_offset, map_entry); disposition |= VM_PAGE_QUERY_PAGE_PAGED_OUT; @@ -17344,6 +17463,9 @@ vm_map_page_range_info_internal( if (m->vmp_cs_nx) { disposition |= VM_PAGE_QUERY_PAGE_CS_NX; } + if (m->vmp_reusable || curr_object->all_reusable) { + disposition |= VM_PAGE_QUERY_PAGE_REUSABLE; + } } } @@ -17794,10 +17916,10 @@ vm_map_reference( lck_mtx_lock(&map->s_lock); #if TASK_SWAPPER assert(map->res_count > 0); - assert(map->map_refcnt >= map->res_count); + assert(os_ref_get_count(&map->map_refcnt) >= map->res_count); map->res_count++; #endif - map->map_refcnt++; + os_ref_retain_locked(&map->map_refcnt); lck_mtx_unlock(&map->s_lock); } @@ -17819,13 +17941,13 @@ vm_map_deallocate( } lck_mtx_lock(&map->s_lock); - ref = --map->map_refcnt; + ref = os_ref_release_locked(&map->map_refcnt); if (ref > 0) { vm_map_res_deallocate(map); lck_mtx_unlock(&map->s_lock); return; } - assert(map->map_refcnt == 0); + assert(os_ref_get_count(&map->map_refcnt) == 0); lck_mtx_unlock(&map->s_lock); #if TASK_SWAPPER @@ -17901,6 +18023,19 @@ vm_map_set_jumbo(vm_map_t map) #endif } +/* + * This map has a JIT entitlement + */ +void +vm_map_set_jit_entitled(vm_map_t map) +{ +#if defined (__arm64__) + pmap_set_jit_entitled(map->pmap); +#else /* arm64 */ + (void) map; +#endif +} + /* * Expand the maximum size of an existing map. */ @@ -18384,12 +18519,12 @@ extern unsigned int memorystatus_freeze_shared_mb_per_process_max; kern_return_t vm_map_freeze( - vm_map_t map, + task_t task, unsigned int *purgeable_count, unsigned int *wired_count, unsigned int *clean_count, unsigned int *dirty_count, - __unused unsigned int dirty_budget, + unsigned int dirty_budget, unsigned int *shared_count, int *freezer_error_code, boolean_t eval_only) @@ -18408,6 +18543,8 @@ vm_map_freeze( * block any page faults or lookups while we are * in the middle of freezing this vm map. */ + vm_map_t map = task->map; + vm_map_lock(map); assert(VM_CONFIG_COMPRESSOR_IS_PRESENT); @@ -18459,6 +18596,30 @@ again: if (src_object->internal == TRUE) { if (VM_CONFIG_FREEZER_SWAP_IS_ACTIVE) { + /* + * We skip purgeable objects during evaluation phase only. + * If we decide to freeze this process, we'll explicitly + * purge these objects before we go around again with + * 'evaluation_phase' set to FALSE. + */ + + if ((src_object->purgable == VM_PURGABLE_EMPTY) || (src_object->purgable == VM_PURGABLE_VOLATILE)) { + /* + * We want to purge objects that may not belong to this task but are mapped + * in this task alone. Since we already purged this task's purgeable memory + * at the end of a successful evaluation phase, we want to avoid doing no-op calls + * on this task's purgeable objects. Hence the check for only volatile objects. + */ + if (evaluation_phase == FALSE && + (src_object->purgable == VM_PURGABLE_VOLATILE) && + (src_object->ref_count == 1)) { + vm_object_lock(src_object); + vm_object_purge(src_object, 0); + vm_object_unlock(src_object); + } + continue; + } + /* * Pages belonging to this object could be swapped to disk. * Make sure it's not a shared object because we could end @@ -18468,6 +18629,7 @@ again: * more than once within our own map. But we don't do full searches, * we just look at the entries following our current entry. */ + if (src_object->ref_count > 1) { if (src_object != cur_shared_object) { obj_pages_snapshot = (src_object->resident_page_count - src_object->wired_page_count) + vm_compressor_pager_get_count(src_object->pager); @@ -18503,8 +18665,7 @@ again: } } - vm_object_compressed_freezer_pageout(src_object); - + uint32_t paged_out_count = vm_object_compressed_freezer_pageout(src_object, dirty_budget); *wired_count += src_object->wired_page_count; if (vm_compressor_low_on_space() || vm_swap_low_on_space()) { @@ -18519,6 +18680,10 @@ again: kr = KERN_NO_SPACE; break; } + if (paged_out_count >= dirty_budget) { + break; + } + dirty_budget -= paged_out_count; } } } @@ -18550,6 +18715,8 @@ again: goto done; } + vm_purgeable_purge_task_owned(task); + goto again; } else { kr = KERN_SUCCESS; @@ -18923,7 +19090,7 @@ vm_commit_pagezero_status(vm_map_t lmap) pmap_advise_pagezero_range(lmap->pmap, lmap->min_offset); } -#if __x86_64__ +#if !CONFIG_EMBEDDED void vm_map_set_high_start( vm_map_t map, @@ -18931,7 +19098,7 @@ vm_map_set_high_start( { map->vmmap_high_start = high_start; } -#endif /* __x86_64__ */ +#endif #if PMAP_CS kern_return_t @@ -19722,8 +19889,16 @@ vm_map_copy_footprint_ledgers( vm_map_copy_ledger(old_task, new_task, task_ledgers.alternate_accounting); vm_map_copy_ledger(old_task, new_task, task_ledgers.alternate_accounting_compressed); vm_map_copy_ledger(old_task, new_task, task_ledgers.page_table); + vm_map_copy_ledger(old_task, new_task, task_ledgers.tagged_footprint); + vm_map_copy_ledger(old_task, new_task, task_ledgers.tagged_footprint_compressed); vm_map_copy_ledger(old_task, new_task, task_ledgers.network_nonvolatile); vm_map_copy_ledger(old_task, new_task, task_ledgers.network_nonvolatile_compressed); + vm_map_copy_ledger(old_task, new_task, task_ledgers.media_footprint); + vm_map_copy_ledger(old_task, new_task, task_ledgers.media_footprint_compressed); + vm_map_copy_ledger(old_task, new_task, task_ledgers.graphics_footprint); + vm_map_copy_ledger(old_task, new_task, task_ledgers.graphics_footprint_compressed); + vm_map_copy_ledger(old_task, new_task, task_ledgers.neural_footprint); + vm_map_copy_ledger(old_task, new_task, task_ledgers.neural_footprint_compressed); vm_map_copy_ledger(old_task, new_task, task_ledgers.wired_mem); } @@ -19771,3 +19946,146 @@ vm_map_copy_ledger( delta); } } + +#if MACH_ASSERT + +extern int pmap_ledgers_panic; +extern int pmap_ledgers_panic_leeway; + +#define LEDGER_DRIFT(__LEDGER) \ + int __LEDGER##_over; \ + ledger_amount_t __LEDGER##_over_total; \ + ledger_amount_t __LEDGER##_over_max; \ + int __LEDGER##_under; \ + ledger_amount_t __LEDGER##_under_total; \ + ledger_amount_t __LEDGER##_under_max + +struct { + uint64_t num_pmaps_checked; + + LEDGER_DRIFT(phys_footprint); + LEDGER_DRIFT(internal); + LEDGER_DRIFT(internal_compressed); + LEDGER_DRIFT(iokit_mapped); + LEDGER_DRIFT(alternate_accounting); + LEDGER_DRIFT(alternate_accounting_compressed); + LEDGER_DRIFT(page_table); + LEDGER_DRIFT(purgeable_volatile); + LEDGER_DRIFT(purgeable_nonvolatile); + LEDGER_DRIFT(purgeable_volatile_compressed); + LEDGER_DRIFT(purgeable_nonvolatile_compressed); + LEDGER_DRIFT(tagged_nofootprint); + LEDGER_DRIFT(tagged_footprint); + LEDGER_DRIFT(tagged_nofootprint_compressed); + LEDGER_DRIFT(tagged_footprint_compressed); + LEDGER_DRIFT(network_volatile); + LEDGER_DRIFT(network_nonvolatile); + LEDGER_DRIFT(network_volatile_compressed); + LEDGER_DRIFT(network_nonvolatile_compressed); + LEDGER_DRIFT(media_nofootprint); + LEDGER_DRIFT(media_footprint); + LEDGER_DRIFT(media_nofootprint_compressed); + LEDGER_DRIFT(media_footprint_compressed); + LEDGER_DRIFT(graphics_nofootprint); + LEDGER_DRIFT(graphics_footprint); + LEDGER_DRIFT(graphics_nofootprint_compressed); + LEDGER_DRIFT(graphics_footprint_compressed); + LEDGER_DRIFT(neural_nofootprint); + LEDGER_DRIFT(neural_footprint); + LEDGER_DRIFT(neural_nofootprint_compressed); + LEDGER_DRIFT(neural_footprint_compressed); +} pmap_ledgers_drift; + +void +vm_map_pmap_check_ledgers( + pmap_t pmap, + ledger_t ledger, + int pid, + char *procname) +{ + ledger_amount_t bal; + boolean_t do_panic; + + do_panic = FALSE; + + pmap_ledgers_drift.num_pmaps_checked++; + +#define LEDGER_CHECK_BALANCE(__LEDGER) \ +MACRO_BEGIN \ + int panic_on_negative = TRUE; \ + ledger_get_balance(ledger, \ + task_ledgers.__LEDGER, \ + &bal); \ + ledger_get_panic_on_negative(ledger, \ + task_ledgers.__LEDGER, \ + &panic_on_negative); \ + if (bal != 0) { \ + if (panic_on_negative || \ + (pmap_ledgers_panic && \ + pmap_ledgers_panic_leeway > 0 && \ + (bal > (pmap_ledgers_panic_leeway * PAGE_SIZE) || \ + bal < (-pmap_ledgers_panic_leeway * PAGE_SIZE)))) { \ + do_panic = TRUE; \ + } \ + printf("LEDGER BALANCE proc %d (%s) " \ + "\"%s\" = %lld\n", \ + pid, procname, #__LEDGER, bal); \ + if (bal > 0) { \ + pmap_ledgers_drift.__LEDGER##_over++; \ + pmap_ledgers_drift.__LEDGER##_over_total += bal; \ + if (bal > pmap_ledgers_drift.__LEDGER##_over_max) { \ + pmap_ledgers_drift.__LEDGER##_over_max = bal; \ + } \ + } else if (bal < 0) { \ + pmap_ledgers_drift.__LEDGER##_under++; \ + pmap_ledgers_drift.__LEDGER##_under_total += bal; \ + if (bal < pmap_ledgers_drift.__LEDGER##_under_max) { \ + pmap_ledgers_drift.__LEDGER##_under_max = bal; \ + } \ + } \ + } \ +MACRO_END + + LEDGER_CHECK_BALANCE(phys_footprint); + LEDGER_CHECK_BALANCE(internal); + LEDGER_CHECK_BALANCE(internal_compressed); + LEDGER_CHECK_BALANCE(iokit_mapped); + LEDGER_CHECK_BALANCE(alternate_accounting); + LEDGER_CHECK_BALANCE(alternate_accounting_compressed); + LEDGER_CHECK_BALANCE(page_table); + LEDGER_CHECK_BALANCE(purgeable_volatile); + LEDGER_CHECK_BALANCE(purgeable_nonvolatile); + LEDGER_CHECK_BALANCE(purgeable_volatile_compressed); + LEDGER_CHECK_BALANCE(purgeable_nonvolatile_compressed); + LEDGER_CHECK_BALANCE(tagged_nofootprint); + LEDGER_CHECK_BALANCE(tagged_footprint); + LEDGER_CHECK_BALANCE(tagged_nofootprint_compressed); + LEDGER_CHECK_BALANCE(tagged_footprint_compressed); + LEDGER_CHECK_BALANCE(network_volatile); + LEDGER_CHECK_BALANCE(network_nonvolatile); + LEDGER_CHECK_BALANCE(network_volatile_compressed); + LEDGER_CHECK_BALANCE(network_nonvolatile_compressed); + LEDGER_CHECK_BALANCE(media_nofootprint); + LEDGER_CHECK_BALANCE(media_footprint); + LEDGER_CHECK_BALANCE(media_nofootprint_compressed); + LEDGER_CHECK_BALANCE(media_footprint_compressed); + LEDGER_CHECK_BALANCE(graphics_nofootprint); + LEDGER_CHECK_BALANCE(graphics_footprint); + LEDGER_CHECK_BALANCE(graphics_nofootprint_compressed); + LEDGER_CHECK_BALANCE(graphics_footprint_compressed); + LEDGER_CHECK_BALANCE(neural_nofootprint); + LEDGER_CHECK_BALANCE(neural_footprint); + LEDGER_CHECK_BALANCE(neural_nofootprint_compressed); + LEDGER_CHECK_BALANCE(neural_footprint_compressed); + + if (do_panic) { + if (pmap_ledgers_panic) { + panic("pmap_destroy(%p) %d[%s] has imbalanced ledgers\n", + pmap, pid, procname); + } else { + printf("pmap_destroy(%p) %d[%s] has imbalanced ledgers\n", + pmap, pid, procname); + } + } +} +#endif /* MACH_ASSERT */ diff --git a/osfmk/vm/vm_map.h b/osfmk/vm/vm_map.h index 533b8d78c..3360cdfb4 100644 --- a/osfmk/vm/vm_map.h +++ b/osfmk/vm/vm_map.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2009 Apple Inc. All rights reserved. + * Copyright (c) 2000-2019 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -80,6 +80,7 @@ #include #include #include +#include #ifdef KERNEL_PRIVATE @@ -113,6 +114,7 @@ __END_DECLS #include #include +#include #define current_map_fast() (current_thread()->map) #define current_map() (current_map_fast()) @@ -130,7 +132,7 @@ __END_DECLS * used for inter-map copy operations */ typedef struct vm_map_entry *vm_map_entry_t; -#define VM_MAP_ENTRY_NULL ((vm_map_entry_t) 0) +#define VM_MAP_ENTRY_NULL ((vm_map_entry_t) NULL) /* @@ -172,7 +174,7 @@ extern queue_head_t vm_named_entry_list; */ struct vm_named_entry { - decl_lck_mtx_data(, Lock) /* Synchronization */ + decl_lck_mtx_data(, Lock); /* Synchronization */ union { vm_object_t object; /* object I point to */ vm_map_t map; /* map backing submap */ @@ -217,55 +219,6 @@ struct vm_map_links { vm_map_offset_t end; /* end address */ }; -/* - * IMPORTANT: - * The "alias" field can be updated while holding the VM map lock - * "shared". It's OK as along as it's the only field that can be - * updated without the VM map "exclusive" lock. - */ -#define VME_OBJECT(entry) ((entry)->vme_object.vmo_object) -#define VME_OBJECT_SET(entry, object) \ - MACRO_BEGIN \ - (entry)->vme_object.vmo_object = (object); \ - MACRO_END -#define VME_SUBMAP(entry) ((entry)->vme_object.vmo_submap) -#define VME_SUBMAP_SET(entry, submap) \ - MACRO_BEGIN \ - (entry)->vme_object.vmo_submap = (submap); \ - MACRO_END -#define VME_OFFSET(entry) ((entry)->vme_offset & ~PAGE_MASK) -#define VME_OFFSET_SET(entry, offset) \ - MACRO_BEGIN \ - int __alias; \ - __alias = VME_ALIAS((entry)); \ - assert((offset & PAGE_MASK) == 0); \ - (entry)->vme_offset = offset | __alias; \ - MACRO_END -#define VME_OBJECT_SHADOW(entry, length) \ - MACRO_BEGIN \ - vm_object_t __object; \ - vm_object_offset_t __offset; \ - __object = VME_OBJECT((entry)); \ - __offset = VME_OFFSET((entry)); \ - vm_object_shadow(&__object, &__offset, (length)); \ - if (__object != VME_OBJECT((entry))) { \ - VME_OBJECT_SET((entry), __object); \ - (entry)->use_pmap = TRUE; \ - } \ - if (__offset != VME_OFFSET((entry))) { \ - VME_OFFSET_SET((entry), __offset); \ - } \ - MACRO_END - -#define VME_ALIAS_MASK (PAGE_MASK) -#define VME_ALIAS(entry) ((unsigned int)((entry)->vme_offset & VME_ALIAS_MASK)) -#define VME_ALIAS_SET(entry, alias) \ - MACRO_BEGIN \ - vm_map_offset_t __offset; \ - __offset = VME_OFFSET((entry)); \ - (entry)->vme_offset = __offset | ((alias) & VME_ALIAS_MASK); \ - MACRO_END - /* * FOOTPRINT ACCOUNTING: * The "memory footprint" is better described in the pmap layer. @@ -344,8 +297,8 @@ struct vm_map_entry { /* boolean_t */ vme_resilient_codesign:1, /* boolean_t */ vme_resilient_media:1, /* boolean_t */ vme_atomic:1, /* entry cannot be split/coalesced */ - __unused:4; - ; + /* boolean_t */ vme_no_copy_on_read:1, + __unused:3; unsigned short wired_count; /* can be paged if = 0 */ unsigned short user_wired_count; /* for vm_wire */ @@ -362,6 +315,86 @@ struct vm_map_entry { #endif }; +#define VME_SUBMAP_PTR(entry) \ + (&((entry)->vme_object.vmo_submap)) +#define VME_SUBMAP(entry) \ + ((vm_map_t)((uintptr_t)0 + *VME_SUBMAP_PTR(entry))) +#define VME_OBJECT_PTR(entry) \ + (&((entry)->vme_object.vmo_object)) +#define VME_OBJECT(entry) \ + ((vm_object_t)((uintptr_t)0 + *VME_OBJECT_PTR(entry))) +#define VME_OFFSET(entry) \ + ((entry)->vme_offset & ~PAGE_MASK) +#define VME_ALIAS_MASK (PAGE_MASK) +#define VME_ALIAS(entry) \ + ((unsigned int)((entry)->vme_offset & VME_ALIAS_MASK)) + +static inline void +VME_OBJECT_SET( + vm_map_entry_t entry, + vm_object_t object) +{ + entry->vme_object.vmo_object = object; + if (object != VM_OBJECT_NULL && !object->internal) { + entry->vme_resilient_media = FALSE; + } + entry->vme_resilient_codesign = FALSE; + entry->used_for_jit = FALSE; +} +static inline void +VME_SUBMAP_SET( + vm_map_entry_t entry, + vm_map_t submap) +{ + entry->vme_object.vmo_submap = submap; +} +static inline void +VME_OFFSET_SET( + vm_map_entry_t entry, + vm_map_offset_t offset) +{ + int alias; + alias = VME_ALIAS(entry); + assert((offset & PAGE_MASK) == 0); + entry->vme_offset = offset | alias; +} +/* + * IMPORTANT: + * The "alias" field can be updated while holding the VM map lock + * "shared". It's OK as along as it's the only field that can be + * updated without the VM map "exclusive" lock. + */ +static inline void +VME_ALIAS_SET( + vm_map_entry_t entry, + int alias) +{ + vm_map_offset_t offset; + offset = VME_OFFSET(entry); + entry->vme_offset = offset | (alias & VME_ALIAS_MASK); +} + +static inline void +VME_OBJECT_SHADOW( + vm_map_entry_t entry, + vm_object_size_t length) +{ + vm_object_t object; + vm_object_offset_t offset; + + object = VME_OBJECT(entry); + offset = VME_OFFSET(entry); + vm_object_shadow(&object, &offset, length); + if (object != VME_OBJECT(entry)) { + VME_OBJECT_SET(entry, object); + entry->use_pmap = TRUE; + } + if (offset != VME_OFFSET(entry)) { + VME_OFFSET_SET(entry, offset); + } +} + + /* * Convenience macros for dealing with superpages * SUPERPAGE_NBASEPAGES is architecture dependent and defined in pmap.h @@ -426,9 +459,9 @@ struct _vm_map { vm_map_size_t size; /* virtual size */ vm_map_size_t user_wire_limit;/* rlimit on user locked memory */ vm_map_size_t user_wire_size; /* current size of user locked memory in this map */ -#if __x86_64__ +#if !CONFIG_EMBEDDED vm_map_offset_t vmmap_high_start; -#endif /* __x86_64__ */ +#endif union { /* @@ -446,7 +479,7 @@ struct _vm_map { } vmu1; #define highest_entry_end vmu1.vmu1_highest_entry_end #define lowest_unnestable_start vmu1.vmu1_lowest_unnestable_start - decl_lck_mtx_data(, s_lock) /* Lock ref, res fields */ + decl_lck_mtx_data(, s_lock); /* Lock ref, res fields */ lck_mtx_ext_t s_lock_ext; vm_map_entry_t hint; /* hint for quick lookups */ union { @@ -455,7 +488,7 @@ struct _vm_map { } vmmap_u_1; #define hole_hint vmmap_u_1.vmmap_hole_hint #define vmmap_corpse_footprint vmmap_u_1.vmmap_corpse_footprint - union{ + union { vm_map_entry_t _first_free; /* First free space hint */ struct vm_map_links* _holes; /* links all holes between entries */ } f_s; /* Union for free space data structures being used */ @@ -463,7 +496,7 @@ struct _vm_map { #define first_free f_s._first_free #define holes_list f_s._holes - int map_refcnt; /* Reference count */ + struct os_refcnt map_refcnt; /* Reference count */ #if TASK_SWAPPER int res_count; /* Residence count (swap) */ @@ -483,8 +516,7 @@ struct _vm_map { /* boolean_t */ map_disallow_new_exec:1, /* Disallow new executable code */ /* boolean_t */ jit_entry_exists:1, /* boolean_t */ has_corpse_footprint:1, - /* boolean_t */ warned_delete_gap:1, - /* reserved */ pad:19; + /* reserved */ pad:20; unsigned int timestamp; /* Version number */ }; @@ -633,39 +665,14 @@ struct vm_map_copy { lck_rw_lock_exclusive_to_shared(&(map)->lock); \ MACRO_END -/* - * lock_read_to_write() returns FALSE on failure. This function evaluates to - * zero on success and non-zero value on failure. - */ -static inline int -vm_map_lock_read_to_write(vm_map_t map) -{ - if (lck_rw_lock_shared_to_exclusive(&(map)->lock)) { - DTRACE_VM(vm_map_lock_upgrade); - return 0; - } - return 1; -} +__attribute__((always_inline)) +int vm_map_lock_read_to_write(vm_map_t map); -static inline boolean_t -vm_map_try_lock(vm_map_t map) -{ - if (lck_rw_try_lock_exclusive(&(map)->lock)) { - DTRACE_VM(vm_map_lock_w); - return TRUE; - } - return FALSE; -} +__attribute__((always_inline)) +boolean_t vm_map_try_lock(vm_map_t map); -static inline boolean_t -vm_map_try_lock_read(vm_map_t map) -{ - if (lck_rw_try_lock_shared(&(map)->lock)) { - DTRACE_VM(vm_map_lock_r); - return TRUE; - } - return FALSE; -} +__attribute__((always_inline)) +boolean_t vm_map_try_lock_read(vm_map_t map); #if MACH_ASSERT || DEBUG #define vm_map_lock_assert_held(map) \ @@ -767,6 +774,7 @@ extern vm_map_entry_t vm_map_entry_insert( unsigned wired_count, boolean_t no_cache, boolean_t permanent, + boolean_t no_copy_on_read, unsigned int superpage_size, boolean_t clear_map_aligned, boolean_t is_submap, @@ -810,14 +818,14 @@ extern void vm_map_reference_swap( #else /* MACH_ASSERT */ #define vm_map_reference(map) \ -MACRO_BEGIN \ - vm_map_t Map = (map); \ - if (Map) { \ - lck_mtx_lock(&Map->s_lock); \ - Map->res_count++; \ - Map->map_refcnt++; \ - lck_mtx_unlock(&Map->s_lock); \ - } \ +MACRO_BEGIN \ + vm_map_t Map = (map); \ + if (Map) { \ + lck_mtx_lock(&Map->s_lock); \ + Map->res_count++; \ + os_ref_retain(&Map->map_refcnt); \ + lck_mtx_unlock(&Map->s_lock); \ + } \ MACRO_END #define vm_map_res_reference(map) \ @@ -850,7 +858,7 @@ MACRO_END MACRO_BEGIN \ vm_map_t Map = (map); \ lck_mtx_lock(&Map->s_lock); \ - ++Map->map_refcnt; \ + os_ref_retain(&Map->map_refcnt);\ vm_map_res_reference(Map); \ lck_mtx_unlock(&Map->s_lock); \ MACRO_END @@ -869,7 +877,7 @@ MACRO_BEGIN \ vm_map_t Map = (map); \ if (Map) { \ lck_mtx_lock(&Map->s_lock); \ - Map->map_refcnt++; \ + os_ref_retain(&Map->map_refcnt);\ lck_mtx_unlock(&Map->s_lock); \ } \ MACRO_END @@ -1447,6 +1455,9 @@ extern void vm_map_set_32bit( extern void vm_map_set_jumbo( vm_map_t map); +extern void vm_map_set_jit_entitled( + vm_map_t map); + extern void vm_map_set_max_addr( vm_map_t map, vm_map_offset_t new_max_offset); @@ -1474,11 +1485,11 @@ extern kern_return_t vm_map_raise_max_offset( extern kern_return_t vm_map_raise_min_offset( vm_map_t map, vm_map_offset_t new_min_offset); -#if __x86_64__ +#if !CONFIG_EMBEDDED extern void vm_map_set_high_start( vm_map_t map, vm_map_offset_t high_start); -#endif /* __x86_64__ */ +#endif extern vm_map_offset_t vm_compute_max_offset( boolean_t is64); @@ -1534,6 +1545,20 @@ extern boolean_t vm_map_page_aligned( vm_map_offset_t offset, vm_map_offset_t mask); +static inline int +vm_map_range_overflows(vm_map_offset_t addr, vm_map_size_t size) +{ + vm_map_offset_t sum; + return os_add_overflow(addr, size, &sum); +} + +static inline int +mach_vm_range_overflows(mach_vm_offset_t addr, mach_vm_size_t size) +{ + mach_vm_offset_t sum; + return os_add_overflow(addr, size, &sum); +} + #ifdef XNU_KERNEL_PRIVATE extern kern_return_t vm_map_page_info( vm_map_t map, @@ -1590,13 +1615,16 @@ static inline void vm_prot_to_wimg(unsigned int prot, unsigned int *wimg) { switch (prot) { - case MAP_MEM_NOOP: break; - case MAP_MEM_IO: *wimg = VM_WIMG_IO; break; - case MAP_MEM_COPYBACK: *wimg = VM_WIMG_USE_DEFAULT; break; - case MAP_MEM_INNERWBACK: *wimg = VM_WIMG_INNERWBACK; break; - case MAP_MEM_POSTED: *wimg = VM_WIMG_POSTED; break; - case MAP_MEM_WTHRU: *wimg = VM_WIMG_WTHRU; break; - case MAP_MEM_WCOMB: *wimg = VM_WIMG_WCOMB; break; + case MAP_MEM_NOOP: break; + case MAP_MEM_IO: *wimg = VM_WIMG_IO; break; + case MAP_MEM_COPYBACK: *wimg = VM_WIMG_USE_DEFAULT; break; + case MAP_MEM_INNERWBACK: *wimg = VM_WIMG_INNERWBACK; break; + case MAP_MEM_POSTED: *wimg = VM_WIMG_POSTED; break; + case MAP_MEM_POSTED_REORDERED: *wimg = VM_WIMG_POSTED_REORDERED; break; + case MAP_MEM_POSTED_COMBINED_REORDERED: *wimg = VM_WIMG_POSTED_COMBINED_REORDERED; break; + case MAP_MEM_WTHRU: *wimg = VM_WIMG_WTHRU; break; + case MAP_MEM_WCOMB: *wimg = VM_WIMG_WCOMB; break; + case MAP_MEM_RT: *wimg = VM_WIMG_RT; break; default: panic("Unrecognized mapping type %u\n", prot); } @@ -1671,7 +1699,7 @@ extern int vm_map_disconnect_page_mappings( #if CONFIG_FREEZE extern kern_return_t vm_map_freeze( - vm_map_t map, + task_t task, unsigned int *purgeable_count, unsigned int *wired_count, unsigned int *clean_count, diff --git a/osfmk/vm/vm_map_store.c b/osfmk/vm/vm_map_store.c index e4782aedb..df03e1ca0 100644 --- a/osfmk/vm/vm_map_store.c +++ b/osfmk/vm/vm_map_store.c @@ -125,7 +125,7 @@ _vm_map_store_entry_link( struct vm_map_header * mapHdr, vm_map_entry_t after_wh #endif #if MAP_ENTRY_INSERTION_DEBUG backtrace(&entry->vme_insertion_bt[0], - (sizeof(entry->vme_insertion_bt) / sizeof(uintptr_t))); + (sizeof(entry->vme_insertion_bt) / sizeof(uintptr_t)), NULL); #endif } diff --git a/osfmk/vm/vm_map_store_rb.c b/osfmk/vm/vm_map_store_rb.c index c66a1446a..b036575e7 100644 --- a/osfmk/vm/vm_map_store_rb.c +++ b/osfmk/vm/vm_map_store_rb.c @@ -57,14 +57,15 @@ rb_node_compare(struct vm_map_store *node, struct vm_map_store *parent) return 0; } +__dead2 void -vm_map_store_walk_rb( vm_map_t map, vm_map_entry_t *wrong_vme, vm_map_entry_t *vm_entry) +vm_map_store_walk_rb(vm_map_t map, vm_map_entry_t *wrong_vme, vm_map_entry_t *vm_entry) { - struct vm_map_header hdr = map->hdr; - struct vm_map_store *rb_entry = RB_ROOT(&(hdr.rb_head_store)); - vm_map_entry_t cur = *vm_entry; + struct vm_map_header *hdr = &map->hdr; + struct vm_map_store *rb_entry = RB_ROOT(&hdr->rb_head_store); + vm_map_entry_t cur = *vm_entry; - rb_entry = RB_FIND( rb_head, &(hdr.rb_head_store), &(cur->store)); + rb_entry = RB_FIND(rb_head, &hdr->rb_head_store, &(cur->store)); if (rb_entry == NULL) { panic("NO SUCH ENTRY %p. Gave back %p", *vm_entry, *wrong_vme); } else { @@ -74,12 +75,12 @@ vm_map_store_walk_rb( vm_map_t map, vm_map_entry_t *wrong_vme, vm_map_entry_t *v boolean_t -vm_map_store_lookup_entry_rb( vm_map_t map, vm_map_offset_t address, vm_map_entry_t *vm_entry) +vm_map_store_lookup_entry_rb(vm_map_t map, vm_map_offset_t address, vm_map_entry_t *vm_entry) { - struct vm_map_header hdr = map->hdr; - struct vm_map_store *rb_entry = RB_ROOT(&(hdr.rb_head_store)); - vm_map_entry_t cur = vm_map_to_entry(map); - vm_map_entry_t prev = VM_MAP_ENTRY_NULL; + struct vm_map_header *hdr = &map->hdr; + struct vm_map_store *rb_entry = RB_ROOT(&hdr->rb_head_store); + vm_map_entry_t cur = vm_map_to_entry(map); + vm_map_entry_t prev = VM_MAP_ENTRY_NULL; while (rb_entry != (struct vm_map_store*)NULL) { cur = VME_FOR_STORE(rb_entry); @@ -226,7 +227,7 @@ check_map_sanity(vm_map_t map, vm_map_entry_t old_hole_entry) return; } - hole_entry = (vm_map_entry_t) map->holes_list; + hole_entry = CAST_DOWN(vm_map_entry_t, map->holes_list); next_hole_entry = hole_entry->vme_next; map_entry = vm_map_first_entry(map); @@ -236,7 +237,7 @@ check_map_sanity(vm_map_t map, vm_map_entry_t old_hole_entry) hole_entry = next_hole_entry; next_hole_entry = hole_entry->vme_next; - if (hole_entry == (vm_map_entry_t)map->holes_list) { + if (hole_entry == CAST_DOWN(vm_map_entry_t, map->holes_list)) { break; } } @@ -264,7 +265,7 @@ check_map_sanity(vm_map_t map, vm_map_entry_t old_hole_entry) hole_entry = next_hole_entry; next_hole_entry = hole_entry->vme_next; - if (hole_entry == (vm_map_entry_t)map->holes_list) { + if (hole_entry == CAST_DOWN(vm_map_entry_t, map->holes_list)) { break; } } diff --git a/osfmk/vm/vm_object.c b/osfmk/vm/vm_object.c index 099fdea8d..5b6250afc 100644 --- a/osfmk/vm/vm_object.c +++ b/osfmk/vm/vm_object.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2007 Apple Inc. All rights reserved. + * Copyright (c) 2000-2018 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -80,7 +80,6 @@ #include #include #include -#include #include #include #include @@ -353,10 +352,6 @@ _vm_object_allocate( vm_object_size_t size, vm_object_t object) { - XPR(XPR_VM_OBJECT, - "vm_object_allocate, object 0x%X size 0x%X\n", - object, size, 0, 0, 0); - *object = vm_object_template; vm_page_queue_init(&object->memq); #if UPL_DEBUG || CONFIG_IOSCHED @@ -539,8 +534,8 @@ vm_object_bootstrap(void) vm_object_template.volatile_fault = FALSE; vm_object_template.all_reusable = FALSE; vm_object_template.blocked_access = FALSE; - vm_object_template.vo_ledger_tag = VM_OBJECT_LEDGER_TAG_NONE; - vm_object_template.__object2_unused_bits = 0; + vm_object_template.vo_ledger_tag = VM_LEDGER_TAG_NONE; + vm_object_template.vo_no_footprint = FALSE; #if CONFIG_IOSCHED || UPL_DEBUG vm_object_template.uplq.prev = NULL; vm_object_template.uplq.next = NULL; @@ -650,6 +645,7 @@ vm_io_reprioritize_init(void) result = kernel_thread_start_priority(io_reprioritize_thread, NULL, 95 /* MAXPRI_KERNEL */, &thread); if (result == KERN_SUCCESS) { + thread_set_thread_name(thread, "VM_io_reprioritize_thread"); thread_deallocate(thread); } else { panic("Could not create io_reprioritize_thread"); @@ -671,6 +667,7 @@ vm_object_reaper_init(void) if (kr != KERN_SUCCESS) { panic("failed to launch vm_object_reaper_thread kr=0x%x", kr); } + thread_set_thread_name(thread, "VM_object_reaper_thread"); thread_deallocate(thread); } @@ -909,12 +906,6 @@ vm_object_deallocate( continue; } - XPR(XPR_VM_OBJECT, - "vm_o_deallocate: 0x%X res %d paging_ops %d thread 0x%p ref %d\n", - object, object->resident_page_count, - object->paging_in_progress, - (void *)current_thread(), object->ref_count); - VM_OBJ_RES_DECR(object); /* XXX ? */ /* * Terminate this object. If it had a shadow, @@ -1333,9 +1324,6 @@ vm_object_terminate( { vm_object_t shadow_object; - XPR(XPR_VM_OBJECT, "vm_object_terminate, object 0x%X ref %d\n", - object, object->ref_count, 0, 0, 0); - vm_object_lock_assert_exclusive(object); if (!object->pageout && (!object->internal && object->can_persist) && @@ -1484,12 +1472,21 @@ vm_object_reap( if (object->internal && (object->purgable != VM_PURGABLE_DENY || object->vo_ledger_tag)) { + int ledger_flags; + kern_return_t kr; + + ledger_flags = 0; + if (object->vo_no_footprint) { + ledger_flags |= VM_LEDGER_FLAG_NO_FOOTPRINT; + } assert(!object->alive); assert(object->terminating); - vm_object_ownership_change(object, - object->vo_ledger_tag, /* unchanged */ - NULL, /* no owner */ - FALSE); /* task_objq not locked */ + kr = vm_object_ownership_change(object, + object->vo_ledger_tag, /* unchanged */ + NULL, /* no owner */ + ledger_flags, + FALSE); /* task_objq not locked */ + assert(kr == KERN_SUCCESS); assert(object->vo_owner == NULL); } @@ -2109,7 +2106,7 @@ typedef uint64_t chunk_state_t; * while processing a higher level object in the shadow chain. */ -#define PAGE_ALREADY_HANDLED(c, p) (((c) & (1LL << (p))) == 0) +#define PAGE_ALREADY_HANDLED(c, p) (((c) & (1ULL << (p))) == 0) /* * Mark the page at offset 'p' in the bit map as having been processed. @@ -2117,7 +2114,7 @@ typedef uint64_t chunk_state_t; #define MARK_PAGE_HANDLED(c, p) \ MACRO_BEGIN \ - (c) = (c) & ~(1LL << (p)); \ + (c) = (c) & ~(1ULL << (p)); \ MACRO_END @@ -2875,9 +2872,6 @@ vm_object_copy_slowly( struct vm_object_fault_info fault_info = {}; - XPR(XPR_VM_OBJECT, "v_o_c_slowly obj 0x%x off 0x%x size 0x%x\n", - src_object, src_offset, size, 0, 0); - if (size == 0) { vm_object_unlock(src_object); *_result_object = VM_OBJECT_NULL; @@ -3018,7 +3012,6 @@ vm_object_copy_slowly( } fault_info.cluster_size = cluster_size; - XPR(XPR_VM_FAULT, "vm_object_copy_slowly -> vm_fault_page", 0, 0, 0, 0, 0); _result_page = VM_PAGE_NULL; result = vm_fault_page(src_object, src_offset, VM_PROT_READ, FALSE, @@ -3161,8 +3154,6 @@ vm_object_copy_quickly( vm_object_t object = *_object; memory_object_copy_strategy_t copy_strategy; - XPR(XPR_VM_OBJECT, "v_o_c_quickly obj 0x%x off 0x%x size 0x%x\n", - *_object, offset, size, 0, 0); if (object == VM_OBJECT_NULL) { *_src_needs_copy = FALSE; *_dst_needs_copy = FALSE; @@ -3674,10 +3665,6 @@ Retry: vm_object_unlock(src_object); vm_object_unlock(new_copy); - XPR(XPR_VM_OBJECT, - "vm_object_copy_delayed: used copy object %X for source %X\n", - new_copy, src_object, 0, 0, 0); - return new_copy; } @@ -3776,7 +3763,6 @@ vm_object_copy_strategically( break; case MEMORY_OBJECT_COPY_SYMMETRIC: - XPR(XPR_VM_OBJECT, "v_o_c_strategically obj 0x%x off 0x%x size 0x%x\n", src_object, src_offset, size, 0, 0); vm_object_unlock(src_object); result = KERN_MEMORY_RESTART_COPY; break; @@ -4003,6 +3989,7 @@ vm_object_memory_object_associate( assert(object->pager_created); assert(!object->pager_initialized); assert(!object->pager_ready); + assert(object->pager_trusted); } else { object = vm_object_allocate(size); assert(object != VM_OBJECT_NULL); @@ -4124,6 +4111,7 @@ vm_object_compressor_pager_create( */ object->pager_created = TRUE; + object->pager_trusted = TRUE; object->paging_offset = 0; vm_object_unlock(object); @@ -4444,9 +4432,6 @@ vm_object_do_collapse( backing_object->alive = FALSE; vm_object_unlock(backing_object); - XPR(XPR_VM_OBJECT, "vm_object_collapse, collapsed 0x%X\n", - backing_object, 0, 0, 0, 0); - #if VM_OBJECT_TRACKING if (vm_object_tracking_inited) { btlog_remove_entries_for_element(vm_object_tracking_btlog, @@ -4624,9 +4609,6 @@ vm_object_collapse( return; } - XPR(XPR_VM_OBJECT, "vm_object_collapse, obj 0x%X\n", - object, 0, 0, 0, 0); - if (object == VM_OBJECT_NULL) { return; } @@ -4794,12 +4776,6 @@ retry: goto retry; } - XPR(XPR_VM_OBJECT, - "vm_object_collapse: %x to %x, pager %x, pager_control %x\n", - backing_object, object, - backing_object->pager, - backing_object->pager_control, 0); - /* * Collapse the object with its backing * object, and try again with the object's @@ -5153,10 +5129,6 @@ vm_object_coalesce( return TRUE; } - XPR(XPR_VM_OBJECT, - "vm_object_coalesce: 0x%X prev_off 0x%X prev_size 0x%X next_size 0x%X\n", - prev_object, prev_offset, prev_size, next_size, 0); - vm_object_lock(prev_object); /* @@ -5516,11 +5488,6 @@ vm_object_lock_request( should_flush = flags & MEMORY_OBJECT_DATA_FLUSH; - XPR(XPR_MEMORY_OBJECT, - "vm_o_lock_request, obj 0x%X off 0x%X size 0x%X flags %X prot %X\n", - object, offset, size, - (((should_return & 1) << 1) | should_flush), prot); - /* * Check for bogus arguments. */ @@ -6579,8 +6546,6 @@ MACRO_END assert(object1->__object3_unused_bits == 0); assert(object2->__object3_unused_bits == 0); #endif /* CONFIG_SECLUDED_MEMORY */ - assert(object1->__object2_unused_bits == 0); - assert(object2->__object2_unused_bits == 0); #if UPL_DEBUG /* "uplq" refers to the object not its contents (see upl_transpose()) */ #endif @@ -7491,15 +7456,16 @@ vm_object_compressed_freezer_done() } -void +uint32_t vm_object_compressed_freezer_pageout( - vm_object_t object) + vm_object_t object, uint32_t dirty_budget) { vm_page_t p; vm_page_t local_freeq = NULL; int local_freed = 0; kern_return_t retval = KERN_SUCCESS; int obj_resident_page_count_snapshot = 0; + uint32_t paged_out_count = 0; assert(object != VM_OBJECT_NULL); assert(object->internal); @@ -7517,7 +7483,7 @@ vm_object_compressed_freezer_pageout( if (!object->pager_initialized || object->pager == MEMORY_OBJECT_NULL) { vm_object_unlock(object); - return; + return paged_out_count; } } @@ -7563,7 +7529,7 @@ vm_object_compressed_freezer_pageout( vm_object_activity_begin(object); - while ((obj_resident_page_count_snapshot--) && !vm_page_queue_empty(&object->memq)) { + while ((obj_resident_page_count_snapshot--) && !vm_page_queue_empty(&object->memq) && paged_out_count < dirty_budget) { p = (vm_page_t)vm_page_queue_first(&object->memq); KERNEL_DEBUG(0xe0430004 | DBG_FUNC_START, object, local_freed, 0, 0, 0); @@ -7643,6 +7609,7 @@ vm_object_compressed_freezer_pageout( p->vmp_snext = local_freeq; local_freeq = p; local_freed++; + paged_out_count++; if (local_freed >= MAX_FREE_BATCH) { OSAddAtomic64(local_freed, &vm_pageout_vminfo.vm_pageout_compressions); @@ -7681,6 +7648,7 @@ vm_object_compressed_freezer_pageout( thread_yield_internal(FREEZER_DUTY_CYCLE_OFF_MS); clock_get_uptime(&c_freezer_last_yield_ts); } + return paged_out_count; } #endif /* CONFIG_FREEZE */ @@ -8110,24 +8078,96 @@ vm_object_ledger_tag_ledgers( { assert(object->shadow == VM_OBJECT_NULL); + *do_footprint = !object->vo_no_footprint; + switch (object->vo_ledger_tag) { - case VM_OBJECT_LEDGER_TAG_NONE: - /* regular purgeable memory */ + case VM_LEDGER_TAG_NONE: + /* + * Regular purgeable memory: + * counts in footprint only when nonvolatile. + */ + *do_footprint = TRUE; assert(object->purgable != VM_PURGABLE_DENY); *ledger_idx_volatile = task_ledgers.purgeable_volatile; *ledger_idx_nonvolatile = task_ledgers.purgeable_nonvolatile; *ledger_idx_volatile_compressed = task_ledgers.purgeable_volatile_compressed; *ledger_idx_nonvolatile_compressed = task_ledgers.purgeable_nonvolatile_compressed; - *do_footprint = TRUE; break; - case VM_OBJECT_LEDGER_TAG_NETWORK: + case VM_LEDGER_TAG_DEFAULT: + /* + * "default" tagged memory: + * counts in footprint only when nonvolatile and not marked + * as "no_footprint". + */ + *ledger_idx_volatile = task_ledgers.tagged_nofootprint; + *ledger_idx_volatile_compressed = task_ledgers.tagged_nofootprint_compressed; + if (*do_footprint) { + *ledger_idx_nonvolatile = task_ledgers.tagged_footprint; + *ledger_idx_nonvolatile_compressed = task_ledgers.tagged_footprint_compressed; + } else { + *ledger_idx_nonvolatile = task_ledgers.tagged_nofootprint; + *ledger_idx_nonvolatile_compressed = task_ledgers.tagged_nofootprint_compressed; + } + break; + case VM_LEDGER_TAG_NETWORK: + /* + * "network" tagged memory: + * never counts in footprint. + */ + *do_footprint = FALSE; *ledger_idx_volatile = task_ledgers.network_volatile; *ledger_idx_volatile_compressed = task_ledgers.network_volatile_compressed; *ledger_idx_nonvolatile = task_ledgers.network_nonvolatile; *ledger_idx_nonvolatile_compressed = task_ledgers.network_nonvolatile_compressed; - *do_footprint = FALSE; break; - case VM_OBJECT_LEDGER_TAG_MEDIA: + case VM_LEDGER_TAG_MEDIA: + /* + * "media" tagged memory: + * counts in footprint only when nonvolatile and not marked + * as "no footprint". + */ + *ledger_idx_volatile = task_ledgers.media_nofootprint; + *ledger_idx_volatile_compressed = task_ledgers.media_nofootprint_compressed; + if (*do_footprint) { + *ledger_idx_nonvolatile = task_ledgers.media_footprint; + *ledger_idx_nonvolatile_compressed = task_ledgers.media_footprint_compressed; + } else { + *ledger_idx_nonvolatile = task_ledgers.media_nofootprint; + *ledger_idx_nonvolatile_compressed = task_ledgers.media_nofootprint_compressed; + } + break; + case VM_LEDGER_TAG_GRAPHICS: + /* + * "graphics" tagged memory: + * counts in footprint only when nonvolatile and not marked + * as "no footprint". + */ + *ledger_idx_volatile = task_ledgers.graphics_nofootprint; + *ledger_idx_volatile_compressed = task_ledgers.graphics_nofootprint_compressed; + if (*do_footprint) { + *ledger_idx_nonvolatile = task_ledgers.graphics_footprint; + *ledger_idx_nonvolatile_compressed = task_ledgers.graphics_footprint_compressed; + } else { + *ledger_idx_nonvolatile = task_ledgers.graphics_nofootprint; + *ledger_idx_nonvolatile_compressed = task_ledgers.graphics_nofootprint_compressed; + } + break; + case VM_LEDGER_TAG_NEURAL: + /* + * "neural" tagged memory: + * counts in footprint only when nonvolatile and not marked + * as "no footprint". + */ + *ledger_idx_volatile = task_ledgers.neural_nofootprint; + *ledger_idx_volatile_compressed = task_ledgers.neural_nofootprint_compressed; + if (*do_footprint) { + *ledger_idx_nonvolatile = task_ledgers.neural_footprint; + *ledger_idx_nonvolatile_compressed = task_ledgers.neural_footprint_compressed; + } else { + *ledger_idx_nonvolatile = task_ledgers.neural_nofootprint; + *ledger_idx_nonvolatile_compressed = task_ledgers.neural_nofootprint_compressed; + } + break; default: panic("%s: object %p has unsupported ledger_tag %d\n", __FUNCTION__, object, object->vo_ledger_tag); @@ -8139,7 +8179,8 @@ vm_object_ownership_change( vm_object_t object, int new_ledger_tag, task_t new_owner, - boolean_t task_objq_locked) + int new_ledger_flags, + boolean_t old_task_objq_locked) { int old_ledger_tag; task_t old_owner; @@ -8151,14 +8192,84 @@ vm_object_ownership_change( int ledger_idx_nonvolatile_compressed; int ledger_idx; int ledger_idx_compressed; - boolean_t do_footprint; + boolean_t do_footprint, old_no_footprint, new_no_footprint; + boolean_t new_task_objq_locked; vm_object_lock_assert_exclusive(object); - assert(object->internal); + + if (!object->internal) { + return KERN_INVALID_ARGUMENT; + } + if (new_ledger_tag == VM_LEDGER_TAG_NONE && + object->purgable == VM_PURGABLE_DENY) { + /* non-purgeable memory must have a valid non-zero ledger tag */ + return KERN_INVALID_ARGUMENT; + } + if (new_ledger_tag < 0 || + new_ledger_tag > VM_LEDGER_TAG_MAX) { + return KERN_INVALID_ARGUMENT; + } + if (new_ledger_flags & ~VM_LEDGER_FLAGS) { + return KERN_INVALID_ARGUMENT; + } + if (object->vo_ledger_tag == VM_LEDGER_TAG_NONE && + object->purgable == VM_PURGABLE_DENY) { + /* + * This VM object is neither ledger-tagged nor purgeable. + * We can convert it to "ledger tag" ownership iff it + * has not been used at all yet (no resident pages and + * no pager) and it's going to be assigned to a valid task. + */ + if (object->resident_page_count != 0 || + object->pager != NULL || + object->pager_created || + object->ref_count != 1 || + object->vo_owner != TASK_NULL || + object->copy_strategy != MEMORY_OBJECT_COPY_NONE || + new_owner == TASK_NULL) { + return KERN_FAILURE; + } + } + + if (new_ledger_flags & VM_LEDGER_FLAG_NO_FOOTPRINT) { + new_no_footprint = TRUE; + } else { + new_no_footprint = FALSE; + } +#if __arm64__ + if (!new_no_footprint && + object->purgable != VM_PURGABLE_DENY && + new_owner != TASK_NULL && + new_owner != VM_OBJECT_OWNER_DISOWNED && + new_owner->task_legacy_footprint) { + /* + * This task has been granted "legacy footprint" and should + * not be charged for its IOKit purgeable memory. Since we + * might now change the accounting of such memory to the + * "graphics" ledger, for example, give it the "no footprint" + * option. + */ + new_no_footprint = TRUE; + } +#endif /* __arm64__ */ + assert(object->copy_strategy == MEMORY_OBJECT_COPY_NONE); + assert(object->shadow == VM_OBJECT_NULL); + assert(object->copy == VM_OBJECT_NULL); old_ledger_tag = object->vo_ledger_tag; + old_no_footprint = object->vo_no_footprint; old_owner = VM_OBJECT_OWNER(object); + DTRACE_VM7(object_ownership_change, + vm_object_t, object, + task_t, old_owner, + int, old_ledger_tag, + int, old_no_footprint, + task_t, new_owner, + int, new_ledger_tag, + int, new_no_footprint); + + assert(object->internal); resident_count = object->resident_page_count - object->wired_page_count; wired_count = object->wired_page_count; compressed_count = vm_compressor_pager_get_count(object->pager); @@ -8169,8 +8280,9 @@ vm_object_ownership_change( if (old_owner != TASK_NULL && ((old_owner != new_owner) /* new owner ... */ || /* ... or ... */ - (old_ledger_tag && /* ... new ledger */ - old_ledger_tag != new_ledger_tag))) { + (old_no_footprint != new_no_footprint) /* new "no_footprint" */ + || /* ... or ... */ + old_ledger_tag != new_ledger_tag)) { /* ... new ledger */ /* * Take this object off of the old owner's ledgers. */ @@ -8236,10 +8348,11 @@ vm_object_ownership_change( /* remove object from old_owner's list of owned objects */ DTRACE_VM2(object_owner_remove, vm_object_t, object, - task_t, new_owner); - if (!task_objq_locked) { + task_t, old_owner); + if (!old_task_objq_locked) { task_objq_lock(old_owner); } + old_owner->task_owned_objects--; queue_remove(&old_owner->task_objq, object, vm_object_t, task_objq); switch (object->purgable) { @@ -8255,7 +8368,7 @@ vm_object_ownership_change( default: break; } - if (!task_objq_locked) { + if (!old_task_objq_locked) { task_objq_unlock(old_owner); } } @@ -8264,12 +8377,49 @@ vm_object_ownership_change( /* * Switch to new ledger tag and/or owner. */ + + new_task_objq_locked = FALSE; + if (new_owner != old_owner && + new_owner != TASK_NULL && + new_owner != VM_OBJECT_OWNER_DISOWNED) { + /* + * If the new owner is not accepting new objects ("disowning"), + * the object becomes "disowned" and will be added to + * the kernel's task_objq. + * + * Check first without locking, to avoid blocking while the + * task is disowning its objects. + */ + if (new_owner->task_objects_disowning) { + new_owner = VM_OBJECT_OWNER_DISOWNED; + } else { + task_objq_lock(new_owner); + /* check again now that we have the lock */ + if (new_owner->task_objects_disowning) { + new_owner = VM_OBJECT_OWNER_DISOWNED; + task_objq_unlock(new_owner); + } else { + new_task_objq_locked = TRUE; + } + } + } + object->vo_ledger_tag = new_ledger_tag; object->vo_owner = new_owner; + object->vo_no_footprint = new_no_footprint; if (new_owner == VM_OBJECT_OWNER_DISOWNED) { + /* + * Disowned objects are added to the kernel's task_objq but + * are marked as owned by "VM_OBJECT_OWNER_DISOWNED" to + * differentiate them from objects intentionally owned by + * the kernel. + */ assert(old_owner != kernel_task); new_owner = kernel_task; + assert(!new_task_objq_locked); + task_objq_lock(new_owner); + new_task_objq_locked = TRUE; } /* @@ -8278,8 +8428,9 @@ vm_object_ownership_change( if (new_owner != TASK_NULL && ((new_owner != old_owner) /* new owner ... */ || /* ... or ... */ - (new_ledger_tag && /* ... new ledger */ - new_ledger_tag != old_ledger_tag))) { + (new_no_footprint != old_no_footprint) /* ... new "no_footprint" */ + || /* ... or ... */ + new_ledger_tag != old_ledger_tag)) { /* ... new ledger */ /* * Add this object to the new owner's ledgers. */ @@ -8346,7 +8497,8 @@ vm_object_ownership_change( DTRACE_VM2(object_owner_add, vm_object_t, object, task_t, new_owner); - task_objq_lock(new_owner); + assert(new_task_objq_locked); + new_owner->task_owned_objects++; queue_enter(&new_owner->task_objq, object, vm_object_t, task_objq); switch (object->purgable) { @@ -8362,9 +8514,100 @@ vm_object_ownership_change( default: break; } - task_objq_unlock(new_owner); } } + if (new_task_objq_locked) { + task_objq_unlock(new_owner); + } + return KERN_SUCCESS; } + +void +vm_owned_objects_disown( + task_t task) +{ + vm_object_t next_object; + vm_object_t object; + int collisions; + kern_return_t kr; + + if (task == NULL) { + return; + } + + collisions = 0; + +again: + if (task->task_objects_disowned) { + /* task has already disowned its owned objects */ + assert(task->task_volatile_objects == 0); + assert(task->task_nonvolatile_objects == 0); + assert(task->task_owned_objects == 0); + return; + } + + task_objq_lock(task); + + task->task_objects_disowning = TRUE; + + for (object = (vm_object_t) queue_first(&task->task_objq); + !queue_end(&task->task_objq, (queue_entry_t) object); + object = next_object) { + if (task->task_nonvolatile_objects == 0 && + task->task_volatile_objects == 0 && + task->task_owned_objects == 0) { + /* no more objects owned by "task" */ + break; + } + + next_object = (vm_object_t) queue_next(&object->task_objq); + +#if DEBUG + assert(object->vo_purgeable_volatilizer == NULL); +#endif /* DEBUG */ + assert(object->vo_owner == task); + if (!vm_object_lock_try(object)) { + task_objq_unlock(task); + mutex_pause(collisions++); + goto again; + } + /* transfer ownership to the kernel */ + assert(VM_OBJECT_OWNER(object) != kernel_task); + kr = vm_object_ownership_change( + object, + object->vo_ledger_tag, /* unchanged */ + VM_OBJECT_OWNER_DISOWNED, /* new owner */ + 0, /* new_ledger_flags */ + TRUE); /* old_owner->task_objq locked */ + assert(kr == KERN_SUCCESS); + assert(object->vo_owner == VM_OBJECT_OWNER_DISOWNED); + vm_object_unlock(object); + } + + if (__improbable(task->task_volatile_objects != 0 || + task->task_nonvolatile_objects != 0 || + task->task_owned_objects != 0)) { + panic("%s(%p): volatile=%d nonvolatile=%d owned=%d q=%p q_first=%p q_last=%p", + __FUNCTION__, + task, + task->task_volatile_objects, + task->task_nonvolatile_objects, + task->task_owned_objects, + &task->task_objq, + queue_first(&task->task_objq), + queue_last(&task->task_objq)); + } + + /* there shouldn't be any objects owned by task now */ + assert(task->task_volatile_objects == 0); + assert(task->task_nonvolatile_objects == 0); + assert(task->task_owned_objects == 0); + assert(task->task_objects_disowning); + + /* and we don't need to try and disown again */ + task->task_objects_disowned = TRUE; + + task_objq_unlock(task); +} diff --git a/osfmk/vm/vm_object.h b/osfmk/vm/vm_object.h index 3fef567bb..eedfb09e4 100644 --- a/osfmk/vm/vm_object.h +++ b/osfmk/vm/vm_object.h @@ -128,7 +128,9 @@ struct vm_object_fault_info { /* boolean_t */ pmap_cs_associated:1, /* boolean_t */ mark_zf_absent:1, /* boolean_t */ batch_pmap_op:1, - __vm_object_fault_info_unused_bits:25; + /* boolean_t */ resilient_media:1, + /* boolean_t */ no_copy_on_read:1, + __vm_object_fault_info_unused_bits:23; int pmap_options; }; @@ -362,8 +364,8 @@ struct vm_object { #else /* VM_OBJECT_ACCESS_TRACKING */ __unused_access_tracking:1, #endif /* VM_OBJECT_ACCESS_TRACKING */ - vo_ledger_tag:2, - __object2_unused_bits:2; /* for expansion */ + vo_ledger_tag:3, + vo_no_footprint:1; #if VM_OBJECT_ACCESS_TRACKING uint32_t access_tracking_reads; @@ -407,12 +409,6 @@ struct vm_object { #endif /* DEBUG */ }; -/* values for object->vo_ledger_tag */ -#define VM_OBJECT_LEDGER_TAG_NONE 0 -#define VM_OBJECT_LEDGER_TAG_NETWORK 1 -#define VM_OBJECT_LEDGER_TAG_MEDIA 2 -#define VM_OBJECT_LEDGER_TAG_RESERVED 3 - #define VM_OBJECT_PURGEABLE_FAULT_ERROR(object) \ ((object)->volatile_fault && \ ((object)->purgable == VM_PURGABLE_VOLATILE || \ @@ -892,9 +888,9 @@ __private_extern__ void vm_object_reap_pages( #if CONFIG_FREEZE -__private_extern__ void +__private_extern__ uint32_t vm_object_compressed_freezer_pageout( - vm_object_t object); + vm_object_t object, uint32_t dirty_budget); __private_extern__ void vm_object_compressed_freezer_done( @@ -1203,8 +1199,9 @@ extern void vm_object_ledger_tag_ledgers( boolean_t *do_footprint); extern kern_return_t vm_object_ownership_change( vm_object_t object, - int ledger_tag, - task_t owner, + int new_ledger_tag, + task_t new_owner, + int new_ledger_flags, boolean_t task_objq_locked); #endif /* _VM_VM_OBJECT_H_ */ diff --git a/osfmk/vm/vm_page.h b/osfmk/vm/vm_page.h index 9e0304dbf..e9a3fbdf8 100644 --- a/osfmk/vm/vm_page.h +++ b/osfmk/vm/vm_page.h @@ -373,11 +373,14 @@ vm_page_pack_ptr(uintptr_t p) static inline uintptr_t vm_page_unpack_ptr(uintptr_t p) { + extern unsigned int vm_pages_count; + if (!p) { return (uintptr_t)0; } if (p & VM_PACKED_FROM_VM_PAGES_ARRAY) { + assert((uint32_t)(p & ~VM_PACKED_FROM_VM_PAGES_ARRAY) < vm_pages_count); return (uintptr_t)(&vm_pages[(uint32_t)(p & ~VM_PACKED_FROM_VM_PAGES_ARRAY)]); } return (p << VM_PACKED_POINTER_SHIFT) + (uintptr_t) VM_MIN_KERNEL_AND_KEXT_ADDRESS; @@ -1151,9 +1154,36 @@ unsigned int vm_page_inactive_count; /* How many pages are inactive? */ extern unsigned int vm_page_secluded_count; /* How many pages are secluded? */ extern -unsigned int vm_page_secluded_count_free; +unsigned int vm_page_secluded_count_free; /* how many of them are free? */ extern -unsigned int vm_page_secluded_count_inuse; +unsigned int vm_page_secluded_count_inuse; /* how many of them are in use? */ +/* + * We keep filling the secluded pool with new eligible pages and + * we can overshoot our target by a lot. + * When there's memory pressure, vm_pageout_scan() will re-balance the queues, + * pushing the extra secluded pages to the active or free queue. + * Since these "over target" secluded pages are actually "available", jetsam + * should consider them as such, so make them visible to jetsam via the + * "vm_page_secluded_count_over_target" counter and update it whenever we + * update vm_page_secluded_count or vm_page_secluded_target. + */ +extern +unsigned int vm_page_secluded_count_over_target; +#define VM_PAGE_SECLUDED_COUNT_OVER_TARGET_UPDATE() \ + MACRO_BEGIN \ + if (vm_page_secluded_count > vm_page_secluded_target) { \ + vm_page_secluded_count_over_target = \ + (vm_page_secluded_count - vm_page_secluded_target); \ + } else { \ + vm_page_secluded_count_over_target = 0; \ + } \ + MACRO_END +#define VM_PAGE_SECLUDED_COUNT_OVER_TARGET() vm_page_secluded_count_over_target +#else /* CONFIG_SECLUDED_MEMORY */ +#define VM_PAGE_SECLUDED_COUNT_OVER_TARGET_UPDATE() \ + MACRO_BEGIN \ + MACRO_END +#define VM_PAGE_SECLUDED_COUNT_OVER_TARGET() 0 #endif /* CONFIG_SECLUDED_MEMORY */ extern unsigned int vm_page_cleaned_count; /* How many pages are in the clean queue? */ @@ -1195,6 +1225,8 @@ extern unsigned int vm_page_gobble_count; extern unsigned int vm_page_stolen_count; /* Count of stolen pages not acccounted in zones */ +extern +unsigned int vm_page_kern_lpage_count; /* Count of large pages used in early boot */ #if DEVELOPMENT || DEBUG @@ -1453,6 +1485,7 @@ extern void memorystatus_pages_update(unsigned int pages_avail); memorystatus_pages_update( \ vm_page_pageable_external_count + \ vm_page_free_count + \ + VM_PAGE_SECLUDED_COUNT_OVER_TARGET() + \ (VM_DYNAMIC_PAGING_ENABLED() ? 0 : vm_page_purgeable_count) \ ); \ } while(0) diff --git a/osfmk/vm/vm_pageout.c b/osfmk/vm/vm_pageout.c index 6b6e3d04d..21b7d3951 100644 --- a/osfmk/vm/vm_pageout.c +++ b/osfmk/vm/vm_pageout.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2014 Apple Inc. All rights reserved. + * Copyright (c) 2000-2019 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -87,7 +87,6 @@ #include #include #include -#include #include #include #include @@ -137,12 +136,17 @@ extern unsigned int memorystatus_frozen_count; extern unsigned int memorystatus_suspended_count; extern vm_pressure_level_t memorystatus_vm_pressure_level; +extern lck_mtx_t memorystatus_jetsam_fg_band_lock; +extern uint32_t memorystatus_jetsam_fg_band_waiters; + void vm_pressure_response(void); extern void consider_vm_pressure_events(void); #define MEMORYSTATUS_SUSPENDED_THRESHOLD 4 #endif /* VM_PRESSURE_EVENTS */ +thread_t vm_pageout_scan_thread = THREAD_NULL; +boolean_t vps_dynamic_priority_enabled = FALSE; #ifndef VM_PAGEOUT_BURST_INACTIVE_THROTTLE /* maximum iterations of the inactive queue w/o stealing/cleaning a page */ #ifdef CONFIG_EMBEDDED @@ -306,9 +310,13 @@ extern void vm_pageout_scan(void); void vm_tests(void); /* forward */ +boolean_t vm_pageout_running = FALSE; + +uint32_t vm_page_upl_tainted = 0; +uint32_t vm_page_iopl_tainted = 0; + #if !CONFIG_EMBEDDED static boolean_t vm_pageout_waiter = FALSE; -static boolean_t vm_pageout_running = FALSE; #endif /* !CONFIG_EMBEDDED */ @@ -529,11 +537,6 @@ vm_pageclean_setup( assert(!m->vmp_cleaning); #endif - XPR(XPR_VM_PAGEOUT, - "vm_pageclean_setup, obj 0x%X off 0x%X page 0x%X new 0x%X new_off 0x%X\n", - VM_PAGE_OBJECT(m), m->vmp_offset, m, - new_m, new_offset); - pmap_clear_modify(VM_PAGE_GET_PHYS_PAGE(m)); /* @@ -589,10 +592,6 @@ vm_pageout_initialize_page( vm_object_offset_t paging_offset; memory_object_t pager; - XPR(XPR_VM_PAGEOUT, - "vm_pageout_initialize_page, page 0x%X\n", - m, 0, 0, 0, 0); - assert(VM_CONFIG_COMPRESSOR_IS_PRESENT); object = VM_PAGE_OBJECT(m); @@ -699,11 +698,6 @@ vm_pageout_cluster(vm_page_t m) vm_object_t object = VM_PAGE_OBJECT(m); struct vm_pageout_queue *q; - - XPR(XPR_VM_PAGEOUT, - "vm_pageout_cluster, object 0x%X offset 0x%X page 0x%X\n", - object, m->vmp_offset, m, 0, 0); - VM_PAGE_CHECK(m); LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED); vm_object_lock_assert_exclusive(object); @@ -1741,694 +1735,1356 @@ update_vm_info(void) extern boolean_t hibernation_vmqueues_inspection; -void -vm_page_balance_inactive(int max_to_move) -{ - vm_page_t m; +/* + * Return values for functions called by vm_pageout_scan + * that control its flow. + * + * PROCEED -- vm_pageout_scan will keep making forward progress. + * DONE_RETURN -- page demand satisfied, work is done -> vm_pageout_scan returns. + * NEXT_ITERATION -- restart the 'for' loop in vm_pageout_scan aka continue. + */ - LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED); +#define VM_PAGEOUT_SCAN_PROCEED (0) +#define VM_PAGEOUT_SCAN_DONE_RETURN (1) +#define VM_PAGEOUT_SCAN_NEXT_ITERATION (2) + +/* + * This function is called only from vm_pageout_scan and + * it moves overflow secluded pages (one-at-a-time) to the + * batched 'local' free Q or active Q. + */ +static void +vps_deal_with_secluded_page_overflow(vm_page_t *local_freeq, int *local_freed) +{ +#if CONFIG_SECLUDED_MEMORY + /* + * Deal with secluded_q overflow. + */ + if (vm_page_secluded_count > vm_page_secluded_target) { + vm_page_t secluded_page; - if (hibernation_vmqueues_inspection == TRUE) { /* - * It is likely that the hibernation code path is - * dealing with these very queues as we are about - * to move pages around in/from them and completely - * change the linkage of the pages. - * - * And so we skip the rebalancing of these queues. + * SECLUDED_AGING_BEFORE_ACTIVE: + * Excess secluded pages go to the active queue and + * will later go to the inactive queue. */ - return; + assert((vm_page_secluded_count_free + + vm_page_secluded_count_inuse) == + vm_page_secluded_count); + secluded_page = (vm_page_t)vm_page_queue_first(&vm_page_queue_secluded); + assert(secluded_page->vmp_q_state == VM_PAGE_ON_SECLUDED_Q); + + vm_page_queues_remove(secluded_page, FALSE); + assert(!secluded_page->vmp_fictitious); + assert(!VM_PAGE_WIRED(secluded_page)); + + if (secluded_page->vmp_object == 0) { + /* transfer to free queue */ + assert(secluded_page->vmp_busy); + secluded_page->vmp_snext = *local_freeq; + *local_freeq = secluded_page; + *local_freed += 1; + } else { + /* transfer to head of active queue */ + vm_page_enqueue_active(secluded_page, FALSE); + secluded_page = VM_PAGE_NULL; + } } - vm_page_inactive_target = VM_PAGE_INACTIVE_TARGET(vm_page_active_count + - vm_page_inactive_count + - vm_page_speculative_count); +#else /* CONFIG_SECLUDED_MEMORY */ - while (max_to_move-- && (vm_page_inactive_count + vm_page_speculative_count) < vm_page_inactive_target) { - VM_PAGEOUT_DEBUG(vm_pageout_balanced, 1); - - m = (vm_page_t) vm_page_queue_first(&vm_page_queue_active); +#pragma unused(local_freeq) +#pragma unused(local_freed) - assert(m->vmp_q_state == VM_PAGE_ON_ACTIVE_Q); - assert(!m->vmp_laundry); - assert(VM_PAGE_OBJECT(m) != kernel_object); - assert(VM_PAGE_GET_PHYS_PAGE(m) != vm_page_guard_addr); + return; - DTRACE_VM2(scan, int, 1, (uint64_t *), NULL); +#endif /* CONFIG_SECLUDED_MEMORY */ +} - /* - * by not passing in a pmap_flush_context we will forgo any TLB flushing, local or otherwise... - * - * a TLB flush isn't really needed here since at worst we'll miss the reference bit being - * updated in the PTE if a remote processor still has this mapping cached in its TLB when the - * new reference happens. If no futher references happen on the page after that remote TLB flushes - * we'll see a clean, non-referenced page when it eventually gets pulled out of the inactive queue - * by pageout_scan, which is just fine since the last reference would have happened quite far - * in the past (TLB caches don't hang around for very long), and of course could just as easily - * have happened before we moved the page - */ - if (m->vmp_pmapped == TRUE) { - pmap_clear_refmod_options(VM_PAGE_GET_PHYS_PAGE(m), VM_MEM_REFERENCED, PMAP_OPTIONS_NOFLUSH, (void *)NULL); - } +/* + * This function is called only from vm_pageout_scan and + * it initializes the loop targets for vm_pageout_scan(). + */ +static void +vps_init_page_targets(void) +{ + /* + * LD TODO: Other page targets should be calculated here too. + */ + vm_page_anonymous_min = vm_page_inactive_target / 20; - /* - * The page might be absent or busy, - * but vm_page_deactivate can handle that. - * FALSE indicates that we don't want a H/W clear reference - */ - vm_page_deactivate_internal(m, FALSE); + if (vm_pageout_state.vm_page_speculative_percentage > 50) { + vm_pageout_state.vm_page_speculative_percentage = 50; + } else if (vm_pageout_state.vm_page_speculative_percentage <= 0) { + vm_pageout_state.vm_page_speculative_percentage = 1; } -} + vm_pageout_state.vm_page_speculative_target = VM_PAGE_SPECULATIVE_TARGET(vm_page_active_count + + vm_page_inactive_count); +} /* - * vm_pageout_scan does the dirty work for the pageout daemon. - * It returns with both vm_page_queue_free_lock and vm_page_queue_lock - * held and vm_page_free_wanted == 0. + * This function is called only from vm_pageout_scan and + * it purges a single VM object at-a-time and will either + * make vm_pageout_scan() restart the loop or keeping moving forward. */ -void -vm_pageout_scan(void) +static int +vps_purge_object() { - unsigned int loop_count = 0; - unsigned int inactive_burst_count = 0; - unsigned int reactivated_this_call; - unsigned int reactivate_limit; - vm_page_t local_freeq = NULL; - int local_freed = 0; - int delayed_unlock; - int delayed_unlock_limit = 0; - int refmod_state = 0; - int vm_pageout_deadlock_target = 0; - struct vm_pageout_queue *iq; - struct vm_pageout_queue *eq; - struct vm_speculative_age_q *sq; - struct flow_control flow_control = { 0, { 0, 0 } }; - boolean_t inactive_throttled = FALSE; - mach_timespec_t ts; - unsigned int msecs = 0; - vm_object_t object = NULL; - uint32_t inactive_reclaim_run; - boolean_t exceeded_burst_throttle; - boolean_t grab_anonymous = FALSE; - boolean_t force_anonymous = FALSE; - boolean_t force_speculative_aging = FALSE; - int anons_grabbed = 0; - int page_prev_q_state = 0; -#if CONFIG_BACKGROUND_QUEUE - boolean_t page_from_bg_q = FALSE; -#endif - int cache_evict_throttle = 0; - uint32_t vm_pageout_inactive_external_forced_reactivate_limit = 0; - uint32_t inactive_external_count; - int force_purge = 0; - int divisor; -#define DELAY_SPECULATIVE_AGE 1000 - int delay_speculative_age = 0; - vm_object_t m_object = VM_OBJECT_NULL; + int force_purge; + + assert(available_for_purge >= 0); + force_purge = 0; /* no force-purging */ #if VM_PRESSURE_EVENTS vm_pressure_level_t pressure_level; -#endif /* VM_PRESSURE_EVENTS */ - - VM_DEBUG_CONSTANT_EVENT(vm_pageout_scan, VM_PAGEOUT_SCAN, DBG_FUNC_START, - vm_pageout_vminfo.vm_pageout_freed_speculative, - vm_pageout_state.vm_pageout_inactive_clean, - vm_pageout_vminfo.vm_pageout_inactive_dirty_internal, - vm_pageout_vminfo.vm_pageout_inactive_dirty_external); - flow_control.state = FCS_IDLE; - iq = &vm_pageout_queue_internal; - eq = &vm_pageout_queue_external; - sq = &vm_page_queue_speculative[VM_PAGE_SPECULATIVE_AGED_Q]; + pressure_level = memorystatus_vm_pressure_level; + if (pressure_level > kVMPressureNormal) { + if (pressure_level >= kVMPressureCritical) { + force_purge = vm_pageout_state.memorystatus_purge_on_critical; + } else if (pressure_level >= kVMPressureUrgent) { + force_purge = vm_pageout_state.memorystatus_purge_on_urgent; + } else if (pressure_level >= kVMPressureWarning) { + force_purge = vm_pageout_state.memorystatus_purge_on_warning; + } + } +#endif /* VM_PRESSURE_EVENTS */ - XPR(XPR_VM_PAGEOUT, "vm_pageout_scan\n", 0, 0, 0, 0, 0); + if (available_for_purge || force_purge) { + memoryshot(VM_PAGEOUT_PURGEONE, DBG_FUNC_START); - /* Ask the pmap layer to return any pages it no longer needs. */ - uint64_t pmap_wired_pages_freed = pmap_release_pages_fast(); + VM_DEBUG_EVENT(vm_pageout_purgeone, VM_PAGEOUT_PURGEONE, DBG_FUNC_START, vm_page_free_count, 0, 0, 0); + if (vm_purgeable_object_purge_one(force_purge, C_DONT_BLOCK)) { + VM_PAGEOUT_DEBUG(vm_pageout_purged_objects, 1); + VM_DEBUG_EVENT(vm_pageout_purgeone, VM_PAGEOUT_PURGEONE, DBG_FUNC_END, vm_page_free_count, 0, 0, 0); + memoryshot(VM_PAGEOUT_PURGEONE, DBG_FUNC_END); - vm_page_lock_queues(); + return VM_PAGEOUT_SCAN_NEXT_ITERATION; + } + VM_DEBUG_EVENT(vm_pageout_purgeone, VM_PAGEOUT_PURGEONE, DBG_FUNC_END, 0, 0, 0, -1); + memoryshot(VM_PAGEOUT_PURGEONE, DBG_FUNC_END); + } - vm_page_wire_count -= pmap_wired_pages_freed; + return VM_PAGEOUT_SCAN_PROCEED; +} - delayed_unlock = 1; +/* + * This function is called only from vm_pageout_scan and + * it will try to age the next speculative Q if the oldest + * one is empty. + */ +static int +vps_age_speculative_queue(boolean_t force_speculative_aging) +{ +#define DELAY_SPECULATIVE_AGE 1000 /* - * Calculate the max number of referenced pages on the inactive - * queue that we will reactivate. + * try to pull pages from the aging bins... + * see vm_page.h for an explanation of how + * this mechanism works */ - reactivated_this_call = 0; - reactivate_limit = VM_PAGE_REACTIVATE_LIMIT(vm_page_active_count + - vm_page_inactive_count); - inactive_reclaim_run = 0; + boolean_t can_steal = FALSE; + int num_scanned_queues; + static int delay_speculative_age = 0; /* depends the # of times we go through the main pageout_scan loop.*/ + mach_timespec_t ts; + struct vm_speculative_age_q *aq; + struct vm_speculative_age_q *sq; - vm_pageout_inactive_external_forced_reactivate_limit = vm_page_active_count + vm_page_inactive_count; + sq = &vm_page_queue_speculative[VM_PAGE_SPECULATIVE_AGED_Q]; - /* - * We must limit the rate at which we send pages to the pagers - * so that we don't tie up too many pages in the I/O queues. - * We implement a throttling mechanism using the laundry count - * to limit the number of pages outstanding to the default - * and external pagers. We can bypass the throttles and look - * for clean pages if the pageout queues don't drain in a timely - * fashion since this may indicate that the pageout paths are - * stalled waiting for memory, which only we can provide. - */ + aq = &vm_page_queue_speculative[speculative_steal_index]; -Restart: + num_scanned_queues = 0; + while (vm_page_queue_empty(&aq->age_q) && + num_scanned_queues++ != VM_PAGE_MAX_SPECULATIVE_AGE_Q) { + speculative_steal_index++; - assert(object == NULL); - assert(delayed_unlock != 0); + if (speculative_steal_index > VM_PAGE_MAX_SPECULATIVE_AGE_Q) { + speculative_steal_index = VM_PAGE_MIN_SPECULATIVE_AGE_Q; + } - vm_page_anonymous_min = vm_page_inactive_target / 20; + aq = &vm_page_queue_speculative[speculative_steal_index]; + } - if (vm_pageout_state.vm_page_speculative_percentage > 50) { - vm_pageout_state.vm_page_speculative_percentage = 50; - } else if (vm_pageout_state.vm_page_speculative_percentage <= 0) { - vm_pageout_state.vm_page_speculative_percentage = 1; + if (num_scanned_queues == VM_PAGE_MAX_SPECULATIVE_AGE_Q + 1) { + /* + * XXX We've scanned all the speculative + * queues but still haven't found one + * that is not empty, even though + * vm_page_speculative_count is not 0. + */ + if (!vm_page_queue_empty(&sq->age_q)) { + return VM_PAGEOUT_SCAN_NEXT_ITERATION; + } +#if DEVELOPMENT || DEBUG + panic("vm_pageout_scan: vm_page_speculative_count=%d but queues are empty", vm_page_speculative_count); +#endif + /* readjust... */ + vm_page_speculative_count = 0; + /* ... and continue */ + return VM_PAGEOUT_SCAN_NEXT_ITERATION; } - vm_pageout_state.vm_page_speculative_target = VM_PAGE_SPECULATIVE_TARGET(vm_page_active_count + - vm_page_inactive_count); + if (vm_page_speculative_count > vm_pageout_state.vm_page_speculative_target || force_speculative_aging == TRUE) { + can_steal = TRUE; + } else { + if (!delay_speculative_age) { + mach_timespec_t ts_fully_aged; - for (;;) { - vm_page_t m; + ts_fully_aged.tv_sec = (VM_PAGE_MAX_SPECULATIVE_AGE_Q * vm_pageout_state.vm_page_speculative_q_age_ms) / 1000; + ts_fully_aged.tv_nsec = ((VM_PAGE_MAX_SPECULATIVE_AGE_Q * vm_pageout_state.vm_page_speculative_q_age_ms) % 1000) + * 1000 * NSEC_PER_USEC; - DTRACE_VM2(rev, int, 1, (uint64_t *), NULL); + ADD_MACH_TIMESPEC(&ts_fully_aged, &aq->age_ts); - if (vm_upl_wait_for_pages < 0) { - vm_upl_wait_for_pages = 0; + clock_sec_t sec; + clock_nsec_t nsec; + clock_get_system_nanotime(&sec, &nsec); + ts.tv_sec = (unsigned int) sec; + ts.tv_nsec = nsec; + + if (CMP_MACH_TIMESPEC(&ts, &ts_fully_aged) >= 0) { + can_steal = TRUE; + } else { + delay_speculative_age++; + } + } else { + delay_speculative_age++; + if (delay_speculative_age == DELAY_SPECULATIVE_AGE) { + delay_speculative_age = 0; + } } + } + if (can_steal == TRUE) { + vm_page_speculate_ageit(aq); + } - delayed_unlock_limit = VM_PAGEOUT_DELAYED_UNLOCK_LIMIT + vm_upl_wait_for_pages; + return VM_PAGEOUT_SCAN_PROCEED; +} - if (delayed_unlock_limit > VM_PAGEOUT_DELAYED_UNLOCK_LIMIT_MAX) { - delayed_unlock_limit = VM_PAGEOUT_DELAYED_UNLOCK_LIMIT_MAX; +/* + * This function is called only from vm_pageout_scan and + * it evicts a single VM object from the cache. + */ +static int inline +vps_object_cache_evict(vm_object_t *object_to_unlock) +{ + static int cache_evict_throttle = 0; + struct vm_speculative_age_q *sq; + + sq = &vm_page_queue_speculative[VM_PAGE_SPECULATIVE_AGED_Q]; + + if (vm_page_queue_empty(&sq->age_q) && cache_evict_throttle == 0) { + int pages_evicted; + + if (*object_to_unlock != NULL) { + vm_object_unlock(*object_to_unlock); + *object_to_unlock = NULL; } + KERNEL_DEBUG_CONSTANT(0x13001ec | DBG_FUNC_START, 0, 0, 0, 0, 0); -#if CONFIG_SECLUDED_MEMORY - /* - * Deal with secluded_q overflow. - */ - if (vm_page_secluded_count > vm_page_secluded_target) { - vm_page_t secluded_page; + pages_evicted = vm_object_cache_evict(100, 10); + + KERNEL_DEBUG_CONSTANT(0x13001ec | DBG_FUNC_END, pages_evicted, 0, 0, 0, 0); + + if (pages_evicted) { + vm_pageout_vminfo.vm_pageout_pages_evicted += pages_evicted; + + VM_DEBUG_EVENT(vm_pageout_cache_evict, VM_PAGEOUT_CACHE_EVICT, DBG_FUNC_NONE, + vm_page_free_count, pages_evicted, vm_pageout_vminfo.vm_pageout_pages_evicted, 0); + memoryshot(VM_PAGEOUT_CACHE_EVICT, DBG_FUNC_NONE); /* - * SECLUDED_AGING_BEFORE_ACTIVE: - * Excess secluded pages go to the active queue and - * will later go to the inactive queue. + * we just freed up to 100 pages, + * so go back to the top of the main loop + * and re-evaulate the memory situation */ - assert((vm_page_secluded_count_free + - vm_page_secluded_count_inuse) == - vm_page_secluded_count); - secluded_page = (vm_page_t)vm_page_queue_first(&vm_page_queue_secluded); - assert(secluded_page->vmp_q_state == VM_PAGE_ON_SECLUDED_Q); - - vm_page_queues_remove(secluded_page, FALSE); - assert(!secluded_page->vmp_fictitious); - assert(!VM_PAGE_WIRED(secluded_page)); - - if (secluded_page->vmp_object == 0) { - /* transfer to free queue */ - assert(secluded_page->vmp_busy); - secluded_page->vmp_snext = local_freeq; - local_freeq = secluded_page; - local_freed++; - } else { - /* transfer to head of active queue */ - vm_page_enqueue_active(secluded_page, FALSE); - secluded_page = VM_PAGE_NULL; - } + return VM_PAGEOUT_SCAN_NEXT_ITERATION; + } else { + cache_evict_throttle = 1000; } -#endif /* CONFIG_SECLUDED_MEMORY */ + } + if (cache_evict_throttle) { + cache_evict_throttle--; + } - assert(delayed_unlock); + return VM_PAGEOUT_SCAN_PROCEED; +} + + +/* + * This function is called only from vm_pageout_scan and + * it calculates the filecache min. that needs to be maintained + * as we start to steal pages. + */ +static void +vps_calculate_filecache_min(void) +{ + int divisor = vm_pageout_state.vm_page_filecache_min_divisor; +#if CONFIG_JETSAM + /* + * don't let the filecache_min fall below 15% of available memory + * on systems with an active compressor that isn't nearing its + * limits w/r to accepting new data + * + * on systems w/o the compressor/swapper, the filecache is always + * a very large percentage of the AVAILABLE_NON_COMPRESSED_MEMORY + * since most (if not all) of the anonymous pages are in the + * throttled queue (which isn't counted as available) which + * effectively disables this filter + */ + if (vm_compressor_low_on_space() || divisor == 0) { + vm_pageout_state.vm_page_filecache_min = 0; + } else { + vm_pageout_state.vm_page_filecache_min = + ((AVAILABLE_NON_COMPRESSED_MEMORY) * 10) / divisor; + } +#else + if (vm_compressor_out_of_space() || divisor == 0) { + vm_pageout_state.vm_page_filecache_min = 0; + } else { /* - * maintain our balance + * don't let the filecache_min fall below the specified critical level */ - vm_page_balance_inactive(1); + vm_pageout_state.vm_page_filecache_min = + ((AVAILABLE_NON_COMPRESSED_MEMORY) * 10) / divisor; + } +#endif + if (vm_page_free_count < (vm_page_free_reserved / 4)) { + vm_pageout_state.vm_page_filecache_min = 0; + } +} +/* + * This function is called only from vm_pageout_scan and + * it updates the flow control time to detect if VM pageoutscan + * isn't making progress. + */ +static void +vps_flow_control_reset_deadlock_timer(struct flow_control *flow_control) +{ + mach_timespec_t ts; + clock_sec_t sec; + clock_nsec_t nsec; - /********************************************************************** - * above this point we're playing with the active and secluded queues - * below this point we're playing with the throttling mechanisms - * and the inactive queue - **********************************************************************/ + ts.tv_sec = vm_pageout_state.vm_pageout_deadlock_wait / 1000; + ts.tv_nsec = (vm_pageout_state.vm_pageout_deadlock_wait % 1000) * 1000 * NSEC_PER_USEC; + clock_get_system_nanotime(&sec, &nsec); + flow_control->ts.tv_sec = (unsigned int) sec; + flow_control->ts.tv_nsec = nsec; + ADD_MACH_TIMESPEC(&flow_control->ts, &ts); - if (vm_page_free_count + local_freed >= vm_page_free_target) { - vm_pageout_scan_wants_object = VM_OBJECT_NULL; + flow_control->state = FCS_DELAYED; - vm_pageout_prepare_to_block(&object, &delayed_unlock, &local_freeq, &local_freed, - VM_PAGEOUT_PB_CONSIDER_WAKING_COMPACTOR_SWAPPER); - /* - * make sure the pageout I/O threads are running - * throttled in case there are still requests - * in the laundry... since we have met our targets - * we don't need the laundry to be cleaned in a timely - * fashion... so let's avoid interfering with foreground - * activity - */ - vm_pageout_adjust_eq_iothrottle(eq, TRUE); + vm_pageout_vminfo.vm_pageout_scan_inactive_throttled_internal++; +} - lck_mtx_lock(&vm_page_queue_free_lock); +/* + * This function is called only from vm_pageout_scan and + * it is the flow control logic of VM pageout scan which + * controls if it should block and for how long. + * Any blocking of vm_pageout_scan happens ONLY in this function. + */ +static int +vps_flow_control(struct flow_control *flow_control, int *anons_grabbed, vm_object_t *object, int *delayed_unlock, + vm_page_t *local_freeq, int *local_freed, int *vm_pageout_deadlock_target, unsigned int inactive_burst_count) +{ + boolean_t exceeded_burst_throttle = FALSE; + unsigned int msecs = 0; + uint32_t inactive_external_count; + mach_timespec_t ts; + struct vm_pageout_queue *iq; + struct vm_pageout_queue *eq; + struct vm_speculative_age_q *sq; - if ((vm_page_free_count >= vm_page_free_target) && - (vm_page_free_wanted == 0) && (vm_page_free_wanted_privileged == 0)) { + iq = &vm_pageout_queue_internal; + eq = &vm_pageout_queue_external; + sq = &vm_page_queue_speculative[VM_PAGE_SPECULATIVE_AGED_Q]; + + /* + * Sometimes we have to pause: + * 1) No inactive pages - nothing to do. + * 2) Loop control - no acceptable pages found on the inactive queue + * within the last vm_pageout_burst_inactive_throttle iterations + * 3) Flow control - default pageout queue is full + */ + if (vm_page_queue_empty(&vm_page_queue_inactive) && + vm_page_queue_empty(&vm_page_queue_anonymous) && + vm_page_queue_empty(&vm_page_queue_cleaned) && + vm_page_queue_empty(&sq->age_q)) { + VM_PAGEOUT_DEBUG(vm_pageout_scan_empty_throttle, 1); + msecs = vm_pageout_state.vm_pageout_empty_wait; + } else if (inactive_burst_count >= + MIN(vm_pageout_state.vm_pageout_burst_inactive_throttle, + (vm_page_inactive_count + + vm_page_speculative_count))) { + VM_PAGEOUT_DEBUG(vm_pageout_scan_burst_throttle, 1); + msecs = vm_pageout_state.vm_pageout_burst_wait; + + exceeded_burst_throttle = TRUE; + } else if (VM_PAGE_Q_THROTTLED(iq) && + VM_DYNAMIC_PAGING_ENABLED()) { + clock_sec_t sec; + clock_nsec_t nsec; + + switch (flow_control->state) { + case FCS_IDLE: + if ((vm_page_free_count + *local_freed) < vm_page_free_target && + vm_pageout_state.vm_restricted_to_single_processor == FALSE) { /* - * done - we have met our target *and* - * there is no one waiting for a page. + * since the compressor is running independently of vm_pageout_scan + * let's not wait for it just yet... as long as we have a healthy supply + * of filecache pages to work with, let's keep stealing those. */ -return_from_scan: - assert(vm_pageout_scan_wants_object == VM_OBJECT_NULL); - - VM_DEBUG_CONSTANT_EVENT(vm_pageout_scan, VM_PAGEOUT_SCAN, DBG_FUNC_NONE, - vm_pageout_state.vm_pageout_inactive, - vm_pageout_state.vm_pageout_inactive_used, 0, 0); - VM_DEBUG_CONSTANT_EVENT(vm_pageout_scan, VM_PAGEOUT_SCAN, DBG_FUNC_END, - vm_pageout_vminfo.vm_pageout_freed_speculative, - vm_pageout_state.vm_pageout_inactive_clean, - vm_pageout_vminfo.vm_pageout_inactive_dirty_internal, - vm_pageout_vminfo.vm_pageout_inactive_dirty_external); + inactive_external_count = vm_page_inactive_count - vm_page_anonymous_count; - return; + if (vm_page_pageable_external_count > vm_pageout_state.vm_page_filecache_min && + (inactive_external_count >= VM_PAGE_INACTIVE_TARGET(vm_page_pageable_external_count))) { + *anons_grabbed = ANONS_GRABBED_LIMIT; + VM_PAGEOUT_DEBUG(vm_pageout_scan_throttle_deferred, 1); + return VM_PAGEOUT_SCAN_PROCEED; + } } - lck_mtx_unlock(&vm_page_queue_free_lock); - } - /* - * Before anything, we check if we have any ripe volatile - * objects around. If so, try to purge the first object. - * If the purge fails, fall through to reclaim a page instead. - * If the purge succeeds, go back to the top and reevalute - * the new memory situation. - */ + vps_flow_control_reset_deadlock_timer(flow_control); + msecs = vm_pageout_state.vm_pageout_deadlock_wait; - assert(available_for_purge >= 0); - force_purge = 0; /* no force-purging */ + break; -#if VM_PRESSURE_EVENTS - pressure_level = memorystatus_vm_pressure_level; + case FCS_DELAYED: + clock_get_system_nanotime(&sec, &nsec); + ts.tv_sec = (unsigned int) sec; + ts.tv_nsec = nsec; - if (pressure_level > kVMPressureNormal) { - if (pressure_level >= kVMPressureCritical) { - force_purge = vm_pageout_state.memorystatus_purge_on_critical; - } else if (pressure_level >= kVMPressureUrgent) { - force_purge = vm_pageout_state.memorystatus_purge_on_urgent; - } else if (pressure_level >= kVMPressureWarning) { - force_purge = vm_pageout_state.memorystatus_purge_on_warning; + if (CMP_MACH_TIMESPEC(&ts, &flow_control->ts) >= 0) { + /* + * the pageout thread for the default pager is potentially + * deadlocked since the + * default pager queue has been throttled for more than the + * allowable time... we need to move some clean pages or dirty + * pages belonging to the external pagers if they aren't throttled + * vm_page_free_wanted represents the number of threads currently + * blocked waiting for pages... we'll move one page for each of + * these plus a fixed amount to break the logjam... once we're done + * moving this number of pages, we'll re-enter the FSC_DELAYED state + * with a new timeout target since we have no way of knowing + * whether we've broken the deadlock except through observation + * of the queue associated with the default pager... we need to + * stop moving pages and allow the system to run to see what + * state it settles into. + */ + + *vm_pageout_deadlock_target = vm_pageout_state.vm_pageout_deadlock_relief + + vm_page_free_wanted + vm_page_free_wanted_privileged; + VM_PAGEOUT_DEBUG(vm_pageout_scan_deadlock_detected, 1); + flow_control->state = FCS_DEADLOCK_DETECTED; + thread_wakeup((event_t) &vm_pageout_garbage_collect); + return VM_PAGEOUT_SCAN_PROCEED; } - } -#endif /* VM_PRESSURE_EVENTS */ + /* + * just resniff instead of trying + * to compute a new delay time... we're going to be + * awakened immediately upon a laundry completion, + * so we won't wait any longer than necessary + */ + msecs = vm_pageout_state.vm_pageout_idle_wait; + break; - if (available_for_purge || force_purge) { - if (object != NULL) { - vm_object_unlock(object); - object = NULL; + case FCS_DEADLOCK_DETECTED: + if (*vm_pageout_deadlock_target) { + return VM_PAGEOUT_SCAN_PROCEED; } - memoryshot(VM_PAGEOUT_PURGEONE, DBG_FUNC_START); + vps_flow_control_reset_deadlock_timer(flow_control); + msecs = vm_pageout_state.vm_pageout_deadlock_wait; - VM_DEBUG_EVENT(vm_pageout_purgeone, VM_PAGEOUT_PURGEONE, DBG_FUNC_START, vm_page_free_count, 0, 0, 0); - if (vm_purgeable_object_purge_one(force_purge, C_DONT_BLOCK)) { - VM_PAGEOUT_DEBUG(vm_pageout_purged_objects, 1); - VM_DEBUG_EVENT(vm_pageout_purgeone, VM_PAGEOUT_PURGEONE, DBG_FUNC_END, vm_page_free_count, 0, 0, 0); - memoryshot(VM_PAGEOUT_PURGEONE, DBG_FUNC_END); - continue; - } - VM_DEBUG_EVENT(vm_pageout_purgeone, VM_PAGEOUT_PURGEONE, DBG_FUNC_END, 0, 0, 0, -1); - memoryshot(VM_PAGEOUT_PURGEONE, DBG_FUNC_END); + break; } + } else { + /* + * No need to pause... + */ + return VM_PAGEOUT_SCAN_PROCEED; + } - if (vm_page_queue_empty(&sq->age_q) && vm_page_speculative_count) { - /* - * try to pull pages from the aging bins... - * see vm_page.h for an explanation of how - * this mechanism works - */ - struct vm_speculative_age_q *aq; - boolean_t can_steal = FALSE; - int num_scanned_queues; + vm_pageout_scan_wants_object = VM_OBJECT_NULL; - aq = &vm_page_queue_speculative[speculative_steal_index]; + vm_pageout_prepare_to_block(object, delayed_unlock, local_freeq, local_freed, + VM_PAGEOUT_PB_CONSIDER_WAKING_COMPACTOR_SWAPPER); - num_scanned_queues = 0; - while (vm_page_queue_empty(&aq->age_q) && - num_scanned_queues++ != VM_PAGE_MAX_SPECULATIVE_AGE_Q) { - speculative_steal_index++; + if (vm_page_free_count >= vm_page_free_target) { + /* + * we're here because + * 1) someone else freed up some pages while we had + * the queues unlocked above + * and we've hit one of the 3 conditions that + * cause us to pause the pageout scan thread + * + * since we already have enough free pages, + * let's avoid stalling and return normally + * + * before we return, make sure the pageout I/O threads + * are running throttled in case there are still requests + * in the laundry... since we have enough free pages + * we don't need the laundry to be cleaned in a timely + * fashion... so let's avoid interfering with foreground + * activity + * + * we don't want to hold vm_page_queue_free_lock when + * calling vm_pageout_adjust_eq_iothrottle (since it + * may cause other locks to be taken), we do the intitial + * check outside of the lock. Once we take the lock, + * we recheck the condition since it may have changed. + * if it has, no problem, we will make the threads + * non-throttled before actually blocking + */ + vm_pageout_adjust_eq_iothrottle(eq, TRUE); + } + lck_mtx_lock(&vm_page_queue_free_lock); - if (speculative_steal_index > VM_PAGE_MAX_SPECULATIVE_AGE_Q) { - speculative_steal_index = VM_PAGE_MIN_SPECULATIVE_AGE_Q; - } + if (vm_page_free_count >= vm_page_free_target && + (vm_page_free_wanted == 0) && (vm_page_free_wanted_privileged == 0)) { + return VM_PAGEOUT_SCAN_DONE_RETURN; + } + lck_mtx_unlock(&vm_page_queue_free_lock); - aq = &vm_page_queue_speculative[speculative_steal_index]; - } + if ((vm_page_free_count + vm_page_cleaned_count) < vm_page_free_target) { + /* + * we're most likely about to block due to one of + * the 3 conditions that cause vm_pageout_scan to + * not be able to make forward progress w/r + * to providing new pages to the free queue, + * so unthrottle the I/O threads in case we + * have laundry to be cleaned... it needs + * to be completed ASAP. + * + * even if we don't block, we want the io threads + * running unthrottled since the sum of free + + * clean pages is still under our free target + */ + vm_pageout_adjust_eq_iothrottle(eq, FALSE); + } + if (vm_page_cleaned_count > 0 && exceeded_burst_throttle == FALSE) { + /* + * if we get here we're below our free target and + * we're stalling due to a full laundry queue or + * we don't have any inactive pages other then + * those in the clean queue... + * however, we have pages on the clean queue that + * can be moved to the free queue, so let's not + * stall the pageout scan + */ + flow_control->state = FCS_IDLE; + return VM_PAGEOUT_SCAN_PROCEED; + } + if (flow_control->state == FCS_DELAYED && !VM_PAGE_Q_THROTTLED(iq)) { + flow_control->state = FCS_IDLE; + return VM_PAGEOUT_SCAN_PROCEED; + } - if (num_scanned_queues == VM_PAGE_MAX_SPECULATIVE_AGE_Q + 1) { - /* - * XXX We've scanned all the speculative - * queues but still haven't found one - * that is not empty, even though - * vm_page_speculative_count is not 0. - */ - if (!vm_page_queue_empty(&sq->age_q)) { - continue; - } -#if DEVELOPMENT || DEBUG - panic("vm_pageout_scan: vm_page_speculative_count=%d but queues are empty", vm_page_speculative_count); -#endif - /* readjust... */ - vm_page_speculative_count = 0; - /* ... and continue */ - continue; - } + VM_CHECK_MEMORYSTATUS; - if (vm_page_speculative_count > vm_pageout_state.vm_page_speculative_target || force_speculative_aging == TRUE) { - can_steal = TRUE; - } else { - if (!delay_speculative_age) { - mach_timespec_t ts_fully_aged; + if (flow_control->state != FCS_IDLE) { + VM_PAGEOUT_DEBUG(vm_pageout_scan_throttle, 1); + } - ts_fully_aged.tv_sec = (VM_PAGE_MAX_SPECULATIVE_AGE_Q * vm_pageout_state.vm_page_speculative_q_age_ms) / 1000; - ts_fully_aged.tv_nsec = ((VM_PAGE_MAX_SPECULATIVE_AGE_Q * vm_pageout_state.vm_page_speculative_q_age_ms) % 1000) - * 1000 * NSEC_PER_USEC; + iq->pgo_throttled = TRUE; + assert_wait_timeout((event_t) &iq->pgo_laundry, THREAD_INTERRUPTIBLE, msecs, 1000 * NSEC_PER_USEC); - ADD_MACH_TIMESPEC(&ts_fully_aged, &aq->age_ts); + counter(c_vm_pageout_scan_block++); - clock_sec_t sec; - clock_nsec_t nsec; - clock_get_system_nanotime(&sec, &nsec); - ts.tv_sec = (unsigned int) sec; - ts.tv_nsec = nsec; + vm_page_unlock_queues(); - if (CMP_MACH_TIMESPEC(&ts, &ts_fully_aged) >= 0) { - can_steal = TRUE; - } else { - delay_speculative_age++; - } - } else { - delay_speculative_age++; - if (delay_speculative_age == DELAY_SPECULATIVE_AGE) { - delay_speculative_age = 0; - } - } - } - if (can_steal == TRUE) { - vm_page_speculate_ageit(aq); - } - } - force_speculative_aging = FALSE; + assert(vm_pageout_scan_wants_object == VM_OBJECT_NULL); - if (vm_page_queue_empty(&sq->age_q) && cache_evict_throttle == 0) { - int pages_evicted; + VM_DEBUG_EVENT(vm_pageout_thread_block, VM_PAGEOUT_THREAD_BLOCK, DBG_FUNC_START, + iq->pgo_laundry, iq->pgo_maxlaundry, msecs, 0); + memoryshot(VM_PAGEOUT_THREAD_BLOCK, DBG_FUNC_START); - if (object != NULL) { - vm_object_unlock(object); - object = NULL; - } - KERNEL_DEBUG_CONSTANT(0x13001ec | DBG_FUNC_START, 0, 0, 0, 0, 0); + thread_block(THREAD_CONTINUE_NULL); - pages_evicted = vm_object_cache_evict(100, 10); + VM_DEBUG_EVENT(vm_pageout_thread_block, VM_PAGEOUT_THREAD_BLOCK, DBG_FUNC_END, + iq->pgo_laundry, iq->pgo_maxlaundry, msecs, 0); + memoryshot(VM_PAGEOUT_THREAD_BLOCK, DBG_FUNC_END); - KERNEL_DEBUG_CONSTANT(0x13001ec | DBG_FUNC_END, pages_evicted, 0, 0, 0, 0); + vm_page_lock_queues(); - if (pages_evicted) { - vm_pageout_vminfo.vm_pageout_pages_evicted += pages_evicted; + iq->pgo_throttled = FALSE; - VM_DEBUG_EVENT(vm_pageout_cache_evict, VM_PAGEOUT_CACHE_EVICT, DBG_FUNC_NONE, - vm_page_free_count, pages_evicted, vm_pageout_vminfo.vm_pageout_pages_evicted, 0); - memoryshot(VM_PAGEOUT_CACHE_EVICT, DBG_FUNC_NONE); + vps_init_page_targets(); - /* - * we just freed up to 100 pages, - * so go back to the top of the main loop - * and re-evaulate the memory situation - */ - continue; - } else { - cache_evict_throttle = 1000; - } - } - if (cache_evict_throttle) { - cache_evict_throttle--; - } + return VM_PAGEOUT_SCAN_NEXT_ITERATION; +} - divisor = vm_pageout_state.vm_page_filecache_min_divisor; +/* + * This function is called only from vm_pageout_scan and + * it will find and return the most appropriate page to be + * reclaimed. + */ +static int +vps_choose_victim_page(vm_page_t *victim_page, int *anons_grabbed, boolean_t *grab_anonymous, boolean_t force_anonymous, + boolean_t *is_page_from_bg_q, unsigned int reactivated_this_call) +{ + vm_page_t m = NULL; + vm_object_t m_object = VM_OBJECT_NULL; + uint32_t inactive_external_count; + struct vm_speculative_age_q *sq; + struct vm_pageout_queue *iq; + int retval = VM_PAGEOUT_SCAN_PROCEED; + + sq = &vm_page_queue_speculative[VM_PAGE_SPECULATIVE_AGED_Q]; + iq = &vm_pageout_queue_internal; + + while (1) { + *is_page_from_bg_q = FALSE; + + m = NULL; + m_object = VM_OBJECT_NULL; + + if (VM_DYNAMIC_PAGING_ENABLED()) { + assert(vm_page_throttled_count == 0); + assert(vm_page_queue_empty(&vm_page_queue_throttled)); + } -#if CONFIG_JETSAM /* - * don't let the filecache_min fall below 15% of available memory - * on systems with an active compressor that isn't nearing its - * limits w/r to accepting new data - * - * on systems w/o the compressor/swapper, the filecache is always - * a very large percentage of the AVAILABLE_NON_COMPRESSED_MEMORY - * since most (if not all) of the anonymous pages are in the - * throttled queue (which isn't counted as available) which - * effectively disables this filter + * Try for a clean-queue inactive page. + * These are pages that vm_pageout_scan tried to steal earlier, but + * were dirty and had to be cleaned. Pick them up now that they are clean. */ - if (vm_compressor_low_on_space() || divisor == 0) { - vm_pageout_state.vm_page_filecache_min = 0; - } else { - vm_pageout_state.vm_page_filecache_min = - ((AVAILABLE_NON_COMPRESSED_MEMORY) * 10) / divisor; - } -#else - if (vm_compressor_out_of_space() || divisor == 0) { - vm_pageout_state.vm_page_filecache_min = 0; - } else { - /* - * don't let the filecache_min fall below the specified critical level - */ - vm_pageout_state.vm_page_filecache_min = - ((AVAILABLE_NON_COMPRESSED_MEMORY) * 10) / divisor; - } -#endif - if (vm_page_free_count < (vm_page_free_reserved / 4)) { - vm_pageout_state.vm_page_filecache_min = 0; + if (!vm_page_queue_empty(&vm_page_queue_cleaned)) { + m = (vm_page_t) vm_page_queue_first(&vm_page_queue_cleaned); + + assert(m->vmp_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q); + + break; } - exceeded_burst_throttle = FALSE; /* - * Sometimes we have to pause: - * 1) No inactive pages - nothing to do. - * 2) Loop control - no acceptable pages found on the inactive queue - * within the last vm_pageout_burst_inactive_throttle iterations - * 3) Flow control - default pageout queue is full + * The next most eligible pages are ones we paged in speculatively, + * but which have not yet been touched and have been aged out. */ - if (vm_page_queue_empty(&vm_page_queue_inactive) && - vm_page_queue_empty(&vm_page_queue_anonymous) && - vm_page_queue_empty(&vm_page_queue_cleaned) && - vm_page_queue_empty(&sq->age_q)) { - VM_PAGEOUT_DEBUG(vm_pageout_scan_empty_throttle, 1); - msecs = vm_pageout_state.vm_pageout_empty_wait; - goto vm_pageout_scan_delay; - } else if (inactive_burst_count >= - MIN(vm_pageout_state.vm_pageout_burst_inactive_throttle, - (vm_page_inactive_count + - vm_page_speculative_count))) { - VM_PAGEOUT_DEBUG(vm_pageout_scan_burst_throttle, 1); - msecs = vm_pageout_state.vm_pageout_burst_wait; - - exceeded_burst_throttle = TRUE; - goto vm_pageout_scan_delay; - } else if (VM_PAGE_Q_THROTTLED(iq) && - VM_DYNAMIC_PAGING_ENABLED()) { - clock_sec_t sec; - clock_nsec_t nsec; + if (!vm_page_queue_empty(&sq->age_q)) { + m = (vm_page_t) vm_page_queue_first(&sq->age_q); - switch (flow_control.state) { - case FCS_IDLE: - if ((vm_page_free_count + local_freed) < vm_page_free_target && - vm_pageout_state.vm_restricted_to_single_processor == FALSE) { - /* - * since the compressor is running independently of vm_pageout_scan - * let's not wait for it just yet... as long as we have a healthy supply - * of filecache pages to work with, let's keep stealing those. - */ - inactive_external_count = vm_page_inactive_count - vm_page_anonymous_count; + assert(m->vmp_q_state == VM_PAGE_ON_SPECULATIVE_Q); - if (vm_page_pageable_external_count > vm_pageout_state.vm_page_filecache_min && - (inactive_external_count >= VM_PAGE_INACTIVE_TARGET(vm_page_pageable_external_count))) { - anons_grabbed = ANONS_GRABBED_LIMIT; - VM_PAGEOUT_DEBUG(vm_pageout_scan_throttle_deferred, 1); - goto consider_inactive; - } - } -reset_deadlock_timer: - ts.tv_sec = vm_pageout_state.vm_pageout_deadlock_wait / 1000; - ts.tv_nsec = (vm_pageout_state.vm_pageout_deadlock_wait % 1000) * 1000 * NSEC_PER_USEC; - clock_get_system_nanotime(&sec, &nsec); - flow_control.ts.tv_sec = (unsigned int) sec; - flow_control.ts.tv_nsec = nsec; - ADD_MACH_TIMESPEC(&flow_control.ts, &ts); - - flow_control.state = FCS_DELAYED; - msecs = vm_pageout_state.vm_pageout_deadlock_wait; - - vm_pageout_vminfo.vm_pageout_scan_inactive_throttled_internal++; + if (!m->vmp_dirty || force_anonymous == FALSE) { break; + } else { + m = NULL; + } + } - case FCS_DELAYED: - clock_get_system_nanotime(&sec, &nsec); - ts.tv_sec = (unsigned int) sec; - ts.tv_nsec = nsec; +#if CONFIG_BACKGROUND_QUEUE + if (vm_page_background_mode != VM_PAGE_BG_DISABLED && (vm_page_background_count > vm_page_background_target)) { + vm_object_t bg_m_object = NULL; - if (CMP_MACH_TIMESPEC(&ts, &flow_control.ts) >= 0) { - /* - * the pageout thread for the default pager is potentially - * deadlocked since the - * default pager queue has been throttled for more than the - * allowable time... we need to move some clean pages or dirty - * pages belonging to the external pagers if they aren't throttled - * vm_page_free_wanted represents the number of threads currently - * blocked waiting for pages... we'll move one page for each of - * these plus a fixed amount to break the logjam... once we're done - * moving this number of pages, we'll re-enter the FSC_DELAYED state - * with a new timeout target since we have no way of knowing - * whether we've broken the deadlock except through observation - * of the queue associated with the default pager... we need to - * stop moving pages and allow the system to run to see what - * state it settles into. - */ - vm_pageout_deadlock_target = vm_pageout_state.vm_pageout_deadlock_relief + - vm_page_free_wanted + vm_page_free_wanted_privileged; - VM_PAGEOUT_DEBUG(vm_pageout_scan_deadlock_detected, 1); - flow_control.state = FCS_DEADLOCK_DETECTED; - thread_wakeup((event_t) &vm_pageout_garbage_collect); - goto consider_inactive; - } + m = (vm_page_t) vm_page_queue_first(&vm_page_queue_background); + + bg_m_object = VM_PAGE_OBJECT(m); + + if (!VM_PAGE_PAGEABLE(m)) { /* - * just resniff instead of trying - * to compute a new delay time... we're going to be - * awakened immediately upon a laundry completion, - * so we won't wait any longer than necessary + * This page is on the background queue + * but not on a pageable queue. This is + * likely a transient state and whoever + * took it out of its pageable queue + * will likely put it back on a pageable + * queue soon but we can't deal with it + * at this point, so let's ignore this + * page. */ - msecs = vm_pageout_state.vm_pageout_idle_wait; - break; + } else if (force_anonymous == FALSE || bg_m_object->internal) { + if (bg_m_object->internal && + (VM_PAGE_Q_THROTTLED(iq) || + vm_compressor_out_of_space() == TRUE || + vm_page_free_count < (vm_page_free_reserved / 4))) { + vm_pageout_skipped_bq_internal++; + } else { + *is_page_from_bg_q = TRUE; - case FCS_DEADLOCK_DETECTED: - if (vm_pageout_deadlock_target) { - goto consider_inactive; + if (bg_m_object->internal) { + vm_pageout_vminfo.vm_pageout_considered_bq_internal++; + } else { + vm_pageout_vminfo.vm_pageout_considered_bq_external++; + } + break; } - goto reset_deadlock_timer; } -vm_pageout_scan_delay: - vm_pageout_scan_wants_object = VM_OBJECT_NULL; + } +#endif /* CONFIG_BACKGROUND_QUEUE */ - vm_pageout_prepare_to_block(&object, &delayed_unlock, &local_freeq, &local_freed, - VM_PAGEOUT_PB_CONSIDER_WAKING_COMPACTOR_SWAPPER); + inactive_external_count = vm_page_inactive_count - vm_page_anonymous_count; - if (vm_page_free_count >= vm_page_free_target) { - /* - * we're here because - * 1) someone else freed up some pages while we had - * the queues unlocked above - * and we've hit one of the 3 conditions that - * cause us to pause the pageout scan thread - * - * since we already have enough free pages, - * let's avoid stalling and return normally - * - * before we return, make sure the pageout I/O threads - * are running throttled in case there are still requests - * in the laundry... since we have enough free pages - * we don't need the laundry to be cleaned in a timely - * fashion... so let's avoid interfering with foreground - * activity - * - * we don't want to hold vm_page_queue_free_lock when - * calling vm_pageout_adjust_eq_iothrottle (since it - * may cause other locks to be taken), we do the intitial - * check outside of the lock. Once we take the lock, - * we recheck the condition since it may have changed. - * if it has, no problem, we will make the threads - * non-throttled before actually blocking - */ - vm_pageout_adjust_eq_iothrottle(eq, TRUE); - } - lck_mtx_lock(&vm_page_queue_free_lock); + if ((vm_page_pageable_external_count < vm_pageout_state.vm_page_filecache_min || force_anonymous == TRUE) || + (inactive_external_count < VM_PAGE_INACTIVE_TARGET(vm_page_pageable_external_count))) { + *grab_anonymous = TRUE; + *anons_grabbed = 0; - if (vm_page_free_count >= vm_page_free_target && - (vm_page_free_wanted == 0) && (vm_page_free_wanted_privileged == 0)) { - goto return_from_scan; - } - lck_mtx_unlock(&vm_page_queue_free_lock); + vm_pageout_vminfo.vm_pageout_skipped_external++; + goto want_anonymous; + } + *grab_anonymous = (vm_page_anonymous_count > vm_page_anonymous_min); - if ((vm_page_free_count + vm_page_cleaned_count) < vm_page_free_target) { - /* - * we're most likely about to block due to one of - * the 3 conditions that cause vm_pageout_scan to - * not be able to make forward progress w/r - * to providing new pages to the free queue, - * so unthrottle the I/O threads in case we - * have laundry to be cleaned... it needs - * to be completed ASAP. - * - * even if we don't block, we want the io threads - * running unthrottled since the sum of free + - * clean pages is still under our free target - */ - vm_pageout_adjust_eq_iothrottle(eq, FALSE); - } - if (vm_page_cleaned_count > 0 && exceeded_burst_throttle == FALSE) { - /* - * if we get here we're below our free target and - * we're stalling due to a full laundry queue or - * we don't have any inactive pages other then - * those in the clean queue... - * however, we have pages on the clean queue that - * can be moved to the free queue, so let's not - * stall the pageout scan - */ - flow_control.state = FCS_IDLE; - goto consider_inactive; +#if CONFIG_JETSAM + /* If the file-backed pool has accumulated + * significantly more pages than the jetsam + * threshold, prefer to reclaim those + * inline to minimise compute overhead of reclaiming + * anonymous pages. + * This calculation does not account for the CPU local + * external page queues, as those are expected to be + * much smaller relative to the global pools. + */ + + struct vm_pageout_queue *eq = &vm_pageout_queue_external; + + if (*grab_anonymous == TRUE && !VM_PAGE_Q_THROTTLED(eq)) { + if (vm_page_pageable_external_count > + vm_pageout_state.vm_page_filecache_min) { + if ((vm_page_pageable_external_count * + vm_pageout_memorystatus_fb_factor_dr) > + (memorystatus_available_pages_critical * + vm_pageout_memorystatus_fb_factor_nr)) { + *grab_anonymous = FALSE; + + VM_PAGEOUT_DEBUG(vm_grab_anon_overrides, 1); + } } - if (flow_control.state == FCS_DELAYED && !VM_PAGE_Q_THROTTLED(iq)) { - flow_control.state = FCS_IDLE; - goto consider_inactive; + if (*grab_anonymous) { + VM_PAGEOUT_DEBUG(vm_grab_anon_nops, 1); } + } +#endif /* CONFIG_JETSAM */ - VM_CHECK_MEMORYSTATUS; +want_anonymous: + if (*grab_anonymous == FALSE || *anons_grabbed >= ANONS_GRABBED_LIMIT || vm_page_queue_empty(&vm_page_queue_anonymous)) { + if (!vm_page_queue_empty(&vm_page_queue_inactive)) { + m = (vm_page_t) vm_page_queue_first(&vm_page_queue_inactive); + + assert(m->vmp_q_state == VM_PAGE_ON_INACTIVE_EXTERNAL_Q); + *anons_grabbed = 0; + + if (vm_page_pageable_external_count < vm_pageout_state.vm_page_filecache_min) { + if (!vm_page_queue_empty(&vm_page_queue_anonymous)) { + if ((++reactivated_this_call % 100)) { + vm_pageout_vminfo.vm_pageout_filecache_min_reactivated++; + + vm_page_activate(m); + VM_STAT_INCR(reactivations); +#if CONFIG_BACKGROUND_QUEUE +#if DEVELOPMENT || DEBUG + if (*is_page_from_bg_q == TRUE) { + if (m_object->internal) { + vm_pageout_rejected_bq_internal++; + } else { + vm_pageout_rejected_bq_external++; + } + } +#endif /* DEVELOPMENT || DEBUG */ +#endif /* CONFIG_BACKGROUND_QUEUE */ + vm_pageout_state.vm_pageout_inactive_used++; + + m = NULL; + retval = VM_PAGEOUT_SCAN_NEXT_ITERATION; + + break; + } - if (flow_control.state != FCS_IDLE) { - VM_PAGEOUT_DEBUG(vm_pageout_scan_throttle, 1); + /* + * steal 1% of the file backed pages even if + * we are under the limit that has been set + * for a healthy filecache + */ + } + } + break; } + } + if (!vm_page_queue_empty(&vm_page_queue_anonymous)) { + m = (vm_page_t) vm_page_queue_first(&vm_page_queue_anonymous); + + assert(m->vmp_q_state == VM_PAGE_ON_INACTIVE_INTERNAL_Q); + *anons_grabbed += 1; + + break; + } - iq->pgo_throttled = TRUE; - assert_wait_timeout((event_t) &iq->pgo_laundry, THREAD_INTERRUPTIBLE, msecs, 1000 * NSEC_PER_USEC); + m = NULL; + } - counter(c_vm_pageout_scan_block++); + *victim_page = m; - vm_page_unlock_queues(); + return retval; +} - assert(vm_pageout_scan_wants_object == VM_OBJECT_NULL); +/* + * This function is called only from vm_pageout_scan and + * it will put a page back on the active/inactive queue + * if we can't reclaim it for some reason. + */ +static void +vps_requeue_page(vm_page_t m, int page_prev_q_state, __unused boolean_t page_from_bg_q) +{ + if (page_prev_q_state == VM_PAGE_ON_SPECULATIVE_Q) { + vm_page_enqueue_inactive(m, FALSE); + } else { + vm_page_activate(m); + } - VM_DEBUG_EVENT(vm_pageout_thread_block, VM_PAGEOUT_THREAD_BLOCK, DBG_FUNC_START, - iq->pgo_laundry, iq->pgo_maxlaundry, msecs, 0); - memoryshot(VM_PAGEOUT_THREAD_BLOCK, DBG_FUNC_START); +#if CONFIG_BACKGROUND_QUEUE +#if DEVELOPMENT || DEBUG + vm_object_t m_object = VM_PAGE_OBJECT(m); - thread_block(THREAD_CONTINUE_NULL); + if (page_from_bg_q == TRUE) { + if (m_object->internal) { + vm_pageout_rejected_bq_internal++; + } else { + vm_pageout_rejected_bq_external++; + } + } +#endif /* DEVELOPMENT || DEBUG */ +#endif /* CONFIG_BACKGROUND_QUEUE */ +} - VM_DEBUG_EVENT(vm_pageout_thread_block, VM_PAGEOUT_THREAD_BLOCK, DBG_FUNC_END, - iq->pgo_laundry, iq->pgo_maxlaundry, msecs, 0); - memoryshot(VM_PAGEOUT_THREAD_BLOCK, DBG_FUNC_END); +/* + * This function is called only from vm_pageout_scan and + * it will try to grab the victim page's VM object (m_object) + * which differs from the previous victim page's object (object). + */ +static int +vps_switch_object(vm_page_t m, vm_object_t m_object, vm_object_t *object, int page_prev_q_state, boolean_t avoid_anon_pages, boolean_t page_from_bg_q) +{ + struct vm_speculative_age_q *sq; - vm_page_lock_queues(); + sq = &vm_page_queue_speculative[VM_PAGE_SPECULATIVE_AGED_Q]; - iq->pgo_throttled = FALSE; + /* + * the object associated with candidate page is + * different from the one we were just working + * with... dump the lock if we still own it + */ + if (*object != NULL) { + vm_object_unlock(*object); + *object = NULL; + } + /* + * Try to lock object; since we've alread got the + * page queues lock, we can only 'try' for this one. + * if the 'try' fails, we need to do a mutex_pause + * to allow the owner of the object lock a chance to + * run... otherwise, we're likely to trip over this + * object in the same state as we work our way through + * the queue... clumps of pages associated with the same + * object are fairly typical on the inactive and active queues + */ + if (!vm_object_lock_try_scan(m_object)) { + vm_page_t m_want = NULL; + + vm_pageout_vminfo.vm_pageout_inactive_nolock++; + + if (page_prev_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q) { + VM_PAGEOUT_DEBUG(vm_pageout_cleaned_nolock, 1); + } + + pmap_clear_reference(VM_PAGE_GET_PHYS_PAGE(m)); + + m->vmp_reference = FALSE; + + if (!m_object->object_is_shared_cache) { + /* + * don't apply this optimization if this is the shared cache + * object, it's too easy to get rid of very hot and important + * pages... + * m->vmp_object must be stable since we hold the page queues lock... + * we can update the scan_collisions field sans the object lock + * since it is a separate field and this is the only spot that does + * a read-modify-write operation and it is never executed concurrently... + * we can asynchronously set this field to 0 when creating a UPL, so it + * is possible for the value to be a bit non-determistic, but that's ok + * since it's only used as a hint + */ + m_object->scan_collisions = 1; + } + if (!vm_page_queue_empty(&vm_page_queue_cleaned)) { + m_want = (vm_page_t) vm_page_queue_first(&vm_page_queue_cleaned); + } else if (!vm_page_queue_empty(&sq->age_q)) { + m_want = (vm_page_t) vm_page_queue_first(&sq->age_q); + } else if ((avoid_anon_pages || vm_page_queue_empty(&vm_page_queue_anonymous)) && + !vm_page_queue_empty(&vm_page_queue_inactive)) { + m_want = (vm_page_t) vm_page_queue_first(&vm_page_queue_inactive); + } else if (!vm_page_queue_empty(&vm_page_queue_anonymous)) { + m_want = (vm_page_t) vm_page_queue_first(&vm_page_queue_anonymous); + } + + /* + * this is the next object we're going to be interested in + * try to make sure its available after the mutex_pause + * returns control + */ + if (m_want) { + vm_pageout_scan_wants_object = VM_PAGE_OBJECT(m_want); + } + + vps_requeue_page(m, page_prev_q_state, page_from_bg_q); + + return VM_PAGEOUT_SCAN_NEXT_ITERATION; + } else { + *object = m_object; + vm_pageout_scan_wants_object = VM_OBJECT_NULL; + } + + return VM_PAGEOUT_SCAN_PROCEED; +} + +/* + * This function is called only from vm_pageout_scan and + * it notices that pageout scan may be rendered ineffective + * due to a FS deadlock and will jetsam a process if possible. + * If jetsam isn't supported, it'll move the page to the active + * queue to try and get some different pages pushed onwards so + * we can try to get out of this scenario. + */ +static void +vps_deal_with_throttled_queues(vm_page_t m, vm_object_t *object, uint32_t *vm_pageout_inactive_external_forced_reactivate_limit, + int *delayed_unlock, boolean_t *force_anonymous, __unused boolean_t is_page_from_bg_q) +{ + struct vm_pageout_queue *eq; + vm_object_t cur_object = VM_OBJECT_NULL; + + cur_object = *object; + + eq = &vm_pageout_queue_external; + + if (cur_object->internal == FALSE) { + /* + * we need to break up the following potential deadlock case... + * a) The external pageout thread is stuck on the truncate lock for a file that is being extended i.e. written. + * b) The thread doing the writing is waiting for pages while holding the truncate lock + * c) Most of the pages in the inactive queue belong to this file. + * + * we are potentially in this deadlock because... + * a) the external pageout queue is throttled + * b) we're done with the active queue and moved on to the inactive queue + * c) we've got a dirty external page + * + * since we don't know the reason for the external pageout queue being throttled we + * must suspect that we are deadlocked, so move the current page onto the active queue + * in an effort to cause a page from the active queue to 'age' to the inactive queue + * + * if we don't have jetsam configured (i.e. we have a dynamic pager), set + * 'force_anonymous' to TRUE to cause us to grab a page from the cleaned/anonymous + * pool the next time we select a victim page... if we can make enough new free pages, + * the deadlock will break, the external pageout queue will empty and it will no longer + * be throttled + * + * if we have jetsam configured, keep a count of the pages reactivated this way so + * that we can try to find clean pages in the active/inactive queues before + * deciding to jetsam a process + */ + vm_pageout_vminfo.vm_pageout_scan_inactive_throttled_external++; + + vm_page_check_pageable_safe(m); + assert(m->vmp_q_state == VM_PAGE_NOT_ON_Q); + vm_page_queue_enter(&vm_page_queue_active, m, vmp_pageq); + m->vmp_q_state = VM_PAGE_ON_ACTIVE_Q; + vm_page_active_count++; + vm_page_pageable_external_count++; + + vm_pageout_adjust_eq_iothrottle(eq, FALSE); + +#if CONFIG_MEMORYSTATUS && CONFIG_JETSAM + +#pragma unused(force_anonymous) + + *vm_pageout_inactive_external_forced_reactivate_limit -= 1; + + if (*vm_pageout_inactive_external_forced_reactivate_limit <= 0) { + *vm_pageout_inactive_external_forced_reactivate_limit = vm_page_active_count + vm_page_inactive_count; + /* + * Possible deadlock scenario so request jetsam action + */ + + assert(cur_object); + vm_object_unlock(cur_object); + + cur_object = VM_OBJECT_NULL; + + /* + * VM pageout scan needs to know we have dropped this lock and so set the + * object variable we got passed in to NULL. + */ + *object = VM_OBJECT_NULL; + + vm_page_unlock_queues(); + + VM_DEBUG_CONSTANT_EVENT(vm_pageout_jetsam, VM_PAGEOUT_JETSAM, DBG_FUNC_START, + vm_page_active_count, vm_page_inactive_count, vm_page_free_count, vm_page_free_count); + + /* Kill first suitable process. If this call returned FALSE, we might have simply purged a process instead. */ + if (memorystatus_kill_on_VM_page_shortage(FALSE) == TRUE) { + VM_PAGEOUT_DEBUG(vm_pageout_inactive_external_forced_jetsam_count, 1); + } + + VM_DEBUG_CONSTANT_EVENT(vm_pageout_jetsam, VM_PAGEOUT_JETSAM, DBG_FUNC_END, + vm_page_active_count, vm_page_inactive_count, vm_page_free_count, vm_page_free_count); + + vm_page_lock_queues(); + *delayed_unlock = 1; + } +#else /* CONFIG_MEMORYSTATUS && CONFIG_JETSAM */ + +#pragma unused(vm_pageout_inactive_external_forced_reactivate_limit) +#pragma unused(delayed_unlock) + + *force_anonymous = TRUE; +#endif /* CONFIG_MEMORYSTATUS && CONFIG_JETSAM */ + } else { + vm_page_activate(m); + VM_STAT_INCR(reactivations); + +#if CONFIG_BACKGROUND_QUEUE +#if DEVELOPMENT || DEBUG + if (is_page_from_bg_q == TRUE) { + if (cur_object->internal) { + vm_pageout_rejected_bq_internal++; + } else { + vm_pageout_rejected_bq_external++; + } + } +#endif /* DEVELOPMENT || DEBUG */ +#endif /* CONFIG_BACKGROUND_QUEUE */ + + vm_pageout_state.vm_pageout_inactive_used++; + } +} + + +void +vm_page_balance_inactive(int max_to_move) +{ + vm_page_t m; + + LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED); + + if (hibernation_vmqueues_inspection == TRUE) { + /* + * It is likely that the hibernation code path is + * dealing with these very queues as we are about + * to move pages around in/from them and completely + * change the linkage of the pages. + * + * And so we skip the rebalancing of these queues. + */ + return; + } + vm_page_inactive_target = VM_PAGE_INACTIVE_TARGET(vm_page_active_count + + vm_page_inactive_count + + vm_page_speculative_count); + + while (max_to_move-- && (vm_page_inactive_count + vm_page_speculative_count) < vm_page_inactive_target) { + VM_PAGEOUT_DEBUG(vm_pageout_balanced, 1); + + m = (vm_page_t) vm_page_queue_first(&vm_page_queue_active); + + assert(m->vmp_q_state == VM_PAGE_ON_ACTIVE_Q); + assert(!m->vmp_laundry); + assert(VM_PAGE_OBJECT(m) != kernel_object); + assert(VM_PAGE_GET_PHYS_PAGE(m) != vm_page_guard_addr); + + DTRACE_VM2(scan, int, 1, (uint64_t *), NULL); + + /* + * by not passing in a pmap_flush_context we will forgo any TLB flushing, local or otherwise... + * + * a TLB flush isn't really needed here since at worst we'll miss the reference bit being + * updated in the PTE if a remote processor still has this mapping cached in its TLB when the + * new reference happens. If no futher references happen on the page after that remote TLB flushes + * we'll see a clean, non-referenced page when it eventually gets pulled out of the inactive queue + * by pageout_scan, which is just fine since the last reference would have happened quite far + * in the past (TLB caches don't hang around for very long), and of course could just as easily + * have happened before we moved the page + */ + if (m->vmp_pmapped == TRUE) { + pmap_clear_refmod_options(VM_PAGE_GET_PHYS_PAGE(m), VM_MEM_REFERENCED, PMAP_OPTIONS_NOFLUSH, (void *)NULL); + } + + /* + * The page might be absent or busy, + * but vm_page_deactivate can handle that. + * FALSE indicates that we don't want a H/W clear reference + */ + vm_page_deactivate_internal(m, FALSE); + } +} + + +/* + * vm_pageout_scan does the dirty work for the pageout daemon. + * It returns with both vm_page_queue_free_lock and vm_page_queue_lock + * held and vm_page_free_wanted == 0. + */ +void +vm_pageout_scan(void) +{ + unsigned int loop_count = 0; + unsigned int inactive_burst_count = 0; + unsigned int reactivated_this_call; + unsigned int reactivate_limit; + vm_page_t local_freeq = NULL; + int local_freed = 0; + int delayed_unlock; + int delayed_unlock_limit = 0; + int refmod_state = 0; + int vm_pageout_deadlock_target = 0; + struct vm_pageout_queue *iq; + struct vm_pageout_queue *eq; + struct vm_speculative_age_q *sq; + struct flow_control flow_control = { .state = 0, .ts = { .tv_sec = 0, .tv_nsec = 0 } }; + boolean_t inactive_throttled = FALSE; + vm_object_t object = NULL; + uint32_t inactive_reclaim_run; + boolean_t grab_anonymous = FALSE; + boolean_t force_anonymous = FALSE; + boolean_t force_speculative_aging = FALSE; + int anons_grabbed = 0; + int page_prev_q_state = 0; + boolean_t page_from_bg_q = FALSE; + uint32_t vm_pageout_inactive_external_forced_reactivate_limit = 0; + vm_object_t m_object = VM_OBJECT_NULL; + int retval = 0; + boolean_t lock_yield_check = FALSE; + + + VM_DEBUG_CONSTANT_EVENT(vm_pageout_scan, VM_PAGEOUT_SCAN, DBG_FUNC_START, + vm_pageout_vminfo.vm_pageout_freed_speculative, + vm_pageout_state.vm_pageout_inactive_clean, + vm_pageout_vminfo.vm_pageout_inactive_dirty_internal, + vm_pageout_vminfo.vm_pageout_inactive_dirty_external); + + flow_control.state = FCS_IDLE; + iq = &vm_pageout_queue_internal; + eq = &vm_pageout_queue_external; + sq = &vm_page_queue_speculative[VM_PAGE_SPECULATIVE_AGED_Q]; + + /* Ask the pmap layer to return any pages it no longer needs. */ + uint64_t pmap_wired_pages_freed = pmap_release_pages_fast(); + + vm_page_lock_queues(); + + vm_page_wire_count -= pmap_wired_pages_freed; + + delayed_unlock = 1; + + /* + * Calculate the max number of referenced pages on the inactive + * queue that we will reactivate. + */ + reactivated_this_call = 0; + reactivate_limit = VM_PAGE_REACTIVATE_LIMIT(vm_page_active_count + + vm_page_inactive_count); + inactive_reclaim_run = 0; + + vm_pageout_inactive_external_forced_reactivate_limit = vm_page_active_count + vm_page_inactive_count; + + /* + * We must limit the rate at which we send pages to the pagers + * so that we don't tie up too many pages in the I/O queues. + * We implement a throttling mechanism using the laundry count + * to limit the number of pages outstanding to the default + * and external pagers. We can bypass the throttles and look + * for clean pages if the pageout queues don't drain in a timely + * fashion since this may indicate that the pageout paths are + * stalled waiting for memory, which only we can provide. + */ + + vps_init_page_targets(); + assert(object == NULL); + assert(delayed_unlock != 0); + + for (;;) { + vm_page_t m; + + DTRACE_VM2(rev, int, 1, (uint64_t *), NULL); + + if (lock_yield_check) { + lock_yield_check = FALSE; + + if (delayed_unlock++ > delayed_unlock_limit) { + int freed = local_freed; + + vm_pageout_prepare_to_block(&object, &delayed_unlock, &local_freeq, &local_freed, + VM_PAGEOUT_PB_CONSIDER_WAKING_COMPACTOR_SWAPPER); + if (freed == 0) { + lck_mtx_yield(&vm_page_queue_lock); + } + } else if (vm_pageout_scan_wants_object) { + vm_page_unlock_queues(); + mutex_pause(0); + vm_page_lock_queues(); + } + } + + if (vm_upl_wait_for_pages < 0) { + vm_upl_wait_for_pages = 0; + } + + delayed_unlock_limit = VM_PAGEOUT_DELAYED_UNLOCK_LIMIT + vm_upl_wait_for_pages; + + if (delayed_unlock_limit > VM_PAGEOUT_DELAYED_UNLOCK_LIMIT_MAX) { + delayed_unlock_limit = VM_PAGEOUT_DELAYED_UNLOCK_LIMIT_MAX; + } + + vps_deal_with_secluded_page_overflow(&local_freeq, &local_freed); + + assert(delayed_unlock); + + /* + * maintain our balance + */ + vm_page_balance_inactive(1); + + + /********************************************************************** + * above this point we're playing with the active and secluded queues + * below this point we're playing with the throttling mechanisms + * and the inactive queue + **********************************************************************/ + + if (vm_page_free_count + local_freed >= vm_page_free_target) { + vm_pageout_scan_wants_object = VM_OBJECT_NULL; + + vm_pageout_prepare_to_block(&object, &delayed_unlock, &local_freeq, &local_freed, + VM_PAGEOUT_PB_CONSIDER_WAKING_COMPACTOR_SWAPPER); + /* + * make sure the pageout I/O threads are running + * throttled in case there are still requests + * in the laundry... since we have met our targets + * we don't need the laundry to be cleaned in a timely + * fashion... so let's avoid interfering with foreground + * activity + */ + vm_pageout_adjust_eq_iothrottle(eq, TRUE); + + lck_mtx_lock(&vm_page_queue_free_lock); + + if ((vm_page_free_count >= vm_page_free_target) && + (vm_page_free_wanted == 0) && (vm_page_free_wanted_privileged == 0)) { + /* + * done - we have met our target *and* + * there is no one waiting for a page. + */ +return_from_scan: + assert(vm_pageout_scan_wants_object == VM_OBJECT_NULL); + + VM_DEBUG_CONSTANT_EVENT(vm_pageout_scan, VM_PAGEOUT_SCAN, DBG_FUNC_NONE, + vm_pageout_state.vm_pageout_inactive, + vm_pageout_state.vm_pageout_inactive_used, 0, 0); + VM_DEBUG_CONSTANT_EVENT(vm_pageout_scan, VM_PAGEOUT_SCAN, DBG_FUNC_END, + vm_pageout_vminfo.vm_pageout_freed_speculative, + vm_pageout_state.vm_pageout_inactive_clean, + vm_pageout_vminfo.vm_pageout_inactive_dirty_internal, + vm_pageout_vminfo.vm_pageout_inactive_dirty_external); + + return; + } + lck_mtx_unlock(&vm_page_queue_free_lock); + } + + /* + * Before anything, we check if we have any ripe volatile + * objects around. If so, try to purge the first object. + * If the purge fails, fall through to reclaim a page instead. + * If the purge succeeds, go back to the top and reevalute + * the new memory situation. + */ + retval = vps_purge_object(); + + if (retval == VM_PAGEOUT_SCAN_NEXT_ITERATION) { + /* + * Success + */ + if (object != NULL) { + vm_object_unlock(object); + object = NULL; + } + + lock_yield_check = FALSE; + continue; + } + + /* + * If our 'aged' queue is empty and we have some speculative pages + * in the other queues, let's go through and see if we need to age + * them. + * + * If we succeeded in aging a speculative Q or just that everything + * looks normal w.r.t queue age and queue counts, we keep going onward. + * + * If, for some reason, we seem to have a mismatch between the spec. + * page count and the page queues, we reset those variables and + * restart the loop (LD TODO: Track this better?). + */ + if (vm_page_queue_empty(&sq->age_q) && vm_page_speculative_count) { + retval = vps_age_speculative_queue(force_speculative_aging); + + if (retval == VM_PAGEOUT_SCAN_NEXT_ITERATION) { + lock_yield_check = FALSE; + continue; + } + } + force_speculative_aging = FALSE; + + /* + * Check to see if we need to evict objects from the cache. + * + * Note: 'object' here doesn't have anything to do with + * the eviction part. We just need to make sure we have dropped + * any object lock we might be holding if we need to go down + * into the eviction logic. + */ + retval = vps_object_cache_evict(&object); + + if (retval == VM_PAGEOUT_SCAN_NEXT_ITERATION) { + lock_yield_check = FALSE; + continue; + } + + + /* + * Calculate our filecache_min that will affect the loop + * going forward. + */ + vps_calculate_filecache_min(); + + /* + * LD TODO: Use a structure to hold all state variables for a single + * vm_pageout_scan iteration and pass that structure to this function instead. + */ + retval = vps_flow_control(&flow_control, &anons_grabbed, &object, + &delayed_unlock, &local_freeq, &local_freed, + &vm_pageout_deadlock_target, inactive_burst_count); + if (retval == VM_PAGEOUT_SCAN_NEXT_ITERATION) { if (loop_count >= vm_page_inactive_count) { loop_count = 0; } + inactive_burst_count = 0; - goto Restart; - /*NOTREACHED*/ - } + assert(object == NULL); + assert(delayed_unlock != 0); + lock_yield_check = FALSE; + continue; + } else if (retval == VM_PAGEOUT_SCAN_DONE_RETURN) { + goto return_from_scan; + } flow_control.state = FCS_IDLE; -consider_inactive: + vm_pageout_inactive_external_forced_reactivate_limit = MIN((vm_page_active_count + vm_page_inactive_count), vm_pageout_inactive_external_forced_reactivate_limit); loop_count++; @@ -2438,157 +3094,22 @@ consider_inactive: /* * Choose a victim. */ - while (1) { -#if CONFIG_BACKGROUND_QUEUE - page_from_bg_q = FALSE; -#endif /* CONFIG_BACKGROUND_QUEUE */ - - m = NULL; - m_object = VM_OBJECT_NULL; - - if (VM_DYNAMIC_PAGING_ENABLED()) { - assert(vm_page_throttled_count == 0); - assert(vm_page_queue_empty(&vm_page_queue_throttled)); - } - - /* - * Try for a clean-queue inactive page. - * These are pages that vm_pageout_scan tried to steal earlier, but - * were dirty and had to be cleaned. Pick them up now that they are clean. - */ - if (!vm_page_queue_empty(&vm_page_queue_cleaned)) { - m = (vm_page_t) vm_page_queue_first(&vm_page_queue_cleaned); - - assert(m->vmp_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q); - - break; - } - - /* - * The next most eligible pages are ones we paged in speculatively, - * but which have not yet been touched and have been aged out. - */ - if (!vm_page_queue_empty(&sq->age_q)) { - m = (vm_page_t) vm_page_queue_first(&sq->age_q); - - assert(m->vmp_q_state == VM_PAGE_ON_SPECULATIVE_Q); - - if (!m->vmp_dirty || force_anonymous == FALSE) { - break; - } else { - m = NULL; - } - } - -#if CONFIG_BACKGROUND_QUEUE - if (vm_page_background_mode != VM_PAGE_BG_DISABLED && (vm_page_background_count > vm_page_background_target)) { - vm_object_t bg_m_object = NULL; - - m = (vm_page_t) vm_page_queue_first(&vm_page_queue_background); - - bg_m_object = VM_PAGE_OBJECT(m); - if (!VM_PAGE_PAGEABLE(m)) { - /* - * This page is on the background queue - * but not on a pageable queue. This is - * likely a transient state and whoever - * took it out of its pageable queue - * will likely put it back on a pageable - * queue soon but we can't deal with it - * at this point, so let's ignore this - * page. - */ - } else if (force_anonymous == FALSE || bg_m_object->internal) { - if (bg_m_object->internal && - (VM_PAGE_Q_THROTTLED(iq) || - vm_compressor_out_of_space() == TRUE || - vm_page_free_count < (vm_page_free_reserved / 4))) { - vm_pageout_skipped_bq_internal++; - } else { - page_from_bg_q = TRUE; - - if (bg_m_object->internal) { - vm_pageout_vminfo.vm_pageout_considered_bq_internal++; - } else { - vm_pageout_vminfo.vm_pageout_considered_bq_external++; - } - break; - } - } - } -#endif - inactive_external_count = vm_page_inactive_count - vm_page_anonymous_count; - - if ((vm_page_pageable_external_count < vm_pageout_state.vm_page_filecache_min || force_anonymous == TRUE) || - (inactive_external_count < VM_PAGE_INACTIVE_TARGET(vm_page_pageable_external_count))) { - grab_anonymous = TRUE; - anons_grabbed = 0; + m = NULL; + retval = vps_choose_victim_page(&m, &anons_grabbed, &grab_anonymous, force_anonymous, &page_from_bg_q, reactivated_this_call); - vm_pageout_vminfo.vm_pageout_skipped_external++; - goto want_anonymous; - } - grab_anonymous = (vm_page_anonymous_count > vm_page_anonymous_min); + if (m == NULL) { + if (retval == VM_PAGEOUT_SCAN_NEXT_ITERATION) { + reactivated_this_call++; -#if CONFIG_JETSAM - /* If the file-backed pool has accumulated - * significantly more pages than the jetsam - * threshold, prefer to reclaim those - * inline to minimise compute overhead of reclaiming - * anonymous pages. - * This calculation does not account for the CPU local - * external page queues, as those are expected to be - * much smaller relative to the global pools. - */ - if (grab_anonymous == TRUE && !VM_PAGE_Q_THROTTLED(eq)) { - if (vm_page_pageable_external_count > - vm_pageout_state.vm_page_filecache_min) { - if ((vm_page_pageable_external_count * - vm_pageout_memorystatus_fb_factor_dr) > - (memorystatus_available_pages_critical * - vm_pageout_memorystatus_fb_factor_nr)) { - grab_anonymous = FALSE; - - VM_PAGEOUT_DEBUG(vm_grab_anon_overrides, 1); - } - } - if (grab_anonymous) { - VM_PAGEOUT_DEBUG(vm_grab_anon_nops, 1); - } - } -#endif /* CONFIG_JETSAM */ + inactive_burst_count = 0; -want_anonymous: - if (grab_anonymous == FALSE || anons_grabbed >= ANONS_GRABBED_LIMIT || vm_page_queue_empty(&vm_page_queue_anonymous)) { - if (!vm_page_queue_empty(&vm_page_queue_inactive)) { - m = (vm_page_t) vm_page_queue_first(&vm_page_queue_inactive); - - assert(m->vmp_q_state == VM_PAGE_ON_INACTIVE_EXTERNAL_Q); - anons_grabbed = 0; - - if (vm_page_pageable_external_count < vm_pageout_state.vm_page_filecache_min) { - if (!vm_page_queue_empty(&vm_page_queue_anonymous)) { - if ((++reactivated_this_call % 100)) { - vm_pageout_vminfo.vm_pageout_filecache_min_reactivated++; - goto must_activate_page; - } - /* - * steal 1% of the file backed pages even if - * we are under the limit that has been set - * for a healthy filecache - */ - } - } - break; + if (page_prev_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q) { + VM_PAGEOUT_DEBUG(vm_pageout_cleaned_reactivated, 1); } - } - if (!vm_page_queue_empty(&vm_page_queue_anonymous)) { - m = (vm_page_t) vm_page_queue_first(&vm_page_queue_anonymous); - assert(m->vmp_q_state == VM_PAGE_ON_INACTIVE_INTERNAL_Q); - anons_grabbed++; - - break; + lock_yield_check = TRUE; + continue; } /* @@ -2603,17 +3124,20 @@ want_anonymous: VM_PAGEOUT_DEBUG(vm_pageout_no_victim, 1); if (!vm_page_queue_empty(&sq->age_q)) { - goto done_with_inactivepage; + lock_yield_check = TRUE; + continue; } if (vm_page_speculative_count) { force_speculative_aging = TRUE; - goto done_with_inactivepage; + lock_yield_check = TRUE; + continue; } panic("vm_pageout: no victim"); /* NOTREACHED */ } + assert(VM_PAGE_PAGEABLE(m)); m_object = VM_PAGE_OBJECT(m); force_anonymous = FALSE; @@ -2642,78 +3166,19 @@ want_anonymous: * already got the lock */ if (m_object != object) { + boolean_t avoid_anon_pages = (grab_anonymous == FALSE || anons_grabbed >= ANONS_GRABBED_LIMIT); + /* - * the object associated with candidate page is - * different from the one we were just working - * with... dump the lock if we still own it - */ - if (object != NULL) { - vm_object_unlock(object); - object = NULL; - } - /* - * Try to lock object; since we've alread got the - * page queues lock, we can only 'try' for this one. - * if the 'try' fails, we need to do a mutex_pause - * to allow the owner of the object lock a chance to - * run... otherwise, we're likely to trip over this - * object in the same state as we work our way through - * the queue... clumps of pages associated with the same - * object are fairly typical on the inactive and active queues + * vps_switch_object() will always drop the 'object' lock first + * and then try to acquire the 'm_object' lock. So 'object' has to point to + * either 'm_object' or NULL. */ - if (!vm_object_lock_try_scan(m_object)) { - vm_page_t m_want = NULL; - - vm_pageout_vminfo.vm_pageout_inactive_nolock++; - - if (page_prev_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q) { - VM_PAGEOUT_DEBUG(vm_pageout_cleaned_nolock, 1); - } - - pmap_clear_reference(VM_PAGE_GET_PHYS_PAGE(m)); - - m->vmp_reference = FALSE; - - if (!m_object->object_is_shared_cache) { - /* - * don't apply this optimization if this is the shared cache - * object, it's too easy to get rid of very hot and important - * pages... - * m->vmp_object must be stable since we hold the page queues lock... - * we can update the scan_collisions field sans the object lock - * since it is a separate field and this is the only spot that does - * a read-modify-write operation and it is never executed concurrently... - * we can asynchronously set this field to 0 when creating a UPL, so it - * is possible for the value to be a bit non-determistic, but that's ok - * since it's only used as a hint - */ - m_object->scan_collisions = 1; - } - if (!vm_page_queue_empty(&vm_page_queue_cleaned)) { - m_want = (vm_page_t) vm_page_queue_first(&vm_page_queue_cleaned); - } else if (!vm_page_queue_empty(&sq->age_q)) { - m_want = (vm_page_t) vm_page_queue_first(&sq->age_q); - } else if ((grab_anonymous == FALSE || anons_grabbed >= ANONS_GRABBED_LIMIT || - vm_page_queue_empty(&vm_page_queue_anonymous)) && - !vm_page_queue_empty(&vm_page_queue_inactive)) { - m_want = (vm_page_t) vm_page_queue_first(&vm_page_queue_inactive); - } else if (!vm_page_queue_empty(&vm_page_queue_anonymous)) { - m_want = (vm_page_t) vm_page_queue_first(&vm_page_queue_anonymous); - } + retval = vps_switch_object(m, m_object, &object, page_prev_q_state, avoid_anon_pages, page_from_bg_q); - /* - * this is the next object we're going to be interested in - * try to make sure its available after the mutex_pause - * returns control - */ - if (m_want) { - vm_pageout_scan_wants_object = VM_PAGE_OBJECT(m_want); - } - - goto requeue_page; + if (retval == VM_PAGEOUT_SCAN_NEXT_ITERATION) { + lock_yield_check = TRUE; + continue; } - object = m_object; - vm_pageout_scan_wants_object = VM_OBJECT_NULL; } assert(m_object == object); assert(VM_PAGE_OBJECT(m) == m_object); @@ -2729,24 +3194,11 @@ want_anonymous: if (page_prev_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q) { VM_PAGEOUT_DEBUG(vm_pageout_cleaned_busy, 1); } -requeue_page: - if (page_prev_q_state == VM_PAGE_ON_SPECULATIVE_Q) { - vm_page_enqueue_inactive(m, FALSE); - } else { - vm_page_activate(m); - } -#if CONFIG_BACKGROUND_QUEUE -#if DEVELOPMENT || DEBUG - if (page_from_bg_q == TRUE) { - if (m_object->internal) { - vm_pageout_rejected_bq_internal++; - } else { - vm_pageout_rejected_bq_external++; - } - } -#endif -#endif - goto done_with_inactivepage; + + vps_requeue_page(m, page_prev_q_state, page_from_bg_q); + + lock_yield_check = TRUE; + continue; } /* @@ -2770,7 +3222,8 @@ requeue_page: * just leave it off the paging queues */ if (m->vmp_free_when_done || m->vmp_cleaning) { - goto done_with_inactivepage; + lock_yield_check = TRUE; + continue; } @@ -2839,7 +3292,9 @@ reclaim_page: } inactive_burst_count = 0; - goto done_with_inactivepage; + + lock_yield_check = TRUE; + continue; } if (object->copy == VM_OBJECT_NULL) { /* @@ -2915,18 +3370,15 @@ reclaim_page: /* deal with a rogue "reusable" page */ VM_PAGEOUT_SCAN_HANDLE_REUSABLE_PAGE(m, m_object); } - divisor = vm_pageout_state.vm_page_xpmapped_min_divisor; - if (divisor == 0) { + if (vm_pageout_state.vm_page_xpmapped_min_divisor == 0) { vm_pageout_state.vm_page_xpmapped_min = 0; } else { - vm_pageout_state.vm_page_xpmapped_min = (vm_page_external_count * 10) / divisor; + vm_pageout_state.vm_page_xpmapped_min = (vm_page_external_count * 10) / vm_pageout_state.vm_page_xpmapped_min_divisor; } if (!m->vmp_no_cache && -#if CONFIG_BACKGROUND_QUEUE page_from_bg_q == FALSE && -#endif (m->vmp_reference || (m->vmp_xpmapped && !object->internal && (vm_page_xpmapped_external_count < vm_pageout_state.vm_page_xpmapped_min)))) { /* @@ -2959,7 +3411,6 @@ reactivate_page: vm_page_deactivate(m); VM_PAGEOUT_DEBUG(vm_pageout_inactive_deactivated, 1); } else { -must_activate_page: /* * The page was/is being used, so put back on active list. */ @@ -2976,14 +3427,16 @@ must_activate_page: vm_pageout_rejected_bq_external++; } } -#endif -#endif +#endif /* DEVELOPMENT || DEBUG */ +#endif /* CONFIG_BACKGROUND_QUEUE */ + if (page_prev_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q) { VM_PAGEOUT_DEBUG(vm_pageout_cleaned_reactivated, 1); } vm_pageout_state.vm_pageout_inactive_used++; - goto done_with_inactivepage; + lock_yield_check = TRUE; + continue; } /* * Make sure we call pmap_get_refmod() if it @@ -2998,10 +3451,6 @@ must_activate_page: } } - XPR(XPR_VM_PAGEOUT, - "vm_pageout_scan, replace object 0x%X offset 0x%X page 0x%X\n", - object, m->vmp_offset, m, 0, 0); - /* * we've got a candidate page to steal... * @@ -3045,81 +3494,22 @@ throttle_inactive: VM_PAGEOUT_DEBUG(vm_pageout_scan_reclaimed_throttled, 1); inactive_burst_count = 0; - goto done_with_inactivepage; + + lock_yield_check = TRUE; + continue; } if (inactive_throttled == TRUE) { - if (object->internal == FALSE) { - /* - * we need to break up the following potential deadlock case... - * a) The external pageout thread is stuck on the truncate lock for a file that is being extended i.e. written. - * b) The thread doing the writing is waiting for pages while holding the truncate lock - * c) Most of the pages in the inactive queue belong to this file. - * - * we are potentially in this deadlock because... - * a) the external pageout queue is throttled - * b) we're done with the active queue and moved on to the inactive queue - * c) we've got a dirty external page - * - * since we don't know the reason for the external pageout queue being throttled we - * must suspect that we are deadlocked, so move the current page onto the active queue - * in an effort to cause a page from the active queue to 'age' to the inactive queue - * - * if we don't have jetsam configured (i.e. we have a dynamic pager), set - * 'force_anonymous' to TRUE to cause us to grab a page from the cleaned/anonymous - * pool the next time we select a victim page... if we can make enough new free pages, - * the deadlock will break, the external pageout queue will empty and it will no longer - * be throttled - * - * if we have jetsam configured, keep a count of the pages reactivated this way so - * that we can try to find clean pages in the active/inactive queues before - * deciding to jetsam a process - */ - vm_pageout_vminfo.vm_pageout_scan_inactive_throttled_external++; - - vm_page_check_pageable_safe(m); - assert(m->vmp_q_state == VM_PAGE_NOT_ON_Q); - vm_page_queue_enter(&vm_page_queue_active, m, vmp_pageq); - m->vmp_q_state = VM_PAGE_ON_ACTIVE_Q; - vm_page_active_count++; - vm_page_pageable_external_count++; - - vm_pageout_adjust_eq_iothrottle(eq, FALSE); - -#if CONFIG_MEMORYSTATUS && CONFIG_JETSAM - vm_pageout_inactive_external_forced_reactivate_limit--; - - if (vm_pageout_inactive_external_forced_reactivate_limit <= 0) { - vm_pageout_inactive_external_forced_reactivate_limit = vm_page_active_count + vm_page_inactive_count; - /* - * Possible deadlock scenario so request jetsam action - */ - assert(object); - vm_object_unlock(object); - object = VM_OBJECT_NULL; - vm_page_unlock_queues(); - - VM_DEBUG_CONSTANT_EVENT(vm_pageout_jetsam, VM_PAGEOUT_JETSAM, DBG_FUNC_START, - vm_page_active_count, vm_page_inactive_count, vm_page_free_count, vm_page_free_count); - - /* Kill first suitable process. If this call returned FALSE, we might have simply purged a process instead. */ - if (memorystatus_kill_on_VM_page_shortage(FALSE) == TRUE) { - VM_PAGEOUT_DEBUG(vm_pageout_inactive_external_forced_jetsam_count, 1); - } + vps_deal_with_throttled_queues(m, &object, &vm_pageout_inactive_external_forced_reactivate_limit, + &delayed_unlock, &force_anonymous, page_from_bg_q); - VM_DEBUG_CONSTANT_EVENT(vm_pageout_jetsam, VM_PAGEOUT_JETSAM, DBG_FUNC_END, - vm_page_active_count, vm_page_inactive_count, vm_page_free_count, vm_page_free_count); + inactive_burst_count = 0; - vm_page_lock_queues(); - delayed_unlock = 1; - } -#else /* CONFIG_MEMORYSTATUS && CONFIG_JETSAM */ - force_anonymous = TRUE; -#endif - inactive_burst_count = 0; - goto done_with_inactivepage; - } else { - goto must_activate_page; + if (page_prev_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q) { + VM_PAGEOUT_DEBUG(vm_pageout_cleaned_reactivated, 1); } + + lock_yield_check = TRUE; + continue; } /* @@ -3261,21 +3651,6 @@ throttle_inactive: vm_pageout_cluster(m); inactive_burst_count = 0; -done_with_inactivepage: - - if (delayed_unlock++ > delayed_unlock_limit) { - int freed = local_freed; - - vm_pageout_prepare_to_block(&object, &delayed_unlock, &local_freeq, &local_freed, - VM_PAGEOUT_PB_CONSIDER_WAKING_COMPACTOR_SWAPPER); - if (freed == 0) { - lck_mtx_yield(&vm_page_queue_lock); - } - } else if (vm_pageout_scan_wants_object) { - vm_page_unlock_queues(); - mutex_pause(0); - vm_page_lock_queues(); - } /* * back to top of pageout scan loop */ @@ -3335,11 +3710,9 @@ vm_pageout_continue(void) DTRACE_VM2(pgrrun, int, 1, (uint64_t *), NULL); VM_PAGEOUT_DEBUG(vm_pageout_scan_event_counter, 1); -#if !CONFIG_EMBEDDED lck_mtx_lock(&vm_page_queue_free_lock); vm_pageout_running = TRUE; lck_mtx_unlock(&vm_page_queue_free_lock); -#endif /* CONFIG_EMBEDDED */ vm_pageout_scan(); /* @@ -3350,8 +3723,8 @@ vm_pageout_continue(void) assert(vm_page_free_wanted_privileged == 0); assert_wait((event_t) &vm_page_free_wanted, THREAD_UNINT); -#if !CONFIG_EMBEDDED vm_pageout_running = FALSE; +#if !CONFIG_EMBEDDED if (vm_pageout_waiter) { vm_pageout_waiter = FALSE; thread_wakeup((event_t)&vm_pageout_waiter); @@ -3944,6 +4317,7 @@ vm_pageout_iothread_internal(struct cq *cq) } + thread_set_thread_name(current_thread(), "VM_compressor"); #if DEVELOPMENT || DEBUG vmct_stats.vmct_minpages[cq->id] = INT32_MAX; @@ -4063,53 +4437,67 @@ vm_pressure_response(void) } #endif /* VM_PRESSURE_EVENTS */ +/* + * Function called by a kernel thread to either get the current pressure level or + * wait until memory pressure changes from a given level. + */ kern_return_t mach_vm_pressure_level_monitor(__unused boolean_t wait_for_pressure, __unused unsigned int *pressure_level) { -#if CONFIG_EMBEDDED - - return KERN_FAILURE; - -#elif !VM_PRESSURE_EVENTS +#if !VM_PRESSURE_EVENTS return KERN_FAILURE; #else /* VM_PRESSURE_EVENTS */ - kern_return_t kr = KERN_SUCCESS; + wait_result_t wr = 0; + vm_pressure_level_t old_level = memorystatus_vm_pressure_level; - if (pressure_level != NULL) { - vm_pressure_level_t old_level = memorystatus_vm_pressure_level; + if (pressure_level == NULL) { + return KERN_INVALID_ARGUMENT; + } - if (wait_for_pressure == TRUE) { - wait_result_t wr = 0; + if (*pressure_level == kVMPressureJetsam) { + if (!wait_for_pressure) { + return KERN_INVALID_ARGUMENT; + } - while (old_level == *pressure_level) { - wr = assert_wait((event_t) &vm_pageout_state.vm_pressure_changed, - THREAD_INTERRUPTIBLE); - if (wr == THREAD_WAITING) { - wr = thread_block(THREAD_CONTINUE_NULL); - } - if (wr == THREAD_INTERRUPTED) { - return KERN_ABORTED; - } - if (wr == THREAD_AWAKENED) { - old_level = memorystatus_vm_pressure_level; + lck_mtx_lock(&memorystatus_jetsam_fg_band_lock); + wr = assert_wait((event_t)&memorystatus_jetsam_fg_band_waiters, + THREAD_INTERRUPTIBLE); + if (wr == THREAD_WAITING) { + ++memorystatus_jetsam_fg_band_waiters; + lck_mtx_unlock(&memorystatus_jetsam_fg_band_lock); + wr = thread_block(THREAD_CONTINUE_NULL); + } else { + lck_mtx_unlock(&memorystatus_jetsam_fg_band_lock); + } + if (wr != THREAD_AWAKENED) { + return KERN_ABORTED; + } + *pressure_level = kVMPressureJetsam; + return KERN_SUCCESS; + } - if (old_level != *pressure_level) { - break; - } - } + if (wait_for_pressure == TRUE) { + while (old_level == *pressure_level) { + wr = assert_wait((event_t) &vm_pageout_state.vm_pressure_changed, + THREAD_INTERRUPTIBLE); + if (wr == THREAD_WAITING) { + wr = thread_block(THREAD_CONTINUE_NULL); + } + if (wr == THREAD_INTERRUPTED) { + return KERN_ABORTED; } - } - *pressure_level = old_level; - kr = KERN_SUCCESS; - } else { - kr = KERN_INVALID_ARGUMENT; + if (wr == THREAD_AWAKENED) { + old_level = memorystatus_vm_pressure_level; + } + } } - return kr; + *pressure_level = old_level; + return KERN_SUCCESS; #endif /* VM_PRESSURE_EVENTS */ } @@ -4238,34 +4626,41 @@ extern vm_map_offset_t vm_page_fake_buckets_start, vm_page_fake_buckets_end; void vm_set_restrictions() { - host_basic_info_data_t hinfo; - mach_msg_type_number_t count = HOST_BASIC_INFO_COUNT; + int vm_restricted_to_single_processor = 0; + + if (PE_parse_boot_argn("vm_restricted_to_single_processor", &vm_restricted_to_single_processor, sizeof(vm_restricted_to_single_processor))) { + kprintf("Overriding vm_restricted_to_single_processor to %d\n", vm_restricted_to_single_processor); + vm_pageout_state.vm_restricted_to_single_processor = (vm_restricted_to_single_processor ? TRUE : FALSE); + } else { + host_basic_info_data_t hinfo; + mach_msg_type_number_t count = HOST_BASIC_INFO_COUNT; #define BSD_HOST 1 - host_info((host_t)BSD_HOST, HOST_BASIC_INFO, (host_info_t)&hinfo, &count); + host_info((host_t)BSD_HOST, HOST_BASIC_INFO, (host_info_t)&hinfo, &count); - assert(hinfo.max_cpus > 0); + assert(hinfo.max_cpus > 0); - if (hinfo.max_cpus <= 3) { - /* - * on systems with a limited number of CPUS, bind the - * 4 major threads that can free memory and that tend to use - * a fair bit of CPU under pressured conditions to a single processor. - * This insures that these threads don't hog all of the available CPUs - * (important for camera launch), while allowing them to run independently - * w/r to locks... the 4 threads are - * vm_pageout_scan, vm_pageout_iothread_internal (compressor), - * vm_compressor_swap_trigger_thread (minor and major compactions), - * memorystatus_thread (jetsams). - * - * the first time the thread is run, it is responsible for checking the - * state of vm_restricted_to_single_processor, and if TRUE it calls - * thread_bind_master... someday this should be replaced with a group - * scheduling mechanism and KPI. - */ - vm_pageout_state.vm_restricted_to_single_processor = TRUE; - } else { - vm_pageout_state.vm_restricted_to_single_processor = FALSE; + if (hinfo.max_cpus <= 3) { + /* + * on systems with a limited number of CPUS, bind the + * 4 major threads that can free memory and that tend to use + * a fair bit of CPU under pressured conditions to a single processor. + * This insures that these threads don't hog all of the available CPUs + * (important for camera launch), while allowing them to run independently + * w/r to locks... the 4 threads are + * vm_pageout_scan, vm_pageout_iothread_internal (compressor), + * vm_compressor_swap_trigger_thread (minor and major compactions), + * memorystatus_thread (jetsams). + * + * the first time the thread is run, it is responsible for checking the + * state of vm_restricted_to_single_processor, and if TRUE it calls + * thread_bind_master... someday this should be replaced with a group + * scheduling mechanism and KPI. + */ + vm_pageout_state.vm_restricted_to_single_processor = TRUE; + } else { + vm_pageout_state.vm_restricted_to_single_processor = FALSE; + } } } @@ -4282,19 +4677,53 @@ vm_pageout(void) */ s = splsched(); + vm_pageout_scan_thread = self; + +#if CONFIG_VPS_DYNAMIC_PRIO + + int vps_dynprio_bootarg = 0; + + if (PE_parse_boot_argn("vps_dynamic_priority_enabled", &vps_dynprio_bootarg, sizeof(vps_dynprio_bootarg))) { + vps_dynamic_priority_enabled = (vps_dynprio_bootarg ? TRUE : FALSE); + kprintf("Overriding vps_dynamic_priority_enabled to %d\n", vps_dynamic_priority_enabled); + } else { + if (vm_pageout_state.vm_restricted_to_single_processor == TRUE) { + vps_dynamic_priority_enabled = TRUE; + } else { + vps_dynamic_priority_enabled = FALSE; + } + } + + if (vps_dynamic_priority_enabled) { + sched_set_kernel_thread_priority(self, MAXPRI_THROTTLE); + thread_set_eager_preempt(self); + } else { + sched_set_kernel_thread_priority(self, BASEPRI_VM); + } + +#else /* CONFIG_VPS_DYNAMIC_PRIO */ + + vps_dynamic_priority_enabled = FALSE; + sched_set_kernel_thread_priority(self, BASEPRI_VM); + +#endif /* CONFIG_VPS_DYNAMIC_PRIO */ + thread_lock(self); self->options |= TH_OPT_VMPRIV; - sched_set_thread_base_priority(self, BASEPRI_VM); thread_unlock(self); if (!self->reserved_stack) { self->reserved_stack = self->kernel_stack; } - if (vm_pageout_state.vm_restricted_to_single_processor == TRUE) { + if (vm_pageout_state.vm_restricted_to_single_processor == TRUE && + vps_dynamic_priority_enabled == FALSE) { thread_vm_bind_group_add(); } + + + splx(s); thread_set_thread_name(current_thread(), "VM_pageout_scan"); @@ -4412,7 +4841,7 @@ vm_pageout(void) if (result != KERN_SUCCESS) { panic("vm_pageout_iothread_external: create failed"); } - + thread_set_thread_name(vm_pageout_state.vm_pageout_external_iothread, "VM_pageout_external_iothread"); thread_deallocate(vm_pageout_state.vm_pageout_external_iothread); result = kernel_thread_start_priority((thread_continue_t)vm_pageout_garbage_collect, NULL, @@ -4421,7 +4850,7 @@ vm_pageout(void) if (result != KERN_SUCCESS) { panic("vm_pageout_garbage_collect: create failed"); } - + thread_set_thread_name(thread, "VM_pageout_garbage_collect"); thread_deallocate(thread); #if VM_PRESSURE_EVENTS @@ -5267,7 +5696,7 @@ check_busy: pg_num = (unsigned int) ((dst_offset - offset) / PAGE_SIZE); assert(pg_num == (dst_offset - offset) / PAGE_SIZE); - lite_list[pg_num >> 5] |= 1 << (pg_num & 31); + lite_list[pg_num >> 5] |= 1U << (pg_num & 31); if (hw_dirty) { if (pmap_flushes_delayed == FALSE) { @@ -5512,7 +5941,7 @@ check_busy: pg_num = (unsigned int) ((dst_offset - offset) / PAGE_SIZE); assert(pg_num == (dst_offset - offset) / PAGE_SIZE); - lite_list[pg_num >> 5] |= 1 << (pg_num & 31); + lite_list[pg_num >> 5] |= 1U << (pg_num & 31); if (hw_dirty) { pmap_clear_modify(phys_page); @@ -5542,7 +5971,22 @@ check_busy: upl->flags &= ~UPL_CLEAR_DIRTY; upl->flags |= UPL_SET_DIRTY; dirty = TRUE; - upl->flags |= UPL_SET_DIRTY; + /* + * Page belonging to a code-signed object is about to + * be written. Mark it tainted and disconnect it from + * all pmaps so processes have to fault it back in and + * deal with the tainted bit. + */ + if (object->code_signed && dst_page->vmp_cs_tainted == FALSE) { + dst_page->vmp_cs_tainted = TRUE; + vm_page_upl_tainted++; + if (dst_page->vmp_pmapped) { + refmod_state = pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(dst_page)); + if (refmod_state & VM_MEM_REFERENCED) { + dst_page->vmp_reference = TRUE; + } + } + } } else if (cntrl_flags & UPL_CLEAN_IN_PLACE) { /* * clean in place for read implies @@ -6343,7 +6787,7 @@ process_upl_to_enter: pg_num = (unsigned int) (new_offset / PAGE_SIZE); assert(pg_num == new_offset / PAGE_SIZE); - if (lite_list[pg_num >> 5] & (1 << (pg_num & 31))) { + if (lite_list[pg_num >> 5] & (1U << (pg_num & 31))) { VM_PAGE_GRAB_FICTITIOUS(alias_page); vm_object_lock(object); @@ -6773,8 +7217,8 @@ process_upl_to_commit: pg_num = (unsigned int) (target_offset / PAGE_SIZE); assert(pg_num == target_offset / PAGE_SIZE); - if (lite_list[pg_num >> 5] & (1 << (pg_num & 31))) { - lite_list[pg_num >> 5] &= ~(1 << (pg_num & 31)); + if (lite_list[pg_num >> 5] & (1U << (pg_num & 31))) { + lite_list[pg_num >> 5] &= ~(1U << (pg_num & 31)); if (!(upl->flags & UPL_KERNEL_OBJECT) && m == VM_PAGE_NULL) { m = vm_page_lookup(shadow_object, target_offset + (upl->offset - shadow_object->paging_offset)); @@ -7009,10 +7453,17 @@ process_upl_to_commit: if (m->vmp_free_when_done) { /* * With the clean queue enabled, UPL_PAGEOUT should - * no longer set the pageout bit. It's pages now go + * no longer set the pageout bit. Its pages now go * to the clean queue. + * + * We don't use the cleaned Q anymore and so this + * assert isn't correct. The code for the clean Q + * still exists and might be used in the future. If we + * go back to the cleaned Q, we will re-enable this + * assert. + * + * assert(!(upl->flags & UPL_PAGEOUT)); */ - assert(!(flags & UPL_PAGEOUT)); assert(!m_object->internal); m->vmp_free_when_done = FALSE; @@ -7454,8 +7905,8 @@ process_upl_to_abort: m = VM_PAGE_NULL; if (upl->flags & UPL_LITE) { - if (lite_list[pg_num >> 5] & (1 << (pg_num & 31))) { - lite_list[pg_num >> 5] &= ~(1 << (pg_num & 31)); + if (lite_list[pg_num >> 5] & (1U << (pg_num & 31))) { + lite_list[pg_num >> 5] &= ~(1U << (pg_num & 31)); if (!(upl->flags & UPL_KERNEL_OBJECT)) { m = vm_page_lookup(shadow_object, target_offset + @@ -7914,7 +8365,7 @@ vm_object_iopl_wire_full(vm_object_t object, upl_t upl, upl_page_info_array_t us } entry = (unsigned int)(dst_page->vmp_offset / PAGE_SIZE); assert(entry >= 0 && entry < object->resident_page_count); - lite_list[entry >> 5] |= 1 << (entry & 31); + lite_list[entry >> 5] |= 1U << (entry & 31); phys_page = VM_PAGE_GET_PHYS_PAGE(dst_page); @@ -8039,7 +8490,7 @@ vm_object_iopl_wire_empty(vm_object_t object, upl_t upl, upl_page_info_array_t u vm_page_insert_internal(dst_page, object, *dst_offset, tag, FALSE, TRUE, TRUE, TRUE, &delayed_ledger_update); - lite_list[entry >> 5] |= 1 << (entry & 31); + lite_list[entry >> 5] |= 1U << (entry & 31); phys_page = VM_PAGE_GET_PHYS_PAGE(dst_page); @@ -8719,6 +9170,22 @@ memory_error: if (!(cntrl_flags & UPL_COPYOUT_FROM)) { SET_PAGE_DIRTY(dst_page, TRUE); + /* + * Page belonging to a code-signed object is about to + * be written. Mark it tainted and disconnect it from + * all pmaps so processes have to fault it back in and + * deal with the tainted bit. + */ + if (object->code_signed && dst_page->vmp_cs_tainted == FALSE) { + dst_page->vmp_cs_tainted = TRUE; + vm_page_iopl_tainted++; + if (dst_page->vmp_pmapped) { + int refmod = pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(dst_page)); + if (refmod & VM_MEM_REFERENCED) { + dst_page->vmp_reference = TRUE; + } + } + } } if ((cntrl_flags & UPL_REQUEST_FORCE_COHERENCY) && dst_page->vmp_written_by_kernel == TRUE) { pmap_sync_page_attributes_phys(phys_page); @@ -8730,7 +9197,7 @@ record_phys_addr: upl->flags |= UPL_HAS_BUSY; } - lite_list[entry >> 5] |= 1 << (entry & 31); + lite_list[entry >> 5] |= 1U << (entry & 31); if (phys_page > upl->highest_page) { upl->highest_page = phys_page; @@ -9023,7 +9490,7 @@ upl_range_needed( * virtaul address space each time we need to work with * a physical page. */ -decl_simple_lock_data(, vm_paging_lock) +decl_simple_lock_data(, vm_paging_lock); #define VM_PAGING_NUM_PAGES 64 vm_map_offset_t vm_paging_base_address = 0; boolean_t vm_paging_page_inuse[VM_PAGING_NUM_PAGES] = { FALSE, }; @@ -9107,20 +9574,10 @@ vm_paging_map_object( if (page != VM_PAGE_NULL && *size == PAGE_SIZE) { /* use permanent 1-to-1 kernel mapping of physical memory ? */ -#if __x86_64__ - *address = (vm_map_offset_t) - PHYSMAP_PTOV((pmap_paddr_t)VM_PAGE_GET_PHYS_PAGE(page) << - PAGE_SHIFT); - *need_unmap = FALSE; - return KERN_SUCCESS; -#elif __arm__ || __arm64__ *address = (vm_map_offset_t) phystokv((pmap_paddr_t)VM_PAGE_GET_PHYS_PAGE(page) << PAGE_SHIFT); *need_unmap = FALSE; return KERN_SUCCESS; -#else -#warn "vm_paging_map_object: no 1-to-1 kernel mapping of physical memory..." -#endif assert(page->vmp_busy); /* @@ -9492,7 +9949,8 @@ vector_upl_set_subupl(upl_t upl, upl_t subupl, uint32_t io_size) } vector_upl->upl_elems[i] = NULL; - invalid_upls = hw_atomic_add(&(vector_upl)->invalid_upls, 1); + invalid_upls = os_atomic_inc(&(vector_upl)->invalid_upls, + relaxed); if (invalid_upls == vector_upl->num_upls) { return TRUE; } else { @@ -10339,7 +10797,7 @@ vm_test_wire_and_extract(void) ledger = ledger_instantiate(task_ledger_template, LEDGER_CREATE_ACTIVE_ENTRIES); - user_map = vm_map_create(pmap_create(ledger, 0, PMAP_CREATE_64BIT), + user_map = vm_map_create(pmap_create_options(ledger, 0, PMAP_CREATE_64BIT), 0x100000000ULL, 0x200000000ULL, TRUE); diff --git a/osfmk/vm/vm_pageout.h b/osfmk/vm/vm_pageout.h index b0608aef5..378c4765c 100644 --- a/osfmk/vm/vm_pageout.h +++ b/osfmk/vm/vm_pageout.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2009 Apple Inc. All rights reserved. + * Copyright (c) 2000-2019 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -347,7 +347,7 @@ struct upl_io_completion { struct upl { - decl_lck_mtx_data(, Lock) /* Synchronization */ + decl_lck_mtx_data(, Lock); /* Synchronization */ int ref_count; int ext_ref_count; int flags; @@ -481,7 +481,7 @@ extern void vm_paging_unmap_object( vm_object_t object, vm_map_offset_t start, vm_map_offset_t end); -decl_simple_lock_data(extern, vm_paging_lock) +decl_simple_lock_data(extern, vm_paging_lock); /* * Backing store throttle when BS is exhausted @@ -644,7 +644,7 @@ struct vm_pageout_vminfo { unsigned long vm_pageout_skipped_external; unsigned long vm_pageout_pages_evicted; - unsigned long vm_pageout_pages_purged;; + unsigned long vm_pageout_pages_purged; unsigned long vm_pageout_freed_cleaned; unsigned long vm_pageout_freed_speculative; unsigned long vm_pageout_freed_external; diff --git a/osfmk/vm/vm_protos.h b/osfmk/vm/vm_protos.h index 43d45dbbe..66dbe7ce7 100644 --- a/osfmk/vm/vm_protos.h +++ b/osfmk/vm/vm_protos.h @@ -109,6 +109,14 @@ extern kern_return_t vm_map_purgable_control( vm_purgable_t control, int *state); +#if MACH_ASSERT +extern void vm_map_pmap_check_ledgers( + pmap_t pmap, + ledger_t ledger, + int pid, + char *procname); +#endif /* MACH_ASSERT */ + extern kern_return_t vnode_pager_get_object_vnode( memory_object_t mem_obj, @@ -191,11 +199,11 @@ extern void swapfile_pager_bootstrap(void); extern memory_object_t swapfile_pager_setup(struct vnode *vp); extern memory_object_control_t swapfile_pager_control(memory_object_t mem_obj); -#if __arm64__ || ((__ARM_ARCH_7K__ >= 2) && defined(PLATFORM_WatchOS)) +#if __arm64__ || (__ARM_ARCH_7K__ >= 2) #define SIXTEENK_PAGE_SIZE 0x4000 #define SIXTEENK_PAGE_MASK 0x3FFF #define SIXTEENK_PAGE_SHIFT 14 -#endif /* __arm64__ || ((__ARM_ARCH_7K__ >= 2) && defined(PLATFORM_WatchOS)) */ +#endif /* __arm64__ || (__ARM_ARCH_7K__ >= 2) */ #if __arm64__ #define FOURK_PAGE_SIZE 0x1000 @@ -473,6 +481,7 @@ extern void log_unnest_badness( vm_map_offset_t lowest_unnestable_addr); struct proc; +struct proc *current_proc(void); extern int cs_allow_invalid(struct proc *p); extern int cs_invalid_page(addr64_t vaddr, boolean_t *cs_killed); @@ -566,11 +575,11 @@ extern int proc_get_memstat_priority(struct proc*, boolean_t); /* the object purger. purges the next eligible object from memory. */ /* returns TRUE if an object was purged, otherwise FALSE. */ boolean_t vm_purgeable_object_purge_one_unlocked(int force_purge_below_group); -void vm_purgeable_disown(task_t task); void vm_purgeable_nonvolatile_owner_update(task_t owner, int delta); void vm_purgeable_volatile_owner_update(task_t owner, int delta); +void vm_owned_objects_disown(task_t task); struct trim_list { @@ -622,6 +631,7 @@ extern int secluded_for_filecache; extern int secluded_for_fbdp; #endif +extern uint64_t vm_page_secluded_drain(void); extern void memory_object_mark_eligible_for_secluded( memory_object_control_t control, boolean_t eligible_for_secluded); @@ -635,6 +645,7 @@ extern kern_return_t mach_make_memory_entry_internal( memory_object_size_t *size, memory_object_offset_t offset, vm_prot_t permission, + vm_named_entry_kernel_flags_t vmne_kflags, ipc_port_t *object_handle, ipc_port_t parent_handle); @@ -655,6 +666,17 @@ extern kern_return_t mach_make_memory_entry_internal( #define VM_SWAP_FLAGS_FORCE_DEFRAG 1 #define VM_SWAP_FLAGS_FORCE_RECLAIM 2 +#if __arm64__ +/* + * Flags to control the behavior of + * the legacy footprint entitlement. + */ +#define LEGACY_FOOTPRINT_ENTITLEMENT_IGNORE (1) +#define LEGACY_FOOTPRINT_ENTITLEMENT_IOS11_ACCT (2) +#define LEGACY_FOOTPRINT_ENTITLEMENT_LIMIT_INCREASE (3) + +#endif /* __arm64__ */ + #endif /* _VM_VM_PROTOS_H_ */ #endif /* XNU_KERNEL_PRIVATE */ diff --git a/osfmk/vm/vm_purgeable.c b/osfmk/vm/vm_purgeable.c index 6ebfaf77a..17350f63b 100644 --- a/osfmk/vm/vm_purgeable.c +++ b/osfmk/vm/vm_purgeable.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2007 Apple Inc. All rights reserved. + * Copyright (c) 2019 Apple Inc. All rights reserved. * * @APPLE_LICENSE_HEADER_START@ * @@ -84,7 +84,7 @@ struct purgeable_q purgeable_queues[PURGEABLE_Q_TYPE_MAX]; queue_head_t purgeable_nonvolatile_queue; int purgeable_nonvolatile_count; -decl_lck_mtx_data(, vm_purgeable_queue_lock) +decl_lck_mtx_data(, vm_purgeable_queue_lock); static token_idx_t vm_purgeable_token_remove_first(purgeable_q_t queue); @@ -100,14 +100,16 @@ vm_purgeable_token_check_queue(purgeable_q_t queue) token_idx_t unripe = 0; int our_inactive_count; + #if DEVELOPMENT - static unsigned lightweight_check = 0; + static int lightweight_check = 0; /* - * Due to performance impact, only perform this check - * every 100 times on DEVELOPMENT kernels. + * Due to performance impact, perform this check less frequently on DEVELOPMENT kernels. + * Checking the queue scales linearly with its length, so we compensate by + * by performing this check less frequently as the queue grows. */ - if (lightweight_check++ < 100) { + if (lightweight_check++ < (100 + queue->debug_count_tokens / 512)) { return; } @@ -1287,105 +1289,6 @@ vm_purgeable_account( } #endif /* DEVELOPMENT || DEBUG */ -void -vm_purgeable_disown( - task_t task) -{ - vm_object_t next_object; - vm_object_t object; - int collisions; - - if (task == NULL) { - return; - } - - /* - * Scan the purgeable objects queues for objects owned by "task". - * This has to be done "atomically" under the "vm_purgeable_queue" - * lock, to ensure that no new purgeable object get associated - * with this task or moved between queues while we're scanning. - */ - - /* - * Scan non-volatile queue for objects owned by "task". - */ - - collisions = 0; - -again: - if (task->task_purgeable_disowned) { - /* task has already disowned its purgeable memory */ - assert(task->task_volatile_objects == 0); - assert(task->task_nonvolatile_objects == 0); - return; - } - - lck_mtx_lock(&vm_purgeable_queue_lock); - task_objq_lock(task); - - task->task_purgeable_disowning = TRUE; - - for (object = (vm_object_t) queue_first(&task->task_objq); - !queue_end(&task->task_objq, (queue_entry_t) object); - object = next_object) { - if (task->task_nonvolatile_objects == 0 && - task->task_volatile_objects == 0) { - /* no more purgeable objects owned by "task" */ - break; - } - - next_object = (vm_object_t) queue_next(&object->task_objq); - if (object->purgable == VM_PURGABLE_DENY) { - /* not a purgeable object: skip */ - continue; - } - -#if DEBUG - assert(object->vo_purgeable_volatilizer == NULL); -#endif /* DEBUG */ - assert(object->vo_owner == task); - if (!vm_object_lock_try(object)) { - lck_mtx_unlock(&vm_purgeable_queue_lock); - task_objq_unlock(task); - mutex_pause(collisions++); - goto again; - } - /* transfer ownership to the kernel */ - assert(VM_OBJECT_OWNER(object) != kernel_task); - vm_object_ownership_change( - object, - object->vo_ledger_tag, /* unchanged */ - VM_OBJECT_OWNER_DISOWNED, /* new owner */ - TRUE); /* old_owner->task_objq locked */ - assert(object->vo_owner == VM_OBJECT_OWNER_DISOWNED); - vm_object_unlock(object); - } - - if (__improbable(task->task_volatile_objects != 0 || - task->task_nonvolatile_objects != 0)) { - panic("%s(%p): volatile=%d nonvolatile=%d q=%p q_first=%p q_last=%p", - __FUNCTION__, - task, - task->task_volatile_objects, - task->task_nonvolatile_objects, - &task->task_objq, - queue_first(&task->task_objq), - queue_last(&task->task_objq)); - } - - /* there shouldn't be any purgeable objects owned by task now */ - assert(task->task_volatile_objects == 0); - assert(task->task_nonvolatile_objects == 0); - assert(task->task_purgeable_disowning); - - /* and we don't need to try and disown again */ - task->task_purgeable_disowned = TRUE; - - lck_mtx_unlock(&vm_purgeable_queue_lock); - task_objq_unlock(task); -} - - static uint64_t vm_purgeable_queue_purge_task_owned( purgeable_q_t queue, @@ -1505,6 +1408,9 @@ vm_purgeable_nonvolatile_enqueue( vm_object_t object, task_t owner) { + int ledger_flags; + kern_return_t kr; + vm_object_lock_assert_exclusive(object); assert(object->purgable == VM_PURGABLE_NONVOLATILE); @@ -1513,7 +1419,7 @@ vm_purgeable_nonvolatile_enqueue( lck_mtx_lock(&vm_purgeable_queue_lock); if (owner != NULL && - owner->task_purgeable_disowning) { + owner->task_objects_disowning) { /* task is exiting and no longer tracking purgeable objects */ owner = VM_OBJECT_OWNER_DISOWNED; } @@ -1526,10 +1432,16 @@ vm_purgeable_nonvolatile_enqueue( object->vo_purgeable_volatilizer = NULL; #endif /* DEBUG */ - vm_object_ownership_change(object, - object->vo_ledger_tag, /* tag unchanged */ + ledger_flags = 0; + if (object->vo_no_footprint) { + ledger_flags |= VM_LEDGER_FLAG_NO_FOOTPRINT; + } + kr = vm_object_ownership_change(object, + object->vo_ledger_tag, /* tag unchanged */ owner, + ledger_flags, FALSE); /* task_objq_locked */ + assert(kr == KERN_SUCCESS); assert(object->objq.next == NULL); assert(object->objq.prev == NULL); @@ -1549,6 +1461,7 @@ vm_purgeable_nonvolatile_dequeue( vm_object_t object) { task_t owner; + kern_return_t kr; vm_object_lock_assert_exclusive(object); @@ -1563,11 +1476,13 @@ vm_purgeable_nonvolatile_dequeue( */ /* transfer ownership to the kernel */ assert(VM_OBJECT_OWNER(object) != kernel_task); - vm_object_ownership_change( + kr = vm_object_ownership_change( object, object->vo_ledger_tag, /* unchanged */ VM_OBJECT_OWNER_DISOWNED, /* new owner */ + 0, /* ledger_flags */ FALSE); /* old_owner->task_objq locked */ + assert(kr == KERN_SUCCESS); assert(object->vo_owner == VM_OBJECT_OWNER_DISOWNED); } @@ -1763,7 +1678,7 @@ vm_object_owner_compressed_update( switch (object->purgable) { case VM_PURGABLE_DENY: /* not purgeable: must be ledger-tagged */ - assert(object->vo_ledger_tag != VM_OBJECT_LEDGER_TAG_NONE); + assert(object->vo_ledger_tag != VM_LEDGER_TAG_NONE); /* fallthru */ case VM_PURGABLE_NONVOLATILE: if (delta > 0) { diff --git a/osfmk/vm/vm_purgeable_internal.h b/osfmk/vm/vm_purgeable_internal.h index f2599e771..fb0a7d473 100644 --- a/osfmk/vm/vm_purgeable_internal.h +++ b/osfmk/vm/vm_purgeable_internal.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2007 Apple Inc. All rights reserved. + * Copyright (c) 2019 Apple Inc. All rights reserved. * * @APPLE_LICENSE_HEADER_START@ * @@ -82,7 +82,7 @@ extern int available_for_purge; * mostly used on a user context and we don't want any contention with the * pageout daemon. */ -decl_lck_mtx_data(extern, vm_purgeable_queue_lock) +decl_lck_mtx_data(extern, vm_purgeable_queue_lock); /* add a new token to queue. called by vm_object_purgeable_control */ /* enter with page queue locked */ diff --git a/osfmk/vm/vm_resident.c b/osfmk/vm/vm_resident.c index 4cdb91692..bf3b6d3a8 100644 --- a/osfmk/vm/vm_resident.c +++ b/osfmk/vm/vm_resident.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2009 Apple Inc. All rights reserved. + * Copyright (c) 2000-2019 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -77,7 +77,6 @@ #include #include #include -#include #include #include #include @@ -96,6 +95,9 @@ #include #include #include +#if defined (__x86_64__) +#include +#endif #if CONFIG_PHANTOM_CACHE #include @@ -105,6 +107,9 @@ #include +#if defined(HAS_APPLE_PAC) +#include +#endif #if MACH_ASSERT @@ -116,6 +121,10 @@ #endif /* MACH_ASSERT */ +extern boolean_t vm_pageout_running; +extern thread_t vm_pageout_scan_thread; +extern boolean_t vps_dynamic_priority_enabled; + char vm_page_inactive_states[VM_PAGE_Q_STATE_ARRAY_SIZE]; char vm_page_pageable_states[VM_PAGE_Q_STATE_ARRAY_SIZE]; char vm_page_non_speculative_pageable_states[VM_PAGE_Q_STATE_ARRAY_SIZE]; @@ -328,7 +337,7 @@ unsigned int vm_page_free_count; zone_t vm_page_array_zone; zone_t vm_page_zone; vm_locks_array_t vm_page_locks; -decl_lck_mtx_data(, vm_page_alloc_lock) +decl_lck_mtx_data(, vm_page_alloc_lock); lck_mtx_ext_t vm_page_alloc_lock_ext; unsigned int vm_page_local_q_count = 0; @@ -399,6 +408,7 @@ unsigned int vm_page_inactive_count; unsigned int vm_page_secluded_count; unsigned int vm_page_secluded_count_free; unsigned int vm_page_secluded_count_inuse; +unsigned int vm_page_secluded_count_over_target; #endif /* CONFIG_SECLUDED_MEMORY */ unsigned int vm_page_anonymous_count; unsigned int vm_page_throttled_count; @@ -409,6 +419,9 @@ unsigned int vm_page_wire_count_on_boot = 0; unsigned int vm_page_stolen_count = 0; unsigned int vm_page_wire_count_initial; unsigned int vm_page_gobble_count = 0; +unsigned int vm_page_kern_lpage_count = 0; + +uint64_t booter_size; /* external so it can be found in core dumps */ #define VM_PAGE_WIRE_COUNT_WARNING 0 #define VM_PAGE_GOBBLE_COUNT_WARNING 0 @@ -644,6 +657,12 @@ vm_get_delayed_page(int grab_options) assert(vm_delayed_count > 0); --vm_delayed_count; +#if defined(__x86_64__) + /* x86 cluster code requires increasing phys_page in vm_pages[] */ + if (vm_pages_count > 0) { + assert(pnum > vm_pages[vm_pages_count - 1].vmp_phys_page); + } +#endif p = &vm_pages[vm_pages_count]; assert(p < vm_page_array_ending_addr); vm_page_init(p, pnum, FALSE); @@ -687,8 +706,8 @@ vm_free_delayed_pages(void) vm_page_t p; vm_page_t list = NULL; uint_t cnt = 0; - vm_offset_t start_free_page; - vm_size_t free_size; + vm_offset_t start_free_va; + int64_t free_size; while ((p = vm_get_delayed_page(VM_PAGE_GRAB_OPTIONS_NONE)) != NULL) { if (vm_himemory_mode) { @@ -711,29 +730,39 @@ vm_free_delayed_pages(void) vm_page_release(p, FALSE); } #if DEVELOPMENT || DEBUG - kprintf("vm_free_delayed_pages: freed %d pages\n", cnt); + kprintf("vm_free_delayed_pages: initialized %d free pages\n", cnt); #endif /* * Free up any unused full pages at the end of the vm_pages[] array */ - start_free_page = round_page((vm_offset_t)&vm_pages[vm_pages_count]); - if (start_free_page < (vm_offset_t)vm_page_array_ending_addr) { - free_size = trunc_page((vm_offset_t)vm_page_array_ending_addr - start_free_page); - if (free_size > 0) { -#if DEVELOPMENT || DEBUG - kprintf("Freeing final unused %ld bytes from vm_pages[] at 0x%lx\n", - (long)free_size, (long)start_free_page); + start_free_va = round_page((vm_offset_t)&vm_pages[vm_pages_count]); + +#if defined(__x86_64__) + /* + * Since x86 might have used large pages for vm_pages[], we can't + * free starting in the middle of a partially used large page. + */ + if (pmap_query_pagesize(kernel_pmap, start_free_va) == I386_LPGBYTES) { + start_free_va = ((start_free_va + I386_LPGMASK) & ~I386_LPGMASK); + } #endif - pmap_pv_fixup(start_free_page, free_size); - ml_static_mfree(start_free_page, free_size); - vm_page_array_ending_addr = (void *)start_free_page; + if (start_free_va < (vm_offset_t)vm_page_array_ending_addr) { + free_size = trunc_page((vm_offset_t)vm_page_array_ending_addr - start_free_va); + if (free_size > 0) { + ml_static_mfree(start_free_va, (vm_offset_t)free_size); + vm_page_array_ending_addr = (void *)start_free_va; /* * Note there's no locking here, as only this thread will ever change this value. * The reader, vm_page_diagnose, doesn't grab any locks for the counts it looks at. */ - --vm_page_stolen_count; + vm_page_stolen_count -= (free_size >> PAGE_SHIFT); + +#if DEVELOPMENT || DEBUG + kprintf("Freeing final unused %ld bytes from vm_pages[] at 0x%lx\n", + (long)free_size, (long)start_free_va); +#endif } } @@ -1183,6 +1212,9 @@ vm_page_bootstrap( #endif vm_page_wire_count_initial = vm_page_wire_count; + /* capture this for later use */ + booter_size = ml_get_booter_memory_size(); + printf("vm_page_bootstrap: %d free pages, %d wired pages, (up to %d of which are delayed free)\n", vm_page_free_count, vm_page_wire_count, vm_delayed_count); @@ -1192,81 +1224,103 @@ vm_page_bootstrap( #ifndef MACHINE_PAGES /* - * We implement pmap_steal_memory and pmap_startup with the help - * of two simpler functions, pmap_virtual_space and pmap_next_page. + * This is the early boot time allocator for data structures needed to bootstrap the VM system. + * On x86 it will allocate large pages if size is sufficiently large. We don't need to do this + * on ARM yet, due to the combination of a large base page size and smaller RAM devices. */ - -void * -pmap_steal_memory( - vm_size_t size) +static void * +pmap_steal_memory_internal( + vm_size_t size, + boolean_t might_free) { kern_return_t kr; - vm_offset_t addr, vaddr; + vm_offset_t addr; + vm_offset_t map_addr; ppnum_t phys_page; /* - * We round the size to a round multiple. + * Size needs to be aligned to word size. */ - size = (size + sizeof(void *) - 1) & ~(sizeof(void *) - 1); /* - * If this is the first call to pmap_steal_memory, - * we have to initialize ourself. + * On the first call, get the initial values for virtual address space + * and page align them. */ - if (virtual_space_start == virtual_space_end) { pmap_virtual_space(&virtual_space_start, &virtual_space_end); + virtual_space_start = round_page(virtual_space_start); + virtual_space_end = trunc_page(virtual_space_end); +#if defined(__x86_64__) /* - * The initial values must be aligned properly, and - * we don't trust the pmap module to do it right. + * Release remaining unused section of preallocated KVA and the 4K page tables + * that map it. This makes the VA available for large page mappings. */ - - virtual_space_start = round_page(virtual_space_start); - virtual_space_end = trunc_page(virtual_space_end); + Idle_PTs_release(virtual_space_start, virtual_space_end); +#endif } /* - * Allocate virtual memory for this request. + * Allocate the virtual space for this request. On x86, we'll align to a large page + * address if the size is big enough to back with at least 1 large page. */ - +#if defined(__x86_64__) + if (size >= I386_LPGBYTES) { + virtual_space_start = ((virtual_space_start + I386_LPGMASK) & ~I386_LPGMASK); + } +#endif addr = virtual_space_start; virtual_space_start += size; //kprintf("pmap_steal_memory: %08lX - %08lX; size=%08lX\n", (long)addr, (long)virtual_space_start, (long)size); /* (TEST/DEBUG) */ /* - * Allocate and map physical pages to back new virtual pages. + * Allocate and map physical pages to back the new virtual space. */ + map_addr = round_page(addr); + while (map_addr < addr + size) { +#if defined(__x86_64__) + /* + * Back with a large page if properly aligned on x86 + */ + if ((map_addr & I386_LPGMASK) == 0 && + map_addr + I386_LPGBYTES <= addr + size && + pmap_pre_expand_large(kernel_pmap, map_addr) == KERN_SUCCESS && + pmap_next_page_large(&phys_page) == KERN_SUCCESS) { + kr = pmap_enter(kernel_pmap, map_addr, phys_page, + VM_PROT_READ | VM_PROT_WRITE, VM_PROT_NONE, + VM_WIMG_USE_DEFAULT | VM_MEM_SUPERPAGE, FALSE); + + if (kr != KERN_SUCCESS) { + panic("pmap_steal_memory: pmap_enter() large failed, new_addr=%#lx, phys_page=%u", + (unsigned long)map_addr, phys_page); + } + map_addr += I386_LPGBYTES; + vm_page_wire_count += I386_LPGBYTES >> PAGE_SHIFT; + vm_page_stolen_count += I386_LPGBYTES >> PAGE_SHIFT; + vm_page_kern_lpage_count++; + continue; + } +#endif - for (vaddr = round_page(addr); - vaddr < addr + size; - vaddr += PAGE_SIZE) { - if (!pmap_next_page_hi(&phys_page)) { + if (!pmap_next_page_hi(&phys_page, might_free)) { panic("pmap_steal_memory() size: 0x%llx\n", (uint64_t)size); } - /* - * XXX Logically, these mappings should be wired, - * but some pmap modules barf if they are. - */ -#if defined(__LP64__) -#ifdef __arm64__ - /* ARM64_TODO: verify that we really don't need this */ -#else - pmap_pre_expand(kernel_pmap, vaddr); -#endif +#if defined(__x86_64__) + pmap_pre_expand(kernel_pmap, map_addr); #endif - kr = pmap_enter(kernel_pmap, vaddr, phys_page, + kr = pmap_enter(kernel_pmap, map_addr, phys_page, VM_PROT_READ | VM_PROT_WRITE, VM_PROT_NONE, VM_WIMG_USE_DEFAULT, FALSE); if (kr != KERN_SUCCESS) { - panic("pmap_steal_memory() pmap_enter failed, vaddr=%#lx, phys_page=%u", - (unsigned long)vaddr, phys_page); + panic("pmap_steal_memory() pmap_enter failed, map_addr=%#lx, phys_page=%u", + (unsigned long)map_addr, phys_page); } + map_addr += PAGE_SIZE; /* * Account for newly stolen memory @@ -1275,12 +1329,35 @@ pmap_steal_memory( vm_page_stolen_count++; } +#if defined(__x86_64__) + /* + * The call with might_free is currently the last use of pmap_steal_memory*(). + * Notify the pmap layer to record which high pages were allocated so far. + */ + if (might_free) { + pmap_hi_pages_done(); + } +#endif #if KASAN kasan_notify_address(round_page(addr), size); #endif return (void *) addr; } +void * +pmap_steal_memory( + vm_size_t size) +{ + return pmap_steal_memory_internal(size, FALSE); +} + +void * +pmap_steal_freeable_memory( + vm_size_t size) +{ + return pmap_steal_memory_internal(size, TRUE); +} + #if CONFIG_SECLUDED_MEMORY /* boot-args to control secluded memory */ unsigned int secluded_mem_mb = 0; /* # of MBs of RAM to seclude */ @@ -1336,7 +1413,7 @@ pmap_startup( mem_sz += round_page(virtual_space_start) - virtual_space_start; /* Account for any slop */ npages = (uint_t)(mem_sz / (PAGE_SIZE + sizeof(*vm_pages))); /* scaled to include the vm_page_ts */ - vm_pages = (vm_page_t) pmap_steal_memory(npages * sizeof *vm_pages); + vm_pages = (vm_page_t) pmap_steal_freeable_memory(npages * sizeof *vm_pages); /* * Check if we want to initialize pages to a known value @@ -1483,6 +1560,12 @@ pmap_startup( assert((i + vm_first_phys_ppnum) == phys_page); #endif +#if defined(__x86_64__) + /* The x86 clump freeing code requires increasing ppn's to work correctly */ + if (i > 0) { + assert(phys_page > vm_pages[i - 1].vmp_phys_page); + } +#endif ++vm_pages_count; vm_page_init(&vm_pages[i], phys_page, FALSE); if (fill) { @@ -1684,9 +1767,6 @@ vm_page_insert_internal( int ledger_idx_nonvolatile_compressed; boolean_t do_footprint; - XPR(XPR_VM_PAGE, - "vm_page_insert, object 0x%X offset 0x%X page 0x%X\n", - object, offset, mem, 0, 0); #if 0 /* * we may not hold the page queue lock @@ -2028,11 +2108,6 @@ vm_page_remove( m_object = VM_PAGE_OBJECT(mem); - XPR(XPR_VM_PAGE, - "vm_page_remove, object 0x%X offset 0x%X page 0x%X\n", - m_object, mem->vmp_offset, - mem, 0, 0); - vm_object_lock_assert_exclusive(m_object); assert(mem->vmp_tabled); assert(!mem->vmp_cleaning); @@ -2434,11 +2509,6 @@ vm_page_rename( assert(m_object != new_object); assert(m_object); - XPR(XPR_VM_PAGE, - "vm_page_rename, new object 0x%X, offset 0x%X page 0x%X\n", - new_object, new_offset, - mem, 0, 0); - /* * Changes to mem->vmp_object require the page lock because * the pageout daemon uses that lock to get the object. @@ -3272,15 +3342,22 @@ return_page_from_cpu_list: /* * Decide if we should poke the pageout daemon. * We do this if the free count is less than the low - * water mark, or if the free count is less than the high - * water mark (but above the low water mark) and the inactive - * count is less than its target. + * water mark. VM Pageout Scan will keep running till + * the free_count > free_target (& hence above free_min). + * This wakeup is to catch the possibility of the counts + * dropping between VM Pageout Scan parking and this check. * * We don't have the counts locked ... if they change a little, * it doesn't really matter. */ if (vm_page_free_count < vm_page_free_min) { - thread_wakeup((event_t) &vm_page_free_wanted); + lck_mtx_lock(&vm_page_queue_free_lock); + if (vm_pageout_running == FALSE) { + lck_mtx_unlock(&vm_page_queue_free_lock); + thread_wakeup((event_t) &vm_page_free_wanted); + } else { + lck_mtx_unlock(&vm_page_queue_free_lock); + } } VM_CHECK_MEMORYSTATUS; @@ -3434,6 +3511,66 @@ reactivate_secluded_page: return mem; } + +uint64_t +vm_page_secluded_drain(void) +{ + vm_page_t local_freeq; + int local_freed; + uint64_t num_reclaimed; + unsigned int saved_secluded_count, saved_secluded_target; + + num_reclaimed = 0; + local_freeq = NULL; + local_freed = 0; + + vm_page_lock_queues(); + + saved_secluded_count = vm_page_secluded_count; + saved_secluded_target = vm_page_secluded_target; + vm_page_secluded_target = 0; + VM_PAGE_SECLUDED_COUNT_OVER_TARGET_UPDATE(); + while (vm_page_secluded_count) { + vm_page_t secluded_page; + + assert((vm_page_secluded_count_free + + vm_page_secluded_count_inuse) == + vm_page_secluded_count); + secluded_page = (vm_page_t)vm_page_queue_first(&vm_page_queue_secluded); + assert(secluded_page->vmp_q_state == VM_PAGE_ON_SECLUDED_Q); + + vm_page_queues_remove(secluded_page, FALSE); + assert(!secluded_page->vmp_fictitious); + assert(!VM_PAGE_WIRED(secluded_page)); + + if (secluded_page->vmp_object == 0) { + /* transfer to free queue */ + assert(secluded_page->vmp_busy); + secluded_page->vmp_snext = local_freeq; + local_freeq = secluded_page; + local_freed += 1; + } else { + /* transfer to head of active queue */ + vm_page_enqueue_active(secluded_page, FALSE); + secluded_page = VM_PAGE_NULL; + } + num_reclaimed++; + } + vm_page_secluded_target = saved_secluded_target; + VM_PAGE_SECLUDED_COUNT_OVER_TARGET_UPDATE(); + +// printf("FBDP %s:%d secluded_count %d->%d, target %d, reclaimed %lld\n", __FUNCTION__, __LINE__, saved_secluded_count, vm_page_secluded_count, vm_page_secluded_target, num_reclaimed); + + vm_page_unlock_queues(); + + if (local_freed) { + vm_page_free_list(local_freeq, TRUE); + local_freeq = NULL; + local_freed = 0; + } + + return num_reclaimed; +} #endif /* CONFIG_SECLUDED_MEMORY */ @@ -3467,6 +3604,7 @@ vm_page_release( #if CONFIG_SECLUDED_MEMORY int need_secluded_wakeup = 0; #endif /* CONFIG_SECLUDED_MEMORY */ + event_t wakeup_event = NULL; if (page_queues_locked) { LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED); @@ -3533,6 +3671,7 @@ vm_page_release( vm_page_queue_enter_first(&vm_page_queue_secluded, mem, vmp_pageq); mem->vmp_q_state = VM_PAGE_ON_SECLUDED_Q; vm_page_secluded_count++; + VM_PAGE_SECLUDED_COUNT_OVER_TARGET_UPDATE(); vm_page_secluded_count_free++; if (!page_queues_locked) { vm_page_unlock_queues(); @@ -3597,15 +3736,25 @@ vm_page_release( lck_mtx_unlock(&vm_page_queue_free_lock); if (need_priv_wakeup) { - thread_wakeup_one((event_t) &vm_page_free_wanted_privileged); + wakeup_event = &vm_page_free_wanted_privileged; } #if CONFIG_SECLUDED_MEMORY else if (need_secluded_wakeup) { - thread_wakeup_one((event_t) &vm_page_free_wanted_secluded); + wakeup_event = &vm_page_free_wanted_secluded; } #endif /* CONFIG_SECLUDED_MEMORY */ else if (need_wakeup) { - thread_wakeup_one((event_t) &vm_page_free_count); + wakeup_event = &vm_page_free_count; + } + + if (wakeup_event) { + if (vps_dynamic_priority_enabled == TRUE) { + thread_t thread_woken = NULL; + wakeup_one_with_inheritor((event_t) wakeup_event, THREAD_AWAKENED, LCK_WAKE_DO_NOT_TRANSFER_PUSH, &thread_woken); + thread_deallocate(thread_woken); + } else { + thread_wakeup_one((event_t) wakeup_event); + } } VM_CHECK_MEMORYSTATUS; @@ -3634,6 +3783,7 @@ vm_page_release_startup( mem->vmp_lopage = FALSE; mem->vmp_q_state = VM_PAGE_ON_SECLUDED_Q; vm_page_secluded_count++; + VM_PAGE_SECLUDED_COUNT_OVER_TARGET_UPDATE(); vm_page_secluded_count_free++; queue_free = &vm_page_queue_secluded; #endif /* CONFIG_SECLUDED_MEMORY */ @@ -3679,6 +3829,7 @@ vm_page_wait( kern_return_t wait_result; int need_wakeup = 0; int is_privileged = current_thread()->options & TH_OPT_VMPRIV; + event_t wait_event = NULL; lck_mtx_lock_spin(&vm_page_queue_free_lock); @@ -3696,7 +3847,7 @@ vm_page_wait( if (vm_page_free_wanted_privileged++ == 0) { need_wakeup = 1; } - wait_result = assert_wait((event_t)&vm_page_free_wanted_privileged, interruptible); + wait_event = (event_t)&vm_page_free_wanted_privileged; #if CONFIG_SECLUDED_MEMORY } else if (secluded_for_apps && task_can_use_secluded_mem(current_task(), FALSE)) { @@ -3712,25 +3863,41 @@ vm_page_wait( if (vm_page_free_wanted_secluded++ == 0) { need_wakeup = 1; } - wait_result = assert_wait( - (event_t)&vm_page_free_wanted_secluded, - interruptible); + wait_event = (event_t)&vm_page_free_wanted_secluded; #endif /* CONFIG_SECLUDED_MEMORY */ } else { if (vm_page_free_wanted++ == 0) { need_wakeup = 1; } - wait_result = assert_wait((event_t)&vm_page_free_count, - interruptible); + wait_event = (event_t)&vm_page_free_count; } - lck_mtx_unlock(&vm_page_queue_free_lock); - counter(c_vm_page_wait_block++); - if (need_wakeup) { - thread_wakeup((event_t)&vm_page_free_wanted); - } + /* + * We don't do a vm_pageout_scan wakeup if we already have + * some waiters because vm_pageout_scan checks for waiters + * before it returns and does so behind the vm_page_queue_free_lock, + * which we own when we bump the waiter counts. + */ + + if (vps_dynamic_priority_enabled == TRUE) { + /* + * We are waking up vm_pageout_scan here. If it needs + * the vm_page_queue_free_lock before we unlock it + * we'll end up just blocking and incur an extra + * context switch. Could be a perf. issue. + */ + + counter(c_vm_page_wait_block++); - if (wait_result == THREAD_WAITING) { + if (need_wakeup) { + thread_wakeup((event_t)&vm_page_free_wanted); + } + + /* + * LD: This event is going to get recorded every time because + * we don't get back THREAD_WAITING from lck_mtx_sleep_with_inheritor. + * We just block in that routine. + */ VM_DEBUG_CONSTANT_EVENT(vm_page_wait_block, VM_PAGE_WAIT_BLOCK, DBG_FUNC_START, vm_page_free_wanted_privileged, vm_page_free_wanted, @@ -3740,12 +3907,39 @@ vm_page_wait( 0, #endif /* CONFIG_SECLUDED_MEMORY */ 0); - wait_result = thread_block(THREAD_CONTINUE_NULL); - VM_DEBUG_CONSTANT_EVENT(vm_page_wait_block, - VM_PAGE_WAIT_BLOCK, DBG_FUNC_END, 0, 0, 0, 0); + wait_result = lck_mtx_sleep_with_inheritor(&vm_page_queue_free_lock, + LCK_SLEEP_UNLOCK, + wait_event, + vm_pageout_scan_thread, + interruptible, + 0); + } else { + wait_result = assert_wait(wait_event, interruptible); + + lck_mtx_unlock(&vm_page_queue_free_lock); + counter(c_vm_page_wait_block++); + + if (need_wakeup) { + thread_wakeup((event_t)&vm_page_free_wanted); + } + + if (wait_result == THREAD_WAITING) { + VM_DEBUG_CONSTANT_EVENT(vm_page_wait_block, VM_PAGE_WAIT_BLOCK, DBG_FUNC_START, + vm_page_free_wanted_privileged, + vm_page_free_wanted, +#if CONFIG_SECLUDED_MEMORY + vm_page_free_wanted_secluded, +#else /* CONFIG_SECLUDED_MEMORY */ + 0, +#endif /* CONFIG_SECLUDED_MEMORY */ + 0); + wait_result = thread_block(THREAD_CONTINUE_NULL); + VM_DEBUG_CONSTANT_EVENT(vm_page_wait_block, + VM_PAGE_WAIT_BLOCK, DBG_FUNC_END, 0, 0, 0, 0); + } } - return wait_result == THREAD_AWAKENED; + return (wait_result == THREAD_AWAKENED) || (wait_result == THREAD_NOT_WAITING); } /* @@ -4099,6 +4293,8 @@ vm_page_free_list( #if CONFIG_SECLUDED_MEMORY unsigned int need_wakeup_secluded = 0; #endif /* CONFIG_SECLUDED_MEMORY */ + event_t priv_wakeup_event, secluded_wakeup_event, normal_wakeup_event; + boolean_t priv_wakeup_all, secluded_wakeup_all, normal_wakeup_all; lck_mtx_lock_spin(&vm_page_queue_free_lock); @@ -4174,27 +4370,32 @@ vm_page_free_list( } lck_mtx_unlock(&vm_page_queue_free_lock); + priv_wakeup_event = NULL; + secluded_wakeup_event = NULL; + normal_wakeup_event = NULL; + + priv_wakeup_all = FALSE; + secluded_wakeup_all = FALSE; + normal_wakeup_all = FALSE; + + if (need_priv_wakeup != 0) { /* * There shouldn't be that many VM-privileged threads, * so let's wake them all up, even if we don't quite * have enough pages to satisfy them all. */ - thread_wakeup((event_t)&vm_page_free_wanted_privileged); + priv_wakeup_event = (event_t)&vm_page_free_wanted_privileged; + priv_wakeup_all = TRUE; } #if CONFIG_SECLUDED_MEMORY if (need_wakeup_secluded != 0 && vm_page_free_wanted_secluded == 0) { - thread_wakeup((event_t) - &vm_page_free_wanted_secluded); + secluded_wakeup_event = (event_t)&vm_page_free_wanted_secluded; + secluded_wakeup_all = TRUE; + need_wakeup_secluded = 0; } else { - for (; - need_wakeup_secluded != 0; - need_wakeup_secluded--) { - thread_wakeup_one( - (event_t) - &vm_page_free_wanted_secluded); - } + secluded_wakeup_event = (event_t)&vm_page_free_wanted_secluded; } #endif /* CONFIG_SECLUDED_MEMORY */ if (need_wakeup != 0 && vm_page_free_wanted == 0) { @@ -4203,13 +4404,82 @@ vm_page_free_list( * after this, so let's wake them all up at * once. */ - thread_wakeup((event_t) &vm_page_free_count); + normal_wakeup_event = (event_t) &vm_page_free_count; + normal_wakeup_all = TRUE; + need_wakeup = 0; } else { - for (; need_wakeup != 0; need_wakeup--) { + normal_wakeup_event = (event_t) &vm_page_free_count; + } + + if (priv_wakeup_event || +#if CONFIG_SECLUDED_MEMORY + secluded_wakeup_event || +#endif /* CONFIG_SECLUDED_MEMORY */ + normal_wakeup_event) { + if (vps_dynamic_priority_enabled == TRUE) { + thread_t thread_woken = NULL; + + if (priv_wakeup_all == TRUE) { + wakeup_all_with_inheritor(priv_wakeup_event, THREAD_AWAKENED); + } + +#if CONFIG_SECLUDED_MEMORY + if (secluded_wakeup_all == TRUE) { + wakeup_all_with_inheritor(secluded_wakeup_event, THREAD_AWAKENED); + } + + while (need_wakeup_secluded-- != 0) { + /* + * Wake up one waiter per page we just released. + */ + wakeup_one_with_inheritor(secluded_wakeup_event, THREAD_AWAKENED, LCK_WAKE_DO_NOT_TRANSFER_PUSH, &thread_woken); + thread_deallocate(thread_woken); + } +#endif /* CONFIG_SECLUDED_MEMORY */ + + if (normal_wakeup_all == TRUE) { + wakeup_all_with_inheritor(normal_wakeup_event, THREAD_AWAKENED); + } + + while (need_wakeup-- != 0) { + /* + * Wake up one waiter per page we just released. + */ + wakeup_one_with_inheritor(normal_wakeup_event, THREAD_AWAKENED, LCK_WAKE_DO_NOT_TRANSFER_PUSH, &thread_woken); + thread_deallocate(thread_woken); + } + } else { /* - * Wake up one waiter per page we just released. + * Non-priority-aware wakeups. */ - thread_wakeup_one((event_t) &vm_page_free_count); + + if (priv_wakeup_all == TRUE) { + thread_wakeup(priv_wakeup_event); + } + +#if CONFIG_SECLUDED_MEMORY + if (secluded_wakeup_all == TRUE) { + thread_wakeup(secluded_wakeup_event); + } + + while (need_wakeup_secluded-- != 0) { + /* + * Wake up one waiter per page we just released. + */ + thread_wakeup_one(secluded_wakeup_event); + } + +#endif /* CONFIG_SECLUDED_MEMORY */ + if (normal_wakeup_all == TRUE) { + thread_wakeup(normal_wakeup_event); + } + + while (need_wakeup-- != 0) { + /* + * Wake up one waiter per page we just released. + */ + thread_wakeup_one(normal_wakeup_event); + } } } @@ -4685,6 +4955,7 @@ vm_page_activate( vm_page_queue_enter(&vm_page_queue_secluded, m, vmp_pageq); m->vmp_q_state = VM_PAGE_ON_SECLUDED_Q; vm_page_secluded_count++; + VM_PAGE_SECLUDED_COUNT_OVER_TARGET_UPDATE(); vm_page_secluded_count_inuse++; assert(!m_object->internal); // vm_page_pageable_external_count++; @@ -4760,6 +5031,7 @@ vm_page_speculate( */ aq->age_ts.tv_sec = vm_pageout_state.vm_page_speculative_q_age_ms / 1000; aq->age_ts.tv_nsec = (vm_pageout_state.vm_page_speculative_q_age_ms % 1000) * 1000 * NSEC_PER_USEC; + ADD_MACH_TIMESPEC(&aq->age_ts, &ts); } else { aq = &vm_page_queue_speculative[speculative_age_index]; @@ -4785,6 +5057,7 @@ vm_page_speculate( aq->age_ts.tv_sec = vm_pageout_state.vm_page_speculative_q_age_ms / 1000; aq->age_ts.tv_nsec = (vm_pageout_state.vm_page_speculative_q_age_ms % 1000) * 1000 * NSEC_PER_USEC; + ADD_MACH_TIMESPEC(&aq->age_ts, &ts); } } @@ -5128,9 +5401,6 @@ void vm_page_zero_fill( vm_page_t m) { - XPR(XPR_VM_PAGE, - "vm_page_zero_fill, object 0x%X offset 0x%X page 0x%X\n", - VM_PAGE_OBJECT(m), m->vmp_offset, m, 0, 0); #if 0 /* * we don't hold the page queue lock @@ -5187,11 +5457,6 @@ vm_page_copy( src_m_object = VM_PAGE_OBJECT(src_m); - XPR(XPR_VM_PAGE, - "vm_page_copy, object 0x%X offset 0x%X to object 0x%X offset 0x%X\n", - src_m_object, src_m->vmp_offset, - VM_PAGE_OBJECT(dest_m), dest_m->vmp_offset, - 0); #if 0 /* * we don't hold the page queue lock @@ -6234,7 +6499,13 @@ cpm_allocate( * determine need for wakeups */ if (vm_page_free_count < vm_page_free_min) { - thread_wakeup((event_t) &vm_page_free_wanted); + lck_mtx_lock(&vm_page_queue_free_lock); + if (vm_pageout_running == FALSE) { + lck_mtx_unlock(&vm_page_queue_free_lock); + thread_wakeup((event_t) &vm_page_free_wanted); + } else { + lck_mtx_unlock(&vm_page_queue_free_lock); + } } VM_CHECK_MEMORYSTATUS; @@ -6298,10 +6569,22 @@ vm_page_do_delayed_work( if (!vm_page_trylockspin_queues()) { vm_object_unlock(object); + /* + * "Turnstile enabled vm_pageout_scan" can be runnable + * for a very long time without getting on a core. + * If this is a higher priority thread it could be + * waiting here for a very long time respecting the fact + * that pageout_scan would like its object after VPS does + * a mutex_pause(0). + * So we cap the number of yields in the vm_object_lock_avoid() + * case to a single mutex_pause(0) which will give vm_pageout_scan + * 10us to run and grab the object if needed. + */ vm_page_lockspin_queues(); for (j = 0;; j++) { - if (!vm_object_lock_avoid(object) && + if ((!vm_object_lock_avoid(object) || + (vps_dynamic_priority_enabled && (j > 0))) && _vm_object_lock_try(object)) { break; } @@ -8394,6 +8677,7 @@ vm_page_queues_remove(vm_page_t mem, boolean_t __unused remove_from_backgroundq) { vm_page_queue_remove(&vm_page_queue_secluded, mem, vmp_pageq); vm_page_secluded_count--; + VM_PAGE_SECLUDED_COUNT_OVER_TARGET_UPDATE(); if (m_object == VM_OBJECT_NULL) { vm_page_secluded_count_free--; was_pageable = FALSE; @@ -8628,6 +8912,9 @@ vm_tag_bt(void) /* Pull return address from one spot above the frame pointer */ retaddr = *(frameptr + 1); +#if defined(HAS_APPLE_PAC) + retaddr = (uintptr_t) ptrauth_strip((void *)retaddr, ptrauth_key_return_address); +#endif if (((retaddr < vm_kernel_builtinkmod_text_end) && (retaddr >= vm_kernel_builtinkmod_text)) || (retaddr < vm_kernel_stext) || (retaddr > vm_kernel_top)) { @@ -8937,6 +9224,7 @@ kern_allocation_update_subtotal(kern_allocation_name_t allocation, uint32_t subt subidx = 0; assert(VM_KERN_MEMORY_NONE != subtag); + lck_spin_lock(&vm_allocation_sites_lock); for (; subidx < allocation->subtotalscount; subidx++) { if (VM_KERN_MEMORY_NONE == allocation->subtotals[subidx].tag) { allocation->subtotals[subidx].tag = subtag; @@ -8946,6 +9234,7 @@ kern_allocation_update_subtotal(kern_allocation_name_t allocation, uint32_t subt break; } } + lck_spin_unlock(&vm_allocation_sites_lock); assert(subidx < allocation->subtotalscount); if (subidx >= allocation->subtotalscount) { return; @@ -8957,13 +9246,10 @@ kern_allocation_update_subtotal(kern_allocation_name_t allocation, uint32_t subt if (delta < 0) { assertf(total->total >= ((uint64_t)-delta), "name %p", allocation); - OSAddAtomic64(delta, &total->total); assertf(other->mapped >= ((uint64_t)-delta), "other %p", other); - OSAddAtomic64(delta, &other->mapped); - } else { - OSAddAtomic64(delta, &other->mapped); - OSAddAtomic64(delta, &total->total); } + OSAddAtomic64(delta, &other->mapped); + OSAddAtomic64(delta, &total->total); } const char * @@ -9215,14 +9501,12 @@ vm_page_diagnose_estimate(void) return count; } - kern_return_t vm_page_diagnose(mach_memory_info_t * info, unsigned int num_info, uint64_t zones_collectable_bytes) { uint64_t wired_size; uint64_t wired_managed_size; uint64_t wired_reserved_size; - uint64_t booter_size; boolean_t iterate; mach_memory_info_t * counts; @@ -9241,7 +9525,6 @@ vm_page_diagnose(mach_memory_info_t * info, unsigned int num_info, uint64_t zone #endif wired_managed_size = ptoa_64(vm_page_wire_count - vm_page_wire_count_initial); - booter_size = ml_get_booter_memory_size(); wired_size += booter_size; assert(num_info >= VM_KERN_COUNTER_COUNT); @@ -9455,6 +9738,7 @@ start_secluded_suppression(task_t task) task->task_suppressed_secluded = TRUE; vm_page_secluded_save_target = vm_page_secluded_target; vm_page_secluded_target = 0; + VM_PAGE_SECLUDED_COUNT_OVER_TARGET_UPDATE(); } lck_spin_unlock(&secluded_suppress_slock); } @@ -9466,6 +9750,7 @@ stop_secluded_suppression(task_t task) if (task->task_suppressed_secluded && --vm_page_secluded_suppress_cnt == 0) { task->task_suppressed_secluded = FALSE; vm_page_secluded_target = vm_page_secluded_save_target; + VM_PAGE_SECLUDED_COUNT_OVER_TARGET_UPDATE(); } lck_spin_unlock(&secluded_suppress_slock); } diff --git a/osfmk/vm/vm_shared_region.c b/osfmk/vm/vm_shared_region.c index 03624ce07..d8befa53c 100644 --- a/osfmk/vm/vm_shared_region.c +++ b/osfmk/vm/vm_shared_region.c @@ -122,6 +122,9 @@ */ #define PROCESS_SHARED_CACHE_LAYOUT 0x00 +#if defined(HAS_APPLE_PAC) +#include +#endif /* HAS_APPLE_PAC */ /* "dyld" uses this to figure out what the kernel supports */ int shared_region_version = 3; @@ -181,10 +184,10 @@ kern_return_t vm_shared_region_slide_mapping( memory_object_control_t); /* forward */ static int __commpage_setup = 0; -#if defined(__i386__) || defined(__x86_64__) +#if !CONFIG_EMBEDDED static int __system_power_source = 1; /* init to extrnal power source */ static void post_sys_powersource_internal(int i, int internal); -#endif /* __i386__ || __x86_64__ */ +#endif /* @@ -729,7 +732,6 @@ vm_shared_region_create( switch (cputype) { #if defined(__arm__) || defined(__arm64__) case CPU_TYPE_ARM: - case CPU_TYPE_ARM64: base_address = SHARED_REGION_BASE_ARM; size = SHARED_REGION_SIZE_ARM; pmap_nesting_start = SHARED_REGION_NESTING_BASE_ARM; @@ -775,7 +777,7 @@ vm_shared_region_create( { struct pmap *pmap_nested; - pmap_nested = pmap_create(NULL, 0, is_64bit); + pmap_nested = pmap_create_options(NULL, 0, is_64bit ? PMAP_CREATE_64BIT : 0); if (pmap_nested != PMAP_NULL) { pmap_set_nested(pmap_nested); sub_map = vm_map_create(pmap_nested, 0, size, TRUE); @@ -786,7 +788,7 @@ vm_shared_region_create( vm_map_set_page_shift(sub_map, SIXTEENK_PAGE_SHIFT); } -#elif (__ARM_ARCH_7K__ >= 2) && defined(PLATFORM_WatchOS) +#elif (__ARM_ARCH_7K__ >= 2) /* enforce 16KB alignment for watch targets with new ABI */ vm_map_set_page_shift(sub_map, SIXTEENK_PAGE_SHIFT); #endif /* __arm64__ */ @@ -796,7 +798,7 @@ vm_shared_region_create( } #else /* create a VM sub map and its pmap */ - sub_map = vm_map_create(pmap_create(NULL, 0, is_64bit), + sub_map = vm_map_create(pmap_create_options(NULL, 0, is_64bit), 0, size, TRUE); #endif @@ -848,6 +850,9 @@ vm_shared_region_create( si->start = 0; si->end = 0; si->slide = 0; +#if defined(HAS_APPLE_PAC) + si->si_ptrauth = FALSE; /* no pointer authentication by default */ +#endif /* HAS_APPLE_PAC */ si->slide_object = NULL; si->slide_info_size = 0; si->slide_info_entry = NULL; @@ -1153,6 +1158,7 @@ vm_shared_region_map_file( vm_map_kernel_flags_t vmk_flags; mach_vm_offset_t sfm_min_address = ~0; mach_vm_offset_t sfm_max_address = 0; + mach_vm_offset_t sfm_end; struct _dyld_cache_header sr_cache_header; #if __arm64__ @@ -1234,8 +1240,17 @@ vm_shared_region_map_file( sfm_min_address = mappings[i].sfm_address; } - if ((mappings[i].sfm_address + mappings[i].sfm_size) > sfm_max_address) { - sfm_max_address = mappings[i].sfm_address + mappings[i].sfm_size; + if (os_add_overflow(mappings[i].sfm_address, + mappings[i].sfm_size, + &sfm_end) || + (vm_map_round_page(sfm_end, VM_MAP_PAGE_MASK(sr_map)) < + mappings[i].sfm_address)) { + /* overflow */ + kr = KERN_INVALID_ARGUMENT; + break; + } + if (sfm_end > sfm_max_address) { + sfm_max_address = sfm_end; } if (mappings[i].sfm_init_prot & VM_PROT_ZF) { @@ -1274,6 +1289,8 @@ vm_shared_region_map_file( vmk_flags = VM_MAP_KERNEL_FLAGS_NONE; vmk_flags.vmkf_already = TRUE; + /* no copy-on-read for mapped binaries */ + vmk_flags.vmkf_no_copy_on_read = 1; /* establish that mapping, OK if it's "already" there */ if (map_port == MACH_PORT_NULL) { @@ -1335,6 +1352,12 @@ vm_shared_region_map_file( first_mapping = target_address; } +#if defined(HAS_APPLE_PAC) + /* + * Set "sr_slid_mapping" + * it is used to get the userland address for address authentication. + */ +#endif if ((slid_mapping == (mach_vm_offset_t) -1) && (mapping_to_slide == &mappings[i])) { slid_mapping = target_address; @@ -1385,26 +1408,30 @@ vm_shared_region_map_file( mappings[i].sfm_size = 0; kr = KERN_SUCCESS; } else { - /* this mapping failed ! */ - SHARED_REGION_TRACE_ERROR( - ("shared_region: mapping[%d]: " - "address:0x%016llx size:0x%016llx " - "offset:0x%016llx " - "maxprot:0x%x prot:0x%x failed 0x%x\n", - i, - (long long)mappings[i].sfm_address, - (long long)mappings[i].sfm_size, - (long long)mappings[i].sfm_file_offset, - mappings[i].sfm_max_prot, - mappings[i].sfm_init_prot, - kr)); - - vm_shared_region_undo_mappings(sr_map, sr_base_address, mappings, i); break; } } } + if (kr != KERN_SUCCESS) { + /* the last mapping we tried (mappings[i]) failed ! */ + assert(i < mappings_count); + SHARED_REGION_TRACE_ERROR( + ("shared_region: mapping[%d]: " + "address:0x%016llx size:0x%016llx " + "offset:0x%016llx " + "maxprot:0x%x prot:0x%x failed 0x%x\n", + i, + (long long)mappings[i].sfm_address, + (long long)mappings[i].sfm_size, + (long long)mappings[i].sfm_file_offset, + mappings[i].sfm_max_prot, + mappings[i].sfm_init_prot, + kr)); + /* undo all the previous mappings */ + vm_shared_region_undo_mappings(sr_map, sr_base_address, mappings, i); + } + if (kr == KERN_SUCCESS && slide_size != 0 && mapping_to_slide != NULL) { @@ -1694,25 +1721,29 @@ vm_shared_region_enter( /* * We may need to map several pmap-nested portions, due to platform * specific restrictions on pmap nesting. - * The pmap-nesting is triggered by the "VM_MEMORY_SHARED_PMAP" alias... + * The pmap-nesting is triggered by the "vmkf_nested_pmap" flag... */ for (; sr_pmap_nesting_size > 0; sr_offset += mapping_size, sr_size -= mapping_size, sr_pmap_nesting_size -= mapping_size) { + vm_map_kernel_flags_t vmk_flags; + target_address = sr_address + sr_offset; mapping_size = sr_pmap_nesting_size; if (mapping_size > pmap_nesting_size_max) { mapping_size = (vm_map_offset_t) pmap_nesting_size_max; } + vmk_flags = VM_MAP_KERNEL_FLAGS_NONE; + vmk_flags.vmkf_nested_pmap = TRUE; kr = vm_map_enter_mem_object( map, &target_address, mapping_size, 0, VM_FLAGS_FIXED, - VM_MAP_KERNEL_FLAGS_NONE, + vmk_flags, VM_MEMORY_SHARED_PMAP, sr_handle, sr_offset, @@ -1911,6 +1942,13 @@ vm_shared_region_slide_mapping( si->start = start; si->end = si->start + size; si->slide = slide; +#if defined(HAS_APPLE_PAC) + if (sr->sr_cpu_type == CPU_TYPE_ARM64 && + sr->sr_cpu_subtype == CPU_SUBTYPE_ARM64E) { + /* arm64e has pointer authentication */ + si->si_ptrauth = TRUE; + } +#endif /* HAS_APPLE_PAC */ /* find the shared region's map entry to slide */ sr_map = vm_shared_region_vm_map(sr); @@ -2465,6 +2503,11 @@ vm_shared_region_slide_page_v3(vm_shared_region_slide_info_t si, vm_offset_t vad return KERN_FAILURE; } +#if defined(HAS_APPLE_PAC) + uint16_t diversity_data = (uint16_t)(value >> 32); + bool hasAddressDiversity = (value & (1ULL << 48)) != 0; + ptrauth_key key = (ptrauth_key)((value >> 49) & 0x3); +#endif /* HAS_APPLE_PAC */ bool isAuthenticated = (value & (1ULL << 63)) != 0; if (isAuthenticated) { @@ -2474,6 +2517,23 @@ vm_shared_region_slide_page_v3(vm_shared_region_slide_info_t si, vm_offset_t vad const uint64_t value_add = s_info->value_add; value += value_add; +#if defined(HAS_APPLE_PAC) + uint64_t discriminator = diversity_data; + if (hasAddressDiversity) { + // First calculate a new discriminator using the address of where we are trying to store the value + uintptr_t pageOffset = rebaseLocation - page_content; + discriminator = __builtin_ptrauth_blend_discriminator((void*)(((uintptr_t)uservaddr) + pageOffset), discriminator); + } + + if (si->si_ptrauth && + !(BootArgs->bootFlags & kBootFlagsDisableUserJOP)) { + /* + * these pointers are used in user mode. disable the kernel key diversification + * so we can sign them for use in user mode. + */ + value = (uintptr_t)pmap_sign_user_ptr((void *)value, key, discriminator); + } +#endif /* HAS_APPLE_PAC */ } else { // The new value for a rebase is the low 51-bits of the threaded value plus the slide. // Regular pointer which needs to fit in 51-bits of value. @@ -2658,7 +2718,7 @@ _vm_commpage_init( if (kr != KERN_SUCCESS) { panic("_vm_commpage_init: could not allocate mem_entry"); } - new_map = vm_map_create(pmap_create(NULL, 0, 0), 0, size, TRUE); + new_map = vm_map_create(pmap_create_options(NULL, 0, 0), 0, size, PMAP_CREATE_64BIT); if (new_map == VM_MAP_NULL) { panic("_vm_commpage_init: could not allocate VM map"); } @@ -2736,11 +2796,11 @@ vm_commpage_init(void) /* populate them according to this specific platform */ commpage_populate(); __commpage_setup = 1; -#if defined(__i386__) || defined(__x86_64__) +#if !CONFIG_EMBEDDED if (__system_power_source == 0) { post_sys_powersource_internal(0, 1); } -#endif /* __i386__ || __x86_64__ */ +#endif SHARED_REGION_TRACE_DEBUG( ("commpage: init() <-\n")); @@ -2812,6 +2872,7 @@ vm_commpage_enter( (commpage_size & (pmap_nesting_size_min - 1)) == 0) { /* the commpage is properly aligned or sized for pmap-nesting */ tag = VM_MEMORY_SHARED_PMAP; + vmk_flags.vmkf_nested_pmap = TRUE; } /* map the comm page in the task's address space */ assert(commpage_handle != IPC_PORT_NULL); @@ -3030,19 +3091,19 @@ done: * 1 if it is internal power source ie battery */ void -#if defined(__i386__) || defined(__x86_64__) +#if !CONFIG_EMBEDDED post_sys_powersource(int i) #else post_sys_powersource(__unused int i) #endif { -#if defined(__i386__) || defined(__x86_64__) +#if !CONFIG_EMBEDDED post_sys_powersource_internal(i, 0); -#endif /* __i386__ || __x86_64__ */ +#endif } -#if defined(__i386__) || defined(__x86_64__) +#if !CONFIG_EMBEDDED static void post_sys_powersource_internal(int i, int internal) { @@ -3058,4 +3119,4 @@ post_sys_powersource_internal(int i, int internal) } } } -#endif /* __i386__ || __x86_64__ */ +#endif diff --git a/osfmk/vm/vm_shared_region.h b/osfmk/vm/vm_shared_region.h index bfe7f518b..95fe9fa54 100644 --- a/osfmk/vm/vm_shared_region.h +++ b/osfmk/vm/vm_shared_region.h @@ -187,6 +187,9 @@ struct vm_shared_region_slide_info { mach_vm_offset_t start; mach_vm_offset_t end; uint32_t slide; +#if defined(HAS_APPLE_PAC) + boolean_t si_ptrauth; +#endif /* HAS_APPLE_PAC */ vm_object_t slide_object; mach_vm_size_t slide_info_size; vm_shared_region_slide_info_entry_t slide_info_entry; diff --git a/osfmk/vm/vm_shared_region_pager.c b/osfmk/vm/vm_shared_region_pager.c index a4d1fc46f..35cd0e817 100644 --- a/osfmk/vm/vm_shared_region_pager.c +++ b/osfmk/vm/vm_shared_region_pager.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2019 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -118,19 +118,19 @@ kern_return_t shared_region_pager_last_unmap(memory_object_t mem_obj); * These routines are invoked by VM via the memory_object_*() interfaces. */ const struct memory_object_pager_ops shared_region_pager_ops = { - shared_region_pager_reference, - shared_region_pager_deallocate, - shared_region_pager_init, - shared_region_pager_terminate, - shared_region_pager_data_request, - shared_region_pager_data_return, - shared_region_pager_data_initialize, - shared_region_pager_data_unlock, - shared_region_pager_synchronize, - shared_region_pager_map, - shared_region_pager_last_unmap, - NULL, /* data_reclaim */ - "shared_region" + .memory_object_reference = shared_region_pager_reference, + .memory_object_deallocate = shared_region_pager_deallocate, + .memory_object_init = shared_region_pager_init, + .memory_object_terminate = shared_region_pager_terminate, + .memory_object_data_request = shared_region_pager_data_request, + .memory_object_data_return = shared_region_pager_data_return, + .memory_object_data_initialize = shared_region_pager_data_initialize, + .memory_object_data_unlock = shared_region_pager_data_unlock, + .memory_object_synchronize = shared_region_pager_synchronize, + .memory_object_map = shared_region_pager_map, + .memory_object_last_unmap = shared_region_pager_last_unmap, + .memory_object_data_reclaim = NULL, + .memory_object_pager_name = "shared_region" }; /* @@ -159,7 +159,7 @@ typedef struct shared_region_pager { int shared_region_pager_count = 0; /* number of pagers */ int shared_region_pager_count_mapped = 0; /* number of unmapped pagers */ queue_head_t shared_region_pager_queue; -decl_lck_mtx_data(, shared_region_pager_lock) +decl_lck_mtx_data(, shared_region_pager_lock); /* * Maximum number of unmapped pagers we're willing to keep around. @@ -513,24 +513,12 @@ retry_src_fault: dst_pnum = (ppnum_t) upl_phys_page(upl_pl, (int)(cur_offset / PAGE_SIZE)); assert(dst_pnum != 0); -#if __x86_64__ - src_vaddr = (vm_map_offset_t) - PHYSMAP_PTOV((pmap_paddr_t)VM_PAGE_GET_PHYS_PAGE(src_page) - << PAGE_SHIFT); - dst_vaddr = (vm_map_offset_t) - PHYSMAP_PTOV((pmap_paddr_t)dst_pnum << PAGE_SHIFT); -#elif __arm__ || __arm64__ src_vaddr = (vm_map_offset_t) phystokv((pmap_paddr_t)VM_PAGE_GET_PHYS_PAGE(src_page) << PAGE_SHIFT); dst_vaddr = (vm_map_offset_t) phystokv((pmap_paddr_t)dst_pnum << PAGE_SHIFT); -#else -#error "vm_paging_map_object: no 1-to-1 kernel mapping of physical memory..." - src_vaddr = 0; - dst_vaddr = 0; -#endif src_page_object = VM_PAGE_OBJECT(src_page); /* @@ -983,6 +971,7 @@ shared_region_pager_create( shared_region_pager_t pager; memory_object_control_t control; kern_return_t kr; + vm_object_t object; pager = (shared_region_pager_t) kalloc(sizeof(*pager)); if (pager == SHARED_REGION_PAGER_NULL) { @@ -1027,9 +1016,19 @@ shared_region_pager_create( &control); assert(kr == KERN_SUCCESS); + memory_object_mark_trusted(control); + lck_mtx_lock(&shared_region_pager_lock); /* the new pager is now ready to be used */ pager->is_ready = TRUE; + object = memory_object_to_vm_object((memory_object_t) pager); + assert(object); + /* + * No one knows about this object and so we get away without the object lock. + * This object is _eventually_ backed by the dyld shared cache and so we want + * to benefit from the lock priority boosting. + */ + object->object_is_shared_cache = TRUE; lck_mtx_unlock(&shared_region_pager_lock); /* wakeup anyone waiting for this pager to be ready */ diff --git a/osfmk/vm/vm_swapfile_pager.c b/osfmk/vm/vm_swapfile_pager.c index a8b27af22..8ebc2d3e1 100644 --- a/osfmk/vm/vm_swapfile_pager.c +++ b/osfmk/vm/vm_swapfile_pager.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2008 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2019 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -33,8 +33,8 @@ #include #include #include -#include +#include #include #include #include @@ -115,19 +115,19 @@ kern_return_t swapfile_pager_last_unmap(memory_object_t mem_obj); * These routines are invoked by VM via the memory_object_*() interfaces. */ const struct memory_object_pager_ops swapfile_pager_ops = { - swapfile_pager_reference, - swapfile_pager_deallocate, - swapfile_pager_init, - swapfile_pager_terminate, - swapfile_pager_data_request, - swapfile_pager_data_return, - swapfile_pager_data_initialize, - swapfile_pager_data_unlock, - swapfile_pager_synchronize, - swapfile_pager_map, - swapfile_pager_last_unmap, - NULL, /* data_reclaim */ - "swapfile pager" + .memory_object_reference = swapfile_pager_reference, + .memory_object_deallocate = swapfile_pager_deallocate, + .memory_object_init = swapfile_pager_init, + .memory_object_terminate = swapfile_pager_terminate, + .memory_object_data_request = swapfile_pager_data_request, + .memory_object_data_return = swapfile_pager_data_return, + .memory_object_data_initialize = swapfile_pager_data_initialize, + .memory_object_data_unlock = swapfile_pager_data_unlock, + .memory_object_synchronize = swapfile_pager_synchronize, + .memory_object_map = swapfile_pager_map, + .memory_object_last_unmap = swapfile_pager_last_unmap, + .memory_object_data_reclaim = NULL, + .memory_object_pager_name = "swapfile pager" }; /* @@ -140,7 +140,7 @@ typedef struct swapfile_pager { /* pager-specific data */ queue_chain_t pager_queue; /* next & prev pagers */ - struct os_refcnt ref_count; /* reference count */ + unsigned int ref_count; /* reference count */ boolean_t is_ready; /* is this pager ready ? */ boolean_t is_mapped; /* is this pager mapped ? */ struct vnode *swapfile_vnode;/* the swapfile's vnode */ @@ -153,7 +153,7 @@ typedef struct swapfile_pager { */ int swapfile_pager_count = 0; /* number of pagers */ queue_head_t swapfile_pager_queue; -decl_lck_mtx_data(, swapfile_pager_lock) +decl_lck_mtx_data(, swapfile_pager_lock); /* * Statistics & counters. @@ -334,7 +334,7 @@ swapfile_pager_data_request( pager = swapfile_pager_lookup(mem_obj); assert(pager->is_ready); - assert(os_ref_get_count(&pager->ref_count) > 1); /* pager is alive and mapped */ + assert(pager->ref_count > 1); /* pager is alive and mapped */ PAGER_DEBUG(PAGER_PAGEIN, ("swapfile_pager_data_request: %p, %llx, %x, %x, pager %p\n", mem_obj, offset, length, protection_required, pager)); @@ -493,7 +493,8 @@ swapfile_pager_reference( pager = swapfile_pager_lookup(mem_obj); lck_mtx_lock(&swapfile_pager_lock); - os_ref_retain_locked(&pager->ref_count); + assert(pager->ref_count > 0); + pager->ref_count++; lck_mtx_unlock(&swapfile_pager_lock); } @@ -567,9 +568,9 @@ swapfile_pager_deallocate_internal( } /* drop a reference on this pager */ - os_ref_count_t refcount = os_ref_release_locked(&pager->ref_count); + pager->ref_count--; - if (refcount == 1) { + if (pager->ref_count == 1) { /* * Only the "named" reference is left, which means that * no one is really holding on to this pager anymore. @@ -579,7 +580,7 @@ swapfile_pager_deallocate_internal( /* the pager is all ours: no need for the lock now */ lck_mtx_unlock(&swapfile_pager_lock); swapfile_pager_terminate_internal(pager); - } else if (refcount == 0) { + } else if (pager->ref_count == 0) { /* * Dropped the existence reference; the memory object has * been terminated. Do some final cleanup and release the @@ -667,7 +668,7 @@ swapfile_pager_map( lck_mtx_lock(&swapfile_pager_lock); assert(pager->is_ready); - assert(os_ref_get_count(&pager->ref_count) > 0); /* pager is alive */ + assert(pager->ref_count > 0); /* pager is alive */ if (pager->is_mapped == FALSE) { /* * First mapping of this pager: take an extra reference @@ -675,7 +676,7 @@ swapfile_pager_map( * are removed. */ pager->is_mapped = TRUE; - os_ref_retain_locked(&pager->ref_count); + pager->ref_count++; } lck_mtx_unlock(&swapfile_pager_lock); @@ -726,7 +727,7 @@ swapfile_pager_lookup( assert(mem_obj->mo_pager_ops == &swapfile_pager_ops); __IGNORE_WCASTALIGN(pager = (swapfile_pager_t) mem_obj); - assert(os_ref_get_count(&pager->ref_count) > 0); + assert(pager->ref_count > 0); return pager; } @@ -755,7 +756,7 @@ swapfile_pager_create( pager->swp_pgr_hdr.mo_control = MEMORY_OBJECT_CONTROL_NULL; pager->is_ready = FALSE;/* not ready until it has a "name" */ - os_ref_init(&pager->ref_count, NULL); /* setup reference */ + pager->ref_count = 1; /* setup reference */ pager->is_mapped = FALSE; pager->swapfile_vnode = vp; @@ -772,7 +773,7 @@ swapfile_pager_create( if (!queue_end(&swapfile_pager_queue, (queue_entry_t) pager2)) { /* while we hold the lock, transfer our setup ref to winner */ - os_ref_retain_locked(&pager2->ref_count); + pager2->ref_count++; /* we lost the race, down with the loser... */ lck_mtx_unlock(&swapfile_pager_lock); pager->swapfile_vnode = NULL; @@ -799,6 +800,8 @@ swapfile_pager_create( &control); assert(kr == KERN_SUCCESS); + memory_object_mark_trusted(control); + lck_mtx_lock(&swapfile_pager_lock); /* the new pager is now ready to be used */ pager->is_ready = TRUE; @@ -839,7 +842,7 @@ swapfile_pager_setup( pager = SWAPFILE_PAGER_NULL; } else { /* make sure pager doesn't disappear */ - os_ref_retain_locked(&pager->ref_count); + pager->ref_count++; } lck_mtx_unlock(&swapfile_pager_lock); diff --git a/osfmk/vm/vm_user.c b/osfmk/vm/vm_user.c index 92df95613..ab106cb5a 100644 --- a/osfmk/vm/vm_user.c +++ b/osfmk/vm/vm_user.c @@ -122,6 +122,7 @@ #include #include +#include vm_size_t upl_offset_to_pagelist = 0; @@ -2286,6 +2287,8 @@ mach_make_memory_entry_64( ipc_port_t *object_handle, ipc_port_t parent_handle) { + vm_named_entry_kernel_flags_t vmne_kflags; + if ((permission & MAP_MEM_FLAGS_MASK) & ~MAP_MEM_FLAGS_USER) { /* * Unknown flag: reject for forward compatibility. @@ -2293,10 +2296,15 @@ mach_make_memory_entry_64( return KERN_INVALID_VALUE; } + vmne_kflags = VM_NAMED_ENTRY_KERNEL_FLAGS_NONE; + if (permission & MAP_MEM_LEDGER_TAGGED) { + vmne_kflags.vmnekf_ledger_tag = VM_LEDGER_TAG_DEFAULT; + } return mach_make_memory_entry_internal(target_map, size, offset, permission, + vmne_kflags, object_handle, parent_handle); } @@ -2305,8 +2313,9 @@ kern_return_t mach_make_memory_entry_internal( vm_map_t target_map, memory_object_size_t *size, - memory_object_offset_t offset, + memory_object_offset_t offset, vm_prot_t permission, + vm_named_entry_kernel_flags_t vmne_kflags, ipc_port_t *object_handle, ipc_port_t parent_handle) { @@ -2423,6 +2432,9 @@ mach_make_memory_entry_internal( } return KERN_SUCCESS; } else if (permission & MAP_MEM_NAMED_CREATE) { + int ledger_flags = 0; + task_t owner; + map_end = vm_map_round_page(offset + *size, PAGE_MASK); map_size = map_end - map_start; @@ -2451,48 +2463,78 @@ mach_make_memory_entry_internal( object = vm_object_allocate(map_size); assert(object != VM_OBJECT_NULL); - if (permission & MAP_MEM_PURGABLE) { - task_t owner; + /* + * XXX + * We use this path when we want to make sure that + * nobody messes with the object (coalesce, for + * example) before we map it. + * We might want to use these objects for transposition via + * vm_object_transpose() too, so we don't want any copy or + * shadow objects either... + */ + object->copy_strategy = MEMORY_OBJECT_COPY_NONE; + object->true_share = TRUE; - if (!(permission & VM_PROT_WRITE)) { - /* if we can't write, we can't purge */ - vm_object_deallocate(object); - kr = KERN_INVALID_ARGUMENT; - goto make_mem_done; - } - object->purgable = VM_PURGABLE_NONVOLATILE; - if (permission & MAP_MEM_PURGABLE_KERNEL_ONLY) { - object->purgeable_only_by_kernel = TRUE; - } + owner = current_task(); + if ((permission & MAP_MEM_PURGABLE) || + vmne_kflags.vmnekf_ledger_tag) { assert(object->vo_owner == NULL); assert(object->resident_page_count == 0); assert(object->wired_page_count == 0); - vm_object_lock(object); - owner = current_task(); -#if __arm64__ - if (owner->task_legacy_footprint) { - /* - * For ios11, we failed to account for - * this memory. Keep doing that for - * legacy apps (built before ios12), - * for backwards compatibility's sake... - */ - owner = kernel_task; + assert(owner != TASK_NULL); + if (vmne_kflags.vmnekf_ledger_no_footprint) { + ledger_flags |= VM_LEDGER_FLAG_NO_FOOTPRINT; + object->vo_no_footprint = TRUE; } + if (permission & MAP_MEM_PURGABLE) { + if (!(permission & VM_PROT_WRITE)) { + /* if we can't write, we can't purge */ + vm_object_deallocate(object); + kr = KERN_INVALID_ARGUMENT; + goto make_mem_done; + } + object->purgable = VM_PURGABLE_NONVOLATILE; + if (permission & MAP_MEM_PURGABLE_KERNEL_ONLY) { + object->purgeable_only_by_kernel = TRUE; + } +#if __arm64__ + if (owner->task_legacy_footprint) { + /* + * For ios11, we failed to account for + * this memory. Keep doing that for + * legacy apps (built before ios12), + * for backwards compatibility's sake... + */ + owner = kernel_task; + } #endif /* __arm64__ */ - vm_purgeable_nonvolatile_enqueue(object, owner); - vm_object_unlock(object); + vm_object_lock(object); + vm_purgeable_nonvolatile_enqueue(object, owner); + vm_object_unlock(object); + } } - if (permission & MAP_MEM_LEDGER_TAG_NETWORK) { - /* make this object owned by the calling task */ + if (vmne_kflags.vmnekf_ledger_tag) { + /* + * Bill this object to the current task's + * ledgers for the given tag. + */ + if (vmne_kflags.vmnekf_ledger_no_footprint) { + ledger_flags |= VM_LEDGER_FLAG_NO_FOOTPRINT; + } vm_object_lock(object); - vm_object_ownership_change( + object->vo_ledger_tag = vmne_kflags.vmnekf_ledger_tag; + kr = vm_object_ownership_change( object, - VM_OBJECT_LEDGER_TAG_NETWORK, - current_task(), /* new owner */ + vmne_kflags.vmnekf_ledger_tag, + owner, /* new owner */ + ledger_flags, FALSE); /* task_objq locked? */ vm_object_unlock(object); + if (kr != KERN_SUCCESS) { + vm_object_deallocate(object); + goto make_mem_done; + } } #if CONFIG_SECLUDED_MEMORY @@ -2527,18 +2569,6 @@ mach_make_memory_entry_internal( /* the object has no pages, so no WIMG bits to update here */ - /* - * XXX - * We use this path when we want to make sure that - * nobody messes with the object (coalesce, for - * example) before we map it. - * We might want to use these objects for transposition via - * vm_object_transpose() too, so we don't want any copy or - * shadow objects either... - */ - object->copy_strategy = MEMORY_OBJECT_COPY_NONE; - object->true_share = TRUE; - user_entry->backing.object = object; user_entry->internal = TRUE; user_entry->is_sub_map = FALSE; @@ -3297,10 +3327,11 @@ redo_lookup: } if (parent_entry->is_sub_map) { - user_entry->backing.map = parent_entry->backing.map; - vm_map_lock(user_entry->backing.map); - user_entry->backing.map->map_refcnt++; - vm_map_unlock(user_entry->backing.map); + vm_map_t map = parent_entry->backing.map; + user_entry->backing.map = map; + lck_mtx_lock(&map->s_lock); + os_ref_retain_locked(&map->map_refcnt); + lck_mtx_unlock(&map->s_lock); } else { object = parent_entry->backing.object; assert(object != VM_OBJECT_NULL); @@ -3455,7 +3486,6 @@ mach_memory_entry_allocate( { vm_named_entry_t user_entry; ipc_port_t user_handle; - ipc_port_t previous; user_entry = (vm_named_entry_t) kalloc(sizeof *user_entry); if (user_entry == NULL) { @@ -3465,25 +3495,6 @@ mach_memory_entry_allocate( named_entry_lock_init(user_entry); - user_handle = ipc_port_alloc_kernel(); - if (user_handle == IP_NULL) { - kfree(user_entry, sizeof *user_entry); - return KERN_FAILURE; - } - ip_lock(user_handle); - - /* make a sonce right */ - user_handle->ip_sorights++; - ip_reference(user_handle); - - /* make a send right */ - user_handle->ip_mscount++; - user_handle->ip_srights++; - ip_reference(user_handle); - - ipc_port_nsrequest(user_handle, 1, user_handle, &previous); - /* nsrequest unlocks user_handle */ - user_entry->backing.object = NULL; user_entry->is_sub_map = FALSE; user_entry->is_copy = FALSE; @@ -3494,8 +3505,9 @@ mach_memory_entry_allocate( user_entry->protection = VM_PROT_NONE; user_entry->ref_count = 1; - ipc_kobject_set(user_handle, (ipc_kobject_t) user_entry, - IKOT_NAMED_ENTRY); + user_handle = ipc_kobject_alloc_port((ipc_kobject_t)user_entry, + IKOT_NAMED_ENTRY, + IPC_KOBJECT_ALLOC_MAKE_SEND | IPC_KOBJECT_ALLOC_NSREQUEST); *user_entry_p = user_entry; *user_handle_p = user_handle; @@ -3731,6 +3743,88 @@ memory_entry_access_tracking_internal( return kr; } +kern_return_t +mach_memory_entry_ownership( + ipc_port_t entry_port, + task_t owner, + int ledger_tag, + int ledger_flags) +{ + task_t cur_task; + kern_return_t kr; + vm_named_entry_t mem_entry; + vm_object_t object; + + cur_task = current_task(); + if (cur_task != kernel_task && + (owner != cur_task || + (ledger_flags & VM_LEDGER_FLAG_NO_FOOTPRINT) || + ledger_tag == VM_LEDGER_TAG_NETWORK)) { + /* + * An entitlement is required to: + * + tranfer memory ownership to someone else, + * + request that the memory not count against the footprint, + * + tag as "network" (since that implies "no footprint") + */ + if (!cur_task->task_can_transfer_memory_ownership && + IOTaskHasEntitlement(cur_task, + "com.apple.private.memory.ownership_transfer")) { + cur_task->task_can_transfer_memory_ownership = TRUE; + } + if (!cur_task->task_can_transfer_memory_ownership) { + return KERN_NO_ACCESS; + } + } + + if (ledger_flags & ~VM_LEDGER_FLAGS) { + return KERN_INVALID_ARGUMENT; + } + if (ledger_tag <= 0 || + ledger_tag > VM_LEDGER_TAG_MAX) { + return KERN_INVALID_ARGUMENT; + } + + if (!IP_VALID(entry_port) || + ip_kotype(entry_port) != IKOT_NAMED_ENTRY) { + return KERN_INVALID_ARGUMENT; + } + mem_entry = (vm_named_entry_t) entry_port->ip_kobject; + + named_entry_lock(mem_entry); + + if (mem_entry->is_sub_map || + mem_entry->is_copy) { + named_entry_unlock(mem_entry); + return KERN_INVALID_ARGUMENT; + } + + object = mem_entry->backing.object; + if (object == VM_OBJECT_NULL) { + named_entry_unlock(mem_entry); + return KERN_INVALID_ARGUMENT; + } + + vm_object_lock(object); + + /* check that named entry covers entire object ? */ + if (mem_entry->offset != 0 || object->vo_size != mem_entry->size) { + vm_object_unlock(object); + named_entry_unlock(mem_entry); + return KERN_INVALID_ARGUMENT; + } + + named_entry_unlock(mem_entry); + + kr = vm_object_ownership_change(object, + ledger_tag, + owner, + ledger_flags, + FALSE); /* task_objq_locked */ + vm_object_unlock(object); + + return kr; +} + kern_return_t mach_memory_entry_get_page_counts( ipc_port_t entry_port, diff --git a/osfmk/voucher/Makefile b/osfmk/voucher/Makefile index 534b780ef..46c5051e7 100644 --- a/osfmk/voucher/Makefile +++ b/osfmk/voucher/Makefile @@ -52,7 +52,7 @@ ${MIGINCLUDES} : ${MIG_TYPES} ${MIG_UUHDRS} : \ %.h : %.defs - @echo "$(ColorM)MIG$(Color0) $(ColorF)$@$(Color0)" + $(call makelog,$(ColorM)MIG$(Color0) $(ColorF)$@$(Color0)) $(_v)$(MIG) $(MIGFLAGS) \ -server /dev/null \ -user /dev/null \ @@ -61,7 +61,7 @@ ${MIG_UUHDRS} : \ ${MIG_USHDRS} : \ %_server.h : %.defs - @echo "$(ColorM)MIG$(Color0) $(ColorF)$@$(Color0)" + $(call makelog,$(ColorM)MIG$(Color0) $(ColorF)$@$(Color0)) $(_v)$(MIG) $(MIGFLAGS) \ -server /dev/null \ -user /dev/null \ @@ -97,7 +97,7 @@ ${COMP_FILES} : ${MIG_TYPES} ${MIG_KUSRC} : \ %_user.c : %.defs - @echo "$(ColorM)MIG$(Color0) $(ColorF)$@$(Color0)" + $(call makelog,$(ColorM)MIG$(Color0) $(ColorF)$@$(Color0)) $(_v)${MIG} ${MIGFLAGS} ${MIGKUFLAGS} \ -user $*_user.c \ -header $*.h \ @@ -107,7 +107,7 @@ ${MIG_KUSRC} : \ ${MIG_KSSRC}: \ %_server.c : %.defs - @echo "$(ColorM)MIG$(Color0) $(ColorF)$@$(Color0)" + $(call makelog,$(ColorM)MIG$(Color0) $(ColorF)$@$(Color0)) $(_v)${MIG} ${MIGFLAGS} ${MIGKSFLAGS} \ -user /dev/null \ -header /dev/null \ diff --git a/osfmk/voucher/ipc_pthread_priority.c b/osfmk/voucher/ipc_pthread_priority.c index 28d3f54b3..22118c325 100644 --- a/osfmk/voucher/ipc_pthread_priority.c +++ b/osfmk/voucher/ipc_pthread_priority.c @@ -97,7 +97,7 @@ ipc_pthread_priority_release(ipc_voucher_attr_manager_t __assert_only manager); /* * communication channel from voucher system to IPC_PTHREAD_PRIORITY */ -struct ipc_voucher_attr_manager ipc_pthread_priority_manager = { +const struct ipc_voucher_attr_manager ipc_pthread_priority_manager = { .ivam_release_value = ipc_pthread_priority_release_value, .ivam_get_value = ipc_pthread_priority_get_value, .ivam_extract_content = ipc_pthread_priority_extract_content, diff --git a/osfmk/x86_64/copyio.c b/osfmk/x86_64/copyio.c index 557fae0ec..5482ff39b 100644 --- a/osfmk/x86_64/copyio.c +++ b/osfmk/x86_64/copyio.c @@ -31,6 +31,7 @@ #include #include #include +#include #include #include #include @@ -79,7 +80,10 @@ const int copysize_limit_panic = (64 * MB); */ extern int _bcopy(const void *, void *, vm_size_t); extern int _bcopystr(const void *, void *, vm_size_t, vm_size_t *); -extern int _copyin_word(const char *src, uint64_t *dst, vm_size_t len); +extern int _copyin_atomic32(const char *src, uint32_t *dst); +extern int _copyin_atomic64(const char *src, uint64_t *dst); +extern int _copyout_atomic32(const uint32_t *u32, char *src); +extern int _copyout_atomic64(const uint64_t *u64, char *src); /* On by default, optionally disabled by boot-arg */ extern boolean_t copyio_zalloc_check; @@ -92,7 +96,10 @@ extern boolean_t copyio_zalloc_check; #define COPYINSTR 2 /* string variant of copyout */ #define COPYINPHYS 3 /* from user virtual to kernel physical */ #define COPYOUTPHYS 4 /* from kernel physical to user virtual */ -#define COPYINWORD 5 /* from user virtual to kernel virtual */ +#define COPYINATOMIC32 5 /* from user virtual to kernel virtual */ +#define COPYINATOMIC64 6 /* from user virtual to kernel virtual */ +#define COPYOUTATOMIC32 7 /* from user virtual to kernel virtual */ +#define COPYOUTATOMIC64 8 /* from user virtual to kernel virtual */ #if ENABLE_SMAPLOG typedef struct { @@ -210,11 +217,27 @@ copyio(int copy_type, user_addr_t user_addr, char *kernel_addr, goto out; } + if (copy_type >= COPYINATOMIC32 && copy_type <= COPYOUTATOMIC64) { + if (__improbable(pmap == kernel_pmap)) { + error = EFAULT; + goto out; + } + } + #if KASAN - if (copy_type == COPYIN || copy_type == COPYINSTR || copy_type == COPYINWORD) { + switch (copy_type) { + case COPYIN: + case COPYINSTR: + case COPYINATOMIC32: + case COPYINATOMIC64: __asan_storeN((uptr)kernel_addr, nbytes); - } else if (copy_type == COPYOUT) { + break; + case COPYOUT: + case COPYOUTATOMIC32: + case COPYOUTATOMIC64: __asan_loadN((uptr)kernel_addr, nbytes); + kasan_check_uninitialized((vm_address_t)kernel_addr, nbytes); + break; } #endif @@ -288,10 +311,24 @@ copyio(int copy_type, user_addr_t user_addr, char *kernel_addr, nbytes); break; - case COPYINWORD: - error = _copyin_word((const void *) user_addr, - (void *) kernel_addr, - nbytes); + case COPYINATOMIC32: + error = _copyin_atomic32((const void *) user_addr, + (void *) kernel_addr); + break; + + case COPYINATOMIC64: + error = _copyin_atomic64((const void *) user_addr, + (void *) kernel_addr); + break; + + case COPYOUTATOMIC32: + error = _copyout_atomic32((const void *) kernel_addr, + (void *) user_addr); + break; + + case COPYOUTATOMIC64: + error = _copyout_atomic64((const void *) kernel_addr, + (void *) user_addr); break; case COPYINSTR: @@ -395,23 +432,63 @@ copyin(const user_addr_t user_addr, void *kernel_addr, vm_size_t nbytes) } /* - * copyin_word - * Read an aligned value from userspace as a single memory transaction. - * This function supports userspace synchronization features + * copy{in,out}_atomic{32,64} + * Read or store an aligned value from userspace as a single memory transaction. + * These functions support userspace synchronization features */ int -copyin_word(const user_addr_t user_addr, uint64_t *kernel_addr, vm_size_t nbytes) +copyin_atomic32(const user_addr_t user_addr, uint32_t *kernel_addr) +{ + /* Test alignment */ + if (user_addr & 3) { + return EINVAL; + } + return copyio(COPYINATOMIC32, user_addr, (char *)(uintptr_t)kernel_addr, 4, NULL, 0); +} + +int +copyin_atomic32_wait_if_equals(const user_addr_t user_addr, uint32_t value) +{ + uint32_t u32; + int result = copyin_atomic32(user_addr, &u32); + if (__improbable(result)) { + return result; + } + if (u32 != value) { + return ESTALE; + } + cpu_pause(); + return 0; +} + +int +copyin_atomic64(const user_addr_t user_addr, uint64_t *kernel_addr) +{ + /* Test alignment */ + if (user_addr & 7) { + return EINVAL; + } + return copyio(COPYINATOMIC64, user_addr, (char *)(uintptr_t)kernel_addr, 8, NULL, 0); +} + +int +copyout_atomic32(uint32_t value, user_addr_t user_addr) { - /* Verify sizes */ - if ((nbytes != 4) && (nbytes != 8)) { + /* Test alignment */ + if (user_addr & 3) { return EINVAL; } + return copyio(COPYOUTATOMIC32, user_addr, (char *)&value, 4, NULL, 0); +} +int +copyout_atomic64(uint64_t value, user_addr_t user_addr) +{ /* Test alignment */ - if (user_addr & (nbytes - 1)) { + if (user_addr & 7) { return EINVAL; } - return copyio(COPYINWORD, user_addr, (char *)(uintptr_t)kernel_addr, nbytes, NULL, 0); + return copyio(COPYOUTATOMIC64, user_addr, (char *)&value, 8, NULL, 0); } int diff --git a/osfmk/x86_64/cswitch.s b/osfmk/x86_64/cswitch.s index cb72f459e..e09e1e179 100644 --- a/osfmk/x86_64/cswitch.s +++ b/osfmk/x86_64/cswitch.s @@ -61,25 +61,45 @@ #include #include +/* + * void Load_context( + * thread_t thread) // %rdi + * + * Loads the first thread context to run on a CPU, + * i.e. without switching from a previous thread. + * + * returns 'old' thread in %rax (which is always NULL) + */ Entry(Load_context) - movq TH_KERNEL_STACK(%rdi),%rcx /* get kernel stack */ - leaq -IKS_SIZE(%rcx),%rdx - addq EXT(kernel_stack_size)(%rip),%rdx /* point to stack top */ - movq %rcx,%gs:CPU_ACTIVE_STACK /* store stack address */ - movq %rdx,%gs:CPU_KERNEL_STACK /* store stack top */ + movq %rdi, %rdx /* move thread arg to rdx */ - movq %rdx,%rsp - xorl %ebp, %ebp + movq %rdx,%gs:CPU_ACTIVE_THREAD /* new thread is active */ + movq TH_KERNEL_STACK(%rdx),%rdx /* get its kernel stack */ + lea -IKS_SIZE(%rdx),%rcx + add EXT(kernel_stack_size)(%rip),%rcx /* point to stack top */ + + movq %rdx,%gs:CPU_ACTIVE_STACK /* set current stack */ + movq %rcx,%gs:CPU_KERNEL_STACK /* set stack top */ - xorl %edi,%edi /* return zero (no old thread) */ - call EXT(thread_continue) + movq KSS_RSP(%rcx),%rsp /* switch stacks */ + movq KSS_RBX(%rcx),%rbx /* restore registers */ + movq KSS_RBP(%rcx),%rbp + movq KSS_R12(%rcx),%r12 + movq KSS_R13(%rcx),%r13 + movq KSS_R14(%rcx),%r14 + movq KSS_R15(%rcx),%r15 + xorl %eax, %eax /* set return value to zero (no old thread) */ + + jmp *KSS_RIP(%rcx) /* return old thread */ /* * thread_t Switch_context( - * thread_t old, // %rsi - * thread_continue_t continuation, // %rdi + * thread_t old, // %rdi + * thread_continue_t continuation, // %rsi * thread_t new) // %rdx + * + * returns 'old' thread in %rax */ Entry(Switch_context) popq %rax /* pop return PC */ @@ -114,14 +134,21 @@ Entry(Switch_context) movq KSS_R13(%rcx),%r13 movq KSS_R14(%rcx),%r14 movq KSS_R15(%rcx),%r15 - jmp *KSS_RIP(%rcx) /* return old thread */ - + jmp *KSS_RIP(%rcx) /* return old thread in %rax */ +/* + * machine_stack_attach sets this as the RIP of newly-attached stacks + * %rbx is the C routine to call + * %rax is the parameter to pass to the C routine + * + * This stub is needed to convert the return value of the old thread from Switch_context + * in %rax into a parameter to thread_continue passed in %rdi, because using the + * same register for the first argument and first retval makes too much sense for the SysV ABI. + */ Entry(Thread_continue) movq %rax, %rdi /* this is the old thread from Switch_context */ - xorq %rbp,%rbp /* zero frame pointer */ call *%rbx /* call real continuation */ - + int3 /* (should never return) */ /* * thread_t Shutdown_context( @@ -131,9 +158,7 @@ Entry(Thread_continue) * * saves the kernel context of the thread, * switches to the interrupt stack, - * continues the thread (with thread_continue), * then runs routine on the interrupt stack. - * */ Entry(Shutdown_context) movq %gs:CPU_KERNEL_STACK,%rcx /* get old kernel stack top */ @@ -143,7 +168,8 @@ Entry(Shutdown_context) movq %r13,KSS_R13(%rcx) movq %r14,KSS_R14(%rcx) movq %r15,KSS_R15(%rcx) - popq KSS_RIP(%rcx) /* save return PC */ + popq %r8 /* extract return PC */ + movq %r8,KSS_RIP(%rcx) /* save return PC */ movq %rsp,KSS_RSP(%rcx) /* save SP */ movq %gs:CPU_ACTIVE_STACK,%rcx /* get old kernel stack */ @@ -155,7 +181,12 @@ Entry(Shutdown_context) movq %rsp, %gs:CPU_ACTIVE_STACK movq EXT(kernel_stack_size)(%rip),%rcx /* point to stack top */ subq %rcx, %gs:CPU_ACTIVE_STACK + + pushq %r8 /* set up a call frame on new stack */ + pushq %rbp + movq %rsp, %rbp + movq %rdx,%rdi /* processor arg to routine */ call *%rsi /* call routine to run */ - hlt /* (should never return) */ + int3 /* (should never return) */ diff --git a/osfmk/x86_64/idt64.s b/osfmk/x86_64/idt64.s index d17bb5bb7..d54c1c095 100644 --- a/osfmk/x86_64/idt64.s +++ b/osfmk/x86_64/idt64.s @@ -87,7 +87,7 @@ EXT(idt64_hndl_table0): /* 0x00 */ .quad EXT(ks_dispatch) /* 0x08 */ .quad EXT(ks_64bit_return) /* 0x10 */ .quad 0 /* Populated with CPU shadow displacement*/ -/* 0x18 */ .quad EXT(ks_return) +/* 0x18 */ .quad EXT(ks_32bit_return) #define TBL0_OFF_DISP_USER_WITH_POPRAX 0x20 /* 0x20 */ .quad EXT(ks_dispatch_user_with_pop_rax) #define TBL0_OFF_DISP_KERN_WITH_POPRAX 0x28 @@ -244,9 +244,14 @@ Entry(idt64_mdep_scall) * PCB stack and then dispatch as normal. * For faults in kernel-space, we need to scrub for kernel exit faults and * treat these as user-space faults. But for all other kernel-space faults - * we continue to run on the IST1 stack and we dispatch to handle the fault + * we continue to run on the IST1 stack as we dispatch to handle the fault * as fatal. */ +Entry(idt64_segnp) + pushq $(HNDL_ALLTRAPS) + pushq $(T_SEGMENT_NOT_PRESENT) + jmp L_check_for_kern_flt + Entry(idt64_gen_prot) pushq $(HNDL_ALLTRAPS) pushq $(T_GENERAL_PROTECTION) @@ -267,6 +272,16 @@ L_check_for_kern_flt: pushq %rax testb $3, 8+ISF64_CS(%rsp) jnz L_dispatch_from_user_no_push_rax /* Fault from user, go straight to dispatch */ + + /* Check if the fault occurred in the 32-bit segment restoration window (which executes with user gsb) */ + leaq L_32bit_seg_restore_begin(%rip), %rax + cmpq %rax, 8+ISF64_RIP(%rsp) + jb L_not_32bit_segrestores + leaq L_32bit_seg_restore_done(%rip), %rax + cmpq %rax, 8+ISF64_RIP(%rsp) + jae L_not_32bit_segrestores + jmp 1f +L_not_32bit_segrestores: leaq EXT(ret32_iret)(%rip), %rax cmpq %rax, 8+ISF64_RIP(%rsp) je 1f @@ -309,6 +324,7 @@ L_check_for_kern_flt: /* * Fix the stack so the original trap frame is current, then jump to dispatch */ + movq %rax, 16+ISF64_CS(%rsp) movq ISF64_RSP-24(%rbx), %rax @@ -326,10 +342,6 @@ L_check_for_kern_flt: popq %rbx jmp L_dispatch_from_user_no_push_rax -Entry(idt64_segnp) - pushq $(HNDL_ALLTRAPS) - pushq $(T_SEGMENT_NOT_PRESENT) - jmp L_dispatch /* * Fatal exception handlers: @@ -551,6 +563,7 @@ L_dispatch_from_kernel_no_push_rax: jmp *(%rax) /* User return: register restoration and address space switch sequence */ Entry(ks_64bit_return) + mov R64_R14(%r15), %r14 mov R64_R13(%r15), %r13 mov R64_R12(%r15), %r12 @@ -810,6 +823,176 @@ L_64bit_entry_reject: movq $(T_INVALID_OPCODE), 8+ISF64_TRAPNO(%rsp) jmp L_dispatch_kgsb +Entry(ks_32bit_return) + + /* Validate CS/DS/ES/FS/GS segment selectors with the Load Access Rights instruction prior to restoration */ + /* Exempt "known good" statically configured selectors, e.g. USER_CS, USER_DS and 0 */ + cmpl $(USER_CS), R32_CS(%r15) + jz 11f + larw R32_CS(%r15), %ax + jnz L_32_reset_cs + /* Ensure that the segment referenced by CS in the saved state is a code segment (bit 11 == 1) */ + testw $0x800, %ax + jz L_32_reset_cs /* Update stored %cs with known-good selector if ZF == 1 */ + jmp 11f +L_32_reset_cs: + movl $(USER_CS), R32_CS(%r15) +11: + cmpl $(USER_DS), R32_DS(%r15) + jz 22f + cmpl $0, R32_DS(%r15) + jz 22f + larw R32_DS(%r15), %ax + jz 22f + movl $(USER_DS), R32_DS(%r15) +22: + cmpl $(USER_DS), R32_ES(%r15) + jz 33f + cmpl $0, R32_ES(%r15) + jz 33f + larw R32_ES(%r15), %ax + jz 33f + movl $(USER_DS), R32_ES(%r15) +33: + cmpl $(USER_DS), R32_FS(%r15) + jz 44f + cmpl $0, R32_FS(%r15) + jz 44f + larw R32_FS(%r15), %ax + jz 44f + movl $(USER_DS), R32_FS(%r15) +44: + cmpl $(USER_CTHREAD), R32_GS(%r15) + jz 55f + cmpl $0, R32_GS(%r15) + jz 55f + larw R32_GS(%r15), %ax + jz 55f + movl $(USER_CTHREAD), R32_GS(%r15) +55: + + /* + * Restore general 32-bit registers + */ + movl R32_EAX(%r15), %eax + movl R32_EBX(%r15), %ebx + movl R32_ECX(%r15), %ecx + movl R32_EDX(%r15), %edx + movl R32_EBP(%r15), %ebp + movl R32_ESI(%r15), %esi + movl R32_EDI(%r15), %edi + movl R32_DS(%r15), %r8d + movl R32_ES(%r15), %r9d + movl R32_FS(%r15), %r10d + movl R32_GS(%r15), %r11d + + /* Switch to the per-cpu (doublemapped) exception stack */ + mov %gs:CPU_ESTACK, %rsp + + /* Now transfer the ISF to the exception stack in preparation for iret, below */ + movl R32_SS(%r15), %r12d + push %r12 + movl R32_UESP(%r15), %r12d + push %r12 + movl R32_EFLAGS(%r15), %r12d + push %r12 + movl R32_CS(%r15), %r12d + push %r12 + movl R32_EIP(%r15), %r12d + push %r12 + + movl %gs:CPU_NEED_SEGCHK, %r14d /* %r14 will be zeroed just before we return */ + + /* + * Finally, switch to the user pagetables. After this, all %gs-relative + * accesses MUST be to cpu shadow data ONLY. Note that after we restore %gs + * (after the swapgs), no %gs-relative accesses should be performed. + */ + /* Discover user cr3/ASID */ + mov %gs:CPU_UCR3, %r13 +#if DEBUG + mov %r13, %gs:CPU_EXIT_CR3 +#endif + mov %r13, %cr3 + + swapgs + + /* + * Restore segment registers. A #GP taken here will push state onto IST1, + * not the exception stack. Note that the placement of the labels here + * corresponds to the fault address-detection logic (so do not change them + * without also changing that code). + */ +L_32bit_seg_restore_begin: + mov %r8, %ds + mov %r9, %es + mov %r10, %fs + mov %r11, %gs +L_32bit_seg_restore_done: + + /* Zero 64-bit-exclusive GPRs to prevent data leaks */ + xor %r8, %r8 + xor %r9, %r9 + xor %r10, %r10 + xor %r11, %r11 + xor %r12, %r12 + xor %r13, %r13 + xor %r15, %r15 + + /* + * At this point, the stack contains: + * + * +--------------+ + * | Return SS | +32 + * | Return RSP | +24 + * | Return RFL | +16 + * | Return CS | +8 + * | Return RIP | <-- rsp + * +--------------+ + */ + + cmpl $(SYSENTER_CS), 8(%rsp) + /* test for sysexit */ + je L_rtu_via_sysexit + + cmpl $1, %r14d + je L_verw_island + +L_after_verw: + xor %r14, %r14 + +.globl EXT(ret32_iret) +EXT(ret32_iret): + iretq /* return from interrupt */ + +L_verw_island: + verw 32(%rsp) + jmp L_after_verw + +L_verw_island_1: + verw 16(%rsp) + jmp L_after_verw_1 + +L_rtu_via_sysexit: + pop %rdx /* user return eip */ + pop %rcx /* pop and toss cs */ + andl $(~EFL_IF), (%rsp) /* clear interrupts enable, sti below */ + + /* + * %ss is now at 16(%rsp) + */ + cmpl $1, %r14d + je L_verw_island_1 +L_after_verw_1: + xor %r14, %r14 + + popf /* flags - carry denotes failure */ + pop %rcx /* user return esp */ + + + sti /* interrupts enabled after sysexit */ + sysexitl /* 32-bit sysexit */ + /* End of double-mapped TEXT */ .text @@ -842,9 +1025,6 @@ Entry(ks_dispatch_user_with_pop_rax) pop %rax jmp EXT(ks_dispatch_user) -Entry (ks_return) - jmp . - Entry(ks_dispatch_user) cmpl $(TASK_MAP_32BIT), %gs:CPU_TASK_MAP je L_dispatch_U32 /* 32-bit user task */ @@ -1086,7 +1266,15 @@ L_cr3_switch_return: movq $0, %gs:CPU_DR7 4: cmpl $(SS_64), SS_FLAVOR(%r15) /* 64-bit state? */ - je L_64bit_return + jne L_32bit_return + + /* + * Restore general 64-bit registers. + * Here on fault stack and PCB address in R15. + */ + leaq EXT(idt64_hndl_table0)(%rip), %rax + jmp *8(%rax) + L_32bit_return: #if DEBUG_IDT64 @@ -1098,155 +1286,9 @@ L_32bit_return: 1: #endif /* DEBUG_IDT64 */ - /* - * Restore registers into the machine state for iret. - * Here on fault stack and PCB address in R11. - */ - movl R32_EIP(%r15), %eax - movl %eax, R64_RIP(%r15) - movl R32_EFLAGS(%r15), %eax - movl %eax, R64_RFLAGS(%r15) - movl R32_CS(%r15), %eax - movl %eax, R64_CS(%r15) - movl R32_UESP(%r15), %eax - movl %eax, R64_RSP(%r15) - movl R32_SS(%r15), %eax - movl %eax, R64_SS(%r15) - - /* Validate CS/DS/ES/FS/GS segment selectors with the Load Access Rights instruction prior to restoration */ - /* Exempt "known good" statically configured selectors, e.g. USER_CS, USER_DS and 0 */ - cmpl $(USER_CS), R32_CS(%r15) - jz 11f - larw R32_CS(%r15), %ax - jnz L_32_reset_cs - /* Ensure that the segment referenced by CS in the saved state is a code segment (bit 11 == 1) */ - testw $0x800, %ax - jz L_32_reset_cs /* Update stored %cs with known-good selector if ZF == 1 */ - jmp 11f -L_32_reset_cs: - movl $(USER_CS), R32_CS(%r15) -11: - cmpl $(USER_DS), R32_DS(%r15) - jz 22f - cmpl $0, R32_DS(%r15) - jz 22f - larw R32_DS(%r15), %ax - jz 22f - movl $(USER_DS), R32_DS(%r15) -22: - cmpl $(USER_DS), R32_ES(%r15) - jz 33f - cmpl $0, R32_ES(%r15) - jz 33f - larw R32_ES(%r15), %ax - jz 33f - movl $(USER_DS), R32_ES(%r15) -33: - cmpl $(USER_DS), R32_FS(%r15) - jz 44f - cmpl $0, R32_FS(%r15) - jz 44f - larw R32_FS(%r15), %ax - jz 44f - movl $(USER_DS), R32_FS(%r15) -44: - cmpl $(USER_CTHREAD), R32_GS(%r15) - jz 55f - cmpl $0, R32_GS(%r15) - jz 55f - larw R32_GS(%r15), %ax - jz 55f - movl $(USER_CTHREAD), R32_GS(%r15) -55: - /* - * Restore general 32-bit registers - */ - movl R32_EAX(%r15), %eax - movl R32_EBX(%r15), %ebx - movl R32_ECX(%r15), %ecx - movl R32_EDX(%r15), %edx - movl R32_EBP(%r15), %ebp - movl R32_ESI(%r15), %esi - movl R32_EDI(%r15), %edi - - /* - * Restore segment registers. A segment exception taken here will - * push state on the IST1 stack and will not affect the "PCB stack". - */ - mov %r15, %rsp /* Set the PCB as the stack */ - movl %gs:CPU_NEED_SEGCHK, %r14d /* %r14 will be restored below */ - swapgs - - /* Zero 64-bit-exclusive GPRs to prevent data leaks */ - xor %r8, %r8 - xor %r9, %r9 - xor %r10, %r10 - xor %r11, %r11 - xor %r12, %r12 - xor %r13, %r13 - xor %r15, %r15 - - movw R32_DS(%rsp), %ds - movw R32_ES(%rsp), %es - movw R32_FS(%rsp), %fs - movw R32_GS(%rsp), %gs - - /* pop compat frame + trapno, trapfn and error */ - add $(ISS64_OFFSET)+8+8+8, %rsp - - /* - * At this point, the stack contains: - * - * +--------------+ - * | Return SS | +32 - * | Return RSP | +24 - * | Return RFL | +16 - * | Return CS | +8 - * | Return RIP | <-- rsp - * +--------------+ - */ - - cmpl $(SYSENTER_CS), 8(%rsp) - /* test for sysexit */ - je L_rtu_via_sysexit - - cmpl $1, %r14d - je L_verw_island - -L_after_verw: - xor %r14, %r14 - -.globl EXT(ret32_iret) -EXT(ret32_iret): - iretq /* return from interrupt */ - -L_verw_island: - verw 32(%rsp) - jmp L_after_verw - -L_verw_island_1: - verw 16(%rsp) - jmp L_after_verw_1 - -L_rtu_via_sysexit: - pop %rdx /* user return eip */ - pop %rcx /* pop and toss cs */ - andl $(~EFL_IF), (%rsp) /* clear interrupts enable, sti below */ - - /* - * %ss is now at 16(%rsp) - */ - cmpl $1, %r14d - je L_verw_island_1 -L_after_verw_1: - xor %r14, %r14 - - popf /* flags - carry denotes failure */ - pop %rcx /* user return esp */ - + leaq EXT(idt64_hndl_table0)(%rip), %rax + jmp *0x18(%rax) - sti /* interrupts enabled after sysexit */ - sysexitl /* 32-bit sysexit */ L_dr_restore_island: movq TH_PCB_IDS(%rdx),%rax /* Obtain this thread's debug state */ @@ -1298,8 +1340,6 @@ ret_to_kernel: hlt 2: #endif - -L_64bit_return: /* * Restore general 64-bit registers. * Here on fault stack and PCB address in R15. diff --git a/osfmk/x86_64/kpc_x86.c b/osfmk/x86_64/kpc_x86.c index da2ccc40d..ce27db8dd 100644 --- a/osfmk/x86_64/kpc_x86.c +++ b/osfmk/x86_64/kpc_x86.c @@ -227,7 +227,7 @@ kpc_reload_configurable(int ctr) return old; } -void kpc_pmi_handler(x86_saved_state_t *state); +void kpc_pmi_handler(void); static void set_running_fixed(boolean_t on) @@ -470,7 +470,7 @@ kpc_get_curcpu_counters_mp_call(void *args) r = kpc_get_curcpu_counters(handler->classes, NULL, &handler->buf[offset]); /* number of counters added by this CPU, needs to be atomic */ - hw_atomic_add(&(handler->nb_counters), r); + os_atomic_add(&(handler->nb_counters), r, relaxed); } int @@ -632,7 +632,7 @@ kpc_set_config_arch(struct kpc_config_remote *mp_config) /* PMI stuff */ void -kpc_pmi_handler(__unused x86_saved_state_t *state) +kpc_pmi_handler(void) { uint64_t status, extra; uint32_t ctr; diff --git a/osfmk/x86_64/locore.s b/osfmk/x86_64/locore.s index d88f2a08a..4c71fd461 100644 --- a/osfmk/x86_64/locore.s +++ b/osfmk/x86_64/locore.s @@ -319,35 +319,85 @@ _bcopystr_fail: ret /* - * Copyin 32 or 64 bit aligned word as a single transaction + * Copyin 32 bit aligned word as a single transaction * rdi: source address (user) * rsi: destination address (kernel) - * rdx: size (4 or 8) */ -Entry(_copyin_word) +Entry(_copyin_atomic32) pushq %rbp /* Save registers */ movq %rsp, %rbp - cmpl $0x4, %edx /* If size = 4 */ - je L_copyin_word_4 /* handle 32-bit load */ - movl $(EINVAL), %eax /* Set up error status */ - cmpl $0x8, %edx /* If size != 8 */ - jne L_copyin_word_exit /* exit with error */ RECOVERY_SECTION - RECOVER(L_copyin_word_fail) /* Set up recovery handler for next instruction*/ - movq (%rdi), %rax /* Load quad from user */ - jmp L_copyin_word_store -L_copyin_word_4: - RECOVERY_SECTION - RECOVER(L_copyin_word_fail) /* Set up recovery handler for next instruction */ + RECOVER(L_copyin_atomic32_fail) /* Set up recovery handler for next instruction */ movl (%rdi), %eax /* Load long from user */ -L_copyin_word_store: + movl %eax, (%rsi) /* Store to kernel */ + xorl %eax, %eax /* Return success */ + popq %rbp /* Restore registers */ + retq /* Return */ + +L_copyin_atomic32_fail: + movl $(EFAULT), %eax /* Return error for failure */ + popq %rbp /* Restore registers */ + retq /* Return */ + +/* + * Copyin 64 bit aligned word as a single transaction + * rdi: source address (user) + * rsi: destination address (kernel) + */ +Entry(_copyin_atomic64) + pushq %rbp /* Save registers */ + movq %rsp, %rbp + RECOVERY_SECTION + RECOVER(L_copyin_atomic64_fail) /* Set up recovery handler for next instruction*/ + movq (%rdi), %rax /* Load quad from user */ movq %rax, (%rsi) /* Store to kernel */ xorl %eax, %eax /* Return success */ -L_copyin_word_exit: popq %rbp /* Restore registers */ retq /* Return */ -L_copyin_word_fail: +L_copyin_atomic64_fail: + movl $(EFAULT), %eax /* Return error for failure */ + popq %rbp /* Restore registers */ + retq /* Return */ + +/* + * Copyin 32 bit aligned word as a single transaction + * rdi: source address (kernel) + * rsi: destination address (user) + */ +Entry(_copyout_atomic32) + pushq %rbp /* Save registers */ + movq %rsp, %rbp + movl (%rdi), %eax /* Load long from kernel */ + RECOVERY_SECTION + RECOVER(L_copyout_atomic32_fail) /* Set up recovery handler for next instruction*/ + movl %eax, (%rsi) /* Store long to user */ + xorl %eax, %eax /* Return success */ + popq %rbp /* Restore registers */ + retq /* Return */ + +L_copyout_atomic32_fail: + movl $(EFAULT), %eax /* Return error for failure */ + popq %rbp /* Restore registers */ + retq /* Return */ + +/* + * Copyin 64 bit aligned word as a single transaction + * rdi: source address (kernel) + * rsi: destination address (user) + */ +Entry(_copyout_atomic64) + pushq %rbp /* Save registers */ + movq %rsp, %rbp + movq (%rdi), %rax /* Load quad from kernel */ + RECOVERY_SECTION + RECOVER(L_copyout_atomic64_fail) /* Set up recovery handler for next instruction*/ + movq %rax, (%rsi) /* Store quad to user */ + xorl %eax, %eax /* Return success */ + popq %rbp /* Restore registers */ + retq /* Return */ + +L_copyout_atomic64_fail: movl $(EFAULT), %eax /* Return error for failure */ popq %rbp /* Restore registers */ retq /* Return */ diff --git a/osfmk/x86_64/loose_ends.c b/osfmk/x86_64/loose_ends.c index cb63ffcad..807ecfc52 100644 --- a/osfmk/x86_64/loose_ends.c +++ b/osfmk/x86_64/loose_ends.c @@ -912,14 +912,10 @@ void fillPage(ppnum_t pa, unsigned int fill) { uint64_t src; - int i; int cnt = PAGE_SIZE / sizeof(unsigned int); - unsigned int *addr; src = i386_ptob(pa); - for (i = 0, addr = (unsigned int *)PHYSMAP_PTOV(src); i < cnt; i++) { - *addr++ = fill; - } + memset_word((int *)PHYSMAP_PTOV(src), fill, cnt); } static inline void diff --git a/osfmk/x86_64/machine_remote_time.c b/osfmk/x86_64/machine_remote_time.c index 3c834c041..6172f3181 100644 --- a/osfmk/x86_64/machine_remote_time.c +++ b/osfmk/x86_64/machine_remote_time.c @@ -27,7 +27,7 @@ */ #include #include -#include +#include #include #include @@ -55,10 +55,10 @@ mach_bridge_register_regwrite_timestamp_callback(mach_bridge_regwrite_timestamp_ { static uint64_t delay_amount = 0; - if (!atomic_load(&bt_init_flag)) { + if (!os_atomic_load(&bt_init_flag, relaxed)) { mach_bridge_timer_init(); nanoseconds_to_absolutetime(DELAY_INTERVAL_NS, &delay_amount); - bt_init_flag = 1; + os_atomic_store(&bt_init_flag, 1, release); } lck_spin_lock(bt_maintenance_lock); diff --git a/osfmk/x86_64/monotonic_x86_64.c b/osfmk/x86_64/monotonic_x86_64.c index 9a69f0805..b182f653a 100644 --- a/osfmk/x86_64/monotonic_x86_64.c +++ b/osfmk/x86_64/monotonic_x86_64.c @@ -158,17 +158,33 @@ mt_core_set_snap(unsigned int ctr, uint64_t count) #define GLOBAL_OVF 0x390 +static void mt_check_for_pmi(struct mt_cpu *mtc, x86_saved_state_t *state); + +static void +enable_counters(void) +{ + wrmsr64(FIXED_CTR_CTRL, FIXED_CTR_CTRL_INIT | FIXED_CTR_CTRL_ENABLE); + wrmsr64(GLOBAL_CTRL, GLOBAL_CTRL_FIXED_EN); +} + +static void +disable_counters(void) +{ + wrmsr64(GLOBAL_CTRL, 0); +} + static void core_down(cpu_data_t *cpu) { if (!mt_core_supported) { return; } - assert(ml_get_interrupts_enabled() == FALSE); + struct mt_cpu *mtc = &cpu->cpu_monotonic; - wrmsr64(GLOBAL_CTRL, 0); - mt_mtc_update_fixed_counts(&cpu->cpu_monotonic, NULL, NULL); + disable_counters(); + mt_mtc_update_fixed_counts(mtc, NULL, NULL); + mtc->mtc_active = false; } static void @@ -187,8 +203,8 @@ core_up(cpu_data_t *cpu) for (int i = 0; i < MT_CORE_NFIXED; i++) { mt_core_set_snap(i, mtc->mtc_snaps[i]); } - wrmsr64(FIXED_CTR_CTRL, FIXED_CTR_CTRL_INIT | FIXED_CTR_CTRL_ENABLE); - wrmsr64(GLOBAL_CTRL, GLOBAL_CTRL_FIXED_EN); + enable_counters(); + mtc->mtc_active = true; } void @@ -206,17 +222,27 @@ mt_cpu_up(cpu_data_t *cpu) ml_set_interrupts_enabled(intrs_en); } -static int -mt_pmi_x86_64(x86_saved_state_t *state) +uint64_t +mt_count_pmis(void) { - uint64_t status; - struct mt_cpu *mtc; + uint64_t npmis = 0; + for (unsigned int i = 0; i < real_ncpus; i++) { + cpu_data_t *cpu = cpu_data_ptr[i]; + npmis += cpu->cpu_monotonic.mtc_npmis; + } + return npmis; +} - assert(ml_get_interrupts_enabled() == FALSE); - mtc = mt_cur_cpu(); - status = rdmsr64(GLOBAL_STATUS); +static void +mt_check_for_pmi(struct mt_cpu *mtc, x86_saved_state_t *state) +{ + uint64_t status = rdmsr64(GLOBAL_STATUS); + + mtc->mtc_npmis += 1; - (void)atomic_fetch_add_explicit(&mt_pmis, 1, memory_order_relaxed); + if (mtc->mtc_active) { + disable_counters(); + } for (unsigned int i = 0; i < MT_CORE_NFIXED; i++) { if (status & CTR_FIX_POS(i)) { @@ -228,8 +254,11 @@ mt_pmi_x86_64(x86_saved_state_t *state) mtc->mtc_counts[i] += delta; if (mt_microstackshots && mt_microstackshot_ctr == i) { - x86_saved_state64_t *state64 = saved_state64(state); - bool user_mode = (state64->isf.cs & 0x3) ? true : false; + bool user_mode = false; + if (state) { + x86_saved_state64_t *state64 = saved_state64(state); + user_mode = (state64->isf.cs & 0x3) != 0; + } KDBG_RELEASE(KDBG_EVENTID(DBG_MONOTONIC, DBG_MT_DEBUG, 1), mt_microstackshot_ctr, user_mode); mt_microstackshot_pmi_handler(user_mode, mt_microstackshot_ctx); @@ -245,9 +274,20 @@ mt_pmi_x86_64(x86_saved_state_t *state) /* if any of the configurable counters overflowed, tell kpc */ if (status & ((UINT64_C(1) << 4) - 1)) { - extern void kpc_pmi_handler(x86_saved_state_t *state); - kpc_pmi_handler(state); + extern void kpc_pmi_handler(void); + kpc_pmi_handler(); + } + + if (mtc->mtc_active) { + enable_counters(); } +} + +static int +mt_pmi_x86_64(x86_saved_state_t *state) +{ + assert(ml_get_interrupts_enabled() == FALSE); + mt_check_for_pmi(mt_cur_cpu(), state); return 0; } @@ -290,6 +330,9 @@ mt_microstackshot_start_arch(uint64_t period) void mt_early_init(void) { + if (PE_parse_boot_argn("-nomt_core", NULL, 0)) { + return; + } i386_cpu_info_t *info = cpuid_info(); if (info->cpuid_arch_perf_leaf.version >= 2) { lapic_set_pmi_func((i386_intr_func_t)mt_pmi_x86_64); diff --git a/osfmk/x86_64/pmap.c b/osfmk/x86_64/pmap.c index 50557b010..87298757b 100644 --- a/osfmk/x86_64/pmap.c +++ b/osfmk/x86_64/pmap.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2010 Apple Inc. All rights reserved. + * Copyright (c) 2000-2019 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -145,7 +145,7 @@ #include #include - +#include #if MACH_ASSERT int pmap_stats_assert = 1; #endif /* MACH_ASSERT */ @@ -192,11 +192,11 @@ uint32_t npvhashmask = 0, npvhashbuckets = 0; pv_hashed_entry_t pv_hashed_free_list = PV_HASHED_ENTRY_NULL; pv_hashed_entry_t pv_hashed_kern_free_list = PV_HASHED_ENTRY_NULL; -decl_simple_lock_data(, pv_hashed_free_list_lock) -decl_simple_lock_data(, pv_hashed_kern_free_list_lock) -decl_simple_lock_data(, pv_hash_table_lock) +decl_simple_lock_data(, pv_hashed_free_list_lock); +decl_simple_lock_data(, pv_hashed_kern_free_list_lock); +decl_simple_lock_data(, pv_hash_table_lock); -decl_simple_lock_data(, phys_backup_lock) +decl_simple_lock_data(, phys_backup_lock); zone_t pv_hashed_list_zone; /* zone of pv_hashed_entry structures */ @@ -229,7 +229,7 @@ pmap_memory_region_t pmap_memory_regions[PMAP_MEMORY_REGIONS_SIZE]; #define current_pmap() (vm_map_pmap(current_thread()->map)) struct pmap kernel_pmap_store; -pmap_t kernel_pmap; +SECURITY_READ_ONLY_LATE(pmap_t) kernel_pmap = NULL; struct zone *pmap_zone; /* zone of pmap structures */ @@ -244,16 +244,16 @@ int pt_fake_zone_index = -1; extern long NMIPI_acks; -boolean_t kernel_text_ps_4K = TRUE; +SECURITY_READ_ONLY_LATE(boolean_t) kernel_text_ps_4K = TRUE; extern char end; static int nkpt; #if DEVELOPMENT || DEBUG -boolean_t pmap_disable_kheap_nx = FALSE; -boolean_t pmap_disable_kstack_nx = FALSE; -boolean_t wpkernel = TRUE; +SECURITY_READ_ONLY_LATE(boolean_t) pmap_disable_kheap_nx = FALSE; +SECURITY_READ_ONLY_LATE(boolean_t) pmap_disable_kstack_nx = FALSE; +SECURITY_READ_ONLY_LATE(boolean_t) wpkernel = TRUE; #else const boolean_t wpkernel = TRUE; #endif @@ -410,7 +410,7 @@ pmap_bootstrap( */ kernel_pmap = &kernel_pmap_store; - kernel_pmap->ref_count = 1; + os_ref_init(&kernel_pmap->ref_count, NULL); #if DEVELOPMENT || DEBUG kernel_pmap->nx_enabled = TRUE; #endif @@ -699,6 +699,37 @@ hibernate_rebuild_pmap_structs(void) #endif +/* + * Create pv entries for kernel pages mapped by early startup code. + * These have to exist so we can ml_static_mfree() them later. + */ +static void +pmap_pv_fixup(vm_offset_t start_va, vm_offset_t end_va) +{ + ppnum_t ppn; + pv_rooted_entry_t pv_h; + uint32_t pgsz; + + start_va = round_page(start_va); + end_va = trunc_page(end_va); + while (start_va < end_va) { + pgsz = PAGE_SIZE; + ppn = pmap_find_phys(kernel_pmap, start_va); + if (ppn != 0 && IS_MANAGED_PAGE(ppn)) { + pv_h = pai_to_pvh(ppn); + assert(pv_h->qlink.next == 0); /* shouldn't be init'd yet */ + assert(pv_h->pmap == 0); + pv_h->va_and_flags = start_va; + pv_h->pmap = kernel_pmap; + queue_init(&pv_h->qlink); + if (pmap_query_pagesize(kernel_pmap, start_va) == I386_LPGBYTES) { + pgsz = I386_LPGBYTES; + } + } + start_va += pgsz; + } +} + /* * Initialize the pmap module. * Called by vm_init, to initialize any structures that the pmap @@ -793,7 +824,8 @@ pmap_init(void) last_managed_page = pn; } - if (pn >= lowest_hi && pn <= highest_hi) { + if ((pmap_high_used_bottom <= pn && pn <= pmap_high_used_top) || + (pmap_middle_used_bottom <= pn && pn <= pmap_middle_used_top)) { pmap_phys_attributes[pn] |= PHYS_NOENCRYPT; } } @@ -843,19 +875,16 @@ pmap_init(void) zone_change(pv_hashed_list_zone, Z_NOENCRYPT, TRUE); zone_change(pv_hashed_list_zone, Z_GZALLOC_EXEMPT, TRUE); - /* create pv entries for kernel pages that might get pmap_remove()ed */ - vaddr = (vm_map_offset_t) VM_MIN_KERNEL_ADDRESS; - for (ppn = VM_MIN_KERNEL_PAGE; ppn < i386_btop(avail_start); ppn++) { - pv_rooted_entry_t pv_h; + /* + * Create pv entries for kernel pages that might get pmap_remove()ed. + * + * - very low pages that were identity mapped. + * - vm_pages[] entries that might be unused and reclaimed. + */ + assert((uintptr_t)VM_MIN_KERNEL_ADDRESS + avail_start <= (uintptr_t)vm_page_array_beginning_addr); + pmap_pv_fixup((uintptr_t)VM_MIN_KERNEL_ADDRESS, (uintptr_t)VM_MIN_KERNEL_ADDRESS + avail_start); + pmap_pv_fixup((uintptr_t)vm_page_array_beginning_addr, (uintptr_t)vm_page_array_ending_addr); - pv_h = pai_to_pvh(ppn); - assert(pv_h->qlink.next == 0); /* shouldn't be init'd yet */ - assert(pv_h->pmap == NULL); - pv_h->va_and_flags = vaddr; - vaddr += PAGE_SIZE; - pv_h->pmap = kernel_pmap; - queue_init(&pv_h->qlink); - } pmap_initialized = TRUE; max_preemption_latency_tsc = tmrCvt((uint64_t)MAX_PREEMPTION_LATENCY_NS, tscFCvtn2t); @@ -872,31 +901,6 @@ pmap_init(void) #endif /* CONFIG_VMX */ } -/* - * Create pv entries for kernel pages mapped by low level - * startup code. These have to exist so we can pmap_remove() them. - */ -void -pmap_pv_fixup(vm_offset_t start, vm_size_t length) -{ - ppnum_t ppn; - pv_rooted_entry_t pv_h; - - while (length != 0) { - ppn = pmap_find_phys(kernel_pmap, start); - if (ppn != 0) { - pv_h = pai_to_pvh(ppn); - assert(pv_h->qlink.next == 0); /* shouldn't be init'd yet */ - assert(pv_h->pmap == 0); - pv_h->va_and_flags = start; - pv_h->pmap = kernel_pmap; - queue_init(&pv_h->qlink); - } - start += PAGE_SIZE; - length -= PAGE_SIZE; - } -} - static void pmap_mark_range(pmap_t npmap, uint64_t sv, uint64_t nxrosz, boolean_t NX, boolean_t ro) @@ -939,6 +943,24 @@ pmap_mark_range(pmap_t npmap, uint64_t sv, uint64_t nxrosz, boolean_t NX, boolea DPRINTF("%s(0x%llx, 0x%llx, %u, %u): 0x%llx, 0x%llx\n", __FUNCTION__, sv, nxrosz, NX, ro, cv, ptep ? *ptep: 0); } +/* + * Reclaim memory for early boot 4K page tables that were converted to large page mappings. + * We know this memory is part of the KPTphys[] array that was allocated in Idle_PTs_init(), + * so we can free it using its address in that array. + */ +static void +pmap_free_early_PT(ppnum_t ppn, uint32_t cnt) +{ + ppnum_t KPTphys_ppn; + vm_offset_t offset; + + KPTphys_ppn = pmap_find_phys(kernel_pmap, (uintptr_t)KPTphys); + assert(ppn >= KPTphys_ppn); + assert(ppn + cnt <= KPTphys_ppn + NKPT); + offset = (ppn - KPTphys_ppn) << PAGE_SHIFT; + ml_static_mfree((uintptr_t)KPTphys + offset, PAGE_SIZE * cnt); +} + /* * Called once VM is fully initialized so that we can release unused * sections of low memory to the general pool. @@ -985,7 +1007,7 @@ pmap_mark_range(pmap_t npmap, uint64_t sv, uint64_t nxrosz, boolean_t NX, boolea * The now unused level-1 PTE pages are also freed. */ extern ppnum_t vm_kernel_base_page; -static uint32_t constptes = 0, dataptes = 0; +static uint32_t dataptes = 0; void pmap_lowmem_finalize(void) @@ -1059,6 +1081,14 @@ pmap_lowmem_finalize(void) */ pmap_remove(kernel_pmap, LOWGLOBAL_ALIAS + PAGE_SIZE, vm_kernel_base); + /* + * Release any memory for early boot 4K page table pages that got replaced + * with large page mappings for vm_pages[]. We know this memory is part of + * the KPTphys[] array that was allocated in Idle_PTs_init(), so we can free + * it using that address. + */ + pmap_free_early_PT(released_PT_ppn, released_PT_cnt); + /* * If text and data are both 2MB-aligned, * we can map text with large-pages, @@ -1123,8 +1153,10 @@ pmap_lowmem_finalize(void) vm_offset_t pte_phys; pt_entry_t *pdep; pt_entry_t pde; + ppnum_t KPT_ppn; pdep = pmap_pde(kernel_pmap, (vm_map_offset_t)myva); + KPT_ppn = (ppnum_t)((*pdep & PG_FRAME) >> PAGE_SHIFT); ptep = pmap_pte(kernel_pmap, (vm_map_offset_t)myva); DBG("myva: %p pdep: %p ptep: %p\n", (void *) myva, (void *) pdep, (void *) ptep); @@ -1145,34 +1177,14 @@ pmap_lowmem_finalize(void) /* * Free the now-unused level-1 pte. - * Note: ptep is a virtual address to the pte in the - * recursive map. We can't use this address to free - * the page. Instead we need to compute its address - * in the Idle PTEs in "low memory". */ - vm_offset_t vm_ptep = (vm_offset_t) KPTphys - + (pte_phys >> PTPGSHIFT); - DBG("ml_static_mfree(%p,0x%x) for pte\n", - (void *) vm_ptep, PAGE_SIZE); - ml_static_mfree(vm_ptep, PAGE_SIZE); + pmap_free_early_PT(KPT_ppn, 1); } /* Change variable read by sysctl machdep.pmap */ pmap_kernel_text_ps = I386_LPGBYTES; } - boolean_t doconstro = TRUE; -#if DEVELOPMENT || DEBUG - (void) PE_parse_boot_argn("dataconstro", &doconstro, sizeof(doconstro)); -#endif - if (doconstro) { - if (sconst & PAGE_MASK) { - panic("CONST segment misaligned 0x%lx 0x%lx\n", - sconst, econst); - } - kprintf("Marking const DATA read-only\n"); - } - vm_offset_t dva; for (dva = sdata; dva < edata; dva += I386_PGBYTES) { @@ -1187,20 +1199,6 @@ pmap_lowmem_finalize(void) } assert(dataptes > 0); - for (dva = sconst; dva < econst; dva += I386_PGBYTES) { - pt_entry_t dpte, *dptep = pmap_pte(kernel_pmap, dva); - - dpte = *dptep; - - assert((dpte & INTEL_PTE_VALID)); - dpte |= INTEL_PTE_NX; - dpte &= ~INTEL_PTE_WRITE; - constptes++; - pmap_store_pte(dptep, dpte); - } - - assert(constptes > 0); - kernel_segment_command_t * seg; kernel_section_t * sec; @@ -1255,6 +1253,25 @@ pmap_lowmem_finalize(void) splx(spl); } +/* + * Mark the const data segment as read-only, non-executable. + */ +void +x86_64_protect_data_const() +{ + boolean_t doconstro = TRUE; +#if DEVELOPMENT || DEBUG + (void) PE_parse_boot_argn("dataconstro", &doconstro, sizeof(doconstro)); +#endif + if (doconstro) { + if (sconst & PAGE_MASK) { + panic("CONST segment misaligned 0x%lx 0x%lx\n", + sconst, econst); + } + kprintf("Marking const DATA read-only\n"); + pmap_protect(kernel_pmap, sconst, econst, VM_PROT_READ); + } +} /* * this function is only used for debugging fron the vm layer */ @@ -1285,7 +1302,6 @@ pmap_verify_free( return result; } - #if MACH_ASSERT void pmap_assert_free(ppnum_t pn) @@ -1401,6 +1417,22 @@ hv_ept_pmap_create(void **ept_pmap, void **eptp) return; } +/* + * pmap_create() is used by some special, legacy 3rd party kexts. + * In our kernel code, always use pmap_create_options(). + */ +extern pmap_t pmap_create(ledger_t ledger, vm_map_size_t sz, boolean_t is_64bit); + +__attribute__((used)) +pmap_t +pmap_create( + ledger_t ledger, + vm_map_size_t sz, + boolean_t is_64bit) +{ + return pmap_create_options(ledger, sz, is_64bit ? PMAP_CREATE_64BIT : 0); +} + /* * Create and return a physical map. * @@ -1418,7 +1450,7 @@ pmap_t pmap_create_options( ledger_t ledger, vm_map_size_t sz, - int flags) + unsigned int flags) { pmap_t p; vm_size_t size; @@ -1457,8 +1489,7 @@ pmap_create_options( p->pmap_rwl.lck_rw_can_sleep = FALSE; bzero(&p->stats, sizeof(p->stats)); - - p->ref_count = 1; + os_ref_init(&p->ref_count, NULL); #if DEVELOPMENT || DEBUG p->nx_enabled = 1; #endif @@ -1542,15 +1573,6 @@ pmap_create_options( return p; } -pmap_t -pmap_create( - ledger_t ledger, - vm_map_size_t sz, - boolean_t is_64bit) -{ - return pmap_create_options(ledger, sz, ((is_64bit) ? PMAP_CREATE_64BIT : 0)); -} - /* * We maintain stats and ledgers so that a task's physical footprint is: * phys_footprint = ((internal - alternate_accounting) @@ -1563,114 +1585,6 @@ pmap_create( */ #if MACH_ASSERT -struct { - uint64_t num_pmaps_checked; - - int phys_footprint_over; - ledger_amount_t phys_footprint_over_total; - ledger_amount_t phys_footprint_over_max; - int phys_footprint_under; - ledger_amount_t phys_footprint_under_total; - ledger_amount_t phys_footprint_under_max; - - int internal_over; - ledger_amount_t internal_over_total; - ledger_amount_t internal_over_max; - int internal_under; - ledger_amount_t internal_under_total; - ledger_amount_t internal_under_max; - - int internal_compressed_over; - ledger_amount_t internal_compressed_over_total; - ledger_amount_t internal_compressed_over_max; - int internal_compressed_under; - ledger_amount_t internal_compressed_under_total; - ledger_amount_t internal_compressed_under_max; - - int iokit_mapped_over; - ledger_amount_t iokit_mapped_over_total; - ledger_amount_t iokit_mapped_over_max; - int iokit_mapped_under; - ledger_amount_t iokit_mapped_under_total; - ledger_amount_t iokit_mapped_under_max; - - int alternate_accounting_over; - ledger_amount_t alternate_accounting_over_total; - ledger_amount_t alternate_accounting_over_max; - int alternate_accounting_under; - ledger_amount_t alternate_accounting_under_total; - ledger_amount_t alternate_accounting_under_max; - - int alternate_accounting_compressed_over; - ledger_amount_t alternate_accounting_compressed_over_total; - ledger_amount_t alternate_accounting_compressed_over_max; - int alternate_accounting_compressed_under; - ledger_amount_t alternate_accounting_compressed_under_total; - ledger_amount_t alternate_accounting_compressed_under_max; - - int page_table_over; - ledger_amount_t page_table_over_total; - ledger_amount_t page_table_over_max; - int page_table_under; - ledger_amount_t page_table_under_total; - ledger_amount_t page_table_under_max; - - int purgeable_volatile_over; - ledger_amount_t purgeable_volatile_over_total; - ledger_amount_t purgeable_volatile_over_max; - int purgeable_volatile_under; - ledger_amount_t purgeable_volatile_under_total; - ledger_amount_t purgeable_volatile_under_max; - - int purgeable_nonvolatile_over; - ledger_amount_t purgeable_nonvolatile_over_total; - ledger_amount_t purgeable_nonvolatile_over_max; - int purgeable_nonvolatile_under; - ledger_amount_t purgeable_nonvolatile_under_total; - ledger_amount_t purgeable_nonvolatile_under_max; - - int purgeable_volatile_compressed_over; - ledger_amount_t purgeable_volatile_compressed_over_total; - ledger_amount_t purgeable_volatile_compressed_over_max; - int purgeable_volatile_compressed_under; - ledger_amount_t purgeable_volatile_compressed_under_total; - ledger_amount_t purgeable_volatile_compressed_under_max; - - int purgeable_nonvolatile_compressed_over; - ledger_amount_t purgeable_nonvolatile_compressed_over_total; - ledger_amount_t purgeable_nonvolatile_compressed_over_max; - int purgeable_nonvolatile_compressed_under; - ledger_amount_t purgeable_nonvolatile_compressed_under_total; - ledger_amount_t purgeable_nonvolatile_compressed_under_max; - - int network_volatile_over; - ledger_amount_t network_volatile_over_total; - ledger_amount_t network_volatile_over_max; - int network_volatile_under; - ledger_amount_t network_volatile_under_total; - ledger_amount_t network_volatile_under_max; - - int network_nonvolatile_over; - ledger_amount_t network_nonvolatile_over_total; - ledger_amount_t network_nonvolatile_over_max; - int network_nonvolatile_under; - ledger_amount_t network_nonvolatile_under_total; - ledger_amount_t network_nonvolatile_under_max; - - int network_volatile_compressed_over; - ledger_amount_t network_volatile_compressed_over_total; - ledger_amount_t network_volatile_compressed_over_max; - int network_volatile_compressed_under; - ledger_amount_t network_volatile_compressed_under_total; - ledger_amount_t network_volatile_compressed_under_max; - - int network_nonvolatile_compressed_over; - ledger_amount_t network_nonvolatile_compressed_over_total; - ledger_amount_t network_nonvolatile_compressed_over_max; - int network_nonvolatile_compressed_under; - ledger_amount_t network_nonvolatile_compressed_under_total; - ledger_amount_t network_nonvolatile_compressed_under_max; -} pmap_ledgers_drift; static void pmap_check_ledgers(pmap_t pmap); #else /* MACH_ASSERT */ static inline void @@ -1689,7 +1603,7 @@ extern int vm_wired_objects_page_count; void pmap_destroy(pmap_t p) { - int c; + os_ref_count_t c; if (p == PMAP_NULL) { return; @@ -1700,7 +1614,7 @@ pmap_destroy(pmap_t p) PMAP_LOCK_EXCLUSIVE(p); - c = --p->ref_count; + c = os_ref_release_locked(&p->ref_count); pmap_assert((current_thread() && (current_thread()->map)) ? (current_thread()->map->pmap != p) : TRUE); @@ -1762,7 +1676,7 @@ pmap_reference(pmap_t p) { if (p != PMAP_NULL) { PMAP_LOCK_EXCLUSIVE(p); - p->ref_count++; + os_ref_retain_locked(&p->ref_count); PMAP_UNLOCK_EXCLUSIVE(p);; } } @@ -2273,73 +2187,148 @@ pmap_expand( return KERN_SUCCESS; } - -/* On K64 machines with more than 32GB of memory, pmap_steal_memory - * will allocate past the 1GB of pre-expanded virtual kernel area. This - * function allocates all the page tables using memory from the same pool - * that pmap_steal_memory uses, rather than calling vm_page_grab (which - * isn't available yet). */ -void -pmap_pre_expand(pmap_t pmap, vm_map_offset_t vaddr) +/* + * Query a pmap to see what size a given virtual address is mapped with. + * If the vaddr is not mapped, returns 0. + */ +vm_size_t +pmap_query_pagesize( + pmap_t pmap, + vm_map_offset_t vaddr) { - ppnum_t pn; - pt_entry_t *pte; - boolean_t is_ept = is_ept_pmap(pmap); + pd_entry_t *pdep; + vm_size_t size = 0; + assert(!is_ept_pmap(pmap)); PMAP_LOCK_EXCLUSIVE(pmap); + pdep = pmap_pde(pmap, vaddr); + if (pdep != PD_ENTRY_NULL) { + if (*pdep & INTEL_PTE_PS) { + size = I386_LPGBYTES; + } else if (pmap_pte(pmap, vaddr) != PT_ENTRY_NULL) { + size = I386_PGBYTES; + } + } + + PMAP_UNLOCK_EXCLUSIVE(pmap); + + return size; +} + +/* + * Ensure the page table hierarchy is filled in down to + * the large page level. Additionally returns FAILURE if + * a lower page table already exists. + */ +static kern_return_t +pmap_pre_expand_large_internal( + pmap_t pmap, + vm_map_offset_t vaddr) +{ + ppnum_t pn; + pt_entry_t *pte; + boolean_t is_ept = is_ept_pmap(pmap); + kern_return_t kr = KERN_SUCCESS; + if (pmap64_pdpt(pmap, vaddr) == PDPT_ENTRY_NULL) { - if (!pmap_next_page_hi(&pn)) { - panic("pmap_pre_expand"); + if (!pmap_next_page_hi(&pn, FALSE)) { + panic("pmap_pre_expand_large no PDPT"); } pmap_zero_page(pn); pte = pmap64_pml4(pmap, vaddr); - pmap_store_pte(pte, pa_to_pte(i386_ptob(pn)) - | PTE_READ(is_ept) - | (is_ept ? INTEL_EPT_EX : INTEL_PTE_USER) - | PTE_WRITE(is_ept)); + pmap_store_pte(pte, pa_to_pte(i386_ptob(pn)) | + PTE_READ(is_ept) | + (is_ept ? INTEL_EPT_EX : INTEL_PTE_USER) | + PTE_WRITE(is_ept)); pte = pmap64_user_pml4(pmap, vaddr); - pmap_store_pte(pte, pa_to_pte(i386_ptob(pn)) - | PTE_READ(is_ept) - | (is_ept ? INTEL_EPT_EX : INTEL_PTE_USER) - | PTE_WRITE(is_ept)); + pmap_store_pte(pte, pa_to_pte(i386_ptob(pn)) | + PTE_READ(is_ept) | + (is_ept ? INTEL_EPT_EX : INTEL_PTE_USER) | + PTE_WRITE(is_ept)); } if (pmap_pde(pmap, vaddr) == PD_ENTRY_NULL) { - if (!pmap_next_page_hi(&pn)) { - panic("pmap_pre_expand"); + if (!pmap_next_page_hi(&pn, FALSE)) { + panic("pmap_pre_expand_large no PDE"); } pmap_zero_page(pn); pte = pmap64_pdpt(pmap, vaddr); - pmap_store_pte(pte, pa_to_pte(i386_ptob(pn)) - | PTE_READ(is_ept) - | (is_ept ? INTEL_EPT_EX : INTEL_PTE_USER) - | PTE_WRITE(is_ept)); + pmap_store_pte(pte, pa_to_pte(i386_ptob(pn)) | + PTE_READ(is_ept) | + (is_ept ? INTEL_EPT_EX : INTEL_PTE_USER) | + PTE_WRITE(is_ept)); + } else if (pmap_pte(pmap, vaddr) != PT_ENTRY_NULL) { + kr = KERN_FAILURE; } - if (pmap_pte(pmap, vaddr) == PT_ENTRY_NULL) { - if (!pmap_next_page_hi(&pn)) { - panic("pmap_pre_expand"); - } + return kr; +} - pmap_zero_page(pn); +/* + * Wrapper that locks the pmap. + */ +kern_return_t +pmap_pre_expand_large( + pmap_t pmap, + vm_map_offset_t vaddr) +{ + kern_return_t kr; + + PMAP_LOCK_EXCLUSIVE(pmap); + kr = pmap_pre_expand_large_internal(pmap, vaddr); + PMAP_UNLOCK_EXCLUSIVE(pmap); + return kr; +} + +/* + * On large memory machines, pmap_steal_memory() will allocate past + * the 1GB of pre-allocated/mapped virtual kernel area. This function + * expands kernel the page tables to cover a given vaddr. It uses pages + * from the same pool that pmap_steal_memory() uses, since vm_page_grab() + * isn't available yet. + */ +void +pmap_pre_expand( + pmap_t pmap, + vm_map_offset_t vaddr) +{ + ppnum_t pn; + pt_entry_t *pte; + boolean_t is_ept = is_ept_pmap(pmap); - pte = pmap_pde(pmap, vaddr); + /* + * This returns failure if a 4K page table already exists. + * Othewise it fills in the page table hierarchy down + * to that level. + */ + PMAP_LOCK_EXCLUSIVE(pmap); + if (pmap_pre_expand_large_internal(pmap, vaddr) == KERN_FAILURE) { + PMAP_UNLOCK_EXCLUSIVE(pmap); + return; + } - pmap_store_pte(pte, pa_to_pte(i386_ptob(pn)) - | PTE_READ(is_ept) - | (is_ept ? INTEL_EPT_EX : INTEL_PTE_USER) - | PTE_WRITE(is_ept)); + /* Add the lowest table */ + if (!pmap_next_page_hi(&pn, FALSE)) { + panic("pmap_pre_expand"); } + pmap_zero_page(pn); + + pte = pmap_pde(pmap, vaddr); + + pmap_store_pte(pte, pa_to_pte(i386_ptob(pn)) | + PTE_READ(is_ept) | + (is_ept ? INTEL_EPT_EX : INTEL_PTE_USER) | + PTE_WRITE(is_ept)); PMAP_UNLOCK_EXCLUSIVE(pmap); } @@ -2367,124 +2356,6 @@ pmap_sync_page_attributes_phys(ppnum_t pa) cache_flush_page_phys(pa); } - - -#ifdef CURRENTLY_UNUSED_AND_UNTESTED - -int collect_ref; -int collect_unref; - -/* - * Routine: pmap_collect - * Function: - * Garbage collects the physical map system for - * pages which are no longer used. - * Success need not be guaranteed -- that is, there - * may well be pages which are not referenced, but - * others may be collected. - * Usage: - * Called by the pageout daemon when pages are scarce. - */ -void -pmap_collect( - pmap_t p) -{ - pt_entry_t *pdp, *ptp; - pt_entry_t *eptp; - int wired; - boolean_t is_ept; - - if (p == PMAP_NULL) { - return; - } - - if (p == kernel_pmap) { - return; - } - - is_ept = is_ept_pmap(p); - - /* - * Garbage collect map. - */ - PMAP_LOCK(p); - - for (pdp = (pt_entry_t *)p->dirbase; - pdp < (pt_entry_t *)&p->dirbase[(UMAXPTDI + 1)]; - pdp++) { - if (*pdp & PTE_VALID_MASK(is_ept)) { - if (*pdp & PTE_REF(is_ept)) { - pmap_store_pte(pdp, *pdp & ~PTE_REF(is_ept)); - collect_ref++; - } else { - collect_unref++; - ptp = pmap_pte(p, pdetova(pdp - (pt_entry_t *)p->dirbase)); - eptp = ptp + NPTEPG; - - /* - * If the pte page has any wired mappings, we cannot - * free it. - */ - wired = 0; - { - pt_entry_t *ptep; - for (ptep = ptp; ptep < eptp; ptep++) { - if (iswired(*ptep)) { - wired = 1; - break; - } - } - } - if (!wired) { - /* - * Remove the virtual addresses mapped by this pte page. - */ - pmap_remove_range(p, - pdetova(pdp - (pt_entry_t *)p->dirbase), - ptp, - eptp); - - /* - * Invalidate the page directory pointer. - */ - pmap_store_pte(pdp, 0x0); - - PMAP_UNLOCK(p); - - /* - * And free the pte page itself. - */ - { - vm_page_t m; - - vm_object_lock(p->pm_obj); - - m = vm_page_lookup(p->pm_obj, (vm_object_offset_t)(pdp - (pt_entry_t *)&p->dirbase[0]) * PAGE_SIZE); - if (m == VM_PAGE_NULL) { - panic("pmap_collect: pte page not in object"); - } - - vm_object_unlock(p->pm_obj); - - VM_PAGE_FREE(m); - - OSAddAtomic(-1, &inuse_ptepages_count); - PMAP_ZINFO_PFREE(p, PAGE_SIZE); - } - - PMAP_LOCK(p); - } - } - } - } - - PMAP_UPDATE_TLBS(p, 0x0, 0xFFFFFFFFFFFFF000ULL); - PMAP_UNLOCK(p); - return; -} -#endif - - void pmap_copy_page(ppnum_t src, ppnum_t dst) { @@ -3224,10 +3095,8 @@ static void pmap_check_ledgers( pmap_t pmap) { - ledger_amount_t bal; - int pid; - char *procname; - boolean_t do_panic; + int pid; + char *procname; if (pmap->pmap_pid == 0) { /* @@ -3245,73 +3114,10 @@ pmap_check_ledgers( return; } - do_panic = FALSE; pid = pmap->pmap_pid; procname = pmap->pmap_procname; - pmap_ledgers_drift.num_pmaps_checked++; - -#define LEDGER_CHECK_BALANCE(__LEDGER) \ -MACRO_BEGIN \ - int panic_on_negative = TRUE; \ - ledger_get_balance(pmap->ledger, \ - task_ledgers.__LEDGER, \ - &bal); \ - ledger_get_panic_on_negative(pmap->ledger, \ - task_ledgers.__LEDGER, \ - &panic_on_negative); \ - if (bal != 0) { \ - if (panic_on_negative || \ - (pmap_ledgers_panic && \ - pmap_ledgers_panic_leeway > 0 && \ - (bal > (pmap_ledgers_panic_leeway * PAGE_SIZE) || \ - bal < (pmap_ledgers_panic_leeway * PAGE_SIZE)))) { \ - do_panic = TRUE; \ - } \ - printf("LEDGER BALANCE proc %d (%s) " \ - "\"%s\" = %lld\n", \ - pid, procname, #__LEDGER, bal); \ - if (bal > 0) { \ - pmap_ledgers_drift.__LEDGER##_over++; \ - pmap_ledgers_drift.__LEDGER##_over_total += bal; \ - if (bal > pmap_ledgers_drift.__LEDGER##_over_max) { \ - pmap_ledgers_drift.__LEDGER##_over_max = bal; \ - } \ - } else if (bal < 0) { \ - pmap_ledgers_drift.__LEDGER##_under++; \ - pmap_ledgers_drift.__LEDGER##_under_total += bal; \ - if (bal < pmap_ledgers_drift.__LEDGER##_under_max) { \ - pmap_ledgers_drift.__LEDGER##_under_max = bal; \ - } \ - } \ - } \ -MACRO_END - - LEDGER_CHECK_BALANCE(phys_footprint); - LEDGER_CHECK_BALANCE(internal); - LEDGER_CHECK_BALANCE(internal_compressed); - LEDGER_CHECK_BALANCE(iokit_mapped); - LEDGER_CHECK_BALANCE(alternate_accounting); - LEDGER_CHECK_BALANCE(alternate_accounting_compressed); - LEDGER_CHECK_BALANCE(page_table); - LEDGER_CHECK_BALANCE(purgeable_volatile); - LEDGER_CHECK_BALANCE(purgeable_nonvolatile); - LEDGER_CHECK_BALANCE(purgeable_volatile_compressed); - LEDGER_CHECK_BALANCE(purgeable_nonvolatile_compressed); - LEDGER_CHECK_BALANCE(network_volatile); - LEDGER_CHECK_BALANCE(network_nonvolatile); - LEDGER_CHECK_BALANCE(network_volatile_compressed); - LEDGER_CHECK_BALANCE(network_nonvolatile_compressed); - - if (do_panic) { - if (pmap_ledgers_panic) { - panic("pmap_destroy(%p) %d[%s] has imbalanced ledgers\n", - pmap, pid, procname); - } else { - printf("pmap_destroy(%p) %d[%s] has imbalanced ledgers\n", - pmap, pid, procname); - } - } + vm_map_pmap_check_ledgers(pmap, pmap->ledger, pid, procname); if (pmap->stats.resident_count != 0 || #if 35156815 @@ -3464,3 +3270,44 @@ pmap_load_image4_trust_cache(struct pmap_image4_trust_cache __unused *trust_cach return PMAP_TC_UNKNOWN_FORMAT; } + +bool +pmap_is_trust_cache_loaded(const uuid_t __unused uuid) +{ + // Unsupported on this architecture. + return false; +} + +bool +pmap_lookup_in_loaded_trust_caches(const uint8_t __unused cdhash[20]) +{ + // Unsupported on this architecture. + return false; +} + +uint32_t +pmap_lookup_in_static_trust_cache(const uint8_t __unused cdhash[20]) +{ + // Unsupported on this architecture. + return false; +} + +bool +pmap_in_ppl(void) +{ + // Nonexistent on this architecture. + return false; +} + +void * +pmap_claim_reserved_ppl_page(void) +{ + // Unsupported on this architecture. + return NULL; +} + +void +pmap_free_reserved_ppl_page(void __unused *kva) +{ + // Unsupported on this architecture. +} diff --git a/pexpert/arm/pe_consistent_debug.c b/pexpert/arm/pe_consistent_debug.c index e569811d6..1fe7142ac 100644 --- a/pexpert/arm/pe_consistent_debug.c +++ b/pexpert/arm/pe_consistent_debug.c @@ -52,6 +52,24 @@ consistent_debug_allocate_entry(void) return NULL; } +boolean_t +PE_consistent_debug_lookup_entry(uint64_t record_id, uint64_t *phys_addr, uint64_t *length) +{ + assert(phys_addr != NULL); + assert(length != NULL); + + for (unsigned int i = 0; i < consistent_debug_registry->top_level_header.num_records; i++) { + if (consistent_debug_registry->records[i].record_id == record_id) { + *phys_addr = consistent_debug_registry->records[i].physaddr; + *length = consistent_debug_registry->records[i].length; + + return true; + } + } + + return false; +} + int PE_consistent_debug_inherit(void) { diff --git a/pexpert/arm/pe_identify_machine.c b/pexpert/arm/pe_identify_machine.c index 4328c7f2f..34ec23be7 100644 --- a/pexpert/arm/pe_identify_machine.c +++ b/pexpert/arm/pe_identify_machine.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2007-2017 Apple Inc. All rights reserved. + * Copyright (c) 2007-2019 Apple Inc. All rights reserved. * Copyright (c) 2000-2006 Apple Computer, Inc. All rights reserved. */ #include @@ -13,6 +13,7 @@ #include #endif +#include #include #if DEVELOPMENT || DEBUG #include @@ -33,7 +34,7 @@ static uint32_t gTCFG0Value; static uint32_t pe_arm_init_timer(void *args); #if DEVELOPMENT || DEBUG -decl_simple_lock_data(, panic_trace_lock; ) +decl_simple_lock_data(, panic_hook_lock); #endif /* * pe_identify_machine: @@ -96,11 +97,6 @@ pe_identify_machine(boot_args * bootArgs) hclk = mclk / 4; pclk = hclk / 2; tclk = 100000; /* timer is at 100khz */ - } else if (!strcmp(gPESoCDeviceType, "bcm2837-io")) { - mclk = 1200000000; - hclk = mclk / 4; - pclk = hclk / 2; - tclk = 1000000; } else { use_dt = 1; } @@ -278,9 +274,6 @@ pe_arm_get_soc_revision(void) extern void fleh_fiq_generic(void); -#if defined(ARM_BOARD_CLASS_S5L8960X) -static struct tbd_ops s5l8960x_funcs = {NULL, NULL, NULL}; -#endif /* defined(ARM_BOARD_CLASS_S5L8960X) */ #if defined(ARM_BOARD_CLASS_T7000) static struct tbd_ops t7000_funcs = {NULL, NULL, NULL}; @@ -321,6 +314,9 @@ static struct tbd_ops t8015_funcs = {NULL, NULL, NULL}; + + + #if defined(ARM_BOARD_CLASS_BCM2837) static struct tbd_ops bcm2837_funcs = {NULL, NULL, NULL}; #endif /* defined(ARM_BOARD_CLASS_BCM2837) */ @@ -341,23 +337,31 @@ typedef enum{ } panic_trace_t; static panic_trace_t bootarg_panic_trace; +static int bootarg_stop_clocks; + // The command buffer contains the converted commands from the device tree for commanding cpu_halt, enable_trace, etc. #define DEBUG_COMMAND_BUFFER_SIZE 256 typedef struct command_buffer_element { uintptr_t address; - uint16_t destination_cpu_selector; uintptr_t value; + uint16_t destination_cpu_selector; + uint16_t delay_us; + bool is_32bit; } command_buffer_element_t; static command_buffer_element_t debug_command_buffer[DEBUG_COMMAND_BUFFER_SIZE]; // statically allocate to prevent needing alloc at runtime -static uint32_t next_command_bufffer_entry = 0; // index of next unused slot in debug_command_buffer +static uint32_t next_command_buffer_entry = 0; // index of next unused slot in debug_command_buffer -#define CPU_SELECTOR_SHIFT ((sizeof(int)-2)*8) -#define CPU_SELECTOR_MASK (0xFFFF << CPU_SELECTOR_SHIFT) -#define REGISTER_OFFSET_MASK (~CPU_SELECTOR_MASK) +#define CPU_SELECTOR_SHIFT (16) +#define CPU_SELECTOR_MASK (0xFFFF << CPU_SELECTOR_SHIFT) +#define REGISTER_OFFSET_MASK ((1 << CPU_SELECTOR_SHIFT) - 1) #define REGISTER_OFFSET(register_prop) (register_prop & REGISTER_OFFSET_MASK) -#define CPU_SELECTOR(register_offset) (register_offset >> CPU_SELECTOR_SHIFT) // Upper 16bits holds the cpu selector -#define MAX_WINDOW_SIZE 0xFFFF -#define PE_ISSPACE(c) (c == ' ' || c == '\t' || c == '\n' || c == '\12') +#define CPU_SELECTOR(register_offset) ((register_offset & CPU_SELECTOR_MASK) >> CPU_SELECTOR_SHIFT) // Upper 16bits holds the cpu selector +#define MAX_WINDOW_SIZE 0xFFFF +#define PE_ISSPACE(c) (c == ' ' || c == '\t' || c == '\n' || c == '\12') +#define DELAY_SHIFT (32) +#define DELAY_MASK (0xFFFFULL << DELAY_SHIFT) +#define DELAY_US(register_offset) ((register_offset & DELAY_MASK) >> DELAY_SHIFT) +#define REGISTER_32BIT_MASK (1ULL << 63) /* * 0x0000 - all cpus * 0x0001 - cpu 0 @@ -376,6 +380,8 @@ static command_buffer_element_t *cpu_halt; static command_buffer_element_t *enable_trace; static command_buffer_element_t *enable_alt_trace; static command_buffer_element_t *trace_halt; +static command_buffer_element_t *enable_stop_clocks; +static command_buffer_element_t *stop_clocks; // Record which CPU is currently running one of our debug commands, so we can trap panic reentrancy to PE_arm_debug_panic_hook. static int running_debug_command_on_cpu_number = -1; @@ -396,13 +402,13 @@ pe_init_debug_command(DTEntry entryP, command_buffer_element_t **command_buffer, } // make sure command will fit - if (next_command_bufffer_entry + prop_size / sizeof(uintptr_t) > DEBUG_COMMAND_BUFFER_SIZE - 1) { + if (next_command_buffer_entry + prop_size / sizeof(uintptr_t) > DEBUG_COMMAND_BUFFER_SIZE - 1) { panic("pe_init_debug_command: property %s is %u bytes, command buffer only has %lu bytes remaining\n", - entry_name, prop_size, ((DEBUG_COMMAND_BUFFER_SIZE - 1) - next_command_bufffer_entry) * sizeof(uintptr_t)); + entry_name, prop_size, ((DEBUG_COMMAND_BUFFER_SIZE - 1) - next_command_buffer_entry) * sizeof(uintptr_t)); } // Hold the pointer in a temp variable and later assign it to command buffer, in case we panic while half-initialized - command_starting_index = next_command_bufffer_entry; + command_starting_index = next_command_buffer_entry; // convert to real virt addresses and stuff commands into debug_command_buffer for (; prop_size; reg_prop += 2, prop_size -= 2 * sizeof(uintptr_t)) { @@ -420,14 +426,21 @@ pe_init_debug_command(DTEntry entryP, command_buffer_element_t **command_buffer, if ((REGISTER_OFFSET(*reg_prop) + sizeof(uintptr_t)) >= reg_window_size) { panic("pe_init_debug_command: Command Offset is %lx, exceeds allocated size of %x\n", REGISTER_OFFSET(*reg_prop), reg_window_size ); } - debug_command_buffer[next_command_bufffer_entry].address = debug_reg_window + REGISTER_OFFSET(*reg_prop); - debug_command_buffer[next_command_bufffer_entry].destination_cpu_selector = CPU_SELECTOR(*reg_prop); - debug_command_buffer[next_command_bufffer_entry++].value = *(reg_prop + 1); + debug_command_buffer[next_command_buffer_entry].address = debug_reg_window + REGISTER_OFFSET(*reg_prop); + debug_command_buffer[next_command_buffer_entry].destination_cpu_selector = CPU_SELECTOR(*reg_prop); +#if defined(__arm64__) + debug_command_buffer[next_command_buffer_entry].delay_us = DELAY_US(*reg_prop); + debug_command_buffer[next_command_buffer_entry].is_32bit = ((*reg_prop & REGISTER_32BIT_MASK) != 0); +#else + debug_command_buffer[next_command_buffer_entry].delay_us = 0; + debug_command_buffer[next_command_buffer_entry].is_32bit = false; +#endif + debug_command_buffer[next_command_buffer_entry++].value = *(reg_prop + 1); } } // null terminate the address field of the command to end it - debug_command_buffer[next_command_bufffer_entry++].address = 0; + debug_command_buffer[next_command_buffer_entry++].address = 0; // save pointer into table for this command *command_buffer = &debug_command_buffer[command_starting_index]; @@ -437,18 +450,31 @@ static void pe_run_debug_command(command_buffer_element_t *command_buffer) { // When both the CPUs panic, one will get stuck on the lock and the other CPU will be halted when the first executes the debug command - simple_lock(&panic_trace_lock, LCK_GRP_NULL); + simple_lock(&panic_hook_lock, LCK_GRP_NULL); + running_debug_command_on_cpu_number = cpu_number(); while (command_buffer && command_buffer->address) { if (IS_CPU_SELECTED(running_debug_command_on_cpu_number, command_buffer->destination_cpu_selector)) { - *((volatile uintptr_t*)(command_buffer->address)) = command_buffer->value; // register = value; + if (command_buffer->is_32bit) { + *((volatile uint32_t*)(command_buffer->address)) = (uint32_t)(command_buffer->value); + } else { + *((volatile uintptr_t*)(command_buffer->address)) = command_buffer->value; // register = value; + } + if (command_buffer->delay_us != 0) { + uint64_t deadline; + nanoseconds_to_absolutetime(command_buffer->delay_us * NSEC_PER_USEC, &deadline); + deadline += ml_get_timebase(); + while (ml_get_timebase() < deadline) { + ; + } + } } command_buffer++; } running_debug_command_on_cpu_number = -1; - simple_unlock(&panic_trace_lock); + simple_unlock(&panic_hook_lock); } @@ -470,10 +496,12 @@ PE_arm_debug_enable_trace(void) } static void -PEARMDebugPanicHook(const char *str) +PE_arm_panic_hook(const char *str __unused) { (void)str; // not used - + if (bootarg_stop_clocks != 0) { + pe_run_debug_command(stop_clocks); + } // if panic trace is enabled if (bootarg_panic_trace != 0) { if (running_debug_command_on_cpu_number == cpu_number()) { @@ -482,19 +510,40 @@ PEARMDebugPanicHook(const char *str) return; // allow the normal panic operation to occur. } - // Stop tracing to freze the buffer and return to normal panic processing. + // Stop tracing to freeze the buffer and return to normal panic processing. pe_run_debug_command(trace_halt); } } -void (*PE_arm_debug_panic_hook)(const char *str) = PEARMDebugPanicHook; +void (*PE_arm_debug_panic_hook)(const char *str) = PE_arm_panic_hook; + +void +PE_init_cpu(void) +{ + if (bootarg_stop_clocks != 0) { + pe_run_debug_command(enable_stop_clocks); + } +} #else -void (*PE_arm_debug_panic_hook)(const char *str) = NULL; +void(*const PE_arm_debug_panic_hook)(const char *str) = NULL; + +void +PE_init_cpu(void) +{ +} #endif // DEVELOPMENT || DEBUG +void +PE_panic_hook(const char *str __unused) +{ + if (PE_arm_debug_panic_hook != NULL) { + PE_arm_debug_panic_hook(str); + } +} + void pe_arm_init_debug(void *args) { @@ -516,7 +565,7 @@ pe_arm_init_debug(void *args) // When args != NULL, this means we're being called from arm_init on the boot CPU. // This controls one-time initialization of the Panic Trace infrastructure - simple_lock_init(&panic_trace_lock, 0); //assuming single threaded mode + simple_lock_init(&panic_hook_lock, 0); //assuming single threaded mode // Panic_halt is deprecated. Please use panic_trace istead. unsigned int temp_bootarg_panic_trace; @@ -536,6 +585,12 @@ pe_arm_init_debug(void *args) // start tracing now if enabled PE_arm_debug_enable_trace(); } + unsigned int temp_bootarg_stop_clocks; + if (PE_parse_boot_argn("stop_clocks", &temp_bootarg_stop_clocks, sizeof(temp_bootarg_stop_clocks))) { + pe_init_debug_command(entryP, &enable_stop_clocks, "enable_stop_clocks"); + pe_init_debug_command(entryP, &stop_clocks, "stop_clocks"); + bootarg_stop_clocks = temp_bootarg_stop_clocks; + } #endif } } else { @@ -615,11 +670,6 @@ pe_arm_init_timer(void *args) timer_base = gTimerBase; soc_phys = gSocPhys; -#if defined(ARM_BOARD_CLASS_S5L8960X) - if (!strcmp(gPESoCDeviceType, "s5l8960x-io")) { - tbd_funcs = &s5l8960x_funcs; - } else -#endif #if defined(ARM_BOARD_CLASS_T7000) if (!strcmp(gPESoCDeviceType, "t7000-io") || !strcmp(gPESoCDeviceType, "t7001-io")) { diff --git a/pexpert/arm/pe_init.c b/pexpert/arm/pe_init.c index 1113d5a5c..fda4a4dbb 100644 --- a/pexpert/arm/pe_init.c +++ b/pexpert/arm/pe_init.c @@ -59,6 +59,17 @@ vm_offset_t gPanicBase; unsigned int gPanicSize; struct embedded_panic_header *panic_info = NULL; +#if (DEVELOPMENT || DEBUG) && defined(XNU_TARGET_OS_BRIDGE) +/* + * On DEVELOPMENT bridgeOS, we map the x86 panic region + * so we can include this data in bridgeOS corefiles + */ +uint64_t macos_panic_base = 0; +unsigned int macos_panic_size = 0; + +struct macos_panic_header *mac_panic_header = NULL; +#endif + /* Maximum size of panic log excluding headers, in bytes */ static unsigned int panic_text_len; @@ -83,7 +94,20 @@ check_for_panic_log(void) uint32_t *panic_region_length; /* - * Find the vram node in the device tree + * DT properties for the panic region are populated by UpdateDeviceTree() in iBoot: + * + * chosen { + * embedded-panic-log-size = <0x00080000>; + * [a bunch of other stuff] + * }; + * + * pram { + * reg = <0x00000008_fbc48000 0x00000000_000b4000>; + * }; + * + * reg[0] is the physical address + * reg[1] is the size of iBoot's kMemoryRegion_Panic (not used) + * embedded-panic-log-size is the maximum amount of data to store in the buffer */ if (kSuccess != DTLookupEntry(0, "pram", &entry)) { return; @@ -101,16 +125,25 @@ check_for_panic_log(void) return; } - /* - * Map the first page of VRAM into the kernel for use in case of - * panic - */ - /* Note: map as normal memory. */ gPanicBase = ml_io_map_wcomb(reg_prop[0], panic_region_length[0]); /* Deduct the size of the panic header from the panic region size */ panic_text_len = panic_region_length[0] - sizeof(struct embedded_panic_header); gPanicSize = panic_region_length[0]; + +#if DEVELOPMENT && defined(XNU_TARGET_OS_BRIDGE) + if (PE_consistent_debug_enabled()) { + uint64_t macos_panic_physbase = 0; + uint64_t macos_panic_physlen = 0; + /* Populate the macOS panic region data if it's present in consistent debug */ + if (PE_consistent_debug_lookup_entry(kDbgIdMacOSPanicRegion, &macos_panic_physbase, &macos_panic_physlen)) { + macos_panic_base = ml_io_map_with_prot(macos_panic_physbase, macos_panic_physlen, VM_PROT_READ); + mac_panic_header = (struct macos_panic_header *) ((void *) macos_panic_base); + macos_panic_size = macos_panic_physlen; + } + } +#endif /* DEVELOPMENT && defined(XNU_TARGET_OS_BRIDGE) */ + #endif panic_info = (struct embedded_panic_header *)gPanicBase; @@ -476,20 +509,13 @@ PE_call_timebase_callback(void) /* * The default PE_poll_input handler. */ -static int +int PE_stub_poll_input(__unused unsigned int options, char *c) { *c = uart_getc(); return 0; /* 0 for success, 1 for unsupported */ } -/* - * Called by the kernel debugger to poll for keyboard input. - * Keyboard drivers may replace the default stub function - * with their polled-mode input function. - */ -int (*PE_poll_input) (unsigned int options, char *c) = PE_stub_poll_input; - /* * This routine will return 1 if you are running on a device with a variant * of iBoot that allows debugging. This is typically not the case on production diff --git a/pexpert/arm/pe_kprintf.c b/pexpert/arm/pe_kprintf.c index 5287e5c86..fdac3ab1a 100644 --- a/pexpert/arm/pe_kprintf.c +++ b/pexpert/arm/pe_kprintf.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2007 Apple Inc. All rights reserved. + * Copyright (c) 2000-2019 Apple Inc. All rights reserved. */ /* * file: pe_kprintf.c @@ -18,7 +18,9 @@ void (*PE_kputc)(char c) = 0; SECURITY_READ_ONLY_LATE(unsigned int) disable_serial_output = TRUE; -decl_simple_lock_data(static, kprintf_lock) +decl_simple_lock_data(static, kprintf_lock); + +static void serial_putc_crlf(char c); void PE_init_kprintf(boolean_t vm_initialized) @@ -39,7 +41,7 @@ PE_init_kprintf(boolean_t vm_initialized) } if (serial_init()) { - PE_kputc = serial_putc; + PE_kputc = serial_putc_crlf; } else { PE_kputc = cnputc; } @@ -131,6 +133,15 @@ kprintf(const char *fmt, ...) } } +static void +serial_putc_crlf(char c) +{ + if (c == '\n') { + uart_putc('\r'); + } + uart_putc(c); +} + void serial_putc(char c) { diff --git a/pexpert/arm/pe_serial.c b/pexpert/arm/pe_serial.c index 3e70e3f2d..0d8ffefe2 100644 --- a/pexpert/arm/pe_serial.c +++ b/pexpert/arm/pe_serial.c @@ -10,6 +10,7 @@ #include #include #include +#include #include #include #include @@ -35,13 +36,13 @@ struct pe_serial_functions { void (*td0) (int c); int (*rr0) (void); int (*rd0) (void); + struct pe_serial_functions *next; }; -static struct pe_serial_functions *gPESF; +SECURITY_READ_ONLY_LATE(static struct pe_serial_functions*) gPESF = NULL; -static int uart_initted = 0; /* 1 if init'ed */ - -static vm_offset_t uart_base; +static int uart_initted = 0; /* 1 if init'ed */ +static vm_offset_t uart_base = 0; /*****************************************************************************/ @@ -51,6 +52,8 @@ static int32_t dt_pclk = -1; static int32_t dt_sampling = -1; static int32_t dt_ubrdiv = -1; +static void ln2410_uart_set_baud_rate(__unused int unit, uint32_t baud_rate); + static void ln2410_uart_init(void) { @@ -66,7 +69,7 @@ ln2410_uart_init(void) rUCON0 = ucon0; rUMCON0 = 0x00; /* Clear Flow Control */ - gPESF->uart_set_baud_rate(0, 115200); + ln2410_uart_set_baud_rate(0, 115200); rUFCON0 = 0x03; /* Clear & Enable FIFOs */ rUMCON0 = 0x01; /* Assert RTS on UART0 */ @@ -137,15 +140,24 @@ ln2410_rd0(void) return (int)rURXH0; } -static struct pe_serial_functions ln2410_serial_functions = { - ln2410_uart_init, ln2410_uart_set_baud_rate, - ln2410_tr0, ln2410_td0, ln2410_rr0, ln2410_rd0 +SECURITY_READ_ONLY_LATE(static struct pe_serial_functions) ln2410_serial_functions = +{ + .uart_init = ln2410_uart_init, + .uart_set_baud_rate = ln2410_uart_set_baud_rate, + .tr0 = ln2410_tr0, + .td0 = ln2410_td0, + .rr0 = ln2410_rr0, + .rd0 = ln2410_rd0 }; #endif /* S3CUART */ /*****************************************************************************/ +static void +dcc_uart_init(void) +{ +} static unsigned int read_dtr(void) @@ -213,9 +225,14 @@ dcc_rd0(void) return read_dtr(); } -static struct pe_serial_functions dcc_serial_functions = { - NULL, NULL, - dcc_tr0, dcc_td0, dcc_rr0, dcc_rd0 +SECURITY_READ_ONLY_LATE(static struct pe_serial_functions) dcc_serial_functions = +{ + .uart_init = dcc_uart_init, + .uart_set_baud_rate = NULL, + .tr0 = dcc_tr0, + .td0 = dcc_td0, + .rr0 = dcc_rr0, + .rd0 = dcc_rd0 }; /*****************************************************************************/ @@ -465,7 +482,7 @@ validation_failure: PE_consistent_debug_register(kDbgIdConsoleHeaderAP, pa_panic_base, panic_size); } -static struct pe_serial_functions shmcon_serial_functions = +SECURITY_READ_ONLY_LATE(static struct pe_serial_functions) shmcon_serial_functions = { .uart_init = shmcon_init, .uart_set_baud_rate = shmcon_set_baud_rate, @@ -505,6 +522,7 @@ static uint64_t prev_dockfifo_spaces; // Previous w_stat level of the Dock static uint32_t dockfifo_capacity; static uint64_t dockfifo_stall_grace; +static vm_offset_t dockfifo_uart_base = 0; //======================= // Local funtions @@ -521,7 +539,7 @@ dockfifo_drain_on_stall() // It's been more than DOCKFIFO_WR_MAX_STALL_US and nobody read from the FIFO // Drop a character. (void)rDOCKFIFO_R_DATA(DOCKFIFO_UART_READ, 1); - prev_dockfifo_spaces++; + os_atomic_inc(&prev_dockfifo_spaces, relaxed); return 1; } return 0; @@ -548,7 +566,7 @@ static void dockfifo_uart_td0(int c) { rDOCKFIFO_W_DATA(DOCKFIFO_UART_WRITE, 1) = (unsigned)(c & 0xff); - prev_dockfifo_spaces--; // After writing a byte we have one fewer space than previously expected. + os_atomic_dec(&prev_dockfifo_spaces, relaxed); // After writing a byte we have one fewer space than previously expected. } static int @@ -578,7 +596,7 @@ dockfifo_uart_init(void) dockfifo_capacity = rDOCKFIFO_W_STAT(DOCKFIFO_UART_WRITE) & 0xffff; } -static struct pe_serial_functions dockfifo_uart_serial_functions = +SECURITY_READ_ONLY_LATE(static struct pe_serial_functions) dockfifo_uart_serial_functions = { .uart_init = dockfifo_uart_init, .uart_set_baud_rate = NULL, @@ -601,6 +619,7 @@ static bool use_sw_drain; static uint64_t prev_dockchannel_drained_time; // Last time we've seen the DockChannel drained by an external agent static uint64_t prev_dockchannel_spaces; // Previous w_stat level of the DockChannel. static uint64_t dockchannel_stall_grace; +static vm_offset_t dockchannel_uart_base = 0; //======================= // Local funtions @@ -617,7 +636,7 @@ dockchannel_drain_on_stall() // It's been more than DOCKCHANEL_WR_MAX_STALL_US and nobody read from the FIFO // Drop a character. (void)rDOCKCHANNELS_DEV_RDATA1(DOCKCHANNEL_UART_CHANNEL); - prev_dockchannel_spaces++; + os_atomic_inc(&prev_dockchannel_spaces, relaxed); return 1; } return 0; @@ -648,7 +667,7 @@ dockchannel_uart_td0(int c) { rDOCKCHANNELS_DEV_WDATA1(DOCKCHANNEL_UART_CHANNEL) = (unsigned)(c & 0xff); if (use_sw_drain) { - prev_dockchannel_spaces--; // After writing a byte we have one fewer space than previously expected. + os_atomic_dec(&prev_dockchannel_spaces, relaxed); // After writing a byte we have one fewer space than previously expected. } } @@ -664,6 +683,15 @@ dockchannel_uart_rd0(void) return (int)((rDOCKCHANNELS_DEV_RDATA1(DOCKCHANNEL_UART_CHANNEL) >> 8) & 0xff); } +static void +dockchannel_uart_clear_intr(void) +{ + rDOCKCHANNELS_AGENT_AP_INTR_CTRL &= ~(0x3); + rDOCKCHANNELS_AGENT_AP_INTR_STATUS |= 0x3; + rDOCKCHANNELS_AGENT_AP_ERR_INTR_CTRL &= ~(0x3); + rDOCKCHANNELS_AGENT_AP_ERR_INTR_STATUS |= 0x3; +} + static void dockchannel_uart_init(void) { @@ -672,10 +700,7 @@ dockchannel_uart_init(void) } // Clear all interrupt enable and status bits - rDOCKCHANNELS_AGENT_AP_INTR_CTRL &= ~(0x3); - rDOCKCHANNELS_AGENT_AP_INTR_STATUS |= 0x3; - rDOCKCHANNELS_AGENT_AP_ERR_INTR_CTRL &= ~(0x3); - rDOCKCHANNELS_AGENT_AP_ERR_INTR_STATUS |= 0x3; + dockchannel_uart_clear_intr(); // Setup DRAIN timer rDOCKCHANNELS_DEV_DRAIN_CFG(DOCKCHANNEL_UART_CHANNEL) = max_dockchannel_drain_period; @@ -685,7 +710,7 @@ dockchannel_uart_init(void) rDOCKCHANNELS_DOCK_RDATA1(DOCKCHANNEL_UART_CHANNEL); } -static struct pe_serial_functions dockchannel_uart_serial_functions = +SECURITY_READ_ONLY_LATE(static struct pe_serial_functions) dockchannel_uart_serial_functions = { .uart_init = dockchannel_uart_init, .uart_set_baud_rate = NULL, @@ -699,8 +724,8 @@ static struct pe_serial_functions dockchannel_uart_serial_functions = /****************************************************************************/ #ifdef PI3_UART -vm_offset_t pi3_gpio_base_vaddr; -vm_offset_t pi3_aux_base_vaddr; +vm_offset_t pi3_gpio_base_vaddr = 0; +vm_offset_t pi3_aux_base_vaddr = 0; static int pi3_uart_tr0(void) { @@ -775,7 +800,7 @@ pi3_uart_init(void) BCM2837_PUT32(BCM2837_AUX_MU_CNTL_REG_V, 3); } -static struct pe_serial_functions pi3_uart_serial_functions = +SECURITY_READ_ONLY_LATE(static struct pe_serial_functions) pi3_uart_serial_functions = { .uart_init = pi3_uart_init, .uart_set_baud_rate = NULL, @@ -787,111 +812,104 @@ static struct pe_serial_functions pi3_uart_serial_functions = #endif /* PI3_UART */ /*****************************************************************************/ + +static void +register_serial_functions(struct pe_serial_functions *fns) +{ + fns->next = gPESF; + gPESF = fns; +} + int serial_init(void) { DTEntry entryP = NULL; - uint32_t prop_size, dccmode; + uint32_t prop_size; vm_offset_t soc_base; uintptr_t *reg_prop; - uint32_t *prop_value = NULL; - char *serial_compat = 0; -#ifdef SHMCON - uint32_t jconmode; -#endif -#ifdef DOCKFIFO_UART - uint32_t no_dockfifo_uart; -#endif -#ifdef DOCKCHANNEL_UART - uint32_t no_dockchannel_uart; -#endif -#ifdef PI3_UART - uint32_t is_pi3; -#endif + uint32_t *prop_value __unused = NULL; + char *serial_compat __unused = 0; + uint32_t dccmode; - if (uart_initted && gPESF) { - gPESF->uart_init(); + struct pe_serial_functions *fns = gPESF; + + if (uart_initted) { + while (fns != NULL) { + fns->uart_init(); + fns = fns->next; + } kprintf("reinit serial\n"); return 1; } dccmode = 0; if (PE_parse_boot_argn("dcc", &dccmode, sizeof(dccmode))) { - gPESF = &dcc_serial_functions; - uart_initted = 1; - return 1; + register_serial_functions(&dcc_serial_functions); } #ifdef SHMCON - jconmode = 0; + uint32_t jconmode = 0; if (PE_parse_boot_argn("jcon", &jconmode, sizeof jconmode)) { - gPESF = &shmcon_serial_functions; - gPESF->uart_init(); - uart_initted = 1; - return 1; + register_serial_functions(&shmcon_serial_functions); } #endif /* SHMCON */ -#ifdef PI3_UART -#pragma unused(prop_value) - is_pi3 = 0; - if (PE_parse_boot_argn("-pi3", &is_pi3, sizeof(is_pi3))) { // FIXME: remove the not operator after boot args are set up. - pi3_gpio_base_vaddr = ml_io_map((vm_offset_t)BCM2837_GPIO_BASE, BCM2837_GPIO_SIZE); - pi3_aux_base_vaddr = ml_io_map((vm_offset_t)BCM2837_AUX_BASE, BCM2837_AUX_SIZE); - gPESF = &pi3_uart_serial_functions; - gPESF->uart_init(); - uart_initted = 1; - return 1; - } -#endif /* PI3_UART */ - soc_base = pe_arm_get_soc_base_phys(); if (soc_base == 0) { return 0; } +#ifdef PI3_UART + if (DTFindEntry("name", "gpio", &entryP) == kSuccess) { + DTGetProperty(entryP, "reg", (void **)®_prop, &prop_size); + pi3_gpio_base_vaddr = ml_io_map(soc_base + *reg_prop, *(reg_prop + 1)); + } + if (DTFindEntry("name", "aux", &entryP) == kSuccess) { + DTGetProperty(entryP, "reg", (void **)®_prop, &prop_size); + pi3_aux_base_vaddr = ml_io_map(soc_base + *reg_prop, *(reg_prop + 1)); + } + if ((pi3_gpio_base_vaddr != 0) && (pi3_aux_base_vaddr != 0)) { + register_serial_functions(&pi3_uart_serial_functions); + } +#endif /* PI3_UART */ + #ifdef DOCKFIFO_UART - no_dockfifo_uart = 0; + uint32_t no_dockfifo_uart = 0; PE_parse_boot_argn("no-dockfifo-uart", &no_dockfifo_uart, sizeof(no_dockfifo_uart)); if (no_dockfifo_uart == 0) { if (DTFindEntry("name", "dockfifo-uart", &entryP) == kSuccess) { DTGetProperty(entryP, "reg", (void **)®_prop, &prop_size); - uart_base = ml_io_map(soc_base + *reg_prop, *(reg_prop + 1)); - } else { - return 0; + dockfifo_uart_base = ml_io_map(soc_base + *reg_prop, *(reg_prop + 1)); + register_serial_functions(&dockfifo_uart_serial_functions); } - gPESF = &dockfifo_uart_serial_functions; - gPESF->uart_init(); - uart_initted = 1; - return 1; } #endif /* DOCKFIFO_UART */ #ifdef DOCKCHANNEL_UART - no_dockchannel_uart = 0; - // Keep the old name for boot-arg - PE_parse_boot_argn("no-dockfifo-uart", &no_dockchannel_uart, sizeof(no_dockchannel_uart)); - if (no_dockchannel_uart == 0) { - if (DTFindEntry("name", "dockchannel-uart", &entryP) == kSuccess) { - DTGetProperty(entryP, "reg", (void **)®_prop, &prop_size); - // Should be two reg entries - if (prop_size / sizeof(uintptr_t) != 4) { - panic("Malformed dockchannel-uart property"); - } - uart_base = ml_io_map(soc_base + *reg_prop, *(reg_prop + 1)); - dock_agent_base = ml_io_map(soc_base + *(reg_prop + 2), *(reg_prop + 3)); - gPESF = &dockchannel_uart_serial_functions; + uint32_t no_dockchannel_uart = 0; + if (DTFindEntry("name", "dockchannel-uart", &entryP) == kSuccess) { + DTGetProperty(entryP, "reg", (void **)®_prop, &prop_size); + // Should be two reg entries + if (prop_size / sizeof(uintptr_t) != 4) { + panic("Malformed dockchannel-uart property"); + } + dockchannel_uart_base = ml_io_map(soc_base + *reg_prop, *(reg_prop + 1)); + dock_agent_base = ml_io_map(soc_base + *(reg_prop + 2), *(reg_prop + 3)); + PE_parse_boot_argn("no-dockfifo-uart", &no_dockchannel_uart, sizeof(no_dockchannel_uart)); + // Keep the old name for boot-arg + if (no_dockchannel_uart == 0) { + register_serial_functions(&dockchannel_uart_serial_functions); DTGetProperty(entryP, "max-aop-clk", (void **)&prop_value, &prop_size); max_dockchannel_drain_period = (uint32_t)((prop_value)? (*prop_value * 0.03) : DOCKCHANNEL_DRAIN_PERIOD); DTGetProperty(entryP, "enable-sw-drain", (void **)&prop_value, &prop_size); use_sw_drain = (prop_value)? *prop_value : 0; - gPESF->uart_init(); - uart_initted = 1; - return 1; + } else { + dockchannel_uart_clear_intr(); } // If no dockchannel-uart is found in the device tree, fall back // to looking for the traditional UART serial console. } + #endif /* DOCKCHANNEL_UART */ /* @@ -938,24 +956,25 @@ serial_init(void) } } if (!strcmp(serial_compat, "uart,16550")) { - gPESF = &ln2410_serial_functions; + register_serial_functions(&ln2410_serial_functions); } else if (!strcmp(serial_compat, "uart-16550")) { - gPESF = &ln2410_serial_functions; + register_serial_functions(&ln2410_serial_functions); } else if (!strcmp(serial_compat, "uart,s5i3000")) { - gPESF = &ln2410_serial_functions; + register_serial_functions(&ln2410_serial_functions); } else if (!strcmp(serial_compat, "uart-1,samsung")) { - gPESF = &ln2410_serial_functions; + register_serial_functions(&ln2410_serial_functions); } -#elif defined (ARM_BOARD_CONFIG_MV88F6710) - if (!strcmp(serial_compat, "uart16x50,mmio")) { - gPESF = &uart16x50_serial_functions; - } -#endif - else { +#endif /* S3CUART */ + + if (gPESF == NULL) { return 0; } - gPESF->uart_init(); + fns = gPESF; + while (fns != NULL) { + fns->uart_init(); + fns = fns->next; + } uart_initted = 1; @@ -965,22 +984,25 @@ serial_init(void) void uart_putc(char c) { - if (uart_initted) { - while (!gPESF->tr0()) { + struct pe_serial_functions *fns = gPESF; + while (fns != NULL) { + while (!fns->tr0()) { ; /* Wait until THR is empty. */ } - gPESF->td0(c); + fns->td0(c); + fns = fns->next; } } int uart_getc(void) { /* returns -1 if no data available */ - if (uart_initted) { - if (!gPESF->rr0()) { - return -1; /* Receive data read */ + struct pe_serial_functions *fns = gPESF; + while (fns != NULL) { + if (fns->rr0()) { + return fns->rd0(); } - return gPESF->rd0(); + fns = fns->next; } return -1; } diff --git a/pexpert/conf/Makefile b/pexpert/conf/Makefile index 7bd79d9ae..05c4b79cf 100644 --- a/pexpert/conf/Makefile +++ b/pexpert/conf/Makefile @@ -23,7 +23,7 @@ endif $(TARGET)/$(CURRENT_KERNEL_CONFIG)/Makefile: $(SRCROOT)/SETUP/config/doconf $(OBJROOT)/SETUP/config $(DOCONFDEPS) $(_v)$(MKDIR) $(TARGET)/$(CURRENT_KERNEL_CONFIG) - $(_v)$(SRCROOT)/SETUP/config/doconf -c -cpu $(DOCONF_ARCH_CONFIG_LC) -soc $(CURRENT_MACHINE_CONFIG_LC) -d $(TARGET)/$(CURRENT_KERNEL_CONFIG) -s $(SOURCE) -m $(MASTERCONFDIR) $(CURRENT_KERNEL_CONFIG); + $(_v)$(SRCROOT)/SETUP/config/doconf -c -cpu $(DOCONF_ARCH_CONFIG_LC) -soc $(CURRENT_MACHINE_CONFIG_LC) -d $(TARGET)/$(CURRENT_KERNEL_CONFIG) -s $(SOURCE) -m $(MASTERCONFDIR) $(CURRENT_KERNEL_CONFIG) do_all: $(TARGET)/$(CURRENT_KERNEL_CONFIG)/Makefile $(_v)${MAKE} \ @@ -35,7 +35,7 @@ do_all: $(TARGET)/$(CURRENT_KERNEL_CONFIG)/Makefile SOURCE=$(subst conf/,,$(SOURCE)) \ TARGET=${TARGET} \ OBJPATH=${OBJPATH} \ - build_all; + build_all do_build_all:: do_all diff --git a/pexpert/conf/Makefile.template b/pexpert/conf/Makefile.template index b9962d602..b5357650d 100644 --- a/pexpert/conf/Makefile.template +++ b/pexpert/conf/Makefile.template @@ -66,9 +66,9 @@ $(SOBJS): .SFLAGS $(_v)$(REPLACECONTENTS) $@ $(S_KCC) $(SFLAGS) $(INCFLAGS) $(COMPONENT).filelist: $(OBJS) - @echo "$(ColorL)LDFILELIST$(Color0) $(ColorLF)$(COMPONENT)$(Color0)" + $(call makelog,$(ColorL)LDFILELIST$(Color0) $(ColorLF)$(COMPONENT)$(Color0)) $(_v)for obj in ${OBJS}; do \ - echo $(TARGET)/$(CURRENT_KERNEL_CONFIG)/$${obj}; \ + $(ECHO) $(TARGET)/$(CURRENT_KERNEL_CONFIG)/$${obj}; \ done > $(COMPONENT).filelist do_all: $(COMPONENT).filelist diff --git a/pexpert/conf/files.arm b/pexpert/conf/files.arm index b6b86ef4a..f687f7c30 100644 --- a/pexpert/conf/files.arm +++ b/pexpert/conf/files.arm @@ -1,5 +1,3 @@ -OPTIONS/gprof optional gprof - pexpert/arm/pe_bootargs.c standard pexpert/arm/pe_identify_machine.c standard pexpert/arm/pe_init.c standard diff --git a/pexpert/conf/files.arm64 b/pexpert/conf/files.arm64 index aada62e1e..b2ab3b4d4 100644 --- a/pexpert/conf/files.arm64 +++ b/pexpert/conf/files.arm64 @@ -1,5 +1,3 @@ -OPTIONS/gprof optional gprof - pexpert/arm/pe_bootargs.c standard pexpert/arm/pe_consistent_debug.c standard pexpert/arm/pe_identify_machine.c standard diff --git a/pexpert/conf/files.x86_64 b/pexpert/conf/files.x86_64 index 0ba9ffc18..d0c246e7c 100644 --- a/pexpert/conf/files.x86_64 +++ b/pexpert/conf/files.x86_64 @@ -1,5 +1,3 @@ -OPTIONS/gprof optional gprof - pexpert/i386/pe_init.c standard pexpert/i386/pe_bootargs.c standard pexpert/i386/pe_identify_machine.c standard diff --git a/pexpert/gen/bootargs.c b/pexpert/gen/bootargs.c index 5bf70059c..fddac8d3e 100644 --- a/pexpert/gen/bootargs.c +++ b/pexpert/gen/bootargs.c @@ -51,23 +51,6 @@ struct i24 { #define NUM 0 #define STR 1 -#if !defined(__LP64__) && !defined(__arm__) -boolean_t -PE_parse_boot_arg( - const char *arg_string, - void *arg_ptr) -{ - int max_len = -1; - -#if CONFIG_EMBEDDED - /* Limit arg size to 4 byte when no size is given */ - max_len = 4; -#endif - - return PE_parse_boot_argn(arg_string, arg_ptr, max_len); -} -#endif - static boolean_t PE_parse_boot_argn_internal( const char *arg_string, @@ -393,7 +376,16 @@ getval( boolean_t PE_imgsrc_mount_supported() { +#if CONFIG_LOCKERBOOT + /* + * Booting from a locker requires that we be able to mount the containing + * volume inside the locker. This looks redundant, but this is here in case + * the other conditional needs to be modified for some reason. + */ + return TRUE; +#else return TRUE; +#endif } boolean_t diff --git a/pexpert/i386/pe_identify_machine.c b/pexpert/i386/pe_identify_machine.c index 71e26b08f..7d8d0bdcb 100644 --- a/pexpert/i386/pe_identify_machine.c +++ b/pexpert/i386/pe_identify_machine.c @@ -71,3 +71,8 @@ pe_identify_machine(__unused boot_args *args) gPEClockFrequencyInfo.bus_to_dec_rate_den = gPEClockFrequencyInfo.bus_clock_rate_hz / gPEClockFrequencyInfo.dec_clock_rate_hz; } + +void +PE_panic_hook(const char *str __unused) +{ +} diff --git a/pexpert/i386/pe_init.c b/pexpert/i386/pe_init.c index c2debbbd1..4892e95c7 100644 --- a/pexpert/i386/pe_init.c +++ b/pexpert/i386/pe_init.c @@ -336,21 +336,13 @@ PE_call_timebase_callback(void) /* * The default (non-functional) PE_poll_input handler. */ -static int +int PE_stub_poll_input(__unused unsigned int options, char * c) { *c = 0xff; return 1; /* 0 for success, 1 for unsupported */ } -/* - * Called by the kernel debugger to poll for keyboard input. - * Keyboard drivers may replace the default stub function - * with their polled-mode input function. - */ -int (*PE_poll_input)(unsigned int options, char * c) - = PE_stub_poll_input; - boolean_t PE_reboot_on_panic(void) { diff --git a/pexpert/i386/pe_kprintf.c b/pexpert/i386/pe_kprintf.c index 63e10d9f8..ce2ff230e 100644 --- a/pexpert/i386/pe_kprintf.c +++ b/pexpert/i386/pe_kprintf.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2006 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2019 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -40,6 +40,12 @@ #include #include #include +#include +#include +#include + +extern uint64_t LockTimeOut; +extern processor_t current_processor(void); /* Globals */ void (*PE_kputc)(char c); @@ -53,7 +59,7 @@ SECURITY_READ_ONLY_LATE(unsigned int) disable_serial_output = FALSE; SECURITY_READ_ONLY_LATE(unsigned int) disable_serial_output = TRUE; #endif -decl_simple_lock_data(static, kprintf_lock) +decl_simple_lock_data(static, kprintf_lock); void PE_init_kprintf(boolean_t vm_initialized) @@ -110,6 +116,9 @@ _kprintf(const char *format, ...) static int cpu_last_locked = 0; +#define KPRINTF_LOCKWAIT_PATIENT (LockTimeOut) +#define KPRINTF_LOCKWAIT_IMPATIENT (LockTimeOut >> 4) + __attribute__((noinline, not_tail_called)) void kprintf(const char *fmt, ...) @@ -117,6 +126,8 @@ kprintf(const char *fmt, ...) va_list listp; va_list listp2; boolean_t state; + boolean_t in_panic_context = FALSE; + unsigned int kprintf_lock_grabbed; void *caller = __builtin_return_address(0); if (!disable_serial_output) { @@ -142,17 +153,16 @@ kprintf(const char *fmt, ...) return; } - /* - * Spin to get kprintf lock but poll for incoming signals - * while interrupts are masked. - */ state = ml_set_interrupts_enabled(FALSE); pal_preemption_assert(); - while (!simple_lock_try(&kprintf_lock, LCK_GRP_NULL)) { - (void) cpu_signal_handler(NULL); - } + in_panic_context = processor_in_panic_context(current_processor()); + + // If current CPU is in panic context, be a little more impatient. + kprintf_lock_grabbed = simple_lock_try_lock_mp_signal_safe_loop_duration(&kprintf_lock, + in_panic_context ? KPRINTF_LOCKWAIT_IMPATIENT : KPRINTF_LOCKWAIT_PATIENT, + LCK_GRP_NULL); if (cpu_number() != cpu_last_locked) { MP_DEBUG_KPRINTF("[cpu%d...]\n", cpu_number()); @@ -164,7 +174,10 @@ kprintf(const char *fmt, ...) _doprnt(fmt, &listp, PE_kputc, 16); va_end(listp); - simple_unlock(&kprintf_lock); + if (kprintf_lock_grabbed) { + simple_unlock(&kprintf_lock); + } + ml_set_interrupts_enabled(state); // If interrupts are enabled diff --git a/pexpert/pexpert/arm/Makefile b/pexpert/pexpert/arm/Makefile index d5b46d1a9..86386e462 100644 --- a/pexpert/pexpert/arm/Makefile +++ b/pexpert/pexpert/arm/Makefile @@ -11,6 +11,7 @@ DATAFILES = \ board_config.h \ boot.h \ consistent_debug.h \ + dockchannel.h \ PL192_VIC.h \ protos.h \ S3cUART.h \ diff --git a/pexpert/pexpert/arm/S7002.h b/pexpert/pexpert/arm/S7002.h index a39829ba3..6c6d2e07c 100644 --- a/pexpert/pexpert/arm/S7002.h +++ b/pexpert/pexpert/arm/S7002.h @@ -25,13 +25,13 @@ #define DOCKFIFO_W_SPACING (0x1000) #define DOCKFIFO_SPACING (0x3000) -#define rDOCKFIFO_R_DATA(_f, _n) (*(volatile uint32_t *)(uart_base + ((_f) * DOCKFIFO_SPACING) + ((_n) * 4))) -#define rDOCKFIFO_R_STAT(_f) (*(volatile uint32_t *)(uart_base + ((_f) * DOCKFIFO_SPACING) + 0x14)) -#define rDOCKFIFO_W_DATA(_f, _n) (*(volatile uint32_t *)(uart_base + ((_f) * DOCKFIFO_SPACING) + DOCKFIFO_W_SPACING + ((_n) * 4))) -#define rDOCKFIFO_W_STAT(_f) (*(volatile uint32_t *)(uart_base + ((_f) * DOCKFIFO_SPACING) + DOCKFIFO_W_SPACING + 0x14)) -#define rDOCKFIFO_CNFG(_f) (*(volatile uint32_t *)(uart_base + ((_f) * DOCKFIFO_SPACING) + 0x2000)) -#define rDOCKFIFO_DRAIN(_f) (*(volatile uint32_t *)(uart_base + ((_f) * DOCKFIFO_SPACING) + 0x2004)) -#define rDOCKFIFO_INTMASK(_f) (*(volatile uint32_t *)(uart_base + ((_f) * DOCKFIFO_SPACING) + 0x2008)) +#define rDOCKFIFO_R_DATA(_f, _n) (*(volatile uint32_t *)(dockfifo_uart_base + ((_f) * DOCKFIFO_SPACING) + ((_n) * 4))) +#define rDOCKFIFO_R_STAT(_f) (*(volatile uint32_t *)(dockfifo_uart_base + ((_f) * DOCKFIFO_SPACING) + 0x14)) +#define rDOCKFIFO_W_DATA(_f, _n) (*(volatile uint32_t *)(dockfifo_uart_base + ((_f) * DOCKFIFO_SPACING) + DOCKFIFO_W_SPACING + ((_n) * 4))) +#define rDOCKFIFO_W_STAT(_f) (*(volatile uint32_t *)(dockfifo_uart_base + ((_f) * DOCKFIFO_SPACING) + DOCKFIFO_W_SPACING + 0x14)) +#define rDOCKFIFO_CNFG(_f) (*(volatile uint32_t *)(dockfifo_uart_base + ((_f) * DOCKFIFO_SPACING) + 0x2000)) +#define rDOCKFIFO_DRAIN(_f) (*(volatile uint32_t *)(dockfifo_uart_base + ((_f) * DOCKFIFO_SPACING) + 0x2004)) +#define rDOCKFIFO_INTMASK(_f) (*(volatile uint32_t *)(dockfifo_uart_base + ((_f) * DOCKFIFO_SPACING) + 0x2008)) #endif diff --git a/pexpert/pexpert/arm/T8002.h b/pexpert/pexpert/arm/T8002.h index 9f90baead..e231d30be 100644 --- a/pexpert/pexpert/arm/T8002.h +++ b/pexpert/pexpert/arm/T8002.h @@ -11,35 +11,17 @@ #include -#define rPMGR_EVENT_TMR (*(volatile uint32_t *) (timer_base + 0x00000)) -#define rPMGR_EVENT_TMR_PERIOD (*(volatile uint32_t *) (timer_base + 0x00004)) -#define rPMGR_EVENT_TMR_CTL (*(volatile uint32_t *) (timer_base + 0x00008)) - -#define PMGR_EVENT_TMR_CTL_EN (1 << 0) - -#define DOCKCHANNEL_UART (1) -#define DOCKCHANNEL_STRIDE (0x10000) - -// Channel index -#define DOCKCHANNEL_UART_CHANNEL (0) +#include // AOP_CLOCK frequency * 30 ms #define DOCKCHANNEL_DRAIN_PERIOD (96000000 * 0.03) -#define rDOCKCHANNELS_AGENT_AP_INTR_CTRL (*(volatile uint32_t *) (dock_agent_base + 0x00)) -#define rDOCKCHANNELS_AGENT_AP_INTR_STATUS (*(volatile uint32_t *) (dock_agent_base + 0x04)) -#define rDOCKCHANNELS_AGENT_AP_ERR_INTR_CTRL (*(volatile uint32_t *) (dock_agent_base + 0x08)) -#define rDOCKCHANNELS_AGENT_AP_ERR_INTR_STATUS (*(volatile uint32_t *) (dock_agent_base + 0x0c)) - -#define rDOCKCHANNELS_DEV_DRAIN_CFG(_ch) (*(volatile uint32_t *) (uart_base + ((_ch) * DOCKCHANNEL_STRIDE) + 0x0008)) +#define rPMGR_EVENT_TMR (*(volatile uint32_t *) (timer_base + 0x00000)) +#define rPMGR_EVENT_TMR_PERIOD (*(volatile uint32_t *) (timer_base + 0x00004)) +#define rPMGR_EVENT_TMR_CTL (*(volatile uint32_t *) (timer_base + 0x00008)) -#define rDOCKCHANNELS_DEV_WDATA1(_ch) (*(volatile uint32_t *) (uart_base + ((_ch) * DOCKCHANNEL_STRIDE) + 0x4004)) -#define rDOCKCHANNELS_DEV_WSTAT(_ch) (*(volatile uint32_t *) (uart_base + ((_ch) * DOCKCHANNEL_STRIDE) + 0x4014)) -#define rDOCKCHANNELS_DEV_RDATA0(_ch) (*(volatile uint32_t *) (uart_base + ((_ch) * DOCKCHANNEL_STRIDE) + 0x4018)) -#define rDOCKCHANNELS_DEV_RDATA1(_ch) (*(volatile uint32_t *) (uart_base + ((_ch) * DOCKCHANNEL_STRIDE) + 0x401c)) +#define PMGR_EVENT_TMR_CTL_EN (1 << 0) -#define rDOCKCHANNELS_DOCK_RDATA1(_ch) (*(volatile uint32_t *) (uart_base + ((_ch) * DOCKCHANNEL_STRIDE) + 0xc01c)) -#define rDOCKCHANNELS_DOCK_RDATA3(_ch) (*(volatile uint32_t *) (uart_base + ((_ch) * DOCKCHANNEL_STRIDE) + 0xc024)) #endif #endif /* ! _PEXPERT_ARM_T8002_H */ diff --git a/pexpert/pexpert/arm/consistent_debug.h b/pexpert/pexpert/arm/consistent_debug.h index ceb317594..62549b868 100644 --- a/pexpert/pexpert/arm/consistent_debug.h +++ b/pexpert/pexpert/arm/consistent_debug.h @@ -69,6 +69,8 @@ typedef enum { #define kDbgIdAstrisConnection DEBUG_RECORD_ID_LONG('A','S','T','R','C','N','X','N') #define kDbgIdAstrisConnectionVers DEBUG_RECORD_ID_LONG('A','S','T','R','C','V','E','R') +#define kDbgIdMacOSPanicRegion DEBUG_RECORD_ID_LONG('M','A','C','P','A','N','I','C') + #define kDbgIdUnusedEntry 0x0ULL #define kDbgIdReservedEntry DEBUG_RECORD_ID_LONG('R','E','S','E','R','V','E', 'D') #define kDbgIdFreeReqEntry DEBUG_RECORD_ID_LONG('F','R','E','E','-','R','E','Q') @@ -126,6 +128,12 @@ int PE_consistent_debug_inherit(void); */ int PE_consistent_debug_register(uint64_t record_id, uint64_t physaddr, uint64_t length); +/* + * Lookup an exidting entry from the consistent debug structure, populate the attributes + * if it exists. + */ +boolean_t PE_consistent_debug_lookup_entry(uint64_t record_id, uint64_t *phys_addr, uint64_t *length); + /* * Returns whether consistent debug is enabled on the current device. */ diff --git a/pexpert/pexpert/arm/dockchannel.h b/pexpert/pexpert/arm/dockchannel.h new file mode 100644 index 000000000..0d012ddbc --- /dev/null +++ b/pexpert/pexpert/arm/dockchannel.h @@ -0,0 +1,52 @@ +/* + * Copyright (c) 2018 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ +#ifndef _PEXPERT_ARM_DOCKCHANNEL_H +#define _PEXPERT_ARM_DOCKCHANNEL_H + +#define DOCKCHANNEL_UART (1) +#define DOCKCHANNEL_STRIDE (0x10000) + +// Channel index +#define DOCKCHANNEL_UART_CHANNEL (0) + +#define rDOCKCHANNELS_AGENT_AP_INTR_CTRL (*(volatile uint32_t *) (dock_agent_base + 0x00)) +#define rDOCKCHANNELS_AGENT_AP_INTR_STATUS (*(volatile uint32_t *) (dock_agent_base + 0x04)) +#define rDOCKCHANNELS_AGENT_AP_ERR_INTR_CTRL (*(volatile uint32_t *) (dock_agent_base + 0x08)) +#define rDOCKCHANNELS_AGENT_AP_ERR_INTR_STATUS (*(volatile uint32_t *) (dock_agent_base + 0x0c)) + +#define rDOCKCHANNELS_DEV_DRAIN_CFG(_ch) (*(volatile uint32_t *) (dockchannel_uart_base + ((_ch) * DOCKCHANNEL_STRIDE) + 0x0008)) + +#define rDOCKCHANNELS_DEV_WDATA1(_ch) (*(volatile uint32_t *) (dockchannel_uart_base + ((_ch) * DOCKCHANNEL_STRIDE) + 0x4004)) +#define rDOCKCHANNELS_DEV_WSTAT(_ch) (*(volatile uint32_t *) (dockchannel_uart_base + ((_ch) * DOCKCHANNEL_STRIDE) + 0x4014)) +#define rDOCKCHANNELS_DEV_RDATA0(_ch) (*(volatile uint32_t *) (dockchannel_uart_base + ((_ch) * DOCKCHANNEL_STRIDE) + 0x4018)) +#define rDOCKCHANNELS_DEV_RDATA1(_ch) (*(volatile uint32_t *) (dockchannel_uart_base + ((_ch) * DOCKCHANNEL_STRIDE) + 0x401c)) + +#define rDOCKCHANNELS_DOCK_RDATA1(_ch) (*(volatile uint32_t *) (dockchannel_uart_base + ((_ch) * DOCKCHANNEL_STRIDE) + 0xc01c)) +#define rDOCKCHANNELS_DOCK_RDATA3(_ch) (*(volatile uint32_t *) (dockchannel_uart_base + ((_ch) * DOCKCHANNEL_STRIDE) + 0xc024)) + +#endif /* !_PEXPERT_ARM_DOCKCHANNEL_H */ diff --git a/pexpert/pexpert/arm64/BCM2837.h b/pexpert/pexpert/arm64/BCM2837.h index 59148fd86..cc3a2147f 100644 --- a/pexpert/pexpert/arm64/BCM2837.h +++ b/pexpert/pexpert/arm64/BCM2837.h @@ -18,29 +18,6 @@ #define PI3_BREAK asm volatile("brk #0"); -#define BCM2837_GPIO_BASE 0x3F200000 -#define BCM2837_GPIO_SIZE 0xA0 -#define BCM2837_GPFSEL0 0x3F200000 -#define BCM2837_GPSET0 0x3F20001C -#define BCM2837_GPCLR0 0x3F200028 -#define BCM2837_GPPUD 0x3F200094 -#define BCM2837_GPPUDCLK0 0x3F200098 - -#define BCM2837_AUX_BASE 0x3F215000 -#define BCM2837_AUX_SIZE 0x70 -#define BCM2837_AUX_ENABLES 0x3F215004 -#define BCM2837_AUX_MU_IO_REG 0x3F215040 -#define BCM2837_AUX_MU_IER_REG 0x3F215044 -#define BCM2837_AUX_MU_IIR_REG 0x3F215048 -#define BCM2837_AUX_MU_LCR_REG 0x3F21504C -#define BCM2837_AUX_MU_MCR_REG 0x3F215050 -#define BCM2837_AUX_MU_LSR_REG 0x3F215054 -#define BCM2837_AUX_MU_MSR_REG 0x3F215058 -#define BCM2837_AUX_MU_SCRATCH 0x3F21505C -#define BCM2837_AUX_MU_CNTL_REG 0x3F215060 -#define BCM2837_AUX_MU_STAT_REG 0x3F215064 -#define BCM2837_AUX_MU_BAUD_REG 0x3F215068 - #define BCM2837_GPFSEL0_V (pi3_gpio_base_vaddr + 0x0) #define BCM2837_GPSET0_V (pi3_gpio_base_vaddr + 0x1C) #define BCM2837_GPCLR0_V (pi3_gpio_base_vaddr + 0x28) diff --git a/pexpert/pexpert/arm64/Makefile b/pexpert/pexpert/arm64/Makefile index 49f2b889e..059b64ee8 100644 --- a/pexpert/pexpert/arm64/Makefile +++ b/pexpert/pexpert/arm64/Makefile @@ -13,15 +13,14 @@ DATAFILES = \ board_config.h \ boot.h \ S3c2410x.h \ - S5L8960X.h \ T7000.h \ S8000.h \ T8010.h \ - cyclone.h \ typhoon.h \ twister.h \ hurricane.h \ - BCM2837.h + BCM2837.h \ + spr_locks.h INSTALL_MD_LIST = ${DATAFILES} diff --git a/pexpert/pexpert/arm64/S5L8960X.h b/pexpert/pexpert/arm64/S5L8960X.h deleted file mode 100644 index 82e140383..000000000 --- a/pexpert/pexpert/arm64/S5L8960X.h +++ /dev/null @@ -1,19 +0,0 @@ -/* - * Copyright (c) 2011 Apple Inc. All rights reserved. - */ - -#ifndef _PEXPERT_ARM_S5L8960X_H -#define _PEXPERT_ARM_S5L8960X_H - -#include -#include - -#define WITH_CLASSIC_S2R 1 - -#ifndef ASSEMBLER - -#include - -#endif - -#endif /* ! _PEXPERT_ARM_S5L8960X_H */ diff --git a/pexpert/pexpert/arm64/T8010.h b/pexpert/pexpert/arm64/T8010.h index ed1ecbb11..826414b54 100644 --- a/pexpert/pexpert/arm64/T8010.h +++ b/pexpert/pexpert/arm64/T8010.h @@ -11,32 +11,12 @@ #ifndef ASSEMBLER #include +#include #include -#define DOCKCHANNEL_UART (1) -#define DOCKCHANNEL_STRIDE (0x10000) - -// Channel index -#define DOCKCHANNEL_UART_CHANNEL (0) - // AOP_CLOCK frequency * 30 ms #define DOCKCHANNEL_DRAIN_PERIOD (192000000 * 0.03) -#define rDOCKCHANNELS_AGENT_AP_INTR_CTRL (*(volatile uint32_t *) (dock_agent_base + 0x00)) -#define rDOCKCHANNELS_AGENT_AP_INTR_STATUS (*(volatile uint32_t *) (dock_agent_base + 0x04)) -#define rDOCKCHANNELS_AGENT_AP_ERR_INTR_CTRL (*(volatile uint32_t *) (dock_agent_base + 0x08)) -#define rDOCKCHANNELS_AGENT_AP_ERR_INTR_STATUS (*(volatile uint32_t *) (dock_agent_base + 0x0c)) - -#define rDOCKCHANNELS_DEV_DRAIN_CFG(_ch) (*(volatile uint32_t *) (uart_base + ((_ch) * DOCKCHANNEL_STRIDE) + 0x0008)) - -#define rDOCKCHANNELS_DEV_WDATA1(_ch) (*(volatile uint32_t *) (uart_base + ((_ch) * DOCKCHANNEL_STRIDE) + 0x4004)) -#define rDOCKCHANNELS_DEV_WSTAT(_ch) (*(volatile uint32_t *) (uart_base + ((_ch) * DOCKCHANNEL_STRIDE) + 0x4014)) -#define rDOCKCHANNELS_DEV_RDATA0(_ch) (*(volatile uint32_t *) (uart_base + ((_ch) * DOCKCHANNEL_STRIDE) + 0x4018)) -#define rDOCKCHANNELS_DEV_RDATA1(_ch) (*(volatile uint32_t *) (uart_base + ((_ch) * DOCKCHANNEL_STRIDE) + 0x401c)) - -#define rDOCKCHANNELS_DOCK_RDATA1(_ch) (*(volatile uint32_t *) (uart_base + ((_ch) * DOCKCHANNEL_STRIDE) + 0xc01c)) -#define rDOCKCHANNELS_DOCK_RDATA3(_ch) (*(volatile uint32_t *) (uart_base + ((_ch) * DOCKCHANNEL_STRIDE) + 0xc024)) - #endif #endif /* ! _PEXPERT_ARM_T8010_H */ diff --git a/pexpert/pexpert/arm64/arm64_common.h b/pexpert/pexpert/arm64/arm64_common.h index ac3c6d320..3d32aca8b 100644 --- a/pexpert/pexpert/arm64/arm64_common.h +++ b/pexpert/pexpert/arm64/arm64_common.h @@ -7,158 +7,187 @@ #ifdef APPLE_ARM64_ARCH_FAMILY -#define ARM64_REG_HID0 S3_0_c15_c0_0 -#define ARM64_REG_HID0_LoopBuffDisb (1<<20) -#define ARM64_REG_HID0_ICPrefLimitOneBrn (1<<25) -#define ARM64_REG_HID0_PMULLFuseDisable (1ULL<<33) -#define ARM64_REG_HID0_ICPrefDepth_bshift 60 -#define ARM64_REG_HID0_ICPrefDepth_bmsk (7ULL <not a p-core, non-zero=>p-core + * arg0: register in which to store result + * 0=>not a p-core, non-zero=>p-core */ .macro ARM64_IS_PCORE #if defined(APPLEMONSOON) || HAS_CLUSTER - mrs $0, MPIDR_EL1 - and $0, $0, #(MPIDR_PNE) -#endif + mrs $0, MPIDR_EL1 + and $0, $0, #(MPIDR_PNE) +#endif /* defined(APPLEMONSOON) || HAS_CLUSTER */ .endmacro /* * reads a special purpose register, using a different msr for e- vs. p-cores - * arg0: register indicating the current core type, see ARM64_IS_PCORE - * arg1: register in which to store the result of the read - * arg2: SPR to use for e-core - * arg3: SPR to use for p-core or non-AMP architecture + * arg0: register indicating the current core type, see ARM64_IS_PCORE + * arg1: register in which to store the result of the read + * arg2: SPR to use for e-core + * arg3: SPR to use for p-core or non-AMP architecture */ .macro ARM64_READ_EP_SPR #if defined(APPLEMONSOON) || HAS_CLUSTER - cbnz $0, 1f + cbnz $0, 1f // e-core - mrs $1, $2 - b 2f + mrs $1, $2 + b 2f // p-core 1: -#endif - mrs $1, $3 +#endif /* defined(APPLEMONSOON) || HAS_CLUSTER */ + mrs $1, $3 2: .endmacro @@ -217,14 +296,14 @@ */ .macro ARM64_WRITE_EP_SPR #if defined(APPLEMONSOON) || HAS_CLUSTER - cbnz $0, 1f + cbnz $0, 1f // e-core - msr $2, $1 - b 2f + msr $2, $1 + b 2f // p-core 1: -#endif - msr $3, $1 +#endif /* defined(APPLEMONSOON) || HAS_CLUSTER */ + msr $3, $1 2: .endmacro diff --git a/pexpert/pexpert/arm64/board_config.h b/pexpert/pexpert/arm64/board_config.h index c4a0edd1d..bad756857 100644 --- a/pexpert/pexpert/arm64/board_config.h +++ b/pexpert/pexpert/arm64/board_config.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2007-2017 Apple Inc. All rights reserved. + * Copyright (c) 2007-2019 Apple Inc. All rights reserved. * Copyright (c) 2005-2006 Apple Computer, Inc. All rights reserved. */ #ifndef _PEXPERT_ARM_BOARD_CONFIG_H @@ -7,19 +7,6 @@ #include -#ifdef ARM64_BOARD_CONFIG_S5L8960X -#define APPLE_ARM64_ARCH_FAMILY 1 -#define APPLECYCLONE -#define ARM_ARCH_TIMER -#include -#define __ARM_L2CACHE_SIZE_LOG__ 20 -#define ARM_BOARD_WFE_TIMEOUT_NS 1000 -#define ARM_BOARD_CLASS_S5L8960X -#define KERNEL_INTEGRITY_WT 1 -#define PEXPERT_NO_3X_IMAGES 1 -#define CORE_NCTRS 8 -#define CPMU_AIC_PMI 1 -#endif /* ARM64_BOARD_CONFIG_S5L8960X */ #ifdef ARM64_BOARD_CONFIG_T7000 #define APPLE_ARM64_ARCH_FAMILY 1 @@ -164,6 +151,9 @@ + + + #ifdef ARM64_BOARD_CONFIG_BCM2837 #define BCM2837 #define BCM2837_BRINGUP @@ -172,6 +162,7 @@ #define __ARM_L2CACHE_SIZE_LOG__ 19 #define ARM_BOARD_CLASS_BCM2837 #define CPU_COUNT 4 +#define CORE_NCTRS 8 /* Placeholder; KPC is not enabled for this target */ #endif /* ARM64_BOARD_CONFIG_BCM2837 */ #endif /* ! _PEXPERT_ARM_BOARD_CONFIG_H */ diff --git a/pexpert/pexpert/arm64/boot.h b/pexpert/pexpert/arm64/boot.h index 1bb953297..1bcf4990e 100644 --- a/pexpert/pexpert/arm64/boot.h +++ b/pexpert/pexpert/arm64/boot.h @@ -9,11 +9,21 @@ #ifndef _PEXPERT_ARM64_BOOT_H_ #define _PEXPERT_ARM64_BOOT_H_ +#ifdef KERNEL #include +#endif #include #include -#define BOOT_LINE_LENGTH 256 +/* + * Maximum size of an environment variable value. This particular value is + * chosen to accommodate the maximum encoded size of the system token as + * computed in https://confluence.sd.apple.com/display/TK/System+Token. + * + * This value matches iBoot's IBOOT_MAX_ENV_VAR_DATA_SIZE. + * There are no iBoot headers so have to duplicate it here for now. + */ +#define BOOT_LINE_LENGTH 608 /* * Video information.. diff --git a/pexpert/pexpert/arm64/cyclone.h b/pexpert/pexpert/arm64/cyclone.h deleted file mode 100644 index 6d5d900fa..000000000 --- a/pexpert/pexpert/arm64/cyclone.h +++ /dev/null @@ -1,25 +0,0 @@ -/* - * Copyright (c) 2012-2013 Apple Inc. All rights reserved. - */ - -#ifndef _PEXPERT_ARM_CYCLONE_H -#define _PEXPERT_ARM_CYCLONE_H - -#ifdef APPLECYCLONE -#include "arm64_common.h" - -#define MONITOR 1 /* Use EL3 monitor */ -#define NO_ECORE 1 -#define HAS_32BIT_DBGWRAP 1 - -/* - * Determined by experiment (not described in manual): - * A0 is variant 0, B0 is variant 1. See arm64/proc_reg.h - * for how these values are constructed from the MIDR. - */ -#define CYCLONE_CPU_VERSION_A0 0x00 -#define CYCLONE_CPU_VERSION_B0 0x10 - -#endif - -#endif /* ! _PEXPERT_ARM_CYCLONE_H */ diff --git a/pexpert/pexpert/arm64/hurricane.h b/pexpert/pexpert/arm64/hurricane.h index dea4c8f3e..bf1b181d2 100644 --- a/pexpert/pexpert/arm64/hurricane.h +++ b/pexpert/pexpert/arm64/hurricane.h @@ -1,13 +1,14 @@ /* - * Copyright (c) 2014 Apple Inc. All rights reserved. + * Copyright (c) 2014-2018 Apple Inc. All rights reserved. */ #ifndef _PEXPERT_ARM_HURRICANE_H #define _PEXPERT_ARM_HURRICANE_H -#define NO_MONITOR 1 /* No EL3 for this CPU -- ever */ -#define HAS_MIGSTS 1 /* Has MIGSTS register, and supports migration between p-core and e-core */ -#define HAS_KTRR 1 /* Has KTRR registers */ +#define NO_MONITOR 1 /* No EL3 for this CPU -- ever */ +#define HAS_MIGSTS 1 /* Has MIGSTS register, and supports migration between p-core and e-core */ +#define HAS_KTRR 1 /* Has KTRR registers */ +#define HAS_CPMU_L2C_EVENTS 1 /* Has L2 cache events in CPMU */ #ifdef APPLEHURRICANE #include "arm64_common.h" diff --git a/pexpert/pexpert/arm64/spr_locks.h b/pexpert/pexpert/arm64/spr_locks.h new file mode 100644 index 000000000..5d42a95b0 --- /dev/null +++ b/pexpert/pexpert/arm64/spr_locks.h @@ -0,0 +1,57 @@ +/* + * Copyright (c) 2018 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ +#ifndef _PEXPERT_ARM64_SPR_LOCKS_H +#define _PEXPERT_ARM64_SPR_LOCKS_H + +#define MSR_RO_CTL_HID4 (1ULL << 4) +#define MSR_RO_CTL_CYC_OVRD (1ULL << 27) +#define MSR_RO_CTL_ACC_OVRD (1ULL << 47) + +#define MSR_RO_CTL_VAL (~0ULL & ~(MSR_RO_CTL_HID4 | MSR_RO_CTL_CYC_OVRD | MSR_RO_CTL_ACC_OVRD)) +#define MSR_LOCK_VAL (1ULL << 0) + +#define CPU_PIO_RO_CTL_DBG_WRAP (1ULL << 49) +#define CPU_PIO_RO_CTL_TRACE_CORE_CFG (1ULL << 54) + +#define CPU_PIO_RO_CTL_VAL (~0ULL & ~(CPU_PIO_RO_CTL_DBG_WRAP | CPU_PIO_RO_CTL_TRACE_CORE_CFG)) +#define CPU_PIO_LOCK_VAL (1ULL << 0) + +#define ACC_PIO_RO_CTL_PBLK_OVRD (1ULL << 47) +#define ACC_PIO_RO_CTL_DBG_CTL (1ULL << 48) +#define ACC_PIO_RO_CTL_DBG_PMGR (1ULL << 50) +#define ACC_PIO_RO_CTL_DBG_WRAP_GLB (1ULL << 51) +#define ACC_PIO_RO_CTL_TRACE_CTL (1ULL << 53) +#define ACC_PIO_RO_CTL_TRC_UT_CTL (1ULL << 55) +#define ACC_PIO_RO_CTL_OCLA_CTL (1ULL << 56) + +#define ACC_PIO_RO_CTL_VAL (~0ULL & ~(ACC_PIO_RO_CTL_PBLK_OVRD | ACC_PIO_RO_CTL_DBG_CTL | ACC_PIO_RO_CTL_DBG_PMGR | \ + ACC_PIO_RO_CTL_DBG_WRAP_GLB | ACC_PIO_RO_CTL_TRACE_CTL | \ + ACC_PIO_RO_CTL_TRC_UT_CTL | ACC_PIO_RO_CTL_OCLA_CTL)) +#define ACC_PIO_LOCK_VAL (1ULL << 0) + +#endif /* _PEXPERT_ARM64_SPR_LOCKS_H */ diff --git a/pexpert/pexpert/arm64/twister.h b/pexpert/pexpert/arm64/twister.h index 0a17b3f22..4fc2b8480 100644 --- a/pexpert/pexpert/arm64/twister.h +++ b/pexpert/pexpert/arm64/twister.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2014 Apple Inc. All rights reserved. + * Copyright (c) 2014-2018 Apple Inc. All rights reserved. */ #ifndef _PEXPERT_ARM_TWISTER_H @@ -8,6 +8,7 @@ #define MONITOR 1 /* Use EL3 monitor */ #define NO_ECORE 1 #define HAS_32BIT_DBGWRAP 1 +#define HAS_CPMU_L2C_EVENTS 1 /* Has L2 cache events in CPMU */ #ifdef APPLETWISTER #include "arm64_common.h" diff --git a/pexpert/pexpert/arm64/typhoon.h b/pexpert/pexpert/arm64/typhoon.h index e91c1faa5..dba7d4362 100644 --- a/pexpert/pexpert/arm64/typhoon.h +++ b/pexpert/pexpert/arm64/typhoon.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2012-2013 Apple Inc. All rights reserved. + * Copyright (c) 2012-2018 Apple Inc. All rights reserved. */ #ifndef _PEXPERT_ARM_TYPHOON_H @@ -8,6 +8,8 @@ #define MONITOR 1 /* Use EL3 monitor */ #define NO_ECORE 1 #define HAS_32BIT_DBGWRAP 1 +#define HAS_CPMU_BIU_EVENTS 1 /* Has BIU events in CPMU */ +#define HAS_CPMU_L2C_EVENTS 1 /* Has L2 cache events in CPMU */ #ifdef APPLETYPHOON #include "arm64_common.h" diff --git a/pexpert/pexpert/pexpert.h b/pexpert/pexpert/pexpert.h index d01a6ee19..c721ce842 100644 --- a/pexpert/pexpert/pexpert.h +++ b/pexpert/pexpert/pexpert.h @@ -91,6 +91,8 @@ uint32_t PE_get_random_seed( uint32_t PE_i_can_has_debugger( uint32_t *); +int PE_stub_poll_input(unsigned int options, char *c); + #if defined(__arm__) || defined(__arm64__) boolean_t PE_panic_debugging_enabled(void); @@ -230,10 +232,6 @@ enum { kPEWaitForInput = 0x00000001, kPERawInput = 0x00000002 }; -extern int (*PE_poll_input)( - unsigned int options, - char * c); - extern int (*PE_write_IIC)( unsigned char addr, unsigned char reg, @@ -314,12 +312,6 @@ extern PE_state_t PE_state; extern char * PE_boot_args( void); -#if !defined(__LP64__) && !defined(__arm__) -extern boolean_t PE_parse_boot_arg( - const char *arg_string, - void *arg_ptr) __deprecated; -#endif - extern boolean_t PE_parse_boot_argn( const char *arg_string, void *arg_ptr, @@ -384,14 +376,20 @@ extern void pe_init_debug(void); extern boolean_t PE_imgsrc_mount_supported(void); +extern void PE_panic_hook(const char *str); + +extern void PE_init_cpu(void); + #if defined(__arm__) || defined(__arm64__) typedef void (*perfmon_interrupt_handler_func)(cpu_id_t source); extern kern_return_t PE_cpu_perfmon_interrupt_install_handler(perfmon_interrupt_handler_func handler); extern void PE_cpu_perfmon_interrupt_enable(cpu_id_t target, boolean_t enable); -extern void (*PE_arm_debug_panic_hook)(const char *str); #if DEVELOPMENT || DEBUG extern void PE_arm_debug_enable_trace(void); +extern void (*PE_arm_debug_panic_hook)(const char *str); +#else +extern void(*const PE_arm_debug_panic_hook)(const char *str); #endif #endif diff --git a/san/Kasan_kasan.exports b/san/Kasan_kasan.exports index 4911193fb..fdca03b8c 100644 --- a/san/Kasan_kasan.exports +++ b/san/Kasan_kasan.exports @@ -101,6 +101,7 @@ ___asan_version_mismatch_check_apple_900 ___asan_version_mismatch_check_apple_902 ___asan_version_mismatch_check_apple_1000 ___asan_version_mismatch_check_apple_1001 +___asan_version_mismatch_check_apple_clang_1100 ___asan_init ___asan_memcpy ___asan_memmove @@ -115,3 +116,48 @@ ___asan_strlcat ___asan_strncat ___asan_strlen ___asan_strnlen +___ubsan_handle_add_overflow +___ubsan_handle_add_overflow_abort +___ubsan_handle_builtin_unreachable +___ubsan_handle_divrem_overflow +___ubsan_handle_divrem_overflow_abort +___ubsan_handle_float_cast_overflow +___ubsan_handle_float_cast_overflow_abort +___ubsan_handle_function_type_mismatch +___ubsan_handle_function_type_mismatch_abort +___ubsan_handle_implicit_conversion +___ubsan_handle_implicit_conversion_abort +___ubsan_handle_invalid_builtin +___ubsan_handle_invalid_builtin_abort +___ubsan_handle_load_invalid_value +___ubsan_handle_load_invalid_value_abort +___ubsan_handle_missing_return +___ubsan_handle_mul_overflow +___ubsan_handle_mul_overflow_abort +___ubsan_handle_negate_overflow +___ubsan_handle_negate_overflow_abort +___ubsan_handle_nonnull_arg +___ubsan_handle_nonnull_arg_abort +___ubsan_handle_nonnull_return +___ubsan_handle_nonnull_return_abort +___ubsan_handle_nullability_arg +___ubsan_handle_nullability_arg_abort +___ubsan_handle_nullability_return +___ubsan_handle_nullability_return_abort +___ubsan_handle_out_of_bounds +___ubsan_handle_out_of_bounds_abort +___ubsan_handle_pointer_overflow +___ubsan_handle_pointer_overflow_abort +___ubsan_handle_shift_out_of_bounds +___ubsan_handle_shift_out_of_bounds_abort +___ubsan_handle_sub_overflow +___ubsan_handle_sub_overflow_abort +___ubsan_handle_type_mismatch_v1 +___ubsan_handle_type_mismatch_v1_abort +___ubsan_handle_vla_bound_not_positive +___ubsan_handle_vla_bound_not_positive_abort +___sanitizer_cov_trace_pc +___sanitizer_cov_trace_pc_guard +___sanitizer_cov_trace_pc_guard_init +___sanitizer_cov_trace_pc_indirect +___sanitizer_cov_pcs_init diff --git a/san/Makefile b/san/Makefile index e8c092167..816390994 100644 --- a/san/Makefile +++ b/san/Makefile @@ -17,6 +17,7 @@ PRIVATE_KERNELFILES = \ # Available only in xnu proper PRIVATE_XNUFILES = \ + ksancov.h \ kasan.h INSTALL_MI_LIST = ${DATAFILES} @@ -32,7 +33,7 @@ COMP_SUBDIRS = conf .DELETE_ON_ERROR: $(OBJROOT)/san/kasan-blacklist-%: $(SOURCE)/kasan-blacklist $(SOURCE)/ubsan-blacklist $(SOURCE)/kasan-blacklist-% - @echo "$(ColorH)GENERATING$(Color0) $(ColorLF)$(notdir $@)$(Color0)" + $(call makelog,$(ColorH)GENERATING$(Color0) $(ColorLF)$(notdir $@)$(Color0)) $(_v)sed -e 's,^src:\./,src:'"$(SRCROOT)/," $^ > $@ $(_v)$(SOURCE)/tools/validate_blacklist.sh "$@" @@ -57,7 +58,7 @@ endif # Our external dependency on allsymbols is fine because this runs in a later phase (config_install vs. config_all) $(OBJPATH)/%.symbolset: $(SOURCE)/%.exports - @echo "$(ColorH)SYMBOLSET$(Color0) $(ColorF)$*$(Color0) \"($(ColorLF)$(CURRENT_ARCH_CONFIG_LC)$(Color0))\"" + $(call makelog,$(ColorH)SYMBOLSET$(Color0) $(ColorF)$*$(Color0) "($(ColorLF)$(CURRENT_ARCH_CONFIG_LC)$(Color0))") $(_v)$(KEXT_CREATE_SYMBOL_SET) \ $(ARCH_FLAGS_$(CURRENT_ARCH_CONFIG)) \ -import $(OBJPATH)/allsymbols \ @@ -66,12 +67,12 @@ $(OBJPATH)/%.symbolset: $(SOURCE)/%.exports $(DSTROOT_KEXT): $(DSTROOT_KEXT_PATH)/% : $(OBJPATH)/%.symbolset $(_v)$(MKDIR) $(dir $@) - @echo "$(ColorF)INSTALL$(Color0) $(ColorF)$(notdir $@)$(Color0) \"($(ColorLF)$(CURRENT_ARCH_CONFIG_LC)$(Color0))\"" + $(call makelog,$(ColorF)INSTALL$(Color0) $(ColorF)$(notdir $@)$(Color0) "($(ColorLF)$(CURRENT_ARCH_CONFIG_LC)$(Color0))") $(_v)$(INSTALL) $(EXEC_INSTALL_FLAGS) $< $@ $(SYMROOT_KEXT): $(SYMROOT_KEXT_PATH)/% : $(DSTROOT_KEXT_PATH)/% $(_v)$(MKDIR) $(dir $@) - @echo "$(ColorF)INSTALL$(Color0) $(ColorF)$(notdir $@)$(Color0) \"($(ColorLF)$(CURRENT_ARCH_CONFIG_LC)$(Color0))\"" + $(call makelog,$(ColorF)INSTALL$(Color0) $(ColorF)$(notdir $@)$(Color0) "($(ColorLF)$(CURRENT_ARCH_CONFIG_LC)$(Color0))") $(_v)$(INSTALL) $(EXEC_INSTALL_FLAGS) $< $@ do_config_install:: $(DSTROOT_KEXT) $(SYMROOT_KEXT) @@ -85,7 +86,7 @@ endif $(KASAN_HELPER_SCRIPTS): $(DSTROOT)/$(DEVELOPER_EXTRAS_DIR)/% : $(SOURCE)/tools/% $(_v)$(MKDIR) $(dir $@) - @echo "$(ColorH)INSTALL$(Color0) $(ColorF)$(@F)$(Color0)" + $(call makelog,$(ColorH)INSTALL$(Color0) $(ColorF)$(@F)$(Color0)) $(_v)$(INSTALL) $(EXEC_INSTALL_FLAGS) $< $@ do_config_install:: $(KASAN_HELPER_SCRIPTS) diff --git a/san/conf/Makefile b/san/conf/Makefile index 7bd79d9ae..05c4b79cf 100644 --- a/san/conf/Makefile +++ b/san/conf/Makefile @@ -23,7 +23,7 @@ endif $(TARGET)/$(CURRENT_KERNEL_CONFIG)/Makefile: $(SRCROOT)/SETUP/config/doconf $(OBJROOT)/SETUP/config $(DOCONFDEPS) $(_v)$(MKDIR) $(TARGET)/$(CURRENT_KERNEL_CONFIG) - $(_v)$(SRCROOT)/SETUP/config/doconf -c -cpu $(DOCONF_ARCH_CONFIG_LC) -soc $(CURRENT_MACHINE_CONFIG_LC) -d $(TARGET)/$(CURRENT_KERNEL_CONFIG) -s $(SOURCE) -m $(MASTERCONFDIR) $(CURRENT_KERNEL_CONFIG); + $(_v)$(SRCROOT)/SETUP/config/doconf -c -cpu $(DOCONF_ARCH_CONFIG_LC) -soc $(CURRENT_MACHINE_CONFIG_LC) -d $(TARGET)/$(CURRENT_KERNEL_CONFIG) -s $(SOURCE) -m $(MASTERCONFDIR) $(CURRENT_KERNEL_CONFIG) do_all: $(TARGET)/$(CURRENT_KERNEL_CONFIG)/Makefile $(_v)${MAKE} \ @@ -35,7 +35,7 @@ do_all: $(TARGET)/$(CURRENT_KERNEL_CONFIG)/Makefile SOURCE=$(subst conf/,,$(SOURCE)) \ TARGET=${TARGET} \ OBJPATH=${OBJPATH} \ - build_all; + build_all do_build_all:: do_all diff --git a/san/conf/Makefile.template b/san/conf/Makefile.template index 8c60bc15b..42c6ee1ed 100644 --- a/san/conf/Makefile.template +++ b/san/conf/Makefile.template @@ -47,19 +47,6 @@ COMP_SUBDIRS = # Rebuild if per-file overrides change ${OBJS}: $(firstword $(MAKEFILE_LIST)) -# set file list manually -OBJS = - -ifeq ($(KASAN),1) -OBJS += kasan.o kasan-fakestack.o kasan-memintrinsics.o kasan_dynamic_blacklist.o -OBJS += kasan-$(CURRENT_ARCH_CONFIG_LC).o -OBJS += kasan-test.o kasan-test-$(CURRENT_ARCH_CONFIG_LC).o -endif - -ifeq ($(UBSAN),1) -OBJS += ubsan.o ubsan_log.o -endif - # Rebuild if global compile flags change $(COBJS): .CFLAGS .CFLAGS: ALWAYS @@ -76,13 +63,13 @@ $(SOBJS): .SFLAGS $(_v)$(REPLACECONTENTS) $@ $(KASAN) $(COMPONENT).filelist: $(OBJS) .KASANFLAGS - @echo "$(ColorL)LDFILELIST$(Color0) $(ColorLF)$(COMPONENT)$(Color0)" + $(call makelog,$(ColorL)LDFILELIST$(Color0) $(ColorLF)$(COMPONENT)$(Color0)) $(_v)for obj in ${OBJS}; do \ - echo $(TARGET)/$(CURRENT_KERNEL_CONFIG)/$${obj}; \ + $(ECHO) $(TARGET)/$(CURRENT_KERNEL_CONFIG)/$${obj}; \ done > $(COMPONENT).filelist $(TARGET)/$(CURRENT_KERNEL_CONFIG)/kasan_blacklist_dynamic.h: $(SRCROOT)/$(COMPONENT)/kasan-blacklist-dynamic - @echo "$(ColorH)GENERATING$(Color0) $(ColorLF)$(notdir $@)$(Color0)" + $(call makelog,$(ColorH)GENERATING$(Color0) $(ColorLF)$(notdir $@)$(Color0)) @$(SRCROOT)/$(COMPONENT)/tools/generate_dynamic_blacklist.py "$<" > "$@" $(SRCROOT)/$(COMPONENT)/kasan_dynamic_blacklist.c: $(TARGET)/$(CURRENT_KERNEL_CONFIG)/kasan_blacklist_dynamic.h diff --git a/san/conf/files b/san/conf/files index 0c312a11f..e4388a3df 100644 --- a/san/conf/files +++ b/san/conf/files @@ -1,7 +1,14 @@ -san/kasan.c standard -san/kasan-fakestack.c standard -san/kasan-test.c standard -san/kasan-memintrinsics.c standard -san/kasan_dynamic_blacklist.c standard -san/ubsan.c standard -san/ubsan_log.c standard +OPTIONS/CONFIG_KASAN optional config_kasan +OPTIONS/CONFIG_UBSAN optional config_ubsan +OPTIONS/CONFIG_KSANCOV optional config_ksancov + +san/kasan.c optional config_kasan +san/kasan-fakestack.c optional config_kasan +san/kasan-test.c optional config_kasan +san/kasan-memintrinsics.c optional config_kasan +san/kasan_dynamic_blacklist.c optional config_kasan + +san/ubsan.c optional config_ubsan +san/ubsan_log.c optional config_ubsan + +san/ksancov.c optional config_ksancov diff --git a/san/conf/files.arm64 b/san/conf/files.arm64 index 4303b854d..d43c7ffab 100644 --- a/san/conf/files.arm64 +++ b/san/conf/files.arm64 @@ -1,3 +1,2 @@ -# KASAN -san/kasan-arm64.c standard -san/kasan-test-arm64.s standard +san/kasan-arm64.c optional config_kasan +san/kasan-test-arm64.s optional config_kasan diff --git a/san/conf/files.x86_64 b/san/conf/files.x86_64 index bd884e798..91b496f47 100644 --- a/san/conf/files.x86_64 +++ b/san/conf/files.x86_64 @@ -1,5 +1,2 @@ -# options - -# KASAN -san/kasan-x86_64.c standard -san/kasan-test-x86_64.s standard +san/kasan-x86_64.c optional config_kasan +san/kasan-test-x86_64.s optional config_kasan diff --git a/san/kasan-arm64.c b/san/kasan-arm64.c index 3d3a23364..909a075ef 100644 --- a/san/kasan-arm64.c +++ b/san/kasan-arm64.c @@ -45,7 +45,7 @@ #include #include -#include +#include #include @@ -69,11 +69,10 @@ extern vm_offset_t intstack, intstack_top; extern vm_offset_t excepstack, excepstack_top; void kasan_bootstrap(boot_args *, vm_offset_t pgtable); -void flush_mmu_tlb(void); -#define KASAN_SHIFT_ARM64 0xdffffff800000000ULL /* Defined in makedefs/MakeInc.def */ -#define KASAN_SHADOW_MIN 0xfffffff400000000ULL -#define KASAN_SHADOW_MAX 0xfffffff680000000ULL +#define KASAN_SHIFT_ARM64 0xe000000000000000ULL /* Defined in makedefs/MakeInc.def */ +#define KASAN_SHADOW_MIN 0xfffffffc00000000ULL +#define KASAN_SHADOW_MAX 0xffffffff80000000ULL _Static_assert(KASAN_SHIFT == KASAN_SHIFT_ARM64, "KASan inconsistent shadow shift"); _Static_assert(VM_MAX_KERNEL_ADDRESS < KASAN_SHADOW_MIN, "KASan shadow overlaps with kernel VM"); @@ -124,7 +123,6 @@ kasan_map_shadow_internal(vm_offset_t address, vm_size_t size, bool is_zero, boo uint64_t *base = cpu_tte; uint64_t *pte; -#if !__ARM64_TWO_LEVEL_PMAP__ /* lookup L1 entry */ pte = base + ((shadow_base & ARM_TT_L1_INDEX_MASK) >> ARM_TT_L1_SHIFT); if (*pte & ARM_TTE_VALID) { @@ -134,7 +132,6 @@ kasan_map_shadow_internal(vm_offset_t address, vm_size_t size, bool is_zero, boo *pte = ((uint64_t)alloc_zero_page() & ARM_TTE_TABLE_MASK) | ARM_TTE_VALID | ARM_TTE_TYPE_TABLE; } base = (uint64_t *)phystokv(*pte & ARM_TTE_TABLE_MASK); -#endif /* lookup L2 entry */ pte = base + ((shadow_base & ARM_TT_L2_INDEX_MASK) >> ARM_TT_L2_SHIFT); @@ -204,7 +201,6 @@ kasan_map_shadow_early(vm_offset_t address, vm_size_t size, bool is_zero) uint64_t *base = (uint64_t *)bootstrap_pgtable_phys; -#if !__ARM64_TWO_LEVEL_PMAP__ /* lookup L1 entry */ pte = base + ((virt_shadow_target & ARM_TT_L1_INDEX_MASK) >> ARM_TT_L1_SHIFT); if (*pte & ARM_TTE_VALID) { @@ -216,7 +212,6 @@ kasan_map_shadow_early(vm_offset_t address, vm_size_t size, bool is_zero) *pte = ((uint64_t)pg & ARM_TTE_TABLE_MASK) | ARM_TTE_VALID | ARM_TTE_TYPE_TABLE; } base = (uint64_t *)(*pte & ARM_TTE_TABLE_MASK); -#endif /* lookup L2 entry */ pte = base + ((virt_shadow_target & ARM_TT_L2_INDEX_MASK) >> ARM_TT_L2_SHIFT); @@ -332,14 +327,12 @@ kasan_is_shadow_mapped(uintptr_t shadowp) assert(shadowp >= KASAN_SHADOW_MIN); assert(shadowp < KASAN_SHADOW_MAX); -#if !__ARM64_TWO_LEVEL_PMAP__ /* lookup L1 entry */ pte = base + ((shadowp & ARM_TT_L1_INDEX_MASK) >> ARM_TT_L1_SHIFT); if (!(*pte & ARM_TTE_VALID)) { return false; } base = (uint64_t *)phystokv(*pte & ARM_TTE_TABLE_MASK); -#endif /* lookup L2 entry */ pte = base + ((shadowp & ARM_TT_L2_INDEX_MASK) >> ARM_TT_L2_SHIFT); diff --git a/san/kasan-blacklist b/san/kasan-blacklist index 48ce86d46..38df385ad 100644 --- a/san/kasan-blacklist +++ b/san/kasan-blacklist @@ -25,6 +25,9 @@ src:./san/kasan-x86_64.c src:./san/kasan-memintrinsics.c src:./san/kasan_dynamic_blacklist.c +# Exclude other sanitizers +src:./san/ksancov.c + # Exclude dtrace function that does weird stack manipulations fun:fbt_perfCallback diff --git a/san/kasan-blacklist-x86_64 b/san/kasan-blacklist-x86_64 index 517bce143..69a8dc15f 100644 --- a/san/kasan-blacklist-x86_64 +++ b/san/kasan-blacklist-x86_64 @@ -66,7 +66,6 @@ src:./osfmk/i386/pmap_x86_common.c src:./osfmk/i386/pmCPU.c src:./osfmk/i386/startup64.c src:./osfmk/i386/lapic_native.c -src:./osfmk/i386/fpu.c src:./osfmk/vm/vm_compressor.c fun:doublemap_init fun:getsegbynamefromheader diff --git a/san/kasan.c b/san/kasan.c index 9ec9433df..cec75e459 100644 --- a/san/kasan.c +++ b/san/kasan.c @@ -43,6 +43,7 @@ #include #include #include +#include #include #include #include @@ -66,9 +67,11 @@ vm_offset_t kernel_vtop; static unsigned kasan_enabled; static unsigned quarantine_enabled; -static unsigned enabled_checks = TYPE_ALL; /* bitmask of enabled checks */ -static unsigned report_ignored; /* issue non-fatal report for disabled/blacklisted checks */ -static unsigned free_yield = 0; /* ms yield after each free */ +static unsigned enabled_checks = TYPE_ALL & ~TYPE_LEAK; /* bitmask of enabled checks */ +static unsigned report_ignored; /* issue non-fatal report for disabled/blacklisted checks */ +static unsigned free_yield = 0; /* ms yield after each free */ +static unsigned leak_threshold = 3; /* threshold for uninitialized memory leak detection */ +static unsigned leak_fatal_threshold = 0; /* threshold for treating leaks as fatal errors (0 means never) */ /* forward decls */ static void kasan_crash_report(uptr p, uptr width, access_t access, violation_t reason); @@ -91,6 +94,8 @@ extern vm_size_t ml_stack_size(void); static const size_t BACKTRACE_BITS = 4; static const size_t BACKTRACE_MAXFRAMES = (1UL << BACKTRACE_BITS) - 1; +static vm_size_t kasan_alloc_retrieve_bt(vm_address_t addr, uintptr_t frames[static BACKTRACE_MAXFRAMES]); + decl_simple_lock_data(, kasan_vm_lock); static thread_t kasan_lock_holder; @@ -317,6 +322,67 @@ kasan_check_shadow(vm_address_t base, vm_size_t sz, uint8_t shadow) return true; } +static void +kasan_report_leak(vm_address_t base, vm_size_t sz, vm_offset_t offset, vm_size_t leak_sz) +{ + if (leak_fatal_threshold > leak_threshold && leak_sz >= leak_fatal_threshold){ + kasan_violation(base + offset, leak_sz, TYPE_LEAK, REASON_UNINITIALIZED); + } + + char string_rep[BACKTRACE_MAXFRAMES * 20] = {}; + vm_offset_t stack_base = dtrace_get_kernel_stack(current_thread()); + bool is_stack = (base >= stack_base && base < (stack_base + kernel_stack_size)); + + if (!is_stack) { + uintptr_t alloc_bt[BACKTRACE_MAXFRAMES] = {}; + vm_size_t num_frames = 0; + size_t l = 0; + num_frames = kasan_alloc_retrieve_bt(base, alloc_bt); + for (vm_size_t i = 0; i < num_frames; i++) { + l += snprintf(string_rep + l, sizeof(string_rep) - l, " %lx", alloc_bt[i]); + } + } + + DTRACE_KASAN5(leak_detected, + vm_address_t, base, + vm_size_t, sz, + vm_offset_t, offset, + vm_size_t, leak_sz, + char *, string_rep); +} + +/* + * Check for possible uninitialized memory contained in [base, base+sz). + */ +void +kasan_check_uninitialized(vm_address_t base, vm_size_t sz) +{ + if (!(enabled_checks & TYPE_LEAK) || sz < leak_threshold) { + return; + } + + vm_address_t cur = base; + vm_address_t end = base + sz; + vm_size_t count = 0; + vm_size_t max_count = 0; + vm_address_t leak_offset = 0; + uint8_t byte = 0; + + while (cur < end) { + byte = *(uint8_t *)cur; + count = (byte == KASAN_UNINITIALIZED_HEAP) ? (count + 1) : 0; + if (count > max_count) { + max_count = count; + leak_offset = cur - (count - 1) - base; + } + cur += 1; + } + + if (max_count >= leak_threshold) { + kasan_report_leak(base, sz, leak_offset, max_count); + } +} + /* * * KASAN violation reporting @@ -332,6 +398,8 @@ access_str(access_t type) return "store to"; } else if (type & TYPE_FREE) { return "free of"; + } else if (type & TYPE_LEAK) { + return "leak from"; } else { return "access of"; } @@ -468,7 +536,8 @@ kasan_log_report(uptr p, uptr width, access_t access, violation_t reason) * print a backtrace */ - nframes = backtrace_frame(bt, nframes, __builtin_frame_address(0)); /* ignore current frame */ + nframes = backtrace_frame(bt, nframes, __builtin_frame_address(0), + NULL); /* ignore current frame */ buf[0] = '\0'; l += snprintf(buf+l, len-l, "Backtrace: "); @@ -483,8 +552,8 @@ kasan_log_report(uptr p, uptr width, access_t access, violation_t reason) #define REPORT_DECLARE(n) \ void OS_NORETURN __asan_report_load##n(uptr p) { kasan_crash_report(p, n, TYPE_LOAD, 0); } \ void OS_NORETURN __asan_report_store##n(uptr p) { kasan_crash_report(p, n, TYPE_STORE, 0); } \ - void UNSUPPORTED_API(__asan_report_exp_load##n, uptr a, int32_t b); \ - void UNSUPPORTED_API(__asan_report_exp_store##n, uptr a, int32_t b); + void OS_NORETURN UNSUPPORTED_API(__asan_report_exp_load##n, uptr a, int32_t b); \ + void OS_NORETURN UNSUPPORTED_API(__asan_report_exp_store##n, uptr a, int32_t b); REPORT_DECLARE(1) REPORT_DECLARE(2) @@ -731,12 +800,23 @@ kasan_init(void) if (arg & KASAN_ARGS_NOPOISON_GLOBAL) { enabled_checks &= ~TYPE_POISON_GLOBAL; } + if (arg & KASAN_ARGS_CHECK_LEAKS) { + enabled_checks |= TYPE_LEAK; + } } if (PE_parse_boot_argn("kasan.free_yield_ms", &arg, sizeof(arg))) { free_yield = arg; } + if (PE_parse_boot_argn("kasan.leak_threshold", &arg, sizeof(arg))) { + leak_threshold = arg; + } + + if (PE_parse_boot_argn("kasan.leak_fatal_threshold", &arg, sizeof(arg))) { + leak_fatal_threshold = arg; + } + /* kasan.bl boot-arg handled in kasan_init_dybl() */ quarantine_enabled = 1; @@ -870,7 +950,7 @@ kasan_alloc_bt(uint32_t *ptr, vm_size_t sz, vm_size_t skip) if (frames > 0) { frames = min(frames + skip, BACKTRACE_MAXFRAMES); - frames = backtrace(bt, frames); + frames = backtrace(bt, frames, NULL); while (frames > sz && skip > 0) { bt++; @@ -906,6 +986,40 @@ kasan_alloc_crc(vm_offset_t addr) return crc; } +static vm_size_t +kasan_alloc_retrieve_bt(vm_address_t addr, uintptr_t frames[static BACKTRACE_MAXFRAMES]) +{ + vm_size_t num_frames = 0; + uptr shadow = (uptr)SHADOW_FOR_ADDRESS(addr); + uptr max_search = shadow - 4096; + vm_address_t alloc_base = 0; + size_t fsize = 0; + + /* walk the shadow backwards to find the allocation base */ + while (shadow >= max_search) { + if (*(uint8_t *)shadow == ASAN_HEAP_LEFT_RZ) { + alloc_base = ADDRESS_FOR_SHADOW(shadow) + 8; + break; + } + shadow--; + } + + if (alloc_base) { + struct kasan_alloc_header *header = header_for_user_addr(alloc_base); + if (magic_for_addr(alloc_base, LIVE_XOR) == header->magic) { + struct kasan_alloc_footer *footer = footer_for_user_addr(alloc_base, &fsize); + if ((fsize/sizeof(footer->backtrace[0])) >= header->frames) { + num_frames = header->frames; + for (size_t i = 0; i < num_frames; i++) { + frames[i] = footer->backtrace[i] + vm_kernel_slid_base; + } + } + } + } + + return num_frames; +} + /* * addr: base address of full allocation (including redzones) * size: total size of allocation (include redzones) @@ -930,6 +1044,10 @@ kasan_alloc(vm_offset_t addr, vm_size_t size, vm_size_t req, vm_size_t leftrz) addr += leftrz; + if (enabled_checks & TYPE_LEAK) { + __nosan_memset((void *)addr, KASAN_UNINITIALIZED_HEAP, req); + } + /* stash the allocation sizes in the left redzone */ struct kasan_alloc_header *h = header_for_user_addr(addr); h->magic = magic_for_addr(addr, LIVE_XOR); @@ -1206,11 +1324,32 @@ __asan_poison_cxx_array_cookie(uptr p) *shadow = ASAN_ARRAY_COOKIE; } +/* + * Unpoison the C++ array cookie (if it exists). We don't know exactly where it + * lives relative to the start of the buffer, but it's always the word immediately + * before the start of the array data, so for naturally-aligned objects we need to + * search at most 2 shadow bytes. + */ +void +kasan_unpoison_cxx_array_cookie(void *ptr) +{ + uint8_t *shadow = SHADOW_FOR_ADDRESS((uptr)ptr); + for (size_t i = 0; i < 2; i++) { + if (shadow[i] == ASAN_ARRAY_COOKIE) { + shadow[i] = ASAN_VALID; + return; + } else if (shadow[i] != ASAN_VALID) { + /* must have seen the cookie by now */ + return; + } + } +} + #define ACCESS_CHECK_DECLARE(type, sz, access) \ void __asan_##type##sz(uptr addr) { \ kasan_check_range((const void *)addr, sz, access); \ } \ - void UNSUPPORTED_API(__asan_exp_##type##sz, uptr a, int32_t b); + void OS_NORETURN UNSUPPORTED_API(__asan_exp_##type##sz, uptr a, int32_t b); ACCESS_CHECK_DECLARE(load, 1, TYPE_LOAD); ACCESS_CHECK_DECLARE(load, 2, TYPE_LOAD); @@ -1314,17 +1453,18 @@ UNUSED_ABI(__asan_version_mismatch_check_apple_900, void); UNUSED_ABI(__asan_version_mismatch_check_apple_902, void); UNUSED_ABI(__asan_version_mismatch_check_apple_1000, void); UNUSED_ABI(__asan_version_mismatch_check_apple_1001, void); +UNUSED_ABI(__asan_version_mismatch_check_apple_clang_1100, void); -void UNSUPPORTED_API(__asan_init_v5, void); -void UNSUPPORTED_API(__asan_register_globals, uptr a, uptr b); -void UNSUPPORTED_API(__asan_unregister_globals, uptr a, uptr b); -void UNSUPPORTED_API(__asan_register_elf_globals, uptr a, uptr b, uptr c); -void UNSUPPORTED_API(__asan_unregister_elf_globals, uptr a, uptr b, uptr c); +void OS_NORETURN UNSUPPORTED_API(__asan_init_v5, void); +void OS_NORETURN UNSUPPORTED_API(__asan_register_globals, uptr a, uptr b); +void OS_NORETURN UNSUPPORTED_API(__asan_unregister_globals, uptr a, uptr b); +void OS_NORETURN UNSUPPORTED_API(__asan_register_elf_globals, uptr a, uptr b, uptr c); +void OS_NORETURN UNSUPPORTED_API(__asan_unregister_elf_globals, uptr a, uptr b, uptr c); -void UNSUPPORTED_API(__asan_exp_loadN, uptr addr, size_t sz, int32_t e); -void UNSUPPORTED_API(__asan_exp_storeN, uptr addr, size_t sz, int32_t e); -void UNSUPPORTED_API(__asan_report_exp_load_n, uptr addr, unsigned long b, int32_t c); -void UNSUPPORTED_API(__asan_report_exp_store_n, uptr addr, unsigned long b, int32_t c); +void OS_NORETURN UNSUPPORTED_API(__asan_exp_loadN, uptr addr, size_t sz, int32_t e); +void OS_NORETURN UNSUPPORTED_API(__asan_exp_storeN, uptr addr, size_t sz, int32_t e); +void OS_NORETURN UNSUPPORTED_API(__asan_report_exp_load_n, uptr addr, unsigned long b, int32_t c); +void OS_NORETURN UNSUPPORTED_API(__asan_report_exp_store_n, uptr addr, unsigned long b, int32_t c); /* * @@ -1370,6 +1510,8 @@ SYSCTL_UINT(_kern_kasan, OID_AUTO, checks, CTLFLAG_RW, &enabled_checks, 0, ""); SYSCTL_UINT(_kern_kasan, OID_AUTO, quarantine, CTLFLAG_RW, &quarantine_enabled, 0, ""); SYSCTL_UINT(_kern_kasan, OID_AUTO, report_ignored, CTLFLAG_RW, &report_ignored, 0, ""); SYSCTL_UINT(_kern_kasan, OID_AUTO, free_yield_ms, CTLFLAG_RW, &free_yield, 0, ""); +SYSCTL_UINT(_kern_kasan, OID_AUTO, leak_threshold, CTLFLAG_RW, &leak_threshold, 0, ""); +SYSCTL_UINT(_kern_kasan, OID_AUTO, leak_fatal_threshold, CTLFLAG_RW, &leak_fatal_threshold, 0, ""); SYSCTL_UINT(_kern_kasan, OID_AUTO, memused, CTLFLAG_RD, &shadow_pages_used, 0, ""); SYSCTL_UINT(_kern_kasan, OID_AUTO, memtotal, CTLFLAG_RD, &shadow_pages_total, 0, ""); SYSCTL_UINT(_kern_kasan, OID_AUTO, kexts, CTLFLAG_RD, &kexts_loaded, 0, ""); diff --git a/san/kasan.h b/san/kasan.h index 308efa2e9..fcfc44462 100644 --- a/san/kasan.h +++ b/san/kasan.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2016 Apple Inc. All rights reserved. + * Copyright (c) 2000-2019 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -103,12 +103,16 @@ void kasan_notify_address_nopoison(vm_offset_t address, vm_size_t size); void kasan_unpoison_stack(vm_offset_t stack, vm_size_t size); void kasan_unpoison_curstack(bool whole_stack); bool kasan_check_shadow(vm_address_t base, vm_size_t sz, uint8_t shadow); +void kasan_unpoison_cxx_array_cookie(void *ptr); void kasan_fakestack_drop(thread_t thread); /* mark all fakestack entries for thread as unused */ void kasan_fakestack_gc(thread_t thread); /* free and poison all unused fakestack objects for thread */ void kasan_fakestack_suspend(void); void kasan_fakestack_resume(void); +/* check for uninitialized memory */ +void kasan_check_uninitialized(vm_address_t base, vm_size_t sz); + struct kasan_test; void __kasan_runtests(struct kasan_test *, int numtests); @@ -172,7 +176,7 @@ extern const uintptr_t __asan_shadow_memory_dynamic_address; ret func ## 2(__VA_ARGS__); \ ret func ## 4(__VA_ARGS__); \ ret func ## 8(__VA_ARGS__); \ - ret func ## 16(__VA_ARGS__); \ + ret func ## 16(__VA_ARGS__) __BEGIN_DECLS diff --git a/san/kasan_dynamic_blacklist.c b/san/kasan_dynamic_blacklist.c index 04d0973dc..cb661abd0 100644 --- a/san/kasan_dynamic_blacklist.c +++ b/san/kasan_dynamic_blacklist.c @@ -363,7 +363,8 @@ kasan_is_blacklisted(access_t type) return false; } - nframes = backtrace_frame(bt, MAX_FRAMES, __builtin_frame_address(0)); + nframes = backtrace_frame(bt, MAX_FRAMES, __builtin_frame_address(0), + NULL); boolean_t flag; if (nframes >= 1) { diff --git a/san/kasan_internal.h b/san/kasan_internal.h index 7a920961e..f0565a004 100644 --- a/san/kasan_internal.h +++ b/san/kasan_internal.h @@ -71,6 +71,10 @@ typedef uintptr_t uptr; #define KASAN_ARGS_NODYCHECKS 0x0100U #define KASAN_ARGS_NOPOISON_HEAP 0x0200U #define KASAN_ARGS_NOPOISON_GLOBAL 0x0400U +#define KASAN_ARGS_CHECK_LEAKS 0x0800U + +/* uninitialized memory detection */ +#define KASAN_UNINITIALIZED_HEAP 0xbe #ifndef KASAN # error KASAN undefined @@ -110,6 +114,7 @@ enum __attribute__((flag_enum)) kasan_access_types { TYPE_POISON_HEAP = BIT(14), /* no TYPE_POISON_STACK, because the runtime does not control stack poisoning */ TYPE_TEST = BIT(15), + TYPE_LEAK = BIT(16), /* masks */ TYPE_MEM = TYPE_MEMR | TYPE_MEMW, /* memory intrinsics */ @@ -130,6 +135,7 @@ enum kasan_violation_types { REASON_INVALID_SIZE = 2, /* free size did not match alloc size */ REASON_MOD_AFTER_FREE = 3, /* object modified after free */ REASON_MOD_OOB = 4, /* out of bounds modification of object */ + REASON_UNINITIALIZED = 5, /* leak of uninitialized kernel memory */ }; typedef enum kasan_access_types access_t; diff --git a/san/ksancov.c b/san/ksancov.c new file mode 100644 index 000000000..a8d7c8188 --- /dev/null +++ b/san/ksancov.c @@ -0,0 +1,769 @@ +/* + * Copyright (c) 2019 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include /* dev_t */ +#include /* must come after sys/stat.h */ +#include /* must come after sys/stat.h */ + +#include +#include +#include + +#include + +/* header mess... */ +struct uthread; +typedef struct uthread * uthread_t; + +#include +#include +#include + +#define USE_PC_TABLE 0 +#define KSANCOV_MAX_DEV 64 + +extern boolean_t ml_at_interrupt_context(void); +extern boolean_t ml_get_interrupts_enabled(void); + +static int ksancov_detach(dev_t dev); + +static int dev_major; +static size_t nedges = 0; +static uint32_t __unused npcs = 0; + +static _Atomic unsigned active_devs; + +enum { + KS_MODE_NONE, + KS_MODE_TRACE, + KS_MODE_COUNTERS, + KS_MODE_MAX +}; + +struct ksancov_dev { + unsigned mode; + + union { + struct ksancov_trace *trace; + struct ksancov_counters *counters; + }; + size_t sz; /* size of allocated trace/counters buffer */ + + size_t maxpcs; + + thread_t thread; + dev_t dev; +}; + +/* array of devices indexed by devnode minor */ +static struct ksancov_dev *ksancov_devs[KSANCOV_MAX_DEV]; + +static struct ksancov_edgemap *ksancov_edgemap; + +static inline struct ksancov_dev * +get_dev(dev_t dev) +{ + int mn = minor(dev); + return ksancov_devs[mn]; +} + +void +__sanitizer_cov_trace_pc_indirect(void * __unused callee) +{ + return; +} + +#define GUARD_SEEN (uint32_t)0x80000000 +#define GUARD_IDX_MASK (uint32_t)0x0fffffff + +static inline void __attribute__((always_inline)) +trace_pc_guard(uint32_t *guardp, void *caller) +{ + /* record the pc for this guard */ + if (guardp) { + uint32_t gd = *guardp; + if (__improbable(gd && !(gd & GUARD_SEEN) && ksancov_edgemap)) { + size_t idx = gd & GUARD_IDX_MASK; + if (idx < ksancov_edgemap->nedges) { + ksancov_edgemap->addrs[idx] = (uint32_t)(VM_KERNEL_UNSLIDE(caller) - VM_MIN_KERNEL_ADDRESS - 1); + *guardp |= GUARD_SEEN; + } + } + } + + if (__probable(os_atomic_load(&active_devs, relaxed) == 0)) { + /* early exit when nothing is active */ + return; + } + + if (ml_at_interrupt_context()) { + return; + } + + uint32_t pc = (uint32_t)(VM_KERNEL_UNSLIDE(caller) - VM_MIN_KERNEL_ADDRESS - 1); + + thread_t th = current_thread(); + if (__improbable(th == THREAD_NULL)) { + return; + } + + struct ksancov_dev *dev = *(struct ksancov_dev **)__sanitizer_get_thread_data(th); + if (__probable(dev == NULL)) { + return; + } + + if (dev->mode == KS_MODE_TRACE) { + struct ksancov_trace *trace = dev->trace; + if (os_atomic_load(&trace->enabled, relaxed) == 0) { + return; + } + + if (os_atomic_load(&trace->head, relaxed) >= dev->maxpcs) { + return; /* overflow */ + } + + uint32_t idx = os_atomic_inc_orig(&trace->head, relaxed); + if (__improbable(idx >= dev->maxpcs)) { + return; + } + + trace->pcs[idx] = pc; + } else { + size_t idx = *guardp & GUARD_IDX_MASK; + + struct ksancov_counters *counters = dev->counters; + if (os_atomic_load(&counters->enabled, relaxed) == 0) { + return; + } + + /* saturating 8bit add */ + if (counters->hits[idx] < KSANCOV_MAX_HITS) { + counters->hits[idx]++; + } + } +} + +void __attribute__((noinline)) +__sanitizer_cov_trace_pc(void) +{ + trace_pc_guard(NULL, __builtin_return_address(0)); +} + +void __attribute__((noinline)) +__sanitizer_cov_trace_pc_guard(uint32_t *guardp) +{ + trace_pc_guard(guardp, __builtin_return_address(0)); +} + +void +__sanitizer_cov_trace_pc_guard_init(uint32_t *start, uint32_t *stop) +{ + /* assign a unique number to each guard */ + for (; start != stop; start++) { + if (*start == 0) { + if (nedges < KSANCOV_MAX_EDGES) { + *start = ++nedges; + } + } + } +} + +void +__sanitizer_cov_pcs_init(uintptr_t *start, uintptr_t *stop) +{ +#if USE_PC_TABLE + static const uintptr_t pc_table_seen_flag = 0x100; + + for (; start < stop; start += 2) { + uintptr_t pc = start[0]; + uintptr_t flags = start[1]; + + /* + * This function gets called multiple times on the same range, so mark the + * ones we've seen using unused bits in the flags field. + */ + if (flags & pc_table_seen_flag) { + continue; + } + + start[1] |= pc_table_seen_flag; + assert(npcs < KSANCOV_MAX_EDGES - 1); + edge_addrs[++npcs] = pc; + } +#else + (void)start; + (void)stop; +#endif +} + +static void * +ksancov_do_map(uintptr_t base, size_t sz, vm_prot_t prot) +{ + kern_return_t kr; + mach_port_t mem_entry = MACH_PORT_NULL; + mach_vm_address_t user_addr = 0; + memory_object_size_t size = sz; + + kr = mach_make_memory_entry_64(kernel_map, + &size, + (mach_vm_offset_t)base, + MAP_MEM_VM_SHARE | prot, + &mem_entry, + MACH_PORT_NULL); + if (kr != KERN_SUCCESS) { + return NULL; + } + + kr = mach_vm_map_kernel(get_task_map(current_task()), + &user_addr, + size, + 0, + VM_FLAGS_ANYWHERE, + VM_MAP_KERNEL_FLAGS_NONE, + VM_KERN_MEMORY_NONE, + mem_entry, + 0, + FALSE, + prot, + prot, + VM_INHERIT_SHARE); + + /* + * At this point, either vm_map() has taken a reference on the memory entry + * and we can release our local reference, or the map failed and the entry + * needs to be freed. + */ + mach_memory_entry_port_release(mem_entry); + + if (kr != KERN_SUCCESS) { + return NULL; + } + + return (void *)user_addr; +} + +/* + * map the sancov buffer into the current process + */ +static int +ksancov_map(dev_t dev, void **bufp, size_t *sizep) +{ + struct ksancov_dev *d = get_dev(dev); + if (!d) { + return EINVAL; + } + + uintptr_t addr; + size_t size = d->sz; + + if (d->mode == KS_MODE_TRACE) { + if (!d->trace) { + return EINVAL; + } + addr = (uintptr_t)d->trace; + } else if (d->mode == KS_MODE_COUNTERS) { + if (!d->counters) { + return EINVAL; + } + addr = (uintptr_t)d->counters; + } else { + return EINVAL; /* not configured */ + } + + void *buf = ksancov_do_map(addr, size, VM_PROT_READ | VM_PROT_WRITE); + if (buf == NULL) { + return ENOMEM; + } + + *bufp = buf; + *sizep = size; + return 0; +} + +/* + * map the edge -> pc mapping as read-only + */ +static int +ksancov_map_edgemap(dev_t dev, void **bufp, size_t *sizep) +{ + struct ksancov_dev *d = get_dev(dev); + if (!d) { + return EINVAL; + } + + uintptr_t addr = (uintptr_t)ksancov_edgemap; + size_t size = sizeof(struct ksancov_edgemap) + ksancov_edgemap->nedges * sizeof(uint32_t); + + void *buf = ksancov_do_map(addr, size, VM_PROT_READ); + if (buf == NULL) { + return ENOMEM; + } + + *bufp = buf; + *sizep = size; + return 0; +} + + +/* + * Device node management + */ + +static int +ksancov_open(dev_t dev, int flags, int devtype, proc_t p) +{ +#pragma unused(flags,devtype,p) + if (minor(dev) >= KSANCOV_MAX_DEV) { + return EBUSY; + } + + /* allocate a device entry */ + struct ksancov_dev *d = kalloc_tag(sizeof(struct ksancov_dev), VM_KERN_MEMORY_DIAG); + if (!d) { + return ENOMEM; + } + + d->mode = KS_MODE_NONE; + d->trace = NULL; + d->maxpcs = 1024U * 64; /* default to 256k buffer => 64k pcs */ + d->dev = dev; + d->thread = THREAD_NULL; + + ksancov_devs[minor(dev)] = d; + + return 0; +} + +static int +ksancov_trace_alloc(dev_t dev, size_t maxpcs) +{ + struct ksancov_dev *d = get_dev(dev); + if (!d) { + return EINVAL; + } + + if (d->mode != KS_MODE_NONE) { + return EBUSY; /* trace/counters already created */ + } + assert(d->trace == NULL); + + uintptr_t buf; + size_t sz; + if (os_mul_and_add_overflow(maxpcs, sizeof(uint32_t), sizeof(struct ksancov_trace), &sz)) { + return EINVAL; + } + + /* allocate the shared memory buffer */ + kern_return_t kr = kmem_alloc_flags(kernel_map, &buf, sz, VM_KERN_MEMORY_DIAG, KMA_ZERO); + if (kr != KERN_SUCCESS) { + return ENOMEM; + } + + struct ksancov_trace *trace = (struct ksancov_trace *)buf; + trace->magic = KSANCOV_TRACE_MAGIC; + trace->offset = VM_MIN_KERNEL_ADDRESS; + trace->head = 0; + trace->enabled = 0; + trace->maxpcs = maxpcs; + + d->trace = trace; + d->sz = sz; + d->maxpcs = maxpcs; + d->mode = KS_MODE_TRACE; + + return 0; +} + +static int +ksancov_counters_alloc(dev_t dev) +{ + struct ksancov_dev *d = get_dev(dev); + if (!d) { + return EINVAL; + } + + if (d->mode != KS_MODE_NONE) { + return EBUSY; /* trace/counters already created */ + } + assert(d->counters == NULL); + + uintptr_t buf; + size_t sz = sizeof(struct ksancov_counters) + ksancov_edgemap->nedges * sizeof(uint8_t); + + /* allocate the shared memory buffer */ + kern_return_t kr = kmem_alloc_flags(kernel_map, &buf, sz, VM_KERN_MEMORY_DIAG, KMA_ZERO); + if (kr != KERN_SUCCESS) { + return ENOMEM; + } + + struct ksancov_counters *counters = (struct ksancov_counters *)buf; + counters->magic = KSANCOV_COUNTERS_MAGIC; + counters->nedges = ksancov_edgemap->nedges; + counters->enabled = 0; + + d->counters = counters; + d->sz = sz; + d->mode = KS_MODE_COUNTERS; + + return 0; +} + +/* + * attach a thread to a ksancov dev instance + */ +static int +ksancov_attach(dev_t dev, thread_t th) +{ + struct ksancov_dev *d = get_dev(dev); + if (!d) { + return EINVAL; + } + + if (d->thread != THREAD_NULL) { + int ret = ksancov_detach(dev); + if (ret) { + return ret; + } + } + + if (th != current_thread()) { + /* can only attach to self presently */ + return EINVAL; + } + + struct ksancov_dev **devp = (void *)__sanitizer_get_thread_data(th); + if (*devp) { + return EBUSY; /* one dev per thread */ + } + + d->thread = th; + thread_reference(d->thread); + + os_atomic_store(devp, d, relaxed); + os_atomic_add(&active_devs, 1, relaxed); + + return 0; +} + +extern void +thread_wait( + thread_t thread, + boolean_t until_not_runnable); + + +/* + * disconnect thread from ksancov dev + */ +static int +ksancov_detach(dev_t dev) +{ + struct ksancov_dev *d = get_dev(dev); + if (!d) { + return EINVAL; + } + + if (d->thread == THREAD_NULL) { + /* no thread attached */ + return 0; + } + + /* disconnect dev from thread */ + struct ksancov_dev **devp = (void *)__sanitizer_get_thread_data(d->thread); + if (*devp != NULL) { + assert(*devp == d); + os_atomic_store(devp, NULL, relaxed); + } + + if (d->thread != current_thread()) { + /* wait until it's safe to yank */ + thread_wait(d->thread, TRUE); + } + + /* drop our thread reference */ + thread_deallocate(d->thread); + d->thread = THREAD_NULL; + + return 0; +} + +static int +ksancov_close(dev_t dev, int flags, int devtype, proc_t p) +{ +#pragma unused(flags,devtype,p) + struct ksancov_dev *d = get_dev(dev); + if (!d) { + return EINVAL; + } + + if (d->mode == KS_MODE_TRACE) { + struct ksancov_trace *trace = d->trace; + if (trace) { + /* trace allocated - delete it */ + + os_atomic_sub(&active_devs, 1, relaxed); + os_atomic_store(&trace->enabled, 0, relaxed); /* stop tracing */ + + ksancov_detach(dev); + + /* free trace */ + kmem_free(kernel_map, (uintptr_t)d->trace, d->sz); + d->trace = NULL; + d->sz = 0; + } + } else if (d->mode == KS_MODE_COUNTERS) { + struct ksancov_counters *counters = d->counters; + if (counters) { + os_atomic_sub(&active_devs, 1, relaxed); + os_atomic_store(&counters->enabled, 0, relaxed); /* stop tracing */ + + ksancov_detach(dev); + + /* free counters */ + kmem_free(kernel_map, (uintptr_t)d->counters, d->sz); + d->counters = NULL; + d->sz = 0; + } + } + + ksancov_devs[minor(dev)] = NULL; /* dev no longer discoverable */ + + /* free the ksancov device instance */ + kfree(d, sizeof(struct ksancov_dev)); + + return 0; +} + +static void +ksancov_testpanic(volatile uint64_t guess) +{ + const uint64_t tgt = 0xf85de3b12891c817UL; + +#define X(n) ((tgt & (0xfUL << (4*n))) == (guess & (0xfUL << (4*n)))) + + if (X(0)) { + if (X(1)) { + if (X(2)) { + if (X(3)) { + if (X(4)) { + if (X(5)) { + if (X(6)) { + if (X(7)) { + if (X(8)) { + if (X(9)) { + if (X(10)) { + if (X(11)) { + if (X(12)) { + if (X(13)) { + if (X(14)) { + if (X(15)) { + panic("ksancov: found test value\n"); + } + } + } + } + } + } + } + } + } + } + } + } + } + } + } + } +} + +static int +ksancov_ioctl(dev_t dev, unsigned long cmd, caddr_t _data, int fflag, proc_t p) +{ +#pragma unused(fflag,p) + int ret = 0; + void *data = (void *)_data; + + struct ksancov_dev *d = get_dev(dev); + if (!d) { + return EINVAL; /* dev not open */ + } + + if (cmd == KSANCOV_IOC_TRACE) { + size_t maxpcs = *(size_t *)data; + ret = ksancov_trace_alloc(dev, maxpcs); + if (ret) { + return ret; + } + } else if (cmd == KSANCOV_IOC_COUNTERS) { + ret = ksancov_counters_alloc(dev); + if (ret) { + return ret; + } + } else if (cmd == KSANCOV_IOC_MAP) { + struct ksancov_buf_desc *mcmd = (struct ksancov_buf_desc *)data; + + if (d->mode == KS_MODE_NONE) { + return EINVAL; /* mode not configured */ + } + + /* map buffer into the userspace VA space */ + void *buf; + size_t size; + ret = ksancov_map(dev, &buf, &size); + if (ret) { + return ret; + } + + mcmd->ptr = (uintptr_t)buf; + mcmd->sz = size; + } else if (cmd == KSANCOV_IOC_MAP_EDGEMAP) { + struct ksancov_buf_desc *mcmd = (struct ksancov_buf_desc *)data; + + /* map buffer into the userspace VA space */ + void *buf; + size_t size; + ret = ksancov_map_edgemap(dev, &buf, &size); + if (ret) { + return ret; + } + + mcmd->ptr = (uintptr_t)buf; + mcmd->sz = size; + } else if (cmd == KSANCOV_IOC_START) { + if (d->mode == KS_MODE_NONE) { + return EINVAL; /* not configured */ + } + + ret = ksancov_attach(dev, current_thread()); + if (ret) { + return ret; + } + } else if (cmd == KSANCOV_IOC_NEDGES) { + size_t *nptr = (size_t *)data; + *nptr = nedges; + } else if (cmd == KSANCOV_IOC_TESTPANIC) { + uint64_t guess = *(uint64_t *)data; + ksancov_testpanic(guess); + } else { + /* unknown ioctl */ + return ENODEV; + } + + return ret; +} + +static int +ksancov_dev_clone(dev_t dev, int action) +{ +#pragma unused(dev) + if (action == DEVFS_CLONE_ALLOC) { + for (size_t i = 0; i < KSANCOV_MAX_DEV; i++) { + if (ksancov_devs[i] == NULL) { + return i; + } + } + } else if (action == DEVFS_CLONE_FREE) { + return 0; + } + + return -1; +} + +static struct cdevsw + ksancov_cdev = { + .d_open = ksancov_open, + .d_close = ksancov_close, + .d_ioctl = ksancov_ioctl, + + .d_read = eno_rdwrt, + .d_write = eno_rdwrt, + .d_stop = eno_stop, + .d_reset = eno_reset, + .d_select = eno_select, + .d_mmap = eno_mmap, + .d_strategy = eno_strat, + .d_type = 0 +}; + +int +ksancov_init_dev(void) +{ + dev_major = cdevsw_add(-1, &ksancov_cdev); + if (dev_major < 0) { + printf("ksancov: failed to allocate major device node\n"); + return -1; + } + + dev_t dev = makedev(dev_major, 0); + void *node = devfs_make_node_clone(dev, DEVFS_CHAR, UID_ROOT, GID_WHEEL, 0666, + ksancov_dev_clone, KSANCOV_DEVNODE); + if (!node) { + printf("ksancov: failed to create device node\n"); + return -1; + } + + /* This could be moved to the first use of /dev/ksancov to save memory */ + uintptr_t buf; + size_t sz = sizeof(struct ksancov_edgemap) + KSANCOV_MAX_EDGES * sizeof(uint32_t); + + kern_return_t kr = kmem_alloc_flags(kernel_map, &buf, sz, VM_KERN_MEMORY_DIAG, KMA_ZERO); + if (kr) { + printf("ksancov: failed to allocate edge addr map\n"); + return -1; + } + + ksancov_edgemap = (void *)buf; + ksancov_edgemap->magic = KSANCOV_EDGEMAP_MAGIC; + ksancov_edgemap->nedges = nedges; + ksancov_edgemap->offset = VM_MIN_KERNEL_ADDRESS; + + return 0; +} diff --git a/san/ksancov.h b/san/ksancov.h new file mode 100644 index 000000000..80936b49f --- /dev/null +++ b/san/ksancov.h @@ -0,0 +1,319 @@ +/* + * Copyright (c) 2019 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#ifndef _KSANCOV_H_ +#define _KSANCOV_H_ + +#include +#include +#include + +#define KSANCOV_DEVNODE "ksancov" +#define KSANCOV_PATH "/dev/" KSANCOV_DEVNODE + +/* + * ioctl + */ + +struct ksancov_buf_desc { + uintptr_t ptr; /* ptr to shared buffer [out] */ + size_t sz; /* size of shared buffer [out] */ +}; + +/* Set mode */ +#define KSANCOV_IOC_TRACE _IOW('K', 1, size_t) /* number of pcs */ +#define KSANCOV_IOC_COUNTERS _IO('K', 2) + +/* Establish a shared mapping of the coverage buffer. */ +#define KSANCOV_IOC_MAP _IOWR('K', 8, struct ksancov_buf_desc) + +/* Establish a shared mapping of the edge address buffer. */ +#define KSANCOV_IOC_MAP_EDGEMAP _IOWR('K', 9, struct ksancov_buf_desc) + +/* Log the current thread */ +#define KSANCOV_IOC_START _IOW('K', 10, uintptr_t) + +#define KSANCOV_IOC_NEDGES _IOR('K', 50, size_t) + +#define KSANCOV_IOC_TESTPANIC _IOW('K', 20, uint64_t) + + +/* + * shared kernel-user mapping + */ + +#define KSANCOV_MAX_EDGES 512UL*1024 +#define KSANCOV_MAX_HITS UINT8_MAX +#define KSANCOV_TRACE_MAGIC (uint32_t)0x5AD17F5BU +#define KSANCOV_COUNTERS_MAGIC (uint32_t)0x5AD27F6BU +#define KSANCOV_EDGEMAP_MAGIC (uint32_t)0x5AD37F7BU + +struct ksancov_header { + uint32_t magic; + _Atomic uint32_t enabled; +}; + +struct ksancov_trace { + /* userspace R/O fields */ + union { + struct ksancov_header hdr; + struct { + uint32_t magic; + _Atomic uint32_t enabled; + }; + }; + + uintptr_t offset; /* pc entries relative to this */ + uint32_t maxpcs; + _Atomic uint32_t head; + uint32_t pcs[]; +}; + +struct ksancov_counters { + union { + struct ksancov_header hdr; + struct { + uint32_t magic; + _Atomic uint32_t enabled; + }; + }; + + uint32_t nedges; /* total number of edges */ + uint8_t hits[]; /* hits on each edge (8bit saturating) */ +}; + +struct ksancov_edgemap { + uint32_t magic; + uint32_t nedges; + uintptr_t offset; /* edge addrs relative to this */ + uint32_t addrs[]; /* address of each edge relative to 'offset' */ +}; + +#if XNU_KERNEL_PRIVATE +int ksancov_init_dev(void); +void **__sanitizer_get_thread_data(thread_t); + +/* + * SanitizerCoverage ABI + */ +extern void __sanitizer_cov_trace_pc_guard(uint32_t *guard); +extern void __sanitizer_cov_trace_pc_guard_init(uint32_t *start, uint32_t *stop); +extern void __sanitizer_cov_pcs_init(uintptr_t *start, uintptr_t *stop); +extern void __sanitizer_cov_trace_pc(void); +extern void __sanitizer_cov_trace_pc_indirect(void *callee); +#endif + +#ifndef KERNEL + +#include +#include +#include + +/* + * ksancov userspace API + * + * Usage: + * 1) open the ksancov device + * 2) set the coverage mode (trace or edge counters) + * 3) map the coverage buffer + * 4) start the trace on a thread + * 5) flip the enable bit + */ + +static inline int +ksancov_open(void) +{ + return open(KSANCOV_PATH, 0); +} + +static inline int +ksancov_map(int fd, uintptr_t *buf, size_t *sz) +{ + int ret; + struct ksancov_buf_desc mc = {0}; + + ret = ioctl(fd, KSANCOV_IOC_MAP, &mc); + if (ret == -1) { + return errno; + } + + *buf = mc.ptr; + if (sz) { + *sz = mc.sz; + } + + struct ksancov_trace *trace = (void *)mc.ptr; + assert(trace->magic == KSANCOV_TRACE_MAGIC || + trace->magic == KSANCOV_COUNTERS_MAGIC); + + return 0; +} + +static inline int +ksancov_map_edgemap(int fd, uintptr_t *buf, size_t *sz) +{ + int ret; + struct ksancov_buf_desc mc = {0}; + + ret = ioctl(fd, KSANCOV_IOC_MAP_EDGEMAP, &mc); + if (ret == -1) { + return errno; + } + + *buf = mc.ptr; + if (sz) { + *sz = mc.sz; + } + + struct ksancov_trace *trace = (void *)mc.ptr; + assert(trace->magic == KSANCOV_EDGEMAP_MAGIC); + + return 0; +} + +static inline size_t +ksancov_nedges(int fd) +{ + size_t nedges; + int ret = ioctl(fd, KSANCOV_IOC_NEDGES, &nedges); + if (ret == -1) { + return SIZE_MAX; + } + return nedges; +} + +static inline int +ksancov_mode_trace(int fd, size_t entries) +{ + int ret; + ret = ioctl(fd, KSANCOV_IOC_TRACE, &entries); + if (ret == -1) { + return errno; + } + return 0; +} + +static inline int +ksancov_mode_counters(int fd) +{ + int ret; + ret = ioctl(fd, KSANCOV_IOC_COUNTERS); + if (ret == -1) { + return errno; + } + return 0; +} + +static inline int +ksancov_thread_self(int fd) +{ + int ret; + uintptr_t th = 0; + ret = ioctl(fd, KSANCOV_IOC_START, &th); + if (ret == -1) { + return errno; + } + return 0; +} + +static inline int +ksancov_start(void *buf) +{ + struct ksancov_header *hdr = (struct ksancov_header *)buf; + atomic_store_explicit(&hdr->enabled, 1, memory_order_relaxed); + return 0; +} + +static inline int +ksancov_stop(void *buf) +{ + struct ksancov_header *hdr = (struct ksancov_header *)buf; + atomic_store_explicit(&hdr->enabled, 0, memory_order_relaxed); + return 0; +} + +static inline int +ksancov_reset(void *buf) +{ + struct ksancov_header *hdr = (struct ksancov_header *)buf; + if (hdr->magic == KSANCOV_TRACE_MAGIC) { + struct ksancov_trace *trace = (struct ksancov_trace *)buf; + atomic_store_explicit(&trace->head, 0, memory_order_relaxed); + } else if (hdr->magic == KSANCOV_COUNTERS_MAGIC) { + struct ksancov_counters *counters = (struct ksancov_counters *)buf; + bzero(counters->hits, counters->nedges); + } else { + return EINVAL; + } + return 0; +} + +static inline uintptr_t +ksancov_edge_addr(struct ksancov_edgemap *addrs, size_t idx) +{ + assert(addrs); + if (idx >= addrs->nedges) { + return 0; + } + return addrs->addrs[idx] + addrs->offset; +} + +static inline size_t +ksancov_trace_max_pcs(struct ksancov_trace *trace) +{ + return trace->maxpcs; +} + +static inline uintptr_t +ksancov_trace_offset(struct ksancov_trace *trace) +{ + assert(trace); + return trace->offset; +} + +static inline size_t +ksancov_trace_head(struct ksancov_trace *trace) +{ + size_t maxlen = trace->maxpcs; + size_t head = atomic_load_explicit(&trace->head, memory_order_acquire); + return head < maxlen ? head : maxlen; +} + +static inline uintptr_t +ksancov_trace_entry(struct ksancov_trace *trace, size_t i) +{ + if (i >= trace->head) { + return 0; + } + + return trace->pcs[i] + trace->offset; +} + +#endif + +#endif /* _KSANCOV_H_ */ diff --git a/san/memintrinsics.h b/san/memintrinsics.h index 9e5f2eda2..0c0a11ece 100644 --- a/san/memintrinsics.h +++ b/san/memintrinsics.h @@ -54,7 +54,7 @@ __nosan_bcmp(const void *a, const void *b, size_t sz) static inline void __nosan_bcopy(const void *src, void *dst, size_t sz) { - return bcopy(src, dst, sz); + bcopy(src, dst, sz); } static inline int __nosan_memcmp(const void *a, const void *b, size_t sz) @@ -64,7 +64,7 @@ __nosan_memcmp(const void *a, const void *b, size_t sz) static inline void __nosan_bzero(void *dst, size_t sz) { - return bzero(dst, sz); + bzero(dst, sz); } static inline size_t diff --git a/san/tools/ksancov.c b/san/tools/ksancov.c new file mode 100644 index 000000000..0a35fc0d7 --- /dev/null +++ b/san/tools/ksancov.c @@ -0,0 +1,217 @@ +#if 0 +CC = clang + CFLAGS = -O3 + $(MAKEFILE_LIST:.c = ): + + ifeq (0, 1) + * / +#endif + +/* + * Copyright (c) 2019 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#include +#include +#include +#include +#include +#include + +#include +#include + +#include + +#include "../ksancov.h" + + static void + usage(void) + { + fprintf(stderr, + "usage: ./ksancov [OPTIONS]\n\n" + " -t | --trace use trace (PC log) mode [default]\n" + " -c | --counters use edge counter mode\n" + " -n | --entries override max entries in trace log\n" + " -x | --exec instrument execution of binary at \n"); + exit(1); + } + + int + main(int argc, char *argv[]) + { + struct ksancov_trace *trace = NULL; + struct ksancov_counters *counters = NULL; + struct ksancov_header *header = NULL; + + int ret; + size_t max_entries = 64UL * 1024; + char *path = NULL; + bool docounters = false; + + struct option opts[] = { + { "entries", required_argument, NULL, 'n' }, + { "exec", required_argument, NULL, 'x' }, + + { "trace", no_argument, NULL, 't' }, + { "counters", no_argument, NULL, 'c' }, + + { NULL, 0, NULL, 0 } + }; + + int ch; + while ((ch = getopt_long(argc, argv, "tn:x:c", opts, NULL)) != -1) { + switch (ch) { + case 'n': + max_entries = strtoul(optarg, NULL, 0); + break; + case 'x': + path = optarg; + break; + case 't': + docounters = false; + break; + case 'c': + docounters = true; + break; + default: + usage(); + } + ; + } + + int fd; + uintptr_t addr; + size_t sz; + + fd = ksancov_open(); + if (fd < 0) { + perror("ksancov_open"); + return errno; + } + fprintf(stderr, "opened ksancov on fd %i\n", fd); + + uintptr_t e; + ret = ksancov_map_edgemap(fd, &e, NULL); + if (ret) { + perror("ksancov map counters\n"); + return ret; + } + struct ksancov_edgemap *map = (void *)e; + fprintf(stderr, "nedges (edgemap) = %u\n", map->nedges); + + if (docounters) { + ret = ksancov_mode_counters(fd); + if (ret) { + perror("ksancov set mode\n"); + return ret; + } + } else { + ret = ksancov_mode_trace(fd, max_entries); + if (ret) { + perror("ksancov set mode\n"); + return ret; + } + } + + ret = ksancov_map(fd, &addr, &sz); + if (ret) { + perror("ksancov map"); + return ret; + } + fprintf(stderr, "mapped to 0x%lx + %lu\n", addr, sz); + + if (docounters) { + counters = (void *)addr; + fprintf(stderr, "nedges (counters) = %u\n", counters->nedges); + } else { + trace = (void *)addr; + fprintf(stderr, "maxpcs = %lu\n", ksancov_trace_max_pcs(trace)); + } + header = (void *)addr; + + if (path) { + int pid = fork(); + if (pid == 0) { + /* child */ + + ret = ksancov_thread_self(fd); + if (ret) { + perror("ksancov thread"); + return ret; + } + + ksancov_reset(header); + ksancov_start(header); + ret = execl(path, path, 0); + perror("execl"); + + exit(1); + } else { + /* parent */ + waitpid(pid, NULL, 0); + ksancov_stop(header); + } + } else { + ret = ksancov_thread_self(fd); + if (ret) { + perror("ksancov thread"); + return ret; + } + + ksancov_reset(header); + ksancov_start(header); + int ppid = getppid(); + ksancov_stop(header); + fprintf(stderr, "ppid = %i\n", ppid); + } + + if (docounters) { + for (size_t i = 0; i < counters->nedges; i++) { + size_t hits = counters->hits[i]; + if (hits) { + fprintf(stderr, "0x%lx: %lu hits [idx %lu]\n", ksancov_edge_addr(map, i), hits, i); + } + } + } else { + size_t head = ksancov_trace_head(trace); + fprintf(stderr, "head = %lu\n", head); + for (uint32_t i = 0; i < head; i++) { + uintptr_t pc = ksancov_trace_entry(trace, i); + fprintf(stderr, "0x%lx\n", pc); + } + } + + ret = close(fd); + fprintf(stderr, "close = %i\n", ret); + + return 0; + } + +/* + * endif + # */ diff --git a/san/ubsan-blacklist b/san/ubsan-blacklist index 2e48edff5..1104fdd17 100644 --- a/san/ubsan-blacklist +++ b/san/ubsan-blacklist @@ -1,9 +1,15 @@ [.*] src:./san/ubsan* +[bounds] +src:./osfmk/corecrypto/* + [alignment] src:./libsa/bootstrap.cpp src:./bsd/net/necp_client.c src:./pexpert/arm/pe_identify_machine.c +[object-size] +src:./osfmk/i386/locks_i386.c + diff --git a/san/ubsan.c b/san/ubsan.c index 86b6d293b..04eea747f 100644 --- a/san/ubsan.c +++ b/san/ubsan.c @@ -33,7 +33,7 @@ static const bool ubsan_print = false; static const uint32_t line_acquired = 0x80000000UL; - +static const char *get_type_check_kind(uint8_t kind); static size_t format_loc(struct san_src_loc *loc, char *dst, size_t sz) { @@ -98,24 +98,41 @@ format_shift(struct ubsan_violation *v, char *buf, size_t sz) return n; } -static const char *const -align_kinds[] = { - "load", - "store", - "", - "member access", - "", +static const char * const +type_check_kinds[] = { + "load of", "store to", "reference binding to", "member access within", + "member call on", "constructor call on", "downcast of", "downcast of", + "upcast of", "cast to virtual base of", "_Nonnull binding to" }; +static const char * +get_type_check_kind(uint8_t kind) +{ + return (kind < (sizeof(type_check_kinds) / sizeof(type_check_kinds[0]))) + ? type_check_kinds[kind] + : "some"; +} + static size_t -format_alignment(struct ubsan_violation *v, char *buf, size_t sz) +format_type_mismatch(struct ubsan_violation *v, char *buf, size_t sz) { size_t n = 0; - struct san_type_desc *ty = v->align->ty; + size_t alignment = 1 << v->align->align; + void *ptr = (void*)v->lhs; + const char * kind = get_type_check_kind(v->align->kind); + if (NULL == ptr) { + //null pointer use + n += snprintf(buf + n, sz - n, "%s NULL pointer of type %s\n", kind, v->align->ty->name); + } else if (alignment && ((uintptr_t)ptr & (alignment - 1))) { + //misaligned pointer use + n += snprintf(buf + n, sz - n, "%s mis-aligned address %p for type %s ", kind, (void*)v->lhs, v->align->ty->name); + n += snprintf(buf + n, sz - n, "which requires %d byte alignment\n", 1 << v->align->align); + } else { + //insufficient object size + n += snprintf(buf + n, sz - n, "%s address %p with insufficient space for an object of type %s\n", + kind, ptr, v->align->ty->name); + } - n += snprintf(buf + n, sz - n, "mis-aligned %s of 0x%llx\n", align_kinds[v->align->kind], v->lhs); - n += snprintf(buf + n, sz - n, " expected %d-byte alignment, type = %s\n", - 1 << v->align->align, ty->name); return n; } @@ -150,8 +167,8 @@ ubsan_format(struct ubsan_violation *v, char *buf, size_t sz) case UBSAN_SHIFT: n += format_shift(v, buf + n, sz - n); break; - case UBSAN_ALIGN: - n += format_alignment(v, buf + n, sz - n); + case UBSAN_TYPE_MISMATCH: + n += format_type_mismatch(v, buf + n, sz - n); break; case UBSAN_POINTER_OVERFLOW: n += snprintf(buf + n, sz - n, "pointer overflow, before = 0x%llx, after = 0x%llx\n", v->lhs, v->rhs); @@ -159,6 +176,9 @@ ubsan_format(struct ubsan_violation *v, char *buf, size_t sz) case UBSAN_OOB: n += format_oob(v, buf + n, sz - n); break; + case UBSAN_GENERIC: + n += snprintf(buf + n, sz - n, "%s\n", v->func); + break; default: panic("unknown violation"); } @@ -236,14 +256,14 @@ DEFINE_OVERFLOW(negate) void __ubsan_handle_type_mismatch_v1(struct ubsan_align_desc *desc, uint64_t val) { - struct ubsan_violation v = { UBSAN_ALIGN, val, 0, .align = desc, &desc->loc }; + struct ubsan_violation v = { UBSAN_TYPE_MISMATCH, val, 0, .align = desc, &desc->loc }; ubsan_handle(&v, false); } void __ubsan_handle_type_mismatch_v1_abort(struct ubsan_align_desc *desc, uint64_t val) { - struct ubsan_violation v = { UBSAN_ALIGN, val, 0, .align = desc, &desc->loc }; + struct ubsan_violation v = { UBSAN_TYPE_MISMATCH, val, 0, .align = desc, &desc->loc }; ubsan_handle(&v, true); } @@ -274,3 +294,27 @@ __ubsan_handle_out_of_bounds_abort(struct ubsan_oob_desc *desc, uint64_t idx) struct ubsan_violation v = { UBSAN_OOB, idx, 0, .oob = desc, &desc->loc }; ubsan_handle(&v, true); } + +#define DEFINE_GENERIC(check) \ + void __ubsan_handle_##check (struct san_src_loc* loc) \ + { \ + struct ubsan_violation v = { UBSAN_GENERIC, 0, 0, .func = __func__, loc }; \ + ubsan_handle(&v, false); \ + } \ + void __ubsan_handle_##check##_abort(struct san_src_loc* loc) \ + { \ + struct ubsan_violation v = { UBSAN_GENERIC, 0, 0, .func = __func__, loc }; \ + ubsan_handle(&v, true); \ + } + +DEFINE_GENERIC(invalid_builtin) +DEFINE_GENERIC(load_invalid_value) +DEFINE_GENERIC(nonnull_arg) +DEFINE_GENERIC(vla_bound_not_positive) +DEFINE_GENERIC(float_cast_overflow) +DEFINE_GENERIC(function_type_mismatch) +DEFINE_GENERIC(missing_return) +DEFINE_GENERIC(nonnull_return) +DEFINE_GENERIC(nullability_arg) +DEFINE_GENERIC(nullability_return) +DEFINE_GENERIC(implicit_conversion) diff --git a/san/ubsan.h b/san/ubsan.h index e78dacefc..9dff870a2 100644 --- a/san/ubsan.h +++ b/san/ubsan.h @@ -95,6 +95,8 @@ enum { UBSAN_ALIGN, UBSAN_POINTER_OVERFLOW, UBSAN_OOB, + UBSAN_GENERIC, + UBSAN_TYPE_MISMATCH, UBSAN_VIOLATION_MAX, }; @@ -109,6 +111,7 @@ struct ubsan_violation { struct ubsan_align_desc *align; struct ubsan_ptroverflow_desc *ptroverflow; struct ubsan_oob_desc *oob; + const char *func; }; struct san_src_loc *loc; }; @@ -121,23 +124,47 @@ size_t ubsan_format(struct ubsan_violation *, char *buf, size_t sz); */ void __ubsan_handle_add_overflow(struct ubsan_overflow_desc *, uint64_t lhs, uint64_t rhs); -void __ubsan_handle_sub_overflow(struct ubsan_overflow_desc *, uint64_t lhs, uint64_t rhs); -void __ubsan_handle_mul_overflow(struct ubsan_overflow_desc *, uint64_t lhs, uint64_t rhs); -void __ubsan_handle_divrem_overflow(struct ubsan_overflow_desc *, uint64_t lhs, uint64_t rhs); -void __ubsan_handle_negate_overflow(struct ubsan_overflow_desc *, uint64_t lhs, uint64_t rhs); void __ubsan_handle_add_overflow_abort(struct ubsan_overflow_desc *, uint64_t lhs, uint64_t rhs); -void __ubsan_handle_sub_overflow_abort(struct ubsan_overflow_desc *, uint64_t lhs, uint64_t rhs); -void __ubsan_handle_mul_overflow_abort(struct ubsan_overflow_desc *, uint64_t lhs, uint64_t rhs); +void __ubsan_handle_builtin_unreachable(struct ubsan_unreachable_desc *); +void __ubsan_handle_divrem_overflow(struct ubsan_overflow_desc *, uint64_t lhs, uint64_t rhs); void __ubsan_handle_divrem_overflow_abort(struct ubsan_overflow_desc *, uint64_t lhs, uint64_t rhs); +void __ubsan_handle_mul_overflow(struct ubsan_overflow_desc *, uint64_t lhs, uint64_t rhs); +void __ubsan_handle_mul_overflow_abort(struct ubsan_overflow_desc *, uint64_t lhs, uint64_t rhs); +void __ubsan_handle_negate_overflow(struct ubsan_overflow_desc *, uint64_t lhs, uint64_t rhs); void __ubsan_handle_negate_overflow_abort(struct ubsan_overflow_desc *, uint64_t lhs, uint64_t rhs); -void __ubsan_handle_builtin_unreachable(struct ubsan_unreachable_desc *); +void __ubsan_handle_out_of_bounds(struct ubsan_oob_desc *, uint64_t idx); +void __ubsan_handle_out_of_bounds_abort(struct ubsan_oob_desc *, uint64_t idx); +void __ubsan_handle_pointer_overflow(struct ubsan_ptroverflow_desc *, uint64_t lhs, uint64_t rhs); +void __ubsan_handle_pointer_overflow_abort(struct ubsan_ptroverflow_desc *, uint64_t lhs, uint64_t rhs); void __ubsan_handle_shift_out_of_bounds(struct ubsan_shift_desc *, uint64_t lhs, uint64_t rhs); void __ubsan_handle_shift_out_of_bounds_abort(struct ubsan_shift_desc *, uint64_t lhs, uint64_t rhs); +void __ubsan_handle_sub_overflow(struct ubsan_overflow_desc *, uint64_t lhs, uint64_t rhs); +void __ubsan_handle_sub_overflow_abort(struct ubsan_overflow_desc *, uint64_t lhs, uint64_t rhs); void __ubsan_handle_type_mismatch_v1(struct ubsan_align_desc *, uint64_t val); void __ubsan_handle_type_mismatch_v1_abort(struct ubsan_align_desc *, uint64_t val); -void __ubsan_handle_pointer_overflow(struct ubsan_ptroverflow_desc *, uint64_t lhs, uint64_t rhs); -void __ubsan_handle_pointer_overflow_abort(struct ubsan_ptroverflow_desc *, uint64_t lhs, uint64_t rhs); -void __ubsan_handle_out_of_bounds(struct ubsan_oob_desc *, uint64_t idx); -void __ubsan_handle_out_of_bounds_abort(struct ubsan_oob_desc *, uint64_t idx); + +/* currently unimplemented */ +void __ubsan_handle_float_cast_overflow(struct san_src_loc *); +void __ubsan_handle_float_cast_overflow_abort(struct san_src_loc *); +void __ubsan_handle_function_type_mismatch(struct san_src_loc *); +void __ubsan_handle_function_type_mismatch_abort(struct san_src_loc *); +void __ubsan_handle_implicit_conversion(struct san_src_loc *); +void __ubsan_handle_implicit_conversion_abort(struct san_src_loc *); +void __ubsan_handle_invalid_builtin(struct san_src_loc *); +void __ubsan_handle_invalid_builtin_abort(struct san_src_loc *); +void __ubsan_handle_load_invalid_value(struct san_src_loc *); +void __ubsan_handle_load_invalid_value_abort(struct san_src_loc *); +void __ubsan_handle_missing_return(struct san_src_loc *); +void __ubsan_handle_missing_return_abort(struct san_src_loc *); +void __ubsan_handle_nonnull_arg(struct san_src_loc *); +void __ubsan_handle_nonnull_arg_abort(struct san_src_loc *); +void __ubsan_handle_nonnull_return(struct san_src_loc *); +void __ubsan_handle_nonnull_return_abort(struct san_src_loc *); +void __ubsan_handle_nullability_arg(struct san_src_loc *); +void __ubsan_handle_nullability_arg_abort(struct san_src_loc *); +void __ubsan_handle_nullability_return(struct san_src_loc *); +void __ubsan_handle_nullability_return_abort(struct san_src_loc *); +void __ubsan_handle_vla_bound_not_positive(struct san_src_loc *); +void __ubsan_handle_vla_bound_not_positive_abort(struct san_src_loc *); #endif /* _UBSAN_H_ */ diff --git a/san/ubsan_log.c b/san/ubsan_log.c index a02bf51df..d0a3fcc69 100644 --- a/san/ubsan_log.c +++ b/san/ubsan_log.c @@ -101,7 +101,7 @@ sysctl_ubsan_log_dump SYSCTL_HANDLER_ARGS if (!buf) { return 0; } - buf[0] = '\0'; + bzero(buf, sz); for (size_t i = start; i != end; i = next_entry(i)) { n += ubsan_format(&ubsan_log[i], buf + n, sz - n); diff --git a/security/Makefile b/security/Makefile index 1917d6e86..77eb0bf7c 100644 --- a/security/Makefile +++ b/security/Makefile @@ -6,7 +6,7 @@ export MakeInc_dir=${SRCROOT}/makedefs/MakeInc.dir include $(MakeInc_cmd) include $(MakeInc_def) -INCDIR=/usr/local/include +INCDIR=$(SDKHEADERSROOT)/usr/local/include # Installs header file for user level - # $(DSTROOT)/System/Library/Frameworks/System.framework/PrivateHeaders diff --git a/security/conf/Makefile b/security/conf/Makefile index 7bd79d9ae..05c4b79cf 100644 --- a/security/conf/Makefile +++ b/security/conf/Makefile @@ -23,7 +23,7 @@ endif $(TARGET)/$(CURRENT_KERNEL_CONFIG)/Makefile: $(SRCROOT)/SETUP/config/doconf $(OBJROOT)/SETUP/config $(DOCONFDEPS) $(_v)$(MKDIR) $(TARGET)/$(CURRENT_KERNEL_CONFIG) - $(_v)$(SRCROOT)/SETUP/config/doconf -c -cpu $(DOCONF_ARCH_CONFIG_LC) -soc $(CURRENT_MACHINE_CONFIG_LC) -d $(TARGET)/$(CURRENT_KERNEL_CONFIG) -s $(SOURCE) -m $(MASTERCONFDIR) $(CURRENT_KERNEL_CONFIG); + $(_v)$(SRCROOT)/SETUP/config/doconf -c -cpu $(DOCONF_ARCH_CONFIG_LC) -soc $(CURRENT_MACHINE_CONFIG_LC) -d $(TARGET)/$(CURRENT_KERNEL_CONFIG) -s $(SOURCE) -m $(MASTERCONFDIR) $(CURRENT_KERNEL_CONFIG) do_all: $(TARGET)/$(CURRENT_KERNEL_CONFIG)/Makefile $(_v)${MAKE} \ @@ -35,7 +35,7 @@ do_all: $(TARGET)/$(CURRENT_KERNEL_CONFIG)/Makefile SOURCE=$(subst conf/,,$(SOURCE)) \ TARGET=${TARGET} \ OBJPATH=${OBJPATH} \ - build_all; + build_all do_build_all:: do_all diff --git a/security/conf/Makefile.template b/security/conf/Makefile.template index 8330c0a5f..2d75b556c 100644 --- a/security/conf/Makefile.template +++ b/security/conf/Makefile.template @@ -75,9 +75,9 @@ $(SOBJS): .SFLAGS $(_v)$(REPLACECONTENTS) $@ $(S_KCC) $(SFLAGS) $(INCFLAGS) $(COMPONENT).filelist: $(OBJS) - @echo "$(ColorL)LDFILELIST$(Color0) $(ColorLF)$(COMPONENT)$(Color0)" + $(call makelog,$(ColorL)LDFILELIST$(Color0) $(ColorLF)$(COMPONENT)$(Color0)) $(_v)for obj in ${OBJS}; do \ - echo $(TARGET)/$(CURRENT_KERNEL_CONFIG)/$${obj}; \ + $(ECHO) $(TARGET)/$(CURRENT_KERNEL_CONFIG)/$${obj}; \ done > $(COMPONENT).filelist do_all: $(COMPONENT).filelist diff --git a/security/mac_audit.c b/security/mac_audit.c index 504cf4b9f..44b591fd4 100644 --- a/security/mac_audit.c +++ b/security/mac_audit.c @@ -236,12 +236,12 @@ mac_audit_text(char *text, mac_policy_handle_t handle) { char *sanitized; const char *name; - int i, size, plen, len; + size_t i, size, plen, text_len; name = mac_get_mpc(handle)->mpc_name; - len = strlen(text); + text_len = strlen(text); plen = 2 + strlen(name); - if (plen + len >= MAC_AUDIT_DATA_LIMIT) { + if (plen + text_len >= MAC_AUDIT_DATA_LIMIT) { return EINVAL; } @@ -249,18 +249,18 @@ mac_audit_text(char *text, mac_policy_handle_t handle) * Make sure the text is only composed of only ASCII printable * characters. */ - for (i = 0; i < len; i++) { + for (i = 0; i < text_len; i++) { if (text[i] < (char) 32 || text[i] > (char) 126) { return EINVAL; } } - size = len + plen + 1; + size = text_len + plen + 1; sanitized = (char *)zalloc(mac_audit_data_zone); strlcpy(sanitized, name, MAC_AUDIT_DATA_LIMIT); - strncat(sanitized, ": ", MAC_AUDIT_DATA_LIMIT - plen + 2); - strncat(sanitized, text, MAC_AUDIT_DATA_LIMIT - plen); + strlcat(sanitized, ": ", MAC_AUDIT_DATA_LIMIT); + strlcat(sanitized, text, MAC_AUDIT_DATA_LIMIT); return audit_mac_data(MAC_AUDIT_TEXT_TYPE, size, (u_char *)sanitized); } diff --git a/security/mac_base.c b/security/mac_base.c index c31e5fdc6..6a99e38fd 100644 --- a/security/mac_base.c +++ b/security/mac_base.c @@ -181,9 +181,20 @@ SYSCTL_UINT(_security_mac, OID_AUTO, label_mbufs, SECURITY_MAC_CTLFLAGS, * already has to deal with uninitialized labels, this probably won't * be a problem. */ +#if CONFIG_MACF_LAZY_VNODE_LABELS +unsigned int mac_label_vnodes = 1; +#else unsigned int mac_label_vnodes = 0; -SYSCTL_UINT(_security_mac, OID_AUTO, labelvnodes, SECURITY_MAC_CTLFLAGS, - &mac_label_vnodes, 0, "Label all vnodes"); +#endif /* CONFIG_MACF_LAZY_VNODE_LABELS */ +SYSCTL_UINT(_security_mac, OID_AUTO, labelvnodes, SECURITY_MAC_CTLFLAGS +#if CONFIG_MACF_LAZY_VNODE_LABELS + | CTLFLAG_RD +#endif + , &mac_label_vnodes, 0, "Label all vnodes"); + +unsigned int mac_vnode_label_count = 0; +SYSCTL_UINT(_security_mac, OID_AUTO, vnode_label_count, SECURITY_MAC_CTLFLAGS | CTLFLAG_RD, + &mac_vnode_label_count, 0, "Count of vnode labels"); unsigned int mac_device_enforce = 1; SYSCTL_UINT(_security_mac, OID_AUTO, device_enforce, SECURITY_MAC_CTLFLAGS, diff --git a/security/mac_framework.h b/security/mac_framework.h index b40928bb0..3f9b67198 100644 --- a/security/mac_framework.h +++ b/security/mac_framework.h @@ -266,6 +266,7 @@ int mac_mount_check_getattr(vfs_context_t ctx, struct mount *mp, int mac_mount_check_label_update(vfs_context_t ctx, struct mount *mp); int mac_mount_check_mount(vfs_context_t ctx, struct vnode *vp, struct componentname *cnp, const char *vfc_name); +int mac_mount_check_mount_late(vfs_context_t ctx, struct mount *mp); int mac_mount_check_snapshot_create(vfs_context_t ctx, struct mount *mp, const char *name); int mac_mount_check_snapshot_delete(vfs_context_t ctx, struct mount *mp, @@ -364,6 +365,7 @@ int mac_proc_check_setlcid(proc_t proc1, proc_t proc2, pid_t pid1, pid_t pid2); int mac_proc_check_signal(proc_t proc1, proc_t proc2, int signum); +int mac_proc_check_syscall_unix(proc_t proc, int scnum); int mac_proc_check_wait(proc_t proc1, proc_t proc2); void mac_proc_notify_exit(proc_t proc); int mac_setsockopt_label(kauth_cred_t cred, struct socket *so, @@ -411,7 +413,6 @@ int mac_system_check_acct(kauth_cred_t cred, struct vnode *vp); int mac_system_check_audit(kauth_cred_t cred, void *record, int length); int mac_system_check_auditctl(kauth_cred_t cred, struct vnode *vp); int mac_system_check_auditon(kauth_cred_t cred, int cmd); -int mac_system_check_chud(kauth_cred_t cred); int mac_system_check_host_priv(kauth_cred_t cred); int mac_system_check_info(kauth_cred_t, const char *info_type); int mac_system_check_nfsd(kauth_cred_t cred); @@ -563,6 +564,7 @@ int mac_vnode_label_externalize_audit(struct vnode *vp, struct mac *mac); void mac_vnode_label_free(struct label *label); void mac_vnode_label_init(struct vnode *vp); int mac_vnode_label_init_needed(struct vnode *vp); +struct label *mac_vnode_label_allocate(vnode_t vp); void mac_vnode_label_recycle(struct vnode *vp); void mac_vnode_label_update(vfs_context_t ctx, struct vnode *vp, struct label *newlabel); diff --git a/security/mac_internal.h b/security/mac_internal.h index ec457f0cb..503b2ea4b 100644 --- a/security/mac_internal.h +++ b/security/mac_internal.h @@ -169,6 +169,7 @@ extern unsigned int mac_label_mbufs; #endif extern unsigned int mac_label_vnodes; +extern unsigned int mac_vnode_label_count; static bool mac_proc_check_enforce(proc_t p); diff --git a/security/mac_policy.h b/security/mac_policy.h index 09a8bec12..9baaa2df9 100644 --- a/security/mac_policy.h +++ b/security/mac_policy.h @@ -1804,6 +1804,22 @@ typedef int mpo_mount_check_mount_t( struct componentname *cnp, const char *vfc_name ); +/** + * @brief Access control check for mounting a file system (late) + * @param cred Subject credential + * @param mp Mount point + * + * Similar to mpo_mount_check_mount, but occurs after VFS_MOUNT has been + * called, making it possible to access mnt_vfsstat.f_mntfromname and other + * fields. + * + * @return Return 0 if access is granted, otherwise an appropriate value for + * errno should be returned. + */ +typedef int mpo_mount_check_mount_late_t( + kauth_cred_t cred, + struct mount *mp + ); /** * @brief Access control check for fs_snapshot_create * @param cred Subject credential @@ -3073,6 +3089,24 @@ typedef int mpo_proc_check_signal_t( struct proc *proc, int signum ); +/** + * @brief Access control check for Unix syscalls. + * @param proc Subject process + * @param scnum Syscall number; see bsd/kern/syscalls.master. + * + * Determine whether the subject process can perform the passed syscall (number). + * + * @warning Programs typically expect to be able to make syscalls as part of + * their normal process lifecycle; caution should be exercised when restricting + * which syscalls a process can perform. + * + * @return Return 0 if access is granted, otherwise an appropriate value for + * errno should be returned. Suggested failure: EPERM for lack of privilege. + */ +typedef int mpo_proc_check_syscall_unix_t( + struct proc *proc, + int scnum + ); /** * @brief Access control check for wait * @param cred Subject credential @@ -3106,32 +3140,6 @@ typedef int mpo_proc_check_wait_t( typedef void mpo_proc_notify_exit_t( struct proc *proc ); -/** - * @brief Destroy process label - * @param label The label to be destroyed - * - * Destroy a process label. Since the object is going - * out of scope, policy modules should free any internal storage - * associated with the label so that it may be destroyed. - */ -typedef void mpo_proc_label_destroy_t( - struct label *label - ); -/** - * @brief Initialize process label - * @param label New label to initialize - * @see mpo_cred_label_init_t - * - * Initialize the label for a newly instantiated BSD process structure. - * Normally, security policies will store the process label in the user - * credential rather than here in the process structure. However, - * there are some floating label policies that may need to temporarily - * store a label in the process structure until it is safe to update - * the user credential label. Sleeping is permitted. - */ -typedef void mpo_proc_label_init_t( - struct label *label - ); /** * @brief Access control check for skywalk flow connect * @param cred Subject credential @@ -3836,20 +3844,6 @@ typedef int mpo_system_check_auditon_t( kauth_cred_t cred, int cmd ); -/** - * @brief Access control check for using CHUD facilities - * @param cred Subject credential - * - * Determine whether the subject identified by the credential can perform - * performance-related tasks using the CHUD system call. This interface is - * deprecated. - * - * @return Return 0 if access is granted, otherwise an appropriate value for - * errno should be returned. - */ -typedef int mpo_system_check_chud_t( - kauth_cred_t cred - ); /** * @brief Access control check for obtaining the host control port * @param cred Subject credential @@ -5859,11 +5853,12 @@ typedef int mpo_vnode_label_internalize_t( ); /** * @brief Clean up a vnode label - * @param label The label to be cleaned for re-use + * @param label The label to be cleaned or purged * * Clean up a vnode label. Darwin (Tiger, 8.x) allocates vnodes on demand, but * typically never frees them. Before vnodes are placed back on free lists for - * re-use, policies can cleanup or overwrite any information present in the label. + * re-use, policies can cleanup or overwrite any information present in the label, + * or free any internal resources used for the label. */ typedef void mpo_vnode_label_recycle_t( struct label *label @@ -6288,7 +6283,7 @@ typedef void mpo_reserved_hook_t(void); * Please note that this should be kept in sync with the check assumptions * policy in bsd/kern/policy_check.c (policy_ops struct). */ -#define MAC_POLICY_OPS_VERSION 55 /* inc when new reserved slots are taken */ +#define MAC_POLICY_OPS_VERSION 58 /* inc when new reserved slots are taken */ struct mac_policy_ops { mpo_audit_check_postselect_t *mpo_audit_check_postselect; mpo_audit_check_preselect_t *mpo_audit_check_preselect; @@ -6428,8 +6423,8 @@ struct mac_policy_ops { mpo_vnode_check_rename_t *mpo_vnode_check_rename; mpo_kext_check_query_t *mpo_kext_check_query; mpo_proc_notify_exec_complete_t *mpo_proc_notify_exec_complete; - mpo_reserved_hook_t *mpo_reserved5; - mpo_reserved_hook_t *mpo_reserved6; + mpo_reserved_hook_t *mpo_reserved4; + mpo_proc_check_syscall_unix_t *mpo_proc_check_syscall_unix; mpo_proc_check_expose_task_t *mpo_proc_check_expose_task; mpo_proc_check_set_host_special_port_t *mpo_proc_check_set_host_special_port; mpo_proc_check_set_host_exception_port_t *mpo_proc_check_set_host_exception_port; @@ -6441,9 +6436,9 @@ struct mac_policy_ops { mpo_exc_action_label_update_t *mpo_exc_action_label_update; mpo_vnode_check_trigger_resolve_t *mpo_vnode_check_trigger_resolve; + mpo_mount_check_mount_late_t *mpo_mount_check_mount_late; mpo_reserved_hook_t *mpo_reserved1; mpo_reserved_hook_t *mpo_reserved2; - mpo_reserved_hook_t *mpo_reserved3; mpo_skywalk_flow_check_connect_t *mpo_skywalk_flow_check_connect; mpo_skywalk_flow_check_listen_t *mpo_skywalk_flow_check_listen; @@ -6479,8 +6474,8 @@ struct mac_policy_ops { mpo_proc_check_setlcid_t *mpo_proc_check_setlcid; mpo_proc_check_signal_t *mpo_proc_check_signal; mpo_proc_check_wait_t *mpo_proc_check_wait; - mpo_proc_label_destroy_t *mpo_proc_label_destroy; - mpo_proc_label_init_t *mpo_proc_label_init; + mpo_reserved_hook_t *mpo_reserved5; + mpo_reserved_hook_t *mpo_reserved6; mpo_socket_check_accept_t *mpo_socket_check_accept; mpo_socket_check_accepted_t *mpo_socket_check_accepted; @@ -6630,7 +6625,7 @@ struct mac_policy_ops { mpo_iokit_check_set_properties_t *mpo_iokit_check_set_properties; - mpo_system_check_chud_t *mpo_system_check_chud; + mpo_reserved_hook_t *mpo_reserved3; mpo_vnode_check_searchfs_t *mpo_vnode_check_searchfs; @@ -6922,6 +6917,8 @@ int mac_file_removexattr(struct fileglob *fg, const char *name); */ intptr_t mac_label_get(struct label *l, int slot); void mac_label_set(struct label *l, int slot, intptr_t v); +intptr_t mac_vnode_label_get(struct vnode *vp, int slot, intptr_t sentinel); +void mac_vnode_label_set(struct vnode *vp, int slot, intptr_t v); #define mac_get_mpc(h) (mac_policy_list.entries[h].mpc) diff --git a/security/mac_process.c b/security/mac_process.c index 3552fe991..603b7499c 100644 --- a/security/mac_process.c +++ b/security/mac_process.c @@ -506,6 +506,26 @@ mac_proc_check_signal(proc_t curp, struct proc *proc, int signum) return error; } +int +mac_proc_check_syscall_unix(proc_t curp, int scnum) +{ + int error; + +#if SECURITY_MAC_CHECK_ENFORCE + /* 21167099 - only check if we allow write */ + if (!mac_proc_enforce) { + return 0; + } +#endif + if (!mac_proc_check_enforce(curp)) { + return 0; + } + + MAC_CHECK(proc_check_syscall_unix, curp, scnum); + + return error; +} + int mac_proc_check_wait(proc_t curp, struct proc *proc) { diff --git a/security/mac_system.c b/security/mac_system.c index a0c00105b..f34eee62e 100644 --- a/security/mac_system.c +++ b/security/mac_system.c @@ -87,23 +87,6 @@ mac_system_check_acct(kauth_cred_t cred, struct vnode *vp) return error; } -int -mac_system_check_chud(kauth_cred_t cred) -{ - int error; - -#if SECURITY_MAC_CHECK_ENFORCE - /* 21167099 - only check if we allow write */ - if (!mac_system_enforce) { - return 0; - } -#endif - - MAC_CHECK(system_check_chud, cred); - - return error; -} - int mac_system_check_host_priv(kauth_cred_t cred) { diff --git a/security/mac_vfs.c b/security/mac_vfs.c index 81b311012..95afa830b 100644 --- a/security/mac_vfs.c +++ b/security/mac_vfs.c @@ -63,6 +63,7 @@ */ #include +#include #include #include @@ -80,6 +81,8 @@ #include #include #include +#include + #include #include @@ -89,6 +92,32 @@ /* convert {R,W,X}_OK values to V{READ,WRITE,EXEC} */ #define ACCESS_MODE_TO_VNODE_MASK(m) (m << 6) + +/* + * Optional tracing of policy operations. Define VFS_TRACE_POLICY_OPS to trace the operations. + * + * Along with DBG_FSYSTEM and DBG_VFS, dcode in the macros below is used to construct + * KDBG_EVENTID(DBG_FSYSTEM, DBG_VFS, dcode) global event id, see bsd/sys/kdebug.h. + * Note that dcode is multiplied by 4 and ORed as part of the construction. See bsd/kern/trace_codes + * for list of system-wide {global event id, name} pairs. Currently DBG_VFS event ids are in range + * [0x3130000, 0x313016C]. + */ + +//#define VFS_TRACE_POLICY_OPS + +#ifdef VFS_TRACE_POLICY_OPS +#define DBG_VFS_CODE(dcode) FSDBG_CODE(DBG_VFS, dcode) +#define VFS_KERNEL_DEBUG_START0(dcode) KERNEL_DEBUG_CONSTANT(DBG_VFS_CODE(dcode) | DBG_FUNC_START, 0, 0, 0, 0, 0) +#define VFS_KERNEL_DEBUG_END0(dcode) KERNEL_DEBUG_CONSTANT(DBG_VFS_CODE(dcode) | DBG_FUNC_END, 0, 0, 0, 0, 0) +#define VFS_KERNEL_DEBUG_START1(dcode, darg) KERNEL_DEBUG_CONSTANT(DBG_VFS_CODE(dcode) | DBG_FUNC_START, darg, 0, 0, 0, 0) +#define VFS_KERNEL_DEBUG_END1(dcode, darg) KERNEL_DEBUG_CONSTANT(DBG_VFS_CODE(dcode) | DBG_FUNC_END, darg, 0, 0, 0, 0) +#else +#define VFS_KERNEL_DEBUG_START0(dcode) do {} while (0) +#define VFS_KERNEL_DEBUG_END0(dcode) do {} while (0) +#define VFS_KERNEL_DEBUG_START1(dcode, darg) do {} while (0) +#define VFS_KERNEL_DEBUG_END1(dcode, darg) do {} while (0) +#endif + static struct label * mac_devfsdirent_label_alloc(void) { @@ -98,7 +127,9 @@ mac_devfsdirent_label_alloc(void) if (label == NULL) { return NULL; } + VFS_KERNEL_DEBUG_START0(0); MAC_PERFORM(devfs_label_init, label); + VFS_KERNEL_DEBUG_END0(0); return label; } @@ -117,7 +148,9 @@ mac_mount_label_alloc(void) if (label == NULL) { return NULL; } + VFS_KERNEL_DEBUG_START0(1); MAC_PERFORM(mount_label_init, label); + VFS_KERNEL_DEBUG_END0(1); return label; } @@ -136,7 +169,10 @@ mac_vnode_label_alloc(void) if (label == NULL) { return NULL; } + VFS_KERNEL_DEBUG_START0(2); MAC_PERFORM(vnode_label_init, label); + VFS_KERNEL_DEBUG_END0(2); + OSIncrementAtomic(&mac_vnode_label_count); return label; } @@ -149,7 +185,21 @@ mac_vnode_label_init(vnode_t vp) int mac_vnode_label_init_needed(vnode_t vp) { +#if CONFIG_MACF_LAZY_VNODE_LABELS + (void)vp; + return false; +#else return mac_label_vnodes != 0 && vp->v_label == NULL; +#endif +} + +struct label * +mac_vnode_label_allocate(vnode_t vp) +{ + if (mac_vnode_label_init_needed(vp)) { + vp->v_label = mac_vnode_label_alloc(); + } + return vp->v_label; } /* @@ -161,12 +211,21 @@ void mac_vnode_label_recycle(vnode_t vp) { MAC_PERFORM(vnode_label_recycle, vp->v_label); +#if CONFIG_MACF_LAZY_VNODE_LABELS + if (vp->v_label) { + mac_vnode_label_destroy(vp); + vp->v_label = NULL; + vp->v_lflag &= ~VL_LABELED; + } +#endif } static void mac_devfs_label_free(struct label *label) { + VFS_KERNEL_DEBUG_START1(3, label); MAC_PERFORM(devfs_label_destroy, label); + VFS_KERNEL_DEBUG_END1(3, label); mac_labelzone_free(label); } @@ -182,7 +241,9 @@ mac_devfs_label_destroy(struct devnode *de) static void mac_mount_label_free(struct label *label) { + VFS_KERNEL_DEBUG_START1(4, label); MAC_PERFORM(mount_label_destroy, label); + VFS_KERNEL_DEBUG_END1(4, label); mac_labelzone_free(label); } @@ -198,11 +259,15 @@ mac_mount_label_destroy(struct mount *mp) void mac_vnode_label_free(struct label *label) { - MAC_PERFORM(vnode_label_destroy, label); - mac_labelzone_free(label); + if (label != NULL) { + VFS_KERNEL_DEBUG_START1(5, label); + MAC_PERFORM(vnode_label_destroy, label); + VFS_KERNEL_DEBUG_END1(5, label); + mac_labelzone_free(label); + OSDecrementAtomic(&mac_vnode_label_count); + } } -#ifndef __APPLE__ void mac_vnode_label_destroy(struct vnode *vp) { @@ -211,16 +276,17 @@ mac_vnode_label_destroy(struct vnode *vp) vp->v_label = NULL; } } -#endif void mac_vnode_label_copy(struct label *src, struct label *dest) { + VFS_KERNEL_DEBUG_START1(6, src); if (src == NULL) { MAC_PERFORM(vnode_label_init, dest); } else { MAC_PERFORM(vnode_label_copy, src, dest); } + VFS_KERNEL_DEBUG_END1(6, src); } int @@ -287,7 +353,9 @@ mac_devfs_label_copy(struct label *src, struct label *dest) } #endif + VFS_KERNEL_DEBUG_START1(7, src); MAC_PERFORM(devfs_label_copy, src, dest); + VFS_KERNEL_DEBUG_END1(7, src); } void @@ -301,8 +369,10 @@ mac_devfs_label_update(struct mount *mp, struct devnode *de, } #endif + VFS_KERNEL_DEBUG_START1(8, vp); MAC_PERFORM(devfs_label_update, mp, de, de->dn_label, vp, vp->v_label); + VFS_KERNEL_DEBUG_END1(8, vp); } int @@ -348,10 +418,12 @@ mac_vnode_label_associate_devfs(struct mount *mp, struct devnode *de, } #endif + VFS_KERNEL_DEBUG_START1(9, vp); MAC_PERFORM(vnode_label_associate_devfs, mp, mp ? mp->mnt_mntlabel : NULL, de, de->dn_label, vp, vp->v_label); + VFS_KERNEL_DEBUG_END1(9, vp); } int @@ -359,8 +431,10 @@ mac_vnode_label_associate_extattr(struct mount *mp, struct vnode *vp) { int error; + VFS_KERNEL_DEBUG_START1(10, vp); MAC_CHECK(vnode_label_associate_extattr, mp, mp->mnt_mntlabel, vp, vp->v_label); + VFS_KERNEL_DEBUG_END1(10, vp); return error; } @@ -378,8 +452,10 @@ mac_vnode_label_associate_singlelabel(struct mount *mp, struct vnode *vp) return; } + VFS_KERNEL_DEBUG_START1(11, vp); MAC_PERFORM(vnode_label_associate_singlelabel, mp, mp ? mp->mnt_mntlabel : NULL, vp, vp->v_label); + VFS_KERNEL_DEBUG_END1(11, vp); } int @@ -399,8 +475,10 @@ mac_vnode_notify_create(vfs_context_t ctx, struct mount *mp, if (!mac_cred_check_enforce(cred)) { return 0; } + VFS_KERNEL_DEBUG_START1(12, vp); MAC_CHECK(vnode_notify_create, cred, mp, mp->mnt_mntlabel, dvp, dvp->v_label, vp, vp->v_label, cnp); + VFS_KERNEL_DEBUG_END1(12, vp); return error; } @@ -421,8 +499,10 @@ mac_vnode_notify_rename(vfs_context_t ctx, struct vnode *vp, if (!mac_cred_check_enforce(cred)) { return; } + VFS_KERNEL_DEBUG_START1(13, vp); MAC_PERFORM(vnode_notify_rename, cred, vp, vp->v_label, dvp, dvp->v_label, cnp); + VFS_KERNEL_DEBUG_END1(13, vp); } void @@ -440,7 +520,9 @@ mac_vnode_notify_open(vfs_context_t ctx, struct vnode *vp, int acc_flags) if (!mac_cred_check_enforce(cred)) { return; } + VFS_KERNEL_DEBUG_START1(14, vp); MAC_PERFORM(vnode_notify_open, cred, vp, vp->v_label, acc_flags); + VFS_KERNEL_DEBUG_END1(14, vp); } void @@ -459,7 +541,9 @@ mac_vnode_notify_link(vfs_context_t ctx, struct vnode *vp, if (!mac_cred_check_enforce(cred)) { return; } + VFS_KERNEL_DEBUG_START1(15, vp); MAC_PERFORM(vnode_notify_link, cred, dvp, dvp->v_label, vp, vp->v_label, cnp); + VFS_KERNEL_DEBUG_END1(15, vp); } void @@ -477,7 +561,9 @@ mac_vnode_notify_deleteextattr(vfs_context_t ctx, struct vnode *vp, const char * if (!mac_cred_check_enforce(cred)) { return; } + VFS_KERNEL_DEBUG_START1(16, vp); MAC_PERFORM(vnode_notify_deleteextattr, cred, vp, vp->v_label, name); + VFS_KERNEL_DEBUG_END1(16, vp); } void @@ -495,7 +581,9 @@ mac_vnode_notify_setacl(vfs_context_t ctx, struct vnode *vp, struct kauth_acl *a if (!mac_cred_check_enforce(cred)) { return; } + VFS_KERNEL_DEBUG_START1(17, vp); MAC_PERFORM(vnode_notify_setacl, cred, vp, vp->v_label, acl); + VFS_KERNEL_DEBUG_END1(17, vp); } void @@ -513,7 +601,9 @@ mac_vnode_notify_setattrlist(vfs_context_t ctx, struct vnode *vp, struct attrlis if (!mac_cred_check_enforce(cred)) { return; } + VFS_KERNEL_DEBUG_START1(18, vp); MAC_PERFORM(vnode_notify_setattrlist, cred, vp, vp->v_label, alist); + VFS_KERNEL_DEBUG_END1(18, vp); } void @@ -531,7 +621,9 @@ mac_vnode_notify_setextattr(vfs_context_t ctx, struct vnode *vp, const char *nam if (!mac_cred_check_enforce(cred)) { return; } + VFS_KERNEL_DEBUG_START1(19, vp); MAC_PERFORM(vnode_notify_setextattr, cred, vp, vp->v_label, name, uio); + VFS_KERNEL_DEBUG_END1(19, vp); } void @@ -549,7 +641,9 @@ mac_vnode_notify_setflags(vfs_context_t ctx, struct vnode *vp, u_long flags) if (!mac_cred_check_enforce(cred)) { return; } + VFS_KERNEL_DEBUG_START1(20, vp); MAC_PERFORM(vnode_notify_setflags, cred, vp, vp->v_label, flags); + VFS_KERNEL_DEBUG_END1(20, vp); } void @@ -567,7 +661,9 @@ mac_vnode_notify_setmode(vfs_context_t ctx, struct vnode *vp, mode_t mode) if (!mac_cred_check_enforce(cred)) { return; } + VFS_KERNEL_DEBUG_START1(21, vp); MAC_PERFORM(vnode_notify_setmode, cred, vp, vp->v_label, mode); + VFS_KERNEL_DEBUG_END1(21, vp); } void @@ -585,7 +681,9 @@ mac_vnode_notify_setowner(vfs_context_t ctx, struct vnode *vp, uid_t uid, gid_t if (!mac_cred_check_enforce(cred)) { return; } + VFS_KERNEL_DEBUG_START1(22, vp); MAC_PERFORM(vnode_notify_setowner, cred, vp, vp->v_label, uid, gid); + VFS_KERNEL_DEBUG_END1(22, vp); } void @@ -603,7 +701,9 @@ mac_vnode_notify_setutimes(vfs_context_t ctx, struct vnode *vp, struct timespec if (!mac_cred_check_enforce(cred)) { return; } + VFS_KERNEL_DEBUG_START1(23, vp); MAC_PERFORM(vnode_notify_setutimes, cred, vp, vp->v_label, atime, mtime); + VFS_KERNEL_DEBUG_END1(23, vp); } void @@ -621,7 +721,9 @@ mac_vnode_notify_truncate(vfs_context_t ctx, kauth_cred_t file_cred, struct vnod if (!mac_cred_check_enforce(cred)) { return; } + VFS_KERNEL_DEBUG_START1(24, vp); MAC_PERFORM(vnode_notify_truncate, cred, file_cred, vp, vp->v_label); + VFS_KERNEL_DEBUG_END1(24, vp); } /* @@ -645,8 +747,10 @@ mac_vnode_label_update_extattr(struct mount *mp, struct vnode *vp, return; } + VFS_KERNEL_DEBUG_START1(25, vp); MAC_PERFORM(vnode_label_update_extattr, mp, mp->mnt_mntlabel, vp, vp->v_label, name); + VFS_KERNEL_DEBUG_END1(25, vp); if (error == 0) { return; } @@ -678,7 +782,9 @@ mac_vnode_label_store(vfs_context_t ctx, struct vnode *vp, if (!mac_cred_check_enforce(cred)) { return 0; } + VFS_KERNEL_DEBUG_START1(26, vp); MAC_CHECK(vnode_label_store, cred, vp, vp->v_label, intlabel); + VFS_KERNEL_DEBUG_END1(26, vp); return error; } @@ -710,6 +816,7 @@ mac_cred_label_update_execve(vfs_context_t ctx, kauth_cred_t new, struct vnode * * calling exec_spawnattr_getmacpolicyinfo() and before passing the * spawnattrlen as an argument to the hook. */ + VFS_KERNEL_DEBUG_START1(27, vp); { struct mac_policy_conf *mpc; u_int i; @@ -756,6 +863,7 @@ mac_cred_label_update_execve(vfs_context_t ctx, kauth_cred_t new, struct vnode * } } *labelupdateerror = error; + VFS_KERNEL_DEBUG_END1(27, vp); } int @@ -775,6 +883,7 @@ mac_cred_check_label_update_execve(vfs_context_t ctx, struct vnode *vp, off_t of cred = vfs_context_ucred(ctx); + VFS_KERNEL_DEBUG_START1(28, vp); /* * NB: Cannot use MAC_BOOLEAN macro because we need a sequence point after * calling exec_spawnattr_getmacpolicyinfo() and before passing the @@ -820,6 +929,7 @@ mac_cred_check_label_update_execve(vfs_context_t ctx, struct vnode *vp, off_t of mac_policy_list_unbusy(); } } + VFS_KERNEL_DEBUG_END1(28, vp); return result; } @@ -844,7 +954,9 @@ mac_vnode_check_access(vfs_context_t ctx, struct vnode *vp, } /* Convert {R,W,X}_OK values to V{READ,WRITE,EXEC} for entry points */ mask = ACCESS_MODE_TO_VNODE_MASK(acc_mode); + VFS_KERNEL_DEBUG_START1(29, vp); MAC_CHECK(vnode_check_access, cred, vp, vp->v_label, mask); + VFS_KERNEL_DEBUG_END1(29, vp); return error; } @@ -864,7 +976,9 @@ mac_vnode_check_chdir(vfs_context_t ctx, struct vnode *dvp) if (!mac_cred_check_enforce(cred)) { return 0; } + VFS_KERNEL_DEBUG_START1(30, dvp); MAC_CHECK(vnode_check_chdir, cred, dvp, dvp->v_label); + VFS_KERNEL_DEBUG_END1(30, dvp); return error; } @@ -885,7 +999,9 @@ mac_vnode_check_chroot(vfs_context_t ctx, struct vnode *dvp, if (!mac_cred_check_enforce(cred)) { return 0; } + VFS_KERNEL_DEBUG_START1(31, dvp); MAC_CHECK(vnode_check_chroot, cred, dvp, dvp->v_label, cnp); + VFS_KERNEL_DEBUG_END1(31, dvp); return error; } @@ -906,8 +1022,10 @@ mac_vnode_check_clone(vfs_context_t ctx, struct vnode *dvp, if (!mac_cred_check_enforce(cred)) { return 0; } + VFS_KERNEL_DEBUG_START1(32, dvp); MAC_CHECK(vnode_check_clone, cred, dvp, dvp->v_label, vp, vp->v_label, cnp); + VFS_KERNEL_DEBUG_END1(32, dvp); return error; } int @@ -927,7 +1045,9 @@ mac_vnode_check_create(vfs_context_t ctx, struct vnode *dvp, if (!mac_cred_check_enforce(cred)) { return 0; } + VFS_KERNEL_DEBUG_START1(33, dvp); MAC_CHECK(vnode_check_create, cred, dvp, dvp->v_label, cnp, vap); + VFS_KERNEL_DEBUG_END1(33, dvp); return error; } @@ -948,8 +1068,10 @@ mac_vnode_check_unlink(vfs_context_t ctx, struct vnode *dvp, struct vnode *vp, if (!mac_cred_check_enforce(cred)) { return 0; } + VFS_KERNEL_DEBUG_START1(34, dvp); MAC_CHECK(vnode_check_unlink, cred, dvp, dvp->v_label, vp, vp->v_label, cnp); + VFS_KERNEL_DEBUG_END1(34, dvp); return error; } #if 0 @@ -970,7 +1092,9 @@ mac_vnode_check_deleteacl(vfs_context_t ctx, struct vnode *vp, if (!mac_cred_check_enforce(cred)) { return 0; } + VFS_KERNEL_DEBUG_START1(35, dvp); MAC_CHECK(vnode_check_deleteacl, cred, vp, vp->v_label, type); + VFS_KERNEL_DEBUG_END1(35, dvp); return error; } #endif @@ -992,7 +1116,9 @@ mac_vnode_check_deleteextattr(vfs_context_t ctx, struct vnode *vp, if (!mac_cred_check_enforce(cred)) { return 0; } + VFS_KERNEL_DEBUG_START1(36, vp); MAC_CHECK(vnode_check_deleteextattr, cred, vp, vp->v_label, name); + VFS_KERNEL_DEBUG_END1(36, vp); return error; } int @@ -1012,8 +1138,10 @@ mac_vnode_check_exchangedata(vfs_context_t ctx, if (!mac_cred_check_enforce(cred)) { return 0; } + VFS_KERNEL_DEBUG_START1(37, v1); MAC_CHECK(vnode_check_exchangedata, cred, v1, v1->v_label, v2, v2->v_label); + VFS_KERNEL_DEBUG_END1(37, v1); return error; } @@ -1035,7 +1163,9 @@ mac_vnode_check_getacl(vfs_context_t ctx, struct vnode *vp, acl_type_t type) if (!mac_cred_check_enforce(cred)) { return 0; } + VFS_KERNEL_DEBUG_START1(38, vp); MAC_CHECK(vnode_check_getacl, cred, vp, vp->v_label, type); + VFS_KERNEL_DEBUG_END1(38, vp); return error; } #endif @@ -1057,7 +1187,9 @@ mac_vnode_check_getattr(vfs_context_t ctx, struct ucred *file_cred, if (!mac_cred_check_enforce(cred)) { return 0; } + VFS_KERNEL_DEBUG_START1(39, vp); MAC_CHECK(vnode_check_getattr, cred, file_cred, vp, vp->v_label, va); + VFS_KERNEL_DEBUG_END1(39, vp); return error; } @@ -1078,7 +1210,9 @@ mac_vnode_check_getattrlist(vfs_context_t ctx, struct vnode *vp, if (!mac_cred_check_enforce(cred)) { return 0; } + VFS_KERNEL_DEBUG_START1(40, vp); MAC_CHECK(vnode_check_getattrlist, cred, vp, vp->v_label, alist); + VFS_KERNEL_DEBUG_END1(40, vp); /* Falsify results instead of returning error? */ return error; @@ -1105,6 +1239,7 @@ mac_vnode_check_exec(vfs_context_t ctx, struct vnode *vp, * calling exec_spawnattr_getmacpolicyinfo() and before passing the * spawnattrlen as an argument to the hook. */ + VFS_KERNEL_DEBUG_START1(41, vp); { struct mac_policy_conf *mpc; u_int i; @@ -1153,6 +1288,7 @@ mac_vnode_check_exec(vfs_context_t ctx, struct vnode *vp, mac_policy_list_unbusy(); } } + VFS_KERNEL_DEBUG_END1(41, vp); return error; } @@ -1173,7 +1309,9 @@ mac_vnode_check_fsgetpath(vfs_context_t ctx, struct vnode *vp) if (!mac_cred_check_enforce(cred)) { return 0; } + VFS_KERNEL_DEBUG_START1(42, vp); MAC_CHECK(vnode_check_fsgetpath, cred, vp, vp->v_label); + VFS_KERNEL_DEBUG_END1(42, vp); return error; } @@ -1199,8 +1337,10 @@ mac_vnode_check_signature(struct vnode *vp, struct cs_blob *cs_blob, } #endif + VFS_KERNEL_DEBUG_START1(43, vp); MAC_CHECK(vnode_check_signature, vp, vp->v_label, cpu_type, cs_blob, cs_flags, signer_type, flags, &fatal_failure_desc, &fatal_failure_desc_len); + VFS_KERNEL_DEBUG_END1(43, vp); if (fatal_failure_desc_len) { // A fatal code signature validation failure occured, formulate a crash @@ -1305,7 +1445,9 @@ mac_vnode_check_getacl(vfs_context_t ctx, struct vnode *vp, acl_type_t type) if (!mac_cred_check_enforce(cred)) { return 0; } + VFS_KERNEL_DEBUG_START1(44, vp); MAC_CHECK(vnode_check_getacl, cred, vp, vp->v_label, type); + VFS_KERNEL_DEBUG_END1(44, vp); return error; } #endif @@ -1327,8 +1469,10 @@ mac_vnode_check_getextattr(vfs_context_t ctx, struct vnode *vp, if (!mac_cred_check_enforce(cred)) { return 0; } + VFS_KERNEL_DEBUG_START1(45, vp); MAC_CHECK(vnode_check_getextattr, cred, vp, vp->v_label, name, uio); + VFS_KERNEL_DEBUG_END1(45, vp); return error; } @@ -1348,7 +1492,9 @@ mac_vnode_check_ioctl(vfs_context_t ctx, struct vnode *vp, u_int cmd) if (!mac_cred_check_enforce(cred)) { return 0; } + VFS_KERNEL_DEBUG_START1(46, vp); MAC_CHECK(vnode_check_ioctl, cred, vp, vp->v_label, cmd); + VFS_KERNEL_DEBUG_END1(46, vp); return error; } @@ -1369,8 +1515,10 @@ mac_vnode_check_kqfilter(vfs_context_t ctx, kauth_cred_t file_cred, if (!mac_cred_check_enforce(cred)) { return 0; } + VFS_KERNEL_DEBUG_START1(47, vp); MAC_CHECK(vnode_check_kqfilter, cred, file_cred, kn, vp, vp->v_label); + VFS_KERNEL_DEBUG_END1(47, vp); return error; } @@ -1392,8 +1540,10 @@ mac_vnode_check_link(vfs_context_t ctx, struct vnode *dvp, if (!mac_cred_check_enforce(cred)) { return 0; } + VFS_KERNEL_DEBUG_START1(48, vp); MAC_CHECK(vnode_check_link, cred, dvp, dvp->v_label, vp, vp->v_label, cnp); + VFS_KERNEL_DEBUG_END1(48, vp); return error; } @@ -1413,7 +1563,9 @@ mac_vnode_check_listextattr(vfs_context_t ctx, struct vnode *vp) if (!mac_cred_check_enforce(cred)) { return 0; } + VFS_KERNEL_DEBUG_START1(49, vp); MAC_CHECK(vnode_check_listextattr, cred, vp, vp->v_label); + VFS_KERNEL_DEBUG_END1(49, vp); return error; } @@ -1434,7 +1586,9 @@ mac_vnode_check_lookup_preflight(vfs_context_t ctx, struct vnode *dvp, if (!mac_cred_check_enforce(cred)) { return 0; } + VFS_KERNEL_DEBUG_START1(50, dvp); MAC_CHECK(vnode_check_lookup_preflight, cred, dvp, dvp->v_label, path, pathlen); + VFS_KERNEL_DEBUG_END1(50, dvp); return error; } @@ -1455,7 +1609,9 @@ mac_vnode_check_lookup(vfs_context_t ctx, struct vnode *dvp, if (!mac_cred_check_enforce(cred)) { return 0; } + VFS_KERNEL_DEBUG_START1(51, dvp); MAC_CHECK(vnode_check_lookup, cred, dvp, dvp->v_label, cnp); + VFS_KERNEL_DEBUG_END1(51, dvp); return error; } @@ -1475,7 +1631,9 @@ mac_vnode_check_open(vfs_context_t ctx, struct vnode *vp, int acc_mode) if (!mac_cred_check_enforce(cred)) { return 0; } + VFS_KERNEL_DEBUG_START1(52, vp); MAC_CHECK(vnode_check_open, cred, vp, vp->v_label, acc_mode); + VFS_KERNEL_DEBUG_END1(52, vp); return error; } @@ -1496,8 +1654,10 @@ mac_vnode_check_read(vfs_context_t ctx, struct ucred *file_cred, if (!mac_cred_check_enforce(cred)) { return 0; } + VFS_KERNEL_DEBUG_START1(53, vp); MAC_CHECK(vnode_check_read, cred, file_cred, vp, vp->v_label); + VFS_KERNEL_DEBUG_END1(53, vp); return error; } @@ -1518,7 +1678,9 @@ mac_vnode_check_readdir(vfs_context_t ctx, struct vnode *dvp) if (!mac_cred_check_enforce(cred)) { return 0; } + VFS_KERNEL_DEBUG_START1(54, dvp); MAC_CHECK(vnode_check_readdir, cred, dvp, dvp->v_label); + VFS_KERNEL_DEBUG_END1(54, dvp); return error; } @@ -1538,7 +1700,9 @@ mac_vnode_check_readlink(vfs_context_t ctx, struct vnode *vp) if (!mac_cred_check_enforce(cred)) { return 0; } + VFS_KERNEL_DEBUG_START1(55, vp); MAC_CHECK(vnode_check_readlink, cred, vp, vp->v_label); + VFS_KERNEL_DEBUG_END1(55, vp); return error; } @@ -1559,7 +1723,9 @@ mac_vnode_check_label_update(vfs_context_t ctx, struct vnode *vp, if (!mac_cred_check_enforce(cred)) { return 0; } + VFS_KERNEL_DEBUG_START1(56, vp); MAC_CHECK(vnode_check_label_update, cred, vp, vp->v_label, newlabel); + VFS_KERNEL_DEBUG_END1(56, vp); return error; } @@ -1583,21 +1749,25 @@ mac_vnode_check_rename(vfs_context_t ctx, struct vnode *dvp, return 0; } + VFS_KERNEL_DEBUG_START1(57, vp); MAC_CHECK(vnode_check_rename_from, cred, dvp, dvp->v_label, vp, vp->v_label, cnp); if (error) { + VFS_KERNEL_DEBUG_END1(57, vp); return error; } MAC_CHECK(vnode_check_rename_to, cred, tdvp, tdvp->v_label, tvp, tvp != NULL ? tvp->v_label : NULL, dvp == tdvp, tcnp); if (error) { + VFS_KERNEL_DEBUG_END1(57, vp); return error; } MAC_CHECK(vnode_check_rename, cred, dvp, dvp->v_label, vp, vp->v_label, cnp, tdvp, tdvp->v_label, tvp, tvp != NULL ? tvp->v_label : NULL, tcnp); + VFS_KERNEL_DEBUG_END1(57, vp); return error; } @@ -1617,7 +1787,9 @@ mac_vnode_check_revoke(vfs_context_t ctx, struct vnode *vp) if (!mac_cred_check_enforce(cred)) { return 0; } + VFS_KERNEL_DEBUG_START1(58, vp); MAC_CHECK(vnode_check_revoke, cred, vp, vp->v_label); + VFS_KERNEL_DEBUG_END1(58, vp); return error; } @@ -1637,7 +1809,9 @@ mac_vnode_check_searchfs(vfs_context_t ctx, struct vnode *vp, struct attrlist *a if (!mac_cred_check_enforce(cred)) { return 0; } + VFS_KERNEL_DEBUG_START1(59, vp); MAC_CHECK(vnode_check_searchfs, cred, vp, vp->v_label, alist); + VFS_KERNEL_DEBUG_END1(59, vp); return error; } @@ -1657,7 +1831,9 @@ mac_vnode_check_select(vfs_context_t ctx, struct vnode *vp, int which) if (!mac_cred_check_enforce(cred)) { return 0; } + VFS_KERNEL_DEBUG_START1(60, vp); MAC_CHECK(vnode_check_select, cred, vp, vp->v_label, which); + VFS_KERNEL_DEBUG_END1(60, vp); return error; } @@ -1678,7 +1854,9 @@ mac_vnode_check_setacl(vfs_context_t ctx, struct vnode *vp, if (!mac_cred_check_enforce(cred)) { return 0; } + VFS_KERNEL_DEBUG_START1(61, vp); MAC_CHECK(vnode_check_setacl, cred, vp, vp->v_label, acl); + VFS_KERNEL_DEBUG_END1(61, vp); return error; } @@ -1699,7 +1877,9 @@ mac_vnode_check_setattrlist(vfs_context_t ctx, struct vnode *vp, if (!mac_cred_check_enforce(cred)) { return 0; } + VFS_KERNEL_DEBUG_START1(62, vp); MAC_CHECK(vnode_check_setattrlist, cred, vp, vp->v_label, alist); + VFS_KERNEL_DEBUG_END1(62, vp); return error; } @@ -1720,8 +1900,10 @@ mac_vnode_check_setextattr(vfs_context_t ctx, struct vnode *vp, if (!mac_cred_check_enforce(cred)) { return 0; } + VFS_KERNEL_DEBUG_START1(63, vp); MAC_CHECK(vnode_check_setextattr, cred, vp, vp->v_label, name, uio); + VFS_KERNEL_DEBUG_END1(63, vp); return error; } @@ -1741,7 +1923,9 @@ mac_vnode_check_setflags(vfs_context_t ctx, struct vnode *vp, u_long flags) if (!mac_cred_check_enforce(cred)) { return 0; } + VFS_KERNEL_DEBUG_START1(64, vp); MAC_CHECK(vnode_check_setflags, cred, vp, vp->v_label, flags); + VFS_KERNEL_DEBUG_END1(64, vp); return error; } @@ -1761,7 +1945,9 @@ mac_vnode_check_setmode(vfs_context_t ctx, struct vnode *vp, mode_t mode) if (!mac_cred_check_enforce(cred)) { return 0; } + VFS_KERNEL_DEBUG_START1(65, vp); MAC_CHECK(vnode_check_setmode, cred, vp, vp->v_label, mode); + VFS_KERNEL_DEBUG_END1(65, vp); return error; } @@ -1782,7 +1968,9 @@ mac_vnode_check_setowner(vfs_context_t ctx, struct vnode *vp, uid_t uid, if (!mac_cred_check_enforce(cred)) { return 0; } + VFS_KERNEL_DEBUG_START1(66, vp); MAC_CHECK(vnode_check_setowner, cred, vp, vp->v_label, uid, gid); + VFS_KERNEL_DEBUG_END1(66, vp); return error; } @@ -1803,8 +1991,10 @@ mac_vnode_check_setutimes(vfs_context_t ctx, struct vnode *vp, if (!mac_cred_check_enforce(cred)) { return 0; } + VFS_KERNEL_DEBUG_START1(67, vp); MAC_CHECK(vnode_check_setutimes, cred, vp, vp->v_label, atime, mtime); + VFS_KERNEL_DEBUG_END1(67, vp); return error; } @@ -1825,8 +2015,10 @@ mac_vnode_check_stat(vfs_context_t ctx, struct ucred *file_cred, if (!mac_cred_check_enforce(cred)) { return 0; } + VFS_KERNEL_DEBUG_START1(68, vp); MAC_CHECK(vnode_check_stat, cred, file_cred, vp, vp->v_label); + VFS_KERNEL_DEBUG_END1(68, vp); return error; } @@ -1847,7 +2039,9 @@ mac_vnode_check_trigger_resolve(vfs_context_t ctx, struct vnode *dvp, if (!mac_cred_check_enforce(cred)) { return 0; } + VFS_KERNEL_DEBUG_START1(69, dvp); MAC_CHECK(vnode_check_trigger_resolve, cred, dvp, dvp->v_label, cnp); + VFS_KERNEL_DEBUG_END1(69, dvp); return error; } @@ -1868,8 +2062,10 @@ mac_vnode_check_truncate(vfs_context_t ctx, struct ucred *file_cred, if (!mac_cred_check_enforce(cred)) { return 0; } + VFS_KERNEL_DEBUG_START1(70, vp); MAC_CHECK(vnode_check_truncate, cred, file_cred, vp, vp->v_label); + VFS_KERNEL_DEBUG_END1(70, vp); return error; } @@ -1891,7 +2087,9 @@ mac_vnode_check_write(vfs_context_t ctx, struct ucred *file_cred, if (!mac_cred_check_enforce(cred)) { return 0; } + VFS_KERNEL_DEBUG_START1(71, vp); MAC_CHECK(vnode_check_write, cred, file_cred, vp, vp->v_label); + VFS_KERNEL_DEBUG_END1(71, vp); return error; } @@ -1913,7 +2111,9 @@ mac_vnode_check_uipc_bind(vfs_context_t ctx, struct vnode *dvp, if (!mac_cred_check_enforce(cred)) { return 0; } + VFS_KERNEL_DEBUG_START1(72, dvp); MAC_CHECK(vnode_check_uipc_bind, cred, dvp, dvp->v_label, cnp, vap); + VFS_KERNEL_DEBUG_END1(72, dvp); return error; } @@ -1933,7 +2133,9 @@ mac_vnode_check_uipc_connect(vfs_context_t ctx, struct vnode *vp, struct socket if (!mac_cred_check_enforce(cred)) { return 0; } + VFS_KERNEL_DEBUG_START1(73, vp); MAC_CHECK(vnode_check_uipc_connect, cred, vp, vp->v_label, (socket_t) so); + VFS_KERNEL_DEBUG_END1(73, vp); return error; } @@ -1955,7 +2157,9 @@ mac_vnode_label_update(vfs_context_t ctx, struct vnode *vp, struct label *newlab tmpl = NULL; } + VFS_KERNEL_DEBUG_START1(74, vp); MAC_PERFORM(vnode_label_update, cred, vp, vp->v_label, newlabel); + VFS_KERNEL_DEBUG_END1(74, vp); vnode_unlock(vp); if (tmpl != NULL) { @@ -1975,7 +2179,9 @@ mac_vnode_find_sigs(struct proc *p, struct vnode *vp, off_t offset) } #endif + VFS_KERNEL_DEBUG_START1(75, vp); MAC_CHECK(vnode_find_sigs, p, vp, offset, vp->v_label); + VFS_KERNEL_DEBUG_END1(75, vp); return error; } @@ -2026,7 +2232,9 @@ mac_mount_label_associate(vfs_context_t ctx, struct mount *mp) } } + VFS_KERNEL_DEBUG_START1(76, mp); MAC_PERFORM(mount_label_associate, cred, mp, mp->mnt_mntlabel); + VFS_KERNEL_DEBUG_END1(76, mp); #if DEBUG printf("MAC Framework enabling %s support: %s -> %s (%s)\n", mp->mnt_flag & MNT_MULTILABEL ? "multilabel" : "singlelabel", @@ -2053,7 +2261,32 @@ mac_mount_check_mount(vfs_context_t ctx, struct vnode *vp, if (!mac_cred_check_enforce(cred)) { return 0; } + VFS_KERNEL_DEBUG_START1(77, vp); MAC_CHECK(mount_check_mount, cred, vp, vp->v_label, cnp, vfc_name); + VFS_KERNEL_DEBUG_END1(77, vp); + + return error; +} + +int +mac_mount_check_mount_late(vfs_context_t ctx, struct mount *mp) +{ + kauth_cred_t cred; + int error; + +#if SECURITY_MAC_CHECK_ENFORCE + /* 21167099 - only check if we allow write */ + if (!mac_vnode_enforce) { + return 0; + } +#endif + cred = vfs_context_ucred(ctx); + if (!mac_cred_check_enforce(cred)) { + return 0; + } + VFS_KERNEL_DEBUG_START1(78, mp); + MAC_CHECK(mount_check_mount_late, cred, mp); + VFS_KERNEL_DEBUG_END1(78, mp); return error; } @@ -2075,7 +2308,9 @@ mac_mount_check_snapshot_create(vfs_context_t ctx, struct mount *mp, if (!mac_cred_check_enforce(cred)) { return 0; } + VFS_KERNEL_DEBUG_START1(79, mp); MAC_CHECK(mount_check_snapshot_create, cred, mp, name); + VFS_KERNEL_DEBUG_END1(79, mp); return error; } @@ -2096,7 +2331,9 @@ mac_mount_check_snapshot_delete(vfs_context_t ctx, struct mount *mp, if (!mac_cred_check_enforce(cred)) { return 0; } + VFS_KERNEL_DEBUG_START1(80, mp); MAC_CHECK(mount_check_snapshot_delete, cred, mp, name); + VFS_KERNEL_DEBUG_END1(80, mp); return error; } @@ -2117,7 +2354,9 @@ mac_mount_check_snapshot_revert(vfs_context_t ctx, struct mount *mp, if (!mac_cred_check_enforce(cred)) { return 0; } + VFS_KERNEL_DEBUG_START1(81, mp); MAC_CHECK(mount_check_snapshot_revert, cred, mp, name); + VFS_KERNEL_DEBUG_END1(81, mp); return error; } @@ -2137,7 +2376,9 @@ mac_mount_check_remount(vfs_context_t ctx, struct mount *mp) if (!mac_cred_check_enforce(cred)) { return 0; } + VFS_KERNEL_DEBUG_START1(82, mp); MAC_CHECK(mount_check_remount, cred, mp, mp->mnt_mntlabel); + VFS_KERNEL_DEBUG_END1(82, mp); return error; } @@ -2158,7 +2399,9 @@ mac_mount_check_umount(vfs_context_t ctx, struct mount *mp) if (!mac_cred_check_enforce(cred)) { return 0; } + VFS_KERNEL_DEBUG_START1(83, mp); MAC_CHECK(mount_check_umount, cred, mp, mp->mnt_mntlabel); + VFS_KERNEL_DEBUG_END1(83, mp); return error; } @@ -2180,7 +2423,9 @@ mac_mount_check_getattr(vfs_context_t ctx, struct mount *mp, if (!mac_cred_check_enforce(cred)) { return 0; } + VFS_KERNEL_DEBUG_START1(84, mp); MAC_CHECK(mount_check_getattr, cred, mp, mp->mnt_mntlabel, vfa); + VFS_KERNEL_DEBUG_END1(84, mp); return error; } @@ -2201,7 +2446,9 @@ mac_mount_check_setattr(vfs_context_t ctx, struct mount *mp, if (!mac_cred_check_enforce(cred)) { return 0; } + VFS_KERNEL_DEBUG_START1(85, mp); MAC_CHECK(mount_check_setattr, cred, mp, mp->mnt_mntlabel, vfa); + VFS_KERNEL_DEBUG_END1(85, mp); return error; } @@ -2221,7 +2468,9 @@ mac_mount_check_stat(vfs_context_t ctx, struct mount *mount) if (!mac_cred_check_enforce(cred)) { return 0; } + VFS_KERNEL_DEBUG_START1(86, mount); MAC_CHECK(mount_check_stat, cred, mount, mount->mnt_mntlabel); + VFS_KERNEL_DEBUG_END1(86, mount); return error; } @@ -2242,7 +2491,9 @@ mac_mount_check_label_update(vfs_context_t ctx, struct mount *mount) if (!mac_cred_check_enforce(cred)) { return 0; } + VFS_KERNEL_DEBUG_START1(87, mount); MAC_CHECK(mount_check_label_update, cred, mount, mount->mnt_mntlabel); + VFS_KERNEL_DEBUG_END1(87, mount); return error; } @@ -2263,7 +2514,9 @@ mac_mount_check_fsctl(vfs_context_t ctx, struct mount *mp, u_int cmd) if (!mac_cred_check_enforce(cred)) { return 0; } + VFS_KERNEL_DEBUG_START1(88, mp); MAC_CHECK(mount_check_fsctl, cred, mp, mp->mnt_mntlabel, cmd); + VFS_KERNEL_DEBUG_END1(88, mp); return error; } @@ -2279,8 +2532,10 @@ mac_devfs_label_associate_device(dev_t dev, struct devnode *de, } #endif + VFS_KERNEL_DEBUG_START1(89, de); MAC_PERFORM(devfs_label_associate_device, dev, de, de->dn_label, fullpath); + VFS_KERNEL_DEBUG_END1(89, de); } void @@ -2294,8 +2549,10 @@ mac_devfs_label_associate_directory(const char *dirname, int dirnamelen, } #endif + VFS_KERNEL_DEBUG_START1(90, de); MAC_PERFORM(devfs_label_associate_directory, dirname, dirnamelen, de, de->dn_label, fullpath); + VFS_KERNEL_DEBUG_END1(90, de); } int @@ -2369,18 +2626,21 @@ mac_vnode_label_associate_fdesc(struct mount *mp, struct fdescnode *fnp, error = 0; + VFS_KERNEL_DEBUG_START1(91, vp); /* * If no backing file, let the policy choose which label to use. */ if (fnp->fd_fd == -1) { MAC_PERFORM(vnode_label_associate_file, vfs_context_ucred(ctx), mp, mp->mnt_mntlabel, NULL, NULL, vp, vp->v_label); + VFS_KERNEL_DEBUG_END1(91, vp); return 0; } p = vfs_context_proc(ctx); error = fp_lookup(p, fnp->fd_fd, &fp, 0); if (error) { + VFS_KERNEL_DEBUG_END1(91, vp); return error; } @@ -2395,7 +2655,12 @@ mac_vnode_label_associate_fdesc(struct mount *mp, struct fdescnode *fnp, if ((error = vnode_getwithref(fvp))) { goto out; } - MAC_PERFORM(vnode_label_copy, fvp->v_label, vp->v_label); + if (fvp->v_label != NULL) { + if (mac_label_vnodes != 0 && vp->v_label == NULL) { + mac_vnode_label_init(vp); /* init dst label */ + } + MAC_PERFORM(vnode_label_copy, fvp->v_label, vp->v_label); + } (void)vnode_put(fvp); break; #if CONFIG_MACF_SOCKET_SUBSET @@ -2437,6 +2702,34 @@ mac_vnode_label_associate_fdesc(struct mount *mp, struct fdescnode *fnp, break; } out: + VFS_KERNEL_DEBUG_END1(91, vp); fp_drop(p, fnp->fd_fd, fp, 0); return error; } + +intptr_t +mac_vnode_label_get(struct vnode *vp, int slot, intptr_t sentinel) +{ + struct label *l; + + KASSERT(vp != NULL, ("mac_vnode_label_get: NULL vnode")); + l = vp->v_label; + if (l != NULL) { + return mac_label_get(l, slot); + } else { + return sentinel; + } +} + +void +mac_vnode_label_set(struct vnode *vp, int slot, intptr_t v) +{ + struct label *l; + KASSERT(vp != NULL, ("mac_vnode_label_set: NULL vnode")); + l = vp->v_label; + if (l == NULL) { + mac_vnode_label_init(vp); + l = vp->v_label; + } + mac_label_set(l, slot, v); +} diff --git a/security/mac_vfs_subr.c b/security/mac_vfs_subr.c index 6f4c096c7..3fb5132d5 100644 --- a/security/mac_vfs_subr.c +++ b/security/mac_vfs_subr.c @@ -46,12 +46,17 @@ vnode_label(struct mount *mp, struct vnode *dvp, struct vnode *vp, struct componentname *cnp, int flags, vfs_context_t ctx) { int error = 0; - + bool exit_fast; /* fast path checks... */ /* are we labeling vnodes? If not still notify of create */ - if (mac_label_vnodes == 0) { +#if CONFIG_MACF_LAZY_VNODE_LABELS + exit_fast = true; +#else + exit_fast = (mac_label_vnodes == 0); +#endif + if (exit_fast) { if (flags & VNODE_LABEL_CREATE) { error = mac_vnode_notify_create(ctx, mp, dvp, vp, cnp); diff --git a/tests/Makefile b/tests/Makefile index 5f165b8b7..78b2cfa4f 100644 --- a/tests/Makefile +++ b/tests/Makefile @@ -11,6 +11,7 @@ ENABLE_LTE_TESTS=YES OTHER_LTE_INCLUDE_FILES += \ /System/Library/PrivateFrameworks/LoggingSupport.framework, \ /System/Library/PrivateFrameworks/MobileKeyBag.framework, \ + /System/Library/Frameworks/IOSurface.framework, \ /usr/local/lib/libdarwintest_utils.dylib, \ /usr/lib/libapple_crypto.dylib, @@ -24,7 +25,7 @@ include $(DEVELOPER_DIR)/AppleInternal/Makefiles/darwintest/Makefile.common OTHER_CFLAGS = -Weverything -Wno-gnu-union-cast -Wno-missing-field-initializers -Wno-partial-availability OTHER_CFLAGS += -Wno-missing-noreturn -Wno-vla -Wno-reserved-id-macro -Wno-documentation-unknown-command OTHER_CFLAGS += -Wno-padded -Wno-used-but-marked-unused -Wno-covered-switch-default -Wno-nullability-extension -OTHER_CFLAGS += -Wno-gnu-empty-initializer -Wno-unused-macros -Wno-undef +OTHER_CFLAGS += -Wno-gnu-empty-initializer -Wno-unused-macros -Wno-undef -Wno-fixed-enum-extension OTHER_CFLAGS += --std=gnu11 -isystem $(SDKROOT)/System/Library/Frameworks/System.framework/PrivateHeaders OTHER_CFLAGS += -UT_NAMESPACE_PREFIX -DT_NAMESPACE_PREFIX=xnu OTHER_CFLAGS += -F $(SDKROOT)/System/Library/PrivateFrameworks @@ -56,10 +57,24 @@ backtracing: OTHER_LDFLAGS += -framework CoreSymbolication data_protection: OTHER_LDFLAGS += -ldarwintest_utils -framework IOKit +immovable_send: excserver +immovable_send: OTHER_CFLAGS += $(OBJROOT)/excserver.c -I $(OBJROOT) +immovable_send: OTHER_LDFLAGS += -ldarwintest_utils -lpthread -framework IOKit + +CUSTOM_TARGETS += immovable_send_client +immovable_send: immovable_send_client + +immovable_send_client: immovable_send_client.c + $(CC) $(DT_CFLAGS) -I $(OBJROOT) $(CFLAGS) $(DT_LDFLAGS) $(OTHER_LDFLAGS) $(LDFLAGS) immovable_send_client.c -o $(SYMROOT)/immovable_send_client + +install-immovable_send_client: immovable_send_client + mkdir -p $(INSTALLDIR) + cp $(SYMROOT)/immovable_send_client $(INSTALLDIR)/ + kdebug: INVALID_ARCHS = i386 kdebug: OTHER_LDFLAGS = -framework ktrace -ldarwintest_utils -framework kperf -EXCLUDED_SOURCES += drop_priv.c kperf_helpers.c xnu_quick_test_helpers.c +EXCLUDED_SOURCES += drop_priv.c kperf_helpers.c xnu_quick_test_helpers.c memorystatus_assertion_helpers.c ifneq ($(PLATFORM),iPhoneOS) EXCLUDED_SOURCES += jumbo_va_spaces_28530648.c perf_compressor.c memorystatus_freeze_test.c @@ -68,11 +83,19 @@ endif perf_compressor: OTHER_LDFLAGS += -ldarwintest_utils perf_compressor: CODE_SIGN_ENTITLEMENTS=./private_entitlement.plist +memorystatus_freeze_test: CODE_SIGN_ENTITLEMENTS=./task_for_pid_entitlement.plist memorystatus_freeze_test: OTHER_LDFLAGS += -ldarwintest_utils +memorystatus_freeze_test: OTHER_CFLAGS += -ldarwintest_utils memorystatus_assertion_helpers.c -stackshot: OTHER_CFLAGS += -Wno-objc-messaging-id -stackshot: OTHER_LDFLAGS += -lkdd -framework Foundation -stackshot: INVALID_ARCHS = i386 +memorystatus_is_assertion: OTHER_LDFLAGS += -ldarwintest_utils +memorystatus_is_assertion: OTHER_CFLAGS += memorystatus_assertion_helpers.c + +shared_cache_tests: INVALID_ARCHS = i386 +shared_cache_tests: OTHER_LDFLAGS += -ldarwintest_utils + +stackshot_tests: OTHER_CFLAGS += -Wno-objc-messaging-id +stackshot_tests: OTHER_LDFLAGS += -lkdd -ldarwintest_utils -framework Foundation +stackshot_tests: INVALID_ARCHS = i386 telemetry: OTHER_LDFLAGS = -framework ktrace -framework CoreFoundation telemetry: INVALID_ARCHS = i386 @@ -106,11 +129,45 @@ perf_exit: OTHER_LDFLAGS = -framework ktrace -ldarwintest_utils perf_exit: INVALID_ARCHS = i386 perf_exit: CODE_SIGN_ENTITLEMENTS=./private_entitlement.plist +CUSTOM_TARGETS += prioritize_process_launch_helper +prioritize_process_launch: prioritize_process_launch_helper + +prioritize_process_launch_helper: prioritize_process_launch_helper.c + $(CC) $(DT_CFLAGS) $(OTHER_CFLAGS) $(CFLAGS) $(DT_LDFLAGS) $(OTHER_LDFLAGS) $(LDFLAGS) prioritize_process_launch_helper.c -o $(SYMROOT)/prioritize_process_launch_helper + echo $(CODESIGN) --force --sign - --timestamp=none $(SYMROOT)/$@; \ + env CODESIGN_ALLOCATE=$(CODESIGN_ALLOCATE) $(CODESIGN) --force --sign - --timestamp=none $(SYMROOT)/$@; + +install-prioritize_process_launch_helper: prioritize_process_launch_helper + mkdir -p $(INSTALLDIR) + cp $(SYMROOT)/prioritize_process_launch_helper $(INSTALLDIR)/ + + perf_spawn_fork: CODE_SIGN_ENTITLEMENTS=./private_entitlement.plist +mach_exception_reply: OTHER_CFLAGS += -Wno-cast-align + os_thread_self_restrict: os_thread_self_restrict.c os_thread_self_restrict-entitlements.plist os_thread_self_restrict: CODE_SIGN_ENTITLEMENTS=os_thread_self_restrict-entitlements.plist +osptr: OTHER_CXXFLAGS += -I$(SRCROOT)/../libkern -std=c++98 +osptr: OTHER_CXXFLAGS += osptr_helper.cpp + +osptr_dumb: OTHER_CXXFLAGS += -I$(SRCROOT)/../libkern -std=c++17 + +osptr_11: OTHER_CXXFLAGS += -I$(SRCROOT)/../libkern -std=c++11 +osptr_11: OTHER_CXXFLAGS += osptr_helper.cpp +osptr_11: osptr.cpp + $(CXX) $(DT_CXXFLAGS) $(OTHER_CXXFLAGS) $(CXXFLAGS) $(DT_LDFLAGS) $(OTHER_LDFLAGS) $(LDFLAGS) $< -o $(SYMROOT)/$@ + +osptr_17: OTHER_CXXFLAGS += -I$(SRCROOT)/../libkern -std=c++17 +osptr_17: OTHER_CXXFLAGS += osptr_helper.cpp +osptr_17: osptr.cpp + $(CXX) $(DT_CXXFLAGS) $(OTHER_CXXFLAGS) $(CXXFLAGS) $(DT_LDFLAGS) $(OTHER_LDFLAGS) $(LDFLAGS) $< -o $(SYMROOT)/$@ + +EXCLUDED_SOURCES += osptr_helper.cpp + +os_refcnt: OTHER_CFLAGS += -I$(SRCROOT)/../libkern/ -Wno-gcc-compat -Wno-undef -O3 -flto + task_inspect: CODE_SIGN_ENTITLEMENTS = task_inspect.entitlements task_inspect: OTHER_CFLAGS += -DENTITLED=1 @@ -126,8 +183,6 @@ install-perf_exit_proc: perf_exit_proc mkdir -p $(INSTALLDIR) cp $(SYMROOT)/perf_exit_proc $(INSTALLDIR)/ -perf_kdebug: INVALID_ARCHS = i386 - stackshot_idle_25570396: INVALID_ARCHS = i386 stackshot_idle_25570396: OTHER_LDFLAGS += -lkdd -framework Foundation @@ -160,7 +215,7 @@ install-vm_set_max_addr_helper: vm_set_max_addr_helper cp $(SYMROOT)/vm_set_max_addr_helper $(INSTALLDIR)/ ifeq ($(PLATFORM),iPhoneOS) -OTHER_TEST_TARGETS += jumbo_va_spaces_28530648_unentitled +OTHER_TEST_TARGETS += jumbo_va_spaces_28530648_unentitled vm_phys_footprint_legacy jumbo_va_spaces_28530648: CODE_SIGN_ENTITLEMENTS = jumbo_va_spaces_28530648.entitlements jumbo_va_spaces_28530648: OTHER_CFLAGS += -DENTITLED=1 jumbo_va_spaces_28530648: OTHER_LDFLAGS += -ldarwintest_utils @@ -168,6 +223,13 @@ jumbo_va_spaces_28530648: OTHER_LDFLAGS += -ldarwintest_utils jumbo_va_spaces_28530648_unentitled: OTHER_LDFLAGS += -ldarwintest_utils jumbo_va_spaces_28530648_unentitled: jumbo_va_spaces_28530648.c $(CC) $(DT_CFLAGS) $(OTHER_CFLAGS) $(CFLAGS) $(DT_LDFLAGS) $(OTHER_LDFLAGS) $(LDFLAGS) $< -o $(SYMROOT)/$@ + +vm_phys_footprint_legacy: OTHER_LDFLAGS += -framework CoreFoundation -framework IOSurface +vm_phys_footprint_legacy: OTHER_CFLAGS += -DLEGACY_FOOTPRINT_ENTITLED=1 +vm_phys_footprint_legacy: CODE_SIGN_ENTITLEMENTS=./legacy_footprint.entitlement +vm_phys_footprint_legacy: vm_phys_footprint.c + $(CC) $(DT_CFLAGS) $(OTHER_CFLAGS) $(CFLAGS) $(DT_LDFLAGS) $(OTHER_LDFLAGS) $(LDFLAGS) $< -o $(SYMROOT)/$@ + endif task_info_28439149: CODE_SIGN_ENTITLEMENTS = ./task_for_pid_entitlement.plist @@ -177,11 +239,13 @@ proc_info: OTHER_LDFLAGS += -ldarwintest_utils proc_info_list_kthreads: CODE_SIGN_ENTITLEMENTS = ./proc_info_list_kthreads.entitlements +proc_info_44873309: CODE_SIGN_ENTITLEMENTS = ./proc_info_44873309.entitlements + disk_mount_conditioner: disk_mount_conditioner* disk_mount_conditioner: CODE_SIGN_ENTITLEMENTS=./disk_mount_conditioner-entitlements.plist disk_mount_conditioner: OTHER_LDFLAGS += -ldarwintest_utils -OTHER_TEST_TARGETS += disk_mount_conditioner_unentitled +disk_mount_conditioner: OTHER_TEST_TARGETS += disk_mount_conditioner_unentitled disk_mount_conditioner_unentitled: OTHER_CFLAGS += -DTEST_UNENTITLED disk_mount_conditioner_unentitled: OTHER_LDFLAGS += -ldarwintest_utils disk_mount_conditioner_unentitled: disk_mount_conditioner.c @@ -200,6 +264,8 @@ thread_group_set_32261625: INVALID_ARCHS = i386 task_info: CODE_SIGN_ENTITLEMENTS = task_for_pid_entitlement.plist +task_vm_info_decompressions: INVALID_ARCHS = x86_64 i386 + socket_bind_35243417: CODE_SIGN_ENTITLEMENTS = network_entitlements.plist socket_bind_35685803: CODE_SIGN_ENTITLEMENTS = network_entitlements.plist @@ -209,8 +275,14 @@ ifneq (osx,$(TARGET_NAME)) EXCLUDED_SOURCES += no32exec_35914211.c no32exec_35914211_helper.c endif -no32exec_35914211_helper: INVALID_ARCHS = x86_64 -no32exec_35914211: INVALID_ARCHS = i386 +no32exec_35914211_helper: INVALID_ARCHS = x86_64 +no32exec_35914211_helper_binprefs: + $(CC) $(OTHER_CFLAGS) $(CFLAGS) $(OTHER_LDFLAGS) $(LDFLAGS) -ldarwintest -arch i386 -arch x86_64 \ + no32exec_35914211_helper_binprefs.c -o $(SYMROOT)/no32exec_35914211_helper_binprefs + +no32exec_35914211: INVALID_ARCHS = i386 +no32exec_35914211: no32exec_35914211_helper +no32exec_35914211: no32exec_35914211_helper_binprefs MIG:=SDKROOT=$(SDKROOT) $(shell xcrun -sdk "$(TARGETSDK)" -find mig) @@ -227,6 +299,9 @@ install-excserver: ; exc_resource_threads: excserver exc_resource_threads: OTHER_CFLAGS += $(OBJROOT)/excserver.c -I $(OBJROOT) +fp_exception: excserver +fp_exception: OTHER_CFLAGS += $(OBJROOT)/excserver.c -I $(OBJROOT) + ifneq (osx,$(TARGET_NAME)) EXCLUDED_SOURCES += ldt_code32.s ldt.c else @@ -239,7 +314,8 @@ $(OBJROOT)/ldt_mach_exc_server.c: ldt: INVALID_ARCHS = i386 ldt: $(OBJROOT)/ldt_mach_exc_server.c -ldt: OTHER_CFLAGS += -I $(OBJROOT) $(SRCROOT)/ldt_code32.s -Wl,-pagezero_size,0x1000 +ldt: OTHER_CFLAGS += -I $(OBJROOT) $(SRCROOT)/ldt_code32.s -Wl,-pagezero_size,0x1000 -Wno-missing-variable-declarations +ldt: CODE_SIGN_ENTITLEMENTS=ldt_entitlement.plist endif ifneq ($(PLATFORM),BridgeOS) @@ -249,8 +325,9 @@ remote_time: INVALID_ARCHS = armv7 armv7s arm64_32 endif vm_phys_footprint: OTHER_LDFLAGS += -framework CoreFoundation -framework IOSurface -vm_phys_footprint_legacy: legacy_footprint.entitlement -vm_phys_footprint_legacy: OTHER_LDFLAGS += -framework CoreFoundation -framework IOSurface -vm_phys_footprint_legacy: CODE_SIGN_ENTITLEMENTS=./legacy_footprint.entitlement + +debug_control_port_for_pid: CODE_SIGN_ENTITLEMENTS = ./debug_control_port_for_pid_entitlement.plist + +prng: OTHER_LDFLAGS += -ldarwintest_utils include $(DEVELOPER_DIR)/AppleInternal/Makefiles/darwintest/Makefile.targets diff --git a/tests/avx.c b/tests/avx.c index 345361957..44c0a9ed1 100644 --- a/tests/avx.c +++ b/tests/avx.c @@ -16,7 +16,8 @@ T_GLOBAL_META( T_META_NAMESPACE("xnu.intel"), - T_META_CHECK_LEAKS(false) + T_META_CHECK_LEAKS(false), + T_META_RUN_CONCURRENTLY(true) ); #define NORMAL_RUN_TIME (10) diff --git a/tests/backtracing.c b/tests/backtracing.c index 379960766..f0af5447d 100644 --- a/tests/backtracing.c +++ b/tests/backtracing.c @@ -9,15 +9,17 @@ #include #include +T_GLOBAL_META(T_META_RUN_CONCURRENTLY(true)); + #define USER_FRAMES (12) -#define NON_RECURSE_FRAMES (5) +#define NON_RECURSE_FRAMES (4) static const char *user_bt[USER_FRAMES] = { - NULL, NULL, + NULL, "backtrace_thread", "recurse_a", "recurse_b", "recurse_a", "recurse_b", - "recurse_a", "recurse_b", "recurse_a", + "recurse_a", "recurse_b", "recurse_a", "recurse_b", "expect_stack", NULL }; @@ -28,13 +30,15 @@ expect_frame(const char **bt, unsigned int bt_len, CSSymbolRef symbol, const char *name; unsigned int frame_idx = max_frames - bt_idx - 1; - if (bt[frame_idx] == NULL) { - T_LOG("frame %2u: skipping system frame", frame_idx); + if (CSIsNull(symbol)) { + T_FAIL("invalid symbol for address %#lx at frame %d", addr, + frame_idx); return; } - if (CSIsNull(symbol)) { - T_FAIL("invalid symbol for address %#lx at frame %d", addr, frame_idx); + if (bt[frame_idx] == NULL) { + T_LOG("frame %2u: skipping system frame %s", frame_idx, + CSSymbolGetName(symbol)); return; } diff --git a/tests/coalition_info.c b/tests/coalition_info.c new file mode 100644 index 000000000..14ce533c3 --- /dev/null +++ b/tests/coalition_info.c @@ -0,0 +1,64 @@ +#include +#include +#include +#include +#include +#include +#include +#include + +T_GLOBAL_META(T_META_RUN_CONCURRENTLY(true)); + +static void +skip_if_monotonic_unsupported(void) +{ + int r; + int supported = 0; + size_t supported_size = sizeof(supported); + + r = sysctlbyname("kern.monotonic.supported", &supported, &supported_size, + NULL, 0); + if (r < 0) { + T_WITH_ERRNO; + T_SKIP("could not find \"kern.monotonic.supported\" sysctl"); + } + + if (!supported) { + T_SKIP("monotonic is not supported on this platform"); + } +} + +T_DECL(coalition_resource_info_counters, + "ensure that coalition resource info produces valid counter data") +{ + skip_if_monotonic_unsupported(); + + struct proc_pidcoalitioninfo idinfo = {}; + int ret = proc_pidinfo(getpid(), PROC_PIDCOALITIONINFO, 0, + &idinfo, sizeof(idinfo)); + T_ASSERT_POSIX_SUCCESS(ret, "proc_pidinfo(... PROC_PIDCOALITIONINFO ...)"); + + uint64_t resid = idinfo.coalition_id[COALITION_TYPE_RESOURCE]; + + struct coalition_resource_usage coalusage[2] = {}; + ret = coalition_info_resource_usage(resid, &coalusage[0], + sizeof(coalusage[0])); + T_ASSERT_POSIX_SUCCESS(ret, "coalition_info_resource_usage()"); + T_EXPECT_GT(coalusage[0].cpu_instructions, UINT64_C(0), + "instruction count is non-zero"); + T_EXPECT_GT(coalusage[0].cpu_cycles, UINT64_C(0), + "cycle count is non-zero"); + + sleep(1); + + ret = coalition_info_resource_usage(resid, &coalusage[1], + sizeof(coalusage[1])); + T_ASSERT_POSIX_SUCCESS(ret, "coalition_info_resource_usage()"); + + T_EXPECT_GE(coalusage[1].cpu_instructions, coalusage[0].cpu_instructions, + "instruction count is monotonically increasing (+%" PRIu64 ")", + coalusage[1].cpu_instructions - coalusage[0].cpu_instructions); + T_EXPECT_GE(coalusage[1].cpu_cycles, coalusage[0].cpu_cycles, + "cycle count is monotonically increasing (+%" PRIu64 ")", + coalusage[1].cpu_cycles - coalusage[0].cpu_cycles); +} diff --git a/tests/cpucount.c b/tests/cpucount.c index a3641bdbe..24a2c156c 100644 --- a/tests/cpucount.c +++ b/tests/cpucount.c @@ -27,6 +27,8 @@ #include /* private header for _os_cpu_number */ +T_GLOBAL_META(T_META_RUN_CONCURRENTLY(true)); + /* const variables aren't constants, but enums are */ enum { max_threads = 40 }; diff --git a/tests/data_protection.c b/tests/data_protection.c index 7a7e4dc8a..bb0411dec 100644 --- a/tests/data_protection.c +++ b/tests/data_protection.c @@ -750,8 +750,8 @@ set_passcode(char * new_passcode, char * old_passcode) } char * const keystorectl_args[] = { - KEYSTORECTL_PATH, - "change-password", + KEYBAGDTEST_PATH, + "syspass", old_passcode, new_passcode, NULL diff --git a/tests/debug_control_port_for_pid.c b/tests/debug_control_port_for_pid.c new file mode 100644 index 000000000..6985908e2 --- /dev/null +++ b/tests/debug_control_port_for_pid.c @@ -0,0 +1,24 @@ +#include +#include +#include +#include +#include +#include +#include +#include + +T_GLOBAL_META(T_META_RUN_CONCURRENTLY(true)); + +T_DECL(debug_control_port_for_pid_success, + "Verify that with debug_port entitlement you can call debug_control_port_for_pid", + T_META_ASROOT(true), T_META_CHECK_LEAKS(false)) +{ + if (geteuid() != 0) { + T_SKIP("test requires root privileges to run."); + } + + mach_port_t port = MACH_PORT_NULL; + T_ASSERT_MACH_SUCCESS(debug_control_port_for_pid(mach_task_self(), 1, &port), "debug_control_port_for_pid"); + T_EXPECT_NE(port, MACH_PORT_NULL, "debug_port"); + mach_port_deallocate(mach_task_self(), port); +} diff --git a/tests/debug_control_port_for_pid_entitlement.plist b/tests/debug_control_port_for_pid_entitlement.plist new file mode 100644 index 000000000..c1cadeafc --- /dev/null +++ b/tests/debug_control_port_for_pid_entitlement.plist @@ -0,0 +1,8 @@ + + + + + com.apple.private.debug_port + + + diff --git a/tests/disk_mount_conditioner.c b/tests/disk_mount_conditioner.c index 6c733f451..4cc70598d 100644 --- a/tests/disk_mount_conditioner.c +++ b/tests/disk_mount_conditioner.c @@ -24,7 +24,8 @@ static void perf_setup(char **path, int *fd); T_GLOBAL_META( T_META_NAMESPACE("xnu.vfs.dmc"), - T_META_ASROOT(true) + T_META_ASROOT(true), + T_META_RUN_CONCURRENTLY(true) ); #pragma mark Entitled Tests @@ -271,7 +272,8 @@ T_DECL(fsctl_set_nonroot, } T_DECL(fsctl_delays, - "Validate I/O delays when DMC is enabled") + "Validate I/O delays when DMC is enabled", + T_META_RUN_CONCURRENTLY(false)) { char *path; int fd; diff --git a/tests/exc_resource_threads.c b/tests/exc_resource_threads.c index 173a8ef82..09caf8cda 100644 --- a/tests/exc_resource_threads.c +++ b/tests/exc_resource_threads.c @@ -19,6 +19,8 @@ #include +T_GLOBAL_META(T_META_RUN_CONCURRENTLY(true)); + static dispatch_semaphore_t sync_sema; kern_return_t diff --git a/tests/extended_getdirentries64.c b/tests/extended_getdirentries64.c new file mode 100644 index 000000000..f30652ada --- /dev/null +++ b/tests/extended_getdirentries64.c @@ -0,0 +1,45 @@ +#ifdef T_NAMESPACE +#undef T_NAMESPACE +#endif + +#include +#include + +#define PRIVATE 1 +#include "../bsd/sys/dirent.h" + +T_GLOBAL_META(T_META_RUN_CONCURRENTLY(true)); + +ssize_t __getdirentries64(int fd, void *buf, size_t bufsize, off_t *basep); + +T_DECL(getdirentries64_extended, "check for GETDIRENTRIES64_EOF") +{ + char buf[GETDIRENTRIES64_EXTENDED_BUFSIZE]; + getdirentries64_flags_t *flags; + ssize_t result; + off_t offset; + int fd; + bool eof = false; + + flags = (getdirentries64_flags_t *)(uintptr_t)(buf + sizeof(buf) - + sizeof(getdirentries64_flags_t)); + fd = open("/", O_DIRECTORY | O_RDONLY); + T_ASSERT_POSIX_SUCCESS(fd, "open(/)"); + + for (;;) { + *flags = (getdirentries64_flags_t)~0; + result = __getdirentries64(fd, buf, sizeof(buf), &offset); + T_ASSERT_POSIX_SUCCESS(result, "__getdirentries64()"); + T_ASSERT_LE((size_t)result, sizeof(buf) - sizeof(getdirentries64_flags_t), + "The kernel should have left space for the flags"); + T_ASSERT_NE(*flags, (getdirentries64_flags_t)~0, + "The kernel should have returned status"); + if (eof) { + T_ASSERT_EQ(result, 0l, "At EOF, we really should be done"); + T_ASSERT_TRUE(*flags & GETDIRENTRIES64_EOF, "And EOF should still be set"); + T_END; + } + T_ASSERT_NE(result, 0l, "We're not at EOF, we should have an entry"); + eof = (*flags & GETDIRENTRIES64_EOF); + } +} diff --git a/tests/fp_exception.c b/tests/fp_exception.c new file mode 100644 index 000000000..5010d9f8a --- /dev/null +++ b/tests/fp_exception.c @@ -0,0 +1,254 @@ +/* + * Copyright (c) 2019 Apple Computer, Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ +/** + * On devices that support it, this test ensures that a mach exception is + * generated when an ARMv8 floating point exception is triggered. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#if __has_feature(ptrauth_calls) +#include +#endif + +T_GLOBAL_META(T_META_RUN_CONCURRENTLY(true)); + +/* The bit to set in FPCR to enable the divide-by-zero floating point exception. */ +#define FPCR_DIV_EXC 0x200 + +/* Whether we caught the EXC_ARITHMETIC mach exception or not. */ +static volatile bool mach_exc_caught = false; + +/** + * mach_exc_server() is a MIG-generated function that verifies the message + * that was received is indeed a mach exception and then calls + * catch_mach_exception_raise_state() to handle the exception. + */ +extern boolean_t mach_exc_server(mach_msg_header_t *, mach_msg_header_t *); + +/** + * This has to be defined for linking purposes, but it's unused in this test. + */ +kern_return_t +catch_mach_exception_raise( + mach_port_t exception_port, + mach_port_t thread, + mach_port_t task, + exception_type_t type, + exception_data_t codes, + mach_msg_type_number_t code_count) +{ +#pragma unused(exception_port, thread, task, type, codes, code_count) + T_FAIL("Triggered catch_mach_exception_raise() which shouldn't happen..."); + __builtin_unreachable(); +} + +/** + * Called by mach_exc_server() to handle the exception. This will verify the + * exception is a floating point divide-by-zero exception and will then modify + * the thread state to move to the next instruction. + */ +kern_return_t +catch_mach_exception_raise_state( + mach_port_t exception_port, + exception_type_t type, + exception_data_t codes, + mach_msg_type_number_t code_count, + int *flavor, + thread_state_t in_state, + mach_msg_type_number_t in_state_count, + thread_state_t out_state, + mach_msg_type_number_t *out_state_count) +{ +#pragma unused(exception_port, type, codes, code_count, flavor, in_state, in_state_count, out_state, out_state_count) +#ifdef __arm64__ + T_LOG("Caught a mach exception!\n"); + + /* Floating point divide by zero should cause an EXC_ARITHMETIC exception. */ + T_ASSERT_EQ(type, EXC_ARITHMETIC, "Caught an EXC_ARITHMETIC exception"); + + /* There should only be two code vales. */ + T_ASSERT_EQ(code_count, 2, "Two code values were provided with the mach exception"); + + /** + * The code values should be 64-bit since MACH_EXCEPTION_CODES was specified + * when setting the exception port. + */ + uint64_t *codes_64 = (uint64_t*)codes; + T_LOG("Mach exception codes[0]: %#llx, codes[1]: %#llx\n", codes_64[0], codes_64[1]); + + /* Verify that we're receiving 64-bit ARM thread state values. */ + T_ASSERT_EQ(*flavor, ARM_THREAD_STATE64, "The thread state flavor is ARM_THREAD_STATE64"); + T_ASSERT_EQ(in_state_count, ARM_THREAD_STATE64_COUNT, "The thread state count is ARM_THREAD_STATE64_COUNT"); + + /* Verify the exception is a floating point divide-by-zero exception. */ + T_ASSERT_EQ(codes_64[0], EXC_ARM_FP_DZ, "The subcode is EXC_ARM_FP_DZ (floating point divide-by-zero)"); + + /** + * Increment the PC to the next instruction so the thread doesn't cause + * another exception when it resumes. + */ + *out_state_count = in_state_count; /* size of state object in 32-bit words */ + memcpy((void*)out_state, (void*)in_state, in_state_count * 4); + arm_thread_state64_t *state = (arm_thread_state64_t*)out_state; + + void *pc = (void*)(arm_thread_state64_get_pc(*state) + 4); +#if __has_feature(ptrauth_calls) + /* Have to sign the new PC value when pointer authentication is enabled. */ + pc = ptrauth_sign_unauthenticated(pc, ptrauth_key_function_pointer, 0); +#endif + arm_thread_state64_set_pc_fptr(*state, pc); + + mach_exc_caught = true; +#endif /* __arm64__ */ + + /* Return KERN_SUCCESS to tell the kernel to keep running the victim thread. */ + return KERN_SUCCESS; +} + +/** + * This has to be defined for linking purposes, but it's unused in this test. + */ +kern_return_t +catch_mach_exception_raise_state_identity( + mach_port_t exception_port, + mach_port_t thread, + mach_port_t task, + exception_type_t type, + exception_data_t codes, + mach_msg_type_number_t code_count, + int *flavor, + thread_state_t in_state, + mach_msg_type_number_t in_state_count, + thread_state_t out_state, + mach_msg_type_number_t *out_state_count) +{ +#pragma unused(exception_port, thread, task, type, codes, code_count, flavor, in_state, in_state_count, out_state, out_state_count) + T_FAIL("Triggered catch_mach_exception_raise_state_identity() which shouldn't happen..."); + __builtin_unreachable(); +} + +/** + * Thread to handle the mach exception generated by the floating point exception. + * + * @param arg The exception port to wait for a message on. + */ +void * +exc_server_thread(void *arg) +{ + mach_port_t exc_port = *(mach_port_t*)arg; + + /** + * mach_msg_server_once is a helper function provided by libsyscall that + * handles creating mach messages, blocks waiting for a message on the + * exception port, calls mach_exc_server() to handle the exception, and + * sends a reply based on the return value of mach_exc_server(). + */ +#define MACH_MSG_REPLY_SIZE 4096 + kern_return_t kr = mach_msg_server_once(mach_exc_server, MACH_MSG_REPLY_SIZE, exc_port, 0); + T_ASSERT_MACH_SUCCESS(kr, "Received mach exception message"); + + pthread_exit((void*)0); + __builtin_unreachable(); +} + +T_DECL(armv8_fp_exception, + "Test that ARMv8 floating point exceptions generate mach exceptions.") +{ +#ifndef __arm64__ + T_SKIP("Running on non-arm64 target, skipping..."); +#else + pthread_t exc_thread; + mach_port_t exc_port = MACH_PORT_NULL; + mach_port_t task = mach_task_self(); + mach_port_t thread = mach_thread_self(); + kern_return_t kr = KERN_SUCCESS; + + /* Attempt to enable Divide-by-Zero floating point exceptions in hardware. */ + uint64_t fpcr = __builtin_arm_rsr64("FPCR") | FPCR_DIV_EXC; + __builtin_arm_wsr64("FPCR", fpcr); +#define DSB_ISH 0xb + __builtin_arm_dsb(DSB_ISH); + + /* Devices that don't support floating point exceptions have FPCR as RAZ/WI. */ + if (__builtin_arm_rsr64("FPCR") != fpcr) { + T_SKIP("Running on a device that doesn't support floating point exceptions, skipping..."); + } + + /* Create the mach port the exception messages will be sent to. */ + kr = mach_port_allocate(task, MACH_PORT_RIGHT_RECEIVE, &exc_port); + T_ASSERT_MACH_SUCCESS(kr, "Allocated mach exception port"); + + /** + * Insert a send right into the exception port that the kernel will use to + * send the exception thread the exception messages. + */ + kr = mach_port_insert_right(task, exc_port, exc_port, MACH_MSG_TYPE_MAKE_SEND); + T_ASSERT_MACH_SUCCESS(kr, "Inserted a SEND right into the exception port"); + + /* Tell the kernel what port to send EXC_ARITHMETIC exceptions to. */ + kr = thread_set_exception_ports( + thread, + EXC_MASK_ARITHMETIC, + exc_port, + EXCEPTION_STATE | MACH_EXCEPTION_CODES, + ARM_THREAD_STATE64); + T_ASSERT_MACH_SUCCESS(kr, "Set the exception port to my custom handler"); + + /* Spawn the exception server's thread. */ + int err = pthread_create(&exc_thread, (pthread_attr_t*)0, exc_server_thread, (void*)&exc_port); + T_ASSERT_POSIX_ZERO(err, "Spawned exception server thread"); + + /* No need to wait for the exception server to be joined when it exits. */ + pthread_detach(exc_thread); + + /** + * This should cause a floating point divide-by-zero exception to get triggered. + * + * The kernel shouldn't resume this thread until the mach exception is handled + * by the exception server that was just spawned. The exception handler will + * explicitly increment the PC += 4 to move to the next instruction. + */ + float a = 6.5f; + float b = 0.0f; + __asm volatile ("fdiv %s0, %s1, %s2" : "=w" (a) : "w" (a), "w" (b)); + + if (mach_exc_caught) { + T_PASS("The expected floating point divide-by-zero exception was caught!"); + } else { + T_FAIL("The floating point divide-by-zero exception was not captured :("); + } +#endif /* __arm64__ */ +} diff --git a/tests/freebsd_waitpid_nohang.c b/tests/freebsd_waitpid_nohang.c index 815abe79e..872585d26 100644 --- a/tests/freebsd_waitpid_nohang.c +++ b/tests/freebsd_waitpid_nohang.c @@ -31,6 +31,8 @@ #include #include +T_GLOBAL_META(T_META_RUN_CONCURRENTLY(true)); + T_DECL(waitpid_nohang, "FreeBSDarwin--waitpid_nohang") { pid_t child, pid; diff --git a/tests/gettimeofday.c b/tests/gettimeofday.c index e2f792b4c..e10e939ef 100644 --- a/tests/gettimeofday.c +++ b/tests/gettimeofday.c @@ -4,6 +4,8 @@ #include +T_GLOBAL_META(T_META_RUN_CONCURRENTLY(true)); + extern int __gettimeofday(struct timeval *, struct timezone *); T_DECL(gettimeofday, "gettimeofday()", diff --git a/tests/host_statistics_rate_limiting.c b/tests/host_statistics_rate_limiting.c index 27809e747..b26ade3a0 100644 --- a/tests/host_statistics_rate_limiting.c +++ b/tests/host_statistics_rate_limiting.c @@ -7,6 +7,8 @@ #include #include +T_GLOBAL_META(T_META_RUN_CONCURRENTLY(true)); + #if !defined(CS_OPS_CLEARPLATFORM) #define CS_OPS_CLEARPLATFORM 13 #endif diff --git a/tests/immovable_rights.c b/tests/immovable_rights.c new file mode 100644 index 000000000..bc484d8ef --- /dev/null +++ b/tests/immovable_rights.c @@ -0,0 +1,51 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +T_GLOBAL_META(T_META_RUN_CONCURRENTLY(true)); + +T_DECL(immovable_rights, "Create a port with immovable receive rights") { + mach_port_t imm_port; + mach_port_options_t opts = { + .flags = MPO_CONTEXT_AS_GUARD | MPO_IMMOVABLE_RECEIVE + }; + kern_return_t kr; + + kr = mach_port_construct(mach_task_self(), &opts, 0x10, &imm_port); + T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "mach_port_construct"); + + mach_port_status_t status; + mach_msg_type_number_t status_size = MACH_PORT_RECEIVE_STATUS_COUNT; + kr = mach_port_get_attributes(mach_task_self(), imm_port, + MACH_PORT_RECEIVE_STATUS, (mach_port_info_t)&status, &status_size); + T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "mach_port_get_attributes"); + T_LOG("Status flags %d", status.mps_flags); + T_ASSERT_NE(0, (status.mps_flags & MACH_PORT_STATUS_FLAG_GUARD_IMMOVABLE_RECEIVE), "Imm rcv bit is set"); + + mach_port_t imm_port2; + mach_port_options_t opts2 = {}; + + kr = mach_port_construct(mach_task_self(), &opts2, 0, &imm_port2); + T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "mach_port_construct"); + + kr = mach_port_guard_with_flags(mach_task_self(), imm_port2, 0x11, (uint64_t)MPG_IMMOVABLE_RECEIVE); + T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "mach_port_guard_with_flags"); + + kr = mach_port_get_attributes(mach_task_self(), imm_port2, + MACH_PORT_RECEIVE_STATUS, (mach_port_info_t)&status, &status_size); + T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "mach_port_get_attributes"); + T_LOG("Status flags %d", status.mps_flags); + T_ASSERT_NE(0, (status.mps_flags & MACH_PORT_STATUS_FLAG_GUARD_IMMOVABLE_RECEIVE), "Imm rcv bit is set"); + + kr = mach_port_swap_guard(mach_task_self(), imm_port2, 0x11, 0xde18); + T_ASSERT_MACH_SUCCESS(kr, "mach_port_swap_guard"); + + kr = mach_port_unguard(mach_task_self(), imm_port2, 0xde18); + T_ASSERT_MACH_SUCCESS(kr, "mach_port_unguard"); +} diff --git a/tests/immovable_send.c b/tests/immovable_send.c new file mode 100644 index 000000000..2e1f90d19 --- /dev/null +++ b/tests/immovable_send.c @@ -0,0 +1,215 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#define TASK_EXC_GUARD_MP_DELIVER 0x10 +#define MAX_ARGV 2 + +extern char **environ; + +kern_return_t +catch_mach_exception_raise_state(mach_port_t exception_port, + exception_type_t exception, + const mach_exception_data_t code, + mach_msg_type_number_t code_count, + int * flavor, + const thread_state_t old_state, + mach_msg_type_number_t old_state_count, + thread_state_t new_state, + mach_msg_type_number_t * new_state_count) +{ +#pragma unused(exception_port, exception, code, code_count, flavor, old_state, old_state_count, new_state, new_state_count) + T_FAIL("Unsupported catch_mach_exception_raise_state"); + return KERN_NOT_SUPPORTED; +} + +kern_return_t +catch_mach_exception_raise_state_identity(mach_port_t exception_port, + mach_port_t thread, + mach_port_t task, + exception_type_t exception, + mach_exception_data_t code, + mach_msg_type_number_t code_count, + int * flavor, + thread_state_t old_state, + mach_msg_type_number_t old_state_count, + thread_state_t new_state, + mach_msg_type_number_t * new_state_count) +{ +#pragma unused(exception_port, thread, task, exception, code, code_count, flavor, old_state, old_state_count, new_state, new_state_count) + T_FAIL("Unsupported catch_mach_exception_raise_state_identity"); + return KERN_NOT_SUPPORTED; +} + +kern_return_t +catch_mach_exception_raise(mach_port_t exception_port, + mach_port_t thread, + mach_port_t task, + exception_type_t exception, + mach_exception_data_t code, + mach_msg_type_number_t code_count) +{ +#pragma unused(exception_port, task, thread, code_count) + T_ASSERT_EQ(exception, EXC_GUARD, "exception type"); + T_LOG("Exception raised with exception code : %llx\n", *code); + T_END; + return KERN_SUCCESS; +} + +typedef struct { + mach_msg_header_t header; + mach_msg_body_t body; + mach_msg_port_descriptor_t port_descriptor; + mach_msg_trailer_t trailer; // subtract this when sending +} ipc_complex_message; + +struct args { + char *server_port_name; + mach_port_t server_port; +}; + +void parse_args(struct args *args); +void server_setup(struct args* args); +void* exception_server_thread(void *arg); +mach_port_t create_exception_port(void); + +#define TEST_TIMEOUT 10 + +void +parse_args(struct args *args) +{ + args->server_port_name = "TEST_IMMOVABLE_SEND"; + args->server_port = MACH_PORT_NULL; +} + +/* Create a mach IPC listener which will respond to the client's message */ +void +server_setup(struct args *args) +{ + kern_return_t ret; + mach_port_t bsport; + + ret = mach_port_allocate(mach_task_self(), MACH_PORT_RIGHT_RECEIVE, + &args->server_port); + T_ASSERT_MACH_SUCCESS(ret, "server: mach_port_allocate()"); + + ret = mach_port_insert_right(mach_task_self(), args->server_port, args->server_port, + MACH_MSG_TYPE_MAKE_SEND); + T_ASSERT_MACH_SUCCESS(ret, "server: mach_port_insert_right()"); + + ret = task_get_bootstrap_port(mach_task_self(), &bsport); + T_ASSERT_MACH_SUCCESS(ret, "server: task_get_bootstrap_port()"); + + ret = bootstrap_register(bsport, args->server_port_name, args->server_port); + T_ASSERT_MACH_SUCCESS(ret, "server: bootstrap_register()"); + + T_LOG("server: waiting for IPC messages from client on port '%s'.\n", + args->server_port_name); +} + +mach_port_t +create_exception_port() +{ + kern_return_t kret; + mach_port_t exc_port = MACH_PORT_NULL; + mach_port_t task = mach_task_self(); + + kret = mach_port_allocate(task, MACH_PORT_RIGHT_RECEIVE, &exc_port); + T_EXPECT_MACH_SUCCESS(kret, "mach_port_allocate exc_port"); + + kret = mach_port_insert_right(task, exc_port, exc_port, MACH_MSG_TYPE_MAKE_SEND); + T_EXPECT_MACH_SUCCESS(kret, "mach_port_insert_right exc_port"); + + return exc_port; +} + +void * +exception_server_thread(void *arg) +{ + kern_return_t kr; + mach_port_t exc_port = *(mach_port_t *)arg; + T_EXPECT_NE(exc_port, MACH_PORT_NULL, "exception port is not null"); + + /* Handle exceptions on exc_port */ + kr = mach_msg_server(mach_exc_server, 4096, exc_port, 0); + T_EXPECT_MACH_SUCCESS(kr, "mach_msg_server"); + + return NULL; +} + +T_DECL(catch_exception, "Send guard port descriptor to another process", T_META_IGNORECRASHES(".*immovable_send_client.*")) +{ + uint32_t task_exc_guard = 0; + size_t te_size = sizeof(&task_exc_guard); + kern_return_t kr; + mach_msg_type_number_t maskCount = 1; + exception_mask_t mask; + exception_handler_t handler; + exception_behavior_t behavior; + thread_state_flavor_t flavor; + mach_port_t task = mach_task_self(); + struct args* server_args = (struct args*)malloc(sizeof(struct args)); + posix_spawnattr_t attrs; + char *test_prog_name = "./immovable_send_client"; + char *child_args[MAX_ARGV]; + + T_LOG("Check if task_exc_guard exception has been enabled\n"); + sysctlbyname("kern.task_exc_guard_default", &task_exc_guard, &te_size, NULL, 0); + //TODO: check if sysctlbyname is successful + + /* Create the bootstrap port */ + parse_args(server_args); + server_setup(server_args); + + /* Create the exception port for the server */ + mach_port_t exc_port = create_exception_port(); + T_EXPECT_NOTNULL(exc_port, "Create a new exception port"); + + pthread_t s_exc_thread; + + /* Create exception serving thread */ + int ret = pthread_create(&s_exc_thread, NULL, exception_server_thread, &exc_port); + T_EXPECT_POSIX_SUCCESS(ret, "pthread_create exception_server_thread"); + + /* Get current exception ports */ + kr = task_get_exception_ports(task, EXC_MASK_GUARD, &mask, + &maskCount, &handler, &behavior, &flavor); + T_EXPECT_MACH_SUCCESS(kr, "task_get_exception_ports"); + + /* Initialize posix_spawn attributes */ + posix_spawnattr_init(&attrs); + + int err = posix_spawnattr_setexceptionports_np(&attrs, EXC_MASK_GUARD, exc_port, + (exception_behavior_t) (EXCEPTION_DEFAULT | MACH_EXCEPTION_CODES), 0); + T_EXPECT_POSIX_SUCCESS(err, "posix_spawnattr_setflags"); + + child_args[0] = test_prog_name; + child_args[1] = NULL; + + err = posix_spawn(NULL, child_args[0], NULL, &attrs, &child_args[0], environ); + T_EXPECT_POSIX_SUCCESS(err, "posix_spawn immovable_send_client"); + + int child_status; + /* Wait for child and check for exception */ + if (-1 == wait4(-1, &child_status, 0, NULL)) { + T_FAIL("wait4: child mia"); + } + + if (WIFEXITED(child_status) && WEXITSTATUS(child_status)) { + T_LOG("Child exited with status = %x", child_status); + } + + sigsuspend(0); +} diff --git a/tests/immovable_send_client.c b/tests/immovable_send_client.c new file mode 100644 index 000000000..682cdcf8c --- /dev/null +++ b/tests/immovable_send_client.c @@ -0,0 +1,130 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +typedef struct { + mach_msg_header_t header; + mach_msg_body_t body; + mach_msg_port_descriptor_t port_descriptor; + mach_msg_trailer_t trailer; // subtract this when sending +} ipc_complex_message; + +static ipc_complex_message icm_request = {}; + +struct args { + const char *progname; + int verbose; + int voucher; + int num_msgs; + const char *server_port_name; + mach_port_t server_port; + mach_port_t reply_port; + mach_port_t voucher_port; + int request_msg_size; + void *request_msg; + int reply_msg_size; + void *reply_msg; + mach_port_t sp_voucher_port; + uint32_t persona_id; + long client_pid; +}; + +static void +parse_args(struct args *args) +{ + args->verbose = 0; + args->voucher = 0; + args->server_port_name = "TEST_IMMOVABLE_SEND"; + args->server_port = MACH_PORT_NULL; + args->reply_port = MACH_PORT_NULL; + args->voucher_port = MACH_PORT_NULL; + args->num_msgs = 1; + args->request_msg_size = sizeof(ipc_complex_message) - sizeof(mach_msg_trailer_t); + //args->reply_msg_size = sizeof(ipc_complex_message2) - sizeof(mach_msg_trailer_t); + args->request_msg = &icm_request; + args->reply_msg = NULL; + args->client_pid = getpid(); +} + +int +main() +{ + struct args client_args = {}; + parse_args(&client_args); + + /* Find the bootstrap port */ + mach_port_t bsport; + kern_return_t ret = task_get_bootstrap_port(mach_task_self(), &bsport); + if (ret) { + mach_error("client: task_get_bootstrap_port()", ret); + exit(1); + } + + printf("client: Look up bootstrap service port\n"); + ret = bootstrap_look_up(bsport, client_args.server_port_name, + &client_args.server_port); + if (ret) { + mach_error("client: bootstrap_look_up()", ret); + exit(1); + } + + printf("client: Look up the ioconnect service port to be sent\n"); + io_service_t amfi = IO_OBJECT_NULL; + io_connect_t connect = IO_OBJECT_NULL; + IOReturn ioret; + + amfi = IOServiceGetMatchingService(kIOMasterPortDefault, IOServiceMatching("AppleMobileFileIntegrity")); + if (amfi == IO_OBJECT_NULL) { + fprintf(stderr, "client: unable to find AppleMobileFileIntegrity service\n"); + exit(1); + } + ioret = IOServiceOpen(amfi, mach_task_self(), 0, &connect); + if (ioret != kIOReturnSuccess) { + fprintf(stderr, "client: unable to open user client: 0x%x\n", ret); + exit(1); + } + + printf("client: Found the matching io_connect port = %d\n", connect); + + /* Construct the message */ + mach_msg_header_t *request = (mach_msg_header_t *)client_args.request_msg; + request->msgh_bits = MACH_MSGH_BITS_SET(MACH_MSG_TYPE_COPY_SEND, 0, + 0, 0) | MACH_MSGH_BITS_COMPLEX; + request->msgh_size = (mach_msg_size_t)client_args.request_msg_size; + request->msgh_remote_port = client_args.server_port; + request->msgh_local_port = MACH_PORT_NULL; + request->msgh_id = 1; + + ipc_complex_message *complexmsg = (ipc_complex_message *)request; + complexmsg->body.msgh_descriptor_count = 1; + complexmsg->port_descriptor.name = connect; + complexmsg->port_descriptor.disposition = MACH_MSG_TYPE_MOVE_SEND; + complexmsg->port_descriptor.type = MACH_MSG_PORT_DESCRIPTOR; + + mach_msg_option_t option = MACH_SEND_MSG; + + printf("client: Sending request (expecting it to fail) \n"); + mach_msg_return_t mret = mach_msg(request, + option, + (mach_msg_size_t)client_args.request_msg_size, + 0, + MACH_PORT_NULL, + MACH_MSG_TIMEOUT_NONE, + MACH_PORT_NULL); + + printf("client: mach_msg returned %x\n", mret); + if (mret != MACH_SEND_INVALID_RIGHT) { + mach_error("client: mach_msg", mret); + exit(1); + } + + printf("It should never reach here\n"); + + return 0; +} diff --git a/tests/in_cksum_test.c b/tests/in_cksum_test.c new file mode 100644 index 000000000..573172776 --- /dev/null +++ b/tests/in_cksum_test.c @@ -0,0 +1,235 @@ +/* arm64 os_cpu_in_cksum_mbuf sometimes incorrect with unaligned input buffer */ + +#include +#include +#include + +#include + +T_GLOBAL_META(T_META_RUN_CONCURRENTLY(true)); + +extern uint32_t os_cpu_in_cksum(const void *, uint32_t, uint32_t); + +/****************************************************************/ +static void +log_hexdump(const void *inp, size_t len) +{ + unsigned i, off = 0; + char buf[9 + 16 * 3 + 1]; + for (i = 0; i < len; i++) { + if (i % 16 == 0) { + off = (unsigned)snprintf(buf, sizeof(buf), "%08x:", i); + } + off += (unsigned)snprintf(buf + off, sizeof(buf) - off, " %02x", (((const uint8_t *)inp)[i]) & 0xff); + if (i % 16 == 15) { + T_LOG("%s", buf); + } + } + if (len % 16) { + T_LOG("%s", buf); + } +} + +/* I was going to use the one from rfc1701 section 4.1 + * but then I saw the errata. Hopefully this is dumb but + * correct, even though it is not particularly efficient. + */ +static uint16_t +dumb_in_cksum(const uint8_t *buf, size_t len) +{ + uint32_t partial = 0; + while (len > 1) { + uint16_t val = buf[1]; + val <<= 8; + val |= buf[0]; + len -= 2; + buf += 2; + partial += val; + while ((val = partial >> 16)) { + partial &= 0xffff; + partial += val; + } + } + if (len) { + uint16_t val = buf[0]; + partial += val; + while ((val = partial >> 16)) { + partial &= 0xffff; + partial += val; + } + } + return ~partial & 0xffff; +} + +/* Calculate a checksum divided into partial checksums */ +static uint16_t +split_in_cksum(const uint8_t *buf, int nsegs, const uint32_t *seglens, const uint8_t *aligns, uint8_t *tmpbuf) +{ + uint32_t partial = 0; + + for (int i = 0; i < nsegs; i++) { + /* Only the last segment can have an odd length */ + assert((i + 1 == nsegs) || seglens[i] % 2 == 0); + + /* Copy a segment into the tmpbuf with the requested alignment */ + memcpy(tmpbuf + aligns[i], buf, seglens[i]); + + partial = os_cpu_in_cksum(tmpbuf + aligns[i], seglens[i], partial); + buf += seglens[i]; + } + + return ~partial & 0xffff; +} + +static void +test_checksum(const uint8_t *data, uint32_t len) +{ + uint16_t dsum = dumb_in_cksum(data, len); + + const uint8_t MAXALIGN = 8; + + uint8_t tmpbuf[len + MAXALIGN]; + uint32_t seglens[2]; + uint8_t aligns[2]; + for (uint16_t split = 0; split < len; split += 2) { + seglens[0] = split; + seglens[1] = len - split; + for (aligns[0] = 0; aligns[0] < MAXALIGN; aligns[0]++) { + for (aligns[1] = 0; aligns[1] < MAXALIGN; aligns[1]++) { + uint16_t osum = split_in_cksum(data, 2, seglens, aligns, tmpbuf); + if (osum != dsum) { + /* hexdump packet and alignments for debugging */ + log_hexdump(data, len); + T_LOG("len %d seg[0] %d seg[1] %d align[0] %d align[1] %d\n", len, seglens[0], seglens[1], aligns[0], aligns[1]); + } + T_QUIET; T_ASSERT_EQ(osum, dsum, "checksum mismatch got 0x%04x expecting 0x%04x", htons(osum), htons(dsum)); + } + } + } + T_PASS("OK len %d", len); +} + +static void +test_one_random_packet(uint32_t maxlen) +{ + /* Pick a packet length */ + uint32_t len = arc4random_uniform(maxlen); + uint8_t data[len]; + arc4random_buf(data, len); + test_checksum(data, len); +} + +/* + * This is the checksummed portion of the first packet in checksum_error.pcap + * It is known to cause a problem at splits 44 and 46 with second alignment of 1 or 3 + */ +static uint8_t pkt49479689[] = { +/*00000000*/ 0xc0, 0xa8, 0x01, 0x06, 0xc0, 0xa8, 0x01, 0x07, 0x00, 0x06, 0x05, 0xc8, 0xcb, 0xf1, 0xc0, 0x24, // |...............$| +/*00000010*/ 0x2d, 0x23, 0x48, 0xd6, 0x3b, 0x44, 0x96, 0x7f, 0x80, 0x10, 0x20, 0x86, 0x00, 0x00, 0x00, 0x00, // |-#H.;D.... ..,..| +/*00000020*/ 0x01, 0x01, 0x08, 0x0a, 0x0c, 0xc4, 0x69, 0x3a, 0x31, 0x63, 0xb3, 0x37, 0x55, 0xe1, 0x62, 0x48, // |......i:1c.7U.bH| +/*00000030*/ 0xa4, 0xff, 0xff, 0xa0, 0xc5, 0xd9, 0x5d, 0xd2, 0x4d, 0xe4, 0xca, 0xd7, 0x83, 0x27, 0xcc, 0x90, // |......].M....'..| +/*00000040*/ 0x02, 0x26, 0x63, 0xd3, 0x02, 0x3c, 0xf1, 0x20, 0x15, 0xa6, 0x8b, 0xff, 0x98, 0x8d, 0x57, 0x2a, // |.&c..<. ......W*| +/*00000050*/ 0x06, 0x4b, 0x06, 0x49, 0x5d, 0x8a, 0x28, 0x66, 0xe6, 0x57, 0x71, 0xd9, 0x27, 0xd1, 0xb9, 0xd6, // |.K.I].(f.Wq.'...| +/*00000060*/ 0x20, 0x48, 0x13, 0x2e, 0xbf, 0x30, 0x8c, 0xce, 0x49, 0x99, 0x2a, 0xb7, 0x94, 0xa4, 0x3a, 0x8e, // | H...0..I.*...:.| +/*00000070*/ 0x35, 0xcc, 0x48, 0xb2, 0x7f, 0xe1, 0xca, 0x2f, 0x08, 0x49, 0x7f, 0x35, 0x61, 0xcf, 0x59, 0xa2, // |5.H..../.I.5a.Y.| +/*00000080*/ 0x3a, 0x5e, 0x10, 0x5a, 0x0a, 0xd7, 0xa2, 0x38, 0x64, 0xe1, 0x7c, 0x5d, 0xbd, 0x29, 0x65, 0x5a, // |:^.Z...8d.|].)eZ| +/*00000090*/ 0xf2, 0x14, 0x30, 0x51, 0x9b, 0x56, 0xbb, 0xe2, 0x04, 0x48, 0x04, 0x23, 0x53, 0x30, 0x3a, 0x0a, // |..0Q.V...H.#S0:.| +/*000000a0*/ 0x48, 0x5a, 0xdd, 0xe4, 0xd7, 0x5e, 0x5b, 0x5d, 0x90, 0x89, 0x7d, 0xf0, 0xad, 0x24, 0x1a, 0xa8, // |HZ...^[]..}..$..| +/*000000b0*/ 0x81, 0xc1, 0x6b, 0x11, 0x97, 0x68, 0xc0, 0xbb, 0xe4, 0x5c, 0xba, 0x1a, 0xe8, 0x9c, 0xc9, 0x8b, // |..k..h...\......| +/*000000c0*/ 0xb8, 0x2b, 0x11, 0x85, 0x7f, 0xbf, 0x19, 0x81, 0xb0, 0xfc, 0xfd, 0x4a, 0xac, 0x7b, 0xd3, 0x60, // |.+.........J.{.`| +/*000000d0*/ 0x44, 0x1f, 0x5e, 0x8d, 0x05, 0x6e, 0xd7, 0xd1, 0xef, 0x11, 0x84, 0xd3, 0x0d, 0x63, 0xcf, 0x56, // |D.^..n.......c.V| +/*000000e0*/ 0xf9, 0x27, 0xc4, 0xd0, 0x39, 0x0e, 0xac, 0x7e, 0xba, 0xb3, 0xb8, 0x9c, 0x21, 0x21, 0xc8, 0xa0, // |.'..9..~....!!..| +/*000000f0*/ 0xbc, 0xd8, 0x82, 0x6f, 0x81, 0xa6, 0xc2, 0xf5, 0xe0, 0xdb, 0x41, 0xd0, 0xd4, 0x18, 0x2a, 0x5b, // |...o......A...*[| +/*00000100*/ 0x93, 0x3d, 0x5a, 0x08, 0xe2, 0xac, 0x8d, 0xd3, 0x7d, 0xcc, 0x49, 0x33, 0xc9, 0xb8, 0x9e, 0x12, // |.=Z.....}.I3....| +/*00000110*/ 0x86, 0x63, 0x38, 0x9c, 0xce, 0x4a, 0xb7, 0xcc, 0xe9, 0x4b, 0x5e, 0xb5, 0x24, 0x42, 0x47, 0x28, // |.c8..J...K^.$BG(| +/*00000120*/ 0x1c, 0x09, 0xe8, 0x84, 0xa6, 0xf0, 0x5f, 0x03, 0x94, 0x6f, 0x6a, 0x18, 0x60, 0xc3, 0x12, 0x58, // |......_..oj.`..X| +/*00000130*/ 0x6c, 0xbe, 0x13, 0x85, 0xa4, 0xdf, 0xe1, 0x8c, 0x3a, 0x04, 0xe9, 0x56, 0xa3, 0x09, 0x41, 0xf1, // |l.......:..V..A.| +/*00000140*/ 0x70, 0xf5, 0xc4, 0x27, 0x8e, 0x18, 0x09, 0x56, 0x5f, 0x82, 0x08, 0xec, 0x84, 0x55, 0x3b, 0x58, // |p..'...V_....U;X| +/*00000150*/ 0x84, 0x7b, 0xc8, 0x63, 0x70, 0x6a, 0x83, 0x04, 0xc8, 0xff, 0xe7, 0x6a, 0xbc, 0xee, 0xc0, 0xfe, // |.{.cpj.....j....| +/*00000160*/ 0xef, 0x60, 0xb7, 0x04, 0xb5, 0x57, 0x53, 0x5b, 0xeb, 0x4d, 0xec, 0x22, 0xe8, 0x59, 0x22, 0x64, // |.`...WS[.M.".Y"d| +/*00000170*/ 0x20, 0x5a, 0x61, 0x7d, 0x92, 0x02, 0x80, 0xd0, 0x85, 0x56, 0x98, 0x75, 0xbe, 0x35, 0xaf, 0xe4, // | Za}.....V.u.5..| +/*00000180*/ 0xc3, 0x06, 0xfa, 0xc2, 0x29, 0xce, 0x80, 0xe2, 0x68, 0xf3, 0xd8, 0x4b, 0x72, 0x46, 0x6e, 0xa3, // |....)...h..KrFn.| +/*00000190*/ 0x88, 0x57, 0xfb, 0x08, 0xec, 0x60, 0x2f, 0x3c, 0xa4, 0xaf, 0x08, 0x64, 0x45, 0x16, 0xba, 0x7b, // |.W...`/<...dE..{| +/*000001a0*/ 0xad, 0x24, 0x7a, 0x1f, 0x53, 0x46, 0x0c, 0xe6, 0xe9, 0x99, 0xd7, 0x2b, 0x9d, 0x62, 0xd9, 0x4a, // |.$z.SF.....+.b.J| +/*000001b0*/ 0x80, 0x2a, 0x43, 0xc2, 0x78, 0xa6, 0x6b, 0x38, 0x8e, 0xc8, 0x40, 0x6b, 0x03, 0xe2, 0x47, 0x04, // |.*C.x.k8..@k..G.| +/*000001c0*/ 0xda, 0x08, 0x72, 0xf5, 0xbc, 0x66, 0x3f, 0x33, 0x4d, 0xb6, 0x26, 0xd0, 0x66, 0x8c, 0xa0, 0x70, // |..r..f?3M.&.f..p| +/*000001d0*/ 0x25, 0xbc, 0x68, 0xda, 0x02, 0x79, 0x89, 0xed, 0x0c, 0xfc, 0xe7, 0x3d, 0x15, 0xcf, 0x5e, 0xc9, // |%.h..y.....=..^.| +/*000001e0*/ 0x63, 0xe0, 0x64, 0xb1, 0xfb, 0x28, 0xf7, 0x29, 0x52, 0xcf, 0x7a, 0xe3, 0x6d, 0x46, 0xc5, 0x1a, // |c.d..(.)R.z.mF..| +/*000001f0*/ 0x71, 0x24, 0x4e, 0x12, 0x56, 0x86, 0xc7, 0xf5, 0x98, 0x3e, 0xa9, 0xbc, 0x5d, 0xe9, 0x22, 0x88, // |q$N.V....>..].".| +/*00000200*/ 0x9b, 0x61, 0xc4, 0xa2, 0xcc, 0x27, 0x54, 0x07, 0x88, 0xeb, 0xe1, 0x4e, 0xaa, 0x0a, 0xd6, 0x94, // |.a...'T....N....| +/*00000210*/ 0x83, 0x32, 0xf8, 0x1d, 0xff, 0x67, 0xe5, 0x63, 0x78, 0x04, 0x11, 0x24, 0x25, 0xd7, 0x22, 0x54, // |.2...g.cx..$%."T| +/*00000220*/ 0x73, 0x87, 0xc9, 0x53, 0x72, 0x51, 0xda, 0x24, 0x33, 0xd7, 0x5c, 0x40, 0x86, 0x77, 0xf9, 0xc2, // |s..SrQ.$3.\@.w..| +/*00000230*/ 0xeb, 0x7d, 0x4c, 0x72, 0xeb, 0xc9, 0x8b, 0xcc, 0x79, 0xcd, 0x4a, 0x5a, 0x9e, 0xe2, 0x83, 0x20, // |.}Lr....y.JZ... | +/*00000240*/ 0x19, 0x5b, 0x4b, 0xe6, 0x5c, 0xe2, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0x7b, 0x80, 0x69, // |.[K.\........{.i| +/*00000250*/ 0x29, 0x53, 0x97, 0xc2, 0xc9, 0x4c, 0x00, 0x00, 0x00, 0x00, 0x67, 0x75, 0x81, 0x80, 0x12, 0x6e, // |)S...L....gu...n| +/*00000260*/ 0x50, 0x66, 0xe9, 0x0a, 0x28, 0x3b, 0x1a, 0xf1, 0xcb, 0x46, 0x72, 0xf7, 0xe9, 0x9f, 0x84, 0x29, // |Pf..(;...Fr....)| +/*00000270*/ 0xb9, 0x95, 0xf9, 0x6d, 0x5d, 0x04, 0x51, 0x7f, 0x0e, 0xf0, 0xe4, 0x3d, 0x4b, 0xd2, 0xb2, 0xb5, // |...m].Q....=K...| +/*00000280*/ 0x51, 0xf0, 0x31, 0x8e, 0x55, 0x18, 0x54, 0xf7, 0xee, 0x03, 0x37, 0x07, 0x33, 0x43, 0x8b, 0x5a, // |Q.1.U.T...7.3C.Z| +/*00000290*/ 0x1d, 0x16, 0xe8, 0xc4, 0x8b, 0x2c, 0x8a, 0x01, 0x5c, 0x45, 0xc6, 0xd1, 0x9d, 0xa9, 0x0a, 0xe2, // |.....,..\E......| +/*000002a0*/ 0x15, 0x4b, 0x8b, 0x00, 0x84, 0xbf, 0x3d, 0xad, 0xed, 0x86, 0x8e, 0x5c, 0x76, 0xe9, 0xbe, 0x4b, // |.K....=....\v..K| +/*000002b0*/ 0xd5, 0xb5, 0xb0, 0x08, 0x7f, 0xd7, 0x71, 0x57, 0x44, 0x67, 0x31, 0x8b, 0x43, 0x7d, 0xf8, 0x5a, // |......qWDg1.C}.Z| +/*000002c0*/ 0xcd, 0xe6, 0x4c, 0xec, 0x89, 0xa5, 0xd1, 0x03, 0x86, 0xfd, 0x01, 0x7d, 0x22, 0x32, 0xf0, 0xc3, // |..L........}"2..| +/*000002d0*/ 0x23, 0x99, 0x8e, 0x69, 0x14, 0x54, 0x54, 0x03, 0xec, 0x27, 0x6a, 0x7d, 0x13, 0xc7, 0xe2, 0x39, // |#..i.TT..'j}...9| +/*000002e0*/ 0x2b, 0xc0, 0x1a, 0x70, 0x82, 0xe9, 0x80, 0x73, 0xf4, 0x27, 0x26, 0xca, 0x5c, 0xf6, 0x7f, 0x46, // |+..p...s.'&.\..F| +/*000002f0*/ 0xf7, 0x00, 0x58, 0x3c, 0x3a, 0xcc, 0x1e, 0x9b, 0xd2, 0x22, 0x78, 0x04, 0x23, 0xc6, 0xfb, 0xdf, // |..X<:...."x.#...| +/*00000300*/ 0x8b, 0x36, 0xd6, 0xfa, 0xd8, 0x53, 0xbd, 0x0e, 0xaf, 0x1a, 0x04, 0xd1, 0x81, 0xd6, 0x1f, 0x1a, // |.6...S..........| +/*00000310*/ 0x74, 0x4d, 0xcf, 0xf6, 0xcf, 0x61, 0x6c, 0xd9, 0x7f, 0x1e, 0xb3, 0x1c, 0x2e, 0x74, 0x1a, 0x37, // |tM...al......t.7| +/*00000320*/ 0xfa, 0x2a, 0x24, 0x6d, 0xc2, 0x6d, 0x54, 0xfb, 0xd7, 0x9b, 0x34, 0x87, 0xeb, 0xac, 0x38, 0xc7, // |.*$m.mT...4...8.| +/*00000330*/ 0xe3, 0xc9, 0x6a, 0x98, 0x04, 0x2b, 0x33, 0x2d, 0x87, 0xf4, 0x25, 0xd6, 0x64, 0x14, 0xe8, 0xd0, // |..j..+3-..%.d...| +/*00000340*/ 0x84, 0x18, 0xc0, 0x39, 0x4d, 0xb5, 0xe5, 0xe2, 0xdb, 0x74, 0x59, 0x52, 0xad, 0x91, 0x1a, 0x55, // |...9M....tYR...U| +/*00000350*/ 0xae, 0xa3, 0xe1, 0x73, 0x4e, 0x76, 0x14, 0x94, 0xab, 0xec, 0x69, 0xb7, 0x0c, 0xa3, 0x71, 0x14, // |...sNv....i...q.| +/*00000360*/ 0x04, 0xbf, 0xf9, 0x75, 0xca, 0x2b, 0x8a, 0xa4, 0x5b, 0xe6, 0xe8, 0x61, 0x8d, 0xad, 0x1a, 0x62, // |...u.+..[..a...b| +/*00000370*/ 0x97, 0xaa, 0xfa, 0x3f, 0x88, 0x75, 0xcd, 0xe7, 0x29, 0x66, 0xbd, 0xcf, 0x50, 0xfd, 0x10, 0x09, // |...?.u..)f..P...| +/*00000380*/ 0x45, 0x2e, 0x97, 0xd5, 0x7c, 0xb4, 0x12, 0x7a, 0x5f, 0xfc, 0x1c, 0x74, 0x02, 0xf0, 0xa7, 0x98, // |E...|..z_..t....| +/*00000390*/ 0xd2, 0x03, 0x86, 0x19, 0x08, 0x54, 0x3d, 0x4d, 0x88, 0x13, 0x88, 0x87, 0x26, 0x61, 0x3e, 0x88, // |.....T=M....&a>.| +/*000003a0*/ 0xf8, 0x18, 0xcc, 0xac, 0x6f, 0xec, 0x12, 0x57, 0xfe, 0x80, 0xa3, 0xbe, 0x04, 0x39, 0x52, 0xe0, // |....o..W.....9R.| +/*000003b0*/ 0xc3, 0xfa, 0xed, 0x4f, 0xf5, 0x07, 0x59, 0x7e, 0xfa, 0xb9, 0x35, 0x36, 0xf2, 0x55, 0x23, 0xab, // |...O..Y~..56.U#.| +/*000003c0*/ 0x15, 0x65, 0x57, 0xb2, 0xce, 0xdb, 0x63, 0xe0, 0x1f, 0x1f, 0xa5, 0xfa, 0x70, 0x2e, 0x53, 0x76, // |.eW...c.....p.Sv| +/*000003d0*/ 0x20, 0x5b, 0x54, 0xc2, 0x0f, 0xe9, 0xca, 0x2c, 0x82, 0xf1, 0x30, 0x61, 0xbb, 0x99, 0x1e, 0x2a, // | [T....,..0a...*| +/*000003e0*/ 0xa2, 0x71, 0x91, 0x39, 0x07, 0xda, 0xcd, 0x50, 0xbb, 0x73, 0x5b, 0xa4, 0x05, 0x26, 0xee, 0x9f, // |.q.9...P.s[..&..| +/*000003f0*/ 0x5e, 0x88, 0x72, 0x92, 0xc9, 0x60, 0x2b, 0xd7, 0x6a, 0x91, 0x40, 0x52, 0x6b, 0xd1, 0xab, 0x00, // |^.r..`+.j.@Rk...| +/*00000400*/ 0xcc, 0x60, 0x53, 0x9b, 0x36, 0x40, 0x3b, 0x60, 0x18, 0x7f, 0x5f, 0xc2, 0x8c, 0x44, 0x08, 0xae, // |.`S.6@;`.._..D..| +/*00000410*/ 0x95, 0xae, 0x8c, 0xd7, 0x8d, 0x68, 0x4a, 0x42, 0x64, 0x1d, 0xdf, 0xdc, 0x17, 0x1a, 0x28, 0xe0, // |.....hJBd.....(.| +/*00000420*/ 0x55, 0x35, 0x00, 0x65, 0xe4, 0xd4, 0xd7, 0x3e, 0x1c, 0x6a, 0xa1, 0xbf, 0xba, 0xd8, 0x29, 0xce, // |U5.e...>.j....).| +/*00000430*/ 0xa6, 0x1f, 0xf9, 0x06, 0xff, 0x70, 0x43, 0xc8, 0xa0, 0x49, 0x03, 0xcd, 0x19, 0xf2, 0x16, 0x01, // |.....pC..I......| +/*00000440*/ 0x46, 0xf0, 0x29, 0xdb, 0xc2, 0x85, 0x89, 0x20, 0x37, 0x91, 0xd3, 0x74, 0x1c, 0x38, 0x08, 0xb3, // |F.).... 7..t.8..| +/*00000450*/ 0xd5, 0xa3, 0x4c, 0x52, 0x6e, 0xb3, 0x24, 0xc0, 0xbc, 0xd6, 0xc6, 0x64, 0x0b, 0x40, 0x44, 0xc4, // |..LRn.$....d.@D.| +/*00000460*/ 0xb9, 0x11, 0x10, 0x2a, 0xcd, 0x43, 0x99, 0x47, 0xe9, 0xfb, 0xf0, 0xe0, 0x56, 0x13, 0x40, 0x41, // |...*.C.G....V.@A| +/*00000470*/ 0x8a, 0x41, 0xcc, 0x92, 0x8d, 0xd5, 0xb9, 0x47, 0x05, 0xc7, 0x72, 0x76, 0x02, 0x09, 0x05, 0xd9, // |.A.....G..rv....| +/*00000480*/ 0x12, 0xb6, 0xa8, 0x0a, 0x86, 0x28, 0x5c, 0x41, 0x7e, 0xf1, 0xbc, 0xa9, 0x93, 0xae, 0xdf, 0x0b, // |.....(\A~.......| +/*00000490*/ 0xa1, 0xfc, 0x47, 0xb5, 0xde, 0x1c, 0x25, 0xe9, 0x8b, 0xb2, 0x03, 0x3a, 0xa7, 0x36, 0x4e, 0xcb, // |..G...%....:.6N.| +/*000004a0*/ 0xfa, 0xcd, 0xe6, 0x4f, 0x67, 0x3f, 0xe2, 0xa3, 0x3d, 0xdb, 0x61, 0x0d, 0x99, 0x05, 0x15, 0x96, // |...Og?..=.a.....| +/*000004b0*/ 0x14, 0x4e, 0x89, 0xf7, 0x8b, 0xdd, 0x84, 0x48, 0x35, 0xa8, 0x5c, 0x73, 0x67, 0x5d, 0x55, 0x5d, // |.N.....H5.\sg]U]| +/*000004c0*/ 0xe2, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0x82, 0x80, 0x69, 0x29, 0x54, 0x97, 0xc2, 0xcd, // |..........i)T...| +/*000004d0*/ 0x4c, 0x00, 0x00, 0x00, 0x00, 0x44, 0x07, 0x64, 0xa1, 0x66, 0xe3, 0x3c, 0x6e, 0x51, 0x96, 0x6a, // |L....D.d.f...$...h.,..g| +/*00000530*/ 0x5f, 0x0b, 0x85, 0x75, 0x70, 0xa5, 0x03, 0x0e, 0x25, 0xe2, 0x09, 0x34, 0x78, 0x66, 0x6f, 0xe0, // |_..up...%..4xfo.| +/*00000540*/ 0xf6, 0xac, 0xaf, 0xc6, 0x4a, 0xbc, 0xda, 0xc5, 0x06, 0x9e, 0x53, 0xe8, 0x75, 0x0b, 0x50, 0xde, // |....J.....S.u.P.| +/*00000550*/ 0xf7, 0xc0, 0x7f, 0x78, 0x97, 0x13, 0x22, 0x76, 0x18, 0x88, 0xf9, 0x99, 0xa1, 0x05, 0x42, 0xee, // |...x.."v......B.| +/*00000560*/ 0x40, 0xf0, 0xb7, 0x00, 0x0e, 0xf5, 0xac, 0x7c, 0xe5, 0x8b, 0x1f, 0x05, 0xe3, 0xd1, 0x9d, 0x6b, // |@......|.......k| +/*00000570*/ 0xd4, 0x9c, 0x3d, 0x14, 0x08, 0x21, 0xce, 0x72, 0x8f, 0x91, 0x9c, 0xba, 0xdd, 0x46, 0xcd, 0xef, // |..=..!.r.....F..| +/*00000580*/ 0x6d, 0x7b, 0x0d, 0x7d, 0x59, 0x91, 0x05, 0xc2, 0xde, 0x6c, 0x8a, 0x65, 0xd0, 0x97, 0xb1, 0x93, // |m{.}Y....l.e....| +/*00000590*/ 0x9f, 0x51, 0xec, 0x79, 0x30, 0x44, 0xbd, 0xe5, 0xdf, 0x94, 0xed, 0xad, 0x18, 0xd7, 0x24, 0x89, // |.Q.y0D........$.| +/*000005a0*/ 0x36, 0x65, 0xc5, 0x88, 0xc0, 0x9a, 0xb7, 0xaa, 0x58, 0x60, 0xfe, 0x6c, 0xe8, 0xf3, 0x39, 0x6b, // |6e......X`.l..9k| +/*000005b0*/ 0x45, 0xe6, 0x34, 0xbc, 0x61, 0x68, 0xa2, 0x70, 0x16, 0x49, 0x8b, 0x7d, 0x78, 0x09, 0x99, 0x21, // |E.4.ah.p.I.}x..!| +/*000005c0*/ 0x5a, 0xea, 0xfd, 0xbc, 0x69, 0x23, 0xd5, 0x15, 0xd1, 0x5c, 0x32, 0x8b, 0xc0, 0x7b, 0xb2, 0x1e, // |Z...i#...\2..{..| +/*000005d0*/ 0x56, 0xf1, 0x6b, 0xd0, // |V.k.| +}; + +T_DECL(in_cksum_49479689a, "tests os_cpu_in_cksum with known problem packet in various random segmentation and memory alignment") +{ + uint16_t dsum = dumb_in_cksum(pkt49479689, sizeof(pkt49479689)); + T_ASSERT_EQ(ntohs(dsum), (uint16_t)0xa32b, "verifying dumb chksum"); + test_checksum(pkt49479689, sizeof(pkt49479689)); +} + +T_DECL(in_cksum_49479689b, "tests os_cpu_in_cksum with many random packets in various random segmentation and memory alignment") +{ + for (int i = 0; i < 100; i++) { + test_one_random_packet(4096); + } +} diff --git a/tests/jumbo_va_spaces_28530648.c b/tests/jumbo_va_spaces_28530648.c index 6f76a7a73..33f9faa24 100644 --- a/tests/jumbo_va_spaces_28530648.c +++ b/tests/jumbo_va_spaces_28530648.c @@ -7,6 +7,7 @@ #include #include +T_GLOBAL_META(T_META_RUN_CONCURRENTLY(true)); #define GB (1ULL * 1024 * 1024 * 1024) diff --git a/tests/kdebug.c b/tests/kdebug.c index cd9b6c776..d8a400c02 100644 --- a/tests/kdebug.c +++ b/tests/kdebug.c @@ -16,13 +16,18 @@ #include #include #include +#include + +#include "ktrace_helpers.h" T_GLOBAL_META( T_META_NAMESPACE("xnu.ktrace"), T_META_ASROOT(true)); -#define KDBG_TEST_MACROS 1 -#define KDBG_TEST_OLD_TIMES 2 +#define KDBG_TEST_MACROS 1 +#define KDBG_TEST_OLD_TIMES 2 +#define KDBG_TEST_FUTURE_TIMES 3 +#define KDBG_TEST_IOP_SYNC_FLUSH 4 static void assert_kdebug_test(unsigned int flavor) @@ -39,6 +44,8 @@ assert_kdebug_test(unsigned int flavor) T_DECL(kdebug_trace_syscall, "test that kdebug_trace(2) emits correct events") { + start_controlling_ktrace(); + ktrace_session_t s = ktrace_session_create(); T_QUIET; T_WITH_ERRNO; T_ASSERT_NOTNULL(s, "created session"); @@ -49,10 +56,16 @@ T_DECL(kdebug_trace_syscall, "test that kdebug_trace(2) emits correct events") events_seen++; T_PASS("saw traced event"); - T_EXPECT_EQ(tp->arg1, 1UL, "argument 1 of traced event is correct"); - T_EXPECT_EQ(tp->arg2, 2UL, "argument 2 of traced event is correct"); - T_EXPECT_EQ(tp->arg3, 3UL, "argument 3 of traced event is correct"); - T_EXPECT_EQ(tp->arg4, 4UL, "argument 4 of traced event is correct"); + if (ktrace_is_kernel_64_bit(s)) { + T_EXPECT_EQ(tp->arg1, UINT64_C(0xfeedfacefeedface), + "argument 1 of traced event is correct"); + } else { + T_EXPECT_EQ(tp->arg1, UINT64_C(0xfeedface), + "argument 1 of traced event is correct"); + } + T_EXPECT_EQ(tp->arg2, 2ULL, "argument 2 of traced event is correct"); + T_EXPECT_EQ(tp->arg3, 3ULL, "argument 3 of traced event is correct"); + T_EXPECT_EQ(tp->arg4, 4ULL, "argument 4 of traced event is correct"); ktrace_end(s, 1); }); @@ -66,7 +79,8 @@ T_DECL(kdebug_trace_syscall, "test that kdebug_trace(2) emits correct events") ktrace_filter_pid(s, getpid()); T_ASSERT_POSIX_ZERO(ktrace_start(s, dispatch_get_main_queue()), NULL); - T_ASSERT_POSIX_SUCCESS(kdebug_trace(TRACE_DEBUGID, 1, 2, 3, 4), NULL); + T_ASSERT_POSIX_SUCCESS(kdebug_trace(TRACE_DEBUGID, 0xfeedfacefeedface, 2, + 3, 4), NULL); ktrace_end(s, 0); dispatch_main(); @@ -78,6 +92,8 @@ T_DECL(kdebug_trace_syscall, "test that kdebug_trace(2) emits correct events") T_DECL(kdebug_signpost_syscall, "test that kdebug_signpost(2) emits correct events") { + start_controlling_ktrace(); + ktrace_session_t s = ktrace_session_create(); T_QUIET; T_WITH_ERRNO; T_ASSERT_NOTNULL(s, "created session"); @@ -92,10 +108,10 @@ T_DECL(kdebug_signpost_syscall, single_seen++; T_PASS("single signpost is traced"); - T_EXPECT_EQ(tp->arg1, 1UL, "argument 1 of single signpost is correct"); - T_EXPECT_EQ(tp->arg2, 2UL, "argument 2 of single signpost is correct"); - T_EXPECT_EQ(tp->arg3, 3UL, "argument 3 of single signpost is correct"); - T_EXPECT_EQ(tp->arg4, 4UL, "argument 4 of single signpost is correct"); + T_EXPECT_EQ(tp->arg1, 1ULL, "argument 1 of single signpost is correct"); + T_EXPECT_EQ(tp->arg2, 2ULL, "argument 2 of single signpost is correct"); + T_EXPECT_EQ(tp->arg3, 3ULL, "argument 3 of single signpost is correct"); + T_EXPECT_EQ(tp->arg4, 4ULL, "argument 4 of single signpost is correct"); }); ktrace_events_single_paired(s, @@ -104,18 +120,17 @@ T_DECL(kdebug_signpost_syscall, paired_seen++; T_PASS("paired signposts are traced"); - T_EXPECT_EQ(start->arg1, 5UL, "argument 1 of start signpost is correct"); - T_EXPECT_EQ(start->arg2, 6UL, "argument 2 of start signpost is correct"); - T_EXPECT_EQ(start->arg3, 7UL, "argument 3 of start signpost is correct"); - T_EXPECT_EQ(start->arg4, 8UL, "argument 4 of start signpost is correct"); + T_EXPECT_EQ(start->arg1, 5ULL, "argument 1 of start signpost is correct"); + T_EXPECT_EQ(start->arg2, 6ULL, "argument 2 of start signpost is correct"); + T_EXPECT_EQ(start->arg3, 7ULL, "argument 3 of start signpost is correct"); + T_EXPECT_EQ(start->arg4, 8ULL, "argument 4 of start signpost is correct"); - T_EXPECT_EQ(end->arg1, 9UL, "argument 1 of end signpost is correct"); - T_EXPECT_EQ(end->arg2, 10UL, "argument 2 of end signpost is correct"); - T_EXPECT_EQ(end->arg3, 11UL, "argument 3 of end signpost is correct"); - T_EXPECT_EQ(end->arg4, 12UL, "argument 4 of end signpost is correct"); + T_EXPECT_EQ(end->arg1, 9ULL, "argument 1 of end signpost is correct"); + T_EXPECT_EQ(end->arg2, 10ULL, "argument 2 of end signpost is correct"); + T_EXPECT_EQ(end->arg3, 11ULL, "argument 3 of end signpost is correct"); + T_EXPECT_EQ(end->arg4, 12ULL, "argument 4 of end signpost is correct"); - T_EXPECT_EQ(single_seen, 1, - "signposts are traced in the correct order"); + T_EXPECT_EQ(single_seen, 1, "signposts are traced in the correct order"); ktrace_end(s, 1); }); @@ -134,6 +149,8 @@ T_DECL(kdebug_signpost_syscall, T_ASSERT_POSIX_ZERO(ktrace_start(s, dispatch_get_main_queue()), "started tracing"); +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wdeprecated-declarations" T_EXPECT_POSIX_SUCCESS(kdebug_signpost(SIGNPOST_SINGLE_CODE, 1, 2, 3, 4), "emitted single signpost"); T_EXPECT_POSIX_SUCCESS( @@ -142,11 +159,70 @@ T_DECL(kdebug_signpost_syscall, T_EXPECT_POSIX_SUCCESS( kdebug_signpost_end(SIGNPOST_PAIRED_CODE, 9, 10, 11, 12), "emitted end signpost"); +#pragma clang diagnostic pop ktrace_end(s, 0); dispatch_main(); } +T_DECL(syscall_tracing, + "ensure that syscall arguments are traced propertly") +{ + ktrace_session_t s = ktrace_session_create(); + T_QUIET; T_WITH_ERRNO; T_ASSERT_NOTNULL(s, "created session"); + + __block bool seen = 0; + + ktrace_filter_pid(s, getpid()); + + static const int telemetry_syscall_no = 451; + static const uint64_t arg1 = 0xfeedfacefeedface; + + ktrace_events_single(s, BSDDBG_CODE(DBG_BSD_EXCP_SC, telemetry_syscall_no), + ^(struct trace_point *evt){ + if (KDBG_EXTRACT_CODE(evt->debugid) != telemetry_syscall_no || seen) { + return; + } + + seen = true; + if (ktrace_is_kernel_64_bit(s)) { + T_EXPECT_EQ(evt->arg1, arg1, + "argument 1 of syscall event is correct"); + } else { + T_EXPECT_EQ(evt->arg1, (uint64_t)(uint32_t)(arg1), + "argument 1 of syscall event is correct"); + } + + ktrace_end(s, 1); + }); + + ktrace_set_completion_handler(s, ^{ + T_ASSERT_TRUE(seen, + "should have seen a syscall event for kevent_id(2)"); + ktrace_session_destroy(s); + T_END; + }); + + int error = ktrace_start(s, dispatch_get_main_queue()); + T_ASSERT_POSIX_ZERO(error, "started tracing"); + + /* + * telemetry(2) has a 64-bit argument that will definitely be traced, and + * is unlikely to be used elsewhere by this process. + */ + extern int __telemetry(uint64_t cmd, uint64_t deadline, uint64_t interval, + uint64_t leeway, uint64_t arg4, uint64_t arg5); + (void)__telemetry(arg1, 0, 0, 0, 0, 0); + + dispatch_after(dispatch_time(DISPATCH_TIME_NOW, 5 * NSEC_PER_SEC), + dispatch_get_main_queue(), ^{ + T_LOG("ending test due to timeout"); + ktrace_end(s, 0); + }); + + dispatch_main(); +} + #pragma mark kdebug behaviors #define WRAPPING_EVENTS_COUNT (150000) @@ -161,6 +237,8 @@ T_DECL(wrapping, int wait_wrapping_secs = (WRAPPING_EVENTS_COUNT / TRACE_ITERATIONS) + 5; int current_secs = wait_wrapping_secs; + start_controlling_ktrace(); + /* use sysctls manually to bypass libktrace assumptions */ int mib[4] = { CTL_KERN, KERN_KDEBUG }; @@ -239,12 +317,14 @@ T_DECL(reject_old_events, { __block uint64_t event_horizon_ts; + start_controlling_ktrace(); + ktrace_session_t s = ktrace_session_create(); T_QUIET; T_WITH_ERRNO; T_ASSERT_NOTNULL(s, "created session"); __block int events = 0; - ktrace_events_range(s, KDBG_EVENTID(DBG_BSD, DBG_BSD_KDEBUG_TEST, 0), - KDBG_EVENTID(DBG_BSD + 1, 0, 0), ^(struct trace_point *tp) { + ktrace_events_single(s, KDBG_EVENTID(DBG_BSD, DBG_BSD_KDEBUG_TEST, 1), + ^(struct trace_point *tp) { events++; T_EXPECT_GT(tp->timestamp, event_horizon_ts, "events in trace should be from after tracing began"); @@ -279,6 +359,8 @@ T_DECL(ascending_time_order, __block unsigned int prev_cpu = 0; __block bool in_order = true; + start_controlling_ktrace(); + ktrace_session_t s = ktrace_session_create(); T_QUIET; T_WITH_ERRNO; T_ASSERT_NOTNULL(s, "created session"); @@ -451,6 +533,8 @@ T_DECL(dyld_events, "test that dyld registering libraries emits events") uint8_t *saw_unmapping = &(saw_events[1]); uint8_t *saw_shared_cache = &(saw_events[2]); + start_controlling_ktrace(); + ktrace_session_t s = ktrace_session_create(); T_QUIET; T_WITH_ERRNO; T_ASSERT_NOTNULL(s, "created session"); @@ -651,6 +735,8 @@ expect_kdbg_test_events(ktrace_session_t s, bool use_all_callback, T_DECL(kernel_events, "ensure kernel macros work") { + start_controlling_ktrace(); + ktrace_session_t s = ktrace_session_create(); T_QUIET; T_WITH_ERRNO; T_ASSERT_NOTNULL(s, "created session"); @@ -665,7 +751,7 @@ T_DECL(kernel_events, "ensure kernel macros work") * OS. */ unsigned int dev_exp; -#if TARGET_OS_EMBEDDED +#if (TARGET_OS_IPHONE && !TARGET_OS_SIMULATOR) dev_exp = is_development_kernel() ? EXP_KERNEL_EVENTS : 0U; #else dev_exp = EXP_KERNEL_EVENTS; @@ -685,6 +771,8 @@ T_DECL(kernel_events, "ensure kernel macros work") T_DECL(kernel_events_filtered, "ensure that the filtered kernel macros work") { + start_controlling_ktrace(); + ktrace_session_t s = ktrace_session_create(); T_QUIET; T_WITH_ERRNO; T_ASSERT_NOTNULL(s, "created session"); @@ -713,6 +801,8 @@ T_DECL(kernel_events_filtered, "ensure that the filtered kernel macros work") T_DECL(kernel_events_noprocfilt, "ensure that the no process filter kernel macros work") { + start_controlling_ktrace(); + ktrace_session_t s = ktrace_session_create(); T_QUIET; T_WITH_ERRNO; T_ASSERT_NOTNULL(s, "created session"); @@ -744,7 +834,7 @@ T_DECL(kernel_events_noprocfilt, static volatile bool continue_abuse = true; #define STRESS_DEBUGID (0xfeedfac0) -#define ABUSE_SECS (10) +#define ABUSE_SECS (2) #define TIMER_NS (100 * NSEC_PER_USEC) /* * Use the quantum as the gap threshold. @@ -767,6 +857,8 @@ kdebug_abuser_thread(void *ctx) T_DECL(stress, "emit events on all but one CPU with a small buffer", T_META_CHECK_LEAKS(false)) { + start_controlling_ktrace(); + T_SETUPBEGIN; ktrace_session_t s = ktrace_session_create(); T_WITH_ERRNO; T_QUIET; T_ASSERT_NOTNULL(s, "ktrace_session_create"); @@ -875,7 +967,7 @@ T_DECL(stress, "emit events on all but one CPU with a small buffer", prev_timestamp = tp->timestamp; }); ktrace_events_single(sread, TRACE_LOST_EVENTS, ^(struct trace_point *tp){ - T_LOG("lost: %llu on %d (%lu)", tp->timestamp, tp->cpuid, tp->arg1); + T_LOG("lost: %llu on %d (%llu)", tp->timestamp, tp->cpuid, tp->arg1); }); __block uint64_t last_write = 0; @@ -891,7 +983,7 @@ T_DECL(stress, "emit events on all but one CPU with a small buffer", end->timestamp - start->timestamp, &dur_ns); T_QUIET; T_ASSERT_POSIX_ZERO(converror, "convert timestamp to ns"); - T_LOG("write: %llu (+%gs): %gus on %d: %lu events", start->timestamp, + T_LOG("write: %llu (+%gs): %gus on %d: %llu events", start->timestamp, (double)delta_ns / 1e9, (double)dur_ns / 1e3, end->cpuid, end->arg1); last_write = end->timestamp; }); @@ -974,6 +1066,8 @@ T_DECL(stress, "emit events on all but one CPU with a small buffer", T_DECL(round_trips, "test sustained tracing with multiple round-trips through the kernel") { + start_controlling_ktrace(); + ktrace_session_t s = ktrace_session_create(); T_QUIET; T_WITH_ERRNO; T_ASSERT_NOTNULL(s, "created session"); @@ -1037,6 +1131,8 @@ T_DECL(round_trips, */ T_DECL(event_coverage, "ensure events appear up to the end of tracing") { + start_controlling_ktrace(); + ktrace_session_t s = ktrace_session_create(); T_QUIET; T_WITH_ERRNO; T_ASSERT_NOTNULL(s, "created session"); @@ -1131,6 +1227,8 @@ set_nevents(unsigned int nevents) T_DECL(set_buffer_size, "ensure large buffer sizes can be set") { + start_controlling_ktrace(); + uint64_t memsize = 0; T_QUIET; T_ASSERT_POSIX_SUCCESS(sysctlbyname("hw.memsize", &memsize, &(size_t){ sizeof(memsize) }, NULL, 0), "get memory size"); @@ -1158,3 +1256,205 @@ T_DECL(set_buffer_size, "ensure large buffer sizes can be set") "%u events in kernel when %u requested", actualevents, i); } } + +static void * +donothing(__unused void *arg) +{ + return NULL; +} + +T_DECL(long_names, "ensure long command names are reported") +{ + start_controlling_ktrace(); + + char longname[] = "thisisaverylongprocessname!"; + char *longname_ptr = longname; + static_assert(sizeof(longname) > 16, + "the name should be longer than MAXCOMLEN"); + + int ret = sysctlbyname("kern.procname", NULL, NULL, longname, + sizeof(longname)); + T_ASSERT_POSIX_SUCCESS(ret, + "use sysctl kern.procname to lengthen the name"); + + ktrace_session_t ktsess = ktrace_session_create(); + + /* + * 32-bit kernels can only trace 16 bytes of the string in their event + * arguments. + */ + if (!ktrace_is_kernel_64_bit(ktsess)) { + longname[16] = '\0'; + } + + ktrace_filter_pid(ktsess, getpid()); + + __block bool saw_newthread = false; + ktrace_events_single(ktsess, TRACE_STRING_NEWTHREAD, + ^(struct trace_point *tp) { + if (ktrace_get_pid_for_thread(ktsess, tp->threadid) == + getpid()) { + saw_newthread = true; + + char argname[32] = {}; + strncat(argname, (char *)&tp->arg1, sizeof(tp->arg1)); + strncat(argname, (char *)&tp->arg2, sizeof(tp->arg2)); + strncat(argname, (char *)&tp->arg3, sizeof(tp->arg3)); + strncat(argname, (char *)&tp->arg4, sizeof(tp->arg4)); + + T_EXPECT_EQ_STR((char *)argname, longname_ptr, + "process name of new thread should be long"); + + ktrace_end(ktsess, 1); + } + }); + + ktrace_set_completion_handler(ktsess, ^{ + ktrace_session_destroy(ktsess); + T_EXPECT_TRUE(saw_newthread, + "should have seen the new thread"); + T_END; + }); + + int error = ktrace_start(ktsess, dispatch_get_main_queue()); + T_ASSERT_POSIX_ZERO(error, "started tracing"); + + pthread_t thread = NULL; + error = pthread_create(&thread, NULL, donothing, NULL); + T_ASSERT_POSIX_ZERO(error, "create new thread"); + + dispatch_after(dispatch_time(DISPATCH_TIME_NOW, 5 * NSEC_PER_SEC), + dispatch_get_main_queue(), ^{ + ktrace_end(ktsess, 0); + }); + + error = pthread_join(thread, NULL); + T_ASSERT_POSIX_ZERO(error, "join to thread"); + + dispatch_main(); +} + +T_DECL(continuous_time, "make sure continuous time status can be queried", + T_META_RUN_CONCURRENTLY(true)) +{ + bool cont_time = kdebug_using_continuous_time(); + T_ASSERT_FALSE(cont_time, "should not be using continuous time yet"); +} + +static const uint32_t frame_eventid = KDBG_EVENTID(DBG_BSD, + DBG_BSD_KDEBUG_TEST, 1); + +static ktrace_session_t +future_events_session(void) +{ + ktrace_session_t ktsess = ktrace_session_create(); + T_QUIET; T_WITH_ERRNO; T_ASSERT_NOTNULL(ktsess, "failed to create session"); + + ktrace_events_single(ktsess, KDBG_EVENTID(DBG_BSD, DBG_BSD_KDEBUG_TEST, 0), + ^(struct trace_point *tp __unused) { + T_FAIL("saw future test event from IOP"); + }); + ktrace_events_single(ktsess, frame_eventid, ^(struct trace_point *tp) { + if (tp->debugid & DBG_FUNC_START) { + T_LOG("saw start event"); + } else { + T_LOG("saw event traced after trying to trace future event, ending"); + ktrace_end(ktsess, 1); + } + }); + + ktrace_set_collection_interval(ktsess, 100); + return ktsess; +} + +T_DECL(future_iop_events, + "make sure IOPs cannot trace events in the future while live tracing") +{ + start_controlling_ktrace(); + ktrace_session_t ktsess = future_events_session(); + ktrace_set_completion_handler(ktsess, ^{ + ktrace_session_destroy(ktsess); + T_END; + }); + + T_ASSERT_POSIX_ZERO(ktrace_start(ktsess, dispatch_get_main_queue()), + "start tracing"); + kdebug_trace(frame_eventid | DBG_FUNC_START, 0, 0, 0, 0); + assert_kdebug_test(KDBG_TEST_FUTURE_TIMES); + kdebug_trace(frame_eventid | DBG_FUNC_END, 0, 0, 0, 0); + + dispatch_after(dispatch_time(DISPATCH_TIME_NOW, 5 * NSEC_PER_SEC), + dispatch_get_main_queue(), ^{ + T_FAIL("ending tracing after timeout"); + ktrace_end(ktsess, 0); + }); + + dispatch_main(); +} + +T_DECL(future_iop_events_disabled, + "make sure IOPs cannot trace events in the future after disabling tracing") +{ + start_controlling_ktrace(); + ktrace_session_t ktsess = future_events_session(); + T_ASSERT_POSIX_ZERO(ktrace_configure(ktsess), "configure tracing"); + + kdebug_trace(frame_eventid | DBG_FUNC_START, 0, 0, 0, 0); + assert_kdebug_test(KDBG_TEST_FUTURE_TIMES); + kdebug_trace(frame_eventid | DBG_FUNC_END, 0, 0, 0, 0); + + T_ASSERT_POSIX_ZERO(ktrace_disable_configured(ktsess), + "disable tracing"); + ktrace_session_destroy(ktsess); + + ktsess = future_events_session(); + T_QUIET; + T_ASSERT_POSIX_ZERO(ktrace_set_use_existing(ktsess), "use existing trace"); + ktrace_set_completion_handler(ktsess, ^{ + ktrace_session_destroy(ktsess); + T_END; + }); + + T_ASSERT_POSIX_ZERO(ktrace_start(ktsess, dispatch_get_main_queue()), + "start tracing existing session"); + + dispatch_main(); +} + +T_DECL(iop_events_disable, + "make sure IOP events are flushed before disabling trace") +{ + start_controlling_ktrace(); + ktrace_session_t ktsess = future_events_session(); + + assert_kdebug_test(KDBG_TEST_IOP_SYNC_FLUSH); + T_ASSERT_POSIX_ZERO(ktrace_configure(ktsess), "configure tracing"); + + kdebug_trace(frame_eventid | DBG_FUNC_START, 0, 0, 0, 0); + + T_ASSERT_POSIX_ZERO(ktrace_disable_configured(ktsess), + "disable tracing"); + ktrace_session_destroy(ktsess); + + ktsess = ktrace_session_create(); + T_QUIET; T_WITH_ERRNO; + T_ASSERT_NOTNULL(ktsess, "create session"); + + ktrace_events_single(ktsess, + KDBG_EVENTID(DBG_BSD, DBG_BSD_KDEBUG_TEST, 0xff), + ^(struct trace_point *tp __unused) { + T_PASS("saw IOP event from sync flush"); + }); + + T_QUIET; + T_ASSERT_POSIX_ZERO(ktrace_set_use_existing(ktsess), "use existing trace"); + ktrace_set_completion_handler(ktsess, ^{ + ktrace_session_destroy(ktsess); + T_END; + }); + + T_ASSERT_POSIX_ZERO(ktrace_start(ktsess, dispatch_get_main_queue()), + "start tracing existing session"); + + dispatch_main(); +} diff --git a/tests/kernel_mtx_perf.c b/tests/kernel_mtx_perf.c index 76af0603a..39a73a070 100644 --- a/tests/kernel_mtx_perf.c +++ b/tests/kernel_mtx_perf.c @@ -21,6 +21,10 @@ T_GLOBAL_META(T_META_NAMESPACE("xnu.kernel_mtx_perf_test")); #define ITER 100000 #define TEST_MTX_MAX_STATS 8 +#define FULL_CONTENDED 0 +#define HALF_CONTENDED 1 +#define MAX_CONDENDED 2 + #define TEST_MTX_LOCK_STATS 0 #define TEST_MTX_UNLOCK_MTX_STATS 6 @@ -28,7 +32,8 @@ T_GLOBAL_META(T_META_NAMESPACE("xnu.kernel_mtx_perf_test")); static void test_from_kernel_lock_unlock_contended(void) { - int i, ret, name_size; + int i, ret; + unsigned long name_size; uint64_t avg, run, tot; size_t size; char iter[35]; @@ -37,7 +42,7 @@ test_from_kernel_lock_unlock_contended(void) T_LOG("Testing locking/unlocking mutex from kernel with contention.\n"); T_LOG("Requesting test with %d iterations\n", ITER); - size = 1000; + size = 2000; buff = calloc(size, sizeof(char)); T_QUIET; T_ASSERT_NOTNULL(buff, "Allocating buffer fo sysctl"); @@ -45,85 +50,95 @@ test_from_kernel_lock_unlock_contended(void) ret = sysctlbyname("kern.test_mtx_contended", buff, &size, iter, sizeof(iter)); T_ASSERT_POSIX_SUCCESS(ret, "sysctlbyname kern.test_mtx_contended"); - T_LOG("%s stats:\n%s\n", __func__, buff); + T_LOG("\n%s stats :\n%s\n", __func__, buff); - /* first line is "STATS INNER LOOP" */ buff_p = buff; - while (*buff_p != '\n') { - buff_p++; - } - buff_p++; - - /* - * Sequence of statistic lines like - * { samples 100000, tot 3586175 ns, avg 35 ns, max 3997 ns, min 33 ns } TEST_MTX_LOCK_STATS - * for all TEST_MTX_MAX_STATS statistics - */ - for (i = 0; i < TEST_MTX_MAX_STATS; i++) { - avg_p = strstr(buff_p, "avg "); - - /* contended test records statistics only for lock/unlock for now */ - if (i == TEST_MTX_LOCK_STATS || i == TEST_MTX_UNLOCK_MTX_STATS) { - T_QUIET; T_ASSERT_NOTNULL(avg_p, "contended %i average not found", i); - sscanf(avg_p, "avg %llu", &avg); - - name = strstr(buff_p, "TEST_MTX_"); - end_name = strstr(buff_p, "_STATS"); - name_size = end_name - name - strlen("TEST_MTX_") + 1; - - char name_string[40]; - char avg_name_string[50]; - char *pre_string = "contended "; - snprintf(name_string, name_size + strlen(pre_string), "%s%s", pre_string, &name[strlen("TEST_MTX_")]); - pre_string = "avg contended "; - snprintf(avg_name_string, name_size + strlen(pre_string), "%s%s", pre_string, &name[strlen("TEST_MTX_")]); - T_PERF(name_string, avg, "ns", avg_name_string); + int t; + for (t = 0; t < MAX_CONDENDED; t++) { + char *type; + if (t == FULL_CONTENDED) { + type = "FULL_CONTENDED "; + } else { + type = "HALF_CONTENDED "; } - buff_p = avg_p; + /* first line is "STATS INNER LOOP" */ while (*buff_p != '\n') { buff_p++; } buff_p++; - } - while (*buff_p != '\n') { + /* + * Sequence of statistic lines like + * { samples 100000, tot 3586175 ns, avg 35 ns, max 3997 ns, min 33 ns } TEST_MTX_LOCK_STATS + * for all TEST_MTX_MAX_STATS statistics + */ + for (i = 0; i < TEST_MTX_MAX_STATS; i++) { + avg_p = strstr(buff_p, "avg "); + + /* contended test records statistics only for lock/unlock for now */ + if (i == TEST_MTX_LOCK_STATS || i == TEST_MTX_UNLOCK_MTX_STATS) { + T_QUIET; T_ASSERT_NOTNULL(avg_p, "contended %i average not found", i); + sscanf(avg_p, "avg %llu", &avg); + + name = strstr(buff_p, "TEST_MTX_"); + end_name = strstr(buff_p, "_STATS"); + name_size = (unsigned long) end_name - (unsigned long) name - strlen("TEST_MTX_") + 1; + + char name_string[40]; + char avg_name_string[50]; + char *pre_string = "contended "; + snprintf(name_string, name_size + strlen(pre_string) + strlen(type), "%s%s%s", pre_string, type, &name[strlen("TEST_MTX_")]); + pre_string = "avg contended "; + snprintf(avg_name_string, name_size + strlen(pre_string) + strlen(type), "%s%s%s", pre_string, type, &name[strlen("TEST_MTX_")]); + T_PERF(name_string, avg, "ns", avg_name_string); + } + + buff_p = avg_p; + while (*buff_p != '\n') { + buff_p++; + } + buff_p++; + } + + while (*buff_p != '\n') { + buff_p++; + } buff_p++; - } - buff_p++; - /* next line is "STATS OUTER LOOP" */ - while (*buff_p != '\n') { + /* next line is "STATS OUTER LOOP" */ + while (*buff_p != '\n') { + buff_p++; + } buff_p++; - } - buff_p++; - /* contended test records statistics only for lock/unlock for now */ - avg_p = strstr(buff_p, "run time "); - T_QUIET; T_ASSERT_NOTNULL(avg_p, "contended %d loop run time not found", 0); - sscanf(avg_p, "run time %llu", &run); + /* contended test records statistics only for lock/unlock for now */ + avg_p = strstr(buff_p, "run time "); + T_QUIET; T_ASSERT_NOTNULL(avg_p, "contended %d loop run time not found", 0); + sscanf(avg_p, "run time %llu", &run); - avg_p = strstr(buff_p, "total time "); - T_QUIET; T_ASSERT_NOTNULL(avg_p, "uncontended %d loop total time not found", 0); - sscanf(avg_p, "total time %llu", &tot); + avg_p = strstr(buff_p, "total time "); + T_QUIET; T_ASSERT_NOTNULL(avg_p, "uncontended %d loop total time not found", 0); + sscanf(avg_p, "total time %llu", &tot); - if (run < tot) { - avg = run; - } else { - avg = tot; - } + if (run < tot) { + avg = run; + } else { + avg = tot; + } - name = strstr(buff_p, "TEST_MTX_"); - end_name = strstr(buff_p, "_STATS"); - name_size = end_name - name - strlen("TEST_MTX_") + 1; + name = strstr(buff_p, "TEST_MTX_"); + end_name = strstr(buff_p, "_STATS"); + name_size = (unsigned long) end_name - (unsigned long) name - strlen("TEST_MTX_") + 1; - char name_string[50]; - char avg_name_string[60]; - char *pre_string = "contended loop "; - snprintf(name_string, name_size + strlen(pre_string), "%s%s", pre_string, &name[strlen("TEST_MTX_")]); - pre_string = "avg time contended loop "; - snprintf(avg_name_string, name_size + strlen(pre_string), "%s%s", pre_string, &name[strlen("TEST_MTX_")]); - T_PERF(name_string, avg / ITER, "ns", avg_name_string); + char name_string[50]; + char avg_name_string[60]; + char *pre_string = "contended loop "; + snprintf(name_string, name_size + strlen(pre_string) + strlen(type), "%s%s%s", pre_string, type, &name[strlen("TEST_MTX_")]); + pre_string = "avg time contended loop "; + snprintf(avg_name_string, name_size + strlen(pre_string) + strlen(type), "%s%s%s", pre_string, type, &name[strlen("TEST_MTX_")]); + T_PERF(name_string, avg / ITER, "ns", avg_name_string); + } free(buff); } @@ -131,7 +146,8 @@ test_from_kernel_lock_unlock_contended(void) static void test_from_kernel_lock_unlock_uncontended(void) { - int i, ret, name_size; + int i, ret; + unsigned long name_size; uint64_t avg, run, tot; size_t size; char iter[35]; @@ -169,7 +185,7 @@ test_from_kernel_lock_unlock_uncontended(void) name = strstr(buff_p, "TEST_MTX_"); end_name = strstr(buff_p, "_STATS"); - name_size = end_name - name - strlen("TEST_MTX_") + 1; + name_size = (unsigned long) end_name - (unsigned long) name - strlen("TEST_MTX_") + 1; char name_string[40]; char avg_name_string[50]; @@ -219,7 +235,7 @@ test_from_kernel_lock_unlock_uncontended(void) name = strstr(buff_p, "TEST_MTX_"); end_name = strstr(buff_p, "_STATS"); - name_size = end_name - name - strlen("TEST_MTX_") + 1; + name_size = (unsigned long) end_name - (unsigned long) name - strlen("TEST_MTX_") + 1; char name_string[50]; char avg_name_string[60]; @@ -238,78 +254,175 @@ test_from_kernel_lock_unlock_uncontended(void) free(buff); } -extern char **environ; +#if !(TARGET_OS_IPHONE && !TARGET_OS_SIMULATOR) +static bool +get_freq(float val, char scale, int *int_val) +{ + switch (scale) { + case 'M': + case 'm': + *int_val = (int) val; + break; + case 'G': + case 'g': + *int_val = (int) (val * 1000); + break; + default: + return FALSE; + } + return TRUE; +} + +static bool +parse_freq(char* buff, int buff_size, const char* string_start, int string_start_size, char* to_parse) +{ + char* start; + float val; + char scale; + int int_val; + + start = strstr(to_parse, string_start); + if (start == NULL) { + return FALSE; + } + + if (strstr(start, "Hz") != NULL) { + sscanf(start + string_start_size, "%f%cHz", &val, &scale); + } else { + if (strstr(start, "hz") != NULL) { + sscanf(start + string_start_size, "%f%chz", &val, &scale); + } else { + return FALSE; + } + } + + if (!get_freq(val, scale, &int_val)) { + return FALSE; + } + + snprintf(buff, buff_size, "%d", int_val); + + return TRUE; +} + +static bool freq_fixed = FALSE; +static char str_val_min[10]; +static char str_val_max[10]; + +static bool +get_previous_freq_values(void) +{ + FILE *fp; + char out_xcpm[1035]; + bool min_scan = FALSE; + bool max_scan = FALSE; + + memset(str_val_min, 0, sizeof(str_val_min)); + memset(str_val_max, 0, sizeof(str_val_max)); + + fp = popen("/usr/local/bin/xcpm limits", "r"); + if (fp == NULL) { + return FALSE; + } + + while (fgets(out_xcpm, sizeof(out_xcpm) - 1, fp) != NULL && (!max_scan || !min_scan)) { + if (!max_scan) { + max_scan = parse_freq(str_val_max, sizeof(str_val_max), "Max frequency:", sizeof("Max frequency:"), out_xcpm); + } + if (!min_scan) { + min_scan = parse_freq(str_val_min, sizeof(str_val_min), "Min frequency:", sizeof("Min frequency:"), out_xcpm); + } + } + + pclose(fp); + + if (!max_scan || !min_scan) { + return FALSE; + } + + return TRUE; +} +#endif + static void fix_cpu_frequency(void) { -#if CONFIG_EMBEDDED +#if (TARGET_OS_IPHONE && !TARGET_OS_SIMULATOR) int spawn_ret, pid; char *const clpcctrl_args[] = {"/usr/local/bin/clpcctrl", "-f", "5000", NULL}; T_LOG("Setting cpu frequency to %d\n", 5000); - spawn_ret = posix_spawn(&pid, clpcctrl_args[0], NULL, NULL, clpcctrl_args, environ); - waitpid(pid, &spawn_ret, 0); + spawn_ret = posix_spawn(&pid, clpcctrl_args[0], NULL, NULL, clpcctrl_args, *_NSGetEnviron()); + T_QUIET; T_ASSERT_POSIX_ZERO(spawn_ret, "posix_spawn"); + T_QUIET; T_ASSERT_EQ(waitpid(pid, &spawn_ret, 0), pid, "waitpid failed"); + T_QUIET; T_ASSERT_EQ(spawn_ret, 0, " clpcctrl failed"); -#else /*CONFIG_EMBEDDED*/ +#else /*(TARGET_OS_IPHONE && !TARGET_OS_SIMULATOR)*/ int spawn_ret, pid; - int ret, nom_freq; + int ret; size_t len; - float val; - char scale; - char *buffer, *cpu_freq; + char *buffer; char str_val[10]; + if (!get_previous_freq_values()) { + T_LOG("Impossible to parse freq values from xcpm"); + freq_fixed = FALSE; + return; + } + ret = sysctlbyname("machdep.cpu.brand_string", NULL, &len, NULL, 0); T_QUIET; T_ASSERT_POSIX_SUCCESS(ret, "sysctlbyname machdep.cpu.brand_string"); - buffer = malloc(len + 2); + buffer = calloc(len + 2, sizeof(char)); ret = sysctlbyname("machdep.cpu.brand_string", buffer, &len, NULL, 0); T_QUIET; T_ASSERT_POSIX_SUCCESS(ret, "sysctlbyname machdep.cpu.brand_string"); buffer[len + 1] = '\0'; - cpu_freq = strstr(buffer, "CPU @ "); - if (cpu_freq == NULL) { - T_LOG("Could not fix frequency, %s field not present\n", "CPU @ "); - goto out; - } - - if (strstr(cpu_freq, "Hz") != NULL) { - sscanf(cpu_freq, "CPU @ %f%cHz", &val, &scale); - } else { - if (strstr(cpu_freq, "hz") != NULL) { - sscanf(cpu_freq, "CPU @ %f%chz", &val, &scale); - } else { - T_LOG("Could not fix frequency, %s field not present\n", "Hz"); - goto out; - } + memset(str_val, 0, sizeof(str_val)); + if (!parse_freq(str_val, sizeof(str_val), "CPU @", sizeof("CPU @"), buffer)) { + T_LOG("Impossible to parse freq values from machdep.cpu.brand_string (string was %s)", buffer); + freq_fixed = FALSE; + return; } - switch (scale) { - case 'M': - case 'm': - nom_freq = (int) val; - break; - case 'G': - case 'g': - nom_freq = (int) (val * 1000); - break; - default: - T_LOG("Could not fix frequency, scale field is %c\n", scale); - goto out; - } - - snprintf(str_val, 10, "%d", nom_freq); - T_LOG("Setting min and max cpu frequency to %d (%s)\n", nom_freq, str_val); + T_LOG("Previous min and max cpu frequency (%s) (%s)\n", str_val_min, str_val_max); + T_LOG("Setting min and max cpu frequency to (%s)\n", str_val); char *xcpm_args[] = {"/usr/local/bin/xcpm", "limits", str_val, str_val, NULL}; - spawn_ret = posix_spawn(&pid, xcpm_args[0], NULL, NULL, xcpm_args, environ); - waitpid(pid, &spawn_ret, 0); + spawn_ret = posix_spawn(&pid, xcpm_args[0], NULL, NULL, xcpm_args, *_NSGetEnviron()); + T_QUIET; T_ASSERT_POSIX_ZERO(spawn_ret, "posix_spawn"); + T_QUIET; T_ASSERT_EQ(waitpid(pid, &spawn_ret, 0), pid, "waitpid failed"); + T_QUIET; T_ASSERT_EQ(spawn_ret, 0, "xcpm limits failed"); + + freq_fixed = TRUE; -out: free(buffer); return; -#endif /*CONFIG_EMBEDDED*/ +#endif /*(TARGET_OS_IPHONE && !TARGET_OS_SIMULATOR)*/ +} + +static void +cleanup_cpu_freq(void) +{ +#if (TARGET_OS_IPHONE && !TARGET_OS_SIMULATOR) + int spawn_ret, pid; + char *const clpcctrl_args[] = {"/usr/local/bin/clpcctrl", "-d", NULL}; + spawn_ret = posix_spawn(&pid, clpcctrl_args[0], NULL, NULL, clpcctrl_args, *_NSGetEnviron()); + T_QUIET; T_ASSERT_POSIX_ZERO(spawn_ret, "posix_spawn"); + T_QUIET; T_ASSERT_EQ(waitpid(pid, &spawn_ret, 0), pid, "waitpid failed"); + T_QUIET; T_ASSERT_EQ(spawn_ret, 0, "clpcctrl failed"); + +#else + if (freq_fixed) { + int spawn_ret, pid; + char *xcpm_args[] = {"/usr/local/bin/xcpm", "limits", str_val_min, str_val_max, NULL}; + spawn_ret = posix_spawn(&pid, xcpm_args[0], NULL, NULL, xcpm_args, *_NSGetEnviron()); + T_QUIET; T_ASSERT_POSIX_ZERO(spawn_ret, "posix_spawn"); + T_QUIET; T_ASSERT_EQ(waitpid(pid, &spawn_ret, 0), pid, "waitpid failed"); + T_QUIET; T_ASSERT_EQ(spawn_ret, 0, "xcpm limits failed"); + } +#endif } T_DECL(kernel_mtx_perf_test, @@ -318,6 +431,8 @@ T_DECL(kernel_mtx_perf_test, { fix_cpu_frequency(); + T_ATEND(cleanup_cpu_freq); + test_from_kernel_lock_unlock_uncontended(); test_from_kernel_lock_unlock_contended(); } diff --git a/tests/kernel_uuid_match.c b/tests/kernel_uuid_match.c index f5f32d45b..29099bae6 100644 --- a/tests/kernel_uuid_match.c +++ b/tests/kernel_uuid_match.c @@ -16,9 +16,11 @@ #include #include +T_GLOBAL_META(T_META_RUN_CONCURRENTLY(true)); + #define MAX_LEN 1024 -#if TARGET_OS_MAC && !TARGET_OS_EMBEDDED +#if TARGET_OS_MAC && !(TARGET_OS_IPHONE && !TARGET_OS_SIMULATOR) //running on macOS #define KERNEL_SEARCH_DIR "/System/Library/Kernels/*" #else diff --git a/tests/kevent_pty.c b/tests/kevent_pty.c index 734de7902..d84d65fa5 100644 --- a/tests/kevent_pty.c +++ b/tests/kevent_pty.c @@ -17,7 +17,8 @@ T_GLOBAL_META( T_META_NAMESPACE("xnu.kevent"), - T_META_CHECK_LEAKS(false)); + T_META_CHECK_LEAKS(false), + T_META_RUN_CONCURRENTLY(true)); #define TIMEOUT_SECS 10 @@ -147,7 +148,8 @@ reader_thread(void *arg) if (errno == EINTR) { continue; } else if (errno == EBADF) { - T_LOG("reader got an error (%s), shutting down", strerror(errno)); + T_LOG("reader got an error (%s), shutting down", + strerror(errno)); return NULL; } else { T_ASSERT_POSIX_SUCCESS(rdsize, "read on PTY"); @@ -178,7 +180,8 @@ writer_thread(void *arg) if (errno == EINTR) { continue; } else { - T_LOG("writer got an error (%s), shutting down", strerror(errno)); + T_LOG("writer got an error (%s), shutting down", + strerror(errno)); return NULL; } } @@ -192,16 +195,6 @@ writer_thread(void *arg) static int attach_master, attach_slave; static pthread_t reader, writer; -static void -join_threads(void) -{ - close(attach_slave); - close(attach_master); - writing = false; - pthread_join(reader, NULL); - pthread_join(writer, NULL); -} - static void redispatch(dispatch_group_t grp, dispatch_source_type_t type, int fd) { @@ -246,7 +239,6 @@ T_DECL(attach_while_tty_wakeups, (void *)(uintptr_t)attach_master), NULL); T_ASSERT_POSIX_ZERO(pthread_create(&writer, NULL, writer_thread, (void *)(uintptr_t)attach_slave), NULL); - T_ATEND(join_threads); T_SETUPEND; redispatch(grp, DISPATCH_SOURCE_TYPE_READ, attach_master); diff --git a/tests/kevent_qos.c b/tests/kevent_qos.c index 9bbb7d62e..403917306 100644 --- a/tests/kevent_qos.c +++ b/tests/kevent_qos.c @@ -58,7 +58,7 @@ struct test_msg { #pragma mark pthread callbacks -static void +static pthread_t thread_create_at_qos(qos_class_t qos, void * (*function)(void *)); static void send(mach_port_t send_port, mach_port_t reply_port, mach_port_t msg_port, mach_msg_priority_t qos, mach_msg_option_t options); @@ -207,6 +207,62 @@ workloop_cb_test_sync_send_and_enable(uint64_t *workloop_id, struct kevent_qos_s T_END; } +/* + * WL handler which checks the overridden Qos and then handoffs the IPC, + * enables the knote and checks for the Qos again that it hasn't dropped the sync ipc override. + */ +static void +workloop_cb_test_sync_send_and_enable_handoff(uint64_t *workloop_id, struct kevent_qos_s **eventslist, int *events) +{ + unsigned override_priority; + int error; + + T_LOG("Workloop handler workloop_cb_test_sync_send_and_enable_handoff called"); + + EXPECT_TEST_MSG(*eventslist); + + if (geteuid() != 0) { + T_SKIP("kevent_qos test requires root privileges to run."); + } + + /* The effective Qos should be the one expected after override */ + EXPECT_QOS_EQ(g_expected_qos[ENV_QOS_AFTER_OVERRIDE], + "dispatch_source event handler QoS should be %s", + g_expected_qos_name[ENV_QOS_AFTER_OVERRIDE]); + + /* Snapshot the current override priority */ + override_priority = get_user_promotion_basepri(); + + struct kevent_qos_s *kev = *eventslist; + mach_msg_header_t *hdr = (mach_msg_header_t *)kev->ext[0]; + + /* handoff the IPC */ + struct kevent_qos_s handoff_kev = { + .filter = EVFILT_WORKLOOP, + .ident = hdr->msgh_remote_port, + .flags = EV_ADD | EV_DISABLE, + .fflags = 0x80000000, + }; + + error = kevent_id(*workloop_id, &handoff_kev, 1, &handoff_kev, 1, NULL, + NULL, KEVENT_FLAG_WORKLOOP | KEVENT_FLAG_ERROR_EVENTS | KEVENT_FLAG_DYNAMIC_KQ_MUST_EXIST); + T_QUIET; T_ASSERT_POSIX_SUCCESS(error, "kevent_id"); + T_ASSERT_EQ(0, error, "Handed off the sync IPC"); + + /* Enable the knote */ + enable_kevent(workloop_id, kev->ident); + + /* + * Check if the override has not been dropped. + */ + EXPECT_QOS_EQ(g_expected_qos[ENV_QOS_AFTER_OVERRIDE], + "dispatch_source event handler QoS should still be %s", + g_expected_qos_name[ENV_QOS_AFTER_OVERRIDE]); + + *events = 0; + T_END; +} + /* * WL handler receives the first message and checks sync ipc override, then enables the knote * and receives 2nd message and checks it sync ipc override. @@ -346,7 +402,7 @@ populate_kevent(struct kevent_qos_s *kev, unsigned long long port) kev->ident = port; kev->filter = EVFILT_MACHPORT; kev->flags = EV_ADD | EV_ENABLE | EV_UDATA_SPECIFIC | EV_DISPATCH | EV_VANISHED; - kev->fflags = (MACH_RCV_MSG | MACH_RCV_LARGE | MACH_RCV_LARGE_IDENTITY | + kev->fflags = (MACH_RCV_MSG | MACH_RCV_VOUCHER | MACH_RCV_LARGE | MACH_RCV_LARGE_IDENTITY | MACH_RCV_TRAILER_ELEMENTS(MACH_RCV_TRAILER_CTX) | MACH_RCV_TRAILER_TYPE(MACH_MSG_TRAILER_FORMAT_0)); kev->data = 1; @@ -355,15 +411,15 @@ populate_kevent(struct kevent_qos_s *kev, unsigned long long port) static void enable_kevent(uint64_t *workloop_id, unsigned long long port) { - kern_return_t kr; struct kevent_qos_s kev; + int error; populate_kevent(&kev, port); struct kevent_qos_s kev_err[] = {{ 0 }}; - kr = kevent_id(*workloop_id, &kev, 1, kev_err, 1, NULL, + error = kevent_id(*workloop_id, &kev, 1, kev_err, 1, NULL, NULL, KEVENT_FLAG_WORKLOOP | KEVENT_FLAG_ERROR_EVENTS | KEVENT_FLAG_DYNAMIC_KQ_MUST_EXIST); - T_QUIET; T_ASSERT_POSIX_SUCCESS(kr, "kevent_id"); + T_QUIET; T_ASSERT_POSIX_SUCCESS(error, "kevent_id"); } /* @@ -847,16 +903,14 @@ send( send_msg.qos = (uint32_t)_pthread_qos_class_encode(qc, relpri, 0); } - ret = mach_msg(&(send_msg.header), - MACH_SEND_MSG | - MACH_SEND_TIMEOUT | - MACH_SEND_OVERRIDE | - ((reply_port ? MACH_SEND_SYNC_OVERRIDE : 0) | options), - send_msg.header.msgh_size, - 0, - MACH_PORT_NULL, - 10000, - 0); + mach_msg_option_t send_opts = options; + if (reply_port) { + send_opts |= MACH_SEND_SYNC_OVERRIDE; + } + send_opts |= MACH_SEND_MSG | MACH_SEND_TIMEOUT | MACH_SEND_OVERRIDE; + + ret = mach_msg(&send_msg.header, send_opts, send_msg.header.msgh_size, + 0, MACH_PORT_NULL, 10000, qos); T_QUIET; T_ASSERT_MACH_SUCCESS(ret, "client mach_msg"); } @@ -957,7 +1011,7 @@ qos_client_send_to_intransit(void *arg __unused) (uint32_t)_pthread_qos_class_encode(g_expected_qos[ENV_QOS_BEFORE_OVERRIDE], 0, 0), 0); } - T_LOG("Sent 5 msgs, now trying to send sync ipc messgae, which will block with a timeout\n"); + T_LOG("Sent 5 msgs, now trying to send sync ipc message, which will block with a timeout\n"); /* Send the message to the in-transit port, it should block and override the rcv's workloop */ send(msg_port, special_reply_port, MACH_PORT_NULL, (uint32_t)_pthread_qos_class_encode(g_expected_qos[ENV_QOS_AFTER_OVERRIDE], 0, 0), 0); @@ -974,7 +1028,7 @@ T_HELPER_DECL(qos_client_send_to_intransit_with_thr_pri, sleep(HELPER_TIMEOUT_SECS); } -static void +static pthread_t thread_create_at_qos(qos_class_t qos, void * (*function)(void *)) { qos_class_t qos_thread; @@ -994,6 +1048,7 @@ thread_create_at_qos(qos_class_t qos, void * (*function)(void *)) T_LOG("pthread created\n"); pthread_get_qos_class_np(thread, &qos_thread, NULL); T_EXPECT_EQ(qos_thread, (qos_class_t)qos, NULL); + return thread; } static void * @@ -1032,6 +1087,58 @@ qos_send_and_sync_rcv(void *arg __unused) return NULL; } +static void * +qos_sync_rcv(void *arg __unused) +{ + mach_port_t qos_send_port; + mach_port_t special_reply_port; + + T_LOG("Client: from created thread\n"); + + kern_return_t kr = bootstrap_look_up(bootstrap_port, + KEVENT_QOS_SERVICE_NAME, &qos_send_port); + T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "client bootstrap_look_up"); + + special_reply_port = thread_get_special_reply_port(); + T_QUIET; T_ASSERT_TRUE(MACH_PORT_VALID(special_reply_port), "get_thread_special_reply_port"); + + /* enqueue two messages to make sure that mqueue is not empty */ + send(qos_send_port, MACH_PORT_NULL, MACH_PORT_NULL, + (uint32_t)_pthread_qos_class_encode(g_expected_qos[ENV_QOS_QUEUE_OVERRIDE], 0, 0), 0); + + sleep(RECV_TIMEOUT_SECS); + + /* sync wait on msg port */ + receive(special_reply_port, qos_send_port); + + T_LOG("Client done doing sync rcv, now waiting for server to end the test"); + sleep(SEND_TIMEOUT_SECS); + + T_ASSERT_FAIL("client timed out"); + return NULL; +} + +static void +thread_wait_to_block(mach_port_t thread_port) +{ + thread_extended_info_data_t extended_info; + kern_return_t kr; + + while (1) { + mach_msg_type_number_t count = THREAD_EXTENDED_INFO_COUNT; + kr = thread_info(thread_port, THREAD_EXTENDED_INFO, + (thread_info_t)&extended_info, &count); + + T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "thread_info"); + + if (extended_info.pth_run_state == TH_STATE_WAITING) { + T_LOG("Target thread blocked\n"); + break; + } + thread_switch(thread_port, SWITCH_OPTION_DEPRESS, 0); + } +} + T_HELPER_DECL(qos_client_send_sync_and_sync_rcv, "Send messages and syncronously wait for rcv") { @@ -1039,6 +1146,25 @@ T_HELPER_DECL(qos_client_send_sync_and_sync_rcv, sleep(HELPER_TIMEOUT_SECS); } +T_HELPER_DECL(qos_client_sync_rcv_qos_change, + "Send messages and syncronously wait for rcv and change qos of waiting thread") +{ + pthread_t rcv_thread; + + rcv_thread = thread_create_at_qos(g_expected_qos[ENV_QOS_BEFORE_OVERRIDE], qos_sync_rcv); + + T_LOG("Waiting for %d seconds before changing qos of rcv thread", SEND_TIMEOUT_SECS); + sleep(SEND_TIMEOUT_SECS); + + /* Wait for the thread to block */ + thread_wait_to_block(pthread_mach_thread_np(rcv_thread)); + + /* Update the rcv thread's qos */ + pthread_override_qos_class_start_np(rcv_thread, g_expected_qos[ENV_QOS_AFTER_OVERRIDE], 0); + + sleep(HELPER_TIMEOUT_SECS); +} + static void * qos_client_send_sync_msg_and_test_link(void *arg) { @@ -1327,7 +1453,7 @@ qos_client_create_sepcial_reply_and_spawn_thread(void *arg __unused) /* Create a new thread to send the sync message on our special reply port */ thread_create_at_qos(g_expected_qos[ENV_QOS_AFTER_OVERRIDE], qos_client_destroy_other_threads_port); - /* Client starting to receive messgae */ + /* Client starting to receive message */ receive(special_reply_port, qos_send_port); sleep(3 * SEND_TIMEOUT_SECS); @@ -1457,6 +1583,10 @@ expect_kevent_id_recv(mach_port_t port, qos_class_t qos[], const char *qos_name[ T_QUIET; T_ASSERT_POSIX_ZERO(_pthread_workqueue_init_with_workloop( worker_cb, event_cb, (pthread_workqueue_function_workloop_t)workloop_cb_test_sync_send_and_enable, 0, 0), NULL); + } else if (strcmp(wl_function, "workloop_cb_test_sync_send_and_enable_handoff") == 0) { + T_QUIET; T_ASSERT_POSIX_ZERO(_pthread_workqueue_init_with_workloop( + worker_cb, event_cb, + (pthread_workqueue_function_workloop_t)workloop_cb_test_sync_send_and_enable_handoff, 0, 0), NULL); } else if (strcmp(wl_function, "workloop_cb_test_send_two_sync") == 0) { T_QUIET; T_ASSERT_POSIX_ZERO(_pthread_workqueue_init_with_workloop( worker_cb, event_cb, @@ -1521,7 +1651,7 @@ expect_kevent_id_recv(mach_port_t port, qos_class_t qos[], const char *qos_name[ .ident = port, .filter = EVFILT_MACHPORT, .flags = EV_ADD | EV_UDATA_SPECIFIC | EV_DISPATCH | EV_VANISHED, - .fflags = (MACH_RCV_MSG | MACH_RCV_LARGE | MACH_RCV_LARGE_IDENTITY | + .fflags = (MACH_RCV_MSG | MACH_RCV_VOUCHER | MACH_RCV_LARGE | MACH_RCV_LARGE_IDENTITY | MACH_RCV_TRAILER_ELEMENTS(MACH_RCV_TRAILER_CTX) | MACH_RCV_TRAILER_TYPE(MACH_MSG_TRAILER_FORMAT_0)), .data = 1, @@ -1553,6 +1683,51 @@ T_HELPER_DECL(server_kevent_id, RECV_TIMEOUT_SECS); } +static void * +special_reply_port_thread(void *ctxt) +{ + kern_return_t ret; + mach_port_t rcv_port = *(mach_port_t *)ctxt; + struct test_msg rcv_msg = { + .header = { + .msgh_remote_port = MACH_PORT_NULL, + .msgh_local_port = rcv_port, + .msgh_size = sizeof(rcv_msg), + }, + }; + + ret = mach_msg(&rcv_msg.header, MACH_RCV_MSG | MACH_RCV_TIMEOUT, 0, + rcv_msg.header.msgh_size, rcv_port, 1000, MACH_PORT_NULL); + + T_EXPECT_EQ(ret, MACH_RCV_TIMED_OUT, "receive should not panic"); + + *(mach_port_t *)ctxt = MACH_PORT_NULL; + + sleep(1); // give some time to pthread_exit + + ret = mach_msg(&rcv_msg.header, MACH_RCV_MSG | MACH_RCV_TIMEOUT, 0, + rcv_msg.header.msgh_size, rcv_port, 1000, MACH_PORT_NULL); + + T_EXPECT_EQ(ret, MACH_RCV_TIMED_OUT, "receive should not panic"); + + T_END; +} + +T_DECL(special_reply_port, "basic special reply port robustness checks", + T_META_RUN_CONCURRENTLY(true)) +{ + pthread_t thread; + mach_port_t srp = thread_get_special_reply_port(); + + pthread_create(&thread, NULL, special_reply_port_thread, &srp); + + while (srp) { + usleep(1000); + } + + pthread_exit(NULL); +} + #define TEST_QOS(server_name, client_name, name, wl_function_name, qos_bo, qos_bo_name, qos_qo, qos_qo_name, qos_ao, qos_ao_name) \ T_DECL(server_kevent_id_##name, \ "Event delivery at " qos_ao_name " QoS using a kevent_id", \ @@ -1677,6 +1852,7 @@ TEST_QOS("server_kevent_id", "qos_client_send_two_msg_and_destroy", send_two_UI_ QOS_CLASS_BACKGROUND, "background", QOS_CLASS_MAINTENANCE, "maintenance", QOS_CLASS_USER_INTERACTIVE, "user initiated with 47 basepri promotion") + /* * Test 11: test sending two ports with chaining * @@ -1689,7 +1865,29 @@ TEST_QOS("server_kevent_id", "qos_client_send_complex_msg_with_pri", send_comple QOS_CLASS_USER_INTERACTIVE, "user initiated with 47 basepri promotion") /* - * Test 12 - 19 + * Test 12: test sending two ports with chaining + * + * Send a sync IPC to a connection port, which itself is embedded in a message + * sent as a sync IPC to a service port. + */ +TEST_QOS("server_kevent_id", "qos_client_send_complex_msg_with_pri", send_complex_sync_UI_and_enable_and_handoff, "workloop_cb_test_sync_send_and_enable_handoff", + QOS_CLASS_USER_INITIATED, "user initiated", + QOS_CLASS_USER_INITIATED, "user initiated", + QOS_CLASS_USER_INTERACTIVE, "user initiated with 47 basepri promotion") + +/* + * Test 13: test changing qos of a thread to trigger turnstile push + * + * Send a sync IPC to a service port and change the qos of the blocked thread + * to verify that changing qos triggers a turnstile push. + */ +TEST_QOS("server_kevent_id", "qos_client_sync_rcv_qos_change", qos_change_to_IN, "workloop_cb_test_intransit", + QOS_CLASS_DEFAULT, "default", + QOS_CLASS_MAINTENANCE, "maintenance", + QOS_CLASS_USER_INITIATED, "user initiated") + +/* + * Test 14 - 21 * * Test single sync ipc link with server that breaks/preserves the link in different ways. */ @@ -1732,8 +1930,9 @@ TEST_QOS("server_kevent_id", "qos_client_send_sync_msg_with_link_check_correct_s QOS_CLASS_DEFAULT, "default", QOS_CLASS_DEFAULT, "default", QOS_CLASS_DEFAULT, "default") + /* - * Test 20 - 23 + * Test 22 - 25 * * Test sequential sync ipc link with server that breaks/preserves the link. */ diff --git a/tests/kpc.c b/tests/kpc.c index 1f74ada51..62b87e68e 100644 --- a/tests/kpc.c +++ b/tests/kpc.c @@ -1,20 +1,21 @@ +/* Copyright (c) 2018 Apple Inc. All rights reserved. */ + #include #include #include +#include #include -T_DECL(fixed_counters, - "test that fixed counters return monotonically increasing values", - T_META_ASROOT(YES)) -{ - T_SKIP("unimplemented"); -} +T_GLOBAL_META( + T_META_NAMESPACE("xnu.ktrace"), + T_META_ASROOT(true), + T_META_CHECK_LEAKS(false)); T_DECL(fixed_thread_counters, - "test that fixed thread counters return monotonically increasing values", - T_META_ASROOT(YES)) + "test that fixed thread counters return monotonically increasing values") { + int err; uint32_t ctrs_cnt; uint64_t *ctrs_a; @@ -66,3 +67,66 @@ T_DECL(fixed_thread_counters, free(ctrs_a); free(ctrs_b); } + +#if defined(__arm64__) +/* + * This policy only applies to arm64 devices. + */ + +static int g_prev_disablewl = 0; + +static void +whitelist_atend(void) +{ + int ret = sysctlbyname("kpc.disable_whitelist", NULL, NULL, + &g_prev_disablewl, sizeof(g_prev_disablewl)); + if (ret < 0) { + T_LOG("failed to reset whitelist: %d (%s)", errno, strerror(errno)); + } +} + +T_DECL(whitelist, "ensure kpc's whitelist is filled out") +{ + /* Start enforcing the whitelist. */ + int set = 0; + size_t getsz = sizeof(g_prev_disablewl); + int ret = sysctlbyname("kpc.disable_whitelist", &g_prev_disablewl, &getsz, + &set, sizeof(set)); + if (ret < 0 && errno == ENOENT) { + T_SKIP("kpc not running with a whitelist, or RELEASE kernel"); + } + + T_ASSERT_POSIX_SUCCESS(ret, "started enforcing the event whitelist"); + T_ATEND(whitelist_atend); + + uint32_t nconfigs = kpc_get_config_count(KPC_CLASS_CONFIGURABLE_MASK); + uint64_t *config = calloc(nconfigs, sizeof(*config)); + + /* + * Check that events in the whitelist are allowed. CORE_CYCLE (0x2) is + * always present in the whitelist. + */ + config[0] = 0x02; + ret = kpc_set_config(KPC_CLASS_CONFIGURABLE_MASK, config); + T_ASSERT_POSIX_SUCCESS(ret, "configured kpc to count cycles"); + + /* Check that non-event bits are ignored by the whitelist. */ + config[0] = 0x102; + ret = kpc_set_config(KPC_CLASS_CONFIGURABLE_MASK, config); + T_ASSERT_POSIX_SUCCESS(ret, + "configured kpc to count cycles with non-event bits set"); + + /* Check that configurations of non-whitelisted events fail. */ + config[0] = 0xfe; + ret = kpc_set_config(KPC_CLASS_CONFIGURABLE_MASK, config); + T_ASSERT_POSIX_FAILURE(ret, EPERM, + "shouldn't allow arbitrary events with whitelist enabled"); + + /* Clean up the configuration. */ + config[0] = 0; + (void)kpc_set_config(KPC_CLASS_CONFIGURABLE_MASK, config); + + free(config); +} + +#endif /* defined(__arm64__) */ diff --git a/tests/kperf.c b/tests/kperf.c index 0c6684ae2..29ceeab7d 100644 --- a/tests/kperf.c +++ b/tests/kperf.c @@ -9,6 +9,7 @@ #include #include #include +#include #include #include #include @@ -16,6 +17,7 @@ #include #include "kperf_helpers.h" +#include "ktrace_helpers.h" T_GLOBAL_META( T_META_NAMESPACE("xnu.kperf"), @@ -40,12 +42,16 @@ spinning_thread(void *semp) return NULL; } -#define PERF_STK_KHDR UINT32_C(0x25020014) -#define PERF_STK_UHDR UINT32_C(0x25020018) -#define PERF_TMR_FIRE KDBG_EVENTID(DBG_PERF, 3, 0) -#define PERF_TMR_HNDLR KDBG_EVENTID(DBG_PERF, 3, 2) -#define PERF_TMR_PEND KDBG_EVENTID(DBG_PERF, 3, 3) -#define PERF_TMR_SKIP KDBG_EVENTID(DBG_PERF, 3, 4) +#define PERF_STK_KHDR UINT32_C(0x25020014) +#define PERF_STK_UHDR UINT32_C(0x25020018) +#define PERF_TMR_FIRE KDBG_EVENTID(DBG_PERF, 3, 0) +#define PERF_TMR_HNDLR KDBG_EVENTID(DBG_PERF, 3, 2) +#define PERF_TMR_PEND KDBG_EVENTID(DBG_PERF, 3, 3) +#define PERF_TMR_SKIP KDBG_EVENTID(DBG_PERF, 3, 4) +#define PERF_KPC_CONFIG KDBG_EVENTID(DBG_PERF, 6, 4) +#define PERF_KPC_REG KDBG_EVENTID(DBG_PERF, 6, 5) +#define PERF_KPC_REG32 KDBG_EVENTID(DBG_PERF, 6, 7) +#define PERF_INSTR_DATA KDBG_EVENTID(DBG_PERF, 1, 17) #define SCHED_HANDOFF KDBG_EVENTID(DBG_MACH, DBG_MACH_SCHED, \ MACH_STACK_HANDOFF) @@ -59,12 +65,6 @@ spinning_thread(void *semp) #define TIMER_PERIOD_NS (1 * NSEC_PER_MSEC) -static void -reset_ktrace(void) -{ - kperf_reset(); -} - /* * Ensure that kperf is correctly IPIing CPUs that are actively scheduling by * bringing up threads and ensuring that threads on-core are sampled by each @@ -74,6 +74,8 @@ reset_ktrace(void) T_DECL(ipi_active_cpus, "make sure that kperf IPIs all active CPUs") { + start_controlling_ktrace(); + int ncpus = dt_ncpu(); T_QUIET; T_ASSERT_LT(ncpus, MAX_CPUS, @@ -282,7 +284,6 @@ T_DECL(ipi_active_cpus, T_ASSERT_POSIX_SUCCESS(kperf_timer_action_set(0, 1), NULL); T_ASSERT_POSIX_SUCCESS(kperf_sample_set(1), "start kperf sampling"); - T_ATEND(reset_ktrace); T_ASSERT_POSIX_ZERO(ktrace_start(s, dispatch_get_global_queue(QOS_CLASS_USER_INITIATED, 0)), @@ -297,9 +298,9 @@ T_DECL(ipi_active_cpus, #define KDEBUG_TRIGGER_TIMEOUT_NS (10 * NSEC_PER_SEC) -#define NON_TRIGGER_CLASS UINT8_C(0xfd) -#define NON_TRIGGER_SUBCLASS UINT8_C(0xff) -#define NON_TRIGGER_CODE UINT8_C(0xff) +#define NON_TRIGGER_CLASS UINT32_C(0xfd) +#define NON_TRIGGER_SUBCLASS UINT32_C(0xff) +#define NON_TRIGGER_CODE UINT32_C(0xff) #define NON_TRIGGER_EVENT \ (KDBG_EVENTID(NON_TRIGGER_CLASS, NON_TRIGGER_SUBCLASS, \ @@ -319,13 +320,13 @@ expect_kdebug_trigger(const char *filter_desc, const uint32_t *debugids, ktrace_events_single(s, PERF_STK_KHDR, ^(struct trace_point *tp) { missing_kernel_stacks--; - T_LOG("saw kernel stack with %lu frames, flags = %#lx", tp->arg2, - tp->arg1); + T_LOG("saw kernel stack with %" PRIu64 " frames, flags = %#" + PRIx64, tp->arg2, tp->arg1); }); ktrace_events_single(s, PERF_STK_UHDR, ^(struct trace_point *tp) { missing_user_stacks--; - T_LOG("saw user stack with %lu frames, flags = %#lx", tp->arg2, - tp->arg1); + T_LOG("saw user stack with %" PRIu64 " frames, flags = %#" + PRIx64, tp->arg2, tp->arg1); }); for (unsigned int i = 0; i < n_debugids; i++) { @@ -386,16 +387,18 @@ expect_kdebug_trigger(const char *filter_desc, const uint32_t *debugids, }); } -#define TRIGGER_CLASS UINT8_C(0xfe) -#define TRIGGER_CLASS_END UINT8_C(0xfd) -#define TRIGGER_SUBCLASS UINT8_C(0xff) -#define TRIGGER_CODE UINT8_C(0) +#define TRIGGER_CLASS UINT32_C(0xfe) +#define TRIGGER_CLASS_END UINT32_C(0xfd) +#define TRIGGER_SUBCLASS UINT32_C(0xff) +#define TRIGGER_CODE UINT32_C(0) #define TRIGGER_DEBUGID \ (KDBG_EVENTID(TRIGGER_CLASS, TRIGGER_SUBCLASS, TRIGGER_CODE)) T_DECL(kdebug_trigger_classes, "test that kdebug trigger samples on classes") { + start_controlling_ktrace(); + const uint32_t class_debugids[] = { KDBG_EVENTID(TRIGGER_CLASS, 1, 1), KDBG_EVENTID(TRIGGER_CLASS, 2, 1), @@ -411,6 +414,8 @@ T_DECL(kdebug_trigger_classes, T_DECL(kdebug_trigger_subclasses, "test that kdebug trigger samples on subclasses") { + start_controlling_ktrace(); + const uint32_t subclass_debugids[] = { KDBG_EVENTID(TRIGGER_CLASS, TRIGGER_SUBCLASS, 0), KDBG_EVENTID(TRIGGER_CLASS, TRIGGER_SUBCLASS, 1), @@ -426,6 +431,8 @@ T_DECL(kdebug_trigger_subclasses, T_DECL(kdebug_trigger_debugids, "test that kdebug trigger samples on debugids") { + start_controlling_ktrace(); + const uint32_t debugids[] = { TRIGGER_DEBUGID }; @@ -440,9 +447,17 @@ T_DECL(kdebug_trigger_debugids, * events from that class. */ +static void +reset_kperf(void) +{ + (void)kperf_reset(); +} + T_DECL(kdbg_callstacks, "test that the kdbg_callstacks samples on syscalls") { + start_controlling_ktrace(); + ktrace_session_t s; __block bool saw_user_stack = false; @@ -471,7 +486,7 @@ T_DECL(kdbg_callstacks, #pragma clang diagnostic ignored "-Wdeprecated-declarations" T_ASSERT_POSIX_SUCCESS(kperf_kdbg_callstacks_set(1), NULL); #pragma clang diagnostic pop - T_ATEND(kperf_reset); + T_ATEND(reset_kperf); T_ASSERT_POSIX_ZERO(ktrace_start(s, dispatch_get_main_queue()), NULL); @@ -526,6 +541,8 @@ expect_stacks_traced(void (^cb)(void)) T_DECL(pet, "test that PET mode samples kernel and user stacks") { + start_controlling_ktrace(); + configure_kperf_stacks_timer(-1, 10); T_ASSERT_POSIX_SUCCESS(kperf_timer_pet_set(0), NULL); @@ -540,6 +557,8 @@ T_DECL(lightweight_pet, "test that lightweight PET mode samples kernel and user stacks", T_META_ASROOT(true)) { + start_controlling_ktrace(); + int set = 1; configure_kperf_stacks_timer(-1, 10); @@ -556,6 +575,8 @@ T_DECL(lightweight_pet, T_DECL(pet_stress, "repeatedly enable and disable PET mode") { + start_controlling_ktrace(); + int niters = 1000; while (niters--) { configure_kperf_stacks_timer(-1, 10); @@ -568,6 +589,8 @@ T_DECL(pet_stress, "repeatedly enable and disable PET mode") T_DECL(timer_stress, "repeatedly enable and disable timers") { + start_controlling_ktrace(); + int niters = 1000; while (niters--) { configure_kperf_stacks_timer(-1, 1); @@ -576,3 +599,153 @@ T_DECL(timer_stress, "repeatedly enable and disable timers") } ; } + +T_DECL(pmc_config_only, "shouldn't show PMC config events unless requested") +{ + start_controlling_ktrace(); + + __block bool saw_kpc_config = false; + __block bool saw_kpc_reg = false; + + ktrace_session_t s = ktrace_session_create(); + T_ASSERT_NOTNULL(s, "ktrace_session_create"); + + /* + * Make sure BSD events are traced in order to trigger samples on syscalls. + */ + ktrace_events_single(s, PERF_KPC_CONFIG, + ^(__unused struct trace_point *tp) { + saw_kpc_config = true; + }); + ktrace_events_single(s, PERF_KPC_REG, + ^(__unused struct trace_point *tp) { + saw_kpc_reg = true; + }); + ktrace_events_single(s, PERF_KPC_REG32, + ^(__unused struct trace_point *tp) { + saw_kpc_reg = true; + }); + + ktrace_set_completion_handler(s, ^{ + ktrace_session_destroy(s); + T_EXPECT_FALSE(saw_kpc_config, + "should see no KPC configs without sampler enabled"); + T_EXPECT_FALSE(saw_kpc_reg, + "should see no KPC registers without sampler enabled"); + T_END; + }); + + uint32_t nconfigs = kpc_get_config_count(KPC_CLASS_CONFIGURABLE_MASK); + uint64_t *config = calloc(nconfigs, sizeof(*config)); + config[0] = 0x02; + int ret = kpc_set_config(KPC_CLASS_CONFIGURABLE_MASK, config); + T_ASSERT_POSIX_SUCCESS(ret, "configured kpc"); + T_QUIET; + T_ASSERT_POSIX_SUCCESS(kpc_set_counting(KPC_CLASS_CONFIGURABLE_MASK), + "kpc_set_counting"); + + (void)kperf_action_count_set(1); + T_ATEND(reset_kperf); + T_QUIET; + T_ASSERT_POSIX_SUCCESS(kperf_action_samplers_set(1, KPERF_SAMPLER_PMC_CPU), + NULL); + + (void)kperf_timer_count_set(1); + T_QUIET; + T_ASSERT_POSIX_SUCCESS(kperf_timer_period_set(0, + kperf_ns_to_ticks(TIMER_PERIOD_NS)), NULL); + T_QUIET; + T_ASSERT_POSIX_SUCCESS(kperf_timer_action_set(0, 1), NULL); + + T_ASSERT_POSIX_SUCCESS(kperf_sample_set(1), "start kperf sampling"); + + T_ASSERT_POSIX_ZERO(ktrace_start(s, dispatch_get_main_queue()), NULL); + + dispatch_after(dispatch_time(DISPATCH_TIME_NOW, 10 * NSEC_PER_SEC), + dispatch_get_main_queue(), ^(void) { + ktrace_end(s, 1); + }); + + dispatch_main(); +} + +static void +skip_if_monotonic_unsupported(void) +{ + int r; + int supported = 0; + size_t supported_size = sizeof(supported); + + r = sysctlbyname("kern.monotonic.supported", &supported, &supported_size, + NULL, 0); + if (r < 0) { + T_WITH_ERRNO; + T_SKIP("could not find \"kern.monotonic.supported\" sysctl"); + } + + if (!supported) { + T_SKIP("monotonic is not supported on this platform"); + } +} + +#define INSTRS_CYCLES_UPPER 500 +#define INSTRS_CYCLES_LOWER 50 + +T_DECL(instrs_cycles, "ensure instructions and cycles are sampled") +{ + skip_if_monotonic_unsupported(); + + start_controlling_ktrace(); + + ktrace_session_t sess = ktrace_session_create(); + + __block uint64_t ninstrs_cycles = 0; + __block uint64_t nzeroes = 0; + ktrace_events_single(sess, PERF_INSTR_DATA, + ^(__unused struct trace_point *tp) { + ninstrs_cycles++; + if (tp->arg1 == 0) { + T_LOG("%llx (%s)\n", tp->threadid, tp->command); + nzeroes++; + } + if (ninstrs_cycles >= INSTRS_CYCLES_UPPER) { + ktrace_end(sess, 1); + } + }); + + ktrace_set_collection_interval(sess, 200); + + ktrace_set_completion_handler(sess, ^{ + T_EXPECT_GE(ninstrs_cycles, (uint64_t)INSTRS_CYCLES_LOWER, + "saw enough instructions and cycles events"); + T_EXPECT_EQ(nzeroes, UINT64_C(0), + "saw no events with 0 instructions"); + T_END; + }); + + (void)kperf_action_count_set(1); + T_ATEND(reset_kperf); + T_QUIET; + T_ASSERT_POSIX_SUCCESS(kperf_action_samplers_set(1, + KPERF_SAMPLER_TH_INSTRS_CYCLES), NULL); + + (void)kperf_timer_count_set(1); + T_QUIET; + T_ASSERT_POSIX_SUCCESS(kperf_timer_period_set(0, + kperf_ns_to_ticks(TIMER_PERIOD_NS)), NULL); + T_QUIET; + T_ASSERT_POSIX_SUCCESS(kperf_timer_action_set(0, 1), NULL); + + T_ASSERT_POSIX_SUCCESS(kperf_sample_set(1), "start kperf sampling"); + + T_ASSERT_POSIX_ZERO(ktrace_start(sess, dispatch_get_main_queue()), + NULL); + + dispatch_after(dispatch_time(DISPATCH_TIME_NOW, 10 * NSEC_PER_SEC), + dispatch_get_main_queue(), ^(void) { + ktrace_end(sess, 1); + }); + + dispatch_main(); +} + diff --git a/tests/kperf_backtracing.c b/tests/kperf_backtracing.c index a586569c3..5c5e3dcfc 100644 --- a/tests/kperf_backtracing.c +++ b/tests/kperf_backtracing.c @@ -7,18 +7,23 @@ #include #include #include +#include #include #include #include "kperf_helpers.h" +#include "ktrace_helpers.h" #define PERF_STK_KHDR UINT32_C(0x25020014) #define PERF_STK_UHDR UINT32_C(0x25020018) #define PERF_STK_KDATA UINT32_C(0x2502000c) #define PERF_STK_UDATA UINT32_C(0x25020010) +#define CALLSTACK_VALID 0x1 +#define CALLSTACK_TRUNCATED 0x10 + T_GLOBAL_META( - T_META_NAMESPACE("xnu.kperf"), + T_META_NAMESPACE("xnu.ktrace"), T_META_CHECK_LEAKS(false)); static void @@ -29,12 +34,14 @@ expect_frame(const char **bt, unsigned int bt_len, CSSymbolRef symbol, unsigned int frame_idx = max_frames - bt_idx - 1; if (!bt[frame_idx]) { - T_LOG("frame %2u: skipping system frame", frame_idx); + T_LOG("frame %2u: skipping system frame '%s'", frame_idx, + CSSymbolGetName(symbol)); return; } if (CSIsNull(symbol)) { - T_FAIL("invalid symbol for address %#lx at frame %d", addr, frame_idx); + T_FAIL("invalid symbol for address %#lx at frame %d", addr, + frame_idx); return; } @@ -105,11 +112,11 @@ expect_backtrace(ktrace_session_t s, uint64_t tid, unsigned int *stacks_seen, return; } - T_LOG("found stack from thread %#lx", tp->threadid); + T_LOG("found stack from thread %#" PRIx64, tp->threadid); stacks++; if (!(tp->arg1 & 1)) { - T_FAIL("invalid %s stack on thread %#lx", kern ? "kernel" : "user", - tp->threadid); + T_FAIL("invalid %s stack on thread %#" PRIx64, + kern ? "kernel" : "user", tp->threadid); return; } @@ -209,26 +216,36 @@ recurse_b(dispatch_semaphore_t spinning, unsigned int frames) return recurse_a(spinning, frames - 1) + 1; } -#define USER_FRAMES (12) +#define USER_FRAMES (12) #if defined(__x86_64__) -#define RECURSE_START_OFFSET (4) -#else /* defined(__x86_64__) */ + #define RECURSE_START_OFFSET (3) -#endif /* defined(__x86_64__) */ + +#else /* defined(__x86_64__) */ + +#define RECURSE_START_OFFSET (2) + +#endif /* !defined(__x86_64__) */ static const char *user_bt[USER_FRAMES] = { #if defined(__x86_64__) + /* + * x86_64 has an extra "thread_start" frame here. + */ NULL, #endif /* defined(__x86_64__) */ - NULL, NULL, + NULL, "backtrace_thread", "recurse_a", "recurse_b", "recurse_a", "recurse_b", - "recurse_a", "recurse_b", "recurse_a", + "recurse_a", "recurse_b", "recurse_a", "recurse_b", #if !defined(__x86_64__) - "recurse_b", + /* + * Pick up the slack to make the number of frames constant. + */ + "recurse_a", #endif /* !defined(__x86_64__) */ - NULL + NULL, }; #if defined(__arm__) @@ -300,7 +317,8 @@ backtrace_thread(void *arg) } static uint64_t -create_backtrace_thread(dispatch_semaphore_t notify_spinning) +create_backtrace_thread(void *(*thread_fn)(void *), + dispatch_semaphore_t notify_spinning) { pthread_t thread = NULL; uint64_t tid; @@ -315,7 +333,7 @@ create_backtrace_thread(dispatch_semaphore_t notify_spinning) } }); - T_QUIET; T_ASSERT_POSIX_ZERO(pthread_create(&thread, NULL, backtrace_thread, + T_QUIET; T_ASSERT_POSIX_ZERO(pthread_create(&thread, NULL, thread_fn, (void *)notify_spinning), NULL); T_QUIET; T_ASSERT_NOTNULL(thread, "backtrace thread created"); dispatch_semaphore_wait(backtrace_started, DISPATCH_TIME_FOREVER); @@ -343,7 +361,7 @@ start_backtrace_thread(void) #define TEST_TIMEOUT_NS (5 * NSEC_PER_SEC) #endif /* !TARGET_OS_WATCH */ -T_DECL(backtraces_kdebug_trigger, +T_DECL(kdebug_trigger, "test that backtraces from kdebug trigger are correct", T_META_ASROOT(true)) { @@ -352,12 +370,16 @@ T_DECL(backtraces_kdebug_trigger, kperf_kdebug_filter_t filter; uint64_t tid; + start_controlling_ktrace(); + s = ktrace_session_create(); T_ASSERT_NOTNULL(s, "ktrace session was created"); + ktrace_set_collection_interval(s, 100); + T_ASSERT_POSIX_ZERO(ktrace_filter_pid(s, getpid()), NULL); - tid = create_backtrace_thread(NULL); + tid = create_backtrace_thread(backtrace_thread, NULL); expect_backtrace(s, tid, &stacks_seen, false, user_bt, USER_FRAMES, 0); expect_backtrace(s, tid, &stacks_seen, true, kernel_bt, KERNEL_FRAMES, 0); @@ -403,7 +425,7 @@ T_DECL(backtraces_kdebug_trigger, dispatch_main(); } -T_DECL(backtraces_user_timer, +T_DECL(user_timer, "test that user backtraces on a timer are correct", T_META_ASROOT(true)) { @@ -412,14 +434,18 @@ T_DECL(backtraces_user_timer, uint64_t tid; dispatch_semaphore_t wait_for_spinning = dispatch_semaphore_create(0); + start_controlling_ktrace(); + s = ktrace_session_create(); T_QUIET; T_ASSERT_NOTNULL(s, "ktrace_session_create"); + ktrace_set_collection_interval(s, 100); + ktrace_filter_pid(s, getpid()); configure_kperf_stacks_timer(getpid(), 10); - tid = create_backtrace_thread(wait_for_spinning); + tid = create_backtrace_thread(backtrace_thread, wait_for_spinning); /* potentially calling dispatch function and system call */ expect_backtrace(s, tid, &stacks_seen, false, user_bt, USER_FRAMES - 1, 2); @@ -447,7 +473,144 @@ T_DECL(backtraces_user_timer, dispatch_main(); } +static volatile bool spin = true; + +__attribute__((noinline, not_tail_called)) +static void +recurse_spin(dispatch_semaphore_t notify_sema, int depth) +{ + if (depth > 0) { + recurse_spin(notify_sema, depth - 1); + } else { + dispatch_semaphore_signal(notify_sema); + while (spin); + } +} + +static void * +spin_thread(void *arg) +{ + dispatch_semaphore_t notify_sema = arg; + dispatch_semaphore_signal(backtrace_started); + recurse_spin(notify_sema, 257); + return NULL; +} + +T_DECL(truncated_user_stacks, "ensure stacks are marked as truncated") +{ + start_controlling_ktrace(); + + ktrace_session_t s = ktrace_session_create(); + T_ASSERT_NOTNULL(s, "ktrace session was created"); + + ktrace_set_collection_interval(s, 100); + + T_QUIET; + T_ASSERT_POSIX_ZERO(ktrace_filter_pid(s, getpid()), NULL); + + configure_kperf_stacks_timer(getpid(), 10); + + __block bool saw_stack = false; + ktrace_set_completion_handler(s, ^{ + T_EXPECT_TRUE(saw_stack, "saw the user stack"); + T_END; + }); + + dispatch_semaphore_t notify_sema = dispatch_semaphore_create(0); + uint64_t tid = create_backtrace_thread(spin_thread, notify_sema); + + ktrace_events_single(s, PERF_STK_UHDR, ^(struct trace_point *tp) { + if (tp->threadid != tid) { + return; + } + T_LOG("found %llu frame stack", tp->arg2); + T_EXPECT_BITS_SET(tp->arg1, CALLSTACK_VALID, + "found valid callstack"); + T_EXPECT_BITS_SET(tp->arg1, CALLSTACK_TRUNCATED, + "found truncated callstack"); + saw_stack = true; + ktrace_end(s, 1); + }); + + T_QUIET; T_ASSERT_POSIX_SUCCESS(kperf_sample_set(1), NULL); + + T_ASSERT_POSIX_ZERO(ktrace_start(s, dispatch_get_main_queue()), + "start tracing"); + + dispatch_after(dispatch_time(DISPATCH_TIME_NOW, TEST_TIMEOUT_NS), + dispatch_get_main_queue(), ^(void) + { + T_LOG("ending test after timeout"); + ktrace_end(s, 0); + }); + + dispatch_main(); +} + +T_DECL(max_user_stacks, "ensure stacks up to 256 frames can be captured") +{ + start_controlling_ktrace(); + + ktrace_session_t s = ktrace_session_create(); + T_ASSERT_NOTNULL(s, "ktrace session was created"); + + ktrace_set_collection_interval(s, 100); + + T_QUIET; + T_ASSERT_POSIX_ZERO(ktrace_filter_pid(s, getpid()), NULL); + + configure_kperf_stacks_timer(getpid(), 10); + + __block bool saw_stack = false; + __block bool saw_stack_data = false; + __block uint64_t nevents = 0; + ktrace_set_completion_handler(s, ^{ + T_EXPECT_TRUE(saw_stack, "saw the user stack"); + T_LOG("saw %" PRIu64 " stack data events", nevents); + T_EXPECT_TRUE(saw_stack_data, "saw all frames of the user stack"); + T_END; + }); + + dispatch_semaphore_t notify_sema = dispatch_semaphore_create(0); + uint64_t tid = create_backtrace_thread(spin_thread, notify_sema); + + ktrace_events_single(s, PERF_STK_UHDR, ^(struct trace_point *tp) { + if (tp->threadid != tid) { + return; + } + T_LOG("found %llu frame stack", tp->arg2); + T_EXPECT_BITS_SET(tp->arg1, CALLSTACK_VALID, + "found valid callstack"); + T_EXPECT_EQ(tp->arg2, UINT64_C(256), + "found the correct number of frames"); + saw_stack = true; + }); + + ktrace_events_single(s, PERF_STK_UDATA, ^(struct trace_point *tp) { + if (tp->threadid != tid && !saw_stack) { + return; + } + nevents++; + if (nevents == 256 / 4) { + ktrace_end(s, 1); + } + saw_stack_data = true; + }); + + T_QUIET; T_ASSERT_POSIX_SUCCESS(kperf_sample_set(1), NULL); + + T_ASSERT_POSIX_ZERO(ktrace_start(s, dispatch_get_main_queue()), + "start tracing"); + + dispatch_after(dispatch_time(DISPATCH_TIME_NOW, TEST_TIMEOUT_NS), + dispatch_get_main_queue(), ^(void) + { + T_LOG("ending test after timeout"); + ktrace_end(s, 0); + }); + + dispatch_main(); +} + /* TODO test kernel stacks in all modes */ /* TODO legacy PET mode backtracing */ -/* TODO test deep stacks, further than 128 frames, make sure they are truncated */ -/* TODO test constrained stacks */ diff --git a/tests/kqueue_add_and_trigger.c b/tests/kqueue_add_and_trigger.c index 8dded5ed3..ec3bb3003 100644 --- a/tests/kqueue_add_and_trigger.c +++ b/tests/kqueue_add_and_trigger.c @@ -3,6 +3,8 @@ #include #include +T_GLOBAL_META(T_META_RUN_CONCURRENTLY(true)); + /* EVFILT_USER doesn't properly support add&fire atomic combination * * Chek that using EV_ADD and EV_TRIGGER on a EV_USER actually trigger the event just added. diff --git a/tests/kqueue_close.c b/tests/kqueue_close.c index 5678d3251..4937898d9 100644 --- a/tests/kqueue_close.c +++ b/tests/kqueue_close.c @@ -6,6 +6,8 @@ #include +T_GLOBAL_META(T_META_RUN_CONCURRENTLY(true)); + /* * close() of kqueue FD races with kqueue_scan park * diff --git a/tests/kqueue_fifo_18776047.c b/tests/kqueue_fifo_18776047.c index d2a285d7c..40a1e2719 100644 --- a/tests/kqueue_fifo_18776047.c +++ b/tests/kqueue_fifo_18776047.c @@ -13,6 +13,8 @@ #include +T_GLOBAL_META(T_META_RUN_CONCURRENTLY(true)); + #define TMP_FILE_PATH "/tmp/test_kqueue_fifo_18776047" #define READ_BUFFER_LEN 256 diff --git a/tests/ktrace_helpers.h b/tests/ktrace_helpers.h new file mode 100644 index 000000000..05191cbae --- /dev/null +++ b/tests/ktrace_helpers.h @@ -0,0 +1,59 @@ +#ifndef KTRACE_HELPERS_H +#define KTRACE_HELPERS_H + +#include +#include +#include +#include + +static inline void +reset_ktrace(void) +{ + (void)sysctl((int[]){ CTL_KERN, KERN_KDEBUG, KERN_KDREMOVE }, 3, + NULL, 0, NULL, 0); + kperf_reset(); +} + +static inline void +start_controlling_ktrace(void) +{ + T_SETUPBEGIN; + + int state = 0; + size_t statesz = sizeof(state); + int ret = sysctlbyname("ktrace.state", &state, &statesz, NULL, 0); + T_QUIET; T_ASSERT_POSIX_SUCCESS(ret, "getting ktrace state"); + + if (state == 1) { + int ownerpid = 0; + size_t pidsz = sizeof(ownerpid); + ret = sysctlbyname("ktrace.owning_pid", &ownerpid, &pidsz, NULL, 0); + T_QUIET; T_ASSERT_POSIX_SUCCESS(ret, "getting owning pid"); + + if (ownerpid <= 0) { + T_LOG("ktrace is in foreground, but no owner"); + goto out; + } + + char ownername[1024]; + ret = proc_name(ownerpid, ownername, sizeof(ownername)); + if (ret == 0) { + T_LOG("ktrace is in foreground, but owner (%d) has no name", ownerpid); + goto out; + } + + T_LOG("ktrace is in foreground, owned by %s, sending SIGKILL", ownername); + kill(ownerpid, SIGKILL); + usleep(500000); + + ret = proc_name(ownerpid, ownername, sizeof(ownername)); + T_QUIET; T_ASSERT_EQ(ret, 0, "should have killed ktrace owner"); + } + +out: + reset_ktrace(); + T_ATEND(reset_ktrace); + T_SETUPEND; +} + +#endif /* !defined(KTRACE_HELPERS_H) */ diff --git a/tests/ldt.c b/tests/ldt.c index 3f2378e31..e6261ddae 100644 --- a/tests/ldt.c +++ b/tests/ldt.c @@ -141,13 +141,10 @@ T_GLOBAL_META( * a robust implementation should determine the proper range to use via * another means. */ -#define FIXED_STACK_ADDR ((uintptr_t)0x10000000) /* must be page-aligned */ #ifndef STANDALONE /* libdarwintest needs LOTs of stack */ #endif #define FIXED_STACK_SIZE (PAGE_SIZE * 16) - -#define FIXED_TRAMP_ADDR (FIXED_STACK_ADDR + FIXED_STACK_SIZE + PAGE_SIZE) #define FIXED_TRAMP_MAXLEN (PAGE_SIZE * 8) #pragma pack(1) @@ -185,7 +182,8 @@ static far_call_t input_desc = { .seg = COMPAT_MODE_CS_SELECTOR, .off = 0 }; static uint64_t stackAddr = 0; static compat_tramp_t thunkit = NULL; static uint64_t thunk64_addr; -static stackaddr_to_gsbase_t stack2gs[] = { { FIXED_STACK_ADDR, FIXED_STACK_ADDR + FIXED_STACK_SIZE, 0 } }; +/* stack2gs[0] is initialized in map_lowmem_stack() */ +static stackaddr_to_gsbase_t stack2gs[] = { { 0 } }; extern int compat_mode_trampoline(far_call_t *, void *, uint64_t); extern void long_mode_trampoline(void); @@ -303,9 +301,9 @@ handle_arithmetic_exception(_STRUCT_X86_THREAD_FULL_STATE64 *xtfs64, uint64_t *i { fprintf(stderr, "Caught divide-error exception\n"); fprintf(stderr, "cs=0x%x rip=0x%x gs=0x%x ss=0x%x rsp=0x%llx\n", - (unsigned)xtfs64->ss64.__cs, - (unsigned)xtfs64->ss64.__rip, (unsigned)xtfs64->ss64.__gs, - (unsigned)xtfs64->__ss, xtfs64->ss64.__rsp); + (unsigned)xtfs64->__ss64.__cs, + (unsigned)xtfs64->__ss64.__rip, (unsigned)xtfs64->__ss64.__gs, + (unsigned)xtfs64->__ss, xtfs64->__ss64.__rsp); *ip_skip_countp = 2; } @@ -320,9 +318,9 @@ handle_badinsn_exception(_STRUCT_X86_THREAD_FULL_STATE64 *xtfs64, uint64_t __unu fprintf(stderr, "Caught invalid opcode exception\n"); fprintf(stderr, "cs=%x rip=%x gs=%x ss=0x%x rsp=0x%llx | handling between 0x%llx and 0x%llx\n", - (unsigned)xtfs64->ss64.__cs, - (unsigned)xtfs64->ss64.__rip, (unsigned)xtfs64->ss64.__gs, - (unsigned)xtfs64->__ss, xtfs64->ss64.__rsp, + (unsigned)xtfs64->__ss64.__cs, + (unsigned)xtfs64->__ss64.__rip, (unsigned)xtfs64->__ss64.__gs, + (unsigned)xtfs64->__ss, xtfs64->__ss64.__rsp, start_addr, end_addr); /* @@ -334,14 +332,14 @@ handle_badinsn_exception(_STRUCT_X86_THREAD_FULL_STATE64 *xtfs64, uint64_t __unu * (Note that due to the way the invalid opcode indication was implemented, * %rip is already set to the next instruction.) */ - if (xtfs64->ss64.__rip >= start_addr && xtfs64->ss64.__rip <= end_addr) { + if (xtfs64->__ss64.__rip >= start_addr && xtfs64->__ss64.__rip <= end_addr) { /* * On return from the failed sysenter, %cs is changed to the * sysenter code selector and %ss is set to 0x23, so switch them * back to sane values. */ - if ((unsigned)xtfs64->ss64.__cs == SYSENTER_SELECTOR) { - xtfs64->ss64.__cs = COMPAT_MODE_CS_SELECTOR; + if ((unsigned)xtfs64->__ss64.__cs == SYSENTER_SELECTOR) { + xtfs64->__ss64.__cs = COMPAT_MODE_CS_SELECTOR; xtfs64->__ss = 0x23; /* XXX */ } } @@ -393,8 +391,8 @@ catch_mach_exception_raise_state_identity(mach_port_t exception_port, default: fprintf(stderr, "Unsupported catch_mach_exception_raise_state_identity: code 0x%llx sub 0x%llx\n", code[0], codeCnt > 1 ? code[1] : 0LL); - fprintf(stderr, "flavor=%d %%cs=0x%x %%rip=0x%llx\n", *flavor, (unsigned)xtfs64->ss64.__cs, - xtfs64->ss64.__rip); + fprintf(stderr, "flavor=%d %%cs=0x%x %%rip=0x%llx\n", *flavor, (unsigned)xtfs64->__ss64.__cs, + xtfs64->__ss64.__rip); } /* @@ -403,12 +401,12 @@ catch_mach_exception_raise_state_identity(mach_port_t exception_port, * new state's cs register to just after the div instruction * to enable the thread to resume. */ - if ((unsigned)xtfs64->ss64.__cs == COMPAT_MODE_CS_SELECTOR) { + if ((unsigned)xtfs64->__ss64.__cs == COMPAT_MODE_CS_SELECTOR) { *new_stateCnt = old_stateCnt; *new_xtfs64 = *xtfs64; - new_xtfs64->ss64.__rip += rip_skip_count; - fprintf(stderr, "new cs=0x%x rip=0x%llx\n", (unsigned)new_xtfs64->ss64.__cs, - new_xtfs64->ss64.__rip); + new_xtfs64->__ss64.__rip += rip_skip_count; + fprintf(stderr, "new cs=0x%x rip=0x%llx\n", (unsigned)new_xtfs64->__ss64.__cs, + new_xtfs64->__ss64.__rip); return KERN_SUCCESS; } else { return KERN_NOT_SUPPORTED; @@ -500,7 +498,7 @@ signal_handler(int signo, siginfo_t *sinfop, void *ucontext) #ifndef STANDALONE T_ASSERT_FAIL("Unexpected signal %d\n", signo); #else - restore_gsbase(mctx.fp_fullp->__ss.ss64.__rsp); + restore_gsbase(mctx.fp_fullp->__ss.__ss64.__rsp); fprintf(stderr, "Not handling signal %d\n", signo); abort(); #endif @@ -521,10 +519,10 @@ signal_handler(int signo, siginfo_t *sinfop, void *ucontext) int cnt = i386_set_ldt((int)idx, &descs[idx], 1); if (cnt != (int)idx) { #ifdef DEBUG - fprintf(stderr, "i386_set_ldt unexpectedly returned %d\n", cnt); + fprintf(stderr, "i386_set_ldt unexpectedly returned %d (errno = %s)\n", cnt, strerror(errno)); #endif #ifndef STANDALONE - T_LOG("i386_set_ldt unexpectedly returned %d\n", cnt); + T_LOG("i386_set_ldt unexpectedly returned %d (errno: %s)\n", cnt, strerror(errno)); T_ASSERT_FAIL("i386_set_ldt failure"); #else exit(1); @@ -567,9 +565,9 @@ signal_handler(int signo, siginfo_t *sinfop, void *ucontext) * Since we're handing this signal on the same thread, we may need to * restore GSbase. */ - uint64_t orig_gsbase = stack_range_to_GSbase(ss64->ss64.__rsp, 0); + uint64_t orig_gsbase = stack_range_to_GSbase(ss64->__ss64.__rsp, 0); if (orig_gsbase != 0 && orig_gsbase != ss64->__gsbase) { - restore_gsbase(ss64->ss64.__rsp); + restore_gsbase(ss64->__ss64.__rsp); } if (signo == SIGFPE) { @@ -584,10 +582,10 @@ signal_handler(int signo, siginfo_t *sinfop, void *ucontext) * new state's cs register to just after the div instruction * to enable the thread to resume. */ - if ((unsigned)ss64->ss64.__cs == COMPAT_MODE_CS_SELECTOR) { - ss64->ss64.__rip += rip_skip_count; - fprintf(stderr, "new cs=0x%x rip=0x%llx\n", (unsigned)ss64->ss64.__cs, - ss64->ss64.__rip); + if ((unsigned)ss64->__ss64.__cs == COMPAT_MODE_CS_SELECTOR) { + ss64->__ss64.__rip += rip_skip_count; + fprintf(stderr, "new cs=0x%x rip=0x%llx\n", (unsigned)ss64->__ss64.__cs, + ss64->__ss64.__rip); } } else { _STRUCT_X86_THREAD_STATE64 *ss64 = &mctx.fp_basep->__ss; @@ -675,28 +673,42 @@ dump_desc(union ldt_entry *entp) static int map_lowmem_stack(void **lowmemstk) { - void *addr, *redzone; + void *addr; + int err; - if ((redzone = mmap((void *)(FIXED_STACK_ADDR - PAGE_SIZE), PAGE_SIZE, PROT_READ, - MAP_FIXED | MAP_PRIVATE | MAP_ANON, -1, 0)) == MAP_FAILED) { + if ((addr = mmap(0, FIXED_STACK_SIZE + PAGE_SIZE, PROT_READ | PROT_WRITE, + MAP_32BIT | MAP_PRIVATE | MAP_ANON, -1, 0)) == MAP_FAILED) { return errno; } - if ((addr = mmap((void *)FIXED_STACK_ADDR, FIXED_STACK_SIZE, PROT_READ | PROT_WRITE, - MAP_FIXED | MAP_PRIVATE | MAP_ANON, -1, 0)) == MAP_FAILED) { - (void)munmap(redzone, PAGE_SIZE); - return errno; + if ((uintptr_t)addr > 0xFFFFF000ULL) { + /* Error: This kernel does not support MAP_32BIT or there's a bug. */ +#ifndef STANDALONE + T_ASSERT_FAIL("%s: failed to map a 32-bit-accessible stack", __func__); +#else + fprintf(stderr, "This kernel returned a virtual address > 4G (%p) despite MAP_32BIT. Aborting.\n", addr); + exit(1); +#endif + } + + /* Enforce one page of redzone at the bottom of the stack */ + if (mprotect(addr, PAGE_SIZE, PROT_NONE) < 0) { + err = errno; + (void) munmap(addr, FIXED_STACK_SIZE + PAGE_SIZE); + return err; } if (lowmemstk) { - *lowmemstk = addr; + stack2gs[0].stack_base = (uintptr_t)addr + PAGE_SIZE; + stack2gs[0].stack_limit = stack2gs[0].stack_base + FIXED_STACK_SIZE; + *lowmemstk = (void *)((uintptr_t)addr + PAGE_SIZE); } return 0; } static int -map_32bit_code_impl(uint8_t *code_src, size_t code_len, void **codeptr, void *baseaddr, +map_32bit_code_impl(uint8_t *code_src, size_t code_len, void **codeptr, size_t szlimit) { void *addr; @@ -707,14 +719,24 @@ map_32bit_code_impl(uint8_t *code_src, size_t code_len, void **codeptr, void *ba } #ifdef DEBUG - printf("baseaddr = %p, size = %lu, szlimit = %u\n", baseaddr, sz, (unsigned)szlimit); + printf("size = %lu, szlimit = %u\n", sz, (unsigned)szlimit); #endif - if ((addr = mmap(baseaddr, sz, PROT_READ | PROT_WRITE | PROT_EXEC, - MAP_FIXED | MAP_PRIVATE | MAP_ANON, -1, 0)) == MAP_FAILED) { + if ((addr = mmap(0, sz, PROT_READ | PROT_WRITE | PROT_EXEC, + MAP_32BIT | MAP_PRIVATE | MAP_ANON, -1, 0)) == MAP_FAILED) { return errno; } + if ((uintptr_t)addr > 0xFFFFF000ULL) { + /* Error: This kernel does not support MAP_32BIT or there's a bug. */ +#ifndef STANDALONE + T_ASSERT_FAIL("%s: failed to map a 32-bit-accessible trampoline", __func__); +#else + fprintf(stderr, "This kernel returned a virtual address > 4G (%p) despite MAP_32BIT. Aborting.\n", addr); + exit(1); +#endif + } + #ifdef DEBUG printf("Mapping code @%p..%p => %p..%p\n", (void *)code_src, (void *)((uintptr_t)code_src + (unsigned)code_len), @@ -724,7 +746,9 @@ map_32bit_code_impl(uint8_t *code_src, size_t code_len, void **codeptr, void *ba bcopy(code_src, addr, code_len); /* Fill the rest of the page with NOPs */ - memset((void *)((uintptr_t)addr + code_len), 0x90, sz - code_len); + if ((sz - code_len) > 0) { + memset((void *)((uintptr_t)addr + code_len), 0x90, sz - code_len); + } if (codeptr) { *codeptr = addr; @@ -740,31 +764,7 @@ map_32bit_trampoline(compat_tramp_t *lowmemtrampp) return map_32bit_code_impl((uint8_t *)&compat_mode_trampoline, (size_t)compat_mode_trampoline_len, (void **)lowmemtrampp, - (void *)FIXED_TRAMP_ADDR, FIXED_TRAMP_MAXLEN); -} - -static int -enable_ldt64(int *val) -{ - int ldt64_enable_value = 1; - int ldt64_enable_old = 0; - size_t ldt64_value_sz = sizeof(ldt64_enable_value); - int err; - - /* Enable the feature for this test (development kernels only) */ - if ((err = sysctlbyname("machdep.ldt64", 0, 0, &ldt64_enable_value, - ldt64_value_sz)) != 0) { - if (errno == EPERM) { - if ((err = sysctlbyname("machdep.ldt64", &ldt64_enable_old, - &ldt64_value_sz, 0, 0)) == 0) { - *val = ldt64_enable_old; - } - } - return errno; - } - - *val = ldt64_enable_value; - return 0; + FIXED_TRAMP_MAXLEN); } static uint64_t @@ -922,7 +922,6 @@ ldt64_test_setup(pthread_t *cmthreadp, thread_arg_t *cmargp, boolean_t setldt_in void *addr; uintptr_t code_addr; uintptr_t thunk64_movabs_addr; - int enable_status = 0; descs = malloc(sizeof(union ldt_entry) * 256); if (descs == 0) { @@ -934,29 +933,15 @@ ldt64_test_setup(pthread_t *cmthreadp, thread_arg_t *cmargp, boolean_t setldt_in #endif } - if ((err = enable_ldt64(&enable_status)) != 0 && enable_status == 0) { -#ifndef STANDALONE - T_LOG("Warning: Couldn't set ldt64=1 via sysctl: %s\n", - strerror(err)); - T_ASSERT_FAIL("Couldn't enable ldt64 feature.\n"); -#else - fprintf(stderr, "Warning: Couldn't set ldt64=1 via sysctl: %s\n", - strerror(err)); - exit(1); -#endif - } - #ifdef DEBUG printf("32-bit code is at %p\n", (void *)&code_32); #endif if ((err = map_lowmem_stack(&addr)) != 0) { -#ifdef DEBUG - fprintf(stderr, "Failed to mmap lowmem stack: %s\n", strerror(err)); -#endif #ifndef STANDALONE - T_ASSERT_FAIL("failed to mmap lowmem stack"); + T_ASSERT_FAIL("failed to mmap lowmem stack: %s", strerror(err)); #else + fprintf(stderr, "Failed to mmap lowmem stack: %s\n", strerror(err)); exit(1); #endif } @@ -966,28 +951,12 @@ ldt64_test_setup(pthread_t *cmthreadp, thread_arg_t *cmargp, boolean_t setldt_in printf("lowstack addr = %p\n", (void *)stackAddr); #endif - if ((err = create_worker_thread(cmargp, (uint32_t)stackAddr, cmthreadp)) != 0) { -#ifdef DEBUG - fprintf(stderr, "Fatal: Could not create thread: %s\n", strerror(err)); -#endif -#ifndef STANDALONE - T_LOG("Fatal: Could not create thread: %s\n", strerror(err)); - T_ASSERT_FAIL("Thread creation failure"); -#else - exit(1); -#endif - } - - if ((err = map_32bit_trampoline(&thunkit)) != 0) { -#ifdef DEBUG - fprintf(stderr, "Failed to map trampoline into lowmem: %s\n", strerror(err)); -#endif - join_32bit_thread(cmthreadp, cmargp); #ifndef STANDALONE T_LOG("Failed to map trampoline into lowmem: %s\n", strerror(err)); T_ASSERT_FAIL("Failed to map trampoline into lowmem"); #else + fprintf(stderr, "Failed to map trampoline into lowmem: %s\n", strerror(err)); exit(1); #endif } @@ -1002,12 +971,11 @@ ldt64_test_setup(pthread_t *cmthreadp, thread_arg_t *cmargp, boolean_t setldt_in bzero(descs, sizeof(union ldt_entry) * 256); if ((cnt = i386_get_ldt(0, descs, 1)) <= 0) { - fprintf(stderr, "i386_get_ldt unexpectedly returned %d\n", cnt); - join_32bit_thread(cmthreadp, cmargp); #ifndef STANDALONE - T_LOG("i386_get_ldt unexpectedly returned %d\n", cnt); + T_LOG("i386_get_ldt unexpectedly returned %d (errno: %s)\n", cnt, strerror(errno)); T_ASSERT_FAIL("i386_get_ldt failure"); #else + fprintf(stderr, "i386_get_ldt unexpectedly returned %d (errno: %s)\n", cnt, strerror(errno)); exit(1); #endif } @@ -1041,14 +1009,11 @@ ldt64_test_setup(pthread_t *cmthreadp, thread_arg_t *cmargp, boolean_t setldt_in /* Set the LDT: */ cnt = i386_set_ldt((int)idx, &descs[idx], 1); if (cnt != (int)idx) { -#ifdef DEBUG - fprintf(stderr, "i386_set_ldt unexpectedly returned %d\n", cnt); -#endif - join_32bit_thread(cmthreadp, cmargp); #ifndef STANDALONE - T_LOG("i386_set_ldt unexpectedly returned %d\n", cnt); + T_LOG("i386_set_ldt unexpectedly returned %d (errno: %s)\n", cnt, strerror(errno)); T_ASSERT_FAIL("i386_set_ldt failure"); #else + fprintf(stderr, "i386_set_ldt unexpectedly returned %d (errno: %s)\n", cnt, strerror(errno)); exit(1); #endif } @@ -1068,19 +1033,28 @@ ldt64_test_setup(pthread_t *cmthreadp, thread_arg_t *cmargp, boolean_t setldt_in } #endif } else { -#ifdef DEBUG - fprintf(stderr, "i386_get_ldt unexpectedly returned %d\n", cnt); -#endif - join_32bit_thread(cmthreadp, cmargp); #ifndef STANDALONE - T_LOG("i386_get_ldt unexpectedly returned %d\n", cnt); + T_LOG("i386_get_ldt unexpectedly returned %d (errno: %s)\n", cnt, strerror(errno)); T_ASSERT_FAIL("i386_get_ldt failure"); #else + fprintf(stderr, "i386_get_ldt unexpectedly returned %d (errno: %s)\n", cnt, strerror(errno)); exit(1); #endif } free(descs); + + if ((err = create_worker_thread(cmargp, (uint32_t)stackAddr, cmthreadp)) != 0) { +#ifdef DEBUG + fprintf(stderr, "Fatal: Could not create thread: %s\n", strerror(err)); +#endif +#ifndef STANDALONE + T_LOG("Fatal: Could not create thread: %s\n", strerror(err)); + T_ASSERT_FAIL("Thread creation failure"); +#else + exit(1); +#endif + } } #ifdef STANDALONE diff --git a/tests/ldt_entitlement.plist b/tests/ldt_entitlement.plist new file mode 100644 index 000000000..19058c68d --- /dev/null +++ b/tests/ldt_entitlement.plist @@ -0,0 +1,10 @@ + + + + + com.apple.security.ldt-in-64bit-process + + com.apple.security.mmap-map-32bit + + + diff --git a/tests/mach_boottime_usec.c b/tests/mach_boottime_usec.c index 7a9a47277..23e199e37 100644 --- a/tests/mach_boottime_usec.c +++ b/tests/mach_boottime_usec.c @@ -7,6 +7,8 @@ #include +T_GLOBAL_META(T_META_RUN_CONCURRENTLY(true)); + T_DECL(mach_boottime_usec, "mach_boottime_usec()", T_META_ALL_VALID_ARCHS(true), T_META_LTEPHASE(LTE_POSTINIT)) { diff --git a/tests/mach_continuous_time.c b/tests/mach_continuous_time.c index 0d49b7bdf..fe782e0f9 100644 --- a/tests/mach_continuous_time.c +++ b/tests/mach_continuous_time.c @@ -36,7 +36,7 @@ update(uint64_t *a, uint64_t *c) } T_DECL(mct_monotonic, "Testing mach_continuous_time returns sane, monotonic values", - T_META_ALL_VALID_ARCHS(true)) + T_META_ALL_VALID_ARCHS(true), T_META_RUN_CONCURRENTLY(true)) { mach_timebase_info(&tb_info); #ifdef HAS_KERNEL_TIME_TRAPS @@ -69,7 +69,7 @@ T_DECL(mct_monotonic, "Testing mach_continuous_time returns sane, monotonic valu } T_DECL(mat_monotonic, "Testing mach_absolute_time returns sane, monotonic values", - T_META_ALL_VALID_ARCHS(true)) + T_META_ALL_VALID_ARCHS(true), T_META_RUN_CONCURRENTLY(true)) { mach_timebase_info(&tb_info); #ifdef HAS_KERNEL_TIME_TRAPS @@ -100,7 +100,8 @@ T_DECL(mat_monotonic, "Testing mach_absolute_time returns sane, monotonic values } } -T_DECL(mct_pause, "Testing mach_continuous_time and mach_absolute_time don't diverge") +T_DECL(mct_pause, "Testing mach_continuous_time and mach_absolute_time don't diverge", + T_META_RUN_CONCURRENTLY(true)) { mach_timebase_info(&tb_info); @@ -136,7 +137,8 @@ update_kern(uint64_t *abs, uint64_t *cont) #endif #ifdef HAS_KERNEL_TIME_TRAPS -T_DECL(mct_pause_kern, "Testing kernel mach_continuous_time and mach_absolute_time don't diverge") +T_DECL(mct_pause_kern, "Testing kernel mach_continuous_time and mach_absolute_time don't diverge", + T_META_RUN_CONCURRENTLY(true)) { mach_timebase_info(&tb_info); @@ -281,7 +283,7 @@ T_DECL(mct_settimeofday_kern, "Testing kernel mach_continuous_time behavior over #endif T_DECL(mct_aproximate, "Testing mach_continuous_approximate_time()", - T_META_ALL_VALID_ARCHS(true)) + T_META_ALL_VALID_ARCHS(true), T_META_RUN_CONCURRENTLY(true)) { mach_timebase_info(&tb_info); diff --git a/tests/mach_exception_reply.c b/tests/mach_exception_reply.c new file mode 100644 index 000000000..d34bc2575 --- /dev/null +++ b/tests/mach_exception_reply.c @@ -0,0 +1,457 @@ +#define T_NAMESPACE "xnu.ipc" +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +T_GLOBAL_META(T_META_RUN_CONCURRENTLY(true)); + +#define MSG 1024 +#define PG_ALLOC 4096 + +typedef enum { + ReplyWithNoError, + ReplyWithReplyPort, + ReplyWithReplyPortMove, + ReplyWithReplyPortCplxBit, + ReplyWithReplyPortMoveCplxBit, + ReplyWithPortDesc, + ReplyWithOOLDesc, + ReplyWithVoucher, + ReplyWithVoucherGarbage +} ReplyType; + +struct exc_thread_arg { + ReplyType rt; + mach_port_t port; +}; + +static const char * +reply_type_str(ReplyType rt) +{ + switch (rt) { + case ReplyWithNoError: + return "ReplyWithNoError"; + case ReplyWithReplyPort: + return "ReplyWithReplyPort"; + case ReplyWithReplyPortMove: + return "ReplyWithReplyPortMove"; + case ReplyWithReplyPortCplxBit: + return "ReplyWithReplyPortCplxBit"; + case ReplyWithReplyPortMoveCplxBit: + return "ReplyWithReplyPortMoveCplxBit"; + case ReplyWithPortDesc: + return "ReplyWithPortDesc"; + case ReplyWithOOLDesc: + return "ReplyWithOOLDesc"; + case ReplyWithVoucher: + return "ReplyWithVoucher"; + case ReplyWithVoucherGarbage: + return "ReplyWithVoucherGarbage"; + } +} + +static mach_voucher_t +create_pthpriority_voucher(void) +{ + char voucher_buf[sizeof(mach_voucher_attr_recipe_data_t) + sizeof(ipc_pthread_priority_value_t)]; + + mach_voucher_t voucher = MACH_PORT_NULL; + kern_return_t kr; + ipc_pthread_priority_value_t ipc_pthread_priority_value = + (ipc_pthread_priority_value_t)_pthread_qos_class_encode(QOS_CLASS_USER_INTERACTIVE, 0, 0); + + mach_voucher_attr_raw_recipe_size_t recipe_size = 0; + mach_voucher_attr_recipe_t recipe = + (mach_voucher_attr_recipe_t)&voucher_buf[0]; + + recipe->key = MACH_VOUCHER_ATTR_KEY_PTHPRIORITY; + recipe->command = MACH_VOUCHER_ATTR_PTHPRIORITY_CREATE; + recipe->previous_voucher = MACH_VOUCHER_NULL; + + memcpy((char *)&recipe->content[0], &ipc_pthread_priority_value, sizeof(ipc_pthread_priority_value)); + recipe->content_size = sizeof(ipc_pthread_priority_value_t); + recipe_size += sizeof(mach_voucher_attr_recipe_data_t) + recipe->content_size; + + kr = host_create_mach_voucher(mach_host_self(), + (mach_voucher_attr_raw_recipe_array_t)&voucher_buf[0], + recipe_size, + &voucher); + + T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "host_create_mach_voucher"); + return voucher; +} + +static void * +handle_exceptions(void *arg) +{ + struct exc_thread_arg *ta = (struct exc_thread_arg *)arg; + mach_port_t ePort = ta->port; + ReplyType reply_type = ta->rt; + + char msg_store[MSG + MAX_TRAILER_SIZE]; + char reply_store[MSG]; + mach_msg_header_t *msg = (mach_msg_header_t *)msg_store; + vm_address_t page; + kern_return_t kr; + + kr = vm_allocate(mach_task_self(), &page, PG_ALLOC, VM_FLAGS_ANYWHERE); + T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "ool page allocation of %d bytes", PG_ALLOC); + + mach_voucher_t voucher = create_pthpriority_voucher(); + + while (1) { + bzero(msg, sizeof(msg_store)); + + msg->msgh_local_port = ePort; + msg->msgh_size = MSG; + kr = mach_msg_receive(msg); + T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "exception msg recv"); + + bzero(reply_store, sizeof(reply_store)); + + switch (reply_type) { + case ReplyWithNoError: { +#pragma pack(4) + typedef struct { + mach_msg_header_t hdr; + NDR_record_t ndr; + kern_return_t kr; + } reply_fmt_t; +#pragma pack() + reply_fmt_t *reply = (reply_fmt_t *)reply_store; + + reply->hdr.msgh_bits = MACH_MSGH_BITS_SET(MACH_MSG_TYPE_MOVE_SEND_ONCE, 0, 0, 0); + reply->hdr.msgh_remote_port = msg->msgh_remote_port; + reply->hdr.msgh_local_port = MACH_PORT_NULL; + reply->hdr.msgh_size = sizeof(*reply); + reply->hdr.msgh_id = msg->msgh_id + 100; + break; + } + + case ReplyWithReplyPort: { +#pragma pack(4) + typedef struct { + mach_msg_header_t hdr; + NDR_record_t ndr; + kern_return_t kr; + } reply_fmt_t; +#pragma pack() + reply_fmt_t *reply = (reply_fmt_t *)reply_store; + + reply->hdr.msgh_bits = MACH_MSGH_BITS_SET(MACH_MSG_TYPE_MOVE_SEND_ONCE, MACH_MSG_TYPE_COPY_SEND, 0, 0); + reply->hdr.msgh_remote_port = msg->msgh_remote_port; + reply->hdr.msgh_local_port = ePort; /* Bogus */ + reply->hdr.msgh_size = sizeof(*reply); + reply->hdr.msgh_id = msg->msgh_id + 100; + break; + } + + case ReplyWithReplyPortMove: { +#pragma pack(4) + typedef struct { + mach_msg_header_t hdr; + NDR_record_t ndr; + kern_return_t kr; + } reply_fmt_t; +#pragma pack() + reply_fmt_t *reply = (reply_fmt_t *)reply_store; + + reply->hdr.msgh_bits = MACH_MSGH_BITS_SET(MACH_MSG_TYPE_MOVE_SEND_ONCE, MACH_MSG_TYPE_MOVE_SEND, 0, 0); + reply->hdr.msgh_remote_port = msg->msgh_remote_port; + reply->hdr.msgh_local_port = ePort; /* Bogus */ + reply->hdr.msgh_size = sizeof(*reply); + reply->hdr.msgh_id = msg->msgh_id + 100; + break; + } + + case ReplyWithReplyPortCplxBit: { +#pragma pack(4) + typedef struct { + mach_msg_header_t hdr; + mach_msg_body_t body; + } reply_fmt_t; +#pragma pack() + reply_fmt_t *reply = (reply_fmt_t *)reply_store; + + reply->hdr.msgh_bits = MACH_MSGH_BITS_SET(MACH_MSG_TYPE_MOVE_SEND_ONCE, MACH_MSG_TYPE_COPY_SEND, 0, MACH_MSGH_BITS_COMPLEX); + reply->hdr.msgh_remote_port = msg->msgh_remote_port; + reply->hdr.msgh_local_port = ePort; /* Bogus */ + reply->hdr.msgh_size = sizeof(*reply); + reply->hdr.msgh_id = msg->msgh_id + 100; + reply->body.msgh_descriptor_count = 0; + break; + } + + case ReplyWithReplyPortMoveCplxBit: { +#pragma pack(4) + typedef struct { + mach_msg_header_t hdr; + mach_msg_body_t body; + } reply_fmt_t; +#pragma pack() + reply_fmt_t *reply = (reply_fmt_t *)reply_store; + + reply->hdr.msgh_bits = MACH_MSGH_BITS_SET(MACH_MSG_TYPE_MOVE_SEND_ONCE, MACH_MSG_TYPE_MOVE_SEND, 0, MACH_MSGH_BITS_COMPLEX); + reply->hdr.msgh_remote_port = msg->msgh_remote_port; + reply->hdr.msgh_local_port = ePort; /* Bogus */ + reply->hdr.msgh_size = sizeof(*reply); + reply->hdr.msgh_id = msg->msgh_id + 100; + reply->body.msgh_descriptor_count = 0; + break; + } + + case ReplyWithPortDesc: { +#pragma pack(4) + typedef struct { + mach_msg_header_t hdr; + mach_msg_body_t body; + mach_msg_port_descriptor_t port; + } reply_fmt_t; +#pragma pack() + reply_fmt_t *reply = (reply_fmt_t *)reply_store; + + reply->hdr.msgh_bits = MACH_MSGH_BITS_SET(MACH_MSG_TYPE_MOVE_SEND_ONCE, 0, 0, MACH_MSGH_BITS_COMPLEX); + reply->hdr.msgh_remote_port = msg->msgh_remote_port; + reply->hdr.msgh_local_port = MACH_PORT_NULL; + reply->hdr.msgh_size = sizeof(*reply); + reply->hdr.msgh_id = msg->msgh_id + 100; + reply->body.msgh_descriptor_count = 1; + reply->port.type = MACH_MSG_PORT_DESCRIPTOR; + reply->port.name = ePort; + reply->port.disposition = MACH_MSG_TYPE_COPY_SEND; + break; + } + + case ReplyWithOOLDesc: { +#pragma pack(4) + typedef struct { + mach_msg_header_t hdr; + mach_msg_body_t body; + mach_msg_ool_descriptor_t ool; + } reply_fmt_t; +#pragma pack() + reply_fmt_t *reply = (reply_fmt_t *)reply_store; + + reply->hdr.msgh_bits = MACH_MSGH_BITS_SET(MACH_MSG_TYPE_MOVE_SEND_ONCE, 0, 0, MACH_MSGH_BITS_COMPLEX); + reply->hdr.msgh_remote_port = msg->msgh_remote_port; + reply->hdr.msgh_local_port = MACH_PORT_NULL; + reply->hdr.msgh_size = sizeof(*reply); + reply->hdr.msgh_id = msg->msgh_id + 100; + reply->body.msgh_descriptor_count = 1; + reply->ool.type = MACH_MSG_OOL_DESCRIPTOR; + reply->ool.address = (void *)page; + reply->ool.size = PG_ALLOC; + reply->ool.deallocate = 0; + reply->ool.copy = MACH_MSG_VIRTUAL_COPY; + break; + } + + case ReplyWithVoucher: { +#pragma pack(4) + typedef struct { + mach_msg_header_t hdr; + NDR_record_t ndr; + kern_return_t kr; + } reply_fmt_t; +#pragma pack() + reply_fmt_t *reply = (reply_fmt_t *)reply_store; + + reply->hdr.msgh_remote_port = msg->msgh_remote_port; + reply->hdr.msgh_local_port = MACH_PORT_NULL; + reply->hdr.msgh_size = sizeof(*reply); + reply->hdr.msgh_id = msg->msgh_id + 100; + reply->kr = KERN_SUCCESS; + + /* try to send a voucher */ + reply->hdr.msgh_bits = MACH_MSGH_BITS_SET(MACH_MSG_TYPE_MOVE_SEND_ONCE, + 0, + MACH_MSG_TYPE_MOVE_SEND, + 0); + reply->hdr.msgh_voucher_port = voucher; + voucher = MACH_VOUCHER_NULL; + break; + } + + case ReplyWithVoucherGarbage: { +#pragma pack(4) + typedef struct { + mach_msg_header_t hdr; + NDR_record_t ndr; + kern_return_t kr; + } reply_fmt_t; +#pragma pack() + reply_fmt_t *reply = (reply_fmt_t *)reply_store; + + reply->hdr.msgh_remote_port = msg->msgh_remote_port; + reply->hdr.msgh_local_port = MACH_PORT_NULL; + reply->hdr.msgh_size = sizeof(*reply); + reply->hdr.msgh_id = msg->msgh_id + 100; + reply->kr = KERN_SUCCESS; + + /* don't claim to send a voucher */ + reply->hdr.msgh_bits = MACH_MSGH_BITS_SET(MACH_MSG_TYPE_MOVE_SEND_ONCE, + 0, 0, 0); + /* but put some bits in the field */ + reply->hdr.msgh_voucher_port = (mach_voucher_t)0xdead; + break; + } + + default: + T_ASSERT_FAIL("Invalid ReplyType: %d", reply_type); + T_END; + } + + if (voucher) { + kr = mach_port_mod_refs(mach_task_self(), voucher, + MACH_PORT_RIGHT_SEND, -1); + T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "destroy voucher"); + } + + T_LOG("sending exception reply of type (%s)", reply_type_str(reply_type)); + kr = mach_msg_send((mach_msg_header_t *)reply_store); + T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "exception reply msg send"); + + T_PASS("Successfully delivered exception reply message of type %s", reply_type_str(reply_type)); + T_END; + return NULL; + } +} + +static sigjmp_buf jb; +static int *bad_pointer = NULL; +static int s_sigmask = 0; + +static void +signal_handler(int sig, siginfo_t *sip __unused, void *ucontext __unused) +{ + if (sigmask(sig) & s_sigmask) { /* TODO: check that the fault was generated by us */ + siglongjmp(jb, sig); + } else { + siglongjmp(jb, -sig); + } +} + +static int +handle_signals(void) +{ + int mask = 0; + + struct sigaction sa = { + .sa_sigaction = signal_handler, + .sa_flags = SA_SIGINFO + }; + sigfillset(&sa.sa_mask); + + T_QUIET; T_ASSERT_POSIX_ZERO(sigaction(SIGTRAP, &sa, NULL), NULL); + mask |= sigmask(SIGTRAP); + + T_QUIET; T_ASSERT_POSIX_ZERO(sigaction(SIGSEGV, &sa, NULL), NULL); + mask |= sigmask(SIGSEGV); + + T_QUIET; T_ASSERT_POSIX_ZERO(sigaction(SIGILL, &sa, NULL), NULL); + mask |= sigmask(SIGILL); + + return mask; +} + +static void +test_exc_reply_type(ReplyType reply_type) +{ + kern_return_t kr; + task_t me = mach_task_self(); + thread_t self = mach_thread_self(); + pthread_t handler_thread; + pthread_attr_t attr; + mach_port_t ePort; + + s_sigmask = handle_signals(); + T_LOG("task self = 0x%x, thread self = 0x%x\n", me, self); + + kr = mach_port_allocate(me, MACH_PORT_RIGHT_RECEIVE, &ePort); + T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "allocate receive right"); + + kr = mach_port_insert_right(me, ePort, ePort, MACH_MSG_TYPE_MAKE_SEND); + T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "insert right into port=[%d]", ePort); + + kr = thread_set_exception_ports(self, EXC_MASK_ALL, ePort, EXCEPTION_DEFAULT, THREAD_STATE_NONE); + T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "set exception ports on self=[%d], handler=[%d]", self, ePort); + + pthread_attr_init(&attr); + pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_DETACHED); + struct exc_thread_arg *ta = (struct exc_thread_arg *)malloc(sizeof(*ta)); + T_QUIET; T_ASSERT_NOTNULL(ta, "exception handler thread args allocation"); + ta->port = ePort; + ta->rt = reply_type; + + T_QUIET; T_ASSERT_POSIX_SUCCESS(pthread_create(&handler_thread, &attr, handle_exceptions, (void *)ta), + "pthread creation"); + + pthread_attr_destroy(&attr); + + /* cause exception! */ + int x = sigsetjmp(jb, 0); //s_sigmask); + if (x == 0) { + *bad_pointer = 0; + } else if (x < 0) { + T_FAIL("Unexpected state on return-from-exception"); + T_END; + } else { + T_PASS("Successfully recovered from exception"); + T_END; + } + T_FAIL("Unexpected end of test!"); + T_END; +} + +T_DECL(mach_exc_ReplyNoError, "exception server reply with no error", + T_META_CHECK_LEAKS(false), T_META_IGNORECRASHES(".*mach_exception_reply.*")) +{ + test_exc_reply_type(ReplyWithNoError); +} +T_DECL(mach_exc_ReplyWithReplyPort, "exception server reply with reply port", + T_META_CHECK_LEAKS(false), T_META_IGNORECRASHES(".*mach_exception_reply.*")) +{ + test_exc_reply_type(ReplyWithReplyPort); +} +T_DECL(mach_exc_ReplyWithReplyPortMove, "exception server reply with reply port as MOVE_SEND", + T_META_CHECK_LEAKS(false), T_META_IGNORECRASHES(".*mach_exception_reply.*")) +{ + test_exc_reply_type(ReplyWithReplyPortMove); +} +T_DECL(mach_exc_ReplyWithReplyPortCplxBit, "exception server reply with reply port and complex bit set", + T_META_CHECK_LEAKS(false), T_META_IGNORECRASHES(".*mach_exception_reply.*")) +{ + test_exc_reply_type(ReplyWithReplyPortCplxBit); +} +T_DECL(mach_exc_ReplyWithReplyPortMoveCplxBit, "exception server reply with reply port as MOVE_SEND and complex bit set", + T_META_CHECK_LEAKS(false), T_META_IGNORECRASHES(".*mach_exception_reply.*")) +{ + test_exc_reply_type(ReplyWithReplyPortMoveCplxBit); +} +T_DECL(mach_exc_ReplyWithOOLPort, "exception server reply with OOL port descriptor", + T_META_CHECK_LEAKS(false), T_META_IGNORECRASHES(".*mach_exception_reply.*")) +{ + test_exc_reply_type(ReplyWithPortDesc); +} +T_DECL(mach_exc_ReplyWithOOLDesc, "exception server reply with OOL memory descriptor", + T_META_CHECK_LEAKS(false), T_META_IGNORECRASHES(".*mach_exception_reply.*")) +{ + test_exc_reply_type(ReplyWithOOLDesc); +} +T_DECL(mach_exc_ReplyWithVoucher, "exception server reply with a voucher", + T_META_CHECK_LEAKS(false), T_META_IGNORECRASHES(".*mach_exception_reply.*")) +{ + test_exc_reply_type(ReplyWithVoucher); +} +T_DECL(mach_exc_ReplyWithVoucherGarbage, "exception server reply with bits in msgh_voucher_port", + T_META_CHECK_LEAKS(false), T_META_IGNORECRASHES(".*mach_exception_reply.*")) +{ + test_exc_reply_type(ReplyWithVoucherGarbage); +} diff --git a/tests/mach_get_times.c b/tests/mach_get_times.c index d6fe33a7b..057126566 100644 --- a/tests/mach_get_times.c +++ b/tests/mach_get_times.c @@ -6,6 +6,8 @@ #include #include +T_GLOBAL_META(T_META_RUN_CONCURRENTLY(true)); + #define T_LOG_VERBOSE(...) #define timespec2nanosec(ts) ((uint64_t)((ts)->tv_sec) * NSEC_PER_SEC + (uint64_t)((ts)->tv_nsec)) diff --git a/tests/mach_port_deallocate_21692215.c b/tests/mach_port_deallocate_21692215.c index 0b1510600..f072b1041 100644 --- a/tests/mach_port_deallocate_21692215.c +++ b/tests/mach_port_deallocate_21692215.c @@ -4,6 +4,8 @@ #include #include +T_GLOBAL_META(T_META_RUN_CONCURRENTLY(true)); + #define NR_PORTS 4 T_DECL(mach_port_deallocate, "mach_port_deallocate deallocates also PORT_SET"){ diff --git a/tests/mach_port_insert_right.c b/tests/mach_port_insert_right.c index dec8548c1..b0c3d76e0 100644 --- a/tests/mach_port_insert_right.c +++ b/tests/mach_port_insert_right.c @@ -3,6 +3,18 @@ #include #include +T_GLOBAL_META(T_META_RUN_CONCURRENTLY(true)); + +static inline mach_port_type_t +get_port_type(mach_port_t mp) +{ + mach_port_type_t type; + T_QUIET; + T_ASSERT_MACH_SUCCESS(mach_port_type(mach_task_self(), mp, &type), + "mach_port_type(mP)"); + return type; +} + T_DECL(mach_port_insert_right, "insert send right for an existing right", T_META_CHECK_LEAKS(false)) { mach_port_t port = MACH_PORT_NULL; @@ -14,6 +26,14 @@ T_DECL(mach_port_insert_right, "insert send right for an existing right", T_META retval = mach_port_allocate(task, MACH_PORT_RIGHT_RECEIVE, &port); T_ASSERT_MACH_SUCCESS(retval, "allocate a port=[%d]", port); + T_ASSERT_EQ(get_port_type(port), MACH_PORT_TYPE_RECEIVE, + "0x%x should be a receive right", port); + + retval = mach_port_insert_right(task, port, port, MACH_MSG_TYPE_MAKE_SEND); + T_ASSERT_MACH_SUCCESS(retval, "insert a send right for port=[%d] with name=[%d]", port, port); + T_ASSERT_EQ(get_port_type(port), MACH_PORT_TYPE_RECEIVE | MACH_PORT_TYPE_SEND, + "0x%x should be a send-receive right", port); + mach_port_name_t name = 123; retval = mach_port_insert_right(task, name, port, MACH_MSG_TYPE_MAKE_SEND); @@ -26,6 +46,9 @@ T_DECL(mach_port_insert_right, "insert send right for an existing right", T_META retval = mach_port_allocate(task, MACH_PORT_RIGHT_RECEIVE, &port2); T_ASSERT_MACH_SUCCESS(retval, "allocate a port=[%d]", port2); + T_ASSERT_EQ(get_port_type(port2), MACH_PORT_TYPE_RECEIVE, + "0x%x should be a receive right", port2); + name = port; retval = mach_port_insert_right(task, name, port2, MACH_MSG_TYPE_MAKE_SEND); T_ASSERT_MACH_ERROR(retval, KERN_RIGHT_EXISTS, "insert a send right for port=[%d] with name=[%d]", port2, name); diff --git a/tests/mach_port_mod_refs.c b/tests/mach_port_mod_refs.c index acb7d119a..184d2f62a 100644 --- a/tests/mach_port_mod_refs.c +++ b/tests/mach_port_mod_refs.c @@ -7,6 +7,7 @@ #include #include +T_GLOBAL_META(T_META_RUN_CONCURRENTLY(true)); T_DECL(mach_port_mod_refs, "mach_port_mod_refs"){ mach_port_t port_set; diff --git a/tests/mach_timebase_info.c b/tests/mach_timebase_info.c index d58dd85c4..43fb263a0 100644 --- a/tests/mach_timebase_info.c +++ b/tests/mach_timebase_info.c @@ -2,6 +2,8 @@ #include +T_GLOBAL_META(T_META_RUN_CONCURRENTLY(true)); + extern kern_return_t mach_timebase_info_trap(mach_timebase_info_t info); T_DECL(mach_timebase_info, "mach_timebase_info(_trap)", diff --git a/tests/memorystatus_assertion_helpers.c b/tests/memorystatus_assertion_helpers.c new file mode 100644 index 000000000..daa7731b0 --- /dev/null +++ b/tests/memorystatus_assertion_helpers.c @@ -0,0 +1,239 @@ +#include +#include + +#include + +#include "memorystatus_assertion_helpers.h" + +static void log_state(uint32_t state); + +int +set_priority(pid_t pid, int32_t priority, uint64_t user_data, boolean_t is_assertion_driven) +{ + int err; + uint32_t flag = 0; + memorystatus_priority_properties_t mjp = { 0 }; + + if (is_assertion_driven) { + /* + * Control over an assertion driven priority will be + * relinquished when priority == JETSAM_PRIORITY_IDLE + */ + if (priority == JETSAM_PRIORITY_IDLE) { + T_LOG("Relinquish ...assertion... priority(%d) for pid[%d]", priority, pid); + } else { + T_LOG("Setting ...assertion... priority(%d) for pid[%d]", priority, pid); + } + flag |= MEMORYSTATUS_SET_PRIORITY_ASSERTION; + } else { + T_LOG("Setting ...requested... priority(%d) for pid[%d]", priority, pid); + flag = 0; + } + + mjp.priority = priority; + mjp.user_data = user_data; + + err = memorystatus_control(MEMORYSTATUS_CMD_SET_PRIORITY_PROPERTIES, pid, flag, &mjp, sizeof(mjp)); + + T_QUIET; + T_ASSERT_POSIX_SUCCESS(err, "MEMORYSTATUS_CMD_SET_PRIORITY_PROPERTIES failed"); + return err; +} + +boolean_t +check_properties(pid_t pid, int32_t expected_priority, int32_t expected_limit_mb, uint64_t expected_user_data, boolean_t expected_assertion_state, const char *test) +{ + const char *PROP_CHECK_ERROR_STRING = "property mismatch"; + boolean_t verbose = true; + boolean_t ret; + + int32_t actual_priority = 0; + int32_t actual_limit_mb = 0; + uint64_t actual_user_data = 0; + uint32_t actual_state = 0; + + verbose = false; + (void)get_priority_props(pid, verbose, &actual_priority, &actual_limit_mb, &actual_user_data, &actual_state); + + if (test != NULL) { + T_LOG("check_properties: %s", test); + } + + ret = verify_assertion_state(actual_state, expected_assertion_state); + T_QUIET; + T_ASSERT_TRUE(ret, "verify_assertion_state failed"); + + + /* + * These tests use well defined limits, so we don't try to handle defaults like + * a limit of <= 0 which typically applies a system-wide per process limit. + */ + + if ((actual_priority != expected_priority) || (actual_limit_mb != expected_limit_mb) || (actual_user_data != expected_user_data)) { + /* we have a mismatch */ + T_LOG("%s test failed: %s\n", test, PROP_CHECK_ERROR_STRING); + + if (actual_priority != expected_priority) { + T_LOG("priority mismatch [actual / expected] [%d / %d]", actual_priority, expected_priority); + } + + if (actual_limit_mb != expected_limit_mb) { + T_LOG("limit mismatch [actual / expected] [%d / %d]", actual_limit_mb, expected_limit_mb); + } + + if (actual_user_data != expected_user_data) { + T_LOG("user data mismatch [actual / expected] [0x%llx / 0x%llx]", actual_user_data, expected_user_data); + } + + T_LOG("state is 0x%x\n", actual_state); + log_state(actual_state); + + T_ASSERT_FAIL("check_properties: %s", test); + } else { + T_PASS("check_properties: %s ok", test); + return true; + } + return false; +} + +int +set_assertion_priority(pid_t pid, int32_t priority, uint64_t user_data) +{ + return set_priority(pid, priority, user_data, TRUE); +} + +int +relinquish_assertion_priority(pid_t pid, uint64_t user_data) +{ + return set_assertion_priority(pid, JETSAM_PRIORITY_IDLE, user_data); +} + +int +set_memlimits( + pid_t pid, + int32_t active_limit_mb, int32_t inactive_limit_mb, + boolean_t active_is_fatal, boolean_t inactive_is_fatal) +{ + int err; + memorystatus_memlimit_properties_t mmprops; + + memset(&mmprops, 0, sizeof(memorystatus_memlimit_properties_t)); + + mmprops.memlimit_active = active_limit_mb; + mmprops.memlimit_inactive = inactive_limit_mb; + + if (active_is_fatal) { + mmprops.memlimit_active_attr |= MEMORYSTATUS_MEMLIMIT_ATTR_FATAL; + } else { + mmprops.memlimit_active_attr &= ~(uint32_t)MEMORYSTATUS_MEMLIMIT_ATTR_FATAL; + } + + if (inactive_is_fatal) { + mmprops.memlimit_inactive_attr |= MEMORYSTATUS_MEMLIMIT_ATTR_FATAL; + } else { + mmprops.memlimit_inactive_attr &= ~(uint32_t)MEMORYSTATUS_MEMLIMIT_ATTR_FATAL; + } + + T_LOG("Setting pid[%d] limits active [%d %s] inactive [%d %s]", pid, + mmprops.memlimit_active, (active_is_fatal ? "hard" : "soft"), + mmprops.memlimit_inactive, (inactive_is_fatal ? "hard" : "soft")); + + err = memorystatus_control(MEMORYSTATUS_CMD_SET_MEMLIMIT_PROPERTIES, pid, 0, &mmprops, sizeof(mmprops)); + + T_QUIET; + T_ASSERT_POSIX_SUCCESS(err, "MEMORYSTATUS_CMD_SET_MEMLIMIT_PROPERTIES failed"); + return err; +} + +boolean_t +get_priority_props(pid_t pid, boolean_t verbose, int32_t *priority, int32_t *limit_mb, uint64_t *user_data, uint32_t *state) +{ + memorystatus_priority_entry_t entry = {0}; + + int size = memorystatus_control(MEMORYSTATUS_CMD_GET_PRIORITY_LIST, pid, 0, &entry, sizeof(entry)); + + /* validate size returned */ + if (size <= 0) { + T_ASSERT_FAIL("get_priority: can't get list size: %d!\n", size); + } + + if (size != sizeof(entry)) { + T_ASSERT_FAIL("get_priority: returned unexpected entry size\n"); + } + + if (entry.pid != pid) { + T_ASSERT_FAIL("get_priority: returned unexpected entry pid\n"); + } + + T_LOG("get_priority_props: pid[%d] limit %d, user_data 0x%llx, priority %d, state 0x%x", + entry.pid, entry.limit, entry.user_data, entry.priority, entry.state); + + + if (verbose) { + log_state(entry.state); + } + + if (priority) { + *priority = entry.priority; + } + if (limit_mb) { + *limit_mb = entry.limit; + } + if (user_data) { + *user_data = entry.user_data; + } + if (state) { + *state = entry.state; + } + + return true; +} + +boolean_t +verify_assertion_state(uint32_t state, boolean_t expected_assertion_state) +{ + boolean_t actual_assertion_state; + char *actual_string; + char *expected_string; + + if (expected_assertion_state == ASSERTION_STATE_IS_SET) { + expected_string = "ASSERTION_STATE_IS_SET"; + } else { + expected_string = "ASSERTION_STATE_IS_RELINQUISHED"; + } + + if (state & kMemorystatusAssertion) { + /* + * An assertion driven jetsam priority is at play. + */ + actual_assertion_state = ASSERTION_STATE_IS_SET; + actual_string = "ASSERTION_STATE_IS_SET"; + } else { + /* + * There is no assertion driven jetsam priority in place. + */ + actual_assertion_state = ASSERTION_STATE_IS_RELINQUISHED; + actual_string = "ASSERTION_STATE_IS_RELINQUISHED"; + } + + if (actual_assertion_state == expected_assertion_state) { + T_PASS("%s as expected", expected_string); + return true; + } else { + T_FAIL("state 0x%x: %s but expected %s", state, actual_string, expected_string); + // log_state(state); + return false; /* failed */ + } +} + +static void +log_state(uint32_t state) +{ + T_LOG("\t%s kMemorystatusSuspended", ((state & kMemorystatusSuspended) ? "IS " : "NOT")); + T_LOG("\t%s kMemorystatusFrozen", ((state & kMemorystatusFrozen) ? "IS " : "NOT")); + T_LOG("\t%s kMemorystatusWasThawed", ((state & kMemorystatusWasThawed) ? "IS " : "NOT")); + T_LOG("\t%s kMemorystatusTracked", ((state & kMemorystatusTracked) ? "IS " : "NOT")); + T_LOG("\t%s kMemorystatusSupportsIdleExit", ((state & kMemorystatusSupportsIdleExit) ? "IS " : "NOT")); + T_LOG("\t%s kMemorystatusDirty", ((state & kMemorystatusDirty) ? "IS " : "NOT")); + T_LOG("\t%s kMemorystatusAssertion", ((state & kMemorystatusAssertion) ? "IS " : "NOT")); +} diff --git a/tests/memorystatus_assertion_helpers.h b/tests/memorystatus_assertion_helpers.h new file mode 100644 index 000000000..782774358 --- /dev/null +++ b/tests/memorystatus_assertion_helpers.h @@ -0,0 +1,92 @@ +#ifndef MEMORYSTATUS_ASSERTION_HELPERS_H +#define MEMORYSTATUS_ASSERTION_HELPERS_H + +#include +#include + +#define ASSERTION_STATE_IS_SET true +#define ASSERTION_STATE_IS_RELINQUISHED false + +/* Helper functions for setting and checking memorystatus assertions + * on processes. + */ + +/* + * Set the jetsam priority and user data for a process. + * + * If this request is assertion driven, the kernel will + * set the process's assertion priority. + * + * If this request is not assertion driven, the kernel + * will set the process's requested priority. + * + * The kernel will then apply policy and move the process + * to the appropriate jetsam priority. + * + * Returns: 0 on success + * non-0 on failure + */ +int +set_priority(pid_t pid, int32_t priority, uint64_t user_data, boolean_t is_assertion_driven); + +/* + * Return: true on success + * false on failure --> this asserts a failure and quits test + */ +boolean_t +check_properties(pid_t pid, int32_t expected_priority, int32_t expected_limit_mb, uint64_t expected_user_data, boolean_t expected_assertion_state, const char *test); + +/* + * Set the active and inactive memlimits for a process. + * Set the fatalness for each limit. + * + * Returns: 0 on success + * non-zero on failure + */ +int +set_memlimits( + pid_t pid, + int32_t active_limit_mb, int32_t inactive_limit_mb, + boolean_t active_is_fatal, boolean_t inactive_is_fatal); + +/* + * Returns: 0 on success + * non-0 on failure + */ +int +set_assertion_priority(pid_t pid, int32_t priority, uint64_t user_data); + +/* + * Returns: 0 on success + * non-0 on failure + */ +int +relinquish_assertion_priority(pid_t pid, uint64_t user_data); + +/* + * Get the priority properties for a single process. + * + * This returns the process's effective jetsam priority, jetsam limit, + * user_data (not kernel related), and proc's kernel state. + * If this call fails, there is no reason to continue the test. + * + * Return: true on success + * false on failure --> this asserts fail and test quits + */ +boolean_t +get_priority_props(pid_t pid, boolean_t verbose, int32_t *priority, int32_t *limit_mb, uint64_t *user_data, uint32_t *state); + +/* + * Input: + * state: kernel state bits from the get_priority_props() call + * expected_assertion_state: + * true if process should be holding an assertion state. + * false if no assertion state is held (eg: relinquished). + * + * Return true: verification passed + * false: verification failed + */ +boolean_t +verify_assertion_state(uint32_t state, boolean_t expected_assertion_state); + +#endif /* MEMORYSTATUS_ASSERTION_HELPERS_H */ diff --git a/tests/memorystatus_freeze_test.c b/tests/memorystatus_freeze_test.c index abd029e11..c9399519a 100644 --- a/tests/memorystatus_freeze_test.c +++ b/tests/memorystatus_freeze_test.c @@ -3,6 +3,10 @@ #include #include #include +#include +#include /* Needed for vm_region info */ +#include +#include #ifdef T_NAMESPACE #undef T_NAMESPACE @@ -10,6 +14,8 @@ #include #include +#include "memorystatus_assertion_helpers.h" + T_GLOBAL_META( T_META_NAMESPACE("xnu.vm"), T_META_CHECK_LEAKS(false) @@ -17,6 +23,7 @@ T_GLOBAL_META( #define MEM_SIZE_MB 10 #define NUM_ITERATIONS 5 +#define FREEZE_PAGES_MAX 256 #define CREATE_LIST(X) \ X(SUCCESS) \ @@ -29,6 +36,7 @@ T_GLOBAL_META( X(MEMORYSTATUS_CONTROL_FAILED) \ X(IS_FREEZABLE_NOT_AS_EXPECTED) \ X(MEMSTAT_PRIORITY_CHANGE_FAILED) \ + X(INVALID_ALLOCATE_PAGES_ARGUMENTS) \ X(EXIT_CODE_MAX) #define EXIT_CODES_ENUM(VAR) VAR, @@ -41,14 +49,199 @@ static const char *exit_codes_str[] = { CREATE_LIST(EXIT_CODES_STRING) }; +static int +get_vmpage_size() +{ + int vmpage_size; + size_t size = sizeof(vmpage_size); + int ret = sysctlbyname("vm.pagesize", &vmpage_size, &size, NULL, 0); + T_QUIET; T_ASSERT_POSIX_SUCCESS(ret, "failed to query vm.pagesize"); + T_QUIET; T_ASSERT_GT(vmpage_size, 0, "vm.pagesize is not > 0"); + return vmpage_size; +} -static pid_t pid = -1; +static pid_t child_pid = -1; static int freeze_count = 0; void move_to_idle_band(void); -void run_freezer_test(int size_mb); +void run_freezer_test(int); void freeze_helper_process(void); +/* Gets and optionally sets the freeze pages max threshold */ +int sysctl_freeze_pages_max(int* new_value); + +/* NB: in_shared_region and get_rprvt are pulled from the memorystatus unit test. + * We're moving away from those unit tests, so they're copied here. + */ + +/* Cribbed from 'top'... */ +static int +in_shared_region(mach_vm_address_t addr, cpu_type_t type) +{ + mach_vm_address_t base = 0, size = 0; + + switch (type) { + case CPU_TYPE_ARM: + base = SHARED_REGION_BASE_ARM; + size = SHARED_REGION_SIZE_ARM; + break; + + case CPU_TYPE_ARM64: + base = SHARED_REGION_BASE_ARM64; + size = SHARED_REGION_SIZE_ARM64; + break; + + + case CPU_TYPE_X86_64: + base = SHARED_REGION_BASE_X86_64; + size = SHARED_REGION_SIZE_X86_64; + break; + + case CPU_TYPE_I386: + base = SHARED_REGION_BASE_I386; + size = SHARED_REGION_SIZE_I386; + break; + + case CPU_TYPE_POWERPC: + base = SHARED_REGION_BASE_PPC; + size = SHARED_REGION_SIZE_PPC; + break; + + case CPU_TYPE_POWERPC64: + base = SHARED_REGION_BASE_PPC64; + size = SHARED_REGION_SIZE_PPC64; + break; + + default: { + int t = type; + + fprintf(stderr, "unknown CPU type: 0x%x\n", t); + abort(); + } + } + + return addr >= base && addr < (base + size); +} + +/* Get the resident private memory of the given pid */ +static unsigned long long +get_rprvt(pid_t pid) +{ + mach_port_name_t task; + kern_return_t kr; + + mach_vm_size_t rprvt = 0; + mach_vm_size_t empty = 0; + mach_vm_size_t fw_private = 0; + mach_vm_size_t pagesize = vm_kernel_page_size; // The vm_region page info is reported + // in terms of vm_kernel_page_size. + mach_vm_size_t regs = 0; + + mach_vm_address_t addr; + mach_vm_size_t size; + + int split = 0; + + kr = task_for_pid(mach_task_self(), pid, &task); + T_QUIET; T_ASSERT_TRUE(kr == KERN_SUCCESS, "Unable to get task_for_pid of child"); + + for (addr = 0;; addr += size) { + vm_region_top_info_data_t info; + mach_msg_type_number_t count = VM_REGION_TOP_INFO_COUNT; + mach_port_t object_name; + + kr = mach_vm_region(task, &addr, &size, VM_REGION_TOP_INFO, (vm_region_info_t)&info, &count, &object_name); + if (kr != KERN_SUCCESS) { + break; + } + +#if defined (__arm64__) + if (in_shared_region(addr, CPU_TYPE_ARM64)) { +#else + if (in_shared_region(addr, CPU_TYPE_ARM)) { +#endif + // Private Shared + fw_private += info.private_pages_resident * pagesize; + + /* + * Check if this process has the globally shared + * text and data regions mapped in. If so, set + * split to TRUE and avoid checking + * again. + */ + if (split == FALSE && info.share_mode == SM_EMPTY) { + vm_region_basic_info_data_64_t b_info; + mach_vm_address_t b_addr = addr; + mach_vm_size_t b_size = size; + count = VM_REGION_BASIC_INFO_COUNT_64; + + kr = mach_vm_region(task, &b_addr, &b_size, VM_REGION_BASIC_INFO_64, (vm_region_info_t)&b_info, &count, &object_name); + if (kr != KERN_SUCCESS) { + break; + } + + if (b_info.reserved) { + split = TRUE; + } + } + + /* + * Short circuit the loop if this isn't a shared + * private region, since that's the only region + * type we care about within the current address + * range. + */ + if (info.share_mode != SM_PRIVATE) { + continue; + } + } + regs++; + + /* + * Update counters according to the region type. + */ + + if (info.share_mode == SM_COW && info.ref_count == 1) { + // Treat single reference SM_COW as SM_PRIVATE + info.share_mode = SM_PRIVATE; + } + + switch (info.share_mode) { + case SM_LARGE_PAGE: + // Treat SM_LARGE_PAGE the same as SM_PRIVATE + // since they are not shareable and are wired. + case SM_PRIVATE: + rprvt += info.private_pages_resident * pagesize; + rprvt += info.shared_pages_resident * pagesize; + break; + + case SM_EMPTY: + empty += size; + break; + + case SM_COW: + case SM_SHARED: + if (pid == 0) { + // Treat kernel_task specially + if (info.share_mode == SM_COW) { + rprvt += info.private_pages_resident * pagesize; + } + break; + } + + if (info.share_mode == SM_COW) { + rprvt += info.private_pages_resident * pagesize; + } + break; + + default: + assert(0); + break; + } + } + + return rprvt; +} void move_to_idle_band(void) @@ -75,9 +268,12 @@ freeze_helper_process(void) { size_t length; int ret, freeze_enabled, errno_freeze_sysctl; + uint64_t resident_memory_before, resident_memory_after, vmpage_size; + vmpage_size = (uint64_t) get_vmpage_size(); + resident_memory_before = get_rprvt(child_pid) / vmpage_size; - T_LOG("Freezing child pid %d", pid); - ret = sysctlbyname("kern.memorystatus_freeze", NULL, NULL, &pid, sizeof(pid)); + T_LOG("Freezing child pid %d", child_pid); + ret = sysctlbyname("kern.memorystatus_freeze", NULL, NULL, &child_pid, sizeof(child_pid)); errno_freeze_sysctl = errno; sleep(1); @@ -85,7 +281,7 @@ freeze_helper_process(void) * The child process toggles its freezable state on each iteration. * So a failure for every alternate freeze is expected. */ - if (freeze_count % 2 == 0) { + if (freeze_count % 2) { length = sizeof(freeze_enabled); T_QUIET; T_ASSERT_POSIX_SUCCESS(sysctlbyname("vm.freeze_enabled", &freeze_enabled, &length, NULL, 0), "failed to query vm.freeze_enabled"); @@ -97,9 +293,13 @@ freeze_helper_process(void) T_LOG("Freeze has been disabled. Terminating early."); T_END; } - - T_LOG("Freeze succeeded. Thawing child pid %d", pid); - ret = sysctlbyname("kern.memorystatus_thaw", NULL, NULL, &pid, sizeof(pid)); + resident_memory_after = get_rprvt(child_pid) / vmpage_size; + uint64_t freeze_pages_max = (uint64_t) sysctl_freeze_pages_max(NULL); + T_QUIET; T_ASSERT_LT(resident_memory_after, resident_memory_before, "Freeze didn't reduce resident memory set"); + if (resident_memory_before > freeze_pages_max) { + T_QUIET; T_ASSERT_LE(resident_memory_before - resident_memory_after, freeze_pages_max, "Freeze pages froze more than the threshold."); + } + ret = sysctlbyname("kern.memorystatus_thaw", NULL, NULL, &child_pid, sizeof(child_pid)); T_QUIET; T_ASSERT_POSIX_SUCCESS(ret, "sysctl kern.memorystatus_thaw failed"); } else { T_QUIET; T_ASSERT_TRUE(ret != KERN_SUCCESS, "Freeze should have failed"); @@ -108,11 +308,11 @@ freeze_helper_process(void) freeze_count++; - T_QUIET; T_ASSERT_POSIX_SUCCESS(kill(pid, SIGUSR1), "failed to send SIGUSR1 to child process"); + T_QUIET; T_ASSERT_POSIX_SUCCESS(kill(child_pid, SIGUSR1), "failed to send SIGUSR1 to child process"); } void -run_freezer_test(int size_mb) +run_freezer_test(int num_pages) { int ret, freeze_enabled; char sz_str[50]; @@ -138,7 +338,7 @@ run_freezer_test(int size_mb) if (freeze_count < NUM_ITERATIONS) { freeze_helper_process(); } else { - kill(pid, SIGKILL); + kill(child_pid, SIGKILL); dispatch_source_cancel(ds_freeze); } }); @@ -149,7 +349,7 @@ run_freezer_test(int size_mb) T_QUIET; T_ASSERT_POSIX_ZERO(ret, "_NSGetExecutablePath"); T_LOG("Executable path: %s", testpath); - sprintf(sz_str, "%d", size_mb); + sprintf(sz_str, "%d", num_pages); launch_tool_args = (char *[]){ testpath, "-n", @@ -160,19 +360,19 @@ run_freezer_test(int size_mb) }; /* Spawn the child process. Suspend after launch until the exit proc handler has been set up. */ - ret = dt_launch_tool(&pid, launch_tool_args, true, NULL, NULL); + ret = dt_launch_tool(&child_pid, launch_tool_args, true, NULL, NULL); if (ret != 0) { T_LOG("dt_launch tool returned %d with error code %d", ret, errno); } - T_QUIET; T_ASSERT_POSIX_SUCCESS(pid, "dt_launch_tool"); + T_QUIET; T_ASSERT_POSIX_SUCCESS(child_pid, "dt_launch_tool"); - ds_proc = dispatch_source_create(DISPATCH_SOURCE_TYPE_PROC, (uintptr_t)pid, DISPATCH_PROC_EXIT, dispatch_get_main_queue()); + ds_proc = dispatch_source_create(DISPATCH_SOURCE_TYPE_PROC, (uintptr_t)child_pid, DISPATCH_PROC_EXIT, dispatch_get_main_queue()); T_QUIET; T_ASSERT_NOTNULL(ds_proc, "dispatch_source_create (ds_proc)"); dispatch_source_set_event_handler(ds_proc, ^{ int status = 0, code = 0; - pid_t rc = waitpid(pid, &status, 0); - T_QUIET; T_ASSERT_EQ(rc, pid, "waitpid"); + pid_t rc = waitpid(child_pid, &status, 0); + T_QUIET; T_ASSERT_EQ(rc, child_pid, "waitpid"); code = WEXITSTATUS(status); if (code == 0) { @@ -185,35 +385,24 @@ run_freezer_test(int size_mb) }); dispatch_activate(ds_proc); - T_QUIET; T_ASSERT_POSIX_SUCCESS(kill(pid, SIGCONT), "failed to send SIGCONT to child process"); + T_QUIET; T_ASSERT_POSIX_SUCCESS(kill(child_pid, SIGCONT), "failed to send SIGCONT to child process"); dispatch_main(); } -T_HELPER_DECL(allocate_pages, - "allocates pages to freeze", - T_META_ASROOT(true)) { - int i, j, ret, size_mb, vmpgsize; - size_t len; +static void +allocate_pages(int num_pages) +{ + int i, j, vmpgsize; char val; - __block int num_pages, num_iter = 0; + __block int num_iter = 0; __block char **buf; dispatch_source_t ds_signal; - - len = sizeof(vmpgsize); - ret = sysctlbyname("vm.pagesize", &vmpgsize, &len, NULL, 0); - if (ret != 0) { - exit(SYSCTL_VM_PAGESIZE_FAILED); - } - if (vmpgsize == 0) { - exit(VM_PAGESIZE_IS_ZERO); - } - - if (argc < 1) { - exit(TOO_FEW_ARGUMENTS); + vmpgsize = get_vmpage_size(); + if (num_pages < 1) { + printf("Invalid number of pages to allocate: %d\n", num_pages); + exit(INVALID_ALLOCATE_PAGES_ARGUMENTS); } - size_mb = atoi(argv[0]); - num_pages = size_mb * 1024 * 1024 / vmpgsize; buf = (char**)malloc(sizeof(char*) * (size_t)num_pages); /* Gives us the compression ratio we see in the typical case (~2.7) */ @@ -252,6 +441,10 @@ T_HELPER_DECL(allocate_pages, } current_state = memorystatus_control(MEMORYSTATUS_CMD_GET_PROCESS_IS_FREEZABLE, getpid(), 0, NULL, 0); + /* Sysprocs start off as unfreezable. Verify that first. */ + if (num_iter == 0 && current_state != 0) { + exit(IS_FREEZABLE_NOT_AS_EXPECTED); + } /* Toggle freezable state */ new_state = (current_state) ? 0: 1; @@ -278,6 +471,182 @@ T_HELPER_DECL(allocate_pages, dispatch_main(); } -T_DECL(freeze, "VM freezer test") { - run_freezer_test(MEM_SIZE_MB); +T_HELPER_DECL(allocate_pages, + "allocates pages to freeze", + T_META_ASROOT(true)) { + if (argc < 1) { + exit(TOO_FEW_ARGUMENTS); + } + + int num_pages = atoi(argv[0]); + allocate_pages(num_pages); +} + +T_DECL(freeze, "VM freezer test", T_META_ASROOT(true)) { + run_freezer_test( + (MEM_SIZE_MB << 20) / get_vmpage_size()); +} + +static int old_freeze_pages_max = 0; +static void +reset_freeze_pages_max() +{ + if (old_freeze_pages_max != 0) { + sysctl_freeze_pages_max(&old_freeze_pages_max); + } +} + +int +sysctl_freeze_pages_max(int* new_value) +{ + static int set_end_handler = false; + int freeze_pages_max, ret; + size_t size = sizeof(freeze_pages_max); + ret = sysctlbyname("kern.memorystatus_freeze_pages_max", &freeze_pages_max, &size, new_value, size); + T_QUIET; T_ASSERT_POSIX_SUCCESS(ret, "Unable to query kern.memorystatus_freeze_pages_max"); + if (!set_end_handler) { + // Save the original value and instruct darwintest to restore it after the test completes + old_freeze_pages_max = freeze_pages_max; + T_ATEND(reset_freeze_pages_max); + set_end_handler = true; + } + return old_freeze_pages_max; +} + +T_DECL(freeze_over_max_threshold, "Max Freeze Threshold is Enforced", T_META_ASROOT(true)) { + int freeze_pages_max = FREEZE_PAGES_MAX; + sysctl_freeze_pages_max(&freeze_pages_max); + run_freezer_test(FREEZE_PAGES_MAX * 2); +} + +T_HELPER_DECL(frozen_background, "Frozen background process", T_META_ASROOT(true)) { + kern_return_t kern_ret; + /* Set the process to freezable */ + kern_ret = memorystatus_control(MEMORYSTATUS_CMD_SET_PROCESS_IS_FREEZABLE, getpid(), 1, NULL, 0); + T_QUIET; T_ASSERT_EQ(kern_ret, KERN_SUCCESS, "set process is freezable"); + /* Signal to our parent that we can be frozen */ + if (kill(getppid(), SIGUSR1) != 0) { + T_LOG("Unable to signal to parent process!"); + exit(1); + } + while (1) { + ; + } +} + +/* Launches the frozen_background helper as a managed process. */ +static pid_t +launch_frozen_background_process() +{ + pid_t pid; + char **launch_tool_args; + char testpath[PATH_MAX]; + uint32_t testpath_buf_size; + int ret; + + testpath_buf_size = sizeof(testpath); + ret = _NSGetExecutablePath(testpath, &testpath_buf_size); + printf("Launching %s\n", testpath); + launch_tool_args = (char *[]){ + testpath, + "-n", + "frozen_background", + NULL + }; + ret = dt_launch_tool(&pid, launch_tool_args, false, NULL, NULL); + if (ret != 0) { + T_LOG("dt_launch tool returned %d with error code %d", ret, errno); + } + T_QUIET; T_ASSERT_POSIX_SUCCESS(ret, "dt_launch_tool"); + /* Set the process's managed bit, so that the kernel treats this process like an app instead of a sysproc. */ + ret = memorystatus_control(MEMORYSTATUS_CMD_SET_PROCESS_IS_MANAGED, pid, 1, NULL, 0); + T_QUIET; T_ASSERT_POSIX_SUCCESS(ret, "memorystatus_control"); + return pid; +} + +static void +freeze_process(pid_t pid) +{ + int ret, freeze_enabled, errno_freeze_sysctl; + size_t length; + T_LOG("Freezing pid %d", pid); + + ret = sysctlbyname("kern.memorystatus_freeze", NULL, NULL, &pid, sizeof(pid)); + errno_freeze_sysctl = errno; + length = sizeof(freeze_enabled); + T_QUIET; T_ASSERT_POSIX_SUCCESS(sysctlbyname("vm.freeze_enabled", &freeze_enabled, &length, NULL, 0), + "failed to query vm.freeze_enabled"); + if (freeze_enabled) { + errno = errno_freeze_sysctl; + T_QUIET; T_ASSERT_POSIX_SUCCESS(ret, "sysctl kern.memorystatus_freeze failed"); + } else { + /* If freezer is disabled, skip the test. This can happen due to disk space shortage. */ + T_LOG("Freeze has been disabled. Terminating early."); + T_END; + } +} + +static void +memorystatus_assertion_test_demote_frozen() +{ +#if !CONFIG_EMBEDDED + T_SKIP("Freezing processes is only supported on embedded"); +#endif + /* + * Test that if we assert a priority on a process, freeze it, and then demote all frozen processes, it does not get demoted below the asserted priority. + * Then remove thee assertion, and ensure it gets demoted properly. + */ + /* these values will remain fixed during testing */ + int active_limit_mb = 15; /* arbitrary */ + int inactive_limit_mb = 7; /* arbitrary */ + /* Launch the child process, and elevate its priority */ + int requestedpriority; + dispatch_source_t ds_signal, ds_exit; + requestedpriority = JETSAM_PRIORITY_UI_SUPPORT; + + /* Wait for the child process to tell us that it's ready, and then freeze it */ + signal(SIGUSR1, SIG_IGN); + ds_signal = dispatch_source_create(DISPATCH_SOURCE_TYPE_SIGNAL, SIGUSR1, 0, dispatch_get_main_queue()); + T_QUIET; T_ASSERT_NOTNULL(ds_signal, "dispatch_source_create"); + dispatch_source_set_event_handler(ds_signal, ^{ + int sysctl_ret; + /* Freeze the process, trigger agressive demotion, and check that it hasn't been demoted. */ + freeze_process(child_pid); + /* Agressive demotion */ + sysctl_ret = sysctlbyname("kern.memorystatus_demote_frozen_processes", NULL, NULL, NULL, 0); + T_ASSERT_POSIX_SUCCESS(sysctl_ret, "sysctl kern.memorystatus_demote_frozen_processes failed"); + /* Check */ + (void)check_properties(child_pid, requestedpriority, inactive_limit_mb, 0x0, ASSERTION_STATE_IS_SET, "Priority was set"); + T_LOG("Relinquishing our assertion."); + /* Relinquish our assertion, and check that it gets demoted. */ + relinquish_assertion_priority(child_pid, 0x0); + (void)check_properties(child_pid, JETSAM_PRIORITY_AGING_BAND2, inactive_limit_mb, 0x0, ASSERTION_STATE_IS_RELINQUISHED, "Assertion was reqlinquished."); + /* Kill the child */ + T_QUIET; T_ASSERT_POSIX_SUCCESS(kill(child_pid, SIGKILL), "Unable to kill child process"); + T_END; + }); + + /* Launch the child process and set the initial properties on it. */ + child_pid = launch_frozen_background_process(); + set_memlimits(child_pid, active_limit_mb, inactive_limit_mb, false, false); + set_assertion_priority(child_pid, requestedpriority, 0x0); + (void)check_properties(child_pid, requestedpriority, inactive_limit_mb, 0x0, ASSERTION_STATE_IS_SET, "Priority was set"); + /* Listen for exit. */ + ds_exit = dispatch_source_create(DISPATCH_SOURCE_TYPE_PROC, (uintptr_t)child_pid, DISPATCH_PROC_EXIT, dispatch_get_main_queue()); + dispatch_source_set_event_handler(ds_exit, ^{ + int status = 0, code = 0; + pid_t rc = waitpid(child_pid, &status, 0); + T_QUIET; T_ASSERT_EQ(rc, child_pid, "waitpid"); + code = WEXITSTATUS(status); + T_QUIET; T_ASSERT_EQ(code, 0, "Child exited cleanly"); + T_END; + }); + + dispatch_activate(ds_exit); + dispatch_activate(ds_signal); + dispatch_main(); +} + +T_DECL(assertion_test_demote_frozen, "demoted frozen process goes to asserted priority.", T_META_ASROOT(true)) { + memorystatus_assertion_test_demote_frozen(); } diff --git a/tests/memorystatus_is_assertion.c b/tests/memorystatus_is_assertion.c new file mode 100644 index 000000000..6475513e4 --- /dev/null +++ b/tests/memorystatus_is_assertion.c @@ -0,0 +1,506 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "memorystatus_assertion_helpers.h" + +T_GLOBAL_META( + T_META_NAMESPACE("xnu.vm"), + T_META_CHECK_LEAKS(false) + ); + +extern char **environ; + +/* + * This test has multiple sub-tests that set and then verify jetsam priority transitions + * as though they were driven by assertions. It uses the MEMORYSTATUS_CMD_SET_MEMLIMIT_PROPERTIES + * version of the memorystatus_control() system call and specifically tests the use of the + * MEMORYSTATUS_SET_PRIORITY_ASSERTION flag. + * + * The kernel will apply policy that chooses a maximum jetsam priority, resolving conflicts + * between an assertion driven priority and clean/dirty transition policy. + * + * Processes that do not opt into dirty-tracking should behave as they always have. + * This is the typical App transition behavior. + * + * Processes that do opt into dirty-tracking have more complex policy: + * For example: + * A MAX assertion priority will prevent a dirty process from transitioning to a clean + * state if the process opts into idle-exit. + * See: memorystatus_schedule_idle_demotion_locked() where we note that + * the process isn't going to be making the trip to the lower bands. + * + * But a MAX assertion evaluation will not prevent a clean process from transition to dirty. + * Assertion driven priorities should not change memory limits, they are expected to + * just change a process's position in the jetsam priority bands. + * + * MEMORYSTATUS_CMD_xxx requires root (in the absence of entitlement). + * Use T_META_ASROOT(true) to accomplish this. + * + * A note on test strategy. It is not necessary to spawn a child to test these + * assertion calls. The test can act on itself, that is, it can make calls to + * set and relinquish assertion state just like it can make calls to do dirty/clean + * transitions. Of course, in reality, we expect only runningboardd to manipulate + * assertion based priorities. + */ + +/* + * New flag to tell kernel this is an assertion driven priority update. + */ +#ifndef MEMORYSTATUS_SET_PRIORITY_ASSERTION +#define MEMORYSTATUS_SET_PRIORITY_ASSERTION 0x1 +#endif + +static void +proc_will_set_clean(pid_t pid) +{ + proc_set_dirty(pid, false); + T_LOG("pid[%d] --> now clean", pid); + return; +} + +static void +proc_will_set_dirty(pid_t pid) +{ + proc_set_dirty(pid, true); + T_LOG("pid[%d] --> now dirty", pid); + return; +} + +#define kJetsamAgingPolicyNone (0) +#define kJetsamAgingPolicyLegacy (1) +#define kJetsamAgingPolicySysProcsReclaimedFirst (2) +#define kJetsamAgingPolicyAppsReclaimedFirst (3) +#define kJetsamAgingPolicyMax kJetsamAgingPolicyAppsReclaimedFirst + +#ifndef kMemorystatusAssertion +#define kMemorystatusAssertion 0x40 +#endif + +/* + * Make repetitive (eg: back-to-back) calls using MEMORYSTATUS_SET_PRIORITY_ASSERTION. + * We know that runningboardd may try to relinquish its hold on an assertion priority + * when it hasn't first set the assertion priority. The kernel must survive this + * pattern even though it might be considered poor behavior on runningboardd's part. + * When dirty tracking processes are involved, we are exercising the kernel's + * idle-deferred paths. Only assertion state (whether or not assertion state is + * set or relinquished) is verified in this round of tests. + * Test is invoked three times: + * Scenario 1) as a non-dirty-tracking process (like a typical app) + * relinquish assertion priority multiple times + * set same assertion priority multiple times. + * Scenario 2) setup a dirty-tracking process that is clean (like a typical extension) + * relinquish assertion priority multiple times + * set same assertion priority multiple times. + * Scenario 3) setup dirty-tracking process that is dirty (like a typical extension) + * relinquish assertion priority multiple times + * set same assertion priority multiple times. + */ + +static void +memorystatus_assertion_test_repetitive(char *test, boolean_t turn_on_dirty_tracking, boolean_t start_clean) +{ + int count; + int maxcount = 3; + boolean_t verbose; + uint32_t state; + uint64_t user_data = 0; + pid_t mypid = getpid(); + + /* these values will remain fixed during testing */ + int active_limit_mb = 15; /* arbitrary */ + int inactive_limit_mb = 7; /* arbitrary */ + + /* these values may vary during test */ + int requestedpriority = 0; + int assertionpriority = 0; + + T_SETUPBEGIN; + + requestedpriority = JETSAM_PRIORITY_UI_SUPPORT; + assertionpriority = JETSAM_PRIORITY_FOREGROUND; + set_memlimits(mypid, active_limit_mb, inactive_limit_mb, true, true); + set_priority(mypid, requestedpriority, 0, false); + + if (turn_on_dirty_tracking) { + proc_track_dirty(mypid, (PROC_DIRTY_TRACK | PROC_DIRTY_ALLOW_IDLE_EXIT | PROC_DIRTY_DEFER)); + + if (start_clean) { + proc_will_set_clean(mypid); + } else { + proc_will_set_dirty(mypid); + } + } else { + /* + * Do nothing. + * Acts like an app with no dirty tracking + * By default launches in the requested priority and is + * considered idle because it's below FG band. + */ + } + + + verbose = false; + (void)get_priority_props(mypid, verbose, NULL, NULL, NULL, NULL); + + /* log current setup state */ + T_LOG("SETUP STATE COMPLETE: Test %s", test); + + T_SETUPEND; + + int i; + boolean_t ret; + for (i = 0; i < 2; i++) { + if (i == 1 && turn_on_dirty_tracking) { + T_LOG("Avoid idle-deferred - sleeping for 20"); + sleep(20); + + if (start_clean) { + proc_will_set_dirty(mypid); + } else { + proc_will_set_clean(mypid); + } + + (void)get_priority_props(mypid, verbose, NULL, NULL, NULL, &state); + } + + /* + * Relinquish assertion priority even though we don't + * currently hold an assertion priority. + */ + for (count = 0; count < maxcount; count++) { + if (relinquish_assertion_priority(mypid, user_data)) { + T_ASSERT_FAIL("relinquish_assertion_priority failed"); + } + } + + /* Verify assertion state is relinquished */ + (void)get_priority_props(mypid, verbose, NULL, NULL, NULL, &state); + + ret = verify_assertion_state(state, ASSERTION_STATE_IS_RELINQUISHED); + T_QUIET; + T_ASSERT_TRUE(ret, "verify_assertion_state failed"); + + + + /* + * Set an assertion priority multiple times in a row. + */ + for (count = 0; count < maxcount; count++) { + if (set_assertion_priority(mypid, assertionpriority, user_data) != 0) { + T_ASSERT_FAIL("set_assertion_priority failed"); + } + } + + /* Verify state holds an assertion priority */ + (void)get_priority_props(mypid, verbose, NULL, NULL, NULL, &state); + + ret = verify_assertion_state(state, ASSERTION_STATE_IS_SET); + T_QUIET; + T_ASSERT_TRUE(ret, "verify_assertion_state failed"); + } +} + +/* + * Process is dirty tracking and opts into pressured exit. + */ +static void +memorystatus_assertion_test_allow_idle_exit() +{ + pid_t mypid = getpid(); + + /* these values will remain fixed during testing */ + int active_limit_mb = 15; /* arbitrary */ + int inactive_limit_mb = 7; /* arbitrary */ + + /* these values may vary during test */ + int requestedpriority = JETSAM_PRIORITY_UI_SUPPORT; + + T_SETUPBEGIN; + + set_memlimits(mypid, active_limit_mb, inactive_limit_mb, true, true); + set_priority(mypid, requestedpriority, 0, false); + + proc_track_dirty(mypid, (PROC_DIRTY_TRACK | PROC_DIRTY_ALLOW_IDLE_EXIT | PROC_DIRTY_DEFER)); + + proc_will_set_clean(mypid); + + (void)check_properties(mypid, JETSAM_PRIORITY_IDLE_DEFERRED, inactive_limit_mb, 0x0, ASSERTION_STATE_IS_RELINQUISHED, "Clean start"); + + T_LOG("SETUP STATE COMPLETE"); + + int g_jetsam_aging_policy = 0; + /* + * Jetsam aging policy + * Failure to retrieve is not fatal. + */ + size_t size = sizeof(g_jetsam_aging_policy); + if (sysctlbyname("kern.jetsam_aging_policy", &g_jetsam_aging_policy, &size, NULL, 0) != 0) { + T_LOG(__func__, true, "Unable to retrieve jetsam aging policy (not fatal)"); + } + + T_SETUPEND; + + /* + * Relinquish assertion priority even though we don't hold it. No change in state expected. + */ + T_LOG("********Test0 clean: no state change on relinquish"); + relinquish_assertion_priority(mypid, 0xF00D); + (void)check_properties(mypid, JETSAM_PRIORITY_IDLE_DEFERRED, inactive_limit_mb, 0xF00D, ASSERTION_STATE_IS_RELINQUISHED, "Test0"); + + T_LOG("********Test1 clean: deferred now assertion[10]"); + set_assertion_priority(mypid, JETSAM_PRIORITY_FOREGROUND, 0xFEED); + (void)check_properties(mypid, JETSAM_PRIORITY_FOREGROUND, inactive_limit_mb, 0xFEED, ASSERTION_STATE_IS_SET, "Test1"); + + /* Test2 */ + T_LOG("********Test2 clean: assertion[10 -> 3]"); + set_assertion_priority(mypid, JETSAM_PRIORITY_BACKGROUND, 0xFACE); + (void)check_properties(mypid, JETSAM_PRIORITY_BACKGROUND, inactive_limit_mb, 0xFACE, ASSERTION_STATE_IS_SET, "Test2"); + + /* Test3 */ + T_LOG("********Test3 clean: assertion[3 -> 0], but now deferred"); + relinquish_assertion_priority(mypid, 0xBEEF); + (void)check_properties(mypid, JETSAM_PRIORITY_IDLE_DEFERRED, inactive_limit_mb, 0xBEEF, ASSERTION_STATE_IS_RELINQUISHED, "Test3"); + + /* Test4 */ + T_LOG("********Test4 clean: deferred now assertion[10]"); + set_assertion_priority(mypid, JETSAM_PRIORITY_FOREGROUND, 0xFEED); + (void)check_properties(mypid, JETSAM_PRIORITY_FOREGROUND, inactive_limit_mb, 0xFEED, ASSERTION_STATE_IS_SET, "Test4"); + + T_LOG("Avoid idle-deferred moving forward. Sleeping for 20"); + sleep(20); + + /* Test5 */ + T_LOG("********Test5 dirty: set dirty priority but assertion[10] prevails"); + proc_will_set_dirty(mypid); /* active priority is less than FG*/ + (void)check_properties(mypid, JETSAM_PRIORITY_FOREGROUND, active_limit_mb, 0xFEED, ASSERTION_STATE_IS_SET, "Test5"); + + /* Test6 */ + T_LOG("********Test6 dirty: assertion[10 -> 3] but dirty priority prevails"); + set_assertion_priority(mypid, JETSAM_PRIORITY_BACKGROUND, 0xFEEB); /* active priority is > BG */ + (void)check_properties(mypid, JETSAM_PRIORITY_UI_SUPPORT, active_limit_mb, 0xFEEB, ASSERTION_STATE_IS_SET, "Test6"); + + /* Test7 */ + T_LOG("********Test7 dirty: assertion[3 -> 0] but dirty prevails"); + relinquish_assertion_priority(mypid, 0xBEEF); + (void)check_properties(mypid, JETSAM_PRIORITY_UI_SUPPORT, active_limit_mb, 0xBEEF, ASSERTION_STATE_IS_RELINQUISHED, "Test7"); + + + /* Test8 */ + T_LOG("********Test8 dirty: assertion[0 -> 10] overrides dirty"); + set_assertion_priority(mypid, JETSAM_PRIORITY_FOREGROUND, 0xFEED); + (void)check_properties(mypid, JETSAM_PRIORITY_FOREGROUND, active_limit_mb, 0xFEED, ASSERTION_STATE_IS_SET, "Test8"); + + /* Test9 */ + T_LOG("********Test9 dirty wants to go clean, but clean state is prevented as assertion[10] prevails"); + proc_will_set_clean(mypid); + (void)check_properties(mypid, JETSAM_PRIORITY_FOREGROUND, active_limit_mb, 0xFEED, ASSERTION_STATE_IS_SET, "Test9"); + + /* Test10 */ + T_LOG("********Test10 dirty goes dirty and stays dirty, and assertion[10] prevails again"); + proc_will_set_dirty(mypid); + (void)check_properties(mypid, JETSAM_PRIORITY_FOREGROUND, active_limit_mb, 0xFEED, ASSERTION_STATE_IS_SET, "Test10"); + + /* Test11 */ + T_LOG("********Test11 dirty: assertion[10 -> 3] but dirty prevails"); + set_assertion_priority(mypid, JETSAM_PRIORITY_BACKGROUND, 0xFACE); + (void)check_properties(mypid, JETSAM_PRIORITY_UI_SUPPORT, active_limit_mb, 0xFACE, ASSERTION_STATE_IS_SET, "Test11"); + + /* Test12 */ + T_LOG("********Test12 dirty: assertion[3 -> 0] but dirty prevails"); + relinquish_assertion_priority(mypid, 0xBEEF); + (void)check_properties(mypid, JETSAM_PRIORITY_UI_SUPPORT, active_limit_mb, 0xBEEF, ASSERTION_STATE_IS_RELINQUISHED, "Test12"); + + + /* Test13 */ + T_LOG("********Test13 dirty goes clean: both assertion[0] and clean"); + proc_will_set_clean(mypid); + if (g_jetsam_aging_policy == kJetsamAgingPolicySysProcsReclaimedFirst) { + /* For sysproc aging policy the daemon should be at idle deferred and with an active memory limit */ + (void)check_properties(mypid, JETSAM_PRIORITY_IDLE_DEFERRED, active_limit_mb, 0xBEEF, ASSERTION_STATE_IS_RELINQUISHED, "Test13"); + } else { + /* For the legacy aging policy, daemon should be at idle band with inactive memory limit */ + (void)check_properties(mypid, JETSAM_PRIORITY_IDLE, inactive_limit_mb, 0xBEEF, ASSERTION_STATE_IS_RELINQUISHED, "Test13"); + } +} + +/* + * Process is dirty tracking and does not opt into pressured exit. + * This test lives above Foreground. Assertions will have no affect + * except where the assertion priority bumps it above the requested priority. + */ +static void +memorystatus_assertion_test_do_not_allow_idle_exit() +{ + pid_t mypid = getpid(); + + /* these values will remain fixed during testing */ + int active_limit_mb = 15; /* arbitrary */ + int inactive_limit_mb = 7; /* arbitrary */ + int requestedpriority = JETSAM_PRIORITY_AUDIO_AND_ACCESSORY; + + T_SETUPBEGIN; + + set_memlimits(mypid, active_limit_mb, inactive_limit_mb, true, true); + set_priority(mypid, requestedpriority, 0, false); + proc_track_dirty(mypid, (PROC_DIRTY_TRACK)); + + proc_will_set_dirty(mypid); + + (void)check_properties(mypid, JETSAM_PRIORITY_AUDIO_AND_ACCESSORY, active_limit_mb, 0x0, ASSERTION_STATE_IS_RELINQUISHED, "Dirty start"); + + proc_will_set_clean(mypid); + + (void)check_properties(mypid, JETSAM_PRIORITY_AUDIO_AND_ACCESSORY, inactive_limit_mb, 0x0, ASSERTION_STATE_IS_RELINQUISHED, "Clean transition"); + + T_LOG("SETUP STATE COMPLETE"); + + T_SETUPEND; + + /* + * Relinquish assertion priority even though we don't hold it. No change in state expected. + */ + + + /* Test0 */ + T_LOG("********Test0 clean: no state change on relinquish"); + relinquish_assertion_priority(mypid, 0xF00D); + (void)check_properties(mypid, JETSAM_PRIORITY_AUDIO_AND_ACCESSORY, inactive_limit_mb, 0xF00D, ASSERTION_STATE_IS_RELINQUISHED, "Test0"); + + /* Test1 */ + T_LOG("********Test1 clean: assertion[0 -> 10] but inactive priority prevails"); + set_assertion_priority(mypid, JETSAM_PRIORITY_FOREGROUND, 0xFEED); + (void)check_properties(mypid, JETSAM_PRIORITY_AUDIO_AND_ACCESSORY, inactive_limit_mb, 0xFEED, ASSERTION_STATE_IS_SET, "Test1"); + + /* Test2 */ + T_LOG("********Test2 clean: assertion[10 -> 3] but inactive priority prevails"); + set_assertion_priority(mypid, JETSAM_PRIORITY_BACKGROUND, 0xFACE); + (void)check_properties(mypid, JETSAM_PRIORITY_AUDIO_AND_ACCESSORY, inactive_limit_mb, 0xFACE, ASSERTION_STATE_IS_SET, "Test2"); + + /* Test3 */ + T_LOG("********Test3 clean: assertion[3 -> 0], but inactive priority prevails"); + relinquish_assertion_priority(mypid, 0xBEEF); + (void)check_properties(mypid, JETSAM_PRIORITY_AUDIO_AND_ACCESSORY, inactive_limit_mb, 0xBEEF, ASSERTION_STATE_IS_RELINQUISHED, "Test3"); + + /* Test4 */ + T_LOG("********Test4 go dirty: assertion[0] has no affect, active priority prevails"); + proc_will_set_dirty(mypid); + (void)check_properties(mypid, JETSAM_PRIORITY_AUDIO_AND_ACCESSORY, active_limit_mb, 0xBEEF, ASSERTION_STATE_IS_RELINQUISHED, "Test4"); + + /* Test5 */ + T_LOG("********Test5 dirty: assertion[0 -> 10] active priority prevails"); + set_assertion_priority(mypid, JETSAM_PRIORITY_FOREGROUND, 0xFEED); + (void)check_properties(mypid, JETSAM_PRIORITY_AUDIO_AND_ACCESSORY, active_limit_mb, 0xFEED, ASSERTION_STATE_IS_SET, "Test5"); + + /* Test6 */ + T_LOG("********Test6 dirty: assertion[10 -> 3] active priority prevails"); + set_assertion_priority(mypid, JETSAM_PRIORITY_BACKGROUND, 0xFACE); + (void)check_properties(mypid, JETSAM_PRIORITY_AUDIO_AND_ACCESSORY, active_limit_mb, 0xFACE, ASSERTION_STATE_IS_SET, "Test6"); + + /* Test 7 */ + T_LOG("********Test7 dirty: assertion[3 -> 0], active priority prevails"); + relinquish_assertion_priority(mypid, 0xBEEF); + (void)check_properties(mypid, JETSAM_PRIORITY_AUDIO_AND_ACCESSORY, active_limit_mb, 0xBEEF, ASSERTION_STATE_IS_RELINQUISHED, "Test7"); + + /* Test8 */ + T_LOG("********Test8 dirty: assertion[0 -> 19], dirty but now assertion[19] prevails"); + set_assertion_priority(mypid, JETSAM_PRIORITY_CRITICAL, 0xFEED); + (void)check_properties(mypid, JETSAM_PRIORITY_CRITICAL, active_limit_mb, 0xFEED, ASSERTION_STATE_IS_SET, "Test8"); + + + /* Test9 */ + T_LOG("********Test9 go clean: inactive priority but assertion[19] prevails"); + proc_will_set_clean(mypid); + (void)check_properties(mypid, JETSAM_PRIORITY_CRITICAL, inactive_limit_mb, 0xFEED, ASSERTION_STATE_IS_SET, "Test9"); + + /* Test10 */ + T_LOG("********Test10 clean: assertion[19 -> 3] inactive limit prevails"); + set_assertion_priority(mypid, JETSAM_PRIORITY_BACKGROUND, 0xFACE); + (void)check_properties(mypid, JETSAM_PRIORITY_AUDIO_AND_ACCESSORY, inactive_limit_mb, 0xFACE, ASSERTION_STATE_IS_SET, "Test10"); + + + /* Test11 */ + T_LOG("********Test11 clean: assertion[3 -> 0] inactive priority still prevails"); + relinquish_assertion_priority(mypid, 0xBEEF); + (void)check_properties(mypid, JETSAM_PRIORITY_AUDIO_AND_ACCESSORY, inactive_limit_mb, 0xBEEF, ASSERTION_STATE_IS_RELINQUISHED, "Test11"); + + /* Test12 */ + T_LOG("********Test12 dirty goes clean: both assertion[0] and clean"); + proc_will_set_clean(mypid); + (void)check_properties(mypid, JETSAM_PRIORITY_AUDIO_AND_ACCESSORY, inactive_limit_mb, 0xBEEF, ASSERTION_STATE_IS_RELINQUISHED, "Test12"); +} + +T_DECL(assertion_test_bad_flags, "verify bad flag returns an error", T_META_TIMEOUT(30), T_META_ASROOT(true)) { + int err; + uint32_t flag = 0; + + memorystatus_priority_properties_t mjp = { 0 }; + + mjp.priority = JETSAM_PRIORITY_FOREGROUND; + mjp.user_data = 0; + + /* + * init a bad flag + */ + + flag = 0xf; + + err = memorystatus_control(MEMORYSTATUS_CMD_SET_PRIORITY_PROPERTIES, getpid(), flag, &mjp, sizeof(mjp)); + + T_QUIET; + T_ASSERT_POSIX_FAILURE(err, EINVAL, "MEMORYSTATUS_CMD_SET_PRIORITY_PROPERTIES should fail with bad flags (err=%d)", err); +} + + +T_DECL(assertion_test_repetitive_non_dirty_tracking, "Scenario #1 - repetitive assertion priority on non-dirty-tracking process", T_META_TIMEOUT(60), T_META_ASROOT(true)) { + /* + * Verify back-to-back assertion calls set assertion state as expected. + * false --> non-dirty-tracking process (like a typical app) + * false --> clean/dirty does not apply here + */ + + memorystatus_assertion_test_repetitive("Scenario #1", false, false); +} + +T_DECL(assertion_test_repetitive_dirty_tracking_clean, "Scenario #2 - repetitive assertion priority on clean dirty-tracking process", T_META_TIMEOUT(60), T_META_ASROOT(true)) { + /* + * Verify back-to-back assertion calls set assertion state as expected. + * true --> dirty-tracking process (like a typical extension/widget) + * true --> start clean / inactive + * This will exercise idle-deferred paths. + */ + memorystatus_assertion_test_repetitive("Scenario #2", true, true); +} + +T_DECL(assertion_test_repetitive_dirty_tracking_dirty, "Scenario #3 - repetitive assertion priority on dirty dirty-tracking processes", T_META_TIMEOUT(60), T_META_ASROOT(true)) { + /* + * Verify back-to-back assertion calls set assertion state as expected. + * true --> dirty-tracking process (like a typical extension/widget) + * false --> start dirty / active state + * This will exercise idle-deferred paths. + */ + memorystatus_assertion_test_repetitive("Scenario #3", true, false); +} + + +T_DECL(assertion_test_allow_idle_exit, "set assertion priorities on process supporting idle exit", T_META_TIMEOUT(360), T_META_ASROOT(true)) { + memorystatus_assertion_test_allow_idle_exit(); +} + +T_DECL(assertion_test_do_not_allow_idle_exit, "set assertion priorities on process no idle exit allowed", T_META_TIMEOUT(360), T_META_ASROOT(true)) { + memorystatus_assertion_test_do_not_allow_idle_exit(); +} diff --git a/tests/memorystatus_zone_test.c b/tests/memorystatus_zone_test.c index bc376ee57..b660e5c6a 100644 --- a/tests/memorystatus_zone_test.c +++ b/tests/memorystatus_zone_test.c @@ -22,29 +22,41 @@ T_GLOBAL_META( T_META_CHECK_LEAKS(false) ); -#define TIMEOUT_SECS 1500 +#define TIMEOUT_SECS 10 * 60 /* abort if test takes > 10 minutes */ -#if TARGET_OS_EMBEDDED -#define ALLOCATION_SIZE_VM_REGION (16*1024) /* 16 KB */ -#define ALLOCATION_SIZE_VM_OBJECT ALLOCATION_SIZE_VM_REGION +#if (TARGET_OS_IPHONE && !TARGET_OS_SIMULATOR) +#define ALLOCATION_SIZE_VM_REGION (16*1024) /* 16 KB */ +#define ALLOCATION_SIZE_VM_OBJECT ALLOCATION_SIZE_VM_REGION #else -#define ALLOCATION_SIZE_VM_REGION (1024*1024*100) /* 100 MB */ -#define ALLOCATION_SIZE_VM_OBJECT (16*1024) /* 16 KB */ +#define ALLOCATION_SIZE_VM_REGION (1024*1024*100) /* 100 MB */ +#define ALLOCATION_SIZE_VM_OBJECT (16*1024) /* 16 KB */ #endif #define MAX_CHILD_PROCS 100 +#define NUM_GIVE_BACK 5 +#define NUM_GIVE_BACK_PORTS 20 + +/* 60% is too high on bridgeOS to achieve without vm-pageshortage jetsams. Set it to 40%. */ +#if TARGET_OS_BRIDGE +#define ZONEMAP_JETSAM_LIMIT_SYSCTL "kern.zone_map_jetsam_limit=40" +#else #define ZONEMAP_JETSAM_LIMIT_SYSCTL "kern.zone_map_jetsam_limit=60" +#endif #define VME_ZONE_TEST_OPT "allocate_vm_regions" #define VM_OBJECTS_ZONE_TEST_OPT "allocate_vm_objects" #define GENERIC_ZONE_TEST_OPT "allocate_from_generic_zone" -#define VME_ZONE "VM map entries" -#define VMOBJECTS_ZONE "vm objects" -#define VMENTRY_TO_VMOBJECT_COMPARISON_RATIO 98 +#define VME_ZONE "VM map entries" +#define VMOBJECTS_ZONE "vm objects" +#define VMENTRY_TO_VMOBJECT_COMPARISON_RATIO 98 + +#define VM_TAG1 100 +#define VM_TAG2 101 -#define VM_TAG1 100 -#define VM_TAG2 101 +#define LARGE_MEM_GB 32 +#define LARGE_MEM_JETSAM_LIMIT 40 +#define JETSAM_LIMIT_LOWEST 10 enum { VME_ZONE_TEST = 0, @@ -60,8 +72,6 @@ typedef struct test_config_struct { } test_config_struct; static test_config_struct current_test; -static int num_children = 0; -static bool test_ending = false; static dispatch_source_t ds_signal = NULL; static dispatch_source_t ds_timer = NULL; static dispatch_queue_t dq_spawn = NULL; @@ -71,12 +81,13 @@ static mach_zone_info_array_t zone_info_array = NULL; static mach_zone_name_t largest_zone_name; static mach_zone_info_t largest_zone_info; -static char testpath[PATH_MAX]; +static pthread_mutex_t test_mtx = PTHREAD_MUTEX_INITIALIZER; /* protects the next 3 things */ +static bool test_ending = false; +static int num_children = 0; static pid_t child_pids[MAX_CHILD_PROCS]; -static pthread_mutex_t test_ending_mtx; -static void allocate_vm_regions(void); -static void allocate_vm_objects(void); +static char testpath[PATH_MAX]; +static void allocate_vm_stuff(int); static void allocate_from_generic_zone(void); static void begin_test_teardown(void); static void cleanup_and_end_test(void); @@ -85,7 +96,7 @@ static void spawn_child_process(void); static void run_test(void); static bool verify_generic_jetsam_criteria(void); static bool vme_zone_compares_to_vm_objects(void); -static void print_zone_map_size(void); +static int query_zone_map_size(void); static void query_zone_info(void); static void print_zone_info(mach_zone_name_t *zn, mach_zone_info_t *zi); @@ -96,56 +107,70 @@ extern kern_return_t mach_zone_info_for_largest_zone( mach_zone_info_t *info ); +static bool +check_time(time_t start, int timeout) +{ + return start + timeout < time(NULL); +} + +/* + * flag values for allocate_vm_stuff() + */ +#define REGIONS 1 +#define OBJECTS 2 + static void -allocate_vm_regions(void) +allocate_vm_stuff(int flags) { - uint64_t alloc_size = ALLOCATION_SIZE_VM_REGION, i = 0; + uint64_t alloc_size, i; + time_t start = time(NULL); + mach_vm_address_t give_back[NUM_GIVE_BACK]; + char *msg; + + if (flags == REGIONS) { + alloc_size = ALLOCATION_SIZE_VM_REGION; + msg = ""; + } else { + alloc_size = ALLOCATION_SIZE_VM_OBJECT; + msg = " each region backed by a VM object"; + } + + printf("[%d] Allocating VM regions, each of size %lld KB%s\n", getpid(), (alloc_size >> 10), msg); - printf("[%d] Allocating VM regions, each of size %lld KB\n", getpid(), (alloc_size >> 10)); for (i = 0;; i++) { mach_vm_address_t addr = (mach_vm_address_t)NULL; /* Alternate VM tags between consecutive regions to prevent coalescing */ - int flags = VM_MAKE_TAG((i % 2)? VM_TAG1: VM_TAG2) | VM_FLAGS_ANYWHERE; + int vmflags = VM_MAKE_TAG((i % 2)? VM_TAG1: VM_TAG2) | VM_FLAGS_ANYWHERE; - if ((mach_vm_allocate(mach_task_self(), &addr, (mach_vm_size_t)alloc_size, flags)) != KERN_SUCCESS) { + if ((mach_vm_allocate(mach_task_self(), &addr, (mach_vm_size_t)alloc_size, vmflags)) != KERN_SUCCESS) { break; } - } - printf("[%d] Number of allocations: %lld\n", getpid(), i); - /* Signal to the parent that we're done allocating */ - kill(getppid(), SIGUSR1); + /* + * If interested in objects, touch the region so the VM object is created, + * then free this page. Keeps us from holding a lot of dirty pages. + */ + if (flags == OBJECTS) { + *((int *)addr) = 0; + madvise((void *)addr, (size_t)alloc_size, MADV_FREE); + } - while (1) { - sleep(2); - /* Exit if parent has exited. Ensures child processes don't linger around after the test exits */ - if (getppid() == 1) { + if (check_time(start, TIMEOUT_SECS)) { + printf("[%d] child timeout during allocations\n", getpid()); exit(0); } - } -} - -static void -allocate_vm_objects(void) -{ - uint64_t alloc_size = ALLOCATION_SIZE_VM_OBJECT, i = 0; - - printf("[%d] Allocating VM regions, each of size %lld KB, each backed by a VM object\n", getpid(), (alloc_size >> 10)); - for (i = 0;; i++) { - mach_vm_address_t addr = (mach_vm_address_t)NULL; - /* Alternate VM tags between consecutive regions to prevent coalescing */ - int flags = VM_MAKE_TAG((i % 2)? VM_TAG1: VM_TAG2) | VM_FLAGS_ANYWHERE; - - if ((mach_vm_allocate(mach_task_self(), &addr, (mach_vm_size_t)alloc_size, flags)) != KERN_SUCCESS) { - break; + if (i < NUM_GIVE_BACK) { + give_back[i] = addr; } - /* Touch the region so the VM object can actually be created */ - *((int *)addr) = 0; - /* OK to free this page. Keeps us from holding a lot of dirty pages */ - madvise((void *)addr, (size_t)alloc_size, MADV_FREE); } + + /* return some of the resource to avoid O-O-M problems */ + for (uint64_t j = 0; j < NUM_GIVE_BACK && j < i; ++j) { + mach_vm_deallocate(mach_task_self(), give_back[j], (mach_vm_size_t)alloc_size); + } + printf("[%d] Number of allocations: %lld\n", getpid(), i); /* Signal to the parent that we're done allocating */ @@ -157,13 +182,21 @@ allocate_vm_objects(void) if (getppid() == 1) { exit(0); } + + if (check_time(start, TIMEOUT_SECS)) { + printf("[%d] child timeout while waiting\n", getpid()); + exit(0); + } } } + static void allocate_from_generic_zone(void) { uint64_t i = 0; + time_t start = time(NULL); + mach_port_t give_back[NUM_GIVE_BACK_PORTS]; printf("[%d] Allocating mach_ports\n", getpid()); for (i = 0;; i++) { @@ -172,6 +205,20 @@ allocate_from_generic_zone(void) if ((mach_port_allocate(mach_task_self(), MACH_PORT_RIGHT_RECEIVE, &port)) != KERN_SUCCESS) { break; } + + if (check_time(start, TIMEOUT_SECS)) { + printf("[%d] child timeout during allocations\n", getpid()); + exit(0); + } + + if (i < NUM_GIVE_BACK_PORTS) { + give_back[i] = port; + } + } + + /* return some of the resource to avoid O-O-M problems */ + for (uint64_t j = 0; j < NUM_GIVE_BACK_PORTS && j < i; ++j) { + mach_port_deallocate(mach_task_self(), give_back[j]); } printf("[%d] Number of allocations: %lld\n", getpid(), i); @@ -184,6 +231,11 @@ allocate_from_generic_zone(void) if (getppid() == 1) { exit(0); } + + if (check_time(start, TIMEOUT_SECS)) { + printf("[%d] child timeout while waiting\n", getpid()); + exit(0); + } } } @@ -194,6 +246,8 @@ print_zone_info(mach_zone_name_t *zn, mach_zone_info_t *zi) zn->mzn_name, zi->mzi_cur_size, zi->mzi_count); } +static time_t main_start; + static void query_zone_info(void) { @@ -201,6 +255,9 @@ query_zone_info(void) kern_return_t kr; static uint64_t num_calls = 0; + if (check_time(main_start, TIMEOUT_SECS)) { + T_ASSERT_FAIL("Global timeout expired"); + } for (i = 0; i < current_test.num_zones; i++) { kr = mach_zone_info_for_zone(mach_host_self(), current_test.zone_names[i], &(zone_info_array[i])); T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "mach_zone_info_for_zone(%s) returned %d [%s]", current_test.zone_names[i].mzn_name, kr, mach_error_string(kr)); @@ -209,7 +266,7 @@ query_zone_info(void) T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "mach_zone_info_for_largest_zone returned %d [%s]", kr, mach_error_string(kr)); num_calls++; - if (num_calls % 10 != 0) { + if (num_calls % 5 != 0) { return; } @@ -264,6 +321,19 @@ verify_generic_jetsam_criteria(void) static void begin_test_teardown(void) { + int ret, old_limit = 95; + + /* + * Restore kern.zone_map_jetsam_limit to the default high value, to prevent further jetsams. + * We should change the value of old_limit if ZONE_MAP_JETSAM_LIMIT_DEFAULT changes in the kernel. + * We don't have a way to capture what the original value was before the test, because the + * T_META_SYSCTL_INT macro will have changed the value before the test starts running. + */ + ret = sysctlbyname("kern.zone_map_jetsam_limit", NULL, NULL, &old_limit, sizeof(old_limit)); + T_QUIET; T_ASSERT_POSIX_SUCCESS(ret, "sysctl kern.zone_map_jetsam_limit failed"); + T_LOG("kern.zone_map_jetsam_limit set to %d%%", old_limit); + + /* End ktrace session */ if (session != NULL) { T_LOG("Ending ktrace session..."); @@ -299,13 +369,13 @@ cleanup_and_end_test(void) * The atend handler executes on a different dispatch queue. * We want to do the cleanup only once. */ - pthread_mutex_lock(&test_ending_mtx); + pthread_mutex_lock(&test_mtx); if (test_ending) { - pthread_mutex_unlock(&test_ending_mtx); + pthread_mutex_unlock(&test_mtx); return; } - test_ending = true; - pthread_mutex_unlock(&test_ending_mtx); + test_ending = TRUE; + pthread_mutex_unlock(&test_mtx); dispatch_async(dq_spawn, ^{ /* @@ -325,23 +395,25 @@ cleanup_and_end_test(void) } }); + pthread_mutex_lock(&test_mtx); T_LOG("Number of processes spawned: %d", num_children); T_LOG("Killing child processes..."); /* Kill all the child processes that were spawned */ for (i = 0; i < num_children; i++) { - kill(child_pids[i], SIGKILL); + pid_t pid = child_pids[i]; + int status = 0; + /* - * Sleep between kills to avoid hogging the VM map entries zone lock (on the task_terminate path). + * Kill and wait for each child to exit * Without this we were seeing hw_lock_bit timeouts in BATS. */ - sleep(1); - } - for (i = 0; i < num_children; i++) { - int status = 0; - if (waitpid(child_pids[i], &status, 0) < 0) { + kill(pid, SIGKILL); + pthread_mutex_unlock(&test_mtx); + if (waitpid(pid, &status, 0) < 0) { T_LOG("waitpid returned status %d", status); } + pthread_mutex_lock(&test_mtx); } sleep(1); @@ -382,11 +454,20 @@ setup_ktrace_session(void) }); /* Listen for memorystatus_do_kill trace events */ - ret = ktrace_events_single(session, (BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_DO_KILL)) | DBG_FUNC_END, ^(ktrace_event_t event) { + ret = ktrace_events_single(session, (BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_DO_KILL)), ^(ktrace_event_t event) { int i; bool received_jetsam_event = false; - /* We don't care about jetsams for any other reason except zone-map-exhaustion */ + /* + * libktrace does not support DBG_FUNC_START/END in the event filter. It simply ignores it. + * So we need to explicitly check for the end event (a successful jetsam kill) here, + * instead of passing in ((BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_DO_KILL)) | DBG_FUNC_START). + */ + if (!(event->debugid & DBG_FUNC_START)) { + return; + } + + /* Check for zone-map-exhaustion jetsam. */ if (event->arg2 == kMemorystatusKilledZoneMapExhaustion) { begin_test_teardown(); T_LOG("[memorystatus_do_kill] jetsam reason: zone-map-exhaustion, pid: %d\n\n", (int)event->arg1); @@ -400,6 +481,7 @@ setup_ktrace_session(void) * The test simulates this scenario, we should see a targeted jetsam for the * vm objects zone too. */ + pthread_mutex_lock(&test_mtx); for (i = 0; i < num_children; i++) { if (child_pids[i] == (pid_t)event->arg1) { received_jetsam_event = true; @@ -407,6 +489,7 @@ setup_ktrace_session(void) break; } } + pthread_mutex_unlock(&test_mtx); /* * If we didn't see a targeted jetsam, verify that the largest zone actually * fulfilled the criteria for generic jetsams. @@ -421,6 +504,27 @@ setup_ktrace_session(void) } T_QUIET; T_ASSERT_TRUE(received_jetsam_event, "Jetsam event not as expected"); + } else { + /* + * The test relies on the children being able to send a signal to the parent, to continue spawning new processes + * that leak more zone memory. If a child is jetsammed for some other reason, the parent can get stuck waiting for + * a signal from the child, never being able to make progress (We spawn only a single process at a time to rate-limit + * the zone memory bloat.). If this happens, the test eventually times out. So if a child is jetsammed for some + * reason other than zone-map-exhaustion, end the test early. + * + * This typically happens when we end up triggering vm-pageshortage jetsams before zone-map-exhaustion jetsams. + * Lowering the zone_map_jetsam_limit if the zone map size was initially low should help with this too. + * See sysctlbyname("kern.zone_map_jetsam_limit"...) in run_test() below. + */ + pthread_mutex_lock(&test_mtx); + for (i = 0; i < num_children; i++) { + if (child_pids[i] == (pid_t)event->arg1) { + begin_test_teardown(); + T_PASS("Child pid %d was jetsammed due to reason %d. Terminating early.", + (int)event->arg1, (int)event->arg2); + } + } + pthread_mutex_unlock(&test_mtx); } }); T_QUIET; T_ASSERT_POSIX_ZERO(ret, "ktrace_events_single"); @@ -429,8 +533,8 @@ setup_ktrace_session(void) T_QUIET; T_ASSERT_POSIX_ZERO(ret, "ktrace_start"); } -static void -print_zone_map_size(void) +static int +query_zone_map_size(void) { int ret; uint64_t zstats[2]; @@ -440,6 +544,16 @@ print_zone_map_size(void) T_QUIET; T_ASSERT_POSIX_SUCCESS(ret, "sysctl kern.zone_map_size_and_capacity failed"); T_LOG("Zone map capacity: %-30lldZone map size: %lld [%lld%% full]", zstats[1], zstats[0], (zstats[0] * 100) / zstats[1]); + +#if (TARGET_OS_IPHONE && !TARGET_OS_SIMULATOR) + int memstat_level; + size_t memstat_level_size = sizeof(memstat_level); + ret = sysctlbyname("kern.memorystatus_level", &memstat_level, &memstat_level_size, NULL, 0); + T_QUIET; T_ASSERT_POSIX_SUCCESS(ret, "sysctl kern.memorystatus_level failed"); + + T_LOG("kern.memorystatus_level = %d%%", memstat_level); +#endif + return (int)(zstats[0] * 100 / zstats[1]); } static void @@ -449,22 +563,30 @@ spawn_child_process(void) char helper_func[50]; char *launch_tool_args[4]; - T_QUIET; T_ASSERT_LT(num_children, MAX_CHILD_PROCS, "Spawned %d children. Timing out...", MAX_CHILD_PROCS); + pthread_mutex_lock(&test_mtx); + if (!test_ending) { + if (num_children == MAX_CHILD_PROCS) { + pthread_mutex_unlock(&test_mtx); + T_ASSERT_FAIL("Spawned too many children. Aborting test"); + /* not reached */ + } - strlcpy(helper_func, current_test.helper_func, sizeof(helper_func)); - launch_tool_args[0] = testpath; - launch_tool_args[1] = "-n"; - launch_tool_args[2] = helper_func; - launch_tool_args[3] = NULL; + strlcpy(helper_func, current_test.helper_func, sizeof(helper_func)); + launch_tool_args[0] = testpath; + launch_tool_args[1] = "-n"; + launch_tool_args[2] = helper_func; + launch_tool_args[3] = NULL; - /* Spawn the child process */ - int rc = dt_launch_tool(&pid, launch_tool_args, false, NULL, NULL); - if (rc != 0) { - T_LOG("dt_launch tool returned %d with error code %d", rc, errno); - } - T_QUIET; T_ASSERT_POSIX_SUCCESS(pid, "dt_launch_tool"); + /* Spawn the child process */ + int rc = dt_launch_tool(&pid, launch_tool_args, false, NULL, NULL); + if (rc != 0) { + T_LOG("dt_launch tool returned %d with error code %d", rc, errno); + } + T_QUIET; T_ASSERT_POSIX_SUCCESS(pid, "dt_launch_tool"); - child_pids[num_children++] = pid; + child_pids[num_children++] = pid; + } + pthread_mutex_unlock(&test_mtx); } static void @@ -472,12 +594,13 @@ run_test(void) { uint64_t mem; uint32_t testpath_buf_size, pages; - int ret, dev, pgsz; + int ret, dev, pgsz, initial_zone_occupancy, old_limit, new_limit = 0; size_t sysctl_size; T_ATEND(cleanup_and_end_test); T_SETUPBEGIN; + main_start = time(NULL); dev = 0; sysctl_size = sizeof(dev); ret = sysctlbyname("kern.development", &dev, &sysctl_size, NULL, 0); @@ -506,9 +629,41 @@ run_test(void) T_QUIET; T_ASSERT_POSIX_SUCCESS(ret, "sysctl vm.pages failed"); T_LOG("vm.pages: %d", pages); - zone_info_array = (mach_zone_info_array_t) calloc((unsigned long)current_test.num_zones, sizeof *zone_info_array); + sysctl_size = sizeof(old_limit); + ret = sysctlbyname("kern.zone_map_jetsam_limit", &old_limit, &sysctl_size, NULL, 0); + T_QUIET; T_ASSERT_POSIX_SUCCESS(ret, "sysctl kern.zone_map_jetsam_limit failed"); + T_LOG("kern.zone_map_jetsam_limit: %d", old_limit); - print_zone_map_size(); + initial_zone_occupancy = query_zone_map_size(); + + /* On large memory systems, set the zone_map jetsam limit lower so we can hit it without timing out. */ + if (mem > (uint64_t)LARGE_MEM_GB * 1024 * 1024 * 1024) { + new_limit = LARGE_MEM_JETSAM_LIMIT; + } + + /* + * If we start out with the zone map < 5% full, aim for 10% as the limit, so we don't time out. + * For anything else aim for 2x the initial size, capped by whatever value was set by T_META_SYSCTL_INT, + * or LARGE_MEM_JETSAM_LIMIT for large memory systems. + */ + if (initial_zone_occupancy < 5) { + new_limit = JETSAM_LIMIT_LOWEST; + } else { + new_limit = initial_zone_occupancy * 2; + } + + if (new_limit > 0 && new_limit < old_limit) { + /* + * We should be fine messing with the zone_map_jetsam_limit here, i.e. outside of T_META_SYSCTL_INT. + * When the test ends, T_META_SYSCTL_INT will restore the zone_map_jetsam_limit to what it was + * before the test anyway. + */ + ret = sysctlbyname("kern.zone_map_jetsam_limit", NULL, NULL, &new_limit, sizeof(new_limit)); + T_QUIET; T_ASSERT_POSIX_SUCCESS(ret, "sysctl kern.zone_map_jetsam_limit failed"); + T_LOG("kern.zone_map_jetsam_limit set to %d%%", new_limit); + } + + zone_info_array = (mach_zone_info_array_t) calloc((unsigned long)current_test.num_zones, sizeof *zone_info_array); /* * If the timeout specified by T_META_TIMEOUT is hit, the atend handler does not get called. @@ -529,7 +684,7 @@ run_test(void) T_QUIET; T_ASSERT_NOTNULL(ds_signal, "dispatch_source_create: signal"); dispatch_source_set_event_handler(ds_signal, ^{ - print_zone_map_size(); + (void)query_zone_map_size(); /* Wait a few seconds before spawning another child. Keeps us from allocating too aggressively */ sleep(5); @@ -537,7 +692,7 @@ run_test(void) }); dispatch_activate(ds_signal); - /* Timer to query jetsam-relevant zone info every second. Print it every 10 seconds. */ + /* Timer to query jetsam-relevant zone info every second. Print it every 5 seconds. */ ds_timer = dispatch_source_create(DISPATCH_SOURCE_TYPE_TIMER, 0, 0, dispatch_queue_create("timer_queue", NULL)); T_QUIET; T_ASSERT_NOTNULL(ds_timer, "dispatch_source_create: timer"); dispatch_source_set_timer(ds_timer, dispatch_time(DISPATCH_TIME_NOW, NSEC_PER_SEC), NSEC_PER_SEC, 0); @@ -582,13 +737,14 @@ move_to_idle_band(void) T_HELPER_DECL(allocate_vm_regions, "allocates VM regions") { - allocate_vm_regions(); + move_to_idle_band(); + allocate_vm_stuff(REGIONS); } T_HELPER_DECL(allocate_vm_objects, "allocates VM objects and VM regions") { move_to_idle_band(); - allocate_vm_objects(); + allocate_vm_stuff(OBJECTS); } T_HELPER_DECL(allocate_from_generic_zone, "allocates from a generic zone") diff --git a/tests/mktimer_kobject.c b/tests/mktimer_kobject.c index a66986363..4210e0509 100644 --- a/tests/mktimer_kobject.c +++ b/tests/mktimer_kobject.c @@ -7,6 +7,8 @@ #include +T_GLOBAL_META(T_META_RUN_CONCURRENTLY(true)); + T_DECL(mktimer_kobject, "mktimer_kobject()", T_META_ALL_VALID_ARCHS(true)) { mach_port_t timer_port = MACH_PORT_NULL; diff --git a/tests/mo_immovable_receive.c b/tests/mo_immovable_receive.c new file mode 100644 index 000000000..14b4f0e44 --- /dev/null +++ b/tests/mo_immovable_receive.c @@ -0,0 +1,227 @@ +#include +#include +#include +#include +#include +#include +#include + +T_GLOBAL_META(T_META_RUN_CONCURRENTLY(true)); + +typedef struct { + mach_msg_header_t header; + mach_msg_body_t body; + mach_msg_guarded_port_descriptor_t guarded_port_descriptor1; + mach_msg_guarded_port_descriptor_t guarded_port_descriptor2; + mach_msg_trailer_t trailer; // subtract this when sending +} ipc_complex_message; + +static ipc_complex_message icm_request = {}; + +struct args { + const char *progname; + int verbose; + int voucher; + int num_msgs; + const char *server_port_name; + mach_port_t server_port; + mach_port_t reply_port; + mach_port_t voucher_port; + int request_msg_size; + void *request_msg; + int reply_msg_size; + void *reply_msg; + mach_port_t sp_voucher_port; + uint32_t persona_id; + long client_pid; +}; + +void parse_args(struct args *args); +void* create_buffer(int *buffer_size); +void client(struct args *args); +void server_setup(struct args* args); +void server(struct args *args); + +void +parse_args(struct args *args) +{ + args->verbose = 0; + args->voucher = 0; + args->server_port_name = "TEST"; + args->server_port = MACH_PORT_NULL; + args->reply_port = MACH_PORT_NULL; + args->voucher_port = MACH_PORT_NULL; + args->num_msgs = 1; + args->request_msg_size = sizeof(ipc_complex_message); + args->request_msg = &icm_request; + args->client_pid = getpid(); +} + +/* Create a mach IPC listener which will respond to the client's message */ +void +server_setup(struct args* args) +{ + kern_return_t ret; + mach_port_t bsport; + + ret = mach_port_allocate(mach_task_self(), MACH_PORT_RIGHT_RECEIVE, + &args->server_port); + T_ASSERT_MACH_SUCCESS(ret, "server: mach_port_allocate()"); + + ret = mach_port_insert_right(mach_task_self(), args->server_port, args->server_port, + MACH_MSG_TYPE_MAKE_SEND); + T_ASSERT_MACH_SUCCESS(ret, "server: mach_port_insert_right()"); + + ret = task_get_bootstrap_port(mach_task_self(), &bsport); + T_ASSERT_MACH_SUCCESS(ret, "server: task_get_bootstrap_port()"); + + ret = bootstrap_register(bsport, args->server_port_name, args->server_port); + T_ASSERT_MACH_SUCCESS(ret, "server: bootstrap_register()"); + + T_LOG("server: waiting for IPC messages from client on port '%s'.\n", + args->server_port_name); +} + +/* Server process loop + * + * Listens for message. + * + */ +void +server(struct args *args) +{ + mach_msg_header_t *request; + mach_msg_option_t rcvoption; + kern_return_t ret; + + request = (mach_msg_header_t *)args->request_msg; + + rcvoption = MACH_RCV_MSG | MACH_RCV_INTERRUPT | MACH_RCV_GUARDED_DESC; + + T_LOG("server: Awaiting message\n"); + ret = mach_msg(request, + rcvoption, + 0, + sizeof(ipc_complex_message), + args->server_port, + MACH_MSG_TIMEOUT_NONE, + MACH_PORT_NULL); + + T_ASSERT_MACH_SUCCESS(ret, "server: mach_msg receive"); + + ipc_complex_message *request_complexmsg = (ipc_complex_message *)request; + T_ASSERT_NE(request_complexmsg->guarded_port_descriptor1.name, 0, "server: Should not receive mach_port_null; name = %x", request_complexmsg->guarded_port_descriptor1.name); + T_ASSERT_EQ(request_complexmsg->guarded_port_descriptor1.type, MACH_MSG_GUARDED_PORT_DESCRIPTOR, "server: Received a guarded port descriptor"); + T_ASSERT_EQ(request_complexmsg->guarded_port_descriptor1.disposition, MACH_MSG_TYPE_PORT_RECEIVE, "server: Received a receive right"); + T_ASSERT_EQ(request_complexmsg->guarded_port_descriptor1.context, (unsigned long)request, "server: Received a port with correct context = %p", request); + T_LOG("Guard flags = %d", request_complexmsg->guarded_port_descriptor1.flags); + + T_ASSERT_NE(request_complexmsg->guarded_port_descriptor2.name, 0, "server: Should not receive mach_port_null; name = %x", request_complexmsg->guarded_port_descriptor2.name); + T_ASSERT_EQ(request_complexmsg->guarded_port_descriptor2.type, MACH_MSG_GUARDED_PORT_DESCRIPTOR, "server: Received a guarded port descriptor"); + T_ASSERT_EQ(request_complexmsg->guarded_port_descriptor2.disposition, MACH_MSG_TYPE_PORT_RECEIVE, "server: Received a receive right"); + T_ASSERT_EQ(request_complexmsg->guarded_port_descriptor2.context, (unsigned long)request, "server: Received a port with correct context = %p", request); + + mach_port_status_t status; + mach_msg_type_number_t status_size = MACH_PORT_RECEIVE_STATUS_COUNT; + + kern_return_t kr = mach_port_get_attributes(mach_task_self(), request_complexmsg->guarded_port_descriptor1.name, + MACH_PORT_RECEIVE_STATUS, (mach_port_info_t)&status, &status_size); + T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "mach_port_get_attributes for descriptor 1"); + T_LOG("Status flags %d", status.mps_flags); + T_ASSERT_NE(0, (status.mps_flags & MACH_PORT_STATUS_FLAG_GUARD_IMMOVABLE_RECEIVE), "Imm rcv bit is set for descriptor1"); + + kr = mach_port_get_attributes(mach_task_self(), request_complexmsg->guarded_port_descriptor2.name, + MACH_PORT_RECEIVE_STATUS, (mach_port_info_t)&status, &status_size); + T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "mach_port_get_attributes for descriptor 2"); + T_LOG("Status flags %d", status.mps_flags); + T_ASSERT_NE(0, (status.mps_flags & MACH_PORT_STATUS_FLAG_GUARD_IMMOVABLE_RECEIVE), "Imm rcv bit is set for descriptor2"); + + mach_msg_destroy(request); +} + +void +client(struct args *args) +{ + //Find the bootstrap port + mach_port_t bsport; + mach_port_t guarded_port; + mach_port_t unguarded_port; + + kern_return_t ret = task_get_bootstrap_port(mach_task_self(), &bsport); + T_ASSERT_MACH_SUCCESS(ret, "client: task_get_bootstrap_port()"); + + //Look up the service port + ret = bootstrap_look_up(bsport, (char *)args->server_port_name, + &args->server_port); + T_ASSERT_MACH_SUCCESS(ret, "client: bootstrap_look_up()"); + + //Create the unguarded port + ret = mach_port_allocate(mach_task_self(), MACH_PORT_RIGHT_RECEIVE, + &unguarded_port); + T_ASSERT_MACH_SUCCESS(ret, "client: mach_port_allocate() reply port"); + + mach_port_options_t opts = { + .flags = MPO_CONTEXT_AS_GUARD + }; + + ret = mach_port_construct(mach_task_self(), &opts, 0x10, &guarded_port); + T_QUIET; T_ASSERT_MACH_SUCCESS(ret, "mach_port_construct"); + + //Construct the message + mach_msg_header_t *request = (mach_msg_header_t *)args->request_msg; + request->msgh_bits = MACH_MSGH_BITS_SET(MACH_MSG_TYPE_COPY_SEND, MACH_MSG_TYPE_MAKE_SEND_ONCE, + 0, 0) | MACH_MSGH_BITS_COMPLEX; + request->msgh_size = (mach_msg_size_t)args->request_msg_size; + request->msgh_remote_port = args->server_port; + request->msgh_local_port = args->reply_port; + request->msgh_id = 1; + + ipc_complex_message *complexmsg = (ipc_complex_message *)request; + complexmsg->body.msgh_descriptor_count = 2; + complexmsg->guarded_port_descriptor1.name = guarded_port; + complexmsg->guarded_port_descriptor1.disposition = MACH_MSG_TYPE_MOVE_RECEIVE; + complexmsg->guarded_port_descriptor1.flags = MACH_MSG_GUARD_FLAGS_IMMOVABLE_RECEIVE; + complexmsg->guarded_port_descriptor1.context = 0x10; + complexmsg->guarded_port_descriptor1.type = MACH_MSG_GUARDED_PORT_DESCRIPTOR; + + complexmsg->guarded_port_descriptor2.name = unguarded_port; + complexmsg->guarded_port_descriptor2.disposition = MACH_MSG_TYPE_MOVE_RECEIVE; + complexmsg->guarded_port_descriptor2.flags = MACH_MSG_GUARD_FLAGS_IMMOVABLE_RECEIVE | MACH_MSG_GUARD_FLAGS_UNGUARDED_ON_SEND; + complexmsg->guarded_port_descriptor2.context = 0; + complexmsg->guarded_port_descriptor2.type = MACH_MSG_GUARDED_PORT_DESCRIPTOR; + + mach_msg_option_t option = MACH_SEND_MSG; + + //Listen for the reply on the reply port + T_LOG("client: Sending request\n"); + ret = mach_msg(request, + option, + (mach_msg_size_t)args->request_msg_size, + 0, + MACH_PORT_NULL, + MACH_MSG_TIMEOUT_NONE, + MACH_PORT_NULL); + T_ASSERT_MACH_SUCCESS(ret, "client: mach_msg_overwrite()"); +} + +T_DECL(mo_immovable_receive, "Send a message containing a guard port descriptor for an immovable receive right") +{ + struct args args = {}; + parse_args(&args); + args.request_msg_size -= sizeof(mach_msg_trailer_t); + args.reply_msg_size -= sizeof(mach_msg_trailer_t); + + //Create the server + pid_t pid = fork(); + if (pid == 0) { + T_LOG("Server is up"); + server_setup(&args); + server(&args); + exit(0); + } + + sleep(2); + T_LOG("Preparing client to send a request"); + client(&args); + T_ASSERT_POSIX_SUCCESS(waitpid(pid, NULL, 0), "waitpid()"); +} diff --git a/tests/mpsc.c b/tests/mpsc.c new file mode 100644 index 000000000..08ce2567e --- /dev/null +++ b/tests/mpsc.c @@ -0,0 +1,26 @@ +/* + * mpsc: test the MPSC interface + */ + +#ifdef T_NAMESPACE +#undef T_NAMESPACE +#endif + +#include +#include + +T_GLOBAL_META(T_META_NAMESPACE("xnu.mpsc"), + T_META_RUN_CONCURRENTLY(true)); + +T_DECL(pingpong, "mpsc_pingpong") +{ + uint64_t count = 100 * 1000, nsecs = 0; + size_t nlen = sizeof(nsecs); + int error; + + error = sysctlbyname("kern.mpsc_test_pingpong", &nsecs, &nlen, + &count, sizeof(count)); + T_ASSERT_POSIX_SUCCESS(error, "sysctlbyname"); + T_LOG("%lld asyncs in %lld ns (%g us/async)", count, nsecs, + (nsecs / 1e3) / count); +} diff --git a/tests/net_tun_pr_35136664.c b/tests/net_tun_pr_35136664.c index c644f2ad8..89b8fc995 100644 --- a/tests/net_tun_pr_35136664.c +++ b/tests/net_tun_pr_35136664.c @@ -9,7 +9,8 @@ #include #include -T_GLOBAL_META(T_META_NAMESPACE("xnu.net")); +T_GLOBAL_META(T_META_NAMESPACE("xnu.net"), + T_META_RUN_CONCURRENTLY(true)); T_DECL(PR_35136664_utun, "This bind a utun and close it without connecting") diff --git a/tests/net_tuntests.c b/tests/net_tuntests.c index d4b2477f5..8965080c5 100644 --- a/tests/net_tuntests.c +++ b/tests/net_tuntests.c @@ -1,3 +1,5 @@ +/* -*- compile-command: "xcrun --sdk iphoneos.internal make net_tuntests" -*- */ + #include #include #include @@ -6,10 +8,11 @@ #include #include #include +#include #include +#include #include #include -#include #include #include #include @@ -32,6 +35,14 @@ T_GLOBAL_META(T_META_NAMESPACE("xnu.net.tun")); +/* Disable all these test until is fixed */ +T_GLOBAL_META(T_META_ENABLED(false)); + +#if 0 +#undef T_QUIET +#define T_QUIET +#endif + #if 0 static void log_hexdump(const void *inp, size_t len) @@ -51,17 +62,22 @@ log_hexdump(const void *inp, size_t len) T_LOG("%s", buf); } } +#else +static void +log_hexdump(const void *inp, size_t len) +{ +#pragma unused(inp, len) +} #endif -static uint64_t -get_skywalk_features(void) +static bool +is_netagent_enabled(void) { - uint64_t features = 0; - size_t len = sizeof(features); - T_QUIET; T_WITH_ERRNO; T_EXPECT_POSIX_ZERO(sysctlbyname("kern.skywalk.features", &features, &len, NULL, 0), NULL); - T_QUIET; T_ASSERT_EQ(len, sizeof(features), NULL); - T_QUIET; T_ASSERT_TRUE(features & SK_FEATURE_SKYWALK, NULL); - return features; + int enabled = 0; + size_t len = sizeof(enabled); + T_QUIET; T_WITH_ERRNO; T_EXPECT_POSIX_ZERO(sysctlbyname("net.link.generic.system.enable_netagent", &enabled, &len, NULL, 0), NULL); + T_QUIET; T_ASSERT_EQ(len, sizeof(enabled), NULL); + return enabled == 1; } static bool g_is_ipsec_test; @@ -73,6 +89,10 @@ static int g_OPT_GET_CHANNEL_UUID = -1; static int g_OPT_IFNAME = -1; static char *g_CONTROL_NAME = NULL; +static int create_tunsock_old(int enable_netif, int enable_flowswitch, int channel_count, uuid_t uuid[]); +static int create_tunsock_new(int enable_netif, int enable_flowswitch, int channel_count, uuid_t uuid[]); +static int (*create_tunsock)(int enable_netif, int enable_flowswitch, int channel_count, uuid_t uuid[]); + static void setup_ipsec_test(void) { @@ -83,6 +103,7 @@ setup_ipsec_test(void) g_OPT_GET_CHANNEL_UUID = IPSEC_OPT_GET_CHANNEL_UUID; g_OPT_IFNAME = IPSEC_OPT_IFNAME; g_CONTROL_NAME = IPSEC_CONTROL_NAME; + create_tunsock = create_tunsock_new; g_is_ipsec_test = true; } @@ -96,21 +117,74 @@ setup_utun_test(void) g_OPT_GET_CHANNEL_UUID = UTUN_OPT_GET_CHANNEL_UUID; g_OPT_IFNAME = UTUN_OPT_IFNAME; g_CONTROL_NAME = UTUN_CONTROL_NAME; + create_tunsock = create_tunsock_old; g_is_utun_test = true; } +static bool +setblocking(int s, bool blocking) +{ + int flags; + bool ret; + + T_QUIET; T_EXPECT_POSIX_SUCCESS(flags = fcntl(s, F_GETFL, 0), NULL); + + ret = !(flags & O_NONBLOCK); + + if (blocking) { + flags &= ~O_NONBLOCK; + } else { + flags |= O_NONBLOCK; + } + +#if 0 + T_LOG("Setting fd %d from %s to %s\n", + s, ret ? "blocking" : "nonblocking", + blocking ? "blocking" : "nonblocking"); +#endif + + T_QUIET; T_EXPECT_POSIX_SUCCESS(flags = fcntl(s, F_SETFL, flags), NULL); + + return ret; +} + + static void -check_enables(int tunsock, int enable_netif, int enable_flowswitch, int enable_channel, uuid_t uuid) +check_enables(int tunsock, int enable_netif, int enable_flowswitch, int channel_count, uuid_t uuid[]) { int scratch; socklen_t scratchlen, uuidlen; - uuid_t scratchuuid; + uuid_t scratchuuid[channel_count]; if (!uuid) { uuid = scratchuuid; } //T_LOG("checking tunsock %d", tunsock); + if (g_is_ipsec_test && channel_count && !enable_netif) { + /* Unfortunately, the connect incorrectly unwinds the bind if it get an error. + * until that is fixed, expect EINVAL here + */ + scratchlen = sizeof(scratch); + T_QUIET; T_EXPECT_POSIX_FAILURE(getsockopt(tunsock, SYSPROTO_CONTROL, g_OPT_ENABLE_NETIF, + &scratch, &scratchlen), EINVAL, NULL); + T_QUIET; T_EXPECT_POSIX_FAILURE(getsockopt(tunsock, SYSPROTO_CONTROL, g_OPT_ENABLE_FLOWSWITCH, + &scratch, &scratchlen), EINVAL, NULL); + T_QUIET; T_EXPECT_POSIX_FAILURE(getsockopt(tunsock, SYSPROTO_CONTROL, g_OPT_ENABLE_CHANNEL, + &scratch, &scratchlen), EINVAL, NULL); + for (int i = 0; i < channel_count; i++) { + uuid_clear(uuid[i]); + } + uuidlen = sizeof(uuid_t) * (unsigned int)channel_count; + T_QUIET; T_EXPECT_POSIX_FAILURE(getsockopt(tunsock, SYSPROTO_CONTROL, g_OPT_GET_CHANNEL_UUID, + uuid, &uuidlen), EINVAL, NULL); + for (int i = 0; i < channel_count; i++) { + T_QUIET; T_EXPECT_TRUE(uuid_is_null(uuid[i]), NULL); + } + return; + } + + scratchlen = sizeof(scratch); T_QUIET; T_WITH_ERRNO; T_EXPECT_POSIX_ZERO(getsockopt(tunsock, SYSPROTO_CONTROL, g_OPT_ENABLE_NETIF, &scratch, &scratchlen), NULL); @@ -121,7 +195,7 @@ check_enables(int tunsock, int enable_netif, int enable_flowswitch, int enable_c T_QUIET; T_WITH_ERRNO; T_EXPECT_POSIX_ZERO(getsockopt(tunsock, SYSPROTO_CONTROL, g_OPT_ENABLE_FLOWSWITCH, &scratch, &scratchlen), NULL); T_QUIET; T_EXPECT_EQ_ULONG((unsigned long)scratchlen, sizeof(scratch), NULL); - if (get_skywalk_features() & SK_FEATURE_NETNS) { + if (is_netagent_enabled()) { if (enable_netif) { T_QUIET; T_EXPECT_EQ(scratch, enable_flowswitch, NULL); } else { @@ -138,23 +212,31 @@ check_enables(int tunsock, int enable_netif, int enable_flowswitch, int enable_c if (g_is_ipsec_test && !enable_netif) { T_QUIET; T_EXPECT_EQ(scratch, 0, NULL); } else { - T_QUIET; T_EXPECT_EQ(scratch, enable_channel, NULL); + T_QUIET; T_EXPECT_EQ(scratch, (int)channel_count, NULL); } if (scratch) { - uuid_clear(uuid); - uuidlen = sizeof(uuid_t); + for (int i = 0; i < channel_count; i++) { + uuid_clear(uuid[i]); + } + uuidlen = sizeof(uuid_t) * (unsigned int)channel_count; T_QUIET; T_WITH_ERRNO; T_EXPECT_POSIX_ZERO(getsockopt(tunsock, SYSPROTO_CONTROL, g_OPT_GET_CHANNEL_UUID, uuid, &uuidlen), NULL); - T_QUIET; T_EXPECT_EQ_ULONG((unsigned long)uuidlen, sizeof(uuid_t), NULL); - T_QUIET; T_EXPECT_FALSE(uuid_is_null(uuid), NULL); + T_QUIET; T_EXPECT_EQ_ULONG((unsigned long)uuidlen, sizeof(uuid_t) * (unsigned long)channel_count, NULL); + for (int i = 0; i < channel_count; i++) { + T_QUIET; T_EXPECT_FALSE(uuid_is_null(uuid[i]), NULL); + } } else { - uuid_clear(uuid); - uuidlen = sizeof(uuid_t); + for (int i = 0; i < channel_count; i++) { + uuid_clear(uuid[i]); + } + uuidlen = sizeof(uuid_t) * (unsigned int)channel_count; T_QUIET; T_EXPECT_POSIX_FAILURE(getsockopt(tunsock, SYSPROTO_CONTROL, g_OPT_GET_CHANNEL_UUID, uuid, &uuidlen), ENXIO, NULL); - T_QUIET; T_EXPECT_EQ_ULONG((unsigned long)uuidlen, sizeof(uuid_t), NULL); - T_QUIET; T_EXPECT_TRUE(uuid_is_null(uuid), NULL); + T_QUIET; T_EXPECT_EQ_ULONG((unsigned long)uuidlen, sizeof(uuid_t) * (unsigned long)channel_count, NULL); + for (int i = 0; i < channel_count; i++) { + T_QUIET; T_EXPECT_TRUE(uuid_is_null(uuid[i]), NULL); + } } } @@ -309,20 +391,26 @@ create_sa(const char ifname[IFXNAMSIZ], uint8_t type, uint32_t spi, struct in_ad addcmd.dst.saddr.sin_port = htons(0); addcmd.dst.saddr.sin_addr = *dst; - //log_hexdump(&addcmd, sizeof(addcmd)); + log_hexdump(&addcmd, sizeof(addcmd)); ssize_t slen; T_QUIET; T_EXPECT_POSIX_SUCCESS(slen = send(g_pfkeyso, &addcmd, sizeof(addcmd), 0), NULL); T_QUIET; T_EXPECT_EQ(slen, (ssize_t)sizeof(addcmd), NULL); } +/* This version of the test expects channels to be enabled after connect. + * Once the utun driver is converted, switch to create_tunsock_new + */ static int -create_tunsock(int enable_netif, int enable_flowswitch, int enable_channel) +create_tunsock_old(int enable_netif, int enable_flowswitch, int channel_count, uuid_t uuid[]) { int tunsock; struct ctl_info kernctl_info; struct sockaddr_ctl kernctl_addr; - uuid_t uuid; + uuid_t scratchuuid[channel_count]; + if (!uuid) { + uuid = scratchuuid; + } socklen_t uuidlen; startover: @@ -340,21 +428,25 @@ startover: kernctl_addr.sc_id = kernctl_info.ctl_id; kernctl_addr.sc_unit = 0; - //T_LOG("enable_netif = %d, enable_flowswitch = %d, enable_channel = %d", - //enable_netif, enable_channel, enable_flowswitch); + T_LOG("%s: enable_netif = %d, enable_flowswitch = %d, channel_count = %d", + __func__, enable_netif, enable_flowswitch, channel_count); T_QUIET; T_EXPECT_POSIX_FAILURE(setsockopt(tunsock, SYSPROTO_CONTROL, g_OPT_ENABLE_NETIF, &enable_netif, sizeof(enable_netif)), EINVAL, NULL); T_QUIET; T_EXPECT_POSIX_FAILURE(setsockopt(tunsock, SYSPROTO_CONTROL, g_OPT_ENABLE_FLOWSWITCH, &enable_flowswitch, sizeof(enable_flowswitch)), EINVAL, NULL); T_QUIET; T_EXPECT_POSIX_FAILURE(setsockopt(tunsock, SYSPROTO_CONTROL, g_OPT_ENABLE_CHANNEL, - &enable_channel, sizeof(enable_channel)), EINVAL, NULL); - uuid_clear(uuid); - uuidlen = sizeof(uuid_t); + &channel_count, sizeof(channel_count)), EINVAL, NULL); + for (int i = 0; i < channel_count; i++) { + uuid_clear(uuid[i]); + } + uuidlen = sizeof(uuid_t) * (unsigned int)channel_count; T_QUIET; T_EXPECT_POSIX_FAILURE(getsockopt(tunsock, SYSPROTO_CONTROL, g_OPT_GET_CHANNEL_UUID, uuid, &uuidlen), EINVAL, NULL); - T_QUIET; T_EXPECT_EQ_ULONG((unsigned long)uuidlen, sizeof(uuid_t), NULL); - T_QUIET; T_EXPECT_TRUE(uuid_is_null(uuid), NULL); + T_QUIET; T_EXPECT_EQ_ULONG((unsigned long)uuidlen, sizeof(uuid_t) * (unsigned long)channel_count, NULL); + for (int i = 0; i < channel_count; i++) { + T_QUIET; T_EXPECT_TRUE(uuid_is_null(uuid[i]), NULL); + } T_QUIET; T_WITH_ERRNO; T_EXPECT_POSIX_ZERO(bind(tunsock, (struct sockaddr *)&kernctl_addr, sizeof(kernctl_addr)), NULL); @@ -363,13 +455,17 @@ startover: T_QUIET; T_EXPECT_POSIX_FAILURE(setsockopt(tunsock, SYSPROTO_CONTROL, g_OPT_ENABLE_FLOWSWITCH, &enable_flowswitch, sizeof(enable_flowswitch)), EINVAL, NULL); T_QUIET; T_EXPECT_POSIX_FAILURE(setsockopt(tunsock, SYSPROTO_CONTROL, g_OPT_ENABLE_CHANNEL, - &enable_channel, sizeof(enable_channel)), EINVAL, NULL); - uuid_clear(uuid); - uuidlen = sizeof(uuid_t); + &channel_count, sizeof(channel_count)), EINVAL, NULL); + for (int i = 0; i < channel_count; i++) { + uuid_clear(uuid[i]); + } + uuidlen = sizeof(uuid_t) * (unsigned int)channel_count; T_QUIET; T_EXPECT_POSIX_FAILURE(getsockopt(tunsock, SYSPROTO_CONTROL, g_OPT_GET_CHANNEL_UUID, uuid, &uuidlen), ENXIO, NULL); - T_QUIET; T_EXPECT_EQ_ULONG((unsigned long)uuidlen, sizeof(uuid_t), NULL); - T_QUIET; T_EXPECT_TRUE(uuid_is_null(uuid), NULL); + T_QUIET; T_EXPECT_EQ_ULONG((unsigned long)uuidlen, sizeof(uuid_t) * (unsigned long)channel_count, NULL); + for (int i = 0; i < channel_count; i++) { + T_QUIET; T_EXPECT_TRUE(uuid_is_null(uuid[i]), NULL); + } int error = connect(tunsock, (struct sockaddr *)&kernctl_addr, sizeof(kernctl_addr)); if (error == -1 && errno == EBUSY) { @@ -386,7 +482,7 @@ startover: T_QUIET; T_EXPECT_POSIX_FAILURE(setsockopt(tunsock, SYSPROTO_CONTROL, g_OPT_ENABLE_NETIF, &enable_netif, sizeof(enable_netif)), EINVAL, NULL); - if (get_skywalk_features() & SK_FEATURE_NETNS) { + if (is_netagent_enabled()) { if (enable_netif) { T_QUIET; T_WITH_ERRNO; T_EXPECT_POSIX_ZERO(setsockopt(tunsock, SYSPROTO_CONTROL, g_OPT_ENABLE_FLOWSWITCH, &enable_flowswitch, sizeof(enable_flowswitch)), NULL); @@ -399,45 +495,208 @@ startover: &enable_flowswitch, sizeof(enable_flowswitch)), ENOTSUP, NULL); } - if (enable_channel) { + if (channel_count) { if (g_is_ipsec_test && !enable_netif) { /* ipsec doesn't support channels without a netif */ T_QUIET; T_EXPECT_POSIX_FAILURE(setsockopt(tunsock, SYSPROTO_CONTROL, g_OPT_ENABLE_CHANNEL, - &enable_channel, sizeof(enable_channel)), EOPNOTSUPP, NULL); - uuid_clear(uuid); - uuidlen = sizeof(uuid_t); + &channel_count, sizeof(channel_count)), EOPNOTSUPP, NULL); + for (int i = 0; i < channel_count; i++) { + uuid_clear(uuid[i]); + } + uuidlen = sizeof(uuid_t) * (unsigned int)channel_count; T_QUIET; T_EXPECT_POSIX_FAILURE(getsockopt(tunsock, SYSPROTO_CONTROL, g_OPT_GET_CHANNEL_UUID, uuid, &uuidlen), ENXIO, NULL); - T_QUIET; T_EXPECT_EQ_ULONG((unsigned long)uuidlen, sizeof(uuid_t), NULL); - T_QUIET; T_EXPECT_TRUE(uuid_is_null(uuid), NULL); + T_QUIET; T_EXPECT_EQ_ULONG((unsigned long)uuidlen, sizeof(uuid_t) * (unsigned long)channel_count, NULL); + for (int i = 0; i < channel_count; i++) { + T_QUIET; T_EXPECT_TRUE(uuid_is_null(uuid[i]), NULL); + } } else { T_QUIET; T_WITH_ERRNO; T_EXPECT_POSIX_ZERO(setsockopt(tunsock, SYSPROTO_CONTROL, g_OPT_ENABLE_CHANNEL, - &enable_channel, sizeof(enable_channel)), NULL); - uuid_clear(uuid); - uuidlen = sizeof(uuid_t); + &channel_count, sizeof(channel_count)), NULL); + for (int i = 0; i < channel_count; i++) { + uuid_clear(uuid[i]); + } + uuidlen = sizeof(uuid_t) * (unsigned int)channel_count; T_QUIET; T_WITH_ERRNO; T_EXPECT_POSIX_ZERO(getsockopt(tunsock, SYSPROTO_CONTROL, g_OPT_GET_CHANNEL_UUID, uuid, &uuidlen), NULL); - T_QUIET; T_EXPECT_EQ_ULONG((unsigned long)uuidlen, sizeof(uuid_t), NULL); - T_QUIET; T_EXPECT_FALSE(uuid_is_null(uuid), NULL); + T_QUIET; T_EXPECT_EQ_ULONG((unsigned long)uuidlen, sizeof(uuid_t) * (unsigned long)channel_count, NULL); + for (int i = 0; i < channel_count; i++) { + T_QUIET; T_EXPECT_FALSE(uuid_is_null(uuid[i]), NULL); + } } } else { T_QUIET; T_EXPECT_POSIX_FAILURE(setsockopt(tunsock, SYSPROTO_CONTROL, g_OPT_ENABLE_CHANNEL, - &enable_channel, sizeof(enable_channel)), ENXIO, NULL); - uuid_clear(uuid); - uuidlen = sizeof(uuid_t); + &channel_count, sizeof(channel_count)), ENXIO, NULL); + for (int i = 0; i < channel_count; i++) { + uuid_clear(uuid[i]); + } + uuidlen = sizeof(uuid_t) * (unsigned int)channel_count; T_QUIET; T_EXPECT_POSIX_FAILURE(getsockopt(tunsock, SYSPROTO_CONTROL, g_OPT_GET_CHANNEL_UUID, uuid, &uuidlen), ENXIO, NULL); - T_QUIET; T_EXPECT_EQ_ULONG((unsigned long)uuidlen, sizeof(uuid_t), NULL); - T_QUIET; T_EXPECT_TRUE(uuid_is_null(uuid), NULL); + T_QUIET; T_EXPECT_EQ_ULONG((unsigned long)uuidlen, sizeof(uuid_t) * (unsigned long)channel_count, NULL); + for (int i = 0; i < channel_count; i++) { + T_QUIET; T_EXPECT_TRUE(uuid_is_null(uuid[i]), NULL); + } + } + + check_enables(tunsock, enable_netif, enable_flowswitch, channel_count, uuid); + + //T_LOG("Returning tunsock %d", tunsock); + + return tunsock; +} + +/* This version of the test expects channels to be enabled before connect + * Once the utun driver is converted, rename this to just create_tunsock + */ +static int +create_tunsock_new(int enable_netif, int enable_flowswitch, int channel_count, uuid_t uuid[]) +{ + int tunsock; + struct ctl_info kernctl_info; + struct sockaddr_ctl kernctl_addr; + uuid_t scratchuuid[channel_count]; + if (!uuid) { + uuid = scratchuuid; + } + socklen_t uuidlen; + +startover: + + T_QUIET; T_EXPECT_POSIX_SUCCESS(tunsock = socket(PF_SYSTEM, SOCK_DGRAM, SYSPROTO_CONTROL), NULL); + + memset(&kernctl_info, 0, sizeof(kernctl_info)); + strlcpy(kernctl_info.ctl_name, g_CONTROL_NAME, sizeof(kernctl_info.ctl_name)); + T_QUIET; T_WITH_ERRNO; T_EXPECT_POSIX_ZERO(ioctl(tunsock, CTLIOCGINFO, &kernctl_info), NULL); + + memset(&kernctl_addr, 0, sizeof(kernctl_addr)); + kernctl_addr.sc_len = sizeof(kernctl_addr); + kernctl_addr.sc_family = AF_SYSTEM; + kernctl_addr.ss_sysaddr = AF_SYS_CONTROL; + kernctl_addr.sc_id = kernctl_info.ctl_id; + kernctl_addr.sc_unit = 0; + + T_LOG("%s: enable_netif = %d, enable_flowswitch = %d, channel_count = %d", + __func__, enable_netif, enable_flowswitch, channel_count); + + T_QUIET; T_EXPECT_POSIX_FAILURE(setsockopt(tunsock, SYSPROTO_CONTROL, g_OPT_ENABLE_NETIF, + &enable_netif, sizeof(enable_netif)), EINVAL, NULL); + T_QUIET; T_EXPECT_POSIX_FAILURE(setsockopt(tunsock, SYSPROTO_CONTROL, g_OPT_ENABLE_FLOWSWITCH, + &enable_flowswitch, sizeof(enable_flowswitch)), EINVAL, NULL); + T_QUIET; T_EXPECT_POSIX_FAILURE(setsockopt(tunsock, SYSPROTO_CONTROL, g_OPT_ENABLE_CHANNEL, + &channel_count, sizeof(channel_count)), EINVAL, NULL); + for (int i = 0; i < channel_count; i++) { + uuid_clear(uuid[i]); + } + uuidlen = sizeof(uuid_t) * (unsigned int)channel_count; + T_QUIET; T_EXPECT_POSIX_FAILURE(getsockopt(tunsock, SYSPROTO_CONTROL, g_OPT_GET_CHANNEL_UUID, + uuid, &uuidlen), EINVAL, NULL); + T_QUIET; T_EXPECT_EQ_ULONG((unsigned long)uuidlen, sizeof(uuid_t) * (unsigned long)channel_count, NULL); + for (int i = 0; i < channel_count; i++) { + T_QUIET; T_EXPECT_TRUE(uuid_is_null(uuid[i]), NULL); + } + + T_QUIET; T_WITH_ERRNO; T_EXPECT_POSIX_ZERO(bind(tunsock, (struct sockaddr *)&kernctl_addr, sizeof(kernctl_addr)), NULL); + + T_QUIET; T_WITH_ERRNO; T_EXPECT_POSIX_ZERO(setsockopt(tunsock, SYSPROTO_CONTROL, g_OPT_ENABLE_NETIF, + &enable_netif, sizeof(enable_netif)), NULL); + T_QUIET; T_EXPECT_POSIX_FAILURE(setsockopt(tunsock, SYSPROTO_CONTROL, g_OPT_ENABLE_FLOWSWITCH, + &enable_flowswitch, sizeof(enable_flowswitch)), EINVAL, NULL); + T_QUIET; T_WITH_ERRNO; T_EXPECT_POSIX_ZERO(setsockopt(tunsock, SYSPROTO_CONTROL, g_OPT_ENABLE_CHANNEL, + &channel_count, sizeof(channel_count)), NULL); + + for (int i = 0; i < channel_count; i++) { + uuid_clear(uuid[i]); + } + uuidlen = sizeof(uuid_t) * (unsigned int)channel_count; + T_QUIET; T_EXPECT_POSIX_FAILURE(getsockopt(tunsock, SYSPROTO_CONTROL, g_OPT_GET_CHANNEL_UUID, + uuid, &uuidlen), ENXIO, NULL); + T_QUIET; T_EXPECT_EQ_ULONG((unsigned long)uuidlen, sizeof(uuid_t) * (unsigned long)channel_count, NULL); + for (int i = 0; i < channel_count; i++) { + T_QUIET; T_EXPECT_TRUE(uuid_is_null(uuid[i]), NULL); + } + + int error = connect(tunsock, (struct sockaddr *)&kernctl_addr, sizeof(kernctl_addr)); + if (error == -1 && errno == EBUSY) { + /* XXX remove this retry nonsense when this is fixed: + * creating an interface without specifying specific interface name should not return EBUSY + */ + close(tunsock); + T_LOG("connect got EBUSY, sleeping 1 second before retry"); + sleep(1); + goto startover; + } + if (g_is_ipsec_test && channel_count && !enable_netif) { + /* ipsec doesn't support channels without a netif */ + T_QUIET; T_EXPECT_POSIX_FAILURE(error, ENOTSUP, "connect() == -1 && errno == ENOTSUP"); + } else { + T_QUIET; T_WITH_ERRNO; T_EXPECT_POSIX_ZERO(error, "connect() == 0"); + } + + T_QUIET; T_EXPECT_POSIX_FAILURE(setsockopt(tunsock, SYSPROTO_CONTROL, g_OPT_ENABLE_NETIF, + &enable_netif, sizeof(enable_netif)), EINVAL, NULL); + + if (g_is_ipsec_test && channel_count && !enable_netif) { + /* Connect failed above, so we get EINVAL */ + T_QUIET; T_WITH_ERRNO; T_EXPECT_POSIX_FAILURE(setsockopt(tunsock, SYSPROTO_CONTROL, g_OPT_ENABLE_FLOWSWITCH, + &enable_flowswitch, sizeof(enable_flowswitch)), EINVAL, NULL); + } else { + if (is_netagent_enabled()) { + if (enable_netif) { + T_QUIET; T_WITH_ERRNO; T_EXPECT_POSIX_ZERO(setsockopt(tunsock, SYSPROTO_CONTROL, g_OPT_ENABLE_FLOWSWITCH, + &enable_flowswitch, sizeof(enable_flowswitch)), NULL); + } else { + T_QUIET; T_WITH_ERRNO; T_EXPECT_POSIX_FAILURE(setsockopt(tunsock, SYSPROTO_CONTROL, g_OPT_ENABLE_FLOWSWITCH, + &enable_flowswitch, sizeof(enable_flowswitch)), ENOENT, NULL); + } + } else { + T_QUIET; T_WITH_ERRNO; T_EXPECT_POSIX_FAILURE(setsockopt(tunsock, SYSPROTO_CONTROL, g_OPT_ENABLE_FLOWSWITCH, + &enable_flowswitch, sizeof(enable_flowswitch)), ENOTSUP, NULL); + } + } + + T_QUIET; T_EXPECT_POSIX_FAILURE(setsockopt(tunsock, SYSPROTO_CONTROL, g_OPT_ENABLE_CHANNEL, + &channel_count, sizeof(channel_count)), EINVAL, NULL); + + for (int i = 0; i < channel_count; i++) { + uuid_clear(uuid[i]); + } + uuidlen = sizeof(uuid_t) * (unsigned int)channel_count; + if (!channel_count || (g_is_ipsec_test && channel_count && !enable_netif)) { + /* ipsec doesn't support channels without a netif */ + if (g_is_ipsec_test && channel_count && !enable_netif) { + /* Unfortunately, the connect incorrectly unwinds the bind if it get an error. + * until that is fixed, expect EINVAL here + */ + T_QUIET; T_EXPECT_POSIX_FAILURE(getsockopt(tunsock, SYSPROTO_CONTROL, g_OPT_GET_CHANNEL_UUID, + uuid, &uuidlen), EINVAL, NULL); + } else { + T_QUIET; T_EXPECT_POSIX_FAILURE(getsockopt(tunsock, SYSPROTO_CONTROL, g_OPT_GET_CHANNEL_UUID, + uuid, &uuidlen), ENXIO, NULL); + } + T_QUIET; T_EXPECT_EQ_ULONG((unsigned long)uuidlen, sizeof(uuid_t) * (unsigned long)channel_count, NULL); + for (int i = 0; i < channel_count; i++) { + T_QUIET; T_EXPECT_TRUE(uuid_is_null(uuid[i]), NULL); + } + } else { + uuidlen = sizeof(uuid_t) * (unsigned int)channel_count; + T_QUIET; T_WITH_ERRNO; T_EXPECT_POSIX_ZERO(getsockopt(tunsock, SYSPROTO_CONTROL, g_OPT_GET_CHANNEL_UUID, + uuid, &uuidlen), NULL); + T_QUIET; T_EXPECT_EQ_ULONG((unsigned long)uuidlen, sizeof(uuid_t) * (unsigned long)channel_count, NULL); + for (int i = 0; i < channel_count; i++) { + T_QUIET; T_EXPECT_FALSE(uuid_is_null(uuid[i]), NULL); + } } - check_enables(tunsock, enable_netif, enable_flowswitch, enable_channel, uuid); + check_enables(tunsock, enable_netif, enable_flowswitch, channel_count, uuid); //T_LOG("Returning tunsock %d", tunsock); return tunsock; } +static int (*create_tunsock)(int enable_netif, int enable_flowswitch, int channel_count, uuid_t uuid[]) = create_tunsock_new; + #if 0 static void ipsec_stats(void) @@ -458,21 +717,21 @@ static void permute_enables(void) { int tunsock; - T_EXPECT_GE(tunsock = create_tunsock(false, false, false), 0, NULL); + T_EXPECT_GE(tunsock = create_tunsock(false, false, false, NULL), 0, NULL); T_QUIET; T_WITH_ERRNO; T_EXPECT_POSIX_ZERO(close(tunsock), NULL); - T_EXPECT_GE(tunsock = create_tunsock(false, false, true), 0, NULL); + T_EXPECT_GE(tunsock = create_tunsock(false, false, true, NULL), 0, NULL); T_QUIET; T_WITH_ERRNO; T_EXPECT_POSIX_ZERO(close(tunsock), NULL); - T_EXPECT_GE(tunsock = create_tunsock(false, true, false), 0, NULL); + T_EXPECT_GE(tunsock = create_tunsock(false, true, false, NULL), 0, NULL); T_QUIET; T_WITH_ERRNO; T_EXPECT_POSIX_ZERO(close(tunsock), NULL); - T_EXPECT_GE(tunsock = create_tunsock(false, true, true), 0, NULL); + T_EXPECT_GE(tunsock = create_tunsock(false, true, true, NULL), 0, NULL); T_QUIET; T_WITH_ERRNO; T_EXPECT_POSIX_ZERO(close(tunsock), NULL); - T_EXPECT_GE(tunsock = create_tunsock(true, false, false), 0, NULL); + T_EXPECT_GE(tunsock = create_tunsock(true, false, false, NULL), 0, NULL); T_QUIET; T_WITH_ERRNO; T_EXPECT_POSIX_ZERO(close(tunsock), NULL); - T_EXPECT_GE(tunsock = create_tunsock(true, false, true), 0, NULL); + T_EXPECT_GE(tunsock = create_tunsock(true, false, true, NULL), 0, NULL); T_QUIET; T_WITH_ERRNO; T_EXPECT_POSIX_ZERO(close(tunsock), NULL); - T_EXPECT_GE(tunsock = create_tunsock(true, true, false), 0, NULL); + T_EXPECT_GE(tunsock = create_tunsock(true, true, false, NULL), 0, NULL); T_QUIET; T_WITH_ERRNO; T_EXPECT_POSIX_ZERO(close(tunsock), NULL); - T_EXPECT_GE(tunsock = create_tunsock(true, true, true), 0, NULL); + T_EXPECT_GE(tunsock = create_tunsock(true, true, true, NULL), 0, NULL); T_QUIET; T_WITH_ERRNO; T_EXPECT_POSIX_ZERO(close(tunsock), NULL); } @@ -502,9 +761,9 @@ cleanup_tunsock(void) } static void -setup_tunsock(void) +setup_tunsock(int channel_count, uuid_t uuids[]) { - T_ASSERT_GE(g_tunsock = create_tunsock(true, false, true), 0, NULL); + T_ASSERT_GE(g_tunsock = create_tunsock(true, false, channel_count, uuids), 0, NULL); T_ATEND(cleanup_tunsock); char ifname[IFXNAMSIZ]; @@ -529,11 +788,388 @@ setup_tunsock(void) T_DECL(setup_ipsec, "This test sets up an ipsec interface") { setup_ipsec_test(); - setup_tunsock(); + setup_tunsock(1, NULL); } T_DECL(setup_utun, "This test sets up a utun interface") { setup_utun_test(); - setup_tunsock(); + setup_tunsock(1, NULL); +} + +static const int SOCKET_TRAFFIC_CLASSES[] = { + SO_TC_BK_SYS, // BK + SO_TC_BK, // BK + SO_TC_BE, // BE + SO_TC_RD, // BE + SO_TC_OAM, // BE + SO_TC_AV, // VI + SO_TC_RV, // VI + SO_TC_VI, // VI + SO_TC_VO, // VO + SO_TC_CTL, // VO +}; + +// this should match ipsec_find_tx_ring_by_svc in ipsec driver +static const int SOCKET_TC_TO_RING[] = { + 3, + 3, + 2, + 2, + 2, + 1, + 1, + 1, + 0, + 0, +}; + +/* How many sockets map to this ring */ +static const int RING_TO_TC_COUNT[] = { + 2, 3, 3, 2, +}; + +static void +setup_channels_and_rings(int kq, int channel_count, channel_t channels[], channel_ring_t rxrings[], channel_ring_t txrings[], uuid_t uuids[], int cfds[]) +{ + setup_tunsock(channel_count, uuids); + +#if 0 + // give time to enable a tcpdump if desired + T_LOG("Sleeping 10"); + sleep(10); + T_LOG("Done"); +#endif + + for (int ri = 0; ri < channel_count; ri++) { + if (rxrings) { + T_QUIET; T_WITH_ERRNO; T_ASSERT_NOTNULL(channels[ri] = os_channel_create(uuids[ri], 0), NULL); + T_QUIET; T_WITH_ERRNO; T_ASSERT_NOTNULL(rxrings[ri] = os_channel_rx_ring(channels[ri], + os_channel_ring_id(channels[ri], CHANNEL_FIRST_RX_RING)), NULL); + } + if (txrings) { + T_QUIET; T_WITH_ERRNO; T_ASSERT_NOTNULL(channels[ri] = os_channel_create(uuids[ri], 0), NULL); + T_QUIET; T_WITH_ERRNO; T_ASSERT_NOTNULL(rxrings[ri] = os_channel_rx_ring(channels[ri], + os_channel_ring_id(channels[ri], CHANNEL_FIRST_TX_RING)), NULL); + } + + struct kevent kev; + T_QUIET; T_EXPECT_POSIX_SUCCESS(cfds[ri] = os_channel_get_fd(channels[ri]), NULL); + EV_SET(&kev, cfds[ri], EVFILT_READ, EV_ADD | EV_ENABLE, 0, 0, (void *)(uintptr_t)ri); + T_QUIET; T_WITH_ERRNO; T_EXPECT_POSIX_ZERO(kevent(kq, &kev, 1, NULL, 0, NULL), NULL); + } +} + +static void +cleanup_channels_and_rings(int channel_count, channel_t channels[], channel_ring_t rxrings[], channel_ring_t txrings[], uuid_t uuids[]) +{ + for (int ri = 0; ri < channel_count; ri++) { + if (rxrings) { + rxrings[ri] = NULL; + } + if (txrings) { + rxrings[ri] = NULL; + } + os_channel_destroy(channels[ri]); + channels[ri] = NULL; + uuid_clear(uuids[ri]); + } +} + +static void +setup_sockets(int sockets[SO_TC_MAX], int type) +{ + for (int si = 0; si < SO_TC_MAX; si++) { + T_QUIET; T_EXPECT_POSIX_SUCCESS(sockets[si] = socket(PF_INET, type, 0), NULL); + + T_QUIET; T_WITH_ERRNO; T_EXPECT_POSIX_ZERO(setsockopt(sockets[si], SOL_SOCKET, + SO_TRAFFIC_CLASS, &SOCKET_TRAFFIC_CLASSES[si], sizeof(SOCKET_TRAFFIC_CLASSES[si])), NULL); + + // XXX setsockopt(IP_BOUND_IF) here? + + struct sockaddr_in sin; + memset(&sin, 0, sizeof(sin)); + sin.sin_len = sizeof(sin); + sin.sin_family = AF_INET; + sin.sin_addr = g_addr1; + + T_QUIET; T_WITH_ERRNO; T_EXPECT_POSIX_ZERO(bind(sockets[si], (struct sockaddr *)&sin, sizeof(sin)), NULL); + + char sbuf[INET6_ADDRSTRLEN]; + inet_ntop(sin.sin_family, &sin.sin_addr.s_addr, sbuf, sizeof(sbuf)); +#if 0 + T_LOG("%s socket %d bound to %s port %d", + type == SOCK_DGRAM ? "udp" : type == SOCK_STREAM ? "tcp" : "???", + sockets[si], sbuf, ntohs(sin.sin_port)); +#endif + setblocking(sockets[si], false); + } +} + +static void +cleanup_sockets(int sockets[SO_TC_MAX]) +{ + for (int si = 0; si < SO_TC_MAX; si++) { + T_QUIET; T_WITH_ERRNO; T_EXPECT_POSIX_ZERO(close(sockets[si]), NULL); + sockets[si] = -1; + } +} + +static void +drain_ring(channel_ring_t rxring) +{ + uint32_t i, sc = os_channel_available_slot_count(rxring); + channel_slot_t rxprev = NULL; + for (i = 0; i < sc; i++) { + slot_prop_t rxprop; + channel_slot_t rxslot; + + memset(&rxprop, 0, sizeof(rxprop)); + T_QUIET; T_WITH_ERRNO; T_EXPECT_NOTNULL(rxslot = os_channel_get_next_slot(rxring, rxprev, &rxprop), NULL); + T_QUIET; T_ASSERT_NE_UINT(0, rxprop.sp_len, NULL); + T_QUIET; T_ASSERT_NOTNULL((void *)rxprop.sp_buf_ptr, NULL); + + log_hexdump((void *)rxprop.sp_buf_ptr, rxprop.sp_len); + + rxprev = rxslot; + } + if (sc) { + T_QUIET; T_EXPECT_POSIX_ZERO(os_channel_advance_slot(rxring, rxprev), NULL); + } +} + +static void +send_one_packet(int s, int type) +{ + struct sockaddr_in sin; + memset(&sin, 0, sizeof(sin)); + sin.sin_len = sizeof(sin); + sin.sin_family = AF_INET; + sin.sin_addr = g_addr2; + sin.sin_port = ntohs(12345); + + if (type == SOCK_STREAM) { + T_QUIET; T_EXPECT_POSIX_FAILURE(connect(s, (struct sockaddr *)&sin, sizeof(sin)), EINPROGRESS, NULL); + } + if (type == SOCK_DGRAM) { + T_QUIET; T_WITH_ERRNO; T_EXPECT_EQ_LONG((long)sizeof(s), sendto(s, &s, sizeof(s), 0, + (struct sockaddr *)&sin, sizeof(sin)), NULL); + } +} + +static void +expect_empty_rings(int channel_count, channel_ring_t rings[]) +{ + /* Check all the rings and make sure there are no packets */ + for (int ri = 0; ri < channel_count; ri++) { + T_QUIET; T_EXPECT_EQ_UINT(0U, os_channel_available_slot_count(rings[ri]), NULL); + } +} + +static void +xfer_1_packet_singly(int channel_count, int type) +{ + uuid_t uuids[channel_count]; + channel_t channels[channel_count]; + int sockets[SO_TC_MAX]; + channel_ring_t rxrings[channel_count]; + int cfds[channel_count]; + int kq; + + T_QUIET; T_EXPECT_POSIX_SUCCESS(kq = kqueue(), NULL); + + setup_channels_and_rings(kq, channel_count, channels, rxrings, NULL, uuids, cfds); + + setup_sockets(sockets, type); + + for (int si = 0; si < SO_TC_MAX; si++) { + expect_empty_rings(channel_count, rxrings); + + send_one_packet(sockets[si], type); + + int expected_ring = channel_count == 1 ? 0 : SOCKET_TC_TO_RING[si]; + + /* Wait for the packet delivery and check that it's only one packet and on the correct ring */ + struct kevent kev[channel_count + 1]; + int nev; + memset(kev, 0, sizeof(kev)); + struct timespec to = { 0, 100 * NSEC_PER_MSEC }; // 100 ms + T_QUIET; T_EXPECT_POSIX_SUCCESS(nev = kevent(kq, NULL, 0, kev, channel_count + 1, &to), NULL); + T_QUIET; T_EXPECT_EQ_INT(nev, 1, NULL); + T_QUIET; T_EXPECT_EQ_PTR((void *)kev[0].ident, (void *)(uintptr_t)cfds[expected_ring], NULL); + T_QUIET; T_EXPECT_EQ_PTR(kev[0].udata, (void *)(uintptr_t)expected_ring, NULL); + T_QUIET; T_EXPECT_EQ_SHORT(kev[0].filter, (short)EVFILT_READ, NULL); + T_QUIET; T_EXPECT_FALSE(kev[0].flags & EV_ERROR, NULL); + + /* Make sure it comes out the expected interface */ + for (int ri = 0; ri < channel_count; ri++) { + errno = 0; + + uint32_t sc = os_channel_available_slot_count(rxrings[ri]); + + /* Check that the packet appears only on the expected ring and + * is the only packet on the expected ring. + */ + T_QUIET; T_EXPECT_EQ_UINT(ri == expected_ring, sc, NULL); + + if ((ri == expected_ring) == sc) { + T_PASS("tc index %d ring %d expected ring %d slot count %u", si, ri, expected_ring, sc); + } else { + T_FAIL("tc index %d ring %d expected ring %d slot count %u", si, ri, expected_ring, sc); + } + + drain_ring(rxrings[ri]); + } + } + + cleanup_sockets(sockets); + + cleanup_channels_and_rings(channel_count, channels, rxrings, NULL, uuids); + + T_QUIET; T_WITH_ERRNO; T_EXPECT_POSIX_ZERO(close(kq), NULL); +} + +T_DECL(ipsec35889979u1s, "transfers 1 packet at a time of each sevice class over udp to a single ring") +{ + setup_ipsec_test(); + xfer_1_packet_singly(1, SOCK_DGRAM); +} + +T_DECL(ipsec35889979u4s, "transfers 1 packet at a time of each sevice class over udp to 4 rings") +{ + setup_ipsec_test(); + xfer_1_packet_singly(4, SOCK_DGRAM); +} + +T_DECL(ipsec35889979t1s, "transfers 1 packet at a time of each sevice class over tcp to a single ring") +{ + setup_ipsec_test(); + xfer_1_packet_singly(1, SOCK_STREAM); +} + + +T_DECL(ipsec35889979t4s, "transfers 1 packet at a time of each sevice class over tcp to 4 rings", + /* This test will fail because tcp syn packets get elevated + * due to ack prioritization + */ + T_META_ENABLED(false)) +{ + setup_ipsec_test(); + xfer_1_packet_singly(4, SOCK_STREAM); +} + +static void +xfer_1_packet_together(int channel_count, int type) +{ + uuid_t uuids[channel_count]; + channel_t channels[channel_count]; + int sockets[SO_TC_MAX]; + channel_ring_t rxrings[channel_count]; + int cfds[channel_count]; + int kq; + + T_QUIET; T_EXPECT_POSIX_SUCCESS(kq = kqueue(), NULL); + + setup_channels_and_rings(kq, channel_count, channels, rxrings, NULL, uuids, cfds); + + setup_sockets(sockets, type); + + for (int si = 0; si < SO_TC_MAX; si++) { + expect_empty_rings(channel_count, rxrings); + + send_one_packet(sockets[si], type); + } + + /* Sleep to make sure all packets get delivered */ + struct timespec to = { 0, 100 * NSEC_PER_MSEC }; // 100 ms + nanosleep(&to, NULL); + + /* Wait for the packet delivery and check that all rings event */ + struct kevent kev[channel_count + 1]; + int nev; + memset(kev, 0, sizeof(kev)); + T_QUIET; T_EXPECT_POSIX_SUCCESS(nev = kevent(kq, NULL, 0, kev, channel_count + 1, &to), NULL); + T_QUIET; T_EXPECT_EQ_INT(nev, channel_count, NULL); + + uint32_t found[channel_count]; + memset(found, 0, sizeof(found)); + for (int e = 0; e < nev; e++) { + T_LOG("kevent %lu filter 0x%4x flags 0x%04x fflags 0x%08x data %"PRIdPTR" udata %p", + kev[e].ident, kev[e].filter, kev[e].flags, kev[e].fflags, kev[e].data, kev[e].udata); + + T_QUIET; T_ASSERT_GE_PTR(kev[e].udata, (void *)0, NULL); + T_QUIET; T_ASSERT_LT_PTR(kev[e].udata, (void *)(intptr_t)channel_count, NULL); + int ri = (int)kev[e].udata; + T_QUIET; T_EXPECT_EQ_UINT(found[ri], 0U, NULL); + + T_QUIET; T_EXPECT_EQ_ULONG(kev[e].ident, (uintptr_t)cfds[ri], NULL); + T_QUIET; T_EXPECT_EQ_SHORT(kev[e].filter, (short)EVFILT_READ, NULL); + T_QUIET; T_EXPECT_FALSE(kev[e].flags & EV_ERROR, NULL); + + if (channel_count == 1) { + T_QUIET; T_EXPECT_EQ_LONG(kev[e].data, (long)SO_TC_MAX, NULL); + } else { + T_QUIET; T_EXPECT_EQ_LONG(kev[e].data, (long)RING_TO_TC_COUNT[ri], NULL); + } + + found[ri] += (uint32_t)kev[e].data; + } + /* Check that something came out of all rings */ + for (int ri = 0; ri < channel_count; ri++) { + T_QUIET; T_EXPECT_NE_UINT(found[ri], 0U, NULL); + } + + /* Make sure it comes out the expected interface */ + for (int ri = 0; ri < channel_count; ri++) { + uint32_t sc = os_channel_available_slot_count(rxrings[ri]); + if (channel_count == 1) { + if (sc == SO_TC_MAX) { + T_PASS("ring %d got %"PRIu32" slots expecting %"PRIu32"", ri, sc, SO_TC_MAX); + } else { + T_FAIL("ring %d got %"PRIu32" slots expecting %"PRIu32"", ri, sc, SO_TC_MAX); + } + } else { + if (sc == (uint32_t)RING_TO_TC_COUNT[ri]) { + T_PASS("ring %d got %"PRIu32" slots expecting %"PRIu32"", ri, sc, (uint32_t)RING_TO_TC_COUNT[ri]); + } else { + T_FAIL("ring %d got %"PRIu32" slots expecting %"PRIu32"", ri, sc, (uint32_t)RING_TO_TC_COUNT[ri]); + } + } + + drain_ring(rxrings[ri]); + } + + cleanup_sockets(sockets); + + cleanup_channels_and_rings(channel_count, channels, rxrings, NULL, uuids); + + T_QUIET; T_WITH_ERRNO; T_EXPECT_POSIX_ZERO(close(kq), NULL); +} + +T_DECL(ipsec35889979u1m, "transfers 1 packet together of each sevice class over udp to a single ring") +{ + setup_ipsec_test(); + xfer_1_packet_together(1, SOCK_DGRAM); +} + +T_DECL(ipsec35889979u4m, "transfers 1 packet together of each sevice class over udp to 4 rings") +{ + setup_ipsec_test(); + xfer_1_packet_together(4, SOCK_DGRAM); +} + +T_DECL(ipsec35889979t1m, "transfers 1 packet together of each sevice class over tcp to a single ring") +{ + setup_ipsec_test(); + xfer_1_packet_together(1, SOCK_STREAM); +} + +T_DECL(ipsec35889979t4m, "transfers 1 packet together of each sevice class over tcp to 4 rings", + /* This test will fail because tcp syn packets get elevated + * due to ack prioritization + */ + T_META_ENABLED(false)) +{ + setup_ipsec_test(); + xfer_1_packet_together(4, SOCK_STREAM); } diff --git a/tests/no32exec_35914211.c b/tests/no32exec_35914211.c index ea36703aa..b1f87634f 100644 --- a/tests/no32exec_35914211.c +++ b/tests/no32exec_35914211.c @@ -3,15 +3,27 @@ #include #include #include +#include +#include +#include -T_DECL(no32exec_bootarg, "make sure the no32exec boot-arg is honored", T_META_BOOTARGS_SET("-no32exec")) +static int binprefs_child_is_64 = 0; + +static void +signal_handler(__unused int sig) +{ + binprefs_child_is_64 = 1; + return; +} + +T_DECL(no32exec_bootarg_with_spawn, "make sure the no32exec boot-arg is honored, using posix_spawn", T_META_BOOTARGS_SET("-no32exec")) { int spawn_ret, pid; char path[1024]; uint32_t size = sizeof(path); - T_ASSERT_EQ(_NSGetExecutablePath(path, &size), 0, NULL); - T_ASSERT_LT(strlcat(path, "_helper", size), size, NULL); + T_QUIET; T_ASSERT_EQ(_NSGetExecutablePath(path, &size), 0, NULL); + T_QUIET; T_ASSERT_LT(strlcat(path, "_helper", size), size, NULL); spawn_ret = posix_spawn(&pid, path, NULL, NULL, NULL, NULL); if (spawn_ret == 0) { @@ -21,3 +33,61 @@ T_DECL(no32exec_bootarg, "make sure the no32exec boot-arg is honored", T_META_BO } T_ASSERT_EQ(spawn_ret, EBADARCH, NULL); } + +T_DECL(no32exec_bootarg_with_spawn_binprefs, "make sure the no32exec boot-arg is honored, using posix_spawn" + "with binprefs on a fat i386/x86_64 Mach-O", T_META_BOOTARGS_SET("-no32exec")) +{ + int pid, ret; + posix_spawnattr_t spawnattr; + cpu_type_t cpuprefs[] = { CPU_TYPE_X86, CPU_TYPE_X86_64 }; + + char path[1024]; + uint32_t size = sizeof(path); + T_QUIET; T_ASSERT_EQ(_NSGetExecutablePath(path, &size), 0, NULL); + T_QUIET; T_ASSERT_LT(strlcat(path, "_helper_binprefs", size), size, NULL); + + T_QUIET; T_ASSERT_NE(signal(SIGUSR1, signal_handler), SIG_ERR, "signal"); + + ret = posix_spawnattr_init(&spawnattr); + T_QUIET; T_ASSERT_POSIX_SUCCESS(ret, "posix_spawnattr_init"); + + ret = posix_spawnattr_setbinpref_np(&spawnattr, sizeof(cpuprefs) / sizeof(cpuprefs[0]), cpuprefs, NULL); + T_QUIET; T_ASSERT_POSIX_SUCCESS(ret, "posix_spawnattr_setbinpref_np"); + + ret = posix_spawn(&pid, path, NULL, &spawnattr, NULL, NULL); + T_ASSERT_EQ(ret, 0, "posix_spawn should succeed despite 32-bit binpref appearing first"); + + sleep(1); + ret = kill(pid, SIGUSR1); // ping helper; helper should ping back if running 64-bit + T_QUIET; T_ASSERT_POSIX_SUCCESS(ret, "kill"); + + ret = wait(NULL); + T_QUIET; T_ASSERT_EQ(ret, pid, "child pid"); + + T_ASSERT_EQ(binprefs_child_is_64, 1, "child process should be running in 64-bit mode"); + + ret = posix_spawnattr_destroy(&spawnattr); + T_QUIET; T_ASSERT_EQ(ret, 0, "posix_spawnattr_destroy"); +} + +T_DECL(no32_exec_bootarg_with_exec, "make sure the no32exec boot-arg is honored, using fork and exec", T_META_BOOTARGS_SET("-no32exec")) +{ + int pid; + char path[1024]; + uint32_t size = sizeof(path); + + T_QUIET; T_ASSERT_EQ(_NSGetExecutablePath(path, &size), 0, NULL); + T_QUIET; T_ASSERT_LT(strlcat(path, "_helper", size), size, NULL); + + pid = fork(); + T_QUIET; T_ASSERT_POSIX_SUCCESS(pid, "fork"); + + if (pid == 0) { /* child */ + execve(path, NULL, NULL); /* this should fail, resulting in the call to exit below */ + exit(errno); + } else { /* parent */ + int wait_ret = 0; + waitpid(pid, &wait_ret, 0); + T_ASSERT_EQ(WEXITSTATUS(wait_ret), EBADARCH, "execve should set errno = EBADARCH"); + } +} diff --git a/tests/no32exec_35914211_helper_binprefs.c b/tests/no32exec_35914211_helper_binprefs.c new file mode 100644 index 000000000..0909633eb --- /dev/null +++ b/tests/no32exec_35914211_helper_binprefs.c @@ -0,0 +1,34 @@ +#include +#include +#include + +int can_signal_parent = 0; + +void +signal_handler(int sig) +{ + if (sig == SIGUSR1) { + can_signal_parent = 1; + } + return; +} + +T_DECL(no32exec_bootarg_with_spawn_binprefs_helper, "helper for no32exec_bootarg_with_spawn_binprefs test") +{ + unsigned long ptrSize = sizeof(long); + int ppid = getppid(); + + signal(SIGUSR1, signal_handler); + signal(SIGALRM, signal_handler); + + // parent will signal us if they're no32exec_bootarg_with_spawn_binprefs, otherwise timeout + alarm(3); + pause(); + + /* signal to parent process if we are running in 64-bit mode */ + if (can_signal_parent && ptrSize == 8) { + kill(ppid, SIGUSR1); + } + + T_SKIP("nothing to see here"); +} diff --git a/tests/os_proc.c b/tests/os_proc.c new file mode 100644 index 000000000..9f2f0cea7 --- /dev/null +++ b/tests/os_proc.c @@ -0,0 +1,51 @@ +#include +#include +#include +#include +#include + +T_GLOBAL_META(T_META_RUN_CONCURRENTLY(true)); + +#if !TARGET_OS_OSX +void test_os_proc_available_memory(void); +extern int getpid(void); + +T_DECL(test_os_proc_available_memory, "Basic available memory") +{ + kern_return_t err; + task_vm_info_data_t vm_info = {}; + mach_msg_type_number_t count = TASK_VM_INFO_REV4_COUNT; + uint64_t remainingBytes; + + err = task_info(mach_task_self(), TASK_VM_INFO, (task_info_t)&vm_info, &count); + remainingBytes = os_proc_available_memory(); + + T_ASSERT_MACH_SUCCESS(err, "verify task_info call succeeded"); + T_EXPECT_EQ(count, TASK_VM_INFO_REV4_COUNT, "task_info count(%d) is equal to TASK_VM_INFO_REV4_COUNT (%d)\n", count, TASK_VM_INFO_REV4_COUNT); + T_EXPECT_NE(remainingBytes, 0ULL, "os_proc_available_memory() should not return 0"); + T_EXPECT_NE(vm_info.limit_bytes_remaining, 0ULL, "vm_info.limit_bytes_remaining should not return 0"); + T_EXPECT_EQ(vm_info.limit_bytes_remaining, remainingBytes, + "task_info --rev4 call returned value 0x%llx for vm_info.limit_bytes_remaining. Expected 0x%llx", + vm_info.limit_bytes_remaining, remainingBytes); + + /* this should now make the available memory return 0 */ + proc_track_dirty(getpid(), PROC_DIRTY_TRACK); + + count = TASK_VM_INFO_REV4_COUNT; + err = task_info(mach_task_self(), TASK_VM_INFO, (task_info_t)&vm_info, &count); + remainingBytes = os_proc_available_memory(); + + T_ASSERT_MACH_SUCCESS(err, "verify task_info call succeeded"); + T_EXPECT_EQ(count, TASK_VM_INFO_REV4_COUNT, "task_info count(%d) is equal to TASK_VM_INFO_REV4_COUNT\n", count); + T_EXPECT_EQ(remainingBytes, 0ULL, "os_proc_available_memory() should return 0"); + T_EXPECT_EQ(vm_info.limit_bytes_remaining, 0ULL, "vm_info.limit_bytes_remaining should return 0"); + T_EXPECT_EQ(vm_info.limit_bytes_remaining, remainingBytes, + "task_info --rev4 call returned value 0x%llx for vm_info.limit_bytes_remaining. Expected 0x%llx", + vm_info.limit_bytes_remaining, remainingBytes); +} +#else +T_DECL(test_os_proc_available_memory, "Basic available memory") +{ + T_SKIP("Not available on macOS"); +} +#endif diff --git a/tests/os_refcnt.c b/tests/os_refcnt.c new file mode 100644 index 000000000..36263be20 --- /dev/null +++ b/tests/os_refcnt.c @@ -0,0 +1,394 @@ +#include +#include +#include +#include +#include + +#define DEVELOPMENT 1 +#define DEBUG 0 +#define XNU_KERNEL_PRIVATE 1 + +#define OS_REFCNT_DEBUG 1 +#define STRESS_TESTS 0 + +void handle_panic(const char *func, char *str, ...); +#define panic(...) handle_panic(__func__, __VA_ARGS__) + +#include "../libkern/os/refcnt.h" +#include "../libkern/os/refcnt.c" + +T_GLOBAL_META(T_META_RUN_CONCURRENTLY(true)); + +/* import some of the refcnt internal state for testing */ +extern bool ref_debug_enable; +os_refgrp_decl_extern(global_ref_group); + +T_GLOBAL_META( + T_META_NAMESPACE("os_refcnt"), + T_META_CHECK_LEAKS(false) + ); + +T_DECL(os_refcnt, "Basic atomic refcount") +{ + struct os_refcnt rc; + os_ref_init(&rc, NULL); + T_ASSERT_EQ_UINT(os_ref_get_count(&rc), 1, "refcount correctly initialized"); + + os_ref_retain(&rc); + os_ref_retain(&rc); + T_ASSERT_EQ_UINT(os_ref_get_count(&rc), 3, "retain increased count"); + + os_ref_count_t x = os_ref_release(&rc); + T_ASSERT_EQ_UINT(os_ref_get_count(&rc), 2, "release decreased count"); + T_ASSERT_EQ_UINT(x, 2, "release returned correct count"); + + os_ref_release_live(&rc); + T_ASSERT_EQ_UINT(os_ref_get_count(&rc), 1, "release_live decreased count"); + + x = os_ref_release(&rc); + T_ASSERT_EQ_UINT(os_ref_get_count(&rc), 0, "released"); + T_ASSERT_EQ_UINT(x, 0, "returned released"); + + os_ref_init(&rc, NULL); + x = os_ref_retain_try(&rc); + T_ASSERT_GT_INT(x, 0, "try retained"); + + (void)os_ref_release(&rc); + (void)os_ref_release(&rc); + T_QUIET; T_ASSERT_EQ_UINT(os_ref_get_count(&rc), 0, "release"); + + x = os_ref_retain_try(&rc); + T_ASSERT_EQ_INT(x, 0, "try failed"); +} + +T_DECL(refcnt_raw, "Raw refcount") +{ + os_ref_atomic_t rc; + os_ref_init_raw(&rc, NULL); + T_ASSERT_EQ_UINT(os_ref_get_count_raw(&rc), 1, "refcount correctly initialized"); + + os_ref_retain_raw(&rc, NULL); + os_ref_retain_raw(&rc, NULL); + T_ASSERT_EQ_UINT(os_ref_get_count_raw(&rc), 3, "retain increased count"); + + os_ref_count_t x = os_ref_release_raw(&rc, NULL); + T_ASSERT_EQ_UINT(os_ref_get_count_raw(&rc), 2, "release decreased count"); + T_ASSERT_EQ_UINT(x, 2, "release returned correct count"); + + os_ref_release_live_raw(&rc, NULL); + T_ASSERT_EQ_UINT(os_ref_get_count_raw(&rc), 1, "release_live decreased count"); + + x = os_ref_release_raw(&rc, NULL); + T_ASSERT_EQ_UINT(os_ref_get_count_raw(&rc), 0, "released"); + T_ASSERT_EQ_UINT(x, 0, "returned released"); + + os_ref_init_raw(&rc, NULL); + x = os_ref_retain_try_raw(&rc, NULL); + T_ASSERT_GT_INT(x, 0, "try retained"); + + (void)os_ref_release_raw(&rc, NULL); + (void)os_ref_release_raw(&rc, NULL); + T_QUIET; T_ASSERT_EQ_UINT(os_ref_get_count_raw(&rc), 0, "release"); + + x = os_ref_retain_try_raw(&rc, NULL); + T_ASSERT_EQ_INT(x, 0, "try failed"); +} + +T_DECL(refcnt_locked, "Locked refcount") +{ + struct os_refcnt rc; + os_ref_init(&rc, NULL); + + os_ref_retain_locked(&rc); + os_ref_retain_locked(&rc); + T_ASSERT_EQ_UINT(os_ref_get_count(&rc), 3, "retain increased count"); + + os_ref_count_t x = os_ref_release_locked(&rc); + T_ASSERT_EQ_UINT(os_ref_get_count(&rc), 2, "release decreased count"); + T_ASSERT_EQ_UINT(x, 2, "release returned correct count"); + + (void)os_ref_release_locked(&rc); + x = os_ref_release_locked(&rc); + T_ASSERT_EQ_UINT(os_ref_get_count(&rc), 0, "released"); + T_ASSERT_EQ_UINT(x, 0, "returned released"); +} + +T_DECL(refcnt_raw_locked, "Locked raw refcount") +{ + os_ref_atomic_t rc; + os_ref_init_raw(&rc, NULL); + + os_ref_retain_locked_raw(&rc, NULL); + os_ref_retain_locked_raw(&rc, NULL); + T_ASSERT_EQ_UINT(os_ref_get_count_raw(&rc), 3, "retain increased count"); + + os_ref_count_t x = os_ref_release_locked_raw(&rc, NULL); + T_ASSERT_EQ_UINT(os_ref_get_count_raw(&rc), 2, "release decreased count"); + T_ASSERT_EQ_UINT(x, 2, "release returned correct count"); + + (void)os_ref_release_locked_raw(&rc, NULL); + x = os_ref_release_locked_raw(&rc, NULL); + T_ASSERT_EQ_UINT(os_ref_get_count_raw(&rc), 0, "released"); + T_ASSERT_EQ_UINT(x, 0, "returned released"); +} + +T_DECL(refcnt_mask_locked, "Locked bitwise refcount") +{ + const os_ref_count_t b = 12; + os_ref_atomic_t rc; + os_ref_count_t reserved = 0xaaa; + os_ref_init_count_mask(&rc, NULL, 1, reserved, b); + + os_ref_retain_locked_mask(&rc, NULL, b); + os_ref_retain_locked_mask(&rc, NULL, b); + T_ASSERT_EQ_UINT(os_ref_get_count_mask(&rc, b), 3, "retain increased count"); + + os_ref_count_t x = os_ref_release_locked_mask(&rc, NULL, b); + T_ASSERT_EQ_UINT(os_ref_get_count_mask(&rc, b), 2, "release decreased count"); + T_ASSERT_EQ_UINT(x, 2, "release returned correct count"); + T_ASSERT_EQ_UINT(rc & ((1U << b) - 1), reserved, "Reserved bits not modified"); + + (void)os_ref_release_locked_mask(&rc, NULL, b); + x = os_ref_release_locked_mask(&rc, NULL, b); + T_ASSERT_EQ_UINT(os_ref_get_count_mask(&rc, b), 0, "released"); + T_ASSERT_EQ_UINT(x, 0, "returned released"); + T_ASSERT_EQ_UINT(rc & ((1U << b) - 1), reserved, "Reserved bits not modified"); +} + +static void +do_bitwise_test(const os_ref_count_t bits) +{ + os_ref_atomic_t rc; + os_ref_count_t reserved = 0xaaaaaaaaU & ((1U << bits) - 1); + os_ref_init_count_mask(&rc, NULL, 1, reserved, bits); + + T_ASSERT_EQ_UINT(os_ref_get_count_mask(&rc, bits), 1, "[%u bits] refcount initialized", bits); + + os_ref_retain_mask(&rc, NULL, bits); + os_ref_retain_mask(&rc, NULL, bits); + T_ASSERT_EQ_UINT(os_ref_get_count_mask(&rc, bits), 3, "retain increased count"); + + os_ref_count_t x = os_ref_release_mask(&rc, NULL, bits); + T_ASSERT_EQ_UINT(x, 2, "release returned correct count"); + + os_ref_release_live_mask(&rc, NULL, bits); + T_ASSERT_EQ_UINT(os_ref_get_count_mask(&rc, bits), 1, "release_live decreased count"); + + x = os_ref_release_mask(&rc, NULL, bits); + T_ASSERT_EQ_UINT(os_ref_get_count_mask(&rc, bits), 0, "released"); + T_ASSERT_EQ_UINT(x, 0, "returned released"); + + T_ASSERT_EQ_UINT(rc & ((1U << bits) - 1), reserved, "Reserved bits not modified"); + + os_ref_init_count_mask(&rc, NULL, 1, reserved, bits); + x = os_ref_retain_try_mask(&rc, NULL, bits); + T_ASSERT_GT_INT(x, 0, "try retained"); + + (void)os_ref_release_mask(&rc, NULL, bits); + (void)os_ref_release_mask(&rc, NULL, bits); + T_QUIET; T_ASSERT_EQ_UINT(os_ref_get_count_mask(&rc, bits), 0, "release"); + + x = os_ref_retain_try_mask(&rc, NULL, bits); + T_ASSERT_EQ_INT(x, 0, "try failed"); + + T_ASSERT_EQ_UINT(rc & ((1U << bits) - 1), reserved, "Reserved bits not modified"); +} + +T_DECL(refcnt_bitwise, "Bitwise refcount") +{ + do_bitwise_test(0); + do_bitwise_test(1); + do_bitwise_test(8); + do_bitwise_test(26); + + os_ref_atomic_t rc = 0xaaaaaaaa; + + const os_ref_count_t nbits = 3; + const os_ref_count_t count = 5; + const os_ref_count_t bits = 7; + os_ref_init_count_mask(&rc, NULL, count, bits, nbits); + + os_ref_count_t mask = (1U << nbits) - 1; + T_ASSERT_EQ_UINT(rc & mask, bits, "bits correctly initialized"); + T_ASSERT_EQ_UINT(rc >> nbits, count, "count correctly initialized"); +} + +os_refgrp_decl(static, g1, "test group", NULL); +os_refgrp_decl_extern(g1); + +T_DECL(refcnt_groups, "Group accounting") +{ +#if OS_REFCNT_DEBUG + ref_debug_enable = true; + + struct os_refcnt rc; + os_ref_init(&rc, &g1); + + T_ASSERT_EQ_UINT(g1.grp_children, 1, "group attached"); + T_ASSERT_EQ_UINT(global_ref_group.grp_children, 1, "global group attached"); + T_ASSERT_EQ_UINT(g1.grp_count, 1, "group count"); + T_ASSERT_EQ_ULLONG(g1.grp_retain_total, 1ULL, "group retains"); + T_ASSERT_EQ_ULLONG(g1.grp_release_total, 0ULL, "group releases"); + + os_ref_retain(&rc); + os_ref_retain(&rc); + os_ref_release_live(&rc); + os_ref_release_live(&rc); + + T_EXPECT_EQ_ULLONG(g1.grp_retain_total, 3ULL, "group retains"); + T_EXPECT_EQ_ULLONG(g1.grp_release_total, 2ULL, "group releases"); + + os_ref_count_t x = os_ref_release(&rc); + T_QUIET; T_ASSERT_EQ_UINT(x, 0, "released"); + + T_ASSERT_EQ_UINT(g1.grp_children, 0, "group detatched"); + T_ASSERT_EQ_UINT(g1.grp_count, 0, "group count"); +#else + T_SKIP("Refcount debugging disabled"); +#endif +} + +enum { + OSREF_UNDERFLOW = 1, + OSREF_OVERFLOW = 2, + OSREF_RESURRECTION = 3, + OSREF_DEALLOC_LIVE = 4, +}; + +static jmp_buf jb; +static bool expect_panic = false; + +void +handle_panic(const char *func, char *__unused str, ...) +{ + int ret = -1; + if (!expect_panic) { + T_FAIL("unexpected panic from %s", func); + T_LOG("corrupt program state, aborting"); + abort(); + } + expect_panic = false; + + if (strcmp(func, "os_ref_panic_underflow") == 0) { + ret = OSREF_UNDERFLOW; + } else if (strcmp(func, "os_ref_panic_overflow") == 0) { + ret = OSREF_OVERFLOW; + } else if (strcmp(func, "os_ref_panic_resurrection") == 0) { + ret = OSREF_RESURRECTION; + } else if (strcmp(func, "os_ref_panic_live") == 0) { + ret = OSREF_DEALLOC_LIVE; + } else { + T_LOG("unexpected panic from %s", func); + } + + longjmp(jb, ret); +} + +T_DECL(refcnt_underflow, "Underflow") +{ + os_ref_atomic_t rc; + os_ref_init_raw(&rc, NULL); + (void)os_ref_release_raw(&rc, NULL); + + int x = setjmp(jb); + if (x == 0) { + expect_panic = true; + (void)os_ref_release_raw(&rc, NULL); + T_FAIL("underflow not caught"); + } else { + T_ASSERT_EQ_INT(x, OSREF_UNDERFLOW, "underflow caught"); + } +} + +T_DECL(refcnt_overflow, "Overflow") +{ + os_ref_atomic_t rc; + os_ref_init_count_raw(&rc, NULL, 0x0fffffffU); + + int x = setjmp(jb); + if (x == 0) { + expect_panic = true; + (void)os_ref_retain_raw(&rc, NULL); + T_FAIL("overflow not caught"); + } else { + T_ASSERT_EQ_INT(x, OSREF_OVERFLOW, "overflow caught"); + } +} + +T_DECL(refcnt_resurrection, "Resurrection") +{ + os_ref_atomic_t rc; + os_ref_init_raw(&rc, NULL); + os_ref_count_t n = os_ref_release_raw(&rc, NULL); + + T_QUIET; T_EXPECT_EQ_UINT(n, 0, "reference not released"); + + int x = setjmp(jb); + if (x == 0) { + expect_panic = true; + (void)os_ref_retain_raw(&rc, NULL); + T_FAIL("resurrection not caught"); + } else { + T_ASSERT_EQ_INT(x, OSREF_RESURRECTION, "resurrection caught"); + } +} + +T_DECL(refcnt_dealloc_live, "Dealloc expected live object") +{ + os_ref_atomic_t rc; + os_ref_init_raw(&rc, NULL); + + expect_panic = true; + int x = setjmp(jb); + if (x == 0) { + expect_panic = true; + os_ref_release_live_raw(&rc, NULL); + T_FAIL("dealloc live not caught"); + } else { + T_ASSERT_EQ_INT(x, OSREF_DEALLOC_LIVE, "dealloc live caught"); + } +} + +T_DECL(refcnt_initializer, "Static intializers") +{ + struct os_refcnt rc = OS_REF_INITIALIZER; + os_ref_atomic_t rca = OS_REF_ATOMIC_INITIALIZER; + + T_ASSERT_EQ_INT(0, os_ref_retain_try(&rc), NULL); + T_ASSERT_EQ_INT(0, os_ref_get_count_raw(&rca), NULL); +} + +#if STRESS_TESTS + +static const unsigned long iters = 1024 * 1024 * 32; + +static void * +func(void *_rc) +{ + struct os_refcnt *rc = _rc; + for (unsigned long i = 0; i < iters; i++) { + os_ref_retain(rc); + os_ref_release_live(rc); + } + return NULL; +} + +T_DECL(refcnt_stress, "Stress test") +{ + pthread_t th1, th2; + + struct os_refcnt rc; + os_ref_init(&rc, NULL); + + T_ASSERT_POSIX_ZERO(pthread_create(&th1, NULL, func, &rc), "pthread_create"); + T_ASSERT_POSIX_ZERO(pthread_create(&th2, NULL, func, &rc), "pthread_create"); + + void *r1, *r2; + T_ASSERT_POSIX_ZERO(pthread_join(th1, &r1), "pthread_join"); + T_ASSERT_POSIX_ZERO(pthread_join(th2, &r2), "pthread_join"); + + os_ref_count_t x = os_ref_release(&rc); + T_ASSERT_EQ_INT(x, 0, "Consistent refcount"); +} + +#endif diff --git a/tests/os_unaligned.c b/tests/os_unaligned.c new file mode 100644 index 000000000..311ecbb73 --- /dev/null +++ b/tests/os_unaligned.c @@ -0,0 +1,36 @@ +#include +#include +#include + +#include "../libkern/os/ptrtools.h" + +#define CHECK_ALIGNMENT(T) \ +{ \ + T *__p; \ + T_QUIET; T_EXPECT_EQ_ULONG(__alignof__(*__p), sizeof(*__p), #T " native alignment"); \ + T_ASSERT_EQ_ULONG(__alignof__(os_unaligned_deref(__p)), 1UL, #T " alignment"); \ +} + +T_GLOBAL_META(T_META_RUN_CONCURRENTLY(true)); + +struct A { + int a; +}; + +T_DECL(os_unaligned, "Unaligned pointer access") +{ + int x = 0x914842; + int *p = &x; + + T_ASSERT_EQ_INT(os_unaligned_deref(p), x, "load"); + os_unaligned_deref(&x) = INT_MIN; + T_ASSERT_EQ_INT(x, INT_MIN, "store"); + + CHECK_ALIGNMENT(unsigned); + CHECK_ALIGNMENT(long long); + CHECK_ALIGNMENT(uintptr_t); + CHECK_ALIGNMENT(int16_t); + CHECK_ALIGNMENT(uint64_t); + CHECK_ALIGNMENT(struct A); + CHECK_ALIGNMENT(void *); +} diff --git a/tests/osptr.cpp b/tests/osptr.cpp new file mode 100644 index 000000000..054b8693c --- /dev/null +++ b/tests/osptr.cpp @@ -0,0 +1,772 @@ +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wc++11-extensions" + +#include +#include +#include +#include +#include + +#if 0 +# define OSPTR_LOG T_LOG +#elif 0 +# define OSPTR_LOG printf +#else +# define OSPTR_LOG(x...) do { } while(0) +#endif + +T_GLOBAL_META( + T_META_NAMESPACE("osptr"), + T_META_CHECK_LEAKS(false), + T_META_RUN_CONCURRENTLY(true) + ); + +static int num_instances = 0; +static int num_retains = 0; +static int num_releases = 0; + +class OSMetaClassBase +{ + static int id_counter; + static OSMetaClassBase *freelist; + +public: + int inst_id; + mutable int refcount; + mutable OSMetaClassBase *next; + static void *type_id; + + OSMetaClassBase() : refcount(1), next(nullptr) + { + inst_id = id_counter++; + num_instances++; + OSPTR_LOG("[%p, %d] constructed\n", this, inst_id); + } + + virtual ~OSMetaClassBase() + { + OSPTR_LOG("[%p, %d] destroyed\n", this, inst_id); + } + + virtual void + retain() const + { + T_QUIET; T_EXPECT_GT_INT(refcount, 0, "Instance resurrected"); + refcount++; + num_retains++; + OSPTR_LOG("[%p, %d] retain, refcount=%d\n", this, inst_id, refcount); + } + + virtual void + release() const + { + T_QUIET; T_EXPECT_GT_INT(refcount, 0, "Double free"); + refcount--; + num_releases++; + OSPTR_LOG("[%p, %d] release, refcount=%d\n", this, inst_id, refcount); + + /* + * Don't delete the object, but keep it around so that we + * can detect double frees + */ + if (refcount == 0) { + num_instances--; + this->next = freelist; + freelist = const_cast(this); + } + } + + virtual void + taggedRetain(void *tag) const + { + OSPTR_LOG("tag[%p] ", tag); + retain(); + } + + virtual void + taggedRelease(void *tag) const + { + OSPTR_LOG("tag[%p] ", tag); + release(); + } +}; + +int OSMetaClassBase::id_counter; +OSMetaClassBase *OSMetaClassBase::freelist; + +void *OSMetaClassBase::type_id; + +#define OSTypeID(T) T::type_id +#define OSTypeAlloc(T) new T +#define OSDynamicCast(T, p) dynamic_cast(p) + +#define LIBKERN_SMART_POINTERS +#include + +class Base : public OSMetaClassBase { +public: + Base() : OSMetaClassBase() + { + } +}; + +class Derived : public Base { +public: + Derived() : Base() + { + } +}; + +class Other : public OSMetaClassBase { +public: + Other() : OSMetaClassBase() + { + } +}; + +typedef OSPtr BasePtr; +typedef OSPtr DerivedPtr; +typedef OSPtr OtherPtr; + +static void +default_constructor() +{ + BasePtr a; + T_ASSERT_NULL(a.get(), "Default NULL construction"); + T_ASSERT_EQ_INT(num_instances, 0, "No instances created"); +} + +static void +null_constructor() +{ + BasePtr a(nullptr); + T_ASSERT_NULL(a.get(), "Default NULL construction"); + T_ASSERT_EQ_INT(num_instances, 0, "No instances created"); +} + +static void +raw_constructor() +{ + Base *a = new Base(); + T_ASSERT_EQ_INT(num_instances, 1, "Created instance"); + + { + BasePtr p(a); + + T_ASSERT_EQ_INT(num_instances, 1, "No new instance"); + T_ASSERT_EQ_PTR(p.get(), a, "osptr bound to correct object"); + T_ASSERT_EQ_INT(a->refcount, 2, "Object refcount incremented"); + } + + T_ASSERT_EQ_INT(a->refcount, 1, "Object refcount decremented"); + a->release(); + T_ASSERT_EQ_INT(num_instances, 0, "All instances released"); +} + +static void +alloc() +{ + BasePtr a = BasePtr::alloc(); + + T_ASSERT_NOTNULL(a.get(), "osptr seated"); + T_ASSERT_EQ_INT(num_instances, 1, "Instance created"); + T_ASSERT_EQ_INT(a->refcount, 1, "Reference created"); +} + +static void +destroy() +{ + { + BasePtr a = BasePtr::alloc(); + T_ASSERT_EQ_INT(num_instances, 1, "Instance created"); + } + + T_ASSERT_EQ_INT(num_instances, 0, "All instances released"); +} + +static void +copy() +{ + BasePtr a = BasePtr::alloc(); + BasePtr b; + int a_id = a->inst_id; + + BasePtr a_copy(a); + + T_ASSERT_EQ_INT(a_copy->inst_id, a_id, NULL); + T_ASSERT_EQ_INT(a->refcount, 2, NULL); + T_ASSERT_EQ_INT(a_copy->refcount, 2, NULL); + T_ASSERT_EQ_INT(num_instances, 1, NULL); + T_EXPECT_EQ_INT(num_retains, 1, NULL); + + BasePtr b_copy(b); + T_ASSERT_NULL(b_copy.get(), "Copy null osptr"); + + T_ASSERT_EQ_INT(num_instances, 1, NULL); + T_EXPECT_EQ_INT(num_retains, 1, NULL); + + BasePtr a_copy2 = a; + T_ASSERT_EQ_PTR(a_copy2.get(), a.get(), NULL); + + T_ASSERT_EQ_INT(num_instances, 1, NULL); + T_EXPECT_EQ_INT(num_retains, 2, NULL); + T_EXPECT_EQ_INT(num_releases, 0, NULL); +} + +static void +copy_subclass() +{ + auto a = DerivedPtr::alloc(); + BasePtr b(a); + + T_ASSERT_EQ_PTR(a.get(), b.get(), NULL); + T_ASSERT_EQ_INT(b->refcount, 2, NULL); + T_ASSERT_EQ_INT(num_instances, 1, NULL); + + a = nullptr; + T_ASSERT_NOTNULL(b.get(), NULL); + T_ASSERT_EQ_INT(b->refcount, 1, NULL); + T_ASSERT_EQ_INT(num_instances, 1, NULL); +} + +static void +assign() +{ + int a_id, b_id; + + BasePtr p; + BasePtr a = BasePtr::alloc(); + BasePtr b = BasePtr::alloc(); + + a_id = a->inst_id; + b_id = b->inst_id; + + p = a; + + T_ASSERT_EQ_PTR(p.get(), a.get(), "Assigned osptr references same object"); + T_ASSERT_EQ_INT(p->inst_id, a_id, NULL); + T_ASSERT_EQ_INT(a->refcount, 2, "Assigned osptr bumps refcount"); + T_QUIET; T_ASSERT_TRUE(b->refcount == 1, NULL); + + p = b; + + T_ASSERT_EQ_PTR(p.get(), b.get(), "Assigned osptr references same object"); + T_ASSERT_EQ_INT(p->inst_id, b_id, NULL); + T_ASSERT_EQ_INT(a->refcount, 1, "Previous assignee drops reference"); + T_ASSERT_EQ_INT(b->refcount, 2, "New assignee bumps reference"); + + T_ASSERT_EQ_INT(a->inst_id, a_id, NULL); + T_ASSERT_EQ_INT(b->inst_id, b_id, NULL); + + a = nullptr; + + T_ASSERT_EQ_INT(num_instances, 1, "Assignment to null releases object"); + + b = nullptr; + p = nullptr; + + T_ASSERT_EQ_INT(num_instances, 0, "All instances released"); +} + +static void +assign_raw() +{ + Base *a1 = new Base(); + Base *a2 = new Base(); + + { + BasePtr p; + + p = a1; + T_ASSERT_EQ_PTR(p.get(), a1, NULL); + T_ASSERT_EQ_INT(a1->refcount, 2, NULL); + T_ASSERT_EQ_INT(a2->refcount, 1, NULL); + + p = a2; + T_ASSERT_EQ_PTR(p.get(), a2, NULL); + T_ASSERT_EQ_INT(a1->refcount, 1, NULL); + T_ASSERT_EQ_INT(a2->refcount, 2, NULL); + } + + T_ASSERT_EQ_INT(a1->refcount, 1, NULL); + T_ASSERT_EQ_INT(a2->refcount, 1, NULL); + + a1->release(); + a2->release(); + + T_ASSERT_EQ_INT(num_instances, 0, "All instances released"); +} + +static void +assign_null() +{ + BasePtr a = BasePtr::alloc(); + T_ASSERT_EQ_INT(num_instances, 1, NULL); + + a = nullptr; + + T_ASSERT_NULL(a.get(), NULL); + T_ASSERT_EQ_INT(num_instances, 0, "No instances created"); + + a = BasePtr::alloc(); + BasePtr b(a.get()); + + T_ASSERT_EQ_INT(a->refcount, 2, NULL); + T_ASSERT_EQ_INT(num_instances, 1, NULL); + + b = nullptr; + + T_ASSERT_EQ_INT(a->refcount, 1, NULL); + T_ASSERT_EQ_INT(num_instances, 1, NULL); + + a = nullptr; + + T_ASSERT_EQ_INT(num_instances, 0, "All instances released"); +} + +static void +assign_subclass() +{ + int a_id, b_id; + + OSPtr base; + BasePtr a = BasePtr::alloc(); + BasePtr b = BasePtr::alloc(); + + a_id = a->inst_id; + b_id = b->inst_id; + + base = a; + + T_ASSERT_TRUE(base.get() == static_cast(a.get()), NULL); + T_ASSERT_TRUE(base->inst_id == a_id, NULL); + T_ASSERT_TRUE(a->refcount == 2, NULL); + T_ASSERT_TRUE(b->refcount == 1, NULL); + + base = b; + + T_ASSERT_TRUE(base.get() == static_cast(b.get()), NULL); + T_ASSERT_TRUE(base->inst_id == b_id, NULL); + T_ASSERT_TRUE(a->refcount == 1, NULL); + T_ASSERT_TRUE(b->refcount == 2, NULL); + + T_ASSERT_TRUE(a->inst_id == a_id, NULL); + T_ASSERT_TRUE(b->inst_id == b_id, NULL); + + a = nullptr; + + T_ASSERT_TRUE(num_instances == 1, NULL); + + b = nullptr; + base = nullptr; + + T_ASSERT_EQ_INT(num_instances, 0, "All instances released"); +} + +static void +assign_compatible() +{ + OSPtr a = OSPtr::alloc(); + OSPtr b = a; + T_ASSERT_EQ_PTR(a.get(), b.get(), NULL); + + OSPtr c = OSPtr::alloc(); + OSPtr d = c; + T_ASSERT_EQ_PTR(c.get(), d.get(), NULL); +} + +static void +move() +{ + OSPtr a = OSPtr::alloc(); + int a_id = a->inst_id; + + OSPtr b(os::move(a)); + + T_ASSERT_TRUE(a.get() == NULL, NULL); + T_ASSERT_TRUE(b->inst_id == a_id, NULL); + T_ASSERT_TRUE(b->refcount == 1, NULL); + T_ASSERT_TRUE(num_instances == 1, NULL); + T_EXPECT_EQ_INT(num_retains, 0, NULL); +} + +static void +move_assign() +{ + OSPtr a = OSPtr::alloc(); + OSPtr b = OSPtr::alloc(); + int a_id = a->inst_id; + int b_id = b->inst_id; + + OSPtr d; + + d = os::move(a); + + T_ASSERT_TRUE(a.get() == NULL, NULL); + T_ASSERT_TRUE(d->inst_id == a_id, NULL); + T_ASSERT_TRUE(d->refcount == 1, NULL); + T_ASSERT_TRUE(num_instances == 2, NULL); + + d = os::move(b); + T_ASSERT_TRUE(a.get() == NULL, NULL); + T_ASSERT_TRUE(b.get() == NULL, NULL); + T_ASSERT_TRUE(d->inst_id == b_id, NULL); + T_ASSERT_TRUE(d->refcount == 1, NULL); + T_ASSERT_TRUE(num_instances == 1, NULL); + T_EXPECT_EQ_INT(num_retains, 0, NULL); +} + +static void +move_assign_null() +{ + BasePtr a = BasePtr::alloc(); + BasePtr b = a; + + T_EXPECT_EQ_INT(num_retains, 1, NULL); + + a = os::move(nullptr); + + T_ASSERT_TRUE(a.get() == NULL, NULL); + T_ASSERT_TRUE(b->refcount == 1, NULL); + + b = os::move(nullptr); + + T_ASSERT_EQ_INT(num_instances, 0, "All instances released"); + T_EXPECT_EQ_INT(num_retains, 1, NULL); +} + +static void +move_assign_raw() +{ + BasePtr a = BasePtr::alloc(); + Base *b = new Base; + Base *tmp = b; + + T_ASSERT_EQ_INT(num_instances, 2, NULL); + + a = os::move(tmp); + + T_ASSERT_EQ_INT(num_instances, 1, NULL); + T_ASSERT_NULL(tmp, NULL); + T_ASSERT_EQ_PTR(a.get(), b, NULL); + T_ASSERT_EQ_INT(a->refcount, 2, NULL); + b->release(); + T_ASSERT_EQ_INT(a->refcount, 1, NULL); +} + +static void +move_assign_subclass() +{ + auto a = DerivedPtr::alloc(); + BasePtr b; + + b = os::move(a); + + T_ASSERT_NULL(a.get(), NULL); + T_ASSERT_NOTNULL(b.get(), NULL); + T_ASSERT_EQ_INT(b->refcount, 1, NULL); + T_ASSERT_EQ_INT(num_instances, 1, NULL); +} + +static void +move_assign_self() +{ + OSPtr a = OSPtr::alloc(); + int a_id = a->inst_id; + + a = os::move(a); + + T_ASSERT_NOTNULL(a.get(), "osptr seated"); + T_ASSERT_TRUE(a->inst_id == a_id, NULL); + T_ASSERT_TRUE(a->refcount == 1, NULL); + T_ASSERT_TRUE(num_instances == 1, NULL); + T_EXPECT_EQ_INT(num_retains, 0, NULL); +} + +static void +test_const_cast() +{ + OSPtr a = OSPtr::alloc(); + + OSPtr b; + + b = a.const_pointer_cast(); + + T_ASSERT_TRUE(a.get() == b.get(), NULL); + T_ASSERT_TRUE(a->refcount == 2, NULL); + T_ASSERT_TRUE(b->refcount == 2, NULL); + + T_ASSERT_TRUE(num_instances == 1, NULL); + T_EXPECT_EQ_INT(num_retains, 1, NULL); +} + +static void +const_cast_move() +{ + OSPtr a = OSPtr::alloc(); + int a_id = a->inst_id; + + OSPtr b; + + b = os::move(a).const_pointer_cast(); + + T_ASSERT_TRUE(a.get() == NULL, NULL); + T_ASSERT_TRUE(b->inst_id == a_id, NULL); + T_ASSERT_TRUE(b->refcount == 1, NULL); + + T_ASSERT_TRUE(num_instances == 1, NULL); + T_EXPECT_EQ_INT(num_retains, 0, NULL); +} + +static void +const_cast_move_self() +{ + BasePtr a = BasePtr::alloc(); + int a_id = a->inst_id; + + a = os::move(a).const_pointer_cast(); + + T_ASSERT_NOTNULL(a.get(), "osptr seated"); + T_ASSERT_TRUE(a->inst_id == a_id, NULL); + T_ASSERT_TRUE(a->refcount == 1, NULL); + T_ASSERT_TRUE(num_instances == 1, NULL); + T_ASSERT_TRUE(num_retains == 0, NULL); +} + +static void +test_static_cast() +{ + DerivedPtr a = DerivedPtr::alloc(); + + BasePtr b; + + b = a.static_pointer_cast(); + + T_ASSERT_TRUE(a.get() == b.get(), NULL); + T_ASSERT_TRUE(a->refcount == 2, NULL); + T_ASSERT_TRUE(b->refcount == 2, NULL); + + T_ASSERT_TRUE(num_instances == 1, NULL); + T_EXPECT_TRUE(num_retains == 1, NULL); +} + +static void +static_cast_move() +{ + DerivedPtr a = DerivedPtr::alloc(); + int a_id = a->inst_id; + + BasePtr b; + + b = os::move(a).static_pointer_cast(); + + T_ASSERT_NULL(a.get(), NULL); + T_ASSERT_EQ_INT(b->inst_id, a_id, NULL); + T_ASSERT_EQ_INT(b->refcount, 1, NULL); + + T_ASSERT_EQ_INT(num_instances, 1, NULL); + T_EXPECT_EQ_INT(num_retains, 0, NULL); +} + +static void +static_cast_move_self() +{ + BasePtr a = BasePtr::alloc(); + int a_id = a->inst_id; + + a = os::move(a).static_pointer_cast(); + + T_ASSERT_NOTNULL(a.get(), "osptr seated"); + T_ASSERT_TRUE(a->inst_id == a_id, NULL); + T_ASSERT_TRUE(a->refcount == 1, NULL); + T_ASSERT_TRUE(num_instances == 1, NULL); + T_ASSERT_TRUE(num_retains == 0, NULL); +} + +static void +tagged_ptr() +{ + OSTaggedPtr a; + auto b = OSTaggedPtr::alloc(); + + T_ASSERT_NULL(a.get(), NULL); + T_ASSERT_NOTNULL(b.get(), NULL); + + T_ASSERT_TRUE(typeid(a.get()) == typeid(Base *), NULL); + T_ASSERT_TRUE(typeid(b.get()) == typeid(Derived *), NULL); +} + +static void +attach() +{ + Base *a = new Base(); + BasePtr b; + b.attach(os::move(a)); + + T_ASSERT_NULL(a, NULL); + T_ASSERT_NOTNULL(b.get(), NULL); + T_ASSERT_EQ_INT(b->refcount, 1, NULL); + T_ASSERT_EQ_INT(num_instances, 1, NULL); + T_ASSERT_EQ_INT(num_retains, 0, NULL); + + b.attach(new Base); + T_ASSERT_NOTNULL(b.get(), NULL); + T_ASSERT_EQ_INT(b->refcount, 1, NULL); + T_ASSERT_EQ_INT(num_instances, 1, NULL); + T_ASSERT_EQ_INT(num_retains, 0, NULL); + T_ASSERT_EQ_INT(num_releases, 1, NULL); +} + +static void +detach() +{ + BasePtr a = BasePtr::alloc(); + Base *p = a.detach(); + + T_ASSERT_NULL(a.get(), NULL); + T_ASSERT_NOTNULL(p, NULL); + T_ASSERT_EQ_INT(p->refcount, 1, NULL); + T_ASSERT_EQ_INT(num_instances, 1, NULL); + T_ASSERT_EQ_INT(num_retains, 0, NULL); + T_ASSERT_EQ_INT(num_releases, 0, NULL); + + BasePtr b(os::move(p), os::no_retain); // re-seat so that 'p' gets freed +} + +static void +foreign() +{ + auto a = OSPtr::alloc(); + auto b = OSTaggedPtr::alloc(); + + void *a_ptr = a.get(); + void *b_ptr = b.get(); + + a.swap(b); + + T_ASSERT_EQ_PTR(b.get(), a_ptr, NULL); + T_ASSERT_EQ_PTR(a.get(), b_ptr, NULL); + T_ASSERT_EQ_INT(a->refcount, 1, NULL); + T_ASSERT_EQ_INT(b->refcount, 1, NULL); + T_ASSERT_EQ_INT(num_instances, 2, NULL); + T_ASSERT_GE_INT(num_retains, 2, NULL); +} + +static void +test_dynamic_cast() +{ + auto a = DerivedPtr::alloc(); + T_ASSERT_NOTNULL(a.get(), NULL); + BasePtr b = a; + + auto c = b.dynamic_pointer_cast(); + T_ASSERT_NOTNULL(c.get(), NULL); + + T_ASSERT_EQ_INT(c->refcount, 3, NULL); + T_ASSERT_EQ_INT(num_instances, 1, NULL); + + auto d = OtherPtr::alloc(); + auto e = d.dynamic_pointer_cast(); + auto f = OSDynamicCastPtr(OtherPtr::alloc()); + + T_ASSERT_NULL(e.get(), NULL); + T_ASSERT_NULL(f.get(), NULL); + + T_ASSERT_EQ_INT(num_instances, 2, NULL); + T_ASSERT_EQ_INT(d->refcount, 1, NULL); + + auto g = OSDynamicCastPtr(DerivedPtr::alloc()); + T_ASSERT_EQ_INT(num_instances, 3, NULL); + T_ASSERT_EQ_INT(g->refcount, 1, NULL); +} + +#define OSPTR_TEST_DECL(name) \ + T_DECL(name, #name) { \ + num_instances = 0; \ + num_retains = 0; \ + num_releases = 0; \ + name(); \ + T_QUIET; T_ASSERT_EQ_INT(num_instances, 0, "Instance leak"); \ + } + +OSPTR_TEST_DECL(default_constructor) +OSPTR_TEST_DECL(null_constructor) +OSPTR_TEST_DECL(raw_constructor) +OSPTR_TEST_DECL(alloc) +OSPTR_TEST_DECL(destroy) +OSPTR_TEST_DECL(copy) +OSPTR_TEST_DECL(copy_subclass) +OSPTR_TEST_DECL(assign) +OSPTR_TEST_DECL(assign_raw) +OSPTR_TEST_DECL(assign_null) +OSPTR_TEST_DECL(assign_subclass) +OSPTR_TEST_DECL(assign_compatible) +OSPTR_TEST_DECL(move) +OSPTR_TEST_DECL(move_assign) +OSPTR_TEST_DECL(move_assign_null) +OSPTR_TEST_DECL(move_assign_raw) +OSPTR_TEST_DECL(move_assign_subclass) +OSPTR_TEST_DECL(move_assign_self) +OSPTR_TEST_DECL(test_const_cast) +OSPTR_TEST_DECL(const_cast_move) +OSPTR_TEST_DECL(const_cast_move_self) +OSPTR_TEST_DECL(test_static_cast) +OSPTR_TEST_DECL(static_cast_move) +OSPTR_TEST_DECL(static_cast_move_self) +OSPTR_TEST_DECL(tagged_ptr) +OSPTR_TEST_DECL(attach) +OSPTR_TEST_DECL(detach) +OSPTR_TEST_DECL(foreign) +OSPTR_TEST_DECL(test_dynamic_cast) + + +/* + * Test that the "trivial_abi" attribute works as expected + */ + +struct Complex { + uintptr_t val; + Complex() : val(71) + { + } + ~Complex() + { + } +}; + +struct Trivial { + uintptr_t val; + Trivial() : val(42) + { + } + ~Trivial() + { + } +} __attribute__((trivial_abi)); + +/* defined in osptr_helper.cpp */ +__BEGIN_DECLS +extern uintptr_t pass_trivial(Trivial); +extern uintptr_t pass_complex(Complex); +__END_DECLS +Trivial return_trivial(uintptr_t); +Complex return_complex(uintptr_t); + +T_DECL(trivial_abi, "Test trivial_abi classes are passed by value") +{ + Trivial a; + uintptr_t x = pass_trivial(a); + T_EXPECT_EQ_ULONG(a.val, x, "Trivial class argument passed by-value"); + + Complex b; + uintptr_t y = pass_complex(b); + T_EXPECT_NE_ULONG(b.val, y, "Non-trivial class argument passed by-reference"); + + Trivial c = return_trivial(55); + T_EXPECT_EQ_ULONG(c.val, 55UL, "Trivial class returned by-value"); + + Complex d = return_complex(99); + T_EXPECT_NE_ULONG(d.val, 99UL, "Non-trivial class returned by-reference"); +} + +#pragma clang diagnostic pop diff --git a/tests/osptr_dumb.cpp b/tests/osptr_dumb.cpp new file mode 100644 index 000000000..8cb7e4f29 --- /dev/null +++ b/tests/osptr_dumb.cpp @@ -0,0 +1,80 @@ +#include +#include +#include +#include +#include + +#if 0 +# define OSPTR_LOG T_LOG +#elif 0 +# define OSPTR_LOG printf +#else +# define OSPTR_LOG(x...) do { } while(0) +#endif + +T_GLOBAL_META( + T_META_NAMESPACE("osptr"), + T_META_CHECK_LEAKS(false), + T_META_RUN_CONCURRENTLY(true) + ); + +class OSMetaClassBase +{ +public: + virtual void + retain() const + { + } + virtual void + release() const + { + } + virtual void + taggedRetain(void *tag) const + { + } + virtual void + taggedRelease(void *tag) const + { + } + + static void *type_id; +}; + +void *OSMetaClassBase::type_id; + +#define OSTypeAlloc(T) new T +#define OSTypeID(T) T::type_id + +#include + +class Base : public OSMetaClassBase { +public: + Base() : OSMetaClassBase() + { + } +}; + +class Derived : public Base { +public: + Derived() : Base() + { + } +}; + +typedef OSPtr BasePtr; +typedef OSPtr DerivedPtr; + +T_DECL(dumb_osptr, "Dumb OSPtrs work") +{ + BasePtr x = nullptr; + T_ASSERT_EQ_PTR(x, nullptr, NULL); + T_ASSERT_TRUE(typeid(BasePtr) == typeid(Base *), NULL); + T_ASSERT_TRUE(typeid(DerivedPtr) == typeid(Derived *), NULL); + + OSTaggedPtr y = nullptr; + OSTaggedPtr z = nullptr; + T_ASSERT_EQ_PTR(y, nullptr, NULL); + T_ASSERT_TRUE(typeid(y) == typeid(Base *), NULL); + T_ASSERT_TRUE(typeid(z) == typeid(Derived *), NULL); +} diff --git a/tests/osptr_helper.cpp b/tests/osptr_helper.cpp new file mode 100644 index 000000000..28eef3dce --- /dev/null +++ b/tests/osptr_helper.cpp @@ -0,0 +1,24 @@ +#include + +extern "C" { +uintptr_t +pass_trivial(uintptr_t x) +{ + return x; +} +uintptr_t +pass_complex(uintptr_t x) +{ + return x; +} +uintptr_t +_Z14return_trivialm(uintptr_t x) +{ + return x; +} +uintptr_t +_Z14return_complexm(uintptr_t x) +{ + return x; +} +} diff --git a/tests/perf_compressor.c b/tests/perf_compressor.c index 3e8aa68d8..e30acbb5d 100644 --- a/tests/perf_compressor.c +++ b/tests/perf_compressor.c @@ -1,6 +1,7 @@ #include #include #include +#include #include #include @@ -32,6 +33,8 @@ enum { X(DISPATCH_SOURCE_CREATE_FAILED) \ X(INITIAL_SIGNAL_TO_PARENT_FAILED) \ X(SIGNAL_TO_PARENT_FAILED) \ + X(MEMORYSTATUS_CONTROL_FAILED) \ + X(IS_FREEZABLE_NOT_AS_EXPECTED) \ X(EXIT_CODE_MAX) #define EXIT_CODES_ENUM(VAR) VAR, @@ -47,8 +50,9 @@ static const char *exit_codes_str[] = { #define SYSCTL_FREEZE_TO_MEMORY "kern.memorystatus_freeze_to_memory=1" static pid_t pid = -1; -static dt_stat_t r; -static dt_stat_time_t s; +static dt_stat_t ratio; +static dt_stat_time_t compr_time; +static dt_stat_time_t decompr_time; void allocate_zero_pages(char **buf, int num_pages, int vmpgsize); void allocate_mostly_zero_pages(char **buf, int num_pages, int vmpgsize); @@ -128,7 +132,7 @@ freeze_helper_process(void) T_QUIET; T_ASSERT_POSIX_SUCCESS(sysctlbyname("vm.compressor_input_bytes", &input_before, &length, NULL, 0), "failed to query vm.compressor_input_bytes"); - T_STAT_MEASURE(s) { + T_STAT_MEASURE(compr_time) { ret = sysctlbyname("kern.memorystatus_freeze", NULL, NULL, &pid, sizeof(pid)); errno_sysctl_freeze = errno; }; @@ -152,7 +156,7 @@ freeze_helper_process(void) T_END; } - dt_stat_add(r, (double)(input_after - input_before) / (double)(compressed_after - compressed_before)); + dt_stat_add(ratio, (double)(input_after - input_before) / (double)(compressed_after - compressed_before)); ret = sysctlbyname("kern.memorystatus_thaw", NULL, NULL, &pid, sizeof(pid)); T_QUIET; T_ASSERT_POSIX_SUCCESS(ret, "sysctl kern.memorystatus_thaw failed"); @@ -163,8 +167,6 @@ freeze_helper_process(void) void cleanup(void) { - int status = 0; - /* No helper process. */ if (pid == -1) { return; @@ -182,9 +184,10 @@ run_compressor_test(int size_mb, int page_type) char **launch_tool_args; char testpath[PATH_MAX]; uint32_t testpath_buf_size; - dispatch_source_t ds_freeze, ds_proc; + dispatch_source_t ds_freeze, ds_proc, ds_decompr; int freeze_enabled; size_t length; + __block bool decompr_latency_is_stable = false; length = sizeof(freeze_enabled); T_QUIET; T_ASSERT_POSIX_SUCCESS(sysctlbyname("vm.freeze_enabled", &freeze_enabled, &length, NULL, 0), @@ -196,24 +199,35 @@ run_compressor_test(int size_mb, int page_type) T_ATEND(cleanup); - r = dt_stat_create("(input bytes / compressed bytes)", "compression_ratio"); - s = dt_stat_time_create("compressor_latency"); + ratio = dt_stat_create("(input bytes / compressed bytes)", "compression_ratio"); + compr_time = dt_stat_time_create("compressor_latency"); + // This sets the A/B failure threshold at 50% of baseline for compressor_latency - dt_stat_set_variable(s, kPCFailureThresholdPctVar, 50.0); + dt_stat_set_variable((struct dt_stat *)compr_time, kPCFailureThresholdPctVar, 50.0); + + signal(SIGUSR2, SIG_IGN); + ds_decompr = dispatch_source_create(DISPATCH_SOURCE_TYPE_SIGNAL, SIGUSR2, 0, dispatch_get_main_queue()); + T_QUIET; T_ASSERT_NOTNULL(ds_decompr, "dispatch_source_create (ds_decompr)"); + + dispatch_source_set_event_handler(ds_decompr, ^{ + decompr_latency_is_stable = true; + }); + dispatch_activate(ds_decompr); signal(SIGUSR1, SIG_IGN); ds_freeze = dispatch_source_create(DISPATCH_SOURCE_TYPE_SIGNAL, SIGUSR1, 0, dispatch_get_main_queue()); T_QUIET; T_ASSERT_NOTNULL(ds_freeze, "dispatch_source_create (ds_freeze)"); dispatch_source_set_event_handler(ds_freeze, ^{ - if (!dt_stat_stable(s)) { + if (!(dt_stat_stable(compr_time) && decompr_latency_is_stable)) { freeze_helper_process(); } else { - dt_stat_finalize(s); - dt_stat_finalize(r); + dt_stat_finalize(compr_time); + dt_stat_finalize(ratio); kill(pid, SIGKILL); dispatch_source_cancel(ds_freeze); + dispatch_source_cancel(ds_decompr); } }); dispatch_activate(ds_freeze); @@ -266,7 +280,7 @@ run_compressor_test(int size_mb, int page_type) } T_HELPER_DECL(allocate_pages, "allocates pages to compress") { - int i, j, ret, size_mb, page_type, vmpgsize; + int i, j, ret, size_mb, page_type, vmpgsize, freezable_state; size_t vmpgsize_length; __block int num_pages; __block char **buf; @@ -312,6 +326,20 @@ T_HELPER_DECL(allocate_pages, "allocates pages to compress") { i = buf[j][0]; } + decompr_time = dt_stat_time_create("decompression_latency"); + + /* Opt in to freezing. */ + printf("[%d] Setting state to freezable\n", getpid()); + if (memorystatus_control(MEMORYSTATUS_CMD_SET_PROCESS_IS_FREEZABLE, getpid(), 1, NULL, 0) != KERN_SUCCESS) { + exit(MEMORYSTATUS_CONTROL_FAILED); + } + + /* Verify that the state has been set correctly */ + freezable_state = memorystatus_control(MEMORYSTATUS_CMD_GET_PROCESS_IS_FREEZABLE, getpid(), 0, NULL, 0); + if (freezable_state != 1) { + exit(IS_FREEZABLE_NOT_AS_EXPECTED); + } + dispatch_after(dispatch_time(DISPATCH_TIME_NOW, NSEC_PER_SEC), dispatch_get_main_queue(), ^{ /* Signal to the parent that we're done allocating and it's ok to freeze us */ printf("[%d] Sending initial signal to parent to begin freezing\n", getpid()); @@ -326,13 +354,33 @@ T_HELPER_DECL(allocate_pages, "allocates pages to compress") { exit(DISPATCH_SOURCE_CREATE_FAILED); } + __block bool collect_dt_stat_measurements = true; + dispatch_source_set_event_handler(ds_signal, ^{ volatile int tmp; + uint64_t decompr_start_time, decompr_end_time; + + decompr_start_time = mach_absolute_time(); /* Make sure all the pages are accessed before trying to freeze again */ for (int x = 0; x < num_pages; x++) { tmp = buf[x][0]; } + + decompr_end_time = mach_absolute_time(); + + if (collect_dt_stat_measurements) { + if (dt_stat_stable(decompr_time)) { + collect_dt_stat_measurements = false; + dt_stat_finalize(decompr_time); + if (kill(getppid(), SIGUSR2) != 0) { + exit(SIGNAL_TO_PARENT_FAILED); + } + } else { + dt_stat_mach_time_add(decompr_time, decompr_end_time - decompr_start_time); + } + } + if (kill(getppid(), SIGUSR1) != 0) { exit(SIGNAL_TO_PARENT_FAILED); } @@ -348,42 +396,49 @@ T_HELPER_DECL(allocate_pages, "allocates pages to compress") { #ifndef DT_IOSMARK T_DECL(compr_10MB_zero, "Compression latency for 10MB - zero pages", + T_META_ASROOT(true), T_META_SYSCTL_INT(SYSCTL_FREEZE_TO_MEMORY)) { run_compressor_test(10, ALL_ZEROS); } T_DECL(compr_10MB_mostly_zero, "Compression latency for 10MB - mostly zero pages", + T_META_ASROOT(true), T_META_SYSCTL_INT(SYSCTL_FREEZE_TO_MEMORY)) { run_compressor_test(10, MOSTLY_ZEROS); } T_DECL(compr_10MB_random, "Compression latency for 10MB - random pages", + T_META_ASROOT(true), T_META_SYSCTL_INT(SYSCTL_FREEZE_TO_MEMORY)) { run_compressor_test(10, RANDOM); } T_DECL(compr_10MB_typical, "Compression latency for 10MB - typical pages", + T_META_ASROOT(true), T_META_SYSCTL_INT(SYSCTL_FREEZE_TO_MEMORY)) { run_compressor_test(10, TYPICAL); } T_DECL(compr_100MB_zero, "Compression latency for 100MB - zero pages", + T_META_ASROOT(true), T_META_SYSCTL_INT(SYSCTL_FREEZE_TO_MEMORY)) { run_compressor_test(100, ALL_ZEROS); } T_DECL(compr_100MB_mostly_zero, "Compression latency for 100MB - mostly zero pages", + T_META_ASROOT(true), T_META_SYSCTL_INT(SYSCTL_FREEZE_TO_MEMORY)) { run_compressor_test(100, MOSTLY_ZEROS); } T_DECL(compr_100MB_random, "Compression latency for 100MB - random pages", + T_META_ASROOT(true), T_META_SYSCTL_INT(SYSCTL_FREEZE_TO_MEMORY)) { run_compressor_test(100, RANDOM); } @@ -391,6 +446,7 @@ T_DECL(compr_100MB_random, T_DECL(compr_100MB_typical, "Compression latency for 100MB - typical pages", + T_META_ASROOT(true), T_META_SYSCTL_INT(SYSCTL_FREEZE_TO_MEMORY)) { run_compressor_test(100, TYPICAL); } diff --git a/tests/perf_kdebug.c b/tests/perf_kdebug.c deleted file mode 100644 index d2861ba66..000000000 --- a/tests/perf_kdebug.c +++ /dev/null @@ -1,193 +0,0 @@ -#ifdef T_NAMESPACE -#undef T_NAMESPACE -#endif -#include - -#include -#include -#include -#include -#include - -T_GLOBAL_META( - T_META_NAMESPACE("xnu.perf.kdebug"), - T_META_ASROOT(true), - T_META_CHECK_LEAKS(false), - T_META_TAG_PERF - ); - -// -// Helper functions for direct control over the kernel trace facility. -// - -static void -_sysctl_reset() -{ - int mib[] = { CTL_KERN, KERN_KDEBUG, KERN_KDREMOVE }; - if (sysctl(mib, 3, NULL, NULL, NULL, 0)) { - T_FAIL("KERN_KDREMOVE sysctl failed"); - } -} - -static void -_sysctl_setbuf(uint32_t capacity) -{ - int mib[] = { CTL_KERN, KERN_KDEBUG, KERN_KDSETBUF, (int)capacity }; - if (sysctl(mib, 4, NULL, NULL, NULL, 0)) { - T_FAIL("KERN_KDSETBUF sysctl failed"); - } -} - -static void -_sysctl_setup() -{ - int mib[] = { CTL_KERN, KERN_KDEBUG, KERN_KDSETUP }; - if (sysctl(mib, 3, NULL, NULL, NULL, 0)) { - T_FAIL("KERN_KDSETUP sysctl failed"); - } -} - -static void -_sysctl_enable(int value) -{ - int mib[] = { CTL_KERN, KERN_KDEBUG, KERN_KDENABLE, value }; - if (sysctl(mib, 4, NULL, NULL, NULL, 0) < 0) { - T_FAIL("KERN_KDENABLE sysctl failed"); - } -} - -static void -_sysctl_enable_typefilter(uint8_t* type_filter_bitmap) -{ - int mib[] = { CTL_KERN, KERN_KDEBUG, KERN_KDSET_TYPEFILTER }; - size_t needed = KDBG_TYPEFILTER_BITMAP_SIZE; - if (sysctl(mib, 3, type_filter_bitmap, &needed, NULL, 0)) { - T_FAIL("KERN_KDSET_TYPEFILTER sysctl failed"); - } -} - -static void -_sysctl_nowrap(bool is_nowrap) -{ - int mib[] = { CTL_KERN, KERN_KDEBUG, is_nowrap ? KERN_KDEFLAGS : KERN_KDDFLAGS, KDBG_NOWRAP }; - if (sysctl(mib, 4, NULL, NULL, NULL, 0)) { - T_FAIL("KDBG_NOWRAP sysctl failed"); - } -} - -static void -enable_tracing(bool value) -{ - _sysctl_enable(value ? KDEBUG_ENABLE_TRACE : 0); -} - -static void -enable_typefilter_all_reject() -{ - uint8_t type_filter_bitmap[KDBG_TYPEFILTER_BITMAP_SIZE]; - memset(type_filter_bitmap, 0, sizeof(type_filter_bitmap)); - _sysctl_enable_typefilter(type_filter_bitmap); -} - -static void -enable_typefilter_all_pass() -{ - uint8_t type_filter_bitmap[KDBG_TYPEFILTER_BITMAP_SIZE]; - memset(type_filter_bitmap, 0xff, sizeof(type_filter_bitmap)); - _sysctl_enable_typefilter(type_filter_bitmap); -} - -static void -loop_kdebug_trace(dt_stat_time_t s) -{ - do { - dt_stat_token start = dt_stat_time_begin(s); - for (uint32_t i = 0; i < 100; i++) { - kdebug_trace(0x97000000 | DBG_FUNC_NONE, i, i, i, i); - kdebug_trace(0x97000000 | DBG_FUNC_NONE, i, i, i, i); - kdebug_trace(0x97000000 | DBG_FUNC_NONE, i, i, i, i); - kdebug_trace(0x97000000 | DBG_FUNC_NONE, i, i, i, i); - kdebug_trace(0x97000000 | DBG_FUNC_NONE, i, i, i, i); - kdebug_trace(0x97000000 | DBG_FUNC_NONE, i, i, i, i); - kdebug_trace(0x97000000 | DBG_FUNC_NONE, i, i, i, i); - kdebug_trace(0x97000000 | DBG_FUNC_NONE, i, i, i, i); - kdebug_trace(0x97000000 | DBG_FUNC_NONE, i, i, i, i); - kdebug_trace(0x97000000 | DBG_FUNC_NONE, i, i, i, i); - } - dt_stat_time_end_batch(s, 1000, start); - } while (!dt_stat_stable(s)); -} - -static void -loop_getppid(dt_stat_time_t s) -{ - do { - dt_stat_token start = dt_stat_time_begin(s); - for (uint32_t i = 0; i < 100; i++) { - getppid(); - getppid(); - getppid(); - getppid(); - getppid(); - getppid(); - getppid(); - getppid(); - getppid(); - getppid(); - } - dt_stat_time_end_batch(s, 1000, start); - } while (!dt_stat_stable(s)); -} - -static void -reset_kdebug_trace(void) -{ - _sysctl_reset(); -} - -static void -test(const char* test_name, void (^pretest_setup)(void), void (*test)(dt_stat_time_t s)) -{ - T_ATEND(reset_kdebug_trace); - _sysctl_reset(); - _sysctl_setbuf(1000000); - _sysctl_nowrap(false); - _sysctl_setup(); - - pretest_setup(); - - dt_stat_time_t s = dt_stat_time_create("%s", test_name); - - test(s); - - dt_stat_finalize(s); -} - -// -// Begin tests... -// - -T_DECL(kdebug_trace_baseline_syscall, - "Test the latency of a syscall while kernel tracing is disabled") { - test("kdebug_trace_baseline_syscall", ^{ enable_tracing(false); }, loop_getppid); -} - -T_DECL(kdebug_trace_kdbg_disabled, - "Test the latency of kdebug_trace while kernel tracing is disabled") { - test("kdebug_trace_kdbg_disabled", ^{ enable_tracing(false); }, loop_kdebug_trace); -} - -T_DECL(kdebug_trace_kdbg_enabled, - "Test the latency of kdebug_trace while kernel tracing is enabled with no typefilter") { - test("kdebug_trace_kdbg_enabled", ^{ enable_tracing(true); }, loop_kdebug_trace); -} - -T_DECL(kdebug_trace_kdbg_enabled_typefilter_pass, - "Test the latency of kdebug_trace while kernel tracing is enabled with a typefilter that passes the event") { - test("kdebug_trace_kdbg_enabled_typefilter_pass", ^{ enable_tracing(true); enable_typefilter_all_pass(); }, loop_kdebug_trace); -} - -T_DECL(kdebug_trace_kdbg_enabled_typefilter_reject, - "Test the latency of kdebug_trace while kernel tracing is enabled with a typefilter that rejects the event") { - test("kdebug_trace_kdbg_enabled_typefilter_reject", ^{ enable_tracing(true); enable_typefilter_all_reject(); }, loop_kdebug_trace); -} diff --git a/tests/perf_vmfault.c b/tests/perf_vmfault.c index 384d35862..d0f64ab0a 100644 --- a/tests/perf_vmfault.c +++ b/tests/perf_vmfault.c @@ -55,10 +55,13 @@ static memregion_config *memregion_config_per_thread; static size_t pgsize; static int num_threads; static int ready_thread_count; +static int finished_thread_count; static dt_stat_time_t runtime; static pthread_cond_t start_cvar; static pthread_cond_t threads_ready_cvar; +static pthread_cond_t threads_finished_cvar; static pthread_mutex_t ready_thread_count_lock; +static pthread_mutex_t finished_thread_count_lock; static void map_mem_regions_default(int fault_type, size_t memsize); static void map_mem_regions_single(int fault_type, size_t memsize); @@ -275,6 +278,15 @@ thread_setup(void *arg) T_QUIET; T_ASSERT_POSIX_SUCCESS(pthread_mutex_unlock(&ready_thread_count_lock), "pthread_mutex_unlock"); fault_pages(my_index); + + /* Up the finished count */ + T_QUIET; T_ASSERT_POSIX_SUCCESS(pthread_mutex_lock(&finished_thread_count_lock), "pthread_mutex_lock"); + finished_thread_count++; + if (finished_thread_count == num_threads) { + /* All the threads are done. Wake up the main thread */ + T_QUIET; T_ASSERT_POSIX_SUCCESS(pthread_cond_signal(&threads_finished_cvar), "pthread_cond_signal"); + } + T_QUIET; T_ASSERT_POSIX_SUCCESS(pthread_mutex_unlock(&finished_thread_count_lock), "pthread_mutex_unlock"); return NULL; } @@ -289,7 +301,10 @@ execute_threads(void) T_QUIET; T_ASSERT_POSIX_SUCCESS(pthread_cond_init(&threads_ready_cvar, NULL), "pthread_cond_init"); T_QUIET; T_ASSERT_POSIX_SUCCESS(pthread_cond_init(&start_cvar, NULL), "pthread_cond_init"); T_QUIET; T_ASSERT_POSIX_SUCCESS(pthread_mutex_init(&ready_thread_count_lock, NULL), "pthread_mutex_init"); + T_QUIET; T_ASSERT_POSIX_SUCCESS(pthread_cond_init(&threads_finished_cvar, NULL), "pthread_cond_init"); + T_QUIET; T_ASSERT_POSIX_SUCCESS(pthread_mutex_init(&finished_thread_count_lock, NULL), "pthread_mutex_init"); ready_thread_count = 0; + finished_thread_count = 0; threads = (pthread_t *)malloc(sizeof(*threads) * (size_t)num_threads); thread_indices = (int *)malloc(sizeof(*thread_indices) * (size_t)num_threads); @@ -300,20 +315,28 @@ execute_threads(void) } T_QUIET; T_ASSERT_POSIX_SUCCESS(pthread_mutex_lock(&ready_thread_count_lock), "pthread_mutex_lock"); - if (ready_thread_count != num_threads) { + while (ready_thread_count != num_threads) { T_QUIET; T_ASSERT_POSIX_SUCCESS(pthread_cond_wait(&threads_ready_cvar, &ready_thread_count_lock), "pthread_cond_wait"); } T_QUIET; T_ASSERT_POSIX_SUCCESS(pthread_mutex_unlock(&ready_thread_count_lock), "pthread_mutex_unlock"); T_STAT_MEASURE(runtime) { + /* Ungate the threads */ T_QUIET; T_ASSERT_POSIX_SUCCESS(pthread_cond_broadcast(&start_cvar), "pthread_cond_broadcast"); - for (thread_index = 0; thread_index < num_threads; thread_index++) { - T_QUIET; T_ASSERT_POSIX_SUCCESS(pthread_join(threads[thread_index], &thread_retval_ptr), - "pthread_join"); + /* Wait for the threads to finish */ + T_QUIET; T_ASSERT_POSIX_SUCCESS(pthread_mutex_lock(&finished_thread_count_lock), "pthread_mutex_lock"); + while (finished_thread_count != num_threads) { + T_QUIET; T_ASSERT_POSIX_SUCCESS(pthread_cond_wait(&threads_finished_cvar, &finished_thread_count_lock), "pthread_cond_wait"); } }; + /* Join the threads */ + for (thread_index = 0; thread_index < num_threads; thread_index++) { + T_QUIET; T_ASSERT_POSIX_SUCCESS(pthread_join(threads[thread_index], &thread_retval_ptr), + "pthread_join"); + } + free(threads); free(thread_indices); } @@ -344,8 +367,6 @@ run_test(int fault_type, int mapping_variant, size_t memsize) snprintf(metric_str, 32, "Runtime-%s", variant_str[mapping_variant]); runtime = dt_stat_time_create(metric_str); - // This sets the A/B failure threshold at 50% of baseline for Runtime - dt_stat_set_variable((dt_stat_t)runtime, kPCFailureThresholdPctVar, 50.0); while (!dt_stat_stable(runtime)) { map_mem_regions(fault_type, mapping_variant, memsize); execute_threads(); @@ -418,6 +439,9 @@ T_DECL(read_soft_fault_multithreaded, nthreads = (int)strtol(e, NULL, 0); } else { nthreads = get_ncpu(); + if (nthreads == 1) { + T_SKIP("Skipping multi-threaded test on single core device."); + } } setup_and_run_test(SOFT_FAULT, nthreads); } @@ -439,6 +463,9 @@ T_DECL(zero_fill_fault_multithreaded, nthreads = (int)strtol(e, NULL, 0); } else { nthreads = get_ncpu(); + if (nthreads == 1) { + T_SKIP("Skipping multi-threaded test on single core device."); + } } setup_and_run_test(ZERO_FILL, nthreads); } diff --git a/tests/phys_footprint_interval_max.c b/tests/phys_footprint_interval_max.c index 10a64fbe5..84f45dce0 100644 --- a/tests/phys_footprint_interval_max.c +++ b/tests/phys_footprint_interval_max.c @@ -33,6 +33,8 @@ #include #include +T_GLOBAL_META(T_META_RUN_CONCURRENTLY(true)); + #define ALLOC_SIZE_LARGE 5*1024*1024 #define ALLOC_SIZE_SMALL 2*1024*1024 diff --git a/tests/pipe_drain.c b/tests/pipe_drain.c new file mode 100644 index 000000000..4808e0ea3 --- /dev/null +++ b/tests/pipe_drain.c @@ -0,0 +1,91 @@ +/* + * Copyright (c) 2019 Apple Computer, Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +static void +signal_handler(int sig, siginfo_t *sip __unused, void *ucontext __unused) +{ + if (sig == SIGPIPE) { + T_FAIL("Received SIGPIPE"); + } + + exit(141); +} + +static void * +thread_read(void *arg) +{ + int fd = (int) (uintptr_t)arg; + char buf[10]; + + read(fd, buf, 10); + T_LOG("thread returned from read"); + return 0; +} + +T_DECL(pipe_drain, + "test a pipe with multiple read descriptor could close one descriptor and drain that descriptor") +{ + int pipe_fd[2]; + int dup_fd; + int ret; + char buf[10] = "Hello"; + pthread_t thread; + + /* Install the signal handler for SIGPIPE */ + + struct sigaction sa = { + .sa_sigaction = signal_handler, + .sa_flags = SA_SIGINFO + }; + sigfillset(&sa.sa_mask); + + T_QUIET; T_ASSERT_POSIX_ZERO(sigaction(SIGPIPE, &sa, NULL), NULL); + + ret = pipe(pipe_fd); + T_EXPECT_EQ(ret, 0, NULL); + + dup_fd = dup(pipe_fd[0]); + T_EXPECT_GE(dup_fd, 0, NULL); + + pthread_create(&thread, NULL, thread_read, (void *) (uintptr_t) pipe_fd[0]); + + sleep(5); + + close(pipe_fd[0]); + ret = (int)write(pipe_fd[1], buf, strlen(buf) + 1); + T_EXPECT_EQ(ret, (int)strlen(buf) + 1, NULL); +} diff --git a/tests/pipe_kevent.c b/tests/pipe_kevent.c new file mode 100644 index 000000000..8a02261b2 --- /dev/null +++ b/tests/pipe_kevent.c @@ -0,0 +1,79 @@ +/* + * Copyright (c) 2019 Apple Computer, Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +T_DECL(pipe_noblock_kevent, + "Set a pipe and no block and setup EVFLT_WRITE kevent on it and make sure it does not fire when the pipe is full") +{ + int fd[2], write_fd; + dispatch_queue_t dq1 = dispatch_queue_create("com.apple.test.pipe_noblock_kevent.queue", DISPATCH_QUEUE_SERIAL); + + pipe(fd); + write_fd = fd[1]; + __block int iter = 1; + + /* Make sure the pipe is No block */ + fcntl(write_fd, F_SETFL, (O_NONBLOCK)); + + dispatch_source_t write_source = dispatch_source_create(DISPATCH_SOURCE_TYPE_WRITE, (uintptr_t)write_fd, 0, dq1); + dispatch_source_set_event_handler(write_source, ^{ + unsigned long length = dispatch_source_get_data(write_source); + + T_LOG("Iteration: %d, Length available: %lu\n", iter++, length); + + char buf[512] = "deadbeef"; + ssize_t rv = write(write_fd, buf, 512); + T_EXPECT_POSIX_SUCCESS(rv, "write success"); + if (rv < 0) { + T_FAIL("Write should have succeeded but failed with error %ld", rv); + T_END; + } + }); + + dispatch_resume(write_source); + + T_LOG("Arming a timer for 15 seconds to exit, assuming kevent will block before that"); + dispatch_after(dispatch_time(DISPATCH_TIME_NOW, 15 * NSEC_PER_SEC), dispatch_get_main_queue(), ^{ + T_LOG("PASS: Kevent blocked as expected in the EVFLT_WRITE"); + T_END; + }); + + dispatch_main(); +} diff --git a/tests/poll.c b/tests/poll.c index 49e4be65f..c72cf5db8 100644 --- a/tests/poll.c +++ b/tests/poll.c @@ -10,7 +10,8 @@ #include #include -T_GLOBAL_META(T_META_NAMESPACE("xnu.poll")); +T_GLOBAL_META(T_META_NAMESPACE("xnu.poll"), + T_META_RUN_CONCURRENTLY(true)); #define SLEEP_TIME_SECS 1 #define POLL_TIMEOUT_MS 1800 @@ -26,7 +27,7 @@ T_DECL(sleep_with_no_fds, "poll() called with no fds provided should act like sleep") { uint64_t begin_time, sleep_time, poll_time; - struct pollfd pfd = { 0 }; + struct pollfd pfd = { .fd = 0, .events = 0, .revents = 0 }; begin_time = mach_absolute_time(); sleep(SLEEP_TIME_SECS); diff --git a/tests/port_descriptions.c b/tests/port_descriptions.c index 3f1f96ea3..55d3c12b1 100644 --- a/tests/port_descriptions.c +++ b/tests/port_descriptions.c @@ -28,11 +28,13 @@ #include #include +T_GLOBAL_META(T_META_RUN_CONCURRENTLY(true)); + static void -expect_special_port_description(const char *(*fn)(mach_port_t), +expect_special_port_description(const char *(*fn)(int), mach_port_t port, const char *namestr) { - const char *desc = fn(port); + const char *desc = fn((int)port); T_EXPECT_NOTNULL(desc, "%s is %s", namestr, desc); if (desc) { T_QUIET; T_EXPECT_GT(strlen(desc), strlen(""), @@ -72,10 +74,12 @@ T_DECL(host_special_port_descriptions, TEST_HSP(HOST_RESOURCE_NOTIFY_PORT); TEST_HSP(HOST_CLOSURED_PORT); TEST_HSP(HOST_SYSPOLICYD_PORT); + TEST_HSP(HOST_FILECOORDINATIOND_PORT); + TEST_HSP(HOST_FAIRPLAYD_PORT); #undef TEST_HSP - T_EXPECT_EQ(HOST_SYSPOLICYD_PORT, HOST_MAX_SPECIAL_PORT, + T_EXPECT_EQ(HOST_FAIRPLAYD_PORT, HOST_MAX_SPECIAL_PORT, "checked all of the ports"); const char *invalid_hsp = @@ -151,6 +155,7 @@ T_DECL(host_special_port_mapping, TEST_HSP(HOST_RESOURCE_NOTIFY_PORT); TEST_HSP(HOST_CLOSURED_PORT); TEST_HSP(HOST_SYSPOLICYD_PORT); + TEST_HSP(HOST_FILECOORDINATIOND_PORT); #undef TEST_HSP diff --git a/tests/posix_spawn_file_actions.c b/tests/posix_spawn_file_actions.c new file mode 100644 index 000000000..2093069b2 --- /dev/null +++ b/tests/posix_spawn_file_actions.c @@ -0,0 +1,156 @@ +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +T_GLOBAL_META(T_META_RUN_CONCURRENTLY(true)); + +/* TEST_PATH needs to be something that exists, but is not the cwd */ +#define TEST_PATH "/System/Library/Caches" + +T_DECL(posix_spawn_file_actions_addchdir_np, "Check posix_spawn_file_actions_addchdir_np", + T_META_ASROOT(true)) +{ + posix_spawn_file_actions_t file_actions; + int ret; + + ret = posix_spawn_file_actions_init(&file_actions); + T_QUIET; + T_ASSERT_POSIX_SUCCESS(ret, "posix_spawn_file_actions_init"); + + ret = posix_spawn_file_actions_addchdir_np(&file_actions, TEST_PATH); + T_ASSERT_POSIX_SUCCESS(ret, "posix_spawn_file_actions_addchdir_np"); + + char * const prog = "/bin/sh"; + char * const argv_child[] = { prog, + "-c", + "test $(pwd) = \"" TEST_PATH "\"", + NULL, }; + pid_t child_pid; + extern char **environ; + + ret = posix_spawn(&child_pid, prog, &file_actions, NULL, argv_child, environ); + T_ASSERT_POSIX_SUCCESS(ret, "posix_spawn"); + + T_LOG("parent: spawned child with pid %d\n", child_pid); + + ret = posix_spawn_file_actions_destroy(&file_actions); + T_QUIET; + T_ASSERT_POSIX_SUCCESS(ret, "posix_spawn_file_actions_destroy"); + + T_LOG("parent: waiting for child process\n"); + + int status = 0; + int waitpid_result = waitpid(child_pid, &status, 0); + T_ASSERT_POSIX_SUCCESS(waitpid_result, "waitpid"); + T_ASSERT_EQ(waitpid_result, child_pid, "waitpid should return child we spawned"); + T_ASSERT_EQ(WIFEXITED(status), 1, "child should have exited normally"); + T_ASSERT_EQ(WEXITSTATUS(status), EX_OK, "child should have exited with success"); +} + +T_DECL(posix_spawn_file_actions_addchdir_np_errors, "Check posix_spawn_file_actions_addchdir_np errors", + T_META_ASROOT(true)) +{ + char longpath[PATH_MAX + 1]; + posix_spawn_file_actions_t file_actions; + int ret; + + memset(longpath, 'a', PATH_MAX); + longpath[PATH_MAX] = '\0'; + + ret = posix_spawn_file_actions_init(&file_actions); + T_QUIET; + T_ASSERT_POSIX_SUCCESS(ret, "posix_spawn_file_actions_init"); + + ret = posix_spawn_file_actions_addchdir_np(NULL, "/"); + T_ASSERT_EQ(ret, EINVAL, "NULL *file_actions returns EINVAL"); + + ret = posix_spawn_file_actions_addchdir_np(&file_actions, longpath); + T_ASSERT_EQ(ret, ENAMETOOLONG, "Path longer than PATH_MAX returns ENAMETOOLONG"); + + ret = posix_spawn_file_actions_destroy(&file_actions); + T_QUIET; + T_ASSERT_POSIX_SUCCESS(ret, "posix_spawn_file_actions_destroy"); +} + +T_DECL(posix_spawn_file_actions_addfchdir_np, "Check posix_spawn_file_actions_addfchdir_np", + T_META_ASROOT(true)) +{ + posix_spawn_file_actions_t file_actions; + int ret; + int test_fd; + + ret = posix_spawn_file_actions_init(&file_actions); + T_QUIET; + T_ASSERT_POSIX_SUCCESS(ret, "posix_spawn_file_actions_init"); + + test_fd = open(TEST_PATH, O_RDONLY | O_CLOEXEC); + T_ASSERT_POSIX_SUCCESS(test_fd, "open " TEST_PATH); + + ret = posix_spawn_file_actions_addfchdir_np(&file_actions, test_fd); + T_ASSERT_POSIX_SUCCESS(ret, "posix_spawn_file_actions_addfchdir_np"); + + char * const prog = "/bin/sh"; + char * const argv_child[] = { prog, + "-c", + "test $(pwd) = \"" TEST_PATH "\"", + NULL, }; + pid_t child_pid; + extern char **environ; + + ret = posix_spawn(&child_pid, prog, &file_actions, NULL, argv_child, environ); + T_ASSERT_POSIX_SUCCESS(ret, "posix_spawn"); + + T_LOG("parent: spawned child with pid %d\n", child_pid); + + ret = posix_spawn_file_actions_destroy(&file_actions); + T_QUIET; + T_ASSERT_POSIX_SUCCESS(ret, "posix_spawn_file_actions_destroy"); + + T_LOG("parent: waiting for child process\n"); + + int status = 0; + int waitpid_result = waitpid(child_pid, &status, 0); + T_ASSERT_POSIX_SUCCESS(waitpid_result, "waitpid"); + T_ASSERT_EQ(waitpid_result, child_pid, "waitpid should return child we spawned"); + T_ASSERT_EQ(WIFEXITED(status), 1, "child should have exited normally"); + T_ASSERT_EQ(WEXITSTATUS(status), EX_OK, "child should have exited with success"); + + ret = close(test_fd); + T_QUIET; + T_ASSERT_POSIX_SUCCESS(ret, "close test fd"); +} + +T_DECL(posix_spawn_file_actions_addfchdir_np_errors, "Check posix_spawn_file_actions_addfchdir_np errors", + T_META_ASROOT(true)) +{ + posix_spawn_file_actions_t file_actions; + int ret; + + ret = posix_spawn_file_actions_init(&file_actions); + T_QUIET; + T_ASSERT_POSIX_SUCCESS(ret, "posix_spawn_file_actions_init"); + + ret = posix_spawn_file_actions_addfchdir_np(NULL, 0); + T_ASSERT_EQ(ret, EINVAL, "NULL *file_actions returns EINVAL"); + + ret = posix_spawn_file_actions_addfchdir_np(&file_actions, -1); + T_ASSERT_EQ(ret, EBADF, "-1 file descriptor returns EBADF"); + + ret = posix_spawn_file_actions_destroy(&file_actions); + T_QUIET; + T_ASSERT_POSIX_SUCCESS(ret, "posix_spawn_file_actions_destroy"); +} diff --git a/tests/posix_spawn_file_actions_add_fileportdup2_np.c b/tests/posix_spawn_file_actions_add_fileportdup2_np.c new file mode 100644 index 000000000..e1c8710bb --- /dev/null +++ b/tests/posix_spawn_file_actions_add_fileportdup2_np.c @@ -0,0 +1,74 @@ +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +T_GLOBAL_META(T_META_RUN_CONCURRENTLY(true)); + +T_DECL(posix_spawn_file_actions_add_fileportdup2_np, + "Check posix_spawnattr for posix_spawn_file_actions_add_fileportdup2_np", + T_META_ASROOT(true)) +{ + posix_spawnattr_t attr; + posix_spawn_file_actions_t fact; + int ret, pipes[2]; + mach_port_t mp; + + ret = pipe(pipes); + T_QUIET; T_ASSERT_POSIX_SUCCESS(ret, "pipe"); + + ret = fileport_makeport(pipes[1], &mp); + T_QUIET; T_ASSERT_POSIX_SUCCESS(ret, "fileport_makefd"); + + ret = posix_spawnattr_init(&attr); + T_QUIET; T_ASSERT_POSIX_SUCCESS(ret, "posix_spawnattr_init"); + + ret = posix_spawn_file_actions_init(&fact); + T_QUIET; T_ASSERT_POSIX_SUCCESS(ret, "posix_spawn_file_actions_init"); + + ret = posix_spawn_file_actions_add_fileportdup2_np(&fact, mp, STDOUT_FILENO); + T_ASSERT_POSIX_SUCCESS(ret, "posix_spawn_file_actions_add_fileportdup2_np"); + + char * const prog = "/bin/echo"; + char * const argv_child[] = { prog, "1", NULL }; + pid_t child_pid; + extern char **environ; + + ret = posix_spawn(&child_pid, prog, &fact, &attr, argv_child, environ); + T_ASSERT_POSIX_SUCCESS(ret, "posix_spawn"); + + ret = posix_spawn_file_actions_destroy(&fact); + T_QUIET; T_ASSERT_POSIX_SUCCESS(ret, "posix_spawn_file_actions_destroy"); + + ret = posix_spawnattr_destroy(&attr); + T_QUIET; T_ASSERT_POSIX_SUCCESS(ret, "posix_spawnattr_destroy"); + + T_LOG("parent: spawned child with pid %d\n", child_pid); + + int status = 0; + int waitpid_result = waitpid(child_pid, &status, 0); + T_ASSERT_POSIX_SUCCESS(waitpid_result, "waitpid"); + T_ASSERT_EQ(waitpid_result, child_pid, "waitpid should return child we spawned"); + T_ASSERT_EQ(WIFEXITED(status), 1, "child should have exited normally"); + T_ASSERT_EQ(WEXITSTATUS(status), EX_OK, "child should have exited with success"); + + char buf[1]; + ssize_t rc = read(pipes[0], buf, sizeof(buf)); + T_ASSERT_POSIX_SUCCESS(rc, "read"); + T_ASSERT_EQ(rc, 1l, "should have read one byte"); + T_ASSERT_EQ(buf[0], '1', "should have read '1'"); +} diff --git a/tests/posix_spawn_posix_cred.c b/tests/posix_spawn_posix_cred.c new file mode 100644 index 000000000..c80062292 --- /dev/null +++ b/tests/posix_spawn_posix_cred.c @@ -0,0 +1,91 @@ +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +T_GLOBAL_META(T_META_RUN_CONCURRENTLY(true)); + +T_DECL(posix_spawn_posix_cred, "Check posix_spawnattr for POSIX creds", + T_META_ASROOT(true)) +{ + posix_spawnattr_t attr; + int ret; + + ret = posix_spawnattr_init(&attr); + T_QUIET; + T_ASSERT_POSIX_SUCCESS(ret, "posix_spawnattr_init"); + + ret = posix_spawnattr_setflags(&attr, POSIX_SPAWN_START_SUSPENDED); + T_QUIET; + T_ASSERT_POSIX_SUCCESS(ret, "posix_spawnattr_setflags"); + + ret = posix_spawnattr_setflags(&attr, POSIX_SPAWN_SETSID); + T_ASSERT_POSIX_SUCCESS(ret, "posix_spawnattr_setflags(POSIX_SPAWN_SETSID)"); + + ret = posix_spawnattr_set_uid_np(&attr, 502); + T_ASSERT_POSIX_SUCCESS(ret, "posix_spawnattr_set_uid_np"); + + ret = posix_spawnattr_set_gid_np(&attr, 501); + T_ASSERT_POSIX_SUCCESS(ret, "posix_spawnattr_set_gid_np"); + + gid_t groups[3] = { 501, 250, 299 }; + ret = posix_spawnattr_set_groups_np(&attr, 3, &groups, KAUTH_UID_NONE); + T_ASSERT_POSIX_SUCCESS(ret, "posix_spawnattr_set_groups_np"); + + ret = posix_spawnattr_set_login_np(&attr, "fake-name"); + T_ASSERT_POSIX_SUCCESS(ret, "posix_spawnattr_set_login_np"); + + char * const prog = "/bin/sh"; + char * const argv_child[] = { prog, + "-c", + "test $(logname) = \"fake-name\" -a \"$(id -G)\" = \"501 250 299\"", + NULL, }; + pid_t child_pid; + extern char **environ; + + ret = posix_spawn(&child_pid, prog, NULL, &attr, argv_child, environ); + T_ASSERT_POSIX_SUCCESS(ret, "posix_spawn"); + + T_LOG("parent: spawned child with pid %d\n", child_pid); + + ret = posix_spawnattr_destroy(&attr); + T_QUIET; + T_ASSERT_POSIX_SUCCESS(ret, "posix_spawnattr_destroy"); + + struct proc_bsdinfo info; + + ret = proc_pidinfo(child_pid, PROC_PIDTBSDINFO, 1, &info, sizeof(info)); + T_QUIET; + T_ASSERT_EQ(ret, (int)sizeof(info), "proc_pidinfo(PROC_PIDTBSDINFO)"); + + T_EXPECT_TRUE((bool)(info.pbi_flags & PROC_FLAG_SLEADER), + "check setsid happened"); + T_EXPECT_EQ(info.pbi_uid, 502, "UID was set"); + T_EXPECT_EQ(info.pbi_gid, 501, "GID was set"); + + ret = kill(child_pid, SIGCONT); + T_ASSERT_POSIX_SUCCESS(ret, "kill(signal)"); + + T_LOG("parent: waiting for child process\n"); + + int status = 0; + int waitpid_result = waitpid(child_pid, &status, 0); + T_ASSERT_POSIX_SUCCESS(waitpid_result, "waitpid"); + T_ASSERT_EQ(waitpid_result, child_pid, "waitpid should return child we spawned"); + T_ASSERT_EQ(WIFEXITED(status), 1, "child should have exited normally"); + T_ASSERT_EQ(WEXITSTATUS(status), EX_OK, "child should have exited with success"); +} diff --git a/tests/prioritize_process_launch.c b/tests/prioritize_process_launch.c new file mode 100644 index 000000000..8f7ed11ef --- /dev/null +++ b/tests/prioritize_process_launch.c @@ -0,0 +1,838 @@ +/* + * prioritize process launch: Tests prioritized process launch across posix spawn and exec. + */ + +#ifdef T_NAMESPACE +#undef T_NAMESPACE +#endif + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +T_GLOBAL_META(T_META_NAMESPACE("xnu.prioritize_process_launch"), + T_META_RUN_CONCURRENTLY(true)); + +#define HELPER_TIMEOUT_SECS (3000) +#define MACH_RCV_OPTIONS (MACH_RCV_MSG | MACH_RCV_LARGE | MACH_RCV_LARGE_IDENTITY | \ + MACH_RCV_TRAILER_ELEMENTS(MACH_RCV_TRAILER_CTX) | \ + MACH_RCV_TRAILER_TYPE(MACH_MSG_TRAILER_FORMAT_0)) + +static pthread_t +thread_create_at_qos(qos_class_t qos, void * (*function)(void *), void *arg); +static mach_port_t sr_port; + + +#pragma mark Mach receive + +static mach_voucher_t +create_pthpriority_voucher(mach_msg_priority_t qos) +{ + char voucher_buf[sizeof(mach_voucher_attr_recipe_data_t) + sizeof(ipc_pthread_priority_value_t)]; + + mach_voucher_t voucher = MACH_PORT_NULL; + kern_return_t ret; + ipc_pthread_priority_value_t ipc_pthread_priority_value = + (ipc_pthread_priority_value_t)qos; + + mach_voucher_attr_raw_recipe_array_t recipes; + mach_voucher_attr_raw_recipe_size_t recipe_size = 0; + mach_voucher_attr_recipe_t recipe = + (mach_voucher_attr_recipe_t)&voucher_buf[recipe_size]; + + recipe->key = MACH_VOUCHER_ATTR_KEY_PTHPRIORITY; + recipe->command = MACH_VOUCHER_ATTR_PTHPRIORITY_CREATE; + recipe->previous_voucher = MACH_VOUCHER_NULL; + memcpy((char *)&recipe->content[0], &ipc_pthread_priority_value, sizeof(ipc_pthread_priority_value)); + recipe->content_size = sizeof(ipc_pthread_priority_value_t); + recipe_size += sizeof(mach_voucher_attr_recipe_data_t) + recipe->content_size; + + recipes = (mach_voucher_attr_raw_recipe_array_t)&voucher_buf[0]; + + ret = host_create_mach_voucher(mach_host_self(), + recipes, + recipe_size, + &voucher); + + T_QUIET; T_ASSERT_MACH_SUCCESS(ret, "client host_create_mach_voucher"); + return voucher; +} + +static void +send( + mach_port_t send_port, + mach_port_t reply_port, + mach_port_t msg_port, + mach_msg_priority_t qos, + mach_msg_option_t options, + int send_disposition) +{ + kern_return_t ret = 0; + + struct { + mach_msg_header_t header; + mach_msg_body_t body; + mach_msg_port_descriptor_t port_descriptor; + } send_msg = { + .header = { + .msgh_remote_port = send_port, + .msgh_local_port = reply_port, + .msgh_bits = MACH_MSGH_BITS_SET(send_disposition, + reply_port ? MACH_MSG_TYPE_MAKE_SEND_ONCE : 0, + MACH_MSG_TYPE_MOVE_SEND, + MACH_MSGH_BITS_COMPLEX), + .msgh_id = 0x100, + .msgh_size = sizeof(send_msg), + }, + .body = { + .msgh_descriptor_count = 1, + }, + .port_descriptor = { + .name = msg_port, + .disposition = MACH_MSG_TYPE_MOVE_RECEIVE, + .type = MACH_MSG_PORT_DESCRIPTOR, + }, + }; + + if (options & MACH_SEND_SYNC_USE_THRPRI) { + send_msg.header.msgh_voucher_port = create_pthpriority_voucher(qos); + } + + if (msg_port == MACH_PORT_NULL) { + send_msg.body.msgh_descriptor_count = 0; + } + + ret = mach_msg(&(send_msg.header), + MACH_SEND_MSG | + MACH_SEND_TIMEOUT | + MACH_SEND_OVERRIDE | + ((reply_port ? MACH_SEND_SYNC_OVERRIDE : 0) | options), + send_msg.header.msgh_size, + 0, + MACH_PORT_NULL, + 10000, + 0); + + T_QUIET; T_ASSERT_MACH_SUCCESS(ret, "client mach_msg"); +} + +static void +receive( + mach_port_t rcv_port, + mach_port_t notify_port) +{ + kern_return_t ret = 0; + + struct { + mach_msg_header_t header; + mach_msg_body_t body; + mach_msg_port_descriptor_t port_descriptor; + mach_msg_trailer_t trailer; + } rcv_msg = { + .header = + { + .msgh_remote_port = MACH_PORT_NULL, + .msgh_local_port = rcv_port, + .msgh_size = sizeof(rcv_msg), + }, + }; + + T_LOG("Client: Starting sync receive\n"); + + ret = mach_msg(&(rcv_msg.header), + MACH_RCV_MSG | + MACH_RCV_SYNC_WAIT, + 0, + rcv_msg.header.msgh_size, + rcv_port, + 0, + notify_port); +} + +static int +get_pri(thread_t thread_port) +{ + kern_return_t kr; + + thread_extended_info_data_t extended_info; + mach_msg_type_number_t count = THREAD_EXTENDED_INFO_COUNT; + kr = thread_info(thread_port, THREAD_EXTENDED_INFO, + (thread_info_t)&extended_info, &count); + + T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "thread_info"); + + return extended_info.pth_curpri; +} + +static void +set_thread_name(const char *fn_name) +{ + char name[50] = ""; + + thread_t thread_port = pthread_mach_thread_np(pthread_self()); + + int pri = get_pri(thread_port); + + snprintf(name, sizeof(name), "%s at pri %2d", fn_name, pri); + pthread_setname_np(name); +} + +static void +thread_wait_to_block(mach_port_t thread_port) +{ + thread_extended_info_data_t extended_info; + kern_return_t kr; + + while (1) { + mach_msg_type_number_t count = THREAD_EXTENDED_INFO_COUNT; + kr = thread_info(thread_port, THREAD_EXTENDED_INFO, + (thread_info_t)&extended_info, &count); + + T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "thread_info"); + + if (extended_info.pth_run_state == TH_STATE_WAITING) { + T_LOG("Target thread blocked\n"); + break; + } + thread_switch(thread_port, SWITCH_OPTION_DEPRESS, 0); + } +} + +static void * +thread_sync_rcv(void *arg) +{ + mach_port_t port = (mach_port_t)arg; + mach_port_t special_reply_port; + + set_thread_name(__FUNCTION__); + special_reply_port = thread_get_special_reply_port(); + T_QUIET; T_ASSERT_TRUE(MACH_PORT_VALID(special_reply_port), "get_thread_special_reply_port"); + + sr_port = special_reply_port; + /* Do a sync rcv on special reply port and push on given arg port */ + receive(special_reply_port, port); + return NULL; +} + +static pthread_t +thread_create_at_qos(qos_class_t qos, void * (*function)(void *), void *arg) +{ + qos_class_t qos_thread; + pthread_t pthread; + pthread_attr_t attr; + int ret; + + ret = setpriority(PRIO_DARWIN_ROLE, 0, PRIO_DARWIN_ROLE_UI_FOCAL); + if (ret != 0) { + T_LOG("set priority failed\n"); + } + + pthread_attr_init(&attr); + pthread_attr_set_qos_class_np(&attr, qos, 0); + pthread_create(&pthread, &attr, function, arg); + + T_LOG("pthread created\n"); + pthread_get_qos_class_np(pthread, &qos_thread, NULL); + return pthread; +} + +static mach_port_t +get_sync_push_port_at_qos(qos_class_t qos) +{ + mach_port_t port; + kern_return_t kr; + pthread_t pthread; + thread_t thread; + + /* Create a rcv right to have a sync ipc push from a thread */ + kr = mach_port_allocate(mach_task_self(), + MACH_PORT_RIGHT_RECEIVE, + &port); + + T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "sync push port mach_port_allocate"); + + kr = mach_port_insert_right(mach_task_self(), + port, + port, + MACH_MSG_TYPE_MAKE_SEND); + + T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "sync push port mach_port_insert_right"); + + /* Create a thread at given qos and start a sync push on given port */ + pthread = thread_create_at_qos(qos, thread_sync_rcv, (void *)(uintptr_t)port); + thread = pthread_mach_thread_np(pthread); + thread_wait_to_block(thread); + + return port; +} + +static mach_port_t +create_port_and_copyin_a_port(mach_port_t port) +{ + mach_port_t new_port; + kern_return_t kr; + + /* Create a rcv right */ + kr = mach_port_allocate(mach_task_self(), + MACH_PORT_RIGHT_RECEIVE, + &new_port); + + T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "copyin mach_port_allocate"); + + kr = mach_port_insert_right(mach_task_self(), + new_port, + new_port, + MACH_MSG_TYPE_MAKE_SEND); + + T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "copyin mach_port_insert_right"); + + send(new_port, MACH_PORT_NULL, port, 0, 0, MACH_MSG_TYPE_COPY_SEND); + return new_port; +} + +static pid_t +posix_spawn_child_with_watch_ports( + char *binary, + char *arg, + mach_port_t *port_array, + int arrayCnt) +{ + pid_t child_pid = 0; + char *new_argv[] = { binary, arg, NULL}; + errno_t ret; + posix_spawnattr_t attr; + + ret = posix_spawnattr_init(&attr); + T_QUIET; T_ASSERT_POSIX_SUCCESS(ret, "posix_spawnattr_init"); + + ret = posix_spawnattr_set_importancewatch_port_np(&attr, arrayCnt, port_array); + T_QUIET; T_ASSERT_POSIX_SUCCESS(ret, "posix_spawnattr_set_importancewatch_port_np"); + + ret = posix_spawn(&child_pid, binary, NULL, &attr, new_argv, NULL); + T_QUIET; T_ASSERT_POSIX_SUCCESS(ret, "posix_spawn"); + + ret = posix_spawnattr_destroy(&attr); + T_QUIET; T_ASSERT_POSIX_SUCCESS(ret, "posix_spawnattr_destroy"); + + return child_pid; +} + +static void +worker_cb(pthread_priority_t __unused priority) +{ + T_FAIL("a worker thread was created"); +} + +static void +event_cb(void ** __unused events, int * __unused nevents) +{ + T_FAIL("a kevent routine was called instead of workloop"); +} + +static void +workloop_cb_test_intransit(uint64_t *workloop_id __unused, void **eventslist, int *events) +{ + pid_t pid; + int stat; + int priority; + mach_port_t port; + struct kevent_qos_s *kev = *eventslist; + mach_msg_header_t *hdr = (mach_msg_header_t *)kev->ext[0]; + port = hdr->msgh_local_port; + + T_LOG("Workloop handler workloop_cb_test_intransit called. "); + T_LOG("The total events returned is %d", *events); + + priority = get_pri(mach_thread_self()); + T_EXPECT_EQ(priority, 47, "Priority of servicer is %d", priority); + + pid = posix_spawn_child_with_watch_ports("prioritize_process_launch_helper", "WAIT", &port, 1); + + /* Make sure our priority has dropped */ + priority = get_pri(mach_thread_self()); + T_EXPECT_EQ(priority, 31, "Priority of servicer is %d", priority); + + sleep(2); + + /*enqueue the port to sever the temp onwer boost */ + create_port_and_copyin_a_port(port); + + waitpid(pid, &stat, 0); + + *events = 0; + + T_QUIET; T_LOG("The return stat is %d", WEXITSTATUS(stat)); + T_EXPECT_EQ(WEXITSTATUS(stat), 31, "Temp owner boost did not work correctly with knotes"); + T_END; +} + +static void +workloop_cb_test_knote_kill(uint64_t *workloop_id __unused, void **eventslist, int *events) +{ + pid_t pid; + int stat; + int priority; + mach_port_t port; + struct kevent_qos_s *kev = *eventslist; + mach_msg_header_t *hdr = (mach_msg_header_t *)kev->ext[0]; + port = hdr->msgh_local_port; + + T_LOG("Workloop handler workloop_cb_test_knote_kill called. "); + T_LOG("The total events returned is %d", *events); + + priority = get_pri(mach_thread_self()); + T_EXPECT_EQ(priority, 47, "Priority of servicer is %d", priority); + + pid = posix_spawn_child_with_watch_ports("prioritize_process_launch_helper", "EXIT", &port, 1); + + sleep(2); + + /* Make sure our priority is boosted again */ + priority = get_pri(mach_thread_self()); + T_EXPECT_EQ(priority, 47, "Priority of servicer is %d", priority); + + waitpid(pid, &stat, 0); + + *events = 0; + + T_QUIET; T_LOG("The return stat is %d", WEXITSTATUS(stat)); + T_EXPECT_EQ(WEXITSTATUS(stat), 47, "Temp owner boost did not work correctly with knotes"); + T_END; +} + +static void +workloop_cb_test_sync_bootstrap(uint64_t *workloop_id __unused, void **eventslist, int *events) +{ + static pid_t pid = 0; + int stat; + int priority; + static mach_port_t port = MACH_PORT_NULL; + struct kevent_qos_s *kev = *eventslist; + mach_msg_header_t *hdr = (mach_msg_header_t *)kev->ext[0]; + + T_LOG("Workloop handler workloop_cb_test_knote_kill called. "); + T_LOG("The total events returned is %d", *events); + + /* Check if called for peek */ + if (hdr == NULL) { + priority = get_pri(mach_thread_self()); + T_EXPECT_EQ(priority, 47, "Priority of servicer is %d", priority); + + port = (mach_port_t)kev->ident; + pid = posix_spawn_child_with_watch_ports("prioritize_process_launch_helper", "MSGSYNC", &port, 1); + } else { + /* Wait till the priority of servicer is 47 */ + T_LOG("Waiting for the servicer to be boosted"); + do { + sleep(1); + priority = get_pri(mach_thread_self()); + } while (priority != 47); + + T_EXPECT_EQ(priority, 47, "Priority of servicer is %d", priority); + + /* Get the reply port and send the receive right in it */ + mach_port_t reply_port = hdr->msgh_remote_port; + T_LOG("The rcv right to send is %d", port); + send(reply_port, MACH_PORT_NULL, port, 0, 0, MACH_MSG_TYPE_MOVE_SEND_ONCE); + + waitpid(pid, &stat, 0); + + /* The handler priority should not be boosted anymore */ + priority = get_pri(mach_thread_self()); + T_EXPECT_EQ(priority, 31, "Priority of servicer is %d", priority); + + T_QUIET; T_LOG("The return stat is %d", WEXITSTATUS(stat)); + T_EXPECT_EQ(WEXITSTATUS(stat), 31, "Temp owner boost did not work correctly with knotes"); + T_END; + } + *events = 0; +} + +static void +register_workloop_for_port( + mach_port_t port, + pthread_workqueue_function_workloop_t func, + unsigned int options) +{ + int r; + + /* register workloop handler with pthread */ + if (func != NULL) { + T_QUIET; T_ASSERT_POSIX_ZERO(_pthread_workqueue_init_with_workloop( + worker_cb, event_cb, + (pthread_workqueue_function_workloop_t)func, 0, 0), NULL); + } + + /* attach port to workloop */ + struct kevent_qos_s kev[] = {{ + .ident = port, + .filter = EVFILT_MACHPORT, + .flags = EV_ADD | EV_UDATA_SPECIFIC | EV_DISPATCH | EV_VANISHED, + .fflags = options, + .data = 1, + .qos = (int32_t)_pthread_qos_class_encode(QOS_CLASS_DEFAULT, 0, 0) + }}; + + struct kevent_qos_s kev_err[] = {{ 0 }}; + + /* Setup workloop for mach msg rcv */ + r = kevent_id(25, kev, 1, kev_err, 1, NULL, + NULL, KEVENT_FLAG_WORKLOOP | KEVENT_FLAG_ERROR_EVENTS); + + T_QUIET; T_ASSERT_POSIX_SUCCESS(r, "kevent_id"); + T_QUIET; T_ASSERT_EQ(r, 0, "no errors returned from kevent_id"); +} + +/* + * Test 1: Test turnstile boosting for temp owner ports for posix_spawn. + * + * Create a port with sync IPC push and then pass the port to posix_spawn as a watch port and + * test that spawned binary has the temp owner push of the port. + */ +T_DECL(posix_spawn_basic_priority, "Basic posix spawn temp owner priority test", T_META_ASROOT(YES)) +{ + mach_port_t port; + pid_t pid; + int stat; + + port = get_sync_push_port_at_qos(QOS_CLASS_USER_INTERACTIVE); + pid = posix_spawn_child_with_watch_ports("prioritize_process_launch_helper", "EXIT", &port, 1); + + waitpid(pid, &stat, 0); + + T_QUIET; T_LOG("The return stat is %d", WEXITSTATUS(stat)); + T_EXPECT_EQ(WEXITSTATUS(stat), 47, "spawn did not properly boost main thread"); + T_END; +} + +/* + * Test 2: Test turnstile boosting for temp owner ports for posix_spawn and exec. + * + * Create a port with sync IPC push and then pass the port to posix_spawn as a watch port and + * test that spawned binary has the temp owner push of the port. The spawned binary will exec + * and verify that it still has the push. + */ +T_DECL(posix_spawn_exec_basic_priority, "Basic posix spawn/exec temp owner priority test", T_META_ASROOT(YES)) +{ + mach_port_t port; + pid_t pid; + int stat; + + port = get_sync_push_port_at_qos(QOS_CLASS_USER_INTERACTIVE); + pid = posix_spawn_child_with_watch_ports("prioritize_process_launch_helper", "EXEC", &port, 1); + + waitpid(pid, &stat, 0); + + T_QUIET; T_LOG("The return stat is %d", WEXITSTATUS(stat)); + T_EXPECT_EQ(WEXITSTATUS(stat), 47, "spawn/exec did not properly boost main thread"); + T_END; +} + +/* + * Test 3: Test turnstile boosting for temp owner ports for posix_spawn and set exec. + * + * Create a port with sync IPC push and then pass the port to posix_spawn as a watch port and + * test that spawned binary has the temp owner push of the port. The spawned binary will + * posix_spawn set exec and verify that it still has the push. + */ +T_DECL(posix_spawn_set_exec_basic_priority, "Basic posix spawn set exec temp owner priority test", T_META_ASROOT(YES)) +{ + mach_port_t port; + pid_t pid; + int stat; + + port = get_sync_push_port_at_qos(QOS_CLASS_USER_INTERACTIVE); + pid = posix_spawn_child_with_watch_ports("prioritize_process_launch_helper", "SETEXEC", &port, 1); + + waitpid(pid, &stat, 0); + + T_QUIET; T_LOG("The return stat is %d", WEXITSTATUS(stat)); + T_EXPECT_EQ(WEXITSTATUS(stat), 47, "spawn set exec did not properly boost main thread"); + T_END; +} + +/* + * Test 4: Test turnstile boosting for temp owner ports for posix_spawn and set exec. + * + * Create a port with sync IPC push and then pass the port to posix_spawn as a watch port and + * test that spawned binary has the temp owner push of the port. The spawned binary already + * having the temp owner push will try to do set exec with watchports which should fail. + */ +T_DECL(posix_spawn_set_exec_with_more_ports, "posix spawn set exec with more watch ports", T_META_ASROOT(YES)) +{ + mach_port_t port; + pid_t pid; + int stat; + + port = get_sync_push_port_at_qos(QOS_CLASS_USER_INTERACTIVE); + pid = posix_spawn_child_with_watch_ports("prioritize_process_launch_helper", "SETEXEC_PORTS", &port, 1); + + waitpid(pid, &stat, 0); + + T_QUIET; T_LOG("The return stat is %d", WEXITSTATUS(stat)); + T_EXPECT_EQ(WEXITSTATUS(stat), EINVAL, "spawn set exec did not error out when watchports were passed to already boosted process"); + T_END; +} + +/* + * Test 5: Test turnstile boosting for temp owner ports for multiple posix_spawns. + * + * Create a port with sync IPC push and then pass the port to posix_spawn as a watch port, then + * pass the same port as a watchport to another posix_spawn and verify that the boost was + * transferred to the new process. + */ +T_DECL(posix_spawn_multiple, "multiple posix_spawn with same watchport", T_META_ASROOT(YES)) +{ + mach_port_t port; + pid_t pid1, pid2; + int stat1, stat2; + + port = get_sync_push_port_at_qos(QOS_CLASS_USER_INTERACTIVE); + pid1 = posix_spawn_child_with_watch_ports("prioritize_process_launch_helper", "WAIT", &port, 1); + + /* Let the child 1 execute a little, the sleep here is optional */ + sleep(2); + + pid2 = posix_spawn_child_with_watch_ports("prioritize_process_launch_helper", "EXIT", &port, 1); + + waitpid(pid2, &stat2, 0); + waitpid(pid1, &stat1, 0); + + T_QUIET; T_LOG("The return stat for child 1 is is %d", WEXITSTATUS(stat1)); + T_QUIET; T_LOG("The return stat for child 2 is is %d", WEXITSTATUS(stat2)); + T_EXPECT_EQ(WEXITSTATUS(stat2), 47, "spawn of multiple processes with same watchport did not transfer the boost correctly"); + T_EXPECT_EQ(WEXITSTATUS(stat1), 31, "spawn of multiple processes with same watchport did not transfer the boost correctly"); + T_END; +} + +/* + * Test 6: Test turnstile boosting for temp owner ports for posix_spawn for dead port. + * + * Create a port with sync IPC push and then pass the port to posix_spawn as a watch port and + * test that spawned binary has the temp owner push of the port. Destroy the port and verify + * the temp owner push has gone away. + */ +T_DECL(posix_spawn_dead_reply_port, "posix spawn with reply port destory", T_META_ASROOT(YES)) +{ + mach_port_t port; + kern_return_t kr; + pid_t pid; + int stat; + + port = get_sync_push_port_at_qos(QOS_CLASS_USER_INTERACTIVE); + pid = posix_spawn_child_with_watch_ports("prioritize_process_launch_helper", "WAIT", &port, 1); + + /* Let the child execute a little, the sleep here is optional */ + sleep(2); + + /* Destory the special reply port */ + kr = mach_port_mod_refs(mach_task_self(), sr_port, MACH_PORT_RIGHT_RECEIVE, -1); + T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "posix_spaw_dead_port mach_port_mod_refs"); + + waitpid(pid, &stat, 0); + + T_QUIET; T_LOG("The return stat is %d", WEXITSTATUS(stat)); + T_EXPECT_EQ(WEXITSTATUS(stat), 31, "Temp owner boost was not removed on port death"); + T_END; +} + +/* + * Test 7: Test turnstile boosting for temp owner ports for posix_spawn for dead port. + * + * Create a port with sync IPC push and then pass the port to posix_spawn as a watch port and + * test that spawned binary has the temp owner push of the port. Destroy the port and verify + * the temp owner push has gone. + */ +T_DECL(posix_spawn_dead_port, "posix spawn with port destory", T_META_ASROOT(YES)) +{ + mach_port_t port; + kern_return_t kr; + pid_t pid; + int stat; + + port = get_sync_push_port_at_qos(QOS_CLASS_USER_INTERACTIVE); + pid = posix_spawn_child_with_watch_ports("prioritize_process_launch_helper", "WAIT", &port, 1); + + /* Destory the port */ + kr = mach_port_mod_refs(mach_task_self(), port, MACH_PORT_RIGHT_RECEIVE, -1); + T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "posix_spaw_dead_port mach_port_mod_refs"); + + waitpid(pid, &stat, 0); + + T_QUIET; T_LOG("The return stat is %d", WEXITSTATUS(stat)); + T_EXPECT_EQ(WEXITSTATUS(stat), 31, "Temp owner boost was not removed on port death"); + T_END; +} + +/* + * Test 8: Test turnstile boosting for temp owner ports for posix_spawn when port is copied in. + * + * Create a port with sync IPC push and then pass the port to posix_spawn as a watch port and + * test that spawned binary has the temp owner push of the port. Copyin the port and verify + * the temp owner push has gone. + */ +T_DECL(posix_spawn_copyin_port, "posix spawn with copyin port", T_META_ASROOT(YES)) +{ + mach_port_t port; + pid_t pid; + int stat; + + port = get_sync_push_port_at_qos(QOS_CLASS_USER_INTERACTIVE); + pid = posix_spawn_child_with_watch_ports("prioritize_process_launch_helper", "WAIT", &port, 1); + + /* Let the child execute a little, the sleep here is optional */ + sleep(2); + + /* Copyin the port in another port */ + create_port_and_copyin_a_port(port); + + waitpid(pid, &stat, 0); + + T_QUIET; T_LOG("The return stat is %d", WEXITSTATUS(stat)); + T_EXPECT_EQ(WEXITSTATUS(stat), 31, "Temp owner boost was not removed on port copyin"); + T_END; +} + +/* + * Test 9: Test turnstile boosting for temp owner ports for posix_spawn with multiple ports. + * + * Create multiple ports with sync IPC push and then pass the port to posix_spawn as watch ports and + * test that spawned binary has the temp owner push of the ports. Copyin ports one by one and verify + * the push has gone. + */ +T_DECL(posix_spawn_multiple_port, "posix spawn with multiple ports", T_META_ASROOT(YES)) +{ + mach_port_t port[2]; + pid_t pid; + int stat; + + port[0] = get_sync_push_port_at_qos(QOS_CLASS_USER_INTERACTIVE); + port[1] = get_sync_push_port_at_qos(QOS_CLASS_USER_INITIATED); + pid = posix_spawn_child_with_watch_ports("prioritize_process_launch_helper", "MULTIWAIT", port, 2); + + /* Let the child execute a little, the sleep here is optional */ + sleep(2); + + /* Copyin the port in another port */ + create_port_and_copyin_a_port(port[0]); + + /* Let the child execute a little, the sleep here is optional */ + sleep(2); + + /* Copyin the port in another port */ + create_port_and_copyin_a_port(port[1]); + + waitpid(pid, &stat, 0); + + T_QUIET; T_LOG("The return stat is %d", WEXITSTATUS(stat)); + T_EXPECT_EQ(WEXITSTATUS(stat), 31, "Temp owner boost did not work correctly with multiple ports"); + T_END; +} + +/* + * Test 10: Test turnstile boosting for temp owner ports for posix_spawn when port attached to a knote. + * + * Create a port with sync IPC push attach a workloop knote to it, send a message on the port, then in the + * servicer pass the port to posix_spawn as a watch port and test that spawned binary has the temp owner + * push of the port and the servicer looses the boost. + */ +T_DECL(posix_spawn_knote, "posix spawn with temp owner port attached to knote", T_META_ASROOT(YES)) +{ + mach_port_t port; + + port = get_sync_push_port_at_qos(QOS_CLASS_USER_INTERACTIVE); + + /* attach port to a workloop */ + register_workloop_for_port(port, workloop_cb_test_intransit, MACH_RCV_OPTIONS); + + /* send a message on port to activate workloop handler */ + send(port, MACH_PORT_NULL, MACH_PORT_NULL, QOS_CLASS_DEFAULT, 0, MACH_MSG_TYPE_COPY_SEND); + sigsuspend(0); +} + +/* + * Test 11: Test turnstile boosting for temp owner ports for posix_spawn when port attached to a knote. + * + * Create a port with sync IPC push attach a workloop knote to it, send a message on the port, then in the + * servicer pass the port to posix_spawn as a watch port and test that spawned binary has the temp owner + * push of the port and the servicer looses the boost, verify that once the spawned binary dies, the servicer + * gets the push. + */ +T_DECL(posix_spawn_knote_ret, "posix spawn with temp owner port attached to knote with spawned binary dead", T_META_ASROOT(YES)) +{ + mach_port_t port; + + port = get_sync_push_port_at_qos(QOS_CLASS_USER_INTERACTIVE); + + register_workloop_for_port(port, workloop_cb_test_knote_kill, MACH_RCV_OPTIONS); + + /* send a message on port to activate workloop handler */ + send(port, MACH_PORT_NULL, MACH_PORT_NULL, QOS_CLASS_DEFAULT, 0, MACH_MSG_TYPE_COPY_SEND); + sigsuspend(0); +} + +/* + * Test 12: Test turnstile boosting for temp owner ports and mach msg option for sync bootstrap_checkin. + * + * Create a port with sync IPC push attach a workloop knote to it, send a message on the port, then in the + * servicer pass the port to posix_spawn as a watch port and test that spawned binary has the temp owner + * push of the port and the servicer looses the boost, the spawn binary then does a sync bootstrap_checkin + * with test binary to get the receive right and verify that is still has the boost. + */ +T_DECL(mach_msg_sync_boostrap_checkin, "test mach msg option for sync bootstrap_checkin", T_META_ASROOT(YES)) +{ + mach_port_t port; + mach_port_t sync_port; + kern_return_t kr; + + port = get_sync_push_port_at_qos(QOS_CLASS_USER_INTERACTIVE); + + register_workloop_for_port(port, workloop_cb_test_sync_bootstrap, MACH_RCV_SYNC_PEEK); + + /* Create a mach port for spawned binary to do bootstrap checkin */ + kr = mach_port_allocate(mach_task_self(), + MACH_PORT_RIGHT_RECEIVE, + &sync_port); + + T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "mach_msg_sync_boostrap_checkin mach_port_allocate"); + + kr = mach_port_insert_right(mach_task_self(), + sync_port, + sync_port, + MACH_MSG_TYPE_MAKE_SEND); + + T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "mach_msg_sync_boostrap_checkin mach_port_insert_right"); + + kr = mach_port_mod_refs(mach_task_self(), sync_port, MACH_PORT_RIGHT_SEND, 1); + T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "mach_msg_sync_boostrap_checkin mach_port_mod_refs"); + + register_workloop_for_port(sync_port, NULL, MACH_RCV_OPTIONS); + + /* Stash the port in task to make sure child also gets it */ + kr = mach_ports_register(mach_task_self(), &sync_port, 1); + T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "mach_msg_sync_boostrap_checkin mach_ports_register"); + + /* send a message on port to activate workloop handler */ + send(port, MACH_PORT_NULL, MACH_PORT_NULL, QOS_CLASS_DEFAULT, 0, MACH_MSG_TYPE_COPY_SEND); + sigsuspend(0); +} diff --git a/tests/prioritize_process_launch_helper.c b/tests/prioritize_process_launch_helper.c new file mode 100644 index 000000000..f190e6253 --- /dev/null +++ b/tests/prioritize_process_launch_helper.c @@ -0,0 +1,335 @@ +/* + * prioritize process launch: Tests prioritized process launch across posix spawn and exec. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +mach_port_t +receive( + mach_port_t rcv_port, + mach_port_t notify_port); + +static int +get_pri(thread_t thread_port) +{ + kern_return_t kr; + + thread_extended_info_data_t extended_info; + mach_msg_type_number_t count = THREAD_EXTENDED_INFO_COUNT; + kr = thread_info(thread_port, THREAD_EXTENDED_INFO, + (thread_info_t)&extended_info, &count); + + if (kr != KERN_SUCCESS) { + printf("thread info failed to get current priority of the thread\n"); + } + return extended_info.pth_curpri; +} + +static void +set_thread_name(const char *fn_name) +{ + char name[50] = ""; + + thread_t thread_port = pthread_mach_thread_np(pthread_self()); + + int pri = get_pri(thread_port); + + snprintf(name, sizeof(name), "%s at pri %2d", fn_name, pri); + pthread_setname_np(name); +} + +static void +send( + mach_port_t send_port, + mach_port_t reply_port, + mach_port_t msg_port, + mach_msg_option_t options, + int send_disposition) +{ + kern_return_t ret = 0; + + struct { + mach_msg_header_t header; + mach_msg_body_t body; + mach_msg_port_descriptor_t port_descriptor; + } send_msg = { + .header = { + .msgh_remote_port = send_port, + .msgh_local_port = reply_port, + .msgh_bits = MACH_MSGH_BITS_SET(send_disposition, + reply_port ? MACH_MSG_TYPE_MAKE_SEND_ONCE : 0, + MACH_MSG_TYPE_MOVE_SEND, + MACH_MSGH_BITS_COMPLEX), + .msgh_id = 0x100, + .msgh_size = sizeof(send_msg), + }, + .body = { + .msgh_descriptor_count = 1, + }, + .port_descriptor = { + .name = msg_port, + .disposition = MACH_MSG_TYPE_MOVE_RECEIVE, + .type = MACH_MSG_PORT_DESCRIPTOR, + }, + }; + + if (msg_port == MACH_PORT_NULL) { + send_msg.body.msgh_descriptor_count = 0; + } + + ret = mach_msg(&(send_msg.header), + MACH_SEND_MSG | + MACH_SEND_TIMEOUT | + MACH_SEND_OVERRIDE | + ((reply_port ? MACH_SEND_SYNC_OVERRIDE : 0) | options), + send_msg.header.msgh_size, + 0, + MACH_PORT_NULL, + 10000, + 0); + + if (ret != KERN_SUCCESS) { + printf("mach_msg_send failed with error %d\n", ret); + } +} + +mach_port_t +receive( + mach_port_t rcv_port, + mach_port_t notify_port) +{ + kern_return_t ret = 0; + mach_port_t service_port; + + struct { + mach_msg_header_t header; + mach_msg_body_t body; + mach_msg_port_descriptor_t port_descriptor; + mach_msg_trailer_t trailer; + } rcv_msg = { + .header = + { + .msgh_remote_port = MACH_PORT_NULL, + .msgh_local_port = rcv_port, + .msgh_size = sizeof(rcv_msg), + }, + }; + + printf("Client: Starting sync receive\n"); + + ret = mach_msg(&(rcv_msg.header), + MACH_RCV_MSG | MACH_RCV_LARGE | + (notify_port ? MACH_RCV_SYNC_WAIT : 0), + 0, + rcv_msg.header.msgh_size, + rcv_port, + 0, + notify_port); + + printf("mach msg rcv returned %d\n", ret); + + + if (rcv_msg.body.msgh_descriptor_count != 1) { + if (notify_port) { + printf("Did not receive a service port in mach msg %d\n", rcv_msg.body.msgh_descriptor_count); + } + return MACH_PORT_NULL; + } + + service_port = rcv_msg.port_descriptor.name; + return service_port; +} + +int +main(int argc __attribute__((unused)), char *argv[]) +{ + int priority; + set_thread_name(__FUNCTION__); + + /* Check for priority */ + priority = get_pri(mach_thread_self()); + printf("The priority of child is %d\n", priority); + + if (strcmp(argv[1], "EXIT") == 0) { + printf("Helper process exiting\n"); + exit(priority); + } else if (strcmp(argv[1], "EXEC") == 0) { + int ret; + + printf("Helper process execing\n"); + /* exec the same binary with EXIT arg */ + char *binary = "prioritize_process_launch_helper"; + char *new_argv[] = {binary, "EXIT", NULL}; + ret = execve(binary, new_argv, NULL); + exit(ret); + } else if (strcmp(argv[1], "SETEXEC") == 0) { + int ret; + int child_pid; + posix_spawnattr_t attr; + + ret = posix_spawnattr_init(&attr); + if (ret != 0) { + printf("posix_spawnattr_init failed \n"); + exit(ret); + } + ret = posix_spawnattr_setflags(&attr, POSIX_SPAWN_SETEXEC); + if (ret != 0) { + printf("posix_spawnattr_setflags failed \n"); + exit(ret); + } + + printf("Helper process doing posix_spawn set_exec\n"); + /* set exec the same binary with EXIT arg */ + char *binary = "prioritize_process_launch_helper"; + char *new_argv[] = {binary, "EXIT", NULL}; + + ret = posix_spawn(&child_pid, binary, NULL, &attr, new_argv, NULL); + exit(ret); + } else if (strcmp(argv[1], "SETEXEC_PORTS") == 0) { + int ret; + int child_pid; + posix_spawnattr_t attr; + mach_port_t port; + + kern_return_t kr = mach_port_allocate(mach_task_self(), MACH_PORT_RIGHT_RECEIVE, &port); + if (kr != KERN_SUCCESS) { + printf("mach_port_allocate failed with error %d\n", kr); + exit(kr); + } + + kr = mach_port_insert_right(mach_task_self(), port, port, MACH_MSG_TYPE_MAKE_SEND); + if (kr != KERN_SUCCESS) { + printf("mach_port_insert_right failed with error %d\n", kr); + exit(kr); + } + + ret = posix_spawnattr_init(&attr); + if (ret != 0) { + printf("posix_spawnattr_init failed \n"); + exit(ret); + } + + ret = posix_spawnattr_setflags(&attr, POSIX_SPAWN_SETEXEC); + if (ret != 0) { + printf("posix_spawnattr_setflags failed \n"); + exit(ret); + } + + ret = posix_spawnattr_set_importancewatch_port_np(&attr, 1, &port); + if (ret != 0) { + printf("posix_spawnattr_set_importance_port_np failed \n"); + exit(ret); + } + + printf("Helper process doing posix_spawn set_exec\n"); + /* set exec the same binary with EXIT arg */ + char *binary = "prioritize_process_launch_helper"; + char *new_argv[] = {binary, "EXIT", NULL}; + + ret = posix_spawn(&child_pid, binary, NULL, &attr, new_argv, NULL); + printf("spawned failed with error %d\n", ret); + exit(ret); + } else if (strcmp(argv[1], "WAIT") == 0) { + do { + sleep(1); + priority = get_pri(mach_thread_self()); + } while (priority == 47); + exit(priority); + } else if (strcmp(argv[1], "MULTIWAIT") == 0) { + do { + sleep(1); + priority = get_pri(mach_thread_self()); + } while (priority == 47); + printf("The priority came down to %d\n", priority); + do { + sleep(1); + priority = get_pri(mach_thread_self()); + } while (priority == 37); + printf("The priority came down to %d\n", priority); + exit(priority); + } else if (strcmp(argv[1], "MSGSYNC") == 0) { + int ret_val = 31; + mach_port_array_t port_array = NULL; + unsigned int portCnt = 0; + mach_port_t send_port; + mach_port_t special_reply_port; + mach_port_t service_port; + kern_return_t kr; + + priority = get_pri(mach_thread_self()); + printf("The priority of spawned binary is to %d\n", priority); + if (priority != 47) { + ret_val = 0; + } + + /* Get the stashed send right using mach_ports_lookup */ + kr = mach_ports_lookup(mach_task_self(), &port_array, &portCnt); + if (kr != KERN_SUCCESS) { + printf("mach_ports_lookup failed with return value %d and port count %d\n", kr, portCnt); + exit(0); + } + + send_port = port_array[0]; + special_reply_port = thread_get_special_reply_port(); + if (!MACH_PORT_VALID(special_reply_port)) { + printf("Failed to special reply port for thread\n"); + exit(0); + } + + /* Perform a Sync bootstrap checkin */ + send(send_port, special_reply_port, MACH_PORT_NULL, MACH_SEND_SYNC_BOOTSTRAP_CHECKIN, MACH_MSG_TYPE_COPY_SEND); + sleep(2); + + /* Make sure we are still boosted */ + priority = get_pri(mach_thread_self()); + printf("The priority of spawned binary is to %d\n", priority); + if (priority != 47) { + ret_val = 0; + } + + /* Receive the service port */ + service_port = receive(special_reply_port, send_port); + + /* Make sure we are still boosted */ + priority = get_pri(mach_thread_self()); + printf("The priority of spawned binary is to %d\n", priority); + if (priority != 47) { + ret_val = 0; + } + + /* Try to receive on service port */ + receive(service_port, MACH_PORT_NULL); + + /* Make sure we are no longer boosted */ + priority = get_pri(mach_thread_self()); + printf("The priority of spawned binary is to %d\n", priority); + if (priority != 31) { + ret_val = 0; + } + exit(ret_val); + } + + exit(0); +} diff --git a/tests/prng.c b/tests/prng.c new file mode 100644 index 000000000..18b6ee869 --- /dev/null +++ b/tests/prng.c @@ -0,0 +1,81 @@ +#include +#include +#include +#include + +T_GLOBAL_META(T_META_RUN_CONCURRENTLY(true)); + +#define BUF_SIZE ((size_t)(1 << 25)) +#define BLOCK_SIZE ((size_t)16) + +static int +cmp(const void *a, const void *b) +{ + return memcmp(a, b, 16); +} + +static void +prng_sanitycheck(uint8_t *buf, size_t buf_size) +{ + size_t nblocks = buf_size / BLOCK_SIZE; + qsort(buf, nblocks, BLOCK_SIZE, cmp); + + for (size_t i = 0; i < nblocks - 1; i += 1) { + T_QUIET; + T_ASSERT_NE(memcmp(buf, buf + BLOCK_SIZE, BLOCK_SIZE), 0, "duplicate block"); + buf += BLOCK_SIZE; + } +} + +static void +prng_getentropy(void *ctx, size_t i) +{ + uint8_t *buf = ((uint8_t *)ctx) + (BUF_SIZE * i); + + for (size_t j = 0; j < BUF_SIZE; j += 256) { + T_QUIET; + T_ASSERT_POSIX_SUCCESS(getentropy(&buf[j], 256), "getentropy"); + } + + prng_sanitycheck(buf, BUF_SIZE); +} + +static void +prng_devrandom(void *ctx, size_t i) +{ + uint8_t *buf = ((uint8_t *)ctx) + (BUF_SIZE * i); + + int fd = open("/dev/random", O_RDONLY); + T_QUIET; + T_ASSERT_POSIX_SUCCESS(fd, "open"); + + size_t n = BUF_SIZE; + while (n > 0) { + ssize_t m = read(fd, buf, n); + T_QUIET; + T_ASSERT_POSIX_SUCCESS(m, "read"); + + n -= (size_t)m; + buf += m; + } + + buf = ((uint8_t *)ctx) + (BUF_SIZE * i); + prng_sanitycheck(buf, BUF_SIZE); +} + +T_DECL(prng, "prng test") +{ + size_t ncpu = (size_t)dt_ncpu(); + + uint8_t *buf = malloc(BUF_SIZE * ncpu); + T_QUIET; + T_ASSERT_NOTNULL(buf, "malloc"); + + dispatch_apply_f(ncpu, DISPATCH_APPLY_AUTO, buf, prng_getentropy); + + dispatch_apply_f(ncpu, DISPATCH_APPLY_AUTO, buf, prng_devrandom); + + prng_sanitycheck(buf, BUF_SIZE * ncpu); + + free(buf); +} diff --git a/tests/proc_info.c b/tests/proc_info.c index 8502f2d87..51206a2f4 100644 --- a/tests/proc_info.c +++ b/tests/proc_info.c @@ -6,6 +6,7 @@ #include #include #include +#include #include #include #include @@ -28,6 +29,8 @@ #include #undef PRIVATE +T_GLOBAL_META(T_META_RUN_CONCURRENTLY(true)); + #define ACT_CHANGE_UID 1 #define ACT_CHANGE_RUID 2 #define ACT_EXIT 127 @@ -732,8 +735,7 @@ free_proc_info(void ** proc_info, int num) T_DECL(proc_info_listpids_all_pids, "proc_info API test to verify PROC_INFO_CALL_LISTPIDS", - T_META_ASROOT(true), - T_META_LTEPHASE(LTE_POSTINIT)) + T_META_ASROOT(true)) { /* * Get the value of nprocs with no buffer sent in @@ -800,8 +802,7 @@ T_DECL(proc_info_listpids_all_pids, T_DECL(proc_info_listpids_pgrp_only, "proc_info API test to verify PROC_INFO_CALL_LISTPIDS", - T_META_ASROOT(true), - T_META_LTEPHASE(LTE_POSTINIT)) + T_META_ASROOT(true)) { proc_config_t proc_config = spawn_child_processes(CONF_PROC_COUNT, proc_info_listpids_handler); T_LOG("Test to verify PROC_PGRP_ONLY returns correct value"); @@ -823,8 +824,7 @@ T_DECL(proc_info_listpids_pgrp_only, T_DECL(proc_info_listpids_ppid_only, "proc_info API test to verify PROC_INFO_CALL_LISTPIDS", - T_META_ASROOT(true), - T_META_LTEPHASE(LTE_POSTINIT)) + T_META_ASROOT(true)) { proc_config_t proc_config = spawn_child_processes(CONF_PROC_COUNT, proc_info_listpids_handler); T_LOG("Test to verify PROC_PPID_ONLY returns correct value"); @@ -844,8 +844,7 @@ T_DECL(proc_info_listpids_ppid_only, T_DECL(proc_info_listpids_uid_only, "proc_info API test to verify PROC_INFO_CALL_LISTPIDS", - T_META_ASROOT(true), - T_META_LTEPHASE(LTE_POSTINIT)) + T_META_ASROOT(true)) { proc_config_t proc_config = spawn_child_processes(CONF_PROC_COUNT, proc_info_listpids_handler); T_LOG("Test to verify PROC_UID_ONLY returns correct value"); @@ -864,8 +863,7 @@ T_DECL(proc_info_listpids_uid_only, T_DECL(proc_info_listpids_ruid_only, "proc_info API test to verify PROC_INFO_CALL_LISTPIDS", - T_META_ASROOT(true), - T_META_LTEPHASE(LTE_POSTINIT)) + T_META_ASROOT(true)) { proc_config_t proc_config = spawn_child_processes(CONF_PROC_COUNT, proc_info_listpids_handler); T_LOG("Test to verify PROC_RUID_ONLY returns correct value"); @@ -884,8 +882,7 @@ T_DECL(proc_info_listpids_ruid_only, T_DECL(proc_info_listpids_tty_only, "proc_info API test to verify PROC_INFO_CALL_LISTPIDS", - T_META_ASROOT(true), - T_META_LTEPHASE(LTE_POSTINIT)) + T_META_ASROOT(true)) { int ret = isatty(STDOUT_FILENO); if (ret != 1) { @@ -915,8 +912,7 @@ T_DECL(proc_info_listpids_tty_only, T_DECL(proc_info_pidinfo_proc_piduniqidentifierinfo, "Test to identify PROC_PIDUNIQIDENTIFIERINFO returns correct unique identifiers for process", - T_META_ASROOT(true), - T_META_LTEPHASE(LTE_POSTINIT)) + T_META_ASROOT(true)) { void * proc_info[2]; proc_info_caller(P_UNIQIDINFO | C_UNIQIDINFO, proc_info, NULL); @@ -936,8 +932,7 @@ T_DECL(proc_info_pidinfo_proc_piduniqidentifierinfo, T_DECL(proc_info_pidinfo_proc_pidtbsdinfo, "Test to verify PROC_PIDTBSDINFO returns valid information about the process", - T_META_ASROOT(true), - T_META_LTEPHASE(LTE_POSTINIT)) + T_META_ASROOT(true)) { void * proc_info[2]; int child_pid = 0; @@ -969,8 +964,7 @@ T_DECL(proc_info_pidinfo_proc_pidtbsdinfo, T_DECL(proc_info_pidt_shortbsdinfo, "Test to verify PROC_PIDT_SHORTBSDINFO returns valid information about the process", - T_META_ASROOT(true), - T_META_LTEPHASE(LTE_POSTINIT)) + T_META_ASROOT(true)) { void * proc_info[2]; int child_pid = 0; @@ -999,8 +993,7 @@ T_DECL(proc_info_pidt_shortbsdinfo, T_DECL(proc_info_pidt_bsdinfowithuniqid, "Test to verify PROC_PIDT_BSDINFOWITHUNIQID returns valid information about the process", - T_META_ASROOT(true), - T_META_LTEPHASE(LTE_POSTINIT)) + T_META_ASROOT(true)) { void * proc_info[4]; int child_pid = 0; @@ -1044,8 +1037,7 @@ T_DECL(proc_info_pidt_bsdinfowithuniqid, T_DECL(proc_info_proc_pidtask_info, "Test to verify PROC_PIDTASKINFO returns valid information about the process", - T_META_ASROOT(true), - T_META_LTEPHASE(LTE_POSTINIT)) + T_META_ASROOT(true)) { void * proc_info[2]; proc_info_caller(P_TASK_INFO | P_TASK_INFO_NEW, proc_info, NULL); @@ -1102,8 +1094,7 @@ T_DECL(proc_info_proc_pidtask_info, T_DECL(proc_info_proc_pidtaskallinfo, "Test to verify PROC_PIDTASKALLINFO returns valid information about the process", - T_META_ASROOT(true), - T_META_LTEPHASE(LTE_POSTINIT)) + T_META_ASROOT(true)) { void * proc_info[4]; int child_pid = 0; @@ -1180,8 +1171,7 @@ T_DECL(proc_info_proc_pidtaskallinfo, T_DECL(proc_info_proc_pidlistthreads, "Test to verify PROC_PIDLISTTHREADS returns valid information about process", - T_META_ASROOT(true), - T_META_LTEPHASE(LTE_POSTINIT)) + T_META_ASROOT(true)) { void * proc_info[1]; proc_info_caller(THREAD_ADDR, proc_info, NULL); @@ -1189,8 +1179,7 @@ T_DECL(proc_info_proc_pidlistthreads, T_DECL(proc_info_proc_pidthreadinfo, "Test to verify PROC_PIDTHREADINFO returns valid information about the process", - T_META_ASROOT(true), - T_META_LTEPHASE(LTE_POSTINIT)) + T_META_ASROOT(true)) { void * proc_info[2]; int child_pid = 0; @@ -1228,8 +1217,7 @@ T_DECL(proc_info_proc_pidthreadinfo, T_DECL(proc_info_proc_threadid64info, "Test to verify PROC_PIDTHREADID64INFO returns valid information about the process", - T_META_ASROOT(true), - T_META_LTEPHASE(LTE_POSTINIT)) + T_META_ASROOT(true)) { void * proc_info[2]; proc_info_caller(PTHINFO | PTHINFO_64, proc_info, NULL); @@ -1257,8 +1245,7 @@ T_DECL(proc_info_proc_threadid64info, T_DECL(proc_info_proc_pidthreadpathinfo, "Test to verify PROC_PIDTHREADPATHINFO returns valid information about the process", - T_META_ASROOT(true), - T_META_LTEPHASE(LTE_POSTINIT)) + T_META_ASROOT(true)) { void * proc_info[2]; proc_info_caller(PTHINFO | PINFO_PATH, proc_info, NULL); @@ -1289,8 +1276,7 @@ T_DECL(proc_info_proc_pidthreadpathinfo, T_DECL(proc_info_proc_pidarchinfo, "Test to verify PROC_PIDARCHINFO returns valid information about the process", - T_META_ASROOT(true), - T_META_LTEPHASE(LTE_POSTINIT)) + T_META_ASROOT(true)) { void * proc_info[1]; proc_info_caller(PAI, proc_info, NULL); @@ -1312,8 +1298,7 @@ T_DECL(proc_info_proc_pidarchinfo, T_DECL(proc_info_proc_pidregioninfo, "Test to verify PROC_PIDREGIONINFO returns valid information about the process", - T_META_ASROOT(true), - T_META_LTEPHASE(LTE_POSTINIT)) + T_META_ASROOT(true)) { void * proc_info[3]; proc_info_caller(PREGINFO, proc_info, NULL); @@ -1363,8 +1348,7 @@ T_DECL(proc_info_proc_pidregioninfo, T_DECL(proc_info_proc_pidregionpathinfo, "Test to verify PROC_PIDREGIONPATHINFO returns valid information about the process", - T_META_ASROOT(true), - T_META_LTEPHASE(LTE_INSTALLEDUSEROS)) + T_META_ASROOT(true)) { void * proc_info[3]; proc_info_caller(PREGINFO_PATH, proc_info, NULL); @@ -1451,8 +1435,7 @@ T_DECL(proc_info_proc_pidregionpathinfo, T_DECL(proc_info_proc_pidregionpathinfo2, "Test to verify PROC_PIDREGIONPATHINFO2 returns valid information about the process", - T_META_ASROOT(true), - T_META_LTEPHASE(LTE_INSTALLEDUSEROS)) + T_META_ASROOT(true)) { void * proc_info[3]; proc_info_caller(PREGINFO_PATH_2, proc_info, NULL); @@ -1544,8 +1527,7 @@ T_DECL(proc_info_proc_pidregionpathinfo2, T_DECL(proc_info_proc_pidregionpathinfo3, "Test to verify PROC_PIDREGIONPATHINFO3 returns valid information about the process", - T_META_ASROOT(true), - T_META_LTEPHASE(LTE_INSTALLEDUSEROS)) + T_META_ASROOT(true)) { void * proc_info[5]; proc_info_caller(PREGINFO_PATH_3, proc_info, NULL); @@ -1569,8 +1551,7 @@ T_DECL(proc_info_proc_pidregionpathinfo3, T_DECL(proc_info_proc_pidvnodepathinfo, "Test to verify PROC_PIDVNODEPATHINFO returns valid information about the process", - T_META_ASROOT(true), - T_META_LTEPHASE(LTE_POSTINIT)) + T_META_ASROOT(true)) { void * proc_info[1]; proc_info_caller(PVNINFO, proc_info, NULL); @@ -1605,8 +1586,7 @@ T_DECL(proc_info_proc_pidvnodepathinfo, T_DECL(proc_info_pidinfo_proc_pidlistfds, "proc_info API tests to verify PROC_INFO_CALL_PIDINFO/PROC_PIDLISTFDS", - T_META_ASROOT(true), - T_META_LTEPHASE(LTE_POSTINIT)) + T_META_ASROOT(true)) { int retval; int orig_nfiles = 0; @@ -1654,8 +1634,7 @@ T_DECL(proc_info_pidinfo_proc_pidlistfds, T_DECL(proc_info_proc_pidpathinfo, "Test to verify PROC_PIDPATHINFO returns valid information about the process", - T_META_ASROOT(true), - T_META_LTEPHASE(LTE_POSTINIT)) + T_META_ASROOT(true)) { char * pid_path = NULL; pid_path = malloc(sizeof(char) * PROC_PIDPATHINFO_MAXSIZE); @@ -1671,8 +1650,7 @@ T_DECL(proc_info_proc_pidpathinfo, T_DECL(proc_info_proc_pidlistfileports, "Test to verify PROC_PIDLISTFILEPORTS returns valid information about the process", - T_META_ASROOT(true), - T_META_LTEPHASE(LTE_POSTINIT)) + T_META_ASROOT(true)) { struct proc_fileportinfo * fileport_info = NULL; mach_port_t tmp_file_port = MACH_PORT_NULL; @@ -1723,8 +1701,7 @@ T_DECL(proc_info_proc_pidlistfileports, T_DECL(proc_info_proc_pidcoalitioninfo, "Test to verify PROC_PIDCOALITIONINFO returns valid information about the process", - T_META_ASROOT(true), - T_META_LTEPHASE(LTE_POSTINIT)) + T_META_ASROOT(true)) { proc_config_t proc_config = spawn_child_processes(1, proc_info_call_pidinfo_handler); int child_pid = proc_config->child_pids[0]; @@ -1751,8 +1728,7 @@ T_DECL(proc_info_proc_pidcoalitioninfo, T_DECL(proc_info_proc_pidworkqueueinfo, "Test to verify PROC_PIDWORKQUEUEINFO returns valid information about the process", - T_META_ASROOT(true), - T_META_LTEPHASE(LTE_POSTINIT)) + T_META_ASROOT(true)) { proc_config_t proc_config = spawn_child_processes(1, proc_info_call_pidinfo_handler); int child_pid = proc_config->child_pids[0]; @@ -1778,8 +1754,7 @@ T_DECL(proc_info_proc_pidworkqueueinfo, } T_DECL(proc_info_proc_pidnoteexit, "Test to verify PROC_PIDNOTEEXIT returns valid information about the process", - T_META_ASROOT(true), - T_META_LTEPHASE(LTE_POSTINIT)) + T_META_ASROOT(true)) { /* * Ask the child to close pipe and quit, cleanup pipes for parent @@ -1800,8 +1775,7 @@ T_DECL(proc_info_proc_pidnoteexit, T_DECL(proc_info_negative_tests, "Test to validate PROC_INFO_CALL_PIDINFO for invalid arguments", - T_META_ASROOT(true), - T_META_LTEPHASE(LTE_POSTINIT)) + T_META_ASROOT(true)) { proc_config_t proc_config = spawn_child_processes(1, proc_info_call_pidinfo_handler); int child_pid = proc_config->child_pids[0]; @@ -2073,7 +2047,8 @@ T_DECL(dynamic_kqueue_extended_info, "the kernel should report valid extended dy #pragma mark proc_listpids -T_DECL(list_kdebug_pids, "the kernel should report processes that are filtered by kdebug", T_META_ASROOT(YES)) +T_DECL(list_kdebug_pids, "the kernel should report processes that are filtered by kdebug", + T_META_ASROOT(YES), T_META_RUN_CONCURRENTLY(false)) { int mib[4] = {CTL_KERN, KERN_KDEBUG}; int npids; @@ -2118,3 +2093,67 @@ T_DECL(list_kdebug_pids, "the kernel should report processes that are filtered b T_QUIET; T_ASSERT_POSIX_SUCCESS(ret, "KERN_KDREMOVE sysctl"); } + +#pragma mark misc + +static int prf_fd; +static char prf_path[PATH_MAX]; +static void +prf_end(void) +{ + close(prf_fd); + unlink(prf_path); +} + +T_DECL(proc_regionfilename, "proc_regionfilename() should work") +{ + static char expected[] = "'very rigorous maritime engineering standards' && the front fell off"; + static char real[sizeof(expected)]; + int rc; + void *addr; + + prf_fd = CONF_TMP_FILE_OPEN(prf_path); + T_ATEND(prf_end); + + rc = (int) write(prf_fd, expected, sizeof(expected)); + T_ASSERT_POSIX_SUCCESS(rc, "write to tmpfile"); + + addr = mmap(0, 0x1000, PROT_READ, MAP_PRIVATE, prf_fd, 0); + T_WITH_ERRNO; + T_ASSERT_NE_PTR(addr, MAP_FAILED, "mmap of tmpfile"); + + T_WITH_ERRNO; + T_ASSERT_GT(proc_regionfilename(getpid(), (uint64_t) addr, real, MAXPATHLEN), 0, "proc_regionfilename"); + T_EXPECT_EQ_STR(basename(prf_path), basename(real), "filename"); +} + +T_DECL(proc_regionpath, "PROC_PIDREGIONPATH should return addr, length and path") +{ + int rc; + struct proc_regionpath path; + static char some_text[] = "'very rigorous maritime engineering standards' && the front fell off"; + unsigned long rounded_length = (sizeof(some_text) & (unsigned long) ~(PAGE_SIZE - 1)) + PAGE_SIZE; + void *addr; + + prf_fd = CONF_TMP_FILE_OPEN(prf_path); + T_ATEND(prf_end); + + rc = (int) write(prf_fd, some_text, sizeof(some_text)); + T_ASSERT_POSIX_SUCCESS(rc, "write to tmpfile"); + + addr = mmap(0, PAGE_SIZE, PROT_READ, MAP_PRIVATE, prf_fd, 0); + T_WITH_ERRNO; + T_ASSERT_NE_PTR(addr, MAP_FAILED, "mmap of tmpfile"); + + rc = proc_pidinfo(getpid(), PROC_PIDREGIONPATH, (uint64_t)addr, &path, sizeof(struct proc_regionpath)); + T_ASSERT_POSIX_SUCCESS(rc, "proc_pidinfo"); + + T_ASSERT_EQ((unsigned long) path.prpo_regionlength, rounded_length, "regionlength must match"); + T_ASSERT_EQ_PTR((void *) path.prpo_addr, addr, "addr must match"); + + rc = proc_pidinfo(getpid(), PROC_PIDREGIONPATH, (uint64_t)((char *) addr + 20), &path, sizeof(struct proc_regionpath)); + T_ASSERT_POSIX_SUCCESS(rc, "proc_pidinfo 20 bytes past the base address"); + + T_ASSERT_EQ((unsigned long) path.prpo_regionlength, rounded_length, "regionlength must match, even when 20 bytes past the base address"); + T_ASSERT_EQ_PTR((void *) path.prpo_addr, addr, "addr must match, even when 20 bytes past the base address"); +} diff --git a/tests/proc_info_44873309.c b/tests/proc_info_44873309.c new file mode 100644 index 000000000..cdd2bfc5b --- /dev/null +++ b/tests/proc_info_44873309.c @@ -0,0 +1,39 @@ +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +T_GLOBAL_META(T_META_RUN_CONCURRENTLY(true)); + +T_DECL(proc_info_44873309, "ensure new proc_pidinfo flavor returns correct table sizes", + T_META_CHECK_LEAKS(false), T_META_ASROOT(true)) +{ + mach_port_t port; + int retval; + + pid_t pid = getpid(); + struct proc_ipctableinfo table_info = {}; + retval = proc_pidinfo(pid, PROC_PIDIPCTABLEINFO, 0, (void *)&table_info, (uint32_t)sizeof(table_info)); + T_WITH_ERRNO; T_EXPECT_GT(retval, 0, "proc_pidinfo(PROC_PIDIPCTABLEINFO) returned %d", retval); + T_EXPECT_EQ(retval, (int)sizeof(table_info), "proc_pidinfo(PROC_PIDIPCTABLEINFO) table_size = %u, table_free = %u", + table_info.table_size, table_info.table_free); + + kern_return_t ret = mach_port_allocate(mach_task_self(), MACH_PORT_RIGHT_RECEIVE, &port); + T_ASSERT_MACH_SUCCESS(ret, "mach_port_allocate MACH_PORT_RIGHT_RECEIVE"); + + struct proc_ipctableinfo table_info2 = {}; + retval = proc_pidinfo(pid, PROC_PIDIPCTABLEINFO, 0, (void *)&table_info2, (uint32_t)sizeof(table_info2)); + T_WITH_ERRNO; T_EXPECT_GT(retval, 0, "proc_pidinfo(PROC_PIDIPCTABLEINFO) returned %d", retval); + T_EXPECT_EQ(retval, (int)sizeof(table_info2), "proc_pidinfo(PROC_PIDIPCTABLEINFO) table_size2 = %u, table_free2 = %u", + table_info2.table_size, table_info2.table_free); + + T_EXPECT_EQ(table_info.table_free, table_info2.table_free + 1, "Comparing the table_free values"); +} diff --git a/tests/proc_info_44873309.entitlements b/tests/proc_info_44873309.entitlements new file mode 100644 index 000000000..a333f4755 --- /dev/null +++ b/tests/proc_info_44873309.entitlements @@ -0,0 +1,8 @@ + + + + + com.apple.private.kernel.global-proc-info + + + diff --git a/tests/proc_info_list_kthreads.c b/tests/proc_info_list_kthreads.c index 8af5647fe..9bdba1c3e 100644 --- a/tests/proc_info_list_kthreads.c +++ b/tests/proc_info_list_kthreads.c @@ -14,6 +14,8 @@ #include #include +T_GLOBAL_META(T_META_RUN_CONCURRENTLY(true)); + #define MAX_TRIES 20 #define EXTRA_THREADS 15 diff --git a/tests/proc_info_udata.c b/tests/proc_info_udata.c index 4482e275c..e482a848d 100644 --- a/tests/proc_info_udata.c +++ b/tests/proc_info_udata.c @@ -4,6 +4,8 @@ #include #include +T_GLOBAL_META(T_META_RUN_CONCURRENTLY(true)); + T_DECL(proc_udata_info, "Get and set a proc udata token"){ uint64_t token = mach_absolute_time(); proc_info_udata_t udata; diff --git a/tests/proc_uuid_policy_26567533.c b/tests/proc_uuid_policy_26567533.c index 470d5ca4e..887573c8e 100644 --- a/tests/proc_uuid_policy_26567533.c +++ b/tests/proc_uuid_policy_26567533.c @@ -3,6 +3,8 @@ #include #include +T_GLOBAL_META(T_META_RUN_CONCURRENTLY(true)); + #define NUM_PROC_UUID_POLICY_FLAGS 4 T_DECL(proc_uuid_policy_26567533, "Tests passing a NULL uuid in (uap->uuid).", T_META_LTEPHASE(LTE_POSTINIT)) diff --git a/tests/processor_info.c b/tests/processor_info.c index b8eebdb35..0b94fa9de 100644 --- a/tests/processor_info.c +++ b/tests/processor_info.c @@ -7,7 +7,8 @@ #include #include -T_GLOBAL_META(T_META_ASROOT(true)); +T_GLOBAL_META(T_META_ASROOT(true), + T_META_RUN_CONCURRENTLY(true)); T_DECL(processor_cpu_stat64, "ensure 64-bit processor statistics are reported correctly", @@ -46,7 +47,6 @@ T_DECL(processor_cpu_stat64, memset(prestats, 0xff, cpu_count * sizeof(*prestats)); for (int i = 0; i < (int)cpu_count; i++) { - printf("%d\n", PROCESSOR_CPU_STAT64_COUNT); mach_msg_type_number_t info_count = PROCESSOR_CPU_STAT64_COUNT; kr = processor_info(cpu_ports[i], PROCESSOR_CPU_STAT64, &host, (processor_info_t)&prestats[i], &info_count); diff --git a/tests/pwrite_avoid_sigxfsz_28581610.c b/tests/pwrite_avoid_sigxfsz_28581610.c index abeff39dc..63fd25d74 100644 --- a/tests/pwrite_avoid_sigxfsz_28581610.c +++ b/tests/pwrite_avoid_sigxfsz_28581610.c @@ -11,7 +11,9 @@ #include #include -#define TMP_FILE_PATH "/tmp/test_pwrite" +T_GLOBAL_META(T_META_RUN_CONCURRENTLY(true)); + +#define TMP_FILE_PATH "/tmp/test_pwrite_28581610" static sigjmp_buf xfsz_jmpbuf; diff --git a/tests/quiesce_counter.c b/tests/quiesce_counter.c index c10df2ad9..d864d8531 100644 --- a/tests/quiesce_counter.c +++ b/tests/quiesce_counter.c @@ -41,6 +41,8 @@ #include #include +T_GLOBAL_META(T_META_RUN_CONCURRENTLY(true)); + #ifndef _COMM_PAGE_CPU_QUIESCENT_COUNTER T_DECL(test_quiescent_counter, "Validate that _COMM_PAGE_CPU_QUIESCENT_COUNTER increments", diff --git a/tests/remote_time.c b/tests/remote_time.c index 1cb3f94cc..8a05c73c1 100644 --- a/tests/remote_time.c +++ b/tests/remote_time.c @@ -4,6 +4,9 @@ #include #include #include + +T_GLOBAL_META(T_META_RUN_CONCURRENTLY(true)); + extern uint64_t __mach_bridge_remote_time(uint64_t); T_DECL(remote_time_syscall, "test mach_bridge_remote_time syscall", diff --git a/tests/restart.c b/tests/restart.c new file mode 100644 index 000000000..e0ea5fdbe --- /dev/null +++ b/tests/restart.c @@ -0,0 +1,149 @@ +#include +#include +#include +#include +#include +#include +#include +#include + +T_GLOBAL_META(T_META_RUN_CONCURRENTLY(true)); + +extern task_restartable_range_t range; +extern void restartable_function(int *); +static int step = 0; + +#if defined(__x86_64__) +__asm__(" .align 4\n" + " .text\n" + " .private_extern _restartable_function\n" + "_restartable_function:\n" + // this should use $arg1 but I don't know intel calling conventions + // so the argument to restartable_function() is actually ignored + // as we know what it is anyway, and Intel PC-relative addressing, + // unlike ARM, is pretty readable + " incl _step(%rip)\n" + "1:\n" + " pause\n" + " jmp 1b\n" + "LExit_restartable_function:\n" + " ret\n"); +#elif defined(__arm64__) +__asm__(" .align 4\n" + " .text\n" + " .private_extern _restartable_function\n" + "_restartable_function:\n" + " ldr x11, [x0]\n" + " add x11, x11, #1\n" + " str x11, [x0]\n" + "1:\n" + " b 1b\n" + "LExit_restartable_function:\n" + " ret\n"); +#elif defined(__arm__) +__asm__(" .align 4\n" + " .text\n" + " .thumb\n" + " .private_extern _restartable_function\n" + " .thumb_func\n" + "_restartable_function:\n" + "0:\n" + " ldr r12, [r0]\n" + " add r12, r12, #1\n" + " str r12, [r0]\n" + "1:\n" + " b 1b\n" + "LExit_restartable_function:\n" + " bx lr\n"); +#elif defined(__i386__) +#define SKIP_TEST 1 +#else +#error Architecture unsupported +#endif + +#ifndef SKIP_TEST +__asm__(" .align 4\n" + " .data\n" + " .private_extern _range\n" + "_range:\n" +#if __LP64__ + " .quad _restartable_function\n" +#else + " .long _restartable_function\n" + " .long 0\n" +#endif + " .short LExit_restartable_function - _restartable_function\n" + " .short LExit_restartable_function - _restartable_function\n" + " .long 0\n"); +#endif + +static void +noop_signal(int signo __unused) +{ +} + +static void * +task_restartable_ranges_thread(void *_ctx) +{ + int *stepp = _ctx; + restartable_function(stepp); // increments step + T_PASS("was successfully restarted\n"); + (*stepp)++; + return NULL; +} + +static void +wait_for_step(int which) +{ + for (int i = 0; step != which && i < 10; i++) { + usleep(100000); + } +} + +T_DECL(task_restartable_ranges, "test task_restartable_ranges") +{ +#ifdef SKIP_TEST + T_SKIP("Not supported"); +#else + kern_return_t kr; + pthread_t th; + int rc; + + signal(SIGUSR1, noop_signal); + + kr = task_restartable_ranges_register(mach_task_self(), &range, 1); + T_ASSERT_MACH_SUCCESS(kr, "task_restartable_ranges_register"); + + { + rc = pthread_create(&th, NULL, &task_restartable_ranges_thread, &step); + T_ASSERT_POSIX_SUCCESS(rc, "pthread_create"); + + wait_for_step(1); + T_ASSERT_EQ(step, 1, "The thread started (sync)"); + + kr = task_restartable_ranges_synchronize(mach_task_self()); + T_ASSERT_MACH_SUCCESS(kr, "task_restartable_ranges_synchronize"); + + T_LOG("wait for the function to be restarted (sync)"); + wait_for_step(2); + T_ASSERT_EQ(step, 2, "The thread exited (sync)"); + pthread_join(th, NULL); + } + + { + rc = pthread_create(&th, NULL, &task_restartable_ranges_thread, &step); + T_ASSERT_POSIX_SUCCESS(rc, "pthread_create"); + + wait_for_step(3); + T_ASSERT_EQ(step, 3, "The thread started (signal)"); + + rc = pthread_kill(th, SIGUSR1); + T_ASSERT_POSIX_SUCCESS(rc, "pthread_kill"); + + T_LOG("wait for the function to be restarted (signal)"); + wait_for_step(4); + T_ASSERT_EQ(step, 4, "The thread exited (signal)"); + pthread_join(th, NULL); + } +#endif +} diff --git a/tests/settimeofday_29193041.c b/tests/settimeofday_29193041.c index fe04a2ec5..5acfb74ee 100644 --- a/tests/settimeofday_29193041.c +++ b/tests/settimeofday_29193041.c @@ -30,10 +30,6 @@ T_DECL(settime_32089962_not_entitled_root, struct timeval adj_time; struct timex ntptime; - if (geteuid() != 0) { - T_SKIP("settimeofday_root_29193041 test requires root privileges to run."); - } - /* test settimeofday */ T_QUIET; T_ASSERT_POSIX_ZERO(gettimeofday(&settimeofdaytime, NULL), NULL); T_ASSERT_POSIX_ZERO(settimeofday(&settimeofdaytime, NULL), NULL); @@ -67,7 +63,7 @@ T_DECL(settime_32089962_not_entitled_not_root, T_QUIET; T_ASSERT_POSIX_ZERO(gettimeofday(&settimeofdaytime, NULL), NULL); /* test settimeofday */ -#if TARGET_OS_EMBEDDED +#if (TARGET_OS_IPHONE && !TARGET_OS_SIMULATOR) T_ASSERT_POSIX_ZERO(settimeofday(&settimeofdaytime, NULL), NULL); #else res = settimeofday(&settimeofdaytime, NULL); @@ -95,10 +91,6 @@ T_DECL(settimeofday_29193041_not_entitled_root, struct timeval time; long new_time; - if (geteuid() != 0) { - T_SKIP("settimeofday_root_29193041 test requires root privileges to run."); - } - T_QUIET; T_ASSERT_POSIX_ZERO(gettimeofday(&time, NULL), NULL); /* increment the time of one day */ @@ -137,7 +129,7 @@ T_DECL(settimeofday_29193041_not_entitled_not_root, time.tv_sec = new_time; time.tv_usec = 0; -#if TARGET_OS_EMBEDDED +#if (TARGET_OS_IPHONE && !TARGET_OS_SIMULATOR) T_ASSERT_POSIX_ZERO(settimeofday(&time, NULL), NULL); #else int res = settimeofday(&time, NULL); @@ -146,7 +138,7 @@ T_DECL(settimeofday_29193041_not_entitled_not_root, T_QUIET; T_ASSERT_POSIX_ZERO(gettimeofday(&time, NULL), NULL); -#if TARGET_OS_EMBEDDED +#if (TARGET_OS_IPHONE && !TARGET_OS_SIMULATOR) /* expext to be past new_time */ T_EXPECT_GE_LONG(time.tv_sec, new_time, "Time successfully changed without root and without entitlement"); time.tv_sec -= DAY; diff --git a/tests/shared_cache_tests.c b/tests/shared_cache_tests.c new file mode 100644 index 000000000..572309d03 --- /dev/null +++ b/tests/shared_cache_tests.c @@ -0,0 +1,39 @@ +#include +#include +#include +#include +#include + +T_GLOBAL_META(T_META_NAMESPACE("xnu.shared_cache")); + +// Give the test up to two minutes because in the failure case we want to invoke update_dyld_shared_cache, which +// might take a bit to do. +T_DECL(present, "tests that the device is running with a shared cache", T_META_ASROOT(true), T_META_TIMEOUT(120)) +{ + size_t shared_cache_len = 0; + const void *cache_header = _dyld_get_shared_cache_range(&shared_cache_len); + if ((cache_header == NULL) || (shared_cache_len == 0)) { +#if TARGET_OS_OSX + char *tmp_dir = (char *) dt_tmpdir(); + T_QUIET; T_ASSERT_NOTNULL(tmp_dir, "darwintest created tmp dir"); + // Try to invoke update_dyld_shared_cache to gather information on why we're not running with a shared cache + char *shared_cache_update_cmd[] = { "/usr/bin/update_dyld_shared_cache", "-debug", "-cache_dir", tmp_dir, NULL }; + pid_t child1 = dt_launch_tool_pipe(shared_cache_update_cmd, false, NULL, ^bool (char *data, __unused size_t data_size, __unused dt_pipe_data_handler_context_t *context) { + T_LOG("%s", data); + return false; + }, ^bool (__unused char *data, __unused size_t data_size, __unused dt_pipe_data_handler_context_t *context) { + T_LOG("%s", data); + return false; + }, BUFFER_PATTERN_LINE, NULL); + + int status = 0; + dt_waitpid(child1, &status, NULL, 0); + + T_LOG("waitpid for %d returned with status %d", child1, WEXITSTATUS(status)); +#endif // TARGET_OS_OSX + T_ASSERT_NOTNULL(cache_header, "shared cache present"); + T_ASSERT_GT((int) shared_cache_len, 0, "shared cache has non-zero length"); + } + + T_PASS("shared cache appears to be present and valid"); +} diff --git a/tests/sigchld_return.c b/tests/sigchld_return.c index 25ec4f275..01080d3b3 100644 --- a/tests/sigchld_return.c +++ b/tests/sigchld_return.c @@ -6,6 +6,7 @@ #include +T_GLOBAL_META(T_META_RUN_CONCURRENTLY(true)); static int exitcode = 0x6789BEEF; int should_exit = 0; diff --git a/tests/sigcont_return.c b/tests/sigcont_return.c index 5e9258923..1788cad68 100644 --- a/tests/sigcont_return.c +++ b/tests/sigcont_return.c @@ -6,6 +6,8 @@ #include +T_GLOBAL_META(T_META_RUN_CONCURRENTLY(true)); + T_DECL(sigcontreturn, "checks that a call to waitid() for a child that is stopped and then continued returns correctly") { pid_t pid; diff --git a/tests/socket_bind_35685803.c b/tests/socket_bind_35685803.c index b1173c2e2..e5dca6a2a 100644 --- a/tests/socket_bind_35685803.c +++ b/tests/socket_bind_35685803.c @@ -11,6 +11,9 @@ #include #include #include +#include + +T_GLOBAL_META(T_META_RUN_CONCURRENTLY(true)); static bool debug; @@ -167,10 +170,14 @@ multithreaded_bind_test(bool v6, int socket_count) static void run_multithreaded_bind_test(int number_of_runs, bool v6, int socket_count) { +#if TARGET_OS_BRIDGE + T_SKIP("Not enough memory to handle this test"); +#else /* TARGET_OS_BRIDGE */ for (int i = 0; i < number_of_runs; i++) { multithreaded_bind_test(v6, socket_count); } T_PASS("multithreaded_bind_test %s", v6 ? "IPv6" : "IPv4"); +#endif /* TARGET_OS_BRIDGE */ } T_DECL(socket_bind_35685803, diff --git a/tests/socket_poll_close_25786011.c b/tests/socket_poll_close_25786011.c index 5454e80b1..92993534c 100644 --- a/tests/socket_poll_close_25786011.c +++ b/tests/socket_poll_close_25786011.c @@ -3,6 +3,8 @@ #include #include +T_GLOBAL_META(T_META_RUN_CONCURRENTLY(true)); + T_DECL(socket_poll_close_25786011, "Tests an invalid poll call to a socket and then calling close.", T_META_LTEPHASE(LTE_POSTINIT)) { int my_socket, ret; diff --git a/tests/stackshot.m b/tests/stackshot.m deleted file mode 100644 index 7aef17c1d..000000000 --- a/tests/stackshot.m +++ /dev/null @@ -1,1022 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -/* - * mirrors the dyld_cache_header struct defined in dyld_cache_format.h from dyld source code - * TODO: remove once rdar://42361850 is in the build - */ -struct dyld_cache_header -{ - char magic[16]; // e.g. "dyld_v0 i386" - uint32_t mappingOffset; // file offset to first dyld_cache_mapping_info - uint32_t mappingCount; // number of dyld_cache_mapping_info entries - uint32_t imagesOffset; // file offset to first dyld_cache_image_info - uint32_t imagesCount; // number of dyld_cache_image_info entries - uint64_t dyldBaseAddress; // base address of dyld when cache was built - uint64_t codeSignatureOffset; // file offset of code signature blob - uint64_t codeSignatureSize; // size of code signature blob (zero means to end of file) - uint64_t slideInfoOffset; // file offset of kernel slid info - uint64_t slideInfoSize; // size of kernel slid info - uint64_t localSymbolsOffset; // file offset of where local symbols are stored - uint64_t localSymbolsSize; // size of local symbols information - uint8_t uuid[16]; // unique value for each shared cache file - uint64_t cacheType; // 0 for development, 1 for production - uint32_t branchPoolsOffset; // file offset to table of uint64_t pool addresses - uint32_t branchPoolsCount; // number of uint64_t entries - uint64_t accelerateInfoAddr; // (unslid) address of optimization info - uint64_t accelerateInfoSize; // size of optimization info - uint64_t imagesTextOffset; // file offset to first dyld_cache_image_text_info - uint64_t imagesTextCount; // number of dyld_cache_image_text_info entries - uint64_t dylibsImageGroupAddr; // (unslid) address of ImageGroup for dylibs in this cache - uint64_t dylibsImageGroupSize; // size of ImageGroup for dylibs in this cache - uint64_t otherImageGroupAddr; // (unslid) address of ImageGroup for other OS dylibs - uint64_t otherImageGroupSize; // size of oImageGroup for other OS dylibs - uint64_t progClosuresAddr; // (unslid) address of list of program launch closures - uint64_t progClosuresSize; // size of list of program launch closures - uint64_t progClosuresTrieAddr; // (unslid) address of trie of indexes into program launch closures - uint64_t progClosuresTrieSize; // size of trie of indexes into program launch closures - uint32_t platform; // platform number (macOS=1, etc) - uint32_t formatVersion : 8, // dyld3::closure::kFormatVersion - dylibsExpectedOnDisk : 1, // dyld should expect the dylib exists on disk and to compare inode/mtime to see if cache is valid - simulator : 1, // for simulator of specified platform - locallyBuiltCache : 1, // 0 for B&I built cache, 1 for locally built cache - padding : 21; // TBD -}; - -T_GLOBAL_META( - T_META_NAMESPACE("xnu.stackshot"), - T_META_CHECK_LEAKS(false), - T_META_ASROOT(true) - ); - -static const char *current_process_name(void); -static void verify_stackshot_sharedcache_layout(struct dyld_uuid_info_64 *uuids, uint32_t uuid_count); -static void parse_stackshot(uint64_t stackshot_parsing_flags, void *ssbuf, size_t sslen, int child_pid); -static void parse_thread_group_stackshot(void **sbuf, size_t sslen); -static uint64_t stackshot_timestamp(void *ssbuf, size_t sslen); -static void initialize_thread(void); - -#define DEFAULT_STACKSHOT_BUFFER_SIZE (1024 * 1024) -#define MAX_STACKSHOT_BUFFER_SIZE (6 * 1024 * 1024) - -/* bit flags for parse_stackshot */ -#define PARSE_STACKSHOT_DELTA 0x1 -#define PARSE_STACKSHOT_ZOMBIE 0x2 -#define PARSE_STACKSHOT_SHAREDCACHE_LAYOUT 0x4 - -T_DECL(microstackshots, "test the microstackshot syscall") -{ - void *buf = NULL; - unsigned int size = DEFAULT_STACKSHOT_BUFFER_SIZE; - - while (1) { - buf = malloc(size); - T_QUIET; T_ASSERT_NOTNULL(buf, "allocated stackshot buffer"); - -#pragma clang diagnostic push -#pragma clang diagnostic ignored "-Wdeprecated-declarations" - int len = syscall(SYS_microstackshot, buf, size, - STACKSHOT_GET_MICROSTACKSHOT); -#pragma clang diagnostic pop - if (len == ENOSYS) { - T_SKIP("microstackshot syscall failed, likely not compiled with CONFIG_TELEMETRY"); - } - if (len == -1 && errno == ENOSPC) { - /* syscall failed because buffer wasn't large enough, try again */ - free(buf); - buf = NULL; - size *= 2; - T_ASSERT_LE(size, (unsigned int)MAX_STACKSHOT_BUFFER_SIZE, - "growing stackshot buffer to sane size"); - continue; - } - T_ASSERT_POSIX_SUCCESS(len, "called microstackshot syscall"); - break; - } - - T_EXPECT_EQ(*(uint32_t *)buf, - (uint32_t)STACKSHOT_MICRO_SNAPSHOT_MAGIC, - "magic value for microstackshot matches"); - - free(buf); -} - -struct scenario { - const char *name; - uint32_t flags; - bool should_fail; - bool maybe_unsupported; - pid_t target_pid; - uint64_t since_timestamp; - uint32_t size_hint; - dt_stat_time_t timer; -}; - -static void -quiet(struct scenario *scenario) -{ - if (scenario->timer) { - T_QUIET; - } -} - -static void -take_stackshot(struct scenario *scenario, void (^cb)(void *buf, size_t size)) -{ - initialize_thread(); - - void *config = stackshot_config_create(); - quiet(scenario); - T_ASSERT_NOTNULL(config, "created stackshot config"); - - int ret = stackshot_config_set_flags(config, scenario->flags); - quiet(scenario); - T_ASSERT_POSIX_ZERO(ret, "set flags %#x on stackshot config", scenario->flags); - - if (scenario->size_hint > 0) { - ret = stackshot_config_set_size_hint(config, scenario->size_hint); - quiet(scenario); - T_ASSERT_POSIX_ZERO(ret, "set size hint %" PRIu32 " on stackshot config", - scenario->size_hint); - } - - if (scenario->target_pid > 0) { - ret = stackshot_config_set_pid(config, scenario->target_pid); - quiet(scenario); - T_ASSERT_POSIX_ZERO(ret, "set target pid %d on stackshot config", - scenario->target_pid); - } - - if (scenario->since_timestamp > 0) { - ret = stackshot_config_set_delta_timestamp(config, scenario->since_timestamp); - quiet(scenario); - T_ASSERT_POSIX_ZERO(ret, "set since timestamp %" PRIu64 " on stackshot config", - scenario->since_timestamp); - } - - int retries_remaining = 5; - -retry: ; - uint64_t start_time = mach_absolute_time(); - ret = stackshot_capture_with_config(config); - uint64_t end_time = mach_absolute_time(); - - if (scenario->should_fail) { - T_EXPECTFAIL; - T_ASSERT_POSIX_ZERO(ret, "called stackshot_capture_with_config"); - return; - } - - if (ret == EBUSY || ret == ETIMEDOUT) { - if (retries_remaining > 0) { - if (!scenario->timer) { - T_LOG("stackshot_capture_with_config failed with %s (%d), retrying", - strerror(ret), ret); - } - - retries_remaining--; - goto retry; - } else { - T_ASSERT_POSIX_ZERO(ret, - "called stackshot_capture_with_config (no retries remaining)"); - } - } else if ((ret == ENOTSUP) && scenario->maybe_unsupported) { - T_SKIP("kernel indicated this stackshot configuration is not supported"); - } else { - quiet(scenario); - T_ASSERT_POSIX_ZERO(ret, "called stackshot_capture_with_config"); - } - - if (scenario->timer) { - dt_stat_mach_time_add(scenario->timer, end_time - start_time); - } - void *buf = stackshot_config_get_stackshot_buffer(config); - size_t size = stackshot_config_get_stackshot_size(config); - if (scenario->name) { - char sspath[MAXPATHLEN]; - strlcpy(sspath, scenario->name, sizeof(sspath)); - strlcat(sspath, ".kcdata", sizeof(sspath)); - T_QUIET; T_ASSERT_POSIX_ZERO(dt_resultfile(sspath, sizeof(sspath)), - "create result file path"); - - T_LOG("writing stackshot to %s", sspath); - - FILE *f = fopen(sspath, "w"); - T_WITH_ERRNO; T_QUIET; T_ASSERT_NOTNULL(f, - "open stackshot output file"); - - size_t written = fwrite(buf, size, 1, f); - T_QUIET; T_ASSERT_POSIX_SUCCESS(written, "wrote stackshot to file"); - - fclose(f); - } - cb(buf, size); - - ret = stackshot_config_dealloc(config); - T_QUIET; T_EXPECT_POSIX_ZERO(ret, "deallocated stackshot config"); -} - -T_DECL(kcdata, "test that kcdata stackshots can be taken and parsed") -{ - struct scenario scenario = { - .name = "kcdata", - .flags = (STACKSHOT_SAVE_LOADINFO | STACKSHOT_GET_GLOBAL_MEM_STATS | - STACKSHOT_SAVE_IMP_DONATION_PIDS | STACKSHOT_KCDATA_FORMAT), - }; - - T_LOG("taking kcdata stackshot"); - take_stackshot(&scenario, ^(void *ssbuf, size_t sslen) { - parse_stackshot(0, ssbuf, sslen, -1); - }); -} - -T_DECL(kcdata_faulting, "test that kcdata stackshots while faulting can be taken and parsed") -{ - struct scenario scenario = { - .name = "faulting", - .flags = (STACKSHOT_SAVE_LOADINFO | STACKSHOT_GET_GLOBAL_MEM_STATS - | STACKSHOT_SAVE_IMP_DONATION_PIDS | STACKSHOT_KCDATA_FORMAT - | STACKSHOT_ENABLE_BT_FAULTING | STACKSHOT_ENABLE_UUID_FAULTING), - }; - - T_LOG("taking faulting stackshot"); - take_stackshot(&scenario, ^(void *ssbuf, size_t sslen) { - parse_stackshot(0, ssbuf, sslen, -1); - }); -} - -T_DECL(bad_flags, "test a poorly-formed stackshot syscall") -{ - struct scenario scenario = { - .flags = STACKSHOT_SAVE_IN_KERNEL_BUFFER /* not allowed from user space */, - .should_fail = true, - }; - - T_LOG("attempting to take stackshot with kernel-only flag"); - take_stackshot(&scenario, ^(__unused void *ssbuf, __unused size_t sslen) { - T_ASSERT_FAIL("stackshot data callback called"); - }); -} - -T_DECL(delta, "test delta stackshots") -{ - struct scenario scenario = { - .name = "delta", - .flags = (STACKSHOT_SAVE_LOADINFO | STACKSHOT_GET_GLOBAL_MEM_STATS - | STACKSHOT_SAVE_IMP_DONATION_PIDS | STACKSHOT_KCDATA_FORMAT), - }; - - T_LOG("taking full stackshot"); - take_stackshot(&scenario, ^(void *ssbuf, size_t sslen) { - uint64_t stackshot_time = stackshot_timestamp(ssbuf, sslen); - - T_LOG("taking delta stackshot since time %" PRIu64, stackshot_time); - - parse_stackshot(0, ssbuf, sslen, -1); - - struct scenario delta_scenario = { - .flags = (STACKSHOT_SAVE_LOADINFO | STACKSHOT_GET_GLOBAL_MEM_STATS - | STACKSHOT_SAVE_IMP_DONATION_PIDS | STACKSHOT_KCDATA_FORMAT - | STACKSHOT_COLLECT_DELTA_SNAPSHOT), - .since_timestamp = stackshot_time - }; - - take_stackshot(&delta_scenario, ^(void *dssbuf, size_t dsslen) { - parse_stackshot(PARSE_STACKSHOT_DELTA, dssbuf, dsslen, -1); - }); - }); -} - -T_DECL(shared_cache_layout, "test stackshot inclusion of shared cache layout") -{ - struct scenario scenario = { - .name = "shared_cache_layout", - .flags = (STACKSHOT_SAVE_LOADINFO | STACKSHOT_GET_GLOBAL_MEM_STATS - | STACKSHOT_SAVE_IMP_DONATION_PIDS | STACKSHOT_KCDATA_FORMAT | - STACKSHOT_COLLECT_SHAREDCACHE_LAYOUT), - }; - - T_LOG("taking stackshot with STACKSHOT_COLLECT_SHAREDCACHE_LAYOUT set"); - take_stackshot(&scenario, ^(void *ssbuf, size_t sslen) { - parse_stackshot(PARSE_STACKSHOT_SHAREDCACHE_LAYOUT, ssbuf, sslen, -1); - }); -} - -static void *stuck_sysctl_thread(void *arg) { - int val = 1; - dispatch_semaphore_t child_thread_started = *(dispatch_semaphore_t *)arg; - - dispatch_semaphore_signal(child_thread_started); - T_ASSERT_POSIX_SUCCESS(sysctlbyname("kern.wedge_thread", NULL, NULL, &val, sizeof(val)), "wedge child thread"); - - return NULL; -} - -T_HELPER_DECL(zombie_child, "child process to sample as a zombie") -{ - pthread_t pthread; - dispatch_semaphore_t child_thread_started = dispatch_semaphore_create(0); - T_QUIET; T_ASSERT_NOTNULL(child_thread_started, "zombie child thread semaphore"); - - /* spawn another thread to get stuck in the kernel, then call exit() to become a zombie */ - T_QUIET; T_ASSERT_POSIX_SUCCESS(pthread_create(&pthread, NULL, stuck_sysctl_thread, &child_thread_started), "pthread_create"); - - dispatch_semaphore_wait(child_thread_started, DISPATCH_TIME_FOREVER); - - /* sleep for a bit in the hope of ensuring that the other thread has called the sysctl before we signal the parent */ - usleep(100); - T_ASSERT_POSIX_SUCCESS(kill(getppid(), SIGUSR1), "signaled parent to take stackshot"); - - exit(0); -} - -T_DECL(zombie, "tests a stackshot of a zombie task with a thread stuck in the kernel") -{ - char path[PATH_MAX]; - uint32_t path_size = sizeof(path); - T_ASSERT_POSIX_ZERO(_NSGetExecutablePath(path, &path_size), "_NSGetExecutablePath"); - char *args[] = { path, "-n", "zombie_child", NULL }; - - dispatch_source_t child_sig_src; - dispatch_semaphore_t child_ready_sem = dispatch_semaphore_create(0); - T_QUIET; T_ASSERT_NOTNULL(child_ready_sem, "zombie child semaphore"); - - dispatch_queue_t signal_processing_q = dispatch_queue_create("signal processing queue", NULL); - T_QUIET; T_ASSERT_NOTNULL(child_ready_sem, "signal processing queue"); - - pid_t pid; - - T_LOG("spawning a child"); - - signal(SIGUSR1, SIG_IGN); - child_sig_src = dispatch_source_create(DISPATCH_SOURCE_TYPE_SIGNAL, SIGUSR1, 0, signal_processing_q); - T_QUIET; T_ASSERT_NOTNULL(child_sig_src, "dispatch_source_create (child_sig_src)"); - - dispatch_source_set_event_handler(child_sig_src, ^{ dispatch_semaphore_signal(child_ready_sem); }); - dispatch_activate(child_sig_src); - - int sp_ret = posix_spawn(&pid, args[0], NULL, NULL, args, NULL); - T_QUIET; T_ASSERT_POSIX_ZERO(sp_ret, "spawned process '%s' with PID %d", args[0], pid); - - dispatch_semaphore_wait(child_ready_sem, DISPATCH_TIME_FOREVER); - - T_LOG("received signal from child, capturing stackshot"); - - struct proc_bsdshortinfo bsdshortinfo; - int retval, iterations_to_wait = 10; - - while (iterations_to_wait > 0) { - retval = proc_pidinfo(pid, PROC_PIDT_SHORTBSDINFO, 0, &bsdshortinfo, sizeof(bsdshortinfo)); - if ((retval == 0) && errno == ESRCH) { - T_LOG("unable to find child using proc_pidinfo, assuming zombie"); - break; - } - - T_QUIET; T_WITH_ERRNO; T_ASSERT_GT(retval, 0, "proc_pidinfo(PROC_PIDT_SHORTBSDINFO) returned a value > 0"); - T_QUIET; T_ASSERT_EQ(retval, (int)sizeof(bsdshortinfo), "proc_pidinfo call for PROC_PIDT_SHORTBSDINFO returned expected size"); - - if (bsdshortinfo.pbsi_flags & PROC_FLAG_INEXIT) { - T_LOG("child proc info marked as in exit"); - break; - } - - iterations_to_wait--; - if (iterations_to_wait == 0) { - /* - * This will mark the test as failed but let it continue so we - * don't leave a process stuck in the kernel. - */ - T_FAIL("unable to discover that child is marked as exiting"); - } - - /* Give the child a few more seconds to make it to exit */ - sleep(5); - } - - /* Give the child some more time to make it through exit */ - sleep(10); - - struct scenario scenario = { - .name = "zombie", - .flags = (STACKSHOT_SAVE_LOADINFO | STACKSHOT_GET_GLOBAL_MEM_STATS - | STACKSHOT_SAVE_IMP_DONATION_PIDS | STACKSHOT_KCDATA_FORMAT), - }; - - take_stackshot(&scenario, ^( void *ssbuf, size_t sslen) { - /* First unwedge the child so we can reap it */ - int val = 1, status; - T_ASSERT_POSIX_SUCCESS(sysctlbyname("kern.unwedge_thread", NULL, NULL, &val, sizeof(val)), "unwedge child"); - - T_QUIET; T_ASSERT_POSIX_SUCCESS(waitpid(pid, &status, 0), "waitpid on zombie child"); - - parse_stackshot(PARSE_STACKSHOT_ZOMBIE, ssbuf, sslen, pid); - }); -} - -static void -expect_instrs_cycles_in_stackshot(void *ssbuf, size_t sslen) -{ - kcdata_iter_t iter = kcdata_iter(ssbuf, sslen); - - bool in_task = false; - bool in_thread = false; - bool saw_instrs_cycles = false; - iter = kcdata_iter_next(iter); - - KCDATA_ITER_FOREACH(iter) { - switch (kcdata_iter_type(iter)) { - case KCDATA_TYPE_CONTAINER_BEGIN: - switch (kcdata_iter_container_type(iter)) { - case STACKSHOT_KCCONTAINER_TASK: - in_task = true; - saw_instrs_cycles = false; - break; - - case STACKSHOT_KCCONTAINER_THREAD: - in_thread = true; - saw_instrs_cycles = false; - break; - - default: - break; - } - break; - - case STACKSHOT_KCTYPE_INSTRS_CYCLES: - saw_instrs_cycles = true; - break; - - case KCDATA_TYPE_CONTAINER_END: - if (in_thread) { - T_QUIET; T_EXPECT_TRUE(saw_instrs_cycles, - "saw instructions and cycles in thread"); - in_thread = false; - } else if (in_task) { - T_QUIET; T_EXPECT_TRUE(saw_instrs_cycles, - "saw instructions and cycles in task"); - in_task = false; - } - - default: - break; - } - } -} - -static void -skip_if_monotonic_unsupported(void) -{ - int supported = 0; - size_t supported_size = sizeof(supported); - int ret = sysctlbyname("kern.monotonic.supported", &supported, - &supported_size, 0, 0); - if (ret < 0 || !supported) { - T_SKIP("monotonic is unsupported"); - } -} - -T_DECL(instrs_cycles, "test a getting instructions and cycles in stackshot") -{ - skip_if_monotonic_unsupported(); - - struct scenario scenario = { - .name = "instrs-cycles", - .flags = (STACKSHOT_SAVE_LOADINFO | STACKSHOT_INSTRS_CYCLES - | STACKSHOT_KCDATA_FORMAT), - }; - - T_LOG("attempting to take stackshot with instructions and cycles"); - take_stackshot(&scenario, ^(void *ssbuf, size_t sslen) { - parse_stackshot(0, ssbuf, sslen, -1); - expect_instrs_cycles_in_stackshot(ssbuf, sslen); - }); -} - -T_DECL(delta_instrs_cycles, - "test delta stackshots with instructions and cycles") -{ - skip_if_monotonic_unsupported(); - - struct scenario scenario = { - .name = "delta-instrs-cycles", - .flags = (STACKSHOT_SAVE_LOADINFO | STACKSHOT_INSTRS_CYCLES - | STACKSHOT_KCDATA_FORMAT), - }; - - T_LOG("taking full stackshot"); - take_stackshot(&scenario, ^(void *ssbuf, size_t sslen) { - uint64_t stackshot_time = stackshot_timestamp(ssbuf, sslen); - - T_LOG("taking delta stackshot since time %" PRIu64, stackshot_time); - - parse_stackshot(0, ssbuf, sslen, -1); - expect_instrs_cycles_in_stackshot(ssbuf, sslen); - - struct scenario delta_scenario = { - .name = "delta-instrs-cycles-next", - .flags = (STACKSHOT_SAVE_LOADINFO | STACKSHOT_INSTRS_CYCLES - | STACKSHOT_KCDATA_FORMAT - | STACKSHOT_COLLECT_DELTA_SNAPSHOT), - .since_timestamp = stackshot_time, - }; - - take_stackshot(&delta_scenario, ^(void *dssbuf, size_t dsslen) { - parse_stackshot(PARSE_STACKSHOT_DELTA, dssbuf, dsslen, -1); - expect_instrs_cycles_in_stackshot(dssbuf, dsslen); - }); - }); -} - -static void -check_thread_groups_supported() -{ - int err; - int supported = 0; - size_t supported_size = sizeof(supported); - err = sysctlbyname("kern.thread_groups_supported", &supported, &supported_size, NULL, 0); - - if (err || !supported) - T_SKIP("thread groups not supported on this system"); -} - -T_DECL(thread_groups, "test getting thread groups in stackshot") -{ - check_thread_groups_supported(); - - struct scenario scenario = { - .name = "thread-groups", - .flags = (STACKSHOT_SAVE_LOADINFO | STACKSHOT_THREAD_GROUP - | STACKSHOT_KCDATA_FORMAT), - }; - - T_LOG("attempting to take stackshot with thread group flag"); - take_stackshot(&scenario, ^(void *ssbuf, size_t sslen) { - parse_thread_group_stackshot(ssbuf, sslen); - }); -} - -static void -parse_page_table_asid_stackshot(void **ssbuf, size_t sslen) -{ - bool seen_asid = false; - bool seen_page_table_snapshot = false; - kcdata_iter_t iter = kcdata_iter(ssbuf, sslen); - T_ASSERT_EQ(kcdata_iter_type(iter), KCDATA_BUFFER_BEGIN_STACKSHOT, - "buffer provided is a stackshot"); - - iter = kcdata_iter_next(iter); - KCDATA_ITER_FOREACH(iter) { - switch (kcdata_iter_type(iter)) { - case KCDATA_TYPE_ARRAY: { - T_QUIET; - T_ASSERT_TRUE(kcdata_iter_array_valid(iter), - "checked that array is valid"); - - if (kcdata_iter_array_elem_type(iter) != STACKSHOT_KCTYPE_PAGE_TABLES) { - continue; - } - - T_ASSERT_FALSE(seen_page_table_snapshot, "check that we haven't yet seen a page table snapshot"); - seen_page_table_snapshot = true; - - T_ASSERT_EQ((size_t) kcdata_iter_array_elem_size(iter), sizeof(uint64_t), - "check that each element of the pagetable dump is the expected size"); - - uint64_t *pt_array = kcdata_iter_payload(iter); - uint32_t elem_count = kcdata_iter_array_elem_count(iter); - uint32_t j; - bool nonzero_tte = false; - for (j = 0; j < elem_count;) { - T_QUIET; T_ASSERT_LE(j + 4, elem_count, "check for valid page table segment header"); - uint64_t pa = pt_array[j]; - uint64_t num_entries = pt_array[j + 1]; - uint64_t start_va = pt_array[j + 2]; - uint64_t end_va = pt_array[j + 3]; - - T_QUIET; T_ASSERT_NE(pa, (uint64_t) 0, "check that the pagetable physical address is non-zero"); - T_QUIET; T_ASSERT_EQ(pa % (num_entries * sizeof(uint64_t)), (uint64_t) 0, "check that the pagetable physical address is correctly aligned"); - T_QUIET; T_ASSERT_NE(num_entries, (uint64_t) 0, "check that a pagetable region has more than 0 entries"); - T_QUIET; T_ASSERT_LE(j + 4 + num_entries, (uint64_t) elem_count, "check for sufficient space in page table array"); - T_QUIET; T_ASSERT_GT(end_va, start_va, "check for valid VA bounds in page table segment header"); - - for (uint32_t k = j + 4; k < (j + 4 + num_entries); ++k) { - if (pt_array[k] != 0) { - nonzero_tte = true; - T_QUIET; T_ASSERT_EQ((pt_array[k] >> 48) & 0xf, (uint64_t) 0, "check that bits[48:51] of arm64 TTE are clear"); - // L0-L2 table and non-compressed L3 block entries should always have bit 1 set; assumes L0-L2 blocks will not be used outside the kernel - bool table = ((pt_array[k] & 0x2) != 0); - if (table) { - T_QUIET; T_ASSERT_NE(pt_array[k] & ((1ULL << 48) - 1) & ~((1ULL << 12) - 1), (uint64_t) 0, "check that arm64 TTE physical address is non-zero"); - } else { // should be a compressed PTE - T_QUIET; T_ASSERT_NE(pt_array[k] & 0xC000000000000000ULL, (uint64_t) 0, "check that compressed PTE has at least one of bits [63:62] set"); - T_QUIET; T_ASSERT_EQ(pt_array[k] & ~0xC000000000000000ULL, (uint64_t) 0, "check that compressed PTE has no other bits besides [63:62] set"); - } - } - } - - j += (4 + num_entries); - } - T_ASSERT_TRUE(nonzero_tte, "check that we saw at least one non-empty TTE"); - T_ASSERT_EQ(j, elem_count, "check that page table dump size matches extent of last header"); - break; - } - case STACKSHOT_KCTYPE_ASID: { - T_ASSERT_FALSE(seen_asid, "check that we haven't yet seen an ASID"); - seen_asid = true; - } - } - } - T_ASSERT_TRUE(seen_page_table_snapshot, "check that we have seen a page table snapshot"); - T_ASSERT_TRUE(seen_asid, "check that we have seen an ASID"); -} - -T_DECL(dump_page_tables, "test stackshot page table dumping support") -{ - struct scenario scenario = { - .name = "asid-page-tables", - .flags = (STACKSHOT_KCDATA_FORMAT | STACKSHOT_ASID | STACKSHOT_PAGE_TABLES), - .size_hint = (1ULL << 23), // 8 MB - .target_pid = getpid(), - .maybe_unsupported = true, - }; - - T_LOG("attempting to take stackshot with ASID and page table flags"); - take_stackshot(&scenario, ^(void *ssbuf, size_t sslen) { - parse_page_table_asid_stackshot(ssbuf, sslen); - }); -} - -#pragma mark performance tests - -#define SHOULD_REUSE_SIZE_HINT 0x01 -#define SHOULD_USE_DELTA 0x02 -#define SHOULD_TARGET_SELF 0x04 - -static void -stackshot_perf(unsigned int options) -{ - struct scenario scenario = { - .flags = (STACKSHOT_SAVE_LOADINFO | STACKSHOT_GET_GLOBAL_MEM_STATS - | STACKSHOT_SAVE_IMP_DONATION_PIDS | STACKSHOT_KCDATA_FORMAT), - }; - - dt_stat_t size = dt_stat_create("bytes", "size"); - dt_stat_time_t duration = dt_stat_time_create("duration"); - scenario.timer = duration; - - if (options & SHOULD_TARGET_SELF) { - scenario.target_pid = getpid(); - } - - while (!dt_stat_stable(duration) || !dt_stat_stable(size)) { - __block uint64_t last_time = 0; - __block uint32_t size_hint = 0; - take_stackshot(&scenario, ^(void *ssbuf, size_t sslen) { - dt_stat_add(size, (double)sslen); - last_time = stackshot_timestamp(ssbuf, sslen); - size_hint = (uint32_t)sslen; - }); - if (options & SHOULD_USE_DELTA) { - scenario.since_timestamp = last_time; - scenario.flags |= STACKSHOT_COLLECT_DELTA_SNAPSHOT; - } - if (options & SHOULD_REUSE_SIZE_HINT) { - scenario.size_hint = size_hint; - } - } - - dt_stat_finalize(duration); - dt_stat_finalize(size); -} - -T_DECL(perf_no_size_hint, "test stackshot performance with no size hint", - T_META_TAG_PERF) -{ - stackshot_perf(0); -} - -T_DECL(perf_size_hint, "test stackshot performance with size hint", - T_META_TAG_PERF) -{ - stackshot_perf(SHOULD_REUSE_SIZE_HINT); -} - -T_DECL(perf_process, "test stackshot performance targeted at process", - T_META_TAG_PERF) -{ - stackshot_perf(SHOULD_REUSE_SIZE_HINT | SHOULD_TARGET_SELF); -} - -T_DECL(perf_delta, "test delta stackshot performance", - T_META_TAG_PERF) -{ - stackshot_perf(SHOULD_REUSE_SIZE_HINT | SHOULD_USE_DELTA); -} - -T_DECL(perf_delta_process, "test delta stackshot performance targeted at a process", - T_META_TAG_PERF) -{ - stackshot_perf(SHOULD_REUSE_SIZE_HINT | SHOULD_USE_DELTA | SHOULD_TARGET_SELF); -} - -static uint64_t -stackshot_timestamp(void *ssbuf, size_t sslen) -{ - kcdata_iter_t iter = kcdata_iter(ssbuf, sslen); - - uint32_t type = kcdata_iter_type(iter); - if (type != KCDATA_BUFFER_BEGIN_STACKSHOT && type != KCDATA_BUFFER_BEGIN_DELTA_STACKSHOT) { - T_ASSERT_FAIL("invalid kcdata type %u", kcdata_iter_type(iter)); - } - - iter = kcdata_iter_find_type(iter, KCDATA_TYPE_MACH_ABSOLUTE_TIME); - T_QUIET; - T_ASSERT_TRUE(kcdata_iter_valid(iter), "timestamp found in stackshot"); - - return *(uint64_t *)kcdata_iter_payload(iter); -} - -#define TEST_THREAD_NAME "stackshot_test_thread" - -static void -parse_thread_group_stackshot(void **ssbuf, size_t sslen) -{ - bool seen_thread_group_snapshot = false; - kcdata_iter_t iter = kcdata_iter(ssbuf, sslen); - T_ASSERT_EQ(kcdata_iter_type(iter), KCDATA_BUFFER_BEGIN_STACKSHOT, - "buffer provided is a stackshot"); - - NSMutableSet *thread_groups = [[NSMutableSet alloc] init]; - - iter = kcdata_iter_next(iter); - KCDATA_ITER_FOREACH(iter) { - switch (kcdata_iter_type(iter)) { - case KCDATA_TYPE_ARRAY: { - T_QUIET; - T_ASSERT_TRUE(kcdata_iter_array_valid(iter), - "checked that array is valid"); - - if (kcdata_iter_array_elem_type(iter) != STACKSHOT_KCTYPE_THREAD_GROUP_SNAPSHOT) { - continue; - } - - seen_thread_group_snapshot = true; - - if (kcdata_iter_array_elem_size(iter) >= sizeof(struct thread_group_snapshot_v2)) { - struct thread_group_snapshot_v2 *tgs_array = kcdata_iter_payload(iter); - for (uint32_t j = 0; j < kcdata_iter_array_elem_count(iter); j++) { - struct thread_group_snapshot_v2 *tgs = tgs_array + j; - [thread_groups addObject:@(tgs->tgs_id)]; - } - - } - else { - struct thread_group_snapshot *tgs_array = kcdata_iter_payload(iter); - for (uint32_t j = 0; j < kcdata_iter_array_elem_count(iter); j++) { - struct thread_group_snapshot *tgs = tgs_array + j; - [thread_groups addObject:@(tgs->tgs_id)]; - } - } - break; - } - } - } - KCDATA_ITER_FOREACH(iter) { - NSError *error = nil; - - switch (kcdata_iter_type(iter)) { - - case KCDATA_TYPE_CONTAINER_BEGIN: { - T_QUIET; - T_ASSERT_TRUE(kcdata_iter_container_valid(iter), - "checked that container is valid"); - - if (kcdata_iter_container_type(iter) != STACKSHOT_KCCONTAINER_THREAD) { - break; - } - - NSDictionary *container = parseKCDataContainer(&iter, &error); - T_QUIET; T_ASSERT_NOTNULL(container, "parsed container from stackshot"); - T_QUIET; T_ASSERT_NULL(error, "error unset after parsing container"); - - int tg = [container[@"thread_snapshots"][@"thread_group"] intValue]; - - T_ASSERT_TRUE([thread_groups containsObject:@(tg)], "check that the thread group the thread is in exists"); - - break; - }; - - } - } - T_ASSERT_TRUE(seen_thread_group_snapshot, "check that we have seen a thread group snapshot"); -} - -static void -verify_stackshot_sharedcache_layout(struct dyld_uuid_info_64 *uuids, uint32_t uuid_count) -{ - uuid_t cur_shared_cache_uuid; - __block uint32_t lib_index = 0, libs_found = 0; - - _dyld_get_shared_cache_uuid(cur_shared_cache_uuid); - int result = dyld_shared_cache_iterate_text(cur_shared_cache_uuid, ^(const dyld_shared_cache_dylib_text_info* info) { - T_QUIET; T_ASSERT_LT(lib_index, uuid_count, "dyld_shared_cache_iterate_text exceeded number of libraries returned by kernel"); - - libs_found++; - struct dyld_uuid_info_64 *cur_stackshot_uuid_entry = &uuids[lib_index]; - T_QUIET; T_ASSERT_EQ(memcmp(info->dylibUuid, cur_stackshot_uuid_entry->imageUUID, sizeof(info->dylibUuid)), 0, - "dyld returned UUID doesn't match kernel returned UUID"); - T_QUIET; T_ASSERT_EQ(info->loadAddressUnslid, cur_stackshot_uuid_entry->imageLoadAddress, - "dyld returned load address doesn't match kernel returned load address"); - lib_index++; - }); - - T_ASSERT_EQ(result, 0, "iterate shared cache layout"); - T_ASSERT_EQ(libs_found, uuid_count, "dyld iterator returned same number of libraries as kernel"); - - T_LOG("verified %d libraries from dyld shared cache", libs_found); -} - -static void -parse_stackshot(uint64_t stackshot_parsing_flags, void *ssbuf, size_t sslen, int child_pid) -{ - bool delta = (stackshot_parsing_flags & PARSE_STACKSHOT_DELTA); - bool expect_zombie_child = (stackshot_parsing_flags & PARSE_STACKSHOT_ZOMBIE); - bool expect_shared_cache_layout = false; - bool expect_shared_cache_uuid = !delta; - bool found_zombie_child = false, found_shared_cache_layout = false, found_shared_cache_uuid = false; - - if (stackshot_parsing_flags & PARSE_STACKSHOT_SHAREDCACHE_LAYOUT) { - size_t shared_cache_length = 0; - const struct dyld_cache_header *cache_header = NULL; - cache_header = _dyld_get_shared_cache_range(&shared_cache_length); - T_QUIET; T_ASSERT_NOTNULL(cache_header, "current process running with shared cache"); - T_QUIET; T_ASSERT_GT(shared_cache_length, sizeof(struct _dyld_cache_header), "valid shared cache length populated by _dyld_get_shared_cache_range"); - - if (cache_header->locallyBuiltCache) { - T_LOG("device running with locally built shared cache, expect shared cache layout"); - expect_shared_cache_layout = true; - } else { - T_LOG("device running with B&I built shared-cache, no shared cache layout expected"); - } - } - - if (expect_zombie_child) { - T_QUIET; T_ASSERT_GT(child_pid, 0, "child pid greater than zero"); - } - - kcdata_iter_t iter = kcdata_iter(ssbuf, sslen); - if (delta) { - T_ASSERT_EQ(kcdata_iter_type(iter), KCDATA_BUFFER_BEGIN_DELTA_STACKSHOT, - "buffer provided is a delta stackshot"); - } else { - T_ASSERT_EQ(kcdata_iter_type(iter), KCDATA_BUFFER_BEGIN_STACKSHOT, - "buffer provided is a stackshot"); - } - - iter = kcdata_iter_next(iter); - KCDATA_ITER_FOREACH(iter) { - NSError *error = nil; - - switch (kcdata_iter_type(iter)) { - case KCDATA_TYPE_ARRAY: { - T_QUIET; - T_ASSERT_TRUE(kcdata_iter_array_valid(iter), - "checked that array is valid"); - - NSMutableDictionary *array = parseKCDataArray(iter, &error); - T_QUIET; T_ASSERT_NOTNULL(array, "parsed array from stackshot"); - T_QUIET; T_ASSERT_NULL(error, "error unset after parsing array"); - - if (kcdata_iter_array_elem_type(iter) == STACKSHOT_KCTYPE_SYS_SHAREDCACHE_LAYOUT) { - struct dyld_uuid_info_64 *shared_cache_uuids = kcdata_iter_payload(iter); - uint32_t uuid_count = kcdata_iter_array_elem_count(iter); - T_ASSERT_NOTNULL(shared_cache_uuids, "parsed shared cache layout array"); - T_ASSERT_GT(uuid_count, 0, "returned valid number of UUIDs from shared cache"); - verify_stackshot_sharedcache_layout(shared_cache_uuids, uuid_count); - found_shared_cache_layout = true; - } - - break; - } - - case KCDATA_TYPE_CONTAINER_BEGIN: { - T_QUIET; - T_ASSERT_TRUE(kcdata_iter_container_valid(iter), - "checked that container is valid"); - - if (kcdata_iter_container_type(iter) != STACKSHOT_KCCONTAINER_TASK) { - break; - } - - NSDictionary *container = parseKCDataContainer(&iter, &error); - T_QUIET; T_ASSERT_NOTNULL(container, "parsed container from stackshot"); - T_QUIET; T_ASSERT_NULL(error, "error unset after parsing container"); - - int pid = [container[@"task_snapshots"][@"task_snapshot"][@"ts_pid"] intValue]; - if (expect_zombie_child && (pid == child_pid)) { - found_zombie_child = true; - - uint64_t task_flags = [container[@"task_snapshots"][@"task_snapshot"][@"ts_ss_flags"] unsignedLongLongValue]; - T_ASSERT_TRUE((task_flags & kTerminatedSnapshot) == kTerminatedSnapshot, "child zombie marked as terminated"); - - continue; - } else if (pid != getpid()) { - break; - } - - T_EXPECT_EQ_STR(current_process_name(), - [container[@"task_snapshots"][@"task_snapshot"][@"ts_p_comm"] UTF8String], - "current process name matches in stackshot"); - - uint64_t task_flags = [container[@"task_snapshots"][@"task_snapshot"][@"ts_ss_flags"] unsignedLongLongValue]; - T_ASSERT_FALSE((task_flags & kTerminatedSnapshot) == kTerminatedSnapshot, "current process not marked as terminated"); - - T_QUIET; - T_EXPECT_LE(pid, [container[@"task_snapshots"][@"task_snapshot"][@"ts_unique_pid"] intValue], - "unique pid is greater than pid"); - - bool found_main_thread = false; - for (id thread_key in container[@"task_snapshots"][@"thread_snapshots"]) { - NSMutableDictionary *thread = container[@"task_snapshots"][@"thread_snapshots"][thread_key]; - NSDictionary *thread_snap = thread[@"thread_snapshot"]; - - T_QUIET; T_EXPECT_GT([thread_snap[@"ths_thread_id"] intValue], 0, - "thread ID of thread in current task is valid"); - T_QUIET; T_EXPECT_GT([thread_snap[@"ths_base_priority"] intValue], 0, - "base priority of thread in current task is valid"); - T_QUIET; T_EXPECT_GT([thread_snap[@"ths_sched_priority"] intValue], 0, - "scheduling priority of thread in current task is valid"); - - NSString *pth_name = thread[@"pth_name"]; - if (pth_name != nil && [pth_name isEqualToString:@TEST_THREAD_NAME]) { - found_main_thread = true; - - T_QUIET; T_EXPECT_GT([thread_snap[@"ths_total_syscalls"] intValue], 0, - "total syscalls of current thread is valid"); - - NSDictionary *cpu_times = thread[@"cpu_times"]; - T_EXPECT_GE([cpu_times[@"runnable_time"] intValue], - [cpu_times[@"system_time"] intValue] + - [cpu_times[@"user_time"] intValue], - "runnable time of current thread is valid"); - } - } - T_EXPECT_TRUE(found_main_thread, "found main thread for current task in stackshot"); - break; - } - case STACKSHOT_KCTYPE_SHAREDCACHE_LOADINFO: { - struct dyld_uuid_info_64_v2 *shared_cache_info = kcdata_iter_payload(iter); - uuid_t shared_cache_uuid; - T_QUIET; T_ASSERT_TRUE(_dyld_get_shared_cache_uuid(shared_cache_uuid), "retrieve current shared cache UUID"); - T_QUIET; T_ASSERT_EQ(memcmp(shared_cache_info->imageUUID, shared_cache_uuid, sizeof(shared_cache_uuid)), 0, - "dyld returned UUID doesn't match kernel returned UUID for system shared cache"); - found_shared_cache_uuid = true; - break; - } - } - } - - if (expect_zombie_child) { - T_QUIET; T_ASSERT_TRUE(found_zombie_child, "found zombie child in kcdata"); - } - - if (expect_shared_cache_layout) { - T_QUIET; T_ASSERT_TRUE(found_shared_cache_layout, "shared cache layout found in kcdata"); - } - - if (expect_shared_cache_uuid) { - T_QUIET; T_ASSERT_TRUE(found_shared_cache_uuid, "shared cache UUID found in kcdata"); - } - - T_ASSERT_FALSE(KCDATA_ITER_FOREACH_FAILED(iter), "successfully iterated kcdata"); -} - -static const char * -current_process_name(void) -{ - static char name[64]; - - if (!name[0]) { - int ret = proc_name(getpid(), name, sizeof(name)); - T_QUIET; - T_ASSERT_POSIX_SUCCESS(ret, "proc_name failed for current process"); - } - - return name; -} - -static void -initialize_thread(void) -{ - int ret = pthread_setname_np(TEST_THREAD_NAME); - T_QUIET; - T_ASSERT_POSIX_ZERO(ret, "set thread name to %s", TEST_THREAD_NAME); -} diff --git a/tests/stackshot_block_owner_14362384.m b/tests/stackshot_block_owner_14362384.m index aabe544b8..e7f34ea3c 100644 --- a/tests/stackshot_block_owner_14362384.m +++ b/tests/stackshot_block_owner_14362384.m @@ -24,7 +24,7 @@ #include #include -#if !TARGET_OS_EMBEDDED +#if !(TARGET_OS_IPHONE && !TARGET_OS_SIMULATOR) #include #endif @@ -118,7 +118,7 @@ void check_python(void *stackshot, const char *fmt, ...) { save_stackshot(stackshot, "/tmp/ss"); -#if !TARGET_OS_EMBEDDED +#if !(TARGET_OS_IPHONE && !TARGET_OS_SIMULATOR) va_list args; va_start(args, fmt); char *re_string = NULL; diff --git a/tests/stackshot_spawn_exit_stress.c b/tests/stackshot_spawn_exit_stress.c index 448095598..342ea2bc6 100644 --- a/tests/stackshot_spawn_exit_stress.c +++ b/tests/stackshot_spawn_exit_stress.c @@ -18,13 +18,7 @@ T_GLOBAL_META( T_META_ASROOT(true) ); -#if TARGET_OS_WATCH -#define SPAWN_ITERATIONS 1999 -#elif TARGET_OS_IPHONE -#define SPAWN_ITERATIONS 4999 -#else -#define SPAWN_ITERATIONS 9999 -#endif +#define TEST_DURATION_NS (60 * NSEC_PER_SEC) #define REAP_INTERVAL 10 @@ -78,13 +72,15 @@ retry: T_QUIET; T_EXPECT_POSIX_ZERO(ret, "deallocated stackshot config"); } -T_DECL(stackshot_spawn_exit, "tests taking many stackshots while children processes are spawning+exiting") +T_DECL(stackshot_spawn_exit, "tests taking many stackshots while children processes are spawning+exiting", T_META_TIMEOUT(120)) { char path[PATH_MAX]; uint32_t path_size = sizeof(path); T_ASSERT_POSIX_ZERO(_NSGetExecutablePath(path, &path_size), "_NSGetExecutablePath"); char *args[] = { path, "-n", "spawn_children_helper", NULL }; + uint64_t stop_time = clock_gettime_nsec_np(CLOCK_UPTIME_RAW) + TEST_DURATION_NS; + dispatch_queue_t stackshot_queue = dispatch_queue_create("stackshot_queue", NULL); dispatch_async(stackshot_queue, ^(void) { int num_stackshots = 0; @@ -108,7 +104,8 @@ T_DECL(stackshot_spawn_exit, "tests taking many stackshots while children proces "set stdout of child to NULL"); int children_unreaped = 0, status; - for (int iterations_remaining = SPAWN_ITERATIONS; iterations_remaining > 0; iterations_remaining--) { + uint64_t iterations_completed = 0; + while (clock_gettime_nsec_np(CLOCK_UPTIME_RAW) < stop_time) { pid_t pid; int sp_ret = posix_spawn(&pid, args[0], &actions, NULL, args, NULL); @@ -123,9 +120,10 @@ T_DECL(stackshot_spawn_exit, "tests taking many stackshots while children proces } } - if ((iterations_remaining % 100) == 0) { - T_LOG("spawned %d children thus far", (SPAWN_ITERATIONS - iterations_remaining)); + if ((iterations_completed % 100) == 0) { + T_LOG("spawned %llu children thus far", iterations_completed); } + iterations_completed++; } while (children_unreaped) { diff --git a/tests/stackshot_tests.m b/tests/stackshot_tests.m new file mode 100644 index 000000000..29fa817e1 --- /dev/null +++ b/tests/stackshot_tests.m @@ -0,0 +1,1302 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +T_GLOBAL_META( + T_META_NAMESPACE("xnu.stackshot"), + T_META_CHECK_LEAKS(false), + T_META_ASROOT(true) + ); + +static const char *current_process_name(void); +static void verify_stackshot_sharedcache_layout(struct dyld_uuid_info_64 *uuids, uint32_t uuid_count); +static void parse_stackshot(uint64_t stackshot_parsing_flags, void *ssbuf, size_t sslen, int child_pid); +static void parse_thread_group_stackshot(void **sbuf, size_t sslen); +static uint64_t stackshot_timestamp(void *ssbuf, size_t sslen); +static void initialize_thread(void); + +#define DEFAULT_STACKSHOT_BUFFER_SIZE (1024 * 1024) +#define MAX_STACKSHOT_BUFFER_SIZE (6 * 1024 * 1024) + +/* bit flags for parse_stackshot */ +#define PARSE_STACKSHOT_DELTA 0x01 +#define PARSE_STACKSHOT_ZOMBIE 0x02 +#define PARSE_STACKSHOT_SHAREDCACHE_LAYOUT 0x04 +#define PARSE_STACKSHOT_DISPATCH_QUEUE_LABEL 0x08 +#define PARSE_STACKSHOT_TURNSTILEINFO 0x10 + +#define TEST_STACKSHOT_QUEUE_LABEL "houston.we.had.a.problem" +#define TEST_STACKSHOT_QUEUE_LABEL_LENGTH sizeof(TEST_STACKSHOT_QUEUE_LABEL) + +T_DECL(microstackshots, "test the microstackshot syscall") +{ + void *buf = NULL; + unsigned int size = DEFAULT_STACKSHOT_BUFFER_SIZE; + + while (1) { + buf = malloc(size); + T_QUIET; T_ASSERT_NOTNULL(buf, "allocated stackshot buffer"); + +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wdeprecated-declarations" + int len = syscall(SYS_microstackshot, buf, size, + STACKSHOT_GET_MICROSTACKSHOT); +#pragma clang diagnostic pop + if (len == ENOSYS) { + T_SKIP("microstackshot syscall failed, likely not compiled with CONFIG_TELEMETRY"); + } + if (len == -1 && errno == ENOSPC) { + /* syscall failed because buffer wasn't large enough, try again */ + free(buf); + buf = NULL; + size *= 2; + T_ASSERT_LE(size, (unsigned int)MAX_STACKSHOT_BUFFER_SIZE, + "growing stackshot buffer to sane size"); + continue; + } + T_ASSERT_POSIX_SUCCESS(len, "called microstackshot syscall"); + break; + } + + T_EXPECT_EQ(*(uint32_t *)buf, + (uint32_t)STACKSHOT_MICRO_SNAPSHOT_MAGIC, + "magic value for microstackshot matches"); + + free(buf); +} + +struct scenario { + const char *name; + uint32_t flags; + bool quiet; + bool should_fail; + bool maybe_unsupported; + pid_t target_pid; + uint64_t since_timestamp; + uint32_t size_hint; + dt_stat_time_t timer; +}; + +static void +quiet(struct scenario *scenario) +{ + if (scenario->timer || scenario->quiet) { + T_QUIET; + } +} + +static void +take_stackshot(struct scenario *scenario, void (^cb)(void *buf, size_t size)) +{ + initialize_thread(); + + void *config = stackshot_config_create(); + quiet(scenario); + T_ASSERT_NOTNULL(config, "created stackshot config"); + + int ret = stackshot_config_set_flags(config, scenario->flags); + quiet(scenario); + T_ASSERT_POSIX_ZERO(ret, "set flags %#x on stackshot config", scenario->flags); + + if (scenario->size_hint > 0) { + ret = stackshot_config_set_size_hint(config, scenario->size_hint); + quiet(scenario); + T_ASSERT_POSIX_ZERO(ret, "set size hint %" PRIu32 " on stackshot config", + scenario->size_hint); + } + + if (scenario->target_pid > 0) { + ret = stackshot_config_set_pid(config, scenario->target_pid); + quiet(scenario); + T_ASSERT_POSIX_ZERO(ret, "set target pid %d on stackshot config", + scenario->target_pid); + } + + if (scenario->since_timestamp > 0) { + ret = stackshot_config_set_delta_timestamp(config, scenario->since_timestamp); + quiet(scenario); + T_ASSERT_POSIX_ZERO(ret, "set since timestamp %" PRIu64 " on stackshot config", + scenario->since_timestamp); + } + + int retries_remaining = 5; + +retry: ; + uint64_t start_time = mach_absolute_time(); + ret = stackshot_capture_with_config(config); + uint64_t end_time = mach_absolute_time(); + + if (scenario->should_fail) { + T_EXPECTFAIL; + T_ASSERT_POSIX_ZERO(ret, "called stackshot_capture_with_config"); + return; + } + + if (ret == EBUSY || ret == ETIMEDOUT) { + if (retries_remaining > 0) { + if (!scenario->timer) { + T_LOG("stackshot_capture_with_config failed with %s (%d), retrying", + strerror(ret), ret); + } + + retries_remaining--; + goto retry; + } else { + T_ASSERT_POSIX_ZERO(ret, + "called stackshot_capture_with_config (no retries remaining)"); + } + } else if ((ret == ENOTSUP) && scenario->maybe_unsupported) { + T_SKIP("kernel indicated this stackshot configuration is not supported"); + } else { + quiet(scenario); + T_ASSERT_POSIX_ZERO(ret, "called stackshot_capture_with_config"); + } + + if (scenario->timer) { + dt_stat_mach_time_add(scenario->timer, end_time - start_time); + } + void *buf = stackshot_config_get_stackshot_buffer(config); + size_t size = stackshot_config_get_stackshot_size(config); + if (scenario->name) { + char sspath[MAXPATHLEN]; + strlcpy(sspath, scenario->name, sizeof(sspath)); + strlcat(sspath, ".kcdata", sizeof(sspath)); + T_QUIET; T_ASSERT_POSIX_ZERO(dt_resultfile(sspath, sizeof(sspath)), + "create result file path"); + + if (!scenario->quiet) { + T_LOG("writing stackshot to %s", sspath); + } + + FILE *f = fopen(sspath, "w"); + T_WITH_ERRNO; T_QUIET; T_ASSERT_NOTNULL(f, + "open stackshot output file"); + + size_t written = fwrite(buf, size, 1, f); + T_QUIET; T_ASSERT_POSIX_SUCCESS(written, "wrote stackshot to file"); + + fclose(f); + } + cb(buf, size); + + ret = stackshot_config_dealloc(config); + T_QUIET; T_EXPECT_POSIX_ZERO(ret, "deallocated stackshot config"); +} + +T_DECL(kcdata, "test that kcdata stackshots can be taken and parsed") +{ + struct scenario scenario = { + .name = "kcdata", + .flags = (STACKSHOT_SAVE_LOADINFO | STACKSHOT_GET_GLOBAL_MEM_STATS | + STACKSHOT_SAVE_IMP_DONATION_PIDS | STACKSHOT_KCDATA_FORMAT), + }; + + T_LOG("taking kcdata stackshot"); + take_stackshot(&scenario, ^(void *ssbuf, size_t sslen) { + parse_stackshot(0, ssbuf, sslen, -1); + }); +} + +T_DECL(kcdata_faulting, "test that kcdata stackshots while faulting can be taken and parsed") +{ + struct scenario scenario = { + .name = "faulting", + .flags = (STACKSHOT_SAVE_LOADINFO | STACKSHOT_GET_GLOBAL_MEM_STATS + | STACKSHOT_SAVE_IMP_DONATION_PIDS | STACKSHOT_KCDATA_FORMAT + | STACKSHOT_ENABLE_BT_FAULTING | STACKSHOT_ENABLE_UUID_FAULTING), + }; + + T_LOG("taking faulting stackshot"); + take_stackshot(&scenario, ^(void *ssbuf, size_t sslen) { + parse_stackshot(0, ssbuf, sslen, -1); + }); +} + +T_DECL(bad_flags, "test a poorly-formed stackshot syscall") +{ + struct scenario scenario = { + .flags = STACKSHOT_SAVE_IN_KERNEL_BUFFER /* not allowed from user space */, + .should_fail = true, + }; + + T_LOG("attempting to take stackshot with kernel-only flag"); + take_stackshot(&scenario, ^(__unused void *ssbuf, __unused size_t sslen) { + T_ASSERT_FAIL("stackshot data callback called"); + }); +} + +T_DECL(delta, "test delta stackshots") +{ + struct scenario scenario = { + .name = "delta", + .flags = (STACKSHOT_SAVE_LOADINFO | STACKSHOT_GET_GLOBAL_MEM_STATS + | STACKSHOT_SAVE_IMP_DONATION_PIDS | STACKSHOT_KCDATA_FORMAT), + }; + + T_LOG("taking full stackshot"); + take_stackshot(&scenario, ^(void *ssbuf, size_t sslen) { + uint64_t stackshot_time = stackshot_timestamp(ssbuf, sslen); + + T_LOG("taking delta stackshot since time %" PRIu64, stackshot_time); + + parse_stackshot(0, ssbuf, sslen, -1); + + struct scenario delta_scenario = { + .flags = (STACKSHOT_SAVE_LOADINFO | STACKSHOT_GET_GLOBAL_MEM_STATS + | STACKSHOT_SAVE_IMP_DONATION_PIDS | STACKSHOT_KCDATA_FORMAT + | STACKSHOT_COLLECT_DELTA_SNAPSHOT), + .since_timestamp = stackshot_time + }; + + take_stackshot(&delta_scenario, ^(void *dssbuf, size_t dsslen) { + parse_stackshot(PARSE_STACKSHOT_DELTA, dssbuf, dsslen, -1); + }); + }); +} + +T_DECL(shared_cache_layout, "test stackshot inclusion of shared cache layout") +{ + struct scenario scenario = { + .name = "shared_cache_layout", + .flags = (STACKSHOT_SAVE_LOADINFO | STACKSHOT_GET_GLOBAL_MEM_STATS + | STACKSHOT_SAVE_IMP_DONATION_PIDS | STACKSHOT_KCDATA_FORMAT | + STACKSHOT_COLLECT_SHAREDCACHE_LAYOUT), + }; + + size_t shared_cache_length; + const void *cache_header = _dyld_get_shared_cache_range(&shared_cache_length); + if (cache_header == NULL) { + T_SKIP("Device not running with shared cache, skipping test..."); + } + + if (shared_cache_length == 0) { + T_SKIP("dyld reports that currently running shared cache has zero length"); + } + + T_LOG("taking stackshot with STACKSHOT_COLLECT_SHAREDCACHE_LAYOUT set"); + take_stackshot(&scenario, ^(void *ssbuf, size_t sslen) { + parse_stackshot(PARSE_STACKSHOT_SHAREDCACHE_LAYOUT, ssbuf, sslen, -1); + }); +} + +T_DECL(stress, "test that taking stackshots for 60 seconds doesn't crash the system") +{ + uint64_t max_diff_time = 60ULL /* seconds */ * 1000000000ULL; + uint64_t start_time; + + struct scenario scenario = { + .name = "stress", + .quiet = true, + .flags = (STACKSHOT_KCDATA_FORMAT | + STACKSHOT_THREAD_WAITINFO | + STACKSHOT_SAVE_LOADINFO | + STACKSHOT_SAVE_KEXT_LOADINFO | + STACKSHOT_GET_GLOBAL_MEM_STATS | + // STACKSHOT_GET_BOOT_PROFILE | + STACKSHOT_SAVE_IMP_DONATION_PIDS | + STACKSHOT_COLLECT_SHAREDCACHE_LAYOUT | + STACKSHOT_THREAD_GROUP | + STACKSHOT_SAVE_JETSAM_COALITIONS | + STACKSHOT_ASID | + // STACKSHOT_PAGE_TABLES | + 0), + }; + + start_time = clock_gettime_nsec_np(CLOCK_MONOTONIC); + while (clock_gettime_nsec_np(CLOCK_MONOTONIC) - start_time < max_diff_time) { + take_stackshot(&scenario, ^(void *ssbuf, size_t sslen) { + printf("."); + fflush(stdout); + }); + + /* Leave some time for the testing infrastructure to catch up */ + usleep(10000); + + } + printf("\n"); +} + +T_DECL(dispatch_queue_label, "test that kcdata stackshots contain libdispatch queue labels") +{ + struct scenario scenario = { + .name = "kcdata", + .flags = (STACKSHOT_GET_DQ | STACKSHOT_KCDATA_FORMAT), + }; + dispatch_semaphore_t child_ready_sem, parent_done_sem; + dispatch_queue_t dq; + +#if TARGET_OS_WATCH + T_SKIP("This test is flaky on watches: 51663346"); +#endif + + child_ready_sem = dispatch_semaphore_create(0); + T_QUIET; T_ASSERT_NOTNULL(child_ready_sem, "dqlabel child semaphore"); + + parent_done_sem = dispatch_semaphore_create(0); + T_QUIET; T_ASSERT_NOTNULL(parent_done_sem, "dqlabel parent semaphore"); + + dq = dispatch_queue_create(TEST_STACKSHOT_QUEUE_LABEL, NULL); + T_QUIET; T_ASSERT_NOTNULL(dq, "dispatch queue"); + + /* start the helper thread */ + dispatch_async(dq, ^{ + dispatch_semaphore_signal(child_ready_sem); + + dispatch_semaphore_wait(parent_done_sem, DISPATCH_TIME_FOREVER); + }); + + /* block behind the child starting up */ + dispatch_semaphore_wait(child_ready_sem, DISPATCH_TIME_FOREVER); + + T_LOG("taking kcdata stackshot with libdispatch queue labels"); + take_stackshot(&scenario, ^(void *ssbuf, size_t sslen) { + parse_stackshot(PARSE_STACKSHOT_DISPATCH_QUEUE_LABEL, ssbuf, sslen, -1); + }); + + dispatch_semaphore_signal(parent_done_sem); +} + +static void *stuck_sysctl_thread(void *arg) { + int val = 1; + dispatch_semaphore_t child_thread_started = *(dispatch_semaphore_t *)arg; + + dispatch_semaphore_signal(child_thread_started); + T_ASSERT_POSIX_SUCCESS(sysctlbyname("kern.wedge_thread", NULL, NULL, &val, sizeof(val)), "wedge child thread"); + + return NULL; +} + +T_HELPER_DECL(zombie_child, "child process to sample as a zombie") +{ + pthread_t pthread; + dispatch_semaphore_t child_thread_started = dispatch_semaphore_create(0); + T_QUIET; T_ASSERT_NOTNULL(child_thread_started, "zombie child thread semaphore"); + + /* spawn another thread to get stuck in the kernel, then call exit() to become a zombie */ + T_QUIET; T_ASSERT_POSIX_SUCCESS(pthread_create(&pthread, NULL, stuck_sysctl_thread, &child_thread_started), "pthread_create"); + + dispatch_semaphore_wait(child_thread_started, DISPATCH_TIME_FOREVER); + + /* sleep for a bit in the hope of ensuring that the other thread has called the sysctl before we signal the parent */ + usleep(100); + T_ASSERT_POSIX_SUCCESS(kill(getppid(), SIGUSR1), "signaled parent to take stackshot"); + + exit(0); +} + +T_DECL(zombie, "tests a stackshot of a zombie task with a thread stuck in the kernel") +{ + char path[PATH_MAX]; + uint32_t path_size = sizeof(path); + T_ASSERT_POSIX_ZERO(_NSGetExecutablePath(path, &path_size), "_NSGetExecutablePath"); + char *args[] = { path, "-n", "zombie_child", NULL }; + + dispatch_source_t child_sig_src; + dispatch_semaphore_t child_ready_sem = dispatch_semaphore_create(0); + T_QUIET; T_ASSERT_NOTNULL(child_ready_sem, "zombie child semaphore"); + + dispatch_queue_t signal_processing_q = dispatch_queue_create("signal processing queue", NULL); + T_QUIET; T_ASSERT_NOTNULL(signal_processing_q, "signal processing queue"); + + pid_t pid; + + T_LOG("spawning a child"); + + signal(SIGUSR1, SIG_IGN); + child_sig_src = dispatch_source_create(DISPATCH_SOURCE_TYPE_SIGNAL, SIGUSR1, 0, signal_processing_q); + T_QUIET; T_ASSERT_NOTNULL(child_sig_src, "dispatch_source_create (child_sig_src)"); + + dispatch_source_set_event_handler(child_sig_src, ^{ dispatch_semaphore_signal(child_ready_sem); }); + dispatch_activate(child_sig_src); + + int sp_ret = posix_spawn(&pid, args[0], NULL, NULL, args, NULL); + T_QUIET; T_ASSERT_POSIX_ZERO(sp_ret, "spawned process '%s' with PID %d", args[0], pid); + + dispatch_semaphore_wait(child_ready_sem, DISPATCH_TIME_FOREVER); + + T_LOG("received signal from child, capturing stackshot"); + + struct proc_bsdshortinfo bsdshortinfo; + int retval, iterations_to_wait = 10; + + while (iterations_to_wait > 0) { + retval = proc_pidinfo(pid, PROC_PIDT_SHORTBSDINFO, 0, &bsdshortinfo, sizeof(bsdshortinfo)); + if ((retval == 0) && errno == ESRCH) { + T_LOG("unable to find child using proc_pidinfo, assuming zombie"); + break; + } + + T_QUIET; T_WITH_ERRNO; T_ASSERT_GT(retval, 0, "proc_pidinfo(PROC_PIDT_SHORTBSDINFO) returned a value > 0"); + T_QUIET; T_ASSERT_EQ(retval, (int)sizeof(bsdshortinfo), "proc_pidinfo call for PROC_PIDT_SHORTBSDINFO returned expected size"); + + if (bsdshortinfo.pbsi_flags & PROC_FLAG_INEXIT) { + T_LOG("child proc info marked as in exit"); + break; + } + + iterations_to_wait--; + if (iterations_to_wait == 0) { + /* + * This will mark the test as failed but let it continue so we + * don't leave a process stuck in the kernel. + */ + T_FAIL("unable to discover that child is marked as exiting"); + } + + /* Give the child a few more seconds to make it to exit */ + sleep(5); + } + + /* Give the child some more time to make it through exit */ + sleep(10); + + struct scenario scenario = { + .name = "zombie", + .flags = (STACKSHOT_SAVE_LOADINFO | STACKSHOT_GET_GLOBAL_MEM_STATS + | STACKSHOT_SAVE_IMP_DONATION_PIDS | STACKSHOT_KCDATA_FORMAT), + }; + + take_stackshot(&scenario, ^( void *ssbuf, size_t sslen) { + /* First unwedge the child so we can reap it */ + int val = 1, status; + T_ASSERT_POSIX_SUCCESS(sysctlbyname("kern.unwedge_thread", NULL, NULL, &val, sizeof(val)), "unwedge child"); + + T_QUIET; T_ASSERT_POSIX_SUCCESS(waitpid(pid, &status, 0), "waitpid on zombie child"); + + parse_stackshot(PARSE_STACKSHOT_ZOMBIE, ssbuf, sslen, pid); + }); +} + +static uint32_t +get_user_promotion_basepri(void) +{ + mach_msg_type_number_t count = THREAD_POLICY_STATE_COUNT; + struct thread_policy_state thread_policy; + boolean_t get_default = FALSE; + mach_port_t thread_port = pthread_mach_thread_np(pthread_self()); + + kern_return_t kr = thread_policy_get(thread_port, THREAD_POLICY_STATE, + (thread_policy_t)&thread_policy, &count, &get_default); + T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "thread_policy_get"); + return thread_policy.thps_user_promotion_basepri; +} + +static int +get_pri(thread_t thread_port) +{ + kern_return_t kr; + + thread_extended_info_data_t extended_info; + mach_msg_type_number_t count = THREAD_EXTENDED_INFO_COUNT; + kr = thread_info(thread_port, THREAD_EXTENDED_INFO, + (thread_info_t)&extended_info, &count); + + T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "thread_info"); + + return extended_info.pth_curpri; +} + + +T_DECL(turnstile_singlehop, "turnstile single hop test") +{ + dispatch_queue_t dq1, dq2; + dispatch_semaphore_t sema_x; + dispatch_queue_attr_t dq1_attr, dq2_attr; + qos_class_t main_qos = 0; + int main_relpri = 0, main_relpri2 = 0, main_afterpri = 0; + struct scenario scenario = { + .name = "turnstile_singlehop", + .flags = (STACKSHOT_THREAD_WAITINFO | STACKSHOT_KCDATA_FORMAT), + }; + dq1_attr = dispatch_queue_attr_make_with_qos_class(DISPATCH_QUEUE_SERIAL, QOS_CLASS_UTILITY, 0); + dq2_attr = dispatch_queue_attr_make_with_qos_class(DISPATCH_QUEUE_SERIAL, QOS_CLASS_USER_INITIATED, 0); + pthread_mutex_t lock_a = PTHREAD_MUTEX_INITIALIZER; + pthread_mutex_t lock_b = PTHREAD_MUTEX_INITIALIZER; + + pthread_mutex_t *lockap = &lock_a, *lockbp = &lock_b; + + dq1 = dispatch_queue_create("q1", dq1_attr); + dq2 = dispatch_queue_create("q2", dq2_attr); + sema_x = dispatch_semaphore_create(0); + + pthread_mutex_lock(lockap); + dispatch_async(dq1, ^{ + pthread_mutex_lock(lockbp); + T_ASSERT_POSIX_SUCCESS(pthread_get_qos_class_np(pthread_self(), &main_qos, &main_relpri), "get qos class"); + T_LOG("The priority of q1 is %d\n", get_pri(mach_thread_self())); + dispatch_semaphore_signal(sema_x); + pthread_mutex_lock(lockap); + }); + dispatch_semaphore_wait(sema_x, DISPATCH_TIME_FOREVER); + + T_LOG("Async1 completed"); + + pthread_set_qos_class_self_np(QOS_CLASS_UTILITY, 0); + T_ASSERT_POSIX_SUCCESS(pthread_get_qos_class_np(pthread_self(), &main_qos, &main_relpri), "get qos class"); + T_LOG("The priority of main is %d\n", get_pri(mach_thread_self())); + main_relpri = get_pri(mach_thread_self()); + + dispatch_async(dq2, ^{ + T_ASSERT_POSIX_SUCCESS(pthread_get_qos_class_np(pthread_self(), &main_qos, &main_relpri2), "get qos class"); + T_LOG("The priority of q2 is %d\n", get_pri(mach_thread_self())); + dispatch_semaphore_signal(sema_x); + pthread_mutex_lock(lockbp); + }); + dispatch_semaphore_wait(sema_x, DISPATCH_TIME_FOREVER); + + T_LOG("Async2 completed"); + + while (1) { + main_afterpri = get_user_promotion_basepri(); + if (main_relpri != main_afterpri) { + T_LOG("Success with promotion pri is %d", main_afterpri); + break; + } + + usleep(100); + } + + take_stackshot(&scenario, ^( void *ssbuf, size_t sslen) { + parse_stackshot(PARSE_STACKSHOT_TURNSTILEINFO, ssbuf, sslen, -1); + }); +} + + +static void +expect_instrs_cycles_in_stackshot(void *ssbuf, size_t sslen) +{ + kcdata_iter_t iter = kcdata_iter(ssbuf, sslen); + + bool in_task = false; + bool in_thread = false; + bool saw_instrs_cycles = false; + iter = kcdata_iter_next(iter); + + KCDATA_ITER_FOREACH(iter) { + switch (kcdata_iter_type(iter)) { + case KCDATA_TYPE_CONTAINER_BEGIN: + switch (kcdata_iter_container_type(iter)) { + case STACKSHOT_KCCONTAINER_TASK: + in_task = true; + saw_instrs_cycles = false; + break; + + case STACKSHOT_KCCONTAINER_THREAD: + in_thread = true; + saw_instrs_cycles = false; + break; + + default: + break; + } + break; + + case STACKSHOT_KCTYPE_INSTRS_CYCLES: + saw_instrs_cycles = true; + break; + + case KCDATA_TYPE_CONTAINER_END: + if (in_thread) { + T_QUIET; T_EXPECT_TRUE(saw_instrs_cycles, + "saw instructions and cycles in thread"); + in_thread = false; + } else if (in_task) { + T_QUIET; T_EXPECT_TRUE(saw_instrs_cycles, + "saw instructions and cycles in task"); + in_task = false; + } + + default: + break; + } + } +} + +static void +skip_if_monotonic_unsupported(void) +{ + int supported = 0; + size_t supported_size = sizeof(supported); + int ret = sysctlbyname("kern.monotonic.supported", &supported, + &supported_size, 0, 0); + if (ret < 0 || !supported) { + T_SKIP("monotonic is unsupported"); + } +} + +T_DECL(instrs_cycles, "test a getting instructions and cycles in stackshot") +{ + skip_if_monotonic_unsupported(); + + struct scenario scenario = { + .name = "instrs-cycles", + .flags = (STACKSHOT_SAVE_LOADINFO | STACKSHOT_INSTRS_CYCLES + | STACKSHOT_KCDATA_FORMAT), + }; + + T_LOG("attempting to take stackshot with instructions and cycles"); + take_stackshot(&scenario, ^(void *ssbuf, size_t sslen) { + parse_stackshot(0, ssbuf, sslen, -1); + expect_instrs_cycles_in_stackshot(ssbuf, sslen); + }); +} + +T_DECL(delta_instrs_cycles, + "test delta stackshots with instructions and cycles") +{ + skip_if_monotonic_unsupported(); + + struct scenario scenario = { + .name = "delta-instrs-cycles", + .flags = (STACKSHOT_SAVE_LOADINFO | STACKSHOT_INSTRS_CYCLES + | STACKSHOT_KCDATA_FORMAT), + }; + + T_LOG("taking full stackshot"); + take_stackshot(&scenario, ^(void *ssbuf, size_t sslen) { + uint64_t stackshot_time = stackshot_timestamp(ssbuf, sslen); + + T_LOG("taking delta stackshot since time %" PRIu64, stackshot_time); + + parse_stackshot(0, ssbuf, sslen, -1); + expect_instrs_cycles_in_stackshot(ssbuf, sslen); + + struct scenario delta_scenario = { + .name = "delta-instrs-cycles-next", + .flags = (STACKSHOT_SAVE_LOADINFO | STACKSHOT_INSTRS_CYCLES + | STACKSHOT_KCDATA_FORMAT + | STACKSHOT_COLLECT_DELTA_SNAPSHOT), + .since_timestamp = stackshot_time, + }; + + take_stackshot(&delta_scenario, ^(void *dssbuf, size_t dsslen) { + parse_stackshot(PARSE_STACKSHOT_DELTA, dssbuf, dsslen, -1); + expect_instrs_cycles_in_stackshot(dssbuf, dsslen); + }); + }); +} + +static void +check_thread_groups_supported() +{ + int err; + int supported = 0; + size_t supported_size = sizeof(supported); + err = sysctlbyname("kern.thread_groups_supported", &supported, &supported_size, NULL, 0); + + if (err || !supported) + T_SKIP("thread groups not supported on this system"); +} + +T_DECL(thread_groups, "test getting thread groups in stackshot") +{ + check_thread_groups_supported(); + + struct scenario scenario = { + .name = "thread-groups", + .flags = (STACKSHOT_SAVE_LOADINFO | STACKSHOT_THREAD_GROUP + | STACKSHOT_KCDATA_FORMAT), + }; + + T_LOG("attempting to take stackshot with thread group flag"); + take_stackshot(&scenario, ^(void *ssbuf, size_t sslen) { + parse_thread_group_stackshot(ssbuf, sslen); + }); +} + +static void +parse_page_table_asid_stackshot(void **ssbuf, size_t sslen) +{ + bool seen_asid = false; + bool seen_page_table_snapshot = false; + kcdata_iter_t iter = kcdata_iter(ssbuf, sslen); + T_ASSERT_EQ(kcdata_iter_type(iter), KCDATA_BUFFER_BEGIN_STACKSHOT, + "buffer provided is a stackshot"); + + iter = kcdata_iter_next(iter); + KCDATA_ITER_FOREACH(iter) { + switch (kcdata_iter_type(iter)) { + case KCDATA_TYPE_ARRAY: { + T_QUIET; + T_ASSERT_TRUE(kcdata_iter_array_valid(iter), + "checked that array is valid"); + + if (kcdata_iter_array_elem_type(iter) != STACKSHOT_KCTYPE_PAGE_TABLES) { + continue; + } + + T_ASSERT_FALSE(seen_page_table_snapshot, "check that we haven't yet seen a page table snapshot"); + seen_page_table_snapshot = true; + + T_ASSERT_EQ((size_t) kcdata_iter_array_elem_size(iter), sizeof(uint64_t), + "check that each element of the pagetable dump is the expected size"); + + uint64_t *pt_array = kcdata_iter_payload(iter); + uint32_t elem_count = kcdata_iter_array_elem_count(iter); + uint32_t j; + bool nonzero_tte = false; + for (j = 0; j < elem_count;) { + T_QUIET; T_ASSERT_LE(j + 4, elem_count, "check for valid page table segment header"); + uint64_t pa = pt_array[j]; + uint64_t num_entries = pt_array[j + 1]; + uint64_t start_va = pt_array[j + 2]; + uint64_t end_va = pt_array[j + 3]; + + T_QUIET; T_ASSERT_NE(pa, (uint64_t) 0, "check that the pagetable physical address is non-zero"); + T_QUIET; T_ASSERT_EQ(pa % (num_entries * sizeof(uint64_t)), (uint64_t) 0, "check that the pagetable physical address is correctly aligned"); + T_QUIET; T_ASSERT_NE(num_entries, (uint64_t) 0, "check that a pagetable region has more than 0 entries"); + T_QUIET; T_ASSERT_LE(j + 4 + num_entries, (uint64_t) elem_count, "check for sufficient space in page table array"); + T_QUIET; T_ASSERT_GT(end_va, start_va, "check for valid VA bounds in page table segment header"); + + for (uint32_t k = j + 4; k < (j + 4 + num_entries); ++k) { + if (pt_array[k] != 0) { + nonzero_tte = true; + T_QUIET; T_ASSERT_EQ((pt_array[k] >> 48) & 0xf, (uint64_t) 0, "check that bits[48:51] of arm64 TTE are clear"); + // L0-L2 table and non-compressed L3 block entries should always have bit 1 set; assumes L0-L2 blocks will not be used outside the kernel + bool table = ((pt_array[k] & 0x2) != 0); + if (table) { + T_QUIET; T_ASSERT_NE(pt_array[k] & ((1ULL << 48) - 1) & ~((1ULL << 12) - 1), (uint64_t) 0, "check that arm64 TTE physical address is non-zero"); + } else { // should be a compressed PTE + T_QUIET; T_ASSERT_NE(pt_array[k] & 0xC000000000000000ULL, (uint64_t) 0, "check that compressed PTE has at least one of bits [63:62] set"); + T_QUIET; T_ASSERT_EQ(pt_array[k] & ~0xC000000000000000ULL, (uint64_t) 0, "check that compressed PTE has no other bits besides [63:62] set"); + } + } + } + + j += (4 + num_entries); + } + T_ASSERT_TRUE(nonzero_tte, "check that we saw at least one non-empty TTE"); + T_ASSERT_EQ(j, elem_count, "check that page table dump size matches extent of last header"); + break; + } + case STACKSHOT_KCTYPE_ASID: { + T_ASSERT_FALSE(seen_asid, "check that we haven't yet seen an ASID"); + seen_asid = true; + } + } + } + T_ASSERT_TRUE(seen_page_table_snapshot, "check that we have seen a page table snapshot"); + T_ASSERT_TRUE(seen_asid, "check that we have seen an ASID"); +} + +T_DECL(dump_page_tables, "test stackshot page table dumping support") +{ + struct scenario scenario = { + .name = "asid-page-tables", + .flags = (STACKSHOT_KCDATA_FORMAT | STACKSHOT_ASID | STACKSHOT_PAGE_TABLES), + .size_hint = (1ULL << 23), // 8 MB + .target_pid = getpid(), + .maybe_unsupported = true, + }; + + T_LOG("attempting to take stackshot with ASID and page table flags"); + take_stackshot(&scenario, ^(void *ssbuf, size_t sslen) { + parse_page_table_asid_stackshot(ssbuf, sslen); + }); +} + +static void stackshot_verify_current_proc_uuid_info(void **ssbuf, size_t sslen, uint64_t expected_offset, const struct proc_uniqidentifierinfo *proc_info_data) +{ + const uuid_t *current_uuid = (const uuid_t *)(&proc_info_data->p_uuid); + + kcdata_iter_t iter = kcdata_iter(ssbuf, sslen); + T_ASSERT_EQ(kcdata_iter_type(iter), KCDATA_BUFFER_BEGIN_STACKSHOT, "buffer provided is a stackshot"); + + iter = kcdata_iter_next(iter); + + KCDATA_ITER_FOREACH(iter) { + switch (kcdata_iter_type(iter)) { + case KCDATA_TYPE_ARRAY: { + T_QUIET; T_ASSERT_TRUE(kcdata_iter_array_valid(iter), "checked that array is valid"); + if (kcdata_iter_array_elem_type(iter) == KCDATA_TYPE_LIBRARY_LOADINFO64) { + struct user64_dyld_uuid_info *info = (struct user64_dyld_uuid_info *) kcdata_iter_payload(iter); + if (uuid_compare(*current_uuid, info->imageUUID) == 0) { + T_ASSERT_EQ(expected_offset, info->imageLoadAddress, "found matching UUID with matching binary offset"); + return; + } + } else if (kcdata_iter_array_elem_type(iter) == KCDATA_TYPE_LIBRARY_LOADINFO) { + struct user32_dyld_uuid_info *info = (struct user32_dyld_uuid_info *) kcdata_iter_payload(iter); + if (uuid_compare(*current_uuid, info->imageUUID) == 0) { + T_ASSERT_EQ(expected_offset, ((uint64_t) info->imageLoadAddress), "found matching UUID with matching binary offset"); + return; + } + } + break; + } + default: + break; + } + } + + T_FAIL("failed to find matching UUID in stackshot data"); +} + +T_DECL(proc_uuid_info, "tests that the main binary UUID for a proc is always populated") +{ + struct proc_uniqidentifierinfo proc_info_data = { }; + mach_msg_type_number_t count; + kern_return_t kernel_status; + task_dyld_info_data_t task_dyld_info; + struct dyld_all_image_infos *target_infos; + int retval; + bool found_image_in_image_infos = false; + uint64_t expected_mach_header_offset = 0; + + /* Find the UUID of our main binary */ + retval = proc_pidinfo(getpid(), PROC_PIDUNIQIDENTIFIERINFO, 0, &proc_info_data, sizeof(proc_info_data)); + T_QUIET; T_EXPECT_POSIX_SUCCESS(retval, "proc_pidinfo PROC_PIDUNIQIDENTIFIERINFO"); + T_QUIET; T_ASSERT_EQ_INT(retval, (int) sizeof(proc_info_data), "proc_pidinfo PROC_PIDUNIQIDENTIFIERINFO returned data"); + + uuid_string_t str = {}; + uuid_unparse(*(uuid_t*)&proc_info_data.p_uuid, str); + T_LOG("Found current UUID is %s", str); + + /* Find the location of the dyld image info metadata */ + count = TASK_DYLD_INFO_COUNT; + kernel_status = task_info(mach_task_self(), TASK_DYLD_INFO, (task_info_t)&task_dyld_info, &count); + T_QUIET; T_ASSERT_EQ(kernel_status, KERN_SUCCESS, "retrieve task_info for TASK_DYLD_INFO"); + + target_infos = (struct dyld_all_image_infos *)task_dyld_info.all_image_info_addr; + + /* Find our binary in the dyld image info array */ + for (int i = 0; i < (int) target_infos->uuidArrayCount; i++) { + if (uuid_compare(target_infos->uuidArray[i].imageUUID, *(uuid_t*)&proc_info_data.p_uuid) == 0) { + expected_mach_header_offset = (uint64_t) target_infos->uuidArray[i].imageLoadAddress; + found_image_in_image_infos = true; + } + } + + T_ASSERT_TRUE(found_image_in_image_infos, "found binary image in dyld image info list"); + + /* Overwrite the dyld image info data so the kernel has to fallback to the UUID stored in the proc structure */ + target_infos->uuidArrayCount = 0; + + struct scenario scenario = { + .name = "proc_uuid_info", + .flags = (STACKSHOT_SAVE_LOADINFO | STACKSHOT_KCDATA_FORMAT), + .target_pid = getpid(), + }; + + T_LOG("attempting to take stackshot for current PID"); + take_stackshot(&scenario, ^(void *ssbuf, size_t sslen) { + stackshot_verify_current_proc_uuid_info(ssbuf, sslen, expected_mach_header_offset, &proc_info_data); + }); +} + +#pragma mark performance tests + +#define SHOULD_REUSE_SIZE_HINT 0x01 +#define SHOULD_USE_DELTA 0x02 +#define SHOULD_TARGET_SELF 0x04 + +static void +stackshot_perf(unsigned int options) +{ + struct scenario scenario = { + .flags = (STACKSHOT_SAVE_LOADINFO | STACKSHOT_GET_GLOBAL_MEM_STATS + | STACKSHOT_SAVE_IMP_DONATION_PIDS | STACKSHOT_KCDATA_FORMAT), + }; + + dt_stat_t size = dt_stat_create("bytes", "size"); + dt_stat_time_t duration = dt_stat_time_create("duration"); + scenario.timer = duration; + + if (options & SHOULD_TARGET_SELF) { + scenario.target_pid = getpid(); + } + + while (!dt_stat_stable(duration) || !dt_stat_stable(size)) { + __block uint64_t last_time = 0; + __block uint32_t size_hint = 0; + take_stackshot(&scenario, ^(void *ssbuf, size_t sslen) { + dt_stat_add(size, (double)sslen); + last_time = stackshot_timestamp(ssbuf, sslen); + size_hint = (uint32_t)sslen; + }); + if (options & SHOULD_USE_DELTA) { + scenario.since_timestamp = last_time; + scenario.flags |= STACKSHOT_COLLECT_DELTA_SNAPSHOT; + } + if (options & SHOULD_REUSE_SIZE_HINT) { + scenario.size_hint = size_hint; + } + } + + dt_stat_finalize(duration); + dt_stat_finalize(size); +} + +T_DECL(perf_no_size_hint, "test stackshot performance with no size hint", + T_META_TAG_PERF) +{ + stackshot_perf(0); +} + +T_DECL(perf_size_hint, "test stackshot performance with size hint", + T_META_TAG_PERF) +{ + stackshot_perf(SHOULD_REUSE_SIZE_HINT); +} + +T_DECL(perf_process, "test stackshot performance targeted at process", + T_META_TAG_PERF) +{ + stackshot_perf(SHOULD_REUSE_SIZE_HINT | SHOULD_TARGET_SELF); +} + +T_DECL(perf_delta, "test delta stackshot performance", + T_META_TAG_PERF) +{ + stackshot_perf(SHOULD_REUSE_SIZE_HINT | SHOULD_USE_DELTA); +} + +T_DECL(perf_delta_process, "test delta stackshot performance targeted at a process", + T_META_TAG_PERF) +{ + stackshot_perf(SHOULD_REUSE_SIZE_HINT | SHOULD_USE_DELTA | SHOULD_TARGET_SELF); +} + +static uint64_t +stackshot_timestamp(void *ssbuf, size_t sslen) +{ + kcdata_iter_t iter = kcdata_iter(ssbuf, sslen); + + uint32_t type = kcdata_iter_type(iter); + if (type != KCDATA_BUFFER_BEGIN_STACKSHOT && type != KCDATA_BUFFER_BEGIN_DELTA_STACKSHOT) { + T_ASSERT_FAIL("invalid kcdata type %u", kcdata_iter_type(iter)); + } + + iter = kcdata_iter_find_type(iter, KCDATA_TYPE_MACH_ABSOLUTE_TIME); + T_QUIET; + T_ASSERT_TRUE(kcdata_iter_valid(iter), "timestamp found in stackshot"); + + return *(uint64_t *)kcdata_iter_payload(iter); +} + +#define TEST_THREAD_NAME "stackshot_test_thread" + +static void +parse_thread_group_stackshot(void **ssbuf, size_t sslen) +{ + bool seen_thread_group_snapshot = false; + kcdata_iter_t iter = kcdata_iter(ssbuf, sslen); + T_ASSERT_EQ(kcdata_iter_type(iter), KCDATA_BUFFER_BEGIN_STACKSHOT, + "buffer provided is a stackshot"); + + NSMutableSet *thread_groups = [[NSMutableSet alloc] init]; + + iter = kcdata_iter_next(iter); + KCDATA_ITER_FOREACH(iter) { + switch (kcdata_iter_type(iter)) { + case KCDATA_TYPE_ARRAY: { + T_QUIET; + T_ASSERT_TRUE(kcdata_iter_array_valid(iter), + "checked that array is valid"); + + if (kcdata_iter_array_elem_type(iter) != STACKSHOT_KCTYPE_THREAD_GROUP_SNAPSHOT) { + continue; + } + + seen_thread_group_snapshot = true; + + if (kcdata_iter_array_elem_size(iter) >= sizeof(struct thread_group_snapshot_v2)) { + struct thread_group_snapshot_v2 *tgs_array = kcdata_iter_payload(iter); + for (uint32_t j = 0; j < kcdata_iter_array_elem_count(iter); j++) { + struct thread_group_snapshot_v2 *tgs = tgs_array + j; + [thread_groups addObject:@(tgs->tgs_id)]; + } + + } + else { + struct thread_group_snapshot *tgs_array = kcdata_iter_payload(iter); + for (uint32_t j = 0; j < kcdata_iter_array_elem_count(iter); j++) { + struct thread_group_snapshot *tgs = tgs_array + j; + [thread_groups addObject:@(tgs->tgs_id)]; + } + } + break; + } + } + } + KCDATA_ITER_FOREACH(iter) { + NSError *error = nil; + + switch (kcdata_iter_type(iter)) { + + case KCDATA_TYPE_CONTAINER_BEGIN: { + T_QUIET; + T_ASSERT_TRUE(kcdata_iter_container_valid(iter), + "checked that container is valid"); + + if (kcdata_iter_container_type(iter) != STACKSHOT_KCCONTAINER_THREAD) { + break; + } + + NSDictionary *container = parseKCDataContainer(&iter, &error); + T_QUIET; T_ASSERT_NOTNULL(container, "parsed container from stackshot"); + T_QUIET; T_ASSERT_NULL(error, "error unset after parsing container"); + + int tg = [container[@"thread_snapshots"][@"thread_group"] intValue]; + + T_ASSERT_TRUE([thread_groups containsObject:@(tg)], "check that the thread group the thread is in exists"); + + break; + }; + + } + } + T_ASSERT_TRUE(seen_thread_group_snapshot, "check that we have seen a thread group snapshot"); +} + +static void +verify_stackshot_sharedcache_layout(struct dyld_uuid_info_64 *uuids, uint32_t uuid_count) +{ + uuid_t cur_shared_cache_uuid; + __block uint32_t lib_index = 0, libs_found = 0; + + _dyld_get_shared_cache_uuid(cur_shared_cache_uuid); + int result = dyld_shared_cache_iterate_text(cur_shared_cache_uuid, ^(const dyld_shared_cache_dylib_text_info* info) { + T_QUIET; T_ASSERT_LT(lib_index, uuid_count, "dyld_shared_cache_iterate_text exceeded number of libraries returned by kernel"); + + libs_found++; + struct dyld_uuid_info_64 *cur_stackshot_uuid_entry = &uuids[lib_index]; + T_QUIET; T_ASSERT_EQ(memcmp(info->dylibUuid, cur_stackshot_uuid_entry->imageUUID, sizeof(info->dylibUuid)), 0, + "dyld returned UUID doesn't match kernel returned UUID"); + T_QUIET; T_ASSERT_EQ(info->loadAddressUnslid, cur_stackshot_uuid_entry->imageLoadAddress, + "dyld returned load address doesn't match kernel returned load address"); + lib_index++; + }); + + T_ASSERT_EQ(result, 0, "iterate shared cache layout"); + T_ASSERT_EQ(libs_found, uuid_count, "dyld iterator returned same number of libraries as kernel"); + + T_LOG("verified %d libraries from dyld shared cache", libs_found); +} + +static void +parse_stackshot(uint64_t stackshot_parsing_flags, void *ssbuf, size_t sslen, int child_pid) +{ + bool delta = (stackshot_parsing_flags & PARSE_STACKSHOT_DELTA); + bool expect_zombie_child = (stackshot_parsing_flags & PARSE_STACKSHOT_ZOMBIE); + bool expect_shared_cache_layout = false; + bool expect_shared_cache_uuid = !delta; + bool expect_dispatch_queue_label = (stackshot_parsing_flags & PARSE_STACKSHOT_DISPATCH_QUEUE_LABEL); + bool expect_turnstile_lock = (stackshot_parsing_flags & PARSE_STACKSHOT_TURNSTILEINFO); + bool found_zombie_child = false, found_shared_cache_layout = false, found_shared_cache_uuid = false; + bool found_dispatch_queue_label = false, found_turnstile_lock = false; + + if (expect_shared_cache_uuid) { + uuid_t shared_cache_uuid; + if (!_dyld_get_shared_cache_uuid(shared_cache_uuid)) { + T_LOG("Skipping verifying shared cache UUID in stackshot data because not running with a shared cache"); + expect_shared_cache_uuid = false; + } + } + + if (stackshot_parsing_flags & PARSE_STACKSHOT_SHAREDCACHE_LAYOUT) { + size_t shared_cache_length = 0; + const void *cache_header = _dyld_get_shared_cache_range(&shared_cache_length); + T_QUIET; T_ASSERT_NOTNULL(cache_header, "current process running with shared cache"); + T_QUIET; T_ASSERT_GT(shared_cache_length, sizeof(struct _dyld_cache_header), "valid shared cache length populated by _dyld_get_shared_cache_range"); + + if (_dyld_shared_cache_is_locally_built()) { + T_LOG("device running with locally built shared cache, expect shared cache layout"); + expect_shared_cache_layout = true; + } else { + T_LOG("device running with B&I built shared-cache, no shared cache layout expected"); + } + } + + if (expect_zombie_child) { + T_QUIET; T_ASSERT_GT(child_pid, 0, "child pid greater than zero"); + } + + kcdata_iter_t iter = kcdata_iter(ssbuf, sslen); + if (delta) { + T_ASSERT_EQ(kcdata_iter_type(iter), KCDATA_BUFFER_BEGIN_DELTA_STACKSHOT, + "buffer provided is a delta stackshot"); + } else { + T_ASSERT_EQ(kcdata_iter_type(iter), KCDATA_BUFFER_BEGIN_STACKSHOT, + "buffer provided is a stackshot"); + } + + iter = kcdata_iter_next(iter); + KCDATA_ITER_FOREACH(iter) { + NSError *error = nil; + + switch (kcdata_iter_type(iter)) { + case KCDATA_TYPE_ARRAY: { + T_QUIET; + T_ASSERT_TRUE(kcdata_iter_array_valid(iter), + "checked that array is valid"); + + NSMutableDictionary *array = parseKCDataArray(iter, &error); + T_QUIET; T_ASSERT_NOTNULL(array, "parsed array from stackshot"); + T_QUIET; T_ASSERT_NULL(error, "error unset after parsing array"); + + if (kcdata_iter_array_elem_type(iter) == STACKSHOT_KCTYPE_SYS_SHAREDCACHE_LAYOUT) { + struct dyld_uuid_info_64 *shared_cache_uuids = kcdata_iter_payload(iter); + uint32_t uuid_count = kcdata_iter_array_elem_count(iter); + T_ASSERT_NOTNULL(shared_cache_uuids, "parsed shared cache layout array"); + T_ASSERT_GT(uuid_count, 0, "returned valid number of UUIDs from shared cache"); + verify_stackshot_sharedcache_layout(shared_cache_uuids, uuid_count); + found_shared_cache_layout = true; + } + + break; + } + + case KCDATA_TYPE_CONTAINER_BEGIN: { + T_QUIET; + T_ASSERT_TRUE(kcdata_iter_container_valid(iter), + "checked that container is valid"); + + if (kcdata_iter_container_type(iter) != STACKSHOT_KCCONTAINER_TASK) { + break; + } + + NSDictionary *container = parseKCDataContainer(&iter, &error); + T_QUIET; T_ASSERT_NOTNULL(container, "parsed container from stackshot"); + T_QUIET; T_ASSERT_NULL(error, "error unset after parsing container"); + + if (expect_dispatch_queue_label && !found_dispatch_queue_label) { + for (id thread_key in container[@"task_snapshots"][@"thread_snapshots"]) { + NSMutableDictionary *thread = container[@"task_snapshots"][@"thread_snapshots"][thread_key]; + NSString *dql = thread[@"dispatch_queue_label"]; + + if ([dql isEqualToString:@TEST_STACKSHOT_QUEUE_LABEL]) { + found_dispatch_queue_label = true; + break; + } + } + } + + int pid = [container[@"task_snapshots"][@"task_snapshot"][@"ts_pid"] intValue]; + if (expect_zombie_child && (pid == child_pid)) { + found_zombie_child = true; + + uint64_t task_flags = [container[@"task_snapshots"][@"task_snapshot"][@"ts_ss_flags"] unsignedLongLongValue]; + T_ASSERT_TRUE((task_flags & kTerminatedSnapshot) == kTerminatedSnapshot, "child zombie marked as terminated"); + + continue; + } else if (pid != getpid()) { + break; + } + + T_EXPECT_EQ_STR(current_process_name(), + [container[@"task_snapshots"][@"task_snapshot"][@"ts_p_comm"] UTF8String], + "current process name matches in stackshot"); + + uint64_t task_flags = [container[@"task_snapshots"][@"task_snapshot"][@"ts_ss_flags"] unsignedLongLongValue]; + T_ASSERT_FALSE((task_flags & kTerminatedSnapshot) == kTerminatedSnapshot, "current process not marked as terminated"); + + T_QUIET; + T_EXPECT_LE(pid, [container[@"task_snapshots"][@"task_snapshot"][@"ts_unique_pid"] intValue], + "unique pid is greater than pid"); + + bool found_main_thread = false; + uint64_t main_thread_id = -1; + for (id thread_key in container[@"task_snapshots"][@"thread_snapshots"]) { + NSMutableDictionary *thread = container[@"task_snapshots"][@"thread_snapshots"][thread_key]; + NSDictionary *thread_snap = thread[@"thread_snapshot"]; + + T_QUIET; T_EXPECT_GT([thread_snap[@"ths_thread_id"] intValue], 0, + "thread ID of thread in current task is valid"); + T_QUIET; T_EXPECT_GT([thread_snap[@"ths_base_priority"] intValue], 0, + "base priority of thread in current task is valid"); + T_QUIET; T_EXPECT_GT([thread_snap[@"ths_sched_priority"] intValue], 0, + "scheduling priority of thread in current task is valid"); + + NSString *pth_name = thread[@"pth_name"]; + if (pth_name != nil && [pth_name isEqualToString:@TEST_THREAD_NAME]) { + found_main_thread = true; + main_thread_id = [thread_snap[@"ths_thread_id"] intValue]; + + T_QUIET; T_EXPECT_GT([thread_snap[@"ths_total_syscalls"] intValue], 0, + "total syscalls of current thread is valid"); + + NSDictionary *cpu_times = thread[@"cpu_times"]; + T_EXPECT_GE([cpu_times[@"runnable_time"] intValue], + [cpu_times[@"system_time"] intValue] + + [cpu_times[@"user_time"] intValue], + "runnable time of current thread is valid"); + } + } + T_EXPECT_TRUE(found_main_thread, "found main thread for current task in stackshot"); + + if (expect_turnstile_lock && !found_turnstile_lock) { + NSArray *tsinfos = container[@"task_snapshots"][@"thread_turnstileinfo"]; + + for (id i in tsinfos) { + if ([i[@"turnstile_context"] intValue] == main_thread_id) { + found_turnstile_lock = true; + break; + } + } + } + break; + } + case STACKSHOT_KCTYPE_SHAREDCACHE_LOADINFO: { + struct dyld_uuid_info_64_v2 *shared_cache_info = kcdata_iter_payload(iter); + uuid_t shared_cache_uuid; + T_QUIET; T_ASSERT_TRUE(_dyld_get_shared_cache_uuid(shared_cache_uuid), "retrieve current shared cache UUID"); + T_QUIET; T_ASSERT_EQ(memcmp(shared_cache_info->imageUUID, shared_cache_uuid, sizeof(shared_cache_uuid)), 0, + "dyld returned UUID doesn't match kernel returned UUID for system shared cache"); + found_shared_cache_uuid = true; + break; + } + } + } + + if (expect_zombie_child) { + T_QUIET; T_ASSERT_TRUE(found_zombie_child, "found zombie child in kcdata"); + } + + if (expect_shared_cache_layout) { + T_QUIET; T_ASSERT_TRUE(found_shared_cache_layout, "shared cache layout found in kcdata"); + } + + if (expect_shared_cache_uuid) { + T_QUIET; T_ASSERT_TRUE(found_shared_cache_uuid, "shared cache UUID found in kcdata"); + } + + if (expect_dispatch_queue_label) { + T_QUIET; T_ASSERT_TRUE(found_dispatch_queue_label, "dispatch queue label found in kcdata"); + } + + if (expect_turnstile_lock) { + T_QUIET; T_ASSERT_TRUE(found_turnstile_lock, "found expected deadlock"); + } + + T_ASSERT_FALSE(KCDATA_ITER_FOREACH_FAILED(iter), "successfully iterated kcdata"); +} + +static const char * +current_process_name(void) +{ + static char name[64]; + + if (!name[0]) { + int ret = proc_name(getpid(), name, sizeof(name)); + T_QUIET; + T_ASSERT_POSIX_SUCCESS(ret, "proc_name failed for current process"); + } + + return name; +} + +static void +initialize_thread(void) +{ + int ret = pthread_setname_np(TEST_THREAD_NAME); + T_QUIET; + T_ASSERT_POSIX_ZERO(ret, "set thread name to %s", TEST_THREAD_NAME); +} diff --git a/tests/suspended_spawn_26184412.c b/tests/suspended_spawn_26184412.c index 6a8977bb5..976026d96 100644 --- a/tests/suspended_spawn_26184412.c +++ b/tests/suspended_spawn_26184412.c @@ -14,6 +14,8 @@ #include #include +T_GLOBAL_META(T_META_RUN_CONCURRENTLY(true)); + /* * Test to validate that suspended-spawn DTRTs when a SIGKILL is recieved * while the process is waiting for SIGCONT. diff --git a/tests/sysctl_get_owned_vmobjects.c b/tests/sysctl_get_owned_vmobjects.c new file mode 100644 index 000000000..f1a1ffbce --- /dev/null +++ b/tests/sysctl_get_owned_vmobjects.c @@ -0,0 +1,140 @@ +#include +#include +#include +#include +#include +#include +#include +#include + +static const char* g_sysctl_name = "vm.get_owned_vmobjects"; + +static void +main_test(void) +{ + int ret; + mach_port_name_t task_name; + vmobject_list_output_t out_buffer; + size_t out_size; + size_t output_size; + const vm_size_t tmp_size = 16 * 1024 * 1024; /* arbitrary size */ + vm_address_t tmp_buf; + vm_address_t tmp_buf2; + mach_vm_size_t addr_size; + mach_vm_address_t addr; + kern_return_t kr; + mach_port_t __self = mach_task_self(); + vm_region_submap_info_data_64_t regionInfo; + uint32_t nestingDepth; + mach_msg_type_number_t count; + + /* allocate a temporary buffer */ + kr = vm_allocate(__self, &tmp_buf, tmp_size, VM_FLAGS_ANYWHERE | VM_FLAGS_PURGABLE); + T_QUIET; + T_EXPECT_EQ(kr, KERN_SUCCESS, "vm_allocate(%zu) error 0x%x (%s)", + tmp_size, kr, mach_error_string(kr)); + T_QUIET; + T_EXPECT_NE(tmp_buf, 0UL, "failed to allocate temporary purgable buffer\n"); + + kr = vm_allocate(__self, &tmp_buf2, tmp_size, VM_FLAGS_ANYWHERE | VM_FLAGS_PURGABLE); + T_QUIET; + T_EXPECT_EQ(kr, KERN_SUCCESS, "vm_allocate(%zu) error 0x%x (%s)", + tmp_size, kr, mach_error_string(kr)); + T_QUIET; + T_EXPECT_NE(tmp_buf2, 0UL, "failed to allocate temporary purgable buffer\n"); + + /* expected failures */ + out_size = tmp_size; + ret = sysctlbyname(g_sysctl_name, NULL, 0, NULL, 0); + T_EXPECT_EQ(ret, -1, "expected failure with 0 parameters\n"); + T_EXPECT_EQ(errno, EINVAL, "expected EINVAL with 0 parameters\n"); + + ret = sysctlbyname(g_sysctl_name, (void*) tmp_buf, &out_size, NULL, 0); + T_EXPECT_EQ(ret, -1, "expected failure with no new parameters\n"); + T_EXPECT_EQ(errno, EINVAL, "expected EINVAL with 0 new parameters\n"); + + out_size = tmp_size; + ret = sysctlbyname(g_sysctl_name, NULL, 0, (void*) tmp_buf, out_size); + T_EXPECT_EQ(ret, -1, "expected failure with no old parameters\n"); + T_EXPECT_EQ(errno, EINVAL, "expected EINVAL with 0 old parameters\n"); + + task_name = MACH_PORT_NULL; + ret = sysctlbyname(g_sysctl_name, (void*) tmp_buf, &out_size, &task_name, sizeof(task_name)); + T_EXPECT_EQ(ret, -1, "expected failure with task_name == MACH_PORT_NULL in new parameters\n"); + T_EXPECT_EQ(errno, ESRCH, "expected ESRCH with invalid task port name\n"); + + /* we should get the number of entries we should allocate for */ + out_size = 0; + output_size = 0; + task_name = mach_task_self(); + ret = sysctlbyname(g_sysctl_name, NULL, &out_size, &task_name, sizeof(task_name)); + T_QUIET; + T_EXPECT_EQ(ret, 0, "failed getting the number of entries\n"); + T_EXPECT_EQ(out_size, 2 * sizeof(vm_object_query_data_t) + sizeof(int64_t), "expeccted one entry\n"); + + /* calculcate and allocate the proper sized output buffer */ + output_size = out_size; + out_buffer = (vmobject_list_output_t)calloc(output_size, 1); + T_QUIET; + T_EXPECT_NE(out_buffer, NULL, "failed to allocate the output buffer for sysctlbyname\n"); + + /* get the truncated list for the current process */ + memset(out_buffer, 0, output_size); + out_size = 1 * sizeof(vm_object_query_data_t) + sizeof(int64_t); + ret = sysctlbyname(g_sysctl_name, out_buffer, &out_size, &task_name, sizeof(task_name)); + + T_QUIET; + T_EXPECT_EQ(ret, 0, "sysctlbyname failed\n"); + T_EXPECT_EQ(out_size, 1 * sizeof(vm_object_query_data_t) + sizeof(int64_t), "sysctl return size is incorrect\n"); + T_EXPECT_EQ(out_buffer->entries, 1ULL, "should have 1 vm object\n"); + T_EXPECT_NE(out_buffer->data[0].object_id, 0ULL, "vm_object_id should not be 0\n"); + + /* get the list for the current process */ + out_size = output_size; + memset(out_buffer, 0, output_size); + ret = sysctlbyname(g_sysctl_name, out_buffer, &out_size, &task_name, sizeof(task_name)); + + T_QUIET; + T_EXPECT_EQ(ret, 0, "sysctlbyname failed\n"); + T_EXPECT_EQ(out_size, 2 * sizeof(vm_object_query_data_t) + sizeof(int64_t), "sysctl return size is incorrect\n"); + T_EXPECT_EQ(out_buffer->entries, 2ULL, "should have 2 vm objects\n"); + T_EXPECT_NE(out_buffer->data[0].object_id, 0ULL, "vm_object_id should not be 0\n"); + + addr = tmp_buf; + addr_size = tmp_size; + nestingDepth = UINT_MAX; + count = VM_REGION_SUBMAP_INFO_V2_COUNT_64; + kr = mach_vm_region_recurse(__self, &addr, &addr_size, &nestingDepth, (vm_region_info_t)®ionInfo, &count); + T_QUIET; + T_EXPECT_EQ(kr, KERN_SUCCESS, "mach_vm_region_recurse(%zu) error 0x%x (%s)\n", + tmp_size, kr, mach_error_string(kr)); + T_EXPECT_EQ(regionInfo.object_id_full, out_buffer->data[0].object_id, "object_id_full does not match out_buffer->object[0]\n"); + + addr = tmp_buf2; + addr_size = tmp_size; + nestingDepth = UINT_MAX; + count = VM_REGION_SUBMAP_INFO_V2_COUNT_64; + kr = mach_vm_region_recurse(__self, &addr, &addr_size, &nestingDepth, (vm_region_info_t)®ionInfo, &count); + T_QUIET; + T_EXPECT_EQ(kr, KERN_SUCCESS, "mach_vm_region_recurse(%zu) error 0x%x (%s)\n", + tmp_size, kr, mach_error_string(kr)); + T_EXPECT_EQ(regionInfo.object_id_full, out_buffer->data[1].object_id, "object_id_full does not match out_buffer->object[1]\n"); + + kr = vm_deallocate(__self, tmp_buf, tmp_size); + T_QUIET; + T_EXPECT_EQ(kr, KERN_SUCCESS, "vm_deallocate(%zu) error 0x%x (%s)\n", + tmp_size, kr, mach_error_string(kr)); + + kr = vm_deallocate(__self, tmp_buf2, tmp_size); + T_QUIET; + T_EXPECT_EQ(kr, KERN_SUCCESS, "vm_deallocate(%zu) error 0x%x (%s)\n", + tmp_size, kr, mach_error_string(kr)); + + free(out_buffer); + out_buffer = NULL; +} + +T_DECL(test_get_vmobject_list, "Get owned vm_objects for process") +{ + main_test(); +} diff --git a/tests/task_info.c b/tests/task_info.c index 74ab31f23..a40a5d569 100644 --- a/tests/task_info.c +++ b/tests/task_info.c @@ -13,6 +13,8 @@ #include #include +T_GLOBAL_META(T_META_RUN_CONCURRENTLY(true)); + /* ************************************************************************************* * Test the task_info API. * @@ -163,6 +165,42 @@ T_DECL(task_vm_info, "tests task vm info", T_META_ASROOT(true), T_META_LTEPHASE( "task_info --rev2 call returned value 0x%llx for vm_info.max_address. Expected anything other than 0x%llx since " "this value should be modified by rev2", vm_info.max_address, CANARY); + + /* + * Test the REV4 version of TASK_VM_INFO. + */ + + count = TASK_VM_INFO_REV4_COUNT; + vm_info.phys_footprint = TESTPHYSFOOTPRINTVAL; + vm_info.min_address = CANARY; + vm_info.max_address = CANARY; + vm_info.limit_bytes_remaining = CANARY; + + err = task_info(mach_task_self(), TASK_VM_INFO_PURGEABLE, (task_info_t)&vm_info, &count); + + T_ASSERT_MACH_SUCCESS(err, "verify task_info call succeeded"); + + T_EXPECT_EQ(count, TASK_VM_INFO_REV4_COUNT, "task_info count(%d) is equal to TASK_VM_INFO_REV4_COUNT\n", count); + + T_EXPECT_NE(vm_info.phys_footprint, (unsigned long long)TESTPHYSFOOTPRINTVAL, + "task_info --rev4 call returned value %llu for vm_info.phys_footprint. Expected anything other than %u since this " + "value should be modified by rev4", + vm_info.phys_footprint, TESTPHYSFOOTPRINTVAL); + + T_EXPECT_NE(vm_info.min_address, CANARY, + "task_info --rev4 call returned value 0x%llx for vm_info.min_address. Expected anything other than 0x%llx since " + "this value should be modified by rev4", + vm_info.min_address, CANARY); + + T_EXPECT_NE(vm_info.max_address, CANARY, + "task_info --rev4 call returned value 0x%llx for vm_info.max_address. Expected anything other than 0x%llx since " + "this value should be modified by rev4", + vm_info.max_address, CANARY); + + T_EXPECT_NE(vm_info.limit_bytes_remaining, CANARY, + "task_info --rev4 call returned value 0x%llx for vm_info.limit_bytes_remaining. Expected anything other than 0x%llx since " + "this value should be modified by rev4", + vm_info.limit_bytes_remaining, CANARY); } T_DECL(host_debug_info, "tests host debug info", T_META_ASROOT(true), T_META_LTEPHASE(LTE_POSTINIT)) diff --git a/tests/task_info_28439149.c b/tests/task_info_28439149.c index f56e38da5..a5872b27d 100644 --- a/tests/task_info_28439149.c +++ b/tests/task_info_28439149.c @@ -12,6 +12,8 @@ #include #include +T_GLOBAL_META(T_META_RUN_CONCURRENTLY(true)); + static void do_child(int *pipefd) { diff --git a/tests/task_inspect.c b/tests/task_inspect.c index 5b3dff783..b9fbe2ee7 100644 --- a/tests/task_inspect.c +++ b/tests/task_inspect.c @@ -13,7 +13,8 @@ #include #include -T_GLOBAL_META(T_META_NAMESPACE("xnu.ipc")); +T_GLOBAL_META(T_META_NAMESPACE("xnu.ipc"), + T_META_RUN_CONCURRENTLY(true)); /* * Attempt to inspect kernel_task using a task_inspect_t. Interact with the diff --git a/tests/task_vm_info_decompressions.c b/tests/task_vm_info_decompressions.c new file mode 100644 index 000000000..281f9677d --- /dev/null +++ b/tests/task_vm_info_decompressions.c @@ -0,0 +1,230 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#define KB 1024 +#define MALLOC_SIZE_PER_THREAD (64 * KB) +#define freezer_path "/usr/local/bin/freeze" + +/* BridgeOS could spend more time execv freezer */ +#if TARGET_OS_BRIDGE +static int timeout = 600; +#else +static int timeout = 120; +#endif + +static _Atomic int thread_malloc_count = 0; +static _Atomic int thread_thawed_count = 0; +static _Atomic int phase = 0; + +struct thread_args { + int id; +}; + +static void +freeze_pid(pid_t pid) +{ + char pid_str[6]; + char *args[3]; + pid_t child_pid; + int status; + + sprintf(pid_str, "%d", pid); + child_pid = fork(); + if (child_pid == 0) { + /* Launch freezer */ + args[0] = freezer_path; + args[1] = pid_str; + args[2] = NULL; + execv(freezer_path, args); + /* execve() does not return on success */ + perror("execve"); + T_FAIL("execve() failed"); + } + + /* Wait for freezer to complete */ + T_LOG("Waiting for freezer %d to complete", child_pid); + while (0 == waitpid(child_pid, &status, WNOHANG)) { + if (timeout < 0) { + kill(child_pid, SIGKILL); + T_FAIL("Freezer took too long to freeze the test"); + } + sleep(1); + timeout--; + } + if (WIFEXITED(status) != 1 || WEXITSTATUS(status) != 0) { + T_FAIL("Freezer error'd out"); + } +} +static void * +worker_thread_function(void *args) +{ + struct thread_args *targs = args; + int thread_id = targs->id; + char *array; + + /* Allocate memory */ + array = malloc(MALLOC_SIZE_PER_THREAD); + T_EXPECT_NOTNULL(array, "thread %d allocated heap memory to be dirtied", thread_id); + + /* Waiting for phase 1 (touch pages) to start */ + while (atomic_load(&phase) != 1) { + ; + } + + /* Phase 1: touch pages */ + T_LOG("thread %d phase 1: dirtying %d heap pages (%d bytes)", thread_id, MALLOC_SIZE_PER_THREAD / (int)PAGE_SIZE, MALLOC_SIZE_PER_THREAD); + memset(&array[0], 1, MALLOC_SIZE_PER_THREAD); + atomic_fetch_add(&thread_malloc_count, 1); + + /* Wait for process to be frozen */ + while (atomic_load(&phase) != 2) { + ; + } + + /* Phase 2, process thawed, trigger decompressions by re-faulting pages */ + T_LOG("thread %d phase 2: faulting pages back in to trigger decompressions", thread_id); + memset(&array[0], 1, MALLOC_SIZE_PER_THREAD); + + /* Main thread will retrieve vm statistics once all threads are thawed */ + atomic_fetch_add(&thread_thawed_count, 1); + + free(array); + + +#if 0 /* Test if the thread's decompressions counter was added to the task decompressions counter when a thread terminates */ + if (thread_id < 2) { + sleep(10); + } +#endif + + return NULL; +} + +static pthread_t* +create_threads(int nthreads, pthread_t *threads, struct thread_args *targs) +{ + int i; + int err; + pthread_attr_t attr; + + err = pthread_attr_init(&attr); + T_ASSERT_POSIX_ZERO(err, "pthread_attr_init"); + for (i = 0; i < nthreads; i++) { + targs[i].id = i; + err = pthread_create(&threads[i], &attr, worker_thread_function, (void*)&targs[i]); + T_QUIET; T_ASSERT_POSIX_ZERO(err, "pthread_create"); + } + + return threads; +} + +static void +join_threads(int nthreads, pthread_t *threads) +{ + int i; + int err; + + for (i = 0; i < nthreads; i++) { + err = pthread_join(threads[i], NULL); + T_QUIET; T_ASSERT_POSIX_ZERO(err, "pthread_join"); + } +} + +T_DECL(task_vm_info_decompressions, + "Test multithreaded per-task decompressions counter") +{ + int err; + int ncpu; + size_t ncpu_size = sizeof(ncpu); + int npages; + int compressor_mode; + size_t compressor_mode_size = sizeof(compressor_mode); + task_vm_info_data_t vm_info; + mach_msg_type_number_t count; + pthread_t *threads; + struct thread_args *targs; + + T_SETUPBEGIN; + + /* Make sure freezer is enabled on target machine */ + err = sysctlbyname("vm.compressor_mode", &compressor_mode, &compressor_mode_size, NULL, 0); + if (compressor_mode < 8) { + T_SKIP("This test requires freezer which is not available on the testing platform (vm.compressor_mode is set to %d)", compressor_mode); + } +#if TARGET_OS_BRIDGE + T_SKIP("This test requires freezer which is not available on bridgeOS (vm.compressor_mode is set to %d)", compressor_mode); +#endif + + /* Set number of threads to ncpu available on testing device */ + err = sysctlbyname("hw.ncpu", &ncpu, &ncpu_size, NULL, 0); + T_EXPECT_EQ_INT(0, err, "Detected %d cpus\n", ncpu); + + /* Set total number of pages to be frozen */ + npages = ncpu * MALLOC_SIZE_PER_THREAD / (int)PAGE_SIZE; + T_LOG("Test will be freezing at least %d heap pages\n", npages); + + /* Change state to freezable */ + err = memorystatus_control(MEMORYSTATUS_CMD_SET_PROCESS_IS_FREEZABLE, getpid(), (uint32_t)1, NULL, 0); + T_EXPECT_EQ(KERN_SUCCESS, err, "set pid %d to be freezable", getpid()); + + /* Call into kernel to retrieve vm_info and make sure we do not have any decompressions before the test */ + count = TASK_VM_INFO_COUNT; + err = task_info(mach_task_self(), TASK_VM_INFO, (task_info_t)&vm_info, &count); + T_EXPECT_EQ(count, TASK_VM_INFO_COUNT, "count == TASK_VM_INFO_COUNT: %d", count); + T_EXPECT_EQ_INT(0, err, "task_info(TASK_VM_INFO) returned 0"); + T_EXPECT_EQ_INT(0, vm_info.decompressions, "Expected 0 decompressions before test starts"); + + /* Thread data */ + threads = malloc(sizeof(pthread_t) * (size_t)ncpu); + targs = malloc(sizeof(struct thread_args) * (size_t)ncpu); + + T_SETUPEND; + + /* Phase 1: create threads to write to malloc memory */ + create_threads(ncpu, threads, targs); + atomic_fetch_add(&phase, 1); + + /* Wait for all threads to dirty their malloc pages */ + while (atomic_load(&thread_malloc_count) != ncpu) { + sleep(1); + } + T_EXPECT_EQ(ncpu, atomic_load(&thread_malloc_count), "%d threads finished writing to malloc pages\n", ncpu); + + /* Launch freezer to compress the dirty pages */ + T_LOG("Running freezer to compress pages for pid %d", getpid()); + freeze_pid(getpid()); + + /* Phase 2: triger decompression in threads */ + atomic_fetch_add(&phase, 1); + + /* Wait for all threads to decompress their malloc pages */ + while (atomic_load(&thread_thawed_count) != ncpu) { + sleep(1); + } + + /* Phase 3: Call into kernel to retrieve vm_info and to get the updated decompressions counter */ + count = TASK_VM_INFO_COUNT; + err = task_info(mach_task_self(), TASK_VM_INFO, (task_info_t)&vm_info, &count); + T_EXPECT_EQ(count, TASK_VM_INFO_COUNT, "count == TASK_VM_INFO_COUNT: %d", count); + T_EXPECT_EQ(0, err, "task_info(TASK_VM_INFO) returned 0"); + + /* Make sure this task has decompressed at least all of the dirtied memory */ + T_EXPECT_GE_INT(vm_info.decompressions, npages, "decompressed %d pages (>= heap pages: %d)", vm_info.decompressions, npages); + T_PASS("Correctly retrieve per-task decompressions stats"); + + /* Cleanup */ + join_threads(ncpu, threads); + free(threads); + free(targs); +} diff --git a/tests/telemetry.c b/tests/telemetry.c index 810dcf2d8..abf66285b 100644 --- a/tests/telemetry.c +++ b/tests/telemetry.c @@ -7,6 +7,7 @@ #include #include #include +#include #include enum telemetry_pmi { @@ -23,6 +24,26 @@ T_GLOBAL_META(T_META_NAMESPACE("xnu.debugging.telemetry"), extern int __telemetry(uint64_t cmd, uint64_t deadline, uint64_t interval, uint64_t leeway, uint64_t arg4, uint64_t arg5); +/* + * Microstackshots based on PMI are only supported on devices with monotonic + * support. + */ + +static void +skip_if_pmi_unsupported(void) +{ + int supported = 0; + int ret = sysctlbyname("kern.monotonic.supported", &supported, + &(size_t){ sizeof(supported), }, NULL, 0); + if (ret < 0) { + T_SKIP("monotonic sysctl generated an error: %d (%s)", errno, + strerror(errno)); + } + if (!supported) { + T_SKIP("monotonic must be supported for microstackshots"); + } +} + /* * Data Analytics (da) also has a microstackshot configuration -- set a PMI * cycle interval of 0 to force it to disable microstackshot on PMI. @@ -50,7 +71,11 @@ disable_da_microstackshots(void) CFNumberRef num = CFNumberCreate(NULL, kCFNumberSInt64Type, &zero); set_da_microstackshot_period(num); T_LOG("notified da of tasking change, sleeping"); +#if TARGET_OS_WATCH + sleep(8); +#else /* TARGET_OS_WATCH */ sleep(3); +#endif /* !TARGET_OS_WATCH */ } /* @@ -68,8 +93,7 @@ reenable_da_microstackshots(void) static void telemetry_cleanup(void) { - int ret = __telemetry(TELEMETRY_CMD_PMI_SETUP, TELEMETRY_PMI_NONE, 0, 0, 0, 0); - T_EXPECT_POSIX_SUCCESS(ret, "telemetry(... NONE ...)"); + (void)__telemetry(TELEMETRY_CMD_PMI_SETUP, TELEMETRY_PMI_NONE, 0, 0, 0, 0); reenable_da_microstackshots(); } @@ -107,9 +131,7 @@ thread_spin(__unused void *arg) T_DECL(microstackshot_pmi, "attempt to configure microstackshots on PMI") { -#if TARGET_OS_WATCH - T_SKIP("unsupported platform"); -#endif /* TARGET_OS_WATCH */ + skip_if_pmi_unsupported(); T_SETUPBEGIN; ktrace_session_t s = ktrace_session_create(); @@ -122,6 +144,7 @@ T_DECL(microstackshot_pmi, "attempt to configure microstackshots on PMI") __block int interrupt_records = 0; __block int timer_arm_records = 0; __block int unknown_records = 0; + __block int empty_records = 0; ktrace_events_single(s, MT_MICROSTACKSHOT, ^(__unused struct trace_point *tp) { pmi_events++; @@ -141,6 +164,14 @@ T_DECL(microstackshot_pmi, "attempt to configure microstackshots on PMI") timer_arm_records++; } + if (start->arg2 == end->arg2) { + /* + * The buffer didn't grow for this record -- there was + * an error. + */ + empty_records++; + } + const uint8_t any_record = kPMIRecord | kIORecord | kInterruptRecord | kTimerArmingRecord; if ((start->arg1 & any_record) == 0) { @@ -158,8 +189,11 @@ T_DECL(microstackshot_pmi, "attempt to configure microstackshots on PMI") pmi_records / (double)SLEEP_SECS); T_EXPECT_EQ(unknown_records, 0, "saw zero unknown record events"); T_EXPECT_GT(microstackshot_record_events, 0, - "saw non-zero microstackshot record events (%g/sec)", + "saw non-zero microstackshot record events (%d -- %g/sec)", + microstackshot_record_events, microstackshot_record_events / (double)SLEEP_SECS); + T_EXPECT_NE(empty_records, microstackshot_record_events, + "saw non-empty records (%d empty)", empty_records); if (interrupt_records > 0) { T_LOG("saw %g interrupt records per second", @@ -216,6 +250,8 @@ T_DECL(microstackshot_pmi, "attempt to configure microstackshots on PMI") T_DECL(error_handling, "ensure that error conditions for the telemetry syscall are observed") { + skip_if_pmi_unsupported(); + telemetry_init(); int ret = __telemetry(TELEMETRY_CMD_PMI_SETUP, TELEMETRY_PMI_INSTRS, diff --git a/tests/testposixshm.c b/tests/testposixshm.c index e715b428d..72a34ad8e 100644 --- a/tests/testposixshm.c +++ b/tests/testposixshm.c @@ -9,6 +9,8 @@ #include +T_GLOBAL_META(T_META_RUN_CONCURRENTLY(true)); + static int nthreads = 0; static int fd; static _Atomic int phase = 0; diff --git a/tests/thread_group_set_32261625.c b/tests/thread_group_set_32261625.c index 1c7eb3f6c..507219204 100644 --- a/tests/thread_group_set_32261625.c +++ b/tests/thread_group_set_32261625.c @@ -2,6 +2,8 @@ #include #include +T_GLOBAL_META(T_META_RUN_CONCURRENTLY(true)); + #define TEST_EVENTID (0xfedcbb00) static void* diff --git a/tests/time.c b/tests/time.c new file mode 100644 index 000000000..178b4c6ae --- /dev/null +++ b/tests/time.c @@ -0,0 +1,99 @@ +#include +#include +#include +#include +#include +#include +#include + +T_GLOBAL_META(T_META_CHECK_LEAKS(false)); + +T_DECL(settimeofday, "check setting and getting time of day", + T_META_ASROOT(true)) +{ + struct timeval origtime = {}; + struct timezone origtz = {}; + int ret = gettimeofday(&origtime, &origtz); + T_ASSERT_POSIX_SUCCESS(ret, "get current time with gettimeofday(2)"); + +#if TARGET_OS_BRIDGE + /* + * bridgeOS is not allowed to set the time -- only the macOS side can. + */ + T_SKIP("bridgeOS is not allowed to call settimeofday(2)"); +#endif /* TARGET_OS_BRIDGE */ + + struct timeval newtime = {}; + newtime = origtime; + newtime.tv_sec -= 60; + ret = settimeofday(&newtime, NULL); + T_ASSERT_POSIX_SUCCESS(ret, + "set time back 60 seconds with settimeofday(2)"); + + ret = gettimeofday(&newtime, NULL); + T_ASSERT_POSIX_SUCCESS(ret, "get new time with gettimeofday(2)"); + + T_ASSERT_GT(origtime.tv_sec, newtime.tv_sec, + "new time should be before original time"); + + newtime = origtime; + newtime.tv_sec += 1; + ret = settimeofday(&newtime, NULL); + T_ASSERT_POSIX_SUCCESS(ret, + "set time close to original value with gettimeofday(2)"); +} + +static char tmppath[PATH_MAX] = ""; + +static void +cleanup_tmpfile(void) +{ + if (tmppath[0] != '\0') { + unlink(tmppath); + } +} + +static int +create_tmpfile(void) +{ + const char *tmpdir = getenv("TMPDIR"); + strlcat(tmppath, tmpdir ? tmpdir : "/tmp", sizeof(tmppath)); + strlcat(tmppath, "xnu_quick_test.XXXXX", sizeof(tmppath)); + int fd = mkstemp(tmppath); + T_ASSERT_POSIX_SUCCESS(fd, "created temporary file at %s", tmppath); + T_ATEND(cleanup_tmpfile); + return fd; +} + +T_DECL(futimes, "check that futimes updates file times", + T_META_RUN_CONCURRENTLY(true)) +{ + int tmpfd = create_tmpfile(); + + struct stat stbuf = {}; + int ret = fstat(tmpfd, &stbuf); + T_ASSERT_POSIX_SUCCESS(ret, "get file metadata with fstat(2)"); + struct timeval amtimes[2] = {}; + TIMESPEC_TO_TIMEVAL(&amtimes[0], &stbuf.st_atimespec); + TIMESPEC_TO_TIMEVAL(&amtimes[1], &stbuf.st_mtimespec); + + amtimes[0].tv_sec -= 120; + amtimes[1].tv_sec -= 120; + + ret = futimes(tmpfd, amtimes); + T_ASSERT_POSIX_SUCCESS(ret, "update file times with utimes(2)"); + + ret = fstat(tmpfd, &stbuf); + T_ASSERT_POSIX_SUCCESS(ret, "get file metadata after update with fstat(2)"); + struct timeval newamtimes[2] = {}; + TIMESPEC_TO_TIMEVAL(&newamtimes[0], &stbuf.st_atimespec); + TIMESPEC_TO_TIMEVAL(&newamtimes[1], &stbuf.st_mtimespec); + + /* + * Reading the metadata shouldn't count as an access. + */ + T_ASSERT_EQ(amtimes[0].tv_sec, newamtimes[0].tv_sec, + "access time matches what was set"); + T_ASSERT_EQ(amtimes[1].tv_sec, newamtimes[1].tv_sec, + "modification time matches what was set"); +} diff --git a/tests/turnstile_multihop.c b/tests/turnstile_multihop.c index 95dfadeb0..65fd2db07 100644 --- a/tests/turnstile_multihop.c +++ b/tests/turnstile_multihop.c @@ -34,6 +34,12 @@ T_GLOBAL_META(T_META_NAMESPACE("xnu.turnstile_multihop")); #define HELPER_TIMEOUT_SECS (3000) +struct test_msg { + mach_msg_header_t header; + mach_msg_body_t body; + mach_msg_port_descriptor_t port_descriptor; +}; + static boolean_t spin_for_ever = false; static void @@ -220,46 +226,106 @@ get_user_promotion_basepri(void) return thread_policy.thps_user_promotion_basepri; } -static int messages_received = 0; +#define LISTENER_WLID 0x100 +#define CONN_WLID 0x200 + +static uint32_t +register_port_options(void) +{ + return MACH_RCV_MSG | MACH_RCV_LARGE | MACH_RCV_LARGE_IDENTITY | + MACH_RCV_TRAILER_ELEMENTS(MACH_RCV_TRAILER_CTX) | + MACH_RCV_TRAILER_TYPE(MACH_MSG_TRAILER_FORMAT_0) | + MACH_RCV_VOUCHER; +} + +static void +register_port(uint64_t wlid, mach_port_t port) +{ + int r; + + struct kevent_qos_s kev = { + .ident = port, + .filter = EVFILT_MACHPORT, + .flags = EV_ADD | EV_UDATA_SPECIFIC | EV_DISPATCH | EV_VANISHED, + .fflags = register_port_options(), + .data = 1, + .qos = (int32_t)_pthread_qos_class_encode(QOS_CLASS_MAINTENANCE, 0, 0) + }; + + struct kevent_qos_s kev_err = { 0 }; + + /* Setup workloop for mach msg rcv */ + r = kevent_id(wlid, &kev, 1, &kev_err, 1, NULL, + NULL, KEVENT_FLAG_WORKLOOP | KEVENT_FLAG_ERROR_EVENTS); + + T_QUIET; T_ASSERT_POSIX_SUCCESS(r, "kevent_id"); + T_QUIET; T_ASSERT_EQ(r, 0, "no errors returned from kevent_id"); +} + /* * Basic WL handler callback, it checks the * effective Qos of the servicer thread. */ static void -workloop_cb_test_intransit(uint64_t *workloop_id __unused, void **eventslist __unused, int *events) +workloop_cb_test_intransit(uint64_t *workloop_id, void **eventslist, int *events) { - messages_received++; - T_LOG("Workloop handler workloop_cb_test_intransit called. Received message no %d", - messages_received); + static bool got_peer; + + struct kevent_qos_s *kev = eventslist[0]; + mach_msg_header_t *hdr; + struct test_msg *tmsg; + T_LOG("Workloop handler %s called. Received message on 0x%llx", + __func__, *workloop_id); /* Skip the test if we can't check Qos */ if (geteuid() != 0) { T_SKIP("kevent_qos test requires root privileges to run."); } - if (messages_received == 1) { - sleep(5); - T_LOG("Do some CPU work."); - do_work(5000); + T_QUIET; T_ASSERT_EQ(*events, 1, "should have one event"); + + hdr = (mach_msg_header_t *)kev->ext[0]; + T_ASSERT_NOTNULL(hdr, "has a message"); + T_ASSERT_EQ(hdr->msgh_size, (uint32_t)sizeof(struct test_msg), "of the right size"); + tmsg = (struct test_msg *)hdr; + + switch (*workloop_id) { + case LISTENER_WLID: + T_LOG("Registering peer connection"); + T_QUIET; T_ASSERT_FALSE(got_peer, "Should not have seen peer yet"); + got_peer = true; + break; + + case CONN_WLID: + T_LOG("Received message on peer"); + break; + + default: + T_FAIL("???"); + } + + sleep(5); + T_LOG("Do some CPU work."); + do_work(5000); - /* Check if the override now is IN + 60 boost */ - T_EXPECT_EFFECTIVE_QOS_EQ(QOS_CLASS_USER_INITIATED, - "dispatch_source event handler QoS should be QOS_CLASS_USER_INITIATED"); - T_EXPECT_EQ(get_user_promotion_basepri(), 60u, - "dispatch_source event handler should be overridden at 60"); + /* Check if the override now is IN + 60 boost */ + T_EXPECT_EFFECTIVE_QOS_EQ(QOS_CLASS_USER_INITIATED, + "dispatch_source event handler QoS should be QOS_CLASS_USER_INITIATED"); + T_EXPECT_EQ(get_user_promotion_basepri(), 60u, + "dispatch_source event handler should be overridden at 60"); + + if (*workloop_id == LISTENER_WLID) { + register_port(CONN_WLID, tmsg->port_descriptor.name); - /* Enable the knote to get 2nd message */ - struct kevent_qos_s *kev = *eventslist; kev->flags = EV_ADD | EV_ENABLE | EV_UDATA_SPECIFIC | EV_DISPATCH | EV_VANISHED; - kev->fflags = (MACH_RCV_MSG | MACH_RCV_LARGE | MACH_RCV_LARGE_IDENTITY | - MACH_RCV_TRAILER_ELEMENTS(MACH_RCV_TRAILER_CTX) | - MACH_RCV_TRAILER_TYPE(MACH_MSG_TRAILER_FORMAT_0) | - MACH_RCV_VOUCHER); + kev->fflags = register_port_options(); + kev->ext[0] = kev->ext[1] = kev->ext[2] = kev->ext[3] = 0; *events = 1; } else { + /* this will unblock the waiter */ + mach_msg_destroy(hdr); *events = 0; - exit(0); } } @@ -331,11 +397,7 @@ send( { kern_return_t ret = 0; - struct { - mach_msg_header_t header; - mach_msg_body_t body; - mach_msg_port_descriptor_t port_descriptor; - } send_msg = { + struct test_msg send_msg = { .header = { .msgh_remote_port = send_port, .msgh_local_port = reply_port, @@ -598,7 +660,7 @@ thread_at_sixty(void *arg __unused) T_QUIET; T_LOG("The time for priority 60 thread to acquire lock was %llu \n", (after_lock_time - before_lock_time)); - exit(0); + T_END; } static void * @@ -669,35 +731,44 @@ thread_at_default(void *arg __unused) static void * thread_at_maintenance(void *arg __unused) { - mach_port_t qos_send_port; + mach_port_t service_port; + mach_port_t conn_port; mach_port_t special_reply_port; + mach_port_options_t opts = { + .flags = MPO_INSERT_SEND_RIGHT, + }; main_thread_port = mach_thread_self(); set_thread_name(__FUNCTION__); kern_return_t kr = bootstrap_look_up(bootstrap_port, - TURNSTILE_MULTIHOP_SERVICE_NAME, &qos_send_port); + TURNSTILE_MULTIHOP_SERVICE_NAME, &service_port); T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "client bootstrap_look_up"); + kr = mach_port_construct(mach_task_self(), &opts, 0ull, &conn_port); + T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "mach_port_construct"); + special_reply_port = thread_get_special_reply_port(); T_QUIET; T_ASSERT_TRUE(MACH_PORT_VALID(special_reply_port), "get_thread_special_reply_port"); /* Become the dispatch sync owner, dispatch_sync_owner will be set in dispatch_sync_wait function */ - /* Send an async message */ - send(qos_send_port, MACH_PORT_NULL, MACH_PORT_NULL, + /* Send a sync message */ + send(conn_port, special_reply_port, MACH_PORT_NULL, (uint32_t)_pthread_qos_class_encode(QOS_CLASS_MAINTENANCE, 0, 0), 0); - /* Send a sync message */ - send(qos_send_port, special_reply_port, MACH_PORT_NULL, + /* Send an async checkin message */ + send(service_port, MACH_PORT_NULL, conn_port, (uint32_t)_pthread_qos_class_encode(QOS_CLASS_MAINTENANCE, 0, 0), 0); /* Create a new thread at QOS_CLASS_DEFAULT qos */ thread_create_at_qos(QOS_CLASS_DEFAULT, thread_at_default); /* Block on Sync IPC */ - receive(special_reply_port, qos_send_port); + receive(special_reply_port, service_port); + + T_LOG("received reply"); dispatch_sync_cancel(def_thread_port, QOS_CLASS_DEFAULT); return NULL; @@ -706,19 +777,8 @@ thread_at_maintenance(void *arg __unused) T_HELPER_DECL(three_ulock_sync_ipc_hop, "Create chain of 4 threads with 3 ulocks and 1 sync IPC at different qos") { - dt_stat_time_t roundtrip_stat = dt_stat_time_create("multihop_lock_acquire"); - - T_STAT_MEASURE_LOOP(roundtrip_stat) { - if (fork() == 0) { - thread_create_at_qos(QOS_CLASS_MAINTENANCE, thread_at_maintenance); - sigsuspend(0); - exit(0); - } - wait(NULL); - } - - dt_stat_finalize(roundtrip_stat); - T_END; + thread_create_at_qos(QOS_CLASS_MAINTENANCE, thread_at_maintenance); + sigsuspend(0); } static void @@ -744,41 +804,14 @@ thread_create_at_qos(qos_class_t qos, void * (*function)(void *)) #pragma mark Mach receive - kevent_qos -static void -expect_kevent_id_recv(mach_port_t port) +T_HELPER_DECL(server_kevent_id, + "Reply with the QoS that a dispatch source event handler ran with") { - int r; - T_QUIET; T_ASSERT_POSIX_ZERO(_pthread_workqueue_init_with_workloop( worker_cb, event_cb, (pthread_workqueue_function_workloop_t)workloop_cb_test_intransit, 0, 0), NULL); - struct kevent_qos_s kev[] = {{ - .ident = port, - .filter = EVFILT_MACHPORT, - .flags = EV_ADD | EV_UDATA_SPECIFIC | EV_DISPATCH | EV_VANISHED, - .fflags = (MACH_RCV_MSG | MACH_RCV_LARGE | MACH_RCV_LARGE_IDENTITY | - MACH_RCV_TRAILER_ELEMENTS(MACH_RCV_TRAILER_CTX) | - MACH_RCV_TRAILER_TYPE(MACH_MSG_TRAILER_FORMAT_0) | - MACH_RCV_VOUCHER), - .data = 1, - .qos = (int32_t)_pthread_qos_class_encode(QOS_CLASS_MAINTENANCE, 0, 0) - }}; - - struct kevent_qos_s kev_err[] = {{ 0 }}; - - /* Setup workloop for mach msg rcv */ - r = kevent_id(25, kev, 1, kev_err, 1, NULL, - NULL, KEVENT_FLAG_WORKLOOP | KEVENT_FLAG_ERROR_EVENTS); - - T_QUIET; T_ASSERT_POSIX_SUCCESS(r, "kevent_id"); - T_QUIET; T_ASSERT_EQ(r, 0, "no errors returned from kevent_id"); -} - -T_HELPER_DECL(server_kevent_id, - "Reply with the QoS that a dispatch source event handler ran with") -{ - expect_kevent_id_recv(get_server_port()); + register_port(LISTENER_WLID, get_server_port()); sigsuspend(0); T_ASSERT_FAIL("should receive a message"); } diff --git a/tests/turnstile_multihop_helper.h b/tests/turnstile_multihop_helper.h index 28b5becd8..8ba659d02 100644 --- a/tests/turnstile_multihop_helper.h +++ b/tests/turnstile_multihop_helper.h @@ -168,8 +168,7 @@ ull_unlock(lock_t *lock, int id, uint opcode, uint flags) if (prev == (ULL_WAITERS | ull_locked)) { /* locked with waiters */ - *lock = 0; - __c11_atomic_thread_fence(__ATOMIC_ACQ_REL); + __c11_atomic_store(lock, 0, __ATOMIC_SEQ_CST); if ((flags & ULF_WAKE_THREAD) && (_os_get_self() == main_thread_name)) { flags &= ~(uint)ULF_WAKE_THREAD; diff --git a/tests/turnstiles_test.c b/tests/turnstiles_test.c index 34b9667f3..64636f539 100644 --- a/tests/turnstiles_test.c +++ b/tests/turnstiles_test.c @@ -19,9 +19,10 @@ #include #include -#define SYSCTL_TURNSTILE_TEST_DEFAULT 1 -#define SYSCTL_TURNSTILE_TEST_GLOBAL_HASHTABLE 2 - +#define SYSCTL_TURNSTILE_TEST_USER_DEFAULT 1 +#define SYSCTL_TURNSTILE_TEST_USER_HASHTABLE 2 +#define SYSCTL_TURNSTILE_TEST_KERNEL_DEFAULT 3 +#define SYSCTL_TURNSTILE_TEST_KERNEL_HASHTABLE 4 T_GLOBAL_META(T_META_NAMESPACE("xnu.turnstiles_test")); @@ -48,7 +49,7 @@ thread_create_at_qos(qos_class_t qos, void * (*function)(void *), int type) } static int -get_pri(thread_t thread_port) +get_sched_pri(thread_t thread_port) { kern_return_t kr; @@ -61,6 +62,20 @@ get_pri(thread_t thread_port) return extended_info.pth_curpri; } +static int +get_base_pri(thread_t thread_port) +{ + kern_return_t kr; + + thread_extended_info_data_t extended_info; + mach_msg_type_number_t count = THREAD_EXTENDED_INFO_COUNT; + kr = thread_info(thread_port, THREAD_EXTENDED_INFO, + (thread_info_t)&extended_info, &count); + + T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "thread_info"); + return extended_info.pth_priority; +} + static void turnstile_prim_lock(int type) { @@ -68,7 +83,7 @@ turnstile_prim_lock(int type) uint64_t tid; int in_val = type; pthread_threadid_np(NULL, &tid); - T_LOG("sysctlbyname lock called from thread %llu \n", tid); + T_LOG("sysctlbyname lock type %d called from thread %llu \n", type, tid); ret = sysctlbyname("kern.turnstiles_test_lock", NULL, 0, &in_val, sizeof(in_val)); T_LOG("sysctlbyname lock returned from thread %llu with value %d \n", tid, ret); } @@ -80,15 +95,84 @@ turnstile_prim_unlock(int type) uint64_t tid; int in_val = type; pthread_threadid_np(NULL, &tid); - T_LOG("sysctlbyname unlock called from thread %llu \n", tid); + T_LOG("sysctlbyname unlock type %d called from thread %llu \n", type, tid); ret = sysctlbyname("kern.turnstiles_test_unlock", NULL, 0, &in_val, sizeof(in_val)); T_LOG("sysctlbyname unlock returned from thread %llu with value %d \n", tid, ret); } +struct thread_data { + int pri_to_set; + int lock1; + int lock2; + unsigned int sleep; + int sched_pri_to_check; + int base_pri_to_check; +}; + +static void * +chain_locking(void* args) +{ + struct thread_data* data = (struct thread_data*) args; + int policy, pri; + int ret; + struct sched_param param; + + /* Change our priority to pri_to_set */ + ret = pthread_getschedparam(pthread_self(), &policy, ¶m); + T_QUIET; T_ASSERT_MACH_SUCCESS(ret, "pthread_getschedparam"); + + param.sched_priority = data->pri_to_set; + + /* this sets both sched and base pri */ + ret = pthread_setschedparam(pthread_self(), policy, ¶m); + T_QUIET; T_ASSERT_MACH_SUCCESS(ret, "pthread_setschedparam"); + + pri = get_sched_pri(mach_thread_self()); + + T_ASSERT_EQ(pri, data->pri_to_set, "Priority before holding locks"); + + /* take lock1 */ + if (data->lock1) { + turnstile_prim_lock(data->lock1); + } + + /* take lock2 */ + if (data->lock2) { + turnstile_prim_lock(data->lock2); + } + + if (data->sleep) { + sleep(data->sleep); + } + + if (data->sched_pri_to_check) { + pri = get_sched_pri(mach_thread_self()); + T_ASSERT_EQ(pri, data->sched_pri_to_check, "Sched priority while holding locks"); + } + + if (data->base_pri_to_check) { + pri = get_base_pri(mach_thread_self()); + T_ASSERT_EQ(pri, data->base_pri_to_check, "Base priority while holding locks"); + } + + if (data->lock2) { + turnstile_prim_unlock(data->lock2); + } + + if (data->lock1) { + turnstile_prim_unlock(data->lock1); + } + + pri = get_sched_pri(mach_thread_self()); + T_ASSERT_EQ(pri, data->pri_to_set, "Priority after releasing locks"); + + return NULL; +} + static void * take_lock_check_priority(void * arg) { - int old_pri = get_pri(mach_thread_self()); + int old_pri = get_base_pri(mach_thread_self()); int unboosted_pri; int boosted_pri; int after_unlock_pri; @@ -102,20 +186,20 @@ take_lock_check_priority(void * arg) /* Take the test lock */ turnstile_prim_lock(type); - unboosted_pri = get_pri(mach_thread_self()); + unboosted_pri = get_base_pri(mach_thread_self()); T_ASSERT_EQ(unboosted_pri, 37, "thread(%llu) priority after acquiring the lock (uncontended) is %d\n", tid, unboosted_pri); sleep(8); /* Check for elevated priority */ - boosted_pri = get_pri(mach_thread_self()); + boosted_pri = get_base_pri(mach_thread_self()); T_ASSERT_EQ(boosted_pri, 47, "thread(%llu) priority after contention by 47 thread is %d\n", tid, boosted_pri); /* Drop the lock */ turnstile_prim_unlock(type); /* Check for regular priority */ - after_unlock_pri = get_pri(mach_thread_self()); + after_unlock_pri = get_base_pri(mach_thread_self()); T_ASSERT_EQ(after_unlock_pri, 37, "thread(%llu) priority after dropping lock is %d\n", tid, after_unlock_pri); return NULL; @@ -130,7 +214,7 @@ try_to_take_lock_and_unlock(void *arg) pthread_threadid_np(NULL, &tid); sleep(4); - int old_pri = get_pri(mach_thread_self()); + int old_pri = get_base_pri(mach_thread_self()); T_ASSERT_EQ(old_pri, 47, "thread(%llu) priority before acquiring the lock is %d\n", tid, old_pri); /* Try taking the test lock */ @@ -143,7 +227,7 @@ try_to_take_lock_and_unlock(void *arg) static void * take_lock_and_exit(void * arg) { - int old_pri = get_pri(mach_thread_self()); + int old_pri = get_base_pri(mach_thread_self()); int unboosted_pri; int boosted_pri; uint64_t tid; @@ -156,13 +240,13 @@ take_lock_and_exit(void * arg) /* Take the test lock */ turnstile_prim_lock(type); - unboosted_pri = get_pri(mach_thread_self()); + unboosted_pri = get_base_pri(mach_thread_self()); T_ASSERT_EQ(unboosted_pri, 37, "thread(%llu) priority after acquiring the lock (uncontended) is %d\n", tid, unboosted_pri); sleep(8); /* Check for elevated priority */ - boosted_pri = get_pri(mach_thread_self()); + boosted_pri = get_base_pri(mach_thread_self()); T_ASSERT_EQ(boosted_pri, 47, "thread(%llu) priority after contention by 47 thread is %d\n", tid, boosted_pri); /* return without unlocking the lock */ @@ -178,7 +262,7 @@ unlock_an_owner_exited_lock(void *arg) pthread_threadid_np(NULL, &tid); sleep(12); - int old_pri = get_pri(mach_thread_self()); + int old_pri = get_base_pri(mach_thread_self()); T_ASSERT_EQ(old_pri, 47, "thread(%llu) priority before acquiring the lock is %d\n", tid, old_pri); /* Unlock the test lock causing the turnstile code to call thread_deallocate_safe */ @@ -246,13 +330,166 @@ test3(int type) return; } -T_DECL(turnstile_test, "Turnstile test", T_META_ASROOT(YES)) +/* + * Test 4: test if a chain of user-space turnstile primitives followed by kernel primitives works correctly. + */ +static void +test4(void) { - test1(SYSCTL_TURNSTILE_TEST_DEFAULT); - test2(SYSCTL_TURNSTILE_TEST_DEFAULT); - test3(SYSCTL_TURNSTILE_TEST_DEFAULT); + pthread_t threads[5] = {}; + struct thread_data data[5] = {}; + + T_LOG("Test 4: test if a chain of user-space turnstile primitives followed by kernel primitives works correctly"); + + /* + * Chain: t4->ud->t3->uh->t2->kh->t1->kd->t0 + * ud and uh (user space turnstiles) will push base pri and sched pri + * kd and kh (kernel space turnstiles) will push sched pri + * sched pri should be propagated up to the end + * kh is the breaking point of the chain for sched pri + */ + + + /* Create a thread at priority 4 and take SYSCTL_TURNSTILE_TEST_KERNEL_DEFAULT lock */ + data[0].pri_to_set = 4; + data[0].lock1 = SYSCTL_TURNSTILE_TEST_KERNEL_DEFAULT; /* this should be not locked */ + data[0].lock2 = NULL; + data[0].sleep = 10; /* long sleep, nothing is blocking this thread */ + data[0].sched_pri_to_check = 60; + data[0].base_pri_to_check = 4; + pthread_create(&threads[0], NULL, chain_locking, (void *)&data[0]); + sleep(2); /* give the thread time to acquire the lock */ + + /* Create a thread at priority 31 and take SYSCTL_TURNSTILE_TEST_KERNEL_HASHTABLE lock followed by SYSCTL_TURNSTILE_TEST_KERNEL_DEFAULT */ + data[1].pri_to_set = 31; + data[1].lock1 = SYSCTL_TURNSTILE_TEST_KERNEL_HASHTABLE; /* this should be not locked */ + data[1].lock2 = SYSCTL_TURNSTILE_TEST_KERNEL_DEFAULT; /* this should be locked */ + data[1].sleep = 0; /* no need to sleep, everything should be pushing by the time it acquires the lock */ + data[1].sched_pri_to_check = 60; + data[1].base_pri_to_check = 31; + pthread_create(&threads[1], NULL, chain_locking, (void *)&data[1]); + sleep(2); /* give the thread time to acquire the lock */ + + /* Create a thread at priority 40 and take SYSCTL_TURNSTILE_TEST_USER_HASHTABLE lock followed by SYSCTL_TURNSTILE_TEST_KERNEL_HASHTABLE */ + data[2].pri_to_set = 40; + data[2].lock1 = SYSCTL_TURNSTILE_TEST_USER_HASHTABLE; /* this should be not locked */ + data[2].lock2 = SYSCTL_TURNSTILE_TEST_KERNEL_HASHTABLE; /* this should be locked */ + data[2].sleep = 0; /* no need to sleep, everything should be pushing by the time it acquires the lock */ + data[2].sched_pri_to_check = 60; + data[2].base_pri_to_check = 60; + pthread_create(&threads[2], NULL, chain_locking, (void *)&data[2]); + sleep(2); /* give the thread time to acquire the lock */ + + /* Create a thread at priority 47 and take SYSCTL_TURNSTILE_TEST_USER_DEFAULT lock followed by SYSCTL_TURNSTILE_TEST_USER_HASHTABLE */ + data[3].pri_to_set = 47; + data[3].lock1 = SYSCTL_TURNSTILE_TEST_USER_DEFAULT; /* this should be not locked */ + data[3].lock2 = SYSCTL_TURNSTILE_TEST_USER_HASHTABLE; /* this should be locked */ + data[3].sleep = 0; /* no need to sleep, everything should be pushing by the time it acquires the lock */ + data[3].sched_pri_to_check = 60; + data[3].base_pri_to_check = 60; + pthread_create(&threads[3], NULL, chain_locking, (void *)&data[3]); + sleep(2); /* give the thread time to acquire the lock */ + + /* Create a thread at priority 60 and take SYSCTL_TURNSTILE_TEST_USER_DEFAULT */ + data[4].pri_to_set = 60; + data[4].lock1 = SYSCTL_TURNSTILE_TEST_USER_DEFAULT; /* this should be locked */ + data[4].lock2 = NULL; + data[4].sleep = 0; /* no need to sleep, nothing should be pushing by the time it acquires the lock */ + data[4].sched_pri_to_check = 60; /* this is its own priority */ + data[4].base_pri_to_check = 60; + pthread_create(&threads[4], NULL, chain_locking, (void *)&data[4]); - test1(SYSCTL_TURNSTILE_TEST_GLOBAL_HASHTABLE); - test2(SYSCTL_TURNSTILE_TEST_GLOBAL_HASHTABLE); - test3(SYSCTL_TURNSTILE_TEST_GLOBAL_HASHTABLE); + sleep(16); + return; +} + +/* + * Test 5: test if a chain of user-space turnstile primitives interleaved by kernel primitives works correctly. + */ +static void +test5(void) +{ + pthread_t threads[5] = {}; + struct thread_data data[5] = {}; + + T_LOG("Test 5: test if a chain of user-space turnstile primitives interleaved by kernel primitives works correctly"); + + /* + * Chain: t4->ud->t3->kh->t2->uh->t1->kd->t0 + * ud and uh (user space turnstiles) will push base pri and sched pri + * kd and kh (kernel space turnstiles) will push sched pri + * uh is the breaking point of the chain for sched pri + */ + + /* Create a thread at priority 4 and take SYSCTL_TURNSTILE_TEST_KERNEL_DEFAULT lock */ + data[0].pri_to_set = 4; + data[0].lock1 = SYSCTL_TURNSTILE_TEST_KERNEL_DEFAULT; /* this should be not locked */ + data[0].lock2 = NULL; + data[0].sleep = 10; /* long sleep, nothing is blocking this thread */ + data[0].sched_pri_to_check = 41; + data[0].base_pri_to_check = 4; + pthread_create(&threads[0], NULL, chain_locking, (void *)&data[0]); + sleep(2); /* give the thread time to acquire the lock */ + + /* Create a thread at priority 31 and take SYSCTL_TURNSTILE_TEST_USER_HASHTABLE lock followed by SYSCTL_TURNSTILE_TEST_KERNEL_DEFAULT */ + data[1].pri_to_set = 31; + data[1].lock1 = SYSCTL_TURNSTILE_TEST_USER_HASHTABLE; /* this should be not locked */ + data[1].lock2 = SYSCTL_TURNSTILE_TEST_KERNEL_DEFAULT; /* this should be locked */ + data[1].sleep = 0; /* no need to sleep, everything should be pushing by the time it acquires the lock */ + data[1].sched_pri_to_check = 41; + data[1].base_pri_to_check = 41; + pthread_create(&threads[1], NULL, chain_locking, (void *)&data[1]); + sleep(2); /* give the thread time to acquire the lock */ + + /* Create a thread at priority 41 and take SYSCTL_TURNSTILE_TEST_KERNEL_HASHTABLE lock followed by SYSCTL_TURNSTILE_TEST_USER_HASHTABLE */ + data[2].pri_to_set = 41; + data[2].lock1 = SYSCTL_TURNSTILE_TEST_KERNEL_HASHTABLE; /* this should be not locked */ + data[2].lock2 = SYSCTL_TURNSTILE_TEST_USER_HASHTABLE; /* this should be locked */ + data[2].sleep = 0; /* no need to sleep, everything should be pushing by the time it acquires the lock */ + data[2].sched_pri_to_check = 60; + data[2].base_pri_to_check = 41; + pthread_create(&threads[2], NULL, chain_locking, (void *)&data[2]); + sleep(2); /* give the thread time to acquire the lock */ + + /* Create a thread at priority 47 and take SYSCTL_TURNSTILE_TEST_USER_DEFAULT lock followed by SYSCTL_TURNSTILE_TEST_KERNEL_HASHTABLE */ + data[3].pri_to_set = 47; + data[3].lock1 = SYSCTL_TURNSTILE_TEST_USER_DEFAULT; /* this should be not locked */ + data[3].lock2 = SYSCTL_TURNSTILE_TEST_KERNEL_HASHTABLE; /* this should be locked */ + data[3].sleep = 0; /* no need to sleep, everything should be pushing by the time it acquires the lock */ + data[3].sched_pri_to_check = 60; + data[3].base_pri_to_check = 60; + pthread_create(&threads[3], NULL, chain_locking, (void *)&data[3]); + sleep(2); /* give the thread time to acquire the lock */ + + /* Create a thread at priority 60 and take SYSCTL_TURNSTILE_TEST_USER_DEFAULT */ + data[4].pri_to_set = 60; + data[4].lock1 = SYSCTL_TURNSTILE_TEST_USER_DEFAULT; /* this should be locked */ + data[4].lock2 = NULL; + data[4].sleep = 0; /* no need to sleep, nothing should be pushing by the time it acquires the lock */ + data[4].sched_pri_to_check = 60; /* this is its own priority */ + data[4].base_pri_to_check = 60; + pthread_create(&threads[4], NULL, chain_locking, (void *)&data[4]); + + sleep(16); + return; +} + +T_DECL(turnstile_test, "Turnstile test", T_META_ASROOT(YES)) +{ + test1(SYSCTL_TURNSTILE_TEST_USER_DEFAULT); + test2(SYSCTL_TURNSTILE_TEST_USER_DEFAULT); + test3(SYSCTL_TURNSTILE_TEST_USER_DEFAULT); + + test1(SYSCTL_TURNSTILE_TEST_USER_HASHTABLE); + test2(SYSCTL_TURNSTILE_TEST_USER_HASHTABLE); + test3(SYSCTL_TURNSTILE_TEST_USER_HASHTABLE); + + /* + * rdar://problem/46302128 + * These tests are using a sysctl to lock a dummy kernel resource that uses turnstile. + * However a thread holding a kernel push from turnstile should never return in + * userspace, and rdar://problem/24194397 adds an assert for it. + */ + //test4(); + //test5(); } diff --git a/tests/utimensat.c b/tests/utimensat.c index c534bde51..be5d5121e 100644 --- a/tests/utimensat.c +++ b/tests/utimensat.c @@ -13,6 +13,8 @@ #include #include +T_GLOBAL_META(T_META_RUN_CONCURRENTLY(true)); + #define FILENAME "utimensat" static const struct timespec tptr[][2] = { diff --git a/tests/verify_kalloc_config.c b/tests/verify_kalloc_config.c index 64a9f6901..2c5c08727 100644 --- a/tests/verify_kalloc_config.c +++ b/tests/verify_kalloc_config.c @@ -6,7 +6,8 @@ T_GLOBAL_META( T_META_NAMESPACE("xnu.vm"), - T_META_CHECK_LEAKS(false) + T_META_CHECK_LEAKS(false), + T_META_RUN_CONCURRENTLY(true) ); static void run_test(void); diff --git a/tests/vm_phys_footprint.c b/tests/vm_phys_footprint.c index 4dbea7be7..44c003118 100644 --- a/tests/vm_phys_footprint.c +++ b/tests/vm_phys_footprint.c @@ -10,14 +10,21 @@ #include #include +#include +#include + +#include #include extern int ledger(int cmd, caddr_t arg1, caddr_t arg2, caddr_t arg3); -#if ENTITLED && defined(__arm64__) -#define LEGACY_FOOTPRINT 1 +T_GLOBAL_META(T_META_RUN_CONCURRENTLY(true)); +boolean_t legacy_footprint; + +#if LEGACY_FOOTPRINT_ENTITLED && defined(__arm64__) +#define TEST_VM_NAMESPACE "xnu.vm_legacy" #else /* ENTITLED && __arm64__ */ -#define LEGACY_FOOTPRINT 0 +#define TEST_VM_NAMESPACE "xnu.vm" #endif /* ENTITLED && __arm64__ */ #define MEM_SIZE (100 * 1024 * 1024) /* 100 MB */ @@ -35,6 +42,8 @@ ledger_init(void) struct ledger_template_info *templateInfo; int64_t templateCnt; int i; + int legacy_footprint_entitlement_mode; + size_t oldlen; if (ledger_inited) { return; @@ -42,6 +51,24 @@ ledger_init(void) ledger_inited = 1; T_SETUPBEGIN; + + legacy_footprint = FALSE; +#if LEGACY_FOOTPRINT_ENTITLED + int ret; + + T_QUIET; + T_WITH_ERRNO; + oldlen = sizeof(legacy_footprint_entitlement_mode); + ret = sysctlbyname("kern.legacy_footprint_entitlement_mode", + &legacy_footprint_entitlement_mode, + &oldlen, + NULL, + 0); + if (ret == 0 && legacy_footprint_entitlement_mode == 2) { + legacy_footprint = TRUE; + } +#endif /* LEGACY_FOOTPRINT_ENTITLED */ + T_QUIET; T_WITH_ERRNO; T_ASSERT_EQ(ledger(LEDGER_INFO, @@ -192,7 +219,7 @@ pre_warm( T_DECL(phys_footprint_anonymous, "phys_footprint for anonymous memory", - T_META_NAMESPACE("xnu.vm"), + T_META_NAMESPACE(TEST_VM_NAMESPACE), T_META_LTEPHASE(LTE_POSTINIT)) { uint64_t footprint_before, pagetable_before; @@ -265,7 +292,7 @@ T_DECL(phys_footprint_anonymous, T_DECL(phys_footprint_file, "phys_footprint for mapped file", - T_META_NAMESPACE("xnu.vm"), + T_META_NAMESPACE(TEST_VM_NAMESPACE), T_META_LTEPHASE(LTE_POSTINIT)) { uint64_t footprint_before, pagetable_before; @@ -365,7 +392,7 @@ T_DECL(phys_footprint_file, T_DECL(phys_footprint_purgeable, "phys_footprint for purgeable memory", - T_META_NAMESPACE("xnu.vm"), + T_META_NAMESPACE(TEST_VM_NAMESPACE), T_META_LTEPHASE(LTE_POSTINIT)) { uint64_t footprint_before, pagetable_before; @@ -484,7 +511,7 @@ T_DECL(phys_footprint_purgeable, T_DECL(phys_footprint_purgeable_ownership, "phys_footprint for owned purgeable memory", - T_META_NAMESPACE("xnu.vm"), + T_META_NAMESPACE(TEST_VM_NAMESPACE), T_META_LTEPHASE(LTE_POSTINIT)) { uint64_t footprint_before, pagetable_before; @@ -648,7 +675,7 @@ T_DECL(phys_footprint_purgeable_ownership, #ifdef MAP_MEM_LEDGER_TAGGED T_DECL(phys_footprint_ledger_purgeable_owned, "phys_footprint for ledger-tagged purgeable memory ownership", - T_META_NAMESPACE("xnu.vm"), + T_META_NAMESPACE(TEST_VM_NAMESPACE), T_META_LTEPHASE(LTE_POSTINIT)) { uint64_t footprint_before, pagetable_before; @@ -821,7 +848,7 @@ T_DECL(phys_footprint_ledger_purgeable_owned, T_DECL(phys_footprint_ledger_owned, "phys_footprint for ledger-tagged memory ownership", - T_META_NAMESPACE("xnu.vm"), + T_META_NAMESPACE(TEST_VM_NAMESPACE), T_META_LTEPHASE(LTE_POSTINIT)) { uint64_t footprint_before, pagetable_before; @@ -830,7 +857,6 @@ T_DECL(phys_footprint_ledger_owned, kern_return_t kr; mach_vm_address_t pre_vm_addr, vm_addr; mach_vm_size_t vm_size, dirty_size, me_size; - int state; mach_port_t me_port; /* pre-warm to account for page table expansion */ @@ -988,6 +1014,11 @@ setIntValue(CFMutableDictionaryRef dict, const CFStringRef key, int value) CFDictionarySetValue(dict, key, number); CFRelease(number); } +static inline void +setBoolValue(CFMutableDictionaryRef dict, const CFStringRef key, bool value) +{ + CFDictionarySetValue(dict, key, value ? kCFBooleanTrue : kCFBooleanFalse); +} typedef void (^SurfacePlaneBlock)(void *data, size_t planeIndex, size_t width, size_t height, size_t rowbytes); static IOReturn SurfaceApplyPlaneBlock(IOSurfaceRef surface, SurfacePlaneBlock block) @@ -1049,6 +1080,24 @@ ClearSurface(IOSurfaceRef surface) } }); } +static size_t +SurfaceGetMemorySize(IOSurfaceRef surface) +{ + size_t planeCount = IOSurfaceGetPlaneCount(surface); + + if (planeCount == 0) { + size_t rb = IOSurfaceGetBytesPerRow(surface); + size_t h = IOSurfaceGetHeight(surface); + return rb * h; + } else if (planeCount == 2) { + size_t rb0 = IOSurfaceGetBytesPerRowOfPlane(surface, 0); + size_t h0 = IOSurfaceGetHeightOfPlane(surface, 0); + size_t rb1 = IOSurfaceGetBytesPerRowOfPlane(surface, 1); + size_t h1 = IOSurfaceGetHeightOfPlane(surface, 1); + return rb0 * h0 + rb1 * h1; + } + return 0; +} static IOSurfaceRef CreateSurface(uint32_t pixelsWide, uint32_t pixelsHigh, uint32_t rowBytesAlignment, uint32_t fmt, bool purgeable, bool clear) { @@ -1075,11 +1124,11 @@ CreateSurface(uint32_t pixelsWide, uint32_t pixelsHigh, uint32_t rowBytesAlignme setIntValue(props, kIOSurfaceWidth, (int)pixelsWide); setIntValue(props, kIOSurfaceHeight, (int)pixelsHigh); setIntValue(props, kIOSurfacePixelFormat, (int)fmt); -#if TARGET_OS_IPHONE - setIntValue(props, kIOSurfaceNonPurgeable, purgeable); -#else /* TARGET_OS_IPHONE */ +#if (TARGET_OS_IPHONE && !TARGET_OS_SIMULATOR) + setBoolValue(props, kIOSurfaceNonPurgeable, !purgeable); +#else /* (TARGET_OS_IPHONE && !TARGET_OS_SIMULATOR) */ (void)purgeable; -#endif /* TARGET_OS_IPHONE */ +#endif /* (TARGET_OS_IPHONE && !TARGET_OS_SIMULATOR) */ { if (bpe != bpp) { // i.e. a 422 format such as 'yuvf' etc. setIntValue(props, kIOSurfaceElementWidth, 2); @@ -1099,17 +1148,19 @@ CreateSurface(uint32_t pixelsWide, uint32_t pixelsHigh, uint32_t rowBytesAlignme } T_DECL(phys_footprint_purgeable_iokit, "phys_footprint for purgeable IOKit memory", - T_META_NAMESPACE("xnu.vm"), + T_META_NAMESPACE(TEST_VM_NAMESPACE), T_META_LTEPHASE(LTE_POSTINIT)) { uint64_t footprint_before, pagetable_before; uint64_t footprint_after, pagetable_after; - uint64_t footprint_expected; + uint64_t footprint_expected, footprint_delta_slop; + int64_t footprint_delta; IOSurfaceRef surface; uint32_t old_state; uint64_t surface_size; T_SETUPBEGIN; + footprint_delta_slop = 8 * vm_kernel_page_size; ledger_init(); surface = CreateSurface(1024, 1024, 0, 32, true, true); IOSurfaceSetPurgeable(surface, kIOSurfacePurgeableVolatile, &old_state); @@ -1123,92 +1174,227 @@ T_DECL(phys_footprint_purgeable_iokit, get_ledger_info(&footprint_before, &pagetable_before); surface = CreateSurface(1024, 1024, 0, 32, true, true); get_ledger_info(&footprint_after, &pagetable_after); -#if LEGACY_FOOTPRINT - footprint_expected = footprint_before; - footprint_expected += (pagetable_after - pagetable_before); - T_LOG("LEGACY FOOTPRINT: creating IOSurface: no footprint impact"); - T_EXPECT_EQ(footprint_after, footprint_expected, - "create IOSurface %lld bytes: " - "footprint %lld -> %lld expected %lld delta %lld", - surface_size, footprint_before, footprint_after, - footprint_expected, footprint_after - footprint_expected); -#else /* LEGACY_FOOTPRINT */ + if (legacy_footprint) { + footprint_expected = footprint_before; + footprint_expected += (pagetable_after - pagetable_before); + footprint_delta = (int64_t)(footprint_after - footprint_expected); + T_LOG("LEGACY FOOTPRINT: creating purgeable IOSurface: no footprint impact"); + T_EXPECT_LE((uint64_t)llabs(footprint_delta), footprint_delta_slop, + "create purgeable IOSurface %lld bytes: " + "footprint %lld -> %lld expected %lld delta %lld", + surface_size, footprint_before, footprint_after, + footprint_expected, footprint_delta); + } else { + footprint_expected = footprint_before + surface_size; + footprint_expected += (pagetable_after - pagetable_before); + footprint_delta = (int64_t)(footprint_after - footprint_expected); + T_LOG("creating purgeable IOSurface increases phys_footprint"); + T_EXPECT_LE((uint64_t)llabs(footprint_delta), footprint_delta_slop, + "create purgeable IOSurface %lld bytes: " + "footprint %lld -> %lld expected %lld delta %lld", + surface_size, footprint_before, footprint_after, + footprint_expected, footprint_delta); + } + + /* make IOSurface volatile: footprint shrinks */ + get_ledger_info(&footprint_before, &pagetable_before); + IOSurfaceSetPurgeable(surface, kIOSurfacePurgeableVolatile, &old_state); + get_ledger_info(&footprint_after, &pagetable_after); + if (legacy_footprint) { + footprint_expected = footprint_before; + footprint_expected += (pagetable_after - pagetable_before); + T_LOG("LEGACY FOOTPRINT: volatile IOSurface: no footprint impact"); + T_EXPECT_EQ(footprint_after, footprint_expected, + "volatile IOSurface %lld bytes: " + "footprint %lld -> %lld expected %lld delta %lld", + surface_size, footprint_before, footprint_after, + footprint_expected, footprint_after - footprint_expected); + } else { + footprint_expected = footprint_before - surface_size; + footprint_expected += (pagetable_after - pagetable_before); + T_LOG("making IOSurface volatile decreases phys_footprint"); + T_EXPECT_EQ(footprint_after, footprint_expected, + "made volatile %lld bytes: " + "footprint %lld -> %lld expected %lld delta %lld", + surface_size, footprint_before, footprint_after, + footprint_expected, footprint_after - footprint_expected); + } + + /* make IOSurface non-volatile: footprint grows */ + get_ledger_info(&footprint_before, &pagetable_before); + IOSurfaceSetPurgeable(surface, kIOSurfacePurgeableNonVolatile, &old_state); + get_ledger_info(&footprint_after, &pagetable_after); + if (legacy_footprint) { + footprint_expected = footprint_before; + footprint_expected += (pagetable_after - pagetable_before); + T_LOG("LEGACY FOOTPRINT: non-volatile IOSurface: no footprint impact"); + T_EXPECT_EQ(footprint_after, footprint_expected, + "non-volatile IOSurface %lld bytes: " + "footprint %lld -> %lld expected %lld delta %lld", + surface_size, footprint_before, footprint_after, + footprint_expected, footprint_after - footprint_expected); + } else { + footprint_expected = footprint_before + surface_size; + footprint_expected += (pagetable_after - pagetable_before); + T_LOG("making IOSurface non-volatile increases phys_footprint"); + T_EXPECT_EQ(footprint_after, footprint_expected, + "made non-volatile %lld bytes: " + "footprint %lld -> %lld expected %lld delta %lld", + surface_size, footprint_before, footprint_after, + footprint_expected, footprint_after - footprint_expected); + } + + /* accessing IOSurface re-mapping: no footprint impact */ + + /* deallocating IOSurface re-mapping: no footprint impact */ + + /* release IOSurface: footprint shrinks */ + get_ledger_info(&footprint_before, &pagetable_before); + CFRelease(surface); + get_ledger_info(&footprint_after, &pagetable_after); + if (legacy_footprint) { + footprint_expected = footprint_before; + footprint_expected += (pagetable_after - pagetable_before); + T_LOG("LEGACY FOOTPRINT: release IOSurface: no footprint impact"); + T_EXPECT_EQ(footprint_after, footprint_expected, + "releasing IOSurface %lld bytes: " + "footprint %lld -> %lld expected %lld delta %lld", + surface_size, footprint_before, footprint_after, + footprint_expected, footprint_after - footprint_expected); + } else { + footprint_expected = footprint_before - surface_size; + footprint_expected += (pagetable_after - pagetable_before); + T_LOG("releasing IOSurface decreases phys_footprint"); + T_EXPECT_EQ(footprint_after, footprint_expected, + "released IOSurface %lld bytes: " + "footprint %lld -> %lld expected %lld delta %lld", + surface_size, footprint_before, footprint_after, + footprint_expected, footprint_after - footprint_expected); + } +} + +#if (TARGET_OS_IPHONE && !TARGET_OS_SIMULATOR) +T_DECL(phys_footprint_nonpurgeable_iokit, + "phys_footprint for non-purgeable IOKit memory", + T_META_NAMESPACE(TEST_VM_NAMESPACE), + T_META_LTEPHASE(LTE_POSTINIT)) +{ + uint64_t footprint_before, pagetable_before; + uint64_t footprint_after, pagetable_after; + uint64_t footprint_expected, footprint_delta_slop; + int64_t footprint_delta; + IOSurfaceRef surface; + uint64_t surface_size; + void *map_base; + size_t map_size; + mach_vm_address_t remap_addr; + kern_return_t kr; + vm_prot_t cur_prot, max_prot; + uint32_t old_state; + + + T_SETUPBEGIN; + ledger_init(); + surface = CreateSurface(1024, 1024, 0, 32, false, true); + CFRelease(surface); + footprint_delta_slop = 8 * vm_kernel_page_size; + T_SETUPEND; + + surface_size = 1024 * 1024 * 4; + + /* create IOsurface: footprint grows */ + get_ledger_info(&footprint_before, &pagetable_before); + surface = CreateSurface(1024, 1024, 0, 32, false, true); + get_ledger_info(&footprint_after, &pagetable_after); footprint_expected = footprint_before + surface_size; footprint_expected += (pagetable_after - pagetable_before); - T_LOG("creating IOSurface increases phys_footprint"); - T_EXPECT_EQ(footprint_after, footprint_expected, - "create IOSurface %lld bytes: " + footprint_delta = (int64_t)(footprint_after - footprint_expected); + T_LOG("creating non-purgeable IOSurface increases phys_footprint"); + T_EXPECT_LE((uint64_t)llabs(footprint_delta), footprint_delta_slop, + "create non-purgeable IOSurface %lld bytes: " "footprint %lld -> %lld expected %lld delta %lld", surface_size, footprint_before, footprint_after, - footprint_expected, footprint_after - footprint_expected); -#endif /* LEGACY_FOOTPRINT */ + footprint_expected, footprint_delta); - /* make IOSurface volatile: footprint shrinks */ + /* make IOSurface volatile: fail and no footprint impact */ get_ledger_info(&footprint_before, &pagetable_before); IOSurfaceSetPurgeable(surface, kIOSurfacePurgeableVolatile, &old_state); get_ledger_info(&footprint_after, &pagetable_after); -#if LEGACY_FOOTPRINT footprint_expected = footprint_before; footprint_expected += (pagetable_after - pagetable_before); - T_LOG("LEGACY FOOTPRINT: volatile IOSurface: no footprint impact"); - T_EXPECT_EQ(footprint_after, footprint_expected, - "volatile IOSurface %lld bytes: " - "footprint %lld -> %lld expected %lld delta %lld", - surface_size, footprint_before, footprint_after, - footprint_expected, footprint_after - footprint_expected); -#else /* LEGACY_FOOTPRINT */ - footprint_expected = footprint_before - surface_size; - footprint_expected += (pagetable_after - pagetable_before); - T_LOG("making IOSurface volatile decreases phys_footprint"); + T_LOG("making non-purgeable IOSurface volatile: no footprint impact"); T_EXPECT_EQ(footprint_after, footprint_expected, - "made volatile %lld bytes: " + "made volatile %lld non-purgeable bytes: " "footprint %lld -> %lld expected %lld delta %lld", surface_size, footprint_before, footprint_after, footprint_expected, footprint_after - footprint_expected); -#endif /* LEGACY_FOOTPRINT */ - /* make IOSurface non-volatile: footprint grows */ + /* re-mapping IOSurface: no footprint impact */ get_ledger_info(&footprint_before, &pagetable_before); - IOSurfaceSetPurgeable(surface, kIOSurfacePurgeableNonVolatile, &old_state); + map_base = IOSurfaceGetBaseAddress(surface); + map_size = SurfaceGetMemorySize(surface); +// T_EXPECT_EQ(map_size, surface_size, "map_size %lld surface_size %lld", +// map_size, surface_size); + remap_addr = 0; + kr = mach_vm_remap(mach_task_self(), + &remap_addr, + (mach_vm_size_t)surface_size, + 0, + VM_FLAGS_ANYWHERE, + mach_task_self(), + (mach_vm_address_t)map_base, + FALSE, /* copy */ + &cur_prot, + &max_prot, + VM_INHERIT_DEFAULT); + T_QUIET; + T_EXPECT_EQ(kr, KERN_SUCCESS, "vm_remap() error 0x%x (%s)", + kr, mach_error_string(kr)); get_ledger_info(&footprint_after, &pagetable_after); -#if LEGACY_FOOTPRINT footprint_expected = footprint_before; footprint_expected += (pagetable_after - pagetable_before); - T_LOG("LEGACY FOOTPRINT: non-volatile IOSurface: no footprint impact"); + T_LOG("re-mapping IOSurface does not impact phys_footprint"); T_EXPECT_EQ(footprint_after, footprint_expected, - "non-volatile IOSurface %lld bytes: " + "remapping IOSurface %lld bytes: " "footprint %lld -> %lld expected %lld delta %lld", surface_size, footprint_before, footprint_after, footprint_expected, footprint_after - footprint_expected); -#else /* LEGACY_FOOTPRINT */ + + /* accessing IOSurface re-mapping: footprint grows */ + get_ledger_info(&footprint_before, &pagetable_before); + memset((char *)(uintptr_t)remap_addr, 'p', (size_t)surface_size); + get_ledger_info(&footprint_after, &pagetable_after); footprint_expected = footprint_before + surface_size; footprint_expected += (pagetable_after - pagetable_before); - T_LOG("making IOSurface non-volatile increases phys_footprint"); + T_LOG("accessing re-mapped IOSurface grows phys_footprint"); T_EXPECT_EQ(footprint_after, footprint_expected, - "made non-volatile %lld bytes: " + "accessing remapped IOSurface %lld bytes: " "footprint %lld -> %lld expected %lld delta %lld", surface_size, footprint_before, footprint_after, footprint_expected, footprint_after - footprint_expected); -#endif /* LEGACY_FOOTPRINT */ - - /* accessing IOSurface re-mapping: no footprint impact */ - - /* deallocating IOSurface re-mapping: no footprint impact */ - /* release IOSurface: footprint shrinks */ + /* deallocating IOSurface re-mapping: footprint shrinks */ get_ledger_info(&footprint_before, &pagetable_before); - CFRelease(surface); + kr = mach_vm_deallocate(mach_task_self(), + remap_addr, + (mach_vm_size_t)surface_size); + T_QUIET; + T_EXPECT_EQ(kr, KERN_SUCCESS, "vm_deallocate() error 0x%x (%s)", + kr, mach_error_string(kr)); get_ledger_info(&footprint_after, &pagetable_after); -#if LEGACY_FOOTPRINT - footprint_expected = footprint_before; + footprint_expected = footprint_before - surface_size; footprint_expected += (pagetable_after - pagetable_before); - T_LOG("LEGACY FOOTPRINT: release IOSurface: no footprint impact"); + T_LOG("deallocating re-mapping of IOSurface shrinks phys_footprint"); T_EXPECT_EQ(footprint_after, footprint_expected, - "releasing IOSurface %lld bytes: " + "deallocating remapped IOSurface %lld bytes: " "footprint %lld -> %lld expected %lld delta %lld", surface_size, footprint_before, footprint_after, footprint_expected, footprint_after - footprint_expected); -#else /* LEGACY_FOOTPRINT */ + + /* release IOSurface: footprint shrinks */ + get_ledger_info(&footprint_before, &pagetable_before); + CFRelease(surface); + get_ledger_info(&footprint_after, &pagetable_after); footprint_expected = footprint_before - surface_size; footprint_expected += (pagetable_after - pagetable_before); T_LOG("releasing IOSurface decreases phys_footprint"); @@ -1217,5 +1403,5 @@ T_DECL(phys_footprint_purgeable_iokit, "footprint %lld -> %lld expected %lld delta %lld", surface_size, footprint_before, footprint_after, footprint_expected, footprint_after - footprint_expected); -#endif /* LEGACY_FOOTPRINT */ } +#endif /* (TARGET_OS_IPHONE && !TARGET_OS_SIMULATOR) */ diff --git a/tests/vm_phys_footprint_legacy.c b/tests/vm_phys_footprint_legacy.c deleted file mode 100644 index c6357797f..000000000 --- a/tests/vm_phys_footprint_legacy.c +++ /dev/null @@ -1,1223 +0,0 @@ -#define ENTITLED 1 - -#include -#include - -#include -#include -#include -#include -#include -#include -#include - -#include - -#include -extern int ledger(int cmd, caddr_t arg1, caddr_t arg2, caddr_t arg3); - -#if ENTITLED && defined(__arm64__) -#define LEGACY_FOOTPRINT 1 -#else /* ENTITLED && __arm64__ */ -#define LEGACY_FOOTPRINT 0 -#endif /* ENTITLED && __arm64__ */ - -#define MEM_SIZE (100 * 1024 * 1024) /* 100 MB */ - -static int64_t ledger_count = -1; -static int footprint_index = -1; -static int pagetable_index = -1; -static struct ledger_entry_info *lei = NULL; - -static void -ledger_init(void) -{ - static int ledger_inited = 0; - struct ledger_info li; - struct ledger_template_info *templateInfo; - int64_t templateCnt; - int i; - - if (ledger_inited) { - return; - } - ledger_inited = 1; - - T_SETUPBEGIN; - T_QUIET; - T_WITH_ERRNO; - T_ASSERT_EQ(ledger(LEDGER_INFO, - (caddr_t)(uintptr_t)getpid(), - (caddr_t)&li, - NULL), - 0, - "ledger(LEDGER_INFO)"); - - templateCnt = li.li_entries; - templateInfo = malloc((size_t)li.li_entries * sizeof(struct ledger_template_info)); - T_QUIET; - T_WITH_ERRNO; - T_ASSERT_NE(templateInfo, NULL, "malloc()"); - - ledger_count = li.li_entries; - footprint_index = -1; - pagetable_index = -1; - T_QUIET; - T_WITH_ERRNO; - T_ASSERT_GE(ledger(LEDGER_TEMPLATE_INFO, - (caddr_t)templateInfo, - (caddr_t)&templateCnt, - NULL), - 0, - "ledger(LEDGER_TEMPLATE_INFO)"); - for (i = 0; i < templateCnt; i++) { - if (!strncmp(templateInfo[i].lti_name, - "phys_footprint", - strlen("phys_footprint"))) { - footprint_index = i; - } else if (!strncmp(templateInfo[i].lti_name, - "page_table", - strlen("page_table"))) { - pagetable_index = i; - } - } - free(templateInfo); - - lei = (struct ledger_entry_info *) - malloc((size_t)ledger_count * sizeof(*lei)); - T_QUIET; - T_WITH_ERRNO; - T_ASSERT_NE(lei, NULL, "malloc(ledger_entry_info)"); - - T_QUIET; - T_ASSERT_NE(footprint_index, -1, "no footprint_index"); - T_QUIET; - T_ASSERT_NE(pagetable_index, -1, "no pagetable_index"); - - T_SETUPEND; -} - -static void -get_ledger_info( - uint64_t *phys_footprint, - uint64_t *page_table) -{ - int64_t count; - - count = ledger_count; - T_QUIET; - T_WITH_ERRNO; - T_ASSERT_GE(ledger(LEDGER_ENTRY_INFO, - (caddr_t)(uintptr_t)getpid(), - (caddr_t)lei, - (caddr_t)&count), - 0, - "ledger(LEDGER_ENTRY_INFO)"); - T_QUIET; - T_ASSERT_GT(count, (int64_t)footprint_index, "no entry for footprint"); - T_QUIET; - T_ASSERT_GT(count, (int64_t)pagetable_index, "no entry for pagetable"); - if (phys_footprint) { - *phys_footprint = (uint64_t)(lei[footprint_index].lei_balance); - } - if (page_table) { - *page_table = (uint64_t)(lei[pagetable_index].lei_balance); - } -} - -static mach_vm_address_t -pre_warm( - mach_vm_size_t vm_size) -{ - kern_return_t kr; - mach_vm_address_t vm_addr; - unsigned char BigBufOnStack[100 * 1024]; - uint64_t footprint, page_table; - - /* make sure ledgers are ready to be queried */ - ledger_init(); - - T_SETUPBEGIN; - - /* - * Touch a few pages ahead on the stack, to make - * sure we don't see a footprint increase due to - * an extra stack page later. - */ - memset(BigBufOnStack, 0xb, sizeof(BigBufOnStack)); - T_QUIET; - T_EXPECT_EQ(BigBufOnStack[0], 0xb, - "BigBufOnStack[0] == 0x%x", - BigBufOnStack[0]); - T_QUIET; - T_EXPECT_EQ(BigBufOnStack[sizeof(BigBufOnStack) - 1], 0xb, - "BigBufOnStack[%lu] == 0x%x", - sizeof(BigBufOnStack), - BigBufOnStack[sizeof(BigBufOnStack) - 1]); - - /* - * Pre-allocate, touch and then release the same amount - * of memory we'll be allocating later during the test, - * to account for any memory overhead (page tables, global - * variables, ...). - */ - vm_addr = 0; - kr = mach_vm_allocate(mach_task_self(), - &vm_addr, - vm_size, - VM_FLAGS_ANYWHERE); - T_QUIET; - T_EXPECT_EQ(kr, KERN_SUCCESS, "vm_allocate(%lld) error 0x%x (%s)", - vm_size, kr, mach_error_string(kr)); - memset((char *)(uintptr_t)vm_addr, 'p', (size_t)vm_size); - kr = mach_vm_deallocate(mach_task_self(), - vm_addr, - vm_size); - T_QUIET; - T_EXPECT_EQ(kr, KERN_SUCCESS, "vm_deallocate() error 0x%x (%s)", - kr, mach_error_string(kr)); - - /* - * Exercise the ledger code to make sure it's ready to run - * without any extra memory overhead later. - */ - get_ledger_info(&footprint, &page_table); - - T_SETUPEND; - - /* - * Return the start of the virtual range we pre-warmed, so that the - * test can check that it's using the same range. - */ - return vm_addr; -} - -T_DECL(legacy_phys_footprint_anonymous, - "phys_footprint for anonymous memory", - T_META_NAMESPACE("xnu.vm"), - T_META_LTEPHASE(LTE_POSTINIT)) -{ - uint64_t footprint_before, pagetable_before; - uint64_t footprint_after, pagetable_after; - uint64_t footprint_expected; - kern_return_t kr; - mach_vm_address_t pre_vm_addr, vm_addr; - mach_vm_size_t vm_size, dirty_size; - - /* pre-warm to account for page table expansion */ - pre_vm_addr = pre_warm(MEM_SIZE); - - /* allocating virtual memory... */ - get_ledger_info(&footprint_before, &pagetable_before); - vm_addr = 0; - vm_size = MEM_SIZE; - kr = mach_vm_allocate(mach_task_self(), &vm_addr, vm_size, - VM_FLAGS_ANYWHERE); - T_QUIET; - T_EXPECT_EQ(kr, KERN_SUCCESS, "vm_allocate() error 0x%x (%s)", - kr, mach_error_string(kr)); - T_QUIET; - T_EXPECT_EQ(vm_addr, pre_vm_addr, "pre-warm mishap"); - /* ... should not change footprint */ - get_ledger_info(&footprint_after, &pagetable_after); - footprint_expected = footprint_before; - footprint_expected += (pagetable_after - pagetable_before); - T_LOG("virtual allocation does not change phys_footprint"); - T_EXPECT_EQ(footprint_after, footprint_expected, - "virtual allocation of %lld bytes: " - "footprint %lld -> %lld expected %lld delta %lld", - vm_size, footprint_before, footprint_after, - footprint_expected, footprint_after - footprint_expected); - - /* touching memory... */ - get_ledger_info(&footprint_before, &pagetable_before); - dirty_size = vm_size / 2; - memset((char *)(uintptr_t)vm_addr, 'x', (size_t)dirty_size); - /* ... should increase footprint */ - get_ledger_info(&footprint_after, &pagetable_after); - footprint_expected = footprint_before + dirty_size; - footprint_expected += (pagetable_after - pagetable_before); - T_LOG("modifying anonymous memory increases phys_footprint"); - T_EXPECT_EQ(footprint_after, footprint_expected, - "touched %lld bytes: " - "footprint %lld -> %lld expected %lld delta %lld", - dirty_size, footprint_before, footprint_after, - footprint_expected, footprint_after - footprint_expected); - - /* deallocating memory... */ - get_ledger_info(&footprint_before, &pagetable_before); - kr = mach_vm_deallocate(mach_task_self(), vm_addr, vm_size); - T_QUIET; - T_EXPECT_EQ(kr, KERN_SUCCESS, "vm_deallocate() error 0x%x (%s)", - kr, mach_error_string(kr)); - /* ... should decrease footprint */ - get_ledger_info(&footprint_after, &pagetable_after); - footprint_expected = footprint_before - dirty_size; - footprint_expected += (pagetable_after - pagetable_before); - T_LOG("deallocating dirty anonymous memory decreases phys_footprint"); - T_EXPECT_EQ(footprint_after, footprint_expected, - "deallocated %lld dirty bytes: " - "footprint %lld -> %lld expected %lld delta %lld", - dirty_size, footprint_before, footprint_after, - footprint_expected, footprint_after - footprint_expected); -} - -#define TEMP_FILE_TEMPLATE "/tmp/phys_footprint_data.XXXXXXXX" -#define TEMP_FILE_SIZE (1 * 1024 * 1024) - -T_DECL(legacy_phys_footprint_file, - "phys_footprint for mapped file", - T_META_NAMESPACE("xnu.vm"), - T_META_LTEPHASE(LTE_POSTINIT)) -{ - uint64_t footprint_before, pagetable_before; - uint64_t footprint_after, pagetable_after; - uint64_t footprint_expected; - mach_vm_address_t pre_vm_addr; - int fd; - char *map_addr; - size_t map_size, dirty_size; - ssize_t nbytes; - char tmp_file_name[PATH_MAX] = TEMP_FILE_TEMPLATE; - char *buf; - size_t buf_size; - - T_SETUPBEGIN; - buf_size = TEMP_FILE_SIZE; - T_QUIET; - T_ASSERT_NOTNULL(buf = (char *)malloc(buf_size), - "allocate %zu-byte buffer", buf_size); - memset(buf, 'f', buf_size); - T_WITH_ERRNO; - T_QUIET; - T_ASSERT_NOTNULL(mktemp(tmp_file_name), - "create temporary file name"); - T_WITH_ERRNO; - T_QUIET; - T_ASSERT_GE(fd = open(tmp_file_name, O_CREAT | O_RDWR), - 0, - "create temp file"); - T_WITH_ERRNO; - T_QUIET; - T_ASSERT_EQ(nbytes = write(fd, buf, buf_size), - (ssize_t)buf_size, - "write %zu bytes", buf_size); - free(buf); - T_SETUPEND; - - /* pre-warm to account for page table expansion */ - pre_vm_addr = pre_warm(TEMP_FILE_SIZE); - - /* mapping a file does not impact footprint... */ - get_ledger_info(&footprint_before, &pagetable_before); - map_size = TEMP_FILE_SIZE; - T_WITH_ERRNO; - T_QUIET; - T_ASSERT_NOTNULL(map_addr = (char *)mmap(NULL, map_size, - PROT_READ | PROT_WRITE, - MAP_FILE | MAP_SHARED, fd, 0), - "mmap()"); - T_QUIET; - T_EXPECT_EQ((mach_vm_address_t)map_addr, pre_vm_addr, - "pre-warm mishap"); - /* ... should not change footprint */ - get_ledger_info(&footprint_after, &pagetable_after); - footprint_expected = footprint_before; - footprint_expected += (pagetable_after - pagetable_before); - T_LOG("mapping file does not change phys_footprint"); - T_EXPECT_EQ(footprint_after, footprint_expected, - "mapping file with %zu bytes: " - "footprint %lld -> %lld expected %lld delta %lld", - map_size, footprint_before, footprint_after, - footprint_expected, footprint_after - footprint_expected); - - /* touching file-backed memory... */ - get_ledger_info(&footprint_before, &pagetable_before); - dirty_size = map_size / 2; - memset(map_addr, 'F', dirty_size); - /* ... should not impact footprint */ - get_ledger_info(&footprint_after, &pagetable_after); - footprint_expected = footprint_before; - footprint_expected += (pagetable_after - pagetable_before); - T_LOG("modifying file-backed memory does not impact phys_footprint"); - T_EXPECT_EQ(footprint_after, footprint_expected, - "touched %zu bytes: " - "footprint %lld -> %lld expected %lld delta %lld", - dirty_size, footprint_before, footprint_after, - footprint_expected, footprint_after - footprint_expected); - - /* deallocating file-backed memory... */ - get_ledger_info(&footprint_before, &pagetable_before); - T_WITH_ERRNO; - T_QUIET; - T_ASSERT_EQ(munmap(map_addr, map_size), - 0, - "unmap file"); - /* ... should not impact footprint */ - get_ledger_info(&footprint_after, &pagetable_after); - footprint_expected = footprint_before; - footprint_expected += (pagetable_after - pagetable_before); - T_LOG("unmapping file-backed memory does not impact phys_footprint"); - T_EXPECT_EQ(footprint_after, footprint_expected, - "unmapped %zu dirty bytes: " - "footprint %lld -> %lld expected %lld delta %lld", - dirty_size, footprint_before, footprint_after, - footprint_expected, footprint_after - footprint_expected); -} - -T_DECL(legacy_phys_footprint_purgeable, - "phys_footprint for purgeable memory", - T_META_NAMESPACE("xnu.vm"), - T_META_LTEPHASE(LTE_POSTINIT)) -{ - uint64_t footprint_before, pagetable_before; - uint64_t footprint_after, pagetable_after; - uint64_t footprint_expected; - kern_return_t kr; - mach_vm_address_t pre_vm_addr, vm_addr; - mach_vm_size_t vm_size, dirty_size; - int state; - - /* pre-warm to account for page table expansion */ - pre_vm_addr = pre_warm(MEM_SIZE); - - /* allocating purgeable virtual memory... */ - get_ledger_info(&footprint_before, &pagetable_before); - vm_addr = 0; - vm_size = MEM_SIZE; - kr = mach_vm_allocate(mach_task_self(), &vm_addr, vm_size, - VM_FLAGS_ANYWHERE | VM_FLAGS_PURGABLE); - T_QUIET; - T_EXPECT_EQ(kr, KERN_SUCCESS, "vm_allocate() error 0x%x (%s)", - kr, mach_error_string(kr)); - T_QUIET; - T_EXPECT_EQ(vm_addr, pre_vm_addr, "pre-warm mishap"); - /* ... should not change footprint */ - get_ledger_info(&footprint_after, &pagetable_after); - footprint_expected = footprint_before; - footprint_expected += (pagetable_after - pagetable_before); - T_LOG("purgeable virtual allocation does not change phys_footprint"); - T_EXPECT_EQ(footprint_after, footprint_expected, - "purgeable virtual allocation of %lld bytes: " - "footprint %lld -> %lld expected %lld delta %lld", - vm_size, footprint_before, footprint_after, - footprint_expected, footprint_after - footprint_expected); - - /* touching memory... */ - get_ledger_info(&footprint_before, &pagetable_before); - dirty_size = vm_size / 2; - memset((char *)(uintptr_t)vm_addr, 'x', (size_t)dirty_size); - /* ... should increase footprint */ - get_ledger_info(&footprint_after, &pagetable_after); - footprint_expected = footprint_before + dirty_size; - footprint_expected += (pagetable_after - pagetable_before); - T_LOG("modifying anonymous memory increases phys_footprint"); - T_EXPECT_EQ(footprint_after, footprint_expected, - "touched %lld bytes: " - "footprint %lld -> %lld expected %lld delta %lld", - dirty_size, footprint_before, footprint_after, - footprint_expected, footprint_after - footprint_expected); - - /* making it volatile... */ - get_ledger_info(&footprint_before, &pagetable_before); - state = VM_PURGABLE_VOLATILE; - T_QUIET; - T_ASSERT_EQ(mach_vm_purgable_control(mach_task_self(), - vm_addr, - VM_PURGABLE_SET_STATE, - &state), - KERN_SUCCESS, - "vm_purgable_control(VOLATILE)"); - T_QUIET; - T_ASSERT_EQ(state, VM_PURGABLE_NONVOLATILE, - "memory was non-volatile"); - /* ... should decrease footprint */ - get_ledger_info(&footprint_after, &pagetable_after); - footprint_expected = footprint_before - dirty_size; - footprint_expected += (pagetable_after - pagetable_before); - T_LOG("making volatile decreases phys_footprint"); - T_EXPECT_EQ(footprint_after, footprint_expected, - "made volatile %lld dirty bytes: " - "footprint %lld -> %lld expected %lld delta %lld", - dirty_size, footprint_before, footprint_after, - footprint_expected, footprint_after - footprint_expected); - - /* making it non-volatile... */ - get_ledger_info(&footprint_before, &pagetable_before); - state = VM_PURGABLE_NONVOLATILE; - T_QUIET; - T_ASSERT_EQ(mach_vm_purgable_control(mach_task_self(), - vm_addr, - VM_PURGABLE_SET_STATE, - &state), - KERN_SUCCESS, - "vm_purgable_control(NONVOLATILE)"); - T_QUIET; - T_ASSERT_EQ(state, VM_PURGABLE_VOLATILE, - "memory was volatile"); - /* ... should increase footprint */ - get_ledger_info(&footprint_after, &pagetable_after); - footprint_expected = footprint_before + dirty_size; - footprint_expected += (pagetable_after - pagetable_before); - T_LOG("making non-volatile increases phys_footprint"); - T_EXPECT_EQ(footprint_after, footprint_expected, - "made non-volatile %lld dirty bytes: " - "footprint %lld -> %lld expected %lld delta %lld", - dirty_size, footprint_before, footprint_after, - footprint_expected, footprint_after - footprint_expected); - - /* deallocating memory... */ - get_ledger_info(&footprint_before, &pagetable_before); - kr = mach_vm_deallocate(mach_task_self(), vm_addr, vm_size); - T_QUIET; - T_EXPECT_EQ(kr, KERN_SUCCESS, "vm_deallocate() error 0x%x (%s)", - kr, mach_error_string(kr)); - /* ... should decrease footprint */ - get_ledger_info(&footprint_after, &pagetable_after); - footprint_expected = footprint_before - dirty_size; - footprint_expected += (pagetable_after - pagetable_before); - T_LOG("deallocating memory decreases phys_footprint"); - T_EXPECT_EQ(footprint_after, footprint_expected, - "deallocated %lld dirty bytes: " - "footprint %lld -> %lld expected %lld delta %lld", - dirty_size, footprint_before, footprint_after, - footprint_expected, footprint_after - footprint_expected); -} - -T_DECL(legacy_phys_footprint_purgeable_ownership, - "phys_footprint for owned purgeable memory", - T_META_NAMESPACE("xnu.vm"), - T_META_LTEPHASE(LTE_POSTINIT)) -{ - uint64_t footprint_before, pagetable_before; - uint64_t footprint_after, pagetable_after; - uint64_t footprint_expected; - kern_return_t kr; - mach_vm_address_t pre_vm_addr, vm_addr; - mach_vm_size_t vm_size, dirty_size, me_size; - int state; - mach_port_t me_port; - - /* pre-warm to account for page table expansion */ - pre_vm_addr = pre_warm(MEM_SIZE); - - /* allocating purgeable virtual memory... */ - get_ledger_info(&footprint_before, &pagetable_before); - vm_addr = 0; - vm_size = MEM_SIZE; - kr = mach_vm_allocate(mach_task_self(), &vm_addr, vm_size, - VM_FLAGS_ANYWHERE | VM_FLAGS_PURGABLE); - T_QUIET; - T_EXPECT_EQ(kr, KERN_SUCCESS, "vm_allocate() error 0x%x (%s)", - kr, mach_error_string(kr)); - T_QUIET; - T_EXPECT_EQ(vm_addr, pre_vm_addr, "pre-warm mishap"); - /* ... should not change footprint */ - get_ledger_info(&footprint_after, &pagetable_after); - footprint_expected = footprint_before; - footprint_expected += (pagetable_after - pagetable_before); - T_LOG("purgeable virtual allocation does not change phys_footprint"); - T_EXPECT_EQ(footprint_after, footprint_expected, - "purgeable virtual allocation of %lld bytes: " - "footprint %lld -> %lld expected %lld delta %lld", - vm_size, footprint_before, footprint_after, - footprint_expected, footprint_after - footprint_expected); - - /* touching memory... */ - get_ledger_info(&footprint_before, &pagetable_before); - dirty_size = vm_size / 2; - memset((char *)(uintptr_t)vm_addr, 'x', (size_t)dirty_size); - /* ... should increase footprint */ - get_ledger_info(&footprint_after, &pagetable_after); - footprint_expected = footprint_before + dirty_size; - footprint_expected += (pagetable_after - pagetable_before); - T_LOG("modifying anonymous memory increases phys_footprint"); - T_EXPECT_EQ(footprint_after, footprint_expected, - "touched %lld bytes: " - "footprint %lld -> %lld expected %lld delta %lld", - dirty_size, footprint_before, footprint_after, - footprint_expected, footprint_after - footprint_expected); - - /* making it volatile... */ - get_ledger_info(&footprint_before, &pagetable_before); - state = VM_PURGABLE_VOLATILE; - T_QUIET; - T_ASSERT_EQ(mach_vm_purgable_control(mach_task_self(), - vm_addr, - VM_PURGABLE_SET_STATE, - &state), - KERN_SUCCESS, - "vm_purgable_control(VOLATILE)"); - T_QUIET; - T_ASSERT_EQ(state, VM_PURGABLE_NONVOLATILE, - "memory was non-volatile"); - /* ... should decrease footprint */ - get_ledger_info(&footprint_after, &pagetable_after); - footprint_expected = footprint_before - dirty_size; - footprint_expected += (pagetable_after - pagetable_before); - T_LOG("making volatile decreases phys_footprint"); - T_EXPECT_EQ(footprint_after, footprint_expected, - "made volatile %lld dirty bytes: " - "footprint %lld -> %lld expected %lld delta %lld", - dirty_size, footprint_before, footprint_after, - footprint_expected, footprint_after - footprint_expected); - - /* making it non-volatile... */ - get_ledger_info(&footprint_before, &pagetable_before); - state = VM_PURGABLE_NONVOLATILE; - T_QUIET; - T_ASSERT_EQ(mach_vm_purgable_control(mach_task_self(), - vm_addr, - VM_PURGABLE_SET_STATE, - &state), - KERN_SUCCESS, - "vm_purgable_control(NONVOLATILE)"); - T_QUIET; - T_ASSERT_EQ(state, VM_PURGABLE_VOLATILE, - "memory was volatile"); - /* ... should increase footprint */ - get_ledger_info(&footprint_after, &pagetable_after); - footprint_expected = footprint_before + dirty_size; - footprint_expected += (pagetable_after - pagetable_before); - T_LOG("making non-volatile increases phys_footprint"); - T_EXPECT_EQ(footprint_after, footprint_expected, - "made non-volatile %lld dirty bytes: " - "footprint %lld -> %lld expected %lld delta %lld", - dirty_size, footprint_before, footprint_after, - footprint_expected, footprint_after - footprint_expected); - - /* making a memory entry... */ - get_ledger_info(&footprint_before, &pagetable_before); - me_size = vm_size; - me_port = MACH_PORT_NULL; - kr = mach_make_memory_entry_64(mach_task_self(), - &me_size, - vm_addr, - VM_PROT_READ | VM_PROT_WRITE, - &me_port, - MACH_PORT_NULL); - T_QUIET; - T_EXPECT_EQ(kr, KERN_SUCCESS, "make_memory_entry() error 0x%x (%s)", - kr, mach_error_string(kr)); - T_QUIET; - T_EXPECT_EQ(me_size, vm_size, "memory entry size mismatch"); - /* ... should not change footprint */ - get_ledger_info(&footprint_after, &pagetable_after); - footprint_expected = footprint_before; - footprint_expected += (pagetable_after - pagetable_before); - T_LOG("making a memory entry does not change phys_footprint"); - T_EXPECT_EQ(footprint_after, footprint_expected, - "making a memory entry of %lld bytes: " - "footprint %lld -> %lld expected %lld delta %lld", - vm_size, footprint_before, footprint_after, - footprint_expected, footprint_after - footprint_expected); - - /* deallocating memory while holding memory entry... */ - get_ledger_info(&footprint_before, &pagetable_before); - kr = mach_vm_deallocate(mach_task_self(), vm_addr, vm_size); - T_QUIET; - T_EXPECT_EQ(kr, KERN_SUCCESS, "vm_deallocate() error 0x%x (%s)", - kr, mach_error_string(kr)); - /* ... should not change footprint */ - get_ledger_info(&footprint_after, &pagetable_after); - footprint_expected = footprint_before; - footprint_expected += (pagetable_after - pagetable_before); - T_LOG("deallocating owned memory while holding memory entry " - "does not change phys_footprint"); - T_EXPECT_EQ(footprint_after, footprint_expected, - "deallocated %lld dirty bytes: " - "footprint %lld -> %lld expected %lld delta %lld", - dirty_size, footprint_before, footprint_after, - footprint_expected, footprint_after - footprint_expected); - - /* releasing the memory entry... */ - kr = mach_port_deallocate(mach_task_self(), me_port); - T_QUIET; - T_EXPECT_EQ(kr, KERN_SUCCESS, "mach_port_deallocate() error 0x%x (%s)", - kr, mach_error_string(kr)); - /* ... should decrease footprint */ - get_ledger_info(&footprint_after, &pagetable_after); - footprint_expected = footprint_before - dirty_size; - footprint_expected += (pagetable_after - pagetable_before); - T_LOG("releasing memory entry decreases phys_footprint"); - T_EXPECT_EQ(footprint_after, footprint_expected, - "made volatile %lld dirty bytes: " - "footprint %lld -> %lld expected %lld delta %lld", - dirty_size, footprint_before, footprint_after, - footprint_expected, footprint_after - footprint_expected); -} - -#ifdef MAP_MEM_LEDGER_TAGGED -T_DECL(legacy_phys_footprint_ledger_purgeable_owned, - "phys_footprint for ledger-tagged purgeable memory ownership", - T_META_NAMESPACE("xnu.vm"), - T_META_LTEPHASE(LTE_POSTINIT)) -{ - uint64_t footprint_before, pagetable_before; - uint64_t footprint_after, pagetable_after; - uint64_t footprint_expected; - kern_return_t kr; - mach_vm_address_t pre_vm_addr, vm_addr; - mach_vm_size_t vm_size, dirty_size, me_size; - int state; - mach_port_t me_port; - - /* pre-warm to account for page table expansion */ - pre_vm_addr = pre_warm(MEM_SIZE); - - /* making a memory entry... */ - get_ledger_info(&footprint_before, &pagetable_before); - vm_size = MEM_SIZE; - me_size = vm_size; - me_port = MACH_PORT_NULL; - kr = mach_make_memory_entry_64(mach_task_self(), - &me_size, - 0, - (MAP_MEM_NAMED_CREATE | - MAP_MEM_LEDGER_TAGGED | - MAP_MEM_PURGABLE | - VM_PROT_READ | VM_PROT_WRITE), - &me_port, - MACH_PORT_NULL); - T_QUIET; - T_EXPECT_EQ(kr, KERN_SUCCESS, "make_memory_entry() error 0x%x (%s)", - kr, mach_error_string(kr)); - T_QUIET; - T_EXPECT_EQ(me_size, vm_size, "memory entry size mismatch"); - /* ... should not change footprint */ - get_ledger_info(&footprint_after, &pagetable_after); - footprint_expected = footprint_before; - footprint_expected += (pagetable_after - pagetable_before); - T_LOG("making a memory entry does not change phys_footprint"); - T_EXPECT_EQ(footprint_after, footprint_expected, - "making a memory entry of %lld bytes: " - "footprint %lld -> %lld expected %lld delta %lld", - vm_size, footprint_before, footprint_after, - footprint_expected, footprint_after - footprint_expected); - - /* mapping ledger-tagged virtual memory... */ - get_ledger_info(&footprint_before, &pagetable_before); - vm_addr = 0; - kr = mach_vm_map(mach_task_self(), &vm_addr, vm_size, - 0, /* mask */ - VM_FLAGS_ANYWHERE, - me_port, - 0, /* offset */ - FALSE, /* copy */ - VM_PROT_READ | VM_PROT_WRITE, - VM_PROT_READ | VM_PROT_WRITE, - VM_INHERIT_DEFAULT); - T_QUIET; - T_EXPECT_EQ(kr, KERN_SUCCESS, "vm_map() error 0x%x (%s)", - kr, mach_error_string(kr)); - T_QUIET; - T_EXPECT_EQ(vm_addr, pre_vm_addr, "pre-warm mishap"); - /* ... should not change footprint */ - get_ledger_info(&footprint_after, &pagetable_after); - footprint_expected = footprint_before; - footprint_expected += (pagetable_after - pagetable_before); - T_LOG("mapping ledger-tagged memory does not change phys_footprint"); - T_EXPECT_EQ(footprint_after, footprint_expected, - "ledger-tagged mapping of %lld bytes: " - "footprint %lld -> %lld expected %lld delta %lld", - vm_size, footprint_before, footprint_after, - footprint_expected, footprint_after - footprint_expected); - - /* touching memory... */ - get_ledger_info(&footprint_before, &pagetable_before); - dirty_size = vm_size / 2; - memset((char *)(uintptr_t)vm_addr, 'x', (size_t)dirty_size); - /* ... should increase footprint */ - get_ledger_info(&footprint_after, &pagetable_after); - footprint_expected = footprint_before + dirty_size; - footprint_expected += (pagetable_after - pagetable_before); - T_LOG("modifying ledger-tagged memory increases phys_footprint"); - T_EXPECT_EQ(footprint_after, footprint_expected, - "touched %lld bytes: " - "footprint %lld -> %lld expected %lld delta %lld", - dirty_size, footprint_before, footprint_after, - footprint_expected, footprint_after - footprint_expected); - - /* making it volatile... */ - get_ledger_info(&footprint_before, &pagetable_before); - state = VM_PURGABLE_VOLATILE; - T_QUIET; - T_ASSERT_EQ(mach_vm_purgable_control(mach_task_self(), - vm_addr, - VM_PURGABLE_SET_STATE, - &state), - KERN_SUCCESS, - "vm_purgable_control(VOLATILE)"); - T_QUIET; - T_ASSERT_EQ(state, VM_PURGABLE_NONVOLATILE, - "memory was non-volatile"); - /* ... should decrease footprint */ - get_ledger_info(&footprint_after, &pagetable_after); - footprint_expected = footprint_before - dirty_size; - footprint_expected += (pagetable_after - pagetable_before); - T_LOG("making volatile decreases phys_footprint"); - T_EXPECT_EQ(footprint_after, footprint_expected, - "made volatile %lld dirty bytes: " - "footprint %lld -> %lld expected %lld delta %lld", - dirty_size, footprint_before, footprint_after, - footprint_expected, footprint_after - footprint_expected); - - /* making it non-volatile... */ - get_ledger_info(&footprint_before, &pagetable_before); - state = VM_PURGABLE_NONVOLATILE; - T_QUIET; - T_ASSERT_EQ(mach_vm_purgable_control(mach_task_self(), - vm_addr, - VM_PURGABLE_SET_STATE, - &state), - KERN_SUCCESS, - "vm_purgable_control(NONVOLATILE)"); - T_QUIET; - T_ASSERT_EQ(state, VM_PURGABLE_VOLATILE, - "memory was volatile"); - /* ... should increase footprint */ - get_ledger_info(&footprint_after, &pagetable_after); - footprint_expected = footprint_before + dirty_size; - footprint_expected += (pagetable_after - pagetable_before); - T_LOG("making non-volatile increases phys_footprint"); - T_EXPECT_EQ(footprint_after, footprint_expected, - "made non-volatile %lld dirty bytes: " - "footprint %lld -> %lld expected %lld delta %lld", - dirty_size, footprint_before, footprint_after, - footprint_expected, footprint_after - footprint_expected); - - /* deallocating memory while holding memory entry... */ - get_ledger_info(&footprint_before, &pagetable_before); - kr = mach_vm_deallocate(mach_task_self(), vm_addr, vm_size); - T_QUIET; - T_EXPECT_EQ(kr, KERN_SUCCESS, "vm_deallocate() error 0x%x (%s)", - kr, mach_error_string(kr)); - /* ... should not change footprint */ - get_ledger_info(&footprint_after, &pagetable_after); - footprint_expected = footprint_before; - footprint_expected += (pagetable_after - pagetable_before); - T_LOG("deallocating owned memory while holding memory entry " - "does not change phys_footprint"); - T_EXPECT_EQ(footprint_after, footprint_expected, - "deallocated %lld dirty bytes: " - "footprint %lld -> %lld expected %lld delta %lld", - dirty_size, footprint_before, footprint_after, - footprint_expected, footprint_after - footprint_expected); - - /* releasing the memory entry... */ - kr = mach_port_deallocate(mach_task_self(), me_port); - T_QUIET; - T_EXPECT_EQ(kr, KERN_SUCCESS, "mach_port_deallocate() error 0x%x (%s)", - kr, mach_error_string(kr)); - /* ... should decrease footprint */ - get_ledger_info(&footprint_after, &pagetable_after); - footprint_expected = footprint_before - dirty_size; - footprint_expected += (pagetable_after - pagetable_before); - T_LOG("releasing memory entry decreases phys_footprint"); - T_EXPECT_EQ(footprint_after, footprint_expected, - "made volatile %lld dirty bytes: " - "footprint %lld -> %lld expected %lld delta %lld", - dirty_size, footprint_before, footprint_after, - footprint_expected, footprint_after - footprint_expected); -} - -T_DECL(legacy_phys_footprint_ledger_owned, - "phys_footprint for ledger-tagged memory ownership", - T_META_NAMESPACE("xnu.vm"), - T_META_LTEPHASE(LTE_POSTINIT)) -{ - uint64_t footprint_before, pagetable_before; - uint64_t footprint_after, pagetable_after; - uint64_t footprint_expected; - kern_return_t kr; - mach_vm_address_t pre_vm_addr, vm_addr; - mach_vm_size_t vm_size, dirty_size, me_size; - int state; - mach_port_t me_port; - - /* pre-warm to account for page table expansion */ - pre_vm_addr = pre_warm(MEM_SIZE); - - /* making a memory entry... */ - get_ledger_info(&footprint_before, &pagetable_before); - vm_size = MEM_SIZE; - me_size = vm_size; - me_port = MACH_PORT_NULL; - kr = mach_make_memory_entry_64(mach_task_self(), - &me_size, - 0, - (MAP_MEM_NAMED_CREATE | - MAP_MEM_LEDGER_TAGGED | - VM_PROT_READ | VM_PROT_WRITE), - &me_port, - MACH_PORT_NULL); - T_QUIET; - T_EXPECT_EQ(kr, KERN_SUCCESS, "make_memory_entry() error 0x%x (%s)", - kr, mach_error_string(kr)); - T_QUIET; - T_EXPECT_EQ(me_size, vm_size, "memory entry size mismatch"); - /* ... should not change footprint */ - get_ledger_info(&footprint_after, &pagetable_after); - footprint_expected = footprint_before; - footprint_expected += (pagetable_after - pagetable_before); - T_LOG("making a memory entry does not change phys_footprint"); - T_EXPECT_EQ(footprint_after, footprint_expected, - "making a memory entry of %lld bytes: " - "footprint %lld -> %lld expected %lld delta %lld", - vm_size, footprint_before, footprint_after, - footprint_expected, footprint_after - footprint_expected); - - /* mapping ledger-tagged virtual memory... */ - get_ledger_info(&footprint_before, &pagetable_before); - vm_addr = 0; - kr = mach_vm_map(mach_task_self(), &vm_addr, vm_size, - 0, /* mask */ - VM_FLAGS_ANYWHERE, - me_port, - 0, /* offset */ - FALSE, /* copy */ - VM_PROT_READ | VM_PROT_WRITE, - VM_PROT_READ | VM_PROT_WRITE, - VM_INHERIT_DEFAULT); - T_QUIET; - T_EXPECT_EQ(kr, KERN_SUCCESS, "vm_map() error 0x%x (%s)", - kr, mach_error_string(kr)); - T_QUIET; - T_EXPECT_EQ(vm_addr, pre_vm_addr, "pre-warm mishap"); - /* ... should not change footprint */ - get_ledger_info(&footprint_after, &pagetable_after); - footprint_expected = footprint_before; - footprint_expected += (pagetable_after - pagetable_before); - T_LOG("mapping ledger-tagged memory does not change phys_footprint"); - T_EXPECT_EQ(footprint_after, footprint_expected, - "ledger-tagged mapping of %lld bytes: " - "footprint %lld -> %lld expected %lld delta %lld", - vm_size, footprint_before, footprint_after, - footprint_expected, footprint_after - footprint_expected); - - /* touching memory... */ - get_ledger_info(&footprint_before, &pagetable_before); - dirty_size = vm_size / 2; - memset((char *)(uintptr_t)vm_addr, 'x', (size_t)dirty_size); - /* ... should increase footprint */ - get_ledger_info(&footprint_after, &pagetable_after); - footprint_expected = footprint_before + dirty_size; - footprint_expected += (pagetable_after - pagetable_before); - T_LOG("modifying ledger-tagged memory increases phys_footprint"); - T_EXPECT_EQ(footprint_after, footprint_expected, - "touched %lld bytes: " - "footprint %lld -> %lld expected %lld delta %lld", - dirty_size, footprint_before, footprint_after, - footprint_expected, footprint_after - footprint_expected); - - /* deallocating memory while holding memory entry... */ - get_ledger_info(&footprint_before, &pagetable_before); - kr = mach_vm_deallocate(mach_task_self(), vm_addr, vm_size); - T_QUIET; - T_EXPECT_EQ(kr, KERN_SUCCESS, "vm_deallocate() error 0x%x (%s)", - kr, mach_error_string(kr)); - /* ... should not change footprint */ - get_ledger_info(&footprint_after, &pagetable_after); - footprint_expected = footprint_before; - footprint_expected += (pagetable_after - pagetable_before); - T_LOG("deallocating owned memory while holding memory entry " - "does not change phys_footprint"); - T_EXPECT_EQ(footprint_after, footprint_expected, - "deallocated %lld dirty bytes: " - "footprint %lld -> %lld expected %lld delta %lld", - dirty_size, footprint_before, footprint_after, - footprint_expected, footprint_after - footprint_expected); - - /* releasing the memory entry... */ - kr = mach_port_deallocate(mach_task_self(), me_port); - T_QUIET; - T_EXPECT_EQ(kr, KERN_SUCCESS, "mach_port_deallocate() error 0x%x (%s)", - kr, mach_error_string(kr)); - /* ... should decrease footprint */ - get_ledger_info(&footprint_after, &pagetable_after); - footprint_expected = footprint_before - dirty_size; - footprint_expected += (pagetable_after - pagetable_before); - T_LOG("releasing memory entry decreases phys_footprint"); - T_EXPECT_EQ(footprint_after, footprint_expected, - "made volatile %lld dirty bytes: " - "footprint %lld -> %lld expected %lld delta %lld", - dirty_size, footprint_before, footprint_after, - footprint_expected, footprint_after - footprint_expected); -} -#endif /* MAP_MEM_LEDGER_TAGGED */ - -/* IOSurface code from: CoreImage/CoreImageTests/CIRender/SurfaceUtils.c */ -#include -#include -#include -static size_t -bytes_per_element(uint32_t format) -{ - size_t bpe = 0; - switch (format) { - case 32: // kCVPixelFormatType_32ARGB (ARGB8) - bpe = 4; - break; - default: - bpe = 0; - break; - } - return bpe; -} -static size_t -bytes_per_pixel(uint32_t format) -{ - size_t bpe = 0; - switch (format) { - case 32: // kCVPixelFormatType_32ARGB (ARGB8) - bpe = 4; - break; - default: - bpe = 0; - break; - } - return bpe; -} -static inline size_t -roundSizeToMultiple(size_t size, size_t mult) -{ - return ((size + mult - 1) / mult) * mult; -} -static inline void -setIntValue(CFMutableDictionaryRef dict, const CFStringRef key, int value) -{ - CFNumberRef number = CFNumberCreate(0, kCFNumberIntType, &value); - CFDictionarySetValue(dict, key, number); - CFRelease(number); -} -typedef void (^SurfacePlaneBlock)(void *data, size_t planeIndex, size_t width, size_t height, size_t rowbytes); -static IOReturn -SurfaceApplyPlaneBlock(IOSurfaceRef surface, SurfacePlaneBlock block) -{ - if (surface == nil || block == nil) { - return kIOReturnBadArgument; - } - - IOReturn result = kIOReturnSuccess; - size_t planeCount = IOSurfaceGetPlaneCount(surface); - - if (planeCount == 0) { - result = IOSurfaceLock(surface, 0, NULL); - if (result != kIOReturnSuccess) { - return result; - } - - void* base = IOSurfaceGetBaseAddress(surface); - size_t rb = IOSurfaceGetBytesPerRow(surface); - size_t w = IOSurfaceGetWidth(surface); - size_t h = IOSurfaceGetHeight(surface); - - if (base && rb && w && h) { - block(base, 0, w, h, rb); - } - - IOSurfaceUnlock(surface, 0, NULL); - } else if (planeCount == 2) { - for (size_t i = 0; i < planeCount; i++) { - result = IOSurfaceLock(surface, 0, NULL); - if (result != kIOReturnSuccess) { - return result; - } - - void* base = IOSurfaceGetBaseAddressOfPlane(surface, i); - size_t rb = IOSurfaceGetBytesPerRowOfPlane(surface, i); - size_t w = IOSurfaceGetWidthOfPlane(surface, i); - size_t h = IOSurfaceGetHeightOfPlane(surface, i); - - if (base && rb && w && h) { - block(base, i, w, h, rb); - } - - IOSurfaceUnlock(surface, 0, NULL); - } - } - return result; -} -static void -ClearSurface(IOSurfaceRef surface) -{ - const int zero = 0; - (void) SurfaceApplyPlaneBlock(surface, ^(void *p, size_t i, __unused size_t w, size_t h, size_t rb) - { - if (i == 0) { - memset(p, zero, rb * h); - } else { - memset(p, 128, rb * h); - } - }); -} -static IOSurfaceRef -CreateSurface(uint32_t pixelsWide, uint32_t pixelsHigh, uint32_t rowBytesAlignment, uint32_t fmt, bool purgeable, bool clear) -{ - IOSurfaceRef surface = nil; - - if (pixelsWide < 1 || pixelsHigh < 1 || fmt == 0) { - return nil; - } - - size_t bpp = bytes_per_pixel(fmt); - size_t bpe = bytes_per_element(fmt); - if (bpp == 0 || bpe == 0) { - return nil; - } - - size_t rowbytes = pixelsWide * bpp; - if (rowBytesAlignment == 0) { - rowBytesAlignment = 16; - } - rowbytes = roundSizeToMultiple(rowbytes, rowBytesAlignment); - - CFMutableDictionaryRef props = CFDictionaryCreateMutable(0, 0, &kCFTypeDictionaryKeyCallBacks, &kCFTypeDictionaryValueCallBacks); - setIntValue(props, kIOSurfaceBytesPerRow, (int)rowbytes); - setIntValue(props, kIOSurfaceWidth, (int)pixelsWide); - setIntValue(props, kIOSurfaceHeight, (int)pixelsHigh); - setIntValue(props, kIOSurfacePixelFormat, (int)fmt); -#if TARGET_OS_IPHONE - setIntValue(props, kIOSurfaceNonPurgeable, purgeable); -#else /* TARGET_OS_IPHONE */ - (void)purgeable; -#endif /* TARGET_OS_IPHONE */ - { - if (bpe != bpp) { // i.e. a 422 format such as 'yuvf' etc. - setIntValue(props, kIOSurfaceElementWidth, 2); - setIntValue(props, kIOSurfaceElementHeight, 1); - } - setIntValue(props, kIOSurfaceBytesPerElement, (int)bpe); - } - - surface = IOSurfaceCreate(props); - - if (clear) { - ClearSurface(surface); - } - - CFRelease(props); - return surface; -} -T_DECL(legacy_phys_footprint_purgeable_iokit, - "phys_footprint for purgeable IOKit memory", - T_META_NAMESPACE("xnu.vm"), - T_META_LTEPHASE(LTE_POSTINIT)) -{ - uint64_t footprint_before, pagetable_before; - uint64_t footprint_after, pagetable_after; - uint64_t footprint_expected; - IOSurfaceRef surface; - uint32_t old_state; - uint64_t surface_size; - - T_SETUPBEGIN; - ledger_init(); - surface = CreateSurface(1024, 1024, 0, 32, true, true); - IOSurfaceSetPurgeable(surface, kIOSurfacePurgeableVolatile, &old_state); - IOSurfaceSetPurgeable(surface, kIOSurfacePurgeableNonVolatile, &old_state); - CFRelease(surface); - T_SETUPEND; - - surface_size = 1024 * 1024 * 4; - - /* create IOsurface: footprint grows */ - get_ledger_info(&footprint_before, &pagetable_before); - surface = CreateSurface(1024, 1024, 0, 32, true, true); - get_ledger_info(&footprint_after, &pagetable_after); -#if LEGACY_FOOTPRINT - footprint_expected = footprint_before; - footprint_expected += (pagetable_after - pagetable_before); - T_LOG("LEGACY FOOTPRINT: creating IOSurface: no footprint impact"); - T_EXPECT_EQ(footprint_after, footprint_expected, - "create IOSurface %lld bytes: " - "footprint %lld -> %lld expected %lld delta %lld", - surface_size, footprint_before, footprint_after, - footprint_expected, footprint_after - footprint_expected); -#else /* LEGACY_FOOTPRINT */ - footprint_expected = footprint_before + surface_size; - footprint_expected += (pagetable_after - pagetable_before); - T_LOG("creating IOSurface increases phys_footprint"); - T_EXPECT_EQ(footprint_after, footprint_expected, - "create IOSurface %lld bytes: " - "footprint %lld -> %lld expected %lld delta %lld", - surface_size, footprint_before, footprint_after, - footprint_expected, footprint_after - footprint_expected); -#endif /* LEGACY_FOOTPRINT */ - - /* make IOSurface volatile: footprint shrinks */ - get_ledger_info(&footprint_before, &pagetable_before); - IOSurfaceSetPurgeable(surface, kIOSurfacePurgeableVolatile, &old_state); - get_ledger_info(&footprint_after, &pagetable_after); -#if LEGACY_FOOTPRINT - footprint_expected = footprint_before; - footprint_expected += (pagetable_after - pagetable_before); - T_LOG("LEGACY FOOTPRINT: volatile IOSurface: no footprint impact"); - T_EXPECT_EQ(footprint_after, footprint_expected, - "volatile IOSurface %lld bytes: " - "footprint %lld -> %lld expected %lld delta %lld", - surface_size, footprint_before, footprint_after, - footprint_expected, footprint_after - footprint_expected); -#else /* LEGACY_FOOTPRINT */ - footprint_expected = footprint_before - surface_size; - footprint_expected += (pagetable_after - pagetable_before); - T_LOG("making IOSurface volatile decreases phys_footprint"); - T_EXPECT_EQ(footprint_after, footprint_expected, - "made volatile %lld bytes: " - "footprint %lld -> %lld expected %lld delta %lld", - surface_size, footprint_before, footprint_after, - footprint_expected, footprint_after - footprint_expected); -#endif /* LEGACY_FOOTPRINT */ - - /* make IOSurface non-volatile: footprint grows */ - get_ledger_info(&footprint_before, &pagetable_before); - IOSurfaceSetPurgeable(surface, kIOSurfacePurgeableNonVolatile, &old_state); - get_ledger_info(&footprint_after, &pagetable_after); -#if LEGACY_FOOTPRINT - footprint_expected = footprint_before; - footprint_expected += (pagetable_after - pagetable_before); - T_LOG("LEGACY FOOTPRINT: non-volatile IOSurface: no footprint impact"); - T_EXPECT_EQ(footprint_after, footprint_expected, - "non-volatile IOSurface %lld bytes: " - "footprint %lld -> %lld expected %lld delta %lld", - surface_size, footprint_before, footprint_after, - footprint_expected, footprint_after - footprint_expected); -#else /* LEGACY_FOOTPRINT */ - footprint_expected = footprint_before + surface_size; - footprint_expected += (pagetable_after - pagetable_before); - T_LOG("making IOSurface non-volatile increases phys_footprint"); - T_EXPECT_EQ(footprint_after, footprint_expected, - "made non-volatile %lld bytes: " - "footprint %lld -> %lld expected %lld delta %lld", - surface_size, footprint_before, footprint_after, - footprint_expected, footprint_after - footprint_expected); -#endif /* LEGACY_FOOTPRINT */ - - /* accessing IOSurface re-mapping: no footprint impact */ - - /* deallocating IOSurface re-mapping: no footprint impact */ - - /* release IOSurface: footprint shrinks */ - get_ledger_info(&footprint_before, &pagetable_before); - CFRelease(surface); - get_ledger_info(&footprint_after, &pagetable_after); -#if LEGACY_FOOTPRINT - footprint_expected = footprint_before; - footprint_expected += (pagetable_after - pagetable_before); - T_LOG("LEGACY FOOTPRINT: release IOSurface: no footprint impact"); - T_EXPECT_EQ(footprint_after, footprint_expected, - "releasing IOSurface %lld bytes: " - "footprint %lld -> %lld expected %lld delta %lld", - surface_size, footprint_before, footprint_after, - footprint_expected, footprint_after - footprint_expected); -#else /* LEGACY_FOOTPRINT */ - footprint_expected = footprint_before - surface_size; - footprint_expected += (pagetable_after - pagetable_before); - T_LOG("releasing IOSurface decreases phys_footprint"); - T_EXPECT_EQ(footprint_after, footprint_expected, - "released IOSurface %lld bytes: " - "footprint %lld -> %lld expected %lld delta %lld", - surface_size, footprint_before, footprint_after, - footprint_expected, footprint_after - footprint_expected); -#endif /* LEGACY_FOOTPRINT */ -} diff --git a/tests/vm_set_max_addr_test.c b/tests/vm_set_max_addr_test.c index ac03b77ff..e1c06259f 100644 --- a/tests/vm_set_max_addr_test.c +++ b/tests/vm_set_max_addr_test.c @@ -8,6 +8,8 @@ #include #include +T_GLOBAL_META(T_META_RUN_CONCURRENTLY(true)); + extern char * testpath; T_DECL(set_max_addr, diff --git a/tests/voucher_entry_18826844.c b/tests/voucher_entry_18826844.c index f5107ff84..9b6ea0837 100644 --- a/tests/voucher_entry_18826844.c +++ b/tests/voucher_entry_18826844.c @@ -10,6 +10,8 @@ #include #include +T_GLOBAL_META(T_META_RUN_CONCURRENTLY(true)); + T_DECL(voucher_entry, "voucher_entry", T_META_CHECK_LEAKS(false), T_META_ALL_VALID_ARCHS(true)) { kern_return_t kr = KERN_SUCCESS; diff --git a/tests/voucher_traps.c b/tests/voucher_traps.c index 0b4967720..6731d3bb1 100644 --- a/tests/voucher_traps.c +++ b/tests/voucher_traps.c @@ -20,6 +20,7 @@ #include +T_GLOBAL_META(T_META_RUN_CONCURRENTLY(true)); static mach_port_t get_atm_voucher(void) diff --git a/tests/work_interval_test.c b/tests/work_interval_test.c index c80267309..c46de4069 100644 --- a/tests/work_interval_test.c +++ b/tests/work_interval_test.c @@ -13,7 +13,8 @@ #include -T_GLOBAL_META(T_META_NAMESPACE("xnu.scheduler")); +T_GLOBAL_META(T_META_NAMESPACE("xnu.scheduler"), + T_META_RUN_CONCURRENTLY(true)); static mach_port_t port = MACH_PORT_NULL; diff --git a/tests/workq_sigprof.c b/tests/workq_sigprof.c index 458307962..deb7d3792 100644 --- a/tests/workq_sigprof.c +++ b/tests/workq_sigprof.c @@ -12,6 +12,8 @@ #if !TARGET_OS_IPHONE +T_GLOBAL_META(T_META_RUN_CONCURRENTLY(true)); + static pthread_t workq_thread; static bool signal_received; diff --git a/tests/xnu_quick_test.c b/tests/xnu_quick_test.c index 412f8558a..81b4bdbc7 100644 --- a/tests/xnu_quick_test.c +++ b/tests/xnu_quick_test.c @@ -10,15 +10,15 @@ #include #include -T_GLOBAL_META(T_META_NAMESPACE("xnu.quicktest"), T_META_CHECK_LEAKS(false)); +T_GLOBAL_META( + T_META_NAMESPACE("xnu.quicktest"), + T_META_CHECK_LEAKS(false), + T_META_RUN_CONCURRENTLY(true) + ); + char g_target_path[PATH_MAX]; -/* ************************************************************************************************************** - * Test the syscall system call. - * ************************************************************************************************************** - */ -T_DECL(syscall, - "xnu_quick_test for syscall", T_META_CHECK_LEAKS(NO)) +T_DECL(syscall, "xnu_quick_test for syscall") { int my_fd = -1; char * my_pathp; @@ -59,12 +59,8 @@ T_DECL(syscall, T_ATEND(remove_target_directory); } -/* ************************************************************************************************************** - * Test fork wait4, and exit system calls. - * ************************************************************************************************************** - */ T_DECL(fork_wait4_exit, - "Tests forking off a process and waiting for the child to exit", T_META_CHECK_LEAKS(false)) + "Tests forking off a process and waiting for the child to exit") { int my_err, my_status; pid_t my_pid, my_wait_pid; @@ -104,15 +100,15 @@ T_DECL(fork_wait4_exit, "check if wait4 returns right exit status"); } -T_DECL(getrusage, "Sanity check of getrusage") +T_DECL(getrusage, "check getrusage works") { - struct rusage my_rusage; - - T_WITH_ERRNO; - T_ASSERT_EQ(getrusage( RUSAGE_SELF, &my_rusage ), 0, NULL); - T_LOG("Checking that getrusage returned sane values"); - T_EXPECT_LT(my_rusage.ru_msgrcv, 1000, NULL); - T_EXPECT_GE(my_rusage.ru_msgrcv, 0, NULL); - T_EXPECT_LT(my_rusage.ru_nsignals, 1000, NULL); - T_EXPECT_GE(my_rusage.ru_nsignals, 0, NULL); + struct rusage rubuf; + + int ret = getrusage(RUSAGE_SELF, &rubuf); + T_ASSERT_POSIX_SUCCESS(ret, "getrusage for self"); + + T_EXPECT_LT(rubuf.ru_msgrcv, 1000, "upper bound on messages received"); + T_EXPECT_GE(rubuf.ru_msgrcv, 0, "lower bound on messages reseived"); + T_EXPECT_LT(rubuf.ru_nsignals, 1000, "upper bound on signals"); + T_EXPECT_GE(rubuf.ru_nsignals, 0, "lower bound on signals"); } diff --git a/tests/xnu_quick_test_entitled.c b/tests/xnu_quick_test_entitled.c index b3d6a9d4b..24c96e43f 100644 --- a/tests/xnu_quick_test_entitled.c +++ b/tests/xnu_quick_test_entitled.c @@ -9,11 +9,15 @@ #include #include -#if !TARGET_OS_EMBEDDED +#if !(TARGET_OS_IPHONE && !TARGET_OS_SIMULATOR) #include #endif -T_GLOBAL_META(T_META_NAMESPACE("xnu.quicktest"), T_META_CHECK_LEAKS(false)); +T_GLOBAL_META( + T_META_NAMESPACE("xnu.quicktest"), + T_META_CHECK_LEAKS(false), + T_META_RUN_CONCURRENTLY(true) + ); /* ************************************************************************************************************** @@ -31,7 +35,7 @@ T_DECL(ioctl, "Sanity check of ioctl by exercising DKIOCGETBLOCKCOUNT and DKIOCG long long my_block_count; char my_name[MAXPATHLEN]; -#if !TARGET_OS_EMBEDDED +#if !(TARGET_OS_IPHONE && !TARGET_OS_SIMULATOR) /* * this test won't be able to open the root disk device unless CSR is * disabled or in AppleInternal mode diff --git a/tests/xnu_quick_test_getsetpriority.c b/tests/xnu_quick_test_getsetpriority.c index ec62af549..4772c5912 100644 --- a/tests/xnu_quick_test_getsetpriority.c +++ b/tests/xnu_quick_test_getsetpriority.c @@ -5,7 +5,11 @@ #include #include -T_GLOBAL_META(T_META_NAMESPACE("xnu.quicktest"), T_META_CHECK_LEAKS(false)); +T_GLOBAL_META( + T_META_NAMESPACE("xnu.quicktest"), + T_META_CHECK_LEAKS(false), + T_META_RUN_CONCURRENTLY(true) + ); T_DECL(getpriority_setpriority, "Tests getpriority and setpriority system calls", T_META_ASROOT(true)) { diff --git a/tools/cocci/OSAtomic_rewrite.cocci b/tools/cocci/OSAtomic_rewrite.cocci new file mode 100644 index 000000000..6c34e2a04 --- /dev/null +++ b/tools/cocci/OSAtomic_rewrite.cocci @@ -0,0 +1,202 @@ +// To apply, at the top of xnu.git: +// $ spatch --max-width=120 --use-gitgrep --in-place --include-headers --sp-file tools/cocci/OSAtomic_rewrite.cocci -dir . +// +// coccinelle insists on adding a space for (void) casts which can be fixed with: +// $ git grep -l '(void) os_atomic' | xargs -n1 sed -i '' -e 's/(void) os_atomic/(void)os_atomic/' + +@@ expression E; @@ + +( +- OSIncrementAtomic(E) ++ os_atomic_inc_orig(E, relaxed) +| +- OSIncrementAtomic8(E) ++ os_atomic_inc_orig(E, relaxed) +| +- OSIncrementAtomic16(E) ++ os_atomic_inc_orig(E, relaxed) +| +- OSIncrementAtomic32(E) ++ os_atomic_inc_orig(E, relaxed) +| +- OSIncrementAtomic64(E) ++ os_atomic_inc_orig(E, relaxed) +| +- OSIncrementAtomicLong(E) ++ os_atomic_inc_orig(E, relaxed) +| +- OSAddAtomic(1, E) ++ os_atomic_inc_orig(E, relaxed) +| +- OSAddAtomic8(1, E) ++ os_atomic_inc_orig(E, relaxed) +| +- OSAddAtomic16(1, E) ++ os_atomic_inc_orig(E, relaxed) +| +- OSAddAtomic32(1, E) ++ os_atomic_inc_orig(E, relaxed) +| +- OSAddAtomic64(1, E) ++ os_atomic_inc_orig(E, relaxed) +| +- OSAddAtomicLong(1, E) ++ os_atomic_inc_orig(E, relaxed) +| +- OSDecrementAtomic(E) ++ os_atomic_dec_orig(E, relaxed) +| +- OSDecrementAtomic8(E) ++ os_atomic_dec_orig(E, relaxed) +| +- OSDecrementAtomic16(E) ++ os_atomic_dec_orig(E, relaxed) +| +- OSDecrementAtomic32(E) ++ os_atomic_dec_orig(E, relaxed) +| +- OSDecrementAtomic64(E) ++ os_atomic_dec_orig(E, relaxed) +| +- OSDecrementAtomicLong(E) ++ os_atomic_dec_orig(E, relaxed) +| +- OSAddAtomic(-1, E) ++ os_atomic_dec_orig(E, relaxed) +| +- OSAddAtomic8(-1, E) ++ os_atomic_dec_orig(E, relaxed) +| +- OSAddAtomic16(-1, E) ++ os_atomic_dec_orig(E, relaxed) +| +- OSAddAtomic32(-1, E) ++ os_atomic_dec_orig(E, relaxed) +| +- OSAddAtomic64(-1, E) ++ os_atomic_dec_orig(E, relaxed) +| +- OSAddAtomicLong(-1, E) ++ os_atomic_dec_orig(E, relaxed) +) + +@@ expression E, F; @@ + +( +- OSAddAtomic(-F, E) ++ os_atomic_sub_orig(E, F, relaxed) +| +- OSAddAtomic8(-F, E) ++ os_atomic_sub_orig(E, F, relaxed) +| +- OSAddAtomic16(-F, E) ++ os_atomic_sub_orig(E, F, relaxed) +| +- OSAddAtomic32(-F, E) ++ os_atomic_sub_orig(E, F, relaxed) +| +- OSAddAtomic64(-F, E) ++ os_atomic_sub_orig(E, F, relaxed) +| +- OSAddAtomicLong(-F, E) ++ os_atomic_sub_orig(E, F, relaxed) +| +- OSAddAtomic(F, E) ++ os_atomic_add_orig(E, F, relaxed) +| +- OSAddAtomic8(F, E) ++ os_atomic_add_orig(E, F, relaxed) +| +- OSAddAtomic16(F, E) ++ os_atomic_add_orig(E, F, relaxed) +| +- OSAddAtomic32(F, E) ++ os_atomic_add_orig(E, F, relaxed) +| +- OSAddAtomic64(F, E) ++ os_atomic_add_orig(E, F, relaxed) +| +- OSAddAtomicLong(F, E) ++ os_atomic_add_orig(E, F, relaxed) +| +- OSBitOrAtomic(F, E) ++ os_atomic_or_orig(E, F, relaxed) +| +- OSBitOrAtomic8(F, E) ++ os_atomic_or_orig(E, F, relaxed) +| +- OSBitOrAtomic16(F, E) ++ os_atomic_or_orig(E, F, relaxed) +| +- OSBitOrAtomic32(F, E) ++ os_atomic_or_orig(E, F, relaxed) +| +- OSBitOrAtomic64(F, E) ++ os_atomic_or_orig(E, F, relaxed) +| +- OSBitOrAtomicLong(F, E) ++ os_atomic_or_orig(E, F, relaxed) +| +- OSBitXorAtomic(F, E) ++ os_atomic_xor_orig(E, F, relaxed) +| +- OSBitXorAtomic8(F, E) ++ os_atomic_xor_orig(E, F, relaxed) +| +- OSBitXorAtomic16(F, E) ++ os_atomic_xor_orig(E, F, relaxed) +| +- OSBitXorAtomic32(F, E) ++ os_atomic_xor_orig(E, F, relaxed) +| +- OSBitXorAtomic64(F, E) ++ os_atomic_xor_orig(E, F, relaxed) +| +- OSBitXorAtomicLong(F, E) ++ os_atomic_xor_orig(E, F, relaxed) +| +- OSBitAndAtomic(F, E) ++ os_atomic_and_orig(E, F, relaxed) +| +- OSBitAndAtomic8(F, E) ++ os_atomic_and_orig(E, F, relaxed) +| +- OSBitAndAtomic16(F, E) ++ os_atomic_and_orig(E, F, relaxed) +| +- OSBitAndAtomic32(F, E) ++ os_atomic_and_orig(E, F, relaxed) +| +- OSBitAndAtomic64(F, E) ++ os_atomic_and_orig(E, F, relaxed) +| +- OSBitAndAtomicLong(F, E) ++ os_atomic_and_orig(E, F, relaxed) +) + +@@ expression E, F, A; @@ + +( +- OSCompareAndSwap(F, E, A) ++ os_atomic_cmpxchg(A, E, F, acq_rel) +| +- OSCompareAndSwapPtr(F, E, A) ++ os_atomic_cmpxchg(A, E, F, acq_rel) +| +- OSCompareAndSwap8(F, E, A) ++ os_atomic_cmpxchg(A, E, F, acq_rel) +| +- OSCompareAndSwap16(F, E, A) ++ os_atomic_cmpxchg(A, E, F, acq_rel) +| +- OSCompareAndSwap32(F, E, A) ++ os_atomic_cmpxchg(A, E, F, acq_rel) +| +- OSCompareAndSwap64(F, E, A) ++ os_atomic_cmpxchg(A, E, F, acq_rel) +| +- OSCompareAndSwapLong(F, E, A) ++ os_atomic_cmpxchg(A, E, F, acq_rel) +) + +// vim:ft=diff: diff --git a/tools/cocci/c11_atomic_builtin_rewrite.cocci b/tools/cocci/c11_atomic_builtin_rewrite.cocci new file mode 100644 index 000000000..7072ed0c3 --- /dev/null +++ b/tools/cocci/c11_atomic_builtin_rewrite.cocci @@ -0,0 +1,162 @@ +// To apply, at the top of xnu.git: +// $ spatch --max-width=120 --use-gitgrep --in-place --include-headers --sp-file tools/cocci/c11_atomic_builtin_rewrite.cocci + +@memory_order@ +identifier m =~ "(memory_order_(relaxed|consume|acquire|release|acq_rel|seq_cst)(|_smp)|__ATOMIC_(RELAXED|CONSUME|ACQUIRE|RELEASE|ACQ_REL|SEQ_CST))"; +@@ + +m + +@script:ocaml os_memory_order@ +m << memory_order.m; +new_m; +@@ + +new_m := make_ident (String.lowercase_ascii (Str.global_replace (Str.regexp "memory_order_\\|__ATOMIC_\\|_smp") "" m)) + +@fence@ +identifier memory_order.m; +identifier os_memory_order.new_m; +@@ + +- __c11_atomic_thread_fence(m) ++ os_atomic_thread_fence(new_m) + +@load@ +expression E; +type T; +identifier memory_order.m; +identifier os_memory_order.new_m; +@@ + +- __c11_atomic_load ++ os_atomic_load + ( +( +-((T)E) ++E +| +-(T)E ++E +| +E +) + , +-m ++new_m + ) + +@inc@ +expression E; +type T; +identifier memory_order.m; +identifier os_memory_order.new_m; +@@ + +- __c11_atomic_fetch_add ++ os_atomic_inc_orig + ( +( +-((T)E) ++E +| +-(T)E ++E +| +E +) + , +-1, m ++new_m + ) + +@dec@ +expression E; +type T; +identifier memory_order.m; +identifier os_memory_order.new_m; +@@ + +- __c11_atomic_fetch_sub ++ os_atomic_dec_orig + ( +( +-((T)E) ++E +| +-(T)E ++E +| +E +) + , +-1, m ++new_m + ) + +@single_arg@ +expression E, F; +type T; +identifier memory_order.m; +identifier os_memory_order.new_m; +@@ + +( +- __c11_atomic_store ++ os_atomic_store +| +- __c11_atomic_fetch_add ++ os_atomic_add_orig +| +- __c11_atomic_fetch_sub ++ os_atomic_sub_orig +| +- __c11_atomic_fetch_and ++ os_atomic_and_orig +| +- __c11_atomic_fetch_or ++ os_atomic_or_orig +| +- __c11_atomic_fetch_xor ++ os_atomic_xor_orig +) + ( +( +-((T)E) ++E +| +-(T)E ++E +| +E +) + , F, +-m ++new_m + ) + +@cmpxcgh@ +expression E, F, G; +type T; +identifier memory_order.m; +identifier os_memory_order.new_m; +@@ + +- __c11_atomic_compare_exchange_strong ++ os_atomic_cmpxchgv + ( +( +-((T)E) ++E +| +-(T)E ++E +| +E +) + , +- &F, G, m, memory_order_relaxed ++ F, G, &F, new_m + ) + +// vim:ft=diff: diff --git a/tools/cocci/hw_atomic_rewrite.cocci b/tools/cocci/hw_atomic_rewrite.cocci new file mode 100644 index 000000000..d4e8b2f1d --- /dev/null +++ b/tools/cocci/hw_atomic_rewrite.cocci @@ -0,0 +1,96 @@ +// To apply, at the top of xnu.git: +// $ spatch --max-width=120 --use-gitgrep --in-place --include-headers --sp-file tools/cocci/hw_atomic_rewrite.cocci -dir . +// +// coccinelle insists on adding a space for (void) casts which can be fixed with: +// $ git grep -l 'os_atomic' | xargs -n1 sed -i '' -e 's/os_atomic/os_atomic/' + +@@ expression E, F; @@ // hw_atomic_add -> os_atomic_{inc,dec} + +( +- hw_atomic_add(E, -1) + 1 ++ os_atomic_dec_orig(E, relaxed) +| +- hw_atomic_add(E, -1) ++ os_atomic_dec(E, relaxed) +| +- hw_atomic_add(E, -F) + F ++ os_atomic_sub_orig(E, F, relaxed) +| +- hw_atomic_add(E, -F) ++ os_atomic_sub(E, F, relaxed) +| +- hw_atomic_add(E, 1) - 1 ++ os_atomic_inc_orig(E, relaxed) +| +- hw_atomic_add(E, 1) ++ os_atomic_inc(E, relaxed) +| +- hw_atomic_add(E, F) - F ++ os_atomic_add_orig(E, F, relaxed) +| +- hw_atomic_add(E, F) ++ os_atomic_add(E, F, relaxed) +) + +@@ expression E, F; @@ // hw_atomic_sub -> os_atomic_{inc,dec} + +( +- hw_atomic_sub(E, -1) - 1 ++ os_atomic_inc_orig(E, relaxed) +| +- hw_atomic_sub(E, -1) ++ os_atomic_inc(E, relaxed) +| +- hw_atomic_sub(E, -F) - F ++ os_atomic_add_orig(E, F, relaxed) +| +- hw_atomic_sub(E, -F) ++ os_atomic_add(E, F, relaxed) +| +- hw_atomic_sub(E, 1) + 1 ++ os_atomic_dec_orig(E, relaxed) +| +- hw_atomic_sub(E, 1) ++ os_atomic_dec(E, relaxed) +| +- hw_atomic_sub(E, F) + F ++ os_atomic_sub_orig(E, F, relaxed) +| +- hw_atomic_sub(E, F) ++ os_atomic_sub(E, F, relaxed) +) + +@@ expression E, F; @@ // hw_atomic_and -> os_atomic_and + +( +- hw_atomic_and(E, ~F) ++ os_atomic_andnot(E, F, relaxed) +| +- hw_atomic_and(E, F) ++ os_atomic_and(E, F, relaxed) +| +- hw_atomic_and_noret(E, ~F) ++ os_atomic_andnot(E, F, relaxed) +| +- hw_atomic_and_noret(E, F) ++ os_atomic_and(E, F, relaxed) +) + +@@ expression E, F; @@ // hw_atomic_or -> os_atomic_or + +( +- hw_atomic_or(E, F) ++ os_atomic_or(E, F, relaxed) +| +- hw_atomic_or_noret(E, F) ++ os_atomic_or(E, F, relaxed) +) + +@@ expression E, F, A; @@ // hw_compare_and_store + +( +- hw_compare_and_store(E, F, A) ++ os_atomic_cmpxchg(A, E, F, acq_rel) +) + +// vim:ft=diff: diff --git a/tools/cocci/mcache_atomic_rewrite.cocci b/tools/cocci/mcache_atomic_rewrite.cocci new file mode 100644 index 000000000..f5f1ec9a4 --- /dev/null +++ b/tools/cocci/mcache_atomic_rewrite.cocci @@ -0,0 +1,159 @@ +// To apply, at the top of xnu.git: +// $ spatch --max-width=120 --use-gitgrep --in-place --include-headers --sp-file tools/cocci/mcache_atomic_rewrite.cocci -dir . +// +// coccinelle insists on adding a space for (void) casts which can be fixed with: +// $ git grep -l '(void) os_atomic' | xargs -n1 sed -i '' -e 's/(void) os_atomic/(void)os_atomic/' + +@@ expression E, F, A; @@ + +( +- atomic_add_16_ov(E, 1) ++ os_atomic_inc_orig(E, relaxed) +| +- atomic_add_16(E, 1) ++ os_atomic_inc(E, relaxed) +| +- atomic_add_32_ov(E, 1) ++ os_atomic_inc_orig(E, relaxed) +| +- atomic_add_32(E, 1) ++ os_atomic_inc(E, relaxed) +| +- atomic_add_64_ov(E, 1) ++ os_atomic_inc_orig(E, relaxed) +| +- atomic_add_64(E, 1) ++ os_atomic_inc(E, relaxed) +| +- atomic_add_16_ov(E, -1) ++ os_atomic_dec_orig(E, relaxed) +| +- atomic_add_16(E, -1) ++ os_atomic_dec(E, relaxed) +| +- atomic_add_32_ov(E, -1) ++ os_atomic_dec_orig(E, relaxed) +| +- atomic_add_32(E, -1) ++ os_atomic_dec(E, relaxed) +| +- atomic_add_64_ov(E, -1) ++ os_atomic_dec_orig(E, relaxed) +| +- atomic_add_64(E, -1) ++ os_atomic_dec(E, relaxed) +| +- atomic_add_16_ov(E, F) ++ os_atomic_add_orig(E, F, relaxed) +| +- atomic_add_16(E, F) ++ os_atomic_add(E, F, relaxed) +| +- atomic_add_32_ov(E, F) ++ os_atomic_add_orig(E, F, relaxed) +| +- atomic_add_32(E, F) ++ os_atomic_add(E, F, relaxed) +| +- atomic_add_64_ov(E, F) ++ os_atomic_add_orig(E, F, relaxed) +| +- atomic_add_64(E, F) ++ os_atomic_add(E, F, relaxed) +| +- atomic_test_set_32(A, E, F) ++ os_atomic_cmpxchg(A, E, F, acq_rel) +| +- atomic_test_set_64(A, E, F) ++ os_atomic_cmpxchg(A, E, F, acq_rel) +| +- atomic_test_set_ptr(A, E, F) ++ os_atomic_cmpxchg(A, E, F, acq_rel) +| +- atomic_set_32(E, F) ++ os_atomic_store(E, F, release) +| +- atomic_set_64(E, F) ++ os_atomic_store(E, F, release) +| +- atomic_set_ptr(E, F) ++ os_atomic_store(E, F, release) +| +- atomic_get_64(E, A) ++ E = os_atomic_load(A, relaxed) +| +- membar_sync() ++ os_atomic_thread_fence(seq_cst) +| +- atomic_or_8_ov(E, F) ++ os_atomic_or_orig(E, F, relaxed) +| +- atomic_or_16_ov(E, F) ++ os_atomic_or_orig(E, F, relaxed) +| +- atomic_or_32_ov(E, F) ++ os_atomic_or_orig(E, F, relaxed) +| +- atomic_or_8(E, F) ++ os_atomic_or(E, F, relaxed) +| +- atomic_or_16(E, F) ++ os_atomic_or(E, F, relaxed) +| +- atomic_or_32(E, F) ++ os_atomic_or(E, F, relaxed) +| +- atomic_and_8_ov(E, F) ++ os_atomic_and_orig(E, F, relaxed) +| +- atomic_and_16_ov(E, F) ++ os_atomic_and_orig(E, F, relaxed) +| +- atomic_and_32_ov(E, F) ++ os_atomic_and_orig(E, F, relaxed) +| +- atomic_and_8(E, F) ++ os_atomic_and(E, F, relaxed) +| +- atomic_and_16(E, F) ++ os_atomic_and(E, F, relaxed) +| +- atomic_and_32(E, F) ++ os_atomic_and(E, F, relaxed) +| +- atomic_bitset_8_ov(E, F) ++ os_atomic_or_orig(E, F, relaxed) +| +- atomic_bitset_16_ov(E, F) ++ os_atomic_or_orig(E, F, relaxed) +| +- atomic_bitset_32_ov(E, F) ++ os_atomic_or_orig(E, F, relaxed) +| +- atomic_bitset_8(E, F) ++ os_atomic_or(E, F, relaxed) +| +- atomic_bitset_16(E, F) ++ os_atomic_or(E, F, relaxed) +| +- atomic_bitset_32(E, F) ++ os_atomic_or(E, F, relaxed) +| +- atomic_bitclear_8_ov(E, F) ++ os_atomic_andnot_orig(E, F, relaxed) +| +- atomic_bitclear_16_ov(E, F) ++ os_atomic_andnot_orig(E, F, relaxed) +| +- atomic_bitclear_32_ov(E, F) ++ os_atomic_andnot_orig(E, F, relaxed) +| +- atomic_bitclear_8(E, F) ++ os_atomic_andnot(E, F, relaxed) +| +- atomic_bitclear_16(E, F) ++ os_atomic_andnot(E, F, relaxed) +| +- atomic_bitclear_32(E, F) ++ os_atomic_andnot(E, F, relaxed) +) diff --git a/tools/cocci/os_atomic_normalize.cocci b/tools/cocci/os_atomic_normalize.cocci new file mode 100644 index 000000000..efc1d4647 --- /dev/null +++ b/tools/cocci/os_atomic_normalize.cocci @@ -0,0 +1,94 @@ +// To apply, at the top of xnu.git: +// $ spatch --max-width=120 --use-gitgrep --in-place --include-headers --sp-file tools/cocci/os_atomic_normalize.cocci -dir . +// +// coccinelle insists on adding a space for (void) casts which can be fixed with: +// $ git grep -l '(void) os_atomic' | xargs -n1 sed -i '' -e 's/(void) os_atomic/(void)os_atomic/' + +@os_atomic@ +identifier fn =~ "^os_atomic"; +@@ + +fn + +@script:ocaml unorig@ +fn << os_atomic.fn; +new_fn; +@@ + +new_fn := make_ident (Str.global_replace (Str.regexp "_orig") "" fn) + +@@ +identifier os_atomic.fn; +identifier unorig.new_fn; +expression A, B, C; +@@ + +-(void)fn ++new_fn + (...) + +@@ expression E, F, m; @@ + +( +- os_atomic_add(E, 1, m) ++ os_atomic_inc(E, m) +| +- os_atomic_add_orig(E, 1, m) ++ os_atomic_inc_orig(E, m) +| +- os_atomic_sub(E, -1, m) ++ os_atomic_inc(E, m) +| +- os_atomic_sub_orig(E, -1, m) ++ os_atomic_inc_orig(E, m) +| +- os_atomic_add(E, -1, m) ++ os_atomic_dec(E, m) +| +- os_atomic_add_orig(E, -1, m) ++ os_atomic_dec_orig(E, m) +| +- os_atomic_sub(E, 1, m) ++ os_atomic_dec(E, m) +| +- os_atomic_sub_orig(E, 1, m) ++ os_atomic_dec_orig(E, m) +| +- os_atomic_add(E, -(F), m) ++ os_atomic_sub(E, F, m) +| +- os_atomic_add_orig(E, -(F), m) ++ os_atomic_sub_orig(E, F, m) +| +- os_atomic_add(E, -F, m) ++ os_atomic_sub(E, F, m) +| +- os_atomic_add_orig(E, -F, m) ++ os_atomic_sub_orig(E, F, m) +| +- os_atomic_sub(E, -(F), m) ++ os_atomic_add(E, F, m) +| +- os_atomic_sub_orig(E, -(F), m) ++ os_atomic_add_orig(E, F, m) +| +- os_atomic_sub(E, -F, m) ++ os_atomic_add(E, F, m) +| +- os_atomic_sub_orig(E, -F, m) ++ os_atomic_add_orig(E, F, m) +| +- os_atomic_and(E, ~(F), m) ++ os_atomic_andnot(E, F, m) +| +- os_atomic_and_orig(E, ~(F), m) ++ os_atomic_andnot_orig(E, F, m) +| +- os_atomic_and(E, ~F, m) ++ os_atomic_andnot(E, F, m) +| +- os_atomic_and_orig(E, ~F, m) ++ os_atomic_andnot_orig(E, F, m) +) + +// vim:ft=diff: diff --git a/tools/lldbmacros/Makefile b/tools/lldbmacros/Makefile index aec946f22..17d6b10ee 100644 --- a/tools/lldbmacros/Makefile +++ b/tools/lldbmacros/Makefile @@ -38,11 +38,13 @@ LLDBMACROS_PYTHON_FILES = $(LLDBMACROS_USERDEBUG_FILES) \ core/xnu_lldb_init.py \ plugins/__init__.py \ plugins/zprint_perf_log.py \ + sysregdoc/AArch64-esr_el1.xml \ atm.py \ bank.py \ turnstile.py \ kevent.py \ workqueue.py \ + ulock.py \ xnu.py \ xnudefines.py \ ktrace.py \ @@ -75,7 +77,8 @@ LLDBMACROS_PYTHON_FILES = $(LLDBMACROS_USERDEBUG_FILES) \ waitq.py \ pgtrace.py \ xnutriage.py \ - zonetriage.py + zonetriage.py \ + sysreg.py ifneq ($(PLATFORM),MacOSX) LLDBMACROS_PYTHON_FILES+= \ diff --git a/tools/lldbmacros/bank.py b/tools/lldbmacros/bank.py index e54ee3487..71bd84431 100755 --- a/tools/lldbmacros/bank.py +++ b/tools/lldbmacros/bank.py @@ -34,7 +34,7 @@ def GetBankTaskSummary(bank_task): """ format_str = "{0: <#020x} {1: <16d} {2: <#020x} {3: <16d} {4: <16d} {5: <16d} {6: <16d} {7: <16d}" - out_string = format_str.format(bank_task, bank_task.bt_proc_persona.pid, bank_task.bt_creditcard, unsigned(bank_task.bt_elem.be_refs), unsigned(bank_task.bt_elem.be_made), bank_task.bt_proc_persona.persona_id, bank_task.bt_proc_persona.uid, bank_task.bt_proc_persona.gid) + out_string = format_str.format(bank_task, bank_task.bt_proc_persona.pid, bank_task.bt_ledger, unsigned(bank_task.bt_elem.be_refs), unsigned(bank_task.bt_elem.be_made), bank_task.bt_proc_persona.persona_id, bank_task.bt_proc_persona.uid, bank_task.bt_proc_persona.gid) #if DEVELOPMENT format_str = "{0: <#020x} {1: <20s}" diff --git a/tools/lldbmacros/core/cvalue.py b/tools/lldbmacros/core/cvalue.py index 3b1c4eadd..e58c7752f 100755 --- a/tools/lldbmacros/core/cvalue.py +++ b/tools/lldbmacros/core/cvalue.py @@ -295,15 +295,29 @@ class value(object): return content def _GetValueAsSigned(self): + if self._sbval19k84obscure747_is_ptr: + print "ERROR: You cannot get 'int' from pointer type %s, please use unsigned(obj) for such purposes." % str(self._sbval19k84obscure747_type) + raise ValueError("Cannot get signed int for pointer data.") serr = lldb.SBError() retval = self._sbval19k84obscure747.GetValueAsSigned(serr) if serr.success: return retval raise ValueError("Failed to read signed data. "+ str(self._sbval19k84obscure747) +"(type =" + str(self._sbval19k84obscure747_type) + ") Error description: " + serr.GetCString()) - + + def _GetValueAsCast(self, dest_type): + if type(dest_type) is not lldb.SBType: + raise ValueError("Invalid type for dest_type: {}".format(type(dest_type))) + addr = self._GetValueAsUnsigned() + sbval = self._sbval19k84obscure747.target.CreateValueFromExpression("newname", "(void *)"+str(addr)) + val = value(sbval.Cast(dest_type)) + return val + def _GetValueAsUnsigned(self): serr = lldb.SBError() - retval = self._sbval19k84obscure747.GetValueAsUnsigned(serr) + if self._sbval19k84obscure747_is_ptr: + retval = self._sbval19k84obscure747.GetValueAsAddress() + else: + retval = self._sbval19k84obscure747.GetValueAsUnsigned(serr) if serr.success: return retval raise ValueError("Failed to read unsigned data. "+ str(self._sbval19k84obscure747) +"(type =" + str(self._sbval19k84obscure747_type) + ") Error description: " + serr.GetCString()) @@ -311,7 +325,7 @@ class value(object): def _GetValueAsString(self, offset = 0, maxlen = 1024): serr = lldb.SBError() sbdata = None - if self._sbval19k84obscure747.TypeIsPointerType(): + if self._sbval19k84obscure747_is_ptr: sbdata = self._sbval19k84obscure747.GetPointeeData(offset, maxlen) else: sbdata = self._sbval19k84obscure747.GetData() @@ -381,7 +395,7 @@ def dereference(val): obj_ptr = (int *)0x1234 #C val = *obj_ptr #C """ - if type(val) is value and val.GetSBValue().TypeIsPointerType(): + if type(val) is value and val._sbval19k84obscure747_is_ptr: return value(val.GetSBValue().Dereference()) raise TypeError('Cannot dereference this type.') @@ -410,8 +424,8 @@ def cast(obj, target_type): elif type(target_type) is value: dest_type = target_type.GetSBValue().GetType() - if type(obj) is value : - return value(obj.GetSBValue().Cast(dest_type)) + if type(obj) is value: + return obj._GetValueAsCast(dest_type) elif type(obj) is int: print "ERROR: You cannot cast an 'int' to %s, please use kern.GetValueFromAddress() for such purposes." % str(target_type) raise TypeError("object of type %s cannot be casted to %s" % (str(type(obj)), str(target_type))) diff --git a/tools/lldbmacros/core/kernelcore.py b/tools/lldbmacros/core/kernelcore.py index 43a3bd864..ff2376e2e 100755 --- a/tools/lldbmacros/core/kernelcore.py +++ b/tools/lldbmacros/core/kernelcore.py @@ -223,7 +223,7 @@ def IterateRBTreeEntry(element, element_type, field_name): elt = cast(elt, element_type) -def IteratePriorityQueueEntry(root, element_type, field_name): +def IteratePriorityQueue(root, element_type, field_name): """ iterate over a priority queue as defined with struct priority_queue from osfmk/kern/priority_queue.h root - value : Value object for the priority queue element_type - str : Type of the link element @@ -246,6 +246,19 @@ def IteratePriorityQueueEntry(root, element_type, field_name): if addr: queue.append(addr) elt = elt.next +def IterateMPSCQueue(root, element_type, field_name): + """ iterate over an MPSC queue as defined with struct mpsc_queue_head from osfmk/kern/mpsc_queue.h + root - value : Value object for the mpsc queue + element_type - str : Type of the link element + field_name - str : Name of the field in link element's structure + returns: + A generator does not return. It is used for iterating + value : an object thats of type (element_type). Always a pointer object + """ + elt = root.mpqh_head.mpqc_next + while unsigned(elt): + yield containerof(elt, element_type, field_name) + elt = elt.mpqc_next class KernelTarget(object): """ A common kernel object that provides access to kernel objects and information. @@ -327,6 +340,7 @@ class KernelTarget(object): addr = int(addr, 16) else: addr = int(addr) + addr = self.StripKernelPAC(addr) ret_array = [] symbolicator = self._GetSymbolicator() syms = symbolicator.symbolicate(addr) @@ -424,6 +438,17 @@ class KernelTarget(object): val = ((addr + size) & (unsigned(self.GetGlobalVariable("page_size"))-1)) return (val < size and val > 0) + def StripUserPAC(self, addr): + if self.arch != 'arm64e': + return addr + T0Sz = self.GetGlobalVariable('gT0Sz') + return StripPAC(addr, T0Sz) + + def StripKernelPAC(self, addr): + if self.arch != 'arm64e': + return addr + T1Sz = self.GetGlobalVariable('gT1Sz') + return StripPAC(addr, T1Sz) def PhysToKVARM64(self, addr): ptov_table = self.GetGlobalVariable('ptov_table') diff --git a/tools/lldbmacros/core/operating_system.py b/tools/lldbmacros/core/operating_system.py index 2e7e21847..c1fc18cc3 100755 --- a/tools/lldbmacros/core/operating_system.py +++ b/tools/lldbmacros/core/operating_system.py @@ -649,6 +649,32 @@ def IterateQueue(queue_head, element_ptr_type, element_field_name): yield elt cur_elt = elt.GetChildMemberWithName(element_field_name).GetChildMemberWithName('next') +def IterateCircleQueue(queue_head, element_ptr_type, element_field_name): + """ iterate over a circle queue in kernel of type circle_queue_head_t. refer to osfmk/kern/circle_queue.h + params: + queue_head - lldb.SBValue : Value object for queue_head. + element_type - lldb.SBType : a pointer type of the element 'next' points to. Typically its structs like thread, task etc.. + element_field_name - str : name of the field in target struct. + returns: + A generator does not return. It is used for iterating. + SBValue : an object thats of type (element_type) queue_head->next. Always a pointer object + """ + head = queue_head.head + queue_head_addr = 0x0 + if head.TypeIsPointerType(): + queue_head_addr = head.GetValueAsUnsigned() + else: + queue_head_addr = head.GetAddress().GetLoadAddress(osplugin_target_obj) + cur_elt = head + while True: + if not cur_elt.IsValid() or cur_elt.GetValueAsUnsigned() == 0: + break + elt = cur_elt.Cast(element_ptr_type) + yield elt + cur_elt = elt.GetChildMemberWithName(element_field_name).GetChildMemberWithName('next') + if cur_elt.GetValueAsUnsigned() == queue_head_addr: + break + def GetUniqueSessionID(process_obj): """ Create a unique session identifier. params: diff --git a/tools/lldbmacros/core/standard.py b/tools/lldbmacros/core/standard.py index bb96a17cd..9df6b7635 100755 --- a/tools/lldbmacros/core/standard.py +++ b/tools/lldbmacros/core/standard.py @@ -4,7 +4,7 @@ import sys import re class ArgumentError(Exception): - """ Exception class for raising errors in command arguments. The lldb_command framework will catch this + """ Exception class for raising errors in command arguments. The lldb_command framework will catch this class of exceptions and print suitable error message to user. """ def __init__(self, msg): @@ -28,77 +28,199 @@ class RedirectStdStreams(object): sys.stdout = self.old_stdout sys.stderr = self.old_stderr +class IndentScope(object): + def __init__(self, O): + self._O = O + + def __enter__(self): + self._O._indent += ' ' + + def __exit__(self, exc_type, exc_value, traceback): + self._O._indent = self._O._indent[:-4] + +class HeaderScope(object): + def __init__(self, O, hdr, indent = False): + self._O = O + self._header = hdr + self._indent = indent + + def __enter__(self): + self._oldHeader = self._O._header + self._oldLastHeader = self._O._lastHeader + self._O._header = self._header + self._O._lastHeader = None + if self._indent: + self._O._indent += ' ' + + def __exit__(self, exc_type, exc_value, traceback): + self._O._header = self._oldHeader + self._O._lastHeader = self._oldLastHeader + if self._indent: + self._O._indent = self._O._indent[:-4] + +class VT(object): + Black = "\033[38;5;0m" + DarkRed = "\033[38;5;1m" + DarkGreen = "\033[38;5;2m" + Brown = "\033[38;5;3m" + DarkBlue = "\033[38;5;4m" + DarkMagenta = "\033[38;5;5m" + DarkCyan = "\033[38;5;6m" + Grey = "\033[38;5;7m" + + DarkGrey = "\033[38;5;8m" + Red = "\033[38;5;9m" + Green = "\033[38;5;10m" + Yellow = "\033[38;5;11m" + Blue = "\033[38;5;12m" + Magenta = "\033[38;5;13m" + Cyan = "\033[38;5;14m" + White = "\033[38;5;15m" + + Default = "\033[39m" + + Bold = "\033[1m" + EndBold = "\033[22m" + + Oblique = "\033[3m" + EndOblique = "\033[23m" + + Underline = "\033[4m" + EndUnderline = "\033[24m" + + Reset = "\033[0m" + +class NOVT(object): + def __getattribute__(self, *args): + return "" + class CommandOutput(object): """ - An output handler for all commands. Use Output.print to direct all output of macro via the handler. + An output handler for all commands. Use Output.print to direct all output of macro via the handler. These arguments are passed after a "--". eg (lldb) zprint -- -o /tmp/zprint.out.txt - - Currently this provide capabilities + + Currently this provide capabilities + -h show help -o path/to/filename - The output of this command execution will be saved to file. Parser information or errors will + The output of this command execution will be saved to file. Parser information or errors will not be sent to file though. eg /tmp/output.txt -s filter_string - the "filter_string" param is parsed to python regex expression and each line of output - will be printed/saved only if it matches the expression. + the "filter_string" param is parsed to python regex expression and each line of output + will be printed/saved only if it matches the expression. The command header will not be filtered in any case. + -p + Send the output of the command to plugin. + -v ... + Up verbosity + -c + configure color """ - def __init__(self, cmd_name, CommandResult): + def __init__(self, cmd_name, CommandResult=None, fhandle=None): """ Create a new instance to handle command output. params: - CommandResult : SBCommandReturnObject result param from lldb's command invocation. + CommandResult : SBCommandReturnObject result param from lldb's command invocation. """ self.fname=None - self.fhandle=None + self.fhandle=fhandle self.FILTER=False self.pluginRequired = False self.pluginName = None self.cmd_name = cmd_name self.resultObj = CommandResult - self.immediateOutput = False self.verbose_level = 0 self.target_cmd_args = [] self.target_cmd_options = {} + self.color = None + self.isatty = os.isatty(sys.__stdout__.fileno()) + self._indent = '' + self._buffer = '' - def write(self, s): - """ Handler for all commands output. By default just print to stdout """ - if self.FILTER and not self.reg.search(s): - return - if self.FILTER: - s += "\n" + self._header = None + self._lastHeader = None + self._line = 0 + def _write(self, s): if self.fhandle != None: - self.fhandle.write(s) + self.fhandle.write(self._indent + s + "\n") else: - if self.immediateOutput: - sys.__stdout__.write(s) - else: - res_str = s - if s.endswith("\n"): - res_str = s[:-1] - if self.resultObj and len(res_str) > 0: self.resultObj.AppendMessage(res_str) + self.resultObj.AppendMessage(self._indent + s) + self._line += 1 + + def _doColor(self): + if self.color is True: + return True; + return self.color is None and self.isatty + + def _needsHeader(self): + if self._header is None: + return False + if self._lastHeader is None: + return True + if not self.isatty: + return False + return self._line - self._lastHeader > 40 + + def indent(self): + return IndentScope(self) + + def table(self, header, indent = False): + return HeaderScope(self, header, indent) + + def format(self, s, *args, **kwargs): + if self._doColor(): + kwargs['VT'] = VT + else: + kwargs['VT'] = NOVT() + + return s.format(*args, **kwargs) + + def error(self, s, *args, **kwargs): + print self.format("{cmd.cmd_name}: {VT.Red}"+s+"{VT.Default}", cmd=self, *args, **kwargs) + + def write(self, s): + """ Handler for all commands output. By default just print to stdout """ + + s = self._buffer + s + + while s.find('\n') != -1: + l, s = s.split("\n", 1) + if self.FILTER: + if not self.reg.search(l): + continue + if self._doColor(): + l = self.reg.sub(VT.Underline + r"\g<0>" + VT.EndUnderline, l); + + if len(l) and self._needsHeader(): + for hdr in self._header.split("\n"): + self._write(self.format("{VT.Bold}{:s}{VT.EndBold}", hdr)) + self._lastHeader = self._line + + self._write(l) + + self._buffer = s def flush(self): if self.fhandle != None: self.fhandle.flush() - + def __del__(self): """ closes any open files. report on any errors """ - if self.fhandle != None : + if self.fhandle != None and self.fname != None: self.fhandle.close() - + def setOptions(self, cmdargs, cmdoptions =''): - """ parse the arguments passed to the command - param : + """ parse the arguments passed to the command + param : cmdargs => [] of (typically args.split()) - cmdoptions : str - string of command level options. + cmdoptions : str - string of command level options. These should be CAPITAL LETTER options only. """ opts=() args = cmdargs cmdoptions = cmdoptions.upper() try: - opts,args = getopt.gnu_getopt(args,'hvo:s:p:'+ cmdoptions,[]) + opts,args = getopt.gnu_getopt(args,'hvo:s:p:c:'+ cmdoptions,[]) self.target_cmd_args = args except getopt.GetoptError,err: raise ArgumentError(str(err)) @@ -113,6 +235,7 @@ class CommandOutput(object): self.fhandle=open(self.fname,"w") print "saving results in file ",str(a) self.fhandle.write("(lldb)%s %s \n" % (self.cmd_name, " ".join(cmdargs))) + self.isatty = os.isatty(self.fhandle.fileno()) elif o == "-s" and len(a) > 0: self.reg = re.compile(a.strip(),re.MULTILINE|re.DOTALL) self.FILTER=True @@ -121,12 +244,17 @@ class CommandOutput(object): self.pluginRequired = True self.pluginName = a.strip() #print "passing output to " + a.strip() - elif o == "-v" : + elif o == "-v": self.verbose_level += 1 + elif o == "-c": + if a in ["always", '1']: + self.color = True + elif a in ["never", '0']: + self.color = False + else: + self.color = None else: o = o.strip() self.target_cmd_options[o] = a - - diff --git a/tools/lldbmacros/core/xnu_lldb_init.py b/tools/lldbmacros/core/xnu_lldb_init.py index e7f494b96..c0f1a8002 100755 --- a/tools/lldbmacros/core/xnu_lldb_init.py +++ b/tools/lldbmacros/core/xnu_lldb_init.py @@ -1,10 +1,15 @@ +from __future__ import absolute_import +from __future__ import print_function import os +import sys import re +PY3 = sys.version_info > (3,) + def GetSettingsValues(debugger, setting_variable_name): """ Queries the lldb internal settings params: - debugger : lldb.SBDebugger instance + debugger : lldb.SBDebugger instance setting_variable_name: str - string name of the setting(eg prompt) returns: [] : Array of strings. Empty array if setting is not found/set @@ -66,9 +71,16 @@ def __lldb_init_module(debugger, internal_dict): if "DEBUG_XNU_LLDBMACROS" in os.environ and len(os.environ['DEBUG_XNU_LLDBMACROS']) > 0: debug_session_enabled = True prev_os_plugin = "".join(GetSettingsValues(debugger, 'target.process.python-os-plugin-path')) - print "Loading kernel debugging from %s" % __file__ - print "LLDB version %s" % debugger.GetVersionString() - self_path = str(__file__) + if PY3: + print("#" * 30) + print("WARNING! Python version 3 is not supported for xnu lldbmacros.") + print("Please restart your debugging session with the following workaround") + print("\ndefaults write com.apple.dt.lldb DefaultPythonVersion 2\n") + print("#" * 30) + print("\n") + print("Loading kernel debugging from %s" % __file__) + print("LLDB version %s" % debugger.GetVersionString()) + self_path = "{}".format(__file__) base_dir_name = self_path[:self_path.rfind("/")] core_os_plugin = base_dir_name + "/lldbmacros/core/operating_system.py" osplugin_cmd = "settings set target.process.python-os-plugin-path \"%s\"" % core_os_plugin @@ -86,22 +98,22 @@ def __lldb_init_module(debugger, internal_dict): pass if debug_session_enabled : if len(prev_os_plugin) > 0: - print "\nDEBUG_XNU_LLDBMACROS is set. Skipping the setting of OS plugin from dSYM.\nYou can manually set the OS plugin by running\n" + osplugin_cmd + print("\nDEBUG_XNU_LLDBMACROS is set. Skipping the setting of OS plugin from dSYM.\nYou can manually set the OS plugin by running\n" + osplugin_cmd) else: - print osplugin_cmd + print(osplugin_cmd) debugger.HandleCommand(osplugin_cmd) - print "\nDEBUG_XNU_LLDBMACROS is set. Skipping the load of xnu debug framework.\nYou can manually load the framework by running\n" + xnu_load_cmd + print("\nDEBUG_XNU_LLDBMACROS is set. Skipping the load of xnu debug framework.\nYou can manually load the framework by running\n" + xnu_load_cmd) else: - print osplugin_cmd + print(osplugin_cmd) debugger.HandleCommand(osplugin_cmd) - print whitelist_trap_cmd + print(whitelist_trap_cmd) debugger.HandleCommand(whitelist_trap_cmd) - print xnu_load_cmd + print(xnu_load_cmd) debugger.HandleCommand(xnu_load_cmd) - print disable_optimization_warnings_cmd + print(disable_optimization_warnings_cmd) debugger.HandleCommand(disable_optimization_warnings_cmd) if source_map_cmd: - print source_map_cmd + print(source_map_cmd) debugger.HandleCommand(source_map_cmd) load_kexts = True @@ -111,15 +123,15 @@ def __lldb_init_module(debugger, internal_dict): if os.access(builtinkexts_path, os.F_OK): kexts = os.listdir(builtinkexts_path) if len(kexts) > 0: - print "\nBuiltin kexts: %s\n" % kexts + print("\nBuiltin kexts: %s\n" % kexts) if load_kexts == False: - print "XNU_LLDBMACROS_NOBUILTINKEXTS is set, not loading:\n" + print("XNU_LLDBMACROS_NOBUILTINKEXTS is set, not loading:\n") for kextdir in kexts: script = os.path.join(builtinkexts_path, kextdir, kextdir.split('.')[-1] + ".py") import_kext_cmd = "command script import \"%s\"" % script - print "%s" % import_kext_cmd + print("%s" % import_kext_cmd) if load_kexts: debugger.HandleCommand(import_kext_cmd) - print "\n" + print("\n") diff --git a/tools/lldbmacros/ioreg.py b/tools/lldbmacros/ioreg.py index e2bdaf20e..1e55dc219 100755 --- a/tools/lldbmacros/ioreg.py +++ b/tools/lldbmacros/ioreg.py @@ -34,6 +34,7 @@ def GetObjectSummary(obj): return vt = dereference(Cast(obj, 'uintptr_t *')) - 2 * sizeof('uintptr_t') + vt = kern.StripKernelPAC(vt) vtype = kern.SymbolicateFromAddress(vt) if len(vtype): vtype_str = " <" + vtype[0].GetName() + ">" @@ -91,6 +92,7 @@ def GetObjectTypeStr(obj): return None vt = dereference(Cast(obj, 'uintptr_t *')) - 2 * sizeof('uintptr_t') + vt = kern.StripKernelPAC(vt) vtype = kern.SymbolicateFromAddress(vt) if len(vtype): return vtype[0].GetName() @@ -128,6 +130,7 @@ def GetRegistryEntrySummary(entry): # I'm using uintptr_t for now to work around FindFirstType & Co. should allow you to make pointer types directly vtableAddr = dereference(Cast(entry, 'uintptr_t *')) - 2 * sizeof('uintptr_t *') + vtableAddr = kern.StripKernelPAC(vtableAddr) vtype = kern.SymbolicateFromAddress(vtableAddr) if vtype is None or len(vtype) < 1: out_string += "> 22]) + ipc_name = '{:s}{:s}'.format(ipc_name.strip(), ie_gen_roll[(ie_bits & 0x00c00000) >> 22]) # now show the port destination part destname_str = GetPortDestinationSummary(Cast(ie_object, 'ipc_port_t')) @@ -786,6 +793,9 @@ def ShowTaskRights(cmd_args=None, cmd_options={}): 'S' : Send right 'R' : Receive right 'O' : Send-once right + 'm' : Immovable send port + 'i' : Immovable receive port + 'g' : No grant port types of notifications: 'd' : Dead-Name notification requested 's' : Send-Possible notification armed @@ -824,6 +834,9 @@ def ShowTaskRightsBt(cmd_args=None, cmd_options={}): 'S' : Send right 'R' : Receive right 'O' : Send-once right + 'm' : Immovable send port + 'i' : Immovable receive port + 'g' : No grant port types of notifications: 'd' : Dead-Name notification requested 's' : Send-Possible notification armed @@ -864,6 +877,9 @@ def ShowAllRights(cmd_args=None, cmd_options={}): 'S' : Send right 'R' : Receive right 'O' : Send-once right + 'm' : Immovable send port + 'i' : Immovable receive port + 'g' : No grant port types of notifications: 'd' : Dead-Name notification requested 's' : Send-Possible notification armed diff --git a/tools/lldbmacros/kcdata.py b/tools/lldbmacros/kcdata.py index 66fc2e8c4..5db5554e5 100755 --- a/tools/lldbmacros/kcdata.py +++ b/tools/lldbmacros/kcdata.py @@ -65,6 +65,7 @@ kcdata_type_def = { 'STACKSHOT_KCCONTAINER_THREAD': 0x904, 'STACKSHOT_KCTYPE_DONATING_PIDS': 0x907, 'STACKSHOT_KCTYPE_SHAREDCACHE_LOADINFO': 0x908, + 'STACKSHOT_KCTYPE_THREAD_NAME': 0x909, 'STACKSHOT_KCTYPE_KERN_STACKFRAME': 0x90A, 'STACKSHOT_KCTYPE_KERN_STACKFRAME64': 0x90B, 'STACKSHOT_KCTYPE_USER_STACKFRAME': 0x90C, @@ -95,6 +96,8 @@ kcdata_type_def = { 'STACKSHOT_KCTYPE_ASID' : 0x925, 'STACKSHOT_KCTYPE_PAGE_TABLES' : 0x926, 'STACKSHOT_KCTYPE_SYS_SHAREDCACHE_LAYOUT' : 0x927, + 'STACKSHOT_KCTYPE_THREAD_DISPATCH_QUEUE_LABEL' : 0x928, + 'STACKSHOT_KCTYPE_THREAD_TURNSTILEINFO' : 0x929, 'STACKSHOT_KCTYPE_TASK_DELTA_SNAPSHOT': 0x940, 'STACKSHOT_KCTYPE_THREAD_DELTA_SNAPSHOT': 0x941, @@ -821,6 +824,9 @@ KNOWN_TYPES_COLLECTION[0x906] = KCTypeDescription(0x906, ( legacy_size = 0x68 ) +KNOWN_TYPES_COLLECTION[GetTypeForName('STACKSHOT_KCTYPE_THREAD_DISPATCH_QUEUE_LABEL')] = KCSubTypeElement('dispatch_queue_label', KCSUBTYPE_TYPE.KC_ST_CHAR, + KCSubTypeElement.GetSizeForArray(64, 1), 0, 1) + KNOWN_TYPES_COLLECTION[GetTypeForName('STACKSHOT_KCTYPE_THREAD_DELTA_SNAPSHOT')] = KCTypeDescription(GetTypeForName('STACKSHOT_KCTYPE_THREAD_DELTA_SNAPSHOT'), ( KCSubTypeElement.FromBasicCtype('tds_thread_id', KCSUBTYPE_TYPE.KC_ST_UINT64, 0), KCSubTypeElement.FromBasicCtype('tds_voucher_identifier', KCSUBTYPE_TYPE.KC_ST_UINT64, 8), @@ -860,7 +866,7 @@ KNOWN_TYPES_COLLECTION[GetTypeForName('STACKSHOT_KCTYPE_TASK_DELTA_SNAPSHOT')] = ) -KNOWN_TYPES_COLLECTION[0x909] = KCSubTypeElement('pth_name', KCSUBTYPE_TYPE.KC_ST_CHAR, KCSubTypeElement.GetSizeForArray(64, 1), 0, 1) +KNOWN_TYPES_COLLECTION[GetTypeForName('STACKSHOT_KCTYPE_THREAD_NAME')] = KCSubTypeElement('pth_name', KCSUBTYPE_TYPE.KC_ST_CHAR, KCSubTypeElement.GetSizeForArray(64, 1), 0, 1) KNOWN_TYPES_COLLECTION[GetTypeForName('STACKSHOT_KCTYPE_SYS_SHAREDCACHE_LAYOUT')] = KCTypeDescription(GetTypeForName('STACKSHOT_KCTYPE_SYS_SHAREDCACHE_LAYOUT'), ( KCSubTypeElement('imageLoadAddress', KCSUBTYPE_TYPE.KC_ST_UINT64, 8, 0, 0), @@ -997,6 +1003,16 @@ KNOWN_TYPES_COLLECTION[GetTypeForName('STACKSHOT_KCTYPE_THREAD_WAITINFO')] = KCT ), 'thread_waitinfo') +KNOWN_TYPES_COLLECTION[GetTypeForName('STACKSHOT_KCTYPE_THREAD_TURNSTILEINFO')] = KCTypeDescription(GetTypeForName('STACKSHOT_KCTYPE_THREAD_TURNSTILEINFO'), + ( + KCSubTypeElement.FromBasicCtype('waiter', KCSUBTYPE_TYPE.KC_ST_UINT64, 0), + KCSubTypeElement.FromBasicCtype('turnstile_context', KCSUBTYPE_TYPE.KC_ST_UINT64, 8), + KCSubTypeElement.FromBasicCtype('turnstile_priority', KCSUBTYPE_TYPE.KC_ST_UINT8, 16), + KCSubTypeElement.FromBasicCtype('number_of_hops', KCSUBTYPE_TYPE.KC_ST_UINT8, 17), + KCSubTypeElement.FromBasicCtype('turnstile_flags', KCSUBTYPE_TYPE.KC_ST_UINT64, 18), + ), + 'thread_turnstileinfo') + KNOWN_TYPES_COLLECTION[GetTypeForName('STACKSHOT_KCTYPE_THREAD_GROUP_SNAPSHOT')] = KCTypeDescription(GetTypeForName('STACKSHOT_KCTYPE_THREAD_GROUP'), ( KCSubTypeElement.FromBasicCtype('tgs_id', KCSUBTYPE_TYPE.KC_ST_UINT64, 0), @@ -1187,7 +1203,7 @@ KNOWN_TYPES_COLLECTION[GetTypeForName('STACKSHOT_KCTYPE_PAGE_TABLES')] = KCTypeD ) def GetSecondsFromMATime(mat, tb): - return (float(mat) * tb['numer']) / tb['denom'] + return (float(long(mat) * tb['numer']) / tb['denom']) / 1e9 def FindLibraryForAddress(liblist, address): current_lib = None @@ -1283,6 +1299,11 @@ STACKSHOT_WAITOWNER_MTXSPIN = (UINT64_MAX - 5) STACKSHOT_WAITOWNER_THREQUESTED = (UINT64_MAX - 6) STACKSHOT_WAITOWNER_SUSPENDED = (UINT64_MAX - 7) +STACKSHOT_TURNSTILE_STATUS_UNKNOWN = 0x01 +STACKSHOT_TURNSTILE_STATUS_LOCKED_WAITQ = 0x02 +STACKSHOT_TURNSTILE_STATUS_WORKQUEUE = 0x04 +STACKSHOT_TURNSTILE_STATUS_THREAD = 0x08 + def formatWaitInfo(info): s = 'thread %d: ' % info['waiter']; type = info['wait_type'] @@ -1370,13 +1391,50 @@ def formatWaitInfo(info): s += ", unknown owner" s += ", workloop id %x" % context elif type == kThreadWaitOnProcess: - s += "waitpid, for pid %d" % owner + if owner == 2**64-1: + s += "waitpid, for any children" + elif 2**32 <= owner and owner < 2**64-1: + s += "waitpid, for process group %d" % abs(owner - 2**64) + else: + s += "waitpid, for pid %d" % owner else: s += "unknown type %d (owner %d, context %x)" % (type, owner, context) return s + +def formatTurnstileInfo(ti): + if ti is None: + return " [no turnstile]" + + ts_flags = int(ti['turnstile_flags']) + ctx = int(ti['turnstile_context']) + hop = int(ti['number_of_hops']) + prio = int(ti['turnstile_priority']) + if ts_flags & STACKSHOT_TURNSTILE_STATUS_LOCKED_WAITQ: + return " [turnstile was in process of being updated]" + if ts_flags & STACKSHOT_TURNSTILE_STATUS_WORKQUEUE: + return " [blocked on workqueue: 0x%x, hops: %x, priority: %d]" % (ctx, hop, prio) + if ts_flags & STACKSHOT_TURNSTILE_STATUS_THREAD: + return " [blocked on: %d, hops: %x, priority: %d]" % (ctx, hop, prio) + if ts_flags & STACKSHOT_TURNSTILE_STATUS_UNKNOWN: + return " [turnstile with unknown inheritor]" + + return " [unknown turnstile status!]" +def formatWaitInfoWithTurnstiles(waitinfos, tsinfos): + wis_tis = [] + for w in waitinfos: + found_pair = False + for t in tsinfos: + if int(w['waiter']) == int(t['waiter']): + wis_tis.append((w, t)) + found_pair = True + break + if not found_pair: + wis_tis.append((w, None)) + + return map(lambda (wi, ti): formatWaitInfo(wi) + formatTurnstileInfo(ti), wis_tis) def SaveStackshotReport(j, outfile_name, incomplete): import time @@ -1514,6 +1572,9 @@ def SaveStackshotReport(j, outfile_name, incomplete): thsnap["qosEffective"] = threadsnap["ths_eqos"] thsnap["qosRequested"] = threadsnap["ths_rqos"] + if "pth_name" in thdata: + thsnap["name"] = thdata["pth_name"]; + if threadsnap['ths_continuation']: thsnap["continuation"] = GetSymbolInfoForFrame(AllImageCatalog, pr_libs, threadsnap['ths_continuation']) if "kernel_stack_frames" in thdata: @@ -1535,7 +1596,9 @@ def SaveStackshotReport(j, outfile_name, incomplete): if threadsnap['ths_wait_event']: thsnap["waitEvent"] = GetSymbolInfoForFrame(AllImageCatalog, pr_libs, threadsnap['ths_wait_event']) - if 'thread_waitinfo' in piddata: + if 'thread_waitinfo' in piddata and 'thread_turnstileinfo' in piddata: + tsnap['waitInfo'] = formatWaitInfoWithTurnstiles(piddata['thread_waitinfo'] , piddata['thread_turnstileinfo']) + elif 'thread_waitinfo' in piddata: tsnap['waitInfo'] = map(formatWaitInfo, piddata['thread_waitinfo']) obj['binaryImages'] = AllImageCatalog @@ -1615,6 +1678,9 @@ def iterate_kcdatas(kcdata_file): with data_from_stream(kcdata_file) as data: iterator = kcdata_item_iterator(data) kcdata_buffer = KCObject.FromKCItem(iterator.next()) + if not isinstance(kcdata_buffer, KCBufferObject): + iterator = kcdata_item_iterator(data[16:]) + kcdata_buffer = KCObject.FromKCItem(iterator.next()) if not isinstance(kcdata_buffer, KCBufferObject): try: decoded = base64.b64decode(data) @@ -1641,6 +1707,8 @@ def iterate_kcdatas(kcdata_file): for magic in iterator: kcdata_buffer = KCObject.FromKCItem(magic) + if kcdata_buffer.i_type == 0: + continue if not isinstance(kcdata_buffer, KCBufferObject): raise Exception, "unknown file type" kcdata_buffer.ReadItems(iterator) diff --git a/tools/lldbmacros/kevent.py b/tools/lldbmacros/kevent.py index 1fb875628..4c4e7d7e8 100755 --- a/tools/lldbmacros/kevent.py +++ b/tools/lldbmacros/kevent.py @@ -1,4 +1,5 @@ from xnu import * +from workqueue import GetWorkqueueThreadRequestSummary def IterateProcKqueues(proc): """ Iterate through all kqueues in the given process @@ -57,7 +58,7 @@ def IterateProcKqworkloops(proc): hash_mask = proc_filedesc.fd_kqhashmask for i in xrange(hash_mask + 1): - for kqwl in IterateListEntry(proc_filedesc.fd_kqhash[i], 'struct kqworkloop *', 'kqwl_hashlink', list_prefix='s'): + for kqwl in IterateListEntry(proc_filedesc.fd_kqhash[i], 'struct kqworkloop *', 'kqwl_hashlink'): yield kqwl def IterateAllKqueues(): @@ -67,9 +68,10 @@ def IterateAllKqueues(): kq - yields each kqueue in the system """ for t in kern.tasks: - if unsigned(t.bsd_info) == 0: + proc = unsigned(t.bsd_info) + if proc == 0: continue - proc = kern.GetValueFromAddress(t.bsd_info, 'proc_t') + proc = kern.GetValueFromAddress(proc, 'proc_t') for kq in IterateProcKqueues(proc): yield kq @@ -102,36 +104,41 @@ def GetKnoteKqueue(kn): return kern.GetValueFromAddress(int(kn.kn_kq_packed), 'struct kqueue *') @lldb_type_summary(['knote *']) -@header('{:<20s} {:<20s} {:<10s} {:<20s} {:<20s} {:<30s} {:<10} {:<10} {:<10} {:<30s}'.format('knote', 'ident', 'kev_flags', 'kqueue', 'udata', 'filtops', 'qos_use', 'qos_req', 'qos_ovr', 'status')) +@header('{:<20s} {:<20s} {:<10s} {:<20s} {:<20s} {:<30s} {:<10} {:<10} {:<10} {:<20s}'.format('knote', 'ident', 'kev_flags', 'kqueue', 'udata', 'filtops', 'qos_req', 'qos_use', 'qos_ovr', 'status')) def GetKnoteSummary(kn): """ Summarizes a knote and related information returns: str - summary of knote """ - format_string = '{o: <#020x} {o.kn_kevent.ident: <#020x} {o.kn_kevent.flags: <#010x} {kq_ptr: <#020x} {o.kn_kevent.udata: <#020x} {ops_str: <30s} {qos_use: <10s} {qos_req: <10s} {qos_ovr: <10s} {st_str: <30s}' + format_string = '{o: <#020x} {o.kn_kevent.kei_ident: <#020x} {o.kn_kevent.kei_flags: <#010x} {kq_ptr: <#020x} {o.kn_kevent.kei_udata: <#020x} {ops_str: <30s} {qos_req: <10s} {qos_use: <10s} {qos_ovr: <10s} {st_str: <20s}' state = unsigned(kn.kn_status) - fops_str = kern.Symbolicate(kern.globals.sysfilt_ops[unsigned(kn.kn_filtid)]) + fops_str = kern.Symbolicate(kern.globals.sysfilt_ops[unsigned(kn.kn_kevent.kei_filtid)]) + qos_index = int(kn.kn_qos_index) + if qos_index > 6: + qos_req = qos_index + else: + qos_req = int((kn.kn_kevent.kei_qos & 0x003fff00) >> 8).bit_length() return format_string.format( o=kn, - qos_use=xnudefines.thread_qos_short_strings[int(kn.kn_qos_index)], - qos_req=xnudefines.thread_qos_short_strings[int(kn.kn_req_index)], + qos_req=xnudefines.thread_qos_short_strings[qos_req], + qos_use=xnudefines.thread_qos_short_strings[qos_index], qos_ovr=xnudefines.thread_qos_short_strings[int(kn.kn_qos_override)], st_str=xnudefines.GetStateString(xnudefines.kn_state_strings, state), kq_ptr=int(GetKnoteKqueue(kn)), ops_str=fops_str) -@lldb_command('showknote') -def ShowKnote(cmd_args=None): +@lldb_command('showknote', fancy=True) +def ShowKnote(cmd_args=None, cmd_options={}, O=None): """ Show information about a knote usage: showknote """ if not cmd_args: - raise ArgumentError('missing struct knote * argument') + return O.error('missing struct knote * argument') kn = kern.GetValueFromAddress(cmd_args[0], 'struct knote *') - print GetKnoteSummary.header - print GetKnoteSummary(kn) + with O.table(GetKnoteSummary.header): + print GetKnoteSummary(kn) def IterateKqueueKnotes(kq): """ Iterate through all knotes of a given kqueue @@ -147,42 +154,15 @@ def IterateKqueueKnotes(kq): continue yield kn -@lldb_type_summary(['struct kqrequest *']) -@header('{:<20s} {:<20s} {:<5s} {:<5s} {:<5s} {:s}'.format('kqrequest', 'thread', 'qos', 'ovr_qos', 'sa_qos', 'state')) -def GetKqrequestSummary(kqr): - """ Summarize kqrequest information - - params: - kqr - the kqrequest object - returns: str - summary of kqrequest - """ - fmt = '{kqrp: <#020x} {kqr.kqr_thread: <#020x} {qos: <5s} {ovr_qos: <5s} {sa_qos: <5s} {state_str: - """ - if len(cmd_args) < 1: - raise ArgumentError('missing struct kqrequest * argument') - kqr = kern.GetValueFromAddress(cmd_args[0], 'struct kqrequest *') - print GetKqrequestSummary.header - print GetKqrequestSummary(kqr) - print GetKnoteSummary.header - for kn in IterateTAILQ_HEAD(kqr.kqr_suppressed, 'kn_tqe'): - print GetKnoteSummary(kn) +kqueue_summary_fmt = '{ptr: <#020x} {o.kq_p: <#020x} {dyn_id: <#020x} {servicer: <#20x} {owner: <#20x} {o.kq_count: <6d} {wqs: <#020x} {st_str: <10s}' -kqueue_summary_fmt = '{ptr: <#020x} {o.kq_p: <#020x} {dyn_id: <#020x} {servicer: <#20x} {owner: <#20x} {o.kq_count: <6d} {wqs: <#020x} {kqr_state: <30s} {st_str: <10s}' +def GetServicer(req): + if req.tr_state in [3, 4]: # [ BINDING , BOUND ] + return int(req.tr_thread) + return 0 @lldb_type_summary(['struct kqueue *']) -@header('{: <20s} {: <20s} {: <20s} {: <20s} {: <20s} {: <6s} {: <20s} {: <30s} {: <10s}'.format('kqueue', 'process', 'dynamic_id', 'servicer', 'owner', '#evts', 'wqs', 'request', 'state')) +@header('{: <20s} {: <20s} {: <20s} {: <20s} {: <20s} {: <6s} {: <20s} {: <10s}'.format('kqueue', 'process', 'dynamic_id', 'servicer', 'owner', '#evts', 'wqs', 'state')) def GetKqueueSummary(kq): """ Summarize kqueue information @@ -206,30 +186,29 @@ def GetKqfileSummary(kqf): o=kq, ptr=int(kq), wqs=int(kq.kq_wqs), - kqr_state='', dyn_id=0, st_str=xnudefines.GetStateString(xnudefines.kq_state_strings, state), servicer=0, owner=0) -@lldb_command('showkqfile') -def ShowKqfile(cmd_args=None): +@lldb_command('showkqfile', fancy=True) +def ShowKqfile(cmd_args=None, cmd_options={}, O=None): """ Display information about a kqfile object. usage: showkqfile """ if len(cmd_args) < 1: - raise ArgumentError('missing struct kqfile * argument') + return O.error('missing struct kqfile * argument') kqf = kern.GetValueFromAddress(cmd_args[0], 'kqfile *') - print GetKqfileSummary.header - print GetKqfileSummary(kqf) - print GetKnoteSummary.header - for kn in IterateKqueueKnotes(kqf.kqf_kqueue): - print GetKnoteSummary(kn) - for kn in IterateTAILQ_HEAD(kqf.kqf_suppressed, 'kn_tqe'): - print GetKnoteSummary(kn) + with O.table(GetKqfileSummary.header): + print GetKqfileSummary(kqf) + with O.table(GetKnoteSummary.header): + for kn in IterateKqueueKnotes(kqf.kqf_kqueue): + print GetKnoteSummary(kn) + for kn in IterateTAILQ_HEAD(kqf.kqf_suppressed, 'kn_tqe'): + print GetKnoteSummary(kn) @lldb_type_summary(['struct kqworkq *']) @header(GetKqueueSummary.header) @@ -242,25 +221,30 @@ def GetKqworkqSummary(kqwq): """ return GetKqfileSummary(kern.GetValueFromAddress(int(kqwq), 'struct kqfile *')) -@lldb_command('showkqworkq') -def ShowKqworkq(cmd_args=None): +@lldb_command('showkqworkq', fancy=True) +def ShowKqworkq(cmd_args=None, cmd_options={}, O=None): """ Display summary and knote information about a kqworkq. usage: showkqworkq """ if len(cmd_args) < 1: - raise ArgumentError('missing struct kqworkq * argument') + return O.error('missing struct kqworkq * argument') kqwq = kern.GetValueFromAddress(cmd_args[0], 'struct kqworkq *') kq = kqwq.kqwq_kqueue - print GetKqueueSummary.header - print GetKqworkqSummary(kqwq) - print GetKnoteSummary.header - for kn in IterateKqueueKnotes(kq): - print GetKnoteSummary(kn) - for i in xrange(0, xnudefines.KQWQ_NBUCKETS): - for kn in IterateTAILQ_HEAD(kq.kq_queue[i], 'kn_tqe'): + with O.table(GetKqueueSummary.header): + print GetKqworkqSummary(kqwq) + + with O.table(GetWorkqueueThreadRequestSummary.header): + for i in range(1, 8): + print GetWorkqueueThreadRequestSummary(kq.kq_p, kqwq.kqwq_request[i]) + + with O.table(GetKnoteSummary.header): + for kn in IterateKqueueKnotes(kq): print GetKnoteSummary(kn) + for i in xrange(0, xnudefines.KQWQ_NBUCKETS): + for kn in IterateTAILQ_HEAD(kq.kq_queue[i], 'kn_tqe'): + print GetKnoteSummary(kn) @lldb_type_summary(['struct kqworkloop *']) @header(GetKqueueSummary.header) @@ -277,104 +261,98 @@ def GetKqworkloopSummary(kqwl): o=kqwl.kqwl_kqueue, wqs=int(kqwl.kqwl_kqueue.kq_wqs), dyn_id=kqwl.kqwl_dynamicid, - kqr_state=xnudefines.GetStateString(xnudefines.kqrequest_state_strings, kqwl.kqwl_request.kqr_state), st_str=xnudefines.GetStateString(xnudefines.kq_state_strings, state), - servicer=int(kqwl.kqwl_request.kqr_thread), + servicer=GetServicer(kqwl.kqwl_request), owner=int(kqwl.kqwl_owner) ) -@lldb_command('showkqworkloop') -def ShowKqworkloop(cmd_args=None): +@lldb_command('showkqworkloop', fancy=True) +def ShowKqworkloop(cmd_args=None, cmd_options={}, O=None): """ Display information about a kqworkloop. usage: showkqworkloop """ if len(cmd_args) < 1: - raise ArgumentError('missing struct kqworkloop * argument') + return O.error('missing struct kqworkloop * argument') kqwl = kern.GetValueFromAddress(cmd_args[0], 'struct kqworkloop *') - print GetKqworkloopSummary.header - print GetKqworkloopSummary(kqwl) + with O.table(GetKqworkloopSummary.header): + print GetKqworkloopSummary(kqwl) - print GetKqrequestSummary.header - kqr = kern.GetValueFromAddress(unsigned(addressof(kqwl.kqwl_request)), 'struct kqrequest *') - print GetKqrequestSummary(kqr) + with O.table(GetWorkqueueThreadRequestSummary.header): + print GetWorkqueueThreadRequestSummary(kqwl.kqwl_kqueue.kq_p, kqwl.kqwl_request) - print GetKnoteSummary.header - for kn in IterateKqueueKnotes(kqwl.kqwl_kqueue): - print GetKnoteSummary(kn) + with O.table(GetKnoteSummary.header): + for kn in IterateKqueueKnotes(kqwl.kqwl_kqueue): + print GetKnoteSummary(kn) -@lldb_command('showkqueue') -def ShowKqueue(cmd_args=None): +@lldb_command('showkqueue', fancy=True) +def ShowKqueue(cmd_args=None, cmd_options={}, O=None): """ Given a struct kqueue pointer, display the summary of the kqueue usage: showkqueue """ if not cmd_args: - raise ArgumentError('missing struct kqueue * argument') + return O.error('missing struct kqueue * argument') kq = kern.GetValueFromAddress(cmd_args[0], 'struct kqueue *') if int(kq.kq_state) & xnudefines.KQ_WORKQ: - ShowKqworkq(cmd_args=[str(int(kq))]) + ShowKqworkq(cmd_args, cmd_options, O) elif int(kq.kq_state) & xnudefines.KQ_WORKLOOP: - ShowKqworkloop(cmd_args=[str(int(kq))]) + ShowKqworkloop(cmd_args, cmd_options, O) else: - print GetKqueueSummary.header - print GetKqueueSummary(kq) - print GetKnoteSummary.header - for kn in IterateKqueueKnotes(kq): - print GetKnoteSummary(kn) + ShowKqfile(cmd_args, cmd_options, O) -@lldb_command('showprocworkqkqueue') -def ShowProcWorkqKqueue(cmd_args=None): +@lldb_command('showprocworkqkqueue', fancy=True) +def ShowProcWorkqKqueue(cmd_args=None, cmd_options={}, O=None): """ Show the workqueue kqueue for a given process. - usage: showworkqkqueue + usage: showprocworkqkqueue """ if not cmd_args: - raise ArgumentError('missing struct proc * argument') + return O.error('missing struct proc * argument') proc = kern.GetValueFromAddress(cmd_args[0], 'proc_t') ShowKqworkq(cmd_args=[str(int(proc.p_fd.fd_wqkqueue))]) -@lldb_command('showprockqueues') -def ShowProcKqueues(cmd_args=None): +@lldb_command('showprockqueues', fancy=True) +def ShowProcKqueues(cmd_args=None, cmd_options={}, O=None): """ Show the kqueues for a given process. usage: showprockqueues """ if not cmd_args: - raise ArgumentError('missing struct proc * argument') + return O.error('missing struct proc * argument') proc = kern.GetValueFromAddress(cmd_args[0], 'proc_t') - print GetKqueueSummary.header - for kq in IterateProcKqueues(proc): - print GetKqueueSummary(kq) + with O.table(GetKqueueSummary.header): + for kq in IterateProcKqueues(proc): + print GetKqueueSummary(kq) -@lldb_command('showprocknotes') -def ShowProcKnotes(cmd_args=None): +@lldb_command('showprocknotes', fancy=True) +def ShowProcKnotes(cmd_args=None, cmd_options={}, O=None): """ Show the knotes for a given process. usage: showprocknotes """ if not cmd_args: - raise ArgumentError('missing struct proc * argument') + return O.error('missing struct proc * argument') proc = kern.GetValueFromAddress(cmd_args[0], 'proc_t') - print GetKnoteSummary.header - for kn in IterateProcKnotes(proc): - print GetKnoteSummary(kn) + with O.table(GetKnoteSummary.header): + for kn in IterateProcKnotes(proc): + print GetKnoteSummary(kn) -@lldb_command('showallkqueues') -def ShowAllKqueues(cmd_args=[], cmd_options={}): +@lldb_command('showallkqueues', fancy=True) +def ShowAllKqueues(cmd_args=None, cmd_options={}, O=None): """ Display a summary of all the kqueues in the system usage: showallkqueues """ - print GetKqueueSummary.header - for kq in IterateAllKqueues(): - print GetKqueueSummary(kq) + with O.table(GetKqueueSummary.header): + for kq in IterateAllKqueues(): + print GetKqueueSummary(kq) diff --git a/tools/lldbmacros/memory.py b/tools/lldbmacros/memory.py index e157a5db3..a73dc5b8a 100755 --- a/tools/lldbmacros/memory.py +++ b/tools/lldbmacros/memory.py @@ -59,13 +59,13 @@ def CalculateLedgerPeak(phys_footprint_entry): return: value - representing the ledger peak for the given phys footprint entry """ now = kern.globals.sched_tick / 20 - ledger_peak = phys_footprint_entry.le_credit - phys_footprint_entry.le_debit - if hasattr(phys_footprint_entry._le._le_max, 'le_interval_max') and (phys_footprint_entry._le._le_max.le_interval_max > ledger_peak): - ledger_peak = phys_footprint_entry._le._le_max.le_interval_max + ledger_peak = long(phys_footprint_entry.le_credit) - long(phys_footprint_entry.le_debit) + if hasattr(phys_footprint_entry._le._le_max, 'le_interval_max') and (long(phys_footprint_entry._le._le_max.le_interval_max) > ledger_peak): + ledger_peak = long(phys_footprint_entry._le._le_max.le_interval_max) return ledger_peak -@header("{: >8s} {: >12s} {: >12s} {: >10s} {: >12s} {: >14s} {: >10s} {: >12s} {: >10s} {: >10s} {: >10s} {: <20s}\n".format( -'pid', 'effective', 'requested', 'state', 'user_data', 'physical', 'iokit', 'footprint', +@header("{: >8s} {: >12s} {: >12s} {: >10s} {: >10s} {: >12s} {: >14s} {: >10s} {: >12s} {: >10s} {: >10s} {: >10s} {: <20s}\n".format( +'pid', 'effective', 'requested', 'state', 'relaunch', 'user_data', 'physical', 'iokit', 'footprint', 'recent peak', 'lifemax', 'limit', 'command')) def GetMemoryStatusNode(proc_val): """ Internal function to get memorystatus information from the given proc @@ -81,18 +81,18 @@ def GetMemoryStatusNode(proc_val): task_phys_footprint_ledger_entry = task_ledgerp.l_entries[kern.globals.task_ledgers.phys_footprint] page_size = kern.globals.page_size - phys_mem_footprint = (task_physmem_footprint_ledger_entry.le_credit - task_physmem_footprint_ledger_entry.le_debit) / page_size - iokit_footprint = (task_iokit_footprint_ledger_entry.le_credit - task_iokit_footprint_ledger_entry.le_debit) / page_size - phys_footprint = (task_phys_footprint_ledger_entry.le_credit - task_phys_footprint_ledger_entry.le_debit) / page_size - phys_footprint_limit = task_phys_footprint_ledger_entry.le_limit / page_size + phys_mem_footprint = (long(task_physmem_footprint_ledger_entry.le_credit) - long(task_physmem_footprint_ledger_entry.le_debit)) / page_size + iokit_footprint = (long(task_iokit_footprint_ledger_entry.le_credit) - long(task_iokit_footprint_ledger_entry.le_debit)) / page_size + phys_footprint = (long(task_phys_footprint_ledger_entry.le_credit) - long(task_phys_footprint_ledger_entry.le_debit)) / page_size + phys_footprint_limit = long(task_phys_footprint_ledger_entry.le_limit) / page_size ledger_peak = CalculateLedgerPeak(task_phys_footprint_ledger_entry) phys_footprint_spike = ledger_peak / page_size - phys_footprint_lifetime_max = task_phys_footprint_ledger_entry._le._le_max.le_lifetime_max / page_size + phys_footprint_lifetime_max = long(task_phys_footprint_ledger_entry._le._le_max.le_lifetime_max) / page_size - format_string = '{0: >8d} {1: >12d} {2: >12d} {3: #011x} {4: #011x} {5: >12d} {6: >10d} {7: >13d}' + format_string = '{0: >8d} {1: >12d} {2: >12d} {3: #011x} {4: >10d} {5: #011x} {6: >12d} {7: >10d} {8: >13d}' out_str += format_string.format(proc_val.p_pid, proc_val.p_memstat_effectivepriority, - proc_val.p_memstat_requestedpriority, proc_val.p_memstat_state, proc_val.p_memstat_userdata, - phys_mem_footprint, iokit_footprint, phys_footprint) + proc_val.p_memstat_requestedpriority, proc_val.p_memstat_state, proc_val.p_memstat_relaunch_flags, + proc_val.p_memstat_userdata, phys_mem_footprint, iokit_footprint, phys_footprint) if phys_footprint != phys_footprint_spike: out_str += "{: >12d}".format(phys_footprint_spike) else: @@ -333,8 +333,10 @@ def ZcacheCPUPrint(cmd_args=None): # Macro: zprint @lldb_type_summary(['zone','zone_t']) -@header("{:^18s} {:>10s} {:>10s} {:>10s} {:>10s} {:>10s} {:>10s} {:>10s} {:>10s} {:^6s} {:^6s} {:^6s} {:>10s} {:^15s} {:<20s}".format( -'ZONE', 'TOT_SZ', 'PAGE_COUNT', 'ALLOC_ELTS', 'FREE_ELTS', 'FREE_SZ', 'ALL_FREE_PGS', 'ELT_SZ', 'ALLOC', '(ELTS', 'PGS', 'WASTE)', 'CACHE_ELTS', 'FLAGS', 'NAME')) +@header(("{:<18s} {:_^23s} {:_^24s} {:_^13s} {:_^31s}\n"+ +"{:<18s} {:>11s} {:>11s} {:>8s} {:>7s} {:>7s} {:>6s} {:>6s} {:>7s} {:>5s} {:>3s} {:>5s} {:>7s} {:<15s} {:<20s}").format( +'', 'SIZE (bytes)', 'ELEMENTS (#)', 'PAGES', 'ALLOC CHUNK CONFIG', +'ZONE', 'ALLOC', 'FREE', 'ALLOC', 'FREE', 'CACHE', 'COUNT', 'FREE', 'SIZE', 'ELTS', 'PGS', 'WASTE', 'ELT_SZ', 'FLAGS', 'NAME')) def GetZoneSummary(zone): """ Summarize a zone with important information. See help zprint for description of each field params: @@ -343,11 +345,10 @@ def GetZoneSummary(zone): str - summary of the zone """ out_string = "" - format_string = '{:#018x} {:10d} {:10d} {:10d} {:10d} {:10d} {:10d} {:10d} {:10d} {:6d} {:6d} {:6d} {:10d} {markings} {name:s} ' + format_string = '{zone:#018x} {zone.cur_size:11,d} {free_size:11,d} {zone.count:8,d} {zone.countfree:7,d} {cache_elem_count:7,d} {zone.page_count:6,d} {zone.count_all_free_pages:6,d} {zone.alloc_size:7,d} {alloc_count:5,d} {alloc_pages:3,d} {alloc_waste:5,d} {zone.elem_size:7,d} {markings:<15s} {zone.zone_name:<20s} ' pagesize = kern.globals.page_size - free_elements = zone.countfree - free_size = free_elements * zone.elem_size + free_size = zone.countfree * zone.elem_size mag_capacity = kern.GetGlobalVariable('magazine_element_count') alloc_pages = zone.alloc_size / pagesize @@ -390,18 +391,16 @@ def GetZoneSummary(zone): if zone.zcache[0].zcc_depot_index != -1: cache_elem_count += zone.zcache[0].zcc_depot_index * mag_capacity - out_string += format_string.format(zone, zone.cur_size, zone.page_count, - zone.count, free_elements, free_size, zone.count_all_free_pages, - zone.elem_size, zone.alloc_size, alloc_count, - alloc_pages, alloc_waste, cache_elem_count, name = zone.zone_name, markings=markings) + out_string += format_string.format(zone=zone, free_size=free_size, alloc_count=alloc_count, + alloc_pages=alloc_pages, alloc_waste=alloc_waste, cache_elem_count=cache_elem_count, markings=markings) if zone.exhaustible : out_string += "(max: {:d})".format(zone.max_size) return out_string -@lldb_command('zprint') -def Zprint(cmd_args=None): +@lldb_command('zprint', fancy=True) +def Zprint(cmd_args=None, cmd_options={}, O=None): """ Routine to print a summary listing of all the kernel zones All columns are printed in decimal Legend: @@ -424,9 +423,9 @@ def Zprint(cmd_args=None): I - zone was destroyed and is no longer valid """ global kern - print GetZoneSummary.header - for zval in kern.zones: - print GetZoneSummary(zval) + with O.table(GetZoneSummary.header): + for zval in kern.zones: + print GetZoneSummary(zval) @xnudebug_test('test_zprint') def TestZprint(kernel_target, config, lldb_obj, isConnected ): @@ -2236,12 +2235,12 @@ def ShowProcVnodes(cmd_args=None): if int(fdptr.fd_rdir) != 0: print '{0: <25s}\n{1: llb fails to cast addresses to double pointers - fpptr = Cast(fdptr.fd_ofiles, 'fileproc *') + fpptr = Cast(fdptr.fd_ofiles, 'uint64_t *') while count < fdptr.fd_nfiles: fpp = dereference(fpptr) - fproc = Cast(fpp, 'fileproc *') + fproc = kern.GetValueFromAddress(int(fpp), 'fileproc *') if int(fproc) != 0: fglob = dereference(fproc).f_fglob flags = "" @@ -2250,9 +2249,9 @@ def ShowProcVnodes(cmd_args=None): if (fdptr.fd_ofileflags[count] & 2): flags += 'F' if (fdptr.fd_ofileflags[count] & 4): flags += 'R' if (fdptr.fd_ofileflags[count] & 8): flags += 'C' - print '{0: <5d} {1: <7s}'.format(count, flags) + GetVnodeSummary(Cast(fglob.fg_data, 'vnode *')) + print '{0: <5d} {1: <7s} {2: <#020x} '.format(count, flags, fglob) + GetVnodeSummary(Cast(fglob.fg_data, 'vnode *')) count += 1 - fpptr = kern.GetValueFromAddress(int(fpptr) + kern.ptrsize,'fileproc *') + fpptr = kern.GetValueFromAddress(int(fpptr) + kern.ptrsize,'uint64_t *') @lldb_command('showallprocvnodes') def ShowAllProcVnodes(cmd_args=None): @@ -3082,7 +3081,24 @@ FixedTags = { } def GetVMKernName(tag): - return FixedTags[tag] + """ returns the formatted name for a vmtag and + the sub-tag for kmod tags. + """ + if ((tag <= 27) or (tag == 255)): + return (FixedTags[tag], "") + site = kern.globals.vm_allocation_sites[tag] + if site: + if site.flags & 0x007F: + cstr = addressof(site.subtotals[site.subtotalscount]) + return ("{:<50s}".format(str(Cast(cstr, 'char *'))), "") + else: + if site.flags & 0x0200: + xsite = Cast(site,'OSKextAccount *') + tagstr = ".{:<3d}".format(xsite.loadTag) + return (GetKmodIDName(xsite.loadTag), tagstr); + else: + return (kern.Symbolicate(site), "") + return ("", "") @lldb_command("showvmtags", "AS") def showvmtags(cmd_args=None, cmd_options={}): @@ -3101,19 +3117,18 @@ def showvmtags(cmd_args=None, cmd_options={}): if "-A" in cmd_options: all_tags = True page_size = unsigned(kern.globals.page_size) - tagcounts = [] - tagpeaks = [] - for tag in range(256): - tagcounts.append(0) - for tag in range(256): - tagpeaks.append(0) + nsites = unsigned(kern.globals.vm_allocation_tag_highest) + tagcounts = [0] * nsites + tagpeaks = [0] * nsites + tagmapped = [0] * nsites if kern.globals.vm_tag_active_update: - for tag in range(256): + for tag in range(nsites): site = kern.globals.vm_allocation_sites[tag] if site: - tagcounts[unsigned(tag)] = unsigned(site.total) - tagpeaks[unsigned(tag)] = unsigned(site.peak) + tagcounts[tag] = unsigned(site.total) + tagmapped[tag] = unsigned(site.mapped) + tagpeaks[tag] = unsigned(site.peak) else: queue_head = kern.globals.vm_objects_wired for object in IterateQueue(queue_head, 'struct vm_object *', 'wired_objq'): @@ -3123,29 +3138,30 @@ def showvmtags(cmd_args=None, cmd_options={}): CountMapTags(kern.globals.kernel_map, tagcounts, slow) total = 0 - print " {:<7s} {:>7s} {:>7s} {:<50s}".format("tag.kmod","peak","size","name") - for tag in range(256): - if all_tags or tagcounts[tag]: + totalmapped = 0 + print " vm_allocation_tag_highest: {:<7d} ".format(nsites) + print " {:<7s} {:>7s} {:>7s} {:>7s} {:<50s}".format("tag.kmod", "peak", "size", "mapped", "name") + for tag in range(nsites): + if all_tags or tagcounts[tag] or tagmapped[tag]: total += tagcounts[tag] - tagstr = "" - sitestr = "" - if ((tag <= 27) or (tag == 255)): - sitestr = GetVMKernName(tag) - else: - site = kern.globals.vm_allocation_sites[tag] - if site: - if site.flags & 0x007F: - cstr = addressof(site.subtotals[site.subtotalscount]) - sitestr = "{:<50s}".format(str(Cast(cstr, 'char *'))) + totalmapped += tagmapped[tag] + (sitestr, tagstr) = GetVMKernName(tag) + site = kern.globals.vm_allocation_sites[tag] + print " {:>3d}{:<4s} {:>7d}K {:>7d}K {:>7d}K {:<50s}".format(tag, tagstr, tagpeaks[tag] / 1024, tagcounts[tag] / 1024, tagmapped[tag] / 1024, sitestr) + + for sub in range(site.subtotalscount): + alloctag = unsigned(site.subtotals[sub].tag) + amount = unsigned(site.subtotals[sub].total) + subsite = kern.globals.vm_allocation_sites[alloctag] + if alloctag and subsite: + if ((subsite.flags & 0x007f) == 0): + kind_str = "named" else: - if site.flags & 0x0200: - xsite = Cast(site,'OSKextAccount *') - tagstr = ".{:<3d}".format(xsite.loadTag) - sitestr = GetKmodIDName(xsite.loadTag) - else: - sitestr = kern.Symbolicate(site) - print " {:>3d}{:<4s} {:>7d}K {:>7d}K {:<50s}".format(tag,tagstr,tagpeaks[tag] / 1024, tagcounts[tag] / 1024,sitestr) - print "Total: {:>7d}K".format(total / 1024) + kind_str = "from" + (sitestr, tagstr) = GetVMKernName(alloctag) + print " {:>7s} {:>7s} {:>7s} {:>7d}K {:s} {:>3d}{:<4s} {:<50s}".format(" ", " ", " ", amount / 1024, kind_str, alloctag, tagstr, sitestr) + + print "Total: {:>7d}K {:>7d}K".format(total / 1024, totalmapped / 1024) return None @@ -3327,10 +3343,304 @@ def _calc_vm_page_hash(obj, off): return hash_id +def AddressIsFromZoneMap(addr): + zone_map_min_address = kern.GetGlobalVariable('zone_map_min_address') + zone_map_max_address = kern.GetGlobalVariable('zone_map_max_address') + if (unsigned(addr) >= unsigned(zone_map_min_address)) and (unsigned(addr) < unsigned(zone_map_max_address)): + return 1 + else: + return 0 + +def ElementOffsetInForeignPage(): + zone_element_alignment = 32 # defined in zalloc.c + zone_page_metadata_size = sizeof('struct zone_page_metadata') + if zone_page_metadata_size % zone_element_alignment == 0: + offset = zone_page_metadata_size + else: + offset = zone_page_metadata_size + (zone_element_alignment - (zone_page_metadata_size % zone_element_alignment)) + return unsigned(offset) + +def ElementStartAddrFromZonePageMetadata(page_metadata): + zone_metadata_region_min = kern.GetGlobalVariable('zone_metadata_region_min') + zone_map_min_address = kern.GetGlobalVariable('zone_map_min_address') + page_size = kern.GetGlobalVariable('page_size') + if AddressIsFromZoneMap(page_metadata): + page_index = (unsigned(page_metadata) - unsigned(zone_metadata_region_min)) / sizeof('struct zone_page_metadata') + element_start_addr = unsigned(zone_map_min_address) + unsigned(page_index * page_size) + else: + element_start_addr = unsigned(page_metadata) + unsigned(ElementOffsetInForeignPage()) + + return element_start_addr + +def ZonePageStartAddrFromZonePageMetadata(page_metadata): + zone_metadata_region_min = kern.GetGlobalVariable('zone_metadata_region_min') + zone_map_min_address = kern.GetGlobalVariable('zone_map_min_address') + page_size = kern.GetGlobalVariable('page_size') + + if AddressIsFromZoneMap(page_metadata): + page_index = (unsigned(page_metadata) - unsigned(zone_metadata_region_min)) / sizeof('struct zone_page_metadata') + zone_page_addr = unsigned(zone_map_min_address) + unsigned(page_index * page_size) + else: + zone_page_addr = unsigned(page_metadata) + + return unsigned(zone_page_addr) + +def CreateFreeElementsList(zone, first_free): + free_elements = [] + if unsigned(first_free) == 0: + return free_elements + current = first_free + while True: + free_elements.append(unsigned(current)) + next = dereference(Cast(current, 'vm_offset_t *')) + next = (unsigned(next) ^ unsigned(kern.globals.zp_nopoison_cookie)) + next = kern.GetValueFromAddress(next, 'vm_offset_t *') + if unsigned(next) == 0: + break; + current = Cast(next, 'void *') + + return free_elements + +#Macro: showallocatedzoneelement +@lldb_command('showallocatedzoneelement') +def ShowAllocatedElementsInZone(cmd_args=None, cmd_options={}): + """ Show all the allocated elements in a zone + usage: showzoneallocelements
+ """ + if len(cmd_args) < 1: + raise ArgumentError("Please specify a zone") + + zone = kern.GetValueFromAddress(cmd_args[0], 'struct zone *') + elements = FindAllocatedElementsInZone(zone) + i = 1 + for elem in elements: + print "{0: >10d}/{1:<10d} element: {2: <#20x}".format(i, len(elements), elem) + i += 1 + +#EndMacro: showallocatedzoneelement + +def FindAllocatedElementsInZone(zone): + page_size = kern.GetGlobalVariable('page_size') + elements = [] + page_queues = ["any_free_foreign", "intermediate", "all_used"] + found_total = 0 + + for queue in page_queues: + found_in_queue = 0 + if queue == "any_free_foreign" and unsigned(zone.allows_foreign) != 1: + continue + + for zone_page_metadata in IterateQueue(zone.pages.__getattr__(queue), 'struct zone_page_metadata *', 'pages'): + free_elements = [] + first_free_element = kern.GetValueFromAddress(GetFreeList(zone_page_metadata)) + free_elements = CreateFreeElementsList(zone, first_free_element) + + chunk_page_count = zone_page_metadata.page_count + element_addr_start = ElementStartAddrFromZonePageMetadata(zone_page_metadata) + zone_page_start = ZonePageStartAddrFromZonePageMetadata(zone_page_metadata) + next_page = zone_page_start + page_size + element_addr_end = zone_page_start + (chunk_page_count * page_size) + elem = unsigned(element_addr_start) + while elem < element_addr_end: + if elem not in free_elements: + elements.append(elem) + found_in_queue += 1 + elem += zone.elem_size + + if queue == "any_free_foreign": + if (elem + zone.elem_size) >= next_page: + zone_page_start = unsigned((elem + page_size) & ~(page_size - 1)) + next_page = zone_page_start + page_size + elem = zone_page_start + unsigned(ElementOffsetInForeignPage()) + + found_total += found_in_queue +# print "Found {0: 26s}{1: >20s}{2: >10s}{3: >20s}{4: >20s}{5: >16s}".format("vm_pages_index/zone", "vm_page", "q_state", "vm_object", "offset", "ppn", "bitfield", "from_zone_map")) +@lldb_command('scan_vm_pages', 'S:O:F:I:P:B:I:N:ZA') +def ScanVMPages(cmd_args=None, cmd_options={}): + """ Scan the global vm_pages array (-A) and/or vmpages zone (-Z) for pages with matching attributes. + usage: scan_vm_pages [-A start vm_pages index] [-N number of pages to scan] [-Z scan vm_pages zone] + + scan_vm_pages -A: scan vm pages in the global vm_pages array + scan_vm_pages -Z: scan vm pages allocated from the vm.pages zone + scan_vm_pages <-A/-Z> -S : Find vm pages in the specified queue + scan_vm_pages <-A/-Z> -O : Find vm pages in the specified vm_object + scan_vm_pages <-A/-Z> -F : Find vm pages with the specified vmp_offset value + scan_vm_pages <-A/-Z> -P : Find vm pages with the specified physical page number + scan_vm_pages <-A/-Z> -B : Find vm pages with the bitfield set + scan_vm_pages <-A> -I : Start the scan from start_index + scan_vm_pages <-A> -N : Scan at most npages + """ + if (len(cmd_options) < 1): + raise ArgumentError("Please specify at least one matching attribute") + + vm_pages = kern.globals.vm_pages + vm_pages_count = kern.globals.vm_pages_count + + start_index = 0 + npages = vm_pages_count + scan_vmpages_array = False + scan_vmpages_zone = False + attribute_count = 0 + + if "-A" in cmd_options: + scan_vmpages_array = True + + if "-Z" in cmd_options: + scan_vmpages_zone = True + + if scan_vmpages_array == False and scan_vmpages_zone == False: + raise ArgumentError("Please specify where to scan (-A: vm_pages array, -Z: vm.pages zone)") + + attribute_values = {} + if "-S" in cmd_options: + attribute_values["vmp_q_state"] = kern.GetValueFromAddress(cmd_options["-S"], 'int') + attribute_count += 1 + + if "-O" in cmd_options: + attribute_values["vm_object"] = kern.GetValueFromAddress(cmd_options["-O"], 'vm_object_t') + attribute_count += 1 + + if "-F" in cmd_options: + attribute_values["vmp_offset"] = kern.GetValueFromAddress(cmd_options["-F"], 'unsigned long long') + attribute_count += 1 + + if "-P" in cmd_options: + attribute_values["phys_page"] = kern.GetValueFromAddress(cmd_options["-P"], 'unsigned int') + attribute_count += 1 + + if "-B" in cmd_options: + valid_vmp_bitfields = [ + "vmp_in_background", + "vmp_on_backgroundq", + "vmp_gobbled", + "vmp_laundry", + "vmp_no_cache", + "vmp_private", + "vmp_reference", + "vmp_busy", + "vmp_wanted", + "vmp_tabled", + "vmp_hashed", + "vmp_fictitious", + "vmp_clustered", + "vmp_pmapped", + "vmp_xpmapped", + "vmp_free_when_done", + "vmp_absent", + "vmp_error", + "vmp_dirty", + "vmp_cleaning", + "vmp_precious", + "vmp_overwriting", + "vmp_restart", + "vmp_unusual", + "vmp_cs_validated", + "vmp_cs_tainted", + "vmp_cs_nx", + "vmp_reusable", + "vmp_lopage", + "vmp_written_by_kernel", + "vmp_unused_object_bits" + ] + attribute_values["bitfield"] = cmd_options["-B"] + if attribute_values["bitfield"] in valid_vmp_bitfields: + attribute_count += 1 + else: + raise ArgumentError("Unknown bitfield: {0:>20s}".format(bitfield)) + + if "-I" in cmd_options: + start_index = kern.GetValueFromAddress(cmd_options["-I"], 'int') + npages = vm_pages_count - start_index + + if "-N" in cmd_options: + npages = kern.GetValueFromAddress(cmd_options["-N"], 'int') + if npages == 0: + raise ArgumentError("You specified -N 0, nothing to be scanned") + + end_index = start_index + npages - 1 + if end_index >= vm_pages_count: + raise ArgumentError("Index range out of bound. vm_pages_count: {0:d}".format(vm_pages_count)) + + header_after_n_lines = 40 + format_string = "{0: >26s}{1: >#20x}{2: >10d}{3: >#20x}{4: >#20x}{5: >#16x}" + + found_in_array = 0 + if scan_vmpages_array: + print "Scanning vm_pages[{0:d} to {1:d}] for {2:d} matching attribute(s)......".format(start_index, end_index, attribute_count) + i = start_index + while i <= end_index: + page = vm_pages[i] + if match_vm_page_attributes(page, attribute_values) == attribute_count: + if found_in_array % header_after_n_lines == 0: + print ScanVMPages.header + + print format_string.format(str(i), addressof(page), page.vmp_q_state, _vm_page_unpack_ptr(page.vmp_object), page.vmp_offset, _vm_page_get_phys_page(addressof(page))) + found_in_array += 1 + + i += 1 + + found_in_zone = 0 + if scan_vmpages_zone: + page_size = kern.GetGlobalVariable('page_size') + num_zones = kern.GetGlobalVariable('num_zones') + zone_array = kern.GetGlobalVariable('zone_array') + print "Scanning vm.pages zone for {0:d} matching attribute(s)......".format(attribute_count) + i = 0 + while i < num_zones: + zone = zone_array[i] + if str(zone.zone_name) == "vm pages": + break; + i += 1 + + if i == num_zones: + print "Cannot find vm_pages zone, skip the scan" + else: + print "Scanning page queues in the vm_pages zone..." + elements = FindAllocatedElementsInZone(zone) + for elem in elements: + page = kern.GetValueFromAddress(elem, 'vm_page_t') + + if match_vm_page_attributes(page, attribute_values) == attribute_count: + if found_in_zone % header_after_n_lines == 0: + print ScanVMPages.header + + vm_object = _vm_page_unpack_ptr(page.vmp_object) + phys_page = _vm_page_get_phys_page(page) + print format_string.format("vm_pages zone", elem, page.vmp_q_state, vm_object, page.vmp_offset, phys_page) + found_in_zone += 1 + + total = found_in_array + found_in_zone + print "Found {0:d} vm pages ({1:d} in array, {2:d} in zone) matching the requested {3:d} attribute(s)".format(total, found_in_array, found_in_zone, attribute_count) + +#EndMacro scan_vm_pages + VM_PAGE_IS_WIRED = 1 @header("{0: <10s} of {1: <10s} {2: <20s} {3: <20s} {4: <20s} {5: <10s} {6: <5s}\t {7: <28s}\t{8: <50s}".format("index", "total", "vm_page_t", "offset", "next", "phys_page", "wire#", "first bitfield", "second bitfield")) -@lldb_command('vmobjectwalkpages', 'SBNQP:') +@lldb_command('vmobjectwalkpages', 'CSBNQP:O:') def VMObjectWalkPages(cmd_args=None, cmd_options={}): """ Print the resident pages contained in the provided object. If a vm_page_t is provided as well, we specifically look for this page, highlighting it in the output or noting if it was not found. For @@ -3338,11 +3648,13 @@ def VMObjectWalkPages(cmd_args=None, cmd_options={}): see and compare this to the object's resident page count field. Usage: vmobjectwalkpages : Walk and print all the pages for a given object (up to 4K pages by default) + vmobjectwalkpages -C : list pages in compressor after processing resident pages vmobjectwalkpages -B : Walk and print all the pages for a given object (up to 4K pages by default), traversing the memq backwards vmobjectwalkpages -N : Walk and print all the pages for a given object, ignore the page limit vmobjectwalkpages -Q : Walk all pages for a given object, looking for known signs of corruption (i.e. q_state == VM_PAGE_IS_WIRED && wire_count == 0) vmobjectwalkpages -P : Walk all the pages for a given object, annotate the specified page in the output with *** vmobjectwalkpages -P -S : Walk all the pages for a given object, stopping when we find the specified page + vmobjectwalkpages -O : Like -P, but looks for given offset """ @@ -3357,10 +3669,14 @@ def VMObjectWalkPages(cmd_args=None, cmd_options={}): if "-P" in cmd_options: page = kern.GetValueFromAddress(cmd_options['-P'], 'vm_page_t') + off = -1 + if "-O" in cmd_options: + off = kern.GetValueFromAddress(cmd_options['-O'], 'vm_offset_t') + stop = 0 if "-S" in cmd_options: - if page == 0: - raise ArgumentError("-S can only be passed when a page is specified with -P") + if page == 0 and off < 0: + raise ArgumentError("-S can only be passed when a page is specified with -P or -O") stop = 1 walk_backwards = False @@ -3385,6 +3701,10 @@ def VMObjectWalkPages(cmd_args=None, cmd_options={}): if "-N" in cmd_options: ignore_limit = 1 + show_compressed = 0 + if "-C" in cmd_options: + show_compressed = 1 + page_count = 0 res_page_count = unsigned(obj.resident_page_count) page_found = False @@ -3397,7 +3717,11 @@ def VMObjectWalkPages(cmd_args=None, cmd_options={}): out_string += "******" page_found = True - if page != 0 or quiet_mode: + if (off > 0 and not(page_found) and vmp.vmp_offset == off): + out_string += "******" + page_found = True + + if page != 0 or off > 0 or quiet_mode: if (page_count % 1000) == 0: print "traversed %d pages ...\n" % (page_count) else: @@ -3457,7 +3781,7 @@ def VMObjectWalkPages(cmd_args=None, cmd_options={}): if (page_count >= limit and not(ignore_limit)): print out_string + "Limit reached (%d pages), stopping..." % (limit) - return + break print out_string @@ -3468,8 +3792,30 @@ def VMObjectWalkPages(cmd_args=None, cmd_options={}): if (page != 0): print("page found? : %s\n" % page_found) + if (off > 0): + print("page found? : %s\n" % page_found) + print("Object reports resident page count of %d, we saw %d pages when we walked the resident list.\n" % (unsigned(obj.resident_page_count), unsigned(page_count))) + if show_compressed != 0 and obj.pager != 0 and unsigned(obj.pager.mo_pager_ops) == unsigned(addressof(kern.globals.compressor_pager_ops)): + pager = Cast(obj.pager, 'compressor_pager *') + chunks = pager.cpgr_num_slots / 128 + pagesize = kern.globals.page_size + + page_idx = 0 + while page_idx < pager.cpgr_num_slots: + if chunks != 0: + chunk = pager.cpgr_slots.cpgr_islots[page_idx / 128] + slot = chunk[page_idx % 128] + elif pager.cpgr_num_slots > 2: + slot = pager.cpgr_slots.cpgr_dslots[page_idx] + else: + slot = pager.cpgr_slots.cpgr_eslots[page_idx] + + if slot != 0: + print("compressed page for offset: %x slot %x\n" % ((page_idx * pagesize) - obj.paging_offset, slot)) + page_idx = page_idx + 1 + @lldb_command("show_all_apple_protect_pagers") def ShowAllAppleProtectPagers(cmd_args=None): @@ -3563,23 +3909,23 @@ def ShowJetsamSnapshot(cmd_args=None, cmd_options={}): # Dumps the snapshot header info print lldb_run_command('p *memorystatus_jetsam_snapshot') - hdr_format = "{0: >32s} {1: >5s} {2: >4s} {3: >6s} {4: >6s} {5: >20s} {6: >20s} {7: >20s} {8: >5s} {9: >10s} {10: >6s} {11: >6s} {12: >10s} {13: >15s} {14: >15s} {15: >15s} {16: >15s}" + hdr_format = "{0: >32s} {1: >5s} {2: >4s} {3: >6s} {4: >6s} {5: >20s} {6: >20s} {7: >20s} {8: >5s} {9: >10s} {10: >6s} {11: >6s} {12: >10s} {13: >15s} {14: >15s} {15: >15s}" if (show_footprint_details == True): - hdr_format += "{17: >15s} {18: >15s} {19: >12s} {20: >12s} {21: >17s} {22: >10s} {23: >13s} {24: >10s}" + hdr_format += "{16: >15s} {17: >15s} {18: >12s} {19: >12s} {20: >17s} {21: >10s} {22: >13s} {23: >10s}" if (show_footprint_details == False): - print hdr_format.format('command', 'index', 'pri', 'cid', 'pid', 'starttime', 'killtime', 'idletime', 'kill', '#ents', 'fds', 'gen', 'state', 'footprint', 'max', 'purgeable', 'lifetimeMax') - print hdr_format.format('', '', '', '', '', '(abs)', '(abs)', '(abs)', 'cause', '', '', 'Count', '', '(pages)', '(pages)', '(pages)', '(pages)') + print hdr_format.format('command', 'index', 'pri', 'cid', 'pid', 'starttime', 'killtime', 'idletime', 'kill', '#ents', 'fds', 'gen', 'state', 'footprint', 'purgeable', 'lifetimeMax') + print hdr_format.format('', '', '', '', '', '(abs)', '(abs)', '(abs)', 'cause', '', '', 'Count', '', '(pages)', '(pages)', '(pages)') else: - print hdr_format.format('command', 'index', 'pri', 'cid', 'pid', 'starttime', 'killtime', 'idletime', 'kill', '#ents', 'fds', 'gen', 'state', 'footprint', 'max', 'purgeable', 'lifetimeMax', '|| internal', 'internal_comp', 'iokit_mapped', 'purge_nonvol', 'purge_nonvol_comp', 'alt_acct', 'alt_acct_comp', 'page_table') - print hdr_format.format('', '', '', '', '', '(abs)', '(abs)', '(abs)', 'cause', '', '', 'Count', '', '(pages)', '(pages)', '(pages)', '(pages)', '(pages)', '(pages)', '(pages)', '(pages)', '(pages)', '(pages)', '(pages)', '(pages)') + print hdr_format.format('command', 'index', 'pri', 'cid', 'pid', 'starttime', 'killtime', 'idletime', 'kill', '#ents', 'fds', 'gen', 'state', 'footprint', 'purgeable', 'lifetimeMax', '|| internal', 'internal_comp', 'iokit_mapped', 'purge_nonvol', 'purge_nonvol_comp', 'alt_acct', 'alt_acct_comp', 'page_table') + print hdr_format.format('', '', '', '', '', '(abs)', '(abs)', '(abs)', 'cause', '', '', 'Count', '', '(pages)', '(pages)', '(pages)', '(pages)', '(pages)', '(pages)', '(pages)', '(pages)', '(pages)', '(pages)', '(pages)') entry_format = "{e.name: >32s} {index: >5d} {e.priority: >4d} {e.jse_coalition_jetsam_id: >6d} {e.pid: >6d} "\ "{e.jse_starttime: >20d} {e.jse_killtime: >20d} "\ "{e.jse_idle_delta: >20d} {e.killed: >5d} {e.jse_memory_region_count: >10d} "\ - "{e.fds: >6d} {e.jse_gencount: >6d} {e.state: >10x} {e.pages: >15d} {e.max_pages: >15d} "\ + "{e.fds: >6d} {e.jse_gencount: >6d} {e.state: >10x} {e.pages: >15d} "\ "{e.purgeable_pages: >15d} {e.max_pages_lifetime: >15d}" if (show_footprint_details == True): @@ -3595,7 +3941,7 @@ def ShowJetsamSnapshot(cmd_args=None, cmd_options={}): snapshot_list = kern.globals.memorystatus_jetsam_snapshot.entries idx = 0 while idx < count: - current_entry = Cast(snapshot_list[idx], 'jetsam_snapshot_entry') + current_entry = dereference(Cast(addressof(snapshot_list[idx]), 'jetsam_snapshot_entry *')) print entry_format.format(index=idx, e=current_entry) idx +=1 return diff --git a/tools/lldbmacros/misc.py b/tools/lldbmacros/misc.py index effb5ea28..237927c69 100755 --- a/tools/lldbmacros/misc.py +++ b/tools/lldbmacros/misc.py @@ -942,10 +942,24 @@ def DumpRawTraceFile(cmd_args=[], cmd_options={}): # XXX condition here is on __LP64__ if lp64 : tempbuf += struct.pack('QQQQQQIIQ', - e.timestamp, e.arg1, e.arg2, e.arg3, e.arg4, e.arg5, e.debugid, e.cpuid, e.unused) + unsigned(e.timestamp), + unsigned(e.arg1), + unsigned(e.arg2), + unsigned(e.arg3), + unsigned(e.arg4), + unsigned(e.arg5), + unsigned(e.debugid), + unsigned(e.cpuid), + unsigned(e.unused)) else : - tempbuf += struct.pack('QIIIIII', - e.timestamp, e.arg1, e.arg2, e.arg3, e.arg4, e.arg5, e.debugid) + tempbuf += struct.pack('QIIIIII', + unsigned(e.timestamp), + unsigned(e.arg1), + unsigned(e.arg2), + unsigned(e.arg3), + unsigned(e.arg4), + unsigned(e.arg5), + unsigned(e.debugid)) # Watch for out of order timestamps if earliest_time < (htab[min_kdbp].kd_prev_timebase & KDBG_TIMESTAMP_MASK) : diff --git a/tools/lldbmacros/net.py b/tools/lldbmacros/net.py index 2c6cfd876..c7777f86b 100755 --- a/tools/lldbmacros/net.py +++ b/tools/lldbmacros/net.py @@ -1,17 +1,31 @@ """ Please make sure you read the README COMPLETELY BEFORE reading anything below. - It is very critical that you read coding guidelines in Section E in README file. + It is very critical that you read coding guidelines in Section E in README file. """ - from xnu import * from utils import * from string import * from socket import * +import tempfile import xnudefines from netdefines import * from routedefines import * +def GetDlilIfFlagsAsString(dlil_if_flags): + """ Return a formatted string description of the dlil interface flags + """ + out_string = "" + flags = (unsigned)(dlil_if_flags & 0xffff) + i = 0 + num = 1 + while num <= flags: + if flags & num: + out_string += dlil_if_flags_strings[i] + "," + i += 1 + num = num << 1 + return rstrip(out_string, ",") + def GetIfFlagsAsString(if_flags): """ Return a formatted string description of the interface flags """ @@ -21,7 +35,7 @@ def GetIfFlagsAsString(if_flags): num = 1 while num <= flags: if flags & num: - out_string += if_flags_strings[i] + "," + out_string += if_flags_strings[i] + "," i += 1 num = num << 1 return rstrip(out_string, ",") @@ -36,6 +50,7 @@ def ShowIfConfiguration(ifnet): format_string = "{0: index {3: " out_string += "\n\t(struct ifnet *)" + hex(ifnet) if iface.if_snd.ifcq_len : out_string += "\n\t" + str(iface.if_snd.ifcq_len) @@ -53,6 +68,51 @@ def GetIfConfiguration(ifname): return ifnet return None +# Macro: net_get_always_on_pktap +@lldb_command('net_get_always_on_pktap') +def NetGetAlwaysOnPktap(cmd_args=None): + """ Dump the always-on packet capture to /tmp/dump.pktap + """ + for i in range(0, 10): + ifnet = GetIfConfiguration("pktap"+str(i)) + if not ifnet: + continue + if ifnet.if_bpf == 0: + ifnet = None + continue + if ifnet.if_bpf.bif_dlist.bd_headdrop == 0: + ifnet = None + continue + + break + + if not ifnet: + print "Could not find a pktap interface" + return + + bpf_d = ifnet.if_bpf.bif_dlist + + f = tempfile.NamedTemporaryFile(prefix="dump-", suffix=".pktap", dir="/tmp/", mode="wb", delete=False) + + err = lldb.SBError() + + if bpf_d.bd_hbuf != 0: + addr = bpf_d.bd_hbuf[0]._sbval19k84obscure747.AddressOf().GetValueAsUnsigned() + buf = LazyTarget.GetProcess().ReadMemory(addr, unsigned(bpf_d.bd_hlen), err) + if err.fail: + print "Error, getting sbuf" + f.write(buf) + + addr = bpf_d.bd_sbuf[0]._sbval19k84obscure747.AddressOf().GetValueAsUnsigned() + buf = LazyTarget.GetProcess().ReadMemory(addr, unsigned(bpf_d.bd_slen), err) + if err.fail: + print "Error, getting sbuf" + f.write(buf) + + print f.name + f.close() +# EndMacro: net_get_always_on_pktap + # Macro: ifconfig @lldb_command('ifconfig') def ShowIfconfig(cmd_args=None) : @@ -70,9 +130,20 @@ def ShowIfconfig(cmd_args=None) : print GetIfaddrs(ifnet) # EndMacro: ifconfig +#Macro: ifconfig_dlil +@lldb_command('ifconfig_dlil') +def ShowIfconfigDlil(cmd_args=None) : + """ Display ifconfig-like output for DLIL interface list, print (struct ifnet *) pointer and dlil info for further inspection + """ + dlil_ifnets = kern.globals.dlil_ifnet_head + for dlil_ifnet in IterateTAILQ_HEAD(dlil_ifnets, "dl_if_link"): + ShowIfConfiguration(dlil_ifnet) + print GetIfaddrs(Cast(dlil_ifnet, 'ifnet *')) +# EndMacro: ifconfig_dlil + def GetAddressAsStringColonHex(addr, count): out_string = "" - i = 0 + i = 0 addr_format_string = "{0:02x}" while (i < count): if (i == 0): @@ -92,7 +163,7 @@ def GetSocketAddrAsStringUnix(sockaddr): if (sock_unix == 0): return "(null)" else: - if (len(str(sock_unix.sun_path)) > 0): + if (len(str(sock_unix.sun_path)) > 0): return str(sock_unix.sun_path) else: return "\"\"" @@ -100,7 +171,7 @@ def GetSocketAddrAsStringUnix(sockaddr): def GetInAddrAsString(ia): out_string = "" inaddr = Cast(ia, 'in_addr *') - + packed_value = struct.pack('I', unsigned(ia.s_addr)) out_string = inet_ntoa(packed_value) return out_string @@ -115,7 +186,7 @@ def GetIn6AddrAsString(ia): def GetSocketAddrAsStringInet(sockaddr): sock_in = Cast(sockaddr, 'sockaddr_in *') - return GetInAddrAsString(sock_in.sin_addr) + return GetInAddrAsString(addressof(sock_in.sin_addr)) def GetSocketAddrAsStringInet6(sockaddr): sock_in6 = Cast(sockaddr, 'sockaddr_in6 *') @@ -132,7 +203,7 @@ def GetSocketAddrAsStringLink(sockaddr): else: out_string += GetAddressAsStringColonHex(addressof(sock_link.sdl_data[sock_link.sdl_nlen]), sock_link.sdl_alen) return out_string - + def GetSocketAddrAsStringAT(sockaddr): out_string = "" sock_addr = Cast(sockaddr, 'sockaddr *') @@ -206,7 +277,7 @@ def GetCapabilitiesAsString(flags): num = 1 while num <= flags: if flags & num: - out_string += if_capenable_strings[i] + "," + out_string += if_capenable_strings[i] + "," i += 1 num = num << 1 return rstrip(out_string, ",") @@ -260,7 +331,7 @@ def ShowDlilIfnetConfiguration(dlil_ifnet, show_all) : @lldb_command('showifnets') def ShowIfnets(cmd_args=None) : """ Display ifconfig-like output for all attached and detached interfaces - """ + """ showall = 0 if cmd_args != None and len(cmd_args) > 0 : showall = 1 @@ -394,9 +465,9 @@ def GetSocketProtocolAsString(sock): def GetInAddr4to6AsString(inaddr): out_string = "" if (inaddr is not None): - ia = Cast(inaddr, 'char *') - inaddr_format_string = "{0: 0 : proc = kern.GetValueFromAddress(cmd_args[0], 'proc *') - proc_fd = proc.p_fd if not proc: print "Unknown value passed as argument." return else: - count = 0 - fpp = Cast(proc_fd.fd_ofiles, 'fileproc **') - while (count < proc_fd.fd_nfiles): - fp = Cast(dereference(fpp), 'fileproc *') - if (fp != 0): - fg = Cast(fp.f_fglob, 'fileglob *') - if (int(fg.fg_ops.fo_type) == 2): - if (proc_fd.fd_ofileflags[count] & 4): - out_string += "U: " - else: - out_string += " " - out_string += "fd = " + str(count) + " " - if (fg.fg_data != 0): - out_string += GetSocket(unsigned(fg.fg_data)) - out_string += "\n" - else: - out_string += "" - fpp = kern.GetValueFromAddress(unsigned(fpp + 8), 'fileproc **') - count += 1 - print out_string + print GetProcInfo(proc) + print GetProcSockets(proc, total_snd_cc, total_rcv_cc) else: print "Missing argument 0 in user function." # EndMacro: showprocsockets -def GetProcSockets(proc): - """ Given a proc_t pointer, display information about its sockets - """ - out_string = "" - proc_fd = proc.p_fd - - if proc is None: - out_string += "Unknown value passed as argument." - else: - count = 0 - fpp = Cast(proc_fd.fd_ofiles, 'fileproc **') - while (count < proc_fd.fd_nfiles): - fp = Cast(dereference(fpp), 'fileproc *') - if (fp != 0): - fg = Cast(fp.f_fglob, 'fileglob *') - if (int(fg.fg_ops.fo_type) == 2): - if (proc_fd.fd_ofileflags[count] & 4): - out_string += "U: " - else: - out_string += " " - out_string += "fd = " + str(count) + " " - if (fg.fg_data != 0): - out_string += GetSocket(unsigned(fg.fg_data)) - out_string += "\n" - else: - out_string += "" - fpp = kern.GetValueFromAddress(unsigned(fpp + 8), 'fileproc **') - count += 1 - return out_string - - # Macro: showallprocsockets @lldb_command('showallprocsockets') def ShowAllProcSockets(cmd_args=None): """Display information about the sockets of all the processes """ + total_snd_cc = [0] + total_rcv_cc = [0] for proc in kern.procs: print "================================================================================" print GetProcInfo(proc) - print GetProcSockets(proc) + print GetProcSockets(proc, total_snd_cc, total_rcv_cc) + print ("total_snd_cc: " + str(int(total_snd_cc[0])) + " total_rcv_cc: " + str(int(total_rcv_cc[0])) + "\n") # EndMacro: showallprocsockets @@ -596,7 +667,7 @@ def GetRtEntryPrDetailsAsString(rte): dst_string_format = "{0:<18s}" if (dst.sa_family == AF_INET): out_string += dst_string_format.format(GetSocketAddrAsStringInet(dst)) + " " - else: + else: if (dst.sa_family == AF_INET6): out_string += dst_string_format.format(GetSocketAddrAsStringInet6(dst)) + " " isv6 = 1 @@ -696,7 +767,7 @@ def GetRtEntryPrDetailsAsString(rte): out_string += str(int(rt.rt_ifp.if_unit)) out_string += "\n" return out_string - + RNF_ROOT = 2 def GetRtTableAsString(rt_tables): @@ -737,26 +808,26 @@ def GetRtInetAsString(): rt_tables = kern.globals.rt_tables[2] if (kern.ptrsize == 8): rt_table_header_format_string = "{0:<18s} {1: <16s} {2:<20s} {3:<16s} {4:<8s} {5:<8s} {6:<8s}" - print rt_table_header_format_string.format("rtentry", " dst", "gw", "parent", "Refs", "Use", "flags/if") + print rt_table_header_format_string.format("rtentry", " dst", "gw", "parent", "Refs", "Use", "flags/if") print rt_table_header_format_string.format("-" * 18, "-" * 16, "-" * 16, "-" * 16, "-" * 8, "-" * 8, "-" * 8) print GetRtTableAsString(rt_tables) else: rt_table_header_format_string = "{0:<8s} {1:<16s} {2:<18s} {3:<8s} {4:<8s} {5:<8s} {6:<8s}" - print rt_table_header_format_string.format("rtentry", "dst", "gw", "parent", "Refs", "Use", "flags/if") - print rt_table_header_format_string.format("-" * 8, "-" * 16, "-" * 16, "-" * 8, "-" * 8, "-" * 8, "-" * 8) + print rt_table_header_format_string.format("rtentry", "dst", "gw", "parent", "Refs", "Use", "flags/if") + print rt_table_header_format_string.format("-" * 8, "-" * 16, "-" * 16, "-" * 8, "-" * 8, "-" * 8, "-" * 8) print GetRtTableAsString(rt_tables) def GetRtInet6AsString(): rt_tables = kern.globals.rt_tables[30] if (kern.ptrsize == 8): rt_table_header_format_string = "{0:<18s} {1: <16s} {2:<20s} {3:<16s} {4:<8s} {5:<8s} {6:<8s}" - print rt_table_header_format_string.format("rtentry", " dst", "gw", "parent", "Refs", "Use", "flags/if") + print rt_table_header_format_string.format("rtentry", " dst", "gw", "parent", "Refs", "Use", "flags/if") print rt_table_header_format_string.format("-" * 18, "-" * 16, "-" * 16, "-" * 16, "-" * 8, "-" * 8, "-" * 8) print GetRtTableAsString(rt_tables) else: rt_table_header_format_string = "{0:<8s} {1:<16s} {2:<18s} {3:<8s} {4:<8s} {5:<8s} {6:<8s}" - print rt_table_header_format_string.format("rtentry", "dst", "gw", "parent", "Refs", "Use", "flags/if") - print rt_table_header_format_string.format("-" * 8, "-" * 16, "-" * 18, "-" * 8, "-" * 8, "-" * 8, "-" * 8) + print rt_table_header_format_string.format("rtentry", "dst", "gw", "parent", "Refs", "Use", "flags/if") + print rt_table_header_format_string.format("-" * 8, "-" * 16, "-" * 18, "-" * 8, "-" * 8, "-" * 8, "-" * 8) print GetRtTableAsString(rt_tables) # Macro: show_rt_inet @@ -764,7 +835,7 @@ def GetRtInet6AsString(): def ShowRtInet(cmd_args=None): """ Display the IPv4 routing table """ - print GetRtInetAsString() + print GetRtInetAsString() # EndMacro: show_rt_inet # Macro: show_rt_inet6 @@ -824,7 +895,7 @@ def ShowRtEntryDebug(cmd_args=None): out_string += "\n" ix += 1 cnt += 1 - + cnt = 0 while (cnt < RTD_TRACE_HIST_SIZE): ix = 0 @@ -838,7 +909,7 @@ def ShowRtEntryDebug(cmd_args=None): out_string += "\n" ix += 1 cnt += 1 - + out_string += "\nTotal locks : " + str(int(rtd.rtd_lock_cnt)) out_string += "\nTotal unlocks : " + str(int(rtd.rtd_unlock_cnt)) @@ -855,7 +926,7 @@ def ShowRtEntryDebug(cmd_args=None): out_string += "\n" ix += 1 cnt += 1 - + cnt = 0 while (cnt < RTD_TRACE_HIST_SIZE): ix = 0 @@ -1474,20 +1545,23 @@ def GetInPcb(pcb, proto): if (proto == IPPROTO_TCP): out_string += " tcp" + elif (proto == IPPROTO_UDP): + out_string += " udp" + elif (proto == IPPROTO_RAW): + out_string += " raw" else: - if (proto == IPPROTO_UDP): - out_string += " udp" - else: - out_string += str(proto) + "." + out_string += str(proto) + "." + if (pcb.inp_vflag & INP_IPV4): out_string += "4 " if (pcb.inp_vflag & INP_IPV6): out_string += "6 " if (pcb.inp_vflag & INP_IPV4): - out_string += " " + out_string += " " out_string += GetInAddrAsString(addressof(pcb.inp_dependladdr.inp46_local.ia46_addr4)) else: + out_string += " " out_string += GetIn6AddrAsString((pcb.inp_dependladdr.inp6_local.__u6_addr.__u6_addr8)) out_string += " " @@ -1495,7 +1569,7 @@ def GetInPcb(pcb, proto): out_string += " " if (pcb.inp_vflag & INP_IPV4): - out_string += " " + out_string += " " out_string += GetInAddrAsString(addressof(pcb.inp_dependfaddr.inp46_foreign.ia46_addr4)) else: out_string += GetIn6AddrAsString((pcb.inp_dependfaddr.inp6_foreign.__u6_addr.__u6_addr8)) @@ -1507,6 +1581,7 @@ def GetInPcb(pcb, proto): if (proto == IPPROTO_TCP): out_string += GetTcpState(pcb.inp_ppcb) + out_string += "\n\t" if (pcb.inp_flags & INP_RECVOPTS): out_string += "recvopts " if (pcb.inp_flags & INP_RECVRETOPTS): @@ -1577,27 +1652,58 @@ def GetInPcb(pcb, proto): out_string += "in_fctree " if (pcb.inp_flags2 & INP2_WANT_APP_POLICY): out_string += "want_app_policy " - + + out_string += "\n\t" so = pcb.inp_socket if (so != 0): - out_string += "[so=" + str(so) + " s=" + str(int(so.so_snd.sb_cc)) + " r=" + str(int(so.so_rcv.sb_cc)) + " usecnt=" + str(int(so.so_usecount)) + "] " + out_string += "so=" + str(so) + " s=" + str(int(so.so_snd.sb_cc)) + " r=" + str(int(so.so_rcv.sb_cc)) + " usecnt=" + str(int(so.so_usecount)) + ", " if (pcb.inp_state == 0 or pcb.inp_state == INPCB_STATE_INUSE): - out_string += "inuse, " + out_string += "inuse" else: if (pcb.inp_state == INPCB_STATE_DEAD): - out_string += "dead, " + out_string += "dead" else: - out_string += "unknown (" + str(int(pcb.inp_state)) + "), " + out_string += "unknown (" + str(int(pcb.inp_state)) + ")" return out_string +def CalcMbufInList(mpkt, pkt_cnt, buf_byte_cnt, mbuf_cnt, mbuf_cluster_cnt): + while (mpkt != 0): + mp = mpkt + mpkt = mpkt.m_hdr.mh_nextpkt + pkt_cnt[0] +=1 + while (mp != 0): + mbuf_cnt[0] += 1 + buf_byte_cnt[int(mp.m_hdr.mh_type)] += 256 + buf_byte_cnt[Mbuf_Type.MT_LAST] += 256 + if (mp.m_hdr.mh_flags & 0x01): + mbuf_cluster_cnt[0] += 1 + buf_byte_cnt[int(mp.m_hdr.mh_type)] += mp.M_dat.MH.MH_dat.MH_ext.ext_size + buf_byte_cnt[Mbuf_Type.MT_LAST] += mp.M_dat.MH.MH_dat.MH_ext.ext_size + mp = mp.m_hdr.mh_next + +def CalcMbufInSB(so, snd_cc, snd_buf, rcv_cc, rcv_buf, snd_record_cnt, rcv_record_cnt, snd_mbuf_cnt, rcv_mbuf_cnt, snd_mbuf_cluster_cnt, rcv_mbuf_cluster_cnt): + snd_cc[0] += so.so_snd.sb_cc + mpkt = so.so_snd.sb_mb + CalcMbufInList(mpkt, snd_record_cnt, snd_buf, snd_mbuf_cnt, snd_mbuf_cluster_cnt) + rcv_cc[0] += so.so_rcv.sb_cc + mpkt = so.so_rcv.sb_mb + CalcMbufInList(mpkt, rcv_record_cnt, rcv_buf, rcv_mbuf_cnt, rcv_mbuf_cluster_cnt) + def GetPcbInfo(pcbi, proto): + tcp_reassqlen = 0 out_string = "" - snd_cc = 0 - snd_buf = unsigned(0) - rcv_cc = 0 - rcv_buf = unsigned(0) + snd_mbuf_cnt = [0] + snd_mbuf_cluster_cnt = [0] + snd_record_cnt = [0] + snd_cc = [0] + snd_buf = [0] * (Mbuf_Type.MT_LAST + 1) + rcv_mbuf_cnt = [0] + rcv_mbuf_cluster_cnt = [0] + rcv_record_cnt = [0] + rcv_cc = [0] + rcv_buf = [0] * (Mbuf_Type.MT_LAST + 1) pcbseen = 0 out_string += "lastport " + str(int(pcbi.ipi_lastport)) + " lastlow " + str(int(pcbi.ipi_lastlow)) + " lasthi " + str(int(pcbi.ipi_lasthi)) + "\n" out_string += "active pcb count is " + str(int(pcbi.ipi_count)) + "\n" @@ -1605,41 +1711,52 @@ def GetPcbInfo(pcbi, proto): out_string += "hash size is " + str(int(hashsize)) + "\n" out_string += str(pcbi.ipi_hashbase) + " has the following inpcb(s):\n" if (kern.ptrsize == 8): - out_string += "pcb proto source address port destination address port\n" + out_string += "pcb proto source port destination port\n" else: out_string += "pcb proto source address port destination address port\n\n" - i = 0 - hashbase = pcbi.ipi_hashbase - while (i < hashsize): - head = hashbase[i] + if proto == IPPROTO_RAW: + head = cast(pcbi.ipi_listhead, 'inpcbhead *') pcb = cast(head.lh_first, 'inpcb *') while pcb != 0: pcbseen += 1 out_string += GetInPcb(pcb, proto) + "\n" so = pcb.inp_socket if so != 0: - snd_cc += so.so_snd.sb_cc - mp = so.so_snd.sb_mb - while mp != 0: - snd_buf += 256 - if (mp.m_hdr.mh_flags & 0x01): - snd_buf += mp.M_dat.MH.MH_dat.MH_ext.ext_size - mp = mp.m_hdr.mh_next - rcv_cc += so.so_rcv.sb_cc - mp = so.so_rcv.sb_mb - while mp != 0: - rcv_buf += 256 - if (mp.m_hdr.mh_flags & 0x01): - rcv_buf += mp.M_dat.MH.MH_dat.MH_ext.ext_size - mp = mp.m_hdr.mh_next - pcb = cast(pcb.inp_hash.le_next, 'inpcb *') - i += 1 - - out_string += "total seen " + str(int(pcbseen)) + " snd_cc " + str(int(snd_cc)) + " rcv_cc " + str(int(rcv_cc)) + "\n" - out_string += "total snd_buf " + str(int(snd_buf)) + " rcv_buf " + str(int(rcv_buf)) + "\n" - out_string += "port hash base is " + hex(pcbi.ipi_porthashbase) + "\n" - + CalcMbufInSB(so, snd_cc, snd_buf, rcv_cc, rcv_buf, snd_record_cnt, rcv_record_cnt, snd_mbuf_cnt, rcv_mbuf_cnt, snd_mbuf_cluster_cnt, rcv_mbuf_cluster_cnt) + pcb = cast(pcb.inp_list.le_next, 'inpcb *') + else: + i = 0 + hashbase = pcbi.ipi_hashbase + while (i < hashsize): + head = hashbase[i] + pcb = cast(head.lh_first, 'inpcb *') + while pcb != 0: + pcbseen += 1 + out_string += GetInPcb(pcb, proto) + "\n" + so = pcb.inp_socket + if so != 0: + CalcMbufInSB(so, snd_cc, snd_buf, rcv_cc, rcv_buf, snd_record_cnt, rcv_record_cnt, snd_mbuf_cnt, rcv_mbuf_cnt, snd_mbuf_cluster_cnt, rcv_mbuf_cluster_cnt) + if proto == IPPROTO_TCP and pcb.inp_ppcb: + tcpcb = cast(pcb.inp_ppcb, 'tcpcb *') + tcp_reassqlen += tcpcb.t_reassqlen + + pcb = cast(pcb.inp_hash.le_next, 'inpcb *') + i += 1 + + out_string += "total pcbs seen: " + str(int(pcbseen)) + "\n" + out_string += "total send mbuf count: " + str(int(snd_mbuf_cnt[0])) + " receive mbuf count: " + str(int(rcv_mbuf_cnt[0])) + "\n" + out_string += "total send mbuf cluster count: " + str(int(snd_mbuf_cluster_cnt[0])) + " receive mbuf cluster count: " + str(int(rcv_mbuf_cluster_cnt[0])) + "\n" + out_string += "total send record count: " + str(int(snd_record_cnt[0])) + " receive record count: " + str(int(rcv_record_cnt[0])) + "\n" + out_string += "total snd_cc (total bytes in send buffers): " + str(int(snd_cc[0])) + " rcv_cc (total bytes in receive buffers): " + str(int(rcv_cc[0])) + "\n" + out_string += "total snd_buf bytes " + str(int(snd_buf[Mbuf_Type.MT_LAST])) + " rcv_buf bytes " + str(int(rcv_buf[Mbuf_Type.MT_LAST])) + "\n" + for x in range(Mbuf_Type.MT_LAST): + if (snd_buf[x] != 0 or rcv_buf[x] != 0): + out_string += "total snd_buf bytes of type " + Mbuf_Type.reverse_mapping[x] + " : " + str(int(snd_buf[x])) + " total recv_buf bytes of type " + Mbuf_Type.reverse_mapping[x] + " : " + str(int(rcv_buf[x])) + "\n" + out_string += "port hash base is " + hex(pcbi.ipi_porthashbase) + "\n" + if proto == IPPROTO_TCP: + out_string += "TCP reassembly queue length: " + str(tcp_reassqlen) + "\n" + i = 0 hashbase = pcbi.ipi_porthashbase while (i < hashsize): @@ -1659,7 +1776,7 @@ def GetInPcbPort(ppcb): out_string += hex(ppcb) + ": lport " out_string += Getntohs(ppcb.phd_port) return out_string - + def Getntohs(port): out_string = "" @@ -1697,23 +1814,26 @@ def ShowKernEventPcbInfo(cmd_args=None): def GetKernControlPcbInfo(ctl_head): out_string = "" kctl = Cast(ctl_head.tqh_first, 'kctl *') - if (kern.ptrsize == 8): - kcb_format_string = "0x{0:<16x} {1:4d} {2:10d}\n" + if (kern.ptrsize == 8): + kcb_format_string = "0x{0:<16x} {1:10d} {2:10d} {3:10d}\n" else: - kcb_format_string = "0x{0:<8x} {1:4d} {2:10d}\n" + kcb_format_string = "0x{0:<8x} {1:10d} {2:10d} {3:10d}\n" while unsigned(kctl) != 0: kctl_name = "controller: " + str(kctl.name) + "\n" out_string += kctl_name kcb = Cast(kctl.kcb_head.tqh_first, 'ctl_cb *') if unsigned(kcb) != 0: if (kern.ptrsize == 8): - out_string += "socket unit usecount\n" - out_string += "------ ---- --------\n" + out_string += "socket usecount snd_cc rcv_cc\n" + out_string += "------ -------- ------ ------\n" else: - out_string += "socket unit usecount\n" - out_string += "------ ---- --------\n" + out_string += "socket usecount snd_cc rcv_cc\n" + out_string += "------ -------- ------ ------\n" while unsigned(kcb) != 0: - out_string += kcb_format_string.format(kcb.so, kcb.unit, kcb.usecount) + so = Cast(kcb.so, 'socket *') + snd_cc = so.so_snd.sb_cc + rcv_cc = so.so_rcv.sb_cc + out_string += kcb_format_string.format(kcb.so, kcb.usecount, snd_cc, rcv_cc) kcb = kcb.next.tqe_next out_string += "\n" kctl = kctl.next.tqe_next @@ -1742,6 +1862,14 @@ def ShowUdpPcbInfo(cmd_args=None): print GetPcbInfo(addressof(kern.globals.udbinfo), IPPROTO_UDP) # EndMacro: show_udp_pcbinfo +# Macro: show_rip_pcbinfo +@lldb_command('show_rip_pcbinfo') +def ShowRipPcbInfo(cmd_args=None): + """ Display the list of Raw IP protocol control block information + """ + print GetPcbInfo(addressof(kern.globals.ripcbinfo), IPPROTO_RAW) +# EndMacro: show_rip_pcbinfo + # Macro: show_tcp_timewaitslots @lldb_command('show_tcp_timewaitslots') def ShowTcpTimeWaitSlots(cmd_args=None): diff --git a/tools/lldbmacros/netdefines.py b/tools/lldbmacros/netdefines.py index 7301b9747..d894ab049 100755 --- a/tools/lldbmacros/netdefines.py +++ b/tools/lldbmacros/netdefines.py @@ -1,3 +1,51 @@ +def enum(*sequential, **named): + enums = dict(zip(sequential, range(len(sequential))), **named) + reverse = dict((value, key) for key, value in enums.iteritems()) + enums['reverse_mapping'] = reverse + return type('Enum', (), enums) + +Mbuf_Type = enum( + 'MT_FREE', + 'MT_DATA', + 'MT_HEADER', + 'MT_SOCKET', + 'MT_PCB', + 'MT_RTABLE', + 'MT_HTABLE', + 'MT_ATABLE', + 'MT_SONAME', + 'MT_SOOPTS', + 'MT_FTABLE', + 'MT_RIGHTS', + 'MT_IFADDR', + 'MT_CONTROL', + 'MT_OOBDATA', + 'MT_TAG', + 'MT_LAST') + +M_EXT = 0x0001 +M_PKTHDR = 0x0002 +M_EOR = 0x0004 +M_PROTO1 = 0x0008 +M_PROTO2 = 0x0010 +M_PROTO3 = 0x0020 +M_LOOP = 0x0040 +M_PROTO5 = 0x0080 + +M_BCAST = 0x0100 +M_MCAST = 0x0200 +M_FRAG = 0x0400 +M_FIRSTFRAG = 0x0800 +M_LASTFRAG = 0x1000 +M_PROMISC = 0x2000 +M_HASFCS = 0x4000 +M_TAGHDR = 0x8000 + +dlil_if_flags_strings = ["DLIF_INUSE", + "DLIF_REUSE", + "DLIF_DEBUG" + ] + if_capenable_strings = ["RXCSUM", "TXCSUM", "VLAN_MTU", @@ -33,6 +81,11 @@ if_flags_strings = ["UP", "MULTICAST" ] +if_refflags_strings = ["IFRF_EMBRYONIC", + "IFRF_ATTACHED", + "IFRF_DETACHING" + ] + if_eflags_strings = ["AUTOCONFIGURING", "unused", "unused", diff --git a/tools/lldbmacros/pmap.py b/tools/lldbmacros/pmap.py index 2424d2d93..8bff2689c 100755 --- a/tools/lldbmacros/pmap.py +++ b/tools/lldbmacros/pmap.py @@ -1050,13 +1050,13 @@ def ShowPTEARM(pte): else: pte_pgoff = pte_pgoff / 4 nttes = page_size / 4 - if ptd.pt_cnt[pt_index].refcnt == 0x4000: + if ptd.ptd_info[pt_index].refcnt == 0x4000: level = 2 granule = nttes * page_size else: level = 3 granule = page_size - print "maps VA: {:#x}".format(long(unsigned(ptd.pt_map[pt_index].va)) + (pte_pgoff * granule)) + print "maps VA: {:#x}".format(long(unsigned(ptd.ptd_info[pt_index].va)) + (pte_pgoff * granule)) pteval = long(unsigned(dereference(kern.GetValueFromAddress(unsigned(pte), 'pt_entry_t *')))) print "value: {:#x}".format(pteval) if kern.arch.startswith('arm64'): diff --git a/tools/lldbmacros/process.py b/tools/lldbmacros/process.py index f37169a3b..cf7afc73c 100755 --- a/tools/lldbmacros/process.py +++ b/tools/lldbmacros/process.py @@ -171,6 +171,7 @@ def GetASTSummary(ast): B - AST_BSD K - AST_KPERF M - AST_MACF + r - AST_RESET_PCS G - AST_GUARD T - AST_TELEMETRY_USER T - AST_TELEMETRY_KERNEL @@ -185,7 +186,7 @@ def GetASTSummary(ast): out_string = "" state = int(ast) thread_state_chars = {0x0:'', 0x1:'P', 0x2:'Q', 0x4:'U', 0x8:'H', 0x10:'Y', 0x20:'A', - 0x40:'L', 0x80:'B', 0x100:'K', 0x200:'M', + 0x40:'L', 0x80:'B', 0x100:'K', 0x200:'M', 0x400: 'r', 0x1000:'G', 0x2000:'T', 0x4000:'T', 0x8000:'T', 0x10000:'S', 0x20000: 'D', 0x40000: 'I', 0x80000: 'E', 0x100000: 'R', 0x200000: 'N'} state_str = '' @@ -605,7 +606,7 @@ def ShowTaskCoalitions(cmd_args=None, cmd_options={}): # EndMacro: showtaskcoalitions @lldb_type_summary(['proc', 'proc *']) -@header("{0: >6s} {1: ^20s} {2: >14s} {3: ^10s} {4: <20s}".format("pid", "process", "io_policy", "wq_state", "command")) +@header("{0: >6s} {1: <18s} {2: >11s} {3: ^10s} {4: <20s}".format("pid", "process", "io_policy", "wq_state", "command")) def GetProcSummary(proc): """ Summarize the process data. params: @@ -614,7 +615,7 @@ def GetProcSummary(proc): str - string summary of the process. """ out_string = "" - format_string= "{0: >6d} {1: >#020x} {2: >14s} {3: >2d} {4: >2d} {5: >2d} {6: <20s}" + format_string= "{0: >6d} {1: <#018x} {2: >11s} {3: >2d} {4: >2d} {5: >2d} {6: <20s}" pval = proc.GetSBValue() #code.interact(local=locals()) if str(pval.GetType()) != str(gettype('proc *')) : @@ -915,7 +916,7 @@ def DumpThreadTerminateQueue(cmd_args=None): count = 0 print GetThreadSummary.header - for th in IterateQueue(addressof(kern.globals.thread_terminate_queue), 'struct thread *', 'q_link'): + for th in IterateMPSCQueue(addressof(kern.globals.thread_terminate_queue.mpd_queue), 'struct thread', 'mpsc_links'): print GetThreadSummary(th) count += 1 print "{0: 20d} {2: >20d} {3: >20d} {4: >20d} {5: <20s}".format(t, - t.task_immediate_writes, - t.task_deferred_writes, - t.task_invalidated_writes, - t.task_metadata_writes, + print "{0: <#18x} {1: >20d} {2: >20d} {3: >20d} {4: >20d} {5: <20s} {6: <20s} {7: <20s} {8: <20s} {9: <20s}".format(t, + t.task_writes_counters_internal.task_immediate_writes, + t.task_writes_counters_internal.task_deferred_writes, + t.task_writes_counters_internal.task_invalidated_writes, + t.task_writes_counters_internal.task_metadata_writes, + t.task_writes_counters_external.task_immediate_writes, + t.task_writes_counters_external.task_deferred_writes, + t.task_writes_counters_external.task_invalidated_writes, + t.task_writes_counters_external.task_metadata_writes, str(pval.p_comm)) -@lldb_command('showalltasks','C') -def ShowAllTasks(cmd_args=None, cmd_options={}): +@lldb_command('showalltasks','C', fancy=True) +def ShowAllTasks(cmd_args=None, cmd_options={}, O=None): """ Routine to print a summary listing of all the tasks wq_state -> reports "number of workq threads", "number of scheduled workq threads", "number of pending work items" if "number of pending work items" seems stuck at non-zero, it may indicate that the workqueue mechanism is hung @@ -994,11 +999,11 @@ def ShowAllTasks(cmd_args=None, cmd_options={}): showcorpse = True extra_hdr += " " + GetKCDataSummary.header - print GetTaskSummary.header + extra_hdr + " " + GetProcSummary.header - for t in kern.tasks: - pval = Cast(t.bsd_info, 'proc *') - out_str = GetTaskSummary(t, showcorpse) + " " + GetProcSummary(pval) - print out_str + with O.table(GetTaskSummary.header + extra_hdr + " " + GetProcSummary.header): + for t in kern.tasks: + pval = Cast(t.bsd_info, 'proc *') + print GetTaskSummary(t, showcorpse) + " " + GetProcSummary(pval) + ZombTasks() @lldb_command('taskforpmap') diff --git a/tools/lldbmacros/scheduler.py b/tools/lldbmacros/scheduler.py index ed22c6ae8..0708c7658 100755 --- a/tools/lldbmacros/scheduler.py +++ b/tools/lldbmacros/scheduler.py @@ -43,7 +43,7 @@ def ShowInterrupts(cmd_args=None): cpu_data_entry = Cast(element, 'cpu_data_t *') print "CPU {} IRQ: {:d}\n".format(y, cpu_data_entry.cpu_stat.irq_ex_cnt) print "CPU {} IPI: {:d}\n".format(y, cpu_data_entry.cpu_stat.ipi_cnt) - print "CPU {} PMI: {:d}\n".format(y, cpu_data_entry.cpu_stat.pmi_cnt) + print "CPU {} PMI: {:d}\n".format(y, cpu_data_entry.cpu_monotonic.mtc_npmis) print "CPU {} TMR: {:d}\n".format(y, cpu_data_entry.cpu_stat.timer_cnt) x = x + 1 y = y + 1 @@ -162,6 +162,127 @@ def ShowCurremtAbsTime(cmd_args=None): print "Last dispatch time known: %d MATUs" % cur_abstime +bucketStr = ["", "FIXPRI (>UI)", "TIMESHARE_FG", "TIMESHARE_IN", "TIMESHARE_DF", "TIMESHARE_UT", "TIMESHARE_BG"] + +@header(" {:>18s} | {:>20s} | {:>20s} | {:>10s} | {:>10s}".format('Thread Group', 'Interactivity Score', 'Last Timeshare Tick', 'pri_shift', 'highq')) +def GetSchedClutchBucketSummary(clutch_bucket): + return " 0x{:>16x} | {:>20d} | {:>20d} | {:>10d} | {:>10d}".format(clutch_bucket.scb_clutch.sc_tg, clutch_bucket.scb_interactivity_score, clutch_bucket.scb_timeshare_tick, clutch_bucket.scb_pri_shift, clutch_bucket.scb_runq.highq) + +def ShowSchedClutchForPset(pset): + root_clutch = pset.pset_clutch_root + print "\n{:s} : {:d}\n\n".format("Current Timestamp", GetRecentTimestamp()) + print "{:>10s} | {:>20s} | {:>30s} | {:>18s} | {:>10s} | {:>10s} | {:>30s} | {:>30s} | {:>15s} | ".format("Root", "Root Buckets", "Clutch Buckets", "Address", "Priority", "Count", "CPU Usage (MATUs)", "CPU Blocked (MATUs)", "Deadline (abs)") + GetSchedClutchBucketSummary.header + print "=" * 300 + print "{:>10s} | {:>20s} | {:>30s} | 0x{:16x} | {:>10d} | {:>10d} | {:>30s} | {:>30s} | {:>15s} | ".format("Root", "*", "*", addressof(root_clutch), root_clutch.scr_priority, root_clutch.scr_thr_count, "*", "*", "*") + print "-" * 300 + + for i in range(1, 7): + root_bucket = root_clutch.scr_buckets[i] + print "{:>10s} | {:>20s} | {:>30s} | 0x{:16x} | {:>10s} | {:>10s} | {:>30s} | {:>30s} | {:>15d} | ".format("*", bucketStr[i], "*", addressof(root_bucket), "*", "*", "*", "*", root_bucket.scrb_deadline) + prioq = root_bucket.scrb_clutch_buckets + clutch_bucket_list = [] + for clutch_bucket in IteratePriorityQueue(prioq, 'struct sched_clutch_bucket', 'scb_pqlink'): + clutch_bucket_list.append(clutch_bucket) + if len(clutch_bucket_list) > 0: + clutch_bucket_list.sort(key=lambda x: x.scb_priority, reverse=True) + for clutch_bucket in clutch_bucket_list: + cpu_used = clutch_bucket.scb_cpu_data.cpu_data.scbcd_cpu_used + cpu_blocked = clutch_bucket.scb_cpu_data.cpu_data.scbcd_cpu_blocked + print "{:>10s} | {:>20s} | {:>30s} | 0x{:16x} | {:>10d} | {:>10d} | {:>30d} | {:>30d} | {:>15s} | ".format("*", "*", clutch_bucket.scb_clutch.sc_tg.tg_name, clutch_bucket, clutch_bucket.scb_priority, clutch_bucket.scb_thr_count, cpu_used, cpu_blocked, "*") + GetSchedClutchBucketSummary(clutch_bucket) + print "-" * 300 + +@lldb_command('showschedclutch') +def ShowSchedClutch(cmd_args=[]): + """ Routine to print the clutch scheduler hierarchy. + Usage: showschedclutch + """ + if not cmd_args: + raise ArgumentError("Invalid argument") + pset = kern.GetValueFromAddress(cmd_args[0], "processor_set_t") + ShowSchedClutchForPset(pset) + +@lldb_command('showschedclutchroot') +def ShowSchedClutchRoot(cmd_args=[]): + """ show information about the root of the sched clutch hierarchy + Usage: showschedclutchroot + """ + if not cmd_args: + raise ArgumentError("Invalid argument") + root = kern.GetValueFromAddress(cmd_args[0], "struct sched_clutch_root *") + if not root: + print "unknown arguments:", str(cmd_args) + return False + print "{:>30s} : 0x{:16x}".format("Root", root) + print "{:>30s} : 0x{:16x}".format("Pset", root.scr_pset) + print "{:>30s} : {:d}".format("Priority", root.scr_priority) + print "{:>30s} : {:d}".format("Urgency", root.scr_urgency) + print "{:>30s} : {:d}".format("Threads", root.scr_thr_count) + print "{:>30s} : {:d}".format("Current Timestamp", GetRecentTimestamp()) + print "{:>30s} : {:b} (BG/UT/DF/IN/FG/FIX/NULL)".format("Runnable Root Buckets Bitmap", int(root.scr_runnable_bitmap[0])) + +@lldb_command('showschedclutchrootbucket') +def ShowSchedClutchRootBucket(cmd_args=[]): + """ show information about a root bucket in the sched clutch hierarchy + Usage: showschedclutchrootbucket + """ + if not cmd_args: + raise ArgumentError("Invalid argument") + root_bucket = kern.GetValueFromAddress(cmd_args[0], "struct sched_clutch_root_bucket *") + if not root_bucket: + print "unknown arguments:", str(cmd_args) + return False + print "{:<30s} : 0x{:16x}".format("Root Bucket", root_bucket) + print "{:<30s} : {:s}".format("Bucket Name", bucketStr[int(root_bucket.scrb_bucket)]) + print "{:<30s} : {:d}".format("Deadline", root_bucket.scrb_deadline) + print "{:<30s} : {:d}".format("Current Timestamp", GetRecentTimestamp()) + print "\n" + prioq = root_bucket.scrb_clutch_buckets + clutch_bucket_list = [] + for clutch_bucket in IteratePriorityQueue(prioq, 'struct sched_clutch_bucket', 'scb_pqlink'): + clutch_bucket_list.append(clutch_bucket) + if len(clutch_bucket_list) > 0: + print "=" * 240 + print "{:>30s} | {:>18s} | {:>20s} | {:>20s} | ".format("Name", "Clutch Bucket", "Priority", "Count") + GetSchedClutchBucketSummary.header + print "=" * 240 + clutch_bucket_list.sort(key=lambda x: x.scb_priority, reverse=True) + for clutch_bucket in clutch_bucket_list: + print "{:>30s} | 0x{:16x} | {:>20d} | {:>20d} | ".format(clutch_bucket.scb_clutch.sc_tg.tg_name, clutch_bucket, clutch_bucket.scb_priority, clutch_bucket.scb_thr_count) + GetSchedClutchBucketSummary(clutch_bucket) + +@lldb_command('showschedclutchbucket') +def ShowSchedClutchBucket(cmd_args=[]): + """ show information about a clutch bucket in the sched clutch hierarchy + Usage: showschedclutchbucket + """ + if not cmd_args: + raise ArgumentError("Invalid argument") + clutch_bucket = kern.GetValueFromAddress(cmd_args[0], "struct sched_clutch_bucket *") + if not clutch_bucket: + print "unknown arguments:", str(cmd_args) + return False + print "{:<30s} : 0x{:16x}".format("Clutch Bucket", clutch_bucket) + print "{:<30s} : {:s}".format("TG Name", clutch_bucket.scb_clutch.sc_tg.tg_name) + print "{:<30s} : {:d}".format("Priority", clutch_bucket.scb_priority) + print "{:<30s} : {:d}".format("Thread Count", clutch_bucket.scb_thr_count) + print "{:<30s} : 0x{:16x}".format("Thread Group", clutch_bucket.scb_clutch.sc_tg) + cpu_used = clutch_bucket.scb_cpu_data.cpu_data.scbcd_cpu_used + cpu_blocked = clutch_bucket.scb_cpu_data.cpu_data.scbcd_cpu_blocked + print "{:<30s} : {:d}".format("CPU Used (MATUs)", cpu_used) + print "{:<30s} : {:d}".format("CPU Blocked (MATUs)", cpu_blocked) + print "{:<30s} : {:d}".format("Interactivity Score", clutch_bucket.scb_interactivity_score) + print "{:<30s} : {:d}".format("Last Timeshare Update Tick", clutch_bucket.scb_timeshare_tick) + print "{:<30s} : {:d}".format("Priority Shift", clutch_bucket.scb_pri_shift) + print "\n" + runq = clutch_bucket.scb_clutchpri_prioq + thread_list = [] + for thread in IteratePriorityQueue(runq, 'struct thread', 'sched_clutchpri_link'): + thread_list.append(thread) + if len(thread_list) > 0: + print "=" * 240 + print GetThreadSummary.header + "{:s}".format("Process Name") + print "=" * 240 + for thread in thread_list: + proc = Cast(thread.task.bsd_info, 'proc *') + print GetThreadSummary(thread) + "{:s}".format(str(proc.p_comm)) @lldb_command('abs2nano') def ShowAbstimeToNanoTime(cmd_args=[]): @@ -436,11 +557,11 @@ def ShowGroupSetSummary(runq, task_map): if unsigned(runq_queue_p) != unsigned(runq_queue_head): runq_queue_this_count = 0 - for entry in ParanoidIterateLinkageChain(runq_queue_head, "sched_entry_t", "entry_links"): + for entry in ParanoidIterateLinkageChain(runq_queue_head, "sched_entry_t", "entry_links", circleQueue=True): runq_queue_this_count += 1 print " Queue [{: <#012x}] Priority {: <3d} count {:d}\n".format(runq_queue_head, runq_queue_i, runq_queue_this_count) - for entry in ParanoidIterateLinkageChain(runq_queue_head, "sched_entry_t", "entry_links"): + for entry in ParanoidIterateLinkageChain(runq_queue_head, "sched_entry_t", "entry_links", circleQueue=True): group_addr = unsigned(entry) - (sizeof(dereference(entry)) * unsigned(entry.sched_pri)) group = kern.GetValueFromAddress(unsigned(group_addr), 'sched_group_t') task = task_map.get(unsigned(group), 0x0) @@ -474,17 +595,17 @@ def ShowRunQSummary(runq): for runq_queue_i in xrange(runq_queue_count) : runq_queue_head = addressof(runq.queues[runq_queue_i]) - runq_queue_p = runq_queue_head.next + runq_queue_p = runq_queue_head.head - if unsigned(runq_queue_p) != unsigned(runq_queue_head): + if unsigned(runq_queue_p): runq_queue_this_count = 0 - for thread in ParanoidIterateLinkageChain(runq_queue_head, "thread_t", "runq_links"): + for thread in ParanoidIterateLinkageChain(runq_queue_head, "thread_t", "runq_links", circleQueue=True): runq_queue_this_count += 1 print " Queue [{: <#012x}] Priority {: <3d} count {:d}\n".format(runq_queue_head, runq_queue_i, runq_queue_this_count) print "\t" + GetThreadSummary.header + "\n" - for thread in ParanoidIterateLinkageChain(runq_queue_head, "thread_t", "runq_links"): + for thread in ParanoidIterateLinkageChain(runq_queue_head, "thread_t", "runq_links", circleQueue=True): print "\t" + GetThreadSummary(thread) + "\n" if config['verbosity'] > vHUMAN : print "\t" + GetThreadBackTrace(thread, prefix="\t\t") + "\n" @@ -496,7 +617,7 @@ def ShowRTRunQSummary(rt_runq): print " Realtime Queue ({:<#012x}) Count {:d}\n".format(addressof(rt_runq.queue), rt_runq.count) if rt_runq.count != 0: print "\t" + GetThreadSummary.header + "\n" - for rt_runq_thread in ParanoidIterateLinkageChain(rt_runq.queue, "thread_t", "runq_links"): + for rt_runq_thread in ParanoidIterateLinkageChain(rt_runq.queue, "thread_t", "runq_links", circleQueue=True): print "\t" + GetThreadSummary(rt_runq_thread) + "\n" def ShowGrrrSummary(grrr_runq): @@ -514,17 +635,11 @@ def ShowGrrrSummary(grrr_runq): print "Count {:d} Weight {:d}\n".format(grrr_group.count, grrr_group.weight) grrr_group_client_head = addressof(grrr_group.clients) print GetThreadSummary.header - for thread in ParanoidIterateLinkageChain(grrr_group_client_head, "thread_t", "runq_links"): + for thread in ParanoidIterateLinkageChain(grrr_group_client_head, "thread_t", "runq_links", circleQueue=True): print "\t" + GetThreadSummary(thread) + "\n" if config['verbosity'] > vHUMAN : print "\t" + GetThreadBackTrace(thread, prefix="\t\t") + "\n" -def ShowNextThread(processor): - if (processor.next_thread != 0) : - print " " + "Next thread:\n" - print "\t" + GetThreadSummary.header + "\n" - print "\t" + GetThreadSummary(processor.next_thread) + "\n" - def ShowActiveThread(processor): if (processor.active_thread != 0) : print "\t" + GetThreadSummary.header + "\n" @@ -541,10 +656,8 @@ def ShowScheduler(cmd_args=None): show_priority_runq = 0 show_priority_pset_runq = 0 show_group_pset_runq = 0 - if unsigned(kern.globals.sched_current_dispatch) != 0 : - sched_string = str(kern.globals.sched_current_dispatch.sched_name) - else : - sched_string = str(kern.globals.sched_string) + show_clutch = 0 + sched_string = str(kern.globals.sched_string) if sched_string == "traditional": show_priority_runq = 1 @@ -561,24 +674,28 @@ def ShowScheduler(cmd_args=None): elif sched_string == "amp": show_priority_pset_runq = 1 show_priority_runq = 1 + elif sched_string == "clutch": + show_clutch = 1 else : print "Unknown sched_string {:s}".format(sched_string) - if unsigned(kern.globals.sched_current_dispatch) != 0 : - print "Scheduler: {:s} ({:s})\n".format(sched_string, - kern.Symbolicate(unsigned(kern.globals.sched_current_dispatch))) - - run_buckets = kern.globals.sched_run_buckets - - run_count = run_buckets[GetEnumValue('sched_bucket_t::TH_BUCKET_RUN')] - fixpri_count = run_buckets[GetEnumValue('sched_bucket_t::TH_BUCKET_FIXPRI')] - share_fg_count = run_buckets[GetEnumValue('sched_bucket_t::TH_BUCKET_SHARE_FG')] - share_df_count = run_buckets[GetEnumValue('sched_bucket_t::TH_BUCKET_SHARE_DF')] - share_ut_count = run_buckets[GetEnumValue('sched_bucket_t::TH_BUCKET_SHARE_UT')] - share_bg_count = run_buckets[GetEnumValue('sched_bucket_t::TH_BUCKET_SHARE_BG')] - - print "Processors: {g.processor_avail_count:d} Runnable threads: {:d} Fixpri threads: {:d}\n".format(run_count, fixpri_count, g=kern.globals) - print "FG Timeshare threads: {:d} DF Timeshare threads: {:d} UT Timeshare threads: {:d} BG Timeshare threads: {:d}\n".format(share_fg_count, share_df_count, share_ut_count, share_bg_count) + print "Scheduler: {:s}\n".format(sched_string) + + if show_clutch == 0: + run_buckets = kern.globals.sched_run_buckets + run_count = run_buckets[GetEnumValue('sched_bucket_t::TH_BUCKET_RUN')] + fixpri_count = run_buckets[GetEnumValue('sched_bucket_t::TH_BUCKET_FIXPRI')] + share_fg_count = run_buckets[GetEnumValue('sched_bucket_t::TH_BUCKET_SHARE_FG')] + share_df_count = run_buckets[GetEnumValue('sched_bucket_t::TH_BUCKET_SHARE_DF')] + share_ut_count = run_buckets[GetEnumValue('sched_bucket_t::TH_BUCKET_SHARE_UT')] + share_bg_count = run_buckets[GetEnumValue('sched_bucket_t::TH_BUCKET_SHARE_BG')] + print "Processors: {g.processor_avail_count:d} Runnable threads: {:d} Fixpri threads: {:d}\n".format(run_count, fixpri_count, g=kern.globals) + print "FG Timeshare threads: {:d} DF Timeshare threads: {:d} UT Timeshare threads: {:d} BG Timeshare threads: {:d}\n".format(share_fg_count, share_df_count, share_ut_count, share_bg_count) + + processor_offline = GetEnumValue('processor_state_t::PROCESSOR_OFF_LINE') + processor_idle = GetEnumValue('processor_state_t::PROCESSOR_IDLE') + processor_dispatching = GetEnumValue('processor_state_t::PROCESSOR_DISPATCHING') + processor_running = GetEnumValue('processor_state_t::PROCESSOR_RUNNING') if show_group_pset_runq: if hasattr(kern.globals, "multiq_sanity_check"): @@ -626,13 +743,12 @@ def ShowScheduler(cmd_args=None): processor_array = kern.globals.processor_array print "Active Processors:\n" - active_bitmap = int(pset.cpu_state_map[5]) | int(pset.cpu_state_map[6]) + active_bitmap = int(pset.cpu_state_map[processor_dispatching]) | int(pset.cpu_state_map[processor_running]) for cpuid in IterateBitmap(active_bitmap): processor = processor_array[cpuid] if processor != 0: print " " + GetProcessorSummary(processor) ShowActiveThread(processor) - ShowNextThread(processor) if show_priority_runq: runq = processor.runq @@ -644,13 +760,12 @@ def ShowScheduler(cmd_args=None): print "Idle Processors:\n" - idle_bitmap = int(pset.cpu_state_map[4]) & int(pset.primary_map) + idle_bitmap = int(pset.cpu_state_map[processor_idle]) & int(pset.primary_map) for cpuid in IterateBitmap(idle_bitmap): processor = processor_array[cpuid] if processor != 0: print " " + GetProcessorSummary(processor) ShowActiveThread(processor) - ShowNextThread(processor) if show_priority_runq: ShowRunQSummary(processor.runq) @@ -658,13 +773,12 @@ def ShowScheduler(cmd_args=None): print "Idle Secondary Processors:\n" - idle_bitmap = int(pset.cpu_state_map[4]) & ~(int(pset.primary_map)) + idle_bitmap = int(pset.cpu_state_map[processor_idle]) & ~(int(pset.primary_map)) for cpuid in IterateBitmap(idle_bitmap): processor = processor_array[cpuid] if processor != 0: print " " + GetProcessorSummary(processor) ShowActiveThread(processor) - ShowNextThread(processor) if show_priority_runq: print ShowRunQSummary(processor.runq) @@ -673,7 +787,7 @@ def ShowScheduler(cmd_args=None): print "Other Processors:\n" other_bitmap = 0 - for i in range(0, 4): + for i in range(processor_offline, processor_idle): other_bitmap |= int(pset.cpu_state_map[i]) other_bitmap &= int(pset.cpu_bitmask) for cpuid in IterateBitmap(other_bitmap): @@ -681,40 +795,41 @@ def ShowScheduler(cmd_args=None): if processor != 0: print " " + GetProcessorSummary(processor) ShowActiveThread(processor) - ShowNextThread(processor) if show_priority_runq: ShowRunQSummary(processor.runq) print " \n" + if show_clutch: + print "=== Clutch Scheduler Hierarchy ===\n\n" + ShowSchedClutchForPset(pset) pset = pset.pset_list node = node.node_list - print "\nTerminate Queue: ({:<#012x})\n".format(addressof(kern.globals.thread_terminate_queue)) - first = False - for thread in ParanoidIterateLinkageChain(kern.globals.thread_terminate_queue, "thread_t", "runq_links"): - if first: - print "\t" + GetThreadSummary.header + "\n" - first = True - print "\t" + GetThreadSummary(thread) + "\n" - print "\nCrashed Threads Queue: ({:<#012x})\n".format(addressof(kern.globals.crashed_threads_queue)) - first = False + first = True for thread in ParanoidIterateLinkageChain(kern.globals.crashed_threads_queue, "thread_t", "runq_links"): if first: - print "\t" + GetThreadSummary.header + "\n" - first = True - print "\t" + GetThreadSummary(thread) + "\n" - - print "\nWaiting For Kernel Stacks Queue: ({:<#012x})\n".format(addressof(kern.globals.thread_stack_queue)) - first = False - for thread in ParanoidIterateLinkageChain(kern.globals.thread_stack_queue, "thread_t", "runq_links"): - if first: - print "\t" + GetThreadSummary.header + "\n" - first = True - print "\t" + GetThreadSummary(thread) + "\n" + print "\t" + GetThreadSummary.header + first = False + print "\t" + GetThreadSummary(thread) + + def dump_mpsc_thread_queue(name, head): + head = addressof(head) + print "\n{:s}: ({:<#012x})\n".format(name, head) + first = True + for thread in IterateMPSCQueue(head.mpd_queue, 'struct thread', 'mpsc_links'): + if first: + print "\t" + GetThreadSummary.header + first = False + print "\t" + GetThreadSummary(thread) + + dump_mpsc_thread_queue("Terminate Queue", kern.globals.thread_terminate_queue) + dump_mpsc_thread_queue("Waiting For Kernel Stacks Queue", kern.globals.thread_stack_queue) + dump_mpsc_thread_queue("Thread Exception Queue", kern.globals.thread_exception_queue) + dump_mpsc_thread_queue("Thread Deallocate Queue", kern.globals.thread_deallocate_queue) print "\n" @@ -723,8 +838,8 @@ def ShowScheduler(cmd_args=None): # EndMacro: showallprocessors -def ParanoidIterateLinkageChain(queue_head, element_type, field_name, field_ofst=0): - """ Iterate over a Linkage Chain queue in kernel of type queue_head_t. (osfmk/kern/queue.h method 1) +def ParanoidIterateLinkageChain(queue_head, element_type, field_name, field_ofst=0, circleQueue=False): + """ Iterate over a Linkage Chain queue in kernel of type queue_head_t or circle_queue_head_t. (osfmk/kern/queue.h method 1 or circle_queue.h) This is equivalent to the qe_foreach_element() macro Blows up aggressively and descriptively when something goes wrong iterating a queue. Prints correctness errors, and throws exceptions on 'cannot proceed' errors @@ -754,11 +869,15 @@ def ParanoidIterateLinkageChain(queue_head, element_type, field_name, field_ofst if not queue_head.GetSBValue().GetType().IsPointerType() : queue_head = addressof(queue_head) - # Mosh the value into a brand new value, to really get rid of its old cvalue history - queue_head = kern.GetValueFromAddress(unsigned(queue_head), 'struct queue_entry *') + if circleQueue: + # Mosh the value into a brand new value, to really get rid of its old cvalue history + queue_head = kern.GetValueFromAddress(unsigned(queue_head), 'struct circle_queue_head *').head + else: + # Mosh the value into a brand new value, to really get rid of its old cvalue history + queue_head = kern.GetValueFromAddress(unsigned(queue_head), 'struct queue_entry *') if unsigned(queue_head) == 0: - if ParanoidIterateLinkageChain.enable_paranoia: + if not circleQueue and ParanoidIterateLinkageChain.enable_paranoia: print "bad queue_head_t: {:s}".format(queue_head) return @@ -792,7 +911,9 @@ def ParanoidIterateLinkageChain(queue_head, element_type, field_name, field_ofst obj = 0 try: - while (unsigned(queue_head) != unsigned(link)): + while True: + if not circleQueue and unsigned(queue_head) == unsigned(link): + break; if ParanoidIterateLinkageChain.enable_paranoia: if unsigned(link.next) == 0: raise ValueError("NULL next pointer: queue_head {:>#18x} link: {:>#18x} next: {:>#18x} prev: {:>#18x}".format(queue_head, link, link.next, link.prev)) @@ -809,6 +930,8 @@ def ParanoidIterateLinkageChain(queue_head, element_type, field_name, field_ofst yield obj last_link = link link = link.next + if circleQueue and unsigned(queue_head) == unsigned(link): + break; except: exc_info = sys.exc_info() try: diff --git a/tools/lldbmacros/skywalk.py b/tools/lldbmacros/skywalk.py index 2119bc010..f0cbae8fa 100755 --- a/tools/lldbmacros/skywalk.py +++ b/tools/lldbmacros/skywalk.py @@ -6,6 +6,7 @@ from xnu import * from utils import * from string import * +from net import * import xnudefines @@ -564,3 +565,174 @@ def ShowProcNECP(cmd_args=None): print GetNECPSummary.header for kc in IterateProcNECP(proc): print GetNECPSummary(kc) + +def NexusTypePtr(nx): + if nx.nx_prov.nxprov_params.nxp_type == GetEnumValue("nexus_type_t::NEXUS_TYPE_FLOW_SWITCH"): + return "(struct nx_flowswitch *){:18s}".format(hex(nx.nx_arg)) + elif nx.nx_prov.nxprov_params.nxp_type == GetEnumValue("nexus_type_t::NEXUS_TYPE_NET_IF"): + return " (struct nx_netif *){:18s}".format(hex(nx.nx_arg)) + elif nx.nx_prov.nxprov_params.nxp_type == GetEnumValue("nexus_type_t::NEXUS_TYPE_USER_PIPE"): + return " (struct nx_upipe *){:18s}".format(hex(nx.nx_arg)) + elif nx.nx_prov.nxprov_params.nxp_type == GetEnumValue("nexus_type_t::NEXUS_TYPE_KERNEL_PIPE"): + return " (struct kern_nexus *){:18s}".format(hex(nx)) + else: + return "unknown" + +def GetStructNexusSummary(nx): + nexus_summary_string = "" + nexus_summary_string += "{0:s} ".format(NexusTypePtr(nx)) + nexus_summary_string += "{0:30s} ".format(str(Cast(addressof(nx.nx_prov.nxprov_params.nxp_name), 'char *'))) + nexus_summary_string += "rings: tx {:2d} rx {:2d} slots: {:4d} rx {:4d} bufsize {:5d} metasize {:5d} mhints {:2d} ".format( + nx.nx_prov.nxprov_params.nxp_tx_rings, + nx.nx_prov.nxprov_params.nxp_rx_rings, + nx.nx_prov.nxprov_params.nxp_rx_slots, + nx.nx_prov.nxprov_params.nxp_tx_slots, + nx.nx_prov.nxprov_params.nxp_buf_size, + nx.nx_prov.nxprov_params.nxp_meta_size, + nx.nx_prov.nxprov_params.nxp_mhints) + + return nexus_summary_string + +@lldb_command('shownexuses') +def ShowNexuses(cmd_args=None): + """ Show Nexus. + + usage: shownexues + """ + nexus_summaries = [] + nexuses = kern.globals.nx_head + for nx in IterateRBTreeEntry(nexuses, 'struct kern_nexus*', 'nx_link'): + nexus_summaries.append(GetStructNexusSummary(nx)) + nexus_summaries.sort() + for nx_str in nexus_summaries: + print "{0:s}".format(nx_str) + +def GetSockAddr4(sin): + return GetInAddrAsString(sin.sin_addr) + +def GetSockAddr6(sin6): + addr = sin6.sin6_addr.__u6_addr.__u6_addr8 + addr_raw_string = ":".join(["{0:02x}{0:02x}".format(unsigned(addr[i]), + unsigned(addr[i+1])) for i in range(0, 16, 2)]) + return inet_ntop(AF_INET6, inet_pton(AF_INET6, addr_raw_string)) + +def GetSockAddr46(sockaddr46): + if sockaddr46 is None : + raise ArgumentError('sockaddr is None') + if (sockaddr46.sa.sa_family == 2): + return GetSockAddr4(sockaddr46.sin) + elif (sockaddr46.sa.sa_family == 30): + return GetSockAddr6(sockaddr46.sin6) + else: + raise ArgumentError('invalid sockaddr_in_4_6 address family') + +def GetSockPort46(sockaddr46): + if sockaddr46 is None : + raise ArgumentError('sockaddr is None') + if (sockaddr46.sa.sa_family == 2): + return ntohs(sockaddr46.sin.sin_port) + elif (sockaddr46.sa.sa_family == 30): + return ntohs(sockaddr46.sin6.sin6_port) + else: + raise ArgumentError('invalid sockaddr_in_4_6 address family') + +def FlowEntryStr(fe): + return "(struct flow_entry*){} src={},dst={},proto={},sport={},dport={} ".format( + hex(fe), GetSockAddr46(fe.fe_laddr), GetSockAddr46(fe.fe_faddr), + unsigned(fe.fe_key.fk_proto), GetSockPort46(fe.fe_laddr), + GetSockPort46(fe.fe_faddr), fe.fe_owner_name) + +def GetFlowEntryPid(fe): + return fe.fe_owner_pid + +def GetFlowswitchFlowEntries(fsw): + fm = kern.GetValueFromAddress(unsigned(fsw.fsw_flow_mgr), 'struct flow_mgr *') + cht = kern.GetValueFromAddress(unsigned(fm.fm_flow_table), 'struct cuckoo_hashtable *') + + flows = [] + def GetCuckooNodeAsFLowEntry(node, hashValue): + fe = containerof(node, 'struct flow_entry', 'fe_cnode') + flows.append(fe) + + CuckooHashtableForeach(cht, GetCuckooNodeAsFLowEntry) + return flows + +def IsNexusAFlowswitch(nx): + return nx.nx_prov.nxprov_params.nxp_type == GetEnumValue('nexus_type_t::NEXUS_TYPE_FLOW_SWITCH') + +def GetNexusAsFlowswitch(nx): + return kern.GetValueFromAddress(unsigned(nx.nx_arg), 'struct nx_flowswitch *') + +def FlowswitchStr(fsw): + return "{}:\n(struct nx_flowswitch *){}".format(str(fsw.fsw_ifp.if_xname), hex(fsw)) + +@lldb_command('showflowswitches') +def ShowFlowswitches(cmd_args=None): + """ Show flow switches + + usage: showflowswitches [ifname] + """ + ifname = "" + if len(cmd_args) == 1: + ifname = cmd_args[0] + + nexuses = kern.globals.nx_head + for nx in IterateRBTreeEntry(nexuses, 'struct kern_nexus*', 'nx_link'): + if not IsNexusAFlowswitch(nx): + continue + fsw = GetNexusAsFlowswitch(nx) + if ifname not in str(fsw.fsw_ifp.if_xname): + continue + print "{}".format(FlowswitchStr(fsw)) + flows = GetFlowswitchFlowEntries(fsw) + flows.sort(key=GetFlowEntryPid) + for fe in flows: + print " {}".format(FlowEntryStr(fe)) + +def CuckooHashtableForeachSlot(cht, slotHandler): + for i in range(0, cht._n_buckets): + b = cht._buckets[i] + if unsigned(b._inuse) == 0: + continue + for j in range(0, kern.globals._CHT_BUCKET_SLOTS): + s = b._slots[j] + if unsigned(s._node) != 0: + slotHandler(s) + +def CuckooHashtableForeach(cht, handler): + def CuckooHashtableSlotHandler(s): + if unsigned(s._node) == 0: + return + node = s._node + while unsigned(node) != 0: + handler(node, s._hash) + node = node.next + CuckooHashtableForeachSlot(cht, CuckooHashtableSlotHandler) + +@lldb_command('showcuckoohashtable') +def ShowCuckooHashtable(cmd_args=None): + """ Show Cuckoo Hashtable. + + usage: showcuckoohashtable + """ + if not cmd_args: + raise ArgumentError('missing struct cuckoo_hashtable * argument') + + cht = kern.GetValueFromAddress(cmd_args[0], 'struct cuckoo_hashtable *') + + print "(struct cuckoo_hashtable *){:18s} capacity {:d} entries {:d}".format(hex(cht), cht._capacity, cht._n_entries) + def CuckooHashtablePrintNode(node, hashValue): + print " node {} hash 0x{:08x}".format(hex(node), int(hashValue)) + + CuckooHashtableForeach(cht, CuckooHashtablePrintNode) + +@lldb_command('showprotons') +def ShowProtoNS(cmd_args=None): + """ Show the protons table + """ + + protons_tokens = kern.globals.protons_tokens + for pt in IterateRBTreeEntry(protons_tokens, 'struct protons_token *', 'pt_link'): + print "(struct protons_token *){} protocol {:3} pid {:5} epid {:5} ref {:2} flags {}".format( + hex(pt), int(pt.pt_protocol), int(pt.pt_pid), int(pt.pt_epid), + int(pt.pt_refcnt.ref_count), hex(pt.pt_flags)) diff --git a/tools/lldbmacros/structanalyze.py b/tools/lldbmacros/structanalyze.py index 467e2018d..ba262e329 100755 --- a/tools/lldbmacros/structanalyze.py +++ b/tools/lldbmacros/structanalyze.py @@ -1,84 +1,174 @@ import lldb from xnu import * -def _showStructPacking(symbol, prefix, begin_offset=0, typedef=None): - """ - recursively parse the field members of structure. - params : symbol (lldb.SBType) reference to symbol in binary - prefix (string) string to be prefixed for each line of output. Useful for recursive struct parsing. - returns: string containing lines of output. - """ - ctype = "unknown type" - if symbol.GetTypeClass() == lldb.eTypeClassUnion : - ctype = "union" - if symbol.GetTypeClass() == lldb.eTypeClassStruct : - ctype = "struct" - - if typedef: - outstr = "[%4d] (%s) (%s) %s { " % (symbol.GetByteSize(), typedef, ctype, symbol.GetName()) + "\n" - else : - outstr = "[%4d] (%s) %s { " % (symbol.GetByteSize(), ctype, symbol.GetName()) + "\n" - numFields = symbol.GetNumberOfFields() - _has_memory_hole = False - _compact_size = 0 # asuming the struct is perfectly packed - _compact_offset = begin_offset - _previous_bit_offset = 0 - for i in range(numFields): - member = symbol.GetFieldAtIndex(i) - m_offset = member.GetOffsetInBytes() + begin_offset - m_offset_bits = member.GetOffsetInBits() - m_type = member.GetType() - m_name = member.GetName() - m_size = m_type.GetByteSize() - warningstr = "" - debugstr = "" # + str((m_size, m_offset , m_offset_bits, _previous_bit_offset, _compact_offset, begin_offset)) - if _compact_offset != m_offset and (m_offset_bits - _previous_bit_offset) > m_size*8 : - _has_memory_hole = True - warningstr = " *** Possible memory hole ***" - _compact_offset = m_offset - _compact_offset += m_size - - _type_class = m_type.GetTypeClass() - _canonical_type = m_type.GetCanonicalType() - _canonical_type_class = m_type.GetCanonicalType().GetTypeClass() - - if _type_class == lldb.eTypeClassTypedef and (_canonical_type_class == lldb.eTypeClassStruct or _canonical_type_class == lldb.eTypeClassUnion) : - outstr += prefix + ("*%4d," % m_offset) + _showStructPacking(_canonical_type, prefix+" ", m_offset, str(m_type)) + warningstr + debugstr + "\n" - elif _type_class == lldb.eTypeClassStruct or _type_class == lldb.eTypeClassUnion : - outstr += prefix + ("*%4d," % m_offset) + _showStructPacking(m_type, prefix+" ", m_offset) + warningstr + debugstr + "\n" +_UnionStructClass = [ lldb.eTypeClassStruct, lldb.eTypeClassClass, lldb.eTypeClassUnion ] + +def _showStructPacking(O, symbol, begin_offset=0, symsize=0, typedef=None, outerSize=0, memberName=None): + """ + recursively parse the field members of structure. + params : O the output formatter (standard.py) + symbol (lldb.SBType) reference to symbol in binary + returns: string containing lines of output. + """ + ctype = "unknown type" + is_union = False + is_class = False + union_size = None + sym_size = symbol.GetByteSize() + + if symbol.GetTypeClass() == lldb.eTypeClassUnion: + ctype = "union" + is_union = True + union_size = sym_size + if symbol.GetTypeClass() == lldb.eTypeClassStruct: + ctype = "struct" + if symbol.GetTypeClass() == lldb.eTypeClassClass: + ctype = "class" + is_class = True + + if not outerSize or outerSize == sym_size: + outstr = O.format("{:04d},[{:4d}]", begin_offset, sym_size) + elif outerSize < sym_size: # happens with c++ inheritance + outstr = O.format("{:04d},[{:4d}]", begin_offset, outerSize) + else: + outstr = O.format("{:04d},[{:4d}]{VT.DarkRed}{{{:+d}}}{VT.Default}", + begin_offset, sym_size, outerSize - sym_size) + + if typedef: + outstr += O.format(" {0}", typedef) + if symbol.IsAnonymousType(): + outstr += O.format(" ({VT.DarkMagenta}anonymous {0}{VT.Default})", ctype) + else: + outstr += O.format(" ({VT.DarkMagenta}{0} {1}{VT.Default})", ctype, symbol.GetName()) + if memberName: + outstr += O.format(" {0} {{", memberName) else: - outstr += prefix + ("+%4d,[%4d] (%s) %s" % (m_offset, m_size, m_type.GetName(), m_name)) + warningstr + debugstr + "\n" - if i > 0 : - _previous_bit_offset = m_offset_bits - outstr += prefix + "}" - if _has_memory_hole == True : - outstr += " *** Warning: Struct layout leaves memory hole *** " - return outstr - -@lldb_command('showstructpacking') -def showStructInfo(cmd_args=None): - """Show how a structure is packed in the binary. The format is - +, [] () - For example: - (lldb) script lldbmacros.showStructInfo("pollfd") - [ 8] (struct) pollfd { - + 0,[ 4] (int) fd - + 4,[ 2] (short) events - + 6,[ 2] (short) revents - } - syntax: showstructpacking task - """ - if not cmd_args: - raise ArgumentError("Please provide a type name.") - - sym = gettype(cmd_args[0]) - if sym == None: - print "No such struct found" - if sym.GetTypeClass() == lldb.eTypeClassTypedef: - sym = sym.GetCanonicalType() - if sym.GetTypeClass() != lldb.eTypeClassStruct: - print "%s is not a structure" % cmd_args[0] - else: - print _showStructPacking(sym,"", 0) + outstr += ") {" + + print outstr + + with O.indent(): + _previous_size = 0 + _packed_bit_offset = 0 + _nfields = symbol.GetNumberOfFields() + + if is_class: + _next_offset_in_bits = 0 + _nclasses = symbol.GetNumberOfDirectBaseClasses() + + for i in range(_nclasses): + member = symbol.GetDirectBaseClassAtIndex(i) + if i < _nclasses - 1: + m_size_bits = symbol.GetDirectBaseClassAtIndex(i + 1).GetOffsetInBits() + elif _nfields: + m_size_bits = symbol.GetFieldAtIndex(0).GetOffsetInBits() + else: + m_size_bits = symbol.GetByteSize() * 8 + + m_offset = member.GetOffsetInBytes() + begin_offset + m_type = member.GetType() + m_name = member.GetName() + m_size = m_size_bits / 8 + + _previous_size = m_size + _packed_bit_offset = member.GetOffsetInBits() + m_size_bits + + _showStructPacking(O, m_type, m_offset, str(m_type), outerSize=m_size, memberName=m_name) + + for i in range(_nfields): + member = symbol.GetFieldAtIndex(i) + m_offset = member.GetOffsetInBytes() + begin_offset + m_offset_bits = member.GetOffsetInBits() + + m_type = member.GetType() + m_name = member.GetName() + m_size = m_type.GetByteSize() + + if member.IsBitfield(): + m_is_bitfield = True + m_size_bits = member.GetBitfieldSizeInBits() + else: + m_is_bitfield = False + m_size_bits = m_size * 8 + + if not is_union and _packed_bit_offset < m_offset_bits: + m_previous_offset = begin_offset + _packed_bit_offset / 8 + m_hole_bits = m_offset_bits - _packed_bit_offset + if _packed_bit_offset % 8 == 0: + print O.format("{:04d},[{:4d}] ({VT.DarkRed}*** padding ***{VT.Default})", + m_previous_offset, m_hole_bits / 8) + else: + print O.format("{:04d},[{:4d}] ({VT.Brown}*** padding : {:d} ***{VT.Default})", + m_previous_offset, _previous_size, m_hole_bits) + + _previous_size = m_size + _packed_bit_offset = m_offset_bits + m_size_bits + + _type_class = m_type.GetTypeClass() + _canonical_type = m_type.GetCanonicalType() + _canonical_type_class = m_type.GetCanonicalType().GetTypeClass() + + if _type_class == lldb.eTypeClassTypedef and _canonical_type_class in _UnionStructClass: + _showStructPacking(O, _canonical_type, m_offset, str(m_type), outerSize=union_size, memberName=m_name) + elif _type_class in _UnionStructClass: + _showStructPacking(O, m_type, m_offset, outerSize=union_size, memberName=m_name) + else: + outstr = O.format("{:04d},[{:4d}]", m_offset, m_size) + if is_union and union_size != m_size_bits / 8: + outstr += O.format("{VT.DarkRed}{{{:+d}}}{VT.Default}", + union_size - m_size_bits / 8) + if m_is_bitfield: + outstr += O.format(" ({VT.DarkGreen}{:s} : {:d}{VT.Default}) {:s}", + m_type.GetName(), m_size_bits, m_name) + else: + outstr += O.format(" ({VT.DarkGreen}{:s}{VT.Default}) {:s}", + m_type.GetName(), m_name) + print outstr + + referenceSize = min(outerSize, sym_size) or sym_size + if not is_union and _packed_bit_offset < referenceSize * 8: + m_previous_offset = begin_offset + _packed_bit_offset / 8 + m_hole_bits = referenceSize * 8 - _packed_bit_offset + offset = _packed_bit_offset / 8 + begin_offset + if _packed_bit_offset % 8 == 0: + print O.format("{:04d},[{:4d}] ({VT.DarkRed}*** padding ***{VT.Default})", + m_previous_offset, m_hole_bits / 8) + else: + print O.format("{:04d},[{:4d}] ({VT.Brown}padding : {:d}{VT.Default})\n", + m_previous_offset, _previous_size, m_hole_bits) + + print "}" + +@lldb_command('showstructpacking', fancy=True) +def showStructInfo(cmd_args=None, cmd_options={}, O=None): + """Show how a structure is packed in the binary. The format is + , [] () + + For example: + (lldb) showstructpacking pollfd + 0,[ 8] struct pollfd { + 0,[ 4] (int) fd + 4,[ 2] (short) events + 6,[ 2] (short) revents + } + + syntax: showstructpacking task + """ + if not cmd_args: + raise ArgumentError("Please provide a type name.") + + ty_name = cmd_args[0] + try: + sym = gettype(ty_name) + except NameError: + return O.error("Cannot find type named {0}", ty_name) + + if sym.GetTypeClass() == lldb.eTypeClassTypedef: + sym = sym.GetCanonicalType() + + if sym.GetTypeClass() not in _UnionStructClass: + return O.error("{0} is not a structure/union/class type", ty_name) + + _showStructPacking(O, sym, 0) # EndMacro: showstructinto diff --git a/tools/lldbmacros/sysreg.py b/tools/lldbmacros/sysreg.py new file mode 100755 index 000000000..376a0e202 --- /dev/null +++ b/tools/lldbmacros/sysreg.py @@ -0,0 +1,190 @@ +""" Please make sure you read the README file COMPLETELY BEFORE reading anything below. + It is very critical that you read coding guidelines in Section E in README file. +""" + +""" Note for adding new register support: + + 1. Add target register to "supported registers" in the docstring of DecodeSysreg + 2. Populate _SYSREG_TO_DECODE_FUNC_MAP with your implementation, optionally using + _SYSREG_TO_DOCNAME_MAP + 3. Populate _SUPPORTED_SYSREGS list with target register + +""" + +from xnu import * +import os +import sys +import xml.etree.ElementTree as ET + +GREEN = '\033[0;32m' +RED = '\033[0;31m' +NC = '\033[0m' + +_SUPPORTED_SYSREGS = ['ESR_EL1'] + +_SYSREG_DOC_PATH = os.path.dirname(os.path.abspath(__file__)) + '/sysregdoc/' + +_SYSREG_TO_DOCNAME_MAP = { + 'ESR_EL1': 'AArch64-esr_el1.xml' +} + +## Actual definition at the bottom of the file +_SYSREG_TO_DECODE_FUNC_MAP = None + +# Macro: decode_sysreg +@lldb_command('decode_sysreg') +def DecodeSysreg(cmd_args=None): + """ Print out human-understandable explanation of a system register value + usage: decode_sysreg + example: decode_sysreg esr_el1 0x96000021 + + supported registers: + ESR_EL1 + """ + + ## For now, require exactly 2 arguments + if not cmd_args or len(cmd_args) != 2: + raise ArgumentError("Missing arguments.") + + reg_name = cmd_args[0].upper() + reg_value = int(cmd_args[1], 0) + + if reg_name not in _SUPPORTED_SYSREGS: + raise ArgumentError("{} is not supported".format(reg_name)) + + _SYSREG_TO_DECODE_FUNC_MAP[reg_name](reg_value) +# EndMacro: decode_sysreg + + +lldb_alias('decode_esr', 'decode_sysreg esr_el1') + + +def PrintEsrEl1Explanation(regval): + """ Print out a detailed explanation of regval regarded as the value of + ESR_EL1, by parsing ARM machine readable specification + """ + xmlfilename = _SYSREG_DOC_PATH + _SYSREG_TO_DOCNAME_MAP['ESR_EL1'] + tree = ET.parse(xmlfilename) + root = tree.getroot() + + ec = (regval >> 26) & ((1 << 6) - 1) + ecstring = '0b{:06b}'.format(ec) + + print _Colorify(VT.Green, 'EC == ' + ecstring) + + ecxpath = './registers/register/reg_fieldsets/fields/field[@id="EC_31_26"]/field_values/field_value_instance[field_value="{}"]/field_value_description//para'.format(ecstring) + ec_desc_paras = root.findall(ecxpath) + + if ec_desc_paras is None or len(ec_desc_paras) == 0: + print 'EC not defined.' + print '\r\n' + + for para in ec_desc_paras: + sys.stdout.write(para.text) + for child in para: + sys.stdout.write(_GetParaChildrenStr(child)) + sys.stdout.write(child.tail) + print '\r\n' + print '\r\n' + + iss = regval & ((1 << 25) - 1); + issstring = '0x{:07x}'.format(iss) + print _Colorify(VT.Green, 'ISS == ' + issstring) + print '\r\n' + + iss_condition_xpath = './registers/register/reg_fieldsets/fields/field[@id="EC_31_26"]/field_values/field_value_instance[field_value="{}"]/field_value_links_to'.format(ecstring) + iss_condition = root.find(iss_condition_xpath) + iss_condition_str = iss_condition.attrib['linked_field_condition'] + + iss_fields_xpath = './registers/register/reg_fieldsets/fields/field[@id="ISS_24_0"]/partial_fieldset/fields[fields_instance="{}"]//field'.format(iss_condition_str) + iss_fields = root.findall(iss_fields_xpath) + + for field in iss_fields: + _PrintEsrIssField(field, regval) + + +def _GetParaChildrenStr(elem): + """ Convert child tags of element into text for printing + """ + + if elem.tag == 'binarynumber': + return elem.text + if elem.tag == 'arm-defined-word': + return elem.text + elif elem.tag == 'xref': + return elem.attrib['browsertext'].encode('utf-8') + elif elem.tag == 'register_link': + return elem.text + else: + return _Colorify(VT.Red, '*unsupported text*') + + +def _PrintEsrIssField(elem, regval): + """ Print detailed explanation of the ISS field of ESR + """ + + field_name_str = elem.find('field_name').text + field_msb = int(elem.find('field_msb').text) + field_lsb = int(elem.find('field_lsb').text) + fd_before_paras = elem.findall('./field_description[@order="before"]//para') + fd_after_paras = elem.findall('./field_description[@order="after"]//para') + + field_bits = field_msb - field_lsb + 1 + field_value = (regval >> field_lsb) & ((1 << field_bits) - 1) + field_value_string = ('0b{:0' + '{}'.format(field_bits) + 'b}').format(field_value) + + print _Colorify(VT.Green, _GetIndentedString(2, field_name_str) + ' == ' + field_value_string) + + fv_desc_paras = elem.findall('./field_values/field_value_instance[field_value="{}"]/field_value_description//para'.format(field_value_string)) + + if fv_desc_paras and len(fv_desc_paras): + for para in fv_desc_paras: + sys.stdout.write(_GetIndentedString(2, '')) + sys.stdout.write(para.text) + for child in para: + sys.stdout.write(_GetParaChildrenStr(child)) + sys.stdout.write((child.tail)) + print '\r\n' + print '\r\n' + else: + print _Colorify(VT.Red, _GetIndentedString(2, '(No matching value, dumping out full description)')) + for para in fd_before_paras: + sys.stdout.write(_GetIndentedString(2, '')) + sys.stdout.write(para.text) + for child in para: + sys.stdout.write(_GetParaChildrenStr(child)) + sys.stdout.write(child.tail) + print '\r\n' + print '\r\n' + + ## Dump all possible values + all_field_values = elem.findall('./field_values/field_value_instance//field_value') + all_field_values_str = [fv.text for fv in all_field_values] + if all_field_values_str != []: + print _GetIndentedString(2, ', '.join(all_field_values_str)) + + for para in fd_after_paras: + sys.stdout.write(_GetIndentedString(2, '')) + sys.stdout.write(para.text) + for child in para: + sys.stdout.write(_GetParaChildrenStr(child)) + sys.stdout.write(child.tail) + print '\r\n' + print '\r\n' + + +def _GetIndentedString(indentation, msg): + """ Return `msg` indented by `indentation` number of spaces + """ + return ' ' * indentation + msg + + +def _Colorify(color, msg): + """ Return `msg` enclosed by color codes + """ + return color + msg + VT.Reset + + +_SYSREG_TO_DECODE_FUNC_MAP = { + 'ESR_EL1': PrintEsrEl1Explanation +} diff --git a/tools/lldbmacros/sysregdoc/AArch64-esr_el1.xml b/tools/lldbmacros/sysregdoc/AArch64-esr_el1.xml new file mode 100644 index 000000000..c24be2dfa --- /dev/null +++ b/tools/lldbmacros/sysregdoc/AArch64-esr_el1.xml @@ -0,0 +1,6153 @@ + + + + + + + + + + + + + + + ESR_EL1 + Exception Syndrome Register (EL1) + + + + + + + DFSR + Architectural + AArch32 + 31 + 0 + + 31 + 0 + + + + + + + + Holds syndrome information for an exception taken to EL1. + + + + + Exception and fault handling registers + + + + + + + + + + + + ESR_EL1 is a 64-bit register. + + + + + + + + + + + + + + + + + + + ESR_EL1 is made UNKNOWN as a result of an exception return from EL1. +When an UNPREDICTABLE instruction is treated as UNDEFINED, and the exception is taken to EL1, the value of ESR_EL1 is UNKNOWN. The value written to ESR_EL1 must be consistent with a value that could be created as a result of an exception from the same Exception level that generated the exception as a result of a situation that is not UNPREDICTABLE at that Exception level, in order to avoid the possibility of a privilege violation. + + + + + 0 + 63 + 32 + + Reserved, RES0. + + + + + + EC + 31 + 26 + + + Exception Class. Indicates the reason for the exception that this register holds information about. +For each EC value, the table references a subsection that gives information about: + +The cause of the exception, for example the configuration required to enable the trap. +The encoding of the associated ISS. + +Possible values of the EC field are: + + + + + + + 0b000000 + + Unknown reason. + + + + + 0b000001 + + Trapped WFI or WFE instruction execution. +Conditional WFE and WFI instructions that fail their condition code check do not cause an exception. + + + + + 0b000011 + + Trapped MCR or MRC access with (coproc==0b1111) that is not reported using EC 0b000000. + + + + + 0b000100 + + Trapped MCRR or MRRC access with (coproc==0b1111) that is not reported using EC 0b000000. + + + + + 0b000101 + + Trapped MCR or MRC access with (coproc==0b1110). + + + + + 0b000110 + + Trapped LDC or STC access. +The only architected uses of these instruction are: + +An STC to write data to memory from DBGDTRRXint. +An LDC to read data from memory to DBGDTRTXint. + + + + + + 0b000111 + + Access to SVE, Advanced SIMD, or floating-point functionality trapped by CPACR_EL1.FPEN, CPTR_EL2.FPEN, CPTR_EL2.TFP, or CPTR_EL3.TFP control. +Excludes exceptions resulting from CPACR_EL1 when the value of HCR_EL2.TGE is 1, or because SVE or Advanced SIMD and floating-point are not implemented. These are reported with EC value 0b000000 as described in . + + + + + 0b001100 + + Trapped MRRC access with (coproc==0b1110). + + + + + 0b001101 + + Branch Target Exception. + + + When ARMv8.5-BTI is implemented + + + 0b001110 + + Illegal Execution state. + + + + + 0b010001 + + SVC instruction execution in AArch32 state. +This is reported in ESR_EL2 only when the exception is generated because the value of HCR_EL2.TGE is 1. + + + + + 0b010101 + + SVC instruction execution in AArch64 state. + + + + + 0b011000 + + Trapped MSR, MRS or System instruction execution in AArch64 state, that is not reported using EC 0b000000, 0b000001 or 0b000111. +If is implemented, also exceptions generated on a read of an ID register. +If is implemented, also Cache Speculation Variant exceptions. +This includes all instructions that cause exceptions that are part of the encoding space defined in , except for those exceptions reported using EC values 0b000000, 0b000001, or 0b000111. + + + + + 0b011001 + + Access to SVE functionality trapped as a result of CPACR_EL1.ZEN, CPTR_EL2.ZEN, CPTR_EL2.TZ, or CPTR_EL3.EZ, that is not reported using EC 0b000000. +This EC is defined only if is implemented. + + + + + 0b100000 + + Instruction Abort from a lower Exception level, that might be using AArch32 or AArch64. +Used for MMU faults generated by instruction accesses and synchronous External aborts, including synchronous parity or ECC errors. Not used for debug related exceptions. + + + + + 0b100001 + + Instruction Abort taken without a change in Exception level. +Used for MMU faults generated by instruction accesses and synchronous External aborts, including synchronous parity or ECC errors. Not used for debug related exceptions. + + + + + 0b100010 + + PC alignment fault exception. + + + + + 0b100100 + + Data Abort from a lower Exception level, that might be using AArch32 or AArch64. +Used for MMU faults generated by data accesses, alignment faults other than those caused by Stack Pointer misalignment, and synchronous External aborts, including synchronous parity or ECC errors. Not used for debug related exceptions. + + + + + 0b100101 + + Data Abort taken without a change in Exception level. +Used for MMU faults generated by data accesses, alignment faults other than those caused by Stack Pointer misalignment, and synchronous External aborts, including synchronous parity or ECC errors. Not used for debug related exceptions. + + + + + 0b100110 + + SP alignment fault exception. + + + + + 0b101000 + + Trapped floating-point exception taken from AArch32 state. +This EC value is valid if the implementation supports trapping of floating-point exceptions, otherwise it is reserved. Whether a floating-point implementation supports trapping of floating-point exceptions is IMPLEMENTATION DEFINED. + + + + + 0b101100 + + Trapped floating-point exception taken from AArch64 state. +This EC value is valid if the implementation supports trapping of floating-point exceptions, otherwise it is reserved. Whether a floating-point implementation supports trapping of floating-point exceptions is IMPLEMENTATION DEFINED. + + + + + 0b101111 + + SError interrupt. + + + + + 0b110000 + + Breakpoint exception from a lower Exception level, that might be using AArch32 or AArch64. + + + + + 0b110001 + + Breakpoint exception taken without a change in Exception level. + + + + + 0b110010 + + Software Step exception from a lower Exception level, that might be using AArch32 or AArch64. + + + + + 0b110011 + + Software Step exception taken without a change in Exception level. + + + + + 0b110100 + + Watchpoint exception from a lower Exception level, that might be using AArch32 or AArch64. + + + + + 0b110101 + + Watchpoint exception taken without a change in Exception level. + + + + + 0b111000 + + BKPT instruction execution in AArch32 state. + + + + + 0b111100 + + BRK instruction execution in AArch64 state. +This is reported in ESR_EL3 only if a BRK instruction is executed. + + + + + + + All other EC values are reserved by Arm, and: + +Unused values in the range 0b000000 - 0b101100 (0x00 - 0x2C) are reserved for future use for synchronous exceptions. +Unused values in the range 0b101101 - 0b111111 (0x2D - 0x3F) are reserved for future use, and might be used for synchronous or asynchronous exceptions. + +The effect of programming this field to a reserved value is that behavior is CONSTRAINED UNPREDICTABLE, as described in . + + + + + + + U + + + + + + IL + 25 + 25 + + + Instruction Length for synchronous exceptions. Possible values of this bit are: + + + + + + + 0b0 + + 16-bit instruction trapped. + + + + 0b1 + + + +An SError interrupt. + + +An Instruction Abort exception. + + +A PC alignment fault exception. + + +An SP alignment fault exception. + + +A Data Abort exception for which the value of the ISV bit is 0. + + +An Illegal Execution state exception. + + +Any debug exception except for Breakpoint instruction exceptions. For Breakpoint instruction exceptions, this bit has its standard meaning: + + +0b0: 16-bit T32 BKPT instruction. + + +0b1: 32-bit A32 BKPT instruction or A64 BRK instruction. + + + + +An exception reported using EC value 0b000000. + + + + + + + + + + U + + + + + + ISS + 24 + 0 + + + Instruction Specific Syndrome. Architecturally, this field can be defined independently for each defined Exception class. However, in practice, some ISS encodings are used for more than one Exception class. +Typically, an ISS encoding has a number of subfields. When an ISS subfield holds a register number, the value returned in that field is the AArch64 view of the register number. For an exception taken from AArch32 state, defines this view of the specified AArch32 register. If the AArch32 register descriptor is 0b1111, then: + +If the instruction that generated the exception was not UNPREDICTABLE, the field takes the value 0b11111. +If the instruction that generated the exception was UNPREDICTABLE, the field takes an UNKNOWN value that must be either: +The AArch64 view of the register number of a register that might have been used at the Exception level from which the exception was taken. +The value 0b11111. + + + +When the EC field is 0b000000, indicating an exception with an unknown reason, the ISS field is not valid, RES0. + + + + + I + + + + + + + Exceptions with an unknown reason + + + + + + + + 0 + 24 + 0 + + Reserved, RES0. + + + + + + + When an exception is reported using this EC code the IL field is set to 1. +This EC code is used for all exceptions that are not covered by any other EC value. This includes exceptions that are generated in the following situations: + +The attempted execution of an instruction bit pattern that has no allocated instruction at the current Exception level and Security state, including: +A read access using a System register pattern that is not allocated for reads at the current Exception level and Security state. +A write access using a System register pattern that is not allocated for writes at the current Exception level and Security state. +Instruction encodings for instructions not implemented in the implementation. + + +In Debug state, the attempted execution of an instruction bit pattern that is unallocated in Debug state. +In Non-debug state, the attempted execution of an instruction bit pattern that is unallocated in Non-debug state. +In AArch32 state, attempted execution of a short vector floating-point instruction. +In an implementation that does not include Advanced SIMD and floating-point functionality, an attempted access to Advanced SIMD or floating-point functionality under conditions where that access would be permitted if that functionality was present. This includes the attempted execution of an Advanced SIMD or floating-point instruction, and attempted accesses to Advanced SIMD and floating-point System registers. +An exception generated because of the value of one of the SCTLR_EL1.{ITD, SED, CP15BEN} control bits. +Attempted execution of: +An HVC instruction when disabled by HCR_EL2.HCD or SCR_EL3.HCE. +An SMC instruction when disabled by SCR_EL3.SMD. +An HLT instruction when disabled by EDSCR.HDE. + + +Attempted execution of an MSR or MRS instruction to access SP_EL0 when the value of SPSel.SP is 0. +Attempted execution, in Debug state, of: +A DCPS1 instruction when the value of HCR_EL2.TGE is 1 and EL2 is disabled or not implemented in the current Security state. +A DCPS2 instruction from EL1 or EL0 when EL2 is disabled or not implemented in the current Security state. +A DCPS3 instruction when the value of EDSCR.SDD is 1, or when EL3 is not implemented. + + +When EL3 is using AArch64, attempted execution from Secure EL1 of an SRS instruction using R13_mon. See . +In Debug state when the value of EDSCR.SDD is 1, the attempted execution at EL2, EL1, or EL0 of an instruction that is configured to trap to EL3. +In AArch32 state, the attempted execution of an MRS (banked register) or an MSR (banked register) instruction to SPSR_mon, SP_mon, or LR_mon. +An exception that is taken to EL2 because the value of HCR_EL2.TGE is 1 that, if the value of HCR_EL2.TGE was 0 would have been reported with an ESR_ELx.EC value of 0b000111. +When SVE is not implemented, attempted execution of: +An SVE instruction. +An MSR or MRS instruction to access ZCR_EL1, ZCR_EL2, or ZCR_EL3. + + + + + + + + + + + + + + + + + Exception from a WFI or WFE instruction + + + + + + + + CV + 24 + 24 + + + Condition code valid. Possible values of this bit are: + + + + + + + 0b0 + + The COND field is not valid. + + + + 0b1 + + The COND field is valid. + + + + + + For exceptions taken from AArch64, CV is set to 1. +For exceptions taken from AArch32: + +When an A32 instruction is trapped, CV is set to 1. +When a T32 instruction is trapped, it is IMPLEMENTATION DEFINED whether CV is set to 1 or set to 0. See the description of the COND field for more information. + + + + + + + + U + + + + + + COND + 23 + 20 + + + The condition code for the trapped instruction. This field is valid only for exceptions taken from AArch32, and only when the value of CV is 1. +For exceptions taken from AArch64, this field is set to 0b1110. +For exceptions taken from AArch32: + +When an A32 instruction is trapped, CV is set to 1 and: +If the instruction is conditional, COND is set to the condition code field value from the instruction. +If the instruction is unconditional, COND is set to 0b1110. + + +A conditional A32 instruction that is known to pass its condition code check can be presented either: +With COND set to 0b1110, the value for unconditional. +With the COND value held in the instruction. + + +When a T32 instruction is trapped, it is IMPLEMENTATION DEFINED whether: +CV is set to 0 and COND is set to an UNKNOWN value. Software must examine the SPSR.IT field to determine the condition, if any, of the T32 instruction. +CV is set to 1 and COND is set to the condition code for the condition that applied to the instruction. + + +For an implementation that, for both A32 and T32 instructions, takes an exception on a trapped conditional instruction only if the instruction passes its condition code check, these definitions mean that when CV is set to 1 it is IMPLEMENTATION DEFINED whether the COND field is set to 0b1110, or to the value of any condition that applied to the instruction. + + + + + + + + + + + + U + + + + + + 0 + 19 + 1 + + Reserved, RES0. + + + + + + TI + 0 + 0 + + + Trapped instruction. Possible values of this bit are: + + + + + + + 0b0 + + WFI trapped. + + + + 0b1 + + WFE trapped. + + + + + + + + U + + + + + + + The following sections describe configuration settings for generating this exception: + +. +. +. + + + + + + + + + + + + + + + + + + + + + + + + Exception from an MCR or MRC access + + + + + + + + CV + 24 + 24 + + + Condition code valid. Possible values of this bit are: + + + + + + + 0b0 + + The COND field is not valid. + + + + 0b1 + + The COND field is valid. + + + + + + For exceptions taken from AArch64, CV is set to 1. +For exceptions taken from AArch32: + +When an A32 instruction is trapped, CV is set to 1. +When a T32 instruction is trapped, it is IMPLEMENTATION DEFINED whether CV is set to 1 or set to 0. See the description of the COND field for more information. + + + + + + + + U + + + + + + COND + 23 + 20 + + + The condition code for the trapped instruction. This field is valid only for exceptions taken from AArch32, and only when the value of CV is 1. +For exceptions taken from AArch64, this field is set to 0b1110. +For exceptions taken from AArch32: + +When an A32 instruction is trapped, CV is set to 1 and: +If the instruction is conditional, COND is set to the condition code field value from the instruction. +If the instruction is unconditional, COND is set to 0b1110. + + +A conditional A32 instruction that is known to pass its condition code check can be presented either: +With COND set to 0b1110, the value for unconditional. +With the COND value held in the instruction. + + +When a T32 instruction is trapped, it is IMPLEMENTATION DEFINED whether: +CV is set to 0 and COND is set to an UNKNOWN value. Software must examine the SPSR.IT field to determine the condition, if any, of the T32 instruction. +CV is set to 1 and COND is set to the condition code for the condition that applied to the instruction. + + +For an implementation that, for both A32 and T32 instructions, takes an exception on a trapped conditional instruction only if the instruction passes its condition code check, these definitions mean that when CV is set to 1 it is IMPLEMENTATION DEFINED whether the COND field is set to 0b1110, or to the value of any condition that applied to the instruction. + + + + + + + + + + + + U + + + + + + Opc2 + 19 + 17 + + + The Opc2 value from the issued instruction. +For a trapped VMRS access, holds the value 0b000. + + + + + + + + + + + U + + + + + + Opc1 + 16 + 14 + + + The Opc1 value from the issued instruction. +For a trapped VMRS access, holds the value 0b111. + + + + + + + + + + + U + + + + + + CRn + 13 + 10 + + + The CRn value from the issued instruction. +For a trapped VMRS access, holds the reg field from the VMRS instruction encoding. + + + + + + + + + + + U + + + + + + Rt + 9 + 5 + + + The Rt value from the issued instruction, the general-purpose register used for the transfer. The reported value gives the AArch64 view of the register. See . + + + + + + + + + + + U + + + + + + CRm + 4 + 1 + + + The CRm value from the issued instruction. +For a trapped VMRS access, holds the value 0b0000. + + + + + + + + + + + U + + + + + + Direction + 0 + 0 + + + Indicates the direction of the trapped instruction. The possible values of this bit are: + + + + + + + 0b0 + + Write to System register space. MCR instruction. + + + + 0b1 + + Read from System register space. MRC or VMRS instruction. + + + + + + + + U + + + + + + + The following sections describe configuration settings for generating exceptions that are reported using EC value 0b000011: + +. +. +. +. +. +. +. +. +. +. + + +. +. +. +. +. +. + +The following sections describe configuration settings for generating exceptions that are reported using EC value 0b000101: + +. +. +, for trapped accesses to the JIDR. +. +. +. +. +. +. +. + + describes configuration settings for generating exceptions that are reported using EC value 0b001000. + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Exception from an MCRR or MRRC access + + + + + + + + CV + 24 + 24 + + + Condition code valid. Possible values of this bit are: + + + + + + + 0b0 + + The COND field is not valid. + + + + 0b1 + + The COND field is valid. + + + + + + For exceptions taken from AArch64, CV is set to 1. +For exceptions taken from AArch32: + +When an A32 instruction is trapped, CV is set to 1. +When a T32 instruction is trapped, it is IMPLEMENTATION DEFINED whether CV is set to 1 or set to 0. See the description of the COND field for more information. + + + + + + + + U + + + + + + COND + 23 + 20 + + + The condition code for the trapped instruction. This field is valid only for exceptions taken from AArch32, and only when the value of CV is 1. +For exceptions taken from AArch64, this field is set to 0b1110. +For exceptions taken from AArch32: + +When an A32 instruction is trapped, CV is set to 1 and: +If the instruction is conditional, COND is set to the condition code field value from the instruction. +If the instruction is unconditional, COND is set to 0b1110. + + +A conditional A32 instruction that is known to pass its condition code check can be presented either: +With COND set to 0b1110, the value for unconditional. +With the COND value held in the instruction. + + +When a T32 instruction is trapped, it is IMPLEMENTATION DEFINED whether: +CV is set to 0 and COND is set to an UNKNOWN value. Software must examine the SPSR.IT field to determine the condition, if any, of the T32 instruction. +CV is set to 1 and COND is set to the condition code for the condition that applied to the instruction. + + +For an implementation that, for both A32 and T32 instructions, takes an exception on a trapped conditional instruction only if the instruction passes its condition code check, these definitions mean that when CV is set to 1 it is IMPLEMENTATION DEFINED whether the COND field is set to 0b1110, or to the value of any condition that applied to the instruction. + + + + + + + + + + + + U + + + + + + Opc1 + 19 + 16 + + + The Opc1 value from the issued instruction. + + + + + + + + + + + U + + + + + + 0 + 15 + 15 + + Reserved, RES0. + + + + + + Rt2 + 14 + 10 + + + The Rt2 value from the issued instruction, the second general-purpose register used for the transfer. The reported value gives the AArch64 view of the register. See . + + + + + + + + + + + U + + + + + + Rt + 9 + 5 + + + The Rt value from the issued instruction, the first general-purpose register used for the transfer. The reported value gives the AArch64 view of the register. See . + + + + + + + + + + + U + + + + + + CRm + 4 + 1 + + + The CRm value from the issued instruction. + + + + + + + + + + + U + + + + + + Direction + 0 + 0 + + + Indicates the direction of the trapped instruction. The possible values of this bit are: + + + + + + + 0b0 + + Write to System register space. MCRR instruction. + + + + 0b1 + + Read from System register space. MRRC instruction. + + + + + + + + U + + + + + + + The following sections describe configuration settings for generating exceptions that are reported using EC value 0b000100: + +. +. +. +. +. +. +. +. +. +. + +The following sections describe configuration settings for generating exceptions that are reported using EC value 0b001100: + +. +. +. +. +. +. +. + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Exception from an LDC or STC instruction + + + + + + + + CV + 24 + 24 + + + Condition code valid. Possible values of this bit are: + + + + + + + 0b0 + + The COND field is not valid. + + + + 0b1 + + The COND field is valid. + + + + + + For exceptions taken from AArch64, CV is set to 1. +For exceptions taken from AArch32: + +When an A32 instruction is trapped, CV is set to 1. +When a T32 instruction is trapped, it is IMPLEMENTATION DEFINED whether CV is set to 1 or set to 0. See the description of the COND field for more information. + + + + + + + + U + + + + + + COND + 23 + 20 + + + The condition code for the trapped instruction. This field is valid only for exceptions taken from AArch32, and only when the value of CV is 1. +For exceptions taken from AArch64, this field is set to 0b1110. +For exceptions taken from AArch32: + +When an A32 instruction is trapped, CV is set to 1 and: +If the instruction is conditional, COND is set to the condition code field value from the instruction. +If the instruction is unconditional, COND is set to 0b1110. + + +A conditional A32 instruction that is known to pass its condition code check can be presented either: +With COND set to 0b1110, the value for unconditional. +With the COND value held in the instruction. + + +When a T32 instruction is trapped, it is IMPLEMENTATION DEFINED whether: +CV is set to 0 and COND is set to an UNKNOWN value. Software must examine the SPSR.IT field to determine the condition, if any, of the T32 instruction. +CV is set to 1 and COND is set to the condition code for the condition that applied to the instruction. + + +For an implementation that, for both A32 and T32 instructions, takes an exception on a trapped conditional instruction only if the instruction passes its condition code check, these definitions mean that when CV is set to 1 it is IMPLEMENTATION DEFINED whether the COND field is set to 0b1110, or to the value of any condition that applied to the instruction. + + + + + + + + + + + + U + + + + + + imm8 + 19 + 12 + + + The immediate value from the issued instruction. + + + + + + + + + + + U + + + + + + 0 + 11 + 10 + + Reserved, RES0. + + + + + + Rn + 9 + 5 + + + The Rn value from the issued instruction, the general-purpose register used for the transfer. The reported value gives the AArch64 view of the register. See . +This field is valid only when AM[2] is 0, indicating an immediate form of the LDC or STC instruction. When AM[2] is 1, indicating a literal form of the LDC or STC instruction, this field is UNKNOWN. + + + + + + + + + + + U + + + + + + Offset + 4 + 4 + + + Indicates whether the offset is added or subtracted: + + + + + + + 0b0 + + Subtract offset. + + + + 0b1 + + Add offset. + + + + + + This bit corresponds to the U bit in the instruction encoding. + + + + + + + U + + + + + + AM + 3 + 1 + + + Addressing mode. The permitted values of this field are: + + + + + + + 0b000 + + Immediate unindexed. + + + + 0b001 + + Immediate post-indexed. + + + + 0b010 + + Immediate offset. + + + + 0b011 + + Immediate pre-indexed. + + + + 0b100 + + For a trapped STC instruction or a trapped T32 LDC instruction this encoding is reserved. + + + + 0b110 + + For a trapped STC instruction, this encoding is reserved. + + + + + + The values 0b101 and 0b111 are reserved. The effect of programming this field to a reserved value is that behavior is CONSTRAINED UNPREDICTABLE, as described in . +Bit [2] in this subfield indicates the instruction form, immediate or literal. +Bits [1:0] in this subfield correspond to the bits {P, W} in the instruction encoding. + + + + + + + U + + + + + + Direction + 0 + 0 + + + Indicates the direction of the trapped instruction. The possible values of this bit are: + + + + + + + 0b0 + + Write to memory. STC instruction. + + + + 0b1 + + Read from memory. LDC instruction. + + + + + + + + U + + + + + + + The following sections describe the configuration settings for the traps that are reported using EC value 0b000110: + +. +. +. + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Exception from an access to SVE, Advanced SIMD or floating-point functionality, resulting from CPACR_EL1.FPEN, CPTR_EL2.FPEN or CPTR_ELx.TFP + + + The accesses covered by this trap include: + +Execution of SVE or Advanced SIMD and floating-point instructions. +Accesses to the Advanced SIMD and floating-point System registers. + +For an implementation that does not include either SVE or support for floating-point and Advanced SIMD, the exception is reported using the EC value 0b000000. + + + + + CV + 24 + 24 + + + Condition code valid. Possible values of this bit are: + + + + + + + 0b0 + + The COND field is not valid. + + + + 0b1 + + The COND field is valid. + + + + + + For exceptions taken from AArch64, CV is set to 1. +For exceptions taken from AArch32: + +When an A32 instruction is trapped, CV is set to 1. +When a T32 instruction is trapped, it is IMPLEMENTATION DEFINED whether CV is set to 1 or set to 0. See the description of the COND field for more information. + + + + + + + + U + + + + + + COND + 23 + 20 + + + The condition code for the trapped instruction. This field is valid only for exceptions taken from AArch32, and only when the value of CV is 1. +For exceptions taken from AArch64, this field is set to 0b1110. +For exceptions taken from AArch32: + +When an A32 instruction is trapped, CV is set to 1 and: +If the instruction is conditional, COND is set to the condition code field value from the instruction. +If the instruction is unconditional, COND is set to 0b1110. + + +A conditional A32 instruction that is known to pass its condition code check can be presented either: +With COND set to 0b1110, the value for unconditional. +With the COND value held in the instruction. + + +When a T32 instruction is trapped, it is IMPLEMENTATION DEFINED whether: +CV is set to 0 and COND is set to an UNKNOWN value. Software must examine the SPSR.IT field to determine the condition, if any, of the T32 instruction. +CV is set to 1 and COND is set to the condition code for the condition that applied to the instruction. + + +For an implementation that, for both A32 and T32 instructions, takes an exception on a trapped conditional instruction only if the instruction passes its condition code check, these definitions mean that when CV is set to 1 it is IMPLEMENTATION DEFINED whether the COND field is set to 0b1110, or to the value of any condition that applied to the instruction. + + + + + + + + + + + + U + + + + + + 0 + 19 + 0 + + Reserved, RES0. + + + + + + + The following sections describe the configuration settings for the traps that are reported using EC value 0b000111: + +. +. + + + + + + + + + + + + + + + + + + + + + + Exception from an access to SVE functionality, resulting from CPACR_EL1.ZEN, CPTR_EL2.ZEN, CPTR_EL2.TZ, or CPTR_EL3.EZ + + + + + + + + 0 + 24 + 0 + + Reserved, RES0. + + + + When SVE is implemented + + + 0 + 24 + 0 + + Reserved, RES0. + + + + + + + The accesses covered by this trap include: + +Execution of SVE instructions. +Accesses to the SVE system registers, ZCR_ELx and ID_AA64ZFR0_EL1. + +For an implementation that does not include SVE, the exception is reported using the EC value 0b000000. + + + + + + + + + + + + + + Exception from an Illegal Execution state, or a PC or SP alignment fault + + + + + + + + 0 + 24 + 0 + + Reserved, RES0. + + + + + + + There are no configuration settings for generating Illegal Execution state exceptions and PC alignment fault exceptions. For more information about these exceptions see and . + describes the configuration settings for generating SP alignment fault exceptions. + + + + + + + + + + + + + + Exception from HVC or SVC instruction execution + + + + + + + + 0 + 24 + 16 + + Reserved, RES0. + + + + + + imm16 + 15 + 0 + + + The value of the immediate field from the HVC or SVC instruction. +For an HVC instruction, and for an A64 SVC instruction, this is the value of the imm16 field of the issued instruction. +For an A32 or T32 SVC instruction: + +If the instruction is unconditional, then: +For the T32 instruction, this field is zero-extended from the imm8 field of the instruction. +For the A32 instruction, this field is the bottom 16 bits of the imm24 field of the instruction. + + +If the instruction is conditional, this field is UNKNOWN. + + + + + + + + + + + + U + + + + + + + In AArch32 state, the HVC instruction is unconditional, and a conditional SVC instruction generates an exception only if it passes its condition code check. Therefore, the syndrome information for these exceptions does not require conditionality information. +For T32 and A32 instructions, see and . +For A64 instructions, see and . + + + + + + + + + + + + + + + + + Exception from SMC instruction execution in AArch32 state + + + For an SMC instruction that completes normally and generates an exception that is taken to EL3, the ISS encoding is RES0. +For an SMC instruction that is trapped to EL2 from EL1 because HCR_EL2.TSC is 1, the ISS encoding is as shown in the diagram. + + + + + CV + 24 + 24 + + + Condition code valid. Possible values of this bit are: + + + + + + + 0b0 + + The COND field is not valid. + + + + 0b1 + + The COND field is valid. + + + + + + For exceptions taken from AArch64, CV is set to 1. +For exceptions taken from AArch32: + +When an A32 instruction is trapped, CV is set to 1. +When a T32 instruction is trapped, it is IMPLEMENTATION DEFINED whether CV is set to 1 or set to 0. See the description of the COND field for more information. + +This field is only valid if CCKNOWNPASS is 1, otherwise it is RES0. + + + + + + + U + + + + + + COND + 23 + 20 + + + The condition code for the trapped instruction. This field is valid only for exceptions taken from AArch32, and only when the value of CV is 1. +For exceptions taken from AArch64, this field is set to 0b1110. +For exceptions taken from AArch32: + +When an A32 instruction is trapped, CV is set to 1 and: +If the instruction is conditional, COND is set to the condition code field value from the instruction. +If the instruction is unconditional, COND is set to 0b1110. + + +A conditional A32 instruction that is known to pass its condition code check can be presented either: +With COND set to 0b1110, the value for unconditional. +With the COND value held in the instruction. + + +When a T32 instruction is trapped, it is IMPLEMENTATION DEFINED whether: +CV is set to 0 and COND is set to an UNKNOWN value. Software must examine the SPSR.IT field to determine the condition, if any, of the T32 instruction. +CV is set to 1 and COND is set to the condition code for the condition that applied to the instruction. + + +For an implementation that, for both A32 and T32 instructions, takes an exception on a trapped conditional instruction only if the instruction passes its condition code check, these definitions mean that when CV is set to 1 it is IMPLEMENTATION DEFINED whether the COND field is set to 0b1110, or to the value of any condition that applied to the instruction. + +This field is only valid if CCKNOWNPASS is 1, otherwise it is RES0. + + + + + + + + + + + U + + + + + + CCKNOWNPASS + 19 + 19 + + + Indicates whether the instruction might have failed its condition code check. + + + + + + + 0b0 + + The instruction was unconditional, or was conditional and passed its condition code check. + + + + 0b1 + + The instruction was conditional, and might have failed its condition code check. + + + + + + In an implementation in which an SMC instruction that fails it code check is not trapped, this field can always return the value 0. + + + + + + + U + + + + + + 0 + 18 + 0 + + Reserved, RES0. + + + + + + + describes the configuration settings for trapping SMC instructions from EL1 modes, and describes the case where these exceptions are trapped to EL3. + + + + + + + + + + + + + + + + + + + + + + + Exception from SMC instruction execution in AArch64 state + + + + + + + + 0 + 24 + 16 + + Reserved, RES0. + + + + + + imm16 + 15 + 0 + + + The value of the immediate field from the issued SMC instruction. + + + + + + + + + + + U + + + + + + + The value of ISS[24:0] described here is used both: + +When an SMC instruction is trapped from EL1 modes. +When an SMC instruction is not trapped, so completes normally and generates an exception that is taken to EL3. + + describes the configuration settings for trapping SMC instructions from Non-secure EL1 modes, and describes the case where these exceptions are trapped to EL3. + + + + + + + + + + + + + + + + + Exception from MSR, MRS, or System instruction execution in AArch64 state + + + + + + + + 0 + 24 + 22 + + Reserved, RES0. + + + + + + Op0 + 21 + 20 + + + The Op0 value from the issued instruction. + + + + + + + + + + + U + + + + + + Op2 + 19 + 17 + + + The Op2 value from the issued instruction. + + + + + + + + + + + U + + + + + + Op1 + 16 + 14 + + + The Op1 value from the issued instruction. + + + + + + + + + + + U + + + + + + CRn + 13 + 10 + + + The CRn value from the issued instruction. + + + + + + + + + + + U + + + + + + Rt + 9 + 5 + + + The Rt value from the issued instruction, the general-purpose register used for the transfer. + + + + + + + + + + + U + + + + + + CRm + 4 + 1 + + + The CRm value from the issued instruction. + + + + + + + + + + + U + + + + + + Direction + 0 + 0 + + + Indicates the direction of the trapped instruction. The possible values of this bit are: + + + + + + + 0b0 + + Write access, including MSR instructions. + + + + 0b1 + + Read access, including MRS instructions. + + + + + + + + U + + + + + + + For exceptions caused by System instructions, see for the encoding values returned by an instruction. +The following sections describe configuration settings for generating the exception that is reported using EC value 0b011000: + +In . +. +. +. +. +. +. +. +. +. + + +In . +. +. +. +. +. +. +. +. +. +. +. +. +. +. +. +. +. +. +. + + +In . +. +. +. +. +. +. +. + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + IMPLEMENTATION DEFINED exception to EL3 + + + + + + + + IMPLEMENTATION DEFINED + 24 + 0 + + IMPLEMENTATION DEFINED. + + + + + + + I + + + + + + U + + + + + + + + + + + + + + + + + + + + + Exception from an Instruction Abort + + + + + + + + 0 + 24 + 13 + + Reserved, RES0. + + + + + + SET + 12 + 11 + + + Synchronous Error Type. When the RAS Extension is implemented and IFSC is 0b010000, describes the state of the PE after taking the Instruction Abort exception. The possible values of this field are: + + + + + + + 0b00 + + Recoverable error (UER). + + + + 0b10 + + Uncontainable error (UC). + + + + 0b11 + + Restartable error (UEO) or Corrected error (CE). + + + + + + All other values are reserved. +Software can use this information to determine what recovery might be possible. Taking a synchronous External Abort exception might result in an unrecoverable PE state.This field is RES0 if either: + +The RAS Extension is not implemented. +The value returned in the IFSC field is not 0b010000. + + + + + + + + U + + + + + + FnV + 10 + 10 + + + FAR not Valid, for a synchronous External abort other than a synchronous External abort on a translation table walk. + + + + + + + 0b0 + + FAR is valid. + + + + 0b1 + + FAR is not valid, and holds an UNKNOWN value. + + + + + + This field is only valid if the IFSC code is 0b010000. It is RES0 for all other aborts. + + + + + + + U + + + + + + EA + 9 + 9 + + + External abort type. This bit can provide an IMPLEMENTATION DEFINED classification of External aborts. +For any abort other than an External abort this bit returns a value of 0. + + + + + + + + + + + U + + + + + + 0 + 8 + 8 + + Reserved, RES0. + + + + + + S1PTW + 7 + 7 + + + For a stage 2 fault, indicates whether the fault was a stage 2 fault on an access made for a stage 1 translation table walk: + + + + + + + 0b0 + + Fault not on a stage 2 translation for a stage 1 translation table walk. + + + + 0b1 + + Fault on the stage 2 translation of an access for a stage 1 translation table walk. + + + + + + For any abort other than a stage 2 fault this bit is RES0. + + + + + + + U + + + + + + 0 + 6 + 6 + + Reserved, RES0. + + + + + + IFSC + 5 + 0 + + + Instruction Fault Status Code. Possible values of this field are: + + + + + + + 0b000000 + + Address size fault, level 0 of translation or translation table base register + + + + 0b000001 + + Address size fault, level 1 + + + + 0b000010 + + Address size fault, level 2 + + + + 0b000011 + + Address size fault, level 3 + + + + 0b000100 + + Translation fault, level 0 + + + + 0b000101 + + Translation fault, level 1 + + + + 0b000110 + + Translation fault, level 2 + + + + 0b000111 + + Translation fault, level 3 + + + + 0b001001 + + Access flag fault, level 1 + + + + 0b001010 + + Access flag fault, level 2 + + + + 0b001011 + + Access flag fault, level 3 + + + + 0b001101 + + Permission fault, level 1 + + + + 0b001110 + + Permission fault, level 2 + + + + 0b001111 + + Permission fault, level 3 + + + + 0b010000 + + Synchronous External abort, not on translation table walk + + + + 0b010100 + + Synchronous External abort, on translation table walk, level 0 + + + + 0b010101 + + Synchronous External abort, on translation table walk, level 1 + + + + 0b010110 + + Synchronous External abort, on translation table walk, level 2 + + + + 0b010111 + + Synchronous External abort, on translation table walk, level 3 + + + + 0b011000 + + Synchronous parity or ECC error on memory access, not on translation table walk + + + + 0b011100 + + Synchronous parity or ECC error on memory access on translation table walk, level 0 + + + + 0b011101 + + Synchronous parity or ECC error on memory access on translation table walk, level 1 + + + + 0b011110 + + Synchronous parity or ECC error on memory access on translation table walk, level 2 + + + + 0b011111 + + Synchronous parity or ECC error on memory access on translation table walk, level 3 + + + + 0b110000 + + TLB conflict abort + + + + 0b110001 + + Unsupported atomic hardware update fault, if the implementation includes . Otherwise reserved. + + + + + + All other values are reserved. +When the RAS Extension is implemented, 0b011000, 0b011100, 0b011101, 0b011110, and 0b011111, are reserved. +Armv8.2 requires the implementation of the RAS Extension.For more information about the lookup level associated with a fault, see . +Because Access flag faults and Permission faults can only result from a Block or Page translation table descriptor, they cannot occur at level 0.If the S1PTW bit is set, then the level refers the level of the stage2 translation that is translating a stage 1 translation walk. + + + + + + + U + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Exception from a Data Abort + + + + + + + + ISV + 24 + 24 + + + Instruction syndrome valid. Indicates whether the syndrome information in ISS[23:0] is valid. + + + + + + + 0b0 + + No valid instruction syndrome. ISS[23:14] are RES0. + + + + 0b1 + + ISS[23:14] hold a valid instruction syndrome. + + + + + + This bit is 0 for all faults reported in ESR_EL2 except the following stage 2 aborts: + +AArch64 loads and stores of a single general-purpose register (including the register specified with 0b11111, including those with Acquire/Release semantics, but excluding Load Exclusive or Store Exclusive and excluding those with writeback. +AArch32 instructions where the instruction: +Is an LDR, LDA, LDRT, LDRSH, LDRSHT, LDRH, LDAH, LDRHT, LDRSB, LDRSBT, LDRB, LDAB, LDRBT, STR, STL, STRT, STRH, STLH, STRHT, STRB, STLB, or STRBT instruction. +Is not performing register writeback. +Is not using R15 as a source or destination register. + + + +For these cases, ISV is UNKNOWN if the exception was generated in Debug state in memory access mode, and otherwise indicates whether ISS[23:14] hold a valid syndrome. +ISV is 0 for all faults reported in ESR_EL1 or ESR_EL3. +When the RAS Extension is implemented, ISV is 0 for any synchronous External abort. +For ISS reporting, a stage 2 abort on a stage 1 translation table walk does not return a valid instruction syndrome, and therefore ISV is 0 for these aborts. +When the RAS Extension is not implemented, the value of ISV on a synchronous External abort on a stage 2 translation table walk is IMPLEMENTATION DEFINED. + + + + + + + U + + + + + + SAS + 23 + 22 + + + Syndrome Access Size. When ISV is 1, indicates the size of the access attempted by the faulting operation. + + + + + + + 0b00 + + Byte + + + + 0b01 + + Halfword + + + + 0b10 + + Word + + + + 0b11 + + Doubleword + + + + + + This field is UNKNOWN when the value of ISV is UNKNOWN. +This field is RES0 when the value of ISV is 0. + + + + + + + U + + + + + + SSE + 21 + 21 + + + Syndrome Sign Extend. When ISV is 1, for a byte, halfword, or word load operation, indicates whether the data item must be sign extended. For these cases, the possible values of this bit are: + + + + + + + 0b0 + + Sign-extension not required. + + + + 0b1 + + Data item must be sign-extended. + + + + + + For all other operations this bit is 0. +This field is UNKNOWN when the value of ISV is UNKNOWN. +This field is RES0 when the value of ISV is 0. + + + + + + + U + + + + + + SRT + 20 + 16 + + + Syndrome Register transfer. When ISV is 1, the register number of the Rt operand of the faulting instruction. If the exception was taken from an Exception level that is using AArch32 then this is the AArch64 view of the register. See . +This field is UNKNOWN when the value of ISV is UNKNOWN. +This field is RES0 when the value of ISV is 0. + + + + + + + + + + + U + + + + + + SF + 15 + 15 + + + Width of the register accessed by the instruction is Sixty-Four. When ISV is 1, the possible values of this bit are: + + + + + + + 0b0 + + Instruction loads/stores a 32-bit wide register. + + + + 0b1 + + Instruction loads/stores a 64-bit wide register. + + + + + + This field specifies the register width identified by the instruction, not the Execution state.This field is UNKNOWN when the value of ISV is UNKNOWN. +This field is RES0 when the value of ISV is 0. + + + + + + + U + + + + + + AR + 14 + 14 + + + Acquire/Release. When ISV is 1, the possible values of this bit are: + + + + + + + 0b0 + + Instruction did not have acquire/release semantics. + + + + 0b1 + + Instruction did have acquire/release semantics. + + + + + + This field is UNKNOWN when the value of ISV is UNKNOWN. +This field is RES0 when the value of ISV is 0. + + + + + + + U + + + + + + VNCR + 13 + 13 + + + Indicates that the fault came from use of VNCR_EL2 register by EL1 code. + + + + + + + 0b0 + + The fault was not generated by the use of VNCR_EL2, by an MRS or MSR instruction executed at EL1. + + + + 0b1 + + The fault was generated by the use of VNCR_EL2, by an MRS or MSR instruction executed at EL1. + + + + + + This field is 0 in ESR_EL1. + + + + + + + U + + + + When ARMv8.4-NV is implemented + + + 0 + 13 + 13 + + Reserved, RES0. + + + + + + SET + 12 + 11 + + + Synchronous Error Type. When the RAS Extension is implemented and DFSC is 0b010000, describes the state of the PE after taking the Data Abort exception. The possible values of this field are: + + + + + + + 0b00 + + Recoverable error (UER). + + + + 0b10 + + Uncontainable error (UC). + + + + 0b11 + + Restartable error (UEO) or Corrected error (CE). + + + + + + All other values are reserved. +Software can use this information to determine what recovery might be possible. Taking a synchronous External Abort exception might result in an unrecoverable PE state.This field is RES0 if either: + +The RAS Extension is not implemented. +The value returned in the DFSC field is not 0b010000. + + + + + + + + U + + + + + + FnV + 10 + 10 + + + FAR not Valid, for a synchronous External abort other than a synchronous External abort on a translation table walk. + + + + + + + 0b0 + + FAR is valid. + + + + 0b1 + + FAR is not valid, and holds an UNKNOWN value. + + + + + + This field is valid only if the DFSC code is 0b010000. It is RES0 for all other aborts. + + + + + + + U + + + + + + EA + 9 + 9 + + + External abort type. This bit can provide an IMPLEMENTATION DEFINED classification of External aborts. +For any abort other than an External abort this bit returns a value of 0. + + + + + + + + + + + U + + + + + + CM + 8 + 8 + + + Cache maintenance. Indicates whether the Data Abort came from a cache maintenance or address translation instruction: + + + + + + + 0b0 + + The Data Abort was not generated by the execution of one of the System instructions identified in the description of value 1. + + + + 0b1 + + The Data Abort was generated by either the execution of a cache maintenance instruction or by a synchronous fault on the execution of an address translation instruction. The DC ZVA instruction is not classified as a cache maintenance instruction, and therefore its execution cannot cause this field to be set to 1. + + + + + + + + U + + + + + + S1PTW + 7 + 7 + + + For a stage 2 fault, indicates whether the fault was a stage 2 fault on an access made for a stage 1 translation table walk: + + + + + + + 0b0 + + Fault not on a stage 2 translation for a stage 1 translation table walk. + + + + 0b1 + + Fault on the stage 2 translation of an access for a stage 1 translation table walk. + + + + + + For any abort other than a stage 2 fault this bit is RES0. + + + + + + + U + + + + + + WnR + 6 + 6 + + + Write not Read. Indicates whether a synchronous abort was caused by an instruction writing to a memory location, or by an instruction reading from a memory location. The possible values of this bit are: + + + + + + + 0b0 + + Abort caused by an instruction reading from a memory location. + + + + 0b1 + + Abort caused by an instruction writing to a memory location. + + + + + + For faults on cache maintenance and address translation instructions, this bit always returns a value of 1. +For faults from an atomic instruction that both reads and writes from a memory location, this bit is set to 0 if a read of the address specified by the instruction would have generated the fault which is being reported, otherwise it is set to 1. The architecture permits, but does not require, a relaxation of this requirement such that for all stage 2 aborts on stage 1 translation table walks for atomic instructions, the WnR bit is always 0. +This field is UNKNOWN for: + +An External abort on an Atomic access. +A fault reported using a DFSC value of 0b110101 or 0b110001, indicating an unsupported Exclusive or atomic access. + + + + + + + + U + + + + + + DFSC + 5 + 0 + + + Data Fault Status Code. Possible values of this field are: + + + + + + + 0b000000 + + Address size fault, level 0 of translation or translation table base register. + + + + 0b000001 + + Address size fault, level 1. + + + + 0b000010 + + Address size fault, level 2. + + + + 0b000011 + + Address size fault, level 3. + + + + 0b000100 + + Translation fault, level 0. + + + + 0b000101 + + Translation fault, level 1. + + + + 0b000110 + + Translation fault, level 2. + + + + 0b000111 + + Translation fault, level 3. + + + + 0b001001 + + Access flag fault, level 1. + + + + 0b001010 + + Access flag fault, level 2. + + + + 0b001011 + + Access flag fault, level 3. + + + + 0b001101 + + Permission fault, level 1. + + + + 0b001110 + + Permission fault, level 2. + + + + 0b001111 + + Permission fault, level 3. + + + + 0b010000 + + Synchronous External abort, not on translation table walk. + + + + 0b010001 + + Synchronous Tag Check fail + + + + 0b010100 + + Synchronous External abort, on translation table walk, level 0. + + + + 0b010101 + + Synchronous External abort, on translation table walk, level 1. + + + + 0b010110 + + Synchronous External abort, on translation table walk, level 2. + + + + 0b010111 + + Synchronous External abort, on translation table walk, level 3. + + + + 0b011000 + + Synchronous parity or ECC error on memory access, not on translation table walk. + + + + 0b011100 + + Synchronous parity or ECC error on memory access on translation table walk, level 0. + + + + 0b011101 + + Synchronous parity or ECC error on memory access on translation table walk, level 1. + + + + 0b011110 + + Synchronous parity or ECC error on memory access on translation table walk, level 2. + + + + 0b011111 + + Synchronous parity or ECC error on memory access on translation table walk, level 3. + + + + 0b100001 + + Alignment fault. + + + + 0b110000 + + TLB conflict abort. + + + + 0b110001 + + Unsupported atomic hardware update fault, if the implementation includes . Otherwise reserved. + + + + 0b110100 + + IMPLEMENTATION DEFINED fault (Lockdown). + + + + 0b110101 + + IMPLEMENTATION DEFINED fault (Unsupported Exclusive or Atomic access). + + + + 0b111101 + + Section Domain Fault, used only for faults reported in the PAR_EL1. + + + + 0b111110 + + Page Domain Fault, used only for faults reported in the PAR_EL1. + + + + + + All other values are reserved. +When the RAS Extension is implemented, 0b011000, 0b011100, 0b011101, 0b011110, and 0b011111, are reserved. +For more information about the lookup level associated with a fault, see . +Because Access flag faults and Permission faults can only result from a Block or Page translation table descriptor, they cannot occur at level 0.If the S1PTW bit is set, then the level refers the level of the stage2 translation that is translating a stage 1 translation walk. + + + + + + + U + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Exception from a trapped floating-point exception + + + + + + + + 0 + 24 + 24 + + Reserved, RES0. + + + + + + TFV + 23 + 23 + + + Trapped Fault Valid bit. Indicates whether the IDF, IXF, UFF, OFF, DZF, and IOF bits hold valid information about trapped floating-point exceptions. The possible values of this bit are: + + + + + + + 0b0 + + The IDF, IXF, UFF, OFF, DZF, and IOF bits do not hold valid information about trapped floating-point exceptions and are UNKNOWN. + + + + 0b1 + + One or more floating-point exceptions occurred during an operation performed while executing the reported instruction. The IDF, IXF, UFF, OFF, DZF, and IOF bits indicate trapped floating-point exceptions that occurred. For more information see . + + + + + + It is IMPLEMENTATION DEFINED whether this field is set to 0 on an exception generated by a trapped floating point exception from a vector instruction. +This is not a requirement. Implementations can set this field to 1 on a trapped floating-point exception from a vector instruction and return valid information in the {IDF, IXF, UFF, OFF, DZF, IOF} fields. + + + + + + + U + + + + + + 0 + 22 + 11 + + Reserved, RES0. + + + + + + VECITR + 10 + 8 + + + For a trapped floating-point exception from an instruction executed in AArch32 state this field is RES1. +For a trapped floating-point exception from an instruction executed in AArch64 state this field is UNKNOWN. + + + + + + + + + + + U + + + + + + IDF + 7 + 7 + + + Input Denormal floating-point exception trapped bit. If the TFV field is 0, this bit is UNKNOWN. Otherwise, the possible values of this bit are: + + + + + + + 0b0 + + Input denormal floating-point exception has not occurred. + + + + 0b1 + + Input denormal floating-point exception occurred during execution of the reported instruction. + + + + + + + + U + + + + + + 0 + 6 + 5 + + Reserved, RES0. + + + + + + IXF + 4 + 4 + + + Inexact floating-point exception trapped bit. If the TFV field is 0, this bit is UNKNOWN. Otherwise, the possible values of this bit are: + + + + + + + 0b0 + + Inexact floating-point exception has not occurred. + + + + 0b1 + + Inexact floating-point exception occurred during execution of the reported instruction. + + + + + + + + U + + + + + + UFF + 3 + 3 + + + Underflow floating-point exception trapped bit. If the TFV field is 0, this bit is UNKNOWN. Otherwise, the possible values of this bit are: + + + + + + + 0b0 + + Underflow floating-point exception has not occurred. + + + + 0b1 + + Underflow floating-point exception occurred during execution of the reported instruction. + + + + + + + + U + + + + + + OFF + 2 + 2 + + + Overflow floating-point exception trapped bit. If the TFV field is 0, this bit is UNKNOWN. Otherwise, the possible values of this bit are: + + + + + + + 0b0 + + Overflow floating-point exception has not occurred. + + + + 0b1 + + Overflow floating-point exception occurred during execution of the reported instruction. + + + + + + + + U + + + + + + DZF + 1 + 1 + + + Divide by Zero floating-point exception trapped bit. If the TFV field is 0, this bit is UNKNOWN. Otherwise, the possible values of this bit are: + + + + + + + 0b0 + + Divide by Zero floating-point exception has not occurred. + + + + 0b1 + + Divide by Zero floating-point exception occurred during execution of the reported instruction. + + + + + + + + U + + + + + + IOF + 0 + 0 + + + Invalid Operation floating-point exception trapped bit. If the TFV field is 0, this bit is UNKNOWN. Otherwise, the possible values of this bit are: + + + + + + + 0b0 + + Invalid Operation floating-point exception has not occurred. + + + + 0b1 + + Invalid Operation floating-point exception occurred during execution of the reported instruction. + + + + + + + + U + + + + + + + In an implementation that supports the trapping of floating-point exceptions: + +From an Exception level using AArch64, the FPCR.{IDE, IXE, UFE, OFE, DZE, IOE} bits enable each of the floating-point exception traps. +From an Exception level using AArch32, the FPSCR.{IDE, IXE, UFE, OFE, DZE, IOE} bits enable each of the floating-point exception traps. + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + SError interrupt + + + + + + + + IDS + 24 + 24 + + + IMPLEMENTATION DEFINED syndrome. Possible values of this bit are: + + + + + + + 0b0 + + Bits[23:0] of the ISS field holds the fields described in this encoding. +If the RAS Extension is not implemented, this means that bits[23:0] of the ISS field are RES0. + + + + 0b1 + + Bits[23:0] of the ISS field holds IMPLEMENTATION DEFINED syndrome information that can be used to provide additional information about the SError interrupt. + + + + + + This field was previously called ISV. + + + + + + + U + + + + + + 0 + 23 + 14 + + Reserved, RES0. + + + + + + IESB + 13 + 13 + + + Implicit error synchronization event. + + + + + + + 0b0 + + The SError interrupt was either not synchronized by the implicit error synchronization event or not taken immediately. + + + + 0b1 + + The SError interrupt was synchronized by the implicit error synchronization event and taken immediately. + + + + + + This field is RES0 if the value returned in the DFSC field is not 0b010001. +Armv8.2 requires the implementation of the RAS Extension and . + + + + + + + U + + + + When ARMv8.2-IESB is implemented + + + 0 + 13 + 13 + + Reserved, RES0. + + + + + + AET + 12 + 10 + + + Asynchronous Error Type. +When the RAS Extension is implemented and DFSC is 0b010001, describes the state of the PE after taking the SError interrupt exception. The possible values of this field are: + + + + + + + 0b000 + + Uncontainable error (UC). + + + + 0b001 + + Unrecoverable error (UEU). + + + + 0b010 + + Restartable error (UEO). + + + + 0b011 + + Recoverable error (UER). + + + + 0b110 + + Corrected error (CE). + + + + + + All other values are reserved. +If multiple errors are taken as a single SError interrupt exception, the overall state of the PE is reported. For example, if both a Recoverable and Unrecoverable error occurred, the state is Unrecoverable. +Software can use this information to determine what recovery might be possible. The recovery software must also examine any implemented fault records to determine the location and extent of the error.This field is RES0 if either: + +The RAS Extension is not implemented. +The value returned in the DFSC field is not 0b010001. + +Armv8.2 requires the implementation of the RAS Extension. + + + + + + + U + + + + + + EA + 9 + 9 + + + External abort type. When the RAS Extension is implemented, this bit can provide an IMPLEMENTATION DEFINED classification of External aborts. +For any abort other than an External abort this bit returns a value of 0. +This field is RES0 if either: + +The RAS Extension is not implemented. +The value returned in the DFSC field is not 0b010001. + +Armv8.2 requires the implementation of the RAS Extension. + + + + + + + + + + + U + + + + + + 0 + 8 + 6 + + Reserved, RES0. + + + + + + DFSC + 5 + 0 + + + Data Fault Status Code. When the RAS Extension is implemented, possible values of this field are: + + + + + + + 0b000000 + + Uncategorized. + + + + 0b010001 + + Asynchronous SError interrupt. + + + + + + All other values are reserved. +If the RAS Extension is not implemented, this field is RES0. +Armv8.2 requires the implementation of the RAS Extension. + + + + + + + U + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Exception from a Breakpoint or Vector Catch debug exception + + + + + + + + 0 + 24 + 6 + + Reserved, RES0. + + + + + + IFSC + 5 + 0 + + + Instruction Fault Status Code. This field is set to 0b100010, to indicate a Debug exception. + + + + + + + + + + + U + + + + + + + For more information about generating these exceptions: + +For exceptions from AArch64, see . +For exceptions from AArch32, see and . + + + + + + + + + + + + + + + + + + Exception from a Software Step exception + + + + + + + + ISV + 24 + 24 + + + Instruction syndrome valid. Indicates whether the EX bit, ISS[6], is valid, as follows: + + + + + + + 0b0 + + EX bit is RES0. + + + + 0b1 + + EX bit is valid. + + + + + + See the EX bit description for more information. + + + + + + + U + + + + + + 0 + 23 + 7 + + Reserved, RES0. + + + + + + EX + 6 + 6 + + + Exclusive operation. If the ISV bit is set to 1, this bit indicates whether a Load-Exclusive instruction was stepped. + + + + + + + 0b0 + + An instruction other than a Load-Exclusive instruction was stepped. + + + + 0b1 + + A Load-Exclusive instruction was stepped. + + + + + + If the ISV bit is set to 0, this bit is RES0, indicating no syndrome data is available. + + + + + + + U + + + + + + IFSC + 5 + 0 + + + Instruction Fault Status Code. This field is set to 0b100010, to indicate a Debug exception. + + + + + + + + + + + U + + + + + + + For more information about generating these exceptions, see . + + + + + + + + + + + + + + + + + + + + + + + Exception from a Watchpoint exception + + + + + + + + 0 + 24 + 14 + + Reserved, RES0. + + + + + + VNCR + 13 + 13 + + + Indicates that the watchpoint came from use of VNCR_EL2 register by EL1 code. + + + + + + + 0b0 + + The watchpoint was not generated by the use of VNCR_EL2 by EL1 code. + + + + 0b1 + + The watchpoint was generated by the use of VNCR_EL2 by EL1 code. + + + + + + This field is 0 in ESR_EL1. + + + + + + + U + + + + When ARMv8.4-NV is implemented + + + 0 + 13 + 13 + + Reserved, RES0. + + + + + + 0 + 12 + 9 + + Reserved, RES0. + + + + + + CM + 8 + 8 + + + Cache maintenance. Indicates whether the Watchpoint exception came from a cache maintenance or address translation instruction: + + + + + + + 0b0 + + The Watchpoint exception was not generated by the execution of one of the System instructions identified in the description of value 1. + + + + 0b1 + + The Watchpoint exception was generated by either the execution of a cache maintenance instruction or by a synchronous Watchpoint exception on the execution of an address translation instruction. The DC ZVA instruction is not classified as a cache maintenance instruction, and therefore its execution cannot cause this field to be set to 1. + + + + + + + + U + + + + + + 0 + 7 + 7 + + Reserved, RES0. + + + + + + WnR + 6 + 6 + + + Write not Read. Indicates whether the Watchpoint exception was caused by an instruction writing to a memory location, or by an instruction reading from a memory location. The possible values of this bit are: + + + + + + + 0b0 + + Watchpoint exception caused by an instruction reading from a memory location. + + + + 0b1 + + Watchpoint exception caused by an instruction writing to a memory location. + + + + + + For Watchpoint exceptions on cache maintenance and address translation instructions, this bit always returns a value of 1. +For Watchpoint exceptions from an atomic instruction, this field is set to 0 if a read of the location would have generated the Watchpoint exception, otherwise it is set to 1. +If multiple watchpoints match on the same access, it is UNPREDICTABLE which watchpoint generates the Watchpoint exception. + + + + + + + U + + + + + + DFSC + 5 + 0 + + + Data Fault Status Code. This field is set to 0b100010, to indicate a Debug exception. + + + + + + + + + + + U + + + + + + + For more information about generating these exceptions, see . + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Exception from execution of a Breakpoint instruction + + + + + + + + 0 + 24 + 16 + + Reserved, RES0. + + + + + + Comment + 15 + 0 + + + Set to the instruction comment field value, zero extended as necessary. For the AArch32 BKPT instructions, the comment field is described as the immediate field. + + + + + + + + + + + U + + + + + + + For more information about generating these exceptions, see . + + + + + + + + + + + + + + + + + When ARMv8.3-NV is implemented + Exception from ERET, ERETAA or ERETAB instruction + + + This EC value only applies when HCR_EL2.NV is 1. + + + + + 0 + 24 + 2 + + Reserved, RES0. + + + + + + ERET + 1 + 1 + + + Indicates whether an ERET or ERETA* instruction was trapped to EL2. Possible values are: + + + + + + + 0b0 + + ERET instruction trapped to EL2. + + + + 0b1 + + ERETAA or ERETAB instruction trapped to EL2. + + + + + + If this bit is 0, the ERETA field is RES0. + + + + + + + U + + + + + + ERETA + 0 + 0 + + + Indicates whether an ERETAA or ERETAB instruction was trapped to EL2. Possible values are: + + + + + + + 0b0 + + ERETAA instruction trapped to EL2. + + + + 0b1 + + ERETAB instruction trapped to EL2. + + + + + + When the ERET field is 0, this bit is RES0. + + + + + + + U + + + + + + + For more information about generating these exceptions, see . + + + + + When ARMv8.3-NV is implemented + + + + + + + + + + + + + + + + When ARMv8.5-BTI is implemented + Exception from Branch Target Identification instruction + + + + + + + + 0 + 24 + 2 + + Reserved, RES0. + + + + + + BTYPE + 1 + 0 + + + This field is set to the PSTATE.BTYPE value that generated the Branch Target Exception. + + + + + + + + + + + + + For more information about generating these exceptions, see . + + + + + When ARMv8.5-BTI is implemented + + + + + + + + + + + + + Exception from a Pointer Authentication instruction when HCR_EL2.API == 0 || SCR_EL3.API == 0 + + + + + + + + 0 + 24 + 0 + + Reserved, RES0. + + + + + + + For more information about generating these exceptions, see: + +. +. + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + When HCR_EL2.E2H is 1, without explicit synchronization, access from EL3 using the mnemonic ESR_EL1 or ESR_EL12 are not guaranteed to be ordered with respect to accesses using the other mnemonic. + + + + + + + MRS <Xt>, ESR_EL1 + + + + + + + + + + + + + + +if PSTATE.EL == EL0 then + UNDEFINED; +elsif PSTATE.EL == EL1 then + if EL2Enabled() && !ELUsingAArch32(EL2) && HCR_EL2.TRVM == '1' then + AArch64.SystemAccessTrap(EL2, 0x18); + elsif EL2Enabled() && !ELUsingAArch32(EL2) && HCR_EL2.<NV2,NV1,NV> == '111' then + return NVMem[0x138]; + else + return ESR_EL1; +elsif PSTATE.EL == EL2 then + if HCR_EL2.E2H == '1' then + return ESR_EL2; + else + return ESR_EL1; +elsif PSTATE.EL == EL3 then + return ESR_EL1; + + + + + + + + MSR ESR_EL1, <Xt> + + + + + + + + + + + + + + +if PSTATE.EL == EL0 then + UNDEFINED; +elsif PSTATE.EL == EL1 then + if EL2Enabled() && !ELUsingAArch32(EL2) && HCR_EL2.TVM == '1' then + AArch64.SystemAccessTrap(EL2, 0x18); + elsif EL2Enabled() && !ELUsingAArch32(EL2) && HCR_EL2.<NV2,NV1,NV> == '111' then + NVMem[0x138] = X[t]; + else + ESR_EL1 = X[t]; +elsif PSTATE.EL == EL2 then + if HCR_EL2.E2H == '1' then + ESR_EL2 = X[t]; + else + ESR_EL1 = X[t]; +elsif PSTATE.EL == EL3 then + ESR_EL1 = X[t]; + + + + + + + + MRS <Xt>, ESR_EL12 + + + + + + + + + + + + + + +if PSTATE.EL == EL0 then + UNDEFINED; +elsif PSTATE.EL == EL1 then + if EL2Enabled() && HCR_EL2.<NV2,NV1,NV> == '101' then + return NVMem[0x138]; + elsif EL2Enabled() && HCR_EL2.NV == '1' then + AArch64.SystemAccessTrap(EL2, 0x18); + else + UNDEFINED; +elsif PSTATE.EL == EL2 then + if EL2Enabled() && HCR_EL2.E2H == '1' then + return ESR_EL1; + else + UNDEFINED; +elsif PSTATE.EL == EL3 then + if EL2Enabled() && HCR_EL2.E2H == '1' then + return ESR_EL1; + else + UNDEFINED; + + + + + + + + MSR ESR_EL12, <Xt> + + + + + + + + + + + + + + +if PSTATE.EL == EL0 then + UNDEFINED; +elsif PSTATE.EL == EL1 then + if EL2Enabled() && HCR_EL2.<NV2,NV1,NV> == '101' then + NVMem[0x138] = X[t]; + elsif EL2Enabled() && HCR_EL2.NV == '1' then + AArch64.SystemAccessTrap(EL2, 0x18); + else + UNDEFINED; +elsif PSTATE.EL == EL2 then + if EL2Enabled() && HCR_EL2.E2H == '1' then + ESR_EL1 = X[t]; + else + UNDEFINED; +elsif PSTATE.EL == EL3 then + if EL2Enabled() && HCR_EL2.E2H == '1' then + ESR_EL1 = X[t]; + else + UNDEFINED; + + + + + + + + MRS <Xt>, ESR_EL2 + + + + + + + + + + + + + + +if PSTATE.EL == EL0 then + UNDEFINED; +elsif PSTATE.EL == EL1 then + if EL2Enabled() && HCR_EL2.<NV2,NV> == '11' then + return ESR_EL1; + elsif EL2Enabled() && HCR_EL2.NV == '1' then + AArch64.SystemAccessTrap(EL2, 0x18); + else + UNDEFINED; +elsif PSTATE.EL == EL2 then + return ESR_EL2; +elsif PSTATE.EL == EL3 then + return ESR_EL2; + + + + + + + + MSR ESR_EL2, <Xt> + + + + + + + + + + + + + + +if PSTATE.EL == EL0 then + UNDEFINED; +elsif PSTATE.EL == EL1 then + if EL2Enabled() && HCR_EL2.<NV2,NV> == '11' then + ESR_EL1 = X[t]; + elsif EL2Enabled() && HCR_EL2.NV == '1' then + AArch64.SystemAccessTrap(EL2, 0x18); + else + UNDEFINED; +elsif PSTATE.EL == EL2 then + ESR_EL2 = X[t]; +elsif PSTATE.EL == EL3 then + ESR_EL2 = X[t]; + + + + + + + + + + + + 27/03/2019 21:59; e5e4db499bf9867a4b93324c4dbac985d3da9376 + \ No newline at end of file diff --git a/tools/lldbmacros/turnstile.py b/tools/lldbmacros/turnstile.py index 372e13ec4..1f7c731d6 100755 --- a/tools/lldbmacros/turnstile.py +++ b/tools/lldbmacros/turnstile.py @@ -15,22 +15,24 @@ def GetTurnstileSummary(turnstile): type_and_gencount = Cast(addressof(turnstile.ts_type_gencount), 'union turnstile_type_gencount *') turnstile_type = "" - if type_and_gencount.ts_type == 0: + if type_and_gencount.ts_type == GetEnumValue('turnstile_type_t::TURNSTILE_NONE'): turnstile_type = "none " - elif type_and_gencount.ts_type == 1: + elif type_and_gencount.ts_type == GetEnumValue('turnstile_type_t::TURNSTILE_KERNEL_MUTEX'): turnstile_type = "knl_mtx" - elif type_and_gencount.ts_type == 2: + elif type_and_gencount.ts_type == GetEnumValue('turnstile_type_t::TURNSTILE_ULOCK'): turnstile_type = "ulock " - elif type_and_gencount.ts_type == 3: + elif type_and_gencount.ts_type == GetEnumValue('turnstile_type_t::TURNSTILE_PTHREAD_MUTEX'): turnstile_type = "pth_mtx" - elif type_and_gencount.ts_type == 4: + elif type_and_gencount.ts_type == GetEnumValue('turnstile_type_t::TURNSTILE_SYNC_IPC'): turnstile_type = "syn_ipc" - elif type_and_gencount.ts_type == 5: + elif type_and_gencount.ts_type == GetEnumValue('turnstile_type_t::TURNSTILE_WORKLOOPS'): turnstile_type = "kqwl " - elif type_and_gencount.ts_type == 6: + elif type_and_gencount.ts_type == GetEnumValue('turnstile_type_t::TURNSTILE_WORKQS'): turnstile_type = "workq " - elif type_and_gencount.ts_type == 7: + elif type_and_gencount.ts_type == GetEnumValue('turnstile_type_t::TURNSTILE_KNOTE'): turnstile_type = "knote " + elif type_and_gencount.ts_type == GetEnumValue('turnstile_type_t::TURNSTILE_SLEEP_INHERITOR'): + turnstile_type = "slp_inh" turnstile_state = "" if turnstile.ts_state & 0x1: @@ -144,4 +146,32 @@ def ShowAllTurnstiles(cmd_args=None, cmd_options={}): PrintTurnstile(turnstile) return True # EndMacro showallbusyturnstiles + +@lldb_command('showthreadbaseturnstiles', fancy=True) +def ShowThreadInheritorBase(cmd_args=None, cmd_options={}, O=None): + """ A DEVELOPMENT macro that walks the list of userspace turnstiles pushing on a thread + and prints them. + usage: (lldb) showthreadbaseturnstiles thread_pointer + """ + if not cmd_args: + return O.error('invalid thread pointer') + + thread = kern.GetValueFromAddress(cmd_args[0], "thread_t") + with O.table(GetTurnstileSummary.header): + for turnstile in IteratePriorityQueue(thread.base_inheritor_queue, 'struct turnstile', 'ts_inheritor_links'): + PrintTurnstile(turnstile) + +@lldb_command('showthreadschedturnstiles', fancy=True) +def ShowThreadInheritorSched(cmd_args=None, cmd_options={}, O=None): + """ A DEVELOPMENT macro that walks the list of kernelspace turnstiles pushing on a thread + and prints them. + usage: (lldb) showthreadschedturnstiles thread_pointer + """ + if not cmd_args: + return O.error('invalid thread pointer') + + thread = kern.GetValueFromAddress(cmd_args[0], "thread_t") + with O.table(GetTurnstileSummary.header): + for turnstile in IteratePriorityQueue(thread.sched_inheritor_queue, 'struct turnstile', 'ts_inheritor_links'): + PrintTurnstile(turnstile) #endif diff --git a/tools/lldbmacros/ulock.py b/tools/lldbmacros/ulock.py new file mode 100755 index 000000000..e2f9b45c2 --- /dev/null +++ b/tools/lldbmacros/ulock.py @@ -0,0 +1,45 @@ +from xnu import * +from scheduler import GetRecentTimestamp +import xnudefines + +ulock_types = { + 1: "COMPARE_AND_WAIT", + 2: "UNFAIR_LOCK", + 3: "UNFAIR_LOCK64_SHARED", + 4: "COMPARE_AND_WAIT64", + 5: "COMPARE_AND_WAIT64_SHARED" +} + +@header("{:<20s} {:<20s} {:<20s} {:<10s} {:<20s} {:<20s} {:<20s}".format( + 'ull_t', 'kind', 'addr/obj', 'pid/offs', 'owner', 'turnstile', 'waiters')) +def GetUlockSummary(ull): + code = int(ull.ull_opcode) + if ulock_types.has_key(code): + ull_type = ulock_types[code] + else: + ull_type = "{:#x}".format(code) + + s = "{ull: <#20x} {ull_type: <20s}".format(ull=ull, ull_type=ull_type) + ulk=ull.ull_key + if int(ulk.ulk_key_type) is 1: + s += " {ulk.ulk_addr: <#20x} {ulk.ulk_pid: <10d}".format(ulk=ulk) + elif int(ulk.ulk_key_type) is 2: + s += " {ulk.ulk_object: <#20x} {ulk.ulk_offset: <10d}".format(ulk=ulk) + else: + s += " {:<20s} {:<10s}".format("", "") + + return s + " {ull.ull_owner: <#20x} {ull.ull_turnstile: <#20x} {ull.ull_nwaiters: >7d}".format(ull=ull) + +@lldb_command('showallulocks', fancy=True) +def ShowAllUlocks(cmd_args=None, cmd_options={}, O=None): + """ Display a summary of all the ulocks in the system + + usage: showallulocks + """ + + with O.table(GetUlockSummary.header): + count = kern.globals.ull_hash_buckets; + buckets = kern.globals.ull_bucket + for i in xrange(0, count): + for ull in IterateLinkageChain(addressof(buckets[i].ulb_head), 'ull_t *', 'ull_hash_link'): + print GetUlockSummary(ull) diff --git a/tools/lldbmacros/userspace.py b/tools/lldbmacros/userspace.py index 3413fff96..f8844b3dc 100755 --- a/tools/lldbmacros/userspace.py +++ b/tools/lldbmacros/userspace.py @@ -77,6 +77,7 @@ def ShowX86UserStack(thread, user_lib_info = None): return def _PrintARMUserStack(task, cur_pc, cur_fp, framesize, frametype, frameformat, user_lib_info=None): + cur_pc = kern.StripUserPAC(cur_pc) if cur_pc == 0: "No valid user context for this activation." return @@ -87,6 +88,7 @@ def _PrintARMUserStack(task, cur_pc, cur_fp, framesize, frametype, frameformat, frame = GetUserDataAsString(task, cur_fp, framesize) cur_fp = _ExtractDataFromString(frame, 0, frametype) cur_pc = _ExtractDataFromString(frame, (framesize / 2), frametype) + cur_pc = kern.StripUserPAC(cur_pc) if not cur_fp: break print frameformat.format(frameno, cur_fp, cur_pc, GetBinaryNameForPC(cur_pc, user_lib_info)) @@ -864,6 +866,38 @@ def ShowOSMalloc(cmd_args=None): # EndMacro: showosmalloc +def SaveDataToFile(start_addr, length, outputfile, task=None,): + """ Save the data at the specified address (of the specified length) to the file. + params: start_addr : start address of the region of memory to save + length : length of the region of memory to save + outputfile : file to save the data in + task (optional) : task containing the memory region (if from user data) + returns: True if we saved the requested data, False otherwise + """ + if task: + memory_data = GetUserDataAsString(task, start_addr, length) + else: + data_ptr = kern.GetValueFromAddress(start_addr, 'uint8_t *') + if data_ptr == 0: + print "invalid kernel start address specified" + return False + memory_data = [] + for i in range(length): + memory_data.append(chr(data_ptr[i])) + if i % 50000 == 0: + print "%d of %d \r" % (i, length), + memory_data = ''.join(memory_data) + + if len(memory_data) != length: + print "Failed to read {:d} bytes from address {: <#020x}".format(length, start_addr) + return False + + fh = open(outputfile, 'w') + fh.write(memory_data) + fh.close() + print "Saved {:d} bytes to file {:s}".format(length, outputfile) + return True + @lldb_command('savekcdata', 'T:O:') def SaveKCDataToFile(cmd_args=None, cmd_options={}): @@ -891,28 +925,6 @@ def SaveKCDataToFile(cmd_args=None, cmd_options={}): if flags_copyout: if not task: raise ArgumentError('Invalid task pointer provided.') - memory_data = GetUserDataAsString(task, memory_begin_address, memory_size) + return SaveDataToFile(memory_begin_address, memory_size, outputfile, task) else: - data_ptr = kern.GetValueFromAddress(memory_begin_address, 'uint8_t *') - if data_ptr == 0: - print "Kcdata descriptor is NULL" - return False - memory_data = [] - for i in range(memory_size): - memory_data.append(chr(data_ptr[i])) - if i % 50000 == 0: - print "%d of %d \r" % (i, memory_size), - memory_data = ''.join(memory_data) - - if len(memory_data) != memory_size: - print "Failed to read {:d} bytes from address {: <#020x}".format(memory_size, memory_begin_address) - return False - - fh = open(outputfile, 'w') - fh.write(memory_data) - fh.close() - print "Saved {:d} bytes to file {:s}".format(memory_size, outputfile) - return True - - - + return SaveDataToFile(memory_begin_address, memory_size, outputfile, None) diff --git a/tools/lldbmacros/usertaskdebugging/userprocess.py b/tools/lldbmacros/usertaskdebugging/userprocess.py index 74e54223e..a4a9a61b2 100755 --- a/tools/lldbmacros/usertaskdebugging/userprocess.py +++ b/tools/lldbmacros/usertaskdebugging/userprocess.py @@ -10,10 +10,13 @@ CPU_TYPE_I386 = 0x00000007 CPU_TYPE_X86_64 = 0x01000007 CPU_TYPE_ARM = 0x0000000c CPU_TYPE_ARM64 = 0x0100000c +CPU_TYPE_ARM64_32 = 0x0200000c def GetRegisterSetForCPU(cputype, subtype): if cputype == CPU_TYPE_ARM64: retval = Armv8_RegisterSet + elif cputype == CPU_TYPE_ARM64_32: + retval = Armv8_RegisterSet elif cputype == CPU_TYPE_ARM: retval = Armv7_RegisterSet elif cputype == CPU_TYPE_I386: @@ -52,6 +55,9 @@ class UserThreadObject(object): self.saved_state = self.thread.machine.PcbData else: self.saved_state = self.thread.machine.contextData.ss.uss.ss_32 + if cputype == CPU_TYPE_ARM64_32: + self.reg_type = "arm64" + self.saved_state = self.thread.machine.upcb.uss.ss_64 logging.debug("created thread id 0x%x of type %s, is_kern_64bit 0x%x cputype 0x%x" % (self.thread_id, self.reg_type, is_kern_64bit, cputype)) @@ -101,8 +107,7 @@ class UserProcess(target.Process): if task.t_flags & 0x2: dataregisters64bit = True - is_kern_64bit = kern.arch in ['x86_64', 'x86_64h', 'arm64' - ] + is_kern_64bit = kern.arch in ['x86_64', 'x86_64h', 'arm64', 'arm64e'] self.cputype = unsigned(self.proc.p_cputype) self.cpusubtype = unsigned(self.proc.p_cpusubtype) diff --git a/tools/lldbmacros/utils.py b/tools/lldbmacros/utils.py index 33d601f8d..6039f2048 100755 --- a/tools/lldbmacros/utils.py +++ b/tools/lldbmacros/utils.py @@ -474,3 +474,12 @@ def print_hex_data(data, begin_offset=0, desc=""): def Ones(x): return (1 << x)-1 +def StripPAC(x, TySz): + sign_mask = 1 << 55 + ptr_mask = Ones(64-TySz) + pac_mask = ~ptr_mask + sign = x & sign_mask + if sign: + return (x | pac_mask) + 2**64 + else: + return x & ptr_mask diff --git a/tools/lldbmacros/workqueue.py b/tools/lldbmacros/workqueue.py index dae699f27..26bb400f7 100755 --- a/tools/lldbmacros/workqueue.py +++ b/tools/lldbmacros/workqueue.py @@ -68,17 +68,20 @@ def GetWQThreadSummary(th, uth): kqr = uth.uu_kqr_bound if not kqr: kq = 0 - elif kqr.kqr_state & 0x1: # workloop + elif kqr.tr_flags & 0x1: # kevent + kq = p.p_fd.fd_wqkqueue + kind = "kqwq[%s]" % (xnudefines.thread_qos_short_strings[int(kqr.tr_kq_qos_index)]) + elif kqr.tr_flags & 0x2: # workloop kq = ContainerOf(kqr, 'struct kqworkloop', 'kqwl_request') kind = "workloop" else: - kq = p.p_fd.fd_wqkqueue - kind = "kqwq[%s]" % (xnudefines.thread_qos_short_strings[int(kqr.kqr_qos_index)]) + kq = 0 + kind = "???" return "{th: <#020x} {uth: <#020x} {thport: >#010x} {kind: <9s} {kq: <#020x} {idle: <10s} {uu_workq_flags: <30s}".format(th=th, uth=uth, thport=uth.uu_workq_thport, kind=kind, kq=kq, idle=idle, uu_workq_flags=" ".join(uu_workq_flags)) -@header("{:<20s} {:<20s} {:<10s} {:<3s} {:<4s} {:<30s}".format( - 'request', 'kqueue', 'state', '#', 'qos', 'tr_flags')) +@header("{:<20s} {:<20s} {:<20s} {:<10s} {:<4s} {:<6s} {:<6s} {:<6s} {:<30s}".format( + 'request', 'kqueue', 'thread', 'state', '#', 'qos', 'kq_qos', 'kq_ovr', 'tr_flags')) def GetWorkqueueThreadRequestSummary(proc, req): kq = 0 tr_flags = [] @@ -88,12 +91,17 @@ def GetWorkqueueThreadRequestSummary(proc, req): kq = proc.p_fd.fd_wqkqueue if req.tr_flags & 0x02: tr_flags.append("WORKLOOP") - kq = ContainerOf(req, 'struct kqworkloop', 'kqwl_request.kqr_req') + kq = ContainerOf(req, 'struct kqworkloop', 'kqwl_request') if req.tr_flags & 0x04: tr_flags.append("OVERCOMMIT") if req.tr_flags & 0x08: tr_flags.append("PARAMS") if req.tr_flags & 0x10: tr_flags.append("OUTSIDE_QOS") - state = {0: "IDLE", 1: "NEW", 2: "QUEUED", 4: "BINDING" }[int(req.tr_state)] + state = {0: "IDLE", 1: "NEW", 2: "QUEUED", 3: "CANCELED", 4: "BINDING", 5: "BOUND" }[int(req.tr_state)] + if req.tr_kq_wakeup: state += "*" + + thread = 0 + if int(req.tr_state) in [3, 4]: + thread = req.tr_thread qos = int(req.tr_qos) if qos == 8: @@ -103,74 +111,80 @@ def GetWorkqueueThreadRequestSummary(proc, req): else: qos = xnudefines.thread_qos_short_strings[qos] - return "{req: <#020x} {kq: <#020x} {state: <10s} {req.tr_count: <3d} {qos: <4s} {tr_flags: <30s}".format(req=req, kq=kq, state=state, qos=qos, tr_flags=" ".join(tr_flags)) + kq_qos = xnudefines.thread_qos_short_strings[int(req.tr_kq_qos_index)] + kq_ovr = xnudefines.thread_qos_short_strings[int(req.tr_kq_override_index)] + req_addr = unsigned(addressof(req)) -@lldb_command('showwqthread') -def ShowWQThread(cmd_args=None): + return "{req_addr: <#020x} {kq: <#020x} {thread: <#020x} {state: <10s} {req.tr_count: <4d} {qos: <6s} {kq_qos: <6s} {kq_ovr: <6s} {tr_flags: <30s}".format( + req_addr=req_addr, req=req, kq=kq, thread=thread, state=state, qos=qos, kq_qos=kq_qos, kq_ovr=kq_ovr, tr_flags=" ".join(tr_flags)) + +@lldb_command('showwqthread', fancy=True) +def ShowWQThread(cmd_args=None, cmd_options={}, O=None): """ Shows info about a workqueue thread usage: showworkqthread """ if not cmd_args: - raise ArgumentError('missing struct proc * argument') + return O.error('missing struct proc * argument') th = kern.GetValueFromAddress(cmd_args[0], "struct thread *") if not (th.thread_tag & 0x20): raise ArgumentError('not a workqueue thread') - print GetWQThreadSummary.header - print GetWQThreadSummary(th, Cast(th.uthread, 'struct uthread *')) + with O.table(GetWQThreadSummary.header): + print GetWQThreadSummary(th, Cast(th.uthread, 'struct uthread *')) -@lldb_command('showprocworkqueue') -def ShowProcWorkqueue(cmd_args=None): +@lldb_command('showprocworkqueue', fancy=True) +def ShowProcWorkqueue(cmd_args=None, cmd_options={}, O=None): """ Shows the process workqueue usage: showprocworkqueue """ if not cmd_args: - raise ArgumentError('missing struct proc * argument') + return O.error('missing struct proc * argument') proc = kern.GetValueFromAddress(cmd_args[0], "proc_t") wq = Cast(proc.p_wqptr, "struct workqueue *"); - if wq: - print GetWorkqueueSummary.header + if not wq: + return O.error("{:#x} doesn't have a workqueue", proc) + + with O.table(GetWorkqueueSummary.header): print GetWorkqueueSummary(proc, wq) - if wq.wq_reqcount: - print " " - print " " + GetWorkqueueThreadRequestSummary.header + with O.table(GetWorkqueueThreadRequestSummary.header, indent=True): + if wq.wq_reqcount: + print "" if wq.wq_event_manager_threadreq: - print " " + GetWorkqueueThreadRequestSummary(proc, wq.wq_event_manager_threadreq) - for req in IteratePriorityQueueEntry(wq.wq_overcommit_queue, 'struct workq_threadreq_s', 'tr_entry'): - print " " + GetWorkqueueThreadRequestSummary(proc, req) - for req in IteratePriorityQueueEntry(wq.wq_constrained_queue, 'struct workq_threadreq_s', 'tr_entry'): - print " " + GetWorkqueueThreadRequestSummary(proc, req) - for req in IteratePriorityQueueEntry(wq.wq_special_queue, 'struct workq_threadreq_s', 'tr_entry'): - print " " + GetWorkqueueThreadRequestSummary(proc, req) - - print " " - print " " + GetWQThreadSummary.header - for uth in IterateTAILQ_HEAD(wq.wq_thrunlist, "uu_workq_entry"): - print " " + GetWQThreadSummary(Cast(uth.uu_thread, 'struct thread *'), uth) - for uth in IterateTAILQ_HEAD(wq.wq_thidlelist, "uu_workq_entry"): - print " " + GetWQThreadSummary(Cast(uth.uu_thread, 'struct thread *'), uth) - for uth in IterateTAILQ_HEAD(wq.wq_thnewlist, "uu_workq_entry"): - print " " + GetWQThreadSummary(Cast(uth.uu_thread, 'struct thread *'), uth) - -@lldb_command('showallworkqueues') -def ShowAllWorkqueues(cmd_args=None): + print GetWorkqueueThreadRequestSummary(proc, wq.wq_event_manager_threadreq) + for req in IteratePriorityQueue(wq.wq_overcommit_queue, 'struct workq_threadreq_s', 'tr_entry'): + print GetWorkqueueThreadRequestSummary(proc, req) + for req in IteratePriorityQueue(wq.wq_constrained_queue, 'struct workq_threadreq_s', 'tr_entry'): + print GetWorkqueueThreadRequestSummary(proc, req) + for req in IteratePriorityQueue(wq.wq_special_queue, 'struct workq_threadreq_s', 'tr_entry'): + print GetWorkqueueThreadRequestSummary(proc, req) + + with O.table(GetWQThreadSummary.header, indent=True): + print "" + for uth in IterateTAILQ_HEAD(wq.wq_thrunlist, "uu_workq_entry"): + print GetWQThreadSummary(Cast(uth.uu_thread, 'struct thread *'), uth) + for uth in IterateTAILQ_HEAD(wq.wq_thidlelist, "uu_workq_entry"): + print GetWQThreadSummary(Cast(uth.uu_thread, 'struct thread *'), uth) + for uth in IterateTAILQ_HEAD(wq.wq_thnewlist, "uu_workq_entry"): + print GetWQThreadSummary(Cast(uth.uu_thread, 'struct thread *'), uth) + +@lldb_command('showallworkqueues', fancy=True) +def ShowAllWorkqueues(cmd_args=None, cmd_options={}, O=None): """ Display a summary of all the workqueues in the system usage: showallworkqueues """ - print GetWorkqueueSummary.header - - for t in kern.tasks: - proc = Cast(t.bsd_info, 'proc *') - wq = Cast(proc.p_wqptr, "struct workqueue *"); - if wq: - print GetWorkqueueSummary(proc, wq) + with O.table(GetWorkqueueSummary.header): + for t in kern.tasks: + proc = Cast(t.bsd_info, 'proc *') + wq = Cast(proc.p_wqptr, "struct workqueue *"); + if wq: + print GetWorkqueueSummary(proc, wq) diff --git a/tools/lldbmacros/xnu.py b/tools/lldbmacros/xnu.py index 688001678..7ec9ca7c8 100755 --- a/tools/lldbmacros/xnu.py +++ b/tools/lldbmacros/xnu.py @@ -21,6 +21,8 @@ MODULE_NAME=__name__ COMMON_HELP_STRING = """ -h Show the help string for the command. + -c [always|auto|never|0|1] + Control the colorized output of certain commands -o The output of this command execution will be saved to file. Parser information or errors will not be sent to file though. eg /tmp/output.txt -s The "filter_string" param is parsed to python regex expression and each line of output @@ -45,11 +47,11 @@ def header(initial_value): return obj return _set_header -# holds type declarations done by xnu. +# holds type declarations done by xnu. #DONOTTOUCHME: Exclusive use of lldb_type_summary only. -lldb_summary_definitions = {} +lldb_summary_definitions = {} def lldb_type_summary(types_list): - """ A function decorator to register a summary for a type in lldb. + """ A function decorator to register a summary for a type in lldb. params: types_list - [] an array of types that you wish to register a summary callback function. (ex. ['task *', 'task_t']) returns: Nothing. This is a decorator. """ @@ -60,13 +62,13 @@ def lldb_type_summary(types_list): out_string += "\n" + obj.header +"\n" out_string += obj( core.value(lldbval) ) return out_string - + myglobals = globals() summary_function_name = "LLDBSummary" + obj.__name__ myglobals[summary_function_name] = _internal_summary_function summary_function = myglobals[summary_function_name] summary_function.__doc__ = obj.__doc__ - + global lldb_summary_definitions for single_type in types_list: if config['showTypeSummary']: @@ -74,19 +76,20 @@ def lldb_type_summary(types_list): lldb.debugger.HandleCommand("type summary delete --category kernel \""+ single_type + "\"") lldb.debugger.HandleCommand("type summary add \""+ single_type +"\" --category kernel --python-function " + MODULE_NAME + "." + summary_function_name) lldb_summary_definitions[single_type] = obj - + return obj return _get_summary -#global cache of documentation for lldb commands exported by this module +#global cache of documentation for lldb commands exported by this module #DONOTTOUCHME: Exclusive use of lldb_command only. lldb_command_documentation = {} -def lldb_command(cmd_name, option_string = ''): +def lldb_command(cmd_name, option_string = '', fancy=False): """ A function decorator to define a command with namd 'cmd_name' in the lldb scope to call python function. params: cmd_name - str : name of command to be set in lldb prompt. - option_string - str: getopt like option string. Only CAPITAL LETTER options allowed. + option_string - str: getopt like option string. Only CAPITAL LETTER options allowed. see README on Customizing command options. + fancy - bool : whether the command will receive an 'O' object to do fancy output (tables, indent, color) """ if option_string != option_string.upper(): raise RuntimeError("Cannot setup command with lowercase option args. %s" % option_string) @@ -104,16 +107,18 @@ def lldb_command(cmd_name, option_string = ''): command_args = shlex.split(command) lldb.debugger.HandleCommand('type category disable kernel' ) def_verbose_level = config['verbosity'] - + try: stream.setOptions(command_args, option_string) if stream.verbose_level != 0: - config['verbosity'] += stream.verbose_level + config['verbosity'] += stream.verbose_level with RedirectStdStreams(stdout=stream) : + args = { 'cmd_args': stream.target_cmd_args } if option_string: - obj(cmd_args=stream.target_cmd_args, cmd_options=stream.target_cmd_options) - else: - obj(cmd_args=stream.target_cmd_args) + args['cmd_options'] = stream.target_cmd_options + if fancy: + args['O'] = stream + obj(**args) except KeyboardInterrupt: print "Execution interrupted by user" except ArgumentError as arg_error: @@ -133,7 +138,7 @@ However, it is recommended that you report the exception to lldb/kernel debuggin if config['showTypeSummary']: lldb.debugger.HandleCommand('type category enable kernel' ) - + if stream.pluginRequired : plugin = LoadXNUPlugin(stream.pluginName) if plugin == None : @@ -143,10 +148,10 @@ However, it is recommended that you report the exception to lldb/kernel debuggin return_data = plugin.plugin_execute(cmd_name, result.GetOutput()) ProcessXNUPluginResult(return_data) plugin.plugin_cleanup() - + #restore the verbose level after command is complete config['verbosity'] = def_verbose_level - + return myglobals = globals() @@ -163,14 +168,24 @@ However, it is recommended that you report the exception to lldb/kernel debuggin lldb.debugger.HandleCommand("command script delete "+cmd_name) lldb_command_documentation[cmd_name] = (obj.__name__, obj.__doc__.lstrip(), option_string) lldb.debugger.HandleCommand("command script add -f " + MODULE_NAME + "." + command_function_name + " " + cmd_name) + + if fancy: + def wrapped_fun(cmd_args=None, cmd_options={}, O=None): + if O is None: + stream = CommandOutput(cmd_name, fhandle=sys.stdout) + with RedirectStdStreams(stdout=stream): + return obj(cmd_args, cmd_options, stream) + else: + return obj(cmd_args, cmd_options, O) + return wrapped_fun return obj return _cmd def lldb_alias(alias_name, cmd_line): - """ define an alias in the lldb command line. + """ define an alias in the lldb command line. A programatic way of registering an alias. This basically does (lldb)command alias alias_name "cmd_line" - ex. + ex. lldb_alias('readphys16', 'readphys 16') """ alias_name = alias_name.strip() @@ -194,7 +209,7 @@ def SetupLLDBTypeSummaries(reset=False): return def LoadXNUPlugin(name): - """ Try to load a plugin from the plugins directory. + """ Try to load a plugin from the plugins directory. """ retval = None name=name.strip() @@ -208,7 +223,7 @@ def LoadXNUPlugin(name): print "Plugin is not correctly implemented. Please read documentation on implementing plugins" except: print "plugin not found :"+name - + return retval def ProcessXNUPluginResult(result_data): @@ -218,7 +233,7 @@ def ProcessXNUPluginResult(result_data): ret_status = result_data[0] ret_string = result_data[1] ret_commands = result_data[2] - + if ret_status == False: print "Plugin failed: " + ret_string return @@ -233,15 +248,15 @@ def ProcessXNUPluginResult(result_data): #DONOTTOUCHME: Exclusive use of xnudebug_test only lldb_command_tests = {} def xnudebug_test(test_name): - """ A function decoratore to register a test with the framework. Each test is supposed to be of format + """ A function decoratore to register a test with the framework. Each test is supposed to be of format def Test(kernel_target, config, lldb_obj, isConnected ) - + NOTE: The testname should start with "Test" else exception will be raised. """ def _test(obj): global lldb_command_tests if obj.__name__.find("Test") != 0 : - print "Test name ", obj.__name__ , " should start with Test" + print "Test name ", obj.__name__ , " should start with Test" raise ValueError lldb_command_tests[test_name] = (test_name, obj.__name__, obj, obj.__doc__) return obj @@ -249,14 +264,14 @@ def xnudebug_test(test_name): # End Debugging specific utility functions -# Kernel Debugging specific classes and accessor methods +# Kernel Debugging specific classes and accessor methods # global access object for target kernel def GetObjectAtIndexFromArray(array_base, index): """ Subscript indexing for arrays that are represented in C as pointers. for ex. int *arr = malloc(20*sizeof(int)); - now to get 3rd int from 'arr' you'd do + now to get 3rd int from 'arr' you'd do arr[2] in C GetObjectAtIndexFromArray(arr_val,2) params: @@ -278,8 +293,8 @@ kern = None def GetLLDBThreadForKernelThread(thread_obj): """ Get a reference to lldb.SBThread representation for kernel thread. params: - thread_obj : core.cvalue - thread object of type thread_t - returns + thread_obj : core.cvalue - thread object of type thread_t + returns lldb.SBThread - lldb thread object for getting backtrace/registers etc. """ tid = unsigned(thread_obj.thread_id) @@ -369,10 +384,10 @@ def GetThreadBackTrace(thread_obj, verbosity = vHUMAN, prefix = ""): if not function: # No debug info for 'function'. - out_string += prefix + out_string += prefix if not is_continuation: - out_string += "{fp:#018x} ".format(fp = frame_p) - + out_string += "{fp:#018x} ".format(fp = frame_p) + symbol = frame.GetSymbol() if not symbol: out_string += GetKextSymbolInfo(load_addr) @@ -381,7 +396,7 @@ def GetThreadBackTrace(thread_obj, verbosity = vHUMAN, prefix = ""): start_addr = symbol.GetStartAddress().GetFileAddress() symbol_name = symbol.GetName() symbol_offset = file_addr - start_addr - out_string += "{addr:#018x} {mod}`{symbol} + {offset:#x} \n".format(addr=load_addr, + out_string += "{addr:#018x} {mod}`{symbol} + {offset:#x} \n".format(addr=load_addr, mod=mod_name, symbol=symbol_name, offset=symbol_offset) else: # Debug info is available for 'function'. @@ -391,15 +406,15 @@ def GetThreadBackTrace(thread_obj, verbosity = vHUMAN, prefix = ""): func_name = '%s [inlined]' % func_name if frame.IsInlined() else func_name if is_continuation and frame.IsInlined(): debuglog("Skipping frame for thread {:#018x} since its inlined".format(thread_obj)) - continue - out_string += prefix + continue + out_string += prefix if not is_continuation: out_string += "{fp:#018x} ".format(fp=frame_p) out_string += "{addr:#018x} {func}{args} \n".format(addr=load_addr, func=func_name, file=file_name, line=line_num, args="(" + (str(frame.arguments).replace("\n", ", ") if len(frame.arguments) > 0 else "void") + ")") - iteration += 1 + iteration += 1 if frame_p: last_frame_p = frame_p @@ -409,9 +424,9 @@ def GetThreadBackTrace(thread_obj, verbosity = vHUMAN, prefix = ""): return out_string def GetSourceInformationForAddress(addr): - """ convert and address to function +offset information. + """ convert and address to function +offset information. params: addr - int address in the binary to be symbolicated - returns: string of format "0xaddress: function + offset" + returns: string of format "0xaddress: function + offset" """ symbols = kern.SymbolicateFromAddress(addr) format_string = "{0:#018x} <{1:s} + {2:#0x}>" @@ -429,7 +444,7 @@ def GetFrameLocalVariable(variable_name, frame_no=0): """ Find a local variable by name params: variable_name: str - name of variable to search for - returns: + returns: core.value - if the variable is found. None - if not found or not Valid """ @@ -466,16 +481,16 @@ def KernelDebugCommandsHelp(cmd_args=None): return None -@lldb_command('showraw') +@lldb_command('showraw') def ShowRawCommand(cmd_args=None): - """ A command to disable the kernel summaries and show data as seen by the system. + """ A command to disable the kernel summaries and show data as seen by the system. This is useful when trying to read every field of a struct as compared to brief summary """ command = " ".join(cmd_args) lldb.debugger.HandleCommand('type category disable kernel' ) lldb.debugger.HandleCommand( command ) lldb.debugger.HandleCommand('type category enable kernel' ) - + @lldb_command('xnudebug') def XnuDebugCommand(cmd_args=None): @@ -537,18 +552,18 @@ def XnuDebugCommand(cmd_args=None): if test_name in lldb_command_tests: test = lldb_command_tests[test_name] print "Running test {:s}".format(test[0]) - if test[2](kern, config, lldb, True) : + if test[2](kern, config, lldb, True) : print "[PASSED] {:s}".format(test[0]) else: print "[FAILED] {:s}".format(test[0]) - return "" + return "" else: print "No such test registered with name: {:s}".format(test_name) print "XNUDEBUG Available tests are:" for i in lldb_command_tests.keys(): print i return None - + return False @lldb_command('showversion') @@ -564,83 +579,68 @@ def ShowVersion(cmd_args=None): """ print kern.version - -@lldb_command('paniclog', 'S') -def ShowPanicLog(cmd_args=None, cmd_options={}): - """ Display the paniclog information - usage: (lldb) paniclog - options: - -v : increase verbosity - -S : parse stackshot data (if panic stackshot available) +def ProcessPanicStackshot(panic_stackshot_addr, panic_stackshot_len): + """ Process the panic stackshot from the panic header, saving it to a file if it is valid + params: panic_stackshot_addr : start address of the panic stackshot binary data + panic_stackshot_len : length of the stackshot binary data + returns: nothing """ + if not panic_stackshot_addr: + print "No panic stackshot available (invalid addr)" + return - if "-S" in cmd_options: - if hasattr(kern.globals, "kc_panic_data"): - stackshot_saved = False - # TODO: Update logic to handle "in-memory" panic stackshot on Gibraltar platforms - # once we drop support for the on disk one there. - if kern.arch == 'x86_64': - if kern.globals.panic_stackshot_len != 0: - stackshot_saved = True - else: - print "No panic stackshot available" - else: - if unsigned(kern.globals.panic_info.eph_panic_flags) & xnudefines.EMBEDDED_PANIC_STACKSHOT_SUCCEEDED_FLAG: - stackshot_saved = True - else: - print "No panic stackshot available" - if stackshot_saved: - kc_data = unsigned(addressof(kern.globals.kc_panic_data)) - ts = int(time.time()) - ss_binfile = "/tmp/panic_%d.bin" % ts - ss_ipsfile = "/tmp/stacks_%d.ips" % ts - print "savekcdata 0x%x -O %s" % (kc_data, ss_binfile) - SaveKCDataToFile(["0x%x" % kc_data], {"-O":ss_binfile}) - self_path = str(__file__) - base_dir_name = self_path[:self_path.rfind("/")] - print "python %s/kcdata.py %s -s %s" % (base_dir_name, ss_binfile, ss_ipsfile) - (c,so,se) = RunShellCommand("python %s/kcdata.py %s -s %s" % (base_dir_name, ss_binfile, ss_ipsfile)) - if c == 0: - print "Saved ips stackshot file as %s" % ss_ipsfile - else: - print "Failed to run command: exit code: %d, SO: %s SE: %s" % (c, so, se) - else: - print "kc_panic_data is unavailable for this kernel config." + if not panic_stackshot_len: + print "No panic stackshot available (zero length)" + return; - out_str = "" - warn_str = "" + ts = int(time.time()) + ss_binfile = "/tmp/panic_%d.bin" % ts + ss_ipsfile = "/tmp/stacks_%d.ips" % ts - if kern.arch == 'x86_64': - panic_buf = Cast(kern.globals.panic_info, 'char *') - panic_log_magic = unsigned(kern.globals.panic_info.mph_magic) - panic_log_begin_offset = unsigned(kern.globals.panic_info.mph_panic_log_offset) - panic_log_len = unsigned(kern.globals.panic_info.mph_panic_log_len) - other_log_begin_offset = unsigned(kern.globals.panic_info.mph_other_log_offset) - other_log_len = unsigned(kern.globals.panic_info.mph_other_log_len) - cur_debug_buf_ptr_offset = (unsigned(kern.globals.debug_buf_ptr) - unsigned(kern.globals.panic_info)) - if other_log_begin_offset != 0 and (other_log_len == 0 or other_log_len < (cur_debug_buf_ptr_offset - other_log_begin_offset)): - other_log_len = cur_debug_buf_ptr_offset - other_log_begin_offset - expected_panic_magic = xnudefines.MACOS_PANIC_MAGIC + if not SaveDataToFile(panic_stackshot_addr, panic_stackshot_len, ss_binfile, None): + print "Failed to save stackshot binary data to file" + return + + self_path = str(__file__) + base_dir_name = self_path[:self_path.rfind("/")] + print "python %s/kcdata.py %s -s %s" % (base_dir_name, ss_binfile, ss_ipsfile) + (c,so,se) = RunShellCommand("python %s/kcdata.py %s -s %s" % (base_dir_name, ss_binfile, ss_ipsfile)) + if c == 0: + print "Saved ips stackshot file as %s" % ss_ipsfile + return else: - panic_buf = Cast(kern.globals.panic_info, 'char *') - panic_log_magic = unsigned(kern.globals.panic_info.eph_magic) - panic_log_begin_offset = unsigned(kern.globals.panic_info.eph_panic_log_offset) - panic_log_len = unsigned(kern.globals.panic_info.eph_panic_log_len) - other_log_begin_offset = unsigned(kern.globals.panic_info.eph_other_log_offset) - other_log_len = unsigned(kern.globals.panic_info.eph_other_log_len) - expected_panic_magic = xnudefines.EMBEDDED_PANIC_MAGIC - - if panic_log_begin_offset == 0: + print "Failed to run command: exit code: %d, SO: %s SE: %s" % (c, so, se) return +def ParseEmbeddedPanicLog(panic_header, cmd_options={}): + panic_buf = Cast(panic_header, 'char *') + panic_log_magic = unsigned(panic_header.eph_magic) + panic_log_begin_offset = unsigned(panic_header.eph_panic_log_offset) + panic_log_len = unsigned(panic_header.eph_panic_log_len) + other_log_begin_offset = unsigned(panic_header.eph_other_log_offset) + other_log_len = unsigned(panic_header.eph_other_log_len) + expected_panic_magic = xnudefines.EMBEDDED_PANIC_MAGIC + panic_stackshot_addr = unsigned(panic_header) + unsigned(panic_header.eph_stackshot_offset) + panic_stackshot_len = unsigned(panic_header.eph_stackshot_len) + panic_header_flags = unsigned(panic_header.eph_panic_flags) + + warn_str = "" + out_str = "" + if panic_log_magic != 0 and panic_log_magic != expected_panic_magic: - warn_str += "BAD MAGIC! Found 0x%x expected 0x%x".format(panic_log_magic, + warn_str += "BAD MAGIC! Found 0x%x expected 0x%x" % (panic_log_magic, expected_panic_magic) - if panic_log_begin_offset == 0: - if warn_str: - print "\n %s" % warn_str - return + if warn_str: + print "\n %s" % warn_str + if panic_log_begin_offset == 0: + return + + if "-S" in cmd_options: + if panic_header_flags & xnudefines.EMBEDDED_PANIC_STACKSHOT_SUCCEEDED_FLAG: + ProcessPanicStackshot(panic_stackshot_addr, panic_stackshot_len) + else: + print "No panic stackshot available" panic_log_curindex = 0 while panic_log_curindex < panic_log_len: @@ -656,12 +656,198 @@ def ShowPanicLog(cmd_args=None, cmd_options={}): other_log_curindex += 1 print out_str + return + +def ParseMacOSPanicLog(panic_header, cmd_options={}): + panic_buf = Cast(panic_header, 'char *') + panic_log_magic = unsigned(panic_header.mph_magic) + panic_log_begin_offset = unsigned(panic_header.mph_panic_log_offset) + panic_log_len = unsigned(panic_header.mph_panic_log_len) + other_log_begin_offset = unsigned(panic_header.mph_other_log_offset) + other_log_len = unsigned(panic_header.mph_other_log_len) + cur_debug_buf_ptr_offset = (unsigned(kern.globals.debug_buf_ptr) - unsigned(panic_header)) + if other_log_begin_offset != 0 and (other_log_len == 0 or other_log_len < (cur_debug_buf_ptr_offset - other_log_begin_offset)): + other_log_len = cur_debug_buf_ptr_offset - other_log_begin_offset + expected_panic_magic = xnudefines.MACOS_PANIC_MAGIC + panic_stackshot_addr = unsigned(panic_header) + unsigned(panic_header.mph_stackshot_offset) + panic_stackshot_len = unsigned(panic_header.mph_stackshot_len) + panic_header_flags = unsigned(panic_header.mph_panic_flags) + + warn_str = "" + out_str = "" + + if panic_log_magic != 0 and panic_log_magic != expected_panic_magic: + warn_str += "BAD MAGIC! Found 0x%x expected 0x%x" % (panic_log_magic, + expected_panic_magic) if warn_str: print "\n %s" % warn_str + if panic_log_begin_offset == 0: + return + + if "-S" in cmd_options: + if panic_header_flags & xnudefines.MACOS_PANIC_STACKSHOT_SUCCEEDED_FLAG: + ProcessPanicStackshot(panic_stackshot_addr, panic_stackshot_len) + else: + print "No panic stackshot available" + + panic_log_curindex = 0 + while panic_log_curindex < panic_log_len: + p_char = str(panic_buf[(panic_log_begin_offset + panic_log_curindex)]) + out_str += p_char + panic_log_curindex += 1 + + if other_log_begin_offset != 0: + other_log_curindex = 0 + while other_log_curindex < other_log_len: + p_char = str(panic_buf[(other_log_begin_offset + other_log_curindex)]) + out_str += p_char + other_log_curindex += 1 + + print out_str + return + +def ParseAURRPanicLog(panic_header, cmd_options={}): + reset_cause = { + 0x0: "OTHER", + 0x1: "CATERR", + 0x2: "SWD_TIMEOUT", + 0x3: "GLOBAL RESET", + 0x4: "STRAIGHT TO S5", + } + + expected_panic_magic = xnudefines.AURR_PANIC_MAGIC + + panic_buf = Cast(panic_header, 'char *') + + try: + # This line will blow up if there's not type info for this struct (older kernel) + # We fall back to manual parsing below + aurr_panic_header = Cast(panic_header, 'struct efi_aurr_panic_header *') + panic_log_magic = unsigned(aurr_panic_header.efi_aurr_magic) + panic_log_version = unsigned(aurr_panic_header.efi_aurr_version) + panic_log_reset_cause = unsigned(aurr_panic_header.efi_aurr_reset_cause) + panic_log_reset_log_offset = unsigned(aurr_panic_header.efi_aurr_reset_log_offset) + panic_log_reset_log_len = unsigned(aurr_panic_header.efi_aurr_reset_log_len) + except Exception as e: + print "*** Warning: kernel symbol file has no type information for 'struct efi_aurr_panic_header'..." + print "*** Warning: trying to manually parse..." + aurr_panic_header = Cast(panic_header, "uint32_t *") + panic_log_magic = unsigned(aurr_panic_header[0]) + # panic_log_crc = unsigned(aurr_panic_header[1]) + panic_log_version = unsigned(aurr_panic_header[2]) + panic_log_reset_cause = unsigned(aurr_panic_header[3]) + panic_log_reset_log_offset = unsigned(aurr_panic_header[4]) + panic_log_reset_log_len = unsigned(aurr_panic_header[5]) + + if panic_log_magic != 0 and panic_log_magic != expected_panic_magic: + print "BAD MAGIC! Found 0x%x expected 0x%x" % (panic_log_magic, + expected_panic_magic) + return + + print "AURR Panic Version: %d" % (panic_log_version) + + # When it comes time to extend this in the future, please follow the + # construct used below in ShowPanicLog() + if panic_log_version in (xnudefines.AURR_PANIC_VERSION, xnudefines.AURR_CRASHLOG_PANIC_VERSION): + # AURR Report Version 1 (AURR/MacEFI) or 2 (Crashlog) + # see macefifirmware/Vendor/Apple/EfiPkg/AppleDebugSupport/Library/Debugger.h + print "Reset Cause: 0x%x (%s)" % (panic_log_reset_cause, reset_cause.get(panic_log_reset_cause, "UNKNOWN")) + + # Adjust panic log string length (cap to maximum supported values) + if panic_log_version == xnudefines.AURR_PANIC_VERSION: + max_string_len = panic_log_reset_log_len and min(panic_log_reset_log_len, xnudefines.AURR_PANIC_STRING_LEN) or 0 + elif panic_log_version == xnudefines.AURR_CRASHLOG_PANIC_VERSION: + max_string_len = xnudefines.CRASHLOG_PANIC_STRING_LEN + + panic_str_offset = 0 + out_str = "" + + while panic_str_offset < max_string_len: + p_char = str(panic_buf[panic_log_reset_log_offset + panic_str_offset]) + out_str += p_char + panic_str_offset += 1 + + print out_str + # Save Crashlog Binary Data (if available) + if "-S" in cmd_options and panic_log_version == xnudefines.AURR_CRASHLOG_PANIC_VERSION: + crashlog_binary_offset = panic_log_reset_log_offset + xnudefines.CRASHLOG_PANIC_STRING_LEN + crashlog_binary_size = (panic_log_reset_log_len > xnudefines.CRASHLOG_PANIC_STRING_LEN) and (panic_log_reset_log_len - xnudefines.CRASHLOG_PANIC_STRING_LEN) or 0 + + if 0 == crashlog_binary_size: + print "No crashlog data found..." + return + + # Save to file + ts = int(time.time()) + ss_binfile = "/tmp/crashlog_%d.bin" % ts + + if not SaveDataToFile(panic_buf + crashlog_binary_offset, crashlog_binary_size, ss_binfile, None): + print "Failed to save crashlog binary data to file" + return + else: + return ParseUnknownPanicLog(panic_header, cmd_options) + + return + +def ParseUnknownPanicLog(panic_header, cmd_options={}): + magic_ptr = Cast(panic_header, 'uint32_t *') + panic_log_magic = dereference(magic_ptr) + print "Unrecognized panic header format. Magic: 0x%x..." % unsigned(panic_log_magic) + print "Panic region starts at 0x%08x" % int(panic_header) + print "Hint: To dump this panic header in order to try manually parsing it, use this command:" + print " (lldb) memory read -fx -s4 -c64 0x%08x" % int(panic_header) + print " ^ that will dump the first 256 bytes of the panic region" + ## TBD: Hexdump some bits here to allow folks to poke at the region manually? return + +@lldb_command('paniclog', 'SM') +def ShowPanicLog(cmd_args=None, cmd_options={}): + """ Display the paniclog information + usage: (lldb) paniclog + options: + -v : increase verbosity + -S : parse stackshot data (if panic stackshot available) + -M : parse macOS panic area (print panic string (if available), and/or capture crashlog info) + """ + + if "-M" in cmd_options: + if not hasattr(kern.globals, "mac_panic_header"): + print "macOS panic data requested but unavailable on this device" + return + panic_header = kern.globals.mac_panic_header + # DEBUG HACK FOR TESTING + #panic_header = kern.GetValueFromAddress(0xfffffff054098000, "uint32_t *") + else: + panic_header = kern.globals.panic_info + + if hasattr(panic_header, "eph_magic"): + panic_log_magic = unsigned(panic_header.eph_magic) + elif hasattr(panic_header, "mph_magic"): + panic_log_magic = unsigned(panic_header.mph_magic) + else: + print "*** Warning: unsure of panic header format, trying anyway" + magic_ptr = Cast(panic_header, 'uint32_t *') + panic_log_magic = int(dereference(magic_ptr)) + + if panic_log_magic == 0: + # No panic here.. + return + + panic_parsers = { + int(xnudefines.AURR_PANIC_MAGIC) : ParseAURRPanicLog, + int(xnudefines.MACOS_PANIC_MAGIC) : ParseMacOSPanicLog, + int(xnudefines.EMBEDDED_PANIC_MAGIC) : ParseEmbeddedPanicLog, + } + + # Find the right parser (fall back to unknown parser above) + parser = panic_parsers.get(panic_log_magic, ParseUnknownPanicLog) + + # execute it + return parser(panic_header, cmd_options) + @lldb_command('showbootargs') def ShowBootArgs(cmd_args=None): """ Display boot arguments passed to the target kernel @@ -672,7 +858,7 @@ def ShowBootArgs(cmd_args=None): @static_var("last_process_uniq_id", 1) def GetDebuggerStopIDValue(): - """ Create a unique session identifier. + """ Create a unique session identifier. returns: int - a unique number identified by processid and stopid. """ @@ -689,7 +875,7 @@ def GetDebuggerStopIDValue(): GetDebuggerStopIDValue.last_process_uniq_id +=1 proc_uniq_id = GetDebuggerStopIDValue.last_process_uniq_id + 1 - stop_id_str = "{:d}:{:d}".format(proc_uniq_id, stop_id) + stop_id_str = "{:d}:{:d}".format(proc_uniq_id, stop_id) return hash(stop_id_str) # The initialization code to add your commands @@ -703,6 +889,11 @@ def __lldb_init_module(debugger, internal_dict): debugger.HandleCommand('type summary add --regex --summary-string "${var%s}" -C yes -p -v "char \[[0-9]*\]"') debugger.HandleCommand('type format add --format hex -C yes uintptr_t') kern = KernelTarget(debugger) + if not hasattr(lldb.SBValue, 'GetValueAsAddress'): + warn_str = "WARNING: lldb version is too old. Some commands may break. Please update to latest lldb." + if os.isatty(sys.__stdout__.fileno()): + warn_str = VT.DarkRed + warn_str + VT.Default + print warn_str print "xnu debug macros loaded successfully. Run showlldbtypesummaries to enable type summaries." __lldb_init_module(lldb.debugger, None) @@ -729,11 +920,11 @@ def ShowLLDBTypeSummaries(cmd_args=[]): @lldb_command('walkqueue_head', 'S') def WalkQueueHead(cmd_args=[], cmd_options={}): - """ walk a queue_head_t and list all members in it. Note this is for queue_head_t. refer to osfmk/kern/queue.h + """ walk a queue_head_t and list all members in it. Note this is for queue_head_t. refer to osfmk/kern/queue.h Option: -S - suppress summary output. Usage: (lldb) walkqueue_head ex: (lldb) walkqueue_head 0x7fffff80 "thread *" "task_threads" - + """ global lldb_summary_definitions if not cmd_args: @@ -754,7 +945,7 @@ def WalkQueueHead(cmd_args=[], cmd_options={}): print lldb_summary_definitions[el_type](i) else: print "{0: <#020x}".format(i) - + @lldb_command('walklist_entry', 'S') @@ -768,7 +959,7 @@ def WalkList(cmd_args=[], cmd_options={}): Option: -S - suppress summary output. Usage: (lldb) walklist_entry ex: (lldb) walklist_entry 0x7fffff80 "struct proc *" "p_sibling" - + """ global lldb_summary_definitions if not cmd_args: @@ -926,7 +1117,7 @@ def IOTrace_cmd(cmd_args=[], cmd_options={}): from memory import * from process import * -from ipc import * +from ipc import * from pmap import * from ioreg import * from mbufs import * @@ -952,5 +1143,7 @@ from pgtrace import * from xnutriage import * from kevent import * from workqueue import * +from ulock import * from ntstat import * from zonetriage import * +from sysreg import * diff --git a/tools/lldbmacros/xnudefines.py b/tools/lldbmacros/xnudefines.py index 9ae470173..a91d0831d 100755 --- a/tools/lldbmacros/xnudefines.py +++ b/tools/lldbmacros/xnudefines.py @@ -4,7 +4,7 @@ The objective is to provide a single place to be the bridge between C code in xnu and the python macros used by lldb. If you define a variable which has been copied/referred over from C code and has high chance of changing over time. It would be best to define a supporting function of format "populate_". This will help in running them to repopulate. - + Please take a look at example of kobject_types below before making changes to this file. Note: The Format of the function has to be populate_ so that the automated updating will pick it up. """ @@ -70,18 +70,21 @@ arm_level2_access_strings = [ " noaccess", " " ] -kq_state_strings = { 0x000: '', - 0x001: 'SEL', - 0x002: 'SLEEP', - 0x004: 'PROCWAIT', - 0x008: 'KEV32', - 0x010: 'KEV64', - 0x020: 'KEVQOS', - 0x040: 'WORKQ', - 0x080: 'WORKLOOP', - 0x100: 'PROCESS', - 0x200: 'DRAIN', - 0x400: 'WAKEUP' } +kq_state_strings = { 0x0000: '', + 0x0001: 'SEL', + 0x0002: 'SLEEP', + 0x0004: 'PROCWAIT', + 0x0008: '32', + 0x0010: '64', + 0x0020: 'QOS', + 0x0040: 'WQ', + 0x0080: 'WL', + 0x0100: 'PROCESS', + 0x0200: 'DRAIN', + 0x0400: 'WAKEUP', + 0x0800: 'DYN', + 0x1000: 'R2K', + 0x2000: 'TS' } kn_state_strings = { 0x0000: '', 0x0001: 'ACTIVE', @@ -89,23 +92,14 @@ kn_state_strings = { 0x0000: '', 0x0004: 'DISABLED', 0x0008: 'DROPPING', 0x0010: 'LOCKED', - 0x0020: 'ATTACHING', + 0x0020: 'POSTING', 0x0040: 'STAYACTIVE', - 0x0080: 'DEFERDROP', - 0x0100: 'ATTACHED', - 0x0200: 'DISPATCH', - 0x0400: 'UDATASPEC', - 0x0800: 'SUPPRESS', - 0x1000: 'MERGE_QOS', - 0x2000: 'REQVANISH', - 0x4000: 'VANISHED' } - -kqrequest_state_strings = { 0x01: 'WORKLOOP', - 0x02: 'THREQUESTED', - 0x04: 'WAKEUP', - 0x08: 'THOVERCOMMIT', - 0x10: 'R2K_ARMED', - 0x20: 'ALLOC_TURNSTILE' } + 0x0080: 'DEFERDELETE', + 0x0100: 'MERGE_QOS', + 0x0200: 'REQVANISH', + 0x0400: 'VANISHED', + 0x0800: 'SUPPRESS' } + thread_qos_short_strings = { 0: '--', 1: 'MT', 2: 'BG', @@ -179,8 +173,8 @@ proc_flag_explain_strings = ["!0x00000004 - process is 32 bit", #only exception ] #File: xnu/osfmk/kern/ipc_kobject.h # string representations for Kobject types -kobject_types = ['', 'THREAD', 'TASK', 'HOST', 'HOST_PRIV', 'PROCESSOR', 'PSET', 'PSET_NAME', 'TIMER', 'PAGER_REQ', 'DEVICE', 'XMM_OBJECT', 'XMM_PAGER', 'XMM_KERNEL', 'XMM_REPLY', - 'NOTDEF 15', 'NOTDEF 16', 'HOST_SEC', 'LEDGER', 'MASTER_DEV', 'TASK_NAME', 'SUBSYTEM', 'IO_DONE_QUE', 'SEMAPHORE', 'LOCK_SET', 'CLOCK', 'CLOCK_CTRL' , 'IOKIT_SPARE', +kobject_types = ['', 'THREAD', 'TASK', 'HOST', 'HOST_PRIV', 'PROCESSOR', 'PSET', 'PSET_NAME', 'TIMER', 'PAGER_REQ', 'DEVICE', 'XMM_OBJECT', 'XMM_PAGER', 'XMM_KERNEL', 'XMM_REPLY', + 'NOTDEF 15', 'NOTDEF 16', 'HOST_SEC', 'LEDGER', 'MASTER_DEV', 'TASK_NAME', 'SUBSYTEM', 'IO_DONE_QUE', 'SEMAPHORE', 'LOCK_SET', 'CLOCK', 'CLOCK_CTRL' , 'IOKIT_SPARE', 'NAMED_MEM', 'IOKIT_CON', 'IOKIT_OBJ', 'UPL', 'MEM_OBJ_CONTROL', 'AU_SESSIONPORT', 'FILEPORT', 'LABELH', 'TASK_RESUME', 'VOUCHER', 'VOUCHER_ATTR_CONTROL', 'WORK_INTERVAL', 'UX_HANDLER'] @@ -213,7 +207,15 @@ EMBEDDED_PANIC_MAGIC = 0x46554E4B EMBEDDED_PANIC_STACKSHOT_SUCCEEDED_FLAG = 0x02 MACOS_PANIC_MAGIC = 0x44454544 +MACOS_PANIC_STACKSHOT_SUCCEEDED_FLAG = 0x04 + +AURR_PANIC_MAGIC = 0x41555252 +AURR_PANIC_STRING_LEN = 112 +AURR_PANIC_VERSION = 1 + +CRASHLOG_PANIC_STRING_LEN = 32 +AURR_CRASHLOG_PANIC_VERSION = 2 if __name__ == "__main__": populate_kobject_types("../../") - + diff --git a/tools/tests/perf_index/Makefile b/tools/tests/perf_index/Makefile index 15213c0a2..49aaa5c96 100644 --- a/tools/tests/perf_index/Makefile +++ b/tools/tests/perf_index/Makefile @@ -70,13 +70,13 @@ $(OBJROOT)/%.o: $(SRCROOT)/%.c $(CC) $(CFLAGS) $? -o $@ $(DSTROOT): - mkdir -p $(DSTROOT); + mkdir -p $(DSTROOT) $(OBJROOT): - mkdir -p $(OBJROOT); + mkdir -p $(OBJROOT) $(SYMROOT): - mkdir -p $(SYMROOT); + mkdir -p $(SYMROOT) clean: rm -rf $(OBJROOT) diff --git a/tools/tests/perf_index/test_fault_helper.c b/tools/tests/perf_index/test_fault_helper.c index ab4fbadc7..e3f8ee3ef 100644 --- a/tools/tests/perf_index/test_fault_helper.c +++ b/tools/tests/perf_index/test_fault_helper.c @@ -6,7 +6,7 @@ #include #include -#if TARGET_OS_EMBEDDED +#if (TARGET_OS_IPHONE && !TARGET_OS_SIMULATOR) #define MEMSIZE (1L<<28) #else #define MEMSIZE (1L<<30) diff --git a/tools/tests/personas/Makefile b/tools/tests/personas/Makefile index d2c718f39..ba66220c8 100644 --- a/tools/tests/personas/Makefile +++ b/tools/tests/personas/Makefile @@ -1,6 +1,8 @@ include ../Makefile.common CC:=$(shell xcrun -sdk "$(SDKROOT)" -find cc) +CODESIGN:=$(shell xcrun -sdk "$(SDKROOT)" -find codesign) +CODESIGN_ALLOCATE:=$(shell xcrun -sdk "$(SDKROOT)" -find codesign_allocate) SYMROOT?=$(shell /bin/pwd) @@ -30,15 +32,21 @@ ARCH_FLAGS := $(if $(ARCH_64), $(ARCH_64_FLAGS)) $(if $(ARCH_32), $(ARCH_32_FLAG DSTROOT?=$(shell /bin/pwd) -TARGETS := persona_mgr persona_spawn persona_test_run.sh +TARGETS := persona_mgr persona_spawn persona_test_run.sh persona_spawn_unentitled all: $(addprefix $(DSTROOT)/, $(TARGETS)) -$(DSTROOT)/persona_%: persona_%.c persona_test.h Makefile +$(DSTROOT)/persona_%: persona_%.c persona_test.h Makefile persona-entitlements.plist ${CC} ${CFLAGS} ${ARCH_FLAGS} -o $(SYMROOT)/$(notdir $@) $< + env CODESIGN_ALLOCATE=$(CODESIGN_ALLOCATE) \ + $(CODESIGN) -s - --entitlements persona-entitlements.plist $(SYMROOT)/$(notdir $@) if [ ! -e $@ ]; then ditto $(SYMROOT)/$(notdir $@) $@; fi -$(DSTROOT)/persona_test_run.sh: persona_test_run.sh +$(DSTROOT)/persona_spawn_unentitled: persona_spawn.c persona_test.h Makefile + ${CC} ${CFLAGS} ${ARCH_FLAGS} -o $(SYMROOT)/$(notdir $@) $< + if [ ! -e $@ ]; then ditto $(SYMROOT)/$(notdir $@) $@; fi + +$(DSTROOT)/persona_test_run.sh: persona_test_run_src.sh cp $? $@ chmod +x $@ diff --git a/tools/tests/personas/persona-entitlements.plist b/tools/tests/personas/persona-entitlements.plist new file mode 100644 index 000000000..43ddfad68 --- /dev/null +++ b/tools/tests/personas/persona-entitlements.plist @@ -0,0 +1,8 @@ + + + + + com.apple.private.persona-mgmt + + + diff --git a/tools/tests/personas/persona_test_run.sh b/tools/tests/personas/persona_test_run.sh deleted file mode 100755 index b07ec376c..000000000 --- a/tools/tests/personas/persona_test_run.sh +++ /dev/null @@ -1,569 +0,0 @@ -#!/bin/bash -# persona_test_run.sh -# -# This file aims to be a comprehensive test suite for the persona subsystem. -# It uses two tools: -# 1. persona_mgr - create, destroy, lookup personas -# 2. persona_spawn - spawn processes into personas with a variety of options -# The script relies heavily on the particular output of these tools, so if you -# are modifying / extending those tools, this file also need to be updated to -# properly capture the new output. Specifically, the get_persona_info function -# needs to be maintained / updated. -# -# NOTE: the function get_persona_info() also needs to be kept up to date with -# the types of personas found in bsd/sys/persona.h - -# be sure to bail on script errors and unepected tool failures -set -e - -PERSONA_MGR="${PWD}/persona_mgr" -PERSONA_SPAWN="${PWD}/persona_spawn" - -if [ ! -d "$TMPDIR" ]; then - echo "Couldn't find temp directory '$TMPDIR': check permissions/environment?" - exit 255 -fi - -if [ ! -e "${PERSONA_MGR}" ] || [ ! -x "${PERSONA_MGR}" ]; then - echo "Can't find '${PERSONA_MGR}': skipping test" - exit 0 -fi -if [ ! -e "${PERSONA_SPAWN}" ] || [ ! -x "${PERSONA_SPAWN}" ]; then - echo "Can't find '${PERSONA_SPAWN}': skipping test" - exit 0 -fi - -function check_for_persona_support() { - local errno=0 - ${PERSONA_MGR} support || errno=$? - if [ $errno -eq 78 ]; then - echo "Persona subsystem is not supported - skipping tests" - exit 0 - fi - return 0 -} -check_for_persona_support - - -## bail [failure_msg] -# -# exit the script with an error code that corresponds to the line number -# from which this function was invoked. Because we want to exit with a -# non-zero exit code, we use: 1 + (254 % line). -# -function bail() { - local msg="$1" - local line=$2 - if [ -z "$line" ]; then - line=${BASH_LINENO[0]} - fi - echo "[$line] ERROR: $msg" 1>&2 - exit $((1 + $line % 254)) -} - -## check_return [message_on_failure] -# -# Check the return value of the previous command or script line. If the -# value of '$?' is not 0, then call bail() with an appropriate message. -# -function check_return() { - local err=$? - local msg=$1 - local line=$2 - if [ -z "$line" ]; then - line=${BASH_LINENO[0]} - fi - echo "CHECK: $msg" - if [ $err -ne 0 ]; then - bail "e=$err: $msg" $line - fi - - return 0 -} - -## expect_failure [message_on_success] -# -# Check the return value of the previous command or script line. If the -# value of '$?' is 0 (success), then call bail() with a message saying -# that we expected this previous command/line to fail. -# -function expect_failure() { - local err=$? - local msg=$1 - local line=$2 - if [ -z "$line" ]; then - line=${BASH_LINENO[0]} - fi - if [ $err -eq 0 ]; then - bail "found success, expected failure: $msg" $line - fi - - echo "EXPECT: failure: $msg" - return 0 -} - -## test_num [debug_info] [number] -# -# Check that a variable value is a number, bail() on error. -# -function test_num() { - local type=$1 - local num=$2 - local line=$3 - if [ -z "$line" ]; then - line=${BASH_LINENO[0]} - fi - if [ -z "$num" ]; then - bail "invalid (NULL) $type" $line - fi - [ "$num" -eq "$num" ] 2>/dev/null - if [ $? -ne 0 ]; then - bail "invalid $type: $num" $line - fi - - return 0 -} - -## global variables used to return values to callers -_ID=-1 -_TYPE="invalid" -_LOGIN="" -_UID=-1 -_GID=-1 -_NGROUPS=-1 -_GROUPS="" - -## get_persona_info {persona_id} {persona_login} -# -# Lookup persona info for the given ID/login. At least one of the ID/login -# parameters must be valid -function get_persona_info() { - local pna_id=${1:-1} - local pna_login=${2:- } - local line=$3 - if [ -z "$line" ]; then - line=${BASH_LINENO[0]} - fi - - local largs="-u ${pna_id}" - if [ "${pna_login}" != " " ]; then - largs+=" -l ${pna_login}" - fi - - _ID=-1 - _TYPE=-1 - _LOGIN="" - _UID=-1 - _GID=-1 - _NGROUPS=-1 - _GROUPS=() - - local file="${TMPDIR}/plookup" - - ${PERSONA_MGR} lookup ${largs} > "${file}" - check_return "persona lookup of: ${largs}" $line - - _ID=$(cat "${file}" | grep "+id: " | head -1 | sed 's/.*+id:[ ]*\([0-9][0-9]*\).*/\1/') - test_num "Persona ID lookup:${largs}" "$_ID" - - local type=$(cat "${file}" | grep "+type: " | head -1 | sed 's/.*+type:[ ]*\([0-9][0-9]*\).*/\1/') - test_num "+type lookup:${largs}" "$type" - ## - ## NOTE: keep in sync with bsd/sys/persona.h types! - ## - if [ $type -eq 1 ]; then - _TYPE=guest - elif [ $type -eq 2 ]; then - _TYPE=managed - elif [ $type -eq 3 ]; then - _TYPE=priv - elif [ $type -eq 4 ]; then - _TYPE=system - else - _TYPE=invalid - fi - - _LOGIN=$(cat "${file}" | grep "+login: " | head -1 | sed 's/.*+login:[ ]*"\([^"]*\)".*/\1/') - if [ -z "$_LOGIN" ]; then - bail "invalid login for pna_id:$_ID: '$_LOGIN'" $line - fi - - # these are always the same - _UID=$_ID - - _GID=$(cat "${file}" | grep "+gid: " | head -1 | sed 's/.*+gid:[ ]*\([0-9][0-9]*\).*/\1/') - test_num "GID lookup:${largs}" "$_GID" - - _NGROUPS=$(cat "${file}" | grep "ngroups: " | head -1 | sed 's/.*ngroups:[ ]*\([0-9][0-9]*\)[ ][ ]*{.*}.*/\1/') - test_num "NGROUPS lookup:${largs}" "$_NGROUPS" - - _GROUPS=( $(cat "${file}" | grep "ngroups: " | head -1 | sed 's/.*ngroups:[ ]*[0-9][0-9]*[ ][ ]*{[ ]*\([^ ].*\)[ ][ ]*}.*/\1/') ) - if [ $_NGROUPS -gt 0 ]; then - if [ -z "${_GROUPS}" ]; then - bail "lookup:${largs}: missing $_NGROUPS groups" $line - fi - if [ ${#_GROUPS[@]} -ne $_NGROUPS ]; then - bail "lookup:${largs} wrong number of groups ${#_GROUPS[@]} != $_NGROUPS" $line - fi - fi -} - -## validate_child_info [output_file] [persona_id] {uid} {gid} {groups} -# -# Parse the output of the 'persona_spawn' command and validate that -# the new child process is in the correct persona with the correct -# process attributes. -# -function validate_child_info() { - local file=$1 - local pna_id=$2 - local uid=${3:--1} - local gid=${4:--1} - local groups=${5:- } - local line=$6 - if [ -z "$line" ]; then - line=${BASH_LINENO[0]} - fi - local l=( ) - - # get the child's PID - local cpid="$(cat "$file" | grep "Child: PID:" | sed 's/.*Child: PID:\([0-9][0-9]*\).*/\1/')" - test_num "Child PID" "$cpid" $line - - # validate the child's persona - l=( $(cat "$file" | grep "Child: Persona:" | sed 's/.*Child: Persona: \([0-9][0-9]*\) (err:\([0-9][0-9]*\))/\1 \2/') ) - if [ ${#l[@]} -ne 2 ]; then - bail "Invalid Child[$cpid] Persona line" $line - fi - test_num "Child Persona ID" "${l[0]}" $line - test_num "kpersona_info retval" "${l[1]}" $line - - if [ ${l[0]} -ne $pna_id ]; then - bail "Child[$cpid] persona:${l[0]} != specified persona:$pna_id" $line - fi - - # Validate the UID/GID - l=( $(cat "$file" | grep "Child: UID:" | sed 's/.*UID:\([0-9][0-9]*\), GID:\([0-9][0-9]*\).*/\1 \2/') ) - if [ ${#l[@]} -ne 2 ]; then - bail "Invalid Child[$cpid] UID/GID output" $line - fi - if [ $uid -ge 0 ]; then - if [ $uid -ne ${l[0]} ]; then - bail "Child[$cpid] UID:${l[0]} != specified UID:$uid" $line - fi - fi - if [ $gid -ge 0 ]; then - if [ $gid -ne ${l[1]} ]; then - bail "Child[$cpid] GID:${l[1]} != specified GID:$gid" $line - fi - fi - - # TODO: validate / verify groups? - - return 0 -} - - -## spawn_child [persona_id] {uid} {gid} {group_spec} -# -# Create a child process that is spawn'd into the persona given by -# the first argument (pna_id). The new process can have its UID, GID, -# and group membership properties overridden. -# -function spawn_child() { - local pna_id=$1 - local uid=${2:--1} - local gid=${3:--1} - local groups=${4:- } - local line=$5 - if [ -z "$line" ]; then - line=${BASH_LINENO[0]} - fi - - local file="child.${pna_id}" - local spawn_args="-I $pna_id" - if [ $uid -ge 0 ]; then - spawn_args+=" -u $uid" - file+=".u$uid" - fi - if [ $gid -ge 0 ]; then - spawn_args+=" -g $gid" - file+=".g$gid" - fi - if [ "$groups" != " " ]; then - spawn_args+=" -G $groups" - file+="._groups" - fi - - echo "SPAWN: $file" - ${PERSONA_SPAWN} -v $spawn_args ${PERSONA_SPAWN} child -v -E > "${TMPDIR}/$file" - check_return "child info: $file" $line - - # Grab the specified persona's info so we can - # verify the child's info against it. - # This function puts data into global variables, e.g. _ID, _GID, etc. - get_persona_info ${pna_id} " " $line - if [ $uid -lt 0 ]; then - uid=$_UID - fi - if [ $gid -lt 0 ]; then - gid=$_GID - fi - if [ "$groups" == " " ]; then - # convert a bash array into a comma-separated list for validation - local _g="${_GROUPS[@]}" - groups="${_g// /,}" - fi - - validate_child_info "${TMPDIR}/$file" "$pna_id" "$uid" "$gid" "$groups" $line - - ## TODO: validate that the first child spawned into a persona *cannot* spawn - ## into a different persona... - ##if [ $uid -eq 0 ]; then - ## ${PERSONA_SPAWN} -v $spawn_args ${PERSONA_SPAWN} child -v -E -R -v -I 99 /bin/echo "This is running in the system persona" - ## expect_failure "Spawned child that re-execs into non-default persona" $line - ##fi - return 0 -} - -## get_created_id [output_file] -# -# Parse the output of the 'persona_mgr' command to determine the ID -# of the newly created persona. -# -function get_created_id() { - local file=$1 - local o=$(cat "$file" | grep "Created persona" | sed 's/.*Created persona \([0-9][0-9]*\):/\1/') - echo $o - return 0 -} - -## create_persona [login_name] [persona_type] {persona_id} {gid} {group_spec} -# -# Create a new persona with given parameters. -# -# Returns: the newly created persona ID via the global variable, $_ID -# -function create_persona() { - local name=${1} - local type=${2} - local pna_id=${3:--1} - local gid=${4:--1} - local groups=${5:- } - local line=$6 - if [ -z "$line" ]; then - line=${BASH_LINENO[0]} - fi - - if [ -z "$name" -o -z "$type" ]; then - bail "Invalid arguments to create_persona '$name' '$type'" $line - fi - - local file="persona.at${line}" - # persona ID of '-1' is auto-assigned - local spawn_args="-v -l $name -i $pna_id" - if [ $pna_id -eq -1 ]; then - file+=".auto" - else - file+=".${pna_id}" - fi - - spawn_args+=" -t $type" - file+=".$type" - - if [ $gid -ge 0 ]; then - spawn_args+=" -g $gid" - file+=".g$gid" - fi - if [ "$groups" != " " ]; then - spawn_args+=" -G $groups" - file+="._groups" - fi - - echo "CREATE: $file" - ${PERSONA_MGR} create ${spawn_args} > "${TMPDIR}/${file}" - check_return "persona creation: ${file}" $line - # test output should include persona creation output for later debugging - cat "${TMPDIR}/${file}" - - # validate the output of the persona_mgr tool (what we think we created) - _ID=`get_created_id "${TMPDIR}/${file}"` - test_num "persona_id for $file" "$_ID" $line - if [ ${pna_id} -gt 0 ]; then - if [ $_ID -ne ${pna_id} ]; then - bail "Created persona doesn't have expected ID $_ID != ${pna_id}" $line - fi - fi - - # validate the entire persona information (what a kpersona_lookup says we created) - # This function puts data into global variables, e.g. _ID, _LOGIN, _GID, etc. - echo "VALIDATE: ${file}" - get_persona_info ${pna_id} "$name" $line - if [ "$name" != "$_LOGIN" ]; then - bail "${file}: unexpected login '$_LOGIN' != '$name'" $line - fi - if [ "$type" != "$_TYPE" ]; then - bail "${file}: unexpected type '$_TYPE' != '$type'" $line - fi - if [ ${pna_id} -gt 0 ]; then - if [ ${pna_id} -ne $_ID ]; then - bail "${file}: unexpected ID '$_ID' != '${pna_id}'" $line - fi - fi - if [ $gid -ge 0 ]; then - if [ $gid -ne $_GID ]; then - bail "${file}: unexpected GID '$_GID' != '$gid'" $line - fi - fi - if [ "$groups" != " " ]; then - local _g="${_GROUPS[@]}" - if [ "${_g// /,}" != "$groups" ]; then - bail "${file}: unexpected groups '${_g// /,}' != '$groups'" $line - fi - fi - - return 0 -} - -## destroy_persona [persona_id] -# -# Destroy the given persona. -# -function destroy_persona() { - local pna_id=$1 - local line=$2 - if [ -z "$line" ]; then - line=${BASH_LINENO[0]} - fi - - echo "DESTROY: ${pna_id}" - ${PERSONA_MGR} destroy -v -i ${pna_id} - check_return "destruction of ${pna_id}" $line -} - -# -# -# Begin Tests! -# -# -echo "Running persona tests [$LINENO] ($TMPDIR)" - -## -## Test Group 0: basic creation + spawn tests -## - -# default group, specific ID -create_persona "test0_1" "guest" 1001 -P0ID=$_ID -spawn_child $P0ID -spawn_child $P0ID 1100 -spawn_child $P0ID 0 -spawn_child $P0ID -1 1101 -spawn_child $P0ID 1100 1101 -spawn_child $P0ID 1100 1101 1000,2000,3000 -spawn_child $P0ID 1100 -1 1000,2000,3000 -spawn_child $P0ID -1 -1 1000,2000,3000 -destroy_persona $P0ID - -# specific ID, non-default group -create_persona "test0_2" "guest" 1002 2000 -P0ID=$_ID -spawn_child $P0ID -spawn_child $P0ID 1100 -spawn_child $P0ID 0 -spawn_child $P0ID -1 1101 -spawn_child $P0ID 1100 1101 -spawn_child $P0ID 1100 1101 1000,2000,3000 -spawn_child $P0ID 1100 -1 1000,2000,3000 -spawn_child $P0ID -1 -1 1000,2000,3000 -destroy_persona $P0ID - -# non-default set of groups -create_persona "test0_3" "guest" 1003 2000 2000,3000,4000 -P0ID=$_ID -spawn_child $P0ID -spawn_child $P0ID 1100 -spawn_child $P0ID 0 -spawn_child $P0ID -1 1101 -spawn_child $P0ID 1100 1101 -spawn_child $P0ID 1100 1101 1111,2222,3333 -spawn_child $P0ID 1100 -1 1111,2222,3333 -spawn_child $P0ID -1 -1 1111,2222,3333 -destroy_persona $P0ID - - -## -## Test Group 1: persona creation / re-creation -## - -# Create 3 personas with auto-assigned IDs -create_persona "test1_1" "guest" -P1ID=$_ID -create_persona "test1_2" "managed" -P2ID=$_ID -create_persona "test1_3" "priv" -P3ID=$_ID -create_persona "test1_4" "system" -P4ID=$_ID - -D1=$(($P2ID - $P1ID)) -D2=$(($P3ID - $P2ID)) -D3=$(($P4ID - $P3ID)) -if [ $D1 -ne $D2 -o $D1 -ne $D3 -o $D2 -ne $D3 ]; then - bail "inconsistent automatic Persona ID increment: $D1,$D2,$D3 ($P1ID,$P2ID,$P3ID,$P4ID)" -fi - -# make sure we can't re-allocate the same name / ID -${PERSONA_MGR} create -v -l test1_1 -t guest -i -1 && expect_failure "re-create same name:test1_1 type:guest" -${PERSONA_MGR} create -v -l test1_1 -t managed -i -1 && expect_failure "re-create same name:test1_1 type:managed" -${PERSONA_MGR} create -v -l test1_1_new -t managed -i $P1ID && expect_failure "re-create $P1ID with new name:test1_1_new type:managed" - -## -## Test Group 2: auto-assigned ID tricks -## - -# Notice the difference in IDs, then try to create a persona by -# specifying an ID that will match the next auto-assigned ID -# (should succeed) -P5ID_REQ=$(($P4ID + $D2)) -create_persona "test2_1" "guest" ${P5ID_REQ} -P5ID=$_ID -if [ ! $P5ID -eq ${P5ID_REQ} ]; then - bail "test2_1: ${P5ID_REQ} != $P5ID" -fi - -# try to create a persona with auto-assigned ID -# (resulting persona should have ID != P5ID) -create_persona "test2_2" "guest" -P6ID=$_ID -if [ $P6ID -eq $P5ID ]; then - bail "created duplicate persona IDs: $P6ID == $P5ID" -fi - -## -## Test Group 3: persona destruction -## - -destroy_persona $P1ID -destroy_persona $P2ID -destroy_persona $P3ID -destroy_persona $P4ID -destroy_persona $P5ID -destroy_persona $P6ID - -# try to re-destroy the personas -# (should fail) -${PERSONA_MGR} destroy -v -i $P1ID && expect_failure "re-destroy (1/2) $P1ID" -${PERSONA_MGR} destroy -v -i $P1ID && expect_failure "re-destroy (2/2) $P1ID" -${PERSONA_MGR} destroy -v -i $P2ID && expect_failure "re-destroy $P2ID" -${PERSONA_MGR} destroy -v -i $P3ID && expect_failure "re-destroy $P3ID" -${PERSONA_MGR} destroy -v -i $P4ID && expect_failure "re-destroy $P4ID" -${PERSONA_MGR} destroy -v -i $P5ID && expect_failure "re-destroy $P5ID" -${PERSONA_MGR} destroy -v -i $P6ID && expect_failure "re-destroy $P6ID" - -# cleanup -rm -rf "${TMPDIR}" - -echo "" -echo "${0##/}: SUCCESS" -exit 0 diff --git a/tools/tests/personas/persona_test_run_src.sh b/tools/tests/personas/persona_test_run_src.sh new file mode 100755 index 000000000..95e132a96 --- /dev/null +++ b/tools/tests/personas/persona_test_run_src.sh @@ -0,0 +1,575 @@ +#!/bin/bash +# persona_test_run.sh +# +# This file aims to be a comprehensive test suite for the persona subsystem. +# It uses two tools: +# 1. persona_mgr - create, destroy, lookup personas +# 2. persona_spawn - spawn processes into personas with a variety of options +# The script relies heavily on the particular output of these tools, so if you +# are modifying / extending those tools, this file also need to be updated to +# properly capture the new output. Specifically, the get_persona_info function +# needs to be maintained / updated. +# +# NOTE: the function get_persona_info() also needs to be kept up to date with +# the types of personas found in bsd/sys/persona.h + +PERSONA_MGR="${PWD}/persona_mgr" +PERSONA_SPAWN="${PWD}/persona_spawn" +PERSONA_SPAWN_UNENTITLED="${PWD}/persona_spawn_unentitled" + +TEST_DEFAULT_PERSONA=0 + +if [ ! -d "$TMPDIR" ]; then + echo "Couldn't find temp directory '$TMPDIR': check permissions/environment?" + exit 255 +fi + +if [ ! -e "${PERSONA_MGR}" ] || [ ! -x "${PERSONA_MGR}" ]; then + echo "Can't find '${PERSONA_MGR}': skipping test" + exit 0 +fi +if [ ! -e "${PERSONA_SPAWN}" ] || [ ! -x "${PERSONA_SPAWN}" ]; then + echo "Can't find '${PERSONA_SPAWN}': skipping test" + exit 0 +fi + +function check_for_persona_support() { + local errno=0 + ${PERSONA_MGR} support || errno=$? + if [ $errno -eq 78 ]; then + echo "Persona subsystem is not supported - skipping tests" + exit 0 + fi + return 0 +} +check_for_persona_support + + +## bail [failure_msg] +# +# exit the script with an error code that corresponds to the line number +# from which this function was invoked. Because we want to exit with a +# non-zero exit code, we use: 1 + (254 % line). +# +function bail() { + local msg="$1" + local line=$2 + if [ -z "$line" ]; then + line=${BASH_LINENO[0]} + fi + echo "[$line] ERROR: $msg" 1>&2 + exit $((1 + $line % 254)) +} + +## check_return [message_on_failure] +# +# Check the return value of the previous command or script line. If the +# value of '$?' is not 0, then call bail() with an appropriate message. +# +function check_return() { + local err=$? + local msg=$1 + local line=$2 + if [ -z "$line" ]; then + line=${BASH_LINENO[0]} + fi + echo "CHECK: $msg" + if [ $err -ne 0 ]; then + bail "e=$err: $msg" $line + fi + + return 0 +} + +## expect_failure [message_on_success] +# +# Check the return value of the previous command or script line. If the +# value of '$?' is 0 (success), then call bail() with a message saying +# that we expected this previous command/line to fail. +# +function expect_failure() { + local err=$? + local msg=$1 + local line=$2 + if [ -z "$line" ]; then + line=${BASH_LINENO[0]} + fi + if [ $err -eq 0 ]; then + bail "found success, expected failure: $msg" $line + fi + + echo "EXPECT: failure: $msg" + return 0 +} + +## test_num [debug_info] [number] +# +# Check that a variable value is a number, bail() on error. +# +function test_num() { + local type=$1 + local num=$2 + local line=$3 + if [ -z "$line" ]; then + line=${BASH_LINENO[0]} + fi + if [ -z "$num" ]; then + bail "invalid (NULL) $type" $line + fi + [ "$num" -eq "$num" ] 2>/dev/null + if [ $? -ne 0 ]; then + bail "invalid $type: $num" $line + fi + + return 0 +} + +## global variables used to return values to callers +_ID=-1 +_TYPE="invalid" +_LOGIN="" +_UID=-1 +_GID=-1 +_NGROUPS=-1 +_GROUPS="" + +## get_persona_info {persona_id} {persona_login} +# +# Lookup persona info for the given ID/login. At least one of the ID/login +# parameters must be valid +function get_persona_info() { + local pna_id=${1:-1} + local pna_login=${2:- } + local line=$3 + if [ -z "$line" ]; then + line=${BASH_LINENO[0]} + fi + + local largs="-u ${pna_id}" + if [ "${pna_login}" != " " ]; then + largs+=" -l ${pna_login}" + fi + + _ID=-1 + _TYPE=-1 + _LOGIN="" + _UID=-1 + _GID=-1 + _NGROUPS=-1 + _GROUPS=() + + local file="${TMPDIR}/plookup" + + ${PERSONA_MGR} lookup ${largs} > "${file}" + check_return "persona lookup of: ${largs}" $line + + _ID=$(cat "${file}" | grep "+id: " | head -1 | sed 's/.*+id:[ ]*\([0-9][0-9]*\).*/\1/') + test_num "Persona ID lookup:${largs}" "$_ID" + + local type=$(cat "${file}" | grep "+type: " | head -1 | sed 's/.*+type:[ ]*\([0-9][0-9]*\).*/\1/') + test_num "+type lookup:${largs}" "$type" + ## + ## NOTE: keep in sync with bsd/sys/persona.h types! + ## + if [ $type -eq 1 ]; then + _TYPE=guest + elif [ $type -eq 2 ]; then + _TYPE=managed + elif [ $type -eq 3 ]; then + _TYPE=priv + elif [ $type -eq 4 ]; then + _TYPE=system + else + _TYPE=invalid + fi + + _LOGIN=$(cat "${file}" | grep "+login: " | head -1 | sed 's/.*+login:[ ]*"\([^"]*\)".*/\1/') + if [ -z "$_LOGIN" ]; then + bail "invalid login for pna_id:$_ID: '$_LOGIN'" $line + fi + + # these are always the same + _UID=$_ID + + _GID=$(cat "${file}" | grep "+gid: " | head -1 | sed 's/.*+gid:[ ]*\([0-9][0-9]*\).*/\1/') + test_num "GID lookup:${largs}" "$_GID" + + _NGROUPS=$(cat "${file}" | grep "ngroups: " | head -1 | sed 's/.*ngroups:[ ]*\([0-9][0-9]*\)[ ][ ]*{.*}.*/\1/') + test_num "NGROUPS lookup:${largs}" "$_NGROUPS" + + _GROUPS=( $(cat "${file}" | grep "ngroups: " | head -1 | sed 's/.*ngroups:[ ]*[0-9][0-9]*[ ][ ]*{[ ]*\([^ ].*\)[ ][ ]*}.*/\1/') ) + if [ $_NGROUPS -gt 0 ]; then + if [ -z "${_GROUPS}" ]; then + bail "lookup:${largs}: missing $_NGROUPS groups" $line + fi + if [ ${#_GROUPS[@]} -ne $_NGROUPS ]; then + bail "lookup:${largs} wrong number of groups ${#_GROUPS[@]} != $_NGROUPS" $line + fi + fi +} + +## validate_child_info [output_file] [persona_id] {uid} {gid} {groups} +# +# Parse the output of the 'persona_spawn' command and validate that +# the new child process is in the correct persona with the correct +# process attributes. +# +function validate_child_info() { + local file=$1 + local pna_id=$2 + local uid=${3:--1} + local gid=${4:--1} + local groups=${5:- } + local line=$6 + if [ -z "$line" ]; then + line=${BASH_LINENO[0]} + fi + local l=( ) + + # get the child's PID + local cpid="$(cat "$file" | grep "Child: PID:" | sed 's/.*Child: PID:\([0-9][0-9]*\).*/\1/')" + test_num "Child PID" "$cpid" $line + + # validate the child's persona + l=( $(cat "$file" | grep "Child: Persona:" | sed 's/.*Child: Persona: \([0-9][0-9]*\) (err:\([0-9][0-9]*\))/\1 \2/') ) + if [ ${#l[@]} -ne 2 ]; then + bail "Invalid Child[$cpid] Persona line" $line + fi + test_num "Child Persona ID" "${l[0]}" $line + test_num "kpersona_info retval" "${l[1]}" $line + + if [ ${l[0]} -ne $pna_id ]; then + bail "Child[$cpid] persona:${l[0]} != specified persona:$pna_id" $line + fi + + # Validate the UID/GID + l=( $(cat "$file" | grep "Child: UID:" | sed 's/.*UID:\([0-9][0-9]*\), GID:\([0-9][0-9]*\).*/\1 \2/') ) + if [ ${#l[@]} -ne 2 ]; then + bail "Invalid Child[$cpid] UID/GID output" $line + fi + if [ $uid -ge 0 ]; then + if [ $uid -ne ${l[0]} ]; then + bail "Child[$cpid] UID:${l[0]} != specified UID:$uid" $line + fi + fi + if [ $gid -ge 0 ]; then + if [ $gid -ne ${l[1]} ]; then + bail "Child[$cpid] GID:${l[1]} != specified GID:$gid" $line + fi + fi + + # TODO: validate / verify groups? + + return 0 +} + + +## spawn_child [persona_id] {uid} {gid} {group_spec} +# +# Create a child process that is spawn'd into the persona given by +# the first argument (pna_id). The new process can have its UID, GID, +# and group membership properties overridden. +# +function spawn_child() { + local pna_id=$1 + local uid=${2:--1} + local gid=${3:--1} + local groups=${4:- } + local line=$5 + if [ -z "$line" ]; then + line=${BASH_LINENO[0]} + fi + + local file="child.${pna_id}" + local spawn_args="-I $pna_id" + if [ $uid -ge 0 ]; then + spawn_args+=" -u $uid" + file+=".u$uid" + fi + if [ $gid -ge 0 ]; then + spawn_args+=" -g $gid" + file+=".g$gid" + fi + if [ "$groups" != " " ]; then + spawn_args+=" -G $groups" + file+="._groups" + fi + + echo "SPAWN: $file" + ${PERSONA_SPAWN} -v $spawn_args ${PERSONA_SPAWN} child -v -E > "${TMPDIR}/$file" + check_return "child info: $file" $line + + # Grab the specified persona's info so we can + # verify the child's info against it. + # This function puts data into global variables, e.g. _ID, _GID, etc. + get_persona_info ${pna_id} " " $line + if [ $uid -lt 0 ]; then + uid=$_UID + fi + if [ $gid -lt 0 ]; then + gid=$_GID + fi + if [ "$groups" == " " ]; then + # convert a bash array into a comma-separated list for validation + local _g="${_GROUPS[@]}" + groups="${_g// /,}" + fi + + validate_child_info "${TMPDIR}/$file" "$pna_id" "$uid" "$gid" "$groups" $line + + ## validate that the first child spawned into a persona *cannot* spawn + ## into a different persona... + if [ $uid -eq 0 ]; then + ${PERSONA_SPAWN} -v $spawn_args ${PERSONA_SPAWN_UNENTITLED} child -v -E -R spawn -v $spawn_args -I ${TEST_DEFAULT_PERSONA} /bin/echo "This is running in the system persona" + expect_failure "Spawned child that re-execs into non-default persona" $line + fi + return 0 +} + +## get_created_id [output_file] +# +# Parse the output of the 'persona_mgr' command to determine the ID +# of the newly created persona. +# +function get_created_id() { + local file=$1 + local o=$(cat "$file" | grep "Created persona" | sed 's/.*Created persona \([0-9][0-9]*\):/\1/') + echo $o + return 0 +} + +## create_persona [login_name] [persona_type] {persona_id} {gid} {group_spec} +# +# Create a new persona with given parameters. +# +# Returns: the newly created persona ID via the global variable, $_ID +# +function create_persona() { + local name=${1} + local type=${2} + local pna_id=${3:--1} + local gid=${4:--1} + local groups=${5:- } + local line=$6 + if [ -z "$line" ]; then + line=${BASH_LINENO[0]} + fi + + if [ -z "$name" -o -z "$type" ]; then + bail "Invalid arguments to create_persona '$name' '$type'" $line + fi + + local file="persona.at${line}" + # persona ID of '-1' is auto-assigned + local spawn_args="-v -l $name -i $pna_id" + if [ $pna_id -eq -1 ]; then + file+=".auto" + else + file+=".${pna_id}" + fi + + spawn_args+=" -t $type" + file+=".$type" + + if [ $gid -ge 0 ]; then + spawn_args+=" -g $gid" + file+=".g$gid" + fi + if [ "$groups" != " " ]; then + spawn_args+=" -G $groups" + file+="._groups" + fi + + echo "CREATE: $file" + ${PERSONA_MGR} create ${spawn_args} > "${TMPDIR}/${file}" + check_return "persona creation: ${file}" $line + # test output should include persona creation output for later debugging + cat "${TMPDIR}/${file}" + + # validate the output of the persona_mgr tool (what we think we created) + _ID=`get_created_id "${TMPDIR}/${file}"` + test_num "persona_id for $file" "$_ID" $line + if [ ${pna_id} -gt 0 ]; then + if [ $_ID -ne ${pna_id} ]; then + bail "Created persona doesn't have expected ID $_ID != ${pna_id}" $line + fi + fi + + # validate the entire persona information (what a kpersona_lookup says we created) + # This function puts data into global variables, e.g. _ID, _LOGIN, _GID, etc. + echo "VALIDATE: ${file}" + get_persona_info ${pna_id} "$name" $line + if [ "$name" != "$_LOGIN" ]; then + bail "${file}: unexpected login '$_LOGIN' != '$name'" $line + fi + if [ "$type" != "$_TYPE" ]; then + bail "${file}: unexpected type '$_TYPE' != '$type'" $line + fi + if [ ${pna_id} -gt 0 ]; then + if [ ${pna_id} -ne $_ID ]; then + bail "${file}: unexpected ID '$_ID' != '${pna_id}'" $line + fi + fi + if [ $gid -ge 0 ]; then + if [ $gid -ne $_GID ]; then + bail "${file}: unexpected GID '$_GID' != '$gid'" $line + fi + fi + if [ "$groups" != " " ]; then + local _g="${_GROUPS[@]}" + if [ "${_g// /,}" != "$groups" ]; then + bail "${file}: unexpected groups '${_g// /,}' != '$groups'" $line + fi + fi + + return 0 +} + +## destroy_persona [persona_id] +# +# Destroy the given persona. +# +function destroy_persona() { + local pna_id=$1 + local line=$2 + if [ -z "$line" ]; then + line=${BASH_LINENO[0]} + fi + + echo "DESTROY: ${pna_id}" + ${PERSONA_MGR} destroy -v -i ${pna_id} + check_return "destruction of ${pna_id}" $line +} + +# +# +# Begin Tests! +# +# +echo "Running persona tests [$LINENO] ($TMPDIR)" + +## +## Test Group 0: basic creation + spawn tests +## + +create_persona "test_default_persona" "guest" 9999 +TEST_DEFAULT_PERSONA=$_ID + +# default group, specific ID +create_persona "test0_1" "guest" 1001 +P0ID=$_ID + +spawn_child $P0ID +spawn_child $P0ID 1100 +spawn_child $P0ID 0 +spawn_child $P0ID -1 1101 +spawn_child $P0ID 1100 1101 +spawn_child $P0ID 1100 1101 1000,2000,3000 +spawn_child $P0ID 1100 -1 1000,2000,3000 +spawn_child $P0ID -1 -1 1000,2000,3000 +destroy_persona $P0ID + +# specific ID, non-default group +create_persona "test0_2" "guest" 1002 2000 +P0ID=$_ID +spawn_child $P0ID +spawn_child $P0ID 1100 +spawn_child $P0ID 0 +spawn_child $P0ID -1 1101 +spawn_child $P0ID 1100 1101 +spawn_child $P0ID 1100 1101 1000,2000,3000 +spawn_child $P0ID 1100 -1 1000,2000,3000 +spawn_child $P0ID -1 -1 1000,2000,3000 +destroy_persona $P0ID + +# non-default set of groups +create_persona "test0_3" "guest" 1003 2000 2000,3000,4000 +P0ID=$_ID +spawn_child $P0ID +spawn_child $P0ID 1100 +spawn_child $P0ID 0 +spawn_child $P0ID -1 1101 +spawn_child $P0ID 1100 1101 +spawn_child $P0ID 1100 1101 1111,2222,3333 +spawn_child $P0ID 1100 -1 1111,2222,3333 +spawn_child $P0ID -1 -1 1111,2222,3333 +destroy_persona $P0ID + + +## +## Test Group 1: persona creation / re-creation +## + +# Create 3 personas with auto-assigned IDs +create_persona "test1_1" "guest" +P1ID=$_ID +create_persona "test1_2" "managed" +P2ID=$_ID +create_persona "test1_3" "priv" +P3ID=$_ID +create_persona "test1_4" "guest" +P4ID=$_ID + +D1=$(($P2ID - $P1ID)) +D2=$(($P3ID - $P2ID)) +D3=$(($P4ID - $P3ID)) +if [ $D1 -ne $D2 -o $D1 -ne $D3 -o $D2 -ne $D3 ]; then + bail "inconsistent automatic Persona ID increment: $D1,$D2,$D3 ($P1ID,$P2ID,$P3ID,$P4ID)" +fi + +# make sure we can't re-allocate the same name / ID +${PERSONA_MGR} create -v -l test1_1 -t guest -i -1 && expect_failure "re-create same name:test1_1 type:guest" +${PERSONA_MGR} create -v -l test1_1 -t managed -i -1 && expect_failure "re-create same name:test1_1 type:managed" +${PERSONA_MGR} create -v -l test1_1_new -t managed -i $P1ID && expect_failure "re-create $P1ID with new name:test1_1_new type:managed" + +## +## Test Group 2: auto-assigned ID tricks +## + +# Notice the difference in IDs, then try to create a persona by +# specifying an ID that will match the next auto-assigned ID +# (should succeed) +P5ID_REQ=$(($P4ID + $D2)) +create_persona "test2_1" "guest" ${P5ID_REQ} +P5ID=$_ID +if [ ! $P5ID -eq ${P5ID_REQ} ]; then + bail "test2_1: ${P5ID_REQ} != $P5ID" +fi + +# try to create a persona with auto-assigned ID +# (resulting persona should have ID != P5ID) +create_persona "test2_2" "guest" +P6ID=$_ID +if [ $P6ID -eq $P5ID ]; then + bail "created duplicate persona IDs: $P6ID == $P5ID" +fi + +## +## Test Group 3: persona destruction +## + +destroy_persona $P1ID +destroy_persona $P2ID +destroy_persona $P3ID +destroy_persona $P4ID +destroy_persona $P5ID +destroy_persona $P6ID + +# try to re-destroy the personas +# (should fail) +${PERSONA_MGR} destroy -v -i $P1ID && expect_failure "re-destroy (1/2) $P1ID" +${PERSONA_MGR} destroy -v -i $P1ID && expect_failure "re-destroy (2/2) $P1ID" +${PERSONA_MGR} destroy -v -i $P2ID && expect_failure "re-destroy $P2ID" +${PERSONA_MGR} destroy -v -i $P3ID && expect_failure "re-destroy $P3ID" +${PERSONA_MGR} destroy -v -i $P4ID && expect_failure "re-destroy $P4ID" +${PERSONA_MGR} destroy -v -i $P5ID && expect_failure "re-destroy $P5ID" +${PERSONA_MGR} destroy -v -i $P6ID && expect_failure "re-destroy $P6ID" + +destroy_persona ${TEST_DEFAULT_PERSONA} + +# cleanup +rm -rf "${TMPDIR}" + +echo "" +echo "${0##/}: SUCCESS" +exit 0 diff --git a/tools/tests/zero-to-n/zero-to-n.c b/tools/tests/zero-to-n/zero-to-n.c index 39b7cd915..cd1963c56 100644 --- a/tools/tests/zero-to-n/zero-to-n.c +++ b/tools/tests/zero-to-n/zero-to-n.c @@ -67,6 +67,7 @@ typedef enum my_policy_type { MY_POLICY_REALTIME, MY_POLICY_TIMESHARE, MY_POLICY #define CONSTRAINT_NANOS (20000000ll) /* 20 ms */ #define COMPUTATION_NANOS (10000000ll) /* 10 ms */ #define TRACEWORTHY_NANOS (10000000ll) /* 10 ms */ +#define TRACEWORTHY_NANOS_TEST ( 2000000ll) /* 2 ms */ #if DEBUG #define debug_log(args ...) printf(args) @@ -131,6 +132,9 @@ static boolean_t g_test_rt_smt = FALSE; /* Test whether realtime threads are successfully avoiding CPU 0 on Intel */ static boolean_t g_test_rt_avoid0 = FALSE; +/* Print a histgram showing how many threads ran on each CPU */ +static boolean_t g_histogram = FALSE; + /* One randomly chosen thread holds up the train for a certain duration. */ static boolean_t g_do_one_long_spin = FALSE; static uint32_t g_one_long_spin_id = 0; @@ -681,6 +685,11 @@ main(int argc, char **argv) } g_policy = MY_POLICY_REALTIME; g_do_all_spin = TRUE; + g_histogram = true; + /* Don't change g_traceworthy_latency_ns if it's explicity been set to something other than the default */ + if (g_traceworthy_latency_ns == TRACEWORTHY_NANOS) { + g_traceworthy_latency_ns = TRACEWORTHY_NANOS_TEST; + } } else if (g_test_rt_smt) { if (g_nlogicalcpu != 2 * g_nphysicalcpu) { /* Not SMT */ @@ -693,6 +702,7 @@ main(int argc, char **argv) } g_policy = MY_POLICY_REALTIME; g_do_all_spin = TRUE; + g_histogram = true; } else if (g_test_rt_avoid0) { #if defined(__x86_64__) || defined(__i386__) if (g_numthreads == 0) { @@ -704,6 +714,7 @@ main(int argc, char **argv) } g_policy = MY_POLICY_REALTIME; g_do_all_spin = TRUE; + g_histogram = true; #else printf("Attempt to run --test-rt-avoid0 on a non-Intel device\n"); exit(0); @@ -948,13 +959,15 @@ main(int argc, char **argv) } #endif - if (g_test_rt || g_test_rt_smt || g_test_rt_avoid0) { + if (g_histogram) { putchar('\n'); for (uint32_t i = 0; i < g_numcpus; i++) { printf("%d\t%d\n", i, g_cpu_histogram[i].accum); } + } + if (g_test_rt || g_test_rt_smt || g_test_rt_avoid0) { #define PRIMARY 0x5555555555555555ULL #define SECONDARY 0xaaaaaaaaaaaaaaaaULL @@ -970,7 +983,7 @@ main(int argc, char **argv) /* Test for threads running on both primary and secondary cpus of the same core (FAIL) */ fail = ((map & PRIMARY) & ((map & SECONDARY) >> 1)); } else if (g_test_rt) { - fail = __builtin_popcountll(map) != g_numthreads; + fail = (__builtin_popcountll(map) != g_numthreads) && (worst_latencies_ns[i] > g_traceworthy_latency_ns); } else if (g_test_rt_avoid0) { fail = ((map & 0x1) == 0x1); } @@ -1109,6 +1122,7 @@ parse_args(int argc, char *argv[]) { "test-rt", no_argument, (int*)&g_test_rt, TRUE }, { "test-rt-smt", no_argument, (int*)&g_test_rt_smt, TRUE }, { "test-rt-avoid0", no_argument, (int*)&g_test_rt_avoid0, TRUE }, + { "histogram", no_argument, (int*)&g_histogram, TRUE }, { "verbose", no_argument, (int*)&g_verbose, TRUE }, { "help", no_argument, NULL, 'h' }, { NULL, 0, NULL, 0 } diff --git a/tools/trace/kqtrace.lua b/tools/trace/kqtrace.lua index 07cbd9fbd..60c9dc5d9 100755 --- a/tools/trace/kqtrace.lua +++ b/tools/trace/kqtrace.lua @@ -75,10 +75,7 @@ function state_string(strings, state) end kqrequest_state_strings = { - ['THREQUESTED'] = 0x02, - ['WAKEUP'] = 0x04, - ['BOUND'] = 0x08, - ['DRAIN'] = 0x40, + ['WAKEUP'] = 1, } kqueue_state_strings = { @@ -94,6 +91,8 @@ kqueue_state_strings = { ['DRAIN'] = 0x200, ['WAKEUP'] = 0x400, ['DYNAMIC'] = 0x800, + ['R2K'] = 0x1000, + ['TURNSTILE'] = 0x2000, } knote_state_strings = { @@ -102,16 +101,13 @@ knote_state_strings = { ['DISABLED'] = 0x0004, ['DROPPING'] = 0x0008, ['LOCKED'] = 0x0010, - ['ATTACHING'] = 0x0020, + ['POSTING'] = 0x0020, ['STAYACTIVE'] = 0x0040, ['DEFERDELETE'] = 0x0080, - ['ATTACHED'] = 0x0100, - ['DISPATCH'] = 0x0200, - ['UDATA_SPECIFIC'] = 0x0400, + ['MERGE_QOS'] = 0x0100, + ['REQVANISH'] = 0x0200, + ['VANISHED'] = 0x0400, ['SUPPRESSED'] = 0x0800, - ['MERGE_QOS'] = 0x1000, - ['REQVANISH'] = 0x2000, - ['VANISHED'] = 0x4000, } kevent_flags_strings = { @@ -159,36 +155,53 @@ function kevent_filter_string(filt) return 'SOCK' elseif filt == -14 then return 'MEMORYSTATUS' - elseif filt == 15 then - return 'KQREAD' - elseif filt == 16 then - return 'PIPE_R' + elseif filt == -15 then + return 'EXCEPT' + elseif filt == -16 then + return 'NW_CHANNEL' + elseif filt == -17 then + return 'WORKLOOP' + elseif filt == 17 then - return 'PIPE_W' + return 'KQREAD' elseif filt == 18 then - return 'PTSD' + return 'PIPE_N' elseif filt == 19 then - return 'SOWRITE' + return 'PIPE_R' elseif filt == 20 then - return 'SOEXCEPT' + return 'PIPE_W' elseif filt == 21 then - return 'SPEC' + return 'PTSD' elseif filt == 22 then - return 'BPFREAD' + return 'SOREAD' elseif filt == 23 then - return 'NECP_FD' + return 'SOWRITE' elseif filt == 24 then - return 'SKYWALK_CHANNEL_W' + return 'SCK' elseif filt == 25 then - return 'SKYWALK_CHANNEL_R' + return 'SOEXCEPT' elseif filt == 26 then - return 'FSEVENT' + return 'SPEC' elseif filt == 27 then - return 'VN' + return 'BPFREAD' elseif filt == 28 then - return 'SKYWALK_CHANNEL_E' + return 'NECP_FD' elseif filt == 29 then + return 'SKYWALK_CHANNEL_W' + elseif filt == 30 then + return 'SKYWALK_CHANNEL_R' + elseif filt == 31 then + return 'SKYWALK_CHANNEL_E' + elseif filt == 32 then + return 'FSEVENT' + elseif filt == 33 then + return 'VN' + elseif filt == 34 then return 'TTY' + elseif filt == 35 then + return 'PTMX' + elseif filt == 36 then + return 'DETACHED' else return string.format('[%d]', filt) end diff --git a/tools/trace/wqtrace.lua b/tools/trace/wqtrace.lua index ae853d433..970dde6fe 100755 --- a/tools/trace/wqtrace.lua +++ b/tools/trace/wqtrace.lua @@ -149,6 +149,8 @@ trace_codename("wq_select_threadreq", function(buf) printf("%s\tSelection failed: no request\n", prefix) elseif buf[2] == 2 then printf("%s\tSelection failed: throttled\n", prefix) + elseif buf[2] == 3 then + printf("%s\tSelection failed: scheduler would preempt\n", prefix) end end)